From 4c1bb00559051595432b215ca148e4bd810e2177 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Sat, 3 Sep 2022 19:42:34 +0800 Subject: [PATCH 0001/1411] Update flight definitions including backwards-incompatible change to GetSchema (#2586) * update flight doc and code * fix cliyyp * backward compatibility for schema result * fix lint * Update arrow/src/ipc/convert.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow/src/ipc/convert.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-flight/src/arrow.flight.protocol.rs | 129 +- arrow-flight/src/lib.rs | 90 +- .../src/sql/arrow.flight.protocol.sql.rs | 1398 ++++++++++------- arrow-flight/src/utils.rs | 6 +- arrow/src/ipc/convert.rs | 52 +- format/Flight.proto | 43 +- format/FlightSql.proto | 286 +++- format/Message.fbs | 3 +- format/Schema.fbs | 176 ++- 9 files changed, 1462 insertions(+), 721 deletions(-) diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs index 2b085d6d1f6b..d9e4200030fa 100644 --- a/arrow-flight/src/arrow.flight.protocol.rs +++ b/arrow-flight/src/arrow.flight.protocol.rs @@ -1,31 +1,31 @@ // This file was automatically generated through the build.rs script, and should not be edited. /// -/// The request that a client provides to a server on handshake. +/// The request that a client provides to a server on handshake. #[derive(Clone, PartialEq, ::prost::Message)] pub struct HandshakeRequest { /// - /// A defined protocol version + /// A defined protocol version #[prost(uint64, tag="1")] pub protocol_version: u64, /// - /// Arbitrary auth/handshake info. + /// Arbitrary auth/handshake info. #[prost(bytes="vec", tag="2")] pub payload: ::prost::alloc::vec::Vec, } #[derive(Clone, PartialEq, ::prost::Message)] pub struct HandshakeResponse { /// - /// A defined protocol version + /// A defined protocol version #[prost(uint64, tag="1")] pub protocol_version: u64, /// - /// Arbitrary auth/handshake info. + /// Arbitrary auth/handshake info. #[prost(bytes="vec", tag="2")] pub payload: ::prost::alloc::vec::Vec, } /// -/// A message for doing simple auth. +/// A message for doing simple auth. #[derive(Clone, PartialEq, ::prost::Message)] pub struct BasicAuth { #[prost(string, tag="2")] @@ -37,8 +37,8 @@ pub struct BasicAuth { pub struct Empty { } /// -/// Describes an available action, including both the name used for execution -/// along with a short description of the purpose of the action. +/// Describes an available action, including both the name used for execution +/// along with a short description of the purpose of the action. #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionType { #[prost(string, tag="1")] @@ -47,15 +47,15 @@ pub struct ActionType { pub description: ::prost::alloc::string::String, } /// -/// A service specific expression that can be used to return a limited set -/// of available Arrow Flight streams. +/// A service specific expression that can be used to return a limited set +/// of available Arrow Flight streams. #[derive(Clone, PartialEq, ::prost::Message)] pub struct Criteria { #[prost(bytes="vec", tag="1")] pub expression: ::prost::alloc::vec::Vec, } /// -/// An opaque action specific for the service. +/// An opaque action specific for the service. 
#[derive(Clone, PartialEq, ::prost::Message)] pub struct Action { #[prost(string, tag="1")] @@ -64,54 +64,57 @@ pub struct Action { pub body: ::prost::alloc::vec::Vec, } /// -/// An opaque result returned after executing an action. +/// An opaque result returned after executing an action. #[derive(Clone, PartialEq, ::prost::Message)] pub struct Result { #[prost(bytes="vec", tag="1")] pub body: ::prost::alloc::vec::Vec, } /// -/// Wrap the result of a getSchema call +/// Wrap the result of a getSchema call #[derive(Clone, PartialEq, ::prost::Message)] pub struct SchemaResult { - /// schema of the dataset as described in Schema.fbs::Schema. + /// The schema of the dataset in its IPC form: + /// 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + /// 4 bytes - the byte length of the payload + /// a flatbuffer Message whose header is the Schema #[prost(bytes="vec", tag="1")] pub schema: ::prost::alloc::vec::Vec, } /// -/// The name or tag for a Flight. May be used as a way to retrieve or generate -/// a flight or be used to expose a set of previously defined flights. +/// The name or tag for a Flight. May be used as a way to retrieve or generate +/// a flight or be used to expose a set of previously defined flights. #[derive(Clone, PartialEq, ::prost::Message)] pub struct FlightDescriptor { #[prost(enumeration="flight_descriptor::DescriptorType", tag="1")] pub r#type: i32, /// - /// Opaque value used to express a command. Should only be defined when - /// type = CMD. + /// Opaque value used to express a command. Should only be defined when + /// type = CMD. #[prost(bytes="vec", tag="2")] pub cmd: ::prost::alloc::vec::Vec, /// - /// List of strings identifying a particular dataset. Should only be defined - /// when type = PATH. + /// List of strings identifying a particular dataset. Should only be defined + /// when type = PATH. #[prost(string, repeated, tag="3")] pub path: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, } /// Nested message and enum types in `FlightDescriptor`. pub mod flight_descriptor { /// - /// Describes what type of descriptor is defined. + /// Describes what type of descriptor is defined. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum DescriptorType { - /// Protobuf pattern, not used. + /// Protobuf pattern, not used. Unknown = 0, /// - /// A named path that identifies a dataset. A path is composed of a string - /// or list of strings describing a particular dataset. This is conceptually + /// A named path that identifies a dataset. A path is composed of a string + /// or list of strings describing a particular dataset. This is conceptually /// similar to a path inside a filesystem. Path = 1, /// - /// An opaque command to generate a dataset. + /// An opaque command to generate a dataset. Cmd = 2, } impl DescriptorType { @@ -129,86 +132,110 @@ pub mod flight_descriptor { } } /// -/// The access coordinates for retrieval of a dataset. With a FlightInfo, a -/// consumer is able to determine how to retrieve a dataset. +/// The access coordinates for retrieval of a dataset. With a FlightInfo, a +/// consumer is able to determine how to retrieve a dataset. #[derive(Clone, PartialEq, ::prost::Message)] pub struct FlightInfo { - /// schema of the dataset as described in Schema.fbs::Schema. 
+ /// The schema of the dataset in its IPC form: + /// 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + /// 4 bytes - the byte length of the payload + /// a flatbuffer Message whose header is the Schema #[prost(bytes="vec", tag="1")] pub schema: ::prost::alloc::vec::Vec, /// - /// The descriptor associated with this info. + /// The descriptor associated with this info. #[prost(message, optional, tag="2")] pub flight_descriptor: ::core::option::Option, /// - /// A list of endpoints associated with the flight. To consume the whole - /// flight, all endpoints must be consumed. + /// A list of endpoints associated with the flight. To consume the + /// whole flight, all endpoints (and hence all Tickets) must be + /// consumed. Endpoints can be consumed in any order. + /// + /// In other words, an application can use multiple endpoints to + /// represent partitioned data. + /// + /// There is no ordering defined on endpoints. Hence, if the returned + /// data has an ordering, it should be returned in a single endpoint. #[prost(message, repeated, tag="3")] pub endpoint: ::prost::alloc::vec::Vec, - /// Set these to -1 if unknown. + /// Set these to -1 if unknown. #[prost(int64, tag="4")] pub total_records: i64, #[prost(int64, tag="5")] pub total_bytes: i64, } /// -/// A particular stream or split associated with a flight. +/// A particular stream or split associated with a flight. #[derive(Clone, PartialEq, ::prost::Message)] pub struct FlightEndpoint { /// - /// Token used to retrieve this stream. + /// Token used to retrieve this stream. #[prost(message, optional, tag="1")] pub ticket: ::core::option::Option, /// - /// A list of URIs where this ticket can be redeemed. If the list is - /// empty, the expectation is that the ticket can only be redeemed on the - /// current service where the ticket was generated. + /// A list of URIs where this ticket can be redeemed via DoGet(). + /// + /// If the list is empty, the expectation is that the ticket can only + /// be redeemed on the current service where the ticket was + /// generated. + /// + /// If the list is not empty, the expectation is that the ticket can + /// be redeemed at any of the locations, and that the data returned + /// will be equivalent. In this case, the ticket may only be redeemed + /// at one of the given locations, and not (necessarily) on the + /// current service. + /// + /// In other words, an application can use multiple locations to + /// represent redundant and/or load balanced services. #[prost(message, repeated, tag="2")] pub location: ::prost::alloc::vec::Vec, } /// -/// A location where a Flight service will accept retrieval of a particular -/// stream given a ticket. +/// A location where a Flight service will accept retrieval of a particular +/// stream given a ticket. #[derive(Clone, PartialEq, ::prost::Message)] pub struct Location { #[prost(string, tag="1")] pub uri: ::prost::alloc::string::String, } /// -/// An opaque identifier that the service can use to retrieve a particular -/// portion of a stream. +/// An opaque identifier that the service can use to retrieve a particular +/// portion of a stream. +/// +/// Tickets are meant to be single use. It is an error/application-defined +/// behavior to reuse a ticket. #[derive(Clone, PartialEq, ::prost::Message)] pub struct Ticket { #[prost(bytes="vec", tag="1")] pub ticket: ::prost::alloc::vec::Vec, } /// -/// A batch of Arrow data as part of a stream of batches. +/// A batch of Arrow data as part of a stream of batches. 
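// A minimal, hedged sketch of the consumption pattern the updated comments above
// describe: to read the whole flight, every endpoint (and therefore every
// single-use Ticket) is redeemed exactly once via DoGet, in any order. `client`
// is assumed to be an already-connected
// arrow_flight::flight_service_client::FlightServiceClient; choosing among an
// endpoint's locations (if any) and detailed error handling are elided.
use arrow_flight::flight_service_client::FlightServiceClient;
use arrow_flight::{FlightData, FlightInfo};
use tonic::transport::Channel;

async fn consume_flight(
    mut client: FlightServiceClient<Channel>,
    info: FlightInfo,
) -> Result<Vec<FlightData>, Box<dyn std::error::Error>> {
    let mut batches = Vec::new();
    for endpoint in info.endpoint {
        // Tickets are single use: one DoGet call per ticket.
        let ticket = endpoint.ticket.expect("endpoint without a ticket");
        let mut stream = client.do_get(ticket).await?.into_inner();
        while let Some(data) = stream.message().await? {
            batches.push(data);
        }
    }
    Ok(batches)
}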
#[derive(Clone, PartialEq, ::prost::Message)] pub struct FlightData { /// - /// The descriptor of the data. This is only relevant when a client is - /// starting a new DoPut stream. + /// The descriptor of the data. This is only relevant when a client is + /// starting a new DoPut stream. #[prost(message, optional, tag="1")] pub flight_descriptor: ::core::option::Option, /// - /// Header for message data as described in Message.fbs::Message. + /// Header for message data as described in Message.fbs::Message. #[prost(bytes="vec", tag="2")] pub data_header: ::prost::alloc::vec::Vec, /// - /// Application-defined metadata. + /// Application-defined metadata. #[prost(bytes="vec", tag="3")] pub app_metadata: ::prost::alloc::vec::Vec, /// - /// The actual batch of Arrow data. Preferably handled with minimal-copies - /// coming last in the definition to help with sidecar patterns (it is - /// expected that some implementations will fetch this field off the wire - /// with specialized code to avoid extra memory copies). + /// The actual batch of Arrow data. Preferably handled with minimal-copies + /// coming last in the definition to help with sidecar patterns (it is + /// expected that some implementations will fetch this field off the wire + /// with specialized code to avoid extra memory copies). #[prost(bytes="vec", tag="1000")] pub data_body: ::prost::alloc::vec::Vec, } /// * -/// The response message associated with the submission of a DoPut. +/// The response message associated with the submission of a DoPut. #[derive(Clone, PartialEq, ::prost::Message)] pub struct PutResult { #[prost(bytes="vec", tag="1")] diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 3f4f09855353..54f4d24b65ae 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -17,11 +17,9 @@ use arrow::datatypes::Schema; use arrow::error::{ArrowError, Result as ArrowResult}; -use arrow::ipc::{ - convert, size_prefixed_root_as_message, writer, writer::EncodedData, - writer::IpcWriteOptions, -}; +use arrow::ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; +use arrow::ipc::convert::try_schema_from_ipc_buffer; use std::{ convert::{TryFrom, TryInto}, fmt, @@ -254,10 +252,17 @@ impl From> for FlightData { } } -impl From> for SchemaResult { - fn from(schema_ipc: SchemaAsIpc) -> Self { - let IpcMessage(vals) = flight_schema_as_flatbuffer(schema_ipc.0, schema_ipc.1); - SchemaResult { schema: vals } +impl TryFrom> for SchemaResult { + type Error = ArrowError; + + fn try_from(schema_ipc: SchemaAsIpc) -> ArrowResult { + // According to the definition from `Flight.proto` + // The schema of the dataset in its IPC form: + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema + let IpcMessage(vals) = schema_to_ipc_format(schema_ipc)?; + Ok(SchemaResult { schema: vals }) } } @@ -275,19 +280,23 @@ impl TryFrom> for IpcMessage { type Error = ArrowError; fn try_from(schema_ipc: SchemaAsIpc) -> ArrowResult { - let pair = *schema_ipc; - let encoded_data = flight_schema_as_encoded_data(pair.0, pair.1); - - let mut schema = vec![]; - arrow::ipc::writer::write_message(&mut schema, encoded_data, pair.1)?; - Ok(IpcMessage(schema)) + schema_to_ipc_format(schema_ipc) } } +fn schema_to_ipc_format(schema_ipc: SchemaAsIpc) -> ArrowResult { + let pair = *schema_ipc; + let encoded_data = flight_schema_as_encoded_data(pair.0, pair.1); + + let mut schema = vec![]; + arrow::ipc::writer::write_message(&mut schema, 
encoded_data, pair.1)?; + Ok(IpcMessage(schema)) +} + impl TryFrom<&FlightData> for Schema { type Error = ArrowError; fn try_from(data: &FlightData) -> ArrowResult { - convert::schema_from_bytes(&data.data_header[..]).map_err(|err| { + convert::try_schema_from_flatbuffer_bytes(&data.data_header[..]).map_err(|err| { ArrowError::ParseError(format!( "Unable to convert flight data to Arrow schema: {}", err @@ -309,32 +318,14 @@ impl TryFrom for Schema { type Error = ArrowError; fn try_from(value: IpcMessage) -> ArrowResult { - // CONTINUATION TAKES 4 BYTES - // SIZE TAKES 4 BYTES (so read msg as size prefixed) - let msg = size_prefixed_root_as_message(&value.0[4..]).map_err(|err| { - ArrowError::ParseError(format!( - "Unable to convert flight info to a message: {}", - err - )) - })?; - let ipc_schema = msg.header_as_schema().ok_or_else(|| { - ArrowError::ParseError( - "Unable to convert flight info to a schema".to_string(), - ) - })?; - Ok(convert::fb_to_schema(ipc_schema)) + try_schema_from_ipc_buffer(value.0.as_slice()) } } impl TryFrom<&SchemaResult> for Schema { type Error = ArrowError; fn try_from(data: &SchemaResult) -> ArrowResult { - convert::schema_from_bytes(&data.schema[..]).map_err(|err| { - ArrowError::ParseError(format!( - "Unable to convert schema result to Arrow schema: {}", - err - )) - }) + try_schema_from_ipc_buffer(data.schema.as_slice()) } } @@ -405,6 +396,8 @@ impl<'a> SchemaAsIpc<'a> { #[cfg(test)] mod tests { use super::*; + use arrow::datatypes::{DataType, Field, TimeUnit}; + use arrow::ipc::MetadataVersion; struct TestVector(Vec, usize); @@ -448,4 +441,31 @@ mod tests { let expected = format!("{:?}", vec![91; 9]); assert_eq!(actual, expected); } + + #[test] + fn ser_deser_schema_result() { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Utf8, false), + Field::new("c2", DataType::Float64, true), + Field::new("c3", DataType::UInt32, false), + Field::new("c4", DataType::Boolean, true), + Field::new("c5", DataType::Timestamp(TimeUnit::Millisecond, None), true), + Field::new("c6", DataType::Time32(TimeUnit::Second), false), + ]); + // V5 with write_legacy_ipc_format = false + // this will write the continuation marker + let option = IpcWriteOptions::default(); + let schema_ipc = SchemaAsIpc::new(&schema, &option); + let result: SchemaResult = schema_ipc.try_into().unwrap(); + let des_schema: Schema = (&result).try_into().unwrap(); + assert_eq!(schema, des_schema); + + // V4 with write_legacy_ipc_format = true + // this will not write the continuation marker + let option = IpcWriteOptions::try_new(8, true, MetadataVersion::V4).unwrap(); + let schema_ipc = SchemaAsIpc::new(&schema, &option); + let result: SchemaResult = schema_ipc.try_into().unwrap(); + let des_schema: Schema = (&result).try_into().unwrap(); + assert_eq!(schema, des_schema); + } } diff --git a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs index 77221dd1a489..284f6a15c526 100644 --- a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs +++ b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs @@ -1,13 +1,13 @@ // This file was automatically generated through the build.rs script, and should not be edited. /// -/// Represents a metadata request. Used in the command member of FlightDescriptor -/// for the following RPC calls: +/// Represents a metadata request. Used in the command member of FlightDescriptor +/// for the following RPC calls: /// - GetSchema: return the Arrow schema of the query. /// - GetFlightInfo: execute the metadata request. 
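// An illustrative sketch (not the crate's implementation) of the encapsulated
// IPC layout that `try_schema_from_ipc_buffer` consumes and that the updated
// Flight.proto comments describe: an optional 0xFFFF_FFFF continuation marker,
// a 4-byte little-endian length, then a flatbuffer Message whose header is the
// Schema. The V5 vs. legacy V4 cases mirror the two halves of the
// `ser_deser_schema_result` test above.
use std::convert::TryInto;

const IPC_CONTINUATION_TOKEN: [u8; 4] = [0xff, 0xff, 0xff, 0xff];

/// Splits an encapsulated IPC schema buffer into (message length, message bytes),
/// returning None if the buffer is too short.
fn split_ipc_schema_buffer(buf: &[u8]) -> Option<(usize, &[u8])> {
    // The continuation marker is written for metadata V5 (the default
    // IpcWriteOptions) and omitted by the legacy V4 format.
    let buf = if buf.starts_with(&IPC_CONTINUATION_TOKEN) {
        &buf[4..]
    } else {
        buf
    };
    let len = u32::from_le_bytes(buf.get(..4)?.try_into().ok()?) as usize;
    let message = buf.get(4..4 + len)?;
    Some((len, message))
}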
/// -/// The returned Arrow schema will be: -/// < +/// The returned Arrow schema will be: +/// < /// info_name: uint32 not null, /// value: dense_union< /// string_value: utf8, @@ -16,185 +16,260 @@ /// int32_bitmask: int32, /// string_list: list /// int32_to_int32_list_map: map> -/// > -/// where there is one row per requested piece of metadata information. +/// > +/// where there is one row per requested piece of metadata information. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetSqlInfo { /// - /// Values are modelled after ODBC's SQLGetInfo() function. This information is intended to provide - /// Flight SQL clients with basic, SQL syntax and SQL functions related information. - /// More information types can be added in future releases. - /// E.g. more SQL syntax support types, scalar functions support, type conversion support etc. + /// Values are modelled after ODBC's SQLGetInfo() function. This information is intended to provide + /// Flight SQL clients with basic, SQL syntax and SQL functions related information. + /// More information types can be added in future releases. + /// E.g. more SQL syntax support types, scalar functions support, type conversion support etc. /// - /// Note that the set of metadata may expand. + /// Note that the set of metadata may expand. /// - /// Initially, Flight SQL will support the following information types: - /// - Server Information - Range [0-500) - /// - Syntax Information - Range [500-1000) - /// Range [0-10,000) is reserved for defaults (see SqlInfo enum for default options). - /// Custom options should start at 10,000. + /// Initially, Flight SQL will support the following information types: + /// - Server Information - Range [0-500) + /// - Syntax Information - Range [500-1000) + /// Range [0-10,000) is reserved for defaults (see SqlInfo enum for default options). + /// Custom options should start at 10,000. /// - /// If omitted, then all metadata will be retrieved. - /// Flight SQL Servers may choose to include additional metadata above and beyond the specified set, however they must - /// at least return the specified set. IDs ranging from 0 to 10,000 (exclusive) are reserved for future use. - /// If additional metadata is included, the metadata IDs should start from 10,000. + /// If omitted, then all metadata will be retrieved. + /// Flight SQL Servers may choose to include additional metadata above and beyond the specified set, however they must + /// at least return the specified set. IDs ranging from 0 to 10,000 (exclusive) are reserved for future use. + /// If additional metadata is included, the metadata IDs should start from 10,000. #[prost(uint32, repeated, tag="1")] pub info: ::prost::alloc::vec::Vec, } /// -/// Represents a request to retrieve the list of catalogs on a Flight SQL enabled backend. -/// The definition of a catalog depends on vendor/implementation. It is usually the database itself -/// Used in the command member of FlightDescriptor for the following RPC calls: +/// Represents a request to retrieve information about data type supported on a Flight SQL enabled backend. +/// Used in the command member of FlightDescriptor for the following RPC calls: +/// - GetSchema: return the schema of the query. +/// - GetFlightInfo: execute the catalog metadata request. 
+/// +/// The returned schema will be: +/// < +/// type_name: utf8 not null (The name of the data type, for example: VARCHAR, INTEGER, etc), +/// data_type: int not null (The SQL data type), +/// column_size: int (The maximum size supported by that column. +/// In case of exact numeric types, this represents the maximum precision. +/// In case of string types, this represents the character length. +/// In case of datetime data types, this represents the length in characters of the string representation. +/// NULL is returned for data types where column size is not applicable.), +/// literal_prefix: utf8 (Character or characters used to prefix a literal, NULL is returned for +/// data types where a literal prefix is not applicable.), +/// literal_suffix: utf8 (Character or characters used to terminate a literal, +/// NULL is returned for data types where a literal suffix is not applicable.), +/// create_params: list +/// (A list of keywords corresponding to which parameters can be used when creating +/// a column for that specific type. +/// NULL is returned if there are no parameters for the data type definition.), +/// nullable: int not null (Shows if the data type accepts a NULL value. The possible values can be seen in the +/// Nullable enum.), +/// case_sensitive: bool not null (Shows if a character data type is case-sensitive in collations and comparisons), +/// searchable: int not null (Shows how the data type is used in a WHERE clause. The possible values can be seen in the +/// Searchable enum.), +/// unsigned_attribute: bool (Shows if the data type is unsigned. NULL is returned if the attribute is +/// not applicable to the data type or the data type is not numeric.), +/// fixed_prec_scale: bool not null (Shows if the data type has predefined fixed precision and scale.), +/// auto_increment: bool (Shows if the data type is auto incremental. NULL is returned if the attribute +/// is not applicable to the data type or the data type is not numeric.), +/// local_type_name: utf8 (Localized version of the data source-dependent name of the data type. NULL +/// is returned if a localized name is not supported by the data source), +/// minimum_scale: int (The minimum scale of the data type on the data source. +/// If a data type has a fixed scale, the MINIMUM_SCALE and MAXIMUM_SCALE +/// columns both contain this value. NULL is returned if scale is not applicable.), +/// maximum_scale: int (The maximum scale of the data type on the data source. +/// NULL is returned if scale is not applicable.), +/// sql_data_type: int not null (The value of the SQL DATA TYPE which has the same values +/// as data_type value. Except for interval and datetime, which +/// uses generic values. More info about those types can be +/// obtained through datetime_subcode. The possible values can be seen +/// in the XdbcDataType enum.), +/// datetime_subcode: int (Only used when the SQL DATA TYPE is interval or datetime. It contains +/// its sub types. For type different from interval and datetime, this value +/// is NULL. The possible values can be seen in the XdbcDatetimeSubcode enum.), +/// num_prec_radix: int (If the data type is an approximate numeric type, this column contains +/// the value 2 to indicate that COLUMN_SIZE specifies a number of bits. For +/// exact numeric types, this column contains the value 10 to indicate that +/// column size specifies a number of decimal digits. 
Otherwise, this column is NULL.), +/// interval_precision: int (If the data type is an interval data type, then this column contains the value +/// of the interval leading precision. Otherwise, this column is NULL. This fields +/// is only relevant to be used by ODBC). +/// > +/// The returned data should be ordered by data_type and then by type_name. +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct CommandGetXdbcTypeInfo { + /// + /// Specifies the data type to search for the info. + #[prost(int32, optional, tag="1")] + pub data_type: ::core::option::Option, +} +/// +/// Represents a request to retrieve the list of catalogs on a Flight SQL enabled backend. +/// The definition of a catalog depends on vendor/implementation. It is usually the database itself +/// Used in the command member of FlightDescriptor for the following RPC calls: /// - GetSchema: return the Arrow schema of the query. /// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < +/// The returned Arrow schema will be: +/// < /// catalog_name: utf8 not null -/// > -/// The returned data should be ordered by catalog_name. +/// > +/// The returned data should be ordered by catalog_name. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetCatalogs { } /// -/// Represents a request to retrieve the list of database schemas on a Flight SQL enabled backend. -/// The definition of a database schema depends on vendor/implementation. It is usually a collection of tables. -/// Used in the command member of FlightDescriptor for the following RPC calls: +/// Represents a request to retrieve the list of database schemas on a Flight SQL enabled backend. +/// The definition of a database schema depends on vendor/implementation. It is usually a collection of tables. +/// Used in the command member of FlightDescriptor for the following RPC calls: /// - GetSchema: return the Arrow schema of the query. /// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < +/// The returned Arrow schema will be: +/// < /// catalog_name: utf8, /// db_schema_name: utf8 not null -/// > -/// The returned data should be ordered by catalog_name, then db_schema_name. +/// > +/// The returned data should be ordered by catalog_name, then db_schema_name. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetDbSchemas { /// - /// Specifies the Catalog to search for the tables. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// Specifies the Catalog to search for the tables. + /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. #[prost(string, optional, tag="1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies a filter pattern for schemas to search for. - /// When no db_schema_filter_pattern is provided, the pattern will not be used to narrow the search. - /// In the pattern string, two special characters can be used to denote matching rules: + /// Specifies a filter pattern for schemas to search for. + /// When no db_schema_filter_pattern is provided, the pattern will not be used to narrow the search. + /// In the pattern string, two special characters can be used to denote matching rules: /// - "%" means to match any substring with 0 or more characters. /// - "_" means to match any one character. 
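// A hedged sketch of issuing the db-schema metadata request described above:
// per the Flight SQL convention the command is wrapped in a google.protobuf.Any
// (type_url shown as an assumption) and its bytes become the `cmd` of a
// CMD-type FlightDescriptor. "%" and "_" act as wildcards in the filter
// pattern; the catalog and pattern values are hypothetical. Assumes the
// `flight-sql-experimental` feature plus `prost`/`prost-types` as dependencies.
use arrow_flight::flight_descriptor::DescriptorType;
use arrow_flight::sql::CommandGetDbSchemas;
use arrow_flight::FlightDescriptor;
use prost::Message;

fn db_schemas_descriptor() -> FlightDescriptor {
    let command = CommandGetDbSchemas {
        catalog: Some("my_catalog".to_string()),
        // Match any schema whose name starts with "sales".
        db_schema_filter_pattern: Some("sales%".to_string()),
    };
    let any = prost_types::Any {
        type_url: "type.googleapis.com/arrow.flight.protocol.sql.CommandGetDbSchemas"
            .to_string(),
        value: command.encode_to_vec(),
    };
    FlightDescriptor {
        r#type: DescriptorType::Cmd as i32,
        cmd: any.encode_to_vec(),
        path: vec![],
    }
}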
#[prost(string, optional, tag="2")] pub db_schema_filter_pattern: ::core::option::Option<::prost::alloc::string::String>, } /// -/// Represents a request to retrieve the list of tables, and optionally their schemas, on a Flight SQL enabled backend. -/// Used in the command member of FlightDescriptor for the following RPC calls: +/// Represents a request to retrieve the list of tables, and optionally their schemas, on a Flight SQL enabled backend. +/// Used in the command member of FlightDescriptor for the following RPC calls: /// - GetSchema: return the Arrow schema of the query. /// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < +/// The returned Arrow schema will be: +/// < /// catalog_name: utf8, /// db_schema_name: utf8, /// table_name: utf8 not null, /// table_type: utf8 not null, /// \[optional\] table_schema: bytes not null (schema of the table as described in Schema.fbs::Schema, /// it is serialized as an IPC message.) -/// > -/// The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. +/// > +/// Fields on table_schema may contain the following metadata: +/// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +/// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +/// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +/// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +/// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +/// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +/// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. +/// The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetTables { /// - /// Specifies the Catalog to search for the tables. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// Specifies the Catalog to search for the tables. + /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. #[prost(string, optional, tag="1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies a filter pattern for schemas to search for. - /// When no db_schema_filter_pattern is provided, all schemas matching other filters are searched. - /// In the pattern string, two special characters can be used to denote matching rules: + /// Specifies a filter pattern for schemas to search for. + /// When no db_schema_filter_pattern is provided, all schemas matching other filters are searched. + /// In the pattern string, two special characters can be used to denote matching rules: /// - "%" means to match any substring with 0 or more characters. /// - "_" means to match any one character. #[prost(string, optional, tag="2")] pub db_schema_filter_pattern: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies a filter pattern for tables to search for. 
- /// When no table_name_filter_pattern is provided, all tables matching other filters are searched. - /// In the pattern string, two special characters can be used to denote matching rules: + /// Specifies a filter pattern for tables to search for. + /// When no table_name_filter_pattern is provided, all tables matching other filters are searched. + /// In the pattern string, two special characters can be used to denote matching rules: /// - "%" means to match any substring with 0 or more characters. /// - "_" means to match any one character. #[prost(string, optional, tag="3")] pub table_name_filter_pattern: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies a filter of table types which must match. - /// The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. - /// TABLE, VIEW, and SYSTEM TABLE are commonly supported. + /// Specifies a filter of table types which must match. + /// The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. + /// TABLE, VIEW, and SYSTEM TABLE are commonly supported. #[prost(string, repeated, tag="4")] pub table_types: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, - /// Specifies if the Arrow schema should be returned for found tables. + /// Specifies if the Arrow schema should be returned for found tables. #[prost(bool, tag="5")] pub include_schema: bool, } /// -/// Represents a request to retrieve the list of table types on a Flight SQL enabled backend. -/// The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. -/// TABLE, VIEW, and SYSTEM TABLE are commonly supported. -/// Used in the command member of FlightDescriptor for the following RPC calls: +/// Represents a request to retrieve the list of table types on a Flight SQL enabled backend. +/// The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. +/// TABLE, VIEW, and SYSTEM TABLE are commonly supported. +/// Used in the command member of FlightDescriptor for the following RPC calls: /// - GetSchema: return the Arrow schema of the query. /// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < +/// The returned Arrow schema will be: +/// < /// table_type: utf8 not null -/// > -/// The returned data should be ordered by table_type. +/// > +/// The returned data should be ordered by table_type. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetTableTypes { } /// -/// Represents a request to retrieve the primary keys of a table on a Flight SQL enabled backend. -/// Used in the command member of FlightDescriptor for the following RPC calls: +/// Represents a request to retrieve the primary keys of a table on a Flight SQL enabled backend. +/// Used in the command member of FlightDescriptor for the following RPC calls: /// - GetSchema: return the Arrow schema of the query. /// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < +/// The returned Arrow schema will be: +/// < /// catalog_name: utf8, /// db_schema_name: utf8, /// table_name: utf8 not null, /// column_name: utf8 not null, /// key_name: utf8, /// key_sequence: int not null -/// > -/// The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. 
+/// > +/// The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetPrimaryKeys { /// - /// Specifies the catalog to search for the table. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// Specifies the catalog to search for the table. + /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. #[prost(string, optional, tag="1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies the schema to search for the table. - /// An empty string retrieves those without a schema. - /// If omitted the schema name should not be used to narrow the search. + /// Specifies the schema to search for the table. + /// An empty string retrieves those without a schema. + /// If omitted the schema name should not be used to narrow the search. #[prost(string, optional, tag="2")] pub db_schema: ::core::option::Option<::prost::alloc::string::String>, - /// Specifies the table to get the primary keys for. + /// Specifies the table to get the primary keys for. #[prost(string, tag="3")] pub table: ::prost::alloc::string::String, } /// -/// Represents a request to retrieve a description of the foreign key columns that reference the given table's -/// primary key columns (the foreign keys exported by a table) of a table on a Flight SQL enabled backend. -/// Used in the command member of FlightDescriptor for the following RPC calls: +/// Represents a request to retrieve a description of the foreign key columns that reference the given table's +/// primary key columns (the foreign keys exported by a table) of a table on a Flight SQL enabled backend. +/// Used in the command member of FlightDescriptor for the following RPC calls: /// - GetSchema: return the Arrow schema of the query. /// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < +/// The returned Arrow schema will be: +/// < /// pk_catalog_name: utf8, /// pk_db_schema_name: utf8, /// pk_table_name: utf8 not null, @@ -208,35 +283,35 @@ pub struct CommandGetPrimaryKeys { /// pk_key_name: utf8, /// update_rule: uint1 not null, /// delete_rule: uint1 not null -/// > -/// The returned data should be ordered by fk_catalog_name, fk_db_schema_name, fk_table_name, fk_key_name, then key_sequence. -/// update_rule and delete_rule returns a byte that is equivalent to actions declared on UpdateDeleteRules enum. +/// > +/// The returned data should be ordered by fk_catalog_name, fk_db_schema_name, fk_table_name, fk_key_name, then key_sequence. +/// update_rule and delete_rule returns a byte that is equivalent to actions declared on UpdateDeleteRules enum. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetExportedKeys { /// - /// Specifies the catalog to search for the foreign key table. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// Specifies the catalog to search for the foreign key table. + /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. #[prost(string, optional, tag="1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies the schema to search for the foreign key table. 
- /// An empty string retrieves those without a schema. - /// If omitted the schema name should not be used to narrow the search. + /// Specifies the schema to search for the foreign key table. + /// An empty string retrieves those without a schema. + /// If omitted the schema name should not be used to narrow the search. #[prost(string, optional, tag="2")] pub db_schema: ::core::option::Option<::prost::alloc::string::String>, - /// Specifies the foreign key table to get the foreign keys for. + /// Specifies the foreign key table to get the foreign keys for. #[prost(string, tag="3")] pub table: ::prost::alloc::string::String, } /// -/// Represents a request to retrieve the foreign keys of a table on a Flight SQL enabled backend. -/// Used in the command member of FlightDescriptor for the following RPC calls: +/// Represents a request to retrieve the foreign keys of a table on a Flight SQL enabled backend. +/// Used in the command member of FlightDescriptor for the following RPC calls: /// - GetSchema: return the Arrow schema of the query. /// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < +/// The returned Arrow schema will be: +/// < /// pk_catalog_name: utf8, /// pk_db_schema_name: utf8, /// pk_table_name: utf8 not null, @@ -250,9 +325,9 @@ pub struct CommandGetExportedKeys { /// pk_key_name: utf8, /// update_rule: uint1 not null, /// delete_rule: uint1 not null -/// > -/// The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. -/// update_rule and delete_rule returns a byte that is equivalent to actions: +/// > +/// The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. +/// update_rule and delete_rule returns a byte that is equivalent to actions: /// - 0 = CASCADE /// - 1 = RESTRICT /// - 2 = SET NULL @@ -261,31 +336,31 @@ pub struct CommandGetExportedKeys { #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetImportedKeys { /// - /// Specifies the catalog to search for the primary key table. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// Specifies the catalog to search for the primary key table. + /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. #[prost(string, optional, tag="1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies the schema to search for the primary key table. - /// An empty string retrieves those without a schema. - /// If omitted the schema name should not be used to narrow the search. + /// Specifies the schema to search for the primary key table. + /// An empty string retrieves those without a schema. + /// If omitted the schema name should not be used to narrow the search. #[prost(string, optional, tag="2")] pub db_schema: ::core::option::Option<::prost::alloc::string::String>, - /// Specifies the primary key table to get the foreign keys for. + /// Specifies the primary key table to get the foreign keys for. 
#[prost(string, tag="3")] pub table: ::prost::alloc::string::String, } /// -/// Represents a request to retrieve a description of the foreign key columns in the given foreign key table that -/// reference the primary key or the columns representing a unique constraint of the parent table (could be the same -/// or a different table) on a Flight SQL enabled backend. -/// Used in the command member of FlightDescriptor for the following RPC calls: +/// Represents a request to retrieve a description of the foreign key columns in the given foreign key table that +/// reference the primary key or the columns representing a unique constraint of the parent table (could be the same +/// or a different table) on a Flight SQL enabled backend. +/// Used in the command member of FlightDescriptor for the following RPC calls: /// - GetSchema: return the Arrow schema of the query. /// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < +/// The returned Arrow schema will be: +/// < /// pk_catalog_name: utf8, /// pk_db_schema_name: utf8, /// pk_table_name: utf8 not null, @@ -299,9 +374,9 @@ pub struct CommandGetImportedKeys { /// pk_key_name: utf8, /// update_rule: uint1 not null, /// delete_rule: uint1 not null -/// > -/// The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. -/// update_rule and delete_rule returns a byte that is equivalent to actions: +/// > +/// The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. +/// update_rule and delete_rule returns a byte that is equivalent to actions: /// - 0 = CASCADE /// - 1 = RESTRICT /// - 2 = SET NULL @@ -310,697 +385,722 @@ pub struct CommandGetImportedKeys { #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetCrossReference { /// * - /// The catalog name where the parent table is. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// The catalog name where the parent table is. + /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. #[prost(string, optional, tag="1")] pub pk_catalog: ::core::option::Option<::prost::alloc::string::String>, /// * - /// The Schema name where the parent table is. - /// An empty string retrieves those without a schema. - /// If omitted the schema name should not be used to narrow the search. + /// The Schema name where the parent table is. + /// An empty string retrieves those without a schema. + /// If omitted the schema name should not be used to narrow the search. #[prost(string, optional, tag="2")] pub pk_db_schema: ::core::option::Option<::prost::alloc::string::String>, /// * - /// The parent table name. It cannot be null. + /// The parent table name. It cannot be null. #[prost(string, tag="3")] pub pk_table: ::prost::alloc::string::String, /// * - /// The catalog name where the foreign table is. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// The catalog name where the foreign table is. + /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. 
#[prost(string, optional, tag="4")] pub fk_catalog: ::core::option::Option<::prost::alloc::string::String>, /// * - /// The schema name where the foreign table is. - /// An empty string retrieves those without a schema. - /// If omitted the schema name should not be used to narrow the search. + /// The schema name where the foreign table is. + /// An empty string retrieves those without a schema. + /// If omitted the schema name should not be used to narrow the search. #[prost(string, optional, tag="5")] pub fk_db_schema: ::core::option::Option<::prost::alloc::string::String>, /// * - /// The foreign table name. It cannot be null. + /// The foreign table name. It cannot be null. #[prost(string, tag="6")] pub fk_table: ::prost::alloc::string::String, } -// SQL Execution Action Messages +// SQL Execution Action Messages /// -/// Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. +/// Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionCreatePreparedStatementRequest { - /// The valid SQL string to create a prepared statement for. + /// The valid SQL string to create a prepared statement for. #[prost(string, tag="1")] pub query: ::prost::alloc::string::String, } /// -/// Wrap the result of a "GetPreparedStatement" action. +/// Wrap the result of a "GetPreparedStatement" action. /// -/// The resultant PreparedStatement can be closed either: -/// - Manually, through the "ClosePreparedStatement" action; -/// - Automatically, by a server timeout. +/// The resultant PreparedStatement can be closed either: +/// - Manually, through the "ClosePreparedStatement" action; +/// - Automatically, by a server timeout. #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionCreatePreparedStatementResult { - /// Opaque handle for the prepared statement on the server. + /// Opaque handle for the prepared statement on the server. #[prost(bytes="vec", tag="1")] pub prepared_statement_handle: ::prost::alloc::vec::Vec, - /// If a result set generating query was provided, dataset_schema contains the - /// schema of the dataset as described in Schema.fbs::Schema, it is serialized as an IPC message. + /// If a result set generating query was provided, dataset_schema contains the + /// schema of the dataset as described in Schema.fbs::Schema, it is serialized as an IPC message. #[prost(bytes="vec", tag="2")] pub dataset_schema: ::prost::alloc::vec::Vec, - /// If the query provided contained parameters, parameter_schema contains the - /// schema of the expected parameters as described in Schema.fbs::Schema, it is serialized as an IPC message. + /// If the query provided contained parameters, parameter_schema contains the + /// schema of the expected parameters as described in Schema.fbs::Schema, it is serialized as an IPC message. #[prost(bytes="vec", tag="3")] pub parameter_schema: ::prost::alloc::vec::Vec, } /// -/// Request message for the "ClosePreparedStatement" action on a Flight SQL enabled backend. -/// Closes server resources associated with the prepared statement handle. +/// Request message for the "ClosePreparedStatement" action on a Flight SQL enabled backend. +/// Closes server resources associated with the prepared statement handle. #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionClosePreparedStatementRequest { - /// Opaque handle for the prepared statement on the server. + /// Opaque handle for the prepared statement on the server. 
#[prost(bytes="vec", tag="1")] pub prepared_statement_handle: ::prost::alloc::vec::Vec, } -// SQL Execution Messages. +// SQL Execution Messages. /// -/// Represents a SQL query. Used in the command member of FlightDescriptor -/// for the following RPC calls: +/// Represents a SQL query. Used in the command member of FlightDescriptor +/// for the following RPC calls: /// - GetSchema: return the Arrow schema of the query. +/// Fields on this schema may contain the following metadata: +/// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +/// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +/// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +/// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +/// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +/// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +/// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. /// - GetFlightInfo: execute the query. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandStatementQuery { - /// The SQL syntax. + /// The SQL syntax. #[prost(string, tag="1")] pub query: ::prost::alloc::string::String, } /// * -/// Represents a ticket resulting from GetFlightInfo with a CommandStatementQuery. -/// This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this. +/// Represents a ticket resulting from GetFlightInfo with a CommandStatementQuery. +/// This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this. #[derive(Clone, PartialEq, ::prost::Message)] pub struct TicketStatementQuery { - /// Unique identifier for the instance of the statement to execute. + /// Unique identifier for the instance of the statement to execute. #[prost(bytes="vec", tag="1")] pub statement_handle: ::prost::alloc::vec::Vec, } /// -/// Represents an instance of executing a prepared statement. Used in the command member of FlightDescriptor for -/// the following RPC calls: +/// Represents an instance of executing a prepared statement. Used in the command member of FlightDescriptor for +/// the following RPC calls: +/// - GetSchema: return the Arrow schema of the query. +/// Fields on this schema may contain the following metadata: +/// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +/// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +/// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +/// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +/// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +/// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +/// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. /// - DoPut: bind parameter values. 
All of the bound parameter sets will be executed as a single atomic execution. /// - GetFlightInfo: execute the prepared statement instance. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandPreparedStatementQuery { - /// Opaque handle for the prepared statement on the server. + /// Opaque handle for the prepared statement on the server. #[prost(bytes="vec", tag="1")] pub prepared_statement_handle: ::prost::alloc::vec::Vec, } /// -/// Represents a SQL update query. Used in the command member of FlightDescriptor -/// for the the RPC call DoPut to cause the server to execute the included SQL update. +/// Represents a SQL update query. Used in the command member of FlightDescriptor +/// for the the RPC call DoPut to cause the server to execute the included SQL update. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandStatementUpdate { - /// The SQL syntax. + /// The SQL syntax. #[prost(string, tag="1")] pub query: ::prost::alloc::string::String, } /// -/// Represents a SQL update query. Used in the command member of FlightDescriptor -/// for the the RPC call DoPut to cause the server to execute the included -/// prepared statement handle as an update. +/// Represents a SQL update query. Used in the command member of FlightDescriptor +/// for the the RPC call DoPut to cause the server to execute the included +/// prepared statement handle as an update. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandPreparedStatementUpdate { - /// Opaque handle for the prepared statement on the server. + /// Opaque handle for the prepared statement on the server. #[prost(bytes="vec", tag="1")] pub prepared_statement_handle: ::prost::alloc::vec::Vec, } /// -/// Returned from the RPC call DoPut when a CommandStatementUpdate -/// CommandPreparedStatementUpdate was in the request, containing -/// results from the update. +/// Returned from the RPC call DoPut when a CommandStatementUpdate +/// CommandPreparedStatementUpdate was in the request, containing +/// results from the update. #[derive(Clone, PartialEq, ::prost::Message)] pub struct DoPutUpdateResult { - /// The number of records updated. A return value of -1 represents - /// an unknown updated record count. + /// The number of records updated. A return value of -1 represents + /// an unknown updated record count. #[prost(int64, tag="1")] pub record_count: i64, } -/// Options for CommandGetSqlInfo. +/// Options for CommandGetSqlInfo. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlInfo { - // Server Information [0-500): Provides basic information about the Flight SQL Server. + // Server Information [0-500): Provides basic information about the Flight SQL Server. - /// Retrieves a UTF-8 string with the name of the Flight SQL Server. + /// Retrieves a UTF-8 string with the name of the Flight SQL Server. FlightSqlServerName = 0, - /// Retrieves a UTF-8 string with the native version of the Flight SQL Server. + /// Retrieves a UTF-8 string with the native version of the Flight SQL Server. FlightSqlServerVersion = 1, - /// Retrieves a UTF-8 string with the Arrow format version of the Flight SQL Server. + /// Retrieves a UTF-8 string with the Arrow format version of the Flight SQL Server. FlightSqlServerArrowVersion = 2, - /// - /// Retrieves a boolean value indicating whether the Flight SQL Server is read only. 
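// A hedged sketch of reading the update count described above: after a DoPut
// carrying a CommandStatementUpdate (or CommandPreparedStatementUpdate), the
// server's PutResult.app_metadata is expected to hold a serialized
// DoPutUpdateResult, where record_count == -1 means the count is unknown.
// Assumes the `flight-sql-experimental` feature and `prost` as a dependency.
use arrow_flight::sql::DoPutUpdateResult;
use arrow_flight::PutResult;
use prost::Message;

fn affected_rows(put_result: &PutResult) -> Result<Option<i64>, prost::DecodeError> {
    let update = DoPutUpdateResult::decode(put_result.app_metadata.as_slice())?;
    Ok(match update.record_count {
        -1 => None, // unknown update count
        n => Some(n),
    })
}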
/// - /// Returns: - /// - false: if read-write - /// - true: if read only + /// Retrieves a boolean value indicating whether the Flight SQL Server is read only. + /// + /// Returns: + /// - false: if read-write + /// - true: if read only FlightSqlServerReadOnly = 3, - // SQL Syntax Information [500-1000): provides information about SQL syntax supported by the Flight SQL Server. + // SQL Syntax Information [500-1000): provides information about SQL syntax supported by the Flight SQL Server. /// - /// Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of catalogs. + /// Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of catalogs. /// - /// Returns: - /// - false: if it doesn't support CREATE and DROP of catalogs. - /// - true: if it supports CREATE and DROP of catalogs. + /// Returns: + /// - false: if it doesn't support CREATE and DROP of catalogs. + /// - true: if it supports CREATE and DROP of catalogs. SqlDdlCatalog = 500, /// - /// Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of schemas. + /// Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of schemas. /// - /// Returns: - /// - false: if it doesn't support CREATE and DROP of schemas. - /// - true: if it supports CREATE and DROP of schemas. + /// Returns: + /// - false: if it doesn't support CREATE and DROP of schemas. + /// - true: if it supports CREATE and DROP of schemas. SqlDdlSchema = 501, /// - /// Indicates whether the Flight SQL Server supports CREATE and DROP of tables. + /// Indicates whether the Flight SQL Server supports CREATE and DROP of tables. /// - /// Returns: - /// - false: if it doesn't support CREATE and DROP of tables. - /// - true: if it supports CREATE and DROP of tables. + /// Returns: + /// - false: if it doesn't support CREATE and DROP of tables. + /// - true: if it supports CREATE and DROP of tables. SqlDdlTable = 502, /// - /// Retrieves a uint32 value representing the enu uint32 ordinal for the case sensitivity of catalog, table, schema and table names. + /// Retrieves a int32 ordinal representing the case sensitivity of catalog, table, schema and table names. /// - /// The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. + /// The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. SqlIdentifierCase = 503, - /// Retrieves a UTF-8 string with the supported character(s) used to surround a delimited identifier. + /// Retrieves a UTF-8 string with the supported character(s) used to surround a delimited identifier. SqlIdentifierQuoteChar = 504, /// - /// Retrieves a uint32 value representing the enu uint32 ordinal for the case sensitivity of quoted identifiers. + /// Retrieves a int32 describing the case sensitivity of quoted identifiers. /// - /// The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. + /// The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. SqlQuotedIdentifierCase = 505, /// - /// Retrieves a boolean value indicating whether all tables are selectable. + /// Retrieves a boolean value indicating whether all tables are selectable. /// - /// Returns: - /// - false: if not all tables are selectable or if none are; - /// - true: if all tables are selectable. + /// Returns: + /// - false: if not all tables are selectable or if none are; + /// - true: if all tables are selectable. 
SqlAllTablesAreSelectable = 506, /// - /// Retrieves the null ordering. + /// Retrieves the null ordering. /// - /// Returns a uint32 ordinal for the null ordering being used, as described in - /// `arrow.flight.protocol.sql.SqlNullOrdering`. + /// Returns a int32 ordinal for the null ordering being used, as described in + /// `arrow.flight.protocol.sql.SqlNullOrdering`. SqlNullOrdering = 507, - /// Retrieves a UTF-8 string list with values of the supported keywords. + /// Retrieves a UTF-8 string list with values of the supported keywords. SqlKeywords = 508, - /// Retrieves a UTF-8 string list with values of the supported numeric functions. + /// Retrieves a UTF-8 string list with values of the supported numeric functions. SqlNumericFunctions = 509, - /// Retrieves a UTF-8 string list with values of the supported string functions. + /// Retrieves a UTF-8 string list with values of the supported string functions. SqlStringFunctions = 510, - /// Retrieves a UTF-8 string list with values of the supported system functions. + /// Retrieves a UTF-8 string list with values of the supported system functions. SqlSystemFunctions = 511, - /// Retrieves a UTF-8 string list with values of the supported datetime functions. + /// Retrieves a UTF-8 string list with values of the supported datetime functions. SqlDatetimeFunctions = 512, /// - /// Retrieves the UTF-8 string that can be used to escape wildcard characters. - /// This is the string that can be used to escape '_' or '%' in the catalog search parameters that are a pattern - /// (and therefore use one of the wildcard characters). - /// The '_' character represents any single character; the '%' character represents any sequence of zero or more - /// characters. + /// Retrieves the UTF-8 string that can be used to escape wildcard characters. + /// This is the string that can be used to escape '_' or '%' in the catalog search parameters that are a pattern + /// (and therefore use one of the wildcard characters). + /// The '_' character represents any single character; the '%' character represents any sequence of zero or more + /// characters. SqlSearchStringEscape = 513, /// - /// Retrieves a UTF-8 string with all the "extra" characters that can be used in unquoted identifier names - /// (those beyond a-z, A-Z, 0-9 and _). + /// Retrieves a UTF-8 string with all the "extra" characters that can be used in unquoted identifier names + /// (those beyond a-z, A-Z, 0-9 and _). SqlExtraNameCharacters = 514, /// - /// Retrieves a boolean value indicating whether column aliasing is supported. - /// If so, the SQL AS clause can be used to provide names for computed columns or to provide alias names for columns - /// as required. + /// Retrieves a boolean value indicating whether column aliasing is supported. + /// If so, the SQL AS clause can be used to provide names for computed columns or to provide alias names for columns + /// as required. /// - /// Returns: - /// - false: if column aliasing is unsupported; - /// - true: if column aliasing is supported. + /// Returns: + /// - false: if column aliasing is unsupported; + /// - true: if column aliasing is supported. SqlSupportsColumnAliasing = 515, /// - /// Retrieves a boolean value indicating whether concatenations between null and non-null values being - /// null are supported. + /// Retrieves a boolean value indicating whether concatenations between null and non-null values being + /// null are supported. 
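SqlSearchStringEscape above returns the escape string for the '_' and '%' wildcards used in pattern-valued search parameters. The following is a small sketch of escaping a literal value before embedding it in such a pattern; the helper name is illustrative, and it assumes a single-character escape string (the common case):

```rust
/// Escapes the LIKE wildcard characters '_' and '%' (and the escape string
/// itself) in a literal value, using the escape string the server reports
/// via SqlSearchStringEscape. Assumes a single-character escape string.
fn escape_search_literal(literal: &str, escape: &str) -> String {
    let mut out = String::with_capacity(literal.len());
    for ch in literal.chars() {
        if ch == '_' || ch == '%' || escape.chars().next() == Some(ch) {
            out.push_str(escape);
        }
        out.push(ch);
    }
    out
}

fn main() {
    // With the commonly used "\" escape string, "100%_done" becomes "100\%\_done".
    assert_eq!(escape_search_literal("100%_done", "\\"), "100\\%\\_done");
}
```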
/// - /// - Returns: - /// - false: if concatenations between null and non-null values being null are unsupported; - /// - true: if concatenations between null and non-null values being null are supported. + /// - Returns: + /// - false: if concatenations between null and non-null values being null are unsupported; + /// - true: if concatenations between null and non-null values being null are supported. SqlNullPlusNullIsNull = 516, /// - /// Retrieves a map where the key is the type to convert from and the value is a list with the types to convert to, - /// indicating the supported conversions. Each key and each item on the list value is a value to a predefined type on - /// SqlSupportsConvert enum. - /// The returned map will be: map> + /// Retrieves a map where the key is the type to convert from and the value is a list with the types to convert to, + /// indicating the supported conversions. Each key and each item on the list value is a value to a predefined type on + /// SqlSupportsConvert enum. + /// The returned map will be: map> SqlSupportsConvert = 517, /// - /// Retrieves a boolean value indicating whether, when table correlation names are supported, - /// they are restricted to being different from the names of the tables. + /// Retrieves a boolean value indicating whether, when table correlation names are supported, + /// they are restricted to being different from the names of the tables. /// - /// Returns: - /// - false: if table correlation names are unsupported; - /// - true: if table correlation names are supported. + /// Returns: + /// - false: if table correlation names are unsupported; + /// - true: if table correlation names are supported. SqlSupportsTableCorrelationNames = 518, /// - /// Retrieves a boolean value indicating whether, when table correlation names are supported, - /// they are restricted to being different from the names of the tables. + /// Retrieves a boolean value indicating whether, when table correlation names are supported, + /// they are restricted to being different from the names of the tables. /// - /// Returns: - /// - false: if different table correlation names are unsupported; - /// - true: if different table correlation names are supported + /// Returns: + /// - false: if different table correlation names are unsupported; + /// - true: if different table correlation names are supported SqlSupportsDifferentTableCorrelationNames = 519, /// - /// Retrieves a boolean value indicating whether expressions in ORDER BY lists are supported. + /// Retrieves a boolean value indicating whether expressions in ORDER BY lists are supported. /// - /// Returns: - /// - false: if expressions in ORDER BY are unsupported; - /// - true: if expressions in ORDER BY are supported; + /// Returns: + /// - false: if expressions in ORDER BY are unsupported; + /// - true: if expressions in ORDER BY are supported; SqlSupportsExpressionsInOrderBy = 520, /// - /// Retrieves a boolean value indicating whether using a column that is not in the SELECT statement in a GROUP BY - /// clause is supported. + /// Retrieves a boolean value indicating whether using a column that is not in the SELECT statement in a GROUP BY + /// clause is supported. /// - /// Returns: - /// - false: if using a column that is not in the SELECT statement in a GROUP BY clause is unsupported; - /// - true: if using a column that is not in the SELECT statement in a GROUP BY clause is supported. 
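SqlSupportsConvert above is delivered as a map<int32, list<int32>>, where both the keys and the list entries are ordinals of the SqlSupportsConvert enum. A minimal sketch of how a client might hold that map once decoded and ask whether a particular conversion is supported; the helper and the concrete ordinals are illustrative, not part of the generated code:

```rust
use std::collections::HashMap;

/// Decoded form of the SqlSupportsConvert info value: map<int32, list<int32>>,
/// where each ordinal refers to a SqlSupportsConvert enum value.
type ConvertMap = HashMap<i32, Vec<i32>>;

/// Returns true if the server reports that `from` can be converted to `to`.
fn supports_convert(map: &ConvertMap, from: i32, to: i32) -> bool {
    map.get(&from).map(|targets| targets.contains(&to)).unwrap_or(false)
}

fn main() {
    // Hypothetical server response: type ordinal 0 can be converted to 1 and 2.
    let mut map = ConvertMap::new();
    map.insert(0, vec![1, 2]);
    assert!(supports_convert(&map, 0, 2));
    assert!(!supports_convert(&map, 1, 0));
}
```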
+ /// Returns: + /// - false: if using a column that is not in the SELECT statement in a GROUP BY clause is unsupported; + /// - true: if using a column that is not in the SELECT statement in a GROUP BY clause is supported. SqlSupportsOrderByUnrelated = 521, /// - /// Retrieves the supported GROUP BY commands; + /// Retrieves the supported GROUP BY commands; /// - /// Returns an int32 bitmask value representing the supported commands. - /// The returned bitmask should be parsed in order to retrieve the supported commands. + /// Returns an int32 bitmask value representing the supported commands. + /// The returned bitmask should be parsed in order to retrieve the supported commands. /// - /// For instance: - /// - return 0 (\b0) => [] (GROUP BY is unsupported); - /// - return 1 (\b1) => \[SQL_GROUP_BY_UNRELATED\]; - /// - return 2 (\b10) => \[SQL_GROUP_BY_BEYOND_SELECT\]; - /// - return 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]. - /// Valid GROUP BY types are described under `arrow.flight.protocol.sql.SqlSupportedGroupBy`. + /// For instance: + /// - return 0 (\b0) => [] (GROUP BY is unsupported); + /// - return 1 (\b1) => \[SQL_GROUP_BY_UNRELATED\]; + /// - return 2 (\b10) => \[SQL_GROUP_BY_BEYOND_SELECT\]; + /// - return 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]. + /// Valid GROUP BY types are described under `arrow.flight.protocol.sql.SqlSupportedGroupBy`. SqlSupportedGroupBy = 522, /// - /// Retrieves a boolean value indicating whether specifying a LIKE escape clause is supported. + /// Retrieves a boolean value indicating whether specifying a LIKE escape clause is supported. /// - /// Returns: - /// - false: if specifying a LIKE escape clause is unsupported; - /// - true: if specifying a LIKE escape clause is supported. + /// Returns: + /// - false: if specifying a LIKE escape clause is unsupported; + /// - true: if specifying a LIKE escape clause is supported. SqlSupportsLikeEscapeClause = 523, /// - /// Retrieves a boolean value indicating whether columns may be defined as non-nullable. + /// Retrieves a boolean value indicating whether columns may be defined as non-nullable. /// - /// Returns: - /// - false: if columns cannot be defined as non-nullable; - /// - true: if columns may be defined as non-nullable. + /// Returns: + /// - false: if columns cannot be defined as non-nullable; + /// - true: if columns may be defined as non-nullable. SqlSupportsNonNullableColumns = 524, /// - /// Retrieves the supported SQL grammar level as per the ODBC specification. - /// - /// Returns an int32 bitmask value representing the supported SQL grammar level. - /// The returned bitmask should be parsed in order to retrieve the supported grammar levels. - /// - /// For instance: - /// - return 0 (\b0) => [] (SQL grammar is unsupported); - /// - return 1 (\b1) => \[SQL_MINIMUM_GRAMMAR\]; - /// - return 2 (\b10) => \[SQL_CORE_GRAMMAR\]; - /// - return 3 (\b11) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR]; - /// - return 4 (\b100) => \[SQL_EXTENDED_GRAMMAR\]; - /// - return 5 (\b101) => [SQL_MINIMUM_GRAMMAR, SQL_EXTENDED_GRAMMAR]; - /// - return 6 (\b110) => [SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]; - /// - return 7 (\b111) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]. - /// Valid SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedSqlGrammar`. + /// Retrieves the supported SQL grammar level as per the ODBC specification. + /// + /// Returns an int32 bitmask value representing the supported SQL grammar level. 
+ /// The returned bitmask should be parsed in order to retrieve the supported grammar levels. + /// + /// For instance: + /// - return 0 (\b0) => [] (SQL grammar is unsupported); + /// - return 1 (\b1) => \[SQL_MINIMUM_GRAMMAR\]; + /// - return 2 (\b10) => \[SQL_CORE_GRAMMAR\]; + /// - return 3 (\b11) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR]; + /// - return 4 (\b100) => \[SQL_EXTENDED_GRAMMAR\]; + /// - return 5 (\b101) => [SQL_MINIMUM_GRAMMAR, SQL_EXTENDED_GRAMMAR]; + /// - return 6 (\b110) => [SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]; + /// - return 7 (\b111) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]. + /// Valid SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedSqlGrammar`. SqlSupportedGrammar = 525, /// - /// Retrieves the supported ANSI92 SQL grammar level. - /// - /// Returns an int32 bitmask value representing the supported ANSI92 SQL grammar level. - /// The returned bitmask should be parsed in order to retrieve the supported commands. - /// - /// For instance: - /// - return 0 (\b0) => [] (ANSI92 SQL grammar is unsupported); - /// - return 1 (\b1) => \[ANSI92_ENTRY_SQL\]; - /// - return 2 (\b10) => \[ANSI92_INTERMEDIATE_SQL\]; - /// - return 3 (\b11) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL]; - /// - return 4 (\b100) => \[ANSI92_FULL_SQL\]; - /// - return 5 (\b101) => [ANSI92_ENTRY_SQL, ANSI92_FULL_SQL]; - /// - return 6 (\b110) => [ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]; - /// - return 7 (\b111) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]. - /// Valid ANSI92 SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedAnsi92SqlGrammarLevel`. + /// Retrieves the supported ANSI92 SQL grammar level. + /// + /// Returns an int32 bitmask value representing the supported ANSI92 SQL grammar level. + /// The returned bitmask should be parsed in order to retrieve the supported commands. + /// + /// For instance: + /// - return 0 (\b0) => [] (ANSI92 SQL grammar is unsupported); + /// - return 1 (\b1) => \[ANSI92_ENTRY_SQL\]; + /// - return 2 (\b10) => \[ANSI92_INTERMEDIATE_SQL\]; + /// - return 3 (\b11) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL]; + /// - return 4 (\b100) => \[ANSI92_FULL_SQL\]; + /// - return 5 (\b101) => [ANSI92_ENTRY_SQL, ANSI92_FULL_SQL]; + /// - return 6 (\b110) => [ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]; + /// - return 7 (\b111) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]. + /// Valid ANSI92 SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedAnsi92SqlGrammarLevel`. SqlAnsi92SupportedLevel = 526, /// - /// Retrieves a boolean value indicating whether the SQL Integrity Enhancement Facility is supported. + /// Retrieves a boolean value indicating whether the SQL Integrity Enhancement Facility is supported. /// - /// Returns: - /// - false: if the SQL Integrity Enhancement Facility is supported; - /// - true: if the SQL Integrity Enhancement Facility is supported. + /// Returns: + /// - false: if the SQL Integrity Enhancement Facility is supported; + /// - true: if the SQL Integrity Enhancement Facility is supported. SqlSupportsIntegrityEnhancementFacility = 527, /// - /// Retrieves the support level for SQL OUTER JOINs. + /// Retrieves the support level for SQL OUTER JOINs. /// - /// Returns a uint3 uint32 ordinal for the SQL ordering being used, as described in - /// `arrow.flight.protocol.sql.SqlOuterJoinsSupportLevel`. 
+ /// Returns a int32 ordinal for the SQL ordering being used, as described in + /// `arrow.flight.protocol.sql.SqlOuterJoinsSupportLevel`. SqlOuterJoinsSupportLevel = 528, - /// Retrieves a UTF-8 string with the preferred term for "schema". + /// Retrieves a UTF-8 string with the preferred term for "schema". SqlSchemaTerm = 529, - /// Retrieves a UTF-8 string with the preferred term for "procedure". + /// Retrieves a UTF-8 string with the preferred term for "procedure". SqlProcedureTerm = 530, - /// Retrieves a UTF-8 string with the preferred term for "catalog". + /// + /// Retrieves a UTF-8 string with the preferred term for "catalog". + /// If a empty string is returned its assumed that the server does NOT supports catalogs. SqlCatalogTerm = 531, /// - /// Retrieves a boolean value indicating whether a catalog appears at the start of a fully qualified table name. + /// Retrieves a boolean value indicating whether a catalog appears at the start of a fully qualified table name. /// - /// - false: if a catalog does not appear at the start of a fully qualified table name; - /// - true: if a catalog appears at the start of a fully qualified table name. + /// - false: if a catalog does not appear at the start of a fully qualified table name; + /// - true: if a catalog appears at the start of a fully qualified table name. SqlCatalogAtStart = 532, /// - /// Retrieves the supported actions for a SQL schema. - /// - /// Returns an int32 bitmask value representing the supported actions for a SQL schema. - /// The returned bitmask should be parsed in order to retrieve the supported actions for a SQL schema. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported actions for SQL schema); - /// - return 1 (\b1) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS\]; - /// - return 2 (\b10) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; - /// - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; - /// - return 4 (\b100) => \[SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; - /// - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. - /// Valid actions for a SQL schema described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. + /// Retrieves the supported actions for a SQL schema. + /// + /// Returns an int32 bitmask value representing the supported actions for a SQL schema. + /// The returned bitmask should be parsed in order to retrieve the supported actions for a SQL schema. + /// + /// For instance: + /// - return 0 (\b0) => [] (no supported actions for SQL schema); + /// - return 1 (\b1) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS\]; + /// - return 2 (\b10) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; + /// - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + /// - return 4 (\b100) => \[SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; + /// - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + /// - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + /// - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. + /// Valid actions for a SQL schema described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. 
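SqlCatalogAtStart above only says where the catalog component belongs in a fully qualified table name. Here is a small sketch of applying it when rendering names; it assumes a '.' separator and that a false value means the catalog goes at the end (mirroring the JDBC convention this info is modeled on), both of which are assumptions made for illustration:

```rust
/// Builds a fully qualified table name, honoring the server-reported
/// SqlCatalogAtStart flag. The '.' separator and the "catalog at the end"
/// behavior for `false` are illustrative assumptions.
fn qualified_name(catalog: &str, schema: &str, table: &str, catalog_at_start: bool) -> String {
    if catalog_at_start {
        format!("{}.{}.{}", catalog, schema, table)
    } else {
        format!("{}.{}.{}", schema, table, catalog)
    }
}

fn main() {
    assert_eq!(qualified_name("cat", "public", "t", true), "cat.public.t");
    assert_eq!(qualified_name("cat", "public", "t", false), "public.t.cat");
}
```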
SqlSchemasSupportedActions = 533, /// - /// Retrieves the supported actions for a SQL schema. - /// - /// Returns an int32 bitmask value representing the supported actions for a SQL catalog. - /// The returned bitmask should be parsed in order to retrieve the supported actions for a SQL catalog. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported actions for SQL catalog); - /// - return 1 (\b1) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS\]; - /// - return 2 (\b10) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; - /// - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; - /// - return 4 (\b100) => \[SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; - /// - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. - /// Valid actions for a SQL catalog are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. + /// Retrieves the supported actions for a SQL schema. + /// + /// Returns an int32 bitmask value representing the supported actions for a SQL catalog. + /// The returned bitmask should be parsed in order to retrieve the supported actions for a SQL catalog. + /// + /// For instance: + /// - return 0 (\b0) => [] (no supported actions for SQL catalog); + /// - return 1 (\b1) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS\]; + /// - return 2 (\b10) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; + /// - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + /// - return 4 (\b100) => \[SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; + /// - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + /// - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + /// - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. + /// Valid actions for a SQL catalog are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. SqlCatalogsSupportedActions = 534, /// - /// Retrieves the supported SQL positioned commands. + /// Retrieves the supported SQL positioned commands. /// - /// Returns an int32 bitmask value representing the supported SQL positioned commands. - /// The returned bitmask should be parsed in order to retrieve the supported SQL positioned commands. + /// Returns an int32 bitmask value representing the supported SQL positioned commands. + /// The returned bitmask should be parsed in order to retrieve the supported SQL positioned commands. /// - /// For instance: - /// - return 0 (\b0) => [] (no supported SQL positioned commands); - /// - return 1 (\b1) => \[SQL_POSITIONED_DELETE\]; - /// - return 2 (\b10) => \[SQL_POSITIONED_UPDATE\]; - /// - return 3 (\b11) => [SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE]. - /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedPositionedCommands`. + /// For instance: + /// - return 0 (\b0) => [] (no supported SQL positioned commands); + /// - return 1 (\b1) => \[SQL_POSITIONED_DELETE\]; + /// - return 2 (\b10) => \[SQL_POSITIONED_UPDATE\]; + /// - return 3 (\b11) => [SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE]. + /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedPositionedCommands`. 
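Several of the infos in this enum (SqlSupportedGroupBy, SqlSupportedGrammar, SqlSupportedPositionedCommands, and the others documented as int32 bitmasks) encode a set of enum ordinals as bits, exactly as the "For instance" tables above spell out. A minimal decoding sketch; the helper is illustrative and simply turns bit N into ordinal N:

```rust
/// Expands an int32 bitmask info value into the enum ordinals it encodes.
/// Bit N set means "ordinal N is supported", matching the examples in the
/// comments above (e.g. 3 == 0b11 => ordinals [0, 1]).
fn decode_bitmask(mask: i32) -> Vec<i32> {
    (0..31).filter(|bit| mask & (1 << bit) != 0).collect()
}

fn main() {
    // For SqlSupportedGroupBy, 3 (0b11) would mean
    // [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT].
    assert_eq!(decode_bitmask(3), vec![0, 1]);
    assert_eq!(decode_bitmask(0), Vec::<i32>::new());
}
```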
SqlSupportedPositionedCommands = 535, /// - /// Retrieves a boolean value indicating whether SELECT FOR UPDATE statements are supported. + /// Retrieves a boolean value indicating whether SELECT FOR UPDATE statements are supported. /// - /// Returns: - /// - false: if SELECT FOR UPDATE statements are unsupported; - /// - true: if SELECT FOR UPDATE statements are supported. + /// Returns: + /// - false: if SELECT FOR UPDATE statements are unsupported; + /// - true: if SELECT FOR UPDATE statements are supported. SqlSelectForUpdateSupported = 536, /// - /// Retrieves a boolean value indicating whether stored procedure calls that use the stored procedure escape syntax - /// are supported. + /// Retrieves a boolean value indicating whether stored procedure calls that use the stored procedure escape syntax + /// are supported. /// - /// Returns: - /// - false: if stored procedure calls that use the stored procedure escape syntax are unsupported; - /// - true: if stored procedure calls that use the stored procedure escape syntax are supported. + /// Returns: + /// - false: if stored procedure calls that use the stored procedure escape syntax are unsupported; + /// - true: if stored procedure calls that use the stored procedure escape syntax are supported. SqlStoredProceduresSupported = 537, /// - /// Retrieves the supported SQL subqueries. - /// - /// Returns an int32 bitmask value representing the supported SQL subqueries. - /// The returned bitmask should be parsed in order to retrieve the supported SQL subqueries. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported SQL subqueries); - /// - return 1 (\b1) => \[SQL_SUBQUERIES_IN_COMPARISONS\]; - /// - return 2 (\b10) => \[SQL_SUBQUERIES_IN_EXISTS\]; - /// - return 3 (\b11) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS]; - /// - return 4 (\b100) => \[SQL_SUBQUERIES_IN_INS\]; - /// - return 5 (\b101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS]; - /// - return 6 (\b110) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_EXISTS]; - /// - return 7 (\b111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS]; - /// - return 8 (\b1000) => \[SQL_SUBQUERIES_IN_QUANTIFIEDS\]; - /// - return 9 (\b1001) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 10 (\b1010) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 11 (\b1011) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 12 (\b1100) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 13 (\b1101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 14 (\b1110) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 15 (\b1111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - ... - /// Valid SQL subqueries are described under `arrow.flight.protocol.sql.SqlSupportedSubqueries`. + /// Retrieves the supported SQL subqueries. + /// + /// Returns an int32 bitmask value representing the supported SQL subqueries. + /// The returned bitmask should be parsed in order to retrieve the supported SQL subqueries. 
+ /// + /// For instance: + /// - return 0 (\b0) => [] (no supported SQL subqueries); + /// - return 1 (\b1) => \[SQL_SUBQUERIES_IN_COMPARISONS\]; + /// - return 2 (\b10) => \[SQL_SUBQUERIES_IN_EXISTS\]; + /// - return 3 (\b11) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS]; + /// - return 4 (\b100) => \[SQL_SUBQUERIES_IN_INS\]; + /// - return 5 (\b101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS]; + /// - return 6 (\b110) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_EXISTS]; + /// - return 7 (\b111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS]; + /// - return 8 (\b1000) => \[SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 9 (\b1001) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 10 (\b1010) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 11 (\b1011) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 12 (\b1100) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 13 (\b1101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 14 (\b1110) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 15 (\b1111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - ... + /// Valid SQL subqueries are described under `arrow.flight.protocol.sql.SqlSupportedSubqueries`. SqlSupportedSubqueries = 538, /// - /// Retrieves a boolean value indicating whether correlated subqueries are supported. + /// Retrieves a boolean value indicating whether correlated subqueries are supported. /// - /// Returns: - /// - false: if correlated subqueries are unsupported; - /// - true: if correlated subqueries are supported. + /// Returns: + /// - false: if correlated subqueries are unsupported; + /// - true: if correlated subqueries are supported. SqlCorrelatedSubqueriesSupported = 539, /// - /// Retrieves the supported SQL UNIONs. + /// Retrieves the supported SQL UNIONs. /// - /// Returns an int32 bitmask value representing the supported SQL UNIONs. - /// The returned bitmask should be parsed in order to retrieve the supported SQL UNIONs. + /// Returns an int32 bitmask value representing the supported SQL UNIONs. + /// The returned bitmask should be parsed in order to retrieve the supported SQL UNIONs. /// - /// For instance: - /// - return 0 (\b0) => [] (no supported SQL positioned commands); - /// - return 1 (\b1) => \[SQL_UNION\]; - /// - return 2 (\b10) => \[SQL_UNION_ALL\]; - /// - return 3 (\b11) => [SQL_UNION, SQL_UNION_ALL]. - /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedUnions`. + /// For instance: + /// - return 0 (\b0) => [] (no supported SQL positioned commands); + /// - return 1 (\b1) => \[SQL_UNION\]; + /// - return 2 (\b10) => \[SQL_UNION_ALL\]; + /// - return 3 (\b11) => [SQL_UNION, SQL_UNION_ALL]. + /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedUnions`. SqlSupportedUnions = 540, - /// Retrieves a uint32 value representing the maximum number of hex characters allowed in an inline binary literal. + /// Retrieves a int64 value representing the maximum number of hex characters allowed in an inline binary literal. 
SqlMaxBinaryLiteralLength = 541, - /// Retrieves a uint32 value representing the maximum number of characters allowed for a character literal. + /// Retrieves a int64 value representing the maximum number of characters allowed for a character literal. SqlMaxCharLiteralLength = 542, - /// Retrieves a uint32 value representing the maximum number of characters allowed for a column name. + /// Retrieves a int64 value representing the maximum number of characters allowed for a column name. SqlMaxColumnNameLength = 543, - /// Retrieves a uint32 value representing the the maximum number of columns allowed in a GROUP BY clause. + /// Retrieves a int64 value representing the the maximum number of columns allowed in a GROUP BY clause. SqlMaxColumnsInGroupBy = 544, - /// Retrieves a uint32 value representing the maximum number of columns allowed in an index. + /// Retrieves a int64 value representing the maximum number of columns allowed in an index. SqlMaxColumnsInIndex = 545, - /// Retrieves a uint32 value representing the maximum number of columns allowed in an ORDER BY clause. + /// Retrieves a int64 value representing the maximum number of columns allowed in an ORDER BY clause. SqlMaxColumnsInOrderBy = 546, - /// Retrieves a uint32 value representing the maximum number of columns allowed in a SELECT list. + /// Retrieves a int64 value representing the maximum number of columns allowed in a SELECT list. SqlMaxColumnsInSelect = 547, - /// Retrieves a uint32 value representing the maximum number of columns allowed in a table. + /// Retrieves a int64 value representing the maximum number of columns allowed in a table. SqlMaxColumnsInTable = 548, - /// Retrieves a uint32 value representing the maximum number of concurrent connections possible. + /// Retrieves a int64 value representing the maximum number of concurrent connections possible. SqlMaxConnections = 549, - /// Retrieves a uint32 value the maximum number of characters allowed in a cursor name. + /// Retrieves a int64 value the maximum number of characters allowed in a cursor name. SqlMaxCursorNameLength = 550, /// - /// Retrieves a uint32 value representing the maximum number of bytes allowed for an index, - /// including all of the parts of the index. + /// Retrieves a int64 value representing the maximum number of bytes allowed for an index, + /// including all of the parts of the index. SqlMaxIndexLength = 551, - /// Retrieves a uint32 value representing the maximum number of characters allowed in a schema name. + /// Retrieves a int64 value representing the maximum number of characters allowed in a schema name. SqlDbSchemaNameLength = 552, - /// Retrieves a uint32 value representing the maximum number of characters allowed in a procedure name. + /// Retrieves a int64 value representing the maximum number of characters allowed in a procedure name. SqlMaxProcedureNameLength = 553, - /// Retrieves a uint32 value representing the maximum number of characters allowed in a catalog name. + /// Retrieves a int64 value representing the maximum number of characters allowed in a catalog name. SqlMaxCatalogNameLength = 554, - /// Retrieves a uint32 value representing the maximum number of bytes allowed in a single row. + /// Retrieves a int64 value representing the maximum number of bytes allowed in a single row. SqlMaxRowSize = 555, /// - /// Retrieves a boolean indicating whether the return value for the JDBC method getMaxRowSize includes the SQL - /// data types LONGVARCHAR and LONGVARBINARY. 
+ /// Retrieves a boolean indicating whether the return value for the JDBC method getMaxRowSize includes the SQL + /// data types LONGVARCHAR and LONGVARBINARY. /// - /// Returns: - /// - false: if return value for the JDBC method getMaxRowSize does + /// Returns: + /// - false: if return value for the JDBC method getMaxRowSize does /// not include the SQL data types LONGVARCHAR and LONGVARBINARY; - /// - true: if return value for the JDBC method getMaxRowSize includes + /// - true: if return value for the JDBC method getMaxRowSize includes /// the SQL data types LONGVARCHAR and LONGVARBINARY. SqlMaxRowSizeIncludesBlobs = 556, /// - /// Retrieves a uint32 value representing the maximum number of characters allowed for an SQL statement; - /// a result of 0 (zero) means that there is no limit or the limit is not known. + /// Retrieves a int64 value representing the maximum number of characters allowed for an SQL statement; + /// a result of 0 (zero) means that there is no limit or the limit is not known. SqlMaxStatementLength = 557, - /// Retrieves a uint32 value representing the maximum number of active statements that can be open at the same time. + /// Retrieves a int64 value representing the maximum number of active statements that can be open at the same time. SqlMaxStatements = 558, - /// Retrieves a uint32 value representing the maximum number of characters allowed in a table name. + /// Retrieves a int64 value representing the maximum number of characters allowed in a table name. SqlMaxTableNameLength = 559, - /// Retrieves a uint32 value representing the maximum number of tables allowed in a SELECT statement. + /// Retrieves a int64 value representing the maximum number of tables allowed in a SELECT statement. SqlMaxTablesInSelect = 560, - /// Retrieves a uint32 value representing the maximum number of characters allowed in a user name. + /// Retrieves a int64 value representing the maximum number of characters allowed in a user name. SqlMaxUsernameLength = 561, /// - /// Retrieves this database's default transaction isolation level as described in - /// `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. + /// Retrieves this database's default transaction isolation level as described in + /// `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. /// - /// Returns a uint32 ordinal for the SQL transaction isolation level. + /// Returns a int32 ordinal for the SQL transaction isolation level. SqlDefaultTransactionIsolation = 562, /// - /// Retrieves a boolean value indicating whether transactions are supported. If not, invoking the method commit is a - /// noop, and the isolation level is `arrow.flight.protocol.sql.SqlTransactionIsolationLevel.TRANSACTION_NONE`. + /// Retrieves a boolean value indicating whether transactions are supported. If not, invoking the method commit is a + /// noop, and the isolation level is `arrow.flight.protocol.sql.SqlTransactionIsolationLevel.TRANSACTION_NONE`. /// - /// Returns: - /// - false: if transactions are unsupported; - /// - true: if transactions are supported. + /// Returns: + /// - false: if transactions are unsupported; + /// - true: if transactions are supported. SqlTransactionsSupported = 563, /// - /// Retrieves the supported transactions isolation levels. - /// - /// Returns an int32 bitmask value representing the supported transactions isolation levels. - /// The returned bitmask should be parsed in order to retrieve the supported transactions isolation levels. 
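The comment on SqlMaxStatementLength above makes the encoding of "no limit" explicit: a value of 0 means there is no limit or the limit is unknown. A tiny sketch of normalizing such a value on the client side; whether every other SqlMax* info follows the same convention is an assumption here:

```rust
/// Normalizes a SqlMax* info value, where 0 is documented (for
/// SqlMaxStatementLength) to mean "no limit, or the limit is not known".
fn max_limit(value: i64) -> Option<i64> {
    if value > 0 { Some(value) } else { None }
}

fn main() {
    assert_eq!(max_limit(0), None); // no limit, or unknown
    assert_eq!(max_limit(65_535), Some(65_535));
}
```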
- /// - /// For instance: - /// - return 0 (\b0) => [] (no supported SQL transactions isolation levels); - /// - return 1 (\b1) => \[SQL_TRANSACTION_NONE\]; - /// - return 2 (\b10) => \[SQL_TRANSACTION_READ_UNCOMMITTED\]; - /// - return 3 (\b11) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED]; - /// - return 4 (\b100) => \[SQL_TRANSACTION_REPEATABLE_READ\]; - /// - return 5 (\b101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 6 (\b110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 7 (\b111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 8 (\b1000) => \[SQL_TRANSACTION_REPEATABLE_READ\]; - /// - return 9 (\b1001) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 10 (\b1010) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 11 (\b1011) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 12 (\b1100) => [SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 13 (\b1101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 14 (\b1110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 15 (\b1111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 16 (\b10000) => \[SQL_TRANSACTION_SERIALIZABLE\]; - /// - ... - /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. + /// Retrieves the supported transactions isolation levels. + /// + /// Returns an int32 bitmask value representing the supported transactions isolation levels. + /// The returned bitmask should be parsed in order to retrieve the supported transactions isolation levels. 
+ /// + /// For instance: + /// - return 0 (\b0) => [] (no supported SQL transactions isolation levels); + /// - return 1 (\b1) => \[SQL_TRANSACTION_NONE\]; + /// - return 2 (\b10) => \[SQL_TRANSACTION_READ_UNCOMMITTED\]; + /// - return 3 (\b11) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED]; + /// - return 4 (\b100) => \[SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 5 (\b101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 6 (\b110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 7 (\b111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 8 (\b1000) => \[SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 9 (\b1001) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 10 (\b1010) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 11 (\b1011) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 12 (\b1100) => [SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 13 (\b1101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 14 (\b1110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 15 (\b1111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 16 (\b10000) => \[SQL_TRANSACTION_SERIALIZABLE\]; + /// - ... + /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. SqlSupportedTransactionsIsolationLevels = 564, /// - /// Retrieves a boolean value indicating whether a data definition statement within a transaction forces - /// the transaction to commit. + /// Retrieves a boolean value indicating whether a data definition statement within a transaction forces + /// the transaction to commit. /// - /// Returns: - /// - false: if a data definition statement within a transaction does not force the transaction to commit; - /// - true: if a data definition statement within a transaction forces the transaction to commit. + /// Returns: + /// - false: if a data definition statement within a transaction does not force the transaction to commit; + /// - true: if a data definition statement within a transaction forces the transaction to commit. SqlDataDefinitionCausesTransactionCommit = 565, /// - /// Retrieves a boolean value indicating whether a data definition statement within a transaction is ignored. + /// Retrieves a boolean value indicating whether a data definition statement within a transaction is ignored. /// - /// Returns: - /// - false: if a data definition statement within a transaction is taken into account; - /// - true: a data definition statement within a transaction is ignored. + /// Returns: + /// - false: if a data definition statement within a transaction is taken into account; + /// - true: a data definition statement within a transaction is ignored. SqlDataDefinitionsInTransactionsIgnored = 566, /// - /// Retrieves an int32 bitmask value representing the supported result set types. - /// The returned bitmask should be parsed in order to retrieve the supported result set types. 
- /// - /// For instance: - /// - return 0 (\b0) => [] (no supported result set types); - /// - return 1 (\b1) => \[SQL_RESULT_SET_TYPE_UNSPECIFIED\]; - /// - return 2 (\b10) => \[SQL_RESULT_SET_TYPE_FORWARD_ONLY\]; - /// - return 3 (\b11) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY]; - /// - return 4 (\b100) => \[SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE\]; - /// - return 5 (\b101) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - /// - return 6 (\b110) => [SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - /// - return 7 (\b111) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - /// - return 8 (\b1000) => \[SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE\]; - /// - ... - /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetType`. + /// Retrieves an int32 bitmask value representing the supported result set types. + /// The returned bitmask should be parsed in order to retrieve the supported result set types. + /// + /// For instance: + /// - return 0 (\b0) => [] (no supported result set types); + /// - return 1 (\b1) => \[SQL_RESULT_SET_TYPE_UNSPECIFIED\]; + /// - return 2 (\b10) => \[SQL_RESULT_SET_TYPE_FORWARD_ONLY\]; + /// - return 3 (\b11) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY]; + /// - return 4 (\b100) => \[SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE\]; + /// - return 5 (\b101) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + /// - return 6 (\b110) => [SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + /// - return 7 (\b111) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + /// - return 8 (\b1000) => \[SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE\]; + /// - ... + /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetType`. SqlSupportedResultSetTypes = 567, /// - /// Returns an int32 bitmask value concurrency types supported for - /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_UNSPECIFIED`. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) - /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] - /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. + /// Returns an int32 bitmask value concurrency types supported for + /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_UNSPECIFIED`. 
+ /// + /// For instance: + /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] + /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] + /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetUnspecified = 568, /// - /// Returns an int32 bitmask value concurrency types supported for - /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_FORWARD_ONLY`. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) - /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] - /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. + /// Returns an int32 bitmask value concurrency types supported for + /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_FORWARD_ONLY`. + /// + /// For instance: + /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] + /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] + /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetForwardOnly = 569, /// - /// Returns an int32 bitmask value concurrency types supported for - /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE`. 
- /// - /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) - /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] - /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. + /// Returns an int32 bitmask value concurrency types supported for + /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE`. + /// + /// For instance: + /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] + /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] + /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetScrollSensitive = 570, /// - /// Returns an int32 bitmask value concurrency types supported for - /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE`. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) - /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] - /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. + /// Returns an int32 bitmask value concurrency types supported for + /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE`. 
+ /// + /// For instance: + /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] + /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] + /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetScrollInsensitive = 571, /// - /// Retrieves a boolean value indicating whether this database supports batch updates. + /// Retrieves a boolean value indicating whether this database supports batch updates. /// - /// - false: if this database does not support batch updates; - /// - true: if this database supports batch updates. + /// - false: if this database does not support batch updates; + /// - true: if this database supports batch updates. SqlBatchUpdatesSupported = 572, /// - /// Retrieves a boolean value indicating whether this database supports savepoints. + /// Retrieves a boolean value indicating whether this database supports savepoints. /// - /// Returns: - /// - false: if this database does not support savepoints; - /// - true: if this database supports savepoints. + /// Returns: + /// - false: if this database does not support savepoints; + /// - true: if this database supports savepoints. SqlSavepointsSupported = 573, /// - /// Retrieves a boolean value indicating whether named parameters are supported in callable statements. + /// Retrieves a boolean value indicating whether named parameters are supported in callable statements. /// - /// Returns: - /// - false: if named parameters in callable statements are unsupported; - /// - true: if named parameters in callable statements are supported. + /// Returns: + /// - false: if named parameters in callable statements are unsupported; + /// - true: if named parameters in callable statements are supported. SqlNamedParametersSupported = 574, /// - /// Retrieves a boolean value indicating whether updates made to a LOB are made on a copy or directly to the LOB. + /// Retrieves a boolean value indicating whether updates made to a LOB are made on a copy or directly to the LOB. /// - /// Returns: - /// - false: if updates made to a LOB are made directly to the LOB; - /// - true: if updates made to a LOB are made on a copy. + /// Returns: + /// - false: if updates made to a LOB are made directly to the LOB; + /// - true: if updates made to a LOB are made on a copy. SqlLocatorsUpdateCopy = 575, /// - /// Retrieves a boolean value indicating whether invoking user-defined or vendor functions - /// using the stored procedure escape syntax is supported. + /// Retrieves a boolean value indicating whether invoking user-defined or vendor functions + /// using the stored procedure escape syntax is supported. 
/// - /// Returns: - /// - false: if invoking user-defined or vendor functions using the stored procedure escape syntax is unsupported; - /// - true: if invoking user-defined or vendor functions using the stored procedure escape syntax is supported. + /// Returns: + /// - false: if invoking user-defined or vendor functions using the stored procedure escape syntax is unsupported; + /// - true: if invoking user-defined or vendor functions using the stored procedure escape syntax is supported. SqlStoredFunctionsUsingCallSyntaxSupported = 576, } impl SqlInfo { @@ -1434,6 +1534,202 @@ impl SqlSupportsConvert { } } } +/// * +/// The JDBC/ODBC-defined type of any object. +/// All the values here are the sames as in the JDBC and ODBC specs. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] +#[repr(i32)] +pub enum XdbcDataType { + XdbcUnknownType = 0, + XdbcChar = 1, + XdbcNumeric = 2, + XdbcDecimal = 3, + XdbcInteger = 4, + XdbcSmallint = 5, + XdbcFloat = 6, + XdbcReal = 7, + XdbcDouble = 8, + XdbcDatetime = 9, + XdbcInterval = 10, + XdbcVarchar = 12, + XdbcDate = 91, + XdbcTime = 92, + XdbcTimestamp = 93, + XdbcLongvarchar = -1, + XdbcBinary = -2, + XdbcVarbinary = -3, + XdbcLongvarbinary = -4, + XdbcBigint = -5, + XdbcTinyint = -6, + XdbcBit = -7, + XdbcWchar = -8, + XdbcWvarchar = -9, +} +impl XdbcDataType { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + XdbcDataType::XdbcUnknownType => "XDBC_UNKNOWN_TYPE", + XdbcDataType::XdbcChar => "XDBC_CHAR", + XdbcDataType::XdbcNumeric => "XDBC_NUMERIC", + XdbcDataType::XdbcDecimal => "XDBC_DECIMAL", + XdbcDataType::XdbcInteger => "XDBC_INTEGER", + XdbcDataType::XdbcSmallint => "XDBC_SMALLINT", + XdbcDataType::XdbcFloat => "XDBC_FLOAT", + XdbcDataType::XdbcReal => "XDBC_REAL", + XdbcDataType::XdbcDouble => "XDBC_DOUBLE", + XdbcDataType::XdbcDatetime => "XDBC_DATETIME", + XdbcDataType::XdbcInterval => "XDBC_INTERVAL", + XdbcDataType::XdbcVarchar => "XDBC_VARCHAR", + XdbcDataType::XdbcDate => "XDBC_DATE", + XdbcDataType::XdbcTime => "XDBC_TIME", + XdbcDataType::XdbcTimestamp => "XDBC_TIMESTAMP", + XdbcDataType::XdbcLongvarchar => "XDBC_LONGVARCHAR", + XdbcDataType::XdbcBinary => "XDBC_BINARY", + XdbcDataType::XdbcVarbinary => "XDBC_VARBINARY", + XdbcDataType::XdbcLongvarbinary => "XDBC_LONGVARBINARY", + XdbcDataType::XdbcBigint => "XDBC_BIGINT", + XdbcDataType::XdbcTinyint => "XDBC_TINYINT", + XdbcDataType::XdbcBit => "XDBC_BIT", + XdbcDataType::XdbcWchar => "XDBC_WCHAR", + XdbcDataType::XdbcWvarchar => "XDBC_WVARCHAR", + } + } +} +/// * +/// Detailed subtype information for XDBC_TYPE_DATETIME and XDBC_TYPE_INTERVAL. 
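The XdbcDataType enum added above carries the raw JDBC/ODBC type codes, so round-tripping between the wire value and the enum is mostly boilerplate. A short sketch follows; it assumes the type is re-exported from the crate's sql module like the other generated Flight SQL types, and uses the from_i32 constructor generated by the prost Enumeration derive:

```rust
use arrow_flight::sql::XdbcDataType;

fn main() {
    // The enum is repr(i32), so the wire value is a cast away.
    let wire_value = XdbcDataType::XdbcVarchar as i32;
    assert_eq!(wire_value, 12);

    // from_i32 (generated by the prost Enumeration derive) returns None for
    // values that are not part of the enum.
    let decoded = XdbcDataType::from_i32(wire_value);
    assert_eq!(decoded, Some(XdbcDataType::XdbcVarchar));
    assert_eq!(decoded.map(|t| t.as_str_name()), Some("XDBC_VARCHAR"));
    assert_eq!(XdbcDataType::from_i32(9999), None);
}
```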
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] +#[repr(i32)] +pub enum XdbcDatetimeSubcode { + XdbcSubcodeUnknown = 0, + XdbcSubcodeYear = 1, + XdbcSubcodeTime = 2, + XdbcSubcodeTimestamp = 3, + XdbcSubcodeTimeWithTimezone = 4, + XdbcSubcodeTimestampWithTimezone = 5, + XdbcSubcodeSecond = 6, + XdbcSubcodeYearToMonth = 7, + XdbcSubcodeDayToHour = 8, + XdbcSubcodeDayToMinute = 9, + XdbcSubcodeDayToSecond = 10, + XdbcSubcodeHourToMinute = 11, + XdbcSubcodeHourToSecond = 12, + XdbcSubcodeMinuteToSecond = 13, + XdbcSubcodeIntervalYear = 101, + XdbcSubcodeIntervalMonth = 102, + XdbcSubcodeIntervalDay = 103, + XdbcSubcodeIntervalHour = 104, + XdbcSubcodeIntervalMinute = 105, + XdbcSubcodeIntervalSecond = 106, + XdbcSubcodeIntervalYearToMonth = 107, + XdbcSubcodeIntervalDayToHour = 108, + XdbcSubcodeIntervalDayToMinute = 109, + XdbcSubcodeIntervalDayToSecond = 110, + XdbcSubcodeIntervalHourToMinute = 111, + XdbcSubcodeIntervalHourToSecond = 112, + XdbcSubcodeIntervalMinuteToSecond = 113, +} +impl XdbcDatetimeSubcode { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + XdbcDatetimeSubcode::XdbcSubcodeUnknown => "XDBC_SUBCODE_UNKNOWN", + XdbcDatetimeSubcode::XdbcSubcodeYear => "XDBC_SUBCODE_YEAR", + XdbcDatetimeSubcode::XdbcSubcodeTime => "XDBC_SUBCODE_TIME", + XdbcDatetimeSubcode::XdbcSubcodeTimestamp => "XDBC_SUBCODE_TIMESTAMP", + XdbcDatetimeSubcode::XdbcSubcodeTimeWithTimezone => "XDBC_SUBCODE_TIME_WITH_TIMEZONE", + XdbcDatetimeSubcode::XdbcSubcodeTimestampWithTimezone => "XDBC_SUBCODE_TIMESTAMP_WITH_TIMEZONE", + XdbcDatetimeSubcode::XdbcSubcodeSecond => "XDBC_SUBCODE_SECOND", + XdbcDatetimeSubcode::XdbcSubcodeYearToMonth => "XDBC_SUBCODE_YEAR_TO_MONTH", + XdbcDatetimeSubcode::XdbcSubcodeDayToHour => "XDBC_SUBCODE_DAY_TO_HOUR", + XdbcDatetimeSubcode::XdbcSubcodeDayToMinute => "XDBC_SUBCODE_DAY_TO_MINUTE", + XdbcDatetimeSubcode::XdbcSubcodeDayToSecond => "XDBC_SUBCODE_DAY_TO_SECOND", + XdbcDatetimeSubcode::XdbcSubcodeHourToMinute => "XDBC_SUBCODE_HOUR_TO_MINUTE", + XdbcDatetimeSubcode::XdbcSubcodeHourToSecond => "XDBC_SUBCODE_HOUR_TO_SECOND", + XdbcDatetimeSubcode::XdbcSubcodeMinuteToSecond => "XDBC_SUBCODE_MINUTE_TO_SECOND", + XdbcDatetimeSubcode::XdbcSubcodeIntervalYear => "XDBC_SUBCODE_INTERVAL_YEAR", + XdbcDatetimeSubcode::XdbcSubcodeIntervalMonth => "XDBC_SUBCODE_INTERVAL_MONTH", + XdbcDatetimeSubcode::XdbcSubcodeIntervalDay => "XDBC_SUBCODE_INTERVAL_DAY", + XdbcDatetimeSubcode::XdbcSubcodeIntervalHour => "XDBC_SUBCODE_INTERVAL_HOUR", + XdbcDatetimeSubcode::XdbcSubcodeIntervalMinute => "XDBC_SUBCODE_INTERVAL_MINUTE", + XdbcDatetimeSubcode::XdbcSubcodeIntervalSecond => "XDBC_SUBCODE_INTERVAL_SECOND", + XdbcDatetimeSubcode::XdbcSubcodeIntervalYearToMonth => "XDBC_SUBCODE_INTERVAL_YEAR_TO_MONTH", + XdbcDatetimeSubcode::XdbcSubcodeIntervalDayToHour => "XDBC_SUBCODE_INTERVAL_DAY_TO_HOUR", + XdbcDatetimeSubcode::XdbcSubcodeIntervalDayToMinute => "XDBC_SUBCODE_INTERVAL_DAY_TO_MINUTE", + XdbcDatetimeSubcode::XdbcSubcodeIntervalDayToSecond => "XDBC_SUBCODE_INTERVAL_DAY_TO_SECOND", + XdbcDatetimeSubcode::XdbcSubcodeIntervalHourToMinute => "XDBC_SUBCODE_INTERVAL_HOUR_TO_MINUTE", + XdbcDatetimeSubcode::XdbcSubcodeIntervalHourToSecond => "XDBC_SUBCODE_INTERVAL_HOUR_TO_SECOND", + 
XdbcDatetimeSubcode::XdbcSubcodeIntervalMinuteToSecond => "XDBC_SUBCODE_INTERVAL_MINUTE_TO_SECOND", + } + } +} +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] +#[repr(i32)] +pub enum Nullable { + /// * + /// Indicates that the fields does not allow the use of null values. + NullabilityNoNulls = 0, + /// * + /// Indicates that the fields allow the use of null values. + NullabilityNullable = 1, + /// * + /// Indicates that nullability of the fields can not be determined. + NullabilityUnknown = 2, +} +impl Nullable { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Nullable::NullabilityNoNulls => "NULLABILITY_NO_NULLS", + Nullable::NullabilityNullable => "NULLABILITY_NULLABLE", + Nullable::NullabilityUnknown => "NULLABILITY_UNKNOWN", + } + } +} +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] +#[repr(i32)] +pub enum Searchable { + /// * + /// Indicates that column can not be used in a WHERE clause. + None = 0, + /// * + /// Indicates that the column can be used in a WHERE clause if it is using a + /// LIKE operator. + Char = 1, + /// * + /// Indicates that the column can be used In a WHERE clause with any + /// operator other than LIKE. + /// + /// - Allowed operators: comparison, quantified comparison, BETWEEN, + /// DISTINCT, IN, MATCH, and UNIQUE. + Basic = 2, + /// * + /// Indicates that the column can be used in a WHERE clause using any operator. + Full = 3, +} +impl Searchable { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. 
+ pub fn as_str_name(&self) -> &'static str { + match self { + Searchable::None => "SEARCHABLE_NONE", + Searchable::Char => "SEARCHABLE_CHAR", + Searchable::Basic => "SEARCHABLE_BASIC", + Searchable::Full => "SEARCHABLE_FULL", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum UpdateDeleteRules { diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 21a5a8572246..4a30b2d5aef8 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -80,13 +80,13 @@ pub fn flight_data_to_arrow_batch( /// Convert a `Schema` to `SchemaResult` by converting to an IPC message #[deprecated( since = "4.4.0", - note = "Use From trait, e.g.: SchemaAsIpc::new(schema, options).into()" + note = "Use From trait, e.g.: SchemaAsIpc::new(schema, options).try_into()" )] pub fn flight_schema_from_arrow_schema( schema: &Schema, options: &IpcWriteOptions, -) -> SchemaResult { - SchemaAsIpc::new(schema, options).into() +) -> Result { + SchemaAsIpc::new(schema, options).try_into() } /// Convert a `Schema` to `FlightData` by converting to an IPC message diff --git a/arrow/src/ipc/convert.rs b/arrow/src/ipc/convert.rs index 00503d50e338..9f6cda37c650 100644 --- a/arrow/src/ipc/convert.rs +++ b/arrow/src/ipc/convert.rs @@ -26,6 +26,7 @@ use flatbuffers::{ }; use std::collections::{BTreeMap, HashMap}; +use crate::ipc::{size_prefixed_root_as_message, CONTINUATION_MARKER}; use DataType::*; /// Serialize a schema in IPC format @@ -103,7 +104,7 @@ impl<'a> From> for Field { } } -/// Deserialize a Schema table from IPC format to Schema data type +/// Deserialize a Schema table from flat buffer format to Schema data type pub fn fb_to_schema(fb: ipc::Schema) -> Schema { let mut fields: Vec = vec![]; let c_fields = fb.fields().unwrap(); @@ -136,8 +137,8 @@ pub fn fb_to_schema(fb: ipc::Schema) -> Schema { Schema::new_with_metadata(fields, metadata) } -/// Deserialize an IPC message into a schema -pub fn schema_from_bytes(bytes: &[u8]) -> Result { +/// Try deserialize flat buffer format bytes into a schema +pub fn try_schema_from_flatbuffer_bytes(bytes: &[u8]) -> Result { if let Ok(ipc) = ipc::root_as_message(bytes) { if let Some(schema) = ipc.header_as_schema().map(fb_to_schema) { Ok(schema) @@ -153,6 +154,51 @@ pub fn schema_from_bytes(bytes: &[u8]) -> Result { } } +/// Try deserialize the IPC format bytes into a schema +pub fn try_schema_from_ipc_buffer(buffer: &[u8]) -> Result { + // There are two protocol types: https://issues.apache.org/jira/browse/ARROW-6313 + // The original protocal is: + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema + // The latest version of protocol is: + // The schema of the dataset in its IPC form: + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema + if buffer.len() >= 4 { + // check continuation maker + let continuation_maker = &buffer[0..4]; + let begin_offset: usize = if continuation_maker.eq(&CONTINUATION_MARKER) { + // 4 bytes: CONTINUATION_MARKER + // 4 bytes: length + // buffer + 4 + } else { + // backward compatibility for buffer without the continuation maker + // 4 bytes: length + // buffer + 0 + }; + let msg = + size_prefixed_root_as_message(&buffer[begin_offset..]).map_err(|err| { + ArrowError::ParseError(format!( + "Unable to convert flight info to a message: {}", + err + )) + })?; + let ipc_schema = msg.header_as_schema().ok_or_else(|| { 
+ ArrowError::ParseError( + "Unable to convert flight info to a schema".to_string(), + ) + })?; + Ok(fb_to_schema(ipc_schema)) + } else { + Err(ArrowError::ParseError( + "The buffer length is less than 4 and missing the continuation maker or length of buffer".to_string() + )) + } +} + /// Get the Arrow data type from the flatbuffer Field table pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataType { if let Some(dictionary) = field.dictionary() { diff --git a/format/Flight.proto b/format/Flight.proto index b291d9dbd9aa..635b1793d2ba 100644 --- a/format/Flight.proto +++ b/format/Flight.proto @@ -19,7 +19,7 @@ syntax = "proto3"; option java_package = "org.apache.arrow.flight.impl"; -option go_package = "github.com/apache/arrow/go/flight;flight"; +option go_package = "github.com/apache/arrow/go/arrow/flight/internal/flight"; option csharp_namespace = "Apache.Arrow.Flight.Protocol"; package arrow.flight.protocol; @@ -193,7 +193,10 @@ message Result { * Wrap the result of a getSchema call */ message SchemaResult { - // schema of the dataset as described in Schema.fbs::Schema. + // The schema of the dataset in its IPC form: + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema bytes schema = 1; } @@ -244,7 +247,10 @@ message FlightDescriptor { * consumer is able to determine how to retrieve a dataset. */ message FlightInfo { - // schema of the dataset as described in Schema.fbs::Schema. + // The schema of the dataset in its IPC form: + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema bytes schema = 1; /* @@ -253,8 +259,15 @@ message FlightInfo { FlightDescriptor flight_descriptor = 2; /* - * A list of endpoints associated with the flight. To consume the whole - * flight, all endpoints must be consumed. + * A list of endpoints associated with the flight. To consume the + * whole flight, all endpoints (and hence all Tickets) must be + * consumed. Endpoints can be consumed in any order. + * + * In other words, an application can use multiple endpoints to + * represent partitioned data. + * + * There is no ordering defined on endpoints. Hence, if the returned + * data has an ordering, it should be returned in a single endpoint. */ repeated FlightEndpoint endpoint = 3; @@ -274,9 +287,20 @@ message FlightEndpoint { Ticket ticket = 1; /* - * A list of URIs where this ticket can be redeemed. If the list is - * empty, the expectation is that the ticket can only be redeemed on the - * current service where the ticket was generated. + * A list of URIs where this ticket can be redeemed via DoGet(). + * + * If the list is empty, the expectation is that the ticket can only + * be redeemed on the current service where the ticket was + * generated. + * + * If the list is not empty, the expectation is that the ticket can + * be redeemed at any of the locations, and that the data returned + * will be equivalent. In this case, the ticket may only be redeemed + * at one of the given locations, and not (necessarily) on the + * current service. + * + * In other words, an application can use multiple locations to + * represent redundant and/or load balanced services. */ repeated Location location = 2; } @@ -292,6 +316,9 @@ message Location { /* * An opaque identifier that the service can use to retrieve a particular * portion of a stream. + * + * Tickets are meant to be single use. 
It is an error/application-defined + * behavior to reuse a ticket. */ message Ticket { bytes ticket = 1; diff --git a/format/FlightSql.proto b/format/FlightSql.proto index 3e85e348bc9c..859427b68804 100644 --- a/format/FlightSql.proto +++ b/format/FlightSql.proto @@ -20,7 +20,7 @@ syntax = "proto3"; import "google/protobuf/descriptor.proto"; option java_package = "org.apache.arrow.flight.sql.impl"; - +option go_package = "github.com/apache/arrow/go/arrow/flight/internal/flight"; package arrow.flight.protocol.sql; /* @@ -56,7 +56,7 @@ message CommandGetSqlInfo { * Initially, Flight SQL will support the following information types: * - Server Information - Range [0-500) * - Syntax Information - Range [500-1000) - * Range [0-10,000) is reserved for defaults (see SqlInfo enum for default options). + * Range [0-10,000) is reserved for defaults (see SqlInfo enum for default options). * Custom options should start at 10,000. * * If omitted, then all metadata will be retrieved. @@ -81,7 +81,7 @@ enum SqlInfo { // Retrieves a UTF-8 string with the Arrow format version of the Flight SQL Server. FLIGHT_SQL_SERVER_ARROW_VERSION = 2; - /* + /* * Retrieves a boolean value indicating whether the Flight SQL Server is read only. * * Returns: @@ -121,7 +121,7 @@ enum SqlInfo { SQL_DDL_TABLE = 502; /* - * Retrieves a uint32 value representing the enu uint32 ordinal for the case sensitivity of catalog, table, schema and table names. + * Retrieves a int32 ordinal representing the case sensitivity of catalog, table, schema and table names. * * The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. */ @@ -131,7 +131,7 @@ enum SqlInfo { SQL_IDENTIFIER_QUOTE_CHAR = 504; /* - * Retrieves a uint32 value representing the enu uint32 ordinal for the case sensitivity of quoted identifiers. + * Retrieves a int32 describing the case sensitivity of quoted identifiers. * * The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. */ @@ -149,7 +149,7 @@ enum SqlInfo { /* * Retrieves the null ordering. * - * Returns a uint32 ordinal for the null ordering being used, as described in + * Returns a int32 ordinal for the null ordering being used, as described in * `arrow.flight.protocol.sql.SqlNullOrdering`. */ SQL_NULL_ORDERING = 507; @@ -335,7 +335,7 @@ enum SqlInfo { /* * Retrieves the support level for SQL OUTER JOINs. * - * Returns a uint3 uint32 ordinal for the SQL ordering being used, as described in + * Returns a int32 ordinal for the SQL ordering being used, as described in * `arrow.flight.protocol.sql.SqlOuterJoinsSupportLevel`. */ SQL_OUTER_JOINS_SUPPORT_LEVEL = 528; @@ -346,7 +346,10 @@ enum SqlInfo { // Retrieves a UTF-8 string with the preferred term for "procedure". SQL_PROCEDURE_TERM = 530; - // Retrieves a UTF-8 string with the preferred term for "catalog". + /* + * Retrieves a UTF-8 string with the preferred term for "catalog". + * If a empty string is returned its assumed that the server does NOT supports catalogs. + */ SQL_CATALOG_TERM = 531; /* @@ -481,52 +484,52 @@ enum SqlInfo { */ SQL_SUPPORTED_UNIONS = 540; - // Retrieves a uint32 value representing the maximum number of hex characters allowed in an inline binary literal. + // Retrieves a int64 value representing the maximum number of hex characters allowed in an inline binary literal. SQL_MAX_BINARY_LITERAL_LENGTH = 541; - // Retrieves a uint32 value representing the maximum number of characters allowed for a character literal. 
+ // Retrieves a int64 value representing the maximum number of characters allowed for a character literal. SQL_MAX_CHAR_LITERAL_LENGTH = 542; - // Retrieves a uint32 value representing the maximum number of characters allowed for a column name. + // Retrieves a int64 value representing the maximum number of characters allowed for a column name. SQL_MAX_COLUMN_NAME_LENGTH = 543; - // Retrieves a uint32 value representing the the maximum number of columns allowed in a GROUP BY clause. + // Retrieves a int64 value representing the the maximum number of columns allowed in a GROUP BY clause. SQL_MAX_COLUMNS_IN_GROUP_BY = 544; - // Retrieves a uint32 value representing the maximum number of columns allowed in an index. + // Retrieves a int64 value representing the maximum number of columns allowed in an index. SQL_MAX_COLUMNS_IN_INDEX = 545; - // Retrieves a uint32 value representing the maximum number of columns allowed in an ORDER BY clause. + // Retrieves a int64 value representing the maximum number of columns allowed in an ORDER BY clause. SQL_MAX_COLUMNS_IN_ORDER_BY = 546; - // Retrieves a uint32 value representing the maximum number of columns allowed in a SELECT list. + // Retrieves a int64 value representing the maximum number of columns allowed in a SELECT list. SQL_MAX_COLUMNS_IN_SELECT = 547; - // Retrieves a uint32 value representing the maximum number of columns allowed in a table. + // Retrieves a int64 value representing the maximum number of columns allowed in a table. SQL_MAX_COLUMNS_IN_TABLE = 548; - // Retrieves a uint32 value representing the maximum number of concurrent connections possible. + // Retrieves a int64 value representing the maximum number of concurrent connections possible. SQL_MAX_CONNECTIONS = 549; - // Retrieves a uint32 value the maximum number of characters allowed in a cursor name. + // Retrieves a int64 value the maximum number of characters allowed in a cursor name. SQL_MAX_CURSOR_NAME_LENGTH = 550; /* - * Retrieves a uint32 value representing the maximum number of bytes allowed for an index, + * Retrieves a int64 value representing the maximum number of bytes allowed for an index, * including all of the parts of the index. */ SQL_MAX_INDEX_LENGTH = 551; - // Retrieves a uint32 value representing the maximum number of characters allowed in a schema name. + // Retrieves a int64 value representing the maximum number of characters allowed in a schema name. SQL_DB_SCHEMA_NAME_LENGTH = 552; - // Retrieves a uint32 value representing the maximum number of characters allowed in a procedure name. + // Retrieves a int64 value representing the maximum number of characters allowed in a procedure name. SQL_MAX_PROCEDURE_NAME_LENGTH = 553; - // Retrieves a uint32 value representing the maximum number of characters allowed in a catalog name. + // Retrieves a int64 value representing the maximum number of characters allowed in a catalog name. SQL_MAX_CATALOG_NAME_LENGTH = 554; - // Retrieves a uint32 value representing the maximum number of bytes allowed in a single row. + // Retrieves a int64 value representing the maximum number of bytes allowed in a single row. SQL_MAX_ROW_SIZE = 555; /* @@ -542,28 +545,28 @@ enum SqlInfo { SQL_MAX_ROW_SIZE_INCLUDES_BLOBS = 556; /* - * Retrieves a uint32 value representing the maximum number of characters allowed for an SQL statement; + * Retrieves a int64 value representing the maximum number of characters allowed for an SQL statement; * a result of 0 (zero) means that there is no limit or the limit is not known. 
*/ SQL_MAX_STATEMENT_LENGTH = 557; - // Retrieves a uint32 value representing the maximum number of active statements that can be open at the same time. + // Retrieves a int64 value representing the maximum number of active statements that can be open at the same time. SQL_MAX_STATEMENTS = 558; - // Retrieves a uint32 value representing the maximum number of characters allowed in a table name. + // Retrieves a int64 value representing the maximum number of characters allowed in a table name. SQL_MAX_TABLE_NAME_LENGTH = 559; - // Retrieves a uint32 value representing the maximum number of tables allowed in a SELECT statement. + // Retrieves a int64 value representing the maximum number of tables allowed in a SELECT statement. SQL_MAX_TABLES_IN_SELECT = 560; - // Retrieves a uint32 value representing the maximum number of characters allowed in a user name. + // Retrieves a int64 value representing the maximum number of characters allowed in a user name. SQL_MAX_USERNAME_LENGTH = 561; /* * Retrieves this database's default transaction isolation level as described in * `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. * - * Returns a uint32 ordinal for the SQL transaction isolation level. + * Returns a int32 ordinal for the SQL transaction isolation level. */ SQL_DEFAULT_TRANSACTION_ISOLATION = 562; @@ -868,6 +871,187 @@ enum SqlSupportsConvert { SQL_CONVERT_VARCHAR = 19; } +/** + * The JDBC/ODBC-defined type of any object. + * All the values here are the sames as in the JDBC and ODBC specs. + */ +enum XdbcDataType { + XDBC_UNKNOWN_TYPE = 0; + XDBC_CHAR = 1; + XDBC_NUMERIC = 2; + XDBC_DECIMAL = 3; + XDBC_INTEGER = 4; + XDBC_SMALLINT = 5; + XDBC_FLOAT = 6; + XDBC_REAL = 7; + XDBC_DOUBLE = 8; + XDBC_DATETIME = 9; + XDBC_INTERVAL = 10; + XDBC_VARCHAR = 12; + XDBC_DATE = 91; + XDBC_TIME = 92; + XDBC_TIMESTAMP = 93; + XDBC_LONGVARCHAR = -1; + XDBC_BINARY = -2; + XDBC_VARBINARY = -3; + XDBC_LONGVARBINARY = -4; + XDBC_BIGINT = -5; + XDBC_TINYINT = -6; + XDBC_BIT = -7; + XDBC_WCHAR = -8; + XDBC_WVARCHAR = -9; +} + +/** + * Detailed subtype information for XDBC_TYPE_DATETIME and XDBC_TYPE_INTERVAL. + */ +enum XdbcDatetimeSubcode { + option allow_alias = true; + XDBC_SUBCODE_UNKNOWN = 0; + XDBC_SUBCODE_YEAR = 1; + XDBC_SUBCODE_DATE = 1; + XDBC_SUBCODE_TIME = 2; + XDBC_SUBCODE_MONTH = 2; + XDBC_SUBCODE_TIMESTAMP = 3; + XDBC_SUBCODE_DAY = 3; + XDBC_SUBCODE_TIME_WITH_TIMEZONE = 4; + XDBC_SUBCODE_HOUR = 4; + XDBC_SUBCODE_TIMESTAMP_WITH_TIMEZONE = 5; + XDBC_SUBCODE_MINUTE = 5; + XDBC_SUBCODE_SECOND = 6; + XDBC_SUBCODE_YEAR_TO_MONTH = 7; + XDBC_SUBCODE_DAY_TO_HOUR = 8; + XDBC_SUBCODE_DAY_TO_MINUTE = 9; + XDBC_SUBCODE_DAY_TO_SECOND = 10; + XDBC_SUBCODE_HOUR_TO_MINUTE = 11; + XDBC_SUBCODE_HOUR_TO_SECOND = 12; + XDBC_SUBCODE_MINUTE_TO_SECOND = 13; + XDBC_SUBCODE_INTERVAL_YEAR = 101; + XDBC_SUBCODE_INTERVAL_MONTH = 102; + XDBC_SUBCODE_INTERVAL_DAY = 103; + XDBC_SUBCODE_INTERVAL_HOUR = 104; + XDBC_SUBCODE_INTERVAL_MINUTE = 105; + XDBC_SUBCODE_INTERVAL_SECOND = 106; + XDBC_SUBCODE_INTERVAL_YEAR_TO_MONTH = 107; + XDBC_SUBCODE_INTERVAL_DAY_TO_HOUR = 108; + XDBC_SUBCODE_INTERVAL_DAY_TO_MINUTE = 109; + XDBC_SUBCODE_INTERVAL_DAY_TO_SECOND = 110; + XDBC_SUBCODE_INTERVAL_HOUR_TO_MINUTE = 111; + XDBC_SUBCODE_INTERVAL_HOUR_TO_SECOND = 112; + XDBC_SUBCODE_INTERVAL_MINUTE_TO_SECOND = 113; +} + +enum Nullable { + /** + * Indicates that the fields does not allow the use of null values. + */ + NULLABILITY_NO_NULLS = 0; + + /** + * Indicates that the fields allow the use of null values. 
+ */ + NULLABILITY_NULLABLE = 1; + + /** + * Indicates that nullability of the fields can not be determined. + */ + NULLABILITY_UNKNOWN = 2; +} + +enum Searchable { + /** + * Indicates that column can not be used in a WHERE clause. + */ + SEARCHABLE_NONE = 0; + + /** + * Indicates that the column can be used in a WHERE clause if it is using a + * LIKE operator. + */ + SEARCHABLE_CHAR = 1; + + /** + * Indicates that the column can be used In a WHERE clause with any + * operator other than LIKE. + * + * - Allowed operators: comparison, quantified comparison, BETWEEN, + * DISTINCT, IN, MATCH, and UNIQUE. + */ + SEARCHABLE_BASIC = 2; + + /** + * Indicates that the column can be used in a WHERE clause using any operator. + */ + SEARCHABLE_FULL = 3; +} + +/* + * Represents a request to retrieve information about data type supported on a Flight SQL enabled backend. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned schema will be: + * < + * type_name: utf8 not null (The name of the data type, for example: VARCHAR, INTEGER, etc), + * data_type: int not null (The SQL data type), + * column_size: int (The maximum size supported by that column. + * In case of exact numeric types, this represents the maximum precision. + * In case of string types, this represents the character length. + * In case of datetime data types, this represents the length in characters of the string representation. + * NULL is returned for data types where column size is not applicable.), + * literal_prefix: utf8 (Character or characters used to prefix a literal, NULL is returned for + * data types where a literal prefix is not applicable.), + * literal_suffix: utf8 (Character or characters used to terminate a literal, + * NULL is returned for data types where a literal suffix is not applicable.), + * create_params: list + * (A list of keywords corresponding to which parameters can be used when creating + * a column for that specific type. + * NULL is returned if there are no parameters for the data type definition.), + * nullable: int not null (Shows if the data type accepts a NULL value. The possible values can be seen in the + * Nullable enum.), + * case_sensitive: bool not null (Shows if a character data type is case-sensitive in collations and comparisons), + * searchable: int not null (Shows how the data type is used in a WHERE clause. The possible values can be seen in the + * Searchable enum.), + * unsigned_attribute: bool (Shows if the data type is unsigned. NULL is returned if the attribute is + * not applicable to the data type or the data type is not numeric.), + * fixed_prec_scale: bool not null (Shows if the data type has predefined fixed precision and scale.), + * auto_increment: bool (Shows if the data type is auto incremental. NULL is returned if the attribute + * is not applicable to the data type or the data type is not numeric.), + * local_type_name: utf8 (Localized version of the data source-dependent name of the data type. NULL + * is returned if a localized name is not supported by the data source), + * minimum_scale: int (The minimum scale of the data type on the data source. + * If a data type has a fixed scale, the MINIMUM_SCALE and MAXIMUM_SCALE + * columns both contain this value. NULL is returned if scale is not applicable.), + * maximum_scale: int (The maximum scale of the data type on the data source. 
+ * NULL is returned if scale is not applicable.), + * sql_data_type: int not null (The value of the SQL DATA TYPE which has the same values + * as data_type value. Except for interval and datetime, which + * uses generic values. More info about those types can be + * obtained through datetime_subcode. The possible values can be seen + * in the XdbcDataType enum.), + * datetime_subcode: int (Only used when the SQL DATA TYPE is interval or datetime. It contains + * its sub types. For type different from interval and datetime, this value + * is NULL. The possible values can be seen in the XdbcDatetimeSubcode enum.), + * num_prec_radix: int (If the data type is an approximate numeric type, this column contains + * the value 2 to indicate that COLUMN_SIZE specifies a number of bits. For + * exact numeric types, this column contains the value 10 to indicate that + * column size specifies a number of decimal digits. Otherwise, this column is NULL.), + * interval_precision: int (If the data type is an interval data type, then this column contains the value + * of the interval leading precision. Otherwise, this column is NULL. This fields + * is only relevant to be used by ODBC). + * > + * The returned data should be ordered by data_type and then by type_name. + */ +message CommandGetXdbcTypeInfo { + option (experimental) = true; + + /* + * Specifies the data type to search for the info. + */ + optional int32 data_type = 1; +} + /* * Represents a request to retrieve the list of catalogs on a Flight SQL enabled backend. * The definition of a catalog depends on vendor/implementation. It is usually the database itself @@ -934,6 +1118,17 @@ message CommandGetDbSchemas { * [optional] table_schema: bytes not null (schema of the table as described in Schema.fbs::Schema, * it is serialized as an IPC message.) * > + * Fields on table_schema may contain the following metadata: + * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name + * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name + * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name + * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. + * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size + * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable + * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. * The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. */ message CommandGetTables { @@ -1236,11 +1431,11 @@ message ActionCreatePreparedStatementResult { // Opaque handle for the prepared statement on the server. bytes prepared_statement_handle = 1; - // If a result set generating query was provided, dataset_schema contains the + // If a result set generating query was provided, dataset_schema contains the // schema of the dataset as described in Schema.fbs::Schema, it is serialized as an IPC message. 
bytes dataset_schema = 2; - // If the query provided contained parameters, parameter_schema contains the + // If the query provided contained parameters, parameter_schema contains the // schema of the expected parameters as described in Schema.fbs::Schema, it is serialized as an IPC message. bytes parameter_schema = 3; } @@ -1263,6 +1458,17 @@ message ActionClosePreparedStatementRequest { * Represents a SQL query. Used in the command member of FlightDescriptor * for the following RPC calls: * - GetSchema: return the Arrow schema of the query. + * Fields on this schema may contain the following metadata: + * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name + * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name + * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name + * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. + * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size + * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable + * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. * - GetFlightInfo: execute the query. */ message CommandStatementQuery { @@ -1286,6 +1492,18 @@ message TicketStatementQuery { /* * Represents an instance of executing a prepared statement. Used in the command member of FlightDescriptor for * the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * Fields on this schema may contain the following metadata: + * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name + * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name + * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name + * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. + * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size + * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable + * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. * - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. * - GetFlightInfo: execute the prepared statement instance. */ @@ -1309,7 +1527,7 @@ message CommandStatementUpdate { /* * Represents a SQL update query. Used in the command member of FlightDescriptor - * for the the RPC call DoPut to cause the server to execute the included + * for the the RPC call DoPut to cause the server to execute the included * prepared statement handle as an update. */ message CommandPreparedStatementUpdate { @@ -1322,12 +1540,12 @@ message CommandPreparedStatementUpdate { /* * Returned from the RPC call DoPut when a CommandStatementUpdate * CommandPreparedStatementUpdate was in the request, containing - * results from the update. + * results from the update. */ message DoPutUpdateResult { option (experimental) = true; - // The number of records updated. 
A return value of -1 represents + // The number of records updated. A return value of -1 represents // an unknown updated record count. int64 record_count = 1; } diff --git a/format/Message.fbs b/format/Message.fbs index f1c18d765d46..170ea8fbced8 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -70,7 +70,8 @@ enum BodyCompressionMethod:byte { /// bodies. Intended for use with RecordBatch but could be used for other /// message types table BodyCompression { - /// Compressor library + /// Compressor library. + /// For LZ4_FRAME, each compressed buffer must consist of a single frame. codec: CompressionType = LZ4_FRAME; /// Indicates the way the record batch body was compressed diff --git a/format/Schema.fbs b/format/Schema.fbs index 9da095177c7d..7ee827b5de8d 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -17,6 +17,11 @@ /// Logical types, vector layouts, and schemas +/// Format Version History. +/// Version 1.0 - Forward and backwards compatibility guaranteed. +/// Version 1.1 - Add Decimal256 (No format release). +/// Version 1.2 (Pending)- Add Interval MONTH_DAY_NANO + namespace org.apache.arrow.flatbuf; enum MetadataVersion:short { @@ -194,8 +199,8 @@ enum DateUnit: short { MILLISECOND } -/// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX -/// epoch (1970-01-01), stored in either of two units: +/// Date is either a 32-bit or 64-bit signed integer type representing an +/// elapsed time since UNIX epoch (1970-01-01), stored in either of two units: /// /// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no /// leap seconds), where the values are evenly divisible by 86400000 @@ -206,43 +211,143 @@ table Date { enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND } -/// Time type. The physical storage type depends on the unit -/// - SECOND and MILLISECOND: 32 bits -/// - MICROSECOND and NANOSECOND: 64 bits +/// Time is either a 32-bit or 64-bit signed integer type representing an +/// elapsed time since midnight, stored in either of four units: seconds, +/// milliseconds, microseconds or nanoseconds. +/// +/// The integer `bitWidth` depends on the `unit` and must be one of the following: +/// * SECOND and MILLISECOND: 32 bits +/// * MICROSECOND and NANOSECOND: 64 bits +/// +/// The allowed values are between 0 (inclusive) and 86400 (=24*60*60) seconds +/// (exclusive), adjusted for the time unit (for example, up to 86400000 +/// exclusive for the MILLISECOND unit). +/// This definition doesn't allow for leap seconds. Time values from +/// measurements with leap seconds will need to be corrected when ingesting +/// into Arrow (for example by replacing the value 86400 with 86399). table Time { unit: TimeUnit = MILLISECOND; bitWidth: int = 32; } -/// Time elapsed from the Unix epoch, 00:00:00.000 on 1 January 1970, excluding -/// leap seconds, as a 64-bit integer. Note that UNIX time does not include -/// leap seconds. +/// Timestamp is a 64-bit signed integer representing an elapsed time since a +/// fixed epoch, stored in either of four units: seconds, milliseconds, +/// microseconds or nanoseconds, and is optionally annotated with a timezone. +/// +/// Timestamp values do not include any leap seconds (in other words, all +/// days are considered 86400 seconds long). 
+/// +/// Timestamps with a non-empty timezone +/// ------------------------------------ +/// +/// If a Timestamp column has a non-empty timezone value, its epoch is +/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone +/// (the Unix epoch), regardless of the Timestamp's own timezone. +/// +/// Therefore, timestamp values with a non-empty timezone correspond to +/// physical points in time together with some additional information about +/// how the data was obtained and/or how to display it (the timezone). +/// +/// For example, the timestamp value 0 with the timezone string "Europe/Paris" +/// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the +/// application may prefer to display it as "January 1st 1970, 01h00" in +/// the Europe/Paris timezone (which is the same physical point in time). +/// +/// One consequence is that timestamp values with a non-empty timezone +/// can be compared and ordered directly, since they all share the same +/// well-known point of reference (the Unix epoch). +/// +/// Timestamps with an unset / empty timezone +/// ----------------------------------------- +/// +/// If a Timestamp column has no timezone value, its epoch is +/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone. +/// +/// Therefore, timestamp values without a timezone cannot be meaningfully +/// interpreted as physical points in time, but only as calendar / clock +/// indications ("wall clock time") in an unspecified timezone. +/// +/// For example, the timestamp value 0 with an empty timezone string +/// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there +/// is not enough information to interpret it as a well-defined physical +/// point in time. /// -/// The Timestamp metadata supports both "time zone naive" and "time zone -/// aware" timestamps. Read about the timezone attribute for more detail +/// One consequence is that timestamp values without a timezone cannot +/// be reliably compared or ordered, since they may have different points of +/// reference. In particular, it is *not* possible to interpret an unset +/// or empty timezone as the same as "UTC". +/// +/// Conversion between timezones +/// ---------------------------- +/// +/// If a Timestamp column has a non-empty timezone, changing the timezone +/// to a different non-empty value is a metadata-only operation: +/// the timestamp values need not change as their point of reference remains +/// the same (the Unix epoch). +/// +/// However, if a Timestamp column has no timezone value, changing it to a +/// non-empty value requires to think about the desired semantics. +/// One possibility is to assume that the original timestamp values are +/// relative to the epoch of the timezone being set; timestamp values should +/// then adjusted to the Unix epoch (for example, changing the timezone from +/// empty to "Europe/Paris" would require converting the timestamp values +/// from "Europe/Paris" to "UTC", which seems counter-intuitive but is +/// nevertheless correct). +/// +/// Guidelines for encoding data from external libraries +/// ---------------------------------------------------- +/// +/// Date & time libraries often have multiple different data types for temporal +/// data. In order to ease interoperability between different implementations the +/// Arrow project has some recommendations for encoding these types into a Timestamp +/// column. 
+/// +/// An "instant" represents a physical point in time that has no relevant timezone +/// (for example, astronomical data). To encode an instant, use a Timestamp with +/// the timezone string set to "UTC", and make sure the Timestamp values +/// are relative to the UTC epoch (January 1st 1970, midnight). +/// +/// A "zoned date-time" represents a physical point in time annotated with an +/// informative timezone (for example, the timezone in which the data was +/// recorded). To encode a zoned date-time, use a Timestamp with the timezone +/// string set to the name of the timezone, and make sure the Timestamp values +/// are relative to the UTC epoch (January 1st 1970, midnight). +/// +/// (There is some ambiguity between an instant and a zoned date-time with the +/// UTC timezone. Both of these are stored the same in Arrow. Typically, +/// this distinction does not matter. If it does, then an application should +/// use custom metadata or an extension type to distinguish between the two cases.) +/// +/// An "offset date-time" represents a physical point in time combined with an +/// explicit offset from UTC. To encode an offset date-time, use a Timestamp +/// with the timezone string set to the numeric timezone offset string +/// (e.g. "+03:00"), and make sure the Timestamp values are relative to +/// the UTC epoch (January 1st 1970, midnight). +/// +/// A "naive date-time" (also called "local date-time" in some libraries) +/// represents a wall clock time combined with a calendar date, but with +/// no indication of how to map this information to a physical point in time. +/// Naive date-times must be handled with care because of this missing +/// information, and also because daylight saving time (DST) may make +/// some values ambiguous or non-existent. A naive date-time may be +/// stored as a struct with Date and Time fields. However, it may also be +/// encoded into a Timestamp column with an empty timezone. The timestamp +/// values should be computed "as if" the timezone of the date-time values +/// was UTC; for example, the naive date-time "January 1st 1970, 00h00" would +/// be encoded as timestamp value 0. table Timestamp { unit: TimeUnit; - /// The time zone is a string indicating the name of a time zone, one of: + /// The timezone is an optional string indicating the name of a timezone, + /// one of: /// - /// * As used in the Olson time zone database (the "tz database" or - /// "tzdata"), such as "America/New_York" - /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + /// * As used in the Olson timezone database (the "tz database" or + /// "tzdata"), such as "America/New_York". + /// * An absolute timezone offset of the form "+XX:XX" or "-XX:XX", + /// such as "+07:30". /// /// Whether a timezone string is present indicates different semantics about - /// the data: - /// - /// * If the time zone is null or equal to an empty string, the data is "time - /// zone naive" and shall be displayed *as is* to the user, not localized - /// to the locale of the user. This data can be though of as UTC but - /// without having "UTC" as the time zone, it is not considered to be - /// localized to any time zone - /// - /// * If the time zone is set to a valid value, values can be displayed as - /// "localized" to that time zone, even though the underlying 64-bit - /// integers are identical to the same data stored in UTC. 
Converting - /// between time zones is a metadata-only operation and does not change the - /// underlying values + /// the data (see above). timezone: string; } @@ -252,18 +357,19 @@ enum IntervalUnit: short { YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO} // days can differ in length during day light savings time transitions). // All integers in the types below are stored in the endianness indicated // by the schema. +// // YEAR_MONTH - Indicates the number of elapsed whole months, stored as // 4-byte signed integers. -// DAY_TIME - Indicates the number of elapsed days and milliseconds, -// stored as 2 contiguous 32-bit integers (8-bytes in total). Support +// DAY_TIME - Indicates the number of elapsed days and milliseconds (no leap seconds), +// stored as 2 contiguous 32-bit signed integers (8-bytes in total). Support // of this IntervalUnit is not required for full arrow compatibility. // MONTH_DAY_NANO - A triple of the number of elapsed months, days, and nanoseconds. -// The values are stored contiguously in 16 byte blocks. Months and -// days are encoded as 32 bit integers and nanoseconds is encoded as a -// 64 bit integer. All integers are signed. Each field is independent -// (e.g. there is no constraint that nanoseconds have the same sign -// as days or that the quantity of nanoseconds represents less -// than a day's worth of time). +// The values are stored contiguously in 16-byte blocks. Months and days are +// encoded as 32-bit signed integers and nanoseconds is encoded as a 64-bit +// signed integer. Nanoseconds does not allow for leap seconds. Each field is +// independent (e.g. there is no constraint that nanoseconds have the same +// sign as days or that the quantity of nanoseconds represents less than a +// day's worth of time). table Interval { unit: IntervalUnit; } From 6d86472fa3c68986dc1813d3cb027748472ec22f Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 4 Sep 2022 02:44:36 -0700 Subject: [PATCH 0002/1411] Add overflow-checking variant for primitive arithmetic kernels and explicitly define overflow behavior (#2643) * Add overflow-checking variant for add kernel and explicitly define overflow behavior for add * For subtract, multiply, divide * Fix tests * Fix different error message * Fix typo * Rename APIs and add more comments. Print values in error message. * Add one more test to distinct divide_by_zero behavior on divide. * Fix clippy * Update divide doc with dividing by zero behavior for other numeric types. 
* Hide ArrowNativeTypeOp * Fix a typo --- arrow/benches/arithmetic_kernels.rs | 4 +- arrow/src/compute/kernels/arithmetic.rs | 262 ++++++++++++++++++++++-- arrow/src/datatypes/native.rs | 106 ++++++++++ 3 files changed, 352 insertions(+), 20 deletions(-) diff --git a/arrow/benches/arithmetic_kernels.rs b/arrow/benches/arithmetic_kernels.rs index 4be4a26933aa..10af0b5432ef 100644 --- a/arrow/benches/arithmetic_kernels.rs +++ b/arrow/benches/arithmetic_kernels.rs @@ -55,13 +55,13 @@ fn bench_multiply(arr_a: &ArrayRef, arr_b: &ArrayRef) { fn bench_divide(arr_a: &ArrayRef, arr_b: &ArrayRef) { let arr_a = arr_a.as_any().downcast_ref::().unwrap(); let arr_b = arr_b.as_any().downcast_ref::().unwrap(); - criterion::black_box(divide(arr_a, arr_b).unwrap()); + criterion::black_box(divide_checked(arr_a, arr_b).unwrap()); } fn bench_divide_unchecked(arr_a: &ArrayRef, arr_b: &ArrayRef) { let arr_a = arr_a.as_any().downcast_ref::().unwrap(); let arr_b = arr_b.as_any().downcast_ref::().unwrap(); - criterion::black_box(divide_unchecked(arr_a, arr_b).unwrap()); + criterion::black_box(divide(arr_a, arr_b).unwrap()); } fn bench_divide_scalar(array: &ArrayRef, divisor: f32) { diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index fff687e18b3c..53f48570d927 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -35,8 +35,9 @@ use crate::compute::unary_dyn; use crate::compute::util::combine_option_bitmap; use crate::datatypes; use crate::datatypes::{ - ArrowNumericType, DataType, Date32Type, Date64Type, IntervalDayTimeType, - IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, + native_op::ArrowNativeTypeOp, ArrowNumericType, ArrowPrimitiveType, DataType, + Date32Type, Date64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, + IntervalYearMonthType, }; use crate::datatypes::{ Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, @@ -103,6 +104,106 @@ where Ok(PrimitiveArray::::from(data)) } +/// This is similar to `math_op` as it performs given operation between two input primitive arrays. +/// But the given operation can return `None` if overflow is detected. For the case, this function +/// returns an `Err`. +fn math_checked_op( + left: &PrimitiveArray, + right: &PrimitiveArray, + op: F, +) -> Result> +where + LT: ArrowNumericType, + RT: ArrowNumericType, + F: Fn(LT::Native, RT::Native) -> Option, +{ + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform math operation on arrays of different length".to_string(), + )); + } + + let left_iter = ArrayIter::new(left); + let right_iter = ArrayIter::new(right); + + let values: Result::Native>>> = left_iter + .into_iter() + .zip(right_iter.into_iter()) + .map(|(l, r)| { + if let (Some(l), Some(r)) = (l, r) { + let result = op(l, r); + if let Some(r) = result { + Ok(Some(r)) + } else { + // Overflow + Err(ArrowError::ComputeError(format!( + "Overflow happened on: {:?}, {:?}", + l, r + ))) + } + } else { + Ok(None) + } + }) + .collect(); + + let values = values?; + + Ok(PrimitiveArray::::from_iter(values)) +} + +/// This is similar to `math_checked_op` but just for divide op. 
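The checked divide helper itself continues below. As a hedged usage sketch of the public pair it backs, consistent with the tests added further down in this patch and assuming the kernels are re-exported at `arrow::compute`:

    use arrow::array::Int32Array;
    use arrow::compute::{divide, divide_checked};

    // Sketch only (the function name is illustrative, not part of the patch).
    fn divide_behaviour() {
        let a = Int32Array::from(vec![10]);
        let two = Int32Array::from(vec![2]);
        let zero = Int32Array::from(vec![0]);

        // Plain `divide` follows native integer semantics; it would panic on a
        // zero divisor ("attempt to divide by zero").
        assert_eq!(divide(&a, &two).unwrap().value(0), 5);

        // `divide_checked` surfaces division by zero (and overflow) as an Err
        // instead of panicking.
        assert!(divide_checked(&a, &zero).is_err());
    }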
+fn math_checked_divide( + left: &PrimitiveArray, + right: &PrimitiveArray, + op: F, +) -> Result> +where + LT: ArrowNumericType, + RT: ArrowNumericType, + RT::Native: One + Zero, + F: Fn(LT::Native, RT::Native) -> Option, +{ + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform math operation on arrays of different length".to_string(), + )); + } + + let left_iter = ArrayIter::new(left); + let right_iter = ArrayIter::new(right); + + let values: Result::Native>>> = left_iter + .into_iter() + .zip(right_iter.into_iter()) + .map(|(l, r)| { + if let (Some(l), Some(r)) = (l, r) { + let result = op(l, r); + if let Some(r) = result { + Ok(Some(r)) + } else if r.is_zero() { + Err(ArrowError::ComputeError(format!( + "DivideByZero on: {:?}, {:?}", + l, r + ))) + } else { + // Overflow + Err(ArrowError::ComputeError(format!( + "Overflow happened on: {:?}, {:?}", + l, r + ))) + } + } else { + Ok(None) + } + }) + .collect(); + + let values = values?; + + Ok(PrimitiveArray::::from_iter(values)) +} + /// Helper function for operations where a valid `0` on the right array should /// result in an [ArrowError::DivideByZero], namely the division and modulo operations /// @@ -760,15 +861,34 @@ where /// Perform `left + right` operation on two arrays. If either left or right value is null /// then the result is also null. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `add_checked` instead. pub fn add( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result> where T: ArrowNumericType, - T::Native: Add, + T::Native: ArrowNativeTypeOp, +{ + math_op(left, right, |a, b| a.add_wrapping(b)) +} + +/// Perform `left + right` operation on two arrays. If either left or right value is null +/// then the result is also null. Once +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `add` instead. +pub fn add_checked( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { - math_op(left, right, |a, b| a + b) + math_checked_op(left, right, |a, b| a.add_checked(b)) } /// Perform `left + right` operation on two arrays. If either left or right value is null @@ -856,15 +976,34 @@ where /// Perform `left - right` operation on two arrays. If either left or right value is null /// then the result is also null. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `subtract_checked` instead. pub fn subtract( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result> where T: datatypes::ArrowNumericType, - T::Native: Sub, + T::Native: ArrowNativeTypeOp, { - math_op(left, right, |a, b| a - b) + math_op(left, right, |a, b| a.sub_wrapping(b)) +} + +/// Perform `left - right` operation on two arrays. If either left or right value is null +/// then the result is also null. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `subtract` instead. +pub fn subtract_checked( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> +where + T: datatypes::ArrowNumericType, + T::Native: ArrowNativeTypeOp, +{ + math_checked_op(left, right, |a, b| a.sub_checked(b)) } /// Perform `left - right` operation on two arrays. If either left or right value is null @@ -933,15 +1072,34 @@ where /// Perform `left * right` operation on two arrays. 
If either left or right value is null /// then the result is also null. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `multiply_check` instead. pub fn multiply( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result> where T: datatypes::ArrowNumericType, - T::Native: Mul, + T::Native: ArrowNativeTypeOp, { - math_op(left, right, |a, b| a * b) + math_op(left, right, |a, b| a.mul_wrapping(b)) +} + +/// Perform `left * right` operation on two arrays. If either left or right value is null +/// then the result is also null. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `multiply` instead. +pub fn multiply_checked( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> +where + T: datatypes::ArrowNumericType, + T::Native: ArrowNativeTypeOp, +{ + math_checked_op(left, right, |a, b| a.mul_checked(b)) } /// Perform `left * right` operation on two arrays. If either left or right value is null @@ -1013,18 +1171,21 @@ where /// Perform `left / right` operation on two arrays. If either left or right value is null /// then the result is also null. If any right hand value is zero then the result of this /// operation will be `Err(ArrowError::DivideByZero)`. -pub fn divide( +/// +/// When `simd` feature is not enabled. This detects overflow and returns an `Err` for that. +/// For an non-overflow-checking variant, use `divide` instead. +pub fn divide_checked( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result> where T: datatypes::ArrowNumericType, - T::Native: Div + Zero + One, + T::Native: ArrowNativeTypeOp + Zero + One, { #[cfg(feature = "simd")] return simd_checked_divide_op(&left, &right, simd_checked_divide::, |a, b| a / b); #[cfg(not(feature = "simd"))] - return math_checked_divide_op(left, right, |a, b| a / b); + return math_checked_divide(left, right, |a, b| a.div_checked(b)); } /// Perform `left / right` operation on two arrays. If either left or right value is null @@ -1040,17 +1201,21 @@ pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result { } /// Perform `left / right` operation on two arrays without checking for division by zero. -/// The result of dividing by zero follows normal floating point rules. +/// For floating point types, the result of dividing by zero follows normal floating point +/// rules. For other numeric types, dividing by zero will panic, /// If either left or right value is null then the result is also null. If any right hand value is zero then the result of this -pub fn divide_unchecked( +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `divide_checked` instead. +pub fn divide( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result> where - T: datatypes::ArrowFloatNumericType, - T::Native: Div, + T: datatypes::ArrowNumericType, + T::Native: ArrowNativeTypeOp, { - math_op(left, right, |a, b| a / b) + math_op(left, right, |a, b| a.div_wrapping(b)) } /// Modulus every value in an array by a scalar. 
If any value in the array is null then the @@ -1769,7 +1934,7 @@ mod tests { fn test_primitive_array_divide_with_nulls() { let a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9), None]); let b = Int32Array::from(vec![Some(5), Some(6), Some(8), Some(9), None, None]); - let c = divide(&a, &b).unwrap(); + let c = divide_checked(&a, &b).unwrap(); assert_eq!(3, c.value(0)); assert!(c.is_null(1)); assert_eq!(1, c.value(2)); @@ -1854,7 +2019,7 @@ mod tests { let b = b.slice(8, 6); let b = b.as_any().downcast_ref::().unwrap(); - let c = divide(a, b).unwrap(); + let c = divide_checked(a, b).unwrap(); assert_eq!(6, c.len()); assert_eq!(3, c.value(0)); assert!(c.is_null(1)); @@ -1919,6 +2084,14 @@ mod tests { #[test] #[should_panic(expected = "DivideByZero")] + fn test_primitive_array_divide_by_zero_with_checked() { + let a = Int32Array::from(vec![15]); + let b = Int32Array::from(vec![0]); + divide_checked(&a, &b).unwrap(); + } + + #[test] + #[should_panic(expected = "attempt to divide by zero")] fn test_primitive_array_divide_by_zero() { let a = Int32Array::from(vec![15]); let b = Int32Array::from(vec![0]); @@ -2019,4 +2192,57 @@ mod tests { let expected = Float64Array::from(vec![Some(1.0), None, Some(9.0)]); assert_eq!(expected, actual); } + + #[test] + fn test_primitive_add_wrapping_overflow() { + let a = Int32Array::from(vec![i32::MAX, i32::MIN]); + let b = Int32Array::from(vec![1, 1]); + + let wrapped = add(&a, &b); + let expected = Int32Array::from(vec![-2147483648, -2147483647]); + assert_eq!(expected, wrapped.unwrap()); + + let overflow = add_checked(&a, &b); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_primitive_subtract_wrapping_overflow() { + let a = Int32Array::from(vec![-2]); + let b = Int32Array::from(vec![i32::MAX]); + + let wrapped = subtract(&a, &b); + let expected = Int32Array::from(vec![i32::MAX]); + assert_eq!(expected, wrapped.unwrap()); + + let overflow = subtract_checked(&a, &b); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_primitive_mul_wrapping_overflow() { + let a = Int32Array::from(vec![10]); + let b = Int32Array::from(vec![i32::MAX]); + + let wrapped = multiply(&a, &b); + let expected = Int32Array::from(vec![-10]); + assert_eq!(expected, wrapped.unwrap()); + + let overflow = multiply_checked(&a, &b); + overflow.expect_err("overflow should be detected"); + } + + #[test] + #[cfg(not(feature = "simd"))] + fn test_primitive_div_wrapping_overflow() { + let a = Int32Array::from(vec![i32::MIN]); + let b = Int32Array::from(vec![-1]); + + let wrapped = divide(&a, &b); + let expected = Int32Array::from(vec![-2147483648]); + assert_eq!(expected, wrapped.unwrap()); + + let overflow = divide_checked(&a, &b); + overflow.expect_err("overflow should be detected"); + } } diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 207e8cb40330..444f2b27dce6 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -114,6 +114,112 @@ pub trait ArrowPrimitiveType: 'static { } } +pub(crate) mod native_op { + use super::ArrowNativeType; + use std::ops::{Add, Div, Mul, Sub}; + + /// Trait for ArrowNativeType to provide overflow-checking and non-overflow-checking + /// variants for arithmetic operations. For floating point types, this provides some + /// default implementations. Integer types that need to deal with overflow can implement + /// this trait. + /// + /// The APIs with `_wrapping` suffix are the variant of non-overflow-checking. 
If overflow + /// occurred, they will supposedly wrap around the boundary of the type. + /// + /// The APIs with `_checked` suffix are the variant of overflow-checking which return `None` + /// if overflow occurred. + pub trait ArrowNativeTypeOp: + ArrowNativeType + + Add + + Sub + + Mul + + Div + { + fn add_checked(self, rhs: Self) -> Option { + Some(self + rhs) + } + + fn add_wrapping(self, rhs: Self) -> Self { + self + rhs + } + + fn sub_checked(self, rhs: Self) -> Option { + Some(self - rhs) + } + + fn sub_wrapping(self, rhs: Self) -> Self { + self - rhs + } + + fn mul_checked(self, rhs: Self) -> Option { + Some(self * rhs) + } + + fn mul_wrapping(self, rhs: Self) -> Self { + self * rhs + } + + fn div_checked(self, rhs: Self) -> Option { + Some(self / rhs) + } + + fn div_wrapping(self, rhs: Self) -> Self { + self / rhs + } + } +} + +macro_rules! native_type_op { + ($t:tt) => { + impl native_op::ArrowNativeTypeOp for $t { + fn add_checked(self, rhs: Self) -> Option { + self.checked_add(rhs) + } + + fn add_wrapping(self, rhs: Self) -> Self { + self.wrapping_add(rhs) + } + + fn sub_checked(self, rhs: Self) -> Option { + self.checked_sub(rhs) + } + + fn sub_wrapping(self, rhs: Self) -> Self { + self.wrapping_sub(rhs) + } + + fn mul_checked(self, rhs: Self) -> Option { + self.checked_mul(rhs) + } + + fn mul_wrapping(self, rhs: Self) -> Self { + self.wrapping_mul(rhs) + } + + fn div_checked(self, rhs: Self) -> Option { + self.checked_div(rhs) + } + + fn div_wrapping(self, rhs: Self) -> Self { + self.wrapping_div(rhs) + } + } + }; +} + +native_type_op!(i8); +native_type_op!(i16); +native_type_op!(i32); +native_type_op!(i64); +native_type_op!(u8); +native_type_op!(u16); +native_type_op!(u32); +native_type_op!(u64); + +impl native_op::ArrowNativeTypeOp for f16 {} +impl native_op::ArrowNativeTypeOp for f32 {} +impl native_op::ArrowNativeTypeOp for f64 {} + impl private::Sealed for i8 {} impl ArrowNativeType for i8 { #[inline] From b46fc9287da22534f0599651597fc13cb675ddff Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 4 Sep 2022 18:55:49 +0100 Subject: [PATCH 0003/1411] Help LLVM vectorize comparison kernel ~50-80% faster (#2646) * Help LLVM vectorize comparison kernel * Add MutableBuffer::collect_bool * Add SAFETY comments --- arrow/src/buffer/mutable.rs | 68 ++++++++++++++----------- arrow/src/compute/kernels/comparison.rs | 19 +++---- 2 files changed, 45 insertions(+), 42 deletions(-) diff --git a/arrow/src/buffer/mutable.rs b/arrow/src/buffer/mutable.rs index 1c662ec23eef..96c837922999 100644 --- a/arrow/src/buffer/mutable.rs +++ b/arrow/src/buffer/mutable.rs @@ -373,6 +373,41 @@ impl MutableBuffer { assert!(len <= self.capacity()); self.len = len; } + + /// Invokes `f` with values `0..len` collecting the boolean results into a new `MutableBuffer` + /// + /// This is similar to `from_trusted_len_iter_bool`, however, can be significantly faster + /// as it eliminates the conditional `Iterator::next` + #[inline] + pub(crate) fn collect_bool bool>(len: usize, mut f: F) -> Self { + let mut buffer = Self::new(bit_util::ceil(len, 8)); + + let chunks = len / 8; + let remainder = len % 8; + for chunk in 0..chunks { + let mut packed = 0; + for bit_idx in 0..8 { + let i = bit_idx + chunk * 8; + packed |= (f(i) as u8) << bit_idx; + } + + // SAFETY: Already allocated sufficient capacity + unsafe { buffer.push_unchecked(packed) } + } + + if remainder != 0 { + let mut packed = 0; + for bit_idx in 0..remainder { + let i = bit_idx + chunks * 8; + 
packed |= (f(i) as u8) << bit_idx; + } + + // SAFETY: Already allocated sufficient capacity + unsafe { buffer.push_unchecked(packed) } + } + + buffer + } } /// # Safety @@ -496,38 +531,9 @@ impl MutableBuffer { mut iterator: I, ) -> Self { let (_, upper) = iterator.size_hint(); - let upper = upper.expect("from_trusted_len_iter requires an upper limit"); - - let mut result = { - let byte_capacity: usize = upper.saturating_add(7) / 8; - MutableBuffer::new(byte_capacity) - }; + let len = upper.expect("from_trusted_len_iter requires an upper limit"); - 'a: loop { - let mut byte_accum: u8 = 0; - let mut mask: u8 = 1; - - //collect (up to) 8 bits into a byte - while mask != 0 { - if let Some(value) = iterator.next() { - byte_accum |= match value { - true => mask, - false => 0, - }; - mask <<= 1; - } else { - if mask != 1 { - // Add last byte - result.push_unchecked(byte_accum); - } - break 'a; - } - } - - // Soundness: from_trusted_len - result.push_unchecked(byte_accum); - } - result + Self::collect_bool(len, |_| iterator.next().unwrap()) } /// Creates a [`MutableBuffer`] from an [`Iterator`] with a trusted (upper) length or errors diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index dd9d4fc5d492..cba2d6e7dfd9 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -59,12 +59,10 @@ where let null_bit_buffer = combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len())?; - // Safety: - // `i < $left.len()` and $left.len() == $right.len() - let comparison = (0..left.len()) - .map(|i| unsafe { op(left.value_unchecked(i), right.value_unchecked(i)) }); - // same size as $left.len() and $right.len() - let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(comparison) }; + let buffer = MutableBuffer::collect_bool(left.len(), |i| unsafe { + // SAFETY: i in range 0..len + op(left.value_unchecked(i), right.value_unchecked(i)) + }); let data = unsafe { ArrayData::new_unchecked( @@ -91,11 +89,10 @@ where .null_buffer() .map(|b| b.bit_slice(left.offset(), left.len())); - // Safety: - // `i < $left.len()` - let comparison = (0..left.len()).map(|i| unsafe { op(left.value_unchecked(i)) }); - // same as $left.len() - let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(comparison) }; + let buffer = MutableBuffer::collect_bool(left.len(), |i| unsafe { + // SAFETY: i in range 0..len + op(left.value_unchecked(i)) + }); let data = unsafe { ArrayData::new_unchecked( From 30ab9bba5c133bb32be6cde307d5111c0088bce1 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 5 Sep 2022 04:06:15 -0700 Subject: [PATCH 0004/1411] Use `downcast_primitive_array` in arithmetic kernels (#2640) * Use downcast_primitive_array in arithmetic kernels * Constrain two sides to be same type --- arrow/src/array/cast.rs | 249 ++++++++++++++++++++++++ arrow/src/compute/kernels/arithmetic.rs | 117 +++++------ 2 files changed, 294 insertions(+), 72 deletions(-) diff --git a/arrow/src/array/cast.rs b/arrow/src/array/cast.rs index 2b68cbbe6424..2c8366ff5f9c 100644 --- a/arrow/src/array/cast.rs +++ b/arrow/src/array/cast.rs @@ -225,6 +225,255 @@ macro_rules! 
downcast_primitive_array { $($p => $fallback,)* } }; + + (($values1:ident, $values2:ident) => $e:block $($p:pat => $fallback:expr $(,)*)*) => { + match ($values1.data_type(), $values2.data_type()) { + ($crate::datatypes::DataType::Int8, $crate::datatypes::DataType::Int8) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::Int8Type, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::Int8Type, + >($values2); + $e + } + ($crate::datatypes::DataType::Int16, $crate::datatypes::DataType::Int16) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::Int16Type, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::Int16Type, + >($values2); + $e + } + ($crate::datatypes::DataType::Int32, $crate::datatypes::DataType::Int32) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::Int32Type, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::Int32Type, + >($values2); + $e + } + ($crate::datatypes::DataType::Int64, $crate::datatypes::DataType::Int64) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::Int64Type, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::Int64Type, + >($values2); + $e + } + ($crate::datatypes::DataType::UInt8, $crate::datatypes::DataType::UInt8) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::UInt8Type, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::UInt8Type, + >($values2); + $e + } + ($crate::datatypes::DataType::UInt16, $crate::datatypes::DataType::UInt16) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::UInt16Type, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::UInt16Type, + >($values2); + $e + } + ($crate::datatypes::DataType::UInt32, $crate::datatypes::DataType::UInt32) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::UInt32Type, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::UInt32Type, + >($values2); + $e + } + ($crate::datatypes::DataType::UInt64, $crate::datatypes::DataType::UInt64) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::UInt64Type, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::UInt64Type, + >($values2); + $e + } + ($crate::datatypes::DataType::Float32, $crate::datatypes::DataType::Float32) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::Float32Type, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::Float32Type, + >($values2); + $e + } + ($crate::datatypes::DataType::Float64, $crate::datatypes::DataType::Float64) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::Float64Type, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::Float64Type, + >($values2); + $e + } + ($crate::datatypes::DataType::Date32, $crate::datatypes::DataType::Date32) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::Date32Type, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::Date32Type, + >($values2); + $e + } + ($crate::datatypes::DataType::Date64, $crate::datatypes::DataType::Date64) => { + let $values1 = 
$crate::array::as_primitive_array::< + $crate::datatypes::Date64Type, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::Date64Type, + >($values2); + $e + } + ($crate::datatypes::DataType::Time32($crate::datatypes::TimeUnit::Second), $crate::datatypes::DataType::Time32($crate::datatypes::TimeUnit::Second)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::Time32SecondType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::Time32SecondType, + >($values2); + $e + } + ($crate::datatypes::DataType::Time32($crate::datatypes::TimeUnit::Millisecond), $crate::datatypes::DataType::Time32($crate::datatypes::TimeUnit::Millisecond)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::Time32MillisecondType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::Time32MillisecondType, + >($values2); + $e + } + ($crate::datatypes::DataType::Time64($crate::datatypes::TimeUnit::Microsecond), $crate::datatypes::DataType::Time64($crate::datatypes::TimeUnit::Microsecond)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::Time64MicrosecondType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::Time64MicrosecondType, + >($values2); + $e + } + ($crate::datatypes::DataType::Time64($crate::datatypes::TimeUnit::Nanosecond), $crate::datatypes::DataType::Time64($crate::datatypes::TimeUnit::Nanosecond)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::Time64NanosecondType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::Time64NanosecondType, + >($values2); + $e + } + ($crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Second, _), $crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Second, _)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::TimestampSecondType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::TimestampSecondType, + >($values2); + $e + } + ($crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Millisecond, _), $crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Millisecond, _)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::TimestampMillisecondType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::TimestampMillisecondType, + >($values2); + $e + } + ($crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Microsecond, _), $crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Microsecond, _)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::TimestampMicrosecondType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::TimestampMicrosecondType, + >($values2); + $e + } + ($crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Nanosecond, _), $crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Nanosecond, _)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::TimestampNanosecondType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::TimestampNanosecondType, + >($values2); + $e + } + ($crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::YearMonth), 
$crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::YearMonth)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::IntervalYearMonthType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::IntervalYearMonthType, + >($values2); + $e + } + ($crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::DayTime), $crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::DayTime)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::IntervalDayTimeType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::IntervalDayTimeType, + >($values2); + $e + } + ($crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::MonthDayNano), $crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::MonthDayNano)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::IntervalMonthDayNanoType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::IntervalMonthDayNanoType, + >($values2); + $e + } + ($crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Second), $crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Second)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::DurationSecondType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::DurationSecondType, + >($values2); + $e + } + ($crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Millisecond), $crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Millisecond)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::DurationMillisecondType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::DurationMillisecondType, + >($values2); + $e + } + ($crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Microsecond), $crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Microsecond)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::DurationMicrosecondType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::DurationMicrosecondType, + >($values2); + $e + } + ($crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Nanosecond), $crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Nanosecond)) => { + let $values1 = $crate::array::as_primitive_array::< + $crate::datatypes::DurationNanosecondType, + >($values1); + let $values2 = $crate::array::as_primitive_array::< + $crate::datatypes::DurationNanosecondType, + >($values2); + $e + } + $($p => $fallback,)* + } + }; } /// Force downcast of an [`Array`], such as an [`ArrayRef`], to diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index 53f48570d927..eab4d2136aa1 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -33,7 +33,6 @@ use crate::buffer::MutableBuffer; use crate::compute::kernels::arity::unary; use crate::compute::unary_dyn; use crate::compute::util::combine_option_bitmap; -use crate::datatypes; use crate::datatypes::{ native_op::ArrowNativeTypeOp, ArrowNumericType, ArrowPrimitiveType, DataType, Date32Type, Date64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, @@ -44,8 +43,8 @@ use crate::datatypes::{ UInt32Type, UInt64Type, UInt8Type, }; use 
crate::error::{ArrowError, Result}; +use crate::{datatypes, downcast_primitive_array}; use num::traits::Pow; -use std::any::type_name; #[cfg(feature = "simd")] use std::borrow::BorrowMut; #[cfg(feature = "simd")] @@ -671,72 +670,6 @@ macro_rules! typed_dict_math_op { }}; } -macro_rules! typed_op { - ($LEFT: expr, $RIGHT: expr, $T: ident, $OP: expr, $MATH_OP: ident) => {{ - let left = $LEFT - .as_any() - .downcast_ref::>() - .ok_or_else(|| { - ArrowError::CastError(format!( - "Left array cannot be cast to {}", - type_name::<$T>() - )) - })?; - let right = $RIGHT - .as_any() - .downcast_ref::>() - .ok_or_else(|| { - ArrowError::CastError(format!( - "Right array cannot be cast to {}", - type_name::<$T>(), - )) - })?; - let array = $MATH_OP(left, right, $OP)?; - Ok(Arc::new(array)) - }}; -} - -macro_rules! typed_math_op { - ($LEFT: expr, $RIGHT: expr, $OP: expr, $MATH_OP: ident) => {{ - match $LEFT.data_type() { - DataType::Int8 => { - typed_op!($LEFT, $RIGHT, Int8Type, $OP, $MATH_OP) - } - DataType::Int16 => { - typed_op!($LEFT, $RIGHT, Int16Type, $OP, $MATH_OP) - } - DataType::Int32 => { - typed_op!($LEFT, $RIGHT, Int32Type, $OP, $MATH_OP) - } - DataType::Int64 => { - typed_op!($LEFT, $RIGHT, Int64Type, $OP, $MATH_OP) - } - DataType::UInt8 => { - typed_op!($LEFT, $RIGHT, UInt8Type, $OP, $MATH_OP) - } - DataType::UInt16 => { - typed_op!($LEFT, $RIGHT, UInt16Type, $OP, $MATH_OP) - } - DataType::UInt32 => { - typed_op!($LEFT, $RIGHT, UInt32Type, $OP, $MATH_OP) - } - DataType::UInt64 => { - typed_op!($LEFT, $RIGHT, UInt64Type, $OP, $MATH_OP) - } - DataType::Float32 => { - typed_op!($LEFT, $RIGHT, Float32Type, $OP, $MATH_OP) - } - DataType::Float64 => { - typed_op!($LEFT, $RIGHT, Float64Type, $OP, $MATH_OP) - } - t => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation on arrays of type {}", - t - ))), - } - }}; -} - /// Helper function to perform math lambda function on values from two dictionary arrays, this /// version does not attempt to use SIMD explicitly (though the compiler may auto vectorize) macro_rules! 
math_dict_op { @@ -946,7 +879,17 @@ pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result { ))), } } - _ => typed_math_op!(left, right, |a, b| a + b, math_op), + _ => { + downcast_primitive_array!( + (left, right) => { + math_op(left, right, |a, b| a + b).map(|a| Arc::new(a) as ArrayRef) + } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) + } } } @@ -1013,7 +956,17 @@ pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { DataType::Dictionary(_, _) => { typed_dict_math_op!(left, right, |a, b| a - b, math_op_dict) } - _ => typed_math_op!(left, right, |a, b| a - b, math_op), + _ => { + downcast_primitive_array!( + (left, right) => { + math_op(left, right, |a, b| a - b).map(|a| Arc::new(a) as ArrayRef) + } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) + } } } @@ -1109,7 +1062,17 @@ pub fn multiply_dyn(left: &dyn Array, right: &dyn Array) -> Result { DataType::Dictionary(_, _) => { typed_dict_math_op!(left, right, |a, b| a * b, math_op_dict) } - _ => typed_math_op!(left, right, |a, b| a * b, math_op), + _ => { + downcast_primitive_array!( + (left, right) => { + math_op(left, right, |a, b| a * b).map(|a| Arc::new(a) as ArrayRef) + } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) + } } } @@ -1196,7 +1159,17 @@ pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result { DataType::Dictionary(_, _) => { typed_dict_math_op!(left, right, |a, b| a / b, math_divide_checked_op_dict) } - _ => typed_math_op!(left, right, |a, b| a / b, math_checked_divide_op), + _ => { + downcast_primitive_array!( + (left, right) => { + math_checked_divide_op(left, right, |a, b| a / b).map(|a| Arc::new(a) as ArrayRef) + } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) + } } } From dc4ccf825e612776806d8c9934eae21c99dc7893 Mon Sep 17 00:00:00 2001 From: Dan Harris <1327726+thinkharderdev@users.noreply.github.com> Date: Mon, 5 Sep 2022 07:17:47 -0400 Subject: [PATCH 0005/1411] Fix bug in page skipping (#2552) * Fix bug in page skipping * Revert * When skipping records, only read dictionary page * Fix remaining page skipping bug and add unit tests --- parquet/src/arrow/arrow_reader/selection.rs | 19 +++ parquet/src/arrow/async_reader.rs | 124 ++++++++++++++++++++ parquet/src/arrow/record_reader/mod.rs | 12 +- parquet/src/column/reader.rs | 42 ++++++- 4 files changed, 193 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index b6ee273ab569..544b7931a265 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -595,5 +595,24 @@ mod tests { // assert_eq!(mask, vec![false, true, true, false, true, true, true]); assert_eq!(ranges, vec![10..20, 20..30, 40..50, 50..60, 60..70]); + + let selection = RowSelection::from(vec![ + // Skip first page + RowSelector::skip(10), + // Multiple selects in same page + RowSelector::select(3), + RowSelector::skip(3), + RowSelector::select(4), + // Select to remaining in page and first row of next page + RowSelector::skip(5), + RowSelector::select(6), + // Skip remaining + RowSelector::skip(50), + ]); + + let ranges = selection.scan_ranges(&index); + + // assert_eq!(mask, vec![false, true, true, false, true, true, true]); + 
assert_eq!(ranges, vec![10..20, 20..30, 30..40]); } } diff --git a/parquet/src/arrow/async_reader.rs b/parquet/src/arrow/async_reader.rs index 201f2afcf0e8..77537b7b4f2d 100644 --- a/parquet/src/arrow/async_reader.rs +++ b/parquet/src/arrow/async_reader.rs @@ -798,6 +798,7 @@ mod tests { use arrow::array::{Array, ArrayRef, Int32Array, StringArray}; use arrow::error::Result as ArrowResult; use futures::TryStreamExt; + use rand::{thread_rng, Rng}; use std::sync::Mutex; struct TestReader { @@ -936,6 +937,129 @@ mod tests { assert_eq!(async_batches, sync_batches); } + #[tokio::test] + async fn test_async_reader_skip_pages() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{}/alltypes_tiny_pages_plain.parquet", testdata); + let data = Bytes::from(std::fs::read(path).unwrap()); + + let metadata = parse_metadata(&data).unwrap(); + let metadata = Arc::new(metadata); + + assert_eq!(metadata.num_row_groups(), 1); + + let async_reader = TestReader { + data: data.clone(), + metadata: metadata.clone(), + requests: Default::default(), + }; + + let options = ArrowReaderOptions::new().with_page_index(true); + let builder = + ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap(); + + let selection = RowSelection::from(vec![ + RowSelector::skip(21), // Skip first page + RowSelector::select(21), // Select page to boundary + RowSelector::skip(41), // Skip multiple pages + RowSelector::select(41), // Select multiple pages + RowSelector::skip(25), // Skip page across boundary + RowSelector::select(25), // Select across page boundary + RowSelector::skip(7116), // Skip to final page boundary + RowSelector::select(10), // Select final page + ]); + + let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![9]); + + let stream = builder + .with_projection(mask.clone()) + .with_row_selection(selection.clone()) + .build() + .expect("building stream"); + + let async_batches: Vec<_> = stream.try_collect().await.unwrap(); + + let sync_batches = ParquetRecordBatchReaderBuilder::try_new(data) + .unwrap() + .with_projection(mask) + .with_batch_size(1024) + .with_row_selection(selection) + .build() + .unwrap() + .collect::>>() + .unwrap(); + + assert_eq!(async_batches, sync_batches); + } + + #[tokio::test] + async fn test_fuzz_async_reader_selection() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{}/alltypes_tiny_pages_plain.parquet", testdata); + let data = Bytes::from(std::fs::read(path).unwrap()); + + let metadata = parse_metadata(&data).unwrap(); + let metadata = Arc::new(metadata); + + assert_eq!(metadata.num_row_groups(), 1); + + let mut rand = thread_rng(); + + for _ in 0..100 { + let mut expected_rows = 0; + let mut total_rows = 0; + let mut skip = false; + let mut selectors = vec![]; + + while total_rows < 7300 { + let row_count: usize = rand.gen_range(0..100); + + let row_count = row_count.min(7300 - total_rows); + + selectors.push(RowSelector { row_count, skip }); + + total_rows += row_count; + if !skip { + expected_rows += row_count; + } + + skip = !skip; + } + + let selection = RowSelection::from(selectors); + + let async_reader = TestReader { + data: data.clone(), + metadata: metadata.clone(), + requests: Default::default(), + }; + + let options = ArrowReaderOptions::new().with_page_index(true); + let builder = + ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap(); + + let col_idx: usize = rand.gen_range(0..13); + let mask = 
ProjectionMask::leaves(builder.parquet_schema(), vec![col_idx]); + + let stream = builder + .with_projection(mask.clone()) + .with_row_selection(selection.clone()) + .build() + .expect("building stream"); + + let async_batches: Vec<_> = stream.try_collect().await.unwrap(); + + let actual_rows: usize = + async_batches.into_iter().map(|b| b.num_rows()).sum(); + + assert_eq!(actual_rows, expected_rows); + } + } + #[tokio::test] async fn test_row_filter() { let a = StringArray::from_iter_values(["a", "b", "b", "b", "c", "c"]); diff --git a/parquet/src/arrow/record_reader/mod.rs b/parquet/src/arrow/record_reader/mod.rs index 6c1c61039610..b7318af9e85a 100644 --- a/parquet/src/arrow/record_reader/mod.rs +++ b/parquet/src/arrow/record_reader/mod.rs @@ -145,7 +145,11 @@ where loop { // Try to find some records from buffers that has been read into memory // but not counted as seen records. - let end_of_column = !self.column_reader.as_mut().unwrap().has_next()?; + + // Check to see if the column is exhausted. Only peek the next page since in + // case we are reading to a page boundary and do not actually need to read + // the next page. + let end_of_column = !self.column_reader.as_mut().unwrap().peek_next()?; let (record_count, value_count) = self.count_records(num_records - records_read, end_of_column); @@ -154,7 +158,9 @@ where self.num_values += value_count; records_read += record_count; - if records_read == num_records || end_of_column { + if records_read == num_records + || !self.column_reader.as_mut().unwrap().has_next()? + { break; } @@ -198,7 +204,7 @@ where pub fn skip_records(&mut self, num_records: usize) -> Result { // First need to clear the buffer let end_of_column = match self.column_reader.as_mut() { - Some(reader) => !reader.has_next()?, + Some(reader) => !reader.peek_next()?, None => return Ok(0), }; diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index f96ccc3ea3e5..09254999bdd3 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -306,7 +306,7 @@ where // If dictionary, we must read it if metadata.is_dict { - self.read_new_page()?; + self.read_dictionary_page()?; continue; } @@ -362,6 +362,24 @@ where Ok(num_records - remaining) } + /// Read the next page as a dictionary page. If the next page is not a dictionary page, + /// this will return an error. + fn read_dictionary_page(&mut self) -> Result<()> { + match self.page_reader.get_next_page()? { + Some(Page::DictionaryPage { + buf, + num_values, + encoding, + is_sorted, + }) => self + .values_decoder + .set_dict(buf, num_values, encoding, is_sorted), + _ => Err(ParquetError::General( + "Invalid page. Expecting dictionary page".to_string(), + )), + } + } + /// Reads a new page and set up the decoders for levels, values or dictionary. /// Returns false if there's no page left. fn read_new_page(&mut self) -> Result { @@ -493,6 +511,28 @@ where } } + /// Check whether there is more data to read from this column, + /// If the current page is fully decoded, this will NOT load the next page + /// into the buffer + #[inline] + pub(crate) fn peek_next(&mut self) -> Result { + if self.num_buffered_values == 0 + || self.num_buffered_values == self.num_decoded_values + { + // TODO: should we return false if read_new_page() = true and + // num_buffered_values = 0? + match self.page_reader.peek_next_page()? 
{ + Some(next_page) => Ok(next_page.num_rows != 0), + None => Ok(false), + } + } else { + Ok(true) + } + } + + /// Check whether there is more data to read from this column, + /// If the current page is fully decoded, this will load the next page + /// (if it exists) into the buffer #[inline] pub(crate) fn has_next(&mut self) -> Result { if self.num_buffered_values == 0 From 43d8474429227a267141ea49d9e2ee5df1bd9239 Mon Sep 17 00:00:00 2001 From: Michael Edwards Date: Mon, 5 Sep 2022 17:46:56 +0200 Subject: [PATCH 0006/1411] MutableBuffer::typed_data - shared ref access to the typed slice (#2652) --- arrow/src/buffer/immutable.rs | 2 +- arrow/src/buffer/mutable.rs | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arrow/src/buffer/immutable.rs b/arrow/src/buffer/immutable.rs index 28042a3817be..6d4d624efc13 100644 --- a/arrow/src/buffer/immutable.rs +++ b/arrow/src/buffer/immutable.rs @@ -181,7 +181,7 @@ impl Buffer { unsafe { self.data.ptr().as_ptr().add(self.offset) } } - /// View buffer as typed slice. + /// View buffer as a slice of a specific type. /// /// # Panics /// diff --git a/arrow/src/buffer/mutable.rs b/arrow/src/buffer/mutable.rs index 96c837922999..d1e633993936 100644 --- a/arrow/src/buffer/mutable.rs +++ b/arrow/src/buffer/mutable.rs @@ -288,7 +288,7 @@ impl MutableBuffer { Buffer::from_bytes(bytes) } - /// View this buffer as a slice of a specific type. + /// View this buffer as a mutable slice of a specific type. /// /// # Panics /// @@ -304,6 +304,21 @@ impl MutableBuffer { offsets } + /// View buffer as a immutable slice of a specific type. + /// + /// # Panics + /// + /// This function panics if the underlying buffer is not aligned + /// correctly for type `T`. + pub fn typed_data(&self) -> &[T] { + // SAFETY + // ArrowNativeType is trivially transmutable, is sealed to prevent potentially incorrect + // implementation outside this crate, and this method checks alignment + let (prefix, offsets, suffix) = unsafe { self.as_slice().align_to::() }; + assert!(prefix.is_empty() && suffix.is_empty()); + offsets + } + /// Extends this buffer from a slice of items that can be represented in bytes, increasing its capacity if needed. /// # Example /// ``` From 7429b5cf99c025d170afc26e9a821baff6ebf36a Mon Sep 17 00:00:00 2001 From: JanKaul Date: Mon, 5 Sep 2022 18:52:19 +0200 Subject: [PATCH 0007/1411] update doc for object_store copy_if_not_exists (#2653) --- object_store/src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 9ed9db9e928c..16f0c6f3a2aa 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -290,6 +290,10 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Copy an object from one path to another, only if destination is empty. /// /// Will return an error if the destination already has an object. + /// + /// Performs an atomic operation if the underlying object storage supports it. + /// If atomic operations are not supported by the underlying object storage (like S3) + /// it will return an error. async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()>; /// Move an object from one path to another in the same object store. 
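[Editor's note, not part of the patch series] The `copy_if_not_exists` contract clarified in the patch above (atomic when the backend supports it, an error otherwise, e.g. on S3) is easiest to see from the caller's side. The following is a minimal sketch under stated assumptions: the helper name `publish_once` and the choice to treat any error as "destination taken or backend lacks atomic support" are illustrative only; the trait method and its `Result<()>` return come from the diff above.

```rust
// Minimal sketch (not part of the patches): use copy_if_not_exists as a
// "claim this destination exactly once" primitive.
use object_store::{path::Path, ObjectStore};

/// Try to publish `staged` at `committed`, relying on the store to reject the
/// copy if something already lives at the destination.
async fn publish_once(
    store: &dyn ObjectStore,
    staged: &Path,
    committed: &Path,
) -> object_store::Result<bool> {
    match store.copy_if_not_exists(staged, committed).await {
        // The copy happened atomically: nothing had claimed `committed` yet.
        Ok(()) => Ok(true),
        // Per the doc comment added above, an error can mean either that the
        // destination already exists or that the backend cannot perform the
        // operation atomically; callers needing to distinguish the two must
        // inspect the concrete error.
        Err(e) => {
            eprintln!("copy_if_not_exists failed: {}", e);
            Ok(false)
        }
    }
}
```

On stores with a native conditional copy this gives a cheap mutual-exclusion building block; on stores without one, the call errors rather than silently overwriting, which is exactly the behavior the amended documentation spells out.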
From 773f1b9d49d11a71129aaaebc9884559b7690cd3 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 5 Sep 2022 12:44:25 -0700 Subject: [PATCH 0008/1411] Ignore flaky test (#2655) --- parquet/src/arrow/async_reader.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet/src/arrow/async_reader.rs b/parquet/src/arrow/async_reader.rs index 77537b7b4f2d..564166ecbe5e 100644 --- a/parquet/src/arrow/async_reader.rs +++ b/parquet/src/arrow/async_reader.rs @@ -995,6 +995,7 @@ mod tests { } #[tokio::test] + #[ignore] async fn test_fuzz_async_reader_selection() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/alltypes_tiny_pages_plain.parquet", testdata); From 8ea6ca1cb8bba3e9293cc5c35f08f25c5b1e8336 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Tue, 6 Sep 2022 10:44:45 +0800 Subject: [PATCH 0009/1411] support `CastOption` for casting numeric (#2649) * cast numeric to numeric with error * fix comment --- arrow/src/compute/kernels/cast.rs | 436 ++++++++++++++++++++++-------- 1 file changed, 327 insertions(+), 109 deletions(-) diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 6b4f224708da..d451484028fd 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -870,105 +870,269 @@ pub fn cast_with_options( }, // start numeric casts - (UInt8, UInt16) => cast_numeric_arrays::(array), - (UInt8, UInt32) => cast_numeric_arrays::(array), - (UInt8, UInt64) => cast_numeric_arrays::(array), - (UInt8, Int8) => cast_numeric_arrays::(array), - (UInt8, Int16) => cast_numeric_arrays::(array), - (UInt8, Int32) => cast_numeric_arrays::(array), - (UInt8, Int64) => cast_numeric_arrays::(array), - (UInt8, Float32) => cast_numeric_arrays::(array), - (UInt8, Float64) => cast_numeric_arrays::(array), - - (UInt16, UInt8) => cast_numeric_arrays::(array), - (UInt16, UInt32) => cast_numeric_arrays::(array), - (UInt16, UInt64) => cast_numeric_arrays::(array), - (UInt16, Int8) => cast_numeric_arrays::(array), - (UInt16, Int16) => cast_numeric_arrays::(array), - (UInt16, Int32) => cast_numeric_arrays::(array), - (UInt16, Int64) => cast_numeric_arrays::(array), - (UInt16, Float32) => cast_numeric_arrays::(array), - (UInt16, Float64) => cast_numeric_arrays::(array), - - (UInt32, UInt8) => cast_numeric_arrays::(array), - (UInt32, UInt16) => cast_numeric_arrays::(array), - (UInt32, UInt64) => cast_numeric_arrays::(array), - (UInt32, Int8) => cast_numeric_arrays::(array), - (UInt32, Int16) => cast_numeric_arrays::(array), - (UInt32, Int32) => cast_numeric_arrays::(array), - (UInt32, Int64) => cast_numeric_arrays::(array), - (UInt32, Float32) => cast_numeric_arrays::(array), - (UInt32, Float64) => cast_numeric_arrays::(array), - - (UInt64, UInt8) => cast_numeric_arrays::(array), - (UInt64, UInt16) => cast_numeric_arrays::(array), - (UInt64, UInt32) => cast_numeric_arrays::(array), - (UInt64, Int8) => cast_numeric_arrays::(array), - (UInt64, Int16) => cast_numeric_arrays::(array), - (UInt64, Int32) => cast_numeric_arrays::(array), - (UInt64, Int64) => cast_numeric_arrays::(array), - (UInt64, Float32) => cast_numeric_arrays::(array), - (UInt64, Float64) => cast_numeric_arrays::(array), - - (Int8, UInt8) => cast_numeric_arrays::(array), - (Int8, UInt16) => cast_numeric_arrays::(array), - (Int8, UInt32) => cast_numeric_arrays::(array), - (Int8, UInt64) => cast_numeric_arrays::(array), - (Int8, Int16) => cast_numeric_arrays::(array), - (Int8, Int32) => cast_numeric_arrays::(array), - (Int8, Int64) => cast_numeric_arrays::(array), 
- (Int8, Float32) => cast_numeric_arrays::(array), - (Int8, Float64) => cast_numeric_arrays::(array), - - (Int16, UInt8) => cast_numeric_arrays::(array), - (Int16, UInt16) => cast_numeric_arrays::(array), - (Int16, UInt32) => cast_numeric_arrays::(array), - (Int16, UInt64) => cast_numeric_arrays::(array), - (Int16, Int8) => cast_numeric_arrays::(array), - (Int16, Int32) => cast_numeric_arrays::(array), - (Int16, Int64) => cast_numeric_arrays::(array), - (Int16, Float32) => cast_numeric_arrays::(array), - (Int16, Float64) => cast_numeric_arrays::(array), - - (Int32, UInt8) => cast_numeric_arrays::(array), - (Int32, UInt16) => cast_numeric_arrays::(array), - (Int32, UInt32) => cast_numeric_arrays::(array), - (Int32, UInt64) => cast_numeric_arrays::(array), - (Int32, Int8) => cast_numeric_arrays::(array), - (Int32, Int16) => cast_numeric_arrays::(array), - (Int32, Int64) => cast_numeric_arrays::(array), - (Int32, Float32) => cast_numeric_arrays::(array), - (Int32, Float64) => cast_numeric_arrays::(array), - - (Int64, UInt8) => cast_numeric_arrays::(array), - (Int64, UInt16) => cast_numeric_arrays::(array), - (Int64, UInt32) => cast_numeric_arrays::(array), - (Int64, UInt64) => cast_numeric_arrays::(array), - (Int64, Int8) => cast_numeric_arrays::(array), - (Int64, Int16) => cast_numeric_arrays::(array), - (Int64, Int32) => cast_numeric_arrays::(array), - (Int64, Float32) => cast_numeric_arrays::(array), - (Int64, Float64) => cast_numeric_arrays::(array), - - (Float32, UInt8) => cast_numeric_arrays::(array), - (Float32, UInt16) => cast_numeric_arrays::(array), - (Float32, UInt32) => cast_numeric_arrays::(array), - (Float32, UInt64) => cast_numeric_arrays::(array), - (Float32, Int8) => cast_numeric_arrays::(array), - (Float32, Int16) => cast_numeric_arrays::(array), - (Float32, Int32) => cast_numeric_arrays::(array), - (Float32, Int64) => cast_numeric_arrays::(array), - (Float32, Float64) => cast_numeric_arrays::(array), - - (Float64, UInt8) => cast_numeric_arrays::(array), - (Float64, UInt16) => cast_numeric_arrays::(array), - (Float64, UInt32) => cast_numeric_arrays::(array), - (Float64, UInt64) => cast_numeric_arrays::(array), - (Float64, Int8) => cast_numeric_arrays::(array), - (Float64, Int16) => cast_numeric_arrays::(array), - (Float64, Int32) => cast_numeric_arrays::(array), - (Float64, Int64) => cast_numeric_arrays::(array), - (Float64, Float32) => cast_numeric_arrays::(array), + (UInt8, UInt16) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt8, UInt32) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt8, UInt64) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt8, Int8) => cast_numeric_arrays::(array, cast_options), + (UInt8, Int16) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt8, Int32) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt8, Int64) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt8, Float32) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt8, Float64) => { + cast_numeric_arrays::(array, cast_options) + } + + (UInt16, UInt8) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt16, UInt32) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt16, UInt64) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt16, Int8) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt16, Int16) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt16, Int32) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt16, Int64) => { + cast_numeric_arrays::(array, 
cast_options) + } + (UInt16, Float32) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt16, Float64) => { + cast_numeric_arrays::(array, cast_options) + } + + (UInt32, UInt8) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt32, UInt16) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt32, UInt64) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt32, Int8) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt32, Int16) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt32, Int32) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt32, Int64) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt32, Float32) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt32, Float64) => { + cast_numeric_arrays::(array, cast_options) + } + + (UInt64, UInt8) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt64, UInt16) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt64, UInt32) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt64, Int8) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt64, Int16) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt64, Int32) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt64, Int64) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt64, Float32) => { + cast_numeric_arrays::(array, cast_options) + } + (UInt64, Float64) => { + cast_numeric_arrays::(array, cast_options) + } + + (Int8, UInt8) => cast_numeric_arrays::(array, cast_options), + (Int8, UInt16) => { + cast_numeric_arrays::(array, cast_options) + } + (Int8, UInt32) => { + cast_numeric_arrays::(array, cast_options) + } + (Int8, UInt64) => { + cast_numeric_arrays::(array, cast_options) + } + (Int8, Int16) => cast_numeric_arrays::(array, cast_options), + (Int8, Int32) => cast_numeric_arrays::(array, cast_options), + (Int8, Int64) => cast_numeric_arrays::(array, cast_options), + (Int8, Float32) => { + cast_numeric_arrays::(array, cast_options) + } + (Int8, Float64) => { + cast_numeric_arrays::(array, cast_options) + } + + (Int16, UInt8) => { + cast_numeric_arrays::(array, cast_options) + } + (Int16, UInt16) => { + cast_numeric_arrays::(array, cast_options) + } + (Int16, UInt32) => { + cast_numeric_arrays::(array, cast_options) + } + (Int16, UInt64) => { + cast_numeric_arrays::(array, cast_options) + } + (Int16, Int8) => cast_numeric_arrays::(array, cast_options), + (Int16, Int32) => { + cast_numeric_arrays::(array, cast_options) + } + (Int16, Int64) => { + cast_numeric_arrays::(array, cast_options) + } + (Int16, Float32) => { + cast_numeric_arrays::(array, cast_options) + } + (Int16, Float64) => { + cast_numeric_arrays::(array, cast_options) + } + + (Int32, UInt8) => { + cast_numeric_arrays::(array, cast_options) + } + (Int32, UInt16) => { + cast_numeric_arrays::(array, cast_options) + } + (Int32, UInt32) => { + cast_numeric_arrays::(array, cast_options) + } + (Int32, UInt64) => { + cast_numeric_arrays::(array, cast_options) + } + (Int32, Int8) => cast_numeric_arrays::(array, cast_options), + (Int32, Int16) => { + cast_numeric_arrays::(array, cast_options) + } + (Int32, Int64) => { + cast_numeric_arrays::(array, cast_options) + } + (Int32, Float32) => { + cast_numeric_arrays::(array, cast_options) + } + (Int32, Float64) => { + cast_numeric_arrays::(array, cast_options) + } + + (Int64, UInt8) => { + cast_numeric_arrays::(array, cast_options) + } + (Int64, UInt16) => { + cast_numeric_arrays::(array, cast_options) + } + (Int64, UInt32) => { + 
cast_numeric_arrays::(array, cast_options) + } + (Int64, UInt64) => { + cast_numeric_arrays::(array, cast_options) + } + (Int64, Int8) => cast_numeric_arrays::(array, cast_options), + (Int64, Int16) => { + cast_numeric_arrays::(array, cast_options) + } + (Int64, Int32) => { + cast_numeric_arrays::(array, cast_options) + } + (Int64, Float32) => { + cast_numeric_arrays::(array, cast_options) + } + (Int64, Float64) => { + cast_numeric_arrays::(array, cast_options) + } + + (Float32, UInt8) => { + cast_numeric_arrays::(array, cast_options) + } + (Float32, UInt16) => { + cast_numeric_arrays::(array, cast_options) + } + (Float32, UInt32) => { + cast_numeric_arrays::(array, cast_options) + } + (Float32, UInt64) => { + cast_numeric_arrays::(array, cast_options) + } + (Float32, Int8) => { + cast_numeric_arrays::(array, cast_options) + } + (Float32, Int16) => { + cast_numeric_arrays::(array, cast_options) + } + (Float32, Int32) => { + cast_numeric_arrays::(array, cast_options) + } + (Float32, Int64) => { + cast_numeric_arrays::(array, cast_options) + } + (Float32, Float64) => { + cast_numeric_arrays::(array, cast_options) + } + + (Float64, UInt8) => { + cast_numeric_arrays::(array, cast_options) + } + (Float64, UInt16) => { + cast_numeric_arrays::(array, cast_options) + } + (Float64, UInt32) => { + cast_numeric_arrays::(array, cast_options) + } + (Float64, UInt64) => { + cast_numeric_arrays::(array, cast_options) + } + (Float64, Int8) => { + cast_numeric_arrays::(array, cast_options) + } + (Float64, Int16) => { + cast_numeric_arrays::(array, cast_options) + } + (Float64, Int32) => { + cast_numeric_arrays::(array, cast_options) + } + (Float64, Int64) => { + cast_numeric_arrays::(array, cast_options) + } + (Float64, Float32) => { + cast_numeric_arrays::(array, cast_options) + } // end numeric casts // temporal casts @@ -1229,9 +1393,10 @@ pub fn cast_with_options( } (Duration(_), Int64) => cast_array_data::(array, to_type.clone()), (Interval(from_type), Int64) => match from_type { - IntervalUnit::YearMonth => { - cast_numeric_arrays::(array) - } + IntervalUnit::YearMonth => cast_numeric_arrays::< + IntervalYearMonthType, + Int64Type, + >(array, cast_options), IntervalUnit::DayTime => cast_array_data::(array, to_type.clone()), IntervalUnit::MonthDayNano => Err(ArrowError::CastError(format!( "Casting from {:?} to {:?} not supported", @@ -1456,21 +1621,62 @@ where } /// Convert Array into a PrimitiveArray of type, and apply numeric cast -fn cast_numeric_arrays(from: &ArrayRef) -> Result +fn cast_numeric_arrays( + from: &ArrayRef, + cast_options: &CastOptions, +) -> Result where FROM: ArrowNumericType, TO: ArrowNumericType, FROM::Native: num::NumCast, TO::Native: num::NumCast, { - Ok(Arc::new(numeric_cast::( - from.as_any() - .downcast_ref::>() - .unwrap(), - ))) + if cast_options.safe { + // If the value can't be casted to the `TO::Native`, return null + Ok(Arc::new(numeric_cast::( + from.as_any() + .downcast_ref::>() + .unwrap(), + ))) + } else { + // If the value can't be casted to the `TO::Native`, return error + Ok(Arc::new(numeric_cast_with_error::( + from.as_any() + .downcast_ref::>() + .unwrap(), + )?)) + } } -/// Natural cast between numeric types +// Natural cast between numeric types +// If the value of T can't be casted to R, will throw error +fn numeric_cast_with_error(from: &PrimitiveArray) -> Result> +where + T: ArrowNumericType, + R: ArrowNumericType, + T::Native: num::NumCast, + R::Native: num::NumCast, +{ + let iter = from + .iter() + .map(|v| match v { + None => Ok(None), + Some(value) 
=> match num::cast::cast::(value) { + None => Err(ArrowError::CastError(format!( + "Can't cast value {:?} to type {}", + value, + R::DATA_TYPE + ))), + Some(v) => Ok(Some(v)), + }, + }) + .collect::>>>()?; + + Ok(unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }) +} + +// Natural cast between numeric types +// If the value of T can't be casted to R, it will be converted to null fn numeric_cast(from: &PrimitiveArray) -> PrimitiveArray where T: ArrowNumericType, @@ -3009,6 +3215,18 @@ mod tests { assert!(!c.is_valid(4)); } + #[test] + #[should_panic(expected = "Can't cast value -5 to type UInt8")] + fn test_cast_int32_to_u8_with_error() { + let a = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); + let array = Arc::new(a) as ArrayRef; + // overflow with the error + let cast_option = CastOptions { safe: false }; + let result = cast_with_options(&array, &DataType::UInt8, &cast_option); + assert!(result.is_err()); + result.unwrap(); + } + #[test] fn test_cast_i32_to_u8_sliced() { let a = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); From 59767849dcd4373e1473bc7f708ad904eff44e14 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 6 Sep 2022 00:55:58 -0700 Subject: [PATCH 0010/1411] ffi feature also requires layout (#2660) --- arrow/src/array/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 6ad2c26fee5d..7a60db4d91a1 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -194,8 +194,10 @@ pub use self::data::ArrayData; pub use self::data::ArrayDataBuilder; pub use self::data::ArrayDataRef; +#[cfg(any(feature = "ipc", feature = "ffi"))] +pub(crate) use self::data::layout; #[cfg(feature = "ipc")] -pub(crate) use self::data::{layout, BufferSpec}; +pub(crate) use self::data::BufferSpec; pub use self::array_binary::BinaryArray; pub use self::array_binary::LargeBinaryArray; From 31aaef2733ee2bb1804f9f38600a539604cf177d Mon Sep 17 00:00:00 2001 From: X <6884440+dingxiangfei2009@users.noreply.github.com> Date: Tue, 6 Sep 2022 18:07:13 +0800 Subject: [PATCH 0011/1411] Fully qualifying parquet items (#2638) * fully qualifying parquet items * rustfmt * remove glob import * remove unnecessary borrow * import repeated items * rustfmt --- parquet_derive/src/lib.rs | 30 +++---- parquet_derive/src/parquet_field.rs | 130 +++++++++++++--------------- parquet_derive_test/src/lib.rs | 11 +-- 3 files changed, 78 insertions(+), 93 deletions(-) diff --git a/parquet_derive/src/lib.rs b/parquet_derive/src/lib.rs index fc7af20ca3f1..6525513cbaa1 100644 --- a/parquet_derive/src/lib.rs +++ b/parquet_derive/src/lib.rs @@ -40,9 +40,8 @@ mod parquet_field; /// Example: /// /// ```ignore -/// use parquet; -/// use parquet::record::RecordWriter; -/// use parquet::schema::parser::parse_message_type; +/// use parquet::file::properties::WriterProperties; +/// use parquet::file::writer::SerializedFileWriter; /// /// use std::sync::Arc; // @@ -97,11 +96,13 @@ pub fn parquet_record_writer(input: proc_macro::TokenStream) -> proc_macro::Toke field_infos.iter().map(|x| x.parquet_type()).collect(); (quote! 
{ - impl #generics RecordWriter<#derived_for #generics> for &[#derived_for #generics] { - fn write_to_row_group( + impl #generics ::parquet::record::RecordWriter<#derived_for #generics> for &[#derived_for #generics] { + fn write_to_row_group( &self, - row_group_writer: &mut parquet::file::writer::SerializedRowGroupWriter<'_, W> - ) -> Result<(), parquet::errors::ParquetError> { + row_group_writer: &mut ::parquet::file::writer::SerializedRowGroupWriter<'_, W> + ) -> Result<(), ::parquet::errors::ParquetError> { + use ::parquet::column::writer::ColumnWriter; + let mut row_group_writer = row_group_writer; let records = &self; // Used by all the writer snippets to be more clear @@ -112,7 +113,7 @@ pub fn parquet_record_writer(input: proc_macro::TokenStream) -> proc_macro::Toke #writer_snippets column_writer.close()?; } else { - return Err(parquet::errors::ParquetError::General("Failed to get next column".into())) + return Err(::parquet::errors::ParquetError::General("Failed to get next column".into())) } } );* @@ -120,17 +121,16 @@ pub fn parquet_record_writer(input: proc_macro::TokenStream) -> proc_macro::Toke Ok(()) } - fn schema(&self) -> Result { - use parquet::schema::types::Type as ParquetType; - use parquet::schema::types::TypePtr; - use parquet::basic::LogicalType; - use parquet::basic::*; + fn schema(&self) -> Result<::parquet::schema::types::TypePtr, ::parquet::errors::ParquetError> { + use ::parquet::schema::types::Type as ParquetType; + use ::parquet::schema::types::TypePtr; + use ::parquet::basic::LogicalType; - let mut fields: Vec = Vec::new(); + let mut fields: ::std::vec::Vec = ::std::vec::Vec::new(); #( #field_types );*; - let group = parquet::schema::types::Type::group_type_builder("rust_schema") + let group = ParquetType::group_type_builder("rust_schema") .with_fields(&mut fields) .build()?; Ok(group.into()) diff --git a/parquet_derive/src/parquet_field.rs b/parquet_derive/src/parquet_field.rs index 835ac793e409..0642e23327f7 100644 --- a/parquet_derive/src/parquet_field.rs +++ b/parquet_derive/src/parquet_field.rs @@ -181,28 +181,28 @@ impl Field { let field_name = &self.ident.to_string(); let physical_type = match self.ty.physical_type() { parquet::basic::Type::BOOLEAN => quote! { - parquet::basic::Type::BOOLEAN + ::parquet::basic::Type::BOOLEAN }, parquet::basic::Type::INT32 => quote! { - parquet::basic::Type::INT32 + ::parquet::basic::Type::INT32 }, parquet::basic::Type::INT64 => quote! { - parquet::basic::Type::INT64 + ::parquet::basic::Type::INT64 }, parquet::basic::Type::INT96 => quote! { - parquet::basic::Type::INT96 + ::parquet::basic::Type::INT96 }, parquet::basic::Type::FLOAT => quote! { - parquet::basic::Type::FLOAT + ::parquet::basic::Type::FLOAT }, parquet::basic::Type::DOUBLE => quote! { - parquet::basic::Type::DOUBLE + ::parquet::basic::Type::DOUBLE }, parquet::basic::Type::BYTE_ARRAY => quote! { - parquet::basic::Type::BYTE_ARRAY + ::parquet::basic::Type::BYTE_ARRAY }, parquet::basic::Type::FIXED_LEN_BYTE_ARRAY => quote! { - parquet::basic::Type::FIXED_LEN_BYTE_ARRAY + ::parquet::basic::Type::FIXED_LEN_BYTE_ARRAY }, }; let logical_type = self.ty.logical_type(); @@ -250,7 +250,7 @@ impl Field { let some = if is_a_timestamp { quote! { Some(inner.timestamp_millis()) } } else if is_a_date { - quote! { Some(inner.signed_duration_since(chrono::NaiveDate::from_ymd(1970, 1, 1)).num_days() as i32) } + quote! { Some(inner.signed_duration_since(::chrono::NaiveDate::from_ymd(1970, 1, 1)).num_days() as i32) } } else if is_a_uuid { quote! 
{ Some((&inner.to_string()[..]).into()) } } else if is_a_byte_buf { @@ -286,7 +286,7 @@ impl Field { let access = if is_a_timestamp { quote! { rec.#field_name.timestamp_millis() } } else if is_a_date { - quote! { rec.#field_name.signed_duration_since(chrono::NaiveDate::from_ymd(1970, 1, 1)).num_days() as i32 } + quote! { rec.#field_name.signed_duration_since(::chrono::NaiveDate::from_ymd(1970, 1, 1)).num_days() as i32 } } else if is_a_uuid { quote! { (&rec.#field_name.to_string()[..]).into() } } else if is_a_byte_buf { @@ -336,29 +336,19 @@ impl Type { match self.physical_type() { BasicType::BOOLEAN => { - syn::parse_quote!(parquet::column::writer::ColumnWriter::BoolColumnWriter) + syn::parse_quote!(ColumnWriter::BoolColumnWriter) + } + BasicType::INT32 => syn::parse_quote!(ColumnWriter::Int32ColumnWriter), + BasicType::INT64 => syn::parse_quote!(ColumnWriter::Int64ColumnWriter), + BasicType::INT96 => syn::parse_quote!(ColumnWriter::Int96ColumnWriter), + BasicType::FLOAT => syn::parse_quote!(ColumnWriter::FloatColumnWriter), + BasicType::DOUBLE => syn::parse_quote!(ColumnWriter::DoubleColumnWriter), + BasicType::BYTE_ARRAY => { + syn::parse_quote!(ColumnWriter::ByteArrayColumnWriter) + } + BasicType::FIXED_LEN_BYTE_ARRAY => { + syn::parse_quote!(ColumnWriter::FixedLenByteArrayColumnWriter) } - BasicType::INT32 => syn::parse_quote!( - parquet::column::writer::ColumnWriter::Int32ColumnWriter - ), - BasicType::INT64 => syn::parse_quote!( - parquet::column::writer::ColumnWriter::Int64ColumnWriter - ), - BasicType::INT96 => syn::parse_quote!( - parquet::column::writer::ColumnWriter::Int96ColumnWriter - ), - BasicType::FLOAT => syn::parse_quote!( - parquet::column::writer::ColumnWriter::FloatColumnWriter - ), - BasicType::DOUBLE => syn::parse_quote!( - parquet::column::writer::ColumnWriter::DoubleColumnWriter - ), - BasicType::BYTE_ARRAY => syn::parse_quote!( - parquet::column::writer::ColumnWriter::ByteArrayColumnWriter - ), - BasicType::FIXED_LEN_BYTE_ARRAY => syn::parse_quote!( - parquet::column::writer::ColumnWriter::FixedLenByteArrayColumnWriter - ), } } @@ -557,16 +547,18 @@ impl Type { let last_part = self.last_part(); match last_part.trim() { - "NaiveDateTime" => Some(quote! { ConvertedType::TIMESTAMP_MILLIS }), + "NaiveDateTime" => { + Some(quote! { ::parquet::basic::ConvertedType::TIMESTAMP_MILLIS }) + } _ => None, } } fn repetition(&self) -> proc_macro2::TokenStream { - match &self { - Type::Option(_) => quote! { Repetition::OPTIONAL }, + match self { + Type::Option(_) => quote! { ::parquet::basic::Repetition::OPTIONAL }, Type::Reference(_, ty) => ty.repetition(), - _ => quote! { Repetition::REQUIRED }, + _ => quote! { ::parquet::basic::Repetition::REQUIRED }, } } @@ -666,7 +658,7 @@ mod test { { let vals : Vec < _ > = records . iter ( ) . map ( | rec | rec . counter as i64 ) . collect ( ); - if let parquet::column::writer::ColumnWriter::Int64ColumnWriter ( ref mut typed ) = column_writer.untyped() { + if let ColumnWriter::Int64ColumnWriter ( ref mut typed ) = column_writer.untyped() { typed . write_batch ( & vals [ .. ] , None , None ) ?; } else { panic!("Schema and struct disagree on type for {}" , stringify!{ counter } ) @@ -703,7 +695,7 @@ mod test { } }).collect(); - if let parquet::column::writer::ColumnWriter::ByteArrayColumnWriter ( ref mut typed ) = column_writer.untyped() { + if let ColumnWriter::ByteArrayColumnWriter ( ref mut typed ) = column_writer.untyped() { typed . write_batch ( & vals [ .. ] , Some(&definition_levels[..]) , None ) ? 
; } else { panic!("Schema and struct disagree on type for {}" , stringify ! { optional_str } ) @@ -727,7 +719,7 @@ mod test { } }).collect(); - if let parquet::column::writer::ColumnWriter::ByteArrayColumnWriter ( ref mut typed ) = column_writer.untyped() { + if let ColumnWriter::ByteArrayColumnWriter ( ref mut typed ) = column_writer.untyped() { typed . write_batch ( & vals [ .. ] , Some(&definition_levels[..]) , None ) ? ; } else { panic!("Schema and struct disagree on type for {}" , stringify ! { optional_string } ) @@ -750,7 +742,7 @@ mod test { } }).collect(); - if let parquet::column::writer::ColumnWriter::Int32ColumnWriter ( ref mut typed ) = column_writer.untyped() { + if let ColumnWriter::Int32ColumnWriter ( ref mut typed ) = column_writer.untyped() { typed . write_batch ( & vals [ .. ] , Some(&definition_levels[..]) , None ) ? ; } else { panic!("Schema and struct disagree on type for {}" , stringify ! { optional_dumb_int } ) @@ -779,12 +771,8 @@ mod test { assert_eq!( column_writers, vec![ - syn::parse_quote!( - parquet::column::writer::ColumnWriter::BoolColumnWriter - ), - syn::parse_quote!( - parquet::column::writer::ColumnWriter::ByteArrayColumnWriter - ) + syn::parse_quote!(ColumnWriter::BoolColumnWriter), + syn::parse_quote!(ColumnWriter::ByteArrayColumnWriter) ] ); } @@ -833,9 +821,9 @@ mod test { let snippet: proc_macro2::TokenStream = quote! { struct LotsOfInnerTypes { a_vec: Vec, - a_option: std::option::Option, - a_silly_string: std::string::String, - a_complicated_thing: std::option::Option>, + a_option: ::std::option::Option, + a_silly_string: ::std::string::String, + a_complicated_thing: ::std::option::Option<::std::result::Result<(),()>>, } }; @@ -855,8 +843,8 @@ mod test { vec![ "u8", "bool", - "std :: string :: String", - "std :: result :: Result < () , () >" + ":: std :: string :: String", + ":: std :: result :: Result < () , () >" ] ) } @@ -866,13 +854,13 @@ mod test { use parquet::basic::Type as BasicType; let snippet: proc_macro2::TokenStream = quote! { struct LotsOfInnerTypes { - a_buf: Vec, + a_buf: ::std::vec::Vec, a_number: i32, - a_verbose_option: std::option::Option, - a_silly_string: std::string::String, + a_verbose_option: ::std::option::Option, + a_silly_string: String, a_fix_byte_buf: [u8; 10], - a_complex_option: Option<&Vec>, - a_complex_vec: &Vec<&Option>, + a_complex_option: ::std::option::Option<&Vec>, + a_complex_vec: &::std::vec::Vec<&Option>, } }; @@ -901,10 +889,10 @@ mod test { fn test_convert_comprehensive_owned_struct() { let snippet: proc_macro2::TokenStream = quote! 
{ struct VecHolder { - a_vec: Vec, - a_option: std::option::Option, - a_silly_string: std::string::String, - a_complicated_thing: std::option::Option>, + a_vec: ::std::vec::Vec, + a_option: ::std::option::Option, + a_silly_string: ::std::string::String, + a_complicated_thing: ::std::option::Option<::std::result::Result<(),()>>, } }; @@ -916,9 +904,9 @@ mod test { vec![ Type::Vec(Box::new(Type::TypePath(syn::parse_quote!(u8)))), Type::Option(Box::new(Type::TypePath(syn::parse_quote!(bool)))), - Type::TypePath(syn::parse_quote!(std::string::String)), + Type::TypePath(syn::parse_quote!(::std::string::String)), Type::Option(Box::new(Type::TypePath( - syn::parse_quote!(std::result::Result<(),()>) + syn::parse_quote!(::std::result::Result<(),()>) ))), ] ); @@ -975,7 +963,7 @@ mod test { assert_eq!(when.writer_snippet().to_string(),(quote!{ { let vals : Vec<_> = records.iter().map(|rec| rec.henceforth.timestamp_millis() ).collect(); - if let parquet::column::writer::ColumnWriter::Int64ColumnWriter(ref mut typed) = column_writer.untyped() { + if let ColumnWriter::Int64ColumnWriter(ref mut typed) = column_writer.untyped() { typed.write_batch(&vals[..], None, None) ?; } else { panic!("Schema and struct disagree on type for {}" , stringify!{ henceforth }) @@ -995,7 +983,7 @@ mod test { } }).collect(); - if let parquet::column::writer::ColumnWriter::Int64ColumnWriter(ref mut typed) = column_writer.untyped() { + if let ColumnWriter::Int64ColumnWriter(ref mut typed) = column_writer.untyped() { typed.write_batch(&vals[..], Some(&definition_levels[..]), None) ?; } else { panic!("Schema and struct disagree on type for {}" , stringify!{ maybe_happened }) @@ -1017,8 +1005,8 @@ mod test { let when = Field::from(&fields[0]); assert_eq!(when.writer_snippet().to_string(),(quote!{ { - let vals : Vec<_> = records.iter().map(|rec| rec.henceforth.signed_duration_since(chrono::NaiveDate::from_ymd(1970, 1, 1)).num_days() as i32).collect(); - if let parquet::column::writer::ColumnWriter::Int32ColumnWriter(ref mut typed) = column_writer.untyped() { + let vals : Vec<_> = records.iter().map(|rec| rec.henceforth.signed_duration_since(::chrono::NaiveDate::from_ymd(1970, 1, 1)).num_days() as i32).collect(); + if let ColumnWriter::Int32ColumnWriter(ref mut typed) = column_writer.untyped() { typed.write_batch(&vals[..], None, None) ?; } else { panic!("Schema and struct disagree on type for {}" , stringify!{ henceforth }) @@ -1032,13 +1020,13 @@ mod test { let definition_levels : Vec = self.iter().map(|rec| if rec.maybe_happened.is_some() { 1 } else { 0 }).collect(); let vals : Vec<_> = records.iter().filter_map(|rec| { if let Some(inner) = rec.maybe_happened { - Some(inner.signed_duration_since(chrono::NaiveDate::from_ymd(1970, 1, 1)).num_days() as i32) + Some(inner.signed_duration_since(::chrono::NaiveDate::from_ymd(1970, 1, 1)).num_days() as i32) } else { None } }).collect(); - if let parquet::column::writer::ColumnWriter::Int32ColumnWriter(ref mut typed) = column_writer.untyped() { + if let ColumnWriter::Int32ColumnWriter(ref mut typed) = column_writer.untyped() { typed.write_batch(&vals[..], Some(&definition_levels[..]), None) ?; } else { panic!("Schema and struct disagree on type for {}" , stringify!{ maybe_happened }) @@ -1061,7 +1049,7 @@ mod test { assert_eq!(when.writer_snippet().to_string(),(quote!{ { let vals : Vec<_> = records.iter().map(|rec| (&rec.unique_id.to_string()[..]).into() ).collect(); - if let parquet::column::writer::ColumnWriter::ByteArrayColumnWriter(ref mut typed) = column_writer.untyped() { + if let 
ColumnWriter::ByteArrayColumnWriter(ref mut typed) = column_writer.untyped() { typed.write_batch(&vals[..], None, None) ?; } else { panic!("Schema and struct disagree on type for {}" , stringify!{ unique_id }) @@ -1081,7 +1069,7 @@ mod test { } }).collect(); - if let parquet::column::writer::ColumnWriter::ByteArrayColumnWriter(ref mut typed) = column_writer.untyped() { + if let ColumnWriter::ByteArrayColumnWriter(ref mut typed) = column_writer.untyped() { typed.write_batch(&vals[..], Some(&definition_levels[..]), None) ?; } else { panic!("Schema and struct disagree on type for {}" , stringify!{ maybe_unique_id }) @@ -1105,7 +1093,7 @@ mod test { let converted_type = time.ty.converted_type(); assert_eq!( converted_type.unwrap().to_string(), - quote! { ConvertedType::TIMESTAMP_MILLIS }.to_string() + quote! { ::parquet::basic::ConvertedType::TIMESTAMP_MILLIS }.to_string() ); } } diff --git a/parquet_derive_test/src/lib.rs b/parquet_derive_test/src/lib.rs index 189802b9a527..746644793ff2 100644 --- a/parquet_derive_test/src/lib.rs +++ b/parquet_derive_test/src/lib.rs @@ -17,12 +17,7 @@ #![allow(clippy::approx_constant)] -extern crate parquet; - -#[macro_use] -extern crate parquet_derive; - -use parquet::record::RecordWriter; +use parquet_derive::ParquetRecordWriter; #[derive(ParquetRecordWriter)] struct ACompleteRecord<'a> { @@ -53,11 +48,13 @@ struct ACompleteRecord<'a> { mod tests { use super::*; + use std::{env, fs, io::Write, sync::Arc}; + use parquet::{ file::{properties::WriterProperties, writer::SerializedFileWriter}, + record::RecordWriter, schema::parser::parse_message_type, }; - use std::{env, fs, io::Write, sync::Arc}; #[test] fn test_parquet_derive_hello() { From a37c038c6a8076987c9e8f2a357e700d7edb4893 Mon Sep 17 00:00:00 2001 From: Dan Harris <1327726+thinkharderdev@users.noreply.github.com> Date: Tue, 6 Sep 2022 09:39:36 -0400 Subject: [PATCH 0012/1411] Fix generate random selection (#2656) --- parquet/src/arrow/async_reader.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parquet/src/arrow/async_reader.rs b/parquet/src/arrow/async_reader.rs index 564166ecbe5e..4d3b2732aa18 100644 --- a/parquet/src/arrow/async_reader.rs +++ b/parquet/src/arrow/async_reader.rs @@ -995,7 +995,6 @@ mod tests { } #[tokio::test] - #[ignore] async fn test_fuzz_async_reader_selection() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/alltypes_tiny_pages_plain.parquet", testdata); @@ -1015,7 +1014,7 @@ mod tests { let mut selectors = vec![]; while total_rows < 7300 { - let row_count: usize = rand.gen_range(0..100); + let row_count: usize = rand.gen_range(1..100); let row_count = row_count.min(7300 - total_rows); From 613b66b2217d8bf61f3e682539e9137b35b76db9 Mon Sep 17 00:00:00 2001 From: Kohei Suzuki Date: Wed, 7 Sep 2022 00:56:41 +0900 Subject: [PATCH 0013/1411] json feature always requires base64 feature (#2668) It should fix build failure when only json feature is enabled. ``` % cargo build --no-default-features --features json error[E0433]: failed to resolve: use of undeclared crate or module `base64` --> parquet/src/record/api.rs:691:46 | 691 | Field::Bytes(b) => Value::String(base64::encode(b.data())), | ^^^^^^ use of undeclared crate or module `base64` For more information about this error, try `rustc --explain E0433`. 
error: could not compile `parquet` due to previous error ``` --- parquet/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index eb03033c52df..6ddb3a615a2e 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -74,7 +74,7 @@ arrow = ["dep:arrow", "base64"] # Enable CLI tools cli = ["json", "base64", "clap", "arrow/csv"] # Enable JSON APIs -json = ["serde_json"] +json = ["serde_json", "base64"] # Enable internal testing APIs test_common = ["arrow/test_utils"] # Experimental, unstable functionality primarily used for testing From 463240adc92197c3f2260007cc6e5e574cbd1942 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 6 Sep 2022 09:06:58 -0700 Subject: [PATCH 0014/1411] Change macro to generic helper function (#2658) --- arrow/src/compute/kernels/aggregate.rs | 123 +++++++------------------ 1 file changed, 35 insertions(+), 88 deletions(-) diff --git a/arrow/src/compute/kernels/aggregate.rs b/arrow/src/compute/kernels/aggregate.rs index d7726fbf92aa..c2e3e8cc2577 100644 --- a/arrow/src/compute/kernels/aggregate.rs +++ b/arrow/src/compute/kernels/aggregate.rs @@ -41,7 +41,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeType, { - min_max_helper(array, |a, b| (is_nan(*a) & !is_nan(*b)) || a > b) + min_max_helper::(array, |a, b| (is_nan(*a) & !is_nan(*b)) || a > b) } /// Returns the maximum value in the array, according to the natural order. @@ -52,44 +52,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeType, { - min_max_helper(array, |a, b| (!is_nan(*a) & is_nan(*b)) || a < b) -} - -/// Helper function to perform min/max lambda function on values from a numeric array. -#[multiversion] -#[clone(target = "x86_64+avx")] -fn min_max_helper(array: &PrimitiveArray, cmp: F) -> Option -where - T: ArrowNumericType, - F: Fn(&T::Native, &T::Native) -> bool, -{ - let null_count = array.null_count(); - - // Includes case array.len() == 0 - if null_count == array.len() { - return None; - } - - let data = array.data(); - let m = array.values(); - let mut n; - - if null_count == 0 { - // optimized path for arrays without null values - n = m[1..] - .iter() - .fold(m[0], |max, item| if cmp(&max, item) { *item } else { max }); - } else { - n = T::default_value(); - let mut has_value = false; - for (i, item) in m.iter().enumerate() { - if data.is_valid(i) && (!has_value || cmp(&n, item)) { - has_value = true; - n = *item - } - } - } - Some(n) + min_max_helper::(array, |a, b| (!is_nan(*a) & is_nan(*b)) || a < b) } /// Returns the minimum value in the boolean array. @@ -142,46 +105,48 @@ pub fn max_boolean(array: &BooleanArray) -> Option { .or(Some(false)) } -/// Helper to compute min/max of [`GenericStringArray`] and [`GenericBinaryArray`] -macro_rules! min_max_binary_string { - ($array: expr, $cmp: expr) => {{ - let null_count = $array.null_count(); - if null_count == $array.len() { - None - } else if null_count == 0 { - // JUSTIFICATION - // Benefit: ~8% speedup - // Soundness: `i` is always within the array bounds - (0..$array.len()) - .map(|i| unsafe { $array.value_unchecked(i) }) - .reduce(|acc, item| if $cmp(acc, item) { item } else { acc }) - } else { - $array - .iter() - .flatten() - .reduce(|acc, item| if $cmp(acc, item) { item } else { acc }) - } - }}; +/// Helper to compute min/max of [`ArrayAccessor`]. 
+#[multiversion] +#[clone(target = "x86_64+avx")] +fn min_max_helper, F>(array: A, cmp: F) -> Option +where + F: Fn(&T, &T) -> bool, +{ + let null_count = array.null_count(); + if null_count == array.len() { + None + } else if null_count == 0 { + // JUSTIFICATION + // Benefit: ~8% speedup + // Soundness: `i` is always within the array bounds + (0..array.len()) + .map(|i| unsafe { array.value_unchecked(i) }) + .reduce(|acc, item| if cmp(&acc, &item) { item } else { acc }) + } else { + let iter = ArrayIter::new(array); + iter.flatten() + .reduce(|acc, item| if cmp(&acc, &item) { item } else { acc }) + } } /// Returns the maximum value in the binary array, according to the natural order. pub fn max_binary(array: &GenericBinaryArray) -> Option<&[u8]> { - min_max_binary_string!(array, |a, b| a < b) + min_max_helper::<&[u8], _, _>(array, |a, b| *a < *b) } /// Returns the minimum value in the binary array, according to the natural order. pub fn min_binary(array: &GenericBinaryArray) -> Option<&[u8]> { - min_max_binary_string!(array, |a, b| a > b) + min_max_helper::<&[u8], _, _>(array, |a, b| *a > *b) } /// Returns the maximum value in the string array, according to the natural order. pub fn max_string(array: &GenericStringArray) -> Option<&str> { - min_max_binary_string!(array, |a, b| a < b) + min_max_helper::<&str, _, _>(array, |a, b| *a < *b) } /// Returns the minimum value in the string array, according to the natural order. pub fn min_string(array: &GenericStringArray) -> Option<&str> { - min_max_binary_string!(array, |a, b| a > b) + min_max_helper::<&str, _, _>(array, |a, b| *a > *b) } /// Returns the sum of values in the array. @@ -215,7 +180,8 @@ where } } -/// Returns the min of values in the array. +/// Returns the min of values in the array of `ArrowNumericType` type, or dictionary +/// array with value of `ArrowNumericType` type. pub fn min_array>(array: A) -> Option where T: ArrowNumericType, @@ -223,12 +189,13 @@ where { min_max_array_helper::( array, - |a, b| (!is_nan(*a) & is_nan(*b)) || a < b, + |a, b| (is_nan(*a) & !is_nan(*b)) || a > b, min, ) } -/// Returns the max of values in the array. +/// Returns the max of values in the array of `ArrowNumericType` type, or dictionary +/// array with value of `ArrowNumericType` type. 
pub fn max_array>(array: A) -> Option where T: ArrowNumericType, @@ -236,7 +203,7 @@ where { min_max_array_helper::( array, - |a, b| (is_nan(*a) & !is_nan(*b)) || a > b, + |a, b| (!is_nan(*a) & is_nan(*b)) || a < b, max, ) } @@ -252,27 +219,7 @@ where M: Fn(&PrimitiveArray) -> Option, { match array.data_type() { - DataType::Dictionary(_, _) => { - let null_count = array.null_count(); - - if null_count == array.len() { - return None; - } - - let mut has_value = false; - let mut n = T::default_value(); - let iter = ArrayIter::new(array); - iter.into_iter().for_each(|value| { - if let Some(value) = value { - if !has_value || cmp(&value, &n) { - has_value = true; - n = value; - } - } - }); - - Some(n) - } + DataType::Dictionary(_, _) => min_max_helper::(array, cmp), _ => m(as_primitive_array(&array)), } } From 0c85233bbcaa05f675a3f7c1625ba2dcd37c193b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 6 Sep 2022 21:02:53 +0100 Subject: [PATCH 0015/1411] Use downcast_dictionary_array in unary_dyn (#2663) * Use downcast_dictionary_array in unary_dyn * Further cleanups * Clippy --- arrow/src/compute/kernels/arity.rs | 125 ++++++----------------------- 1 file changed, 23 insertions(+), 102 deletions(-) diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index be9d56ebb19b..89151c286343 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -19,10 +19,8 @@ use crate::array::{Array, ArrayData, ArrayRef, DictionaryArray, PrimitiveArray}; use crate::buffer::Buffer; -use crate::datatypes::{ - ArrowNumericType, ArrowPrimitiveType, DataType, Int16Type, Int32Type, Int64Type, - Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type, -}; +use crate::datatypes::{ArrowNumericType, ArrowPrimitiveType}; +use crate::downcast_dictionary_array; use crate::error::{ArrowError, Result}; use std::sync::Arc; @@ -31,14 +29,13 @@ fn into_primitive_array_data( array: &PrimitiveArray, buffer: Buffer, ) -> ArrayData { + let data = array.data(); unsafe { ArrayData::new_unchecked( O::DATA_TYPE, array.len(), - None, - array - .data_ref() - .null_buffer() + Some(data.null_count()), + data.null_buffer() .map(|b| b.bit_slice(array.offset(), array.len())), 0, vec![buffer], @@ -84,39 +81,15 @@ where } /// A helper function that applies an unary function to a dictionary array with primitive value type. 
-#[allow(clippy::redundant_closure)] fn unary_dict(array: &DictionaryArray, op: F) -> Result where K: ArrowNumericType, T: ArrowPrimitiveType, F: Fn(T::Native) -> T::Native, { - let dict_values = array - .values() - .as_any() - .downcast_ref::>() - .unwrap(); - - let values = dict_values - .iter() - .map(|v| v.map(|value| op(value))) - .collect::>(); - - let keys = array.keys(); - - let mut data = ArrayData::builder(array.data_type().clone()) - .len(keys.len()) - .add_buffer(keys.data().buffers()[0].clone()) - .add_child_data(values.data().clone()); - - match keys.data().null_buffer() { - Some(buffer) if keys.data().null_count() > 0 => { - data = data - .null_bit_buffer(Some(buffer.clone())) - .null_count(keys.data().null_count()); - } - _ => data = data.null_count(0), - } + let dict_values = array.values().as_any().downcast_ref().unwrap(); + let values = unary::(dict_values, op).into_data(); + let data = array.data().clone().into_builder().child_data(vec![values]); let new_dict: DictionaryArray = unsafe { data.build_unchecked() }.into(); Ok(Arc::new(new_dict)) @@ -128,73 +101,21 @@ where T: ArrowPrimitiveType, F: Fn(T::Native) -> T::Native, { - match array.data_type() { - DataType::Dictionary(key_type, _) => match key_type.as_ref() { - DataType::Int8 => unary_dict::<_, F, T>( - array - .as_any() - .downcast_ref::>() - .unwrap(), - op, - ), - DataType::Int16 => unary_dict::<_, F, T>( - array - .as_any() - .downcast_ref::>() - .unwrap(), - op, - ), - DataType::Int32 => unary_dict::<_, F, T>( - array - .as_any() - .downcast_ref::>() - .unwrap(), - op, - ), - DataType::Int64 => unary_dict::<_, F, T>( - array - .as_any() - .downcast_ref::>() - .unwrap(), - op, - ), - DataType::UInt8 => unary_dict::<_, F, T>( - array - .as_any() - .downcast_ref::>() - .unwrap(), - op, - ), - DataType::UInt16 => unary_dict::<_, F, T>( - array - .as_any() - .downcast_ref::>() - .unwrap(), - op, - ), - DataType::UInt32 => unary_dict::<_, F, T>( - array - .as_any() - .downcast_ref::>() - .unwrap(), - op, - ), - DataType::UInt64 => unary_dict::<_, F, T>( - array - .as_any() - .downcast_ref::>() - .unwrap(), - op, - ), - t => Err(ArrowError::NotYetImplemented(format!( - "Cannot perform unary operation on dictionary array of key type {}.", - t - ))), - }, - _ => Ok(Arc::new(unary::( - array.as_any().downcast_ref::>().unwrap(), - op, - ))), + downcast_dictionary_array! 
{ + array => unary_dict::<_, F, T>(array, op), + t => { + if t == &T::DATA_TYPE { + Ok(Arc::new(unary::( + array.as_any().downcast_ref::>().unwrap(), + op, + ))) + } else { + Err(ArrowError::NotYetImplemented(format!( + "Cannot perform unary operation on array of type {}", + t + ))) + } + } } } From 4e65952f4441946f04a374eda68dc0de71d216dd Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 6 Sep 2022 13:31:41 -0700 Subject: [PATCH 0016/1411] Support comparison between dictionary array and binary array (#2645) * Support comparison between dictionary array and binary array * Use downcast_dictionary_array --- arrow/src/compute/kernels/comparison.rs | 210 +++++++++++++++++++----- 1 file changed, 165 insertions(+), 45 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index cba2d6e7dfd9..978a2d9f4d34 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -34,6 +34,8 @@ use crate::datatypes::{ TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; +#[allow(unused_imports)] +use crate::downcast_dictionary_array; use crate::error::{ArrowError, Result}; use crate::util::bit_util; use regex::Regex; @@ -2172,50 +2174,6 @@ macro_rules! typed_dict_string_array_cmp { }}; } -#[cfg(feature = "dyn_cmp_dict")] -macro_rules! typed_dict_boolean_array_cmp { - ($LEFT: expr, $RIGHT: expr, $LEFT_KEY_TYPE: expr, $OP: expr) => {{ - match $LEFT_KEY_TYPE { - DataType::Int8 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_boolean_array::<_, _>(left, $RIGHT, $OP) - } - DataType::Int16 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_boolean_array::<_, _>(left, $RIGHT, $OP) - } - DataType::Int32 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_boolean_array::<_, _>(left, $RIGHT, $OP) - } - DataType::Int64 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_boolean_array::<_, _>(left, $RIGHT, $OP) - } - DataType::UInt8 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_boolean_array::<_, _>(left, $RIGHT, $OP) - } - DataType::UInt16 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_boolean_array::<_, _>(left, $RIGHT, $OP) - } - DataType::UInt32 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_boolean_array::<_, _>(left, $RIGHT, $OP) - } - DataType::UInt64 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_boolean_array::<_, _>(left, $RIGHT, $OP) - } - t => Err(ArrowError::NotYetImplemented(format!( - "Cannot compare dictionary array of key type {}", - t - ))), - } - }}; -} - #[cfg(feature = "dyn_cmp_dict")] macro_rules! typed_cmp_dict_non_dict { ($LEFT: expr, $RIGHT: expr, $OP_BOOL: expr, $OP: expr, $OP_FLOAT: expr) => {{ @@ -2223,7 +2181,16 @@ macro_rules! typed_cmp_dict_non_dict { (DataType::Dictionary(left_key_type, left_value_type), right_type) => { match (left_value_type.as_ref(), right_type) { (DataType::Boolean, DataType::Boolean) => { - typed_dict_boolean_array_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), $OP_BOOL) + let left = $LEFT; + downcast_dictionary_array!( + left => { + cmp_dict_boolean_array::<_, _>(left, $RIGHT, $OP) + } + _ => Err(ArrowError::NotYetImplemented(format!( + "Cannot compare dictionary array of key type {}", + left_key_type.as_ref() + ))), + ) } (DataType::Int8, DataType::Int8) => { typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Int8Type, $OP_BOOL, $OP) @@ -2261,6 +2228,30 @@ macro_rules! 
typed_cmp_dict_non_dict { (DataType::LargeUtf8, DataType::LargeUtf8) => { typed_dict_string_array_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), i64, $OP) } + (DataType::Binary, DataType::Binary) => { + let left = $LEFT; + downcast_dictionary_array!( + left => { + cmp_dict_binary_array::<_, i32, _>(left, $RIGHT, $OP) + } + _ => Err(ArrowError::NotYetImplemented(format!( + "Cannot compare dictionary array of key type {}", + left_key_type.as_ref() + ))), + ) + } + (DataType::LargeBinary, DataType::LargeBinary) => { + let left = $LEFT; + downcast_dictionary_array!( + left => { + cmp_dict_binary_array::<_, i64, _>(left, $RIGHT, $OP) + } + _ => Err(ArrowError::NotYetImplemented(format!( + "Cannot compare dictionary array of key type {}", + left_key_type.as_ref() + ))), + ) + } (t1, t2) if t1 == t2 => Err(ArrowError::NotYetImplemented(format!( "Comparing dictionary array of type {} with array of type {} is not yet implemented", t1, t2 @@ -2672,6 +2663,29 @@ where ) } +/// Perform given operation on `DictionaryArray` and `GenericBinaryArray`. The value +/// type of `DictionaryArray` is same as `GenericBinaryArray`'s type. +#[cfg(feature = "dyn_cmp_dict")] +fn cmp_dict_binary_array( + left: &DictionaryArray, + right: &dyn Array, + op: F, +) -> Result +where + K: ArrowNumericType, + F: Fn(&[u8], &[u8]) -> bool, +{ + compare_op( + left.downcast_dict::>() + .unwrap(), + right + .as_any() + .downcast_ref::>() + .unwrap(), + op, + ) +} + /// Perform given operation on two `DictionaryArray`s which value type is /// primitive type. Returns an error if the two arrays have different value /// type @@ -6149,6 +6163,112 @@ mod tests { ); } + #[test] + #[cfg(feature = "dyn_cmp_dict")] + fn test_eq_dyn_neq_dyn_dictionary_to_binary_array() { + let values: BinaryArray = ["hello", "", "parquet"] + .into_iter() + .map(|b| Some(b.as_bytes())) + .collect(); + + let keys = UInt64Array::from(vec![Some(0_u64), None, Some(2), Some(2)]); + let dict_array = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let array: BinaryArray = ["hello", "", "parquet", "test"] + .into_iter() + .map(|b| Some(b.as_bytes())) + .collect(); + + let result = eq_dyn(&dict_array, &array); + assert_eq!( + result.unwrap(), + BooleanArray::from(vec![Some(true), None, Some(true), Some(false)]) + ); + + let result = eq_dyn(&array, &dict_array); + assert_eq!( + result.unwrap(), + BooleanArray::from(vec![Some(true), None, Some(true), Some(false)]) + ); + + let result = neq_dyn(&dict_array, &array); + assert_eq!( + result.unwrap(), + BooleanArray::from(vec![Some(false), None, Some(false), Some(true)]) + ); + + let result = neq_dyn(&array, &dict_array); + assert_eq!( + result.unwrap(), + BooleanArray::from(vec![Some(false), None, Some(false), Some(true)]) + ); + } + + #[test] + #[cfg(feature = "dyn_cmp_dict")] + fn test_lt_dyn_lt_eq_dyn_gt_dyn_gt_eq_dyn_dictionary_to_binary_array() { + let values: BinaryArray = ["hello", "", "parquet"] + .into_iter() + .map(|b| Some(b.as_bytes())) + .collect(); + + let keys = UInt64Array::from(vec![Some(0_u64), None, Some(2), Some(2)]); + let dict_array = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let array: BinaryArray = ["hello", "", "parquet", "test"] + .into_iter() + .map(|b| Some(b.as_bytes())) + .collect(); + + let result = lt_dyn(&dict_array, &array); + assert_eq!( + result.unwrap(), + BooleanArray::from(vec![Some(false), None, Some(false), Some(true)]) + ); + + let result = lt_dyn(&array, &dict_array); + assert_eq!( + result.unwrap(), + BooleanArray::from(vec![Some(false), None, Some(false), 
Some(false)]) + ); + + let result = lt_eq_dyn(&dict_array, &array); + assert_eq!( + result.unwrap(), + BooleanArray::from(vec![Some(true), None, Some(true), Some(true)]) + ); + + let result = lt_eq_dyn(&array, &dict_array); + assert_eq!( + result.unwrap(), + BooleanArray::from(vec![Some(true), None, Some(true), Some(false)]) + ); + + let result = gt_dyn(&dict_array, &array); + assert_eq!( + result.unwrap(), + BooleanArray::from(vec![Some(false), None, Some(false), Some(false)]) + ); + + let result = gt_dyn(&array, &dict_array); + assert_eq!( + result.unwrap(), + BooleanArray::from(vec![Some(false), None, Some(false), Some(true)]) + ); + + let result = gt_eq_dyn(&dict_array, &array); + assert_eq!( + result.unwrap(), + BooleanArray::from(vec![Some(true), None, Some(true), Some(false)]) + ); + + let result = gt_eq_dyn(&array, &dict_array); + assert_eq!( + result.unwrap(), + BooleanArray::from(vec![Some(true), None, Some(true), Some(true)]) + ); + } + #[test] fn test_dict_nlike_kernels() { let data = From d73d78f0bb3dd06ea2f7c5975df53a91a20f2e69 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 6 Sep 2022 15:17:28 -0700 Subject: [PATCH 0017/1411] Add ffi, json compilation check (#2671) * Add ffi compilation check * Add compilation check for json --- .github/workflows/arrow.yml | 3 +++ .github/workflows/parquet.yml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index d34ee3b49b5c..d81a551a3b49 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -97,6 +97,9 @@ jobs: - name: Check compilation --no-default-features --all-targets --features test_utils run: | cargo check -p arrow --no-default-features --all-targets --features test_utils + - name: Check compilation --no-default-features --all-targets --features --ffi + run: | + cargo check -p arrow --no-default-features --all-targets --features ffi # test the --features "simd" of the arrow crate. This requires nightly Rust. 
linux-test-simd: diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 42cb06bb0a86..8497db798a97 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -111,6 +111,9 @@ jobs: - name: Check compilation --all-targets --all-features run: | cargo check -p parquet --all-targets --all-features + - name: Check compilation --all-targets --no-default-features --features json + run: | + cargo check -p parquet --all-targets --no-default-features --features json clippy: name: Clippy From c8bf1ca39d28d2983feb2639309bac0f5318b04a Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 7 Sep 2022 00:11:01 -0700 Subject: [PATCH 0018/1411] Support DictionaryArray in temporal kernels (#2623) * Support dictionary array in temporal kernels * Support dictionary array in temporal kernels * Prepare for merging conflicts * Keep same kernel signature for primitive array * For review * Add doc --- arrow/src/array/array_primitive.rs | 4 +- arrow/src/array/mod.rs | 2 + arrow/src/compute/kernels/cast.rs | 19 +- arrow/src/compute/kernels/temporal.rs | 871 ++++++++++++++++++++------ 4 files changed, 697 insertions(+), 199 deletions(-) diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs index 7818e6ff01d5..57168b7b9e60 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow/src/array/array_primitive.rs @@ -206,7 +206,7 @@ impl<'a, T: ArrowPrimitiveType> ArrayAccessor for &'a PrimitiveArray { } } -fn as_datetime(v: i64) -> Option { +pub(crate) fn as_datetime(v: i64) -> Option { match T::DATA_TYPE { DataType::Date32 => Some(temporal_conversions::date32_to_datetime(v as i32)), DataType::Date64 => Some(temporal_conversions::date64_to_datetime(v)), @@ -233,7 +233,7 @@ fn as_date(v: i64) -> Option { as_datetime::(v).map(|datetime| datetime.date()) } -fn as_time(v: i64) -> Option { +pub(crate) fn as_time(v: i64) -> Option { match T::DATA_TYPE { DataType::Time32(unit) => { // safe to immediately cast to u32 as `self.value(i)` is positive i32 diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 7a60db4d91a1..8e9bc20b4487 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -223,6 +223,8 @@ pub use self::array::make_array; pub use self::array::new_empty_array; pub use self::array::new_null_array; +pub(crate) use self::array_primitive::{as_datetime, as_time}; + /// /// # Example: Using `collect` /// ``` diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index d451484028fd..71a4fcc955ba 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -42,6 +42,7 @@ use std::ops::{Div, Mul}; use std::str; use std::sync::Arc; +use crate::array::as_datetime; use crate::buffer::MutableBuffer; use crate::compute::divide_scalar; use crate::compute::kernels::arithmetic::{divide, multiply}; @@ -1708,21 +1709,31 @@ where if let Some(tz) = tz { let mut scratch = Parsed::new(); - // The macro calls `value_as_datetime_with_tz` on timestamp values of the array. + // The macro calls `as_datetime` on timestamp values of the array. // After applying timezone offset on the datatime, calling `to_string` to get // the strings. + let iter = ArrayIter::new(array); extract_component_from_array!( - array, + iter, builder, to_string, - value_as_datetime_with_tz, + |value, tz| as_datetime::(>::from(value)) + .map(|datetime| datetime + tz), tz, scratch, + |value| as_datetime::(>::from(value)), |h| h ) } else { // No timezone available. 
Calling `to_string` on the datatime value simply. - extract_component_from_array!(array, builder, to_string, value_as_datetime, |h| h) + let iter = ArrayIter::new(array); + extract_component_from_array!( + iter, + builder, + to_string, + |value| as_datetime::(>::from(value)), + |h| h + ) } Ok(Arc::new(builder.finish()) as ArrayRef) diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index 1bec1d84f681..f46cf7f5ab5a 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -19,6 +19,7 @@ use chrono::{Datelike, Timelike}; +use crate::array::as_datetime; use crate::array::*; use crate::datatypes::*; use crate::error::{ArrowError, Result}; @@ -28,40 +29,40 @@ use chrono::format::{parse, Parsed}; use chrono::FixedOffset; macro_rules! extract_component_from_array { - ($array:ident, $builder:ident, $extract_fn:ident, $using:ident, $convert:expr) => { - for i in 0..$array.len() { - if $array.is_null(i) { - $builder.append_null(); - } else { - match $array.$using(i) { + ($iter:ident, $builder:ident, $extract_fn:ident, $using:expr, $convert:expr) => { + $iter.into_iter().for_each(|value| { + if let Some(value) = value { + match $using(value) { Some(dt) => $builder.append_value($convert(dt.$extract_fn())), None => $builder.append_null(), } + } else { + $builder.append_null(); } - } + }) }; - ($array:ident, $builder:ident, $extract_fn1:ident, $extract_fn2:ident, $using:ident, $convert:expr) => { - for i in 0..$array.len() { - if $array.is_null(i) { - $builder.append_null(); - } else { - match $array.$using(i) { + ($iter:ident, $builder:ident, $extract_fn1:ident, $extract_fn2:ident, $using:expr, $convert:expr) => { + $iter.into_iter().for_each(|value| { + if let Some(value) = value { + match $using(value) { Some(dt) => { $builder.append_value($convert(dt.$extract_fn1().$extract_fn2())); } None => $builder.append_null(), } + } else { + $builder.append_null(); } - } + }) }; - ($array:ident, $builder:ident, $extract_fn:ident, $using:ident, $tz:ident, $parsed:ident, $convert:expr) => { + ($iter:ident, $builder:ident, $extract_fn:ident, $using:expr, $tz:ident, $parsed:ident, $value_as_datetime:expr, $convert:expr) => { if ($tz.starts_with('+') || $tz.starts_with('-')) && !$tz.contains(':') { return_compute_error_with!( "Invalid timezone", "Expected format [+-]XX:XX".to_string() ) } else { - let tz_parse_result = parse(&mut $parsed, $tz, StrftimeItems::new("%z")); + let tz_parse_result = parse(&mut $parsed, &$tz, StrftimeItems::new("%z")); let fixed_offset_from_parsed = match tz_parse_result { Ok(_) => match $parsed.to_fixed_offset() { Ok(fo) => Some(fo), @@ -70,16 +71,14 @@ macro_rules! extract_component_from_array { _ => None, }; - for i in 0..$array.len() { - if $array.is_null(i) { - $builder.append_null(); - } else { - match $array.value_as_datetime(i) { + for value in $iter.into_iter() { + if let Some(value) = value { + match $value_as_datetime(value) { Some(utc) => { let fixed_offset = match fixed_offset_from_parsed { Some(fo) => fo, None => match using_chrono_tz_and_utc_naive_date_time( - $tz, utc, + &$tz, utc, ) { Some(fo) => fo, err => return_compute_error_with!( @@ -88,7 +87,7 @@ macro_rules! extract_component_from_array { ), }, }; - match $array.$using(i, fixed_offset) { + match $using(value, fixed_offset) { Some(dt) => { $builder.append_value($convert(dt.$extract_fn())); } @@ -100,6 +99,8 @@ macro_rules! 
extract_component_from_array { err ), } + } else { + $builder.append_null(); } } } @@ -171,335 +172,752 @@ pub fn using_chrono_tz_and_utc_naive_date_time( .ok() } -/// Extracts the hours of a given temporal array as an array of integers +/// Extracts the hours of a given temporal primitive array as an array of integers within +/// the range of [0, 23]. pub fn hour(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + hour_generic::(array) +} + +/// Extracts the hours of a given temporal array as an array of integers within +/// the range of [0, 23]. +pub fn hour_generic>(array: A) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + match array.data_type().clone() { + DataType::Dictionary(_, value_type) => { + hour_internal::(array, value_type.as_ref()) + } + dt => hour_internal::(array, &dt), + } +} + +/// Extracts the hours of a given temporal array as an array of integers +fn hour_internal>( + array: A, + dt: &DataType, +) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { let mut b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - &DataType::Time32(_) | &DataType::Time64(_) => { - extract_component_from_array!(array, b, hour, value_as_time, |h| h as i32) + match dt { + DataType::Time32(_) | DataType::Time64(_) => { + let iter = ArrayIter::new(array); + extract_component_from_array!( + iter, + b, + hour, + |value| as_time::(i64::from(value)), + |h| h as i32 + ); } - &DataType::Date32 | &DataType::Date64 | &DataType::Timestamp(_, None) => { - extract_component_from_array!(array, b, hour, value_as_datetime, |h| h as i32) + DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { + let iter = ArrayIter::new(array); + extract_component_from_array!( + iter, + b, + hour, + |value| as_datetime::(i64::from(value)), + |h| h as i32 + ) } - &DataType::Timestamp(_, Some(ref tz)) => { + DataType::Timestamp(_, Some(tz)) => { let mut scratch = Parsed::new(); + let iter = ArrayIter::new(array); extract_component_from_array!( - array, + iter, b, hour, - value_as_datetime_with_tz, + |value, tz| as_datetime::(i64::from(value)) + .map(|datetime| datetime + tz), tz, scratch, + |value| as_datetime::(i64::from(value)), |h| h as i32 ) } - dt => return_compute_error_with!("hour does not support", dt), + _ => return_compute_error_with!("hour does not support", array.data_type()), } Ok(b.finish()) } -/// Extracts the years of a given temporal array as an array of integers +/// Extracts the years of a given temporal primitive array as an array of integers pub fn year(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + year_generic::(array) +} + +/// Extracts the years of a given temporal array as an array of integers +pub fn year_generic>(array: A) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + match array.data_type().clone() { + DataType::Dictionary(_, value_type) => { + year_internal::(array, value_type.as_ref()) + } + dt => year_internal::(array, &dt), + } +} + +/// Extracts the years of a given temporal array as an array of integers +fn year_internal>( + array: A, + dt: &DataType, +) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { let mut b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - &DataType::Date32 | &DataType::Date64 | &DataType::Timestamp(_, _) => { - 
extract_component_from_array!(array, b, year, value_as_datetime, |h| h as i32) + match dt { + DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _) => { + let iter = ArrayIter::new(array); + extract_component_from_array!( + iter, + b, + year, + |value| as_datetime::(i64::from(value)), + |h| h as i32 + ) } - dt => return_compute_error_with!("year does not support", dt), + _t => return_compute_error_with!("year does not support", array.data_type()), } Ok(b.finish()) } -/// Extracts the quarter of a given temporal array as an array of integers +/// Extracts the quarter of a given temporal primitive array as an array of integers within +/// the range of [1, 4]. pub fn quarter(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + quarter_generic::(array) +} + +/// Extracts the quarter of a given temporal array as an array of integersa within +/// the range of [1, 4]. +pub fn quarter_generic>( + array: A, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + match array.data_type().clone() { + DataType::Dictionary(_, value_type) => { + quarter_internal::(array, value_type.as_ref()) + } + dt => quarter_internal::(array, &dt), + } +} + +/// Extracts the quarter of a given temporal array as an array of integers +fn quarter_internal>( + array: A, + dt: &DataType, +) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { let mut b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - &DataType::Date32 | &DataType::Date64 | &DataType::Timestamp(_, None) => { - extract_component_from_array!(array, b, quarter, value_as_datetime, |h| h - as i32) + match dt { + DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { + let iter = ArrayIter::new(array); + extract_component_from_array!( + iter, + b, + quarter, + |value| as_datetime::(i64::from(value)), + |h| h as i32 + ) } - &DataType::Timestamp(_, Some(ref tz)) => { + DataType::Timestamp(_, Some(tz)) => { let mut scratch = Parsed::new(); + let iter = ArrayIter::new(array); extract_component_from_array!( - array, + iter, b, quarter, - value_as_datetime_with_tz, + |value, tz| as_datetime::(i64::from(value)) + .map(|datetime| datetime + tz), tz, scratch, + |value| as_datetime::(i64::from(value)), |h| h as i32 ) } - dt => return_compute_error_with!("quarter does not support", dt), + _ => return_compute_error_with!("quarter does not support", array.data_type()), } Ok(b.finish()) } -/// Extracts the month of a given temporal array as an array of integers +/// Extracts the month of a given temporal primitive array as an array of integers within +/// the range of [1, 12]. 
pub fn month(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + month_generic::(array) +} + +/// Extracts the month of a given temporal array as an array of integers +pub fn month_generic>( + array: A, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + match array.data_type().clone() { + DataType::Dictionary(_, value_type) => { + month_internal::(array, value_type.as_ref()) + } + dt => month_internal::(array, &dt), + } +} + +/// Extracts the month of a given temporal array as an array of integers +fn month_internal>( + array: A, + dt: &DataType, +) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { let mut b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - &DataType::Date32 | &DataType::Date64 | &DataType::Timestamp(_, None) => { - extract_component_from_array!(array, b, month, value_as_datetime, |h| h - as i32) + match dt { + DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { + let iter = ArrayIter::new(array); + extract_component_from_array!( + iter, + b, + month, + |value| as_datetime::(i64::from(value)), + |h| h as i32 + ) } - &DataType::Timestamp(_, Some(ref tz)) => { + DataType::Timestamp(_, Some(tz)) => { let mut scratch = Parsed::new(); + let iter = ArrayIter::new(array); extract_component_from_array!( - array, + iter, b, month, - value_as_datetime_with_tz, + |value, tz| as_datetime::(i64::from(value)) + .map(|datetime| datetime + tz), tz, scratch, + |value| as_datetime::(i64::from(value)), |h| h as i32 ) } - dt => return_compute_error_with!("month does not support", dt), + _ => return_compute_error_with!("month does not support", array.data_type()), } Ok(b.finish()) } -/// Extracts the day of week of a given temporal array as an array of +/// Extracts the day of week of a given temporal primitive array as an array of /// integers. /// /// Monday is encoded as `0`, Tuesday as `1`, etc. /// /// See also [`num_days_from_sunday`] which starts at Sunday. pub fn num_days_from_monday(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + num_days_from_monday_generic::(array) +} + +/// Extracts the day of week of a given temporal array as an array of +/// integers. +/// +/// Monday is encoded as `0`, Tuesday as `1`, etc. +/// +/// See also [`num_days_from_sunday`] which starts at Sunday. +pub fn num_days_from_monday_generic>( + array: A, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + match array.data_type().clone() { + DataType::Dictionary(_, value_type) => { + num_days_from_monday_internal::(array, value_type.as_ref()) + } + dt => num_days_from_monday_internal::(array, &dt), + } +} + +/// Extracts the day of week of a given temporal array as an array of +/// integers. +/// +/// Monday is encoded as `0`, Tuesday as `1`, etc. +/// +/// See also [`num_days_from_sunday`] which starts at Sunday. 
+fn num_days_from_monday_internal>( + array: A, + dt: &DataType, +) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { let mut b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - &DataType::Date32 | &DataType::Date64 | &DataType::Timestamp(_, None) => { + match dt { + DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { + let iter = ArrayIter::new(array); extract_component_from_array!( - array, + iter, b, num_days_from_monday, - value_as_datetime, + |value| { as_datetime::(i64::from(value)) }, |h| h as i32 ) } - &DataType::Timestamp(_, Some(ref tz)) => { + DataType::Timestamp(_, Some(tz)) => { let mut scratch = Parsed::new(); + let iter = ArrayIter::new(array); extract_component_from_array!( - array, + iter, b, num_days_from_monday, - value_as_datetime_with_tz, + |value, tz| as_datetime::(i64::from(value)) + .map(|datetime| datetime + tz), tz, scratch, + |value| as_datetime::(i64::from(value)), |h| h as i32 ) } - dt => return_compute_error_with!("weekday does not support", dt), + _ => return_compute_error_with!("weekday does not support", array.data_type()), } Ok(b.finish()) } -/// Extracts the day of week of a given temporal array as an array of +/// Extracts the day of week of a given temporal primitive array as an array of /// integers, starting at Sunday. /// /// Sunday is encoded as `0`, Monday as `1`, etc. /// /// See also [`num_days_from_monday`] which starts at Monday. pub fn num_days_from_sunday(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + num_days_from_sunday_generic::(array) +} + +/// Extracts the day of week of a given temporal array as an array of +/// integers, starting at Sunday. +/// +/// Sunday is encoded as `0`, Monday as `1`, etc. +/// +/// See also [`num_days_from_monday`] which starts at Monday. +pub fn num_days_from_sunday_generic>( + array: A, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + match array.data_type().clone() { + DataType::Dictionary(_, value_type) => { + num_days_from_sunday_internal::(array, value_type.as_ref()) + } + dt => num_days_from_sunday_internal::(array, &dt), + } +} + +/// Extracts the day of week of a given temporal array as an array of +/// integers, starting at Sunday. +/// +/// Sunday is encoded as `0`, Monday as `1`, etc. +/// +/// See also [`num_days_from_monday`] which starts at Monday. 
+fn num_days_from_sunday_internal>( + array: A, + dt: &DataType, +) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { let mut b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - &DataType::Date32 | &DataType::Date64 | &DataType::Timestamp(_, None) => { + match dt { + DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { + let iter = ArrayIter::new(array); extract_component_from_array!( - array, + iter, b, num_days_from_sunday, - value_as_datetime, + |value| { as_datetime::(i64::from(value)) }, |h| h as i32 ) } - &DataType::Timestamp(_, Some(ref tz)) => { + DataType::Timestamp(_, Some(tz)) => { let mut scratch = Parsed::new(); + let iter = ArrayIter::new(array); extract_component_from_array!( - array, + iter, b, num_days_from_sunday, - value_as_datetime_with_tz, + |value, tz| as_datetime::(i64::from(value)) + .map(|datetime| datetime + tz), tz, scratch, + |value| as_datetime::(i64::from(value)), |h| h as i32 ) } - dt => return_compute_error_with!("num_days_from_sunday does not support", dt), + _ => return_compute_error_with!( + "num_days_from_sunday does not support", + array.data_type() + ), } Ok(b.finish()) } -/// Extracts the day of a given temporal array as an array of integers +/// Extracts the day of a given temporal primitive array as an array of integers pub fn day(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + day_generic::(array) +} + +/// Extracts the day of a given temporal array as an array of integers +pub fn day_generic>(array: A) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + match array.data_type().clone() { + DataType::Dictionary(_, value_type) => { + day_internal::(array, value_type.as_ref()) + } + dt => day_internal::(array, &dt), + } +} + +/// Extracts the day of a given temporal array as an array of integers +fn day_internal>( + array: A, + dt: &DataType, +) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { let mut b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - &DataType::Date32 | &DataType::Date64 | &DataType::Timestamp(_, None) => { - extract_component_from_array!(array, b, day, value_as_datetime, |h| h as i32) + match dt { + DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { + let iter = ArrayIter::new(array); + extract_component_from_array!( + iter, + b, + day, + |value| { as_datetime::(i64::from(value)) }, + |h| h as i32 + ) } - &DataType::Timestamp(_, Some(ref tz)) => { + DataType::Timestamp(_, Some(ref tz)) => { let mut scratch = Parsed::new(); + let iter = ArrayIter::new(array); extract_component_from_array!( - array, + iter, b, day, - value_as_datetime_with_tz, + |value, tz| as_datetime::(i64::from(value)) + .map(|datetime| datetime + tz), tz, scratch, + |value| as_datetime::(i64::from(value)), |h| h as i32 ) } - dt => return_compute_error_with!("day does not support", dt), + _ => return_compute_error_with!("day does not support", array.data_type()), } Ok(b.finish()) } -/// Extracts the day of year of a given temporal array as an array of integers +/// Extracts the day of year of a given temporal primitive array as an array of integers /// The day of year that ranges from 1 to 366 pub fn doy(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, +{ + doy_generic::(array) +} + +/// Extracts the day of year of a given temporal array as an 
array of integers +/// The day of year that ranges from 1 to 366 +pub fn doy_generic>(array: A) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + match array.data_type().clone() { + DataType::Dictionary(_, value_type) => { + doy_internal::(array, value_type.as_ref()) + } + dt => doy_internal::(array, &dt), + } +} + +/// Extracts the day of year of a given temporal array as an array of integers +/// The day of year that ranges from 1 to 366 +fn doy_internal>( + array: A, + dt: &DataType, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + T::Native: ArrowNativeType, + i64: std::convert::From, { let mut b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - &DataType::Date32 | &DataType::Date64 | &DataType::Timestamp(_, None) => { - extract_component_from_array!(array, b, ordinal, value_as_datetime, |h| h - as i32) + match dt { + DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { + let iter = ArrayIter::new(array); + extract_component_from_array!( + iter, + b, + ordinal, + |value| { as_datetime::(i64::from(value)) }, + |h| h as i32 + ) } - &DataType::Timestamp(_, Some(ref tz)) => { + DataType::Timestamp(_, Some(ref tz)) => { let mut scratch = Parsed::new(); + let iter = ArrayIter::new(array); extract_component_from_array!( - array, + iter, b, ordinal, - value_as_datetime_with_tz, + |value, tz| as_datetime::(i64::from(value)) + .map(|datetime| datetime + tz), tz, scratch, + |value| as_datetime::(i64::from(value)), |h| h as i32 ) } - dt => return_compute_error_with!("doy does not support", dt), + _ => return_compute_error_with!("doy does not support", array.data_type()), } Ok(b.finish()) } -/// Extracts the minutes of a given temporal array as an array of integers +/// Extracts the minutes of a given temporal primitive array as an array of integers pub fn minute(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + minute_generic::(array) +} + +/// Extracts the minutes of a given temporal array as an array of integers +pub fn minute_generic>( + array: A, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + match array.data_type().clone() { + DataType::Dictionary(_, value_type) => { + minute_internal::(array, value_type.as_ref()) + } + dt => minute_internal::(array, &dt), + } +} + +/// Extracts the minutes of a given temporal array as an array of integers +fn minute_internal>( + array: A, + dt: &DataType, +) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { let mut b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - &DataType::Date64 | &DataType::Timestamp(_, None) => { - extract_component_from_array!(array, b, minute, value_as_datetime, |h| h - as i32) + match dt { + DataType::Date64 | DataType::Timestamp(_, None) => { + let iter = ArrayIter::new(array); + extract_component_from_array!( + iter, + b, + minute, + |value| { as_datetime::(i64::from(value)) }, + |h| h as i32 + ) } - &DataType::Timestamp(_, Some(ref tz)) => { + DataType::Timestamp(_, Some(tz)) => { let mut scratch = Parsed::new(); + let iter = ArrayIter::new(array); extract_component_from_array!( - array, + iter, b, minute, - value_as_datetime_with_tz, + |value, tz| as_datetime::(i64::from(value)) + .map(|datetime| datetime + tz), tz, scratch, + |value| as_datetime::(i64::from(value)), |h| h as i32 ) } - dt => return_compute_error_with!("minute does not support", dt), + _ => 
return_compute_error_with!("minute does not support", array.data_type()), } Ok(b.finish()) } -/// Extracts the week of a given temporal array as an array of integers +/// Extracts the week of a given temporal primitive array as an array of integers pub fn week(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + week_generic::(array) +} + +/// Extracts the week of a given temporal array as an array of integers +pub fn week_generic>(array: A) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + match array.data_type().clone() { + DataType::Dictionary(_, value_type) => { + week_internal::(array, value_type.as_ref()) + } + dt => week_internal::(array, &dt), + } +} + +/// Extracts the week of a given temporal array as an array of integers +fn week_internal>( + array: A, + dt: &DataType, +) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { let mut b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - &DataType::Date32 | &DataType::Date64 | &DataType::Timestamp(_, None) => { + match dt { + DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { + let iter = ArrayIter::new(array); extract_component_from_array!( - array, + iter, b, iso_week, week, - value_as_datetime, + |value| { as_datetime::(i64::from(value)) }, |h| h as i32 ) } - dt => return_compute_error_with!("week does not support", dt), + _ => return_compute_error_with!("week does not support", array.data_type()), } Ok(b.finish()) } -/// Extracts the seconds of a given temporal array as an array of integers +/// Extracts the seconds of a given temporal primitive array as an array of integers pub fn second(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + second_generic::(array) +} + +/// Extracts the seconds of a given temporal array as an array of integers +pub fn second_generic>( + array: A, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: std::convert::From, +{ + match array.data_type().clone() { + DataType::Dictionary(_, value_type) => { + second_internal::(array, value_type.as_ref()) + } + dt => second_internal::(array, &dt), + } +} + +/// Extracts the seconds of a given temporal array as an array of integers +fn second_internal>( + array: A, + dt: &DataType, +) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { let mut b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - &DataType::Date64 | &DataType::Timestamp(_, None) => { - extract_component_from_array!(array, b, second, value_as_datetime, |h| h - as i32) + match dt { + DataType::Date64 | DataType::Timestamp(_, None) => { + let iter = ArrayIter::new(array); + extract_component_from_array!( + iter, + b, + second, + |value| { as_datetime::(i64::from(value)) }, + |h| h as i32 + ) } - &DataType::Timestamp(_, Some(ref tz)) => { + DataType::Timestamp(_, Some(tz)) => { let mut scratch = Parsed::new(); + let iter = ArrayIter::new(array); extract_component_from_array!( - array, + iter, b, second, - value_as_datetime_with_tz, + |value, tz| as_datetime::(i64::from(value)) + .map(|datetime| datetime + tz), tz, scratch, + |value| as_datetime::(i64::from(value)), |h| h as i32 ) } - dt => return_compute_error_with!("second does not support", dt), + _ => return_compute_error_with!("second does not support", array.data_type()), } Ok(b.finish()) @@ -606,19 +1024,13 @@ mod tests { #[test] 
fn test_temporal_array_timestamp_quarter_with_timezone() { - use std::sync::Arc; - // 24 * 60 * 60 = 86400 - let a = Arc::new(TimestampSecondArray::from_vec( - vec![86400 * 90], - Some("+00:00".to_string()), - )); + let a = + TimestampSecondArray::from_vec(vec![86400 * 90], Some("+00:00".to_string())); let b = quarter(&a).unwrap(); assert_eq!(2, b.value(0)); - let a = Arc::new(TimestampSecondArray::from_vec( - vec![86400 * 90], - Some("-10:00".to_string()), - )); + let a = + TimestampSecondArray::from_vec(vec![86400 * 90], Some("-10:00".to_string())); let b = quarter(&a).unwrap(); assert_eq!(1, b.value(0)); } @@ -648,38 +1060,24 @@ mod tests { #[test] fn test_temporal_array_timestamp_month_with_timezone() { - use std::sync::Arc; - // 24 * 60 * 60 = 86400 - let a = Arc::new(TimestampSecondArray::from_vec( - vec![86400 * 31], - Some("+00:00".to_string()), - )); + let a = + TimestampSecondArray::from_vec(vec![86400 * 31], Some("+00:00".to_string())); let b = month(&a).unwrap(); assert_eq!(2, b.value(0)); - let a = Arc::new(TimestampSecondArray::from_vec( - vec![86400 * 31], - Some("-10:00".to_string()), - )); + let a = + TimestampSecondArray::from_vec(vec![86400 * 31], Some("-10:00".to_string())); let b = month(&a).unwrap(); assert_eq!(1, b.value(0)); } #[test] fn test_temporal_array_timestamp_day_with_timezone() { - use std::sync::Arc; - // 24 * 60 * 60 = 86400 - let a = Arc::new(TimestampSecondArray::from_vec( - vec![86400], - Some("+00:00".to_string()), - )); + let a = TimestampSecondArray::from_vec(vec![86400], Some("+00:00".to_string())); let b = day(&a).unwrap(); assert_eq!(2, b.value(0)); - let a = Arc::new(TimestampSecondArray::from_vec( - vec![86400], - Some("-10:00".to_string()), - )); + let a = TimestampSecondArray::from_vec(vec![86400], Some("-10:00".to_string())); let b = day(&a).unwrap(); assert_eq!(1, b.value(0)); } @@ -859,12 +1257,7 @@ mod tests { #[test] fn test_temporal_array_timestamp_second_with_timezone() { - use std::sync::Arc; - - let a = Arc::new(TimestampSecondArray::from_vec( - vec![10, 20], - Some("+00:00".to_string()), - )); + let a = TimestampSecondArray::from_vec(vec![10, 20], Some("+00:00".to_string())); let b = second(&a).unwrap(); assert_eq!(10, b.value(0)); assert_eq!(20, b.value(1)); @@ -872,12 +1265,7 @@ mod tests { #[test] fn test_temporal_array_timestamp_minute_with_timezone() { - use std::sync::Arc; - - let a = Arc::new(TimestampSecondArray::from_vec( - vec![0, 60], - Some("+00:50".to_string()), - )); + let a = TimestampSecondArray::from_vec(vec![0, 60], Some("+00:50".to_string())); let b = minute(&a).unwrap(); assert_eq!(50, b.value(0)); assert_eq!(51, b.value(1)); @@ -885,70 +1273,49 @@ mod tests { #[test] fn test_temporal_array_timestamp_minute_with_negative_timezone() { - use std::sync::Arc; - - let a = Arc::new(TimestampSecondArray::from_vec( - vec![60 * 55], - Some("-00:50".to_string()), - )); + let a = TimestampSecondArray::from_vec(vec![60 * 55], Some("-00:50".to_string())); let b = minute(&a).unwrap(); assert_eq!(5, b.value(0)); } #[test] fn test_temporal_array_timestamp_hour_with_timezone() { - use std::sync::Arc; - - let a = Arc::new(TimestampSecondArray::from_vec( + let a = TimestampSecondArray::from_vec( vec![60 * 60 * 10], Some("+01:00".to_string()), - )); + ); let b = hour(&a).unwrap(); assert_eq!(11, b.value(0)); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_without_colon() { - use std::sync::Arc; - - let a = Arc::new(TimestampSecondArray::from_vec( - vec![60 * 60 * 10], - Some("+0100".to_string()), - )); + let a = + 
TimestampSecondArray::from_vec(vec![60 * 60 * 10], Some("+0100".to_string())); assert!(matches!(hour(&a), Err(ArrowError::ComputeError(_)))) } #[test] fn test_temporal_array_timestamp_hour_with_timezone_without_initial_sign() { - use std::sync::Arc; - - let a = Arc::new(TimestampSecondArray::from_vec( - vec![60 * 60 * 10], - Some("0100".to_string()), - )); + let a = + TimestampSecondArray::from_vec(vec![60 * 60 * 10], Some("0100".to_string())); assert!(matches!(hour(&a), Err(ArrowError::ComputeError(_)))) } #[test] fn test_temporal_array_timestamp_hour_with_timezone_with_only_colon() { - use std::sync::Arc; - - let a = Arc::new(TimestampSecondArray::from_vec( - vec![60 * 60 * 10], - Some("01:00".to_string()), - )); + let a = + TimestampSecondArray::from_vec(vec![60 * 60 * 10], Some("01:00".to_string())); assert!(matches!(hour(&a), Err(ArrowError::ComputeError(_)))) } #[cfg(feature = "chrono-tz")] #[test] fn test_temporal_array_timestamp_hour_with_timezone_using_chrono_tz() { - use std::sync::Arc; - - let a = Arc::new(TimestampSecondArray::from_vec( + let a = TimestampSecondArray::from_vec( vec![60 * 60 * 10], Some("Asia/Kolkata".to_string()), - )); + ); let b = hour(&a).unwrap(); assert_eq!(15, b.value(0)); } @@ -972,12 +1339,10 @@ mod tests { #[cfg(not(feature = "chrono-tz"))] #[test] fn test_temporal_array_timestamp_hour_with_timezone_using_chrono_tz() { - use std::sync::Arc; - - let a = Arc::new(TimestampSecondArray::from_vec( + let a = TimestampSecondArray::from_vec( vec![60 * 60 * 10], Some("Asia/Kolkatta".to_string()), - )); + ); assert!(matches!(hour(&a), Err(ArrowError::ComputeError(_)))) } @@ -1038,4 +1403,124 @@ mod tests { Some(sydney_offset_with_dst) ); } + + #[test] + fn test_hour_minute_second_dictionary_array() { + let a = TimestampSecondArray::from_vec( + vec![60 * 60 * 10 + 61, 60 * 60 * 20 + 122, 60 * 60 * 30 + 183], + Some("+01:00".to_string()), + ); + + let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 1]); + let dict = DictionaryArray::try_new(&keys, &a).unwrap(); + + let b = hour_generic::( + dict.downcast_dict::().unwrap(), + ) + .unwrap(); + + let expected = Int32Array::from(vec![11, 11, 21, 7, 21]); + assert_eq!(expected, b); + + let b = minute_generic::( + dict.downcast_dict::().unwrap(), + ) + .unwrap(); + + let expected = Int32Array::from(vec![1, 1, 2, 3, 2]); + assert_eq!(expected, b); + + let b = second_generic::( + dict.downcast_dict::().unwrap(), + ) + .unwrap(); + + let expected = Int32Array::from(vec![1, 1, 2, 3, 2]); + assert_eq!(expected, b); + } + + #[test] + fn test_year_dictionary_array() { + let a: PrimitiveArray = + vec![Some(1514764800000), Some(1550636625000)].into(); + + let keys = Int8Array::from_iter_values([0_i8, 1, 1, 0]); + let dict = DictionaryArray::try_new(&keys, &a).unwrap(); + + let b = + year_generic::(dict.downcast_dict::().unwrap()) + .unwrap(); + + let expected = Int32Array::from(vec![2018, 2019, 2019, 2018]); + assert_eq!(expected, b); + } + + #[test] + fn test_quarter_month_dictionary_array() { + //1514764800000 -> 2018-01-01 + //1566275025000 -> 2019-08-20 + let a: PrimitiveArray = + vec![Some(1514764800000), Some(1566275025000)].into(); + + let keys = Int8Array::from_iter_values([0_i8, 1, 1, 0]); + let dict = DictionaryArray::try_new(&keys, &a).unwrap(); + + let b = quarter_generic::( + dict.downcast_dict::().unwrap(), + ) + .unwrap(); + + let expected = Int32Array::from(vec![1, 3, 3, 1]); + assert_eq!(expected, b); + + let b = + month_generic::(dict.downcast_dict::().unwrap()) + .unwrap(); + + let expected = 
Int32Array::from(vec![1, 8, 8, 1]); + assert_eq!(expected, b); + } + + #[test] + fn test_num_days_from_monday_sunday_day_doy_week_dictionary_array() { + //1514764800000 -> 2018-01-01 (Monday) + //1550636625000 -> 2019-02-20 (Wednesday) + let a: PrimitiveArray = + vec![Some(1514764800000), Some(1550636625000)].into(); + + let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(1), Some(0), None]); + let dict = DictionaryArray::try_new(&keys, &a).unwrap(); + + let b = num_days_from_monday_generic::( + dict.downcast_dict::().unwrap(), + ) + .unwrap(); + let expected = Int32Array::from(vec![Some(0), Some(2), Some(2), Some(0), None]); + assert_eq!(expected, b); + + let b = num_days_from_sunday_generic::( + dict.downcast_dict::().unwrap(), + ) + .unwrap(); + let expected = Int32Array::from(vec![Some(1), Some(3), Some(3), Some(1), None]); + assert_eq!(expected, b); + + let b = + day_generic::(dict.downcast_dict::().unwrap()) + .unwrap(); + let expected = Int32Array::from(vec![Some(1), Some(20), Some(20), Some(1), None]); + assert_eq!(expected, b); + + let b = + doy_generic::(dict.downcast_dict::().unwrap()) + .unwrap(); + let expected = Int32Array::from(vec![Some(1), Some(51), Some(51), Some(1), None]); + assert_eq!(expected, b); + + let b = + week_generic::(dict.downcast_dict::().unwrap()) + .unwrap(); + let expected = Int32Array::from(vec![Some(1), Some(8), Some(8), Some(1), None]); + assert_eq!(expected, b); + } } From b36bc88296e0a63e8bcaae1fd63f0f3d3ec9331c Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 7 Sep 2022 06:42:37 -0700 Subject: [PATCH 0019/1411] Use BitIndexIterator (#2675) --- arrow/src/compute/kernels/aggregate.rs | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/arrow/src/compute/kernels/aggregate.rs b/arrow/src/compute/kernels/aggregate.rs index c2e3e8cc2577..d6cc3ecc1047 100644 --- a/arrow/src/compute/kernels/aggregate.rs +++ b/arrow/src/compute/kernels/aggregate.rs @@ -25,6 +25,7 @@ use crate::array::{ GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, }; use crate::datatypes::{ArrowNativeType, ArrowNumericType, DataType}; +use crate::util::bit_iterator::BitIndexIterator; /// Generic test for NaN, the optimizer should be able to remove this for integer types. 
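// A simplified, hedged sketch of the idea in "Use BitIndexIterator (#2675)" above:
// with nulls present, min/max can visit only the positions whose validity bit is
// set and read values by index, instead of iterating Option<T> values. The
// `is_valid` loop below stands in for the internal BitIndexIterator; it is an
// illustration, not the actual aggregate kernel.
use arrow::array::{Array, Int32Array};

fn min_via_valid_indices(array: &Int32Array) -> Option<i32> {
    (0..array.len())
        .filter(|&i| array.is_valid(i)) // the real kernel walks set bits of the null bitmap
        .map(|i| array.value(i))
        .reduce(i32::min)
}

fn main() {
    let a = Int32Array::from(vec![Some(5), None, Some(2), Some(7)]);
    assert_eq!(min_via_valid_indices(&a), Some(2));
}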
#[inline] @@ -123,9 +124,27 @@ where .map(|i| unsafe { array.value_unchecked(i) }) .reduce(|acc, item| if cmp(&acc, &item) { item } else { acc }) } else { - let iter = ArrayIter::new(array); - iter.flatten() - .reduce(|acc, item| if cmp(&acc, &item) { item } else { acc }) + let null_buffer = array + .data_ref() + .null_buffer() + .map(|b| b.bit_slice(array.offset(), array.len())); + let iter = BitIndexIterator::new( + null_buffer.as_deref().unwrap(), + array.offset(), + array.len(), + ); + unsafe { + let idx = iter.reduce(|acc_idx, idx| { + let acc = array.value_unchecked(acc_idx); + let item = array.value_unchecked(idx); + if cmp(&acc, &item) { + idx + } else { + acc_idx + } + }); + idx.map(|idx| array.value_unchecked(idx)) + } } } From c25d16e082a218276a2303d4ab0a1cfb53b8c6ac Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Wed, 7 Sep 2022 19:37:33 +0530 Subject: [PATCH 0020/1411] Faster Null Path Selection in ArrayData Equality (#2676) * Use contains_nulls if we don't need null count * Inline function call --- arrow/src/array/equal/decimal.rs | 9 ++++----- arrow/src/array/equal/dictionary.rs | 9 ++++----- arrow/src/array/equal/fixed_binary.rs | 9 ++++----- arrow/src/array/equal/fixed_list.rs | 9 ++++----- arrow/src/array/equal/primitive.rs | 9 ++++----- arrow/src/array/equal/structure.rs | 10 ++++------ 6 files changed, 24 insertions(+), 31 deletions(-) diff --git a/arrow/src/array/equal/decimal.rs b/arrow/src/array/equal/decimal.rs index 42a7d29e27d2..49112608c3a5 100644 --- a/arrow/src/array/equal/decimal.rs +++ b/arrow/src/array/equal/decimal.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{data::count_nulls, ArrayData}; +use crate::array::{data::contains_nulls, ArrayData}; use crate::datatypes::DataType; use crate::util::bit_util::get_bit; @@ -37,10 +37,9 @@ pub(super) fn decimal_equal( let lhs_values = &lhs.buffers()[0].as_slice()[lhs.offset() * size..]; let rhs_values = &rhs.buffers()[0].as_slice()[rhs.offset() * size..]; - let lhs_null_count = count_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len); - let rhs_null_count = count_nulls(rhs.null_buffer(), rhs_start + rhs.offset(), len); - - if lhs_null_count == 0 && rhs_null_count == 0 { + // Only checking one null mask here because by the time the control flow reaches + // this point, the equality of the two masks would have already been verified. + if !contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len) { equal_len( lhs_values, rhs_values, diff --git a/arrow/src/array/equal/dictionary.rs b/arrow/src/array/equal/dictionary.rs index 4c9bcf798760..1474da5e2d21 100644 --- a/arrow/src/array/equal/dictionary.rs +++ b/arrow/src/array/equal/dictionary.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{data::count_nulls, ArrayData}; +use crate::array::{data::contains_nulls, ArrayData}; use crate::datatypes::ArrowNativeType; use crate::util::bit_util::get_bit; @@ -34,10 +34,9 @@ pub(super) fn dictionary_equal( let lhs_values = &lhs.child_data()[0]; let rhs_values = &rhs.child_data()[0]; - let lhs_null_count = count_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len); - let rhs_null_count = count_nulls(rhs.null_buffer(), rhs_start + rhs.offset(), len); - - if lhs_null_count == 0 && rhs_null_count == 0 { + // Only checking one null mask here because by the time the control flow reaches + // this point, the equality of the two masks would have already been verified. 
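// A hedged sketch of why "Faster Null Path Selection in ArrayData Equality (#2676)"
// swaps count_nulls for contains_nulls on this path: the equality fast path only
// needs to know whether the compared range holds any null at all, so a check that
// can stop at the first null does strictly less work than counting every null in
// the range. The helper below is illustrative; it is not the arrow-internal
// contains_nulls.
fn range_has_null(validity: Option<&[bool]>, start: usize, len: usize) -> bool {
    match validity {
        // No validity buffer means every slot is valid.
        None => false,
        // `any` short-circuits on the first invalid (null) slot.
        Some(bits) => bits[start..start + len].iter().any(|&valid| !valid),
    }
}

fn main() {
    let validity: &[bool] = &[true, true, false, true];
    assert!(range_has_null(Some(validity), 0, 4));
    assert!(!range_has_null(Some(validity), 0, 2));
}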
+ if !contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len) { (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; diff --git a/arrow/src/array/equal/fixed_binary.rs b/arrow/src/array/equal/fixed_binary.rs index aea0e08a9ebf..58eb22bb19b0 100644 --- a/arrow/src/array/equal/fixed_binary.rs +++ b/arrow/src/array/equal/fixed_binary.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{data::count_nulls, ArrayData}; +use crate::array::{data::contains_nulls, ArrayData}; use crate::datatypes::DataType; use crate::util::bit_util::get_bit; @@ -36,10 +36,9 @@ pub(super) fn fixed_binary_equal( let lhs_values = &lhs.buffers()[0].as_slice()[lhs.offset() * size..]; let rhs_values = &rhs.buffers()[0].as_slice()[rhs.offset() * size..]; - let lhs_null_count = count_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len); - let rhs_null_count = count_nulls(rhs.null_buffer(), rhs_start + rhs.offset(), len); - - if lhs_null_count == 0 && rhs_null_count == 0 { + // Only checking one null mask here because by the time the control flow reaches + // this point, the equality of the two masks would have already been verified. + if !contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len) { equal_len( lhs_values, rhs_values, diff --git a/arrow/src/array/equal/fixed_list.rs b/arrow/src/array/equal/fixed_list.rs index 82a347c86574..055bcece1358 100644 --- a/arrow/src/array/equal/fixed_list.rs +++ b/arrow/src/array/equal/fixed_list.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{data::count_nulls, ArrayData}; +use crate::array::{data::contains_nulls, ArrayData}; use crate::datatypes::DataType; use crate::util::bit_util::get_bit; @@ -36,10 +36,9 @@ pub(super) fn fixed_list_equal( let lhs_values = &lhs.child_data()[0]; let rhs_values = &rhs.child_data()[0]; - let lhs_null_count = count_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len); - let rhs_null_count = count_nulls(rhs.null_buffer(), rhs_start + rhs.offset(), len); - - if lhs_null_count == 0 && rhs_null_count == 0 { + // Only checking one null mask here because by the time the control flow reaches + // this point, the equality of the two masks would have already been verified. + if !contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len) { equal_range( lhs_values, rhs_values, diff --git a/arrow/src/array/equal/primitive.rs b/arrow/src/array/equal/primitive.rs index 09882cd78509..b82d3213ab03 100644 --- a/arrow/src/array/equal/primitive.rs +++ b/arrow/src/array/equal/primitive.rs @@ -17,7 +17,7 @@ use std::mem::size_of; -use crate::array::{data::count_nulls, ArrayData}; +use crate::array::{data::contains_nulls, ArrayData}; use crate::util::bit_util::get_bit; use super::utils::equal_len; @@ -33,10 +33,9 @@ pub(super) fn primitive_equal( let lhs_values = &lhs.buffers()[0].as_slice()[lhs.offset() * byte_width..]; let rhs_values = &rhs.buffers()[0].as_slice()[rhs.offset() * byte_width..]; - let lhs_null_count = count_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len); - let rhs_null_count = count_nulls(rhs.null_buffer(), rhs_start + rhs.offset(), len); - - if lhs_null_count == 0 && rhs_null_count == 0 { + // Only checking one null mask here because by the time the control flow reaches + // this point, the equality of the two masks would have already been verified. 
+ if !contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len) { // without nulls, we just need to compare slices equal_len( lhs_values, diff --git a/arrow/src/array/equal/structure.rs b/arrow/src/array/equal/structure.rs index 0f943e40cac6..384376c3468d 100644 --- a/arrow/src/array/equal/structure.rs +++ b/arrow/src/array/equal/structure.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::data::count_nulls, array::ArrayData, util::bit_util::get_bit}; +use crate::{array::data::contains_nulls, array::ArrayData, util::bit_util::get_bit}; use super::equal_range; @@ -43,11 +43,9 @@ pub(super) fn struct_equal( rhs_start: usize, len: usize, ) -> bool { - // we have to recalculate null counts from the null buffers - let lhs_null_count = count_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len); - let rhs_null_count = count_nulls(rhs.null_buffer(), rhs_start + rhs.offset(), len); - - if lhs_null_count == 0 && rhs_null_count == 0 { + // Only checking one null mask here because by the time the control flow reaches + // this point, the equality of the two masks would have already been verified. + if !contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len) { equal_child_values(lhs, rhs, lhs_start, rhs_start, len) } else { // get a ref of the null buffer bytes, to use in testing for nullness From df4906d76992e26b7b196c1680755ca360272650 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 7 Sep 2022 17:10:38 -0700 Subject: [PATCH 0021/1411] Support building comparator for dictionaries of primitive integer values (#2673) * Support comparing dictionary of primitive value. * Change to generic function * Trigger Build --- arrow/src/array/ord.rs | 129 ++++++++++++++++++++++++++++++++++------- 1 file changed, 107 insertions(+), 22 deletions(-) diff --git a/arrow/src/array/ord.rs b/arrow/src/array/ord.rs index dd6539589c13..998c06e50ebd 100644 --- a/arrow/src/array/ord.rs +++ b/arrow/src/array/ord.rs @@ -80,6 +80,31 @@ where Box::new(move |i, j| left.value(i).cmp(right.value(j))) } +fn compare_dict_primitive(left: &dyn Array, right: &dyn Array) -> DynComparator +where + K: ArrowDictionaryKeyType, + V: ArrowPrimitiveType, + V::Native: Ord, +{ + let left = left.as_any().downcast_ref::>().unwrap(); + let right = right.as_any().downcast_ref::>().unwrap(); + + let left_keys: PrimitiveArray = PrimitiveArray::from(left.keys().data().clone()); + let right_keys: PrimitiveArray = PrimitiveArray::from(right.keys().data().clone()); + let left_values: PrimitiveArray = + PrimitiveArray::from(left.values().data().clone()); + let right_values: PrimitiveArray = + PrimitiveArray::from(right.values().data().clone()); + + Box::new(move |i: usize, j: usize| { + let key_left = left_keys.value(i).to_usize().unwrap(); + let key_right = right_keys.value(j).to_usize().unwrap(); + let left = left_values.value(key_left); + let right = right_values.value(key_right); + left.cmp(&right) + }) +} + fn compare_dict_string(left: &dyn Array, right: &dyn Array) -> DynComparator where T: ArrowDictionaryKeyType, @@ -101,6 +126,35 @@ where }) } +fn cmp_dict_primitive( + key_type: &DataType, + left: &dyn Array, + right: &dyn Array, +) -> Result +where + VT: ArrowPrimitiveType, + VT::Native: Ord, +{ + use DataType::*; + + Ok(match key_type { + UInt8 => compare_dict_primitive::(left, right), + UInt16 => compare_dict_primitive::(left, right), + UInt32 => compare_dict_primitive::(left, right), + UInt64 => compare_dict_primitive::(left, right), + Int8 => 
compare_dict_primitive::(left, right), + Int16 => compare_dict_primitive::(left, right), + Int32 => compare_dict_primitive::(left, right), + Int64 => compare_dict_primitive::(left, right), + t => { + return Err(ArrowError::InvalidArgumentError(format!( + "Dictionaries do not support keys of type {:?}", + t + ))); + } + }) +} + /// returns a comparison function that compares two values at two different positions /// between the two arrays. /// The arrays' types must be equal. @@ -195,32 +249,43 @@ pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result { - if value_type_lhs.as_ref() != &DataType::Utf8 - || value_type_rhs.as_ref() != &DataType::Utf8 - { + if key_type_lhs != key_type_rhs || value_type_lhs != value_type_rhs { return Err(ArrowError::InvalidArgumentError( - "Arrow still does not support comparisons of non-string dictionary arrays" - .to_string(), + "Can't compare arrays of different types".to_string(), )); } - match (key_type_lhs.as_ref(), key_type_rhs.as_ref()) { - (a, b) if a != b => { - return Err(ArrowError::InvalidArgumentError( - "Can't compare arrays of different types".to_string(), - )); - } - (UInt8, UInt8) => compare_dict_string::(left, right), - (UInt16, UInt16) => compare_dict_string::(left, right), - (UInt32, UInt32) => compare_dict_string::(left, right), - (UInt64, UInt64) => compare_dict_string::(left, right), - (Int8, Int8) => compare_dict_string::(left, right), - (Int16, Int16) => compare_dict_string::(left, right), - (Int32, Int32) => compare_dict_string::(left, right), - (Int64, Int64) => compare_dict_string::(left, right), - (lhs, _) => { + + let key_type_lhs = key_type_lhs.as_ref(); + + match value_type_lhs.as_ref() { + Int8 => cmp_dict_primitive::(key_type_lhs, left, right)?, + Int16 => cmp_dict_primitive::(key_type_lhs, left, right)?, + Int32 => cmp_dict_primitive::(key_type_lhs, left, right)?, + Int64 => cmp_dict_primitive::(key_type_lhs, left, right)?, + UInt8 => cmp_dict_primitive::(key_type_lhs, left, right)?, + UInt16 => cmp_dict_primitive::(key_type_lhs, left, right)?, + UInt32 => cmp_dict_primitive::(key_type_lhs, left, right)?, + UInt64 => cmp_dict_primitive::(key_type_lhs, left, right)?, + Utf8 => match key_type_lhs { + UInt8 => compare_dict_string::(left, right), + UInt16 => compare_dict_string::(left, right), + UInt32 => compare_dict_string::(left, right), + UInt64 => compare_dict_string::(left, right), + Int8 => compare_dict_string::(left, right), + Int16 => compare_dict_string::(left, right), + Int32 => compare_dict_string::(left, right), + Int64 => compare_dict_string::(left, right), + lhs => { + return Err(ArrowError::InvalidArgumentError(format!( + "Dictionaries do not support keys of type {:?}", + lhs + ))); + } + }, + t => { return Err(ArrowError::InvalidArgumentError(format!( - "Dictionaries do not support keys of type {:?}", - lhs + "Dictionaries of value data type {:?} are not supported", + t ))); } } @@ -339,4 +404,24 @@ pub mod tests { assert_eq!(Ordering::Greater, (cmp)(1, 3)); Ok(()) } + + #[test] + fn test_primitive_dict() -> Result<()> { + let values = Int32Array::from(vec![1_i32, 0, 2, 5]); + let keys = Int8Array::from_iter_values([0, 0, 1, 3]); + let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let values = Int32Array::from(vec![2_i32, 3, 4, 5]); + let keys = Int8Array::from_iter_values([0, 1, 1, 3]); + let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let cmp = build_compare(&array1, &array2)?; + + assert_eq!(Ordering::Less, (cmp)(0, 0)); + assert_eq!(Ordering::Less, 
(cmp)(0, 3)); + assert_eq!(Ordering::Equal, (cmp)(3, 3)); + assert_eq!(Ordering::Greater, (cmp)(3, 1)); + assert_eq!(Ordering::Greater, (cmp)(3, 2)); + Ok(()) + } } From 0b59f05cd412249cb51d257b4542075cd2b6efa4 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Thu, 8 Sep 2022 04:02:43 -0400 Subject: [PATCH 0022/1411] Skip RowSelectors with zero rows (#2678) * Skip RowSelectors with zero rows * include test for zero RowSelector fix Co-authored-by: srib --- parquet/src/arrow/arrow_reader/mod.rs | 6 +++ parquet/src/arrow/async_reader.rs | 69 +++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 76e247ae1f1f..02d11817ec5a 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -484,6 +484,12 @@ impl Iterator for ParquetRecordBatchReader { continue; } + //Currently, when RowSelectors with row_count = 0 are included then its interpreted as end of reader. + //Fix is to skip such entries. See https://github.com/apache/arrow-rs/issues/2669 + if front.row_count == 0 { + continue; + } + // try to read record let need_read = self.batch_size - read_records; let to_read = match front.row_count.checked_sub(need_read) { diff --git a/parquet/src/arrow/async_reader.rs b/parquet/src/arrow/async_reader.rs index 4d3b2732aa18..a77da8d6f5ff 100644 --- a/parquet/src/arrow/async_reader.rs +++ b/parquet/src/arrow/async_reader.rs @@ -1060,6 +1060,75 @@ mod tests { } } + #[tokio::test] + async fn test_async_reader_zero_row_selector() { + //See https://github.com/apache/arrow-rs/issues/2669 + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{}/alltypes_tiny_pages_plain.parquet", testdata); + let data = Bytes::from(std::fs::read(path).unwrap()); + + let metadata = parse_metadata(&data).unwrap(); + let metadata = Arc::new(metadata); + + assert_eq!(metadata.num_row_groups(), 1); + + let mut rand = thread_rng(); + + let mut expected_rows = 0; + let mut total_rows = 0; + let mut skip = false; + let mut selectors = vec![]; + + selectors.push(RowSelector { + row_count: 0, + skip: false, + }); + + while total_rows < 7300 { + let row_count: usize = rand.gen_range(1..100); + + let row_count = row_count.min(7300 - total_rows); + + selectors.push(RowSelector { row_count, skip }); + + total_rows += row_count; + if !skip { + expected_rows += row_count; + } + + skip = !skip; + } + + let selection = RowSelection::from(selectors); + + let async_reader = TestReader { + data: data.clone(), + metadata: metadata.clone(), + requests: Default::default(), + }; + + let options = ArrowReaderOptions::new().with_page_index(true); + let builder = + ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap(); + + let col_idx: usize = rand.gen_range(0..13); + let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![col_idx]); + + let stream = builder + .with_projection(mask.clone()) + .with_row_selection(selection.clone()) + .build() + .expect("building stream"); + + let async_batches: Vec<_> = stream.try_collect().await.unwrap(); + + let actual_rows: usize = async_batches.into_iter().map(|b| b.num_rows()).sum(); + + assert_eq!(actual_rows, expected_rows); + } + #[tokio::test] async fn test_row_filter() { let a = StringArray::from_iter_values(["a", "b", "b", "b", "c", "c"]); From 566ef3d66a441f4c4fb2ad6654c7039d1f58690d Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Thu, 8 Sep 2022 11:05:50 
+0200 Subject: [PATCH 0023/1411] Fix multiple part uploads at once making vector size inconsistent (#2681) * Fix multiple part uploads at once making vector size inconsistent * Calculate total_parts prior to resizing the vector --- object_store/src/multipart.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/src/multipart.rs b/object_store/src/multipart.rs index 1985d8694e50..102d8bedaa46 100644 --- a/object_store/src/multipart.rs +++ b/object_store/src/multipart.rs @@ -94,9 +94,9 @@ where if self.tasks.is_empty() { return Ok(()); } - let total_parts = self.completed_parts.len(); while let Poll::Ready(Some(res)) = self.tasks.poll_next_unpin(cx) { let (part_idx, part) = res?; + let total_parts = self.completed_parts.len(); self.completed_parts .resize(std::cmp::max(part_idx + 1, total_parts), None); self.completed_parts[part_idx] = Some(part); From dd58805b1c46691fcbe5b46412b2581ae3bd2a58 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 8 Sep 2022 12:20:03 +0100 Subject: [PATCH 0024/1411] Prepare object_store 0.5 release (#2682) * Prepare object_store 0.5 release * Review feedback --- object_store/.github_changelog_generator | 2 +- object_store/CHANGELOG-old.md | 71 +++++++++++++++++++ object_store/CHANGELOG.md | 63 +++++++--------- object_store/Cargo.toml | 2 +- object_store/dev/release/update_change_log.sh | 4 +- 5 files changed, 102 insertions(+), 40 deletions(-) create mode 100644 object_store/CHANGELOG-old.md diff --git a/object_store/.github_changelog_generator b/object_store/.github_changelog_generator index cbd8aa0c4b48..69b574ab0349 100644 --- a/object_store/.github_changelog_generator +++ b/object_store/.github_changelog_generator @@ -23,5 +23,5 @@ add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":[" # so that the component is shown associated with the issue issue-line-labels=object-store # skip non object_store issues -exclude-labels=development-process,invalid,arrow,parquet,arrow-flight +exclude-labels=development-process,invalid,arrow,parquet,arrow-flight,parquet-derive breaking_labels=api-change diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md new file mode 100644 index 000000000000..a6bda3ceef49 --- /dev/null +++ b/object_store/CHANGELOG-old.md @@ -0,0 +1,71 @@ + + +# Historical Changelog + +## [object_store_0.4.0](https://github.com/apache/arrow-rs/tree/object_store_0.4.0) (2022-08-10) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.3.0...object_store_0.4.0) + +**Implemented enhancements:** + +- Relax Path Validation to Allow Any Percent-Encoded Sequence [\#2355](https://github.com/apache/arrow-rs/issues/2355) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support get\_multi\_ranges in ObjectStore [\#2293](https://github.com/apache/arrow-rs/issues/2293) +- object\_store: Create explicit test for symlinks [\#2206](https://github.com/apache/arrow-rs/issues/2206) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Make builder style configuration for object stores [\#2203](https://github.com/apache/arrow-rs/issues/2203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Add example in the main documentation readme [\#2202](https://github.com/apache/arrow-rs/issues/2202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- Azure/S3 Storage Fails to Copy Blob 
with URL-encoded Path [\#2353](https://github.com/apache/arrow-rs/issues/2353) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Accessing a file with a percent-encoded name on the filesystem with ObjectStore LocalFileSystem [\#2349](https://github.com/apache/arrow-rs/issues/2349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Documentation updates:** + +- Improve `object_store crate` documentation [\#2260](https://github.com/apache/arrow-rs/pull/2260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) + +**Merged pull requests:** + +- Canonicalize filesystem paths in user-facing APIs \(\#2370\) [\#2371](https://github.com/apache/arrow-rs/pull/2371) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix object\_store lint [\#2367](https://github.com/apache/arrow-rs/pull/2367) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Relax path validation \(\#2355\) [\#2356](https://github.com/apache/arrow-rs/pull/2356) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix Copy from percent-encoded path \(\#2353\) [\#2354](https://github.com/apache/arrow-rs/pull/2354) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore::get\_ranges \(\#2293\) [\#2336](https://github.com/apache/arrow-rs/pull/2336) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Remove vestigal ` object_store/.circleci/` [\#2337](https://github.com/apache/arrow-rs/pull/2337) ([alamb](https://github.com/alamb)) +- Handle symlinks in LocalFileSystem \(\#2206\) [\#2269](https://github.com/apache/arrow-rs/pull/2269) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Retry GCP requests on server error [\#2243](https://github.com/apache/arrow-rs/pull/2243) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add LimitStore \(\#2175\) [\#2242](https://github.com/apache/arrow-rs/pull/2242) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Only trigger `arrow` CI on changes to arrow [\#2227](https://github.com/apache/arrow-rs/pull/2227) ([alamb](https://github.com/alamb)) +- Update instructions on how to join the Slack channel [\#2219](https://github.com/apache/arrow-rs/pull/2219) ([HaoYang670](https://github.com/HaoYang670)) +- Add Builder style config objects for object\_store [\#2204](https://github.com/apache/arrow-rs/pull/2204) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Ignore broken symlinks for LocalFileSystem object store [\#2195](https://github.com/apache/arrow-rs/pull/2195) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jccampagne](https://github.com/jccampagne)) +- Change CI names to match crate names [\#2189](https://github.com/apache/arrow-rs/pull/2189) ([alamb](https://github.com/alamb)) +- Split most arrow specific CI checks into their own workflows \(reduce common CI time to 21 minutes\) [\#2168](https://github.com/apache/arrow-rs/pull/2168) 
([alamb](https://github.com/alamb)) +- Remove another attempt to cache target directory in action.yaml [\#2167](https://github.com/apache/arrow-rs/pull/2167) ([alamb](https://github.com/alamb)) +- Run actions on push to master, pull requests [\#2166](https://github.com/apache/arrow-rs/pull/2166) ([alamb](https://github.com/alamb)) +- Break parquet\_derive and arrow\_flight tests into their own workflows [\#2165](https://github.com/apache/arrow-rs/pull/2165) ([alamb](https://github.com/alamb)) +- Only run integration tests when `arrow` changes [\#2152](https://github.com/apache/arrow-rs/pull/2152) ([alamb](https://github.com/alamb)) +- Break out docs CI job to its own github action [\#2151](https://github.com/apache/arrow-rs/pull/2151) ([alamb](https://github.com/alamb)) +- Do not pretend to cache rust build artifacts, speed up CI by ~20% [\#2150](https://github.com/apache/arrow-rs/pull/2150) ([alamb](https://github.com/alamb)) +- Port `object_store` integration tests, use github actions [\#2148](https://github.com/apache/arrow-rs/pull/2148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Port Add stream upload \(multi-part upload\) [\#2147](https://github.com/apache/arrow-rs/pull/2147) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Increase upper wait time to reduce flakyness of object store test [\#2142](https://github.com/apache/arrow-rs/pull/2142) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) + +\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* + diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index 93faa678ffa8..538eebf77c62 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,52 +19,43 @@ # Changelog -## [object_store_0.4.0](https://github.com/apache/arrow-rs/tree/object_store_0.4.0) (2022-08-10) +## [object_store_0.5.0](https://github.com/apache/arrow-rs/tree/object_store_0.5.0) (2022-09-08) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.3.0...object_store_0.4.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.4.0...object_store_0.5.0) + +**Breaking changes:** + +- Replace azure sdk with custom implementation [\#2509](https://github.com/apache/arrow-rs/pull/2509) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Replace rusoto with custom implementation for AWS \(\#2176\) [\#2352](https://github.com/apache/arrow-rs/pull/2352) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Relax Path Validation to Allow Any Percent-Encoded Sequence [\#2355](https://github.com/apache/arrow-rs/issues/2355) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support get\_multi\_ranges in ObjectStore [\#2293](https://github.com/apache/arrow-rs/issues/2293) -- object\_store: Create explicit test for symlinks [\#2206](https://github.com/apache/arrow-rs/issues/2206) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Make builder style configuration for object stores [\#2203](https://github.com/apache/arrow-rs/issues/2203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
-- object\_store: Add example in the main documentation readme [\#2202](https://github.com/apache/arrow-rs/issues/2202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- IMDSv1 Fallback for S3 [\#2609](https://github.com/apache/arrow-rs/issues/2609) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Print Response Body On Error [\#2572](https://github.com/apache/arrow-rs/issues/2572) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Coalesce Ranges Parallel Fetch [\#2562](https://github.com/apache/arrow-rs/issues/2562) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support Coalescing Out-of-Order Ranges [\#2561](https://github.com/apache/arrow-rs/issues/2561) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Add TokenProvider authorization to azure [\#2373](https://github.com/apache/arrow-rs/issues/2373) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- AmazonS3Builder::from\_env to populate credentials from environment [\#2361](https://github.com/apache/arrow-rs/issues/2361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- AmazonS3 Support IMDSv2 [\#2350](https://github.com/apache/arrow-rs/issues/2350) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- Azure/S3 Storage Fails to Copy Blob with URL-encoded Path [\#2353](https://github.com/apache/arrow-rs/issues/2353) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Accessing a file with a percent-encoded name on the filesystem with ObjectStore LocalFileSystem [\#2349](https://github.com/apache/arrow-rs/issues/2349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Retry Logic Fails to Retry Server Errors [\#2573](https://github.com/apache/arrow-rs/issues/2573) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Fix multiple part uploads at once making vector size inconsistent [\#2681](https://github.com/apache/arrow-rs/pull/2681) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([gruuya](https://github.com/gruuya)) +- Fix panic in `object_store::util::coalesce_ranges` [\#2554](https://github.com/apache/arrow-rs/pull/2554) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([thinkharderdev](https://github.com/thinkharderdev)) -**Documentation updates:** +**Merged pull requests:** -- Improve `object_store crate` documentation [\#2260](https://github.com/apache/arrow-rs/pull/2260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- update doc for object\_store copy\_if\_not\_exists [\#2653](https://github.com/apache/arrow-rs/pull/2653) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([JanKaul](https://github.com/JanKaul)) +- Update quick-xml 0.24 [\#2625](https://github.com/apache/arrow-rs/pull/2625) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add IMDSv1 fallback \(\#2609\) [\#2610](https://github.com/apache/arrow-rs/pull/2610) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- ObjectStore cleanup \(\#2587\) [\#2590](https://github.com/apache/arrow-rs/pull/2590) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([tustvold](https://github.com/tustvold)) +- Fix retry logic \(\#2573\) \(\#2572\) [\#2574](https://github.com/apache/arrow-rs/pull/2574) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Improve coalesce\_ranges \(\#2561\) \(\#2562\) [\#2563](https://github.com/apache/arrow-rs/pull/2563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update environment variable name for amazonS3builder in integration \(\#2550\) [\#2553](https://github.com/apache/arrow-rs/pull/2553) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([amrltqt](https://github.com/amrltqt)) +- Build AmazonS3builder from environment variables \(\#2361\) [\#2536](https://github.com/apache/arrow-rs/pull/2536) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([amrltqt](https://github.com/amrltqt)) +- feat: add token provider authorization to azure store [\#2374](https://github.com/apache/arrow-rs/pull/2374) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -**Merged pull requests:** -- Canonicalize filesystem paths in user-facing APIs \(\#2370\) [\#2371](https://github.com/apache/arrow-rs/pull/2371) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix object\_store lint [\#2367](https://github.com/apache/arrow-rs/pull/2367) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Relax path validation \(\#2355\) [\#2356](https://github.com/apache/arrow-rs/pull/2356) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix Copy from percent-encoded path \(\#2353\) [\#2354](https://github.com/apache/arrow-rs/pull/2354) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add ObjectStore::get\_ranges \(\#2293\) [\#2336](https://github.com/apache/arrow-rs/pull/2336) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Remove vestigal ` object_store/.circleci/` [\#2337](https://github.com/apache/arrow-rs/pull/2337) ([alamb](https://github.com/alamb)) -- Handle symlinks in LocalFileSystem \(\#2206\) [\#2269](https://github.com/apache/arrow-rs/pull/2269) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Retry GCP requests on server error [\#2243](https://github.com/apache/arrow-rs/pull/2243) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add LimitStore \(\#2175\) [\#2242](https://github.com/apache/arrow-rs/pull/2242) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Only trigger `arrow` CI on changes to arrow [\#2227](https://github.com/apache/arrow-rs/pull/2227) ([alamb](https://github.com/alamb)) -- Update instructions on how to join the Slack channel [\#2219](https://github.com/apache/arrow-rs/pull/2219) ([HaoYang670](https://github.com/HaoYang670)) -- Add Builder style config objects for object\_store [\#2204](https://github.com/apache/arrow-rs/pull/2204) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -- 
Ignore broken symlinks for LocalFileSystem object store [\#2195](https://github.com/apache/arrow-rs/pull/2195) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jccampagne](https://github.com/jccampagne)) -- Change CI names to match crate names [\#2189](https://github.com/apache/arrow-rs/pull/2189) ([alamb](https://github.com/alamb)) -- Split most arrow specific CI checks into their own workflows \(reduce common CI time to 21 minutes\) [\#2168](https://github.com/apache/arrow-rs/pull/2168) ([alamb](https://github.com/alamb)) -- Remove another attempt to cache target directory in action.yaml [\#2167](https://github.com/apache/arrow-rs/pull/2167) ([alamb](https://github.com/alamb)) -- Run actions on push to master, pull requests [\#2166](https://github.com/apache/arrow-rs/pull/2166) ([alamb](https://github.com/alamb)) -- Break parquet\_derive and arrow\_flight tests into their own workflows [\#2165](https://github.com/apache/arrow-rs/pull/2165) ([alamb](https://github.com/alamb)) -- Only run integration tests when `arrow` changes [\#2152](https://github.com/apache/arrow-rs/pull/2152) ([alamb](https://github.com/alamb)) -- Break out docs CI job to its own github action [\#2151](https://github.com/apache/arrow-rs/pull/2151) ([alamb](https://github.com/alamb)) -- Do not pretend to cache rust build artifacts, speed up CI by ~20% [\#2150](https://github.com/apache/arrow-rs/pull/2150) ([alamb](https://github.com/alamb)) -- Port `object_store` integration tests, use github actions [\#2148](https://github.com/apache/arrow-rs/pull/2148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -- Port Add stream upload \(multi-part upload\) [\#2147](https://github.com/apache/arrow-rs/pull/2147) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -- Increase upper wait time to reduce flakyness of object store test [\#2142](https://github.com/apache/arrow-rs/pull/2142) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index b0201e2af983..0f5b0fd680b6 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.4.0" +version = "0.5.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index ebd50df7ffc0..e737e044666b 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.3.0" -FUTURE_RELEASE="object_store_0.4.0" +SINCE_TAG="object_store_0.4.0" +FUTURE_RELEASE="object_store_0.5.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 326ea5e1d5272a566439c0d5dc44d083421066d8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 8 Sep 2022 16:55:36 +0100 Subject: [PATCH 0025/1411] Temporarily disable Golang integration tests re-enabled JS (#2689) --- .github/workflows/integration.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git 
a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 10a8e30212a9..242fc3d85ee6 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -77,13 +77,13 @@ jobs: run: conda run --no-capture-output ci/scripts/cpp_build.sh $PWD /build - name: Build C# run: conda run --no-capture-output ci/scripts/csharp_build.sh $PWD /build - - name: Build Go - run: conda run --no-capture-output ci/scripts/go_build.sh $PWD + # Temporarily disable Golang #2688 + # - name: Build Go + # run: conda run --no-capture-output ci/scripts/go_build.sh $PWD - name: Build Java run: conda run --no-capture-output ci/scripts/java_build.sh $PWD /build - # Temporarily disable JS https://issues.apache.org/jira/browse/ARROW-17410 - # - name: Build JS - # run: conda run --no-capture-output ci/scripts/js_build.sh $PWD /build + - name: Build JS + run: conda run --no-capture-output ci/scripts/js_build.sh $PWD /build - name: Install archery run: conda run --no-capture-output pip install -e dev/archery - name: Run integration tests @@ -93,8 +93,8 @@ jobs: --with-cpp=1 \ --with-csharp=1 \ --with-java=1 \ - --with-js=0 \ - --with-go=1 \ + --with-js=1 \ + --with-go=0 \ --with-rust=1 \ --gold-dirs=testing/data/arrow-ipc-stream/integration/0.14.1 \ --gold-dirs=testing/data/arrow-ipc-stream/integration/0.17.1 \ From 7a22ec0cf297f9c9d0c229bf3b699ec9f4edffc0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 8 Sep 2022 18:27:23 +0100 Subject: [PATCH 0026/1411] Deprecate RecordBatch::concat (#2594) (#2683) --- arrow/src/compute/kernels/concat.rs | 104 +++++++++++++++++++++++++++- arrow/src/record_batch.rs | 100 +------------------------- 2 files changed, 105 insertions(+), 99 deletions(-) diff --git a/arrow/src/compute/kernels/concat.rs b/arrow/src/compute/kernels/concat.rs index a7a3ffc782c6..df6436efe843 100644 --- a/arrow/src/compute/kernels/concat.rs +++ b/arrow/src/compute/kernels/concat.rs @@ -31,8 +31,9 @@ //! ``` use crate::array::*; -use crate::datatypes::DataType; +use crate::datatypes::{DataType, SchemaRef}; use crate::error::{ArrowError, Result}; +use crate::record_batch::RecordBatch; fn compute_str_values_length(arrays: &[&ArrayData]) -> usize { arrays @@ -102,6 +103,35 @@ pub fn concat(arrays: &[&dyn Array]) -> Result { Ok(make_array(mutable.freeze())) } +/// Concatenates `batches` together into a single record batch. 
+pub fn concat_batches(schema: &SchemaRef, batches: &[RecordBatch]) -> Result { + if batches.is_empty() { + return Ok(RecordBatch::new_empty(schema.clone())); + } + if let Some((i, _)) = batches + .iter() + .enumerate() + .find(|&(_, batch)| batch.schema() != *schema) + { + return Err(ArrowError::InvalidArgumentError(format!( + "batches[{}] schema is different with argument schema.", + i + ))); + } + let field_num = schema.fields().len(); + let mut arrays = Vec::with_capacity(field_num); + for i in 0..field_num { + let array = concat( + &batches + .iter() + .map(|batch| batch.column(i).as_ref()) + .collect::>(), + )?; + arrays.push(array); + } + RecordBatch::try_new(schema.clone(), arrays) +} + #[cfg(test)] mod tests { use super::*; @@ -569,4 +599,76 @@ mod tests { assert!(!copy.data().child_data()[0].ptr_eq(&combined.data().child_data()[0])); assert!(!new.data().child_data()[0].ptr_eq(&combined.data().child_data()[0])); } + + #[test] + fn concat_record_batches() { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Utf8, false), + ])); + let batch1 = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2])), + Arc::new(StringArray::from(vec!["a", "b"])), + ], + ) + .unwrap(); + let batch2 = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![3, 4])), + Arc::new(StringArray::from(vec!["c", "d"])), + ], + ) + .unwrap(); + let new_batch = RecordBatch::concat(&schema, &[batch1, batch2]).unwrap(); + assert_eq!(new_batch.schema().as_ref(), schema.as_ref()); + assert_eq!(2, new_batch.num_columns()); + assert_eq!(4, new_batch.num_rows()); + } + + #[test] + fn concat_empty_record_batch() { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Utf8, false), + ])); + let batch = RecordBatch::concat(&schema, &[]).unwrap(); + assert_eq!(batch.schema().as_ref(), schema.as_ref()); + assert_eq!(0, batch.num_rows()); + } + + #[test] + fn concat_record_batches_of_different_schemas() { + let schema1 = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Utf8, false), + ])); + let schema2 = Arc::new(Schema::new(vec![ + Field::new("c", DataType::Int32, false), + Field::new("d", DataType::Utf8, false), + ])); + let batch1 = RecordBatch::try_new( + schema1.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2])), + Arc::new(StringArray::from(vec!["a", "b"])), + ], + ) + .unwrap(); + let batch2 = RecordBatch::try_new( + schema2, + vec![ + Arc::new(Int32Array::from(vec![3, 4])), + Arc::new(StringArray::from(vec!["c", "d"])), + ], + ) + .unwrap(); + let error = RecordBatch::concat(&schema1, &[batch1, batch2]).unwrap_err(); + assert_eq!( + error.to_string(), + "Invalid argument error: batches[1] schema is different with argument schema.", + ); + } } diff --git a/arrow/src/record_batch.rs b/arrow/src/record_batch.rs index 47257b496c1b..d1db1f1a4c1f 100644 --- a/arrow/src/record_batch.rs +++ b/arrow/src/record_batch.rs @@ -21,7 +21,6 @@ use std::sync::Arc; use crate::array::*; -use crate::compute::kernels::concat::concat; use crate::datatypes::*; use crate::error::{ArrowError, Result}; @@ -390,32 +389,9 @@ impl RecordBatch { } /// Concatenates `batches` together into a single record batch. 
+ #[deprecated(note = "please use arrow::compute::concat_batches")] pub fn concat(schema: &SchemaRef, batches: &[Self]) -> Result { - if batches.is_empty() { - return Ok(RecordBatch::new_empty(schema.clone())); - } - if let Some((i, _)) = batches - .iter() - .enumerate() - .find(|&(_, batch)| batch.schema() != *schema) - { - return Err(ArrowError::InvalidArgumentError(format!( - "batches[{}] schema is different with argument schema.", - i - ))); - } - let field_num = schema.fields().len(); - let mut arrays = Vec::with_capacity(field_num); - for i in 0..field_num { - let array = concat( - &batches - .iter() - .map(|batch| batch.column(i).as_ref()) - .collect::>(), - )?; - arrays.push(array); - } - Self::try_new(schema.clone(), arrays) + crate::compute::concat_batches(schema, batches) } } @@ -713,78 +689,6 @@ mod tests { assert_eq!(batch.column(1).as_ref(), int.as_ref()); } - #[test] - fn concat_record_batches() { - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Utf8, false), - ])); - let batch1 = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(vec![1, 2])), - Arc::new(StringArray::from(vec!["a", "b"])), - ], - ) - .unwrap(); - let batch2 = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(vec![3, 4])), - Arc::new(StringArray::from(vec!["c", "d"])), - ], - ) - .unwrap(); - let new_batch = RecordBatch::concat(&schema, &[batch1, batch2]).unwrap(); - assert_eq!(new_batch.schema().as_ref(), schema.as_ref()); - assert_eq!(2, new_batch.num_columns()); - assert_eq!(4, new_batch.num_rows()); - } - - #[test] - fn concat_empty_record_batch() { - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Utf8, false), - ])); - let batch = RecordBatch::concat(&schema, &[]).unwrap(); - assert_eq!(batch.schema().as_ref(), schema.as_ref()); - assert_eq!(0, batch.num_rows()); - } - - #[test] - fn concat_record_batches_of_different_schemas() { - let schema1 = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Utf8, false), - ])); - let schema2 = Arc::new(Schema::new(vec![ - Field::new("c", DataType::Int32, false), - Field::new("d", DataType::Utf8, false), - ])); - let batch1 = RecordBatch::try_new( - schema1.clone(), - vec![ - Arc::new(Int32Array::from(vec![1, 2])), - Arc::new(StringArray::from(vec!["a", "b"])), - ], - ) - .unwrap(); - let batch2 = RecordBatch::try_new( - schema2, - vec![ - Arc::new(Int32Array::from(vec![3, 4])), - Arc::new(StringArray::from(vec!["c", "d"])), - ], - ) - .unwrap(); - let error = RecordBatch::concat(&schema1, &[batch1, batch2]).unwrap_err(); - assert_eq!( - error.to_string(), - "Invalid argument error: batches[1] schema is different with argument schema.", - ); - } - #[test] fn record_batch_equality() { let id_arr1 = Int32Array::from(vec![1, 2, 3, 4]); From 2ee09bb0d49ac73515158f0c684a6d05a2ec7cc6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 8 Sep 2022 19:01:34 +0100 Subject: [PATCH 0027/1411] Fix master (#2692) --- arrow/src/compute/kernels/concat.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/arrow/src/compute/kernels/concat.rs b/arrow/src/compute/kernels/concat.rs index df6436efe843..b6edf8c991cf 100644 --- a/arrow/src/compute/kernels/concat.rs +++ b/arrow/src/compute/kernels/concat.rs @@ -104,7 +104,10 @@ pub fn concat(arrays: &[&dyn Array]) -> Result { } 
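// A hedged migration sketch for the change in the preceding commit: record batches
// are now concatenated with the compute kernel arrow::compute::concat_batches
// instead of the deprecated RecordBatch::concat. Field names and values here are
// illustrative only.
use std::sync::Arc;
use arrow::array::{ArrayRef, Int32Array};
use arrow::compute::concat_batches;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

fn main() -> arrow::error::Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let col1: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
    let col2: ArrayRef = Arc::new(Int32Array::from(vec![3]));
    let b1 = RecordBatch::try_new(schema.clone(), vec![col1])?;
    let b2 = RecordBatch::try_new(schema.clone(), vec![col2])?;
    // Before: RecordBatch::concat(&schema, &[b1, b2]) (now deprecated).
    let combined = concat_batches(&schema, &[b1, b2])?;
    assert_eq!(combined.num_rows(), 3);
    Ok(())
}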
/// Concatenates `batches` together into a single record batch. -pub fn concat_batches(schema: &SchemaRef, batches: &[RecordBatch]) -> Result { +pub fn concat_batches( + schema: &SchemaRef, + batches: &[RecordBatch], +) -> Result { if batches.is_empty() { return Ok(RecordBatch::new_empty(schema.clone())); } @@ -613,7 +616,7 @@ mod tests { Arc::new(StringArray::from(vec!["a", "b"])), ], ) - .unwrap(); + .unwrap(); let batch2 = RecordBatch::try_new( schema.clone(), vec![ @@ -621,8 +624,8 @@ mod tests { Arc::new(StringArray::from(vec!["c", "d"])), ], ) - .unwrap(); - let new_batch = RecordBatch::concat(&schema, &[batch1, batch2]).unwrap(); + .unwrap(); + let new_batch = concat_batches(&schema, &[batch1, batch2]).unwrap(); assert_eq!(new_batch.schema().as_ref(), schema.as_ref()); assert_eq!(2, new_batch.num_columns()); assert_eq!(4, new_batch.num_rows()); @@ -634,7 +637,7 @@ mod tests { Field::new("a", DataType::Int32, false), Field::new("b", DataType::Utf8, false), ])); - let batch = RecordBatch::concat(&schema, &[]).unwrap(); + let batch = concat_batches(&schema, &[]).unwrap(); assert_eq!(batch.schema().as_ref(), schema.as_ref()); assert_eq!(0, batch.num_rows()); } @@ -656,7 +659,7 @@ mod tests { Arc::new(StringArray::from(vec!["a", "b"])), ], ) - .unwrap(); + .unwrap(); let batch2 = RecordBatch::try_new( schema2, vec![ @@ -664,8 +667,8 @@ mod tests { Arc::new(StringArray::from(vec!["c", "d"])), ], ) - .unwrap(); - let error = RecordBatch::concat(&schema1, &[batch1, batch2]).unwrap_err(); + .unwrap(); + let error = concat_batches(&schema1, &[batch1, batch2]).unwrap_err(); assert_eq!( error.to_string(), "Invalid argument error: batches[1] schema is different with argument schema.", From 5e2b4c7ba85a44791e6a72a908b2296be02dd019 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 8 Sep 2022 22:16:26 +0100 Subject: [PATCH 0028/1411] Simplify DictionaryBuilder constructors (#2684) (#2054) (#2685) * Simplify DictionaryBuilder constructors (#2684) (#2054) * Apply suggestions from code review Co-authored-by: Liang-Chi Hsieh Co-authored-by: Liang-Chi Hsieh --- arrow/benches/string_dictionary_builder.rs | 8 +-- arrow/src/array/array_dictionary.rs | 22 +++---- .../builder/primitive_dictionary_builder.rs | 47 ++++++++++----- .../builder/string_dictionary_builder.rs | 56 ++++++++++++------ arrow/src/array/equal/mod.rs | 8 +-- arrow/src/array/transform/mod.rs | 8 +-- arrow/src/compute/kernels/arithmetic.rs | 58 +++++-------------- arrow/src/compute/kernels/arity.rs | 12 +--- arrow/src/compute/kernels/cast.rs | 26 +++------ arrow/src/compute/kernels/comparison.rs | 54 ++++++----------- arrow/src/compute/kernels/take.rs | 5 +- arrow/src/json/reader.rs | 4 +- arrow/src/util/pretty.rs | 12 ++-- parquet/src/arrow/arrow_writer/mod.rs | 4 +- 14 files changed, 142 insertions(+), 182 deletions(-) diff --git a/arrow/benches/string_dictionary_builder.rs b/arrow/benches/string_dictionary_builder.rs index 1a3b95917207..411df3d69b52 100644 --- a/arrow/benches/string_dictionary_builder.rs +++ b/arrow/benches/string_dictionary_builder.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. 
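// A hedged usage sketch for "Simplify DictionaryBuilder constructors (#2684)" above:
// builders are now constructed directly, or with explicit capacities, instead of
// being assembled from separate key and value builders. The calls below mirror the
// doc examples in the hunks that follow; the capacities are illustrative.
use arrow::array::{PrimitiveDictionaryBuilder, StringDictionaryBuilder};
use arrow::datatypes::{Int32Type, UInt32Type, UInt8Type};

fn main() {
    // keys capacity, number of distinct values, total bytes of distinct strings
    let mut strings = StringDictionaryBuilder::<Int32Type>::with_capacity(10, 4, 64);
    strings.append("abc").unwrap();
    strings.append_null();
    strings.append("abc").unwrap(); // re-uses the existing dictionary entry
    println!("{:?}", strings.finish());

    let mut primitives = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::new();
    primitives.append(12345678).unwrap();
    primitives.append_null();
    println!("{:?}", primitives.finish());
}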
-use arrow::array::{Int32Builder, StringBuilder, StringDictionaryBuilder}; +use arrow::array::StringDictionaryBuilder; +use arrow::datatypes::Int32Type; use criterion::{criterion_group, criterion_main, Criterion}; use rand::{thread_rng, Rng}; @@ -43,12 +44,11 @@ fn criterion_benchmark(c: &mut Criterion) { |b| { let strings = build_strings(dict_size, total_size, key_len); b.iter(|| { - let keys = Int32Builder::with_capacity(strings.len()); - let values = StringBuilder::with_capacity( + let mut builder = StringDictionaryBuilder::::with_capacity( + strings.len(), key_len + 1, (key_len + 1) * dict_size, ); - let mut builder = StringDictionaryBuilder::new(keys, values); for val in &strings { builder.append(val).unwrap(); diff --git a/arrow/src/array/array_dictionary.rs b/arrow/src/array/array_dictionary.rs index 79f2969df688..acdb427a22ab 100644 --- a/arrow/src/array/array_dictionary.rs +++ b/arrow/src/array/array_dictionary.rs @@ -22,8 +22,8 @@ use std::iter::IntoIterator; use std::{convert::From, iter::FromIterator}; use super::{ - make_array, Array, ArrayData, ArrayRef, PrimitiveArray, PrimitiveBuilder, - StringArray, StringBuilder, StringDictionaryBuilder, + make_array, Array, ArrayData, ArrayRef, PrimitiveArray, StringArray, + StringDictionaryBuilder, }; use crate::datatypes::{ ArrowDictionaryKeyType, ArrowNativeType, ArrowPrimitiveType, DataType, @@ -329,9 +329,7 @@ impl<'a, T: ArrowDictionaryKeyType> FromIterator> for Dictionary fn from_iter>>(iter: I) -> Self { let it = iter.into_iter(); let (lower, _) = it.size_hint(); - let key_builder = PrimitiveBuilder::::with_capacity(lower); - let value_builder = StringBuilder::with_capacity(256, 1024); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); + let mut builder = StringDictionaryBuilder::with_capacity(lower, 256, 1024); it.for_each(|i| { if let Some(i) = i { // Note: impl ... 
for Result> fails with @@ -367,9 +365,7 @@ impl<'a, T: ArrowDictionaryKeyType> FromIterator<&'a str> for DictionaryArray fn from_iter>(iter: I) -> Self { let it = iter.into_iter(); let (lower, _) = it.size_hint(); - let key_builder = PrimitiveBuilder::::with_capacity(lower); - let value_builder = StringBuilder::with_capacity(256, 1024); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); + let mut builder = StringDictionaryBuilder::with_capacity(lower, 256, 1024); it.for_each(|i| { builder .append(i) @@ -589,9 +585,8 @@ mod tests { #[test] fn test_dictionary_array_fmt_debug() { - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(12345678).unwrap(); builder.append_null(); builder.append(22345678).unwrap(); @@ -601,9 +596,8 @@ mod tests { format!("{:?}", array) ); - let key_builder = PrimitiveBuilder::::with_capacity(20); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(20, 2); for _ in 0..20 { builder.append(1).unwrap(); } diff --git a/arrow/src/array/builder/primitive_dictionary_builder.rs b/arrow/src/array/builder/primitive_dictionary_builder.rs index 71223c688283..0fd41a181f55 100644 --- a/arrow/src/array/builder/primitive_dictionary_builder.rs +++ b/arrow/src/array/builder/primitive_dictionary_builder.rs @@ -60,9 +60,7 @@ impl Eq for Value {} /// }; /// use arrow::datatypes::{UInt8Type, UInt32Type}; /// -/// let key_builder = PrimitiveBuilder::::with_capacity(3); -/// let value_builder = PrimitiveBuilder::::with_capacity(2); -/// let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); +/// let mut builder = PrimitiveDictionaryBuilder::::new(); /// builder.append(12345678).unwrap(); /// builder.append_null(); /// builder.append(22345678).unwrap(); @@ -95,22 +93,41 @@ where map: HashMap, K::Native>, } +impl Default for PrimitiveDictionaryBuilder +where + K: ArrowPrimitiveType, + V: ArrowPrimitiveType, +{ + fn default() -> Self { + Self::new() + } +} + impl PrimitiveDictionaryBuilder where K: ArrowPrimitiveType, V: ArrowPrimitiveType, { - /// Creates a new `PrimitiveDictionaryBuilder` from a keys builder and a value builder. - pub fn new( - keys_builder: PrimitiveBuilder, - values_builder: PrimitiveBuilder, - ) -> Self { + /// Creates a new `PrimitiveDictionaryBuilder`. + pub fn new() -> Self { Self { - keys_builder, - values_builder, + keys_builder: PrimitiveBuilder::new(), + values_builder: PrimitiveBuilder::new(), map: HashMap::new(), } } + + /// Creates a new `PrimitiveDictionaryBuilder` with the provided capacities + /// + /// `keys_capacity`: the number of keys, i.e. length of array to build + /// `values_capacity`: the number of distinct dictionary values, i.e. 
size of dictionary + pub fn with_capacity(keys_capacity: usize, values_capacity: usize) -> Self { + Self { + keys_builder: PrimitiveBuilder::with_capacity(keys_capacity), + values_builder: PrimitiveBuilder::with_capacity(values_capacity), + map: HashMap::with_capacity(values_capacity), + } + } } impl ArrayBuilder for PrimitiveDictionaryBuilder @@ -211,9 +228,8 @@ mod tests { #[test] fn test_primitive_dictionary_builder() { - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(12345678).unwrap(); builder.append_null(); builder.append(22345678).unwrap(); @@ -239,9 +255,8 @@ mod tests { #[test] #[should_panic(expected = "DictionaryKeyOverflowError")] fn test_primitive_dictionary_overflow() { - let key_builder = PrimitiveBuilder::::with_capacity(257); - let value_builder = PrimitiveBuilder::::with_capacity(257); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(257, 257); // 256 unique keys. for i in 0..256 { builder.append(i + 1000).unwrap(); diff --git a/arrow/src/array/builder/string_dictionary_builder.rs b/arrow/src/array/builder/string_dictionary_builder.rs index 6ad4e9075524..3816e0be1ddb 100644 --- a/arrow/src/array/builder/string_dictionary_builder.rs +++ b/arrow/src/array/builder/string_dictionary_builder.rs @@ -42,9 +42,7 @@ use std::sync::Arc; /// // Create a dictionary array indexed by bytes whose values are Strings. /// // It can thus hold up to 256 distinct string values. /// -/// let key_builder = PrimitiveBuilder::::with_capacity(100); -/// let value_builder = StringBuilder::new(); -/// let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); +/// let mut builder = StringDictionaryBuilder::::new(); /// /// // The builder builds the dictionary value by value /// builder.append("abc").unwrap(); @@ -84,12 +82,23 @@ where values_builder: StringBuilder, } +impl Default for StringDictionaryBuilder +where + K: ArrowDictionaryKeyType, +{ + fn default() -> Self { + Self::new() + } +} + impl StringDictionaryBuilder where K: ArrowDictionaryKeyType, { - /// Creates a new `StringDictionaryBuilder` from a keys builder and a value builder. - pub fn new(keys_builder: PrimitiveBuilder, values_builder: StringBuilder) -> Self { + /// Creates a new `StringDictionaryBuilder` + pub fn new() -> Self { + let keys_builder = PrimitiveBuilder::new(); + let values_builder = StringBuilder::new(); Self { state: Default::default(), dedup: HashMap::with_capacity_and_hasher(keys_builder.capacity(), ()), @@ -98,7 +107,25 @@ where } } - /// Creates a new `StringDictionaryBuilder` from a keys builder and a dictionary + /// Creates a new `StringDictionaryBuilder` with the provided capacities + /// + /// `keys_capacity`: the number of keys, i.e. length of array to build + /// `value_capacity`: the number of distinct dictionary values, i.e. 
size of dictionary + /// `string_capacity`: the total number of bytes of all distinct strings in the dictionary + pub fn with_capacity( + keys_capacity: usize, + value_capacity: usize, + string_capacity: usize, + ) -> Self { + Self { + state: Default::default(), + dedup: Default::default(), + keys_builder: PrimitiveBuilder::with_capacity(keys_capacity), + values_builder: StringBuilder::with_capacity(value_capacity, string_capacity), + } + } + + /// Creates a new `StringDictionaryBuilder` from a keys capacity and a dictionary /// which is initialized with the given values. /// The indices of those dictionary values are used as keys. /// @@ -111,7 +138,7 @@ where /// /// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]); /// - /// let mut builder = StringDictionaryBuilder::new_with_dictionary(PrimitiveBuilder::::with_capacity(3), &dictionary_values).unwrap(); + /// let mut builder = StringDictionaryBuilder::new_with_dictionary(3, &dictionary_values).unwrap(); /// builder.append("def").unwrap(); /// builder.append_null(); /// builder.append("abc").unwrap(); @@ -123,7 +150,7 @@ where /// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)])); /// ``` pub fn new_with_dictionary( - keys_builder: PrimitiveBuilder, + keys_capacity: usize, dictionary_values: &StringArray, ) -> Result { let state = ahash::RandomState::default(); @@ -162,7 +189,7 @@ where Ok(Self { state, dedup, - keys_builder, + keys_builder: PrimitiveBuilder::with_capacity(keys_capacity), values_builder, }) } @@ -290,9 +317,7 @@ mod tests { #[test] fn test_string_dictionary_builder() { - let key_builder = PrimitiveBuilder::::with_capacity(5); - let value_builder = StringBuilder::new(); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); + let mut builder = StringDictionaryBuilder::::new(); builder.append("abc").unwrap(); builder.append_null(); builder.append("def").unwrap(); @@ -317,10 +342,8 @@ mod tests { fn test_string_dictionary_builder_with_existing_dictionary() { let dictionary = StringArray::from(vec![None, Some("def"), Some("abc")]); - let key_builder = PrimitiveBuilder::::with_capacity(6); let mut builder = - StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary) - .unwrap(); + StringDictionaryBuilder::new_with_dictionary(6, &dictionary).unwrap(); builder.append("abc").unwrap(); builder.append_null(); builder.append("def").unwrap(); @@ -349,9 +372,8 @@ mod tests { let dictionary: Vec> = vec![None]; let dictionary = StringArray::from(dictionary); - let key_builder = PrimitiveBuilder::::with_capacity(4); let mut builder = - StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary) + StringDictionaryBuilder::::new_with_dictionary(4, &dictionary) .unwrap(); builder.append("abc").unwrap(); builder.append_null(); diff --git a/arrow/src/array/equal/mod.rs b/arrow/src/array/equal/mod.rs index 34df0bda0b1f..52be64a3fa76 100644 --- a/arrow/src/array/equal/mod.rs +++ b/arrow/src/array/equal/mod.rs @@ -265,8 +265,8 @@ mod tests { use crate::array::{ array::Array, ArrayData, ArrayDataBuilder, ArrayRef, BooleanArray, FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, Int32Builder, - ListBuilder, NullArray, PrimitiveBuilder, StringArray, StringDictionaryBuilder, - StructArray, UnionBuilder, + ListBuilder, NullArray, StringArray, StringDictionaryBuilder, StructArray, + UnionBuilder, }; use crate::array::{GenericStringArray, Int32Array}; use crate::buffer::Buffer; @@ -1245,8 +1245,8 @@ mod tests { fn 
create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { let values = StringArray::from(values.to_vec()); - let mut builder = StringDictionaryBuilder::new_with_dictionary( - PrimitiveBuilder::::with_capacity(3), + let mut builder = StringDictionaryBuilder::::new_with_dictionary( + keys.len(), &values, ) .unwrap(); diff --git a/arrow/src/array/transform/mod.rs b/arrow/src/array/transform/mod.rs index 48859922a26e..29d4434aafaa 100644 --- a/arrow/src/array/transform/mod.rs +++ b/arrow/src/array/transform/mod.rs @@ -675,8 +675,8 @@ mod tests { array::{ Array, ArrayData, ArrayRef, BooleanArray, DictionaryArray, FixedSizeBinaryArray, Int16Array, Int16Type, Int32Array, Int64Array, - Int64Builder, ListBuilder, MapBuilder, NullArray, PrimitiveBuilder, - StringArray, StringDictionaryBuilder, StructArray, UInt8Array, + Int64Builder, ListBuilder, MapBuilder, NullArray, StringArray, + StringDictionaryBuilder, StructArray, UInt8Array, }, buffer::Buffer, datatypes::Field, @@ -963,8 +963,8 @@ mod tests { fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { let values = StringArray::from(values.to_vec()); - let mut builder = StringDictionaryBuilder::new_with_dictionary( - PrimitiveBuilder::::with_capacity(3), + let mut builder = StringDictionaryBuilder::::new_with_dictionary( + keys.len(), &values, ) .unwrap(); diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index eab4d2136aa1..9bf4b00c3132 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -1367,9 +1367,7 @@ mod tests { #[test] fn test_primitive_array_add_dyn_dict() { - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(5).unwrap(); builder.append(6).unwrap(); builder.append(7).unwrap(); @@ -1377,9 +1375,7 @@ mod tests { builder.append(9).unwrap(); let a = builder.finish(); - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(6).unwrap(); builder.append(7).unwrap(); builder.append(8).unwrap(); @@ -1408,9 +1404,7 @@ mod tests { assert!(c.is_null(3)); assert_eq!(10, c.value(4)); - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(5).unwrap(); builder.append_null(); builder.append(7).unwrap(); @@ -1451,9 +1445,7 @@ mod tests { #[test] fn test_primitive_array_subtract_dyn_dict() { - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(15).unwrap(); builder.append(8).unwrap(); builder.append(7).unwrap(); @@ -1461,9 +1453,7 @@ mod tests { builder.append(20).unwrap(); let a = builder.finish(); - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = 
PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(6).unwrap(); builder.append(7).unwrap(); builder.append(8).unwrap(); @@ -1492,9 +1482,7 @@ mod tests { assert!(c.is_null(3)); assert_eq!(8, c.value(4)); - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(5).unwrap(); builder.append_null(); builder.append(7).unwrap(); @@ -1535,9 +1523,7 @@ mod tests { #[test] fn test_primitive_array_multiply_dyn_dict() { - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(5).unwrap(); builder.append(6).unwrap(); builder.append(7).unwrap(); @@ -1545,9 +1531,7 @@ mod tests { builder.append(9).unwrap(); let a = builder.finish(); - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(6).unwrap(); builder.append(7).unwrap(); builder.append(8).unwrap(); @@ -1579,9 +1563,7 @@ mod tests { #[test] fn test_primitive_array_divide_dyn_dict() { - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(15).unwrap(); builder.append(6).unwrap(); builder.append(1).unwrap(); @@ -1589,9 +1571,7 @@ mod tests { builder.append(9).unwrap(); let a = builder.finish(); - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(5).unwrap(); builder.append(3).unwrap(); builder.append(1).unwrap(); @@ -1620,9 +1600,7 @@ mod tests { assert!(c.is_null(3)); assert_eq!(18, c.value(4)); - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(5).unwrap(); builder.append_null(); builder.append(7).unwrap(); @@ -1806,9 +1784,7 @@ mod tests { assert!(c.is_null(3)); assert_eq!(4, c.value(4)); - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(5).unwrap(); builder.append_null(); builder.append(7).unwrap(); @@ -2082,15 +2058,13 @@ mod tests { #[test] #[should_panic(expected = "DivideByZero")] fn test_primitive_array_divide_dyn_by_zero_dict() { - let key_builder = PrimitiveBuilder::::with_capacity(1); - let value_builder = PrimitiveBuilder::::with_capacity(1); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = + 
PrimitiveDictionaryBuilder::::with_capacity(1, 1); builder.append(15).unwrap(); let a = builder.finish(); - let key_builder = PrimitiveBuilder::::with_capacity(1); - let value_builder = PrimitiveBuilder::::with_capacity(1); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(1, 1); builder.append(0).unwrap(); let b = builder.finish(); diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index 89151c286343..1251baf52fd8 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -122,9 +122,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::array::{ - as_primitive_array, Float64Array, PrimitiveBuilder, PrimitiveDictionaryBuilder, - }; + use crate::array::{as_primitive_array, Float64Array, PrimitiveDictionaryBuilder}; use crate::datatypes::{Float64Type, Int32Type, Int8Type}; #[test] @@ -149,9 +147,7 @@ mod tests { #[test] fn test_unary_dict_and_unary_dyn() { - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(5).unwrap(); builder.append(6).unwrap(); builder.append(7).unwrap(); @@ -160,9 +156,7 @@ mod tests { builder.append(9).unwrap(); let dictionary_array = builder.finish(); - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(6).unwrap(); builder.append(7).unwrap(); builder.append(8).unwrap(); diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 71a4fcc955ba..7d67bffdf4ea 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -2563,9 +2563,8 @@ where .downcast_ref::>() .unwrap(); - let keys_builder = PrimitiveBuilder::::with_capacity(values.len()); - let values_builder = PrimitiveBuilder::::with_capacity(values.len()); - let mut b = PrimitiveDictionaryBuilder::new(keys_builder, values_builder); + let mut b = + PrimitiveDictionaryBuilder::::with_capacity(values.len(), values.len()); // copy each element one at a time for i in 0..values.len() { @@ -2589,10 +2588,7 @@ where { let cast_values = cast_with_options(array, &DataType::Utf8, cast_options)?; let values = cast_values.as_any().downcast_ref::().unwrap(); - - let keys_builder = PrimitiveBuilder::::with_capacity(values.len()); - let values_builder = StringBuilder::with_capacity(1024, values.len()); - let mut b = StringDictionaryBuilder::new(keys_builder, values_builder); + let mut b = StringDictionaryBuilder::::with_capacity(values.len(), 1024, 1024); // copy each element one at a time for i in 0..values.len() { @@ -5001,9 +4997,7 @@ mod tests { // FROM a dictionary with of Utf8 values use DataType::*; - let keys_builder = PrimitiveBuilder::::new(); - let values_builder = StringBuilder::new(); - let mut builder = StringDictionaryBuilder::new(keys_builder, values_builder); + let mut builder = StringDictionaryBuilder::::new(); builder.append("one").unwrap(); builder.append_null(); builder.append("three").unwrap(); @@ -5062,9 +5056,7 @@ mod tests { // that are out of bounds for a particular other kind of // index. 
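For reference, the migrated calling convention used throughout the hunks above: the key and value builders are no longer passed in, only the two Arrow types. A minimal sketch mirroring the updated `PrimitiveDictionaryBuilder` doc example (the capacities are illustrative):

```rust
use arrow::array::{Array, PrimitiveDictionaryBuilder};
use arrow::datatypes::{UInt32Type, UInt8Type};

fn main() {
    // with_capacity(keys_capacity, values_capacity): expected array length and
    // expected number of distinct dictionary values.
    let mut builder =
        PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
    builder.append(12345678).unwrap();
    builder.append_null();
    builder.append(22345678).unwrap();
    // Logically [12345678, null, 22345678], dictionary-encoded with u8 keys.
    let array = builder.finish();
    assert_eq!(array.len(), 3);
}
```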
- let keys_builder = PrimitiveBuilder::::new(); - let values_builder = PrimitiveBuilder::::new(); - let mut builder = PrimitiveDictionaryBuilder::new(keys_builder, values_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); // add 200 distinct values (which can be stored by a // dictionary indexed by int32, but not a dictionary indexed @@ -5093,9 +5085,7 @@ mod tests { // Same test as test_cast_dict_to_dict_bad_index_value but use // string values (and encode the expected behavior here); - let keys_builder = PrimitiveBuilder::::new(); - let values_builder = StringBuilder::new(); - let mut builder = StringDictionaryBuilder::new(keys_builder, values_builder); + let mut builder = StringDictionaryBuilder::::new(); // add 200 distinct values (which can be stored by a // dictionary indexed by int32, but not a dictionary indexed @@ -5124,9 +5114,7 @@ mod tests { // FROM a dictionary with of INT32 values use DataType::*; - let keys_builder = PrimitiveBuilder::::new(); - let values_builder = PrimitiveBuilder::::new(); - let mut builder = PrimitiveDictionaryBuilder::new(keys_builder, values_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(1).unwrap(); builder.append_null(); builder.append(3).unwrap(); diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 978a2d9f4d34..5a79c2e82df1 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -4884,9 +4884,8 @@ mod tests { #[test] fn test_eq_dyn_scalar_with_dict() { - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(123).unwrap(); builder.append_null(); builder.append(23).unwrap(); @@ -4928,9 +4927,8 @@ mod tests { #[test] fn test_lt_dyn_scalar_with_dict() { - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(123).unwrap(); builder.append_null(); builder.append(23).unwrap(); @@ -4971,9 +4969,8 @@ mod tests { } #[test] fn test_lt_eq_dyn_scalar_with_dict() { - let key_builder = PrimitiveBuilder::::new(); - let value_builder = PrimitiveBuilder::::new(); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(123).unwrap(); builder.append_null(); builder.append(23).unwrap(); @@ -5015,9 +5012,8 @@ mod tests { #[test] fn test_gt_dyn_scalar_with_dict() { - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(123).unwrap(); builder.append_null(); builder.append(23).unwrap(); @@ -5059,9 +5055,8 @@ mod tests { #[test] fn test_gt_eq_dyn_scalar_with_dict() { - let key_builder = PrimitiveBuilder::::new(); - let value_builder = PrimitiveBuilder::::new(); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(3, 2); 
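Similarly, `StringDictionaryBuilder::new_with_dictionary` now takes a key capacity rather than a pre-built keys builder. A small sketch following the updated doc example from the builder module above (same types and values as in that example):

```rust
use arrow::array::{Int16Array, StringArray, StringDictionaryBuilder};
use arrow::datatypes::Int16Type;

fn main() {
    // Seed the dictionary; the seeded values keep their positions as keys.
    let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]);
    let mut builder =
        StringDictionaryBuilder::<Int16Type>::new_with_dictionary(3, &dictionary_values)
            .unwrap();
    builder.append("def").unwrap();
    builder.append_null();
    builder.append("abc").unwrap();
    let array = builder.finish();
    // Appended strings resolve to the keys of the seeded dictionary entries.
    assert_eq!(array.keys(), &Int16Array::from(vec![Some(2), None, Some(1)]));
}
```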
builder.append(22).unwrap(); builder.append_null(); builder.append(23).unwrap(); @@ -5103,9 +5098,8 @@ mod tests { #[test] fn test_neq_dyn_scalar_with_dict() { - let key_builder = PrimitiveBuilder::::new(); - let value_builder = PrimitiveBuilder::::new(); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(22).unwrap(); builder.append_null(); builder.append(23).unwrap(); @@ -5247,9 +5241,7 @@ mod tests { #[test] fn test_eq_dyn_utf8_scalar_with_dict() { - let key_builder = PrimitiveBuilder::::new(); - let value_builder = StringBuilder::new(); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); + let mut builder = StringDictionaryBuilder::::new(); builder.append("abc").unwrap(); builder.append_null(); builder.append("def").unwrap(); @@ -5275,9 +5267,7 @@ mod tests { } #[test] fn test_lt_dyn_utf8_scalar_with_dict() { - let key_builder = PrimitiveBuilder::::new(); - let value_builder = StringBuilder::new(); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); + let mut builder = StringDictionaryBuilder::::new(); builder.append("abc").unwrap(); builder.append_null(); builder.append("def").unwrap(); @@ -5304,9 +5294,7 @@ mod tests { } #[test] fn test_lt_eq_dyn_utf8_scalar_with_dict() { - let key_builder = PrimitiveBuilder::::new(); - let value_builder = StringBuilder::new(); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); + let mut builder = StringDictionaryBuilder::::new(); builder.append("abc").unwrap(); builder.append_null(); builder.append("def").unwrap(); @@ -5333,9 +5321,7 @@ mod tests { } #[test] fn test_gt_eq_dyn_utf8_scalar_with_dict() { - let key_builder = PrimitiveBuilder::::new(); - let value_builder = StringBuilder::new(); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); + let mut builder = StringDictionaryBuilder::::new(); builder.append("abc").unwrap(); builder.append_null(); builder.append("def").unwrap(); @@ -5363,9 +5349,7 @@ mod tests { #[test] fn test_gt_dyn_utf8_scalar_with_dict() { - let key_builder = PrimitiveBuilder::::new(); - let value_builder = StringBuilder::new(); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); + let mut builder = StringDictionaryBuilder::::new(); builder.append("abc").unwrap(); builder.append_null(); builder.append("def").unwrap(); @@ -5392,9 +5376,7 @@ mod tests { } #[test] fn test_neq_dyn_utf8_scalar_with_dict() { - let key_builder = PrimitiveBuilder::::new(); - let value_builder = StringBuilder::new(); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); + let mut builder = StringDictionaryBuilder::::new(); builder.append("abc").unwrap(); builder.append_null(); builder.append("def").unwrap(); diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 19eb1b17ca21..8f1aab27b534 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -1960,10 +1960,7 @@ mod tests { #[test] fn test_take_dict() { - let keys_builder = Int16Builder::new(); - let values_builder = StringBuilder::new(); - - let mut dict_builder = StringDictionaryBuilder::new(keys_builder, values_builder); + let mut dict_builder = StringDictionaryBuilder::::new(); dict_builder.append("foo").unwrap(); dict_builder.append("bar").unwrap(); diff --git a/arrow/src/json/reader.rs b/arrow/src/json/reader.rs index fb8f6cfab477..c32e5ca18488 100644 --- 
a/arrow/src/json/reader.rs +++ b/arrow/src/json/reader.rs @@ -914,9 +914,7 @@ impl Decoder { where T: ArrowPrimitiveType + ArrowDictionaryKeyType, { - let key_builder = PrimitiveBuilder::::with_capacity(row_len); - let values_builder = StringBuilder::with_capacity(row_len, row_len * 5); - StringDictionaryBuilder::new(key_builder, values_builder) + StringDictionaryBuilder::with_capacity(row_len, row_len, row_len * 5) } #[inline(always)] diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs index b0013619b50c..f819e389f96e 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow/src/util/pretty.rs @@ -108,10 +108,10 @@ mod tests { use crate::{ array::{ self, new_null_array, Array, Date32Array, Date64Array, - FixedSizeBinaryBuilder, Float16Array, Int32Array, PrimitiveBuilder, - StringArray, StringBuilder, StringDictionaryBuilder, StructArray, - Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, - Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, + FixedSizeBinaryBuilder, Float16Array, Int32Array, StringArray, + StringDictionaryBuilder, StructArray, Time32MillisecondArray, + Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UnionArray, UnionBuilder, }, buffer::Buffer, @@ -241,9 +241,7 @@ mod tests { DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); let schema = Arc::new(Schema::new(vec![Field::new("d1", field_type, true)])); - let keys_builder = PrimitiveBuilder::::with_capacity(10); - let values_builder = StringBuilder::new(); - let mut builder = StringDictionaryBuilder::new(keys_builder, values_builder); + let mut builder = StringDictionaryBuilder::::new(); builder.append("one")?; builder.append_null(); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 6f9d5b3aff81..1fef695dc47f 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1653,9 +1653,7 @@ mod tests { )])); // create some data - let key_builder = PrimitiveBuilder::::with_capacity(3); - let value_builder = PrimitiveBuilder::::with_capacity(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); + let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(12345678).unwrap(); builder.append_null(); builder.append(22345678).unwrap(); From 5ccf73ea596e25f601aeabc10029254cc18a55e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Fri, 9 Sep 2022 04:50:22 +0200 Subject: [PATCH 0029/1411] Add support for empty projection in RecordBatch::project (#2691) * Add support for empty projection in RecordBatch::project * Simplify --- arrow/src/record_batch.rs | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/arrow/src/record_batch.rs b/arrow/src/record_batch.rs index d1db1f1a4c1f..4b0d36a43e5b 100644 --- a/arrow/src/record_batch.rs +++ b/arrow/src/record_batch.rs @@ -212,7 +212,14 @@ impl RecordBatch { }) .collect::>>()?; - RecordBatch::try_new(SchemaRef::new(projected_schema), batch_fields) + RecordBatch::try_new_with_options( + SchemaRef::new(projected_schema), + batch_fields, + &RecordBatchOptions { + match_field_names: true, + row_count: Some(self.row_count), + }, + ) } /// Returns the number of columns in the record batch. 
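The switch to `try_new_with_options` matters because a projection onto zero columns leaves nothing to infer the row count from; passing `row_count` explicitly preserves it. A small usage sketch of the behaviour this enables, mirroring the `project_empty` test below:

```rust
use std::sync::Arc;
use arrow::array::{ArrayRef, StringArray};
use arrow::record_batch::RecordBatch;

fn main() {
    let c: ArrayRef = Arc::new(StringArray::from(vec!["d", "e", "f"]));
    let batch = RecordBatch::try_from_iter(vec![("c", c)]).unwrap();

    // An empty projection now succeeds and keeps the original row count.
    let projected = batch.project(&[]).unwrap();
    assert_eq!(projected.num_columns(), 0);
    assert_eq!(projected.num_rows(), 3);
}
```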
@@ -865,6 +872,26 @@ mod tests { assert_eq!(expected, record_batch.project(&[0, 2]).unwrap()); } + #[test] + fn project_empty() { + let c: ArrayRef = Arc::new(StringArray::from(vec!["d", "e", "f"])); + + let record_batch = + RecordBatch::try_from_iter(vec![("c", c.clone())]).expect("valid conversion"); + + let expected = RecordBatch::try_new_with_options( + Arc::new(Schema::empty()), + vec![], + &RecordBatchOptions { + match_field_names: true, + row_count: Some(3), + }, + ) + .expect("valid conversion"); + + assert_eq!(expected, record_batch.project(&[]).unwrap()); + } + #[test] fn test_no_column_record_batch() { let schema = Arc::new(Schema::new(vec![])); From 04bd39521e264693d38048059c372b0712cc87a2 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 10 Sep 2022 00:22:38 -0700 Subject: [PATCH 0030/1411] Support sorting dictionary encoded primitive integer arrays (#2680) * Support sorting dictionary encoded primitive arrays * Reduce combinatorial fanout * Change from &SortOptions to SortOptions * Fix value order and add a test * Fix null ordering and add test * Add comment and increase test coverage. --- arrow/src/compute/kernels/sort.rs | 405 ++++++++++++++++++++++++++---- 1 file changed, 363 insertions(+), 42 deletions(-) diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 0e2273e92525..7a2d47786af0 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -21,8 +21,10 @@ use crate::array::*; use crate::buffer::MutableBuffer; use crate::compute::take; use crate::datatypes::*; +use crate::downcast_dictionary_array; use crate::error::{ArrowError, Result}; use std::cmp::Ordering; +use std::collections::HashMap; use TimeUnit::*; /// Sort the `ArrayRef` using `SortOptions`. @@ -311,41 +313,121 @@ pub fn sort_to_indices( ))); } }, - DataType::Dictionary(key_type, value_type) - if *value_type.as_ref() == DataType::Utf8 => - { - match key_type.as_ref() { - DataType::Int8 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::Int16 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::Int32 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::Int64 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::UInt8 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::UInt16 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::UInt32 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::UInt64 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - t => { - return Err(ArrowError::ComputeError(format!( - "Sort not supported for dictionary key type {:?}", - t - ))); - } - } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + values => match values.values().data_type() { + DataType::Int8 => { + let dict_values = values.values(); + let value_null_first = if options.descending { + // When sorting dictionary in descending order, we take inverse of of null ordering + // when sorting the values. Because if `nulls_first` is true, null must be in front + // of non-null value. As we take the sorted order of value array to sort dictionary + // keys, these null values will be treated as smallest ones and be sorted to the end + // of sorted result. So we set `nulls_first` to false when sorting dictionary value + // array to make them as largest ones, then null values will be put at the beginning + // of sorted dictionary result. 
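Taken together, the per-key-type arms above make dictionary-encoded primitive arrays sortable through the ordinary `sort` kernel. A minimal sketch of the resulting behaviour, reusing the key/value layout from `test_sort_int8_dicts` further down:

```rust
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, DictionaryArray, Int8Array};
use arrow::compute::kernels::sort::{sort, SortOptions};
use arrow::datatypes::Int8Type;

fn main() {
    // Keys index into the values [1, 3, 5]; two keys are null.
    let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]);
    let values = Int8Array::from(vec![1, 3, 5]);
    let dict = DictionaryArray::<Int8Type>::try_new(&keys, &values).unwrap();

    // Descending with nulls_first = false: logically [5, 5, 3, 1, null, null].
    let sorted = sort(
        &(Arc::new(dict) as ArrayRef),
        Some(SortOptions {
            descending: true,
            nulls_first: false,
        }),
    )
    .unwrap();
    assert_eq!(sorted.len(), 6);
}
```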
+ !options.nulls_first + } else { + options.nulls_first + }; + let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); + let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; + let value_indices_map = prepare_indices_map(&sorted_value_indices); + sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp) + }, + DataType::Int16 => { + let dict_values = values.values(); + let value_null_first = if options.descending { + !options.nulls_first + } else { + options.nulls_first + }; + let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); + let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; + let value_indices_map = prepare_indices_map(&sorted_value_indices); + sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp) + }, + DataType::Int32 => { + let dict_values = values.values(); + let value_null_first = if options.descending { + !options.nulls_first + } else { + options.nulls_first + }; + let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); + let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; + let value_indices_map = prepare_indices_map(&sorted_value_indices); + sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp) + }, + DataType::Int64 => { + let dict_values = values.values(); + let value_null_first = if options.descending { + !options.nulls_first + } else { + options.nulls_first + }; + let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); + let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; + let value_indices_map = prepare_indices_map(&sorted_value_indices); + sort_primitive_dictionary::<_, _>(values, &value_indices_map,v, n, options, limit, cmp) + }, + DataType::UInt8 => { + let dict_values = values.values(); + let value_null_first = if options.descending { + !options.nulls_first + } else { + options.nulls_first + }; + let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); + let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; + let value_indices_map = prepare_indices_map(&sorted_value_indices); + sort_primitive_dictionary::<_, _>(values, &value_indices_map,v, n, options, limit, cmp) + }, + DataType::UInt16 => { + let dict_values = values.values(); + let value_null_first = if options.descending { + !options.nulls_first + } else { + options.nulls_first + }; + let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); + let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; + let value_indices_map = prepare_indices_map(&sorted_value_indices); + sort_primitive_dictionary::<_, _>(values, &value_indices_map,v, n, options, limit, cmp) + }, + DataType::UInt32 => { + let dict_values = values.values(); + let value_null_first = if options.descending { + !options.nulls_first + } else { + options.nulls_first + }; + let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); + let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; + let value_indices_map = prepare_indices_map(&sorted_value_indices); + sort_primitive_dictionary::<_, _>(values, &value_indices_map,v, n, options, limit, cmp) + }, + DataType::UInt64 => { + let dict_values = values.values(); + let value_null_first = if 
options.descending { + !options.nulls_first + } else { + options.nulls_first + }; + let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); + let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; + let value_indices_map = prepare_indices_map(&sorted_value_indices); + sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp) + }, + DataType::Utf8 => sort_string_dictionary::<_>(values, v, n, &options, limit), + t => return Err(ArrowError::ComputeError(format!( + "Unsupported dictionary value type {}", t + ))), + }, + t => return Err(ArrowError::ComputeError(format!( + "Unsupported datatype {}", t + ))), + ) } DataType::Binary | DataType::FixedSizeBinary(_) => { sort_binary::(values, v, n, &options, limit) @@ -489,7 +571,14 @@ where .into_iter() .map(|index| (index, decimal_array.value(index as usize).as_i128())) .collect::>(); - sort_primitive_inner(decimal_values, null_indices, cmp, options, limit, valids) + sort_primitive_inner( + decimal_values.len(), + null_indices, + cmp, + options, + limit, + valids, + ) } /// Sort primitive values @@ -514,12 +603,55 @@ where .map(|index| (index, values.value(index as usize))) .collect::>() }; - sort_primitive_inner(values, null_indices, cmp, options, limit, valids) + sort_primitive_inner(values.len(), null_indices, cmp, options, limit, valids) +} + +/// A helper function used to convert sorted value indices to a map that we can look up sorted order +/// for a value index later. +fn prepare_indices_map(sorted_value_indices: &UInt32Array) -> HashMap { + sorted_value_indices + .into_iter() + .enumerate() + .map(|(idx, index)| { + // Indices don't have None value + let index = index.unwrap(); + (index as usize, idx as u32) + }) + .collect::>() +} + +/// Sort dictionary encoded primitive values +fn sort_primitive_dictionary( + values: &DictionaryArray, + value_indices_map: &HashMap, + value_indices: Vec, + null_indices: Vec, + options: SortOptions, + limit: Option, + cmp: F, +) -> UInt32Array +where + K: ArrowDictionaryKeyType, + F: Fn(u32, u32) -> std::cmp::Ordering, +{ + let keys: &PrimitiveArray = values.keys(); + + // create tuples that are used for sorting + let valids = value_indices + .into_iter() + .map(|index| { + let key: K::Native = keys.value(index as usize); + let value_order = value_indices_map.get(&key.to_usize().unwrap()).unwrap(); + (index, *value_order) + }) + .collect::>(); + + sort_primitive_inner::<_, _>(keys.len(), null_indices, cmp, &options, limit, valids) } // sort is instantiated a lot so we only compile this inner version for each native type fn sort_primitive_inner( - values: &ArrayRef, + value_len: usize, null_indices: Vec, cmp: F, options: &SortOptions, @@ -535,7 +667,7 @@ where let valids_len = valids.len(); let nulls_len = nulls.len(); - let mut len = values.len(); + let mut len = value_len; if let Some(limit) = limit { len = limit.min(len); @@ -620,14 +752,12 @@ fn sort_string( /// Sort dictionary encoded strings fn sort_string_dictionary( - values: &ArrayRef, + values: &DictionaryArray, value_indices: Vec, null_indices: Vec, options: &SortOptions, limit: Option, ) -> UInt32Array { - let values: &DictionaryArray = as_dictionary_array::(values); - let keys: &PrimitiveArray = values.keys(); let dict = values.values(); @@ -1239,6 +1369,59 @@ mod tests { assert_eq!(sorted_strings, expected) } + fn test_sort_primitive_dict_arrays( + keys: PrimitiveArray, + values: PrimitiveArray, + options: Option, + limit: Option, + expected_data: Vec>, 
+ ) where + PrimitiveArray: From>>, + { + let array = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array_values = array.values().clone(); + let dict = array_values + .as_any() + .downcast_ref::>() + .expect("Unable to get dictionary values"); + + let sorted = match limit { + Some(_) => { + sort_limit(&(Arc::new(array) as ArrayRef), options, limit).unwrap() + } + _ => sort(&(Arc::new(array) as ArrayRef), options).unwrap(), + }; + let sorted = sorted + .as_any() + .downcast_ref::>() + .unwrap(); + let sorted_values = sorted.values(); + let sorted_dict = sorted_values + .as_any() + .downcast_ref::>() + .expect("Unable to get dictionary values"); + let sorted_keys = sorted.keys(); + + assert_eq!(sorted_dict, dict); + + let sorted_values: PrimitiveArray = From::>>::from( + (0..sorted.len()) + .map(|i| { + let key = sorted_keys.value(i).to_usize().unwrap(); + if sorted.is_valid(i) && sorted_dict.is_valid(key) { + Some(sorted_dict.value(key)) + } else { + None + } + }) + .collect::>>(), + ); + let expected: PrimitiveArray = + From::>>::from(expected_data); + + assert_eq!(sorted_values, expected) + } + fn test_sort_list_arrays( data: Vec>>>, options: Option, @@ -3222,4 +3405,142 @@ mod tests { partial_sort(&mut before, last, |a, b| a.cmp(b)); assert_eq!(&d[0..last], &before[0..last]); } + + #[test] + fn test_sort_int8_dicts() { + let keys = + Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let values = Int8Array::from(vec![1, 3, 5]); + test_sort_primitive_dict_arrays::( + keys, + values, + None, + None, + vec![None, None, Some(1), Some(3), Some(5), Some(5)], + ); + + let keys = + Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let values = Int8Array::from(vec![1, 3, 5]); + test_sort_primitive_dict_arrays::( + keys, + values, + Some(SortOptions { + descending: true, + nulls_first: false, + }), + None, + vec![Some(5), Some(5), Some(3), Some(1), None, None], + ); + + let keys = + Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let values = Int8Array::from(vec![1, 3, 5]); + test_sort_primitive_dict_arrays::( + keys, + values, + Some(SortOptions { + descending: false, + nulls_first: false, + }), + None, + vec![Some(1), Some(3), Some(5), Some(5), None, None], + ); + + let keys = + Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let values = Int8Array::from(vec![1, 3, 5]); + test_sort_primitive_dict_arrays::( + keys, + values, + Some(SortOptions { + descending: true, + nulls_first: true, + }), + Some(3), + vec![None, None, Some(5)], + ); + + // Values have `None`. 
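One detail worth calling out before the null-value cases below: `sort_primitive_dictionary` never compares dictionary values directly while ordering keys; it compares the ranks produced by `prepare_indices_map`. A tiny standalone sketch of that inversion (the indices are made up):

```rust
use std::collections::HashMap;

fn main() {
    // Suppose sorting the dictionary values yielded this index order,
    // i.e. values[2] <= values[0] <= values[1].
    let sorted_value_indices: Vec<u32> = vec![2, 0, 1];

    // Invert it into "value index -> rank", as prepare_indices_map does.
    let rank: HashMap<usize, u32> = sorted_value_indices
        .iter()
        .enumerate()
        .map(|(rank, &idx)| (idx as usize, rank as u32))
        .collect();

    assert_eq!(rank[&2], 0); // values[2] sorts first
    assert_eq!(rank[&1], 2); // values[1] sorts last
}
```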
+ let keys = Int8Array::from(vec![ + Some(1_i8), + None, + Some(3), + None, + Some(2), + Some(3), + Some(0), + ]); + let values = Int8Array::from(vec![Some(1), Some(3), None, Some(5)]); + test_sort_primitive_dict_arrays::( + keys, + values, + None, + None, + vec![None, None, None, Some(1), Some(3), Some(5), Some(5)], + ); + + let keys = Int8Array::from(vec![ + Some(1_i8), + None, + Some(3), + None, + Some(2), + Some(3), + Some(0), + ]); + let values = Int8Array::from(vec![Some(1), Some(3), None, Some(5)]); + test_sort_primitive_dict_arrays::( + keys, + values, + Some(SortOptions { + descending: false, + nulls_first: false, + }), + None, + vec![Some(1), Some(3), Some(5), Some(5), None, None, None], + ); + + let keys = Int8Array::from(vec![ + Some(1_i8), + None, + Some(3), + None, + Some(2), + Some(3), + Some(0), + ]); + let values = Int8Array::from(vec![Some(1), Some(3), None, Some(5)]); + test_sort_primitive_dict_arrays::( + keys, + values, + Some(SortOptions { + descending: true, + nulls_first: false, + }), + None, + vec![Some(5), Some(5), Some(3), Some(1), None, None, None], + ); + + let keys = Int8Array::from(vec![ + Some(1_i8), + None, + Some(3), + None, + Some(2), + Some(3), + Some(0), + ]); + let values = Int8Array::from(vec![Some(1), Some(3), None, Some(5)]); + test_sort_primitive_dict_arrays::( + keys, + values, + Some(SortOptions { + descending: true, + nulls_first: true, + }), + None, + vec![None, None, None, Some(5), Some(5), Some(3), Some(1)], + ); + } } From 41e0187883330fe9a4b45979feca5b519e37723f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 10 Sep 2022 18:55:46 +0100 Subject: [PATCH 0031/1411] Update quick-xml to 0.25 (#2695) --- object_store/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 0f5b0fd680b6..9e4e68d59119 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -44,7 +44,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.13", default-features = false, optional = true } -quick-xml = { version = "0.24.0", features = ["serialize"], optional = true } +quick-xml = { version = "0.25.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From a1d24e4d86a8c7659cd259b9f38567cfabefd14c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 10 Sep 2022 21:27:37 +0100 Subject: [PATCH 0032/1411] Comparable Row Format (#2593) * Add row format * Skip miri on heavier tests * Handle nulls in dictionary values * Don't fuzz test dictionaries with null values * Add docs * Add error plumbing * Review feedback * Fix docs --- arrow/Cargo.toml | 5 + arrow/benches/row_format.rs | 114 ++++ arrow/src/compute/kernels/sort.rs | 6 +- arrow/src/lib.rs | 1 + arrow/src/row/fixed.rs | 160 ++++++ arrow/src/row/interner.rs | 451 +++++++++++++++ arrow/src/row/mod.rs | 893 ++++++++++++++++++++++++++++++ arrow/src/row/variable.rs | 107 ++++ 8 files changed, 1734 insertions(+), 3 deletions(-) create mode 100644 arrow/benches/row_format.rs create mode 100644 arrow/src/row/fixed.rs create mode 100644 arrow/src/row/interner.rs create mode 100644 arrow/src/row/mod.rs create mode 100644 arrow/src/row/variable.rs diff --git 
a/arrow/Cargo.toml b/arrow/Cargo.toml index cedd48e4d313..1b2bb6fd775b 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -233,3 +233,8 @@ harness = false [[bench]] name = "decimal_validate" harness = false + +[[bench]] +name = "row_format" +harness = false +required-features = ["test_utils"] diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs new file mode 100644 index 000000000000..2802aa6ece0b --- /dev/null +++ b/arrow/benches/row_format.rs @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[macro_use] +extern crate criterion; +extern crate core; + +use arrow::array::ArrayRef; +use arrow::datatypes::{DataType, Int64Type, UInt64Type}; +use arrow::row::{RowConverter, SortField}; +use arrow::util::bench_util::{create_primitive_array, create_string_array_with_len}; +use criterion::{black_box, Criterion}; +use std::sync::Arc; + +fn row_bench(c: &mut Criterion) { + let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; + + c.bench_function("row_batch 4096 u64(0)", |b| { + b.iter(|| { + let mut converter = RowConverter::new(vec![SortField::new(DataType::UInt64)]); + black_box(converter.convert_columns(&cols)) + }); + }); + + let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; + + c.bench_function("row_batch 4096 i64(0)", |b| { + b.iter(|| { + let mut converter = RowConverter::new(vec![SortField::new(DataType::Int64)]); + black_box(converter.convert_columns(&cols)) + }); + }); + + let cols = + vec![Arc::new(create_string_array_with_len::(4096, 0., 10)) as ArrayRef]; + + c.bench_function("row_batch 4096 string(10, 0)", |b| { + b.iter(|| { + let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); + black_box(converter.convert_columns(&cols)) + }); + }); + + let cols = + vec![Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef]; + + c.bench_function("row_batch 4096 string(30, 0)", |b| { + b.iter(|| { + let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); + black_box(converter.convert_columns(&cols)) + }); + }); + + let cols = + vec![Arc::new(create_string_array_with_len::(4096, 0., 100)) as ArrayRef]; + + c.bench_function("row_batch 4096 string(100, 0)", |b| { + b.iter(|| { + let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); + black_box(converter.convert_columns(&cols)) + }); + }); + + let cols = + vec![Arc::new(create_string_array_with_len::(4096, 0.5, 100)) as ArrayRef]; + + c.bench_function("row_batch 4096 string(100, 0.5)", |b| { + b.iter(|| { + let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); + black_box(converter.convert_columns(&cols)) + }); + }); + + let cols = [ + Arc::new(create_string_array_with_len::(4096, 0.5, 20)) as ArrayRef, + 
Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef, + Arc::new(create_string_array_with_len::(4096, 0., 100)) as ArrayRef, + Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef, + ]; + + let fields = [ + SortField::new(DataType::Utf8), + SortField::new(DataType::Utf8), + SortField::new(DataType::Utf8), + SortField::new(DataType::Int64), + ]; + + c.bench_function( + "row_batch 4096 string(20, 0.5), string(30, 0), string(100, 0), i64(0)", + |b| { + b.iter(|| { + let mut converter = RowConverter::new(fields.to_vec()); + black_box(converter.convert_columns(&cols)) + }); + }, + ); +} + +criterion_group!(benches, row_bench); +criterion_main!(benches); diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 7a2d47786af0..e4eb35279064 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -1071,13 +1071,13 @@ type LexicographicalCompareItem<'a> = ( /// A lexicographical comparator that wraps given array data (columns) and can lexicographically compare data /// at given two indices. The lifetime is the same at the data wrapped. -pub(super) struct LexicographicalComparator<'a> { +pub(crate) struct LexicographicalComparator<'a> { compare_items: Vec>, } impl LexicographicalComparator<'_> { /// lexicographically compare values at the wrapped columns with given indices. - pub(super) fn compare<'a, 'b>( + pub(crate) fn compare<'a, 'b>( &'a self, a_idx: &'b usize, b_idx: &'b usize, @@ -1121,7 +1121,7 @@ impl LexicographicalComparator<'_> { /// Create a new lex comparator that will wrap the given sort columns and give comparison /// results with two indices. - pub(super) fn try_new( + pub(crate) fn try_new( columns: &[SortColumn], ) -> Result> { let compare_items = columns diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index d1fb0cae0da2..87a4799e3e2a 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -269,6 +269,7 @@ pub mod json; #[cfg(feature = "pyarrow")] pub mod pyarrow; pub mod record_batch; +pub mod row; pub mod temporal_conversions; pub mod tensor; pub mod util; diff --git a/arrow/src/row/fixed.rs b/arrow/src/row/fixed.rs new file mode 100644 index 000000000000..78108274241b --- /dev/null +++ b/arrow/src/row/fixed.rs @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
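Outside of the benchmark above, the new `arrow::row` module is driven through `RowConverter` and `SortField`. A minimal sketch of the intended flow; the `Rows::row` accessor and its ordering-based comparison at the end are assumptions about this patch's row API rather than something shown in the hunks here:

```rust
use std::sync::Arc;
use arrow::array::{ArrayRef, Int64Array, StringArray};
use arrow::datatypes::DataType;
use arrow::row::{RowConverter, SortField};

fn main() {
    // Two columns, producing one comparable byte-string row per logical row.
    let cols: Vec<ArrayRef> = vec![
        Arc::new(StringArray::from(vec!["b", "a", "b"])),
        Arc::new(Int64Array::from(vec![2_i64, 1, 3])),
    ];
    let mut converter = RowConverter::new(vec![
        SortField::new(DataType::Utf8),
        SortField::new(DataType::Int64),
    ]);
    let rows = converter.convert_columns(&cols).unwrap();

    // Byte-wise comparison of rows matches a lexicographical sort on the columns.
    assert!(rows.row(1) < rows.row(0)); // ("a", 1) < ("b", 2)
    assert!(rows.row(0) < rows.row(2)); // ("b", 2) < ("b", 3)
}
```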
+ +use crate::array::PrimitiveArray; +use crate::compute::SortOptions; +use crate::datatypes::ArrowPrimitiveType; +use crate::row::Rows; +use crate::util::decimal::{Decimal128, Decimal256}; +use half::f16; + +/// Encodes a value of a particular fixed width type into bytes according to the rules +/// described on [`super::RowConverter`] +pub trait FixedLengthEncoding: Copy { + const ENCODED_LEN: usize = 1 + N; + + fn encode(self) -> [u8; N]; +} + +impl FixedLengthEncoding<1> for bool { + fn encode(self) -> [u8; 1] { + [self as u8] + } +} + +macro_rules! encode_signed { + ($n:expr, $t:ty) => { + impl FixedLengthEncoding<$n> for $t { + fn encode(self) -> [u8; $n] { + let mut b = self.to_be_bytes(); + // Toggle top "sign" bit to ensure consistent sort order + b[0] ^= 0x80; + b + } + } + }; +} + +encode_signed!(1, i8); +encode_signed!(2, i16); +encode_signed!(4, i32); +encode_signed!(8, i64); +encode_signed!(16, i128); + +macro_rules! encode_unsigned { + ($n:expr, $t:ty) => { + impl FixedLengthEncoding<$n> for $t { + fn encode(self) -> [u8; $n] { + self.to_be_bytes() + } + } + }; +} + +encode_unsigned!(1, u8); +encode_unsigned!(2, u16); +encode_unsigned!(4, u32); +encode_unsigned!(8, u64); + +impl FixedLengthEncoding<2> for f16 { + fn encode(self) -> [u8; 2] { + // https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260 + let s = self.to_bits() as i16; + let val = s ^ (((s >> 15) as u16) >> 1) as i16; + val.encode() + } +} + +impl FixedLengthEncoding<4> for f32 { + fn encode(self) -> [u8; 4] { + // https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260 + let s = self.to_bits() as i32; + let val = s ^ (((s >> 31) as u32) >> 1) as i32; + val.encode() + } +} + +impl FixedLengthEncoding<8> for f64 { + fn encode(self) -> [u8; 8] { + // https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260 + let s = self.to_bits() as i64; + let val = s ^ (((s >> 63) as u64) >> 1) as i64; + val.encode() + } +} + +impl FixedLengthEncoding<16> for Decimal128 { + fn encode(self) -> [u8; 16] { + let mut val = *self.raw_value(); + // Convert to big endian representation + val.reverse(); + // Toggle top "sign" bit to ensure consistent sort order + val[0] ^= 0x80; + val + } +} + +impl FixedLengthEncoding<32> for Decimal256 { + fn encode(self) -> [u8; 32] { + let mut val = *self.raw_value(); + // Convert to big endian representation + val.reverse(); + // Toggle top "sign" bit to ensure consistent sort order + val[0] ^= 0x80; + val + } +} + +/// Returns the total encoded length (including null byte) for a value of type `T::Native` +pub const fn encoded_len(_col: &PrimitiveArray) -> usize +where + T: ArrowPrimitiveType, + T::Native: FixedLengthEncoding, +{ + T::Native::ENCODED_LEN +} + +/// Fixed width types are encoded as +/// +/// - 1 byte `0` if null or `1` if valid +/// - bytes of [`FixedLengthEncoding`] +pub fn encode< + const N: usize, + T: FixedLengthEncoding, + I: IntoIterator>, +>( + out: &mut Rows, + i: I, + opts: SortOptions, +) { + for (offset, maybe_val) in out.offsets.iter_mut().skip(1).zip(i) { + let end_offset = *offset + N + 1; + if let Some(val) = maybe_val { + let to_write = &mut out.buffer[*offset..end_offset]; + to_write[0] = 1; + let mut encoded = val.encode(); + if opts.descending { + // Flip bits to reverse order + encoded.iter_mut().for_each(|v| *v = !*v) + } + to_write[1..].copy_from_slice(&encoded) + } else if 
!opts.nulls_first { + out.buffer[*offset] = 0xFF; + } + *offset = end_offset; + } +} diff --git a/arrow/src/row/interner.rs b/arrow/src/row/interner.rs new file mode 100644 index 000000000000..77edb97e8d1f --- /dev/null +++ b/arrow/src/row/interner.rs @@ -0,0 +1,451 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use hashbrown::hash_map::RawEntryMut; +use hashbrown::HashMap; +use std::cmp::Ordering; +use std::num::NonZeroU32; +use std::ops::Index; + +/// An interned value +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct Interned(NonZeroU32); // We use NonZeroU32 so that `Option` is 32 bits + +/// A byte array interner that generates normalized keys that are sorted with respect +/// to the interned values, e.g. `inter(a) < intern(b) => a < b` +#[derive(Debug, Default)] +pub struct OrderPreservingInterner { + /// Provides a lookup from [`Interned`] to the normalized key + keys: InternBuffer, + /// Provides a lookup from [`Interned`] to the normalized value + values: InternBuffer, + /// Key allocation data structure + bucket: Box, + + // A hash table used to perform faster re-keying, and detect duplicates + hasher: ahash::RandomState, + lookup: HashMap, +} + +impl OrderPreservingInterner { + /// Interns an iterator of values returning a list of [`Interned`] which can be + /// used with [`Self::normalized_key`] to retrieve the normalized keys with a + /// lifetime not tied to the mutable borrow passed to this method + pub fn intern(&mut self, input: I) -> Vec> + where + I: IntoIterator>, + V: AsRef<[u8]>, + { + let iter = input.into_iter(); + let capacity = iter.size_hint().0; + let mut out = Vec::with_capacity(capacity); + + // (index in output, hash value, value) + let mut to_intern: Vec<(usize, u64, V)> = Vec::with_capacity(capacity); + let mut to_intern_len = 0; + + for (idx, item) in iter.enumerate() { + let value: V = match item { + Some(value) => value, + None => { + out.push(None); + continue; + } + }; + + let v = value.as_ref(); + let hash = self.hasher.hash_one(v); + let entry = self + .lookup + .raw_entry_mut() + .from_hash(hash, |a| &self.values[*a] == v); + + match entry { + RawEntryMut::Occupied(o) => out.push(Some(*o.key())), + RawEntryMut::Vacant(_) => { + // Push placeholder + out.push(None); + to_intern_len += v.len(); + to_intern.push((idx, hash, value)); + } + }; + } + + to_intern.sort_unstable_by(|(_, _, a), (_, _, b)| a.as_ref().cmp(b.as_ref())); + + self.keys.offsets.reserve(to_intern.len()); + self.keys.values.reserve(to_intern.len()); // Approximation + self.values.offsets.reserve(to_intern.len()); + self.values.values.reserve(to_intern_len); + + for (idx, hash, value) in to_intern { + let val = value.as_ref(); + + let entry = self + .lookup + .raw_entry_mut() + .from_hash(hash, |a| &self.values[*a] == 
val); + + match entry { + RawEntryMut::Occupied(o) => { + out[idx] = Some(*o.key()); + } + RawEntryMut::Vacant(v) => { + let val = value.as_ref(); + self.bucket + .insert(&mut self.values, val, &mut self.keys.values); + self.keys.values.push(0); + let interned = self.keys.append(); + + let hasher = &mut self.hasher; + let values = &self.values; + v.insert_with_hasher(hash, interned, (), |key| { + hasher.hash_one(&values[*key]) + }); + out[idx] = Some(interned); + } + } + } + + out + } + + /// Returns a null-terminated byte array that can be compared against other normalized_key + /// returned by this instance, to establish ordering of the interned values + pub fn normalized_key(&self, key: Interned) -> &[u8] { + &self.keys[key] + } +} + +/// A buffer of `[u8]` indexed by `[Interned]` +#[derive(Debug)] +struct InternBuffer { + /// Raw values + values: Vec, + /// The ith value is `&values[offsets[i]..offsets[i+1]]` + offsets: Vec, +} + +impl Default for InternBuffer { + fn default() -> Self { + Self { + values: Default::default(), + offsets: vec![0], + } + } +} + +impl InternBuffer { + /// Insert `data` returning the corresponding [`Interned`] + fn insert(&mut self, data: &[u8]) -> Interned { + self.values.extend_from_slice(data); + self.append() + } + + /// Appends the next value based on data written to `self.values` + /// returning the corresponding [`Interned`] + fn append(&mut self) -> Interned { + let idx: u32 = self.offsets.len().try_into().unwrap(); + let key = Interned(NonZeroU32::new(idx).unwrap()); + self.offsets.push(self.values.len()); + key + } +} + +impl Index for InternBuffer { + type Output = [u8]; + + fn index(&self, key: Interned) -> &Self::Output { + let index = key.0.get() as usize; + let end = self.offsets[index]; + let start = self.offsets[index - 1]; + // SAFETY: + // self.values is never reduced in size and values appended + // to self.offsets are always less than self.values at the time + unsafe { self.values.get_unchecked(start..end) } + } +} + +/// A slot corresponds to a single byte-value in the generated normalized key +/// +/// It may contain a value, if not the first slot, and may contain a child [`Bucket`] representing +/// the next byte in the generated normalized key +#[derive(Debug, Default, Clone)] +struct Slot { + value: Option, + /// Child values less than `self.value` if any + child: Option>, +} + +/// Bucket is the root of the data-structure used to allocate normalized keys +/// +/// In particular it needs to generate keys that +/// +/// * Contain no `0` bytes other than the null terminator +/// * Compare lexicographically in the same manner as the encoded `data` +/// +/// The data structure consists of 255 slots, each of which can store a value. +/// Additionally each slot may contain a child bucket, containing values smaller +/// than the value within the slot +/// +/// # Allocation Strategy +/// +/// To find the insertion point within a Bucket we perform a binary search of the slots, but +/// capping the search range at 4. Visualizing this as a search tree, the root would have 64 +/// children, with subsequent non-leaf nodes each containing two children. +/// +/// The insertion point is the first empty slot we encounter, otherwise it is the first slot +/// that contains a value greater than the value being inserted +/// +/// For example, initially all slots are empty +/// +/// ```ignore +/// 0: +/// 1: +/// . +/// . +/// 254: +/// ``` +/// +/// Insert `1000` +/// +/// ```ignore +/// 0: +/// 1: +/// 2: +/// 3: 1000 <- 1. 
slot is empty, insert here +/// 4: +/// . +/// . +/// 254: +/// ``` +/// +/// Insert `500` +/// +/// ```ignore +/// 0: +/// 1: 500 <- 2. slot is empty, insert here +/// 2: +/// 3: 1000 <- 1. compare against slot value +/// 4. +/// . +/// . +/// 254: +/// ``` +/// +/// Insert `600` +/// +/// ```ignore +/// 0: +/// 1: 500 <- 2. compare against slot value +/// 2: 600 <- 3. slot is empty, insert here +/// 3: 1000 <- 1. compare against slot value +/// 4. +/// . +/// . +/// 254: +/// ``` +/// +/// Insert `400` +/// +/// ```ignore +/// 0: 400 <- 3. slot is empty, insert here +/// 1: 500 <- 2. compare against slot value +/// 2: 600 +/// 3: 1000 <- 1. compare against slot value +/// 4. +/// . +/// . +/// 254: +/// ``` +/// +/// Insert `700` +/// +/// ```ignore +/// 0: 400 +/// 1: 500 <- 2. compare against slot value +/// 2: 600 <- 3. slot is occupied and end of search +/// 3: 1000 <- 1. compare against slot value +/// 4. +/// . +/// . +/// 254: +/// ``` +/// +/// In this case we reach the end of our search and need to insert a value between +/// slots 2 and 3. To do this we create a new bucket under slot 3, and repeat +/// the process for that bucket. +/// +/// The final key will consists of the slot indexes visited incremented by 1, +/// with the final value incremented by 2, followed by a null terminator. +/// +/// So in the above example we would have +/// +/// ```ignore +/// 400: &[2, 0] +/// 500: &[3, 0] +/// 600: &[4, 0] +/// 700: &[4, 5, 0] +/// 1000: &[5, 0] +/// ``` +/// +#[derive(Debug, Clone)] +struct Bucket { + slots: Box<[Slot]>, +} + +impl Default for Bucket { + fn default() -> Self { + let slots = (0..255).map(|_| Slot::default()).collect::>().into(); + Self { slots } + } +} + +impl Bucket { + /// Perform a skewed binary search to find the first slot that is empty or less + /// + /// Returns `Ok(idx)` if an exact match is found, otherwise returns `Err(idx)` + /// containing the slot index to insert at + fn insert_pos(&self, values_buf: &InternBuffer, data: &[u8]) -> Result { + let mut size = self.slots.len() - 1; + let mut left = 0; + let mut right = size; + while left < right { + // Skew binary search to leave gaps of at most 3 elements + let mid = left + (size / 2).min(3); + + let slot = &self.slots[mid]; + let val = match slot.value { + Some(val) => val, + None => return Err(mid), + }; + + let cmp = values_buf[val].cmp(data); + if cmp == Ordering::Less { + left = mid + 1; + } else if cmp == Ordering::Greater { + right = mid; + } else { + return Ok(mid); + } + + size = right - left; + } + Err(left) + } + + /// Insert `data` into this bucket or one of its children, appending the + /// normalized key to `out` as it is constructed + /// + /// # Panics + /// + /// Panics if the value already exists + fn insert(&mut self, values_buf: &mut InternBuffer, data: &[u8], out: &mut Vec) { + match self.insert_pos(values_buf, data) { + Ok(_) => unreachable!("value already exists"), + Err(idx) => { + let slot = &mut self.slots[idx]; + // Cannot insert a value into slot 254 as would overflow byte, but also + // would prevent inserting any larger values, as the child bucket can + // only contain values less than the slot + if idx != 254 && slot.value.is_none() { + out.push(idx as u8 + 2); + slot.value = Some(values_buf.insert(data)) + } else { + out.push(idx as u8 + 1); + slot.child + .get_or_insert_with(Default::default) + .insert(values_buf, data, out); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::prelude::*; + + // Clippy isn't smart enough to understand dropping 
mutability + #[allow(clippy::needless_collect)] + fn test_intern_values(values: &[u64]) { + let mut interner = OrderPreservingInterner::default(); + + // Intern a single value at a time to check ordering + let interned: Vec<_> = values + .iter() + .flat_map(|v| interner.intern([Some(&v.to_be_bytes())])) + .map(Option::unwrap) + .collect(); + + let interned: Vec<_> = interned + .into_iter() + .map(|x| interner.normalized_key(x)) + .collect(); + + for (i, a) in interned.iter().enumerate() { + for (j, b) in interned.iter().enumerate() { + let interned_cmp = a.cmp(b); + let values_cmp = values[i].cmp(&values[j]); + assert_eq!( + interned_cmp, values_cmp, + "({:?} vs {:?}) vs ({} vs {})", + a, b, values[i], values[j] + ) + } + } + } + + #[test] + #[cfg_attr(miri, ignore)] + fn test_interner() { + test_intern_values(&[8, 6, 5, 7]); + + let mut values: Vec<_> = (0_u64..2000).collect(); + test_intern_values(&values); + + let mut rng = thread_rng(); + values.shuffle(&mut rng); + test_intern_values(&values); + } + + #[test] + fn test_intern_duplicates() { + // Unsorted with duplicates + let values = vec![0_u8, 1, 8, 4, 1, 0]; + let mut interner = OrderPreservingInterner::default(); + + let interned = interner.intern(values.iter().map(std::slice::from_ref).map(Some)); + let interned: Vec<_> = interned.into_iter().map(Option::unwrap).collect(); + + assert_eq!(interned[0], interned[5]); + assert_eq!(interned[1], interned[4]); + assert!( + interner.normalized_key(interned[0]) < interner.normalized_key(interned[1]) + ); + assert!( + interner.normalized_key(interned[1]) < interner.normalized_key(interned[2]) + ); + assert!( + interner.normalized_key(interned[1]) < interner.normalized_key(interned[3]) + ); + assert!( + interner.normalized_key(interned[3]) < interner.normalized_key(interned[2]) + ); + } +} diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs new file mode 100644 index 000000000000..88c8a9166631 --- /dev/null +++ b/arrow/src/row/mod.rs @@ -0,0 +1,893 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! A comparable row-oriented representation of a collection of [`Array`] + +use crate::array::{ + as_boolean_array, as_generic_binary_array, as_largestring_array, as_string_array, + Array, ArrayRef, Decimal128Array, Decimal256Array, +}; +use crate::compute::SortOptions; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; +use crate::row::interner::{Interned, OrderPreservingInterner}; +use crate::util::decimal::{Decimal128, Decimal256}; +use crate::{downcast_dictionary_array, downcast_primitive_array}; + +mod fixed; +mod interner; +mod variable; + +/// Converts [`ArrayRef`] columns into a row-oriented format that are [normalized for sorting]. +/// +/// In particular, a byte-wise comparison of the rows, e.g. 
[`memcmp`], is sufficient +/// to establish the ordering of two rows, allowing for extremely fast comparisons, +/// and permitting the use of [non-comparison sorts] such as [radix sort] +/// +/// Comparing [`Rows`] generated by different [`RowConverter`] is not guaranteed to +/// yield a meaningful ordering +/// +/// # Format +/// +/// The encoding of the row format should not be considered stable, but is documented here +/// for reference. +/// +/// ## Unsigned Integer Encoding +/// +/// A null integer is encoded as a `0_u8`, followed by a zero-ed number of bytes corresponding +/// to the integer's length +/// +/// A valid integer is encoded as `1_u8`, followed by the big-endian representation of the +/// integer +/// +/// ## Signed Integer Encoding +/// +/// Signed integers have their most significant sign bit flipped, and are then encoded in the +/// same manner as an unsigned integer +/// +/// ## Float Encoding +/// +/// Floats are converted from IEEE 754 representation to a signed integer representation +/// by flipping all bar the sign bit if they are negative. +/// +/// They are then encoded in the same manner as a signed integer +/// +/// ## Variable Length Bytes Encoding +/// +/// A null is encoded as a `0_u8` +/// +/// An empty byte array is encoded as `1_u8` +/// +/// A non-null, non-empty byte array is encoded as `2_u8` followed by the byte array +/// encoded using a block based scheme described below. +/// +/// The byte array is broken up into 32-byte blocks, each block is written in turn +/// to the output, followed by `0xFF_u8`. The final block is padded to 32-bytes +/// with `0_u8` and written to the output, followed by the un-padded length in bytes +/// of this final block as a `u8` +/// +/// This is loosely inspired by [COBS] encoding, and chosen over more traditional +/// [byte stuffing] as it is more amenable to vectorisation, in particular AVX-256. +/// +/// ## Dictionary Encoding +/// +/// [`RowConverter`] needs to support converting dictionary encoded arrays with unsorted, and +/// potentially distinct dictionaries. One simple mechanism to avoid this would be to reverse +/// the dictionary encoding, and encode the array values directly, however, this would lose +/// the benefits of dictionary encoding to reduce memory and CPU consumption. +/// +/// As such the [`RowConverter`] maintains an order-preserving dictionary encoding for each +/// dictionary encoded column. As this is a variable-length encoding, new dictionary values +/// can be added whilst preserving the sort order. +/// +/// A null dictionary value is encoded as `0_u8`. +/// +/// A non-null dictionary value is encoded as `1_u8` followed by a null-terminated byte array +/// key determined by the order-preserving dictionary encoding +/// +/// # Ordering +/// +/// ## Float Ordering +/// +/// Floats are totally ordered in accordance to the `totalOrder` predicate as defined +/// in the IEEE 754 (2008 revision) floating point standard. +/// +/// The ordering established by this does not always agree with the +/// [`PartialOrd`] and [`PartialEq`] implementations of `f32`. 
For example, +/// they consider negative and positive zero equal, while this does not +/// +/// ## Null Ordering +/// +/// The encoding described above will order nulls first, this can be inverted by representing +/// nulls as `0xFF_u8` instead of `0_u8` +/// +/// ## Reverse Column Ordering +/// +/// The order of a given column can be reversed by negating the encoded bytes of non-null values +/// +/// ## Reconstruction +/// +/// Given a schema it would theoretically be possible to reconstruct the columnar data from +/// the row format, however, this is currently not implemented. It is recommended that the row +/// format is instead used to obtain a sorted list of row indices, which can then be used +/// with [`take`](crate::compute::take) to obtain a sorted [`Array`] +/// +/// [non-comparison sorts]:[https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts] +/// [radix sort]:[https://en.wikipedia.org/wiki/Radix_sort] +/// [normalized for sorting]:[https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf] +/// [`memcmp`]:[https://www.man7.org/linux/man-pages/man3/memcmp.3.html] +/// [COBS]:[https://en.wikipedia.org/wiki/Consistent_Overhead_Byte_Stuffing] +/// [byte stuffing]:[https://en.wikipedia.org/wiki/High-Level_Data_Link_Control#Asynchronous_framing] +#[derive(Debug)] +pub struct RowConverter { + fields: Vec, + /// interning state for column `i`, if column`i` is a dictionary + interners: Vec>>, +} + +/// Configure the data type and sort order for a given column +#[derive(Debug, Clone)] +pub struct SortField { + /// Sort options + options: SortOptions, + /// Data type + data_type: DataType, +} + +impl SortField { + /// Create a new column with the given data type + pub fn new(data_type: DataType) -> Self { + Self::new_with_options(data_type, Default::default()) + } + + /// Create a new column with the given data type and [`SortOptions`] + pub fn new_with_options(data_type: DataType, options: SortOptions) -> Self { + Self { options, data_type } + } +} + +impl RowConverter { + /// Create a new [`RowConverter`] with the provided schema + pub fn new(fields: Vec) -> Self { + let interners = (0..fields.len()).map(|_| None).collect(); + Self { fields, interners } + } + + /// Convert [`ArrayRef`] columns into [`Rows`] + /// + /// See [`Row`] for information on when [`Row`] can be compared + /// + /// # Panics + /// + /// Panics if the schema of `columns` does not match that provided to [`RowConverter::new`] + pub fn convert_columns(&mut self, columns: &[ArrayRef]) -> Result { + if columns.len() != self.fields.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect number of arrays provided to RowConverter, expected {} got {}", + self.fields.len(), + columns.len() + ))); + } + + let dictionaries = columns + .iter() + .zip(&mut self.interners) + .zip(&self.fields) + .map(|((column, interner), field)| { + if !column.data_type().equals_datatype(&field.data_type) { + return Err(ArrowError::InvalidArgumentError(format!( + "RowConverter column schema mismatch, expected {} got {}", + field.data_type, + column.data_type() + ))); + } + + let values = downcast_dictionary_array! { + column => column.values(), + _ => return Ok(None) + }; + + let interner = interner.get_or_insert_with(Default::default); + + let mapping: Vec<_> = compute_dictionary_mapping(interner, values)? 
+ .into_iter() + .map(|maybe_interned| { + maybe_interned.map(|interned| interner.normalized_key(interned)) + }) + .collect(); + + Ok(Some(mapping)) + }) + .collect::>>()?; + + let mut rows = new_empty_rows(columns, &dictionaries)?; + + for ((column, field), dictionary) in + columns.iter().zip(&self.fields).zip(dictionaries) + { + // We encode a column at a time to minimise dispatch overheads + encode_column(&mut rows, column, field.options, dictionary.as_deref()) + } + + if cfg!(debug_assertions) { + assert_eq!(*rows.offsets.last().unwrap(), rows.buffer.len()); + rows.offsets + .windows(2) + .for_each(|w| assert!(w[0] < w[1], "offsets should be monotonic")); + } + + Ok(rows) + } +} + +/// A row-oriented representation of arrow data, that is normalized for comparison +/// +/// See [`RowConverter`] +#[derive(Debug)] +pub struct Rows { + /// Underlying row bytes + buffer: Box<[u8]>, + /// Row `i` has data `&buffer[offsets[i]..offsets[i+1]]` + offsets: Box<[usize]>, +} + +impl Rows { + pub fn row(&self, row: usize) -> Row<'_> { + let end = self.offsets[row + 1]; + let start = self.offsets[row]; + Row(&self.buffer[start..end]) + } + + pub fn num_rows(&self) -> usize { + self.offsets.len() - 1 + } +} + +/// A comparable representation of a row +/// +/// Two [`Row`] can be compared if they both belong to [`Rows`] returned by calls to +/// [`RowConverter::convert_columns`] on the same [`RowConverter`] +/// +/// Otherwise any ordering established by comparing the [`Row`] is arbitrary +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Row<'a>(&'a [u8]); + +impl<'a> AsRef<[u8]> for Row<'a> { + fn as_ref(&self) -> &[u8] { + self.0 + } +} + +/// Computes the dictionary mapping for the given dictionary values +fn compute_dictionary_mapping( + interner: &mut OrderPreservingInterner, + values: &ArrayRef, +) -> Result>> { + use fixed::FixedLengthEncoding; + Ok(downcast_primitive_array! { + values => interner + .intern(values.iter().map(|x| x.map(|x| x.encode()))), + DataType::Binary => { + let iter = as_generic_binary_array::(values).iter(); + interner.intern(iter) + } + DataType::LargeBinary => { + let iter = as_generic_binary_array::(values).iter(); + interner.intern(iter) + } + DataType::Utf8 => { + let iter = as_string_array(values).iter().map(|x| x.map(|x| x.as_bytes())); + interner.intern(iter) + } + DataType::LargeUtf8 => { + let iter = as_largestring_array(values).iter().map(|x| x.map(|x| x.as_bytes())); + interner.intern(iter) + } + t => return Err(ArrowError::NotYetImplemented(format!("dictionary value {} is not supported", t))), + }) +} + +/// Computes the length of each encoded [`Rows`] and returns an empty [`Rows`] +fn new_empty_rows( + cols: &[ArrayRef], + dictionaries: &[Option>>], +) -> Result { + use fixed::FixedLengthEncoding; + + let num_rows = cols.first().map(|x| x.len()).unwrap_or(0); + let mut lengths = vec![0; num_rows]; + + for (array, dict) in cols.iter().zip(dictionaries) { + downcast_primitive_array! 
{ + array => lengths.iter_mut().for_each(|x| *x += fixed::encoded_len(array)), + DataType::Null => lengths.iter_mut().for_each(|x| *x += 1), + DataType::Boolean => lengths.iter_mut().for_each(|x| *x += bool::ENCODED_LEN), + DataType::Decimal128(_, _) => lengths.iter_mut().for_each(|x| *x += Decimal128::ENCODED_LEN), + DataType::Decimal256(_, _) => lengths.iter_mut().for_each(|x| *x += Decimal256::ENCODED_LEN), + DataType::Binary => as_generic_binary_array::(array) + .iter() + .zip(lengths.iter_mut()) + .for_each(|(slice, length)| *length += variable::encoded_len(slice)), + DataType::LargeBinary => as_generic_binary_array::(array) + .iter() + .zip(lengths.iter_mut()) + .for_each(|(slice, length)| *length += variable::encoded_len(slice)), + DataType::Utf8 => as_string_array(array) + .iter() + .zip(lengths.iter_mut()) + .for_each(|(slice, length)| { + *length += variable::encoded_len(slice.map(|x| x.as_bytes())) + }), + DataType::LargeUtf8 => as_largestring_array(array) + .iter() + .zip(lengths.iter_mut()) + .for_each(|(slice, length)| { + *length += variable::encoded_len(slice.map(|x| x.as_bytes())) + }), + DataType::Dictionary(_, _) => downcast_dictionary_array! { + array => { + let dict = dict.as_ref().unwrap(); + for (v, length) in array.keys().iter().zip(lengths.iter_mut()) { + match v.and_then(|v| dict[v as usize]) { + Some(k) => *length += k.len() + 1, + None => *length += 1, + } + } + } + _ => unreachable!(), + } + t => return Err(ArrowError::NotYetImplemented(format!("not yet implemented: {}", t))) + } + } + + let mut offsets = Vec::with_capacity(num_rows + 1); + offsets.push(0); + + // We initialize the offsets shifted down by one row index. + // + // As the rows are appended to the offsets will be incremented to match + // + // For example, consider the case of 3 rows of length 3, 4, and 6 respectively. + // The offsets would be initialized to `0, 0, 3, 7` + // + // Writing the first row entirely would yield `0, 3, 3, 7` + // The second, `0, 3, 7, 7` + // The third, `0, 3, 7, 13` + // + // This would be the final offsets for reading + // + // In this way offsets tracks the position during writing whilst eventually serving + // as identifying the offsets of the written rows + let mut cur_offset = 0_usize; + for l in lengths { + offsets.push(cur_offset); + cur_offset = cur_offset.checked_add(l).expect("overflow"); + } + + let buffer = vec![0_u8; cur_offset]; + + Ok(Rows { + buffer: buffer.into(), + offsets: offsets.into(), + }) +} + +/// Encodes a column to the provided [`Rows`] incrementing the offsets as it progresses +fn encode_column( + out: &mut Rows, + column: &ArrayRef, + opts: SortOptions, + dictionary: Option<&[Option<&[u8]>]>, +) { + downcast_primitive_array! 
{ + column => fixed::encode(out, column, opts), + DataType::Null => { + fixed::encode(out, std::iter::repeat(None::).take(column.len()), opts) + } + DataType::Boolean => fixed::encode(out, as_boolean_array(column), opts), + DataType::Decimal128(_, _) => fixed::encode( + out, + column.as_any().downcast_ref::().unwrap(), + opts, + ), + DataType::Decimal256(_, _) => fixed::encode( + out, + column.as_any().downcast_ref::().unwrap(), + opts, + ), + DataType::Binary => { + variable::encode(out, as_generic_binary_array::(column).iter(), opts) + } + DataType::LargeBinary => { + variable::encode(out, as_generic_binary_array::(column).iter(), opts) + } + DataType::Utf8 => variable::encode( + out, + as_string_array(column).iter().map(|x| x.map(|x| x.as_bytes())), + opts, + ), + DataType::LargeUtf8 => variable::encode( + out, + as_largestring_array(column) + .iter() + .map(|x| x.map(|x| x.as_bytes())), + opts, + ), + DataType::Dictionary(_, _) => downcast_dictionary_array! { + column => { + let dict = dictionary.unwrap(); + for (offset, k) in out.offsets.iter_mut().skip(1).zip(column.keys()) { + match k.and_then(|k| dict[k as usize]) { + Some(v) => { + let end_offset = *offset + 1 + v.len(); + out.buffer[*offset] = 1; + out.buffer[*offset+1..end_offset].copy_from_slice(v); + if opts.descending { + out.buffer[*offset..end_offset].iter_mut().for_each(|v| *v = !*v) + } + *offset = end_offset; + } + None => { + if !opts.nulls_first { + out.buffer[*offset] = 0xFF; + } + *offset += 1; + } + } + } + }, + _ => unreachable!() + } + t => unimplemented!("not yet implemented: {}", t) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::array::{ + BinaryArray, BooleanArray, DictionaryArray, Float32Array, GenericStringArray, + Int16Array, Int32Array, OffsetSizeTrait, PrimitiveArray, + PrimitiveDictionaryBuilder, StringArray, + }; + use crate::compute::{LexicographicalComparator, SortColumn}; + use crate::util::display::array_value_to_string; + use rand::distributions::uniform::SampleUniform; + use rand::distributions::{Distribution, Standard}; + use rand::{thread_rng, Rng}; + use std::sync::Arc; + + #[test] + fn test_fixed_width() { + let cols = [ + Arc::new(Int16Array::from_iter([ + Some(1), + Some(2), + None, + Some(-5), + Some(2), + Some(2), + Some(0), + ])) as ArrayRef, + Arc::new(Float32Array::from_iter([ + Some(1.3), + Some(2.5), + None, + Some(4.), + Some(0.1), + Some(-4.), + Some(-0.), + ])) as ArrayRef, + ]; + + let mut converter = RowConverter::new(vec![ + SortField::new(DataType::Int16), + SortField::new(DataType::Float32), + ]); + let rows = converter.convert_columns(&cols).unwrap(); + + assert_eq!(rows.offsets.as_ref(), &[0, 8, 16, 24, 32, 40, 48, 56]); + assert_eq!( + rows.buffer.as_ref(), + &[ + 1, 128, 1, // + 1, 191, 166, 102, 102, // + 1, 128, 2, // + 1, 192, 32, 0, 0, // + 0, 0, 0, // + 0, 0, 0, 0, 0, // + 1, 127, 251, // + 1, 192, 128, 0, 0, // + 1, 128, 2, // + 1, 189, 204, 204, 205, // + 1, 128, 2, // + 1, 63, 127, 255, 255, // + 1, 128, 0, // + 1, 127, 255, 255, 255 // + ] + ); + + assert!(rows.row(3) < rows.row(6)); + assert!(rows.row(0) < rows.row(1)); + assert!(rows.row(3) < rows.row(0)); + assert!(rows.row(4) < rows.row(1)); + assert!(rows.row(5) < rows.row(4)) + } + + #[test] + fn test_bool() { + let mut converter = RowConverter::new(vec![SortField::new(DataType::Boolean)]); + + let col = Arc::new(BooleanArray::from_iter([None, Some(false), Some(true)])); + let rows = converter.convert_columns(&[col]).unwrap(); + assert!(rows.row(2) > rows.row(1)); + assert!(rows.row(2) > 
rows.row(0)); + assert!(rows.row(1) > rows.row(0)); + + let mut converter = RowConverter::new(vec![SortField::new_with_options( + DataType::Boolean, + SortOptions { + descending: true, + nulls_first: false, + }, + )]); + + let col = Arc::new(BooleanArray::from_iter([None, Some(false), Some(true)])); + let rows = converter.convert_columns(&[col]).unwrap(); + assert!(rows.row(2) < rows.row(1)); + assert!(rows.row(2) < rows.row(0)); + assert!(rows.row(1) < rows.row(0)); + } + + #[test] + fn test_variable_width() { + let col = Arc::new(StringArray::from_iter([ + Some("hello"), + Some("he"), + None, + Some("foo"), + Some(""), + ])); + + let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); + let rows = converter.convert_columns(&[col]).unwrap(); + + assert!(rows.row(1) < rows.row(0)); + assert!(rows.row(2) < rows.row(4)); + assert!(rows.row(3) < rows.row(0)); + assert!(rows.row(3) < rows.row(1)); + + let col = Arc::new(BinaryArray::from_iter([ + None, + Some(vec![0_u8; 0]), + Some(vec![0_u8; 6]), + Some(vec![0_u8; variable::BLOCK_SIZE]), + Some(vec![0_u8; variable::BLOCK_SIZE + 1]), + Some(vec![1_u8; 6]), + Some(vec![1_u8; variable::BLOCK_SIZE]), + Some(vec![1_u8; variable::BLOCK_SIZE + 1]), + Some(vec![0xFF_u8; 6]), + Some(vec![0xFF_u8; variable::BLOCK_SIZE]), + Some(vec![0xFF_u8; variable::BLOCK_SIZE + 1]), + ])) as ArrayRef; + + let mut converter = RowConverter::new(vec![SortField::new(DataType::Binary)]); + let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); + + for i in 0..rows.num_rows() { + for j in i + 1..rows.num_rows() { + assert!( + rows.row(i) < rows.row(j), + "{} < {} - {:?} < {:?}", + i, + j, + rows.row(i), + rows.row(j) + ); + } + } + + let mut converter = RowConverter::new(vec![SortField::new_with_options( + DataType::Binary, + SortOptions { + descending: true, + nulls_first: false, + }, + )]); + let rows = converter.convert_columns(&[col]).unwrap(); + + for i in 0..rows.num_rows() { + for j in i + 1..rows.num_rows() { + assert!( + rows.row(i) > rows.row(j), + "{} > {} - {:?} > {:?}", + i, + j, + rows.row(i), + rows.row(j) + ); + } + } + } + + #[test] + fn test_string_dictionary() { + let a = Arc::new(DictionaryArray::::from_iter([ + Some("foo"), + Some("hello"), + Some("he"), + None, + Some("hello"), + Some(""), + Some("hello"), + Some("hello"), + ])) as ArrayRef; + + let mut converter = + RowConverter::new(vec![SortField::new(a.data_type().clone())]); + let rows_a = converter.convert_columns(&[Arc::clone(&a)]).unwrap(); + + assert!(rows_a.row(3) < rows_a.row(5)); + assert!(rows_a.row(2) < rows_a.row(1)); + assert!(rows_a.row(0) < rows_a.row(1)); + assert!(rows_a.row(3) < rows_a.row(0)); + + assert_eq!(rows_a.row(1), rows_a.row(4)); + assert_eq!(rows_a.row(1), rows_a.row(6)); + assert_eq!(rows_a.row(1), rows_a.row(7)); + + let b = Arc::new(DictionaryArray::::from_iter([ + Some("hello"), + None, + Some("cupcakes"), + ])); + + let rows_b = converter.convert_columns(&[b]).unwrap(); + assert_eq!(rows_a.row(1), rows_b.row(0)); + assert_eq!(rows_a.row(3), rows_b.row(1)); + assert!(rows_b.row(2) < rows_a.row(0)); + + let mut converter = RowConverter::new(vec![SortField::new_with_options( + a.data_type().clone(), + SortOptions { + descending: true, + nulls_first: false, + }, + )]); + + let rows_c = converter.convert_columns(&[a]).unwrap(); + assert!(rows_c.row(3) > rows_c.row(5)); + assert!(rows_c.row(2) > rows_c.row(1)); + assert!(rows_c.row(0) > rows_c.row(1)); + assert!(rows_c.row(3) > rows_c.row(0)); + } + + #[test] + fn 
test_primitive_dictionary() { + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.append(2).unwrap(); + builder.append(3).unwrap(); + builder.append(0).unwrap(); + builder.append_null(); + builder.append(5).unwrap(); + builder.append(3).unwrap(); + builder.append(-1).unwrap(); + + let a = builder.finish(); + + let mut converter = + RowConverter::new(vec![SortField::new(a.data_type().clone())]); + let rows = converter.convert_columns(&[Arc::new(a)]).unwrap(); + assert!(rows.row(0) < rows.row(1)); + assert!(rows.row(2) < rows.row(0)); + assert!(rows.row(3) < rows.row(2)); + assert!(rows.row(6) < rows.row(2)); + assert!(rows.row(3) < rows.row(6)); + } + + #[test] + fn test_dictionary_nulls() { + let values = + Int32Array::from_iter([Some(1), Some(-1), None, Some(4), None]).into_data(); + let keys = + Int32Array::from_iter([Some(0), Some(0), Some(1), Some(2), Some(4), None]) + .into_data(); + + let data_type = + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Int32)); + let data = keys + .into_builder() + .data_type(data_type.clone()) + .child_data(vec![values]) + .build() + .unwrap(); + + let mut converter = RowConverter::new(vec![SortField::new(data_type)]); + let rows = converter + .convert_columns(&[Arc::new(DictionaryArray::::from(data))]) + .unwrap(); + + assert_eq!(rows.row(0), rows.row(1)); + assert_eq!(rows.row(3), rows.row(4)); + assert_eq!(rows.row(4), rows.row(5)); + assert!(rows.row(3) < rows.row(0)); + } + + fn generate_primitive_array(len: usize, valid_percent: f64) -> PrimitiveArray + where + K: ArrowPrimitiveType, + Standard: Distribution, + { + let mut rng = thread_rng(); + (0..len) + .map(|_| rng.gen_bool(valid_percent).then(|| rng.gen())) + .collect() + } + + fn generate_strings( + len: usize, + valid_percent: f64, + ) -> GenericStringArray { + let mut rng = thread_rng(); + (0..len) + .map(|_| { + rng.gen_bool(valid_percent).then(|| { + let len = rng.gen_range(0..100); + let bytes = (0..len).map(|_| rng.gen_range(0..128)).collect(); + String::from_utf8(bytes).unwrap() + }) + }) + .collect() + } + + fn generate_dictionary( + values: ArrayRef, + len: usize, + valid_percent: f64, + ) -> DictionaryArray + where + K: ArrowDictionaryKeyType, + K::Native: SampleUniform, + { + let mut rng = thread_rng(); + let min_key = K::Native::from_usize(0).unwrap(); + let max_key = K::Native::from_usize(values.len()).unwrap(); + let keys: PrimitiveArray = (0..len) + .map(|_| { + rng.gen_bool(valid_percent) + .then(|| rng.gen_range(min_key..max_key)) + }) + .collect(); + + let data_type = DataType::Dictionary( + Box::new(K::DATA_TYPE), + Box::new(values.data_type().clone()), + ); + + let data = keys + .into_data() + .into_builder() + .data_type(data_type) + .add_child_data(values.data().clone()) + .build() + .unwrap(); + + DictionaryArray::from(data) + } + + fn generate_column(len: usize) -> ArrayRef { + let mut rng = thread_rng(); + match rng.gen_range(0..9) { + 0 => Arc::new(generate_primitive_array::(len, 0.8)), + 1 => Arc::new(generate_primitive_array::(len, 0.8)), + 2 => Arc::new(generate_primitive_array::(len, 0.8)), + 3 => Arc::new(generate_primitive_array::(len, 0.8)), + 4 => Arc::new(generate_primitive_array::(len, 0.8)), + 5 => Arc::new(generate_primitive_array::(len, 0.8)), + 6 => Arc::new(generate_strings::(len, 0.8)), + 7 => Arc::new(generate_dictionary::( + // Cannot test dictionaries containing null values because of #2687 + Arc::new(generate_strings::(rng.gen_range(1..len), 1.0)), + len, + 0.8, + )), + 8 => Arc::new(generate_dictionary::( + // 
Cannot test dictionaries containing null values because of #2687 + Arc::new(generate_primitive_array::( + rng.gen_range(1..len), + 1.0, + )), + len, + 0.8, + )), + _ => unreachable!(), + } + } + + fn print_row(cols: &[SortColumn], row: usize) -> String { + let t: Vec<_> = cols + .iter() + .map(|x| array_value_to_string(&x.values, row).unwrap()) + .collect(); + t.join(",") + } + + fn print_col_types(cols: &[SortColumn]) -> String { + let t: Vec<_> = cols + .iter() + .map(|x| x.values.data_type().to_string()) + .collect(); + t.join(",") + } + + #[test] + #[cfg_attr(miri, ignore)] + fn fuzz_test() { + for _ in 0..100 { + let mut rng = thread_rng(); + let num_columns = rng.gen_range(1..5); + let len = rng.gen_range(5..100); + let arrays: Vec<_> = (0..num_columns).map(|_| generate_column(len)).collect(); + + let options: Vec<_> = (0..num_columns) + .map(|_| SortOptions { + descending: rng.gen_bool(0.5), + nulls_first: rng.gen_bool(0.5), + }) + .collect(); + + let sort_columns: Vec<_> = options + .iter() + .zip(&arrays) + .map(|(o, c)| SortColumn { + values: Arc::clone(c), + options: Some(*o), + }) + .collect(); + + let comparator = LexicographicalComparator::try_new(&sort_columns).unwrap(); + + let columns = options + .into_iter() + .zip(&arrays) + .map(|(o, a)| SortField::new_with_options(a.data_type().clone(), o)) + .collect(); + + let mut converter = RowConverter::new(columns); + let rows = converter.convert_columns(&arrays).unwrap(); + + for i in 0..len { + for j in 0..len { + let row_i = rows.row(i); + let row_j = rows.row(j); + let row_cmp = row_i.cmp(&row_j); + let lex_cmp = comparator.compare(&i, &j); + assert_eq!( + row_cmp, + lex_cmp, + "({:?} vs {:?}) vs ({:?} vs {:?}) for types {}", + print_row(&sort_columns, i), + print_row(&sort_columns, j), + row_i, + row_j, + print_col_types(&sort_columns) + ); + } + } + } + } +} diff --git a/arrow/src/row/variable.rs b/arrow/src/row/variable.rs new file mode 100644 index 000000000000..2213dad9e788 --- /dev/null +++ b/arrow/src/row/variable.rs @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
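As a concrete illustration of the row format API introduced in `arrow/src/row/mod.rs` above, the sketch below sorts a pair of columns by converting them to rows and ordering the row indices with a plain byte comparison. It is a minimal sketch, assuming the `row` module is publicly exported as `arrow::row`; the `RowConverter`, `SortField` and `Rows` usage follows the signatures shown in that diff, and the column values are made up for illustration.

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array, StringArray};
use arrow::datatypes::DataType;
use arrow::error::Result;
use arrow::row::{RowConverter, SortField};

fn main() -> Result<()> {
    // Two columns compared lexicographically: (Int32, Utf8)
    let cols: Vec<ArrayRef> = vec![
        Arc::new(Int32Array::from(vec![Some(3), Some(1), None])),
        Arc::new(StringArray::from(vec![Some("a"), Some("b"), Some("c")])),
    ];

    // One SortField per column, in the same order as `cols`
    let mut converter = RowConverter::new(vec![
        SortField::new(DataType::Int32),
        SortField::new(DataType::Utf8),
    ]);
    let rows = converter.convert_columns(&cols)?;

    // Row comparisons are byte comparisons, so they can drive an index sort directly
    let mut indices: Vec<usize> = (0..rows.num_rows()).collect();
    indices.sort_unstable_by(|&a, &b| rows.row(a).cmp(&rows.row(b)));

    // With the default SortOptions nulls order first, so the null in the first
    // column places the third input row ahead of the other two
    assert_eq!(indices, vec![2, 1, 0]);
    Ok(())
}
```

As the module documentation notes, the sorted indices would then typically be passed to `take` to materialise the sorted arrays, rather than reconstructing columns from the row bytes.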
+ +use crate::compute::SortOptions; +use crate::row::Rows; +use crate::util::bit_util::ceil; + +/// The block size of the variable length encoding +pub const BLOCK_SIZE: usize = 32; + +/// Returns the length of the encoded representation of a byte array, including the null byte +pub fn encoded_len(a: Option<&[u8]>) -> usize { + match a { + Some(a) => 1 + ceil(a.len(), BLOCK_SIZE) * (BLOCK_SIZE + 1), + None => 1, + } +} + +/// Variable length values are encoded as +/// +/// - single `0_u8` if null +/// - single `1_u8` if empty array +/// - `2_u8` if not empty, followed by one or more blocks +/// +/// where a block is encoded as +/// +/// - [`BLOCK_SIZE`] bytes of string data, padded with 0s +/// - `0xFF_u8` if this is not the last block for this string +/// - otherwise the length of the block as a `u8` +pub fn encode<'a, I: Iterator>>( + out: &mut Rows, + i: I, + opts: SortOptions, +) { + for (offset, maybe_val) in out.offsets.iter_mut().skip(1).zip(i) { + match maybe_val { + Some(val) if val.is_empty() => { + out.buffer[*offset] = match opts.descending { + true => !1, + false => 1, + }; + *offset += 1; + } + Some(val) => { + let block_count = ceil(val.len(), BLOCK_SIZE); + let end_offset = *offset + 1 + block_count * (BLOCK_SIZE + 1); + let to_write = &mut out.buffer[*offset..end_offset]; + + // Write `2_u8` to demarcate as non-empty, non-null string + to_write[0] = 2; + + let chunks = val.chunks_exact(BLOCK_SIZE); + let remainder = chunks.remainder(); + for (input, output) in chunks + .clone() + .zip(to_write[1..].chunks_exact_mut(BLOCK_SIZE + 1)) + { + let input: &[u8; BLOCK_SIZE] = input.try_into().unwrap(); + let out_block: &mut [u8; BLOCK_SIZE] = + (&mut output[..BLOCK_SIZE]).try_into().unwrap(); + + *out_block = *input; + + // Indicate that there are further blocks to follow + output[BLOCK_SIZE] = u8::MAX; + } + + if !remainder.is_empty() { + let start_offset = 1 + (block_count - 1) * (BLOCK_SIZE + 1); + to_write[start_offset..start_offset + remainder.len()] + .copy_from_slice(remainder); + *to_write.last_mut().unwrap() = remainder.len() as u8; + } else { + // We must overwrite the continuation marker written by the loop above + *to_write.last_mut().unwrap() = BLOCK_SIZE as u8; + } + + *offset = end_offset; + + if opts.descending { + // Invert bits + to_write.iter_mut().for_each(|v| *v = !*v) + } + } + None => { + if !opts.nulls_first { + out.buffer[*offset] = 0xFF; + } + *offset += 1; + } + } + } +} From 2f360e1ea4671ee170184eefcba4b85ac95b2327 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 10 Sep 2022 14:24:47 -0700 Subject: [PATCH 0033/1411] Sort indices of dictionary string values (#2698) * Refactor dictionary string sorting * Fix clippy --- arrow/benches/comparison_kernels.rs | 4 ++-- arrow/benches/filter_kernels.rs | 4 ++-- arrow/benches/sort_kernel.rs | 15 ++++++++++++ arrow/src/compute/kernels/sort.rs | 37 ++++++++++++++++++----------- arrow/src/util/bench_util.rs | 3 ++- 5 files changed, 44 insertions(+), 19 deletions(-) diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index 21d83e07eec3..4ad139b879fd 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -287,8 +287,8 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$")) }); - let dict_arr_a = create_string_dict_array::(size, 0.0); - let dict_arr_b = create_string_dict_array::(size, 0.0); + let dict_arr_a = create_string_dict_array::(size, 0.0, 4); + let dict_arr_b = 
create_string_dict_array::(size, 0.0, 4); c.bench_function("dict eq string", |b| { b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b)) diff --git a/arrow/benches/filter_kernels.rs b/arrow/benches/filter_kernels.rs index be6d9027a8db..bd6129946630 100644 --- a/arrow/benches/filter_kernels.rs +++ b/arrow/benches/filter_kernels.rs @@ -155,7 +155,7 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_built_filter(&sparse_filter, &data_array)) }); - let data_array = create_string_dict_array::(size, 0.0); + let data_array = create_string_dict_array::(size, 0.0, 4); c.bench_function("filter context string dictionary (kept 1/2)", |b| { b.iter(|| bench_built_filter(&filter, &data_array)) }); @@ -168,7 +168,7 @@ fn add_benchmark(c: &mut Criterion) { |b| b.iter(|| bench_built_filter(&sparse_filter, &data_array)), ); - let data_array = create_string_dict_array::(size, 0.5); + let data_array = create_string_dict_array::(size, 0.5, 4); c.bench_function("filter context string dictionary w NULLs (kept 1/2)", |b| { b.iter(|| bench_built_filter(&filter, &data_array)) }); diff --git a/arrow/benches/sort_kernel.rs b/arrow/benches/sort_kernel.rs index f9f5f24c15a6..c4c6819df097 100644 --- a/arrow/benches/sort_kernel.rs +++ b/arrow/benches/sort_kernel.rs @@ -24,6 +24,8 @@ use std::sync::Arc; extern crate arrow; use arrow::compute::kernels::sort::{lexsort, SortColumn}; +use arrow::compute::sort_to_indices; +use arrow::datatypes::Int32Type; use arrow::util::bench_util::*; use arrow::{array::*, datatypes::Float32Type}; @@ -55,6 +57,10 @@ fn bench_sort(array_a: &ArrayRef, array_b: &ArrayRef, limit: Option) { criterion::black_box(lexsort(&columns, limit).unwrap()); } +fn bench_sort_to_indices(array: &ArrayRef, limit: Option) { + criterion::black_box(sort_to_indices(array, None, limit).unwrap()); +} + fn add_benchmark(c: &mut Criterion) { let arr_a = create_f32_array(2u64.pow(10) as usize, false); let arr_b = create_f32_array(2u64.pow(10) as usize, false); @@ -92,6 +98,15 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_sort(&arr_a, &arr_b, None)) }); + let dict_arr = Arc::new(create_string_dict_array::( + 2u64.pow(12) as usize, + 0.0, + 1, + )) as ArrayRef; + c.bench_function("dict string 2^12", |b| { + b.iter(|| bench_sort_to_indices(&dict_arr, None)) + }); + // with limit { let arr_a = create_f32_array(2u64.pow(12) as usize, false); diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index e4eb35279064..34a321910c30 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -419,7 +419,18 @@ pub fn sort_to_indices( let value_indices_map = prepare_indices_map(&sorted_value_indices); sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp) }, - DataType::Utf8 => sort_string_dictionary::<_>(values, v, n, &options, limit), + DataType::Utf8 => { + let dict_values = values.values(); + let value_null_first = if options.descending { + !options.nulls_first + } else { + options.nulls_first + }; + let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); + let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; + let value_indices_map = prepare_indices_map(&sorted_value_indices); + sort_string_dictionary::<_>(values, &value_indices_map, v, n, &options, limit) + }, t => return Err(ArrowError::ComputeError(format!( "Unsupported dictionary value type {}", t ))), @@ -753,6 +764,7 @@ fn sort_string( /// Sort dictionary encoded strings fn sort_string_dictionary( values: 
&DictionaryArray, + value_indices_map: &HashMap, value_indices: Vec, null_indices: Vec, options: &SortOptions, @@ -760,20 +772,17 @@ fn sort_string_dictionary( ) -> UInt32Array { let keys: &PrimitiveArray = values.keys(); - let dict = values.values(); - let dict: &StringArray = as_string_array(dict); + // create tuples that are used for sorting + let valids = value_indices + .into_iter() + .map(|index| { + let key: T::Native = keys.value(index as usize); + let value_order = value_indices_map.get(&key.to_usize().unwrap()).unwrap(); + (index, *value_order) + }) + .collect::>(); - sort_string_helper( - keys, - value_indices, - null_indices, - options, - limit, - |array: &PrimitiveArray, idx| -> &str { - let key: T::Native = array.value(idx as usize); - dict.value(key.to_usize().unwrap()) - }, - ) + sort_primitive_inner::<_, _>(keys.len(), null_indices, cmp, options, limit, valids) } /// shared implementation between dictionary encoded and plain string arrays diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 395f3702d57a..3b89e7982a6b 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -124,6 +124,7 @@ pub fn create_string_array_with_len( pub fn create_string_dict_array( size: usize, null_density: f32, + str_len: usize, ) -> DictionaryArray { let rng = &mut seedable_rng(); @@ -132,7 +133,7 @@ pub fn create_string_dict_array( if rng.gen::() < null_density { None } else { - let value = rng.sample_iter(&Alphanumeric).take(4).collect(); + let value = rng.sample_iter(&Alphanumeric).take(str_len).collect(); let value = String::from_utf8(value).unwrap(); Some(value) } From d88ed6a4eb72eb2f87544f2811c23fd55dc706be Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 10 Sep 2022 22:26:27 +0100 Subject: [PATCH 0034/1411] Update thrift v0.16 and vendor parquet-format (#2502) (#2626) * Update thrift v0.16 vendor parquet-format (#2502) * Lint * Add linguist-generated * Use archlinux docker image * Review feedback * Fix doc * Format --- .gitattributes | 7 +- parquet/CONTRIBUTING.md | 11 +- parquet/Cargo.toml | 3 +- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/arrow/arrow_reader/selection.rs | 4 +- parquet/src/arrow/arrow_writer/mod.rs | 2 +- parquet/src/arrow/async_reader.rs | 2 +- parquet/src/basic.rs | 561 +- parquet/src/column/page.rs | 8 +- parquet/src/column/writer/mod.rs | 6 +- parquet/src/file/footer.rs | 4 +- parquet/src/file/metadata.rs | 21 +- parquet/src/file/page_encoding_stats.rs | 17 +- parquet/src/file/page_index/index.rs | 2 +- parquet/src/file/page_index/index_reader.rs | 2 +- parquet/src/file/page_index/range.rs | 6 +- parquet/src/file/serialized_reader.rs | 46 +- parquet/src/file/statistics.rs | 2 +- parquet/src/file/writer.rs | 6 +- parquet/src/format.rs | 5181 +++++++++++++++++++ parquet/src/lib.rs | 5 + parquet/src/schema/types.rs | 15 +- 22 files changed, 5585 insertions(+), 328 deletions(-) create mode 100644 parquet/src/format.rs diff --git a/.gitattributes b/.gitattributes index fac7bf85a77f..51008d2e3b4a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,6 +1 @@ -r/R/RcppExports.R linguist-generated=true -r/R/arrowExports.R linguist-generated=true -r/src/RcppExports.cpp linguist-generated=true -r/src/arrowExports.cpp linguist-generated=true -r/man/*.Rd linguist-generated=true - +parquet/src/format.rs linguist-generated diff --git a/parquet/CONTRIBUTING.md b/parquet/CONTRIBUTING.md index 77e9f417e49a..903126d9f4f8 100644 --- a/parquet/CONTRIBUTING.md +++ 
b/parquet/CONTRIBUTING.md @@ -60,7 +60,12 @@ Run `cargo bench` for benchmarks. To build documentation, run `cargo doc --no-deps`. To compile and view in the browser, run `cargo doc --no-deps --open`. -## Update Supported Parquet Version +## Update Parquet Format -To update Parquet format to a newer version, check if [parquet-format](https://github.com/sunchao/parquet-format-rs) -version is available. Then simply update version of `parquet-format` crate in Cargo.toml. +To generate the parquet format (thrift definitions) code run from the repository root run + +``` +$ docker run -v $(pwd):/thrift/src -it archlinux pacman -Sy --noconfirm thrift && wget https://raw.githubusercontent.com/apache/parquet-format/apache-parquet-format-2.9.0/src/main/thrift/parquet.thrift -O /tmp/parquet.thrift && thrift --gen rs /tmp/parquet.thrift && sed -i '/use thrift::server::TProcessor;/d' parquet.rs && mv parquet.rs parquet/src/format.rs +``` + +You may need to manually patch up doc comments that contain unescaped `[]` diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 6ddb3a615a2e..1d442b426cdf 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -31,9 +31,8 @@ rust-version = "1.62" [dependencies] ahash = "0.8" -parquet-format = { version = "4.0.0", default-features = false } bytes = { version = "1.1", default-features = false, features = ["std"] } -thrift = { version = "0.13", default-features = false } +thrift = { version = "0.16", default-features = false } snap = { version = "1.0", default-features = false, optional = true } brotli = { version = "3.3", default-features = false, features = ["std"], optional = true } flate2 = { version = "1.0", default-features = false, features = ["rust_backend"], optional = true } diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 02d11817ec5a..b00afc475154 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1668,7 +1668,7 @@ mod tests { schema: TypePtr, field: Option, opts: &TestOptions, - ) -> Result { + ) -> Result { let mut writer_props = opts.writer_props(); if let Some(field) = field { let arrow_schema = Schema::new(vec![field]); diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 544b7931a265..495e346e0f8a 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -121,7 +121,7 @@ impl RowSelection { #[cfg(any(test, feature = "async"))] pub(crate) fn scan_ranges( &self, - page_locations: &[parquet_format::PageLocation], + page_locations: &[crate::format::PageLocation], ) -> Vec> { let mut ranges = vec![]; let mut row_offset = 0; @@ -302,7 +302,7 @@ impl From for VecDeque { #[cfg(test)] mod tests { use super::*; - use parquet_format::PageLocation; + use crate::format::PageLocation; use rand::{thread_rng, Rng}; #[test] diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 1fef695dc47f..2c3d498bcca8 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -230,7 +230,7 @@ impl ArrowWriter { } /// Close and finalize the underlying Parquet writer - pub fn close(mut self) -> Result { + pub fn close(mut self) -> Result { self.flush()?; self.writer.close() } diff --git a/parquet/src/arrow/async_reader.rs b/parquet/src/arrow/async_reader.rs index a77da8d6f5ff..d444d20d52cc 100644 --- a/parquet/src/arrow/async_reader.rs +++ b/parquet/src/arrow/async_reader.rs @@ -84,11 +84,11 @@ 
use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; +use crate::format::OffsetIndex; use bytes::{Buf, Bytes}; use futures::future::{BoxFuture, FutureExt}; use futures::ready; use futures::stream::Stream; -use parquet_format::OffsetIndex; use thrift::protocol::TCompactInputProtocol; use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 7adbc8c1b6d0..b0f591c7a9f7 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -18,14 +18,14 @@ //! Contains Rust mappings for Thrift definition. //! Refer to `parquet.thrift` file to see raw definitions. -use std::{fmt, result, str}; +use std::{fmt, str}; -use parquet_format as parquet; +use crate::format as parquet; -use crate::errors::ParquetError; +use crate::errors::{ParquetError, Result}; -// Re-export parquet_format types used in this module -pub use parquet_format::{ +// Re-export crate::format types used in this module +pub use crate::format::{ BsonType, DateType, DecimalType, EnumType, IntType, JsonType, ListType, MapType, NullType, StringType, TimeType, TimeUnit, TimestampType, UUIDType, }; @@ -496,32 +496,35 @@ impl fmt::Display for ColumnOrder { // ---------------------------------------------------------------------- // parquet::Type <=> Type conversion -impl From for Type { - fn from(value: parquet::Type) -> Self { - match value { - parquet::Type::Boolean => Type::BOOLEAN, - parquet::Type::Int32 => Type::INT32, - parquet::Type::Int64 => Type::INT64, - parquet::Type::Int96 => Type::INT96, - parquet::Type::Float => Type::FLOAT, - parquet::Type::Double => Type::DOUBLE, - parquet::Type::ByteArray => Type::BYTE_ARRAY, - parquet::Type::FixedLenByteArray => Type::FIXED_LEN_BYTE_ARRAY, - } +impl TryFrom for Type { + type Error = ParquetError; + + fn try_from(value: parquet::Type) -> Result { + Ok(match value { + parquet::Type::BOOLEAN => Type::BOOLEAN, + parquet::Type::INT32 => Type::INT32, + parquet::Type::INT64 => Type::INT64, + parquet::Type::INT96 => Type::INT96, + parquet::Type::FLOAT => Type::FLOAT, + parquet::Type::DOUBLE => Type::DOUBLE, + parquet::Type::BYTE_ARRAY => Type::BYTE_ARRAY, + parquet::Type::FIXED_LEN_BYTE_ARRAY => Type::FIXED_LEN_BYTE_ARRAY, + _ => return Err(general_err!("unexpected parquet type: {}", value.0)), + }) } } impl From for parquet::Type { fn from(value: Type) -> Self { match value { - Type::BOOLEAN => parquet::Type::Boolean, - Type::INT32 => parquet::Type::Int32, - Type::INT64 => parquet::Type::Int64, - Type::INT96 => parquet::Type::Int96, - Type::FLOAT => parquet::Type::Float, - Type::DOUBLE => parquet::Type::Double, - Type::BYTE_ARRAY => parquet::Type::ByteArray, - Type::FIXED_LEN_BYTE_ARRAY => parquet::Type::FixedLenByteArray, + Type::BOOLEAN => parquet::Type::BOOLEAN, + Type::INT32 => parquet::Type::INT32, + Type::INT64 => parquet::Type::INT64, + Type::INT96 => parquet::Type::INT96, + Type::FLOAT => parquet::Type::FLOAT, + Type::DOUBLE => parquet::Type::DOUBLE, + Type::BYTE_ARRAY => parquet::Type::BYTE_ARRAY, + Type::FIXED_LEN_BYTE_ARRAY => parquet::Type::FIXED_LEN_BYTE_ARRAY, } } } @@ -529,39 +532,47 @@ impl From for parquet::Type { // ---------------------------------------------------------------------- // parquet::ConvertedType <=> ConvertedType conversion -impl From> for ConvertedType { - fn from(option: Option) -> Self { - match option { +impl TryFrom> for ConvertedType { + type Error = ParquetError; + + fn try_from(option: Option) -> Result { + Ok(match option { None => ConvertedType::NONE, Some(value) 
=> match value { - parquet::ConvertedType::Utf8 => ConvertedType::UTF8, - parquet::ConvertedType::Map => ConvertedType::MAP, - parquet::ConvertedType::MapKeyValue => ConvertedType::MAP_KEY_VALUE, - parquet::ConvertedType::List => ConvertedType::LIST, - parquet::ConvertedType::Enum => ConvertedType::ENUM, - parquet::ConvertedType::Decimal => ConvertedType::DECIMAL, - parquet::ConvertedType::Date => ConvertedType::DATE, - parquet::ConvertedType::TimeMillis => ConvertedType::TIME_MILLIS, - parquet::ConvertedType::TimeMicros => ConvertedType::TIME_MICROS, - parquet::ConvertedType::TimestampMillis => { + parquet::ConvertedType::UTF8 => ConvertedType::UTF8, + parquet::ConvertedType::MAP => ConvertedType::MAP, + parquet::ConvertedType::MAP_KEY_VALUE => ConvertedType::MAP_KEY_VALUE, + parquet::ConvertedType::LIST => ConvertedType::LIST, + parquet::ConvertedType::ENUM => ConvertedType::ENUM, + parquet::ConvertedType::DECIMAL => ConvertedType::DECIMAL, + parquet::ConvertedType::DATE => ConvertedType::DATE, + parquet::ConvertedType::TIME_MILLIS => ConvertedType::TIME_MILLIS, + parquet::ConvertedType::TIME_MICROS => ConvertedType::TIME_MICROS, + parquet::ConvertedType::TIMESTAMP_MILLIS => { ConvertedType::TIMESTAMP_MILLIS } - parquet::ConvertedType::TimestampMicros => { + parquet::ConvertedType::TIMESTAMP_MICROS => { ConvertedType::TIMESTAMP_MICROS } - parquet::ConvertedType::Uint8 => ConvertedType::UINT_8, - parquet::ConvertedType::Uint16 => ConvertedType::UINT_16, - parquet::ConvertedType::Uint32 => ConvertedType::UINT_32, - parquet::ConvertedType::Uint64 => ConvertedType::UINT_64, - parquet::ConvertedType::Int8 => ConvertedType::INT_8, - parquet::ConvertedType::Int16 => ConvertedType::INT_16, - parquet::ConvertedType::Int32 => ConvertedType::INT_32, - parquet::ConvertedType::Int64 => ConvertedType::INT_64, - parquet::ConvertedType::Json => ConvertedType::JSON, - parquet::ConvertedType::Bson => ConvertedType::BSON, - parquet::ConvertedType::Interval => ConvertedType::INTERVAL, + parquet::ConvertedType::UINT_8 => ConvertedType::UINT_8, + parquet::ConvertedType::UINT_16 => ConvertedType::UINT_16, + parquet::ConvertedType::UINT_32 => ConvertedType::UINT_32, + parquet::ConvertedType::UINT_64 => ConvertedType::UINT_64, + parquet::ConvertedType::INT_8 => ConvertedType::INT_8, + parquet::ConvertedType::INT_16 => ConvertedType::INT_16, + parquet::ConvertedType::INT_32 => ConvertedType::INT_32, + parquet::ConvertedType::INT_64 => ConvertedType::INT_64, + parquet::ConvertedType::JSON => ConvertedType::JSON, + parquet::ConvertedType::BSON => ConvertedType::BSON, + parquet::ConvertedType::INTERVAL => ConvertedType::INTERVAL, + _ => { + return Err(general_err!( + "unexpected parquet converted type: {}", + value.0 + )) + } }, - } + }) } } @@ -569,32 +580,32 @@ impl From for Option { fn from(value: ConvertedType) -> Self { match value { ConvertedType::NONE => None, - ConvertedType::UTF8 => Some(parquet::ConvertedType::Utf8), - ConvertedType::MAP => Some(parquet::ConvertedType::Map), - ConvertedType::MAP_KEY_VALUE => Some(parquet::ConvertedType::MapKeyValue), - ConvertedType::LIST => Some(parquet::ConvertedType::List), - ConvertedType::ENUM => Some(parquet::ConvertedType::Enum), - ConvertedType::DECIMAL => Some(parquet::ConvertedType::Decimal), - ConvertedType::DATE => Some(parquet::ConvertedType::Date), - ConvertedType::TIME_MILLIS => Some(parquet::ConvertedType::TimeMillis), - ConvertedType::TIME_MICROS => Some(parquet::ConvertedType::TimeMicros), + ConvertedType::UTF8 => Some(parquet::ConvertedType::UTF8), + 
ConvertedType::MAP => Some(parquet::ConvertedType::MAP), + ConvertedType::MAP_KEY_VALUE => Some(parquet::ConvertedType::MAP_KEY_VALUE), + ConvertedType::LIST => Some(parquet::ConvertedType::LIST), + ConvertedType::ENUM => Some(parquet::ConvertedType::ENUM), + ConvertedType::DECIMAL => Some(parquet::ConvertedType::DECIMAL), + ConvertedType::DATE => Some(parquet::ConvertedType::DATE), + ConvertedType::TIME_MILLIS => Some(parquet::ConvertedType::TIME_MILLIS), + ConvertedType::TIME_MICROS => Some(parquet::ConvertedType::TIME_MICROS), ConvertedType::TIMESTAMP_MILLIS => { - Some(parquet::ConvertedType::TimestampMillis) + Some(parquet::ConvertedType::TIMESTAMP_MILLIS) } ConvertedType::TIMESTAMP_MICROS => { - Some(parquet::ConvertedType::TimestampMicros) + Some(parquet::ConvertedType::TIMESTAMP_MICROS) } - ConvertedType::UINT_8 => Some(parquet::ConvertedType::Uint8), - ConvertedType::UINT_16 => Some(parquet::ConvertedType::Uint16), - ConvertedType::UINT_32 => Some(parquet::ConvertedType::Uint32), - ConvertedType::UINT_64 => Some(parquet::ConvertedType::Uint64), - ConvertedType::INT_8 => Some(parquet::ConvertedType::Int8), - ConvertedType::INT_16 => Some(parquet::ConvertedType::Int16), - ConvertedType::INT_32 => Some(parquet::ConvertedType::Int32), - ConvertedType::INT_64 => Some(parquet::ConvertedType::Int64), - ConvertedType::JSON => Some(parquet::ConvertedType::Json), - ConvertedType::BSON => Some(parquet::ConvertedType::Bson), - ConvertedType::INTERVAL => Some(parquet::ConvertedType::Interval), + ConvertedType::UINT_8 => Some(parquet::ConvertedType::UINT_8), + ConvertedType::UINT_16 => Some(parquet::ConvertedType::UINT_16), + ConvertedType::UINT_32 => Some(parquet::ConvertedType::UINT_32), + ConvertedType::UINT_64 => Some(parquet::ConvertedType::UINT_64), + ConvertedType::INT_8 => Some(parquet::ConvertedType::INT_8), + ConvertedType::INT_16 => Some(parquet::ConvertedType::INT_16), + ConvertedType::INT_32 => Some(parquet::ConvertedType::INT_32), + ConvertedType::INT_64 => Some(parquet::ConvertedType::INT_64), + ConvertedType::JSON => Some(parquet::ConvertedType::JSON), + ConvertedType::BSON => Some(parquet::ConvertedType::BSON), + ConvertedType::INTERVAL => Some(parquet::ConvertedType::INTERVAL), } } } @@ -730,22 +741,30 @@ impl From> for ConvertedType { // ---------------------------------------------------------------------- // parquet::FieldRepetitionType <=> Repetition conversion -impl From for Repetition { - fn from(value: parquet::FieldRepetitionType) -> Self { - match value { - parquet::FieldRepetitionType::Required => Repetition::REQUIRED, - parquet::FieldRepetitionType::Optional => Repetition::OPTIONAL, - parquet::FieldRepetitionType::Repeated => Repetition::REPEATED, - } +impl TryFrom for Repetition { + type Error = ParquetError; + + fn try_from(value: parquet::FieldRepetitionType) -> Result { + Ok(match value { + parquet::FieldRepetitionType::REQUIRED => Repetition::REQUIRED, + parquet::FieldRepetitionType::OPTIONAL => Repetition::OPTIONAL, + parquet::FieldRepetitionType::REPEATED => Repetition::REPEATED, + _ => { + return Err(general_err!( + "unexpected parquet repetition type: {}", + value.0 + )) + } + }) } } impl From for parquet::FieldRepetitionType { fn from(value: Repetition) -> Self { match value { - Repetition::REQUIRED => parquet::FieldRepetitionType::Required, - Repetition::OPTIONAL => parquet::FieldRepetitionType::Optional, - Repetition::REPEATED => parquet::FieldRepetitionType::Repeated, + Repetition::REQUIRED => parquet::FieldRepetitionType::REQUIRED, + 
Repetition::OPTIONAL => parquet::FieldRepetitionType::OPTIONAL, + Repetition::REPEATED => parquet::FieldRepetitionType::REPEATED, } } } @@ -753,34 +772,41 @@ impl From for parquet::FieldRepetitionType { // ---------------------------------------------------------------------- // parquet::Encoding <=> Encoding conversion -impl From for Encoding { - fn from(value: parquet::Encoding) -> Self { - match value { - parquet::Encoding::Plain => Encoding::PLAIN, - parquet::Encoding::PlainDictionary => Encoding::PLAIN_DICTIONARY, - parquet::Encoding::Rle => Encoding::RLE, - parquet::Encoding::BitPacked => Encoding::BIT_PACKED, - parquet::Encoding::DeltaBinaryPacked => Encoding::DELTA_BINARY_PACKED, - parquet::Encoding::DeltaLengthByteArray => Encoding::DELTA_LENGTH_BYTE_ARRAY, - parquet::Encoding::DeltaByteArray => Encoding::DELTA_BYTE_ARRAY, - parquet::Encoding::RleDictionary => Encoding::RLE_DICTIONARY, - parquet::Encoding::ByteStreamSplit => Encoding::BYTE_STREAM_SPLIT, - } +impl TryFrom for Encoding { + type Error = ParquetError; + + fn try_from(value: parquet::Encoding) -> Result { + Ok(match value { + parquet::Encoding::PLAIN => Encoding::PLAIN, + parquet::Encoding::PLAIN_DICTIONARY => Encoding::PLAIN_DICTIONARY, + parquet::Encoding::RLE => Encoding::RLE, + parquet::Encoding::BIT_PACKED => Encoding::BIT_PACKED, + parquet::Encoding::DELTA_BINARY_PACKED => Encoding::DELTA_BINARY_PACKED, + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY => { + Encoding::DELTA_LENGTH_BYTE_ARRAY + } + parquet::Encoding::DELTA_BYTE_ARRAY => Encoding::DELTA_BYTE_ARRAY, + parquet::Encoding::RLE_DICTIONARY => Encoding::RLE_DICTIONARY, + parquet::Encoding::BYTE_STREAM_SPLIT => Encoding::BYTE_STREAM_SPLIT, + _ => return Err(general_err!("unexpected parquet encoding: {}", value.0)), + }) } } impl From for parquet::Encoding { fn from(value: Encoding) -> Self { match value { - Encoding::PLAIN => parquet::Encoding::Plain, - Encoding::PLAIN_DICTIONARY => parquet::Encoding::PlainDictionary, - Encoding::RLE => parquet::Encoding::Rle, - Encoding::BIT_PACKED => parquet::Encoding::BitPacked, - Encoding::DELTA_BINARY_PACKED => parquet::Encoding::DeltaBinaryPacked, - Encoding::DELTA_LENGTH_BYTE_ARRAY => parquet::Encoding::DeltaLengthByteArray, - Encoding::DELTA_BYTE_ARRAY => parquet::Encoding::DeltaByteArray, - Encoding::RLE_DICTIONARY => parquet::Encoding::RleDictionary, - Encoding::BYTE_STREAM_SPLIT => parquet::Encoding::ByteStreamSplit, + Encoding::PLAIN => parquet::Encoding::PLAIN, + Encoding::PLAIN_DICTIONARY => parquet::Encoding::PLAIN_DICTIONARY, + Encoding::RLE => parquet::Encoding::RLE, + Encoding::BIT_PACKED => parquet::Encoding::BIT_PACKED, + Encoding::DELTA_BINARY_PACKED => parquet::Encoding::DELTA_BINARY_PACKED, + Encoding::DELTA_LENGTH_BYTE_ARRAY => { + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY + } + Encoding::DELTA_BYTE_ARRAY => parquet::Encoding::DELTA_BYTE_ARRAY, + Encoding::RLE_DICTIONARY => parquet::Encoding::RLE_DICTIONARY, + Encoding::BYTE_STREAM_SPLIT => parquet::Encoding::BYTE_STREAM_SPLIT, } } } @@ -788,30 +814,38 @@ impl From for parquet::Encoding { // ---------------------------------------------------------------------- // parquet::CompressionCodec <=> Compression conversion -impl From for Compression { - fn from(value: parquet::CompressionCodec) -> Self { - match value { - parquet::CompressionCodec::Uncompressed => Compression::UNCOMPRESSED, - parquet::CompressionCodec::Snappy => Compression::SNAPPY, - parquet::CompressionCodec::Gzip => Compression::GZIP, - parquet::CompressionCodec::Lzo => Compression::LZO, - 
parquet::CompressionCodec::Brotli => Compression::BROTLI, - parquet::CompressionCodec::Lz4 => Compression::LZ4, - parquet::CompressionCodec::Zstd => Compression::ZSTD, - } +impl TryFrom for Compression { + type Error = ParquetError; + + fn try_from(value: parquet::CompressionCodec) -> Result { + Ok(match value { + parquet::CompressionCodec::UNCOMPRESSED => Compression::UNCOMPRESSED, + parquet::CompressionCodec::SNAPPY => Compression::SNAPPY, + parquet::CompressionCodec::GZIP => Compression::GZIP, + parquet::CompressionCodec::LZO => Compression::LZO, + parquet::CompressionCodec::BROTLI => Compression::BROTLI, + parquet::CompressionCodec::LZ4 => Compression::LZ4, + parquet::CompressionCodec::ZSTD => Compression::ZSTD, + _ => { + return Err(general_err!( + "unexpected parquet compression codec: {}", + value.0 + )) + } + }) } } impl From for parquet::CompressionCodec { fn from(value: Compression) -> Self { match value { - Compression::UNCOMPRESSED => parquet::CompressionCodec::Uncompressed, - Compression::SNAPPY => parquet::CompressionCodec::Snappy, - Compression::GZIP => parquet::CompressionCodec::Gzip, - Compression::LZO => parquet::CompressionCodec::Lzo, - Compression::BROTLI => parquet::CompressionCodec::Brotli, - Compression::LZ4 => parquet::CompressionCodec::Lz4, - Compression::ZSTD => parquet::CompressionCodec::Zstd, + Compression::UNCOMPRESSED => parquet::CompressionCodec::UNCOMPRESSED, + Compression::SNAPPY => parquet::CompressionCodec::SNAPPY, + Compression::GZIP => parquet::CompressionCodec::GZIP, + Compression::LZO => parquet::CompressionCodec::LZO, + Compression::BROTLI => parquet::CompressionCodec::BROTLI, + Compression::LZ4 => parquet::CompressionCodec::LZ4, + Compression::ZSTD => parquet::CompressionCodec::ZSTD, } } } @@ -819,24 +853,27 @@ impl From for parquet::CompressionCodec { // ---------------------------------------------------------------------- // parquet::PageType <=> PageType conversion -impl From for PageType { - fn from(value: parquet::PageType) -> Self { - match value { - parquet::PageType::DataPage => PageType::DATA_PAGE, - parquet::PageType::IndexPage => PageType::INDEX_PAGE, - parquet::PageType::DictionaryPage => PageType::DICTIONARY_PAGE, - parquet::PageType::DataPageV2 => PageType::DATA_PAGE_V2, - } +impl TryFrom for PageType { + type Error = ParquetError; + + fn try_from(value: parquet::PageType) -> Result { + Ok(match value { + parquet::PageType::DATA_PAGE => PageType::DATA_PAGE, + parquet::PageType::INDEX_PAGE => PageType::INDEX_PAGE, + parquet::PageType::DICTIONARY_PAGE => PageType::DICTIONARY_PAGE, + parquet::PageType::DATA_PAGE_V2 => PageType::DATA_PAGE_V2, + _ => return Err(general_err!("unexpected parquet page type: {}", value.0)), + }) } } impl From for parquet::PageType { fn from(value: PageType) -> Self { match value { - PageType::DATA_PAGE => parquet::PageType::DataPage, - PageType::INDEX_PAGE => parquet::PageType::IndexPage, - PageType::DICTIONARY_PAGE => parquet::PageType::DictionaryPage, - PageType::DATA_PAGE_V2 => parquet::PageType::DataPageV2, + PageType::DATA_PAGE => parquet::PageType::DATA_PAGE, + PageType::INDEX_PAGE => parquet::PageType::INDEX_PAGE, + PageType::DICTIONARY_PAGE => parquet::PageType::DICTIONARY_PAGE, + PageType::DATA_PAGE_V2 => parquet::PageType::DATA_PAGE_V2, } } } @@ -847,12 +884,12 @@ impl From for parquet::PageType { impl str::FromStr for Repetition { type Err = ParquetError; - fn from_str(s: &str) -> result::Result { + fn from_str(s: &str) -> Result { match s { "REQUIRED" => Ok(Repetition::REQUIRED), "OPTIONAL" => 
Ok(Repetition::OPTIONAL), "REPEATED" => Ok(Repetition::REPEATED), - other => Err(general_err!("Invalid repetition {}", other)), + other => Err(general_err!("Invalid parquet repetition {}", other)), } } } @@ -860,7 +897,7 @@ impl str::FromStr for Repetition { impl str::FromStr for Type { type Err = ParquetError; - fn from_str(s: &str) -> result::Result { + fn from_str(s: &str) -> Result { match s { "BOOLEAN" => Ok(Type::BOOLEAN), "INT32" => Ok(Type::INT32), @@ -870,7 +907,7 @@ impl str::FromStr for Type { "DOUBLE" => Ok(Type::DOUBLE), "BYTE_ARRAY" | "BINARY" => Ok(Type::BYTE_ARRAY), "FIXED_LEN_BYTE_ARRAY" => Ok(Type::FIXED_LEN_BYTE_ARRAY), - other => Err(general_err!("Invalid type {}", other)), + other => Err(general_err!("Invalid parquet type {}", other)), } } } @@ -878,7 +915,7 @@ impl str::FromStr for Type { impl str::FromStr for ConvertedType { type Err = ParquetError; - fn from_str(s: &str) -> result::Result { + fn from_str(s: &str) -> Result { match s { "NONE" => Ok(ConvertedType::NONE), "UTF8" => Ok(ConvertedType::UTF8), @@ -903,7 +940,7 @@ impl str::FromStr for ConvertedType { "JSON" => Ok(ConvertedType::JSON), "BSON" => Ok(ConvertedType::BSON), "INTERVAL" => Ok(ConvertedType::INTERVAL), - other => Err(general_err!("Invalid converted type {}", other)), + other => Err(general_err!("Invalid parquet converted type {}", other)), } } } @@ -911,7 +948,7 @@ impl str::FromStr for ConvertedType { impl str::FromStr for LogicalType { type Err = ParquetError; - fn from_str(s: &str) -> result::Result { + fn from_str(s: &str) -> Result { match s { // The type is a placeholder that gets updated elsewhere "INTEGER" => Ok(LogicalType::Integer { @@ -939,8 +976,10 @@ impl str::FromStr for LogicalType { "BSON" => Ok(LogicalType::Bson), "UUID" => Ok(LogicalType::Uuid), "UNKNOWN" => Ok(LogicalType::Unknown), - "INTERVAL" => Err(general_err!("Interval logical type not yet supported")), - other => Err(general_err!("Invalid logical type {}", other)), + "INTERVAL" => Err(general_err!( + "Interval parquet logical type not yet supported" + )), + other => Err(general_err!("Invalid parquet logical type {}", other)), } } } @@ -966,30 +1005,36 @@ mod tests { #[test] fn test_from_type() { - assert_eq!(Type::from(parquet::Type::Boolean), Type::BOOLEAN); - assert_eq!(Type::from(parquet::Type::Int32), Type::INT32); - assert_eq!(Type::from(parquet::Type::Int64), Type::INT64); - assert_eq!(Type::from(parquet::Type::Int96), Type::INT96); - assert_eq!(Type::from(parquet::Type::Float), Type::FLOAT); - assert_eq!(Type::from(parquet::Type::Double), Type::DOUBLE); - assert_eq!(Type::from(parquet::Type::ByteArray), Type::BYTE_ARRAY); - assert_eq!( - Type::from(parquet::Type::FixedLenByteArray), + assert_eq!( + Type::try_from(parquet::Type::BOOLEAN).unwrap(), + Type::BOOLEAN + ); + assert_eq!(Type::try_from(parquet::Type::INT32).unwrap(), Type::INT32); + assert_eq!(Type::try_from(parquet::Type::INT64).unwrap(), Type::INT64); + assert_eq!(Type::try_from(parquet::Type::INT96).unwrap(), Type::INT96); + assert_eq!(Type::try_from(parquet::Type::FLOAT).unwrap(), Type::FLOAT); + assert_eq!(Type::try_from(parquet::Type::DOUBLE).unwrap(), Type::DOUBLE); + assert_eq!( + Type::try_from(parquet::Type::BYTE_ARRAY).unwrap(), + Type::BYTE_ARRAY + ); + assert_eq!( + Type::try_from(parquet::Type::FIXED_LEN_BYTE_ARRAY).unwrap(), Type::FIXED_LEN_BYTE_ARRAY ); } #[test] fn test_into_type() { - assert_eq!(parquet::Type::Boolean, Type::BOOLEAN.into()); - assert_eq!(parquet::Type::Int32, Type::INT32.into()); - assert_eq!(parquet::Type::Int64, 
Type::INT64.into()); - assert_eq!(parquet::Type::Int96, Type::INT96.into()); - assert_eq!(parquet::Type::Float, Type::FLOAT.into()); - assert_eq!(parquet::Type::Double, Type::DOUBLE.into()); - assert_eq!(parquet::Type::ByteArray, Type::BYTE_ARRAY.into()); - assert_eq!( - parquet::Type::FixedLenByteArray, + assert_eq!(parquet::Type::BOOLEAN, Type::BOOLEAN.into()); + assert_eq!(parquet::Type::INT32, Type::INT32.into()); + assert_eq!(parquet::Type::INT64, Type::INT64.into()); + assert_eq!(parquet::Type::INT96, Type::INT96.into()); + assert_eq!(parquet::Type::FLOAT, Type::FLOAT.into()); + assert_eq!(parquet::Type::DOUBLE, Type::DOUBLE.into()); + assert_eq!(parquet::Type::BYTE_ARRAY, Type::BYTE_ARRAY.into()); + assert_eq!( + parquet::Type::FIXED_LEN_BYTE_ARRAY, Type::FIXED_LEN_BYTE_ARRAY.into() ); } @@ -1072,97 +1117,102 @@ mod tests { #[test] fn test_from_converted_type() { let parquet_conv_none: Option = None; - assert_eq!(ConvertedType::from(parquet_conv_none), ConvertedType::NONE); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Utf8)), + ConvertedType::try_from(parquet_conv_none).unwrap(), + ConvertedType::NONE + ); + assert_eq!( + ConvertedType::try_from(Some(parquet::ConvertedType::UTF8)).unwrap(), ConvertedType::UTF8 ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Map)), + ConvertedType::try_from(Some(parquet::ConvertedType::MAP)).unwrap(), ConvertedType::MAP ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::MapKeyValue)), + ConvertedType::try_from(Some(parquet::ConvertedType::MAP_KEY_VALUE)).unwrap(), ConvertedType::MAP_KEY_VALUE ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::List)), + ConvertedType::try_from(Some(parquet::ConvertedType::LIST)).unwrap(), ConvertedType::LIST ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Enum)), + ConvertedType::try_from(Some(parquet::ConvertedType::ENUM)).unwrap(), ConvertedType::ENUM ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Decimal)), + ConvertedType::try_from(Some(parquet::ConvertedType::DECIMAL)).unwrap(), ConvertedType::DECIMAL ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Date)), + ConvertedType::try_from(Some(parquet::ConvertedType::DATE)).unwrap(), ConvertedType::DATE ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::TimeMillis)), + ConvertedType::try_from(Some(parquet::ConvertedType::TIME_MILLIS)).unwrap(), ConvertedType::TIME_MILLIS ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::TimeMicros)), + ConvertedType::try_from(Some(parquet::ConvertedType::TIME_MICROS)).unwrap(), ConvertedType::TIME_MICROS ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::TimestampMillis)), + ConvertedType::try_from(Some(parquet::ConvertedType::TIMESTAMP_MILLIS)) + .unwrap(), ConvertedType::TIMESTAMP_MILLIS ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::TimestampMicros)), + ConvertedType::try_from(Some(parquet::ConvertedType::TIMESTAMP_MICROS)) + .unwrap(), ConvertedType::TIMESTAMP_MICROS ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Uint8)), + ConvertedType::try_from(Some(parquet::ConvertedType::UINT_8)).unwrap(), ConvertedType::UINT_8 ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Uint16)), + ConvertedType::try_from(Some(parquet::ConvertedType::UINT_16)).unwrap(), ConvertedType::UINT_16 ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Uint32)), + 
ConvertedType::try_from(Some(parquet::ConvertedType::UINT_32)).unwrap(), ConvertedType::UINT_32 ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Uint64)), + ConvertedType::try_from(Some(parquet::ConvertedType::UINT_64)).unwrap(), ConvertedType::UINT_64 ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Int8)), + ConvertedType::try_from(Some(parquet::ConvertedType::INT_8)).unwrap(), ConvertedType::INT_8 ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Int16)), + ConvertedType::try_from(Some(parquet::ConvertedType::INT_16)).unwrap(), ConvertedType::INT_16 ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Int32)), + ConvertedType::try_from(Some(parquet::ConvertedType::INT_32)).unwrap(), ConvertedType::INT_32 ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Int64)), + ConvertedType::try_from(Some(parquet::ConvertedType::INT_64)).unwrap(), ConvertedType::INT_64 ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Json)), + ConvertedType::try_from(Some(parquet::ConvertedType::JSON)).unwrap(), ConvertedType::JSON ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Bson)), + ConvertedType::try_from(Some(parquet::ConvertedType::BSON)).unwrap(), ConvertedType::BSON ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Interval)), + ConvertedType::try_from(Some(parquet::ConvertedType::INTERVAL)).unwrap(), ConvertedType::INTERVAL ); assert_eq!( - ConvertedType::from(Some(parquet::ConvertedType::Decimal)), + ConvertedType::try_from(Some(parquet::ConvertedType::DECIMAL)).unwrap(), ConvertedType::DECIMAL ) } @@ -1172,92 +1222,92 @@ mod tests { let converted_type: Option = None; assert_eq!(converted_type, ConvertedType::NONE.into()); assert_eq!( - Some(parquet::ConvertedType::Utf8), + Some(parquet::ConvertedType::UTF8), ConvertedType::UTF8.into() ); - assert_eq!(Some(parquet::ConvertedType::Map), ConvertedType::MAP.into()); + assert_eq!(Some(parquet::ConvertedType::MAP), ConvertedType::MAP.into()); assert_eq!( - Some(parquet::ConvertedType::MapKeyValue), + Some(parquet::ConvertedType::MAP_KEY_VALUE), ConvertedType::MAP_KEY_VALUE.into() ); assert_eq!( - Some(parquet::ConvertedType::List), + Some(parquet::ConvertedType::LIST), ConvertedType::LIST.into() ); assert_eq!( - Some(parquet::ConvertedType::Enum), + Some(parquet::ConvertedType::ENUM), ConvertedType::ENUM.into() ); assert_eq!( - Some(parquet::ConvertedType::Decimal), + Some(parquet::ConvertedType::DECIMAL), ConvertedType::DECIMAL.into() ); assert_eq!( - Some(parquet::ConvertedType::Date), + Some(parquet::ConvertedType::DATE), ConvertedType::DATE.into() ); assert_eq!( - Some(parquet::ConvertedType::TimeMillis), + Some(parquet::ConvertedType::TIME_MILLIS), ConvertedType::TIME_MILLIS.into() ); assert_eq!( - Some(parquet::ConvertedType::TimeMicros), + Some(parquet::ConvertedType::TIME_MICROS), ConvertedType::TIME_MICROS.into() ); assert_eq!( - Some(parquet::ConvertedType::TimestampMillis), + Some(parquet::ConvertedType::TIMESTAMP_MILLIS), ConvertedType::TIMESTAMP_MILLIS.into() ); assert_eq!( - Some(parquet::ConvertedType::TimestampMicros), + Some(parquet::ConvertedType::TIMESTAMP_MICROS), ConvertedType::TIMESTAMP_MICROS.into() ); assert_eq!( - Some(parquet::ConvertedType::Uint8), + Some(parquet::ConvertedType::UINT_8), ConvertedType::UINT_8.into() ); assert_eq!( - Some(parquet::ConvertedType::Uint16), + Some(parquet::ConvertedType::UINT_16), ConvertedType::UINT_16.into() ); assert_eq!( - Some(parquet::ConvertedType::Uint32), + 
Some(parquet::ConvertedType::UINT_32), ConvertedType::UINT_32.into() ); assert_eq!( - Some(parquet::ConvertedType::Uint64), + Some(parquet::ConvertedType::UINT_64), ConvertedType::UINT_64.into() ); assert_eq!( - Some(parquet::ConvertedType::Int8), + Some(parquet::ConvertedType::INT_8), ConvertedType::INT_8.into() ); assert_eq!( - Some(parquet::ConvertedType::Int16), + Some(parquet::ConvertedType::INT_16), ConvertedType::INT_16.into() ); assert_eq!( - Some(parquet::ConvertedType::Int32), + Some(parquet::ConvertedType::INT_32), ConvertedType::INT_32.into() ); assert_eq!( - Some(parquet::ConvertedType::Int64), + Some(parquet::ConvertedType::INT_64), ConvertedType::INT_64.into() ); assert_eq!( - Some(parquet::ConvertedType::Json), + Some(parquet::ConvertedType::JSON), ConvertedType::JSON.into() ); assert_eq!( - Some(parquet::ConvertedType::Bson), + Some(parquet::ConvertedType::BSON), ConvertedType::BSON.into() ); assert_eq!( - Some(parquet::ConvertedType::Interval), + Some(parquet::ConvertedType::INTERVAL), ConvertedType::INTERVAL.into() ); assert_eq!( - Some(parquet::ConvertedType::Decimal), + Some(parquet::ConvertedType::DECIMAL), ConvertedType::DECIMAL.into() ) } @@ -1591,15 +1641,15 @@ mod tests { #[test] fn test_from_repetition() { assert_eq!( - Repetition::from(parquet::FieldRepetitionType::Required), + Repetition::try_from(parquet::FieldRepetitionType::REQUIRED).unwrap(), Repetition::REQUIRED ); assert_eq!( - Repetition::from(parquet::FieldRepetitionType::Optional), + Repetition::try_from(parquet::FieldRepetitionType::OPTIONAL).unwrap(), Repetition::OPTIONAL ); assert_eq!( - Repetition::from(parquet::FieldRepetitionType::Repeated), + Repetition::try_from(parquet::FieldRepetitionType::REPEATED).unwrap(), Repetition::REPEATED ); } @@ -1607,15 +1657,15 @@ mod tests { #[test] fn test_into_repetition() { assert_eq!( - parquet::FieldRepetitionType::Required, + parquet::FieldRepetitionType::REQUIRED, Repetition::REQUIRED.into() ); assert_eq!( - parquet::FieldRepetitionType::Optional, + parquet::FieldRepetitionType::OPTIONAL, Repetition::OPTIONAL.into() ); assert_eq!( - parquet::FieldRepetitionType::Repeated, + parquet::FieldRepetitionType::REPEATED, Repetition::REPEATED.into() ); } @@ -1665,49 +1715,55 @@ mod tests { #[test] fn test_from_encoding() { - assert_eq!(Encoding::from(parquet::Encoding::Plain), Encoding::PLAIN); assert_eq!( - Encoding::from(parquet::Encoding::PlainDictionary), + Encoding::try_from(parquet::Encoding::PLAIN).unwrap(), + Encoding::PLAIN + ); + assert_eq!( + Encoding::try_from(parquet::Encoding::PLAIN_DICTIONARY).unwrap(), Encoding::PLAIN_DICTIONARY ); - assert_eq!(Encoding::from(parquet::Encoding::Rle), Encoding::RLE); assert_eq!( - Encoding::from(parquet::Encoding::BitPacked), + Encoding::try_from(parquet::Encoding::RLE).unwrap(), + Encoding::RLE + ); + assert_eq!( + Encoding::try_from(parquet::Encoding::BIT_PACKED).unwrap(), Encoding::BIT_PACKED ); assert_eq!( - Encoding::from(parquet::Encoding::DeltaBinaryPacked), + Encoding::try_from(parquet::Encoding::DELTA_BINARY_PACKED).unwrap(), Encoding::DELTA_BINARY_PACKED ); assert_eq!( - Encoding::from(parquet::Encoding::DeltaLengthByteArray), + Encoding::try_from(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY).unwrap(), Encoding::DELTA_LENGTH_BYTE_ARRAY ); assert_eq!( - Encoding::from(parquet::Encoding::DeltaByteArray), + Encoding::try_from(parquet::Encoding::DELTA_BYTE_ARRAY).unwrap(), Encoding::DELTA_BYTE_ARRAY ); } #[test] fn test_into_encoding() { - assert_eq!(parquet::Encoding::Plain, Encoding::PLAIN.into()); + 
assert_eq!(parquet::Encoding::PLAIN, Encoding::PLAIN.into()); assert_eq!( - parquet::Encoding::PlainDictionary, + parquet::Encoding::PLAIN_DICTIONARY, Encoding::PLAIN_DICTIONARY.into() ); - assert_eq!(parquet::Encoding::Rle, Encoding::RLE.into()); - assert_eq!(parquet::Encoding::BitPacked, Encoding::BIT_PACKED.into()); + assert_eq!(parquet::Encoding::RLE, Encoding::RLE.into()); + assert_eq!(parquet::Encoding::BIT_PACKED, Encoding::BIT_PACKED.into()); assert_eq!( - parquet::Encoding::DeltaBinaryPacked, + parquet::Encoding::DELTA_BINARY_PACKED, Encoding::DELTA_BINARY_PACKED.into() ); assert_eq!( - parquet::Encoding::DeltaLengthByteArray, + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY, Encoding::DELTA_LENGTH_BYTE_ARRAY.into() ); assert_eq!( - parquet::Encoding::DeltaByteArray, + parquet::Encoding::DELTA_BYTE_ARRAY, Encoding::DELTA_BYTE_ARRAY.into() ); } @@ -1726,31 +1782,31 @@ mod tests { #[test] fn test_from_compression() { assert_eq!( - Compression::from(parquet::CompressionCodec::Uncompressed), + Compression::try_from(parquet::CompressionCodec::UNCOMPRESSED).unwrap(), Compression::UNCOMPRESSED ); assert_eq!( - Compression::from(parquet::CompressionCodec::Snappy), + Compression::try_from(parquet::CompressionCodec::SNAPPY).unwrap(), Compression::SNAPPY ); assert_eq!( - Compression::from(parquet::CompressionCodec::Gzip), + Compression::try_from(parquet::CompressionCodec::GZIP).unwrap(), Compression::GZIP ); assert_eq!( - Compression::from(parquet::CompressionCodec::Lzo), + Compression::try_from(parquet::CompressionCodec::LZO).unwrap(), Compression::LZO ); assert_eq!( - Compression::from(parquet::CompressionCodec::Brotli), + Compression::try_from(parquet::CompressionCodec::BROTLI).unwrap(), Compression::BROTLI ); assert_eq!( - Compression::from(parquet::CompressionCodec::Lz4), + Compression::try_from(parquet::CompressionCodec::LZ4).unwrap(), Compression::LZ4 ); assert_eq!( - Compression::from(parquet::CompressionCodec::Zstd), + Compression::try_from(parquet::CompressionCodec::ZSTD).unwrap(), Compression::ZSTD ); } @@ -1758,21 +1814,21 @@ mod tests { #[test] fn test_into_compression() { assert_eq!( - parquet::CompressionCodec::Uncompressed, + parquet::CompressionCodec::UNCOMPRESSED, Compression::UNCOMPRESSED.into() ); assert_eq!( - parquet::CompressionCodec::Snappy, + parquet::CompressionCodec::SNAPPY, Compression::SNAPPY.into() ); - assert_eq!(parquet::CompressionCodec::Gzip, Compression::GZIP.into()); - assert_eq!(parquet::CompressionCodec::Lzo, Compression::LZO.into()); + assert_eq!(parquet::CompressionCodec::GZIP, Compression::GZIP.into()); + assert_eq!(parquet::CompressionCodec::LZO, Compression::LZO.into()); assert_eq!( - parquet::CompressionCodec::Brotli, + parquet::CompressionCodec::BROTLI, Compression::BROTLI.into() ); - assert_eq!(parquet::CompressionCodec::Lz4, Compression::LZ4.into()); - assert_eq!(parquet::CompressionCodec::Zstd, Compression::ZSTD.into()); + assert_eq!(parquet::CompressionCodec::LZ4, Compression::LZ4.into()); + assert_eq!(parquet::CompressionCodec::ZSTD, Compression::ZSTD.into()); } #[test] @@ -1786,32 +1842,35 @@ mod tests { #[test] fn test_from_page_type() { assert_eq!( - PageType::from(parquet::PageType::DataPage), + PageType::try_from(parquet::PageType::DATA_PAGE).unwrap(), PageType::DATA_PAGE ); assert_eq!( - PageType::from(parquet::PageType::IndexPage), + PageType::try_from(parquet::PageType::INDEX_PAGE).unwrap(), PageType::INDEX_PAGE ); assert_eq!( - PageType::from(parquet::PageType::DictionaryPage), + 
PageType::try_from(parquet::PageType::DICTIONARY_PAGE).unwrap(), PageType::DICTIONARY_PAGE ); assert_eq!( - PageType::from(parquet::PageType::DataPageV2), + PageType::try_from(parquet::PageType::DATA_PAGE_V2).unwrap(), PageType::DATA_PAGE_V2 ); } #[test] fn test_into_page_type() { - assert_eq!(parquet::PageType::DataPage, PageType::DATA_PAGE.into()); - assert_eq!(parquet::PageType::IndexPage, PageType::INDEX_PAGE.into()); + assert_eq!(parquet::PageType::DATA_PAGE, PageType::DATA_PAGE.into()); + assert_eq!(parquet::PageType::INDEX_PAGE, PageType::INDEX_PAGE.into()); assert_eq!( - parquet::PageType::DictionaryPage, + parquet::PageType::DICTIONARY_PAGE, PageType::DICTIONARY_PAGE.into() ); - assert_eq!(parquet::PageType::DataPageV2, PageType::DATA_PAGE_V2.into()); + assert_eq!( + parquet::PageType::DATA_PAGE_V2, + PageType::DATA_PAGE_V2.into() + ); } #[test] diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index ab2d885a23f7..ddb6d243ebd3 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -20,9 +20,9 @@ use crate::basic::{Encoding, PageType}; use crate::errors::{ParquetError, Result}; use crate::file::{metadata::ColumnChunkMetaData, statistics::Statistics}; +use crate::format::PageHeader; use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; use crate::util::memory::ByteBufferPtr; -use parquet_format::PageHeader; /// Parquet Page definition. /// @@ -209,15 +209,15 @@ impl TryFrom<&PageHeader> for PageMetadata { fn try_from(value: &PageHeader) -> std::result::Result { match value.type_ { - parquet_format::PageType::DataPage => Ok(PageMetadata { + crate::format::PageType::DATA_PAGE => Ok(PageMetadata { num_rows: value.data_page_header.as_ref().unwrap().num_values as usize, is_dict: false, }), - parquet_format::PageType::DictionaryPage => Ok(PageMetadata { + crate::format::PageType::DICTIONARY_PAGE => Ok(PageMetadata { num_rows: usize::MIN, is_dict: true, }), - parquet_format::PageType::DataPageV2 => Ok(PageMetadata { + crate::format::PageType::DATA_PAGE_V2 => Ok(PageMetadata { num_rows: value.data_page_header_v2.as_ref().unwrap().num_rows as usize, is_dict: false, }), diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 05e32f7e48ad..f9dd2d8d39be 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -16,7 +16,7 @@ // under the License. //! Contains column writer API. 
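// Illustrative sketch, not part of the patch: in the PageMetadata conversion
// above, arms like `crate::format::PageType::DATA_PAGE` are constant
// patterns. They are legal because the generated wrapper derives PartialEq
// and Eq, and since the wrapper admits any i32, a match over it needs a
// trailing wildcard arm to stay exhaustive. `Kind` is a hypothetical
// stand-in for one of the generated wrappers.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
struct Kind(i32);

impl Kind {
    const DATA: Kind = Kind(0);
    const DICTIONARY: Kind = Kind(2);
}

fn describe(kind: Kind) -> &'static str {
    match kind {
        Kind::DATA => "data page",
        Kind::DICTIONARY => "dictionary page",
        // The compiler cannot prove two constants cover every i32.
        _ => "unknown page type",
    }
}

fn main() {
    assert_eq!(describe(Kind::DATA), "data page");
    assert_eq!(describe(Kind(7)), "unknown page type");
}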
-use parquet_format::{ColumnIndex, OffsetIndex}; +use crate::format::{ColumnIndex, OffsetIndex}; use std::collections::{BTreeSet, VecDeque}; use crate::basic::{Compression, ConvertedType, Encoding, LogicalType, PageType, Type}; @@ -1089,8 +1089,8 @@ fn compare_greater_byte_array_decimals(a: &[u8], b: &[u8]) -> bool { #[cfg(test)] mod tests { + use crate::format::BoundaryOrder; use bytes::Bytes; - use parquet_format::BoundaryOrder; use rand::distributions::uniform::SampleUniform; use std::sync::Arc; @@ -2086,7 +2086,7 @@ mod tests { // column index assert_eq!(2, column_index.null_pages.len()); assert_eq!(2, offset_index.page_locations.len()); - assert_eq!(BoundaryOrder::Unordered, column_index.boundary_order); + assert_eq!(BoundaryOrder::UNORDERED, column_index.boundary_order); for idx in 0..2 { assert!(!column_index.null_pages[idx]); assert_eq!(0, column_index.null_counts.as_ref().unwrap()[idx]); diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 30afec55eb3a..e8a114db75b4 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -17,7 +17,7 @@ use std::{io::Read, sync::Arc}; -use parquet_format::{ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData}; +use crate::format::{ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData}; use thrift::protocol::TCompactInputProtocol; use crate::basic::ColumnOrder; @@ -149,8 +149,8 @@ mod tests { use crate::basic::SortOrder; use crate::basic::Type; + use crate::format::TypeDefinedOrder; use crate::schema::types::Type as SchemaType; - use parquet_format::TypeDefinedOrder; #[test] fn test_parse_metadata_size_smaller_than_footer() { diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 018dd95d9f35..0804890c22a0 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -35,7 +35,7 @@ use std::sync::Arc; -use parquet_format::{ +use crate::format::{ BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup, }; @@ -122,7 +122,7 @@ impl ParquetMetaData { } } -pub type KeyValue = parquet_format::KeyValue; +pub type KeyValue = crate::format::KeyValue; /// Reference counted pointer for [`FileMetaData`]. pub type FileMetaDataPtr = Arc; @@ -553,14 +553,14 @@ impl ColumnChunkMetaData { return Err(general_err!("Expected to have column metadata")); } let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap(); - let column_type = Type::from(col_metadata.type_); + let column_type = Type::try_from(col_metadata.type_)?; let column_path = ColumnPath::new(col_metadata.path_in_schema); let encodings = col_metadata .encodings .drain(0..) 
- .map(Encoding::from) - .collect(); - let compression = Compression::from(col_metadata.codec); + .map(Encoding::try_from) + .collect::>()?; + let compression = Compression::try_from(col_metadata.codec)?; let file_path = cc.file_path; let file_offset = cc.file_offset; let num_values = col_metadata.num_values; @@ -573,7 +573,12 @@ impl ColumnChunkMetaData { let encoding_stats = col_metadata .encoding_stats .as_ref() - .map(|vec| vec.iter().map(page_encoding_stats::from_thrift).collect()); + .map(|vec| { + vec.iter() + .map(page_encoding_stats::try_from_thrift) + .collect::>() + }) + .transpose()?; let bloom_filter_offset = col_metadata.bloom_filter_offset; let offset_index_offset = cc.offset_index_offset; let offset_index_length = cc.offset_index_length; @@ -846,7 +851,7 @@ impl ColumnIndexBuilder { null_pages: Vec::new(), min_values: Vec::new(), max_values: Vec::new(), - boundary_order: BoundaryOrder::Unordered, + boundary_order: BoundaryOrder::UNORDERED, null_counts: Vec::new(), valid: true, } diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs index e499a094ae00..eb26804784a9 100644 --- a/parquet/src/file/page_encoding_stats.rs +++ b/parquet/src/file/page_encoding_stats.rs @@ -16,7 +16,8 @@ // under the License. use crate::basic::{Encoding, PageType}; -use parquet_format::{ +use crate::errors::Result; +use crate::format::{ Encoding as TEncoding, PageEncodingStats as TPageEncodingStats, PageType as TPageType, }; @@ -32,16 +33,18 @@ pub struct PageEncodingStats { } /// Converts Thrift definition into `PageEncodingStats`. -pub fn from_thrift(thrift_encoding_stats: &TPageEncodingStats) -> PageEncodingStats { - let page_type = PageType::from(thrift_encoding_stats.page_type); - let encoding = Encoding::from(thrift_encoding_stats.encoding); +pub fn try_from_thrift( + thrift_encoding_stats: &TPageEncodingStats, +) -> Result { + let page_type = PageType::try_from(thrift_encoding_stats.page_type)?; + let encoding = Encoding::try_from(thrift_encoding_stats.encoding)?; let count = thrift_encoding_stats.count; - PageEncodingStats { + Ok(PageEncodingStats { page_type, encoding, count, - } + }) } /// Converts `PageEncodingStats` into Thrift definition. 
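// Illustrative sketch, not part of the patch: the encoding_stats handling in
// ColumnChunkMetaData::from_thrift above applies a fallible per-element
// conversion inside an Option<Vec<_>>. Collecting into Result<Vec<_>, _> and
// then calling Option::transpose turns Option<Result<Vec<_>, E>> into
// Result<Option<Vec<_>>, E>, so a single `?` propagates the first bad
// element. `parse_stat` is a hypothetical stand-in for
// page_encoding_stats::try_from_thrift.
use std::convert::TryFrom;

fn parse_stat(raw: &i32) -> Result<u32, String> {
    // Fails for negative inputs, standing in for an unknown thrift value.
    u32::try_from(*raw).map_err(|e| e.to_string())
}

fn parse_all(raw: Option<Vec<i32>>) -> Result<Option<Vec<u32>>, String> {
    raw.as_ref()
        .map(|v| v.iter().map(parse_stat).collect::<Result<Vec<_>, _>>())
        .transpose()
}

fn main() {
    assert_eq!(parse_all(None), Ok(None));
    assert_eq!(parse_all(Some(vec![1, 2])), Ok(Some(vec![1, 2])));
    assert!(parse_all(Some(vec![1, -1])).is_err());
}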
@@ -70,6 +73,6 @@ mod tests { count: 1, }; - assert_eq!(from_thrift(&to_thrift(&stats)), stats); + assert_eq!(try_from_thrift(&to_thrift(&stats)).unwrap(), stats); } } diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index f29b80accae2..062dc3966ead 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -19,8 +19,8 @@ use crate::basic::Type; use crate::data_type::private::ParquetValueType; use crate::data_type::Int96; use crate::errors::ParquetError; +use crate::format::{BoundaryOrder, ColumnIndex}; use crate::util::bit_util::from_le_slice; -use parquet_format::{BoundaryOrder, ColumnIndex}; use std::fmt::Debug; /// The statistics in one page diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index e6a4e5981022..99877a92105a 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -21,7 +21,7 @@ use crate::errors::ParquetError; use crate::file::metadata::ColumnChunkMetaData; use crate::file::page_index::index::{BooleanIndex, ByteArrayIndex, Index, NativeIndex}; use crate::file::reader::ChunkReader; -use parquet_format::{ColumnIndex, OffsetIndex, PageLocation}; +use crate::format::{ColumnIndex, OffsetIndex, PageLocation}; use std::io::{Cursor, Read}; use thrift::protocol::TCompactInputProtocol; diff --git a/parquet/src/file/page_index/range.rs b/parquet/src/file/page_index/range.rs index e9741ec8e7fd..816ea4025f20 100644 --- a/parquet/src/file/page_index/range.rs +++ b/parquet/src/file/page_index/range.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. use crate::errors::ParquetError; -use parquet_format::PageLocation; +use crate::format::PageLocation; use std::cmp::Ordering; use std::collections::VecDeque; use std::ops::RangeInclusive; @@ -284,7 +284,7 @@ mod tests { use crate::basic::Type::INT32; use crate::file::page_index::index::{NativeIndex, PageIndex}; use crate::file::page_index::range::{compute_row_ranges, Range, RowRanges}; - use parquet_format::{BoundaryOrder, PageLocation}; + use crate::format::{BoundaryOrder, PageLocation}; #[test] fn test_binary_search_overlap() { @@ -445,7 +445,7 @@ mod tests { null_count: Some(0), }, ], - boundary_order: BoundaryOrder::Ascending, + boundary_order: BoundaryOrder::ASCENDING, }; let locations = &[ PageLocation { diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index f3beb57c02e5..cd90b0d0b67a 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -22,8 +22,8 @@ use std::collections::VecDeque; use std::io::Cursor; use std::{convert::TryFrom, fs::File, io::Read, path::Path, sync::Arc}; +use crate::format::{PageHeader, PageLocation, PageType}; use bytes::{Buf, Bytes}; -use parquet_format::{PageHeader, PageLocation, PageType}; use thrift::protocol::TCompactInputProtocol; use crate::basic::{Encoding, Type}; @@ -436,37 +436,37 @@ pub(crate) fn decode_page( }; let result = match page_header.type_ { - PageType::DictionaryPage => { + PageType::DICTIONARY_PAGE => { assert!(page_header.dictionary_page_header.is_some()); let dict_header = page_header.dictionary_page_header.as_ref().unwrap(); let is_sorted = dict_header.is_sorted.unwrap_or(false); Page::DictionaryPage { buf: buffer, num_values: dict_header.num_values as u32, - encoding: Encoding::from(dict_header.encoding), + encoding: Encoding::try_from(dict_header.encoding)?, is_sorted, } } 
- PageType::DataPage => { + PageType::DATA_PAGE => { assert!(page_header.data_page_header.is_some()); let header = page_header.data_page_header.unwrap(); Page::DataPage { buf: buffer, num_values: header.num_values as u32, - encoding: Encoding::from(header.encoding), - def_level_encoding: Encoding::from(header.definition_level_encoding), - rep_level_encoding: Encoding::from(header.repetition_level_encoding), + encoding: Encoding::try_from(header.encoding)?, + def_level_encoding: Encoding::try_from(header.definition_level_encoding)?, + rep_level_encoding: Encoding::try_from(header.repetition_level_encoding)?, statistics: statistics::from_thrift(physical_type, header.statistics), } } - PageType::DataPageV2 => { + PageType::DATA_PAGE_V2 => { assert!(page_header.data_page_header_v2.is_some()); let header = page_header.data_page_header_v2.unwrap(); let is_compressed = header.is_compressed.unwrap_or(true); Page::DataPageV2 { buf: buffer, num_values: header.num_values as u32, - encoding: Encoding::from(header.encoding), + encoding: Encoding::try_from(header.encoding)?, num_nulls: header.num_nulls as u32, num_rows: header.num_rows as u32, def_levels_byte_len: header.definition_levels_byte_length as u32, @@ -600,7 +600,7 @@ impl PageReader for SerializedPageReader { *offset += data_len; *remaining -= data_len; - if header.type_ == PageType::IndexPage { + if header.type_ == PageType::INDEX_PAGE { continue; } @@ -754,7 +754,7 @@ impl PageReader for SerializedPageReader { mod tests { use std::sync::Arc; - use parquet_format::BoundaryOrder; + use crate::format::BoundaryOrder; use crate::basic::{self, ColumnOrder}; use crate::data_type::private::ParquetValueType; @@ -1281,7 +1281,7 @@ mod tests { unreachable!() }; - assert_eq!(index.boundary_order, BoundaryOrder::Ascending); + assert_eq!(index.boundary_order, BoundaryOrder::ASCENDING); let index_in_pages = &index.indexes; //only one page group @@ -1330,7 +1330,7 @@ mod tests { index, 325, get_row_group_min_max_bytes(row_group_metadata, 0), - BoundaryOrder::Unordered, + BoundaryOrder::UNORDERED, ); assert_eq!(row_group_offset_indexes[0].len(), 325); } else { @@ -1349,7 +1349,7 @@ mod tests { index, 325, get_row_group_min_max_bytes(row_group_metadata, 2), - BoundaryOrder::Ascending, + BoundaryOrder::ASCENDING, ); assert_eq!(row_group_offset_indexes[2].len(), 325); } else { @@ -1361,7 +1361,7 @@ mod tests { index, 325, get_row_group_min_max_bytes(row_group_metadata, 3), - BoundaryOrder::Ascending, + BoundaryOrder::ASCENDING, ); assert_eq!(row_group_offset_indexes[3].len(), 325); } else { @@ -1373,7 +1373,7 @@ mod tests { index, 325, get_row_group_min_max_bytes(row_group_metadata, 4), - BoundaryOrder::Ascending, + BoundaryOrder::ASCENDING, ); assert_eq!(row_group_offset_indexes[4].len(), 325); } else { @@ -1385,7 +1385,7 @@ mod tests { index, 528, get_row_group_min_max_bytes(row_group_metadata, 5), - BoundaryOrder::Unordered, + BoundaryOrder::UNORDERED, ); assert_eq!(row_group_offset_indexes[5].len(), 528); } else { @@ -1397,7 +1397,7 @@ mod tests { index, 325, get_row_group_min_max_bytes(row_group_metadata, 6), - BoundaryOrder::Ascending, + BoundaryOrder::ASCENDING, ); assert_eq!(row_group_offset_indexes[6].len(), 325); } else { @@ -1409,7 +1409,7 @@ mod tests { index, 528, get_row_group_min_max_bytes(row_group_metadata, 7), - BoundaryOrder::Unordered, + BoundaryOrder::UNORDERED, ); assert_eq!(row_group_offset_indexes[7].len(), 528); } else { @@ -1421,7 +1421,7 @@ mod tests { index, 974, get_row_group_min_max_bytes(row_group_metadata, 8), - 
BoundaryOrder::Unordered, + BoundaryOrder::UNORDERED, ); assert_eq!(row_group_offset_indexes[8].len(), 974); } else { @@ -1433,7 +1433,7 @@ mod tests { index, 352, get_row_group_min_max_bytes(row_group_metadata, 9), - BoundaryOrder::Ascending, + BoundaryOrder::ASCENDING, ); assert_eq!(row_group_offset_indexes[9].len(), 352); } else { @@ -1452,7 +1452,7 @@ mod tests { index, 325, get_row_group_min_max_bytes(row_group_metadata, 11), - BoundaryOrder::Ascending, + BoundaryOrder::ASCENDING, ); assert_eq!(row_group_offset_indexes[11].len(), 325); } else { @@ -1464,7 +1464,7 @@ mod tests { index, 325, get_row_group_min_max_bytes(row_group_metadata, 12), - BoundaryOrder::Unordered, + BoundaryOrder::UNORDERED, ); assert_eq!(row_group_offset_indexes[12].len(), 325); } else { diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index da2ec2e9a149..35b5179d36bb 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -39,7 +39,7 @@ use std::fmt; -use parquet_format::Statistics as TStatistics; +use crate::format::Statistics as TStatistics; use crate::basic::Type; use crate::data_type::private::ParquetValueType; diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 7af4b0fa2c94..8cb6df974e4a 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -20,8 +20,8 @@ use std::{io::Write, sync::Arc}; -use parquet_format as parquet; -use parquet_format::{ColumnIndex, OffsetIndex, RowGroup}; +use crate::format as parquet; +use crate::format::{ColumnIndex, OffsetIndex, RowGroup}; use thrift::protocol::{TCompactOutputProtocol, TOutputProtocol}; use crate::basic::PageType; @@ -1110,7 +1110,7 @@ mod tests { fn test_file_roundtrip( file: File, data: Vec>, - ) -> parquet_format::FileMetaData { + ) -> crate::format::FileMetaData { let schema = Arc::new( types::Type::group_type_builder("schema") .with_fields(&mut vec![Arc::new( diff --git a/parquet/src/format.rs b/parquet/src/format.rs new file mode 100644 index 000000000000..00a89a4c7e85 --- /dev/null +++ b/parquet/src/format.rs @@ -0,0 +1,5181 @@ +// Autogenerated by Thrift Compiler (0.16.0) +// DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING + +#![allow(unused_imports)] +#![allow(unused_extern_crates)] +#![allow(clippy::too_many_arguments, clippy::type_complexity, clippy::vec_box)] +#![cfg_attr(rustfmt, rustfmt_skip)] + +use std::cell::RefCell; +use std::collections::{BTreeMap, BTreeSet}; +use std::convert::{From, TryFrom}; +use std::default::Default; +use std::error::Error; +use std::fmt; +use std::fmt::{Display, Formatter}; +use std::rc::Rc; + +use thrift::OrderedFloat; +use thrift::{ApplicationError, ApplicationErrorKind, ProtocolError, ProtocolErrorKind, TThriftClient}; +use thrift::protocol::{TFieldIdentifier, TListIdentifier, TMapIdentifier, TMessageIdentifier, TMessageType, TInputProtocol, TOutputProtocol, TSetIdentifier, TStructIdentifier, TType}; +use thrift::protocol::field_id; +use thrift::protocol::verify_expected_message_type; +use thrift::protocol::verify_expected_sequence_number; +use thrift::protocol::verify_expected_service_call; +use thrift::protocol::verify_required_field_exists; + +/// Types supported by Parquet. These types are intended to be used in combination +/// with the encodings to control the on disk storage format. +/// For example INT16 is not included as a type since a good encoding of INT32 +/// would handle this. 
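// Illustrative sketch, not part of the generated file: the thrift 0.16
// generator used here models each enum as an open newtype over i32, like the
// `Type` wrapper that follows, so values written by a newer parquet
// implementation are carried through instead of failing to parse. The price
// is that converting into the crate's closed Rust enums becomes fallible,
// which is why basic.rs switched from `From` to `TryFrom` above.
// `OpenKind` and `ClosedKind` are hypothetical stand-ins.
use std::convert::TryFrom;

#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)]
pub struct OpenKind(pub i32);

impl OpenKind {
    pub const FIRST: OpenKind = OpenKind(0);
    pub const SECOND: OpenKind = OpenKind(1);
}

impl From<i32> for OpenKind {
    // Total conversion: unknown discriminants round-trip unchanged.
    fn from(i: i32) -> Self {
        OpenKind(i)
    }
}

#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ClosedKind {
    First,
    Second,
}

impl TryFrom<OpenKind> for ClosedKind {
    type Error = String;

    // Fallible conversion: anything outside the known constants is reported
    // as an error, mirroring the general_err! returns in basic.rs above.
    fn try_from(value: OpenKind) -> Result<Self, Self::Error> {
        match value {
            OpenKind::FIRST => Ok(ClosedKind::First),
            OpenKind::SECOND => Ok(ClosedKind::Second),
            other => Err(format!("unexpected value: {}", other.0)),
        }
    }
}

fn main() {
    // A discriminant from a "future" writer survives the open newtype...
    let unknown = OpenKind::from(42);
    assert_eq!(unknown.0, 42);
    // ...and only fails at the point where a closed enum is required.
    assert!(ClosedKind::try_from(unknown).is_err());
}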
+#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct Type(pub i32); + +impl Type { + pub const BOOLEAN: Type = Type(0); + pub const INT32: Type = Type(1); + pub const INT64: Type = Type(2); + pub const INT96: Type = Type(3); + pub const FLOAT: Type = Type(4); + pub const DOUBLE: Type = Type(5); + pub const BYTE_ARRAY: Type = Type(6); + pub const FIXED_LEN_BYTE_ARRAY: Type = Type(7); + pub const ENUM_VALUES: &'static [Self] = &[ + Self::BOOLEAN, + Self::INT32, + Self::INT64, + Self::INT96, + Self::FLOAT, + Self::DOUBLE, + Self::BYTE_ARRAY, + Self::FIXED_LEN_BYTE_ARRAY, + ]; + #[allow(clippy::trivially_copy_pass_by_ref)] + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + o_prot.write_i32(self.0) + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let enum_value = i_prot.read_i32()?; + Ok(Type::from(enum_value)) + } +} + +impl From for Type { + fn from(i: i32) -> Self { + match i { + 0 => Type::BOOLEAN, + 1 => Type::INT32, + 2 => Type::INT64, + 3 => Type::INT96, + 4 => Type::FLOAT, + 5 => Type::DOUBLE, + 6 => Type::BYTE_ARRAY, + 7 => Type::FIXED_LEN_BYTE_ARRAY, + _ => Type(i) + } + } +} + +impl From<&i32> for Type { + fn from(i: &i32) -> Self { + Type::from(*i) + } +} + +impl From for i32 { + fn from(e: Type) -> i32 { + e.0 + } +} + +impl From<&Type> for i32 { + fn from(e: &Type) -> i32 { + e.0 + } +} + +/// DEPRECATED: Common types used by frameworks(e.g. hive, pig) using parquet. +/// ConvertedType is superseded by LogicalType. This enum should not be extended. +/// +/// See LogicalTypes.md for conversion between ConvertedType and LogicalType. +#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct ConvertedType(pub i32); + +impl ConvertedType { + /// a BYTE_ARRAY actually contains UTF8 encoded chars + pub const UTF8: ConvertedType = ConvertedType(0); + /// a map is converted as an optional field containing a repeated key/value pair + pub const MAP: ConvertedType = ConvertedType(1); + /// a key/value pair is converted into a group of two fields + pub const MAP_KEY_VALUE: ConvertedType = ConvertedType(2); + /// a list is converted into an optional field containing a repeated field for its + /// values + pub const LIST: ConvertedType = ConvertedType(3); + /// an enum is converted into a binary field + pub const ENUM: ConvertedType = ConvertedType(4); + /// A decimal value. + /// + /// This may be used to annotate binary or fixed primitive types. The + /// underlying byte array stores the unscaled value encoded as two's + /// complement using big-endian byte order (the most significant byte is the + /// zeroth element). The value of the decimal is the value * 10^{-scale}. + /// + /// This must be accompanied by a (maximum) precision and a scale in the + /// SchemaElement. The precision specifies the number of digits in the decimal + /// and the scale stores the location of the decimal point. For example 1.23 + /// would have precision 3 (3 total digits) and scale 2 (the decimal point is + /// 2 digits over). + pub const DECIMAL: ConvertedType = ConvertedType(5); + /// A Date + /// + /// Stored as days since Unix epoch, encoded as the INT32 physical type. + /// + pub const DATE: ConvertedType = ConvertedType(6); + /// A time + /// + /// The total number of milliseconds since midnight. The value is stored + /// as an INT32 physical type. + pub const TIME_MILLIS: ConvertedType = ConvertedType(7); + /// A time. 
+ /// + /// The total number of microseconds since midnight. The value is stored as + /// an INT64 physical type. + pub const TIME_MICROS: ConvertedType = ConvertedType(8); + /// A date/time combination + /// + /// Date and time recorded as milliseconds since the Unix epoch. Recorded as + /// a physical type of INT64. + pub const TIMESTAMP_MILLIS: ConvertedType = ConvertedType(9); + /// A date/time combination + /// + /// Date and time recorded as microseconds since the Unix epoch. The value is + /// stored as an INT64 physical type. + pub const TIMESTAMP_MICROS: ConvertedType = ConvertedType(10); + /// An unsigned integer value. + /// + /// The number describes the maximum number of meaningful data bits in + /// the stored value. 8, 16 and 32 bit values are stored using the + /// INT32 physical type. 64 bit values are stored using the INT64 + /// physical type. + /// + pub const UINT_8: ConvertedType = ConvertedType(11); + pub const UINT_16: ConvertedType = ConvertedType(12); + pub const UINT_32: ConvertedType = ConvertedType(13); + pub const UINT_64: ConvertedType = ConvertedType(14); + /// A signed integer value. + /// + /// The number describes the maximum number of meaningful data bits in + /// the stored value. 8, 16 and 32 bit values are stored using the + /// INT32 physical type. 64 bit values are stored using the INT64 + /// physical type. + /// + pub const INT_8: ConvertedType = ConvertedType(15); + pub const INT_16: ConvertedType = ConvertedType(16); + pub const INT_32: ConvertedType = ConvertedType(17); + pub const INT_64: ConvertedType = ConvertedType(18); + /// An embedded JSON document + /// + /// A JSON document embedded within a single UTF8 column. + pub const JSON: ConvertedType = ConvertedType(19); + /// An embedded BSON document + /// + /// A BSON document embedded within a single BINARY column. + pub const BSON: ConvertedType = ConvertedType(20); + /// An interval of time + /// + /// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12 + /// This data is composed of three separate little endian unsigned + /// integers. Each stores a component of a duration of time. The first + /// integer identifies the number of months associated with the duration, + /// the second identifies the number of days associated with the duration + /// and the third identifies the number of milliseconds associated with + /// the provided duration. This duration of time is independent of any + /// particular timezone or date. 
+ pub const INTERVAL: ConvertedType = ConvertedType(21); + pub const ENUM_VALUES: &'static [Self] = &[ + Self::UTF8, + Self::MAP, + Self::MAP_KEY_VALUE, + Self::LIST, + Self::ENUM, + Self::DECIMAL, + Self::DATE, + Self::TIME_MILLIS, + Self::TIME_MICROS, + Self::TIMESTAMP_MILLIS, + Self::TIMESTAMP_MICROS, + Self::UINT_8, + Self::UINT_16, + Self::UINT_32, + Self::UINT_64, + Self::INT_8, + Self::INT_16, + Self::INT_32, + Self::INT_64, + Self::JSON, + Self::BSON, + Self::INTERVAL, + ]; + #[allow(clippy::trivially_copy_pass_by_ref)] + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + o_prot.write_i32(self.0) + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let enum_value = i_prot.read_i32()?; + Ok(ConvertedType::from(enum_value)) + } +} + +impl From for ConvertedType { + fn from(i: i32) -> Self { + match i { + 0 => ConvertedType::UTF8, + 1 => ConvertedType::MAP, + 2 => ConvertedType::MAP_KEY_VALUE, + 3 => ConvertedType::LIST, + 4 => ConvertedType::ENUM, + 5 => ConvertedType::DECIMAL, + 6 => ConvertedType::DATE, + 7 => ConvertedType::TIME_MILLIS, + 8 => ConvertedType::TIME_MICROS, + 9 => ConvertedType::TIMESTAMP_MILLIS, + 10 => ConvertedType::TIMESTAMP_MICROS, + 11 => ConvertedType::UINT_8, + 12 => ConvertedType::UINT_16, + 13 => ConvertedType::UINT_32, + 14 => ConvertedType::UINT_64, + 15 => ConvertedType::INT_8, + 16 => ConvertedType::INT_16, + 17 => ConvertedType::INT_32, + 18 => ConvertedType::INT_64, + 19 => ConvertedType::JSON, + 20 => ConvertedType::BSON, + 21 => ConvertedType::INTERVAL, + _ => ConvertedType(i) + } + } +} + +impl From<&i32> for ConvertedType { + fn from(i: &i32) -> Self { + ConvertedType::from(*i) + } +} + +impl From for i32 { + fn from(e: ConvertedType) -> i32 { + e.0 + } +} + +impl From<&ConvertedType> for i32 { + fn from(e: &ConvertedType) -> i32 { + e.0 + } +} + +/// Representation of Schemas +#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct FieldRepetitionType(pub i32); + +impl FieldRepetitionType { + /// This field is required (can not be null) and each record has exactly 1 value. + pub const REQUIRED: FieldRepetitionType = FieldRepetitionType(0); + /// The field is optional (can be null) and each record has 0 or 1 values. + pub const OPTIONAL: FieldRepetitionType = FieldRepetitionType(1); + /// The field is repeated and can contain 0 or more values + pub const REPEATED: FieldRepetitionType = FieldRepetitionType(2); + pub const ENUM_VALUES: &'static [Self] = &[ + Self::REQUIRED, + Self::OPTIONAL, + Self::REPEATED, + ]; + #[allow(clippy::trivially_copy_pass_by_ref)] + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + o_prot.write_i32(self.0) + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let enum_value = i_prot.read_i32()?; + Ok(FieldRepetitionType::from(enum_value)) + } +} + +impl From for FieldRepetitionType { + fn from(i: i32) -> Self { + match i { + 0 => FieldRepetitionType::REQUIRED, + 1 => FieldRepetitionType::OPTIONAL, + 2 => FieldRepetitionType::REPEATED, + _ => FieldRepetitionType(i) + } + } +} + +impl From<&i32> for FieldRepetitionType { + fn from(i: &i32) -> Self { + FieldRepetitionType::from(*i) + } +} + +impl From for i32 { + fn from(e: FieldRepetitionType) -> i32 { + e.0 + } +} + +impl From<&FieldRepetitionType> for i32 { + fn from(e: &FieldRepetitionType) -> i32 { + e.0 + } +} + +/// Encodings supported by Parquet. 
Not all encodings are valid for all types. These +/// enums are also used to specify the encoding of definition and repetition levels. +/// See the accompanying doc for the details of the more complicated encodings. +#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct Encoding(pub i32); + +impl Encoding { + /// Default encoding. + /// BOOLEAN - 1 bit per value. 0 is false; 1 is true. + /// INT32 - 4 bytes per value. Stored as little-endian. + /// INT64 - 8 bytes per value. Stored as little-endian. + /// FLOAT - 4 bytes per value. IEEE. Stored as little-endian. + /// DOUBLE - 8 bytes per value. IEEE. Stored as little-endian. + /// BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. + /// FIXED_LEN_BYTE_ARRAY - Just the bytes. + pub const PLAIN: Encoding = Encoding(0); + /// Deprecated: Dictionary encoding. The values in the dictionary are encoded in the + /// plain type. + /// in a data page use RLE_DICTIONARY instead. + /// in a Dictionary page use PLAIN instead + pub const PLAIN_DICTIONARY: Encoding = Encoding(2); + /// Group packed run length encoding. Usable for definition/repetition levels + /// encoding and Booleans (on one bit: 0 is false; 1 is true.) + pub const RLE: Encoding = Encoding(3); + /// Bit packed encoding. This can only be used if the data has a known max + /// width. Usable for definition/repetition levels encoding. + pub const BIT_PACKED: Encoding = Encoding(4); + /// Delta encoding for integers. This can be used for int columns and works best + /// on sorted data + pub const DELTA_BINARY_PACKED: Encoding = Encoding(5); + /// Encoding for byte arrays to separate the length values and the data. The lengths + /// are encoded using DELTA_BINARY_PACKED + pub const DELTA_LENGTH_BYTE_ARRAY: Encoding = Encoding(6); + /// Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED. + /// Suffixes are stored as delta length byte arrays. + pub const DELTA_BYTE_ARRAY: Encoding = Encoding(7); + /// Dictionary encoding: the ids are encoded using the RLE encoding + pub const RLE_DICTIONARY: Encoding = Encoding(8); + /// Encoding for floating-point data. + /// K byte-streams are created where K is the size in bytes of the data type. + /// The individual bytes of an FP value are scattered to the corresponding stream and + /// the streams are concatenated. + /// This itself does not reduce the size of the data but can lead to better compression + /// afterwards. 
+ pub const BYTE_STREAM_SPLIT: Encoding = Encoding(9); + pub const ENUM_VALUES: &'static [Self] = &[ + Self::PLAIN, + Self::PLAIN_DICTIONARY, + Self::RLE, + Self::BIT_PACKED, + Self::DELTA_BINARY_PACKED, + Self::DELTA_LENGTH_BYTE_ARRAY, + Self::DELTA_BYTE_ARRAY, + Self::RLE_DICTIONARY, + Self::BYTE_STREAM_SPLIT, + ]; + #[allow(clippy::trivially_copy_pass_by_ref)] + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + o_prot.write_i32(self.0) + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let enum_value = i_prot.read_i32()?; + Ok(Encoding::from(enum_value)) + } +} + +impl From for Encoding { + fn from(i: i32) -> Self { + match i { + 0 => Encoding::PLAIN, + 2 => Encoding::PLAIN_DICTIONARY, + 3 => Encoding::RLE, + 4 => Encoding::BIT_PACKED, + 5 => Encoding::DELTA_BINARY_PACKED, + 6 => Encoding::DELTA_LENGTH_BYTE_ARRAY, + 7 => Encoding::DELTA_BYTE_ARRAY, + 8 => Encoding::RLE_DICTIONARY, + 9 => Encoding::BYTE_STREAM_SPLIT, + _ => Encoding(i) + } + } +} + +impl From<&i32> for Encoding { + fn from(i: &i32) -> Self { + Encoding::from(*i) + } +} + +impl From for i32 { + fn from(e: Encoding) -> i32 { + e.0 + } +} + +impl From<&Encoding> for i32 { + fn from(e: &Encoding) -> i32 { + e.0 + } +} + +/// Supported compression algorithms. +/// +/// Codecs added in format version X.Y can be read by readers based on X.Y and later. +/// Codec support may vary between readers based on the format version and +/// libraries available at runtime. +/// +/// See Compression.md for a detailed specification of these algorithms. +#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct CompressionCodec(pub i32); + +impl CompressionCodec { + pub const UNCOMPRESSED: CompressionCodec = CompressionCodec(0); + pub const SNAPPY: CompressionCodec = CompressionCodec(1); + pub const GZIP: CompressionCodec = CompressionCodec(2); + pub const LZO: CompressionCodec = CompressionCodec(3); + pub const BROTLI: CompressionCodec = CompressionCodec(4); + pub const LZ4: CompressionCodec = CompressionCodec(5); + pub const ZSTD: CompressionCodec = CompressionCodec(6); + pub const LZ4_RAW: CompressionCodec = CompressionCodec(7); + pub const ENUM_VALUES: &'static [Self] = &[ + Self::UNCOMPRESSED, + Self::SNAPPY, + Self::GZIP, + Self::LZO, + Self::BROTLI, + Self::LZ4, + Self::ZSTD, + Self::LZ4_RAW, + ]; + #[allow(clippy::trivially_copy_pass_by_ref)] + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + o_prot.write_i32(self.0) + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let enum_value = i_prot.read_i32()?; + Ok(CompressionCodec::from(enum_value)) + } +} + +impl From for CompressionCodec { + fn from(i: i32) -> Self { + match i { + 0 => CompressionCodec::UNCOMPRESSED, + 1 => CompressionCodec::SNAPPY, + 2 => CompressionCodec::GZIP, + 3 => CompressionCodec::LZO, + 4 => CompressionCodec::BROTLI, + 5 => CompressionCodec::LZ4, + 6 => CompressionCodec::ZSTD, + 7 => CompressionCodec::LZ4_RAW, + _ => CompressionCodec(i) + } + } +} + +impl From<&i32> for CompressionCodec { + fn from(i: &i32) -> Self { + CompressionCodec::from(*i) + } +} + +impl From for i32 { + fn from(e: CompressionCodec) -> i32 { + e.0 + } +} + +impl From<&CompressionCodec> for i32 { + fn from(e: &CompressionCodec) -> i32 { + e.0 + } +} + +#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct PageType(pub i32); + +impl PageType { + pub const DATA_PAGE: PageType = 
PageType(0); + pub const INDEX_PAGE: PageType = PageType(1); + pub const DICTIONARY_PAGE: PageType = PageType(2); + pub const DATA_PAGE_V2: PageType = PageType(3); + pub const ENUM_VALUES: &'static [Self] = &[ + Self::DATA_PAGE, + Self::INDEX_PAGE, + Self::DICTIONARY_PAGE, + Self::DATA_PAGE_V2, + ]; + #[allow(clippy::trivially_copy_pass_by_ref)] + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + o_prot.write_i32(self.0) + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let enum_value = i_prot.read_i32()?; + Ok(PageType::from(enum_value)) + } +} + +impl From for PageType { + fn from(i: i32) -> Self { + match i { + 0 => PageType::DATA_PAGE, + 1 => PageType::INDEX_PAGE, + 2 => PageType::DICTIONARY_PAGE, + 3 => PageType::DATA_PAGE_V2, + _ => PageType(i) + } + } +} + +impl From<&i32> for PageType { + fn from(i: &i32) -> Self { + PageType::from(*i) + } +} + +impl From for i32 { + fn from(e: PageType) -> i32 { + e.0 + } +} + +impl From<&PageType> for i32 { + fn from(e: &PageType) -> i32 { + e.0 + } +} + +/// Enum to annotate whether lists of min/max elements inside ColumnIndex +/// are ordered and if so, in which direction. +#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct BoundaryOrder(pub i32); + +impl BoundaryOrder { + pub const UNORDERED: BoundaryOrder = BoundaryOrder(0); + pub const ASCENDING: BoundaryOrder = BoundaryOrder(1); + pub const DESCENDING: BoundaryOrder = BoundaryOrder(2); + pub const ENUM_VALUES: &'static [Self] = &[ + Self::UNORDERED, + Self::ASCENDING, + Self::DESCENDING, + ]; + #[allow(clippy::trivially_copy_pass_by_ref)] + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + o_prot.write_i32(self.0) + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let enum_value = i_prot.read_i32()?; + Ok(BoundaryOrder::from(enum_value)) + } +} + +impl From for BoundaryOrder { + fn from(i: i32) -> Self { + match i { + 0 => BoundaryOrder::UNORDERED, + 1 => BoundaryOrder::ASCENDING, + 2 => BoundaryOrder::DESCENDING, + _ => BoundaryOrder(i) + } + } +} + +impl From<&i32> for BoundaryOrder { + fn from(i: &i32) -> Self { + BoundaryOrder::from(*i) + } +} + +impl From for i32 { + fn from(e: BoundaryOrder) -> i32 { + e.0 + } +} + +impl From<&BoundaryOrder> for i32 { + fn from(e: &BoundaryOrder) -> i32 { + e.0 + } +} + +// +// Statistics +// + +/// Statistics per row group and per page +/// All fields are optional. +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct Statistics { + /// DEPRECATED: min and max value of the column. Use min_value and max_value. + /// + /// Values are encoded using PLAIN encoding, except that variable-length byte + /// arrays do not include a length prefix. + /// + /// These fields encode min and max values determined by signed comparison + /// only. New files should use the correct order for a column's logical type + /// and store the values in the min_value and max_value fields. + /// + /// To support older readers, these may be set when the column order is + /// signed. + pub max: Option>, + pub min: Option>, + /// count of null value in the column + pub null_count: Option, + /// count of distinct values occurring + pub distinct_count: Option, + /// Min and max values for the column, determined by its ColumnOrder. + /// + /// Values are encoded using PLAIN encoding, except that variable-length byte + /// arrays do not include a length prefix. 
+ pub max_value: Option>, + pub min_value: Option>, +} + +impl Statistics { + pub fn new(max: F1, min: F2, null_count: F3, distinct_count: F4, max_value: F5, min_value: F6) -> Statistics where F1: Into>>, F2: Into>>, F3: Into>, F4: Into>, F5: Into>>, F6: Into>> { + Statistics { + max: max.into(), + min: min.into(), + null_count: null_count.into(), + distinct_count: distinct_count.into(), + max_value: max_value.into(), + min_value: min_value.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option> = None; + let mut f_2: Option> = None; + let mut f_3: Option = None; + let mut f_4: Option = None; + let mut f_5: Option> = None; + let mut f_6: Option> = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_bytes()?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_bytes()?; + f_2 = Some(val); + }, + 3 => { + let val = i_prot.read_i64()?; + f_3 = Some(val); + }, + 4 => { + let val = i_prot.read_i64()?; + f_4 = Some(val); + }, + 5 => { + let val = i_prot.read_bytes()?; + f_5 = Some(val); + }, + 6 => { + let val = i_prot.read_bytes()?; + f_6 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = Statistics { + max: f_1, + min: f_2, + null_count: f_3, + distinct_count: f_4, + max_value: f_5, + min_value: f_6, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("Statistics"); + o_prot.write_struct_begin(&struct_ident)?; + if let Some(ref fld_var) = self.max { + o_prot.write_field_begin(&TFieldIdentifier::new("max", TType::String, 1))?; + o_prot.write_bytes(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.min { + o_prot.write_field_begin(&TFieldIdentifier::new("min", TType::String, 2))?; + o_prot.write_bytes(fld_var)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.null_count { + o_prot.write_field_begin(&TFieldIdentifier::new("null_count", TType::I64, 3))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.distinct_count { + o_prot.write_field_begin(&TFieldIdentifier::new("distinct_count", TType::I64, 4))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.max_value { + o_prot.write_field_begin(&TFieldIdentifier::new("max_value", TType::String, 5))?; + o_prot.write_bytes(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.min_value { + o_prot.write_field_begin(&TFieldIdentifier::new("min_value", TType::String, 6))?; + o_prot.write_bytes(fld_var)?; + o_prot.write_field_end()? 
+ } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for Statistics { + fn default() -> Self { + Statistics{ + max: Some(Vec::new()), + min: Some(Vec::new()), + null_count: Some(0), + distinct_count: Some(0), + max_value: Some(Vec::new()), + min_value: Some(Vec::new()), + } + } +} + +// +// StringType +// + +/// Empty structs to use as logical type annotations +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct StringType { +} + +impl StringType { + pub fn new() -> StringType { + StringType {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = StringType {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("StringType"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for StringType { + fn default() -> Self { + StringType{} + } +} + +// +// UUIDType +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct UUIDType { +} + +impl UUIDType { + pub fn new() -> UUIDType { + UUIDType {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = UUIDType {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("UUIDType"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for UUIDType { + fn default() -> Self { + UUIDType{} + } +} + +// +// MapType +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct MapType { +} + +impl MapType { + pub fn new() -> MapType { + MapType {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = MapType {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("MapType"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for MapType { + fn default() -> Self { + MapType{} + } +} + +// +// ListType +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct ListType { +} + +impl ListType { + pub fn new() -> ListType { + ListType {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + 
i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = ListType {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("ListType"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for ListType { + fn default() -> Self { + ListType{} + } +} + +// +// EnumType +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct EnumType { +} + +impl EnumType { + pub fn new() -> EnumType { + EnumType {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = EnumType {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("EnumType"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for EnumType { + fn default() -> Self { + EnumType{} + } +} + +// +// DateType +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct DateType { +} + +impl DateType { + pub fn new() -> DateType { + DateType {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = DateType {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("DateType"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for DateType { + fn default() -> Self { + DateType{} + } +} + +// +// NullType +// + +/// Logical type to annotate a column that is always null. +/// +/// Sometimes when discovering the schema of existing data, values are always +/// null and the physical type can't be determined. This annotation signals +/// the case where the physical type was guessed from all null values. 
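// Usage sketch for the `Statistics` struct defined above (helper name is
// illustrative): new writers leave the deprecated, signed-comparison `max`/`min`
// fields unset and populate `max_value`/`min_value`, which follow the column's
// ColumnOrder and are PLAIN-encoded without a length prefix.
fn example_page_statistics() -> Statistics {
    Statistics {
        max: None,
        min: None,
        null_count: Some(0),
        distinct_count: None,
        max_value: Some(b"zebra".to_vec()),
        min_value: Some(b"aardvark".to_vec()),
    }
}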
+#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct NullType { +} + +impl NullType { + pub fn new() -> NullType { + NullType {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = NullType {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("NullType"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for NullType { + fn default() -> Self { + NullType{} + } +} + +// +// DecimalType +// + +/// Decimal logical type annotation +/// +/// To maintain forward-compatibility in v1, implementations using this logical +/// type must also set scale and precision on the annotated SchemaElement. +/// +/// Allowed for physical types: INT32, INT64, FIXED, and BINARY +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct DecimalType { + pub scale: i32, + pub precision: i32, +} + +impl DecimalType { + pub fn new(scale: i32, precision: i32) -> DecimalType { + DecimalType { + scale, + precision, + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i32()?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_i32()?; + f_2 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("DecimalType.scale", &f_1)?; + verify_required_field_exists("DecimalType.precision", &f_2)?; + let ret = DecimalType { + scale: f_1.expect("auto-generated code should have checked for presence of required fields"), + precision: f_2.expect("auto-generated code should have checked for presence of required fields"), + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("DecimalType"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("scale", TType::I32, 1))?; + o_prot.write_i32(self.scale)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("precision", TType::I32, 2))?; + o_prot.write_i32(self.precision)?; + o_prot.write_field_end()?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// MilliSeconds +// + +/// Time units for logical types +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct MilliSeconds { +} + +impl MilliSeconds { + pub fn new() -> MilliSeconds { + MilliSeconds {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + 
i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = MilliSeconds {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("MilliSeconds"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for MilliSeconds { + fn default() -> Self { + MilliSeconds{} + } +} + +// +// MicroSeconds +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct MicroSeconds { +} + +impl MicroSeconds { + pub fn new() -> MicroSeconds { + MicroSeconds {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = MicroSeconds {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("MicroSeconds"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for MicroSeconds { + fn default() -> Self { + MicroSeconds{} + } +} + +// +// NanoSeconds +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct NanoSeconds { +} + +impl NanoSeconds { + pub fn new() -> NanoSeconds { + NanoSeconds {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = NanoSeconds {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("NanoSeconds"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for NanoSeconds { + fn default() -> Self { + NanoSeconds{} + } +} + +// +// TimeUnit +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub enum TimeUnit { + MILLIS(MilliSeconds), + MICROS(MicroSeconds), + NANOS(NanoSeconds), +} + +impl TimeUnit { + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let mut ret: Option = None; + let mut received_field_count = 0; + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = MilliSeconds::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(TimeUnit::MILLIS(val)); + } + received_field_count += 1; + }, + 2 => { + let val = MicroSeconds::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(TimeUnit::MICROS(val)); + } + received_field_count += 1; + }, + 3 => { + let val = NanoSeconds::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(TimeUnit::NANOS(val)); + } + received_field_count += 1; + }, + _ 
=> { + i_prot.skip(field_ident.field_type)?; + received_field_count += 1; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + if received_field_count == 0 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received empty union from remote TimeUnit" + ) + ) + ) + } else if received_field_count > 1 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received multiple fields for union from remote TimeUnit" + ) + ) + ) + } else { + Ok(ret.expect("return value should have been constructed")) + } + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("TimeUnit"); + o_prot.write_struct_begin(&struct_ident)?; + match *self { + TimeUnit::MILLIS(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("MILLIS", TType::Struct, 1))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + TimeUnit::MICROS(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("MICROS", TType::Struct, 2))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + TimeUnit::NANOS(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("NANOS", TType::Struct, 3))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// TimestampType +// + +/// Timestamp logical type annotation +/// +/// Allowed for physical types: INT64 +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct TimestampType { + pub is_adjusted_to_u_t_c: bool, + pub unit: TimeUnit, +} + +impl TimestampType { + pub fn new(is_adjusted_to_u_t_c: bool, unit: TimeUnit) -> TimestampType { + TimestampType { + is_adjusted_to_u_t_c, + unit, + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_bool()?; + f_1 = Some(val); + }, + 2 => { + let val = TimeUnit::read_from_in_protocol(i_prot)?; + f_2 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("TimestampType.is_adjusted_to_u_t_c", &f_1)?; + verify_required_field_exists("TimestampType.unit", &f_2)?; + let ret = TimestampType { + is_adjusted_to_u_t_c: f_1.expect("auto-generated code should have checked for presence of required fields"), + unit: f_2.expect("auto-generated code should have checked for presence of required fields"), + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("TimestampType"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("isAdjustedToUTC", TType::Bool, 1))?; + o_prot.write_bool(self.is_adjusted_to_u_t_c)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("unit", TType::Struct, 2))?; + self.unit.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// TimeType +// + +/// Time logical type annotation +/// +/// Allowed for physical types: 
INT32 (millis), INT64 (micros, nanos) +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct TimeType { + pub is_adjusted_to_u_t_c: bool, + pub unit: TimeUnit, +} + +impl TimeType { + pub fn new(is_adjusted_to_u_t_c: bool, unit: TimeUnit) -> TimeType { + TimeType { + is_adjusted_to_u_t_c, + unit, + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_bool()?; + f_1 = Some(val); + }, + 2 => { + let val = TimeUnit::read_from_in_protocol(i_prot)?; + f_2 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("TimeType.is_adjusted_to_u_t_c", &f_1)?; + verify_required_field_exists("TimeType.unit", &f_2)?; + let ret = TimeType { + is_adjusted_to_u_t_c: f_1.expect("auto-generated code should have checked for presence of required fields"), + unit: f_2.expect("auto-generated code should have checked for presence of required fields"), + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("TimeType"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("isAdjustedToUTC", TType::Bool, 1))?; + o_prot.write_bool(self.is_adjusted_to_u_t_c)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("unit", TType::Struct, 2))?; + self.unit.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// IntType +// + +/// Integer logical type annotation +/// +/// bitWidth must be 8, 16, 32, or 64. 
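// Usage sketch (helper name is illustrative): the TIME annotation above pairs a
// UTC-adjustment flag with a `TimeUnit` union value; millisecond precision is
// stored in an INT32 column. The same construction pattern applies to
// `TimestampType`.
fn time_millis_annotation() -> TimeType {
    TimeType::new(false, TimeUnit::MILLIS(MilliSeconds::new()))
}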
+/// +/// Allowed for physical types: INT32, INT64 +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct IntType { + pub bit_width: i8, + pub is_signed: bool, +} + +impl IntType { + pub fn new(bit_width: i8, is_signed: bool) -> IntType { + IntType { + bit_width, + is_signed, + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i8()?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_bool()?; + f_2 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("IntType.bit_width", &f_1)?; + verify_required_field_exists("IntType.is_signed", &f_2)?; + let ret = IntType { + bit_width: f_1.expect("auto-generated code should have checked for presence of required fields"), + is_signed: f_2.expect("auto-generated code should have checked for presence of required fields"), + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("IntType"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("bitWidth", TType::I08, 1))?; + o_prot.write_i8(self.bit_width)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("isSigned", TType::Bool, 2))?; + o_prot.write_bool(self.is_signed)?; + o_prot.write_field_end()?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// JsonType +// + +/// Embedded JSON logical type annotation +/// +/// Allowed for physical types: BINARY +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct JsonType { +} + +impl JsonType { + pub fn new() -> JsonType { + JsonType {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = JsonType {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("JsonType"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for JsonType { + fn default() -> Self { + JsonType{} + } +} + +// +// BsonType +// + +/// Embedded BSON logical type annotation +/// +/// Allowed for physical types: BINARY +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct BsonType { +} + +impl BsonType { + pub fn new() -> BsonType { + BsonType {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + 
i_prot.read_struct_end()?; + let ret = BsonType {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("BsonType"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for BsonType { + fn default() -> Self { + BsonType{} + } +} + +// +// LogicalType +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub enum LogicalType { + STRING(StringType), + MAP(MapType), + LIST(ListType), + ENUM(EnumType), + DECIMAL(DecimalType), + DATE(DateType), + TIME(TimeType), + TIMESTAMP(TimestampType), + INTEGER(IntType), + UNKNOWN(NullType), + JSON(JsonType), + BSON(BsonType), + UUID(UUIDType), +} + +impl LogicalType { + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let mut ret: Option = None; + let mut received_field_count = 0; + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = StringType::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::STRING(val)); + } + received_field_count += 1; + }, + 2 => { + let val = MapType::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::MAP(val)); + } + received_field_count += 1; + }, + 3 => { + let val = ListType::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::LIST(val)); + } + received_field_count += 1; + }, + 4 => { + let val = EnumType::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::ENUM(val)); + } + received_field_count += 1; + }, + 5 => { + let val = DecimalType::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::DECIMAL(val)); + } + received_field_count += 1; + }, + 6 => { + let val = DateType::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::DATE(val)); + } + received_field_count += 1; + }, + 7 => { + let val = TimeType::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::TIME(val)); + } + received_field_count += 1; + }, + 8 => { + let val = TimestampType::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::TIMESTAMP(val)); + } + received_field_count += 1; + }, + 10 => { + let val = IntType::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::INTEGER(val)); + } + received_field_count += 1; + }, + 11 => { + let val = NullType::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::UNKNOWN(val)); + } + received_field_count += 1; + }, + 12 => { + let val = JsonType::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::JSON(val)); + } + received_field_count += 1; + }, + 13 => { + let val = BsonType::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::BSON(val)); + } + received_field_count += 1; + }, + 14 => { + let val = UUIDType::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::UUID(val)); + } + received_field_count += 1; + }, + _ => { + i_prot.skip(field_ident.field_type)?; + received_field_count += 1; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + if received_field_count == 0 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received 
empty union from remote LogicalType" + ) + ) + ) + } else if received_field_count > 1 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received multiple fields for union from remote LogicalType" + ) + ) + ) + } else { + Ok(ret.expect("return value should have been constructed")) + } + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("LogicalType"); + o_prot.write_struct_begin(&struct_ident)?; + match *self { + LogicalType::STRING(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("STRING", TType::Struct, 1))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + LogicalType::MAP(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("MAP", TType::Struct, 2))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + LogicalType::LIST(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("LIST", TType::Struct, 3))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + LogicalType::ENUM(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("ENUM", TType::Struct, 4))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + LogicalType::DECIMAL(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("DECIMAL", TType::Struct, 5))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + LogicalType::DATE(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("DATE", TType::Struct, 6))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + LogicalType::TIME(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("TIME", TType::Struct, 7))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + LogicalType::TIMESTAMP(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("TIMESTAMP", TType::Struct, 8))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + LogicalType::INTEGER(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("INTEGER", TType::Struct, 10))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + LogicalType::UNKNOWN(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("UNKNOWN", TType::Struct, 11))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + LogicalType::JSON(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("JSON", TType::Struct, 12))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + LogicalType::BSON(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("BSON", TType::Struct, 13))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + LogicalType::UUID(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("UUID", TType::Struct, 14))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// SchemaElement +// + +/// Represents a element inside a schema definition. +/// - if it is a group (inner node) then type is undefined and num_children is defined +/// - if it is a primitive type (leaf) then type is defined and num_children is undefined +/// the nodes are listed in depth first traversal order. +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct SchemaElement { + /// Data type for this field. 
Not set if the current element is a non-leaf node + pub type_: Option, + /// If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the vales. + /// Otherwise, if specified, this is the maximum bit length to store any of the values. + /// (e.g. a low cardinality INT col could have this set to 3). Note that this is + /// in the schema, and therefore fixed for the entire file. + pub type_length: Option, + /// repetition of the field. The root of the schema does not have a repetition_type. + /// All other nodes must have one + pub repetition_type: Option, + /// Name of the field in the schema + pub name: String, + /// Nested fields. Since thrift does not support nested fields, + /// the nesting is flattened to a single list by a depth-first traversal. + /// The children count is used to construct the nested relationship. + /// This field is not set when the element is a primitive type + pub num_children: Option, + /// DEPRECATED: When the schema is the result of a conversion from another model. + /// Used to record the original type to help with cross conversion. + /// + /// This is superseded by logicalType. + pub converted_type: Option, + /// DEPRECATED: Used when this column contains decimal data. + /// See the DECIMAL converted type for more details. + /// + /// This is superseded by using the DecimalType annotation in logicalType. + pub scale: Option, + pub precision: Option, + /// When the original schema supports field ids, this will save the + /// original field id in the parquet schema + pub field_id: Option, + /// The logical type of this SchemaElement + /// + /// LogicalType replaces ConvertedType, but ConvertedType is still required + /// for some logical types to ensure forward-compatibility in format v1. + pub logical_type: Option, +} + +impl SchemaElement { + pub fn new(type_: F1, type_length: F2, repetition_type: F3, name: String, num_children: F5, converted_type: F6, scale: F7, precision: F8, field_id: F9, logical_type: F10) -> SchemaElement where F1: Into>, F2: Into>, F3: Into>, F5: Into>, F6: Into>, F7: Into>, F8: Into>, F9: Into>, F10: Into> { + SchemaElement { + type_: type_.into(), + type_length: type_length.into(), + repetition_type: repetition_type.into(), + name, + num_children: num_children.into(), + converted_type: converted_type.into(), + scale: scale.into(), + precision: precision.into(), + field_id: field_id.into(), + logical_type: logical_type.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + let mut f_3: Option = None; + let mut f_4: Option = None; + let mut f_5: Option = None; + let mut f_6: Option = None; + let mut f_7: Option = None; + let mut f_8: Option = None; + let mut f_9: Option = None; + let mut f_10: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = Type::read_from_in_protocol(i_prot)?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_i32()?; + f_2 = Some(val); + }, + 3 => { + let val = FieldRepetitionType::read_from_in_protocol(i_prot)?; + f_3 = Some(val); + }, + 4 => { + let val = i_prot.read_string()?; + f_4 = Some(val); + }, + 5 => { + let val = i_prot.read_i32()?; + f_5 = Some(val); + }, + 6 => { + let val = ConvertedType::read_from_in_protocol(i_prot)?; + f_6 = Some(val); + }, + 7 => { + let val = i_prot.read_i32()?; + f_7 = 
Some(val); + }, + 8 => { + let val = i_prot.read_i32()?; + f_8 = Some(val); + }, + 9 => { + let val = i_prot.read_i32()?; + f_9 = Some(val); + }, + 10 => { + let val = LogicalType::read_from_in_protocol(i_prot)?; + f_10 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("SchemaElement.name", &f_4)?; + let ret = SchemaElement { + type_: f_1, + type_length: f_2, + repetition_type: f_3, + name: f_4.expect("auto-generated code should have checked for presence of required fields"), + num_children: f_5, + converted_type: f_6, + scale: f_7, + precision: f_8, + field_id: f_9, + logical_type: f_10, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("SchemaElement"); + o_prot.write_struct_begin(&struct_ident)?; + if let Some(ref fld_var) = self.type_ { + o_prot.write_field_begin(&TFieldIdentifier::new("type", TType::I32, 1))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.type_length { + o_prot.write_field_begin(&TFieldIdentifier::new("type_length", TType::I32, 2))?; + o_prot.write_i32(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.repetition_type { + o_prot.write_field_begin(&TFieldIdentifier::new("repetition_type", TType::I32, 3))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } + o_prot.write_field_begin(&TFieldIdentifier::new("name", TType::String, 4))?; + o_prot.write_string(&self.name)?; + o_prot.write_field_end()?; + if let Some(fld_var) = self.num_children { + o_prot.write_field_begin(&TFieldIdentifier::new("num_children", TType::I32, 5))?; + o_prot.write_i32(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.converted_type { + o_prot.write_field_begin(&TFieldIdentifier::new("converted_type", TType::I32, 6))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.scale { + o_prot.write_field_begin(&TFieldIdentifier::new("scale", TType::I32, 7))?; + o_prot.write_i32(fld_var)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.precision { + o_prot.write_field_begin(&TFieldIdentifier::new("precision", TType::I32, 8))?; + o_prot.write_i32(fld_var)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.field_id { + o_prot.write_field_begin(&TFieldIdentifier::new("field_id", TType::I32, 9))?; + o_prot.write_i32(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.logical_type { + o_prot.write_field_begin(&TFieldIdentifier::new("logicalType", TType::Struct, 10))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// DataPageHeader +// + +/// Data page header +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct DataPageHeader { + /// Number of values, including NULLs, in this data page. 
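// Usage sketch for `SchemaElement` above (helper name is illustrative): a leaf
// column carries a physical type and repetition, with the logical annotation and
// the legacy converted type set together for forward compatibility; groups are
// flattened depth-first and linked via `num_children`.
fn uint16_leaf_element(name: &str) -> SchemaElement {
    SchemaElement {
        type_: Some(Type::INT32),
        type_length: None,
        repetition_type: Some(FieldRepetitionType::OPTIONAL),
        name: name.to_string(),
        num_children: None,
        converted_type: Some(ConvertedType::UINT_16),
        scale: None,
        precision: None,
        field_id: None,
        logical_type: Some(LogicalType::INTEGER(IntType::new(16, false))),
    }
}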
* + pub num_values: i32, + /// Encoding used for this data page * + pub encoding: Encoding, + /// Encoding used for definition levels * + pub definition_level_encoding: Encoding, + /// Encoding used for repetition levels * + pub repetition_level_encoding: Encoding, + /// Optional statistics for the data in this page* + pub statistics: Option, +} + +impl DataPageHeader { + pub fn new(num_values: i32, encoding: Encoding, definition_level_encoding: Encoding, repetition_level_encoding: Encoding, statistics: F5) -> DataPageHeader where F5: Into> { + DataPageHeader { + num_values, + encoding, + definition_level_encoding, + repetition_level_encoding, + statistics: statistics.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + let mut f_3: Option = None; + let mut f_4: Option = None; + let mut f_5: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i32()?; + f_1 = Some(val); + }, + 2 => { + let val = Encoding::read_from_in_protocol(i_prot)?; + f_2 = Some(val); + }, + 3 => { + let val = Encoding::read_from_in_protocol(i_prot)?; + f_3 = Some(val); + }, + 4 => { + let val = Encoding::read_from_in_protocol(i_prot)?; + f_4 = Some(val); + }, + 5 => { + let val = Statistics::read_from_in_protocol(i_prot)?; + f_5 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("DataPageHeader.num_values", &f_1)?; + verify_required_field_exists("DataPageHeader.encoding", &f_2)?; + verify_required_field_exists("DataPageHeader.definition_level_encoding", &f_3)?; + verify_required_field_exists("DataPageHeader.repetition_level_encoding", &f_4)?; + let ret = DataPageHeader { + num_values: f_1.expect("auto-generated code should have checked for presence of required fields"), + encoding: f_2.expect("auto-generated code should have checked for presence of required fields"), + definition_level_encoding: f_3.expect("auto-generated code should have checked for presence of required fields"), + repetition_level_encoding: f_4.expect("auto-generated code should have checked for presence of required fields"), + statistics: f_5, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("DataPageHeader"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("num_values", TType::I32, 1))?; + o_prot.write_i32(self.num_values)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("encoding", TType::I32, 2))?; + self.encoding.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("definition_level_encoding", TType::I32, 3))?; + self.definition_level_encoding.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("repetition_level_encoding", TType::I32, 4))?; + self.repetition_level_encoding.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + if let Some(ref fld_var) = self.statistics { + o_prot.write_field_begin(&TFieldIdentifier::new("statistics", TType::Struct, 5))?; + fld_var.write_to_out_protocol(o_prot)?; + 
o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// IndexPageHeader +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct IndexPageHeader { +} + +impl IndexPageHeader { + pub fn new() -> IndexPageHeader { + IndexPageHeader {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = IndexPageHeader {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("IndexPageHeader"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for IndexPageHeader { + fn default() -> Self { + IndexPageHeader{} + } +} + +// +// DictionaryPageHeader +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct DictionaryPageHeader { + /// Number of values in the dictionary * + pub num_values: i32, + /// Encoding using this dictionary page * + pub encoding: Encoding, + /// If true, the entries in the dictionary are sorted in ascending order * + pub is_sorted: Option, +} + +impl DictionaryPageHeader { + pub fn new(num_values: i32, encoding: Encoding, is_sorted: F3) -> DictionaryPageHeader where F3: Into> { + DictionaryPageHeader { + num_values, + encoding, + is_sorted: is_sorted.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + let mut f_3: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i32()?; + f_1 = Some(val); + }, + 2 => { + let val = Encoding::read_from_in_protocol(i_prot)?; + f_2 = Some(val); + }, + 3 => { + let val = i_prot.read_bool()?; + f_3 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("DictionaryPageHeader.num_values", &f_1)?; + verify_required_field_exists("DictionaryPageHeader.encoding", &f_2)?; + let ret = DictionaryPageHeader { + num_values: f_1.expect("auto-generated code should have checked for presence of required fields"), + encoding: f_2.expect("auto-generated code should have checked for presence of required fields"), + is_sorted: f_3, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("DictionaryPageHeader"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("num_values", TType::I32, 1))?; + o_prot.write_i32(self.num_values)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("encoding", TType::I32, 2))?; + self.encoding.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + if let Some(fld_var) = self.is_sorted { + o_prot.write_field_begin(&TFieldIdentifier::new("is_sorted", TType::Bool, 3))?; + o_prot.write_bool(fld_var)?; + 
o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// DataPageHeaderV2 +// + +/// New page format allowing reading levels without decompressing the data +/// Repetition and definition levels are uncompressed +/// The remaining section containing the data is compressed if is_compressed is true +/// +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct DataPageHeaderV2 { + /// Number of values, including NULLs, in this data page. * + pub num_values: i32, + /// Number of NULL values, in this data page. + /// Number of non-null = num_values - num_nulls which is also the number of values in the data section * + pub num_nulls: i32, + /// Number of rows in this data page. which means pages change on record boundaries (r = 0) * + pub num_rows: i32, + /// Encoding used for data in this page * + pub encoding: Encoding, + /// length of the definition levels + pub definition_levels_byte_length: i32, + /// length of the repetition levels + pub repetition_levels_byte_length: i32, + /// whether the values are compressed. + /// Which means the section of the page between + /// definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) + /// is compressed with the compression_codec. + /// If missing it is considered compressed + pub is_compressed: Option, + /// optional statistics for the data in this page * + pub statistics: Option, +} + +impl DataPageHeaderV2 { + pub fn new(num_values: i32, num_nulls: i32, num_rows: i32, encoding: Encoding, definition_levels_byte_length: i32, repetition_levels_byte_length: i32, is_compressed: F7, statistics: F8) -> DataPageHeaderV2 where F7: Into>, F8: Into> { + DataPageHeaderV2 { + num_values, + num_nulls, + num_rows, + encoding, + definition_levels_byte_length, + repetition_levels_byte_length, + is_compressed: is_compressed.into(), + statistics: statistics.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + let mut f_3: Option = None; + let mut f_4: Option = None; + let mut f_5: Option = None; + let mut f_6: Option = None; + let mut f_7: Option = None; + let mut f_8: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i32()?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_i32()?; + f_2 = Some(val); + }, + 3 => { + let val = i_prot.read_i32()?; + f_3 = Some(val); + }, + 4 => { + let val = Encoding::read_from_in_protocol(i_prot)?; + f_4 = Some(val); + }, + 5 => { + let val = i_prot.read_i32()?; + f_5 = Some(val); + }, + 6 => { + let val = i_prot.read_i32()?; + f_6 = Some(val); + }, + 7 => { + let val = i_prot.read_bool()?; + f_7 = Some(val); + }, + 8 => { + let val = Statistics::read_from_in_protocol(i_prot)?; + f_8 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("DataPageHeaderV2.num_values", &f_1)?; + verify_required_field_exists("DataPageHeaderV2.num_nulls", &f_2)?; + verify_required_field_exists("DataPageHeaderV2.num_rows", &f_3)?; + verify_required_field_exists("DataPageHeaderV2.encoding", &f_4)?; + verify_required_field_exists("DataPageHeaderV2.definition_levels_byte_length", &f_5)?; + 
verify_required_field_exists("DataPageHeaderV2.repetition_levels_byte_length", &f_6)?; + let ret = DataPageHeaderV2 { + num_values: f_1.expect("auto-generated code should have checked for presence of required fields"), + num_nulls: f_2.expect("auto-generated code should have checked for presence of required fields"), + num_rows: f_3.expect("auto-generated code should have checked for presence of required fields"), + encoding: f_4.expect("auto-generated code should have checked for presence of required fields"), + definition_levels_byte_length: f_5.expect("auto-generated code should have checked for presence of required fields"), + repetition_levels_byte_length: f_6.expect("auto-generated code should have checked for presence of required fields"), + is_compressed: f_7, + statistics: f_8, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("DataPageHeaderV2"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("num_values", TType::I32, 1))?; + o_prot.write_i32(self.num_values)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("num_nulls", TType::I32, 2))?; + o_prot.write_i32(self.num_nulls)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("num_rows", TType::I32, 3))?; + o_prot.write_i32(self.num_rows)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("encoding", TType::I32, 4))?; + self.encoding.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("definition_levels_byte_length", TType::I32, 5))?; + o_prot.write_i32(self.definition_levels_byte_length)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("repetition_levels_byte_length", TType::I32, 6))?; + o_prot.write_i32(self.repetition_levels_byte_length)?; + o_prot.write_field_end()?; + if let Some(fld_var) = self.is_compressed { + o_prot.write_field_begin(&TFieldIdentifier::new("is_compressed", TType::Bool, 7))?; + o_prot.write_bool(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.statistics { + o_prot.write_field_begin(&TFieldIdentifier::new("statistics", TType::Struct, 8))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// SplitBlockAlgorithm +// + +/// Block-based algorithm type annotation. 
* +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct SplitBlockAlgorithm { +} + +impl SplitBlockAlgorithm { + pub fn new() -> SplitBlockAlgorithm { + SplitBlockAlgorithm {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = SplitBlockAlgorithm {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("SplitBlockAlgorithm"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for SplitBlockAlgorithm { + fn default() -> Self { + SplitBlockAlgorithm{} + } +} + +// +// BloomFilterAlgorithm +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub enum BloomFilterAlgorithm { + BLOCK(SplitBlockAlgorithm), +} + +impl BloomFilterAlgorithm { + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let mut ret: Option = None; + let mut received_field_count = 0; + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = SplitBlockAlgorithm::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(BloomFilterAlgorithm::BLOCK(val)); + } + received_field_count += 1; + }, + _ => { + i_prot.skip(field_ident.field_type)?; + received_field_count += 1; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + if received_field_count == 0 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received empty union from remote BloomFilterAlgorithm" + ) + ) + ) + } else if received_field_count > 1 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received multiple fields for union from remote BloomFilterAlgorithm" + ) + ) + ) + } else { + Ok(ret.expect("return value should have been constructed")) + } + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("BloomFilterAlgorithm"); + o_prot.write_struct_begin(&struct_ident)?; + match *self { + BloomFilterAlgorithm::BLOCK(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("BLOCK", TType::Struct, 1))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// XxHash +// + +/// Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash +/// algorithm. It uses 64 bits version of xxHash. 
+/// +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct XxHash { +} + +impl XxHash { + pub fn new() -> XxHash { + XxHash {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = XxHash {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("XxHash"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for XxHash { + fn default() -> Self { + XxHash{} + } +} + +// +// BloomFilterHash +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub enum BloomFilterHash { + XXHASH(XxHash), +} + +impl BloomFilterHash { + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let mut ret: Option = None; + let mut received_field_count = 0; + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = XxHash::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(BloomFilterHash::XXHASH(val)); + } + received_field_count += 1; + }, + _ => { + i_prot.skip(field_ident.field_type)?; + received_field_count += 1; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + if received_field_count == 0 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received empty union from remote BloomFilterHash" + ) + ) + ) + } else if received_field_count > 1 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received multiple fields for union from remote BloomFilterHash" + ) + ) + ) + } else { + Ok(ret.expect("return value should have been constructed")) + } + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("BloomFilterHash"); + o_prot.write_struct_begin(&struct_ident)?; + match *self { + BloomFilterHash::XXHASH(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("XXHASH", TType::Struct, 1))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// Uncompressed +// + +/// The compression used in the Bloom filter. 
+/// +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct Uncompressed { +} + +impl Uncompressed { + pub fn new() -> Uncompressed { + Uncompressed {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = Uncompressed {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("Uncompressed"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for Uncompressed { + fn default() -> Self { + Uncompressed{} + } +} + +// +// BloomFilterCompression +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub enum BloomFilterCompression { + UNCOMPRESSED(Uncompressed), +} + +impl BloomFilterCompression { + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let mut ret: Option = None; + let mut received_field_count = 0; + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = Uncompressed::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(BloomFilterCompression::UNCOMPRESSED(val)); + } + received_field_count += 1; + }, + _ => { + i_prot.skip(field_ident.field_type)?; + received_field_count += 1; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + if received_field_count == 0 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received empty union from remote BloomFilterCompression" + ) + ) + ) + } else if received_field_count > 1 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received multiple fields for union from remote BloomFilterCompression" + ) + ) + ) + } else { + Ok(ret.expect("return value should have been constructed")) + } + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("BloomFilterCompression"); + o_prot.write_struct_begin(&struct_ident)?; + match *self { + BloomFilterCompression::UNCOMPRESSED(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("UNCOMPRESSED", TType::Struct, 1))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// BloomFilterHeader +// + +/// Bloom filter header is stored at beginning of Bloom filter data of each column +/// and followed by its bitset. +/// +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct BloomFilterHeader { + /// The size of bitset in bytes * + pub num_bytes: i32, + /// The algorithm for setting bits. * + pub algorithm: BloomFilterAlgorithm, + /// The hash function used for Bloom filter. 
* + pub hash: BloomFilterHash, + /// The compression used in the Bloom filter * + pub compression: BloomFilterCompression, +} + +impl BloomFilterHeader { + pub fn new(num_bytes: i32, algorithm: BloomFilterAlgorithm, hash: BloomFilterHash, compression: BloomFilterCompression) -> BloomFilterHeader { + BloomFilterHeader { + num_bytes, + algorithm, + hash, + compression, + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + let mut f_3: Option = None; + let mut f_4: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i32()?; + f_1 = Some(val); + }, + 2 => { + let val = BloomFilterAlgorithm::read_from_in_protocol(i_prot)?; + f_2 = Some(val); + }, + 3 => { + let val = BloomFilterHash::read_from_in_protocol(i_prot)?; + f_3 = Some(val); + }, + 4 => { + let val = BloomFilterCompression::read_from_in_protocol(i_prot)?; + f_4 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("BloomFilterHeader.num_bytes", &f_1)?; + verify_required_field_exists("BloomFilterHeader.algorithm", &f_2)?; + verify_required_field_exists("BloomFilterHeader.hash", &f_3)?; + verify_required_field_exists("BloomFilterHeader.compression", &f_4)?; + let ret = BloomFilterHeader { + num_bytes: f_1.expect("auto-generated code should have checked for presence of required fields"), + algorithm: f_2.expect("auto-generated code should have checked for presence of required fields"), + hash: f_3.expect("auto-generated code should have checked for presence of required fields"), + compression: f_4.expect("auto-generated code should have checked for presence of required fields"), + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("BloomFilterHeader"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("numBytes", TType::I32, 1))?; + o_prot.write_i32(self.num_bytes)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("algorithm", TType::Struct, 2))?; + self.algorithm.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("hash", TType::Struct, 3))?; + self.hash.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("compression", TType::Struct, 4))?; + self.compression.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// PageHeader +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct PageHeader { + /// the type of the page: indicates which of the *_header fields is set * + pub type_: PageType, + /// Uncompressed page size in bytes (not including this header) * + pub uncompressed_page_size: i32, + /// Compressed (and potentially encrypted) page size in bytes, not including this header * + pub compressed_page_size: i32, + /// The 32bit CRC for the page, to be be calculated as follows: + /// - Using the standard CRC32 algorithm + /// - On the data only, i.e. this header should not be included. 
'Data' + /// hereby refers to the concatenation of the repetition levels, the + /// definition levels and the column value, in this exact order. + /// - On the encoded versions of the repetition levels, definition levels and + /// column values + /// - On the compressed versions of the repetition levels, definition levels + /// and column values where possible; + /// - For v1 data pages, the repetition levels, definition levels and column + /// values are always compressed together. If a compression scheme is + /// specified, the CRC shall be calculated on the compressed version of + /// this concatenation. If no compression scheme is specified, the CRC + /// shall be calculated on the uncompressed version of this concatenation. + /// - For v2 data pages, the repetition levels and definition levels are + /// handled separately from the data and are never compressed (only + /// encoded). If a compression scheme is specified, the CRC shall be + /// calculated on the concatenation of the uncompressed repetition levels, + /// uncompressed definition levels and the compressed column values. + /// If no compression scheme is specified, the CRC shall be calculated on + /// the uncompressed concatenation. + /// - In encrypted columns, CRC is calculated after page encryption; the + /// encryption itself is performed after page compression (if compressed) + /// If enabled, this allows for disabling checksumming in HDFS if only a few + /// pages need to be read. + /// + pub crc: Option, + pub data_page_header: Option, + pub index_page_header: Option, + pub dictionary_page_header: Option, + pub data_page_header_v2: Option, +} + +impl PageHeader { + pub fn new(type_: PageType, uncompressed_page_size: i32, compressed_page_size: i32, crc: F4, data_page_header: F5, index_page_header: F6, dictionary_page_header: F7, data_page_header_v2: F8) -> PageHeader where F4: Into>, F5: Into>, F6: Into>, F7: Into>, F8: Into> { + PageHeader { + type_, + uncompressed_page_size, + compressed_page_size, + crc: crc.into(), + data_page_header: data_page_header.into(), + index_page_header: index_page_header.into(), + dictionary_page_header: dictionary_page_header.into(), + data_page_header_v2: data_page_header_v2.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + let mut f_3: Option = None; + let mut f_4: Option = None; + let mut f_5: Option = None; + let mut f_6: Option = None; + let mut f_7: Option = None; + let mut f_8: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = PageType::read_from_in_protocol(i_prot)?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_i32()?; + f_2 = Some(val); + }, + 3 => { + let val = i_prot.read_i32()?; + f_3 = Some(val); + }, + 4 => { + let val = i_prot.read_i32()?; + f_4 = Some(val); + }, + 5 => { + let val = DataPageHeader::read_from_in_protocol(i_prot)?; + f_5 = Some(val); + }, + 6 => { + let val = IndexPageHeader::read_from_in_protocol(i_prot)?; + f_6 = Some(val); + }, + 7 => { + let val = DictionaryPageHeader::read_from_in_protocol(i_prot)?; + f_7 = Some(val); + }, + 8 => { + let val = DataPageHeaderV2::read_from_in_protocol(i_prot)?; + f_8 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + 
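+        // Only type_ and the two page sizes are required (checked below); crc and the
+        // four per-type headers stay optional. Per the type_ field doc, a reader looks
+        // at the decoded PageType and then uses the matching *_header field, ignoring
+        // the others.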
verify_required_field_exists("PageHeader.type_", &f_1)?; + verify_required_field_exists("PageHeader.uncompressed_page_size", &f_2)?; + verify_required_field_exists("PageHeader.compressed_page_size", &f_3)?; + let ret = PageHeader { + type_: f_1.expect("auto-generated code should have checked for presence of required fields"), + uncompressed_page_size: f_2.expect("auto-generated code should have checked for presence of required fields"), + compressed_page_size: f_3.expect("auto-generated code should have checked for presence of required fields"), + crc: f_4, + data_page_header: f_5, + index_page_header: f_6, + dictionary_page_header: f_7, + data_page_header_v2: f_8, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("PageHeader"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("type", TType::I32, 1))?; + self.type_.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("uncompressed_page_size", TType::I32, 2))?; + o_prot.write_i32(self.uncompressed_page_size)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("compressed_page_size", TType::I32, 3))?; + o_prot.write_i32(self.compressed_page_size)?; + o_prot.write_field_end()?; + if let Some(fld_var) = self.crc { + o_prot.write_field_begin(&TFieldIdentifier::new("crc", TType::I32, 4))?; + o_prot.write_i32(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.data_page_header { + o_prot.write_field_begin(&TFieldIdentifier::new("data_page_header", TType::Struct, 5))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.index_page_header { + o_prot.write_field_begin(&TFieldIdentifier::new("index_page_header", TType::Struct, 6))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.dictionary_page_header { + o_prot.write_field_begin(&TFieldIdentifier::new("dictionary_page_header", TType::Struct, 7))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.data_page_header_v2 { + o_prot.write_field_begin(&TFieldIdentifier::new("data_page_header_v2", TType::Struct, 8))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? 
+ } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// KeyValue +// + +/// Wrapper struct to store key values +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct KeyValue { + pub key: String, + pub value: Option, +} + +impl KeyValue { + pub fn new(key: String, value: F2) -> KeyValue where F2: Into> { + KeyValue { + key, + value: value.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_string()?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_string()?; + f_2 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("KeyValue.key", &f_1)?; + let ret = KeyValue { + key: f_1.expect("auto-generated code should have checked for presence of required fields"), + value: f_2, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("KeyValue"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("key", TType::String, 1))?; + o_prot.write_string(&self.key)?; + o_prot.write_field_end()?; + if let Some(ref fld_var) = self.value { + o_prot.write_field_begin(&TFieldIdentifier::new("value", TType::String, 2))?; + o_prot.write_string(fld_var)?; + o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// SortingColumn +// + +/// Wrapper struct to specify sort order +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct SortingColumn { + /// The column index (in this row group) * + pub column_idx: i32, + /// If true, indicates this column is sorted in descending order. * + pub descending: bool, + /// If true, nulls will come before non-null values, otherwise, + /// nulls go at the end. 
+ pub nulls_first: bool, +} + +impl SortingColumn { + pub fn new(column_idx: i32, descending: bool, nulls_first: bool) -> SortingColumn { + SortingColumn { + column_idx, + descending, + nulls_first, + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + let mut f_3: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i32()?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_bool()?; + f_2 = Some(val); + }, + 3 => { + let val = i_prot.read_bool()?; + f_3 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("SortingColumn.column_idx", &f_1)?; + verify_required_field_exists("SortingColumn.descending", &f_2)?; + verify_required_field_exists("SortingColumn.nulls_first", &f_3)?; + let ret = SortingColumn { + column_idx: f_1.expect("auto-generated code should have checked for presence of required fields"), + descending: f_2.expect("auto-generated code should have checked for presence of required fields"), + nulls_first: f_3.expect("auto-generated code should have checked for presence of required fields"), + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("SortingColumn"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("column_idx", TType::I32, 1))?; + o_prot.write_i32(self.column_idx)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("descending", TType::Bool, 2))?; + o_prot.write_bool(self.descending)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("nulls_first", TType::Bool, 3))?; + o_prot.write_bool(self.nulls_first)?; + o_prot.write_field_end()?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// PageEncodingStats +// + +/// statistics of a given page type and encoding +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct PageEncodingStats { + /// the page type (data/dic/...) 
* + pub page_type: PageType, + /// encoding of the page * + pub encoding: Encoding, + /// number of pages of this type with this encoding * + pub count: i32, +} + +impl PageEncodingStats { + pub fn new(page_type: PageType, encoding: Encoding, count: i32) -> PageEncodingStats { + PageEncodingStats { + page_type, + encoding, + count, + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + let mut f_3: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = PageType::read_from_in_protocol(i_prot)?; + f_1 = Some(val); + }, + 2 => { + let val = Encoding::read_from_in_protocol(i_prot)?; + f_2 = Some(val); + }, + 3 => { + let val = i_prot.read_i32()?; + f_3 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("PageEncodingStats.page_type", &f_1)?; + verify_required_field_exists("PageEncodingStats.encoding", &f_2)?; + verify_required_field_exists("PageEncodingStats.count", &f_3)?; + let ret = PageEncodingStats { + page_type: f_1.expect("auto-generated code should have checked for presence of required fields"), + encoding: f_2.expect("auto-generated code should have checked for presence of required fields"), + count: f_3.expect("auto-generated code should have checked for presence of required fields"), + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("PageEncodingStats"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("page_type", TType::I32, 1))?; + self.page_type.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("encoding", TType::I32, 2))?; + self.encoding.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("count", TType::I32, 3))?; + o_prot.write_i32(self.count)?; + o_prot.write_field_end()?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// ColumnMetaData +// + +/// Description for column metadata +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct ColumnMetaData { + /// Type of this column * + pub type_: Type, + /// Set of all encodings used for this column. The purpose is to validate + /// whether we can decode those pages. 
* + pub encodings: Vec, + /// Path in schema * + pub path_in_schema: Vec, + /// Compression codec * + pub codec: CompressionCodec, + /// Number of values in this column * + pub num_values: i64, + /// total byte size of all uncompressed pages in this column chunk (including the headers) * + pub total_uncompressed_size: i64, + /// total byte size of all compressed, and potentially encrypted, pages + /// in this column chunk (including the headers) * + pub total_compressed_size: i64, + /// Optional key/value metadata * + pub key_value_metadata: Option>, + /// Byte offset from beginning of file to first data page * + pub data_page_offset: i64, + /// Byte offset from beginning of file to root index page * + pub index_page_offset: Option, + /// Byte offset from the beginning of file to first (only) dictionary page * + pub dictionary_page_offset: Option, + /// optional statistics for this column chunk + pub statistics: Option, + /// Set of all encodings used for pages in this column chunk. + /// This information can be used to determine if all data pages are + /// dictionary encoded for example * + pub encoding_stats: Option>, + /// Byte offset from beginning of file to Bloom filter data. * + pub bloom_filter_offset: Option, +} + +impl ColumnMetaData { + pub fn new(type_: Type, encodings: Vec, path_in_schema: Vec, codec: CompressionCodec, num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64, key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10, dictionary_page_offset: F11, statistics: F12, encoding_stats: F13, bloom_filter_offset: F14) -> ColumnMetaData where F8: Into>>, F10: Into>, F11: Into>, F12: Into>, F13: Into>>, F14: Into> { + ColumnMetaData { + type_, + encodings, + path_in_schema, + codec, + num_values, + total_uncompressed_size, + total_compressed_size, + key_value_metadata: key_value_metadata.into(), + data_page_offset, + index_page_offset: index_page_offset.into(), + dictionary_page_offset: dictionary_page_offset.into(), + statistics: statistics.into(), + encoding_stats: encoding_stats.into(), + bloom_filter_offset: bloom_filter_offset.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option> = None; + let mut f_3: Option> = None; + let mut f_4: Option = None; + let mut f_5: Option = None; + let mut f_6: Option = None; + let mut f_7: Option = None; + let mut f_8: Option> = None; + let mut f_9: Option = None; + let mut f_10: Option = None; + let mut f_11: Option = None; + let mut f_12: Option = None; + let mut f_13: Option> = None; + let mut f_14: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = Type::read_from_in_protocol(i_prot)?; + f_1 = Some(val); + }, + 2 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_0 = Encoding::read_from_in_protocol(i_prot)?; + val.push(list_elem_0); + } + i_prot.read_list_end()?; + f_2 = Some(val); + }, + 3 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_1 = i_prot.read_string()?; + val.push(list_elem_1); + } + i_prot.read_list_end()?; + f_3 = Some(val); + }, + 4 => { + let val = 
CompressionCodec::read_from_in_protocol(i_prot)?; + f_4 = Some(val); + }, + 5 => { + let val = i_prot.read_i64()?; + f_5 = Some(val); + }, + 6 => { + let val = i_prot.read_i64()?; + f_6 = Some(val); + }, + 7 => { + let val = i_prot.read_i64()?; + f_7 = Some(val); + }, + 8 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_2 = KeyValue::read_from_in_protocol(i_prot)?; + val.push(list_elem_2); + } + i_prot.read_list_end()?; + f_8 = Some(val); + }, + 9 => { + let val = i_prot.read_i64()?; + f_9 = Some(val); + }, + 10 => { + let val = i_prot.read_i64()?; + f_10 = Some(val); + }, + 11 => { + let val = i_prot.read_i64()?; + f_11 = Some(val); + }, + 12 => { + let val = Statistics::read_from_in_protocol(i_prot)?; + f_12 = Some(val); + }, + 13 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_3 = PageEncodingStats::read_from_in_protocol(i_prot)?; + val.push(list_elem_3); + } + i_prot.read_list_end()?; + f_13 = Some(val); + }, + 14 => { + let val = i_prot.read_i64()?; + f_14 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("ColumnMetaData.type_", &f_1)?; + verify_required_field_exists("ColumnMetaData.encodings", &f_2)?; + verify_required_field_exists("ColumnMetaData.path_in_schema", &f_3)?; + verify_required_field_exists("ColumnMetaData.codec", &f_4)?; + verify_required_field_exists("ColumnMetaData.num_values", &f_5)?; + verify_required_field_exists("ColumnMetaData.total_uncompressed_size", &f_6)?; + verify_required_field_exists("ColumnMetaData.total_compressed_size", &f_7)?; + verify_required_field_exists("ColumnMetaData.data_page_offset", &f_9)?; + let ret = ColumnMetaData { + type_: f_1.expect("auto-generated code should have checked for presence of required fields"), + encodings: f_2.expect("auto-generated code should have checked for presence of required fields"), + path_in_schema: f_3.expect("auto-generated code should have checked for presence of required fields"), + codec: f_4.expect("auto-generated code should have checked for presence of required fields"), + num_values: f_5.expect("auto-generated code should have checked for presence of required fields"), + total_uncompressed_size: f_6.expect("auto-generated code should have checked for presence of required fields"), + total_compressed_size: f_7.expect("auto-generated code should have checked for presence of required fields"), + key_value_metadata: f_8, + data_page_offset: f_9.expect("auto-generated code should have checked for presence of required fields"), + index_page_offset: f_10, + dictionary_page_offset: f_11, + statistics: f_12, + encoding_stats: f_13, + bloom_filter_offset: f_14, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("ColumnMetaData"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("type", TType::I32, 1))?; + self.type_.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("encodings", TType::List, 2))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I32, self.encodings.len() as i32))?; + for e in &self.encodings { + e.write_to_out_protocol(o_prot)?; + } + 
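+        // The encodings set is serialized as a thrift list with I32 elements (see the
+        // TListIdentifier above); per the field doc, it lets readers check up front
+        // that they can decode every page in this column chunk.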
o_prot.write_list_end()?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("path_in_schema", TType::List, 3))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::String, self.path_in_schema.len() as i32))?; + for e in &self.path_in_schema { + o_prot.write_string(e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("codec", TType::I32, 4))?; + self.codec.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("num_values", TType::I64, 5))?; + o_prot.write_i64(self.num_values)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("total_uncompressed_size", TType::I64, 6))?; + o_prot.write_i64(self.total_uncompressed_size)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("total_compressed_size", TType::I64, 7))?; + o_prot.write_i64(self.total_compressed_size)?; + o_prot.write_field_end()?; + if let Some(ref fld_var) = self.key_value_metadata { + o_prot.write_field_begin(&TFieldIdentifier::new("key_value_metadata", TType::List, 8))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::Struct, fld_var.len() as i32))?; + for e in fld_var { + e.write_to_out_protocol(o_prot)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + o_prot.write_field_begin(&TFieldIdentifier::new("data_page_offset", TType::I64, 9))?; + o_prot.write_i64(self.data_page_offset)?; + o_prot.write_field_end()?; + if let Some(fld_var) = self.index_page_offset { + o_prot.write_field_begin(&TFieldIdentifier::new("index_page_offset", TType::I64, 10))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.dictionary_page_offset { + o_prot.write_field_begin(&TFieldIdentifier::new("dictionary_page_offset", TType::I64, 11))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.statistics { + o_prot.write_field_begin(&TFieldIdentifier::new("statistics", TType::Struct, 12))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.encoding_stats { + o_prot.write_field_begin(&TFieldIdentifier::new("encoding_stats", TType::List, 13))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::Struct, fld_var.len() as i32))?; + for e in fld_var { + e.write_to_out_protocol(o_prot)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.bloom_filter_offset { + o_prot.write_field_begin(&TFieldIdentifier::new("bloom_filter_offset", TType::I64, 14))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? 
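+            // The *_offset fields written above are absolute byte offsets from the
+            // start of the file; readers typically begin a chunk at
+            // dictionary_page_offset when it is set and at data_page_offset otherwise.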
+ } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// EncryptionWithFooterKey +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct EncryptionWithFooterKey { +} + +impl EncryptionWithFooterKey { + pub fn new() -> EncryptionWithFooterKey { + EncryptionWithFooterKey {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = EncryptionWithFooterKey {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("EncryptionWithFooterKey"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for EncryptionWithFooterKey { + fn default() -> Self { + EncryptionWithFooterKey{} + } +} + +// +// EncryptionWithColumnKey +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct EncryptionWithColumnKey { + /// Column path in schema * + pub path_in_schema: Vec, + /// Retrieval metadata of column encryption key * + pub key_metadata: Option>, +} + +impl EncryptionWithColumnKey { + pub fn new(path_in_schema: Vec, key_metadata: F2) -> EncryptionWithColumnKey where F2: Into>> { + EncryptionWithColumnKey { + path_in_schema, + key_metadata: key_metadata.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option> = None; + let mut f_2: Option> = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_4 = i_prot.read_string()?; + val.push(list_elem_4); + } + i_prot.read_list_end()?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_bytes()?; + f_2 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("EncryptionWithColumnKey.path_in_schema", &f_1)?; + let ret = EncryptionWithColumnKey { + path_in_schema: f_1.expect("auto-generated code should have checked for presence of required fields"), + key_metadata: f_2, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("EncryptionWithColumnKey"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("path_in_schema", TType::List, 1))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::String, self.path_in_schema.len() as i32))?; + for e in &self.path_in_schema { + o_prot.write_string(e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()?; + if let Some(ref fld_var) = self.key_metadata { + o_prot.write_field_begin(&TFieldIdentifier::new("key_metadata", TType::String, 2))?; + o_prot.write_bytes(fld_var)?; + o_prot.write_field_end()? 
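+            // key_metadata is an opaque binary blob (thrift `binary`, hence
+            // TType::String on the wire) describing how to retrieve the column key;
+            // only the column path itself is written as readable strings above.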
+ } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// ColumnCryptoMetaData +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub enum ColumnCryptoMetaData { + ENCRYPTIONWITHFOOTERKEY(EncryptionWithFooterKey), + ENCRYPTIONWITHCOLUMNKEY(EncryptionWithColumnKey), +} + +impl ColumnCryptoMetaData { + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let mut ret: Option = None; + let mut received_field_count = 0; + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = EncryptionWithFooterKey::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(ColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(val)); + } + received_field_count += 1; + }, + 2 => { + let val = EncryptionWithColumnKey::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(val)); + } + received_field_count += 1; + }, + _ => { + i_prot.skip(field_ident.field_type)?; + received_field_count += 1; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + if received_field_count == 0 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received empty union from remote ColumnCryptoMetaData" + ) + ) + ) + } else if received_field_count > 1 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received multiple fields for union from remote ColumnCryptoMetaData" + ) + ) + ) + } else { + Ok(ret.expect("return value should have been constructed")) + } + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("ColumnCryptoMetaData"); + o_prot.write_struct_begin(&struct_ident)?; + match *self { + ColumnCryptoMetaData::ENCRYPTIONWITHFOOTERKEY(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("ENCRYPTION_WITH_FOOTER_KEY", TType::Struct, 1))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + ColumnCryptoMetaData::ENCRYPTIONWITHCOLUMNKEY(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("ENCRYPTION_WITH_COLUMN_KEY", TType::Struct, 2))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// ColumnChunk +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct ColumnChunk { + /// File where column data is stored. If not set, assumed to be same file as + /// metadata. This path is relative to the current file. + /// + pub file_path: Option, + /// Byte offset in file_path to the ColumnMetaData * + pub file_offset: i64, + /// Column metadata for this chunk. This is the same content as what is at + /// file_path/file_offset. Having it here has it replicated in the file + /// metadata. 
+ /// + pub meta_data: Option, + /// File offset of ColumnChunk's OffsetIndex * + pub offset_index_offset: Option, + /// Size of ColumnChunk's OffsetIndex, in bytes * + pub offset_index_length: Option, + /// File offset of ColumnChunk's ColumnIndex * + pub column_index_offset: Option, + /// Size of ColumnChunk's ColumnIndex, in bytes * + pub column_index_length: Option, + /// Crypto metadata of encrypted columns * + pub crypto_metadata: Option, + /// Encrypted column metadata for this chunk * + pub encrypted_column_metadata: Option>, +} + +impl ColumnChunk { + pub fn new(file_path: F1, file_offset: i64, meta_data: F3, offset_index_offset: F4, offset_index_length: F5, column_index_offset: F6, column_index_length: F7, crypto_metadata: F8, encrypted_column_metadata: F9) -> ColumnChunk where F1: Into>, F3: Into>, F4: Into>, F5: Into>, F6: Into>, F7: Into>, F8: Into>, F9: Into>> { + ColumnChunk { + file_path: file_path.into(), + file_offset, + meta_data: meta_data.into(), + offset_index_offset: offset_index_offset.into(), + offset_index_length: offset_index_length.into(), + column_index_offset: column_index_offset.into(), + column_index_length: column_index_length.into(), + crypto_metadata: crypto_metadata.into(), + encrypted_column_metadata: encrypted_column_metadata.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + let mut f_3: Option = None; + let mut f_4: Option = None; + let mut f_5: Option = None; + let mut f_6: Option = None; + let mut f_7: Option = None; + let mut f_8: Option = None; + let mut f_9: Option> = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_string()?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_i64()?; + f_2 = Some(val); + }, + 3 => { + let val = ColumnMetaData::read_from_in_protocol(i_prot)?; + f_3 = Some(val); + }, + 4 => { + let val = i_prot.read_i64()?; + f_4 = Some(val); + }, + 5 => { + let val = i_prot.read_i32()?; + f_5 = Some(val); + }, + 6 => { + let val = i_prot.read_i64()?; + f_6 = Some(val); + }, + 7 => { + let val = i_prot.read_i32()?; + f_7 = Some(val); + }, + 8 => { + let val = ColumnCryptoMetaData::read_from_in_protocol(i_prot)?; + f_8 = Some(val); + }, + 9 => { + let val = i_prot.read_bytes()?; + f_9 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("ColumnChunk.file_offset", &f_2)?; + let ret = ColumnChunk { + file_path: f_1, + file_offset: f_2.expect("auto-generated code should have checked for presence of required fields"), + meta_data: f_3, + offset_index_offset: f_4, + offset_index_length: f_5, + column_index_offset: f_6, + column_index_length: f_7, + crypto_metadata: f_8, + encrypted_column_metadata: f_9, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("ColumnChunk"); + o_prot.write_struct_begin(&struct_ident)?; + if let Some(ref fld_var) = self.file_path { + o_prot.write_field_begin(&TFieldIdentifier::new("file_path", TType::String, 1))?; + o_prot.write_string(fld_var)?; + o_prot.write_field_end()? 
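+            // file_path is only written when the column data lives in another file;
+            // when it is None the chunk is assumed to be in the same file as this
+            // metadata, and file_offset below points at its ColumnMetaData.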
+ } + o_prot.write_field_begin(&TFieldIdentifier::new("file_offset", TType::I64, 2))?; + o_prot.write_i64(self.file_offset)?; + o_prot.write_field_end()?; + if let Some(ref fld_var) = self.meta_data { + o_prot.write_field_begin(&TFieldIdentifier::new("meta_data", TType::Struct, 3))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.offset_index_offset { + o_prot.write_field_begin(&TFieldIdentifier::new("offset_index_offset", TType::I64, 4))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.offset_index_length { + o_prot.write_field_begin(&TFieldIdentifier::new("offset_index_length", TType::I32, 5))?; + o_prot.write_i32(fld_var)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.column_index_offset { + o_prot.write_field_begin(&TFieldIdentifier::new("column_index_offset", TType::I64, 6))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.column_index_length { + o_prot.write_field_begin(&TFieldIdentifier::new("column_index_length", TType::I32, 7))?; + o_prot.write_i32(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.crypto_metadata { + o_prot.write_field_begin(&TFieldIdentifier::new("crypto_metadata", TType::Struct, 8))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.encrypted_column_metadata { + o_prot.write_field_begin(&TFieldIdentifier::new("encrypted_column_metadata", TType::String, 9))?; + o_prot.write_bytes(fld_var)?; + o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// RowGroup +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct RowGroup { + /// Metadata for each column chunk in this row group. + /// This list must have the same order as the SchemaElement list in FileMetaData. + /// + pub columns: Vec, + /// Total byte size of all the uncompressed column data in this row group * + pub total_byte_size: i64, + /// Number of rows in this row group * + pub num_rows: i64, + /// If set, specifies a sort ordering of the rows in this RowGroup. + /// The sorting columns can be a subset of all the columns. 
+ pub sorting_columns: Option>, + /// Byte offset from beginning of file to first page (data or dictionary) + /// in this row group * + pub file_offset: Option, + /// Total byte size of all compressed (and potentially encrypted) column data + /// in this row group * + pub total_compressed_size: Option, + /// Row group ordinal in the file * + pub ordinal: Option, +} + +impl RowGroup { + pub fn new(columns: Vec, total_byte_size: i64, num_rows: i64, sorting_columns: F4, file_offset: F5, total_compressed_size: F6, ordinal: F7) -> RowGroup where F4: Into>>, F5: Into>, F6: Into>, F7: Into> { + RowGroup { + columns, + total_byte_size, + num_rows, + sorting_columns: sorting_columns.into(), + file_offset: file_offset.into(), + total_compressed_size: total_compressed_size.into(), + ordinal: ordinal.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option> = None; + let mut f_2: Option = None; + let mut f_3: Option = None; + let mut f_4: Option> = None; + let mut f_5: Option = None; + let mut f_6: Option = None; + let mut f_7: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_5 = ColumnChunk::read_from_in_protocol(i_prot)?; + val.push(list_elem_5); + } + i_prot.read_list_end()?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_i64()?; + f_2 = Some(val); + }, + 3 => { + let val = i_prot.read_i64()?; + f_3 = Some(val); + }, + 4 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_6 = SortingColumn::read_from_in_protocol(i_prot)?; + val.push(list_elem_6); + } + i_prot.read_list_end()?; + f_4 = Some(val); + }, + 5 => { + let val = i_prot.read_i64()?; + f_5 = Some(val); + }, + 6 => { + let val = i_prot.read_i64()?; + f_6 = Some(val); + }, + 7 => { + let val = i_prot.read_i16()?; + f_7 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("RowGroup.columns", &f_1)?; + verify_required_field_exists("RowGroup.total_byte_size", &f_2)?; + verify_required_field_exists("RowGroup.num_rows", &f_3)?; + let ret = RowGroup { + columns: f_1.expect("auto-generated code should have checked for presence of required fields"), + total_byte_size: f_2.expect("auto-generated code should have checked for presence of required fields"), + num_rows: f_3.expect("auto-generated code should have checked for presence of required fields"), + sorting_columns: f_4, + file_offset: f_5, + total_compressed_size: f_6, + ordinal: f_7, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("RowGroup"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("columns", TType::List, 1))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::Struct, self.columns.len() as i32))?; + for e in &self.columns { + e.write_to_out_protocol(o_prot)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("total_byte_size", 
TType::I64, 2))?; + o_prot.write_i64(self.total_byte_size)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("num_rows", TType::I64, 3))?; + o_prot.write_i64(self.num_rows)?; + o_prot.write_field_end()?; + if let Some(ref fld_var) = self.sorting_columns { + o_prot.write_field_begin(&TFieldIdentifier::new("sorting_columns", TType::List, 4))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::Struct, fld_var.len() as i32))?; + for e in fld_var { + e.write_to_out_protocol(o_prot)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.file_offset { + o_prot.write_field_begin(&TFieldIdentifier::new("file_offset", TType::I64, 5))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.total_compressed_size { + o_prot.write_field_begin(&TFieldIdentifier::new("total_compressed_size", TType::I64, 6))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.ordinal { + o_prot.write_field_begin(&TFieldIdentifier::new("ordinal", TType::I16, 7))?; + o_prot.write_i16(fld_var)?; + o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// TypeDefinedOrder +// + +/// Empty struct to signal the order defined by the physical or logical type +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct TypeDefinedOrder { +} + +impl TypeDefinedOrder { + pub fn new() -> TypeDefinedOrder { + TypeDefinedOrder {} + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = TypeDefinedOrder {}; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("TypeDefinedOrder"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for TypeDefinedOrder { + fn default() -> Self { + TypeDefinedOrder{} + } +} + +// +// ColumnOrder +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub enum ColumnOrder { + TYPEORDER(TypeDefinedOrder), +} + +impl ColumnOrder { + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let mut ret: Option = None; + let mut received_field_count = 0; + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = TypeDefinedOrder::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(ColumnOrder::TYPEORDER(val)); + } + received_field_count += 1; + }, + _ => { + i_prot.skip(field_ident.field_type)?; + received_field_count += 1; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + if received_field_count == 0 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received empty union from remote ColumnOrder" + ) + ) + ) + } else if received_field_count > 1 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received multiple fields for union from 
remote ColumnOrder" + ) + ) + ) + } else { + Ok(ret.expect("return value should have been constructed")) + } + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("ColumnOrder"); + o_prot.write_struct_begin(&struct_ident)?; + match *self { + ColumnOrder::TYPEORDER(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("TYPE_ORDER", TType::Struct, 1))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// PageLocation +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct PageLocation { + /// Offset of the page in the file * + pub offset: i64, + /// Size of the page, including header. Sum of compressed_page_size and header + /// length + pub compressed_page_size: i32, + /// Index within the RowGroup of the first row of the page; this means pages + /// change on record boundaries (r = 0). + pub first_row_index: i64, +} + +impl PageLocation { + pub fn new(offset: i64, compressed_page_size: i32, first_row_index: i64) -> PageLocation { + PageLocation { + offset, + compressed_page_size, + first_row_index, + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option = None; + let mut f_3: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i64()?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_i32()?; + f_2 = Some(val); + }, + 3 => { + let val = i_prot.read_i64()?; + f_3 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("PageLocation.offset", &f_1)?; + verify_required_field_exists("PageLocation.compressed_page_size", &f_2)?; + verify_required_field_exists("PageLocation.first_row_index", &f_3)?; + let ret = PageLocation { + offset: f_1.expect("auto-generated code should have checked for presence of required fields"), + compressed_page_size: f_2.expect("auto-generated code should have checked for presence of required fields"), + first_row_index: f_3.expect("auto-generated code should have checked for presence of required fields"), + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("PageLocation"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("offset", TType::I64, 1))?; + o_prot.write_i64(self.offset)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("compressed_page_size", TType::I32, 2))?; + o_prot.write_i32(self.compressed_page_size)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("first_row_index", TType::I64, 3))?; + o_prot.write_i64(self.first_row_index)?; + o_prot.write_field_end()?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// OffsetIndex +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct OffsetIndex { + /// PageLocations, ordered by increasing PageLocation.offset. It is required + /// that page_locations\[i\].first_row_index < page_locations\[i+1\].first_row_index. 
+ pub page_locations: Vec, +} + +impl OffsetIndex { + pub fn new(page_locations: Vec) -> OffsetIndex { + OffsetIndex { + page_locations, + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option> = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_7 = PageLocation::read_from_in_protocol(i_prot)?; + val.push(list_elem_7); + } + i_prot.read_list_end()?; + f_1 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("OffsetIndex.page_locations", &f_1)?; + let ret = OffsetIndex { + page_locations: f_1.expect("auto-generated code should have checked for presence of required fields"), + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("OffsetIndex"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("page_locations", TType::List, 1))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::Struct, self.page_locations.len() as i32))?; + for e in &self.page_locations { + e.write_to_out_protocol(o_prot)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// ColumnIndex +// + +/// Description for ColumnIndex. +/// Each \[i\] refers to the page at OffsetIndex.page_locations\[i\] +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct ColumnIndex { + /// A list of Boolean values to determine the validity of the corresponding + /// min and max values. If true, a page contains only null values, and writers + /// have to set the corresponding entries in min_values and max_values to + /// byte\[0\], so that all lists have the same length. If false, the + /// corresponding entries in min_values and max_values must be valid. + pub null_pages: Vec, + /// Two lists containing lower and upper bounds for the values of each page. + /// These may be the actual minimum and maximum values found on a page, but + /// can also be (more compact) values that do not exist on a page. For + /// example, instead of storing ""Blart Versenwald III", a writer may set + /// min_values\[i\]="B", max_values\[i\]="C". Such more compact values must still + /// be valid values within the column's logical type. Readers must make sure + /// that list entries are populated before using them by inspecting null_pages. + pub min_values: Vec>, + pub max_values: Vec>, + /// Stores whether both min_values and max_values are orderd and if so, in + /// which direction. This allows readers to perform binary searches in both + /// lists. Readers cannot assume that max_values\[i\] <= min_values\[i+1\], even + /// if the lists are ordered. 
+ pub boundary_order: BoundaryOrder, + /// A list containing the number of null values for each page * + pub null_counts: Option>, +} + +impl ColumnIndex { + pub fn new(null_pages: Vec, min_values: Vec>, max_values: Vec>, boundary_order: BoundaryOrder, null_counts: F5) -> ColumnIndex where F5: Into>> { + ColumnIndex { + null_pages, + min_values, + max_values, + boundary_order, + null_counts: null_counts.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option> = None; + let mut f_2: Option>> = None; + let mut f_3: Option>> = None; + let mut f_4: Option = None; + let mut f_5: Option> = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_8 = i_prot.read_bool()?; + val.push(list_elem_8); + } + i_prot.read_list_end()?; + f_1 = Some(val); + }, + 2 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec> = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_9 = i_prot.read_bytes()?; + val.push(list_elem_9); + } + i_prot.read_list_end()?; + f_2 = Some(val); + }, + 3 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec> = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_10 = i_prot.read_bytes()?; + val.push(list_elem_10); + } + i_prot.read_list_end()?; + f_3 = Some(val); + }, + 4 => { + let val = BoundaryOrder::read_from_in_protocol(i_prot)?; + f_4 = Some(val); + }, + 5 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_11 = i_prot.read_i64()?; + val.push(list_elem_11); + } + i_prot.read_list_end()?; + f_5 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("ColumnIndex.null_pages", &f_1)?; + verify_required_field_exists("ColumnIndex.min_values", &f_2)?; + verify_required_field_exists("ColumnIndex.max_values", &f_3)?; + verify_required_field_exists("ColumnIndex.boundary_order", &f_4)?; + let ret = ColumnIndex { + null_pages: f_1.expect("auto-generated code should have checked for presence of required fields"), + min_values: f_2.expect("auto-generated code should have checked for presence of required fields"), + max_values: f_3.expect("auto-generated code should have checked for presence of required fields"), + boundary_order: f_4.expect("auto-generated code should have checked for presence of required fields"), + null_counts: f_5, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("ColumnIndex"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("null_pages", TType::List, 1))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::Bool, self.null_pages.len() as i32))?; + for e in &self.null_pages { + o_prot.write_bool(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("min_values", TType::List, 2))?; + 
o_prot.write_list_begin(&TListIdentifier::new(TType::String, self.min_values.len() as i32))?; + for e in &self.min_values { + o_prot.write_bytes(e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("max_values", TType::List, 3))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::String, self.max_values.len() as i32))?; + for e in &self.max_values { + o_prot.write_bytes(e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("boundary_order", TType::I32, 4))?; + self.boundary_order.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + if let Some(ref fld_var) = self.null_counts { + o_prot.write_field_begin(&TFieldIdentifier::new("null_counts", TType::List, 5))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// AesGcmV1 +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct AesGcmV1 { + /// AAD prefix * + pub aad_prefix: Option>, + /// Unique file identifier part of AAD suffix * + pub aad_file_unique: Option>, + /// In files encrypted with AAD prefix without storing it, + /// readers must supply the prefix * + pub supply_aad_prefix: Option, +} + +impl AesGcmV1 { + pub fn new(aad_prefix: F1, aad_file_unique: F2, supply_aad_prefix: F3) -> AesGcmV1 where F1: Into>>, F2: Into>>, F3: Into> { + AesGcmV1 { + aad_prefix: aad_prefix.into(), + aad_file_unique: aad_file_unique.into(), + supply_aad_prefix: supply_aad_prefix.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option> = None; + let mut f_2: Option> = None; + let mut f_3: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_bytes()?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_bytes()?; + f_2 = Some(val); + }, + 3 => { + let val = i_prot.read_bool()?; + f_3 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = AesGcmV1 { + aad_prefix: f_1, + aad_file_unique: f_2, + supply_aad_prefix: f_3, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("AesGcmV1"); + o_prot.write_struct_begin(&struct_ident)?; + if let Some(ref fld_var) = self.aad_prefix { + o_prot.write_field_begin(&TFieldIdentifier::new("aad_prefix", TType::String, 1))?; + o_prot.write_bytes(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.aad_file_unique { + o_prot.write_field_begin(&TFieldIdentifier::new("aad_file_unique", TType::String, 2))?; + o_prot.write_bytes(fld_var)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.supply_aad_prefix { + o_prot.write_field_begin(&TFieldIdentifier::new("supply_aad_prefix", TType::Bool, 3))?; + o_prot.write_bool(fld_var)?; + o_prot.write_field_end()? 
+ } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for AesGcmV1 { + fn default() -> Self { + AesGcmV1{ + aad_prefix: Some(Vec::new()), + aad_file_unique: Some(Vec::new()), + supply_aad_prefix: Some(false), + } + } +} + +// +// AesGcmCtrV1 +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct AesGcmCtrV1 { + /// AAD prefix * + pub aad_prefix: Option>, + /// Unique file identifier part of AAD suffix * + pub aad_file_unique: Option>, + /// In files encrypted with AAD prefix without storing it, + /// readers must supply the prefix * + pub supply_aad_prefix: Option, +} + +impl AesGcmCtrV1 { + pub fn new(aad_prefix: F1, aad_file_unique: F2, supply_aad_prefix: F3) -> AesGcmCtrV1 where F1: Into>>, F2: Into>>, F3: Into> { + AesGcmCtrV1 { + aad_prefix: aad_prefix.into(), + aad_file_unique: aad_file_unique.into(), + supply_aad_prefix: supply_aad_prefix.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option> = None; + let mut f_2: Option> = None; + let mut f_3: Option = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_bytes()?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_bytes()?; + f_2 = Some(val); + }, + 3 => { + let val = i_prot.read_bool()?; + f_3 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = AesGcmCtrV1 { + aad_prefix: f_1, + aad_file_unique: f_2, + supply_aad_prefix: f_3, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("AesGcmCtrV1"); + o_prot.write_struct_begin(&struct_ident)?; + if let Some(ref fld_var) = self.aad_prefix { + o_prot.write_field_begin(&TFieldIdentifier::new("aad_prefix", TType::String, 1))?; + o_prot.write_bytes(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.aad_file_unique { + o_prot.write_field_begin(&TFieldIdentifier::new("aad_file_unique", TType::String, 2))?; + o_prot.write_bytes(fld_var)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.supply_aad_prefix { + o_prot.write_field_begin(&TFieldIdentifier::new("supply_aad_prefix", TType::Bool, 3))?; + o_prot.write_bool(fld_var)?; + o_prot.write_field_end()? 
+ } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +impl Default for AesGcmCtrV1 { + fn default() -> Self { + AesGcmCtrV1{ + aad_prefix: Some(Vec::new()), + aad_file_unique: Some(Vec::new()), + supply_aad_prefix: Some(false), + } + } +} + +// +// EncryptionAlgorithm +// + +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub enum EncryptionAlgorithm { + AESGCMV1(AesGcmV1), + AESGCMCTRV1(AesGcmCtrV1), +} + +impl EncryptionAlgorithm { + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + let mut ret: Option = None; + let mut received_field_count = 0; + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = AesGcmV1::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(EncryptionAlgorithm::AESGCMV1(val)); + } + received_field_count += 1; + }, + 2 => { + let val = AesGcmCtrV1::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(EncryptionAlgorithm::AESGCMCTRV1(val)); + } + received_field_count += 1; + }, + _ => { + i_prot.skip(field_ident.field_type)?; + received_field_count += 1; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + if received_field_count == 0 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received empty union from remote EncryptionAlgorithm" + ) + ) + ) + } else if received_field_count > 1 { + Err( + thrift::Error::Protocol( + ProtocolError::new( + ProtocolErrorKind::InvalidData, + "received multiple fields for union from remote EncryptionAlgorithm" + ) + ) + ) + } else { + Ok(ret.expect("return value should have been constructed")) + } + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("EncryptionAlgorithm"); + o_prot.write_struct_begin(&struct_ident)?; + match *self { + EncryptionAlgorithm::AESGCMV1(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("AES_GCM_V1", TType::Struct, 1))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + EncryptionAlgorithm::AESGCMCTRV1(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("AES_GCM_CTR_V1", TType::Struct, 2))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// FileMetaData +// + +/// Description for file metadata +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct FileMetaData { + /// Version of this file * + pub version: i32, + /// Parquet schema for this file. This schema contains metadata for all the columns. + /// The schema is represented as a tree with a single root. The nodes of the tree + /// are flattened to a list by doing a depth-first traversal. + /// The column metadata contains the path in the schema for that column which can be + /// used to map columns to nodes in the schema. + /// The first element is the root * + pub schema: Vec, + /// Number of rows in this file * + pub num_rows: i64, + /// Row groups in this file * + pub row_groups: Vec, + /// Optional key/value metadata * + pub key_value_metadata: Option>, + /// String for application that wrote this file. This should be in the format + /// version (build ). + /// e.g. 
impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) + /// + pub created_by: Option, + /// Sort order used for the min_value and max_value fields of each column in + /// this file. Sort orders are listed in the order matching the columns in the + /// schema. The indexes are not necessary the same though, because only leaf + /// nodes of the schema are represented in the list of sort orders. + /// + /// Without column_orders, the meaning of the min_value and max_value fields is + /// undefined. To ensure well-defined behaviour, if min_value and max_value are + /// written to a Parquet file, column_orders must be written as well. + /// + /// The obsolete min and max fields are always sorted by signed comparison + /// regardless of column_orders. + pub column_orders: Option>, + /// Encryption algorithm. This field is set only in encrypted files + /// with plaintext footer. Files with encrypted footer store algorithm id + /// in FileCryptoMetaData structure. + pub encryption_algorithm: Option, + /// Retrieval metadata of key used for signing the footer. + /// Used only in encrypted files with plaintext footer. + pub footer_signing_key_metadata: Option>, +} + +impl FileMetaData { + pub fn new(version: i32, schema: Vec, num_rows: i64, row_groups: Vec, key_value_metadata: F5, created_by: F6, column_orders: F7, encryption_algorithm: F8, footer_signing_key_metadata: F9) -> FileMetaData where F5: Into>>, F6: Into>, F7: Into>>, F8: Into>, F9: Into>> { + FileMetaData { + version, + schema, + num_rows, + row_groups, + key_value_metadata: key_value_metadata.into(), + created_by: created_by.into(), + column_orders: column_orders.into(), + encryption_algorithm: encryption_algorithm.into(), + footer_signing_key_metadata: footer_signing_key_metadata.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option> = None; + let mut f_3: Option = None; + let mut f_4: Option> = None; + let mut f_5: Option> = None; + let mut f_6: Option = None; + let mut f_7: Option> = None; + let mut f_8: Option = None; + let mut f_9: Option> = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i32()?; + f_1 = Some(val); + }, + 2 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_12 = SchemaElement::read_from_in_protocol(i_prot)?; + val.push(list_elem_12); + } + i_prot.read_list_end()?; + f_2 = Some(val); + }, + 3 => { + let val = i_prot.read_i64()?; + f_3 = Some(val); + }, + 4 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_13 = RowGroup::read_from_in_protocol(i_prot)?; + val.push(list_elem_13); + } + i_prot.read_list_end()?; + f_4 = Some(val); + }, + 5 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_14 = KeyValue::read_from_in_protocol(i_prot)?; + val.push(list_elem_14); + } + i_prot.read_list_end()?; + f_5 = Some(val); + }, + 6 => { + let val = i_prot.read_string()?; + f_6 = Some(val); + }, + 7 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = 
Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_15 = ColumnOrder::read_from_in_protocol(i_prot)?; + val.push(list_elem_15); + } + i_prot.read_list_end()?; + f_7 = Some(val); + }, + 8 => { + let val = EncryptionAlgorithm::read_from_in_protocol(i_prot)?; + f_8 = Some(val); + }, + 9 => { + let val = i_prot.read_bytes()?; + f_9 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("FileMetaData.version", &f_1)?; + verify_required_field_exists("FileMetaData.schema", &f_2)?; + verify_required_field_exists("FileMetaData.num_rows", &f_3)?; + verify_required_field_exists("FileMetaData.row_groups", &f_4)?; + let ret = FileMetaData { + version: f_1.expect("auto-generated code should have checked for presence of required fields"), + schema: f_2.expect("auto-generated code should have checked for presence of required fields"), + num_rows: f_3.expect("auto-generated code should have checked for presence of required fields"), + row_groups: f_4.expect("auto-generated code should have checked for presence of required fields"), + key_value_metadata: f_5, + created_by: f_6, + column_orders: f_7, + encryption_algorithm: f_8, + footer_signing_key_metadata: f_9, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("FileMetaData"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("version", TType::I32, 1))?; + o_prot.write_i32(self.version)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("schema", TType::List, 2))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::Struct, self.schema.len() as i32))?; + for e in &self.schema { + e.write_to_out_protocol(o_prot)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("num_rows", TType::I64, 3))?; + o_prot.write_i64(self.num_rows)?; + o_prot.write_field_end()?; + o_prot.write_field_begin(&TFieldIdentifier::new("row_groups", TType::List, 4))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::Struct, self.row_groups.len() as i32))?; + for e in &self.row_groups { + e.write_to_out_protocol(o_prot)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()?; + if let Some(ref fld_var) = self.key_value_metadata { + o_prot.write_field_begin(&TFieldIdentifier::new("key_value_metadata", TType::List, 5))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::Struct, fld_var.len() as i32))?; + for e in fld_var { + e.write_to_out_protocol(o_prot)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.created_by { + o_prot.write_field_begin(&TFieldIdentifier::new("created_by", TType::String, 6))?; + o_prot.write_string(fld_var)?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.column_orders { + o_prot.write_field_begin(&TFieldIdentifier::new("column_orders", TType::List, 7))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::Struct, fld_var.len() as i32))?; + for e in fld_var { + e.write_to_out_protocol(o_prot)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.encryption_algorithm { + o_prot.write_field_begin(&TFieldIdentifier::new("encryption_algorithm", TType::Struct, 8))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? 
+ } + if let Some(ref fld_var) = self.footer_signing_key_metadata { + o_prot.write_field_begin(&TFieldIdentifier::new("footer_signing_key_metadata", TType::String, 9))?; + o_prot.write_bytes(fld_var)?; + o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + +// +// FileCryptoMetaData +// + +/// Crypto metadata for files with encrypted footer * +#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct FileCryptoMetaData { + /// Encryption algorithm. This field is only used for files + /// with encrypted footer. Files with plaintext footer store algorithm id + /// inside footer (FileMetaData structure). + pub encryption_algorithm: EncryptionAlgorithm, + /// Retrieval metadata of key used for encryption of footer, + /// and (possibly) columns * + pub key_metadata: Option>, +} + +impl FileCryptoMetaData { + pub fn new(encryption_algorithm: EncryptionAlgorithm, key_metadata: F2) -> FileCryptoMetaData where F2: Into>> { + FileCryptoMetaData { + encryption_algorithm, + key_metadata: key_metadata.into(), + } + } + pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option> = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = EncryptionAlgorithm::read_from_in_protocol(i_prot)?; + f_1 = Some(val); + }, + 2 => { + let val = i_prot.read_bytes()?; + f_2 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + verify_required_field_exists("FileCryptoMetaData.encryption_algorithm", &f_1)?; + let ret = FileCryptoMetaData { + encryption_algorithm: f_1.expect("auto-generated code should have checked for presence of required fields"), + key_metadata: f_2, + }; + Ok(ret) + } + pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("FileCryptoMetaData"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_begin(&TFieldIdentifier::new("encryption_algorithm", TType::Struct, 1))?; + self.encryption_algorithm.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + if let Some(ref fld_var) = self.key_metadata { + o_prot.write_field_begin(&TFieldIdentifier::new("key_metadata", TType::String, 2))?; + o_prot.write_bytes(fld_var)?; + o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 90fe399e78d7..b34d9aa8ae83 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -61,6 +61,11 @@ macro_rules! experimental { pub mod errors; pub mod basic; +/// Automatically generated code for reading parquet thrift definition. 
+// see parquet/CONTRIBUTING.md for instructions on regenerating +#[allow(clippy::derivable_impls, clippy::match_single_binding)] +pub mod format; + #[macro_use] pub mod data_type; diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 823803167ca1..efb0b82b3230 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -19,7 +19,7 @@ use std::{collections::HashMap, convert::From, fmt, sync::Arc}; -use parquet_format::SchemaElement; +use crate::format::SchemaElement; use crate::basic::{ ConvertedType, LogicalType, Repetition, TimeUnit, Type as PhysicalType, @@ -1042,7 +1042,7 @@ fn from_thrift_helper( )); } let element = &elements[index]; - let converted_type = ConvertedType::from(element.converted_type); + let converted_type = ConvertedType::try_from(element.converted_type)?; // LogicalType is only present in v2 Parquet files. ConvertedType is always // populated, regardless of the version of the file (v1 or v2). let logical_type = element @@ -1063,8 +1063,9 @@ fn from_thrift_helper( "Repetition level must be defined for a primitive type" )); } - let repetition = Repetition::from(elements[index].repetition_type.unwrap()); - let physical_type = PhysicalType::from(elements[index].type_.unwrap()); + let repetition = + Repetition::try_from(elements[index].repetition_type.unwrap())?; + let physical_type = PhysicalType::try_from(elements[index].type_.unwrap())?; let length = elements[index].type_length.unwrap_or(-1); let scale = elements[index].scale.unwrap_or(-1); let precision = elements[index].precision.unwrap_or(-1); @@ -1082,7 +1083,11 @@ fn from_thrift_helper( Ok((index + 1, Arc::new(builder.build()?))) } Some(n) => { - let repetition = elements[index].repetition_type.map(Repetition::from); + let repetition = elements[index] + .repetition_type + .map(Repetition::try_from) + .transpose()?; + let mut fields = vec![]; let mut next_index = index + 1; for _ in 0..n { From 2d28010ad2691bfdd7429f98848f4be32538bd6f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 11 Sep 2022 07:34:12 +0100 Subject: [PATCH 0035/1411] Add try_unary, binary, try_binary kernels (#2666) --- arrow/benches/arithmetic_kernels.rs | 143 +++++++------------ arrow/src/array/iterator.rs | 47 +++++- arrow/src/compute/kernels/arithmetic.rs | 182 ++++++------------------ arrow/src/compute/kernels/arity.rs | 178 ++++++++++++++++++++--- arrow/src/util/bit_iterator.rs | 42 ++++++ 5 files changed, 336 insertions(+), 256 deletions(-) diff --git a/arrow/benches/arithmetic_kernels.rs b/arrow/benches/arithmetic_kernels.rs index 10af0b5432ef..2aa2e7191a68 100644 --- a/arrow/benches/arithmetic_kernels.rs +++ b/arrow/benches/arithmetic_kernels.rs @@ -20,107 +20,62 @@ extern crate criterion; use criterion::Criterion; use rand::Rng; -use std::sync::Arc; - extern crate arrow; +use arrow::datatypes::Float32Type; use arrow::util::bench_util::*; -use arrow::{array::*, datatypes::Float32Type}; use arrow::{compute::kernels::arithmetic::*, util::test_util::seedable_rng}; -fn create_array(size: usize, with_nulls: bool) -> ArrayRef { - let null_density = if with_nulls { 0.5 } else { 0.0 }; - let array = create_primitive_array::(size, null_density); - Arc::new(array) -} - -fn bench_add(arr_a: &ArrayRef, arr_b: &ArrayRef) { - let arr_a = arr_a.as_any().downcast_ref::().unwrap(); - let arr_b = arr_b.as_any().downcast_ref::().unwrap(); - criterion::black_box(add(arr_a, arr_b).unwrap()); -} - -fn bench_subtract(arr_a: &ArrayRef, arr_b: &ArrayRef) { - let 
arr_a = arr_a.as_any().downcast_ref::().unwrap(); - let arr_b = arr_b.as_any().downcast_ref::().unwrap(); - criterion::black_box(subtract(arr_a, arr_b).unwrap()); -} - -fn bench_multiply(arr_a: &ArrayRef, arr_b: &ArrayRef) { - let arr_a = arr_a.as_any().downcast_ref::().unwrap(); - let arr_b = arr_b.as_any().downcast_ref::().unwrap(); - criterion::black_box(multiply(arr_a, arr_b).unwrap()); -} - -fn bench_divide(arr_a: &ArrayRef, arr_b: &ArrayRef) { - let arr_a = arr_a.as_any().downcast_ref::().unwrap(); - let arr_b = arr_b.as_any().downcast_ref::().unwrap(); - criterion::black_box(divide_checked(arr_a, arr_b).unwrap()); -} - -fn bench_divide_unchecked(arr_a: &ArrayRef, arr_b: &ArrayRef) { - let arr_a = arr_a.as_any().downcast_ref::().unwrap(); - let arr_b = arr_b.as_any().downcast_ref::().unwrap(); - criterion::black_box(divide(arr_a, arr_b).unwrap()); -} - -fn bench_divide_scalar(array: &ArrayRef, divisor: f32) { - let array = array.as_any().downcast_ref::().unwrap(); - criterion::black_box(divide_scalar(array, divisor).unwrap()); -} - -fn bench_modulo(arr_a: &ArrayRef, arr_b: &ArrayRef) { - let arr_a = arr_a.as_any().downcast_ref::().unwrap(); - let arr_b = arr_b.as_any().downcast_ref::().unwrap(); - criterion::black_box(modulus(arr_a, arr_b).unwrap()); -} - -fn bench_modulo_scalar(array: &ArrayRef, divisor: f32) { - let array = array.as_any().downcast_ref::().unwrap(); - criterion::black_box(modulus_scalar(array, divisor).unwrap()); -} - fn add_benchmark(c: &mut Criterion) { const BATCH_SIZE: usize = 64 * 1024; - let arr_a = create_array(BATCH_SIZE, false); - let arr_b = create_array(BATCH_SIZE, false); - let scalar = seedable_rng().gen(); - - c.bench_function("add", |b| b.iter(|| bench_add(&arr_a, &arr_b))); - c.bench_function("subtract", |b| b.iter(|| bench_subtract(&arr_a, &arr_b))); - c.bench_function("multiply", |b| b.iter(|| bench_multiply(&arr_a, &arr_b))); - c.bench_function("divide", |b| b.iter(|| bench_divide(&arr_a, &arr_b))); - c.bench_function("divide_unchecked", |b| { - b.iter(|| bench_divide_unchecked(&arr_a, &arr_b)) - }); - c.bench_function("divide_scalar", |b| { - b.iter(|| bench_divide_scalar(&arr_a, scalar)) - }); - c.bench_function("modulo", |b| b.iter(|| bench_modulo(&arr_a, &arr_b))); - c.bench_function("modulo_scalar", |b| { - b.iter(|| bench_modulo_scalar(&arr_a, scalar)) - }); - - let arr_a_nulls = create_array(BATCH_SIZE, true); - let arr_b_nulls = create_array(BATCH_SIZE, true); - c.bench_function("add_nulls", |b| { - b.iter(|| bench_add(&arr_a_nulls, &arr_b_nulls)) - }); - c.bench_function("divide_nulls", |b| { - b.iter(|| bench_divide(&arr_a_nulls, &arr_b_nulls)) - }); - c.bench_function("divide_nulls_unchecked", |b| { - b.iter(|| bench_divide_unchecked(&arr_a_nulls, &arr_b_nulls)) - }); - c.bench_function("divide_scalar_nulls", |b| { - b.iter(|| bench_divide_scalar(&arr_a_nulls, scalar)) - }); - c.bench_function("modulo_nulls", |b| { - b.iter(|| bench_modulo(&arr_a_nulls, &arr_b_nulls)) - }); - c.bench_function("modulo_scalar_nulls", |b| { - b.iter(|| bench_modulo_scalar(&arr_a_nulls, scalar)) - }); + for null_density in [0., 0.1, 0.5, 0.9, 1.0] { + let arr_a = create_primitive_array::(BATCH_SIZE, null_density); + let arr_b = create_primitive_array::(BATCH_SIZE, null_density); + let scalar = seedable_rng().gen(); + + c.bench_function(&format!("add({})", null_density), |b| { + b.iter(|| criterion::black_box(add(&arr_a, &arr_b).unwrap())) + }); + c.bench_function(&format!("add_checked({})", null_density), |b| { + b.iter(|| 
criterion::black_box(add_checked(&arr_a, &arr_b).unwrap())) + }); + c.bench_function(&format!("add_scalar({})", null_density), |b| { + b.iter(|| criterion::black_box(add_scalar(&arr_a, scalar).unwrap())) + }); + c.bench_function(&format!("subtract({})", null_density), |b| { + b.iter(|| criterion::black_box(subtract(&arr_a, &arr_b).unwrap())) + }); + c.bench_function(&format!("subtract_checked({})", null_density), |b| { + b.iter(|| criterion::black_box(subtract_checked(&arr_a, &arr_b).unwrap())) + }); + c.bench_function(&format!("subtract_scalar({})", null_density), |b| { + b.iter(|| criterion::black_box(subtract_scalar(&arr_a, scalar).unwrap())) + }); + c.bench_function(&format!("multiply({})", null_density), |b| { + b.iter(|| criterion::black_box(multiply(&arr_a, &arr_b).unwrap())) + }); + c.bench_function(&format!("multiply_checked({})", null_density), |b| { + b.iter(|| criterion::black_box(multiply_checked(&arr_a, &arr_b).unwrap())) + }); + c.bench_function(&format!("multiply_scalar({})", null_density), |b| { + b.iter(|| criterion::black_box(multiply_scalar(&arr_a, scalar).unwrap())) + }); + c.bench_function(&format!("divide({})", null_density), |b| { + b.iter(|| criterion::black_box(divide(&arr_a, &arr_b).unwrap())) + }); + c.bench_function(&format!("divide_checked({})", null_density), |b| { + b.iter(|| criterion::black_box(divide_checked(&arr_a, &arr_b).unwrap())) + }); + c.bench_function(&format!("divide_scalar({})", null_density), |b| { + b.iter(|| criterion::black_box(divide_scalar(&arr_a, scalar).unwrap())) + }); + c.bench_function(&format!("modulo({})", null_density), |b| { + b.iter(|| criterion::black_box(modulus(&arr_a, &arr_b).unwrap())) + }); + c.bench_function(&format!("modulo_scalar({})", null_density), |b| { + b.iter(|| criterion::black_box(modulus_scalar(&arr_a, scalar).unwrap())) + }); + } } criterion_group!(benches, add_benchmark); diff --git a/arrow/src/array/iterator.rs b/arrow/src/array/iterator.rs index 4269e99625b7..e64712fa883a 100644 --- a/arrow/src/array/iterator.rs +++ b/arrow/src/array/iterator.rs @@ -24,8 +24,51 @@ use super::{ PrimitiveArray, }; -/// an iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`] -// Note: This implementation is based on std's [Vec]s' [IntoIter]. +/// An iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`] +/// +/// # Performance +/// +/// [`ArrayIter`] provides an idiomatic way to iterate over an array, however, this +/// comes at the cost of performance. In particular the interleaved handling of +/// the null mask is often sub-optimal. +/// +/// If performing an infallible operation, it is typically faster to perform the operation +/// on every index of the array, and handle the null mask separately. For [`PrimitiveArray`] +/// this functionality is provided by [`compute::unary`] +/// +/// ``` +/// # use arrow::array::PrimitiveArray; +/// # use arrow::compute::unary; +/// # use arrow::datatypes::Int32Type; +/// +/// fn add(a: &PrimitiveArray, b: i32) -> PrimitiveArray { +/// unary(a, |a| a + b) +/// } +/// ``` +/// +/// If performing a fallible operation, it isn't possible to perform the operation independently +/// of the null mask, as this might result in a spurious failure on a null index. 
However, +/// there are more efficient ways to iterate over just the non-null indices, this functionality +/// is provided by [`compute::try_unary`] +/// +/// ``` +/// # use arrow::array::PrimitiveArray; +/// # use arrow::compute::try_unary; +/// # use arrow::datatypes::Int32Type; +/// # use arrow::error::{ArrowError, Result}; +/// +/// fn checked_add(a: &PrimitiveArray, b: i32) -> Result> { +/// try_unary(a, |a| { +/// a.checked_add(b).ok_or_else(|| { +/// ArrowError::CastError(format!("overflow adding {} to {}", a, b)) +/// }) +/// }) +/// } +/// ``` +/// +/// [`PrimitiveArray`]: [crate::array::PrimitiveArray] +/// [`compute::unary`]: [crate::compute::unary] +/// [`compute::try_unary`]: [crate::compute::try_unary] #[derive(Debug)] pub struct ArrayIter { array: T, diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index 9bf4b00c3132..17850f2a8cff 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -31,12 +31,11 @@ use crate::buffer::Buffer; #[cfg(feature = "simd")] use crate::buffer::MutableBuffer; use crate::compute::kernels::arity::unary; -use crate::compute::unary_dyn; use crate::compute::util::combine_option_bitmap; +use crate::compute::{binary, try_binary, unary_dyn}; use crate::datatypes::{ - native_op::ArrowNativeTypeOp, ArrowNumericType, ArrowPrimitiveType, DataType, - Date32Type, Date64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, - IntervalYearMonthType, + native_op::ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, + IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, }; use crate::datatypes::{ Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, @@ -74,33 +73,7 @@ where )); } - let null_bit_buffer = - combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len())?; - - let values = left - .values() - .iter() - .zip(right.values().iter()) - .map(|(l, r)| op(*l, *r)); - // JUSTIFICATION - // Benefit - // ~60% speedup - // Soundness - // `values` is an iterator with a known size from a PrimitiveArray - let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - - let data = unsafe { - ArrayData::new_unchecked( - LT::DATA_TYPE, - left.len(), - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ) - }; - Ok(PrimitiveArray::::from(data)) + Ok(binary(left, right, op)) } /// This is similar to `math_op` as it performs given operation between two input primitive arrays. @@ -122,85 +95,11 @@ where )); } - let left_iter = ArrayIter::new(left); - let right_iter = ArrayIter::new(right); - - let values: Result::Native>>> = left_iter - .into_iter() - .zip(right_iter.into_iter()) - .map(|(l, r)| { - if let (Some(l), Some(r)) = (l, r) { - let result = op(l, r); - if let Some(r) = result { - Ok(Some(r)) - } else { - // Overflow - Err(ArrowError::ComputeError(format!( - "Overflow happened on: {:?}, {:?}", - l, r - ))) - } - } else { - Ok(None) - } - }) - .collect(); - - let values = values?; - - Ok(PrimitiveArray::::from_iter(values)) -} - -/// This is similar to `math_checked_op` but just for divide op. 
-fn math_checked_divide( - left: &PrimitiveArray, - right: &PrimitiveArray, - op: F, -) -> Result> -where - LT: ArrowNumericType, - RT: ArrowNumericType, - RT::Native: One + Zero, - F: Fn(LT::Native, RT::Native) -> Option, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform math operation on arrays of different length".to_string(), - )); - } - - let left_iter = ArrayIter::new(left); - let right_iter = ArrayIter::new(right); - - let values: Result::Native>>> = left_iter - .into_iter() - .zip(right_iter.into_iter()) - .map(|(l, r)| { - if let (Some(l), Some(r)) = (l, r) { - let result = op(l, r); - if let Some(r) = result { - Ok(Some(r)) - } else if r.is_zero() { - Err(ArrowError::ComputeError(format!( - "DivideByZero on: {:?}, {:?}", - l, r - ))) - } else { - // Overflow - Err(ArrowError::ComputeError(format!( - "Overflow happened on: {:?}, {:?}", - l, r - ))) - } - } else { - Ok(None) - } + try_binary(left, right, |a, b| { + op(a, b).ok_or_else(|| { + ArrowError::ComputeError(format!("Overflow happened on: {:?}, {:?}", a, b)) }) - .collect(); - - let values = values?; - - Ok(PrimitiveArray::::from_iter(values)) + }) } /// Helper function for operations where a valid `0` on the right array should @@ -211,15 +110,16 @@ where /// This function errors if: /// * the arrays have different lengths /// * there is an element where both left and right values are valid and the right value is `0` -fn math_checked_divide_op( - left: &PrimitiveArray, - right: &PrimitiveArray, +fn math_checked_divide_op( + left: &PrimitiveArray, + right: &PrimitiveArray, op: F, -) -> Result> +) -> Result> where - T: ArrowNumericType, - T::Native: One + Zero, - F: Fn(T::Native, T::Native) -> T::Native, + LT: ArrowNumericType, + RT: ArrowNumericType, + RT::Native: One + Zero, + F: Fn(LT::Native, RT::Native) -> Option, { if left.len() != right.len() { return Err(ArrowError::ComputeError( @@ -227,16 +127,18 @@ where )); } - let null_bit_buffer = - combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len())?; - - math_checked_divide_op_on_iters( - left.into_iter(), - right.into_iter(), - op, - left.len(), - null_bit_buffer, - ) + try_binary(left, right, |l, r| { + if r.is_zero() { + Err(ArrowError::DivideByZero) + } else { + op(l, r).ok_or_else(|| { + ArrowError::ComputeError(format!( + "Overflow happened on: {:?}, {:?}", + l, r + )) + }) + } + }) } /// Helper function for operations where a valid `0` on the right array should @@ -900,7 +802,7 @@ pub fn add_scalar( scalar: T::Native, ) -> Result> where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: Add, { Ok(unary(array, |value| value + scalar)) @@ -911,7 +813,7 @@ where /// the scalar, or a `DictionaryArray` of the value type same as the scalar. 
pub fn add_scalar_dyn(array: &dyn Array, scalar: T::Native) -> Result where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: Add, { unary_dyn::<_, T>(array, |value| value + scalar) @@ -927,7 +829,7 @@ pub fn subtract( right: &PrimitiveArray, ) -> Result> where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { math_op(left, right, |a, b| a.sub_wrapping(b)) @@ -943,7 +845,7 @@ pub fn subtract_checked( right: &PrimitiveArray, ) -> Result> where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { math_checked_op(left, right, |a, b| a.sub_checked(b)) @@ -1033,7 +935,7 @@ pub fn multiply( right: &PrimitiveArray, ) -> Result> where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { math_op(left, right, |a, b| a.mul_wrapping(b)) @@ -1049,7 +951,7 @@ pub fn multiply_checked( right: &PrimitiveArray, ) -> Result> where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { math_checked_op(left, right, |a, b| a.mul_checked(b)) @@ -1100,7 +1002,7 @@ where /// the scalar, or a `DictionaryArray` of the value type same as the scalar. pub fn multiply_scalar_dyn(array: &dyn Array, scalar: T::Native) -> Result where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: Add + Sub + Mul @@ -1120,7 +1022,7 @@ pub fn modulus( right: &PrimitiveArray, ) -> Result> where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: Rem + Zero + One, { #[cfg(feature = "simd")] @@ -1128,7 +1030,7 @@ where a % b }); #[cfg(not(feature = "simd"))] - return math_checked_divide_op(left, right, |a, b| a % b); + return math_checked_divide_op(left, right, |a, b| Some(a % b)); } /// Perform `left / right` operation on two arrays. If either left or right value is null @@ -1148,7 +1050,7 @@ where #[cfg(feature = "simd")] return simd_checked_divide_op(&left, &right, simd_checked_divide::, |a, b| a / b); #[cfg(not(feature = "simd"))] - return math_checked_divide(left, right, |a, b| a.div_checked(b)); + return math_checked_divide_op(left, right, |a, b| a.div_checked(b)); } /// Perform `left / right` operation on two arrays. If either left or right value is null @@ -1162,7 +1064,7 @@ pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result { _ => { downcast_primitive_array!( (left, right) => { - math_checked_divide_op(left, right, |a, b| a / b).map(|a| Arc::new(a) as ArrayRef) + math_checked_divide_op(left, right, |a, b| Some(a / b)).map(|a| Arc::new(a) as ArrayRef) } _ => Err(ArrowError::CastError(format!( "Unsupported data type {}, {}", @@ -1199,7 +1101,7 @@ pub fn modulus_scalar( modulo: T::Native, ) -> Result> where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: Rem + Zero, { if modulo.is_zero() { @@ -1217,7 +1119,7 @@ pub fn divide_scalar( divisor: T::Native, ) -> Result> where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: Div + Zero, { if divisor.is_zero() { @@ -1232,7 +1134,7 @@ where /// same as the scalar, or a `DictionaryArray` of the value type same as the scalar. pub fn divide_scalar_dyn(array: &dyn Array, divisor: T::Native) -> Result where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: Div + Zero, { if divisor.is_zero() { diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index 1251baf52fd8..ee3ff5e23a83 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -17,37 +17,41 @@ //! 
Defines kernels suitable to perform operations to primitive arrays. -use crate::array::{Array, ArrayData, ArrayRef, DictionaryArray, PrimitiveArray}; +use crate::array::{ + Array, ArrayData, ArrayRef, BufferBuilder, DictionaryArray, PrimitiveArray, +}; use crate::buffer::Buffer; +use crate::compute::util::combine_option_bitmap; use crate::datatypes::{ArrowNumericType, ArrowPrimitiveType}; use crate::downcast_dictionary_array; use crate::error::{ArrowError, Result}; +use crate::util::bit_iterator::try_for_each_valid_idx; use std::sync::Arc; #[inline] -fn into_primitive_array_data( - array: &PrimitiveArray, +unsafe fn build_primitive_array( + len: usize, buffer: Buffer, -) -> ArrayData { - let data = array.data(); - unsafe { - ArrayData::new_unchecked( - O::DATA_TYPE, - array.len(), - Some(data.null_count()), - data.null_buffer() - .map(|b| b.bit_slice(array.offset(), array.len())), - 0, - vec![buffer], - vec![], - ) - } + null_count: usize, + null_buffer: Option, +) -> PrimitiveArray { + PrimitiveArray::from(ArrayData::new_unchecked( + O::DATA_TYPE, + len, + Some(null_count), + null_buffer, + 0, + vec![buffer], + vec![], + )) } /// Applies an unary and infallible function to a primitive array. /// This is the fastest way to perform an operation on a primitive array when -/// the benefits of a vectorized operation outweights the cost of branching nulls and non-nulls. +/// the benefits of a vectorized operation outweigh the cost of branching nulls and non-nulls. +/// /// # Implementation +/// /// This will apply the function for all values, including those on null slots. /// This implies that the operation must be infallible for any value of the corresponding type /// or this function may panic. @@ -68,6 +72,14 @@ where O: ArrowPrimitiveType, F: Fn(I::Native) -> O::Native, { + let data = array.data(); + let len = data.len(); + let null_count = data.null_count(); + + let null_buffer = data + .null_buffer() + .map(|b| b.bit_slice(data.offset(), data.len())); + let values = array.values().iter().map(|v| op(*v)); // JUSTIFICATION // Benefit @@ -75,9 +87,40 @@ where // Soundness // `values` is an iterator with a known size because arrays are sized. let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; + unsafe { build_primitive_array(len, buffer, null_count, null_buffer) } +} + +/// Applies a unary and fallible function to all valid values in a primitive array +/// +/// This is unlike [`unary`] which will apply an infallible function to all rows regardless +/// of validity, in many cases this will be significantly faster and should be preferred +/// if `op` is infallible. +/// +/// Note: LLVM is currently unable to effectively vectorize fallible operations +pub fn try_unary(array: &PrimitiveArray, op: F) -> Result> +where + I: ArrowPrimitiveType, + O: ArrowPrimitiveType, + F: Fn(I::Native) -> Result, +{ + let len = array.len(); + let null_count = array.null_count(); + + let mut buffer = BufferBuilder::::new(len); + buffer.append_n_zeroed(array.len()); + let slice = buffer.as_slice_mut(); + + let null_buffer = array + .data_ref() + .null_buffer() + .map(|b| b.bit_slice(array.offset(), array.len())); - let data = into_primitive_array_data::<_, O>(array, buffer); - PrimitiveArray::::from(data) + try_for_each_valid_idx(array.len(), 0, null_count, null_buffer.as_deref(), |idx| { + unsafe { *slice.get_unchecked_mut(idx) = op(array.value_unchecked(idx))? 
}; + Ok::<_, ArrowError>(()) + })?; + + Ok(unsafe { build_primitive_array(len, buffer.finish(), null_count, null_buffer) }) } /// A helper function that applies an unary function to a dictionary array with primitive value type. @@ -119,6 +162,101 @@ where } } +/// Given two arrays of length `len`, calls `op(a[i], b[i])` for `i` in `0..len`, collecting +/// the results in a [`PrimitiveArray`]. If any index is null in either `a` or `b`, the +/// corresponding index in the result will also be null +/// +/// Like [`unary`] the provided function is evaluated for every index, ignoring validity. This +/// is beneficial when the cost of the operation is low compared to the cost of branching, and +/// especially when the operation can be vectorised, however, requires `op` to be infallible +/// for all possible values of its inputs +/// +/// # Panic +/// +/// Panics if the arrays have different lengths +pub fn binary( + a: &PrimitiveArray, + b: &PrimitiveArray, + op: F, +) -> PrimitiveArray +where + A: ArrowPrimitiveType, + B: ArrowPrimitiveType, + O: ArrowPrimitiveType, + F: Fn(A::Native, B::Native) -> O::Native, +{ + assert_eq!(a.len(), b.len()); + let len = a.len(); + + if a.is_empty() { + return PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)); + } + + let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); + let null_count = null_buffer + .as_ref() + .map(|x| len - x.count_set_bits()) + .unwrap_or_default(); + + let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r)); + // JUSTIFICATION + // Benefit + // ~60% speedup + // Soundness + // `values` is an iterator with a known size from a PrimitiveArray + let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; + + unsafe { build_primitive_array(len, buffer, null_count, null_buffer) } +} + +/// Applies the provided fallible binary operation across `a` and `b`, returning any error, +/// and collecting the results into a [`PrimitiveArray`]. If any index is null in either `a` +/// or `b`, the corresponding index in the result will also be null +/// +/// Like [`try_unary`] the function is only evaluated for non-null indices +/// +/// # Panic +/// +/// Panics if the arrays have different lengths +pub fn try_binary( + a: &PrimitiveArray, + b: &PrimitiveArray, + op: F, +) -> Result> +where + A: ArrowPrimitiveType, + B: ArrowPrimitiveType, + O: ArrowPrimitiveType, + F: Fn(A::Native, B::Native) -> Result, +{ + assert_eq!(a.len(), b.len()); + let len = a.len(); + + if a.is_empty() { + return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); + } + + let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); + let null_count = null_buffer + .as_ref() + .map(|x| len - x.count_set_bits()) + .unwrap_or_default(); + + let mut buffer = BufferBuilder::::new(len); + buffer.append_n_zeroed(len); + let slice = buffer.as_slice_mut(); + + try_for_each_valid_idx(len, 0, null_count, null_buffer.as_deref(), |idx| { + unsafe { + *slice.get_unchecked_mut(idx) = + op(a.value_unchecked(idx), b.value_unchecked(idx))? + }; + Ok::<_, ArrowError>(()) + })?; + + Ok(unsafe { build_primitive_array(len, buffer.finish(), null_count, null_buffer) }) +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow/src/util/bit_iterator.rs b/arrow/src/util/bit_iterator.rs index bba9dac60a4b..ceefaa860cb1 100644 --- a/arrow/src/util/bit_iterator.rs +++ b/arrow/src/util/bit_iterator.rs @@ -16,6 +16,7 @@ // under the License. 
use crate::util::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator}; +use std::result::Result; /// Iterator of contiguous ranges of set bits within a provided packed bitmask /// @@ -157,4 +158,45 @@ impl<'a> Iterator for BitIndexIterator<'a> { } } +/// Calls the provided closure for each index in the provided null mask that is set, +/// using an adaptive strategy based on the null count +/// +/// Ideally this would be encapsulated in an [`Iterator`] that would determine the optimal +/// strategy up front, and then yield indexes based on this. +/// +/// Unfortunately, external iteration based on the resulting [`Iterator`] would match the strategy +/// variant on each call to [`Iterator::next`], and LLVM generally cannot eliminate this. +/// +/// One solution to this might be internal iteration, e.g. [`Iterator::try_fold`], however, +/// it is currently [not possible] to override this for custom iterators in stable Rust. +/// +/// As such this is the next best option +/// +/// [not possible]: https://github.com/rust-lang/rust/issues/69595 +#[inline] +pub fn try_for_each_valid_idx Result<(), E>>( + len: usize, + offset: usize, + null_count: usize, + nulls: Option<&[u8]>, + f: F, +) -> Result<(), E> { + let valid_count = len - null_count; + + if valid_count == len { + (0..len).try_for_each(f) + } else if null_count != len { + let selectivity = valid_count as f64 / len as f64; + if selectivity > 0.8 { + BitSliceIterator::new(nulls.unwrap(), offset, len) + .flat_map(|(start, end)| start..end) + .try_for_each(f) + } else { + BitIndexIterator::new(nulls.unwrap(), offset, len).try_for_each(f) + } + } else { + Ok(()) + } +} + // Note: tests located in filter module From 8206f013363addad1bf37b2ff96aa44003d70f22 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 11 Sep 2022 07:34:51 +0100 Subject: [PATCH 0036/1411] Verify valid UTF-8 when converting byte array (#2205) (#2686) * Verify valid UTF-8 when converting byte array (#2205) * Add doc comment --- arrow/src/array/array_string.rs | 15 +++++++++------ arrow/src/array/builder/generic_string_builder.rs | 12 ++++++++++-- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index 62743a20a119..f3ecaa2d5591 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -129,8 +129,13 @@ impl GenericStringArray { } /// Convert a list array to a string array. - /// This method is unsound because it does - /// not check the utf-8 validation for each element. 
+ /// + /// Note: this performs potentially expensive UTF-8 validation, consider using + /// [`StringBuilder`][crate::array::StringBuilder] to avoid this + /// + /// # Panics + /// + /// This method panics if the array contains non-UTF-8 data fn from_list(v: GenericListArray) -> Self { assert_eq!( v.data_ref().child_data().len(), @@ -164,8 +169,7 @@ impl GenericStringArray { .add_buffer(child_data.buffers()[0].slice(child_data.offset())) .null_bit_buffer(v.data().null_buffer().cloned()); - let array_data = unsafe { builder.build_unchecked() }; - Self::from(array_data) + Self::from(builder.build().unwrap()) } /// Creates a [`GenericStringArray`] based on an iterator of values without nulls @@ -352,8 +356,7 @@ impl From> { fn from(v: GenericBinaryArray) -> Self { let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE); - let data = unsafe { builder.build_unchecked() }; - Self::from(data) + Self::from(builder.build().unwrap()) } } diff --git a/arrow/src/array/builder/generic_string_builder.rs b/arrow/src/array/builder/generic_string_builder.rs index 8f69f5d9c7be..0dd6440a2652 100644 --- a/arrow/src/array/builder/generic_string_builder.rs +++ b/arrow/src/array/builder/generic_string_builder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{ArrayBuilder, ArrayRef, GenericStringArray, OffsetSizeTrait}; +use crate::array::{Array, ArrayBuilder, ArrayRef, GenericStringArray, OffsetSizeTrait}; use std::any::Any; use std::sync::Arc; @@ -67,7 +67,15 @@ impl GenericStringBuilder { /// Builds the [`GenericStringArray`] and reset this builder. pub fn finish(&mut self) -> GenericStringArray { - GenericStringArray::::from(self.builder.finish()) + let t = GenericStringArray::::DATA_TYPE; + let v = self.builder.finish(); + let builder = v.into_data().into_builder().data_type(t); + + // SAFETY: + // Data must be UTF-8 as only support writing `str` + // Offsets must be valid as guaranteed by `GenericBinaryBuilder` + let data = unsafe { builder.build_unchecked() }; + data.into() } /// Returns the current values buffer as a slice From e646ae86b345906c9ac76a61afcdc0aa71fded48 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sun, 11 Sep 2022 14:15:21 +0200 Subject: [PATCH 0037/1411] Clarify docs of binary and string builders (#2699) * Clarify docs of binary and string builders * Improve doc of with capacity based on review feedback --- .../array/builder/generic_binary_builder.rs | 11 +++++++---- .../array/builder/generic_string_builder.rs | 19 +++++++++++-------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/arrow/src/array/builder/generic_binary_builder.rs b/arrow/src/array/builder/generic_binary_builder.rs index 26501ba099da..7f83a945343a 100644 --- a/arrow/src/array/builder/generic_binary_builder.rs +++ b/arrow/src/array/builder/generic_binary_builder.rs @@ -38,9 +38,12 @@ impl GenericBinaryBuilder { Self::with_capacity(1024, 1024) } - /// Creates a new [`GenericBinaryBuilder`], - /// `item_capacity` is the number of items to pre-allocate space for in this builder - /// `data_capacity` is the number of bytes to pre-allocate space for in this builder + /// Creates a new [`GenericBinaryBuilder`]. + /// + /// - `item_capacity` is the number of items to pre-allocate. + /// The size of the preallocated buffer of offsets is the number of items plus one. + /// - `data_capacity` is the total number of bytes of string data to pre-allocate + /// (for all items, not per item). 
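As a sketch only (not part of the patch), the capacity semantics clarified here might be exercised through the public `StringBuilder` alias roughly as follows; the function name and the literal capacities are illustrative:

use arrow::array::{Array, StringBuilder};

fn builder_demo() {
    // Room for 3 items (the offsets buffer holds items + 1 entries) and
    // 64 bytes of string data in total across all items.
    let mut builder = StringBuilder::with_capacity(3, 64);
    builder.append_value("hello");
    builder.append_null();
    builder.append_value("arrow");

    let array = builder.finish();
    assert_eq!(array.len(), 3);
    assert!(array.is_null(1));
    assert_eq!(array.value(2), "arrow");
}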
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { let mut offsets_builder = BufferBuilder::::new(item_capacity + 1); offsets_builder.append(OffsetSize::zero()); @@ -60,7 +63,7 @@ impl GenericBinaryBuilder { .append(OffsetSize::from_usize(self.value_builder.len()).unwrap()); } - /// Append a null value to the array. + /// Append a null value into the builder. #[inline] pub fn append_null(&mut self) { self.null_buffer_builder.append(false); diff --git a/arrow/src/array/builder/generic_string_builder.rs b/arrow/src/array/builder/generic_string_builder.rs index 0dd6440a2652..f36e499b8462 100644 --- a/arrow/src/array/builder/generic_string_builder.rs +++ b/arrow/src/array/builder/generic_string_builder.rs @@ -28,16 +28,19 @@ pub struct GenericStringBuilder { } impl GenericStringBuilder { - /// Creates a new [`GenericStringBuilder`], + /// Creates a new [`GenericStringBuilder`]. pub fn new() -> Self { Self { builder: GenericBinaryBuilder::new(), } } - /// Creates a new [`GenericStringBuilder`], - /// `data_capacity` is the number of bytes of string data to pre-allocate space for in this builder - /// `item_capacity` is the number of items to pre-allocate space for in this builder + /// Creates a new [`GenericStringBuilder`]. + /// + /// - `item_capacity` is the number of items to pre-allocate. + /// The size of the preallocated buffer of offsets is the number of items plus one. + /// - `data_capacity` is the total number of bytes of string data to pre-allocate + /// (for all items, not per item). pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { Self { builder: GenericBinaryBuilder::with_capacity(item_capacity, data_capacity), @@ -50,13 +53,13 @@ impl GenericStringBuilder { self.builder.append_value(value.as_ref().as_bytes()); } - /// Append a null value to the array. + /// Append a null value into the builder. #[inline] pub fn append_null(&mut self) { self.builder.append_null() } - /// Append an `Option` value to the array. + /// Append an `Option` value into the builder. #[inline] pub fn append_option(&mut self, value: Option>) { match value { @@ -78,12 +81,12 @@ impl GenericStringBuilder { data.into() } - /// Returns the current values buffer as a slice + /// Returns the current values buffer as a slice. pub fn values_slice(&self) -> &[u8] { self.builder.values_slice() } - /// Returns the current offsets buffer as a slice + /// Returns the current offsets buffer as a slice. pub fn offsets_slice(&self) -> &[OffsetSize] { self.builder.offsets_slice() } From e1f8ed8f894e71e58423f1fa776b03072bdb3f39 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 12 Sep 2022 00:43:35 -0700 Subject: [PATCH 0038/1411] Overflow-checking variant of arithmetic scalar kernels (#2650) * Overflow-checking variant of arithmetic scalar kernels * Remove division scalar change for now. 
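As a rough usage sketch (not part of the patch), mirroring the tests added below, the wrapping and checked scalar kernels differ as follows; the demo function name is illustrative and the kernels are assumed to remain under `arrow::compute::kernels::arithmetic`:

use arrow::array::Int32Array;
use arrow::compute::kernels::arithmetic::{add_scalar, add_scalar_checked};

fn scalar_overflow_demo() {
    let a = Int32Array::from(vec![i32::MAX, 1]);

    // The existing kernel wraps on overflow, as the updated docs spell out.
    let wrapped = add_scalar(&a, 1).unwrap();
    assert_eq!(wrapped.value(0), i32::MIN);
    assert_eq!(wrapped.value(1), 2);

    // The new `_checked` variant surfaces the overflow as an error instead.
    assert!(add_scalar_checked(&a, 1).is_err());
}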
--- arrow/src/compute/kernels/arithmetic.rs | 135 +++++++++++++++++++++--- 1 file changed, 118 insertions(+), 17 deletions(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index 17850f2a8cff..6638ae1e87df 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -32,7 +32,7 @@ use crate::buffer::Buffer; use crate::buffer::MutableBuffer; use crate::compute::kernels::arity::unary; use crate::compute::util::combine_option_bitmap; -use crate::compute::{binary, try_binary, unary_dyn}; +use crate::compute::{binary, try_binary, try_unary, unary_dyn}; use crate::datatypes::{ native_op::ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, @@ -797,15 +797,38 @@ pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// Add every value in an array by a scalar. If any value in the array is null then the /// result is also null. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `add_scalar_checked` instead. pub fn add_scalar( array: &PrimitiveArray, scalar: T::Native, ) -> Result> where T: ArrowNumericType, - T::Native: Add, + T::Native: ArrowNativeTypeOp, { - Ok(unary(array, |value| value + scalar)) + Ok(unary(array, |value| value.add_wrapping(scalar))) +} + +/// Add every value in an array by a scalar. If any value in the array is null then the +/// result is also null. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `add_scalar` instead. +pub fn add_scalar_checked( + array: &PrimitiveArray, + scalar: T::Native, +) -> Result> +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, +{ + try_unary(array, |value| { + value.add_checked(scalar).ok_or_else(|| { + ArrowError::CastError(format!("Overflow: adding {:?} to {:?}", scalar, value)) + }) + }) } /// Add every value in an array by a scalar. If any value in the array is null then the @@ -874,19 +897,41 @@ pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// Subtract every value in an array by a scalar. If any value in the array is null then the /// result is also null. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `subtract_scalar_checked` instead. pub fn subtract_scalar( array: &PrimitiveArray, scalar: T::Native, ) -> Result> where T: datatypes::ArrowNumericType, - T::Native: Add - + Sub - + Mul - + Div - + Zero, + T::Native: ArrowNativeTypeOp + Zero, { - Ok(unary(array, |value| value - scalar)) + Ok(unary(array, |value| value.sub_wrapping(scalar))) +} + +/// Subtract every value in an array by a scalar. If any value in the array is null then the +/// result is also null. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `subtract_scalar` instead. +pub fn subtract_scalar_checked( + array: &PrimitiveArray, + scalar: T::Native, +) -> Result> +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp + Zero, +{ + try_unary(array, |value| { + value.sub_checked(scalar).ok_or_else(|| { + ArrowError::CastError(format!( + "Overflow: subtracting {:?} from {:?}", + scalar, value + )) + }) + }) } /// Subtract every value in an array by a scalar. 
If any value in the array is null then the @@ -980,21 +1025,41 @@ pub fn multiply_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// Multiply every value in an array by a scalar. If any value in the array is null then the /// result is also null. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `multiply_scalar_checked` instead. pub fn multiply_scalar( array: &PrimitiveArray, scalar: T::Native, ) -> Result> where T: datatypes::ArrowNumericType, - T::Native: Add - + Sub - + Mul - + Div - + Rem - + Zero - + One, + T::Native: ArrowNativeTypeOp + Zero + One, { - Ok(unary(array, |value| value * scalar)) + Ok(unary(array, |value| value.mul_wrapping(scalar))) +} + +/// Multiply every value in an array by a scalar. If any value in the array is null then the +/// result is also null. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `multiply_scalar` instead. +pub fn multiply_scalar_checked( + array: &PrimitiveArray, + scalar: T::Native, +) -> Result> +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp + Zero + One, +{ + try_unary(array, |value| { + value.mul_checked(scalar).ok_or_else(|| { + ArrowError::CastError(format!( + "Overflow: multiplying {:?} by {:?}", + value, scalar, + )) + }) + }) } /// Multiply every value in an array by a scalar. If any value in the array is null then the @@ -2094,4 +2159,40 @@ mod tests { let overflow = divide_checked(&a, &b); overflow.expect_err("overflow should be detected"); } + + #[test] + fn test_primitive_add_scalar_wrapping_overflow() { + let a = Int32Array::from(vec![i32::MAX, i32::MIN]); + + let wrapped = add_scalar(&a, 1); + let expected = Int32Array::from(vec![-2147483648, -2147483647]); + assert_eq!(expected, wrapped.unwrap()); + + let overflow = add_scalar_checked(&a, 1); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_primitive_subtract_scalar_wrapping_overflow() { + let a = Int32Array::from(vec![-2]); + + let wrapped = subtract_scalar(&a, i32::MAX); + let expected = Int32Array::from(vec![i32::MAX]); + assert_eq!(expected, wrapped.unwrap()); + + let overflow = subtract_scalar_checked(&a, i32::MAX); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_primitive_mul_scalar_wrapping_overflow() { + let a = Int32Array::from(vec![10]); + + let wrapped = multiply_scalar(&a, i32::MAX); + let expected = Int32Array::from(vec![-10]); + assert_eq!(expected, wrapped.unwrap()); + + let overflow = multiply_scalar_checked(&a, i32::MAX); + overflow.expect_err("overflow should be detected"); + } } From be33fb3d0c6af03516e6b3a89167346192f5a6ff Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Sep 2022 16:32:46 +0100 Subject: [PATCH 0039/1411] Update criterion requirement from 0.3 to 0.4 (#2706) Updates the requirements on [criterion](https://github.com/bheisler/criterion.rs) to permit the latest version. - [Release notes](https://github.com/bheisler/criterion.rs/releases) - [Changelog](https://github.com/bheisler/criterion.rs/blob/master/CHANGELOG.md) - [Commits](https://github.com/bheisler/criterion.rs/compare/0.3.0...0.4.0) --- updated-dependencies: - dependency-name: criterion dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow/Cargo.toml | 2 +- parquet/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 1b2bb6fd775b..2de4db64276f 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -96,7 +96,7 @@ dyn_cmp_dict = [] [dev-dependencies] rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } -criterion = { version = "0.3", default-features = false } +criterion = { version = "0.4", default-features = false } flate2 = { version = "1", default-features = false, features = ["rust_backend"] } tempfile = { version = "3", default-features = false } diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 1d442b426cdf..a2d11eb5862b 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -53,7 +53,7 @@ hashbrown = { version = "0.12", default-features = false } [dev-dependencies] base64 = { version = "0.13", default-features = false, features = ["std"] } -criterion = { version = "0.3", default-features = false } +criterion = { version = "0.4", default-features = false } snap = { version = "1.0", default-features = false } tempfile = { version = "3.0", default-features = false } brotli = { version = "3.3", default-features = false, features = ["std"] } From 0ba5c5bf47941d0aa407b592ad594d902979ce50 Mon Sep 17 00:00:00 2001 From: Konstantin Fastov Date: Mon, 12 Sep 2022 22:12:03 +0300 Subject: [PATCH 0040/1411] Add DataType::is_nested() (#2707) * Add DataType::is_nested() * Fix documentation for is_nested Co-authored-by: Liang-Chi Hsieh Co-authored-by: Liang-Chi Hsieh --- arrow/src/datatypes/datatype.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index b65bfd7725ac..04d139b67272 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -1439,6 +1439,20 @@ impl DataType { ) } + /// Returns true if this type is nested (List, FixedSizeList, LargeList, Struct, Union, or Map) + pub fn is_nested(t: &DataType) -> bool { + use DataType::*; + matches!( + t, + List(_) + | FixedSizeList(_, _) + | LargeList(_) + | Struct(_) + | Union(_, _, _) + | Map(_, _) + ) + } + /// Compares the datatype with another, ignoring nested field names /// and metadata. pub fn equals_datatype(&self, other: &DataType) -> bool { From 259a3028b22f47d788b5e32b4651731206b1b4b6 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Tue, 13 Sep 2022 11:12:32 +0800 Subject: [PATCH 0041/1411] Support bitwise and operation in the kernel (#2703) * add bitwise add op * address comments: change the test --- arrow/src/compute/kernels/bitwise.rs | 109 +++++++++++++++++++++++++++ arrow/src/compute/kernels/mod.rs | 1 + 2 files changed, 110 insertions(+) create mode 100644 arrow/src/compute/kernels/bitwise.rs diff --git a/arrow/src/compute/kernels/bitwise.rs b/arrow/src/compute/kernels/bitwise.rs new file mode 100644 index 000000000000..18b9f4bb760c --- /dev/null +++ b/arrow/src/compute/kernels/bitwise.rs @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::array::PrimitiveArray; +use crate::compute::{binary, unary}; +use crate::datatypes::ArrowNumericType; +use crate::error::{ArrowError, Result}; +use std::ops::BitAnd; + +// The helper function for bitwise operation with two array +fn bitwise_op( + left: &PrimitiveArray, + right: &PrimitiveArray, + op: F, +) -> Result> +where + T: ArrowNumericType, + F: Fn(T::Native, T::Native) -> T::Native, +{ + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform bitwise operation on arrays of different length".to_string(), + )); + } + Ok(binary(left, right, op)) +} + +/// Perform `left & right` operation on two arrays. If either left or right value is null +/// then the result is also null. +pub fn bitwise_and( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> +where + T: ArrowNumericType, + T::Native: BitAnd, +{ + bitwise_op(left, right, |a, b| a & b) +} + +/// Perform bitwise and every value in an array with the scalar. If any value in the array is null then the +/// result is also null. +pub fn bitwise_and_scalar( + array: &PrimitiveArray, + scalar: T::Native, +) -> Result> +where + T: ArrowNumericType, + T::Native: BitAnd, +{ + Ok(unary(array, |value| value & scalar)) +} + +#[cfg(test)] +mod tests { + use crate::array::{Int32Array, UInt64Array}; + use crate::compute::kernels::bitwise::{bitwise_and, bitwise_and_scalar}; + use crate::error::Result; + + #[test] + fn test_bitwise_and_array() -> Result<()> { + // unsigned value + let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]); + let right = UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12)]); + let expected = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]); + let result = bitwise_and(&left, &right)?; + assert_eq!(expected, result); + + // signed value + let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); + let right = Int32Array::from(vec![Some(5), Some(10), Some(8), Some(12)]); + let expected = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); + let result = bitwise_and(&left, &right)?; + assert_eq!(expected, result); + Ok(()) + } + + #[test] + fn test_bitwise_and_array_scalar() -> Result<()> { + // unsigned value + let left = UInt64Array::from(vec![Some(15), Some(2), None, Some(4)]); + let scalar = 7; + let expected = UInt64Array::from(vec![Some(7), Some(2), None, Some(4)]); + let result = bitwise_and_scalar(&left, scalar)?; + assert_eq!(expected, result); + + // signed value + let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); + let scalar = 20; + let expected = Int32Array::from(vec![Some(0), Some(0), None, Some(4)]); + let result = bitwise_and_scalar(&left, scalar)?; + assert_eq!(expected, result); + Ok(()) + } +} diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs index c615d3a55e1a..99cdcf460ce1 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels/mod.rs @@ -20,6 +20,7 @@ pub mod aggregate; pub mod arithmetic; pub mod arity; +pub mod bitwise; pub mod boolean; pub mod cast; pub mod cast_utils; From f56b5733bc18688a938158529a0d138c7b2e1b47 
Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Tue, 13 Sep 2022 14:06:27 +0800 Subject: [PATCH 0042/1411] optimize the `numeric_cast_with_error` (#2661) * optimize the numeric_cast_with_error * fix error message and change the function name --- arrow/src/compute/kernels/cast.rs | 37 +++++++++++++------------------ 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 7d67bffdf4ea..dcd80ab11d68 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -50,7 +50,7 @@ use crate::compute::kernels::arity::unary; use crate::compute::kernels::cast_utils::string_to_timestamp_nanos; use crate::compute::kernels::temporal::extract_component_from_array; use crate::compute::kernels::temporal::return_compute_error_with; -use crate::compute::using_chrono_tz_and_utc_naive_date_time; +use crate::compute::{try_unary, using_chrono_tz_and_utc_naive_date_time}; use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::temporal_conversions::{ @@ -1514,8 +1514,8 @@ fn cast_decimal_to_decimal( v.as_ref().and_then(|v| v.to_i128()) .ok_or_else(|| { ArrowError::InvalidArgumentError( - format!("{:?} cannot be casted to 128-bit integer for Decimal128", v), - ) + format!("{:?} cannot be casted to 128-bit integer for Decimal128", v), + ) }) .map(Some) } @@ -1569,8 +1569,8 @@ fn cast_decimal_to_decimal( v.as_ref().and_then(|v| v.to_i128()) .ok_or_else(|| { ArrowError::InvalidArgumentError( - format!("{:?} cannot be casted to 128-bit integer for Decimal128", v), - ) + format!("{:?} cannot be casted to 128-bit integer for Decimal128", v), + ) }) .map(Some) } @@ -1641,7 +1641,7 @@ where ))) } else { // If the value can't be casted to the `TO::Native`, return error - Ok(Arc::new(numeric_cast_with_error::( + Ok(Arc::new(try_numeric_cast::( from.as_any() .downcast_ref::>() .unwrap(), @@ -1651,29 +1651,22 @@ where // Natural cast between numeric types // If the value of T can't be casted to R, will throw error -fn numeric_cast_with_error(from: &PrimitiveArray) -> Result> +fn try_numeric_cast(from: &PrimitiveArray) -> Result> where T: ArrowNumericType, R: ArrowNumericType, T::Native: num::NumCast, R::Native: num::NumCast, { - let iter = from - .iter() - .map(|v| match v { - None => Ok(None), - Some(value) => match num::cast::cast::(value) { - None => Err(ArrowError::CastError(format!( - "Can't cast value {:?} to type {}", - value, - R::DATA_TYPE - ))), - Some(v) => Ok(Some(v)), - }, + try_unary(from, |value| { + num::cast::cast::(value).ok_or_else(|| { + ArrowError::CastError(format!( + "Can't cast value {:?} to type {}", + value, + R::DATA_TYPE + )) }) - .collect::>>>()?; - - Ok(unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }) + }) } // Natural cast between numeric types From 7e47fa6484e0951a58956b2a748486ef140c1cc4 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Wed, 14 Sep 2022 03:08:47 +0800 Subject: [PATCH 0043/1411] support bitwise op: or,xor,not (#2716) --- arrow/src/compute/kernels/bitwise.rs | 167 ++++++++++++++++++++++++++- 1 file changed, 162 insertions(+), 5 deletions(-) diff --git a/arrow/src/compute/kernels/bitwise.rs b/arrow/src/compute/kernels/bitwise.rs index 18b9f4bb760c..2f3c9e490f4c 100644 --- a/arrow/src/compute/kernels/bitwise.rs +++ b/arrow/src/compute/kernels/bitwise.rs @@ -19,7 +19,7 @@ use crate::array::PrimitiveArray; use crate::compute::{binary, unary}; use crate::datatypes::ArrowNumericType; use crate::error::{ArrowError, Result}; -use std::ops::BitAnd; +use 
std::ops::{BitAnd, BitOr, BitXor, Not}; // The helper function for bitwise operation with two array fn bitwise_op( @@ -52,7 +52,43 @@ where bitwise_op(left, right, |a, b| a & b) } -/// Perform bitwise and every value in an array with the scalar. If any value in the array is null then the +/// Perform `left | right` operation on two arrays. If either left or right value is null +/// then the result is also null. +pub fn bitwise_or( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> +where + T: ArrowNumericType, + T::Native: BitOr, +{ + bitwise_op(left, right, |a, b| a | b) +} + +/// Perform `left ^ right` operation on two arrays. If either left or right value is null +/// then the result is also null. +pub fn bitwise_xor( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> +where + T: ArrowNumericType, + T::Native: BitXor, +{ + bitwise_op(left, right, |a, b| a ^ b) +} + +/// Perform `!array` operation on array. If array value is null +/// then the result is also null. +pub fn bitwise_not(array: &PrimitiveArray) -> Result> +where + T: ArrowNumericType, + T::Native: Not, +{ + Ok(unary(array, |value| !value)) +} + +/// Perform bitwise `and` every value in an array with the scalar. If any value in the array is null then the /// result is also null. pub fn bitwise_and_scalar( array: &PrimitiveArray, @@ -65,10 +101,39 @@ where Ok(unary(array, |value| value & scalar)) } +/// Perform bitwise `or` every value in an array with the scalar. If any value in the array is null then the +/// result is also null. +pub fn bitwise_or_scalar( + array: &PrimitiveArray, + scalar: T::Native, +) -> Result> +where + T: ArrowNumericType, + T::Native: BitOr, +{ + Ok(unary(array, |value| value | scalar)) +} + +/// Perform bitwise `xor` every value in an array with the scalar. If any value in the array is null then the +/// result is also null. 
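A brief sketch (not part of the patch) of how the bitwise kernels above might be combined, assuming the `arrow::compute::kernels::bitwise` module path added earlier in this series; the demo function name and values are arbitrary:

use arrow::array::UInt8Array;
use arrow::compute::kernels::bitwise::{bitwise_and, bitwise_not, bitwise_or_scalar};
use arrow::error::ArrowError;

fn bitwise_demo() -> Result<(), ArrowError> {
    let a = UInt8Array::from(vec![Some(0b1100), None, Some(0b1010)]);
    let b = UInt8Array::from(vec![Some(0b1010), Some(0b0001), Some(0b0110)]);

    // A null on either side of the pairwise kernels yields a null result.
    let and_ab = bitwise_and(&a, &b)?;
    assert_eq!(and_ab, UInt8Array::from(vec![Some(0b1000), None, Some(0b0010)]));

    // The scalar variants apply the operation against one fixed value.
    let or_a = bitwise_or_scalar(&a, 0b0001)?;
    assert_eq!(or_a, UInt8Array::from(vec![Some(0b1101), None, Some(0b1011)]));

    // `bitwise_not` flips every bit of the native type (here u8).
    let not_a = bitwise_not(&a)?;
    assert_eq!(not_a, UInt8Array::from(vec![Some(0b1111_0011), None, Some(0b1111_0101)]));
    Ok(())
}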
+pub fn bitwise_xor_scalar( + array: &PrimitiveArray, + scalar: T::Native, +) -> Result> +where + T: ArrowNumericType, + T::Native: BitXor, +{ + Ok(unary(array, |value| value ^ scalar)) +} + #[cfg(test)] mod tests { use crate::array::{Int32Array, UInt64Array}; - use crate::compute::kernels::bitwise::{bitwise_and, bitwise_and_scalar}; + use crate::compute::kernels::bitwise::{ + bitwise_and, bitwise_and_scalar, bitwise_not, bitwise_or, bitwise_or_scalar, + bitwise_xor, bitwise_xor_scalar, + }; use crate::error::Result; #[test] @@ -82,7 +147,7 @@ mod tests { // signed value let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); - let right = Int32Array::from(vec![Some(5), Some(10), Some(8), Some(12)]); + let right = Int32Array::from(vec![Some(5), Some(-10), Some(8), Some(12)]); let expected = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); let result = bitwise_and(&left, &right)?; assert_eq!(expected, result); @@ -100,10 +165,102 @@ mod tests { // signed value let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); - let scalar = 20; + let scalar = -20; let expected = Int32Array::from(vec![Some(0), Some(0), None, Some(4)]); let result = bitwise_and_scalar(&left, scalar)?; assert_eq!(expected, result); Ok(()) } + + #[test] + fn test_bitwise_or_array() -> Result<()> { + // unsigned value + let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]); + let right = UInt64Array::from(vec![Some(7), Some(5), Some(8), Some(13)]); + let expected = UInt64Array::from(vec![Some(7), Some(7), None, Some(13)]); + let result = bitwise_or(&left, &right)?; + assert_eq!(expected, result); + + // signed value + let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); + let right = Int32Array::from(vec![Some(-7), Some(-5), Some(8), Some(13)]); + let expected = Int32Array::from(vec![Some(-7), Some(-5), None, Some(13)]); + let result = bitwise_or(&left, &right)?; + assert_eq!(expected, result); + Ok(()) + } + + #[test] + fn test_bitwise_not_array() -> Result<()> { + // unsigned value + let array = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]); + let expected = UInt64Array::from(vec![ + Some(18446744073709551614), + Some(18446744073709551613), + None, + Some(18446744073709551611), + ]); + let result = bitwise_not(&array)?; + assert_eq!(expected, result); + // signed value + let array = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); + let expected = Int32Array::from(vec![Some(-2), Some(-3), None, Some(-5)]); + let result = bitwise_not(&array)?; + assert_eq!(expected, result); + Ok(()) + } + + #[test] + fn test_bitwise_or_array_scalar() -> Result<()> { + // unsigned value + let left = UInt64Array::from(vec![Some(15), Some(2), None, Some(4)]); + let scalar = 7; + let expected = UInt64Array::from(vec![Some(15), Some(7), None, Some(7)]); + let result = bitwise_or_scalar(&left, scalar)?; + assert_eq!(expected, result); + + // signed value + let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); + let scalar = 20; + let expected = Int32Array::from(vec![Some(21), Some(22), None, Some(20)]); + let result = bitwise_or_scalar(&left, scalar)?; + assert_eq!(expected, result); + Ok(()) + } + + #[test] + fn test_bitwise_xor_array() -> Result<()> { + // unsigned value + let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]); + let right = UInt64Array::from(vec![Some(7), Some(5), Some(8), Some(13)]); + let expected = UInt64Array::from(vec![Some(6), Some(7), None, Some(9)]); + let result = bitwise_xor(&left, &right)?; + 
assert_eq!(expected, result); + + // signed value + let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); + let right = Int32Array::from(vec![Some(-7), Some(5), Some(8), Some(-13)]); + let expected = Int32Array::from(vec![Some(-8), Some(7), None, Some(-9)]); + let result = bitwise_xor(&left, &right)?; + assert_eq!(expected, result); + Ok(()) + } + + #[test] + fn test_bitwise_xor_array_scalar() -> Result<()> { + // unsigned value + let left = UInt64Array::from(vec![Some(15), Some(2), None, Some(4)]); + let scalar = 7; + let expected = UInt64Array::from(vec![Some(8), Some(5), None, Some(3)]); + let result = bitwise_xor_scalar(&left, scalar)?; + assert_eq!(expected, result); + + // signed value + let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); + let scalar = -20; + let expected = Int32Array::from(vec![Some(-19), Some(-18), None, Some(-24)]); + let result = bitwise_xor_scalar(&left, scalar)?; + assert_eq!(expected, result); + Ok(()) + } } From 4f52a252374da49d7346aeb2e1b996133f8cf6b2 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 13 Sep 2022 14:23:52 -0700 Subject: [PATCH 0044/1411] Add divide_opt kernel which produce null values on division by zero error (#2710) * Add divide_opt kernel * Add fast-path for non-null arrays * Add doc --- arrow/src/compute/kernels/arithmetic.rs | 51 +++++++++++++++++++++-- arrow/src/compute/kernels/arity.rs | 55 ++++++++++++++++++++++++- 2 files changed, 102 insertions(+), 4 deletions(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index 6638ae1e87df..a344407e426d 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -32,7 +32,7 @@ use crate::buffer::Buffer; use crate::buffer::MutableBuffer; use crate::compute::kernels::arity::unary; use crate::compute::util::combine_option_bitmap; -use crate::compute::{binary, try_binary, try_unary, unary_dyn}; +use crate::compute::{binary, binary_opt, try_binary, try_unary, unary_dyn}; use crate::datatypes::{ native_op::ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, @@ -711,7 +711,7 @@ where } /// Perform `left + right` operation on two arrays. If either left or right value is null -/// then the result is also null. Once +/// then the result is also null. /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `add` instead. @@ -1118,6 +1118,32 @@ where return math_checked_divide_op(left, right, |a, b| a.div_checked(b)); } +/// Perform `left / right` operation on two arrays. If either left or right value is null +/// then the result is also null. +/// +/// If any right hand value is zero, the operation value will be replaced with null in the +/// result. +/// +/// Unlike `divide` or `divide_checked`, division by zero will get a null value instead +/// returning an `Err`, this also doesn't check overflowing, overflowing will just wrap +/// the result around. +pub fn divide_opt( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp + Zero + One, +{ + Ok(binary_opt(left, right, |a, b| { + if b.is_zero() { + None + } else { + Some(a.div_wrapping(b)) + } + })) +} + /// Perform `left / right` operation on two arrays. If either left or right value is null /// then the result is also null. 
If any right hand value is zero then the result of this /// operation will be `Err(ArrowError::DivideByZero)`. @@ -1152,7 +1178,7 @@ pub fn divide( right: &PrimitiveArray, ) -> Result> where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { math_op(left, right, |a, b| a.div_wrapping(b)) @@ -2195,4 +2221,23 @@ mod tests { let overflow = multiply_scalar_checked(&a, i32::MAX); overflow.expect_err("overflow should be detected"); } + + #[test] + fn test_primitive_div_opt_overflow_division_by_zero() { + let a = Int32Array::from(vec![i32::MIN]); + let b = Int32Array::from(vec![-1]); + + let wrapped = divide(&a, &b); + let expected = Int32Array::from(vec![-2147483648]); + assert_eq!(expected, wrapped.unwrap()); + + let overflow = divide_opt(&a, &b); + let expected = Int32Array::from(vec![-2147483648]); + assert_eq!(expected, overflow.unwrap()); + + let b = Int32Array::from(vec![0]); + let overflow = divide_opt(&a, &b); + let expected = Int32Array::from(vec![None]); + assert_eq!(expected, overflow.unwrap()); + } } diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index ee3ff5e23a83..fffa81af8190 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -18,7 +18,7 @@ //! Defines kernels suitable to perform operations to primitive arrays. use crate::array::{ - Array, ArrayData, ArrayRef, BufferBuilder, DictionaryArray, PrimitiveArray, + Array, ArrayData, ArrayIter, ArrayRef, BufferBuilder, DictionaryArray, PrimitiveArray, }; use crate::buffer::Buffer; use crate::compute::util::combine_option_bitmap; @@ -257,6 +257,59 @@ where Ok(unsafe { build_primitive_array(len, buffer.finish(), null_count, null_buffer) }) } +/// Applies the provided binary operation across `a` and `b`, collecting the optional results +/// into a [`PrimitiveArray`]. If any index is null in either `a` or `b`, the corresponding +/// index in the result will also be null. The binary operation could return `None` which +/// results in a new null in the collected [`PrimitiveArray`]. 
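For context, a small sketch (not part of the patch) of the behaviour `divide_opt` is documented to have above; the demo function name and values are illustrative:

use arrow::array::{Array, Int32Array};
use arrow::compute::kernels::arithmetic::divide_opt;

fn divide_opt_demo() {
    let a = Int32Array::from(vec![Some(10), Some(7), None]);
    let b = Int32Array::from(vec![Some(2), Some(0), Some(3)]);

    // Division by zero becomes a null slot rather than an error, and nulls
    // from either input are carried through to the result.
    let d = divide_opt(&a, &b).unwrap();
    assert_eq!(d.value(0), 5);
    assert!(d.is_null(1)); // 7 / 0 -> null
    assert!(d.is_null(2)); // null input -> null
}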
+/// +/// The function is only evaluated for non-null indices +/// +/// # Panic +/// +/// Panics if the arrays have different lengths +pub(crate) fn binary_opt( + a: &PrimitiveArray, + b: &PrimitiveArray, + op: F, +) -> PrimitiveArray +where + A: ArrowPrimitiveType, + B: ArrowPrimitiveType, + O: ArrowPrimitiveType, + F: Fn(A::Native, B::Native) -> Option, +{ + assert_eq!(a.len(), b.len()); + + if a.is_empty() { + return PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)); + } + + if a.null_count() == 0 && b.null_count() == 0 { + a.values() + .iter() + .zip(b.values().iter()) + .map(|(a, b)| op(*a, *b)) + .collect() + } else { + let iter_a = ArrayIter::new(a); + let iter_b = ArrayIter::new(b); + + let values = + iter_a + .into_iter() + .zip(iter_b.into_iter()) + .map(|(item_a, item_b)| { + if let (Some(a), Some(b)) = (item_a, item_b) { + op(a, b) + } else { + None + } + }); + + values.collect() + } +} + #[cfg(test)] mod tests { use super::*; From 51466634f11b7d965ca3c912835c91e0f84a6c92 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 14 Sep 2022 14:42:11 +0100 Subject: [PATCH 0045/1411] Move JSON Test Format To integration-testing (#2724) * Move JSON Test Format To integration-testing * Fix RAT --- arrow/src/datatypes/datatype.rs | 344 ------- arrow/src/datatypes/field.rs | 277 ------ arrow/src/datatypes/mod.rs | 938 +----------------- arrow/src/datatypes/schema.rs | 81 -- integration-testing/src/lib.rs | 2 +- integration-testing/src/util/datatype.rs | 383 +++++++ integration-testing/src/util/field.rs | 586 +++++++++++ .../src/{util.rs => util/mod.rs} | 14 +- integration-testing/src/util/schema.rs | 733 ++++++++++++++ 9 files changed, 1716 insertions(+), 1642 deletions(-) create mode 100644 integration-testing/src/util/datatype.rs create mode 100644 integration-testing/src/util/field.rs rename integration-testing/src/{util.rs => util/mod.rs} (99%) create mode 100644 integration-testing/src/util/schema.rs diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index 04d139b67272..2ca71ef77725 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -1052,350 +1052,6 @@ pub(crate) fn validate_decimal256_precision_with_lt_bytes( } impl DataType { - /// Parse a data type from a JSON representation. 
- #[cfg(feature = "json")] - pub(crate) fn from(json: &serde_json::Value) -> Result { - use serde_json::Value; - let default_field = Field::new("", DataType::Boolean, true); - match *json { - Value::Object(ref map) => match map.get("name") { - Some(s) if s == "null" => Ok(DataType::Null), - Some(s) if s == "bool" => Ok(DataType::Boolean), - Some(s) if s == "binary" => Ok(DataType::Binary), - Some(s) if s == "largebinary" => Ok(DataType::LargeBinary), - Some(s) if s == "utf8" => Ok(DataType::Utf8), - Some(s) if s == "largeutf8" => Ok(DataType::LargeUtf8), - Some(s) if s == "fixedsizebinary" => { - // return a list with any type as its child isn't defined in the map - if let Some(Value::Number(size)) = map.get("byteWidth") { - Ok(DataType::FixedSizeBinary(size.as_i64().unwrap() as i32)) - } else { - Err(ArrowError::ParseError( - "Expecting a byteWidth for fixedsizebinary".to_string(), - )) - } - } - Some(s) if s == "decimal" => { - // return a list with any type as its child isn't defined in the map - let precision = match map.get("precision") { - Some(p) => Ok(p.as_u64().unwrap().try_into().unwrap()), - None => Err(ArrowError::ParseError( - "Expecting a precision for decimal".to_string(), - )), - }?; - let scale = match map.get("scale") { - Some(s) => Ok(s.as_u64().unwrap().try_into().unwrap()), - _ => Err(ArrowError::ParseError( - "Expecting a scale for decimal".to_string(), - )), - }?; - let bit_width: usize = match map.get("bitWidth") { - Some(b) => b.as_u64().unwrap() as usize, - _ => 128, // Default bit width - }; - - if bit_width == 128 { - Ok(DataType::Decimal128(precision, scale)) - } else if bit_width == 256 { - Ok(DataType::Decimal256(precision, scale)) - } else { - Err(ArrowError::ParseError( - "Decimal bit_width invalid".to_string(), - )) - } - } - Some(s) if s == "floatingpoint" => match map.get("precision") { - Some(p) if p == "HALF" => Ok(DataType::Float16), - Some(p) if p == "SINGLE" => Ok(DataType::Float32), - Some(p) if p == "DOUBLE" => Ok(DataType::Float64), - _ => Err(ArrowError::ParseError( - "floatingpoint precision missing or invalid".to_string(), - )), - }, - Some(s) if s == "timestamp" => { - let unit = match map.get("unit") { - Some(p) if p == "SECOND" => Ok(TimeUnit::Second), - Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), - Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), - Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), - _ => Err(ArrowError::ParseError( - "timestamp unit missing or invalid".to_string(), - )), - }; - let tz = match map.get("timezone") { - None => Ok(None), - Some(serde_json::Value::String(tz)) => Ok(Some(tz.clone())), - _ => Err(ArrowError::ParseError( - "timezone must be a string".to_string(), - )), - }; - Ok(DataType::Timestamp(unit?, tz?)) - } - Some(s) if s == "date" => match map.get("unit") { - Some(p) if p == "DAY" => Ok(DataType::Date32), - Some(p) if p == "MILLISECOND" => Ok(DataType::Date64), - _ => Err(ArrowError::ParseError( - "date unit missing or invalid".to_string(), - )), - }, - Some(s) if s == "time" => { - let unit = match map.get("unit") { - Some(p) if p == "SECOND" => Ok(TimeUnit::Second), - Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), - Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), - Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), - _ => Err(ArrowError::ParseError( - "time unit missing or invalid".to_string(), - )), - }; - match map.get("bitWidth") { - Some(p) if p == 32 => Ok(DataType::Time32(unit?)), - Some(p) if p == 64 => 
Ok(DataType::Time64(unit?)), - _ => Err(ArrowError::ParseError( - "time bitWidth missing or invalid".to_string(), - )), - } - } - Some(s) if s == "duration" => match map.get("unit") { - Some(p) if p == "SECOND" => Ok(DataType::Duration(TimeUnit::Second)), - Some(p) if p == "MILLISECOND" => { - Ok(DataType::Duration(TimeUnit::Millisecond)) - } - Some(p) if p == "MICROSECOND" => { - Ok(DataType::Duration(TimeUnit::Microsecond)) - } - Some(p) if p == "NANOSECOND" => { - Ok(DataType::Duration(TimeUnit::Nanosecond)) - } - _ => Err(ArrowError::ParseError( - "time unit missing or invalid".to_string(), - )), - }, - Some(s) if s == "interval" => match map.get("unit") { - Some(p) if p == "DAY_TIME" => { - Ok(DataType::Interval(IntervalUnit::DayTime)) - } - Some(p) if p == "YEAR_MONTH" => { - Ok(DataType::Interval(IntervalUnit::YearMonth)) - } - Some(p) if p == "MONTH_DAY_NANO" => { - Ok(DataType::Interval(IntervalUnit::MonthDayNano)) - } - _ => Err(ArrowError::ParseError( - "interval unit missing or invalid".to_string(), - )), - }, - Some(s) if s == "int" => match map.get("isSigned") { - Some(&Value::Bool(true)) => match map.get("bitWidth") { - Some(&Value::Number(ref n)) => match n.as_u64() { - Some(8) => Ok(DataType::Int8), - Some(16) => Ok(DataType::Int16), - Some(32) => Ok(DataType::Int32), - Some(64) => Ok(DataType::Int64), - _ => Err(ArrowError::ParseError( - "int bitWidth missing or invalid".to_string(), - )), - }, - _ => Err(ArrowError::ParseError( - "int bitWidth missing or invalid".to_string(), - )), - }, - Some(&Value::Bool(false)) => match map.get("bitWidth") { - Some(&Value::Number(ref n)) => match n.as_u64() { - Some(8) => Ok(DataType::UInt8), - Some(16) => Ok(DataType::UInt16), - Some(32) => Ok(DataType::UInt32), - Some(64) => Ok(DataType::UInt64), - _ => Err(ArrowError::ParseError( - "int bitWidth missing or invalid".to_string(), - )), - }, - _ => Err(ArrowError::ParseError( - "int bitWidth missing or invalid".to_string(), - )), - }, - _ => Err(ArrowError::ParseError( - "int signed missing or invalid".to_string(), - )), - }, - Some(s) if s == "list" => { - // return a list with any type as its child isn't defined in the map - Ok(DataType::List(Box::new(default_field))) - } - Some(s) if s == "largelist" => { - // return a largelist with any type as its child isn't defined in the map - Ok(DataType::LargeList(Box::new(default_field))) - } - Some(s) if s == "fixedsizelist" => { - // return a list with any type as its child isn't defined in the map - if let Some(Value::Number(size)) = map.get("listSize") { - Ok(DataType::FixedSizeList( - Box::new(default_field), - size.as_i64().unwrap() as i32, - )) - } else { - Err(ArrowError::ParseError( - "Expecting a listSize for fixedsizelist".to_string(), - )) - } - } - Some(s) if s == "struct" => { - // return an empty `struct` type as its children aren't defined in the map - Ok(DataType::Struct(vec![])) - } - Some(s) if s == "map" => { - if let Some(Value::Bool(keys_sorted)) = map.get("keysSorted") { - // Return a map with an empty type as its children aren't defined in the map - Ok(DataType::Map(Box::new(default_field), *keys_sorted)) - } else { - Err(ArrowError::ParseError( - "Expecting a keysSorted for map".to_string(), - )) - } - } - Some(s) if s == "union" => { - if let Some(Value::String(mode)) = map.get("mode") { - let union_mode = if mode == "SPARSE" { - UnionMode::Sparse - } else if mode == "DENSE" { - UnionMode::Dense - } else { - return Err(ArrowError::ParseError(format!( - "Unknown union mode {:?} for union", - mode - ))); - }; - if 
let Some(type_ids) = map.get("typeIds") { - let type_ids = type_ids - .as_array() - .unwrap() - .iter() - .map(|t| t.as_i64().unwrap() as i8) - .collect::>(); - - let default_fields = type_ids - .iter() - .map(|_| default_field.clone()) - .collect::>(); - - Ok(DataType::Union(default_fields, type_ids, union_mode)) - } else { - Err(ArrowError::ParseError( - "Expecting a typeIds for union ".to_string(), - )) - } - } else { - Err(ArrowError::ParseError( - "Expecting a mode for union".to_string(), - )) - } - } - Some(other) => Err(ArrowError::ParseError(format!( - "invalid or unsupported type name: {} in {:?}", - other, json - ))), - None => Err(ArrowError::ParseError("type name missing".to_string())), - }, - _ => Err(ArrowError::ParseError( - "invalid json value type".to_string(), - )), - } - } - - /// Generate a JSON representation of the data type. - #[cfg(feature = "json")] - pub fn to_json(&self) -> serde_json::Value { - use serde_json::json; - match self { - DataType::Null => json!({"name": "null"}), - DataType::Boolean => json!({"name": "bool"}), - DataType::Int8 => json!({"name": "int", "bitWidth": 8, "isSigned": true}), - DataType::Int16 => json!({"name": "int", "bitWidth": 16, "isSigned": true}), - DataType::Int32 => json!({"name": "int", "bitWidth": 32, "isSigned": true}), - DataType::Int64 => json!({"name": "int", "bitWidth": 64, "isSigned": true}), - DataType::UInt8 => json!({"name": "int", "bitWidth": 8, "isSigned": false}), - DataType::UInt16 => json!({"name": "int", "bitWidth": 16, "isSigned": false}), - DataType::UInt32 => json!({"name": "int", "bitWidth": 32, "isSigned": false}), - DataType::UInt64 => json!({"name": "int", "bitWidth": 64, "isSigned": false}), - DataType::Float16 => json!({"name": "floatingpoint", "precision": "HALF"}), - DataType::Float32 => json!({"name": "floatingpoint", "precision": "SINGLE"}), - DataType::Float64 => json!({"name": "floatingpoint", "precision": "DOUBLE"}), - DataType::Utf8 => json!({"name": "utf8"}), - DataType::LargeUtf8 => json!({"name": "largeutf8"}), - DataType::Binary => json!({"name": "binary"}), - DataType::LargeBinary => json!({"name": "largebinary"}), - DataType::FixedSizeBinary(byte_width) => { - json!({"name": "fixedsizebinary", "byteWidth": byte_width}) - } - DataType::Struct(_) => json!({"name": "struct"}), - DataType::Union(_, _, _) => json!({"name": "union"}), - DataType::List(_) => json!({ "name": "list"}), - DataType::LargeList(_) => json!({ "name": "largelist"}), - DataType::FixedSizeList(_, length) => { - json!({"name":"fixedsizelist", "listSize": length}) - } - DataType::Time32(unit) => { - json!({"name": "time", "bitWidth": 32, "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}) - } - DataType::Time64(unit) => { - json!({"name": "time", "bitWidth": 64, "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}) - } - DataType::Date32 => { - json!({"name": "date", "unit": "DAY"}) - } - DataType::Date64 => { - json!({"name": "date", "unit": "MILLISECOND"}) - } - DataType::Timestamp(unit, None) => { - json!({"name": "timestamp", "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}) - } - DataType::Timestamp(unit, Some(tz)) => { - json!({"name": 
"timestamp", "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }, "timezone": tz}) - } - DataType::Interval(unit) => json!({"name": "interval", "unit": match unit { - IntervalUnit::YearMonth => "YEAR_MONTH", - IntervalUnit::DayTime => "DAY_TIME", - IntervalUnit::MonthDayNano => "MONTH_DAY_NANO", - }}), - DataType::Duration(unit) => json!({"name": "duration", "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}), - DataType::Dictionary(_, _) => json!({ "name": "dictionary"}), - DataType::Decimal128(precision, scale) => { - json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 128}) - } - DataType::Decimal256(precision, scale) => { - json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 256}) - } - DataType::Map(_, keys_sorted) => { - json!({"name": "map", "keysSorted": keys_sorted}) - } - } - } - /// Returns true if this type is numeric: (UInt*, Int*, or Float*). pub fn is_numeric(t: &DataType) -> bool { use DataType::*; diff --git a/arrow/src/datatypes/field.rs b/arrow/src/datatypes/field.rs index ac966cafe34f..03d07807743d 100644 --- a/arrow/src/datatypes/field.rs +++ b/arrow/src/datatypes/field.rs @@ -250,283 +250,6 @@ impl Field { } } - /// Parse a `Field` definition from a JSON representation. - #[cfg(feature = "json")] - pub fn from(json: &serde_json::Value) -> Result { - use serde_json::Value; - match *json { - Value::Object(ref map) => { - let name = match map.get("name") { - Some(&Value::String(ref name)) => name.to_string(), - _ => { - return Err(ArrowError::ParseError( - "Field missing 'name' attribute".to_string(), - )); - } - }; - let nullable = match map.get("nullable") { - Some(&Value::Bool(b)) => b, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'nullable' attribute".to_string(), - )); - } - }; - let data_type = match map.get("type") { - Some(t) => DataType::from(t)?, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'type' attribute".to_string(), - )); - } - }; - - // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz - let metadata = match map.get("metadata") { - Some(&Value::Array(ref values)) => { - let mut res: BTreeMap = BTreeMap::new(); - for value in values { - match value.as_object() { - Some(map) => { - if map.len() != 2 { - return Err(ArrowError::ParseError( - "Field 'metadata' must have exact two entries for each key-value map".to_string(), - )); - } - if let (Some(k), Some(v)) = - (map.get("key"), map.get("value")) - { - if let (Some(k_str), Some(v_str)) = - (k.as_str(), v.as_str()) - { - res.insert( - k_str.to_string().clone(), - v_str.to_string().clone(), - ); - } else { - return Err(ArrowError::ParseError("Field 'metadata' must have map value of string type".to_string())); - } - } else { - return Err(ArrowError::ParseError("Field 'metadata' lacks map keys named \"key\" or \"value\"".to_string())); - } - } - _ => { - return Err(ArrowError::ParseError( - "Field 'metadata' contains non-object key-value pair".to_string(), - )); - } - } - } - Some(res) - } - // We also support map format, because Schema's metadata supports this. 
- // See https://github.com/apache/arrow/pull/5907 - Some(&Value::Object(ref values)) => { - let mut res: BTreeMap = BTreeMap::new(); - for (k, v) in values { - if let Some(str_value) = v.as_str() { - res.insert(k.clone(), str_value.to_string().clone()); - } else { - return Err(ArrowError::ParseError( - format!("Field 'metadata' contains non-string value for key {}", k), - )); - } - } - Some(res) - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field `metadata` is not json array".to_string(), - )); - } - _ => None, - }; - - // if data_type is a struct or list, get its children - let data_type = match data_type { - DataType::List(_) - | DataType::LargeList(_) - | DataType::FixedSizeList(_, _) => match map.get("children") { - Some(Value::Array(values)) => { - if values.len() != 1 { - return Err(ArrowError::ParseError( - "Field 'children' must have one element for a list data type".to_string(), - )); - } - match data_type { - DataType::List(_) => { - DataType::List(Box::new(Self::from(&values[0])?)) - } - DataType::LargeList(_) => { - DataType::LargeList(Box::new(Self::from(&values[0])?)) - } - DataType::FixedSizeList(_, int) => DataType::FixedSizeList( - Box::new(Self::from(&values[0])?), - int, - ), - _ => unreachable!( - "Data type should be a list, largelist or fixedsizelist" - ), - } - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - }, - DataType::Struct(mut fields) => match map.get("children") { - Some(Value::Array(values)) => { - let struct_fields: Result> = - values.iter().map(Field::from).collect(); - fields.append(&mut struct_fields?); - DataType::Struct(fields) - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - }, - DataType::Map(_, keys_sorted) => { - match map.get("children") { - Some(Value::Array(values)) if values.len() == 1 => { - let child = Self::from(&values[0])?; - // child must be a struct - match child.data_type() { - DataType::Struct(map_fields) if map_fields.len() == 2 => { - DataType::Map(Box::new(child), keys_sorted) - } - t => { - return Err(ArrowError::ParseError( - format!("Map children should be a struct with 2 fields, found {:?}", t) - )) - } - } - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array with 1 element" - .to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - } - } - DataType::Union(_, type_ids, mode) => match map.get("children") { - Some(Value::Array(values)) => { - let union_fields: Vec = - values.iter().map(Field::from).collect::>()?; - DataType::Union(union_fields, type_ids, mode) - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - }, - _ => data_type, - }; - - let mut dict_id = 0; - let mut dict_is_ordered = false; - - let data_type = match map.get("dictionary") { - Some(dictionary) => { - let index_type = match dictionary.get("indexType") { - Some(t) => DataType::from(t)?, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'indexType' attribute".to_string(), - )); - } - }; - 
dict_id = match dictionary.get("id") { - Some(Value::Number(n)) => n.as_i64().unwrap(), - _ => { - return Err(ArrowError::ParseError( - "Field missing 'id' attribute".to_string(), - )); - } - }; - dict_is_ordered = match dictionary.get("isOrdered") { - Some(&Value::Bool(n)) => n, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'isOrdered' attribute".to_string(), - )); - } - }; - DataType::Dictionary(Box::new(index_type), Box::new(data_type)) - } - _ => data_type, - }; - Ok(Field { - name, - data_type, - nullable, - dict_id, - dict_is_ordered, - metadata, - }) - } - _ => Err(ArrowError::ParseError( - "Invalid json value type for field".to_string(), - )), - } - } - - /// Generate a JSON representation of the `Field`. - #[cfg(feature = "json")] - pub fn to_json(&self) -> serde_json::Value { - let children: Vec = match self.data_type() { - DataType::Struct(fields) => fields.iter().map(|f| f.to_json()).collect(), - DataType::List(field) - | DataType::LargeList(field) - | DataType::FixedSizeList(field, _) - | DataType::Map(field, _) => vec![field.to_json()], - _ => vec![], - }; - match self.data_type() { - DataType::Dictionary(ref index_type, ref value_type) => serde_json::json!({ - "name": self.name, - "nullable": self.nullable, - "type": value_type.to_json(), - "children": children, - "dictionary": { - "id": self.dict_id, - "indexType": index_type.to_json(), - "isOrdered": self.dict_is_ordered - } - }), - _ => serde_json::json!({ - "name": self.name, - "nullable": self.nullable, - "type": self.data_type.to_json(), - "children": children - }), - } - } - /// Merge this field into self if it is compatible. /// /// Struct fields are merged recursively. diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 38b6c7bf9744..1586d563cd3f 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -57,7 +57,7 @@ mod tests { #[cfg(feature = "json")] use serde_json::{ - Number, Value, + Number, Value::{Bool, Number as VNumber, String as VString}, }; @@ -174,942 +174,6 @@ mod tests { assert_eq!(person, deserialized); } - #[test] - #[cfg(feature = "json")] - fn struct_field_to_json() { - let f = Field::new( - "address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ); - let value: Value = serde_json::from_str( - r#"{ - "name": "address", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "street", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "zip", - "nullable": false, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - }"#, - ) - .unwrap(); - assert_eq!(value, f.to_json()); - } - - #[test] - #[cfg(feature = "json")] - fn map_field_to_json() { - let f = Field::new( - "my_map", - DataType::Map( - Box::new(Field::new( - "my_entries", - DataType::Struct(vec![ - Field::new("my_keys", DataType::Utf8, false), - Field::new("my_values", DataType::UInt16, true), - ]), - false, - )), - true, - ), - false, - ); - let value: Value = serde_json::from_str( - r#"{ - "name": "my_map", - "nullable": false, - "type": { - "name": "map", - "keysSorted": true - }, - "children": [ - { - "name": "my_entries", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "my_keys", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "my_values", - "nullable": true, - "type": { - "name": "int", - 
"bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - } - ] - }"#, - ) - .unwrap(); - assert_eq!(value, f.to_json()); - } - - #[test] - #[cfg(feature = "json")] - fn primitive_field_to_json() { - let f = Field::new("first_name", DataType::Utf8, false); - let value: Value = serde_json::from_str( - r#"{ - "name": "first_name", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }"#, - ) - .unwrap(); - assert_eq!(value, f.to_json()); - } - #[test] - #[cfg(feature = "json")] - fn parse_struct_from_json() { - let json = r#" - { - "name": "address", - "type": { - "name": "struct" - }, - "nullable": false, - "children": [ - { - "name": "street", - "type": { - "name": "utf8" - }, - "nullable": false, - "children": [] - }, - { - "name": "zip", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 16 - }, - "nullable": false, - "children": [] - } - ] - } - "#; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = Field::from(&value).unwrap(); - - let expected = Field::new( - "address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ); - - assert_eq!(expected, dt); - } - - #[test] - #[cfg(feature = "json")] - fn parse_map_from_json() { - let json = r#" - { - "name": "my_map", - "nullable": false, - "type": { - "name": "map", - "keysSorted": true - }, - "children": [ - { - "name": "my_entries", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "my_keys", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "my_values", - "nullable": true, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - } - ] - } - "#; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = Field::from(&value).unwrap(); - - let expected = Field::new( - "my_map", - DataType::Map( - Box::new(Field::new( - "my_entries", - DataType::Struct(vec![ - Field::new("my_keys", DataType::Utf8, false), - Field::new("my_values", DataType::UInt16, true), - ]), - false, - )), - true, - ), - false, - ); - - assert_eq!(expected, dt); - } - - #[test] - #[cfg(feature = "json")] - fn parse_union_from_json() { - let json = r#" - { - "name": "my_union", - "nullable": false, - "type": { - "name": "union", - "mode": "SPARSE", - "typeIds": [ - 5, - 7 - ] - }, - "children": [ - { - "name": "f1", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "f2", - "type": { - "name": "utf8" - }, - "nullable": true, - "children": [] - } - ] - } - "#; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = Field::from(&value).unwrap(); - - let expected = Field::new( - "my_union", - DataType::Union( - vec![ - Field::new("f1", DataType::Int32, true), - Field::new("f2", DataType::Utf8, true), - ], - vec![5, 7], - UnionMode::Sparse, - ), - false, - ); - - assert_eq!(expected, dt); - } - - #[test] - #[cfg(feature = "json")] - fn parse_utf8_from_json() { - let json = "{\"name\":\"utf8\"}"; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = DataType::from(&value).unwrap(); - assert_eq!(DataType::Utf8, dt); - } - - #[test] - #[cfg(feature = "json")] - fn parse_int32_from_json() { - let json = "{\"name\": \"int\", \"isSigned\": true, \"bitWidth\": 32}"; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = DataType::from(&value).unwrap(); - assert_eq!(DataType::Int32, dt); - } - - 
#[test] - #[cfg(feature = "json")] - fn schema_json() { - // Add some custom metadata - let metadata: HashMap = - [("Key".to_string(), "Value".to_string())] - .iter() - .cloned() - .collect(); - - let schema = Schema::new_with_metadata( - vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Binary, false), - Field::new("c3", DataType::FixedSizeBinary(3), false), - Field::new("c4", DataType::Boolean, false), - Field::new("c5", DataType::Date32, false), - Field::new("c6", DataType::Date64, false), - Field::new("c7", DataType::Time32(TimeUnit::Second), false), - Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false), - Field::new("c9", DataType::Time32(TimeUnit::Microsecond), false), - Field::new("c10", DataType::Time32(TimeUnit::Nanosecond), false), - Field::new("c11", DataType::Time64(TimeUnit::Second), false), - Field::new("c12", DataType::Time64(TimeUnit::Millisecond), false), - Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false), - Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false), - Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false), - Field::new( - "c16", - DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".to_string())), - false, - ), - Field::new( - "c17", - DataType::Timestamp( - TimeUnit::Microsecond, - Some("Africa/Johannesburg".to_string()), - ), - false, - ), - Field::new( - "c18", - DataType::Timestamp(TimeUnit::Nanosecond, None), - false, - ), - Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), - Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), - Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false), - Field::new( - "c22", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), - false, - ), - Field::new( - "c23", - DataType::FixedSizeList( - Box::new(Field::new("bools", DataType::Boolean, false)), - 5, - ), - false, - ), - Field::new( - "c24", - DataType::List(Box::new(Field::new( - "inner_list", - DataType::List(Box::new(Field::new( - "struct", - DataType::Struct(vec![]), - true, - ))), - false, - ))), - true, - ), - Field::new( - "c25", - DataType::Struct(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::UInt16, false), - ]), - false, - ), - Field::new("c26", DataType::Interval(IntervalUnit::YearMonth), true), - Field::new("c27", DataType::Interval(IntervalUnit::DayTime), true), - Field::new("c28", DataType::Interval(IntervalUnit::MonthDayNano), true), - Field::new("c29", DataType::Duration(TimeUnit::Second), false), - Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false), - Field::new("c31", DataType::Duration(TimeUnit::Microsecond), false), - Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), - Field::new_dict( - "c33", - DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Utf8), - ), - true, - 123, - true, - ), - Field::new("c34", DataType::LargeBinary, true), - Field::new("c35", DataType::LargeUtf8, true), - Field::new( - "c36", - DataType::LargeList(Box::new(Field::new( - "inner_large_list", - DataType::LargeList(Box::new(Field::new( - "struct", - DataType::Struct(vec![]), - false, - ))), - true, - ))), - true, - ), - Field::new( - "c37", - DataType::Map( - Box::new(Field::new( - "my_entries", - DataType::Struct(vec![ - Field::new("my_keys", DataType::Utf8, false), - Field::new("my_values", DataType::UInt16, true), - ]), - false, - )), - true, - ), - false, - ), - ], - metadata, - ); - - let expected = schema.to_json(); - let json = 
r#"{ - "fields": [ - { - "name": "c1", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "c2", - "nullable": false, - "type": { - "name": "binary" - }, - "children": [] - }, - { - "name": "c3", - "nullable": false, - "type": { - "name": "fixedsizebinary", - "byteWidth": 3 - }, - "children": [] - }, - { - "name": "c4", - "nullable": false, - "type": { - "name": "bool" - }, - "children": [] - }, - { - "name": "c5", - "nullable": false, - "type": { - "name": "date", - "unit": "DAY" - }, - "children": [] - }, - { - "name": "c6", - "nullable": false, - "type": { - "name": "date", - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c7", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c8", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c9", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "MICROSECOND" - }, - "children": [] - }, - { - "name": "c10", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c11", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c12", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c13", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "MICROSECOND" - }, - "children": [] - }, - { - "name": "c14", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c15", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c16", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "MILLISECOND", - "timezone": "UTC" - }, - "children": [] - }, - { - "name": "c17", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "MICROSECOND", - "timezone": "Africa/Johannesburg" - }, - "children": [] - }, - { - "name": "c18", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c19", - "nullable": false, - "type": { - "name": "interval", - "unit": "DAY_TIME" - }, - "children": [] - }, - { - "name": "c20", - "nullable": false, - "type": { - "name": "interval", - "unit": "YEAR_MONTH" - }, - "children": [] - }, - { - "name": "c21", - "nullable": false, - "type": { - "name": "interval", - "unit": "MONTH_DAY_NANO" - }, - "children": [] - }, - { - "name": "c22", - "nullable": false, - "type": { - "name": "list" - }, - "children": [ - { - "name": "item", - "nullable": true, - "type": { - "name": "bool" - }, - "children": [] - } - ] - }, - { - "name": "c23", - "nullable": false, - "type": { - "name": "fixedsizelist", - "listSize": 5 - }, - "children": [ - { - "name": "bools", - "nullable": false, - "type": { - "name": "bool" - }, - "children": [] - } - ] - }, - { - "name": "c24", - "nullable": true, - "type": { - "name": "list" - }, - "children": [ - { - "name": "inner_list", - "nullable": false, - "type": { - "name": "list" - }, - "children": [ - { - "name": "struct", - "nullable": true, - "type": { - "name": "struct" - }, - "children": [] - } - ] - } - ] - }, - { - "name": "c25", - "nullable": false, - "type": { - "name": "struct" - }, - "children": 
[ - { - "name": "a", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "b", - "nullable": false, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - }, - { - "name": "c26", - "nullable": true, - "type": { - "name": "interval", - "unit": "YEAR_MONTH" - }, - "children": [] - }, - { - "name": "c27", - "nullable": true, - "type": { - "name": "interval", - "unit": "DAY_TIME" - }, - "children": [] - }, - { - "name": "c28", - "nullable": true, - "type": { - "name": "interval", - "unit": "MONTH_DAY_NANO" - }, - "children": [] - }, - { - "name": "c29", - "nullable": false, - "type": { - "name": "duration", - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c30", - "nullable": false, - "type": { - "name": "duration", - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c31", - "nullable": false, - "type": { - "name": "duration", - "unit": "MICROSECOND" - }, - "children": [] - }, - { - "name": "c32", - "nullable": false, - "type": { - "name": "duration", - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c33", - "nullable": true, - "children": [], - "type": { - "name": "utf8" - }, - "dictionary": { - "id": 123, - "indexType": { - "name": "int", - "bitWidth": 32, - "isSigned": true - }, - "isOrdered": true - } - }, - { - "name": "c34", - "nullable": true, - "type": { - "name": "largebinary" - }, - "children": [] - }, - { - "name": "c35", - "nullable": true, - "type": { - "name": "largeutf8" - }, - "children": [] - }, - { - "name": "c36", - "nullable": true, - "type": { - "name": "largelist" - }, - "children": [ - { - "name": "inner_large_list", - "nullable": true, - "type": { - "name": "largelist" - }, - "children": [ - { - "name": "struct", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [] - } - ] - } - ] - }, - { - "name": "c37", - "nullable": false, - "type": { - "name": "map", - "keysSorted": true - }, - "children": [ - { - "name": "my_entries", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "my_keys", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "my_values", - "nullable": true, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - } - ] - } - ], - "metadata" : { - "Key": "Value" - } - }"#; - let value: Value = serde_json::from_str(json).unwrap(); - assert_eq!(expected, value); - - // convert back to a schema - let value: Value = serde_json::from_str(json).unwrap(); - let schema2 = Schema::from(&value).unwrap(); - - assert_eq!(schema, schema2); - - // Check that empty metadata produces empty value in JSON and can be parsed - let json = r#"{ - "fields": [ - { - "name": "c1", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - } - ], - "metadata": {} - }"#; - let value: Value = serde_json::from_str(json).unwrap(); - let schema = Schema::from(&value).unwrap(); - assert!(schema.metadata.is_empty()); - - // Check that metadata field is not required in the JSON. 
- let json = r#"{ - "fields": [ - { - "name": "c1", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - } - ] - }"#; - let value: Value = serde_json::from_str(json).unwrap(); - let schema = Schema::from(&value).unwrap(); - assert!(schema.metadata.is_empty()); - } - #[test] fn create_schema_string() { let schema = person_schema(); diff --git a/arrow/src/datatypes/schema.rs b/arrow/src/datatypes/schema.rs index efde4edefa66..b0eca6114742 100644 --- a/arrow/src/datatypes/schema.rs +++ b/arrow/src/datatypes/schema.rs @@ -233,80 +233,6 @@ impl Schema { .find(|&(_, c)| c.name() == name) } - /// Generate a JSON representation of the `Schema`. - #[cfg(feature = "json")] - pub fn to_json(&self) -> serde_json::Value { - serde_json::json!({ - "fields": self.fields.iter().map(|field| field.to_json()).collect::>(), - "metadata": serde_json::to_value(&self.metadata).unwrap() - }) - } - - /// Parse a `Schema` definition from a JSON representation. - #[cfg(feature = "json")] - pub fn from(json: &serde_json::Value) -> Result { - use serde_json::Value; - match *json { - Value::Object(ref schema) => { - let fields = if let Some(Value::Array(fields)) = schema.get("fields") { - fields.iter().map(Field::from).collect::>()? - } else { - return Err(ArrowError::ParseError( - "Schema fields should be an array".to_string(), - )); - }; - - let metadata = if let Some(value) = schema.get("metadata") { - Self::from_metadata(value)? - } else { - HashMap::default() - }; - - Ok(Self { fields, metadata }) - } - _ => Err(ArrowError::ParseError( - "Invalid json value type for schema".to_string(), - )), - } - } - - /// Parse a `metadata` definition from a JSON representation. - /// The JSON can either be an Object or an Array of Objects. - #[cfg(feature = "json")] - fn from_metadata(json: &serde_json::Value) -> Result> { - use serde_json::Value; - match json { - Value::Array(_) => { - let mut hashmap = HashMap::new(); - let values: Vec = serde_json::from_value(json.clone()) - .map_err(|_| { - ArrowError::JsonError( - "Unable to parse object into key-value pair".to_string(), - ) - })?; - for meta in values { - hashmap.insert(meta.key.clone(), meta.value); - } - Ok(hashmap) - } - Value::Object(md) => md - .iter() - .map(|(k, v)| { - if let Value::String(v) = v { - Ok((k.to_string(), v.to_string())) - } else { - Err(ArrowError::ParseError( - "metadata `value` field must be a string".to_string(), - )) - } - }) - .collect::>(), - _ => Err(ArrowError::ParseError( - "`metadata` field must be an object".to_string(), - )), - } - } - /// Check to see if `self` is a superset of `other` schema. 
Here are the comparison rules: /// /// * `self` and `other` should contain the same number of fields @@ -355,13 +281,6 @@ impl Hash for Schema { } } -#[cfg(feature = "json")] -#[derive(serde::Deserialize)] -struct MetadataKeyValue { - key: String, - value: String, -} - #[cfg(test)] mod tests { use crate::datatypes::DataType; diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index ffe112af72cd..2345f1967f24 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -50,7 +50,7 @@ pub fn read_json_file(json_name: &str) -> Result { let json_file = File::open(json_name)?; let reader = BufReader::new(json_file); let arrow_json: Value = serde_json::from_reader(reader).unwrap(); - let schema = Schema::from(&arrow_json["schema"])?; + let schema = schema_from_json(&arrow_json["schema"])?; // read dictionaries let mut dictionaries = HashMap::new(); if let Some(dicts) = arrow_json.get("dictionaries") { diff --git a/integration-testing/src/util/datatype.rs b/integration-testing/src/util/datatype.rs new file mode 100644 index 000000000000..dd0b95b0a836 --- /dev/null +++ b/integration-testing/src/util/datatype.rs @@ -0,0 +1,383 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit, UnionMode}; +use arrow::error::{ArrowError, Result}; + +/// Parse a data type from a JSON representation. 
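+///
+/// A minimal, illustrative sketch of the expected input and output (the same
+/// shape is exercised by the `parse_int32_from_json` test further below):
+///
+/// ```ignore
+/// let value: serde_json::Value =
+///     serde_json::from_str(r#"{"name": "int", "isSigned": true, "bitWidth": 32}"#).unwrap();
+/// assert_eq!(data_type_from_json(&value).unwrap(), DataType::Int32);
+/// ```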
+pub fn data_type_from_json(json: &serde_json::Value) -> Result<DataType> {
+    use serde_json::Value;
+    let default_field = Field::new("", DataType::Boolean, true);
+    match *json {
+        Value::Object(ref map) => match map.get("name") {
+            Some(s) if s == "null" => Ok(DataType::Null),
+            Some(s) if s == "bool" => Ok(DataType::Boolean),
+            Some(s) if s == "binary" => Ok(DataType::Binary),
+            Some(s) if s == "largebinary" => Ok(DataType::LargeBinary),
+            Some(s) if s == "utf8" => Ok(DataType::Utf8),
+            Some(s) if s == "largeutf8" => Ok(DataType::LargeUtf8),
+            Some(s) if s == "fixedsizebinary" => {
+                // the byte width is carried in the `byteWidth` attribute
+                if let Some(Value::Number(size)) = map.get("byteWidth") {
+                    Ok(DataType::FixedSizeBinary(size.as_i64().unwrap() as i32))
+                } else {
+                    Err(ArrowError::ParseError(
+                        "Expecting a byteWidth for fixedsizebinary".to_string(),
+                    ))
+                }
+            }
+            Some(s) if s == "decimal" => {
+                // decimal types carry a precision, a scale and an optional bit width
+                let precision = match map.get("precision") {
+                    Some(p) => Ok(p.as_u64().unwrap().try_into().unwrap()),
+                    None => Err(ArrowError::ParseError(
+                        "Expecting a precision for decimal".to_string(),
+                    )),
+                }?;
+                let scale = match map.get("scale") {
+                    Some(s) => Ok(s.as_u64().unwrap().try_into().unwrap()),
+                    _ => Err(ArrowError::ParseError(
+                        "Expecting a scale for decimal".to_string(),
+                    )),
+                }?;
+                let bit_width: usize = match map.get("bitWidth") {
+                    Some(b) => b.as_u64().unwrap() as usize,
+                    _ => 128, // Default bit width
+                };
+
+                if bit_width == 128 {
+                    Ok(DataType::Decimal128(precision, scale))
+                } else if bit_width == 256 {
+                    Ok(DataType::Decimal256(precision, scale))
+                } else {
+                    Err(ArrowError::ParseError(
+                        "Decimal bit_width invalid".to_string(),
+                    ))
+                }
+            }
+            Some(s) if s == "floatingpoint" => match map.get("precision") {
+                Some(p) if p == "HALF" => Ok(DataType::Float16),
+                Some(p) if p == "SINGLE" => Ok(DataType::Float32),
+                Some(p) if p == "DOUBLE" => Ok(DataType::Float64),
+                _ => Err(ArrowError::ParseError(
+                    "floatingpoint precision missing or invalid".to_string(),
+                )),
+            },
+            Some(s) if s == "timestamp" => {
+                let unit = match map.get("unit") {
+                    Some(p) if p == "SECOND" => Ok(TimeUnit::Second),
+                    Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond),
+                    Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond),
+                    Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond),
+                    _ => Err(ArrowError::ParseError(
+                        "timestamp unit missing or invalid".to_string(),
+                    )),
+                };
+                let tz = match map.get("timezone") {
+                    None => Ok(None),
+                    Some(serde_json::Value::String(tz)) => Ok(Some(tz.clone())),
+                    _ => Err(ArrowError::ParseError(
+                        "timezone must be a string".to_string(),
+                    )),
+                };
+                Ok(DataType::Timestamp(unit?, tz?))
+            }
+            Some(s) if s == "date" => match map.get("unit") {
+                Some(p) if p == "DAY" => Ok(DataType::Date32),
+                Some(p) if p == "MILLISECOND" => Ok(DataType::Date64),
+                _ => Err(ArrowError::ParseError(
+                    "date unit missing or invalid".to_string(),
+                )),
+            },
+            Some(s) if s == "time" => {
+                let unit = match map.get("unit") {
+                    Some(p) if p == "SECOND" => Ok(TimeUnit::Second),
+                    Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond),
+                    Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond),
+                    Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond),
+                    _ => Err(ArrowError::ParseError(
+                        "time unit missing or invalid".to_string(),
+                    )),
+                };
+                match map.get("bitWidth") {
+                    Some(p) if p == 32 => Ok(DataType::Time32(unit?)),
+                    Some(p) if p == 64 => Ok(DataType::Time64(unit?)),
+                    _ =>
Err(ArrowError::ParseError( + "time bitWidth missing or invalid".to_string(), + )), + } + } + Some(s) if s == "duration" => match map.get("unit") { + Some(p) if p == "SECOND" => Ok(DataType::Duration(TimeUnit::Second)), + Some(p) if p == "MILLISECOND" => { + Ok(DataType::Duration(TimeUnit::Millisecond)) + } + Some(p) if p == "MICROSECOND" => { + Ok(DataType::Duration(TimeUnit::Microsecond)) + } + Some(p) if p == "NANOSECOND" => { + Ok(DataType::Duration(TimeUnit::Nanosecond)) + } + _ => Err(ArrowError::ParseError( + "time unit missing or invalid".to_string(), + )), + }, + Some(s) if s == "interval" => match map.get("unit") { + Some(p) if p == "DAY_TIME" => { + Ok(DataType::Interval(IntervalUnit::DayTime)) + } + Some(p) if p == "YEAR_MONTH" => { + Ok(DataType::Interval(IntervalUnit::YearMonth)) + } + Some(p) if p == "MONTH_DAY_NANO" => { + Ok(DataType::Interval(IntervalUnit::MonthDayNano)) + } + _ => Err(ArrowError::ParseError( + "interval unit missing or invalid".to_string(), + )), + }, + Some(s) if s == "int" => match map.get("isSigned") { + Some(&Value::Bool(true)) => match map.get("bitWidth") { + Some(&Value::Number(ref n)) => match n.as_u64() { + Some(8) => Ok(DataType::Int8), + Some(16) => Ok(DataType::Int16), + Some(32) => Ok(DataType::Int32), + Some(64) => Ok(DataType::Int64), + _ => Err(ArrowError::ParseError( + "int bitWidth missing or invalid".to_string(), + )), + }, + _ => Err(ArrowError::ParseError( + "int bitWidth missing or invalid".to_string(), + )), + }, + Some(&Value::Bool(false)) => match map.get("bitWidth") { + Some(&Value::Number(ref n)) => match n.as_u64() { + Some(8) => Ok(DataType::UInt8), + Some(16) => Ok(DataType::UInt16), + Some(32) => Ok(DataType::UInt32), + Some(64) => Ok(DataType::UInt64), + _ => Err(ArrowError::ParseError( + "int bitWidth missing or invalid".to_string(), + )), + }, + _ => Err(ArrowError::ParseError( + "int bitWidth missing or invalid".to_string(), + )), + }, + _ => Err(ArrowError::ParseError( + "int signed missing or invalid".to_string(), + )), + }, + Some(s) if s == "list" => { + // return a list with any type as its child isn't defined in the map + Ok(DataType::List(Box::new(default_field))) + } + Some(s) if s == "largelist" => { + // return a largelist with any type as its child isn't defined in the map + Ok(DataType::LargeList(Box::new(default_field))) + } + Some(s) if s == "fixedsizelist" => { + // return a list with any type as its child isn't defined in the map + if let Some(Value::Number(size)) = map.get("listSize") { + Ok(DataType::FixedSizeList( + Box::new(default_field), + size.as_i64().unwrap() as i32, + )) + } else { + Err(ArrowError::ParseError( + "Expecting a listSize for fixedsizelist".to_string(), + )) + } + } + Some(s) if s == "struct" => { + // return an empty `struct` type as its children aren't defined in the map + Ok(DataType::Struct(vec![])) + } + Some(s) if s == "map" => { + if let Some(Value::Bool(keys_sorted)) = map.get("keysSorted") { + // Return a map with an empty type as its children aren't defined in the map + Ok(DataType::Map(Box::new(default_field), *keys_sorted)) + } else { + Err(ArrowError::ParseError( + "Expecting a keysSorted for map".to_string(), + )) + } + } + Some(s) if s == "union" => { + if let Some(Value::String(mode)) = map.get("mode") { + let union_mode = if mode == "SPARSE" { + UnionMode::Sparse + } else if mode == "DENSE" { + UnionMode::Dense + } else { + return Err(ArrowError::ParseError(format!( + "Unknown union mode {:?} for union", + mode + ))); + }; + if let Some(type_ids) = 
map.get("typeIds") { + let type_ids = type_ids + .as_array() + .unwrap() + .iter() + .map(|t| t.as_i64().unwrap() as i8) + .collect::>(); + + let default_fields = type_ids + .iter() + .map(|_| default_field.clone()) + .collect::>(); + + Ok(DataType::Union(default_fields, type_ids, union_mode)) + } else { + Err(ArrowError::ParseError( + "Expecting a typeIds for union ".to_string(), + )) + } + } else { + Err(ArrowError::ParseError( + "Expecting a mode for union".to_string(), + )) + } + } + Some(other) => Err(ArrowError::ParseError(format!( + "invalid or unsupported type name: {} in {:?}", + other, json + ))), + None => Err(ArrowError::ParseError("type name missing".to_string())), + }, + _ => Err(ArrowError::ParseError( + "invalid json value type".to_string(), + )), + } +} + +/// Generate a JSON representation of the data type. +pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value { + use serde_json::json; + match data_type { + DataType::Null => json!({"name": "null"}), + DataType::Boolean => json!({"name": "bool"}), + DataType::Int8 => json!({"name": "int", "bitWidth": 8, "isSigned": true}), + DataType::Int16 => json!({"name": "int", "bitWidth": 16, "isSigned": true}), + DataType::Int32 => json!({"name": "int", "bitWidth": 32, "isSigned": true}), + DataType::Int64 => json!({"name": "int", "bitWidth": 64, "isSigned": true}), + DataType::UInt8 => json!({"name": "int", "bitWidth": 8, "isSigned": false}), + DataType::UInt16 => json!({"name": "int", "bitWidth": 16, "isSigned": false}), + DataType::UInt32 => json!({"name": "int", "bitWidth": 32, "isSigned": false}), + DataType::UInt64 => json!({"name": "int", "bitWidth": 64, "isSigned": false}), + DataType::Float16 => json!({"name": "floatingpoint", "precision": "HALF"}), + DataType::Float32 => json!({"name": "floatingpoint", "precision": "SINGLE"}), + DataType::Float64 => json!({"name": "floatingpoint", "precision": "DOUBLE"}), + DataType::Utf8 => json!({"name": "utf8"}), + DataType::LargeUtf8 => json!({"name": "largeutf8"}), + DataType::Binary => json!({"name": "binary"}), + DataType::LargeBinary => json!({"name": "largebinary"}), + DataType::FixedSizeBinary(byte_width) => { + json!({"name": "fixedsizebinary", "byteWidth": byte_width}) + } + DataType::Struct(_) => json!({"name": "struct"}), + DataType::Union(_, _, _) => json!({"name": "union"}), + DataType::List(_) => json!({ "name": "list"}), + DataType::LargeList(_) => json!({ "name": "largelist"}), + DataType::FixedSizeList(_, length) => { + json!({"name":"fixedsizelist", "listSize": length}) + } + DataType::Time32(unit) => { + json!({"name": "time", "bitWidth": 32, "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}) + } + DataType::Time64(unit) => { + json!({"name": "time", "bitWidth": 64, "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}) + } + DataType::Date32 => { + json!({"name": "date", "unit": "DAY"}) + } + DataType::Date64 => { + json!({"name": "date", "unit": "MILLISECOND"}) + } + DataType::Timestamp(unit, None) => { + json!({"name": "timestamp", "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}) + } + DataType::Timestamp(unit, Some(tz)) => { + json!({"name": "timestamp", "unit": 
match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }, "timezone": tz}) + } + DataType::Interval(unit) => json!({"name": "interval", "unit": match unit { + IntervalUnit::YearMonth => "YEAR_MONTH", + IntervalUnit::DayTime => "DAY_TIME", + IntervalUnit::MonthDayNano => "MONTH_DAY_NANO", + }}), + DataType::Duration(unit) => json!({"name": "duration", "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}), + DataType::Dictionary(_, _) => json!({ "name": "dictionary"}), + DataType::Decimal128(precision, scale) => { + json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 128}) + } + DataType::Decimal256(precision, scale) => { + json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 256}) + } + DataType::Map(_, keys_sorted) => { + json!({"name": "map", "keysSorted": keys_sorted}) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::Value; + + #[test] + fn parse_utf8_from_json() { + let json = "{\"name\":\"utf8\"}"; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = data_type_from_json(&value).unwrap(); + assert_eq!(DataType::Utf8, dt); + } + + #[test] + fn parse_int32_from_json() { + let json = "{\"name\": \"int\", \"isSigned\": true, \"bitWidth\": 32}"; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = data_type_from_json(&value).unwrap(); + assert_eq!(DataType::Int32, dt); + } +} diff --git a/integration-testing/src/util/field.rs b/integration-testing/src/util/field.rs new file mode 100644 index 000000000000..a2becc004d13 --- /dev/null +++ b/integration-testing/src/util/field.rs @@ -0,0 +1,586 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::util::datatype::{data_type_from_json, data_type_to_json}; +use arrow::datatypes::{DataType, Field}; +use arrow::error::{ArrowError, Result}; +use std::collections::BTreeMap; + +/// Parse a `Field` definition from a JSON representation. 
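+///
+/// An illustrative sketch only; the behaviour is covered in detail by the
+/// `parse_struct_from_json` and `parse_map_from_json` tests further below:
+///
+/// ```ignore
+/// let json = r#"{"name": "first_name", "nullable": false,
+///                "type": {"name": "utf8"}, "children": []}"#;
+/// let value: serde_json::Value = serde_json::from_str(json).unwrap();
+/// let field = field_from_json(&value).unwrap();
+/// assert_eq!(field, Field::new("first_name", DataType::Utf8, false));
+/// ```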
+pub fn field_from_json(json: &serde_json::Value) -> Result<Field> {
+    use serde_json::Value;
+    match *json {
+        Value::Object(ref map) => {
+            let name = match map.get("name") {
+                Some(&Value::String(ref name)) => name.to_string(),
+                _ => {
+                    return Err(ArrowError::ParseError(
+                        "Field missing 'name' attribute".to_string(),
+                    ));
+                }
+            };
+            let nullable = match map.get("nullable") {
+                Some(&Value::Bool(b)) => b,
+                _ => {
+                    return Err(ArrowError::ParseError(
+                        "Field missing 'nullable' attribute".to_string(),
+                    ));
+                }
+            };
+            let data_type = match map.get("type") {
+                Some(t) => data_type_from_json(t)?,
+                _ => {
+                    return Err(ArrowError::ParseError(
+                        "Field missing 'type' attribute".to_string(),
+                    ));
+                }
+            };
+
+            // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz
+            let metadata = match map.get("metadata") {
+                Some(&Value::Array(ref values)) => {
+                    let mut res: BTreeMap<String, String> = BTreeMap::new();
+                    for value in values {
+                        match value.as_object() {
+                            Some(map) => {
+                                if map.len() != 2 {
+                                    return Err(ArrowError::ParseError(
+                                        "Field 'metadata' must have exactly two entries for each key-value map".to_string(),
+                                    ));
+                                }
+                                if let (Some(k), Some(v)) =
+                                    (map.get("key"), map.get("value"))
+                                {
+                                    if let (Some(k_str), Some(v_str)) =
+                                        (k.as_str(), v.as_str())
+                                    {
+                                        res.insert(
+                                            k_str.to_string().clone(),
+                                            v_str.to_string().clone(),
+                                        );
+                                    } else {
+                                        return Err(ArrowError::ParseError("Field 'metadata' must have map value of string type".to_string()));
+                                    }
+                                } else {
+                                    return Err(ArrowError::ParseError("Field 'metadata' lacks map keys named \"key\" or \"value\"".to_string()));
+                                }
+                            }
+                            _ => {
+                                return Err(ArrowError::ParseError(
+                                    "Field 'metadata' contains non-object key-value pair"
+                                        .to_string(),
+                                ));
+                            }
+                        }
+                    }
+                    Some(res)
+                }
+                // We also support map format, because Schema's metadata supports this.
+ // See https://github.com/apache/arrow/pull/5907 + Some(&Value::Object(ref values)) => { + let mut res: BTreeMap = BTreeMap::new(); + for (k, v) in values { + if let Some(str_value) = v.as_str() { + res.insert(k.clone(), str_value.to_string().clone()); + } else { + return Err(ArrowError::ParseError(format!( + "Field 'metadata' contains non-string value for key {}", + k + ))); + } + } + Some(res) + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field `metadata` is not json array".to_string(), + )); + } + _ => None, + }; + + // if data_type is a struct or list, get its children + let data_type = match data_type { + DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) => match map.get("children") { + Some(Value::Array(values)) => { + if values.len() != 1 { + return Err(ArrowError::ParseError( + "Field 'children' must have one element for a list data type".to_string(), + )); + } + match data_type { + DataType::List(_) => { + DataType::List(Box::new(field_from_json(&values[0])?)) + } + DataType::LargeList(_) => DataType::LargeList(Box::new( + field_from_json(&values[0])?, + )), + DataType::FixedSizeList(_, int) => DataType::FixedSizeList( + Box::new(field_from_json(&values[0])?), + int, + ), + _ => unreachable!( + "Data type should be a list, largelist or fixedsizelist" + ), + } + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array".to_string(), + )) + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + }, + DataType::Struct(mut fields) => match map.get("children") { + Some(Value::Array(values)) => { + let struct_fields: Result> = + values.iter().map(field_from_json).collect(); + fields.append(&mut struct_fields?); + DataType::Struct(fields) + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array".to_string(), + )) + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + }, + DataType::Map(_, keys_sorted) => { + match map.get("children") { + Some(Value::Array(values)) if values.len() == 1 => { + let child = field_from_json(&values[0])?; + // child must be a struct + match child.data_type() { + DataType::Struct(map_fields) if map_fields.len() == 2 => { + DataType::Map(Box::new(child), keys_sorted) + } + t => { + return Err(ArrowError::ParseError( + format!("Map children should be a struct with 2 fields, found {:?}", t) + )) + } + } + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array with 1 element" + .to_string(), + )) + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + } + } + DataType::Union(_, type_ids, mode) => match map.get("children") { + Some(Value::Array(values)) => { + let union_fields: Vec = + values.iter().map(field_from_json).collect::>()?; + DataType::Union(union_fields, type_ids, mode) + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array".to_string(), + )) + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + }, + _ => data_type, + }; + + let mut dict_id = 0; + let mut dict_is_ordered = false; + + let data_type = match map.get("dictionary") { + Some(dictionary) => { + let index_type = match dictionary.get("indexType") { + Some(t) => data_type_from_json(t)?, + _ => { + return Err(ArrowError::ParseError( + "Field missing 'indexType' 
attribute".to_string(), + )); + } + }; + dict_id = match dictionary.get("id") { + Some(Value::Number(n)) => n.as_i64().unwrap(), + _ => { + return Err(ArrowError::ParseError( + "Field missing 'id' attribute".to_string(), + )); + } + }; + dict_is_ordered = match dictionary.get("isOrdered") { + Some(&Value::Bool(n)) => n, + _ => { + return Err(ArrowError::ParseError( + "Field missing 'isOrdered' attribute".to_string(), + )); + } + }; + DataType::Dictionary(Box::new(index_type), Box::new(data_type)) + } + _ => data_type, + }; + + let mut field = + Field::new_dict(&name, data_type, nullable, dict_id, dict_is_ordered); + field.set_metadata(metadata); + Ok(field) + } + _ => Err(ArrowError::ParseError( + "Invalid json value type for field".to_string(), + )), + } +} + +/// Generate a JSON representation of the `Field`. +pub fn field_to_json(field: &Field) -> serde_json::Value { + let children: Vec = match field.data_type() { + DataType::Struct(fields) => fields.iter().map(field_to_json).collect(), + DataType::List(field) + | DataType::LargeList(field) + | DataType::FixedSizeList(field, _) + | DataType::Map(field, _) => vec![field_to_json(field)], + _ => vec![], + }; + + match field.data_type() { + DataType::Dictionary(ref index_type, ref value_type) => serde_json::json!({ + "name": field.name(), + "nullable": field.is_nullable(), + "type": data_type_to_json(value_type), + "children": children, + "dictionary": { + "id": field.dict_id().unwrap(), + "indexType": data_type_to_json(index_type), + "isOrdered": field.dict_is_ordered().unwrap(), + } + }), + _ => serde_json::json!({ + "name": field.name(), + "nullable": field.is_nullable(), + "type": data_type_to_json(field.data_type()), + "children": children + }), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::datatypes::UnionMode; + use serde_json::Value; + + #[test] + fn struct_field_to_json() { + let f = Field::new( + "address", + DataType::Struct(vec![ + Field::new("street", DataType::Utf8, false), + Field::new("zip", DataType::UInt16, false), + ]), + false, + ); + let value: Value = serde_json::from_str( + r#"{ + "name": "address", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "street", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "zip", + "nullable": false, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + }"#, + ) + .unwrap(); + assert_eq!(value, field_to_json(&f)); + } + + #[test] + fn map_field_to_json() { + let f = Field::new( + "my_map", + DataType::Map( + Box::new(Field::new( + "my_entries", + DataType::Struct(vec![ + Field::new("my_keys", DataType::Utf8, false), + Field::new("my_values", DataType::UInt16, true), + ]), + false, + )), + true, + ), + false, + ); + let value: Value = serde_json::from_str( + r#"{ + "name": "my_map", + "nullable": false, + "type": { + "name": "map", + "keysSorted": true + }, + "children": [ + { + "name": "my_entries", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "my_keys", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "my_values", + "nullable": true, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + } + ] + }"#, + ) + .unwrap(); + assert_eq!(value, field_to_json(&f)); + } + + #[test] + fn primitive_field_to_json() { + let f = Field::new("first_name", DataType::Utf8, false); + let value: Value = serde_json::from_str( + r#"{ + 
"name": "first_name", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }"#, + ) + .unwrap(); + assert_eq!(value, field_to_json(&f)); + } + #[test] + fn parse_struct_from_json() { + let json = r#" + { + "name": "address", + "type": { + "name": "struct" + }, + "nullable": false, + "children": [ + { + "name": "street", + "type": { + "name": "utf8" + }, + "nullable": false, + "children": [] + }, + { + "name": "zip", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 16 + }, + "nullable": false, + "children": [] + } + ] + } + "#; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = field_from_json(&value).unwrap(); + + let expected = Field::new( + "address", + DataType::Struct(vec![ + Field::new("street", DataType::Utf8, false), + Field::new("zip", DataType::UInt16, false), + ]), + false, + ); + + assert_eq!(expected, dt); + } + + #[test] + fn parse_map_from_json() { + let json = r#" + { + "name": "my_map", + "nullable": false, + "type": { + "name": "map", + "keysSorted": true + }, + "children": [ + { + "name": "my_entries", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "my_keys", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "my_values", + "nullable": true, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + } + ] + } + "#; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = field_from_json(&value).unwrap(); + + let expected = Field::new( + "my_map", + DataType::Map( + Box::new(Field::new( + "my_entries", + DataType::Struct(vec![ + Field::new("my_keys", DataType::Utf8, false), + Field::new("my_values", DataType::UInt16, true), + ]), + false, + )), + true, + ), + false, + ); + + assert_eq!(expected, dt); + } + + #[test] + fn parse_union_from_json() { + let json = r#" + { + "name": "my_union", + "nullable": false, + "type": { + "name": "union", + "mode": "SPARSE", + "typeIds": [ + 5, + 7 + ] + }, + "children": [ + { + "name": "f1", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + }, + { + "name": "f2", + "type": { + "name": "utf8" + }, + "nullable": true, + "children": [] + } + ] + } + "#; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = field_from_json(&value).unwrap(); + + let expected = Field::new( + "my_union", + DataType::Union( + vec![ + Field::new("f1", DataType::Int32, true), + Field::new("f2", DataType::Utf8, true), + ], + vec![5, 7], + UnionMode::Sparse, + ), + false, + ); + + assert_eq!(expected, dt); + } +} diff --git a/integration-testing/src/util.rs b/integration-testing/src/util/mod.rs similarity index 99% rename from integration-testing/src/util.rs rename to integration-testing/src/util/mod.rs index e098c4e1491a..9ecd301360fe 100644 --- a/integration-testing/src/util.rs +++ b/integration-testing/src/util/mod.rs @@ -36,7 +36,17 @@ use arrow::record_batch::{RecordBatch, RecordBatchReader}; use arrow::util::bit_util; use arrow::util::decimal::Decimal256; +mod datatype; +mod field; +mod schema; + +use crate::util::datatype::data_type_to_json; +use crate::util::field::field_from_json; +pub use schema::*; + /// A struct that represents an Arrow file with a schema and record batches +/// +/// See #[derive(Deserialize, Serialize, Debug)] pub struct ArrowJson { pub schema: ArrowJsonSchema, @@ -90,7 +100,7 @@ impl From<&Field> for ArrowJsonField { Self { name: field.name().to_string(), - field_type: 
field.data_type().to_json(), + field_type: data_type_to_json(field.data_type()), nullable: field.is_nullable(), children: vec![], dictionary: None, // TODO: not enough info @@ -256,7 +266,7 @@ impl ArrowJsonField { fn to_arrow_field(&self) -> Result { // a bit regressive, but we have to convert the field to JSON in order to convert it let field = serde_json::to_value(self)?; - Field::from(&field) + field_from_json(&field) } } diff --git a/integration-testing/src/util/schema.rs b/integration-testing/src/util/schema.rs new file mode 100644 index 000000000000..7e3475e6f460 --- /dev/null +++ b/integration-testing/src/util/schema.rs @@ -0,0 +1,733 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::util::field::{field_from_json, field_to_json}; +use arrow::datatypes::Schema; +use arrow::error::{ArrowError, Result}; +use std::collections::HashMap; + +/// Generate a JSON representation of the `Schema`. +pub fn schema_to_json(schema: &Schema) -> serde_json::Value { + serde_json::json!({ + "fields": schema.fields().iter().map(field_to_json).collect::>(), + "metadata": serde_json::to_value(schema.metadata()).unwrap() + }) +} + +/// Parse a `Schema` definition from a JSON representation. +pub fn schema_from_json(json: &serde_json::Value) -> Result { + use serde_json::Value; + match *json { + Value::Object(ref schema) => { + let fields = if let Some(Value::Array(fields)) = schema.get("fields") { + fields.iter().map(field_from_json).collect::>()? + } else { + return Err(ArrowError::ParseError( + "Schema fields should be an array".to_string(), + )); + }; + + let metadata = if let Some(value) = schema.get("metadata") { + from_metadata(value)? + } else { + HashMap::default() + }; + + Ok(Schema::new_with_metadata(fields, metadata)) + } + _ => Err(ArrowError::ParseError( + "Invalid json value type for schema".to_string(), + )), + } +} + +/// Parse a `metadata` definition from a JSON representation. +/// The JSON can either be an Object or an Array of Objects. 
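+///
+/// Illustrative only: both of the following shapes are accepted and produce
+/// the same `{"k": "v"}` entry in the resulting `HashMap`:
+///
+/// ```ignore
+/// let as_array = serde_json::json!([{"key": "k", "value": "v"}]);
+/// let as_object = serde_json::json!({"k": "v"});
+/// ```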
+fn from_metadata(json: &serde_json::Value) -> Result> { + use serde_json::Value; + match json { + Value::Array(_) => { + let mut hashmap = HashMap::new(); + let values: Vec = serde_json::from_value(json.clone()) + .map_err(|_| { + ArrowError::JsonError( + "Unable to parse object into key-value pair".to_string(), + ) + })?; + for meta in values { + hashmap.insert(meta.key.clone(), meta.value); + } + Ok(hashmap) + } + Value::Object(md) => md + .iter() + .map(|(k, v)| { + if let Value::String(v) = v { + Ok((k.to_string(), v.to_string())) + } else { + Err(ArrowError::ParseError( + "metadata `value` field must be a string".to_string(), + )) + } + }) + .collect::>(), + _ => Err(ArrowError::ParseError( + "`metadata` field must be an object".to_string(), + )), + } +} + +#[derive(serde::Deserialize)] +struct MetadataKeyValue { + key: String, + value: String, +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; + use serde_json::Value; + + #[test] + fn schema_json() { + // Add some custom metadata + let metadata: HashMap = + [("Key".to_string(), "Value".to_string())] + .iter() + .cloned() + .collect(); + + let schema = Schema::new_with_metadata( + vec![ + Field::new("c1", DataType::Utf8, false), + Field::new("c2", DataType::Binary, false), + Field::new("c3", DataType::FixedSizeBinary(3), false), + Field::new("c4", DataType::Boolean, false), + Field::new("c5", DataType::Date32, false), + Field::new("c6", DataType::Date64, false), + Field::new("c7", DataType::Time32(TimeUnit::Second), false), + Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false), + Field::new("c9", DataType::Time32(TimeUnit::Microsecond), false), + Field::new("c10", DataType::Time32(TimeUnit::Nanosecond), false), + Field::new("c11", DataType::Time64(TimeUnit::Second), false), + Field::new("c12", DataType::Time64(TimeUnit::Millisecond), false), + Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false), + Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false), + Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false), + Field::new( + "c16", + DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".to_string())), + false, + ), + Field::new( + "c17", + DataType::Timestamp( + TimeUnit::Microsecond, + Some("Africa/Johannesburg".to_string()), + ), + false, + ), + Field::new( + "c18", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), + Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), + Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false), + Field::new( + "c22", + DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + false, + ), + Field::new( + "c23", + DataType::FixedSizeList( + Box::new(Field::new("bools", DataType::Boolean, false)), + 5, + ), + false, + ), + Field::new( + "c24", + DataType::List(Box::new(Field::new( + "inner_list", + DataType::List(Box::new(Field::new( + "struct", + DataType::Struct(vec![]), + true, + ))), + false, + ))), + true, + ), + Field::new( + "c25", + DataType::Struct(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::UInt16, false), + ]), + false, + ), + Field::new("c26", DataType::Interval(IntervalUnit::YearMonth), true), + Field::new("c27", DataType::Interval(IntervalUnit::DayTime), true), + Field::new("c28", DataType::Interval(IntervalUnit::MonthDayNano), true), + Field::new("c29", DataType::Duration(TimeUnit::Second), false), + 
Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false), + Field::new("c31", DataType::Duration(TimeUnit::Microsecond), false), + Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), + Field::new_dict( + "c33", + DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Utf8), + ), + true, + 123, + true, + ), + Field::new("c34", DataType::LargeBinary, true), + Field::new("c35", DataType::LargeUtf8, true), + Field::new( + "c36", + DataType::LargeList(Box::new(Field::new( + "inner_large_list", + DataType::LargeList(Box::new(Field::new( + "struct", + DataType::Struct(vec![]), + false, + ))), + true, + ))), + true, + ), + Field::new( + "c37", + DataType::Map( + Box::new(Field::new( + "my_entries", + DataType::Struct(vec![ + Field::new("my_keys", DataType::Utf8, false), + Field::new("my_values", DataType::UInt16, true), + ]), + false, + )), + true, + ), + false, + ), + ], + metadata, + ); + + let expected = schema_to_json(&schema); + let json = r#"{ + "fields": [ + { + "name": "c1", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "c2", + "nullable": false, + "type": { + "name": "binary" + }, + "children": [] + }, + { + "name": "c3", + "nullable": false, + "type": { + "name": "fixedsizebinary", + "byteWidth": 3 + }, + "children": [] + }, + { + "name": "c4", + "nullable": false, + "type": { + "name": "bool" + }, + "children": [] + }, + { + "name": "c5", + "nullable": false, + "type": { + "name": "date", + "unit": "DAY" + }, + "children": [] + }, + { + "name": "c6", + "nullable": false, + "type": { + "name": "date", + "unit": "MILLISECOND" + }, + "children": [] + }, + { + "name": "c7", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 32, + "unit": "SECOND" + }, + "children": [] + }, + { + "name": "c8", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 32, + "unit": "MILLISECOND" + }, + "children": [] + }, + { + "name": "c9", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 32, + "unit": "MICROSECOND" + }, + "children": [] + }, + { + "name": "c10", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 32, + "unit": "NANOSECOND" + }, + "children": [] + }, + { + "name": "c11", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 64, + "unit": "SECOND" + }, + "children": [] + }, + { + "name": "c12", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 64, + "unit": "MILLISECOND" + }, + "children": [] + }, + { + "name": "c13", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 64, + "unit": "MICROSECOND" + }, + "children": [] + }, + { + "name": "c14", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 64, + "unit": "NANOSECOND" + }, + "children": [] + }, + { + "name": "c15", + "nullable": false, + "type": { + "name": "timestamp", + "unit": "SECOND" + }, + "children": [] + }, + { + "name": "c16", + "nullable": false, + "type": { + "name": "timestamp", + "unit": "MILLISECOND", + "timezone": "UTC" + }, + "children": [] + }, + { + "name": "c17", + "nullable": false, + "type": { + "name": "timestamp", + "unit": "MICROSECOND", + "timezone": "Africa/Johannesburg" + }, + "children": [] + }, + { + "name": "c18", + "nullable": false, + "type": { + "name": "timestamp", + "unit": "NANOSECOND" + }, + "children": [] + }, + { + "name": "c19", + "nullable": false, + "type": { + "name": "interval", + "unit": "DAY_TIME" + }, + "children": [] + }, + { + "name": "c20", + "nullable": false, + "type": { + "name": "interval", + 
"unit": "YEAR_MONTH" + }, + "children": [] + }, + { + "name": "c21", + "nullable": false, + "type": { + "name": "interval", + "unit": "MONTH_DAY_NANO" + }, + "children": [] + }, + { + "name": "c22", + "nullable": false, + "type": { + "name": "list" + }, + "children": [ + { + "name": "item", + "nullable": true, + "type": { + "name": "bool" + }, + "children": [] + } + ] + }, + { + "name": "c23", + "nullable": false, + "type": { + "name": "fixedsizelist", + "listSize": 5 + }, + "children": [ + { + "name": "bools", + "nullable": false, + "type": { + "name": "bool" + }, + "children": [] + } + ] + }, + { + "name": "c24", + "nullable": true, + "type": { + "name": "list" + }, + "children": [ + { + "name": "inner_list", + "nullable": false, + "type": { + "name": "list" + }, + "children": [ + { + "name": "struct", + "nullable": true, + "type": { + "name": "struct" + }, + "children": [] + } + ] + } + ] + }, + { + "name": "c25", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "a", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "b", + "nullable": false, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + }, + { + "name": "c26", + "nullable": true, + "type": { + "name": "interval", + "unit": "YEAR_MONTH" + }, + "children": [] + }, + { + "name": "c27", + "nullable": true, + "type": { + "name": "interval", + "unit": "DAY_TIME" + }, + "children": [] + }, + { + "name": "c28", + "nullable": true, + "type": { + "name": "interval", + "unit": "MONTH_DAY_NANO" + }, + "children": [] + }, + { + "name": "c29", + "nullable": false, + "type": { + "name": "duration", + "unit": "SECOND" + }, + "children": [] + }, + { + "name": "c30", + "nullable": false, + "type": { + "name": "duration", + "unit": "MILLISECOND" + }, + "children": [] + }, + { + "name": "c31", + "nullable": false, + "type": { + "name": "duration", + "unit": "MICROSECOND" + }, + "children": [] + }, + { + "name": "c32", + "nullable": false, + "type": { + "name": "duration", + "unit": "NANOSECOND" + }, + "children": [] + }, + { + "name": "c33", + "nullable": true, + "children": [], + "type": { + "name": "utf8" + }, + "dictionary": { + "id": 123, + "indexType": { + "name": "int", + "bitWidth": 32, + "isSigned": true + }, + "isOrdered": true + } + }, + { + "name": "c34", + "nullable": true, + "type": { + "name": "largebinary" + }, + "children": [] + }, + { + "name": "c35", + "nullable": true, + "type": { + "name": "largeutf8" + }, + "children": [] + }, + { + "name": "c36", + "nullable": true, + "type": { + "name": "largelist" + }, + "children": [ + { + "name": "inner_large_list", + "nullable": true, + "type": { + "name": "largelist" + }, + "children": [ + { + "name": "struct", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [] + } + ] + } + ] + }, + { + "name": "c37", + "nullable": false, + "type": { + "name": "map", + "keysSorted": true + }, + "children": [ + { + "name": "my_entries", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "my_keys", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "my_values", + "nullable": true, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + } + ] + } + ], + "metadata" : { + "Key": "Value" + } + }"#; + let value: Value = serde_json::from_str(json).unwrap(); + assert_eq!(expected, value); + + // convert back to a schema + let value: Value = 
serde_json::from_str(json).unwrap(); + let schema2 = schema_from_json(&value).unwrap(); + + assert_eq!(schema, schema2); + + // Check that empty metadata produces empty value in JSON and can be parsed + let json = r#"{ + "fields": [ + { + "name": "c1", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + } + ], + "metadata": {} + }"#; + let value: Value = serde_json::from_str(json).unwrap(); + let schema = schema_from_json(&value).unwrap(); + assert!(schema.metadata.is_empty()); + + // Check that metadata field is not required in the JSON. + let json = r#"{ + "fields": [ + { + "name": "c1", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + } + ] + }"#; + let value: Value = serde_json::from_str(json).unwrap(); + let schema = schema_from_json(&value).unwrap(); + assert!(schema.metadata.is_empty()); + } +} From 2a0fc7703420f99d28141516cabdd0408a583dfc Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 14 Sep 2022 14:05:58 -0700 Subject: [PATCH 0046/1411] Add support of sorting dictionary of other primitive arrays (#2701) * Add support of sorting dictionary of other primitive arrays * Collapse match statements * Add one helper to match primitive types --- arrow/src/compute/kernels/sort.rs | 259 ++++++++++++++++++------------ arrow/src/datatypes/datatype.rs | 24 +++ 2 files changed, 179 insertions(+), 104 deletions(-) diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 34a321910c30..0bc2d39481e3 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -314,119 +314,32 @@ pub fn sort_to_indices( } }, DataType::Dictionary(_, _) => { + let value_null_first = if options.descending { + // When sorting dictionary in descending order, we take inverse of of null ordering + // when sorting the values. Because if `nulls_first` is true, null must be in front + // of non-null value. As we take the sorted order of value array to sort dictionary + // keys, these null values will be treated as smallest ones and be sorted to the end + // of sorted result. So we set `nulls_first` to false when sorting dictionary value + // array to make them as largest ones, then null values will be put at the beginning + // of sorted dictionary result. + !options.nulls_first + } else { + options.nulls_first + }; + let value_options = Some(SortOptions { + descending: false, + nulls_first: value_null_first, + }); downcast_dictionary_array!( values => match values.values().data_type() { - DataType::Int8 => { - let dict_values = values.values(); - let value_null_first = if options.descending { - // When sorting dictionary in descending order, we take inverse of of null ordering - // when sorting the values. Because if `nulls_first` is true, null must be in front - // of non-null value. As we take the sorted order of value array to sort dictionary - // keys, these null values will be treated as smallest ones and be sorted to the end - // of sorted result. So we set `nulls_first` to false when sorting dictionary value - // array to make them as largest ones, then null values will be put at the beginning - // of sorted dictionary result. 
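To make the null-ordering comment above concrete: the dictionary values are always sorted ascending, so the requested `nulls_first` has to be flipped whenever the keys will later be emitted in descending order. A minimal standalone sketch of just that derivation (a hypothetical helper; the kernel inlines this logic directly):

```rust
// Sketch only: mirrors the `value_null_first` computation in sort_to_indices.
fn value_nulls_first(descending: bool, nulls_first: bool) -> bool {
    if descending {
        !nulls_first
    } else {
        nulls_first
    }
}

fn main() {
    // Descending with nulls first: sort the values with nulls last, so that
    // once the key order is reversed the nulls land at the front of the result.
    assert!(!value_nulls_first(true, true));
    assert!(value_nulls_first(true, false));
    // Ascending sorts keep the requested null ordering unchanged.
    assert!(value_nulls_first(false, true));
}
```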
- !options.nulls_first - } else { - options.nulls_first - }; - let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); - let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; - let value_indices_map = prepare_indices_map(&sorted_value_indices); - sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp) - }, - DataType::Int16 => { + dt if DataType::is_primitive(dt) => { let dict_values = values.values(); - let value_null_first = if options.descending { - !options.nulls_first - } else { - options.nulls_first - }; - let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); - let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; - let value_indices_map = prepare_indices_map(&sorted_value_indices); - sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp) - }, - DataType::Int32 => { - let dict_values = values.values(); - let value_null_first = if options.descending { - !options.nulls_first - } else { - options.nulls_first - }; - let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); - let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; - let value_indices_map = prepare_indices_map(&sorted_value_indices); - sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp) - }, - DataType::Int64 => { - let dict_values = values.values(); - let value_null_first = if options.descending { - !options.nulls_first - } else { - options.nulls_first - }; - let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); - let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; - let value_indices_map = prepare_indices_map(&sorted_value_indices); - sort_primitive_dictionary::<_, _>(values, &value_indices_map,v, n, options, limit, cmp) - }, - DataType::UInt8 => { - let dict_values = values.values(); - let value_null_first = if options.descending { - !options.nulls_first - } else { - options.nulls_first - }; - let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); - let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; - let value_indices_map = prepare_indices_map(&sorted_value_indices); - sort_primitive_dictionary::<_, _>(values, &value_indices_map,v, n, options, limit, cmp) - }, - DataType::UInt16 => { - let dict_values = values.values(); - let value_null_first = if options.descending { - !options.nulls_first - } else { - options.nulls_first - }; - let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); - let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; - let value_indices_map = prepare_indices_map(&sorted_value_indices); - sort_primitive_dictionary::<_, _>(values, &value_indices_map,v, n, options, limit, cmp) - }, - DataType::UInt32 => { - let dict_values = values.values(); - let value_null_first = if options.descending { - !options.nulls_first - } else { - options.nulls_first - }; - let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); - let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; - let value_indices_map = prepare_indices_map(&sorted_value_indices); - sort_primitive_dictionary::<_, _>(values, &value_indices_map,v, n, options, limit, cmp) - }, - DataType::UInt64 => { - let dict_values = 
values.values(); - let value_null_first = if options.descending { - !options.nulls_first - } else { - options.nulls_first - }; - let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; let value_indices_map = prepare_indices_map(&sorted_value_indices); sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp) }, DataType::Utf8 => { let dict_values = values.values(); - let value_null_first = if options.descending { - !options.nulls_first - } else { - options.nulls_first - }; - let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; let value_indices_map = prepare_indices_map(&sorted_value_indices); sort_string_dictionary::<_>(values, &value_indices_map, v, n, &options, limit) @@ -3552,4 +3465,142 @@ mod tests { vec![None, None, None, Some(5), Some(5), Some(3), Some(1)], ); } + + #[test] + fn test_sort_f32_dicts() { + let keys = + Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let values = Float32Array::from(vec![1.2, 3.0, 5.1]); + test_sort_primitive_dict_arrays::( + keys, + values, + None, + None, + vec![None, None, Some(1.2), Some(3.0), Some(5.1), Some(5.1)], + ); + + let keys = + Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let values = Float32Array::from(vec![1.2, 3.0, 5.1]); + test_sort_primitive_dict_arrays::( + keys, + values, + Some(SortOptions { + descending: true, + nulls_first: false, + }), + None, + vec![Some(5.1), Some(5.1), Some(3.0), Some(1.2), None, None], + ); + + let keys = + Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let values = Float32Array::from(vec![1.2, 3.0, 5.1]); + test_sort_primitive_dict_arrays::( + keys, + values, + Some(SortOptions { + descending: false, + nulls_first: false, + }), + None, + vec![Some(1.2), Some(3.0), Some(5.1), Some(5.1), None, None], + ); + + let keys = + Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let values = Float32Array::from(vec![1.2, 3.0, 5.1]); + test_sort_primitive_dict_arrays::( + keys, + values, + Some(SortOptions { + descending: true, + nulls_first: true, + }), + Some(3), + vec![None, None, Some(5.1)], + ); + + // Values have `None`. 
+ let keys = Int8Array::from(vec![ + Some(1_i8), + None, + Some(3), + None, + Some(2), + Some(3), + Some(0), + ]); + let values = Float32Array::from(vec![Some(1.2), Some(3.0), None, Some(5.1)]); + test_sort_primitive_dict_arrays::( + keys, + values, + None, + None, + vec![None, None, None, Some(1.2), Some(3.0), Some(5.1), Some(5.1)], + ); + + let keys = Int8Array::from(vec![ + Some(1_i8), + None, + Some(3), + None, + Some(2), + Some(3), + Some(0), + ]); + let values = Float32Array::from(vec![Some(1.2), Some(3.0), None, Some(5.1)]); + test_sort_primitive_dict_arrays::( + keys, + values, + Some(SortOptions { + descending: false, + nulls_first: false, + }), + None, + vec![Some(1.2), Some(3.0), Some(5.1), Some(5.1), None, None, None], + ); + + let keys = Int8Array::from(vec![ + Some(1_i8), + None, + Some(3), + None, + Some(2), + Some(3), + Some(0), + ]); + let values = Float32Array::from(vec![Some(1.2), Some(3.0), None, Some(5.1)]); + test_sort_primitive_dict_arrays::( + keys, + values, + Some(SortOptions { + descending: true, + nulls_first: false, + }), + None, + vec![Some(5.1), Some(5.1), Some(3.0), Some(1.2), None, None, None], + ); + + let keys = Int8Array::from(vec![ + Some(1_i8), + None, + Some(3), + None, + Some(2), + Some(3), + Some(0), + ]); + let values = Float32Array::from(vec![Some(1.2), Some(3.0), None, Some(5.1)]); + test_sort_primitive_dict_arrays::( + keys, + values, + Some(SortOptions { + descending: true, + nulls_first: true, + }), + None, + vec![None, None, None, Some(5.1), Some(5.1), Some(3.0), Some(1.2)], + ); + } } diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index 2ca71ef77725..d3189b8b18cc 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -1070,6 +1070,30 @@ impl DataType { ) } + /// Returns true if the type is primitive: (numeric, temporal). + pub fn is_primitive(t: &DataType) -> bool { + use DataType::*; + matches!( + t, + Int8 | Int16 + | Int32 + | Int64 + | UInt8 + | UInt16 + | UInt32 + | UInt64 + | Float32 + | Float64 + | Date32 + | Date64 + | Time32(_) + | Time64(_) + | Timestamp(_, _) + | Interval(_) + | Duration(_) + ) + } + /// Returns true if this type is temporal: (Date*, Time*, Duration, or Interval). pub fn is_temporal(t: &DataType) -> bool { use DataType::*; From 7594db6367515473efdb130e7de91060079a4d88 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 14 Sep 2022 16:54:23 -0700 Subject: [PATCH 0047/1411] Add overflow-checking variants of arithmetic scalar dyn kernels (#2713) * Add overflow-checking variants of arithmetic scalar dyn kernels * Update doc * For review --- arrow/src/compute/kernels/arithmetic.rs | 199 +++++++++++++++++++++--- arrow/src/compute/kernels/arity.rs | 50 +++++- 2 files changed, 226 insertions(+), 23 deletions(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index a344407e426d..04fe2393ec4d 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -22,7 +22,7 @@ //! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. 
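The `_checked_dyn` variants added by this commit report overflow as an `Err` instead of silently wrapping. A usage sketch mirroring the tests added further down in this patch (import paths assumed from the crate layout shown here):

```rust
use arrow::array::{ArrayRef, Int32Array};
use arrow::compute::kernels::arithmetic::{add_scalar_checked_dyn, add_scalar_dyn};
use arrow::datatypes::Int32Type;

fn main() {
    let a = Int32Array::from(vec![i32::MAX, 1]);

    // The non-checking variant wraps around on overflow.
    let wrapped: ArrayRef = add_scalar_dyn::<Int32Type>(&a, 1).unwrap();
    assert_eq!(wrapped.len(), 2);

    // The checked variant surfaces the overflow as an error instead.
    assert!(add_scalar_checked_dyn::<Int32Type>(&a, 1).is_err());
}
```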
-use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; +use std::ops::{Div, Neg, Rem}; use num::{One, Zero}; @@ -32,7 +32,9 @@ use crate::buffer::Buffer; use crate::buffer::MutableBuffer; use crate::compute::kernels::arity::unary; use crate::compute::util::combine_option_bitmap; -use crate::compute::{binary, binary_opt, try_binary, try_unary, unary_dyn}; +use crate::compute::{ + binary, binary_opt, try_binary, try_unary, try_unary_dyn, unary_dyn, +}; use crate::datatypes::{ native_op::ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, @@ -834,12 +836,39 @@ where /// Add every value in an array by a scalar. If any value in the array is null then the /// result is also null. The given array must be a `PrimitiveArray` of the type same as /// the scalar, or a `DictionaryArray` of the value type same as the scalar. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `add_scalar_checked_dyn` instead. +/// +/// This returns an `Err` when the input array is not supported for adding operation. pub fn add_scalar_dyn(array: &dyn Array, scalar: T::Native) -> Result where T: ArrowNumericType, - T::Native: Add, + T::Native: ArrowNativeTypeOp, +{ + unary_dyn::<_, T>(array, |value| value.add_wrapping(scalar)) +} + +/// Add every value in an array by a scalar. If any value in the array is null then the +/// result is also null. The given array must be a `PrimitiveArray` of the type same as +/// the scalar, or a `DictionaryArray` of the value type same as the scalar. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `add_scalar_dyn` instead. +/// +/// As this kernel has the branching costs and also prevents LLVM from vectorising it correctly, +/// it is usually much slower than non-checking variant. +pub fn add_scalar_checked_dyn(array: &dyn Array, scalar: T::Native) -> Result +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { - unary_dyn::<_, T>(array, |value| value + scalar) + try_unary_dyn::<_, T>(array, |value| { + value.add_checked(scalar).ok_or_else(|| { + ArrowError::CastError(format!("Overflow: adding {:?} to {:?}", scalar, value)) + }) + }) + .map(|a| Arc::new(a) as ArrayRef) } /// Perform `left - right` operation on two arrays. If either left or right value is null @@ -937,16 +966,40 @@ where /// Subtract every value in an array by a scalar. If any value in the array is null then the /// result is also null. The given array must be a `PrimitiveArray` of the type same as /// the scalar, or a `DictionaryArray` of the value type same as the scalar. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `subtract_scalar_checked_dyn` instead. pub fn subtract_scalar_dyn(array: &dyn Array, scalar: T::Native) -> Result where - T: datatypes::ArrowNumericType, - T::Native: Add - + Sub - + Mul - + Div - + Zero, + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, +{ + unary_dyn::<_, T>(array, |value| value.sub_wrapping(scalar)) +} + +/// Subtract every value in an array by a scalar. If any value in the array is null then the +/// result is also null. The given array must be a `PrimitiveArray` of the type same as +/// the scalar, or a `DictionaryArray` of the value type same as the scalar. +/// +/// This detects overflow and returns an `Err` for that. 
For an non-overflow-checking variant, +/// use `subtract_scalar_dyn` instead. +pub fn subtract_scalar_checked_dyn( + array: &dyn Array, + scalar: T::Native, +) -> Result +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { - unary_dyn::<_, T>(array, |value| value - scalar) + try_unary_dyn::<_, T>(array, |value| { + value.sub_checked(scalar).ok_or_else(|| { + ArrowError::CastError(format!( + "Overflow: subtracting {:?} from {:?}", + scalar, value + )) + }) + }) + .map(|a| Arc::new(a) as ArrayRef) } /// Perform `-` operation on an array. If value is null then the result is also null. @@ -1065,18 +1118,40 @@ where /// Multiply every value in an array by a scalar. If any value in the array is null then the /// result is also null. The given array must be a `PrimitiveArray` of the type same as /// the scalar, or a `DictionaryArray` of the value type same as the scalar. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `multiply_scalar_checked_dyn` instead. pub fn multiply_scalar_dyn(array: &dyn Array, scalar: T::Native) -> Result where T: ArrowNumericType, - T::Native: Add - + Sub - + Mul - + Div - + Rem - + Zero - + One, + T::Native: ArrowNativeTypeOp, +{ + unary_dyn::<_, T>(array, |value| value.mul_wrapping(scalar)) +} + +/// Subtract every value in an array by a scalar. If any value in the array is null then the +/// result is also null. The given array must be a `PrimitiveArray` of the type same as +/// the scalar, or a `DictionaryArray` of the value type same as the scalar. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `multiply_scalar_dyn` instead. +pub fn multiply_scalar_checked_dyn( + array: &dyn Array, + scalar: T::Native, +) -> Result +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { - unary_dyn::<_, T>(array, |value| value * scalar) + try_unary_dyn::<_, T>(array, |value| { + value.mul_checked(scalar).ok_or_else(|| { + ArrowError::CastError(format!( + "Overflow: multiplying {:?} by {:?}", + value, scalar + )) + }) + }) + .map(|a| Arc::new(a) as ArrayRef) } /// Perform `left % right` operation on two arrays. If either left or right value is null @@ -1223,15 +1298,48 @@ where /// result is also null. If the scalar is zero then the result of this operation will be /// `Err(ArrowError::DivideByZero)`. The given array must be a `PrimitiveArray` of the type /// same as the scalar, or a `DictionaryArray` of the value type same as the scalar. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `divide_scalar_checked_dyn` instead. pub fn divide_scalar_dyn(array: &dyn Array, divisor: T::Native) -> Result where T: ArrowNumericType, - T::Native: Div + Zero, + T::Native: ArrowNativeTypeOp + Zero, +{ + if divisor.is_zero() { + return Err(ArrowError::DivideByZero); + } + unary_dyn::<_, T>(array, |value| value.div_wrapping(divisor)) +} + +/// Divide every value in an array by a scalar. If any value in the array is null then the +/// result is also null. If the scalar is zero then the result of this operation will be +/// `Err(ArrowError::DivideByZero)`. The given array must be a `PrimitiveArray` of the type +/// same as the scalar, or a `DictionaryArray` of the value type same as the scalar. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `divide_scalar_dyn` instead. 
+pub fn divide_scalar_checked_dyn( + array: &dyn Array, + divisor: T::Native, +) -> Result +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp + Zero, { if divisor.is_zero() { return Err(ArrowError::DivideByZero); } - unary_dyn::<_, T>(array, |value| value / divisor) + + try_unary_dyn::<_, T>(array, |value| { + value.div_checked(divisor).ok_or_else(|| { + ArrowError::CastError(format!( + "Overflow: dividing {:?} by {:?}", + value, divisor + )) + }) + }) + .map(|a| Arc::new(a) as ArrayRef) } #[cfg(test)] @@ -2222,6 +2330,55 @@ mod tests { overflow.expect_err("overflow should be detected"); } + #[test] + fn test_primitive_add_scalar_dyn_wrapping_overflow() { + let a = Int32Array::from(vec![i32::MAX, i32::MIN]); + + let wrapped = add_scalar_dyn::(&a, 1).unwrap(); + let expected = + Arc::new(Int32Array::from(vec![-2147483648, -2147483647])) as ArrayRef; + assert_eq!(&expected, &wrapped); + + let overflow = add_scalar_checked_dyn::(&a, 1); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_primitive_subtract_scalar_dyn_wrapping_overflow() { + let a = Int32Array::from(vec![-2]); + + let wrapped = subtract_scalar_dyn::(&a, i32::MAX).unwrap(); + let expected = Arc::new(Int32Array::from(vec![i32::MAX])) as ArrayRef; + assert_eq!(&expected, &wrapped); + + let overflow = subtract_scalar_checked_dyn::(&a, i32::MAX); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_primitive_mul_scalar_dyn_wrapping_overflow() { + let a = Int32Array::from(vec![10]); + + let wrapped = multiply_scalar_dyn::(&a, i32::MAX).unwrap(); + let expected = Arc::new(Int32Array::from(vec![-10])) as ArrayRef; + assert_eq!(&expected, &wrapped); + + let overflow = multiply_scalar_checked_dyn::(&a, i32::MAX); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_primitive_div_scalar_dyn_wrapping_overflow() { + let a = Int32Array::from(vec![i32::MIN]); + + let wrapped = divide_scalar_dyn::(&a, -1).unwrap(); + let expected = Arc::new(Int32Array::from(vec![-2147483648])) as ArrayRef; + assert_eq!(&expected, &wrapped); + + let overflow = divide_scalar_checked_dyn::(&a, -1); + overflow.expect_err("overflow should be detected"); + } + #[test] fn test_primitive_div_opt_overflow_division_by_zero() { let a = Int32Array::from(vec![i32::MIN]); diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index fffa81af8190..21c633116ee0 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -123,7 +123,7 @@ where Ok(unsafe { build_primitive_array(len, buffer.finish(), null_count, null_buffer) }) } -/// A helper function that applies an unary function to a dictionary array with primitive value type. +/// A helper function that applies an infallible unary function to a dictionary array with primitive value type. fn unary_dict(array: &DictionaryArray, op: F) -> Result where K: ArrowNumericType, @@ -138,7 +138,22 @@ where Ok(Arc::new(new_dict)) } -/// Applies an unary function to an array with primitive values. +/// A helper function that applies a fallible unary function to a dictionary array with primitive value type. 
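The fallible helpers described here and defined just below (`try_unary_dict`, `try_unary_dyn`) let the per-value closure abort the whole kernel with the first error it produces, which is what the checked arithmetic kernels above build on. A small usage sketch (import path assumed; not part of the patch):

```rust
use arrow::array::Int32Array;
use arrow::compute::kernels::arity::try_unary_dyn;
use arrow::datatypes::Int32Type;
use arrow::error::ArrowError;

fn main() {
    let a = Int32Array::from(vec![1, 2, i32::MAX]);

    // The closure is fallible; the first Err is propagated out of the kernel.
    let doubled = try_unary_dyn::<_, Int32Type>(&a, |v| {
        v.checked_mul(2)
            .ok_or_else(|| ArrowError::ComputeError(format!("overflow doubling {}", v)))
    });
    assert!(doubled.is_err()); // i32::MAX * 2 overflows
}
```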
+fn try_unary_dict(array: &DictionaryArray, op: F) -> Result +where + K: ArrowNumericType, + T: ArrowPrimitiveType, + F: Fn(T::Native) -> Result, +{ + let dict_values = array.values().as_any().downcast_ref().unwrap(); + let values = try_unary::(dict_values, op)?.into_data(); + let data = array.data().clone().into_builder().child_data(vec![values]); + + let new_dict: DictionaryArray = unsafe { data.build_unchecked() }.into(); + Ok(Arc::new(new_dict)) +} + +/// Applies an infallible unary function to an array with primitive values. pub fn unary_dyn(array: &dyn Array, op: F) -> Result where T: ArrowPrimitiveType, @@ -162,6 +177,37 @@ where } } +/// Applies a fallible unary function to an array with primitive values. +pub fn try_unary_dyn(array: &dyn Array, op: F) -> Result +where + T: ArrowPrimitiveType, + F: Fn(T::Native) -> Result, +{ + downcast_dictionary_array! { + array => if array.values().data_type() == &T::DATA_TYPE { + try_unary_dict::<_, F, T>(array, op) + } else { + Err(ArrowError::NotYetImplemented(format!( + "Cannot perform unary operation on dictionary array of type {}", + array.data_type() + ))) + }, + t => { + if t == &T::DATA_TYPE { + Ok(Arc::new(try_unary::( + array.as_any().downcast_ref::>().unwrap(), + op, + )?)) + } else { + Err(ArrowError::NotYetImplemented(format!( + "Cannot perform unary operation on array of type {}", + t + ))) + } + } + } +} + /// Given two arrays of length `len`, calls `op(a[i], b[i])` for `i` in `0..len`, collecting /// the results in a [`PrimitiveArray`]. If any index is null in either `a` or `b`, the /// corresponding index in the result will also be null From fb016566ea4f46d461230e1586f7bb95c29d5934 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 15 Sep 2022 15:30:54 +0100 Subject: [PATCH 0048/1411] Split out arrow-buffer crate (#2594) (#2693) * Split out arrow-buffer crate (#2594) * Fix doc * Review feedback * Review feedback * Use 64-bit wide collect_bool --- Cargo.toml | 3 +- arrow-buffer/Cargo.toml | 47 +++ .../src/alloc/alignment.rs | 0 {arrow => arrow-buffer}/src/alloc/mod.rs | 32 +- .../src/buffer/immutable.rs | 4 +- arrow-buffer/src/buffer/mod.rs | 29 ++ {arrow => arrow-buffer}/src/buffer/mutable.rs | 45 +-- {arrow => arrow-buffer}/src/buffer/ops.rs | 21 +- {arrow => arrow-buffer}/src/buffer/scalar.rs | 2 +- {arrow => arrow-buffer}/src/bytes.rs | 2 +- arrow-buffer/src/lib.rs | 24 ++ arrow-buffer/src/native.rs | 303 ++++++++++++++++++ .../src/util/bit_chunk_iterator.rs | 4 +- {arrow => arrow-buffer}/src/util/bit_util.rs | 0 arrow-buffer/src/util/mod.rs | 19 ++ arrow/Cargo.toml | 2 + arrow/benches/buffer_bit_ops.rs | 6 +- arrow/src/alloc/types.rs | 73 ----- arrow/src/array/array_list.rs | 4 +- arrow/src/bitmap.rs | 29 +- arrow/src/buffer/mod.rs | 72 ----- arrow/src/compute/kernels/boolean.rs | 73 ++--- arrow/src/datatypes/native.rs | 286 +---------------- arrow/src/lib.rs | 5 +- arrow/src/util/mod.rs | 4 +- 25 files changed, 542 insertions(+), 547 deletions(-) create mode 100644 arrow-buffer/Cargo.toml rename {arrow => arrow-buffer}/src/alloc/alignment.rs (100%) rename {arrow => arrow-buffer}/src/alloc/mod.rs (84%) rename {arrow => arrow-buffer}/src/buffer/immutable.rs (99%) create mode 100644 arrow-buffer/src/buffer/mod.rs rename {arrow => arrow-buffer}/src/buffer/mutable.rs (95%) rename {arrow => arrow-buffer}/src/buffer/ops.rs (89%) rename {arrow => arrow-buffer}/src/buffer/scalar.rs (99%) rename {arrow => arrow-buffer}/src/bytes.rs (98%) create mode 100644 
arrow-buffer/src/lib.rs create mode 100644 arrow-buffer/src/native.rs rename {arrow => arrow-buffer}/src/util/bit_chunk_iterator.rs (99%) rename {arrow => arrow-buffer}/src/util/bit_util.rs (100%) create mode 100644 arrow-buffer/src/util/mod.rs delete mode 100644 arrow/src/alloc/types.rs delete mode 100644 arrow/src/buffer/mod.rs diff --git a/Cargo.toml b/Cargo.toml index 9bf55c0f2360..d0233ccb376a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,10 +18,11 @@ [workspace] members = [ "arrow", + "arrow-buffer", + "arrow-flight", "parquet", "parquet_derive", "parquet_derive_test", - "arrow-flight", "integration-testing", "object_store", ] diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml new file mode 100644 index 000000000000..87019111efcc --- /dev/null +++ b/arrow-buffer/Cargo.toml @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "arrow-buffer" +version = "22.0.0" +description = "Buffer abstractions for Apache Arrow" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_buffer" +path = "src/lib.rs" +bench = false + +[dependencies] +num = { version = "0.4", default-features = false, features = ["std"] } +half = { version = "2.0", default-features = false } + +[dev-dependencies] +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } + +[build-dependencies] diff --git a/arrow/src/alloc/alignment.rs b/arrow-buffer/src/alloc/alignment.rs similarity index 100% rename from arrow/src/alloc/alignment.rs rename to arrow-buffer/src/alloc/alignment.rs diff --git a/arrow/src/alloc/mod.rs b/arrow-buffer/src/alloc/mod.rs similarity index 84% rename from arrow/src/alloc/mod.rs rename to arrow-buffer/src/alloc/mod.rs index 526850685c48..6b09c4b31b9a 100644 --- a/arrow/src/alloc/mod.rs +++ b/arrow-buffer/src/alloc/mod.rs @@ -20,34 +20,29 @@ use std::alloc::{handle_alloc_error, Layout}; use std::fmt::{Debug, Formatter}; -use std::mem::size_of; use std::panic::RefUnwindSafe; use std::ptr::NonNull; use std::sync::Arc; mod alignment; -mod types; pub use alignment::ALIGNMENT; -pub use types::NativeType; #[inline] -unsafe fn null_pointer() -> NonNull { - NonNull::new_unchecked(ALIGNMENT as *mut T) +unsafe fn null_pointer() -> NonNull { + NonNull::new_unchecked(ALIGNMENT as *mut u8) } /// Allocates a cache-aligned memory region of `size` bytes with uninitialized values. /// This is more performant than using [allocate_aligned_zeroed] when all bytes will have /// an unknown or non-zero value and is semantically similar to `malloc`. 
-pub fn allocate_aligned(size: usize) -> NonNull { +pub fn allocate_aligned(size: usize) -> NonNull { unsafe { if size == 0 { null_pointer() } else { - let size = size * size_of::(); - let layout = Layout::from_size_align_unchecked(size, ALIGNMENT); - let raw_ptr = std::alloc::alloc(layout) as *mut T; + let raw_ptr = std::alloc::alloc(layout); NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout)) } } @@ -56,15 +51,13 @@ pub fn allocate_aligned(size: usize) -> NonNull { /// Allocates a cache-aligned memory region of `size` bytes with `0` on all of them. /// This is more performant than using [allocate_aligned] and setting all bytes to zero /// and is semantically similar to `calloc`. -pub fn allocate_aligned_zeroed(size: usize) -> NonNull { +pub fn allocate_aligned_zeroed(size: usize) -> NonNull { unsafe { if size == 0 { null_pointer() } else { - let size = size * size_of::(); - let layout = Layout::from_size_align_unchecked(size, ALIGNMENT); - let raw_ptr = std::alloc::alloc_zeroed(layout) as *mut T; + let raw_ptr = std::alloc::alloc_zeroed(layout); NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout)) } } @@ -78,9 +71,8 @@ pub fn allocate_aligned_zeroed(size: usize) -> NonNull { /// * ptr must denote a block of memory currently allocated via this allocator, /// /// * size must be the same size that was used to allocate that block of memory, -pub unsafe fn free_aligned(ptr: NonNull, size: usize) { +pub unsafe fn free_aligned(ptr: NonNull, size: usize) { if ptr != null_pointer() { - let size = size * size_of::(); std::alloc::dealloc( ptr.as_ptr() as *mut u8, Layout::from_size_align_unchecked(size, ALIGNMENT), @@ -99,13 +91,11 @@ pub unsafe fn free_aligned(ptr: NonNull, size: usize) { /// /// * new_size, when rounded up to the nearest multiple of [ALIGNMENT], must not overflow (i.e., /// the rounded value must be less than usize::MAX). -pub unsafe fn reallocate( - ptr: NonNull, +pub unsafe fn reallocate( + ptr: NonNull, old_size: usize, new_size: usize, -) -> NonNull { - let old_size = old_size * size_of::(); - let new_size = new_size * size_of::(); +) -> NonNull { if ptr == null_pointer() { return allocate_aligned(new_size); } @@ -119,7 +109,7 @@ pub unsafe fn reallocate( ptr.as_ptr() as *mut u8, Layout::from_size_align_unchecked(old_size, ALIGNMENT), new_size, - ) as *mut T; + ); NonNull::new(raw_ptr).unwrap_or_else(|| { handle_alloc_error(Layout::from_size_align_unchecked(new_size, ALIGNMENT)) }) diff --git a/arrow/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs similarity index 99% rename from arrow/src/buffer/immutable.rs rename to arrow-buffer/src/buffer/immutable.rs index 6d4d624efc13..c60d28afc782 100644 --- a/arrow/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -23,7 +23,7 @@ use std::{convert::AsRef, usize}; use crate::alloc::{Allocation, Deallocation}; use crate::util::bit_chunk_iterator::{BitChunks, UnalignedBitChunk}; -use crate::{bytes::Bytes, datatypes::ArrowNativeType}; +use crate::{bytes::Bytes, native::ArrowNativeType}; use super::ops::bitwise_unary_op_helper; use super::MutableBuffer; @@ -271,7 +271,7 @@ impl Buffer { /// Prefer this to `collect` whenever possible, as it is ~60% faster. 
/// # Example /// ``` - /// # use arrow::buffer::Buffer; + /// # use arrow_buffer::buffer::Buffer; /// let v = vec![1u32]; /// let iter = v.iter().map(|x| x * 2); /// let buffer = unsafe { Buffer::from_trusted_len_iter(iter) }; diff --git a/arrow-buffer/src/buffer/mod.rs b/arrow-buffer/src/buffer/mod.rs new file mode 100644 index 000000000000..b9201f774fe0 --- /dev/null +++ b/arrow-buffer/src/buffer/mod.rs @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! This module contains two main structs: [Buffer] and [MutableBuffer]. A buffer represents +//! a contiguous memory region that can be shared via `offsets`. + +mod immutable; +pub use immutable::*; +mod mutable; +pub use mutable::*; +mod ops; +mod scalar; +pub use scalar::*; + +pub use ops::*; diff --git a/arrow/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs similarity index 95% rename from arrow/src/buffer/mutable.rs rename to arrow-buffer/src/buffer/mutable.rs index d1e633993936..80644b63d113 100644 --- a/arrow/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -20,7 +20,7 @@ use crate::alloc::Deallocation; use crate::{ alloc, bytes::Bytes, - datatypes::{ArrowNativeType, ToByteSlice}, + native::{ArrowNativeType, ToByteSlice}, util::bit_util, }; use std::ptr::NonNull; @@ -31,12 +31,12 @@ use std::ptr::NonNull; /// Use [MutableBuffer::push] to insert an item, [MutableBuffer::extend_from_slice] /// to insert many items, and `into` to convert it to [`Buffer`]. /// -/// For a safe, strongly typed API consider using [`crate::array::BufferBuilder`] +/// For a safe, strongly typed API consider using `arrow::array::BufferBuilder` /// /// # Example /// /// ``` -/// # use arrow::buffer::{Buffer, MutableBuffer}; +/// # use arrow_buffer::buffer::{Buffer, MutableBuffer}; /// let mut buffer = MutableBuffer::new(0); /// buffer.push(256u32); /// buffer.extend_from_slice(&[1u32]); @@ -75,7 +75,7 @@ impl MutableBuffer { /// all bytes are guaranteed to be `0u8`. /// # Example /// ``` - /// # use arrow::buffer::{Buffer, MutableBuffer}; + /// # use arrow_buffer::buffer::{Buffer, MutableBuffer}; /// let mut buffer = MutableBuffer::from_len_zeroed(127); /// assert_eq!(buffer.len(), 127); /// assert!(buffer.capacity() >= 127); @@ -131,7 +131,7 @@ impl MutableBuffer { /// `self.len + additional > capacity`. /// # Example /// ``` - /// # use arrow::buffer::{Buffer, MutableBuffer}; + /// # use arrow_buffer::buffer::{Buffer, MutableBuffer}; /// let mut buffer = MutableBuffer::new(0); /// buffer.reserve(253); // allocates for the first time /// (0..253u8).for_each(|i| buffer.push(i)); // no reallocation @@ -171,7 +171,7 @@ impl MutableBuffer { /// growing it (potentially reallocating it) and writing `value` in the newly available bytes. 
/// # Example /// ``` - /// # use arrow::buffer::{Buffer, MutableBuffer}; + /// # use arrow_buffer::buffer::{Buffer, MutableBuffer}; /// let mut buffer = MutableBuffer::new(0); /// buffer.resize(253, 2); // allocates for the first time /// assert_eq!(buffer.as_slice()[252], 2u8); @@ -195,7 +195,7 @@ impl MutableBuffer { /// /// # Example /// ``` - /// # use arrow::buffer::{Buffer, MutableBuffer}; + /// # use arrow_buffer::buffer::{Buffer, MutableBuffer}; /// // 2 cache lines /// let mut buffer = MutableBuffer::new(128); /// assert_eq!(buffer.capacity(), 128); @@ -322,7 +322,7 @@ impl MutableBuffer { /// Extends this buffer from a slice of items that can be represented in bytes, increasing its capacity if needed. /// # Example /// ``` - /// # use arrow::buffer::MutableBuffer; + /// # use arrow_buffer::buffer::MutableBuffer; /// let mut buffer = MutableBuffer::new(0); /// buffer.extend_from_slice(&[2u32, 0]); /// assert_eq!(buffer.len(), 8) // u32 has 4 bytes @@ -346,7 +346,7 @@ impl MutableBuffer { /// Extends the buffer with a new item, increasing its capacity if needed. /// # Example /// ``` - /// # use arrow::buffer::MutableBuffer; + /// # use arrow_buffer::buffer::MutableBuffer; /// let mut buffer = MutableBuffer::new(0); /// buffer.push(256u32); /// assert_eq!(buffer.len(), 4) // u32 has 4 bytes @@ -384,7 +384,7 @@ impl MutableBuffer { /// # Safety /// The caller must ensure that the buffer was properly initialized up to `len`. #[inline] - pub(crate) unsafe fn set_len(&mut self, len: usize) { + pub unsafe fn set_len(&mut self, len: usize) { assert!(len <= self.capacity()); self.len = len; } @@ -394,16 +394,16 @@ impl MutableBuffer { /// This is similar to `from_trusted_len_iter_bool`, however, can be significantly faster /// as it eliminates the conditional `Iterator::next` #[inline] - pub(crate) fn collect_bool bool>(len: usize, mut f: F) -> Self { - let mut buffer = Self::new(bit_util::ceil(len, 8)); + pub fn collect_bool bool>(len: usize, mut f: F) -> Self { + let mut buffer = Self::new(bit_util::ceil(len, 64) * 8); - let chunks = len / 8; - let remainder = len % 8; + let chunks = len / 64; + let remainder = len % 64; for chunk in 0..chunks { let mut packed = 0; - for bit_idx in 0..8 { - let i = bit_idx + chunk * 8; - packed |= (f(i) as u8) << bit_idx; + for bit_idx in 0..64 { + let i = bit_idx + chunk * 64; + packed |= (f(i) as u64) << bit_idx; } // SAFETY: Already allocated sufficient capacity @@ -413,14 +413,15 @@ impl MutableBuffer { if remainder != 0 { let mut packed = 0; for bit_idx in 0..remainder { - let i = bit_idx + chunks * 8; - packed |= (f(i) as u8) << bit_idx; + let i = bit_idx + chunks * 64; + packed |= (f(i) as u64) << bit_idx; } // SAFETY: Already allocated sufficient capacity unsafe { buffer.push_unchecked(packed) } } + buffer.truncate(bit_util::ceil(len, 8)); buffer } } @@ -484,7 +485,7 @@ impl MutableBuffer { /// Prefer this to `collect` whenever possible, as it is faster ~60% faster. /// # Example /// ``` - /// # use arrow::buffer::MutableBuffer; + /// # use arrow_buffer::buffer::MutableBuffer; /// let v = vec![1u32]; /// let iter = v.iter().map(|x| x * 2); /// let buffer = unsafe { MutableBuffer::from_trusted_len_iter(iter) }; @@ -525,10 +526,10 @@ impl MutableBuffer { } /// Creates a [`MutableBuffer`] from a boolean [`Iterator`] with a trusted (upper) length. 
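The rewritten `collect_bool` above now packs 64 predicate results into each word rather than 8, then truncates the buffer to the byte length actually required. A standalone sketch of the same packing strategy, with no arrow types involved:

```rust
/// Pack `len` results of `f` into 64-bit words, least-significant bit first,
/// mirroring the chunk/remainder split used by `collect_bool`.
fn pack_bits(len: usize, f: impl Fn(usize) -> bool) -> Vec<u64> {
    let mut out = Vec::with_capacity((len + 63) / 64);
    for chunk in 0..len / 64 {
        let mut packed = 0u64;
        for bit in 0..64 {
            packed |= (f(chunk * 64 + bit) as u64) << bit;
        }
        out.push(packed);
    }
    let remainder = len % 64;
    if remainder != 0 {
        let mut packed = 0u64;
        for bit in 0..remainder {
            packed |= (f((len / 64) * 64 + bit) as u64) << bit;
        }
        out.push(packed);
    }
    out
}

fn main() {
    // 70 values -> one full word plus a 6-bit remainder word.
    let words = pack_bits(70, |i| i % 3 == 0);
    assert_eq!(words.len(), 2);
    assert_eq!(words[0] & 1, 1); // bit 0 is set because 0 % 3 == 0
}
```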
- /// # use arrow::buffer::MutableBuffer; + /// # use arrow_buffer::buffer::MutableBuffer; /// # Example /// ``` - /// # use arrow::buffer::MutableBuffer; + /// # use arrow_buffer::buffer::MutableBuffer; /// let v = vec![false, true, false]; /// let iter = v.iter().map(|x| *x || true); /// let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(iter) }; diff --git a/arrow/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs similarity index 89% rename from arrow/src/buffer/ops.rs rename to arrow-buffer/src/buffer/ops.rs index 7000f39767cb..c1295ad9ab7e 100644 --- a/arrow/src/buffer/ops.rs +++ b/arrow-buffer/src/buffer/ops.rs @@ -20,26 +20,19 @@ use crate::util::bit_util::ceil; /// Apply a bitwise operation `op` to four inputs and return the result as a Buffer. /// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. -#[allow(clippy::too_many_arguments)] -pub(crate) fn bitwise_quaternary_op_helper( - first: &Buffer, - first_offset_in_bits: usize, - second: &Buffer, - second_offset_in_bits: usize, - third: &Buffer, - third_offset_in_bits: usize, - fourth: &Buffer, - fourth_offset_in_bits: usize, +pub fn bitwise_quaternary_op_helper( + buffers: [&Buffer; 4], + offsets: [usize; 4], len_in_bits: usize, op: F, ) -> Buffer where F: Fn(u64, u64, u64, u64) -> u64, { - let first_chunks = first.bit_chunks(first_offset_in_bits, len_in_bits); - let second_chunks = second.bit_chunks(second_offset_in_bits, len_in_bits); - let third_chunks = third.bit_chunks(third_offset_in_bits, len_in_bits); - let fourth_chunks = fourth.bit_chunks(fourth_offset_in_bits, len_in_bits); + let first_chunks = buffers[0].bit_chunks(offsets[0], len_in_bits); + let second_chunks = buffers[1].bit_chunks(offsets[1], len_in_bits); + let third_chunks = buffers[2].bit_chunks(offsets[2], len_in_bits); + let fourth_chunks = buffers[3].bit_chunks(offsets[3], len_in_bits); let chunks = first_chunks .iter() diff --git a/arrow/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs similarity index 99% rename from arrow/src/buffer/scalar.rs rename to arrow-buffer/src/buffer/scalar.rs index 7d663cd2bf96..a9f2df3d9ff3 100644 --- a/arrow/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -16,7 +16,7 @@ // under the License. use crate::buffer::Buffer; -use crate::datatypes::ArrowNativeType; +use crate::native::ArrowNativeType; use std::ops::Deref; /// Provides a safe API for interpreting a [`Buffer`] as a slice of [`ArrowNativeType`] diff --git a/arrow/src/bytes.rs b/arrow-buffer/src/bytes.rs similarity index 98% rename from arrow/src/bytes.rs rename to arrow-buffer/src/bytes.rs index 75137a55295b..20bf5a474b47 100644 --- a/arrow/src/bytes.rs +++ b/arrow-buffer/src/bytes.rs @@ -111,7 +111,7 @@ impl Drop for Bytes { fn drop(&mut self) { match &self.deallocation { Deallocation::Arrow(capacity) => { - unsafe { alloc::free_aligned::(self.ptr, *capacity) }; + unsafe { alloc::free_aligned(self.ptr, *capacity) }; } // The automatic drop implementation will free the memory once the reference count reaches zero Deallocation::Custom(_allocation) => (), diff --git a/arrow-buffer/src/lib.rs b/arrow-buffer/src/lib.rs new file mode 100644 index 000000000000..a8aca7c3dad2 --- /dev/null +++ b/arrow-buffer/src/lib.rs @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Buffer abstractions for [Apache Arrow](https://docs.rs/arrow) + +pub mod alloc; +pub mod buffer; +mod bytes; +pub mod native; +pub mod util; diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs new file mode 100644 index 000000000000..d8431953c430 --- /dev/null +++ b/arrow-buffer/src/native.rs @@ -0,0 +1,303 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use half::f16; + +mod private { + pub trait Sealed {} +} + +/// Trait expressing a Rust type that has the same in-memory representation +/// as Arrow. This includes `i16`, `f32`, but excludes `bool` (which in arrow is represented in bits). +/// +/// In little endian machines, types that implement [`ArrowNativeType`] can be memcopied to arrow buffers +/// as is. +/// +/// # Transmute Safety +/// +/// A type T implementing this trait means that any arbitrary slice of bytes of length and +/// alignment `size_of::()` can be safely interpreted as a value of that type without +/// being unsound, i.e. potentially resulting in undefined behaviour. +/// +/// Note: in the case of floating point numbers this transmutation can result in a signalling +/// NaN, which, whilst sound, can be unwieldy. In general, whilst it is perfectly sound to +/// reinterpret bytes as different types using this trait, it is likely unwise. For more information +/// see [f32::from_bits] and [f64::from_bits]. +/// +/// Note: `bool` is restricted to `0` or `1`, and so `bool: !ArrowNativeType` +/// +/// # Sealed +/// +/// Due to the above restrictions, this trait is sealed to prevent accidental misuse +pub trait ArrowNativeType: + std::fmt::Debug + + Send + + Sync + + Copy + + PartialOrd + + std::str::FromStr + + Default + + private::Sealed + + 'static +{ + /// Convert native type from usize. + #[inline] + fn from_usize(_: usize) -> Option { + None + } + + /// Convert native type to usize. + #[inline] + fn to_usize(&self) -> Option { + None + } + + /// Convert native type to isize. + #[inline] + fn to_isize(&self) -> Option { + None + } + + /// Convert native type from i32. + #[inline] + fn from_i32(_: i32) -> Option { + None + } + + /// Convert native type from i64. 
+ #[inline] + fn from_i64(_: i64) -> Option { + None + } + + /// Convert native type from i128. + #[inline] + fn from_i128(_: i128) -> Option { + None + } +} + +impl private::Sealed for i8 {} +impl ArrowNativeType for i8 { + #[inline] + fn from_usize(v: usize) -> Option { + num::FromPrimitive::from_usize(v) + } + + #[inline] + fn to_usize(&self) -> Option { + num::ToPrimitive::to_usize(self) + } + + #[inline] + fn to_isize(&self) -> Option { + num::ToPrimitive::to_isize(self) + } +} + +impl private::Sealed for i16 {} +impl ArrowNativeType for i16 { + #[inline] + fn from_usize(v: usize) -> Option { + num::FromPrimitive::from_usize(v) + } + + #[inline] + fn to_usize(&self) -> Option { + num::ToPrimitive::to_usize(self) + } + + #[inline] + fn to_isize(&self) -> Option { + num::ToPrimitive::to_isize(self) + } +} + +impl private::Sealed for i32 {} +impl ArrowNativeType for i32 { + #[inline] + fn from_usize(v: usize) -> Option { + num::FromPrimitive::from_usize(v) + } + + #[inline] + fn to_usize(&self) -> Option { + num::ToPrimitive::to_usize(self) + } + + #[inline] + fn to_isize(&self) -> Option { + num::ToPrimitive::to_isize(self) + } + + /// Convert native type from i32. + #[inline] + fn from_i32(val: i32) -> Option { + Some(val) + } +} + +impl private::Sealed for i64 {} +impl ArrowNativeType for i64 { + #[inline] + fn from_usize(v: usize) -> Option { + num::FromPrimitive::from_usize(v) + } + + #[inline] + fn to_usize(&self) -> Option { + num::ToPrimitive::to_usize(self) + } + + #[inline] + fn to_isize(&self) -> Option { + num::ToPrimitive::to_isize(self) + } + + /// Convert native type from i64. + #[inline] + fn from_i64(val: i64) -> Option { + Some(val) + } +} + +impl private::Sealed for i128 {} +impl ArrowNativeType for i128 { + #[inline] + fn from_usize(v: usize) -> Option { + num::FromPrimitive::from_usize(v) + } + + #[inline] + fn to_usize(&self) -> Option { + num::ToPrimitive::to_usize(self) + } + + #[inline] + fn to_isize(&self) -> Option { + num::ToPrimitive::to_isize(self) + } + + /// Convert native type from i128. 
+ #[inline] + fn from_i128(val: i128) -> Option { + Some(val) + } +} + +impl private::Sealed for u8 {} +impl ArrowNativeType for u8 { + #[inline] + fn from_usize(v: usize) -> Option { + num::FromPrimitive::from_usize(v) + } + + #[inline] + fn to_usize(&self) -> Option { + num::ToPrimitive::to_usize(self) + } + + #[inline] + fn to_isize(&self) -> Option { + num::ToPrimitive::to_isize(self) + } +} + +impl private::Sealed for u16 {} +impl ArrowNativeType for u16 { + #[inline] + fn from_usize(v: usize) -> Option { + num::FromPrimitive::from_usize(v) + } + + #[inline] + fn to_usize(&self) -> Option { + num::ToPrimitive::to_usize(self) + } + + #[inline] + fn to_isize(&self) -> Option { + num::ToPrimitive::to_isize(self) + } +} + +impl private::Sealed for u32 {} +impl ArrowNativeType for u32 { + #[inline] + fn from_usize(v: usize) -> Option { + num::FromPrimitive::from_usize(v) + } + + #[inline] + fn to_usize(&self) -> Option { + num::ToPrimitive::to_usize(self) + } + + #[inline] + fn to_isize(&self) -> Option { + num::ToPrimitive::to_isize(self) + } +} + +impl private::Sealed for u64 {} +impl ArrowNativeType for u64 { + #[inline] + fn from_usize(v: usize) -> Option { + num::FromPrimitive::from_usize(v) + } + + #[inline] + fn to_usize(&self) -> Option { + num::ToPrimitive::to_usize(self) + } + + #[inline] + fn to_isize(&self) -> Option { + num::ToPrimitive::to_isize(self) + } +} + +impl ArrowNativeType for f16 {} +impl private::Sealed for f16 {} +impl ArrowNativeType for f32 {} +impl private::Sealed for f32 {} +impl ArrowNativeType for f64 {} +impl private::Sealed for f64 {} + +/// Allows conversion from supported Arrow types to a byte slice. +pub trait ToByteSlice { + /// Converts this instance into a byte slice + fn to_byte_slice(&self) -> &[u8]; +} + +impl ToByteSlice for [T] { + #[inline] + fn to_byte_slice(&self) -> &[u8] { + let raw_ptr = self.as_ptr() as *const T as *const u8; + unsafe { + std::slice::from_raw_parts(raw_ptr, self.len() * std::mem::size_of::()) + } + } +} + +impl ToByteSlice for T { + #[inline] + fn to_byte_slice(&self) -> &[u8] { + let raw_ptr = self as *const T as *const u8; + unsafe { std::slice::from_raw_parts(raw_ptr, std::mem::size_of::()) } + } +} diff --git a/arrow/src/util/bit_chunk_iterator.rs b/arrow-buffer/src/util/bit_chunk_iterator.rs similarity index 99% rename from arrow/src/util/bit_chunk_iterator.rs rename to arrow-buffer/src/util/bit_chunk_iterator.rs index f0127ed2267f..ba028204da10 100644 --- a/arrow/src/util/bit_chunk_iterator.rs +++ b/arrow-buffer/src/util/bit_chunk_iterator.rs @@ -153,7 +153,7 @@ impl<'a> UnalignedBitChunk<'a> { self.chunks } - pub(crate) fn iter(&self) -> UnalignedBitChunkIterator<'a> { + pub fn iter(&self) -> UnalignedBitChunkIterator<'a> { self.prefix .into_iter() .chain(self.chunks.iter().cloned()) @@ -166,7 +166,7 @@ impl<'a> UnalignedBitChunk<'a> { } } -pub(crate) type UnalignedBitChunkIterator<'a> = std::iter::Chain< +pub type UnalignedBitChunkIterator<'a> = std::iter::Chain< std::iter::Chain< std::option::IntoIter, std::iter::Cloned>, diff --git a/arrow/src/util/bit_util.rs b/arrow-buffer/src/util/bit_util.rs similarity index 100% rename from arrow/src/util/bit_util.rs rename to arrow-buffer/src/util/bit_util.rs diff --git a/arrow-buffer/src/util/mod.rs b/arrow-buffer/src/util/mod.rs new file mode 100644 index 000000000000..c1cb284dcc1f --- /dev/null +++ b/arrow-buffer/src/util/mod.rs @@ -0,0 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub mod bit_chunk_iterator; +pub mod bit_util; diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 2de4db64276f..c66cef612029 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -44,6 +44,8 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] +arrow-buffer = { path = "../arrow-buffer", version = "22.0.0" } + serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } indexmap = { version = "1.9", default-features = false, features = ["std"] } diff --git a/arrow/benches/buffer_bit_ops.rs b/arrow/benches/buffer_bit_ops.rs index 6c6bb0463b28..68b22df4b134 100644 --- a/arrow/benches/buffer_bit_ops.rs +++ b/arrow/benches/buffer_bit_ops.rs @@ -38,15 +38,15 @@ fn create_buffer(size: usize) -> Buffer { } fn bench_buffer_and(left: &Buffer, right: &Buffer) { - criterion::black_box((left & right).unwrap()); + criterion::black_box(buffer_bin_and(left, 0, right, 0, left.len() * 8)); } fn bench_buffer_or(left: &Buffer, right: &Buffer) { - criterion::black_box((left | right).unwrap()); + criterion::black_box(buffer_bin_or(left, 0, right, 0, left.len() * 8)); } fn bench_buffer_not(buffer: &Buffer) { - criterion::black_box(!buffer); + criterion::black_box(buffer_unary_not(buffer, 0, buffer.len() * 8)); } fn bench_buffer_and_with_offsets( diff --git a/arrow/src/alloc/types.rs b/arrow/src/alloc/types.rs deleted file mode 100644 index 026e1241f46b..000000000000 --- a/arrow/src/alloc/types.rs +++ /dev/null @@ -1,73 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::datatypes::DataType; -use half::f16; - -/// A type that Rust's custom allocator knows how to allocate and deallocate. -/// This is implemented for all Arrow's physical types whose in-memory representation -/// matches Rust's physical types. Consider this trait sealed. -/// # Safety -/// Do not implement this trait. 
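The `ArrowNativeType` trait that replaces this deleted `NativeType` (see `arrow-buffer/src/native.rs` above) enforces what the old comment could only request, by bounding the trait on a crate-private `Sealed` trait. A minimal sketch of that sealing pattern, using hypothetical names:

```rust
mod private {
    // Unreachable from other crates, so no external impls are possible.
    pub trait Sealed {}
}

/// Implementable only by types this crate has blessed via `private::Sealed`.
pub trait NativeLike: private::Sealed {
    fn byte_width(&self) -> usize;
}

impl private::Sealed for i32 {}
impl NativeLike for i32 {
    fn byte_width(&self) -> usize {
        4
    }
}

fn main() {
    assert_eq!(3i32.byte_width(), 4);
}
```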
-pub unsafe trait NativeType: - Sized + Copy + std::fmt::Debug + std::fmt::Display + PartialEq + Default + Sized + 'static -{ - type Bytes: AsRef<[u8]>; - - /// Whether a DataType is a valid type for this physical representation. - fn is_valid(data_type: &DataType) -> bool; - - /// How this type represents itself as bytes in little endianess. - /// This is used for IPC, where data is communicated with a specific endianess. - fn to_le_bytes(&self) -> Self::Bytes; -} - -macro_rules! create_native { - ($native_ty:ty,$($impl_pattern:pat_param)|+) => { - unsafe impl NativeType for $native_ty { - type Bytes = [u8; std::mem::size_of::()]; - - #[inline] - fn to_le_bytes(&self) -> Self::Bytes { - Self::to_le_bytes(*self) - } - - #[inline] - fn is_valid(data_type: &DataType) -> bool { - matches!(data_type, $($impl_pattern)|+) - } - } - }; -} - -create_native!(u8, DataType::UInt8); -create_native!(u16, DataType::UInt16); -create_native!(u32, DataType::UInt32); -create_native!(u64, DataType::UInt64); -create_native!(i8, DataType::Int8); -create_native!(i16, DataType::Int16); -create_native!( - i32, - DataType::Int32 | DataType::Date32 | DataType::Time32(_) -); -create_native!( - i64, - DataType::Int64 | DataType::Date64 | DataType::Time64(_) | DataType::Timestamp(_, _) -); -create_native!(f16, DataType::Float16); -create_native!(f32, DataType::Float32); -create_native!(f64, DataType::Float64); diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs index b9c05014c3f7..e830acdc2b92 100644 --- a/arrow/src/array/array_list.rs +++ b/arrow/src/array/array_list.rs @@ -844,7 +844,7 @@ mod tests { #[test] #[should_panic(expected = "memory is not aligned")] fn test_primitive_array_alignment() { - let ptr = alloc::allocate_aligned::(8); + let ptr = alloc::allocate_aligned(8); let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; let buf2 = buf.slice(1); let array_data = ArrayData::builder(DataType::Int32) @@ -860,7 +860,7 @@ mod tests { // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] fn test_list_array_alignment() { - let ptr = alloc::allocate_aligned::(8); + let ptr = alloc::allocate_aligned(8); let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; let buf2 = buf.slice(1); diff --git a/arrow/src/bitmap.rs b/arrow/src/bitmap.rs index 4ba1bb9f8882..4491da4632b4 100644 --- a/arrow/src/bitmap.rs +++ b/arrow/src/bitmap.rs @@ -18,10 +18,11 @@ //! 
Defines [Bitmap] for tracking validity bitmaps use crate::buffer::Buffer; -use crate::error::Result; +use crate::error::{ArrowError, Result}; use crate::util::bit_util; use std::mem; +use arrow_buffer::buffer::{buffer_bin_and, buffer_bin_or}; use std::ops::{BitAnd, BitOr}; #[derive(Debug, Clone)] @@ -79,7 +80,18 @@ impl<'a, 'b> BitAnd<&'b Bitmap> for &'a Bitmap { type Output = Result; fn bitand(self, rhs: &'b Bitmap) -> Result { - Ok(Bitmap::from((&self.bits & &rhs.bits)?)) + if self.bits.len() != rhs.bits.len() { + return Err(ArrowError::ComputeError( + "Buffers must be the same size to apply Bitwise AND.".to_string(), + )); + } + Ok(Bitmap::from(buffer_bin_and( + &self.bits, + 0, + &rhs.bits, + 0, + self.bit_len(), + ))) } } @@ -87,7 +99,18 @@ impl<'a, 'b> BitOr<&'b Bitmap> for &'a Bitmap { type Output = Result; fn bitor(self, rhs: &'b Bitmap) -> Result { - Ok(Bitmap::from((&self.bits | &rhs.bits)?)) + if self.bits.len() != rhs.bits.len() { + return Err(ArrowError::ComputeError( + "Buffers must be the same size to apply Bitwise OR.".to_string(), + )); + } + Ok(Bitmap::from(buffer_bin_or( + &self.bits, + 0, + &rhs.bits, + 0, + self.bit_len(), + ))) } } diff --git a/arrow/src/buffer/mod.rs b/arrow/src/buffer/mod.rs deleted file mode 100644 index b392b0583d6d..000000000000 --- a/arrow/src/buffer/mod.rs +++ /dev/null @@ -1,72 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! This module contains two main structs: [Buffer] and [MutableBuffer]. A buffer represents -//! a contiguous memory region that can be shared via `offsets`. 
- -mod immutable; -pub use immutable::*; -mod mutable; -pub use mutable::*; -mod ops; -mod scalar; -pub use scalar::*; - -pub use ops::*; - -use crate::error::{ArrowError, Result}; -use std::ops::{BitAnd, BitOr, Not}; - -impl<'a, 'b> BitAnd<&'b Buffer> for &'a Buffer { - type Output = Result; - - fn bitand(self, rhs: &'b Buffer) -> Result { - if self.len() != rhs.len() { - return Err(ArrowError::ComputeError( - "Buffers must be the same size to apply Bitwise AND.".to_string(), - )); - } - - let len_in_bits = self.len() * 8; - Ok(buffer_bin_and(self, 0, rhs, 0, len_in_bits)) - } -} - -impl<'a, 'b> BitOr<&'b Buffer> for &'a Buffer { - type Output = Result; - - fn bitor(self, rhs: &'b Buffer) -> Result { - if self.len() != rhs.len() { - return Err(ArrowError::ComputeError( - "Buffers must be the same size to apply Bitwise OR.".to_string(), - )); - } - - let len_in_bits = self.len() * 8; - - Ok(buffer_bin_or(self, 0, rhs, 0, len_in_bits)) - } -} - -impl Not for &Buffer { - type Output = Buffer; - - fn not(self) -> Buffer { - let len_in_bits = self.len() * 8; - buffer_unary_not(self, 0, len_in_bits) - } -} diff --git a/arrow/src/compute/kernels/boolean.rs b/arrow/src/compute/kernels/boolean.rs index c51953a7540c..b8719ad2d6c7 100644 --- a/arrow/src/compute/kernels/boolean.rs +++ b/arrow/src/compute/kernels/boolean.rs @@ -22,8 +22,6 @@ //! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. -use std::ops::Not; - use crate::array::{Array, ArrayData, BooleanArray, PrimitiveArray}; use crate::buffer::{ bitwise_bin_op_helper, bitwise_quaternary_op_helper, buffer_bin_and, buffer_bin_or, @@ -85,14 +83,13 @@ pub(crate) fn build_null_buffer_for_and_kleene( // The final null bits are: // (a | (c & !d)) & (c | (a & !b)) Some(bitwise_quaternary_op_helper( - left_null_buffer, - left_offset, - left_buffer, - left_offset, - right_null_buffer, - right_offset, - right_buffer, - right_offset, + [ + left_null_buffer, + left_buffer, + right_null_buffer, + right_buffer, + ], + [left_offset, left_offset, right_offset, right_offset], len_in_bits, |a, b, c, d| (a | (c & !d)) & (c | (a & !b)), )) @@ -163,14 +160,13 @@ pub(crate) fn build_null_buffer_for_or_kleene( // The final null bits are: // (a | (c & d)) & (c | (a & b)) Some(bitwise_quaternary_op_helper( - left_null_buffer, - left_offset, - left_buffer, - left_offset, - right_null_buffer, - right_offset, - right_buffer, - right_offset, + [ + left_null_buffer, + left_buffer, + right_null_buffer, + right_buffer, + ], + [left_offset, left_offset, right_offset, right_offset], len_in_bits, |a, b, c, d| (a | (c & d)) & (c | (a & b)), )) @@ -493,7 +489,6 @@ where )); } let left_data = left.data(); - let right_data = right.data(); // If left has no bitmap, create a new one with all values set for nullity op later // left=0 (null) right=null output bitmap=null @@ -507,33 +502,31 @@ where // // Do the right expression !(right_values & right_bitmap) first since there are two steps // TRICK: convert BooleanArray buffer as a bitmap for faster operation - let right_combo_buffer = match right.data().null_bitmap() { + let rcb = match right.data().null_bitmap() { Some(right_bitmap) => { - // NOTE: right values and bitmaps are combined and stay at bit offset right.offset() - (right.values() & &right_bitmap.bits).ok().map(|b| b.not()) + let and = buffer_bin_and( + right.values(), + right.offset(), + &right_bitmap.bits, + right.offset(), + right.len(), + ); + buffer_unary_not(&and, 0, 
right.len()) } - None => Some(!right.values()), + None => buffer_unary_not(right.values(), right.offset(), right.len()), }; // AND of original left null bitmap with right expression // Here we take care of the possible offsets of the left and right arrays all at once. let modified_null_buffer = match left_data.null_bitmap() { - Some(left_null_bitmap) => match right_combo_buffer { - Some(rcb) => Some(buffer_bin_and( - &left_null_bitmap.bits, - left_data.offset(), - &rcb, - right_data.offset(), - left_data.len(), - )), - None => Some( - left_null_bitmap - .bits - .bit_slice(left_data.offset(), left.len()), - ), - }, - None => right_combo_buffer - .map(|rcb| rcb.bit_slice(right_data.offset(), right_data.len())), + Some(left_null_bitmap) => buffer_bin_and( + &left_null_bitmap.bits, + left_data.offset(), + &rcb, + 0, + left_data.len(), + ), + None => rcb, }; // Align/shift left data on offset as needed, since new bitmaps are shifted and aligned to 0 already @@ -556,7 +549,7 @@ where T::DATA_TYPE, left.len(), None, // force new to compute the number of null bits - modified_null_buffer, + Some(modified_null_buffer), 0, // No need for offset since left data has been shifted data_buffers, left_data.child_data().to_vec(), diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 444f2b27dce6..8c329a066e5c 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -16,82 +16,9 @@ // under the License. use super::DataType; +pub use arrow_buffer::native::{ArrowNativeType, ToByteSlice}; use half::f16; -mod private { - pub trait Sealed {} -} - -/// Trait expressing a Rust type that has the same in-memory representation -/// as Arrow. This includes `i16`, `f32`, but excludes `bool` (which in arrow is represented in bits). -/// -/// In little endian machines, types that implement [`ArrowNativeType`] can be memcopied to arrow buffers -/// as is. -/// -/// # Transmute Safety -/// -/// A type T implementing this trait means that any arbitrary slice of bytes of length and -/// alignment `size_of::()` can be safely interpreted as a value of that type without -/// being unsound, i.e. potentially resulting in undefined behaviour. -/// -/// Note: in the case of floating point numbers this transmutation can result in a signalling -/// NaN, which, whilst sound, can be unwieldy. In general, whilst it is perfectly sound to -/// reinterpret bytes as different types using this trait, it is likely unwise. For more information -/// see [f32::from_bits] and [f64::from_bits]. -/// -/// Note: `bool` is restricted to `0` or `1`, and so `bool: !ArrowNativeType` -/// -/// # Sealed -/// -/// Due to the above restrictions, this trait is sealed to prevent accidental misuse -pub trait ArrowNativeType: - std::fmt::Debug - + Send - + Sync - + Copy - + PartialOrd - + std::str::FromStr - + Default - + private::Sealed - + 'static -{ - /// Convert native type from usize. - #[inline] - fn from_usize(_: usize) -> Option { - None - } - - /// Convert native type to usize. - #[inline] - fn to_usize(&self) -> Option { - None - } - - /// Convert native type to isize. - #[inline] - fn to_isize(&self) -> Option { - None - } - - /// Convert native type from i32. - #[inline] - fn from_i32(_: i32) -> Option { - None - } - - /// Convert native type from i64. - #[inline] - fn from_i64(_: i64) -> Option { - None - } - - /// Convert native type from i128. 
- #[inline] - fn from_i128(_: i128) -> Option { - None - } -} - /// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the /// static-typed nature of rust types ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. pub trait ArrowPrimitiveType: 'static { @@ -219,214 +146,3 @@ native_type_op!(u64); impl native_op::ArrowNativeTypeOp for f16 {} impl native_op::ArrowNativeTypeOp for f32 {} impl native_op::ArrowNativeTypeOp for f64 {} - -impl private::Sealed for i8 {} -impl ArrowNativeType for i8 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl private::Sealed for i16 {} -impl ArrowNativeType for i16 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl private::Sealed for i32 {} -impl ArrowNativeType for i32 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } - - /// Convert native type from i32. - #[inline] - fn from_i32(val: i32) -> Option { - Some(val) - } -} - -impl private::Sealed for i64 {} -impl ArrowNativeType for i64 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } - - /// Convert native type from i64. - #[inline] - fn from_i64(val: i64) -> Option { - Some(val) - } -} - -impl private::Sealed for i128 {} -impl ArrowNativeType for i128 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } - - /// Convert native type from i128. 
- #[inline] - fn from_i128(val: i128) -> Option { - Some(val) - } -} - -impl private::Sealed for u8 {} -impl ArrowNativeType for u8 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl private::Sealed for u16 {} -impl ArrowNativeType for u16 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl private::Sealed for u32 {} -impl ArrowNativeType for u32 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl private::Sealed for u64 {} -impl ArrowNativeType for u64 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl ArrowNativeType for f16 {} -impl private::Sealed for f16 {} -impl ArrowNativeType for f32 {} -impl private::Sealed for f32 {} -impl ArrowNativeType for f64 {} -impl private::Sealed for f64 {} - -/// Allows conversion from supported Arrow types to a byte slice. -pub trait ToByteSlice { - /// Converts this instance into a byte slice - fn to_byte_slice(&self) -> &[u8]; -} - -impl ToByteSlice for [T] { - #[inline] - fn to_byte_slice(&self) -> &[u8] { - let raw_ptr = self.as_ptr() as *const T as *const u8; - unsafe { - std::slice::from_raw_parts(raw_ptr, self.len() * std::mem::size_of::()) - } - } -} - -impl ToByteSlice for T { - #[inline] - fn to_byte_slice(&self) -> &[u8] { - let raw_ptr = self as *const T as *const u8; - unsafe { std::slice::from_raw_parts(raw_ptr, std::mem::size_of::()) } - } -} diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 87a4799e3e2a..a4d864754cd5 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -248,11 +248,10 @@ #![deny(clippy::redundant_clone)] #![warn(missing_debug_implementations)] -pub mod alloc; +pub use arrow_buffer::{alloc, buffer}; + pub mod array; pub mod bitmap; -pub mod buffer; -mod bytes; pub mod compute; #[cfg(feature = "csv")] pub mod csv; diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index 6f68398e7703..5453c11ab8a6 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -15,12 +15,12 @@ // specific language governing permissions and limitations // under the License. +pub use arrow_buffer::util::{bit_chunk_iterator, bit_util}; + #[cfg(feature = "test_utils")] pub mod bench_util; -pub mod bit_chunk_iterator; pub mod bit_iterator; pub(crate) mod bit_mask; -pub mod bit_util; #[cfg(feature = "test_utils")] pub mod data_gen; pub mod display; From 5f441eedff2b7621c46aded8b1caf3b665b8e8a9 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Thu, 15 Sep 2022 17:22:53 +0100 Subject: [PATCH 0049/1411] Fix multipart uploads on Minio (#2731) The official Minio SDK uses "uploads=" as the URL when it initiates a multipart upload instead of "uploads". 
This affects the AWSV4 signature and causes object_store to fail a signature check when initiating the upload to Minio. It's possible that this contradicts the AWS S3 API docs: https://docs.aws.amazon.com/AmazonS3/latest/API/API_CreateMultipartUpload.html#API_CreateMultipartUpload_RequestSyntax and we need to instead keep the URL as `?uploads` and change the URL that goes into the signature instead. --- object_store/src/aws/client.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index d8ab3bba8f20..f800fec3dc5d 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -411,7 +411,7 @@ impl S3Client { pub async fn create_multipart(&self, location: &Path) -> Result { let credential = self.get_credential().await?; let url = format!( - "{}/{}/{}?uploads", + "{}/{}/{}?uploads=", self.config.endpoint, self.config.bucket, encode_path(location) From a7a93295bd4a143d55fa31a1c6ac92045d73dc05 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 15 Sep 2022 17:23:22 +0100 Subject: [PATCH 0050/1411] Update read parquet example in parquet/arrow home (#2730) * Update example to read parquet * Remove outdated comment --- parquet/src/arrow/mod.rs | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index c0de656bf9c5..c5fe0fa2a627 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -66,26 +66,23 @@ //! # Example of reading parquet file into arrow record batch //! //! ```rust -//! use arrow::record_batch::RecordBatchReader; -//! use parquet::file::reader::{FileReader, SerializedFileReader}; -//! use parquet::arrow::{ParquetFileArrowReader, ArrowReader, ProjectionMask}; -//! use std::sync::Arc; //! use std::fs::File; +//! use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; //! +//! # use std::sync::Arc; //! # use arrow::array::Int32Array; //! # use arrow::datatypes::{DataType, Field, Schema}; //! # use arrow::record_batch::RecordBatch; //! # use parquet::arrow::arrow_writer::ArrowWriter; +//! # //! # let ids = Int32Array::from(vec![1, 2, 3, 4]); //! # let schema = Arc::new(Schema::new(vec![ -//! # Field::new("id", DataType::Int32, false), +//! # Field::new("id", DataType::Int32, false), //! # ])); //! # -//! # // Write to a memory buffer (can also write to a File) //! # let file = File::create("data.parquet").unwrap(); //! # -//! # let batch = -//! # RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(ids)]).unwrap(); +//! # let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(ids)]).unwrap(); //! # let batches = vec![batch]; //! # //! # let mut writer = ArrowWriter::try_new(file, Arc::clone(&schema), None).unwrap(); @@ -97,26 +94,14 @@ //! //! let file = File::open("data.parquet").unwrap(); //! -//! let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); -//! let mask = ProjectionMask::leaves(arrow_reader.parquet_schema(), [0]); -//! -//! println!("Converted arrow schema is: {}", arrow_reader.get_schema().unwrap()); -//! println!("Arrow schema after projection is: {}", -//! arrow_reader.get_schema_by_columns(mask.clone()).unwrap()); +//! let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); +//! println!("Converted arrow schema is: {}", builder.schema()); //! -//! let mut unprojected = arrow_reader.get_record_reader(2048).unwrap(); -//! println!("Unprojected reader schema: {}", unprojected.schema()); +//! 
let mut reader = builder.build().unwrap(); //! -//! let mut record_batch_reader = arrow_reader.get_record_reader_by_columns(mask, 2048).unwrap(); +//! let record_batch = reader.next().unwrap().unwrap(); //! -//! for maybe_record_batch in record_batch_reader { -//! let record_batch = maybe_record_batch.unwrap(); -//! if record_batch.num_rows() > 0 { -//! println!("Read {} records.", record_batch.num_rows()); -//! } else { -//! println!("End of file!"); -//! } -//!} +//! println!("Read {} records.", record_batch.num_rows()); //! ``` experimental!(mod array_reader); From eb9b456fdde92d4ca12c7573fb38faf6e6657fc3 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Fri, 16 Sep 2022 03:12:06 +0800 Subject: [PATCH 0051/1411] benchmark: bitwise operation (#2718) * add benchmark for bitwise operation * add bench for bitwise or xor not --- arrow/Cargo.toml | 5 ++ arrow/benches/bitwise_kernel.rs | 121 ++++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 arrow/benches/bitwise_kernel.rs diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index c66cef612029..e52940b4fc4c 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -240,3 +240,8 @@ harness = false name = "row_format" harness = false required-features = ["test_utils"] + +[[bench]] +name = "bitwise_kernel" +harness = false +required-features = ["test_utils"] diff --git a/arrow/benches/bitwise_kernel.rs b/arrow/benches/bitwise_kernel.rs new file mode 100644 index 000000000000..741eb96125a2 --- /dev/null +++ b/arrow/benches/bitwise_kernel.rs @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#[macro_use] +extern crate criterion; + +use arrow::compute::kernels::bitwise::{ + bitwise_and, bitwise_and_scalar, bitwise_not, bitwise_or, bitwise_or_scalar, + bitwise_xor, bitwise_xor_scalar, +}; +use arrow::datatypes::Int64Type; +use criterion::{black_box, Criterion}; +use rand::RngCore; + +extern crate arrow; + +use arrow::util::bench_util::create_primitive_array; +use arrow::util::test_util::seedable_rng; + +fn bitwise_array_benchmark(c: &mut Criterion) { + let size = 64 * 1024_usize; + let left_without_null = create_primitive_array::(size, 0 as f32); + let right_without_null = create_primitive_array::(size, 0 as f32); + let left_with_null = create_primitive_array::(size, 0.2_f32); + let right_with_null = create_primitive_array::(size, 0.2_f32); + // array and + let mut group = c.benchmark_group("bench bitwise array: and"); + group.bench_function("bitwise array and, no nulls", |b| { + b.iter(|| { + black_box(bitwise_and(&left_without_null, &right_without_null).unwrap()) + }) + }); + group.bench_function("bitwise array and, 20% nulls", |b| { + b.iter(|| black_box(bitwise_and(&left_with_null, &right_with_null).unwrap())) + }); + group.finish(); + // array or + let mut group = c.benchmark_group("bench bitwise: or"); + group.bench_function("bitwise array or, no nulls", |b| { + b.iter(|| black_box(bitwise_or(&left_without_null, &right_without_null).unwrap())) + }); + group.bench_function("bitwise array or, 20% nulls", |b| { + b.iter(|| black_box(bitwise_or(&left_with_null, &right_with_null).unwrap())) + }); + group.finish(); + // xor + let mut group = c.benchmark_group("bench bitwise: xor"); + group.bench_function("bitwise array xor, no nulls", |b| { + b.iter(|| { + black_box(bitwise_xor(&left_without_null, &right_without_null).unwrap()) + }) + }); + group.bench_function("bitwise array xor, 20% nulls", |b| { + b.iter(|| black_box(bitwise_xor(&left_with_null, &right_with_null).unwrap())) + }); + group.finish(); + // not + let mut group = c.benchmark_group("bench bitwise: not"); + group.bench_function("bitwise array not, no nulls", |b| { + b.iter(|| black_box(bitwise_not(&left_without_null).unwrap())) + }); + group.bench_function("bitwise array not, 20% nulls", |b| { + b.iter(|| black_box(bitwise_not(&left_with_null).unwrap())) + }); + group.finish(); +} + +fn bitwise_array_scalar_benchmark(c: &mut Criterion) { + let size = 64 * 1024_usize; + let array_without_null = create_primitive_array::(size, 0 as f32); + let array_with_null = create_primitive_array::(size, 0.2_f32); + let scalar = seedable_rng().next_u64() as i64; + // array scalar and + let mut group = c.benchmark_group("bench bitwise array scalar: and"); + group.bench_function("bitwise array scalar and, no nulls", |b| { + b.iter(|| black_box(bitwise_and_scalar(&array_without_null, scalar).unwrap())) + }); + group.bench_function("bitwise array and, 20% nulls", |b| { + b.iter(|| black_box(bitwise_and_scalar(&array_with_null, scalar).unwrap())) + }); + group.finish(); + // array scalar or + let mut group = c.benchmark_group("bench bitwise array scalar: or"); + group.bench_function("bitwise array scalar or, no nulls", |b| { + b.iter(|| black_box(bitwise_or_scalar(&array_without_null, scalar).unwrap())) + }); + group.bench_function("bitwise array scalar or, 20% nulls", |b| { + b.iter(|| black_box(bitwise_or_scalar(&array_with_null, scalar).unwrap())) + }); + group.finish(); + // array scalar xor + let mut group = c.benchmark_group("bench bitwise array scalar: xor"); + group.bench_function("bitwise array scalar xor, no nulls", |b| { + 
b.iter(|| black_box(bitwise_xor_scalar(&array_without_null, scalar).unwrap())) + }); + group.bench_function("bitwise array scalar xor, 20% nulls", |b| { + b.iter(|| black_box(bitwise_xor_scalar(&array_with_null, scalar).unwrap())) + }); + group.finish(); +} + +criterion_group!( + benches, + bitwise_array_benchmark, + bitwise_array_scalar_benchmark +); +criterion_main!(benches); From 5238789244be27380347b19b0747c9dcd9938470 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Thu, 15 Sep 2022 15:21:11 -0400 Subject: [PATCH 0052/1411] Automate updates to `CHANGELOG-old.md` (#2732) * feature complete * fix footer issue * fix duplicate changelog issue * use tac instead of head for head -n - is not universal * adjust blank lines * fix footer dropping * line adj * add .bak2 to gitignore --- .gitignore | 2 +- dev/release/update_change_log.sh | 35 +++++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 2a21776aa545..b8506ea06cb0 100644 --- a/.gitignore +++ b/.gitignore @@ -20,7 +20,7 @@ __blobstorage__ # .bak files *.bak - +*.bak2 # OS-specific .gitignores # Mac .gitignore diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 252cd285d92b..a3af50a8a6ea 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,16 +29,45 @@ set -e -SINCE_TAG="21.0.0" -FUTURE_RELEASE="22.0.0" +SINCE_TAG="22.0.0" +FUTURE_RELEASE="23.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" OUTPUT_PATH="${SOURCE_TOP_DIR}/CHANGELOG.md" +OLD_OUTPUT_PATH="${SOURCE_TOP_DIR}/CHANGELOG-old.md" # remove license header so github-changelog-generator has a clean base to append -sed -i.bak '1,18d' "${OUTPUT_PATH}" +sed -i.bak '1,21d' "${OUTPUT_PATH}" +sed -i.bak '1,21d' "${OLD_OUTPUT_PATH}" +# remove the github-changelog-generator footer from the old CHANGELOG.md +LINE_COUNT=$(wc -l <"${OUTPUT_PATH}") +sed -i.bak2 "$(( $LINE_COUNT-4+1 )),$ d" "${OUTPUT_PATH}" + +# Copy the previous CHANGELOG.md to CHANGELOG-old.md +echo ' + +# Historical Changelog +' | cat - "${OUTPUT_PATH}" "${OLD_OUTPUT_PATH}" > "${OLD_OUTPUT_PATH}".tmp +mv "${OLD_OUTPUT_PATH}".tmp "${OLD_OUTPUT_PATH}" # use exclude-tags-regex to filter out tags used for object_store # crates and only only look at tags that DO NOT begin with `object_store_` From 0ebd71e0d3d132250a2e5743f24f952c58c236d3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 15 Sep 2022 22:44:22 +0100 Subject: [PATCH 0053/1411] Partially flatten arrow-buffer (#2737) * Partially flatten arrow-buffer * Format --- arrow-buffer/src/lib.rs | 9 +++++++-- arrow/src/bitmap.rs | 3 +-- arrow/src/datatypes/native.rs | 2 +- arrow/src/util/mod.rs | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arrow-buffer/src/lib.rs b/arrow-buffer/src/lib.rs index a8aca7c3dad2..74d2bd5ec869 100644 --- a/arrow-buffer/src/lib.rs +++ b/arrow-buffer/src/lib.rs @@ -19,6 +19,11 @@ pub mod alloc; pub mod buffer; +pub use buffer::{Buffer, MutableBuffer}; + mod bytes; -pub mod native; -pub mod util; +mod native; + +pub use native::*; +mod util; +pub use util::*; diff --git a/arrow/src/bitmap.rs b/arrow/src/bitmap.rs index 4491da4632b4..dbf9706677a5 100644 --- a/arrow/src/bitmap.rs +++ b/arrow/src/bitmap.rs @@ -17,12 +17,11 @@ //! 
Defines [Bitmap] for tracking validity bitmaps -use crate::buffer::Buffer; use crate::error::{ArrowError, Result}; use crate::util::bit_util; use std::mem; -use arrow_buffer::buffer::{buffer_bin_and, buffer_bin_or}; +use arrow_buffer::buffer::{buffer_bin_and, buffer_bin_or, Buffer}; use std::ops::{BitAnd, BitOr}; #[derive(Debug, Clone)] diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 8c329a066e5c..de35c4804fa0 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -16,7 +16,7 @@ // under the License. use super::DataType; -pub use arrow_buffer::native::{ArrowNativeType, ToByteSlice}; +pub use arrow_buffer::{ArrowNativeType, ToByteSlice}; use half::f16; /// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index 5453c11ab8a6..adafc9f5053b 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -pub use arrow_buffer::util::{bit_chunk_iterator, bit_util}; +pub use arrow_buffer::{bit_chunk_iterator, bit_util}; #[cfg(feature = "test_utils")] pub mod bench_util; From 43d912c010b7374ceb3a632eedda4f55f90545d0 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Fri, 16 Sep 2022 05:59:39 -0400 Subject: [PATCH 0054/1411] Better construction of RecordBatchOptions (#2729) * include builder for RecordBatchOptions * fix clippy warnings * fix clippy warnings * remove builder struct * removed a wrong comment * Update comment in arrow/src/record_batch.rs Co-authored-by: Andrew Lamb * Update comment in arrow/src/record_batch.rs Co-authored-by: Andrew Lamb Co-authored-by: askoa Co-authored-by: Andrew Lamb --- arrow/src/ipc/reader.rs | 12 ++++-------- arrow/src/record_batch.rs | 35 +++++++++++++++++++++++++++-------- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs index 969c8c43f026..a784f54e20ca 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow/src/ipc/reader.rs @@ -578,10 +578,7 @@ pub fn read_record_batch( let mut node_index = 0; let mut arrays = vec![]; - let options = RecordBatchOptions { - row_count: Some(batch.length() as usize), - ..Default::default() - }; + let options = RecordBatchOptions::new().with_row_count(Some(batch.length() as usize)); if let Some(projection) = projection { // project fields @@ -1692,10 +1689,9 @@ mod tests { #[test] fn test_no_columns_batch() { let schema = Arc::new(Schema::new(vec![])); - let options = RecordBatchOptions { - match_field_names: true, - row_count: Some(10), - }; + let options = RecordBatchOptions::new() + .with_match_field_names(true) + .with_row_count(Some(10)); let input_batch = RecordBatch::try_new_with_options(schema, vec![], &options).unwrap(); let output_batch = roundtrip_ipc_stream(&input_batch); diff --git a/arrow/src/record_batch.rs b/arrow/src/record_batch.rs index 4b0d36a43e5b..f71c67fe7746 100644 --- a/arrow/src/record_batch.rs +++ b/arrow/src/record_batch.rs @@ -80,7 +80,7 @@ impl RecordBatch { /// # } /// ``` pub fn try_new(schema: SchemaRef, columns: Vec) -> Result { - let options = RecordBatchOptions::default(); + let options = RecordBatchOptions::new(); Self::try_new_impl(schema, columns, &options) } @@ -413,15 +413,29 @@ pub struct RecordBatchOptions { pub row_count: Option, } -impl Default for RecordBatchOptions { - fn default() -> Self { +impl RecordBatchOptions { + pub fn new() -> Self { Self { 
match_field_names: true, row_count: None, } } + /// Sets the row_count of RecordBatchOptions and returns self + pub fn with_row_count(mut self, row_count: Option) -> Self { + self.row_count = row_count; + self + } + /// Sets the match_field_names of RecordBatchOptions and returns self + pub fn with_match_field_names(mut self, match_field_names: bool) -> Self { + self.match_field_names = match_field_names; + self + } +} +impl Default for RecordBatchOptions { + fn default() -> Self { + Self::new() + } } - impl From<&StructArray> for RecordBatch { /// Create a record batch from struct array, where each field of /// the `StructArray` becomes a `Field` in the schema. @@ -901,10 +915,7 @@ mod tests { .to_string() .contains("must either specify a row count or at least one column")); - let options = RecordBatchOptions { - row_count: Some(10), - ..Default::default() - }; + let options = RecordBatchOptions::new().with_row_count(Some(10)); let ok = RecordBatch::try_new_with_options(schema.clone(), vec![], &options).unwrap(); @@ -929,4 +940,12 @@ mod tests { ); assert_eq!("Invalid argument error: Column 'a' is declared as non-nullable but contains null values", format!("{}", maybe_batch.err().unwrap())); } + #[test] + fn test_record_batch_options() { + let options = RecordBatchOptions::new() + .with_match_field_names(false) + .with_row_count(Some(20)); + assert!(!options.match_field_names); + assert_eq!(options.row_count.unwrap(), 20) + } } From f572ec1bef4a66a00b78f1d80a39992d63444ec2 Mon Sep 17 00:00:00 2001 From: Remzi Yang <59198230+HaoYang670@users.noreply.github.com> Date: Fri, 16 Sep 2022 18:47:20 +0800 Subject: [PATCH 0055/1411] Update `try_binary` and `checked_ops`, and remove `math_checked_op` (#2717) * update try_binary delete math_checked_op update the return type of checked ops Signed-off-by: remzi <13716567376yh@gmail.com> * float div not panic on zero Signed-off-by: remzi <13716567376yh@gmail.com> * fix nan test Signed-off-by: remzi <13716567376yh@gmail.com> * add float divide by zero Signed-off-by: remzi <13716567376yh@gmail.com> * add float tests Signed-off-by: remzi <13716567376yh@gmail.com> * fix compile error Signed-off-by: remzi <13716567376yh@gmail.com> Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/Cargo.toml | 2 +- arrow/src/compute/kernels/arithmetic.rs | 220 ++++++++++-------------- arrow/src/compute/kernels/arity.rs | 14 +- arrow/src/datatypes/native.rs | 66 +++++-- 4 files changed, 153 insertions(+), 149 deletions(-) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index e52940b4fc4c..1580856dfc01 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -51,7 +51,7 @@ serde_json = { version = "1.0", default-features = false, features = ["std"], op indexmap = { version = "1.9", default-features = false, features = ["std"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } -half = { version = "2.0", default-features = false } +half = { version = "2.0", default-features = false, features = ["num-traits"]} hashbrown = { version = "0.12", default-features = false } csv_crate = { version = "1.1", default-features = false, optional = true, package = "csv" } regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index 04fe2393ec4d..7b91a261c7e1 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ 
b/arrow/src/compute/kernels/arithmetic.rs @@ -78,32 +78,6 @@ where Ok(binary(left, right, op)) } -/// This is similar to `math_op` as it performs given operation between two input primitive arrays. -/// But the given operation can return `None` if overflow is detected. For the case, this function -/// returns an `Err`. -fn math_checked_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - op: F, -) -> Result> -where - LT: ArrowNumericType, - RT: ArrowNumericType, - F: Fn(LT::Native, RT::Native) -> Option, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform math operation on arrays of different length".to_string(), - )); - } - - try_binary(left, right, |a, b| { - op(a, b).ok_or_else(|| { - ArrowError::ComputeError(format!("Overflow happened on: {:?}, {:?}", a, b)) - }) - }) -} - /// Helper function for operations where a valid `0` on the right array should /// result in an [ArrowError::DivideByZero], namely the division and modulo operations /// @@ -121,26 +95,9 @@ where LT: ArrowNumericType, RT: ArrowNumericType, RT::Native: One + Zero, - F: Fn(LT::Native, RT::Native) -> Option, + F: Fn(LT::Native, RT::Native) -> Result, { - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform math operation on arrays of different length".to_string(), - )); - } - - try_binary(left, right, |l, r| { - if r.is_zero() { - Err(ArrowError::DivideByZero) - } else { - op(l, r).ok_or_else(|| { - ArrowError::ComputeError(format!( - "Overflow happened on: {:?}, {:?}", - l, r - )) - }) - } - }) + try_binary(left, right, op) } /// Helper function for operations where a valid `0` on the right array should @@ -161,16 +118,12 @@ fn math_checked_divide_op_on_iters( where T: ArrowNumericType, T::Native: One + Zero, - F: Fn(T::Native, T::Native) -> T::Native, + F: Fn(T::Native, T::Native) -> Result, { let buffer = if null_bit_buffer.is_some() { let values = left.zip(right).map(|(left, right)| { if let (Some(l), Some(r)) = (left, right) { - if r.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(op(l, r)) - } + op(l, r) } else { Ok(T::default_value()) } @@ -179,15 +132,10 @@ where unsafe { Buffer::try_from_trusted_len_iter(values) } } else { // no value is null - let values = left.map(|l| l.unwrap()).zip(right.map(|r| r.unwrap())).map( - |(left, right)| { - if right.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(op(left, right)) - } - }, - ); + let values = left + .map(|l| l.unwrap()) + .zip(right.map(|r| r.unwrap())) + .map(|(left, right)| op(left, right)); // Safety: Iterator comes from a PrimitiveArray which reports its size correctly unsafe { Buffer::try_from_trusted_len_iter(values) } }?; @@ -654,7 +602,7 @@ where K: ArrowNumericType, T: ArrowNumericType, T::Native: One + Zero, - F: Fn(T::Native, T::Native) -> T::Native, + F: Fn(T::Native, T::Native) -> Result, { if left.len() != right.len() { return Err(ArrowError::ComputeError(format!( @@ -725,7 +673,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - math_checked_op(left, right, |a, b| a.add_checked(b)) + try_binary(left, right, |a, b| a.add_checked(b)) } /// Perform `left + right` operation on two arrays. 
If either left or right value is null @@ -826,11 +774,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - try_unary(array, |value| { - value.add_checked(scalar).ok_or_else(|| { - ArrowError::CastError(format!("Overflow: adding {:?} to {:?}", scalar, value)) - }) - }) + try_unary(array, |value| value.add_checked(scalar)) } /// Add every value in an array by a scalar. If any value in the array is null then the @@ -863,12 +807,8 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - try_unary_dyn::<_, T>(array, |value| { - value.add_checked(scalar).ok_or_else(|| { - ArrowError::CastError(format!("Overflow: adding {:?} to {:?}", scalar, value)) - }) - }) - .map(|a| Arc::new(a) as ArrayRef) + try_unary_dyn::<_, T>(array, |value| value.add_checked(scalar)) + .map(|a| Arc::new(a) as ArrayRef) } /// Perform `left - right` operation on two arrays. If either left or right value is null @@ -900,7 +840,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - math_checked_op(left, right, |a, b| a.sub_checked(b)) + try_binary(left, right, |a, b| a.sub_checked(b)) } /// Perform `left - right` operation on two arrays. If either left or right value is null @@ -953,14 +893,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp + Zero, { - try_unary(array, |value| { - value.sub_checked(scalar).ok_or_else(|| { - ArrowError::CastError(format!( - "Overflow: subtracting {:?} from {:?}", - scalar, value - )) - }) - }) + try_unary(array, |value| value.sub_checked(scalar)) } /// Subtract every value in an array by a scalar. If any value in the array is null then the @@ -991,15 +924,8 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - try_unary_dyn::<_, T>(array, |value| { - value.sub_checked(scalar).ok_or_else(|| { - ArrowError::CastError(format!( - "Overflow: subtracting {:?} from {:?}", - scalar, value - )) - }) - }) - .map(|a| Arc::new(a) as ArrayRef) + try_unary_dyn::<_, T>(array, |value| value.sub_checked(scalar)) + .map(|a| Arc::new(a) as ArrayRef) } /// Perform `-` operation on an array. If value is null then the result is also null. @@ -1052,7 +978,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - math_checked_op(left, right, |a, b| a.mul_checked(b)) + try_binary(left, right, |a, b| a.mul_checked(b)) } /// Perform `left * right` operation on two arrays. If either left or right value is null @@ -1105,14 +1031,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp + Zero + One, { - try_unary(array, |value| { - value.mul_checked(scalar).ok_or_else(|| { - ArrowError::CastError(format!( - "Overflow: multiplying {:?} by {:?}", - value, scalar, - )) - }) - }) + try_unary(array, |value| value.mul_checked(scalar)) } /// Multiply every value in an array by a scalar. If any value in the array is null then the @@ -1143,15 +1062,8 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - try_unary_dyn::<_, T>(array, |value| { - value.mul_checked(scalar).ok_or_else(|| { - ArrowError::CastError(format!( - "Overflow: multiplying {:?} by {:?}", - value, scalar - )) - }) - }) - .map(|a| Arc::new(a) as ArrayRef) + try_unary_dyn::<_, T>(array, |value| value.mul_checked(scalar)) + .map(|a| Arc::new(a) as ArrayRef) } /// Perform `left % right` operation on two arrays. 
If either left or right value is null @@ -1170,7 +1082,13 @@ where a % b }); #[cfg(not(feature = "simd"))] - return math_checked_divide_op(left, right, |a, b| Some(a % b)); + return try_binary(left, right, |a, b| { + if b.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(a % b) + } + }); } /// Perform `left / right` operation on two arrays. If either left or right value is null @@ -1225,12 +1143,17 @@ where pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { - typed_dict_math_op!(left, right, |a, b| a / b, math_divide_checked_op_dict) + typed_dict_math_op!( + left, + right, + |a, b| a.div_checked(b), + math_divide_checked_op_dict + ) } _ => { downcast_primitive_array!( (left, right) => { - math_checked_divide_op(left, right, |a, b| Some(a / b)).map(|a| Arc::new(a) as ArrayRef) + math_checked_divide_op(left, right, |a, b| a.div_checked(b)).map(|a| Arc::new(a) as ArrayRef) } _ => Err(ArrowError::CastError(format!( "Unsupported data type {}, {}", @@ -1331,15 +1254,8 @@ where return Err(ArrowError::DivideByZero); } - try_unary_dyn::<_, T>(array, |value| { - value.div_checked(divisor).ok_or_else(|| { - ArrowError::CastError(format!( - "Overflow: dividing {:?} by {:?}", - value, divisor - )) - }) - }) - .map(|a| Arc::new(a) as ArrayRef) + try_unary_dyn::<_, T>(array, |value| value.div_checked(divisor)) + .map(|a| Arc::new(a) as ArrayRef) } #[cfg(test)] @@ -2134,23 +2050,41 @@ mod tests { #[test] #[should_panic(expected = "DivideByZero")] - fn test_primitive_array_divide_by_zero_with_checked() { + fn test_int_array_divide_by_zero_with_checked() { let a = Int32Array::from(vec![15]); let b = Int32Array::from(vec![0]); divide_checked(&a, &b).unwrap(); } + #[test] + #[should_panic(expected = "DivideByZero")] + fn test_f32_array_divide_by_zero_with_checked() { + let a = Float32Array::from(vec![15.0]); + let b = Float32Array::from(vec![0.0]); + divide_checked(&a, &b).unwrap(); + } + #[test] #[should_panic(expected = "attempt to divide by zero")] - fn test_primitive_array_divide_by_zero() { + fn test_int_array_divide_by_zero() { let a = Int32Array::from(vec![15]); let b = Int32Array::from(vec![0]); divide(&a, &b).unwrap(); } + #[test] + fn test_f32_array_divide_by_zero() { + let a = Float32Array::from(vec![1.5, 0.0, -1.5]); + let b = Float32Array::from(vec![0.0, 0.0, 0.0]); + let result = divide(&a, &b).unwrap(); + assert_eq!(result.value(0), f32::INFINITY); + assert!(result.value(1).is_nan()); + assert_eq!(result.value(2), f32::NEG_INFINITY); + } + #[test] #[should_panic(expected = "DivideByZero")] - fn test_primitive_array_divide_dyn_by_zero() { + fn test_int_array_divide_dyn_by_zero() { let a = Int32Array::from(vec![15]); let b = Int32Array::from(vec![0]); divide_dyn(&a, &b).unwrap(); @@ -2158,7 +2092,15 @@ mod tests { #[test] #[should_panic(expected = "DivideByZero")] - fn test_primitive_array_divide_dyn_by_zero_dict() { + fn test_f32_array_divide_dyn_by_zero() { + let a = Float32Array::from(vec![1.5]); + let b = Float32Array::from(vec![0.0]); + divide_dyn(&a, &b).unwrap(); + } + + #[test] + #[should_panic(expected = "DivideByZero")] + fn test_int_array_divide_dyn_by_zero_dict() { let mut builder = PrimitiveDictionaryBuilder::::with_capacity(1, 1); builder.append(15).unwrap(); @@ -2174,14 +2116,38 @@ mod tests { #[test] #[should_panic(expected = "DivideByZero")] - fn test_primitive_array_modulus_by_zero() { + fn test_f32_dict_array_divide_dyn_by_zero() { + let mut builder = + 
PrimitiveDictionaryBuilder::::with_capacity(1, 1); + builder.append(1.5).unwrap(); + let a = builder.finish(); + + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(1, 1); + builder.append(0.0).unwrap(); + let b = builder.finish(); + + divide_dyn(&a, &b).unwrap(); + } + + #[test] + #[should_panic(expected = "DivideByZero")] + fn test_i32_array_modulus_by_zero() { let a = Int32Array::from(vec![15]); let b = Int32Array::from(vec![0]); modulus(&a, &b).unwrap(); } #[test] - fn test_primitive_array_divide_f64() { + #[should_panic(expected = "DivideByZero")] + fn test_f32_array_modulus_by_zero() { + let a = Float32Array::from(vec![1.5]); + let b = Float32Array::from(vec![0.0]); + modulus(&a, &b).unwrap(); + } + + #[test] + fn test_f64_array_divide() { let a = Float64Array::from(vec![15.0, 15.0, 8.0]); let b = Float64Array::from(vec![5.0, 6.0, 8.0]); let c = divide(&a, &b).unwrap(); diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index 21c633116ee0..5060234c71b0 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -261,9 +261,10 @@ where /// /// Like [`try_unary`] the function is only evaluated for non-null indices /// -/// # Panic +/// # Error /// -/// Panics if the arrays have different lengths +/// Return an error if the arrays have different lengths or +/// the operation is under erroneous pub fn try_binary( a: &PrimitiveArray, b: &PrimitiveArray, @@ -275,13 +276,16 @@ where O: ArrowPrimitiveType, F: Fn(A::Native, B::Native) -> Result, { - assert_eq!(a.len(), b.len()); - let len = a.len(); - + if a.len() != b.len() { + return Err(ArrowError::ComputeError( + "Cannot perform a binary operation on arrays of different length".to_string(), + )); + } if a.is_empty() { return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); } + let len = a.len(); let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); let null_count = null_buffer .as_ref() diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index de35c4804fa0..dec0cc4b53b0 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -16,8 +16,10 @@ // under the License. use super::DataType; +use crate::error::{ArrowError, Result}; pub use arrow_buffer::{ArrowNativeType, ToByteSlice}; use half::f16; +use num::Zero; /// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the /// static-typed nature of rust types ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. 
@@ -43,6 +45,8 @@ pub trait ArrowPrimitiveType: 'static { pub(crate) mod native_op { use super::ArrowNativeType; + use crate::error::{ArrowError, Result}; + use num::Zero; use std::ops::{Add, Div, Mul, Sub}; /// Trait for ArrowNativeType to provide overflow-checking and non-overflow-checking @@ -61,33 +65,38 @@ pub(crate) mod native_op { + Sub + Mul + Div + + Zero { - fn add_checked(self, rhs: Self) -> Option { - Some(self + rhs) + fn add_checked(self, rhs: Self) -> Result { + Ok(self + rhs) } fn add_wrapping(self, rhs: Self) -> Self { self + rhs } - fn sub_checked(self, rhs: Self) -> Option { - Some(self - rhs) + fn sub_checked(self, rhs: Self) -> Result { + Ok(self - rhs) } fn sub_wrapping(self, rhs: Self) -> Self { self - rhs } - fn mul_checked(self, rhs: Self) -> Option { - Some(self * rhs) + fn mul_checked(self, rhs: Self) -> Result { + Ok(self * rhs) } fn mul_wrapping(self, rhs: Self) -> Self { self * rhs } - fn div_checked(self, rhs: Self) -> Option { - Some(self / rhs) + fn div_checked(self, rhs: Self) -> Result { + if rhs.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(self / rhs) + } } fn div_wrapping(self, rhs: Self) -> Self { @@ -99,32 +108,56 @@ pub(crate) mod native_op { macro_rules! native_type_op { ($t:tt) => { impl native_op::ArrowNativeTypeOp for $t { - fn add_checked(self, rhs: Self) -> Option { - self.checked_add(rhs) + fn add_checked(self, rhs: Self) -> Result { + self.checked_add(rhs).ok_or_else(|| { + ArrowError::ComputeError(format!( + "Overflow happened on: {:?} + {:?}", + self, rhs + )) + }) } fn add_wrapping(self, rhs: Self) -> Self { self.wrapping_add(rhs) } - fn sub_checked(self, rhs: Self) -> Option { - self.checked_sub(rhs) + fn sub_checked(self, rhs: Self) -> Result { + self.checked_sub(rhs).ok_or_else(|| { + ArrowError::ComputeError(format!( + "Overflow happened on: {:?} - {:?}", + self, rhs + )) + }) } fn sub_wrapping(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } - fn mul_checked(self, rhs: Self) -> Option { - self.checked_mul(rhs) + fn mul_checked(self, rhs: Self) -> Result { + self.checked_mul(rhs).ok_or_else(|| { + ArrowError::ComputeError(format!( + "Overflow happened on: {:?} * {:?}", + self, rhs + )) + }) } fn mul_wrapping(self, rhs: Self) -> Self { self.wrapping_mul(rhs) } - fn div_checked(self, rhs: Self) -> Option { - self.checked_div(rhs) + fn div_checked(self, rhs: Self) -> Result { + if rhs.is_zero() { + Err(ArrowError::DivideByZero) + } else { + self.checked_div(rhs).ok_or_else(|| { + ArrowError::ComputeError(format!( + "Overflow happened on: {:?} / {:?}", + self, rhs + )) + }) + } } fn div_wrapping(self, rhs: Self) -> Self { @@ -138,6 +171,7 @@ native_type_op!(i8); native_type_op!(i16); native_type_op!(i32); native_type_op!(i64); +native_type_op!(i128); native_type_op!(u8); native_type_op!(u16); native_type_op!(u32); From 968a7673c7e1341431bc4d55a4f50e9fa6aff7d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Fri, 16 Sep 2022 14:52:48 +0200 Subject: [PATCH 0056/1411] Speedup string equal/not equal to empty string, cleanup like/ilike kernels, fix escape bug (#2743) * Speedup string == "" * neq too * Simplify kernels * Simplify kernels * Fix test * Escape contains * Fmt * Fix --- arrow/benches/equal.rs | 10 ++ arrow/src/compute/kernels/comparison.rs | 216 +++++++++--------------- 2 files changed, 89 insertions(+), 137 deletions(-) diff --git a/arrow/benches/equal.rs b/arrow/benches/equal.rs index af535506e86d..f54aff1b5cc7 100644 --- a/arrow/benches/equal.rs +++ b/arrow/benches/equal.rs @@ -20,6 +20,7 @@ 
#[macro_use] extern crate criterion; +use arrow::compute::eq_utf8_scalar; use criterion::Criterion; extern crate arrow; @@ -31,6 +32,10 @@ fn bench_equal>(arr_a: &A) { criterion::black_box(arr_a == arr_a); } +fn bench_equal_utf8_scalar(arr_a: &GenericStringArray, right: &str) { + criterion::black_box(eq_utf8_scalar(arr_a, right).unwrap()); +} + fn add_benchmark(c: &mut Criterion) { let arr_a = create_primitive_array::(512, 0.0); c.bench_function("equal_512", |b| b.iter(|| bench_equal(&arr_a))); @@ -41,6 +46,11 @@ fn add_benchmark(c: &mut Criterion) { let arr_a = create_string_array::(512, 0.0); c.bench_function("equal_string_512", |b| b.iter(|| bench_equal(&arr_a))); + let arr_a = create_string_array::(512, 0.0); + c.bench_function("equal_string_scalar_empty_512", |b| { + b.iter(|| bench_equal_utf8_scalar(&arr_a, "")) + }); + let arr_a_nulls = create_string_array::(512, 0.5); c.bench_function("equal_string_nulls_512", |b| { b.iter(|| bench_equal(&arr_a_nulls)) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 5a79c2e82df1..d4eb5a3e1d2b 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -233,61 +233,35 @@ pub fn like_utf8( } #[inline] -fn like_scalar<'a, L: ArrayAccessor>( +fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor>( left: L, right: &str, + op: F, ) -> Result { - let null_bit_buffer = left.data().null_buffer().cloned(); - let bytes = bit_util::ceil(left.len(), 8); - let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); - let bool_slice = bool_buf.as_slice_mut(); - if !right.contains(is_like_pattern) { // fast path, can use equals - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i) == right { - bit_util::set_bit(bool_slice, i); - } - } - } + compare_op_scalar(left, |item| op(item == right)) } else if right.ends_with('%') && !right.ends_with("\\%") && !right[..right.len() - 1].contains(is_like_pattern) { // fast path, can use starts_with let starts_with = &right[..right.len() - 1]; - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).starts_with(starts_with) { - bit_util::set_bit(bool_slice, i); - } - } - } + + compare_op_scalar(left, |item| op(item.starts_with(starts_with))) } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { // fast path, can use ends_with let ends_with = &right[1..]; - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).ends_with(ends_with) { - bit_util::set_bit(bool_slice, i); - } - } - } + compare_op_scalar(left, |item| op(item.ends_with(ends_with))) } else if right.starts_with('%') && right.ends_with('%') + && !right.ends_with("\\%") && !right[1..right.len() - 1].contains(is_like_pattern) { - // fast path, can use contains let contains = &right[1..right.len() - 1]; - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).contains(contains) { - bit_util::set_bit(bool_slice, i); - } - } - } + + compare_op_scalar(left, |item| op(item.contains(contains))) } else { let re_pattern = replace_like_wildcards(right)?; let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { @@ -297,26 +271,16 @@ fn like_scalar<'a, L: ArrayAccessor>( )) })?; - for i in 0..left.len() { - let haystack = unsafe { left.value_unchecked(i) }; - if re.is_match(haystack) { - bit_util::set_bit(bool_slice, i); - } - } - }; + compare_op_scalar(left, |item| op(re.is_match(item))) + } +} - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - 
vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) +#[inline] +fn like_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + like_scalar_op(left, right, |x| x) } /// Perform SQL `left LIKE right` operation on [`StringArray`] / @@ -415,86 +379,7 @@ fn nlike_scalar<'a, L: ArrayAccessor>( left: L, right: &str, ) -> Result { - let null_bit_buffer = left.data().null_buffer().cloned(); - let bytes = bit_util::ceil(left.len(), 8); - let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); - let bool_slice = bool_buf.as_slice_mut(); - - if !right.contains(is_like_pattern) { - // fast path, can use equals - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i) != right { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let starts_with = &right[..right.len() - 1]; - for i in 0..left.len() { - unsafe { - if !(left.value_unchecked(i).starts_with(starts_with)) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_with = &right[1..]; - - for i in 0..left.len() { - unsafe { - if !(left.value_unchecked(i).ends_with(ends_with)) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.starts_with('%') - && right.ends_with('%') - && !right[1..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use contains - let contains = &right[1..right.len() - 1]; - for i in 0..left.len() { - unsafe { - if !(left.value_unchecked(i).contains(contains)) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - })?; - - for i in 0..left.len() { - let haystack = unsafe { left.value_unchecked(i) }; - if !re.is_match(haystack) { - bit_util::set_bit(bool_slice, i); - } - } - }; - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + like_scalar_op(left, right, |x| !x) } /// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / @@ -966,11 +851,48 @@ pub fn eq_utf8( compare_op(left, right, |a, b| a == b) } +fn utf8_empty( + left: &GenericStringArray, +) -> Result { + let null_bit_buffer = left + .data() + .null_buffer() + .map(|b| b.bit_slice(left.offset(), left.len())); + + let buffer = unsafe { + MutableBuffer::from_trusted_len_iter_bool(left.value_offsets().windows(2).map( + |offset| { + if EQ { + offset[1].to_usize().unwrap() == offset[0].to_usize().unwrap() + } else { + offset[1].to_usize().unwrap() > offset[0].to_usize().unwrap() + } + }, + )) + }; + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![Buffer::from(buffer)], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} + /// Perform `left == right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. 
pub fn eq_utf8_scalar( left: &GenericStringArray, right: &str, ) -> Result { + if right.is_empty() { + return utf8_empty::<_, true>(left); + } compare_op_scalar(left, |a| a == right) } @@ -1167,6 +1089,9 @@ pub fn neq_utf8_scalar( left: &GenericStringArray, right: &str, ) -> Result { + if right.is_empty() { + return utf8_empty::<_, false>(left); + } compare_op_scalar(left, |a| a != right) } @@ -4324,13 +4249,22 @@ mod tests { #[test] fn test_utf8_eq_scalar_on_slice() { - let a = StringArray::from(vec![Some("hi"), None, Some("hello"), Some("world")]); - let a = a.slice(1, 3); + let a = StringArray::from( + vec![Some("hi"), None, Some("hello"), Some("world"), Some("")], + ); + let a = a.slice(1, 4); let a = as_string_array(&a); let a_eq = eq_utf8_scalar(a, "hello").unwrap(); assert_eq!( a_eq, - BooleanArray::from(vec![None, Some(true), Some(false)]) + BooleanArray::from(vec![None, Some(true), Some(false), Some(false)]) + ); + + let a_eq2 = eq_utf8_scalar(a, "").unwrap(); + + assert_eq!( + a_eq2, + BooleanArray::from(vec![None, Some(false), Some(false), Some(true)]) ); } @@ -4528,6 +4462,14 @@ mod tests { vec![true, false] ); + test_utf8_scalar!( + test_utf8_scalar_like_escape_contains, + vec!["ba%", "ba\\x"], + "%a\\%", + like_utf8_scalar, + vec![true, false] + ); + test_utf8!( test_utf8_scalar_ilike_regex, vec!["%%%"], From 1da2bfbc82de12ac6fb699d2579d4a129929e004 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Fri, 16 Sep 2022 12:23:29 -0400 Subject: [PATCH 0057/1411] Update version to `23.0.0` and update `CHANGELOG`, add `label_issue.py` script (#2734) * feature complete * fix footer issue * fix duplicate changelog issue * use tac instead of head for head -n - is not universal * adjust blank lines * fix footer dropping * line adj * add .bak2 to gitignore * Create changelog * Update version * Add initial relabeling script * more script * tweaks * Runnable as a script * Update changelog * updates * remove overzealous api change labeling Co-authored-by: Andrew Lamb --- CHANGELOG-old.md | 115 +++++++++++- CHANGELOG.md | 183 +++++++++---------- arrow-flight/Cargo.toml | 4 +- arrow-flight/README.md | 2 +- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow/Cargo.toml | 2 +- arrow/README.md | 4 +- dev/release/README.md | 2 +- dev/release/label_issues.py | 153 ++++++++++++++++ integration-testing/Cargo.toml | 2 +- parquet/Cargo.toml | 6 +- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 14 files changed, 373 insertions(+), 118 deletions(-) create mode 100755 dev/release/label_issues.py diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 70322b5cfd1d..02cb7ec2449e 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -17,9 +17,122 @@ under the License. 
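The two scalar fast paths introduced above can be exercised end to end. The sketch below is illustrative only and is not part of the patch; it assumes an `arrow` crate at roughly this version with the public `eq_utf8_scalar` and `like_utf8_scalar` kernels. It shows why comparison against the empty string needs only the offsets buffer (a value is empty exactly when its two adjacent offsets are equal), and mirrors the new `test_utf8_scalar_like_escape_contains` case, where an escaped `\%` no longer takes the `contains` fast path.

```rust
// Illustrative sketch, not part of the patch. Assumes the `arrow` crate with the
// `eq_utf8_scalar` / `like_utf8_scalar` kernels shown in the hunks above.
use arrow::array::{BooleanArray, StringArray};
use arrow::compute::{eq_utf8_scalar, like_utf8_scalar};

fn main() -> arrow::error::Result<()> {
    let a = StringArray::from(vec![Some("hi"), Some(""), None, Some("world")]);

    // A slot holds "" exactly when its two adjacent value offsets are equal,
    // so the empty-string fast path never has to read the string bytes.
    let empty_by_offsets: Vec<bool> = a
        .value_offsets()
        .windows(2)
        .map(|w| w[1] == w[0])
        .collect();
    // The null slot also has equal offsets; the kernel re-applies the null bitmap afterwards.
    assert_eq!(empty_by_offsets, vec![false, true, true, false]);

    let eq_empty = eq_utf8_scalar(&a, "")?;
    assert_eq!(
        eq_empty,
        BooleanArray::from(vec![Some(false), Some(true), None, Some(false)])
    );

    // An escaped `\%` must not be treated as a plain `%...%` contains fast path;
    // this mirrors the new test_utf8_scalar_like_escape_contains case.
    let v = StringArray::from(vec!["ba%", "ba\\x"]);
    let like = like_utf8_scalar(&v, "%a\\%")?;
    assert_eq!(like, BooleanArray::from(vec![true, false]));

    Ok(())
}
```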
--> - # Historical Changelog +## [22.0.0](https://github.com/apache/arrow-rs/tree/22.0.0) (2022-09-02) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/21.0.0...22.0.0) + +**Breaking changes:** + +- Use `total_cmp` for floating value ordering and remove `nan_ordering` feature flag [\#2614](https://github.com/apache/arrow-rs/pull/2614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Gate dyn comparison of dictionary arrays behind `dyn_cmp_dict` [\#2597](https://github.com/apache/arrow-rs/pull/2597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move JsonSerializable to json module \(\#2300\) [\#2595](https://github.com/apache/arrow-rs/pull/2595) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Decimal precision scale datatype change [\#2532](https://github.com/apache/arrow-rs/pull/2532) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Refactor PrimitiveBuilder Constructors [\#2518](https://github.com/apache/arrow-rs/pull/2518) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Refactoring DecimalBuilder constructors [\#2517](https://github.com/apache/arrow-rs/pull/2517) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Refactor FixedSizeBinaryBuilder Constructors [\#2516](https://github.com/apache/arrow-rs/pull/2516) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Refactor BooleanBuilder Constructors [\#2515](https://github.com/apache/arrow-rs/pull/2515) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Refactor UnionBuilder Constructors [\#2488](https://github.com/apache/arrow-rs/pull/2488) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) + +**Implemented enhancements:** + +- Add Macros to assist with static dispatch [\#2635](https://github.com/apache/arrow-rs/issues/2635) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support comparison between DictionaryArray and BooleanArray [\#2617](https://github.com/apache/arrow-rs/issues/2617) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use `total_cmp` for floating value ordering and remove `nan_ordering` feature flag [\#2613](https://github.com/apache/arrow-rs/issues/2613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support empty projection in CSV, JSON readers [\#2603](https://github.com/apache/arrow-rs/issues/2603) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support SQL-compliant NaN ordering between for DictionaryArray and non-DictionaryArray [\#2599](https://github.com/apache/arrow-rs/issues/2599) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `dyn_cmp_dict` feature flag to gate dyn comparison of dictionary arrays [\#2596](https://github.com/apache/arrow-rs/issues/2596) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add max\_dyn and min\_dyn for max/min for dictionary array [\#2584](https://github.com/apache/arrow-rs/issues/2584) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Allow 
FlightSQL implementers to extend `do_get()` [\#2581](https://github.com/apache/arrow-rs/issues/2581) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support SQL-compliant behavior on `eq_dyn`, `neq_dyn`, `lt_dyn`, `lt_eq_dyn`, `gt_dyn`, `gt_eq_dyn` [\#2569](https://github.com/apache/arrow-rs/issues/2569) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add sql-compliant feature for enabling sql-compliant kernel behavior [\#2568](https://github.com/apache/arrow-rs/issues/2568) +- Calculate `sum` for dictionary array [\#2565](https://github.com/apache/arrow-rs/issues/2565) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add test for float nan comparison [\#2556](https://github.com/apache/arrow-rs/issues/2556) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Compare dictionary with string array [\#2548](https://github.com/apache/arrow-rs/issues/2548) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Compare dictionary with primitive array in `lt_dyn`, `lt_eq_dyn`, `gt_dyn`, `gt_eq_dyn` [\#2538](https://github.com/apache/arrow-rs/issues/2538) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Compare dictionary with primitive array in `eq_dyn` and `neq_dyn` [\#2535](https://github.com/apache/arrow-rs/issues/2535) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- UnionBuilder Create Children With Capacity [\#2523](https://github.com/apache/arrow-rs/issues/2523) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Speed up `like_utf8_scalar` for `%pat%` [\#2519](https://github.com/apache/arrow-rs/issues/2519) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Replace macro with TypedDictionaryArray in comparison kernels [\#2513](https://github.com/apache/arrow-rs/issues/2513) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use same codebase for boolean kernels [\#2507](https://github.com/apache/arrow-rs/issues/2507) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use u8 for Decimal Precision and Scale [\#2496](https://github.com/apache/arrow-rs/issues/2496) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Integrate skip row without pageIndex in SerializedPageReader in Fuzz Test [\#2475](https://github.com/apache/arrow-rs/issues/2475) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Avoid unecessary copies in Arrow IPC reader [\#2437](https://github.com/apache/arrow-rs/issues/2437) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add GenericColumnReader::skip\_records Missing OffsetIndex Fallback [\#2433](https://github.com/apache/arrow-rs/issues/2433) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support Reading PageIndex with ParquetRecordBatchStream [\#2430](https://github.com/apache/arrow-rs/issues/2430) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Specialize FixedLenByteArrayReader for Parquet [\#2318](https://github.com/apache/arrow-rs/issues/2318) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Make JSON support Optional via Feature Flag [\#2300](https://github.com/apache/arrow-rs/issues/2300) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Casting timestamp array to string should not ignore timezone [\#2607](https://github.com/apache/arrow-rs/issues/2607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Ilike\_ut8\_scalar kernals have incorrect logic 
[\#2544](https://github.com/apache/arrow-rs/issues/2544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Always validate the array data when creating array in IPC reader [\#2541](https://github.com/apache/arrow-rs/issues/2541) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Int96Converter Truncates Timestamps [\#2480](https://github.com/apache/arrow-rs/issues/2480) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Error Reading Page Index When Not Available [\#2434](https://github.com/apache/arrow-rs/issues/2434) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `ParquetFileArrowReader::get_record_reader[_by_colum]` `batch_size` overallocates [\#2321](https://github.com/apache/arrow-rs/issues/2321) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Documentation updates:** + +- Document All Arrow Features in docs.rs [\#2633](https://github.com/apache/arrow-rs/issues/2633) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Add support for CAST from `Interval(DayTime)` to `Timestamp(Nanosecond, None)` [\#2606](https://github.com/apache/arrow-rs/issues/2606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Why do we check for null in TypedDictionaryArray value function [\#2564](https://github.com/apache/arrow-rs/issues/2564) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add the `length` field for `Buffer` [\#2524](https://github.com/apache/arrow-rs/issues/2524) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Avoid large over allocate buffer in async reader [\#2512](https://github.com/apache/arrow-rs/issues/2512) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Rewriting Decimal Builders using `const_generic`. 
[\#2390](https://github.com/apache/arrow-rs/issues/2390) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Rewrite Decimal Array using `const_generic` [\#2384](https://github.com/apache/arrow-rs/issues/2384) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Add downcast macros \(\#2635\) [\#2636](https://github.com/apache/arrow-rs/pull/2636) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Document all arrow features in docs.rs \(\#2633\) [\#2634](https://github.com/apache/arrow-rs/pull/2634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Document dyn\_cmp\_dict [\#2624](https://github.com/apache/arrow-rs/pull/2624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support comparison between DictionaryArray and BooleanArray [\#2618](https://github.com/apache/arrow-rs/pull/2618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Cast timestamp array to string array with timezone [\#2608](https://github.com/apache/arrow-rs/pull/2608) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support empty projection in CSV and JSON readers [\#2604](https://github.com/apache/arrow-rs/pull/2604) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Make JSON support optional via a feature flag \(\#2300\) [\#2601](https://github.com/apache/arrow-rs/pull/2601) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support SQL-compliant NaN ordering for DictionaryArray and non-DictionaryArray [\#2600](https://github.com/apache/arrow-rs/pull/2600) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Split out integration test plumbing \(\#2594\) \(\#2300\) [\#2598](https://github.com/apache/arrow-rs/pull/2598) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Refactor Binary Builder and String Builder Constructors [\#2592](https://github.com/apache/arrow-rs/pull/2592) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Dictionary like scalar kernels [\#2591](https://github.com/apache/arrow-rs/pull/2591) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Validate dictionary key in TypedDictionaryArray \(\#2578\) [\#2589](https://github.com/apache/arrow-rs/pull/2589) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add max\_dyn and min\_dyn for max/min for dictionary array [\#2585](https://github.com/apache/arrow-rs/pull/2585) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Code cleanup of array value functions [\#2583](https://github.com/apache/arrow-rs/pull/2583) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Allow overriding of do\_get & export useful macro [\#2582](https://github.com/apache/arrow-rs/pull/2582) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] 
([avantgardnerio](https://github.com/avantgardnerio)) +- MINOR: Upgrade to pyo3 0.17 [\#2576](https://github.com/apache/arrow-rs/pull/2576) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) +- Support SQL-compliant NaN behavior on eq\_dyn, neq\_dyn, lt\_dyn, lt\_eq\_dyn, gt\_dyn, gt\_eq\_dyn [\#2570](https://github.com/apache/arrow-rs/pull/2570) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add sum\_dyn to calculate sum for dictionary array [\#2566](https://github.com/apache/arrow-rs/pull/2566) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- struct UnionBuilder will create child buffers with capacity [\#2560](https://github.com/apache/arrow-rs/pull/2560) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kastolars](https://github.com/kastolars)) +- Don't panic on RleValueEncoder::flush\_buffer if empty \(\#2558\) [\#2559](https://github.com/apache/arrow-rs/pull/2559) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add the `length` field for Buffer and use more `Buffer` in IPC reader to avoid memory copy. [\#2557](https://github.com/apache/arrow-rs/pull/2557) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([HaoYang670](https://github.com/HaoYang670)) +- Add test for float nan comparison [\#2555](https://github.com/apache/arrow-rs/pull/2555) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Compare dictionary array with string array [\#2549](https://github.com/apache/arrow-rs/pull/2549) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Always validate the array data \(except the `Decimal`\) when creating array in IPC reader [\#2547](https://github.com/apache/arrow-rs/pull/2547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- MINOR: Fix test\_row\_type\_validation test [\#2546](https://github.com/apache/arrow-rs/pull/2546) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix ilike\_utf8\_scalar kernals [\#2545](https://github.com/apache/arrow-rs/pull/2545) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- fix typo [\#2540](https://github.com/apache/arrow-rs/pull/2540) ([00Masato](https://github.com/00Masato)) +- Compare dictionary array and primitive array in lt\_dyn, lt\_eq\_dyn, gt\_dyn, gt\_eq\_dyn kernels [\#2539](https://github.com/apache/arrow-rs/pull/2539) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- \[MINOR\]Avoid large over allocate buffer in async reader [\#2537](https://github.com/apache/arrow-rs/pull/2537) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Compare dictionary with primitive array in `eq_dyn` and `neq_dyn` [\#2533](https://github.com/apache/arrow-rs/pull/2533) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add iterator for FixedSizeBinaryArray [\#2531](https://github.com/apache/arrow-rs/pull/2531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- add bench: decimal with byte array and 
fixed length byte array [\#2529](https://github.com/apache/arrow-rs/pull/2529) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liukun4515](https://github.com/liukun4515)) +- Add FixedLengthByteArrayReader Remove ComplexObjectArrayReader [\#2528](https://github.com/apache/arrow-rs/pull/2528) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Split out byte array decoders \(\#2318\) [\#2527](https://github.com/apache/arrow-rs/pull/2527) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Use offset index in ParquetRecordBatchStream [\#2526](https://github.com/apache/arrow-rs/pull/2526) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- Clean the `create_array` in IPC reader. [\#2525](https://github.com/apache/arrow-rs/pull/2525) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Remove DecimalByteArrayConvert \(\#2480\) [\#2522](https://github.com/apache/arrow-rs/pull/2522) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Improve performance of `%pat%` \(\>3x speedup\) [\#2521](https://github.com/apache/arrow-rs/pull/2521) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- remove len field from MapBuilder [\#2520](https://github.com/apache/arrow-rs/pull/2520) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Replace macro with TypedDictionaryArray in comparison kernels [\#2514](https://github.com/apache/arrow-rs/pull/2514) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Avoid large over allocate buffer in sync reader [\#2511](https://github.com/apache/arrow-rs/pull/2511) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Avoid useless memory copies in IPC reader. 
[\#2510](https://github.com/apache/arrow-rs/pull/2510) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Refactor boolean kernels to use same codebase [\#2508](https://github.com/apache/arrow-rs/pull/2508) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Remove Int96Converter \(\#2480\) [\#2481](https://github.com/apache/arrow-rs/pull/2481) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) + ## [21.0.0](https://github.com/apache/arrow-rs/tree/21.0.0) (2022-08-18) [Full Changelog](https://github.com/apache/arrow-rs/compare/20.0.0...21.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 69f2b8af6cf8..4a063594dc99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,119 +19,108 @@ # Changelog -## [22.0.0](https://github.com/apache/arrow-rs/tree/22.0.0) (2022-09-02) +## [23.0.0](https://github.com/apache/arrow-rs/tree/23.0.0) (2022-09-16) -[Full Changelog](https://github.com/apache/arrow-rs/compare/21.0.0...22.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/22.0.0...23.0.0) **Breaking changes:** -- Use `total_cmp` for floating value ordering and remove `nan_ordering` feature flag [\#2614](https://github.com/apache/arrow-rs/pull/2614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Gate dyn comparison of dictionary arrays behind `dyn_cmp_dict` [\#2597](https://github.com/apache/arrow-rs/pull/2597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Move JsonSerializable to json module \(\#2300\) [\#2595](https://github.com/apache/arrow-rs/pull/2595) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Decimal precision scale datatype change [\#2532](https://github.com/apache/arrow-rs/pull/2532) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Refactor PrimitiveBuilder Constructors [\#2518](https://github.com/apache/arrow-rs/pull/2518) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Refactoring DecimalBuilder constructors [\#2517](https://github.com/apache/arrow-rs/pull/2517) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Refactor FixedSizeBinaryBuilder Constructors [\#2516](https://github.com/apache/arrow-rs/pull/2516) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Refactor BooleanBuilder Constructors [\#2515](https://github.com/apache/arrow-rs/pull/2515) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Refactor UnionBuilder Constructors [\#2488](https://github.com/apache/arrow-rs/pull/2488) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Move JSON Test Format To integration-testing [\#2724](https://github.com/apache/arrow-rs/pull/2724) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-buffer crate \(\#2594\) [\#2693](https://github.com/apache/arrow-rs/pull/2693) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Simplify DictionaryBuilder constructors \(\#2684\) \(\#2054\) [\#2685](https://github.com/apache/arrow-rs/pull/2685) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Deprecate RecordBatch::concat replace with concat\_batches \(\#2594\) [\#2683](https://github.com/apache/arrow-rs/pull/2683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add overflow-checking variant for primitive arithmetic kernels and explicitly define overflow behavior [\#2643](https://github.com/apache/arrow-rs/pull/2643) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update thrift v0.16 and vendor parquet-format \(\#2502\) [\#2626](https://github.com/apache/arrow-rs/pull/2626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update flight definitions including backwards-incompatible change to GetSchema [\#2586](https://github.com/apache/arrow-rs/pull/2586) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([liukun4515](https://github.com/liukun4515)) **Implemented enhancements:** -- Add Macros to assist with static dispatch [\#2635](https://github.com/apache/arrow-rs/issues/2635) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support comparison between DictionaryArray and BooleanArray [\#2617](https://github.com/apache/arrow-rs/issues/2617) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use `total_cmp` for floating value ordering and remove `nan_ordering` feature flag [\#2613](https://github.com/apache/arrow-rs/issues/2613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support empty projection in CSV, JSON readers [\#2603](https://github.com/apache/arrow-rs/issues/2603) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support SQL-compliant NaN ordering between for DictionaryArray and non-DictionaryArray [\#2599](https://github.com/apache/arrow-rs/issues/2599) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add `dyn_cmp_dict` feature flag to gate dyn comparison of dictionary arrays [\#2596](https://github.com/apache/arrow-rs/issues/2596) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add max\_dyn and min\_dyn for max/min for dictionary array [\#2584](https://github.com/apache/arrow-rs/issues/2584) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Allow FlightSQL implementers to extend `do_get()` [\#2581](https://github.com/apache/arrow-rs/issues/2581) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Support SQL-compliant behavior on `eq_dyn`, `neq_dyn`, `lt_dyn`, `lt_eq_dyn`, `gt_dyn`, `gt_eq_dyn` [\#2569](https://github.com/apache/arrow-rs/issues/2569) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add sql-compliant feature for enabling sql-compliant kernel behavior [\#2568](https://github.com/apache/arrow-rs/issues/2568) -- Calculate `sum` for dictionary array [\#2565](https://github.com/apache/arrow-rs/issues/2565) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add test for float nan comparison [\#2556](https://github.com/apache/arrow-rs/issues/2556) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Compare dictionary with string array [\#2548](https://github.com/apache/arrow-rs/issues/2548) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Compare dictionary with primitive array in `lt_dyn`, `lt_eq_dyn`, `gt_dyn`, `gt_eq_dyn` [\#2538](https://github.com/apache/arrow-rs/issues/2538) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Compare dictionary with primitive array in `eq_dyn` and `neq_dyn` [\#2535](https://github.com/apache/arrow-rs/issues/2535) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- UnionBuilder Create Children With Capacity [\#2523](https://github.com/apache/arrow-rs/issues/2523) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Speed up `like_utf8_scalar` for `%pat%` [\#2519](https://github.com/apache/arrow-rs/issues/2519) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Replace macro with TypedDictionaryArray in comparison kernels [\#2513](https://github.com/apache/arrow-rs/issues/2513) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use same codebase for boolean kernels [\#2507](https://github.com/apache/arrow-rs/issues/2507) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use u8 for Decimal Precision and Scale [\#2496](https://github.com/apache/arrow-rs/issues/2496) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Integrate skip row without pageIndex in SerializedPageReader in Fuzz Test [\#2475](https://github.com/apache/arrow-rs/issues/2475) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Avoid unecessary copies in Arrow IPC reader [\#2437](https://github.com/apache/arrow-rs/issues/2437) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add GenericColumnReader::skip\_records Missing OffsetIndex Fallback [\#2433](https://github.com/apache/arrow-rs/issues/2433) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Support Reading PageIndex with ParquetRecordBatchStream [\#2430](https://github.com/apache/arrow-rs/issues/2430) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Specialize FixedLenByteArrayReader for Parquet [\#2318](https://github.com/apache/arrow-rs/issues/2318) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Make JSON support Optional via Feature Flag [\#2300](https://github.com/apache/arrow-rs/issues/2300) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cleanup like and nlike utf8 kernels [\#2744](https://github.com/apache/arrow-rs/issues/2744) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Speedup eq and neq kernels for utf8 arrays [\#2742](https://github.com/apache/arrow-rs/issues/2742) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- API for more ergonomic construction of `RecordBatchOptions` [\#2728](https://github.com/apache/arrow-rs/issues/2728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Automate updates to `CHANGELOG-old.md` [\#2726](https://github.com/apache/arrow-rs/issues/2726) +- Don't check the `DivideByZero` error for float modulus [\#2720](https://github.com/apache/arrow-rs/issues/2720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `try_binary` should not panic on unequaled array length. 
[\#2715](https://github.com/apache/arrow-rs/issues/2715) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add benchmark for bitwise operation [\#2714](https://github.com/apache/arrow-rs/issues/2714) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add overflow-checking variants of arithmetic scalar dyn kernels [\#2712](https://github.com/apache/arrow-rs/issues/2712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add divide\_opt kernel which produce null values on division by zero error [\#2709](https://github.com/apache/arrow-rs/issues/2709) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `DataType` function to detect nested types [\#2704](https://github.com/apache/arrow-rs/issues/2704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support of sorting dictionary of other primitive types [\#2700](https://github.com/apache/arrow-rs/issues/2700) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Sort indices of dictionary string values [\#2697](https://github.com/apache/arrow-rs/issues/2697) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support empty projection in `RecordBatch::project` [\#2690](https://github.com/apache/arrow-rs/issues/2690) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support sorting dictionary encoded primitive integer arrays [\#2679](https://github.com/apache/arrow-rs/issues/2679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use BitIndexIterator in min\_max\_helper [\#2674](https://github.com/apache/arrow-rs/issues/2674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support building comparator for dictionaries of primitive integer values [\#2672](https://github.com/apache/arrow-rs/issues/2672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Change max/min string macro to generic helper function `min_max_helper` [\#2657](https://github.com/apache/arrow-rs/issues/2657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add overflow-checking variant of arithmetic scalar kernels [\#2651](https://github.com/apache/arrow-rs/issues/2651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Compare dictionary with binary array [\#2644](https://github.com/apache/arrow-rs/issues/2644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add overflow-checking variant for primitive arithmetic kernels [\#2642](https://github.com/apache/arrow-rs/issues/2642) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use `downcast_primitive_array` in arithmetic kernels [\#2639](https://github.com/apache/arrow-rs/issues/2639) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support DictionaryArray in temporal kernels [\#2622](https://github.com/apache/arrow-rs/issues/2622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Inline Generated Thift Code Into Parquet Crate [\#2502](https://github.com/apache/arrow-rs/issues/2502) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Fixed bugs:** -- Casting timestamp array to string should not ignore timezone [\#2607](https://github.com/apache/arrow-rs/issues/2607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Ilike\_ut8\_scalar kernals have incorrect logic [\#2544](https://github.com/apache/arrow-rs/issues/2544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Always validate the array data when creating array in IPC reader [\#2541](https://github.com/apache/arrow-rs/issues/2541) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Int96Converter Truncates Timestamps [\#2480](https://github.com/apache/arrow-rs/issues/2480) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Error Reading Page Index When Not Available [\#2434](https://github.com/apache/arrow-rs/issues/2434) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `ParquetFileArrowReader::get_record_reader[_by_colum]` `batch_size` overallocates [\#2321](https://github.com/apache/arrow-rs/issues/2321) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - -**Documentation updates:** - -- Document All Arrow Features in docs.rs [\#2633](https://github.com/apache/arrow-rs/issues/2633) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Escape contains patterns for utf8 like kernels [\#2745](https://github.com/apache/arrow-rs/issues/2745) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Float Array should not panic on `DivideByZero` in the `Divide` kernel [\#2719](https://github.com/apache/arrow-rs/issues/2719) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- DictionaryBuilders can Create Invalid DictionaryArrays [\#2684](https://github.com/apache/arrow-rs/issues/2684) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `arrow` crate does not build with `features = ["ffi"]` and `default_features = false`. [\#2670](https://github.com/apache/arrow-rs/issues/2670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Invalid results with `RowSelector` having `row_count` of 0 [\#2669](https://github.com/apache/arrow-rs/issues/2669) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- clippy error: unresolved import `crate::array::layout` [\#2659](https://github.com/apache/arrow-rs/issues/2659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cast the numeric without the `CastOptions` [\#2648](https://github.com/apache/arrow-rs/issues/2648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Explicitly define overflow behavior for primitive arithmetic kernels [\#2641](https://github.com/apache/arrow-rs/issues/2641) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- update the `flight.proto` and fix schema to SchemaResult [\#2571](https://github.com/apache/arrow-rs/issues/2571) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Panic when first data page is skipped using ColumnChunkData::Sparse [\#2543](https://github.com/apache/arrow-rs/issues/2543) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `SchemaResult` in IPC deviates from other implementations [\#2445](https://github.com/apache/arrow-rs/issues/2445) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Closed issues:** -- Add support for CAST from `Interval(DayTime)` to `Timestamp(Nanosecond, None)` [\#2606](https://github.com/apache/arrow-rs/issues/2606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Why do we check for null in TypedDictionaryArray value function [\#2564](https://github.com/apache/arrow-rs/issues/2564) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add the `length` field for `Buffer` [\#2524](https://github.com/apache/arrow-rs/issues/2524) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Avoid large over 
allocate buffer in async reader [\#2512](https://github.com/apache/arrow-rs/issues/2512) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Rewriting Decimal Builders using `const_generic`. [\#2390](https://github.com/apache/arrow-rs/issues/2390) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Rewrite Decimal Array using `const_generic` [\#2384](https://github.com/apache/arrow-rs/issues/2384) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement collect for int values [\#2696](https://github.com/apache/arrow-rs/issues/2696) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Add downcast macros \(\#2635\) [\#2636](https://github.com/apache/arrow-rs/pull/2636) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Document all arrow features in docs.rs \(\#2633\) [\#2634](https://github.com/apache/arrow-rs/pull/2634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Document dyn\_cmp\_dict [\#2624](https://github.com/apache/arrow-rs/pull/2624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support comparison between DictionaryArray and BooleanArray [\#2618](https://github.com/apache/arrow-rs/pull/2618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Cast timestamp array to string array with timezone [\#2608](https://github.com/apache/arrow-rs/pull/2608) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Support empty projection in CSV and JSON readers [\#2604](https://github.com/apache/arrow-rs/pull/2604) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Make JSON support optional via a feature flag \(\#2300\) [\#2601](https://github.com/apache/arrow-rs/pull/2601) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support SQL-compliant NaN ordering for DictionaryArray and non-DictionaryArray [\#2600](https://github.com/apache/arrow-rs/pull/2600) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Split out integration test plumbing \(\#2594\) \(\#2300\) [\#2598](https://github.com/apache/arrow-rs/pull/2598) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Refactor Binary Builder and String Builder Constructors [\#2592](https://github.com/apache/arrow-rs/pull/2592) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Dictionary like scalar kernels [\#2591](https://github.com/apache/arrow-rs/pull/2591) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Validate dictionary key in TypedDictionaryArray \(\#2578\) [\#2589](https://github.com/apache/arrow-rs/pull/2589) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add max\_dyn and min\_dyn for max/min for dictionary array [\#2585](https://github.com/apache/arrow-rs/pull/2585) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Code cleanup of array value functions 
[\#2583](https://github.com/apache/arrow-rs/pull/2583) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Allow overriding of do\_get & export useful macro [\#2582](https://github.com/apache/arrow-rs/pull/2582) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([avantgardnerio](https://github.com/avantgardnerio)) -- MINOR: Upgrade to pyo3 0.17 [\#2576](https://github.com/apache/arrow-rs/pull/2576) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) -- Support SQL-compliant NaN behavior on eq\_dyn, neq\_dyn, lt\_dyn, lt\_eq\_dyn, gt\_dyn, gt\_eq\_dyn [\#2570](https://github.com/apache/arrow-rs/pull/2570) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add sum\_dyn to calculate sum for dictionary array [\#2566](https://github.com/apache/arrow-rs/pull/2566) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- struct UnionBuilder will create child buffers with capacity [\#2560](https://github.com/apache/arrow-rs/pull/2560) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kastolars](https://github.com/kastolars)) -- Don't panic on RleValueEncoder::flush\_buffer if empty \(\#2558\) [\#2559](https://github.com/apache/arrow-rs/pull/2559) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add the `length` field for Buffer and use more `Buffer` in IPC reader to avoid memory copy. [\#2557](https://github.com/apache/arrow-rs/pull/2557) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([HaoYang670](https://github.com/HaoYang670)) -- Add test for float nan comparison [\#2555](https://github.com/apache/arrow-rs/pull/2555) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Compare dictionary array with string array [\#2549](https://github.com/apache/arrow-rs/pull/2549) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Always validate the array data \(except the `Decimal`\) when creating array in IPC reader [\#2547](https://github.com/apache/arrow-rs/pull/2547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- MINOR: Fix test\_row\_type\_validation test [\#2546](https://github.com/apache/arrow-rs/pull/2546) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fix ilike\_utf8\_scalar kernals [\#2545](https://github.com/apache/arrow-rs/pull/2545) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- fix typo [\#2540](https://github.com/apache/arrow-rs/pull/2540) ([00Masato](https://github.com/00Masato)) -- Compare dictionary array and primitive array in lt\_dyn, lt\_eq\_dyn, gt\_dyn, gt\_eq\_dyn kernels [\#2539](https://github.com/apache/arrow-rs/pull/2539) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- \[MINOR\]Avoid large over allocate buffer in async reader [\#2537](https://github.com/apache/arrow-rs/pull/2537) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- Compare dictionary with primitive array in `eq_dyn` and `neq_dyn` [\#2533](https://github.com/apache/arrow-rs/pull/2533) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add iterator for FixedSizeBinaryArray [\#2531](https://github.com/apache/arrow-rs/pull/2531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- add bench: decimal with byte array and fixed length byte array [\#2529](https://github.com/apache/arrow-rs/pull/2529) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liukun4515](https://github.com/liukun4515)) -- Add FixedLengthByteArrayReader Remove ComplexObjectArrayReader [\#2528](https://github.com/apache/arrow-rs/pull/2528) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Split out byte array decoders \(\#2318\) [\#2527](https://github.com/apache/arrow-rs/pull/2527) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Use offset index in ParquetRecordBatchStream [\#2526](https://github.com/apache/arrow-rs/pull/2526) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) -- Clean the `create_array` in IPC reader. [\#2525](https://github.com/apache/arrow-rs/pull/2525) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Remove DecimalByteArrayConvert \(\#2480\) [\#2522](https://github.com/apache/arrow-rs/pull/2522) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Improve performance of `%pat%` \(\>3x speedup\) [\#2521](https://github.com/apache/arrow-rs/pull/2521) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- remove len field from MapBuilder [\#2520](https://github.com/apache/arrow-rs/pull/2520) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Replace macro with TypedDictionaryArray in comparison kernels [\#2514](https://github.com/apache/arrow-rs/pull/2514) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Avoid large over allocate buffer in sync reader [\#2511](https://github.com/apache/arrow-rs/pull/2511) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- Avoid useless memory copies in IPC reader. 
[\#2510](https://github.com/apache/arrow-rs/pull/2510) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Refactor boolean kernels to use same codebase [\#2508](https://github.com/apache/arrow-rs/pull/2508) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Remove Int96Converter \(\#2480\) [\#2481](https://github.com/apache/arrow-rs/pull/2481) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Speedup string equal/not equal to empty string, cleanup like/ilike kernels, fix escape bug [\#2743](https://github.com/apache/arrow-rs/pull/2743) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Partially flatten arrow-buffer [\#2737](https://github.com/apache/arrow-rs/pull/2737) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Automate updates to `CHANGELOG-old.md` [\#2732](https://github.com/apache/arrow-rs/pull/2732) ([iajoiner](https://github.com/iajoiner)) +- Update read parquet example in parquet/arrow home [\#2730](https://github.com/apache/arrow-rs/pull/2730) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([datapythonista](https://github.com/datapythonista)) +- Better construction of RecordBatchOptions [\#2729](https://github.com/apache/arrow-rs/pull/2729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- benchmark: bitwise operation [\#2718](https://github.com/apache/arrow-rs/pull/2718) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Update `try_binary` and `checked_ops`, and remove `math_checked_op` [\#2717](https://github.com/apache/arrow-rs/pull/2717) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Support bitwise op in kernel: or,xor,not [\#2716](https://github.com/apache/arrow-rs/pull/2716) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Add overflow-checking variants of arithmetic scalar dyn kernels [\#2713](https://github.com/apache/arrow-rs/pull/2713) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add divide\_opt kernel which produce null values on division by zero error [\#2710](https://github.com/apache/arrow-rs/pull/2710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add DataType::is\_nested\(\) [\#2707](https://github.com/apache/arrow-rs/pull/2707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kfastov](https://github.com/kfastov)) +- Update criterion requirement from 0.3 to 0.4 [\#2706](https://github.com/apache/arrow-rs/pull/2706) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support bitwise and operation in the kernel [\#2703](https://github.com/apache/arrow-rs/pull/2703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Add support of sorting dictionary of other primitive arrays [\#2701](https://github.com/apache/arrow-rs/pull/2701) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Clarify docs of binary and 
string builders [\#2699](https://github.com/apache/arrow-rs/pull/2699) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([datapythonista](https://github.com/datapythonista)) +- Sort indices of dictionary string values [\#2698](https://github.com/apache/arrow-rs/pull/2698) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add support for empty projection in RecordBatch::project [\#2691](https://github.com/apache/arrow-rs/pull/2691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Temporarily disable Golang integration tests re-enable JS [\#2689](https://github.com/apache/arrow-rs/pull/2689) ([tustvold](https://github.com/tustvold)) +- Verify valid UTF-8 when converting byte array \(\#2205\) [\#2686](https://github.com/apache/arrow-rs/pull/2686) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support sorting dictionary encoded primitive integer arrays [\#2680](https://github.com/apache/arrow-rs/pull/2680) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Skip RowSelectors with zero rows [\#2678](https://github.com/apache/arrow-rs/pull/2678) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([askoa](https://github.com/askoa)) +- Faster Null Path Selection in ArrayData Equality [\#2676](https://github.com/apache/arrow-rs/pull/2676) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dhruv9vats](https://github.com/dhruv9vats)) +- Use BitIndexIterator in min\_max\_helper [\#2675](https://github.com/apache/arrow-rs/pull/2675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support building comparator for dictionaries of primitive integer values [\#2673](https://github.com/apache/arrow-rs/pull/2673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- json feature always requires base64 feature [\#2668](https://github.com/apache/arrow-rs/pull/2668) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([eagletmt](https://github.com/eagletmt)) +- Add try\_unary, binary, try\_binary kernels ~90% faster [\#2666](https://github.com/apache/arrow-rs/pull/2666) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use downcast\_dictionary\_array in unary\_dyn [\#2663](https://github.com/apache/arrow-rs/pull/2663) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- optimize the `numeric_cast_with_error` [\#2661](https://github.com/apache/arrow-rs/pull/2661) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- ffi feature also requires layout [\#2660](https://github.com/apache/arrow-rs/pull/2660) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Change max/min string macro to generic helper function min\_max\_helper [\#2658](https://github.com/apache/arrow-rs/pull/2658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix flaky test `test_fuzz_async_reader_selection` [\#2656](https://github.com/apache/arrow-rs/pull/2656) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- MINOR: Ignore flaky test test\_fuzz\_async\_reader\_selection 
[\#2655](https://github.com/apache/arrow-rs/pull/2655) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- MutableBuffer::typed\_data - shared ref access to the typed slice [\#2652](https://github.com/apache/arrow-rs/pull/2652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([medwards](https://github.com/medwards)) +- Overflow-checking variant of arithmetic scalar kernels [\#2650](https://github.com/apache/arrow-rs/pull/2650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- support `CastOption` for casting numeric [\#2649](https://github.com/apache/arrow-rs/pull/2649) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Help LLVM vectorize comparison kernel ~50-80% faster [\#2646](https://github.com/apache/arrow-rs/pull/2646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support comparison between dictionary array and binary array [\#2645](https://github.com/apache/arrow-rs/pull/2645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Use `downcast_primitive_array` in arithmetic kernels [\#2640](https://github.com/apache/arrow-rs/pull/2640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fully qualifying parquet items [\#2638](https://github.com/apache/arrow-rs/pull/2638) ([dingxiangfei2009](https://github.com/dingxiangfei2009)) +- Support DictionaryArray in temporal kernels [\#2623](https://github.com/apache/arrow-rs/pull/2623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Comparable Row Format [\#2593](https://github.com/apache/arrow-rs/pull/2593) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix bug in page skipping [\#2552](https://github.com/apache/arrow-rs/pull/2552) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index ecf02625c9d3..a6fb8751c2df 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "22.0.0" +version = "23.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,7 +27,7 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow = { path = "../arrow", version = "22.0.0", default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "23.0.0", default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 9e9a18ad4789..e01809f3813f 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "22.0.0" +arrow-flight = "23.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. 
See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 9aef5a0570a3..38bbcf9e8bc3 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "22.0.0" +version = "23.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "22.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "23.0.0", features = ["pyarrow"] } pyo3 = { version = "0.17", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 1580856dfc01..f1918fccd1f9 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "22.0.0" +version = "23.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow/README.md b/arrow/README.md index 7a95df0f2252..a1c0e6279a50 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `22.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `23.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. ## Feature Flags @@ -61,7 +61,7 @@ The [Apache Arrow Status](https://arrow.apache.org/docs/status.html) page lists ## Safety -Arrow seeks to uphold the Rust Soundness Pledge as articulated eloquently [here](https://raphlinus.github.io/rust/22.0.01/18/soundness-pledge.html). Specifically: +Arrow seeks to uphold the Rust Soundness Pledge as articulated eloquently [here](https://raphlinus.github.io/rust/23.0.01/18/soundness-pledge.html). Specifically: > The intent of this crate is to be free of soundness bugs. The developers will do their best to avoid them, and welcome help in analyzing and fixing them diff --git a/dev/release/README.md b/dev/release/README.md index 3783301e9bed..48748eccbe85 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -78,7 +78,7 @@ CHANGELOG_GITHUB_TOKEN= ./dev/release/update_change_log.sh git commit -a -m 'Create changelog' # update versions -sed -i '' -e 's/14.0.0/22.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/23.0.0/g' `find . 
-name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' ``` diff --git a/dev/release/label_issues.py b/dev/release/label_issues.py new file mode 100755 index 000000000000..b004b7fa7f86 --- /dev/null +++ b/dev/release/label_issues.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python + +############################################################################## +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +############################################################################## + +# Python script to add labels to github issues from the PRs that closed them +# +# Required setup: +# $ pip install PyGithub +# +# ARROW_GITHUB_API_TOKEN needs to be set to your github token +from github import Github +import os +import re + + + +# get all cross referenced issues from the named issue +# (aka linked PRs) +# issue = arrow_repo.get_issue(issue_number) +def get_cross_referenced_issues(issue): + all_issues = set() + for timeline_item in issue.get_timeline(): + if timeline_item.event == 'cross-referenced' and timeline_item.source.type == 'issue': + all_issues.add(timeline_item.source.issue) + + # convert to list + return [i for i in all_issues] + + +# labels not to transfer +BLACKLIST_LABELS = {'development-process', 'api-change'} + +# Adds labels to the specified issue with the labels from linked pull requests +def relabel_issue(arrow_repo, issue_number): + #print(issue_number, 'fetching issue') + issue = arrow_repo.get_issue(issue_number) + print('considering issue', issue.html_url) + linked_issues = get_cross_referenced_issues(issue) + #print(' ', 'cross referenced issues:', linked_issues) + + # Figure out what labels need to be added, if any + existing_labels = set() + for label in issue.labels: + existing_labels.add(label.name) + + # find all labels to add + for linked_issue in linked_issues: + if linked_issue.pull_request is None: + print(' ', 'not pull request, skipping', linked_issue.html_url) + continue + + if linked_issue.repository.name != 'arrow-rs': + print(' ', 'not in arrow-rs, skipping', linked_issue.html_url) + continue + + print(' ', 'finding labels for linked pr', linked_issue.html_url) + linked_labels = set() + for label in linked_issue.labels: + linked_labels.add(label.name) + #print(' ', 'existing labels:', existing_labels) + + labels_to_add = linked_labels.difference(existing_labels) + + # remove any blacklist labels, if any + for l in BLACKLIST_LABELS: + labels_to_add.discard(l) + + if len(labels_to_add) > 0: + print(' ', 'adding labels: ', labels_to_add, 'to', issue.number) + for label in labels_to_add: + issue.add_to_labels(label) + print(' ', 'added', label) + existing_labels.add(label) + + # leave a note about what updated these labels + issue.create_comment('`label_issue.py` automatically added labels 
{} from #{}'.format(labels_to_add, linked_issue.number)) + + +# what section headings in the CHANGELOG.md file contain closed issues that may need relabeling +ISSUE_SECTION_NAMES = ['Closed issues:', 'Fixed bugs:', 'Implemented enhancements:'] + +# find all possible issues / bugs by scraping CHANGELOG.md +# +# TODO: Find all tickets merged since this tag +# The compare api can find all commits since that tag +# I could not find a good way in the github API to find the PRs connected to a commit +#since_tag = '22.0.0' + +def find_issues_from_changelog(): + script_dir = os.path.dirname(os.path.realpath(__file__)) + path = os.path.join(script_dir, '..', '..', 'CHANGELOG.md') + + issues = set() + + # Flag that + in_issue_section = False + + with open(path, 'r') as f: + for line in f: + #print('line: ', line) + line = line.strip() + if line.startswith('**'): + section_name = line.replace('**', '') + if section_name in ISSUE_SECTION_NAMES: + #print(' ', 'is issue section', section_name) + in_issue_section = True + else: + #print(' ', 'is not issue section', section_name) + in_issue_section = False + + if in_issue_section: + match = re.search('#([\d]+)', line) + if match is not None: + #print(' ', 'reference', match.group(1)) + issues.add(match.group(1)) + + # Convert to list of number + return sorted([int(i) for i in issues]) + + +if __name__ == '__main__': + print('Attempting to label github issues from their corresponding PRs') + + issues = find_issues_from_changelog() + print('Issues found in CHANGELOG: ', issues) + + github_token = os.environ.get("ARROW_GITHUB_API_TOKEN") + + print('logging into GITHUB...') + github = Github(github_token) + + print('getting github repo...') + arrow_repo = github.get_repo('apache/arrow-rs') + + for issue in issues: + relabel_issue(arrow_repo, issue) diff --git a/integration-testing/Cargo.toml b/integration-testing/Cargo.toml index b9f6cf81855e..e45b812dd6a4 100644 --- a/integration-testing/Cargo.toml +++ b/integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests" -version = "22.0.0" +version = "23.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index a2d11eb5862b..9b95868f3fc1 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "22.0.0" +version = "23.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -41,7 +41,7 @@ zstd = { version = "0.11.1", optional = true, default-features = false } chrono = { version = "0.4", default-features = false, features = ["alloc"] } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } -arrow = { path = "../arrow", version = "22.0.0", optional = true, default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "23.0.0", optional = true, default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false, features = ["std"], optional = true } clap = { version = "3", default-features = false, features = ["std", "derive", "env"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } @@ -61,7 +61,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { 
version = "1.23", default-features = false } zstd = { version = "0.11", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "22.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "23.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } [package.metadata.docs.rs] all-features = true diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index e32ee1ace5b8..54aa6d52f1ec 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "22.0.0" +version = "23.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", default-features = false } -parquet = { path = "../parquet", version = "22.0.0" } +parquet = { path = "../parquet", version = "23.0.0" } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index d3d7f56ebf67..4aae73dfc2ef 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "22.0.0" -parquet_derive = "22.0.0" +parquet = "23.0.0" +parquet_derive = "23.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index 4b814c4c088d..dd8486da2ca2 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "22.0.0" +version = "23.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "22.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "22.0.0", default-features = false } +parquet = { path = "../parquet", version = "23.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "23.0.0", default-features = false } chrono = { version="0.4.19", default-features = false, features = [ "clock" ] } From 5a55406cf24171600a143a83a95046c7513fd92c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 16 Sep 2022 12:40:37 -0400 Subject: [PATCH 0058/1411] update new `arrow-buffer` crate to 23.0.0 (#2748) * update new `arrow-buffer` crate to 23.0.0 * Update dependency --- arrow-buffer/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index 87019111efcc..c1bcd9f63068 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "22.0.0" +version = "23.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index f1918fccd1f9..7391ffcf827a 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -44,7 +44,7 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", 
default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { path = "../arrow-buffer", version = "22.0.0" } +arrow-buffer = { path = "../arrow-buffer", version = "23.0.0" } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } From ca00b671500b693f8c5e07ac4ea600269adfa2b6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 16 Sep 2022 19:42:20 -0400 Subject: [PATCH 0059/1411] Fix `verify_release_candidate.sh` for new arrow subcrates (#2752) --- dev/release/verify-release-candidate.sh | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index cf8050c1c9f2..98c582c2e178 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -116,21 +116,16 @@ test_source_distribution() { export ARROW_TEST_DATA=$PWD/arrow-testing-data/data export PARQUET_TEST_DATA=$PWD/parquet-testing-data/data - # use local modules because we don't publish modules to crates.io yet - sed \ - -i.bak \ - -E \ - -e 's/^arrow = "([^"]*)"/arrow = { version = "\1", path = "..\/arrow" }/g' \ - -e 's/^parquet = "([^"]*)"/parquet = { version = "\1", path = "..\/parquet" }/g' \ - */Cargo.toml - (cd arrow && cargo build && cargo test) (cd arrow-flight && cargo build && cargo test) (cd parquet && cargo build && cargo test) (cd parquet_derive && cargo build && cargo test) - # verify that the crates can be published to crates.io - pushd arrow + # verify that the leaf crates can be published to crates.io + # we can't verify crates that depend on others + # (because the others haven't yet been published to crates.io) + + pushd arrow-buffer cargo publish --dry-run popd From 46fcb0c93c7b6e2067ff6a5b5bc0b0108ca3c2ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 17 Sep 2022 05:58:09 +0200 Subject: [PATCH 0060/1411] Speed up checked kernels for non-null data (~1.4-5x faster) (#2749) * Speed up checked kernels * Fast path for non-null * Move some code --- arrow/src/compute/kernels/arity.rs | 31 ++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index 5060234c71b0..216e3bfcac30 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -106,15 +106,26 @@ where let len = array.len(); let null_count = array.null_count(); - let mut buffer = BufferBuilder::::new(len); - buffer.append_n_zeroed(array.len()); - let slice = buffer.as_slice_mut(); + if null_count == 0 { + let values = array.values().iter().map(|v| op(*v)); + // JUSTIFICATION + // Benefit + // ~60% speedup + // Soundness + // `values` is an iterator with a known size because arrays are sized. + let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; + return Ok(unsafe { build_primitive_array(len, buffer, 0, None) }); + } let null_buffer = array .data_ref() .null_buffer() .map(|b| b.bit_slice(array.offset(), array.len())); + let mut buffer = BufferBuilder::::new(len); + buffer.append_n_zeroed(array.len()); + let slice = buffer.as_slice_mut(); + try_for_each_valid_idx(array.len(), 0, null_count, null_buffer.as_deref(), |idx| { unsafe { *slice.get_unchecked_mut(idx) = op(array.value_unchecked(idx))? 
}; Ok::<_, ArrowError>(()) @@ -284,9 +295,21 @@ where if a.is_empty() { return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); } - let len = a.len(); + + if a.null_count() == 0 && b.null_count() == 0 { + let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r)); + let buffer = unsafe { Buffer::try_from_trusted_len_iter(values) }?; + // JUSTIFICATION + // Benefit + // ~75% speedup + // Soundness + // `values` is an iterator with a known size from a PrimitiveArray + return Ok(unsafe { build_primitive_array(len, buffer, 0, None) }); + } + let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); + let null_count = null_buffer .as_ref() .map(|x| len - x.count_set_bits()) From 5e83ef9cc7e426171f4cb9451fa004c55c7c95be Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 17 Sep 2022 01:04:02 -0700 Subject: [PATCH 0061/1411] Add value type check in try_unary_dict (#2755) --- arrow/src/compute/kernels/arity.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index 216e3bfcac30..12cf9721f976 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -156,6 +156,13 @@ where T: ArrowPrimitiveType, F: Fn(T::Native) -> Result, { + if array.value_type() != T::DATA_TYPE { + return Err(ArrowError::CastError(format!( + "Cannot perform the unary operation on dictionary array of value type {}", + array.value_type() + ))); + } + let dict_values = array.values().as_any().downcast_ref().unwrap(); let values = try_unary::(dict_values, op)?.into_data(); let data = array.data().clone().into_builder().child_data(vec![values]); From 3bf6eb98ceb3962e1d9419da6dc93e609f7893e6 Mon Sep 17 00:00:00 2001 From: aksharau Date: Mon, 19 Sep 2022 11:18:23 +0530 Subject: [PATCH 0062/1411] Fix: Issue 2721 : binary function should not panic but return error when array lengths are unequal (#2750) --- arrow/src/compute/kernels/arithmetic.rs | 14 +++------- arrow/src/compute/kernels/arity.rs | 36 +++++++++++++++---------- arrow/src/compute/kernels/bitwise.rs | 9 ++----- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index 7b91a261c7e1..b1a62ccfd6af 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -69,13 +69,7 @@ where RT: ArrowNumericType, F: Fn(LT::Native, RT::Native) -> LT::Native, { - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform math operation on arrays of different length".to_string(), - )); - } - - Ok(binary(left, right, op)) + binary(left, right, op) } /// Helper function for operations where a valid `0` on the right array should @@ -1128,13 +1122,13 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp + Zero + One, { - Ok(binary_opt(left, right, |a, b| { + binary_opt(left, right, |a, b| { if b.is_zero() { None } else { Some(a.div_wrapping(b)) } - })) + }) } /// Perform `left / right` operation on two arrays. 
If either left or right value is null @@ -1670,7 +1664,7 @@ mod tests { let b = Int32Array::from(vec![6, 7, 8]); let e = add(&a, &b).expect_err("should have failed due to different lengths"); assert_eq!( - "ComputeError(\"Cannot perform math operation on arrays of different length\")", + "ComputeError(\"Cannot perform binary operation on arrays of different length\")", format!("{:?}", e) ); } diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index 12cf9721f976..2347502f96e7 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -235,25 +235,29 @@ where /// especially when the operation can be vectorised, however, requires `op` to be infallible /// for all possible values of its inputs /// -/// # Panic +/// # Error /// -/// Panics if the arrays have different lengths +/// This function gives error if the arrays have different lengths pub fn binary( a: &PrimitiveArray, b: &PrimitiveArray, op: F, -) -> PrimitiveArray +) -> Result> where A: ArrowPrimitiveType, B: ArrowPrimitiveType, O: ArrowPrimitiveType, F: Fn(A::Native, B::Native) -> O::Native, { - assert_eq!(a.len(), b.len()); + if a.len() != b.len() { + return Err(ArrowError::ComputeError( + "Cannot perform binary operation on arrays of different length".to_string(), + )); + } let len = a.len(); if a.is_empty() { - return PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)); + return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); } let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); @@ -270,7 +274,7 @@ where // `values` is an iterator with a known size from a PrimitiveArray let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - unsafe { build_primitive_array(len, buffer, null_count, null_buffer) } + Ok(unsafe { build_primitive_array(len, buffer, null_count, null_buffer) }) } /// Applies the provided fallible binary operation across `a` and `b`, returning any error, @@ -344,32 +348,36 @@ where /// /// The function is only evaluated for non-null indices /// -/// # Panic +/// # Error /// -/// Panics if the arrays have different lengths +/// This function gives error if the arrays have different lengths pub(crate) fn binary_opt( a: &PrimitiveArray, b: &PrimitiveArray, op: F, -) -> PrimitiveArray +) -> Result> where A: ArrowPrimitiveType, B: ArrowPrimitiveType, O: ArrowPrimitiveType, F: Fn(A::Native, B::Native) -> Option, { - assert_eq!(a.len(), b.len()); + if a.len() != b.len() { + return Err(ArrowError::ComputeError( + "Cannot perform binary operation on arrays of different length".to_string(), + )); + } if a.is_empty() { - return PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)); + return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); } if a.null_count() == 0 && b.null_count() == 0 { - a.values() + Ok(a.values() .iter() .zip(b.values().iter()) .map(|(a, b)| op(*a, *b)) - .collect() + .collect()) } else { let iter_a = ArrayIter::new(a); let iter_b = ArrayIter::new(b); @@ -386,7 +394,7 @@ where } }); - values.collect() + Ok(values.collect()) } } diff --git a/arrow/src/compute/kernels/bitwise.rs b/arrow/src/compute/kernels/bitwise.rs index 2f3c9e490f4c..0b877b326482 100644 --- a/arrow/src/compute/kernels/bitwise.rs +++ b/arrow/src/compute/kernels/bitwise.rs @@ -18,7 +18,7 @@ use crate::array::PrimitiveArray; use crate::compute::{binary, unary}; use crate::datatypes::ArrowNumericType; -use crate::error::{ArrowError, Result}; +use crate::error::Result; use std::ops::{BitAnd, BitOr, BitXor, Not}; 
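
The arity and bitwise hunks in this commit funnel the length check into `binary` itself, so mismatched inputs now surface as `ArrowError::ComputeError` instead of an `assert_eq!` panic inside the kernel. A minimal, illustrative sketch of the caller-visible behaviour, assuming only the public `add` kernel from `arrow::compute::kernels::arithmetic` (this mirrors the updated test above, it is not part of the patch itself):

```rust
use arrow::array::Int32Array;
use arrow::compute::kernels::arithmetic::add;

fn main() {
    let a = Int32Array::from(vec![5, 6, 7]);
    let b = Int32Array::from(vec![6, 7]); // one element shorter

    // With this change the length mismatch is reported as
    // Err(ArrowError::ComputeError(..)) from `binary` rather than a panic.
    let err = add(&a, &b).unwrap_err();
    assert!(err.to_string().contains("arrays of different length"));
}
```
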
// The helper function for bitwise operation with two array @@ -31,12 +31,7 @@ where T: ArrowNumericType, F: Fn(T::Native, T::Native) -> T::Native, { - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform bitwise operation on arrays of different length".to_string(), - )); - } - Ok(binary(left, right, op)) + binary(left, right, op) } /// Perform `left & right` operation on two arrays. If either left or right value is null From 9599178c953a7980ec1841d06e2232a671b5cbb3 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 20 Sep 2022 03:30:37 -0700 Subject: [PATCH 0063/1411] Add overflow-checking variants of arithmetic dyn kernels (#2740) * Init * More * More * Add tests * Fix clippy * Remove macro * Update doc * Fix clippy * Remove length check * Tweak try_binary to coordinate latest optimization * Fix clippy * Use for loop * Split non-null variant into never inline function * Add value type check * Multiply by get_byte_width of output type. --- arrow/src/compute/kernels/arithmetic.rs | 470 +++++++++++++++++++++--- arrow/src/compute/kernels/arity.rs | 82 +++-- 2 files changed, 466 insertions(+), 86 deletions(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index b1a62ccfd6af..aa6c8cd66941 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -68,10 +68,30 @@ where LT: ArrowNumericType, RT: ArrowNumericType, F: Fn(LT::Native, RT::Native) -> LT::Native, + LT::Native: ArrowNativeTypeOp, + RT::Native: ArrowNativeTypeOp, { binary(left, right, op) } +/// This is similar to `math_op` as it performs given operation between two input primitive arrays. +/// But the given operation can return `Err` if overflow is detected. For the case, this function +/// returns an `Err`. +fn math_checked_op( + left: &PrimitiveArray, + right: &PrimitiveArray, + op: F, +) -> Result> +where + LT: ArrowNumericType, + RT: ArrowNumericType, + F: Fn(LT::Native, RT::Native) -> Result, + LT::Native: ArrowNativeTypeOp, + RT::Native: ArrowNativeTypeOp, +{ + try_binary(left, right, op) +} + /// Helper function for operations where a valid `0` on the right array should /// result in an [ArrowError::DivideByZero], namely the division and modulo operations /// @@ -516,57 +536,64 @@ macro_rules! typed_dict_math_op { }}; } -/// Helper function to perform math lambda function on values from two dictionary arrays, this -/// version does not attempt to use SIMD explicitly (though the compiler may auto vectorize) -macro_rules! math_dict_op { - ($left: expr, $right:expr, $op:expr, $value_ty:ty) => {{ - if $left.len() != $right.len() { - return Err(ArrowError::ComputeError(format!( - "Cannot perform operation on arrays of different length ({}, {})", - $left.len(), - $right.len() - ))); - } +/// Perform given operation on two `DictionaryArray`s. 
+/// Returns an error if the two arrays have different value type +fn math_op_dict( + left: &DictionaryArray, + right: &DictionaryArray, + op: F, +) -> Result> +where + K: ArrowNumericType, + T: ArrowNumericType, + F: Fn(T::Native, T::Native) -> T::Native, + T::Native: ArrowNativeTypeOp, +{ + if left.len() != right.len() { + return Err(ArrowError::ComputeError(format!( + "Cannot perform operation on arrays of different length ({}, {})", + left.len(), + right.len() + ))); + } - // Safety justification: Since the inputs are valid Arrow arrays, all values are - // valid indexes into the dictionary (which is verified during construction) - - let left_iter = unsafe { - $left - .values() - .as_any() - .downcast_ref::<$value_ty>() - .unwrap() - .take_iter_unchecked($left.keys_iter()) - }; - - let right_iter = unsafe { - $right - .values() - .as_any() - .downcast_ref::<$value_ty>() - .unwrap() - .take_iter_unchecked($right.keys_iter()) - }; - - let result = left_iter - .zip(right_iter) - .map(|(left_value, right_value)| { - if let (Some(left), Some(right)) = (left_value, right_value) { - Some($op(left, right)) - } else { - None - } - }) - .collect(); + // Safety justification: Since the inputs are valid Arrow arrays, all values are + // valid indexes into the dictionary (which is verified during construction) - Ok(result) - }}; + let left_iter = unsafe { + left.values() + .as_any() + .downcast_ref::>() + .unwrap() + .take_iter_unchecked(left.keys_iter()) + }; + + let right_iter = unsafe { + right + .values() + .as_any() + .downcast_ref::>() + .unwrap() + .take_iter_unchecked(right.keys_iter()) + }; + + let result = left_iter + .zip(right_iter) + .map(|(left_value, right_value)| { + if let (Some(left), Some(right)) = (left_value, right_value) { + Some(op(left, right)) + } else { + None + } + }) + .collect(); + + Ok(result) } /// Perform given operation on two `DictionaryArray`s. /// Returns an error if the two arrays have different value type -fn math_op_dict( +fn math_checked_op_dict( left: &DictionaryArray, right: &DictionaryArray, op: F, @@ -574,9 +601,21 @@ fn math_op_dict( where K: ArrowNumericType, T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> T::Native, + F: Fn(T::Native, T::Native) -> Result, + T::Native: ArrowNativeTypeOp, { - math_dict_op!(left, right, op, PrimitiveArray) + // left and right's value types are supposed to be same as guaranteed by the caller macro now. + if left.value_type() != T::DATA_TYPE { + return Err(ArrowError::NotYetImplemented(format!( + "Cannot perform provided operation on dictionary array of value type {}", + left.value_type() + ))); + } + + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + try_binary(left, right, op) } /// Helper function for operations where a valid `0` on the right array should @@ -672,10 +711,13 @@ where /// Perform `left + right` operation on two arrays. If either left or right value is null /// then the result is also null. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `add_dyn_checked` instead. 
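
The doc comment above distinguishes the wrapping and checked flavours of the dyn kernels; a small sketch of the difference, assuming the `add_dyn`/`add_dyn_checked` functions declared in this hunk and plain `Int32Array` inputs (the same pattern the new tests in this patch use):

```rust
use arrow::array::Int32Array;
use arrow::compute::kernels::arithmetic::{add_dyn, add_dyn_checked};

fn main() {
    let a = Int32Array::from(vec![i32::MAX]);
    let b = Int32Array::from(vec![1]);

    // The wrapping variant silently wraps around on overflow ...
    let wrapped = add_dyn(&a, &b).unwrap();
    println!("wrapped: {:?}", wrapped); // the result wraps to i32::MIN

    // ... while the checked variant reports the overflow as an error.
    assert!(add_dyn_checked(&a, &b).is_err());
}
```
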
pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { - typed_dict_math_op!(left, right, |a, b| a + b, math_op_dict) + typed_dict_math_op!(left, right, |a, b| a.add_wrapping(b), math_op_dict) } DataType::Date32 => { let l = as_primitive_array::(left); @@ -728,7 +770,84 @@ pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result { _ => { downcast_primitive_array!( (left, right) => { - math_op(left, right, |a, b| a + b).map(|a| Arc::new(a) as ArrayRef) + math_op(left, right, |a, b| a.add_wrapping(b)).map(|a| Arc::new(a) as ArrayRef) + } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) + } + } +} + +/// Perform `left + right` operation on two arrays. If either left or right value is null +/// then the result is also null. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `add_dyn` instead. +pub fn add_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result { + match left.data_type() { + DataType::Dictionary(_, _) => { + typed_dict_math_op!( + left, + right, + |a, b| a.add_checked(b), + math_checked_op_dict + ) + } + DataType::Date32 => { + let l = as_primitive_array::(left); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = as_primitive_array::(right); + let res = math_op(l, r, Date32Type::add_year_months)?; + Ok(Arc::new(res)) + } + DataType::Interval(IntervalUnit::DayTime) => { + let r = as_primitive_array::(right); + let res = math_op(l, r, Date32Type::add_day_time)?; + Ok(Arc::new(res)) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = as_primitive_array::(right); + let res = math_op(l, r, Date32Type::add_month_day_nano)?; + Ok(Arc::new(res)) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } + DataType::Date64 => { + let l = as_primitive_array::(left); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = as_primitive_array::(right); + let res = math_op(l, r, Date64Type::add_year_months)?; + Ok(Arc::new(res)) + } + DataType::Interval(IntervalUnit::DayTime) => { + let r = as_primitive_array::(right); + let res = math_op(l, r, Date64Type::add_day_time)?; + Ok(Arc::new(res)) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = as_primitive_array::(right); + let res = math_op(l, r, Date64Type::add_month_day_nano)?; + Ok(Arc::new(res)) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } + _ => { + downcast_primitive_array!( + (left, right) => { + math_checked_op(left, right, |a, b| a.add_checked(b)).map(|a| Arc::new(a) as ArrayRef) } _ => Err(ArrowError::CastError(format!( "Unsupported data type {}, {}", @@ -839,15 +958,47 @@ where /// Perform `left - right` operation on two arrays. If either left or right value is null /// then the result is also null. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `subtract_dyn_checked` instead. 
pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { - typed_dict_math_op!(left, right, |a, b| a - b, math_op_dict) + typed_dict_math_op!(left, right, |a, b| a.sub_wrapping(b), math_op_dict) } _ => { downcast_primitive_array!( (left, right) => { - math_op(left, right, |a, b| a - b).map(|a| Arc::new(a) as ArrayRef) + math_op(left, right, |a, b| a.sub_wrapping(b)).map(|a| Arc::new(a) as ArrayRef) + } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) + } + } +} + +/// Perform `left - right` operation on two arrays. If either left or right value is null +/// then the result is also null. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `subtract_dyn` instead. +pub fn subtract_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result { + match left.data_type() { + DataType::Dictionary(_, _) => { + typed_dict_math_op!( + left, + right, + |a, b| a.sub_checked(b), + math_checked_op_dict + ) + } + _ => { + downcast_primitive_array!( + (left, right) => { + math_checked_op(left, right, |a, b| a.sub_checked(b)).map(|a| Arc::new(a) as ArrayRef) } _ => Err(ArrowError::CastError(format!( "Unsupported data type {}, {}", @@ -977,15 +1128,47 @@ where /// Perform `left * right` operation on two arrays. If either left or right value is null /// then the result is also null. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `multiply_dyn_checked` instead. pub fn multiply_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { - typed_dict_math_op!(left, right, |a, b| a * b, math_op_dict) + typed_dict_math_op!(left, right, |a, b| a.mul_wrapping(b), math_op_dict) } _ => { downcast_primitive_array!( (left, right) => { - math_op(left, right, |a, b| a * b).map(|a| Arc::new(a) as ArrayRef) + math_op(left, right, |a, b| a.mul_wrapping(b)).map(|a| Arc::new(a) as ArrayRef) + } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) + } + } +} + +/// Perform `left * right` operation on two arrays. If either left or right value is null +/// then the result is also null. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `multiply_dyn` instead. +pub fn multiply_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result { + match left.data_type() { + DataType::Dictionary(_, _) => { + typed_dict_math_op!( + left, + right, + |a, b| a.mul_checked(b), + math_checked_op_dict + ) + } + _ => { + downcast_primitive_array!( + (left, right) => { + math_checked_op(left, right, |a, b| a.mul_checked(b)).map(|a| Arc::new(a) as ArrayRef) } _ => Err(ArrowError::CastError(format!( "Unsupported data type {}, {}", @@ -1134,7 +1317,52 @@ where /// Perform `left / right` operation on two arrays. If either left or right value is null /// then the result is also null. If any right hand value is zero then the result of this /// operation will be `Err(ArrowError::DivideByZero)`. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `divide_dyn_checked` instead. 
pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result { + match left.data_type() { + DataType::Dictionary(_, _) => { + typed_dict_math_op!( + left, + right, + |a, b| { + if b.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(a.div_wrapping(b)) + } + }, + math_divide_checked_op_dict + ) + } + _ => { + downcast_primitive_array!( + (left, right) => { + math_checked_divide_op(left, right, |a, b| { + if b.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(a.div_wrapping(b)) + } + }).map(|a| Arc::new(a) as ArrayRef) + } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) + } + } +} + +/// Perform `left / right` operation on two arrays. If either left or right value is null +/// then the result is also null. If any right hand value is zero then the result of this +/// operation will be `Err(ArrowError::DivideByZero)`. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `divide_dyn` instead. +pub fn divide_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { typed_dict_math_op!( @@ -2357,4 +2585,140 @@ mod tests { let expected = Int32Array::from(vec![None]); assert_eq!(expected, overflow.unwrap()); } + + #[test] + fn test_primitive_add_dyn_wrapping_overflow() { + let a = Int32Array::from(vec![i32::MAX, i32::MIN]); + let b = Int32Array::from(vec![1, 1]); + + let wrapped = add_dyn(&a, &b).unwrap(); + let expected = + Arc::new(Int32Array::from(vec![-2147483648, -2147483647])) as ArrayRef; + assert_eq!(&expected, &wrapped); + + let overflow = add_dyn_checked(&a, &b); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_dictionary_add_dyn_wrapping_overflow() { + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(2, 2); + builder.append(i32::MAX).unwrap(); + builder.append(i32::MIN).unwrap(); + let a = builder.finish(); + + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(2, 2); + builder.append(1).unwrap(); + builder.append(1).unwrap(); + let b = builder.finish(); + + let wrapped = add_dyn(&a, &b).unwrap(); + let expected = + Arc::new(Int32Array::from(vec![-2147483648, -2147483647])) as ArrayRef; + assert_eq!(&expected, &wrapped); + + let overflow = add_dyn_checked(&a, &b); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_primitive_subtract_dyn_wrapping_overflow() { + let a = Int32Array::from(vec![-2]); + let b = Int32Array::from(vec![i32::MAX]); + + let wrapped = subtract_dyn(&a, &b).unwrap(); + let expected = Arc::new(Int32Array::from(vec![i32::MAX])) as ArrayRef; + assert_eq!(&expected, &wrapped); + + let overflow = subtract_dyn_checked(&a, &b); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_dictionary_subtract_dyn_wrapping_overflow() { + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(1, 1); + builder.append(-2).unwrap(); + let a = builder.finish(); + + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(1, 1); + builder.append(i32::MAX).unwrap(); + let b = builder.finish(); + + let wrapped = subtract_dyn(&a, &b).unwrap(); + let expected = Arc::new(Int32Array::from(vec![i32::MAX])) as ArrayRef; + assert_eq!(&expected, &wrapped); + + let overflow = subtract_dyn_checked(&a, &b); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_primitive_mul_dyn_wrapping_overflow() { + let a = Int32Array::from(vec![10]); + 
let b = Int32Array::from(vec![i32::MAX]); + + let wrapped = multiply_dyn(&a, &b).unwrap(); + let expected = Arc::new(Int32Array::from(vec![-10])) as ArrayRef; + assert_eq!(&expected, &wrapped); + + let overflow = multiply_dyn_checked(&a, &b); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_dictionary_mul_dyn_wrapping_overflow() { + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(1, 1); + builder.append(10).unwrap(); + let a = builder.finish(); + + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(1, 1); + builder.append(i32::MAX).unwrap(); + let b = builder.finish(); + + let wrapped = multiply_dyn(&a, &b).unwrap(); + let expected = Arc::new(Int32Array::from(vec![-10])) as ArrayRef; + assert_eq!(&expected, &wrapped); + + let overflow = multiply_dyn_checked(&a, &b); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_primitive_div_dyn_wrapping_overflow() { + let a = Int32Array::from(vec![i32::MIN]); + let b = Int32Array::from(vec![-1]); + + let wrapped = divide_dyn(&a, &b).unwrap(); + let expected = Arc::new(Int32Array::from(vec![-2147483648])) as ArrayRef; + assert_eq!(&expected, &wrapped); + + let overflow = divide_dyn_checked(&a, &b); + overflow.expect_err("overflow should be detected"); + } + + #[test] + fn test_dictionary_div_dyn_wrapping_overflow() { + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(1, 1); + builder.append(i32::MIN).unwrap(); + let a = builder.finish(); + + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(1, 1); + builder.append(-1).unwrap(); + let b = builder.finish(); + + let wrapped = divide_dyn(&a, &b).unwrap(); + let expected = Arc::new(Int32Array::from(vec![-2147483648])) as ArrayRef; + assert_eq!(&expected, &wrapped); + + let overflow = divide_dyn_checked(&a, &b); + overflow.expect_err("overflow should be detected"); + } } diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index 2347502f96e7..bf10289683f1 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -18,7 +18,8 @@ //! Defines kernels suitable to perform operations to primitive arrays. 
use crate::array::{ - Array, ArrayData, ArrayIter, ArrayRef, BufferBuilder, DictionaryArray, PrimitiveArray, + Array, ArrayAccessor, ArrayData, ArrayIter, ArrayRef, BufferBuilder, DictionaryArray, + PrimitiveArray, }; use crate::buffer::Buffer; use crate::compute::util::combine_option_bitmap; @@ -26,6 +27,7 @@ use crate::datatypes::{ArrowNumericType, ArrowPrimitiveType}; use crate::downcast_dictionary_array; use crate::error::{ArrowError, Result}; use crate::util::bit_iterator::try_for_each_valid_idx; +use arrow_buffer::MutableBuffer; use std::sync::Arc; #[inline] @@ -287,16 +289,14 @@ where /// /// Return an error if the arrays have different lengths or /// the operation is under erroneous -pub fn try_binary( - a: &PrimitiveArray, - b: &PrimitiveArray, +pub fn try_binary( + a: A, + b: B, op: F, ) -> Result> where - A: ArrowPrimitiveType, - B: ArrowPrimitiveType, O: ArrowPrimitiveType, - F: Fn(A::Native, B::Native) -> Result, + F: Fn(A::Item, B::Item) -> Result, { if a.len() != b.len() { return Err(ArrowError::ComputeError( @@ -309,36 +309,52 @@ where let len = a.len(); if a.null_count() == 0 && b.null_count() == 0 { - let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r)); - let buffer = unsafe { Buffer::try_from_trusted_len_iter(values) }?; - // JUSTIFICATION - // Benefit - // ~75% speedup - // Soundness - // `values` is an iterator with a known size from a PrimitiveArray - return Ok(unsafe { build_primitive_array(len, buffer, 0, None) }); + try_binary_no_nulls(len, a, b, op) + } else { + let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); + + let null_count = null_buffer + .as_ref() + .map(|x| len - x.count_set_bits()) + .unwrap_or_default(); + + let mut buffer = BufferBuilder::::new(len); + buffer.append_n_zeroed(len); + let slice = buffer.as_slice_mut(); + + try_for_each_valid_idx(len, 0, null_count, null_buffer.as_deref(), |idx| { + unsafe { + *slice.get_unchecked_mut(idx) = + op(a.value_unchecked(idx), b.value_unchecked(idx))? + }; + Ok::<_, ArrowError>(()) + })?; + + Ok(unsafe { + build_primitive_array(len, buffer.finish(), null_count, null_buffer) + }) } +} - let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); - - let null_count = null_buffer - .as_ref() - .map(|x| len - x.count_set_bits()) - .unwrap_or_default(); - - let mut buffer = BufferBuilder::::new(len); - buffer.append_n_zeroed(len); - let slice = buffer.as_slice_mut(); - - try_for_each_valid_idx(len, 0, null_count, null_buffer.as_deref(), |idx| { +/// This intentional inline(never) attribute helps LLVM optimize the loop. +#[inline(never)] +fn try_binary_no_nulls( + len: usize, + a: A, + b: B, + op: F, +) -> Result> +where + O: ArrowPrimitiveType, + F: Fn(A::Item, B::Item) -> Result, +{ + let mut buffer = MutableBuffer::new(len * O::get_byte_width()); + for idx in 0..len { unsafe { - *slice.get_unchecked_mut(idx) = - op(a.value_unchecked(idx), b.value_unchecked(idx))? 
+ buffer.push_unchecked(op(a.value_unchecked(idx), b.value_unchecked(idx))?); }; - Ok::<_, ArrowError>(()) - })?; - - Ok(unsafe { build_primitive_array(len, buffer.finish(), null_count, null_buffer) }) + } + Ok(unsafe { build_primitive_array(len, buffer.into(), 0, None) }) } /// Applies the provided binary operation across `a` and `b`, collecting the optional results From 5b601b3065d1c239feef6badf3ff68b6d72916a3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Sep 2022 06:30:55 -0400 Subject: [PATCH 0064/1411] MINOR: tweak arrow release instructions (#2758) --- dev/release/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dev/release/README.md b/dev/release/README.md index 48748eccbe85..d418a09d070f 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -74,9 +74,12 @@ git checkout -b make-release # manully edit ./dev/release/update_change_log.sh to reflect the release version # create the changelog CHANGELOG_GITHUB_TOKEN= ./dev/release/update_change_log.sh +# run automated script to copy labels to issues based on referenced PRs +python dev/release/label_issues.py # review change log / edit issues and labels if needed, rerun git commit -a -m 'Create changelog' + # update versions sed -i '' -e 's/14.0.0/23.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' @@ -228,6 +231,7 @@ following commands Rust Arrow Crates: ```shell +(cd arrow-buffer && cargo publish) (cd arrow && cargo publish) (cd arrow-flight && cargo publish) (cd parquet && cargo publish) From 74f639ca8661c868a1aaa2aa6fe23e01f46f97d8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Sep 2022 22:30:27 +0100 Subject: [PATCH 0065/1411] Add dyn_arith_dict feature flag (#2760) * Add dyn_arith_dict feature flag * Document feature flag --- .github/workflows/arrow.yml | 6 +-- arrow/Cargo.toml | 3 ++ arrow/README.md | 1 + arrow/src/compute/kernels/arithmetic.rs | 51 ++++++++++++++++++++----- 4 files changed, 48 insertions(+), 13 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index d81a551a3b49..cdd87ca1639e 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -51,9 +51,9 @@ jobs: - name: Test run: | cargo test -p arrow - - name: Test --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict + - name: Test --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,dyn_arith_dict run: | - cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict + cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,dyn_arith_dict - name: Run examples run: | # Test arrow examples @@ -177,4 +177,4 @@ jobs: rustup component add clippy - name: Run clippy run: | - cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict --all-targets -- -D warnings + cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict --all-targets -- -D warnings diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 7391ffcf827a..f8dbf1481b58 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -95,6 +95,9 @@ ffi = [] # Enable dyn-comparison of dictionary arrays with other arrays # Note: this does not impact comparison against scalars dyn_cmp_dict = [] +# Enable dyn-arithmetic kernels for dictionary arrays +# Note: this does not impact arithmetic with scalars +dyn_arith_dict = [] 
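
With the feature line above in place, a downstream crate opts in by adding `features = ["dyn_arith_dict"]` to its `arrow` dependency; without the feature the dictionary branch of the dyn kernels returns a `CastError`. A rough sketch of what enabling it unlocks, using the same builder pattern as the tests in this patch (illustrative only, assuming a build with `dyn_arith_dict` enabled):

```rust
use arrow::array::PrimitiveDictionaryBuilder;
use arrow::compute::kernels::arithmetic::add_dyn;
use arrow::datatypes::{Int32Type, Int8Type};

fn main() -> arrow::error::Result<()> {
    let mut a = PrimitiveDictionaryBuilder::<Int8Type, Int32Type>::new();
    a.append(5)?;
    a.append(7)?;
    let a = a.finish();

    let mut b = PrimitiveDictionaryBuilder::<Int8Type, Int32Type>::new();
    b.append(6)?;
    b.append(8)?;
    let b = b.finish();

    // Requires the crate to be built with the `dyn_arith_dict` feature;
    // otherwise this dictionary/dictionary case is rejected with a CastError.
    let sum = add_dyn(&a, &b)?;
    println!("{:?}", sum);
    Ok(())
}
```
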
[dev-dependencies] rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow/README.md b/arrow/README.md index a1c0e6279a50..e168d4a09eee 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -54,6 +54,7 @@ The `arrow` crate provides the following features which may be enabled in your ` - `ffi` - bindings for the Arrow C [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) - `pyarrow` - bindings for pyo3 to call arrow-rs from python - `dyn_cmp_dict` - enables comparison of dictionary arrays within dyn comparison kernels +- `dyn_arith_dict` - enables arithmetic on dictionary arrays within dyn arithmetic kernels ## Arrow Feature Status diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index aa6c8cd66941..b44cb8b947e2 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -27,11 +27,9 @@ use std::ops::{Div, Neg, Rem}; use num::{One, Zero}; use crate::array::*; -use crate::buffer::Buffer; #[cfg(feature = "simd")] use crate::buffer::MutableBuffer; use crate::compute::kernels::arity::unary; -use crate::compute::util::combine_option_bitmap; use crate::compute::{ binary, binary_opt, try_binary, try_unary, try_unary_dyn, unary_dyn, }; @@ -39,6 +37,7 @@ use crate::datatypes::{ native_op::ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, }; +#[cfg(feature = "dyn_arith_dict")] use crate::datatypes::{ Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type, @@ -122,12 +121,13 @@ where /// This function errors if: /// * the arrays have different lengths /// * there is an element where both left and right values are valid and the right value is `0` +#[cfg(feature = "dyn_arith_dict")] fn math_checked_divide_op_on_iters( left: impl Iterator>, right: impl Iterator>, op: F, len: usize, - null_bit_buffer: Option, + null_bit_buffer: Option, ) -> Result> where T: ArrowNumericType, @@ -143,7 +143,7 @@ where } }); // Safety: Iterator comes from a PrimitiveArray which reports its size correctly - unsafe { Buffer::try_from_trusted_len_iter(values) } + unsafe { crate::buffer::Buffer::try_from_trusted_len_iter(values) } } else { // no value is null let values = left @@ -151,7 +151,7 @@ where .zip(right.map(|r| r.unwrap())) .map(|(left, right)| op(left, right)); // Safety: Iterator comes from a PrimitiveArray which reports its size correctly - unsafe { Buffer::try_from_trusted_len_iter(values) } + unsafe { crate::buffer::Buffer::try_from_trusted_len_iter(values) } }?; let data = unsafe { @@ -316,8 +316,10 @@ where } // Create the combined `Bitmap` - let null_bit_buffer = - combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len())?; + let null_bit_buffer = crate::compute::util::combine_option_bitmap( + &[left.data_ref(), right.data_ref()], + left.len(), + )?; let lanes = T::lanes(); let buffer_size = left.len() * std::mem::size_of::(); @@ -425,6 +427,7 @@ where } /// Applies $OP to $LEFT and $RIGHT which are two dictionaries which have (the same) key type $KT +#[cfg(feature = "dyn_arith_dict")] macro_rules! typed_dict_op { ($LEFT: expr, $RIGHT: expr, $OP: expr, $KT: tt, $MATH_OP: ident) => {{ match ($LEFT.value_type(), $RIGHT.value_type()) { @@ -476,6 +479,7 @@ macro_rules! typed_dict_op { }}; } +#[cfg(feature = "dyn_arith_dict")] macro_rules! 
typed_dict_math_op { // Applies `LEFT OP RIGHT` when `LEFT` and `RIGHT` both are `DictionaryArray` ($LEFT: expr, $RIGHT: expr, $OP: expr, $MATH_OP: ident) => {{ @@ -536,8 +540,20 @@ macro_rules! typed_dict_math_op { }}; } +#[cfg(not(feature = "dyn_arith_dict"))] +macro_rules! typed_dict_math_op { + // Applies `LEFT OP RIGHT` when `LEFT` and `RIGHT` both are `DictionaryArray` + ($LEFT: expr, $RIGHT: expr, $OP: expr, $MATH_OP: ident) => {{ + Err(ArrowError::CastError(format!( + "Arithmetic on arrays of type {} with array of type {} requires \"dyn_arith_dict\" feature", + $LEFT.data_type(), $RIGHT.data_type() + ))) + }}; +} + /// Perform given operation on two `DictionaryArray`s. /// Returns an error if the two arrays have different value type +#[cfg(feature = "dyn_arith_dict")] fn math_op_dict( left: &DictionaryArray, right: &DictionaryArray, @@ -593,6 +609,7 @@ where /// Perform given operation on two `DictionaryArray`s. /// Returns an error if the two arrays have different value type +#[cfg(feature = "dyn_arith_dict")] fn math_checked_op_dict( left: &DictionaryArray, right: &DictionaryArray, @@ -626,6 +643,7 @@ where /// This function errors if: /// * the arrays have different lengths /// * there is an element where both left and right values are valid and the right value is `0` +#[cfg(feature = "dyn_arith_dict")] fn math_divide_checked_op_dict( left: &DictionaryArray, right: &DictionaryArray, @@ -645,8 +663,10 @@ where ))); } - let null_bit_buffer = - combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len())?; + let null_bit_buffer = crate::compute::util::combine_option_bitmap( + &[left.data_ref(), right.data_ref()], + left.len(), + )?; // Safety justification: Since the inputs are valid Arrow arrays, all values are // valid indexes into the dictionary (which is verified during construction) @@ -1484,7 +1504,7 @@ where mod tests { use super::*; use crate::array::Int32Array; - use crate::datatypes::Date64Type; + use crate::datatypes::{Date64Type, Int32Type, Int8Type}; use chrono::NaiveDate; #[test] @@ -1605,6 +1625,7 @@ mod tests { } #[test] + #[cfg(feature = "dyn_arith_dict")] fn test_primitive_array_add_dyn_dict() { let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(5).unwrap(); @@ -1683,6 +1704,7 @@ mod tests { } #[test] + #[cfg(feature = "dyn_arith_dict")] fn test_primitive_array_subtract_dyn_dict() { let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(15).unwrap(); @@ -1761,6 +1783,7 @@ mod tests { } #[test] + #[cfg(feature = "dyn_arith_dict")] fn test_primitive_array_multiply_dyn_dict() { let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(5).unwrap(); @@ -1801,6 +1824,7 @@ mod tests { } #[test] + #[cfg(feature = "dyn_arith_dict")] fn test_primitive_array_divide_dyn_dict() { let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(15).unwrap(); @@ -2322,6 +2346,7 @@ mod tests { #[test] #[should_panic(expected = "DivideByZero")] + #[cfg(feature = "dyn_arith_dict")] fn test_int_array_divide_dyn_by_zero_dict() { let mut builder = PrimitiveDictionaryBuilder::::with_capacity(1, 1); @@ -2338,7 +2363,9 @@ mod tests { #[test] #[should_panic(expected = "DivideByZero")] + #[cfg(feature = "dyn_arith_dict")] fn test_f32_dict_array_divide_dyn_by_zero() { + use crate::datatypes::Float32Type; let mut builder = PrimitiveDictionaryBuilder::::with_capacity(1, 1); builder.append(1.5).unwrap(); @@ -2601,6 +2628,7 @@ mod tests { } #[test] + #[cfg(feature = "dyn_arith_dict")] fn test_dictionary_add_dyn_wrapping_overflow() { 
let mut builder = PrimitiveDictionaryBuilder::::with_capacity(2, 2); @@ -2637,6 +2665,7 @@ mod tests { } #[test] + #[cfg(feature = "dyn_arith_dict")] fn test_dictionary_subtract_dyn_wrapping_overflow() { let mut builder = PrimitiveDictionaryBuilder::::with_capacity(1, 1); @@ -2670,6 +2699,7 @@ mod tests { } #[test] + #[cfg(feature = "dyn_arith_dict")] fn test_dictionary_mul_dyn_wrapping_overflow() { let mut builder = PrimitiveDictionaryBuilder::::with_capacity(1, 1); @@ -2703,6 +2733,7 @@ mod tests { } #[test] + #[cfg(feature = "dyn_arith_dict")] fn test_dictionary_div_dyn_wrapping_overflow() { let mut builder = PrimitiveDictionaryBuilder::::with_capacity(1, 1); From 48cc8beaf029dec15b72fd70579d188049c79cd1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 21 Sep 2022 13:04:52 +0100 Subject: [PATCH 0066/1411] Split out arrow-schema (#2594) (#2711) * Split out arrow-schema (#2594) * Flatten schema * Move decimal logic * Fix doc * Fix tests * Fix integration-test * Remove pyarrow orphan * PyArrow fixes * Move ArrowError to arrow-schema * Fix pyarrow * Fix test * Fix conflicts * Fix pyarrow * Tweak feature flags * Test juggling * Derive PyArrowConvert for Vec --- .github/workflows/arrow.yml | 2 + Cargo.toml | 1 + arrow-pyarrow-integration-testing/src/lib.rs | 45 +- arrow-schema/Cargo.toml | 47 ++ arrow-schema/src/datatype.rs | 492 +++++++++++ arrow-schema/src/error.rs | 103 +++ .../datatypes => arrow-schema/src}/field.rs | 14 +- arrow-schema/src/lib.rs | 27 + arrow-schema/src/schema.rs | 782 ++++++++++++++++++ arrow/Cargo.toml | 7 +- arrow/src/csv/mod.rs | 19 + arrow/src/csv/reader.rs | 7 +- arrow/src/csv/writer.rs | 8 +- .../src/datatypes/{datatype.rs => decimal.rs} | 382 +-------- arrow/src/datatypes/mod.rs | 557 +------------ arrow/src/datatypes/schema.rs | 386 --------- arrow/src/error.rs | 115 +-- arrow/src/json/mod.rs | 33 + arrow/src/json/writer.rs | 5 +- arrow/src/pyarrow.rs | 83 +- integration-testing/src/util/mod.rs | 3 +- 21 files changed, 1625 insertions(+), 1493 deletions(-) create mode 100644 arrow-schema/Cargo.toml create mode 100644 arrow-schema/src/datatype.rs create mode 100644 arrow-schema/src/error.rs rename {arrow/src/datatypes => arrow-schema/src}/field.rs (98%) create mode 100644 arrow-schema/src/lib.rs create mode 100644 arrow-schema/src/schema.rs rename arrow/src/datatypes/{datatype.rs => decimal.rs} (68%) delete mode 100644 arrow/src/datatypes/schema.rs diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index cdd87ca1639e..797f63b5ae1b 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -63,6 +63,8 @@ jobs: cargo run --example read_csv_infer_schema - name: Run non-archery based integration-tests run: cargo test -p arrow-integration-testing + - name: Test arrow-schema with all features + run: cargo test -p arrow-schema --all-features # test compilaton features linux-features: diff --git a/Cargo.toml b/Cargo.toml index d0233ccb376a..355c65a8b805 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ [workspace] members = [ "arrow", + "arrow-schema", "arrow-buffer", "arrow-flight", "parquet", diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index 086b21834657..f9e70eb8d77a 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -28,9 +28,13 @@ use arrow::compute::kernels; use arrow::datatypes::{DataType, Field, Schema}; use arrow::error::ArrowError; use 
arrow::ffi_stream::ArrowArrayStreamReader; -use arrow::pyarrow::PyArrowConvert; +use arrow::pyarrow::{PyArrowConvert, PyArrowException, PyArrowType}; use arrow::record_batch::RecordBatch; +fn to_py_err(err: ArrowError) -> PyErr { + PyArrowException::new_err(err.to_string()) +} + /// Returns `array + array` of an int64 array. #[pyfunction] fn double(array: &PyAny, py: Python) -> PyResult { @@ -41,8 +45,10 @@ fn double(array: &PyAny, py: Python) -> PyResult { let array = array .as_any() .downcast_ref::() - .ok_or(ArrowError::ParseError("Expects an int64".to_string()))?; - let array = kernels::arithmetic::add(array, array)?; + .ok_or_else(|| ArrowError::ParseError("Expects an int64".to_string())) + .map_err(to_py_err)?; + + let array = kernels::arithmetic::add(array, array).map_err(to_py_err)?; // export array.to_pyarrow(py) @@ -66,56 +72,61 @@ fn double_py(lambda: &PyAny, py: Python) -> PyResult { /// Returns the substring #[pyfunction] -fn substring(array: ArrayData, start: i64) -> PyResult { +fn substring( + array: PyArrowType, + start: i64, +) -> PyResult> { // import - let array = ArrayRef::from(array); + let array = ArrayRef::from(array.0); // substring - let array = kernels::substring::substring(array.as_ref(), start, None)?; + let array = kernels::substring::substring(array.as_ref(), start, None).map_err(to_py_err)?; - Ok(array.data().to_owned()) + Ok(array.data().to_owned().into()) } /// Returns the concatenate #[pyfunction] -fn concatenate(array: ArrayData, py: Python) -> PyResult { - let array = ArrayRef::from(array); +fn concatenate(array: PyArrowType, py: Python) -> PyResult { + let array = ArrayRef::from(array.0); // concat - let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()])?; + let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).map_err(to_py_err)?; array.to_pyarrow(py) } #[pyfunction] -fn round_trip_type(obj: DataType) -> PyResult { +fn round_trip_type(obj: PyArrowType) -> PyResult> { Ok(obj) } #[pyfunction] -fn round_trip_field(obj: Field) -> PyResult { +fn round_trip_field(obj: PyArrowType) -> PyResult> { Ok(obj) } #[pyfunction] -fn round_trip_schema(obj: Schema) -> PyResult { +fn round_trip_schema(obj: PyArrowType) -> PyResult> { Ok(obj) } #[pyfunction] -fn round_trip_array(obj: ArrayData) -> PyResult { +fn round_trip_array(obj: PyArrowType) -> PyResult> { Ok(obj) } #[pyfunction] -fn round_trip_record_batch(obj: RecordBatch) -> PyResult { +fn round_trip_record_batch( + obj: PyArrowType, +) -> PyResult> { Ok(obj) } #[pyfunction] fn round_trip_record_batch_reader( - obj: ArrowArrayStreamReader, -) -> PyResult { + obj: PyArrowType, +) -> PyResult> { Ok(obj) } diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml new file mode 100644 index 000000000000..d35a99a6d15a --- /dev/null +++ b/arrow-schema/Cargo.toml @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "arrow-schema" +version = "23.0.0" +description = "Defines the logical types for arrow arrays" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_schema" +path = "src/lib.rs" +bench = false + +[dependencies] +serde = { version = "1.0", default-features = false, features = ["derive", "std"], optional = true } + +[features] +default = [] + +[dev-dependencies] +serde_json = "1.0" diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs new file mode 100644 index 000000000000..9037f7c9a53c --- /dev/null +++ b/arrow-schema/src/datatype.rs @@ -0,0 +1,492 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::fmt; + +use crate::field::Field; + +/// The set of datatypes that are supported by this implementation of Apache Arrow. +/// +/// The Arrow specification on data types includes some more types. +/// See also [`Schema.fbs`](https://github.com/apache/arrow/blob/master/format/Schema.fbs) +/// for Arrow's specification. +/// +/// The variants of this enum include primitive fixed size types as well as parametric or +/// nested types. +/// Currently the Rust implementation supports the following nested types: +/// - `List` +/// - `Struct` +/// +/// Nested types can themselves be nested within other arrays. +/// For more information on these types please see +/// [the physical memory layout of Apache Arrow](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout). +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum DataType { + /// Null type + Null, + /// A boolean datatype representing the values `true` and `false`. + Boolean, + /// A signed 8-bit integer. + Int8, + /// A signed 16-bit integer. + Int16, + /// A signed 32-bit integer. + Int32, + /// A signed 64-bit integer. + Int64, + /// An unsigned 8-bit integer. + UInt8, + /// An unsigned 16-bit integer. + UInt16, + /// An unsigned 32-bit integer. + UInt32, + /// An unsigned 64-bit integer. + UInt64, + /// A 16-bit floating point number. + Float16, + /// A 32-bit floating point number. + Float32, + /// A 64-bit floating point number. + Float64, + /// A timestamp with an optional timezone. + /// + /// Time is measured as a Unix epoch, counting the seconds from + /// 00:00:00.000 on 1 January 1970, excluding leap seconds, + /// as a 64-bit integer. 
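// A short illustrative sketch of the timezone semantics described in this doc
// comment, assuming the chrono and chrono-tz crates (which the arrow crate
// already depends on): the stored value is always an offset from the UTC
// epoch, and a non-empty timezone only changes how that instant is displayed.
use chrono::{TimeZone, Utc};
use chrono_tz::Europe::Paris;

fn main() {
    // timestamp value 0 is the Unix epoch, regardless of the column's timezone
    let instant = Utc.timestamp_opt(0, 0).unwrap();
    assert_eq!(instant.to_rfc3339(), "1970-01-01T00:00:00+00:00");
    // rendering the same physical instant in "Europe/Paris" shifts only the display
    let paris = instant.with_timezone(&Paris);
    assert_eq!(paris.to_rfc3339(), "1970-01-01T01:00:00+01:00");
}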
+ /// + /// The time zone is a string indicating the name of a time zone, one of: + /// + /// * As used in the Olson time zone database (the "tz database" or + /// "tzdata"), such as "America/New_York" + /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + /// + /// Timestamps with a non-empty timezone + /// ------------------------------------ + /// + /// If a Timestamp column has a non-empty timezone value, its epoch is + /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone + /// (the Unix epoch), regardless of the Timestamp's own timezone. + /// + /// Therefore, timestamp values with a non-empty timezone correspond to + /// physical points in time together with some additional information about + /// how the data was obtained and/or how to display it (the timezone). + /// + /// For example, the timestamp value 0 with the timezone string "Europe/Paris" + /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the + /// application may prefer to display it as "January 1st 1970, 01h00" in + /// the Europe/Paris timezone (which is the same physical point in time). + /// + /// One consequence is that timestamp values with a non-empty timezone + /// can be compared and ordered directly, since they all share the same + /// well-known point of reference (the Unix epoch). + /// + /// Timestamps with an unset / empty timezone + /// ----------------------------------------- + /// + /// If a Timestamp column has no timezone value, its epoch is + /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone. + /// + /// Therefore, timestamp values without a timezone cannot be meaningfully + /// interpreted as physical points in time, but only as calendar / clock + /// indications ("wall clock time") in an unspecified timezone. + /// + /// For example, the timestamp value 0 with an empty timezone string + /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there + /// is not enough information to interpret it as a well-defined physical + /// point in time. + /// + /// One consequence is that timestamp values without a timezone cannot + /// be reliably compared or ordered, since they may have different points of + /// reference. In particular, it is *not* possible to interpret an unset + /// or empty timezone as the same as "UTC". + /// + /// Conversion between timezones + /// ---------------------------- + /// + /// If a Timestamp column has a non-empty timezone, changing the timezone + /// to a different non-empty value is a metadata-only operation: + /// the timestamp values need not change as their point of reference remains + /// the same (the Unix epoch). + /// + /// However, if a Timestamp column has no timezone value, changing it to a + /// non-empty value requires to think about the desired semantics. + /// One possibility is to assume that the original timestamp values are + /// relative to the epoch of the timezone being set; timestamp values should + /// then adjusted to the Unix epoch (for example, changing the timezone from + /// empty to "Europe/Paris" would require converting the timestamp values + /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is + /// nevertheless correct). + Timestamp(TimeUnit, Option), + /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) + /// in days (32 bits). + Date32, + /// A 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) + /// in milliseconds (64 bits). 
Values are evenly divisible by 86400000. + Date64, + /// A 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. + Time32(TimeUnit), + /// A 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. + Time64(TimeUnit), + /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. + Duration(TimeUnit), + /// A "calendar" interval which models types that don't necessarily + /// have a precise duration without the context of a base timestamp (e.g. + /// days can differ in length during day light savings time transitions). + Interval(IntervalUnit), + /// Opaque binary data of variable length. + Binary, + /// Opaque binary data of fixed size. + /// Enum parameter specifies the number of bytes per value. + FixedSizeBinary(i32), + /// Opaque binary data of variable length and 64-bit offsets. + LargeBinary, + /// A variable-length string in Unicode with UTF-8 encoding. + Utf8, + /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets. + LargeUtf8, + /// A list of some logical data type with variable length. + List(Box), + /// A list of some logical data type with fixed length. + FixedSizeList(Box, i32), + /// A list of some logical data type with variable length and 64-bit offsets. + LargeList(Box), + /// A nested datatype that contains a number of sub-fields. + Struct(Vec), + /// A nested datatype that can represent slots of differing types. Components: + /// + /// 1. [`Field`] for each possible child type the Union can hold + /// 2. The corresponding `type_id` used to identify which Field + /// 3. The type of union (Sparse or Dense) + Union(Vec, Vec, UnionMode), + /// A dictionary encoded array (`key_type`, `value_type`), where + /// each array element is an index of `key_type` into an + /// associated dictionary of `value_type`. + /// + /// Dictionary arrays are used to store columns of `value_type` + /// that contain many repeated values using less memory, but with + /// a higher CPU overhead for some operations. + /// + /// This type mostly used to represent low cardinality string + /// arrays or a limited set of primitive types as integers. + Dictionary(Box, Box), + /// Exact 128-bit width decimal value with precision and scale + /// + /// * precision is the total number of digits + /// * scale is the number of digits past the decimal + /// + /// For example the number 123.45 has precision 5 and scale 2. + Decimal128(u8, u8), + /// Exact 256-bit width decimal value with precision and scale + /// + /// * precision is the total number of digits + /// * scale is the number of digits past the decimal + /// + /// For example the number 123.45 has precision 5 and scale 2. + Decimal256(u8, u8), + /// A Map is a logical nested type that is represented as + /// + /// `List>` + /// + /// The keys and values are each respectively contiguous. + /// The key and value types are not constrained, but keys should be + /// hashable and unique. + /// Whether the keys are sorted can be set in the `bool` after the `Field`. + /// + /// In a field with Map type, the field has a child Struct field, which then + /// has two children: key type and the second the value type. The names of the + /// child fields may be respectively "entries", "key", and "value", but this is + /// not enforced. + Map(Box, bool), +} + +/// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds. 
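// A minimal sketch of constructing a few of the variants above, assuming the
// `arrow_schema` crate introduced by this patch re-exports `DataType`, `Field`
// and `TimeUnit` from its root (see lib.rs further below).
use arrow_schema::{DataType, Field, TimeUnit};

fn example_types() -> Vec<DataType> {
    vec![
        // timestamp in milliseconds carrying an explicit timezone string
        DataType::Timestamp(TimeUnit::Millisecond, Some("Europe/Paris".to_string())),
        // variable-length list of nullable 32-bit integers
        DataType::List(Box::new(Field::new("item", DataType::Int32, true))),
        // dictionary-encoded strings keyed by 32-bit indices
        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
        // 128-bit decimal: precision 38 digits, 10 of them after the decimal point
        DataType::Decimal128(38, 10),
    ]
}

fn main() {
    for t in example_types() {
        // Display for DataType falls back to the Debug representation
        println!("{}", t);
    }
}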
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum TimeUnit { + /// Time in seconds. + Second, + /// Time in milliseconds. + Millisecond, + /// Time in microseconds. + Microsecond, + /// Time in nanoseconds. + Nanosecond, +} + +/// YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO interval in SQL style. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum IntervalUnit { + /// Indicates the number of elapsed whole months, stored as 4-byte integers. + YearMonth, + /// Indicates the number of elapsed days and milliseconds, + /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8-bytes in total). + DayTime, + /// A triple of the number of elapsed months, days, and nanoseconds. + /// The values are stored contiguously in 16 byte blocks. Months and + /// days are encoded as 32 bit integers and nanoseconds is encoded as a + /// 64 bit integer. All integers are signed. Each field is independent + /// (e.g. there is no constraint that nanoseconds have the same sign + /// as days or that the quantity of nanoseconds represents less + /// than a day's worth of time). + MonthDayNano, +} + +// Sparse or Dense union layouts +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum UnionMode { + Sparse, + Dense, +} + +impl fmt::Display for DataType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} + +impl DataType { + /// Returns true if the type is primitive: (numeric, temporal). + pub fn is_primitive(t: &DataType) -> bool { + use DataType::*; + matches!( + t, + Int8 | Int16 + | Int32 + | Int64 + | UInt8 + | UInt16 + | UInt32 + | UInt64 + | Float32 + | Float64 + | Date32 + | Date64 + | Time32(_) + | Time64(_) + | Timestamp(_, _) + | Interval(_) + | Duration(_) + ) + } + + /// Returns true if this type is numeric: (UInt*, Int*, or Float*). + pub fn is_numeric(t: &DataType) -> bool { + use DataType::*; + matches!( + t, + UInt8 + | UInt16 + | UInt32 + | UInt64 + | Int8 + | Int16 + | Int32 + | Int64 + | Float32 + | Float64 + ) + } + + /// Returns true if this type is temporal: (Date*, Time*, Duration, or Interval). + pub fn is_temporal(t: &DataType) -> bool { + use DataType::*; + matches!( + t, + Date32 + | Date64 + | Timestamp(_, _) + | Time32(_) + | Time64(_) + | Duration(_) + | Interval(_) + ) + } + + /// Returns true if this type is valid as a dictionary key + pub fn is_dictionary_key_type(t: &DataType) -> bool { + use DataType::*; + matches!( + t, + UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 + ) + } + + /// Returns true if this type is nested (List, FixedSizeList, LargeList, Struct, Union, or Map) + pub fn is_nested(t: &DataType) -> bool { + use DataType::*; + matches!( + t, + List(_) + | FixedSizeList(_, _) + | LargeList(_) + | Struct(_) + | Union(_, _, _) + | Map(_, _) + ) + } + + /// Compares the datatype with another, ignoring nested field names + /// and metadata. 
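// A small sketch exercising the predicate helpers defined above; note that in
// this version they are associated functions taking the type by reference
// rather than methods on `&self`.
use arrow_schema::{DataType, Field, TimeUnit};

fn main() {
    let ts = DataType::Timestamp(TimeUnit::Nanosecond, None);
    assert!(DataType::is_primitive(&ts));
    assert!(DataType::is_temporal(&ts));
    assert!(!DataType::is_numeric(&ts));

    let list = DataType::List(Box::new(Field::new("item", DataType::UInt8, true)));
    assert!(DataType::is_nested(&list));
    assert!(!DataType::is_primitive(&list));

    // only integer types may be used as dictionary keys
    assert!(DataType::is_dictionary_key_type(&DataType::Int32));
    assert!(!DataType::is_dictionary_key_type(&DataType::Utf8));
}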
+ pub fn equals_datatype(&self, other: &DataType) -> bool { + match (&self, other) { + (DataType::List(a), DataType::List(b)) + | (DataType::LargeList(a), DataType::LargeList(b)) => { + a.is_nullable() == b.is_nullable() + && a.data_type().equals_datatype(b.data_type()) + } + (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => { + a_size == b_size + && a.is_nullable() == b.is_nullable() + && a.data_type().equals_datatype(b.data_type()) + } + (DataType::Struct(a), DataType::Struct(b)) => { + a.len() == b.len() + && a.iter().zip(b).all(|(a, b)| { + a.is_nullable() == b.is_nullable() + && a.data_type().equals_datatype(b.data_type()) + }) + } + ( + DataType::Map(a_field, a_is_sorted), + DataType::Map(b_field, b_is_sorted), + ) => a_field == b_field && a_is_sorted == b_is_sorted, + _ => self == other, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + #[cfg(feature = "serde")] + fn serde_struct_type() { + use std::collections::BTreeMap; + + let kv_array = [("k".to_string(), "v".to_string())]; + let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); + + // Non-empty map: should be converted as JSON obj { ... } + let first_name = Field::new("first_name", DataType::Utf8, false) + .with_metadata(Some(field_metadata)); + + // Empty map: should be omitted. + let last_name = Field::new("last_name", DataType::Utf8, false) + .with_metadata(Some(BTreeMap::default())); + + let person = DataType::Struct(vec![ + first_name, + last_name, + Field::new( + "address", + DataType::Struct(vec![ + Field::new("street", DataType::Utf8, false), + Field::new("zip", DataType::UInt16, false), + ]), + false, + ), + ]); + + let serialized = serde_json::to_string(&person).unwrap(); + + // NOTE that this is testing the default (derived) serialization format, not the + // JSON format specified in metadata.md + + assert_eq!( + "{\"Struct\":[\ + {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\ + {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ + {\"name\":\"address\",\"data_type\":{\"Struct\":\ + [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ + {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}\ + ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}]}", + serialized + ); + + let deserialized = serde_json::from_str(&serialized).unwrap(); + + assert_eq!(person, deserialized); + } + + #[test] + fn test_list_datatype_equality() { + // tests that list type equality is checked while ignoring list names + let list_a = DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + let list_b = DataType::List(Box::new(Field::new("array", DataType::Int32, true))); + let list_c = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + let list_d = DataType::List(Box::new(Field::new("item", DataType::UInt32, true))); + assert!(list_a.equals_datatype(&list_b)); + assert!(!list_a.equals_datatype(&list_c)); + assert!(!list_b.equals_datatype(&list_c)); + assert!(!list_a.equals_datatype(&list_d)); + + let list_e = + DataType::FixedSizeList(Box::new(Field::new("item", list_a, false)), 3); + let list_f = + DataType::FixedSizeList(Box::new(Field::new("array", list_b, false)), 3); + let list_g = DataType::FixedSizeList( + Box::new(Field::new("item", DataType::FixedSizeBinary(3), true)), + 3, + ); + 
assert!(list_e.equals_datatype(&list_f)); + assert!(!list_e.equals_datatype(&list_g)); + assert!(!list_f.equals_datatype(&list_g)); + + let list_h = DataType::Struct(vec![Field::new("f1", list_e, true)]); + let list_i = DataType::Struct(vec![Field::new("f1", list_f.clone(), true)]); + let list_j = DataType::Struct(vec![Field::new("f1", list_f.clone(), false)]); + let list_k = DataType::Struct(vec![ + Field::new("f1", list_f.clone(), false), + Field::new("f2", list_g.clone(), false), + Field::new("f3", DataType::Utf8, true), + ]); + let list_l = DataType::Struct(vec![ + Field::new("ff1", list_f.clone(), false), + Field::new("ff2", list_g.clone(), false), + Field::new("ff3", DataType::LargeUtf8, true), + ]); + let list_m = DataType::Struct(vec![ + Field::new("ff1", list_f, false), + Field::new("ff2", list_g, false), + Field::new("ff3", DataType::Utf8, true), + ]); + assert!(list_h.equals_datatype(&list_i)); + assert!(!list_h.equals_datatype(&list_j)); + assert!(!list_k.equals_datatype(&list_l)); + assert!(list_k.equals_datatype(&list_m)); + } + + #[test] + fn create_struct_type() { + let _person = DataType::Struct(vec![ + Field::new("first_name", DataType::Utf8, false), + Field::new("last_name", DataType::Utf8, false), + Field::new( + "address", + DataType::Struct(vec![ + Field::new("street", DataType::Utf8, false), + Field::new("zip", DataType::UInt16, false), + ]), + false, + ), + ]); + } +} diff --git a/arrow-schema/src/error.rs b/arrow-schema/src/error.rs new file mode 100644 index 000000000000..105d4d5e21f0 --- /dev/null +++ b/arrow-schema/src/error.rs @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines `ArrowError` for representing failures in various Arrow operations. +use std::fmt::{Debug, Display, Formatter}; +use std::io::Write; + +use std::error::Error; + +/// Many different operations in the `arrow` crate return this error type. +#[derive(Debug)] +pub enum ArrowError { + /// Returned when functionality is not yet available. + NotYetImplemented(String), + ExternalError(Box), + CastError(String), + MemoryError(String), + ParseError(String), + SchemaError(String), + ComputeError(String), + DivideByZero, + CsvError(String), + JsonError(String), + IoError(String), + InvalidArgumentError(String), + ParquetError(String), + /// Error during import or export to/from the C Data Interface + CDataInterface(String), + DictionaryKeyOverflowError, +} + +impl ArrowError { + /// Wraps an external error in an `ArrowError`. 
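// A small sketch of how this error type is typically produced and surfaced,
// assuming the `arrow_schema::ArrowError` enum defined above: std::io errors
// convert through the `From` impl further below, and `Display` prefixes each
// variant with a short category such as "Io error:".
use arrow_schema::ArrowError;

fn read_config(path: &str) -> Result<String, ArrowError> {
    // the `?` operator applies the From<std::io::Error> conversion
    let contents = std::fs::read_to_string(path)?;
    Ok(contents)
}

fn main() {
    match read_config("/no/such/file") {
        Ok(contents) => println!("read {} bytes", contents.len()),
        Err(e) => eprintln!("failed: {}", e),
    }
}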
+ pub fn from_external_error(error: Box) -> Self { + Self::ExternalError(error) + } +} + +impl From<::std::io::Error> for ArrowError { + fn from(error: std::io::Error) -> Self { + ArrowError::IoError(error.to_string()) + } +} + +impl From<::std::string::FromUtf8Error> for ArrowError { + fn from(error: std::string::FromUtf8Error) -> Self { + ArrowError::ParseError(error.to_string()) + } +} + +impl From<::std::io::IntoInnerError> for ArrowError { + fn from(error: std::io::IntoInnerError) -> Self { + ArrowError::IoError(error.to_string()) + } +} + +impl Display for ArrowError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + ArrowError::NotYetImplemented(source) => { + write!(f, "Not yet implemented: {}", &source) + } + ArrowError::ExternalError(source) => write!(f, "External error: {}", &source), + ArrowError::CastError(desc) => write!(f, "Cast error: {}", desc), + ArrowError::MemoryError(desc) => write!(f, "Memory error: {}", desc), + ArrowError::ParseError(desc) => write!(f, "Parser error: {}", desc), + ArrowError::SchemaError(desc) => write!(f, "Schema error: {}", desc), + ArrowError::ComputeError(desc) => write!(f, "Compute error: {}", desc), + ArrowError::DivideByZero => write!(f, "Divide by zero error"), + ArrowError::CsvError(desc) => write!(f, "Csv error: {}", desc), + ArrowError::JsonError(desc) => write!(f, "Json error: {}", desc), + ArrowError::IoError(desc) => write!(f, "Io error: {}", desc), + ArrowError::InvalidArgumentError(desc) => { + write!(f, "Invalid argument error: {}", desc) + } + ArrowError::ParquetError(desc) => { + write!(f, "Parquet argument error: {}", desc) + } + ArrowError::CDataInterface(desc) => { + write!(f, "C Data interface error: {}", desc) + } + ArrowError::DictionaryKeyOverflowError => { + write!(f, "Dictionary key bigger than the key type") + } + } + } +} + +impl Error for ArrowError {} diff --git a/arrow/src/datatypes/field.rs b/arrow-schema/src/field.rs similarity index 98% rename from arrow/src/datatypes/field.rs rename to arrow-schema/src/field.rs index 03d07807743d..adafbfa9b72c 100644 --- a/arrow/src/datatypes/field.rs +++ b/arrow-schema/src/field.rs @@ -15,12 +15,12 @@ // specific language governing permissions and limitations // under the License. -use crate::error::{ArrowError, Result}; +use crate::error::ArrowError; use std::cmp::Ordering; use std::collections::BTreeMap; use std::hash::{Hash, Hasher}; -use super::DataType; +use crate::datatype::DataType; /// Describes a single column in a [`Schema`](super::Schema). /// @@ -145,7 +145,7 @@ impl Field { /// Set the name of the [`Field`] and returns self. /// /// ``` - /// # use arrow::datatypes::*; + /// # use arrow_schema::*; /// let field = Field::new("c1", DataType::Int64, false) /// .with_name("c2"); /// @@ -165,7 +165,7 @@ impl Field { /// Set [`DataType`] of the [`Field`] and returns self. /// /// ``` - /// # use arrow::datatypes::*; + /// # use arrow_schema::*; /// let field = Field::new("c1", DataType::Int64, false) /// .with_data_type(DataType::Utf8); /// @@ -185,7 +185,7 @@ impl Field { /// Set `nullable` of the [`Field`] and returns self. 
/// /// ``` - /// # use arrow::datatypes::*; + /// # use arrow_schema::*; /// let field = Field::new("c1", DataType::Int64, false) /// .with_nullable(true); /// @@ -259,12 +259,12 @@ impl Field { /// Example: /// /// ``` - /// # use arrow::datatypes::*; + /// # use arrow_schema::*; /// let mut field = Field::new("c1", DataType::Int64, false); /// assert!(field.try_merge(&Field::new("c1", DataType::Int64, true)).is_ok()); /// assert!(field.is_nullable()); /// ``` - pub fn try_merge(&mut self, from: &Field) -> Result<()> { + pub fn try_merge(&mut self, from: &Field) -> Result<(), ArrowError> { if from.dict_id != self.dict_id { return Err(ArrowError::SchemaError( "Fail to merge schema Field due to conflicting dict_id".to_string(), diff --git a/arrow-schema/src/lib.rs b/arrow-schema/src/lib.rs new file mode 100644 index 000000000000..34030f2d356e --- /dev/null +++ b/arrow-schema/src/lib.rs @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Arrow logical types + +mod datatype; +pub use datatype::*; +mod error; +pub use error::*; +mod field; +pub use field::*; +mod schema; +pub use schema::*; diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs new file mode 100644 index 000000000000..9605cdda720b --- /dev/null +++ b/arrow-schema/src/schema.rs @@ -0,0 +1,782 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; +use std::fmt; +use std::hash::Hash; + +use crate::error::ArrowError; +use crate::field::Field; + +/// Describes the meta-data of an ordered sequence of relative types. +/// +/// Note that this information is only part of the meta-data and not part of the physical +/// memory layout. +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct Schema { + pub fields: Vec, + /// A map of key-value pairs containing additional meta data. 
+ #[cfg_attr( + feature = "serde", + serde(skip_serializing_if = "HashMap::is_empty", default) + )] + pub metadata: HashMap, +} + +impl Schema { + /// Creates an empty `Schema` + pub fn empty() -> Self { + Self { + fields: vec![], + metadata: HashMap::new(), + } + } + + /// Creates a new [`Schema`] from a sequence of [`Field`] values. + /// + /// # Example + /// + /// ``` + /// # use arrow_schema::*; + /// let field_a = Field::new("a", DataType::Int64, false); + /// let field_b = Field::new("b", DataType::Boolean, false); + /// + /// let schema = Schema::new(vec![field_a, field_b]); + /// ``` + pub fn new(fields: Vec) -> Self { + Self::new_with_metadata(fields, HashMap::new()) + } + + /// Creates a new [`Schema`] from a sequence of [`Field`] values + /// and adds additional metadata in form of key value pairs. + /// + /// # Example + /// + /// ``` + /// # use arrow_schema::*; + /// # use std::collections::HashMap; + /// + /// let field_a = Field::new("a", DataType::Int64, false); + /// let field_b = Field::new("b", DataType::Boolean, false); + /// + /// let mut metadata: HashMap = HashMap::new(); + /// metadata.insert("row_count".to_string(), "100".to_string()); + /// + /// let schema = Schema::new_with_metadata(vec![field_a, field_b], metadata); + /// ``` + #[inline] + pub const fn new_with_metadata( + fields: Vec, + metadata: HashMap, + ) -> Self { + Self { fields, metadata } + } + + /// Sets the metadata of this `Schema` to be `metadata` and returns self + pub fn with_metadata(mut self, metadata: HashMap) -> Self { + self.metadata = metadata; + self + } + + /// Returns a new schema with only the specified columns in the new schema + /// This carries metadata from the parent schema over as well + pub fn project(&self, indices: &[usize]) -> Result { + let new_fields = indices + .iter() + .map(|i| { + self.fields.get(*i).cloned().ok_or_else(|| { + ArrowError::SchemaError(format!( + "project index {} out of bounds, max field {}", + i, + self.fields().len() + )) + }) + }) + .collect::, _>>()?; + Ok(Self::new_with_metadata(new_fields, self.metadata.clone())) + } + + /// Merge schema into self if it is compatible. Struct fields will be merged recursively. + /// + /// Example: + /// + /// ``` + /// # use arrow_schema::*; + /// + /// let merged = Schema::try_merge(vec![ + /// Schema::new(vec![ + /// Field::new("c1", DataType::Int64, false), + /// Field::new("c2", DataType::Utf8, false), + /// ]), + /// Schema::new(vec![ + /// Field::new("c1", DataType::Int64, true), + /// Field::new("c2", DataType::Utf8, false), + /// Field::new("c3", DataType::Utf8, false), + /// ]), + /// ]).unwrap(); + /// + /// assert_eq!( + /// merged, + /// Schema::new(vec![ + /// Field::new("c1", DataType::Int64, true), + /// Field::new("c2", DataType::Utf8, false), + /// Field::new("c3", DataType::Utf8, false), + /// ]), + /// ); + /// ``` + pub fn try_merge( + schemas: impl IntoIterator, + ) -> Result { + schemas + .into_iter() + .try_fold(Self::empty(), |mut merged, schema| { + let Schema { metadata, fields } = schema; + for (key, value) in metadata.into_iter() { + // merge metadata + if let Some(old_val) = merged.metadata.get(&key) { + if old_val != &value { + return Err(ArrowError::SchemaError(format!( + "Fail to merge schema due to conflicting metadata. 
\ + Key '{}' has different values '{}' and '{}'", + key, old_val, value + ))); + } + } + merged.metadata.insert(key, value); + } + // merge fields + for field in fields.into_iter() { + let merged_field = + merged.fields.iter_mut().find(|f| f.name() == field.name()); + match merged_field { + Some(merged_field) => merged_field.try_merge(&field)?, + // found a new field, add to field list + None => merged.fields.push(field), + } + } + Ok(merged) + }) + } + + /// Returns an immutable reference of the vector of `Field` instances. + #[inline] + pub const fn fields(&self) -> &Vec { + &self.fields + } + + /// Returns a vector with references to all fields (including nested fields) + #[inline] + pub fn all_fields(&self) -> Vec<&Field> { + self.fields.iter().flat_map(|f| f.fields()).collect() + } + + /// Returns an immutable reference of a specific [`Field`] instance selected using an + /// offset within the internal `fields` vector. + pub fn field(&self, i: usize) -> &Field { + &self.fields[i] + } + + /// Returns an immutable reference of a specific [`Field`] instance selected by name. + pub fn field_with_name(&self, name: &str) -> Result<&Field, ArrowError> { + Ok(&self.fields[self.index_of(name)?]) + } + + /// Returns a vector of immutable references to all [`Field`] instances selected by + /// the dictionary ID they use. + pub fn fields_with_dict_id(&self, dict_id: i64) -> Vec<&Field> { + self.fields + .iter() + .flat_map(|f| f.fields_with_dict_id(dict_id)) + .collect() + } + + /// Find the index of the column with the given name. + pub fn index_of(&self, name: &str) -> Result { + (0..self.fields.len()) + .find(|idx| self.fields[*idx].name() == name) + .ok_or_else(|| { + let valid_fields: Vec = + self.fields.iter().map(|f| f.name().clone()).collect(); + ArrowError::SchemaError(format!( + "Unable to get field named \"{}\". Valid fields: {:?}", + name, valid_fields + )) + }) + } + + /// Returns an immutable reference to the Map of custom metadata key-value pairs. + #[inline] + pub const fn metadata(&self) -> &HashMap { + &self.metadata + } + + /// Look up a column by name and return a immutable reference to the column along with + /// its index. + pub fn column_with_name(&self, name: &str) -> Option<(usize, &Field)> { + self.fields + .iter() + .enumerate() + .find(|&(_, c)| c.name() == name) + } + + /// Check to see if `self` is a superset of `other` schema. Here are the comparison rules: + /// + /// * `self` and `other` should contain the same number of fields + /// * for every field `f` in `other`, the field in `self` with corresponding index should be a + /// superset of `f`. + /// * self.metadata is a superset of other.metadata + /// + /// In other words, any record conforms to `other` should also conform to `self`. 
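// A short sketch of typical Schema usage, assuming the `arrow_schema` API
// above: build a schema, look fields up by name, and project a subset of
// columns (the field names here are illustrative only).
use arrow_schema::{ArrowError, DataType, Field, Schema};

fn main() -> Result<(), ArrowError> {
    let schema = Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("name", DataType::Utf8, true),
        Field::new("score", DataType::Float64, true),
    ]);

    // index_of returns a SchemaError listing the valid fields when the name is unknown
    assert_eq!(schema.index_of("name")?, 1);

    // column_with_name returns the index together with a reference to the field
    let (idx, field) = schema.column_with_name("score").expect("score exists");
    assert_eq!(idx, 2);
    assert_eq!(field.data_type(), &DataType::Float64);

    // project keeps only the requested columns and carries the metadata over
    let projected = schema.project(&[0, 2])?;
    assert_eq!(projected.fields().len(), 2);
    Ok(())
}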
+ pub fn contains(&self, other: &Schema) -> bool { + self.fields.len() == other.fields.len() + && self.fields.iter().zip(other.fields.iter()).all(|(f1, f2)| f1.contains(f2)) + // make sure self.metadata is a superset of other.metadata + && other.metadata.iter().all(|(k, v1)| match self.metadata.get(k) { + Some(v2) => v1 == v2, + _ => false, + }) + } +} + +impl fmt::Display for Schema { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str( + &self + .fields + .iter() + .map(|c| c.to_string()) + .collect::>() + .join(", "), + ) + } +} + +// need to implement `Hash` manually because `HashMap` implement Eq but no `Hash` +#[allow(clippy::derive_hash_xor_eq)] +impl Hash for Schema { + fn hash(&self, state: &mut H) { + self.fields.hash(state); + + // ensure deterministic key order + let mut keys: Vec<&String> = self.metadata.keys().collect(); + keys.sort(); + for k in keys { + k.hash(state); + self.metadata.get(k).expect("key valid").hash(state); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::datatype::DataType; + use crate::{TimeUnit, UnionMode}; + use std::collections::BTreeMap; + + #[test] + #[cfg(feature = "serde")] + fn test_ser_de_metadata() { + // ser/de with empty metadata + let schema = Schema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("address", DataType::Utf8, false), + Field::new("priority", DataType::UInt8, false), + ]); + + let json = serde_json::to_string(&schema).unwrap(); + let de_schema = serde_json::from_str(&json).unwrap(); + + assert_eq!(schema, de_schema); + + // ser/de with non-empty metadata + let schema = schema + .with_metadata([("key".to_owned(), "val".to_owned())].into_iter().collect()); + let json = serde_json::to_string(&schema).unwrap(); + let de_schema = serde_json::from_str(&json).unwrap(); + + assert_eq!(schema, de_schema); + } + + #[test] + fn test_projection() { + let mut metadata = HashMap::new(); + metadata.insert("meta".to_string(), "data".to_string()); + + let schema = Schema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("address", DataType::Utf8, false), + Field::new("priority", DataType::UInt8, false), + ]) + .with_metadata(metadata); + + let projected: Schema = schema.project(&[0, 2]).unwrap(); + + assert_eq!(projected.fields().len(), 2); + assert_eq!(projected.fields()[0].name(), "name"); + assert_eq!(projected.fields()[1].name(), "priority"); + assert_eq!(projected.metadata.get("meta").unwrap(), "data") + } + + #[test] + fn test_oob_projection() { + let mut metadata = HashMap::new(); + metadata.insert("meta".to_string(), "data".to_string()); + + let schema = Schema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("address", DataType::Utf8, false), + Field::new("priority", DataType::UInt8, false), + ]) + .with_metadata(metadata); + + let projected = schema.project(&[0, 3]); + + assert!(projected.is_err()); + if let Err(e) = projected { + assert_eq!( + e.to_string(), + "Schema error: project index 3 out of bounds, max field 3".to_string() + ) + } + } + + #[test] + fn test_schema_contains() { + let mut metadata1 = HashMap::new(); + metadata1.insert("meta".to_string(), "data".to_string()); + + let schema1 = Schema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("address", DataType::Utf8, false), + Field::new("priority", DataType::UInt8, false), + ]) + .with_metadata(metadata1.clone()); + + let mut metadata2 = HashMap::new(); + metadata2.insert("meta".to_string(), "data".to_string()); + metadata2.insert("meta2".to_string(), 
"data".to_string()); + let schema2 = Schema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("address", DataType::Utf8, false), + Field::new("priority", DataType::UInt8, false), + ]) + .with_metadata(metadata2); + + // reflexivity + assert!(schema1.contains(&schema1)); + assert!(schema2.contains(&schema2)); + + assert!(!schema1.contains(&schema2)); + assert!(schema2.contains(&schema1)); + } + + #[test] + fn schema_equality() { + let schema1 = Schema::new(vec![ + Field::new("c1", DataType::Utf8, false), + Field::new("c2", DataType::Float64, true), + Field::new("c3", DataType::LargeBinary, true), + ]); + let schema2 = Schema::new(vec![ + Field::new("c1", DataType::Utf8, false), + Field::new("c2", DataType::Float64, true), + Field::new("c3", DataType::LargeBinary, true), + ]); + + assert_eq!(schema1, schema2); + + let schema3 = Schema::new(vec![ + Field::new("c1", DataType::Utf8, false), + Field::new("c2", DataType::Float32, true), + ]); + let schema4 = Schema::new(vec![ + Field::new("C1", DataType::Utf8, false), + Field::new("C2", DataType::Float64, true), + ]); + + assert_ne!(schema1, schema3); + assert_ne!(schema1, schema4); + assert_ne!(schema2, schema3); + assert_ne!(schema2, schema4); + assert_ne!(schema3, schema4); + + let f = Field::new("c1", DataType::Utf8, false).with_metadata(Some( + [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect(), + )); + let schema5 = Schema::new(vec![ + f, + Field::new("c2", DataType::Float64, true), + Field::new("c3", DataType::LargeBinary, true), + ]); + assert_ne!(schema1, schema5); + } + + #[test] + fn create_schema_string() { + let schema = person_schema(); + assert_eq!(schema.to_string(), + "Field { name: \"first_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: Some({\"k\": \"v\"}) }, \ + Field { name: \"last_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ + Field { name: \"address\", data_type: Struct([\ + Field { name: \"street\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ + Field { name: \"zip\", data_type: UInt16, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }\ + ]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ + Field { name: \"interests\", data_type: Dictionary(Int32, Utf8), nullable: true, dict_id: 123, dict_is_ordered: true, metadata: None }") + } + + #[test] + fn schema_field_accessors() { + let schema = person_schema(); + + // test schema accessors + assert_eq!(schema.fields().len(), 4); + + // test field accessors + let first_name = &schema.fields()[0]; + assert_eq!(first_name.name(), "first_name"); + assert_eq!(first_name.data_type(), &DataType::Utf8); + assert!(!first_name.is_nullable()); + assert_eq!(first_name.dict_id(), None); + assert_eq!(first_name.dict_is_ordered(), None); + + let metadata = first_name.metadata(); + assert!(metadata.is_some()); + let md = metadata.as_ref().unwrap(); + assert_eq!(md.len(), 1); + let key = md.get("k"); + assert!(key.is_some()); + assert_eq!(key.unwrap(), "v"); + + let interests = &schema.fields()[3]; + assert_eq!(interests.name(), "interests"); + assert_eq!( + interests.data_type(), + &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) + ); + assert_eq!(interests.dict_id(), Some(123)); + assert_eq!(interests.dict_is_ordered(), Some(true)); + } + + #[test] + #[should_panic( + expected = "Unable to get field named \\\"nickname\\\". 
Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\", \\\"interests\\\"]" + )] + fn schema_index_of() { + let schema = person_schema(); + assert_eq!(schema.index_of("first_name").unwrap(), 0); + assert_eq!(schema.index_of("last_name").unwrap(), 1); + schema.index_of("nickname").unwrap(); + } + + #[test] + #[should_panic( + expected = "Unable to get field named \\\"nickname\\\". Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\", \\\"interests\\\"]" + )] + fn schema_field_with_name() { + let schema = person_schema(); + assert_eq!( + schema.field_with_name("first_name").unwrap().name(), + "first_name" + ); + assert_eq!( + schema.field_with_name("last_name").unwrap().name(), + "last_name" + ); + schema.field_with_name("nickname").unwrap(); + } + + #[test] + fn schema_field_with_dict_id() { + let schema = person_schema(); + + let fields_dict_123: Vec<_> = schema + .fields_with_dict_id(123) + .iter() + .map(|f| f.name()) + .collect(); + assert_eq!(fields_dict_123, vec!["interests"]); + + assert!(schema.fields_with_dict_id(456).is_empty()); + } + + fn person_schema() -> Schema { + let kv_array = [("k".to_string(), "v".to_string())]; + let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); + let first_name = Field::new("first_name", DataType::Utf8, false) + .with_metadata(Some(field_metadata)); + + Schema::new(vec![ + first_name, + Field::new("last_name", DataType::Utf8, false), + Field::new( + "address", + DataType::Struct(vec![ + Field::new("street", DataType::Utf8, false), + Field::new("zip", DataType::UInt16, false), + ]), + false, + ), + Field::new_dict( + "interests", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + 123, + true, + ), + ]) + } + + #[test] + fn test_try_merge_field_with_metadata() { + // 1. Different values for the same key should cause error. + let metadata1: BTreeMap = + [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect(); + let f1 = Field::new("first_name", DataType::Utf8, false) + .with_metadata(Some(metadata1)); + + let metadata2: BTreeMap = + [("foo".to_string(), "baz".to_string())] + .iter() + .cloned() + .collect(); + let f2 = Field::new("first_name", DataType::Utf8, false) + .with_metadata(Some(metadata2)); + + assert!( + Schema::try_merge(vec![Schema::new(vec![f1]), Schema::new(vec![f2])]) + .is_err() + ); + + // 2. None + Some + let mut f1 = Field::new("first_name", DataType::Utf8, false); + let metadata2: BTreeMap = + [("missing".to_string(), "value".to_string())] + .iter() + .cloned() + .collect(); + let f2 = Field::new("first_name", DataType::Utf8, false) + .with_metadata(Some(metadata2)); + + assert!(f1.try_merge(&f2).is_ok()); + assert!(f1.metadata().is_some()); + assert_eq!( + f1.metadata().as_ref().unwrap(), + f2.metadata().as_ref().unwrap() + ); + + // 3. Some + Some + let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( + [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect(), + )); + let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( + [("foo2".to_string(), "bar2".to_string())] + .iter() + .cloned() + .collect(), + )); + + assert!(f1.try_merge(&f2).is_ok()); + assert!(f1.metadata().is_some()); + assert_eq!( + f1.metadata().cloned().unwrap(), + [ + ("foo".to_string(), "bar".to_string()), + ("foo2".to_string(), "bar2".to_string()) + ] + .iter() + .cloned() + .collect() + ); + + // 4. Some + None. 
+ let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( + [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect(), + )); + let f2 = Field::new("first_name", DataType::Utf8, false); + assert!(f1.try_merge(&f2).is_ok()); + assert!(f1.metadata().is_some()); + assert_eq!( + f1.metadata().cloned().unwrap(), + [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect() + ); + + // 5. None + None. + let mut f1 = Field::new("first_name", DataType::Utf8, false); + let f2 = Field::new("first_name", DataType::Utf8, false); + assert!(f1.try_merge(&f2).is_ok()); + assert!(f1.metadata().is_none()); + } + + #[test] + fn test_schema_merge() { + let merged = Schema::try_merge(vec![ + Schema::new(vec![ + Field::new("first_name", DataType::Utf8, false), + Field::new("last_name", DataType::Utf8, false), + Field::new( + "address", + DataType::Struct(vec![Field::new("zip", DataType::UInt16, false)]), + false, + ), + ]), + Schema::new_with_metadata( + vec![ + // nullable merge + Field::new("last_name", DataType::Utf8, true), + Field::new( + "address", + DataType::Struct(vec![ + // add new nested field + Field::new("street", DataType::Utf8, false), + // nullable merge on nested field + Field::new("zip", DataType::UInt16, true), + ]), + false, + ), + // new field + Field::new("number", DataType::Utf8, true), + ], + [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect::>(), + ), + ]) + .unwrap(); + + assert_eq!( + merged, + Schema::new_with_metadata( + vec![ + Field::new("first_name", DataType::Utf8, false), + Field::new("last_name", DataType::Utf8, true), + Field::new( + "address", + DataType::Struct(vec![ + Field::new("zip", DataType::UInt16, true), + Field::new("street", DataType::Utf8, false), + ]), + false, + ), + Field::new("number", DataType::Utf8, true), + ], + [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect::>() + ) + ); + + // support merge union fields + assert_eq!( + Schema::try_merge(vec![ + Schema::new(vec![Field::new( + "c1", + DataType::Union( + vec![ + Field::new("c11", DataType::Utf8, true), + Field::new("c12", DataType::Utf8, true), + ], + vec![0, 1], + UnionMode::Dense + ), + false + ),]), + Schema::new(vec![Field::new( + "c1", + DataType::Union( + vec![ + Field::new("c12", DataType::Utf8, true), + Field::new("c13", DataType::Time64(TimeUnit::Second), true), + ], + vec![1, 2], + UnionMode::Dense + ), + false + ),]) + ]) + .unwrap(), + Schema::new(vec![Field::new( + "c1", + DataType::Union( + vec![ + Field::new("c11", DataType::Utf8, true), + Field::new("c12", DataType::Utf8, true), + Field::new("c13", DataType::Time64(TimeUnit::Second), true), + ], + vec![0, 1, 2], + UnionMode::Dense + ), + false + ),]), + ); + + // incompatible field should throw error + assert!(Schema::try_merge(vec![ + Schema::new(vec![ + Field::new("first_name", DataType::Utf8, false), + Field::new("last_name", DataType::Utf8, false), + ]), + Schema::new(vec![Field::new("last_name", DataType::Int64, false),]) + ]) + .is_err()); + + // incompatible metadata should throw error + let res = Schema::try_merge(vec![ + Schema::new_with_metadata( + vec![Field::new("first_name", DataType::Utf8, false)], + [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect::>(), + ), + Schema::new_with_metadata( + vec![Field::new("last_name", DataType::Utf8, false)], + [("foo".to_string(), "baz".to_string())] + .iter() + .cloned() + .collect::>(), + ), + ]) + .unwrap_err(); + + let expected = "Fail to merge schema due to 
conflicting metadata. Key 'foo' has different values 'bar' and 'baz'"; + assert!( + res.to_string().contains(expected), + "Could not find expected string '{}' in '{}'", + expected, + res + ); + } +} diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index f8dbf1481b58..d49acef335d6 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -44,9 +44,8 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { path = "../arrow-buffer", version = "23.0.0" } - -serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } +arrow-buffer = { version = "23.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "23.0.0", path = "../arrow-schema" } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } indexmap = { version = "1.9", default-features = false, features = ["std"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } @@ -77,7 +76,7 @@ default = ["csv", "ipc", "json"] ipc_compression = ["ipc", "zstd", "lz4"] csv = ["csv_crate"] ipc = ["flatbuffers"] -json = ["serde", "serde_json"] +json = ["serde_json"] simd = ["packed_simd"] prettyprint = ["comfy-table"] # The test utils feature enables code used in benchmarks and tests but diff --git a/arrow/src/csv/mod.rs b/arrow/src/csv/mod.rs index ffe82f335801..46ba7d71e200 100644 --- a/arrow/src/csv/mod.rs +++ b/arrow/src/csv/mod.rs @@ -25,3 +25,22 @@ pub use self::reader::Reader; pub use self::reader::ReaderBuilder; pub use self::writer::Writer; pub use self::writer::WriterBuilder; +use arrow_schema::ArrowError; + +fn map_csv_error(error: csv_crate::Error) -> ArrowError { + match error.kind() { + csv_crate::ErrorKind::Io(error) => ArrowError::CsvError(error.to_string()), + csv_crate::ErrorKind::Utf8 { pos: _, err } => ArrowError::CsvError(format!( + "Encountered UTF-8 error while reading CSV file: {}", + err + )), + csv_crate::ErrorKind::UnequalLengths { + expected_len, len, .. + } => ArrowError::CsvError(format!( + "Encountered unequal lengths between records on CSV file. 
Expected {} \ + records, found {} records", + len, expected_len + )), + _ => ArrowError::CsvError("Error reading CSV file".to_string()), + } +} diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index d164d35c3c8c..3ec605dd0482 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -58,6 +58,7 @@ use crate::error::{ArrowError, Result}; use crate::record_batch::{RecordBatch, RecordBatchOptions}; use crate::util::reader_parser::Parser; +use crate::csv::map_csv_error; use csv_crate::{ByteRecord, StringRecord}; use std::ops::Neg; @@ -187,10 +188,10 @@ fn infer_reader_schema_with_csv_options( // get or create header names // when has_header is false, creates default column names with column_ prefix let headers: Vec = if roptions.has_header { - let headers = &csv_reader.headers()?.clone(); + let headers = &csv_reader.headers().map_err(map_csv_error)?.clone(); headers.iter().map(|s| s.to_string()).collect() } else { - let first_record_count = &csv_reader.headers()?.len(); + let first_record_count = &csv_reader.headers().map_err(map_csv_error)?.len(); (0..*first_record_count) .map(|i| format!("column_{}", i + 1)) .collect() @@ -208,7 +209,7 @@ fn infer_reader_schema_with_csv_options( let mut record = StringRecord::new(); let max_records = roptions.max_read_records.unwrap_or(usize::MAX); while records_count < max_records { - if !csv_reader.read_record(&mut record)? { + if !csv_reader.read_record(&mut record).map_err(map_csv_error)? { break; } records_count += 1; diff --git a/arrow/src/csv/writer.rs b/arrow/src/csv/writer.rs index 7097706ba5f3..1b377c38b370 100644 --- a/arrow/src/csv/writer.rs +++ b/arrow/src/csv/writer.rs @@ -70,11 +70,13 @@ use crate::compute::kernels::temporal::using_chrono_tz_and_utc_naive_date_time; #[cfg(feature = "chrono-tz")] use chrono::{DateTime, Utc}; +use crate::csv::map_csv_error; use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::record_batch::RecordBatch; use crate::util::display::make_string_from_decimal; use crate::{array::*, util::serialization::lexical_to_string}; + const DEFAULT_DATE_FORMAT: &str = "%F"; const DEFAULT_TIME_FORMAT: &str = "%T"; const DEFAULT_TIMESTAMP_FORMAT: &str = "%FT%H:%M:%S.%9f"; @@ -343,7 +345,9 @@ impl Writer { .fields() .iter() .for_each(|field| headers.push(field.name().to_string())); - self.writer.write_record(&headers[..])?; + self.writer + .write_record(&headers[..]) + .map_err(map_csv_error)?; } self.beginning = false; } @@ -364,7 +368,7 @@ impl Writer { for row_index in 0..batch.num_rows() { self.convert(columns.as_slice(), row_index, &mut buffer)?; - self.writer.write_record(&buffer)?; + self.writer.write_record(&buffer).map_err(map_csv_error)?; } self.writer.flush()?; diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/decimal.rs similarity index 68% rename from arrow/src/datatypes/datatype.rs rename to arrow/src/datatypes/decimal.rs index d3189b8b18cc..ffdb04e0d775 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/decimal.rs @@ -15,256 +15,10 @@ // specific language governing permissions and limitations // under the License. -use num::BigInt; -use std::cmp::Ordering; -use std::fmt; - use crate::error::{ArrowError, Result}; use crate::util::decimal::singed_cmp_le_bytes; - -use super::Field; - -/// The set of datatypes that are supported by this implementation of Apache Arrow. -/// -/// The Arrow specification on data types includes some more types. 
-/// See also [`Schema.fbs`](https://github.com/apache/arrow/blob/master/format/Schema.fbs) -/// for Arrow's specification. -/// -/// The variants of this enum include primitive fixed size types as well as parametric or -/// nested types. -/// Currently the Rust implementation supports the following nested types: -/// - `List` -/// - `Struct` -/// -/// Nested types can themselves be nested within other arrays. -/// For more information on these types please see -/// [the physical memory layout of Apache Arrow](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout). -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] -pub enum DataType { - /// Null type - Null, - /// A boolean datatype representing the values `true` and `false`. - Boolean, - /// A signed 8-bit integer. - Int8, - /// A signed 16-bit integer. - Int16, - /// A signed 32-bit integer. - Int32, - /// A signed 64-bit integer. - Int64, - /// An unsigned 8-bit integer. - UInt8, - /// An unsigned 16-bit integer. - UInt16, - /// An unsigned 32-bit integer. - UInt32, - /// An unsigned 64-bit integer. - UInt64, - /// A 16-bit floating point number. - Float16, - /// A 32-bit floating point number. - Float32, - /// A 64-bit floating point number. - Float64, - /// A timestamp with an optional timezone. - /// - /// Time is measured as a Unix epoch, counting the seconds from - /// 00:00:00.000 on 1 January 1970, excluding leap seconds, - /// as a 64-bit integer. - /// - /// The time zone is a string indicating the name of a time zone, one of: - /// - /// * As used in the Olson time zone database (the "tz database" or - /// "tzdata"), such as "America/New_York" - /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 - /// - /// Timestamps with a non-empty timezone - /// ------------------------------------ - /// - /// If a Timestamp column has a non-empty timezone value, its epoch is - /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone - /// (the Unix epoch), regardless of the Timestamp's own timezone. - /// - /// Therefore, timestamp values with a non-empty timezone correspond to - /// physical points in time together with some additional information about - /// how the data was obtained and/or how to display it (the timezone). - /// - /// For example, the timestamp value 0 with the timezone string "Europe/Paris" - /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the - /// application may prefer to display it as "January 1st 1970, 01h00" in - /// the Europe/Paris timezone (which is the same physical point in time). - /// - /// One consequence is that timestamp values with a non-empty timezone - /// can be compared and ordered directly, since they all share the same - /// well-known point of reference (the Unix epoch). - /// - /// Timestamps with an unset / empty timezone - /// ----------------------------------------- - /// - /// If a Timestamp column has no timezone value, its epoch is - /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone. - /// - /// Therefore, timestamp values without a timezone cannot be meaningfully - /// interpreted as physical points in time, but only as calendar / clock - /// indications ("wall clock time") in an unspecified timezone. 
- /// - /// For example, the timestamp value 0 with an empty timezone string - /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there - /// is not enough information to interpret it as a well-defined physical - /// point in time. - /// - /// One consequence is that timestamp values without a timezone cannot - /// be reliably compared or ordered, since they may have different points of - /// reference. In particular, it is *not* possible to interpret an unset - /// or empty timezone as the same as "UTC". - /// - /// Conversion between timezones - /// ---------------------------- - /// - /// If a Timestamp column has a non-empty timezone, changing the timezone - /// to a different non-empty value is a metadata-only operation: - /// the timestamp values need not change as their point of reference remains - /// the same (the Unix epoch). - /// - /// However, if a Timestamp column has no timezone value, changing it to a - /// non-empty value requires to think about the desired semantics. - /// One possibility is to assume that the original timestamp values are - /// relative to the epoch of the timezone being set; timestamp values should - /// then adjusted to the Unix epoch (for example, changing the timezone from - /// empty to "Europe/Paris" would require converting the timestamp values - /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is - /// nevertheless correct). - Timestamp(TimeUnit, Option), - /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) - /// in days (32 bits). - Date32, - /// A 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) - /// in milliseconds (64 bits). Values are evenly divisible by 86400000. - Date64, - /// A 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. - Time32(TimeUnit), - /// A 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. - Time64(TimeUnit), - /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. - Duration(TimeUnit), - /// A "calendar" interval which models types that don't necessarily - /// have a precise duration without the context of a base timestamp (e.g. - /// days can differ in length during day light savings time transitions). - Interval(IntervalUnit), - /// Opaque binary data of variable length. - Binary, - /// Opaque binary data of fixed size. - /// Enum parameter specifies the number of bytes per value. - FixedSizeBinary(i32), - /// Opaque binary data of variable length and 64-bit offsets. - LargeBinary, - /// A variable-length string in Unicode with UTF-8 encoding. - Utf8, - /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets. - LargeUtf8, - /// A list of some logical data type with variable length. - List(Box), - /// A list of some logical data type with fixed length. - FixedSizeList(Box, i32), - /// A list of some logical data type with variable length and 64-bit offsets. - LargeList(Box), - /// A nested datatype that contains a number of sub-fields. - Struct(Vec), - /// A nested datatype that can represent slots of differing types. Components: - /// - /// 1. [`Field`] for each possible child type the Union can hold - /// 2. The corresponding `type_id` used to identify which Field - /// 3. The type of union (Sparse or Dense) - Union(Vec, Vec, UnionMode), - /// A dictionary encoded array (`key_type`, `value_type`), where - /// each array element is an index of `key_type` into an - /// associated dictionary of `value_type`. 
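To make the dictionary encoding described in the doc comment above concrete, a small standalone sketch using plain Vecs rather than arrow arrays (purely illustrative):

    // Repeated values are stored once; the column itself holds integer keys.
    let values = vec!["foo", "bar"];          // dictionary of value_type (Utf8)
    let keys: Vec<i32> = vec![0, 1, 0, 0, 1]; // key_type (Int32) indices
    let decoded: Vec<&str> = keys.iter().map(|&k| values[k as usize]).collect();
    assert_eq!(decoded, ["foo", "bar", "foo", "foo", "bar"]);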
- /// - /// Dictionary arrays are used to store columns of `value_type` - /// that contain many repeated values using less memory, but with - /// a higher CPU overhead for some operations. - /// - /// This type mostly used to represent low cardinality string - /// arrays or a limited set of primitive types as integers. - Dictionary(Box, Box), - /// Exact 128-bit width decimal value with precision and scale - /// - /// * precision is the total number of digits - /// * scale is the number of digits past the decimal - /// - /// For example the number 123.45 has precision 5 and scale 2. - Decimal128(u8, u8), - /// Exact 256-bit width decimal value with precision and scale - /// - /// * precision is the total number of digits - /// * scale is the number of digits past the decimal - /// - /// For example the number 123.45 has precision 5 and scale 2. - Decimal256(u8, u8), - /// A Map is a logical nested type that is represented as - /// - /// `List>` - /// - /// The keys and values are each respectively contiguous. - /// The key and value types are not constrained, but keys should be - /// hashable and unique. - /// Whether the keys are sorted can be set in the `bool` after the `Field`. - /// - /// In a field with Map type, the field has a child Struct field, which then - /// has two children: key type and the second the value type. The names of the - /// child fields may be respectively "entries", "key", and "value", but this is - /// not enforced. - Map(Box, bool), -} - -/// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds. -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] -pub enum TimeUnit { - /// Time in seconds. - Second, - /// Time in milliseconds. - Millisecond, - /// Time in microseconds. - Microsecond, - /// Time in nanoseconds. - Nanosecond, -} - -/// YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO interval in SQL style. -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] -pub enum IntervalUnit { - /// Indicates the number of elapsed whole months, stored as 4-byte integers. - YearMonth, - /// Indicates the number of elapsed days and milliseconds, - /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8-bytes in total). - DayTime, - /// A triple of the number of elapsed months, days, and nanoseconds. - /// The values are stored contiguously in 16 byte blocks. Months and - /// days are encoded as 32 bit integers and nanoseconds is encoded as a - /// 64 bit integer. All integers are signed. Each field is independent - /// (e.g. there is no constraint that nanoseconds have the same sign - /// as days or that the quantity of nanoseconds represents less - /// than a day's worth of time). - MonthDayNano, -} - -// Sparse or Dense union layouts -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] -pub enum UnionMode { - Sparse, - Dense, -} - -impl fmt::Display for DataType { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{:?}", self) - } -} +use num::BigInt; +use std::cmp::Ordering; // MAX decimal256 value of little-endian format for each precision. 
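The precision and scale convention for the decimal variants documented above can be checked with ordinary integer arithmetic. A self-contained sketch, not part of the crate:

    fn main() {
        // Decimal128(precision, scale): 123.45 is the unscaled i128 12345 with scale 2.
        let scale: u32 = 2;
        let unscaled: i128 = 12345;
        assert_eq!(unscaled as f64 / 10i128.pow(scale) as f64, 123.45);
        // Precision 5 allows at most five significant decimal digits.
        assert_eq!(unscaled.abs().to_string().len(), 5);
    }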
// Each element is the max value of signed 256-bit integer for the specified precision which @@ -887,7 +641,7 @@ pub(crate) const MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [[u8; 32]; 76] = [ ]; /// `MAX_DECIMAL_FOR_EACH_PRECISION[p]` holds the maximum `i128` value -/// that can be stored in [DataType::Decimal128] value of precision `p` +/// that can be stored in [arrow_schema::DataType::Decimal128] value of precision `p` pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ 9, 99, @@ -930,7 +684,7 @@ pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ ]; /// `MIN_DECIMAL_FOR_EACH_PRECISION[p]` holds the minimum `i128` value -/// that can be stored in a [DataType::Decimal128] value of precision `p` +/// that can be stored in a [arrow_schema::DataType::Decimal128] value of precision `p` pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ -9, -99, @@ -972,19 +726,20 @@ pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ -99999999999999999999999999999999999999, ]; -/// The maximum precision for [DataType::Decimal128] values +/// The maximum precision for [arrow_schema::DataType::Decimal128] values pub const DECIMAL128_MAX_PRECISION: u8 = 38; -/// The maximum scale for [DataType::Decimal128] values +/// The maximum scale for [arrow_schema::DataType::Decimal128] values pub const DECIMAL128_MAX_SCALE: u8 = 38; -/// The maximum precision for [DataType::Decimal256] values +/// The maximum precision for [arrow_schema::DataType::Decimal256] values pub const DECIMAL256_MAX_PRECISION: u8 = 76; -/// The maximum scale for [DataType::Decimal256] values +/// The maximum scale for [arrow_schema::DataType::Decimal256] values pub const DECIMAL256_MAX_SCALE: u8 = 76; -/// The default scale for [DataType::Decimal128] and [DataType::Decimal256] values +/// The default scale for [arrow_schema::DataType::Decimal128] and +/// [arrow_schema::DataType::Decimal256] values pub const DECIMAL_DEFAULT_SCALE: u8 = 10; /// Validates that the specified `i128` value can be properly @@ -1051,124 +806,9 @@ pub(crate) fn validate_decimal256_precision_with_lt_bytes( } } -impl DataType { - /// Returns true if this type is numeric: (UInt*, Int*, or Float*). - pub fn is_numeric(t: &DataType) -> bool { - use DataType::*; - matches!( - t, - UInt8 - | UInt16 - | UInt32 - | UInt64 - | Int8 - | Int16 - | Int32 - | Int64 - | Float32 - | Float64 - ) - } - - /// Returns true if the type is primitive: (numeric, temporal). - pub fn is_primitive(t: &DataType) -> bool { - use DataType::*; - matches!( - t, - Int8 | Int16 - | Int32 - | Int64 - | UInt8 - | UInt16 - | UInt32 - | UInt64 - | Float32 - | Float64 - | Date32 - | Date64 - | Time32(_) - | Time64(_) - | Timestamp(_, _) - | Interval(_) - | Duration(_) - ) - } - - /// Returns true if this type is temporal: (Date*, Time*, Duration, or Interval). - pub fn is_temporal(t: &DataType) -> bool { - use DataType::*; - matches!( - t, - Date32 - | Date64 - | Timestamp(_, _) - | Time32(_) - | Time64(_) - | Duration(_) - | Interval(_) - ) - } - - /// Returns true if this type is valid as a dictionary key - /// (e.g. 
[`super::ArrowDictionaryKeyType`] - pub fn is_dictionary_key_type(t: &DataType) -> bool { - use DataType::*; - matches!( - t, - UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 - ) - } - - /// Returns true if this type is nested (List, FixedSizeList, LargeList, Struct, Union, or Map) - pub fn is_nested(t: &DataType) -> bool { - use DataType::*; - matches!( - t, - List(_) - | FixedSizeList(_, _) - | LargeList(_) - | Struct(_) - | Union(_, _, _) - | Map(_, _) - ) - } - - /// Compares the datatype with another, ignoring nested field names - /// and metadata. - pub fn equals_datatype(&self, other: &DataType) -> bool { - match (&self, other) { - (DataType::List(a), DataType::List(b)) - | (DataType::LargeList(a), DataType::LargeList(b)) => { - a.is_nullable() == b.is_nullable() - && a.data_type().equals_datatype(b.data_type()) - } - (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => { - a_size == b_size - && a.is_nullable() == b.is_nullable() - && a.data_type().equals_datatype(b.data_type()) - } - (DataType::Struct(a), DataType::Struct(b)) => { - a.len() == b.len() - && a.iter().zip(b).all(|(a, b)| { - a.is_nullable() == b.is_nullable() - && a.data_type().equals_datatype(b.data_type()) - }) - } - ( - DataType::Map(a_field, a_is_sorted), - DataType::Map(b_field, b_is_sorted), - ) => a_field == b_field && a_is_sorted == b_is_sorted, - _ => self == other, - } - } -} - #[cfg(test)] mod test { - use crate::datatypes::datatype::{ - MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION, - MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION, - }; + use super::*; use crate::util::decimal::Decimal256; use num::{BigInt, Num}; diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 1586d563cd3f..2f83871127fd 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -26,17 +26,15 @@ use std::sync::Arc; mod native; pub use native::*; -mod field; -pub use field::*; -mod schema; -pub use schema::*; mod numeric; pub use numeric::*; mod types; pub use types::*; -mod datatype; -pub use datatype::*; +mod decimal; mod delta; +pub use decimal::*; + +pub use arrow_schema::{DataType, Field, IntervalUnit, Schema, TimeUnit, UnionMode}; #[cfg(feature = "ffi")] mod ffi; @@ -45,550 +43,3 @@ pub use ffi::*; /// A reference-counted reference to a [`Schema`](crate::datatypes::Schema). 
pub type SchemaRef = Arc; - -#[cfg(test)] -mod tests { - use super::*; - use crate::error::Result; - use std::collections::{BTreeMap, HashMap}; - - #[cfg(feature = "json")] - use crate::json::JsonSerializable; - - #[cfg(feature = "json")] - use serde_json::{ - Number, - Value::{Bool, Number as VNumber, String as VString}, - }; - - #[test] - fn test_list_datatype_equality() { - // tests that list type equality is checked while ignoring list names - let list_a = DataType::List(Box::new(Field::new("item", DataType::Int32, true))); - let list_b = DataType::List(Box::new(Field::new("array", DataType::Int32, true))); - let list_c = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_d = DataType::List(Box::new(Field::new("item", DataType::UInt32, true))); - assert!(list_a.equals_datatype(&list_b)); - assert!(!list_a.equals_datatype(&list_c)); - assert!(!list_b.equals_datatype(&list_c)); - assert!(!list_a.equals_datatype(&list_d)); - - let list_e = - DataType::FixedSizeList(Box::new(Field::new("item", list_a, false)), 3); - let list_f = - DataType::FixedSizeList(Box::new(Field::new("array", list_b, false)), 3); - let list_g = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::FixedSizeBinary(3), true)), - 3, - ); - assert!(list_e.equals_datatype(&list_f)); - assert!(!list_e.equals_datatype(&list_g)); - assert!(!list_f.equals_datatype(&list_g)); - - let list_h = DataType::Struct(vec![Field::new("f1", list_e, true)]); - let list_i = DataType::Struct(vec![Field::new("f1", list_f.clone(), true)]); - let list_j = DataType::Struct(vec![Field::new("f1", list_f.clone(), false)]); - let list_k = DataType::Struct(vec![ - Field::new("f1", list_f.clone(), false), - Field::new("f2", list_g.clone(), false), - Field::new("f3", DataType::Utf8, true), - ]); - let list_l = DataType::Struct(vec![ - Field::new("ff1", list_f.clone(), false), - Field::new("ff2", list_g.clone(), false), - Field::new("ff3", DataType::LargeUtf8, true), - ]); - let list_m = DataType::Struct(vec![ - Field::new("ff1", list_f, false), - Field::new("ff2", list_g, false), - Field::new("ff3", DataType::Utf8, true), - ]); - assert!(list_h.equals_datatype(&list_i)); - assert!(!list_h.equals_datatype(&list_j)); - assert!(!list_k.equals_datatype(&list_l)); - assert!(list_k.equals_datatype(&list_m)); - } - - #[test] - #[cfg(feature = "json")] - fn create_struct_type() { - let _person = DataType::Struct(vec![ - Field::new("first_name", DataType::Utf8, false), - Field::new("last_name", DataType::Utf8, false), - Field::new( - "address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ), - ]); - } - - #[test] - #[cfg(feature = "json")] - fn serde_struct_type() { - let kv_array = [("k".to_string(), "v".to_string())]; - let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); - - // Non-empty map: should be converted as JSON obj { ... } - let first_name = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(field_metadata)); - - // Empty map: should be omitted. 
- let last_name = Field::new("last_name", DataType::Utf8, false) - .with_metadata(Some(BTreeMap::default())); - - let person = DataType::Struct(vec![ - first_name, - last_name, - Field::new( - "address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ), - ]); - - let serialized = serde_json::to_string(&person).unwrap(); - - // NOTE that this is testing the default (derived) serialization format, not the - // JSON format specified in metadata.md - - assert_eq!( - "{\"Struct\":[\ - {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\ - {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"address\",\"data_type\":{\"Struct\":\ - [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}\ - ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}]}", - serialized - ); - - let deserialized = serde_json::from_str(&serialized).unwrap(); - - assert_eq!(person, deserialized); - } - - #[test] - fn create_schema_string() { - let schema = person_schema(); - assert_eq!(schema.to_string(), - "Field { name: \"first_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: Some({\"k\": \"v\"}) }, \ - Field { name: \"last_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ - Field { name: \"address\", data_type: Struct([\ - Field { name: \"street\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ - Field { name: \"zip\", data_type: UInt16, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }\ - ]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ - Field { name: \"interests\", data_type: Dictionary(Int32, Utf8), nullable: true, dict_id: 123, dict_is_ordered: true, metadata: None }") - } - - #[test] - fn schema_field_accessors() { - let schema = person_schema(); - - // test schema accessors - assert_eq!(schema.fields().len(), 4); - - // test field accessors - let first_name = &schema.fields()[0]; - assert_eq!(first_name.name(), "first_name"); - assert_eq!(first_name.data_type(), &DataType::Utf8); - assert!(!first_name.is_nullable()); - assert_eq!(first_name.dict_id(), None); - assert_eq!(first_name.dict_is_ordered(), None); - - let metadata = first_name.metadata(); - assert!(metadata.is_some()); - let md = metadata.as_ref().unwrap(); - assert_eq!(md.len(), 1); - let key = md.get("k"); - assert!(key.is_some()); - assert_eq!(key.unwrap(), "v"); - - let interests = &schema.fields()[3]; - assert_eq!(interests.name(), "interests"); - assert_eq!( - interests.data_type(), - &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) - ); - assert_eq!(interests.dict_id(), Some(123)); - assert_eq!(interests.dict_is_ordered(), Some(true)); - } - - #[test] - #[should_panic( - expected = "Unable to get field named \\\"nickname\\\". 
Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\", \\\"interests\\\"]" - )] - fn schema_index_of() { - let schema = person_schema(); - assert_eq!(schema.index_of("first_name").unwrap(), 0); - assert_eq!(schema.index_of("last_name").unwrap(), 1); - schema.index_of("nickname").unwrap(); - } - - #[test] - #[should_panic( - expected = "Unable to get field named \\\"nickname\\\". Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\", \\\"interests\\\"]" - )] - fn schema_field_with_name() { - let schema = person_schema(); - assert_eq!( - schema.field_with_name("first_name").unwrap().name(), - "first_name" - ); - assert_eq!( - schema.field_with_name("last_name").unwrap().name(), - "last_name" - ); - schema.field_with_name("nickname").unwrap(); - } - - #[test] - fn schema_field_with_dict_id() { - let schema = person_schema(); - - let fields_dict_123: Vec<_> = schema - .fields_with_dict_id(123) - .iter() - .map(|f| f.name()) - .collect(); - assert_eq!(fields_dict_123, vec!["interests"]); - - assert!(schema.fields_with_dict_id(456).is_empty()); - } - - #[test] - fn schema_equality() { - let schema1 = Schema::new(vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Float64, true), - Field::new("c3", DataType::LargeBinary, true), - ]); - let schema2 = Schema::new(vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Float64, true), - Field::new("c3", DataType::LargeBinary, true), - ]); - - assert_eq!(schema1, schema2); - - let schema3 = Schema::new(vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Float32, true), - ]); - let schema4 = Schema::new(vec![ - Field::new("C1", DataType::Utf8, false), - Field::new("C2", DataType::Float64, true), - ]); - - assert!(schema1 != schema3); - assert!(schema1 != schema4); - assert!(schema2 != schema3); - assert!(schema2 != schema4); - assert!(schema3 != schema4); - - let f = Field::new("c1", DataType::Utf8, false).with_metadata(Some( - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect(), - )); - let schema5 = Schema::new(vec![ - f, - Field::new("c2", DataType::Float64, true), - Field::new("c3", DataType::LargeBinary, true), - ]); - assert!(schema1 != schema5); - } - - #[test] - #[cfg(feature = "json")] - fn test_arrow_native_type_to_json() { - assert_eq!(Some(Bool(true)), true.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1i8.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1i16.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1i32.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1i64.into_json_value()); - assert_eq!(Some(VString("1".to_string())), 1i128.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1u8.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1u16.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1u32.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1u64.into_json_value()); - assert_eq!( - Some(VNumber(Number::from_f64(0.01f64).unwrap())), - 0.01.into_json_value() - ); - assert_eq!( - Some(VNumber(Number::from_f64(0.01f64).unwrap())), - 0.01f64.into_json_value() - ); - assert_eq!(None, f32::NAN.into_json_value()); - } - - fn person_schema() -> Schema { - let kv_array = [("k".to_string(), "v".to_string())]; - let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); - let first_name = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(field_metadata)); - - 
Schema::new(vec![ - first_name, - Field::new("last_name", DataType::Utf8, false), - Field::new( - "address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ), - Field::new_dict( - "interests", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - 123, - true, - ), - ]) - } - - #[test] - fn test_try_merge_field_with_metadata() { - // 1. Different values for the same key should cause error. - let metadata1: BTreeMap = - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect(); - let f1 = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(metadata1)); - - let metadata2: BTreeMap = - [("foo".to_string(), "baz".to_string())] - .iter() - .cloned() - .collect(); - let f2 = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(metadata2)); - - assert!( - Schema::try_merge(vec![Schema::new(vec![f1]), Schema::new(vec![f2])]) - .is_err() - ); - - // 2. None + Some - let mut f1 = Field::new("first_name", DataType::Utf8, false); - let metadata2: BTreeMap = - [("missing".to_string(), "value".to_string())] - .iter() - .cloned() - .collect(); - let f2 = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(metadata2)); - - assert!(f1.try_merge(&f2).is_ok()); - assert!(f1.metadata().is_some()); - assert_eq!( - f1.metadata().as_ref().unwrap(), - f2.metadata().as_ref().unwrap() - ); - - // 3. Some + Some - let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect(), - )); - let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( - [("foo2".to_string(), "bar2".to_string())] - .iter() - .cloned() - .collect(), - )); - - assert!(f1.try_merge(&f2).is_ok()); - assert!(f1.metadata().is_some()); - assert_eq!( - f1.metadata().cloned().unwrap(), - [ - ("foo".to_string(), "bar".to_string()), - ("foo2".to_string(), "bar2".to_string()) - ] - .iter() - .cloned() - .collect() - ); - - // 4. Some + None. - let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect(), - )); - let f2 = Field::new("first_name", DataType::Utf8, false); - assert!(f1.try_merge(&f2).is_ok()); - assert!(f1.metadata().is_some()); - assert_eq!( - f1.metadata().cloned().unwrap(), - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect() - ); - - // 5. None + None. 
- let mut f1 = Field::new("first_name", DataType::Utf8, false); - let f2 = Field::new("first_name", DataType::Utf8, false); - assert!(f1.try_merge(&f2).is_ok()); - assert!(f1.metadata().is_none()); - } - - #[test] - fn test_schema_merge() -> Result<()> { - let merged = Schema::try_merge(vec![ - Schema::new(vec![ - Field::new("first_name", DataType::Utf8, false), - Field::new("last_name", DataType::Utf8, false), - Field::new( - "address", - DataType::Struct(vec![Field::new("zip", DataType::UInt16, false)]), - false, - ), - ]), - Schema::new_with_metadata( - vec![ - // nullable merge - Field::new("last_name", DataType::Utf8, true), - Field::new( - "address", - DataType::Struct(vec![ - // add new nested field - Field::new("street", DataType::Utf8, false), - // nullable merge on nested field - Field::new("zip", DataType::UInt16, true), - ]), - false, - ), - // new field - Field::new("number", DataType::Utf8, true), - ], - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect::>(), - ), - ])?; - - assert_eq!( - merged, - Schema::new_with_metadata( - vec![ - Field::new("first_name", DataType::Utf8, false), - Field::new("last_name", DataType::Utf8, true), - Field::new( - "address", - DataType::Struct(vec![ - Field::new("zip", DataType::UInt16, true), - Field::new("street", DataType::Utf8, false), - ]), - false, - ), - Field::new("number", DataType::Utf8, true), - ], - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect::>() - ) - ); - - // support merge union fields - assert_eq!( - Schema::try_merge(vec![ - Schema::new(vec![Field::new( - "c1", - DataType::Union( - vec![ - Field::new("c11", DataType::Utf8, true), - Field::new("c12", DataType::Utf8, true), - ], - vec![0, 1], - UnionMode::Dense - ), - false - ),]), - Schema::new(vec![Field::new( - "c1", - DataType::Union( - vec![ - Field::new("c12", DataType::Utf8, true), - Field::new("c13", DataType::Time64(TimeUnit::Second), true), - ], - vec![1, 2], - UnionMode::Dense - ), - false - ),]) - ])?, - Schema::new(vec![Field::new( - "c1", - DataType::Union( - vec![ - Field::new("c11", DataType::Utf8, true), - Field::new("c12", DataType::Utf8, true), - Field::new("c13", DataType::Time64(TimeUnit::Second), true), - ], - vec![0, 1, 2], - UnionMode::Dense - ), - false - ),]), - ); - - // incompatible field should throw error - assert!(Schema::try_merge(vec![ - Schema::new(vec![ - Field::new("first_name", DataType::Utf8, false), - Field::new("last_name", DataType::Utf8, false), - ]), - Schema::new(vec![Field::new("last_name", DataType::Int64, false),]) - ]) - .is_err()); - - // incompatible metadata should throw error - let res = Schema::try_merge(vec![ - Schema::new_with_metadata( - vec![Field::new("first_name", DataType::Utf8, false)], - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect::>(), - ), - Schema::new_with_metadata( - vec![Field::new("last_name", DataType::Utf8, false)], - [("foo".to_string(), "baz".to_string())] - .iter() - .cloned() - .collect::>(), - ), - ]) - .unwrap_err(); - - let expected = "Fail to merge schema due to conflicting metadata. 
Key 'foo' has different values 'bar' and 'baz'"; - assert!( - res.to_string().contains(expected), - "Could not find expected string '{}' in '{}'", - expected, - res - ); - - Ok(()) - } -} diff --git a/arrow/src/datatypes/schema.rs b/arrow/src/datatypes/schema.rs deleted file mode 100644 index b0eca6114742..000000000000 --- a/arrow/src/datatypes/schema.rs +++ /dev/null @@ -1,386 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::collections::HashMap; -use std::fmt; -use std::hash::Hash; - -use crate::error::{ArrowError, Result}; - -use super::Field; - -/// Describes the meta-data of an ordered sequence of relative types. -/// -/// Note that this information is only part of the meta-data and not part of the physical -/// memory layout. -#[derive(Debug, Clone, PartialEq, Eq)] -#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] -pub struct Schema { - pub fields: Vec, - /// A map of key-value pairs containing additional meta data. - #[cfg_attr( - feature = "serde", - serde(skip_serializing_if = "HashMap::is_empty", default) - )] - pub metadata: HashMap, -} - -impl Schema { - /// Creates an empty `Schema` - pub fn empty() -> Self { - Self { - fields: vec![], - metadata: HashMap::new(), - } - } - - /// Creates a new [`Schema`] from a sequence of [`Field`] values. - /// - /// # Example - /// - /// ``` - /// # use arrow::datatypes::{Field, DataType, Schema}; - /// let field_a = Field::new("a", DataType::Int64, false); - /// let field_b = Field::new("b", DataType::Boolean, false); - /// - /// let schema = Schema::new(vec![field_a, field_b]); - /// ``` - pub fn new(fields: Vec) -> Self { - Self::new_with_metadata(fields, HashMap::new()) - } - - /// Creates a new [`Schema`] from a sequence of [`Field`] values - /// and adds additional metadata in form of key value pairs. 
- /// - /// # Example - /// - /// ``` - /// # use arrow::datatypes::{Field, DataType, Schema}; - /// # use std::collections::HashMap; - /// let field_a = Field::new("a", DataType::Int64, false); - /// let field_b = Field::new("b", DataType::Boolean, false); - /// - /// let mut metadata: HashMap = HashMap::new(); - /// metadata.insert("row_count".to_string(), "100".to_string()); - /// - /// let schema = Schema::new_with_metadata(vec![field_a, field_b], metadata); - /// ``` - #[inline] - pub const fn new_with_metadata( - fields: Vec, - metadata: HashMap, - ) -> Self { - Self { fields, metadata } - } - - /// Sets the metadata of this `Schema` to be `metadata` and returns self - pub fn with_metadata(mut self, metadata: HashMap) -> Self { - self.metadata = metadata; - self - } - - /// Returns a new schema with only the specified columns in the new schema - /// This carries metadata from the parent schema over as well - pub fn project(&self, indices: &[usize]) -> Result { - let new_fields = indices - .iter() - .map(|i| { - self.fields.get(*i).cloned().ok_or_else(|| { - ArrowError::SchemaError(format!( - "project index {} out of bounds, max field {}", - i, - self.fields().len() - )) - }) - }) - .collect::>>()?; - Ok(Self::new_with_metadata(new_fields, self.metadata.clone())) - } - - /// Merge schema into self if it is compatible. Struct fields will be merged recursively. - /// - /// Example: - /// - /// ``` - /// use arrow::datatypes::*; - /// - /// let merged = Schema::try_merge(vec![ - /// Schema::new(vec![ - /// Field::new("c1", DataType::Int64, false), - /// Field::new("c2", DataType::Utf8, false), - /// ]), - /// Schema::new(vec![ - /// Field::new("c1", DataType::Int64, true), - /// Field::new("c2", DataType::Utf8, false), - /// Field::new("c3", DataType::Utf8, false), - /// ]), - /// ]).unwrap(); - /// - /// assert_eq!( - /// merged, - /// Schema::new(vec![ - /// Field::new("c1", DataType::Int64, true), - /// Field::new("c2", DataType::Utf8, false), - /// Field::new("c3", DataType::Utf8, false), - /// ]), - /// ); - /// ``` - pub fn try_merge(schemas: impl IntoIterator) -> Result { - schemas - .into_iter() - .try_fold(Self::empty(), |mut merged, schema| { - let Schema { metadata, fields } = schema; - for (key, value) in metadata.into_iter() { - // merge metadata - if let Some(old_val) = merged.metadata.get(&key) { - if old_val != &value { - return Err(ArrowError::SchemaError(format!( - "Fail to merge schema due to conflicting metadata. \ - Key '{}' has different values '{}' and '{}'", - key, old_val, value - ))); - } - } - merged.metadata.insert(key, value); - } - // merge fields - for field in fields.into_iter() { - let merged_field = - merged.fields.iter_mut().find(|f| f.name() == field.name()); - match merged_field { - Some(merged_field) => merged_field.try_merge(&field)?, - // found a new field, add to field list - None => merged.fields.push(field), - } - } - Ok(merged) - }) - } - - /// Returns an immutable reference of the vector of `Field` instances. - #[inline] - pub const fn fields(&self) -> &Vec { - &self.fields - } - - /// Returns a vector with references to all fields (including nested fields) - #[inline] - #[cfg(feature = "ipc")] - pub(crate) fn all_fields(&self) -> Vec<&Field> { - self.fields.iter().flat_map(|f| f.fields()).collect() - } - - /// Returns an immutable reference of a specific [`Field`] instance selected using an - /// offset within the internal `fields` vector. 
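Unlike try_merge, the project method above carries no doc example. A short usage sketch, with invented field names and assuming the types stay reachable through the arrow::datatypes re-exports:

    use arrow::datatypes::{DataType, Field, Schema};

    let schema = Schema::new(vec![
        Field::new("name", DataType::Utf8, false),
        Field::new("age", DataType::UInt8, false),
        Field::new("city", DataType::Utf8, true),
    ]);
    // Keep only columns 0 and 2; schema-level metadata is carried over.
    let projected = schema.project(&[0, 2]).unwrap();
    assert_eq!(projected.fields().len(), 2);
    assert_eq!(projected.field(1).name(), "city");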
- pub fn field(&self, i: usize) -> &Field { - &self.fields[i] - } - - /// Returns an immutable reference of a specific [`Field`] instance selected by name. - pub fn field_with_name(&self, name: &str) -> Result<&Field> { - Ok(&self.fields[self.index_of(name)?]) - } - - /// Returns a vector of immutable references to all [`Field`] instances selected by - /// the dictionary ID they use. - pub fn fields_with_dict_id(&self, dict_id: i64) -> Vec<&Field> { - self.fields - .iter() - .flat_map(|f| f.fields_with_dict_id(dict_id)) - .collect() - } - - /// Find the index of the column with the given name. - pub fn index_of(&self, name: &str) -> Result { - (0..self.fields.len()) - .find(|idx| self.fields[*idx].name() == name) - .ok_or_else(|| { - let valid_fields: Vec = - self.fields.iter().map(|f| f.name().clone()).collect(); - ArrowError::InvalidArgumentError(format!( - "Unable to get field named \"{}\". Valid fields: {:?}", - name, valid_fields - )) - }) - } - - /// Returns an immutable reference to the Map of custom metadata key-value pairs. - #[inline] - pub const fn metadata(&self) -> &HashMap { - &self.metadata - } - - /// Look up a column by name and return a immutable reference to the column along with - /// its index. - pub fn column_with_name(&self, name: &str) -> Option<(usize, &Field)> { - self.fields - .iter() - .enumerate() - .find(|&(_, c)| c.name() == name) - } - - /// Check to see if `self` is a superset of `other` schema. Here are the comparison rules: - /// - /// * `self` and `other` should contain the same number of fields - /// * for every field `f` in `other`, the field in `self` with corresponding index should be a - /// superset of `f`. - /// * self.metadata is a superset of other.metadata - /// - /// In other words, any record conforms to `other` should also conform to `self`. 
- pub fn contains(&self, other: &Schema) -> bool { - self.fields.len() == other.fields.len() - && self.fields.iter().zip(other.fields.iter()).all(|(f1, f2)| f1.contains(f2)) - // make sure self.metadata is a superset of other.metadata - && other.metadata.iter().all(|(k, v1)| match self.metadata.get(k) { - Some(v2) => v1 == v2, - _ => false, - }) - } -} - -impl fmt::Display for Schema { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.write_str( - &self - .fields - .iter() - .map(|c| c.to_string()) - .collect::>() - .join(", "), - ) - } -} - -// need to implement `Hash` manually because `HashMap` implement Eq but no `Hash` -#[allow(clippy::derive_hash_xor_eq)] -impl Hash for Schema { - fn hash(&self, state: &mut H) { - self.fields.hash(state); - - // ensure deterministic key order - let mut keys: Vec<&String> = self.metadata.keys().collect(); - keys.sort(); - for k in keys { - k.hash(state); - self.metadata.get(k).expect("key valid").hash(state); - } - } -} - -#[cfg(test)] -mod tests { - use crate::datatypes::DataType; - - use super::*; - - #[test] - #[cfg(feature = "json")] - fn test_ser_de_metadata() { - // ser/de with empty metadata - let schema = Schema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("address", DataType::Utf8, false), - Field::new("priority", DataType::UInt8, false), - ]); - - let json = serde_json::to_string(&schema).unwrap(); - let de_schema = serde_json::from_str(&json).unwrap(); - - assert_eq!(schema, de_schema); - - // ser/de with non-empty metadata - let schema = schema - .with_metadata([("key".to_owned(), "val".to_owned())].into_iter().collect()); - let json = serde_json::to_string(&schema).unwrap(); - let de_schema = serde_json::from_str(&json).unwrap(); - - assert_eq!(schema, de_schema); - } - - #[test] - fn test_projection() { - let mut metadata = HashMap::new(); - metadata.insert("meta".to_string(), "data".to_string()); - - let schema = Schema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("address", DataType::Utf8, false), - Field::new("priority", DataType::UInt8, false), - ]) - .with_metadata(metadata); - - let projected: Schema = schema.project(&[0, 2]).unwrap(); - - assert_eq!(projected.fields().len(), 2); - assert_eq!(projected.fields()[0].name(), "name"); - assert_eq!(projected.fields()[1].name(), "priority"); - assert_eq!(projected.metadata.get("meta").unwrap(), "data") - } - - #[test] - fn test_oob_projection() { - let mut metadata = HashMap::new(); - metadata.insert("meta".to_string(), "data".to_string()); - - let schema = Schema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("address", DataType::Utf8, false), - Field::new("priority", DataType::UInt8, false), - ]) - .with_metadata(metadata); - - let projected: Result = schema.project(&[0, 3]); - - assert!(projected.is_err()); - if let Err(e) = projected { - assert_eq!( - e.to_string(), - "Schema error: project index 3 out of bounds, max field 3".to_string() - ) - } - } - - #[test] - fn test_schema_contains() { - let mut metadata1 = HashMap::new(); - metadata1.insert("meta".to_string(), "data".to_string()); - - let schema1 = Schema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("address", DataType::Utf8, false), - Field::new("priority", DataType::UInt8, false), - ]) - .with_metadata(metadata1.clone()); - - let mut metadata2 = HashMap::new(); - metadata2.insert("meta".to_string(), "data".to_string()); - metadata2.insert("meta2".to_string(), "data".to_string()); - let schema2 = Schema::new(vec![ - 
Field::new("name", DataType::Utf8, false), - Field::new("address", DataType::Utf8, false), - Field::new("priority", DataType::UInt8, false), - ]) - .with_metadata(metadata2); - - // reflexivity - assert!(schema1.contains(&schema1)); - assert!(schema2.contains(&schema2)); - - assert!(!schema1.contains(&schema2)); - assert!(schema2.contains(&schema1)); - } -} diff --git a/arrow/src/error.rs b/arrow/src/error.rs index 5d92fb930170..f7acec0b34d7 100644 --- a/arrow/src/error.rs +++ b/arrow/src/error.rs @@ -16,120 +16,7 @@ // under the License. //! Defines `ArrowError` for representing failures in various Arrow operations. -use std::fmt::{Debug, Display, Formatter}; -use std::io::Write; -use std::error::Error; - -/// Many different operations in the `arrow` crate return this error type. -#[derive(Debug)] -pub enum ArrowError { - /// Returned when functionality is not yet available. - NotYetImplemented(String), - ExternalError(Box), - CastError(String), - MemoryError(String), - ParseError(String), - SchemaError(String), - ComputeError(String), - DivideByZero, - CsvError(String), - JsonError(String), - IoError(String), - InvalidArgumentError(String), - ParquetError(String), - /// Error during import or export to/from the C Data Interface - CDataInterface(String), - DictionaryKeyOverflowError, -} - -impl ArrowError { - /// Wraps an external error in an `ArrowError`. - pub fn from_external_error( - error: Box, - ) -> Self { - Self::ExternalError(error) - } -} - -impl From<::std::io::Error> for ArrowError { - fn from(error: std::io::Error) -> Self { - ArrowError::IoError(error.to_string()) - } -} - -#[cfg(feature = "csv")] -impl From for ArrowError { - fn from(error: csv_crate::Error) -> Self { - match error.kind() { - csv_crate::ErrorKind::Io(error) => ArrowError::CsvError(error.to_string()), - csv_crate::ErrorKind::Utf8 { pos: _, err } => ArrowError::CsvError(format!( - "Encountered UTF-8 error while reading CSV file: {}", - err - )), - csv_crate::ErrorKind::UnequalLengths { - expected_len, len, .. - } => ArrowError::CsvError(format!( - "Encountered unequal lengths between records on CSV file. 
Expected {} \ - records, found {} records", - len, expected_len - )), - _ => ArrowError::CsvError("Error reading CSV file".to_string()), - } - } -} - -impl From<::std::string::FromUtf8Error> for ArrowError { - fn from(error: std::string::FromUtf8Error) -> Self { - ArrowError::ParseError(error.to_string()) - } -} - -#[cfg(feature = "json")] -impl From for ArrowError { - fn from(error: serde_json::Error) -> Self { - ArrowError::JsonError(error.to_string()) - } -} - -impl From<::std::io::IntoInnerError> for ArrowError { - fn from(error: std::io::IntoInnerError) -> Self { - ArrowError::IoError(error.to_string()) - } -} - -impl Display for ArrowError { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - ArrowError::NotYetImplemented(source) => { - write!(f, "Not yet implemented: {}", &source) - } - ArrowError::ExternalError(source) => write!(f, "External error: {}", &source), - ArrowError::CastError(desc) => write!(f, "Cast error: {}", desc), - ArrowError::MemoryError(desc) => write!(f, "Memory error: {}", desc), - ArrowError::ParseError(desc) => write!(f, "Parser error: {}", desc), - ArrowError::SchemaError(desc) => write!(f, "Schema error: {}", desc), - ArrowError::ComputeError(desc) => write!(f, "Compute error: {}", desc), - ArrowError::DivideByZero => write!(f, "Divide by zero error"), - ArrowError::CsvError(desc) => write!(f, "Csv error: {}", desc), - ArrowError::JsonError(desc) => write!(f, "Json error: {}", desc), - ArrowError::IoError(desc) => write!(f, "Io error: {}", desc), - ArrowError::InvalidArgumentError(desc) => { - write!(f, "Invalid argument error: {}", desc) - } - ArrowError::ParquetError(desc) => { - write!(f, "Parquet argument error: {}", desc) - } - ArrowError::CDataInterface(desc) => { - write!(f, "C Data interface error: {}", desc) - } - ArrowError::DictionaryKeyOverflowError => { - write!(f, "Dictionary key bigger than the key type") - } - } - } -} - -impl Error for ArrowError {} +pub use arrow_schema::ArrowError; pub type Result = std::result::Result; diff --git a/arrow/src/json/mod.rs b/arrow/src/json/mod.rs index 836145bb08e4..21f96d90a5d0 100644 --- a/arrow/src/json/mod.rs +++ b/arrow/src/json/mod.rs @@ -80,3 +80,36 @@ impl JsonSerializable for f64 { Number::from_f64(self).map(Value::Number) } } + +#[cfg(test)] +mod tests { + use super::*; + + use serde_json::{ + Number, + Value::{Bool, Number as VNumber, String as VString}, + }; + + #[test] + fn test_arrow_native_type_to_json() { + assert_eq!(Some(Bool(true)), true.into_json_value()); + assert_eq!(Some(VNumber(Number::from(1))), 1i8.into_json_value()); + assert_eq!(Some(VNumber(Number::from(1))), 1i16.into_json_value()); + assert_eq!(Some(VNumber(Number::from(1))), 1i32.into_json_value()); + assert_eq!(Some(VNumber(Number::from(1))), 1i64.into_json_value()); + assert_eq!(Some(VString("1".to_string())), 1i128.into_json_value()); + assert_eq!(Some(VNumber(Number::from(1))), 1u8.into_json_value()); + assert_eq!(Some(VNumber(Number::from(1))), 1u16.into_json_value()); + assert_eq!(Some(VNumber(Number::from(1))), 1u32.into_json_value()); + assert_eq!(Some(VNumber(Number::from(1))), 1u64.into_json_value()); + assert_eq!( + Some(VNumber(Number::from_f64(0.01f64).unwrap())), + 0.01.into_json_value() + ); + assert_eq!( + Some(VNumber(Number::from_f64(0.01f64).unwrap())), + 0.01f64.into_json_value() + ); + assert_eq!(None, f32::NAN.into_json_value()); + } +} diff --git a/arrow/src/json/writer.rs b/arrow/src/json/writer.rs index bf40b31b494e..beee02582ff8 100644 --- a/arrow/src/json/writer.rs +++ 
b/arrow/src/json/writer.rs @@ -700,7 +700,10 @@ where } self.format.start_row(&mut self.writer, is_first_row)?; - self.writer.write_all(&serde_json::to_vec(row)?)?; + self.writer.write_all( + &serde_json::to_vec(row) + .map_err(|error| ArrowError::JsonError(error.to_string()))?, + )?; self.format.end_row(&mut self.writer)?; Ok(()) } diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 89463e4c8fd3..90caa2e3a5c7 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -39,10 +39,8 @@ use crate::record_batch::RecordBatch; import_exception!(pyarrow, ArrowException); pub type PyArrowException = ArrowException; -impl From for PyErr { - fn from(err: ArrowError) -> PyErr { - PyArrowException::new_err(err.to_string()) - } +fn to_py_err(err: ArrowError) -> PyErr { + PyArrowException::new_err(err.to_string()) } pub trait PyArrowConvert: Sized { @@ -55,12 +53,12 @@ impl PyArrowConvert for DataType { let c_schema = FFI_ArrowSchema::empty(); let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; value.call_method1("_export_to_c", (c_schema_ptr as Py_uintptr_t,))?; - let dtype = DataType::try_from(&c_schema)?; + let dtype = DataType::try_from(&c_schema).map_err(to_py_err)?; Ok(dtype) } fn to_pyarrow(&self, py: Python) -> PyResult { - let c_schema = FFI_ArrowSchema::try_from(self)?; + let c_schema = FFI_ArrowSchema::try_from(self).map_err(to_py_err)?; let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; let module = py.import("pyarrow")?; let class = module.getattr("DataType")?; @@ -75,12 +73,12 @@ impl PyArrowConvert for Field { let c_schema = FFI_ArrowSchema::empty(); let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; value.call_method1("_export_to_c", (c_schema_ptr as Py_uintptr_t,))?; - let field = Field::try_from(&c_schema)?; + let field = Field::try_from(&c_schema).map_err(to_py_err)?; Ok(field) } fn to_pyarrow(&self, py: Python) -> PyResult { - let c_schema = FFI_ArrowSchema::try_from(self)?; + let c_schema = FFI_ArrowSchema::try_from(self).map_err(to_py_err)?; let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; let module = py.import("pyarrow")?; let class = module.getattr("Field")?; @@ -95,12 +93,12 @@ impl PyArrowConvert for Schema { let c_schema = FFI_ArrowSchema::empty(); let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; value.call_method1("_export_to_c", (c_schema_ptr as Py_uintptr_t,))?; - let schema = Schema::try_from(&c_schema)?; + let schema = Schema::try_from(&c_schema).map_err(to_py_err)?; Ok(schema) } fn to_pyarrow(&self, py: Python) -> PyResult { - let c_schema = FFI_ArrowSchema::try_from(self)?; + let c_schema = FFI_ArrowSchema::try_from(self).map_err(to_py_err)?; let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; let module = py.import("pyarrow")?; let class = module.getattr("Schema")?; @@ -127,15 +125,17 @@ impl PyArrowConvert for ArrayData { ), )?; - let ffi_array = - unsafe { ffi::ArrowArray::try_from_raw(array_pointer, schema_pointer)? }; - let data = ArrayData::try_from(ffi_array)?; + let ffi_array = unsafe { + ffi::ArrowArray::try_from_raw(array_pointer, schema_pointer) + .map_err(to_py_err)? 
+ }; + let data = ArrayData::try_from(ffi_array).map_err(to_py_err)?; Ok(data) } fn to_pyarrow(&self, py: Python) -> PyResult { - let array = ffi::ArrowArray::try_from(self.clone())?; + let array = ffi::ArrowArray::try_from(self.clone()).map_err(to_py_err)?; let (array_pointer, schema_pointer) = ffi::ArrowArray::into_raw(array); let module = py.import("pyarrow")?; @@ -151,6 +151,21 @@ impl PyArrowConvert for ArrayData { } } +impl PyArrowConvert for Vec { + fn from_pyarrow(value: &PyAny) -> PyResult { + let list = value.downcast::()?; + list.iter().map(|x| T::from_pyarrow(&x)).collect() + } + + fn to_pyarrow(&self, py: Python) -> PyResult { + let values = self + .iter() + .map(|v| v.to_pyarrow(py)) + .collect::>>()?; + Ok(values.to_object(py)) + } +} + impl PyArrowConvert for T where T: Array + From, @@ -176,7 +191,7 @@ impl PyArrowConvert for RecordBatch { .map(ArrayRef::from_pyarrow) .collect::>()?; - let batch = RecordBatch::try_new(schema, arrays)?; + let batch = RecordBatch::try_new(schema, arrays).map_err(to_py_err)?; Ok(batch) } @@ -237,25 +252,25 @@ impl PyArrowConvert for ArrowArrayStreamReader { } } -macro_rules! add_conversion { - ($typ:ty) => { - impl<'source> FromPyObject<'source> for $typ { - fn extract(value: &'source PyAny) -> PyResult { - Self::from_pyarrow(value) - } - } +/// A newtype wrapper around a `T: PyArrowConvert` that implements +/// [`FromPyObject`] and [`IntoPy`] allowing usage with pyo3 macros +#[derive(Debug)] +pub struct PyArrowType(pub T); - impl<'a> IntoPy for $typ { - fn into_py(self, py: Python) -> PyObject { - self.to_pyarrow(py).unwrap() - } - } - }; +impl<'source, T: PyArrowConvert> FromPyObject<'source> for PyArrowType { + fn extract(value: &'source PyAny) -> PyResult { + Ok(Self(T::from_pyarrow(value)?)) + } +} + +impl<'a, T: PyArrowConvert> IntoPy for PyArrowType { + fn into_py(self, py: Python) -> PyObject { + self.0.to_pyarrow(py).unwrap() + } } -add_conversion!(DataType); -add_conversion!(Field); -add_conversion!(Schema); -add_conversion!(ArrayData); -add_conversion!(RecordBatch); -add_conversion!(ArrowArrayStreamReader); +impl From for PyArrowType { + fn from(s: T) -> Self { + Self(s) + } +} diff --git a/integration-testing/src/util/mod.rs b/integration-testing/src/util/mod.rs index 9ecd301360fe..f9ddc0e6f4b7 100644 --- a/integration-testing/src/util/mod.rs +++ b/integration-testing/src/util/mod.rs @@ -265,7 +265,8 @@ impl ArrowJsonField { /// TODO: convert to use an Into fn to_arrow_field(&self) -> Result { // a bit regressive, but we have to convert the field to JSON in order to convert it - let field = serde_json::to_value(self)?; + let field = serde_json::to_value(self) + .map_err(|error| ArrowError::JsonError(error.to_string()))?; field_from_json(&field) } } From 80c0f1a480ee51bc05a2224108575dfad1be8deb Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 21 Sep 2022 19:12:24 -0700 Subject: [PATCH 0067/1411] Add divide dyn kernel which produces null for division by zero (#2764) * Add divide_dyn_opt kernel * Add test * Fix clippy * Rename function --- arrow/src/compute/kernels/arithmetic.rs | 102 ++++++++++++++++++++++++ arrow/src/compute/kernels/arity.rs | 69 +++++++++------- 2 files changed, 143 insertions(+), 28 deletions(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index b44cb8b947e2..d33827594af5 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -697,6 +697,39 @@ where ) } +#[cfg(feature = "dyn_arith_dict")] +fn 
math_divide_safe_op_dict( + left: &DictionaryArray, + right: &DictionaryArray, + op: F, +) -> Result +where + K: ArrowNumericType, + T: ArrowNumericType, + T::Native: One + Zero, + F: Fn(T::Native, T::Native) -> Option, +{ + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + let array: PrimitiveArray = binary_opt::<_, _, _, T>(left, right, op)?; + Ok(Arc::new(array) as ArrayRef) +} + +fn math_safe_divide_op( + left: &PrimitiveArray, + right: &PrimitiveArray, + op: F, +) -> Result +where + LT: ArrowNumericType, + RT: ArrowNumericType, + RT::Native: One + Zero, + F: Fn(LT::Native, RT::Native) -> Option, +{ + let array: PrimitiveArray = binary_opt::<_, _, _, LT>(left, right, op)?; + Ok(Arc::new(array) as ArrayRef) +} + /// Perform `left + right` operation on two arrays. If either left or right value is null /// then the result is also null. /// @@ -1406,6 +1439,51 @@ pub fn divide_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result Result { + match left.data_type() { + DataType::Dictionary(_, _) => { + typed_dict_math_op!( + left, + right, + |a, b| { + if b.is_zero() { + None + } else { + Some(a.div_wrapping(b)) + } + }, + math_divide_safe_op_dict + ) + } + _ => { + downcast_primitive_array!( + (left, right) => { + math_safe_divide_op(left, right, |a, b| { + if b.is_zero() { + None + } else { + Some(a.div_wrapping(b)) + } + }) + } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) + } + } +} + /// Perform `left / right` operation on two arrays without checking for division by zero. /// For floating point types, the result of dividing by zero follows normal floating point /// rules. For other numeric types, dividing by zero will panic, @@ -2752,4 +2830,28 @@ mod tests { let overflow = divide_dyn_checked(&a, &b); overflow.expect_err("overflow should be detected"); } + + #[test] + #[cfg(feature = "dyn_arith_dict")] + fn test_div_dyn_opt_overflow_division_by_zero() { + let a = Int32Array::from(vec![i32::MIN]); + let b = Int32Array::from(vec![0]); + + let division_by_zero = divide_dyn_opt(&a, &b); + let expected = Arc::new(Int32Array::from(vec![None])) as ArrayRef; + assert_eq!(&expected, &division_by_zero.unwrap()); + + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(1, 1); + builder.append(i32::MIN).unwrap(); + let a = builder.finish(); + + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(1, 1); + builder.append(0).unwrap(); + let b = builder.finish(); + + let division_by_zero = divide_dyn_opt(&a, &b); + assert_eq!(&expected, &division_by_zero.unwrap()); + } } diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index bf10289683f1..5f875e6ddf29 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -357,6 +357,26 @@ where Ok(unsafe { build_primitive_array(len, buffer.into(), 0, None) }) } +#[inline(never)] +fn try_binary_opt_no_nulls( + len: usize, + a: A, + b: B, + op: F, +) -> Result> +where + O: ArrowPrimitiveType, + F: Fn(A::Item, B::Item) -> Option, +{ + let mut buffer = Vec::with_capacity(10); + for idx in 0..len { + unsafe { + buffer.push(op(a.value_unchecked(idx), b.value_unchecked(idx))); + }; + } + Ok(buffer.iter().collect()) +} + /// Applies the provided binary operation across `a` and `b`, collecting the optional results /// into a [`PrimitiveArray`]. If any index is null in either `a` or `b`, the corresponding /// index in the result will also be null. 
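A usage sketch of the divide_dyn_opt kernel introduced above, showing the null-on-zero behaviour; the import path assumes the kernel sits alongside the other public arithmetic kernels:

    use std::sync::Arc;
    use arrow::array::{ArrayRef, Int32Array};
    use arrow::compute::kernels::arithmetic::divide_dyn_opt;

    let a = Int32Array::from(vec![Some(10), Some(7), None]);
    let b = Int32Array::from(vec![Some(2), Some(0), Some(3)]);
    // Division by zero produces a null slot instead of an error or panic.
    let result = divide_dyn_opt(&a, &b).unwrap();
    let expected = Arc::new(Int32Array::from(vec![Some(5), None, None])) as ArrayRef;
    assert_eq!(&result, &expected);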
The binary operation could return `None` which @@ -367,16 +387,14 @@ where /// # Error /// /// This function gives error if the arrays have different lengths -pub(crate) fn binary_opt( - a: &PrimitiveArray, - b: &PrimitiveArray, +pub(crate) fn binary_opt( + a: A, + b: B, op: F, ) -> Result> where - A: ArrowPrimitiveType, - B: ArrowPrimitiveType, O: ArrowPrimitiveType, - F: Fn(A::Native, B::Native) -> Option, + F: Fn(A::Item, B::Item) -> Option, { if a.len() != b.len() { return Err(ArrowError::ComputeError( @@ -389,29 +407,24 @@ where } if a.null_count() == 0 && b.null_count() == 0 { - Ok(a.values() - .iter() - .zip(b.values().iter()) - .map(|(a, b)| op(*a, *b)) - .collect()) - } else { - let iter_a = ArrayIter::new(a); - let iter_b = ArrayIter::new(b); - - let values = - iter_a - .into_iter() - .zip(iter_b.into_iter()) - .map(|(item_a, item_b)| { - if let (Some(a), Some(b)) = (item_a, item_b) { - op(a, b) - } else { - None - } - }); - - Ok(values.collect()) + return try_binary_opt_no_nulls(a.len(), a, b, op); } + + let iter_a = ArrayIter::new(a); + let iter_b = ArrayIter::new(b); + + let values = iter_a + .into_iter() + .zip(iter_b.into_iter()) + .map(|(item_a, item_b)| { + if let (Some(a), Some(b)) = (item_a, item_b) { + op(a, b) + } else { + None + } + }); + + Ok(values.collect()) } #[cfg(test)] From a927d7aa8cfb47ff8ccdd6efdea9170a2d37dc62 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 22 Sep 2022 10:35:57 +0100 Subject: [PATCH 0068/1411] Split out `arrow-data` into a separate crate (#2746) * Split out arrow-data * RAT * Fix integration test --- Cargo.toml | 1 + arrow-data/Cargo.toml | 56 + .../util => arrow-data/src}/bit_iterator.rs | 2 +- .../src/util => arrow-data/src}/bit_mask.rs | 4 +- {arrow => arrow-data}/src/bitmap.rs | 16 +- {arrow/src/array => arrow-data/src}/data.rs | 1245 +----------- .../datatypes => arrow-data/src}/decimal.rs | 69 +- .../array => arrow-data/src}/equal/boolean.rs | 33 +- .../array => arrow-data/src}/equal/decimal.rs | 6 +- .../src}/equal/dictionary.rs | 5 +- .../src}/equal/fixed_binary.rs | 6 +- .../src}/equal/fixed_list.rs | 6 +- .../array => arrow-data/src}/equal/list.rs | 62 +- arrow-data/src/equal/mod.rs | 171 ++ .../array => arrow-data/src}/equal/null.rs | 2 +- .../src}/equal/primitive.rs | 4 +- .../src}/equal/structure.rs | 3 +- .../array => arrow-data/src}/equal/union.rs | 3 +- .../array => arrow-data/src}/equal/utils.rs | 7 +- .../src}/equal/variable_size.rs | 13 +- arrow-data/src/lib.rs | 30 + .../src}/transform/boolean.rs | 4 +- .../src}/transform/fixed_binary.rs | 4 +- .../src}/transform/fixed_size_list.rs | 4 +- .../src}/transform/list.rs | 11 +- arrow-data/src/transform/mod.rs | 672 +++++++ .../src}/transform/null.rs | 3 +- .../src}/transform/primitive.rs | 4 +- .../src}/transform/structure.rs | 3 +- .../src}/transform/union.rs | 3 +- .../src}/transform/utils.rs | 9 +- .../src}/transform/variable_size.rs | 22 +- arrow-pyarrow-integration-testing/src/lib.rs | 12 +- arrow/Cargo.toml | 3 +- arrow/src/array/array.rs | 98 +- arrow/src/array/array_decimal.rs | 6 +- arrow/src/array/equal/mod.rs | 1464 -------------- arrow/src/array/mod.rs | 16 +- arrow/src/array/transform/mod.rs | 1715 ----------------- arrow/src/compute/kernels/boolean.rs | 6 +- arrow/src/compute/kernels/cast.rs | 10 +- arrow/src/compute/kernels/comparison.rs | 2 +- arrow/src/datatypes/mod.rs | 3 +- arrow/src/lib.rs | 5 +- arrow/src/pyarrow.rs | 6 +- arrow/src/util/mod.rs | 5 +- arrow/tests/array_equal.rs | 1274 
++++++++++++ arrow/tests/array_transform.rs | 1005 ++++++++++ arrow/tests/array_validation.rs | 1100 +++++++++++ 49 files changed, 4620 insertions(+), 4593 deletions(-) create mode 100644 arrow-data/Cargo.toml rename {arrow/src/util => arrow-data/src}/bit_iterator.rs (98%) rename {arrow/src/util => arrow-data/src}/bit_mask.rs (98%) rename {arrow => arrow-data}/src/bitmap.rs (93%) rename {arrow/src/array => arrow-data/src}/data.rs (62%) rename {arrow/src/datatypes => arrow-data/src}/decimal.rs (95%) rename {arrow/src/array => arrow-data/src}/equal/boolean.rs (77%) rename {arrow/src/array => arrow-data/src}/equal/decimal.rs (95%) rename {arrow/src/array => arrow-data/src}/equal/dictionary.rs (95%) rename {arrow/src/array => arrow-data/src}/equal/fixed_binary.rs (95%) rename {arrow/src/array => arrow-data/src}/equal/fixed_list.rs (95%) rename {arrow/src/array => arrow-data/src}/equal/list.rs (75%) create mode 100644 arrow-data/src/equal/mod.rs rename {arrow/src/array => arrow-data/src}/equal/null.rs (97%) rename {arrow/src/array => arrow-data/src}/equal/primitive.rs (96%) rename {arrow/src/array => arrow-data/src}/equal/structure.rs (96%) rename {arrow/src/array => arrow-data/src}/equal/union.rs (98%) rename {arrow/src/array => arrow-data/src}/equal/utils.rs (96%) rename {arrow/src/array => arrow-data/src}/equal/variable_size.rs (92%) create mode 100644 arrow-data/src/lib.rs rename {arrow/src/array => arrow-data/src}/transform/boolean.rs (95%) rename {arrow/src/array => arrow-data/src}/transform/fixed_binary.rs (97%) rename {arrow/src/array => arrow-data/src}/transform/fixed_size_list.rs (97%) rename {arrow/src/array => arrow-data/src}/transform/list.rs (92%) create mode 100644 arrow-data/src/transform/mod.rs rename {arrow/src/array => arrow-data/src}/transform/null.rs (97%) rename {arrow/src/array => arrow-data/src}/transform/primitive.rs (96%) rename {arrow/src/array => arrow-data/src}/transform/structure.rs (98%) rename {arrow/src/array => arrow-data/src}/transform/union.rs (98%) rename {arrow/src/array => arrow-data/src}/transform/utils.rs (89%) rename {arrow/src/array => arrow-data/src}/transform/variable_size.rs (87%) delete mode 100644 arrow/src/array/equal/mod.rs delete mode 100644 arrow/src/array/transform/mod.rs create mode 100644 arrow/tests/array_equal.rs create mode 100644 arrow/tests/array_transform.rs create mode 100644 arrow/tests/array_validation.rs diff --git a/Cargo.toml b/Cargo.toml index 355c65a8b805..270d23f26c94 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ [workspace] members = [ "arrow", + "arrow-data", "arrow-schema", "arrow-buffer", "arrow-flight", diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml new file mode 100644 index 000000000000..289b1bbd0eb3 --- /dev/null +++ b/arrow-data/Cargo.toml @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "arrow-data" +version = "23.0.0" +description = "Array data abstractions for Apache Arrow" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_data" +path = "src/lib.rs" +bench = false + +[features] +# force_validate runs full data validation for all arrays that are created +# this is not enabled by default as it is too computationally expensive +# but is run as part of our CI checks +force_validate = [] + +[dependencies] + +arrow-buffer = { version = "23.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "23.0.0", path = "../arrow-schema" } + +num = { version = "0.4", default-features = false, features = ["std"] } +half = { version = "2.0", default-features = false } + +[dev-dependencies] + +[build-dependencies] diff --git a/arrow/src/util/bit_iterator.rs b/arrow-data/src/bit_iterator.rs similarity index 98% rename from arrow/src/util/bit_iterator.rs rename to arrow-data/src/bit_iterator.rs index ceefaa860cb1..45a42c3910f7 100644 --- a/arrow/src/util/bit_iterator.rs +++ b/arrow-data/src/bit_iterator.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::util::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator}; +use arrow_buffer::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator}; use std::result::Result; /// Iterator of contiguous ranges of set bits within a provided packed bitmask diff --git a/arrow/src/util/bit_mask.rs b/arrow-data/src/bit_mask.rs similarity index 98% rename from arrow/src/util/bit_mask.rs rename to arrow-data/src/bit_mask.rs index da542a2bb1f9..6a0a46038992 100644 --- a/arrow/src/util/bit_mask.rs +++ b/arrow-data/src/bit_mask.rs @@ -17,8 +17,8 @@ //! Utils for working with packed bit masks -use crate::util::bit_chunk_iterator::BitChunks; -use crate::util::bit_util::{ceil, get_bit, set_bit}; +use arrow_buffer::bit_chunk_iterator::BitChunks; +use arrow_buffer::bit_util::{ceil, get_bit, set_bit}; /// Sets all bits on `write_data` in the range `[offset_write..offset_write+len]` to be equal to the /// bits in `data` in the range `[offset_read..offset_read+len]` diff --git a/arrow/src/bitmap.rs b/arrow-data/src/bitmap.rs similarity index 93% rename from arrow/src/bitmap.rs rename to arrow-data/src/bitmap.rs index dbf9706677a5..0002ef022122 100644 --- a/arrow/src/bitmap.rs +++ b/arrow-data/src/bitmap.rs @@ -17,8 +17,8 @@ //! 
Defines [Bitmap] for tracking validity bitmaps -use crate::error::{ArrowError, Result}; -use crate::util::bit_util; +use arrow_buffer::bit_util; +use arrow_schema::ArrowError; use std::mem; use arrow_buffer::buffer::{buffer_bin_and, buffer_bin_or, Buffer}; @@ -56,6 +56,10 @@ impl Bitmap { unsafe { bit_util::get_bit_raw(self.bits.as_ptr(), i) } } + pub fn buffer(&self) -> &Buffer { + &self.bits + } + pub fn buffer_ref(&self) -> &Buffer { &self.bits } @@ -76,9 +80,9 @@ impl Bitmap { } impl<'a, 'b> BitAnd<&'b Bitmap> for &'a Bitmap { - type Output = Result; + type Output = Result; - fn bitand(self, rhs: &'b Bitmap) -> Result { + fn bitand(self, rhs: &'b Bitmap) -> Result { if self.bits.len() != rhs.bits.len() { return Err(ArrowError::ComputeError( "Buffers must be the same size to apply Bitwise AND.".to_string(), @@ -95,9 +99,9 @@ impl<'a, 'b> BitAnd<&'b Bitmap> for &'a Bitmap { } impl<'a, 'b> BitOr<&'b Bitmap> for &'a Bitmap { - type Output = Result; + type Output = Result; - fn bitor(self, rhs: &'b Bitmap) -> Result { + fn bitor(self, rhs: &'b Bitmap) -> Result { if self.bits.len() != rhs.bits.len() { return Err(ArrowError::ComputeError( "Buffers must be the same size to apply Bitwise OR.".to_string(), diff --git a/arrow/src/array/data.rs b/arrow-data/src/data.rs similarity index 62% rename from arrow/src/array/data.rs rename to arrow-data/src/data.rs index 7571ba210d7d..37c059748fe7 100644 --- a/arrow/src/array/data.rs +++ b/arrow-data/src/data.rs @@ -18,24 +18,19 @@ //! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates //! common attributes and operations for Arrow array. -use crate::datatypes::{ - validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, DataType, - IntervalUnit, UnionMode, -}; -use crate::error::{ArrowError, Result}; -use crate::util::bit_iterator::BitSliceIterator; -use crate::{bitmap::Bitmap, datatypes::ArrowNativeType}; -use crate::{ - buffer::{Buffer, MutableBuffer}, - util::bit_util, +use crate::decimal::{ + validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, }; +use crate::{bit_iterator::BitSliceIterator, bitmap::Bitmap}; +use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; +use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode}; use half::f16; use std::convert::TryInto; use std::mem; use std::ops::Range; use std::sync::Arc; -use super::equal::equal; +use crate::equal; #[inline] pub(crate) fn contains_nulls( @@ -346,7 +341,7 @@ impl ArrayData { offset: usize, buffers: Vec, child_data: Vec, - ) -> Result { + ) -> Result { // we must check the length of `null_bit_buffer` first // because we use this buffer to calculate `null_count` // in `Self::new_unchecked`. @@ -390,33 +385,6 @@ impl ArrayData { &self.data_type } - /// Updates the [DataType] of this ArrayData/ - /// - /// panic's if the new DataType is not compatible with the - /// existing type. 
- /// - /// Note: currently only changing a [DataType::Decimal128]s or - /// [DataType::Decimal256]s precision and scale are supported - #[inline] - pub(crate) fn with_data_type(mut self, new_data_type: DataType) -> Self { - if matches!(self.data_type, DataType::Decimal128(_, _)) { - assert!( - matches!(new_data_type, DataType::Decimal128(_, _)), - "only 128-bit DecimalType is supported for new datatype" - ); - } else if matches!(self.data_type, DataType::Decimal256(_, _)) { - assert!( - matches!(new_data_type, DataType::Decimal256(_, _)), - "only 256-bit DecimalType is supported for new datatype" - ); - } else { - panic!("only DecimalType is supported.") - } - - self.data_type = new_data_type; - self - } - /// Returns a slice of buffers for this array data pub fn buffers(&self) -> &[Buffer] { &self.buffers[..]
@@ -562,7 +530,7 @@ impl ArrayData { /// * the buffer is not byte-aligned with type T, or /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable) #[inline] - pub(crate) fn buffer(&self, buffer: usize) -> &[T] { + pub fn buffer(&self, buffer: usize) -> &[T] { let values = unsafe { self.buffers[buffer].as_slice().align_to::() }; if !values.0.is_empty() || !values.2.is_empty() { panic!("The buffer is not byte-aligned with its interpretation")
@@ -654,7 +622,7 @@ impl ArrayData { /// /// See [ArrayData::validate_full] to validate fully the offset content /// and the validity of utf8 data - pub fn validate(&self) -> Result<()> { + pub fn validate(&self) -> Result<(), ArrowError> { // Need at least this much space in each buffer let len_plus_offset = self.len + self.offset;
@@ -769,7 +737,7 @@ impl ArrayData { /// entries. /// /// For an empty array, the `buffer` can also be empty. - fn typed_offsets(&self) -> Result<&[T]> { + fn typed_offsets(&self) -> Result<&[T], ArrowError> { // An empty list-like array can have 0 offsets if self.len == 0 && self.buffers[0].is_empty() { return Ok(&[]);
@@ -783,7 +751,7 @@ impl ArrayData { &self, idx: usize, len: usize, - ) -> Result<&[T]> { + ) -> Result<&[T], ArrowError> { let buffer = &self.buffers[idx]; let required_len = (len + self.offset) * std::mem::size_of::();
@@ -806,7 +774,7 @@ impl ArrayData { fn validate_offsets( &self, values_length: usize, - ) -> Result<()> { + ) -> Result<(), ArrowError> { // Justification: buffer size was validated above let offsets = self.typed_offsets::()?; if offsets.is_empty() {
@@ -852,7 +820,7 @@ impl ArrayData { } /// Validates the layout of `child_data` ArrayData structures - fn validate_child_data(&self) -> Result<()> { + fn validate_child_data(&self) -> Result<(), ArrowError> { match &self.data_type { DataType::List(field) | DataType::Map(field, _) => { let values_data = self.get_single_valid_child_data(field.data_type())?;
@@ -943,13 +911,13 @@ impl ArrayData { fn get_single_valid_child_data( &self, expected_type: &DataType, - ) -> Result<&ArrayData> { + ) -> Result<&ArrayData, ArrowError> { self.validate_num_child_data(1)?; self.get_valid_child_data(0, expected_type) } /// Returns `Err` if self.child_data does not have exactly `expected_len` elements - fn validate_num_child_data(&self, expected_len: usize) -> Result<()> { + fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> { if self.child_data().len() != expected_len { Err(ArrowError::InvalidArgumentError(format!( "Value data for {} should contain {} child data array(s), had {}",
@@ -968,7 +936,7 @@ impl ArrayData { &self, i: usize, expected_type: &DataType, - ) ->
Result<&ArrayData> { + ) -> Result<&ArrayData, ArrowError> { let values_data = self.child_data .get(i) .ok_or_else(|| { @@ -999,7 +967,7 @@ impl ArrayData { /// Does not (yet) check /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) /// Note calls `validate()` internally - pub fn validate_full(&self) -> Result<()> { + pub fn validate_full(&self) -> Result<(), ArrowError> { // Check all buffer sizes prior to looking at them more deeply in this function self.validate()?; @@ -1034,7 +1002,7 @@ impl ArrayData { Ok(()) } - pub fn validate_values(&self) -> Result<()> { + pub fn validate_values(&self) -> Result<(), ArrowError> { match &self.data_type { DataType::Decimal128(p, _) => { let values_buffer: &[i128] = self.typed_buffer(0, self.len)?; @@ -1106,10 +1074,14 @@ impl ArrayData { /// /// For example, the offsets buffer contained `[1, 2, 4]`, this /// function would call `validate([1,2])`, and `validate([2,4])` - fn validate_each_offset(&self, offset_limit: usize, validate: V) -> Result<()> + fn validate_each_offset( + &self, + offset_limit: usize, + validate: V, + ) -> Result<(), ArrowError> where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, - V: Fn(usize, Range) -> Result<()>, + V: Fn(usize, Range) -> Result<(), ArrowError>, { self.typed_offsets::()? .iter() @@ -1147,7 +1119,7 @@ impl ArrayData { } }) .skip(1) // the first element is meaningless - .try_for_each(|res: Result<(usize, Range)>| { + .try_for_each(|res: Result<(usize, Range), ArrowError>| { let (item_index, range) = res?; validate(item_index-1, range) }) @@ -1155,7 +1127,7 @@ impl ArrayData { /// Ensures that all strings formed by the offsets in `buffers[0]` /// into `buffers[1]` are valid utf8 sequences - fn validate_utf8(&self) -> Result<()> + fn validate_utf8(&self) -> Result<(), ArrowError> where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { @@ -1195,7 +1167,7 @@ impl ArrayData { /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are /// between `0` and `offset_limit` - fn validate_offsets_full(&self, offset_limit: usize) -> Result<()> + fn validate_offsets_full(&self, offset_limit: usize) -> Result<(), ArrowError> where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { @@ -1208,7 +1180,7 @@ impl ArrayData { /// Validates that each value in self.buffers (typed as T) /// is within the range [0, max_value], inclusive - fn check_bounds(&self, max_value: i64) -> Result<()> + fn check_bounds(&self, max_value: i64) -> Result<(), ArrowError> where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { @@ -1288,7 +1260,7 @@ impl ArrayData { /// Return the expected [`DataTypeLayout`] Arrays of this data /// type are expected to have -pub(crate) fn layout(data_type: &DataType) -> DataTypeLayout { +pub fn layout(data_type: &DataType) -> DataTypeLayout { // based on C/C++ implementation in // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc) use std::mem::size_of; @@ -1381,9 +1353,9 @@ pub(crate) fn layout(data_type: &DataType) -> DataTypeLayout { } /// Layout specification for a data type -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Eq)] // Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91 -pub(crate) struct DataTypeLayout { +pub struct DataTypeLayout { /// A vector of buffer layout specifications, one for each expected buffer pub buffers: Vec, @@ -1429,8 +1401,8 @@ impl DataTypeLayout { } /// Layout specification 
for a single data type buffer -#[derive(Debug, PartialEq)] -pub(crate) enum BufferSpec { +#[derive(Debug, PartialEq, Eq)] +pub enum BufferSpec { /// each element has a fixed width FixedWidth { byte_width: usize }, /// Variable width, such as string data for utf8 data @@ -1449,7 +1421,7 @@ pub(crate) enum BufferSpec { impl PartialEq for ArrayData { fn eq(&self, other: &Self) -> bool { - equal(self, other) + equal::equal(self, other) } } @@ -1545,7 +1517,7 @@ impl ArrayDataBuilder { } /// Creates an array data, validating all inputs - pub fn build(self) -> Result { + pub fn build(self) -> Result { ArrayData::try_new( self.data_type, self.len, @@ -1576,16 +1548,19 @@ impl From for ArrayDataBuilder { #[cfg(test)] mod tests { use super::*; - use std::ptr::NonNull; + use arrow_schema::Field; + + // See arrow/tests/array_data_validation.rs for test of array validation + + /// returns a buffer initialized with some constant value for tests + fn make_i32_buffer(n: usize) -> Buffer { + Buffer::from_slice_ref(&vec![42i32; n]) + } - use crate::array::{ - make_array, Array, BooleanBuilder, Decimal128Builder, FixedSizeListBuilder, - Int32Array, Int32Builder, Int64Array, StringArray, StructBuilder, UInt64Array, - UInt8Builder, - }; - use crate::buffer::Buffer; - use crate::datatypes::Field; - use crate::util::bit_util; + /// returns a buffer initialized with some constant value for tests + fn make_f32_buffer(n: usize) -> Buffer { + Buffer::from_slice_ref(&vec![42f32; n]) + } #[test] fn test_builder() { @@ -1771,1124 +1746,30 @@ mod tests { } #[test] - #[should_panic( - expected = "Need at least 80 bytes in buffers[0] in array of type Int64, but got 8" - )] - fn test_buffer_too_small() { - let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); - // should fail as the declared size (10*8 = 80) is larger than the underlying bfufer (8) - ArrayData::try_new(DataType::Int64, 10, None, 0, vec![buffer], vec![]).unwrap(); - } - - #[test] - #[should_panic( - expected = "Need at least 16 bytes in buffers[0] in array of type Int64, but got 8" - )] - fn test_buffer_too_small_offset() { - let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); - // should fail -- size is ok, but also has offset - ArrayData::try_new(DataType::Int64, 1, None, 1, vec![buffer], vec![]).unwrap(); - } - - #[test] - #[should_panic(expected = "Expected 1 buffers in array of type Int64, got 2")] - fn test_bad_number_of_buffers() { - let buffer1 = Buffer::from_slice_ref(&[0i32, 2i32]); - let buffer2 = Buffer::from_slice_ref(&[0i32, 2i32]); - ArrayData::try_new(DataType::Int64, 1, None, 0, vec![buffer1, buffer2], vec![]) - .unwrap(); - } - - #[test] - #[should_panic(expected = "integer overflow computing min buffer size")] - fn test_fixed_width_overflow() { - let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); - ArrayData::try_new(DataType::Int64, usize::MAX, None, 0, vec![buffer], vec![]) - .unwrap(); - } - - #[test] - #[should_panic(expected = "null_bit_buffer size too small. 
got 1 needed 2")] - fn test_bitmap_too_small() { - let buffer = make_i32_buffer(9); - let null_bit_buffer = Buffer::from(vec![0b11111111]); - - ArrayData::try_new( - DataType::Int32, - 9, - Some(null_bit_buffer), - 0, - vec![buffer], - vec![], - ) - .unwrap(); - } - - // Test creating a dictionary with a non integer type - #[test] - #[should_panic(expected = "Dictionary key type must be integer, but was Utf8")] - fn test_non_int_dictionary() { - let i32_buffer = Buffer::from_slice_ref(&[0i32, 2i32]); - let data_type = - DataType::Dictionary(Box::new(DataType::Utf8), Box::new(DataType::Int32)); - let child_data = ArrayData::try_new( - DataType::Int32, - 1, - None, - 0, - vec![i32_buffer.clone()], - vec![], - ) - .unwrap(); - ArrayData::try_new( - data_type, - 1, - None, - 0, - vec![i32_buffer.clone(), i32_buffer], - vec![child_data], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "Expected LargeUtf8 but child data had Utf8")] - fn test_mismatched_dictionary_types() { - // test w/ dictionary created with a child array data that has type different than declared - let string_array: StringArray = - vec![Some("foo"), Some("bar")].into_iter().collect(); - let i32_buffer = Buffer::from_slice_ref(&[0i32, 1i32]); - // Dict says LargeUtf8 but array is Utf8 - let data_type = DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::LargeUtf8), - ); - let child_data = string_array.into_data(); - ArrayData::try_new(data_type, 1, None, 0, vec![i32_buffer], vec![child_data]) - .unwrap(); - } - - #[test] - fn test_empty_utf8_array_with_empty_offsets_buffer() { - let data_buffer = Buffer::from(&[]); - let offsets_buffer = Buffer::from(&[]); - ArrayData::try_new( - DataType::Utf8, - 0, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - fn test_empty_utf8_array_with_single_zero_offset() { - let data_buffer = Buffer::from(&[]); - let offsets_buffer = Buffer::from_slice_ref(&[0i32]); - ArrayData::try_new( - DataType::Utf8, - 0, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "First offset 1 of Utf8 is larger than values length 0")] - fn test_empty_utf8_array_with_invalid_offset() { - let data_buffer = Buffer::from(&[]); - let offsets_buffer = Buffer::from_slice_ref(&[1i32]); - ArrayData::try_new( - DataType::Utf8, - 0, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - fn test_empty_utf8_array_with_non_zero_offset() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2, 6, 0]); - ArrayData::try_new( - DataType::Utf8, - 0, - None, - 3, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Buffer 0 of LargeUtf8 isn't large enough. Expected 8 bytes got 4" - )] - fn test_empty_large_utf8_array_with_wrong_type_offsets() { - let data_buffer = Buffer::from(&[]); - let offsets_buffer = Buffer::from_slice_ref(&[0i32]); - ArrayData::try_new( - DataType::LargeUtf8, - 0, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Buffer 0 of Utf8 isn't large enough. 
Expected 12 bytes got 8" - )] - fn test_validate_offsets_i32() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Buffer 0 of LargeUtf8 isn't large enough. Expected 24 bytes got 16" - )] - fn test_validate_offsets_i64() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[0i64, 2i64]); - ArrayData::try_new( - DataType::LargeUtf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "Error converting offset[0] (-2) to usize for Utf8")] - fn test_validate_offsets_negative_first_i32() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[-2i32, 1i32, 3i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "Error converting offset[2] (-3) to usize for Utf8")] - fn test_validate_offsets_negative_last_i32() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, -3i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "First offset 4 in Utf8 is smaller than last offset 3")] - fn test_validate_offsets_range_too_small() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - // start offset is larger than end - let offsets_buffer = Buffer::from_slice_ref(&[4i32, 2i32, 3i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "Last offset 10 of Utf8 is larger than values length 6")] - fn test_validate_offsets_range_too_large() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - // 10 is off the end of the buffer - let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, 10i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "First offset 10 of Utf8 is larger than values length 6")] - fn test_validate_offsets_first_too_large() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - // 10 is off the end of the buffer - let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 10i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - fn test_validate_offsets_first_too_large_skipped() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - // 10 is off the end of the buffer, but offset starts at 1 so it is skipped - let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 3i32, 4i32]); - let data = ArrayData::try_new( - DataType::Utf8, - 2, - None, - 1, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - let array: StringArray = data.into(); - let expected: StringArray = vec![Some("c"), Some("d")].into_iter().collect(); - assert_eq!(array, expected); - } + fn test_contains_nulls() { + let buffer: Buffer = + MutableBuffer::from_iter([false, false, false, true, true, 
false]).into(); - #[test] - #[should_panic(expected = "Last offset 8 of Utf8 is larger than values length 6")] - fn test_validate_offsets_last_too_large() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - // 10 is off the end of the buffer - let offsets_buffer = Buffer::from_slice_ref(&[5i32, 7i32, 8i32]); - ArrayData::try_new( - DataType::Utf8, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); + assert!(contains_nulls(Some(&buffer), 0, 6)); + assert!(contains_nulls(Some(&buffer), 0, 3)); + assert!(!contains_nulls(Some(&buffer), 3, 2)); + assert!(!contains_nulls(Some(&buffer), 0, 0)); } #[test] - #[should_panic( - expected = "Values length 4 is less than the length (2) multiplied by the value size (2) for FixedSizeList" - )] - fn test_validate_fixed_size_list() { - // child has 4 elements, - let child_array = vec![Some(1), Some(2), Some(3), None] - .into_iter() - .collect::(); - - // but claim we have 3 elements for a fixed size of 2 - // 10 is off the end of the buffer - let field = Field::new("field", DataType::Int32, true); - ArrayData::try_new( - DataType::FixedSizeList(Box::new(field), 2), - 3, - None, - 0, - vec![], - vec![child_array.into_data()], - ) - .unwrap(); - } + fn test_into_buffers() { + let data_types = vec![ + DataType::Union(vec![], vec![], UnionMode::Dense), + DataType::Union(vec![], vec![], UnionMode::Sparse), + ]; - #[test] - #[should_panic(expected = "Child type mismatch for Struct")] - fn test_validate_struct_child_type() { - let field1 = vec![Some(1), Some(2), Some(3), None] - .into_iter() - .collect::(); - - // validate the the type of struct fields matches child fields - ArrayData::try_new( - DataType::Struct(vec![Field::new("field1", DataType::Int64, true)]), - 3, - None, - 0, - vec![], - vec![field1.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "child array #0 for field field1 has length smaller than expected for struct array (4 < 6)" - )] - fn test_validate_struct_child_length() { - // field length only has 4 items, but array claims to have 6 - let field1 = vec![Some(1), Some(2), Some(3), None] - .into_iter() - .collect::(); - - ArrayData::try_new( - DataType::Struct(vec![Field::new("field1", DataType::Int32, true)]), - 6, - None, - 0, - vec![], - vec![field1.into_data()], - ) - .unwrap(); - } - - /// Test that the array of type `data_type` that has invalid utf8 data errors - fn check_utf8_validation(data_type: DataType) { - // 0x80 is a utf8 continuation sequence and is not a valid utf8 sequence itself - let data_buffer = Buffer::from_slice_ref(&[b'a', b'a', 0x80, 0x00]); - let offsets: Vec = [0, 2, 3] - .iter() - .map(|&v| T::from_usize(v).unwrap()) - .collect(); - - let offsets_buffer = Buffer::from_slice_ref(&offsets); - ArrayData::try_new( - data_type, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "Invalid UTF8 sequence at string index 1 (2..3)")] - fn test_validate_utf8_content() { - check_utf8_validation::(DataType::Utf8); - } - - #[test] - #[should_panic(expected = "Invalid UTF8 sequence at string index 1 (2..3)")] - fn test_validate_large_utf8_content() { - check_utf8_validation::(DataType::LargeUtf8); - } - - /// Tests that offsets are at valid codepoint boundaries - fn check_utf8_char_boundary(data_type: DataType) { - let data_buffer = Buffer::from("🙀".as_bytes()); - let offsets: Vec = [0, 1, data_buffer.len()] - .iter() - .map(|&v| T::from_usize(v).unwrap()) - .collect(); - - let 
offsets_buffer = Buffer::from_slice_ref(&offsets); - ArrayData::try_new( - data_type, - 2, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "incomplete utf-8 byte sequence from index 0")] - fn test_validate_utf8_char_boundary() { - check_utf8_char_boundary::(DataType::Utf8); - } - - #[test] - #[should_panic(expected = "incomplete utf-8 byte sequence from index 0")] - fn test_validate_large_utf8_char_boundary() { - check_utf8_char_boundary::(DataType::LargeUtf8); - } - - /// Test that the array of type `data_type` that has invalid indexes (out of bounds) - fn check_index_out_of_bounds_validation(data_type: DataType) { - let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']); - // First two offsets are fine, then 5 is out of bounds - let offsets: Vec = [0, 1, 2, 5, 2] - .iter() - .map(|&v| T::from_usize(v).unwrap()) - .collect(); - - let offsets_buffer = Buffer::from_slice_ref(&offsets); - ArrayData::try_new( - data_type, - 4, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" - )] - fn test_validate_utf8_out_of_bounds() { - check_index_out_of_bounds_validation::(DataType::Utf8); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" - )] - fn test_validate_large_utf8_out_of_bounds() { - check_index_out_of_bounds_validation::(DataType::LargeUtf8); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" - )] - fn test_validate_binary_out_of_bounds() { - check_index_out_of_bounds_validation::(DataType::Binary); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" - )] - fn test_validate_large_binary_out_of_bounds() { - check_index_out_of_bounds_validation::(DataType::LargeBinary); - } - - // validate that indexes don't go bacwards check indexes that go backwards - fn check_index_backwards_validation(data_type: DataType) { - let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']); - // First three offsets are fine, then 1 goes backwards - let offsets: Vec = [0, 1, 2, 2, 1] - .iter() - .map(|&v| T::from_usize(v).unwrap()) - .collect(); - - let offsets_buffer = Buffer::from_slice_ref(&offsets); - ArrayData::try_new( - data_type, - 4, - None, - 0, - vec![offsets_buffer, data_buffer], - vec![], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" - )] - fn test_validate_utf8_index_backwards() { - check_index_backwards_validation::(DataType::Utf8); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" - )] - fn test_validate_large_utf8_index_backwards() { - check_index_backwards_validation::(DataType::LargeUtf8); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" - )] - fn test_validate_binary_index_backwards() { - check_index_backwards_validation::(DataType::Binary); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" - )] - fn test_validate_large_binary_index_backwards() { - check_index_backwards_validation::(DataType::LargeBinary); - } - - #[test] - #[should_panic( - expected = "Value at position 1 out of bounds: 3 (should 
be in [0, 1])" - )] - fn test_validate_dictionary_index_too_large() { - let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); - - // 3 is not a valid index into the values (only 0 and 1) - let keys: Int32Array = [Some(1), Some(3)].into_iter().collect(); - - let data_type = DataType::Dictionary( - Box::new(keys.data_type().clone()), - Box::new(values.data_type().clone()), - ); - - ArrayData::try_new( - data_type, - 2, - None, - 0, - vec![keys.data().buffers[0].clone()], - vec![values.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Value at position 1 out of bounds: -1 (should be in [0, 1]" - )] - fn test_validate_dictionary_index_negative() { - let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); - - // -1 is not a valid index at all! - let keys: Int32Array = [Some(1), Some(-1)].into_iter().collect(); - - let data_type = DataType::Dictionary( - Box::new(keys.data_type().clone()), - Box::new(values.data_type().clone()), - ); - - ArrayData::try_new( - data_type, - 2, - None, - 0, - vec![keys.data().buffers[0].clone()], - vec![values.into_data()], - ) - .unwrap(); - } - - #[test] - fn test_validate_dictionary_index_negative_but_not_referenced() { - let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); - - // -1 is not a valid index at all, but the array is length 1 - // so the -1 should not be looked at - let keys: Int32Array = [Some(1), Some(-1)].into_iter().collect(); - - let data_type = DataType::Dictionary( - Box::new(keys.data_type().clone()), - Box::new(values.data_type().clone()), - ); - - // Expect this not to panic - ArrayData::try_new( - data_type, - 1, - None, - 0, - vec![keys.data().buffers[0].clone()], - vec![values.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Value at position 0 out of bounds: 18446744073709551615 (can not convert to i64)" - )] - fn test_validate_dictionary_index_giant_negative() { - let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); - - // -1 is not a valid index at all! 
- let keys: UInt64Array = [Some(u64::MAX), Some(1)].into_iter().collect(); - - let data_type = DataType::Dictionary( - Box::new(keys.data_type().clone()), - Box::new(values.data_type().clone()), - ); - - ArrayData::try_new( - data_type, - 2, - None, - 0, - vec![keys.data().buffers[0].clone()], - vec![values.into_data()], - ) - .unwrap(); - } - - /// Test that the list of type `data_type` generates correct offset out of bounds errors - fn check_list_offsets(data_type: DataType) { - let values: Int32Array = - [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); - - // 5 is an invalid offset into a list of only three values - let offsets: Vec = [0, 2, 5, 4] - .iter() - .map(|&v| T::from_usize(v).unwrap()) - .collect(); - let offsets_buffer = Buffer::from_slice_ref(&offsets); - - ArrayData::try_new( - data_type, - 3, - None, - 0, - vec![offsets_buffer], - vec![values.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" - )] - fn test_validate_list_offsets() { - let field_type = Field::new("f", DataType::Int32, true); - check_list_offsets::(DataType::List(Box::new(field_type))); - } - - #[test] - #[should_panic( - expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" - )] - fn test_validate_large_list_offsets() { - let field_type = Field::new("f", DataType::Int32, true); - check_list_offsets::(DataType::LargeList(Box::new(field_type))); - } - - /// Test that the list of type `data_type` generates correct errors for negative offsets - #[test] - #[should_panic( - expected = "Offset invariant failure: Could not convert offset -1 to usize at position 2" - )] - fn test_validate_list_negative_offsets() { - let values: Int32Array = - [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); - let field_type = Field::new("f", values.data_type().clone(), true); - let data_type = DataType::List(Box::new(field_type)); - - // -1 is an invalid offset any way you look at it - let offsets: Vec = vec![0, 2, -1, 4]; - let offsets_buffer = Buffer::from_slice_ref(&offsets); - - ArrayData::try_new( - data_type, - 3, - None, - 0, - vec![offsets_buffer], - vec![values.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Value at position 1 out of bounds: -1 (should be in [0, 1])" - )] - /// test that children are validated recursively (aka bugs in child data of struct also are flagged) - fn test_validate_recursive() { - // Form invalid dictionary array - let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); - // -1 is not a valid index - let keys: Int32Array = [Some(1), Some(-1), Some(1)].into_iter().collect(); - - let dict_data_type = DataType::Dictionary( - Box::new(keys.data_type().clone()), - Box::new(values.data_type().clone()), - ); - - // purposely create an invalid child data - let dict_data = unsafe { - ArrayData::new_unchecked( - dict_data_type, - 2, - None, - None, - 0, - vec![keys.data().buffers[0].clone()], - vec![values.into_data()], - ) - }; - - // Now, try and create a struct with this invalid child data (and expect an error) - let data_type = - DataType::Struct(vec![Field::new("d", dict_data.data_type().clone(), true)]); - - ArrayData::try_new(data_type, 1, None, 0, vec![], vec![dict_data]).unwrap(); - } - - /// returns a buffer initialized with some constant value for tests - fn make_i32_buffer(n: usize) -> Buffer { - Buffer::from_slice_ref(&vec![42i32; n]) - } - - /// returns a buffer initialized with some constant value 
for tests - fn make_f32_buffer(n: usize) -> Buffer { - Buffer::from_slice_ref(&vec![42f32; n]) - } - - #[test] - #[should_panic(expected = "Expected Int64 but child data had Int32")] - fn test_validate_union_different_types() { - let field1 = vec![Some(1), Some(2)].into_iter().collect::(); - - let field2 = vec![Some(1), Some(2)].into_iter().collect::(); - - let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); - - ArrayData::try_new( - DataType::Union( - vec![ - Field::new("field1", DataType::Int32, true), - Field::new("field2", DataType::Int64, true), // data is int32 - ], - vec![0, 1], - UnionMode::Sparse, - ), - 2, - None, - 0, - vec![type_ids], - vec![field1.into_data(), field2.into_data()], - ) - .unwrap(); - } - - // sparse with wrong sized children - #[test] - #[should_panic( - expected = "Sparse union child array #1 has length smaller than expected for union array (1 < 2)" - )] - fn test_validate_union_sparse_different_child_len() { - let field1 = vec![Some(1), Some(2)].into_iter().collect::(); - - // field 2 only has 1 item but array should have 2 - let field2 = vec![Some(1)].into_iter().collect::(); - - let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); - - ArrayData::try_new( - DataType::Union( - vec![ - Field::new("field1", DataType::Int32, true), - Field::new("field2", DataType::Int64, true), - ], - vec![0, 1], - UnionMode::Sparse, - ), - 2, - None, - 0, - vec![type_ids], - vec![field1.into_data(), field2.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic(expected = "Expected 2 buffers in array of type Union")] - fn test_validate_union_dense_without_offsets() { - let field1 = vec![Some(1), Some(2)].into_iter().collect::(); - - let field2 = vec![Some(1)].into_iter().collect::(); - - let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); - - ArrayData::try_new( - DataType::Union( - vec![ - Field::new("field1", DataType::Int32, true), - Field::new("field2", DataType::Int64, true), - ], - vec![0, 1], - UnionMode::Dense, - ), - 2, - None, - 0, - vec![type_ids], // need offsets buffer here too - vec![field1.into_data(), field2.into_data()], - ) - .unwrap(); - } - - #[test] - #[should_panic( - expected = "Need at least 8 bytes in buffers[1] in array of type Union" - )] - fn test_validate_union_dense_with_bad_len() { - let field1 = vec![Some(1), Some(2)].into_iter().collect::(); - - let field2 = vec![Some(1)].into_iter().collect::(); - - let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); - let offsets = Buffer::from_slice_ref(&[0i32]); // should have 2 offsets, but only have 1 - - ArrayData::try_new( - DataType::Union( - vec![ - Field::new("field1", DataType::Int32, true), - Field::new("field2", DataType::Int64, true), - ], - vec![0, 1], - UnionMode::Dense, - ), - 2, - None, - 0, - vec![type_ids, offsets], - vec![field1.into_data(), field2.into_data()], - ) - .unwrap(); - } - - #[test] - fn test_try_new_sliced_struct() { - let mut builder = StructBuilder::new( - vec![ - Field::new("a", DataType::Int32, true), - Field::new("b", DataType::Boolean, true), - ], - vec![ - Box::new(Int32Builder::with_capacity(5)), - Box::new(BooleanBuilder::with_capacity(5)), - ], - ); - - // struct[0] = { a: 10, b: true } - builder - .field_builder::(0) - .unwrap() - .append_option(Some(10)); - builder - .field_builder::(1) - .unwrap() - .append_option(Some(true)); - builder.append(true); - - // struct[1] = null - builder - .field_builder::(0) - .unwrap() - .append_option(None); - builder - .field_builder::(1) - .unwrap() - .append_option(None); - builder.append(false); - - // struct[2] = { a: 
null, b: false } - builder - .field_builder::(0) - .unwrap() - .append_option(None); - builder - .field_builder::(1) - .unwrap() - .append_option(Some(false)); - builder.append(true); - - // struct[3] = { a: 21, b: null } - builder - .field_builder::(0) - .unwrap() - .append_option(Some(21)); - builder - .field_builder::(1) - .unwrap() - .append_option(None); - builder.append(true); - - // struct[4] = { a: 18, b: false } - builder - .field_builder::(0) - .unwrap() - .append_option(Some(18)); - builder - .field_builder::(1) - .unwrap() - .append_option(Some(false)); - builder.append(true); - - let struct_array = builder.finish(); - let struct_array_slice = struct_array.slice(1, 3); - let struct_array_data = struct_array_slice.data(); - - let cloned_data = ArrayData::try_new( - struct_array_slice.data_type().clone(), - struct_array_slice.len(), - struct_array_data.null_buffer().cloned(), - struct_array_slice.offset(), - struct_array_data.buffers().to_vec(), - struct_array_data.child_data().to_vec(), - ) - .unwrap(); - let cloned = crate::array::make_array(cloned_data); - - assert_eq!(&struct_array_slice, &cloned); - } - - #[test] - fn test_into_buffers() { - let data_types = vec![ - DataType::Union(vec![], vec![], UnionMode::Dense), - DataType::Union(vec![], vec![], UnionMode::Sparse), - ]; - - for data_type in data_types { - let buffers = new_buffers(&data_type, 0); - let [buffer1, buffer2] = buffers; - let buffers = into_buffers(&data_type, buffer1, buffer2); + for data_type in data_types { + let buffers = new_buffers(&data_type, 0); + let [buffer1, buffer2] = buffers; + let buffers = into_buffers(&data_type, buffer1, buffer2); let layout = layout(&data_type); assert_eq!(buffers.len(), layout.buffers.len()); } } - - #[test] - fn test_string_data_from_foreign() { - let mut strings = "foobarfoobar".to_owned(); - let mut offsets = vec![0_i32, 0, 3, 6, 12]; - let mut bitmap = vec![0b1110_u8]; - - let strings_buffer = unsafe { - Buffer::from_custom_allocation( - NonNull::new_unchecked(strings.as_mut_ptr()), - strings.len(), - Arc::new(strings), - ) - }; - let offsets_buffer = unsafe { - Buffer::from_custom_allocation( - NonNull::new_unchecked(offsets.as_mut_ptr() as *mut u8), - offsets.len() * std::mem::size_of::(), - Arc::new(offsets), - ) - }; - let null_buffer = unsafe { - Buffer::from_custom_allocation( - NonNull::new_unchecked(bitmap.as_mut_ptr()), - bitmap.len(), - Arc::new(bitmap), - ) - }; - - let data = ArrayData::try_new( - DataType::Utf8, - 4, - Some(null_buffer), - 0, - vec![offsets_buffer, strings_buffer], - vec![], - ) - .unwrap(); - - let array = make_array(data); - let array = array.as_any().downcast_ref::().unwrap(); - - let expected = - StringArray::from(vec![None, Some("foo"), Some("bar"), Some("foobar")]); - - assert_eq!(array, &expected); - } - - #[test] - #[cfg(not(feature = "force_validate"))] - fn test_decimal_full_validation() { - let values_builder = UInt8Builder::with_capacity(10); - let byte_width = 16; - let mut fixed_size_builder = - FixedSizeListBuilder::new(values_builder, byte_width); - let value_as_bytes = 123456_i128.to_le_bytes(); - fixed_size_builder - .values() - .append_slice(value_as_bytes.as_slice()); - fixed_size_builder.append(true); - let fixed_size_array = fixed_size_builder.finish(); - - // Build ArrayData for Decimal - let builder = ArrayData::builder(DataType::Decimal128(5, 3)) - .len(fixed_size_array.len()) - .add_buffer(fixed_size_array.data_ref().child_data()[0].buffers()[0].clone()); - let array_data = unsafe { builder.build_unchecked() }; - 
let validation_result = array_data.validate_full(); - let error = validation_result.unwrap_err(); - assert_eq!( - "Invalid argument error: 123456 is too large to store in a Decimal128 of precision 5. Max is 99999", - error.to_string() - ); - } - - #[test] - fn test_decimal_validation() { - let mut builder = Decimal128Builder::with_capacity(4, 10, 4); - builder.append_value(10000).unwrap(); - builder.append_value(20000).unwrap(); - let array = builder.finish(); - - array.data().validate_full().unwrap(); - } - - #[test] - #[cfg(not(feature = "force_validate"))] - fn test_sliced_array_child() { - let values = Int32Array::from_iter_values([1, 2, 3]); - let values_sliced = values.slice(1, 2); - let offsets = Buffer::from_iter([1_i32, 3_i32]); - - let list_field = Field::new("element", DataType::Int32, false); - let data_type = DataType::List(Box::new(list_field)); - - let data = unsafe { - ArrayData::new_unchecked( - data_type, - 1, - None, - None, - 0, - vec![offsets], - vec![values_sliced.into_data()], - ) - }; - - let err = data.validate_values().unwrap_err(); - assert_eq!(err.to_string(), "Invalid argument error: Offset invariant failure: offset at position 1 out of bounds: 3 > 2"); - } - - #[test] - fn test_contains_nulls() { - let buffer: Buffer = - MutableBuffer::from_iter([false, false, false, true, true, false]).into(); - - assert!(contains_nulls(Some(&buffer), 0, 6)); - assert!(contains_nulls(Some(&buffer), 0, 3)); - assert!(!contains_nulls(Some(&buffer), 3, 2)); - assert!(!contains_nulls(Some(&buffer), 0, 0)); - } } diff --git a/arrow/src/datatypes/decimal.rs b/arrow-data/src/decimal.rs similarity index 95% rename from arrow/src/datatypes/decimal.rs rename to arrow-data/src/decimal.rs index ffdb04e0d775..592a461ad5cd 100644 --- a/arrow/src/datatypes/decimal.rs +++ b/arrow-data/src/decimal.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. 
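The `validate_decimal_precision` function whose visibility changes just below checks that an `i128` value fits within the declared number of decimal digits; the removed test above gives the bound for precision 5 (max 99999). A minimal usage sketch, assuming the helper ends up reachable as `arrow_data::decimal::validate_decimal_precision` (module path assumed, not stated by this patch):

use arrow_data::decimal::validate_decimal_precision; // assumed path

fn main() {
    // 99_999 is the largest value a Decimal128 of precision 5 can hold
    assert!(validate_decimal_precision(99_999_i128, 5).is_ok());
    // 123_456 needs six digits, so precision 5 is rejected with an InvalidArgumentError
    assert!(validate_decimal_precision(123_456_i128, 5).is_err());
}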
-use crate::error::{ArrowError, Result}; -use crate::util::decimal::singed_cmp_le_bytes; +use arrow_schema::ArrowError; use num::BigInt; use std::cmp::Ordering;
@@ -745,7 +744,7 @@ pub const DECIMAL_DEFAULT_SCALE: u8 = 10; /// Validates that the specified `i128` value can be properly /// interpreted as a Decimal number with precision `precision` #[inline] -pub(crate) fn validate_decimal_precision(value: i128, precision: u8) -> Result<()> { +pub fn validate_decimal_precision(value: i128, precision: u8) -> Result<(), ArrowError> { if precision > DECIMAL128_MAX_PRECISION { return Err(ArrowError::InvalidArgumentError(format!( "Max precision of a Decimal128 is {}, but got {}",
@@ -774,10 +773,10 @@ pub(crate) fn validate_decimal_precision(value: i128, precision: u8) -> Result<( /// Validates that the specified `byte_array` of little-endian format /// value can be properly interpreted as a Decimal256 number with precision `precision` #[inline] -pub(crate) fn validate_decimal256_precision_with_lt_bytes( +pub fn validate_decimal256_precision_with_lt_bytes( lt_value: &[u8], precision: u8, -) -> Result<()> { +) -> Result<(), ArrowError> { if precision > DECIMAL256_MAX_PRECISION { return Err(ArrowError::InvalidArgumentError(format!( "Max precision of a Decimal256 is {}, but got {}",
@@ -806,28 +805,44 @@ pub(crate) fn validate_decimal256_precision_with_lt_bytes( } } -#[cfg(test)] -mod test { - use super::*; - use crate::util::decimal::Decimal256; - use num::{BigInt, Num}; - - #[test] - fn test_decimal256_min_max_for_precision() { - // The precision from 1 to 76 - let mut max_value = "9".to_string(); - let mut min_value = "-9".to_string(); - for i in 1..77 { - let max_decimal = - Decimal256::from(BigInt::from_str_radix(max_value.as_str(), 10).unwrap()); - let min_decimal = - Decimal256::from(BigInt::from_str_radix(min_value.as_str(), 10).unwrap()); - let max_bytes = MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[i - 1]; - let min_bytes = MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[i - 1]; - max_value += "9"; - min_value += "9"; - assert_eq!(max_decimal.raw_value(), &max_bytes); - assert_eq!(min_decimal.raw_value(), &min_bytes); +// Compares two signed integers that are encoded as little-endian bytes. +// The left and right byte slices must have the same length.
+#[inline] +pub fn singed_cmp_le_bytes(left: &[u8], right: &[u8]) -> Ordering { + assert_eq!( + left.len(), + right.len(), + "Can't compare bytes array with different len: {}, {}", + left.len(), + right.len() + ); + assert_ne!(left.len(), 0, "Can't compare bytes array of length 0"); + let len = left.len(); + // the sign bit is 1, the value is negative + let left_negative = left[len - 1] >= 0x80_u8; + let right_negative = right[len - 1] >= 0x80_u8; + if left_negative != right_negative { + return match left_negative { + true => { + // left is negative value + // right is positive value + Ordering::Less + } + false => Ordering::Greater, + }; + } + for i in 0..len { + let l_byte = left[len - 1 - i]; + let r_byte = right[len - 1 - i]; + match l_byte.cmp(&r_byte) { + Ordering::Less => { + return Ordering::Less; + } + Ordering::Greater => { + return Ordering::Greater; + } + Ordering::Equal => {} } } + Ordering::Equal } diff --git a/arrow/src/array/equal/boolean.rs b/arrow-data/src/equal/boolean.rs similarity index 77% rename from arrow/src/array/equal/boolean.rs rename to arrow-data/src/equal/boolean.rs index fddf21b963ad..52e822f03f30 100644 --- a/arrow/src/array/equal/boolean.rs +++ b/arrow-data/src/equal/boolean.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{data::contains_nulls, ArrayData}; -use crate::util::bit_iterator::BitIndexIterator; -use crate::util::bit_util::get_bit; +use crate::bit_iterator::BitIndexIterator; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; use super::utils::{equal_bits, equal_len}; @@ -88,30 +88,3 @@ pub(super) fn boolean_equal( }) } } - -#[cfg(test)] -mod tests { - use crate::array::{Array, BooleanArray}; - - #[test] - fn test_boolean_slice() { - let array = BooleanArray::from(vec![true; 32]); - let slice = array.slice(4, 12); - assert_eq!(slice.data(), slice.data()); - - let slice = array.slice(8, 12); - assert_eq!(slice.data(), slice.data()); - - let slice = array.slice(8, 24); - assert_eq!(slice.data(), slice.data()); - } - - #[test] - fn test_sliced_nullable_boolean_array() { - let a = BooleanArray::from(vec![None; 32]); - let b = BooleanArray::from(vec![true; 32]); - let slice_a = a.slice(1, 12); - let slice_b = b.slice(1, 12); - assert_ne!(slice_a.data(), slice_b.data()); - } -} diff --git a/arrow/src/array/equal/decimal.rs b/arrow-data/src/equal/decimal.rs similarity index 95% rename from arrow/src/array/equal/decimal.rs rename to arrow-data/src/equal/decimal.rs index 49112608c3a5..15703389cb8a 100644 --- a/arrow/src/array/equal/decimal.rs +++ b/arrow-data/src/equal/decimal.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{data::contains_nulls, ArrayData}; -use crate::datatypes::DataType; -use crate::util::bit_util::get_bit; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; +use arrow_schema::DataType; use super::utils::equal_len; diff --git a/arrow/src/array/equal/dictionary.rs b/arrow-data/src/equal/dictionary.rs similarity index 95% rename from arrow/src/array/equal/dictionary.rs rename to arrow-data/src/equal/dictionary.rs index 1474da5e2d21..5638c5c91c5c 100644 --- a/arrow/src/array/equal/dictionary.rs +++ b/arrow-data/src/equal/dictionary.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. 
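The `singed_cmp_le_bytes` helper added above orders two equal-length little-endian byte slices as signed integers, which is what the Decimal256 bound checks rely on. A small behavioural sketch, assuming the helper is reachable as `arrow_data::decimal::singed_cmp_le_bytes` (module path assumed):

use std::cmp::Ordering;
use arrow_data::decimal::singed_cmp_le_bytes; // assumed path

fn main() {
    for (a, b) in [(-3_i128, 5_i128), (7, 7), (i128::MIN, i128::MAX)] {
        // Comparing the little-endian encodings should agree with comparing the integers
        let got = singed_cmp_le_bytes(&a.to_le_bytes(), &b.to_le_bytes());
        assert_eq!(got, a.cmp(&b));
    }
    assert_eq!(
        singed_cmp_le_bytes(&1_i128.to_le_bytes(), &(-1_i128).to_le_bytes()),
        Ordering::Greater
    );
}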
-use crate::array::{data::contains_nulls, ArrayData}; -use crate::datatypes::ArrowNativeType; -use crate::util::bit_util::get_bit; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::{bit_util::get_bit, ArrowNativeType}; use super::equal_range; diff --git a/arrow/src/array/equal/fixed_binary.rs b/arrow-data/src/equal/fixed_binary.rs similarity index 95% rename from arrow/src/array/equal/fixed_binary.rs rename to arrow-data/src/equal/fixed_binary.rs index 58eb22bb19b0..d6af208016fa 100644 --- a/arrow/src/array/equal/fixed_binary.rs +++ b/arrow-data/src/equal/fixed_binary.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{data::contains_nulls, ArrayData}; -use crate::datatypes::DataType; -use crate::util::bit_util::get_bit; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; +use arrow_schema::DataType; use super::utils::equal_len; diff --git a/arrow/src/array/equal/fixed_list.rs b/arrow-data/src/equal/fixed_list.rs similarity index 95% rename from arrow/src/array/equal/fixed_list.rs rename to arrow-data/src/equal/fixed_list.rs index 055bcece1358..204a8658e747 100644 --- a/arrow/src/array/equal/fixed_list.rs +++ b/arrow-data/src/equal/fixed_list.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{data::contains_nulls, ArrayData}; -use crate::datatypes::DataType; -use crate::util::bit_util::get_bit; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; +use arrow_schema::DataType; use super::equal_range; diff --git a/arrow/src/array/equal/list.rs b/arrow-data/src/equal/list.rs similarity index 75% rename from arrow/src/array/equal/list.rs rename to arrow-data/src/equal/list.rs index b3bca9a69228..25273f8bad63 100644 --- a/arrow/src/array/equal/list.rs +++ b/arrow-data/src/equal/list.rs @@ -15,15 +15,14 @@ // specific language governing permissions and limitations // under the License. 
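The dictionary comparison touched above (`dictionary_equal`) is a logical one: it compares the values the keys resolve to rather than the raw key and dictionary buffers, so two different encodings of the same decoded values should compare equal. A hedged sketch of that behaviour, constructed the same way as the dictionary tests moved out of `data.rs` (the `arrow` crate paths are assumed):

use arrow::array::{Array, ArrayData, StringArray};
use arrow::buffer::Buffer;
use arrow::datatypes::DataType;

fn main() {
    let dict_type =
        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));

    // Logical content ["foo", "bar"]: dictionary ["foo", "bar"] with keys [0, 1]
    let values_a: StringArray = vec![Some("foo"), Some("bar")].into_iter().collect();
    let keys_a = Buffer::from_slice_ref(&[0i32, 1i32]);
    let a = ArrayData::try_new(dict_type.clone(), 2, None, 0, vec![keys_a], vec![values_a.into_data()]).unwrap();

    // Same logical content: dictionary ["bar", "foo"] with keys [1, 0]
    let values_b: StringArray = vec![Some("bar"), Some("foo")].into_iter().collect();
    let keys_b = Buffer::from_slice_ref(&[1i32, 0i32]);
    let b = ArrayData::try_new(dict_type, 2, None, 0, vec![keys_b], vec![values_b.into_data()]).unwrap();

    // PartialEq on ArrayData goes through equal(), which decodes through the keys
    assert_eq!(a, b);
}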
-use crate::{ - array::ArrayData, - array::{data::count_nulls, OffsetSizeTrait}, - util::bit_util::get_bit, -}; +use crate::data::{count_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; +use arrow_buffer::ArrowNativeType; +use num::Integer; use super::equal_range; -fn lengths_equal(lhs: &[T], rhs: &[T]) -> bool { +fn lengths_equal(lhs: &[T], rhs: &[T]) -> bool { // invariant from `base_equal` debug_assert_eq!(lhs.len(), rhs.len()); @@ -45,7 +44,7 @@ fn lengths_equal(lhs: &[T], rhs: &[T]) -> bool { }) } -pub(super) fn list_equal( +pub(super) fn list_equal( lhs: &ArrayData, rhs: &ArrayData, lhs_start: usize, @@ -149,52 +148,3 @@ pub(super) fn list_equal( }) } } - -#[cfg(test)] -mod tests { - use crate::{ - array::{Array, Int64Builder, ListArray, ListBuilder}, - datatypes::Int32Type, - }; - - #[test] - fn list_array_non_zero_nulls() { - // Tests handling of list arrays with non-empty null ranges - let mut builder = ListBuilder::new(Int64Builder::with_capacity(10)); - builder.values().append_value(1); - builder.values().append_value(2); - builder.values().append_value(3); - builder.append(true); - builder.append(false); - let array1 = builder.finish(); - - let mut builder = ListBuilder::new(Int64Builder::with_capacity(10)); - builder.values().append_value(1); - builder.values().append_value(2); - builder.values().append_value(3); - builder.append(true); - builder.values().append_null(); - builder.values().append_null(); - builder.append(false); - let array2 = builder.finish(); - - assert_eq!(array1, array2); - } - - #[test] - fn test_list_different_offsets() { - let a = ListArray::from_iter_primitive::([ - Some([Some(0), Some(0)]), - Some([Some(1), Some(2)]), - Some([None, None]), - ]); - let b = ListArray::from_iter_primitive::([ - Some([Some(1), Some(2)]), - Some([None, None]), - Some([None, None]), - ]); - let a_slice = a.slice(1, 2); - let b_slice = b.slice(0, 2); - assert_eq!(&a_slice, &b_slice); - } -} diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs new file mode 100644 index 000000000000..063ef64d4d84 --- /dev/null +++ b/arrow-data/src/equal/mod.rs @@ -0,0 +1,171 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Module containing functionality to compute array equality. +//! This module uses [ArrayData] and does not +//! depend on dynamic casting of `Array`. + +use crate::data::ArrayData; +use arrow_schema::{DataType, IntervalUnit}; +use half::f16; + +mod boolean; +mod decimal; +mod dictionary; +mod fixed_binary; +mod fixed_list; +mod list; +mod null; +mod primitive; +mod structure; +mod union; +mod utils; +mod variable_size; + +// these methods assume the same type, len and null count. 
+// For this reason, they are not exposed and are instead used +// to build the generic functions below (`equal_range` and `equal`). +use boolean::boolean_equal; +use decimal::decimal_equal; +use dictionary::dictionary_equal; +use fixed_binary::fixed_binary_equal; +use fixed_list::fixed_list_equal; +use list::list_equal; +use null::null_equal; +use primitive::primitive_equal; +use structure::struct_equal; +use union::union_equal; +use variable_size::variable_sized_equal; + +/// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively +/// for `len` slots. +#[inline] +fn equal_values( + lhs: &ArrayData, + rhs: &ArrayData, + lhs_start: usize, + rhs_start: usize, + len: usize, +) -> bool { + match lhs.data_type() { + DataType::Null => null_equal(lhs, rhs, lhs_start, rhs_start, len), + DataType::Boolean => boolean_equal(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt8 => primitive_equal::<u8>(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt16 => primitive_equal::<u16>(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt32 => primitive_equal::<u32>(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt64 => primitive_equal::<u64>(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int8 => primitive_equal::<i8>(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int16 => primitive_equal::<i16>(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int32 => primitive_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int64 => primitive_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len), + DataType::Float32 => primitive_equal::<f32>(lhs, rhs, lhs_start, rhs_start, len), + DataType::Float64 => primitive_equal::<f64>(lhs, rhs, lhs_start, rhs_start, len), + DataType::Date32 + | DataType::Time32(_) + | DataType::Interval(IntervalUnit::YearMonth) => { + primitive_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Date64 + | DataType::Interval(IntervalUnit::DayTime) + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Duration(_) => { + primitive_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + primitive_equal::<i128>(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Utf8 | DataType::Binary => { + variable_sized_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::LargeUtf8 | DataType::LargeBinary => { + variable_sized_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::FixedSizeBinary(_) => { + fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { + decimal_equal(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::List(_) => list_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len), + DataType::LargeList(_) => list_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len), + DataType::FixedSizeList(_, _) => { + fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len), + DataType::Union(_, _, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len), + DataType::Dictionary(data_type, _) => match data_type.as_ref() { + DataType::Int8 => dictionary_equal::<i8>(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int16 => { + dictionary_equal::<i16>(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Int32 => { + dictionary_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Int64 => { + dictionary_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::UInt8 => { + dictionary_equal::<u8>(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::UInt16 => { + dictionary_equal::<u16>(lhs, rhs,
lhs_start, rhs_start, len) + } + DataType::UInt32 => { + dictionary_equal::<u32>(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::UInt64 => { + dictionary_equal::<u64>(lhs, rhs, lhs_start, rhs_start, len) + } + _ => unreachable!(), + }, + DataType::Float16 => primitive_equal::<f16>(lhs, rhs, lhs_start, rhs_start, len), + DataType::Map(_, _) => list_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len), + } +} + +fn equal_range( + lhs: &ArrayData, + rhs: &ArrayData, + lhs_start: usize, + rhs_start: usize, + len: usize, +) -> bool { + utils::equal_nulls(lhs, rhs, lhs_start, rhs_start, len) + && equal_values(lhs, rhs, lhs_start, rhs_start, len) +} + +/// Logically compares two [ArrayData]. +/// Two arrays are logically equal if and only if: +/// * their data types are equal +/// * their lengths are equal +/// * their null counts are equal +/// * their null bitmaps are equal +/// * each of their items is equal +/// Two items are equal when their in-memory representation is physically equal (i.e. same bit content). +/// The physical comparison depends on the data type. +/// # Panics +/// This function may panic whenever any of the [ArrayData] does not follow the Arrow specification. +/// (e.g. wrong number of buffers, buffer `len` does not correspond to the declared `len`) +pub fn equal(lhs: &ArrayData, rhs: &ArrayData) -> bool { + utils::base_equal(lhs, rhs) + && lhs.null_count() == rhs.null_count() + && utils::equal_nulls(lhs, rhs, 0, 0, lhs.len()) + && equal_values(lhs, rhs, 0, 0, lhs.len()) +} + +// See arrow/tests/array_equal.rs for tests diff --git a/arrow/src/array/equal/null.rs b/arrow-data/src/equal/null.rs similarity index 97% rename from arrow/src/array/equal/null.rs rename to arrow-data/src/equal/null.rs index f287a382507a..1478e448cec2 100644 --- a/arrow/src/array/equal/null.rs +++ b/arrow-data/src/equal/null.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::array::ArrayData; +use crate::data::ArrayData; #[inline] pub(super) fn null_equal( diff --git a/arrow/src/array/equal/primitive.rs b/arrow-data/src/equal/primitive.rs similarity index 96% rename from arrow/src/array/equal/primitive.rs rename to arrow-data/src/equal/primitive.rs index b82d3213ab03..e619375d5314 100644 --- a/arrow/src/array/equal/primitive.rs +++ b/arrow-data/src/equal/primitive.rs @@ -17,8 +17,8 @@ use std::mem::size_of; -use crate::array::{data::contains_nulls, ArrayData}; -use crate::util::bit_util::get_bit; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; use super::utils::equal_len; diff --git a/arrow/src/array/equal/structure.rs b/arrow-data/src/equal/structure.rs similarity index 96% rename from arrow/src/array/equal/structure.rs rename to arrow-data/src/equal/structure.rs index 384376c3468d..25ab340cd3f8 100644 --- a/arrow/src/array/equal/structure.rs +++ b/arrow-data/src/equal/structure.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License.
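The relocated `equal` entry point above is what backs `PartialEq` on `ArrayData`, and through the impls later in this patch, the `==` operator on arrays. A minimal sketch of the logical-equality semantics described in the doc comment, written against the public `arrow` facade and assuming it keeps re-exporting `Int32Array` and `Array::data()` as before (illustrative only, not part of the patch):

    use arrow::array::{Array, Int32Array};

    fn logical_equality_ignores_offsets() {
        let a = Int32Array::from(vec![1, 2, 3, 4, 5]);
        let b = Int32Array::from(vec![0, 2, 3, 4, 9]);

        // Rows 1..4 of both arrays hold the same values, so the sliced
        // ArrayData compare equal even though their offsets and backing
        // buffers differ. Equality is logical, not byte-for-byte.
        let a_slice = a.slice(1, 3);
        let b_slice = b.slice(1, 3);
        assert_eq!(a_slice.data(), b_slice.data());
    }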
-use crate::{array::data::contains_nulls, array::ArrayData, util::bit_util::get_bit}; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; use super::equal_range; diff --git a/arrow/src/array/equal/union.rs b/arrow-data/src/equal/union.rs similarity index 98% rename from arrow/src/array/equal/union.rs rename to arrow-data/src/equal/union.rs index e8b9d27b6f0f..fdf770096867 100644 --- a/arrow/src/array/equal/union.rs +++ b/arrow-data/src/equal/union.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::ArrayData, datatypes::DataType, datatypes::UnionMode}; +use crate::data::ArrayData; +use arrow_schema::{DataType, UnionMode}; use super::equal_range; diff --git a/arrow/src/array/equal/utils.rs b/arrow-data/src/equal/utils.rs similarity index 96% rename from arrow/src/array/equal/utils.rs rename to arrow-data/src/equal/utils.rs index 449055d366ec..b3f7fc0b06ef 100644 --- a/arrow/src/array/equal/utils.rs +++ b/arrow-data/src/equal/utils.rs @@ -15,10 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::array::data::contains_nulls; -use crate::array::ArrayData; -use crate::datatypes::DataType; -use crate::util::bit_chunk_iterator::BitChunks; +use crate::data::{contains_nulls, ArrayData}; +use arrow_buffer::bit_chunk_iterator::BitChunks; +use arrow_schema::DataType; // whether bits along the positions are equal // `lhs_start`, `rhs_start` and `len` are _measured in bits_. diff --git a/arrow/src/array/equal/variable_size.rs b/arrow-data/src/equal/variable_size.rs similarity index 92% rename from arrow/src/array/equal/variable_size.rs rename to arrow-data/src/equal/variable_size.rs index f40f79e404ac..b4445db54bb1 100644 --- a/arrow/src/array/equal/variable_size.rs +++ b/arrow-data/src/equal/variable_size.rs @@ -15,15 +15,14 @@ // specific language governing permissions and limitations // under the License. -use crate::util::bit_util::get_bit; -use crate::{ - array::data::count_nulls, - array::{ArrayData, OffsetSizeTrait}, -}; +use crate::data::{count_nulls, ArrayData}; +use arrow_buffer::bit_util::get_bit; +use arrow_buffer::ArrowNativeType; +use num::Integer; use super::utils::equal_len; -fn offset_value_equal( +fn offset_value_equal( lhs_values: &[u8], rhs_values: &[u8], lhs_offsets: &[T], @@ -47,7 +46,7 @@ fn offset_value_equal( ) } -pub(super) fn variable_sized_equal( +pub(super) fn variable_sized_equal( lhs: &ArrayData, rhs: &ArrayData, lhs_start: usize, diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs new file mode 100644 index 000000000000..9b7e307db360 --- /dev/null +++ b/arrow-data/src/lib.rs @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Buffer abstractions for [Apache Arrow](https://docs.rs/arrow) + +mod bitmap; +pub use bitmap::Bitmap; +mod data; +pub use data::*; + +mod equal; +pub mod transform; + +pub mod bit_iterator; +pub mod bit_mask; +pub mod decimal; diff --git a/arrow/src/array/transform/boolean.rs b/arrow-data/src/transform/boolean.rs similarity index 95% rename from arrow/src/array/transform/boolean.rs rename to arrow-data/src/transform/boolean.rs index e0b6231a226e..d93fa15a4e0f 100644 --- a/arrow/src/array/transform/boolean.rs +++ b/arrow-data/src/transform/boolean.rs @@ -16,8 +16,8 @@ // under the License. use super::{Extend, _MutableArrayData, utils::resize_for_bits}; -use crate::array::ArrayData; -use crate::util::bit_mask::set_bits; +use crate::bit_mask::set_bits; +use crate::ArrayData; pub(super) fn build_extend(array: &ArrayData) -> Extend { let values = array.buffers()[0].as_slice(); diff --git a/arrow/src/array/transform/fixed_binary.rs b/arrow-data/src/transform/fixed_binary.rs similarity index 97% rename from arrow/src/array/transform/fixed_binary.rs rename to arrow-data/src/transform/fixed_binary.rs index 6d6262ca3c4e..fe21a6bc382d 100644 --- a/arrow/src/array/transform/fixed_binary.rs +++ b/arrow-data/src/transform/fixed_binary.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::ArrayData, datatypes::DataType}; - use super::{Extend, _MutableArrayData}; +use crate::ArrayData; +use arrow_schema::DataType; pub(super) fn build_extend(array: &ArrayData) -> Extend { let size = match array.data_type() { diff --git a/arrow/src/array/transform/fixed_size_list.rs b/arrow-data/src/transform/fixed_size_list.rs similarity index 97% rename from arrow/src/array/transform/fixed_size_list.rs rename to arrow-data/src/transform/fixed_size_list.rs index 77912a7026fd..ad369c2be8a0 100644 --- a/arrow/src/array/transform/fixed_size_list.rs +++ b/arrow-data/src/transform/fixed_size_list.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::array::ArrayData; -use crate::datatypes::DataType; +use crate::ArrayData; +use arrow_schema::DataType; use super::{Extend, _MutableArrayData}; diff --git a/arrow/src/array/transform/list.rs b/arrow-data/src/transform/list.rs similarity index 92% rename from arrow/src/array/transform/list.rs rename to arrow-data/src/transform/list.rs index 8eb2bd1778d3..f318d46f498e 100644 --- a/arrow/src/array/transform/list.rs +++ b/arrow-data/src/transform/list.rs @@ -15,14 +15,15 @@ // specific language governing permissions and limitations // under the License. 
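The new `arrow-data/src/lib.rs` above defines the crate's public surface: `ArrayData` and its builder are re-exported at the crate root via `pub use data::*`, while `MutableArrayData` lives under the public `transform` module. A sketch of the resulting import paths, assuming the crate is published as `arrow-data` 23.0.0 as the `arrow/Cargo.toml` change later in this patch suggests (illustrative only, not part of the patch):

    use arrow_data::transform::MutableArrayData;
    use arrow_data::ArrayData;
    use arrow_schema::DataType;

    fn new_crate_paths() {
        // ArrayData comes from the crate root; MutableArrayData from `transform`.
        let empty = ArrayData::new_empty(&DataType::Int32);
        let mutable = MutableArrayData::new(vec![&empty], false, 0);
        assert!(mutable.is_empty());
    }

The arrow crate itself continues to expose these types through `arrow::array`, so downstream users are not expected to need the new paths unless they depend on `arrow-data` directly.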
-use crate::array::{ArrayData, OffsetSizeTrait}; - use super::{ Extend, _MutableArrayData, utils::{extend_offsets, get_last_offset}, }; +use crate::ArrayData; +use arrow_buffer::ArrowNativeType; +use num::Integer; -pub(super) fn build_extend(array: &ArrayData) -> Extend { +pub(super) fn build_extend(array: &ArrayData) -> Extend { let offsets = array.buffer::(0); if array.null_count() == 0 { // fast case where we can copy regions without nullability checks @@ -69,7 +70,7 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { (start..start + len).for_each(|i| { if array.is_valid(i) { // compute the new offset - last_offset += offsets[i + 1] - offsets[i]; + last_offset = last_offset + offsets[i + 1] - offsets[i]; // append value child.extend( @@ -86,7 +87,7 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { } } -pub(super) fn extend_nulls( +pub(super) fn extend_nulls( mutable: &mut _MutableArrayData, len: usize, ) { diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs new file mode 100644 index 000000000000..c34376aaba29 --- /dev/null +++ b/arrow-data/src/transform/mod.rs @@ -0,0 +1,672 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use super::{ + data::{into_buffers, new_buffers}, + ArrayData, ArrayDataBuilder, +}; +use crate::bit_mask::set_bits; +use arrow_buffer::{bit_util, ArrowNativeType, MutableBuffer}; +use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode}; +use half::f16; +use num::Integer; +use std::mem; + +mod boolean; +mod fixed_binary; +mod fixed_size_list; +mod list; +mod null; +mod primitive; +mod structure; +mod union; +mod utils; +mod variable_size; + +type ExtendNullBits<'a> = Box; +// function that extends `[start..start+len]` to the mutable array. +// this is dynamic because different data_types influence how buffers and children are extended. +type Extend<'a> = Box; + +type ExtendNulls = Box; + +/// A mutable [ArrayData] that knows how to freeze itself into an [ArrayData]. +/// This is just a data container. +#[derive(Debug)] +struct _MutableArrayData<'a> { + pub data_type: DataType, + pub null_count: usize, + + pub len: usize, + pub null_buffer: MutableBuffer, + + // arrow specification only allows up to 3 buffers (2 ignoring the nulls above). + // Thus, we place them in the stack to avoid bound checks and greater data locality. 
+ pub buffer1: MutableBuffer, + pub buffer2: MutableBuffer, + pub child_data: Vec>, +} + +impl<'a> _MutableArrayData<'a> { + fn freeze(self, dictionary: Option) -> ArrayDataBuilder { + let buffers = into_buffers(&self.data_type, self.buffer1, self.buffer2); + + let child_data = match self.data_type { + DataType::Dictionary(_, _) => vec![dictionary.unwrap()], + _ => { + let mut child_data = Vec::with_capacity(self.child_data.len()); + for child in self.child_data { + child_data.push(child.freeze()); + } + child_data + } + }; + + ArrayDataBuilder::new(self.data_type) + .offset(0) + .len(self.len) + .null_count(self.null_count) + .buffers(buffers) + .child_data(child_data) + .null_bit_buffer((self.null_count > 0).then(|| self.null_buffer.into())) + } +} + +fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits { + if let Some(bitmap) = array.null_bitmap() { + let bytes = bitmap.buffer().as_slice(); + Box::new(move |mutable, start, len| { + utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); + mutable.null_count += set_bits( + mutable.null_buffer.as_slice_mut(), + bytes, + mutable.len, + array.offset() + start, + len, + ); + }) + } else if use_nulls { + Box::new(|mutable, _, len| { + utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); + let write_data = mutable.null_buffer.as_slice_mut(); + let offset = mutable.len; + (0..len).for_each(|i| { + bit_util::set_bit(write_data, offset + i); + }); + }) + } else { + Box::new(|_, _, _| {}) + } +} + +/// Struct to efficiently and interactively create an [ArrayData] from an existing [ArrayData] by +/// copying chunks. +/// +/// The main use case of this struct is to perform unary operations to arrays of arbitrary types, +/// such as `filter` and `take`. +pub struct MutableArrayData<'a> { + #[allow(dead_code)] + arrays: Vec<&'a ArrayData>, + // The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to + // mutability invariants (interior mutability): + // [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not + // [MutableArrayData] itself + data: _MutableArrayData<'a>, + + // the child data of the `Array` in Dictionary arrays. + // This is not stored in `MutableArrayData` because these values constant and only needed + // at the end, when freezing [_MutableArrayData]. + dictionary: Option, + + // function used to extend values from arrays. This function's lifetime is bound to the array + // because it reads values from it. + extend_values: Vec>, + // function used to extend nulls from arrays. This function's lifetime is bound to the array + // because it reads nulls from it. + extend_null_bits: Vec>, + + // function used to extend nulls. + // this is independent of the arrays and therefore has no lifetime. + extend_nulls: ExtendNulls, +} + +impl<'a> std::fmt::Debug for MutableArrayData<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + // ignores the closures. + f.debug_struct("MutableArrayData") + .field("data", &self.data) + .finish() + } +} + +/// Builds an extend that adds `offset` to the source primitive +/// Additionally validates that `max` fits into the +/// the underlying primitive returning None if not +fn build_extend_dictionary( + array: &ArrayData, + offset: usize, + max: usize, +) -> Option { + macro_rules! 
validate_and_build { + ($dt: ty) => {{ + let _: $dt = max.try_into().ok()?; + let offset: $dt = offset.try_into().ok()?; + Some(primitive::build_extend_with_offset(array, offset)) + }}; + } + match array.data_type() { + DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { + DataType::UInt8 => validate_and_build!(u8), + DataType::UInt16 => validate_and_build!(u16), + DataType::UInt32 => validate_and_build!(u32), + DataType::UInt64 => validate_and_build!(u64), + DataType::Int8 => validate_and_build!(i8), + DataType::Int16 => validate_and_build!(i16), + DataType::Int32 => validate_and_build!(i32), + DataType::Int64 => validate_and_build!(i64), + _ => unreachable!(), + }, + _ => None, + } +} + +fn build_extend(array: &ArrayData) -> Extend { + match array.data_type() { + DataType::Decimal128(_, _) => primitive::build_extend::(array), + DataType::Null => null::build_extend(array), + DataType::Boolean => boolean::build_extend(array), + DataType::UInt8 => primitive::build_extend::(array), + DataType::UInt16 => primitive::build_extend::(array), + DataType::UInt32 => primitive::build_extend::(array), + DataType::UInt64 => primitive::build_extend::(array), + DataType::Int8 => primitive::build_extend::(array), + DataType::Int16 => primitive::build_extend::(array), + DataType::Int32 => primitive::build_extend::(array), + DataType::Int64 => primitive::build_extend::(array), + DataType::Float32 => primitive::build_extend::(array), + DataType::Float64 => primitive::build_extend::(array), + DataType::Date32 + | DataType::Time32(_) + | DataType::Interval(IntervalUnit::YearMonth) => { + primitive::build_extend::(array) + } + DataType::Date64 + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Duration(_) + | DataType::Interval(IntervalUnit::DayTime) => { + primitive::build_extend::(array) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + primitive::build_extend::(array) + } + DataType::Utf8 | DataType::Binary => variable_size::build_extend::(array), + DataType::LargeUtf8 | DataType::LargeBinary => { + variable_size::build_extend::(array) + } + DataType::Map(_, _) | DataType::List(_) => list::build_extend::(array), + DataType::LargeList(_) => list::build_extend::(array), + DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"), + DataType::Struct(_) => structure::build_extend(array), + DataType::FixedSizeBinary(_) | DataType::Decimal256(_, _) => { + fixed_binary::build_extend(array) + } + DataType::Float16 => primitive::build_extend::(array), + DataType::FixedSizeList(_, _) => fixed_size_list::build_extend(array), + DataType::Union(_, _, mode) => match mode { + UnionMode::Sparse => union::build_extend_sparse(array), + UnionMode::Dense => union::build_extend_dense(array), + }, + } +} + +fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { + Box::new(match data_type { + DataType::Decimal128(_, _) => primitive::extend_nulls::, + DataType::Null => null::extend_nulls, + DataType::Boolean => boolean::extend_nulls, + DataType::UInt8 => primitive::extend_nulls::, + DataType::UInt16 => primitive::extend_nulls::, + DataType::UInt32 => primitive::extend_nulls::, + DataType::UInt64 => primitive::extend_nulls::, + DataType::Int8 => primitive::extend_nulls::, + DataType::Int16 => primitive::extend_nulls::, + DataType::Int32 => primitive::extend_nulls::, + DataType::Int64 => primitive::extend_nulls::, + DataType::Float32 => primitive::extend_nulls::, + DataType::Float64 => primitive::extend_nulls::, + DataType::Date32 + | DataType::Time32(_) 
+ | DataType::Interval(IntervalUnit::YearMonth) => primitive::extend_nulls::, + DataType::Date64 + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Duration(_) + | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::, + DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::, + DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::, + DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, + DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::, + DataType::LargeList(_) => list::extend_nulls::, + DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { + DataType::UInt8 => primitive::extend_nulls::, + DataType::UInt16 => primitive::extend_nulls::, + DataType::UInt32 => primitive::extend_nulls::, + DataType::UInt64 => primitive::extend_nulls::, + DataType::Int8 => primitive::extend_nulls::, + DataType::Int16 => primitive::extend_nulls::, + DataType::Int32 => primitive::extend_nulls::, + DataType::Int64 => primitive::extend_nulls::, + _ => unreachable!(), + }, + DataType::Struct(_) => structure::extend_nulls, + DataType::FixedSizeBinary(_) | DataType::Decimal256(_, _) => { + fixed_binary::extend_nulls + } + DataType::Float16 => primitive::extend_nulls::, + DataType::FixedSizeList(_, _) => fixed_size_list::extend_nulls, + DataType::Union(_, _, mode) => match mode { + UnionMode::Sparse => union::extend_nulls_sparse, + UnionMode::Dense => union::extend_nulls_dense, + }, + }) +} + +fn preallocate_offset_and_binary_buffer( + capacity: usize, + binary_size: usize, +) -> [MutableBuffer; 2] { + // offsets + let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); + // safety: `unsafe` code assumes that this buffer is initialized with one element + buffer.push(Offset::zero()); + + [ + buffer, + MutableBuffer::new(binary_size * mem::size_of::()), + ] +} + +/// Define capacities of child data or data buffers. +#[derive(Debug, Clone)] +pub enum Capacities { + /// Binary, Utf8 and LargeUtf8 data types + /// Define + /// * the capacity of the array offsets + /// * the capacity of the binary/ str buffer + Binary(usize, Option), + /// List and LargeList data types + /// Define + /// * the capacity of the array offsets + /// * the capacity of the child data + List(usize, Option>), + /// Struct type + /// * the capacity of the array + /// * the capacities of the fields + Struct(usize, Option>), + /// Dictionary type + /// * the capacity of the array/keys + /// * the capacity of the values + Dictionary(usize, Option>), + /// Don't preallocate inner buffers and rely on array growth strategy + Array(usize), +} +impl<'a> MutableArrayData<'a> { + /// returns a new [MutableArrayData] with capacity to `capacity` slots and specialized to create an + /// [ArrayData] from multiple `arrays`. + /// + /// `use_nulls` is a flag used to optimize insertions. It should be `false` if the only source of nulls + /// are the arrays themselves and `true` if the user plans to call [MutableArrayData::extend_nulls]. + /// In other words, if `use_nulls` is `false`, calling [MutableArrayData::extend_nulls] should not be used. + pub fn new(arrays: Vec<&'a ArrayData>, use_nulls: bool, capacity: usize) -> Self { + Self::with_capacities(arrays, use_nulls, Capacities::Array(capacity)) + } + + /// Similar to [MutableArrayData::new], but lets users define the preallocated capacities of the array. + /// See also [MutableArrayData::new] for more information on the arguments. 
+ /// + /// # Panic + /// This function panics if the given `capacities` don't match the data type of `arrays`. Or when + /// a [Capacities] variant is not yet supported. + pub fn with_capacities( + arrays: Vec<&'a ArrayData>, + use_nulls: bool, + capacities: Capacities, + ) -> Self { + let data_type = arrays[0].data_type(); + + // if any of the arrays has nulls, insertions from any array requires setting bits + // as there is at least one array with nulls. + let use_nulls = use_nulls | arrays.iter().any(|array| array.null_count() > 0); + + let mut array_capacity; + + let [buffer1, buffer2] = match (data_type, &capacities) { + ( + DataType::LargeUtf8 | DataType::LargeBinary, + Capacities::Binary(capacity, Some(value_cap)), + ) => { + array_capacity = *capacity; + preallocate_offset_and_binary_buffer::(*capacity, *value_cap) + } + ( + DataType::Utf8 | DataType::Binary, + Capacities::Binary(capacity, Some(value_cap)), + ) => { + array_capacity = *capacity; + preallocate_offset_and_binary_buffer::(*capacity, *value_cap) + } + (_, Capacities::Array(capacity)) => { + array_capacity = *capacity; + new_buffers(data_type, *capacity) + } + ( + DataType::List(_) | DataType::LargeList(_), + Capacities::List(capacity, _), + ) => { + array_capacity = *capacity; + new_buffers(data_type, *capacity) + } + _ => panic!("Capacities: {:?} not yet supported", capacities), + }; + + let child_data = match &data_type { + DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) + | DataType::Null + | DataType::Boolean + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float16 + | DataType::Float32 + | DataType::Float64 + | DataType::Date32 + | DataType::Date64 + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Duration(_) + | DataType::Timestamp(_, _) + | DataType::Utf8 + | DataType::Binary + | DataType::LargeUtf8 + | DataType::LargeBinary + | DataType::Interval(_) + | DataType::FixedSizeBinary(_) => vec![], + DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => { + let childs = arrays + .iter() + .map(|array| &array.child_data()[0]) + .collect::>(); + + let capacities = if let Capacities::List(capacity, ref child_capacities) = + capacities + { + child_capacities + .clone() + .map(|c| *c) + .unwrap_or(Capacities::Array(capacity)) + } else { + Capacities::Array(array_capacity) + }; + + vec![MutableArrayData::with_capacities( + childs, use_nulls, capacities, + )] + } + // the dictionary type just appends keys and clones the values. 
+ DataType::Dictionary(_, _) => vec![], + DataType::Struct(fields) => match capacities { + Capacities::Struct(capacity, Some(ref child_capacities)) => { + array_capacity = capacity; + (0..fields.len()) + .zip(child_capacities) + .map(|(i, child_cap)| { + let child_arrays = arrays + .iter() + .map(|array| &array.child_data()[i]) + .collect::>(); + MutableArrayData::with_capacities( + child_arrays, + use_nulls, + child_cap.clone(), + ) + }) + .collect::>() + } + Capacities::Struct(capacity, None) => { + array_capacity = capacity; + (0..fields.len()) + .map(|i| { + let child_arrays = arrays + .iter() + .map(|array| &array.child_data()[i]) + .collect::>(); + MutableArrayData::new(child_arrays, use_nulls, capacity) + }) + .collect::>() + } + _ => (0..fields.len()) + .map(|i| { + let child_arrays = arrays + .iter() + .map(|array| &array.child_data()[i]) + .collect::>(); + MutableArrayData::new(child_arrays, use_nulls, array_capacity) + }) + .collect::>(), + }, + DataType::FixedSizeList(_, _) => { + let childs = arrays + .iter() + .map(|array| &array.child_data()[0]) + .collect::>(); + vec![MutableArrayData::new(childs, use_nulls, array_capacity)] + } + DataType::Union(fields, _, _) => (0..fields.len()) + .map(|i| { + let child_arrays = arrays + .iter() + .map(|array| &array.child_data()[i]) + .collect::>(); + MutableArrayData::new(child_arrays, use_nulls, array_capacity) + }) + .collect::>(), + }; + + // Get the dictionary if any, and if it is a concatenation of multiple + let (dictionary, dict_concat) = match &data_type { + DataType::Dictionary(_, _) => { + // If more than one dictionary, concatenate dictionaries together + let dict_concat = !arrays + .windows(2) + .all(|a| a[0].child_data()[0].ptr_eq(&a[1].child_data()[0])); + + match dict_concat { + false => (Some(arrays[0].child_data()[0].clone()), false), + true => { + if let Capacities::Dictionary(_, _) = capacities { + panic!("dictionary capacity not yet supported") + } + let dictionaries: Vec<_> = + arrays.iter().map(|array| &array.child_data()[0]).collect(); + let lengths: Vec<_> = dictionaries + .iter() + .map(|dictionary| dictionary.len()) + .collect(); + let capacity = lengths.iter().sum(); + + let mut mutable = + MutableArrayData::new(dictionaries, false, capacity); + + for (i, len) in lengths.iter().enumerate() { + mutable.extend(i, 0, *len) + } + + (Some(mutable.freeze()), true) + } + } + } + _ => (None, false), + }; + + let extend_nulls = build_extend_nulls(data_type); + + let extend_null_bits = arrays + .iter() + .map(|array| build_extend_null_bits(array, use_nulls)) + .collect(); + + let null_buffer = if use_nulls { + let null_bytes = bit_util::ceil(array_capacity, 8); + MutableBuffer::from_len_zeroed(null_bytes) + } else { + // create 0 capacity mutable buffer with the intention that it won't be used + MutableBuffer::with_capacity(0) + }; + + let extend_values = match &data_type { + DataType::Dictionary(_, _) => { + let mut next_offset = 0; + let extend_values: Result, _> = arrays + .iter() + .map(|array| { + let offset = next_offset; + let dict_len = array.child_data()[0].len(); + + if dict_concat { + next_offset += dict_len; + } + + build_extend_dictionary(array, offset, offset + dict_len) + .ok_or(ArrowError::DictionaryKeyOverflowError) + }) + .collect(); + + extend_values.expect("MutableArrayData::new is infallible") + } + _ => arrays.iter().map(|array| build_extend(array)).collect(), + }; + + let data = _MutableArrayData { + data_type: data_type.clone(), + len: 0, + null_count: 0, + null_buffer, + buffer1, + buffer2, + 
child_data, + }; + Self { + arrays, + data, + dictionary, + extend_values, + extend_null_bits, + extend_nulls, + } + } + + /// Extends this array with a chunk of its source arrays + /// + /// # Arguments + /// * `index` - the index of the array that you want to copy values from + /// * `start` - the start index of the chunk (inclusive) + /// * `end` - the end index of the chunk (exclusive) + /// + /// # Panic + /// This function panics if there is an invalid index, + /// i.e. `index` >= the number of source arrays + /// or `end` > the length of the `index`th array + pub fn extend(&mut self, index: usize, start: usize, end: usize) { + let len = end - start; + (self.extend_null_bits[index])(&mut self.data, start, len); + (self.extend_values[index])(&mut self.data, index, start, len); + self.data.len += len; + } + + /// Extends this [MutableArrayData] with null elements, disregarding the bound arrays + pub fn extend_nulls(&mut self, len: usize) { + // TODO: null_buffer should probably be extended here as well + // otherwise is_valid() could later panic + // add test to confirm + self.data.null_count += len; + (self.extend_nulls)(&mut self.data, len); + self.data.len += len; + } + + /// Returns the current length + #[inline] + pub fn len(&self) -> usize { + self.data.len + } + + /// Returns true if len is 0 + #[inline] + pub fn is_empty(&self) -> bool { + self.data.len == 0 + } + + /// Returns the current null count + #[inline] + pub fn null_count(&self) -> usize { + self.data.null_count + } + + /// Creates an [ArrayData] from the pushed regions up to this point, consuming `self`. + pub fn freeze(self) -> ArrayData { + unsafe { self.data.freeze(self.dictionary).build_unchecked() } + } + + /// Creates an [ArrayDataBuilder] from the pushed regions up to this point, consuming `self`. + /// This is useful for extending the default behavior of MutableArrayData. + pub fn into_builder(self) -> ArrayDataBuilder { + self.data.freeze(self.dictionary) + } +} + +// See arrow/tests/array_transform.rs for tests of transform functionality + +#[cfg(test)] +mod test { + use super::*; + use arrow_schema::Field; + + #[test] + fn test_list_append_with_capacities() { + let array = ArrayData::new_empty(&DataType::List(Box::new(Field::new( + "element", + DataType::Int64, + false, + )))); + + let mutable = MutableArrayData::with_capacities( + vec![&array], + false, + Capacities::List(6, Some(Box::new(Capacities::Array(17)))), + ); + + // capacities are rounded up to multiples of 64 by MutableBuffer + assert_eq!(mutable.data.buffer1.capacity(), 64); + assert_eq!(mutable.data.child_data[0].data.buffer1.capacity(), 192); + } +} diff --git a/arrow/src/array/transform/null.rs b/arrow-data/src/transform/null.rs similarity index 97% rename from arrow/src/array/transform/null.rs rename to arrow-data/src/transform/null.rs index e1335e179713..5d1535564d9e 100644 --- a/arrow/src/array/transform/null.rs +++ b/arrow-data/src/transform/null.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License.
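The `extend` and `freeze` methods above are the core of the take/filter-style use case the module documents: copy ranges out of one or more source arrays, then freeze the result into a new `ArrayData`. A short usage sketch through the `arrow` facade, assuming it keeps re-exporting `MutableArrayData` and `make_array` as before (illustrative only, not part of the patch):

    use arrow::array::{make_array, Array, Int32Array, MutableArrayData};

    fn take_like_copy() {
        let a = Int32Array::from(vec![1, 2, 3]);
        let b = Int32Array::from(vec![4, 5, 6]);

        // Copy rows 1..3 of `a` followed by rows 0..2 of `b`.
        let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 4);
        mutable.extend(0, 1, 3);
        mutable.extend(1, 0, 2);

        let out = make_array(mutable.freeze());
        let expected = Int32Array::from(vec![2, 3, 4, 5]);
        assert_eq!(out.data(), expected.data());
    }

Passing `use_nulls = false` here is fine because neither source array contains nulls and `extend_nulls` is never called; otherwise `true` should be used, as the constructor documentation above explains.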
-use crate::array::ArrayData; - use super::{Extend, _MutableArrayData}; +use crate::ArrayData; pub(super) fn build_extend(_: &ArrayData) -> Extend { Box::new(move |_, _, _, _| {}) diff --git a/arrow/src/array/transform/primitive.rs b/arrow-data/src/transform/primitive.rs similarity index 96% rename from arrow/src/array/transform/primitive.rs rename to arrow-data/src/transform/primitive.rs index 4c765c0c0d95..b5c826438bfc 100644 --- a/arrow/src/array/transform/primitive.rs +++ b/arrow-data/src/transform/primitive.rs @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. +use crate::ArrayData; +use arrow_buffer::ArrowNativeType; use std::mem::size_of; use std::ops::Add; -use crate::{array::ArrayData, datatypes::ArrowNativeType}; - use super::{Extend, _MutableArrayData}; pub(super) fn build_extend(array: &ArrayData) -> Extend { diff --git a/arrow/src/array/transform/structure.rs b/arrow-data/src/transform/structure.rs similarity index 98% rename from arrow/src/array/transform/structure.rs rename to arrow-data/src/transform/structure.rs index 5c41d76a7f1c..c6841da4d83c 100644 --- a/arrow/src/array/transform/structure.rs +++ b/arrow-data/src/transform/structure.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::array::ArrayData; - use super::{Extend, _MutableArrayData}; +use crate::ArrayData; pub(super) fn build_extend(array: &ArrayData) -> Extend { if array.null_count() == 0 { diff --git a/arrow/src/array/transform/union.rs b/arrow-data/src/transform/union.rs similarity index 98% rename from arrow/src/array/transform/union.rs rename to arrow-data/src/transform/union.rs index bbea508219d0..8d1ea34c314d 100644 --- a/arrow/src/array/transform/union.rs +++ b/arrow-data/src/transform/union.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::array::ArrayData; - use super::{Extend, _MutableArrayData}; +use crate::ArrayData; pub(super) fn build_extend_sparse(array: &ArrayData) -> Extend { let type_ids = array.buffer::(0); diff --git a/arrow/src/array/transform/utils.rs b/arrow-data/src/transform/utils.rs similarity index 89% rename from arrow/src/array/transform/utils.rs rename to arrow-data/src/transform/utils.rs index 68aee79c41bb..6a4c240c9ae3 100644 --- a/arrow/src/array/transform/utils.rs +++ b/arrow-data/src/transform/utils.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::{array::OffsetSizeTrait, buffer::MutableBuffer, util::bit_util}; +use arrow_buffer::{bit_util, ArrowNativeType, MutableBuffer}; +use num::Integer; /// extends the `buffer` to be able to hold `len` bits, setting all bits of the new size to zero. 
#[inline] @@ -26,7 +27,7 @@ pub(super) fn resize_for_bits(buffer: &mut MutableBuffer, len: usize) { } } -pub(super) fn extend_offsets( +pub(super) fn extend_offsets( buffer: &mut MutableBuffer, mut last_offset: T, offsets: &[T], @@ -35,13 +36,13 @@ pub(super) fn extend_offsets( offsets.windows(2).for_each(|offsets| { // compute the new offset let length = offsets[1] - offsets[0]; - last_offset += length; + last_offset = last_offset + length; buffer.push(last_offset); }); } #[inline] -pub(super) unsafe fn get_last_offset( +pub(super) unsafe fn get_last_offset( offset_buffer: &MutableBuffer, ) -> T { // JUSTIFICATION diff --git a/arrow/src/array/transform/variable_size.rs b/arrow-data/src/transform/variable_size.rs similarity index 87% rename from arrow/src/array/transform/variable_size.rs rename to arrow-data/src/transform/variable_size.rs index c9304dbca200..73c4783189dc 100644 --- a/arrow/src/array/transform/variable_size.rs +++ b/arrow-data/src/transform/variable_size.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::{ - array::{ArrayData, OffsetSizeTrait}, - buffer::MutableBuffer, -}; +use crate::ArrayData; +use arrow_buffer::{ArrowNativeType, MutableBuffer}; +use num::traits::AsPrimitive; +use num::Integer; use super::{ Extend, _MutableArrayData, @@ -26,20 +26,22 @@ use super::{ }; #[inline] -fn extend_offset_values( +fn extend_offset_values>( buffer: &mut MutableBuffer, offsets: &[T], values: &[u8], start: usize, len: usize, ) { - let start_values = offsets[start].to_usize().unwrap(); - let end_values = offsets[start + len].to_usize().unwrap(); + let start_values = offsets[start].as_(); + let end_values = offsets[start + len].as_(); let new_values = &values[start_values..end_values]; buffer.extend_from_slice(new_values); } -pub(super) fn build_extend(array: &ArrayData) -> Extend { +pub(super) fn build_extend>( + array: &ArrayData, +) -> Extend { let offsets = array.buffer::(0); let values = array.buffers()[1].as_slice(); if array.null_count() == 0 { @@ -77,7 +79,7 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { if array.is_valid(i) { // compute the new offset let length = offsets[i + 1] - offsets[i]; - last_offset += length; + last_offset = last_offset + length; // append value let bytes = &values[offsets[i].to_usize().unwrap() @@ -92,7 +94,7 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { } } -pub(super) fn extend_nulls( +pub(super) fn extend_nulls( mutable: &mut _MutableArrayData, len: usize, ) { diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index f9e70eb8d77a..edfe2c680daf 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use pyo3::prelude::*; use pyo3::wrap_pyfunction; -use arrow::array::{ArrayData, ArrayRef, Int64Array}; +use arrow::array::{ArrayData, ArrayRef, Int64Array, make_array}; use arrow::compute::kernels; use arrow::datatypes::{DataType, Field, Schema}; use arrow::error::ArrowError; @@ -39,7 +39,7 @@ fn to_py_err(err: ArrowError) -> PyErr { #[pyfunction] fn double(array: &PyAny, py: Python) -> PyResult { // import - let array = ArrayRef::from_pyarrow(array)?; + let array = make_array(ArrayData::from_pyarrow(array)?); // perform some operation let array = array @@ -65,7 +65,7 @@ fn double_py(lambda: &PyAny, py: Python) -> PyResult { // to py let pyarray = array.to_pyarrow(py)?; let pyarray = lambda.call1((pyarray,))?; - let 
array = ArrayRef::from_pyarrow(pyarray)?; + let array = make_array(ArrayData::from_pyarrow(pyarray)?); Ok(array == expected) } @@ -77,7 +77,7 @@ fn substring( start: i64, ) -> PyResult> { // import - let array = ArrayRef::from(array.0); + let array = make_array(array.0); // substring let array = kernels::substring::substring(array.as_ref(), start, None).map_err(to_py_err)?; @@ -88,12 +88,12 @@ fn substring( /// Returns the concatenate #[pyfunction] fn concatenate(array: PyArrowType, py: Python) -> PyResult { - let array = ArrayRef::from(array.0); + let array = make_array(array.0); // concat let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).map_err(to_py_err)?; - array.to_pyarrow(py) + array.data().to_pyarrow(py) } #[pyfunction] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index d49acef335d6..f29c4e317914 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -45,6 +45,7 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] [dependencies] arrow-buffer = { version = "23.0.0", path = "../arrow-buffer" } +arrow-data = { version = "23.0.0", path = "../arrow-data" } arrow-schema = { version = "23.0.0", path = "../arrow-schema" } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } indexmap = { version = "1.9", default-features = false, features = ["std"] } @@ -88,7 +89,7 @@ pyarrow = ["pyo3", "ffi"] # force_validate runs full data validation for all arrays that are created # this is not enabled by default as it is too computationally expensive # but is run as part of our CI checks -force_validate = [] +force_validate = ["arrow-data/force_validate"] # Enable ffi support ffi = [] # Enable dyn-comparison of dictionary arrays with other arrays diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs index 38ba2025a2e3..2c2969c925d5 100644 --- a/arrow/src/array/array.rs +++ b/arrow/src/array/array.rs @@ -229,7 +229,7 @@ impl Array for ArrayRef { } fn into_data(self) -> ArrayData { - self.into() + self.data().clone() } fn data_ref(&self) -> &ArrayData { @@ -358,6 +358,90 @@ pub trait ArrayAccessor: Array { unsafe fn value_unchecked(&self, index: usize) -> Self::Item; } +impl PartialEq for dyn Array { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for dyn Array { + fn eq(&self, other: &T) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for NullArray { + fn eq(&self, other: &NullArray) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for PrimitiveArray { + fn eq(&self, other: &PrimitiveArray) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for DictionaryArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for BooleanArray { + fn eq(&self, other: &BooleanArray) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for GenericStringArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for GenericBinaryArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for FixedSizeBinaryArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for Decimal128Array { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for GenericListArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for MapArray { + fn eq(&self, other: 
&Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for FixedSizeListArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + +impl PartialEq for StructArray { + fn eq(&self, other: &Self) -> bool { + self.data().eq(other.data()) + } +} + /// Constructs an array using the input `data`. /// Returns a reference-counted `Array` instance. pub fn make_array(data: ArrayData) -> ArrayRef { @@ -470,18 +554,6 @@ pub fn make_array(data: ArrayData) -> ArrayRef { } } -impl From for ArrayRef { - fn from(data: ArrayData) -> Self { - make_array(data) - } -} - -impl From for ArrayData { - fn from(array: ArrayRef) -> Self { - array.data().clone() - } -} - /// Creates a new empty array /// /// ``` diff --git a/arrow/src/array/array_decimal.rs b/arrow/src/array/array_decimal.rs index 543fda1b1a8a..f6a2dda2da5a 100644 --- a/arrow/src/array/array_decimal.rs +++ b/arrow/src/array/array_decimal.rs @@ -284,7 +284,11 @@ impl DecimalArray { // safety: self.data is valid DataType::Decimal as checked above let new_data_type = Self::TYPE_CONSTRUCTOR(precision, scale); - Ok(self.data().clone().with_data_type(new_data_type).into()) + let data = self.data().clone().into_builder().data_type(new_data_type); + + // SAFETY + // Validated data above + Ok(unsafe { data.build_unchecked().into() }) } // validate that the new precision and scale are valid or not diff --git a/arrow/src/array/equal/mod.rs b/arrow/src/array/equal/mod.rs deleted file mode 100644 index 52be64a3fa76..000000000000 --- a/arrow/src/array/equal/mod.rs +++ /dev/null @@ -1,1464 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Module containing functionality to compute array equality. -//! This module uses [ArrayData] and does not -//! depend on dynamic casting of `Array`. - -use super::{ - Array, ArrayData, BooleanArray, Decimal128Array, DictionaryArray, - FixedSizeBinaryArray, FixedSizeListArray, GenericBinaryArray, GenericListArray, - GenericStringArray, MapArray, NullArray, OffsetSizeTrait, PrimitiveArray, - StructArray, -}; -use crate::datatypes::{ArrowPrimitiveType, DataType, IntervalUnit}; -use half::f16; - -mod boolean; -mod decimal; -mod dictionary; -mod fixed_binary; -mod fixed_list; -mod list; -mod null; -mod primitive; -mod structure; -mod union; -mod utils; -mod variable_size; - -// these methods assume the same type, len and null count. -// For this reason, they are not exposed and are instead used -// to build the generic functions below (`equal_range` and `equal`). 
-use boolean::boolean_equal; -use decimal::decimal_equal; -use dictionary::dictionary_equal; -use fixed_binary::fixed_binary_equal; -use fixed_list::fixed_list_equal; -use list::list_equal; -use null::null_equal; -use primitive::primitive_equal; -use structure::struct_equal; -use union::union_equal; -use variable_size::variable_sized_equal; - -impl PartialEq for dyn Array { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for dyn Array { - fn eq(&self, other: &T) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for NullArray { - fn eq(&self, other: &NullArray) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for PrimitiveArray { - fn eq(&self, other: &PrimitiveArray) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for DictionaryArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for BooleanArray { - fn eq(&self, other: &BooleanArray) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for GenericStringArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for GenericBinaryArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for FixedSizeBinaryArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for Decimal128Array { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for GenericListArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for MapArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for FixedSizeListArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for StructArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -/// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively -/// for `len` slots. 
-#[inline] -fn equal_values( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - match lhs.data_type() { - DataType::Null => null_equal(lhs, rhs, lhs_start, rhs_start, len), - DataType::Boolean => boolean_equal(lhs, rhs, lhs_start, rhs_start, len), - DataType::UInt8 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::UInt16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::UInt32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::UInt64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int8 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Float32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Float64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Date64 - | DataType::Interval(IntervalUnit::DayTime) - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Utf8 | DataType::Binary => { - variable_sized_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::LargeUtf8 | DataType::LargeBinary => { - variable_sized_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::FixedSizeBinary(_) => { - fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { - decimal_equal(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::FixedSizeList(_, _) => { - fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len), - DataType::Union(_, _, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len), - DataType::Dictionary(data_type, _) => match data_type.as_ref() { - DataType::Int8 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int16 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Int32 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Int64 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt8 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt16 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt32 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt64 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - _ => unreachable!(), - }, - DataType::Float16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Map(_, _) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - } -} - -fn equal_range( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - utils::equal_nulls(lhs, rhs, lhs_start, rhs_start, len) - && equal_values(lhs, rhs, 
lhs_start, rhs_start, len) -} - -/// Logically compares two [ArrayData]. -/// Two arrays are logically equal if and only if: -/// * their data types are equal -/// * their lengths are equal -/// * their null counts are equal -/// * their null bitmaps are equal -/// * each of their items are equal -/// two items are equal when their in-memory representation is physically equal (i.e. same bit content). -/// The physical comparison depend on the data type. -/// # Panics -/// This function may panic whenever any of the [ArrayData] does not follow the Arrow specification. -/// (e.g. wrong number of buffers, buffer `len` does not correspond to the declared `len`) -pub fn equal(lhs: &ArrayData, rhs: &ArrayData) -> bool { - utils::base_equal(lhs, rhs) - && lhs.null_count() == rhs.null_count() - && utils::equal_nulls(lhs, rhs, 0, 0, lhs.len()) - && equal_values(lhs, rhs, 0, 0, lhs.len()) -} - -#[cfg(test)] -mod tests { - use std::convert::TryFrom; - use std::sync::Arc; - - use crate::array::{ - array::Array, ArrayData, ArrayDataBuilder, ArrayRef, BooleanArray, - FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, Int32Builder, - ListBuilder, NullArray, StringArray, StringDictionaryBuilder, StructArray, - UnionBuilder, - }; - use crate::array::{GenericStringArray, Int32Array}; - use crate::buffer::Buffer; - use crate::datatypes::{Field, Int16Type, Int32Type, ToByteSlice}; - - use super::*; - - #[test] - fn test_null_equal() { - let a = NullArray::new(12); - let a = a.data(); - let b = NullArray::new(12); - let b = b.data(); - test_equal(a, b, true); - - let b = NullArray::new(10); - let b = b.data(); - test_equal(a, b, false); - - // Test the case where offset != 0 - - let a_slice = a.slice(2, 3); - let b_slice = b.slice(1, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(5, 4); - let b_slice = b.slice(3, 3); - test_equal(&a_slice, &b_slice, false); - } - - #[test] - fn test_boolean_equal() { - let a = BooleanArray::from(vec![false, false, true]); - let a = a.data(); - let b = BooleanArray::from(vec![false, false, true]); - let b = b.data(); - test_equal(a, b, true); - - let b = BooleanArray::from(vec![false, false, false]); - let b = b.data(); - test_equal(a, b, false); - } - - #[test] - fn test_boolean_equal_nulls() { - let a = BooleanArray::from(vec![Some(false), None, None, Some(true)]); - let a = a.data(); - let b = BooleanArray::from(vec![Some(false), None, None, Some(true)]); - let b = b.data(); - test_equal(a, b, true); - - let b = BooleanArray::from(vec![None, None, None, Some(true)]); - let b = b.data(); - test_equal(a, b, false); - - let b = BooleanArray::from(vec![Some(true), None, None, Some(true)]); - let b = b.data(); - test_equal(a, b, false); - } - - #[test] - fn test_boolean_equal_offset() { - let a = BooleanArray::from(vec![false, true, false, true, false, false, true]); - let a = a.data(); - let b = - BooleanArray::from(vec![true, false, false, false, true, false, true, true]); - let b = b.data(); - assert!(!equal(a, b)); - assert!(!equal(b, a)); - - let a_slice = a.slice(2, 3); - let b_slice = b.slice(3, 3); - assert!(equal(&a_slice, &b_slice)); - assert!(equal(&b_slice, &a_slice)); - - let a_slice = a.slice(3, 4); - let b_slice = b.slice(4, 4); - assert!(!equal(&a_slice, &b_slice)); - assert!(!equal(&b_slice, &a_slice)); - - // Test the optimization cases where null_count == 0 and starts at 0 and len >= size_of(u8) - - // Elements fill in `u8`'s exactly. 
- let mut vector = vec![false, false, true, true, true, true, true, true]; - let a = BooleanArray::from(vector.clone()); - let a = a.data(); - let b = BooleanArray::from(vector.clone()); - let b = b.data(); - test_equal(a, b, true); - - // Elements fill in `u8`s + suffix bits. - vector.push(true); - let a = BooleanArray::from(vector.clone()); - let a = a.data(); - let b = BooleanArray::from(vector); - let b = b.data(); - test_equal(a, b, true); - } - - #[test] - fn test_primitive() { - let cases = vec![ - ( - vec![Some(1), Some(2), Some(3)], - vec![Some(1), Some(2), Some(3)], - true, - ), - ( - vec![Some(1), Some(2), Some(3)], - vec![Some(1), Some(2), Some(4)], - false, - ), - ( - vec![Some(1), Some(2), None], - vec![Some(1), Some(2), None], - true, - ), - ( - vec![Some(1), None, Some(3)], - vec![Some(1), Some(2), None], - false, - ), - ( - vec![Some(1), None, None], - vec![Some(1), Some(2), None], - false, - ), - ]; - - for (lhs, rhs, expected) in cases { - let lhs = Int32Array::from(lhs); - let lhs = lhs.data(); - let rhs = Int32Array::from(rhs); - let rhs = rhs.data(); - test_equal(lhs, rhs, expected); - } - } - - #[test] - fn test_primitive_slice() { - let cases = vec![ - ( - vec![Some(1), Some(2), Some(3)], - (0, 1), - vec![Some(1), Some(2), Some(3)], - (0, 1), - true, - ), - ( - vec![Some(1), Some(2), Some(3)], - (1, 1), - vec![Some(1), Some(2), Some(3)], - (2, 1), - false, - ), - ( - vec![Some(1), Some(2), None], - (1, 1), - vec![Some(1), None, Some(2)], - (2, 1), - true, - ), - ( - vec![None, Some(2), None], - (1, 1), - vec![None, None, Some(2)], - (2, 1), - true, - ), - ( - vec![Some(1), None, Some(2), None, Some(3)], - (2, 2), - vec![None, Some(2), None, Some(3)], - (1, 2), - true, - ), - ( - vec![Some(1), Some(2), None, Some(0)], - (2, 2), - vec![Some(4), Some(5), Some(0), None], - (2, 2), - false, - ), - ]; - - for (lhs, slice_lhs, rhs, slice_rhs, expected) in cases { - let lhs = Int32Array::from(lhs); - let lhs = lhs.data(); - let lhs = lhs.slice(slice_lhs.0, slice_lhs.1); - let rhs = Int32Array::from(rhs); - let rhs = rhs.data(); - let rhs = rhs.slice(slice_rhs.0, slice_rhs.1); - - test_equal(&lhs, &rhs, expected); - } - } - - fn test_equal(lhs: &ArrayData, rhs: &ArrayData, expected: bool) { - // equality is symmetric - assert!(equal(lhs, lhs), "\n{:?}\n{:?}", lhs, lhs); - assert!(equal(rhs, rhs), "\n{:?}\n{:?}", rhs, rhs); - - assert_eq!(equal(lhs, rhs), expected, "\n{:?}\n{:?}", lhs, rhs); - assert_eq!(equal(rhs, lhs), expected, "\n{:?}\n{:?}", rhs, lhs); - } - - type OptionString = Option; - - fn binary_cases() -> Vec<(Vec, Vec, bool)> { - let base = vec![ - Some("hello".to_owned()), - None, - None, - Some("world".to_owned()), - None, - None, - ]; - let not_base = vec![ - Some("hello".to_owned()), - Some("foo".to_owned()), - None, - Some("world".to_owned()), - None, - None, - ]; - vec![ - ( - vec![Some("hello".to_owned()), Some("world".to_owned())], - vec![Some("hello".to_owned()), Some("world".to_owned())], - true, - ), - ( - vec![Some("hello".to_owned()), Some("world".to_owned())], - vec![Some("hello".to_owned()), Some("arrow".to_owned())], - false, - ), - (base.clone(), base.clone(), true), - (base, not_base, false), - ] - } - - fn test_generic_string_equal() { - let cases = binary_cases(); - - for (lhs, rhs, expected) in cases { - let lhs: GenericStringArray = lhs.into_iter().collect(); - let lhs = lhs.data(); - let rhs: GenericStringArray = rhs.into_iter().collect(); - let rhs = rhs.data(); - test_equal(lhs, rhs, expected); - } - } - - #[test] - fn test_string_equal() 
{ - test_generic_string_equal::() - } - - #[test] - fn test_large_string_equal() { - test_generic_string_equal::() - } - - fn test_generic_binary_equal() { - let cases = binary_cases(); - - for (lhs, rhs, expected) in cases { - let lhs = lhs - .iter() - .map(|x| x.as_deref().map(|x| x.as_bytes())) - .collect(); - let rhs = rhs - .iter() - .map(|x| x.as_deref().map(|x| x.as_bytes())) - .collect(); - let lhs = GenericBinaryArray::::from_opt_vec(lhs); - let lhs = lhs.data(); - let rhs = GenericBinaryArray::::from_opt_vec(rhs); - let rhs = rhs.data(); - test_equal(lhs, rhs, expected); - } - } - - #[test] - fn test_binary_equal() { - test_generic_binary_equal::() - } - - #[test] - fn test_large_binary_equal() { - test_generic_binary_equal::() - } - - #[test] - fn test_fixed_size_binary_array() { - let a_input_arg = vec![vec![1, 2], vec![3, 4], vec![5, 6]]; - let a = FixedSizeBinaryArray::try_from_iter(a_input_arg.into_iter()).unwrap(); - let a = a.data(); - - let b_input_arg = vec![vec![1, 2], vec![3, 4], vec![5, 6]]; - let b = FixedSizeBinaryArray::try_from_iter(b_input_arg.into_iter()).unwrap(); - let b = b.data(); - - test_equal(a, b, true); - } - - #[test] - fn test_string_offset() { - let a = StringArray::from(vec![Some("a"), None, Some("b")]); - let a = a.data(); - let a = a.slice(2, 1); - let b = StringArray::from(vec![Some("b")]); - let b = b.data(); - - test_equal(&a, b, true); - } - - #[test] - fn test_string_offset_larger() { - let a = StringArray::from(vec![Some("a"), None, Some("b"), None, Some("c")]); - let a = a.data(); - let b = StringArray::from(vec![None, Some("b"), None, Some("c")]); - let b = b.data(); - - test_equal(&a.slice(2, 2), &b.slice(0, 2), false); - test_equal(&a.slice(2, 2), &b.slice(1, 2), true); - test_equal(&a.slice(2, 2), &b.slice(2, 2), false); - } - - #[test] - fn test_null() { - let a = NullArray::new(2); - let a = a.data(); - let b = NullArray::new(2); - let b = b.data(); - test_equal(a, b, true); - - let b = NullArray::new(1); - let b = b.data(); - test_equal(a, b, false); - } - - fn create_list_array, T: AsRef<[Option]>>(data: T) -> ArrayData { - let mut builder = ListBuilder::new(Int32Builder::with_capacity(10)); - for d in data.as_ref() { - if let Some(v) = d { - builder.values().append_slice(v.as_ref()); - builder.append(true); - } else { - builder.append(false); - } - } - builder.finish().into_data() - } - - #[test] - fn test_list_equal() { - let a = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); - let b = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); - test_equal(&a, &b, true); - - let b = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 7])]); - test_equal(&a, &b, false); - } - - #[test] - fn test_empty_offsets_list_equal() { - let empty: Vec = vec![]; - let values = Int32Array::from(empty); - let empty_offsets: [u8; 0] = []; - - let a = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - true, - )))) - .len(0) - .add_buffer(Buffer::from(&empty_offsets)) - .add_child_data(values.data().clone()) - .null_bit_buffer(Some(Buffer::from(&empty_offsets))) - .build() - .unwrap(); - - let b = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - true, - )))) - .len(0) - .add_buffer(Buffer::from(&empty_offsets)) - .add_child_data(values.data().clone()) - .null_bit_buffer(Some(Buffer::from(&empty_offsets))) - .build() - .unwrap(); - - test_equal(&a, &b, true); - - let c = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - true, 
- )))) - .len(0) - .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) - .add_child_data( - Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]) - .data() - .clone(), - ) - .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) - .build() - .unwrap(); - - test_equal(&a, &c, true); - } - - // Test the case where null_count > 0 - #[test] - fn test_list_null() { - let a = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); - let b = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); - test_equal(&a, &b, true); - - let b = create_list_array(&[ - Some(&[1, 2]), - None, - Some(&[5, 6]), - Some(&[3, 4]), - None, - None, - ]); - test_equal(&a, &b, false); - - let b = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); - test_equal(&a, &b, false); - - // a list where the nullness of values is determined by the list's bitmap - let c_values = Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]); - let c = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - true, - )))) - .len(6) - .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) - .add_child_data(c_values.into_data()) - .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) - .build() - .unwrap(); - - let d_values = Int32Array::from(vec![ - Some(1), - Some(2), - None, - None, - Some(3), - Some(4), - None, - None, - ]); - let d = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - true, - )))) - .len(6) - .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) - .add_child_data(d_values.into_data()) - .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) - .build() - .unwrap(); - test_equal(&c, &d, true); - } - - // Test the case where offset != 0 - #[test] - fn test_list_offsets() { - let a = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); - let b = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); - - let a_slice = a.slice(0, 3); - let b_slice = b.slice(0, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(0, 5); - let b_slice = b.slice(0, 5); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(4, 1); - let b_slice = b.slice(4, 1); - test_equal(&a_slice, &b_slice, true); - } - - fn create_fixed_size_binary_array, T: AsRef<[Option]>>( - data: T, - ) -> ArrayData { - let mut builder = FixedSizeBinaryBuilder::with_capacity(data.as_ref().len(), 5); - - for d in data.as_ref() { - if let Some(v) = d { - builder.append_value(v.as_ref()).unwrap(); - } else { - builder.append_null(); - } - } - builder.finish().into_data() - } - - #[test] - fn test_fixed_size_binary_equal() { - let a = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world")]); - let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world")]); - test_equal(&a, &b, true); - - let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"arrow")]); - test_equal(&a, &b, false); - } - - // Test the case where null_count > 0 - #[test] - fn test_fixed_size_binary_null() { - let a = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"world")]); - let b = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"world")]); - test_equal(&a, &b, true); - - let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world"), None]); - test_equal(&a, &b, false); - - let b = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"arrow")]); - test_equal(&a, &b, 
false); - } - - #[test] - fn test_fixed_size_binary_offsets() { - // Test the case where offset != 0 - let a = create_fixed_size_binary_array(&[ - Some(b"hello"), - None, - None, - Some(b"world"), - None, - None, - ]); - let b = create_fixed_size_binary_array(&[ - Some(b"hello"), - None, - None, - Some(b"arrow"), - None, - None, - ]); - - let a_slice = a.slice(0, 3); - let b_slice = b.slice(0, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(0, 5); - let b_slice = b.slice(0, 5); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(4, 1); - let b_slice = b.slice(4, 1); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(3, 1); - let b_slice = b.slice(3, 1); - test_equal(&a_slice, &b_slice, false); - } - - fn create_decimal_array(data: Vec>) -> ArrayData { - data.into_iter() - .collect::() - .with_precision_and_scale(23, 6) - .unwrap() - .into() - } - - #[test] - fn test_decimal_equal() { - let a = create_decimal_array(vec![Some(8_887_000_000), Some(-8_887_000_000)]); - let b = create_decimal_array(vec![Some(8_887_000_000), Some(-8_887_000_000)]); - test_equal(&a, &b, true); - - let b = create_decimal_array(vec![Some(15_887_000_000), Some(-8_887_000_000)]); - test_equal(&a, &b, false); - } - - // Test the case where null_count > 0 - #[test] - fn test_decimal_null() { - let a = - create_decimal_array(vec![Some(8_887_000_000), None, Some(-8_887_000_000)]); - let b = - create_decimal_array(vec![Some(8_887_000_000), None, Some(-8_887_000_000)]); - test_equal(&a, &b, true); - - let b = - create_decimal_array(vec![Some(8_887_000_000), Some(-8_887_000_000), None]); - test_equal(&a, &b, false); - - let b = - create_decimal_array(vec![Some(15_887_000_000), None, Some(-8_887_000_000)]); - test_equal(&a, &b, false); - } - - #[test] - fn test_decimal_offsets() { - // Test the case where offset != 0 - let a = create_decimal_array(vec![ - Some(8_887_000_000), - None, - None, - Some(-8_887_000_000), - None, - None, - ]); - let b = create_decimal_array(vec![ - None, - Some(8_887_000_000), - None, - None, - Some(15_887_000_000), - None, - None, - ]); - - let a_slice = a.slice(0, 3); - let b_slice = b.slice(1, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(0, 5); - let b_slice = b.slice(1, 5); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(4, 1); - let b_slice = b.slice(5, 1); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(3, 3); - let b_slice = b.slice(4, 3); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(1, 3); - let b_slice = b.slice(2, 3); - test_equal(&a_slice, &b_slice, false); - - let b = create_decimal_array(vec![ - None, - None, - None, - Some(-8_887_000_000), - Some(-3_000), - None, - ]); - let a_slice = a.slice(1, 3); - let b_slice = b.slice(1, 3); - test_equal(&a_slice, &b_slice, true); - } - - /// Create a fixed size list of 2 value lengths - fn create_fixed_size_list_array, T: AsRef<[Option]>>( - data: T, - ) -> ArrayData { - let mut builder = FixedSizeListBuilder::new(Int32Builder::with_capacity(10), 3); - - for d in data.as_ref() { - if let Some(v) = d { - builder.values().append_slice(v.as_ref()); - builder.append(true); - } else { - for _ in 0..builder.value_length() { - builder.values().append_null(); - } - builder.append(false); - } - } - builder.finish().into_data() - } - - #[test] - fn test_fixed_size_list_equal() { - let a = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); - let b = create_fixed_size_list_array(&[Some(&[1, 2, 3]), 
Some(&[4, 5, 6])]); - test_equal(&a, &b, true); - - let b = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 7])]); - test_equal(&a, &b, false); - } - - // Test the case where null_count > 0 - #[test] - fn test_fixed_list_null() { - let a = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[4, 5, 6]), - None, - None, - ]); - let b = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[4, 5, 6]), - None, - None, - ]); - test_equal(&a, &b, true); - - let b = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - Some(&[7, 8, 9]), - Some(&[4, 5, 6]), - None, - None, - ]); - test_equal(&a, &b, false); - - let b = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[3, 6, 9]), - None, - None, - ]); - test_equal(&a, &b, false); - - let b = create_fixed_size_list_array(&[None, Some(&[4, 5, 6]), None, None]); - - test_equal(&a.slice(2, 4), &b, true); - test_equal(&a.slice(3, 3), &b.slice(1, 3), true); - } - - #[test] - fn test_fixed_list_offsets() { - // Test the case where offset != 0 - let a = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[4, 5, 6]), - None, - None, - ]); - let b = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[3, 6, 9]), - None, - None, - ]); - - let a_slice = a.slice(0, 3); - let b_slice = b.slice(0, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(0, 5); - let b_slice = b.slice(0, 5); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(4, 1); - let b_slice = b.slice(4, 1); - test_equal(&a_slice, &b_slice, true); - } - - #[test] - fn test_struct_equal() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - Some(4), - Some(5), - ])); - - let a = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - let a = a.data(); - - let b = StructArray::try_from(vec![("f1", strings), ("f2", ints)]).unwrap(); - let b = b.data(); - - test_equal(a, b, true); - } - - #[test] - fn test_struct_equal_null() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - Some(4), - Some(5), - ])); - let ints_non_null: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 0])); - - let a = ArrayData::builder(DataType::Struct(vec![ - Field::new("f1", DataType::Utf8, true), - Field::new("f2", DataType::Int32, true), - ])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) - .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(ints.data_ref().clone()) - .build() - .unwrap(); - let a = crate::array::make_array(a); - - let b = ArrayData::builder(DataType::Struct(vec![ - Field::new("f1", DataType::Utf8, true), - Field::new("f2", DataType::Int32, true), - ])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) - .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(ints_non_null.data_ref().clone()) - .build() - .unwrap(); - let b = crate::array::make_array(b); - - test_equal(a.data_ref(), b.data_ref(), true); - - // test with arrays that are not equal - let c_ints_non_null: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 0, 4])); - let c = ArrayData::builder(DataType::Struct(vec![ - Field::new("f1", DataType::Utf8, true), 
- Field::new("f2", DataType::Int32, true), - ])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) - .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(c_ints_non_null.data_ref().clone()) - .build() - .unwrap(); - let c = crate::array::make_array(c); - - test_equal(a.data_ref(), c.data_ref(), false); - - // test a nested struct - let a = ArrayData::builder(DataType::Struct(vec![Field::new( - "f3", - a.data_type().clone(), - true, - )])) - .null_bit_buffer(Some(Buffer::from(vec![0b00011110]))) - .len(5) - .add_child_data(a.data_ref().clone()) - .build() - .unwrap(); - let a = crate::array::make_array(a); - - // reconstruct b, but with different data where the first struct is null - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joanne"), // difference - None, - None, - Some("mark"), - Some("doe"), - ])); - let b = ArrayData::builder(DataType::Struct(vec![ - Field::new("f1", DataType::Utf8, true), - Field::new("f2", DataType::Int32, true), - ])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) - .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(ints_non_null.data_ref().clone()) - .build() - .unwrap(); - - let b = ArrayData::builder(DataType::Struct(vec![Field::new( - "f3", - b.data_type().clone(), - true, - )])) - .null_bit_buffer(Some(Buffer::from(vec![0b00011110]))) - .len(5) - .add_child_data(b) - .build() - .unwrap(); - let b = crate::array::make_array(b); - - test_equal(a.data_ref(), b.data_ref(), true); - } - - #[test] - fn test_struct_equal_null_variable_size() { - // the string arrays differ, but where the struct array is null - let strings1: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doel"), - ])); - let strings2: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joel"), - None, - None, - Some("mark"), - Some("doe"), - ])); - - let a = ArrayData::builder(DataType::Struct(vec![Field::new( - "f1", - DataType::Utf8, - true, - )])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001010]))) - .len(5) - .add_child_data(strings1.data_ref().clone()) - .build() - .unwrap(); - let a = crate::array::make_array(a); - - let b = ArrayData::builder(DataType::Struct(vec![Field::new( - "f1", - DataType::Utf8, - true, - )])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001010]))) - .len(5) - .add_child_data(strings2.data_ref().clone()) - .build() - .unwrap(); - let b = crate::array::make_array(b); - - test_equal(a.data_ref(), b.data_ref(), true); - - // test with arrays that are not equal - let strings3: ArrayRef = Arc::new(StringArray::from(vec![ - Some("mark"), - None, - None, - Some("doe"), - Some("joe"), - ])); - let c = ArrayData::builder(DataType::Struct(vec![Field::new( - "f1", - DataType::Utf8, - true, - )])) - .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) - .len(5) - .add_child_data(strings3.data_ref().clone()) - .build() - .unwrap(); - let c = crate::array::make_array(c); - - test_equal(a.data_ref(), c.data_ref(), false); - } - - fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { - let values = StringArray::from(values.to_vec()); - let mut builder = StringDictionaryBuilder::::new_with_dictionary( - keys.len(), - &values, - ) - .unwrap(); - for key in keys { - if let Some(v) = key { - builder.append(v).unwrap(); - } else { - builder.append_null() - } - } - builder.finish().into_data() - } - - #[test] - fn test_dictionary_equal() { - // (a, b, c), (1, 2, 1, 3) => (a, b, a, c) - let a = create_dictionary_array( - &["a", 
"b", "c"], - &[Some("a"), Some("b"), Some("a"), Some("c")], - ); - // different representation (values and keys are swapped), same result - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), Some("b"), Some("a"), Some("c")], - ); - test_equal(&a, &b, true); - - // different len - let b = - create_dictionary_array(&["a", "c", "b"], &[Some("a"), Some("b"), Some("a")]); - test_equal(&a, &b, false); - - // different key - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), Some("b"), Some("a"), Some("a")], - ); - test_equal(&a, &b, false); - - // different values, same keys - let b = create_dictionary_array( - &["a", "b", "d"], - &[Some("a"), Some("b"), Some("a"), Some("d")], - ); - test_equal(&a, &b, false); - } - - #[test] - fn test_dictionary_equal_null() { - // (a, b, c), (1, 2, 1, 3) => (a, b, a, c) - let a = create_dictionary_array( - &["a", "b", "c"], - &[Some("a"), None, Some("a"), Some("c")], - ); - - // equal to self - test_equal(&a, &a, true); - - // different representation (values and keys are swapped), same result - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), None, Some("a"), Some("c")], - ); - test_equal(&a, &b, true); - - // different null position - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), Some("b"), Some("a"), None], - ); - test_equal(&a, &b, false); - - // different key - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), None, Some("a"), Some("a")], - ); - test_equal(&a, &b, false); - - // different values, same keys - let b = create_dictionary_array( - &["a", "b", "d"], - &[Some("a"), None, Some("a"), Some("d")], - ); - test_equal(&a, &b, false); - } - - #[test] - fn test_non_null_empty_strings() { - let s = StringArray::from(vec![Some(""), Some(""), Some("")]); - - let string1 = s.data(); - - let string2 = ArrayData::builder(DataType::Utf8) - .len(string1.len()) - .buffers(string1.buffers().to_vec()) - .build() - .unwrap(); - - // string2 is identical to string1 except that it has no validity buffer but since there - // are no nulls, string1 and string2 are equal - test_equal(string1, &string2, true); - } - - #[test] - fn test_null_empty_strings() { - let s = StringArray::from(vec![Some(""), None, Some("")]); - - let string1 = s.data(); - - let string2 = ArrayData::builder(DataType::Utf8) - .len(string1.len()) - .buffers(string1.buffers().to_vec()) - .build() - .unwrap(); - - // string2 is identical to string1 except that it has no validity buffer since string1 has - // nulls in it, string1 and string2 are not equal - test_equal(string1, &string2, false); - } - - #[test] - fn test_union_equal_dense() { - let mut builder = UnionBuilder::new_dense(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append_null::("a").unwrap(); - builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union1 = builder.build().unwrap(); - - builder = UnionBuilder::new_dense(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append_null::("a").unwrap(); - builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union2 = builder.build().unwrap(); - - builder = UnionBuilder::new_dense(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 5).unwrap(); - 
builder.append::("c", 4).unwrap(); - builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union3 = builder.build().unwrap(); - - builder = UnionBuilder::new_dense(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append_null::("c").unwrap(); - builder.append_null::("b").unwrap(); - builder.append::("b", 7).unwrap(); - let union4 = builder.build().unwrap(); - - test_equal(union1.data(), union2.data(), true); - test_equal(union1.data(), union3.data(), false); - test_equal(union1.data(), union4.data(), false); - } - - #[test] - fn test_union_equal_sparse() { - let mut builder = UnionBuilder::new_sparse(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append_null::("a").unwrap(); - builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union1 = builder.build().unwrap(); - - builder = UnionBuilder::new_sparse(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append_null::("a").unwrap(); - builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union2 = builder.build().unwrap(); - - builder = UnionBuilder::new_sparse(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 5).unwrap(); - builder.append::("c", 4).unwrap(); - builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union3 = builder.build().unwrap(); - - builder = UnionBuilder::new_sparse(); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append_null::("a").unwrap(); - builder.append_null::("a").unwrap(); - builder.append::("b", 7).unwrap(); - let union4 = builder.build().unwrap(); - - test_equal(union1.data(), union2.data(), true); - test_equal(union1.data(), union3.data(), false); - test_equal(union1.data(), union4.data(), false); - } -} diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 8e9bc20b4487..32a1da17f848 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -173,15 +173,12 @@ mod array_struct; mod array_union; mod builder; mod cast; -mod data; -mod equal; #[cfg(feature = "ffi")] mod ffi; mod iterator; mod null; mod ord; mod raw_pointer; -mod transform; use crate::datatypes::*; @@ -190,14 +187,9 @@ use crate::datatypes::*; pub use self::array::Array; pub use self::array::ArrayAccessor; pub use self::array::ArrayRef; -pub use self::data::ArrayData; -pub use self::data::ArrayDataBuilder; -pub use self::data::ArrayDataRef; - -#[cfg(any(feature = "ipc", feature = "ffi"))] -pub(crate) use self::data::layout; -#[cfg(feature = "ipc")] -pub(crate) use self::data::BufferSpec; +pub use arrow_data::{ + layout, ArrayData, ArrayDataBuilder, ArrayDataRef, BufferSpec, DataTypeLayout, +}; pub use self::array_binary::BinaryArray; pub use self::array_binary::LargeBinaryArray; @@ -592,7 +584,7 @@ pub type DurationMillisecondBuilder = PrimitiveBuilder; pub type DurationMicrosecondBuilder = PrimitiveBuilder; pub type DurationNanosecondBuilder = PrimitiveBuilder; -pub use self::transform::{Capacities, MutableArrayData}; +pub use arrow_data::transform::{Capacities, MutableArrayData}; // --------------------- 
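The `arrow/src/array/mod.rs` hunk above swaps the private `data`, `equal` and `transform` modules for re-exports from the new `arrow_data` crate at the old paths. A hedged sketch of what that means for downstream code (the `concat_two` helper below is hypothetical, not part of the crate):

```rust
// Hypothetical downstream helper: the old `arrow::array` paths keep resolving
// because the moved types are re-exported from `arrow_data` as shown above.
use arrow::array::{ArrayData, Capacities, MutableArrayData};

/// Concatenate two arrays of the same data type into a fresh `ArrayData`.
fn concat_two(a: &ArrayData, b: &ArrayData) -> ArrayData {
    let mut mutable = MutableArrayData::with_capacities(
        vec![a, b],
        false,
        Capacities::Array(a.len() + b.len()),
    );
    mutable.extend(0, 0, a.len()); // copy all of `a`
    mutable.extend(1, 0, b.len()); // then all of `b`
    mutable.freeze()
}
```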
Array Iterator --------------------- diff --git a/arrow/src/array/transform/mod.rs b/arrow/src/array/transform/mod.rs deleted file mode 100644 index 29d4434aafaa..000000000000 --- a/arrow/src/array/transform/mod.rs +++ /dev/null @@ -1,1715 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use super::{ - data::{into_buffers, new_buffers}, - ArrayData, ArrayDataBuilder, OffsetSizeTrait, -}; -use crate::{ - buffer::MutableBuffer, - datatypes::DataType, - error::{ArrowError, Result}, - util::bit_util, -}; -use half::f16; -use std::mem; - -mod boolean; -mod fixed_binary; -mod fixed_size_list; -mod list; -mod null; -mod primitive; -mod structure; -mod union; -mod utils; -mod variable_size; - -type ExtendNullBits<'a> = Box; -// function that extends `[start..start+len]` to the mutable array. -// this is dynamic because different data_types influence how buffers and children are extended. -type Extend<'a> = Box; - -type ExtendNulls = Box; - -/// A mutable [ArrayData] that knows how to freeze itself into an [ArrayData]. -/// This is just a data container. -#[derive(Debug)] -struct _MutableArrayData<'a> { - pub data_type: DataType, - pub null_count: usize, - - pub len: usize, - pub null_buffer: MutableBuffer, - - // arrow specification only allows up to 3 buffers (2 ignoring the nulls above). - // Thus, we place them in the stack to avoid bound checks and greater data locality. 
- pub buffer1: MutableBuffer, - pub buffer2: MutableBuffer, - pub child_data: Vec>, -} - -impl<'a> _MutableArrayData<'a> { - fn freeze(self, dictionary: Option) -> ArrayDataBuilder { - let buffers = into_buffers(&self.data_type, self.buffer1, self.buffer2); - - let child_data = match self.data_type { - DataType::Dictionary(_, _) => vec![dictionary.unwrap()], - _ => { - let mut child_data = Vec::with_capacity(self.child_data.len()); - for child in self.child_data { - child_data.push(child.freeze()); - } - child_data - } - }; - - ArrayDataBuilder::new(self.data_type) - .offset(0) - .len(self.len) - .null_count(self.null_count) - .buffers(buffers) - .child_data(child_data) - .null_bit_buffer((self.null_count > 0).then(|| self.null_buffer.into())) - } -} - -fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits { - if let Some(bitmap) = array.null_bitmap() { - let bytes = bitmap.bits.as_slice(); - Box::new(move |mutable, start, len| { - utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); - mutable.null_count += crate::util::bit_mask::set_bits( - mutable.null_buffer.as_slice_mut(), - bytes, - mutable.len, - array.offset() + start, - len, - ); - }) - } else if use_nulls { - Box::new(|mutable, _, len| { - utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); - let write_data = mutable.null_buffer.as_slice_mut(); - let offset = mutable.len; - (0..len).for_each(|i| { - bit_util::set_bit(write_data, offset + i); - }); - }) - } else { - Box::new(|_, _, _| {}) - } -} - -/// Struct to efficiently and interactively create an [ArrayData] from an existing [ArrayData] by -/// copying chunks. -/// The main use case of this struct is to perform unary operations to arrays of arbitrary types, such as `filter` and `take`. -/// # Example: -/// -/// ``` -/// use arrow::{array::{Int32Array, Array, MutableArrayData}}; -/// -/// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); -/// let array = array.data(); -/// // Create a new `MutableArrayData` from an array and with a capacity of 4. -/// // Capacity here is equivalent to `Vec::with_capacity` -/// let arrays = vec![array]; -/// let mut mutable = MutableArrayData::new(arrays, false, 4); -/// mutable.extend(0, 1, 3); // extend from the slice [1..3], [2,3] -/// mutable.extend(0, 0, 3); // extend from the slice [0..3], [1,2,3] -/// // `.freeze()` to convert `MutableArrayData` into a `ArrayData`. -/// let new_array = Int32Array::from(mutable.freeze()); -/// assert_eq!(Int32Array::from(vec![2, 3, 1, 2, 3]), new_array); -/// ``` -pub struct MutableArrayData<'a> { - #[allow(dead_code)] - arrays: Vec<&'a ArrayData>, - // The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to - // mutability invariants (interior mutability): - // [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not - // [MutableArrayData] itself - data: _MutableArrayData<'a>, - - // the child data of the `Array` in Dictionary arrays. - // This is not stored in `MutableArrayData` because these values constant and only needed - // at the end, when freezing [_MutableArrayData]. - dictionary: Option, - - // function used to extend values from arrays. This function's lifetime is bound to the array - // because it reads values from it. - extend_values: Vec>, - // function used to extend nulls from arrays. This function's lifetime is bound to the array - // because it reads nulls from it. - extend_null_bits: Vec>, - - // function used to extend nulls. 
- // this is independent of the arrays and therefore has no lifetime. - extend_nulls: ExtendNulls, -} - -impl<'a> std::fmt::Debug for MutableArrayData<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - // ignores the closures. - f.debug_struct("MutableArrayData") - .field("data", &self.data) - .finish() - } -} - -/// Builds an extend that adds `offset` to the source primitive -/// Additionally validates that `max` fits into the -/// the underlying primitive returning None if not -fn build_extend_dictionary( - array: &ArrayData, - offset: usize, - max: usize, -) -> Option { - use crate::datatypes::*; - macro_rules! validate_and_build { - ($dt: ty) => {{ - let _: $dt = max.try_into().ok()?; - let offset: $dt = offset.try_into().ok()?; - Some(primitive::build_extend_with_offset(array, offset)) - }}; - } - match array.data_type() { - DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { - DataType::UInt8 => validate_and_build!(u8), - DataType::UInt16 => validate_and_build!(u16), - DataType::UInt32 => validate_and_build!(u32), - DataType::UInt64 => validate_and_build!(u64), - DataType::Int8 => validate_and_build!(i8), - DataType::Int16 => validate_and_build!(i16), - DataType::Int32 => validate_and_build!(i32), - DataType::Int64 => validate_and_build!(i64), - _ => unreachable!(), - }, - _ => None, - } -} - -fn build_extend(array: &ArrayData) -> Extend { - use crate::datatypes::*; - match array.data_type() { - DataType::Decimal128(_, _) => primitive::build_extend::(array), - DataType::Null => null::build_extend(array), - DataType::Boolean => boolean::build_extend(array), - DataType::UInt8 => primitive::build_extend::(array), - DataType::UInt16 => primitive::build_extend::(array), - DataType::UInt32 => primitive::build_extend::(array), - DataType::UInt64 => primitive::build_extend::(array), - DataType::Int8 => primitive::build_extend::(array), - DataType::Int16 => primitive::build_extend::(array), - DataType::Int32 => primitive::build_extend::(array), - DataType::Int64 => primitive::build_extend::(array), - DataType::Float32 => primitive::build_extend::(array), - DataType::Float64 => primitive::build_extend::(array), - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - primitive::build_extend::(array) - } - DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) - | DataType::Interval(IntervalUnit::DayTime) => { - primitive::build_extend::(array) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - primitive::build_extend::(array) - } - DataType::Utf8 | DataType::Binary => variable_size::build_extend::(array), - DataType::LargeUtf8 | DataType::LargeBinary => { - variable_size::build_extend::(array) - } - DataType::Map(_, _) | DataType::List(_) => list::build_extend::(array), - DataType::LargeList(_) => list::build_extend::(array), - DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"), - DataType::Struct(_) => structure::build_extend(array), - DataType::FixedSizeBinary(_) | DataType::Decimal256(_, _) => { - fixed_binary::build_extend(array) - } - DataType::Float16 => primitive::build_extend::(array), - DataType::FixedSizeList(_, _) => fixed_size_list::build_extend(array), - DataType::Union(_, _, mode) => match mode { - UnionMode::Sparse => union::build_extend_sparse(array), - UnionMode::Dense => union::build_extend_dense(array), - }, - } -} - -fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { - use crate::datatypes::*; - 
Box::new(match data_type { - DataType::Decimal128(_, _) => primitive::extend_nulls::, - DataType::Null => null::extend_nulls, - DataType::Boolean => boolean::extend_nulls, - DataType::UInt8 => primitive::extend_nulls::, - DataType::UInt16 => primitive::extend_nulls::, - DataType::UInt32 => primitive::extend_nulls::, - DataType::UInt64 => primitive::extend_nulls::, - DataType::Int8 => primitive::extend_nulls::, - DataType::Int16 => primitive::extend_nulls::, - DataType::Int32 => primitive::extend_nulls::, - DataType::Int64 => primitive::extend_nulls::, - DataType::Float32 => primitive::extend_nulls::, - DataType::Float64 => primitive::extend_nulls::, - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => primitive::extend_nulls::, - DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) - | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::, - DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::, - DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::, - DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, - DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::, - DataType::LargeList(_) => list::extend_nulls::, - DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { - DataType::UInt8 => primitive::extend_nulls::, - DataType::UInt16 => primitive::extend_nulls::, - DataType::UInt32 => primitive::extend_nulls::, - DataType::UInt64 => primitive::extend_nulls::, - DataType::Int8 => primitive::extend_nulls::, - DataType::Int16 => primitive::extend_nulls::, - DataType::Int32 => primitive::extend_nulls::, - DataType::Int64 => primitive::extend_nulls::, - _ => unreachable!(), - }, - DataType::Struct(_) => structure::extend_nulls, - DataType::FixedSizeBinary(_) | DataType::Decimal256(_, _) => { - fixed_binary::extend_nulls - } - DataType::Float16 => primitive::extend_nulls::, - DataType::FixedSizeList(_, _) => fixed_size_list::extend_nulls, - DataType::Union(_, _, mode) => match mode { - UnionMode::Sparse => union::extend_nulls_sparse, - UnionMode::Dense => union::extend_nulls_dense, - }, - }) -} - -fn preallocate_offset_and_binary_buffer( - capacity: usize, - binary_size: usize, -) -> [MutableBuffer; 2] { - // offsets - let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); - // safety: `unsafe` code assumes that this buffer is initialized with one element - buffer.push(Offset::zero()); - - [ - buffer, - MutableBuffer::new(binary_size * mem::size_of::()), - ] -} - -/// Define capacities of child data or data buffers. -#[derive(Debug, Clone)] -pub enum Capacities { - /// Binary, Utf8 and LargeUtf8 data types - /// Define - /// * the capacity of the array offsets - /// * the capacity of the binary/ str buffer - Binary(usize, Option), - /// List and LargeList data types - /// Define - /// * the capacity of the array offsets - /// * the capacity of the child data - List(usize, Option>), - /// Struct type - /// * the capacity of the array - /// * the capacities of the fields - Struct(usize, Option>), - /// Dictionary type - /// * the capacity of the array/keys - /// * the capacity of the values - Dictionary(usize, Option>), - /// Don't preallocate inner buffers and rely on array growth strategy - Array(usize), -} -impl<'a> MutableArrayData<'a> { - /// returns a new [MutableArrayData] with capacity to `capacity` slots and specialized to create an - /// [ArrayData] from multiple `arrays`. 
- /// - /// `use_nulls` is a flag used to optimize insertions. It should be `false` if the only source of nulls - /// are the arrays themselves and `true` if the user plans to call [MutableArrayData::extend_nulls]. - /// In other words, if `use_nulls` is `false`, calling [MutableArrayData::extend_nulls] should not be used. - pub fn new(arrays: Vec<&'a ArrayData>, use_nulls: bool, capacity: usize) -> Self { - Self::with_capacities(arrays, use_nulls, Capacities::Array(capacity)) - } - - /// Similar to [MutableArrayData::new], but lets users define the preallocated capacities of the array. - /// See also [MutableArrayData::new] for more information on the arguments. - /// - /// # Panic - /// This function panics if the given `capacities` don't match the data type of `arrays`. Or when - /// a [Capacities] variant is not yet supported. - pub fn with_capacities( - arrays: Vec<&'a ArrayData>, - use_nulls: bool, - capacities: Capacities, - ) -> Self { - let data_type = arrays[0].data_type(); - use crate::datatypes::*; - - // if any of the arrays has nulls, insertions from any array requires setting bits - // as there is at least one array with nulls. - let use_nulls = use_nulls | arrays.iter().any(|array| array.null_count() > 0); - - let mut array_capacity; - - let [buffer1, buffer2] = match (data_type, &capacities) { - ( - DataType::LargeUtf8 | DataType::LargeBinary, - Capacities::Binary(capacity, Some(value_cap)), - ) => { - array_capacity = *capacity; - preallocate_offset_and_binary_buffer::(*capacity, *value_cap) - } - ( - DataType::Utf8 | DataType::Binary, - Capacities::Binary(capacity, Some(value_cap)), - ) => { - array_capacity = *capacity; - preallocate_offset_and_binary_buffer::(*capacity, *value_cap) - } - (_, Capacities::Array(capacity)) => { - array_capacity = *capacity; - new_buffers(data_type, *capacity) - } - ( - DataType::List(_) | DataType::LargeList(_), - Capacities::List(capacity, _), - ) => { - array_capacity = *capacity; - new_buffers(data_type, *capacity) - } - _ => panic!("Capacities: {:?} not yet supported", capacities), - }; - - let child_data = match &data_type { - DataType::Decimal128(_, _) - | DataType::Decimal256(_, _) - | DataType::Null - | DataType::Boolean - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 - | DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Duration(_) - | DataType::Timestamp(_, _) - | DataType::Utf8 - | DataType::Binary - | DataType::LargeUtf8 - | DataType::LargeBinary - | DataType::Interval(_) - | DataType::FixedSizeBinary(_) => vec![], - DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => { - let childs = arrays - .iter() - .map(|array| &array.child_data()[0]) - .collect::>(); - - let capacities = if let Capacities::List(capacity, ref child_capacities) = - capacities - { - child_capacities - .clone() - .map(|c| *c) - .unwrap_or(Capacities::Array(capacity)) - } else { - Capacities::Array(array_capacity) - }; - - vec![MutableArrayData::with_capacities( - childs, use_nulls, capacities, - )] - } - // the dictionary type just appends keys and clones the values. 
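As a hedged illustration of the `Capacities` variants and `MutableArrayData::with_capacities` documented above (assuming the pre-split `arrow` crate paths used throughout this patch; the sizes are illustrative):

```rust
use arrow::array::{Array, Capacities, MutableArrayData, StringArray};

fn main() {
    let input = StringArray::from(vec![Some("hello"), None, Some("arrow")]);
    let arrays = vec![input.data()];

    // For Utf8/Binary, `Capacities::Binary(offsets, Some(bytes))` preallocates
    // both the offsets buffer and the value buffer instead of relying on the
    // default growth strategy used by `MutableArrayData::new`.
    let mut mutable = MutableArrayData::with_capacities(
        arrays,
        false,
        Capacities::Binary(3, Some(10)),
    );

    mutable.extend(0, 0, 3);
    let output = StringArray::from(mutable.freeze());
    assert_eq!(output, input);
}
```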
- DataType::Dictionary(_, _) => vec![], - DataType::Struct(fields) => match capacities { - Capacities::Struct(capacity, Some(ref child_capacities)) => { - array_capacity = capacity; - (0..fields.len()) - .zip(child_capacities) - .map(|(i, child_cap)| { - let child_arrays = arrays - .iter() - .map(|array| &array.child_data()[i]) - .collect::>(); - MutableArrayData::with_capacities( - child_arrays, - use_nulls, - child_cap.clone(), - ) - }) - .collect::>() - } - Capacities::Struct(capacity, None) => { - array_capacity = capacity; - (0..fields.len()) - .map(|i| { - let child_arrays = arrays - .iter() - .map(|array| &array.child_data()[i]) - .collect::>(); - MutableArrayData::new(child_arrays, use_nulls, capacity) - }) - .collect::>() - } - _ => (0..fields.len()) - .map(|i| { - let child_arrays = arrays - .iter() - .map(|array| &array.child_data()[i]) - .collect::>(); - MutableArrayData::new(child_arrays, use_nulls, array_capacity) - }) - .collect::>(), - }, - DataType::FixedSizeList(_, _) => { - let childs = arrays - .iter() - .map(|array| &array.child_data()[0]) - .collect::>(); - vec![MutableArrayData::new(childs, use_nulls, array_capacity)] - } - DataType::Union(fields, _, _) => (0..fields.len()) - .map(|i| { - let child_arrays = arrays - .iter() - .map(|array| &array.child_data()[i]) - .collect::>(); - MutableArrayData::new(child_arrays, use_nulls, array_capacity) - }) - .collect::>(), - }; - - // Get the dictionary if any, and if it is a concatenation of multiple - let (dictionary, dict_concat) = match &data_type { - DataType::Dictionary(_, _) => { - // If more than one dictionary, concatenate dictionaries together - let dict_concat = !arrays - .windows(2) - .all(|a| a[0].child_data()[0].ptr_eq(&a[1].child_data()[0])); - - match dict_concat { - false => (Some(arrays[0].child_data()[0].clone()), false), - true => { - if let Capacities::Dictionary(_, _) = capacities { - panic!("dictionary capacity not yet supported") - } - let dictionaries: Vec<_> = - arrays.iter().map(|array| &array.child_data()[0]).collect(); - let lengths: Vec<_> = dictionaries - .iter() - .map(|dictionary| dictionary.len()) - .collect(); - let capacity = lengths.iter().sum(); - - let mut mutable = - MutableArrayData::new(dictionaries, false, capacity); - - for (i, len) in lengths.iter().enumerate() { - mutable.extend(i, 0, *len) - } - - (Some(mutable.freeze()), true) - } - } - } - _ => (None, false), - }; - - let extend_nulls = build_extend_nulls(data_type); - - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(array, use_nulls)) - .collect(); - - let null_buffer = if use_nulls { - let null_bytes = bit_util::ceil(array_capacity, 8); - MutableBuffer::from_len_zeroed(null_bytes) - } else { - // create 0 capacity mutable buffer with the intention that it won't be used - MutableBuffer::with_capacity(0) - }; - - let extend_values = match &data_type { - DataType::Dictionary(_, _) => { - let mut next_offset = 0; - let extend_values: Result> = arrays - .iter() - .map(|array| { - let offset = next_offset; - let dict_len = array.child_data()[0].len(); - - if dict_concat { - next_offset += dict_len; - } - - build_extend_dictionary(array, offset, offset + dict_len) - .ok_or(ArrowError::DictionaryKeyOverflowError) - }) - .collect(); - - extend_values.expect("MutableArrayData::new is infallible") - } - _ => arrays.iter().map(|array| build_extend(array)).collect(), - }; - - let data = _MutableArrayData { - data_type: data_type.clone(), - len: 0, - null_count: 0, - null_buffer, - buffer1, - buffer2, - 
child_data, - }; - Self { - arrays, - data, - dictionary, - extend_values, - extend_null_bits, - extend_nulls, - } - } - - /// Extends this array with a chunk of its source arrays - /// - /// # Arguments - /// * `index` - the index of array that you what to copy values from - /// * `start` - the start index of the chunk (inclusive) - /// * `end` - the end index of the chunk (exclusive) - /// - /// # Panic - /// This function panics if there is an invalid index, - /// i.e. `index` >= the number of source arrays - /// or `end` > the length of the `index`th array - pub fn extend(&mut self, index: usize, start: usize, end: usize) { - let len = end - start; - (self.extend_null_bits[index])(&mut self.data, start, len); - (self.extend_values[index])(&mut self.data, index, start, len); - self.data.len += len; - } - - /// Extends this [MutableArrayData] with null elements, disregarding the bound arrays - pub fn extend_nulls(&mut self, len: usize) { - // TODO: null_buffer should probably be extended here as well - // otherwise is_valid() could later panic - // add test to confirm - self.data.null_count += len; - (self.extend_nulls)(&mut self.data, len); - self.data.len += len; - } - - /// Returns the current length - #[inline] - pub fn len(&self) -> usize { - self.data.len - } - - /// Returns true if len is 0 - #[inline] - pub fn is_empty(&self) -> bool { - self.data.len == 0 - } - - /// Returns the current null count - #[inline] - pub fn null_count(&self) -> usize { - self.data.null_count - } - - /// Creates a [ArrayData] from the pushed regions up to this point, consuming `self`. - pub fn freeze(self) -> ArrayData { - unsafe { self.data.freeze(self.dictionary).build_unchecked() } - } - - /// Creates a [ArrayDataBuilder] from the pushed regions up to this point, consuming `self`. - /// This is useful for extending the default behavior of MutableArrayData. 
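The `extend`/`extend_nulls` semantics documented above (source array index, inclusive start, exclusive end) can be summarised with a small hedged sketch that mirrors the shape of the tests further below:

```rust
use arrow::array::{Array, Int32Array, MutableArrayData};

fn main() {
    let source = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
    let arrays = vec![source.data()];

    // `use_nulls = true` because `extend_nulls` will be called explicitly.
    let mut mutable = MutableArrayData::new(arrays, true, 5);

    mutable.extend(0, 0, 2); // rows [0, 2) of source 0 -> 1, 2
    mutable.extend_nulls(2); // append two nulls        -> 1, 2, null, null
    mutable.extend(0, 2, 3); // row 2 of source 0       -> 1, 2, null, null, 3

    let result = Int32Array::from(mutable.freeze());
    let expected = Int32Array::from(vec![Some(1), Some(2), None, None, Some(3)]);
    assert_eq!(result, expected);
}
```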
- pub fn into_builder(self) -> ArrayDataBuilder { - self.data.freeze(self.dictionary) - } -} - -#[cfg(test)] -mod tests { - use std::{convert::TryFrom, sync::Arc}; - - use super::*; - use crate::array::Decimal128Array; - use crate::{ - array::{ - Array, ArrayData, ArrayRef, BooleanArray, DictionaryArray, - FixedSizeBinaryArray, Int16Array, Int16Type, Int32Array, Int64Array, - Int64Builder, ListBuilder, MapBuilder, NullArray, StringArray, - StringDictionaryBuilder, StructArray, UInt8Array, - }, - buffer::Buffer, - datatypes::Field, - }; - use crate::{ - array::{ListArray, StringBuilder}, - error::Result, - }; - - fn create_decimal_array( - array: Vec>, - precision: u8, - scale: u8, - ) -> Decimal128Array { - array - .into_iter() - .collect::() - .with_precision_and_scale(precision, scale) - .unwrap() - } - - #[test] - #[cfg(not(feature = "force_validate"))] - fn test_decimal() { - let decimal_array = - create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); - let arrays = vec![Array::data(&decimal_array)]; - let mut a = MutableArrayData::new(arrays, true, 3); - a.extend(0, 0, 3); - a.extend(0, 2, 3); - let result = a.freeze(); - let array = Decimal128Array::from(result); - let expected = create_decimal_array(vec![Some(1), Some(2), None, None], 10, 3); - assert_eq!(array, expected); - } - #[test] - #[cfg(not(feature = "force_validate"))] - fn test_decimal_offset() { - let decimal_array = - create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); - let decimal_array = decimal_array.slice(1, 3); // 2, null, 3 - let arrays = vec![decimal_array.data()]; - let mut a = MutableArrayData::new(arrays, true, 2); - a.extend(0, 0, 2); // 2, null - let result = a.freeze(); - let array = Decimal128Array::from(result); - let expected = create_decimal_array(vec![Some(2), None], 10, 3); - assert_eq!(array, expected); - } - - #[test] - #[cfg(not(feature = "force_validate"))] - fn test_decimal_null_offset_nulls() { - let decimal_array = - create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); - let decimal_array = decimal_array.slice(1, 3); // 2, null, 3 - let arrays = vec![decimal_array.data()]; - let mut a = MutableArrayData::new(arrays, true, 2); - a.extend(0, 0, 2); // 2, null - a.extend_nulls(3); // 2, null, null, null, null - a.extend(0, 1, 3); //2, null, null, null, null, null, 3 - let result = a.freeze(); - let array = Decimal128Array::from(result); - let expected = create_decimal_array( - vec![Some(2), None, None, None, None, None, Some(3)], - 10, - 3, - ); - assert_eq!(array, expected); - } - - /// tests extending from a primitive array w/ offset nor nulls - #[test] - fn test_primitive() { - let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); - let arrays = vec![b.data()]; - let mut a = MutableArrayData::new(arrays, false, 3); - a.extend(0, 0, 2); - let result = a.freeze(); - let array = UInt8Array::from(result); - let expected = UInt8Array::from(vec![Some(1), Some(2)]); - assert_eq!(array, expected); - } - - /// tests extending from a primitive array with offset w/ nulls - #[test] - fn test_primitive_offset() { - let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); - let b = b.slice(1, 2); - let arrays = vec![b.data()]; - let mut a = MutableArrayData::new(arrays, false, 2); - a.extend(0, 0, 2); - let result = a.freeze(); - let array = UInt8Array::from(result); - let expected = UInt8Array::from(vec![Some(2), Some(3)]); - assert_eq!(array, expected); - } - - /// tests extending from a primitive array with offset and nulls - #[test] - fn 
test_primitive_null_offset() { - let b = UInt8Array::from(vec![Some(1), None, Some(3)]); - let b = b.slice(1, 2); - let arrays = vec![b.data()]; - let mut a = MutableArrayData::new(arrays, false, 2); - a.extend(0, 0, 2); - let result = a.freeze(); - let array = UInt8Array::from(result); - let expected = UInt8Array::from(vec![None, Some(3)]); - assert_eq!(array, expected); - } - - #[test] - fn test_primitive_null_offset_nulls() { - let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); - let b = b.slice(1, 2); - let arrays = vec![b.data()]; - let mut a = MutableArrayData::new(arrays, true, 2); - a.extend(0, 0, 2); - a.extend_nulls(3); - a.extend(0, 1, 2); - let result = a.freeze(); - let array = UInt8Array::from(result); - let expected = - UInt8Array::from(vec![Some(2), Some(3), None, None, None, Some(3)]); - assert_eq!(array, expected); - } - - #[test] - fn test_list_null_offset() { - let int_builder = Int64Builder::with_capacity(24); - let mut builder = ListBuilder::::new(int_builder); - builder.values().append_slice(&[1, 2, 3]); - builder.append(true); - builder.values().append_slice(&[4, 5]); - builder.append(true); - builder.values().append_slice(&[6, 7, 8]); - builder.append(true); - let array = builder.finish(); - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 0, 1); - - let result = mutable.freeze(); - let array = ListArray::from(result); - - let int_builder = Int64Builder::with_capacity(24); - let mut builder = ListBuilder::::new(int_builder); - builder.values().append_slice(&[1, 2, 3]); - builder.append(true); - let expected = builder.finish(); - - assert_eq!(array, expected); - } - - /// tests extending from a variable-sized (strings and binary) array w/ offset with nulls - #[test] - fn test_variable_sized_nulls() { - let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = StringArray::from(vec![Some("bc"), None]); - assert_eq!(result, expected); - } - - /// tests extending from a variable-sized (strings and binary) array - /// with an offset and nulls - #[test] - fn test_variable_sized_offsets() { - let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let array = array.slice(1, 3); - - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 0, 3); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = StringArray::from(vec![Some("bc"), None, Some("defh")]); - assert_eq!(result, expected); - } - - #[test] - fn test_string_offsets() { - let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let array = array.slice(1, 3); - - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 0, 3); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = StringArray::from(vec![Some("bc"), None, Some("defh")]); - assert_eq!(result, expected); - } - - #[test] - fn test_multiple_with_nulls() { - let array1 = StringArray::from(vec!["hello", "world"]); - let array2 = StringArray::from(vec![Some("1"), None]); - - let arrays = vec![array1.data(), array2.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 5); - - 
mutable.extend(0, 0, 2); - mutable.extend(1, 0, 2); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = - StringArray::from(vec![Some("hello"), Some("world"), Some("1"), None]); - assert_eq!(result, expected); - } - - #[test] - fn test_string_null_offset_nulls() { - let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let array = array.slice(1, 3); - - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, true, 0); - - mutable.extend(0, 1, 3); - mutable.extend_nulls(1); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = StringArray::from(vec![None, Some("defh"), None]); - assert_eq!(result, expected); - } - - #[test] - fn test_bool() { - let array = BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]); - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - - let result = mutable.freeze(); - let result = BooleanArray::from(result); - - let expected = BooleanArray::from(vec![Some(true), None]); - assert_eq!(result, expected); - } - - #[test] - fn test_null() { - let array1 = NullArray::new(10); - let array2 = NullArray::new(5); - let arrays = vec![array1.data(), array2.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - mutable.extend(1, 0, 1); - - let result = mutable.freeze(); - let result = NullArray::from(result); - - let expected = NullArray::new(3); - assert_eq!(result, expected); - } - - fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { - let values = StringArray::from(values.to_vec()); - let mut builder = StringDictionaryBuilder::::new_with_dictionary( - keys.len(), - &values, - ) - .unwrap(); - for key in keys { - if let Some(v) = key { - builder.append(v).unwrap(); - } else { - builder.append_null() - } - } - builder.finish().into_data() - } - - #[test] - fn test_dictionary() { - // (a, b, c), (0, 1, 0, 2) => (a, b, a, c) - let array = create_dictionary_array( - &["a", "b", "c"], - &[Some("a"), Some("b"), None, Some("c")], - ); - let arrays = vec![&array]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - - let result = mutable.freeze(); - let result = DictionaryArray::from(result); - - let expected = Int16Array::from(vec![Some(1), None]); - assert_eq!(result.keys(), &expected); - } - - #[test] - fn test_struct() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - ])); - - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - let arrays = vec![array.data()]; - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - let data = mutable.freeze(); - let array = StructArray::from(data); - - let expected = StructArray::try_from(vec![ - ("f1", strings.slice(1, 2)), - ("f2", ints.slice(1, 2)), - ]) - .unwrap(); - assert_eq!(array, expected) - } - - #[test] - fn test_struct_offset() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - ])); - - let array = - 
StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap() - .slice(1, 3); - let arrays = vec![array.data()]; - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - let data = mutable.freeze(); - let array = StructArray::from(data); - - let expected_strings: ArrayRef = - Arc::new(StringArray::from(vec![None, Some("mark")])); - let expected = StructArray::try_from(vec![ - ("f1", expected_strings), - ("f2", ints.slice(2, 2)), - ]) - .unwrap(); - - assert_eq!(array, expected); - } - - #[test] - fn test_struct_nulls() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - Some(4), - Some(5), - ])); - - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - let data = mutable.freeze(); - let array = StructArray::from(data); - - let expected_string = Arc::new(StringArray::from(vec![None, None])) as ArrayRef; - let expected_int = Arc::new(Int32Array::from(vec![Some(2), None])) as ArrayRef; - - let expected = - StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]) - .unwrap(); - assert_eq!(array, expected) - } - - #[test] - fn test_struct_many() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - Some(4), - Some(5), - ])); - - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - let arrays = vec![array.data(), array.data()]; - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - mutable.extend(1, 0, 2); - let data = mutable.freeze(); - let array = StructArray::from(data); - - let expected_string = - Arc::new(StringArray::from(vec![None, None, Some("joe"), None])) as ArrayRef; - let expected_int = - Arc::new(Int32Array::from(vec![Some(2), None, Some(1), Some(2)])) as ArrayRef; - - let expected = - StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]) - .unwrap(); - assert_eq!(array, expected) - } - - #[test] - fn test_binary_fixed_sized_offsets() { - let array = FixedSizeBinaryArray::try_from_iter( - vec![vec![0, 0], vec![0, 1], vec![0, 2]].into_iter(), - ) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - let array = array.slice(1, 2); - // = [[0, 1], [0, 2]] due to the offset = 1 - - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 2); - mutable.extend(0, 0, 1); - - let result = mutable.freeze(); - let result = FixedSizeBinaryArray::from(result); - - let expected = - FixedSizeBinaryArray::try_from_iter(vec![vec![0, 2], vec![0, 1]].into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - assert_eq!(result, expected); - } - - #[test] - fn test_list_append() { - let mut builder = - ListBuilder::::new(Int64Builder::with_capacity(24)); - builder.values().append_slice(&[1, 2, 3]); - builder.append(true); - builder.values().append_slice(&[4, 5]); - builder.append(true); - builder.values().append_slice(&[6, 7, 8]); - builder.values().append_slice(&[9, 10, 11]); - builder.append(true); - let a = 
builder.finish(); - - let a_builder = Int64Builder::with_capacity(24); - let mut a_builder = ListBuilder::::new(a_builder); - a_builder.values().append_slice(&[12, 13]); - a_builder.append(true); - a_builder.append(true); - a_builder.values().append_slice(&[14, 15]); - a_builder.append(true); - let b = a_builder.finish(); - - let c = b.slice(1, 2); - - let mut mutable = - MutableArrayData::new(vec![a.data(), b.data(), c.data()], false, 1); - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(2, 0, c.len()); - - let finished = mutable.freeze(); - - let expected_int_array = Int64Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - Some(9), - Some(10), - Some(11), - // append first array - Some(12), - Some(13), - Some(14), - Some(15), - // append second array - Some(14), - Some(15), - ]); - let list_value_offsets = - Buffer::from_slice_ref(&[0i32, 3, 5, 11, 13, 13, 15, 15, 17]); - let expected_list_data = ArrayData::try_new( - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), - 8, - None, - 0, - vec![list_value_offsets], - vec![expected_int_array.into_data()], - ) - .unwrap(); - assert_eq!(finished, expected_list_data); - } - - #[test] - fn test_list_nulls_append() -> Result<()> { - let mut builder = - ListBuilder::::new(Int64Builder::with_capacity(32)); - builder.values().append_slice(&[1, 2, 3]); - builder.append(true); - builder.values().append_slice(&[4, 5]); - builder.append(true); - builder.append(false); - builder.values().append_slice(&[6, 7, 8]); - builder.values().append_null(); - builder.values().append_null(); - builder.values().append_slice(&[9, 10, 11]); - builder.append(true); - let a = builder.finish(); - let a = a.data(); - - let mut builder = - ListBuilder::::new(Int64Builder::with_capacity(32)); - builder.values().append_slice(&[12, 13]); - builder.append(true); - builder.append(false); - builder.append(true); - builder.values().append_null(); - builder.values().append_null(); - builder.values().append_slice(&[14, 15]); - builder.append(true); - let b = builder.finish(); - let b = b.data(); - let c = b.slice(1, 2); - let d = b.slice(2, 2); - - let mut mutable = MutableArrayData::new(vec![a, b, &c, &d], false, 10); - - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(2, 0, c.len()); - mutable.extend(3, 0, d.len()); - let result = mutable.freeze(); - - let expected_int_array = Int64Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - None, - None, - Some(9), - Some(10), - Some(11), - // second array - Some(12), - Some(13), - None, - None, - Some(14), - Some(15), - // slice(1, 2) results in no values added - None, - None, - Some(14), - Some(15), - ]); - let list_value_offsets = - Buffer::from_slice_ref(&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); - let expected_list_data = ArrayData::try_new( - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), - 12, - Some(Buffer::from(&[0b11011011, 0b1110])), - 0, - vec![list_value_offsets], - vec![expected_int_array.into_data()], - ) - .unwrap(); - assert_eq!(result, expected_list_data); - - Ok(()) - } - - #[test] - fn test_list_append_with_capacities() { - let mut builder = - ListBuilder::::new(Int64Builder::with_capacity(24)); - builder.values().append_slice(&[1, 2, 3]); - builder.append(true); - builder.values().append_slice(&[4, 5]); - builder.append(true); - builder.values().append_slice(&[6, 7, 8]); - 
builder.values().append_slice(&[9, 10, 11]); - builder.append(true); - let a = builder.finish(); - - let a_builder = Int64Builder::with_capacity(24); - let mut a_builder = ListBuilder::::new(a_builder); - a_builder.values().append_slice(&[12, 13]); - a_builder.append(true); - a_builder.append(true); - a_builder.values().append_slice(&[14, 15, 16, 17]); - a_builder.append(true); - let b = a_builder.finish(); - - let mutable = MutableArrayData::with_capacities( - vec![a.data(), b.data()], - false, - Capacities::List(6, Some(Box::new(Capacities::Array(17)))), - ); - - // capacities are rounded up to multiples of 64 by MutableBuffer - assert_eq!(mutable.data.buffer1.capacity(), 64); - assert_eq!(mutable.data.child_data[0].data.buffer1.capacity(), 192); - } - - #[test] - fn test_map_nulls_append() -> Result<()> { - let mut builder = MapBuilder::::new( - None, - Int64Builder::with_capacity(32), - Int64Builder::with_capacity(32), - ); - builder.keys().append_slice(&[1, 2, 3]); - builder.values().append_slice(&[1, 2, 3]); - builder.append(true).unwrap(); - builder.keys().append_slice(&[4, 5]); - builder.values().append_slice(&[4, 5]); - builder.append(true).unwrap(); - builder.append(false).unwrap(); - builder.keys().append_slice(&[6, 7, 8, 100, 101, 9, 10, 11]); - builder.values().append_slice(&[6, 7, 8]); - builder.values().append_null(); - builder.values().append_null(); - builder.values().append_slice(&[9, 10, 11]); - builder.append(true).unwrap(); - - let a = builder.finish(); - let a = a.data(); - - let mut builder = MapBuilder::::new( - None, - Int64Builder::with_capacity(32), - Int64Builder::with_capacity(32), - ); - - builder.keys().append_slice(&[12, 13]); - builder.values().append_slice(&[12, 13]); - builder.append(true).unwrap(); - builder.append(false).unwrap(); - builder.append(true).unwrap(); - builder.keys().append_slice(&[100, 101, 14, 15]); - builder.values().append_null(); - builder.values().append_null(); - builder.values().append_slice(&[14, 15]); - builder.append(true).unwrap(); - - let b = builder.finish(); - let b = b.data(); - let c = b.slice(1, 2); - let d = b.slice(2, 2); - - let mut mutable = MutableArrayData::new(vec![a, b, &c, &d], false, 10); - - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(2, 0, c.len()); - mutable.extend(3, 0, d.len()); - let result = mutable.freeze(); - - let expected_key_array = Int64Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - Some(100), - Some(101), - Some(9), - Some(10), - Some(11), - // second array - Some(12), - Some(13), - Some(100), - Some(101), - Some(14), - Some(15), - // slice(1, 2) results in no values added - Some(100), - Some(101), - Some(14), - Some(15), - ]); - - let expected_value_array = Int64Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - None, - None, - Some(9), - Some(10), - Some(11), - // second array - Some(12), - Some(13), - None, - None, - Some(14), - Some(15), - // slice(1, 2) results in no values added - None, - None, - Some(14), - Some(15), - ]); - - let expected_entry_array = StructArray::from(vec![ - ( - Field::new("keys", DataType::Int64, false), - Arc::new(expected_key_array) as ArrayRef, - ), - ( - Field::new("values", DataType::Int64, true), - Arc::new(expected_value_array) as ArrayRef, - ), - ]); - - let map_offsets = - Buffer::from_slice_ref(&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); - - let expected_list_data = ArrayData::try_new( - 
DataType::Map( - Box::new(Field::new( - "entries", - DataType::Struct(vec![ - Field::new("keys", DataType::Int64, false), - Field::new("values", DataType::Int64, true), - ]), - false, - )), - false, - ), - 12, - Some(Buffer::from(&[0b11011011, 0b1110])), - 0, - vec![map_offsets], - vec![expected_entry_array.into_data()], - ) - .unwrap(); - assert_eq!(result, expected_list_data); - - Ok(()) - } - - #[test] - fn test_list_of_strings_append() -> Result<()> { - // [["alpha", "beta", None]] - let mut builder = ListBuilder::new(StringBuilder::new()); - builder.values().append_value("Hello"); - builder.values().append_value("Arrow"); - builder.values().append_null(); - builder.append(true); - let a = builder.finish(); - - // [["alpha", "beta"], [None], ["gamma", "delta", None]] - let mut builder = ListBuilder::new(StringBuilder::new()); - builder.values().append_value("alpha"); - builder.values().append_value("beta"); - builder.append(true); - builder.values().append_null(); - builder.append(true); - builder.values().append_value("gamma"); - builder.values().append_value("delta"); - builder.values().append_null(); - builder.append(true); - let b = builder.finish(); - - let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10); - - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(1, 1, 3); - mutable.extend(1, 0, 0); - let result = mutable.freeze(); - - let expected_string_array = StringArray::from(vec![ - // extend a[0..a.len()] - // a[0] - Some("Hello"), - Some("Arrow"), - None, - // extend b[0..b.len()] - // b[0] - Some("alpha"), - Some("beta"), - // b[1] - None, - // b[2] - Some("gamma"), - Some("delta"), - None, - // extend b[1..3] - // b[1] - None, - // b[2] - Some("gamma"), - Some("delta"), - None, - // extend b[0..0] - ]); - let list_value_offsets = Buffer::from_slice_ref(&[0, 3, 5, 6, 9, 10, 13]); - let expected_list_data = ArrayData::try_new( - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - 6, - None, - 0, - vec![list_value_offsets], - vec![expected_string_array.into_data()], - ) - .unwrap(); - assert_eq!(result, expected_list_data); - Ok(()) - } - - #[test] - fn test_fixed_size_binary_append() { - let a = vec![Some(vec![1, 2]), Some(vec![3, 4]), Some(vec![5, 6])]; - let a = FixedSizeBinaryArray::try_from_sparse_iter(a.into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - - let b = vec![ - None, - Some(vec![7, 8]), - Some(vec![9, 10]), - None, - Some(vec![13, 14]), - None, - ]; - let b = FixedSizeBinaryArray::try_from_sparse_iter(b.into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - - let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10); - - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(1, 1, 4); - mutable.extend(1, 2, 3); - mutable.extend(1, 5, 5); - let result = mutable.freeze(); - - let expected = vec![ - // a - Some(vec![1, 2]), - Some(vec![3, 4]), - Some(vec![5, 6]), - // b - None, - Some(vec![7, 8]), - Some(vec![9, 10]), - None, - Some(vec![13, 14]), - None, - // b[1..4] - Some(vec![7, 8]), - Some(vec![9, 10]), - None, - // b[2..3] - Some(vec![9, 10]), - // b[4..4] - ]; - let expected = FixedSizeBinaryArray::try_from_sparse_iter(expected.into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - assert_eq!(&result, expected.data()); - } - - /* - // this is an old test used on a meanwhile removed dead code - // that is still useful when `MutableArrayData` supports fixed-size 
lists. - #[test] - fn test_fixed_size_list_append() -> Result<()> { - let int_builder = UInt16Builder::new(64); - let mut builder = FixedSizeListBuilder::::new(int_builder, 2); - builder.values().append_slice(&[1, 2])?; - builder.append(true)?; - builder.values().append_slice(&[3, 4])?; - builder.append(false)?; - builder.values().append_slice(&[5, 6])?; - builder.append(true)?; - - let a_builder = UInt16Builder::new(64); - let mut a_builder = FixedSizeListBuilder::::new(a_builder, 2); - a_builder.values().append_slice(&[7, 8])?; - a_builder.append(true)?; - a_builder.values().append_slice(&[9, 10])?; - a_builder.append(true)?; - a_builder.values().append_slice(&[11, 12])?; - a_builder.append(false)?; - a_builder.values().append_slice(&[13, 14])?; - a_builder.append(true)?; - a_builder.values().append_null()?; - a_builder.values().append_null()?; - a_builder.append(true)?; - let a = a_builder.finish(); - - // append array - builder.append_data(&[ - a.data(), - a.slice(1, 3).data(), - a.slice(2, 1).data(), - a.slice(5, 0).data(), - ])?; - let finished = builder.finish(); - - let expected_int_array = UInt16Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - // append first array - Some(7), - Some(8), - Some(9), - Some(10), - Some(11), - Some(12), - Some(13), - Some(14), - None, - None, - // append slice(1, 3) - Some(9), - Some(10), - Some(11), - Some(12), - Some(13), - Some(14), - // append slice(2, 1) - Some(11), - Some(12), - ]); - let expected_list_data = ArrayData::new( - DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt16, true)), - 2, - ), - 12, - None, - None, - 0, - vec![], - vec![expected_int_array.data()], - ); - let expected_list = - FixedSizeListArray::from(Arc::new(expected_list_data) as ArrayData); - assert_eq!(&expected_list.values(), &finished.values()); - assert_eq!(expected_list.len(), finished.len()); - - Ok(()) - } - */ -} diff --git a/arrow/src/compute/kernels/boolean.rs b/arrow/src/compute/kernels/boolean.rs index b8719ad2d6c7..34921ca97eec 100644 --- a/arrow/src/compute/kernels/boolean.rs +++ b/arrow/src/compute/kernels/boolean.rs @@ -366,7 +366,7 @@ pub fn not(left: &BooleanArray) -> Result { let null_bit_buffer = data .null_bitmap() .as_ref() - .map(|b| b.bits.bit_slice(left_offset, len)); + .map(|b| b.buffer().bit_slice(left_offset, len)); let values = buffer_unary_not(&data.buffers()[0], left_offset, len); @@ -507,7 +507,7 @@ where let and = buffer_bin_and( right.values(), right.offset(), - &right_bitmap.bits, + right_bitmap.buffer(), right.offset(), right.len(), ); @@ -520,7 +520,7 @@ where // Here we take care of the possible offsets of the left and right arrays all at once. 
let modified_null_buffer = match left_data.null_bitmap() { Some(left_null_bitmap) => buffer_bin_and( - &left_null_bitmap.bits, + left_null_bitmap.buffer(), left_data.offset(), &rcb, 0, diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index dcd80ab11d68..791363574c52 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -1612,7 +1612,7 @@ where .data() .null_bitmap() .cloned() - .map(|bitmap| bitmap.bits), + .map(|bitmap| bitmap.into_buffer()), array.data().offset(), array.data().buffers().to_vec(), vec![], @@ -2408,7 +2408,7 @@ fn dictionary_cast( .data() .null_bitmap() .cloned() - .map(|bitmap| bitmap.bits), + .map(|bitmap| bitmap.into_buffer()), cast_keys.data().offset(), cast_keys.data().buffers().to_vec(), vec![cast_values.into_data()], @@ -2622,7 +2622,7 @@ fn cast_primitive_to_list( .data() .null_bitmap() .cloned() - .map(|bitmap| bitmap.bits), + .map(|bitmap| bitmap.into_buffer()), 0, vec![offsets.into()], vec![cast_array.into_data()], @@ -2649,7 +2649,9 @@ fn cast_list_inner( to_type.clone(), array.len(), Some(data.null_count()), - data.null_bitmap().cloned().map(|bitmap| bitmap.bits), + data.null_bitmap() + .cloned() + .map(|bitmap| bitmap.into_buffer()), array.offset(), // reuse offset buffer data.buffers().to_vec(), diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index d4eb5a3e1d2b..fec464b93286 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -945,7 +945,7 @@ pub fn eq_bool_scalar(left: &BooleanArray, right: bool) -> Result left.data_ref() .null_bitmap() .as_ref() - .map(|b| b.bits.bit_slice(left_offset, len)), + .map(|b| b.buffer().bit_slice(left_offset, len)), 0, vec![values], vec![], diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 2f83871127fd..e7d9bfd5a4f6 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -30,10 +30,9 @@ mod numeric; pub use numeric::*; mod types; pub use types::*; -mod decimal; mod delta; -pub use decimal::*; +pub use arrow_data::decimal::*; pub use arrow_schema::{DataType, Field, IntervalUnit, Schema, TimeUnit, UnionMode}; #[cfg(feature = "ffi")] diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index a4d864754cd5..5cc264b1392e 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -250,8 +250,11 @@ pub use arrow_buffer::{alloc, buffer}; +pub mod bitmap { + pub use arrow_data::Bitmap; +} + pub mod array; -pub mod bitmap; pub mod compute; #[cfg(feature = "csv")] pub mod csv; diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 90caa2e3a5c7..a775b2ce8bc4 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -26,7 +26,7 @@ use pyo3::import_exception; use pyo3::prelude::*; use pyo3::types::{PyList, PyTuple}; -use crate::array::{Array, ArrayData, ArrayRef}; +use crate::array::{make_array, Array, ArrayData}; use crate::datatypes::{DataType, Field, Schema}; use crate::error::ArrowError; use crate::ffi; @@ -188,7 +188,7 @@ impl PyArrowConvert for RecordBatch { let arrays = value.getattr("columns")?.downcast::()?; let arrays = arrays .iter() - .map(ArrayRef::from_pyarrow) + .map(|a| Ok(make_array(ArrayData::from_pyarrow(a)?))) .collect::>()?; let batch = RecordBatch::try_new(schema, arrays).map_err(to_py_err)?; @@ -204,7 +204,7 @@ impl PyArrowConvert for RecordBatch { let columns = self.columns().iter(); for (array, field) in columns.zip(fields) { - py_arrays.push(array.to_pyarrow(py)?); + 
py_arrays.push(array.data().to_pyarrow(py)?); py_names.push(field.name()); } diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index adafc9f5053b..310ffb8ee7a0 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -17,10 +17,11 @@ pub use arrow_buffer::{bit_chunk_iterator, bit_util}; +pub use arrow_data::bit_iterator; +pub use arrow_data::bit_mask; + #[cfg(feature = "test_utils")] pub mod bench_util; -pub mod bit_iterator; -pub(crate) mod bit_mask; #[cfg(feature = "test_utils")] pub mod data_gen; pub mod display; diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs new file mode 100644 index 000000000000..5a1b48c009df --- /dev/null +++ b/arrow/tests/array_equal.rs @@ -0,0 +1,1274 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ + make_array, Array, ArrayRef, BooleanArray, Decimal128Array, FixedSizeBinaryArray, + FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, GenericStringArray, + Int32Array, Int32Builder, Int64Builder, ListArray, ListBuilder, NullArray, + OffsetSizeTrait, StringArray, StringDictionaryBuilder, StructArray, UnionBuilder, +}; +use arrow::datatypes::{Int16Type, Int32Type}; +use arrow_buffer::{Buffer, ToByteSlice}; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{DataType, Field}; +use std::sync::Arc; + +#[test] +fn test_null_equal() { + let a = NullArray::new(12); + let a = a.data(); + let b = NullArray::new(12); + let b = b.data(); + test_equal(a, b, true); + + let b = NullArray::new(10); + let b = b.data(); + test_equal(a, b, false); + + // Test the case where offset != 0 + + let a_slice = a.slice(2, 3); + let b_slice = b.slice(1, 3); + test_equal(&a_slice, &b_slice, true); + + let a_slice = a.slice(5, 4); + let b_slice = b.slice(3, 3); + test_equal(&a_slice, &b_slice, false); +} + +#[test] +fn test_boolean_equal() { + let a = BooleanArray::from(vec![false, false, true]); + let a = a.data(); + let b = BooleanArray::from(vec![false, false, true]); + let b = b.data(); + test_equal(a, b, true); + + let b = BooleanArray::from(vec![false, false, false]); + let b = b.data(); + test_equal(a, b, false); +} + +#[test] +fn test_boolean_equal_nulls() { + let a = BooleanArray::from(vec![Some(false), None, None, Some(true)]); + let a = a.data(); + let b = BooleanArray::from(vec![Some(false), None, None, Some(true)]); + let b = b.data(); + test_equal(a, b, true); + + let b = BooleanArray::from(vec![None, None, None, Some(true)]); + let b = b.data(); + test_equal(a, b, false); + + let b = BooleanArray::from(vec![Some(true), None, None, Some(true)]); + let b = b.data(); + test_equal(a, b, false); +} + +#[test] +fn test_boolean_equal_offset() { + let a = BooleanArray::from(vec![false, true, false, true, false, false, true]); + let a = a.data(); 
+ let b = BooleanArray::from(vec![true, false, false, false, true, false, true, true]); + let b = b.data(); + assert_ne!(a, b); + assert_ne!(b, a); + + let a_slice = a.slice(2, 3); + let b_slice = b.slice(3, 3); + assert_eq!(a_slice, b_slice); + assert_eq!(b_slice, a_slice); + + let a_slice = a.slice(3, 4); + let b_slice = b.slice(4, 4); + assert_ne!(a_slice, b_slice); + assert_ne!(b_slice, a_slice); + + // Test the optimization cases where null_count == 0 and starts at 0 and len >= size_of(u8) + + // Elements fill in `u8`'s exactly. + let mut vector = vec![false, false, true, true, true, true, true, true]; + let a = BooleanArray::from(vector.clone()); + let a = a.data(); + let b = BooleanArray::from(vector.clone()); + let b = b.data(); + test_equal(a, b, true); + + // Elements fill in `u8`s + suffix bits. + vector.push(true); + let a = BooleanArray::from(vector.clone()); + let a = a.data(); + let b = BooleanArray::from(vector); + let b = b.data(); + test_equal(a, b, true); +} + +#[test] +fn test_primitive() { + let cases = vec![ + ( + vec![Some(1), Some(2), Some(3)], + vec![Some(1), Some(2), Some(3)], + true, + ), + ( + vec![Some(1), Some(2), Some(3)], + vec![Some(1), Some(2), Some(4)], + false, + ), + ( + vec![Some(1), Some(2), None], + vec![Some(1), Some(2), None], + true, + ), + ( + vec![Some(1), None, Some(3)], + vec![Some(1), Some(2), None], + false, + ), + ( + vec![Some(1), None, None], + vec![Some(1), Some(2), None], + false, + ), + ]; + + for (lhs, rhs, expected) in cases { + let lhs = Int32Array::from(lhs); + let lhs = lhs.data(); + let rhs = Int32Array::from(rhs); + let rhs = rhs.data(); + test_equal(lhs, rhs, expected); + } +} + +#[test] +fn test_primitive_slice() { + let cases = vec![ + ( + vec![Some(1), Some(2), Some(3)], + (0, 1), + vec![Some(1), Some(2), Some(3)], + (0, 1), + true, + ), + ( + vec![Some(1), Some(2), Some(3)], + (1, 1), + vec![Some(1), Some(2), Some(3)], + (2, 1), + false, + ), + ( + vec![Some(1), Some(2), None], + (1, 1), + vec![Some(1), None, Some(2)], + (2, 1), + true, + ), + ( + vec![None, Some(2), None], + (1, 1), + vec![None, None, Some(2)], + (2, 1), + true, + ), + ( + vec![Some(1), None, Some(2), None, Some(3)], + (2, 2), + vec![None, Some(2), None, Some(3)], + (1, 2), + true, + ), + ( + vec![Some(1), Some(2), None, Some(0)], + (2, 2), + vec![Some(4), Some(5), Some(0), None], + (2, 2), + false, + ), + ]; + + for (lhs, slice_lhs, rhs, slice_rhs, expected) in cases { + let lhs = Int32Array::from(lhs); + let lhs = lhs.data(); + let lhs = lhs.slice(slice_lhs.0, slice_lhs.1); + let rhs = Int32Array::from(rhs); + let rhs = rhs.data(); + let rhs = rhs.slice(slice_rhs.0, slice_rhs.1); + + test_equal(&lhs, &rhs, expected); + } +} + +#[allow(clippy::eq_op)] +fn test_equal(lhs: &ArrayData, rhs: &ArrayData, expected: bool) { + // equality is symmetric + assert_eq!(lhs, lhs); + assert_eq!(rhs, rhs); + + match expected { + true => { + assert_eq!(lhs, rhs); + assert_eq!(rhs, lhs); + } + false => { + assert_ne!(lhs, rhs); + assert_ne!(rhs, lhs); + } + } +} + +type OptionString = Option; + +fn binary_cases() -> Vec<(Vec, Vec, bool)> { + let base = vec![ + Some("hello".to_owned()), + None, + None, + Some("world".to_owned()), + None, + None, + ]; + let not_base = vec![ + Some("hello".to_owned()), + Some("foo".to_owned()), + None, + Some("world".to_owned()), + None, + None, + ]; + vec![ + ( + vec![Some("hello".to_owned()), Some("world".to_owned())], + vec![Some("hello".to_owned()), Some("world".to_owned())], + true, + ), + ( + vec![Some("hello".to_owned()), 
Some("world".to_owned())], + vec![Some("hello".to_owned()), Some("arrow".to_owned())], + false, + ), + (base.clone(), base.clone(), true), + (base, not_base, false), + ] +} + +fn test_generic_string_equal() { + let cases = binary_cases(); + + for (lhs, rhs, expected) in cases { + let lhs: GenericStringArray = lhs.into_iter().collect(); + let lhs = lhs.data(); + let rhs: GenericStringArray = rhs.into_iter().collect(); + let rhs = rhs.data(); + test_equal(lhs, rhs, expected); + } +} + +#[test] +fn test_string_equal() { + test_generic_string_equal::() +} + +#[test] +fn test_large_string_equal() { + test_generic_string_equal::() +} + +fn test_generic_binary_equal() { + let cases = binary_cases(); + + for (lhs, rhs, expected) in cases { + let lhs = lhs + .iter() + .map(|x| x.as_deref().map(|x| x.as_bytes())) + .collect(); + let rhs = rhs + .iter() + .map(|x| x.as_deref().map(|x| x.as_bytes())) + .collect(); + let lhs = GenericBinaryArray::::from_opt_vec(lhs); + let lhs = lhs.data(); + let rhs = GenericBinaryArray::::from_opt_vec(rhs); + let rhs = rhs.data(); + test_equal(lhs, rhs, expected); + } +} + +#[test] +fn test_binary_equal() { + test_generic_binary_equal::() +} + +#[test] +fn test_large_binary_equal() { + test_generic_binary_equal::() +} + +#[test] +fn test_fixed_size_binary_array() { + let a_input_arg = vec![vec![1, 2], vec![3, 4], vec![5, 6]]; + let a = FixedSizeBinaryArray::try_from_iter(a_input_arg.into_iter()).unwrap(); + let a = a.data(); + + let b_input_arg = vec![vec![1, 2], vec![3, 4], vec![5, 6]]; + let b = FixedSizeBinaryArray::try_from_iter(b_input_arg.into_iter()).unwrap(); + let b = b.data(); + + test_equal(a, b, true); +} + +#[test] +fn test_string_offset() { + let a = StringArray::from(vec![Some("a"), None, Some("b")]); + let a = a.data(); + let a = a.slice(2, 1); + let b = StringArray::from(vec![Some("b")]); + let b = b.data(); + + test_equal(&a, b, true); +} + +#[test] +fn test_string_offset_larger() { + let a = StringArray::from(vec![Some("a"), None, Some("b"), None, Some("c")]); + let a = a.data(); + let b = StringArray::from(vec![None, Some("b"), None, Some("c")]); + let b = b.data(); + + test_equal(&a.slice(2, 2), &b.slice(0, 2), false); + test_equal(&a.slice(2, 2), &b.slice(1, 2), true); + test_equal(&a.slice(2, 2), &b.slice(2, 2), false); +} + +#[test] +fn test_null() { + let a = NullArray::new(2); + let a = a.data(); + let b = NullArray::new(2); + let b = b.data(); + test_equal(a, b, true); + + let b = NullArray::new(1); + let b = b.data(); + test_equal(a, b, false); +} + +fn create_list_array, T: AsRef<[Option]>>(data: T) -> ArrayData { + let mut builder = ListBuilder::new(Int32Builder::with_capacity(10)); + for d in data.as_ref() { + if let Some(v) = d { + builder.values().append_slice(v.as_ref()); + builder.append(true); + } else { + builder.append(false); + } + } + builder.finish().into_data() +} + +#[test] +fn test_list_equal() { + let a = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); + let b = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); + test_equal(&a, &b, true); + + let b = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 7])]); + test_equal(&a, &b, false); +} + +#[test] +fn test_empty_offsets_list_equal() { + let empty: Vec = vec![]; + let values = Int32Array::from(empty); + let empty_offsets: [u8; 0] = []; + + let a = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(0) + .add_buffer(Buffer::from(&empty_offsets)) + .add_child_data(values.data().clone()) + 
.null_bit_buffer(Some(Buffer::from(&empty_offsets))) + .build() + .unwrap(); + + let b = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(0) + .add_buffer(Buffer::from(&empty_offsets)) + .add_child_data(values.data().clone()) + .null_bit_buffer(Some(Buffer::from(&empty_offsets))) + .build() + .unwrap(); + + test_equal(&a, &b, true); + + let c = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(0) + .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) + .add_child_data( + Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]) + .data() + .clone(), + ) + .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) + .build() + .unwrap(); + + test_equal(&a, &c, true); +} + +// Test the case where null_count > 0 +#[test] +fn test_list_null() { + let a = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); + let b = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); + test_equal(&a, &b, true); + + let b = create_list_array(&[ + Some(&[1, 2]), + None, + Some(&[5, 6]), + Some(&[3, 4]), + None, + None, + ]); + test_equal(&a, &b, false); + + let b = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); + test_equal(&a, &b, false); + + // a list where the nullness of values is determined by the list's bitmap + let c_values = Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]); + let c = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(6) + .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) + .add_child_data(c_values.into_data()) + .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) + .build() + .unwrap(); + + let d_values = Int32Array::from(vec![ + Some(1), + Some(2), + None, + None, + Some(3), + Some(4), + None, + None, + ]); + let d = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + "item", + DataType::Int32, + true, + )))) + .len(6) + .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) + .add_child_data(d_values.into_data()) + .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) + .build() + .unwrap(); + test_equal(&c, &d, true); +} + +// Test the case where offset != 0 +#[test] +fn test_list_offsets() { + let a = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); + let b = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); + + let a_slice = a.slice(0, 3); + let b_slice = b.slice(0, 3); + test_equal(&a_slice, &b_slice, true); + + let a_slice = a.slice(0, 5); + let b_slice = b.slice(0, 5); + test_equal(&a_slice, &b_slice, false); + + let a_slice = a.slice(4, 1); + let b_slice = b.slice(4, 1); + test_equal(&a_slice, &b_slice, true); +} + +fn create_fixed_size_binary_array, T: AsRef<[Option]>>( + data: T, +) -> ArrayData { + let mut builder = FixedSizeBinaryBuilder::with_capacity(data.as_ref().len(), 5); + + for d in data.as_ref() { + if let Some(v) = d { + builder.append_value(v.as_ref()).unwrap(); + } else { + builder.append_null(); + } + } + builder.finish().into_data() +} + +#[test] +fn test_fixed_size_binary_equal() { + let a = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world")]); + let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world")]); + test_equal(&a, &b, true); + + let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"arrow")]); + test_equal(&a, &b, false); +} + +// Test the case where 
null_count > 0 +#[test] +fn test_fixed_size_binary_null() { + let a = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"world")]); + let b = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"world")]); + test_equal(&a, &b, true); + + let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world"), None]); + test_equal(&a, &b, false); + + let b = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"arrow")]); + test_equal(&a, &b, false); +} + +#[test] +fn test_fixed_size_binary_offsets() { + // Test the case where offset != 0 + let a = create_fixed_size_binary_array(&[ + Some(b"hello"), + None, + None, + Some(b"world"), + None, + None, + ]); + let b = create_fixed_size_binary_array(&[ + Some(b"hello"), + None, + None, + Some(b"arrow"), + None, + None, + ]); + + let a_slice = a.slice(0, 3); + let b_slice = b.slice(0, 3); + test_equal(&a_slice, &b_slice, true); + + let a_slice = a.slice(0, 5); + let b_slice = b.slice(0, 5); + test_equal(&a_slice, &b_slice, false); + + let a_slice = a.slice(4, 1); + let b_slice = b.slice(4, 1); + test_equal(&a_slice, &b_slice, true); + + let a_slice = a.slice(3, 1); + let b_slice = b.slice(3, 1); + test_equal(&a_slice, &b_slice, false); +} + +fn create_decimal_array(data: Vec>) -> ArrayData { + data.into_iter() + .collect::() + .with_precision_and_scale(23, 6) + .unwrap() + .into() +} + +#[test] +fn test_decimal_equal() { + let a = create_decimal_array(vec![Some(8_887_000_000), Some(-8_887_000_000)]); + let b = create_decimal_array(vec![Some(8_887_000_000), Some(-8_887_000_000)]); + test_equal(&a, &b, true); + + let b = create_decimal_array(vec![Some(15_887_000_000), Some(-8_887_000_000)]); + test_equal(&a, &b, false); +} + +// Test the case where null_count > 0 +#[test] +fn test_decimal_null() { + let a = create_decimal_array(vec![Some(8_887_000_000), None, Some(-8_887_000_000)]); + let b = create_decimal_array(vec![Some(8_887_000_000), None, Some(-8_887_000_000)]); + test_equal(&a, &b, true); + + let b = create_decimal_array(vec![Some(8_887_000_000), Some(-8_887_000_000), None]); + test_equal(&a, &b, false); + + let b = create_decimal_array(vec![Some(15_887_000_000), None, Some(-8_887_000_000)]); + test_equal(&a, &b, false); +} + +#[test] +fn test_decimal_offsets() { + // Test the case where offset != 0 + let a = create_decimal_array(vec![ + Some(8_887_000_000), + None, + None, + Some(-8_887_000_000), + None, + None, + ]); + let b = create_decimal_array(vec![ + None, + Some(8_887_000_000), + None, + None, + Some(15_887_000_000), + None, + None, + ]); + + let a_slice = a.slice(0, 3); + let b_slice = b.slice(1, 3); + test_equal(&a_slice, &b_slice, true); + + let a_slice = a.slice(0, 5); + let b_slice = b.slice(1, 5); + test_equal(&a_slice, &b_slice, false); + + let a_slice = a.slice(4, 1); + let b_slice = b.slice(5, 1); + test_equal(&a_slice, &b_slice, true); + + let a_slice = a.slice(3, 3); + let b_slice = b.slice(4, 3); + test_equal(&a_slice, &b_slice, false); + + let a_slice = a.slice(1, 3); + let b_slice = b.slice(2, 3); + test_equal(&a_slice, &b_slice, false); + + let b = create_decimal_array(vec![ + None, + None, + None, + Some(-8_887_000_000), + Some(-3_000), + None, + ]); + let a_slice = a.slice(1, 3); + let b_slice = b.slice(1, 3); + test_equal(&a_slice, &b_slice, true); +} + +/// Create a fixed size list of 2 value lengths +fn create_fixed_size_list_array, T: AsRef<[Option]>>( + data: T, +) -> ArrayData { + let mut builder = FixedSizeListBuilder::new(Int32Builder::with_capacity(10), 3); + + for d in 
data.as_ref() { + if let Some(v) = d { + builder.values().append_slice(v.as_ref()); + builder.append(true); + } else { + for _ in 0..builder.value_length() { + builder.values().append_null(); + } + builder.append(false); + } + } + builder.finish().into_data() +} + +#[test] +fn test_fixed_size_list_equal() { + let a = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); + let b = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); + test_equal(&a, &b, true); + + let b = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 7])]); + test_equal(&a, &b, false); +} + +// Test the case where null_count > 0 +#[test] +fn test_fixed_list_null() { + let a = create_fixed_size_list_array(&[ + Some(&[1, 2, 3]), + None, + None, + Some(&[4, 5, 6]), + None, + None, + ]); + let b = create_fixed_size_list_array(&[ + Some(&[1, 2, 3]), + None, + None, + Some(&[4, 5, 6]), + None, + None, + ]); + test_equal(&a, &b, true); + + let b = create_fixed_size_list_array(&[ + Some(&[1, 2, 3]), + None, + Some(&[7, 8, 9]), + Some(&[4, 5, 6]), + None, + None, + ]); + test_equal(&a, &b, false); + + let b = create_fixed_size_list_array(&[ + Some(&[1, 2, 3]), + None, + None, + Some(&[3, 6, 9]), + None, + None, + ]); + test_equal(&a, &b, false); + + let b = create_fixed_size_list_array(&[None, Some(&[4, 5, 6]), None, None]); + + test_equal(&a.slice(2, 4), &b, true); + test_equal(&a.slice(3, 3), &b.slice(1, 3), true); +} + +#[test] +fn test_fixed_list_offsets() { + // Test the case where offset != 0 + let a = create_fixed_size_list_array(&[ + Some(&[1, 2, 3]), + None, + None, + Some(&[4, 5, 6]), + None, + None, + ]); + let b = create_fixed_size_list_array(&[ + Some(&[1, 2, 3]), + None, + None, + Some(&[3, 6, 9]), + None, + None, + ]); + + let a_slice = a.slice(0, 3); + let b_slice = b.slice(0, 3); + test_equal(&a_slice, &b_slice, true); + + let a_slice = a.slice(0, 5); + let b_slice = b.slice(0, 5); + test_equal(&a_slice, &b_slice, false); + + let a_slice = a.slice(4, 1); + let b_slice = b.slice(4, 1); + test_equal(&a_slice, &b_slice, true); +} + +#[test] +fn test_struct_equal() { + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doe"), + ])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + None, + Some(4), + Some(5), + ])); + + let a = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap(); + let a = a.data(); + + let b = StructArray::try_from(vec![("f1", strings), ("f2", ints)]).unwrap(); + let b = b.data(); + + test_equal(a, b, true); +} + +#[test] +fn test_struct_equal_null() { + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doe"), + ])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + None, + Some(4), + Some(5), + ])); + let ints_non_null: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 0])); + + let a = ArrayData::builder(DataType::Struct(vec![ + Field::new("f1", DataType::Utf8, true), + Field::new("f2", DataType::Int32, true), + ])) + .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) + .len(5) + .add_child_data(strings.data_ref().clone()) + .add_child_data(ints.data_ref().clone()) + .build() + .unwrap(); + let a = make_array(a); + + let b = ArrayData::builder(DataType::Struct(vec![ + Field::new("f1", DataType::Utf8, true), + Field::new("f2", DataType::Int32, true), + ])) + .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) + .len(5) + 
.add_child_data(strings.data_ref().clone()) + .add_child_data(ints_non_null.data_ref().clone()) + .build() + .unwrap(); + let b = make_array(b); + + test_equal(a.data_ref(), b.data_ref(), true); + + // test with arrays that are not equal + let c_ints_non_null: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 0, 4])); + let c = ArrayData::builder(DataType::Struct(vec![ + Field::new("f1", DataType::Utf8, true), + Field::new("f2", DataType::Int32, true), + ])) + .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) + .len(5) + .add_child_data(strings.data_ref().clone()) + .add_child_data(c_ints_non_null.data_ref().clone()) + .build() + .unwrap(); + let c = make_array(c); + + test_equal(a.data_ref(), c.data_ref(), false); + + // test a nested struct + let a = ArrayData::builder(DataType::Struct(vec![Field::new( + "f3", + a.data_type().clone(), + true, + )])) + .null_bit_buffer(Some(Buffer::from(vec![0b00011110]))) + .len(5) + .add_child_data(a.data_ref().clone()) + .build() + .unwrap(); + let a = make_array(a); + + // reconstruct b, but with different data where the first struct is null + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joanne"), // difference + None, + None, + Some("mark"), + Some("doe"), + ])); + let b = ArrayData::builder(DataType::Struct(vec![ + Field::new("f1", DataType::Utf8, true), + Field::new("f2", DataType::Int32, true), + ])) + .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) + .len(5) + .add_child_data(strings.data_ref().clone()) + .add_child_data(ints_non_null.data_ref().clone()) + .build() + .unwrap(); + + let b = ArrayData::builder(DataType::Struct(vec![Field::new( + "f3", + b.data_type().clone(), + true, + )])) + .null_bit_buffer(Some(Buffer::from(vec![0b00011110]))) + .len(5) + .add_child_data(b) + .build() + .unwrap(); + let b = make_array(b); + + test_equal(a.data_ref(), b.data_ref(), true); +} + +#[test] +fn test_struct_equal_null_variable_size() { + // the string arrays differ, but where the struct array is null + let strings1: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doel"), + ])); + let strings2: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joel"), + None, + None, + Some("mark"), + Some("doe"), + ])); + + let a = ArrayData::builder(DataType::Struct(vec![Field::new( + "f1", + DataType::Utf8, + true, + )])) + .null_bit_buffer(Some(Buffer::from(vec![0b00001010]))) + .len(5) + .add_child_data(strings1.data_ref().clone()) + .build() + .unwrap(); + let a = make_array(a); + + let b = ArrayData::builder(DataType::Struct(vec![Field::new( + "f1", + DataType::Utf8, + true, + )])) + .null_bit_buffer(Some(Buffer::from(vec![0b00001010]))) + .len(5) + .add_child_data(strings2.data_ref().clone()) + .build() + .unwrap(); + let b = make_array(b); + + test_equal(a.data_ref(), b.data_ref(), true); + + // test with arrays that are not equal + let strings3: ArrayRef = Arc::new(StringArray::from(vec![ + Some("mark"), + None, + None, + Some("doe"), + Some("joe"), + ])); + let c = ArrayData::builder(DataType::Struct(vec![Field::new( + "f1", + DataType::Utf8, + true, + )])) + .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) + .len(5) + .add_child_data(strings3.data_ref().clone()) + .build() + .unwrap(); + let c = make_array(c); + + test_equal(a.data_ref(), c.data_ref(), false); +} + +fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { + let values = StringArray::from(values.to_vec()); + let mut builder = + 
StringDictionaryBuilder::::new_with_dictionary(keys.len(), &values) + .unwrap(); + for key in keys { + if let Some(v) = key { + builder.append(v).unwrap(); + } else { + builder.append_null() + } + } + builder.finish().into_data() +} + +#[test] +fn test_dictionary_equal() { + // (a, b, c), (1, 2, 1, 3) => (a, b, a, c) + let a = create_dictionary_array( + &["a", "b", "c"], + &[Some("a"), Some("b"), Some("a"), Some("c")], + ); + // different representation (values and keys are swapped), same result + let b = create_dictionary_array( + &["a", "c", "b"], + &[Some("a"), Some("b"), Some("a"), Some("c")], + ); + test_equal(&a, &b, true); + + // different len + let b = create_dictionary_array(&["a", "c", "b"], &[Some("a"), Some("b"), Some("a")]); + test_equal(&a, &b, false); + + // different key + let b = create_dictionary_array( + &["a", "c", "b"], + &[Some("a"), Some("b"), Some("a"), Some("a")], + ); + test_equal(&a, &b, false); + + // different values, same keys + let b = create_dictionary_array( + &["a", "b", "d"], + &[Some("a"), Some("b"), Some("a"), Some("d")], + ); + test_equal(&a, &b, false); +} + +#[test] +fn test_dictionary_equal_null() { + // (a, b, c), (1, 2, 1, 3) => (a, b, a, c) + let a = create_dictionary_array( + &["a", "b", "c"], + &[Some("a"), None, Some("a"), Some("c")], + ); + + // equal to self + test_equal(&a, &a, true); + + // different representation (values and keys are swapped), same result + let b = create_dictionary_array( + &["a", "c", "b"], + &[Some("a"), None, Some("a"), Some("c")], + ); + test_equal(&a, &b, true); + + // different null position + let b = create_dictionary_array( + &["a", "c", "b"], + &[Some("a"), Some("b"), Some("a"), None], + ); + test_equal(&a, &b, false); + + // different key + let b = create_dictionary_array( + &["a", "c", "b"], + &[Some("a"), None, Some("a"), Some("a")], + ); + test_equal(&a, &b, false); + + // different values, same keys + let b = create_dictionary_array( + &["a", "b", "d"], + &[Some("a"), None, Some("a"), Some("d")], + ); + test_equal(&a, &b, false); +} + +#[test] +fn test_non_null_empty_strings() { + let s = StringArray::from(vec![Some(""), Some(""), Some("")]); + + let string1 = s.data(); + + let string2 = ArrayData::builder(DataType::Utf8) + .len(string1.len()) + .buffers(string1.buffers().to_vec()) + .build() + .unwrap(); + + // string2 is identical to string1 except that it has no validity buffer but since there + // are no nulls, string1 and string2 are equal + test_equal(string1, &string2, true); +} + +#[test] +fn test_null_empty_strings() { + let s = StringArray::from(vec![Some(""), None, Some("")]); + + let string1 = s.data(); + + let string2 = ArrayData::builder(DataType::Utf8) + .len(string1.len()) + .buffers(string1.buffers().to_vec()) + .build() + .unwrap(); + + // string2 is identical to string1 except that it has no validity buffer since string1 has + // nulls in it, string1 and string2 are not equal + test_equal(string1, &string2, false); +} + +#[test] +fn test_union_equal_dense() { + let mut builder = UnionBuilder::new_dense(); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.append::("c", 3).unwrap(); + builder.append::("a", 4).unwrap(); + builder.append_null::("a").unwrap(); + builder.append::("a", 6).unwrap(); + builder.append::("b", 7).unwrap(); + let union1 = builder.build().unwrap(); + + builder = UnionBuilder::new_dense(); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.append::("c", 3).unwrap(); + builder.append::("a", 4).unwrap(); 
+ builder.append_null::("a").unwrap(); + builder.append::("a", 6).unwrap(); + builder.append::("b", 7).unwrap(); + let union2 = builder.build().unwrap(); + + builder = UnionBuilder::new_dense(); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.append::("c", 3).unwrap(); + builder.append::("a", 5).unwrap(); + builder.append::("c", 4).unwrap(); + builder.append::("a", 6).unwrap(); + builder.append::("b", 7).unwrap(); + let union3 = builder.build().unwrap(); + + builder = UnionBuilder::new_dense(); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.append::("c", 3).unwrap(); + builder.append::("a", 4).unwrap(); + builder.append_null::("c").unwrap(); + builder.append_null::("b").unwrap(); + builder.append::("b", 7).unwrap(); + let union4 = builder.build().unwrap(); + + test_equal(union1.data(), union2.data(), true); + test_equal(union1.data(), union3.data(), false); + test_equal(union1.data(), union4.data(), false); +} + +#[test] +fn test_union_equal_sparse() { + let mut builder = UnionBuilder::new_sparse(); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.append::("c", 3).unwrap(); + builder.append::("a", 4).unwrap(); + builder.append_null::("a").unwrap(); + builder.append::("a", 6).unwrap(); + builder.append::("b", 7).unwrap(); + let union1 = builder.build().unwrap(); + + builder = UnionBuilder::new_sparse(); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.append::("c", 3).unwrap(); + builder.append::("a", 4).unwrap(); + builder.append_null::("a").unwrap(); + builder.append::("a", 6).unwrap(); + builder.append::("b", 7).unwrap(); + let union2 = builder.build().unwrap(); + + builder = UnionBuilder::new_sparse(); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.append::("c", 3).unwrap(); + builder.append::("a", 5).unwrap(); + builder.append::("c", 4).unwrap(); + builder.append::("a", 6).unwrap(); + builder.append::("b", 7).unwrap(); + let union3 = builder.build().unwrap(); + + builder = UnionBuilder::new_sparse(); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.append::("c", 3).unwrap(); + builder.append::("a", 4).unwrap(); + builder.append_null::("a").unwrap(); + builder.append_null::("a").unwrap(); + builder.append::("b", 7).unwrap(); + let union4 = builder.build().unwrap(); + + test_equal(union1.data(), union2.data(), true); + test_equal(union1.data(), union3.data(), false); + test_equal(union1.data(), union4.data(), false); +} + +#[test] +fn test_boolean_slice() { + let array = BooleanArray::from(vec![true; 32]); + let slice = array.slice(4, 12); + assert_eq!(slice.data(), slice.data()); + + let slice = array.slice(8, 12); + assert_eq!(slice.data(), slice.data()); + + let slice = array.slice(8, 24); + assert_eq!(slice.data(), slice.data()); +} + +#[test] +fn test_sliced_nullable_boolean_array() { + let a = BooleanArray::from(vec![None; 32]); + let b = BooleanArray::from(vec![true; 32]); + let slice_a = a.slice(1, 12); + let slice_b = b.slice(1, 12); + assert_ne!(slice_a.data(), slice_b.data()); +} + +#[test] +fn list_array_non_zero_nulls() { + // Tests handling of list arrays with non-empty null ranges + let mut builder = ListBuilder::new(Int64Builder::with_capacity(10)); + builder.values().append_value(1); + builder.values().append_value(2); + builder.values().append_value(3); + builder.append(true); + builder.append(false); + let array1 = builder.finish(); + + let mut builder = 
ListBuilder::new(Int64Builder::with_capacity(10)); + builder.values().append_value(1); + builder.values().append_value(2); + builder.values().append_value(3); + builder.append(true); + builder.values().append_null(); + builder.values().append_null(); + builder.append(false); + let array2 = builder.finish(); + + assert_eq!(array1, array2); +} + +#[test] +fn test_list_different_offsets() { + let a = ListArray::from_iter_primitive::([ + Some([Some(0), Some(0)]), + Some([Some(1), Some(2)]), + Some([None, None]), + ]); + let b = ListArray::from_iter_primitive::([ + Some([Some(1), Some(2)]), + Some([None, None]), + Some([None, None]), + ]); + let a_slice = a.slice(1, 2); + let b_slice = b.slice(0, 2); + assert_eq!(&a_slice, &b_slice); +} diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs new file mode 100644 index 000000000000..3619abacdc9d --- /dev/null +++ b/arrow/tests/array_transform.rs @@ -0,0 +1,1005 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ + Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, + FixedSizeBinaryArray, Int16Array, Int32Array, Int64Array, Int64Builder, ListArray, + ListBuilder, MapBuilder, NullArray, StringArray, StringBuilder, + StringDictionaryBuilder, StructArray, UInt8Array, +}; +use arrow::datatypes::Int16Type; +use arrow_buffer::Buffer; +use arrow_data::transform::MutableArrayData; +use arrow_data::ArrayData; +use arrow_schema::{DataType, Field}; +use std::sync::Arc; + +fn create_decimal_array( + array: Vec>, + precision: u8, + scale: u8, +) -> Decimal128Array { + array + .into_iter() + .collect::() + .with_precision_and_scale(precision, scale) + .unwrap() +} + +#[test] +#[cfg(not(feature = "force_validate"))] +fn test_decimal() { + let decimal_array = + create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); + let arrays = vec![Array::data(&decimal_array)]; + let mut a = MutableArrayData::new(arrays, true, 3); + a.extend(0, 0, 3); + a.extend(0, 2, 3); + let result = a.freeze(); + let array = Decimal128Array::from(result); + let expected = create_decimal_array(vec![Some(1), Some(2), None, None], 10, 3); + assert_eq!(array, expected); +} +#[test] +#[cfg(not(feature = "force_validate"))] +fn test_decimal_offset() { + let decimal_array = + create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); + let decimal_array = decimal_array.slice(1, 3); // 2, null, 3 + let arrays = vec![decimal_array.data()]; + let mut a = MutableArrayData::new(arrays, true, 2); + a.extend(0, 0, 2); // 2, null + let result = a.freeze(); + let array = Decimal128Array::from(result); + let expected = create_decimal_array(vec![Some(2), None], 10, 3); + assert_eq!(array, expected); +} + +#[test] +#[cfg(not(feature = "force_validate"))] +fn 
test_decimal_null_offset_nulls() { + let decimal_array = + create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); + let decimal_array = decimal_array.slice(1, 3); // 2, null, 3 + let arrays = vec![decimal_array.data()]; + let mut a = MutableArrayData::new(arrays, true, 2); + a.extend(0, 0, 2); // 2, null + a.extend_nulls(3); // 2, null, null, null, null + a.extend(0, 1, 3); //2, null, null, null, null, null, 3 + let result = a.freeze(); + let array = Decimal128Array::from(result); + let expected = + create_decimal_array(vec![Some(2), None, None, None, None, None, Some(3)], 10, 3); + assert_eq!(array, expected); +} + +/// tests extending from a primitive array w/ offset nor nulls +#[test] +fn test_primitive() { + let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); + let arrays = vec![b.data()]; + let mut a = MutableArrayData::new(arrays, false, 3); + a.extend(0, 0, 2); + let result = a.freeze(); + let array = UInt8Array::from(result); + let expected = UInt8Array::from(vec![Some(1), Some(2)]); + assert_eq!(array, expected); +} + +/// tests extending from a primitive array with offset w/ nulls +#[test] +fn test_primitive_offset() { + let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); + let b = b.slice(1, 2); + let arrays = vec![b.data()]; + let mut a = MutableArrayData::new(arrays, false, 2); + a.extend(0, 0, 2); + let result = a.freeze(); + let array = UInt8Array::from(result); + let expected = UInt8Array::from(vec![Some(2), Some(3)]); + assert_eq!(array, expected); +} + +/// tests extending from a primitive array with offset and nulls +#[test] +fn test_primitive_null_offset() { + let b = UInt8Array::from(vec![Some(1), None, Some(3)]); + let b = b.slice(1, 2); + let arrays = vec![b.data()]; + let mut a = MutableArrayData::new(arrays, false, 2); + a.extend(0, 0, 2); + let result = a.freeze(); + let array = UInt8Array::from(result); + let expected = UInt8Array::from(vec![None, Some(3)]); + assert_eq!(array, expected); +} + +#[test] +fn test_primitive_null_offset_nulls() { + let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); + let b = b.slice(1, 2); + let arrays = vec![b.data()]; + let mut a = MutableArrayData::new(arrays, true, 2); + a.extend(0, 0, 2); + a.extend_nulls(3); + a.extend(0, 1, 2); + let result = a.freeze(); + let array = UInt8Array::from(result); + let expected = UInt8Array::from(vec![Some(2), Some(3), None, None, None, Some(3)]); + assert_eq!(array, expected); +} + +#[test] +fn test_list_null_offset() { + let int_builder = Int64Builder::with_capacity(24); + let mut builder = ListBuilder::::new(int_builder); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[4, 5]); + builder.append(true); + builder.values().append_slice(&[6, 7, 8]); + builder.append(true); + let array = builder.finish(); + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + mutable.extend(0, 0, 1); + + let result = mutable.freeze(); + let array = ListArray::from(result); + + let int_builder = Int64Builder::with_capacity(24); + let mut builder = ListBuilder::::new(int_builder); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + let expected = builder.finish(); + + assert_eq!(array, expected); +} + +/// tests extending from a variable-sized (strings and binary) array w/ offset with nulls +#[test] +fn test_variable_sized_nulls() { + let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); + let arrays = vec![array.data()]; + + let mut mutable 
= MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + + let result = mutable.freeze(); + let result = StringArray::from(result); + + let expected = StringArray::from(vec![Some("bc"), None]); + assert_eq!(result, expected); +} + +/// tests extending from a variable-sized (strings and binary) array +/// with an offset and nulls +#[test] +fn test_variable_sized_offsets() { + let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); + let array = array.slice(1, 3); + + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 0, 3); + + let result = mutable.freeze(); + let result = StringArray::from(result); + + let expected = StringArray::from(vec![Some("bc"), None, Some("defh")]); + assert_eq!(result, expected); +} + +#[test] +fn test_string_offsets() { + let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); + let array = array.slice(1, 3); + + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 0, 3); + + let result = mutable.freeze(); + let result = StringArray::from(result); + + let expected = StringArray::from(vec![Some("bc"), None, Some("defh")]); + assert_eq!(result, expected); +} + +#[test] +fn test_multiple_with_nulls() { + let array1 = StringArray::from(vec!["hello", "world"]); + let array2 = StringArray::from(vec![Some("1"), None]); + + let arrays = vec![array1.data(), array2.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 5); + + mutable.extend(0, 0, 2); + mutable.extend(1, 0, 2); + + let result = mutable.freeze(); + let result = StringArray::from(result); + + let expected = StringArray::from(vec![Some("hello"), Some("world"), Some("1"), None]); + assert_eq!(result, expected); +} + +#[test] +fn test_string_null_offset_nulls() { + let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); + let array = array.slice(1, 3); + + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, true, 0); + + mutable.extend(0, 1, 3); + mutable.extend_nulls(1); + + let result = mutable.freeze(); + let result = StringArray::from(result); + + let expected = StringArray::from(vec![None, Some("defh"), None]); + assert_eq!(result, expected); +} + +#[test] +fn test_bool() { + let array = BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]); + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + + let result = mutable.freeze(); + let result = BooleanArray::from(result); + + let expected = BooleanArray::from(vec![Some(true), None]); + assert_eq!(result, expected); +} + +#[test] +fn test_null() { + let array1 = NullArray::new(10); + let array2 = NullArray::new(5); + let arrays = vec![array1.data(), array2.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + mutable.extend(1, 0, 1); + + let result = mutable.freeze(); + let result = NullArray::from(result); + + let expected = NullArray::new(3); + assert_eq!(result, expected); +} + +fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { + let values = StringArray::from(values.to_vec()); + let mut builder = + StringDictionaryBuilder::::new_with_dictionary(keys.len(), &values) + .unwrap(); + for key in keys { + if let Some(v) = key { + builder.append(v).unwrap(); + } else { + builder.append_null() + } + } + builder.finish().into_data() +} 
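// The transform tests in this file all drive `MutableArrayData` the same way:
// gather the source `ArrayData` references, copy row ranges (or nulls) from a
// source selected by index, then freeze the result into a new `ArrayData`.
// The sketch below illustrates that pattern; it is an illustrative example
// built only from APIs this file already imports, not code taken from the
// change itself.
fn mutable_array_data_pattern_sketch() {
    use arrow::array::{Array, Int32Array};
    use arrow_data::transform::MutableArrayData;

    let a = Int32Array::from(vec![Some(1), None, Some(3)]);
    let b = Int32Array::from(vec![Some(10), Some(20)]);

    // One slot per source array; `true` reserves room for a validity bitmap.
    let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], true, 5);
    mutable.extend(0, 0, 2); // rows 0..2 of `a`: 1, null
    mutable.extend(1, 0, 2); // rows 0..2 of `b`: 10, 20
    mutable.extend_nulls(1); // one trailing null

    let result = Int32Array::from(mutable.freeze());
    let expected = Int32Array::from(vec![Some(1), None, Some(10), Some(20), None]);
    assert_eq!(result, expected);
}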
+ +#[test] +fn test_dictionary() { + // (a, b, c), (0, 1, 0, 2) => (a, b, a, c) + let array = create_dictionary_array( + &["a", "b", "c"], + &[Some("a"), Some("b"), None, Some("c")], + ); + let arrays = vec![&array]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + + let result = mutable.freeze(); + let result = DictionaryArray::from(result); + + let expected = Int16Array::from(vec![Some(1), None]); + assert_eq!(result.keys(), &expected); +} + +#[test] +fn test_struct() { + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doe"), + ])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + ])); + + let array = + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap(); + let arrays = vec![array.data()]; + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + let data = mutable.freeze(); + let array = StructArray::from(data); + + let expected = StructArray::try_from(vec![ + ("f1", strings.slice(1, 2)), + ("f2", ints.slice(1, 2)), + ]) + .unwrap(); + assert_eq!(array, expected) +} + +#[test] +fn test_struct_offset() { + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doe"), + ])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + ])); + + let array = + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap() + .slice(1, 3); + let arrays = vec![array.data()]; + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + let data = mutable.freeze(); + let array = StructArray::from(data); + + let expected_strings: ArrayRef = + Arc::new(StringArray::from(vec![None, Some("mark")])); + let expected = + StructArray::try_from(vec![("f1", expected_strings), ("f2", ints.slice(2, 2))]) + .unwrap(); + + assert_eq!(array, expected); +} + +#[test] +fn test_struct_nulls() { + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doe"), + ])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + None, + Some(4), + Some(5), + ])); + + let array = + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap(); + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + let data = mutable.freeze(); + let array = StructArray::from(data); + + let expected_string = Arc::new(StringArray::from(vec![None, None])) as ArrayRef; + let expected_int = Arc::new(Int32Array::from(vec![Some(2), None])) as ArrayRef; + + let expected = + StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]) + .unwrap(); + assert_eq!(array, expected) +} + +#[test] +fn test_struct_many() { + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + None, + None, + Some("mark"), + Some("doe"), + ])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + None, + Some(4), + Some(5), + ])); + + let array = + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap(); + let arrays = vec![array.data(), array.data()]; + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 3); + mutable.extend(1, 0, 2); + let data = mutable.freeze(); + let array = 
StructArray::from(data); + + let expected_string = + Arc::new(StringArray::from(vec![None, None, Some("joe"), None])) as ArrayRef; + let expected_int = + Arc::new(Int32Array::from(vec![Some(2), None, Some(1), Some(2)])) as ArrayRef; + + let expected = + StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]) + .unwrap(); + assert_eq!(array, expected) +} + +#[test] +fn test_binary_fixed_sized_offsets() { + let array = FixedSizeBinaryArray::try_from_iter( + vec![vec![0, 0], vec![0, 1], vec![0, 2]].into_iter(), + ) + .expect("Failed to create FixedSizeBinaryArray from iterable"); + let array = array.slice(1, 2); + // = [[0, 1], [0, 2]] due to the offset = 1 + + let arrays = vec![array.data()]; + + let mut mutable = MutableArrayData::new(arrays, false, 0); + + mutable.extend(0, 1, 2); + mutable.extend(0, 0, 1); + + let result = mutable.freeze(); + let result = FixedSizeBinaryArray::from(result); + + let expected = + FixedSizeBinaryArray::try_from_iter(vec![vec![0, 2], vec![0, 1]].into_iter()) + .expect("Failed to create FixedSizeBinaryArray from iterable"); + assert_eq!(result, expected); +} + +#[test] +fn test_list_append() { + let mut builder = ListBuilder::::new(Int64Builder::with_capacity(24)); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[4, 5]); + builder.append(true); + builder.values().append_slice(&[6, 7, 8]); + builder.values().append_slice(&[9, 10, 11]); + builder.append(true); + let a = builder.finish(); + + let a_builder = Int64Builder::with_capacity(24); + let mut a_builder = ListBuilder::::new(a_builder); + a_builder.values().append_slice(&[12, 13]); + a_builder.append(true); + a_builder.append(true); + a_builder.values().append_slice(&[14, 15]); + a_builder.append(true); + let b = a_builder.finish(); + + let c = b.slice(1, 2); + + let mut mutable = MutableArrayData::new(vec![a.data(), b.data(), c.data()], false, 1); + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(2, 0, c.len()); + + let finished = mutable.freeze(); + + let expected_int_array = Int64Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + Some(10), + Some(11), + // append first array + Some(12), + Some(13), + Some(14), + Some(15), + // append second array + Some(14), + Some(15), + ]); + let list_value_offsets = + Buffer::from_slice_ref(&[0i32, 3, 5, 11, 13, 13, 15, 15, 17]); + let expected_list_data = ArrayData::try_new( + DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + 8, + None, + 0, + vec![list_value_offsets], + vec![expected_int_array.into_data()], + ) + .unwrap(); + assert_eq!(finished, expected_list_data); +} + +#[test] +fn test_list_nulls_append() { + let mut builder = ListBuilder::::new(Int64Builder::with_capacity(32)); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[4, 5]); + builder.append(true); + builder.append(false); + builder.values().append_slice(&[6, 7, 8]); + builder.values().append_null(); + builder.values().append_null(); + builder.values().append_slice(&[9, 10, 11]); + builder.append(true); + let a = builder.finish(); + let a = a.data(); + + let mut builder = ListBuilder::::new(Int64Builder::with_capacity(32)); + builder.values().append_slice(&[12, 13]); + builder.append(true); + builder.append(false); + builder.append(true); + builder.values().append_null(); + builder.values().append_null(); + builder.values().append_slice(&[14, 15]); + 
builder.append(true); + let b = builder.finish(); + let b = b.data(); + let c = b.slice(1, 2); + let d = b.slice(2, 2); + + let mut mutable = MutableArrayData::new(vec![a, b, &c, &d], false, 10); + + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(2, 0, c.len()); + mutable.extend(3, 0, d.len()); + let result = mutable.freeze(); + + let expected_int_array = Int64Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + None, + None, + Some(9), + Some(10), + Some(11), + // second array + Some(12), + Some(13), + None, + None, + Some(14), + Some(15), + // slice(1, 2) results in no values added + None, + None, + Some(14), + Some(15), + ]); + let list_value_offsets = + Buffer::from_slice_ref(&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); + let expected_list_data = ArrayData::try_new( + DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + 12, + Some(Buffer::from(&[0b11011011, 0b1110])), + 0, + vec![list_value_offsets], + vec![expected_int_array.into_data()], + ) + .unwrap(); + assert_eq!(result, expected_list_data); +} + +#[test] +fn test_map_nulls_append() { + let mut builder = MapBuilder::::new( + None, + Int64Builder::with_capacity(32), + Int64Builder::with_capacity(32), + ); + builder.keys().append_slice(&[1, 2, 3]); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true).unwrap(); + builder.keys().append_slice(&[4, 5]); + builder.values().append_slice(&[4, 5]); + builder.append(true).unwrap(); + builder.append(false).unwrap(); + builder.keys().append_slice(&[6, 7, 8, 100, 101, 9, 10, 11]); + builder.values().append_slice(&[6, 7, 8]); + builder.values().append_null(); + builder.values().append_null(); + builder.values().append_slice(&[9, 10, 11]); + builder.append(true).unwrap(); + + let a = builder.finish(); + let a = a.data(); + + let mut builder = MapBuilder::::new( + None, + Int64Builder::with_capacity(32), + Int64Builder::with_capacity(32), + ); + + builder.keys().append_slice(&[12, 13]); + builder.values().append_slice(&[12, 13]); + builder.append(true).unwrap(); + builder.append(false).unwrap(); + builder.append(true).unwrap(); + builder.keys().append_slice(&[100, 101, 14, 15]); + builder.values().append_null(); + builder.values().append_null(); + builder.values().append_slice(&[14, 15]); + builder.append(true).unwrap(); + + let b = builder.finish(); + let b = b.data(); + let c = b.slice(1, 2); + let d = b.slice(2, 2); + + let mut mutable = MutableArrayData::new(vec![a, b, &c, &d], false, 10); + + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(2, 0, c.len()); + mutable.extend(3, 0, d.len()); + let result = mutable.freeze(); + + let expected_key_array = Int64Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(100), + Some(101), + Some(9), + Some(10), + Some(11), + // second array + Some(12), + Some(13), + Some(100), + Some(101), + Some(14), + Some(15), + // slice(1, 2) results in no values added + Some(100), + Some(101), + Some(14), + Some(15), + ]); + + let expected_value_array = Int64Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + None, + None, + Some(9), + Some(10), + Some(11), + // second array + Some(12), + Some(13), + None, + None, + Some(14), + Some(15), + // slice(1, 2) results in no values added + None, + None, + Some(14), + Some(15), + ]); + + let expected_entry_array = StructArray::from(vec![ + ( + 
Field::new("keys", DataType::Int64, false), + Arc::new(expected_key_array) as ArrayRef, + ), + ( + Field::new("values", DataType::Int64, true), + Arc::new(expected_value_array) as ArrayRef, + ), + ]); + + let map_offsets = + Buffer::from_slice_ref(&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); + + let expected_list_data = ArrayData::try_new( + DataType::Map( + Box::new(Field::new( + "entries", + DataType::Struct(vec![ + Field::new("keys", DataType::Int64, false), + Field::new("values", DataType::Int64, true), + ]), + false, + )), + false, + ), + 12, + Some(Buffer::from(&[0b11011011, 0b1110])), + 0, + vec![map_offsets], + vec![expected_entry_array.into_data()], + ) + .unwrap(); + assert_eq!(result, expected_list_data); +} + +#[test] +fn test_list_of_strings_append() { + // [["alpha", "beta", None]] + let mut builder = ListBuilder::new(StringBuilder::new()); + builder.values().append_value("Hello"); + builder.values().append_value("Arrow"); + builder.values().append_null(); + builder.append(true); + let a = builder.finish(); + + // [["alpha", "beta"], [None], ["gamma", "delta", None]] + let mut builder = ListBuilder::new(StringBuilder::new()); + builder.values().append_value("alpha"); + builder.values().append_value("beta"); + builder.append(true); + builder.values().append_null(); + builder.append(true); + builder.values().append_value("gamma"); + builder.values().append_value("delta"); + builder.values().append_null(); + builder.append(true); + let b = builder.finish(); + + let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10); + + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(1, 1, 3); + mutable.extend(1, 0, 0); + let result = mutable.freeze(); + + let expected_string_array = StringArray::from(vec![ + // extend a[0..a.len()] + // a[0] + Some("Hello"), + Some("Arrow"), + None, + // extend b[0..b.len()] + // b[0] + Some("alpha"), + Some("beta"), + // b[1] + None, + // b[2] + Some("gamma"), + Some("delta"), + None, + // extend b[1..3] + // b[1] + None, + // b[2] + Some("gamma"), + Some("delta"), + None, + // extend b[0..0] + ]); + let list_value_offsets = Buffer::from_slice_ref(&[0, 3, 5, 6, 9, 10, 13]); + let expected_list_data = ArrayData::try_new( + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + 6, + None, + 0, + vec![list_value_offsets], + vec![expected_string_array.into_data()], + ) + .unwrap(); + assert_eq!(result, expected_list_data); +} + +#[test] +fn test_fixed_size_binary_append() { + let a = vec![Some(vec![1, 2]), Some(vec![3, 4]), Some(vec![5, 6])]; + let a = FixedSizeBinaryArray::try_from_sparse_iter(a.into_iter()) + .expect("Failed to create FixedSizeBinaryArray from iterable"); + + let b = vec![ + None, + Some(vec![7, 8]), + Some(vec![9, 10]), + None, + Some(vec![13, 14]), + None, + ]; + let b = FixedSizeBinaryArray::try_from_sparse_iter(b.into_iter()) + .expect("Failed to create FixedSizeBinaryArray from iterable"); + + let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10); + + mutable.extend(0, 0, a.len()); + mutable.extend(1, 0, b.len()); + mutable.extend(1, 1, 4); + mutable.extend(1, 2, 3); + mutable.extend(1, 5, 5); + let result = mutable.freeze(); + + let expected = vec![ + // a + Some(vec![1, 2]), + Some(vec![3, 4]), + Some(vec![5, 6]), + // b + None, + Some(vec![7, 8]), + Some(vec![9, 10]), + None, + Some(vec![13, 14]), + None, + // b[1..4] + Some(vec![7, 8]), + Some(vec![9, 10]), + None, + // b[2..3] + Some(vec![9, 10]), + // b[4..4] + ]; + let expected = 
FixedSizeBinaryArray::try_from_sparse_iter(expected.into_iter()) + .expect("Failed to create FixedSizeBinaryArray from iterable"); + assert_eq!(&result, expected.data()); +} + +/* +// this is an old test used on a meanwhile removed dead code +// that is still useful when `MutableArrayData` supports fixed-size lists. +#[test] +fn test_fixed_size_list_append() -> Result<()> { + let int_builder = UInt16Builder::new(64); + let mut builder = FixedSizeListBuilder::::new(int_builder, 2); + builder.values().append_slice(&[1, 2])?; + builder.append(true)?; + builder.values().append_slice(&[3, 4])?; + builder.append(false)?; + builder.values().append_slice(&[5, 6])?; + builder.append(true)?; + + let a_builder = UInt16Builder::new(64); + let mut a_builder = FixedSizeListBuilder::::new(a_builder, 2); + a_builder.values().append_slice(&[7, 8])?; + a_builder.append(true)?; + a_builder.values().append_slice(&[9, 10])?; + a_builder.append(true)?; + a_builder.values().append_slice(&[11, 12])?; + a_builder.append(false)?; + a_builder.values().append_slice(&[13, 14])?; + a_builder.append(true)?; + a_builder.values().append_null()?; + a_builder.values().append_null()?; + a_builder.append(true)?; + let a = a_builder.finish(); + + // append array + builder.append_data(&[ + a.data(), + a.slice(1, 3).data(), + a.slice(2, 1).data(), + a.slice(5, 0).data(), + ])?; + let finished = builder.finish(); + + let expected_int_array = UInt16Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + // append first array + Some(7), + Some(8), + Some(9), + Some(10), + Some(11), + Some(12), + Some(13), + Some(14), + None, + None, + // append slice(1, 3) + Some(9), + Some(10), + Some(11), + Some(12), + Some(13), + Some(14), + // append slice(2, 1) + Some(11), + Some(12), + ]); + let expected_list_data = ArrayData::new( + DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt16, true)), + 2, + ), + 12, + None, + None, + 0, + vec![], + vec![expected_int_array.data()], + ); + let expected_list = + FixedSizeListArray::from(Arc::new(expected_list_data) as ArrayData); + assert_eq!(&expected_list.values(), &finished.values()); + assert_eq!(expected_list.len(), finished.len()); + + Ok(()) +} +*/ diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs new file mode 100644 index 000000000000..93b936e7c2f9 --- /dev/null +++ b/arrow/tests/array_validation.rs @@ -0,0 +1,1100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+use arrow::array::{
+    make_array, Array, BooleanBuilder, Decimal128Builder, FixedSizeListBuilder,
+    Int32Array, Int32Builder, Int64Array, StringArray, StructBuilder, UInt64Array,
+    UInt8Builder,
+};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_data::ArrayData;
+use arrow_schema::{DataType, Field, UnionMode};
+use std::ptr::NonNull;
+use std::sync::Arc;
+
+#[test]
+#[should_panic(
+    expected = "Need at least 80 bytes in buffers[0] in array of type Int64, but got 8"
+)]
+fn test_buffer_too_small() {
+    let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+    // should fail as the declared size (10*8 = 80) is larger than the underlying buffer (8)
+    ArrayData::try_new(DataType::Int64, 10, None, 0, vec![buffer], vec![]).unwrap();
+}
+
+#[test]
+#[should_panic(
+    expected = "Need at least 16 bytes in buffers[0] in array of type Int64, but got 8"
+)]
+fn test_buffer_too_small_offset() {
+    let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+    // should fail -- size is ok, but also has offset
+    ArrayData::try_new(DataType::Int64, 1, None, 1, vec![buffer], vec![]).unwrap();
+}
+
+#[test]
+#[should_panic(expected = "Expected 1 buffers in array of type Int64, got 2")]
+fn test_bad_number_of_buffers() {
+    let buffer1 = Buffer::from_slice_ref(&[0i32, 2i32]);
+    let buffer2 = Buffer::from_slice_ref(&[0i32, 2i32]);
+    ArrayData::try_new(DataType::Int64, 1, None, 0, vec![buffer1, buffer2], vec![])
+        .unwrap();
+}
+
+#[test]
+#[should_panic(expected = "integer overflow computing min buffer size")]
+fn test_fixed_width_overflow() {
+    let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+    ArrayData::try_new(DataType::Int64, usize::MAX, None, 0, vec![buffer], vec![])
+        .unwrap();
+}
+
+#[test]
+#[should_panic(expected = "null_bit_buffer size too small. got 1 needed 2")]
+fn test_bitmap_too_small() {
+    let buffer = make_i32_buffer(9);
+    let null_bit_buffer = Buffer::from(vec![0b11111111]);
+
+    ArrayData::try_new(
+        DataType::Int32,
+        9,
+        Some(null_bit_buffer),
+        0,
+        vec![buffer],
+        vec![],
+    )
+    .unwrap();
+}
+
+// Test creating a dictionary with a non-integer key type
+#[test]
+#[should_panic(expected = "Dictionary key type must be integer, but was Utf8")]
+fn test_non_int_dictionary() {
+    let i32_buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+    let data_type =
+        DataType::Dictionary(Box::new(DataType::Utf8), Box::new(DataType::Int32));
+    let child_data = ArrayData::try_new(
+        DataType::Int32,
+        1,
+        None,
+        0,
+        vec![i32_buffer.clone()],
+        vec![],
+    )
+    .unwrap();
+    ArrayData::try_new(
+        data_type,
+        1,
+        None,
+        0,
+        vec![i32_buffer.clone(), i32_buffer],
+        vec![child_data],
+    )
+    .unwrap();
+}
+
+#[test]
+#[should_panic(expected = "Expected LargeUtf8 but child data had Utf8")]
+fn test_mismatched_dictionary_types() {
+    // test a dictionary created with child array data whose type differs from the declared one
+    let string_array: StringArray = vec![Some("foo"), Some("bar")].into_iter().collect();
+    let i32_buffer = Buffer::from_slice_ref(&[0i32, 1i32]);
+    // Dict says LargeUtf8 but array is Utf8
+    let data_type =
+        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::LargeUtf8));
+    let child_data = string_array.into_data();
+    ArrayData::try_new(data_type, 1, None, 0, vec![i32_buffer], vec![child_data])
+        .unwrap();
+}
+
+#[test]
+fn test_empty_utf8_array_with_empty_offsets_buffer() {
+    let data_buffer = Buffer::from(&[]);
+    let offsets_buffer = Buffer::from(&[]);
+    ArrayData::try_new(
+        DataType::Utf8,
+        0,
+        None,
+        0,
+        vec![offsets_buffer, data_buffer],
+        vec![],
+    )
+    .unwrap();
+}
+
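// The cases above all exercise `ArrayData::try_new` rejecting malformed input.
// For contrast, the sketch below shows a well-formed call; it is an
// illustrative example using the same APIs these tests import, not part of
// the upstream change.
fn valid_int32_array_data_sketch() {
    use arrow_buffer::Buffer;
    use arrow_data::ArrayData;
    use arrow_schema::DataType;

    // Three Int32 values (12 bytes) plus a validity bitmap marking row 1 null.
    let values = Buffer::from_slice_ref(&[1i32, 2i32, 3i32]);
    let nulls = Buffer::from(vec![0b0000_0101u8]);
    let data = ArrayData::try_new(
        DataType::Int32,
        3,            // len
        Some(nulls),  // null_bit_buffer
        0,            // offset
        vec![values], // a single values buffer for a primitive array
        vec![],       // no child data
    )
    .unwrap();
    assert_eq!(data.len(), 3);
    assert_eq!(data.null_count(), 1);
}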
+#[test] +fn test_empty_utf8_array_with_single_zero_offset() { + let data_buffer = Buffer::from(&[]); + let offsets_buffer = Buffer::from_slice_ref(&[0i32]); + ArrayData::try_new( + DataType::Utf8, + 0, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "First offset 1 of Utf8 is larger than values length 0")] +fn test_empty_utf8_array_with_invalid_offset() { + let data_buffer = Buffer::from(&[]); + let offsets_buffer = Buffer::from_slice_ref(&[1i32]); + ArrayData::try_new( + DataType::Utf8, + 0, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +fn test_empty_utf8_array_with_non_zero_offset() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2, 6, 0]); + ArrayData::try_new( + DataType::Utf8, + 0, + None, + 3, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Buffer 0 of LargeUtf8 isn't large enough. Expected 8 bytes got 4" +)] +fn test_empty_large_utf8_array_with_wrong_type_offsets() { + let data_buffer = Buffer::from(&[]); + let offsets_buffer = Buffer::from_slice_ref(&[0i32]); + ArrayData::try_new( + DataType::LargeUtf8, + 0, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Buffer 0 of Utf8 isn't large enough. Expected 12 bytes got 8")] +fn test_validate_offsets_i32() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Buffer 0 of LargeUtf8 isn't large enough. 
Expected 24 bytes got 16" +)] +fn test_validate_offsets_i64() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[0i64, 2i64]); + ArrayData::try_new( + DataType::LargeUtf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Error converting offset[0] (-2) to usize for Utf8")] +fn test_validate_offsets_negative_first_i32() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[-2i32, 1i32, 3i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Error converting offset[2] (-3) to usize for Utf8")] +fn test_validate_offsets_negative_last_i32() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, -3i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "First offset 4 in Utf8 is smaller than last offset 3")] +fn test_validate_offsets_range_too_small() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // start offset is larger than end + let offsets_buffer = Buffer::from_slice_ref(&[4i32, 2i32, 3i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Last offset 10 of Utf8 is larger than values length 6")] +fn test_validate_offsets_range_too_large() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // 10 is off the end of the buffer + let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, 10i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "First offset 10 of Utf8 is larger than values length 6")] +fn test_validate_offsets_first_too_large() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // 10 is off the end of the buffer + let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 10i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +fn test_validate_offsets_first_too_large_skipped() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // 10 is off the end of the buffer, but offset starts at 1 so it is skipped + let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 3i32, 4i32]); + let data = ArrayData::try_new( + DataType::Utf8, + 2, + None, + 1, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + let array: StringArray = data.into(); + let expected: StringArray = vec![Some("c"), Some("d")].into_iter().collect(); + assert_eq!(array, expected); +} + +#[test] +#[should_panic(expected = "Last offset 8 of Utf8 is larger than values length 6")] +fn test_validate_offsets_last_too_large() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + // 10 is off the end of the buffer + let offsets_buffer = Buffer::from_slice_ref(&[5i32, 7i32, 8i32]); + ArrayData::try_new( + DataType::Utf8, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Values length 4 is less than the length (2) multiplied by the 
value size (2) for FixedSizeList" +)] +fn test_validate_fixed_size_list() { + // child has 4 elements, + let child_array = vec![Some(1), Some(2), Some(3), None] + .into_iter() + .collect::(); + + // but claim we have 3 elements for a fixed size of 2 + // 10 is off the end of the buffer + let field = Field::new("field", DataType::Int32, true); + ArrayData::try_new( + DataType::FixedSizeList(Box::new(field), 2), + 3, + None, + 0, + vec![], + vec![child_array.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Child type mismatch for Struct")] +fn test_validate_struct_child_type() { + let field1 = vec![Some(1), Some(2), Some(3), None] + .into_iter() + .collect::(); + + // validate the the type of struct fields matches child fields + ArrayData::try_new( + DataType::Struct(vec![Field::new("field1", DataType::Int64, true)]), + 3, + None, + 0, + vec![], + vec![field1.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "child array #0 for field field1 has length smaller than expected for struct array (4 < 6)" +)] +fn test_validate_struct_child_length() { + // field length only has 4 items, but array claims to have 6 + let field1 = vec![Some(1), Some(2), Some(3), None] + .into_iter() + .collect::(); + + ArrayData::try_new( + DataType::Struct(vec![Field::new("field1", DataType::Int32, true)]), + 6, + None, + 0, + vec![], + vec![field1.into_data()], + ) + .unwrap(); +} + +/// Test that the array of type `data_type` that has invalid utf8 data errors +fn check_utf8_validation(data_type: DataType) { + // 0x80 is a utf8 continuation sequence and is not a valid utf8 sequence itself + let data_buffer = Buffer::from_slice_ref(&[b'a', b'a', 0x80, 0x00]); + let offsets: Vec = [0, 2, 3] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + + let offsets_buffer = Buffer::from_slice_ref(&offsets); + ArrayData::try_new( + data_type, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Invalid UTF8 sequence at string index 1 (2..3)")] +fn test_validate_utf8_content() { + check_utf8_validation::(DataType::Utf8); +} + +#[test] +#[should_panic(expected = "Invalid UTF8 sequence at string index 1 (2..3)")] +fn test_validate_large_utf8_content() { + check_utf8_validation::(DataType::LargeUtf8); +} + +/// Tests that offsets are at valid codepoint boundaries +fn check_utf8_char_boundary(data_type: DataType) { + let data_buffer = Buffer::from("🙀".as_bytes()); + let offsets: Vec = [0, 1, data_buffer.len()] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + + let offsets_buffer = Buffer::from_slice_ref(&offsets); + ArrayData::try_new( + data_type, + 2, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "incomplete utf-8 byte sequence from index 0")] +fn test_validate_utf8_char_boundary() { + check_utf8_char_boundary::(DataType::Utf8); +} + +#[test] +#[should_panic(expected = "incomplete utf-8 byte sequence from index 0")] +fn test_validate_large_utf8_char_boundary() { + check_utf8_char_boundary::(DataType::LargeUtf8); +} + +/// Test that the array of type `data_type` that has invalid indexes (out of bounds) +fn check_index_out_of_bounds_validation(data_type: DataType) { + let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']); + // First two offsets are fine, then 5 is out of bounds + let offsets: Vec = [0, 1, 2, 5, 2] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + + let offsets_buffer = 
Buffer::from_slice_ref(&offsets); + ArrayData::try_new( + data_type, + 4, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" +)] +fn test_validate_utf8_out_of_bounds() { + check_index_out_of_bounds_validation::(DataType::Utf8); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" +)] +fn test_validate_large_utf8_out_of_bounds() { + check_index_out_of_bounds_validation::(DataType::LargeUtf8); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" +)] +fn test_validate_binary_out_of_bounds() { + check_index_out_of_bounds_validation::(DataType::Binary); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" +)] +fn test_validate_large_binary_out_of_bounds() { + check_index_out_of_bounds_validation::(DataType::LargeBinary); +} + +// validate that indexes don't go bacwards check indexes that go backwards +fn check_index_backwards_validation(data_type: DataType) { + let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']); + // First three offsets are fine, then 1 goes backwards + let offsets: Vec = [0, 1, 2, 2, 1] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + + let offsets_buffer = Buffer::from_slice_ref(&offsets); + ArrayData::try_new( + data_type, + 4, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" +)] +fn test_validate_utf8_index_backwards() { + check_index_backwards_validation::(DataType::Utf8); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" +)] +fn test_validate_large_utf8_index_backwards() { + check_index_backwards_validation::(DataType::LargeUtf8); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" +)] +fn test_validate_binary_index_backwards() { + check_index_backwards_validation::(DataType::Binary); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" +)] +fn test_validate_large_binary_index_backwards() { + check_index_backwards_validation::(DataType::LargeBinary); +} + +#[test] +#[should_panic(expected = "Value at position 1 out of bounds: 3 (should be in [0, 1])")] +fn test_validate_dictionary_index_too_large() { + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + + // 3 is not a valid index into the values (only 0 and 1) + let keys: Int32Array = [Some(1), Some(3)].into_iter().collect(); + + let data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + ArrayData::try_new( + data_type, + 2, + None, + 0, + vec![keys.data().buffers()[0].clone()], + vec![values.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Value at position 1 out of bounds: -1 (should be in [0, 1]")] +fn test_validate_dictionary_index_negative() { + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + + // -1 is not a valid index at all! 
+ let keys: Int32Array = [Some(1), Some(-1)].into_iter().collect(); + + let data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + ArrayData::try_new( + data_type, + 2, + None, + 0, + vec![keys.data().buffers()[0].clone()], + vec![values.into_data()], + ) + .unwrap(); +} + +#[test] +fn test_validate_dictionary_index_negative_but_not_referenced() { + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + + // -1 is not a valid index at all, but the array is length 1 + // so the -1 should not be looked at + let keys: Int32Array = [Some(1), Some(-1)].into_iter().collect(); + + let data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + // Expect this not to panic + ArrayData::try_new( + data_type, + 1, + None, + 0, + vec![keys.data().buffers()[0].clone()], + vec![values.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Value at position 0 out of bounds: 18446744073709551615 (can not convert to i64)" +)] +fn test_validate_dictionary_index_giant_negative() { + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + + // -1 is not a valid index at all! + let keys: UInt64Array = [Some(u64::MAX), Some(1)].into_iter().collect(); + + let data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + ArrayData::try_new( + data_type, + 2, + None, + 0, + vec![keys.data().buffers()[0].clone()], + vec![values.into_data()], + ) + .unwrap(); +} + +/// Test that the list of type `data_type` generates correct offset out of bounds errors +fn check_list_offsets(data_type: DataType) { + let values: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); + + // 5 is an invalid offset into a list of only three values + let offsets: Vec = [0, 2, 5, 4] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + let offsets_buffer = Buffer::from_slice_ref(&offsets); + + ArrayData::try_new( + data_type, + 3, + None, + 0, + vec![offsets_buffer], + vec![values.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" +)] +fn test_validate_list_offsets() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_offsets::(DataType::List(Box::new(field_type))); +} + +#[test] +#[should_panic( + expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" +)] +fn test_validate_large_list_offsets() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_offsets::(DataType::LargeList(Box::new(field_type))); +} + +/// Test that the list of type `data_type` generates correct errors for negative offsets +#[test] +#[should_panic( + expected = "Offset invariant failure: Could not convert offset -1 to usize at position 2" +)] +fn test_validate_list_negative_offsets() { + let values: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); + let field_type = Field::new("f", values.data_type().clone(), true); + let data_type = DataType::List(Box::new(field_type)); + + // -1 is an invalid offset any way you look at it + let offsets: Vec = vec![0, 2, -1, 4]; + let offsets_buffer = Buffer::from_slice_ref(&offsets); + + ArrayData::try_new( + data_type, + 3, + None, + 0, + vec![offsets_buffer], + vec![values.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Value at position 1 out of 
bounds: -1 (should be in [0, 1])")] +/// test that children are validated recursively (aka bugs in child data of struct also are flagged) +fn test_validate_recursive() { + // Form invalid dictionary array + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + // -1 is not a valid index + let keys: Int32Array = [Some(1), Some(-1), Some(1)].into_iter().collect(); + + let dict_data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + // purposely create an invalid child data + let dict_data = unsafe { + ArrayData::new_unchecked( + dict_data_type, + 2, + None, + None, + 0, + vec![keys.data().buffers()[0].clone()], + vec![values.into_data()], + ) + }; + + // Now, try and create a struct with this invalid child data (and expect an error) + let data_type = + DataType::Struct(vec![Field::new("d", dict_data.data_type().clone(), true)]); + + ArrayData::try_new(data_type, 1, None, 0, vec![], vec![dict_data]).unwrap(); +} + +/// returns a buffer initialized with some constant value for tests +fn make_i32_buffer(n: usize) -> Buffer { + Buffer::from_slice_ref(&vec![42i32; n]) +} + +#[test] +#[should_panic(expected = "Expected Int64 but child data had Int32")] +fn test_validate_union_different_types() { + let field1 = vec![Some(1), Some(2)].into_iter().collect::(); + + let field2 = vec![Some(1), Some(2)].into_iter().collect::(); + + let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); + + ArrayData::try_new( + DataType::Union( + vec![ + Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Int64, true), // data is int32 + ], + vec![0, 1], + UnionMode::Sparse, + ), + 2, + None, + 0, + vec![type_ids], + vec![field1.into_data(), field2.into_data()], + ) + .unwrap(); +} + +// sparse with wrong sized children +#[test] +#[should_panic( + expected = "Sparse union child array #1 has length smaller than expected for union array (1 < 2)" +)] +fn test_validate_union_sparse_different_child_len() { + let field1 = vec![Some(1), Some(2)].into_iter().collect::(); + + // field 2 only has 1 item but array should have 2 + let field2 = vec![Some(1)].into_iter().collect::(); + + let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); + + ArrayData::try_new( + DataType::Union( + vec![ + Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Int64, true), + ], + vec![0, 1], + UnionMode::Sparse, + ), + 2, + None, + 0, + vec![type_ids], + vec![field1.into_data(), field2.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Expected 2 buffers in array of type Union")] +fn test_validate_union_dense_without_offsets() { + let field1 = vec![Some(1), Some(2)].into_iter().collect::(); + + let field2 = vec![Some(1)].into_iter().collect::(); + + let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); + + ArrayData::try_new( + DataType::Union( + vec![ + Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Int64, true), + ], + vec![0, 1], + UnionMode::Dense, + ), + 2, + None, + 0, + vec![type_ids], // need offsets buffer here too + vec![field1.into_data(), field2.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Need at least 8 bytes in buffers[1] in array of type Union")] +fn test_validate_union_dense_with_bad_len() { + let field1 = vec![Some(1), Some(2)].into_iter().collect::(); + + let field2 = vec![Some(1)].into_iter().collect::(); + + let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); + let offsets = Buffer::from_slice_ref(&[0i32]); // should 
have 2 offsets, but only have 1 + + ArrayData::try_new( + DataType::Union( + vec![ + Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Int64, true), + ], + vec![0, 1], + UnionMode::Dense, + ), + 2, + None, + 0, + vec![type_ids, offsets], + vec![field1.into_data(), field2.into_data()], + ) + .unwrap(); +} + +#[test] +fn test_try_new_sliced_struct() { + let mut builder = StructBuilder::new( + vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Boolean, true), + ], + vec![ + Box::new(Int32Builder::with_capacity(5)), + Box::new(BooleanBuilder::with_capacity(5)), + ], + ); + + // struct[0] = { a: 10, b: true } + builder + .field_builder::(0) + .unwrap() + .append_option(Some(10)); + builder + .field_builder::(1) + .unwrap() + .append_option(Some(true)); + builder.append(true); + + // struct[1] = null + builder + .field_builder::(0) + .unwrap() + .append_option(None); + builder + .field_builder::(1) + .unwrap() + .append_option(None); + builder.append(false); + + // struct[2] = { a: null, b: false } + builder + .field_builder::(0) + .unwrap() + .append_option(None); + builder + .field_builder::(1) + .unwrap() + .append_option(Some(false)); + builder.append(true); + + // struct[3] = { a: 21, b: null } + builder + .field_builder::(0) + .unwrap() + .append_option(Some(21)); + builder + .field_builder::(1) + .unwrap() + .append_option(None); + builder.append(true); + + // struct[4] = { a: 18, b: false } + builder + .field_builder::(0) + .unwrap() + .append_option(Some(18)); + builder + .field_builder::(1) + .unwrap() + .append_option(Some(false)); + builder.append(true); + + let struct_array = builder.finish(); + let struct_array_slice = struct_array.slice(1, 3); + let struct_array_data = struct_array_slice.data(); + + let cloned_data = ArrayData::try_new( + struct_array_slice.data_type().clone(), + struct_array_slice.len(), + struct_array_data.null_buffer().cloned(), + struct_array_slice.offset(), + struct_array_data.buffers().to_vec(), + struct_array_data.child_data().to_vec(), + ) + .unwrap(); + let cloned = make_array(cloned_data); + + assert_eq!(&struct_array_slice, &cloned); +} + +#[test] +fn test_string_data_from_foreign() { + let mut strings = "foobarfoobar".to_owned(); + let mut offsets = vec![0_i32, 0, 3, 6, 12]; + let mut bitmap = vec![0b1110_u8]; + + let strings_buffer = unsafe { + Buffer::from_custom_allocation( + NonNull::new_unchecked(strings.as_mut_ptr()), + strings.len(), + Arc::new(strings), + ) + }; + let offsets_buffer = unsafe { + Buffer::from_custom_allocation( + NonNull::new_unchecked(offsets.as_mut_ptr() as *mut u8), + offsets.len() * std::mem::size_of::(), + Arc::new(offsets), + ) + }; + let null_buffer = unsafe { + Buffer::from_custom_allocation( + NonNull::new_unchecked(bitmap.as_mut_ptr()), + bitmap.len(), + Arc::new(bitmap), + ) + }; + + let data = ArrayData::try_new( + DataType::Utf8, + 4, + Some(null_buffer), + 0, + vec![offsets_buffer, strings_buffer], + vec![], + ) + .unwrap(); + + let array = make_array(data); + let array = array.as_any().downcast_ref::().unwrap(); + + let expected = + StringArray::from(vec![None, Some("foo"), Some("bar"), Some("foobar")]); + + assert_eq!(array, &expected); +} + +#[test] +#[cfg(not(feature = "force_validate"))] +fn test_decimal_full_validation() { + let values_builder = UInt8Builder::with_capacity(10); + let byte_width = 16; + let mut fixed_size_builder = FixedSizeListBuilder::new(values_builder, byte_width); + let value_as_bytes = 123456_i128.to_le_bytes(); + fixed_size_builder + 
.values()
+        .append_slice(value_as_bytes.as_slice());
+    fixed_size_builder.append(true);
+    let fixed_size_array = fixed_size_builder.finish();
+
+    // Build ArrayData for Decimal
+    let builder = ArrayData::builder(DataType::Decimal128(5, 3))
+        .len(fixed_size_array.len())
+        .add_buffer(fixed_size_array.data_ref().child_data()[0].buffers()[0].clone());
+    let array_data = unsafe { builder.build_unchecked() };
+    let validation_result = array_data.validate_full();
+    let error = validation_result.unwrap_err();
+    assert_eq!(
+        "Invalid argument error: 123456 is too large to store in a Decimal128 of precision 5. Max is 99999",
+        error.to_string()
+    );
+}
+
+#[test]
+fn test_decimal_validation() {
+    let mut builder = Decimal128Builder::with_capacity(4, 10, 4);
+    builder.append_value(10000).unwrap();
+    builder.append_value(20000).unwrap();
+    let array = builder.finish();
+
+    array.data().validate_full().unwrap();
+}
+
+#[test]
+#[cfg(not(feature = "force_validate"))]
+fn test_sliced_array_child() {
+    let values = Int32Array::from_iter_values([1, 2, 3]);
+    let values_sliced = values.slice(1, 2);
+    let offsets = Buffer::from_iter([1_i32, 3_i32]);
+
+    let list_field = Field::new("element", DataType::Int32, false);
+    let data_type = DataType::List(Box::new(list_field));
+
+    let data = unsafe {
+        ArrayData::new_unchecked(
+            data_type,
+            1,
+            None,
+            None,
+            0,
+            vec![offsets],
+            vec![values_sliced.into_data()],
+        )
+    };
+
+    let err = data.validate_values().unwrap_err();
+    assert_eq!(err.to_string(), "Invalid argument error: Offset invariant failure: offset at position 1 out of bounds: 3 > 2");
+}

From 9833d29eb77796f5e549c423e7346f9081b9fda4 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh
Date: Thu, 22 Sep 2022 09:51:06 -0700
Subject: [PATCH 0069/1411] Add divide_scalar_opt_dyn (#2768)

---
 arrow/src/compute/kernels/arithmetic.rs | 44 +++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs
index d33827594af5..1c28c9895240 100644
--- a/arrow/src/compute/kernels/arithmetic.rs
+++ b/arrow/src/compute/kernels/arithmetic.rs
@@ -1578,6 +1578,33 @@ where
         .map(|a| Arc::new(a) as ArrayRef)
 }
 
+/// Divide every value in an array by a scalar. If any value in the array is null then the
+/// result is also null. The given array must be a `PrimitiveArray` of the same type as the
+/// scalar, or a `DictionaryArray` whose value type matches the scalar's type.
+///
+/// If the divisor is zero, the values in the result will be replaced with null rather
+/// than raising an error.
+///
+/// Unlike `divide_scalar_dyn` or `divide_scalar_checked_dyn`, division by zero yields a
+/// null value instead of returning an `Err`. This kernel also does not check for overflow;
+/// overflowing values simply wrap around.
+pub fn divide_scalar_opt_dyn(array: &dyn Array, divisor: T::Native) -> Result +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp + Zero, +{ + if divisor.is_zero() { + match array.data_type() { + DataType::Dictionary(_, value_type) => { + return Ok(new_null_array(value_type.as_ref(), array.len())) + } + _ => return Ok(new_null_array(array.data_type(), array.len())), + } + } + + unary_dyn::<_, T>(array, |value| value.div_wrapping(divisor)) +} + #[cfg(test)] mod tests { use super::*; @@ -2854,4 +2881,21 @@ mod tests { let division_by_zero = divide_dyn_opt(&a, &b); assert_eq!(&expected, &division_by_zero.unwrap()); } + + #[test] + fn test_div_scalar_dyn_opt_overflow_division_by_zero() { + let a = Int32Array::from(vec![i32::MIN]); + + let division_by_zero = divide_scalar_opt_dyn::(&a, 0); + let expected = Arc::new(Int32Array::from(vec![None])) as ArrayRef; + assert_eq!(&expected, &division_by_zero.unwrap()); + + let mut builder = + PrimitiveDictionaryBuilder::::with_capacity(1, 1); + builder.append(i32::MIN).unwrap(); + let a = builder.finish(); + + let division_by_zero = divide_scalar_opt_dyn::(&a, 0); + assert_eq!(&expected, &division_by_zero.unwrap()); + } } From c2972c1db521d718f66bf8d6565f034b9a770758 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 22 Sep 2022 11:47:28 -0700 Subject: [PATCH 0070/1411] Fix clippy (#2771) --- arrow-buffer/src/util/bit_chunk_iterator.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-buffer/src/util/bit_chunk_iterator.rs b/arrow-buffer/src/util/bit_chunk_iterator.rs index ba028204da10..a739a9694200 100644 --- a/arrow-buffer/src/util/bit_chunk_iterator.rs +++ b/arrow-buffer/src/util/bit_chunk_iterator.rs @@ -178,7 +178,7 @@ pub type UnalignedBitChunkIterator<'a> = std::iter::Chain< fn read_u64(input: &[u8]) -> u64 { let len = input.len().min(8); let mut buf = [0_u8; 8]; - (&mut buf[..len]).copy_from_slice(input); + buf[..len].copy_from_slice(input); u64::from_le_bytes(buf) } From b7bcfd8dae15feff5cdba20654b175aa6e3d1a3f Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 22 Sep 2022 13:43:37 -0700 Subject: [PATCH 0071/1411] MINOR: Fix clippy for rust 1.64.0 (#2772) * Fix clippy * More --- arrow/src/ffi.rs | 4 ++-- parquet/src/column/writer/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 528f3adc2d84..77d277afa300 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -1392,8 +1392,8 @@ mod tests { // verify assert_eq!(array, Int32Array::from(vec![2, 4, 6])); - Box::from_raw(out_array_ptr); - Box::from_raw(out_schema_ptr); + drop(Box::from_raw(out_array_ptr)); + drop(Box::from_raw(out_schema_ptr)); } Ok(()) } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index f9dd2d8d39be..55e667043d35 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1071,10 +1071,10 @@ fn compare_greater_byte_array_decimals(a: &[u8], b: &[u8]) -> bool { if a_length != b_length { let not_equal = if a_length > b_length { let lead_length = a_length - b_length; - (&a[0..lead_length]).iter().any(|&x| x != extension) + a[0..lead_length].iter().any(|&x| x != extension) } else { let lead_length = b_length - a_length; - (&b[0..lead_length]).iter().any(|&x| x != extension) + b[0..lead_length].iter().any(|&x| x != extension) }; if not_equal { From dc62e404caeae242ad90cb4f03ce9c366fea860c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 23 Sep 2022 
15:12:02 +0100 Subject: [PATCH 0072/1411] Re-enable golang integration tests (#2688) (#2773) --- .github/workflows/integration.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 242fc3d85ee6..d78f02c95a48 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -77,9 +77,8 @@ jobs: run: conda run --no-capture-output ci/scripts/cpp_build.sh $PWD /build - name: Build C# run: conda run --no-capture-output ci/scripts/csharp_build.sh $PWD /build - # Temporarily disable Golang #2688 - # - name: Build Go - # run: conda run --no-capture-output ci/scripts/go_build.sh $PWD + - name: Build Go + run: conda run --no-capture-output ci/scripts/go_build.sh $PWD - name: Build Java run: conda run --no-capture-output ci/scripts/java_build.sh $PWD /build - name: Build JS @@ -94,7 +93,7 @@ jobs: --with-csharp=1 \ --with-java=1 \ --with-js=1 \ - --with-go=0 \ + --with-go=1 \ --with-rust=1 \ --gold-dirs=testing/data/arrow-ipc-stream/integration/0.14.1 \ --gold-dirs=testing/data/arrow-ipc-stream/integration/0.17.1 \ From 7c8080c6752a55256630f6f6e6c82bf8a540d20b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 24 Sep 2022 17:20:49 +0100 Subject: [PATCH 0073/1411] Trim parquet row selection (#2705) --- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/arrow/arrow_reader/selection.rs | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index b00afc475154..59abf9ad8dc5 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -570,7 +570,7 @@ impl ParquetRecordBatchReader { batch_size, array_reader, schema: Arc::new(schema), - selection: selection.map(Into::into), + selection: selection.map(|s| s.trim().into()), } } } diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 495e346e0f8a..6a965dc9bc56 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -285,6 +285,14 @@ impl RowSelection { pub fn selects_any(&self) -> bool { self.selectors.iter().any(|x| !x.skip) } + + /// Trims this [`RowSelection`] removing any trailing skips + pub(crate) fn trim(mut self) -> Self { + while self.selectors.last().map(|x| x.skip).unwrap_or(false) { + self.selectors.pop(); + } + self + } } impl From> for RowSelection { From d52cae0df1d7ba651d2c9a7d4904666363f40a76 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 24 Sep 2022 17:26:24 +0100 Subject: [PATCH 0074/1411] Remove ArrowNativeType: FromStr (#2775) * Remove ArrowNativeType: FromStr * Format --- arrow-buffer/src/native.rs | 10 +------ arrow/src/lib.rs | 1 + arrow/src/util/reader_parser.rs | 47 +++++++++++++++------------------ 3 files changed, 24 insertions(+), 34 deletions(-) diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs index d8431953c430..90855872d18e 100644 --- a/arrow-buffer/src/native.rs +++ b/arrow-buffer/src/native.rs @@ -44,15 +44,7 @@ mod private { /// /// Due to the above restrictions, this trait is sealed to prevent accidental misuse pub trait ArrowNativeType: - std::fmt::Debug - + Send - + Sync - + Copy - + PartialOrd - + std::str::FromStr - + Default - + private::Sealed - + 'static + std::fmt::Debug + Send + Sync + Copy + PartialOrd + Default + 
private::Sealed + 'static { /// Convert native type from usize. #[inline] diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 5cc264b1392e..ce171ec861a4 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -117,6 +117,7 @@ //! fn parse_to_primitive<'a, T, I>(iter: I) -> PrimitiveArray //! where //! T: ArrowPrimitiveType, +//! T::Native: FromStr, //! I: IntoIterator, //! { //! PrimitiveArray::from_iter(iter.into_iter().map(|val| T::Native::from_str(val).ok())) diff --git a/arrow/src/util/reader_parser.rs b/arrow/src/util/reader_parser.rs index 6b6f24f82a43..91b362df86fd 100644 --- a/arrow/src/util/reader_parser.rs +++ b/arrow/src/util/reader_parser.rs @@ -21,9 +21,7 @@ use crate::datatypes::*; /// Specialized parsing implementations /// used by csv and json reader pub(crate) trait Parser: ArrowPrimitiveType { - fn parse(string: &str) -> Option { - string.parse::().ok() - } + fn parse(string: &str) -> Option; fn parse_formatted(string: &str, _format: &str) -> Option { Self::parse(string) @@ -42,21 +40,23 @@ impl Parser for Float64Type { } } -impl Parser for UInt64Type {} - -impl Parser for UInt32Type {} - -impl Parser for UInt16Type {} - -impl Parser for UInt8Type {} - -impl Parser for Int64Type {} - -impl Parser for Int32Type {} - -impl Parser for Int16Type {} - -impl Parser for Int8Type {} +macro_rules! parser_primitive { + ($t:ty) => { + impl Parser for $t { + fn parse(string: &str) -> Option { + string.parse::().ok() + } + } + }; +} +parser_primitive!(UInt64Type); +parser_primitive!(UInt32Type); +parser_primitive!(UInt16Type); +parser_primitive!(UInt8Type); +parser_primitive!(Int64Type); +parser_primitive!(Int32Type); +parser_primitive!(Int16Type); +parser_primitive!(Int8Type); impl Parser for TimestampNanosecondType { fn parse(string: &str) -> Option { @@ -85,13 +85,10 @@ impl Parser for TimestampSecondType { } } -impl Parser for Time64NanosecondType {} - -impl Parser for Time64MicrosecondType {} - -impl Parser for Time32MillisecondType {} - -impl Parser for Time32SecondType {} +parser_primitive!(Time64NanosecondType); +parser_primitive!(Time64MicrosecondType); +parser_primitive!(Time32MillisecondType); +parser_primitive!(Time32SecondType); /// Number of days between 0001-01-01 and 1970-01-01 const EPOCH_DAYS_FROM_CE: i32 = 719_163; From cf1e778b8c34155e7b598907a829ff6c8e52a1ea Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 24 Sep 2022 19:39:13 +0100 Subject: [PATCH 0075/1411] Fix Backwards Compatible Parquet List Encodings (#1915) (#2774) * Fix schema for non-list repeated fields (#1915) * Clippy --- parquet/src/arrow/array_reader/builder.rs | 229 ++++++++++++------- parquet/src/arrow/array_reader/list_array.rs | 31 +-- parquet/src/arrow/arrow_reader/mod.rs | 99 +++++++- parquet/src/arrow/async_reader.rs | 28 ++- parquet/src/arrow/schema.rs | 22 +- sample.parquet | Bin 0 -> 686 bytes 6 files changed, 282 insertions(+), 127 deletions(-) create mode 100644 sample.parquet diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs index 5f3ce75824ae..c0216466d489 100644 --- a/parquet/src/arrow/array_reader/builder.rs +++ b/parquet/src/arrow/array_reader/builder.rs @@ -17,7 +17,7 @@ use std::sync::Arc; -use arrow::datatypes::{DataType, SchemaRef}; +use arrow::datatypes::DataType; use crate::arrow::array_reader::empty_array::make_empty_array_reader; use crate::arrow::array_reader::fixed_len_byte_array::make_fixed_len_byte_array_reader; @@ -26,40 +26,43 @@ use 
crate::arrow::array_reader::{ ListArrayReader, MapArrayReader, NullArrayReader, PrimitiveArrayReader, RowGroupCollection, StructArrayReader, }; -use crate::arrow::schema::{convert_schema, ParquetField, ParquetFieldType}; +use crate::arrow::schema::{ParquetField, ParquetFieldType}; use crate::arrow::ProjectionMask; use crate::basic::Type as PhysicalType; use crate::data_type::{ BoolType, DoubleType, FloatType, Int32Type, Int64Type, Int96Type, }; -use crate::errors::Result; +use crate::errors::{ParquetError, Result}; use crate::schema::types::{ColumnDescriptor, ColumnPath, Type}; /// Create array reader from parquet schema, projection mask, and parquet file reader. pub fn build_array_reader( - arrow_schema: SchemaRef, - mask: ProjectionMask, + field: Option<&ParquetField>, + mask: &ProjectionMask, row_groups: &dyn RowGroupCollection, ) -> Result> { - let field = convert_schema(&row_groups.schema(), mask, Some(arrow_schema.as_ref()))?; + let reader = field + .and_then(|field| build_reader(field, mask, row_groups).transpose()) + .transpose()? + .unwrap_or_else(|| make_empty_array_reader(row_groups.num_rows())); - match &field { - Some(field) => build_reader(field, row_groups), - None => Ok(make_empty_array_reader(row_groups.num_rows())), - } + Ok(reader) } fn build_reader( field: &ParquetField, + mask: &ProjectionMask, row_groups: &dyn RowGroupCollection, -) -> Result> { +) -> Result>> { match field.field_type { - ParquetFieldType::Primitive { .. } => build_primitive_reader(field, row_groups), + ParquetFieldType::Primitive { .. } => { + build_primitive_reader(field, mask, row_groups) + } ParquetFieldType::Group { .. } => match &field.arrow_type { - DataType::Map(_, _) => build_map_reader(field, row_groups), - DataType::Struct(_) => build_struct_reader(field, row_groups), - DataType::List(_) => build_list_reader(field, false, row_groups), - DataType::LargeList(_) => build_list_reader(field, true, row_groups), + DataType::Map(_, _) => build_map_reader(field, mask, row_groups), + DataType::Struct(_) => build_struct_reader(field, mask, row_groups), + DataType::List(_) => build_list_reader(field, mask, false, row_groups), + DataType::LargeList(_) => build_list_reader(field, mask, true, row_groups), d => unimplemented!("reading group type {} not implemented", d), }, } @@ -68,59 +71,106 @@ fn build_reader( /// Build array reader for map type. 
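The shape of this rewrite is easiest to see in isolation: every builder now returns an `Option`, a leaf outside the projection mask yields `None`, and a parent whose children were all pruned disappears as well. The toy below is an editor's illustration only, not the crate's API; `Node`, `Reader` and the boolean mask stand in for `ParquetField`, `ArrayReader` and `ProjectionMask`.

```rust
#[derive(Debug)]
enum Node {
    Leaf { col_idx: usize, name: &'static str },
    Struct { name: &'static str, children: Vec<Node> },
}

#[derive(Debug)]
enum Reader {
    Leaf(&'static str),
    Struct(&'static str, Vec<Reader>),
}

fn build(node: &Node, mask: &[bool]) -> Option<Reader> {
    match node {
        // A leaf only materialises a reader when its column is selected.
        Node::Leaf { col_idx, name } => mask[*col_idx].then(|| Reader::Leaf(*name)),
        // A parent keeps only the children that produced readers; if none survive,
        // the parent itself is pruned, mirroring the struct/list/map builders here.
        Node::Struct { name, children } => {
            let kept: Vec<_> = children.iter().filter_map(|c| build(c, mask)).collect();
            (!kept.is_empty()).then(|| Reader::Struct(*name, kept))
        }
    }
}

fn main() {
    let schema = Node::Struct {
        name: "root",
        children: vec![
            Node::Leaf { col_idx: 0, name: "a" },
            Node::Struct {
                name: "nested",
                children: vec![Node::Leaf { col_idx: 1, name: "b" }],
            },
        ],
    };
    // Selecting only leaf 0 drops the nested struct from the reader tree entirely.
    assert!(matches!(build(&schema, &[true, false]), Some(Reader::Struct(_, ref c)) if c.len() == 1));
    // Selecting nothing prunes the root as well.
    assert!(build(&schema, &[false, false]).is_none());
}
```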
fn build_map_reader( field: &ParquetField, + mask: &ProjectionMask, row_groups: &dyn RowGroupCollection, -) -> Result> { +) -> Result>> { let children = field.children().unwrap(); assert_eq!(children.len(), 2); - let key_reader = build_reader(&children[0], row_groups)?; - let value_reader = build_reader(&children[1], row_groups)?; + let key_reader = build_reader(&children[0], mask, row_groups)?; + let value_reader = build_reader(&children[1], mask, row_groups)?; - Ok(Box::new(MapArrayReader::new( - key_reader, - value_reader, - field.arrow_type.clone(), - field.def_level, - field.rep_level, - field.nullable, - ))) + match (key_reader, value_reader) { + (Some(key_reader), Some(value_reader)) => { + let key_type = key_reader.get_data_type().clone(); + let value_type = value_reader.get_data_type().clone(); + + let data_type = match &field.arrow_type { + DataType::Map(map_field, is_sorted) => match map_field.data_type() { + DataType::Struct(fields) => { + assert_eq!(fields.len(), 2); + let struct_field = + map_field.clone().with_data_type(DataType::Struct(vec![ + fields[0].clone().with_data_type(key_type), + fields[1].clone().with_data_type(value_type), + ])); + DataType::Map(Box::new(struct_field), *is_sorted) + } + _ => unreachable!(), + }, + _ => unreachable!(), + }; + + Ok(Some(Box::new(MapArrayReader::new( + key_reader, + value_reader, + data_type, + field.def_level, + field.rep_level, + field.nullable, + )))) + } + (None, None) => Ok(None), + _ => { + Err(general_err!( + "partial projection of MapArray is not supported" + )) + } + } } /// Build array reader for list type. fn build_list_reader( field: &ParquetField, + mask: &ProjectionMask, is_large: bool, row_groups: &dyn RowGroupCollection, -) -> Result> { +) -> Result>> { let children = field.children().unwrap(); assert_eq!(children.len(), 1); - let data_type = field.arrow_type.clone(); - let item_reader = build_reader(&children[0], row_groups)?; + let reader = match build_reader(&children[0], mask, row_groups)? { + Some(item_reader) => { + let item_type = item_reader.get_data_type().clone(); + let data_type = match &field.arrow_type { + DataType::List(f) => { + DataType::List(Box::new(f.clone().with_data_type(item_type))) + } + DataType::LargeList(f) => { + DataType::LargeList(Box::new(f.clone().with_data_type(item_type))) + } + _ => unreachable!(), + }; - match is_large { - false => Ok(Box::new(ListArrayReader::::new( - item_reader, - data_type, - field.def_level, - field.rep_level, - field.nullable, - )) as _), - true => Ok(Box::new(ListArrayReader::::new( - item_reader, - data_type, - field.def_level, - field.rep_level, - field.nullable, - )) as _), - } + let reader = match is_large { + false => Box::new(ListArrayReader::::new( + item_reader, + data_type, + field.def_level, + field.rep_level, + field.nullable, + )) as _, + true => Box::new(ListArrayReader::::new( + item_reader, + data_type, + field.def_level, + field.rep_level, + field.nullable, + )) as _, + }; + Some(reader) + } + None => None, + }; + Ok(reader) } /// Creates primitive array reader for each primitive type. 
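The list and map branches above re-derive their Arrow type from whatever the projected child reader reports instead of trusting the pre-projection type. A small sketch of that re-derivation using the public `Field`/`DataType` API (the same `with_data_type` helper the hunk relies on; the struct-of-two-children schema is made up for illustration):

```rust
use arrow::datatypes::{DataType, Field};

fn main() {
    // The list type as declared by the file schema: the item is a struct with two children.
    let declared = DataType::List(Box::new(Field::new(
        "element",
        DataType::Struct(vec![
            Field::new("a", DataType::Int32, true),
            Field::new("b", DataType::Utf8, true),
        ]),
        true,
    )));

    // Suppose the projection kept only child "a"; the item reader now reports this type.
    let projected_item = DataType::Struct(vec![Field::new("a", DataType::Int32, true)]);

    // Rebuild the list type around the projected item, preserving the field name and
    // nullability, just as build_list_reader does above.
    let rebuilt = match &declared {
        DataType::List(f) => {
            DataType::List(Box::new(f.as_ref().clone().with_data_type(projected_item)))
        }
        _ => unreachable!(),
    };

    assert_ne!(rebuilt, declared);
    println!("{:?}", rebuilt);
}
```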
fn build_primitive_reader( field: &ParquetField, + mask: &ProjectionMask, row_groups: &dyn RowGroupCollection, -) -> Result> { +) -> Result>> { let (col_idx, primitive_type) = match &field.field_type { ParquetFieldType::Primitive { col_idx, @@ -132,6 +182,10 @@ fn build_primitive_reader( _ => unreachable!(), }; + if !mask.leaf_included(col_idx) { + return Ok(None); + } + let physical_type = primitive_type.get_physical_type(); // We don't track the column path in ParquetField as it adds a potential source @@ -150,81 +204,99 @@ fn build_primitive_reader( let page_iterator = row_groups.column_chunks(col_idx)?; let arrow_type = Some(field.arrow_type.clone()); - match physical_type { - PhysicalType::BOOLEAN => Ok(Box::new(PrimitiveArrayReader::::new( + let reader = match physical_type { + PhysicalType::BOOLEAN => Box::new(PrimitiveArrayReader::::new( page_iterator, column_desc, arrow_type, - )?)), + )?) as _, PhysicalType::INT32 => { if let Some(DataType::Null) = arrow_type { - Ok(Box::new(NullArrayReader::::new( + Box::new(NullArrayReader::::new( page_iterator, column_desc, - )?)) + )?) as _ } else { - Ok(Box::new(PrimitiveArrayReader::::new( + Box::new(PrimitiveArrayReader::::new( page_iterator, column_desc, arrow_type, - )?)) + )?) as _ } } - PhysicalType::INT64 => Ok(Box::new(PrimitiveArrayReader::::new( + PhysicalType::INT64 => Box::new(PrimitiveArrayReader::::new( page_iterator, column_desc, arrow_type, - )?)), - PhysicalType::INT96 => Ok(Box::new(PrimitiveArrayReader::::new( + )?) as _, + PhysicalType::INT96 => Box::new(PrimitiveArrayReader::::new( page_iterator, column_desc, arrow_type, - )?)), - PhysicalType::FLOAT => Ok(Box::new(PrimitiveArrayReader::::new( + )?) as _, + PhysicalType::FLOAT => Box::new(PrimitiveArrayReader::::new( page_iterator, column_desc, arrow_type, - )?)), - PhysicalType::DOUBLE => Ok(Box::new(PrimitiveArrayReader::::new( + )?) as _, + PhysicalType::DOUBLE => Box::new(PrimitiveArrayReader::::new( page_iterator, column_desc, arrow_type, - )?)), + )?) as _, PhysicalType::BYTE_ARRAY => match arrow_type { Some(DataType::Dictionary(_, _)) => { - make_byte_array_dictionary_reader(page_iterator, column_desc, arrow_type) + make_byte_array_dictionary_reader(page_iterator, column_desc, arrow_type)? } - _ => make_byte_array_reader(page_iterator, column_desc, arrow_type), + _ => make_byte_array_reader(page_iterator, column_desc, arrow_type)?, }, PhysicalType::FIXED_LEN_BYTE_ARRAY => { - make_fixed_len_byte_array_reader(page_iterator, column_desc, arrow_type) + make_fixed_len_byte_array_reader(page_iterator, column_desc, arrow_type)? } - } + }; + Ok(Some(reader)) } fn build_struct_reader( field: &ParquetField, + mask: &ProjectionMask, row_groups: &dyn RowGroupCollection, -) -> Result> { +) -> Result>> { + let arrow_fields = match &field.arrow_type { + DataType::Struct(children) => children, + _ => unreachable!(), + }; let children = field.children().unwrap(); - let children_reader = children - .iter() - .map(|child| build_reader(child, row_groups)) - .collect::>>()?; + assert_eq!(arrow_fields.len(), children.len()); + + let mut readers = Vec::with_capacity(children.len()); + let mut projected_fields = Vec::with_capacity(children.len()); + + for (arrow, parquet) in arrow_fields.iter().zip(children) { + if let Some(reader) = build_reader(parquet, mask, row_groups)? 
{ + let child_type = reader.get_data_type().clone(); + projected_fields.push(arrow.clone().with_data_type(child_type)); + readers.push(reader); + } + } + + if readers.is_empty() { + return Ok(None); + } - Ok(Box::new(StructArrayReader::new( - field.arrow_type.clone(), - children_reader, + Ok(Some(Box::new(StructArrayReader::new( + DataType::Struct(projected_fields), + readers, field.def_level, field.rep_level, field.nullable, - )) as _) + )))) } #[cfg(test)] mod tests { use super::*; - use crate::arrow::parquet_to_arrow_schema; + use crate::arrow::schema::parquet_to_array_schema_and_fields; use crate::file::reader::{FileReader, SerializedFileReader}; use crate::util::test_common::file_util::get_test_file; use arrow::datatypes::Field; @@ -238,14 +310,15 @@ mod tests { let file_metadata = file_reader.metadata().file_metadata(); let mask = ProjectionMask::leaves(file_metadata.schema_descr(), [0]); - let arrow_schema = parquet_to_arrow_schema( + let (_, fields) = parquet_to_array_schema_and_fields( file_metadata.schema_descr(), + ProjectionMask::all(), file_metadata.key_value_metadata(), ) .unwrap(); let array_reader = - build_array_reader(Arc::new(arrow_schema), mask, &file_reader).unwrap(); + build_array_reader(fields.as_ref(), &mask, &file_reader).unwrap(); // Create arrow types let arrow_type = DataType::Struct(vec![Field::new( diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index d2fa94611906..f0b5092e1ad4 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -251,6 +251,7 @@ mod tests { use crate::arrow::array_reader::build_array_reader; use crate::arrow::array_reader::list_array::ListArrayReader; use crate::arrow::array_reader::test_util::InMemoryArrayReader; + use crate::arrow::schema::parquet_to_array_schema_and_fields; use crate::arrow::{parquet_to_arrow_schema, ArrowWriter, ProjectionMask}; use crate::file::properties::WriterProperties; use crate::file::reader::{FileReader, SerializedFileReader}; @@ -389,21 +390,10 @@ mod tests { true, ); - let l2 = ListArrayReader::::new( - Box::new(l3), - l2_type, - 3, - 2, - false, - ); + let l2 = ListArrayReader::::new(Box::new(l3), l2_type, 3, 2, false); - let mut l1 = ListArrayReader::::new( - Box::new(l2), - l1_type, - 2, - 1, - true, - ); + let mut l1 = + ListArrayReader::::new(Box::new(l2), l1_type, 2, 1, true); let expected_1 = expected.slice(0, 2); let expected_2 = expected.slice(2, 2); @@ -573,18 +563,17 @@ mod tests { Arc::new(SerializedFileReader::new(file).unwrap()); let file_metadata = file_reader.metadata().file_metadata(); - let arrow_schema = parquet_to_arrow_schema( - file_metadata.schema_descr(), + let schema = file_metadata.schema_descr(); + let mask = ProjectionMask::leaves(schema, vec![0]); + let (_, fields) = parquet_to_array_schema_and_fields( + schema, + ProjectionMask::all(), file_metadata.key_value_metadata(), ) .unwrap(); - let schema = file_metadata.schema_descr_ptr(); - let mask = ProjectionMask::leaves(&schema, vec![0]); - let mut array_reader = - build_array_reader(Arc::new(arrow_schema), mask, &file_reader) - .unwrap(); + build_array_reader(fields.as_ref(), &mask, &file_reader).unwrap(); let batch = array_reader.next_batch(100).unwrap(); assert_eq!(batch.data_type(), array_reader.get_data_type()); diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 59abf9ad8dc5..5ee963916da9 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ 
b/parquet/src/arrow/arrow_reader/mod.rs @@ -30,8 +30,8 @@ use arrow::{array::StructArray, error::ArrowError}; use crate::arrow::array_reader::{ build_array_reader, ArrayReader, FileReaderRowGroupCollection, RowGroupCollection, }; -use crate::arrow::schema::parquet_to_arrow_schema; -use crate::arrow::schema::parquet_to_arrow_schema_by_columns; +use crate::arrow::schema::{parquet_to_array_schema_and_fields, parquet_to_arrow_schema}; +use crate::arrow::schema::{parquet_to_arrow_schema_by_columns, ParquetField}; use crate::arrow::ProjectionMask; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{KeyValue, ParquetMetaData}; @@ -60,6 +60,8 @@ pub struct ArrowReaderBuilder { pub(crate) schema: SchemaRef, + pub(crate) fields: Option, + pub(crate) batch_size: usize, pub(crate) row_groups: Option>, @@ -82,15 +84,17 @@ impl ArrowReaderBuilder { false => metadata.file_metadata().key_value_metadata(), }; - let schema = Arc::new(parquet_to_arrow_schema( + let (schema, fields) = parquet_to_array_schema_and_fields( metadata.file_metadata().schema_descr(), + ProjectionMask::all(), kv_metadata, - )?); + )?; Ok(Self { input, metadata, - schema, + schema: Arc::new(schema), + fields, batch_size: 1024, row_groups: None, projection: ProjectionMask::all(), @@ -283,8 +287,16 @@ impl ArrowReader for ParquetFileArrowReader { mask: ProjectionMask, batch_size: usize, ) -> Result { - let array_reader = - build_array_reader(Arc::new(self.get_schema()?), mask, &self.file_reader)?; + let (_, field) = parquet_to_array_schema_and_fields( + self.parquet_schema(), + mask, + self.get_kv_metadata(), + )?; + let array_reader = build_array_reader( + field.as_ref(), + &ProjectionMask::all(), + &self.file_reader, + )?; // Try to avoid allocate large buffer let batch_size = self.file_reader.num_rows().min(batch_size); @@ -420,9 +432,11 @@ impl ArrowReaderBuilder> { break; } - let projection = predicate.projection().clone(); - let array_reader = - build_array_reader(Arc::clone(&self.schema), projection, &reader)?; + let array_reader = build_array_reader( + self.fields.as_ref(), + predicate.projection(), + &reader, + )?; selection = Some(evaluate_predicate( batch_size, @@ -433,7 +447,8 @@ impl ArrowReaderBuilder> { } } - let array_reader = build_array_reader(self.schema, self.projection, &reader)?; + let array_reader = + build_array_reader(self.fields.as_ref(), &self.projection, &reader)?; // If selection is empty, truncate if !selects_any(selection.as_ref()) { @@ -2313,4 +2328,66 @@ mod tests { assert_ne!(1024, num_rows); assert_eq!(reader.batch_size, num_rows as usize); } + + #[test] + fn test_raw_repetition() { + const MESSAGE_TYPE: &str = " + message Log { + OPTIONAL INT32 eventType; + REPEATED INT32 category; + REPEATED group filter { + OPTIONAL INT32 error; + } + } + "; + let schema = Arc::new(parse_message_type(MESSAGE_TYPE).unwrap()); + let props = Arc::new(WriterProperties::builder().build()); + + let mut buf = Vec::with_capacity(1024); + let mut writer = SerializedFileWriter::new(&mut buf, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + // column 0 + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer + .typed::() + .write_batch(&[1], Some(&[1]), None) + .unwrap(); + col_writer.close().unwrap(); + // column 1 + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer + .typed::() + .write_batch(&[1, 1], Some(&[1, 1]), Some(&[0, 1])) + .unwrap(); + col_writer.close().unwrap(); + // column 2 + let mut 
col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer + .typed::() + .write_batch(&[1], Some(&[1]), Some(&[0])) + .unwrap(); + col_writer.close().unwrap(); + + let rg_md = row_group_writer.close().unwrap(); + assert_eq!(rg_md.num_rows(), 1); + writer.close().unwrap(); + + let bytes = Bytes::from(buf); + + let mut no_mask = ParquetRecordBatchReader::try_new(bytes.clone(), 1024).unwrap(); + let full = no_mask.next().unwrap().unwrap(); + + assert_eq!(full.num_columns(), 3); + + for idx in 0..3 { + let b = ParquetRecordBatchReaderBuilder::try_new(bytes.clone()).unwrap(); + let mask = ProjectionMask::leaves(b.parquet_schema(), [idx]); + let mut reader = b.with_projection(mask).build().unwrap(); + let projected = reader.next().unwrap().unwrap(); + + assert_eq!(projected.num_columns(), 1); + assert_eq!(full.column(idx), projected.column(0)); + } + } } diff --git a/parquet/src/arrow/async_reader.rs b/parquet/src/arrow/async_reader.rs index d444d20d52cc..b6b5d7ff7de6 100644 --- a/parquet/src/arrow/async_reader.rs +++ b/parquet/src/arrow/async_reader.rs @@ -101,6 +101,7 @@ use crate::arrow::arrow_reader::{ evaluate_predicate, selects_any, ArrowReaderBuilder, ArrowReaderOptions, ParquetRecordBatchReader, RowFilter, RowSelection, }; +use crate::arrow::schema::ParquetField; use crate::arrow::ProjectionMask; use crate::column::page::{PageIterator, PageReader}; @@ -337,7 +338,7 @@ impl ArrowReaderBuilder> { input: self.input.0, filter: self.filter, metadata: self.metadata.clone(), - schema: self.schema.clone(), + fields: self.fields, }; Ok(ParquetRecordBatchStream { @@ -360,7 +361,7 @@ type ReadResult = Result<(ReaderFactory, Option) struct ReaderFactory { metadata: Arc, - schema: SchemaRef, + fields: Option, input: T, @@ -397,13 +398,13 @@ where return Ok((self, None)); } - let predicate_projection = predicate.projection().clone(); + let predicate_projection = predicate.projection(); row_group - .fetch(&mut self.input, &predicate_projection, selection.as_ref()) + .fetch(&mut self.input, predicate_projection, selection.as_ref()) .await?; let array_reader = build_array_reader( - self.schema.clone(), + self.fields.as_ref(), predicate_projection, &row_group, )?; @@ -427,7 +428,7 @@ where let reader = ParquetRecordBatchReader::new( batch_size, - build_array_reader(self.schema.clone(), projection, &row_group)?, + build_array_reader(self.fields.as_ref(), &projection, &row_group)?, selection, ); @@ -792,7 +793,8 @@ mod tests { use crate::arrow::arrow_reader::{ ArrowPredicateFn, ParquetRecordBatchReaderBuilder, RowSelector, }; - use crate::arrow::{parquet_to_arrow_schema, ArrowWriter}; + use crate::arrow::schema::parquet_to_array_schema_and_fields; + use crate::arrow::ArrowWriter; use crate::file::footer::parse_metadata; use crate::file::page_index::index_reader; use arrow::array::{Array, ArrayRef, Int32Array, StringArray}; @@ -1278,10 +1280,12 @@ mod tests { }; let requests = async_reader.requests.clone(); - let schema = Arc::new( - parquet_to_arrow_schema(metadata.file_metadata().schema_descr(), None) - .expect("building arrow schema"), - ); + let (_, fields) = parquet_to_array_schema_and_fields( + metadata.file_metadata().schema_descr(), + ProjectionMask::all(), + None, + ) + .unwrap(); let _schema_desc = metadata.file_metadata().schema_descr(); @@ -1290,7 +1294,7 @@ mod tests { let reader_factory = ReaderFactory { metadata, - schema, + fields, input: async_reader, filter: None, }; diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs index ad5b6b1f5f80..7803385e7f01 
100644 --- a/parquet/src/arrow/schema.rs +++ b/parquet/src/arrow/schema.rs @@ -41,7 +41,7 @@ mod complex; mod primitive; use crate::arrow::ProjectionMask; -pub(crate) use complex::{convert_schema, ParquetField, ParquetFieldType}; +pub(crate) use complex::{ParquetField, ParquetFieldType}; /// Convert Parquet schema to Arrow schema including optional metadata. /// Attempts to decode any existing Arrow schema metadata, falling back @@ -64,6 +64,15 @@ pub fn parquet_to_arrow_schema_by_columns( mask: ProjectionMask, key_value_metadata: Option<&Vec>, ) -> Result { + Ok(parquet_to_array_schema_and_fields(parquet_schema, mask, key_value_metadata)?.0) +} + +/// Extracts the arrow metadata +pub(crate) fn parquet_to_array_schema_and_fields( + parquet_schema: &SchemaDescriptor, + mask: ProjectionMask, + key_value_metadata: Option<&Vec>, +) -> Result<(Schema, Option)> { let mut metadata = parse_key_value_metadata(key_value_metadata).unwrap_or_default(); let maybe_schema = metadata .remove(super::ARROW_SCHEMA_META_KEY) @@ -77,12 +86,15 @@ pub fn parquet_to_arrow_schema_by_columns( }); } - match convert_schema(parquet_schema, mask, maybe_schema.as_ref())? { - Some(field) => match field.arrow_type { - DataType::Struct(fields) => Ok(Schema::new_with_metadata(fields, metadata)), + match complex::convert_schema(parquet_schema, mask, maybe_schema.as_ref())? { + Some(field) => match &field.arrow_type { + DataType::Struct(fields) => Ok(( + Schema::new_with_metadata(fields.clone(), metadata), + Some(field), + )), _ => unreachable!(), }, - None => Ok(Schema::new_with_metadata(vec![], metadata)), + None => Ok((Schema::new_with_metadata(vec![], metadata), None)), } } diff --git a/sample.parquet b/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..093b6438a491946c7cbe623e4516f562becbffa0 GIT binary patch literal 686 zcmah{v1$TA5S`oGlZ#jw$t<^U1PNrZFk%w4N|P>yg@{E88wnc`f@ju?D5UU<{Dgdf zm88lC-INsMl~SYNyG8-y)VCo``pj6;Mm|15OzaIxRju6k_HP zdd5@qzkZ4@!p%KA1M$YJLnS%vn&%fCUzkyGtj(w&4T@LSm1?C5r}pMG?M?j#G;U(T literal 0 HcmV?d00001 From 6bee57671744bf8b334a4d2d5a1196bf20af52b0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 24 Sep 2022 19:46:10 +0100 Subject: [PATCH 0076/1411] Remove sample.parquet (#2776) --- sample.parquet | Bin 686 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 sample.parquet diff --git a/sample.parquet b/sample.parquet deleted file mode 100644 index 093b6438a491946c7cbe623e4516f562becbffa0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 686 zcmah{v1$TA5S`oGlZ#jw$t<^U1PNrZFk%w4N|P>yg@{E88wnc`f@ju?D5UU<{Dgdf zm88lC-INsMl~SYNyG8-y)VCo``pj6;Mm|15OzaIxRju6k_HP zdd5@qzkZ4@!p%KA1M$YJLnS%vn&%fCUzkyGtj(w&4T@LSm1?C5r}pMG?M?j#G;U(T From 06c204c1b5367999ad848b7de3d587316fc923ff Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 26 Sep 2022 13:52:10 +0100 Subject: [PATCH 0077/1411] Split out `arrow-array` crate (#2769) * Split out arrow-array * Fix ffi compilation * Fix data_gen * Fix doc * Doc tweaks * Fix pyarrow --- Cargo.toml | 1 + arrow-array/Cargo.toml | 59 ++ .../src/array/binary_array.rs | 36 +- .../src/array/boolean_array.rs | 33 +- .../src/array/decimal_array.rs | 68 +- .../src/array/dictionary_array.rs | 206 +++-- .../src/array/fixed_size_binary_array.rs | 46 +- .../src/array/fixed_size_list_array.rs | 29 +- .../src/array/list_array.rs | 58 +- .../src/array/map_array.rs | 33 +- .../array.rs => 
arrow-array/src/array/mod.rs | 100 ++- .../src/array/null_array.rs | 17 +- .../src/array/primitive_array.rs | 303 ++++--- .../src/array/string_array.rs | 40 +- .../src/array/struct_array.rs | 41 +- .../src/array/union_array.rs | 44 +- .../src}/builder/boolean_buffer_builder.rs | 10 +- .../src}/builder/boolean_builder.rs | 67 +- .../src}/builder/buffer_builder.rs | 115 ++- .../src}/builder/decimal_builder.rs | 43 +- .../src}/builder/fixed_size_binary_builder.rs | 20 +- .../src}/builder/fixed_size_list_builder.rs | 20 +- .../src}/builder/generic_binary_builder.rs | 10 +- .../src}/builder/generic_list_builder.rs | 21 +- .../src}/builder/generic_string_builder.rs | 8 +- .../src}/builder/map_builder.rs | 25 +- .../array => arrow-array/src}/builder/mod.rs | 54 +- .../src}/builder/null_buffer_builder.rs | 5 +- .../src}/builder/primitive_builder.rs | 43 +- .../builder/primitive_dictionary_builder.rs | 27 +- .../src}/builder/string_dictionary_builder.rs | 35 +- .../src}/builder/struct_builder.rs | 24 +- .../src}/builder/union_builder.rs | 41 +- arrow-array/src/cast.rs | 767 ++++++++++++++++++ .../src/util => arrow-array/src}/decimal.rs | 32 +- .../datatypes => arrow-array/src}/delta.rs | 0 .../src/array => arrow-array/src}/iterator.rs | 42 +- arrow-array/src/lib.rs | 209 +++++ .../array => arrow-array/src}/raw_pointer.rs | 4 +- {arrow => arrow-array}/src/record_batch.rs | 110 +-- .../src/temporal_conversions.rs | 76 +- .../util => arrow-array/src}/trusted_len.rs | 6 +- .../datatypes => arrow-array/src}/types.rs | 9 +- arrow-pyarrow-integration-testing/src/lib.rs | 6 +- arrow-schema/src/schema.rs | 3 + arrow/Cargo.toml | 1 + arrow/src/array/cast.rs | 761 ----------------- arrow/src/array/mod.rs | 617 +------------- arrow/src/compute/kernels/cast.rs | 5 +- arrow/src/compute/kernels/filter.rs | 8 +- arrow/src/compute/kernels/temporal.rs | 2 +- arrow/src/csv/reader.rs | 7 +- arrow/src/datatypes/mod.rs | 13 +- arrow/src/datatypes/native.rs | 23 +- arrow/src/json/reader.rs | 7 +- arrow/src/lib.rs | 30 +- arrow/src/pyarrow.rs | 13 - arrow/src/util/data_gen.rs | 5 +- arrow/src/util/mod.rs | 5 +- 59 files changed, 2119 insertions(+), 2324 deletions(-) create mode 100644 arrow-array/Cargo.toml rename arrow/src/array/array_binary.rs => arrow-array/src/array/binary_array.rs (97%) rename arrow/src/array/array_boolean.rs => arrow-array/src/array/boolean_array.rs (94%) rename arrow/src/array/array_decimal.rs => arrow-array/src/array/decimal_array.rs (95%) rename arrow/src/array/array_dictionary.rs => arrow-array/src/array/dictionary_array.rs (81%) rename arrow/src/array/array_fixed_size_binary.rs => arrow-array/src/array/fixed_size_binary_array.rs (95%) rename arrow/src/array/array_fixed_size_list.rs => arrow-array/src/array/fixed_size_list_array.rs (95%) rename arrow/src/array/array_list.rs => arrow-array/src/array/list_array.rs (96%) rename arrow/src/array/array_map.rs => arrow-array/src/array/map_array.rs (96%) rename arrow/src/array/array.rs => arrow-array/src/array/mod.rs (94%) rename arrow/src/array/null.rs => arrow-array/src/array/null_array.rs (93%) rename arrow/src/array/array_primitive.rs => arrow-array/src/array/primitive_array.rs (81%) rename arrow/src/array/array_string.rs => arrow-array/src/array/string_array.rs (97%) rename arrow/src/array/array_struct.rs => arrow-array/src/array/struct_array.rs (95%) rename arrow/src/array/array_union.rs => arrow-array/src/array/union_array.rs (97%) rename {arrow/src/array => arrow-array/src}/builder/boolean_buffer_builder.rs (98%) rename {arrow/src/array => 
arrow-array/src}/builder/boolean_builder.rs (85%) rename {arrow/src/array => arrow-array/src}/builder/buffer_builder.rs (75%) rename {arrow/src/array => arrow-array/src}/builder/decimal_builder.rs (94%) rename {arrow/src/array => arrow-array/src}/builder/fixed_size_binary_builder.rs (94%) rename {arrow/src/array => arrow-array/src}/builder/fixed_size_list_builder.rs (95%) rename {arrow/src/array => arrow-array/src}/builder/generic_binary_builder.rs (97%) rename {arrow/src/array => arrow-array/src}/builder/generic_list_builder.rs (96%) rename {arrow/src/array => arrow-array/src}/builder/generic_string_builder.rs (96%) rename {arrow/src/array => arrow-array/src}/builder/map_builder.rs (93%) rename {arrow/src/array => arrow-array/src}/builder/mod.rs (73%) rename {arrow/src/array => arrow-array/src}/builder/null_buffer_builder.rs (98%) rename {arrow/src/array => arrow-array/src}/builder/primitive_builder.rs (84%) rename {arrow/src/array => arrow-array/src}/builder/primitive_dictionary_builder.rs (92%) rename {arrow/src/array => arrow-array/src}/builder/string_dictionary_builder.rs (94%) rename {arrow/src/array => arrow-array/src}/builder/struct_builder.rs (97%) rename {arrow/src/array => arrow-array/src}/builder/union_builder.rs (92%) create mode 100644 arrow-array/src/cast.rs rename {arrow/src/util => arrow-array/src}/decimal.rs (95%) rename {arrow/src/datatypes => arrow-array/src}/delta.rs (100%) rename {arrow/src/array => arrow-array/src}/iterator.rs (88%) create mode 100644 arrow-array/src/lib.rs rename {arrow/src/array => arrow-array/src}/raw_pointer.rs (95%) rename {arrow => arrow-array}/src/record_batch.rs (91%) rename {arrow => arrow-array}/src/temporal_conversions.rs (71%) rename {arrow/src/util => arrow-array/src}/trusted_len.rs (96%) rename {arrow/src/datatypes => arrow-array/src}/types.rs (98%) delete mode 100644 arrow/src/array/cast.rs diff --git a/Cargo.toml b/Cargo.toml index 270d23f26c94..28517265b3c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ [workspace] members = [ "arrow", + "arrow-array", "arrow-data", "arrow-schema", "arrow-buffer", diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml new file mode 100644 index 000000000000..8e66bf3b763c --- /dev/null +++ b/arrow-array/Cargo.toml @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
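For downstream code the practical effect of the split is a narrower dependency surface: array construction and access only need the new crate (together with the `arrow-buffer`, `arrow-schema` and `arrow-data` crates it builds on), while the `arrow` facade keeps re-exporting everything under the old paths. A minimal sketch, assuming a direct `arrow-array = "23.0.0"` dependency:

```rust
use arrow_array::{Array, Int32Array};

fn main() {
    // The same array type that used to be reached via `arrow::array::Int32Array`.
    let a = Int32Array::from(vec![Some(1), None, Some(3)]);
    assert_eq!(a.len(), 3);
    assert_eq!(a.null_count(), 1);
}
```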
+ +[package] +name = "arrow-array" +version = "23.0.0" +description = "Array abstractions for Apache Arrow" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_array" +path = "src/lib.rs" +bench = false + + +[target.'cfg(target_arch = "wasm32")'.dependencies] +ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } + +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } + +[dependencies] +arrow-buffer = { version = "23.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "23.0.0", path = "../arrow-schema" } +arrow-data = { version = "23.0.0", path = "../arrow-data" } +chrono = { version = "0.4", default-features = false, features = ["clock"] } +num = { version = "0.4", default-features = false, features = ["std"] } +half = { version = "2.0", default-features = false } +hashbrown = { version = "0.12", default-features = false } + +[dev-dependencies] +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } + +[build-dependencies] diff --git a/arrow/src/array/array_binary.rs b/arrow-array/src/array/binary_array.rs similarity index 97% rename from arrow/src/array/array_binary.rs rename to arrow-array/src/array/binary_array.rs index 1c63e8e24b29..cb168daf0720 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow-array/src/array/binary_array.rs @@ -15,18 +15,13 @@ // specific language governing permissions and limitations // under the License. -use std::convert::From; -use std::fmt; -use std::{any::Any, iter::FromIterator}; - -use super::{ - array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, GenericBinaryIter, - GenericListArray, OffsetSizeTrait, -}; -use crate::array::array::ArrayAccessor; -use crate::buffer::Buffer; -use crate::util::bit_util; -use crate::{buffer::MutableBuffer, datatypes::DataType}; +use crate::iterator::GenericBinaryIter; +use crate::raw_pointer::RawPtrBox; +use crate::{print_long_array, Array, ArrayAccessor, GenericListArray, OffsetSizeTrait}; +use arrow_buffer::{bit_util, Buffer, MutableBuffer}; +use arrow_data::ArrayData; +use arrow_schema::DataType; +use std::any::Any; /// See [`BinaryArray`] and [`LargeBinaryArray`] for storing /// binary data. @@ -239,13 +234,13 @@ impl GenericBinaryArray { } } -impl fmt::Debug for GenericBinaryArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl std::fmt::Debug for GenericBinaryArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let prefix = OffsetSize::PREFIX; write!(f, "{}BinaryArray\n[\n", prefix)?; print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) + std::fmt::Debug::fmt(&array.value(index), f) })?; write!(f, "]") } @@ -387,7 +382,7 @@ impl<'a, T: OffsetSizeTrait> IntoIterator for &'a GenericBinaryArray { /// Create a BinaryArray from a vector of byte slices. /// /// ``` -/// use arrow::array::{Array, BinaryArray}; +/// use arrow_array::{Array, BinaryArray}; /// let values: Vec<&[u8]> = /// vec![b"one", b"two", b"", b"three"]; /// let array = BinaryArray::from_vec(values); @@ -401,7 +396,7 @@ impl<'a, T: OffsetSizeTrait> IntoIterator for &'a GenericBinaryArray { /// Create a BinaryArray from a vector of Optional (null) byte slices. 
/// /// ``` -/// use arrow::array::{Array, BinaryArray}; +/// use arrow_array::{Array, BinaryArray}; /// let values: Vec> = /// vec![Some(b"one"), Some(b"two"), None, Some(b""), Some(b"three")]; /// let array = BinaryArray::from_opt_vec(values); @@ -427,7 +422,7 @@ pub type BinaryArray = GenericBinaryArray; /// Create a LargeBinaryArray from a vector of byte slices. /// /// ``` -/// use arrow::array::{Array, LargeBinaryArray}; +/// use arrow_array::{Array, LargeBinaryArray}; /// let values: Vec<&[u8]> = /// vec![b"one", b"two", b"", b"three"]; /// let array = LargeBinaryArray::from_vec(values); @@ -441,7 +436,7 @@ pub type BinaryArray = GenericBinaryArray; /// Create a LargeBinaryArray from a vector of Optional (null) byte slices. /// /// ``` -/// use arrow::array::{Array, LargeBinaryArray}; +/// use arrow_array::{Array, LargeBinaryArray}; /// let values: Vec> = /// vec![Some(b"one"), Some(b"two"), None, Some(b""), Some(b"three")]; /// let array = LargeBinaryArray::from_opt_vec(values); @@ -462,7 +457,8 @@ pub type LargeBinaryArray = GenericBinaryArray; #[cfg(test)] mod tests { use super::*; - use crate::{array::ListArray, datatypes::Field}; + use crate::ListArray; + use arrow_schema::Field; #[test] fn test_binary_array() { diff --git a/arrow/src/array/array_boolean.rs b/arrow-array/src/array/boolean_array.rs similarity index 94% rename from arrow/src/array/array_boolean.rs rename to arrow-array/src/array/boolean_array.rs index 7ea18ea62036..24be122c933c 100644 --- a/arrow/src/array/array_boolean.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -15,23 +15,21 @@ // specific language governing permissions and limitations // under the License. -use crate::array::array::ArrayAccessor; -use std::borrow::Borrow; -use std::convert::From; -use std::iter::{FromIterator, IntoIterator}; -use std::{any::Any, fmt}; - -use super::*; -use super::{array::print_long_array, raw_pointer::RawPtrBox}; -use crate::buffer::{Buffer, MutableBuffer}; -use crate::util::bit_util; +use crate::builder::BooleanBuilder; +use crate::iterator::BooleanIter; +use crate::raw_pointer::RawPtrBox; +use crate::{print_long_array, Array, ArrayAccessor}; +use arrow_buffer::{bit_util, Buffer, MutableBuffer}; +use arrow_data::ArrayData; +use arrow_schema::DataType; +use std::any::Any; /// Array of bools /// /// # Example /// /// ``` -/// use arrow::array::{Array, BooleanArray}; +/// use arrow_array::{Array, BooleanArray}; /// let arr = BooleanArray::from(vec![Some(false), Some(true), None, Some(true)]); /// assert_eq!(4, arr.len()); /// assert_eq!(1, arr.null_count()); @@ -50,7 +48,7 @@ use crate::util::bit_util; /// /// Using `from_iter` /// ``` -/// use arrow::array::{Array, BooleanArray}; +/// use arrow_array::{Array, BooleanArray}; /// let v = vec![Some(false), Some(true), Some(false), Some(true)]; /// let arr = v.into_iter().collect::(); /// assert_eq!(4, arr.len()); @@ -72,11 +70,11 @@ pub struct BooleanArray { raw_values: RawPtrBox, } -impl fmt::Debug for BooleanArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl std::fmt::Debug for BooleanArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "BooleanArray\n[\n")?; print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) + std::fmt::Debug::fmt(&array.value(index), f) })?; write!(f, "]") } @@ -238,7 +236,7 @@ impl<'a> BooleanArray { } } -impl>> FromIterator for BooleanArray { +impl>> FromIterator for BooleanArray { fn from_iter>(iter: I) -> Self { let iter = iter.into_iter(); let (_, data_len) = 
iter.size_hint(); @@ -279,9 +277,6 @@ impl>> FromIterator for BooleanArray { mod tests { use super::*; - use crate::buffer::Buffer; - use crate::datatypes::DataType; - #[test] fn test_boolean_fmt_debug() { let arr = BooleanArray::from(vec![true, false, false]); diff --git a/arrow/src/array/array_decimal.rs b/arrow-array/src/array/decimal_array.rs similarity index 95% rename from arrow/src/array/array_decimal.rs rename to arrow-array/src/array/decimal_array.rs index f6a2dda2da5a..34b424092e4b 100644 --- a/arrow/src/array/array_decimal.rs +++ b/arrow-array/src/array/decimal_array.rs @@ -15,25 +15,22 @@ // specific language governing permissions and limitations // under the License. -use crate::array::ArrayAccessor; -use std::convert::From; -use std::fmt; -use std::marker::PhantomData; -use std::{any::Any, iter::FromIterator}; - -use super::{ - array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, FixedSizeListArray, +use crate::builder::BooleanBufferBuilder; +use crate::decimal::{Decimal, Decimal256}; +use crate::iterator::DecimalIter; +use crate::raw_pointer::RawPtrBox; +use crate::types::{Decimal128Type, Decimal256Type, DecimalType, NativeDecimalType}; +use crate::{ + print_long_array, Array, ArrayAccessor, FixedSizeBinaryArray, FixedSizeListArray, }; -use super::{BooleanBufferBuilder, DecimalIter, FixedSizeBinaryArray}; -#[allow(deprecated)] -use crate::buffer::{Buffer, MutableBuffer}; -use crate::datatypes::validate_decimal_precision; -use crate::datatypes::{ - validate_decimal256_precision_with_lt_bytes, DataType, Decimal128Type, - Decimal256Type, DecimalType, NativeDecimalType, +use arrow_buffer::{Buffer, MutableBuffer}; +use arrow_data::decimal::{ + validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, }; -use crate::error::{ArrowError, Result}; -use crate::util::decimal::{Decimal, Decimal256}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; +use std::any::Any; +use std::marker::PhantomData; /// `Decimal128Array` stores fixed width decimal numbers, /// with a fixed precision and scale. @@ -41,8 +38,8 @@ use crate::util::decimal::{Decimal, Decimal256}; /// # Examples /// /// ``` -/// use arrow::array::{Array, DecimalArray, Decimal128Array}; -/// use arrow::datatypes::DataType; +/// use arrow_array::{Array, DecimalArray, Decimal128Array}; +/// use arrow_schema::DataType; /// /// // Create a DecimalArray with the default precision and scale /// let decimal_array: Decimal128Array = vec![ @@ -268,7 +265,11 @@ impl DecimalArray { /// 1. `precision` is larger than [`Self::MAX_PRECISION`] /// 2. `scale` is larger than [`Self::MAX_SCALE`]; /// 3. 
`scale` is > `precision` - pub fn with_precision_and_scale(self, precision: u8, scale: u8) -> Result + pub fn with_precision_and_scale( + self, + precision: u8, + scale: u8, + ) -> Result where Self: Sized, { @@ -292,7 +293,11 @@ impl DecimalArray { } // validate that the new precision and scale are valid or not - fn validate_precision_scale(&self, precision: u8, scale: u8) -> Result<()> { + fn validate_precision_scale( + &self, + precision: u8, + scale: u8, + ) -> Result<(), ArrowError> { if precision > Self::MAX_PRECISION { return Err(ArrowError::InvalidArgumentError(format!( "precision {} is greater than max {}", @@ -320,7 +325,7 @@ impl DecimalArray { } // validate all the data in the array are valid within the new precision or not - fn validate_data(&self, precision: u8) -> Result<()> { + fn validate_data(&self, precision: u8) -> Result<(), ArrowError> { // TODO: Move into DecimalType match Self::VALUE_LENGTH { 16 => self @@ -361,7 +366,7 @@ impl Decimal128Array { // Validates decimal128 values in this array can be properly interpreted // with the specified precision. - fn validate_decimal_precision(&self, precision: u8) -> Result<()> { + fn validate_decimal_precision(&self, precision: u8) -> Result<(), ArrowError> { (0..self.len()).try_for_each(|idx| { if self.is_valid(idx) { let decimal = unsafe { self.value_unchecked(idx) }; @@ -376,7 +381,7 @@ impl Decimal128Array { impl Decimal256Array { // Validates decimal256 values in this array can be properly interpreted // with the specified precision. - fn validate_decimal_precision(&self, precision: u8) -> Result<()> { + fn validate_decimal_precision(&self, precision: u8) -> Result<(), ArrowError> { (0..self.len()).try_for_each(|idx| { if self.is_valid(idx) { let raw_val = unsafe { @@ -504,8 +509,8 @@ impl From> for ArrayData { } } -impl fmt::Debug for DecimalArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl std::fmt::Debug for DecimalArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!( f, "Decimal{}Array<{}, {}>\n[\n", @@ -552,13 +557,12 @@ impl<'a, T: DecimalType> DecimalArray { #[cfg(test)] mod tests { - use crate::array::Decimal256Builder; - use crate::datatypes::{DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE}; - use crate::util::decimal::Decimal128; - use crate::{array::Decimal128Builder, datatypes::Field}; - use num::{BigInt, Num}; - use super::*; + use crate::builder::{Decimal128Builder, Decimal256Builder}; + use crate::decimal::Decimal128; + use arrow_data::decimal::{DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE}; + use arrow_schema::Field; + use num::{BigInt, Num}; #[test] fn test_decimal_array() { diff --git a/arrow/src/array/array_dictionary.rs b/arrow-array/src/array/dictionary_array.rs similarity index 81% rename from arrow/src/array/array_dictionary.rs rename to arrow-array/src/array/dictionary_array.rs index acdb427a22ab..35d243fde9ae 100644 --- a/arrow/src/array/array_dictionary.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -15,20 +15,130 @@ // specific language governing permissions and limitations // under the License. 
-use crate::array::{ArrayAccessor, ArrayIter}; +use crate::builder::StringDictionaryBuilder; +use crate::iterator::ArrayIter; +use crate::types::*; +use crate::{ + make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, PrimitiveArray, + StringArray, +}; +use arrow_buffer::ArrowNativeType; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; use std::any::Any; -use std::fmt; -use std::iter::IntoIterator; -use std::{convert::From, iter::FromIterator}; -use super::{ - make_array, Array, ArrayData, ArrayRef, PrimitiveArray, StringArray, - StringDictionaryBuilder, -}; -use crate::datatypes::{ - ArrowDictionaryKeyType, ArrowNativeType, ArrowPrimitiveType, DataType, -}; -use crate::error::Result; +/// +/// A dictionary array where each element is a single value indexed by an integer key. +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, Int8DictionaryArray, Int8Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: Int8DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type Int8DictionaryArray = DictionaryArray; +/// +/// A dictionary array where each element is a single value indexed by an integer key. +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, Int16DictionaryArray, Int16Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: Int16DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.keys(), &Int16Array::from(vec![0, 0, 1, 2])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type Int16DictionaryArray = DictionaryArray; +/// +/// A dictionary array where each element is a single value indexed by an integer key. +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, Int32DictionaryArray, Int32Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: Int32DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.keys(), &Int32Array::from(vec![0, 0, 1, 2])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type Int32DictionaryArray = DictionaryArray; +/// +/// A dictionary array where each element is a single value indexed by an integer key. +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, Int64DictionaryArray, Int64Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: Int64DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.keys(), &Int64Array::from(vec![0, 0, 1, 2])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type Int64DictionaryArray = DictionaryArray; +/// +/// A dictionary array where each element is a single value indexed by an integer key. 
+/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, UInt8DictionaryArray, UInt8Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: UInt8DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.keys(), &UInt8Array::from(vec![0, 0, 1, 2])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type UInt8DictionaryArray = DictionaryArray; +/// +/// A dictionary array where each element is a single value indexed by an integer key. +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, UInt16DictionaryArray, UInt16Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: UInt16DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.keys(), &UInt16Array::from(vec![0, 0, 1, 2])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type UInt16DictionaryArray = DictionaryArray; +/// +/// A dictionary array where each element is a single value indexed by an integer key. +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, UInt32DictionaryArray, UInt32Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: UInt32DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.keys(), &UInt32Array::from(vec![0, 0, 1, 2])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type UInt32DictionaryArray = DictionaryArray; +/// +/// A dictionary array where each element is a single value indexed by an integer key. +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, UInt64DictionaryArray, UInt64Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: UInt64DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.keys(), &UInt64Array::from(vec![0, 0, 1, 2])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type UInt64DictionaryArray = DictionaryArray; /// A dictionary array where each element is a single value indexed by an integer key. 
/// This is mostly used to represent strings or a limited set of primitive types as integers, @@ -65,8 +175,7 @@ use crate::error::Result; /// Example **with nullable** data: /// /// ``` -/// use arrow::array::{DictionaryArray, Int8Array}; -/// use arrow::datatypes::Int8Type; +/// use arrow_array::{DictionaryArray, Int8Array, types::Int8Type}; /// let test = vec!["a", "a", "b", "c"]; /// let array : DictionaryArray = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect(); /// assert_eq!(array.keys(), &Int8Array::from(vec![Some(0), Some(0), None, Some(1)])); @@ -75,8 +184,7 @@ use crate::error::Result; /// Example **without nullable** data: /// /// ``` -/// use arrow::array::{DictionaryArray, Int8Array}; -/// use arrow::datatypes::Int8Type; +/// use arrow_array::{DictionaryArray, Int8Array, types::Int8Type}; /// let test = vec!["a", "a", "b", "c"]; /// let array : DictionaryArray = test.into_iter().collect(); /// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2])); @@ -85,8 +193,7 @@ use crate::error::Result; /// Example from existing arrays: /// /// ``` -/// use arrow::array::{DictionaryArray, Int8Array, StringArray}; -/// use arrow::datatypes::Int8Type; +/// use arrow_array::{DictionaryArray, Int8Array, StringArray, types::Int8Type}; /// // You can form your own DictionaryArray by providing the /// // values (dictionary) and keys (indexes into the dictionary): /// let values = StringArray::from_iter_values(["a", "b", "c"]); @@ -120,7 +227,10 @@ impl DictionaryArray { /// (indexes into the dictionary) and values (dictionary) /// array. Returns an error if there are any keys that are outside /// of the dictionary array. - pub fn try_new(keys: &PrimitiveArray, values: &dyn Array) -> Result { + pub fn try_new( + keys: &PrimitiveArray, + values: &dyn Array, + ) -> Result { let dict_data_type = DataType::Dictionary( Box::new(keys.data_type().clone()), Box::new(values.data_type().clone()), @@ -152,28 +262,6 @@ impl DictionaryArray { Ok(array.into()) } - /// Create a new DictionaryArray directly from specified keys - /// (indexes into the dictionary) and values (dictionary) - /// array, and the corresponding ArrayData. This is used internally - /// for the usage like filter kernel. - /// - /// # Safety - /// - /// The input keys, values and data must form a valid DictionaryArray, - /// or undefined behavior can occur. - pub(crate) unsafe fn try_new_unchecked( - keys: PrimitiveArray, - values: ArrayRef, - data: ArrayData, - ) -> Self { - Self { - data, - keys, - values, - is_ordered: false, - } - } - /// Return an array view of the keys of this dictionary as a PrimitiveArray. 
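Since the keys and values accessors below are the building blocks for everything else, a short round-trip sketch using the new `arrow_array` paths (editor's addition; the manual downcast stands in for whatever typed access a caller prefers):

```rust
use arrow_array::types::Int32Type;
use arrow_array::{Array, DictionaryArray, Int32Array, StringArray};

fn main() {
    // "b" repeats, so it is stored once in `values` and referenced twice from `keys`.
    let dict: DictionaryArray<Int32Type> = vec!["a", "b", "b", "c"].into_iter().collect();

    assert_eq!(dict.keys(), &Int32Array::from(vec![0, 1, 1, 2]));

    // Resolve row 2 by hand: key -> index into the values array.
    let key = dict.keys().value(2) as usize;
    let values = dict.values().as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(values.value(key), "b");
}
```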
pub fn keys(&self) -> &PrimitiveArray { &self.keys @@ -239,8 +327,7 @@ impl DictionaryArray { /// Downcast this dictionary to a [`TypedDictionaryArray`] /// /// ``` - /// use arrow::array::{Array, ArrayAccessor, DictionaryArray, StringArray}; - /// use arrow::datatypes::Int32Type; + /// use arrow_array::{Array, ArrayAccessor, DictionaryArray, StringArray, types::Int32Type}; /// /// let orig = [Some("a"), Some("b"), None]; /// let dictionary = DictionaryArray::::from_iter(orig); @@ -312,8 +399,7 @@ impl From> for ArrayData { /// /// # Example: /// ``` -/// use arrow::array::{DictionaryArray, PrimitiveArray, StringArray}; -/// use arrow::datatypes::Int8Type; +/// use arrow_array::{DictionaryArray, PrimitiveArray, StringArray, types::Int8Type}; /// /// let test = vec!["a", "a", "b", "c"]; /// let array: DictionaryArray = test @@ -351,8 +437,7 @@ impl<'a, T: ArrowDictionaryKeyType> FromIterator> for Dictionary /// # Example: /// /// ``` -/// use arrow::array::{DictionaryArray, PrimitiveArray, StringArray}; -/// use arrow::datatypes::Int8Type; +/// use arrow_array::{DictionaryArray, PrimitiveArray, StringArray, types::Int8Type}; /// /// let test = vec!["a", "a", "b", "c"]; /// let array: DictionaryArray = test.into_iter().collect(); @@ -390,8 +475,8 @@ impl Array for DictionaryArray { } } -impl fmt::Debug for DictionaryArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl std::fmt::Debug for DictionaryArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { writeln!( f, "DictionaryArray {{keys: {:?} values: {:?}}}", @@ -404,8 +489,7 @@ impl fmt::Debug for DictionaryArray { /// allowing fast access to its elements /// /// ``` -/// use arrow::array::{ArrayIter, DictionaryArray, StringArray}; -/// use arrow::datatypes::Int32Type; +/// use arrow_array::{DictionaryArray, StringArray, types::Int32Type}; /// /// let orig = ["a", "b", "a", "b"]; /// let dictionary = DictionaryArray::::from_iter(orig); @@ -436,8 +520,8 @@ impl<'a, K: ArrowPrimitiveType, V> Clone for TypedDictionaryArray<'a, K, V> { impl<'a, K: ArrowPrimitiveType, V> Copy for TypedDictionaryArray<'a, K, V> {} -impl<'a, K: ArrowPrimitiveType, V> fmt::Debug for TypedDictionaryArray<'a, K, V> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl<'a, K: ArrowPrimitiveType, V> std::fmt::Debug for TypedDictionaryArray<'a, K, V> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { writeln!(f, "TypedDictionaryArray({:?})", self.dictionary) } } @@ -516,18 +600,12 @@ where #[cfg(test)] mod tests { use super::*; - - use crate::array::{Float32Array, Int8Array}; - use crate::datatypes::{Float32Type, Int16Type}; - use crate::{ - array::Int16DictionaryArray, array::PrimitiveDictionaryBuilder, - datatypes::DataType, - }; - use crate::{ - array::{Int16Array, Int32Array}, - datatypes::{Int32Type, Int8Type, UInt32Type, UInt8Type}, + use crate::builder::PrimitiveDictionaryBuilder; + use crate::types::{ + Float32Type, Int16Type, Int32Type, Int8Type, UInt32Type, UInt8Type, }; - use crate::{buffer::Buffer, datatypes::ToByteSlice}; + use crate::{Float32Array, Int16Array, Int32Array, Int8Array}; + use arrow_buffer::{Buffer, ToByteSlice}; #[test] fn test_dictionary_array() { diff --git a/arrow/src/array/array_fixed_size_binary.rs b/arrow-array/src/array/fixed_size_binary_array.rs similarity index 95% rename from arrow/src/array/array_fixed_size_binary.rs rename to arrow-array/src/array/fixed_size_binary_array.rs index 22eac1435a8d..f37d1e3e5c38 100644 --- a/arrow/src/array/array_fixed_size_binary.rs 
+++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -15,18 +15,13 @@ // specific language governing permissions and limitations // under the License. +use crate::iterator::FixedSizeBinaryIter; +use crate::raw_pointer::RawPtrBox; +use crate::{print_long_array, Array, ArrayAccessor, FixedSizeListArray}; +use arrow_buffer::{bit_util, Buffer, MutableBuffer}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; use std::any::Any; -use std::convert::From; -use std::fmt; - -use super::{ - array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, FixedSizeListArray, -}; -use crate::array::{ArrayAccessor, FixedSizeBinaryIter}; -use crate::buffer::Buffer; -use crate::error::{ArrowError, Result}; -use crate::util::bit_util; -use crate::{buffer::MutableBuffer, datatypes::DataType}; /// An array where each element is a fixed-size sequence of bytes. /// @@ -35,7 +30,7 @@ use crate::{buffer::MutableBuffer, datatypes::DataType}; /// Create an array from an iterable argument of byte slices. /// /// ``` -/// use arrow::array::{Array, FixedSizeBinaryArray}; +/// use arrow_array::{Array, FixedSizeBinaryArray}; /// let input_arg = vec![ vec![1, 2], vec![3, 4], vec![5, 6] ]; /// let arr = FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap(); /// @@ -45,7 +40,7 @@ use crate::{buffer::MutableBuffer, datatypes::DataType}; /// Create an array from an iterable argument of sparse byte slices. /// Sparsity means that the input argument can contain `None` items. /// ``` -/// use arrow::array::{Array, FixedSizeBinaryArray}; +/// use arrow_array::{Array, FixedSizeBinaryArray}; /// let input_arg = vec![ None, Some(vec![7, 8]), Some(vec![9, 10]), None, Some(vec![13, 14]) ]; /// let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); /// assert_eq!(5, arr.len()) @@ -119,7 +114,7 @@ impl FixedSizeBinaryArray { /// # Examples /// /// ``` - /// use arrow::array::FixedSizeBinaryArray; + /// use arrow_array::FixedSizeBinaryArray; /// let input_arg = vec![ /// None, /// Some(vec![7, 8]), @@ -134,7 +129,7 @@ impl FixedSizeBinaryArray { /// # Errors /// /// Returns error if argument has length zero, or sizes of nested slices don't match. - pub fn try_from_sparse_iter(mut iter: T) -> Result + pub fn try_from_sparse_iter(mut iter: T) -> Result where T: Iterator>, U: AsRef<[u8]>, @@ -145,7 +140,7 @@ impl FixedSizeBinaryArray { let mut null_buf = MutableBuffer::from_len_zeroed(0); let mut buffer = MutableBuffer::from_len_zeroed(0); let mut prepend = 0; - iter.try_for_each(|item| -> Result<()> { + iter.try_for_each(|item| -> Result<(), ArrowError> { // extend null bitmask by one byte per each 8 items if byte == 0 { null_buf.push(0u8); @@ -206,7 +201,7 @@ impl FixedSizeBinaryArray { /// # Examples /// /// ``` - /// use arrow::array::FixedSizeBinaryArray; + /// use arrow_array::FixedSizeBinaryArray; /// let input_arg = vec![ /// vec![1, 2], /// vec![3, 4], @@ -218,7 +213,7 @@ impl FixedSizeBinaryArray { /// # Errors /// /// Returns error if argument has length zero, or sizes of nested slices don't match. 
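Both documented error conditions of these fallible constructors can be exercised directly; a short illustrative sketch, assuming the `arrow_array` import paths used elsewhere in this change:

```
use arrow_array::FixedSizeBinaryArray;

// Nested slices of different lengths are rejected.
let uneven = vec![vec![1u8, 2], vec![3u8, 4, 5]];
assert!(FixedSizeBinaryArray::try_from_iter(uneven.into_iter()).is_err());

// A zero-length iterator is rejected because the element width cannot be inferred.
let empty: Vec<Vec<u8>> = vec![];
assert!(FixedSizeBinaryArray::try_from_iter(empty.into_iter()).is_err());
```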
- pub fn try_from_iter(mut iter: T) -> Result + pub fn try_from_iter(mut iter: T) -> Result where T: Iterator, U: AsRef<[u8]>, @@ -226,7 +221,7 @@ impl FixedSizeBinaryArray { let mut len = 0; let mut size = None; let mut buffer = MutableBuffer::from_len_zeroed(0); - iter.try_for_each(|item| -> Result<()> { + iter.try_for_each(|item| -> Result<(), ArrowError> { let slice = item.as_ref(); if let Some(size) = size { if size != slice.len() { @@ -348,11 +343,11 @@ impl From> for FixedSizeBinaryArray { } } -impl fmt::Debug for FixedSizeBinaryArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl std::fmt::Debug for FixedSizeBinaryArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "FixedSizeBinaryArray<{}>\n[\n", self.value_length())?; print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) + std::fmt::Debug::fmt(&array.value(index), f) })?; write!(f, "]") } @@ -395,13 +390,10 @@ impl<'a> IntoIterator for &'a FixedSizeBinaryArray { #[cfg(test)] mod tests { + use crate::RecordBatch; + use arrow_schema::{Field, Schema}; use std::sync::Arc; - use crate::{ - datatypes::{Field, Schema}, - record_batch::RecordBatch, - }; - use super::*; #[test] diff --git a/arrow/src/array/array_fixed_size_list.rs b/arrow-array/src/array/fixed_size_list_array.rs similarity index 95% rename from arrow/src/array/array_fixed_size_list.rs rename to arrow-array/src/array/fixed_size_list_array.rs index fc568d54a831..a10c1d28dab4 100644 --- a/arrow/src/array/array_fixed_size_list.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -15,12 +15,10 @@ // specific language governing permissions and limitations // under the License. +use crate::{make_array, print_long_array, Array, ArrayAccessor, ArrayRef}; +use arrow_data::ArrayData; +use arrow_schema::DataType; use std::any::Any; -use std::fmt; - -use super::{array::print_long_array, make_array, Array, ArrayData, ArrayRef}; -use crate::array::array::ArrayAccessor; -use crate::datatypes::DataType; /// A list array where each element is a fixed-size sequence of values with the same /// type whose maximum length is represented by a i32. 
@@ -28,9 +26,10 @@ use crate::datatypes::DataType; /// # Example /// /// ``` -/// # use arrow::array::{Array, ArrayData, FixedSizeListArray, Int32Array}; -/// # use arrow::datatypes::{DataType, Field}; -/// # use arrow::buffer::Buffer; +/// # use arrow_array::{Array, FixedSizeListArray, Int32Array}; +/// # use arrow_data::ArrayData; +/// # use arrow_schema::{DataType, Field}; +/// # use arrow_buffer::Buffer; /// // Construct a value array /// let value_data = ArrayData::builder(DataType::Int32) /// .len(9) @@ -174,11 +173,11 @@ impl ArrayAccessor for FixedSizeListArray { } } -impl fmt::Debug for FixedSizeListArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl std::fmt::Debug for FixedSizeListArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "FixedSizeListArray<{}>\n[\n", self.value_length())?; print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) + std::fmt::Debug::fmt(&array.value(index), f) })?; write!(f, "]") } @@ -186,12 +185,10 @@ impl fmt::Debug for FixedSizeListArray { #[cfg(test)] mod tests { - use crate::{ - array::ArrayData, array::Int32Array, buffer::Buffer, datatypes::Field, - util::bit_util, - }; - use super::*; + use crate::Int32Array; + use arrow_buffer::{bit_util, Buffer}; + use arrow_schema::Field; #[test] fn test_fixed_size_list_array() { diff --git a/arrow/src/array/array_list.rs b/arrow-array/src/array/list_array.rs similarity index 96% rename from arrow/src/array/array_list.rs rename to arrow-array/src/array/list_array.rs index e830acdc2b92..83b0c6d5bd46 100644 --- a/arrow/src/array/array_list.rs +++ b/arrow-array/src/array/list_array.rs @@ -15,21 +15,17 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; -use std::fmt; - -use num::Integer; - -use super::{ - array::print_long_array, make_array, raw_pointer::RawPtrBox, Array, ArrayData, - ArrayRef, BooleanBufferBuilder, GenericListArrayIter, PrimitiveArray, -}; -use crate::array::array::ArrayAccessor; +use crate::array::make_array; use crate::{ - buffer::MutableBuffer, - datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType, Field}, - error::ArrowError, + builder::BooleanBufferBuilder, iterator::GenericListArrayIter, print_long_array, + raw_pointer::RawPtrBox, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, + PrimitiveArray, }; +use arrow_buffer::{ArrowNativeType, MutableBuffer}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, Field}; +use num::Integer; +use std::any::Any; /// trait declaring an offset size, relevant for i32 vs i64 array types. 
pub trait OffsetSizeTrait: ArrowNativeType + std::ops::AddAssign + Integer { @@ -137,8 +133,9 @@ impl GenericListArray { /// Creates a [`GenericListArray`] from an iterator of primitive values /// # Example /// ``` - /// # use arrow::array::ListArray; - /// # use arrow::datatypes::Int32Type; + /// # use arrow_array::ListArray; + /// # use arrow_array::types::Int32Type; + /// /// let data = vec![ /// Some(vec![Some(0), Some(1), Some(2)]), /// None, @@ -281,13 +278,13 @@ impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListArray fmt::Debug for GenericListArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl std::fmt::Debug for GenericListArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let prefix = OffsetSize::PREFIX; write!(f, "{}ListArray\n[\n", prefix)?; print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) + std::fmt::Debug::fmt(&array.value(index), f) })?; write!(f, "]") } @@ -299,8 +296,8 @@ impl fmt::Debug for GenericListArray { /// # Example /// /// ``` -/// # use arrow::array::{Array, ListArray, Int32Array}; -/// # use arrow::datatypes::{DataType, Int32Type}; +/// # use arrow_array::{Array, ListArray, Int32Array, types::Int32Type}; +/// # use arrow_schema::DataType; /// let data = vec![ /// Some(vec![]), /// None, @@ -326,8 +323,8 @@ pub type ListArray = GenericListArray; /// # Example /// /// ``` -/// # use arrow::array::{Array, LargeListArray, Int32Array}; -/// # use arrow::datatypes::{DataType, Int32Type}; +/// # use arrow_array::{Array, LargeListArray, Int32Array, types::Int32Type}; +/// # use arrow_schema::DataType; /// let data = vec![ /// Some(vec![]), /// None, @@ -350,17 +347,10 @@ pub type LargeListArray = GenericListArray; #[cfg(test)] mod tests { - use crate::{ - alloc, - array::ArrayData, - array::Int32Array, - buffer::Buffer, - datatypes::Field, - datatypes::{Int32Type, ToByteSlice}, - util::bit_util, - }; - use super::*; + use crate::types::Int32Type; + use crate::Int32Array; + use arrow_buffer::{bit_util, Buffer, ToByteSlice}; fn create_from_buffers() -> ListArray { // Construct a value array @@ -844,7 +834,7 @@ mod tests { #[test] #[should_panic(expected = "memory is not aligned")] fn test_primitive_array_alignment() { - let ptr = alloc::allocate_aligned(8); + let ptr = arrow_buffer::alloc::allocate_aligned(8); let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; let buf2 = buf.slice(1); let array_data = ArrayData::builder(DataType::Int32) @@ -860,7 +850,7 @@ mod tests { // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] fn test_list_array_alignment() { - let ptr = alloc::allocate_aligned(8); + let ptr = arrow_buffer::alloc::allocate_aligned(8); let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; let buf2 = buf.slice(1); diff --git a/arrow/src/array/array_map.rs b/arrow-array/src/array/map_array.rs similarity index 96% rename from arrow/src/array/array_map.rs rename to arrow-array/src/array/map_array.rs index 471d56c9c604..bfe8d407274c 100644 --- a/arrow/src/array/array_map.rs +++ b/arrow-array/src/array/map_array.rs @@ -15,20 +15,14 @@ // specific language governing permissions and limitations // under the License. 
-use crate::array::{StringArray, StructArray}; -use crate::buffer::Buffer; +use crate::raw_pointer::RawPtrBox; +use crate::{make_array, print_long_array, Array, ArrayRef, StringArray, StructArray}; +use arrow_buffer::{ArrowNativeType, Buffer, ToByteSlice}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; -use std::fmt; -use std::mem; use std::sync::Arc; -use super::make_array; -use super::{ - array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, ArrayRef, -}; -use crate::datatypes::{ArrowNativeType, DataType, Field, ToByteSlice}; -use crate::error::ArrowError; - /// A nested array type where each record is a key-value map. /// Keys should always be non-null, but values can be null. /// @@ -221,15 +215,15 @@ impl Array for MapArray { /// Returns the total number of bytes of memory occupied physically by this [MapArray]. fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) + self.data.get_array_memory_size() + std::mem::size_of_val(self) } } -impl fmt::Debug for MapArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl std::fmt::Debug for MapArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "MapArray\n[\n")?; print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) + std::fmt::Debug::fmt(&array.value(index), f) })?; write!(f, "]") } @@ -237,16 +231,9 @@ impl fmt::Debug for MapArray { #[cfg(test)] mod tests { + use crate::{Int32Array, UInt32Array}; use std::sync::Arc; - use crate::{ - array::ArrayData, - array::{Int32Array, StructArray, UInt32Array}, - buffer::Buffer, - datatypes::Field, - datatypes::ToByteSlice, - }; - use super::*; fn create_from_buffers() -> MapArray { diff --git a/arrow/src/array/array.rs b/arrow-array/src/array/mod.rs similarity index 94% rename from arrow/src/array/array.rs rename to arrow-array/src/array/mod.rs index 2c2969c925d5..e2ea61549125 100644 --- a/arrow/src/array/array.rs +++ b/arrow-array/src/array/mod.rs @@ -15,42 +15,79 @@ // specific language governing permissions and limitations // under the License. +//! The concrete array definitions + +mod binary_array; + +use crate::types::*; +use arrow_buffer::{Buffer, MutableBuffer, ToByteSlice}; +use arrow_data::ArrayData; +use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use std::any::Any; -use std::convert::From; -use std::fmt; use std::sync::Arc; -use super::*; -use crate::buffer::{Buffer, MutableBuffer}; +pub use binary_array::*; + +mod boolean_array; +pub use boolean_array::*; + +mod decimal_array; +pub use decimal_array::*; + +mod dictionary_array; +pub use dictionary_array::*; + +mod fixed_size_binary_array; +pub use fixed_size_binary_array::*; + +mod fixed_size_list_array; +pub use fixed_size_list_array::*; + +mod list_array; +pub use list_array::*; + +mod map_array; +pub use map_array::*; + +mod null_array; +pub use null_array::*; + +mod primitive_array; +pub use primitive_array::*; + +mod string_array; +pub use string_array::*; + +mod struct_array; +pub use struct_array::*; + +mod union_array; +pub use union_array::*; /// Trait for dealing with different types of array at runtime when the type of the /// array is not known in advance. -pub trait Array: fmt::Debug + Send + Sync { +pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the array as [`Any`](std::any::Any) so that it can be /// downcasted to a specific implementation. 
/// /// # Example: /// /// ``` - /// use std::sync::Arc; - /// use arrow::array::Int32Array; - /// use arrow::datatypes::{Schema, Field, DataType}; - /// use arrow::record_batch::RecordBatch; + /// # use std::sync::Arc; + /// # use arrow_array::{Int32Array, RecordBatch}; + /// # use arrow_schema::{Schema, Field, DataType, ArrowError}; /// - /// # fn main() -> arrow::error::Result<()> { /// let id = Int32Array::from(vec![1, 2, 3, 4, 5]); /// let batch = RecordBatch::try_new( /// Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])), /// vec![Arc::new(id)] - /// )?; + /// ).unwrap(); /// /// let int32array = batch /// .column(0) /// .as_any() /// .downcast_ref::() /// .expect("Failed to downcast"); - /// # Ok(()) - /// # } /// ``` fn as_any(&self) -> &dyn Any; @@ -65,13 +102,13 @@ pub trait Array: fmt::Debug + Send + Sync { self.data() } - /// Returns a reference to the [`DataType`](crate::datatypes::DataType) of this array. + /// Returns a reference to the [`DataType`](arrow_schema::DataType) of this array. /// /// # Example: /// /// ``` - /// use arrow::datatypes::DataType; - /// use arrow::array::{Array, Int32Array}; + /// use arrow_schema::DataType; + /// use arrow_array::{Array, Int32Array}; /// /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); /// @@ -86,7 +123,7 @@ pub trait Array: fmt::Debug + Send + Sync { /// # Example: /// /// ``` - /// use arrow::array::{Array, Int32Array}; + /// use arrow_array::{Array, Int32Array}; /// /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); /// // Make slice over the values [2, 3, 4] @@ -103,7 +140,7 @@ pub trait Array: fmt::Debug + Send + Sync { /// # Example: /// /// ``` - /// use arrow::array::{Array, Int32Array}; + /// use arrow_array::{Array, Int32Array}; /// /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); /// @@ -118,7 +155,7 @@ pub trait Array: fmt::Debug + Send + Sync { /// # Example: /// /// ``` - /// use arrow::array::{Array, Int32Array}; + /// use arrow_array::{Array, Int32Array}; /// /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); /// @@ -135,7 +172,7 @@ pub trait Array: fmt::Debug + Send + Sync { /// # Example: /// /// ``` - /// use arrow::array::{Array, Int32Array}; + /// use arrow_array::{Array, Int32Array}; /// /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); /// // Make slice over the values [2, 3, 4] @@ -154,7 +191,7 @@ pub trait Array: fmt::Debug + Send + Sync { /// # Example: /// /// ``` - /// use arrow::array::{Array, Int32Array}; + /// use arrow_array::{Array, Int32Array}; /// /// let array = Int32Array::from(vec![Some(1), None]); /// @@ -171,7 +208,7 @@ pub trait Array: fmt::Debug + Send + Sync { /// # Example: /// /// ``` - /// use arrow::array::{Array, Int32Array}; + /// use arrow_array::{Array, Int32Array}; /// /// let array = Int32Array::from(vec![Some(1), None]); /// @@ -187,7 +224,7 @@ pub trait Array: fmt::Debug + Send + Sync { /// # Example: /// /// ``` - /// use arrow::array::{Array, Int32Array}; + /// use arrow_array::{Array, Int32Array}; /// /// // Construct an array with values [1, NULL, NULL] /// let array = Int32Array::from(vec![Some(1), None, None]); @@ -558,8 +595,8 @@ pub fn make_array(data: ArrayData) -> ArrayRef { /// /// ``` /// use std::sync::Arc; -/// use arrow::datatypes::DataType; -/// use arrow::array::{ArrayRef, Int32Array, new_empty_array}; +/// use arrow_schema::DataType; +/// use arrow_array::{ArrayRef, Int32Array, new_empty_array}; /// /// let empty_array = new_empty_array(&DataType::Int32); /// let array: ArrayRef = Arc::new(Int32Array::from(vec![] as 
Vec)); @@ -576,8 +613,8 @@ pub fn new_empty_array(data_type: &DataType) -> ArrayRef { /// /// ``` /// use std::sync::Arc; -/// use arrow::datatypes::DataType; -/// use arrow::array::{ArrayRef, Int32Array, new_null_array}; +/// use arrow_schema::DataType; +/// use arrow_array::{ArrayRef, Int32Array, new_null_array}; /// /// let null_array = new_null_array(&DataType::Int32, 3); /// let array: ArrayRef = Arc::new(Int32Array::from(vec![None, None, None])); @@ -788,14 +825,14 @@ fn new_null_sized_decimal( } // Helper function for printing potentially long arrays. -pub(super) fn print_long_array( +pub(crate) fn print_long_array( array: &A, - f: &mut fmt::Formatter, + f: &mut std::fmt::Formatter, print_item: F, -) -> fmt::Result +) -> std::fmt::Result where A: Array, - F: Fn(&A, usize, &mut fmt::Formatter) -> fmt::Result, + F: Fn(&A, usize, &mut std::fmt::Formatter) -> std::fmt::Result, { let head = std::cmp::min(10, array.len()); @@ -831,6 +868,7 @@ where #[cfg(test)] mod tests { use super::*; + use arrow_schema::Field; #[test] fn test_empty_primitive() { diff --git a/arrow/src/array/null.rs b/arrow-array/src/array/null_array.rs similarity index 93% rename from arrow/src/array/null.rs rename to arrow-array/src/array/null_array.rs index 467121f6ccfa..d796324f663f 100644 --- a/arrow/src/array/null.rs +++ b/arrow-array/src/array/null_array.rs @@ -17,11 +17,10 @@ //! Contains the `NullArray` type. +use crate::Array; +use arrow_data::ArrayData; +use arrow_schema::DataType; use std::any::Any; -use std::fmt; - -use crate::array::{Array, ArrayData}; -use crate::datatypes::*; /// An Array where all elements are nulls /// @@ -30,16 +29,12 @@ use crate::datatypes::*; /// # Example: Create an array /// /// ``` -/// use arrow::array::{Array, NullArray}; +/// use arrow_array::{Array, NullArray}; /// -/// # fn main() -> arrow::error::Result<()> { /// let array = NullArray::new(10); /// /// assert_eq!(array.len(), 10); /// assert_eq!(array.null_count(), 10); -/// -/// # Ok(()) -/// # } /// ``` pub struct NullArray { data: ArrayData, @@ -116,8 +111,8 @@ impl From for ArrayData { } } -impl fmt::Debug for NullArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl std::fmt::Debug for NullArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "NullArray({})", self.len()) } } diff --git a/arrow/src/array/array_primitive.rs b/arrow-array/src/array/primitive_array.rs similarity index 81% rename from arrow/src/array/array_primitive.rs rename to arrow-array/src/array/primitive_array.rs index 57168b7b9e60..f9e4e7675da2 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -15,34 +15,194 @@ // specific language governing permissions and limitations // under the License. 
-use std::any::Any; -use std::convert::From; -use std::fmt; -use std::iter::{FromIterator, IntoIterator}; -use std::mem; - -use chrono::{prelude::*, Duration}; - -use super::array::print_long_array; -use super::raw_pointer::RawPtrBox; -use super::*; -use crate::temporal_conversions; -use crate::util::bit_util; -use crate::{ - buffer::{Buffer, MutableBuffer}, - util::trusted_len_unzip, -}; - -use crate::array::array::ArrayAccessor; +use crate::builder::{BooleanBufferBuilder, PrimitiveBuilder}; +use crate::iterator::PrimitiveIter; +use crate::raw_pointer::RawPtrBox; +use crate::temporal_conversions::{as_date, as_datetime, as_duration, as_time}; +use crate::trusted_len::trusted_len_unzip; +use crate::types::*; +use crate::{print_long_array, Array, ArrayAccessor}; +use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; +use arrow_data::ArrayData; +use arrow_schema::DataType; +use chrono::{Duration, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime}; use half::f16; +use std::any::Any; + +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::Int8Array; +/// let arr : Int8Array = [Some(1), Some(2)].into_iter().collect(); +/// ``` +pub type Int8Array = PrimitiveArray; +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::Int16Array; +/// let arr : Int16Array = [Some(1), Some(2)].into_iter().collect(); +/// ``` +pub type Int16Array = PrimitiveArray; +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::Int32Array; +/// let arr : Int32Array = [Some(1), Some(2)].into_iter().collect(); +/// ``` +pub type Int32Array = PrimitiveArray; +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::Int64Array; +/// let arr : Int64Array = [Some(1), Some(2)].into_iter().collect(); +/// ``` +pub type Int64Array = PrimitiveArray; +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::UInt8Array; +/// let arr : UInt8Array = [Some(1), Some(2)].into_iter().collect(); +/// ``` +pub type UInt8Array = PrimitiveArray; +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::UInt16Array; +/// let arr : UInt16Array = [Some(1), Some(2)].into_iter().collect(); +/// ``` +pub type UInt16Array = PrimitiveArray; +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::UInt32Array; +/// let arr : UInt32Array = [Some(1), Some(2)].into_iter().collect(); +/// ``` +pub type UInt32Array = PrimitiveArray; +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::UInt64Array; +/// let arr : UInt64Array = [Some(1), Some(2)].into_iter().collect(); +/// ``` +pub type UInt64Array = PrimitiveArray; +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::Float16Array; +/// use half::f16; +/// let arr : Float16Array = [Some(f16::from_f64(1.0)), Some(f16::from_f64(2.0))].into_iter().collect(); +/// ``` +pub type Float16Array = PrimitiveArray; +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::Float32Array; +/// let arr : Float32Array = [Some(1.0), Some(2.0)].into_iter().collect(); +/// ``` +pub type Float32Array = PrimitiveArray; +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::Float64Array; +/// let arr : Float64Array = [Some(1.0), Some(2.0)].into_iter().collect(); +/// ``` +pub type Float64Array = PrimitiveArray; + +/// +/// A primitive array where each element is of type [TimestampSecondType]. +/// See also [`Timestamp`](arrow_schema::DataType::Timestamp). 
+/// +/// # Example: UTC timestamps post epoch +/// ``` +/// # use arrow_array::TimestampSecondArray; +/// use chrono::FixedOffset; +/// // Corresponds to single element array with entry 1970-05-09T14:25:11+0:00 +/// let arr = TimestampSecondArray::from_vec(vec![11111111], None); +/// // OR +/// let arr = TimestampSecondArray::from_opt_vec(vec![Some(11111111)], None); +/// let utc_offset = FixedOffset::east(0); +/// +/// assert_eq!(arr.value_as_datetime_with_tz(0, utc_offset).map(|v| v.to_string()).unwrap(), "1970-05-09 14:25:11") +/// ``` +/// +/// # Example: UTC timestamps pre epoch +/// ``` +/// # use arrow_array::TimestampSecondArray; +/// use chrono::FixedOffset; +/// // Corresponds to single element array with entry 1969-08-25T09:34:49+0:00 +/// let arr = TimestampSecondArray::from_vec(vec![-11111111], None); +/// // OR +/// let arr = TimestampSecondArray::from_opt_vec(vec![Some(-11111111)], None); +/// let utc_offset = FixedOffset::east(0); +/// +/// assert_eq!(arr.value_as_datetime_with_tz(0, utc_offset).map(|v| v.to_string()).unwrap(), "1969-08-25 09:34:49") +/// ``` +/// +/// # Example: With timezone specified +/// ``` +/// # use arrow_array::TimestampSecondArray; +/// use chrono::FixedOffset; +/// // Corresponds to single element array with entry 1970-05-10T00:25:11+10:00 +/// let arr = TimestampSecondArray::from_vec(vec![11111111], Some("+10:00".to_string())); +/// // OR +/// let arr = TimestampSecondArray::from_opt_vec(vec![Some(11111111)], Some("+10:00".to_string())); +/// let sydney_offset = FixedOffset::east(10 * 60 * 60); +/// +/// assert_eq!(arr.value_as_datetime_with_tz(0, sydney_offset).map(|v| v.to_string()).unwrap(), "1970-05-10 00:25:11") +/// ``` +/// +pub type TimestampSecondArray = PrimitiveArray; +/// A primitive array where each element is of type `TimestampMillisecondType.` +/// See examples for [`TimestampSecondArray.`](crate::array::TimestampSecondArray) +pub type TimestampMillisecondArray = PrimitiveArray; +/// A primitive array where each element is of type `TimestampMicrosecondType.` +/// See examples for [`TimestampSecondArray.`](crate::array::TimestampSecondArray) +pub type TimestampMicrosecondArray = PrimitiveArray; +/// A primitive array where each element is of type `TimestampNanosecondType.` +/// See examples for [`TimestampSecondArray.`](crate::array::TimestampSecondArray) +pub type TimestampNanosecondArray = PrimitiveArray; +pub type Date32Array = PrimitiveArray; +pub type Date64Array = PrimitiveArray; +pub type Time32SecondArray = PrimitiveArray; +pub type Time32MillisecondArray = PrimitiveArray; +pub type Time64MicrosecondArray = PrimitiveArray; +pub type Time64NanosecondArray = PrimitiveArray; +pub type IntervalYearMonthArray = PrimitiveArray; +pub type IntervalDayTimeArray = PrimitiveArray; +pub type IntervalMonthDayNanoArray = PrimitiveArray; +pub type DurationSecondArray = PrimitiveArray; +pub type DurationMillisecondArray = PrimitiveArray; +pub type DurationMicrosecondArray = PrimitiveArray; +pub type DurationNanosecondArray = PrimitiveArray; + +/// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the +/// static-typed nature of rust types ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. +pub trait ArrowPrimitiveType: 'static { + /// Corresponding Rust native type for the primitive type. + type Native: ArrowNativeType; + + /// the corresponding Arrow data type of this primitive type. + const DATA_TYPE: DataType; + + /// Returns the byte width of this primitive type. 
+ fn get_byte_width() -> usize { + std::mem::size_of::() + } + + /// Returns a default value of this primitive type. + /// + /// This is useful for aggregate array ops like `sum()`, `mean()`. + fn default_value() -> Self::Native { + Default::default() + } +} /// Array whose elements are of primitive types. /// /// # Example: From an iterator of values /// /// ``` -/// use arrow::array::{Array, PrimitiveArray}; -/// use arrow::datatypes::Int32Type; +/// use arrow_array::{Array, PrimitiveArray, types::Int32Type}; /// let arr: PrimitiveArray = PrimitiveArray::from_iter_values((0..10).map(|x| x + 1)); /// assert_eq!(10, arr.len()); /// assert_eq!(0, arr.null_count()); @@ -125,7 +285,7 @@ impl PrimitiveArray { let data = unsafe { ArrayData::new_unchecked( T::DATA_TYPE, - val_buf.len() / mem::size_of::<::Native>(), + val_buf.len() / std::mem::size_of::<::Native>(), None, None, 0, @@ -143,7 +303,7 @@ impl PrimitiveArray { let data = unsafe { ArrayData::new_unchecked( T::DATA_TYPE, - val_buf.len() / mem::size_of::<::Native>(), + val_buf.len() / std::mem::size_of::<::Native>(), None, None, 0, @@ -206,79 +366,9 @@ impl<'a, T: ArrowPrimitiveType> ArrayAccessor for &'a PrimitiveArray { } } -pub(crate) fn as_datetime(v: i64) -> Option { - match T::DATA_TYPE { - DataType::Date32 => Some(temporal_conversions::date32_to_datetime(v as i32)), - DataType::Date64 => Some(temporal_conversions::date64_to_datetime(v)), - DataType::Time32(_) | DataType::Time64(_) => None, - DataType::Timestamp(unit, _) => match unit { - TimeUnit::Second => Some(temporal_conversions::timestamp_s_to_datetime(v)), - TimeUnit::Millisecond => { - Some(temporal_conversions::timestamp_ms_to_datetime(v)) - } - TimeUnit::Microsecond => { - Some(temporal_conversions::timestamp_us_to_datetime(v)) - } - TimeUnit::Nanosecond => { - Some(temporal_conversions::timestamp_ns_to_datetime(v)) - } - }, - // interval is not yet fully documented [ARROW-3097] - DataType::Interval(_) => None, - _ => None, - } -} - -fn as_date(v: i64) -> Option { - as_datetime::(v).map(|datetime| datetime.date()) -} - -pub(crate) fn as_time(v: i64) -> Option { - match T::DATA_TYPE { - DataType::Time32(unit) => { - // safe to immediately cast to u32 as `self.value(i)` is positive i32 - let v = v as u32; - match unit { - TimeUnit::Second => Some(temporal_conversions::time32s_to_time(v as i32)), - TimeUnit::Millisecond => { - Some(temporal_conversions::time32ms_to_time(v as i32)) - } - _ => None, - } - } - DataType::Time64(unit) => match unit { - TimeUnit::Microsecond => Some(temporal_conversions::time64us_to_time(v)), - TimeUnit::Nanosecond => Some(temporal_conversions::time64ns_to_time(v)), - _ => None, - }, - DataType::Timestamp(_, _) => as_datetime::(v).map(|datetime| datetime.time()), - DataType::Date32 | DataType::Date64 => Some(NaiveTime::from_hms(0, 0, 0)), - DataType::Interval(_) => None, - _ => None, - } -} - -fn as_duration(v: i64) -> Option { - match T::DATA_TYPE { - DataType::Duration(unit) => match unit { - TimeUnit::Second => Some(temporal_conversions::duration_s_to_duration(v)), - TimeUnit::Millisecond => { - Some(temporal_conversions::duration_ms_to_duration(v)) - } - TimeUnit::Microsecond => { - Some(temporal_conversions::duration_us_to_duration(v)) - } - TimeUnit::Nanosecond => { - Some(temporal_conversions::duration_ns_to_duration(v)) - } - }, - _ => None, - } -} - -impl PrimitiveArray +impl PrimitiveArray where - i64: std::convert::From, + i64: From, { /// Returns value as a chrono `NaiveDateTime`, handling time resolution /// @@ -322,8 +412,8 @@ where 
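The `ArrowPrimitiveType` trait above is easier to read with a concrete instance; a minimal sketch, assuming the trait is re-exported at the `arrow_array` crate root like the other `array` items in this change:

```
use arrow_array::types::Int64Type;
use arrow_array::ArrowPrimitiveType;
use arrow_schema::DataType;

// Int64Type statically carries both the Rust native type (i64) and the
// dynamic Arrow DataType it corresponds to.
assert_eq!(Int64Type::DATA_TYPE, DataType::Int64);
assert_eq!(Int64Type::get_byte_width(), 8);
assert_eq!(Int64Type::default_value(), 0i64);
```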
} } -impl fmt::Debug for PrimitiveArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl std::fmt::Debug for PrimitiveArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "PrimitiveArray<{:?}>\n[\n", T::DATA_TYPE)?; print_long_array(self, f, |array, index, f| match T::DATA_TYPE { DataType::Date32 | DataType::Date64 => { @@ -347,7 +437,7 @@ impl fmt::Debug for PrimitiveArray { None => write!(f, "null"), } } - _ => fmt::Debug::fmt(&array.value(index), f), + _ => std::fmt::Debug::fmt(&array.value(index), f), })?; write!(f, "]") } @@ -574,7 +664,7 @@ impl PrimitiveArray { // TODO: duplicated from def_numeric_from_vec! macro, it looks possible to convert to generic let data_len = data.len(); let mut null_buf = MutableBuffer::new_null(data_len); - let mut val_buf = MutableBuffer::new(data_len * mem::size_of::()); + let mut val_buf = MutableBuffer::new(data_len * std::mem::size_of::()); { let null_slice = null_buf.as_slice_mut(); @@ -618,12 +708,7 @@ impl From for PrimitiveArray { #[cfg(test)] mod tests { use super::*; - - use std::thread; - - use crate::buffer::Buffer; - use crate::compute::eq_dyn; - use crate::datatypes::DataType; + use crate::BooleanArray; #[test] fn test_primitive_array_from_vec() { @@ -1099,7 +1184,7 @@ mod tests { #[test] fn test_access_array_concurrently() { let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let ret = thread::spawn(move || a.value(3)).join(); + let ret = std::thread::spawn(move || a.value(3)).join(); assert!(ret.is_ok()); assert_eq!(8, ret.ok().unwrap()); @@ -1110,11 +1195,7 @@ mod tests { let array1: Int8Array = [10_i8, 11, 12, 13, 14].into_iter().collect(); let array2: Int8Array = [10_i8, 11, 12, 13, 14].into_iter().map(Some).collect(); - let result = eq_dyn(&array1, &array2); - assert_eq!( - result.unwrap(), - BooleanArray::from(vec![true, true, true, true, true]) - ); + assert_eq!(array1, array2); } #[cfg(feature = "chrono-tz")] diff --git a/arrow/src/array/array_string.rs b/arrow-array/src/array/string_array.rs similarity index 97% rename from arrow/src/array/array_string.rs rename to arrow-array/src/array/string_array.rs index f3ecaa2d5591..22ad81eaa3f9 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow-array/src/array/string_array.rs @@ -15,18 +15,16 @@ // specific language governing permissions and limitations // under the License. -use std::convert::From; -use std::fmt; -use std::{any::Any, iter::FromIterator}; - -use super::{ - array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, - GenericBinaryArray, GenericListArray, GenericStringIter, OffsetSizeTrait, +use crate::iterator::GenericStringIter; +use crate::raw_pointer::RawPtrBox; +use crate::{ + print_long_array, Array, ArrayAccessor, GenericBinaryArray, GenericListArray, + OffsetSizeTrait, }; -use crate::array::array::ArrayAccessor; -use crate::buffer::Buffer; -use crate::util::bit_util; -use crate::{buffer::MutableBuffer, datatypes::DataType}; +use arrow_buffer::{bit_util, Buffer, MutableBuffer}; +use arrow_data::ArrayData; +use arrow_schema::DataType; +use std::any::Any; /// Generic struct for \[Large\]StringArray /// @@ -131,7 +129,7 @@ impl GenericStringArray { /// Convert a list array to a string array. 
/// /// Note: this performs potentially expensive UTF-8 validation, consider using - /// [`StringBuilder`][crate::array::StringBuilder] to avoid this + /// [`StringBuilder`][crate::builder::StringBuilder] to avoid this /// /// # Panics /// @@ -303,13 +301,13 @@ impl<'a, T: OffsetSizeTrait> GenericStringArray { } } -impl fmt::Debug for GenericStringArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl std::fmt::Debug for GenericStringArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let prefix = OffsetSize::PREFIX; write!(f, "{}StringArray\n[\n", prefix)?; print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) + std::fmt::Debug::fmt(&array.value(index), f) })?; write!(f, "]") } @@ -414,7 +412,7 @@ impl From> for Array /// Example /// /// ``` -/// use arrow::array::StringArray; +/// use arrow_array::StringArray; /// let array = StringArray::from(vec![Some("foo"), None, Some("bar")]); /// assert_eq!(array.value(0), "foo"); /// ``` @@ -426,7 +424,7 @@ pub type StringArray = GenericStringArray; /// Example /// /// ``` -/// use arrow::array::LargeStringArray; +/// use arrow_array::LargeStringArray; /// let array = LargeStringArray::from(vec![Some("foo"), None, Some("bar")]); /// assert_eq!(array.value(2), "bar"); /// ``` @@ -434,13 +432,9 @@ pub type LargeStringArray = GenericStringArray; #[cfg(test)] mod tests { - - use crate::{ - array::{ListBuilder, StringBuilder}, - datatypes::Field, - }; - use super::*; + use crate::builder::{ListBuilder, StringBuilder}; + use arrow_schema::Field; #[test] fn test_string_array_from_u8_slice() { diff --git a/arrow/src/array/array_struct.rs b/arrow-array/src/array/struct_array.rs similarity index 95% rename from arrow/src/array/array_struct.rs rename to arrow-array/src/array/struct_array.rs index a6c3146aef2e..841d3235f64b 100644 --- a/arrow/src/array/array_struct.rs +++ b/arrow-array/src/array/struct_array.rs @@ -15,18 +15,12 @@ // specific language governing permissions and limitations // under the License. +use crate::{make_array, Array, ArrayRef}; +use arrow_buffer::buffer::buffer_bin_or; +use arrow_buffer::Buffer; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; -use std::convert::{From, TryFrom}; -use std::fmt; -use std::iter::IntoIterator; - -use super::{make_array, Array, ArrayData, ArrayRef}; -use crate::datatypes::DataType; -use crate::error::{ArrowError, Result}; -use crate::{ - buffer::{buffer_bin_or, Buffer}, - datatypes::Field, -}; /// A nested array type where each child (called *field*) is represented by a separate /// array. @@ -34,8 +28,8 @@ use crate::{ /// /// ``` /// use std::sync::Arc; -/// use arrow::array::{Array, ArrayRef, BooleanArray, Int32Array, StructArray}; -/// use arrow::datatypes::{DataType, Field}; +/// use arrow_array::{Array, ArrayRef, BooleanArray, Int32Array, StructArray}; +/// use arrow_schema::{DataType, Field}; /// /// let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true])); /// let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31])); @@ -130,7 +124,7 @@ impl TryFrom> for StructArray { /// builds a StructArray from a vector of names and arrays. /// This errors if the values have a different length. /// An entry is set to Null when all values are null. 
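The `try_from` conversion documented above builds a `StructArray` from named columns of equal length; a small sketch of the successful path, assuming the `arrow_array` re-exports used throughout this change:

```
use std::sync::Arc;
use arrow_array::{Array, ArrayRef, Int32Array, StringArray, StructArray};

// Two equal-length columns build successfully; unequal lengths would
// make try_from return an error instead.
let ints: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
let strs: ArrayRef = Arc::new(StringArray::from(vec!["x", "y", "z"]));
let strct = StructArray::try_from(vec![("ints", ints), ("strs", strs)]).unwrap();
assert_eq!(strct.num_columns(), 2);
assert_eq!(strct.len(), 3);
```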
- fn try_from(values: Vec<(&str, ArrayRef)>) -> Result { + fn try_from(values: Vec<(&str, ArrayRef)>) -> Result { let values_len = values.len(); // these will be populated @@ -239,8 +233,8 @@ impl From> for StructArray { } } -impl fmt::Debug for StructArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl std::fmt::Debug for StructArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "StructArray\n[\n")?; for (child_index, name) in self.column_names().iter().enumerate() { let column = self.column(child_index); @@ -251,7 +245,7 @@ impl fmt::Debug for StructArray { name, column.data_type() )?; - fmt::Debug::fmt(column, f)?; + std::fmt::Debug::fmt(column, f)?; writeln!(f)?; } write!(f, "]") @@ -290,17 +284,12 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray { mod tests { use super::*; - use std::sync::Arc; - use crate::{ - array::BooleanArray, array::Float32Array, array::Float64Array, array::Int32Array, - array::StringArray, bitmap::Bitmap, + BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, StringArray, }; - use crate::{ - array::Int64Array, - datatypes::{DataType, Field}, - }; - use crate::{buffer::Buffer, datatypes::ToByteSlice}; + use arrow_buffer::ToByteSlice; + use arrow_data::Bitmap; + use std::sync::Arc; #[test] fn test_struct_array_builder() { diff --git a/arrow/src/array/array_union.rs b/arrow-array/src/array/union_array.rs similarity index 97% rename from arrow/src/array/array_union.rs rename to arrow-array/src/array/union_array.rs index b221239b2dbe..f62a84cf03ce 100644 --- a/arrow/src/array/array_union.rs +++ b/arrow-array/src/array/union_array.rs @@ -15,26 +15,24 @@ // specific language governing permissions and limitations // under the License. +use crate::{make_array, Array, ArrayRef}; +use arrow_buffer::Buffer; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, Field, UnionMode}; /// Contains the `UnionArray` type. /// -use crate::array::{make_array, Array, ArrayData, ArrayRef}; -use crate::buffer::Buffer; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; - -use core::fmt; use std::any::Any; /// An Array that can represent slots of varying types. /// /// Each slot in a [UnionArray] can have a value chosen from a number /// of types. Each of the possible types are named like the fields of -/// a [`StructArray`](crate::array::StructArray). A `UnionArray` can +/// a [`StructArray`](crate::StructArray). A `UnionArray` can /// have two possible memory layouts, "dense" or "sparse". For more /// information on please see the /// [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). /// -/// [UnionBuilder](crate::array::UnionBuilder) can be used to +/// [UnionBuilder](crate::builder::UnionBuilder) can be used to /// create [UnionArray]'s of primitive types. `UnionArray`'s of nested /// types are also supported but not via `UnionBuilder`, see the tests /// for examples. 
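`UnionBuilder` is mentioned in the `UnionArray` docs but only demonstrated in the tests; a minimal sketch of building a dense union through the builder, assuming the `arrow_array::builder` path introduced by this change:

```
use arrow_array::builder::UnionBuilder;
use arrow_array::types::{Float64Type, Int32Type};
use arrow_array::Array;

// Build the dense union `[1, 3.2, 34]` through UnionBuilder rather than
// assembling the type_id and offset buffers by hand.
let mut builder = UnionBuilder::new_dense();
builder.append::<Int32Type>("a", 1).unwrap();
builder.append::<Float64Type>("b", 3.2).unwrap();
builder.append::<Int32Type>("a", 34).unwrap();
let union_array = builder.build().unwrap();
assert_eq!(union_array.len(), 3);
```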
@@ -42,10 +40,10 @@ use std::any::Any; /// # Examples /// ## Create a dense UnionArray `[1, 3.2, 34]` /// ``` -/// use arrow::buffer::Buffer; -/// use arrow::datatypes::*; +/// use arrow_buffer::Buffer; +/// use arrow_schema::*; /// use std::sync::Arc; -/// use arrow::array::{Array, Int32Array, Float64Array, UnionArray}; +/// use arrow_array::{Array, Int32Array, Float64Array, UnionArray}; /// /// let int_array = Int32Array::from(vec![1, 34]); /// let float_array = Float64Array::from(vec![3.2]); @@ -76,10 +74,10 @@ use std::any::Any; /// /// ## Create a sparse UnionArray `[1, 3.2, 34]` /// ``` -/// use arrow::buffer::Buffer; -/// use arrow::datatypes::*; +/// use arrow_buffer::Buffer; +/// use arrow_schema::*; /// use std::sync::Arc; -/// use arrow::array::{Array, Int32Array, Float64Array, UnionArray}; +/// use arrow_array::{Array, Int32Array, Float64Array, UnionArray}; /// /// let int_array = Int32Array::from(vec![Some(1), None, Some(34)]); /// let float_array = Float64Array::from(vec![None, Some(3.2), None]); @@ -174,7 +172,7 @@ impl UnionArray { type_ids: Buffer, value_offsets: Option, child_arrays: Vec<(Field, ArrayRef)>, - ) -> Result { + ) -> Result { if let Some(b) = &value_offsets { if ((type_ids.len()) * 4) != b.len() { return Err(ArrowError::InvalidArgumentError( @@ -339,8 +337,8 @@ impl Array for UnionArray { } } -impl fmt::Debug for UnionArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +impl std::fmt::Debug for UnionArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let header = if self.is_dense() { "UnionArray(Dense)\n[" } else { @@ -365,7 +363,7 @@ impl fmt::Debug for UnionArray { *name, column.data_type() )?; - fmt::Debug::fmt(column, f)?; + std::fmt::Debug::fmt(column, f)?; writeln!(f)?; } writeln!(f, "]") @@ -376,13 +374,13 @@ impl fmt::Debug for UnionArray { mod tests { use super::*; + use crate::builder::UnionBuilder; + use crate::types::{Float32Type, Float64Type, Int32Type, Int64Type}; + use crate::RecordBatch; + use crate::{Float64Array, Int32Array, Int64Array, StringArray}; + use arrow_schema::Schema; use std::sync::Arc; - use crate::array::*; - use crate::buffer::Buffer; - use crate::datatypes::{DataType, Field}; - use crate::record_batch::RecordBatch; - #[test] fn test_dense_i32() { let mut builder = UnionBuilder::new_dense(); diff --git a/arrow/src/array/builder/boolean_buffer_builder.rs b/arrow-array/src/builder/boolean_buffer_builder.rs similarity index 98% rename from arrow/src/array/builder/boolean_buffer_builder.rs rename to arrow-array/src/builder/boolean_buffer_builder.rs index 5b6d1ce48478..16c6750d1d9f 100644 --- a/arrow/src/array/builder/boolean_buffer_builder.rs +++ b/arrow-array/src/builder/boolean_buffer_builder.rs @@ -15,11 +15,9 @@ // specific language governing permissions and limitations // under the License. 
-use crate::buffer::{Buffer, MutableBuffer}; - -use super::Range; - -use crate::util::bit_util; +use arrow_buffer::{bit_util, Buffer, MutableBuffer}; +use arrow_data::bit_mask; +use std::ops::Range; #[derive(Debug)] pub struct BooleanBufferBuilder { @@ -139,7 +137,7 @@ impl BooleanBufferBuilder { let offset_write = self.len; let len = range.end - range.start; self.advance(len); - crate::util::bit_mask::set_bits( + bit_mask::set_bits( self.buffer.as_slice_mut(), to_set, offset_write, diff --git a/arrow/src/array/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs similarity index 85% rename from arrow/src/array/builder/boolean_builder.rs rename to arrow-array/src/builder/boolean_builder.rs index eed14a55fd91..96711dd1f6f6 100644 --- a/arrow/src/array/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -15,21 +15,14 @@ // specific language governing permissions and limitations // under the License. +use crate::builder::null_buffer_builder::NullBufferBuilder; +use crate::builder::{ArrayBuilder, BooleanBufferBuilder}; +use crate::{ArrayRef, BooleanArray}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; -use crate::array::ArrayBuilder; -use crate::array::ArrayData; -use crate::array::ArrayRef; -use crate::array::BooleanArray; -use crate::datatypes::DataType; - -use crate::error::ArrowError; -use crate::error::Result; - -use super::BooleanBufferBuilder; -use super::NullBufferBuilder; - /// Array builder for fixed-width primitive types /// /// # Example @@ -37,28 +30,29 @@ use super::NullBufferBuilder; /// Create a `BooleanArray` from a `BooleanBuilder` /// /// ``` -/// use arrow::array::{Array, BooleanArray, BooleanBuilder}; /// -/// let mut b = BooleanBuilder::new(); -/// b.append_value(true); -/// b.append_null(); -/// b.append_value(false); -/// b.append_value(true); -/// let arr = b.finish(); +/// # use arrow_array::{Array, BooleanArray, builder::BooleanBuilder}; +/// +/// let mut b = BooleanBuilder::new(); +/// b.append_value(true); +/// b.append_null(); +/// b.append_value(false); +/// b.append_value(true); +/// let arr = b.finish(); /// -/// assert_eq!(4, arr.len()); -/// assert_eq!(1, arr.null_count()); -/// assert_eq!(true, arr.value(0)); -/// assert!(arr.is_valid(0)); -/// assert!(!arr.is_null(0)); -/// assert!(!arr.is_valid(1)); -/// assert!(arr.is_null(1)); -/// assert_eq!(false, arr.value(2)); -/// assert!(arr.is_valid(2)); -/// assert!(!arr.is_null(2)); -/// assert_eq!(true, arr.value(3)); -/// assert!(arr.is_valid(3)); -/// assert!(!arr.is_null(3)); +/// assert_eq!(4, arr.len()); +/// assert_eq!(1, arr.null_count()); +/// assert_eq!(true, arr.value(0)); +/// assert!(arr.is_valid(0)); +/// assert!(!arr.is_null(0)); +/// assert!(!arr.is_valid(1)); +/// assert!(arr.is_null(1)); +/// assert_eq!(false, arr.value(2)); +/// assert!(arr.is_valid(2)); +/// assert!(!arr.is_null(2)); +/// assert_eq!(true, arr.value(3)); +/// assert!(arr.is_valid(3)); +/// assert!(!arr.is_null(3)); /// ``` #[derive(Debug)] pub struct BooleanBuilder { @@ -132,7 +126,11 @@ impl BooleanBuilder { /// /// Returns an error if the slices are of different lengths #[inline] - pub fn append_values(&mut self, values: &[bool], is_valid: &[bool]) -> Result<()> { + pub fn append_values( + &mut self, + values: &[bool], + is_valid: &[bool], + ) -> Result<(), ArrowError> { if values.len() != is_valid.len() { Err(ArrowError::InvalidArgumentError( "Value and validity lengths must be equal".to_string(), @@ -193,7 +191,8 
@@ impl ArrayBuilder for BooleanBuilder { #[cfg(test)] mod tests { use super::*; - use crate::{array::Array, buffer::Buffer}; + use crate::Array; + use arrow_buffer::Buffer; #[test] fn test_boolean_array_builder() { diff --git a/arrow/src/array/builder/buffer_builder.rs b/arrow-array/src/builder/buffer_builder.rs similarity index 75% rename from arrow/src/array/builder/buffer_builder.rs rename to arrow-array/src/builder/buffer_builder.rs index a6a81dfd6c0e..2da11cb23203 100644 --- a/arrow/src/array/builder/buffer_builder.rs +++ b/arrow-array/src/builder/buffer_builder.rs @@ -15,17 +15,60 @@ // specific language governing permissions and limitations // under the License. -use std::mem; - -use crate::buffer::{Buffer, MutableBuffer}; -use crate::datatypes::ArrowNativeType; - -use super::PhantomData; - -/// Builder for creating a [`Buffer`](crate::buffer::Buffer) object. +use crate::array::ArrowPrimitiveType; +use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; +use std::marker::PhantomData; + +use crate::types::*; + +pub type Int8BufferBuilder = BufferBuilder; +pub type Int16BufferBuilder = BufferBuilder; +pub type Int32BufferBuilder = BufferBuilder; +pub type Int64BufferBuilder = BufferBuilder; +pub type UInt8BufferBuilder = BufferBuilder; +pub type UInt16BufferBuilder = BufferBuilder; +pub type UInt32BufferBuilder = BufferBuilder; +pub type UInt64BufferBuilder = BufferBuilder; +pub type Float32BufferBuilder = BufferBuilder; +pub type Float64BufferBuilder = BufferBuilder; + +pub type TimestampSecondBufferBuilder = + BufferBuilder<::Native>; +pub type TimestampMillisecondBufferBuilder = + BufferBuilder<::Native>; +pub type TimestampMicrosecondBufferBuilder = + BufferBuilder<::Native>; +pub type TimestampNanosecondBufferBuilder = + BufferBuilder<::Native>; +pub type Date32BufferBuilder = BufferBuilder<::Native>; +pub type Date64BufferBuilder = BufferBuilder<::Native>; +pub type Time32SecondBufferBuilder = + BufferBuilder<::Native>; +pub type Time32MillisecondBufferBuilder = + BufferBuilder<::Native>; +pub type Time64MicrosecondBufferBuilder = + BufferBuilder<::Native>; +pub type Time64NanosecondBufferBuilder = + BufferBuilder<::Native>; +pub type IntervalYearMonthBufferBuilder = + BufferBuilder<::Native>; +pub type IntervalDayTimeBufferBuilder = + BufferBuilder<::Native>; +pub type IntervalMonthDayNanoBufferBuilder = + BufferBuilder<::Native>; +pub type DurationSecondBufferBuilder = + BufferBuilder<::Native>; +pub type DurationMillisecondBufferBuilder = + BufferBuilder<::Native>; +pub type DurationMicrosecondBufferBuilder = + BufferBuilder<::Native>; +pub type DurationNanosecondBufferBuilder = + BufferBuilder<::Native>; + +/// Builder for creating a [`Buffer`](arrow_buffer::Buffer) object. /// -/// A [`Buffer`](crate::buffer::Buffer) is the underlying data -/// structure of Arrow's [`Arrays`](crate::array::Array). +/// A [`Buffer`](arrow_buffer::Buffer) is the underlying data +/// structure of Arrow's [`Arrays`](crate::Array). /// /// For all supported types, there are type definitions for the /// generic version of `BufferBuilder`, e.g. `UInt8BufferBuilder`. 
@@ -33,17 +76,14 @@ use super::PhantomData; /// # Example: /// /// ``` -/// use arrow::array::UInt8BufferBuilder; +/// # use arrow_array::builder::UInt8BufferBuilder; /// -/// # fn main() -> arrow::error::Result<()> { /// let mut builder = UInt8BufferBuilder::new(100); /// builder.append_slice(&[42, 43, 44]); /// builder.append(45); /// let buffer = builder.finish(); /// /// assert_eq!(unsafe { buffer.typed_data::() }, &[42, 43, 44, 45]); -/// # Ok(()) -/// # } /// ``` #[derive(Debug)] pub struct BufferBuilder { @@ -67,7 +107,7 @@ impl BufferBuilder { /// # Example: /// /// ``` - /// use arrow::array::UInt8BufferBuilder; + /// # use arrow_array::builder::UInt8BufferBuilder; /// /// let mut builder = UInt8BufferBuilder::new(10); /// @@ -75,7 +115,7 @@ impl BufferBuilder { /// ``` #[inline] pub fn new(capacity: usize) -> Self { - let buffer = MutableBuffer::new(capacity * mem::size_of::()); + let buffer = MutableBuffer::new(capacity * std::mem::size_of::()); Self { buffer, @@ -89,7 +129,7 @@ impl BufferBuilder { /// # Example: /// /// ``` - /// use arrow::array::UInt8BufferBuilder; + /// # use arrow_array::builder::UInt8BufferBuilder; /// /// let mut builder = UInt8BufferBuilder::new(10); /// builder.append(42); @@ -105,7 +145,7 @@ impl BufferBuilder { /// # Example: /// /// ``` - /// use arrow::array::UInt8BufferBuilder; + /// # use arrow_array::builder::UInt8BufferBuilder; /// /// let mut builder = UInt8BufferBuilder::new(10); /// builder.append(42); @@ -136,7 +176,7 @@ impl BufferBuilder { /// # Example: /// /// ``` - /// use arrow::array::UInt8BufferBuilder; + /// # use arrow_array::builder::UInt8BufferBuilder; /// /// let mut builder = UInt8BufferBuilder::new(10); /// builder.advance(2); @@ -145,7 +185,7 @@ impl BufferBuilder { /// ``` #[inline] pub fn advance(&mut self, i: usize) { - self.buffer.extend_zeros(i * mem::size_of::()); + self.buffer.extend_zeros(i * std::mem::size_of::()); self.len += i; } @@ -154,7 +194,7 @@ impl BufferBuilder { /// # Example: /// /// ``` - /// use arrow::array::UInt8BufferBuilder; + /// # use arrow_array::builder::UInt8BufferBuilder; /// /// let mut builder = UInt8BufferBuilder::new(10); /// builder.reserve(10); @@ -163,7 +203,7 @@ impl BufferBuilder { /// ``` #[inline] pub fn reserve(&mut self, n: usize) { - self.buffer.reserve(n * mem::size_of::()); + self.buffer.reserve(n * std::mem::size_of::()); } /// Appends a value of type `T` into the builder, @@ -172,7 +212,7 @@ impl BufferBuilder { /// # Example: /// /// ``` - /// use arrow::array::UInt8BufferBuilder; + /// # use arrow_array::builder::UInt8BufferBuilder; /// /// let mut builder = UInt8BufferBuilder::new(10); /// builder.append(42); @@ -192,7 +232,7 @@ impl BufferBuilder { /// # Example: /// /// ``` - /// use arrow::array::UInt8BufferBuilder; + /// # use arrow_array::builder::UInt8BufferBuilder; /// /// let mut builder = UInt8BufferBuilder::new(10); /// builder.append_n(10, 42); @@ -213,7 +253,7 @@ impl BufferBuilder { /// # Example: /// /// ``` - /// use arrow::array::UInt32BufferBuilder; + /// # use arrow_array::builder::UInt32BufferBuilder; /// /// let mut builder = UInt32BufferBuilder::new(10); /// builder.append_n_zeroed(3); @@ -222,7 +262,7 @@ impl BufferBuilder { /// assert_eq!(builder.as_slice(), &[0, 0, 0]) #[inline] pub fn append_n_zeroed(&mut self, n: usize) { - self.buffer.extend_zeros(n * mem::size_of::()); + self.buffer.extend_zeros(n * std::mem::size_of::()); self.len += n; } @@ -231,7 +271,7 @@ impl BufferBuilder { /// # Example: /// /// ``` - /// use 
arrow::array::UInt8BufferBuilder; + /// # use arrow_array::builder::UInt8BufferBuilder; /// /// let mut builder = UInt8BufferBuilder::new(10); /// builder.append_slice(&[42, 44, 46]); @@ -247,7 +287,7 @@ impl BufferBuilder { /// View the contents of this buffer as a slice /// /// ``` - /// use arrow::array::Float64BufferBuilder; + /// # use arrow_array::builder::Float64BufferBuilder; /// /// let mut builder = Float64BufferBuilder::new(10); /// builder.append(1.3); @@ -270,7 +310,7 @@ impl BufferBuilder { /// # Example: /// /// ``` - /// use arrow::array::Float32BufferBuilder; + /// # use arrow_array::builder::Float32BufferBuilder; /// /// let mut builder = Float32BufferBuilder::new(10); /// @@ -297,7 +337,7 @@ impl BufferBuilder { /// # Example: /// /// ``` - /// use arrow::array::UInt16BufferBuilder; + /// # use arrow_array::builder::UInt16BufferBuilder; /// /// let mut builder = UInt16BufferBuilder::new(10); /// @@ -312,7 +352,7 @@ impl BufferBuilder { /// ``` #[inline] pub fn truncate(&mut self, len: usize) { - self.buffer.truncate(len * mem::size_of::()); + self.buffer.truncate(len * std::mem::size_of::()); self.len = len; } @@ -333,12 +373,12 @@ impl BufferBuilder { self.len += len; } - /// Resets this builder and returns an immutable [`Buffer`](crate::buffer::Buffer). + /// Resets this builder and returns an immutable [`Buffer`](arrow_buffer::Buffer). /// /// # Example: /// /// ``` - /// use arrow::array::UInt8BufferBuilder; + /// # use arrow_array::builder::UInt8BufferBuilder; /// /// let mut builder = UInt8BufferBuilder::new(10); /// builder.append_slice(&[42, 44, 46]); @@ -357,11 +397,10 @@ impl BufferBuilder { #[cfg(test)] mod tests { - use crate::array::array::Array; - use crate::array::builder::ArrayBuilder; - use crate::array::Int32BufferBuilder; - use crate::array::Int8Builder; - use crate::array::UInt8BufferBuilder; + use crate::builder::{ + ArrayBuilder, Int32BufferBuilder, Int8Builder, UInt8BufferBuilder, + }; + use crate::Array; #[test] fn test_builder_i32_empty() { diff --git a/arrow/src/array/builder/decimal_builder.rs b/arrow-array/src/builder/decimal_builder.rs similarity index 94% rename from arrow/src/array/builder/decimal_builder.rs rename to arrow-array/src/builder/decimal_builder.rs index daa30eebed92..096cbec3a6c8 100644 --- a/arrow/src/array/builder/decimal_builder.rs +++ b/arrow-array/src/builder/decimal_builder.rs @@ -15,20 +15,15 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; -use std::sync::Arc; - -use crate::array::array_decimal::Decimal256Array; -use crate::array::ArrayRef; -use crate::array::Decimal128Array; -use crate::array::{ArrayBuilder, FixedSizeBinaryBuilder}; - -use crate::error::{ArrowError, Result}; - -use crate::datatypes::{ +use crate::builder::{ArrayBuilder, FixedSizeBinaryBuilder}; +use crate::decimal::Decimal256; +use crate::{ArrayRef, Decimal128Array, Decimal256Array}; +use arrow_data::decimal::{ validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, }; -use crate::util::decimal::Decimal256; +use arrow_schema::ArrowError; +use std::any::Any; +use std::sync::Arc; /// Array Builder for [`Decimal128Array`] /// @@ -90,7 +85,7 @@ impl Decimal128Builder { /// Appends a decimal value into the builder. #[inline] - pub fn append_value(&mut self, value: impl Into) -> Result<()> { + pub fn append_value(&mut self, value: impl Into) -> Result<(), ArrowError> { let value = value.into(); if self.value_validation { validate_decimal_precision(value, self.precision)? 
@@ -107,7 +102,10 @@ impl Decimal128Builder { /// Appends an `Option>` into the builder. #[inline] - pub fn append_option(&mut self, value: Option>) -> Result<()> { + pub fn append_option( + &mut self, + value: Option>, + ) -> Result<(), ArrowError> { match value { None => { self.append_null(); @@ -192,7 +190,7 @@ impl Decimal256Builder { /// /// Returns an error if `value` has different precision, scale or length in bytes than this builder #[inline] - pub fn append_value(&mut self, value: &Decimal256) -> Result<()> { + pub fn append_value(&mut self, value: &Decimal256) -> Result<(), ArrowError> { let value = if self.value_validation { let raw_bytes = value.raw_value(); validate_decimal256_precision_with_lt_bytes(raw_bytes, self.precision)?; @@ -225,7 +223,10 @@ impl Decimal256Builder { /// Appends an `Option<&Decimal256>` into the builder. #[inline] - pub fn append_option(&mut self, value: Option<&Decimal256>) -> Result<()> { + pub fn append_option( + &mut self, + value: Option<&Decimal256>, + ) -> Result<(), ArrowError> { match value { None => { self.append_null(); @@ -248,13 +249,11 @@ impl Decimal256Builder { #[cfg(test)] mod tests { use super::*; + use crate::decimal::Decimal128; + use crate::Array; + use arrow_schema::DataType; use num::{BigInt, Num}; - use crate::array::array_decimal::Decimal128Array; - use crate::array::{array_decimal, Array}; - use crate::datatypes::DataType; - use crate::util::decimal::{Decimal128, Decimal256}; - #[test] fn test_decimal_builder() { let mut builder = Decimal128Builder::new(38, 6); @@ -377,7 +376,7 @@ mod tests { .expect("should not validate invalid value at builder"); let array = builder.finish(); - let array_data = array_decimal::DecimalArray::data(&array); + let array_data = array.data(); array_data.validate_values().unwrap(); } } diff --git a/arrow/src/array/builder/fixed_size_binary_builder.rs b/arrow-array/src/builder/fixed_size_binary_builder.rs similarity index 94% rename from arrow/src/array/builder/fixed_size_binary_builder.rs rename to arrow-array/src/builder/fixed_size_binary_builder.rs index 30c25e0a62b9..15b840d0a95d 100644 --- a/arrow/src/array/builder/fixed_size_binary_builder.rs +++ b/arrow-array/src/builder/fixed_size_binary_builder.rs @@ -15,16 +15,14 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{ - ArrayBuilder, ArrayData, ArrayRef, FixedSizeBinaryArray, UInt8BufferBuilder, -}; -use crate::datatypes::DataType; -use crate::error::{ArrowError, Result}; +use crate::builder::null_buffer_builder::NullBufferBuilder; +use crate::builder::{ArrayBuilder, UInt8BufferBuilder}; +use crate::{ArrayRef, FixedSizeBinaryArray}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; -use super::NullBufferBuilder; - #[derive(Debug)] pub struct FixedSizeBinaryBuilder { values_builder: UInt8BufferBuilder, @@ -58,7 +56,7 @@ impl FixedSizeBinaryBuilder { /// Automatically update the null buffer to delimit the slice appended in as a /// distinct value element. 
#[inline] - pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<()> { + pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<(), ArrowError> { if self.value_length != value.as_ref().len() as i32 { Err(ArrowError::InvalidArgumentError( "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths".to_string() @@ -127,9 +125,9 @@ impl ArrayBuilder for FixedSizeBinaryBuilder { mod tests { use super::*; - use crate::array::Array; - use crate::array::FixedSizeBinaryArray; - use crate::datatypes::DataType; + use crate::Array; + use crate::FixedSizeBinaryArray; + use arrow_schema::DataType; #[test] fn test_fixed_size_binary_builder() { diff --git a/arrow/src/array/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs similarity index 95% rename from arrow/src/array/builder/fixed_size_list_builder.rs rename to arrow-array/src/builder/fixed_size_list_builder.rs index da850d156243..e15708ed6c33 100644 --- a/arrow/src/array/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -15,18 +15,14 @@ // specific language governing permissions and limitations // under the License. +use crate::builder::null_buffer_builder::NullBufferBuilder; +use crate::builder::ArrayBuilder; +use crate::{ArrayRef, FixedSizeListArray}; +use arrow_data::ArrayData; +use arrow_schema::{DataType, Field}; use std::any::Any; use std::sync::Arc; -use crate::array::ArrayData; -use crate::array::ArrayRef; -use crate::array::FixedSizeListArray; -use crate::datatypes::DataType; -use crate::datatypes::Field; - -use super::ArrayBuilder; -use super::NullBufferBuilder; - /// Array builder for [`FixedSizeListArray`] #[derive(Debug)] pub struct FixedSizeListBuilder { @@ -150,9 +146,9 @@ where mod tests { use super::*; - use crate::array::Array; - use crate::array::Int32Array; - use crate::array::Int32Builder; + use crate::builder::Int32Builder; + use crate::Array; + use crate::Int32Array; #[test] fn test_fixed_size_list_array_builder() { diff --git a/arrow/src/array/builder/generic_binary_builder.rs b/arrow-array/src/builder/generic_binary_builder.rs similarity index 97% rename from arrow/src/array/builder/generic_binary_builder.rs rename to arrow-array/src/builder/generic_binary_builder.rs index 7f83a945343a..c806bebf9a0b 100644 --- a/arrow/src/array/builder/generic_binary_builder.rs +++ b/arrow-array/src/builder/generic_binary_builder.rs @@ -15,15 +15,13 @@ // specific language governing permissions and limitations // under the License. 
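Editor's sketch of the decimal builder's fallible append after the error type moves to `arrow_schema::ArrowError`; the precision, scale and value below are illustrative only, the constructor and `append_value` signature follow the decimal_builder.rs hunks above.

```rust
// Sketch only: Decimal128Builder with the relocated ArrowError type.
use arrow_array::builder::Decimal128Builder;
use arrow_array::Array;
use arrow_schema::ArrowError;

fn build_decimals() -> Result<(), ArrowError> {
    let mut builder = Decimal128Builder::new(38, 6); // precision 38, scale 6
    builder.append_value(8_887_000_000_i128)?; // validated against the precision
    builder.append_null();
    let array = builder.finish();
    assert_eq!(array.len(), 2);
    Ok(())
}
```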
-use crate::array::{ - ArrayBuilder, ArrayDataBuilder, ArrayRef, GenericBinaryArray, OffsetSizeTrait, - UInt8BufferBuilder, -}; +use crate::builder::null_buffer_builder::NullBufferBuilder; +use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder}; +use crate::{ArrayRef, GenericBinaryArray, OffsetSizeTrait}; +use arrow_data::ArrayDataBuilder; use std::any::Any; use std::sync::Arc; -use super::{BufferBuilder, NullBufferBuilder}; - /// Array builder for [`GenericBinaryArray`] #[derive(Debug)] pub struct GenericBinaryBuilder { diff --git a/arrow/src/array/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs similarity index 96% rename from arrow/src/array/builder/generic_list_builder.rs rename to arrow-array/src/builder/generic_list_builder.rs index 1beda7114171..3f5892ff037d 100644 --- a/arrow/src/array/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -15,17 +15,14 @@ // specific language governing permissions and limitations // under the License. +use crate::builder::null_buffer_builder::NullBufferBuilder; +use crate::builder::{ArrayBuilder, BufferBuilder}; +use crate::{ArrayRef, GenericListArray, OffsetSizeTrait}; +use arrow_data::ArrayData; +use arrow_schema::Field; use std::any::Any; use std::sync::Arc; -use crate::array::ArrayData; -use crate::array::ArrayRef; -use crate::array::GenericListArray; -use crate::array::OffsetSizeTrait; -use crate::datatypes::Field; - -use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder}; - /// Array builder for [`GenericListArray`] #[derive(Debug)] pub struct GenericListBuilder { @@ -155,10 +152,10 @@ where #[cfg(test)] mod tests { use super::*; - use crate::array::builder::ListBuilder; - use crate::array::{Array, Int32Array, Int32Builder}; - use crate::buffer::Buffer; - use crate::datatypes::DataType; + use crate::builder::{Int32Builder, ListBuilder}; + use crate::{Array, Int32Array}; + use arrow_buffer::Buffer; + use arrow_schema::DataType; fn _test_generic_list_array_builder() { let values_builder = Int32Builder::with_capacity(10); diff --git a/arrow/src/array/builder/generic_string_builder.rs b/arrow-array/src/builder/generic_string_builder.rs similarity index 96% rename from arrow/src/array/builder/generic_string_builder.rs rename to arrow-array/src/builder/generic_string_builder.rs index f36e499b8462..f766b6f55f2a 100644 --- a/arrow/src/array/builder/generic_string_builder.rs +++ b/arrow-array/src/builder/generic_string_builder.rs @@ -15,12 +15,11 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{Array, ArrayBuilder, ArrayRef, GenericStringArray, OffsetSizeTrait}; +use crate::builder::{ArrayBuilder, GenericBinaryBuilder}; +use crate::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait}; use std::any::Any; use std::sync::Arc; -use super::GenericBinaryBuilder; - /// Array builder for [`GenericStringArray`] #[derive(Debug)] pub struct GenericStringBuilder { @@ -134,7 +133,8 @@ impl ArrayBuilder for GenericStringBuilder() { let mut builder = GenericStringBuilder::::new(); diff --git a/arrow/src/array/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs similarity index 93% rename from arrow/src/array/builder/map_builder.rs rename to arrow-array/src/builder/map_builder.rs index 766e8a56b387..78f49550071a 100644 --- a/arrow/src/array/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -15,20 +15,14 @@ // specific language governing permissions and limitations // under the License. 
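Editor's sketch of the relocated string builder under the new paths; the no-argument constructor mirrors the one used in the generic_string_builder tests in this patch, and `StringBuilder` remains the `i32`-offset alias.

```rust
// Sketch of GenericStringBuilder after the move to arrow_array::builder.
use arrow_array::builder::GenericStringBuilder;

fn build_strings() {
    let mut builder = GenericStringBuilder::<i32>::new();
    builder.append_value("hello");
    builder.append_null();
    builder.append_value("arrow");
    let array = builder.finish(); // GenericStringArray<i32>, i.e. StringArray
    assert_eq!(array.value(0), "hello");
    assert_eq!(array.value(2), "arrow");
}
```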
+use crate::builder::null_buffer_builder::NullBufferBuilder; +use crate::builder::{ArrayBuilder, BufferBuilder}; +use crate::{Array, ArrayRef, MapArray, StructArray}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; use std::sync::Arc; -use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder}; -use crate::array::array::Array; -use crate::array::ArrayData; -use crate::array::ArrayRef; -use crate::array::MapArray; -use crate::array::StructArray; -use crate::datatypes::DataType; -use crate::datatypes::Field; -use crate::error::ArrowError; -use crate::error::Result; - #[derive(Debug)] pub struct MapBuilder { offsets_builder: BufferBuilder, @@ -96,7 +90,7 @@ impl MapBuilder { /// /// Returns an error if the key and values builders are in an inconsistent state. #[inline] - pub fn append(&mut self, is_valid: bool) -> Result<()> { + pub fn append(&mut self, is_valid: bool) -> Result<(), ArrowError> { if self.key_builder.len() != self.value_builder.len() { return Err(ArrowError::InvalidArgumentError(format!( "Cannot append to a map builder when its keys and values have unequal lengths of {} and {}", @@ -189,11 +183,10 @@ impl ArrayBuilder for MapBuilder { #[cfg(test)] mod tests { use super::*; + use arrow_buffer::Buffer; + use arrow_data::Bitmap; - use crate::array::builder::StringBuilder; - use crate::array::Int32Builder; - use crate::bitmap::Bitmap; - use crate::buffer::Buffer; + use crate::builder::{Int32Builder, StringBuilder}; // TODO: add a test that finishes building, after designing a spec-compliant // way of inserting values to the map. diff --git a/arrow/src/array/builder/mod.rs b/arrow-array/src/builder/mod.rs similarity index 73% rename from arrow/src/array/builder/mod.rs rename to arrow-array/src/builder/mod.rs index c02acb32653f..cd4a82890a2e 100644 --- a/arrow/src/array/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -15,63 +15,53 @@ // specific language governing permissions and limitations // under the License. -//! Defines a [`BufferBuilder`](crate::array::BufferBuilder) capable -//! of creating a [`Buffer`](crate::buffer::Buffer) which can be used -//! as an internal buffer in an [`ArrayData`](crate::array::ArrayData) -//! object. +//! 
Defines builders for the various array types mod boolean_buffer_builder; +pub use boolean_buffer_builder::*; + mod boolean_builder; +pub use boolean_builder::*; mod buffer_builder; +pub use buffer_builder::*; mod decimal_builder; +pub use decimal_builder::*; mod fixed_size_binary_builder; +pub use fixed_size_binary_builder::*; mod fixed_size_list_builder; +pub use fixed_size_list_builder::*; mod generic_binary_builder; +pub use generic_binary_builder::*; mod generic_list_builder; +pub use generic_list_builder::*; mod generic_string_builder; +pub use generic_string_builder::*; mod map_builder; +pub use map_builder::*; mod null_buffer_builder; mod primitive_builder; +pub use primitive_builder::*; mod primitive_dictionary_builder; +pub use primitive_dictionary_builder::*; mod string_dictionary_builder; +pub use string_dictionary_builder::*; mod struct_builder; +pub use struct_builder::*; mod union_builder; +pub use union_builder::*; +use crate::ArrayRef; use std::any::Any; -use std::marker::PhantomData; -use std::ops::Range; - -use super::ArrayRef; - -pub use boolean_buffer_builder::BooleanBufferBuilder; -pub use boolean_builder::BooleanBuilder; -pub use buffer_builder::BufferBuilder; -pub use decimal_builder::Decimal128Builder; -pub use decimal_builder::Decimal256Builder; -pub use fixed_size_binary_builder::FixedSizeBinaryBuilder; -pub use fixed_size_list_builder::FixedSizeListBuilder; -pub use generic_binary_builder::GenericBinaryBuilder; -pub use generic_list_builder::GenericListBuilder; -pub use generic_string_builder::GenericStringBuilder; -pub use map_builder::{MapBuilder, MapFieldNames}; -use null_buffer_builder::NullBufferBuilder; -pub use primitive_builder::PrimitiveBuilder; -pub use primitive_dictionary_builder::PrimitiveDictionaryBuilder; -pub use string_dictionary_builder::StringDictionaryBuilder; -pub use struct_builder::{make_builder, StructBuilder}; -pub use union_builder::UnionBuilder; /// Trait for dealing with different array builders at runtime /// /// # Example /// /// ``` -/// # use arrow::{ -/// # array::{ArrayBuilder, ArrayRef, Float64Builder, Int64Builder, StringArray, StringBuilder}, -/// # error::ArrowError, -/// # }; -/// # fn main() -> std::result::Result<(), ArrowError> { /// // Create +/// # use arrow_array::{ArrayRef, StringArray}; +/// # use arrow_array::builder::{ArrayBuilder, Float64Builder, Int64Builder, StringBuilder}; +/// /// let mut data_builders: Vec> = vec![ /// Box::new(Float64Builder::new()), /// Box::new(Int64Builder::new()), @@ -110,8 +100,6 @@ pub use union_builder::UnionBuilder; /// .value(0), /// "🍎" /// ); -/// # Ok(()) -/// # } /// ``` pub trait ArrayBuilder: Any + Send { /// Returns the number of array slots in the builder diff --git a/arrow/src/array/builder/null_buffer_builder.rs b/arrow-array/src/builder/null_buffer_builder.rs similarity index 98% rename from arrow/src/array/builder/null_buffer_builder.rs rename to arrow-array/src/builder/null_buffer_builder.rs index ef2e4c50ab9c..b2aa622ca7a4 100644 --- a/arrow/src/array/builder/null_buffer_builder.rs +++ b/arrow-array/src/builder/null_buffer_builder.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::buffer::Buffer; - -use super::BooleanBufferBuilder; +use crate::builder::BooleanBufferBuilder; +use arrow_buffer::Buffer; /// Builder for creating the null bit buffer. /// This builder only materializes the buffer when we append `false`. 
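Editor's sketch of the MapBuilder moved above, whose `append` now surfaces `arrow_schema::ArrowError`; the `new(field_names, key_builder, value_builder)` constructor shape and the `keys()` / `values()` accessors are assumptions consistent with the builder as moved here.

```rust
// Sketch: one non-null map entry followed by a null entry.
use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder};
use arrow_array::Array;
use arrow_schema::ArrowError;

fn build_map() -> Result<(), ArrowError> {
    let mut builder = MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
    builder.keys().append_value("joe");
    builder.values().append_value(1);
    builder.append(true)?;  // closes the entry; keys and values must stay in sync
    builder.append(false)?; // a null map entry
    let array = builder.finish();
    assert_eq!(array.len(), 2);
    Ok(())
}
```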
diff --git a/arrow/src/array/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs similarity index 84% rename from arrow/src/array/builder/primitive_builder.rs rename to arrow-array/src/builder/primitive_builder.rs index 38c8b4471477..c5b8c9557072 100644 --- a/arrow/src/array/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -15,15 +15,42 @@ // specific language governing permissions and limitations // under the License. +use crate::builder::null_buffer_builder::NullBufferBuilder; +use crate::builder::{ArrayBuilder, BufferBuilder}; +use crate::types::*; +use crate::{ArrayRef, ArrowPrimitiveType, PrimitiveArray}; +use arrow_data::ArrayData; use std::any::Any; use std::sync::Arc; -use crate::array::ArrayData; -use crate::array::ArrayRef; -use crate::array::PrimitiveArray; -use crate::datatypes::ArrowPrimitiveType; - -use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder}; +pub type Int8Builder = PrimitiveBuilder; +pub type Int16Builder = PrimitiveBuilder; +pub type Int32Builder = PrimitiveBuilder; +pub type Int64Builder = PrimitiveBuilder; +pub type UInt8Builder = PrimitiveBuilder; +pub type UInt16Builder = PrimitiveBuilder; +pub type UInt32Builder = PrimitiveBuilder; +pub type UInt64Builder = PrimitiveBuilder; +pub type Float32Builder = PrimitiveBuilder; +pub type Float64Builder = PrimitiveBuilder; + +pub type TimestampSecondBuilder = PrimitiveBuilder; +pub type TimestampMillisecondBuilder = PrimitiveBuilder; +pub type TimestampMicrosecondBuilder = PrimitiveBuilder; +pub type TimestampNanosecondBuilder = PrimitiveBuilder; +pub type Date32Builder = PrimitiveBuilder; +pub type Date64Builder = PrimitiveBuilder; +pub type Time32SecondBuilder = PrimitiveBuilder; +pub type Time32MillisecondBuilder = PrimitiveBuilder; +pub type Time64MicrosecondBuilder = PrimitiveBuilder; +pub type Time64NanosecondBuilder = PrimitiveBuilder; +pub type IntervalYearMonthBuilder = PrimitiveBuilder; +pub type IntervalDayTimeBuilder = PrimitiveBuilder; +pub type IntervalMonthDayNanoBuilder = PrimitiveBuilder; +pub type DurationSecondBuilder = PrimitiveBuilder; +pub type DurationMillisecondBuilder = PrimitiveBuilder; +pub type DurationMicrosecondBuilder = PrimitiveBuilder; +pub type DurationNanosecondBuilder = PrimitiveBuilder; /// Array builder for fixed-width primitive types #[derive(Debug)] @@ -179,14 +206,14 @@ impl PrimitiveBuilder { #[cfg(test)] mod tests { use super::*; + use arrow_buffer::Buffer; use crate::array::Array; use crate::array::BooleanArray; use crate::array::Date32Array; use crate::array::Int32Array; - use crate::array::Int32Builder; use crate::array::TimestampSecondArray; - use crate::buffer::Buffer; + use crate::builder::Int32Builder; #[test] fn test_primitive_array_builder_i32() { diff --git a/arrow/src/array/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs similarity index 92% rename from arrow/src/array/builder/primitive_dictionary_builder.rs rename to arrow-array/src/builder/primitive_dictionary_builder.rs index 0fd41a181f55..c43416e5af30 100644 --- a/arrow/src/array/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -15,18 +15,15 @@ // specific language governing permissions and limitations // under the License. 
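Editor's sketch exercising one of the `PrimitiveBuilder` aliases defined in the hunk above; the epoch value is arbitrary and the paths assume the `arrow_array::builder` module from this patch.

```rust
// Sketch using a PrimitiveBuilder alias from primitive_builder.rs.
use arrow_array::builder::TimestampSecondBuilder;
use arrow_array::Array;

fn build_timestamps() {
    let mut builder = TimestampSecondBuilder::new();
    builder.append_value(1_662_200_000); // seconds since the Unix epoch
    builder.append_null();
    let array = builder.finish(); // PrimitiveArray<TimestampSecondType>
    assert_eq!(array.len(), 2);
    assert!(array.is_null(1));
}
```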
+use crate::builder::{ArrayBuilder, PrimitiveBuilder}; +use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray}; +use arrow_buffer::{ArrowNativeType, ToByteSlice}; +use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::sync::Arc; -use crate::array::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray}; -use crate::datatypes::{ArrowNativeType, DataType, ToByteSlice}; -use crate::error::{ArrowError, Result}; - -use super::ArrayBuilder; -use super::PrimitiveBuilder; - /// Wraps a type implementing `ToByteSlice` implementing `Hash` and `Eq` for it /// /// This is necessary to handle types such as f32, which don't natively implement these @@ -54,13 +51,12 @@ impl Eq for Value {} /// # Example: /// /// ``` -/// use arrow::array::{ -/// Array, PrimitiveBuilder, PrimitiveDictionaryBuilder, -/// UInt8Array, UInt32Array, -/// }; -/// use arrow::datatypes::{UInt8Type, UInt32Type}; /// -/// let mut builder = PrimitiveDictionaryBuilder::::new(); +/// # use arrow_array::builder::PrimitiveDictionaryBuilder; +/// # use arrow_array::types::{UInt32Type, UInt8Type}; +/// # use arrow_array::{Array, UInt32Array, UInt8Array}; +/// +/// let mut builder = PrimitiveDictionaryBuilder::::new(); /// builder.append(12345678).unwrap(); /// builder.append_null(); /// builder.append(22345678).unwrap(); @@ -175,7 +171,7 @@ where /// if already present in the values array or a new index if the /// value is appended to the values array. #[inline] - pub fn append(&mut self, value: V::Native) -> Result { + pub fn append(&mut self, value: V::Native) -> Result { let key = match self.map.entry(Value(value)) { Entry::Vacant(vacant) => { // Append new value. @@ -223,8 +219,7 @@ mod tests { use crate::array::Array; use crate::array::UInt32Array; use crate::array::UInt8Array; - use crate::datatypes::UInt32Type; - use crate::datatypes::UInt8Type; + use crate::types::{UInt32Type, UInt8Type}; #[test] fn test_primitive_dictionary_builder() { diff --git a/arrow/src/array/builder/string_dictionary_builder.rs b/arrow-array/src/builder/string_dictionary_builder.rs similarity index 94% rename from arrow/src/array/builder/string_dictionary_builder.rs rename to arrow-array/src/builder/string_dictionary_builder.rs index 3816e0be1ddb..bab17d4a9f6e 100644 --- a/arrow/src/array/builder/string_dictionary_builder.rs +++ b/arrow-array/src/builder/string_dictionary_builder.rs @@ -15,12 +15,11 @@ // specific language governing permissions and limitations // under the License. -use super::PrimitiveBuilder; -use crate::array::{ - Array, ArrayBuilder, ArrayRef, DictionaryArray, StringArray, StringBuilder, -}; -use crate::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType}; -use crate::error::{ArrowError, Result}; +use crate::builder::{ArrayBuilder, PrimitiveBuilder, StringBuilder}; +use crate::types::ArrowDictionaryKeyType; +use crate::{Array, ArrayRef, DictionaryArray, StringArray}; +use arrow_buffer::ArrowNativeType; +use arrow_schema::{ArrowError, DataType}; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; use std::any::Any; @@ -31,17 +30,13 @@ use std::sync::Arc; /// arrays or result in an ordered dictionary. /// /// ``` -/// use arrow::{ -/// array::{ -/// Int8Array, StringArray, -/// PrimitiveBuilder, StringBuilder, StringDictionaryBuilder, -/// }, -/// datatypes::Int8Type, -/// }; -/// /// // Create a dictionary array indexed by bytes whose values are Strings. /// // It can thus hold up to 256 distinct string values. 
/// +/// # use arrow_array::builder::StringDictionaryBuilder; +/// # use arrow_array::{Int8Array, StringArray}; +/// # use arrow_array::types::Int8Type; +/// /// let mut builder = StringDictionaryBuilder::::new(); /// /// // The builder builds the dictionary value by value @@ -132,9 +127,8 @@ where /// # Example /// /// ``` - /// use arrow::datatypes::Int16Type; - /// use arrow::array::{StringArray, StringDictionaryBuilder, PrimitiveBuilder, Int16Array}; - /// use std::convert::TryFrom; + /// # use arrow_array::builder::StringDictionaryBuilder; + /// # use arrow_array::{Int16Array, StringArray}; /// /// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]); /// @@ -152,7 +146,7 @@ where pub fn new_with_dictionary( keys_capacity: usize, dictionary_values: &StringArray, - ) -> Result { + ) -> Result { let state = ahash::RandomState::default(); let dict_len = dictionary_values.len(); @@ -239,7 +233,7 @@ where /// value is appended to the values array. /// /// Returns an error if the new index would overflow the key type. - pub fn append(&mut self, value: impl AsRef) -> Result { + pub fn append(&mut self, value: impl AsRef) -> Result { let value = value.as_ref(); let state = &self.state; @@ -312,8 +306,7 @@ mod tests { use crate::array::Array; use crate::array::Int8Array; - use crate::datatypes::Int16Type; - use crate::datatypes::Int8Type; + use crate::types::{Int16Type, Int8Type}; #[test] fn test_string_dictionary_builder() { diff --git a/arrow/src/array/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs similarity index 97% rename from arrow/src/array/builder/struct_builder.rs rename to arrow-array/src/builder/struct_builder.rs index c5db09119e08..cadc8a529f5f 100644 --- a/arrow/src/array/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -15,17 +15,14 @@ // specific language governing permissions and limitations // under the License. +use crate::builder::null_buffer_builder::NullBufferBuilder; +use crate::builder::*; +use crate::{Array, ArrayRef, StructArray}; +use arrow_data::ArrayData; +use arrow_schema::{DataType, Field, IntervalUnit, TimeUnit}; use std::any::Any; -use std::fmt; use std::sync::Arc; -use crate::array::builder::decimal_builder::Decimal128Builder; -use crate::array::*; -use crate::datatypes::DataType; -use crate::datatypes::Field; - -use super::NullBufferBuilder; - /// Array builder for Struct types. /// /// Note that callers should make sure that methods of all the child field builders are @@ -36,8 +33,8 @@ pub struct StructBuilder { null_buffer_builder: NullBufferBuilder, } -impl fmt::Debug for StructBuilder { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl std::fmt::Debug for StructBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("StructBuilder") .field("fields", &self.fields) .field("bitmap_builder", &self.null_buffer_builder) @@ -94,6 +91,7 @@ impl ArrayBuilder for StructBuilder { /// This function is useful to construct arrays from an arbitrary vectors with known/expected /// schema. 
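Editor's sketch of the dictionary builder under the new `arrow_array` paths; the `Int16Type` key is an arbitrary choice, and `append` returning the assigned key follows the string_dictionary_builder.rs hunks above.

```rust
// Sketch: dictionary-encoding repeated strings with an Int16 key.
use arrow_array::builder::StringDictionaryBuilder;
use arrow_array::types::Int16Type;
use arrow_array::Array;

fn build_dictionary() {
    let mut builder = StringDictionaryBuilder::<Int16Type>::new();
    builder.append("a").unwrap(); // returns the key assigned to "a"
    builder.append_null();
    builder.append("a").unwrap(); // re-uses the existing dictionary entry
    let array = builder.finish();
    assert_eq!(array.len(), 3);
}
```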
pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { + use crate::builder::*; match datatype { DataType::Null => unimplemented!(), DataType::Boolean => Box::new(BooleanBuilder::with_capacity(capacity)), @@ -218,7 +216,7 @@ impl StructBuilder { let mut child_data = Vec::with_capacity(self.field_builders.len()); for f in &mut self.field_builders { let arr = f.finish(); - child_data.push(arr.into_data()); + child_data.push(arr.data().clone()); } let length = self.len(); let null_bit_buffer = self.null_buffer_builder.finish(); @@ -248,10 +246,10 @@ impl StructBuilder { #[cfg(test)] mod tests { use super::*; + use arrow_buffer::Buffer; + use arrow_data::Bitmap; use crate::array::Array; - use crate::bitmap::Bitmap; - use crate::buffer::Buffer; #[test] fn test_struct_array_builder() { diff --git a/arrow/src/array/builder/union_builder.rs b/arrow-array/src/builder/union_builder.rs similarity index 92% rename from arrow/src/array/builder/union_builder.rs rename to arrow-array/src/builder/union_builder.rs index c0ae76853dd2..def1e1eca063 100644 --- a/arrow/src/array/builder/union_builder.rs +++ b/arrow-array/src/builder/union_builder.rs @@ -15,24 +15,16 @@ // specific language governing permissions and limitations // under the License. +use crate::builder::buffer_builder::{Int32BufferBuilder, Int8BufferBuilder}; +use crate::builder::null_buffer_builder::NullBufferBuilder; +use crate::builder::BufferBuilder; +use crate::{make_array, ArrowPrimitiveType, UnionArray}; +use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_data::ArrayDataBuilder; +use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; use std::collections::HashMap; -use crate::array::ArrayDataBuilder; -use crate::array::Int32BufferBuilder; -use crate::array::Int8BufferBuilder; -use crate::array::UnionArray; -use crate::buffer::Buffer; - -use crate::datatypes::DataType; -use crate::datatypes::Field; -use crate::datatypes::{ArrowNativeType, ArrowPrimitiveType}; -use crate::error::{ArrowError, Result}; - -use super::{BufferBuilder, NullBufferBuilder}; - -use crate::array::make_array; - /// `FieldData` is a helper struct to track the state of the fields in the `UnionBuilder`. #[derive(Debug)] struct FieldData { @@ -112,8 +104,8 @@ impl FieldData { /// Example: **Dense Memory Layout** /// /// ``` -/// use arrow::array::UnionBuilder; -/// use arrow::datatypes::{Float64Type, Int32Type}; +/// # use arrow_array::builder::UnionBuilder; +/// # use arrow_array::types::{Float64Type, Int32Type}; /// /// let mut builder = UnionBuilder::new_dense(); /// builder.append::("a", 1).unwrap(); @@ -132,8 +124,8 @@ impl FieldData { /// /// Example: **Sparse Memory Layout** /// ``` -/// use arrow::array::UnionBuilder; -/// use arrow::datatypes::{Float64Type, Int32Type}; +/// # use arrow_array::builder::UnionBuilder; +/// # use arrow_array::types::{Float64Type, Int32Type}; /// /// let mut builder = UnionBuilder::new_sparse(); /// builder.append::("a", 1).unwrap(); @@ -203,7 +195,10 @@ impl UnionBuilder { /// is part of the final array, appending a NULL requires /// specifying which field (child) to use. 
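Editor's sketch of driving `make_builder` for a `DataType` known only at runtime, which is how `StructBuilder` constructs its children; the downcast via `as_any_mut` is an assumption based on the `ArrayBuilder` trait and is not part of this patch.

```rust
// Sketch: build a column dynamically, then downcast to the concrete builder.
use arrow_array::builder::{make_builder, ArrayBuilder, Int32Builder};
use arrow_array::Array;
use arrow_schema::DataType;

fn build_dynamic() {
    let mut builder = make_builder(&DataType::Int32, 10);
    let int_builder = builder
        .as_any_mut()
        .downcast_mut::<Int32Builder>()
        .expect("DataType::Int32 yields an Int32Builder");
    int_builder.append_value(42);
    int_builder.append_null();
    let array = builder.finish(); // ArrayRef
    assert_eq!(array.len(), 2);
}
```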
#[inline] - pub fn append_null(&mut self, type_name: &str) -> Result<()> { + pub fn append_null( + &mut self, + type_name: &str, + ) -> Result<(), ArrowError> { self.append_option::(type_name, None) } @@ -213,7 +208,7 @@ impl UnionBuilder { &mut self, type_name: &str, v: T::Native, - ) -> Result<()> { + ) -> Result<(), ArrowError> { self.append_option::(type_name, Some(v)) } @@ -221,7 +216,7 @@ impl UnionBuilder { &mut self, type_name: &str, v: Option, - ) -> Result<()> { + ) -> Result<(), ArrowError> { let type_name = type_name.to_string(); let mut field_data = match self.fields.remove(&type_name) { @@ -278,7 +273,7 @@ impl UnionBuilder { } /// Builds this builder creating a new `UnionArray`. - pub fn build(mut self) -> Result { + pub fn build(mut self) -> Result { let type_id_buffer = self.type_id_builder.finish(); let value_offsets_buffer = self.value_offset_builder.map(|mut b| b.finish()); let mut children = Vec::new(); diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs new file mode 100644 index 000000000000..653836b8d4e4 --- /dev/null +++ b/arrow-array/src/cast.rs @@ -0,0 +1,767 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines helper functions for downcasting [`dyn Array`](Array) to concrete types + +use crate::array::*; +use crate::types::*; + +/// Downcast an [`Array`] to a [`PrimitiveArray`] based on its [`DataType`] +/// accepts a number of subsequent patterns to match the data type +/// +/// ``` +/// # use arrow_array::{Array, downcast_primitive_array, cast::as_string_array}; +/// # use arrow_schema::DataType; +/// +/// fn print_primitive(array: &dyn Array) { +/// downcast_primitive_array!( +/// array => { +/// for v in array { +/// println!("{:?}", v); +/// } +/// } +/// DataType::Utf8 => { +/// for v in as_string_array(array) { +/// println!("{:?}", v); +/// } +/// } +/// t => println!("Unsupported datatype {}", t) +/// ) +/// } +/// ``` +/// +/// [`DataType`]: arrow_schema::DataType +#[macro_export] +macro_rules! 
downcast_primitive_array { + ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { + downcast_primitive_array!($values => {$e} $($p => $fallback)*) + }; + + ($values:ident => $e:block $($p:pat => $fallback:expr $(,)*)*) => { + match $values.data_type() { + arrow_schema::DataType::Int8 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Int8Type, + >($values); + $e + } + arrow_schema::DataType::Int16 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Int16Type, + >($values); + $e + } + arrow_schema::DataType::Int32 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Int32Type, + >($values); + $e + } + arrow_schema::DataType::Int64 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Int64Type, + >($values); + $e + } + arrow_schema::DataType::UInt8 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::UInt8Type, + >($values); + $e + } + arrow_schema::DataType::UInt16 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::UInt16Type, + >($values); + $e + } + arrow_schema::DataType::UInt32 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::UInt32Type, + >($values); + $e + } + arrow_schema::DataType::UInt64 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::UInt64Type, + >($values); + $e + } + arrow_schema::DataType::Float16 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Float16Type, + >($values); + $e + } + arrow_schema::DataType::Float32 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Float32Type, + >($values); + $e + } + arrow_schema::DataType::Float64 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Float64Type, + >($values); + $e + } + arrow_schema::DataType::Date32 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Date32Type, + >($values); + $e + } + arrow_schema::DataType::Date64 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Date64Type, + >($values); + $e + } + arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Time32SecondType, + >($values); + $e + } + arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Time32MillisecondType, + >($values); + $e + } + arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Time64MicrosecondType, + >($values); + $e + } + arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Time64NanosecondType, + >($values); + $e + } + arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::TimestampSecondType, + >($values); + $e + } + arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::TimestampMillisecondType, + >($values); + $e + } + arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::TimestampMicrosecondType, + >($values); + $e + } + arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _) => { + let $values = 
$crate::cast::as_primitive_array::< + $crate::types::TimestampNanosecondType, + >($values); + $e + } + arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::IntervalYearMonthType, + >($values); + $e + } + arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::IntervalDayTimeType, + >($values); + $e + } + arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::IntervalMonthDayNanoType, + >($values); + $e + } + arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Second) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::DurationSecondType, + >($values); + $e + } + arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Millisecond) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::DurationMillisecondType, + >($values); + $e + } + arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Microsecond) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::DurationMicrosecondType, + >($values); + $e + } + arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::DurationNanosecondType, + >($values); + $e + } + $($p => $fallback,)* + } + }; + + (($values1:ident, $values2:ident) => $e:block $($p:pat => $fallback:expr $(,)*)*) => { + match ($values1.data_type(), $values2.data_type()) { + (arrow_schema::DataType::Int8, arrow_schema::DataType::Int8) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Int8Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Int8Type, + >($values2); + $e + } + (arrow_schema::DataType::Int16, arrow_schema::DataType::Int16) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Int16Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Int16Type, + >($values2); + $e + } + (arrow_schema::DataType::Int32, arrow_schema::DataType::Int32) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Int32Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Int32Type, + >($values2); + $e + } + (arrow_schema::DataType::Int64, arrow_schema::DataType::Int64) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Int64Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Int64Type, + >($values2); + $e + } + (arrow_schema::DataType::UInt8, arrow_schema::DataType::UInt8) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::UInt8Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::UInt8Type, + >($values2); + $e + } + (arrow_schema::DataType::UInt16, arrow_schema::DataType::UInt16) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::UInt16Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::UInt16Type, + >($values2); + $e + } + (arrow_schema::DataType::UInt32, arrow_schema::DataType::UInt32) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::UInt32Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::UInt32Type, + >($values2); + $e + } + 
(arrow_schema::DataType::UInt64, arrow_schema::DataType::UInt64) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::UInt64Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::UInt64Type, + >($values2); + $e + } + (arrow_schema::DataType::Float32, arrow_schema::DataType::Float32) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Float32Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Float32Type, + >($values2); + $e + } + (arrow_schema::DataType::Float64, arrow_schema::DataType::Float64) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Float64Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Float64Type, + >($values2); + $e + } + (arrow_schema::DataType::Date32, arrow_schema::DataType::Date32) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Date32Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Date32Type, + >($values2); + $e + } + (arrow_schema::DataType::Date64, arrow_schema::DataType::Date64) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Date64Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Date64Type, + >($values2); + $e + } + (arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second), arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Time32SecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Time32SecondType, + >($values2); + $e + } + (arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond), arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Time32MillisecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Time32MillisecondType, + >($values2); + $e + } + (arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond), arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Time64MicrosecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Time64MicrosecondType, + >($values2); + $e + } + (arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond), arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Time64NanosecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Time64NanosecondType, + >($values2); + $e + } + (arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _), arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::TimestampSecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::TimestampSecondType, + >($values2); + $e + } + (arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _), arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::TimestampMillisecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + 
$crate::types::TimestampMillisecondType, + >($values2); + $e + } + (arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _), arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::TimestampMicrosecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::TimestampMicrosecondType, + >($values2); + $e + } + (arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _), arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::TimestampNanosecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::TimestampNanosecondType, + >($values2); + $e + } + (arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth), arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::IntervalYearMonthType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::IntervalYearMonthType, + >($values2); + $e + } + (arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime), arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::IntervalDayTimeType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::IntervalDayTimeType, + >($values2); + $e + } + (arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano), arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::IntervalMonthDayNanoType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::IntervalMonthDayNanoType, + >($values2); + $e + } + (arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Second), arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Second)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::DurationSecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::DurationSecondType, + >($values2); + $e + } + (arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Millisecond), arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Millisecond)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::DurationMillisecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::DurationMillisecondType, + >($values2); + $e + } + (arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Microsecond), arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Microsecond)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::DurationMicrosecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::DurationMicrosecondType, + >($values2); + $e + } + (arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond), arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::DurationNanosecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::DurationNanosecondType, + >($values2); + $e + } + $($p => $fallback,)* + } + }; +} + +/// Force downcast of an [`Array`], such as 
an [`ArrayRef`], to +/// [`PrimitiveArray`], panic'ing on failure. +/// +/// # Example +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, Int32Array}; +/// # use arrow_array::cast::as_primitive_array; +/// # use arrow_array::types::Int32Type; +/// +/// let arr: ArrayRef = Arc::new(Int32Array::from(vec![Some(1)])); +/// +/// // Downcast an `ArrayRef` to Int32Array / PrimiveArray: +/// let primitive_array: &Int32Array = as_primitive_array(&arr); +/// +/// // Equivalently: +/// let primitive_array = as_primitive_array::(&arr); +/// +/// // This is the equivalent of: +/// let primitive_array = arr +/// .as_any() +/// .downcast_ref::() +/// .unwrap(); +/// ``` + +pub fn as_primitive_array(arr: &dyn Array) -> &PrimitiveArray +where + T: ArrowPrimitiveType, +{ + arr.as_any() + .downcast_ref::>() + .expect("Unable to downcast to primitive array") +} + +/// Downcast an [`Array`] to a [`DictionaryArray`] based on its [`DataType`], accepts +/// a number of subsequent patterns to match the data type +/// +/// ``` +/// # use arrow_array::{Array, StringArray, downcast_dictionary_array, cast::as_string_array}; +/// # use arrow_schema::DataType; +/// +/// fn print_strings(array: &dyn Array) { +/// downcast_dictionary_array!( +/// array => match array.values().data_type() { +/// DataType::Utf8 => { +/// for v in array.downcast_dict::().unwrap() { +/// println!("{:?}", v); +/// } +/// } +/// t => println!("Unsupported dictionary value type {}", t), +/// }, +/// DataType::Utf8 => { +/// for v in as_string_array(array) { +/// println!("{:?}", v); +/// } +/// } +/// t => println!("Unsupported datatype {}", t) +/// ) +/// } +/// ``` +/// +/// [`DataType`]: arrow_schema::DataType +#[macro_export] +macro_rules! downcast_dictionary_array { + ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { + downcast_dictionary_array!($values => {$e} $($p => $fallback)*) + }; + + ($values:ident => $e:block $($p:pat => $fallback:expr $(,)*)*) => { + match $values.data_type() { + arrow_schema::DataType::Dictionary(k, _) => match k.as_ref() { + arrow_schema::DataType::Int8 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::Int8Type, + >($values); + $e + }, + arrow_schema::DataType::Int16 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::Int16Type, + >($values); + $e + }, + arrow_schema::DataType::Int32 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::Int32Type, + >($values); + $e + }, + arrow_schema::DataType::Int64 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::Int64Type, + >($values); + $e + }, + arrow_schema::DataType::UInt8 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::UInt8Type, + >($values); + $e + }, + arrow_schema::DataType::UInt16 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::UInt16Type, + >($values); + $e + }, + arrow_schema::DataType::UInt32 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::UInt32Type, + >($values); + $e + }, + arrow_schema::DataType::UInt64 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::UInt64Type, + >($values); + $e + }, + k => unreachable!("unsupported dictionary key type: {}", k) + } + $($p => $fallback,)* + } + } +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`DictionaryArray`], panic'ing on failure. 
+/// +/// # Example +/// +/// ``` +/// # use arrow_array::{ArrayRef, DictionaryArray}; +/// # use arrow_array::cast::as_dictionary_array; +/// # use arrow_array::types::Int32Type; +/// +/// let arr: DictionaryArray = vec![Some("foo")].into_iter().collect(); +/// let arr: ArrayRef = std::sync::Arc::new(arr); +/// let dict_array: &DictionaryArray = as_dictionary_array::(&arr); +/// ``` +pub fn as_dictionary_array(arr: &dyn Array) -> &DictionaryArray +where + T: ArrowDictionaryKeyType, +{ + arr.as_any() + .downcast_ref::>() + .expect("Unable to downcast to dictionary array") +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`GenericListArray`], panic'ing on failure. +pub fn as_generic_list_array( + arr: &dyn Array, +) -> &GenericListArray { + arr.as_any() + .downcast_ref::>() + .expect("Unable to downcast to list array") +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`ListArray`], panic'ing on failure. +#[inline] +pub fn as_list_array(arr: &dyn Array) -> &ListArray { + as_generic_list_array::(arr) +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`LargeListArray`], panic'ing on failure. +#[inline] +pub fn as_large_list_array(arr: &dyn Array) -> &LargeListArray { + as_generic_list_array::(arr) +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`GenericBinaryArray`], panic'ing on failure. +#[inline] +pub fn as_generic_binary_array( + arr: &dyn Array, +) -> &GenericBinaryArray { + arr.as_any() + .downcast_ref::>() + .expect("Unable to downcast to binary array") +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`StringArray`], panic'ing on failure. +/// +/// # Example +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::cast::as_string_array; +/// # use arrow_array::{ArrayRef, StringArray}; +/// +/// let arr: ArrayRef = Arc::new(StringArray::from_iter(vec![Some("foo")])); +/// let string_array = as_string_array(&arr); +/// ``` +pub fn as_string_array(arr: &dyn Array) -> &StringArray { + arr.as_any() + .downcast_ref::() + .expect("Unable to downcast to StringArray") +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`BooleanArray`], panic'ing on failure. +/// +/// # Example +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, BooleanArray}; +/// # use arrow_array::cast::as_boolean_array; +/// +/// let arr: ArrayRef = Arc::new(BooleanArray::from_iter(vec![Some(true)])); +/// let boolean_array = as_boolean_array(&arr); +/// ``` +pub fn as_boolean_array(arr: &dyn Array) -> &BooleanArray { + arr.as_any() + .downcast_ref::() + .expect("Unable to downcast to BooleanArray") +} + +macro_rules! 
array_downcast_fn { + ($name: ident, $arrty: ty, $arrty_str:expr) => { + #[doc = "Force downcast of an [`Array`], such as an [`ArrayRef`] to "] + #[doc = $arrty_str] + pub fn $name(arr: &dyn Array) -> &$arrty { + arr.as_any().downcast_ref::<$arrty>().expect(concat!( + "Unable to downcast to typed array through ", + stringify!($name) + )) + } + }; + + // use recursive macro to generate dynamic doc string for a given array type + ($name: ident, $arrty: ty) => { + array_downcast_fn!( + $name, + $arrty, + concat!("[`", stringify!($arrty), "`], panic'ing on failure.") + ); + }; +} + +array_downcast_fn!(as_largestring_array, LargeStringArray); +array_downcast_fn!(as_null_array, NullArray); +array_downcast_fn!(as_struct_array, StructArray); +array_downcast_fn!(as_union_array, UnionArray); +array_downcast_fn!(as_map_array, MapArray); +array_downcast_fn!(as_decimal_array, Decimal128Array); + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::*; + + #[test] + fn test_as_decimal_array_ref() { + let array: Decimal128Array = vec![Some(123), None, Some(1111)] + .into_iter() + .collect::() + .with_precision_and_scale(10, 2) + .unwrap(); + assert!(!as_decimal_array(&array).is_empty()); + let result_decimal = as_decimal_array(&array); + assert_eq!(result_decimal, &array); + } + + #[test] + fn test_as_primitive_array_ref() { + let array: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect(); + assert!(!as_primitive_array::(&array).is_empty()); + + // should also work when wrapped in an Arc + let array: ArrayRef = Arc::new(array); + assert!(!as_primitive_array::(&array).is_empty()); + } + + #[test] + fn test_as_string_array_ref() { + let array: StringArray = vec!["foo", "bar"].into_iter().map(Some).collect(); + assert!(!as_string_array(&array).is_empty()); + + // should also work when wrapped in an Arc + let array: ArrayRef = Arc::new(array); + assert!(!as_string_array(&array).is_empty()) + } +} diff --git a/arrow/src/util/decimal.rs b/arrow-array/src/decimal.rs similarity index 95% rename from arrow/src/util/decimal.rs rename to arrow-array/src/decimal.rs index 421942df5c1b..323281d9233c 100644 --- a/arrow/src/util/decimal.rs +++ b/arrow-array/src/decimal.rs @@ -15,15 +15,12 @@ // specific language governing permissions and limitations // under the License. -//! Decimal related utils - -use crate::datatypes::{ - DataType, Decimal128Type, Decimal256Type, DecimalType, DECIMAL256_MAX_PRECISION, - DECIMAL_DEFAULT_SCALE, -}; -use crate::error::{ArrowError, Result}; -use num::bigint::BigInt; -use num::Signed; +//! Decimal related utilities, types and functions + +use crate::types::{Decimal128Type, Decimal256Type, DecimalType}; +use arrow_data::decimal::{DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE}; +use arrow_schema::{ArrowError, DataType}; +use num::{BigInt, Signed}; use std::cmp::{min, Ordering}; /// [`Decimal`] is the generic representation of a single decimal value @@ -76,7 +73,11 @@ impl Decimal { /// Safety: /// This method doesn't validate if the decimal value represented by the bytes /// can be fitted into the specified precision. - pub fn try_new_from_bytes(precision: u8, scale: u8, bytes: &T::Native) -> Result + pub fn try_new_from_bytes( + precision: u8, + scale: u8, + bytes: &T::Native, + ) -> Result where Self: Sized, { @@ -203,8 +204,7 @@ pub type Decimal128 = Decimal; impl Decimal128 { /// Creates `Decimal128` from an `i128` value. 
- #[allow(dead_code)] - pub(crate) fn new_from_i128(precision: u8, scale: u8, value: i128) -> Self { + pub fn new_from_i128(precision: u8, scale: u8, value: i128) -> Self { Decimal128 { precision, scale, @@ -230,7 +230,11 @@ pub type Decimal256 = Decimal; impl Decimal256 { /// Constructs a `Decimal256` value from a `BigInt`. - pub fn from_big_int(num: &BigInt, precision: u8, scale: u8) -> Result { + pub fn from_big_int( + num: &BigInt, + precision: u8, + scale: u8, + ) -> Result { let mut bytes = if num.is_negative() { [255_u8; 32] } else { @@ -242,7 +246,7 @@ impl Decimal256 { } /// Constructs a `BigInt` from this `Decimal256` value. - pub(crate) fn to_big_int(self) -> BigInt { + pub fn to_big_int(self) -> BigInt { BigInt::from_signed_bytes_le(&self.value) } } diff --git a/arrow/src/datatypes/delta.rs b/arrow-array/src/delta.rs similarity index 100% rename from arrow/src/datatypes/delta.rs rename to arrow-array/src/delta.rs diff --git a/arrow/src/array/iterator.rs b/arrow-array/src/iterator.rs similarity index 88% rename from arrow/src/array/iterator.rs rename to arrow-array/src/iterator.rs index e64712fa883a..25727e0d75fb 100644 --- a/arrow/src/array/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -15,14 +15,13 @@ // specific language governing permissions and limitations // under the License. -use crate::array::array::ArrayAccessor; -use crate::array::{DecimalArray, FixedSizeBinaryArray}; -use crate::datatypes::{Decimal128Type, Decimal256Type}; +//! Idiomatic iterators for [`Array`](crate::Array) -use super::{ - BooleanArray, GenericBinaryArray, GenericListArray, GenericStringArray, - PrimitiveArray, +use crate::array::{ + ArrayAccessor, BooleanArray, DecimalArray, FixedSizeBinaryArray, GenericBinaryArray, + GenericListArray, GenericStringArray, PrimitiveArray, }; +use crate::types::{Decimal128Type, Decimal256Type}; /// An iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`] /// @@ -36,39 +35,14 @@ use super::{ /// on every index of the array, and handle the null mask separately. For [`PrimitiveArray`] /// this functionality is provided by [`compute::unary`] /// -/// ``` -/// # use arrow::array::PrimitiveArray; -/// # use arrow::compute::unary; -/// # use arrow::datatypes::Int32Type; -/// -/// fn add(a: &PrimitiveArray, b: i32) -> PrimitiveArray { -/// unary(a, |a| a + b) -/// } -/// ``` -/// /// If performing a fallible operation, it isn't possible to perform the operation independently /// of the null mask, as this might result in a spurious failure on a null index. 
However, /// there are more efficient ways to iterate over just the non-null indices, this functionality /// is provided by [`compute::try_unary`] /// -/// ``` -/// # use arrow::array::PrimitiveArray; -/// # use arrow::compute::try_unary; -/// # use arrow::datatypes::Int32Type; -/// # use arrow::error::{ArrowError, Result}; -/// -/// fn checked_add(a: &PrimitiveArray, b: i32) -> Result> { -/// try_unary(a, |a| { -/// a.checked_add(b).ok_or_else(|| { -/// ArrowError::CastError(format!("overflow adding {} to {}", a, b)) -/// }) -/// }) -/// } -/// ``` -/// -/// [`PrimitiveArray`]: [crate::array::PrimitiveArray] -/// [`compute::unary`]: [crate::compute::unary] -/// [`compute::try_unary`]: [crate::compute::try_unary] +/// [`PrimitiveArray`]: [crate::PrimitiveArray] +/// [`compute::unary`]: [arrow::compute::unary] +/// [`compute::try_unary`]: [arrow::compute::try_unary] #[derive(Debug)] pub struct ArrayIter { array: T, diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs new file mode 100644 index 000000000000..16e46f68ba07 --- /dev/null +++ b/arrow-array/src/lib.rs @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! The central type in Apache Arrow are arrays, which are a known-length sequence of values +//! all having the same type. This module provides concrete implementations of each type, as +//! well as an [`Array`] trait that can be used for type-erasure. +//! +//! # Downcasting an Array +//! +//! Arrays are often passed around as a dynamically typed [`&dyn Array`] or [`ArrayRef`]. +//! For example, [`RecordBatch`](`crate::RecordBatch`) stores columns as [`ArrayRef`]. +//! +//! Whilst these arrays can be passed directly to the [`compute`], [`csv`], [`json`], etc... APIs, +//! it is often the case that you wish to interact with the data directly. +//! +//! This requires downcasting to the concrete type of the array: +//! +//! ``` +//! # use arrow_array::{Array, Float32Array, Int32Array}; +//! +//! fn sum_int32(array: &dyn Array) -> i32 { +//! let integers: &Int32Array = array.as_any().downcast_ref().unwrap(); +//! integers.iter().map(|val| val.unwrap_or_default()).sum() +//! } +//! +//! // Note: the values for positions corresponding to nulls will be arbitrary +//! fn as_f32_slice(array: &dyn Array) -> &[f32] { +//! array.as_any().downcast_ref::().unwrap().values() +//! } +//! ``` +//! +//! Additionally, there are convenient functions to do this casting +//! such as [`cast::as_primitive_array`] and [`cast::as_string_array`]: +//! +//! ``` +//! # use arrow_array::Array; +//! # use arrow_array::cast::as_primitive_array; +//! # use arrow_array::types::Float32Type; +//! +//! fn as_f32_slice(array: &dyn Array) -> &[f32] { +//! // use as_primtive_array +//! as_primitive_array::(array).values() +//! } +//! ``` + +//! 
# Building an Array +//! +//! Most [`Array`] implementations can be constructed directly from iterators or [`Vec`] +//! +//! ``` +//! # use arrow_array::{Int32Array, ListArray, StringArray}; +//! # use arrow_array::types::Int32Type; +//! +//! Int32Array::from(vec![1, 2]); +//! Int32Array::from(vec![Some(1), None]); +//! Int32Array::from_iter([1, 2, 3, 4]); +//! Int32Array::from_iter([Some(1), Some(2), None, Some(4)]); +//! +//! StringArray::from(vec!["foo", "bar"]); +//! StringArray::from(vec![Some("foo"), None]); +//! StringArray::from_iter([Some("foo"), None]); +//! StringArray::from_iter_values(["foo", "bar"]); +//! +//! ListArray::from_iter_primitive::([ +//! Some(vec![Some(1), None, Some(3)]), +//! None, +//! Some(vec![]) +//! ]); +//! ``` +//! +//! Additionally [`ArrayBuilder`](builder::ArrayBuilder) implementations can be +//! used to construct arrays with a push-based interface +//! +//! ``` +//! # use arrow_array::Int16Array; +//! # +//! // Create a new builder with a capacity of 100 +//! let mut builder = Int16Array::builder(100); +//! +//! // Append a single primitive value +//! builder.append_value(1); +//! +//! // Append a null value +//! builder.append_null(); +//! +//! // Append a slice of primitive values +//! builder.append_slice(&[2, 3, 4]); +//! +//! // Build the array +//! let array = builder.finish(); +//! +//! assert_eq!( +//! 5, +//! array.len(), +//! "The array has 5 values, counting the null value" +//! ); +//! +//! assert_eq!(2, array.value(2), "Get the value with index 2"); +//! +//! assert_eq!( +//! &array.values()[3..5], +//! &[3, 4], +//! "Get slice of len 2 starting at idx 3" +//! ) +//! ``` +//! +//! # Zero-Copy Slicing +//! +//! Given an [`Array`] of arbitrary length, it is possible to create an owned slice of this +//! data. Internally this just increments some ref-counts, and so is incredibly cheap +//! +//! ```rust +//! # use std::sync::Arc; +//! # use arrow_array::{ArrayRef, Int32Array}; +//! let array = Arc::new(Int32Array::from_iter([1, 2, 3])) as ArrayRef; +//! +//! // Slice with offset 1 and length 2 +//! let sliced = array.slice(1, 2); +//! let ints = sliced.as_any().downcast_ref::().unwrap(); +//! assert_eq!(ints.values(), &[2, 3]); +//! ``` +//! +//! # Internal Representation +//! +//! Internally, arrays are represented by one or several [`Buffer`], the number and meaning of +//! which depend on the array’s data type, as documented in the [Arrow specification]. +//! +//! For example, the type [`Int16Array`] represents an array of 16-bit integers and consists of: +//! +//! * An optional [`Bitmap`] identifying any null values +//! * A contiguous [`Buffer`] of 16-bit integers +//! +//! Similarly, the type [`StringArray`] represents an array of UTF-8 strings and consists of: +//! +//! * An optional [`Bitmap`] identifying any null values +//! * An offsets [`Buffer`] of 32-bit integers identifying valid UTF-8 sequences within the values buffer +//! * A values [`Buffer`] of UTF-8 encoded string data +//! +//! [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html +//! [`&dyn Array`]: Array +//! [`Bitmap`]: arrow_data::Bitmap +//! [`Buffer`]: arrow_buffer::Buffer +//! [`compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html +//! [`json`]: https://docs.rs/arrow/latest/arrow/json/index.html +//! 
[`csv`]: https://docs.rs/arrow/latest/arrow/csv/index.html + +pub mod array; +pub use array::*; + +mod record_batch; +pub use record_batch::{RecordBatch, RecordBatchOptions}; + +pub mod builder; +pub mod cast; +pub mod decimal; +mod delta; +pub mod iterator; +mod raw_pointer; +pub mod temporal_conversions; +mod trusted_len; +pub mod types; + +#[cfg(test)] +mod tests { + use crate::builder::*; + + #[test] + fn test_buffer_builder_availability() { + let _builder = Int8BufferBuilder::new(10); + let _builder = Int16BufferBuilder::new(10); + let _builder = Int32BufferBuilder::new(10); + let _builder = Int64BufferBuilder::new(10); + let _builder = UInt16BufferBuilder::new(10); + let _builder = UInt32BufferBuilder::new(10); + let _builder = Float32BufferBuilder::new(10); + let _builder = Float64BufferBuilder::new(10); + let _builder = TimestampSecondBufferBuilder::new(10); + let _builder = TimestampMillisecondBufferBuilder::new(10); + let _builder = TimestampMicrosecondBufferBuilder::new(10); + let _builder = TimestampNanosecondBufferBuilder::new(10); + let _builder = Date32BufferBuilder::new(10); + let _builder = Date64BufferBuilder::new(10); + let _builder = Time32SecondBufferBuilder::new(10); + let _builder = Time32MillisecondBufferBuilder::new(10); + let _builder = Time64MicrosecondBufferBuilder::new(10); + let _builder = Time64NanosecondBufferBuilder::new(10); + let _builder = IntervalYearMonthBufferBuilder::new(10); + let _builder = IntervalDayTimeBufferBuilder::new(10); + let _builder = IntervalMonthDayNanoBufferBuilder::new(10); + let _builder = DurationSecondBufferBuilder::new(10); + let _builder = DurationMillisecondBufferBuilder::new(10); + let _builder = DurationMicrosecondBufferBuilder::new(10); + let _builder = DurationNanosecondBufferBuilder::new(10); + } +} diff --git a/arrow/src/array/raw_pointer.rs b/arrow-array/src/raw_pointer.rs similarity index 95% rename from arrow/src/array/raw_pointer.rs rename to arrow-array/src/raw_pointer.rs index 1016b808bc5a..3e4233ea1b24 100644 --- a/arrow/src/array/raw_pointer.rs +++ b/arrow-array/src/raw_pointer.rs @@ -18,8 +18,8 @@ use std::ptr::NonNull; /// This struct is highly `unsafe` and offers the possibility to -/// self-reference a [crate::buffer::Buffer] from -/// [crate::array::ArrayData], as a pointer to the beginning of its +/// self-reference a [arrow_buffer::Buffer] from +/// [arrow_data::ArrayData], as a pointer to the beginning of its /// contents. pub(super) struct RawPtrBox { ptr: NonNull, diff --git a/arrow/src/record_batch.rs b/arrow-array/src/record_batch.rs similarity index 91% rename from arrow/src/record_batch.rs rename to arrow-array/src/record_batch.rs index f71c67fe7746..58462449ea31 100644 --- a/arrow/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -16,16 +16,14 @@ // under the License. //! A two-dimensional batch of column-oriented data with a defined -//! [schema](crate::datatypes::Schema). +//! [schema](arrow_schema::Schema). +use crate::{new_empty_array, Array, ArrayRef, StructArray}; +use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef}; use std::sync::Arc; -use crate::array::*; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; - /// A two-dimensional batch of column-oriented data with a defined -/// [schema](crate::datatypes::Schema). +/// [schema](arrow_schema::Schema). /// /// A `RecordBatch` is a two-dimensional dataset of a number of /// contiguous arrays, each the same length. 
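The hunks above relocate `RecordBatch` into the standalone `arrow-array` crate, and its doc comment states the core invariant: a set of contiguous arrays, each the same length, matching the schema. Before the remaining `record_batch.rs` hunks, here is a minimal sketch of that invariant in use, assuming only the `arrow_array` and `arrow_schema` crates introduced by this patch series:

```
use std::sync::Arc;
use arrow_array::{ArrayRef, Int32Array, RecordBatch, StringArray};
use arrow_schema::{DataType, Field, Schema};

// Two fields and two equal-length columns: construction succeeds.
let schema = Arc::new(Schema::new(vec![
    Field::new("id", DataType::Int32, false),
    Field::new("name", DataType::Utf8, false),
]));
let id: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
let name: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));
let batch = RecordBatch::try_new(schema.clone(), vec![id.clone(), name]).unwrap();
assert_eq!(batch.num_rows(), 3);

// Supplying fewer columns than the schema declares is rejected with an
// InvalidArgumentError (see the check in try_new_impl in the next hunk),
// rather than panicking.
assert!(RecordBatch::try_new(schema, vec![id]).is_err());
```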
@@ -34,8 +32,6 @@ use crate::error::{ArrowError, Result}; /// /// Record batches are a convenient unit of work for various /// serialization and computation functions, possibly incremental. -/// See also [CSV reader](crate::csv::Reader) and -/// [JSON reader](crate::json::Reader). #[derive(Clone, Debug, PartialEq)] pub struct RecordBatch { schema: SchemaRef, @@ -61,12 +57,10 @@ impl RecordBatch { /// # Example /// /// ``` - /// use std::sync::Arc; - /// use arrow::array::Int32Array; - /// use arrow::datatypes::{Schema, Field, DataType}; - /// use arrow::record_batch::RecordBatch; + /// # use std::sync::Arc; + /// # use arrow_array::{Int32Array, RecordBatch}; + /// # use arrow_schema::{DataType, Field, Schema}; /// - /// # fn main() -> arrow::error::Result<()> { /// let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]); /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false) @@ -75,11 +69,12 @@ impl RecordBatch { /// let batch = RecordBatch::try_new( /// Arc::new(schema), /// vec![Arc::new(id_array)] - /// )?; - /// # Ok(()) - /// # } + /// ).unwrap(); /// ``` - pub fn try_new(schema: SchemaRef, columns: Vec) -> Result { + pub fn try_new( + schema: SchemaRef, + columns: Vec, + ) -> Result { let options = RecordBatchOptions::new(); Self::try_new_impl(schema, columns, &options) } @@ -92,7 +87,7 @@ impl RecordBatch { schema: SchemaRef, columns: Vec, options: &RecordBatchOptions, - ) -> Result { + ) -> Result { Self::try_new_impl(schema, columns, options) } @@ -117,7 +112,7 @@ impl RecordBatch { schema: SchemaRef, columns: Vec, options: &RecordBatchOptions, - ) -> Result { + ) -> Result { // check that number of fields in schema match column length if schema.fields().len() != columns.len() { return Err(ArrowError::InvalidArgumentError(format!( @@ -191,13 +186,13 @@ impl RecordBatch { }) } - /// Returns the [`Schema`](crate::datatypes::Schema) of the record batch. + /// Returns the [`Schema`](arrow_schema::Schema) of the record batch. 
pub fn schema(&self) -> SchemaRef { self.schema.clone() } /// Projects the schema onto the specified columns - pub fn project(&self, indices: &[usize]) -> Result { + pub fn project(&self, indices: &[usize]) -> Result { let projected_schema = self.schema.project(indices)?; let batch_fields = indices .iter() @@ -210,7 +205,7 @@ impl RecordBatch { )) }) }) - .collect::>>()?; + .collect::, _>>()?; RecordBatch::try_new_with_options( SchemaRef::new(projected_schema), @@ -227,22 +222,18 @@ impl RecordBatch { /// # Example /// /// ``` - /// use std::sync::Arc; - /// use arrow::array::Int32Array; - /// use arrow::datatypes::{Schema, Field, DataType}; - /// use arrow::record_batch::RecordBatch; + /// # use std::sync::Arc; + /// # use arrow_array::{Int32Array, RecordBatch}; + /// # use arrow_schema::{DataType, Field, Schema}; /// - /// # fn main() -> arrow::error::Result<()> { /// let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]); /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false) /// ]); /// - /// let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)])?; + /// let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)]).unwrap(); /// /// assert_eq!(batch.num_columns(), 1); - /// # Ok(()) - /// # } /// ``` pub fn num_columns(&self) -> usize { self.columns.len() @@ -253,22 +244,18 @@ impl RecordBatch { /// # Example /// /// ``` - /// use std::sync::Arc; - /// use arrow::array::Int32Array; - /// use arrow::datatypes::{Schema, Field, DataType}; - /// use arrow::record_batch::RecordBatch; + /// # use std::sync::Arc; + /// # use arrow_array::{Int32Array, RecordBatch}; + /// # use arrow_schema::{DataType, Field, Schema}; /// - /// # fn main() -> arrow::error::Result<()> { /// let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]); /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false) /// ]); /// - /// let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)])?; + /// let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)]).unwrap(); /// /// assert_eq!(batch.num_rows(), 5); - /// # Ok(()) - /// # } /// ``` pub fn num_rows(&self) -> usize { self.row_count @@ -322,10 +309,8 @@ impl RecordBatch { /// /// Example: /// ``` - /// use std::sync::Arc; - /// use arrow::array::{ArrayRef, Int32Array, StringArray}; - /// use arrow::datatypes::{Schema, Field, DataType}; - /// use arrow::record_batch::RecordBatch; + /// # use std::sync::Arc; + /// # use arrow_array::{ArrayRef, Int32Array, RecordBatch, StringArray}; /// /// let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2])); /// let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b"])); @@ -335,7 +320,7 @@ impl RecordBatch { /// ("b", b), /// ]); /// ``` - pub fn try_from_iter(value: I) -> Result + pub fn try_from_iter(value: I) -> Result where I: IntoIterator, F: AsRef, @@ -359,10 +344,8 @@ impl RecordBatch { /// /// Example: /// ``` - /// use std::sync::Arc; - /// use arrow::array::{ArrayRef, Int32Array, StringArray}; - /// use arrow::datatypes::{Schema, Field, DataType}; - /// use arrow::record_batch::RecordBatch; + /// # use std::sync::Arc; + /// # use arrow_array::{ArrayRef, Int32Array, RecordBatch, StringArray}; /// /// let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2])); /// let b: ArrayRef = Arc::new(StringArray::from(vec![Some("a"), Some("b")])); @@ -374,7 +357,7 @@ impl RecordBatch { /// ("b", b, true), /// ]); /// ``` - pub fn try_from_iter_with_nullable(value: I) -> Result + pub fn try_from_iter_with_nullable(value: 
I) -> Result where I: IntoIterator, F: AsRef, @@ -394,12 +377,6 @@ impl RecordBatch { let schema = Arc::new(Schema::new(fields)); RecordBatch::try_new(schema, columns) } - - /// Concatenates `batches` together into a single record batch. - #[deprecated(note = "please use arrow::compute::concat_batches")] - pub fn concat(schema: &SchemaRef, batches: &[Self]) -> Result { - crate::compute::concat_batches(schema, batches) - } } /// Options that control the behaviour used when creating a [`RecordBatch`]. @@ -469,29 +446,14 @@ impl From for StructArray { } } -/// Trait for types that can read `RecordBatch`'s. -pub trait RecordBatchReader: Iterator> { - /// Returns the schema of this `RecordBatchReader`. - /// - /// Implementation of this trait should guarantee that all `RecordBatch`'s returned by this - /// reader should have the same schema as returned from this method. - fn schema(&self) -> SchemaRef; - - /// Reads the next `RecordBatch`. - #[deprecated( - since = "2.0.0", - note = "This method is deprecated in favour of `next` from the trait Iterator." - )] - fn next_batch(&mut self) -> Result> { - self.next().transpose() - } -} - #[cfg(test)] mod tests { use super::*; - - use crate::buffer::Buffer; + use crate::{ + BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray, + }; + use arrow_buffer::{Buffer, ToByteSlice}; + use arrow_data::ArrayDataBuilder; #[test] fn create_record_batch() { diff --git a/arrow/src/temporal_conversions.rs b/arrow-array/src/temporal_conversions.rs similarity index 71% rename from arrow/src/temporal_conversions.rs rename to arrow-array/src/temporal_conversions.rs index 14fa82f6e7dc..4a371fc788e9 100644 --- a/arrow/src/temporal_conversions.rs +++ b/arrow-array/src/temporal_conversions.rs @@ -17,21 +17,23 @@ //! Conversion methods for dates and times. 
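The `temporal_conversions.rs` hunk that follows makes the time-unit constants public and adds generic `as_datetime`, `as_date`, `as_time` and `as_duration` helpers that dispatch on an `ArrowPrimitiveType`'s `DATA_TYPE`. A small usage sketch of those new helpers, assuming the `Date32Type` and `Time32SecondType` markers from `arrow_array::types`:

```
use arrow_array::temporal_conversions::{as_datetime, as_time};
use arrow_array::types::{Date32Type, Time32SecondType};

// Date32 counts days since the UNIX epoch, so day 0 maps to 1970-01-01 00:00:00.
let dt = as_datetime::<Date32Type>(0).unwrap();
assert_eq!(dt.to_string(), "1970-01-01 00:00:00");

// Time32(Second) counts seconds since midnight.
let t = as_time::<Time32SecondType>(61).unwrap();
assert_eq!(t.to_string(), "00:01:01");

// Types with no natural NaiveDateTime mapping return None instead of panicking.
assert!(as_datetime::<Time32SecondType>(0).is_none());
```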
-use chrono::{Duration, NaiveDateTime, NaiveTime}; +use crate::ArrowPrimitiveType; +use arrow_schema::{DataType, TimeUnit}; +use chrono::{Duration, NaiveDate, NaiveDateTime, NaiveTime}; /// Number of seconds in a day -pub(crate) const SECONDS_IN_DAY: i64 = 86_400; +pub const SECONDS_IN_DAY: i64 = 86_400; /// Number of milliseconds in a second -pub(crate) const MILLISECONDS: i64 = 1_000; +pub const MILLISECONDS: i64 = 1_000; /// Number of microseconds in a second -pub(crate) const MICROSECONDS: i64 = 1_000_000; +pub const MICROSECONDS: i64 = 1_000_000; /// Number of nanoseconds in a second -pub(crate) const NANOSECONDS: i64 = 1_000_000_000; +pub const NANOSECONDS: i64 = 1_000_000_000; /// Number of milliseconds in a day -pub(crate) const MILLISECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MILLISECONDS; +pub const MILLISECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MILLISECONDS; /// Number of days between 0001-01-01 and 1970-01-01 -pub(crate) const EPOCH_DAYS_FROM_CE: i32 = 719_163; +pub const EPOCH_DAYS_FROM_CE: i32 = 719_163; /// converts a `i32` representing a `date32` to [`NaiveDateTime`] #[inline] @@ -167,6 +169,66 @@ pub fn duration_ns_to_duration(v: i64) -> Duration { Duration::nanoseconds(v) } +/// Converts an [`ArrowPrimitiveType`] to [`NaiveDateTime`] +pub fn as_datetime(v: i64) -> Option { + match T::DATA_TYPE { + DataType::Date32 => Some(date32_to_datetime(v as i32)), + DataType::Date64 => Some(date64_to_datetime(v)), + DataType::Time32(_) | DataType::Time64(_) => None, + DataType::Timestamp(unit, _) => match unit { + TimeUnit::Second => Some(timestamp_s_to_datetime(v)), + TimeUnit::Millisecond => Some(timestamp_ms_to_datetime(v)), + TimeUnit::Microsecond => Some(timestamp_us_to_datetime(v)), + TimeUnit::Nanosecond => Some(timestamp_ns_to_datetime(v)), + }, + // interval is not yet fully documented [ARROW-3097] + DataType::Interval(_) => None, + _ => None, + } +} + +/// Converts an [`ArrowPrimitiveType`] to [`NaiveDate`] +pub fn as_date(v: i64) -> Option { + as_datetime::(v).map(|datetime| datetime.date()) +} + +/// Converts an [`ArrowPrimitiveType`] to [`NaiveTime`] +pub fn as_time(v: i64) -> Option { + match T::DATA_TYPE { + DataType::Time32(unit) => { + // safe to immediately cast to u32 as `self.value(i)` is positive i32 + let v = v as u32; + match unit { + TimeUnit::Second => Some(time32s_to_time(v as i32)), + TimeUnit::Millisecond => Some(time32ms_to_time(v as i32)), + _ => None, + } + } + DataType::Time64(unit) => match unit { + TimeUnit::Microsecond => Some(time64us_to_time(v)), + TimeUnit::Nanosecond => Some(time64ns_to_time(v)), + _ => None, + }, + DataType::Timestamp(_, _) => as_datetime::(v).map(|datetime| datetime.time()), + DataType::Date32 | DataType::Date64 => Some(NaiveTime::from_hms(0, 0, 0)), + DataType::Interval(_) => None, + _ => None, + } +} + +/// Converts an [`ArrowPrimitiveType`] to [`Duration`] +pub fn as_duration(v: i64) -> Option { + match T::DATA_TYPE { + DataType::Duration(unit) => match unit { + TimeUnit::Second => Some(duration_s_to_duration(v)), + TimeUnit::Millisecond => Some(duration_ms_to_duration(v)), + TimeUnit::Microsecond => Some(duration_us_to_duration(v)), + TimeUnit::Nanosecond => Some(duration_ns_to_duration(v)), + }, + _ => None, + } +} + #[cfg(test)] mod tests { use crate::temporal_conversions::{ diff --git a/arrow/src/util/trusted_len.rs b/arrow-array/src/trusted_len.rs similarity index 96% rename from arrow/src/util/trusted_len.rs rename to arrow-array/src/trusted_len.rs index 84a66238b634..fdec18b78781 100644 --- a/arrow/src/util/trusted_len.rs 
+++ b/arrow-array/src/trusted_len.rs @@ -15,11 +15,7 @@ // specific language governing permissions and limitations // under the License. -use super::bit_util; -use crate::{ - buffer::{Buffer, MutableBuffer}, - datatypes::ArrowNativeType, -}; +use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; /// Creates two [`Buffer`]s from an iterator of `Option`. /// The first buffer corresponds to a bitmap buffer, the second one diff --git a/arrow/src/datatypes/types.rs b/arrow-array/src/types.rs similarity index 98% rename from arrow/src/datatypes/types.rs rename to arrow-array/src/types.rs index 1b7d0675bb43..581fdc767c24 100644 --- a/arrow/src/datatypes/types.rs +++ b/arrow-array/src/types.rs @@ -15,12 +15,15 @@ // specific language governing permissions and limitations // under the License. -use super::{ArrowPrimitiveType, DataType, IntervalUnit, TimeUnit}; -use crate::datatypes::delta::shift_months; -use crate::datatypes::{ +//! Zero-sized types used to parameterize generic array implementations + +use crate::array::ArrowPrimitiveType; +use crate::delta::shift_months; +use arrow_data::decimal::{ DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, }; +use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use chrono::{Duration, NaiveDate}; use half::f16; use std::ops::{Add, Sub}; diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index edfe2c680daf..2e74f0cf66b4 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use pyo3::prelude::*; use pyo3::wrap_pyfunction; -use arrow::array::{ArrayData, ArrayRef, Int64Array, make_array}; +use arrow::array::{Array, ArrayData, ArrayRef, Int64Array, make_array}; use arrow::compute::kernels; use arrow::datatypes::{DataType, Field, Schema}; use arrow::error::ArrowError; @@ -51,7 +51,7 @@ fn double(array: &PyAny, py: Python) -> PyResult { let array = kernels::arithmetic::add(array, array).map_err(to_py_err)?; // export - array.to_pyarrow(py) + array.data().to_pyarrow(py) } /// calls a lambda function that receives and returns an array @@ -63,7 +63,7 @@ fn double_py(lambda: &PyAny, py: Python) -> PyResult { let expected = Arc::new(Int64Array::from(vec![Some(2), None, Some(6)])) as ArrayRef; // to py - let pyarray = array.to_pyarrow(py)?; + let pyarray = array.data().to_pyarrow(py)?; let pyarray = lambda.call1((pyarray,))?; let array = make_array(ArrayData::from_pyarrow(pyarray)?); diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 9605cdda720b..60fe3c6ca9a0 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -22,6 +22,9 @@ use std::hash::Hash; use crate::error::ArrowError; use crate::field::Field; +/// A reference-counted reference to a [`Schema`]. +pub type SchemaRef = std::sync::Arc; + /// Describes the meta-data of an ordered sequence of relative types. 
/// /// Note that this information is only part of the meta-data and not part of the physical diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index f29c4e317914..48b7f39547ef 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -47,6 +47,7 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] arrow-buffer = { version = "23.0.0", path = "../arrow-buffer" } arrow-data = { version = "23.0.0", path = "../arrow-data" } arrow-schema = { version = "23.0.0", path = "../arrow-schema" } +arrow-array = { version = "23.0.0", path = "../arrow-array" } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } indexmap = { version = "1.9", default-features = false, features = ["std"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } diff --git a/arrow/src/array/cast.rs b/arrow/src/array/cast.rs deleted file mode 100644 index 2c8366ff5f9c..000000000000 --- a/arrow/src/array/cast.rs +++ /dev/null @@ -1,761 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines helper functions for force [`Array`] downcasts - -use crate::array::*; -use crate::datatypes::*; - -/// Downcast an [`Array`] to a [`PrimitiveArray`] based on its [`DataType`], accepts -/// a number of subsequent patterns to match the data type -/// -/// ``` -/// # use arrow::downcast_primitive_array; -/// # use arrow::array::Array; -/// # use arrow::datatypes::DataType; -/// # use arrow::array::as_string_array; -/// -/// fn print_primitive(array: &dyn Array) { -/// downcast_primitive_array!( -/// array => { -/// for v in array { -/// println!("{:?}", v); -/// } -/// } -/// DataType::Utf8 => { -/// for v in as_string_array(array) { -/// println!("{:?}", v); -/// } -/// } -/// t => println!("Unsupported datatype {}", t) -/// ) -/// } -/// ``` -/// -#[macro_export] -macro_rules! 
downcast_primitive_array { - ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { - downcast_primitive_array!($values => {$e} $($p => $fallback)*) - }; - - ($values:ident => $e:block $($p:pat => $fallback:expr $(,)*)*) => { - match $values.data_type() { - $crate::datatypes::DataType::Int8 => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::Int8Type, - >($values); - $e - } - $crate::datatypes::DataType::Int16 => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::Int16Type, - >($values); - $e - } - $crate::datatypes::DataType::Int32 => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::Int32Type, - >($values); - $e - } - $crate::datatypes::DataType::Int64 => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::Int64Type, - >($values); - $e - } - $crate::datatypes::DataType::UInt8 => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::UInt8Type, - >($values); - $e - } - $crate::datatypes::DataType::UInt16 => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::UInt16Type, - >($values); - $e - } - $crate::datatypes::DataType::UInt32 => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::UInt32Type, - >($values); - $e - } - $crate::datatypes::DataType::UInt64 => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::UInt64Type, - >($values); - $e - } - $crate::datatypes::DataType::Float16 => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::Float16Type, - >($values); - $e - } - $crate::datatypes::DataType::Float32 => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::Float32Type, - >($values); - $e - } - $crate::datatypes::DataType::Float64 => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::Float64Type, - >($values); - $e - } - $crate::datatypes::DataType::Date32 => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::Date32Type, - >($values); - $e - } - $crate::datatypes::DataType::Date64 => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::Date64Type, - >($values); - $e - } - $crate::datatypes::DataType::Time32($crate::datatypes::TimeUnit::Second) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::Time32SecondType, - >($values); - $e - } - $crate::datatypes::DataType::Time32($crate::datatypes::TimeUnit::Millisecond) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::Time32MillisecondType, - >($values); - $e - } - $crate::datatypes::DataType::Time64($crate::datatypes::TimeUnit::Microsecond) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::Time64MicrosecondType, - >($values); - $e - } - $crate::datatypes::DataType::Time64($crate::datatypes::TimeUnit::Nanosecond) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::Time64NanosecondType, - >($values); - $e - } - $crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Second, _) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::TimestampSecondType, - >($values); - $e - } - $crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Millisecond, _) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::TimestampMillisecondType, - >($values); - $e - } - $crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Microsecond, _) => 
{ - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::TimestampMicrosecondType, - >($values); - $e - } - $crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Nanosecond, _) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::TimestampNanosecondType, - >($values); - $e - } - $crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::YearMonth) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::IntervalYearMonthType, - >($values); - $e - } - $crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::DayTime) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::IntervalDayTimeType, - >($values); - $e - } - $crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::MonthDayNano) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::IntervalMonthDayNanoType, - >($values); - $e - } - $crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Second) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::DurationSecondType, - >($values); - $e - } - $crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Millisecond) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::DurationMillisecondType, - >($values); - $e - } - $crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Microsecond) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::DurationMicrosecondType, - >($values); - $e - } - $crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Nanosecond) => { - let $values = $crate::array::as_primitive_array::< - $crate::datatypes::DurationNanosecondType, - >($values); - $e - } - $($p => $fallback,)* - } - }; - - (($values1:ident, $values2:ident) => $e:block $($p:pat => $fallback:expr $(,)*)*) => { - match ($values1.data_type(), $values2.data_type()) { - ($crate::datatypes::DataType::Int8, $crate::datatypes::DataType::Int8) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::Int8Type, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::Int8Type, - >($values2); - $e - } - ($crate::datatypes::DataType::Int16, $crate::datatypes::DataType::Int16) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::Int16Type, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::Int16Type, - >($values2); - $e - } - ($crate::datatypes::DataType::Int32, $crate::datatypes::DataType::Int32) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::Int32Type, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::Int32Type, - >($values2); - $e - } - ($crate::datatypes::DataType::Int64, $crate::datatypes::DataType::Int64) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::Int64Type, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::Int64Type, - >($values2); - $e - } - ($crate::datatypes::DataType::UInt8, $crate::datatypes::DataType::UInt8) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::UInt8Type, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::UInt8Type, - >($values2); - $e - } - ($crate::datatypes::DataType::UInt16, $crate::datatypes::DataType::UInt16) => { - let $values1 = $crate::array::as_primitive_array::< 
- $crate::datatypes::UInt16Type, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::UInt16Type, - >($values2); - $e - } - ($crate::datatypes::DataType::UInt32, $crate::datatypes::DataType::UInt32) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::UInt32Type, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::UInt32Type, - >($values2); - $e - } - ($crate::datatypes::DataType::UInt64, $crate::datatypes::DataType::UInt64) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::UInt64Type, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::UInt64Type, - >($values2); - $e - } - ($crate::datatypes::DataType::Float32, $crate::datatypes::DataType::Float32) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::Float32Type, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::Float32Type, - >($values2); - $e - } - ($crate::datatypes::DataType::Float64, $crate::datatypes::DataType::Float64) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::Float64Type, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::Float64Type, - >($values2); - $e - } - ($crate::datatypes::DataType::Date32, $crate::datatypes::DataType::Date32) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::Date32Type, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::Date32Type, - >($values2); - $e - } - ($crate::datatypes::DataType::Date64, $crate::datatypes::DataType::Date64) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::Date64Type, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::Date64Type, - >($values2); - $e - } - ($crate::datatypes::DataType::Time32($crate::datatypes::TimeUnit::Second), $crate::datatypes::DataType::Time32($crate::datatypes::TimeUnit::Second)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::Time32SecondType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::Time32SecondType, - >($values2); - $e - } - ($crate::datatypes::DataType::Time32($crate::datatypes::TimeUnit::Millisecond), $crate::datatypes::DataType::Time32($crate::datatypes::TimeUnit::Millisecond)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::Time32MillisecondType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::Time32MillisecondType, - >($values2); - $e - } - ($crate::datatypes::DataType::Time64($crate::datatypes::TimeUnit::Microsecond), $crate::datatypes::DataType::Time64($crate::datatypes::TimeUnit::Microsecond)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::Time64MicrosecondType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::Time64MicrosecondType, - >($values2); - $e - } - ($crate::datatypes::DataType::Time64($crate::datatypes::TimeUnit::Nanosecond), $crate::datatypes::DataType::Time64($crate::datatypes::TimeUnit::Nanosecond)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::Time64NanosecondType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::Time64NanosecondType, - >($values2); - $e - } - 
($crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Second, _), $crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Second, _)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::TimestampSecondType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::TimestampSecondType, - >($values2); - $e - } - ($crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Millisecond, _), $crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Millisecond, _)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::TimestampMillisecondType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::TimestampMillisecondType, - >($values2); - $e - } - ($crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Microsecond, _), $crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Microsecond, _)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::TimestampMicrosecondType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::TimestampMicrosecondType, - >($values2); - $e - } - ($crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Nanosecond, _), $crate::datatypes::DataType::Timestamp($crate::datatypes::TimeUnit::Nanosecond, _)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::TimestampNanosecondType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::TimestampNanosecondType, - >($values2); - $e - } - ($crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::YearMonth), $crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::YearMonth)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::IntervalYearMonthType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::IntervalYearMonthType, - >($values2); - $e - } - ($crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::DayTime), $crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::DayTime)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::IntervalDayTimeType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::IntervalDayTimeType, - >($values2); - $e - } - ($crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::MonthDayNano), $crate::datatypes::DataType::Interval($crate::datatypes::IntervalUnit::MonthDayNano)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::IntervalMonthDayNanoType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::IntervalMonthDayNanoType, - >($values2); - $e - } - ($crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Second), $crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Second)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::DurationSecondType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::DurationSecondType, - >($values2); - $e - } - ($crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Millisecond), $crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Millisecond)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::DurationMillisecondType, - >($values1); 
- let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::DurationMillisecondType, - >($values2); - $e - } - ($crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Microsecond), $crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Microsecond)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::DurationMicrosecondType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::DurationMicrosecondType, - >($values2); - $e - } - ($crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Nanosecond), $crate::datatypes::DataType::Duration($crate::datatypes::TimeUnit::Nanosecond)) => { - let $values1 = $crate::array::as_primitive_array::< - $crate::datatypes::DurationNanosecondType, - >($values1); - let $values2 = $crate::array::as_primitive_array::< - $crate::datatypes::DurationNanosecondType, - >($values2); - $e - } - $($p => $fallback,)* - } - }; -} - -/// Force downcast of an [`Array`], such as an [`ArrayRef`], to -/// [`PrimitiveArray`], panic'ing on failure. -/// -/// # Example -/// -/// ``` -/// # use arrow::array::*; -/// # use arrow::datatypes::*; -/// # use std::sync::Arc; -/// let arr: ArrayRef = Arc::new(Int32Array::from(vec![Some(1)])); -/// -/// // Downcast an `ArrayRef` to Int32Array / PrimiveArray: -/// let primitive_array: &Int32Array = as_primitive_array(&arr); -/// -/// // Equivalently: -/// let primitive_array = as_primitive_array::(&arr); -/// -/// // This is the equivalent of: -/// let primitive_array = arr -/// .as_any() -/// .downcast_ref::() -/// .unwrap(); -/// ``` - -pub fn as_primitive_array(arr: &dyn Array) -> &PrimitiveArray -where - T: ArrowPrimitiveType, -{ - arr.as_any() - .downcast_ref::>() - .expect("Unable to downcast to primitive array") -} - -/// Downcast an [`Array`] to a [`DictionaryArray`] based on its [`DataType`], accepts -/// a number of subsequent patterns to match the data type -/// -/// ``` -/// # use arrow::downcast_dictionary_array; -/// # use arrow::array::{Array, StringArray}; -/// # use arrow::datatypes::DataType; -/// # use arrow::array::as_string_array; -/// -/// fn print_strings(array: &dyn Array) { -/// downcast_dictionary_array!( -/// array => match array.values().data_type() { -/// DataType::Utf8 => { -/// for v in array.downcast_dict::().unwrap() { -/// println!("{:?}", v); -/// } -/// } -/// t => println!("Unsupported dictionary value type {}", t), -/// }, -/// DataType::Utf8 => { -/// for v in as_string_array(array) { -/// println!("{:?}", v); -/// } -/// } -/// t => println!("Unsupported datatype {}", t) -/// ) -/// } -/// ``` -#[macro_export] -macro_rules! 
downcast_dictionary_array { - ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { - downcast_dictionary_array!($values => {$e} $($p => $fallback)*) - }; - - ($values:ident => $e:block $($p:pat => $fallback:expr $(,)*)*) => { - match $values.data_type() { - $crate::datatypes::DataType::Dictionary(k, _) => match k.as_ref() { - $crate::datatypes::DataType::Int8 => { - let $values = $crate::array::as_dictionary_array::< - $crate::datatypes::Int8Type, - >($values); - $e - }, - $crate::datatypes::DataType::Int16 => { - let $values = $crate::array::as_dictionary_array::< - $crate::datatypes::Int16Type, - >($values); - $e - }, - $crate::datatypes::DataType::Int32 => { - let $values = $crate::array::as_dictionary_array::< - $crate::datatypes::Int32Type, - >($values); - $e - }, - $crate::datatypes::DataType::Int64 => { - let $values = $crate::array::as_dictionary_array::< - $crate::datatypes::Int64Type, - >($values); - $e - }, - $crate::datatypes::DataType::UInt8 => { - let $values = $crate::array::as_dictionary_array::< - $crate::datatypes::UInt8Type, - >($values); - $e - }, - $crate::datatypes::DataType::UInt16 => { - let $values = $crate::array::as_dictionary_array::< - $crate::datatypes::UInt16Type, - >($values); - $e - }, - $crate::datatypes::DataType::UInt32 => { - let $values = $crate::array::as_dictionary_array::< - $crate::datatypes::UInt32Type, - >($values); - $e - }, - $crate::datatypes::DataType::UInt64 => { - let $values = $crate::array::as_dictionary_array::< - $crate::datatypes::UInt64Type, - >($values); - $e - }, - k => unreachable!("unsupported dictionary key type: {}", k) - } - $($p => $fallback,)* - } - } -} - -/// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`DictionaryArray`], panic'ing on failure. -/// -/// # Example -/// -/// ``` -/// # use arrow::array::*; -/// # use arrow::datatypes::*; -/// # use std::sync::Arc; -/// let arr: DictionaryArray = vec![Some("foo")].into_iter().collect(); -/// let arr: ArrayRef = std::sync::Arc::new(arr); -/// let dict_array: &DictionaryArray = as_dictionary_array::(&arr); -/// ``` -pub fn as_dictionary_array(arr: &dyn Array) -> &DictionaryArray -where - T: ArrowDictionaryKeyType, -{ - arr.as_any() - .downcast_ref::>() - .expect("Unable to downcast to dictionary array") -} - -/// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`GenericListArray`], panic'ing on failure. -pub fn as_generic_list_array( - arr: &dyn Array, -) -> &GenericListArray { - arr.as_any() - .downcast_ref::>() - .expect("Unable to downcast to list array") -} - -/// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`ListArray`], panic'ing on failure. -#[inline] -pub fn as_list_array(arr: &dyn Array) -> &ListArray { - as_generic_list_array::(arr) -} - -/// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`LargeListArray`], panic'ing on failure. -#[inline] -pub fn as_large_list_array(arr: &dyn Array) -> &LargeListArray { - as_generic_list_array::(arr) -} - -/// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`GenericBinaryArray`], panic'ing on failure. -#[inline] -pub fn as_generic_binary_array( - arr: &dyn Array, -) -> &GenericBinaryArray { - arr.as_any() - .downcast_ref::>() - .expect("Unable to downcast to binary array") -} - -/// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`StringArray`], panic'ing on failure. 
-/// -/// # Example -/// -/// ``` -/// # use arrow::array::*; -/// # use std::sync::Arc; -/// let arr: ArrayRef = Arc::new(StringArray::from_iter(vec![Some("foo")])); -/// let string_array = as_string_array(&arr); -/// ``` -pub fn as_string_array(arr: &dyn Array) -> &StringArray { - arr.as_any() - .downcast_ref::() - .expect("Unable to downcast to StringArray") -} - -/// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`BooleanArray`], panic'ing on failure. -/// -/// # Example -/// -/// ``` -/// # use arrow::array::*; -/// # use std::sync::Arc; -/// let arr: ArrayRef = Arc::new(BooleanArray::from_iter(vec![Some(true)])); -/// let boolean_array = as_boolean_array(&arr); -/// ``` -pub fn as_boolean_array(arr: &dyn Array) -> &BooleanArray { - arr.as_any() - .downcast_ref::() - .expect("Unable to downcast to BooleanArray") -} - -macro_rules! array_downcast_fn { - ($name: ident, $arrty: ty, $arrty_str:expr) => { - #[doc = "Force downcast of an [`Array`], such as an [`ArrayRef`] to "] - #[doc = $arrty_str] - pub fn $name(arr: &dyn Array) -> &$arrty { - arr.as_any().downcast_ref::<$arrty>().expect(concat!( - "Unable to downcast to typed array through ", - stringify!($name) - )) - } - }; - - // use recursive macro to generate dynamic doc string for a given array type - ($name: ident, $arrty: ty) => { - array_downcast_fn!( - $name, - $arrty, - concat!("[`", stringify!($arrty), "`], panic'ing on failure.") - ); - }; -} - -array_downcast_fn!(as_largestring_array, LargeStringArray); -array_downcast_fn!(as_null_array, NullArray); -array_downcast_fn!(as_struct_array, StructArray); -array_downcast_fn!(as_union_array, UnionArray); -array_downcast_fn!(as_map_array, MapArray); -array_downcast_fn!(as_decimal_array, Decimal128Array); - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use super::*; - - #[test] - fn test_as_decimal_array_ref() { - let array: Decimal128Array = vec![Some(123), None, Some(1111)] - .into_iter() - .collect::() - .with_precision_and_scale(10, 2) - .unwrap(); - assert!(!as_decimal_array(&array).is_empty()); - let result_decimal = as_decimal_array(&array); - assert_eq!(result_decimal, &array); - } - - #[test] - fn test_as_primitive_array_ref() { - let array: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect(); - assert!(!as_primitive_array::(&array).is_empty()); - - // should also work when wrapped in an Arc - let array: ArrayRef = Arc::new(array); - assert!(!as_primitive_array::(&array).is_empty()); - } - - #[test] - fn test_as_string_array_ref() { - let array: StringArray = vec!["foo", "bar"].into_iter().map(Some).collect(); - assert!(!as_string_array(&array).is_empty()); - - // should also work when wrapped in an Arc - let array: ArrayRef = Arc::new(array); - assert!(!as_string_array(&array).is_empty()) - } -} diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 32a1da17f848..10009f5abde9 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -15,629 +15,26 @@ // specific language governing permissions and limitations // under the License. -//! The central type in Apache Arrow are arrays, which are a known-length sequence of values -//! all having the same type. This module provides concrete implementations of each type, as -//! well as an [`Array`] trait that can be used for type-erasure. -//! -//! # Downcasting an Array -//! -//! Arrays are often passed around as a dynamically typed [`&dyn Array`] or [`ArrayRef`]. -//! For example, [`RecordBatch`](`crate::record_batch::RecordBatch`) stores columns as [`ArrayRef`]. -//! -//! 
Whilst these arrays can be passed directly to the -//! [`compute`](crate::compute), [`csv`](crate::csv), -//! [`json`](crate::json), etc... APIs, it is often the case that you -//! wish to interact with the data directly. This requires downcasting -//! to the concrete type of the array: -//! -//! ``` -//! # use arrow::array::{Array, Float32Array, Int32Array}; -//! # -//! fn sum_int32(array: &dyn Array) -> i32 { -//! let integers: &Int32Array = array.as_any().downcast_ref().unwrap(); -//! integers.iter().map(|val| val.unwrap_or_default()).sum() -//! } -//! -//! // Note: the values for positions corresponding to nulls will be arbitrary -//! fn as_f32_slice(array: &dyn Array) -> &[f32] { -//! array.as_any().downcast_ref::().unwrap().values() -//! } -//! ``` -//! -//! Additionally, there are convenient functions to do this casting -//! such as [`as_primitive_array`] and [`as_string_array`]: -//! -//! ``` -//! # use arrow::array::*; -//! # use arrow::datatypes::*; -//! # -//! fn as_f32_slice(array: &dyn Array) -> &[f32] { -//! // use as_primtive_array -//! as_primitive_array::(array).values() -//! } -//! ``` +//! Re-exports APIs from [arrow_array] -//! # Building an Array -//! -//! Most [`Array`] implementations can be constructed directly from iterators or [`Vec`] -//! -//! ``` -//! # use arrow::array::Int32Array; -//! # use arrow::array::StringArray; -//! # use arrow::array::ListArray; -//! # use arrow::datatypes::Int32Type; -//! # -//! Int32Array::from(vec![1, 2]); -//! Int32Array::from(vec![Some(1), None]); -//! Int32Array::from_iter([1, 2, 3, 4]); -//! Int32Array::from_iter([Some(1), Some(2), None, Some(4)]); -//! -//! StringArray::from(vec!["foo", "bar"]); -//! StringArray::from(vec![Some("foo"), None]); -//! StringArray::from_iter([Some("foo"), None]); -//! StringArray::from_iter_values(["foo", "bar"]); -//! -//! ListArray::from_iter_primitive::([ -//! Some(vec![Some(1), None, Some(3)]), -//! None, -//! Some(vec![]) -//! ]); -//! ``` -//! -//! Additionally [`ArrayBuilder`](crate::array::ArrayBuilder) implementations can be -//! used to construct arrays with a push-based interface -//! -//! ``` -//! # use arrow::array::Int16Array; -//! # -//! // Create a new builder with a capacity of 100 -//! let mut builder = Int16Array::builder(100); -//! -//! // Append a single primitive value -//! builder.append_value(1); -//! -//! // Append a null value -//! builder.append_null(); -//! -//! // Append a slice of primitive values -//! builder.append_slice(&[2, 3, 4]); -//! -//! // Build the array -//! let array = builder.finish(); -//! -//! assert_eq!( -//! 5, -//! array.len(), -//! "The array has 5 values, counting the null value" -//! ); -//! -//! assert_eq!(2, array.value(2), "Get the value with index 2"); -//! -//! assert_eq!( -//! &array.values()[3..5], -//! &[3, 4], -//! "Get slice of len 2 starting at idx 3" -//! ) -//! ``` -//! -//! # Zero-Copy Slicing -//! -//! Given an [`Array`] of arbitrary length, it is possible to create an owned slice of this -//! data. Internally this just increments some ref-counts, and so is incredibly cheap -//! -//! ```rust -//! # use std::sync::Arc; -//! # use arrow::array::{Array, Int32Array, ArrayRef}; -//! let array = Arc::new(Int32Array::from_iter([1, 2, 3])) as ArrayRef; -//! -//! // Slice with offset 1 and length 2 -//! let sliced = array.slice(1, 2); -//! let ints = sliced.as_any().downcast_ref::().unwrap(); -//! assert_eq!(ints.values(), &[2, 3]); -//! ``` -//! -//! # Internal Representation -//! -//! 
Internally, arrays are represented by one or several [`Buffer`], the number and meaning of -//! which depend on the array’s data type, as documented in the [Arrow specification]. -//! -//! For example, the type `Int16Array` represents an array of 16-bit integers and consists of: -//! -//! * An optional [`Bitmap`] identifying any null values -//! * A contiguous [`Buffer`] of 16-bit integers -//! -//! Similarly, the type `StringArray` represents an array of UTF-8 strings and consists of: -//! -//! * An optional [`Bitmap`] identifying any null values -//! * An offsets [`Buffer`] of 32-bit integers identifying valid UTF-8 sequences within the values buffer -//! * A values [`Buffer`] of UTF-8 encoded string data -//! -//! [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html -//! [`&dyn Array`]: Array -//! [`Bitmap`]: crate::bitmap::Bitmap -//! [`Buffer`]: crate::buffer::Buffer - -#[allow(clippy::module_inception)] -mod array; -mod array_binary; -mod array_boolean; -mod array_decimal; -mod array_dictionary; -mod array_fixed_size_binary; -mod array_fixed_size_list; -mod array_list; -mod array_map; -mod array_primitive; -mod array_string; -mod array_struct; -mod array_union; -mod builder; -mod cast; #[cfg(feature = "ffi")] mod ffi; -mod iterator; -mod null; mod ord; -mod raw_pointer; - -use crate::datatypes::*; // --------------------- Array & ArrayData --------------------- - -pub use self::array::Array; -pub use self::array::ArrayAccessor; -pub use self::array::ArrayRef; +pub use arrow_array::array::*; +pub use arrow_array::builder::*; +pub use arrow_array::cast::*; +pub use arrow_array::iterator::*; pub use arrow_data::{ layout, ArrayData, ArrayDataBuilder, ArrayDataRef, BufferSpec, DataTypeLayout, }; -pub use self::array_binary::BinaryArray; -pub use self::array_binary::LargeBinaryArray; -pub use self::array_boolean::BooleanArray; -pub use self::array_decimal::Decimal128Array; -pub use self::array_decimal::Decimal256Array; -pub use self::array_decimal::DecimalArray; -pub use self::array_fixed_size_binary::FixedSizeBinaryArray; -pub use self::array_fixed_size_list::FixedSizeListArray; - -pub use self::array_dictionary::{DictionaryArray, TypedDictionaryArray}; -pub use self::array_list::LargeListArray; -pub use self::array_list::ListArray; -pub use self::array_map::MapArray; -pub use self::array_primitive::PrimitiveArray; -pub use self::array_string::LargeStringArray; -pub use self::array_string::StringArray; -pub use self::array_struct::StructArray; -pub use self::array_union::UnionArray; -pub use self::null::NullArray; - -pub use self::array::make_array; -pub use self::array::new_empty_array; -pub use self::array::new_null_array; - -pub(crate) use self::array_primitive::{as_datetime, as_time}; - -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::Int8Array; -/// let arr : Int8Array = [Some(1), Some(2)].into_iter().collect(); -/// ``` -pub type Int8Array = PrimitiveArray; -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::Int16Array; -/// let arr : Int16Array = [Some(1), Some(2)].into_iter().collect(); -/// ``` -pub type Int16Array = PrimitiveArray; -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::Int32Array; -/// let arr : Int32Array = [Some(1), Some(2)].into_iter().collect(); -/// ``` -pub type Int32Array = PrimitiveArray; -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::Int64Array; -/// let arr : Int64Array = [Some(1), Some(2)].into_iter().collect(); -/// ``` -pub type Int64Array = 
PrimitiveArray; -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::UInt8Array; -/// let arr : UInt8Array = [Some(1), Some(2)].into_iter().collect(); -/// ``` -pub type UInt8Array = PrimitiveArray; -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::UInt16Array; -/// let arr : UInt16Array = [Some(1), Some(2)].into_iter().collect(); -/// ``` -pub type UInt16Array = PrimitiveArray; -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::UInt32Array; -/// let arr : UInt32Array = [Some(1), Some(2)].into_iter().collect(); -/// ``` -pub type UInt32Array = PrimitiveArray; -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::UInt64Array; -/// let arr : UInt64Array = [Some(1), Some(2)].into_iter().collect(); -/// ``` -pub type UInt64Array = PrimitiveArray; -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::Float16Array; -/// use half::f16; -/// let arr : Float16Array = [Some(f16::from_f64(1.0)), Some(f16::from_f64(2.0))].into_iter().collect(); -/// ``` -pub type Float16Array = PrimitiveArray; -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::Float32Array; -/// let arr : Float32Array = [Some(1.0), Some(2.0)].into_iter().collect(); -/// ``` -pub type Float32Array = PrimitiveArray; -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::Float64Array; -/// let arr : Float64Array = [Some(1.0), Some(2.0)].into_iter().collect(); -/// ``` -pub type Float64Array = PrimitiveArray; - -/// -/// A dictionary array where each element is a single value indexed by an integer key. -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::{Array, Int8DictionaryArray, Int8Array, StringArray}; -/// # use std::sync::Arc; -/// -/// let array: Int8DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); -/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); -/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2])); -/// assert_eq!(array.values(), &values); -/// ``` -pub type Int8DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::{Array, Int16DictionaryArray, Int16Array, StringArray}; -/// # use std::sync::Arc; -/// -/// let array: Int16DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); -/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); -/// assert_eq!(array.keys(), &Int16Array::from(vec![0, 0, 1, 2])); -/// assert_eq!(array.values(), &values); -/// ``` -pub type Int16DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::{Array, Int32DictionaryArray, Int32Array, StringArray}; -/// # use std::sync::Arc; -/// -/// let array: Int32DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); -/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); -/// assert_eq!(array.keys(), &Int32Array::from(vec![0, 0, 1, 2])); -/// assert_eq!(array.values(), &values); -/// ``` -pub type Int32DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. 
-/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::{Array, Int64DictionaryArray, Int64Array, StringArray}; -/// # use std::sync::Arc; -/// -/// let array: Int64DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); -/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); -/// assert_eq!(array.keys(), &Int64Array::from(vec![0, 0, 1, 2])); -/// assert_eq!(array.values(), &values); -/// ``` -pub type Int64DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::{Array, UInt8DictionaryArray, UInt8Array, StringArray}; -/// # use std::sync::Arc; -/// -/// let array: UInt8DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); -/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); -/// assert_eq!(array.keys(), &UInt8Array::from(vec![0, 0, 1, 2])); -/// assert_eq!(array.values(), &values); -/// ``` -pub type UInt8DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::{Array, UInt16DictionaryArray, UInt16Array, StringArray}; -/// # use std::sync::Arc; -/// -/// let array: UInt16DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); -/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); -/// assert_eq!(array.keys(), &UInt16Array::from(vec![0, 0, 1, 2])); -/// assert_eq!(array.values(), &values); -/// ``` -pub type UInt16DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::{Array, UInt32DictionaryArray, UInt32Array, StringArray}; -/// # use std::sync::Arc; -/// -/// let array: UInt32DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); -/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); -/// assert_eq!(array.keys(), &UInt32Array::from(vec![0, 0, 1, 2])); -/// assert_eq!(array.values(), &values); -/// ``` -pub type UInt32DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. -/// -/// # Example: Using `collect` -/// ``` -/// # use arrow::array::{Array, UInt64DictionaryArray, UInt64Array, StringArray}; -/// # use std::sync::Arc; -/// -/// let array: UInt64DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); -/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); -/// assert_eq!(array.keys(), &UInt64Array::from(vec![0, 0, 1, 2])); -/// assert_eq!(array.values(), &values); -/// ``` -pub type UInt64DictionaryArray = DictionaryArray; -/// -/// A primitive array where each element is of type [TimestampSecondType]. -/// See also [`Timestamp`](crate::datatypes::DataType::Timestamp). 
-/// -/// # Example: UTC timestamps post epoch -/// ``` -/// # use arrow::array::TimestampSecondArray; -/// use chrono::FixedOffset; -/// // Corresponds to single element array with entry 1970-05-09T14:25:11+0:00 -/// let arr = TimestampSecondArray::from_vec(vec![11111111], None); -/// // OR -/// let arr = TimestampSecondArray::from_opt_vec(vec![Some(11111111)], None); -/// let utc_offset = FixedOffset::east(0); -/// -/// assert_eq!(arr.value_as_datetime_with_tz(0, utc_offset).map(|v| v.to_string()).unwrap(), "1970-05-09 14:25:11") -/// ``` -/// -/// # Example: UTC timestamps pre epoch -/// ``` -/// # use arrow::array::TimestampSecondArray; -/// use chrono::FixedOffset; -/// // Corresponds to single element array with entry 1969-08-25T09:34:49+0:00 -/// let arr = TimestampSecondArray::from_vec(vec![-11111111], None); -/// // OR -/// let arr = TimestampSecondArray::from_opt_vec(vec![Some(-11111111)], None); -/// let utc_offset = FixedOffset::east(0); -/// -/// assert_eq!(arr.value_as_datetime_with_tz(0, utc_offset).map(|v| v.to_string()).unwrap(), "1969-08-25 09:34:49") -/// ``` -/// -/// # Example: With timezone specified -/// ``` -/// # use arrow::array::TimestampSecondArray; -/// use chrono::FixedOffset; -/// // Corresponds to single element array with entry 1970-05-10T00:25:11+10:00 -/// let arr = TimestampSecondArray::from_vec(vec![11111111], Some("+10:00".to_string())); -/// // OR -/// let arr = TimestampSecondArray::from_opt_vec(vec![Some(11111111)], Some("+10:00".to_string())); -/// let sydney_offset = FixedOffset::east(10 * 60 * 60); -/// -/// assert_eq!(arr.value_as_datetime_with_tz(0, sydney_offset).map(|v| v.to_string()).unwrap(), "1970-05-10 00:25:11") -/// ``` -/// -pub type TimestampSecondArray = PrimitiveArray; -/// A primitive array where each element is of type `TimestampMillisecondType.` -/// See examples for [`TimestampSecondArray.`](crate::array::TimestampSecondArray) -pub type TimestampMillisecondArray = PrimitiveArray; -/// A primitive array where each element is of type `TimestampMicrosecondType.` -/// See examples for [`TimestampSecondArray.`](crate::array::TimestampSecondArray) -pub type TimestampMicrosecondArray = PrimitiveArray; -/// A primitive array where each element is of type `TimestampNanosecondType.` -/// See examples for [`TimestampSecondArray.`](crate::array::TimestampSecondArray) -pub type TimestampNanosecondArray = PrimitiveArray; -pub type Date32Array = PrimitiveArray; -pub type Date64Array = PrimitiveArray; -pub type Time32SecondArray = PrimitiveArray; -pub type Time32MillisecondArray = PrimitiveArray; -pub type Time64MicrosecondArray = PrimitiveArray; -pub type Time64NanosecondArray = PrimitiveArray; -pub type IntervalYearMonthArray = PrimitiveArray; -pub type IntervalDayTimeArray = PrimitiveArray; -pub type IntervalMonthDayNanoArray = PrimitiveArray; -pub type DurationSecondArray = PrimitiveArray; -pub type DurationMillisecondArray = PrimitiveArray; -pub type DurationMicrosecondArray = PrimitiveArray; -pub type DurationNanosecondArray = PrimitiveArray; - -pub use self::array_binary::GenericBinaryArray; -pub use self::array_list::GenericListArray; -pub use self::array_list::OffsetSizeTrait; -pub use self::array_string::GenericStringArray; - -// --------------------- Array Builder --------------------- - -pub use self::builder::ArrayBuilder; -pub use self::builder::BinaryBuilder; -pub use self::builder::BooleanBufferBuilder; -pub use self::builder::BooleanBuilder; -pub use self::builder::BufferBuilder; -pub use self::builder::Decimal128Builder; -pub 
use self::builder::Decimal256Builder; - -#[deprecated(note = "Please use `Decimal128Builder` instead")] -pub type DecimalBuilder = Decimal128Builder; - -pub use self::builder::FixedSizeBinaryBuilder; -pub use self::builder::FixedSizeListBuilder; -pub use self::builder::GenericListBuilder; -pub use self::builder::GenericStringBuilder; -pub use self::builder::LargeBinaryBuilder; -pub use self::builder::LargeListBuilder; -pub use self::builder::LargeStringBuilder; -pub use self::builder::ListBuilder; -pub use self::builder::MapBuilder; -pub use self::builder::PrimitiveBuilder; -pub use self::builder::PrimitiveDictionaryBuilder; -pub use self::builder::StringBuilder; -pub use self::builder::StringDictionaryBuilder; -pub use self::builder::StructBuilder; -pub use self::builder::UnionBuilder; - -pub use self::builder::make_builder; - -pub type Int8BufferBuilder = BufferBuilder; -pub type Int16BufferBuilder = BufferBuilder; -pub type Int32BufferBuilder = BufferBuilder; -pub type Int64BufferBuilder = BufferBuilder; -pub type UInt8BufferBuilder = BufferBuilder; -pub type UInt16BufferBuilder = BufferBuilder; -pub type UInt32BufferBuilder = BufferBuilder; -pub type UInt64BufferBuilder = BufferBuilder; -pub type Float32BufferBuilder = BufferBuilder; -pub type Float64BufferBuilder = BufferBuilder; - -pub type TimestampSecondBufferBuilder = - BufferBuilder<::Native>; -pub type TimestampMillisecondBufferBuilder = - BufferBuilder<::Native>; -pub type TimestampMicrosecondBufferBuilder = - BufferBuilder<::Native>; -pub type TimestampNanosecondBufferBuilder = - BufferBuilder<::Native>; -pub type Date32BufferBuilder = BufferBuilder<::Native>; -pub type Date64BufferBuilder = BufferBuilder<::Native>; -pub type Time32SecondBufferBuilder = - BufferBuilder<::Native>; -pub type Time32MillisecondBufferBuilder = - BufferBuilder<::Native>; -pub type Time64MicrosecondBufferBuilder = - BufferBuilder<::Native>; -pub type Time64NanosecondBufferBuilder = - BufferBuilder<::Native>; -pub type IntervalYearMonthBufferBuilder = - BufferBuilder<::Native>; -pub type IntervalDayTimeBufferBuilder = - BufferBuilder<::Native>; -pub type IntervalMonthDayNanoBufferBuilder = - BufferBuilder<::Native>; -pub type DurationSecondBufferBuilder = - BufferBuilder<::Native>; -pub type DurationMillisecondBufferBuilder = - BufferBuilder<::Native>; -pub type DurationMicrosecondBufferBuilder = - BufferBuilder<::Native>; -pub type DurationNanosecondBufferBuilder = - BufferBuilder<::Native>; - -pub type Int8Builder = PrimitiveBuilder; -pub type Int16Builder = PrimitiveBuilder; -pub type Int32Builder = PrimitiveBuilder; -pub type Int64Builder = PrimitiveBuilder; -pub type UInt8Builder = PrimitiveBuilder; -pub type UInt16Builder = PrimitiveBuilder; -pub type UInt32Builder = PrimitiveBuilder; -pub type UInt64Builder = PrimitiveBuilder; -pub type Float32Builder = PrimitiveBuilder; -pub type Float64Builder = PrimitiveBuilder; - -pub type TimestampSecondBuilder = PrimitiveBuilder; -pub type TimestampMillisecondBuilder = PrimitiveBuilder; -pub type TimestampMicrosecondBuilder = PrimitiveBuilder; -pub type TimestampNanosecondBuilder = PrimitiveBuilder; -pub type Date32Builder = PrimitiveBuilder; -pub type Date64Builder = PrimitiveBuilder; -pub type Time32SecondBuilder = PrimitiveBuilder; -pub type Time32MillisecondBuilder = PrimitiveBuilder; -pub type Time64MicrosecondBuilder = PrimitiveBuilder; -pub type Time64NanosecondBuilder = PrimitiveBuilder; -pub type IntervalYearMonthBuilder = PrimitiveBuilder; -pub type IntervalDayTimeBuilder = PrimitiveBuilder; 
-pub type IntervalMonthDayNanoBuilder = PrimitiveBuilder; -pub type DurationSecondBuilder = PrimitiveBuilder; -pub type DurationMillisecondBuilder = PrimitiveBuilder; -pub type DurationMicrosecondBuilder = PrimitiveBuilder; -pub type DurationNanosecondBuilder = PrimitiveBuilder; - pub use arrow_data::transform::{Capacities, MutableArrayData}; -// --------------------- Array Iterator --------------------- - -pub use self::iterator::*; - -// --------------------- Array's values comparison --------------------- - -pub use self::ord::{build_compare, DynComparator}; - -// --------------------- Array downcast helper functions --------------------- - -pub use self::cast::{ - as_boolean_array, as_decimal_array, as_dictionary_array, as_generic_binary_array, - as_generic_list_array, as_large_list_array, as_largestring_array, as_list_array, - as_map_array, as_null_array, as_primitive_array, as_string_array, as_struct_array, - as_union_array, -}; - -// ------------------------------ C Data Interface --------------------------- - #[cfg(feature = "ffi")] pub use self::ffi::{export_array_into_raw, make_array_from_raw}; -#[cfg(test)] -mod tests { - use crate::array::*; +// --------------------- Array's values comparison --------------------- - #[test] - fn test_buffer_builder_availability() { - let _builder = Int8BufferBuilder::new(10); - let _builder = Int16BufferBuilder::new(10); - let _builder = Int32BufferBuilder::new(10); - let _builder = Int64BufferBuilder::new(10); - let _builder = UInt16BufferBuilder::new(10); - let _builder = UInt32BufferBuilder::new(10); - let _builder = Float32BufferBuilder::new(10); - let _builder = Float64BufferBuilder::new(10); - let _builder = TimestampSecondBufferBuilder::new(10); - let _builder = TimestampMillisecondBufferBuilder::new(10); - let _builder = TimestampMicrosecondBufferBuilder::new(10); - let _builder = TimestampNanosecondBufferBuilder::new(10); - let _builder = Date32BufferBuilder::new(10); - let _builder = Date64BufferBuilder::new(10); - let _builder = Time32SecondBufferBuilder::new(10); - let _builder = Time32MillisecondBufferBuilder::new(10); - let _builder = Time64MicrosecondBufferBuilder::new(10); - let _builder = Time64NanosecondBufferBuilder::new(10); - let _builder = IntervalYearMonthBufferBuilder::new(10); - let _builder = IntervalDayTimeBufferBuilder::new(10); - let _builder = IntervalMonthDayNanoBufferBuilder::new(10); - let _builder = DurationSecondBufferBuilder::new(10); - let _builder = DurationMillisecondBufferBuilder::new(10); - let _builder = DurationMicrosecondBufferBuilder::new(10); - let _builder = DurationNanosecondBufferBuilder::new(10); - } -} +pub use self::ord::{build_compare, DynComparator}; diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 791363574c52..eab3dafda13a 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -42,7 +42,6 @@ use std::ops::{Div, Mul}; use std::str; use std::sync::Arc; -use crate::array::as_datetime; use crate::buffer::MutableBuffer; use crate::compute::divide_scalar; use crate::compute::kernels::arithmetic::{divide, multiply}; @@ -54,8 +53,8 @@ use crate::compute::{try_unary, using_chrono_tz_and_utc_naive_date_time}; use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::temporal_conversions::{ - EPOCH_DAYS_FROM_CE, MICROSECONDS, MILLISECONDS, MILLISECONDS_IN_DAY, NANOSECONDS, - SECONDS_IN_DAY, + as_datetime, EPOCH_DAYS_FROM_CE, MICROSECONDS, MILLISECONDS, MILLISECONDS_IN_DAY, + NANOSECONDS, SECONDS_IN_DAY, }; 
use crate::{array::*, compute::take}; use crate::{buffer::Buffer, util::serialization::lexical_to_string}; diff --git a/arrow/src/compute/kernels/filter.rs b/arrow/src/compute/kernels/filter.rs index 52664a175447..291324e65a85 100644 --- a/arrow/src/compute/kernels/filter.rs +++ b/arrow/src/compute/kernels/filter.rs @@ -675,13 +675,7 @@ where ) }; - unsafe { - DictionaryArray::::try_new_unchecked( - filtered_keys, - array.values().clone(), - data, - ) - } + DictionaryArray::from(data) } #[cfg(test)] diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index f46cf7f5ab5a..e61fec999add 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -19,10 +19,10 @@ use chrono::{Datelike, Timelike}; -use crate::array::as_datetime; use crate::array::*; use crate::datatypes::*; use crate::error::{ArrowError, Result}; +use arrow_array::temporal_conversions::{as_datetime, as_time}; use chrono::format::strftime::StrftimeItems; use chrono::format::{parse, Parsed}; diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index 3ec605dd0482..a8c71a8e0191 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -676,10 +676,9 @@ fn parse( RecordBatch::try_new_with_options( projected_schema, arr, - &RecordBatchOptions { - match_field_names: true, - row_count: Some(rows.len()), - }, + &RecordBatchOptions::new() + .with_match_field_names(true) + .with_row_count(Some(rows.len())), ) }) } diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index e7d9bfd5a4f6..01462aeca96f 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -22,23 +22,18 @@ //! * [`Field`](crate::datatypes::Field) to describe one field within a schema. //! * [`DataType`](crate::datatypes::DataType) to describe the type of a field. -use std::sync::Arc; - mod native; pub use native::*; mod numeric; pub use numeric::*; -mod types; -pub use types::*; -mod delta; +pub use arrow_array::types::*; pub use arrow_data::decimal::*; -pub use arrow_schema::{DataType, Field, IntervalUnit, Schema, TimeUnit, UnionMode}; +pub use arrow_schema::{ + DataType, Field, IntervalUnit, Schema, SchemaRef, TimeUnit, UnionMode, +}; #[cfg(feature = "ffi")] mod ffi; #[cfg(feature = "ffi")] pub use ffi::*; - -/// A reference-counted reference to a [`Schema`](crate::datatypes::Schema). -pub type SchemaRef = Arc; diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index dec0cc4b53b0..6ab82688e52d 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -15,33 +15,12 @@ // specific language governing permissions and limitations // under the License. -use super::DataType; use crate::error::{ArrowError, Result}; pub use arrow_buffer::{ArrowNativeType, ToByteSlice}; use half::f16; use num::Zero; -/// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the -/// static-typed nature of rust types ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. -pub trait ArrowPrimitiveType: 'static { - /// Corresponding Rust native type for the primitive type. - type Native: ArrowNativeType; - - /// the corresponding Arrow data type of this primitive type. - const DATA_TYPE: DataType; - - /// Returns the byte width of this primitive type. - fn get_byte_width() -> usize { - std::mem::size_of::() - } - - /// Returns a default value of this primitive type. - /// - /// This is useful for aggregate array ops like `sum()`, `mean()`. 
- fn default_value() -> Self::Native { - Default::default() - } -} +pub use arrow_array::ArrowPrimitiveType; pub(crate) mod native_op { use super::ArrowNativeType; diff --git a/arrow/src/json/reader.rs b/arrow/src/json/reader.rs index c32e5ca18488..d15894024809 100644 --- a/arrow/src/json/reader.rs +++ b/arrow/src/json/reader.rs @@ -719,10 +719,9 @@ impl Decoder { RecordBatch::try_new_with_options( projected_schema, arr, - &RecordBatchOptions { - match_field_names: true, - row_count: Some(rows.len()), - }, + &RecordBatchOptions::new() + .with_match_field_names(true) + .with_row_count(Some(rows.len())), ) .map(Some) }) diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index ce171ec861a4..8967efce50b5 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -249,6 +249,8 @@ #![deny(clippy::redundant_clone)] #![warn(missing_debug_implementations)] +pub use arrow_array::{downcast_dictionary_array, downcast_primitive_array}; + pub use arrow_buffer::{alloc, buffer}; pub mod bitmap { @@ -271,8 +273,32 @@ pub mod ipc; pub mod json; #[cfg(feature = "pyarrow")] pub mod pyarrow; -pub mod record_batch; + +pub mod record_batch { + pub use arrow_array::{RecordBatch, RecordBatchOptions}; + use arrow_schema::{ArrowError, SchemaRef}; + + /// Trait for types that can read `RecordBatch`'s. + pub trait RecordBatchReader: + Iterator> + { + /// Returns the schema of this `RecordBatchReader`. + /// + /// Implementation of this trait should guarantee that all `RecordBatch`'s returned by this + /// reader should have the same schema as returned from this method. + fn schema(&self) -> SchemaRef; + + /// Reads the next `RecordBatch`. + #[deprecated( + since = "2.0.0", + note = "This method is deprecated in favour of `next` from the trait Iterator." + )] + fn next_batch(&mut self) -> Result, ArrowError> { + self.next().transpose() + } + } +} pub mod row; -pub mod temporal_conversions; +pub use arrow_array::temporal_conversions; pub mod tensor; pub mod util; diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index a775b2ce8bc4..d8d5eee532e7 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -166,19 +166,6 @@ impl PyArrowConvert for Vec { } } -impl PyArrowConvert for T -where - T: Array + From, -{ - fn from_pyarrow(value: &PyAny) -> PyResult { - Ok(ArrayData::from_pyarrow(value)?.into()) - } - - fn to_pyarrow(&self, py: Python) -> PyResult { - self.data().to_pyarrow(py) - } -} - impl PyArrowConvert for RecordBatch { fn from_pyarrow(value: &PyAny) -> PyResult { // TODO(kszucs): implement the FFI conversions in arrow-rs for RecordBatches diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 4d974409a0ee..5dda410f0087 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -47,10 +47,7 @@ pub fn create_random_batch( RecordBatch::try_new_with_options( schema, columns, - &RecordBatchOptions { - match_field_names: false, - row_count: None, - }, + &RecordBatchOptions::new().with_match_field_names(false), ) } diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index 310ffb8ee7a0..a20657b58229 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -32,8 +32,5 @@ pub mod string_writer; #[cfg(any(test, feature = "test_utils"))] pub mod test_util; -mod trusted_len; -pub(crate) use trusted_len::trusted_len_unzip; - -pub mod decimal; +pub use arrow_array::decimal; pub(crate) mod reader_parser; From a7cf274765945af4111fddaeec26d672715de9d0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 26 Sep 2022 
20:23:38 +0100 Subject: [PATCH 0078/1411] Fix min/max computation for sliced arrays (#2779) (#2780) * Don't apply array offset twice (#2779) * More tests --- arrow/src/compute/kernels/aggregate.rs | 104 ++++++++++++++++++++++--- 1 file changed, 95 insertions(+), 9 deletions(-) diff --git a/arrow/src/compute/kernels/aggregate.rs b/arrow/src/compute/kernels/aggregate.rs index d6cc3ecc1047..c215e23953e5 100644 --- a/arrow/src/compute/kernels/aggregate.rs +++ b/arrow/src/compute/kernels/aggregate.rs @@ -124,15 +124,8 @@ where .map(|i| unsafe { array.value_unchecked(i) }) .reduce(|acc, item| if cmp(&acc, &item) { item } else { acc }) } else { - let null_buffer = array - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(array.offset(), array.len())); - let iter = BitIndexIterator::new( - null_buffer.as_deref().unwrap(), - array.offset(), - array.len(), - ); + let null_buffer = array.data_ref().null_buffer().unwrap(); + let iter = BitIndexIterator::new(null_buffer, array.offset(), array.len()); unsafe { let idx = iter.reduce(|acc_idx, idx| { let acc = array.value_unchecked(acc_idx); @@ -685,6 +678,7 @@ mod tests { use crate::array::*; use crate::compute::add; use crate::datatypes::{Float32Type, Int32Type, Int8Type}; + use arrow_array::types::Float64Type; #[test] fn test_primitive_array_sum() { @@ -1130,4 +1124,96 @@ mod tests { let array = dict_array.downcast_dict::().unwrap(); assert_eq!(2.0_f32, min_array::(array).unwrap()); } + + #[test] + fn test_min_max_sliced_primitive() { + let expected = Some(4.0); + let input: Float64Array = vec![None, Some(4.0)].into_iter().collect(); + let actual = min(&input); + assert_eq!(actual, expected); + let actual = max(&input); + assert_eq!(actual, expected); + + let sliced_input: Float64Array = vec![None, None, None, None, None, Some(4.0)] + .into_iter() + .collect(); + let sliced_input = sliced_input.slice(4, 2); + let sliced_input = as_primitive_array::(&sliced_input); + + assert_eq!(sliced_input, &input); + + let actual = min(sliced_input); + assert_eq!(actual, expected); + let actual = max(sliced_input); + assert_eq!(actual, expected); + } + + #[test] + fn test_min_max_sliced_boolean() { + let expected = Some(true); + let input: BooleanArray = vec![None, Some(true)].into_iter().collect(); + let actual = min_boolean(&input); + assert_eq!(actual, expected); + let actual = max_boolean(&input); + assert_eq!(actual, expected); + + let sliced_input: BooleanArray = vec![None, None, None, None, None, Some(true)] + .into_iter() + .collect(); + let sliced_input = sliced_input.slice(4, 2); + let sliced_input = as_boolean_array(&sliced_input); + + assert_eq!(sliced_input, &input); + + let actual = min_boolean(sliced_input); + assert_eq!(actual, expected); + let actual = max_boolean(sliced_input); + assert_eq!(actual, expected); + } + + #[test] + fn test_min_max_sliced_string() { + let expected = Some("foo"); + let input: StringArray = vec![None, Some("foo")].into_iter().collect(); + let actual = min_string(&input); + assert_eq!(actual, expected); + let actual = max_string(&input); + assert_eq!(actual, expected); + + let sliced_input: StringArray = vec![None, None, None, None, None, Some("foo")] + .into_iter() + .collect(); + let sliced_input = sliced_input.slice(4, 2); + let sliced_input = as_string_array(&sliced_input); + + assert_eq!(sliced_input, &input); + + let actual = min_string(sliced_input); + assert_eq!(actual, expected); + let actual = max_string(sliced_input); + assert_eq!(actual, expected); + } + + #[test] + fn test_min_max_sliced_binary() { + let 
expected: Option<&[u8]> = Some(&[5]); + let input: BinaryArray = vec![None, Some(&[5])].into_iter().collect(); + let actual = min_binary(&input); + assert_eq!(actual, expected); + let actual = max_binary(&input); + assert_eq!(actual, expected); + + let sliced_input: BinaryArray = vec![None, None, None, None, None, Some(&[5])] + .into_iter() + .collect(); + let sliced_input = sliced_input.slice(4, 2); + let sliced_input = as_generic_binary_array::(&sliced_input); + + assert_eq!(sliced_input, &input); + + let actual = min_binary(sliced_input); + assert_eq!(actual, expected); + let actual = max_binary(sliced_input); + assert_eq!(actual, expected); + } } From 70bcfbcfaab2513f36c39b32427ca812e8e201e4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 27 Sep 2022 15:07:47 +0100 Subject: [PATCH 0079/1411] Update arrow CI for split crates (#2594) (#2778) * Update arrow CI for split crates (#2594) * Update more --- .github/workflows/arrow.yml | 21 +++++++++++++++------ .github/workflows/arrow_flight.yml | 4 ++++ .github/workflows/integration.yml | 4 ++++ .github/workflows/miri.yaml | 4 ++++ .github/workflows/parquet.yml | 4 ++++ 5 files changed, 31 insertions(+), 6 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 797f63b5ae1b..466f0b12ec36 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -26,6 +26,10 @@ on: pull_request: paths: - arrow/** + - arrow-array/** + - arrow-buffer/** + - arrow-data/** + - arrow-schema/** - .github/** jobs: @@ -48,9 +52,16 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: stable - - name: Test - run: | - cargo test -p arrow + - name: Test arrow-buffer with all features + run: cargo test -p arrow-buffer --all-features + - name: Test arrow-data with all features + run: cargo test -p arrow-data --all-features + - name: Test arrow-schema with all features + run: cargo test -p arrow-schema --all-features + - name: Test arrow-array with all features + run: cargo test -p arrow-array --all-features + - name: Test arrow + run: cargo test -p arrow - name: Test --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,dyn_arith_dict run: | cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,dyn_arith_dict @@ -63,10 +74,8 @@ jobs: cargo run --example read_csv_infer_schema - name: Run non-archery based integration-tests run: cargo test -p arrow-integration-testing - - name: Test arrow-schema with all features - run: cargo test -p arrow-schema --all-features - # test compilaton features + # test compilation features linux-features: name: Check Compilation runs-on: ubuntu-latest diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index 86a67ff9a6a4..3e785f056dc3 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -28,6 +28,10 @@ on: pull_request: paths: - arrow/** + - arrow-array/** + - arrow-buffer/** + - arrow-data/** + - arrow-schema/** - arrow-flight/** - .github/** diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index d78f02c95a48..0f183990ed49 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -25,6 +25,10 @@ on: pull_request: paths: - arrow/** + - arrow-array/** + - arrow-buffer/** + - arrow-data/** + - arrow-schema/** - arrow-pyarrow-integration-testing/** - integration-testing/** - .github/** diff --git a/.github/workflows/miri.yaml 
b/.github/workflows/miri.yaml index b4669bbcccc0..bb75fcbbbb2a 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -25,6 +25,10 @@ on: pull_request: paths: - arrow/** + - arrow-array/** + - arrow-buffer/** + - arrow-data/** + - arrow-schema/** - .github/** jobs: diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 8497db798a97..0d5dc63a7da2 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -28,6 +28,10 @@ on: pull_request: paths: - arrow/** + - arrow-array/** + - arrow-buffer/** + - arrow-data/** + - arrow-schema/** - parquet/** - .github/** From 333633e07f63f360f549709a56653b7852c6e59a Mon Sep 17 00:00:00 2001 From: "chunshao.rcs" Date: Tue, 27 Sep 2022 23:32:16 +0800 Subject: [PATCH 0080/1411] fix(arrow-array): fix BinaryBuilder and StringBuilder initialization parameters in struct_builder (#2784) --- arrow-array/src/builder/struct_builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index cadc8a529f5f..619931403946 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -105,14 +105,14 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box Box::new(UInt64Builder::with_capacity(capacity)), DataType::Float32 => Box::new(Float32Builder::with_capacity(capacity)), DataType::Float64 => Box::new(Float64Builder::with_capacity(capacity)), - DataType::Binary => Box::new(BinaryBuilder::with_capacity(1024, capacity)), + DataType::Binary => Box::new(BinaryBuilder::with_capacity(capacity, 1024)), DataType::FixedSizeBinary(len) => { Box::new(FixedSizeBinaryBuilder::with_capacity(capacity, *len)) } DataType::Decimal128(precision, scale) => Box::new( Decimal128Builder::with_capacity(capacity, *precision, *scale), ), - DataType::Utf8 => Box::new(StringBuilder::with_capacity(1024, capacity)), + DataType::Utf8 => Box::new(StringBuilder::with_capacity(capacity, 1024)), DataType::Date32 => Box::new(Date32Builder::with_capacity(capacity)), DataType::Date64 => Box::new(Date64Builder::with_capacity(capacity)), DataType::Time32(TimeUnit::Second) => { From 2ba130761ce5ba90e177b6e4e3496d83a9d4b912 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 27 Sep 2022 18:20:14 +0100 Subject: [PATCH 0081/1411] Move unary kernels to arrow-array (#2787) (#2789) * Move primitive arity kernels (#2787) * Fix doc --- arrow-array/src/array/primitive_array.rs | 119 ++++++++++++++++++++--- arrow/src/compute/kernels/arity.rs | 76 +-------------- 2 files changed, 108 insertions(+), 87 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index f9e4e7675da2..4ac191ac977b 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::builder::{BooleanBufferBuilder, PrimitiveBuilder}; +use crate::builder::{BooleanBufferBuilder, BufferBuilder, PrimitiveBuilder}; use crate::iterator::PrimitiveIter; use crate::raw_pointer::RawPtrBox; use crate::temporal_conversions::{as_date, as_datetime, as_duration, as_time}; @@ -23,6 +23,7 @@ use crate::trusted_len::trusted_len_unzip; use crate::types::*; use crate::{print_long_array, Array, ArrayAccessor}; use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; +use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::ArrayData; use arrow_schema::DataType; use chrono::{Duration, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime}; @@ -298,20 +299,10 @@ impl PrimitiveArray { /// Creates a PrimitiveArray based on a constant value with `count` elements pub fn from_value(value: T::Native, count: usize) -> Self { - // # Safety: iterator (0..count) correctly reports its length - let val_buf = unsafe { Buffer::from_trusted_len_iter((0..count).map(|_| value)) }; - let data = unsafe { - ArrayData::new_unchecked( - T::DATA_TYPE, - val_buf.len() / std::mem::size_of::<::Native>(), - None, - None, - 0, - vec![val_buf], - vec![], - ) - }; - PrimitiveArray::from(data) + unsafe { + let val_buf = Buffer::from_trusted_len_iter((0..count).map(|_| value)); + build_primitive_array(count, val_buf, 0, None) + } } /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i` @@ -332,6 +323,104 @@ impl PrimitiveArray { ) -> impl Iterator> + 'a { indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index))) } + + /// Applies an unary and infallible function to a primitive array. + /// This is the fastest way to perform an operation on a primitive array when + /// the benefits of a vectorized operation outweigh the cost of branching nulls and non-nulls. + /// + /// # Implementation + /// + /// This will apply the function for all values, including those on null slots. + /// This implies that the operation must be infallible for any value of the corresponding type + /// or this function may panic. + /// # Example + /// ```rust + /// # use arrow_array::{Int32Array, types::Int32Type}; + /// # fn main() { + /// let array = Int32Array::from(vec![Some(5), Some(7), None]); + /// let c = array.unary(|x| x * 2 + 1); + /// assert_eq!(c, Int32Array::from(vec![Some(11), Some(15), None])); + /// # } + /// ``` + pub fn unary(&self, op: F) -> PrimitiveArray + where + O: ArrowPrimitiveType, + F: Fn(T::Native) -> O::Native, + { + let data = self.data(); + let len = self.len(); + let null_count = self.null_count(); + + let null_buffer = data.null_buffer().map(|b| b.bit_slice(data.offset(), len)); + let values = self.values().iter().map(|v| op(*v)); + // JUSTIFICATION + // Benefit + // ~60% speedup + // Soundness + // `values` is an iterator with a known size because arrays are sized. + let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; + unsafe { build_primitive_array(len, buffer, null_count, null_buffer) } + } + + /// Applies a unary and fallible function to all valid values in a primitive array + /// + /// This is unlike [`Self::unary`] which will apply an infallible function to all rows + /// regardless of validity, in many cases this will be significantly faster and should + /// be preferred if `op` is infallible. 
+ /// + /// Note: LLVM is currently unable to effectively vectorize fallible operations + pub fn try_unary(&self, op: F) -> Result, E> + where + O: ArrowPrimitiveType, + F: Fn(T::Native) -> Result, + { + let data = self.data(); + let len = self.len(); + let null_count = self.null_count(); + + if null_count == 0 { + let values = self.values().iter().map(|v| op(*v)); + // JUSTIFICATION + // Benefit + // ~60% speedup + // Soundness + // `values` is an iterator with a known size because arrays are sized. + let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; + return Ok(unsafe { build_primitive_array(len, buffer, 0, None) }); + } + + let null_buffer = data.null_buffer().map(|b| b.bit_slice(data.offset(), len)); + let mut buffer = BufferBuilder::::new(len); + buffer.append_n_zeroed(len); + let slice = buffer.as_slice_mut(); + + try_for_each_valid_idx(len, 0, null_count, null_buffer.as_deref(), |idx| { + unsafe { *slice.get_unchecked_mut(idx) = op(self.value_unchecked(idx))? }; + Ok::<_, E>(()) + })?; + + Ok(unsafe { + build_primitive_array(len, buffer.finish(), null_count, null_buffer) + }) + } +} + +#[inline] +unsafe fn build_primitive_array( + len: usize, + buffer: Buffer, + null_count: usize, + null_buffer: Option, +) -> PrimitiveArray { + PrimitiveArray::from(ArrayData::new_unchecked( + O::DATA_TYPE, + len, + Some(null_count), + null_buffer, + 0, + vec![buffer], + vec![], + )) } impl From> for ArrayData { diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index 5f875e6ddf29..cb5184c0e9d4 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -48,92 +48,24 @@ unsafe fn build_primitive_array( )) } -/// Applies an unary and infallible function to a primitive array. -/// This is the fastest way to perform an operation on a primitive array when -/// the benefits of a vectorized operation outweigh the cost of branching nulls and non-nulls. -/// -/// # Implementation -/// -/// This will apply the function for all values, including those on null slots. -/// This implies that the operation must be infallible for any value of the corresponding type -/// or this function may panic. -/// # Example -/// ```rust -/// # use arrow::array::Int32Array; -/// # use arrow::datatypes::Int32Type; -/// # use arrow::compute::kernels::arity::unary; -/// # fn main() { -/// let array = Int32Array::from(vec![Some(5), Some(7), None]); -/// let c = unary::<_, _, Int32Type>(&array, |x| x * 2 + 1); -/// assert_eq!(c, Int32Array::from(vec![Some(11), Some(15), None])); -/// # } -/// ``` +/// See [`PrimitiveArray::unary`] pub fn unary(array: &PrimitiveArray, op: F) -> PrimitiveArray where I: ArrowPrimitiveType, O: ArrowPrimitiveType, F: Fn(I::Native) -> O::Native, { - let data = array.data(); - let len = data.len(); - let null_count = data.null_count(); - - let null_buffer = data - .null_buffer() - .map(|b| b.bit_slice(data.offset(), data.len())); - - let values = array.values().iter().map(|v| op(*v)); - // JUSTIFICATION - // Benefit - // ~60% speedup - // Soundness - // `values` is an iterator with a known size because arrays are sized. 
- let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - unsafe { build_primitive_array(len, buffer, null_count, null_buffer) } + array.unary(op) } -/// Applies a unary and fallible function to all valid values in a primitive array -/// -/// This is unlike [`unary`] which will apply an infallible function to all rows regardless -/// of validity, in many cases this will be significantly faster and should be preferred -/// if `op` is infallible. -/// -/// Note: LLVM is currently unable to effectively vectorize fallible operations +/// See [`PrimitiveArray::try_unary`] pub fn try_unary(array: &PrimitiveArray, op: F) -> Result> where I: ArrowPrimitiveType, O: ArrowPrimitiveType, F: Fn(I::Native) -> Result, { - let len = array.len(); - let null_count = array.null_count(); - - if null_count == 0 { - let values = array.values().iter().map(|v| op(*v)); - // JUSTIFICATION - // Benefit - // ~60% speedup - // Soundness - // `values` is an iterator with a known size because arrays are sized. - let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; - return Ok(unsafe { build_primitive_array(len, buffer, 0, None) }); - } - - let null_buffer = array - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(array.offset(), array.len())); - - let mut buffer = BufferBuilder::::new(len); - buffer.append_n_zeroed(array.len()); - let slice = buffer.as_slice_mut(); - - try_for_each_valid_idx(array.len(), 0, null_count, null_buffer.as_deref(), |idx| { - unsafe { *slice.get_unchecked_mut(idx) = op(array.value_unchecked(idx))? }; - Ok::<_, ArrowError>(()) - })?; - - Ok(unsafe { build_primitive_array(len, buffer.finish(), null_count, null_buffer) }) + array.try_unary(op) } /// A helper function that applies an infallible unary function to a dictionary array with primitive value type. From e97bd6f43c026abb18e47cd763ad07fa2403e98b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 27 Sep 2022 18:49:26 +0100 Subject: [PATCH 0082/1411] Disable test harness for string_dictionary_builder benchmark (#2788) --- arrow/Cargo.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 48b7f39547ef..b3500f4e5b06 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -227,6 +227,10 @@ name = "buffer_create" harness = false required-features = ["test_utils"] +[[bench]] +name = "string_dictionary_builder" +harness = false + [[bench]] name = "substring_kernels" harness = false From 322d351057b5e79fcc62939513ebfe899a669d1f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 27 Sep 2022 20:05:09 +0100 Subject: [PATCH 0083/1411] Add PrimitiveArray::reinterpret_cast (#2785) (#2786) --- arrow-array/src/array/primitive_array.rs | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 4ac191ac977b..e5f5cd481af5 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -324,6 +324,35 @@ impl PrimitiveArray { indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index))) } + /// Reinterprets this array's contents as a different data type without copying + /// + /// This can be used to efficiently convert between primitive arrays with the + /// same underlying representation + /// + /// Note: this will not modify the underlying values, and therefore may change + /// the semantic values of the array, e.g. 
100 milliseconds in a [`TimestampNanosecondArray`] + /// will become 100 seconds in a [`TimestampSecondArray`]. + /// + /// For casts that preserve the semantic value, check out the [compute kernels] + /// + /// [compute kernels](https://docs.rs/arrow/latest/arrow/compute/kernels/cast/index.html) + /// + /// ``` + /// # use arrow_array::{Int64Array, TimestampNanosecondArray}; + /// let a = Int64Array::from_iter_values([1, 2, 3, 4]); + /// let b: TimestampNanosecondArray = a.reinterpret_cast(); + /// ``` + pub fn reinterpret_cast(&self) -> PrimitiveArray + where + K: ArrowPrimitiveType, + { + let d = self.data.clone().into_builder().data_type(K::DATA_TYPE); + + // SAFETY: + // Native type is the same + PrimitiveArray::from(unsafe { d.build_unchecked() }) + } + /// Applies an unary and infallible function to a primitive array. /// This is the fastest way to perform an operation on a primitive array when /// the benefits of a vectorized operation outweigh the cost of branching nulls and non-nulls. From 7639f28dceb7397a235f3e32dc60fe6ba1fa90da Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 27 Sep 2022 21:07:24 +0100 Subject: [PATCH 0084/1411] Add struct equality test case (#514) (#2791) --- arrow/tests/array_equal.rs | 63 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index 5a1b48c009df..a5f3f42a1dfd 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -22,6 +22,7 @@ use arrow::array::{ OffsetSizeTrait, StringArray, StringDictionaryBuilder, StructArray, UnionBuilder, }; use arrow::datatypes::{Int16Type, Int32Type}; +use arrow_array::builder::{StringBuilder, StructBuilder}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{DataType, Field}; @@ -1272,3 +1273,65 @@ fn test_list_different_offsets() { let b_slice = b.slice(0, 2); assert_eq!(&a_slice, &b_slice); } + +fn make_struct( + elements: Vec, Option)>>, +) -> StructArray { + let mut builder = StructBuilder::new( + vec![ + Field::new("f1", DataType::Utf8, true), + Field::new("f2", DataType::Int32, true), + ], + vec![ + Box::new(StringBuilder::new()), + Box::new(Int32Builder::new()), + ], + ); + + for element in elements { + match element.and_then(|e| e.0) { + None => builder + .field_builder::(0) + .unwrap() + .append_null(), + Some(s) => builder + .field_builder::(0) + .unwrap() + .append_value(s), + }; + + builder + .field_builder::(1) + .unwrap() + .append_option(element.and_then(|e| e.1)); + + builder.append(element.is_some()); + } + + builder.finish() +} + +#[test] +fn test_struct_equal_slice() { + let a = make_struct(vec![ + None, + Some((Some("joe"), Some(1))), + Some((None, Some(2))), + Some((None, None)), + Some((Some("mark"), Some(4))), + Some((Some("doe"), Some(5))), + ]); + let a = a.slice(1, 5); + let a = a.as_any().downcast_ref::().unwrap(); + + let b = make_struct(vec![ + Some((Some("joe"), Some(1))), + Some((None, Some(2))), + Some((None, None)), + Some((Some("mark"), Some(4))), + Some((Some("doe"), Some(5))), + ]); + assert_eq!(a, &b); + + test_equal(a.data(), b.data(), true); +} From 178319b9f0978fa9be538661981fbfc5925dc17c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 28 Sep 2022 22:06:52 +0100 Subject: [PATCH 0085/1411] Replace checked casts with as for performance (#1918) (#2793) Remove num dependency from arrow-buffer Deprecate 
unnecessary methods --- arrow-array/src/array/dictionary_array.rs | 13 +- arrow-array/src/array/list_array.rs | 14 +- .../src/builder/string_dictionary_builder.rs | 6 +- arrow-buffer/Cargo.toml | 1 - arrow-buffer/src/native.rs | 295 ++++++------------ arrow-buffer/src/util/bit_util.rs | 3 +- arrow-data/src/equal/variable_size.rs | 4 +- arrow-data/src/transform/list.rs | 8 +- arrow/src/array/ord.rs | 8 +- arrow/src/compute/kernels/comparison.rs | 6 +- arrow/src/compute/kernels/concat_elements.rs | 19 +- arrow/src/compute/kernels/filter.rs | 2 +- arrow/src/compute/kernels/sort.rs | 8 +- arrow/src/compute/kernels/substring.rs | 16 +- arrow/src/compute/kernels/take.rs | 2 +- arrow/src/util/reader_parser.rs | 10 +- parquet/src/arrow/arrow_writer/levels.rs | 8 +- parquet/src/arrow/buffer/offset_buffer.rs | 8 +- 18 files changed, 157 insertions(+), 274 deletions(-) diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 35d243fde9ae..69a7b1961ea9 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -308,20 +308,13 @@ impl DictionaryArray { /// Return an iterator over the keys (indexes into the dictionary) pub fn keys_iter(&self) -> impl Iterator> + '_ { - self.keys - .iter() - .map(|key| key.map(|k| k.to_usize().expect("Dictionary index not usize"))) + self.keys.iter().map(|key| key.map(|k| k.as_usize())) } /// Return the value of `keys` (the dictionary key) at index `i`, /// cast to `usize`, `None` if the value at `i` is `NULL`. pub fn key(&self, i: usize) -> Option { - self.keys.is_valid(i).then(|| { - self.keys - .value(i) - .to_usize() - .expect("Dictionary index not usize") - }) + self.keys.is_valid(i).then(|| self.keys.value(i).as_usize()) } /// Downcast this dictionary to a [`TypedDictionaryArray`] @@ -586,7 +579,7 @@ where unsafe fn value_unchecked(&self, index: usize) -> Self::Item { let val = self.dictionary.keys.value_unchecked(index); - let value_idx = val.to_usize().unwrap(); + let value_idx = val.as_usize(); // As dictionary keys are only verified for non-null indexes // we must check the value is within bounds diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 83b0c6d5bd46..b45a0f9257f2 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -80,18 +80,16 @@ impl GenericListArray { /// # Safety /// Caller must ensure that the index is within the array bounds pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef { - let end = *self.value_offsets().get_unchecked(i + 1); - let start = *self.value_offsets().get_unchecked(i); - self.values - .slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap()) + let end = self.value_offsets().get_unchecked(i + 1).as_usize(); + let start = self.value_offsets().get_unchecked(i).as_usize(); + self.values.slice(start, end - start) } /// Returns ith value of this list array. 
pub fn value(&self, i: usize) -> ArrayRef { - let end = self.value_offsets()[i + 1]; - let start = self.value_offsets()[i]; - self.values - .slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap()) + let end = self.value_offsets()[i + 1].as_usize(); + let start = self.value_offsets()[i].as_usize(); + self.values.slice(start, end - start) } /// Returns the offset values in the offsets buffer diff --git a/arrow-array/src/builder/string_dictionary_builder.rs b/arrow-array/src/builder/string_dictionary_builder.rs index bab17d4a9f6e..e41086c872f1 100644 --- a/arrow-array/src/builder/string_dictionary_builder.rs +++ b/arrow-array/src/builder/string_dictionary_builder.rs @@ -293,9 +293,9 @@ fn get_bytes<'a, K: ArrowNativeType>(values: &'a StringBuilder, key: &K) -> &'a let offsets = values.offsets_slice(); let values = values.values_slice(); - let idx = key.to_usize().unwrap(); - let end_offset = offsets[idx + 1].to_usize().unwrap(); - let start_offset = offsets[idx].to_usize().unwrap(); + let idx = key.as_usize(); + let end_offset = offsets[idx + 1].as_usize(); + let start_offset = offsets[idx].as_usize(); &values[start_offset..end_offset] } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index c1bcd9f63068..c5a81b30f294 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -38,7 +38,6 @@ path = "src/lib.rs" bench = false [dependencies] -num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.0", default-features = false } [dev-dependencies] diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs index 90855872d18e..d371e7e110f2 100644 --- a/arrow-buffer/src/native.rs +++ b/arrow-buffer/src/native.rs @@ -46,229 +46,128 @@ mod private { pub trait ArrowNativeType: std::fmt::Debug + Send + Sync + Copy + PartialOrd + Default + private::Sealed + 'static { - /// Convert native type from usize. - #[inline] - fn from_usize(_: usize) -> Option { - None - } + /// Convert native integer type from usize + /// + /// Returns `None` if [`Self`] is not an integer or conversion would result + /// in truncation/overflow + fn from_usize(_: usize) -> Option; + + /// Convert to usize according to the [`as`] operator + /// + /// [`as`]: https://doc.rust-lang.org/reference/expressions/operator-expr.html#numeric-cast + fn as_usize(self) -> usize; /// Convert native type to usize. - #[inline] - fn to_usize(&self) -> Option { - None - } + /// + /// Returns `None` if [`Self`] is not an integer or conversion would result + /// in truncation/overflow + fn to_usize(self) -> Option; /// Convert native type to isize. - #[inline] - fn to_isize(&self) -> Option { - None - } + /// + /// Returns `None` if [`Self`] is not an integer or conversion would result + /// in truncation/overflow + fn to_isize(self) -> Option; /// Convert native type from i32. - #[inline] + /// + /// Returns `None` if [`Self`] is not `i32` + #[deprecated(note = "please use `Option::Some` instead")] fn from_i32(_: i32) -> Option { None } /// Convert native type from i64. - #[inline] + /// + /// Returns `None` if [`Self`] is not `i64` + #[deprecated(note = "please use `Option::Some` instead")] fn from_i64(_: i64) -> Option { None } /// Convert native type from i128. 
- #[inline] + /// + /// Returns `None` if [`Self`] is not `i128` + #[deprecated(note = "please use `Option::Some` instead")] fn from_i128(_: i128) -> Option { None } } -impl private::Sealed for i8 {} -impl ArrowNativeType for i8 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl private::Sealed for i16 {} -impl ArrowNativeType for i16 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl private::Sealed for i32 {} -impl ArrowNativeType for i32 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } - - /// Convert native type from i32. - #[inline] - fn from_i32(val: i32) -> Option { - Some(val) - } -} - -impl private::Sealed for i64 {} -impl ArrowNativeType for i64 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } - - /// Convert native type from i64. - #[inline] - fn from_i64(val: i64) -> Option { - Some(val) - } -} - -impl private::Sealed for i128 {} -impl ArrowNativeType for i128 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } - - /// Convert native type from i128. - #[inline] - fn from_i128(val: i128) -> Option { - Some(val) - } -} - -impl private::Sealed for u8 {} -impl ArrowNativeType for u8 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl private::Sealed for u16 {} -impl ArrowNativeType for u16 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl private::Sealed for u32 {} -impl ArrowNativeType for u32 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } +macro_rules! 
native_integer { + ($t: ty $(, $from:ident)*) => { + impl private::Sealed for $t {} + impl ArrowNativeType for $t { + #[inline] + fn from_usize(v: usize) -> Option { + v.try_into().ok() + } + + #[inline] + fn to_usize(self) -> Option { + self.try_into().ok() + } + + #[inline] + fn to_isize(self) -> Option { + self.try_into().ok() + } + + #[inline] + fn as_usize(self) -> usize { + self as _ + } + + $( + #[inline] + fn $from(v: $t) -> Option { + Some(v) + } + )* + } + }; } -impl private::Sealed for u64 {} -impl ArrowNativeType for u64 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } +native_integer!(i8); +native_integer!(i16); +native_integer!(i32, from_i32); +native_integer!(i64, from_i64); +native_integer!(i128, from_i128); +native_integer!(u8); +native_integer!(u16); +native_integer!(u32); +native_integer!(u64); + +macro_rules! native_float { + ($t:ty, $s:ident, $as_usize: expr) => { + impl private::Sealed for $t {} + impl ArrowNativeType for $t { + #[inline] + fn from_usize(_: usize) -> Option { + None + } + + #[inline] + fn to_usize(self) -> Option { + None + } + + #[inline] + fn to_isize(self) -> Option { + None + } + + #[inline] + fn as_usize($s) -> usize { + $as_usize + } + } + }; } -impl ArrowNativeType for f16 {} -impl private::Sealed for f16 {} -impl ArrowNativeType for f32 {} -impl private::Sealed for f32 {} -impl ArrowNativeType for f64 {} -impl private::Sealed for f64 {} +native_float!(f16, self, self.to_f32() as _); +native_float!(f32, self, self as _); +native_float!(f64, self, self as _); /// Allows conversion from supported Arrow types to a byte slice. pub trait ToByteSlice { diff --git a/arrow-buffer/src/util/bit_util.rs b/arrow-buffer/src/util/bit_util.rs index 5752c5df972e..de4bc96f9daf 100644 --- a/arrow-buffer/src/util/bit_util.rs +++ b/arrow-buffer/src/util/bit_util.rs @@ -17,7 +17,6 @@ //! Utils for working with bits -use num::Integer; #[cfg(feature = "simd")] use packed_simd::u8x64; @@ -102,7 +101,7 @@ pub unsafe fn unset_bit_raw(data: *mut u8, i: usize) { pub fn ceil(value: usize, divisor: usize) -> usize { // Rewrite as `value.div_ceil(&divisor)` after // https://github.com/rust-lang/rust/issues/88581 is merged. - Integer::div_ceil(&value, &divisor) + value / divisor + (0 != value % divisor) as usize } /// Performs SIMD bitwise binary operations. 
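The substitutions in the rest of this commit rely on the distinction drawn above: `to_usize` remains a checked conversion that returns `None` on truncation, the new `as_usize` is an infallible `as` cast meant for offsets already known to be in range, and `bit_util::ceil` now spells out ceiling division rather than pulling in `num`. The following stand-alone sketch is not part of the patch; it only illustrates those semantics with plain standard-library equivalents (the `_i32` helpers are hypothetical stand-ins for the trait methods).

// Stand-alone illustration only; these free functions mirror the trait methods
// above for i32 and are not part of the arrow-buffer crate.
fn to_usize_i32(v: i32) -> Option<usize> {
    v.try_into().ok() // checked: a negative value yields None instead of wrapping
}

fn as_usize_i32(v: i32) -> usize {
    v as usize // infallible `as` cast, matching the new as_usize contract
}

fn ceil(value: usize, divisor: usize) -> usize {
    // integer division rounds down; add one whenever a remainder is left over
    value / divisor + (0 != value % divisor) as usize
}

fn main() {
    assert_eq!(to_usize_i32(-1), None); // the case `.unwrap()` used to guard against
    assert_eq!(as_usize_i32(7), 7);     // in-range offsets convert without a check
    assert_eq!(ceil(8, 8), 1);          // 8 bits fit in one byte
    assert_eq!(ceil(9, 8), 2);          // 9 bits need two
}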
diff --git a/arrow-data/src/equal/variable_size.rs b/arrow-data/src/equal/variable_size.rs index b4445db54bb1..f661c614d301 100644 --- a/arrow-data/src/equal/variable_size.rs +++ b/arrow-data/src/equal/variable_size.rs @@ -31,8 +31,8 @@ fn offset_value_equal( rhs_pos: usize, len: usize, ) -> bool { - let lhs_start = lhs_offsets[lhs_pos].to_usize().unwrap(); - let rhs_start = rhs_offsets[rhs_pos].to_usize().unwrap(); + let lhs_start = lhs_offsets[lhs_pos].as_usize(); + let rhs_start = rhs_offsets[rhs_pos].as_usize(); let lhs_len = lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos]; let rhs_len = rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos]; diff --git a/arrow-data/src/transform/list.rs b/arrow-data/src/transform/list.rs index f318d46f498e..2f14f2fb514a 100644 --- a/arrow-data/src/transform/list.rs +++ b/arrow-data/src/transform/list.rs @@ -46,8 +46,8 @@ pub(super) fn build_extend(array: &ArrayData) -> E mutable.child_data[0].extend( index, - offsets[start].to_usize().unwrap(), - offsets[start + len].to_usize().unwrap(), + offsets[start].as_usize(), + offsets[start + len].as_usize(), ) }, ) @@ -75,8 +75,8 @@ pub(super) fn build_extend(array: &ArrayData) -> E // append value child.extend( index, - offsets[i].to_usize().unwrap(), - offsets[i + 1].to_usize().unwrap(), + offsets[i].as_usize(), + offsets[i + 1].as_usize(), ); } // append offset diff --git a/arrow/src/array/ord.rs b/arrow/src/array/ord.rs index 998c06e50ebd..3fc62f807bef 100644 --- a/arrow/src/array/ord.rs +++ b/arrow/src/array/ord.rs @@ -97,8 +97,8 @@ where PrimitiveArray::from(right.values().data().clone()); Box::new(move |i: usize, j: usize| { - let key_left = left_keys.value(i).to_usize().unwrap(); - let key_right = right_keys.value(j).to_usize().unwrap(); + let key_left = left_keys.value(i).as_usize(); + let key_right = right_keys.value(j).as_usize(); let left = left_values.value(key_left); let right = right_values.value(key_right); left.cmp(&right) @@ -118,8 +118,8 @@ where let right_values = StringArray::from(right.values().data().clone()); Box::new(move |i: usize, j: usize| { - let key_left = left_keys.value(i).to_usize().unwrap(); - let key_right = right_keys.value(j).to_usize().unwrap(); + let key_left = left_keys.value(i).as_usize(); + let key_right = right_keys.value(j).as_usize(); let left = left_values.value(key_left); let right = right_values.value(key_right); left.cmp(right) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index fec464b93286..49aecfb67fa6 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -863,9 +863,9 @@ fn utf8_empty( MutableBuffer::from_trusted_len_iter_bool(left.value_offsets().windows(2).map( |offset| { if EQ { - offset[1].to_usize().unwrap() == offset[0].to_usize().unwrap() + offset[1].as_usize() == offset[0].as_usize() } else { - offset[1].to_usize().unwrap() > offset[0].to_usize().unwrap() + offset[1].as_usize() > offset[0].as_usize() } }, )) @@ -1793,7 +1793,7 @@ where .map(|key| { key.map(|key| unsafe { // safety lengths were verified above - let key = key.to_usize().expect("Dictionary index not usize"); + let key = key.as_usize(); dict_comparison.value_unchecked(key) }) }) diff --git a/arrow/src/compute/kernels/concat_elements.rs b/arrow/src/compute/kernels/concat_elements.rs index ac365a0968ec..1c0a0925df74 100644 --- a/arrow/src/compute/kernels/concat_elements.rs +++ b/arrow/src/compute/kernels/concat_elements.rs @@ -57,20 +57,17 @@ pub fn concat_elements_utf8( let mut output_values = 
BufferBuilder::::new( left_values.len() + right_values.len() - - left_offsets[0].to_usize().unwrap() - - right_offsets[0].to_usize().unwrap(), + - left_offsets[0].as_usize() + - right_offsets[0].as_usize(), ); let mut output_offsets = BufferBuilder::::new(left_offsets.len()); output_offsets.append(Offset::zero()); for (left_idx, right_idx) in left_offsets.windows(2).zip(right_offsets.windows(2)) { + output_values + .append_slice(&left_values[left_idx[0].as_usize()..left_idx[1].as_usize()]); output_values.append_slice( - &left_values - [left_idx[0].to_usize().unwrap()..left_idx[1].to_usize().unwrap()], - ); - output_values.append_slice( - &right_values - [right_idx[0].to_usize().unwrap()..right_idx[1].to_usize().unwrap()], + &right_values[right_idx[0].as_usize()..right_idx[1].as_usize()], ); output_offsets.append(Offset::from_usize(output_values.len()).unwrap()); } @@ -137,7 +134,7 @@ pub fn concat_elements_utf8_many( data_values .iter() .zip(offsets.iter_mut()) - .map(|(data, offset)| data.len() - offset.peek().unwrap().to_usize().unwrap()) + .map(|(data, offset)| data.len() - offset.peek().unwrap().as_usize()) .sum(), ); @@ -148,8 +145,8 @@ pub fn concat_elements_utf8_many( .iter() .zip(offsets.iter_mut()) .for_each(|(values, offset)| { - let index_start = offset.next().unwrap().to_usize().unwrap(); - let index_end = offset.peek().unwrap().to_usize().unwrap(); + let index_start = offset.next().unwrap().as_usize(); + let index_end = offset.peek().unwrap().as_usize(); output_values.append_slice(&values[index_start..index_end]); }); output_offsets.append(Offset::from_usize(output_values.len()).unwrap()); diff --git a/arrow/src/compute/kernels/filter.rs b/arrow/src/compute/kernels/filter.rs index 291324e65a85..d528b0632486 100644 --- a/arrow/src/compute/kernels/filter.rs +++ b/arrow/src/compute/kernels/filter.rs @@ -568,7 +568,7 @@ where /// Returns the byte offset at `idx` #[inline] fn get_value_offset(&self, idx: usize) -> usize { - self.src_offsets[idx].to_usize().expect("illegal offset") + self.src_offsets[idx].as_usize() } /// Returns the start and end of the value at index `idx` along with its length diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 0bc2d39481e3..5eaed4bc62bc 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -565,7 +565,7 @@ where .into_iter() .map(|index| { let key: K::Native = keys.value(index as usize); - let value_order = value_indices_map.get(&key.to_usize().unwrap()).unwrap(); + let value_order = value_indices_map.get(&key.as_usize()).unwrap(); (index, *value_order) }) .collect::>(); @@ -690,7 +690,7 @@ fn sort_string_dictionary( .into_iter() .map(|index| { let key: T::Native = keys.value(index as usize); - let value_order = value_indices_map.get(&key.to_usize().unwrap()).unwrap(); + let value_order = value_indices_map.get(&key.as_usize()).unwrap(); (index, *value_order) }) .collect::>(); @@ -1277,7 +1277,7 @@ mod tests { (0..sorted.len()) .map(|i| { if sorted.is_valid(i) { - Some(sorted_dict.value(sorted_keys.value(i).to_usize().unwrap())) + Some(sorted_dict.value(sorted_keys.value(i).as_usize())) } else { None } @@ -1329,7 +1329,7 @@ mod tests { let sorted_values: PrimitiveArray = From::>>::from( (0..sorted.len()) .map(|i| { - let key = sorted_keys.value(i).to_usize().unwrap(); + let key = sorted_keys.value(i).as_usize(); if sorted.is_valid(i) && sorted_dict.is_valid(key) { Some(sorted_dict.value(key)) } else { diff --git a/arrow/src/compute/kernels/substring.rs 
b/arrow/src/compute/kernels/substring.rs index 5190d0bf0b67..f52ddb3bc30b 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow/src/compute/kernels/substring.rs @@ -280,14 +280,13 @@ fn binary_substring( }); // concatenate substrings into a buffer - let mut new_values = - MutableBuffer::new(new_offsets.last().unwrap().to_usize().unwrap()); + let mut new_values = MutableBuffer::new(new_offsets.last().unwrap().as_usize()); new_starts_ends .iter() .map(|(start, end)| { - let start = start.to_usize().unwrap(); - let end = end.to_usize().unwrap(); + let start = start.as_usize(); + let end = end.as_usize(); &data[start..end] }) .for_each(|slice| new_values.extend_from_slice(slice)); @@ -375,7 +374,7 @@ fn utf8_substring( // Safety: a StringArray must contain valid UTF8 data let data_str = unsafe { std::str::from_utf8_unchecked(data) }; |offset: OffsetSize| { - let offset_usize = offset.to_usize().unwrap(); + let offset_usize = offset.as_usize(); if data_str.is_char_boundary(offset_usize) { Ok(offset) } else { @@ -411,14 +410,13 @@ fn utf8_substring( })?; // concatenate substrings into a buffer - let mut new_values = - MutableBuffer::new(new_offsets.last().unwrap().to_usize().unwrap()); + let mut new_values = MutableBuffer::new(new_offsets.last().unwrap().as_usize()); new_starts_ends .iter() .map(|(start, end)| { - let start = start.to_usize().unwrap(); - let end = end.to_usize().unwrap(); + let start = start.as_usize(); + let end = end.as_usize(); &data[start..end] }) .for_each(|slice| new_values.extend_from_slice(slice)); diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 8f1aab27b534..1aa4473c0444 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -190,7 +190,7 @@ where .iter() .map(|index| { if let Some(index) = index { - struct_.is_valid(ArrowNativeType::to_usize(&index).unwrap()) + struct_.is_valid(index.to_usize().unwrap()) } else { false } diff --git a/arrow/src/util/reader_parser.rs b/arrow/src/util/reader_parser.rs index 91b362df86fd..60082e8dd551 100644 --- a/arrow/src/util/reader_parser.rs +++ b/arrow/src/util/reader_parser.rs @@ -97,20 +97,20 @@ impl Parser for Date32Type { fn parse(string: &str) -> Option { use chrono::Datelike; let date = string.parse::().ok()?; - Self::Native::from_i32(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) } fn parse_formatted(string: &str, format: &str) -> Option { use chrono::Datelike; let date = chrono::NaiveDate::parse_from_str(string, format).ok()?; - Self::Native::from_i32(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) } } impl Parser for Date64Type { fn parse(string: &str) -> Option { let date_time = string.parse::().ok()?; - Self::Native::from_i64(date_time.timestamp_millis()) + Some(date_time.timestamp_millis()) } fn parse_formatted(string: &str, format: &str) -> Option { @@ -132,10 +132,10 @@ impl Parser for Date64Type { }); if has_zone { let date_time = chrono::DateTime::parse_from_str(string, format).ok()?; - Self::Native::from_i64(date_time.timestamp_millis()) + Some(date_time.timestamp_millis()) } else { let date_time = chrono::NaiveDateTime::parse_from_str(string, format).ok()?; - Self::Native::from_i64(date_time.timestamp_millis()) + Some(date_time.timestamp_millis()) } } } diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 49f997ac81ff..f5e26a7281b7 100644 --- 
a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -276,8 +276,8 @@ impl LevelInfoBuilder { // TODO: Faster bitmask iteration (#1757) for (idx, w) in offsets.windows(2).enumerate() { let is_valid = nulls.is_set(idx + null_offset); - let start_idx = w[0].to_usize().unwrap(); - let end_idx = w[1].to_usize().unwrap(); + let start_idx = w[0].as_usize(); + let end_idx = w[1].as_usize(); if !is_valid { write_null_slice(child) } else if start_idx == end_idx { @@ -289,8 +289,8 @@ impl LevelInfoBuilder { } None => { for w in offsets.windows(2) { - let start_idx = w[0].to_usize().unwrap(); - let end_idx = w[1].to_usize().unwrap(); + let start_idx = w[0].as_usize(); + let end_idx = w[1].as_usize(); if start_idx == end_idx { write_empty_slice(child) } else { diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs index 2d73e3f146b6..48eb70137392 100644 --- a/parquet/src/arrow/buffer/offset_buffer.rs +++ b/parquet/src/arrow/buffer/offset_buffer.rs @@ -95,15 +95,15 @@ impl OffsetBuffer { dict_values: &[u8], ) -> Result<()> { for key in keys { - let index = key.to_usize().unwrap(); + let index = key.as_usize(); if index + 1 >= dict_offsets.len() { return Err(general_err!( "dictionary key beyond bounds of dictionary: 0..{}", dict_offsets.len().saturating_sub(1) )); } - let start_offset = dict_offsets[index].to_usize().unwrap(); - let end_offset = dict_offsets[index + 1].to_usize().unwrap(); + let start_offset = dict_offsets[index].as_usize(); + let end_offset = dict_offsets[index + 1].as_usize(); // Dictionary values are verified when decoding dictionary page self.try_push(&dict_values[start_offset..end_offset], false)?; @@ -167,7 +167,7 @@ impl BufferQueue for OffsetBuffer { Self { offsets: std::mem::replace(&mut self.offsets, new_offsets), - values: self.values.take(end_offset.to_usize().unwrap()), + values: self.values.take(end_offset.as_usize()), } } From f845d6e7b82dbcdfac31266ed91111d2c5a7eb69 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 29 Sep 2022 07:33:31 +0100 Subject: [PATCH 0086/1411] Handle incomplete HTTP redirects missing LOCATION (#2795) (#2796) --- object_store/src/client/retry.rs | 119 ++++++++++++++++++++++++------- 1 file changed, 92 insertions(+), 27 deletions(-) diff --git a/object_store/src/client/retry.rs b/object_store/src/client/retry.rs index d66628aec458..cee86b3442ca 100644 --- a/object_store/src/client/retry.rs +++ b/object_store/src/client/retry.rs @@ -20,49 +20,62 @@ use crate::client::backoff::{Backoff, BackoffConfig}; use futures::future::BoxFuture; use futures::FutureExt; +use reqwest::header::LOCATION; use reqwest::{Response, StatusCode}; -use snafu::Snafu; use std::time::{Duration, Instant}; use tracing::info; /// Retry request error -#[derive(Debug, Snafu)] -#[snafu(display( - "response error \"{}\", after {} retries: {}", - message, - retries, - source -))] +#[derive(Debug)] pub struct Error { retries: usize, message: String, - source: reqwest::Error, + source: Option, +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "response error \"{}\", after {} retries", + self.message, self.retries + )?; + if let Some(source) = &self.source { + write!(f, ": {}", source)?; + } + Ok(()) + } +} + +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + self.source.as_ref().map(|e| e as _) + } } impl Error { 
/// Returns the status code associated with this error if any pub fn status(&self) -> Option { - self.source.status() + self.source.as_ref().and_then(|e| e.status()) } } impl From for std::io::Error { fn from(err: Error) -> Self { use std::io::ErrorKind; - if err.source.is_builder() || err.source.is_request() { - Self::new(ErrorKind::InvalidInput, err) - } else if let Some(s) = err.source.status() { - match s { - StatusCode::NOT_FOUND => Self::new(ErrorKind::NotFound, err), - StatusCode::BAD_REQUEST => Self::new(ErrorKind::InvalidInput, err), - _ => Self::new(ErrorKind::Other, err), + match (&err.source, err.status()) { + (Some(source), _) if source.is_builder() || source.is_request() => { + Self::new(ErrorKind::InvalidInput, err) + } + (_, Some(StatusCode::NOT_FOUND)) => Self::new(ErrorKind::NotFound, err), + (_, Some(StatusCode::BAD_REQUEST)) => Self::new(ErrorKind::InvalidInput, err), + (Some(source), None) if source.is_timeout() => { + Self::new(ErrorKind::TimedOut, err) + } + (Some(source), None) if source.is_connect() => { + Self::new(ErrorKind::NotConnected, err) } - } else if err.source.is_timeout() { - Self::new(ErrorKind::TimedOut, err) - } else if err.source.is_connect() { - Self::new(ErrorKind::NotConnected, err) - } else { - Self::new(ErrorKind::Other, err) + _ => Self::new(ErrorKind::Other, err), } } } @@ -131,7 +144,21 @@ impl RetryExt for reqwest::RequestBuilder { let s = self.try_clone().expect("request body must be cloneable"); match s.send().await { Ok(r) => match r.error_for_status_ref() { - Ok(_) => return Ok(r), + Ok(_) if r.status().is_success() => return Ok(r), + Ok(r) => { + let is_bare_redirect = r.status().is_redirection() && !r.headers().contains_key(LOCATION); + let message = match is_bare_redirect { + true => "Received redirect without LOCATION, this normally indicates an incorrectly configured region".to_string(), + // Not actually sure if this is reachable, but here for completeness + false => format!("request unsuccessful: {}", r.status()), + }; + + return Err(Error{ + message, + retries, + source: None, + }) + } Err(e) => { let status = r.status(); @@ -152,7 +179,7 @@ impl RetryExt for reqwest::RequestBuilder { return Err(Error{ message, retries, - source: e, + source: Some(e), }) } @@ -168,7 +195,7 @@ impl RetryExt for reqwest::RequestBuilder { return Err(Error{ retries, message: "request error".to_string(), - source: e + source: Some(e) }) } } @@ -253,7 +280,7 @@ mod tests { let r = do_request().await.unwrap(); assert_eq!(r.status(), StatusCode::NO_CONTENT); - // Follows redirects + // Follows 402 redirects mock.push( Response::builder() .status(StatusCode::FOUND) @@ -266,6 +293,44 @@ mod tests { assert_eq!(r.status(), StatusCode::OK); assert_eq!(r.url().path(), "/foo"); + // Follows 401 redirects + mock.push( + Response::builder() + .status(StatusCode::FOUND) + .header(LOCATION, "/bar") + .body(Body::empty()) + .unwrap(), + ); + + let r = do_request().await.unwrap(); + assert_eq!(r.status(), StatusCode::OK); + assert_eq!(r.url().path(), "/bar"); + + // Handles redirect loop + for _ in 0..10 { + mock.push( + Response::builder() + .status(StatusCode::FOUND) + .header(LOCATION, "/bar") + .body(Body::empty()) + .unwrap(), + ); + } + + let e = do_request().await.unwrap_err().to_string(); + assert!(e.ends_with("too many redirects"), "{}", e); + + // Handles redirect missing location + mock.push( + Response::builder() + .status(StatusCode::FOUND) + .body(Body::empty()) + .unwrap(), + ); + + let e = do_request().await.unwrap_err(); + assert_eq!(e.message, 
"Received redirect without LOCATION, this normally indicates an incorrectly configured region"); + // Gives up after the retrying the specified number of times for _ in 0..=retry.max_retries { mock.push( From da8f7427c27f0d26092d0536d1343421250fc4cf Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 29 Sep 2022 15:46:03 +0100 Subject: [PATCH 0087/1411] Fix S3 query canonicalization (#2800) (#2801) * Fix S3 query canonicalization (#2800) * Disable listing with spaces on azurite and localstack --- object_store/src/aws/client.rs | 16 ++----------- object_store/src/aws/credential.rs | 37 +++++++++++++++++++++++++++++- object_store/src/aws/mod.rs | 20 ++++++++++++++-- object_store/src/azure/mod.rs | 15 +++++------- object_store/src/lib.rs | 22 ++++++++++++++++++ object_store/src/path/mod.rs | 11 +++++++++ 6 files changed, 95 insertions(+), 26 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index f800fec3dc5d..5ec9390ec898 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -16,6 +16,7 @@ // under the License. use crate::aws::credential::{AwsCredential, CredentialExt, CredentialProvider}; +use crate::aws::STRICT_PATH_ENCODE_SET; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::multipart::UploadPart; @@ -26,26 +27,13 @@ use crate::{ }; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; -use percent_encoding::{utf8_percent_encode, AsciiSet, PercentEncode, NON_ALPHANUMERIC}; +use percent_encoding::{utf8_percent_encode, PercentEncode}; use reqwest::{Client as ReqwestClient, Method, Response, StatusCode}; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::ops::Range; use std::sync::Arc; -// http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html -// -// Do not URI-encode any of the unreserved characters that RFC 3986 defines: -// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). -const STRICT_ENCODE_SET: AsciiSet = NON_ALPHANUMERIC - .remove(b'-') - .remove(b'.') - .remove(b'_') - .remove(b'~'); - -/// This struct is used to maintain the URI path encoding -const STRICT_PATH_ENCODE_SET: AsciiSet = STRICT_ENCODE_SET.remove(b'/'); - /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index 1abf42be9103..d4461645f3c3 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use crate::aws::STRICT_ENCODE_SET; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; use crate::util::hmac_sha256; @@ -22,6 +23,7 @@ use crate::{Result, RetryConfig}; use bytes::Buf; use chrono::{DateTime, Utc}; use futures::TryFutureExt; +use percent_encoding::utf8_percent_encode; use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, Method, Request, RequestBuilder, StatusCode}; use serde::Deserialize; @@ -29,6 +31,7 @@ use std::collections::BTreeMap; use std::sync::Arc; use std::time::Instant; use tracing::warn; +use url::Url; type StdError = Box; @@ -103,13 +106,14 @@ impl<'a> RequestSigner<'a> { request.headers_mut().insert(HASH_HEADER, header_digest); let (signed_headers, canonical_headers) = canonicalize_headers(request.headers()); + let canonical_query = canonicalize_query(request.url()); // https://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html let canonical_request = format!( "{}\n{}\n{}\n{}\n{}\n{}", request.method().as_str(), request.url().path(), // S3 doesn't percent encode this like other services - request.url().query().unwrap_or(""), // This assumes the query pairs are in order + canonical_query, canonical_headers, signed_headers, digest @@ -207,6 +211,37 @@ fn hex_encode(bytes: &[u8]) -> String { out } +/// Canonicalizes query parameters into the AWS canonical form +/// +/// +fn canonicalize_query(url: &Url) -> String { + use std::fmt::Write; + + let capacity = match url.query() { + Some(q) if !q.is_empty() => q.len(), + _ => return String::new(), + }; + let mut encoded = String::with_capacity(capacity + 1); + + let mut headers = url.query_pairs().collect::>(); + headers.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); + + let mut first = true; + for (k, v) in headers { + if !first { + encoded.push('&'); + } + first = false; + let _ = write!( + encoded, + "{}={}", + utf8_percent_encode(k.as_ref(), &STRICT_ENCODE_SET), + utf8_percent_encode(v.as_ref(), &STRICT_ENCODE_SET) + ); + } + encoded +} + /// Canonicalizes headers into the AWS Canonical Form. /// /// diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index d1d0a12cdaf9..d186c7f47e36 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -58,6 +58,20 @@ use crate::{ mod client; mod credential; +// http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html +// +// Do not URI-encode any of the unreserved characters that RFC 3986 defines: +// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). 
+pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet = + percent_encoding::NON_ALPHANUMERIC + .remove(b'-') + .remove(b'.') + .remove(b'_') + .remove(b'~'); + +/// This struct is used to maintain the URI path encoding +const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.remove(b'/'); + /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -551,7 +565,7 @@ mod tests { use super::*; use crate::tests::{ get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, stream_get, + put_get_delete_list_opts, rename_and_copy, stream_get, }; use bytes::Bytes; use std::env; @@ -677,9 +691,11 @@ mod tests { #[tokio::test] async fn s3_test() { let config = maybe_skip_integration!(); + let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); let integration = config.build().unwrap(); - put_get_delete_list(&integration).await; + // Localstack doesn't support listing with spaces https://github.com/localstack/localstack/issues/6328 + put_get_delete_list_opts(&integration, is_local).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index dd1cde9c7a2a..f7ca4cf4e8c4 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -595,7 +595,7 @@ mod tests { use super::*; use crate::tests::{ copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, stream_get, + put_get_delete_list, put_get_delete_list_opts, rename_and_copy, stream_get, }; use std::env; @@ -663,9 +663,10 @@ mod tests { #[tokio::test] async fn azure_blob_test() { + let use_emulator = env::var("AZURE_USE_EMULATOR").is_ok(); let integration = maybe_skip_integration!().build().unwrap(); - - put_get_delete_list(&integration).await; + // Azurite doesn't support listing with spaces - https://github.com/localstack/localstack/issues/6328 + put_get_delete_list_opts(&integration, use_emulator).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; @@ -687,13 +688,9 @@ mod tests { .with_container_name( env::var("OBJECT_STORE_BUCKET").expect("must be set OBJECT_STORE_BUCKET"), ) - .with_client_secret_authorization( - env::var("AZURE_STORAGE_CLIENT_ID") + .with_access_key( + env::var("AZURE_STORAGE_ACCESS_KEY") .expect("must be set AZURE_STORAGE_CLIENT_ID"), - env::var("AZURE_STORAGE_CLIENT_SECRET") - .expect("must be set AZURE_STORAGE_CLIENT_SECRET"), - env::var("AZURE_STORAGE_TENANT_ID") - .expect("must be set AZURE_STORAGE_TENANT_ID"), ); let integration = builder.build().unwrap(); diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 16f0c6f3a2aa..5eaaabaf2944 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -506,6 +506,13 @@ mod tests { use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { + put_get_delete_list_opts(storage, false).await + } + + pub(crate) async fn put_get_delete_list_opts( + storage: &DynObjectStore, + skip_list_with_spaces: bool, + ) { delete_fixtures(storage).await; let content_list = flatten_list_stream(storage, None).await.unwrap(); @@ -701,6 +708,21 @@ mod tests { assert_eq!(files, vec![path.clone()]); storage.delete(&path).await.unwrap(); + + let path = Path::parse("foo 
bar/I contain spaces.parquet").unwrap(); + storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); + storage.head(&path).await.unwrap(); + + if !skip_list_with_spaces { + let files = flatten_list_stream(storage, Some(&Path::from("foo bar"))) + .await + .unwrap(); + assert_eq!(files, vec![path.clone()]); + } + storage.delete(&path).await.unwrap(); + + let files = flatten_list_stream(storage, None).await.unwrap(); + assert!(files.is_empty(), "{:?}", files); } fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index e5a7b6443bb1..80e0f792aa55 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -534,4 +534,15 @@ mod tests { needle ); } + + #[test] + fn path_containing_spaces() { + let a = Path::from_iter(["foo bar", "baz"]); + let b = Path::from("foo bar/baz"); + let c = Path::parse("foo bar/baz").unwrap(); + + assert_eq!(a.raw, "foo bar/baz"); + assert_eq!(a.raw, b.raw); + assert_eq!(b.raw, c.raw); + } } From e2bf158946e5d81912bc9166d87b86f0ad442afb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 29 Sep 2022 15:47:49 -0600 Subject: [PATCH 0088/1411] add field name to parquet PrimitiveTypeBuilder error messages (#2805) --- parquet/src/schema/types.rs | 78 +++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 33 deletions(-) diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index efb0b82b3230..da6419424490 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -281,8 +281,9 @@ impl<'a> PrimitiveTypeBuilder<'a> { // Check length before logical type, since it is used for logical type validation. if self.physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY && self.length < 0 { return Err(general_err!( - "Invalid FIXED_LEN_BYTE_ARRAY length: {}", - self.length + "Invalid FIXED_LEN_BYTE_ARRAY length: {} for field '{}'", + self.length, + self.name )); } @@ -295,9 +296,10 @@ impl<'a> PrimitiveTypeBuilder<'a> { != self.converted_type { return Err(general_err!( - "Logical type {:?} is imcompatible with converted type {}", + "Logical type {:?} is incompatible with converted type {} for field '{}'", logical_type, - self.converted_type + self.converted_type, + self.name )); } } else { @@ -308,8 +310,9 @@ impl<'a> PrimitiveTypeBuilder<'a> { match (logical_type, self.physical_type) { (LogicalType::Map, _) | (LogicalType::List, _) => { return Err(general_err!( - "{:?} cannot be applied to a primitive type", - logical_type + "{:?} cannot be applied to a primitive type for field '{}'", + logical_type, + self.name )); } (LogicalType::Enum, PhysicalType::BYTE_ARRAY) => {} @@ -317,16 +320,18 @@ impl<'a> PrimitiveTypeBuilder<'a> { // Check that scale and precision are consistent with legacy values if *scale != self.scale { return Err(general_err!( - "DECIMAL logical type scale {} must match self.scale {}", + "DECIMAL logical type scale {} must match self.scale {} for field '{}'", scale, - self.scale + self.scale, + self.name )); } if *precision != self.precision { return Err(general_err!( - "DECIMAL logical type precision {} must match self.precision {}", + "DECIMAL logical type precision {} must match self.precision {} for field '{}'", precision, - self.precision + self.precision, + self.name )); } self.check_decimal_precision_scale()?; @@ -342,7 +347,8 @@ impl<'a> PrimitiveTypeBuilder<'a> { (LogicalType::Time { unit, .. 
}, PhysicalType::INT64) => { if *unit == TimeUnit::MILLIS(Default::default()) { return Err(general_err!( - "Cannot use millisecond unit on INT64 type" + "Cannot use millisecond unit on INT64 type for field '{}'", + self.name )); } } @@ -359,9 +365,10 @@ impl<'a> PrimitiveTypeBuilder<'a> { (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {} (a, b) => { return Err(general_err!( - "Cannot annotate {:?} from {} fields", + "Cannot annotate {:?} from {} for field '{}'", a, - b + b, + self.name )) } } @@ -374,8 +381,9 @@ impl<'a> PrimitiveTypeBuilder<'a> { ConvertedType::UTF8 | ConvertedType::BSON | ConvertedType::JSON => { if self.physical_type != PhysicalType::BYTE_ARRAY { return Err(general_err!( - "{} can only annotate BYTE_ARRAY fields", - self.converted_type + "{} cannot annotate field '{}' because it is not a BYTE_ARRAY field", + self.converted_type, + self.name )); } } @@ -392,8 +400,9 @@ impl<'a> PrimitiveTypeBuilder<'a> { | ConvertedType::INT_32 => { if self.physical_type != PhysicalType::INT32 { return Err(general_err!( - "{} can only annotate INT32", - self.converted_type + "{} cannot annotate field '{}' because it is not a INT32 field", + self.converted_type, + self.name )); } } @@ -404,8 +413,9 @@ impl<'a> PrimitiveTypeBuilder<'a> { | ConvertedType::INT_64 => { if self.physical_type != PhysicalType::INT64 { return Err(general_err!( - "{} can only annotate INT64", - self.converted_type + "{} cannot annotate field '{}' because it is not a INT64 field", + self.converted_type, + self.name )); } } @@ -414,19 +424,21 @@ impl<'a> PrimitiveTypeBuilder<'a> { || self.length != 12 { return Err(general_err!( - "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)" + "INTERVAL cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(12) field", + self.name )); } } ConvertedType::ENUM => { if self.physical_type != PhysicalType::BYTE_ARRAY { - return Err(general_err!("ENUM can only annotate BYTE_ARRAY fields")); + return Err(general_err!("ENUM cannot annotate field '{}' because it is not a BYTE_ARRAY field", self.name)); } } _ => { return Err(general_err!( - "{} cannot be applied to a primitive type", - self.converted_type + "{} cannot be applied to primitive field '{}'", + self.converted_type, + self.name )); } } @@ -1258,7 +1270,7 @@ mod tests { if let Err(e) = result { assert_eq!( format!("{}", e), - "Parquet error: Cannot annotate Integer { bit_width: 8, is_signed: true } from INT64 fields" + "Parquet error: Cannot annotate Integer { bit_width: 8, is_signed: true } from INT64 for field 'foo'" ); } @@ -1271,7 +1283,7 @@ mod tests { if let Err(e) = result { assert_eq!( format!("{}", e), - "Parquet error: BSON can only annotate BYTE_ARRAY fields" + "Parquet error: BSON cannot annotate field 'foo' because it is not a BYTE_ARRAY field" ); } @@ -1302,7 +1314,7 @@ mod tests { if let Err(e) = result { assert_eq!( format!("{}", e), - "Parquet error: DECIMAL logical type scale 32 must match self.scale -1" + "Parquet error: DECIMAL logical type scale 32 must match self.scale -1 for field 'foo'" ); } @@ -1419,7 +1431,7 @@ mod tests { if let Err(e) = result { assert_eq!( format!("{}", e), - "Parquet error: UINT_8 can only annotate INT32" + "Parquet error: UINT_8 cannot annotate field 'foo' because it is not a INT32 field" ); } @@ -1431,7 +1443,7 @@ mod tests { if let Err(e) = result { assert_eq!( format!("{}", e), - "Parquet error: TIME_MICROS can only annotate INT64" + "Parquet error: TIME_MICROS cannot annotate field 'foo' because it is not a INT64 field" ); } @@ -1443,7 +1455,7 @@ 
mod tests { if let Err(e) = result { assert_eq!( format!("{}", e), - "Parquet error: INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)" + "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field" ); } @@ -1456,7 +1468,7 @@ mod tests { if let Err(e) = result { assert_eq!( format!("{}", e), - "Parquet error: INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)" + "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field" ); } @@ -1468,7 +1480,7 @@ mod tests { if let Err(e) = result { assert_eq!( format!("{}", e), - "Parquet error: ENUM can only annotate BYTE_ARRAY fields" + "Parquet error: ENUM cannot annotate field 'foo' because it is not a BYTE_ARRAY field" ); } @@ -1480,7 +1492,7 @@ mod tests { if let Err(e) = result { assert_eq!( format!("{}", e), - "Parquet error: MAP cannot be applied to a primitive type" + "Parquet error: MAP cannot be applied to primitive field 'foo'" ); } @@ -1493,7 +1505,7 @@ mod tests { if let Err(e) = result { assert_eq!( format!("{}", e), - "Parquet error: Invalid FIXED_LEN_BYTE_ARRAY length: -1" + "Parquet error: Invalid FIXED_LEN_BYTE_ARRAY length: -1 for field 'foo'" ); } } From 6477db16ca051b656bc0fe1f11bca85a23dbcd14 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Fri, 30 Sep 2022 15:05:56 -0400 Subject: [PATCH 0089/1411] Prepare for `24.0.0` (#2808) * Create changelog * Update version * Update instructions to reflect label_issues.py needs admin * Update release instructions with notes about the github token * Update changelog * update log Co-authored-by: Ian Joiner Co-authored-by: Andrew Lamb --- CHANGELOG-old.md | 103 +++++++++++++++ CHANGELOG.md | 126 ++++++------------- arrow-array/Cargo.toml | 8 +- arrow-buffer/Cargo.toml | 2 +- arrow-data/Cargo.toml | 6 +- arrow-flight/Cargo.toml | 4 +- arrow-flight/README.md | 2 +- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow-schema/Cargo.toml | 2 +- arrow/Cargo.toml | 10 +- arrow/README.md | 4 +- dev/release/README.md | 11 +- dev/release/update_change_log.sh | 6 +- integration-testing/Cargo.toml | 2 +- parquet/Cargo.toml | 6 +- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 18 files changed, 187 insertions(+), 123 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 02cb7ec2449e..3305a6cfd2af 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,109 @@ # Historical Changelog +## [24.0.0](https://github.com/apache/arrow-rs/tree/24.0.0) (2022-09-16) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/22.0.0...24.0.0) + +**Breaking changes:** + +- Move JSON Test Format To integration-testing [\#2724](https://github.com/apache/arrow-rs/pull/2724) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-buffer crate \(\#2594\) [\#2693](https://github.com/apache/arrow-rs/pull/2693) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Simplify DictionaryBuilder constructors \(\#2684\) \(\#2054\) [\#2685](https://github.com/apache/arrow-rs/pull/2685) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Deprecate RecordBatch::concat replace with concat\_batches \(\#2594\) [\#2683](https://github.com/apache/arrow-rs/pull/2683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) +- Add overflow-checking variant for primitive arithmetic kernels and explicitly define overflow behavior [\#2643](https://github.com/apache/arrow-rs/pull/2643) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update thrift v0.16 and vendor parquet-format \(\#2502\) [\#2626](https://github.com/apache/arrow-rs/pull/2626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update flight definitions including backwards-incompatible change to GetSchema [\#2586](https://github.com/apache/arrow-rs/pull/2586) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([liukun4515](https://github.com/liukun4515)) + +**Implemented enhancements:** + +- Cleanup like and nlike utf8 kernels [\#2744](https://github.com/apache/arrow-rs/issues/2744) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Speedup eq and neq kernels for utf8 arrays [\#2742](https://github.com/apache/arrow-rs/issues/2742) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- API for more ergonomic construction of `RecordBatchOptions` [\#2728](https://github.com/apache/arrow-rs/issues/2728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Automate updates to `CHANGELOG-old.md` [\#2726](https://github.com/apache/arrow-rs/issues/2726) +- Don't check the `DivideByZero` error for float modulus [\#2720](https://github.com/apache/arrow-rs/issues/2720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `try_binary` should not panic on unequaled array length. [\#2715](https://github.com/apache/arrow-rs/issues/2715) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add benchmark for bitwise operation [\#2714](https://github.com/apache/arrow-rs/issues/2714) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add overflow-checking variants of arithmetic scalar dyn kernels [\#2712](https://github.com/apache/arrow-rs/issues/2712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add divide\_opt kernel which produce null values on division by zero error [\#2709](https://github.com/apache/arrow-rs/issues/2709) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `DataType` function to detect nested types [\#2704](https://github.com/apache/arrow-rs/issues/2704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support of sorting dictionary of other primitive types [\#2700](https://github.com/apache/arrow-rs/issues/2700) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Sort indices of dictionary string values [\#2697](https://github.com/apache/arrow-rs/issues/2697) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support empty projection in `RecordBatch::project` [\#2690](https://github.com/apache/arrow-rs/issues/2690) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support sorting dictionary encoded primitive integer arrays [\#2679](https://github.com/apache/arrow-rs/issues/2679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use BitIndexIterator in min\_max\_helper [\#2674](https://github.com/apache/arrow-rs/issues/2674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support building comparator for dictionaries of primitive integer values [\#2672](https://github.com/apache/arrow-rs/issues/2672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- 
Change max/min string macro to generic helper function `min_max_helper` [\#2657](https://github.com/apache/arrow-rs/issues/2657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add overflow-checking variant of arithmetic scalar kernels [\#2651](https://github.com/apache/arrow-rs/issues/2651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Compare dictionary with binary array [\#2644](https://github.com/apache/arrow-rs/issues/2644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add overflow-checking variant for primitive arithmetic kernels [\#2642](https://github.com/apache/arrow-rs/issues/2642) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use `downcast_primitive_array` in arithmetic kernels [\#2639](https://github.com/apache/arrow-rs/issues/2639) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support DictionaryArray in temporal kernels [\#2622](https://github.com/apache/arrow-rs/issues/2622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Inline Generated Thift Code Into Parquet Crate [\#2502](https://github.com/apache/arrow-rs/issues/2502) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Fixed bugs:** + +- Escape contains patterns for utf8 like kernels [\#2745](https://github.com/apache/arrow-rs/issues/2745) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Float Array should not panic on `DivideByZero` in the `Divide` kernel [\#2719](https://github.com/apache/arrow-rs/issues/2719) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- DictionaryBuilders can Create Invalid DictionaryArrays [\#2684](https://github.com/apache/arrow-rs/issues/2684) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `arrow` crate does not build with `features = ["ffi"]` and `default_features = false`. 
[\#2670](https://github.com/apache/arrow-rs/issues/2670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Invalid results with `RowSelector` having `row_count` of 0 [\#2669](https://github.com/apache/arrow-rs/issues/2669) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- clippy error: unresolved import `crate::array::layout` [\#2659](https://github.com/apache/arrow-rs/issues/2659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cast the numeric without the `CastOptions` [\#2648](https://github.com/apache/arrow-rs/issues/2648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Explicitly define overflow behavior for primitive arithmetic kernels [\#2641](https://github.com/apache/arrow-rs/issues/2641) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- update the `flight.proto` and fix schema to SchemaResult [\#2571](https://github.com/apache/arrow-rs/issues/2571) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Panic when first data page is skipped using ColumnChunkData::Sparse [\#2543](https://github.com/apache/arrow-rs/issues/2543) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `SchemaResult` in IPC deviates from other implementations [\#2445](https://github.com/apache/arrow-rs/issues/2445) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Closed issues:** + +- Implement collect for int values [\#2696](https://github.com/apache/arrow-rs/issues/2696) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Speedup string equal/not equal to empty string, cleanup like/ilike kernels, fix escape bug [\#2743](https://github.com/apache/arrow-rs/pull/2743) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Partially flatten arrow-buffer [\#2737](https://github.com/apache/arrow-rs/pull/2737) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Automate updates to `CHANGELOG-old.md` [\#2732](https://github.com/apache/arrow-rs/pull/2732) ([iajoiner](https://github.com/iajoiner)) +- Update read parquet example in parquet/arrow home [\#2730](https://github.com/apache/arrow-rs/pull/2730) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([datapythonista](https://github.com/datapythonista)) +- Better construction of RecordBatchOptions [\#2729](https://github.com/apache/arrow-rs/pull/2729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- benchmark: bitwise operation [\#2718](https://github.com/apache/arrow-rs/pull/2718) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Update `try_binary` and `checked_ops`, and remove `math_checked_op` [\#2717](https://github.com/apache/arrow-rs/pull/2717) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Support bitwise op in kernel: or,xor,not [\#2716](https://github.com/apache/arrow-rs/pull/2716) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Add overflow-checking variants of arithmetic scalar dyn kernels [\#2713](https://github.com/apache/arrow-rs/pull/2713) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([viirya](https://github.com/viirya)) +- Add divide\_opt kernel which produce null values on division by zero error [\#2710](https://github.com/apache/arrow-rs/pull/2710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add DataType::is\_nested\(\) [\#2707](https://github.com/apache/arrow-rs/pull/2707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kfastov](https://github.com/kfastov)) +- Update criterion requirement from 0.3 to 0.4 [\#2706](https://github.com/apache/arrow-rs/pull/2706) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support bitwise and operation in the kernel [\#2703](https://github.com/apache/arrow-rs/pull/2703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Add support of sorting dictionary of other primitive arrays [\#2701](https://github.com/apache/arrow-rs/pull/2701) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Clarify docs of binary and string builders [\#2699](https://github.com/apache/arrow-rs/pull/2699) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([datapythonista](https://github.com/datapythonista)) +- Sort indices of dictionary string values [\#2698](https://github.com/apache/arrow-rs/pull/2698) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add support for empty projection in RecordBatch::project [\#2691](https://github.com/apache/arrow-rs/pull/2691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Temporarily disable Golang integration tests re-enable JS [\#2689](https://github.com/apache/arrow-rs/pull/2689) ([tustvold](https://github.com/tustvold)) +- Verify valid UTF-8 when converting byte array \(\#2205\) [\#2686](https://github.com/apache/arrow-rs/pull/2686) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support sorting dictionary encoded primitive integer arrays [\#2680](https://github.com/apache/arrow-rs/pull/2680) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Skip RowSelectors with zero rows [\#2678](https://github.com/apache/arrow-rs/pull/2678) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([askoa](https://github.com/askoa)) +- Faster Null Path Selection in ArrayData Equality [\#2676](https://github.com/apache/arrow-rs/pull/2676) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dhruv9vats](https://github.com/dhruv9vats)) +- Use BitIndexIterator in min\_max\_helper [\#2675](https://github.com/apache/arrow-rs/pull/2675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support building comparator for dictionaries of primitive integer values [\#2673](https://github.com/apache/arrow-rs/pull/2673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- json feature always requires base64 feature [\#2668](https://github.com/apache/arrow-rs/pull/2668) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([eagletmt](https://github.com/eagletmt)) +- Add try\_unary, binary, try\_binary kernels ~90% faster [\#2666](https://github.com/apache/arrow-rs/pull/2666) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use downcast\_dictionary\_array in unary\_dyn [\#2663](https://github.com/apache/arrow-rs/pull/2663) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- optimize the `numeric_cast_with_error` [\#2661](https://github.com/apache/arrow-rs/pull/2661) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- ffi feature also requires layout [\#2660](https://github.com/apache/arrow-rs/pull/2660) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Change max/min string macro to generic helper function min\_max\_helper [\#2658](https://github.com/apache/arrow-rs/pull/2658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix flaky test `test_fuzz_async_reader_selection` [\#2656](https://github.com/apache/arrow-rs/pull/2656) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- MINOR: Ignore flaky test test\_fuzz\_async\_reader\_selection [\#2655](https://github.com/apache/arrow-rs/pull/2655) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- MutableBuffer::typed\_data - shared ref access to the typed slice [\#2652](https://github.com/apache/arrow-rs/pull/2652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([medwards](https://github.com/medwards)) +- Overflow-checking variant of arithmetic scalar kernels [\#2650](https://github.com/apache/arrow-rs/pull/2650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- support `CastOption` for casting numeric [\#2649](https://github.com/apache/arrow-rs/pull/2649) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Help LLVM vectorize comparison kernel ~50-80% faster [\#2646](https://github.com/apache/arrow-rs/pull/2646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support comparison between dictionary array and binary array [\#2645](https://github.com/apache/arrow-rs/pull/2645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Use `downcast_primitive_array` in arithmetic kernels [\#2640](https://github.com/apache/arrow-rs/pull/2640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fully qualifying parquet items [\#2638](https://github.com/apache/arrow-rs/pull/2638) ([dingxiangfei2009](https://github.com/dingxiangfei2009)) +- Support DictionaryArray in temporal kernels [\#2623](https://github.com/apache/arrow-rs/pull/2623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Comparable Row Format [\#2593](https://github.com/apache/arrow-rs/pull/2593) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix bug in page skipping [\#2552](https://github.com/apache/arrow-rs/pull/2552) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) + ## [22.0.0](https://github.com/apache/arrow-rs/tree/22.0.0) (2022-09-02) [Full Changelog](https://github.com/apache/arrow-rs/compare/21.0.0...22.0.0) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 4a063594dc99..9b334b699816 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,108 +19,62 @@ # Changelog -## [23.0.0](https://github.com/apache/arrow-rs/tree/23.0.0) (2022-09-16) +## [24.0.0](https://github.com/apache/arrow-rs/tree/24.0.0) (2022-09-30) -[Full Changelog](https://github.com/apache/arrow-rs/compare/22.0.0...23.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/23.0.0...24.0.0) **Breaking changes:** -- Move JSON Test Format To integration-testing [\#2724](https://github.com/apache/arrow-rs/pull/2724) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Split out arrow-buffer crate \(\#2594\) [\#2693](https://github.com/apache/arrow-rs/pull/2693) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Simplify DictionaryBuilder constructors \(\#2684\) \(\#2054\) [\#2685](https://github.com/apache/arrow-rs/pull/2685) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Deprecate RecordBatch::concat replace with concat\_batches \(\#2594\) [\#2683](https://github.com/apache/arrow-rs/pull/2683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add overflow-checking variant for primitive arithmetic kernels and explicitly define overflow behavior [\#2643](https://github.com/apache/arrow-rs/pull/2643) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Update thrift v0.16 and vendor parquet-format \(\#2502\) [\#2626](https://github.com/apache/arrow-rs/pull/2626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Update flight definitions including backwards-incompatible change to GetSchema [\#2586](https://github.com/apache/arrow-rs/pull/2586) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([liukun4515](https://github.com/liukun4515)) +- Cleanup `ArrowNativeType` \(\#1918\) [\#2793](https://github.com/apache/arrow-rs/pull/2793) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove `ArrowNativeType::FromStr` [\#2775](https://github.com/apache/arrow-rs/pull/2775) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out `arrow-array` crate \(\#2594\) [\#2769](https://github.com/apache/arrow-rs/pull/2769) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add `dyn_arith_dict` feature flag [\#2760](https://github.com/apache/arrow-rs/pull/2760) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out `arrow-data` into a separate crate [\#2746](https://github.com/apache/arrow-rs/pull/2746) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-schema \(\#2594\) [\#2711](https://github.com/apache/arrow-rs/pull/2711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Cleanup like and nlike utf8 kernels [\#2744](https://github.com/apache/arrow-rs/issues/2744) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Speedup eq and neq kernels for utf8 arrays [\#2742](https://github.com/apache/arrow-rs/issues/2742) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- API for more ergonomic construction of `RecordBatchOptions` [\#2728](https://github.com/apache/arrow-rs/issues/2728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Automate updates to `CHANGELOG-old.md` [\#2726](https://github.com/apache/arrow-rs/issues/2726) -- Don't check the `DivideByZero` error for float modulus [\#2720](https://github.com/apache/arrow-rs/issues/2720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `try_binary` should not panic on unequaled array length. [\#2715](https://github.com/apache/arrow-rs/issues/2715) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add benchmark for bitwise operation [\#2714](https://github.com/apache/arrow-rs/issues/2714) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add overflow-checking variants of arithmetic scalar dyn kernels [\#2712](https://github.com/apache/arrow-rs/issues/2712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add divide\_opt kernel which produce null values on division by zero error [\#2709](https://github.com/apache/arrow-rs/issues/2709) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add `DataType` function to detect nested types [\#2704](https://github.com/apache/arrow-rs/issues/2704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add support of sorting dictionary of other primitive types [\#2700](https://github.com/apache/arrow-rs/issues/2700) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Sort indices of dictionary string values [\#2697](https://github.com/apache/arrow-rs/issues/2697) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support empty projection in `RecordBatch::project` [\#2690](https://github.com/apache/arrow-rs/issues/2690) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support sorting dictionary encoded primitive integer arrays [\#2679](https://github.com/apache/arrow-rs/issues/2679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use BitIndexIterator in min\_max\_helper [\#2674](https://github.com/apache/arrow-rs/issues/2674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support building comparator for dictionaries of primitive integer values [\#2672](https://github.com/apache/arrow-rs/issues/2672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Change max/min string macro to generic helper function `min_max_helper` [\#2657](https://github.com/apache/arrow-rs/issues/2657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add overflow-checking variant of arithmetic scalar kernels [\#2651](https://github.com/apache/arrow-rs/issues/2651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Compare dictionary with binary array [\#2644](https://github.com/apache/arrow-rs/issues/2644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add overflow-checking variant for primitive arithmetic kernels [\#2642](https://github.com/apache/arrow-rs/issues/2642) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use `downcast_primitive_array` in arithmetic kernels [\#2639](https://github.com/apache/arrow-rs/issues/2639) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support DictionaryArray in temporal kernels 
[\#2622](https://github.com/apache/arrow-rs/issues/2622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Inline Generated Thift Code Into Parquet Crate [\#2502](https://github.com/apache/arrow-rs/issues/2502) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Include field name in Parquet PrimitiveTypeBuilder error messages [\#2804](https://github.com/apache/arrow-rs/issues/2804) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add PrimitiveArray::reinterpret\_cast [\#2785](https://github.com/apache/arrow-rs/issues/2785) +- BinaryBuilder and StringBuilder initialization parameters in struct\_builder may be wrong [\#2783](https://github.com/apache/arrow-rs/issues/2783) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add divide scalar dyn kernel which produces null for division by zero [\#2767](https://github.com/apache/arrow-rs/issues/2767) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add divide dyn kernel which produces null for division by zero [\#2763](https://github.com/apache/arrow-rs/issues/2763) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve performance of checked kernels on non-null data [\#2747](https://github.com/apache/arrow-rs/issues/2747) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add overflow-checking variants of arithmetic dyn kernels [\#2739](https://github.com/apache/arrow-rs/issues/2739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- The `binary` function should not panic on unequaled array length. [\#2721](https://github.com/apache/arrow-rs/issues/2721) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- Escape contains patterns for utf8 like kernels [\#2745](https://github.com/apache/arrow-rs/issues/2745) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Float Array should not panic on `DivideByZero` in the `Divide` kernel [\#2719](https://github.com/apache/arrow-rs/issues/2719) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- DictionaryBuilders can Create Invalid DictionaryArrays [\#2684](https://github.com/apache/arrow-rs/issues/2684) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `arrow` crate does not build with `features = ["ffi"]` and `default_features = false`. 
[\#2670](https://github.com/apache/arrow-rs/issues/2670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Invalid results with `RowSelector` having `row_count` of 0 [\#2669](https://github.com/apache/arrow-rs/issues/2669) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- clippy error: unresolved import `crate::array::layout` [\#2659](https://github.com/apache/arrow-rs/issues/2659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Cast the numeric without the `CastOptions` [\#2648](https://github.com/apache/arrow-rs/issues/2648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Explicitly define overflow behavior for primitive arithmetic kernels [\#2641](https://github.com/apache/arrow-rs/issues/2641) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- update the `flight.proto` and fix schema to SchemaResult [\#2571](https://github.com/apache/arrow-rs/issues/2571) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Panic when first data page is skipped using ColumnChunkData::Sparse [\#2543](https://github.com/apache/arrow-rs/issues/2543) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `SchemaResult` in IPC deviates from other implementations [\#2445](https://github.com/apache/arrow-rs/issues/2445) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- min compute kernel is incorrect with sliced buffers in arrow 23 [\#2779](https://github.com/apache/arrow-rs/issues/2779) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `try_unary_dict` should check value type of dictionary array [\#2754](https://github.com/apache/arrow-rs/issues/2754) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Closed issues:** -- Implement collect for int values [\#2696](https://github.com/apache/arrow-rs/issues/2696) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add back JSON import/export for schema [\#2762](https://github.com/apache/arrow-rs/issues/2762) +- null casting and coercion for Decimal128 [\#2761](https://github.com/apache/arrow-rs/issues/2761) +- Json decoder behavior changed from versions 21 to 21 and returns non-sensical num\_rows for RecordBatch [\#2722](https://github.com/apache/arrow-rs/issues/2722) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Release Arrow `23.0.0` \(next release after `22.0.0`\) [\#2665](https://github.com/apache/arrow-rs/issues/2665) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Merged pull requests:** -- Speedup string equal/not equal to empty string, cleanup like/ilike kernels, fix escape bug [\#2743](https://github.com/apache/arrow-rs/pull/2743) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Partially flatten arrow-buffer [\#2737](https://github.com/apache/arrow-rs/pull/2737) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Automate updates to `CHANGELOG-old.md` [\#2732](https://github.com/apache/arrow-rs/pull/2732) ([iajoiner](https://github.com/iajoiner)) -- Update read parquet example in parquet/arrow home [\#2730](https://github.com/apache/arrow-rs/pull/2730) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([datapythonista](https://github.com/datapythonista)) -- Better construction of RecordBatchOptions [\#2729](https://github.com/apache/arrow-rs/pull/2729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- benchmark: bitwise operation [\#2718](https://github.com/apache/arrow-rs/pull/2718) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- Update `try_binary` and `checked_ops`, and remove `math_checked_op` [\#2717](https://github.com/apache/arrow-rs/pull/2717) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Support bitwise op in kernel: or,xor,not [\#2716](https://github.com/apache/arrow-rs/pull/2716) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- Add overflow-checking variants of arithmetic scalar dyn kernels [\#2713](https://github.com/apache/arrow-rs/pull/2713) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add divide\_opt kernel which produce null values on division by zero error [\#2710](https://github.com/apache/arrow-rs/pull/2710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add DataType::is\_nested\(\) [\#2707](https://github.com/apache/arrow-rs/pull/2707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kfastov](https://github.com/kfastov)) -- Update criterion requirement from 0.3 to 0.4 [\#2706](https://github.com/apache/arrow-rs/pull/2706) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Support bitwise and operation in the kernel [\#2703](https://github.com/apache/arrow-rs/pull/2703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- Add support of sorting dictionary of other primitive arrays [\#2701](https://github.com/apache/arrow-rs/pull/2701) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Clarify docs of binary and string builders [\#2699](https://github.com/apache/arrow-rs/pull/2699) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([datapythonista](https://github.com/datapythonista)) -- Sort indices of dictionary string values [\#2698](https://github.com/apache/arrow-rs/pull/2698) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add support for empty projection in RecordBatch::project [\#2691](https://github.com/apache/arrow-rs/pull/2691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Temporarily disable Golang integration tests re-enable JS [\#2689](https://github.com/apache/arrow-rs/pull/2689) ([tustvold](https://github.com/tustvold)) -- Verify valid UTF-8 when converting byte array \(\#2205\) [\#2686](https://github.com/apache/arrow-rs/pull/2686) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support sorting dictionary encoded primitive integer arrays [\#2680](https://github.com/apache/arrow-rs/pull/2680) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Skip RowSelectors with zero rows 
[\#2678](https://github.com/apache/arrow-rs/pull/2678) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([askoa](https://github.com/askoa)) -- Faster Null Path Selection in ArrayData Equality [\#2676](https://github.com/apache/arrow-rs/pull/2676) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dhruv9vats](https://github.com/dhruv9vats)) -- Use BitIndexIterator in min\_max\_helper [\#2675](https://github.com/apache/arrow-rs/pull/2675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Support building comparator for dictionaries of primitive integer values [\#2673](https://github.com/apache/arrow-rs/pull/2673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- json feature always requires base64 feature [\#2668](https://github.com/apache/arrow-rs/pull/2668) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([eagletmt](https://github.com/eagletmt)) -- Add try\_unary, binary, try\_binary kernels ~90% faster [\#2666](https://github.com/apache/arrow-rs/pull/2666) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use downcast\_dictionary\_array in unary\_dyn [\#2663](https://github.com/apache/arrow-rs/pull/2663) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- optimize the `numeric_cast_with_error` [\#2661](https://github.com/apache/arrow-rs/pull/2661) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- ffi feature also requires layout [\#2660](https://github.com/apache/arrow-rs/pull/2660) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Change max/min string macro to generic helper function min\_max\_helper [\#2658](https://github.com/apache/arrow-rs/pull/2658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fix flaky test `test_fuzz_async_reader_selection` [\#2656](https://github.com/apache/arrow-rs/pull/2656) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) -- MINOR: Ignore flaky test test\_fuzz\_async\_reader\_selection [\#2655](https://github.com/apache/arrow-rs/pull/2655) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) -- MutableBuffer::typed\_data - shared ref access to the typed slice [\#2652](https://github.com/apache/arrow-rs/pull/2652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([medwards](https://github.com/medwards)) -- Overflow-checking variant of arithmetic scalar kernels [\#2650](https://github.com/apache/arrow-rs/pull/2650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- support `CastOption` for casting numeric [\#2649](https://github.com/apache/arrow-rs/pull/2649) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- Help LLVM vectorize comparison kernel ~50-80% faster [\#2646](https://github.com/apache/arrow-rs/pull/2646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support comparison between dictionary array and binary array [\#2645](https://github.com/apache/arrow-rs/pull/2645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Use 
`downcast_primitive_array` in arithmetic kernels [\#2640](https://github.com/apache/arrow-rs/pull/2640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fully qualifying parquet items [\#2638](https://github.com/apache/arrow-rs/pull/2638) ([dingxiangfei2009](https://github.com/dingxiangfei2009)) -- Support DictionaryArray in temporal kernels [\#2623](https://github.com/apache/arrow-rs/pull/2623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Comparable Row Format [\#2593](https://github.com/apache/arrow-rs/pull/2593) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix bug in page skipping [\#2552](https://github.com/apache/arrow-rs/pull/2552) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- add field name to parquet PrimitiveTypeBuilder error messages [\#2805](https://github.com/apache/arrow-rs/pull/2805) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([andygrove](https://github.com/andygrove)) +- Add struct equality test case \(\#514\) [\#2791](https://github.com/apache/arrow-rs/pull/2791) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move unary kernels to arrow-array \(\#2787\) [\#2789](https://github.com/apache/arrow-rs/pull/2789) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Disable test harness for string\_dictionary\_builder benchmark [\#2788](https://github.com/apache/arrow-rs/pull/2788) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add PrimitiveArray::reinterpret\_cast \(\#2785\) [\#2786](https://github.com/apache/arrow-rs/pull/2786) ([tustvold](https://github.com/tustvold)) +- Fix BinaryBuilder and StringBuilder Capacity Allocation in StructBuilder [\#2784](https://github.com/apache/arrow-rs/pull/2784) ([chunshao90](https://github.com/chunshao90)) +- Fix min/max computation for sliced arrays \(\#2779\) [\#2780](https://github.com/apache/arrow-rs/pull/2780) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix Backwards Compatible Parquet List Encodings \(\#1915\) [\#2774](https://github.com/apache/arrow-rs/pull/2774) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- MINOR: Fix clippy for rust 1.64.0 [\#2772](https://github.com/apache/arrow-rs/pull/2772) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- MINOR: Fix clippy for rust 1.64.0 [\#2771](https://github.com/apache/arrow-rs/pull/2771) ([viirya](https://github.com/viirya)) +- Add divide scalar dyn kernel which produces null for division by zero [\#2768](https://github.com/apache/arrow-rs/pull/2768) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add divide dyn kernel which produces null for division by zero [\#2764](https://github.com/apache/arrow-rs/pull/2764) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add value type check in try\_unary\_dict [\#2755](https://github.com/apache/arrow-rs/pull/2755) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([viirya](https://github.com/viirya)) +- Fix `verify_release_candidate.sh` for new arrow subcrates [\#2752](https://github.com/apache/arrow-rs/pull/2752) ([alamb](https://github.com/alamb)) +- Fix: Issue 2721 : binary function should not panic but return error w… [\#2750](https://github.com/apache/arrow-rs/pull/2750) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([aksharau](https://github.com/aksharau)) +- Speed up checked kernels for non-null data \(~1.4-5x faster\) [\#2749](https://github.com/apache/arrow-rs/pull/2749) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Add overflow-checking variants of arithmetic dyn kernels [\#2740](https://github.com/apache/arrow-rs/pull/2740) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Trim parquet row selection [\#2705](https://github.com/apache/arrow-rs/pull/2705) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 8e66bf3b763c..30fa311b243c 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-array" -version = "23.0.0" +version = "24.0.0" description = "Array abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,9 +45,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "23.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "23.0.0", path = "../arrow-schema" } -arrow-data = { version = "23.0.0", path = "../arrow-data" } +arrow-buffer = { version = "24.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "24.0.0", path = "../arrow-schema" } +arrow-data = { version = "24.0.0", path = "../arrow-data" } chrono = { version = "0.4", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.0", default-features = false } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index c5a81b30f294..8ca95b95635d 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "23.0.0" +version = "24.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index 289b1bbd0eb3..4dbc5fa3f1c1 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-data" -version = "23.0.0" +version = "24.0.0" description = "Array data abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,8 +45,8 @@ force_validate = [] [dependencies] -arrow-buffer = { version = "23.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "23.0.0", path = "../arrow-schema" } +arrow-buffer = { version = "24.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "24.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.0", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 
a6fb8751c2df..7247679213f5 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "23.0.0" +version = "24.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,7 +27,7 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow = { path = "../arrow", version = "23.0.0", default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "24.0.0", default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index e01809f3813f..4ebf98c06ce2 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "23.0.0" +arrow-flight = "24.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 38bbcf9e8bc3..3e32aab98453 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "23.0.0" +version = "24.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "23.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "24.0.0", features = ["pyarrow"] } pyo3 = { version = "0.17", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index d35a99a6d15a..d052c9061600 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-schema" -version = "23.0.0" +version = "24.0.0" description = "Defines the logical types for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index b3500f4e5b06..a7726b96ce49 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "23.0.0" +version = "24.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,10 +44,10 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "23.0.0", path = "../arrow-buffer" } -arrow-data = { version = "23.0.0", path = "../arrow-data" } -arrow-schema = { version = "23.0.0", path = "../arrow-schema" } -arrow-array = { version = "23.0.0", path = "../arrow-array" } +arrow-buffer = { version = "24.0.0", path = "../arrow-buffer" } +arrow-data = { version = "24.0.0", path = "../arrow-data" } 
+arrow-schema = { version = "24.0.0", path = "../arrow-schema" } +arrow-array = { version = "24.0.0", path = "../arrow-array" } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } indexmap = { version = "1.9", default-features = false, features = ["std"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } diff --git a/arrow/README.md b/arrow/README.md index e168d4a09eee..ade41311c6c8 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `23.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `24.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. ## Feature Flags @@ -62,7 +62,7 @@ The [Apache Arrow Status](https://arrow.apache.org/docs/status.html) page lists ## Safety -Arrow seeks to uphold the Rust Soundness Pledge as articulated eloquently [here](https://raphlinus.github.io/rust/23.0.01/18/soundness-pledge.html). Specifically: +Arrow seeks to uphold the Rust Soundness Pledge as articulated eloquently [here](https://raphlinus.github.io/rust/24.0.01/18/soundness-pledge.html). Specifically: > The intent of this crate is to be free of soundness bugs. The developers will do their best to avoid them, and welcome help in analyzing and fixing them diff --git a/dev/release/README.md b/dev/release/README.md index d418a09d070f..82cb9fbfcfac 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -71,17 +71,24 @@ git checkout -b make-release # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md +# ensure your github token is available +export ARROW_GITHUB_API_TOKEN= + # manully edit ./dev/release/update_change_log.sh to reflect the release version # create the changelog -CHANGELOG_GITHUB_TOKEN= ./dev/release/update_change_log.sh +./dev/release/update_change_log.sh + # run automated script to copy labels to issues based on referenced PRs +# (NOTE this must be done by a committer / other who has +# write access to the repository) python dev/release/label_issues.py + # review change log / edit issues and labels if needed, rerun git commit -a -m 'Create changelog' # update versions -sed -i '' -e 's/14.0.0/23.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/24.0.0/g' `find . 
-name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' ``` diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index a3af50a8a6ea..67f772d37d88 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="22.0.0" -FUTURE_RELEASE="23.0.0" +SINCE_TAG="23.0.0" +FUTURE_RELEASE="24.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" @@ -72,7 +72,7 @@ mv "${OLD_OUTPUT_PATH}".tmp "${OLD_OUTPUT_PATH}" # use exclude-tags-regex to filter out tags used for object_store # crates and only only look at tags that DO NOT begin with `object_store_` pushd "${SOURCE_TOP_DIR}" -docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$CHANGELOG_GITHUB_TOKEN" -v "$(pwd)":/usr/local/src/your-app githubchangeloggenerator/github-changelog-generator \ +docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$ARROW_GITHUB_API_TOKEN" -v "$(pwd)":/usr/local/src/your-app githubchangeloggenerator/github-changelog-generator \ --user apache \ --project arrow-rs \ --cache-file=.githubchangeloggenerator.cache \ diff --git a/integration-testing/Cargo.toml b/integration-testing/Cargo.toml index e45b812dd6a4..687e91ac4dfd 100644 --- a/integration-testing/Cargo.toml +++ b/integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests" -version = "23.0.0" +version = "24.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 9b95868f3fc1..e1593e4b9c97 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "23.0.0" +version = "24.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -41,7 +41,7 @@ zstd = { version = "0.11.1", optional = true, default-features = false } chrono = { version = "0.4", default-features = false, features = ["alloc"] } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } -arrow = { path = "../arrow", version = "23.0.0", optional = true, default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "24.0.0", optional = true, default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false, features = ["std"], optional = true } clap = { version = "3", default-features = false, features = ["std", "derive", "env"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } @@ -61,7 +61,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.11", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "23.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "24.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } [package.metadata.docs.rs] all-features = true diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index 54aa6d52f1ec..2fc6f55afbb7 100644 --- a/parquet_derive/Cargo.toml +++ 
b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "23.0.0" +version = "24.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", default-features = false } -parquet = { path = "../parquet", version = "23.0.0" } +parquet = { path = "../parquet", version = "24.0.0" } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index 4aae73dfc2ef..907a71432b1a 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "23.0.0" -parquet_derive = "23.0.0" +parquet = "24.0.0" +parquet_derive = "24.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index dd8486da2ca2..78207399fd0e 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "23.0.0" +version = "24.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "23.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "23.0.0", default-features = false } +parquet = { path = "../parquet", version = "24.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "24.0.0", default-features = false } chrono = { version="0.4.19", default-features = false, features = [ "clock" ] } From 3999c774f010457a2dd9e19ac8c41b7a43cf1285 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Sat, 1 Oct 2022 00:17:22 -0700 Subject: [PATCH 0090/1411] Support for overriding instance metadata endpoint (#2811) * Support for setting instance metadata endpoint * Actually implement * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/aws/credential.rs | 4 +-- object_store/src/aws/mod.rs | 39 ++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index d4461645f3c3..ada855b4848a 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -321,17 +321,17 @@ pub struct InstanceCredentialProvider { pub client: Client, pub retry_config: RetryConfig, pub imdsv1_fallback: bool, + pub metadata_endpoint: String, } impl InstanceCredentialProvider { async fn get_credential(&self) -> Result> { self.cache .get_or_insert_with(|| { - const METADATA_ENDPOINT: &str = "http://169.254.169.254"; instance_creds( &self.client, &self.retry_config, - METADATA_ENDPOINT, + &self.metadata_endpoint, self.imdsv1_fallback, ) .map_err(|source| crate::Error::Generic { diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index d186c7f47e36..a6026032efb2 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -72,6 +72,9 @@ pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet = /// This struct is used to maintain the URI path encoding const 
STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.remove(b'/'); +/// Default metadata endpoint +static METADATA_ENDPOINT: &str = "http://169.254.169.254"; + /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -354,6 +357,7 @@ pub struct AmazonS3Builder { retry_config: RetryConfig, allow_http: bool, imdsv1_fallback: bool, + metadata_endpoint: Option, } impl AmazonS3Builder { @@ -370,6 +374,7 @@ impl AmazonS3Builder { /// * AWS_DEFAULT_REGION -> region /// * AWS_ENDPOINT -> endpoint /// * AWS_SESSION_TOKEN -> token + /// * AWS_CONTAINER_CREDENTIALS_RELATIVE_URI -> /// # Example /// ``` /// use object_store::aws::AmazonS3Builder; @@ -401,6 +406,15 @@ impl AmazonS3Builder { builder.token = Some(token); } + // This env var is set in ECS + // https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-iam-roles.html + if let Ok(metadata_relative_uri) = + std::env::var("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") + { + builder.metadata_endpoint = + Some(format!("{}{}", METADATA_ENDPOINT, metadata_relative_uri)); + } + builder } @@ -478,6 +492,16 @@ impl AmazonS3Builder { self } + /// Set the [instance metadata endpoint](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html), + /// used primarily within AWS EC2. + /// + /// This defaults to the IPv4 endpoint: http://169.254.169.254. One can alternatively use the IPv6 + /// endpoint http://fd00:ec2::254. + pub fn with_metadata_endpoint(mut self, endpoint: impl Into) -> Self { + self.metadata_endpoint = Some(endpoint.into()); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. pub fn build(self) -> Result { @@ -536,6 +560,9 @@ impl AmazonS3Builder { client, retry_config: self.retry_config.clone(), imdsv1_fallback: self.imdsv1_fallback, + metadata_endpoint: self + .metadata_endpoint + .unwrap_or_else(|| METADATA_ENDPOINT.into()), }) } }, @@ -667,6 +694,10 @@ mod tests { let aws_session_token = env::var("AWS_SESSION_TOKEN") .unwrap_or_else(|_| "object_store:fake_session_token".into()); + let container_creds_relative_uri = + env::var("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") + .unwrap_or_else(|_| "/object_store/fake_credentials_uri".into()); + // required env::set_var("AWS_ACCESS_KEY_ID", &aws_access_key_id); env::set_var("AWS_SECRET_ACCESS_KEY", &aws_secret_access_key); @@ -675,6 +706,10 @@ mod tests { // optional env::set_var("AWS_ENDPOINT", &aws_endpoint); env::set_var("AWS_SESSION_TOKEN", &aws_session_token); + env::set_var( + "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI", + &container_creds_relative_uri, + ); let builder = AmazonS3Builder::from_env(); assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); @@ -686,6 +721,10 @@ mod tests { assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); + + let metadata_uri = + format!("{}{}", METADATA_ENDPOINT, container_creds_relative_uri); + assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); } #[tokio::test] From 0052d2572ba9d70d6832221d48feab0286d0312a Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Sat, 1 Oct 2022 11:09:00 -0400 Subject: [PATCH 0091/1411] Handle S3 virtual host request type (#2782) * include s2 virtual host request type * formatting changes * fix issues highlighted in PR comments * initialize bucket_endpoint * some imporments on endpoint initialization * fix issue in initalizing bucket_endpoint * incorporating PR comments * 
incorporate PR comments * fix typo in comment Co-authored-by: askoa --- object_store/src/aws/client.rs | 12 +++----- object_store/src/aws/mod.rs | 54 +++++++++++++++++++++++++++++++--- 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 5ec9390ec898..29621626c8b6 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -197,6 +197,7 @@ pub struct S3Config { pub region: String, pub endpoint: String, pub bucket: String, + pub bucket_endpoint: String, pub credentials: CredentialProvider, pub retry_config: RetryConfig, pub allow_http: bool, @@ -204,7 +205,7 @@ pub struct S3Config { impl S3Config { fn path_url(&self, path: &Path) -> String { - format!("{}/{}/{}", self.endpoint, self.bucket, encode_path(path)) + format!("{}/{}", self.bucket_endpoint, encode_path(path)) } } @@ -342,7 +343,7 @@ impl S3Client { token: Option<&str>, ) -> Result<(ListResult, Option)> { let credential = self.get_credential().await?; - let url = format!("{}/{}", self.config.endpoint, self.config.bucket); + let url = self.config.bucket_endpoint.clone(); let mut query = Vec::with_capacity(4); @@ -398,12 +399,7 @@ impl S3Client { pub async fn create_multipart(&self, location: &Path) -> Result { let credential = self.get_credential().await?; - let url = format!( - "{}/{}/{}?uploads=", - self.config.endpoint, - self.config.bucket, - encode_path(location) - ); + let url = format!("{}?uploads=", self.config.path_url(location),); let response = self .client diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index a6026032efb2..e3510b3e2ea3 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -357,6 +357,7 @@ pub struct AmazonS3Builder { retry_config: RetryConfig, allow_http: bool, imdsv1_fallback: bool, + virtual_hosted_style_request: bool, metadata_endpoint: Option, } @@ -446,10 +447,13 @@ impl AmazonS3Builder { } /// Sets the endpoint for communicating with AWS S3. Default value - /// is based on region. + /// is based on region. The `endpoint` field should be consistent with + /// the field `virtual_hosted_style_request'. /// /// For example, this might be set to `"http://localhost:4566:` /// for testing against a localstack instance. + /// If `virtual_hosted_style_request` is set to true then `endpoint` + /// should have bucket name included. pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { self.endpoint = Some(endpoint.into()); self @@ -469,6 +473,23 @@ impl AmazonS3Builder { self } + /// Sets if virtual hosted style request has to be used. + /// If `virtual_hosted_style_request` is : + /// * false (default): Path style request is used + /// * true: Virtual hosted style request is used + /// + /// If the `endpoint` is provided then it should be + /// consistent with `virtual_hosted_style_request`. + /// i.e. if `virtual_hosted_style_request` is set to true + /// then `endpoint` should have bucket name included. 
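For orientation, the metadata-endpoint override from the previous patch and the virtual-hosted-style flag documented above compose on the same builder. The following is an editorial sketch, not part of the diff; the region, bucket, and mock IMDS address are placeholders:

```rust
use object_store::aws::AmazonS3Builder;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let s3 = AmazonS3Builder::new()
        .with_region("us-east-1")      // placeholder region
        .with_bucket_name("my-bucket") // placeholder bucket
        // With virtual hosted style requests the endpoint must already
        // contain the bucket name, as the doc comment above requires.
        .with_endpoint("https://my-bucket.s3.us-east-1.amazonaws.com")
        .with_virtual_hosted_style_request(true)
        // No static credentials are supplied, so instance credentials are
        // used; point the lookup at a local mock IMDS (hypothetical port).
        .with_metadata_endpoint("http://localhost:1338")
        .build()?;
    let _ = s3;
    Ok(())
}
```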
+ pub fn with_virtual_hosted_style_request( + mut self, + virtual_hosted_style_request: bool, + ) -> Self { + self.virtual_hosted_style_request = virtual_hosted_style_request; + self + } + /// Set the retry configuration pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { self.retry_config = retry_config; @@ -568,14 +589,29 @@ impl AmazonS3Builder { }, }; - let endpoint = self - .endpoint - .unwrap_or_else(|| format!("https://s3.{}.amazonaws.com", region)); + let endpoint: String; + let bucket_endpoint: String; + + //If `endpoint` is provided then its assumed to be consistent with + // `virutal_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then + // `endpoint` should have bucket name included. + if self.virtual_hosted_style_request { + endpoint = self.endpoint.unwrap_or_else(|| { + format!("https://{}.s3.{}.amazonaws.com", bucket, region) + }); + bucket_endpoint = endpoint.clone(); + } else { + endpoint = self + .endpoint + .unwrap_or_else(|| format!("https://s3.{}.amazonaws.com", region)); + bucket_endpoint = format!("{}/{}", endpoint, bucket); + } let config = S3Config { region, endpoint, bucket, + bucket_endpoint, credentials, retry_config: self.retry_config, allow_http: self.allow_http, @@ -674,6 +710,16 @@ mod tests { config }; + let config = if let Some(virtual_hosted_style_request) = + env::var("OBJECT_STORE_VIRTUAL_HOSTED_STYLE_REQUEST").ok() + { + config.with_virtual_hosted_style_request( + virtual_hosted_style_request.trim().parse().unwrap(), + ) + } else { + config + }; + config } }}; From 41dd12d5b30357aad2ab2a818f58f2e1b0014e46 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 2 Oct 2022 12:57:00 +0100 Subject: [PATCH 0092/1411] Add DictionaryArray::with_values (#2797) (#2798) --- arrow-array/src/array/dictionary_array.rs | 54 +++++++++++++++++++++++ arrow/src/compute/kernels/arity.rs | 14 ++---- 2 files changed, 58 insertions(+), 10 deletions(-) diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 69a7b1961ea9..96e91f729ab1 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -337,6 +337,60 @@ impl DictionaryArray { values, }) } + + /// Returns a new dictionary with the same keys as the current instance + /// but with a different set of dictionary values + /// + /// This can be used to perform an operation on the values of a dictionary + /// + /// # Panics + /// + /// Panics if `values` has a length less than the current values + /// + /// ``` + /// use arrow_array::builder::PrimitiveDictionaryBuilder; + /// use arrow_array::{Int8Array, Int64Array, ArrayAccessor}; + /// use arrow_array::types::{Int32Type, Int8Type}; + /// + /// // Construct a Dict(Int32, Int8) + /// let mut builder = PrimitiveDictionaryBuilder::::with_capacity(2, 200); + /// for i in 0..100 { + /// builder.append(i % 2).unwrap(); + /// } + /// + /// let dictionary = builder.finish(); + /// + /// // Perform a widening cast of dictionary values + /// let typed_dictionary = dictionary.downcast_dict::().unwrap(); + /// let values: Int64Array = typed_dictionary.values().unary(|x| x as i64); + /// + /// // Create a Dict(Int32, + /// let new = dictionary.with_values(&values); + /// + /// // Verify values are as expected + /// let new_typed = new.downcast_dict::().unwrap(); + /// for i in 0..100 { + /// assert_eq!(new_typed.value(i), (i % 2) as i64) + /// } + /// ``` + /// + pub fn with_values(&self, values: &dyn Array) 
-> Self { + assert!(values.len() >= self.values.len()); + + let builder = self + .data + .clone() + .into_builder() + .data_type(DataType::Dictionary( + Box::new(K::DATA_TYPE), + Box::new(values.data_type().clone()), + )) + .child_data(vec![values.data().clone()]); + + // SAFETY: + // Offsets were valid before and verified length is greater than or equal + Self::from(unsafe { builder.build_unchecked() }) + } } /// Constructs a `DictionaryArray` from an array data reference. diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index cb5184c0e9d4..11ae5a204c5c 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -76,11 +76,8 @@ where F: Fn(T::Native) -> T::Native, { let dict_values = array.values().as_any().downcast_ref().unwrap(); - let values = unary::(dict_values, op).into_data(); - let data = array.data().clone().into_builder().child_data(vec![values]); - - let new_dict: DictionaryArray = unsafe { data.build_unchecked() }.into(); - Ok(Arc::new(new_dict)) + let values = unary::(dict_values, op); + Ok(Arc::new(array.with_values(&values))) } /// A helper function that applies a fallible unary function to a dictionary array with primitive value type. @@ -98,11 +95,8 @@ where } let dict_values = array.values().as_any().downcast_ref().unwrap(); - let values = try_unary::(dict_values, op)?.into_data(); - let data = array.data().clone().into_builder().child_data(vec![values]); - - let new_dict: DictionaryArray = unsafe { data.build_unchecked() }.into(); - Ok(Arc::new(new_dict)) + let values = try_unary::(dict_values, op)?; + Ok(Arc::new(array.with_values(&values))) } /// Applies an infallible unary function to an array with primitive values. From 9b590819baffb117383dfd026f92987f4f9f6950 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Mon, 3 Oct 2022 05:52:38 -0700 Subject: [PATCH 0093/1411] Allow Configuring non-TLS HTTP Connections in AmazonS3Builder::from_env (#2807) * Allow HTTP * Update docs --- object_store/src/aws/mod.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index e3510b3e2ea3..c08a6353fa82 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -376,6 +376,7 @@ impl AmazonS3Builder { /// * AWS_ENDPOINT -> endpoint /// * AWS_SESSION_TOKEN -> token /// * AWS_CONTAINER_CREDENTIALS_RELATIVE_URI -> + /// * AWS_ALLOW_HTTP -> set to "true" to permit HTTP connections without TLS /// # Example /// ``` /// use object_store::aws::AmazonS3Builder; @@ -416,6 +417,10 @@ impl AmazonS3Builder { Some(format!("{}{}", METADATA_ENDPOINT, metadata_relative_uri)); } + if let Ok(text) = std::env::var("AWS_ALLOW_HTTP") { + builder.allow_http = text == "true"; + } + builder } From 4df1f3c62f6d089d8cf02892168b9179e4c8c6fd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 3 Oct 2022 13:52:52 +0100 Subject: [PATCH 0094/1411] Add i256 (#2637) (#2781) * Add i256 (#2637) * RAT * Fix doc comments * Store as parts * Custom multiply implementation * Make from_parts public * Document mulx * Remove branch from to_i128 * Clippy --- arrow-buffer/Cargo.toml | 1 + arrow-buffer/src/bigint.rs | 369 +++++++++++++++++++++++++++++++++++++ arrow-buffer/src/lib.rs | 2 + arrow-buffer/src/native.rs | 47 +++++ 4 files changed, 419 insertions(+) create mode 100644 arrow-buffer/src/bigint.rs diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index 8ca95b95635d..af0bd9861dad 100644 --- 
a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -38,6 +38,7 @@ path = "src/lib.rs" bench = false [dependencies] +num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.0", default-features = false } [dev-dependencies] diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs new file mode 100644 index 000000000000..a08d280ca883 --- /dev/null +++ b/arrow-buffer/src/bigint.rs @@ -0,0 +1,369 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use num::BigInt; +use std::cmp::Ordering; + +/// A signed 256-bit integer +#[allow(non_camel_case_types)] +#[derive(Copy, Clone, Default, Eq, PartialEq, Hash)] +pub struct i256 { + low: u128, + high: i128, +} + +impl std::fmt::Debug for i256 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self) + } +} + +impl std::fmt::Display for i256 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", BigInt::from_signed_bytes_le(&self.to_le_bytes())) + } +} + +impl PartialOrd for i256 { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for i256 { + fn cmp(&self, other: &Self) -> Ordering { + // This is 25x faster than using a variable length encoding such + // as BigInt as it avoids allocation and branching + self.high.cmp(&other.high).then(self.low.cmp(&other.low)) + } +} + +impl i256 { + /// The additive identity for this integer type, i.e. `0`. + pub const ZERO: Self = i256 { low: 0, high: 0 }; + + /// The multiplicative identity for this integer type, i.e. `1`. + pub const ONE: Self = i256 { low: 1, high: 0 }; + + /// The multiplicative inverse for this integer type, i.e. `-1`. + pub const MINUS_ONE: Self = i256 { + low: u128::MAX, + high: -1, + }; + + /// Create an integer value from its representation as a byte array in little-endian. 
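Before the method bodies that follow, it may help to see the byte-level contract the type guarantees. This is an illustrative sketch rather than code from the patch; it assumes `i256` is consumed through the `arrow_buffer` crate root, per the `pub use bigint::i256` re-export later in this commit:

```rust
use arrow_buffer::i256;

fn main() {
    // -1 is all ones across the full 256 bits (two's complement),
    // i.e. low == u128::MAX and high == -1_i128
    assert_eq!(i256::MINUS_ONE.to_le_bytes(), [0xFF; 32]);
    assert_eq!(i256::from_le_bytes([0xFF; 32]), i256::MINUS_ONE);

    // from_parts/to_parts round-trip the (low, high) representation
    let v = i256::from_parts(42, -7);
    assert_eq!(v.to_parts(), (42, -7));

    // Values whose high part is pure sign extension convert back to i128
    assert_eq!(i256::from_parts(42, 0).to_i128(), Some(42));
    assert_eq!(i256::MINUS_ONE.to_i128(), Some(-1));
}
```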
+ #[inline] + pub fn from_le_bytes(b: [u8; 32]) -> Self { + Self { + high: i128::from_le_bytes(b[16..32].try_into().unwrap()), + low: u128::from_le_bytes(b[0..16].try_into().unwrap()), + } + } + + /// Create an i256 from the provided low u128 and high i128 + #[inline] + pub fn from_parts(low: u128, high: i128) -> Self { + Self { low, high } + } + + /// Returns this `i256` as a low u128 and high i128 + pub fn to_parts(self) -> (u128, i128) { + (self.low, self.high) + } + + /// Converts this `i256` into an `i128` returning `None` if this would result + /// in truncation/overflow + pub fn to_i128(self) -> Option { + let as_i128 = self.low as i128; + + let high_negative = self.high < 0; + let low_negative = as_i128 < 0; + let high_valid = self.high == -1 || self.high == 0; + + (high_negative == low_negative && high_valid).then_some(self.low as i128) + } + + /// Return the memory representation of this integer as a byte array in little-endian byte order. + #[inline] + pub fn to_le_bytes(self) -> [u8; 32] { + let mut t = [0; 32]; + let t_low: &mut [u8; 16] = (&mut t[0..16]).try_into().unwrap(); + *t_low = self.low.to_le_bytes(); + let t_high: &mut [u8; 16] = (&mut t[16..32]).try_into().unwrap(); + *t_high = self.high.to_le_bytes(); + t + } + + /// Create an i256 from the provided [`BigInt`] returning a bool indicating + /// if overflow occurred + fn from_bigint_with_overflow(v: BigInt) -> (Self, bool) { + let v_bytes = v.to_signed_bytes_le(); + match v_bytes.len().cmp(&32) { + Ordering::Less => { + let mut bytes = if num::Signed::is_negative(&v) { + [255_u8; 32] + } else { + [0; 32] + }; + bytes[0..v_bytes.len()].copy_from_slice(&v_bytes[..v_bytes.len()]); + (Self::from_le_bytes(bytes), false) + } + Ordering::Equal => (Self::from_le_bytes(v_bytes.try_into().unwrap()), false), + Ordering::Greater => { + (Self::from_le_bytes(v_bytes[..32].try_into().unwrap()), true) + } + } + } + + /// Performs wrapping addition + #[inline] + pub fn wrapping_add(self, other: Self) -> Self { + let (low, carry) = self.low.overflowing_add(other.low); + let high = self.high.wrapping_add(other.high).wrapping_add(carry as _); + Self { low, high } + } + + /// Performs checked addition + #[inline] + pub fn checked_add(self, other: Self) -> Option { + let (low, carry) = self.low.overflowing_add(other.low); + let high = self.high.checked_add(other.high)?.checked_add(carry as _)?; + Some(Self { low, high }) + } + + /// Performs wrapping subtraction + #[inline] + pub fn wrapping_sub(self, other: Self) -> Self { + let (low, carry) = self.low.overflowing_sub(other.low); + let high = self.high.wrapping_sub(other.high).wrapping_sub(carry as _); + Self { low, high } + } + + /// Performs checked subtraction + #[inline] + pub fn checked_sub(self, other: Self) -> Option { + let (low, carry) = self.low.overflowing_sub(other.low); + let high = self.high.checked_sub(other.high)?.checked_sub(carry as _)?; + Some(Self { low, high }) + } + + /// Performs wrapping multiplication + #[inline] + pub fn wrapping_mul(self, other: Self) -> Self { + let (low, high) = mulx(self.low, other.low); + + // Compute the high multiples, only impacting the high 128-bits + let hl = self.high.wrapping_mul(other.low as i128); + let lh = (self.low as i128).wrapping_mul(other.high); + + Self { + low, + high: (high as i128).wrapping_add(hl).wrapping_add(lh), + } + } + + /// Performs checked multiplication + #[inline] + pub fn checked_mul(self, other: Self) -> Option { + let (low, high) = mulx(self.low, other.low); + + // Compute the high multiples, only impacting 
the high 128-bits + let hl = self.high.checked_mul(other.low as i128)?; + let lh = (self.low as i128).checked_mul(other.high)?; + + Some(Self { + low, + high: (high as i128).checked_add(hl)?.checked_add(lh)?, + }) + } + + /// Performs wrapping division + #[inline] + pub fn wrapping_div(self, other: Self) -> Self { + let l = BigInt::from_signed_bytes_le(&self.to_le_bytes()); + let r = BigInt::from_signed_bytes_le(&other.to_le_bytes()); + Self::from_bigint_with_overflow(l / r).0 + } + + /// Performs checked division + #[inline] + pub fn checked_div(self, other: Self) -> Option { + let l = BigInt::from_signed_bytes_le(&self.to_le_bytes()); + let r = BigInt::from_signed_bytes_le(&other.to_le_bytes()); + let (val, overflow) = Self::from_bigint_with_overflow(l / r); + (!overflow).then_some(val) + } + + /// Performs wrapping remainder + #[inline] + pub fn wrapping_rem(self, other: Self) -> Self { + let l = BigInt::from_signed_bytes_le(&self.to_le_bytes()); + let r = BigInt::from_signed_bytes_le(&other.to_le_bytes()); + Self::from_bigint_with_overflow(l % r).0 + } + + /// Performs checked remainder + #[inline] + pub fn checked_rem(self, other: Self) -> Option { + if other == Self::ZERO { + return None; + } + + let l = BigInt::from_signed_bytes_le(&self.to_le_bytes()); + let r = BigInt::from_signed_bytes_le(&other.to_le_bytes()); + let (val, overflow) = Self::from_bigint_with_overflow(l % r); + (!overflow).then_some(val) + } +} + +/// Performs an unsigned multiplication of `a * b` returning a tuple of +/// `(low, high)` where `low` contains the lower 128-bits of the result +/// and `high` the higher 128-bits +/// +/// This mirrors the x86 mulx instruction but for 128-bit types +#[inline] +fn mulx(a: u128, b: u128) -> (u128, u128) { + let split = |a: u128| (a & (u64::MAX as u128), a >> 64); + + const MASK: u128 = u64::MAX as _; + + let (a_low, a_high) = split(a); + let (b_low, b_high) = split(b); + + // Carry stores the upper 64-bits of low and lower 64-bits of high + let (mut low, mut carry) = split(a_low * b_low); + carry += a_high * b_low; + + // Update low and high with corresponding parts of carry + low += carry << 64; + let mut high = carry >> 64; + + // Update carry with overflow from low + carry = low >> 64; + low &= MASK; + + // Perform multiply including overflow from low + carry += b_high * a_low; + + // Update low and high with values from carry + low += carry << 64; + high += carry >> 64; + + // Perform 4th multiplication + high += a_high * b_high; + + (low, high) +} + +#[cfg(test)] +mod tests { + use super::*; + use num::{BigInt, FromPrimitive, ToPrimitive}; + use rand::{thread_rng, Rng}; + + #[test] + fn test_signed_cmp() { + let a = i256::from_parts(i128::MAX as u128, 12); + let b = i256::from_parts(i128::MIN as u128, 12); + assert!(a < b); + + let a = i256::from_parts(i128::MAX as u128, 12); + let b = i256::from_parts(i128::MIN as u128, -12); + assert!(a > b); + } + + #[test] + fn test_to_i128() { + let vals = [ + BigInt::from_i128(-1).unwrap(), + BigInt::from_i128(i128::MAX).unwrap(), + BigInt::from_i128(i128::MIN).unwrap(), + BigInt::from_u128(u128::MIN).unwrap(), + BigInt::from_u128(u128::MAX).unwrap(), + ]; + + for v in vals { + let (t, overflow) = i256::from_bigint_with_overflow(v.clone()); + assert!(!overflow); + assert_eq!(t.to_i128(), v.to_i128(), "{} vs {}", v, t); + } + } + + #[test] + fn test_i256() { + let mut rng = thread_rng(); + + for _ in 0..1000 { + let mut l = [0_u8; 32]; + let len = rng.gen_range(0..32); + l.iter_mut().take(len).for_each(|x| *x = rng.gen()); + + 
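+ // Build the right-hand operand the same way: only a random prefix of the 32 bytes is filled, so the untouched high bytes stay zero and the sampled values are non-negative and skew toward small magnitudes.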
let mut r = [0_u8; 32]; + let len = rng.gen_range(0..32); + r.iter_mut().take(len).for_each(|x| *x = rng.gen()); + + let il = i256::from_le_bytes(l); + let ir = i256::from_le_bytes(r); + + let bl = BigInt::from_signed_bytes_le(&l); + let br = BigInt::from_signed_bytes_le(&r); + + // Comparison + assert_eq!(il.cmp(&ir), bl.cmp(&br), "{} cmp {}", bl, br); + + // To i128 + assert_eq!(il.to_i128(), bl.to_i128(), "{}", bl); + assert_eq!(ir.to_i128(), br.to_i128(), "{}", br); + + // Addition + let actual = il.wrapping_add(ir); + let (expected, overflow) = + i256::from_bigint_with_overflow(bl.clone() + br.clone()); + assert_eq!(actual, expected); + + let checked = il.checked_add(ir); + match overflow { + true => assert!(checked.is_none()), + false => assert_eq!(checked.unwrap(), actual), + } + + // Subtraction + let actual = il.wrapping_sub(ir); + let (expected, overflow) = + i256::from_bigint_with_overflow(bl.clone() - br.clone()); + assert_eq!(actual.to_string(), expected.to_string()); + + let checked = il.checked_sub(ir); + match overflow { + true => assert!(checked.is_none()), + false => assert_eq!(checked.unwrap(), actual), + } + + // Multiplication + let actual = il.wrapping_mul(ir); + let (expected, overflow) = + i256::from_bigint_with_overflow(bl.clone() * br.clone()); + assert_eq!(actual.to_string(), expected.to_string()); + + let checked = il.checked_mul(ir); + match overflow { + true => assert!(checked.is_none()), + false => assert_eq!(checked.unwrap(), actual), + } + } + } +} diff --git a/arrow-buffer/src/lib.rs b/arrow-buffer/src/lib.rs index 74d2bd5ec869..13d44e4d57ff 100644 --- a/arrow-buffer/src/lib.rs +++ b/arrow-buffer/src/lib.rs @@ -21,8 +21,10 @@ pub mod alloc; pub mod buffer; pub use buffer::{Buffer, MutableBuffer}; +mod bigint; mod bytes; mod native; +pub use bigint::i256; pub use native::*; mod util; diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs index d371e7e110f2..6ac11a16f4d3 100644 --- a/arrow-buffer/src/native.rs +++ b/arrow-buffer/src/native.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::i256; use half::f16; mod private { @@ -169,6 +170,29 @@ native_float!(f16, self, self.to_f32() as _); native_float!(f32, self, self as _); native_float!(f64, self, self as _); +impl private::Sealed for i256 {} +impl ArrowNativeType for i256 { + fn from_usize(u: usize) -> Option { + Some(Self::from_parts(u as u128, 0)) + } + + fn as_usize(self) -> usize { + self.to_parts().0 as usize + } + + fn to_usize(self) -> Option { + let (low, high) = self.to_parts(); + if high != 0 { + return None; + } + low.try_into().ok() + } + + fn to_isize(self) -> Option { + self.to_i128()?.try_into().ok() + } +} + /// Allows conversion from supported Arrow types to a byte slice. 
pub trait ToByteSlice { /// Converts this instance into a byte slice @@ -192,3 +216,26 @@ impl ToByteSlice for T { unsafe { std::slice::from_raw_parts(raw_ptr, std::mem::size_of::()) } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_i256() { + let a = i256::from_parts(0, 0); + assert_eq!(a.as_usize(), 0); + assert_eq!(a.to_usize().unwrap(), 0); + assert_eq!(a.to_isize().unwrap(), 0); + + let a = i256::from_parts(0, -1); + assert_eq!(a.as_usize(), 0); + assert!(a.to_usize().is_none()); + assert!(a.to_usize().is_none()); + + let a = i256::from_parts(u128::MAX, -1); + assert_eq!(a.as_usize(), usize::MAX); + assert!(a.to_usize().is_none()); + assert_eq!(a.to_isize().unwrap(), -1); + } +} From 15f8cfd03651bfc169b910c440e1560a244272ec Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Mon, 3 Oct 2022 20:54:01 +0800 Subject: [PATCH 0095/1411] fix timestamp parsing while no explicit timezone given (#2814) * fix timestamp parsing * add test cases --- arrow/src/compute/kernels/cast_utils.rs | 98 ++++++++++++------------- arrow/src/csv/reader.rs | 38 +++------- 2 files changed, 57 insertions(+), 79 deletions(-) diff --git a/arrow/src/compute/kernels/cast_utils.rs b/arrow/src/compute/kernels/cast_utils.rs index e43961b4ab8a..718ea5ac64a3 100644 --- a/arrow/src/compute/kernels/cast_utils.rs +++ b/arrow/src/compute/kernels/cast_utils.rs @@ -16,7 +16,7 @@ // under the License. use crate::error::{ArrowError, Result}; -use chrono::{prelude::*, LocalResult}; +use chrono::prelude::*; /// Accepts a string in RFC3339 / ISO8601 standard format and some /// variants and converts it to a nanosecond precision timestamp. @@ -96,27 +96,27 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { // without a timezone specifier as a local time, using T as a separator // Example: 2020-09-08T13:42:29.190855 if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f") { - return naive_datetime_to_timestamp(s, ts); + return Ok(ts.timestamp_nanos()); } // without a timezone specifier as a local time, using T as a // separator, no fractional seconds // Example: 2020-09-08T13:42:29 if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") { - return naive_datetime_to_timestamp(s, ts); + return Ok(ts.timestamp_nanos()); } // without a timezone specifier as a local time, using ' ' as a separator // Example: 2020-09-08 13:42:29.190855 if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f") { - return naive_datetime_to_timestamp(s, ts); + return Ok(ts.timestamp_nanos()); } // without a timezone specifier as a local time, using ' ' as a // separator, no fractional seconds // Example: 2020-09-08 13:42:29 if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") { - return naive_datetime_to_timestamp(s, ts); + return Ok(ts.timestamp_nanos()); } // Note we don't pass along the error message from the underlying @@ -130,30 +130,6 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { ))) } -/// Converts the naive datetime (which has no specific timezone) to a -/// nanosecond epoch timestamp relative to UTC. 
-fn naive_datetime_to_timestamp(s: &str, datetime: NaiveDateTime) -> Result { - let l = Local {}; - - match l.from_local_datetime(&datetime) { - LocalResult::None => Err(ArrowError::CastError(format!( - "Error parsing '{}' as timestamp: local time representation is invalid", - s - ))), - LocalResult::Single(local_datetime) => { - Ok(local_datetime.with_timezone(&Utc).timestamp_nanos()) - } - // Ambiguous times can happen if the timestamp is exactly when - // a daylight savings time transition occurs, for example, and - // so the datetime could validly be said to be in two - // potential offsets. However, since we are about to convert - // to UTC anyways, we can pick one arbitrarily - LocalResult::Ambiguous(local_datetime, _) => { - Ok(local_datetime.with_timezone(&Utc).timestamp_nanos()) - } - } -} - #[cfg(test)] mod tests { use super::*; @@ -202,23 +178,6 @@ mod tests { Ok(()) } - /// Interprets a naive_datetime (with no explicit timezone offset) - /// using the local timezone and returns the timestamp in UTC (0 - /// offset) - fn naive_datetime_to_timestamp(naive_datetime: &NaiveDateTime) -> i64 { - // Note: Use chrono APIs that are different than - // naive_datetime_to_timestamp to compute the utc offset to - // try and double check the logic - let utc_offset_secs = match Local.offset_from_local_datetime(naive_datetime) { - LocalResult::Single(local_offset) => { - local_offset.fix().local_minus_utc() as i64 - } - _ => panic!("Unexpected failure converting to local datetime"), - }; - let utc_offset_nanos = utc_offset_secs * 1_000_000_000; - naive_datetime.timestamp_nanos() - utc_offset_nanos - } - #[test] #[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function: mktime fn string_to_timestamp_no_timezone() -> Result<()> { @@ -232,12 +191,12 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( - naive_datetime_to_timestamp(&naive_datetime), + naive_datetime.timestamp_nanos(), parse_timestamp("2020-09-08T13:42:29.190855")? ); assert_eq!( - naive_datetime_to_timestamp(&naive_datetime), + naive_datetime.timestamp_nanos(), parse_timestamp("2020-09-08 13:42:29.190855")? ); @@ -250,12 +209,12 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( - naive_datetime_to_timestamp(&naive_datetime_whole_secs), + naive_datetime_whole_secs.timestamp_nanos(), parse_timestamp("2020-09-08T13:42:29")? ); assert_eq!( - naive_datetime_to_timestamp(&naive_datetime_whole_secs), + naive_datetime_whole_secs.timestamp_nanos(), parse_timestamp("2020-09-08 13:42:29")? ); @@ -297,4 +256,43 @@ mod tests { } } } + + #[test] + fn string_without_timezone_to_timestamp() -> Result<()> { + // string without timezone should always output the same regardless the local or session timezone + + let naive_datetime = NaiveDateTime::new( + NaiveDate::from_ymd(2020, 9, 8), + NaiveTime::from_hms_nano(13, 42, 29, 190855000), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29.190855")? + ); + + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29.190855")? + ); + + let naive_datetime = NaiveDateTime::new( + NaiveDate::from_ymd(2020, 9, 8), + NaiveTime::from_hms_nano(13, 42, 29, 0), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29")? + ); + + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29")? 
+ ); + + Ok(()) + } } diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index a8c71a8e0191..0c7536053ffc 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -1136,7 +1136,7 @@ mod tests { use crate::array::*; use crate::compute::cast; use crate::datatypes::Field; - use chrono::{prelude::*, LocalResult}; + use chrono::prelude::*; #[test] fn test_csv() { @@ -1696,26 +1696,6 @@ mod tests { } } - /// Interprets a naive_datetime (with no explicit timezone offset) - /// using the local timezone and returns the timestamp in UTC (0 - /// offset) - fn naive_datetime_to_timestamp(naive_datetime: &NaiveDateTime) -> i64 { - // Note: Use chrono APIs that are different than - // naive_datetime_to_timestamp to compute the utc offset to - // try and double check the logic - let utc_offset_secs = match Local.offset_from_local_datetime(naive_datetime) { - LocalResult::Single(local_offset) => { - local_offset.fix().local_minus_utc() as i64 - } - _ => panic!( - "Unexpected failure converting {} to local datetime", - naive_datetime - ), - }; - let utc_offset_nanos = utc_offset_secs * 1_000_000_000; - naive_datetime.timestamp_nanos() - utc_offset_nanos - } - #[test] fn test_parse_timestamp_microseconds() { assert_eq!( @@ -1728,11 +1708,11 @@ mod tests { ); assert_eq!( parse_item::("2018-11-13T17:11:10").unwrap(), - naive_datetime_to_timestamp(&naive_datetime) / 1000 + naive_datetime.timestamp_nanos() / 1000 ); assert_eq!( parse_item::("2018-11-13 17:11:10").unwrap(), - naive_datetime_to_timestamp(&naive_datetime) / 1000 + naive_datetime.timestamp_nanos() / 1000 ); let naive_datetime = NaiveDateTime::new( NaiveDate::from_ymd(2018, 11, 13), @@ -1740,7 +1720,7 @@ mod tests { ); assert_eq!( parse_item::("2018-11-13T17:11:10.011").unwrap(), - naive_datetime_to_timestamp(&naive_datetime) / 1000 + naive_datetime.timestamp_nanos() / 1000 ); let naive_datetime = NaiveDateTime::new( NaiveDate::from_ymd(1900, 2, 28), @@ -1748,7 +1728,7 @@ mod tests { ); assert_eq!( parse_item::("1900-02-28T12:34:56").unwrap(), - naive_datetime_to_timestamp(&naive_datetime) / 1000 + naive_datetime.timestamp_nanos() / 1000 ); } @@ -1764,11 +1744,11 @@ mod tests { ); assert_eq!( parse_item::("2018-11-13T17:11:10").unwrap(), - naive_datetime_to_timestamp(&naive_datetime) + naive_datetime.timestamp_nanos() ); assert_eq!( parse_item::("2018-11-13 17:11:10").unwrap(), - naive_datetime_to_timestamp(&naive_datetime) + naive_datetime.timestamp_nanos() ); let naive_datetime = NaiveDateTime::new( NaiveDate::from_ymd(2018, 11, 13), @@ -1776,7 +1756,7 @@ mod tests { ); assert_eq!( parse_item::("2018-11-13T17:11:10.011").unwrap(), - naive_datetime_to_timestamp(&naive_datetime) + naive_datetime.timestamp_nanos() ); let naive_datetime = NaiveDateTime::new( NaiveDate::from_ymd(1900, 2, 28), @@ -1784,7 +1764,7 @@ mod tests { ); assert_eq!( parse_item::("1900-02-28T12:34:56").unwrap(), - naive_datetime_to_timestamp(&naive_datetime) + naive_datetime.timestamp_nanos() ); } From 70054fd9fa8d3879c1a73c744f3527d2ecb3561a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 3 Oct 2022 14:49:53 +0100 Subject: [PATCH 0096/1411] Add OrderPreservingInterner::lookup (#2677) (#2815) --- arrow/src/row/interner.rs | 52 +++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/arrow/src/row/interner.rs b/arrow/src/row/interner.rs index 77edb97e8d1f..e48670984c98 100644 --- a/arrow/src/row/interner.rs +++ b/arrow/src/row/interner.rs @@ -129,6 
+129,40 @@ impl OrderPreservingInterner { pub fn normalized_key(&self, key: Interned) -> &[u8] { &self.keys[key] } + + /// Converts a normalized key returned by [`Self::normalized_key`] to [`Interned`] + /// returning `None` if it cannot be found + #[allow(dead_code)] + pub fn lookup(&self, normalized_key: &[u8]) -> Option { + let len = normalized_key.len(); + + let mut current_slot: Option<&Slot> = None; + if len > 2 { + for v in normalized_key.iter().take(len - 2) { + let slot_idx = v.checked_sub(1)?; + current_slot = Some(match current_slot { + None => &self.bucket.slots[slot_idx as usize], + Some(b) => &b.child.as_ref()?.slots[slot_idx as usize], + }); + } + } + + if len > 1 { + let slot_idx = normalized_key[len - 2].checked_sub(2)?; + current_slot = Some(match current_slot { + None => &self.bucket.slots[slot_idx as usize], + Some(b) => &b.child.as_ref()?.slots[slot_idx as usize], + }); + } + + current_slot.as_ref()?.value + } + + /// Returns the interned value for a given [`Interned`] + #[allow(dead_code)] + pub fn value(&self, key: Interned) -> &[u8] { + self.values.index(key) + } } /// A buffer of `[u8]` indexed by `[Interned]` @@ -393,13 +427,21 @@ mod tests { .map(Option::unwrap) .collect(); - let interned: Vec<_> = interned - .into_iter() - .map(|x| interner.normalized_key(x)) + for (value, interned) in values.iter().zip(&interned) { + assert_eq!(interner.value(*interned), &value.to_be_bytes()); + } + + let normalized_keys: Vec<_> = interned + .iter() + .map(|x| interner.normalized_key(*x)) .collect(); - for (i, a) in interned.iter().enumerate() { - for (j, b) in interned.iter().enumerate() { + for (interned, normalized) in interned.iter().zip(&normalized_keys) { + assert_eq!(*interned, interner.lookup(normalized).unwrap()); + } + + for (i, a) in normalized_keys.iter().enumerate() { + for (j, b) in normalized_keys.iter().enumerate() { let interned_cmp = a.cmp(b); let values_cmp = values[i].cmp(&values[j]); assert_eq!( From 931c6fcc45f4aca441191f007cf31daa19eb66d2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 3 Oct 2022 14:53:53 +0100 Subject: [PATCH 0097/1411] Simplify FixedLengthEncoding (#2812) --- arrow/src/row/fixed.rs | 56 ++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/arrow/src/row/fixed.rs b/arrow/src/row/fixed.rs index 78108274241b..04b9a30ecad8 100644 --- a/arrow/src/row/fixed.rs +++ b/arrow/src/row/fixed.rs @@ -24,13 +24,17 @@ use half::f16; /// Encodes a value of a particular fixed width type into bytes according to the rules /// described on [`super::RowConverter`] -pub trait FixedLengthEncoding: Copy { - const ENCODED_LEN: usize = 1 + N; +pub trait FixedLengthEncoding: Copy { + const ENCODED_LEN: usize = 1 + std::mem::size_of::(); - fn encode(self) -> [u8; N]; + type Encoded: Sized + Copy + AsRef<[u8]> + AsMut<[u8]>; + + fn encode(self) -> Self::Encoded; } -impl FixedLengthEncoding<1> for bool { +impl FixedLengthEncoding for bool { + type Encoded = [u8; 1]; + fn encode(self) -> [u8; 1] { [self as u8] } @@ -38,7 +42,9 @@ impl FixedLengthEncoding<1> for bool { macro_rules! encode_signed { ($n:expr, $t:ty) => { - impl FixedLengthEncoding<$n> for $t { + impl FixedLengthEncoding for $t { + type Encoded = [u8; $n]; + fn encode(self) -> [u8; $n] { let mut b = self.to_be_bytes(); // Toggle top "sign" bit to ensure consistent sort order @@ -57,7 +63,9 @@ encode_signed!(16, i128); macro_rules! 
encode_unsigned { ($n:expr, $t:ty) => { - impl FixedLengthEncoding<$n> for $t { + impl FixedLengthEncoding for $t { + type Encoded = [u8; $n]; + fn encode(self) -> [u8; $n] { self.to_be_bytes() } @@ -70,7 +78,9 @@ encode_unsigned!(2, u16); encode_unsigned!(4, u32); encode_unsigned!(8, u64); -impl FixedLengthEncoding<2> for f16 { +impl FixedLengthEncoding for f16 { + type Encoded = [u8; 2]; + fn encode(self) -> [u8; 2] { // https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260 let s = self.to_bits() as i16; @@ -79,7 +89,9 @@ impl FixedLengthEncoding<2> for f16 { } } -impl FixedLengthEncoding<4> for f32 { +impl FixedLengthEncoding for f32 { + type Encoded = [u8; 4]; + fn encode(self) -> [u8; 4] { // https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260 let s = self.to_bits() as i32; @@ -88,7 +100,9 @@ impl FixedLengthEncoding<4> for f32 { } } -impl FixedLengthEncoding<8> for f64 { +impl FixedLengthEncoding for f64 { + type Encoded = [u8; 8]; + fn encode(self) -> [u8; 8] { // https://github.com/rust-lang/rust/blob/9c20b2a8cc7588decb6de25ac6a7912dcef24d65/library/core/src/num/f32.rs#L1176-L1260 let s = self.to_bits() as i64; @@ -97,7 +111,9 @@ impl FixedLengthEncoding<8> for f64 { } } -impl FixedLengthEncoding<16> for Decimal128 { +impl FixedLengthEncoding for Decimal128 { + type Encoded = [u8; 16]; + fn encode(self) -> [u8; 16] { let mut val = *self.raw_value(); // Convert to big endian representation @@ -108,7 +124,9 @@ impl FixedLengthEncoding<16> for Decimal128 { } } -impl FixedLengthEncoding<32> for Decimal256 { +impl FixedLengthEncoding for Decimal256 { + type Encoded = [u8; 32]; + fn encode(self) -> [u8; 32] { let mut val = *self.raw_value(); // Convert to big endian representation @@ -120,10 +138,10 @@ impl FixedLengthEncoding<32> for Decimal256 { } /// Returns the total encoded length (including null byte) for a value of type `T::Native` -pub const fn encoded_len(_col: &PrimitiveArray) -> usize +pub const fn encoded_len(_col: &PrimitiveArray) -> usize where T: ArrowPrimitiveType, - T::Native: FixedLengthEncoding, + T::Native: FixedLengthEncoding, { T::Native::ENCODED_LEN } @@ -132,26 +150,22 @@ where /// /// - 1 byte `0` if null or `1` if valid /// - bytes of [`FixedLengthEncoding`] -pub fn encode< - const N: usize, - T: FixedLengthEncoding, - I: IntoIterator>, ->( +pub fn encode>>( out: &mut Rows, i: I, opts: SortOptions, ) { for (offset, maybe_val) in out.offsets.iter_mut().skip(1).zip(i) { - let end_offset = *offset + N + 1; + let end_offset = *offset + T::ENCODED_LEN; if let Some(val) = maybe_val { let to_write = &mut out.buffer[*offset..end_offset]; to_write[0] = 1; let mut encoded = val.encode(); if opts.descending { // Flip bits to reverse order - encoded.iter_mut().for_each(|v| *v = !*v) + encoded.as_mut().iter_mut().for_each(|v| *v = !*v) } - to_write[1..].copy_from_slice(&encoded) + to_write[1..].copy_from_slice(encoded.as_ref()) } else if !opts.nulls_first { out.buffer[*offset] = 0xFF; } From 9c1748f9cb6a125e18e64bd5da17cc1782a4b2a5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 3 Oct 2022 15:59:42 +0100 Subject: [PATCH 0098/1411] Implement ArrowNumericType for Float16Type (#2810) * Implement ArrowNumericType for Float16Type * Remove unnecessary safety comments --- arrow/src/compute/kernels/arithmetic.rs | 31 ++++++-- arrow/src/compute/kernels/comparison.rs | 7 +- 
arrow/src/datatypes/numeric.rs | 96 +++++++++++++++++++++++++ 3 files changed, 123 insertions(+), 11 deletions(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index 1c28c9895240..b2e95ad5e4a9 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -332,9 +332,7 @@ where // process data in chunks of 64 elements since we also get 64 bits of validity information at a time - // safety: result is newly created above, always written as a T below - let mut result_chunks = - unsafe { result.typed_data_mut().chunks_exact_mut(64) }; + let mut result_chunks = result.typed_data_mut().chunks_exact_mut(64); let mut left_chunks = left.values().chunks_exact(64); let mut right_chunks = right.values().chunks_exact(64); @@ -380,9 +378,7 @@ where )?; } None => { - // safety: result is newly created above, always written as a T below - let mut result_chunks = - unsafe { result.typed_data_mut().chunks_exact_mut(lanes) }; + let mut result_chunks = result.typed_data_mut().chunks_exact_mut(lanes); let mut left_chunks = left.values().chunks_exact(lanes); let mut right_chunks = right.values().chunks_exact(lanes); @@ -1611,6 +1607,7 @@ mod tests { use crate::array::Int32Array; use crate::datatypes::{Date64Type, Int32Type, Int8Type}; use chrono::NaiveDate; + use half::f16; #[test] fn test_primitive_array_add() { @@ -2898,4 +2895,26 @@ mod tests { let division_by_zero = divide_scalar_opt_dyn::(&a, 0); assert_eq!(&expected, &division_by_zero.unwrap()); } + + #[test] + fn test_sum_f16() { + let a = Float16Array::from_iter_values([ + f16::from_f32(0.1), + f16::from_f32(0.2), + f16::from_f32(1.5), + f16::from_f32(-0.1), + ]); + let b = Float16Array::from_iter_values([ + f16::from_f32(5.1), + f16::from_f32(6.2), + f16::from_f32(-1.), + f16::from_f32(-2.1), + ]); + let expected = Float16Array::from_iter_values( + a.values().iter().zip(b.values()).map(|(a, b)| a + b), + ); + + let c = add(&a, &b).unwrap(); + assert_eq!(c, expected); + } } diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 49aecfb67fa6..1ea433150f01 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -1792,7 +1792,6 @@ where .iter() .map(|key| { key.map(|key| unsafe { - // safety lengths were verified above let key = key.as_usize(); dict_comparison.value_unchecked(key) }) @@ -1845,8 +1844,7 @@ where let mut left_chunks = left.values().chunks_exact(CHUNK_SIZE); let mut right_chunks = right.values().chunks_exact(CHUNK_SIZE); - // safety: result is newly created above, always written as a T below - let result_chunks = unsafe { result.typed_data_mut() }; + let result_chunks = result.typed_data_mut(); let result_remainder = left_chunks .borrow_mut() .zip(right_chunks.borrow_mut()) @@ -1937,8 +1935,7 @@ where let mut left_chunks = left.values().chunks_exact(CHUNK_SIZE); let simd_right = T::init(right); - // safety: result is newly created above, always written as a T below - let result_chunks = unsafe { result.typed_data_mut() }; + let result_chunks = result.typed_data_mut(); let result_remainder = left_chunks .borrow_mut() diff --git a/arrow/src/datatypes/numeric.rs b/arrow/src/datatypes/numeric.rs index b8fa87197c38..e74764d4c0ea 100644 --- a/arrow/src/datatypes/numeric.rs +++ b/arrow/src/datatypes/numeric.rs @@ -366,6 +366,102 @@ make_numeric_type!(DurationMillisecondType, i64, i64x8, m64x8); make_numeric_type!(DurationMicrosecondType, i64, i64x8, m64x8); 
make_numeric_type!(DurationNanosecondType, i64, i64x8, m64x8); +#[cfg(not(feature = "simd"))] +impl ArrowNumericType for Float16Type {} + +#[cfg(feature = "simd")] +impl ArrowNumericType for Float16Type { + type Simd = ::Simd; + type SimdMask = ::SimdMask; + + fn lanes() -> usize { + Float32Type::lanes() + } + + fn init(value: Self::Native) -> Self::Simd { + Float32Type::init(value.to_f32()) + } + + fn load(slice: &[Self::Native]) -> Self::Simd { + let mut s = [0_f32; Self::Simd::lanes()]; + s.iter_mut().zip(slice).for_each(|(o, a)| *o = a.to_f32()); + Float32Type::load(&s) + } + + fn mask_init(value: bool) -> Self::SimdMask { + Float32Type::mask_init(value) + } + + fn mask_from_u64(mask: u64) -> Self::SimdMask { + Float32Type::mask_from_u64(mask) + } + + fn mask_to_u64(mask: &Self::SimdMask) -> u64 { + Float32Type::mask_to_u64(mask) + } + + fn mask_get(mask: &Self::SimdMask, idx: usize) -> bool { + Float32Type::mask_get(mask, idx) + } + + fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask { + Float32Type::mask_set(mask, idx, value) + } + + fn mask_select(mask: Self::SimdMask, a: Self::Simd, b: Self::Simd) -> Self::Simd { + Float32Type::mask_select(mask, a, b) + } + + fn mask_any(mask: Self::SimdMask) -> bool { + Float32Type::mask_any(mask) + } + + fn bin_op Self::Simd>( + left: Self::Simd, + right: Self::Simd, + op: F, + ) -> Self::Simd { + op(left, right) + } + + fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { + Float32Type::eq(left, right) + } + + fn ne(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { + Float32Type::ne(left, right) + } + + fn lt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { + Float32Type::lt(left, right) + } + + fn le(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { + Float32Type::le(left, right) + } + + fn gt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { + Float32Type::gt(left, right) + } + + fn ge(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { + Float32Type::ge(left, right) + } + + fn write(simd_result: Self::Simd, slice: &mut [Self::Native]) { + let mut s = [0_f32; Self::Simd::lanes()]; + Float32Type::write(simd_result, &mut s); + slice + .iter_mut() + .zip(s) + .for_each(|(o, i)| *o = half::f16::from_f32(i)) + } + + fn unary_op Self::Simd>(a: Self::Simd, op: F) -> Self::Simd { + Float32Type::unary_op(a, op) + } +} + #[cfg(feature = "simd")] pub trait ArrowFloatNumericType: ArrowNumericType { fn pow(base: Self::Simd, raise: Self::Simd) -> Self::Simd; From 76da62443aeaf0b5085f56be511eabe067e25597 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 3 Oct 2022 16:00:13 +0100 Subject: [PATCH 0099/1411] Add string_dictionary benches for row format (#2677) (#2816) * Add string_dictionary benches for row format (#2677) * Fix copy-pasta --- arrow/benches/row_format.rs | 69 ++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs index 2802aa6ece0b..ec872c127060 100644 --- a/arrow/benches/row_format.rs +++ b/arrow/benches/row_format.rs @@ -22,7 +22,10 @@ extern crate core; use arrow::array::ArrayRef; use arrow::datatypes::{DataType, Int64Type, UInt64Type}; use arrow::row::{RowConverter, SortField}; -use arrow::util::bench_util::{create_primitive_array, create_string_array_with_len}; +use arrow::util::bench_util::{ + create_primitive_array, create_string_array_with_len, create_string_dict_array, +}; +use arrow_array::types::Int32Type; 
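+// Int32Type is the dictionary key type used by the string_dictionary benchmarks below.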
use criterion::{black_box, Criterion}; use std::sync::Arc; @@ -85,6 +88,46 @@ fn row_bench(c: &mut Criterion) { }); }); + let cols = + vec![Arc::new(create_string_dict_array::(4096, 0., 10)) as ArrayRef]; + + c.bench_function("row_batch 4096 string_dictionary(10, 0)", |b| { + b.iter(|| { + let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); + black_box(converter.convert_columns(&cols)) + }); + }); + + let cols = + vec![Arc::new(create_string_dict_array::(4096, 0., 30)) as ArrayRef]; + + c.bench_function("row_batch 4096 string_dictionary(30, 0)", |b| { + b.iter(|| { + let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); + black_box(converter.convert_columns(&cols)) + }); + }); + + let cols = + vec![Arc::new(create_string_dict_array::(4096, 0., 100)) as ArrayRef]; + + c.bench_function("row_batch 4096 string_dictionary(100, 0)", |b| { + b.iter(|| { + let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); + black_box(converter.convert_columns(&cols)) + }); + }); + + let cols = + vec![Arc::new(create_string_dict_array::(4096, 0.5, 100)) as ArrayRef]; + + c.bench_function("row_batch 4096 string_dictionary(100, 0.5)", |b| { + b.iter(|| { + let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); + black_box(converter.convert_columns(&cols)) + }); + }); + let cols = [ Arc::new(create_string_array_with_len::(4096, 0.5, 20)) as ArrayRef, Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef, @@ -108,6 +151,30 @@ fn row_bench(c: &mut Criterion) { }); }, ); + + let cols = [ + Arc::new(create_string_dict_array::(4096, 0.5, 20)) as ArrayRef, + Arc::new(create_string_dict_array::(4096, 0., 30)) as ArrayRef, + Arc::new(create_string_dict_array::(4096, 0., 100)) as ArrayRef, + Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef, + ]; + + let fields = [ + SortField::new(DataType::Utf8), + SortField::new(DataType::Utf8), + SortField::new(DataType::Utf8), + SortField::new(DataType::Int64), + ]; + + c.bench_function( + "row_batch 4096 string_dictionary(20, 0.5), string_dictionary(30, 0), string_dictionary(100, 0), i64(0)", + |b| { + b.iter(|| { + let mut converter = RowConverter::new(fields.to_vec()); + black_box(converter.convert_columns(&cols)) + }); + }, + ); } criterion_group!(benches, row_bench); From 35c313ba068d070c2f1e81f531293655cf5f3bcd Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Mon, 3 Oct 2022 16:52:07 -0400 Subject: [PATCH 0100/1411] Automate generation of release PRs (#2736) * Add file_release_pr.sh and update README * fixup Co-authored-by: Andrew Lamb --- dev/release/README.md | 19 ++++++++++++---- dev/release/file_release_pr.sh | 40 ++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 dev/release/file_release_pr.sh diff --git a/dev/release/README.md b/dev/release/README.md index 82cb9fbfcfac..3ee7a7d5e0bf 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -67,7 +67,11 @@ For `object_store` the same process is done in the `object_store` directory. Exa ```bash git checkout master git pull -git checkout -b make-release +git checkout -b + +# Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. +sed -i '' -e 's/14.0.0/24.0.0/g' `find . 
 -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md @@ -87,9 +91,16 @@ python dev/release/label_issues.py git commit -a -m 'Create changelog' -# update versions -sed -i '' -e 's/14.0.0/24.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` -git commit -a -m 'Update version' +# Manually edit ./dev/release/update_change_log.sh to reflect the release version +# Create the changelog +CHANGELOG_GITHUB_TOKEN= ./dev/release/update_change_log.sh +# Review change log / edit issues and labels if needed, rerun +git commit -a -m 'Create changelog' + +git push + +# File the release PR +export BRANCH= && export GITHUB_USERNAME= && export GITHUB_TOKEN= && ./file_release_pr.sh ``` Note that when reviewing the change log, rather than editing the diff --git a/dev/release/file_release_pr.sh b/dev/release/file_release_pr.sh new file mode 100644 index 000000000000..71c7547cd4cc --- /dev/null +++ b/dev/release/file_release_pr.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+# +# generates a standard release PR like this: +# https://github.com/apache/arrow-rs/pull/2734 +# +# Usage: +# export BRANCH= && export GITHUB_USERNAME= && export GITHUB_TOKEN= && ./file_release_pr.sh + +set -e + +FUTURE_RELEASE="23.0.0" +ISSUE_NUMBER=2665 + +TITLE="Update version to \`$FUTURE_RELEASE\` and update \`CHANGELOG\`" +BODY="# Which issue does this PR close?\n\nCloses #$ISSUE_NUMBER.\n\n# Rationale for this change\nPrepare for biweekly release\n\n# What changes are included in this PR?\n\n# Are there any user-facing changes?\nYes" +DATA="{\"title\":\"$TITLE\", \"body\":\"$BODY\", \"head\":\"$GITHUB_USERNAME:$BRANCH\",\"base\":\"master\"}" + +# Create the pull request +curl -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + https://api.github.com/repos/apache/arrow-rs/pulls \ + -d "$DATA" From 191eaef0906f61dc0dc6ca6cea96a99f86e6c5a4 Mon Sep 17 00:00:00 2001 From: Remzi Yang <59198230+HaoYang670@users.noreply.github.com> Date: Tue, 4 Oct 2022 07:26:24 +0800 Subject: [PATCH 0101/1411] Add modulus ops into `ArrowNativeTypeOp` (#2756) * add 3 mod ops and tests Signed-off-by: remzi <13716567376yh@gmail.com> * fix simd error Signed-off-by: remzi <13716567376yh@gmail.com> * remove_mod_divide_by_zero Signed-off-by: remzi <13716567376yh@gmail.com> * overflow panic simd Signed-off-by: remzi <13716567376yh@gmail.com> * address comment Signed-off-by: remzi <13716567376yh@gmail.com> Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/compute/kernels/arithmetic.rs | 64 +++++++++++++++++++++---- arrow/src/datatypes/native.rs | 32 ++++++++++++- 2 files changed, 85 insertions(+), 11 deletions(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index b2e95ad5e4a9..1e6e55248b71 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -22,7 +22,7 @@ //! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. -use std::ops::{Div, Neg, Rem}; +use std::ops::{Div, Neg}; use num::{One, Zero}; @@ -182,7 +182,7 @@ fn simd_checked_modulus( right: T::Simd, ) -> Result where - T::Native: One + Zero, + T::Native: ArrowNativeTypeOp + One, { let zero = T::init(T::Native::zero()); let one = T::init(T::Native::one()); @@ -305,7 +305,7 @@ fn simd_checked_divide_op( ) -> Result> where T: ArrowNumericType, - T::Native: One + Zero, + T::Native: ArrowNativeTypeOp, SI: Fn(Option, T::Simd, T::Simd) -> Result, SC: Fn(T::Native, T::Native) -> T::Native, { @@ -1301,7 +1301,7 @@ pub fn modulus( ) -> Result> where T: ArrowNumericType, - T::Native: Rem + Zero + One, + T::Native: ArrowNativeTypeOp + One, { #[cfg(feature = "simd")] return simd_checked_divide_op(&left, &right, simd_checked_modulus::, |a, b| { @@ -1312,7 +1312,7 @@ where if b.is_zero() { Err(ArrowError::DivideByZero) } else { - Ok(a % b) + Ok(a.mod_wrapping(b)) } }); } @@ -1507,13 +1507,13 @@ pub fn modulus_scalar( ) -> Result> where T: ArrowNumericType, - T::Native: Rem + Zero, + T::Native: ArrowNativeTypeOp, { if modulo.is_zero() { return Err(ArrowError::DivideByZero); } - Ok(unary(array, |a| a % modulo)) + Ok(unary(array, |a| a.mod_wrapping(modulo))) } /// Divide every value in an array by a scalar. 
If any value in the array is null then the @@ -2117,7 +2117,7 @@ mod tests { } #[test] - fn test_primitive_array_modulus() { + fn test_int_array_modulus() { let a = Int32Array::from(vec![15, 15, 8, 1, 9]); let b = Int32Array::from(vec![5, 6, 8, 9, 1]); let c = modulus(&a, &b).unwrap(); @@ -2128,6 +2128,34 @@ mod tests { assert_eq!(0, c.value(4)); } + #[test] + #[should_panic( + expected = "called `Result::unwrap()` on an `Err` value: DivideByZero" + )] + fn test_int_array_modulus_divide_by_zero() { + let a = Int32Array::from(vec![1]); + let b = Int32Array::from(vec![0]); + modulus(&a, &b).unwrap(); + } + + #[test] + #[cfg(not(feature = "simd"))] + fn test_int_array_modulus_overflow_wrapping() { + let a = Int32Array::from(vec![i32::MIN]); + let b = Int32Array::from(vec![-1]); + let result = modulus(&a, &b).unwrap(); + assert_eq!(0, result.value(0)) + } + + #[test] + #[cfg(feature = "simd")] + #[should_panic(expected = "attempt to calculate the remainder with overflow")] + fn test_int_array_modulus_overflow_panic() { + let a = Int32Array::from(vec![i32::MIN]); + let b = Int32Array::from(vec![-1]); + let _ = modulus(&a, &b).unwrap(); + } + #[test] fn test_primitive_array_divide_scalar() { let a = Int32Array::from(vec![15, 14, 9, 8, 1]); @@ -2190,7 +2218,7 @@ mod tests { } #[test] - fn test_primitive_array_modulus_scalar() { + fn test_int_array_modulus_scalar() { let a = Int32Array::from(vec![15, 14, 9, 8, 1]); let b = 3; let c = modulus_scalar(&a, b).unwrap(); @@ -2199,7 +2227,7 @@ mod tests { } #[test] - fn test_primitive_array_modulus_scalar_sliced() { + fn test_int_array_modulus_scalar_sliced() { let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); let a = a.slice(1, 4); let a = as_primitive_array(&a); @@ -2208,6 +2236,22 @@ mod tests { assert_eq!(actual, expected); } + #[test] + #[should_panic( + expected = "called `Result::unwrap()` on an `Err` value: DivideByZero" + )] + fn test_int_array_modulus_scalar_divide_by_zero() { + let a = Int32Array::from(vec![1]); + modulus_scalar(&a, 0).unwrap(); + } + + #[test] + fn test_int_array_modulus_scalar_overflow_wrapping() { + let a = Int32Array::from(vec![i32::MIN]); + let result = modulus_scalar(&a, -1).unwrap(); + assert_eq!(0, result.value(0)) + } + #[test] fn test_primitive_array_divide_sliced() { let a = Int32Array::from(vec![0, 0, 0, 15, 15, 8, 1, 9, 0]); diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 6ab82688e52d..654b939500a2 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -26,7 +26,7 @@ pub(crate) mod native_op { use super::ArrowNativeType; use crate::error::{ArrowError, Result}; use num::Zero; - use std::ops::{Add, Div, Mul, Sub}; + use std::ops::{Add, Div, Mul, Rem, Sub}; /// Trait for ArrowNativeType to provide overflow-checking and non-overflow-checking /// variants for arithmetic operations. For floating point types, this provides some @@ -44,6 +44,7 @@ pub(crate) mod native_op { + Sub + Mul + Div + + Rem + Zero { fn add_checked(self, rhs: Self) -> Result { @@ -81,6 +82,18 @@ pub(crate) mod native_op { fn div_wrapping(self, rhs: Self) -> Self { self / rhs } + + fn mod_checked(self, rhs: Self) -> Result { + if rhs.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(self % rhs) + } + } + + fn mod_wrapping(self, rhs: Self) -> Self { + self % rhs + } } } @@ -142,6 +155,23 @@ macro_rules! 
native_type_op { fn div_wrapping(self, rhs: Self) -> Self { self.wrapping_div(rhs) } + + fn mod_checked(self, rhs: Self) -> Result { + if rhs.is_zero() { + Err(ArrowError::DivideByZero) + } else { + self.checked_rem(rhs).ok_or_else(|| { + ArrowError::ComputeError(format!( + "Overflow happened on: {:?} % {:?}", + self, rhs + )) + }) + } + } + + fn mod_wrapping(self, rhs: Self) -> Self { + self.wrapping_rem(rhs) + } } }; } From 8290a4f3fb90f6715ba977e71618df73f6c66d20 Mon Sep 17 00:00:00 2001 From: George Andronchik Date: Tue, 4 Oct 2022 17:04:27 +0800 Subject: [PATCH 0102/1411] feat: cast List / LargeList to Utf8 / LargeUtf8 (#2588) --- arrow/src/compute/kernels/cast.rs | 89 ++++++++++++++++++++++++++++++- arrow/src/util/display.rs | 17 ++++++ 2 files changed, 105 insertions(+), 1 deletion(-) diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index eab3dafda13a..31ac738fa93a 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -57,7 +57,10 @@ use crate::temporal_conversions::{ NANOSECONDS, SECONDS_IN_DAY, }; use crate::{array::*, compute::take}; -use crate::{buffer::Buffer, util::serialization::lexical_to_string}; +use crate::{ + buffer::Buffer, util::display::array_value_to_string, + util::serialization::lexical_to_string, +}; use num::cast::AsPrimitive; use num::{BigInt, NumCast, ToPrimitive}; @@ -136,6 +139,10 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (List(list_from), LargeList(list_to)) => { list_from.data_type() == list_to.data_type() } + (LargeList(list_from), List(list_to)) => { + list_from.data_type() == list_to.data_type() + } + (List(list_from) | LargeList(list_from), Utf8 | LargeUtf8) => can_cast_types(list_from.data_type(), to_type), (List(_), _) => false, (_, List(list_to)) => can_cast_types(from_type, list_to.data_type()), (_, LargeList(list_to)) => can_cast_types(from_type, list_to.data_type()), @@ -408,6 +415,21 @@ macro_rules! cast_decimal_to_float { }}; } +// cast the List array to Utf8 array +macro_rules! cast_list_to_string { + ($ARRAY:expr, $SIZE:ident) => {{ + let mut value_builder: GenericStringBuilder<$SIZE> = GenericStringBuilder::new(); + for i in 0..$ARRAY.len() { + if $ARRAY.is_null(i) { + value_builder.append_null(); + } else { + value_builder.append_value(array_value_to_string($ARRAY, i)?); + } + } + Ok(Arc::new(value_builder.finish())) + }}; +} + /// Cast `array` to the provided data type and return a new Array with /// type `to_type`, if possible. It accepts `CastOptions` to allow consumers /// to configure cast behavior. 
@@ -585,6 +607,8 @@ pub fn cast_with_options( cast_list_container::(&**array, cast_options) } } + (List(_) | LargeList(_), Utf8) => cast_list_to_string!(array, i32), + (List(_) | LargeList(_), LargeUtf8) => cast_list_to_string!(array, i64), (List(_), _) => Err(ArrowError::CastError( "Cannot cast list to non-list data types".to_string(), )), @@ -5764,4 +5788,67 @@ mod tests { &expected ); } + + #[test] + fn test_list_to_string() { + let str_array = StringArray::from(vec!["a", "b", "c", "d", "e", "f", "g", "h"]); + let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 8]); + let value_data = ArrayData::builder(DataType::Utf8) + .len(str_array.len()) + .buffers(str_array.data().buffers().to_vec()) + .build() + .unwrap(); + + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))); + let list_data = ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .add_child_data(value_data) + .build() + .unwrap(); + let array = Arc::new(ListArray::from(list_data)) as ArrayRef; + + let out = cast(&array, &DataType::Utf8).unwrap(); + let out = out + .as_any() + .downcast_ref::() + .unwrap() + .into_iter() + .flatten() + .collect::>(); + assert_eq!(&out, &vec!["[a, b, c]", "[d, e, f]", "[g, h]"]); + + let out = cast(&array, &DataType::LargeUtf8).unwrap(); + let out = out + .as_any() + .downcast_ref::() + .unwrap() + .into_iter() + .flatten() + .collect::>(); + assert_eq!(&out, &vec!["[a, b, c]", "[d, e, f]", "[g, h]"]); + + let array = Arc::new(make_list_array()) as ArrayRef; + let out = cast(&array, &DataType::Utf8).unwrap(); + let out = out + .as_any() + .downcast_ref::() + .unwrap() + .into_iter() + .flatten() + .collect::>(); + assert_eq!(&out, &vec!["[0, 1, 2]", "[3, 4, 5]", "[6, 7]"]); + + let array = Arc::new(make_large_list_array()) as ArrayRef; + let out = cast(&array, &DataType::LargeUtf8).unwrap(); + let out = out + .as_any() + .downcast_ref::() + .unwrap() + .into_iter() + .flatten() + .collect::>(); + assert_eq!(&out, &vec!["[0, 1, 2]", "[3, 4, 5]", "[6, 7]"]); + } } diff --git a/arrow/src/util/display.rs b/arrow/src/util/display.rs index aa4fd4200870..cf8394efaa6b 100644 --- a/arrow/src/util/display.rs +++ b/arrow/src/util/display.rs @@ -235,6 +235,22 @@ macro_rules! make_string_from_list { }}; } +macro_rules! make_string_from_large_list { + ($column: ident, $row: ident) => {{ + let list = $column + .as_any() + .downcast_ref::() + .ok_or(ArrowError::InvalidArgumentError(format!( + "Repl error: could not convert large list column to list array." + )))? + .value($row); + let string_values = (0..list.len()) + .map(|i| array_value_to_string(&list, i)) + .collect::>>()?; + Ok(format!("[{}]", string_values.join(", "))) + }}; +} + macro_rules! 
make_string_from_fixed_size_list { ($column: ident, $row: ident) => {{ let list = $column @@ -357,6 +373,7 @@ pub fn array_value_to_string(column: &array::ArrayRef, row: usize) -> Result make_string_from_list!(column, row), + DataType::LargeList(_) => make_string_from_large_list!(column, row), DataType::Dictionary(index_type, _value_type) => match **index_type { DataType::Int8 => dict_array_value_to_string::(column, row), DataType::Int16 => dict_array_value_to_string::(column, row), From 11213a4230990fdc85b5931028ad278119e8ecdd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 4 Oct 2022 17:24:35 +0100 Subject: [PATCH 0103/1411] Update labeller for split crates (#2823) --- .github/workflows/dev_pr/labeler.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index aadf9c377c64..42ab6a639c68 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -17,6 +17,10 @@ arrow: - arrow/**/* + - arrow-array/**/* + - arrow-buffer/**/* + - arrow-data/**/* + - arrow-schema/**/* arrow-flight: - arrow-flight/**/* From 8a54e95850fe27ac5865a02ef4be2de0937de5b3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 4 Oct 2022 18:52:54 +0100 Subject: [PATCH 0104/1411] Prepare object_store 0.5.1 (#2824) --- object_store/CHANGELOG-old.md | 37 +++++++++++++++++ object_store/CHANGELOG.md | 40 +++++++------------ object_store/Cargo.toml | 2 +- object_store/dev/release/update_change_log.sh | 4 +- 4 files changed, 54 insertions(+), 29 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index a6bda3ceef49..bf1ef6219fc2 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -19,6 +19,43 @@ # Historical Changelog +## [object_store_0.5.0](https://github.com/apache/arrow-rs/tree/object_store_0.5.0) (2022-09-08) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.4.0...object_store_0.5.0) + +**Breaking changes:** + +- Replace azure sdk with custom implementation [\#2509](https://github.com/apache/arrow-rs/pull/2509) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Replace rusoto with custom implementation for AWS \(\#2176\) [\#2352](https://github.com/apache/arrow-rs/pull/2352) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- IMDSv1 Fallback for S3 [\#2609](https://github.com/apache/arrow-rs/issues/2609) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Print Response Body On Error [\#2572](https://github.com/apache/arrow-rs/issues/2572) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Coalesce Ranges Parallel Fetch [\#2562](https://github.com/apache/arrow-rs/issues/2562) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support Coalescing Out-of-Order Ranges [\#2561](https://github.com/apache/arrow-rs/issues/2561) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Add TokenProvider authorization to azure [\#2373](https://github.com/apache/arrow-rs/issues/2373) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- AmazonS3Builder::from\_env to populate credentials from environment 
[\#2361](https://github.com/apache/arrow-rs/issues/2361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- AmazonS3 Support IMDSv2 [\#2350](https://github.com/apache/arrow-rs/issues/2350) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- Retry Logic Fails to Retry Server Errors [\#2573](https://github.com/apache/arrow-rs/issues/2573) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Fix multiple part uploads at once making vector size inconsistent [\#2681](https://github.com/apache/arrow-rs/pull/2681) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([gruuya](https://github.com/gruuya)) +- Fix panic in `object_store::util::coalesce_ranges` [\#2554](https://github.com/apache/arrow-rs/pull/2554) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([thinkharderdev](https://github.com/thinkharderdev)) + +**Merged pull requests:** + +- update doc for object\_store copy\_if\_not\_exists [\#2653](https://github.com/apache/arrow-rs/pull/2653) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([JanKaul](https://github.com/JanKaul)) +- Update quick-xml 0.24 [\#2625](https://github.com/apache/arrow-rs/pull/2625) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add IMDSv1 fallback \(\#2609\) [\#2610](https://github.com/apache/arrow-rs/pull/2610) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- ObjectStore cleanup \(\#2587\) [\#2590](https://github.com/apache/arrow-rs/pull/2590) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix retry logic \(\#2573\) \(\#2572\) [\#2574](https://github.com/apache/arrow-rs/pull/2574) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Improve coalesce\_ranges \(\#2561\) \(\#2562\) [\#2563](https://github.com/apache/arrow-rs/pull/2563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update environment variable name for amazonS3builder in integration \(\#2550\) [\#2553](https://github.com/apache/arrow-rs/pull/2553) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([amrltqt](https://github.com/amrltqt)) +- Build AmazonS3builder from environment variables \(\#2361\) [\#2536](https://github.com/apache/arrow-rs/pull/2536) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([amrltqt](https://github.com/amrltqt)) +- feat: add token provider authorization to azure store [\#2374](https://github.com/apache/arrow-rs/pull/2374) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) + ## [object_store_0.4.0](https://github.com/apache/arrow-rs/tree/object_store_0.4.0) (2022-08-10) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.3.0...object_store_0.4.0) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index 538eebf77c62..6919111099fd 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,42 +19,30 @@ # Changelog -## [object_store_0.5.0](https://github.com/apache/arrow-rs/tree/object_store_0.5.0) (2022-09-08) +## [object_store_0.5.1](https://github.com/apache/arrow-rs/tree/object_store_0.5.1) (2022-10-04) -[Full 
Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.4.0...object_store_0.5.0) - -**Breaking changes:** - -- Replace azure sdk with custom implementation [\#2509](https://github.com/apache/arrow-rs/pull/2509) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- Replace rusoto with custom implementation for AWS \(\#2176\) [\#2352](https://github.com/apache/arrow-rs/pull/2352) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.0...object_store_0.5.1) **Implemented enhancements:** -- IMDSv1 Fallback for S3 [\#2609](https://github.com/apache/arrow-rs/issues/2609) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Print Response Body On Error [\#2572](https://github.com/apache/arrow-rs/issues/2572) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Coalesce Ranges Parallel Fetch [\#2562](https://github.com/apache/arrow-rs/issues/2562) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support Coalescing Out-of-Order Ranges [\#2561](https://github.com/apache/arrow-rs/issues/2561) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Add TokenProvider authorization to azure [\#2373](https://github.com/apache/arrow-rs/issues/2373) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- AmazonS3Builder::from\_env to populate credentials from environment [\#2361](https://github.com/apache/arrow-rs/issues/2361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- AmazonS3 Support IMDSv2 [\#2350](https://github.com/apache/arrow-rs/issues/2350) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Allow HTTP S3 URLs [\#2806](https://github.com/apache/arrow-rs/issues/2806) +- object\_store: support AWS ECS instance credentials [\#2802](https://github.com/apache/arrow-rs/issues/2802) +- Object Store S3 Alibaba Cloud OSS support [\#2777](https://github.com/apache/arrow-rs/issues/2777) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Expose option to use GCS object store in integration tests [\#2627](https://github.com/apache/arrow-rs/issues/2627) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- Retry Logic Fails to Retry Server Errors [\#2573](https://github.com/apache/arrow-rs/issues/2573) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Fix multiple part uploads at once making vector size inconsistent [\#2681](https://github.com/apache/arrow-rs/pull/2681) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([gruuya](https://github.com/gruuya)) -- Fix panic in `object_store::util::coalesce_ranges` [\#2554](https://github.com/apache/arrow-rs/pull/2554) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([thinkharderdev](https://github.com/thinkharderdev)) +- S3 Signature Error Performing List With Prefix Containing Spaces [\#2800](https://github.com/apache/arrow-rs/issues/2800) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Erratic Behaviour if Incorrect S3 Region Configured [\#2795](https://github.com/apache/arrow-rs/issues/2795) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- update doc for 
object\_store copy\_if\_not\_exists [\#2653](https://github.com/apache/arrow-rs/pull/2653) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([JanKaul](https://github.com/JanKaul)) -- Update quick-xml 0.24 [\#2625](https://github.com/apache/arrow-rs/pull/2625) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add IMDSv1 fallback \(\#2609\) [\#2610](https://github.com/apache/arrow-rs/pull/2610) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- ObjectStore cleanup \(\#2587\) [\#2590](https://github.com/apache/arrow-rs/pull/2590) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix retry logic \(\#2573\) \(\#2572\) [\#2574](https://github.com/apache/arrow-rs/pull/2574) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Improve coalesce\_ranges \(\#2561\) \(\#2562\) [\#2563](https://github.com/apache/arrow-rs/pull/2563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Update environment variable name for amazonS3builder in integration \(\#2550\) [\#2553](https://github.com/apache/arrow-rs/pull/2553) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([amrltqt](https://github.com/amrltqt)) -- Build AmazonS3builder from environment variables \(\#2361\) [\#2536](https://github.com/apache/arrow-rs/pull/2536) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([amrltqt](https://github.com/amrltqt)) -- feat: add token provider authorization to azure store [\#2374](https://github.com/apache/arrow-rs/pull/2374) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Support for overriding instance metadata endpoint [\#2811](https://github.com/apache/arrow-rs/pull/2811) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- Allow Configuring non-TLS HTTP Connections in AmazonS3Builder::from\_env [\#2807](https://github.com/apache/arrow-rs/pull/2807) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Fix S3 query canonicalization \(\#2800\) [\#2801](https://github.com/apache/arrow-rs/pull/2801) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Handle incomplete HTTP redirects missing LOCATION \(\#2795\) [\#2796](https://github.com/apache/arrow-rs/pull/2796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Handle S3 virtual host request type [\#2782](https://github.com/apache/arrow-rs/pull/2782) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([askoa](https://github.com/askoa)) +- Fix object\_store multipart uploads on S3 Compatible Stores [\#2731](https://github.com/apache/arrow-rs/pull/2731) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mildbyte](https://github.com/mildbyte)) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 9e4e68d59119..6abb390fc800 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.0" +version = 
"0.5.1" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index e737e044666b..865acdeb0286 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.4.0" -FUTURE_RELEASE="object_store_0.5.0" +SINCE_TAG="object_store_0.5.0" +FUTURE_RELEASE="object_store_0.5.1" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 4eb99086030502ee3c10ad7711c5f78524f09905 Mon Sep 17 00:00:00 2001 From: Javier Goday Date: Wed, 5 Oct 2022 11:20:59 +0200 Subject: [PATCH 0105/1411] Update Clap dependency to version 4 (#2819) * #2817: Update to clap 4 * #2817: Update to clap 4 * Fix clap features (help, usage & error) --- integration-testing/Cargo.toml | 2 +- .../src/bin/arrow-json-integration-test.rs | 4 +- .../src/bin/flight-test-integration-client.rs | 4 +- .../src/bin/flight-test-integration-server.rs | 4 +- parquet/Cargo.toml | 2 +- parquet/src/bin/parquet-fromcsv-help.txt | 97 +++++++++---------- parquet/src/bin/parquet-fromcsv.rs | 24 +++-- parquet/src/bin/parquet-rowcount.rs | 2 +- 8 files changed, 72 insertions(+), 67 deletions(-) diff --git a/integration-testing/Cargo.toml b/integration-testing/Cargo.toml index 687e91ac4dfd..f4f6a336498a 100644 --- a/integration-testing/Cargo.toml +++ b/integration-testing/Cargo.toml @@ -34,7 +34,7 @@ logging = ["tracing-subscriber"] arrow = { path = "../arrow", default-features = false, features = ["test_utils", "ipc", "ipc_compression", "json"] } arrow-flight = { path = "../arrow-flight", default-features = false } async-trait = { version = "0.1.41", default-features = false } -clap = { version = "3", default-features = false, features = ["std", "derive"] } +clap = { version = "4", default-features = false, features = ["std", "derive", "help", "error-context", "usage"] } futures = { version = "0.3", default-features = false } hex = { version = "0.4", default-features = false, features = ["std"] } prost = { version = "0.11", default-features = false } diff --git a/integration-testing/src/bin/arrow-json-integration-test.rs b/integration-testing/src/bin/arrow-json-integration-test.rs index a7d7cf6ee7cb..b84680f6f4b3 100644 --- a/integration-testing/src/bin/arrow-json-integration-test.rs +++ b/integration-testing/src/bin/arrow-json-integration-test.rs @@ -24,7 +24,7 @@ use arrow_integration_testing::{read_json_file, util::*}; use clap::Parser; use std::fs::File; -#[derive(clap::ArgEnum, Debug, Clone)] +#[derive(clap::ValueEnum, Debug, Clone)] #[clap(rename_all = "SCREAMING_SNAKE_CASE")] enum Mode { ArrowToJson, @@ -41,7 +41,7 @@ struct Args { arrow: String, #[clap(short, long, help("Path to JSON file"))] json: String, - #[clap(arg_enum, short, long, default_value_t = Mode::Validate, help="Mode of integration testing tool")] + #[clap(value_enum, short, long, default_value_t = Mode::Validate, help="Mode of integration testing tool")] mode: Mode, #[clap(short, long)] verbose: bool, diff --git a/integration-testing/src/bin/flight-test-integration-client.rs b/integration-testing/src/bin/flight-test-integration-client.rs index fa99b424e378..d46b4fac759e 100644 --- a/integration-testing/src/bin/flight-test-integration-client.rs +++ b/integration-testing/src/bin/flight-test-integration-client.rs @@ -20,7 +20,7 @@ use clap::Parser; type Error = Box; type Result = std::result::Result; 
-#[derive(clap::ArgEnum, Debug, Clone)] +#[derive(clap::ValueEnum, Debug, Clone)] enum Scenario { Middleware, #[clap(name = "auth:basic_proto")] @@ -40,7 +40,7 @@ struct Args { help = "path to the descriptor file, only used when scenario is not provided. See https://arrow.apache.org/docs/format/Integration.html#json-test-data-format" )] path: Option, - #[clap(long, arg_enum)] + #[clap(long, value_enum)] scenario: Option, } diff --git a/integration-testing/src/bin/flight-test-integration-server.rs b/integration-testing/src/bin/flight-test-integration-server.rs index 6ed22ad81d90..5310d07d4f8e 100644 --- a/integration-testing/src/bin/flight-test-integration-server.rs +++ b/integration-testing/src/bin/flight-test-integration-server.rs @@ -21,7 +21,7 @@ use clap::Parser; type Error = Box; type Result = std::result::Result; -#[derive(clap::ArgEnum, Debug, Clone)] +#[derive(clap::ValueEnum, Debug, Clone)] enum Scenario { Middleware, #[clap(name = "auth:basic_proto")] @@ -33,7 +33,7 @@ enum Scenario { struct Args { #[clap(long)] port: u16, - #[clap(long, arg_enum)] + #[clap(long, value_enum)] scenario: Option, } diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index e1593e4b9c97..f47f556b257f 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -43,7 +43,7 @@ num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } arrow = { path = "../arrow", version = "24.0.0", optional = true, default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false, features = ["std"], optional = true } -clap = { version = "3", default-features = false, features = ["std", "derive", "env"], optional = true } +clap = { version = "4", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } seq-macro = { version = "0.3", default-features = false } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/parquet/src/bin/parquet-fromcsv-help.txt b/parquet/src/bin/parquet-fromcsv-help.txt index f4fe704ab267..f599a13f0f18 100644 --- a/parquet/src/bin/parquet-fromcsv-help.txt +++ b/parquet/src/bin/parquet-fromcsv-help.txt @@ -1,66 +1,65 @@ -Apache Arrow -Binary to convert csv to Parquet -USAGE: - parquet [OPTIONS] --schema --input-file --output-file +Usage: parquet [OPTIONS] --schema --input-file --output-file -OPTIONS: - -b, --batch-size - batch size - - [env: PARQUET_FROM_CSV_BATCHSIZE=] - [default: 1000] +Options: + -s, --schema + message schema for output Parquet - -c, --parquet-compression - compression mode - - [default: SNAPPY] + -i, --input-file + input CSV file - -d, --delimiter - field delimiter - - default value: when input_format==CSV: ',' when input_format==TSV: 'TAB' + -o, --output-file + output Parquet file - -D, --double-quote - double quote + -f, --input-format + input file format + + [default: csv] + [possible values: csv, tsv] - -e, --escape-char - escape charactor + -b, --batch-size + batch size + + [env: PARQUET_FROM_CSV_BATCHSIZE=] + [default: 1000] - -f, --input-format - input file format - - [default: csv] - [possible values: csv, tsv] + -h, --has-header + has header - -h, --has-header - has header + -d, --delimiter + field delimiter + + default value: when input_format==CSV: ',' when input_format==TSV: 'TAB' - --help - Print help information + -r, --record-terminator + record terminator + + [possible values: lf, 
crlf, cr] - -i, --input-file - input CSV file + -e, --escape-char + escape charactor - -m, --max-row-group-size - max row group size + -q, --quote-char + quate charactor - -o, --output-file - output Parquet file + -D, --double-quote + double quote + + [possible values: true, false] - -q, --quote-char - quate charactor + -c, --parquet-compression + compression mode + + [default: SNAPPY] - -r, --record-terminator - record terminator - - [possible values: lf, crlf, cr] + -w, --writer-version + writer version - -s, --schema - message schema for output Parquet + -m, --max-row-group-size + max row group size - -V, --version - Print version information + --help + display usage help - -w, --writer-version - writer version + -V, --version + Print version information diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index 827aa7311f58..90e0a68625f9 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -72,7 +72,7 @@ use std::{ }; use arrow::{csv::ReaderBuilder, datatypes::Schema, error::ArrowError}; -use clap::{ArgEnum, Parser}; +use clap::{Parser, ValueEnum}; use parquet::{ arrow::{parquet_to_arrow_schema, ArrowWriter}, basic::Compression, @@ -140,7 +140,7 @@ impl Display for ParquetFromCsvError { } #[derive(Debug, Parser)] -#[clap(author, version, about("Binary to convert csv to Parquet"), long_about=None)] +#[clap(author, version, disable_help_flag=true, about("Binary to convert csv to Parquet"), long_about=None)] struct Args { /// Path to a text file containing a parquet schema definition #[clap(short, long, help("message schema for output Parquet"))] @@ -153,7 +153,7 @@ struct Args { output_file: PathBuf, /// input file format #[clap( - arg_enum, + value_enum, short('f'), long, help("input file format"), @@ -179,7 +179,7 @@ struct Args { /// when input_format==TSV: 'TAB' #[clap(short, long, help("field delimiter"))] delimiter: Option, - #[clap(arg_enum, short, long, help("record terminator"))] + #[clap(value_enum, short, long, help("record terminator"))] record_terminator: Option, #[clap(short, long, help("escape charactor"))] escape_char: Option, @@ -188,14 +188,17 @@ struct Args { #[clap(short('D'), long, help("double quote"))] double_quote: Option, #[clap(short('c'), long, help("compression mode"), default_value_t=Compression::SNAPPY)] - #[clap(parse(try_from_str =compression_from_str))] + #[clap(value_parser=compression_from_str)] parquet_compression: Compression, #[clap(short, long, help("writer version"))] - #[clap(parse(try_from_str =writer_version_from_str))] + #[clap(value_parser=writer_version_from_str)] writer_version: Option, #[clap(short, long, help("max row group size"))] max_row_group_size: Option, + + #[clap(long, action=clap::ArgAction::Help, help("display usage help"))] + help: Option, } fn compression_from_str(cmp: &str) -> Result { @@ -208,7 +211,7 @@ fn compression_from_str(cmp: &str) -> Result { "LZ4" => Ok(Compression::LZ4), "ZSTD" => Ok(Compression::ZSTD), v => Err( - format!("Unknown compression {0} : possible values UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD ",v) + format!("Unknown compression {0} : possible values UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD \n\nFor more information try --help",v) ) } } @@ -263,13 +266,13 @@ impl Args { } } -#[derive(Debug, Clone, Copy, ArgEnum, PartialEq)] +#[derive(Debug, Clone, Copy, ValueEnum, PartialEq)] enum CsvDialect { Csv, Tsv, } -#[derive(Debug, Clone, Copy, ArgEnum, PartialEq)] +#[derive(Debug, Clone, Copy, ValueEnum, PartialEq)] enum 
RecordTerminator { LF, Crlf, @@ -544,6 +547,7 @@ mod tests { parquet_compression: Compression::SNAPPY, writer_version: None, max_row_group_size: None, + help: None, }; let arrow_schema = Arc::new(Schema::new(vec![ Field::new("field1", DataType::Utf8, false), @@ -577,6 +581,7 @@ mod tests { parquet_compression: Compression::SNAPPY, writer_version: None, max_row_group_size: None, + help: None, }; let arrow_schema = Arc::new(Schema::new(vec![ Field::new("field1", DataType::Utf8, false), @@ -630,6 +635,7 @@ mod tests { parquet_compression: Compression::SNAPPY, writer_version: None, max_row_group_size: None, + help: None, }; convert_csv_to_parquet(&args).unwrap(); } diff --git a/parquet/src/bin/parquet-rowcount.rs b/parquet/src/bin/parquet-rowcount.rs index d2f0311cf7a0..491f582c5103 100644 --- a/parquet/src/bin/parquet-rowcount.rs +++ b/parquet/src/bin/parquet-rowcount.rs @@ -47,7 +47,7 @@ struct Args { #[clap( short, long, - multiple_values(true), + number_of_values(1), help("List of Parquet files to read from separated by space") )] file_paths: Vec, From e79ba40a6fa6f6e20211eee51c529332c6ed6f96 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 5 Oct 2022 10:51:58 -0700 Subject: [PATCH 0106/1411] Add overflow-checking variant of sum kernel (#2822) * Define overflow-checking behavior of sum kernels * Add sum_checked. * Add sum_array_checked. --- arrow/src/compute/kernels/aggregate.rs | 129 +++++++++++++++++++++++-- 1 file changed, 122 insertions(+), 7 deletions(-) diff --git a/arrow/src/compute/kernels/aggregate.rs b/arrow/src/compute/kernels/aggregate.rs index c215e23953e5..083defdde7dc 100644 --- a/arrow/src/compute/kernels/aggregate.rs +++ b/arrow/src/compute/kernels/aggregate.rs @@ -17,14 +17,19 @@ //! Defines aggregations over Arrow arrays. +use arrow_data::bit_iterator::try_for_each_valid_idx; +use arrow_schema::ArrowError; use multiversion::multiversion; -use std::ops::Add; +#[allow(unused_imports)] +use std::ops::{Add, Deref}; use crate::array::{ as_primitive_array, Array, ArrayAccessor, ArrayIter, BooleanArray, GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, }; +use crate::datatypes::native_op::ArrowNativeTypeOp; use crate::datatypes::{ArrowNativeType, ArrowNumericType, DataType}; +use crate::error::Result; use crate::util::bit_iterator::BitIndexIterator; /// Generic test for NaN, the optimizer should be able to remove this for integer types. @@ -162,10 +167,13 @@ pub fn min_string(array: &GenericStringArray) -> Option<& } /// Returns the sum of values in the array. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `sum_array_checked` instead. pub fn sum_array>(array: A) -> Option where T: ArrowNumericType, - T::Native: Add, + T::Native: ArrowNativeTypeOp, { match array.data_type() { DataType::Dictionary(_, _) => { @@ -180,7 +188,7 @@ where .into_iter() .fold(T::default_value(), |accumulator, value| { if let Some(value) = value { - accumulator + value + accumulator.add_wrapping(value) } else { accumulator } @@ -192,6 +200,42 @@ where } } +/// Returns the sum of values in the array. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `sum_array` instead. 
+pub fn sum_array_checked>( + array: A, +) -> Result> +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, +{ + match array.data_type() { + DataType::Dictionary(_, _) => { + let null_count = array.null_count(); + + if null_count == array.len() { + return Ok(None); + } + + let iter = ArrayIter::new(array); + let sum = + iter.into_iter() + .try_fold(T::default_value(), |accumulator, value| { + if let Some(value) = value { + accumulator.add_checked(value) + } else { + Ok(accumulator) + } + })?; + + Ok(Some(sum)) + } + _ => sum_checked::(as_primitive_array(&array)), + } +} + /// Returns the min of values in the array of `ArrowNumericType` type, or dictionary /// array with value of `ArrowNumericType` type. pub fn min_array>(array: A) -> Option @@ -239,11 +283,14 @@ where /// Returns the sum of values in the primitive array. /// /// Returns `None` if the array is empty or only contains null values. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `sum_checked` instead. #[cfg(not(feature = "simd"))] pub fn sum(array: &PrimitiveArray) -> Option where T: ArrowNumericType, - T::Native: Add, + T::Native: ArrowNativeTypeOp, { let null_count = array.null_count(); @@ -256,7 +303,7 @@ where match array.data().null_buffer() { None => { let sum = data.iter().fold(T::default_value(), |accumulator, value| { - accumulator + *value + accumulator.add_wrapping(*value) }); Some(sum) @@ -274,7 +321,7 @@ where let mut index_mask = 1; chunk.iter().for_each(|value| { if (mask & index_mask) != 0 { - sum = sum + *value; + sum = sum.add_wrapping(*value); } index_mask <<= 1; }); @@ -284,7 +331,7 @@ where remainder.iter().enumerate().for_each(|(i, value)| { if remainder_bits & (1 << i) != 0 { - sum = sum + *value; + sum = sum.add_wrapping(*value); } }); @@ -293,6 +340,54 @@ where } } +/// Returns the sum of values in the primitive array. +/// +/// Returns `Ok(None)` if the array is empty or only contains null values. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `sum` instead. +pub fn sum_checked(array: &PrimitiveArray) -> Result> +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, +{ + let null_count = array.null_count(); + + if null_count == array.len() { + return Ok(None); + } + + let data: &[T::Native] = array.values(); + + match array.data().null_buffer() { + None => { + let sum = data + .iter() + .try_fold(T::default_value(), |accumulator, value| { + accumulator.add_checked(*value) + })?; + + Ok(Some(sum)) + } + Some(buffer) => { + let mut sum = T::default_value(); + + try_for_each_valid_idx( + array.len(), + array.offset(), + null_count, + Some(buffer.deref()), + |idx| { + unsafe { sum = sum.add_checked(array.value_unchecked(idx))? }; + Ok::<_, ArrowError>(()) + }, + )?; + + Ok(Some(sum)) + } + } +} + #[cfg(feature = "simd")] mod simd { use super::is_nan; @@ -638,6 +733,9 @@ mod simd { /// Returns the sum of values in the primitive array. /// /// Returns `None` if the array is empty or only contains null values. +/// +/// This doesn't detect overflow in release mode by default. Once overflowing, the result will +/// wrap around. For an overflow-checking variant, use `sum_checked` instead. 
#[cfg(feature = "simd")] pub fn sum(array: &PrimitiveArray) -> Option where @@ -1216,4 +1314,21 @@ mod tests { let actual = max_binary(sliced_input); assert_eq!(actual, expected); } + + #[test] + #[cfg(not(feature = "simd"))] + fn test_sum_overflow() { + let a = Int32Array::from(vec![i32::MAX, 1]); + + assert_eq!(sum(&a).unwrap(), -2147483648); + assert_eq!(sum_array::(&a).unwrap(), -2147483648); + } + + #[test] + fn test_sum_checked_overflow() { + let a = Int32Array::from(vec![i32::MAX, 1]); + + sum_checked(&a).expect_err("overflow should be detected"); + sum_array_checked::(&a).expect_err("overflow should be detected"); + } } From 24e9b7c5ce76f1f46a95ef4bb8cfd2d55943221b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 5 Oct 2022 17:30:33 -0400 Subject: [PATCH 0107/1411] Tweak release instructions + scripts (#2828) --- dev/release/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev/release/README.md b/dev/release/README.md index 3ee7a7d5e0bf..18542c5f603d 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -250,6 +250,9 @@ Rust Arrow Crates: ```shell (cd arrow-buffer && cargo publish) +(cd arrow-schema && cargo publish) +(cd arrow-data && cargo publish) +(cd arrow-array && cargo publish) (cd arrow && cargo publish) (cd arrow-flight && cargo publish) (cd parquet && cargo publish) From a0a263f7cbe26ee7d72ecacd0e47a8c05279e5bd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 6 Oct 2022 06:30:59 +0100 Subject: [PATCH 0108/1411] Fix i256 checked multiplication (#2818) * Fix i256 checked multiplication * More specific tests, fix bugs --- arrow-buffer/src/bigint.rs | 210 +++++++++++++++++++++++++++---------- 1 file changed, 154 insertions(+), 56 deletions(-) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index a08d280ca883..df4cfd8ea594 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -65,6 +65,18 @@ impl i256 { high: -1, }; + /// The maximum value that can be represented by this integer type + pub const MAX: Self = i256 { + low: u128::MAX, + high: i128::MAX, + }; + + /// The minimum value that can be represented by this integer type + pub const MIN: Self = i256 { + low: u128::MIN, + high: i128::MIN, + }; + /// Create an integer value from its representation as a byte array in little-endian. 
#[inline] pub fn from_le_bytes(b: [u8; 32]) -> Self { @@ -129,6 +141,23 @@ impl i256 { } } + /// Computes the absolute value of this i256 + #[inline] + pub fn wrapping_abs(self) -> Self { + // -1 if negative, otherwise 0 + let sa = self.high >> 127; + let sa = Self::from_parts(sa as u128, sa); + + // Inverted if negative + Self::from_parts(self.low ^ sa.low, self.high ^ sa.high).wrapping_sub(sa) + } + + /// Computes the absolute value of this i256 returning `None` if `Self == Self::MIN` + #[inline] + pub fn checked_abs(self) -> Option { + (self != Self::MIN).then(|| self.wrapping_abs()) + } + /// Performs wrapping addition #[inline] pub fn wrapping_add(self, other: Self) -> Self { @@ -179,16 +208,34 @@ impl i256 { /// Performs checked multiplication #[inline] pub fn checked_mul(self, other: Self) -> Option { - let (low, high) = mulx(self.low, other.low); + // Shift sign bit down to construct mask of all set bits if negative + let l_sa = self.high >> 127; + let r_sa = other.high >> 127; + let out_sa = l_sa ^ r_sa; + + // Compute absolute values + let l_abs = self.wrapping_abs(); + let r_abs = other.wrapping_abs(); + + // Overflow if both high parts are non-zero + if l_abs.high != 0 && r_abs.high != 0 { + return None; + } + + // Perform checked multiplication on absolute values + let (low, high) = mulx(l_abs.low, r_abs.low); // Compute the high multiples, only impacting the high 128-bits - let hl = self.high.checked_mul(other.low as i128)?; - let lh = (self.low as i128).checked_mul(other.high)?; + let hl = (l_abs.high as u128).checked_mul(r_abs.low)?; + let lh = l_abs.low.checked_mul(r_abs.high as u128)?; - Some(Self { - low, - high: (high as i128).checked_add(hl)?.checked_add(lh)?, - }) + let high: i128 = high.checked_add(hl)?.checked_add(lh)?.try_into().ok()?; + + // Reverse absolute value, if necessary + let (low, c) = (low ^ out_sa as u128).overflowing_sub(out_sa as u128); + let high = (high ^ out_sa).wrapping_sub(out_sa).wrapping_sub(c as i128); + + Some(Self { low, high }) } /// Performs wrapping division @@ -272,7 +319,7 @@ fn mulx(a: u128, b: u128) -> (u128, u128) { #[cfg(test)] mod tests { use super::*; - use num::{BigInt, FromPrimitive, ToPrimitive}; + use num::{BigInt, FromPrimitive, Signed, ToPrimitive}; use rand::{thread_rng, Rng}; #[test] @@ -303,8 +350,106 @@ mod tests { } } + /// Tests operations against the two provided [`i256`] + fn test_ops(il: i256, ir: i256) { + let bl = BigInt::from_signed_bytes_le(&il.to_le_bytes()); + let br = BigInt::from_signed_bytes_le(&ir.to_le_bytes()); + + // Comparison + assert_eq!(il.cmp(&ir), bl.cmp(&br), "{} cmp {}", bl, br); + + // To i128 + assert_eq!(il.to_i128(), bl.to_i128(), "{}", bl); + assert_eq!(ir.to_i128(), br.to_i128(), "{}", br); + + // Absolute value + let (abs, overflow) = i256::from_bigint_with_overflow(bl.abs()); + assert_eq!(il.wrapping_abs(), abs); + assert_eq!(il.checked_abs().is_none(), overflow); + + let (abs, overflow) = i256::from_bigint_with_overflow(br.abs()); + assert_eq!(ir.wrapping_abs(), abs); + assert_eq!(ir.checked_abs().is_none(), overflow); + + // Addition + let actual = il.wrapping_add(ir); + let (expected, overflow) = + i256::from_bigint_with_overflow(bl.clone() + br.clone()); + assert_eq!(actual, expected); + + let checked = il.checked_add(ir); + match overflow { + true => assert!(checked.is_none()), + false => assert_eq!(checked.unwrap(), actual), + } + + // Subtraction + let actual = il.wrapping_sub(ir); + let (expected, overflow) = + i256::from_bigint_with_overflow(bl.clone() - br.clone()); + 
assert_eq!(actual.to_string(), expected.to_string()); + + let checked = il.checked_sub(ir); + match overflow { + true => assert!(checked.is_none()), + false => assert_eq!(checked.unwrap(), actual), + } + + // Multiplication + let actual = il.wrapping_mul(ir); + let (expected, overflow) = + i256::from_bigint_with_overflow(bl.clone() * br.clone()); + assert_eq!(actual.to_string(), expected.to_string()); + + let checked = il.checked_mul(ir); + match overflow { + true => assert!( + checked.is_none(), + "{} * {} = {} vs {} * {} = {}", + il, + ir, + actual, + bl, + br, + expected + ), + false => assert_eq!( + checked.unwrap(), + actual, + "{} * {} = {} vs {} * {} = {}", + il, + ir, + actual, + bl, + br, + expected + ), + } + } + #[test] fn test_i256() { + let candidates = [ + i256::from_parts(0, 0), + i256::from_parts(0, 1), + i256::from_parts(0, -1), + i256::from_parts(u128::MAX, 1), + i256::from_parts(u128::MAX, -1), + i256::from_parts(0, 1), + i256::from_parts(0, -1), + i256::from_parts(100, 32), + ]; + + for il in candidates { + for ir in candidates { + test_ops(il, ir) + } + } + } + + #[test] + #[cfg_attr(miri, ignore)] + fn test_i256_fuzz() { let mut rng = thread_rng(); for _ in 0..1000 { @@ -316,54 +461,7 @@ mod tests { let len = rng.gen_range(0..32); r.iter_mut().take(len).for_each(|x| *x = rng.gen()); - let il = i256::from_le_bytes(l); - let ir = i256::from_le_bytes(r); - - let bl = BigInt::from_signed_bytes_le(&l); - let br = BigInt::from_signed_bytes_le(&r); - - // Comparison - assert_eq!(il.cmp(&ir), bl.cmp(&br), "{} cmp {}", bl, br); - - // To i128 - assert_eq!(il.to_i128(), bl.to_i128(), "{}", bl); - assert_eq!(ir.to_i128(), br.to_i128(), "{}", br); - - // Addition - let actual = il.wrapping_add(ir); - let (expected, overflow) = - i256::from_bigint_with_overflow(bl.clone() + br.clone()); - assert_eq!(actual, expected); - - let checked = il.checked_add(ir); - match overflow { - true => assert!(checked.is_none()), - false => assert_eq!(checked.unwrap(), actual), - } - - // Subtraction - let actual = il.wrapping_sub(ir); - let (expected, overflow) = - i256::from_bigint_with_overflow(bl.clone() - br.clone()); - assert_eq!(actual.to_string(), expected.to_string()); - - let checked = il.checked_sub(ir); - match overflow { - true => assert!(checked.is_none()), - false => assert_eq!(checked.unwrap(), actual), - } - - // Multiplication - let actual = il.wrapping_mul(ir); - let (expected, overflow) = - i256::from_bigint_with_overflow(bl.clone() * br.clone()); - assert_eq!(actual.to_string(), expected.to_string()); - - let checked = il.checked_mul(ir); - match overflow { - true => assert!(checked.is_none()), - false => assert_eq!(checked.unwrap(), actual), - } + test_ops(i256::from_le_bytes(l), i256::from_le_bytes(r)) } } } From f8c40372242ab096cd53118021d016cac4f0daed Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 6 Oct 2022 10:51:27 +0100 Subject: [PATCH 0109/1411] Convert rows to arrays (#2677) (#2826) * Convert rows to arrays (#2677) * Review feedback * Clippy --- arrow/benches/row_format.rs | 140 +++------- arrow/src/row/dictionary.rs | 337 +++++++++++++++++++++++ arrow/src/row/fixed.rs | 239 ++++++++++++++-- arrow/src/row/interner.rs | 4 +- arrow/src/row/mod.rs | 528 +++++++++++++++++++++++++++++------- arrow/src/row/variable.rs | 140 +++++++++- 6 files changed, 1158 insertions(+), 230 deletions(-) create mode 100644 arrow/src/row/dictionary.rs diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs index 
ec872c127060..ff505781a0a1 100644 --- a/arrow/benches/row_format.rs +++ b/arrow/benches/row_format.rs @@ -20,161 +20,95 @@ extern crate criterion; extern crate core; use arrow::array::ArrayRef; -use arrow::datatypes::{DataType, Int64Type, UInt64Type}; +use arrow::datatypes::{Int64Type, UInt64Type}; use arrow::row::{RowConverter, SortField}; use arrow::util::bench_util::{ create_primitive_array, create_string_array_with_len, create_string_dict_array, }; use arrow_array::types::Int32Type; +use arrow_array::Array; use criterion::{black_box, Criterion}; use std::sync::Arc; -fn row_bench(c: &mut Criterion) { - let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; +fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { + let fields: Vec<_> = cols + .iter() + .map(|x| SortField::new(x.data_type().clone())) + .collect(); - c.bench_function("row_batch 4096 u64(0)", |b| { + c.bench_function(&format!("convert_columns {}", name), |b| { b.iter(|| { - let mut converter = RowConverter::new(vec![SortField::new(DataType::UInt64)]); - black_box(converter.convert_columns(&cols)) + let mut converter = RowConverter::new(fields.clone()); + black_box(converter.convert_columns(&cols).unwrap()) }); }); - let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; + let mut converter = RowConverter::new(fields); + let rows = converter.convert_columns(&cols).unwrap(); - c.bench_function("row_batch 4096 i64(0)", |b| { - b.iter(|| { - let mut converter = RowConverter::new(vec![SortField::new(DataType::Int64)]); - black_box(converter.convert_columns(&cols)) - }); + c.bench_function(&format!("convert_rows {}", name), |b| { + b.iter(|| black_box(converter.convert_rows(&rows).unwrap())); }); +} + +fn row_bench(c: &mut Criterion) { + let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; + do_bench(c, "4096 u64(0)", cols); + + let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; + do_bench(c, "4096 i64(0)", cols); let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 10)) as ArrayRef]; - - c.bench_function("row_batch 4096 string(10, 0)", |b| { - b.iter(|| { - let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); - black_box(converter.convert_columns(&cols)) - }); - }); + do_bench(c, "4096 string(10, 0)", cols); let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef]; - - c.bench_function("row_batch 4096 string(30, 0)", |b| { - b.iter(|| { - let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); - black_box(converter.convert_columns(&cols)) - }); - }); + do_bench(c, "4096 string(30, 0)", cols); let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 100)) as ArrayRef]; - - c.bench_function("row_batch 4096 string(100, 0)", |b| { - b.iter(|| { - let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); - black_box(converter.convert_columns(&cols)) - }); - }); + do_bench(c, "4096 string(100, 0)", cols); let cols = vec![Arc::new(create_string_array_with_len::(4096, 0.5, 100)) as ArrayRef]; - - c.bench_function("row_batch 4096 string(100, 0.5)", |b| { - b.iter(|| { - let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); - black_box(converter.convert_columns(&cols)) - }); - }); + do_bench(c, "4096 string(100, 0.5)", cols); let cols = vec![Arc::new(create_string_dict_array::(4096, 0., 10)) as ArrayRef]; - - c.bench_function("row_batch 4096 string_dictionary(10, 0)", |b| { - b.iter(|| { - let mut converter = 
RowConverter::new(vec![SortField::new(DataType::Utf8)]); - black_box(converter.convert_columns(&cols)) - }); - }); + do_bench(c, "4096 string_dictionary(10, 0)", cols); let cols = vec![Arc::new(create_string_dict_array::(4096, 0., 30)) as ArrayRef]; - - c.bench_function("row_batch 4096 string_dictionary(30, 0)", |b| { - b.iter(|| { - let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); - black_box(converter.convert_columns(&cols)) - }); - }); + do_bench(c, "4096 string_dictionary(30, 0)", cols); let cols = vec![Arc::new(create_string_dict_array::(4096, 0., 100)) as ArrayRef]; - - c.bench_function("row_batch 4096 string_dictionary(100, 0)", |b| { - b.iter(|| { - let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); - black_box(converter.convert_columns(&cols)) - }); - }); + do_bench(c, "4096 string_dictionary(100, 0)", cols); let cols = vec![Arc::new(create_string_dict_array::(4096, 0.5, 100)) as ArrayRef]; + do_bench(c, "4096 string_dictionary(100, 0.5)", cols); - c.bench_function("row_batch 4096 string_dictionary(100, 0.5)", |b| { - b.iter(|| { - let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); - black_box(converter.convert_columns(&cols)) - }); - }); - - let cols = [ + let cols = vec![ Arc::new(create_string_array_with_len::(4096, 0.5, 20)) as ArrayRef, Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef, Arc::new(create_string_array_with_len::(4096, 0., 100)) as ArrayRef, Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef, ]; - - let fields = [ - SortField::new(DataType::Utf8), - SortField::new(DataType::Utf8), - SortField::new(DataType::Utf8), - SortField::new(DataType::Int64), - ]; - - c.bench_function( - "row_batch 4096 string(20, 0.5), string(30, 0), string(100, 0), i64(0)", - |b| { - b.iter(|| { - let mut converter = RowConverter::new(fields.to_vec()); - black_box(converter.convert_columns(&cols)) - }); - }, + do_bench( + c, + "4096 string(20, 0.5), string(30, 0), string(100, 0), i64(0)", + cols, ); - let cols = [ + let cols = vec![ Arc::new(create_string_dict_array::(4096, 0.5, 20)) as ArrayRef, Arc::new(create_string_dict_array::(4096, 0., 30)) as ArrayRef, Arc::new(create_string_dict_array::(4096, 0., 100)) as ArrayRef, Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef, ]; - - let fields = [ - SortField::new(DataType::Utf8), - SortField::new(DataType::Utf8), - SortField::new(DataType::Utf8), - SortField::new(DataType::Int64), - ]; - - c.bench_function( - "row_batch 4096 string_dictionary(20, 0.5), string_dictionary(30, 0), string_dictionary(100, 0), i64(0)", - |b| { - b.iter(|| { - let mut converter = RowConverter::new(fields.to_vec()); - black_box(converter.convert_columns(&cols)) - }); - }, - ); + do_bench(c, "4096 4096 string_dictionary(20, 0.5), string_dictionary(30, 0), string_dictionary(100, 0), i64(0)", cols); } criterion_group!(benches, row_bench); diff --git a/arrow/src/row/dictionary.rs b/arrow/src/row/dictionary.rs new file mode 100644 index 000000000000..4a048fbce86d --- /dev/null +++ b/arrow/src/row/dictionary.rs @@ -0,0 +1,337 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::compute::SortOptions; +use crate::row::fixed::{FixedLengthEncoding, FromSlice, RawDecimal}; +use crate::row::interner::{Interned, OrderPreservingInterner}; +use crate::row::{null_sentinel, Rows}; +use arrow_array::builder::*; +use arrow_array::cast::*; +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::{ArrowNativeType, MutableBuffer, ToByteSlice}; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; +use std::collections::hash_map::Entry; +use std::collections::HashMap; + +/// Computes the dictionary mapping for the given dictionary values +pub fn compute_dictionary_mapping( + interner: &mut OrderPreservingInterner, + values: &ArrayRef, +) -> Result>, ArrowError> { + Ok(downcast_primitive_array! { + values => interner + .intern(values.iter().map(|x| x.map(|x| x.encode()))), + DataType::Binary => { + let iter = as_generic_binary_array::(values).iter(); + interner.intern(iter) + } + DataType::LargeBinary => { + let iter = as_generic_binary_array::(values).iter(); + interner.intern(iter) + } + DataType::Utf8 => { + let iter = as_string_array(values).iter().map(|x| x.map(|x| x.as_bytes())); + interner.intern(iter) + } + DataType::LargeUtf8 => { + let iter = as_largestring_array(values).iter().map(|x| x.map(|x| x.as_bytes())); + interner.intern(iter) + } + t => return Err(ArrowError::NotYetImplemented(format!("dictionary value {} is not supported", t))), + }) +} + +/// Dictionary types are encoded as +/// +/// - single `0_u8` if null +/// - the bytes of the corresponding normalized key including the null terminator +pub fn encode_dictionary( + out: &mut Rows, + column: &DictionaryArray, + normalized_keys: &[Option<&[u8]>], + opts: SortOptions, +) { + for (offset, k) in out.offsets.iter_mut().skip(1).zip(column.keys()) { + match k.and_then(|k| normalized_keys[k.as_usize()]) { + Some(normalized_key) => { + let end_offset = *offset + 1 + normalized_key.len(); + out.buffer[*offset] = 1; + out.buffer[*offset + 1..end_offset].copy_from_slice(normalized_key); + // Negate if descending + if opts.descending { + out.buffer[*offset..end_offset] + .iter_mut() + .for_each(|v| *v = !*v) + } + *offset = end_offset; + } + None => { + out.buffer[*offset] = null_sentinel(opts); + *offset += 1; + } + } + } +} + +/// Decodes a string array from `rows` with the provided `options` +/// +/// # Safety +/// +/// `interner` must contain valid data for the provided `value_type` +pub unsafe fn decode_dictionary( + interner: &OrderPreservingInterner, + value_type: &DataType, + options: SortOptions, + rows: &mut [&[u8]], +) -> Result, ArrowError> { + let len = rows.len(); + let mut dictionary: HashMap = HashMap::with_capacity(len); + + let null_sentinel = null_sentinel(options); + + // If descending, the null terminator will have been negated + let null_terminator = match options.descending { + true => 0xFF, + false => 0_u8, + }; + + let mut null_builder = BooleanBufferBuilder::new(len); + let mut keys = BufferBuilder::::new(len); + let mut values = Vec::with_capacity(len); + let mut null_count = 0; + let mut 
key_scratch = Vec::new(); + + for row in rows { + if row[0] == null_sentinel { + null_builder.append(false); + null_count += 1; + *row = &row[1..]; + keys.append(K::Native::default()); + continue; + } + + let key_offset = row + .iter() + .skip(1) + .position(|x| *x == null_terminator) + .unwrap(); + + // Extract the normalized key including the null terminator + let key = &row[1..key_offset + 2]; + *row = &row[key_offset + 2..]; + + let interned = match options.descending { + true => { + // If options.descending the normalized key will have been + // negated we must first reverse this + key_scratch.clear(); + key_scratch.extend_from_slice(key); + key_scratch.iter_mut().for_each(|o| *o = !*o); + interner.lookup(&key_scratch).unwrap() + } + false => interner.lookup(key).unwrap(), + }; + + let k = match dictionary.entry(interned) { + Entry::Vacant(v) => { + let k = values.len(); + values.push(interner.value(interned)); + let key = K::Native::from_usize(k) + .ok_or(ArrowError::DictionaryKeyOverflowError)?; + *v.insert(key) + } + Entry::Occupied(o) => *o.get(), + }; + + keys.append(k); + null_builder.append(true); + } + + let child = match &value_type { + DataType::Null => NullArray::new(values.len()).into_data(), + DataType::Boolean => decode_bool(&values), + DataType::Int8 => decode_primitive::(&values), + DataType::Int16 => decode_primitive::(&values), + DataType::Int32 => decode_primitive::(&values), + DataType::Int64 => decode_primitive::(&values), + DataType::UInt8 => decode_primitive::(&values), + DataType::UInt16 => decode_primitive::(&values), + DataType::UInt32 => decode_primitive::(&values), + DataType::UInt64 => decode_primitive::(&values), + DataType::Float16 => decode_primitive::(&values), + DataType::Float32 => decode_primitive::(&values), + DataType::Float64 => decode_primitive::(&values), + DataType::Timestamp(TimeUnit::Second, _) => { + decode_primitive::(&values) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + decode_primitive::(&values) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + decode_primitive::(&values) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + decode_primitive::(&values) + } + DataType::Date32 => decode_primitive::(&values), + DataType::Date64 => decode_primitive::(&values), + DataType::Time32(t) => match t { + TimeUnit::Second => decode_primitive::(&values), + TimeUnit::Millisecond => decode_primitive::(&values), + _ => unreachable!(), + }, + DataType::Time64(t) => match t { + TimeUnit::Microsecond => decode_primitive::(&values), + TimeUnit::Nanosecond => decode_primitive::(&values), + _ => unreachable!(), + }, + DataType::Duration(TimeUnit::Second) => { + decode_primitive::(&values) + } + DataType::Duration(TimeUnit::Millisecond) => { + decode_primitive::(&values) + } + DataType::Duration(TimeUnit::Microsecond) => { + decode_primitive::(&values) + } + DataType::Duration(TimeUnit::Nanosecond) => { + decode_primitive::(&values) + } + DataType::Interval(IntervalUnit::DayTime) => { + decode_primitive::(&values) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + decode_primitive::(&values) + } + DataType::Interval(IntervalUnit::YearMonth) => { + decode_primitive::(&values) + } + DataType::Decimal128(p, s) => { + decode_decimal::<16, Decimal128Type>(&values, *p, *s) + } + DataType::Decimal256(p, s) => { + decode_decimal::<32, Decimal256Type>(&values, *p, *s) + } + DataType::Utf8 => decode_string::(&values), + DataType::LargeUtf8 => decode_string::(&values), + DataType::Binary => decode_binary::(&values), + 
DataType::LargeBinary => decode_binary::(&values), + _ => { + return Err(ArrowError::NotYetImplemented(format!( + "decoding dictionary values of {}", + value_type + ))) + } + }; + + let data_type = + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(value_type.clone())); + + let builder = ArrayDataBuilder::new(data_type) + .len(len) + .null_bit_buffer(Some(null_builder.finish())) + .null_count(null_count) + .add_buffer(keys.finish()) + .add_child_data(child); + + Ok(DictionaryArray::from(builder.build_unchecked())) +} + +/// Decodes a binary array from dictionary values +/// +/// # Safety +/// +/// Values must be valid UTF-8 +fn decode_binary(values: &[&[u8]]) -> ArrayData { + let capacity = values.iter().map(|x| x.len()).sum(); + let mut builder = GenericBinaryBuilder::::with_capacity(values.len(), capacity); + for v in values { + builder.append_value(v) + } + builder.finish().into_data() +} + +/// Decodes a string array from dictionary values +/// +/// # Safety +/// +/// Values must be valid UTF-8 +unsafe fn decode_string(values: &[&[u8]]) -> ArrayData { + let d = match O::IS_LARGE { + true => DataType::LargeUtf8, + false => DataType::Utf8, + }; + + decode_binary::(values) + .into_builder() + .data_type(d) + .build_unchecked() +} + +/// Decodes a boolean array from dictionary values +fn decode_bool(values: &[&[u8]]) -> ArrayData { + let mut builder = BooleanBufferBuilder::new(values.len()); + for value in values { + builder.append(bool::decode([value[0]])) + } + + let builder = ArrayDataBuilder::new(DataType::Boolean) + .len(values.len()) + .add_buffer(builder.finish()); + + // SAFETY: Buffers correct length + unsafe { builder.build_unchecked() } +} + +/// Decodes a fixed length type array from dictionary values +fn decode_fixed( + values: &[&[u8]], + data_type: DataType, +) -> ArrayData { + let mut buffer = MutableBuffer::new(std::mem::size_of::() * values.len()); + + for value in values { + let value = T::Encoded::from_slice(value, false); + buffer.push(T::decode(value)) + } + + let builder = ArrayDataBuilder::new(data_type) + .len(values.len()) + .add_buffer(buffer.into()); + + // SAFETY: Buffers correct length + unsafe { builder.build_unchecked() } +} + +/// Decodes a `PrimitiveArray` from dictionary values +fn decode_primitive(values: &[&[u8]]) -> ArrayData +where + T::Native: FixedLengthEncoding, +{ + decode_fixed::(values, T::DATA_TYPE) +} + +/// Decodes a `DecimalArray` from dictionary values +fn decode_decimal( + values: &[&[u8]], + precision: u8, + scale: u8, +) -> ArrayData { + decode_fixed::>(values, T::TYPE_CONSTRUCTOR(precision, scale)) +} diff --git a/arrow/src/row/fixed.rs b/arrow/src/row/fixed.rs index 04b9a30ecad8..9952ee094bfb 100644 --- a/arrow/src/row/fixed.rs +++ b/arrow/src/row/fixed.rs @@ -18,18 +18,39 @@ use crate::array::PrimitiveArray; use crate::compute::SortOptions; use crate::datatypes::ArrowPrimitiveType; -use crate::row::Rows; -use crate::util::decimal::{Decimal128, Decimal256}; +use crate::row::{null_sentinel, Rows}; +use arrow_array::types::DecimalType; +use arrow_array::{BooleanArray, DecimalArray}; +use arrow_buffer::{bit_util, MutableBuffer, ToByteSlice}; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::DataType; use half::f16; +pub trait FromSlice { + fn from_slice(slice: &[u8], invert: bool) -> Self; +} + +impl FromSlice for [u8; N] { + #[inline] + fn from_slice(slice: &[u8], invert: bool) -> Self { + let mut t: Self = slice.try_into().unwrap(); + if invert { + t.iter_mut().for_each(|o| *o = !*o); + } + t + } +} + /// Encodes a 
value of a particular fixed width type into bytes according to the rules /// described on [`super::RowConverter`] pub trait FixedLengthEncoding: Copy { const ENCODED_LEN: usize = 1 + std::mem::size_of::(); - type Encoded: Sized + Copy + AsRef<[u8]> + AsMut<[u8]>; + type Encoded: Sized + Copy + FromSlice + AsRef<[u8]> + AsMut<[u8]>; fn encode(self) -> Self::Encoded; + + fn decode(encoded: Self::Encoded) -> Self; } impl FixedLengthEncoding for bool { @@ -38,6 +59,10 @@ impl FixedLengthEncoding for bool { fn encode(self) -> [u8; 1] { [self as u8] } + + fn decode(encoded: Self::Encoded) -> Self { + encoded[0] != 0 + } } macro_rules! encode_signed { @@ -51,6 +76,12 @@ macro_rules! encode_signed { b[0] ^= 0x80; b } + + fn decode(mut encoded: Self::Encoded) -> Self { + // Toggle top "sign" bit + encoded[0] ^= 0x80; + Self::from_be_bytes(encoded) + } } }; } @@ -69,6 +100,10 @@ macro_rules! encode_unsigned { fn encode(self) -> [u8; $n] { self.to_be_bytes() } + + fn decode(encoded: Self::Encoded) -> Self { + Self::from_be_bytes(encoded) + } } }; } @@ -87,6 +122,12 @@ impl FixedLengthEncoding for f16 { let val = s ^ (((s >> 15) as u16) >> 1) as i16; val.encode() } + + fn decode(encoded: Self::Encoded) -> Self { + let bits = i16::decode(encoded); + let val = bits ^ (((bits >> 15) as u16) >> 1) as i16; + Self::from_bits(val as u16) + } } impl FixedLengthEncoding for f32 { @@ -98,6 +139,12 @@ impl FixedLengthEncoding for f32 { let val = s ^ (((s >> 31) as u32) >> 1) as i32; val.encode() } + + fn decode(encoded: Self::Encoded) -> Self { + let bits = i32::decode(encoded); + let val = bits ^ (((bits >> 31) as u32) >> 1) as i32; + Self::from_bits(val as u32) + } } impl FixedLengthEncoding for f64 { @@ -109,32 +156,44 @@ impl FixedLengthEncoding for f64 { let val = s ^ (((s >> 63) as u64) >> 1) as i64; val.encode() } + + fn decode(encoded: Self::Encoded) -> Self { + let bits = i64::decode(encoded); + let val = bits ^ (((bits >> 63) as u64) >> 1) as i64; + Self::from_bits(val as u64) + } } -impl FixedLengthEncoding for Decimal128 { - type Encoded = [u8; 16]; +pub type RawDecimal128 = RawDecimal<16>; +pub type RawDecimal256 = RawDecimal<32>; - fn encode(self) -> [u8; 16] { - let mut val = *self.raw_value(); - // Convert to big endian representation - val.reverse(); - // Toggle top "sign" bit to ensure consistent sort order - val[0] ^= 0x80; - val +/// The raw bytes of a decimal +#[derive(Copy, Clone)] +pub struct RawDecimal(pub [u8; N]); + +impl ToByteSlice for RawDecimal { + fn to_byte_slice(&self) -> &[u8] { + &self.0 } } -impl FixedLengthEncoding for Decimal256 { - type Encoded = [u8; 32]; +impl FixedLengthEncoding for RawDecimal { + type Encoded = [u8; N]; - fn encode(self) -> [u8; 32] { - let mut val = *self.raw_value(); + fn encode(self) -> [u8; N] { + let mut val = self.0; // Convert to big endian representation val.reverse(); // Toggle top "sign" bit to ensure consistent sort order val[0] ^= 0x80; val } + + fn decode(mut encoded: Self::Encoded) -> Self { + encoded[0] ^= 0x80; + encoded.reverse(); + Self(encoded) + } } /// Returns the total encoded length (including null byte) for a value of type `T::Native` @@ -166,9 +225,153 @@ pub fn encode>>( encoded.as_mut().iter_mut().for_each(|v| *v = !*v) } to_write[1..].copy_from_slice(encoded.as_ref()) - } else if !opts.nulls_first { - out.buffer[*offset] = 0xFF; + } else { + out.buffer[*offset] = null_sentinel(opts); } *offset = end_offset; } } + +/// Splits `len` bytes from `src` +#[inline] +fn split_off<'a>(src: &mut &'a [u8], len: usize) -> &'a [u8] { + 
let v = &src[..len]; + *src = &src[len..]; + v +} + +/// Decodes a `BooleanArray` from rows +pub fn decode_bool(rows: &mut [&[u8]], options: SortOptions) -> BooleanArray { + let true_val = match options.descending { + true => !1, + false => 1, + }; + + let len = rows.len(); + + let mut null_count = 0; + let mut nulls = MutableBuffer::new(bit_util::ceil(len, 64) * 8); + let mut values = MutableBuffer::new(bit_util::ceil(len, 64) * 8); + + let chunks = len / 64; + let remainder = len % 64; + for chunk in 0..chunks { + let mut null_packed = 0; + let mut values_packed = 0; + + for bit_idx in 0..64 { + let i = split_off(&mut rows[bit_idx + chunk * 64], 2); + let (null, value) = (i[0] == 1, i[1] == true_val); + null_count += !null as usize; + null_packed |= (null as u64) << bit_idx; + values_packed |= (value as u64) << bit_idx; + } + + nulls.push(null_packed); + values.push(values_packed); + } + + if remainder != 0 { + let mut null_packed = 0; + let mut values_packed = 0; + + for bit_idx in 0..remainder { + let i = split_off(&mut rows[bit_idx + chunks * 64], 2); + let (null, value) = (i[0] == 1, i[1] == true_val); + null_count += !null as usize; + null_packed |= (null as u64) << bit_idx; + values_packed |= (value as u64) << bit_idx; + } + + nulls.push(null_packed); + values.push(values_packed); + } + + let builder = ArrayDataBuilder::new(DataType::Boolean) + .len(rows.len()) + .null_count(null_count) + .add_buffer(values.into()) + .null_bit_buffer(Some(nulls.into())); + + // SAFETY: + // Buffers are the correct length + unsafe { BooleanArray::from(builder.build_unchecked()) } +} + +/// Decodes a `ArrayData` from rows based on the provided `FixedLengthEncoding` `T` +fn decode_fixed( + rows: &mut [&[u8]], + data_type: DataType, + options: SortOptions, +) -> ArrayData { + let len = rows.len(); + + let mut null_count = 0; + let mut nulls = MutableBuffer::new(bit_util::ceil(len, 64) * 8); + let mut values = MutableBuffer::new(std::mem::size_of::() * len); + + let chunks = len / 64; + let remainder = len % 64; + for chunk in 0..chunks { + let mut null_packed = 0; + + for bit_idx in 0..64 { + let i = split_off(&mut rows[bit_idx + chunk * 64], T::ENCODED_LEN); + let null = i[0] == 1; + null_count += !null as usize; + null_packed |= (null as u64) << bit_idx; + + let value = T::Encoded::from_slice(&i[1..], options.descending); + values.push(T::decode(value)); + } + + nulls.push(null_packed); + } + + if remainder != 0 { + let mut null_packed = 0; + + for bit_idx in 0..remainder { + let i = split_off(&mut rows[bit_idx + chunks * 64], T::ENCODED_LEN); + let null = i[0] == 1; + null_count += !null as usize; + null_packed |= (null as u64) << bit_idx; + + let value = T::Encoded::from_slice(&i[1..], options.descending); + values.push(T::decode(value)); + } + + nulls.push(null_packed); + } + + let builder = ArrayDataBuilder::new(data_type) + .len(rows.len()) + .null_count(null_count) + .add_buffer(values.into()) + .null_bit_buffer(Some(nulls.into())); + + // SAFETY: Buffers correct length + unsafe { builder.build_unchecked() } +} + +/// Decodes a `DecimalArray` from rows +pub fn decode_decimal( + rows: &mut [&[u8]], + options: SortOptions, + precision: u8, + scale: u8, +) -> DecimalArray { + decode_fixed::>(rows, T::TYPE_CONSTRUCTOR(precision, scale), options) + .into() +} + +/// Decodes a `PrimitiveArray` from rows +pub fn decode_primitive( + rows: &mut [&[u8]], + options: SortOptions, +) -> PrimitiveArray +where + T::Native: FixedLengthEncoding + ToByteSlice, +{ + decode_fixed::(rows, T::DATA_TYPE, 
options).into() +} diff --git a/arrow/src/row/interner.rs b/arrow/src/row/interner.rs index e48670984c98..156d23465bfd 100644 --- a/arrow/src/row/interner.rs +++ b/arrow/src/row/interner.rs @@ -22,7 +22,7 @@ use std::num::NonZeroU32; use std::ops::Index; /// An interned value -#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] pub struct Interned(NonZeroU32); // We use NonZeroU32 so that `Option` is 32 bits /// A byte array interner that generates normalized keys that are sorted with respect @@ -132,7 +132,6 @@ impl OrderPreservingInterner { /// Converts a normalized key returned by [`Self::normalized_key`] to [`Interned`] /// returning `None` if it cannot be found - #[allow(dead_code)] pub fn lookup(&self, normalized_key: &[u8]) -> Option { let len = normalized_key.len(); @@ -159,7 +158,6 @@ impl OrderPreservingInterner { } /// Returns the interned value for a given [`Interned`] - #[allow(dead_code)] pub fn value(&self, key: Interned) -> &[u8] { self.values.index(key) } diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 88c8a9166631..f5ac570320bd 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -17,17 +17,28 @@ //! A comparable row-oriented representation of a collection of [`Array`] -use crate::array::{ - as_boolean_array, as_generic_binary_array, as_largestring_array, as_string_array, - Array, ArrayRef, Decimal128Array, Decimal256Array, -}; +use std::cmp::Ordering; +use std::hash::{Hash, Hasher}; +use std::sync::Arc; + +use arrow_array::cast::*; +use arrow_array::*; + use crate::compute::SortOptions; use crate::datatypes::*; use crate::error::{ArrowError, Result}; -use crate::row::interner::{Interned, OrderPreservingInterner}; -use crate::util::decimal::{Decimal128, Decimal256}; +use crate::row::dictionary::{ + compute_dictionary_mapping, decode_dictionary, encode_dictionary, +}; +use crate::row::fixed::{ + decode_bool, decode_decimal, decode_primitive, RawDecimal, RawDecimal128, + RawDecimal256, +}; +use crate::row::interner::OrderPreservingInterner; +use crate::row::variable::{decode_binary, decode_string}; use crate::{downcast_dictionary_array, downcast_primitive_array}; +mod dictionary; mod fixed; mod interner; mod variable; @@ -134,13 +145,13 @@ mod variable; /// [byte stuffing]:[https://en.wikipedia.org/wiki/High-Level_Data_Link_Control#Asynchronous_framing] #[derive(Debug)] pub struct RowConverter { - fields: Vec, + fields: Arc<[SortField]>, /// interning state for column `i`, if column`i` is a dictionary interners: Vec>>, } /// Configure the data type and sort order for a given column -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct SortField { /// Sort options options: SortOptions, @@ -164,7 +175,10 @@ impl RowConverter { /// Create a new [`RowConverter`] with the provided schema pub fn new(fields: Vec) -> Self { let interners = (0..fields.len()).map(|_| None).collect(); - Self { fields, interners } + Self { + fields: fields.into(), + interners, + } } /// Convert [`ArrayRef`] columns into [`Rows`] @@ -186,7 +200,7 @@ impl RowConverter { let dictionaries = columns .iter() .zip(&mut self.interners) - .zip(&self.fields) + .zip(self.fields.iter()) .map(|((column, interner), field)| { if !column.data_type().equals_datatype(&field.data_type) { return Err(ArrowError::InvalidArgumentError(format!( @@ -214,10 +228,10 @@ impl RowConverter { }) .collect::>>()?; - let mut rows = new_empty_rows(columns, &dictionaries)?; + let mut rows = new_empty_rows(columns, &dictionaries, 
Arc::clone(&self.fields))?; for ((column, field), dictionary) in - columns.iter().zip(&self.fields).zip(dictionaries) + columns.iter().zip(self.fields.iter()).zip(dictionaries) { // We encode a column at a time to minimise dispatch overheads encode_column(&mut rows, column, field.options, dictionary.as_deref()) @@ -227,11 +241,44 @@ impl RowConverter { assert_eq!(*rows.offsets.last().unwrap(), rows.buffer.len()); rows.offsets .windows(2) - .for_each(|w| assert!(w[0] < w[1], "offsets should be monotonic")); + .for_each(|w| assert!(w[0] <= w[1], "offsets should be monotonic")); } Ok(rows) } + + /// Convert [`Rows`] columns into [`ArrayRef`] + /// + /// # Panics + /// + /// Panics if the rows were not produced by this [`RowConverter`] + pub fn convert_rows<'a, I>(&self, rows: I) -> Result> + where + I: IntoIterator>, + { + let mut rows: Vec<_> = rows + .into_iter() + .map(|row| { + assert!( + Arc::ptr_eq(row.fields, &self.fields), + "rows were not produced by this RowConverter" + ); + + row.data + }) + .collect(); + + self.fields + .iter() + .zip(&self.interners) + .map(|(field, interner)| { + // SAFETY + // We have validated that the rows came from this [`RowConverter`] + // and therefore must be valid + unsafe { decode_column(field, &mut rows, interner.as_deref()) } + }) + .collect() + } } /// A row-oriented representation of arrow data, that is normalized for comparison @@ -243,13 +290,18 @@ pub struct Rows { buffer: Box<[u8]>, /// Row `i` has data `&buffer[offsets[i]..offsets[i+1]]` offsets: Box<[usize]>, + /// The schema for these rows + fields: Arc<[SortField]>, } impl Rows { pub fn row(&self, row: usize) -> Row<'_> { let end = self.offsets[row + 1]; let start = self.offsets[row]; - Row(&self.buffer[start..end]) + Row { + data: &self.buffer[start..end], + fields: &self.fields, + } } pub fn num_rows(&self) -> usize { @@ -257,54 +309,127 @@ impl Rows { } } +impl<'a> IntoIterator for &'a Rows { + type Item = Row<'a>; + type IntoIter = RowsIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + RowsIter { + rows: self, + start: 0, + end: self.num_rows(), + } + } +} + +/// An iterator over [`Rows`] +#[derive(Debug)] +pub struct RowsIter<'a> { + rows: &'a Rows, + start: usize, + end: usize, +} + +impl<'a> Iterator for RowsIter<'a> { + type Item = Row<'a>; + + fn next(&mut self) -> Option { + if self.end == self.start { + return None; + } + let row = self.rows.row(self.start); + self.start += 1; + Some(row) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.len(); + (len, Some(len)) + } +} + +impl<'a> ExactSizeIterator for RowsIter<'a> { + fn len(&self) -> usize { + self.end - self.start + } +} + +impl<'a> DoubleEndedIterator for RowsIter<'a> { + fn next_back(&mut self) -> Option { + if self.end == self.start { + return None; + } + let row = self.rows.row(self.end); + self.end -= 1; + Some(row) + } +} + /// A comparable representation of a row /// /// Two [`Row`] can be compared if they both belong to [`Rows`] returned by calls to /// [`RowConverter::convert_columns`] on the same [`RowConverter`] /// /// Otherwise any ordering established by comparing the [`Row`] is arbitrary -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct Row<'a>(&'a [u8]); +#[derive(Debug, Copy, Clone)] +pub struct Row<'a> { + data: &'a [u8], + fields: &'a Arc<[SortField]>, +} + +// Manually derive these as don't wish to include `fields` + +impl<'a> PartialEq for Row<'a> { + #[inline] + fn eq(&self, other: &Self) -> bool { + self.data.eq(other.data) + } +} + +impl<'a> Eq for 
Row<'a> {} + +impl<'a> PartialOrd for Row<'a> { + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + self.data.partial_cmp(other.data) + } +} + +impl<'a> Ord for Row<'a> { + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + self.data.cmp(other.data) + } +} + +impl<'a> Hash for Row<'a> { + #[inline] + fn hash(&self, state: &mut H) { + self.data.hash(state) + } +} impl<'a> AsRef<[u8]> for Row<'a> { + #[inline] fn as_ref(&self) -> &[u8] { - self.0 + self.data } } -/// Computes the dictionary mapping for the given dictionary values -fn compute_dictionary_mapping( - interner: &mut OrderPreservingInterner, - values: &ArrayRef, -) -> Result>> { - use fixed::FixedLengthEncoding; - Ok(downcast_primitive_array! { - values => interner - .intern(values.iter().map(|x| x.map(|x| x.encode()))), - DataType::Binary => { - let iter = as_generic_binary_array::(values).iter(); - interner.intern(iter) - } - DataType::LargeBinary => { - let iter = as_generic_binary_array::(values).iter(); - interner.intern(iter) - } - DataType::Utf8 => { - let iter = as_string_array(values).iter().map(|x| x.map(|x| x.as_bytes())); - interner.intern(iter) - } - DataType::LargeUtf8 => { - let iter = as_largestring_array(values).iter().map(|x| x.map(|x| x.as_bytes())); - interner.intern(iter) - } - t => return Err(ArrowError::NotYetImplemented(format!("dictionary value {} is not supported", t))), - }) +/// Returns the null sentinel, negated if `invert` is true +#[inline] +fn null_sentinel(options: SortOptions) -> u8 { + match options.nulls_first { + true => 0, + false => 0xFF, + } } /// Computes the length of each encoded [`Rows`] and returns an empty [`Rows`] fn new_empty_rows( cols: &[ArrayRef], dictionaries: &[Option>>], + fields: Arc<[SortField]>, ) -> Result { use fixed::FixedLengthEncoding; @@ -314,10 +439,10 @@ fn new_empty_rows( for (array, dict) in cols.iter().zip(dictionaries) { downcast_primitive_array! { array => lengths.iter_mut().for_each(|x| *x += fixed::encoded_len(array)), - DataType::Null => lengths.iter_mut().for_each(|x| *x += 1), + DataType::Null => {}, DataType::Boolean => lengths.iter_mut().for_each(|x| *x += bool::ENCODED_LEN), - DataType::Decimal128(_, _) => lengths.iter_mut().for_each(|x| *x += Decimal128::ENCODED_LEN), - DataType::Decimal256(_, _) => lengths.iter_mut().for_each(|x| *x += Decimal256::ENCODED_LEN), + DataType::Decimal128(_, _) => lengths.iter_mut().for_each(|x| *x += RawDecimal128::ENCODED_LEN), + DataType::Decimal256(_, _) => lengths.iter_mut().for_each(|x| *x += RawDecimal256::ENCODED_LEN), DataType::Binary => as_generic_binary_array::(array) .iter() .zip(lengths.iter_mut()) @@ -383,6 +508,7 @@ fn new_empty_rows( Ok(Rows { buffer: buffer.into(), offsets: offsets.into(), + fields, }) } @@ -395,20 +521,28 @@ fn encode_column( ) { downcast_primitive_array! 
{ column => fixed::encode(out, column, opts), - DataType::Null => { - fixed::encode(out, std::iter::repeat(None::).take(column.len()), opts) - } + DataType::Null => {} DataType::Boolean => fixed::encode(out, as_boolean_array(column), opts), - DataType::Decimal128(_, _) => fixed::encode( - out, - column.as_any().downcast_ref::().unwrap(), - opts, - ), - DataType::Decimal256(_, _) => fixed::encode( - out, - column.as_any().downcast_ref::().unwrap(), - opts, - ), + DataType::Decimal128(_, _) => { + let iter = column + .as_any() + .downcast_ref::() + .unwrap() + .into_iter() + .map(|x| x.map(|x| RawDecimal(*x.raw_value()))); + + fixed::encode(out, iter, opts) + }, + DataType::Decimal256(_, _) => { + let iter = column + .as_any() + .downcast_ref::() + .unwrap() + .into_iter() + .map(|x| x.map(|x| RawDecimal(*x.raw_value()))); + + fixed::encode(out, iter, opts) + }, DataType::Binary => { variable::encode(out, as_generic_binary_array::(column).iter(), opts) } @@ -428,37 +562,183 @@ fn encode_column( opts, ), DataType::Dictionary(_, _) => downcast_dictionary_array! { - column => { - let dict = dictionary.unwrap(); - for (offset, k) in out.offsets.iter_mut().skip(1).zip(column.keys()) { - match k.and_then(|k| dict[k as usize]) { - Some(v) => { - let end_offset = *offset + 1 + v.len(); - out.buffer[*offset] = 1; - out.buffer[*offset+1..end_offset].copy_from_slice(v); - if opts.descending { - out.buffer[*offset..end_offset].iter_mut().for_each(|v| *v = !*v) - } - *offset = end_offset; - } - None => { - if !opts.nulls_first { - out.buffer[*offset] = 0xFF; - } - *offset += 1; - } - } - } - }, + column => encode_dictionary(out, column, dictionary.unwrap(), opts), _ => unreachable!() } t => unimplemented!("not yet implemented: {}", t) } } +/// Decodes a the provided `field` from `rows` +/// +/// # Safety +/// +/// Rows must contain valid data for the provided field +unsafe fn decode_column( + field: &SortField, + rows: &mut [&[u8]], + interner: Option<&OrderPreservingInterner>, +) -> Result { + let options = field.options; + let array: ArrayRef = match &field.data_type { + DataType::Null => Arc::new(NullArray::new(rows.len())), + DataType::Boolean => Arc::new(decode_bool(rows, options)), + DataType::Int8 => Arc::new(decode_primitive::(rows, options)), + DataType::Int16 => Arc::new(decode_primitive::(rows, options)), + DataType::Int32 => Arc::new(decode_primitive::(rows, options)), + DataType::Int64 => Arc::new(decode_primitive::(rows, options)), + DataType::UInt8 => Arc::new(decode_primitive::(rows, options)), + DataType::UInt16 => Arc::new(decode_primitive::(rows, options)), + DataType::UInt32 => Arc::new(decode_primitive::(rows, options)), + DataType::UInt64 => Arc::new(decode_primitive::(rows, options)), + DataType::Float16 => Arc::new(decode_primitive::(rows, options)), + DataType::Float32 => Arc::new(decode_primitive::(rows, options)), + DataType::Float64 => Arc::new(decode_primitive::(rows, options)), + DataType::Timestamp(TimeUnit::Second, _) => { + Arc::new(decode_primitive::(rows, options)) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + Arc::new(decode_primitive::(rows, options)) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + Arc::new(decode_primitive::(rows, options)) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + Arc::new(decode_primitive::(rows, options)) + } + DataType::Date32 => Arc::new(decode_primitive::(rows, options)), + DataType::Date64 => Arc::new(decode_primitive::(rows, options)), + DataType::Time32(t) => match t { + TimeUnit::Second => { + 
Arc::new(decode_primitive::(rows, options)) + } + TimeUnit::Millisecond => { + Arc::new(decode_primitive::(rows, options)) + } + _ => unreachable!(), + }, + DataType::Time64(t) => match t { + TimeUnit::Microsecond => { + Arc::new(decode_primitive::(rows, options)) + } + TimeUnit::Nanosecond => { + Arc::new(decode_primitive::(rows, options)) + } + _ => unreachable!(), + }, + DataType::Duration(TimeUnit::Second) => { + Arc::new(decode_primitive::(rows, options)) + } + DataType::Duration(TimeUnit::Millisecond) => { + Arc::new(decode_primitive::(rows, options)) + } + DataType::Duration(TimeUnit::Microsecond) => { + Arc::new(decode_primitive::(rows, options)) + } + DataType::Duration(TimeUnit::Nanosecond) => { + Arc::new(decode_primitive::(rows, options)) + } + DataType::Interval(IntervalUnit::DayTime) => { + Arc::new(decode_primitive::(rows, options)) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + Arc::new(decode_primitive::(rows, options)) + } + DataType::Interval(IntervalUnit::YearMonth) => { + Arc::new(decode_primitive::(rows, options)) + } + DataType::Binary => Arc::new(decode_binary::(rows, options)), + DataType::LargeBinary => Arc::new(decode_binary::(rows, options)), + DataType::Utf8 => Arc::new(decode_string::(rows, options)), + DataType::LargeUtf8 => Arc::new(decode_string::(rows, options)), + DataType::Decimal128(p, s) => { + Arc::new(decode_decimal::<16, Decimal128Type>(rows, options, *p, *s)) + } + DataType::Decimal256(p, s) => { + Arc::new(decode_decimal::<32, Decimal256Type>(rows, options, *p, *s)) + } + DataType::Dictionary(k, v) => match k.as_ref() { + DataType::Int8 => Arc::new(decode_dictionary::( + interner.unwrap(), + v.as_ref(), + options, + rows, + )?), + DataType::Int16 => Arc::new(decode_dictionary::( + interner.unwrap(), + v.as_ref(), + options, + rows, + )?), + DataType::Int32 => Arc::new(decode_dictionary::( + interner.unwrap(), + v.as_ref(), + options, + rows, + )?), + DataType::Int64 => Arc::new(decode_dictionary::( + interner.unwrap(), + v.as_ref(), + options, + rows, + )?), + DataType::UInt8 => Arc::new(decode_dictionary::( + interner.unwrap(), + v.as_ref(), + options, + rows, + )?), + DataType::UInt16 => Arc::new(decode_dictionary::( + interner.unwrap(), + v.as_ref(), + options, + rows, + )?), + DataType::UInt32 => Arc::new(decode_dictionary::( + interner.unwrap(), + v.as_ref(), + options, + rows, + )?), + DataType::UInt64 => Arc::new(decode_dictionary::( + interner.unwrap(), + v.as_ref(), + options, + rows, + )?), + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "{} is not a valid dictionary key type", + field.data_type + ))); + } + }, + DataType::FixedSizeBinary(_) + | DataType::List(_) + | DataType::FixedSizeList(_, _) + | DataType::LargeList(_) + | DataType::Struct(_) + | DataType::Union(_, _, _) + | DataType::Map(_, _) => { + return Err(ArrowError::NotYetImplemented(format!( + "converting {} row is not supported", + field.data_type + ))) + } + }; + Ok(array) +} + #[cfg(test)] mod tests { - use super::*; + use std::sync::Arc; + + use rand::distributions::uniform::SampleUniform; + use rand::distributions::{Distribution, Standard}; + use rand::{thread_rng, Rng}; + + use arrow_array::NullArray; + use crate::array::{ BinaryArray, BooleanArray, DictionaryArray, Float32Array, GenericStringArray, Int16Array, Int32Array, OffsetSizeTrait, PrimitiveArray, @@ -466,10 +746,8 @@ mod tests { }; use crate::compute::{LexicographicalComparator, SortColumn}; use crate::util::display::array_value_to_string; - use 
rand::distributions::uniform::SampleUniform; - use rand::distributions::{Distribution, Standard}; - use rand::{thread_rng, Rng}; - use std::sync::Arc; + + use super::*; #[test] fn test_fixed_width() { @@ -525,19 +803,29 @@ mod tests { assert!(rows.row(0) < rows.row(1)); assert!(rows.row(3) < rows.row(0)); assert!(rows.row(4) < rows.row(1)); - assert!(rows.row(5) < rows.row(4)) + assert!(rows.row(5) < rows.row(4)); + + let back = converter.convert_rows(&rows).unwrap(); + for (expected, actual) in cols.iter().zip(&back) { + assert_eq!(expected, actual); + } } #[test] fn test_bool() { let mut converter = RowConverter::new(vec![SortField::new(DataType::Boolean)]); - let col = Arc::new(BooleanArray::from_iter([None, Some(false), Some(true)])); - let rows = converter.convert_columns(&[col]).unwrap(); + let col = Arc::new(BooleanArray::from_iter([None, Some(false), Some(true)])) + as ArrayRef; + + let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); assert!(rows.row(2) > rows.row(1)); assert!(rows.row(2) > rows.row(0)); assert!(rows.row(1) > rows.row(0)); + let cols = converter.convert_rows(&rows).unwrap(); + assert_eq!(&cols[0], &col); + let mut converter = RowConverter::new(vec![SortField::new_with_options( DataType::Boolean, SortOptions { @@ -546,11 +834,21 @@ mod tests { }, )]); - let col = Arc::new(BooleanArray::from_iter([None, Some(false), Some(true)])); - let rows = converter.convert_columns(&[col]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); assert!(rows.row(2) < rows.row(1)); assert!(rows.row(2) < rows.row(0)); assert!(rows.row(1) < rows.row(0)); + let cols = converter.convert_rows(&rows).unwrap(); + assert_eq!(&cols[0], &col); + } + + #[test] + fn test_null_encoding() { + let col = Arc::new(NullArray::new(10)); + let mut converter = RowConverter::new(vec![SortField::new(DataType::Null)]); + let rows = converter.convert_columns(&[col]).unwrap(); + assert_eq!(rows.num_rows(), 10); + assert_eq!(rows.row(1).data.len(), 0); } #[test] @@ -561,16 +859,19 @@ mod tests { None, Some("foo"), Some(""), - ])); + ])) as ArrayRef; let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); - let rows = converter.convert_columns(&[col]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); assert!(rows.row(1) < rows.row(0)); assert!(rows.row(2) < rows.row(4)); assert!(rows.row(3) < rows.row(0)); assert!(rows.row(3) < rows.row(1)); + let cols = converter.convert_rows(&rows).unwrap(); + assert_eq!(&cols[0], &col); + let col = Arc::new(BinaryArray::from_iter([ None, Some(vec![0_u8; 0]), @@ -601,6 +902,9 @@ mod tests { } } + let cols = converter.convert_rows(&rows).unwrap(); + assert_eq!(&cols[0], &col); + let mut converter = RowConverter::new(vec![SortField::new_with_options( DataType::Binary, SortOptions { @@ -608,7 +912,7 @@ mod tests { nulls_first: false, }, )]); - let rows = converter.convert_columns(&[col]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); for i in 0..rows.num_rows() { for j in i + 1..rows.num_rows() { @@ -622,6 +926,9 @@ mod tests { ); } } + + let cols = converter.convert_rows(&rows).unwrap(); + assert_eq!(&cols[0], &col); } #[test] @@ -650,17 +957,23 @@ mod tests { assert_eq!(rows_a.row(1), rows_a.row(6)); assert_eq!(rows_a.row(1), rows_a.row(7)); + let cols = converter.convert_rows(&rows_a).unwrap(); + assert_eq!(&cols[0], &a); + let b = Arc::new(DictionaryArray::::from_iter([ Some("hello"), None, Some("cupcakes"), - ])); + ])) as ArrayRef; - let rows_b = 
converter.convert_columns(&[b]).unwrap(); + let rows_b = converter.convert_columns(&[Arc::clone(&b)]).unwrap(); assert_eq!(rows_a.row(1), rows_b.row(0)); assert_eq!(rows_a.row(3), rows_b.row(1)); assert!(rows_b.row(2) < rows_a.row(0)); + let cols = converter.convert_rows(&rows_b).unwrap(); + assert_eq!(&cols[0], &b); + let mut converter = RowConverter::new(vec![SortField::new_with_options( a.data_type().clone(), SortOptions { @@ -669,11 +982,14 @@ mod tests { }, )]); - let rows_c = converter.convert_columns(&[a]).unwrap(); + let rows_c = converter.convert_columns(&[Arc::clone(&a)]).unwrap(); assert!(rows_c.row(3) > rows_c.row(5)); assert!(rows_c.row(2) > rows_c.row(1)); assert!(rows_c.row(0) > rows_c.row(1)); assert!(rows_c.row(3) > rows_c.row(0)); + + let cols = converter.convert_rows(&rows_c).unwrap(); + assert_eq!(&cols[0], &a); } #[test] @@ -727,6 +1043,17 @@ mod tests { assert!(rows.row(3) < rows.row(0)); } + #[test] + #[should_panic(expected = "rows were not produced by this RowConverter")] + fn test_different_converter() { + let values = Arc::new(Int32Array::from_iter([Some(1), Some(-1)])); + let mut converter = RowConverter::new(vec![SortField::new(DataType::Int32)]); + let rows = converter.convert_columns(&[values]).unwrap(); + + let converter = RowConverter::new(vec![SortField::new(DataType::Int32)]); + let _ = converter.convert_rows(&rows); + } + fn generate_primitive_array(len: usize, valid_percent: f64) -> PrimitiveArray where K: ArrowPrimitiveType, @@ -888,6 +1215,11 @@ mod tests { ); } } + + let back = converter.convert_rows(&rows).unwrap(); + for (actual, expected) in back.iter().zip(&arrays) { + assert_eq!(actual, expected) + } } } } diff --git a/arrow/src/row/variable.rs b/arrow/src/row/variable.rs index 2213dad9e788..36f337e658b6 100644 --- a/arrow/src/row/variable.rs +++ b/arrow/src/row/variable.rs @@ -16,12 +16,26 @@ // under the License. 
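// A sketch of the byte layout produced by the variable-length encoding in this
// file, reconstructed from `encode` and `decoded_len` below (values shown for
// ascending order; for descending order every byte of a non-null value is
// inverted, while the null sentinel is left untouched):
//
//   null value:      one byte, the null sentinel (0x00 if nulls_first, 0xFF otherwise)
//   empty value:     one byte, EMPTY_SENTINEL (1)
//   non-empty value: one NON_EMPTY_SENTINEL (2) byte, then the data split into
//                    BLOCK_SIZE (32) byte blocks, each block followed by one
//                    marker byte: BLOCK_CONTINUATION (0xFF) if more data follows,
//                    otherwise the number of valid bytes in the final,
//                    zero-padded block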
use crate::compute::SortOptions; -use crate::row::Rows; +use crate::row::{null_sentinel, Rows}; use crate::util::bit_util::ceil; +use arrow_array::builder::BufferBuilder; +use arrow_array::{Array, GenericBinaryArray, GenericStringArray, OffsetSizeTrait}; +use arrow_buffer::MutableBuffer; +use arrow_data::ArrayDataBuilder; +use arrow_schema::DataType; /// The block size of the variable length encoding pub const BLOCK_SIZE: usize = 32; +/// The continuation token +pub const BLOCK_CONTINUATION: u8 = 0xFF; + +/// Indicates an empty string +pub const EMPTY_SENTINEL: u8 = 1; + +/// Indicates a non-empty string +pub const NON_EMPTY_SENTINEL: u8 = 2; + /// Returns the length of the encoded representation of a byte array, including the null byte pub fn encoded_len(a: Option<&[u8]>) -> usize { match a { @@ -50,8 +64,8 @@ pub fn encode<'a, I: Iterator>>( match maybe_val { Some(val) if val.is_empty() => { out.buffer[*offset] = match opts.descending { - true => !1, - false => 1, + true => !EMPTY_SENTINEL, + false => EMPTY_SENTINEL, }; *offset += 1; } @@ -61,7 +75,7 @@ pub fn encode<'a, I: Iterator>>( let to_write = &mut out.buffer[*offset..end_offset]; // Write `2_u8` to demarcate as non-empty, non-null string - to_write[0] = 2; + to_write[0] = NON_EMPTY_SENTINEL; let chunks = val.chunks_exact(BLOCK_SIZE); let remainder = chunks.remainder(); @@ -76,7 +90,7 @@ pub fn encode<'a, I: Iterator>>( *out_block = *input; // Indicate that there are further blocks to follow - output[BLOCK_SIZE] = u8::MAX; + output[BLOCK_SIZE] = BLOCK_CONTINUATION; } if !remainder.is_empty() { @@ -97,11 +111,121 @@ pub fn encode<'a, I: Iterator>>( } } None => { - if !opts.nulls_first { - out.buffer[*offset] = 0xFF; - } + out.buffer[*offset] = null_sentinel(opts); *offset += 1; } } } } + +/// Returns the number of bytes of encoded data +fn decoded_len(row: &[u8], options: SortOptions) -> usize { + let (non_empty_sentinel, continuation) = match options.descending { + true => (!NON_EMPTY_SENTINEL, !BLOCK_CONTINUATION), + false => (NON_EMPTY_SENTINEL, BLOCK_CONTINUATION), + }; + + if row[0] != non_empty_sentinel { + // Empty or null string + return 0; + } + + let mut str_len = 0; + let mut idx = 1; + loop { + let sentinel = row[idx + BLOCK_SIZE]; + if sentinel == continuation { + idx += BLOCK_SIZE + 1; + str_len += BLOCK_SIZE; + continue; + } + let block_len = match options.descending { + true => !sentinel, + false => sentinel, + }; + return str_len + block_len as usize; + } +} + +/// Decodes a binary array from `rows` with the provided `options` +pub fn decode_binary( + rows: &mut [&[u8]], + options: SortOptions, +) -> GenericBinaryArray { + let len = rows.len(); + let mut null_count = 0; + let nulls = MutableBuffer::collect_bool(len, |x| { + let valid = rows[x][0] != null_sentinel(options); + null_count += !valid as usize; + valid + }); + + let values_capacity = rows.iter().map(|row| decoded_len(row, options)).sum(); + let mut offsets = BufferBuilder::::new(len + 1); + offsets.append(I::zero()); + let mut values = MutableBuffer::new(values_capacity); + + for row in rows { + let str_length = decoded_len(row, options); + let mut to_read = str_length; + let mut offset = 1; + while to_read >= BLOCK_SIZE { + to_read -= BLOCK_SIZE; + + values.extend_from_slice(&row[offset..offset + BLOCK_SIZE]); + offset += BLOCK_SIZE + 1; + } + + if to_read != 0 { + values.extend_from_slice(&row[offset..offset + to_read]); + offset += BLOCK_SIZE + 1; + } + *row = &row[offset..]; + + offsets.append(I::from_usize(values.len()).expect("offset overflow")) + 
} + + if options.descending { + values.as_slice_mut().iter_mut().for_each(|o| *o = !*o) + } + + let d = match I::IS_LARGE { + true => DataType::LargeBinary, + false => DataType::Binary, + }; + + let builder = ArrayDataBuilder::new(d) + .len(len) + .null_count(null_count) + .null_bit_buffer(Some(nulls.into())) + .add_buffer(offsets.finish()) + .add_buffer(values.into()); + + // SAFETY: + // Valid by construction above + unsafe { GenericBinaryArray::from(builder.build_unchecked()) } +} + +/// Decodes a string array from `rows` with the provided `options` +/// +/// # Safety +/// +/// The row must contain valid UTF-8 data +pub unsafe fn decode_string( + rows: &mut [&[u8]], + options: SortOptions, +) -> GenericStringArray { + let d = match I::IS_LARGE { + true => DataType::LargeUtf8, + false => DataType::Utf8, + }; + + let builder = decode_binary::(rows, options) + .into_data() + .into_builder() + .data_type(d); + + // SAFETY: + // Row data must have come from a valid UTF-8 array + GenericStringArray::from(builder.build_unchecked()) +} From c93ce39567b73c56f11d4e731de102c532e3654d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 6 Oct 2022 10:45:23 -0700 Subject: [PATCH 0110/1411] Add NaN handling in dyn scalar comparison kernels (#2830) * Add NaN handling in dyn scalar comparison kernels * Use trait * Fix clippy * Prepare for simd and non-simd tests * Restore flight protocal files. * Use ArrowNativeTypeOp as type bound * Remove num::ToPrimitive addition --- arrow/benches/comparison_kernels.rs | 74 ++++-------- arrow/src/compute/kernels/comparison.rs | 153 +++++++++++++++++++++--- arrow/src/datatypes/native.rs | 60 +++++++++- 3 files changed, 217 insertions(+), 70 deletions(-) diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index 4ad139b879fd..e2afa99fb2ce 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -33,13 +33,6 @@ where eq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); } -fn bench_eq_scalar(arr_a: &PrimitiveArray, value_b: T::Native) -where - T: ArrowNumericType, -{ - eq_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); -} - fn bench_neq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) where T: ArrowNumericType, @@ -47,13 +40,6 @@ where neq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); } -fn bench_neq_scalar(arr_a: &PrimitiveArray, value_b: T::Native) -where - T: ArrowNumericType, -{ - neq_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); -} - fn bench_lt(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) where T: ArrowNumericType, @@ -61,13 +47,6 @@ where lt(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); } -fn bench_lt_scalar(arr_a: &PrimitiveArray, value_b: T::Native) -where - T: ArrowNumericType, -{ - lt_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); -} - fn bench_lt_eq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) where T: ArrowNumericType, @@ -75,13 +54,6 @@ where lt_eq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); } -fn bench_lt_eq_scalar(arr_a: &PrimitiveArray, value_b: T::Native) -where - T: ArrowNumericType, -{ - lt_eq_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); -} - fn bench_gt(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) where T: ArrowNumericType, @@ -89,13 +61,6 @@ where gt(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); } -fn bench_gt_scalar(arr_a: &PrimitiveArray, 
value_b: T::Native) -where - T: ArrowNumericType, -{ - gt_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); -} - fn bench_gt_eq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) where T: ArrowNumericType, @@ -103,13 +68,6 @@ where gt_eq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); } -fn bench_gt_eq_scalar(arr_a: &PrimitiveArray, value_b: T::Native) -where - T: ArrowNumericType, -{ - gt_eq_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); -} - fn bench_like_utf8_scalar(arr_a: &StringArray, value_b: &str) { like_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); } @@ -164,39 +122,57 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function("eq Float32", |b| b.iter(|| bench_eq(&arr_a, &arr_b))); c.bench_function("eq scalar Float32", |b| { - b.iter(|| bench_eq_scalar(&arr_a, 1.0)) + b.iter(|| { + eq_scalar(criterion::black_box(&arr_a), criterion::black_box(1.0)).unwrap() + }) }); c.bench_function("neq Float32", |b| b.iter(|| bench_neq(&arr_a, &arr_b))); c.bench_function("neq scalar Float32", |b| { - b.iter(|| bench_neq_scalar(&arr_a, 1.0)) + b.iter(|| { + neq_scalar(criterion::black_box(&arr_a), criterion::black_box(1.0)).unwrap() + }) }); c.bench_function("lt Float32", |b| b.iter(|| bench_lt(&arr_a, &arr_b))); c.bench_function("lt scalar Float32", |b| { - b.iter(|| bench_lt_scalar(&arr_a, 1.0)) + b.iter(|| { + lt_scalar(criterion::black_box(&arr_a), criterion::black_box(1.0)).unwrap() + }) }); c.bench_function("lt_eq Float32", |b| b.iter(|| bench_lt_eq(&arr_a, &arr_b))); c.bench_function("lt_eq scalar Float32", |b| { - b.iter(|| bench_lt_eq_scalar(&arr_a, 1.0)) + b.iter(|| { + lt_eq_scalar(criterion::black_box(&arr_a), criterion::black_box(1.0)).unwrap() + }) }); c.bench_function("gt Float32", |b| b.iter(|| bench_gt(&arr_a, &arr_b))); c.bench_function("gt scalar Float32", |b| { - b.iter(|| bench_gt_scalar(&arr_a, 1.0)) + b.iter(|| { + gt_scalar(criterion::black_box(&arr_a), criterion::black_box(1.0)).unwrap() + }) }); c.bench_function("gt_eq Float32", |b| b.iter(|| bench_gt_eq(&arr_a, &arr_b))); c.bench_function("gt_eq scalar Float32", |b| { - b.iter(|| bench_gt_eq_scalar(&arr_a, 1.0)) + b.iter(|| { + gt_eq_scalar(criterion::black_box(&arr_a), criterion::black_box(1.0)).unwrap() + }) }); c.bench_function("eq MonthDayNano", |b| { b.iter(|| bench_eq(&arr_month_day_nano_a, &arr_month_day_nano_b)) }); c.bench_function("eq scalar MonthDayNano", |b| { - b.iter(|| bench_eq_scalar(&arr_month_day_nano_a, 123)) + b.iter(|| { + eq_scalar( + criterion::black_box(&arr_month_day_nano_a), + criterion::black_box(123), + ) + .unwrap() + }) }); c.bench_function("like_utf8 scalar equals", |b| { diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 1ea433150f01..d1d1e470e51a 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -27,12 +27,13 @@ use crate::array::*; use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer}; use crate::compute::util::combine_option_bitmap; use crate::datatypes::{ - ArrowNativeType, ArrowNumericType, DataType, Date32Type, Date64Type, Float32Type, - Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalDayTimeType, - IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, Time32MillisecondType, - Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimeUnit, - TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, - 
TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, + native_op::ArrowNativeTypeOp, ArrowNativeType, ArrowNumericType, DataType, + Date32Type, Date64Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, + Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, + IntervalYearMonthType, Time32MillisecondType, Time32SecondType, + Time64MicrosecondType, Time64NanosecondType, TimeUnit, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, + UInt32Type, UInt64Type, UInt8Type, }; #[allow(unused_imports)] use crate::downcast_dictionary_array; @@ -1328,7 +1329,12 @@ macro_rules! dyn_compare_utf8_scalar { } /// Perform `left == right` operation on an array and a numeric scalar -/// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values +/// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn eq_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, @@ -1342,7 +1348,12 @@ where } /// Perform `left < right` operation on an array and a numeric scalar -/// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values +/// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn lt_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, @@ -1356,7 +1367,12 @@ where } /// Perform `left <= right` operation on an array and a numeric scalar -/// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values +/// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn lt_eq_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, @@ -1370,7 +1386,12 @@ where } /// Perform `left > right` operation on an array and a numeric scalar -/// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values +/// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn gt_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, @@ -1384,7 +1405,12 @@ where } /// Perform `left >= right` operation on an array and a numeric scalar -/// value. 
Supports PrimitiveArrays, and DictionaryArrays that have primitive values +/// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn gt_eq_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, @@ -1398,7 +1424,12 @@ where } /// Perform `left != right` operation on an array and a numeric scalar -/// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values +/// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn neq_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, @@ -3016,14 +3047,20 @@ where } /// Perform `left == right` operation on a [`PrimitiveArray`] and a scalar value. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn eq_scalar(left: &PrimitiveArray, right: T::Native) -> Result where T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] return simd_compare_op_scalar(left, right, T::eq, |a, b| a == b); #[cfg(not(feature = "simd"))] - return compare_op_scalar(left, |a| a == right); + return compare_op_scalar(left, |a| a.is_eq(right)); } /// Applies an unary and infallible comparison function to a primitive array. @@ -3047,14 +3084,20 @@ where } /// Perform `left != right` operation on a [`PrimitiveArray`] and a scalar value. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn neq_scalar(left: &PrimitiveArray, right: T::Native) -> Result where T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] return simd_compare_op_scalar(left, right, T::ne, |a, b| a != b); #[cfg(not(feature = "simd"))] - return compare_op_scalar(left, |a| a != right); + return compare_op_scalar(left, |a| a.is_ne(right)); } /// Perform `left < right` operation on two [`PrimitiveArray`]s. Null values are less than non-null @@ -3071,14 +3114,20 @@ where /// Perform `left < right` operation on a [`PrimitiveArray`] and a scalar value. /// Null values are less than non-null values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. 
pub fn lt_scalar(left: &PrimitiveArray, right: T::Native) -> Result where T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] return simd_compare_op_scalar(left, right, T::lt, |a, b| a < b); #[cfg(not(feature = "simd"))] - return compare_op_scalar(left, |a| a < right); + return compare_op_scalar(left, |a| a.is_lt(right)); } /// Perform `left <= right` operation on two [`PrimitiveArray`]s. Null values are less than non-null @@ -3098,14 +3147,20 @@ where /// Perform `left <= right` operation on a [`PrimitiveArray`] and a scalar value. /// Null values are less than non-null values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn lt_eq_scalar(left: &PrimitiveArray, right: T::Native) -> Result where T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] return simd_compare_op_scalar(left, right, T::le, |a, b| a <= b); #[cfg(not(feature = "simd"))] - return compare_op_scalar(left, |a| a <= right); + return compare_op_scalar(left, |a| a.is_le(right)); } /// Perform `left > right` operation on two [`PrimitiveArray`]s. Non-null values are greater than null @@ -3122,14 +3177,20 @@ where /// Perform `left > right` operation on a [`PrimitiveArray`] and a scalar value. /// Non-null values are greater than null values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn gt_scalar(left: &PrimitiveArray, right: T::Native) -> Result where T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] return simd_compare_op_scalar(left, right, T::gt, |a, b| a > b); #[cfg(not(feature = "simd"))] - return compare_op_scalar(left, |a| a > right); + return compare_op_scalar(left, |a| a.is_gt(right)); } /// Perform `left >= right` operation on two [`PrimitiveArray`]s. Non-null values are greater than null @@ -3149,14 +3210,20 @@ where /// Perform `left >= right` operation on a [`PrimitiveArray`] and a scalar value. /// Non-null values are greater than null values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. 
pub fn gt_eq_scalar(left: &PrimitiveArray, right: T::Native) -> Result where T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] return simd_compare_op_scalar(left, right, T::ge, |a, b| a >= b); #[cfg(not(feature = "simd"))] - return compare_op_scalar(left, |a| a >= right); + return compare_op_scalar(left, |a| a.is_ge(right)); } /// Checks if a [`GenericListArray`] contains a value in the [`PrimitiveArray`] @@ -5848,28 +5915,48 @@ mod tests { .into_iter() .map(Some) .collect(); + #[cfg(feature = "simd")] let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(false)], ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(true), Some(false), Some(false), Some(false), Some(false)], + ); assert_eq!(eq_dyn_scalar(&array, f32::NAN).unwrap(), expected); + #[cfg(feature = "simd")] let expected = BooleanArray::from( vec![Some(true), Some(true), Some(true), Some(true), Some(true)], ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(false), Some(true), Some(true), Some(true), Some(true)], + ); assert_eq!(neq_dyn_scalar(&array, f32::NAN).unwrap(), expected); let array: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 10.0] .into_iter() .map(Some) .collect(); + #[cfg(feature = "simd")] let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(false)], ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(true), Some(false), Some(false), Some(false), Some(false)], + ); assert_eq!(eq_dyn_scalar(&array, f64::NAN).unwrap(), expected); + #[cfg(feature = "simd")] let expected = BooleanArray::from( vec![Some(true), Some(true), Some(true), Some(true), Some(true)], ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(false), Some(true), Some(true), Some(true), Some(true)], + ); assert_eq!(neq_dyn_scalar(&array, f64::NAN).unwrap(), expected); } @@ -5879,28 +5966,48 @@ mod tests { .into_iter() .map(Some) .collect(); + #[cfg(feature = "simd")] let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(false)], ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(false), Some(true), Some(true), Some(true), Some(true)], + ); assert_eq!(lt_dyn_scalar(&array, f32::NAN).unwrap(), expected); + #[cfg(feature = "simd")] let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(false)], ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(true), Some(true)], + ); assert_eq!(lt_eq_dyn_scalar(&array, f32::NAN).unwrap(), expected); let array: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 10.0] .into_iter() .map(Some) .collect(); + #[cfg(feature = "simd")] let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(false)], ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(false), Some(true), Some(true), Some(true), Some(true)], + ); assert_eq!(lt_dyn_scalar(&array, f64::NAN).unwrap(), expected); + #[cfg(feature = "simd")] let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(false)], ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(true), Some(true)], + ); assert_eq!(lt_eq_dyn_scalar(&array, f64::NAN).unwrap(), expected); } @@ -5915,9 +6022,14 @@ 
mod tests { ); assert_eq!(gt_dyn_scalar(&array, f32::NAN).unwrap(), expected); + #[cfg(feature = "simd")] let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(false)], ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(true), Some(false), Some(false), Some(false), Some(false)], + ); assert_eq!(gt_eq_dyn_scalar(&array, f32::NAN).unwrap(), expected); let array: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 10.0] @@ -5929,9 +6041,14 @@ mod tests { ); assert_eq!(gt_dyn_scalar(&array, f64::NAN).unwrap(), expected); + #[cfg(feature = "simd")] let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(false)], ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(true), Some(false), Some(false), Some(false), Some(false)], + ); assert_eq!(gt_eq_dyn_scalar(&array, f64::NAN).unwrap(), expected); } diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 654b939500a2..374d0b950ea9 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -94,6 +94,30 @@ pub(crate) mod native_op { fn mod_wrapping(self, rhs: Self) -> Self { self % rhs } + + fn is_eq(self, rhs: Self) -> bool { + self == rhs + } + + fn is_ne(self, rhs: Self) -> bool { + self != rhs + } + + fn is_lt(self, rhs: Self) -> bool { + self < rhs + } + + fn is_le(self, rhs: Self) -> bool { + self <= rhs + } + + fn is_gt(self, rhs: Self) -> bool { + self > rhs + } + + fn is_ge(self, rhs: Self) -> bool { + self >= rhs + } } } @@ -186,6 +210,36 @@ native_type_op!(u16); native_type_op!(u32); native_type_op!(u64); -impl native_op::ArrowNativeTypeOp for f16 {} -impl native_op::ArrowNativeTypeOp for f32 {} -impl native_op::ArrowNativeTypeOp for f64 {} +macro_rules! 
native_type_float_op { + ($t:tt) => { + impl native_op::ArrowNativeTypeOp for $t { + fn is_eq(self, rhs: Self) -> bool { + self.total_cmp(&rhs).is_eq() + } + + fn is_ne(self, rhs: Self) -> bool { + self.total_cmp(&rhs).is_ne() + } + + fn is_lt(self, rhs: Self) -> bool { + self.total_cmp(&rhs).is_lt() + } + + fn is_le(self, rhs: Self) -> bool { + self.total_cmp(&rhs).is_le() + } + + fn is_gt(self, rhs: Self) -> bool { + self.total_cmp(&rhs).is_gt() + } + + fn is_ge(self, rhs: Self) -> bool { + self.total_cmp(&rhs).is_ge() + } + } + }; +} + +native_type_float_op!(f16); +native_type_float_op!(f32); +native_type_float_op!(f64); From 37c867921c434974e908b22bafe0fc84f0de2ad0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 6 Oct 2022 20:48:13 +0100 Subject: [PATCH 0111/1411] Derive ArrowPrimitiveType for Decimal128Type and Decimal256Type (#2637) (#2833) --- arrow-array/src/array/primitive_array.rs | 44 +++++++++++++++++++++++- arrow-array/src/types.rs | 13 +++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index e5f5cd481af5..3550e291c035 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -22,7 +22,7 @@ use crate::temporal_conversions::{as_date, as_datetime, as_duration, as_time}; use crate::trusted_len::trusted_len_unzip; use crate::types::*; use crate::{print_long_array, Array, ArrayAccessor}; -use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; +use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::ArrayData; use arrow_schema::DataType; @@ -609,6 +609,8 @@ def_from_for_primitive!(UInt64Type, u64); def_from_for_primitive!(Float16Type, f16); def_from_for_primitive!(Float32Type, f32); def_from_for_primitive!(Float64Type, f64); +def_from_for_primitive!(Decimal128Type, i128); +def_from_for_primitive!(Decimal256Type, i256); impl From::Native>> for NativeAdapter @@ -733,6 +735,8 @@ def_numeric_from_vec!(UInt32Type); def_numeric_from_vec!(UInt64Type); def_numeric_from_vec!(Float32Type); def_numeric_from_vec!(Float64Type); +def_numeric_from_vec!(Decimal128Type); +def_numeric_from_vec!(Decimal256Type); def_numeric_from_vec!(Date32Type); def_numeric_from_vec!(Date64Type); @@ -1342,4 +1346,42 @@ mod tests { array.value(4); } + + #[test] + fn test_decimal128() { + let values: Vec<_> = vec![0, 1, -1, i128::MIN, i128::MAX]; + let array: PrimitiveArray = + PrimitiveArray::from_iter(values.iter().copied()); + assert_eq!(array.values(), &values); + + let array: PrimitiveArray = + PrimitiveArray::from_iter_values(values.iter().copied()); + assert_eq!(array.values(), &values); + + let array = PrimitiveArray::::from(values.clone()); + assert_eq!(array.values(), &values); + + let array = PrimitiveArray::::from(array.data().clone()); + assert_eq!(array.values(), &values); + } + + #[test] + fn test_decimal256() { + let values: Vec<_> = + vec![i256::ZERO, i256::ONE, i256::MINUS_ONE, i256::MIN, i256::MAX]; + + let array: PrimitiveArray = + PrimitiveArray::from_iter(values.iter().copied()); + assert_eq!(array.values(), &values); + + let array: PrimitiveArray = + PrimitiveArray::from_iter_values(values.iter().copied()); + assert_eq!(array.values(), &values); + + let array = PrimitiveArray::::from(values.clone()); + assert_eq!(array.values(), &values); + + let array = 
PrimitiveArray::::from(array.data().clone()); + assert_eq!(array.values(), &values); + } } diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 581fdc767c24..9bd433692580 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -19,6 +19,7 @@ use crate::array::ArrowPrimitiveType; use crate::delta::shift_months; +use arrow_buffer::i256; use arrow_data::decimal::{ DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, @@ -515,6 +516,12 @@ impl DecimalType for Decimal128Type { DataType::Decimal128(DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE); } +impl ArrowPrimitiveType for Decimal128Type { + type Native = i128; + + const DATA_TYPE: DataType = ::DEFAULT_TYPE; +} + /// The decimal type for a Decimal256Array #[derive(Debug)] pub struct Decimal256Type {} @@ -530,6 +537,12 @@ impl DecimalType for Decimal256Type { DataType::Decimal256(DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE); } +impl ArrowPrimitiveType for Decimal256Type { + type Native = i256; + + const DATA_TYPE: DataType = ::DEFAULT_TYPE; +} + #[cfg(test)] mod tests { use super::*; From 8dd94a97441822c88274d31b6782fb6bf3e4ecd8 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 7 Oct 2022 01:09:56 -0700 Subject: [PATCH 0112/1411] Expose ArrowNativeTypeOp (#2840) --- arrow/src/compute/kernels/aggregate.rs | 3 +- arrow/src/compute/kernels/arithmetic.rs | 2 +- arrow/src/compute/kernels/comparison.rs | 14 +- arrow/src/datatypes/native.rs | 164 ++++++++++++------------ 4 files changed, 91 insertions(+), 92 deletions(-) diff --git a/arrow/src/compute/kernels/aggregate.rs b/arrow/src/compute/kernels/aggregate.rs index 083defdde7dc..e6c927c24868 100644 --- a/arrow/src/compute/kernels/aggregate.rs +++ b/arrow/src/compute/kernels/aggregate.rs @@ -27,8 +27,7 @@ use crate::array::{ as_primitive_array, Array, ArrayAccessor, ArrayIter, BooleanArray, GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, }; -use crate::datatypes::native_op::ArrowNativeTypeOp; -use crate::datatypes::{ArrowNativeType, ArrowNumericType, DataType}; +use crate::datatypes::{ArrowNativeType, ArrowNativeTypeOp, ArrowNumericType, DataType}; use crate::error::Result; use crate::util::bit_iterator::BitIndexIterator; diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index 1e6e55248b71..fe222c3d15d2 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -34,7 +34,7 @@ use crate::compute::{ binary, binary_opt, try_binary, try_unary, try_unary_dyn, unary_dyn, }; use crate::datatypes::{ - native_op::ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, + ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, }; #[cfg(feature = "dyn_arith_dict")] diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index d1d1e470e51a..143050ea97f3 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -27,13 +27,13 @@ use crate::array::*; use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer}; use crate::compute::util::combine_option_bitmap; use crate::datatypes::{ - native_op::ArrowNativeTypeOp, ArrowNativeType, ArrowNumericType, DataType, - Date32Type, Date64Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, - Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, 
IntervalUnit, - IntervalYearMonthType, Time32MillisecondType, Time32SecondType, - Time64MicrosecondType, Time64NanosecondType, TimeUnit, TimestampMicrosecondType, - TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, + ArrowNativeType, ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, + Date64Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, + IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, + Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, + TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, + TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, + UInt8Type, }; #[allow(unused_imports)] use crate::downcast_dictionary_array; diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 374d0b950ea9..2ff9574c78a6 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -16,114 +16,113 @@ // under the License. use crate::error::{ArrowError, Result}; +pub use arrow_array::ArrowPrimitiveType; pub use arrow_buffer::{ArrowNativeType, ToByteSlice}; use half::f16; use num::Zero; +use std::ops::{Add, Div, Mul, Rem, Sub}; -pub use arrow_array::ArrowPrimitiveType; +mod private { + pub trait Sealed {} +} -pub(crate) mod native_op { - use super::ArrowNativeType; - use crate::error::{ArrowError, Result}; - use num::Zero; - use std::ops::{Add, Div, Mul, Rem, Sub}; - - /// Trait for ArrowNativeType to provide overflow-checking and non-overflow-checking - /// variants for arithmetic operations. For floating point types, this provides some - /// default implementations. Integer types that need to deal with overflow can implement - /// this trait. - /// - /// The APIs with `_wrapping` suffix are the variant of non-overflow-checking. If overflow - /// occurred, they will supposedly wrap around the boundary of the type. - /// - /// The APIs with `_checked` suffix are the variant of overflow-checking which return `None` - /// if overflow occurred. - pub trait ArrowNativeTypeOp: - ArrowNativeType - + Add - + Sub - + Mul - + Div - + Rem - + Zero - { - fn add_checked(self, rhs: Self) -> Result { - Ok(self + rhs) - } +/// Trait for ArrowNativeType to provide overflow-checking and non-overflow-checking +/// variants for arithmetic operations. For floating point types, this provides some +/// default implementations. Integer types that need to deal with overflow can implement +/// this trait. +/// +/// The APIs with `_wrapping` suffix are the variant of non-overflow-checking. If overflow +/// occurred, they will supposedly wrap around the boundary of the type. +/// +/// The APIs with `_checked` suffix are the variant of overflow-checking which return `None` +/// if overflow occurred. 
+pub trait ArrowNativeTypeOp: + ArrowNativeType + + Add + + Sub + + Mul + + Div + + Rem + + Zero + + private::Sealed +{ + fn add_checked(self, rhs: Self) -> Result { + Ok(self + rhs) + } - fn add_wrapping(self, rhs: Self) -> Self { - self + rhs - } + fn add_wrapping(self, rhs: Self) -> Self { + self + rhs + } - fn sub_checked(self, rhs: Self) -> Result { - Ok(self - rhs) - } + fn sub_checked(self, rhs: Self) -> Result { + Ok(self - rhs) + } - fn sub_wrapping(self, rhs: Self) -> Self { - self - rhs - } + fn sub_wrapping(self, rhs: Self) -> Self { + self - rhs + } - fn mul_checked(self, rhs: Self) -> Result { - Ok(self * rhs) - } + fn mul_checked(self, rhs: Self) -> Result { + Ok(self * rhs) + } - fn mul_wrapping(self, rhs: Self) -> Self { - self * rhs - } + fn mul_wrapping(self, rhs: Self) -> Self { + self * rhs + } - fn div_checked(self, rhs: Self) -> Result { - if rhs.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(self / rhs) - } + fn div_checked(self, rhs: Self) -> Result { + if rhs.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(self / rhs) } + } - fn div_wrapping(self, rhs: Self) -> Self { - self / rhs - } + fn div_wrapping(self, rhs: Self) -> Self { + self / rhs + } - fn mod_checked(self, rhs: Self) -> Result { - if rhs.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(self % rhs) - } + fn mod_checked(self, rhs: Self) -> Result { + if rhs.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(self % rhs) } + } - fn mod_wrapping(self, rhs: Self) -> Self { - self % rhs - } + fn mod_wrapping(self, rhs: Self) -> Self { + self % rhs + } - fn is_eq(self, rhs: Self) -> bool { - self == rhs - } + fn is_eq(self, rhs: Self) -> bool { + self == rhs + } - fn is_ne(self, rhs: Self) -> bool { - self != rhs - } + fn is_ne(self, rhs: Self) -> bool { + self != rhs + } - fn is_lt(self, rhs: Self) -> bool { - self < rhs - } + fn is_lt(self, rhs: Self) -> bool { + self < rhs + } - fn is_le(self, rhs: Self) -> bool { - self <= rhs - } + fn is_le(self, rhs: Self) -> bool { + self <= rhs + } - fn is_gt(self, rhs: Self) -> bool { - self > rhs - } + fn is_gt(self, rhs: Self) -> bool { + self > rhs + } - fn is_ge(self, rhs: Self) -> bool { - self >= rhs - } + fn is_ge(self, rhs: Self) -> bool { + self >= rhs } } macro_rules! native_type_op { ($t:tt) => { - impl native_op::ArrowNativeTypeOp for $t { + impl private::Sealed for $t {} + impl ArrowNativeTypeOp for $t { fn add_checked(self, rhs: Self) -> Result { self.checked_add(rhs).ok_or_else(|| { ArrowError::ComputeError(format!( @@ -212,7 +211,8 @@ native_type_op!(u64); macro_rules! 
native_type_float_op { ($t:tt) => { - impl native_op::ArrowNativeTypeOp for $t { + impl private::Sealed for $t {} + impl ArrowNativeTypeOp for $t { fn is_eq(self, rhs: Self) -> bool { self.total_cmp(&rhs).is_eq() } From fe6c66c4cb2e3d37fed25b85c138ecf9a03928f0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 7 Oct 2022 12:54:40 +0100 Subject: [PATCH 0113/1411] Update actions/labeler to v4.0.2 (#2843) --- .github/workflows/dev_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 38bb39390097..8fb584a4fdb3 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -37,7 +37,7 @@ jobs: github.event_name == 'pull_request_target' && (github.event.action == 'opened' || github.event.action == 'synchronize') - uses: actions/labeler@v4.0.1 + uses: actions/labeler@v4.0.2 with: repo-token: ${{ secrets.GITHUB_TOKEN }} configuration-path: .github/workflows/dev_pr/labeler.yml From 88267f867bca16b57603d255a167c064e602d747 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 7 Oct 2022 13:47:26 +0100 Subject: [PATCH 0114/1411] Add labeller permissions block (#2844) --- .github/workflows/dev_pr.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 8fb584a4fdb3..2f13b726bcfa 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -29,6 +29,9 @@ jobs: process: name: Process runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write steps: - uses: actions/checkout@v3 From 5cf46d43e8ede2ff3f291de85a27fce25ff2e9c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanislav=20Luke=C5=A1?= Date: Fri, 7 Oct 2022 17:36:05 +0000 Subject: [PATCH 0115/1411] parquet: Add `snap` option to README (#2847) I tried using parquet with `default-features = false` (and with some options explicitly enabled), but it didn't support snappy compression. Turns out there is an undocumented option for it. 
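A minimal sketch of where this matters, assuming the usual `WriterProperties` API and a dependency declared with `default-features = false` plus `features = ["snap"]`: requesting snappy compression only succeeds once the `snap` feature is enabled alongside the explicitly selected ones.

use parquet::basic::Compression;
use parquet::file::properties::WriterProperties;

fn main() {
    // Ask for snappy-compressed pages; without the `snap` feature the writer
    // has no codec available to honor this request.
    let props = WriterProperties::builder()
        .set_compression(Compression::SNAPPY)
        .build();
    // `props` would then be handed to an ArrowWriter or SerializedFileWriter.
    let _ = props;
}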
--- parquet/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet/README.md b/parquet/README.md index 689a664b6326..96a34d7c2881 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -47,6 +47,7 @@ The `parquet` crate provides the following features which may be enabled in your - `flate2` (default) - support for parquet using `gzip` compression - `lz4` (default) - support for parquet using `lz4` compression - `zstd` (default) - support for parquet using `zstd` compression +- `snap` (default) - support for parquet using `snappy` compression - `cli` - parquet [CLI tools](https://github.com/apache/arrow-rs/tree/master/parquet/src/bin) - `experimental` - Experimental APIs which may change, even between minor releases From 9db75183114ba0bb8dbbe4d3c52a60aead66f0e4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 8 Oct 2022 10:14:34 +0100 Subject: [PATCH 0116/1411] Cleanup cast kernel (#2846) * Cleanup cast kernel * Fix timezone handling --- arrow-array/src/array/primitive_array.rs | 7 +- arrow/src/compute/kernels/cast.rs | 581 ++++++++++------------- 2 files changed, 261 insertions(+), 327 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 3550e291c035..e362f0d7e84d 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -769,11 +769,16 @@ impl PrimitiveArray { /// Construct a timestamp array with new timezone pub fn with_timezone(&self, timezone: String) -> Self { + self.with_timezone_opt(Some(timezone)) + } + + /// Construct a timestamp array with an optional timezone + pub fn with_timezone_opt(&self, timezone: Option) -> Self { let array_data = unsafe { self.data .clone() .into_builder() - .data_type(DataType::Timestamp(T::get_time_unit(), Some(timezone))) + .data_type(DataType::Timestamp(T::get_time_unit(), timezone)) .build_unchecked() }; PrimitiveArray::from(array_data) diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 31ac738fa93a..912ea28830eb 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -43,12 +43,10 @@ use std::str; use std::sync::Arc; use crate::buffer::MutableBuffer; -use crate::compute::divide_scalar; -use crate::compute::kernels::arithmetic::{divide, multiply}; -use crate::compute::kernels::arity::unary; use crate::compute::kernels::cast_utils::string_to_timestamp_nanos; use crate::compute::kernels::temporal::extract_component_from_array; use crate::compute::kernels::temporal::return_compute_error_with; +use crate::compute::{divide_scalar, multiply_scalar}; use crate::compute::{try_unary, using_chrono_tz_and_utc_naive_date_time}; use crate::datatypes::*; use crate::error::{ArrowError, Result}; @@ -315,7 +313,7 @@ fn cast_primitive_to_decimal( op: F, precision: u8, scale: u8, -) -> Result> +) -> Result where F: Fn(T::Item) -> i128, { @@ -332,7 +330,7 @@ fn cast_integer_to_decimal( array: &PrimitiveArray, precision: u8, scale: u8, -) -> Result> +) -> Result where ::Native: AsPrimitive, { @@ -347,7 +345,7 @@ fn cast_floating_point_to_decimal( array: &PrimitiveArray, precision: u8, scale: u8, -) -> Result> +) -> Result where ::Native: AsPrimitive, { @@ -365,6 +363,18 @@ where ) } +/// Cast the primitive array using [`PrimitiveArray::reinterpret_cast`] +fn cast_reinterpret_arrays< + I: ArrowPrimitiveType, + O: ArrowPrimitiveType, +>( + array: &dyn Array, +) -> Result { + Ok(Arc::new( + 
as_primitive_array::(array).reinterpret_cast::(), + )) +} + // cast the decimal array to integer array macro_rules! cast_decimal_to_integer { ($ARRAY:expr, $SCALE : ident, $VALUE_BUILDER: ident, $NATIVE_TYPE : ident, $DATA_TYPE : expr) => {{ @@ -430,6 +440,35 @@ macro_rules! cast_list_to_string { }}; } +fn make_timestamp_array( + array: &PrimitiveArray, + unit: TimeUnit, + tz: Option, +) -> ArrayRef { + match unit { + TimeUnit::Second => Arc::new( + array + .reinterpret_cast::() + .with_timezone_opt(tz), + ), + TimeUnit::Millisecond => Arc::new( + array + .reinterpret_cast::() + .with_timezone_opt(tz), + ), + TimeUnit::Microsecond => Arc::new( + array + .reinterpret_cast::() + .with_timezone_opt(tz), + ), + TimeUnit::Nanosecond => Arc::new( + array + .reinterpret_cast::() + .with_timezone_opt(tz), + ), + } +} + /// Cast `array` to the provided data type and return a new Array with /// type `to_type`, if possible. It accepts `CastOptions` to allow consumers /// to configure cast behavior. @@ -619,50 +658,28 @@ pub fn cast_with_options( cast_primitive_to_list::(array, to, to_type, cast_options) } (Dictionary(index_type, _), _) => match **index_type { - DataType::Int8 => dictionary_cast::(array, to_type, cast_options), - DataType::Int16 => dictionary_cast::(array, to_type, cast_options), - DataType::Int32 => dictionary_cast::(array, to_type, cast_options), - DataType::Int64 => dictionary_cast::(array, to_type, cast_options), - DataType::UInt8 => dictionary_cast::(array, to_type, cast_options), - DataType::UInt16 => { - dictionary_cast::(array, to_type, cast_options) - } - DataType::UInt32 => { - dictionary_cast::(array, to_type, cast_options) - } - DataType::UInt64 => { - dictionary_cast::(array, to_type, cast_options) - } + Int8 => dictionary_cast::(array, to_type, cast_options), + Int16 => dictionary_cast::(array, to_type, cast_options), + Int32 => dictionary_cast::(array, to_type, cast_options), + Int64 => dictionary_cast::(array, to_type, cast_options), + UInt8 => dictionary_cast::(array, to_type, cast_options), + UInt16 => dictionary_cast::(array, to_type, cast_options), + UInt32 => dictionary_cast::(array, to_type, cast_options), + UInt64 => dictionary_cast::(array, to_type, cast_options), _ => Err(ArrowError::CastError(format!( "Casting from dictionary type {:?} to {:?} not supported", from_type, to_type, ))), }, (_, Dictionary(index_type, value_type)) => match **index_type { - DataType::Int8 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::Int16 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::Int32 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::Int64 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::UInt8 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::UInt16 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::UInt32 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::UInt64 => { - cast_to_dictionary::(array, value_type, cast_options) - } + Int8 => cast_to_dictionary::(array, value_type, cast_options), + Int16 => cast_to_dictionary::(array, value_type, cast_options), + Int32 => cast_to_dictionary::(array, value_type, cast_options), + Int64 => cast_to_dictionary::(array, value_type, cast_options), + UInt8 => cast_to_dictionary::(array, value_type, cast_options), + UInt16 => cast_to_dictionary::(array, value_type, cast_options), + UInt32 => cast_to_dictionary::(array, value_type, cast_options), + 
UInt64 => cast_to_dictionary::(array, value_type, cast_options), _ => Err(ArrowError::CastError(format!( "Casting from type {:?} to dictionary type {:?} not supported", from_type, to_type, @@ -757,20 +774,18 @@ pub fn cast_with_options( Int64 => cast_numeric_to_string::(array), Float32 => cast_numeric_to_string::(array), Float64 => cast_numeric_to_string::(array), - Timestamp(unit, tz) => match unit { - TimeUnit::Nanosecond => { - cast_timestamp_to_string::(array, tz) - } - TimeUnit::Microsecond => { - cast_timestamp_to_string::(array, tz) - } - TimeUnit::Millisecond => { - cast_timestamp_to_string::(array, tz) - } - TimeUnit::Second => { - cast_timestamp_to_string::(array, tz) - } - }, + Timestamp(TimeUnit::Nanosecond, tz) => { + cast_timestamp_to_string::(array, tz) + } + Timestamp(TimeUnit::Microsecond, tz) => { + cast_timestamp_to_string::(array, tz) + } + Timestamp(TimeUnit::Millisecond, tz) => { + cast_timestamp_to_string::(array, tz) + } + Timestamp(TimeUnit::Second, tz) => { + cast_timestamp_to_string::(array, tz) + } Date32 => cast_date32_to_string::(array), Date64 => cast_date64_to_string::(array), Binary => { @@ -813,20 +828,18 @@ pub fn cast_with_options( Int64 => cast_numeric_to_string::(array), Float32 => cast_numeric_to_string::(array), Float64 => cast_numeric_to_string::(array), - Timestamp(unit, tz) => match unit { - TimeUnit::Nanosecond => { - cast_timestamp_to_string::(array, tz) - } - TimeUnit::Microsecond => { - cast_timestamp_to_string::(array, tz) - } - TimeUnit::Millisecond => { - cast_timestamp_to_string::(array, tz) - } - TimeUnit::Second => { - cast_timestamp_to_string::(array, tz) - } - }, + Timestamp(TimeUnit::Nanosecond, tz) => { + cast_timestamp_to_string::(array, tz) + } + Timestamp(TimeUnit::Microsecond, tz) => { + cast_timestamp_to_string::(array, tz) + } + Timestamp(TimeUnit::Millisecond, tz) => { + cast_timestamp_to_string::(array, tz) + } + Timestamp(TimeUnit::Second, tz) => { + cast_timestamp_to_string::(array, tz) + } Date32 => cast_date32_to_string::(array), Date64 => cast_date64_to_string::(array), Binary => { @@ -1160,167 +1173,146 @@ pub fn cast_with_options( // end numeric casts // temporal casts - (Int32, Date32) => cast_array_data::(array, to_type.clone()), + (Int32, Date32) => cast_reinterpret_arrays::(array), (Int32, Date64) => cast_with_options( - &cast_with_options(array, &DataType::Date32, cast_options)?, - &DataType::Date64, + &cast_with_options(array, &Date32, cast_options)?, + &Date64, cast_options, ), (Int32, Time32(TimeUnit::Second)) => { - cast_array_data::(array, to_type.clone()) + cast_reinterpret_arrays::(array) } (Int32, Time32(TimeUnit::Millisecond)) => { - cast_array_data::(array, to_type.clone()) + cast_reinterpret_arrays::(array) } // No support for microsecond/nanosecond with i32 - (Date32, Int32) => cast_array_data::(array, to_type.clone()), + (Date32, Int32) => cast_reinterpret_arrays::(array), (Date32, Int64) => cast_with_options( - &cast_with_options(array, &DataType::Int32, cast_options)?, - &DataType::Int64, + &cast_with_options(array, &Int32, cast_options)?, + &Int64, cast_options, ), - (Time32(_), Int32) => cast_array_data::(array, to_type.clone()), - (Int64, Date64) => cast_array_data::(array, to_type.clone()), + (Time32(TimeUnit::Second), Int32) => { + cast_reinterpret_arrays::(array) + } + (Time32(TimeUnit::Millisecond), Int32) => { + cast_reinterpret_arrays::(array) + } + (Int64, Date64) => cast_reinterpret_arrays::(array), (Int64, Date32) => cast_with_options( - &cast_with_options(array, &DataType::Int32, 
cast_options)?, - &DataType::Date32, + &cast_with_options(array, &Int32, cast_options)?, + &Date32, cast_options, ), // No support for second/milliseconds with i64 (Int64, Time64(TimeUnit::Microsecond)) => { - cast_array_data::(array, to_type.clone()) + cast_reinterpret_arrays::(array) } (Int64, Time64(TimeUnit::Nanosecond)) => { - cast_array_data::(array, to_type.clone()) + cast_reinterpret_arrays::(array) } - (Date64, Int64) => cast_array_data::(array, to_type.clone()), + (Date64, Int64) => cast_reinterpret_arrays::(array), (Date64, Int32) => cast_with_options( - &cast_with_options(array, &DataType::Int64, cast_options)?, - &DataType::Int32, + &cast_with_options(array, &Int64, cast_options)?, + &Int32, cast_options, ), - (Time64(_), Int64) => cast_array_data::(array, to_type.clone()), - (Date32, Date64) => { - let date_array = array.as_any().downcast_ref::().unwrap(); - - let values = - unary::<_, _, Date64Type>(date_array, |x| x as i64 * MILLISECONDS_IN_DAY); - - Ok(Arc::new(values) as ArrayRef) + (Time64(TimeUnit::Microsecond), Int64) => { + cast_reinterpret_arrays::(array) } - (Date64, Date32) => { - let date_array = array.as_any().downcast_ref::().unwrap(); - - let values = unary::<_, _, Date32Type>(date_array, |x| { - (x / MILLISECONDS_IN_DAY) as i32 - }); - - Ok(Arc::new(values) as ArrayRef) + (Time64(TimeUnit::Nanosecond), Int64) => { + cast_reinterpret_arrays::(array) } - (Time32(TimeUnit::Second), Time32(TimeUnit::Millisecond)) => { - let time_array = array.as_any().downcast_ref::().unwrap(); + (Date32, Date64) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Date64Type>(|x| x as i64 * MILLISECONDS_IN_DAY), + )), + (Date64, Date32) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Date32Type>(|x| (x / MILLISECONDS_IN_DAY) as i32), + )), - let values = unary::<_, _, Time32MillisecondType>(time_array, |x| { - x * MILLISECONDS as i32 - }); + (Time32(TimeUnit::Second), Time32(TimeUnit::Millisecond)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Time32MillisecondType>(|x| x * MILLISECONDS as i32), + )), + (Time32(TimeUnit::Second), Time64(TimeUnit::Microsecond)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Time64MicrosecondType>(|x| x as i64 * MICROSECONDS), + )), + (Time32(TimeUnit::Second), Time64(TimeUnit::Nanosecond)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Time64NanosecondType>(|x| x as i64 * NANOSECONDS), + )), - Ok(Arc::new(values) as ArrayRef) - } - (Time32(TimeUnit::Millisecond), Time32(TimeUnit::Second)) => { - let time_array = array - .as_any() - .downcast_ref::() - .unwrap(); + (Time32(TimeUnit::Millisecond), Time32(TimeUnit::Second)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Time32SecondType>(|x| x / MILLISECONDS as i32), + )), + (Time32(TimeUnit::Millisecond), Time64(TimeUnit::Microsecond)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Time64MicrosecondType>(|x| { + x as i64 * (MICROSECONDS / MILLISECONDS) + }), + )), + (Time32(TimeUnit::Millisecond), Time64(TimeUnit::Nanosecond)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Time64NanosecondType>(|x| { + x as i64 * (MICROSECONDS / NANOSECONDS) + }), + )), - let values = unary::<_, _, Time32SecondType>(time_array, |x| { - x / (MILLISECONDS as i32) - }); + (Time64(TimeUnit::Microsecond), Time32(TimeUnit::Second)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Time32SecondType>(|x| (x / MICROSECONDS) as i32), + )), + (Time64(TimeUnit::Microsecond), Time32(TimeUnit::Millisecond)) => Ok(Arc::new( 
+ as_primitive_array::(array) + .unary::<_, Time32MillisecondType>(|x| { + (x / (MICROSECONDS / MILLISECONDS)) as i32 + }), + )), + (Time64(TimeUnit::Microsecond), Time64(TimeUnit::Nanosecond)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Time64NanosecondType>(|x| x * (NANOSECONDS / MICROSECONDS)), + )), - Ok(Arc::new(values) as ArrayRef) - } - //(Time32(TimeUnit::Second), Time64(_)) => {}, - (Time32(from_unit), Time64(to_unit)) => { - let time_array = Int32Array::from(array.data().clone()); - // note: (numeric_cast + SIMD multiply) is faster than (cast & multiply) - let c: Int64Array = numeric_cast(&time_array); - let from_size = time_unit_multiple(from_unit); - let to_size = time_unit_multiple(to_unit); - // from is only smaller than to if 64milli/64second don't exist - let mult = Int64Array::from(vec![to_size / from_size; array.len()]); - let converted = multiply(&c, &mult)?; - let array_ref = Arc::new(converted) as ArrayRef; - use TimeUnit::*; - match to_unit { - Microsecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), - Nanosecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), - _ => unreachable!("array type not supported"), - } - } - (Time64(TimeUnit::Microsecond), Time64(TimeUnit::Nanosecond)) => { - let time_array = array - .as_any() - .downcast_ref::() - .unwrap(); + (Time64(TimeUnit::Nanosecond), Time32(TimeUnit::Second)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Time32SecondType>(|x| (x / NANOSECONDS) as i32), + )), + (Time64(TimeUnit::Nanosecond), Time32(TimeUnit::Millisecond)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Time32MillisecondType>(|x| { + (x / (NANOSECONDS / MILLISECONDS)) as i32 + }), + )), + (Time64(TimeUnit::Nanosecond), Time64(TimeUnit::Microsecond)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Time64MicrosecondType>(|x| x / (NANOSECONDS / MICROSECONDS)), + )), - let values = - unary::<_, _, Time64NanosecondType>(time_array, |x| x * MILLISECONDS); - Ok(Arc::new(values) as ArrayRef) + (Timestamp(TimeUnit::Second, _), Int64) => { + cast_reinterpret_arrays::(array) } - (Time64(TimeUnit::Nanosecond), Time64(TimeUnit::Microsecond)) => { - let time_array = array - .as_any() - .downcast_ref::() - .unwrap(); - - let values = - unary::<_, _, Time64MicrosecondType>(time_array, |x| x / MILLISECONDS); - Ok(Arc::new(values) as ArrayRef) + (Timestamp(TimeUnit::Millisecond, _), Int64) => { + cast_reinterpret_arrays::(array) } - (Time64(from_unit), Time32(to_unit)) => { - let time_array = Int64Array::from(array.data().clone()); - let from_size = time_unit_multiple(from_unit); - let to_size = time_unit_multiple(to_unit); - let divisor = from_size / to_size; - match to_unit { - TimeUnit::Second => { - let values = unary::<_, _, Time32SecondType>(&time_array, |x| { - (x as i64 / divisor) as i32 - }); - Ok(Arc::new(values) as ArrayRef) - } - TimeUnit::Millisecond => { - let values = unary::<_, _, Time32MillisecondType>(&time_array, |x| { - (x as i64 / divisor) as i32 - }); - Ok(Arc::new(values) as ArrayRef) - } - _ => unreachable!("array type not supported"), - } + (Timestamp(TimeUnit::Microsecond, _), Int64) => { + cast_reinterpret_arrays::(array) } - (Timestamp(_, _), Int64) => cast_array_data::(array, to_type.clone()), - (Int64, Timestamp(to_unit, _)) => { - use TimeUnit::*; - match to_unit { - Second => cast_array_data::(array, to_type.clone()), - Millisecond => { - cast_array_data::(array, to_type.clone()) - } - Microsecond => { - cast_array_data::(array, to_type.clone()) - } - 
Nanosecond => { - cast_array_data::(array, to_type.clone()) - } - } + (Timestamp(TimeUnit::Nanosecond, _), Int64) => { + cast_reinterpret_arrays::(array) } - (Timestamp(from_unit, _), Timestamp(to_unit, _)) => { + + (Int64, Timestamp(unit, tz)) => Ok(make_timestamp_array( + as_primitive_array(array), + unit.clone(), + tz.clone(), + )), + + (Timestamp(from_unit, _), Timestamp(to_unit, to_tz)) => { let time_array = Int64Array::from(array.data().clone()); let from_size = time_unit_multiple(from_unit); let to_size = time_unit_multiple(to_unit); @@ -1329,30 +1321,13 @@ pub fn cast_with_options( let converted = if from_size >= to_size { divide_scalar(&time_array, from_size / to_size)? } else { - multiply( - &time_array, - &Int64Array::from(vec![to_size / from_size; array.len()]), - )? + multiply_scalar(&time_array, to_size / from_size)? }; - let array_ref = Arc::new(converted) as ArrayRef; - use TimeUnit::*; - match to_unit { - Second => { - cast_array_data::(&array_ref, to_type.clone()) - } - Millisecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), - Microsecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), - Nanosecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), - } + Ok(make_timestamp_array( + &converted, + to_unit.clone(), + to_tz.clone(), + )) } (Timestamp(from_unit, _), Date32) => { let time_array = Int64Array::from(array.data().clone()); @@ -1371,80 +1346,61 @@ pub fn cast_with_options( Ok(Arc::new(b.finish()) as ArrayRef) } - (Timestamp(from_unit, _), Date64) => { - let from_size = time_unit_multiple(from_unit); - let to_size = MILLISECONDS; - - // Scale time_array by (to_size / from_size) using a - // single integer operation, but need to avoid integer - // math rounding down to zero - - match to_size.cmp(&from_size) { - std::cmp::Ordering::Less => { - let time_array = Date64Array::from(array.data().clone()); - Ok(Arc::new(divide( - &time_array, - &Date64Array::from(vec![from_size / to_size; array.len()]), - )?) as ArrayRef) - } - std::cmp::Ordering::Equal => { - cast_array_data::(array, to_type.clone()) - } - std::cmp::Ordering::Greater => { - let time_array = Date64Array::from(array.data().clone()); - Ok(Arc::new(multiply( - &time_array, - &Date64Array::from(vec![to_size / from_size; array.len()]), - )?) 
as ArrayRef) - } - } + (Timestamp(TimeUnit::Second, _), Date64) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Date64Type>(|x| x * MILLISECONDS), + )), + (Timestamp(TimeUnit::Millisecond, _), Date64) => { + cast_reinterpret_arrays::(array) } + (Timestamp(TimeUnit::Microsecond, _), Date64) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Date64Type>(|x| x / (MICROSECONDS / MILLISECONDS)), + )), + (Timestamp(TimeUnit::Nanosecond, _), Date64) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, Date64Type>(|x| x / (NANOSECONDS / MILLISECONDS)), + )), + // date64 to timestamp might not make sense, - (Int64, Duration(to_unit)) => { - use TimeUnit::*; - match to_unit { - Second => cast_array_data::(array, to_type.clone()), - Millisecond => { - cast_array_data::(array, to_type.clone()) - } - Microsecond => { - cast_array_data::(array, to_type.clone()) - } - Nanosecond => { - cast_array_data::(array, to_type.clone()) - } - } + (Int64, Duration(TimeUnit::Second)) => { + cast_reinterpret_arrays::(array) + } + (Int64, Duration(TimeUnit::Millisecond)) => { + cast_reinterpret_arrays::(array) + } + (Int64, Duration(TimeUnit::Microsecond)) => { + cast_reinterpret_arrays::(array) + } + (Int64, Duration(TimeUnit::Nanosecond)) => { + cast_reinterpret_arrays::(array) + } + + (Duration(TimeUnit::Second), Int64) => { + cast_reinterpret_arrays::(array) + } + (Duration(TimeUnit::Millisecond), Int64) => { + cast_reinterpret_arrays::(array) + } + (Duration(TimeUnit::Microsecond), Int64) => { + cast_reinterpret_arrays::(array) + } + (Duration(TimeUnit::Nanosecond), Int64) => { + cast_reinterpret_arrays::(array) + } + + (Interval(IntervalUnit::YearMonth), Int64) => { + cast_numeric_arrays::(array, cast_options) + } + (Interval(IntervalUnit::DayTime), Int64) => { + cast_reinterpret_arrays::(array) + } + (Int32, Interval(IntervalUnit::YearMonth)) => { + cast_reinterpret_arrays::(array) + } + (Int64, Interval(IntervalUnit::DayTime)) => { + cast_reinterpret_arrays::(array) } - (Duration(_), Int64) => cast_array_data::(array, to_type.clone()), - (Interval(from_type), Int64) => match from_type { - IntervalUnit::YearMonth => cast_numeric_arrays::< - IntervalYearMonthType, - Int64Type, - >(array, cast_options), - IntervalUnit::DayTime => cast_array_data::(array, to_type.clone()), - IntervalUnit::MonthDayNano => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, - ))), - }, - (Int32, Interval(to_type)) => match to_type { - IntervalUnit::YearMonth => { - cast_array_data::(array, Interval(to_type.clone())) - } - _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, - ))), - }, - (Int64, Interval(to_type)) => match to_type { - IntervalUnit::DayTime => { - cast_array_data::(array, Interval(to_type.clone())) - } - _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, - ))), - }, (_, _) => Err(ArrowError::CastError(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, @@ -1617,33 +1573,6 @@ fn cast_decimal_to_decimal( } } -/// Cast an array by changing its array_data type to the desired type -/// -/// Arrays should have the same primitive data type, otherwise this should fail. -/// We do not perform this check on primitive data types as we only use this -/// function internally, where it is guaranteed to be infallible. 
-fn cast_array_data(array: &ArrayRef, to_type: DataType) -> Result -where - TO: ArrowNumericType, -{ - let data = unsafe { - ArrayData::new_unchecked( - to_type, - array.len(), - Some(array.null_count()), - array - .data() - .null_bitmap() - .cloned() - .map(|bitmap| bitmap.into_buffer()), - array.data().offset(), - array.data().buffers().to_vec(), - vec![], - ) - }; - Ok(Arc::new(PrimitiveArray::::from(data)) as ArrayRef) -} - /// Convert Array into a PrimitiveArray of type, and apply numeric cast fn cast_numeric_arrays( from: &ArrayRef, @@ -1652,8 +1581,8 @@ fn cast_numeric_arrays( where FROM: ArrowNumericType, TO: ArrowNumericType, - FROM::Native: num::NumCast, - TO::Native: num::NumCast, + FROM::Native: NumCast, + TO::Native: NumCast, { if cast_options.safe { // If the value can't be casted to the `TO::Native`, return null @@ -1678,8 +1607,8 @@ fn try_numeric_cast(from: &PrimitiveArray) -> Result> where T: ArrowNumericType, R: ArrowNumericType, - T::Native: num::NumCast, - R::Native: num::NumCast, + T::Native: NumCast, + R::Native: NumCast, { try_unary(from, |value| { num::cast::cast::(value).ok_or_else(|| { @@ -1698,8 +1627,8 @@ fn numeric_cast(from: &PrimitiveArray) -> PrimitiveArray where T: ArrowNumericType, R: ArrowNumericType, - T::Native: num::NumCast, - R::Native: num::NumCast, + T::Native: NumCast, + R::Native: NumCast, { let iter = from .iter() From 97480a97ec50c1df8c0dc31338216d808a6eb4e3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 8 Oct 2022 10:16:09 +0100 Subject: [PATCH 0117/1411] Simplify filter_dict (#2831) --- arrow/src/compute/kernels/filter.rs | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/arrow/src/compute/kernels/filter.rs b/arrow/src/compute/kernels/filter.rs index d528b0632486..d1e2ad17593d 100644 --- a/arrow/src/compute/kernels/filter.rs +++ b/arrow/src/compute/kernels/filter.rs @@ -660,22 +660,15 @@ where T: ArrowPrimitiveType, T::Native: num::Num, { - let filtered_keys = filter_primitive::(array.keys(), predicate); - let filtered_data = filtered_keys.data_ref(); - - let data = unsafe { - ArrayData::new_unchecked( - array.data_type().clone(), - filtered_data.len(), - Some(filtered_data.null_count()), - filtered_data.null_buffer().cloned(), - filtered_data.offset(), - filtered_data.buffers().to_vec(), - array.data().child_data().to_vec(), - ) - }; - - DictionaryArray::from(data) + let builder = filter_primitive::(array.keys(), predicate) + .into_data() + .into_builder() + .data_type(array.data_type().clone()) + .child_data(array.data().child_data().to_vec()); + + // SAFETY: + // Keys were valid before, filtered subset is therefore still valid + DictionaryArray::from(unsafe { builder.build_unchecked() }) } #[cfg(test)] From c8321f47ebfb146f9e90d597d5c30ea993e2b533 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 8 Oct 2022 12:29:01 +0100 Subject: [PATCH 0118/1411] Simplify ArrowNativeType (#2841) * Simplify ArrowNativeType * Update tests --- arrow/src/compute/kernels/aggregate.rs | 9 +- arrow/src/compute/kernels/arithmetic.rs | 98 +++++----- arrow/src/datatypes/native.rs | 228 ++++++++++++++---------- 3 files changed, 187 insertions(+), 148 deletions(-) diff --git a/arrow/src/compute/kernels/aggregate.rs b/arrow/src/compute/kernels/aggregate.rs index e6c927c24868..4e726974f66c 100644 --- a/arrow/src/compute/kernels/aggregate.rs +++ b/arrow/src/compute/kernels/aggregate.rs @@ -391,9 +391,8 @@ where 
mod simd { use super::is_nan; use crate::array::{Array, PrimitiveArray}; - use crate::datatypes::ArrowNumericType; + use crate::datatypes::{ArrowNativeTypeOp, ArrowNumericType}; use std::marker::PhantomData; - use std::ops::Add; pub(super) trait SimdAggregate { type ScalarAccumulator; @@ -434,7 +433,7 @@ mod simd { impl SimdAggregate for SumAggregate where - T::Native: Add, + T::Native: ArrowNativeTypeOp, { type ScalarAccumulator = T::Native; type SimdAccumulator = T::Simd; @@ -463,7 +462,7 @@ mod simd { } fn accumulate_scalar(accumulator: &mut T::Native, value: T::Native) { - *accumulator = *accumulator + value + *accumulator = accumulator.add_wrapping(value) } fn reduce( @@ -738,7 +737,7 @@ mod simd { #[cfg(feature = "simd")] pub fn sum(array: &PrimitiveArray) -> Option where - T::Native: Add, + T::Native: ArrowNativeTypeOp, { use simd::*; diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index fe222c3d15d2..a73ee7eee151 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -22,9 +22,7 @@ //! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. -use std::ops::{Div, Neg}; - -use num::{One, Zero}; +use std::ops::Neg; use crate::array::*; #[cfg(feature = "simd")] @@ -107,7 +105,6 @@ fn math_checked_divide_op( where LT: ArrowNumericType, RT: ArrowNumericType, - RT::Native: One + Zero, F: Fn(LT::Native, RT::Native) -> Result, { try_binary(left, right, op) @@ -131,7 +128,6 @@ fn math_checked_divide_op_on_iters( ) -> Result> where T: ArrowNumericType, - T::Native: One + Zero, F: Fn(T::Native, T::Native) -> Result, { let buffer = if null_bit_buffer.is_some() { @@ -182,10 +178,10 @@ fn simd_checked_modulus( right: T::Simd, ) -> Result where - T::Native: ArrowNativeTypeOp + One, + T::Native: ArrowNativeTypeOp, { - let zero = T::init(T::Native::zero()); - let one = T::init(T::Native::one()); + let zero = T::init(T::Native::ZERO); + let one = T::init(T::Native::ONE); let right_no_invalid_zeros = match valid_mask { Some(mask) => { @@ -219,10 +215,10 @@ fn simd_checked_divide( right: T::Simd, ) -> Result where - T::Native: One + Zero, + T::Native: ArrowNativeTypeOp, { - let zero = T::init(T::Native::zero()); - let one = T::init(T::Native::one()); + let zero = T::init(T::Native::ZERO); + let one = T::init(T::Native::ONE); let right_no_invalid_zeros = match valid_mask { Some(mask) => { @@ -260,7 +256,7 @@ fn simd_checked_divide_op_remainder( ) -> Result<()> where T: ArrowNumericType, - T::Native: Zero, + T::Native: ArrowNativeTypeOp, F: Fn(T::Native, T::Native) -> T::Native, { let result_remainder = result_chunks.into_remainder(); @@ -273,7 +269,7 @@ where .enumerate() .try_for_each(|(i, (result_scalar, (left_scalar, right_scalar)))| { if valid_mask.map(|mask| mask & (1 << i) != 0).unwrap_or(true) { - if *right_scalar == T::Native::zero() { + if right_scalar.is_zero() { return Err(ArrowError::DivideByZero); } *result_scalar = op(*left_scalar, *right_scalar); @@ -648,7 +644,6 @@ fn math_divide_checked_op_dict( where K: ArrowNumericType, T: ArrowNumericType, - T::Native: One + Zero, F: Fn(T::Native, T::Native) -> Result, { if left.len() != right.len() { @@ -702,7 +697,6 @@ fn math_divide_safe_op_dict( where K: ArrowNumericType, T: ArrowNumericType, - T::Native: One + Zero, F: Fn(T::Native, T::Native) -> Option, { let left = left.downcast_dict::>().unwrap(); @@ -719,7 +713,6 @@ fn math_safe_divide_op( where LT: 
ArrowNumericType, RT: ArrowNumericType, - RT::Native: One + Zero, F: Fn(LT::Native, RT::Native) -> Option, { let array: PrimitiveArray = binary_opt::<_, _, _, LT>(left, right, op)?; @@ -1068,8 +1061,8 @@ pub fn subtract_scalar( scalar: T::Native, ) -> Result> where - T: datatypes::ArrowNumericType, - T::Native: ArrowNativeTypeOp + Zero, + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { Ok(unary(array, |value| value.sub_wrapping(scalar))) } @@ -1085,7 +1078,7 @@ pub fn subtract_scalar_checked( ) -> Result> where T: ArrowNumericType, - T::Native: ArrowNativeTypeOp + Zero, + T::Native: ArrowNativeTypeOp, { try_unary(array, |value| value.sub_checked(scalar)) } @@ -1125,7 +1118,7 @@ where /// Perform `-` operation on an array. If value is null then the result is also null. pub fn negate(array: &PrimitiveArray) -> Result> where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: Neg, { Ok(unary(array, |x| -x)) @@ -1239,7 +1232,7 @@ pub fn multiply_scalar( ) -> Result> where T: datatypes::ArrowNumericType, - T::Native: ArrowNativeTypeOp + Zero + One, + T::Native: ArrowNativeTypeOp, { Ok(unary(array, |value| value.mul_wrapping(scalar))) } @@ -1255,7 +1248,7 @@ pub fn multiply_scalar_checked( ) -> Result> where T: ArrowNumericType, - T::Native: ArrowNativeTypeOp + Zero + One, + T::Native: ArrowNativeTypeOp, { try_unary(array, |value| value.mul_checked(scalar)) } @@ -1301,11 +1294,11 @@ pub fn modulus( ) -> Result> where T: ArrowNumericType, - T::Native: ArrowNativeTypeOp + One, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] return simd_checked_divide_op(&left, &right, simd_checked_modulus::, |a, b| { - a % b + a.mod_wrapping(b) }); #[cfg(not(feature = "simd"))] return try_binary(left, right, |a, b| { @@ -1328,11 +1321,13 @@ pub fn divide_checked( right: &PrimitiveArray, ) -> Result> where - T: datatypes::ArrowNumericType, - T::Native: ArrowNativeTypeOp + Zero + One, + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] - return simd_checked_divide_op(&left, &right, simd_checked_divide::, |a, b| a / b); + return simd_checked_divide_op(&left, &right, simd_checked_divide::, |a, b| { + a.div_wrapping(b) + }); #[cfg(not(feature = "simd"))] return math_checked_divide_op(left, right, |a, b| a.div_checked(b)); } @@ -1343,16 +1338,21 @@ where /// If any right hand value is zero, the operation value will be replaced with null in the /// result. /// -/// Unlike `divide` or `divide_checked`, division by zero will get a null value instead -/// returning an `Err`, this also doesn't check overflowing, overflowing will just wrap -/// the result around. +/// Unlike [`divide`] or [`divide_checked`], division by zero will yield a null value in the +/// result instead of returning an `Err`. +/// +/// For floating point types overflow will saturate at INF or -INF +/// preserving the expected sign value. +/// +/// For integer types overflow will wrap around. +/// pub fn divide_opt( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result> where T: ArrowNumericType, - T::Native: ArrowNativeTypeOp + Zero + One, + T::Native: ArrowNativeTypeOp, { binary_opt(left, right, |a, b| { if b.is_zero() { @@ -1480,12 +1480,16 @@ pub fn divide_dyn_opt(left: &dyn Array, right: &dyn Array) -> Result { } } -/// Perform `left / right` operation on two arrays without checking for division by zero. -/// For floating point types, the result of dividing by zero follows normal floating point -/// rules. 
For other numeric types, dividing by zero will panic, -/// If either left or right value is null then the result is also null. If any right hand value is zero then the result of this +/// Perform `left / right` operation on two arrays without checking for +/// division by zero or overflow. +/// +/// For floating point types, overflow and division by zero follows normal floating point rules +/// +/// For integer types overflow will wrap around. Division by zero will currently panic, although +/// this may be subject to change see +/// +/// If either left or right value is null then the result is also null. /// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `divide_checked` instead. pub fn divide( left: &PrimitiveArray, @@ -1495,6 +1499,8 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { + // TODO: This is incorrect as div_wrapping has side-effects for integer types + // and so may panic on null values (#2647) math_op(left, right, |a, b| a.div_wrapping(b)) } @@ -1525,12 +1531,12 @@ pub fn divide_scalar( ) -> Result> where T: ArrowNumericType, - T::Native: Div + Zero, + T::Native: ArrowNativeTypeOp, { if divisor.is_zero() { return Err(ArrowError::DivideByZero); } - Ok(unary(array, |a| a / divisor)) + Ok(unary(array, |a| a.div_wrapping(divisor))) } /// Divide every value in an array by a scalar. If any value in the array is null then the @@ -1543,7 +1549,7 @@ where pub fn divide_scalar_dyn(array: &dyn Array, divisor: T::Native) -> Result where T: ArrowNumericType, - T::Native: ArrowNativeTypeOp + Zero, + T::Native: ArrowNativeTypeOp, { if divisor.is_zero() { return Err(ArrowError::DivideByZero); @@ -1564,7 +1570,7 @@ pub fn divide_scalar_checked_dyn( ) -> Result where T: ArrowNumericType, - T::Native: ArrowNativeTypeOp + Zero, + T::Native: ArrowNativeTypeOp, { if divisor.is_zero() { return Err(ArrowError::DivideByZero); @@ -1587,7 +1593,7 @@ where pub fn divide_scalar_opt_dyn(array: &dyn Array, divisor: T::Native) -> Result where T: ArrowNumericType, - T::Native: ArrowNativeTypeOp + Zero, + T::Native: ArrowNativeTypeOp, { if divisor.is_zero() { match array.data_type() { @@ -2139,7 +2145,6 @@ mod tests { } #[test] - #[cfg(not(feature = "simd"))] fn test_int_array_modulus_overflow_wrapping() { let a = Int32Array::from(vec![i32::MIN]); let b = Int32Array::from(vec![-1]); @@ -2147,15 +2152,6 @@ mod tests { assert_eq!(0, result.value(0)) } - #[test] - #[cfg(feature = "simd")] - #[should_panic(expected = "attempt to calculate the remainder with overflow")] - fn test_int_array_modulus_overflow_panic() { - let a = Int32Array::from(vec![i32::MIN]); - let b = Int32Array::from(vec![-1]); - let _ = modulus(&a, &b).unwrap(); - } - #[test] fn test_primitive_array_divide_scalar() { let a = Int32Array::from(vec![15, 14, 9, 8, 1]); diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 2ff9574c78a6..444ba39e0b6d 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -19,110 +19,72 @@ use crate::error::{ArrowError, Result}; pub use arrow_array::ArrowPrimitiveType; pub use arrow_buffer::{ArrowNativeType, ToByteSlice}; use half::f16; -use num::Zero; -use std::ops::{Add, Div, Mul, Rem, Sub}; -mod private { - pub trait Sealed {} -} - -/// Trait for ArrowNativeType to provide overflow-checking and non-overflow-checking -/// variants for arithmetic operations. For floating point types, this provides some -/// default implementations. 
Integer types that need to deal with overflow can implement -/// this trait. +/// Trait for [`ArrowNativeType`] that adds checked and unchecked arithmetic operations, +/// and totally ordered comparison operations /// -/// The APIs with `_wrapping` suffix are the variant of non-overflow-checking. If overflow -/// occurred, they will supposedly wrap around the boundary of the type. +/// The APIs with `_wrapping` suffix do not perform overflow-checking. For integer +/// types they will wrap around the boundary of the type. For floating point types they +/// will overflow to INF or -INF preserving the expected sign value /// -/// The APIs with `_checked` suffix are the variant of overflow-checking which return `None` -/// if overflow occurred. -pub trait ArrowNativeTypeOp: - ArrowNativeType - + Add - + Sub - + Mul - + Div - + Rem - + Zero - + private::Sealed -{ - fn add_checked(self, rhs: Self) -> Result { - Ok(self + rhs) - } - - fn add_wrapping(self, rhs: Self) -> Self { - self + rhs - } - - fn sub_checked(self, rhs: Self) -> Result { - Ok(self - rhs) - } - - fn sub_wrapping(self, rhs: Self) -> Self { - self - rhs - } - - fn mul_checked(self, rhs: Self) -> Result { - Ok(self * rhs) - } - - fn mul_wrapping(self, rhs: Self) -> Self { - self * rhs - } - - fn div_checked(self, rhs: Self) -> Result { - if rhs.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(self / rhs) - } - } +/// Note `div_wrapping` and `mod_wrapping` will panic for integer types if `rhs` is zero +/// although this may be subject to change +/// +/// The APIs with `_checked` suffix perform overflow-checking. For integer types +/// these will return `Err` instead of wrapping. For floating point types they will +/// overflow to INF or -INF preserving the expected sign value +/// +/// Comparison of integer types is as per normal integer comparison rules, floating +/// point values are compared as per IEEE 754's totalOrder predicate see [`f32::total_cmp`] +/// +pub trait ArrowNativeTypeOp: ArrowNativeType { + /// The additive identity + const ZERO: Self; - fn div_wrapping(self, rhs: Self) -> Self { - self / rhs - } + /// The multiplicative identity + const ONE: Self; - fn mod_checked(self, rhs: Self) -> Result { - if rhs.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(self % rhs) - } - } + fn add_checked(self, rhs: Self) -> Result; + + fn add_wrapping(self, rhs: Self) -> Self; + + fn sub_checked(self, rhs: Self) -> Result; + + fn sub_wrapping(self, rhs: Self) -> Self; - fn mod_wrapping(self, rhs: Self) -> Self { - self % rhs - } + fn mul_checked(self, rhs: Self) -> Result; - fn is_eq(self, rhs: Self) -> bool { - self == rhs - } + fn mul_wrapping(self, rhs: Self) -> Self; - fn is_ne(self, rhs: Self) -> bool { - self != rhs - } + fn div_checked(self, rhs: Self) -> Result; - fn is_lt(self, rhs: Self) -> bool { - self < rhs - } + fn div_wrapping(self, rhs: Self) -> Self; - fn is_le(self, rhs: Self) -> bool { - self <= rhs - } + fn mod_checked(self, rhs: Self) -> Result; - fn is_gt(self, rhs: Self) -> bool { - self > rhs - } + fn mod_wrapping(self, rhs: Self) -> Self; - fn is_ge(self, rhs: Self) -> bool { - self >= rhs - } + fn is_zero(self) -> bool; + + fn is_eq(self, rhs: Self) -> bool; + + fn is_ne(self, rhs: Self) -> bool; + + fn is_lt(self, rhs: Self) -> bool; + + fn is_le(self, rhs: Self) -> bool; + + fn is_gt(self, rhs: Self) -> bool; + + fn is_ge(self, rhs: Self) -> bool; } macro_rules! 
native_type_op { ($t:tt) => { - impl private::Sealed for $t {} impl ArrowNativeTypeOp for $t { + const ZERO: Self = 0; + const ONE: Self = 1; + fn add_checked(self, rhs: Self) -> Result { self.checked_add(rhs).ok_or_else(|| { ArrowError::ComputeError(format!( @@ -195,6 +157,34 @@ macro_rules! native_type_op { fn mod_wrapping(self, rhs: Self) -> Self { self.wrapping_rem(rhs) } + + fn is_zero(self) -> bool { + self == 0 + } + + fn is_eq(self, rhs: Self) -> bool { + self == rhs + } + + fn is_ne(self, rhs: Self) -> bool { + self != rhs + } + + fn is_lt(self, rhs: Self) -> bool { + self < rhs + } + + fn is_le(self, rhs: Self) -> bool { + self <= rhs + } + + fn is_gt(self, rhs: Self) -> bool { + self > rhs + } + + fn is_ge(self, rhs: Self) -> bool { + self >= rhs + } } }; } @@ -210,9 +200,63 @@ native_type_op!(u32); native_type_op!(u64); macro_rules! native_type_float_op { - ($t:tt) => { - impl private::Sealed for $t {} + ($t:tt, $zero:expr, $one:expr) => { impl ArrowNativeTypeOp for $t { + const ZERO: Self = $zero; + const ONE: Self = $one; + + fn add_checked(self, rhs: Self) -> Result { + Ok(self + rhs) + } + + fn add_wrapping(self, rhs: Self) -> Self { + self + rhs + } + + fn sub_checked(self, rhs: Self) -> Result { + Ok(self - rhs) + } + + fn sub_wrapping(self, rhs: Self) -> Self { + self - rhs + } + + fn mul_checked(self, rhs: Self) -> Result { + Ok(self * rhs) + } + + fn mul_wrapping(self, rhs: Self) -> Self { + self * rhs + } + + fn div_checked(self, rhs: Self) -> Result { + if rhs.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(self / rhs) + } + } + + fn div_wrapping(self, rhs: Self) -> Self { + self / rhs + } + + fn mod_checked(self, rhs: Self) -> Result { + if rhs.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(self % rhs) + } + } + + fn mod_wrapping(self, rhs: Self) -> Self { + self % rhs + } + + fn is_zero(self) -> bool { + self == $zero + } + fn is_eq(self, rhs: Self) -> bool { self.total_cmp(&rhs).is_eq() } @@ -240,6 +284,6 @@ macro_rules! native_type_float_op { }; } -native_type_float_op!(f16); -native_type_float_op!(f32); -native_type_float_op!(f64); +native_type_float_op!(f16, f16::ONE, f16::ZERO); +native_type_float_op!(f32, 0., 1.); +native_type_float_op!(f64, 0., 1.); From 2ae23093c3f7edc278fe6daf57daf167c430143b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 9 Oct 2022 08:45:06 +0100 Subject: [PATCH 0119/1411] Simplify downcast_primitive_array (#2809) --- arrow-array/src/cast.rs | 442 ++++++++++------------------------------ 1 file changed, 105 insertions(+), 337 deletions(-) diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 653836b8d4e4..6eb5407966f1 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -20,6 +20,18 @@ use crate::array::*; use crate::types::*; +/// Repeats the provided pattern based on the number of comma separated identifiers +#[doc(hidden)] +#[macro_export] +macro_rules! repeat_pat { + ($e:pat, $v_:ident) => { + $e + }; + ($e:pat, $v_:ident $(, $tail:ident)+) => { + ($e, $crate::repeat_pat!($e $(, $tail)+)) + } +} + /// Downcast an [`Array`] to a [`PrimitiveArray`] based on its [`DataType`] /// accepts a number of subsequent patterns to match the data type /// @@ -50,427 +62,183 @@ macro_rules! 
downcast_primitive_array { ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { downcast_primitive_array!($values => {$e} $($p => $fallback)*) }; - - ($values:ident => $e:block $($p:pat => $fallback:expr $(,)*)*) => { - match $values.data_type() { - arrow_schema::DataType::Int8 => { - let $values = $crate::cast::as_primitive_array::< + (($($values:ident),+) => $e:block $($($p:pat),+ => $fallback:expr $(,)*)*) => { + $crate::downcast_primitive_array!($($values),+ => $e $($($p),+ => $fallback)*) + }; + (($($values:ident),+) => $e:block $(($($p:pat),+) => $fallback:expr $(,)*)*) => { + $crate::downcast_primitive_array!($($values),+ => $e $($($p),+ => $fallback)*) + }; + ($($values:ident),+ => $e:block $($($p:pat),+ => $fallback:expr $(,)*)*) => { + match ($($values.data_type()),+) { + $crate::repeat_pat!(arrow_schema::DataType::Int8, $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::Int8Type, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Int16 => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Int16, $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::Int16Type, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Int32 => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Int32, $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::Int32Type, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Int64 => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Int64, $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::Int64Type, - >($values); + >($values);)+ $e } - arrow_schema::DataType::UInt8 => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::UInt8, $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::UInt8Type, - >($values); + >($values);)+ $e } - arrow_schema::DataType::UInt16 => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::UInt16, $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::UInt16Type, - >($values); + >($values);)+ $e } - arrow_schema::DataType::UInt32 => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::UInt32, $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::UInt32Type, - >($values); + >($values);)+ $e } - arrow_schema::DataType::UInt64 => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::UInt64, $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::UInt64Type, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Float16 => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Float16, $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::Float16Type, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Float32 => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Float32, $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::Float32Type, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Float64 => { - let $values = 
$crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Float64, $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::Float64Type, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Date32 => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Date32, $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::Date32Type, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Date64 => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Date64, $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::Date64Type, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::Time32SecondType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::Time32MillisecondType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::Time64MicrosecondType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::Time64NanosecondType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::TimestampSecondType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::TimestampMillisecondType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::TimestampMicrosecondType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _) => { - let $values = $crate::cast::as_primitive_array::< + 
$crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::TimestampNanosecondType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::IntervalYearMonthType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::IntervalDayTimeType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::IntervalMonthDayNanoType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Second) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Second), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::DurationSecondType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Millisecond) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Millisecond), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::DurationMillisecondType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Microsecond) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Microsecond), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::DurationMicrosecondType, - >($values); + >($values);)+ $e } - arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond) => { - let $values = $crate::cast::as_primitive_array::< + $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond), $($values),+) => { + $(let $values = $crate::cast::as_primitive_array::< $crate::types::DurationNanosecondType, - >($values); - $e - } - $($p => $fallback,)* - } - }; - - (($values1:ident, $values2:ident) => $e:block $($p:pat => $fallback:expr $(,)*)*) => { - match ($values1.data_type(), $values2.data_type()) { - (arrow_schema::DataType::Int8, arrow_schema::DataType::Int8) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::Int8Type, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::Int8Type, - >($values2); - $e - } - (arrow_schema::DataType::Int16, arrow_schema::DataType::Int16) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::Int16Type, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::Int16Type, - >($values2); - $e - } - 
(arrow_schema::DataType::Int32, arrow_schema::DataType::Int32) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::Int32Type, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::Int32Type, - >($values2); - $e - } - (arrow_schema::DataType::Int64, arrow_schema::DataType::Int64) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::Int64Type, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::Int64Type, - >($values2); - $e - } - (arrow_schema::DataType::UInt8, arrow_schema::DataType::UInt8) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::UInt8Type, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::UInt8Type, - >($values2); - $e - } - (arrow_schema::DataType::UInt16, arrow_schema::DataType::UInt16) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::UInt16Type, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::UInt16Type, - >($values2); - $e - } - (arrow_schema::DataType::UInt32, arrow_schema::DataType::UInt32) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::UInt32Type, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::UInt32Type, - >($values2); - $e - } - (arrow_schema::DataType::UInt64, arrow_schema::DataType::UInt64) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::UInt64Type, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::UInt64Type, - >($values2); - $e - } - (arrow_schema::DataType::Float32, arrow_schema::DataType::Float32) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::Float32Type, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::Float32Type, - >($values2); - $e - } - (arrow_schema::DataType::Float64, arrow_schema::DataType::Float64) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::Float64Type, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::Float64Type, - >($values2); - $e - } - (arrow_schema::DataType::Date32, arrow_schema::DataType::Date32) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::Date32Type, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::Date32Type, - >($values2); - $e - } - (arrow_schema::DataType::Date64, arrow_schema::DataType::Date64) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::Date64Type, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::Date64Type, - >($values2); + >($values);)+ $e } - (arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second), arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::Time32SecondType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::Time32SecondType, - >($values2); - $e - } - (arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond), arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::Time32MillisecondType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::Time32MillisecondType, - >($values2); - $e - } - (arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond), 
arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::Time64MicrosecondType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::Time64MicrosecondType, - >($values2); - $e - } - (arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond), arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::Time64NanosecondType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::Time64NanosecondType, - >($values2); - $e - } - (arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _), arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::TimestampSecondType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::TimestampSecondType, - >($values2); - $e - } - (arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _), arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::TimestampMillisecondType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::TimestampMillisecondType, - >($values2); - $e - } - (arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _), arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::TimestampMicrosecondType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::TimestampMicrosecondType, - >($values2); - $e - } - (arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _), arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::TimestampNanosecondType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::TimestampNanosecondType, - >($values2); - $e - } - (arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth), arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::IntervalYearMonthType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::IntervalYearMonthType, - >($values2); - $e - } - (arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime), arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::IntervalDayTimeType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::IntervalDayTimeType, - >($values2); - $e - } - (arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano), arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::IntervalMonthDayNanoType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::IntervalMonthDayNanoType, - >($values2); - $e - } - (arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Second), arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Second)) => { - let $values1 = $crate::cast::as_primitive_array::< - 
$crate::types::DurationSecondType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::DurationSecondType, - >($values2); - $e - } - (arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Millisecond), arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Millisecond)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::DurationMillisecondType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::DurationMillisecondType, - >($values2); - $e - } - (arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Microsecond), arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Microsecond)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::DurationMicrosecondType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::DurationMicrosecondType, - >($values2); - $e - } - (arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond), arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond)) => { - let $values1 = $crate::cast::as_primitive_array::< - $crate::types::DurationNanosecondType, - >($values1); - let $values2 = $crate::cast::as_primitive_array::< - $crate::types::DurationNanosecondType, - >($values2); - $e - } - $($p => $fallback,)* + $(($($p),+) => $fallback,)* } }; } From c3aac93454c67b7b1b2ee38cd33aa93c1a8e568e Mon Sep 17 00:00:00 2001 From: Yang Jiang Date: Mon, 10 Oct 2022 16:24:16 +0800 Subject: [PATCH 0120/1411] [feat] Add pub api for checking column index is sorted. (#2849) * [feat]Add pub api for checking column index is sorted. * export boundary_order * simplify the code --- parquet/src/file/page_index/index.rs | 27 +++++++++++++++++++++++++++ parquet/src/file/serialized_reader.rs | 16 ++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 062dc3966ead..7adf2c08a9fd 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -63,6 +63,33 @@ pub enum Index { FIXED_LEN_BYTE_ARRAY(ByteArrayIndex), } +impl Index { + /// Return min/max elements inside ColumnIndex are ordered or not. + pub fn is_sorted(&self) -> bool { + // 0:UNORDERED, 1:ASCENDING ,2:DESCENDING, + if let Some(order) = self.get_boundary_order() { + order.0 > (BoundaryOrder::UNORDERED.0) + } else { + false + } + } + + /// Get boundary_order of this page index. 
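// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): how the new public API above might
// be used once a column's `Index` has been read from the page index. The
// import paths and the ASCENDING/DESCENDING constants are assumed to be the
// thrift-generated ones re-exported by the parquet crate; obtaining the
// `Index` itself (e.g. via the page index readers) is out of scope here.
//
// use parquet::file::page_index::index::Index;
// use parquet::format::BoundaryOrder;
fn describe_column_index(index: &Index) -> &'static str {
    match index.get_boundary_order() {
        Some(order) if order.0 == BoundaryOrder::ASCENDING.0 => "ascending min/max",
        Some(order) if order.0 == BoundaryOrder::DESCENDING.0 => "descending min/max",
        Some(_) => "unordered min/max",
        None => "no column index for this column (Index::NONE)",
    }
}
// `Index::is_sorted` is shorthand for "a boundary order is present and it is
// not UNORDERED", so it is true exactly for the first two arms above.
// ---------------------------------------------------------------------------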
+ pub fn get_boundary_order(&self) -> Option { + match self { + Index::NONE => None, + Index::BOOLEAN(index) => Some(index.boundary_order), + Index::INT32(index) => Some(index.boundary_order), + Index::INT64(index) => Some(index.boundary_order), + Index::INT96(index) => Some(index.boundary_order), + Index::FLOAT(index) => Some(index.boundary_order), + Index::DOUBLE(index) => Some(index.boundary_order), + Index::BYTE_ARRAY(index) => Some(index.boundary_order), + Index::FIXED_LEN_BYTE_ARRAY(index) => Some(index.boundary_order), + } + } +} + /// An index of a column of [`Type`] physical representation #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct NativeIndex { diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index cd90b0d0b67a..6b416e34dc65 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1325,6 +1325,10 @@ mod tests { let row_group_metadata = metadata.row_group(0); //col0->id: INT32 UNCOMPRESSED DO:0 FPO:4 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 7299, num_nulls: 0] + assert!(!&page_indexes[0][0].is_sorted()); + let boundary_order = &page_indexes[0][0].get_boundary_order(); + assert!(boundary_order.is_some()); + matches!(boundary_order.unwrap(), BoundaryOrder::UNORDERED); if let Index::INT32(index) = &page_indexes[0][0] { check_native_page_index( index, @@ -1337,6 +1341,7 @@ mod tests { unreachable!() }; //col1->bool_col:BOOLEAN UNCOMPRESSED DO:0 FPO:37329 SZ:3022/3022/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: false, max: true, num_nulls: 0] + assert!(&page_indexes[0][1].is_sorted()); if let Index::BOOLEAN(index) = &page_indexes[0][1] { assert_eq!(index.indexes.len(), 82); assert_eq!(row_group_offset_indexes[1].len(), 82); @@ -1344,6 +1349,7 @@ mod tests { unreachable!() }; //col2->tinyint_col: INT32 UNCOMPRESSED DO:0 FPO:40351 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] + assert!(&page_indexes[0][2].is_sorted()); if let Index::INT32(index) = &page_indexes[0][2] { check_native_page_index( index, @@ -1356,6 +1362,7 @@ mod tests { unreachable!() }; //col4->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] + assert!(&page_indexes[0][3].is_sorted()); if let Index::INT32(index) = &page_indexes[0][3] { check_native_page_index( index, @@ -1368,6 +1375,7 @@ mod tests { unreachable!() }; //col5->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] + assert!(&page_indexes[0][4].is_sorted()); if let Index::INT32(index) = &page_indexes[0][4] { check_native_page_index( index, @@ -1380,6 +1388,7 @@ mod tests { unreachable!() }; //col6->bigint_col: INT64 UNCOMPRESSED DO:0 FPO:152326 SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 90, num_nulls: 0] + assert!(!&page_indexes[0][5].is_sorted()); if let Index::INT64(index) = &page_indexes[0][5] { check_native_page_index( index, @@ -1392,6 +1401,7 @@ mod tests { unreachable!() }; //col7->float_col: FLOAT UNCOMPRESSED DO:0 FPO:223924 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max: 9.9, num_nulls: 0] + assert!(&page_indexes[0][6].is_sorted()); if let Index::FLOAT(index) = &page_indexes[0][6] { check_native_page_index( index, @@ -1404,6 +1414,7 @@ mod tests { unreachable!() }; //col8->double_col: DOUBLE UNCOMPRESSED DO:0 FPO:261249 SZ:71598/71598/1.00 VC:7300 
ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max: 90.89999999999999, num_nulls: 0] + assert!(!&page_indexes[0][7].is_sorted()); if let Index::DOUBLE(index) = &page_indexes[0][7] { check_native_page_index( index, @@ -1416,6 +1427,7 @@ mod tests { unreachable!() }; //col9->date_string_col: BINARY UNCOMPRESSED DO:0 FPO:332847 SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 01/01/09, max: 12/31/10, num_nulls: 0] + assert!(!&page_indexes[0][8].is_sorted()); if let Index::BYTE_ARRAY(index) = &page_indexes[0][8] { check_bytes_page_index( index, @@ -1428,6 +1440,7 @@ mod tests { unreachable!() }; //col10->string_col: BINARY UNCOMPRESSED DO:0 FPO:444795 SZ:45298/45298/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] + assert!(&page_indexes[0][9].is_sorted()); if let Index::BYTE_ARRAY(index) = &page_indexes[0][9] { check_bytes_page_index( index, @@ -1441,12 +1454,14 @@ mod tests { }; //col11->timestamp_col: INT96 UNCOMPRESSED DO:0 FPO:490093 SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 0, min/max not defined] //Notice: min_max values for each page for this col not exits. + assert!(!&page_indexes[0][10].is_sorted()); if let Index::NONE = &page_indexes[0][10] { assert_eq!(row_group_offset_indexes[10].len(), 974); } else { unreachable!() }; //col12->year: INT32 UNCOMPRESSED DO:0 FPO:602041 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 2009, max: 2010, num_nulls: 0] + assert!(&page_indexes[0][11].is_sorted()); if let Index::INT32(index) = &page_indexes[0][11] { check_native_page_index( index, @@ -1459,6 +1474,7 @@ mod tests { unreachable!() }; //col13->month: INT32 UNCOMPRESSED DO:0 FPO:639366 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 1, max: 12, num_nulls: 0] + assert!(!&page_indexes[0][12].is_sorted()); if let Index::INT32(index) = &page_indexes[0][12] { check_native_page_index( index, From 0268bba4c01c2b83986c023258ad4405c29cabff Mon Sep 17 00:00:00 2001 From: Dan Harris <1327726+thinkharderdev@users.noreply.github.com> Date: Mon, 10 Oct 2022 04:25:30 -0400 Subject: [PATCH 0121/1411] Fix page size on dictionary fallback (#2854) * Fix page size on dictionary fallback * Make test deterministic * Comments and improve test --- parquet/src/arrow/arrow_writer/byte_array.rs | 5 +- parquet/src/arrow/arrow_writer/mod.rs | 66 ++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index a25bd8d5c505..9ea3767a28ed 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -551,7 +551,10 @@ where match &mut encoder.dict_encoder { Some(dict_encoder) => dict_encoder.encode(values, indices), - None => encoder.fallback.encode(values, indices), + None => { + encoder.num_values += indices.len(); + encoder.fallback.encode(values, indices) + } } } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 2c3d498bcca8..b5c0b50127d4 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -624,6 +624,7 @@ mod tests { use crate::basic::Encoding; use crate::file::metadata::ParquetMetaData; + use crate::file::page_index::index_reader::read_pages_locations; use crate::file::properties::WriterVersion; use crate::file::{ reader::{FileReader, SerializedFileReader}, @@ -1108,6 +1109,71 @@ mod tests { roundtrip(batch, Some(SMALL_SIZE / 2)); } + #[test] + fn arrow_writer_page_size() { + let 
schema = + Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)])); + + let mut builder = StringBuilder::with_capacity(100, 329 * 10_000); + + // Generate an array of 10 unique 10 character string + for i in 0..10 { + let value = i + .to_string() + .repeat(10) + .chars() + .take(10) + .collect::(); + + builder.append_value(value); + } + + let array = Arc::new(builder.finish()); + + let batch = RecordBatch::try_new(schema, vec![array]).unwrap(); + + let file = tempfile::tempfile().unwrap(); + + // Set everything very low so we fallback to PLAIN encoding after the first row + let props = WriterProperties::builder() + .set_data_pagesize_limit(1) + .set_dictionary_pagesize_limit(1) + .set_write_batch_size(1) + .build(); + + let mut writer = + ArrowWriter::try_new(file.try_clone().unwrap(), batch.schema(), Some(props)) + .expect("Unable to write file"); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + let reader = SerializedFileReader::new(file.try_clone().unwrap()).unwrap(); + + let column = reader.metadata().row_group(0).columns(); + + assert_eq!(column.len(), 1); + + // We should write one row before falling back to PLAIN encoding so there should still be a + // dictionary page. + assert!( + column[0].dictionary_page_offset().is_some(), + "Expected a dictionary page" + ); + + let page_locations = read_pages_locations(&file, column).unwrap(); + + let offset_index = page_locations[0].clone(); + + // We should fallback to PLAIN encoding after the first row and our max page size is 1 bytes + // so we expect one dictionary encoded page and then a page per row thereafter. + assert_eq!( + offset_index.len(), + 10, + "Expected 9 pages but got {:#?}", + offset_index + ); + } + const SMALL_SIZE: usize = 7; fn roundtrip( From 3bb7f3882c4866087b6ec43a31f0b6430f5103d1 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 12 Oct 2022 08:35:54 -0700 Subject: [PATCH 0122/1411] Fix some clippy errors (#2862) --- arrow-array/src/array/primitive_array.rs | 2 +- arrow-buffer/src/buffer/mutable.rs | 2 +- arrow-flight/src/lib.rs | 3 +++ arrow/src/csv/writer.rs | 1 + arrow/src/lib.rs | 2 +- arrow/src/pyarrow.rs | 2 +- arrow/src/util/test_util.rs | 4 ++-- integration-testing/src/util/mod.rs | 2 +- parquet/src/arrow/arrow_writer/levels.rs | 2 +- parquet/src/encodings/rle.rs | 14 +++++++------- parquet/src/format.rs | 4 ++-- parquet_derive/src/parquet_field.rs | 12 ++++++------ 12 files changed, 27 insertions(+), 23 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index e362f0d7e84d..928135463cca 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -786,7 +786,7 @@ impl PrimitiveArray { } impl PrimitiveArray { - /// Construct a timestamp array from a vec of Option values and an optional timezone + /// Construct a timestamp array from a vec of `Option` values and an optional timezone pub fn from_opt_vec(data: Vec>, timezone: Option) -> Self { // TODO: duplicated from def_numeric_from_vec! 
macro, it looks possible to convert to generic let data_len = data.len(); diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 80644b63d113..bd139466ae92 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -365,7 +365,7 @@ impl MutableBuffer { /// Extends the buffer with a new item, without checking for sufficient capacity /// # Safety - /// Caller must ensure that the capacity()-len()>=size_of() + /// Caller must ensure that the capacity()-len()>=`size_of`() #[inline] pub unsafe fn push_unchecked(&mut self, item: T) { let additional = std::mem::size_of::(); diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 54f4d24b65ae..054981707085 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#![allow(rustdoc::invalid_html_tags)] + use arrow::datatypes::Schema; use arrow::error::{ArrowError, Result as ArrowResult}; use arrow::ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; @@ -27,6 +29,7 @@ use std::{ }; #[allow(clippy::derive_partial_eq_without_eq)] + mod gen { include!("arrow.flight.protocol.rs"); } diff --git a/arrow/src/csv/writer.rs b/arrow/src/csv/writer.rs index 1b377c38b370..eb7a8fd5be88 100644 --- a/arrow/src/csv/writer.rs +++ b/arrow/src/csv/writer.rs @@ -103,6 +103,7 @@ pub struct Writer { /// The datetime format for datetime arrays datetime_format: String, /// The timestamp format for timestamp arrays + #[allow(dead_code)] timestamp_format: String, /// The timestamp format for timestamp (with timezone) arrays #[allow(dead_code)] diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 8967efce50b5..324803cb1a90 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -248,7 +248,7 @@ #![deny(clippy::redundant_clone)] #![warn(missing_debug_implementations)] - +#![allow(rustdoc::invalid_html_tags)] pub use arrow_array::{downcast_dictionary_array, downcast_primitive_array}; pub use arrow_buffer::{alloc, buffer}; diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index d8d5eee532e7..7c365a4344a5 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -219,7 +219,7 @@ impl PyArrowConvert for ArrowArrayStreamReader { unsafe { ArrowArrayStreamReader::from_raw(stream_ptr).unwrap() }; unsafe { - Box::from_raw(stream_ptr); + drop(Box::from_raw(stream_ptr)); } Ok(stream_reader) diff --git a/arrow/src/util/test_util.rs b/arrow/src/util/test_util.rs index cae148a53d5c..836bda6f98ca 100644 --- a/arrow/src/util/test_util.rs +++ b/arrow/src/util/test_util.rs @@ -167,8 +167,8 @@ pub struct BadIterator { } impl BadIterator { - /// Create a new iterator for items, but that reports to - /// produce items. Must provide at least 1 item. + /// Create a new iterator for `` items, but that reports to + /// produce `` items. Must provide at least 1 item. 
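// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): the intended use of this test
// helper. The numbers are illustrative; the point is only that the iterator
// advertises more items than it will actually yield.
fn exercise_bad_iterator() {
    // Claims to produce 10 items but is limited to 3.
    let lying = BadIterator::new(3, 10, vec![1_i32, 2, 3]);
    // Code under test that pre-sizes buffers from the advertised length must
    // still bound itself by what the iterator really yields.
    assert!(lying.count() <= 3);
}
// ---------------------------------------------------------------------------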
pub fn new(limit: usize, claimed: usize, items: Vec) -> Self { assert!(!items.is_empty()); Self { diff --git a/integration-testing/src/util/mod.rs b/integration-testing/src/util/mod.rs index f9ddc0e6f4b7..c0eb80a35711 100644 --- a/integration-testing/src/util/mod.rs +++ b/integration-testing/src/util/mod.rs @@ -978,7 +978,7 @@ pub fn dictionary_array_from_json( } } -/// A helper to create a null buffer from a Vec +/// A helper to create a null buffer from a `Vec` fn create_null_buf(json_col: &ArrowJsonColumn) -> Buffer { let num_bytes = bit_util::ceil(json_col.count, 8); let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index f5e26a7281b7..5736f05fdcfe 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -213,7 +213,7 @@ impl LevelInfoBuilder { /// Write `range` elements from ListArray `array` /// - /// Note: MapArrays are ListArray under the hood and so are dispatched to this method + /// Note: MapArrays are `ListArray` under the hood and so are dispatched to this method fn write_list( &mut self, offsets: &[O], diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 39a0aa4d03da..93dd4ab565ca 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -27,17 +27,17 @@ use crate::util::{ /// The grammar for this encoding looks like the following (copied verbatim /// from ): /// -/// rle-bit-packed-hybrid: -/// length := length of the in bytes stored as 4 bytes little endian -/// encoded-data := * -/// run := | -/// bit-packed-run := -/// bit-packed-header := varint-encode( << 1 | 1) +/// rle-bit-packed-hybrid: `` `` +/// length := length of the `` in bytes stored as 4 bytes little endian +/// encoded-data := ``* +/// run := `` | `` +/// bit-packed-run := `` `` +/// bit-packed-header := varint-encode(`` << 1 | 1) /// we always bit-pack a multiple of 8 values at a time, so we only store the number of /// values / 8 /// bit-pack-count := (number of values in this run) / 8 /// bit-packed-values := *see 1 below* -/// rle-run := +/// rle-run := `` `` /// rle-header := varint-encode( (number of times repeated) << 1) /// repeated-value := value that is repeated, using a fixed-width of /// round-up-to-next-byte(bit-width) diff --git a/parquet/src/format.rs b/parquet/src/format.rs index 00a89a4c7e85..6fb2e32ebcfc 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -4452,7 +4452,7 @@ impl OffsetIndex { // /// Description for ColumnIndex. -/// Each \[i\] refers to the page at OffsetIndex.page_locations\[i\] +/// Each ``\[i\] refers to the page at OffsetIndex.page_locations\[i\] #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct ColumnIndex { /// A list of Boolean values to determine the validity of the corresponding @@ -4905,7 +4905,7 @@ pub struct FileMetaData { /// Optional key/value metadata * pub key_value_metadata: Option>, /// String for application that wrote this file. This should be in the format - /// version (build ). + /// `` version `` (build ``). /// e.g. 
impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) /// pub created_by: Option, diff --git a/parquet_derive/src/parquet_field.rs b/parquet_derive/src/parquet_field.rs index 0642e23327f7..82e3b5112fe0 100644 --- a/parquet_derive/src/parquet_field.rs +++ b/parquet_derive/src/parquet_field.rs @@ -68,7 +68,7 @@ impl Field { /// /// struct Record { /// a_bool: bool, - /// maybe_a_bool: Option + /// maybe_a_bool: `Option` /// } /// /// but not @@ -355,9 +355,9 @@ impl Type { /// Helper to simplify a nested field definition to its leaf type /// /// Ex: - /// Option<&String> => Type::TypePath(String) - /// &Option => Type::TypePath(i32) - /// Vec> => Type::Vec(u8) + /// `Option<&String>` => Type::TypePath(String) + /// `&Option` => Type::TypePath(i32) + /// `Vec>` => Type::Vec(u8) /// /// Useful in determining the physical type of a field and the /// definition levels. @@ -404,7 +404,7 @@ impl Type { /// /// Ex: /// std::string::String => String - /// Vec => Vec + /// `Vec` => `Vec` /// chrono::NaiveDateTime => NaiveDateTime /// /// Does run the risk of mis-identifying a type if import @@ -427,7 +427,7 @@ impl Type { /// /// Ex: /// [u8; 10] => FIXED_LEN_BYTE_ARRAY - /// Vec => BYTE_ARRAY + /// `Vec` => BYTE_ARRAY /// String => BYTE_ARRAY /// i32 => INT32 fn physical_type(&self) -> parquet::basic::Type { From f8254e7f01ad8ccb4b86c33ad429a04e9ffc6737 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 12 Oct 2022 16:30:13 -0700 Subject: [PATCH 0123/1411] Replace complicated temporal macro with generic functions (#2850) * Remove complicated macro * Fix clippy * Add doc. --- arrow/src/compute/kernels/cast.rs | 117 ++++++- arrow/src/compute/kernels/temporal.rs | 472 ++++++++++---------------- 2 files changed, 280 insertions(+), 309 deletions(-) diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 912ea28830eb..b573c65d026f 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -37,14 +37,13 @@ use chrono::format::strftime::StrftimeItems; use chrono::format::{parse, Parsed}; -use chrono::Timelike; +use chrono::{NaiveDateTime, Timelike}; use std::ops::{Div, Mul}; use std::str; use std::sync::Arc; use crate::buffer::MutableBuffer; use crate::compute::kernels::cast_utils::string_to_timestamp_nanos; -use crate::compute::kernels::temporal::extract_component_from_array; use crate::compute::kernels::temporal::return_compute_error_with; use crate::compute::{divide_scalar, multiply_scalar}; use crate::compute::{try_unary, using_chrono_tz_and_utc_naive_date_time}; @@ -1638,6 +1637,98 @@ where unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } } +fn as_time_with_string_op< + A: ArrayAccessor, + OffsetSize, + T: ArrowTemporalType, + F, +>( + iter: ArrayIter, + mut builder: GenericStringBuilder, + op: F, +) -> ArrayRef +where + OffsetSize: OffsetSizeTrait, + F: Fn(NaiveDateTime) -> String, + i64: From, +{ + iter.into_iter().for_each(|value| { + if let Some(value) = value { + match as_datetime::(>::from(value)) { + Some(dt) => builder.append_value(op(dt)), + None => builder.append_null(), + } + } else { + builder.append_null(); + } + }); + + Arc::new(builder.finish()) +} + +fn extract_component_from_datatime_array< + A: ArrayAccessor, + OffsetSize, + T: ArrowTemporalType, + F, +>( + iter: ArrayIter, + mut builder: GenericStringBuilder, + tz: &str, + mut parsed: Parsed, + op: F, +) -> Result +where + OffsetSize: OffsetSizeTrait, + F: Fn(NaiveDateTime) -> String, + i64: From, +{ + if (tz.starts_with('+') || 
tz.starts_with('-')) && !tz.contains(':') { + return_compute_error_with!( + "Invalid timezone", + "Expected format [+-]XX:XX".to_string() + ) + } else { + let tz_parse_result = parse(&mut parsed, tz, StrftimeItems::new("%z")); + let fixed_offset_from_parsed = match tz_parse_result { + Ok(_) => match parsed.to_fixed_offset() { + Ok(fo) => Some(fo), + err => return_compute_error_with!("Invalid timezone", err), + }, + _ => None, + }; + + for value in iter { + if let Some(value) = value { + match as_datetime::(>::from(value)) { + Some(utc) => { + let fixed_offset = match fixed_offset_from_parsed { + Some(fo) => fo, + None => { + match using_chrono_tz_and_utc_naive_date_time(tz, utc) { + Some(fo) => fo, + err => return_compute_error_with!( + "Unable to parse timezone", + err + ), + } + } + }; + builder.append_value(op(utc + fixed_offset)); + } + err => return_compute_error_with!( + "Unable to read value as datetime", + err + ), + } + } else { + builder.append_null(); + } + } + } + Ok(Arc::new(builder.finish())) +} + /// Cast timestamp types to Utf8/LargeUtf8 fn cast_timestamp_to_string( array: &ArrayRef, @@ -1650,38 +1741,30 @@ where { let array = array.as_any().downcast_ref::>().unwrap(); - let mut builder = GenericStringBuilder::::new(); + let builder = GenericStringBuilder::::new(); if let Some(tz) = tz { - let mut scratch = Parsed::new(); + let scratch = Parsed::new(); // The macro calls `as_datetime` on timestamp values of the array. // After applying timezone offset on the datatime, calling `to_string` to get // the strings. let iter = ArrayIter::new(array); - extract_component_from_array!( + extract_component_from_datatime_array::<_, OffsetSize, T, _>( iter, builder, - to_string, - |value, tz| as_datetime::(>::from(value)) - .map(|datetime| datetime + tz), tz, scratch, - |value| as_datetime::(>::from(value)), - |h| h + |t| t.to_string(), ) } else { // No timezone available. Calling `to_string` on the datatime value simply. let iter = ArrayIter::new(array); - extract_component_from_array!( + Ok(as_time_with_string_op::<_, OffsetSize, T, _>( iter, builder, - to_string, - |value| as_datetime::(>::from(value)), - |h| h - ) + |t| t.to_string(), + )) } - - Ok(Arc::new(builder.finish()) as ArrayRef) } /// Cast date32 types to Utf8/LargeUtf8 diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index e61fec999add..220b7dadcc56 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -17,7 +17,7 @@ //! Defines temporal kernels for time and date related functions. -use chrono::{Datelike, Timelike}; +use chrono::{Datelike, NaiveDateTime, NaiveTime, Timelike}; use crate::array::*; use crate::datatypes::*; @@ -28,83 +28,124 @@ use chrono::format::strftime::StrftimeItems; use chrono::format::{parse, Parsed}; use chrono::FixedOffset; -macro_rules! extract_component_from_array { - ($iter:ident, $builder:ident, $extract_fn:ident, $using:expr, $convert:expr) => { - $iter.into_iter().for_each(|value| { - if let Some(value) = value { - match $using(value) { - Some(dt) => $builder.append_value($convert(dt.$extract_fn())), - None => $builder.append_null(), - } - } else { - $builder.append_null(); +/// This function takes an `ArrayIter` of input array and an extractor `op` which takes +/// an input `NaiveTime` and returns time component (e.g. hour) as `i32` value. +/// The extracted values are built by the given `builder` to be an `Int32Array`. 
+fn as_time_with_op, T: ArrowTemporalType, F>( + iter: ArrayIter, + mut builder: PrimitiveBuilder, + op: F, +) -> Int32Array +where + F: Fn(NaiveTime) -> i32, + i64: From, +{ + iter.into_iter().for_each(|value| { + if let Some(value) = value { + match as_time::(i64::from(value)) { + Some(dt) => builder.append_value(op(dt)), + None => builder.append_null(), } - }) - }; - ($iter:ident, $builder:ident, $extract_fn1:ident, $extract_fn2:ident, $using:expr, $convert:expr) => { - $iter.into_iter().for_each(|value| { - if let Some(value) = value { - match $using(value) { - Some(dt) => { - $builder.append_value($convert(dt.$extract_fn1().$extract_fn2())); - } - None => $builder.append_null(), - } - } else { - $builder.append_null(); + } else { + builder.append_null(); + } + }); + + builder.finish() +} + +/// This function takes an `ArrayIter` of input array and an extractor `op` which takes +/// an input `NaiveDateTime` and returns data time component (e.g. hour) as `i32` value. +/// The extracted values are built by the given `builder` to be an `Int32Array`. +fn as_datetime_with_op, T: ArrowTemporalType, F>( + iter: ArrayIter, + mut builder: PrimitiveBuilder, + op: F, +) -> Int32Array +where + F: Fn(NaiveDateTime) -> i32, + i64: From, +{ + iter.into_iter().for_each(|value| { + if let Some(value) = value { + match as_datetime::(i64::from(value)) { + Some(dt) => builder.append_value(op(dt)), + None => builder.append_null(), } - }) - }; - ($iter:ident, $builder:ident, $extract_fn:ident, $using:expr, $tz:ident, $parsed:ident, $value_as_datetime:expr, $convert:expr) => { - if ($tz.starts_with('+') || $tz.starts_with('-')) && !$tz.contains(':') { - return_compute_error_with!( - "Invalid timezone", - "Expected format [+-]XX:XX".to_string() - ) } else { - let tz_parse_result = parse(&mut $parsed, &$tz, StrftimeItems::new("%z")); - let fixed_offset_from_parsed = match tz_parse_result { - Ok(_) => match $parsed.to_fixed_offset() { - Ok(fo) => Some(fo), - err => return_compute_error_with!("Invalid timezone", err), - }, - _ => None, - }; - - for value in $iter.into_iter() { - if let Some(value) = value { - match $value_as_datetime(value) { - Some(utc) => { - let fixed_offset = match fixed_offset_from_parsed { - Some(fo) => fo, - None => match using_chrono_tz_and_utc_naive_date_time( - &$tz, utc, - ) { + builder.append_null(); + } + }); + + builder.finish() +} + +/// This function extracts date time component (e.g. hour) from an array of datatime. +/// `iter` is the `ArrayIter` of input datatime array. `builder` is used to build the +/// returned `Int32Array` containing the extracted components. `tz` is timezone string +/// which will be added to datetime values in the input array. `parsed` is a `Parsed` +/// object used to parse timezone string. `op` is the extractor closure which takes +/// data time object of `NaiveDateTime` type and returns `i32` value of extracted +/// component. 
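// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): the closure-based helpers above are
// internal plumbing; the public temporal kernels keep their existing shape. A
// minimal, assumed round trip through the `hour` kernel:
//
// use arrow::array::TimestampSecondArray;
// use arrow::compute::kernels::temporal::hour;
fn hour_of_timestamps() {
    // 10:00:00 UTC expressed in seconds since the epoch, plus a null slot.
    let input = TimestampSecondArray::from(vec![Some(10 * 60 * 60), None]);
    let hours = hour(&input).unwrap();
    assert_eq!(hours.value(0), 10);
    assert!(hours.is_null(1)); // nulls are passed through unchanged
}
// ---------------------------------------------------------------------------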
+fn extract_component_from_datatime_array< + A: ArrayAccessor, + T: ArrowTemporalType, + F, +>( + iter: ArrayIter, + mut builder: PrimitiveBuilder, + tz: &str, + mut parsed: Parsed, + op: F, +) -> Result +where + F: Fn(NaiveDateTime) -> i32, + i64: From, +{ + if (tz.starts_with('+') || tz.starts_with('-')) && !tz.contains(':') { + return_compute_error_with!( + "Invalid timezone", + "Expected format [+-]XX:XX".to_string() + ) + } else { + let tz_parse_result = parse(&mut parsed, tz, StrftimeItems::new("%z")); + let fixed_offset_from_parsed = match tz_parse_result { + Ok(_) => match parsed.to_fixed_offset() { + Ok(fo) => Some(fo), + err => return_compute_error_with!("Invalid timezone", err), + }, + _ => None, + }; + + for value in iter { + if let Some(value) = value { + match as_datetime::(i64::from(value)) { + Some(utc) => { + let fixed_offset = match fixed_offset_from_parsed { + Some(fo) => fo, + None => { + match using_chrono_tz_and_utc_naive_date_time(tz, utc) { Some(fo) => fo, err => return_compute_error_with!( "Unable to parse timezone", err ), - }, - }; - match $using(value, fixed_offset) { - Some(dt) => { - $builder.append_value($convert(dt.$extract_fn())); } - None => $builder.append_null(), } - } - err => return_compute_error_with!( - "Unable to read value as datetime", - err - ), + }; + builder.append_value(op(utc + fixed_offset)); } - } else { - $builder.append_null(); + err => return_compute_error_with!( + "Unable to read value as datetime", + err + ), } + } else { + builder.append_null(); } } - }; + } + Ok(builder.finish()) } macro_rules! return_compute_error_with { @@ -113,7 +154,6 @@ macro_rules! return_compute_error_with { }; } -pub(crate) use extract_component_from_array; pub(crate) use return_compute_error_with; // Internal trait, which is used for mapping values from DateLike structures @@ -206,47 +246,25 @@ where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { - let mut b = Int32Builder::with_capacity(array.len()); + let b = Int32Builder::with_capacity(array.len()); match dt { DataType::Time32(_) | DataType::Time64(_) => { let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - hour, - |value| as_time::(i64::from(value)), - |h| h as i32 - ); + Ok(as_time_with_op::(iter, b, |t| t.hour() as i32)) } DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - hour, - |value| as_datetime::(i64::from(value)), - |h| h as i32 - ) + Ok(as_datetime_with_op::(iter, b, |t| t.hour() as i32)) } DataType::Timestamp(_, Some(tz)) => { - let mut scratch = Parsed::new(); + let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - hour, - |value, tz| as_datetime::(i64::from(value)) - .map(|datetime| datetime + tz), - tz, - scratch, - |value| as_datetime::(i64::from(value)), - |h| h as i32 - ) + extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + t.hour() as i32 + }) } _ => return_compute_error_with!("hour does not support", array.data_type()), } - - Ok(b.finish()) } /// Extracts the years of a given temporal primitive array as an array of integers @@ -281,22 +299,14 @@ where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { - let mut b = Int32Builder::with_capacity(array.len()); match dt { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _) => { + let b = Int32Builder::with_capacity(array.len()); let iter = ArrayIter::new(array); - 
extract_component_from_array!( - iter, - b, - year, - |value| as_datetime::(i64::from(value)), - |h| h as i32 - ) + Ok(as_datetime_with_op::(iter, b, |t| t.year())) } _t => return_compute_error_with!("year does not support", array.data_type()), } - - Ok(b.finish()) } /// Extracts the quarter of a given temporal primitive array as an array of integers within @@ -335,37 +345,23 @@ where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { - let mut b = Int32Builder::with_capacity(array.len()); + let b = Int32Builder::with_capacity(array.len()); match dt { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - quarter, - |value| as_datetime::(i64::from(value)), - |h| h as i32 - ) + Ok(as_datetime_with_op::(iter, b, |t| { + t.quarter() as i32 + })) } DataType::Timestamp(_, Some(tz)) => { - let mut scratch = Parsed::new(); + let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - quarter, - |value, tz| as_datetime::(i64::from(value)) - .map(|datetime| datetime + tz), - tz, - scratch, - |value| as_datetime::(i64::from(value)), - |h| h as i32 - ) + extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + t.quarter() as i32 + }) } _ => return_compute_error_with!("quarter does not support", array.data_type()), } - - Ok(b.finish()) } /// Extracts the month of a given temporal primitive array as an array of integers within @@ -403,37 +399,23 @@ where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { - let mut b = Int32Builder::with_capacity(array.len()); + let b = Int32Builder::with_capacity(array.len()); match dt { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - month, - |value| as_datetime::(i64::from(value)), - |h| h as i32 - ) + Ok(as_datetime_with_op::(iter, b, |t| { + t.month() as i32 + })) } DataType::Timestamp(_, Some(tz)) => { - let mut scratch = Parsed::new(); + let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - month, - |value, tz| as_datetime::(i64::from(value)) - .map(|datetime| datetime + tz), - tz, - scratch, - |value| as_datetime::(i64::from(value)), - |h| h as i32 - ) + extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + t.month() as i32 + }) } _ => return_compute_error_with!("month does not support", array.data_type()), } - - Ok(b.finish()) } /// Extracts the day of week of a given temporal primitive array as an array of @@ -485,37 +467,23 @@ where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { - let mut b = Int32Builder::with_capacity(array.len()); + let b = Int32Builder::with_capacity(array.len()); match dt { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - num_days_from_monday, - |value| { as_datetime::(i64::from(value)) }, - |h| h as i32 - ) + Ok(as_datetime_with_op::(iter, b, |t| { + t.num_days_from_monday() + })) } DataType::Timestamp(_, Some(tz)) => { - let mut scratch = Parsed::new(); + let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - num_days_from_monday, - |value, tz| as_datetime::(i64::from(value)) - .map(|datetime| datetime + tz), - tz, - scratch, - |value| as_datetime::(i64::from(value)), - |h| h as i32 - ) + 
extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + t.num_days_from_monday() + }) } _ => return_compute_error_with!("weekday does not support", array.data_type()), } - - Ok(b.finish()) } /// Extracts the day of week of a given temporal primitive array as an array of @@ -567,40 +535,26 @@ where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { - let mut b = Int32Builder::with_capacity(array.len()); + let b = Int32Builder::with_capacity(array.len()); match dt { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - num_days_from_sunday, - |value| { as_datetime::(i64::from(value)) }, - |h| h as i32 - ) + Ok(as_datetime_with_op::(iter, b, |t| { + t.num_days_from_sunday() + })) } DataType::Timestamp(_, Some(tz)) => { - let mut scratch = Parsed::new(); + let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - num_days_from_sunday, - |value, tz| as_datetime::(i64::from(value)) - .map(|datetime| datetime + tz), - tz, - scratch, - |value| as_datetime::(i64::from(value)), - |h| h as i32 - ) + extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + t.num_days_from_sunday() + }) } _ => return_compute_error_with!( "num_days_from_sunday does not support", array.data_type() ), } - - Ok(b.finish()) } /// Extracts the day of a given temporal primitive array as an array of integers @@ -635,37 +589,21 @@ where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { - let mut b = Int32Builder::with_capacity(array.len()); + let b = Int32Builder::with_capacity(array.len()); match dt { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - day, - |value| { as_datetime::(i64::from(value)) }, - |h| h as i32 - ) + Ok(as_datetime_with_op::(iter, b, |t| t.day() as i32)) } DataType::Timestamp(_, Some(ref tz)) => { - let mut scratch = Parsed::new(); + let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - day, - |value, tz| as_datetime::(i64::from(value)) - .map(|datetime| datetime + tz), - tz, - scratch, - |value| as_datetime::(i64::from(value)), - |h| h as i32 - ) + extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + t.day() as i32 + }) } _ => return_compute_error_with!("day does not support", array.data_type()), } - - Ok(b.finish()) } /// Extracts the day of year of a given temporal primitive array as an array of integers @@ -704,37 +642,23 @@ where T::Native: ArrowNativeType, i64: std::convert::From, { - let mut b = Int32Builder::with_capacity(array.len()); + let b = Int32Builder::with_capacity(array.len()); match dt { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - ordinal, - |value| { as_datetime::(i64::from(value)) }, - |h| h as i32 - ) + Ok(as_datetime_with_op::(iter, b, |t| { + t.ordinal() as i32 + })) } DataType::Timestamp(_, Some(ref tz)) => { - let mut scratch = Parsed::new(); + let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - ordinal, - |value, tz| as_datetime::(i64::from(value)) - .map(|datetime| datetime + tz), - tz, - scratch, - |value| as_datetime::(i64::from(value)), - |h| h as i32 - ) + extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { 
+ t.ordinal() as i32 + }) } _ => return_compute_error_with!("doy does not support", array.data_type()), } - - Ok(b.finish()) } /// Extracts the minutes of a given temporal primitive array as an array of integers @@ -771,37 +695,23 @@ where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { - let mut b = Int32Builder::with_capacity(array.len()); + let b = Int32Builder::with_capacity(array.len()); match dt { DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - minute, - |value| { as_datetime::(i64::from(value)) }, - |h| h as i32 - ) + Ok(as_datetime_with_op::(iter, b, |t| { + t.minute() as i32 + })) } DataType::Timestamp(_, Some(tz)) => { - let mut scratch = Parsed::new(); + let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - minute, - |value, tz| as_datetime::(i64::from(value)) - .map(|datetime| datetime + tz), - tz, - scratch, - |value| as_datetime::(i64::from(value)), - |h| h as i32 - ) + extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + t.minute() as i32 + }) } _ => return_compute_error_with!("minute does not support", array.data_type()), } - - Ok(b.finish()) } /// Extracts the week of a given temporal primitive array as an array of integers @@ -836,24 +746,16 @@ where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { - let mut b = Int32Builder::with_capacity(array.len()); - match dt { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { + let b = Int32Builder::with_capacity(array.len()); let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - iso_week, - week, - |value| { as_datetime::(i64::from(value)) }, - |h| h as i32 - ) + Ok(as_datetime_with_op::(iter, b, |t| { + t.iso_week().week() as i32 + })) } _ => return_compute_error_with!("week does not support", array.data_type()), } - - Ok(b.finish()) } /// Extracts the seconds of a given temporal primitive array as an array of integers @@ -890,37 +792,23 @@ where T: ArrowTemporalType + ArrowNumericType, i64: std::convert::From, { - let mut b = Int32Builder::with_capacity(array.len()); + let b = Int32Builder::with_capacity(array.len()); match dt { DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - second, - |value| { as_datetime::(i64::from(value)) }, - |h| h as i32 - ) + Ok(as_datetime_with_op::(iter, b, |t| { + t.second() as i32 + })) } DataType::Timestamp(_, Some(tz)) => { - let mut scratch = Parsed::new(); + let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_array!( - iter, - b, - second, - |value, tz| as_datetime::(i64::from(value)) - .map(|datetime| datetime + tz), - tz, - scratch, - |value| as_datetime::(i64::from(value)), - |h| h as i32 - ) + extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + t.second() as i32 + }) } _ => return_compute_error_with!("second does not support", array.data_type()), } - - Ok(b.finish()) } #[cfg(test)] From f4ee8b9acbd2ad3110dfc1bf3cb8b93bd876adb5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 13 Oct 2022 06:11:27 +0100 Subject: [PATCH 0124/1411] Handle empty offsets buffer (#1824) (#2836) * Handle empty offsets buffer (#1824) * Review feedback --- arrow-array/src/array/binary_array.rs | 30 +++++++++++++++++-- arrow-array/src/array/list_array.rs | 42 
+++++++++++++++++++++++++-- arrow-array/src/array/string_array.rs | 31 ++++++++++++++++++-- 3 files changed, 96 insertions(+), 7 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index cb168daf0720..851fb60c0787 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -17,7 +17,10 @@ use crate::iterator::GenericBinaryIter; use crate::raw_pointer::RawPtrBox; -use crate::{print_long_array, Array, ArrayAccessor, GenericListArray, OffsetSizeTrait}; +use crate::{ + empty_offsets, print_long_array, Array, ArrayAccessor, GenericListArray, + OffsetSizeTrait, +}; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; @@ -286,7 +289,11 @@ impl From for GenericBinaryArray empty_offsets::().as_ptr() as *const _, + false => data.buffers()[0].as_ptr(), + }; let values = data.buffers()[1].as_ptr(); Self { data, @@ -845,4 +852,23 @@ mod tests { .validate_full() .expect("All null array has valid array data"); } + + #[test] + fn test_empty_offsets() { + let string = BinaryArray::from( + ArrayData::builder(DataType::Binary) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[0]); + let string = LargeBinaryArray::from( + ArrayData::builder(DataType::LargeBinary) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .build() + .unwrap(), + ); + assert_eq!(string.len(), 0); + assert_eq!(string.value_offsets(), &[0]); + } } diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index b45a0f9257f2..3022db023ab6 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -43,6 +43,17 @@ impl OffsetSizeTrait for i64 { const PREFIX: &'static str = "Large"; } +/// Returns a slice of `OffsetSize` consisting of a single zero value +#[inline] +pub(crate) fn empty_offsets() -> &'static [OffsetSize] { + static OFFSET: &[i64] = &[0]; + // SAFETY: + // OffsetSize is ArrowNativeType and is therefore trivially transmutable + let (prefix, val, suffix) = unsafe { OFFSET.align_to::() }; + assert!(prefix.is_empty() && suffix.is_empty()); + val +} + /// Generic struct for a variable-size list array. 
/// /// Columnar format in Apache Arrow: @@ -240,8 +251,13 @@ impl GenericListArray { } let values = make_array(values); - let value_offsets = data.buffers()[0].as_ptr(); - let value_offsets = unsafe { RawPtrBox::::new(value_offsets) }; + // Handle case of empty offsets + let offsets = match data.is_empty() && data.buffers()[0].is_empty() { + true => empty_offsets::().as_ptr() as *const _, + false => data.buffers()[0].as_ptr(), + }; + + let value_offsets = unsafe { RawPtrBox::new(offsets) }; Ok(Self { data, values, @@ -941,4 +957,26 @@ mod tests { false, ); } + + #[test] + fn test_empty_offsets() { + let f = Box::new(Field::new("element", DataType::Int32, true)); + let string = ListArray::from( + ArrayData::builder(DataType::List(f.clone())) + .buffers(vec![Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[0]); + let string = LargeListArray::from( + ArrayData::builder(DataType::LargeList(f)) + .buffers(vec![Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.len(), 0); + assert_eq!(string.value_offsets(), &[0]); + } } diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 22ad81eaa3f9..7e2ed3667e22 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -18,8 +18,8 @@ use crate::iterator::GenericStringIter; use crate::raw_pointer::RawPtrBox; use crate::{ - print_long_array, Array, ArrayAccessor, GenericBinaryArray, GenericListArray, - OffsetSizeTrait, + empty_offsets, print_long_array, Array, ArrayAccessor, GenericBinaryArray, + GenericListArray, OffsetSizeTrait, }; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; @@ -370,7 +370,11 @@ impl From for GenericStringArray empty_offsets::().as_ptr() as *const _, + false => data.buffers()[0].as_ptr(), + }; let values = data.buffers()[1].as_ptr(); Self { data, @@ -823,4 +827,25 @@ mod tests { fn test_large_string_array_from_list_array_wrong_type() { _test_generic_string_array_from_list_array_wrong_type::(); } + + #[test] + fn test_empty_offsets() { + let string = StringArray::from( + ArrayData::builder(DataType::Utf8) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .build() + .unwrap(), + ); + assert_eq!(string.len(), 0); + assert_eq!(string.value_offsets(), &[0]); + + let string = LargeStringArray::from( + ArrayData::builder(DataType::LargeUtf8) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .build() + .unwrap(), + ); + assert_eq!(string.len(), 0); + assert_eq!(string.value_offsets(), &[0]); + } } From 65d55768c7fae6ad733f1c45421f4e992297cf2b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 13 Oct 2022 20:29:42 +0100 Subject: [PATCH 0125/1411] Simplify OrderPreservingInterner allocation strategy (#2677) (#2827) --- arrow/src/row/interner.rs | 234 +++++++++++++------------------------- 1 file changed, 76 insertions(+), 158 deletions(-) diff --git a/arrow/src/row/interner.rs b/arrow/src/row/interner.rs index 156d23465bfd..e6c8f0972417 100644 --- a/arrow/src/row/interner.rs +++ b/arrow/src/row/interner.rs @@ -17,7 +17,6 @@ use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; -use std::cmp::Ordering; use std::num::NonZeroU32; use std::ops::Index; @@ -134,27 +133,24 @@ impl OrderPreservingInterner { /// returning `None` if it cannot be found pub fn lookup(&self, normalized_key: &[u8]) 
-> Option { let len = normalized_key.len(); + if len <= 1 { + return None; + } - let mut current_slot: Option<&Slot> = None; + let mut bucket = self.bucket.as_ref(); if len > 2 { for v in normalized_key.iter().take(len - 2) { - let slot_idx = v.checked_sub(1)?; - current_slot = Some(match current_slot { - None => &self.bucket.slots[slot_idx as usize], - Some(b) => &b.child.as_ref()?.slots[slot_idx as usize], - }); + if *v == 255 { + bucket = bucket.next.as_ref()?; + } else { + let bucket_idx = v.checked_sub(1)?; + bucket = bucket.slots.get(bucket_idx as usize)?.child.as_ref()?; + } } } - if len > 1 { - let slot_idx = normalized_key[len - 2].checked_sub(2)?; - current_slot = Some(match current_slot { - None => &self.bucket.slots[slot_idx as usize], - Some(b) => &b.child.as_ref()?.slots[slot_idx as usize], - }); - } - - current_slot.as_ref()?.value + let slot_idx = normalized_key[len - 2].checked_sub(2)?; + Some(bucket.slots.get(slot_idx as usize)?.value) } /// Returns the interned value for a given [`Interned`] @@ -216,9 +212,9 @@ impl Index for InternBuffer { /// /// It may contain a value, if not the first slot, and may contain a child [`Bucket`] representing /// the next byte in the generated normalized key -#[derive(Debug, Default, Clone)] +#[derive(Debug, Clone)] struct Slot { - value: Option, + value: Interned, /// Child values less than `self.value` if any child: Option>, } @@ -230,156 +226,57 @@ struct Slot { /// * Contain no `0` bytes other than the null terminator /// * Compare lexicographically in the same manner as the encoded `data` /// -/// The data structure consists of 255 slots, each of which can store a value. +/// The data structure consists of 254 slots, each of which can store a value. /// Additionally each slot may contain a child bucket, containing values smaller -/// than the value within the slot -/// -/// # Allocation Strategy -/// -/// To find the insertion point within a Bucket we perform a binary search of the slots, but -/// capping the search range at 4. Visualizing this as a search tree, the root would have 64 -/// children, with subsequent non-leaf nodes each containing two children. -/// -/// The insertion point is the first empty slot we encounter, otherwise it is the first slot -/// that contains a value greater than the value being inserted -/// -/// For example, initially all slots are empty -/// -/// ```ignore -/// 0: -/// 1: -/// . -/// . -/// 254: -/// ``` -/// -/// Insert `1000` -/// -/// ```ignore -/// 0: -/// 1: -/// 2: -/// 3: 1000 <- 1. slot is empty, insert here -/// 4: -/// . -/// . -/// 254: -/// ``` -/// -/// Insert `500` -/// -/// ```ignore -/// 0: -/// 1: 500 <- 2. slot is empty, insert here -/// 2: -/// 3: 1000 <- 1. compare against slot value -/// 4. -/// . -/// . -/// 254: -/// ``` +/// than the value within the slot. /// -/// Insert `600` +/// Each bucket also may contain a child bucket, containing values greater than +/// all values in the current bucket /// -/// ```ignore -/// 0: -/// 1: 500 <- 2. compare against slot value -/// 2: 600 <- 3. slot is empty, insert here -/// 3: 1000 <- 1. compare against slot value -/// 4. -/// . -/// . -/// 254: -/// ``` +/// # Allocation Strategy /// -/// Insert `400` +/// The contiguous slice of slots containing values is searched to find the insertion +/// point for the new value, according to the sort order. /// -/// ```ignore -/// 0: 400 <- 3. slot is empty, insert here -/// 1: 500 <- 2. compare against slot value -/// 2: 600 -/// 3: 1000 <- 1. compare against slot value -/// 4. -/// . -/// . 
-/// 254: -/// ``` +/// If the insertion position exceeds 254, the number of slots, the value is inserted +/// into the child bucket of the current bucket. /// -/// Insert `700` +/// If the insertion position already contains a value, the value is inserted into the +/// child bucket of that slot. /// -/// ```ignore -/// 0: 400 -/// 1: 500 <- 2. compare against slot value -/// 2: 600 <- 3. slot is occupied and end of search -/// 3: 1000 <- 1. compare against slot value -/// 4. -/// . -/// . -/// 254: -/// ``` -/// -/// In this case we reach the end of our search and need to insert a value between -/// slots 2 and 3. To do this we create a new bucket under slot 3, and repeat -/// the process for that bucket. +/// If the slot is not occupied, the value is inserted into that slot. /// -/// The final key will consists of the slot indexes visited incremented by 1, +/// The final key consists of the slot indexes visited incremented by 1, /// with the final value incremented by 2, followed by a null terminator. /// -/// So in the above example we would have +/// Consider the case of the integers `[8, 6, 5, 7]` inserted in that order /// /// ```ignore -/// 400: &[2, 0] -/// 500: &[3, 0] -/// 600: &[4, 0] -/// 700: &[4, 5, 0] -/// 1000: &[5, 0] +/// 8: &[2, 0] +/// 6: &[1, 2, 0] +/// 5: &[1, 1, 2, 0] +/// 7: &[1, 3, 0] /// ``` /// +/// Note: this allocation strategy is optimised for interning values in sorted order +/// #[derive(Debug, Clone)] struct Bucket { - slots: Box<[Slot]>, + slots: Vec, + /// Bucket containing values larger than all of `slots` + next: Option>, } impl Default for Bucket { fn default() -> Self { - let slots = (0..255).map(|_| Slot::default()).collect::>().into(); - Self { slots } + Self { + slots: Vec::with_capacity(254), + next: None, + } } } impl Bucket { - /// Perform a skewed binary search to find the first slot that is empty or less - /// - /// Returns `Ok(idx)` if an exact match is found, otherwise returns `Err(idx)` - /// containing the slot index to insert at - fn insert_pos(&self, values_buf: &InternBuffer, data: &[u8]) -> Result { - let mut size = self.slots.len() - 1; - let mut left = 0; - let mut right = size; - while left < right { - // Skew binary search to leave gaps of at most 3 elements - let mid = left + (size / 2).min(3); - - let slot = &self.slots[mid]; - let val = match slot.value { - Some(val) => val, - None => return Err(mid), - }; - - let cmp = values_buf[val].cmp(data); - if cmp == Ordering::Less { - left = mid + 1; - } else if cmp == Ordering::Greater { - right = mid; - } else { - return Ok(mid); - } - - size = right - left; - } - Err(left) - } - /// Insert `data` into this bucket or one of its children, appending the /// normalized key to `out` as it is constructed /// @@ -387,23 +284,44 @@ impl Bucket { /// /// Panics if the value already exists fn insert(&mut self, values_buf: &mut InternBuffer, data: &[u8], out: &mut Vec) { - match self.insert_pos(values_buf, data) { - Ok(_) => unreachable!("value already exists"), - Err(idx) => { - let slot = &mut self.slots[idx]; - // Cannot insert a value into slot 254 as would overflow byte, but also - // would prevent inserting any larger values, as the child bucket can - // only contain values less than the slot - if idx != 254 && slot.value.is_none() { - out.push(idx as u8 + 2); - slot.value = Some(values_buf.insert(data)) + let slots_len = self.slots.len() as u8; + // We optimise the case of inserting a value directly after those already inserted + // as [`OrderPreservingInterner::intern`] sorts values 
prior to interning them + match self.slots.last() { + Some(slot) => { + if &values_buf[slot.value] < data { + if slots_len == 254 { + out.push(255); + self.next + .get_or_insert_with(Default::default) + .insert(values_buf, data, out) + } else { + out.push(slots_len + 2); + let value = values_buf.insert(data); + self.slots.push(Slot { value, child: None }); + } } else { - out.push(idx as u8 + 1); - slot.child - .get_or_insert_with(Default::default) - .insert(values_buf, data, out); + // Find insertion point + match self + .slots + .binary_search_by(|slot| values_buf[slot.value].cmp(data)) + { + Ok(_) => unreachable!("value already exists"), + Err(idx) => { + out.push(idx as u8 + 1); + self.slots[idx] + .child + .get_or_insert_with(Default::default) + .insert(values_buf, data, out) + } + } } } + None => { + out.push(2); + let value = values_buf.insert(data); + self.slots.push(Slot { value, child: None }) + } } } } From 1397fb4a0071b89ec6846762a38da6d279d4152b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 13 Oct 2022 21:37:55 +0200 Subject: [PATCH 0126/1411] Don't try to infer nullability in CSV reader (#2860) * Don't try to infer nulls in CSV reader * Clippy * Fix tests * Update arrow/src/csv/reader.rs Co-authored-by: Andrew Lamb * Add comment about nullability of fields * Lint Co-authored-by: Andrew Lamb --- arrow/src/csv/reader.rs | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index 0c7536053ffc..2f4ec1a1ca3a 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -121,6 +121,8 @@ pub struct ReaderOptions { /// /// Return inferred schema and number of records used for inference. This function does not change /// reader cursor offset. +/// +/// The inferred schema will always have each field set as nullable. 
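A minimal usage sketch of the behaviour documented above; the `ReaderBuilder::has_header`/`infer_schema` calls and the file path are illustrative assumptions, only the all-nullable outcome is what the updated tests in this patch assert:

```rust
use std::fs::File;
use arrow::csv::ReaderBuilder;

// Infer a schema from the first 100 records of a headered CSV file (path is illustrative).
let file = File::open("test/data/uk_cities_with_headers.csv").unwrap();
let builder = ReaderBuilder::new().has_header(true).infer_schema(Some(100));
let csv = builder.build(file).unwrap();

// After this change every inferred field is reported as nullable,
// even when the sampled rows contain no empty values.
assert!(csv.schema().fields().iter().all(|f| f.is_nullable()));
```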
pub fn infer_file_schema( reader: R, delimiter: u8, @@ -200,8 +202,6 @@ fn infer_reader_schema_with_csv_options( let header_length = headers.len(); // keep track of inferred field types let mut column_types: Vec> = vec![HashSet::new(); header_length]; - // keep track of columns with nulls - let mut nulls: Vec = vec![false; header_length]; let mut records_count = 0; let mut fields = vec![]; @@ -214,12 +214,12 @@ fn infer_reader_schema_with_csv_options( } records_count += 1; - for i in 0..header_length { + // Note since we may be looking at a sample of the data, we make the safe assumption that + // they could be nullable + for (i, column_type) in column_types.iter_mut().enumerate().take(header_length) { if let Some(string) = record.get(i) { - if string.is_empty() { - nulls[i] = true; - } else { - column_types[i] + if !string.is_empty() { + column_type .insert(infer_field_schema(string, roptions.datetime_re.clone())); } } @@ -229,7 +229,6 @@ fn infer_reader_schema_with_csv_options( // build schema from inference results for i in 0..header_length { let possibilities = &column_types[i]; - let has_nulls = nulls[i]; let field_name = &headers[i]; // determine data type based on possible types @@ -237,7 +236,7 @@ fn infer_reader_schema_with_csv_options( match possibilities.len() { 1 => { for dtype in possibilities.iter() { - fields.push(Field::new(field_name, dtype.clone(), has_nulls)); + fields.push(Field::new(field_name, dtype.clone(), true)); } } 2 => { @@ -245,13 +244,13 @@ fn infer_reader_schema_with_csv_options( && possibilities.contains(&DataType::Float64) { // we have an integer and double, fall down to double - fields.push(Field::new(field_name, DataType::Float64, has_nulls)); + fields.push(Field::new(field_name, DataType::Float64, true)); } else { // default to Utf8 for conflicting datatypes (e.g bool and int) - fields.push(Field::new(field_name, DataType::Utf8, has_nulls)); + fields.push(Field::new(field_name, DataType::Utf8, true)); } } - _ => fields.push(Field::new(field_name, DataType::Utf8, has_nulls)), + _ => fields.push(Field::new(field_name, DataType::Utf8, true)), } } @@ -1287,9 +1286,9 @@ mod tests { let mut csv = builder.build(file).unwrap(); let expected_schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), + Field::new("city", DataType::Utf8, true), + Field::new("lat", DataType::Float64, true), + Field::new("lng", DataType::Float64, true), ]); assert_eq!(Arc::new(expected_schema), csv.schema()); let batch = csv.next().unwrap().unwrap(); @@ -1514,10 +1513,10 @@ mod tests { ] ); - assert!(!schema.field(0).is_nullable()); + assert!(schema.field(0).is_nullable()); assert!(schema.field(1).is_nullable()); assert!(schema.field(2).is_nullable()); - assert!(!schema.field(3).is_nullable()); + assert!(schema.field(3).is_nullable()); assert!(schema.field(4).is_nullable()); assert!(schema.field(5).is_nullable()); @@ -1798,10 +1797,10 @@ mod tests { )?; assert_eq!(schema.fields().len(), 4); - assert!(!schema.field(0).is_nullable()); + assert!(schema.field(0).is_nullable()); assert!(schema.field(1).is_nullable()); - assert!(!schema.field(2).is_nullable()); - assert!(!schema.field(3).is_nullable()); + assert!(schema.field(2).is_nullable()); + assert!(schema.field(3).is_nullable()); assert_eq!(&DataType::Int64, schema.field(0).data_type()); assert_eq!(&DataType::Utf8, schema.field(1).data_type()); From 8adebca35253943fffb0653e7521eaf7a25b0153 Mon Sep 17 00:00:00 2001 From: Raphael 
Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 13 Oct 2022 21:08:22 +0100 Subject: [PATCH 0127/1411] Validate ArrayData type when converting to Array (#2834) (#2835) * Validate ArrayData type when converting to Array (#2834) * Fix cast kernel and take kernel tests * Clippy * Fix parquet * Clippy --- arrow-array/src/array/binary_array.rs | 9 +++++++ arrow-array/src/array/boolean_array.rs | 17 ++++++++++++ arrow-array/src/array/decimal_array.rs | 25 ++++++++++++++--- arrow-array/src/array/dictionary_array.rs | 22 ++++++++++++--- arrow-array/src/array/list_array.rs | 15 +++++++++++ arrow-array/src/array/map_array.rs | 23 ++++++++++++++++ arrow-array/src/array/primitive_array.rs | 19 +++++++++++++ arrow/src/compute/kernels/cast.rs | 11 ++++---- arrow/src/compute/kernels/take.rs | 4 +-- .../src/arrow/array_reader/primitive_array.rs | 27 ++++++++++++------- 10 files changed, 148 insertions(+), 24 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 851fb60c0787..c8407b252ef1 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -297,6 +297,8 @@ impl From for GenericBinaryArray>> for BooleanArray { impl From for BooleanArray { fn from(data: ArrayData) -> Self { + assert_eq!( + data.data_type(), + &DataType::Boolean, + "BooleanArray expected ArrayData with type {} got {}", + DataType::Boolean, + data.data_type() + ); assert_eq!( data.buffers().len(), 1, @@ -209,6 +216,8 @@ impl From for BooleanArray { let ptr = data.buffers()[0].as_ptr(); Self { data, + // SAFETY: + // ArrayData must be valid, and validated data type above raw_values: unsafe { RawPtrBox::new(ptr) }, } } @@ -414,4 +423,12 @@ mod tests { }; drop(BooleanArray::from(data)); } + + #[test] + #[should_panic( + expected = "BooleanArray expected ArrayData with type Boolean got Int32" + )] + fn test_from_array_data_validation() { + let _ = BooleanArray::from(ArrayData::new_empty(&DataType::Int32)); + } } diff --git a/arrow-array/src/array/decimal_array.rs b/arrow-array/src/array/decimal_array.rs index 34b424092e4b..5ca9b0715cf1 100644 --- a/arrow-array/src/array/decimal_array.rs +++ b/arrow-array/src/array/decimal_array.rs @@ -407,13 +407,21 @@ impl From for DecimalArray { "DecimalArray data should contain 1 buffer only (values)" ); let values = data.buffers()[0].as_ptr(); - let (precision, scale) = match (data.data_type(), Self::VALUE_LENGTH) { - (DataType::Decimal128(precision, scale), 16) - | (DataType::Decimal256(precision, scale), 32) => (*precision, *scale), - _ => panic!("Expected data type to be Decimal"), + let (precision, scale) = match (data.data_type(), Self::DEFAULT_TYPE) { + (DataType::Decimal128(precision, scale), DataType::Decimal128(_, _)) + | (DataType::Decimal256(precision, scale), DataType::Decimal256(_, _)) => { + (*precision, *scale) + } + _ => panic!( + "Expected data type to match {} got {}", + Self::DEFAULT_TYPE, + data.data_type() + ), }; Self { data, + // SAFETY: + // ArrayData must be valid, and verified data type above value_data: unsafe { RawPtrBox::new(values) }, precision, scale, @@ -977,4 +985,13 @@ mod tests { array.value(4); } + + #[test] + #[should_panic( + expected = "Expected data type to match Decimal256(76, 10) got Decimal128(38, 10)" + )] + fn test_from_array_data_validation() { + let array = Decimal128Array::from_iter_values(vec![-100, 0, 101].into_iter()); + let _ = Decimal256Array::from(array.into_data()); + } } diff --git a/arrow-array/src/array/dictionary_array.rs 
b/arrow-array/src/array/dictionary_array.rs index 96e91f729ab1..002ee6f47820 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -408,10 +408,17 @@ impl From for DictionaryArray { ); if let DataType::Dictionary(key_data_type, _) = data.data_type() { - if key_data_type.as_ref() != &T::DATA_TYPE { - panic!("DictionaryArray's data type must match.") - }; + assert_eq!( + &T::DATA_TYPE, + key_data_type.as_ref(), + "DictionaryArray's data type must match, expected {} got {}", + T::DATA_TYPE, + key_data_type + ); + // create a zero-copy of the keys' data + // SAFETY: + // ArrayData is valid and verified type above let keys = PrimitiveArray::::from(unsafe { ArrayData::new_unchecked( T::DATA_TYPE, @@ -925,4 +932,13 @@ mod tests { let keys: Float32Array = [Some(0_f32), None, Some(3_f32)].into_iter().collect(); DictionaryArray::::try_new(&keys, &values).unwrap(); } + + #[test] + #[should_panic( + expected = "DictionaryArray's data type must match, expected Int64 got Int32" + )] + fn test_from_array_data_validation() { + let a = DictionaryArray::::from_iter(["32"]); + let _ = DictionaryArray::::from(a.into_data()); + } } diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 3022db023ab6..cdc7531d99fb 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -257,6 +257,8 @@ impl GenericListArray { false => data.buffers()[0].as_ptr(), }; + // SAFETY: + // Verified list type in call to `Self::get_type` let value_offsets = unsafe { RawPtrBox::new(offsets) }; Ok(Self { data, @@ -362,6 +364,7 @@ pub type LargeListArray = GenericListArray; #[cfg(test)] mod tests { use super::*; + use crate::builder::{Int32Builder, ListBuilder}; use crate::types::Int32Type; use crate::Int32Array; use arrow_buffer::{bit_util, Buffer, ToByteSlice}; @@ -820,6 +823,18 @@ mod tests { drop(ListArray::from(list_data)); } + #[test] + #[should_panic( + expected = "[Large]ListArray's datatype must be [Large]ListArray(). 
It is List" + )] + fn test_from_array_data_validation() { + let mut builder = ListBuilder::new(Int32Builder::new()); + builder.values().append_value(1); + builder.append(true); + let array = builder.finish(); + let _ = LargeListArray::from(array.into_data()); + } + #[test] fn test_list_array_offsets_need_not_start_at_zero() { let value_data = ArrayData::builder(DataType::Int32) diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index bfe8d407274c..0f3ae2e689a2 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -109,6 +109,12 @@ impl From for ArrayData { impl MapArray { fn try_new_from_array_data(data: ArrayData) -> Result { + assert!( + matches!(data.data_type(), DataType::Map(_, _)), + "MapArray expected ArrayData with DataType::Map got {}", + data.data_type() + ); + if data.buffers().len() != 1 { return Err(ArrowError::InvalidArgumentError( format!("MapArray data should contain a single buffer only (value offsets), had {}", @@ -141,6 +147,8 @@ impl MapArray { let values = make_array(entries); let value_offsets = data.buffers()[0].as_ptr(); + // SAFETY: + // ArrayData is valid, and verified type above let value_offsets = unsafe { RawPtrBox::::new(value_offsets) }; unsafe { if (*value_offsets.as_ptr().offset(0)) != 0 { @@ -467,6 +475,21 @@ mod tests { map_array.value(map_array.len()); } + #[test] + #[should_panic( + expected = "MapArray expected ArrayData with DataType::Map got Dictionary" + )] + fn test_from_array_data_validation() { + // A DictionaryArray has similar buffer layout to a MapArray + // but the meaning of the values differs + let struct_t = DataType::Struct(vec![ + Field::new("keys", DataType::Int32, true), + Field::new("values", DataType::UInt32, true), + ]); + let dict_t = DataType::Dictionary(Box::new(DataType::Int32), Box::new(struct_t)); + let _ = MapArray::from(ArrayData::new_empty(&dict_t)); + } + #[test] fn test_new_from_strings() { let keys = vec!["a", "b", "c", "d", "e", "f", "g", "h"]; diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 928135463cca..895c80b07530 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -818,6 +818,14 @@ impl PrimitiveArray { /// Constructs a `PrimitiveArray` from an array data reference. 
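A small sketch of the type check added below, mirroring the panic message asserted in `test_from_array_data_validation`; the array contents are arbitrary:

```rust
use arrow_array::{Array, Int32Array};

let ints = Int32Array::from(vec![1, 2, 3]);
let data = ints.into_data();

// Reconstructing with the matching type still works as before.
let roundtrip = Int32Array::from(data.clone());
assert_eq!(roundtrip.len(), 3);

// Reinterpreting the same ArrayData as a different primitive type now panics with
// "PrimitiveArray expected ArrayData with type Int64 got Int32":
// let _ = arrow_array::Int64Array::from(data);
```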
impl From for PrimitiveArray { fn from(data: ArrayData) -> Self { + // Use discriminant to allow for decimals + assert_eq!( + std::mem::discriminant(&T::DATA_TYPE), + std::mem::discriminant(data.data_type()), + "PrimitiveArray expected ArrayData with type {} got {}", + T::DATA_TYPE, + data.data_type() + ); assert_eq!( data.buffers().len(), 1, @@ -827,6 +835,8 @@ impl From for PrimitiveArray { let ptr = data.buffers()[0].as_ptr(); Self { data, + // SAFETY: + // ArrayData must be valid, and validated data type above raw_values: unsafe { RawPtrBox::new(ptr) }, } } @@ -1352,6 +1362,15 @@ mod tests { array.value(4); } + #[test] + #[should_panic( + expected = "PrimitiveArray expected ArrayData with type Int64 got Int32" + )] + fn test_from_array_data_validation() { + let foo = PrimitiveArray::::from_iter([1, 2, 3]); + let _ = PrimitiveArray::::from(foo.into_data()); + } + #[test] fn test_decimal128() { let values: Vec<_> = vec![0, 1, -1, i128::MIN, i128::MAX]; diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index b573c65d026f..49a9b18d85f6 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -1312,15 +1312,16 @@ pub fn cast_with_options( )), (Timestamp(from_unit, _), Timestamp(to_unit, to_tz)) => { - let time_array = Int64Array::from(array.data().clone()); + let array = cast_with_options(array, &Int64, cast_options)?; + let time_array = as_primitive_array::(array.as_ref()); let from_size = time_unit_multiple(from_unit); let to_size = time_unit_multiple(to_unit); // we either divide or multiply, depending on size of each unit // units are never the same when the types are the same let converted = if from_size >= to_size { - divide_scalar(&time_array, from_size / to_size)? + divide_scalar(time_array, from_size / to_size)? } else { - multiply_scalar(&time_array, to_size / from_size)? + multiply_scalar(time_array, to_size / from_size)? 
}; Ok(make_timestamp_array( &converted, @@ -1329,10 +1330,10 @@ pub fn cast_with_options( )) } (Timestamp(from_unit, _), Date32) => { - let time_array = Int64Array::from(array.data().clone()); + let array = cast_with_options(array, &Int64, cast_options)?; + let time_array = as_primitive_array::(array.as_ref()); let from_size = time_unit_multiple(from_unit) * SECONDS_IN_DAY; - // Int32Array::from_iter(tim.iter) let mut b = Date32Builder::with_capacity(array.len()); for i in 0..array.len() { diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 1aa4473c0444..b9cfae516f89 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -1398,7 +1398,7 @@ mod tests { fn test_take_bool_nullable_index() { // indices where the masked invalid elements would be out of bounds let index_data = ArrayData::try_new( - DataType::Int32, + DataType::UInt32, 6, Some(Buffer::from_iter(vec![ false, true, false, true, false, true, @@ -1421,7 +1421,7 @@ mod tests { fn test_take_bool_nullable_index_nonnull_values() { // indices where the masked invalid elements would be out of bounds let index_data = ArrayData::try_new( - DataType::Int32, + DataType::UInt32, 6, Some(Buffer::from_iter(vec![ false, true, false, true, false, true, diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index d4f96e6a8d60..5fc5e639de92 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -26,7 +26,8 @@ use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use arrow::array::{ ArrayDataBuilder, ArrayRef, BooleanArray, BooleanBufferBuilder, Decimal128Array, - Float32Array, Float64Array, Int32Array, Int64Array,TimestampNanosecondArray, TimestampNanosecondBufferBuilder, + Float32Array, Float64Array, Int32Array, Int64Array, TimestampNanosecondArray, + TimestampNanosecondBufferBuilder, UInt32Array, UInt64Array, }; use arrow::buffer::Buffer; use arrow::datatypes::{DataType as ArrowType, TimeUnit}; @@ -169,15 +170,21 @@ where .null_bit_buffer(self.record_reader.consume_bitmap_buffer()); let array_data = unsafe { array_data.build_unchecked() }; - let array = match T::get_physical_type() { - PhysicalType::BOOLEAN => Arc::new(BooleanArray::from(array_data)) as ArrayRef, - PhysicalType::INT32 => Arc::new(Int32Array::from(array_data)) as ArrayRef, - PhysicalType::INT64 => Arc::new(Int64Array::from(array_data)) as ArrayRef, - PhysicalType::FLOAT => Arc::new(Float32Array::from(array_data)) as ArrayRef, - PhysicalType::DOUBLE => Arc::new(Float64Array::from(array_data)) as ArrayRef, - PhysicalType::INT96 => { - Arc::new(TimestampNanosecondArray::from(array_data)) as ArrayRef - } + let array: ArrayRef = match T::get_physical_type() { + PhysicalType::BOOLEAN => Arc::new(BooleanArray::from(array_data)), + PhysicalType::INT32 => match array_data.data_type() { + ArrowType::UInt32 => Arc::new(UInt32Array::from(array_data)), + ArrowType::Int32 => Arc::new(Int32Array::from(array_data)), + _ => unreachable!(), + }, + PhysicalType::INT64 => match array_data.data_type() { + ArrowType::UInt64 => Arc::new(UInt64Array::from(array_data)), + ArrowType::Int64 => Arc::new(Int64Array::from(array_data)), + _ => unreachable!(), + }, + PhysicalType::FLOAT => Arc::new(Float32Array::from(array_data)), + PhysicalType::DOUBLE => Arc::new(Float64Array::from(array_data)), + PhysicalType::INT96 => Arc::new(TimestampNanosecondArray::from(array_data)), 
PhysicalType::BYTE_ARRAY | PhysicalType::FIXED_LEN_BYTE_ARRAY => { unreachable!( "PrimitiveArrayReaders don't support complex physical types" From fa1d079678bb1bc4bcc1ed3d6c9d41598a4f9adb Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Oct 2022 10:01:24 +1300 Subject: [PATCH 0128/1411] Add `interleave` kernel (#1523) (#2838) * Add interleave kernel (#1523) * RAT * Review feedback --- arrow/src/compute/kernels/interleave.rs | 214 ++++++++++++++++++++++++ arrow/src/compute/kernels/mod.rs | 1 + arrow/src/compute/kernels/take.rs | 9 +- arrow/src/compute/mod.rs | 1 + 4 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 arrow/src/compute/kernels/interleave.rs diff --git a/arrow/src/compute/kernels/interleave.rs b/arrow/src/compute/kernels/interleave.rs new file mode 100644 index 000000000000..01ac0fc8fe36 --- /dev/null +++ b/arrow/src/compute/kernels/interleave.rs @@ -0,0 +1,214 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{make_array, new_empty_array, Array, ArrayRef}; +use arrow_data::transform::MutableArrayData; +use arrow_schema::ArrowError; + +/// +/// Takes elements by index from a list of [`Array`], creating a new [`Array`] from those values. 
+/// +/// Each element in `indices` is a pair of `usize` with the first identifying the index +/// of the [`Array`] in `values`, and the second the index of the value within that [`Array`] +/// +/// ```text +/// ┌─────────────────┐ ┌─────────┐ ┌─────────────────┐ +/// │ A │ │ (0, 0) │ interleave( │ A │ +/// ├─────────────────┤ ├─────────┤ [values0, values1], ├─────────────────┤ +/// │ D │ │ (1, 0) │ indices │ B │ +/// └─────────────────┘ ├─────────┤ ) ├─────────────────┤ +/// values array 0 │ (1, 1) │ ─────────────────────────▶ │ C │ +/// ├─────────┤ ├─────────────────┤ +/// │ (0, 1) │ │ D │ +/// └─────────┘ └─────────────────┘ +/// ┌─────────────────┐ indices +/// │ B │ array +/// ├─────────────────┤ result +/// │ C │ +/// ├─────────────────┤ +/// │ E │ +/// └─────────────────┘ +/// values array 1 +/// ``` +/// +/// For selecting values by index from a single array see [compute::take](crate::compute::take) +pub fn interleave( + values: &[&dyn Array], + indices: &[(usize, usize)], +) -> Result { + if values.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "interleave requires input of at least one array".to_string(), + )); + } + let data_type = values[0].data_type(); + + for array in values.iter().skip(1) { + if array.data_type() != data_type { + return Err(ArrowError::InvalidArgumentError( + format!("It is not possible to interleave arrays of different data types ({} and {})", + data_type, array.data_type()), + )); + } + } + + if indices.is_empty() { + return Ok(new_empty_array(data_type)); + } + + // TODO: Add specialized implementations (#2864) + + interleave_fallback(values, indices) +} + +/// Fallback implementation of interleave using [`MutableArrayData`] +fn interleave_fallback( + values: &[&dyn Array], + indices: &[(usize, usize)], +) -> Result { + let arrays: Vec<_> = values.iter().map(|x| x.data()).collect(); + let mut array_data = MutableArrayData::new(arrays, false, indices.len()); + + let mut cur_array = indices[0].0; + let mut start_row_idx = indices[0].1; + let mut end_row_idx = start_row_idx + 1; + + for (array, row) in indices.iter().skip(1).copied() { + if array == cur_array && row == end_row_idx { + // subsequent row in same batch + end_row_idx += 1; + continue; + } + + // emit current batch of rows for current buffer + array_data.extend(cur_array, start_row_idx, end_row_idx); + + // start new batch of rows + cur_array = array; + start_row_idx = row; + end_row_idx = start_row_idx + 1; + } + + // emit final batch of rows + array_data.extend(cur_array, start_row_idx, end_row_idx); + Ok(make_array(array_data.freeze())) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::builder::{Int32Builder, ListBuilder}; + use arrow_array::cast::{as_primitive_array, as_string_array}; + use arrow_array::types::Int32Type; + use arrow_array::{Int32Array, ListArray, StringArray}; + use arrow_schema::DataType; + + #[test] + fn test_primitive() { + let a = Int32Array::from_iter_values([1, 2, 3, 4]); + let b = Int32Array::from_iter_values([5, 6, 7]); + let c = Int32Array::from_iter_values([8, 9, 10]); + let values = + interleave(&[&a, &b, &c], &[(0, 3), (0, 3), (2, 2), (2, 0), (1, 1)]).unwrap(); + let v = as_primitive_array::(&values); + assert_eq!(v.values(), &[4, 4, 10, 8, 6]); + } + + #[test] + fn test_primitive_nulls() { + let a = Int32Array::from_iter_values([1, 2, 3, 4]); + let b = Int32Array::from_iter([Some(1), Some(4), None]); + let values = + interleave(&[&a, &b], &[(0, 1), (1, 2), (1, 2), (0, 3), (0, 2)]).unwrap(); + let v: Vec<_> = 
as_primitive_array::(&values) + .into_iter() + .collect(); + assert_eq!(&v, &[Some(2), None, None, Some(4), Some(3)]) + } + + #[test] + fn test_primitive_empty() { + let a = Int32Array::from_iter_values([1, 2, 3, 4]); + let v = interleave(&[&a], &[]).unwrap(); + assert!(v.is_empty()); + assert_eq!(v.data_type(), &DataType::Int32); + } + + #[test] + fn test_strings() { + let a = StringArray::from_iter_values(["a", "b", "c"]); + let b = StringArray::from_iter_values(["hello", "world", "foo"]); + let values = + interleave(&[&a, &b], &[(0, 2), (0, 2), (1, 0), (1, 1), (0, 1)]).unwrap(); + let v = as_string_array(&values); + let values: Vec<_> = v.into_iter().collect(); + assert_eq!( + &values, + &[ + Some("c"), + Some("c"), + Some("hello"), + Some("world"), + Some("b") + ] + ) + } + + #[test] + fn test_lists() { + // [[1, 2], null, [3]] + let mut a = ListBuilder::new(Int32Builder::new()); + a.values().append_value(1); + a.values().append_value(2); + a.append(true); + a.append(false); + a.values().append_value(3); + a.append(true); + let a = a.finish(); + + // [[4], null, [5, 6, null]] + let mut b = ListBuilder::new(Int32Builder::new()); + b.values().append_value(4); + b.append(true); + b.append(false); + b.values().append_value(5); + b.values().append_value(6); + b.values().append_null(); + b.append(true); + let b = b.finish(); + + let values = + interleave(&[&a, &b], &[(0, 2), (0, 1), (1, 0), (1, 2), (1, 1)]).unwrap(); + let v = values.as_any().downcast_ref::().unwrap(); + + // [[3], null, [4], [5, 6, null], null] + let mut expected = ListBuilder::new(Int32Builder::new()); + expected.values().append_value(3); + expected.append(true); + expected.append(false); + expected.values().append_value(4); + expected.append(true); + expected.values().append_value(5); + expected.values().append_value(6); + expected.values().append_null(); + expected.append(true); + expected.append(false); + let expected = expected.finish(); + + assert_eq!(v, &expected); + } +} diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs index 99cdcf460ce1..8301f69bbf8b 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels/mod.rs @@ -28,6 +28,7 @@ pub mod comparison; pub mod concat; pub mod concat_elements; pub mod filter; +pub mod interleave; pub mod length; pub mod limit; pub mod partition; diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index b9cfae516f89..714c29772a50 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -46,15 +46,20 @@ use num::{ToPrimitive, Zero}; /// ├─────────────────┤ └─────────┘ └─────────────────┘ /// │ E │ /// └─────────────────┘ -/// values array indicies array result +/// values array indices array result /// ``` /// +/// For selecting values by index from multiple arrays see [compute::interleave](crate::compute::interleave) +/// /// # Errors /// This function errors whenever: /// * An index cannot be casted to `usize` (typically 32 bit architectures) /// * An index is out of bounds and `options` is set to check bounds. +/// /// # Safety -/// When `options` is not set to check bounds (default), taking indexes after `len` is undefined behavior. +/// +/// When `options` is not set to check bounds, taking indexes after `len` will panic. 
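Since the `take` documentation now cross-references it, a minimal usage sketch of the `interleave` kernel introduced in this patch (re-exported from `arrow::compute`); the values are arbitrary:

```rust
use arrow::array::{as_primitive_array, Int32Array};
use arrow::compute::interleave;
use arrow::datatypes::Int32Type;

let a = Int32Array::from(vec![1, 2, 3]);
let b = Int32Array::from(vec![10, 20, 30]);

// Take row 0 of `a`, then row 2 of `b`, then row 1 of `a`.
let result = interleave(&[&a, &b], &[(0, 0), (1, 2), (0, 1)]).unwrap();
assert_eq!(as_primitive_array::<Int32Type>(&result).values(), &[1, 30, 2]);
```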
+/// /// # Examples /// ``` /// use arrow::array::{StringArray, UInt32Array}; diff --git a/arrow/src/compute/mod.rs b/arrow/src/compute/mod.rs index 2b3b9a76873a..28e5e6b520bc 100644 --- a/arrow/src/compute/mod.rs +++ b/arrow/src/compute/mod.rs @@ -29,6 +29,7 @@ pub use self::kernels::cast::*; pub use self::kernels::comparison::*; pub use self::kernels::concat::*; pub use self::kernels::filter::*; +pub use self::kernels::interleave::*; pub use self::kernels::limit::*; pub use self::kernels::partition::*; pub use self::kernels::regexp::*; From eeb12612217b7658897702d778f6c92939320007 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 13 Oct 2022 14:31:28 -0700 Subject: [PATCH 0129/1411] Make DecimalArray as PrimitiveArray (#2857) * Make DecimalArray as PrimitiveArray * Add decimal array tests back * Remove value validation in with_precision_and_scale * Add "force_validate" cfg check back * Trigger Build Co-authored-by: Raphael Taylor-Davies --- arrow-array/src/array/decimal_array.rs | 997 ------------------- arrow-array/src/array/mod.rs | 9 - arrow-array/src/array/primitive_array.rs | 378 ++++++- arrow-array/src/builder/decimal_builder.rs | 382 ------- arrow-array/src/builder/mod.rs | 2 - arrow-array/src/builder/primitive_builder.rs | 3 + arrow-array/src/builder/struct_builder.rs | 6 +- arrow-array/src/decimal.rs | 14 +- arrow-array/src/iterator.rs | 12 +- arrow-array/src/types.rs | 36 +- arrow-buffer/src/bigint.rs | 10 + arrow/benches/array_from_vec.rs | 8 +- arrow/benches/builder.rs | 34 +- arrow/benches/cast_kernels.rs | 24 +- arrow/benches/decimal_validate.rs | 25 +- arrow/src/compute/kernels/cast.rs | 125 ++- arrow/src/compute/kernels/sort.rs | 2 +- arrow/src/compute/kernels/take.rs | 4 +- arrow/src/csv/reader.rs | 31 +- arrow/src/row/fixed.rs | 6 +- arrow/src/row/mod.rs | 4 +- arrow/src/util/display.rs | 3 +- arrow/tests/array_validation.rs | 6 +- integration-testing/Cargo.toml | 1 + integration-testing/src/util/mod.rs | 33 +- parquet/src/arrow/arrow_reader/mod.rs | 6 +- parquet/src/arrow/arrow_writer/mod.rs | 4 +- 27 files changed, 592 insertions(+), 1573 deletions(-) delete mode 100644 arrow-array/src/array/decimal_array.rs delete mode 100644 arrow-array/src/builder/decimal_builder.rs diff --git a/arrow-array/src/array/decimal_array.rs b/arrow-array/src/array/decimal_array.rs deleted file mode 100644 index 5ca9b0715cf1..000000000000 --- a/arrow-array/src/array/decimal_array.rs +++ /dev/null @@ -1,997 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use crate::builder::BooleanBufferBuilder; -use crate::decimal::{Decimal, Decimal256}; -use crate::iterator::DecimalIter; -use crate::raw_pointer::RawPtrBox; -use crate::types::{Decimal128Type, Decimal256Type, DecimalType, NativeDecimalType}; -use crate::{ - print_long_array, Array, ArrayAccessor, FixedSizeBinaryArray, FixedSizeListArray, -}; -use arrow_buffer::{Buffer, MutableBuffer}; -use arrow_data::decimal::{ - validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, -}; -use arrow_data::ArrayData; -use arrow_schema::{ArrowError, DataType}; -use std::any::Any; -use std::marker::PhantomData; - -/// `Decimal128Array` stores fixed width decimal numbers, -/// with a fixed precision and scale. -/// -/// # Examples -/// -/// ``` -/// use arrow_array::{Array, DecimalArray, Decimal128Array}; -/// use arrow_schema::DataType; -/// -/// // Create a DecimalArray with the default precision and scale -/// let decimal_array: Decimal128Array = vec![ -/// Some(8_887_000_000_i128), -/// None, -/// Some(-8_887_000_000_i128), -/// ] -/// .into_iter().collect(); -/// -/// // set precision and scale so values are interpreted -/// // as `8887.000000`, `Null`, and `-8887.000000` -/// let decimal_array = decimal_array -/// .with_precision_and_scale(23, 6) -/// .unwrap(); -/// -/// assert_eq!(&DataType::Decimal128(23, 6), decimal_array.data_type()); -/// assert_eq!(8_887_000_000_i128, decimal_array.value(0).as_i128()); -/// assert_eq!("8887.000000", decimal_array.value_as_string(0)); -/// assert_eq!(3, decimal_array.len()); -/// assert_eq!(1, decimal_array.null_count()); -/// assert_eq!(32, decimal_array.value_offset(2)); -/// assert_eq!(16, decimal_array.value_length()); -/// assert_eq!(23, decimal_array.precision()); -/// assert_eq!(6, decimal_array.scale()); -/// ``` -/// -pub type Decimal128Array = DecimalArray; - -/// `Decimal256Array` stores fixed width decimal numbers, -/// with a fixed precision and scale -pub type Decimal256Array = DecimalArray; - -/// A generic [`Array`] for fixed width decimal numbers -/// -/// See [`Decimal128Array`] and [`Decimal256Array`] -pub struct DecimalArray { - data: ArrayData, - value_data: RawPtrBox, - precision: u8, - scale: u8, - _phantom: PhantomData, -} - -impl DecimalArray { - pub const VALUE_LENGTH: i32 = T::BYTE_LENGTH as i32; - const DEFAULT_TYPE: DataType = T::DEFAULT_TYPE; - pub const MAX_PRECISION: u8 = T::MAX_PRECISION; - pub const MAX_SCALE: u8 = T::MAX_SCALE; - const TYPE_CONSTRUCTOR: fn(u8, u8) -> DataType = T::TYPE_CONSTRUCTOR; - - pub fn data(&self) -> &ArrayData { - &self.data - } - - /// Return the precision (total digits) that can be stored by this array - pub fn precision(&self) -> u8 { - self.precision - } - - /// Return the scale (digits after the decimal) that can be stored by this array - pub fn scale(&self) -> u8 { - self.scale - } - - /// Returns the element at index `i`. - /// # Panics - /// Panics if index `i` is out of bounds. - pub fn value(&self, i: usize) -> Decimal { - assert!( - i < self.data().len(), - "Trying to access an element at index {} from a DecimalArray of length {}", - i, - self.len() - ); - - unsafe { self.value_unchecked(i) } - } - - /// Returns the element at index `i`. 
- /// # Safety - /// Caller is responsible for ensuring that the index is within the bounds of the array - pub unsafe fn value_unchecked(&self, i: usize) -> Decimal { - let data = self.data(); - let offset = i + data.offset(); - let raw_val = { - let pos = self.value_offset_at(offset); - T::Native::from_slice(std::slice::from_raw_parts( - self.raw_value_data_ptr().offset(pos as isize), - Self::VALUE_LENGTH as usize, - )) - }; - Decimal::new(self.precision(), self.scale(), &raw_val) - } - - /// Returns the offset for the element at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. - #[inline] - pub fn value_offset(&self, i: usize) -> i32 { - self.value_offset_at(self.data().offset() + i) - } - - /// Returns the length for an element. - /// - /// All elements have the same length as the array is a fixed size. - #[inline] - pub fn value_length(&self) -> i32 { - Self::VALUE_LENGTH - } - - /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.data().buffers()[0].clone() - } - - #[inline] - pub fn value_offset_at(&self, i: usize) -> i32 { - Self::VALUE_LENGTH * i as i32 - } - - #[inline] - pub fn value_as_string(&self, row: usize) -> String { - self.value(row).to_string() - } - - /// Build a decimal array from [`FixedSizeBinaryArray`]. - /// - /// NB: This function does not validate that each value is in the permissible - /// range for a decimal - pub fn from_fixed_size_binary_array( - v: FixedSizeBinaryArray, - precision: u8, - scale: u8, - ) -> Self { - assert!( - v.value_length() == Self::VALUE_LENGTH, - "Value length of the array ({}) must equal to the byte width of the decimal ({})", - v.value_length(), - Self::VALUE_LENGTH, - ); - let data_type = if Self::VALUE_LENGTH == 16 { - DataType::Decimal128(precision, scale) - } else { - DataType::Decimal256(precision, scale) - }; - let builder = v.into_data().into_builder().data_type(data_type); - - let array_data = unsafe { builder.build_unchecked() }; - Self::from(array_data) - } - - /// Build a decimal array from [`FixedSizeListArray`]. - /// - /// NB: This function does not validate that each value is in the permissible - /// range for a decimal. - #[deprecated(note = "please use `from_fixed_size_binary_array` instead")] - pub fn from_fixed_size_list_array( - v: FixedSizeListArray, - precision: u8, - scale: u8, - ) -> Self { - assert_eq!( - v.data_ref().child_data().len(), - 1, - "DecimalArray can only be created from list array of u8 values \ - (i.e. FixedSizeList>)." - ); - let child_data = &v.data_ref().child_data()[0]; - - assert_eq!( - child_data.child_data().len(), - 0, - "DecimalArray can only be created from list array of u8 values \ - (i.e. FixedSizeList>)." - ); - assert_eq!( - child_data.data_type(), - &DataType::UInt8, - "DecimalArray can only be created from FixedSizeList arrays, mismatched data types." - ); - assert!( - v.value_length() == Self::VALUE_LENGTH, - "Value length of the array ({}) must equal to the byte width of the decimal ({})", - v.value_length(), - Self::VALUE_LENGTH, - ); - assert_eq!( - v.data_ref().child_data()[0].null_count(), - 0, - "The child array cannot contain null values." 
- ); - - let list_offset = v.offset(); - let child_offset = child_data.offset(); - let data_type = if Self::VALUE_LENGTH == 16 { - DataType::Decimal128(precision, scale) - } else { - DataType::Decimal256(precision, scale) - }; - let builder = ArrayData::builder(data_type) - .len(v.len()) - .add_buffer(child_data.buffers()[0].slice(child_offset)) - .null_bit_buffer(v.data_ref().null_buffer().cloned()) - .offset(list_offset); - - let array_data = unsafe { builder.build_unchecked() }; - Self::from(array_data) - } - - /// The default precision and scale used when not specified. - pub const fn default_type() -> DataType { - Self::DEFAULT_TYPE - } - - fn raw_value_data_ptr(&self) -> *const u8 { - self.value_data.as_ptr() - } - - /// Returns a Decimal array with the same data as self, with the - /// specified precision. - /// - /// Returns an Error if: - /// 1. `precision` is larger than [`Self::MAX_PRECISION`] - /// 2. `scale` is larger than [`Self::MAX_SCALE`]; - /// 3. `scale` is > `precision` - pub fn with_precision_and_scale( - self, - precision: u8, - scale: u8, - ) -> Result - where - Self: Sized, - { - // validate precision and scale - self.validate_precision_scale(precision, scale)?; - - // Ensure that all values are within the requested - // precision. For performance, only check if the precision is - // decreased - if precision < self.precision { - self.validate_data(precision)?; - } - - // safety: self.data is valid DataType::Decimal as checked above - let new_data_type = Self::TYPE_CONSTRUCTOR(precision, scale); - let data = self.data().clone().into_builder().data_type(new_data_type); - - // SAFETY - // Validated data above - Ok(unsafe { data.build_unchecked().into() }) - } - - // validate that the new precision and scale are valid or not - fn validate_precision_scale( - &self, - precision: u8, - scale: u8, - ) -> Result<(), ArrowError> { - if precision > Self::MAX_PRECISION { - return Err(ArrowError::InvalidArgumentError(format!( - "precision {} is greater than max {}", - precision, - Self::MAX_PRECISION - ))); - } - if scale > Self::MAX_SCALE { - return Err(ArrowError::InvalidArgumentError(format!( - "scale {} is greater than max {}", - scale, - Self::MAX_SCALE - ))); - } - if scale > precision { - return Err(ArrowError::InvalidArgumentError(format!( - "scale {} is greater than precision {}", - scale, precision - ))); - } - let data_type = Self::TYPE_CONSTRUCTOR(self.precision, self.scale); - assert_eq!(self.data().data_type(), &data_type); - - Ok(()) - } - - // validate all the data in the array are valid within the new precision or not - fn validate_data(&self, precision: u8) -> Result<(), ArrowError> { - // TODO: Move into DecimalType - match Self::VALUE_LENGTH { - 16 => self - .as_any() - .downcast_ref::() - .unwrap() - .validate_decimal_precision(precision), - 32 => self - .as_any() - .downcast_ref::() - .unwrap() - .validate_decimal_precision(precision), - other_width => { - panic!("invalid byte width {}", other_width); - } - } - } -} - -impl Decimal128Array { - /// Creates a [Decimal128Array] with default precision and scale, - /// based on an iterator of `i128` values without nulls - pub fn from_iter_values>(iter: I) -> Self { - let val_buf: Buffer = iter.into_iter().collect(); - let data = unsafe { - ArrayData::new_unchecked( - Self::default_type(), - val_buf.len() / std::mem::size_of::(), - None, - None, - 0, - vec![val_buf], - vec![], - ) - }; - Decimal128Array::from(data) - } - - // Validates decimal128 values in this array can be properly interpreted - // with the 
specified precision. - fn validate_decimal_precision(&self, precision: u8) -> Result<(), ArrowError> { - (0..self.len()).try_for_each(|idx| { - if self.is_valid(idx) { - let decimal = unsafe { self.value_unchecked(idx) }; - validate_decimal_precision(decimal.as_i128(), precision) - } else { - Ok(()) - } - }) - } -} - -impl Decimal256Array { - // Validates decimal256 values in this array can be properly interpreted - // with the specified precision. - fn validate_decimal_precision(&self, precision: u8) -> Result<(), ArrowError> { - (0..self.len()).try_for_each(|idx| { - if self.is_valid(idx) { - let raw_val = unsafe { - let pos = self.value_offset(idx); - std::slice::from_raw_parts( - self.raw_value_data_ptr().offset(pos as isize), - Self::VALUE_LENGTH as usize, - ) - }; - validate_decimal256_precision_with_lt_bytes(raw_val, precision) - } else { - Ok(()) - } - }) - } -} - -impl From for DecimalArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 1, - "DecimalArray data should contain 1 buffer only (values)" - ); - let values = data.buffers()[0].as_ptr(); - let (precision, scale) = match (data.data_type(), Self::DEFAULT_TYPE) { - (DataType::Decimal128(precision, scale), DataType::Decimal128(_, _)) - | (DataType::Decimal256(precision, scale), DataType::Decimal256(_, _)) => { - (*precision, *scale) - } - _ => panic!( - "Expected data type to match {} got {}", - Self::DEFAULT_TYPE, - data.data_type() - ), - }; - Self { - data, - // SAFETY: - // ArrayData must be valid, and verified data type above - value_data: unsafe { RawPtrBox::new(values) }, - precision, - scale, - _phantom: Default::default(), - } - } -} - -fn build_decimal_array_from( - null_buf: BooleanBufferBuilder, - buffer: Buffer, -) -> DecimalArray { - let data = unsafe { - ArrayData::new_unchecked( - DecimalArray::::default_type(), - null_buf.len(), - None, - Some(null_buf.into()), - 0, - vec![buffer], - vec![], - ) - }; - DecimalArray::from(data) -} - -impl> FromIterator> for Decimal256Array { - fn from_iter>>(iter: I) -> Self { - let iter = iter.into_iter(); - let (lower, upper) = iter.size_hint(); - let size_hint = upper.unwrap_or(lower); - - let mut null_buf = BooleanBufferBuilder::new(size_hint); - - let mut buffer = MutableBuffer::with_capacity(size_hint); - - iter.for_each(|item| { - if let Some(a) = item { - null_buf.append(true); - buffer.extend_from_slice(Into::into(a).raw_value()); - } else { - null_buf.append(false); - buffer.extend_zeros(32); - } - }); - - build_decimal_array_from(null_buf, buffer.into()) - } -} - -impl> FromIterator> for Decimal128Array { - fn from_iter>>(iter: I) -> Self { - let iter = iter.into_iter(); - let (lower, upper) = iter.size_hint(); - let size_hint = upper.unwrap_or(lower); - - let mut null_buf = BooleanBufferBuilder::new(size_hint); - - let buffer: Buffer = iter - .map(|item| { - if let Some(a) = item { - null_buf.append(true); - a.into() - } else { - null_buf.append(false); - // arbitrary value for NULL - 0 - } - }) - .collect(); - - build_decimal_array_from(null_buf, buffer) - } -} - -impl Array for DecimalArray { - fn as_any(&self) -> &dyn Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - fn into_data(self) -> ArrayData { - self.into() - } -} - -impl From> for ArrayData { - fn from(array: DecimalArray) -> Self { - array.data - } -} - -impl std::fmt::Debug for DecimalArray { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!( - f, - "Decimal{}Array<{}, {}>\n[\n", - T::BYTE_LENGTH * 8, - self.precision, - 
self.scale - )?; - print_long_array(self, f, |array, index, f| { - let formatted_decimal = array.value_as_string(index); - - write!(f, "{}", formatted_decimal) - })?; - write!(f, "]") - } -} - -impl<'a, T: DecimalType> ArrayAccessor for &'a DecimalArray { - type Item = Decimal; - - fn value(&self, index: usize) -> Self::Item { - DecimalArray::::value(self, index) - } - - unsafe fn value_unchecked(&self, index: usize) -> Self::Item { - DecimalArray::::value_unchecked(self, index) - } -} - -impl<'a, T: DecimalType> IntoIterator for &'a DecimalArray { - type Item = Option>; - type IntoIter = DecimalIter<'a, T>; - - fn into_iter(self) -> Self::IntoIter { - DecimalIter::<'a, T>::new(self) - } -} - -impl<'a, T: DecimalType> DecimalArray { - /// constructs a new iterator - pub fn iter(&'a self) -> DecimalIter<'a, T> { - DecimalIter::<'a, T>::new(self) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::builder::{Decimal128Builder, Decimal256Builder}; - use crate::decimal::Decimal128; - use arrow_data::decimal::{DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE}; - use arrow_schema::Field; - use num::{BigInt, Num}; - - #[test] - fn test_decimal_array() { - // let val_8887: [u8; 16] = [192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; - // let val_neg_8887: [u8; 16] = [64, 36, 75, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]; - let values: [u8; 32] = [ - 192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]; - let array_data = ArrayData::builder(DataType::Decimal128(38, 6)) - .len(2) - .add_buffer(Buffer::from(&values[..])) - .build() - .unwrap(); - let decimal_array = Decimal128Array::from(array_data); - assert_eq!(8_887_000_000_i128, decimal_array.value(0).into()); - assert_eq!(-8_887_000_000_i128, decimal_array.value(1).into()); - assert_eq!(16, decimal_array.value_length()); - } - - #[test] - #[cfg(not(feature = "force_validate"))] - fn test_decimal_append_error_value() { - let mut decimal_builder = Decimal128Builder::with_capacity(10, 5, 3); - let mut result = decimal_builder.append_value(123456); - let mut error = result.unwrap_err(); - assert_eq!( - "Invalid argument error: 123456 is too large to store in a Decimal128 of precision 5. Max is 99999", - error.to_string() - ); - - unsafe { - decimal_builder.disable_value_validation(); - } - result = decimal_builder.append_value(123456); - assert!(result.is_ok()); - decimal_builder.append_value(12345).unwrap(); - let arr = decimal_builder.finish(); - assert_eq!("12.345", arr.value_as_string(1)); - - decimal_builder = Decimal128Builder::new(2, 1); - result = decimal_builder.append_value(100); - error = result.unwrap_err(); - assert_eq!( - "Invalid argument error: 100 is too large to store in a Decimal128 of precision 2. 
Max is 99", - error.to_string() - ); - - unsafe { - decimal_builder.disable_value_validation(); - } - result = decimal_builder.append_value(100); - assert!(result.is_ok()); - decimal_builder.append_value(99).unwrap(); - result = decimal_builder.append_value(-100); - assert!(result.is_ok()); - decimal_builder.append_value(-99).unwrap(); - let arr = decimal_builder.finish(); - assert_eq!("9.9", arr.value_as_string(1)); - assert_eq!("-9.9", arr.value_as_string(3)); - } - - #[test] - fn test_decimal_from_iter_values() { - let array = Decimal128Array::from_iter_values(vec![-100, 0, 101].into_iter()); - assert_eq!(array.len(), 3); - assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); - assert_eq!(-100_i128, array.value(0).into()); - assert!(!array.is_null(0)); - assert_eq!(0_i128, array.value(1).into()); - assert!(!array.is_null(1)); - assert_eq!(101_i128, array.value(2).into()); - assert!(!array.is_null(2)); - } - - #[test] - fn test_decimal_from_iter() { - let array: Decimal128Array = - vec![Some(-100), None, Some(101)].into_iter().collect(); - assert_eq!(array.len(), 3); - assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); - assert_eq!(-100_i128, array.value(0).into()); - assert!(!array.is_null(0)); - assert!(array.is_null(1)); - assert_eq!(101_i128, array.value(2).into()); - assert!(!array.is_null(2)); - } - - #[test] - fn test_decimal_iter() { - let data = vec![Some(-100), None, Some(101)]; - let array: Decimal128Array = data.clone().into_iter().collect(); - - let collected: Vec<_> = array.iter().map(|d| d.map(|v| v.as_i128())).collect(); - assert_eq!(data, collected); - } - - #[test] - fn test_decimal_into_iter() { - let data = vec![Some(-100), None, Some(101)]; - let array: Decimal128Array = data.clone().into_iter().collect(); - - let collected: Vec<_> = - array.into_iter().map(|d| d.map(|v| v.as_i128())).collect(); - assert_eq!(data, collected); - } - - #[test] - fn test_decimal_iter_sized() { - let data = vec![Some(-100), None, Some(101)]; - let array: Decimal128Array = data.into_iter().collect(); - let mut iter = array.into_iter(); - - // is exact sized - assert_eq!(array.len(), 3); - - // size_hint is reported correctly - assert_eq!(iter.size_hint(), (3, Some(3))); - iter.next().unwrap(); - assert_eq!(iter.size_hint(), (2, Some(2))); - iter.next().unwrap(); - iter.next().unwrap(); - assert_eq!(iter.size_hint(), (0, Some(0))); - assert!(iter.next().is_none()); - assert_eq!(iter.size_hint(), (0, Some(0))); - } - - #[test] - fn test_decimal_array_value_as_string() { - let arr = [123450, -123450, 100, -100, 10, -10, 0] - .into_iter() - .map(Some) - .collect::() - .with_precision_and_scale(6, 3) - .unwrap(); - - assert_eq!("123.450", arr.value_as_string(0)); - assert_eq!("-123.450", arr.value_as_string(1)); - assert_eq!("0.100", arr.value_as_string(2)); - assert_eq!("-0.100", arr.value_as_string(3)); - assert_eq!("0.010", arr.value_as_string(4)); - assert_eq!("-0.010", arr.value_as_string(5)); - assert_eq!("0.000", arr.value_as_string(6)); - } - - #[test] - fn test_decimal_array_with_precision_and_scale() { - let arr = Decimal128Array::from_iter_values([12345, 456, 7890, -123223423432432]) - .with_precision_and_scale(20, 2) - .unwrap(); - - assert_eq!(arr.data_type(), &DataType::Decimal128(20, 2)); - assert_eq!(arr.precision(), 20); - assert_eq!(arr.scale(), 2); - - let actual: Vec<_> = (0..arr.len()).map(|i| arr.value_as_string(i)).collect(); - let expected = vec!["123.45", "4.56", "78.90", "-1232234234324.32"]; - - assert_eq!(actual, expected); - } - - #[test] - 
#[should_panic( - expected = "-123223423432432 is too small to store in a Decimal128 of precision 5. Min is -99999" - )] - fn test_decimal_array_with_precision_and_scale_out_of_range() { - Decimal128Array::from_iter_values([12345, 456, 7890, -123223423432432]) - // precision is too small to hold value - .with_precision_and_scale(5, 2) - .unwrap(); - } - - #[test] - #[should_panic(expected = "precision 40 is greater than max 38")] - fn test_decimal_array_with_precision_and_scale_invalid_precision() { - Decimal128Array::from_iter_values([12345, 456]) - .with_precision_and_scale(40, 2) - .unwrap(); - } - - #[test] - #[should_panic(expected = "scale 40 is greater than max 38")] - fn test_decimal_array_with_precision_and_scale_invalid_scale() { - Decimal128Array::from_iter_values([12345, 456]) - .with_precision_and_scale(20, 40) - .unwrap(); - } - - #[test] - #[should_panic(expected = "scale 10 is greater than precision 4")] - fn test_decimal_array_with_precision_and_scale_invalid_precision_and_scale() { - Decimal128Array::from_iter_values([12345, 456]) - .with_precision_and_scale(4, 10) - .unwrap(); - } - - #[test] - fn test_decimal_array_fmt_debug() { - let arr = [Some(8887000000_i128), Some(-8887000000_i128), None] - .into_iter() - .collect::() - .with_precision_and_scale(23, 6) - .unwrap(); - - assert_eq!( - "Decimal128Array<23, 6>\n[\n 8887.000000,\n -8887.000000,\n null,\n]", - format!("{:?}", arr) - ); - } - - #[test] - fn test_decimal_array_from_fixed_size_binary() { - let value_data = ArrayData::builder(DataType::FixedSizeBinary(16)) - .offset(1) - .len(3) - .add_buffer(Buffer::from_slice_ref(&[99999_i128, 2, 34, 560])) - .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010]))) - .build() - .unwrap(); - - let binary_array = FixedSizeBinaryArray::from(value_data); - let decimal = Decimal128Array::from_fixed_size_binary_array(binary_array, 38, 1); - - assert_eq!(decimal.len(), 3); - assert_eq!(decimal.value_as_string(0), "0.2".to_string()); - assert!(decimal.is_null(1)); - assert_eq!(decimal.value_as_string(2), "56.0".to_string()); - } - - #[test] - #[should_panic( - expected = "Value length of the array (8) must equal to the byte width of the decimal (16)" - )] - fn test_decimal_array_from_fixed_size_binary_wrong_length() { - let value_data = ArrayData::builder(DataType::FixedSizeBinary(8)) - .offset(1) - .len(3) - .add_buffer(Buffer::from_slice_ref(&[99999_i64, 2, 34, 560])) - .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010]))) - .build() - .unwrap(); - - let binary_array = FixedSizeBinaryArray::from(value_data); - let _ = Decimal128Array::from_fixed_size_binary_array(binary_array, 38, 1); - } - - #[test] - #[allow(deprecated)] - fn test_decimal_array_from_fixed_size_list() { - let value_data = ArrayData::builder(DataType::UInt8) - .offset(16) - .len(48) - .add_buffer(Buffer::from_slice_ref(&[99999_i128, 12, 34, 56])) - .build() - .unwrap(); - - let null_buffer = Buffer::from_slice_ref(&[0b101]); - - // Construct a list array from the above two - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt8, false)), - 16, - ); - let list_data = ArrayData::builder(list_data_type) - .len(2) - .null_bit_buffer(Some(null_buffer)) - .offset(1) - .add_child_data(value_data) - .build() - .unwrap(); - let list_array = FixedSizeListArray::from(list_data); - let decimal = Decimal128Array::from_fixed_size_list_array(list_array, 38, 0); - - assert_eq!(decimal.len(), 2); - assert!(decimal.is_null(0)); - assert_eq!(decimal.value_as_string(1), "56".to_string()); 
- } - - #[test] - #[allow(deprecated)] - #[should_panic(expected = "The child array cannot contain null values.")] - fn test_decimal_array_from_fixed_size_list_with_child_nulls_failed() { - let value_data = ArrayData::builder(DataType::UInt8) - .len(16) - .add_buffer(Buffer::from_slice_ref(&[12_i128])) - .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010101010]))) - .build() - .unwrap(); - - // Construct a list array from the above two - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt8, false)), - 16, - ); - let list_data = ArrayData::builder(list_data_type) - .len(1) - .add_child_data(value_data) - .build() - .unwrap(); - let list_array = FixedSizeListArray::from(list_data); - drop(Decimal128Array::from_fixed_size_list_array( - list_array, 38, 0, - )); - } - - #[test] - #[allow(deprecated)] - #[should_panic( - expected = "Value length of the array (8) must equal to the byte width of the decimal (16)" - )] - fn test_decimal_array_from_fixed_size_list_with_wrong_length() { - let value_data = ArrayData::builder(DataType::UInt8) - .len(16) - .add_buffer(Buffer::from_slice_ref(&[12_i128])) - .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010101010]))) - .build() - .unwrap(); - - // Construct a list array from the above two - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt8, false)), - 8, - ); - let list_data = ArrayData::builder(list_data_type) - .len(2) - .add_child_data(value_data) - .build() - .unwrap(); - let list_array = FixedSizeListArray::from(list_data); - drop(Decimal128Array::from_fixed_size_list_array( - list_array, 38, 0, - )); - } - - #[test] - fn test_decimal256_iter() { - let mut builder = Decimal256Builder::with_capacity(30, 76, 6); - let value = BigInt::from_str_radix("12345", 10).unwrap(); - let decimal1 = Decimal256::from_big_int(&value, 76, 6).unwrap(); - builder.append_value(&decimal1).unwrap(); - - builder.append_null(); - - let value = BigInt::from_str_radix("56789", 10).unwrap(); - let decimal2 = Decimal256::from_big_int(&value, 76, 6).unwrap(); - builder.append_value(&decimal2).unwrap(); - - let array: Decimal256Array = builder.finish(); - - let collected: Vec<_> = array.iter().collect(); - assert_eq!(vec![Some(decimal1), None, Some(decimal2)], collected); - } - - #[test] - fn test_from_iter_decimal256array() { - let value1 = BigInt::from_str_radix("12345", 10).unwrap(); - let value2 = BigInt::from_str_radix("56789", 10).unwrap(); - - let array: Decimal256Array = - vec![Some(value1.clone()), None, Some(value2.clone())] - .into_iter() - .collect(); - assert_eq!(array.len(), 3); - assert_eq!(array.data_type(), &DataType::Decimal256(76, 10)); - assert_eq!( - Decimal256::from_big_int( - &value1, - DECIMAL256_MAX_PRECISION, - DECIMAL_DEFAULT_SCALE, - ) - .unwrap(), - array.value(0) - ); - assert!(!array.is_null(0)); - assert!(array.is_null(1)); - assert_eq!( - Decimal256::from_big_int( - &value2, - DECIMAL256_MAX_PRECISION, - DECIMAL_DEFAULT_SCALE, - ) - .unwrap(), - array.value(2) - ); - assert!(!array.is_null(2)); - } - - #[test] - fn test_from_iter_decimal128array() { - let array: Decimal128Array = vec![ - Some(Decimal128::new_from_i128(38, 10, -100)), - None, - Some(Decimal128::new_from_i128(38, 10, 101)), - ] - .into_iter() - .collect(); - assert_eq!(array.len(), 3); - assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); - assert_eq!(-100_i128, array.value(0).into()); - assert!(!array.is_null(0)); - assert!(array.is_null(1)); - assert_eq!(101_i128, 
array.value(2).into()); - assert!(!array.is_null(2)); - } - - #[test] - #[should_panic( - expected = "Trying to access an element at index 4 from a DecimalArray of length 3" - )] - fn test_fixed_size_binary_array_get_value_index_out_of_bound() { - let array = Decimal128Array::from_iter_values(vec![-100, 0, 101].into_iter()); - - array.value(4); - } - - #[test] - #[should_panic( - expected = "Expected data type to match Decimal256(76, 10) got Decimal128(38, 10)" - )] - fn test_from_array_data_validation() { - let array = Decimal128Array::from_iter_values(vec![-100, 0, 101].into_iter()); - let _ = Decimal256Array::from(array.into_data()); - } -} diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index e2ea61549125..1613e4a69b86 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -31,9 +31,6 @@ pub use binary_array::*; mod boolean_array; pub use boolean_array::*; -mod decimal_array; -pub use decimal_array::*; - mod dictionary_array; pub use dictionary_array::*; @@ -449,12 +446,6 @@ impl PartialEq for FixedSizeBinaryArray { } } -impl PartialEq for Decimal128Array { - fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) - } -} - impl PartialEq for GenericListArray { fn eq(&self, other: &Self) -> bool { self.data().eq(other.data()) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 895c80b07530..377523267a44 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -16,6 +16,7 @@ // under the License. use crate::builder::{BooleanBufferBuilder, BufferBuilder, PrimitiveBuilder}; +use crate::decimal::Decimal; use crate::iterator::PrimitiveIter; use crate::raw_pointer::RawPtrBox; use crate::temporal_conversions::{as_date, as_datetime, as_duration, as_time}; @@ -25,7 +26,7 @@ use crate::{print_long_array, Array, ArrayAccessor}; use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::ArrayData; -use arrow_schema::DataType; +use arrow_schema::{ArrowError, DataType}; use chrono::{Duration, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime}; use half::f16; use std::any::Any; @@ -176,6 +177,9 @@ pub type DurationMillisecondArray = PrimitiveArray; pub type DurationMicrosecondArray = PrimitiveArray; pub type DurationNanosecondArray = PrimitiveArray; +pub type Decimal128Array = PrimitiveArray; +pub type Decimal256Array = PrimitiveArray; + /// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the /// static-typed nature of rust types ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. pub trait ArrowPrimitiveType: 'static { @@ -842,9 +846,146 @@ impl From for PrimitiveArray { } } +impl PrimitiveArray { + /// Returns a Decimal array with the same data as self, with the + /// specified precision. + /// + /// Returns an Error if: + /// 1. `precision` is larger than `T:MAX_PRECISION` + /// 2. `scale` is larger than `T::MAX_SCALE` + /// 3. 
`scale` is > `precision` + pub fn with_precision_and_scale( + self, + precision: u8, + scale: u8, + ) -> Result + where + Self: Sized, + { + // validate precision and scale + self.validate_precision_scale(precision, scale)?; + + // safety: self.data is valid DataType::Decimal as checked above + let new_data_type = T::TYPE_CONSTRUCTOR(precision, scale); + let data = self.data().clone().into_builder().data_type(new_data_type); + + // SAFETY + // Validated data above + Ok(unsafe { data.build_unchecked().into() }) + } + + // validate that the new precision and scale are valid or not + fn validate_precision_scale( + &self, + precision: u8, + scale: u8, + ) -> Result<(), ArrowError> { + if precision > T::MAX_PRECISION { + return Err(ArrowError::InvalidArgumentError(format!( + "precision {} is greater than max {}", + precision, + Decimal128Type::MAX_PRECISION + ))); + } + if scale > T::MAX_SCALE { + return Err(ArrowError::InvalidArgumentError(format!( + "scale {} is greater than max {}", + scale, + Decimal128Type::MAX_SCALE + ))); + } + if scale > precision { + return Err(ArrowError::InvalidArgumentError(format!( + "scale {} is greater than precision {}", + scale, precision + ))); + } + + Ok(()) + } + + /// Validates values in this array can be properly interpreted + /// with the specified precision. + pub fn validate_decimal_precision(&self, precision: u8) -> Result<(), ArrowError> { + (0..self.len()).try_for_each(|idx| { + if self.is_valid(idx) { + let decimal = unsafe { self.value_unchecked(idx) }; + T::validate_decimal_precision(decimal, precision) + } else { + Ok(()) + } + }) + } + + pub fn value_as_string(&self, row: usize) -> Result { + let p = self.precision()?; + let s = self.scale()?; + Ok(Decimal::::new(p, s, &T::to_native(self.value(row))).to_string()) + } + + pub fn precision(&self) -> Result { + match T::BYTE_LENGTH { + 16 => { + if let DataType::Decimal128(p, _) = self.data().data_type() { + Ok(*p) + } else { + Err(ArrowError::InvalidArgumentError(format!( + "Decimal128Array datatype is not DataType::Decimal128 but {}", + self.data_type() + ))) + } + } + 32 => { + if let DataType::Decimal256(p, _) = self.data().data_type() { + Ok(*p) + } else { + Err(ArrowError::InvalidArgumentError(format!( + "Decimal256Array datatype is not DataType::Decimal256 but {}", + self.data_type() + ))) + } + } + other => Err(ArrowError::InvalidArgumentError(format!( + "Unsupported byte length for decimal array {}", + other + ))), + } + } + + pub fn scale(&self) -> Result { + match T::BYTE_LENGTH { + 16 => { + if let DataType::Decimal128(_, s) = self.data().data_type() { + Ok(*s) + } else { + Err(ArrowError::InvalidArgumentError(format!( + "Decimal128Array datatype is not DataType::Decimal128 but {}", + self.data_type() + ))) + } + } + 32 => { + if let DataType::Decimal256(_, s) = self.data().data_type() { + Ok(*s) + } else { + Err(ArrowError::InvalidArgumentError(format!( + "Decimal256Array datatype is not DataType::Decimal256 but {}", + self.data_type() + ))) + } + } + other => Err(ArrowError::InvalidArgumentError(format!( + "Unsupported byte length for decimal array {}", + other + ))), + } + } +} + #[cfg(test)] mod tests { use super::*; + use crate::builder::{Decimal128Builder, Decimal256Builder}; use crate::BooleanArray; #[test] @@ -1408,4 +1549,239 @@ mod tests { let array = PrimitiveArray::::from(array.data().clone()); assert_eq!(array.values(), &values); } + + #[test] + fn test_decimal_array() { + // let val_8887: [u8; 16] = [192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + // let 
val_neg_8887: [u8; 16] = [64, 36, 75, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]; + let values: [u8; 32] = [ + 192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ]; + let array_data = ArrayData::builder(DataType::Decimal128(38, 6)) + .len(2) + .add_buffer(Buffer::from(&values[..])) + .build() + .unwrap(); + let decimal_array = Decimal128Array::from(array_data); + assert_eq!(8_887_000_000_i128, decimal_array.value(0).into()); + assert_eq!(-8_887_000_000_i128, decimal_array.value(1).into()); + } + + #[test] + fn test_decimal_append_error_value() { + let mut decimal_builder = Decimal128Builder::with_capacity(10); + decimal_builder.append_value(123456); + decimal_builder.append_value(12345); + let result = decimal_builder.finish().with_precision_and_scale(5, 3); + assert!(result.is_ok()); + let arr = result.unwrap(); + assert_eq!("12.345", arr.value_as_string(1).unwrap()); + + // Validate it explicitly + let result = arr.validate_decimal_precision(5); + let error = result.unwrap_err(); + assert_eq!( + "Invalid argument error: 123456 is too large to store in a Decimal128 of precision 5. Max is 99999", + error.to_string() + ); + + decimal_builder = Decimal128Builder::new(); + decimal_builder.append_value(100); + decimal_builder.append_value(99); + decimal_builder.append_value(-100); + decimal_builder.append_value(-99); + let result = decimal_builder.finish().with_precision_and_scale(2, 1); + assert!(result.is_ok()); + let arr = result.unwrap(); + assert_eq!("9.9", arr.value_as_string(1).unwrap()); + assert_eq!("-9.9", arr.value_as_string(3).unwrap()); + + // Validate it explicitly + let result = arr.validate_decimal_precision(2); + let error = result.unwrap_err(); + assert_eq!( + "Invalid argument error: 100 is too large to store in a Decimal128 of precision 2. 
Max is 99", + error.to_string() + ); + } + + #[test] + fn test_decimal_from_iter_values() { + let array = Decimal128Array::from_iter_values(vec![-100, 0, 101].into_iter()); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); + assert_eq!(-100_i128, array.value(0).into()); + assert!(!array.is_null(0)); + assert_eq!(0_i128, array.value(1).into()); + assert!(!array.is_null(1)); + assert_eq!(101_i128, array.value(2).into()); + assert!(!array.is_null(2)); + } + + #[test] + fn test_decimal_from_iter() { + let array: Decimal128Array = + vec![Some(-100), None, Some(101)].into_iter().collect(); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); + assert_eq!(-100_i128, array.value(0).into()); + assert!(!array.is_null(0)); + assert!(array.is_null(1)); + assert_eq!(101_i128, array.value(2).into()); + assert!(!array.is_null(2)); + } + + #[test] + fn test_decimal_iter_sized() { + let data = vec![Some(-100), None, Some(101)]; + let array: Decimal128Array = data.into_iter().collect(); + let mut iter = array.into_iter(); + + // is exact sized + assert_eq!(array.len(), 3); + + // size_hint is reported correctly + assert_eq!(iter.size_hint(), (3, Some(3))); + iter.next().unwrap(); + assert_eq!(iter.size_hint(), (2, Some(2))); + iter.next().unwrap(); + iter.next().unwrap(); + assert_eq!(iter.size_hint(), (0, Some(0))); + assert!(iter.next().is_none()); + assert_eq!(iter.size_hint(), (0, Some(0))); + } + + #[test] + fn test_decimal_array_value_as_string() { + let arr = [123450, -123450, 100, -100, 10, -10, 0] + .into_iter() + .map(Some) + .collect::() + .with_precision_and_scale(6, 3) + .unwrap(); + + assert_eq!("123.450", arr.value_as_string(0).unwrap()); + assert_eq!("-123.450", arr.value_as_string(1).unwrap()); + assert_eq!("0.100", arr.value_as_string(2).unwrap()); + assert_eq!("-0.100", arr.value_as_string(3).unwrap()); + assert_eq!("0.010", arr.value_as_string(4).unwrap()); + assert_eq!("-0.010", arr.value_as_string(5).unwrap()); + assert_eq!("0.000", arr.value_as_string(6).unwrap()); + } + + #[test] + fn test_decimal_array_with_precision_and_scale() { + let arr = Decimal128Array::from_iter_values([12345, 456, 7890, -123223423432432]) + .with_precision_and_scale(20, 2) + .unwrap(); + + assert_eq!(arr.data_type(), &DataType::Decimal128(20, 2)); + assert_eq!(arr.precision().unwrap(), 20); + assert_eq!(arr.scale().unwrap(), 2); + + let actual: Vec<_> = (0..arr.len()) + .map(|i| arr.value_as_string(i).unwrap()) + .collect(); + let expected = vec!["123.45", "4.56", "78.90", "-1232234234324.32"]; + + assert_eq!(actual, expected); + } + + #[test] + #[should_panic( + expected = "-123223423432432 is too small to store in a Decimal128 of precision 5. 
Min is -99999" + )] + fn test_decimal_array_with_precision_and_scale_out_of_range() { + let arr = Decimal128Array::from_iter_values([12345, 456, 7890, -123223423432432]) + // precision is too small to hold value + .with_precision_and_scale(5, 2) + .unwrap(); + arr.validate_decimal_precision(5).unwrap(); + } + + #[test] + #[should_panic(expected = "precision 40 is greater than max 38")] + fn test_decimal_array_with_precision_and_scale_invalid_precision() { + Decimal128Array::from_iter_values([12345, 456]) + .with_precision_and_scale(40, 2) + .unwrap(); + } + + #[test] + #[should_panic(expected = "scale 40 is greater than max 38")] + fn test_decimal_array_with_precision_and_scale_invalid_scale() { + Decimal128Array::from_iter_values([12345, 456]) + .with_precision_and_scale(20, 40) + .unwrap(); + } + + #[test] + #[should_panic(expected = "scale 10 is greater than precision 4")] + fn test_decimal_array_with_precision_and_scale_invalid_precision_and_scale() { + Decimal128Array::from_iter_values([12345, 456]) + .with_precision_and_scale(4, 10) + .unwrap(); + } + + #[test] + fn test_decimal256_iter() { + let mut builder = Decimal256Builder::with_capacity(30); + let decimal1 = i256::from_i128(12345); + builder.append_value(decimal1); + + builder.append_null(); + + let decimal2 = i256::from_i128(56789); + builder.append_value(decimal2); + + let array: Decimal256Array = + builder.finish().with_precision_and_scale(76, 6).unwrap(); + + let collected: Vec<_> = array.iter().collect(); + assert_eq!(vec![Some(decimal1), None, Some(decimal2)], collected); + } + + #[test] + fn test_from_iter_decimal256array() { + let value1 = i256::from_i128(12345); + let value2 = i256::from_i128(56789); + + let mut array: Decimal256Array = + vec![Some(value1.clone()), None, Some(value2.clone())] + .into_iter() + .collect(); + array = array.with_precision_and_scale(76, 10).unwrap(); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::Decimal256(76, 10)); + assert_eq!(value1, array.value(0)); + assert!(!array.is_null(0)); + assert!(array.is_null(1)); + assert_eq!(value2, array.value(2)); + assert!(!array.is_null(2)); + } + + #[test] + fn test_from_iter_decimal128array() { + let mut array: Decimal128Array = + vec![Some(-100), None, Some(101)].into_iter().collect(); + array = array.with_precision_and_scale(38, 10).unwrap(); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); + assert_eq!(-100_i128, array.value(0).into()); + assert!(!array.is_null(0)); + assert!(array.is_null(1)); + assert_eq!(101_i128, array.value(2).into()); + assert!(!array.is_null(2)); + } + + #[test] + #[should_panic( + expected = "Trying to access an element at index 4 from a PrimitiveArray of length 3" + )] + fn test_fixed_size_binary_array_get_value_index_out_of_bound() { + let array = Decimal128Array::from_iter_values(vec![-100, 0, 101].into_iter()); + + array.value(4); + } } diff --git a/arrow-array/src/builder/decimal_builder.rs b/arrow-array/src/builder/decimal_builder.rs deleted file mode 100644 index 096cbec3a6c8..000000000000 --- a/arrow-array/src/builder/decimal_builder.rs +++ /dev/null @@ -1,382 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::builder::{ArrayBuilder, FixedSizeBinaryBuilder}; -use crate::decimal::Decimal256; -use crate::{ArrayRef, Decimal128Array, Decimal256Array}; -use arrow_data::decimal::{ - validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, -}; -use arrow_schema::ArrowError; -use std::any::Any; -use std::sync::Arc; - -/// Array Builder for [`Decimal128Array`] -/// -/// See [`Decimal128Array`] for example. -/// -#[derive(Debug)] -pub struct Decimal128Builder { - builder: FixedSizeBinaryBuilder, - precision: u8, - scale: u8, - - /// Should i128 values be validated for compatibility with scale and precision? - /// defaults to true - value_validation: bool, -} - -/// Array Builder for [`Decimal256Array`] -/// -/// See [`Decimal256Array`] for example. -#[derive(Debug)] -pub struct Decimal256Builder { - builder: FixedSizeBinaryBuilder, - precision: u8, - scale: u8, - - /// Should decimal values be validated for compatibility with scale and precision? - /// defaults to true - value_validation: bool, -} - -impl Decimal128Builder { - const BYTE_LENGTH: i32 = 16; - - /// Creates a new [`Decimal128Builder`] - pub fn new(precision: u8, scale: u8) -> Self { - Self::with_capacity(1024, precision, scale) - } - - /// Creates a new [`Decimal128Builder`], `capacity` is the number of decimal values - /// that can be appended without reallocating - pub fn with_capacity(capacity: usize, precision: u8, scale: u8) -> Self { - Self { - builder: FixedSizeBinaryBuilder::with_capacity(capacity, Self::BYTE_LENGTH), - precision, - scale, - value_validation: true, - } - } - - /// Disable validation - /// - /// # Safety - /// - /// After disabling validation, caller must ensure that appended values are compatible - /// for the specified precision and scale. - pub unsafe fn disable_value_validation(&mut self) { - self.value_validation = false; - } - - /// Appends a decimal value into the builder. - #[inline] - pub fn append_value(&mut self, value: impl Into) -> Result<(), ArrowError> { - let value = value.into(); - if self.value_validation { - validate_decimal_precision(value, self.precision)? - } - let value_as_bytes: [u8; 16] = value.to_le_bytes(); - self.builder.append_value(value_as_bytes.as_slice()) - } - - /// Append a null value to the array. - #[inline] - pub fn append_null(&mut self) { - self.builder.append_null() - } - - /// Appends an `Option>` into the builder. - #[inline] - pub fn append_option( - &mut self, - value: Option>, - ) -> Result<(), ArrowError> { - match value { - None => { - self.append_null(); - Ok(()) - } - Some(value) => self.append_value(value), - } - } - - /// Builds the `Decimal128Array` and reset this builder. - pub fn finish(&mut self) -> Decimal128Array { - Decimal128Array::from_fixed_size_binary_array( - self.builder.finish(), - self.precision, - self.scale, - ) - } -} - -impl ArrayBuilder for Decimal128Builder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. 
- fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl Decimal256Builder { - const BYTE_LENGTH: i32 = 32; - - /// Creates a new [`Decimal256Builder`] - pub fn new(precision: u8, scale: u8) -> Self { - Self::with_capacity(1024, precision, scale) - } - - /// Creates a new [`Decimal256Builder`], `capacity` is the number of decimal values - /// that can be appended without reallocating - pub fn with_capacity(capacity: usize, precision: u8, scale: u8) -> Self { - Self { - builder: FixedSizeBinaryBuilder::with_capacity(capacity, Self::BYTE_LENGTH), - precision, - scale, - value_validation: true, - } - } - - /// Disable validation - /// - /// # Safety - /// - /// After disabling validation, caller must ensure that appended values are compatible - /// for the specified precision and scale. - pub unsafe fn disable_value_validation(&mut self) { - self.value_validation = false; - } - - /// Appends a [`Decimal256`] number into the builder. - /// - /// Returns an error if `value` has different precision, scale or length in bytes than this builder - #[inline] - pub fn append_value(&mut self, value: &Decimal256) -> Result<(), ArrowError> { - let value = if self.value_validation { - let raw_bytes = value.raw_value(); - validate_decimal256_precision_with_lt_bytes(raw_bytes, self.precision)?; - value - } else { - value - }; - - if self.precision != value.precision() || self.scale != value.scale() { - return Err(ArrowError::InvalidArgumentError( - "Decimal value does not have the same precision or scale as Decimal256Builder".to_string() - )); - } - - let value_as_bytes = value.raw_value(); - - if Self::BYTE_LENGTH != value_as_bytes.len() as i32 { - return Err(ArrowError::InvalidArgumentError( - "Byte slice does not have the same length as Decimal256Builder value lengths".to_string() - )); - } - self.builder.append_value(value_as_bytes) - } - - /// Append a null value to the array. - #[inline] - pub fn append_null(&mut self) { - self.builder.append_null() - } - - /// Appends an `Option<&Decimal256>` into the builder. - #[inline] - pub fn append_option( - &mut self, - value: Option<&Decimal256>, - ) -> Result<(), ArrowError> { - match value { - None => { - self.append_null(); - Ok(()) - } - Some(value) => self.append_value(value), - } - } - - /// Builds the [`Decimal256Array`] and reset this builder. 
- pub fn finish(&mut self) -> Decimal256Array { - Decimal256Array::from_fixed_size_binary_array( - self.builder.finish(), - self.precision, - self.scale, - ) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::decimal::Decimal128; - use crate::Array; - use arrow_schema::DataType; - use num::{BigInt, Num}; - - #[test] - fn test_decimal_builder() { - let mut builder = Decimal128Builder::new(38, 6); - - builder.append_value(8_887_000_000_i128).unwrap(); - builder.append_null(); - builder.append_value(-8_887_000_000_i128).unwrap(); - builder.append_option(None::).unwrap(); - builder.append_option(Some(8_887_000_000_i128)).unwrap(); - let decimal_array: Decimal128Array = builder.finish(); - - assert_eq!(&DataType::Decimal128(38, 6), decimal_array.data_type()); - assert_eq!(5, decimal_array.len()); - assert_eq!(2, decimal_array.null_count()); - assert_eq!(32, decimal_array.value_offset(2)); - assert_eq!(16, decimal_array.value_length()); - } - - #[test] - fn test_decimal_builder_with_decimal128() { - let mut builder = Decimal128Builder::new(38, 6); - - builder - .append_value(Decimal128::new_from_i128(30, 38, 8_887_000_000_i128)) - .unwrap(); - builder.append_null(); - builder - .append_value(Decimal128::new_from_i128(30, 38, -8_887_000_000_i128)) - .unwrap(); - let decimal_array: Decimal128Array = builder.finish(); - - assert_eq!(&DataType::Decimal128(38, 6), decimal_array.data_type()); - assert_eq!(3, decimal_array.len()); - assert_eq!(1, decimal_array.null_count()); - assert_eq!(32, decimal_array.value_offset(2)); - assert_eq!(16, decimal_array.value_length()); - } - - #[test] - fn test_decimal256_builder() { - let mut builder = Decimal256Builder::new(40, 6); - - let mut bytes = [0_u8; 32]; - bytes[0..16].clone_from_slice(&8_887_000_000_i128.to_le_bytes()); - let value = Decimal256::try_new_from_bytes(40, 6, &bytes).unwrap(); - builder.append_value(&value).unwrap(); - - builder.append_null(); - - bytes = [255; 32]; - let value = Decimal256::try_new_from_bytes(40, 6, &bytes).unwrap(); - builder.append_value(&value).unwrap(); - - bytes = [0; 32]; - bytes[0..16].clone_from_slice(&0_i128.to_le_bytes()); - bytes[15] = 128; - let value = Decimal256::try_new_from_bytes(40, 6, &bytes).unwrap(); - builder.append_value(&value).unwrap(); - - builder.append_option(None::<&Decimal256>).unwrap(); - builder.append_option(Some(&value)).unwrap(); - - let decimal_array: Decimal256Array = builder.finish(); - - assert_eq!(&DataType::Decimal256(40, 6), decimal_array.data_type()); - assert_eq!(6, decimal_array.len()); - assert_eq!(2, decimal_array.null_count()); - assert_eq!(64, decimal_array.value_offset(2)); - assert_eq!(32, decimal_array.value_length()); - - assert_eq!(decimal_array.value(0).to_string(), "8887.000000"); - assert!(decimal_array.is_null(1)); - assert_eq!(decimal_array.value(2).to_string(), "-0.000001"); - assert_eq!( - decimal_array.value(3).to_string(), - "170141183460469231731687303715884.105728" - ); - } - - #[test] - #[should_panic( - expected = "Decimal value does not have the same precision or scale as Decimal256Builder" - )] - fn test_decimal256_builder_unmatched_precision_scale() { - let mut builder = Decimal256Builder::with_capacity(30, 10, 6); - - let mut bytes = [0_u8; 32]; - bytes[0..16].clone_from_slice(&8_887_000_000_i128.to_le_bytes()); - let value = Decimal256::try_new_from_bytes(40, 6, &bytes).unwrap(); - builder.append_value(&value).unwrap(); - } - - #[test] - #[should_panic( - expected = "9999999999999999999999999999999999999999999999999999999999999999999999999999 
is too large to store in a Decimal256 of precision 75. Max is 999999999999999999999999999999999999999999999999999999999999999999999999999" - )] - fn test_decimal256_builder_out_of_range_precision_scale() { - let mut builder = Decimal256Builder::new(75, 6); - - let big_value = BigInt::from_str_radix("9999999999999999999999999999999999999999999999999999999999999999999999999999", 10).unwrap(); - let value = Decimal256::from_big_int(&big_value, 75, 6).unwrap(); - builder.append_value(&value).unwrap(); - } - - #[test] - #[should_panic( - expected = "9999999999999999999999999999999999999999999999999999999999999999999999999999 is too large to store in a Decimal256 of precision 75. Max is 999999999999999999999999999999999999999999999999999999999999999999999999999" - )] - fn test_decimal256_data_validation() { - let mut builder = Decimal256Builder::new(75, 6); - // Disable validation at builder - unsafe { - builder.disable_value_validation(); - } - - let big_value = BigInt::from_str_radix("9999999999999999999999999999999999999999999999999999999999999999999999999999", 10).unwrap(); - let value = Decimal256::from_big_int(&big_value, 75, 6).unwrap(); - builder - .append_value(&value) - .expect("should not validate invalid value at builder"); - - let array = builder.finish(); - let array_data = array.data(); - array_data.validate_values().unwrap(); - } -} diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index cd4a82890a2e..5edf011d7bf6 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -24,8 +24,6 @@ mod boolean_builder; pub use boolean_builder::*; mod buffer_builder; pub use buffer_builder::*; -mod decimal_builder; -pub use decimal_builder::*; mod fixed_size_binary_builder; pub use fixed_size_binary_builder::*; mod fixed_size_list_builder; diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index c5b8c9557072..ed3594c60df9 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -52,6 +52,9 @@ pub type DurationMillisecondBuilder = PrimitiveBuilder; pub type DurationMicrosecondBuilder = PrimitiveBuilder; pub type DurationNanosecondBuilder = PrimitiveBuilder; +pub type Decimal128Builder = PrimitiveBuilder; +pub type Decimal256Builder = PrimitiveBuilder; + /// Array builder for fixed-width primitive types #[derive(Debug)] pub struct PrimitiveBuilder { diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 619931403946..69c092c0368d 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -109,9 +109,9 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { Box::new(FixedSizeBinaryBuilder::with_capacity(capacity, *len)) } - DataType::Decimal128(precision, scale) => Box::new( - Decimal128Builder::with_capacity(capacity, *precision, *scale), - ), + DataType::Decimal128(_precision, _scale) => { + Box::new(Decimal128Builder::with_capacity(capacity)) + } DataType::Utf8 => Box::new(StringBuilder::with_capacity(capacity, 1024)), DataType::Date32 => Box::new(Date32Builder::with_capacity(capacity)), DataType::Date64 => Box::new(Date64Builder::with_capacity(capacity)), diff --git a/arrow-array/src/decimal.rs b/arrow-array/src/decimal.rs index 323281d9233c..343053330640 100644 --- a/arrow-array/src/decimal.rs +++ b/arrow-array/src/decimal.rs @@ -18,6 +18,7 @@ //! 
Decimal related utilities, types and functions use crate::types::{Decimal128Type, Decimal256Type, DecimalType}; +use arrow_buffer::i256; use arrow_data::decimal::{DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE}; use arrow_schema::{ArrowError, DataType}; use num::{BigInt, Signed}; @@ -33,7 +34,7 @@ use std::cmp::{min, Ordering}; pub struct Decimal { precision: u8, scale: u8, - value: T::Native, + value: T::DecimalNative, } /// Manually implement to avoid `T: Debug` bound @@ -76,7 +77,7 @@ impl Decimal { pub fn try_new_from_bytes( precision: u8, scale: u8, - bytes: &T::Native, + bytes: &T::DecimalNative, ) -> Result where Self: Sized, @@ -111,15 +112,16 @@ impl Decimal { /// Safety: /// This method doesn't check if the precision and scale are valid. /// Use `try_new_from_bytes` for safe constructor. - pub fn new(precision: u8, scale: u8, bytes: &T::Native) -> Self { + pub fn new(precision: u8, scale: u8, bytes: &T::DecimalNative) -> Self { Self { precision, scale, value: *bytes, } } + /// Returns the raw bytes of the integer representation of the decimal. - pub fn raw_value(&self) -> &T::Native { + pub fn raw_value(&self) -> &T::DecimalNative { &self.value } @@ -245,6 +247,10 @@ impl Decimal256 { Decimal256::try_new_from_bytes(precision, scale, &bytes) } + pub fn from_i256(precision: u8, scale: u8, value: i256) -> Self { + Decimal256::new(precision, scale, &value.to_le_bytes()) + } + /// Constructs a `BigInt` from this `Decimal256` value. pub fn to_big_int(self) -> BigInt { BigInt::from_signed_bytes_le(&self.value) diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index 25727e0d75fb..351f90bacfc6 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -18,10 +18,9 @@ //! Idiomatic iterators for [`Array`](crate::Array) use crate::array::{ - ArrayAccessor, BooleanArray, DecimalArray, FixedSizeBinaryArray, GenericBinaryArray, + ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray, GenericStringArray, PrimitiveArray, }; -use crate::types::{Decimal128Type, Decimal256Type}; /// An iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`] /// @@ -123,15 +122,6 @@ pub type GenericBinaryIter<'a, T> = ArrayIter<&'a GenericBinaryArray>; pub type FixedSizeBinaryIter<'a> = ArrayIter<&'a FixedSizeBinaryArray>; pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray>; -pub type DecimalIter<'a, T> = ArrayIter<&'a DecimalArray>; -/// an iterator that returns `Some(Decimal128)` or `None`, that can be used on a -/// [`super::Decimal128Array`] -pub type Decimal128Iter<'a> = DecimalIter<'a, Decimal128Type>; - -/// an iterator that returns `Some(Decimal256)` or `None`, that can be used on a -/// [`super::Decimal256Array`] -pub type Decimal256Iter<'a> = DecimalIter<'a, Decimal256Type>; - #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 9bd433692580..2e161813dbc9 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -21,10 +21,11 @@ use crate::array::ArrowPrimitiveType; use crate::delta::shift_months; use arrow_buffer::i256; use arrow_data::decimal::{ + validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, }; -use arrow_schema::{DataType, IntervalUnit, TimeUnit}; +use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; use chrono::{Duration, NaiveDate}; use half::f16; use 
std::ops::{Add, Sub}; @@ -491,14 +492,23 @@ impl NativeDecimalType for [u8; N] { /// [`DecimalArray`]: [crate::array::DecimalArray] /// [`Decimal128Array`]: [crate::array::Decimal128Array] /// [`Decimal256Array`]: [crate::array::Decimal256Array] -pub trait DecimalType: 'static + Send + Sync + private::DecimalTypeSealed { - type Native: NativeDecimalType; +pub trait DecimalType: + 'static + Send + Sync + ArrowPrimitiveType + private::DecimalTypeSealed +{ + type DecimalNative: NativeDecimalType; const BYTE_LENGTH: usize; const MAX_PRECISION: u8; const MAX_SCALE: u8; const TYPE_CONSTRUCTOR: fn(u8, u8) -> DataType; const DEFAULT_TYPE: DataType; + + fn to_native(num: ::Native) -> Self::DecimalNative; + + fn validate_decimal_precision( + num: ::Native, + precision: u8, + ) -> Result<(), ArrowError>; } /// The decimal type for a Decimal128Array @@ -506,7 +516,7 @@ pub trait DecimalType: 'static + Send + Sync + private::DecimalTypeSealed { pub struct Decimal128Type {} impl DecimalType for Decimal128Type { - type Native = [u8; 16]; + type DecimalNative = [u8; 16]; const BYTE_LENGTH: usize = 16; const MAX_PRECISION: u8 = DECIMAL128_MAX_PRECISION; @@ -514,6 +524,14 @@ impl DecimalType for Decimal128Type { const TYPE_CONSTRUCTOR: fn(u8, u8) -> DataType = DataType::Decimal128; const DEFAULT_TYPE: DataType = DataType::Decimal128(DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE); + + fn to_native(num: i128) -> [u8; 16] { + num.to_le_bytes() + } + + fn validate_decimal_precision(num: i128, precision: u8) -> Result<(), ArrowError> { + validate_decimal_precision(num, precision) + } } impl ArrowPrimitiveType for Decimal128Type { @@ -527,7 +545,7 @@ impl ArrowPrimitiveType for Decimal128Type { pub struct Decimal256Type {} impl DecimalType for Decimal256Type { - type Native = [u8; 32]; + type DecimalNative = [u8; 32]; const BYTE_LENGTH: usize = 32; const MAX_PRECISION: u8 = DECIMAL256_MAX_PRECISION; @@ -535,6 +553,14 @@ impl DecimalType for Decimal256Type { const TYPE_CONSTRUCTOR: fn(u8, u8) -> DataType = DataType::Decimal256; const DEFAULT_TYPE: DataType = DataType::Decimal256(DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE); + + fn to_native(num: i256) -> [u8; 32] { + num.to_le_bytes() + } + + fn validate_decimal_precision(num: i256, precision: u8) -> Result<(), ArrowError> { + validate_decimal256_precision_with_lt_bytes(&num.to_le_bytes(), precision) + } } impl ArrowPrimitiveType for Decimal256Type { diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index df4cfd8ea594..7873064b45fb 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -86,6 +86,16 @@ impl i256 { } } + pub fn from_i128(v: i128) -> Self { + let mut bytes = if num::Signed::is_negative(&v) { + [255_u8; 32] + } else { + [0; 32] + }; + bytes[0..16].copy_from_slice(&v.to_le_bytes()); + Self::from_le_bytes(bytes) + } + /// Create an i256 from the provided low u128 and high i128 #[inline] pub fn from_parts(low: u128, high: i128) -> Self { diff --git a/arrow/benches/array_from_vec.rs b/arrow/benches/array_from_vec.rs index 59bef65a18c6..229ac0b87d41 100644 --- a/arrow/benches/array_from_vec.rs +++ b/arrow/benches/array_from_vec.rs @@ -23,8 +23,7 @@ use criterion::Criterion; extern crate arrow; use arrow::array::*; -use arrow::util::decimal::Decimal256; -use num::BigInt; +use arrow_buffer::i256; use rand::Rng; use std::{convert::TryFrom, sync::Arc}; @@ -87,7 +86,7 @@ fn decimal128_array_from_vec(array: &[Option]) { ); } -fn decimal256_array_from_vec(array: &[Option]) { +fn decimal256_array_from_vec(array: 
&[Option]) { criterion::black_box( array .iter() @@ -117,8 +116,7 @@ fn decimal_benchmark(c: &mut Criterion) { let mut array = vec![]; let mut rng = rand::thread_rng(); for _ in 0..size { - let decimal = - Decimal256::from(BigInt::from(rng.gen_range::(0..9999999999999))); + let decimal = i256::from_i128(rng.gen_range::(0..9999999999999)); array.push(Some(decimal)); } diff --git a/arrow/benches/builder.rs b/arrow/benches/builder.rs index c2ebcb3daa50..8cb226e89056 100644 --- a/arrow/benches/builder.rs +++ b/arrow/benches/builder.rs @@ -22,12 +22,11 @@ extern crate rand; use std::mem::size_of; use criterion::*; -use num::BigInt; use rand::distributions::Standard; use arrow::array::*; -use arrow::util::decimal::Decimal256; use arrow::util::test_util::seedable_rng; +use arrow_buffer::i256; use rand::Rng; // Build arrays with 512k elements. @@ -112,13 +111,16 @@ fn bench_decimal128(c: &mut Criterion) { c.bench_function("bench_decimal128_builder", |b| { b.iter(|| { let mut rng = rand::thread_rng(); - let mut decimal_builder = Decimal128Builder::with_capacity(BATCH_SIZE, 38, 0); + let mut decimal_builder = Decimal128Builder::with_capacity(BATCH_SIZE); for _ in 0..BATCH_SIZE { - decimal_builder - .append_value(rng.gen_range::(0..9999999999)) - .unwrap(); + decimal_builder.append_value(rng.gen_range::(0..9999999999)); } - black_box(decimal_builder.finish()); + black_box( + decimal_builder + .finish() + .with_precision_and_scale(38, 0) + .unwrap(), + ); }) }); } @@ -127,16 +129,18 @@ fn bench_decimal256(c: &mut Criterion) { c.bench_function("bench_decimal128_builder", |b| { b.iter(|| { let mut rng = rand::thread_rng(); - let mut decimal_builder = - Decimal256Builder::with_capacity(BATCH_SIZE, 76, 10); + let mut decimal_builder = Decimal256Builder::with_capacity(BATCH_SIZE); for _ in 0..BATCH_SIZE { - decimal_builder - .append_value(&Decimal256::from(BigInt::from( - rng.gen_range::(0..99999999999), - ))) - .unwrap() + decimal_builder.append_value(i256::from_i128( + rng.gen_range::(0..99999999999), + )); } - black_box(decimal_builder.finish()); + black_box( + decimal_builder + .finish() + .with_precision_and_scale(76, 10) + .unwrap(), + ); }) }); } diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index ac8fc08d9210..2c3d8cd1678a 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -29,8 +29,8 @@ use arrow::array::*; use arrow::compute::cast; use arrow::datatypes::*; use arrow::util::bench_util::*; -use arrow::util::decimal::Decimal256; use arrow::util::test_util::seedable_rng; +use arrow_buffer::i256; fn build_array(size: usize) -> ArrayRef where @@ -84,24 +84,34 @@ fn build_utf8_date_time_array(size: usize, with_nulls: bool) -> ArrayRef { fn build_decimal128_array(size: usize, precision: u8, scale: u8) -> ArrayRef { let mut rng = seedable_rng(); - let mut builder = Decimal128Builder::with_capacity(size, precision, scale); + let mut builder = Decimal128Builder::with_capacity(size); for _ in 0..size { - let _ = builder.append_value(rng.gen_range::(0..1000000000)); + builder.append_value(rng.gen_range::(0..1000000000)); } - Arc::new(builder.finish()) + Arc::new( + builder + .finish() + .with_precision_and_scale(precision, scale) + .unwrap(), + ) } fn build_decimal256_array(size: usize, precision: u8, scale: u8) -> ArrayRef { let mut rng = seedable_rng(); - let mut builder = Decimal256Builder::with_capacity(size, precision, scale); + let mut builder = Decimal256Builder::with_capacity(size); let mut bytes = [0; 32]; for _ in 0..size { let num = 
rng.gen_range::(0..1000000000); bytes[0..16].clone_from_slice(&num.to_le_bytes()); - let _ = builder.append_value(&Decimal256::new(precision, scale, &bytes)); + builder.append_value(i256::from_le_bytes(bytes)); } - Arc::new(builder.finish()) + Arc::new( + builder + .finish() + .with_precision_and_scale(precision, scale) + .unwrap(), + ) } // cast array from specified primitive array type to desired data type diff --git a/arrow/benches/decimal_validate.rs b/arrow/benches/decimal_validate.rs index 555373e4a634..a70da1d2cfb7 100644 --- a/arrow/benches/decimal_validate.rs +++ b/arrow/benches/decimal_validate.rs @@ -22,12 +22,11 @@ use arrow::array::{ Array, Decimal128Array, Decimal128Builder, Decimal256Array, Decimal256Builder, }; use criterion::Criterion; -use num::BigInt; use rand::Rng; extern crate arrow; -use arrow::util::decimal::Decimal256; +use arrow_buffer::i256; fn validate_decimal128_array(array: Decimal128Array) { array.with_precision_and_scale(35, 0).unwrap(); @@ -40,13 +39,14 @@ fn validate_decimal256_array(array: Decimal256Array) { fn validate_decimal128_benchmark(c: &mut Criterion) { let mut rng = rand::thread_rng(); let size: i128 = 20000; - let mut decimal_builder = Decimal128Builder::with_capacity(size as usize, 38, 0); + let mut decimal_builder = Decimal128Builder::with_capacity(size as usize); for _ in 0..size { - decimal_builder - .append_value(rng.gen_range::(0..999999999999)) - .unwrap(); + decimal_builder.append_value(rng.gen_range::(0..999999999999)); } - let decimal_array = decimal_builder.finish(); + let decimal_array = decimal_builder + .finish() + .with_precision_and_scale(38, 0) + .unwrap(); let data = decimal_array.into_data(); c.bench_function("validate_decimal128_array 20000", |b| { b.iter(|| { @@ -59,13 +59,16 @@ fn validate_decimal128_benchmark(c: &mut Criterion) { fn validate_decimal256_benchmark(c: &mut Criterion) { let mut rng = rand::thread_rng(); let size: i128 = 20000; - let mut decimal_builder = Decimal256Builder::with_capacity(size as usize, 76, 0); + let mut decimal_builder = Decimal256Builder::with_capacity(size as usize); for _ in 0..size { let v = rng.gen_range::(0..999999999999999); - let decimal = Decimal256::from_big_int(&BigInt::from(v), 76, 0).unwrap(); - decimal_builder.append_value(&decimal).unwrap(); + let decimal = i256::from_i128(v); + decimal_builder.append_value(decimal); } - let decimal_array256_data = decimal_builder.finish(); + let decimal_array256_data = decimal_builder + .finish() + .with_precision_and_scale(76, 0) + .unwrap(); let data = decimal_array256_data.into_data(); c.bench_function("validate_decimal256_array 20000", |b| { b.iter(|| { diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 49a9b18d85f6..b05e4c4ba7f4 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -38,7 +38,6 @@ use chrono::format::strftime::StrftimeItems; use chrono::format::{parse, Parsed}; use chrono::{NaiveDateTime, Timelike}; -use std::ops::{Div, Mul}; use std::str; use std::sync::Arc; @@ -58,8 +57,9 @@ use crate::{ buffer::Buffer, util::display::array_value_to_string, util::serialization::lexical_to_string, }; +use arrow_buffer::i256; use num::cast::AsPrimitive; -use num::{BigInt, NumCast, ToPrimitive}; +use num::{NumCast, ToPrimitive}; /// CastOptions provides a way to override the default cast behaviors #[derive(Debug)] @@ -386,7 +386,7 @@ macro_rules! 
cast_decimal_to_integer { if array.is_null(i) { value_builder.append_null(); } else { - let v = array.value(i).as_i128() / div; + let v = array.value(i) / div; // check the overflow // For example: Decimal(128,10,0) as i8 // 128 is out of range i8 @@ -416,7 +416,7 @@ macro_rules! cast_decimal_to_float { } else { // The range of f32 or f64 is larger than i128, we don't need to check overflow. // cast the i128 to f64 will lose precision, for example the `112345678901234568` will be as `112345678901234560`. - let v = (array.value(i).as_i128() as f64 / div) as $NATIVE_TYPE; + let v = (array.value(i) as f64 / div) as $NATIVE_TYPE; value_builder.append_value(v); } } @@ -1466,7 +1466,7 @@ fn cast_decimal_to_decimal( let div = 10_i128.pow((input_scale - output_scale) as u32); if BYTE_WIDTH1 == 16 { let array = array.as_any().downcast_ref::().unwrap(); - let iter = array.iter().map(|v| v.map(|v| v.as_i128() / div)); + let iter = array.iter().map(|v| v.map(|v| v.wrapping_div(div))); if BYTE_WIDTH2 == 16 { let output_array = iter .collect::() @@ -1475,7 +1475,7 @@ fn cast_decimal_to_decimal( Ok(Arc::new(output_array)) } else { let output_array = iter - .map(|v| v.map(BigInt::from)) + .map(|v| v.map(i256::from_i128)) .collect::() .with_precision_and_scale(*output_precision, *output_scale)?; @@ -1483,7 +1483,8 @@ fn cast_decimal_to_decimal( } } else { let array = array.as_any().downcast_ref::().unwrap(); - let iter = array.iter().map(|v| v.map(|v| v.to_big_int().div(div))); + let div = i256::from_i128(div); + let iter = array.iter().map(|v| v.map(|v| v.wrapping_div(div))); if BYTE_WIDTH2 == 16 { let values = iter .map(|v| { @@ -1521,7 +1522,7 @@ fn cast_decimal_to_decimal( let mul = 10_i128.pow((output_scale - input_scale) as u32); if BYTE_WIDTH1 == 16 { let array = array.as_any().downcast_ref::().unwrap(); - let iter = array.iter().map(|v| v.map(|v| v.as_i128() * mul)); + let iter = array.iter().map(|v| v.map(|v| v.wrapping_mul(mul))); if BYTE_WIDTH2 == 16 { let output_array = iter .collect::() @@ -1530,7 +1531,7 @@ fn cast_decimal_to_decimal( Ok(Arc::new(output_array)) } else { let output_array = iter - .map(|v| v.map(BigInt::from)) + .map(|v| v.map(i256::from_i128)) .collect::() .with_precision_and_scale(*output_precision, *output_scale)?; @@ -1538,7 +1539,8 @@ fn cast_decimal_to_decimal( } } else { let array = array.as_any().downcast_ref::().unwrap(); - let iter = array.iter().map(|v| v.map(|v| v.to_big_int().mul(mul))); + let mul = i256::from_i128(mul); + let iter = array.iter().map(|v| v.map(|v| v.wrapping_mul(mul))); if BYTE_WIDTH2 == 16 { let values = iter .map(|v| { @@ -2825,7 +2827,6 @@ where mod tests { use super::*; use crate::datatypes::TimeUnit; - use crate::util::decimal::{Decimal128, Decimal256}; use crate::{buffer::Buffer, util::display::array_value_to_string}; macro_rules! 
generate_cast_test_case { @@ -2865,7 +2866,7 @@ mod tests { } fn create_decimal256_array( - array: Vec>, + array: Vec>, precision: u8, scale: u8, ) -> Result { @@ -2876,6 +2877,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn test_cast_decimal128_to_decimal128() { let input_type = DataType::Decimal128(20, 3); let output_type = DataType::Decimal128(20, 4); @@ -2888,9 +2890,9 @@ mod tests { Decimal128Array, &output_type, vec![ - Some(Decimal128::new_from_i128(20, 4, 11234560_i128)), - Some(Decimal128::new_from_i128(20, 4, 21234560_i128)), - Some(Decimal128::new_from_i128(20, 4, 31234560_i128)), + Some(11234560_i128), + Some(21234560_i128), + Some(31234560_i128), None ] ); @@ -2899,9 +2901,12 @@ mod tests { let input_decimal_array = create_decimal_array(array, 10, 0).unwrap(); let array = Arc::new(input_decimal_array) as ArrayRef; let result = cast(&array, &DataType::Decimal128(2, 2)); - assert!(result.is_err()); + assert!(result.is_ok()); + let array = result.unwrap(); + let array: &Decimal128Array = as_primitive_array(&array); + let err = array.validate_decimal_precision(2); assert_eq!("Invalid argument error: 12345600 is too large to store in a Decimal128 of precision 2. Max is 99", - result.unwrap_err().to_string()); + err.unwrap_err().to_string()); } #[test] @@ -2917,18 +2922,9 @@ mod tests { Decimal256Array, &output_type, vec![ - Some( - Decimal256::from_big_int(&BigInt::from(11234560_i128), 20, 4) - .unwrap() - ), - Some( - Decimal256::from_big_int(&BigInt::from(21234560_i128), 20, 4) - .unwrap() - ), - Some( - Decimal256::from_big_int(&BigInt::from(31234560_i128), 20, 4) - .unwrap() - ), + Some(i256::from_i128(11234560_i128)), + Some(i256::from_i128(21234560_i128)), + Some(i256::from_i128(31234560_i128)), None ] ); @@ -2940,9 +2936,9 @@ mod tests { let output_type = DataType::Decimal128(20, 4); assert!(can_cast_types(&input_type, &output_type)); let array = vec![ - Some(BigInt::from(1123456)), - Some(BigInt::from(2123456)), - Some(BigInt::from(3123456)), + Some(i256::from_i128(1123456)), + Some(i256::from_i128(2123456)), + Some(i256::from_i128(3123456)), None, ]; let input_decimal_array = create_decimal256_array(array, 20, 3).unwrap(); @@ -2952,9 +2948,9 @@ mod tests { Decimal128Array, &output_type, vec![ - Some(Decimal128::new_from_i128(20, 4, 11234560_i128)), - Some(Decimal128::new_from_i128(20, 4, 21234560_i128)), - Some(Decimal128::new_from_i128(20, 4, 31234560_i128)), + Some(11234560_i128), + Some(21234560_i128), + Some(31234560_i128), None ] ); @@ -2966,9 +2962,9 @@ mod tests { let output_type = DataType::Decimal256(20, 4); assert!(can_cast_types(&input_type, &output_type)); let array = vec![ - Some(BigInt::from(1123456)), - Some(BigInt::from(2123456)), - Some(BigInt::from(3123456)), + Some(i256::from_i128(1123456)), + Some(i256::from_i128(2123456)), + Some(i256::from_i128(3123456)), None, ]; let input_decimal_array = create_decimal256_array(array, 20, 3).unwrap(); @@ -2978,18 +2974,9 @@ mod tests { Decimal256Array, &output_type, vec![ - Some( - Decimal256::from_big_int(&BigInt::from(11234560_i128), 20, 4) - .unwrap() - ), - Some( - Decimal256::from_big_int(&BigInt::from(21234560_i128), 20, 4) - .unwrap() - ), - Some( - Decimal256::from_big_int(&BigInt::from(31234560_i128), 20, 4) - .unwrap() - ), + Some(i256::from_i128(11234560_i128)), + Some(i256::from_i128(21234560_i128)), + Some(i256::from_i128(31234560_i128)), None ] ); @@ -3116,6 +3103,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn test_cast_numeric_to_decimal() { // test 
negative cast type let decimal_type = DataType::Decimal128(38, 6); @@ -3158,11 +3146,11 @@ mod tests { Decimal128Array, &decimal_type, vec![ - Some(Decimal128::new_from_i128(38, 6, 1000000_i128)), - Some(Decimal128::new_from_i128(38, 6, 2000000_i128)), - Some(Decimal128::new_from_i128(38, 6, 3000000_i128)), + Some(1000000_i128), + Some(2000000_i128), + Some(3000000_i128), None, - Some(Decimal128::new_from_i128(38, 6, 5000000_i128)) + Some(5000000_i128) ] ); } @@ -3172,8 +3160,11 @@ mod tests { let array = Int8Array::from(vec![1, 2, 3, 4, 100]); let array = Arc::new(array) as ArrayRef; let casted_array = cast(&array, &DataType::Decimal128(3, 1)); - assert!(casted_array.is_err()); - assert_eq!("Invalid argument error: 1000 is too large to store in a Decimal128 of precision 3. Max is 999", casted_array.unwrap_err().to_string()); + assert!(casted_array.is_ok()); + let array = casted_array.unwrap(); + let array: &Decimal128Array = as_primitive_array(&array); + let err = array.validate_decimal_precision(3); + assert_eq!("Invalid argument error: 1000 is too large to store in a Decimal128 of precision 3. Max is 999", err.unwrap_err().to_string()); // test f32 to decimal type let array = Float32Array::from(vec![ @@ -3190,12 +3181,12 @@ mod tests { Decimal128Array, &decimal_type, vec![ - Some(Decimal128::new_from_i128(38, 6, 1100000_i128)), - Some(Decimal128::new_from_i128(38, 6, 2200000_i128)), - Some(Decimal128::new_from_i128(38, 6, 4400000_i128)), + Some(1100000_i128), + Some(2200000_i128), + Some(4400000_i128), None, - Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), - Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), + Some(1123456_i128), + Some(1123456_i128), ] ); @@ -3215,13 +3206,13 @@ mod tests { Decimal128Array, &decimal_type, vec![ - Some(Decimal128::new_from_i128(38, 6, 1100000_i128)), - Some(Decimal128::new_from_i128(38, 6, 2200000_i128)), - Some(Decimal128::new_from_i128(38, 6, 4400000_i128)), + Some(1100000_i128), + Some(2200000_i128), + Some(4400000_i128), None, - Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), - Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), - Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), + Some(1123456_i128), + Some(1123456_i128), + Some(1123456_i128), ] ); } diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 5eaed4bc62bc..ef423fcbf428 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -493,7 +493,7 @@ where .expect("Unable to downcast to decimal array"); let valids = value_indices .into_iter() - .map(|index| (index, decimal_array.value(index as usize).as_i128())) + .map(|index| (index, decimal_array.value(index as usize))) .collect::>(); sort_primitive_inner( decimal_values.len(), diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 714c29772a50..a399f0602004 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -451,7 +451,7 @@ where if decimal_values.is_null(index) { Ok(None) } else { - Ok(Some(decimal_values.value(index).as_i128())) + Ok(Some(decimal_values.value(index))) } }); let t: Result>> = t.transpose(); @@ -461,7 +461,7 @@ where .collect::>()? // PERF: we could avoid re-validating that the data in // Decimal128Array was in range as we know it came from a valid Decimal128Array - .with_precision_and_scale(decimal_values.precision(), decimal_values.scale()) + .with_precision_and_scale(decimal_values.precision()?, decimal_values.scale()?) 
} /// `take` implementation for all primitive arrays diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index 2f4ec1a1ca3a..ab5947b4ecef 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -707,8 +707,7 @@ fn build_decimal_array( precision: u8, scale: u8, ) -> Result { - let mut decimal_builder = - Decimal128Builder::with_capacity(rows.len(), precision, scale); + let mut decimal_builder = Decimal128Builder::with_capacity(rows.len()); for row in rows { let col_s = row.get(col_idx); match col_s { @@ -725,7 +724,7 @@ fn build_decimal_array( parse_decimal_with_parameter(s, precision, scale); match decimal_value { Ok(v) => { - decimal_builder.append_value(v)?; + decimal_builder.append_value(v); } Err(e) => { return Err(e); @@ -735,7 +734,11 @@ fn build_decimal_array( } } } - Ok(Arc::new(decimal_builder.finish())) + Ok(Arc::new( + decimal_builder + .finish() + .with_precision_and_scale(precision, scale)?, + )) } // Parse the string format decimal value to i128 format and checking the precision and scale. @@ -1237,16 +1240,16 @@ mod tests { .downcast_ref::() .unwrap(); - assert_eq!("57.653484", lat.value_as_string(0)); - assert_eq!("53.002666", lat.value_as_string(1)); - assert_eq!("52.412811", lat.value_as_string(2)); - assert_eq!("51.481583", lat.value_as_string(3)); - assert_eq!("12.123456", lat.value_as_string(4)); - assert_eq!("50.760000", lat.value_as_string(5)); - assert_eq!("0.123000", lat.value_as_string(6)); - assert_eq!("123.000000", lat.value_as_string(7)); - assert_eq!("123.000000", lat.value_as_string(8)); - assert_eq!("-50.760000", lat.value_as_string(9)); + assert_eq!("57.653484", lat.value_as_string(0).unwrap()); + assert_eq!("53.002666", lat.value_as_string(1).unwrap()); + assert_eq!("52.412811", lat.value_as_string(2).unwrap()); + assert_eq!("51.481583", lat.value_as_string(3).unwrap()); + assert_eq!("12.123456", lat.value_as_string(4).unwrap()); + assert_eq!("50.760000", lat.value_as_string(5).unwrap()); + assert_eq!("0.123000", lat.value_as_string(6).unwrap()); + assert_eq!("123.000000", lat.value_as_string(7).unwrap()); + assert_eq!("123.000000", lat.value_as_string(8).unwrap()); + assert_eq!("-50.760000", lat.value_as_string(9).unwrap()); } #[test] diff --git a/arrow/src/row/fixed.rs b/arrow/src/row/fixed.rs index 9952ee094bfb..ec7afd8e30aa 100644 --- a/arrow/src/row/fixed.rs +++ b/arrow/src/row/fixed.rs @@ -20,7 +20,7 @@ use crate::compute::SortOptions; use crate::datatypes::ArrowPrimitiveType; use crate::row::{null_sentinel, Rows}; use arrow_array::types::DecimalType; -use arrow_array::{BooleanArray, DecimalArray}; +use arrow_array::BooleanArray; use arrow_buffer::{bit_util, MutableBuffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; @@ -355,12 +355,12 @@ fn decode_fixed( } /// Decodes a `DecimalArray` from rows -pub fn decode_decimal( +pub fn decode_decimal( rows: &mut [&[u8]], options: SortOptions, precision: u8, scale: u8, -) -> DecimalArray { +) -> PrimitiveArray { decode_fixed::>(rows, T::TYPE_CONSTRUCTOR(precision, scale), options) .into() } diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index f5ac570320bd..f604f65706d5 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -529,7 +529,7 @@ fn encode_column( .downcast_ref::() .unwrap() .into_iter() - .map(|x| x.map(|x| RawDecimal(*x.raw_value()))); + .map(|x| x.map(|x| RawDecimal(x.to_le_bytes()))); fixed::encode(out, iter, opts) }, @@ -539,7 +539,7 @@ fn encode_column( .downcast_ref::() .unwrap() .into_iter() - .map(|x| 
x.map(|x| RawDecimal(*x.raw_value()))); + .map(|x| x.map(|x| RawDecimal(x.to_le_bytes()))); fixed::encode(out, iter, opts) }, diff --git a/arrow/src/util/display.rs b/arrow/src/util/display.rs index cf8394efaa6b..8b8db1be5758 100644 --- a/arrow/src/util/display.rs +++ b/arrow/src/util/display.rs @@ -274,8 +274,7 @@ pub fn make_string_from_decimal(column: &Arc, row: usize) -> Result() .unwrap(); - let formatted_decimal = array.value_as_string(row); - Ok(formatted_decimal) + array.value_as_string(row) } fn append_struct_field_string( diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 93b936e7c2f9..f4dcda2e8de9 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -1065,9 +1065,9 @@ fn test_decimal_full_validation() { #[test] fn test_decimal_validation() { - let mut builder = Decimal128Builder::with_capacity(4, 10, 4); - builder.append_value(10000).unwrap(); - builder.append_value(20000).unwrap(); + let mut builder = Decimal128Builder::with_capacity(4); + builder.append_value(10000); + builder.append_value(20000); let array = builder.finish(); array.data().validate_full().unwrap(); diff --git a/integration-testing/Cargo.toml b/integration-testing/Cargo.toml index f4f6a336498a..7b28f399f246 100644 --- a/integration-testing/Cargo.toml +++ b/integration-testing/Cargo.toml @@ -33,6 +33,7 @@ logging = ["tracing-subscriber"] [dependencies] arrow = { path = "../arrow", default-features = false, features = ["test_utils", "ipc", "ipc_compression", "json"] } arrow-flight = { path = "../arrow-flight", default-features = false } +arrow-buffer = { path = "../arrow-buffer", default-features = false } async-trait = { version = "0.1.41", default-features = false } clap = { version = "4", default-features = false, features = ["std", "derive", "help", "error-context", "usage"] } futures = { version = "0.3", default-features = false } diff --git a/integration-testing/src/util/mod.rs b/integration-testing/src/util/mod.rs index c0eb80a35711..72ecfaa00f0a 100644 --- a/integration-testing/src/util/mod.rs +++ b/integration-testing/src/util/mod.rs @@ -34,7 +34,7 @@ use arrow::datatypes::*; use arrow::error::{ArrowError, Result}; use arrow::record_batch::{RecordBatch, RecordBatchReader}; use arrow::util::bit_util; -use arrow::util::decimal::Decimal256; +use arrow_buffer::i256; mod datatype; mod field; @@ -787,12 +787,7 @@ pub fn array_from_json( } } DataType::Decimal128(precision, scale) => { - let mut b = - Decimal128Builder::with_capacity(json_col.count, *precision, *scale); - // C++ interop tests involve incompatible decimal values - unsafe { - b.disable_value_validation(); - } + let mut b = Decimal128Builder::with_capacity(json_col.count); for (is_valid, value) in json_col .validity .as_ref() @@ -801,21 +796,16 @@ pub fn array_from_json( .zip(json_col.data.unwrap()) { match is_valid { - 1 => { - b.append_value(value.as_str().unwrap().parse::().unwrap())? 
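// Editor's illustration (not part of the patch): after the row-format change above
// the encoder no longer goes through a Decimal128 wrapper's `raw_value()`; each cell
// is the native i128 and its 16-byte little-endian form is what lands in the
// fixed-width row. A hypothetical round trip:
fn sketch_row_encoding_bytes() {
    let unscaled: i128 = 123_456; // e.g. "1234.56" at scale 2
    let raw: [u8; 16] = unscaled.to_le_bytes();
    assert_eq!(i128::from_le_bytes(raw), unscaled);
}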
- } + 1 => b.append_value(value.as_str().unwrap().parse::().unwrap()), _ => b.append_null(), }; } - Ok(Arc::new(b.finish())) + Ok(Arc::new( + b.finish().with_precision_and_scale(*precision, *scale)?, + )) } DataType::Decimal256(precision, scale) => { - let mut b = - Decimal256Builder::with_capacity(json_col.count, *precision, *scale); - // C++ interop tests involve incompatible decimal values - unsafe { - b.disable_value_validation(); - } + let mut b = Decimal256Builder::with_capacity(json_col.count); for (is_valid, value) in json_col .validity .as_ref() @@ -835,15 +825,14 @@ pub fn array_from_json( }; bytes[0..integer_bytes.len()] .copy_from_slice(integer_bytes.as_slice()); - let decimal = - Decimal256::try_new_from_bytes(*precision, *scale, &bytes) - .unwrap(); - b.append_value(&decimal)?; + b.append_value(i256::from_le_bytes(bytes)); } _ => b.append_null(), } } - Ok(Arc::new(b.finish())) + Ok(Arc::new( + b.finish().with_precision_and_scale(*precision, *scale)?, + )) } DataType::Map(child_field, _) => { let null_buf = create_null_buf(&json_col); diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 5ee963916da9..0390b43aaa98 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1176,11 +1176,11 @@ mod tests { let expected = 1..25; - assert_eq!(col.precision(), target_precision); - assert_eq!(col.scale(), 2); + assert_eq!(col.precision().unwrap(), target_precision); + assert_eq!(col.scale().unwrap(), 2); for (i, v) in expected.enumerate() { - assert_eq!(col.value(i).as_i128(), v * 100_i128); + assert_eq!(col.value(i), v * 100_i128); } } } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index b5c0b50127d4..79d9d28095cb 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -583,9 +583,9 @@ fn get_decimal_array_slice( indices: &[usize], ) -> Vec { let mut values = Vec::with_capacity(indices.len()); - let size = decimal_length_from_precision(array.precision()); + let size = decimal_length_from_precision(array.precision().unwrap()); for i in indices { - let as_be_bytes = array.value(*i).as_i128().to_be_bytes(); + let as_be_bytes = array.value(*i).to_be_bytes(); let resized_value = as_be_bytes[(16 - size)..].to_vec(); values.push(FixedLenByteArray::from(ByteArray::from(resized_value))); } From 1b69dd7b0d59616f0ad32259fd3e3f27361fa278 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Oct 2022 15:34:10 +1300 Subject: [PATCH 0130/1411] Update test_parse_arg_compression_format_fail (#2867) --- parquet/src/bin/parquet-fromcsv.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index 90e0a68625f9..8c62241e34f5 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -516,7 +516,7 @@ mod tests { Ok(_) => panic!("unexpected success"), Err(e) => assert_eq!( format!("{}", e), - "error: Invalid value \"zip\" for '--parquet-compression ': Unknown compression ZIP : possible values UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD \n\nFor more information try --help\n"), + "error: Invalid value 'zip' for '--parquet-compression ': Unknown compression ZIP : possible values UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD \n\nFor more information try --help\n"), } } From d7f994c2af59653db5a18c52920f0b5dbd27ff6f Mon Sep 17 00:00:00 2001 From: Raphael 
Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Oct 2022 17:21:23 +1300 Subject: [PATCH 0131/1411] Take decimal as primitive (#2637) (#2869) --- arrow/src/compute/kernels/take.rs | 52 ++++++++----------------------- 1 file changed, 13 insertions(+), 39 deletions(-) diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index a399f0602004..0ef2025cf382 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -134,10 +134,19 @@ where let values = values.as_any().downcast_ref::().unwrap(); Ok(Arc::new(take_boolean(values, indices)?)) } - DataType::Decimal128(_, _) => { - let decimal_values = - values.as_any().downcast_ref::().unwrap(); - Ok(Arc::new(take_decimal128(decimal_values, indices)?)) + DataType::Decimal128(p, s) => { + let decimal_values = values.as_any().downcast_ref::().unwrap(); + let array = take_primitive(decimal_values, indices)? + .with_precision_and_scale(*p, *s) + .unwrap(); + Ok(Arc::new(array)) + } + DataType::Decimal256(p, s) => { + let decimal_values = values.as_any().downcast_ref::().unwrap(); + let array = take_primitive(decimal_values, indices)? + .with_precision_and_scale(*p, *s) + .unwrap(); + Ok(Arc::new(array)) } DataType::Utf8 => { let values = values @@ -429,41 +438,6 @@ where Ok((buffer, nulls)) } -/// `take` implementation for decimal arrays -fn take_decimal128( - decimal_values: &Decimal128Array, - indices: &PrimitiveArray, -) -> Result -where - IndexType: ArrowNumericType, - IndexType::Native: ToPrimitive, -{ - indices - .iter() - .map(|index| { - // Use type annotations below for readability (was blowing - // my mind otherwise) - let t: Option>> = index.map(|index| { - let index = ToPrimitive::to_usize(&index).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - - if decimal_values.is_null(index) { - Ok(None) - } else { - Ok(Some(decimal_values.value(index))) - } - }); - let t: Result>> = t.transpose(); - let t: Result> = t.map(|t| t.flatten()); - t - }) - .collect::>()? - // PERF: we could avoid re-validating that the data in - // Decimal128Array was in range as we know it came from a valid Decimal128Array - .with_precision_and_scale(decimal_values.precision()?, decimal_values.scale()?) 
-} - /// `take` implementation for all primitive arrays /// /// This checks if an `indices` slot is populated, and gets the value from `values` From d67d5fb0086ee5a18bf3a148c874ccb11533044d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Oct 2022 20:03:54 +1300 Subject: [PATCH 0132/1411] Decimal cleanup (#2637) (#2865) * Decimal cleanup (#2637) * Format --- .github/workflows/arrow.yml | 10 +- arrow-array/src/array/primitive_array.rs | 98 +++-- arrow-array/src/decimal.rs | 484 ----------------------- arrow-array/src/lib.rs | 1 - arrow-array/src/types.rs | 40 +- arrow/src/csv/reader.rs | 20 +- arrow/src/util/display.rs | 2 +- arrow/src/util/mod.rs | 1 - parquet/src/arrow/arrow_reader/mod.rs | 4 +- parquet/src/arrow/arrow_writer/mod.rs | 2 +- 10 files changed, 95 insertions(+), 567 deletions(-) delete mode 100644 arrow-array/src/decimal.rs diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 466f0b12ec36..c4b975517335 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -186,6 +186,14 @@ jobs: - name: Setup Clippy run: | rustup component add clippy - - name: Run clippy + - name: Clippy arrow-buffer with all features + run: cargo clippy -p arrow-buffer --all-features + - name: Clippy arrow-data with all features + run: cargo clippy -p arrow-data --all-features + - name: Clippy arrow-schema with all features + run: cargo clippy -p arrow-schema --all-features + - name: Clippy arrow-array with all features + run: cargo clippy -p arrow-array --all-features + - name: Clippy arrow run: | cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict --all-targets -- -D warnings diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 377523267a44..4722cec67c65 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -16,7 +16,6 @@ // under the License. 
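// Editor's illustration (not part of the patch): a minimal sketch of what the `take`
// change above means for callers – decimal arrays now go through the ordinary
// primitive path and keep their precision and scale. Assumes the public
// `arrow::compute::take` kernel and the `arrow::array` re-exports.
use arrow::array::{Array, Decimal128Array, UInt32Array};
use arrow::compute::take;
use arrow::datatypes::DataType;

fn sketch_take_decimal() {
    let values = vec![Some(123_i128), None, Some(456_i128)]
        .into_iter()
        .collect::<Decimal128Array>()
        .with_precision_and_scale(10, 2)
        .unwrap();
    let indices = UInt32Array::from(vec![2_u32, 0]);
    let taken = take(&values, &indices, None).unwrap();
    // the result is still a Decimal128Array with the original precision/scale
    assert_eq!(taken.data_type(), &DataType::Decimal128(10, 2));
    let taken = taken.as_any().downcast_ref::<Decimal128Array>().unwrap();
    assert_eq!(taken.value(0), 456_i128);
}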
use crate::builder::{BooleanBufferBuilder, BufferBuilder, PrimitiveBuilder}; -use crate::decimal::Decimal; use crate::iterator::PrimitiveIter; use crate::raw_pointer::RawPtrBox; use crate::temporal_conversions::{as_date, as_datetime, as_duration, as_time}; @@ -917,67 +916,62 @@ impl PrimitiveArray { }) } - pub fn value_as_string(&self, row: usize) -> Result { - let p = self.precision()?; - let s = self.scale()?; - Ok(Decimal::::new(p, s, &T::to_native(self.value(row))).to_string()) + /// Returns [`Self::value`] formatted as a string + pub fn value_as_string(&self, row: usize) -> String { + T::format_decimal(self.value(row), self.precision(), self.scale()) } - pub fn precision(&self) -> Result { + /// Returns the decimal precision of this array + pub fn precision(&self) -> u8 { match T::BYTE_LENGTH { 16 => { if let DataType::Decimal128(p, _) = self.data().data_type() { - Ok(*p) + *p } else { - Err(ArrowError::InvalidArgumentError(format!( + unreachable!( "Decimal128Array datatype is not DataType::Decimal128 but {}", self.data_type() - ))) + ) } } 32 => { if let DataType::Decimal256(p, _) = self.data().data_type() { - Ok(*p) + *p } else { - Err(ArrowError::InvalidArgumentError(format!( + unreachable!( "Decimal256Array datatype is not DataType::Decimal256 but {}", self.data_type() - ))) + ) } } - other => Err(ArrowError::InvalidArgumentError(format!( - "Unsupported byte length for decimal array {}", - other - ))), + other => unreachable!("Unsupported byte length for decimal array {}", other), } } - pub fn scale(&self) -> Result { + /// Returns the decimal scale of this array + pub fn scale(&self) -> u8 { match T::BYTE_LENGTH { 16 => { if let DataType::Decimal128(_, s) = self.data().data_type() { - Ok(*s) + *s } else { - Err(ArrowError::InvalidArgumentError(format!( + unreachable!( "Decimal128Array datatype is not DataType::Decimal128 but {}", self.data_type() - ))) + ) } } 32 => { if let DataType::Decimal256(_, s) = self.data().data_type() { - Ok(*s) + *s } else { - Err(ArrowError::InvalidArgumentError(format!( + unreachable!( "Decimal256Array datatype is not DataType::Decimal256 but {}", self.data_type() - ))) + ) } } - other => Err(ArrowError::InvalidArgumentError(format!( - "Unsupported byte length for decimal array {}", - other - ))), + other => unreachable!("Unsupported byte length for decimal array {}", other), } } } @@ -1564,8 +1558,8 @@ mod tests { .build() .unwrap(); let decimal_array = Decimal128Array::from(array_data); - assert_eq!(8_887_000_000_i128, decimal_array.value(0).into()); - assert_eq!(-8_887_000_000_i128, decimal_array.value(1).into()); + assert_eq!(8_887_000_000_i128, decimal_array.value(0)); + assert_eq!(-8_887_000_000_i128, decimal_array.value(1)); } #[test] @@ -1576,7 +1570,7 @@ mod tests { let result = decimal_builder.finish().with_precision_and_scale(5, 3); assert!(result.is_ok()); let arr = result.unwrap(); - assert_eq!("12.345", arr.value_as_string(1).unwrap()); + assert_eq!("12.345", arr.value_as_string(1)); // Validate it explicitly let result = arr.validate_decimal_precision(5); @@ -1594,8 +1588,8 @@ mod tests { let result = decimal_builder.finish().with_precision_and_scale(2, 1); assert!(result.is_ok()); let arr = result.unwrap(); - assert_eq!("9.9", arr.value_as_string(1).unwrap()); - assert_eq!("-9.9", arr.value_as_string(3).unwrap()); + assert_eq!("9.9", arr.value_as_string(1)); + assert_eq!("-9.9", arr.value_as_string(3)); // Validate it explicitly let result = arr.validate_decimal_precision(2); @@ -1611,11 +1605,11 @@ mod tests { let array = 
Decimal128Array::from_iter_values(vec![-100, 0, 101].into_iter()); assert_eq!(array.len(), 3); assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); - assert_eq!(-100_i128, array.value(0).into()); + assert_eq!(-100_i128, array.value(0)); assert!(!array.is_null(0)); - assert_eq!(0_i128, array.value(1).into()); + assert_eq!(0_i128, array.value(1)); assert!(!array.is_null(1)); - assert_eq!(101_i128, array.value(2).into()); + assert_eq!(101_i128, array.value(2)); assert!(!array.is_null(2)); } @@ -1625,10 +1619,10 @@ mod tests { vec![Some(-100), None, Some(101)].into_iter().collect(); assert_eq!(array.len(), 3); assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); - assert_eq!(-100_i128, array.value(0).into()); + assert_eq!(-100_i128, array.value(0)); assert!(!array.is_null(0)); assert!(array.is_null(1)); - assert_eq!(101_i128, array.value(2).into()); + assert_eq!(101_i128, array.value(2)); assert!(!array.is_null(2)); } @@ -1661,13 +1655,13 @@ mod tests { .with_precision_and_scale(6, 3) .unwrap(); - assert_eq!("123.450", arr.value_as_string(0).unwrap()); - assert_eq!("-123.450", arr.value_as_string(1).unwrap()); - assert_eq!("0.100", arr.value_as_string(2).unwrap()); - assert_eq!("-0.100", arr.value_as_string(3).unwrap()); - assert_eq!("0.010", arr.value_as_string(4).unwrap()); - assert_eq!("-0.010", arr.value_as_string(5).unwrap()); - assert_eq!("0.000", arr.value_as_string(6).unwrap()); + assert_eq!("123.450", arr.value_as_string(0)); + assert_eq!("-123.450", arr.value_as_string(1)); + assert_eq!("0.100", arr.value_as_string(2)); + assert_eq!("-0.100", arr.value_as_string(3)); + assert_eq!("0.010", arr.value_as_string(4)); + assert_eq!("-0.010", arr.value_as_string(5)); + assert_eq!("0.000", arr.value_as_string(6)); } #[test] @@ -1677,12 +1671,10 @@ mod tests { .unwrap(); assert_eq!(arr.data_type(), &DataType::Decimal128(20, 2)); - assert_eq!(arr.precision().unwrap(), 20); - assert_eq!(arr.scale().unwrap(), 2); + assert_eq!(arr.precision(), 20); + assert_eq!(arr.scale(), 2); - let actual: Vec<_> = (0..arr.len()) - .map(|i| arr.value_as_string(i).unwrap()) - .collect(); + let actual: Vec<_> = (0..arr.len()).map(|i| arr.value_as_string(i)).collect(); let expected = vec!["123.45", "4.56", "78.90", "-1232234234324.32"]; assert_eq!(actual, expected); @@ -1748,9 +1740,7 @@ mod tests { let value2 = i256::from_i128(56789); let mut array: Decimal256Array = - vec![Some(value1.clone()), None, Some(value2.clone())] - .into_iter() - .collect(); + vec![Some(value1), None, Some(value2)].into_iter().collect(); array = array.with_precision_and_scale(76, 10).unwrap(); assert_eq!(array.len(), 3); assert_eq!(array.data_type(), &DataType::Decimal256(76, 10)); @@ -1768,10 +1758,10 @@ mod tests { array = array.with_precision_and_scale(38, 10).unwrap(); assert_eq!(array.len(), 3); assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); - assert_eq!(-100_i128, array.value(0).into()); + assert_eq!(-100_i128, array.value(0)); assert!(!array.is_null(0)); assert!(array.is_null(1)); - assert_eq!(101_i128, array.value(2).into()); + assert_eq!(101_i128, array.value(2)); assert!(!array.is_null(2)); } diff --git a/arrow-array/src/decimal.rs b/arrow-array/src/decimal.rs deleted file mode 100644 index 343053330640..000000000000 --- a/arrow-array/src/decimal.rs +++ /dev/null @@ -1,484 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
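// Editor's illustration (not part of the patch): a minimal sketch of the accessor API
// after this cleanup – `value` yields the native i128, `precision`/`scale` are plain
// `u8`s, and `value_as_string` returns `String` directly, with no `Result` and no
// separate `Decimal128` wrapper type.
use arrow_array::Decimal128Array;

fn sketch_read_decimal() {
    let arr = vec![Some(12_345_i128), None]
        .into_iter()
        .collect::<Decimal128Array>()
        .with_precision_and_scale(6, 3)
        .unwrap();
    assert_eq!(arr.value(0), 12_345_i128);
    assert_eq!(arr.precision(), 6);
    assert_eq!(arr.scale(), 3);
    assert_eq!(arr.value_as_string(0), "12.345");
}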
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Decimal related utilities, types and functions - -use crate::types::{Decimal128Type, Decimal256Type, DecimalType}; -use arrow_buffer::i256; -use arrow_data::decimal::{DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE}; -use arrow_schema::{ArrowError, DataType}; -use num::{BigInt, Signed}; -use std::cmp::{min, Ordering}; - -/// [`Decimal`] is the generic representation of a single decimal value -/// -/// See [`Decimal128`] and [`Decimal256`] for the value types of [`Decimal128Array`] -/// and [`Decimal256Array`] respectively -/// -/// [`Decimal128Array`]: [crate::array::Decimal128Array] -/// [`Decimal256Array`]: [crate::array::Decimal256Array] -pub struct Decimal { - precision: u8, - scale: u8, - value: T::DecimalNative, -} - -/// Manually implement to avoid `T: Debug` bound -impl std::fmt::Debug for Decimal { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("Decimal") - .field("scale", &self.precision) - .field("precision", &self.precision) - // TODO: Could format this better - .field("value", &self.value.as_ref()) - .finish() - } -} - -/// Manually implement to avoid `T: Debug` bound -impl Clone for Decimal { - fn clone(&self) -> Self { - Self { - precision: self.precision, - scale: self.scale, - value: self.value, - } - } -} - -impl Copy for Decimal {} - -impl Decimal { - pub const MAX_PRECISION: u8 = T::MAX_PRECISION; - pub const MAX_SCALE: u8 = T::MAX_SCALE; - pub const TYPE_CONSTRUCTOR: fn(u8, u8) -> DataType = T::TYPE_CONSTRUCTOR; - pub const DEFAULT_TYPE: DataType = T::DEFAULT_TYPE; - - /// Tries to create a decimal value from precision, scale and bytes. - /// The bytes should be stored in little-endian order. - /// - /// Safety: - /// This method doesn't validate if the decimal value represented by the bytes - /// can be fitted into the specified precision. - pub fn try_new_from_bytes( - precision: u8, - scale: u8, - bytes: &T::DecimalNative, - ) -> Result - where - Self: Sized, - { - if precision > Self::MAX_PRECISION { - return Err(ArrowError::InvalidArgumentError(format!( - "precision {} is greater than max {}", - precision, - Self::MAX_PRECISION - ))); - } - if scale > Self::MAX_SCALE { - return Err(ArrowError::InvalidArgumentError(format!( - "scale {} is greater than max {}", - scale, - Self::MAX_SCALE - ))); - } - - if precision < scale { - return Err(ArrowError::InvalidArgumentError(format!( - "Precision {} is less than scale {}", - precision, scale - ))); - } - - Ok(Self::new(precision, scale, bytes)) - } - - /// Creates a decimal value from precision, scale, and bytes. - /// - /// Safety: - /// This method doesn't check if the precision and scale are valid. - /// Use `try_new_from_bytes` for safe constructor. 
- pub fn new(precision: u8, scale: u8, bytes: &T::DecimalNative) -> Self { - Self { - precision, - scale, - value: *bytes, - } - } - - /// Returns the raw bytes of the integer representation of the decimal. - pub fn raw_value(&self) -> &T::DecimalNative { - &self.value - } - - /// Returns the precision of the decimal. - pub fn precision(&self) -> u8 { - self.precision - } - - /// Returns the scale of the decimal. - pub fn scale(&self) -> u8 { - self.scale - } - - /// Returns the string representation of the decimal. - /// If the string representation cannot be fitted with the precision of the decimal, - /// the string will be truncated. - #[allow(clippy::inherent_to_string)] - pub fn to_string(&self) -> String { - let raw_bytes = self.raw_value(); - let integer = BigInt::from_signed_bytes_le(raw_bytes.as_ref()); - let value_str = integer.to_string(); - let (sign, rest) = - value_str.split_at(if integer >= BigInt::from(0) { 0 } else { 1 }); - let bound = min(usize::from(self.precision()), rest.len()) + sign.len(); - let value_str = &value_str[0..bound]; - let scale_usize = usize::from(self.scale()); - - if self.scale() == 0 { - value_str.to_string() - } else if rest.len() > scale_usize { - // Decimal separator is in the middle of the string - let (whole, decimal) = value_str.split_at(value_str.len() - scale_usize); - format!("{}.{}", whole, decimal) - } else { - // String has to be padded - format!("{}0.{:0>width$}", sign, rest, width = scale_usize) - } - } -} - -impl PartialOrd for Decimal { - fn partial_cmp(&self, other: &Self) -> Option { - assert_eq!( - self.scale, other.scale, - "Cannot compare two Decimals with different scale: {}, {}", - self.scale, other.scale - ); - Some(singed_cmp_le_bytes( - self.value.as_ref(), - other.value.as_ref(), - )) - } -} - -impl Ord for Decimal { - fn cmp(&self, other: &Self) -> Ordering { - assert_eq!( - self.scale, other.scale, - "Cannot compare two Decimals with different scale: {}, {}", - self.scale, other.scale - ); - singed_cmp_le_bytes(self.value.as_ref(), other.value.as_ref()) - } -} - -impl PartialEq for Decimal { - fn eq(&self, other: &Self) -> bool { - assert_eq!( - self.scale, other.scale, - "Cannot compare two Decimals with different scale: {}, {}", - self.scale, other.scale - ); - self.value.as_ref().eq(other.value.as_ref()) - } -} - -impl Eq for Decimal {} - -/// Represents a decimal value with precision and scale. -/// The decimal value could represented by a signed 128-bit integer. -pub type Decimal128 = Decimal; - -impl Decimal128 { - /// Creates `Decimal128` from an `i128` value. - pub fn new_from_i128(precision: u8, scale: u8, value: i128) -> Self { - Decimal128 { - precision, - scale, - value: value.to_le_bytes(), - } - } - - /// Returns `i128` representation of the decimal. - pub fn as_i128(&self) -> i128 { - i128::from_le_bytes(self.value) - } -} - -impl From for i128 { - fn from(decimal: Decimal128) -> Self { - decimal.as_i128() - } -} - -/// Represents a decimal value with precision and scale. -/// The decimal value could be represented by a signed 256-bit integer. -pub type Decimal256 = Decimal; - -impl Decimal256 { - /// Constructs a `Decimal256` value from a `BigInt`. 
- pub fn from_big_int( - num: &BigInt, - precision: u8, - scale: u8, - ) -> Result { - let mut bytes = if num.is_negative() { - [255_u8; 32] - } else { - [0; 32] - }; - let num_bytes = &num.to_signed_bytes_le(); - bytes[0..num_bytes.len()].clone_from_slice(num_bytes); - Decimal256::try_new_from_bytes(precision, scale, &bytes) - } - - pub fn from_i256(precision: u8, scale: u8, value: i256) -> Self { - Decimal256::new(precision, scale, &value.to_le_bytes()) - } - - /// Constructs a `BigInt` from this `Decimal256` value. - pub fn to_big_int(self) -> BigInt { - BigInt::from_signed_bytes_le(&self.value) - } -} - -impl From for Decimal256 { - fn from(bigint: BigInt) -> Self { - Decimal256::from_big_int(&bigint, DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE) - .unwrap() - } -} - -// compare two signed integer which are encoded with little endian. -// left bytes and right bytes must have the same length. -#[inline] -pub(crate) fn singed_cmp_le_bytes(left: &[u8], right: &[u8]) -> Ordering { - assert_eq!( - left.len(), - right.len(), - "Can't compare bytes array with different len: {}, {}", - left.len(), - right.len() - ); - assert_ne!(left.len(), 0, "Can't compare bytes array of length 0"); - let len = left.len(); - // the sign bit is 1, the value is negative - let left_negative = left[len - 1] >= 0x80_u8; - let right_negative = right[len - 1] >= 0x80_u8; - if left_negative != right_negative { - return match left_negative { - true => { - // left is negative value - // right is positive value - Ordering::Less - } - false => Ordering::Greater, - }; - } - for i in 0..len { - let l_byte = left[len - 1 - i]; - let r_byte = right[len - 1 - i]; - match l_byte.cmp(&r_byte) { - Ordering::Less => { - return Ordering::Less; - } - Ordering::Greater => { - return Ordering::Greater; - } - Ordering::Equal => {} - } - } - Ordering::Equal -} - -#[cfg(test)] -mod tests { - use super::*; - use num::{BigInt, Num}; - use rand::random; - - #[test] - fn decimal_128_to_string() { - let mut value = Decimal128::new_from_i128(5, 2, 100); - assert_eq!(value.to_string(), "1.00"); - - value = Decimal128::new_from_i128(5, 3, 100); - assert_eq!(value.to_string(), "0.100"); - } - - #[test] - fn decimal_invalid_precision_scale() { - let bytes = 100_i128.to_le_bytes(); - let err = Decimal128::try_new_from_bytes(5, 6, &bytes); - assert!(err.is_err()); - } - - #[test] - fn decimal_128_from_bytes() { - let mut bytes = 100_i128.to_le_bytes(); - let value = Decimal128::try_new_from_bytes(5, 2, &bytes).unwrap(); - assert_eq!(value.to_string(), "1.00"); - - bytes = (-1_i128).to_le_bytes(); - let value = Decimal128::try_new_from_bytes(5, 2, &bytes).unwrap(); - assert_eq!(value.to_string(), "-0.01"); - - bytes = i128::MAX.to_le_bytes(); - let value = Decimal128::try_new_from_bytes(38, 2, &bytes).unwrap(); - assert_eq!(value.to_string(), "170141183460469231731687303715884105.72"); - - bytes = i128::MIN.to_le_bytes(); - let value = Decimal128::try_new_from_bytes(38, 2, &bytes).unwrap(); - assert_eq!( - value.to_string(), - "-170141183460469231731687303715884105.72" - ); - - // Truncated - bytes = 12345_i128.to_le_bytes(); - let value = Decimal128::try_new_from_bytes(3, 2, &bytes).unwrap(); - assert_eq!(value.to_string(), "1.23"); - - bytes = (-12345_i128).to_le_bytes(); - let value = Decimal128::try_new_from_bytes(3, 2, &bytes).unwrap(); - assert_eq!(value.to_string(), "-1.23"); - } - - #[test] - fn decimal_256_from_bytes() { - let mut bytes = [0_u8; 32]; - bytes[0..16].clone_from_slice(&100_i128.to_le_bytes()); - let value = 
Decimal256::try_new_from_bytes(5, 2, &bytes).unwrap(); - assert_eq!(value.to_string(), "1.00"); - - bytes[0..16].clone_from_slice(&i128::MAX.to_le_bytes()); - let value = Decimal256::try_new_from_bytes(40, 4, &bytes).unwrap(); - assert_eq!( - value.to_string(), - "17014118346046923173168730371588410.5727" - ); - - // i128 maximum + 1 - bytes[0..16].clone_from_slice(&0_i128.to_le_bytes()); - bytes[15] = 128; - let value = Decimal256::try_new_from_bytes(40, 4, &bytes).unwrap(); - assert_eq!( - value.to_string(), - "17014118346046923173168730371588410.5728" - ); - - // smaller than i128 minimum - bytes = [255; 32]; - bytes[31] = 128; - let value = Decimal256::try_new_from_bytes(76, 4, &bytes).unwrap(); - assert_eq!( - value.to_string(), - "-574437317700748313234121683441537667865831564552201235664496608164256541.5731" - ); - - bytes = [255; 32]; - let value = Decimal256::try_new_from_bytes(5, 2, &bytes).unwrap(); - assert_eq!(value.to_string(), "-0.01"); - } - - fn i128_func(value: impl Into) -> i128 { - value.into() - } - - #[test] - fn decimal_128_to_i128() { - let value = Decimal128::new_from_i128(5, 2, 100); - let integer = i128_func(value); - assert_eq!(integer, 100); - } - - #[test] - fn bigint_to_decimal256() { - let num = BigInt::from_str_radix("123456789", 10).unwrap(); - let value = Decimal256::from_big_int(&num, 30, 2).unwrap(); - assert_eq!(value.to_string(), "1234567.89"); - - let num = BigInt::from_str_radix("-5744373177007483132341216834415376678658315645522012356644966081642565415731", 10).unwrap(); - let value = Decimal256::from_big_int(&num, 76, 4).unwrap(); - assert_eq!(value.to_string(), "-574437317700748313234121683441537667865831564552201235664496608164256541.5731"); - } - - #[test] - fn test_lt_cmp_byte() { - for _i in 0..100 { - let left = random::(); - let right = random::(); - let result = singed_cmp_le_bytes( - left.to_le_bytes().as_slice(), - right.to_le_bytes().as_slice(), - ); - assert_eq!(left.cmp(&right), result); - } - for _i in 0..100 { - let left = random::(); - let right = random::(); - let result = singed_cmp_le_bytes( - left.to_le_bytes().as_slice(), - right.to_le_bytes().as_slice(), - ); - assert_eq!(left.cmp(&right), result); - } - } - - #[test] - fn compare_decimal128() { - let v1 = -100_i128; - let v2 = 10000_i128; - let right = Decimal128::new_from_i128(20, 3, v2); - for v in v1..v2 { - let left = Decimal128::new_from_i128(20, 3, v); - assert!(left < right); - } - - for _i in 0..100 { - let left = random::(); - let right = random::(); - let left_decimal = Decimal128::new_from_i128(38, 2, left); - let right_decimal = Decimal128::new_from_i128(38, 2, right); - assert_eq!(left < right, left_decimal < right_decimal); - assert_eq!(left == right, left_decimal == right_decimal) - } - } - - #[test] - fn compare_decimal256() { - let v1 = -100_i128; - let v2 = 10000_i128; - let right = Decimal256::from_big_int(&BigInt::from(v2), 75, 2).unwrap(); - for v in v1..v2 { - let left = Decimal256::from_big_int(&BigInt::from(v), 75, 2).unwrap(); - assert!(left < right); - } - - for _i in 0..100 { - let left = random::(); - let right = random::(); - let left_decimal = - Decimal256::from_big_int(&BigInt::from(left), 75, 2).unwrap(); - let right_decimal = - Decimal256::from_big_int(&BigInt::from(right), 75, 2).unwrap(); - assert_eq!(left < right, left_decimal < right_decimal); - assert_eq!(left == right, left_decimal == right_decimal) - } - } -} diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 16e46f68ba07..cc963925d653 100644 --- 
a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -166,7 +166,6 @@ pub use record_batch::{RecordBatch, RecordBatchOptions}; pub mod builder; pub mod cast; -pub mod decimal; mod delta; pub mod iterator; mod raw_pointer; diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 2e161813dbc9..70c43a2a4948 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -495,18 +495,18 @@ impl NativeDecimalType for [u8; N] { pub trait DecimalType: 'static + Send + Sync + ArrowPrimitiveType + private::DecimalTypeSealed { - type DecimalNative: NativeDecimalType; - const BYTE_LENGTH: usize; const MAX_PRECISION: u8; const MAX_SCALE: u8; const TYPE_CONSTRUCTOR: fn(u8, u8) -> DataType; const DEFAULT_TYPE: DataType; - fn to_native(num: ::Native) -> Self::DecimalNative; + /// Formats the decimal value with the provided precision and scale + fn format_decimal(value: Self::Native, precision: u8, scale: u8) -> String; + /// Validates that `value` contains no more than `precision` decimal digits fn validate_decimal_precision( - num: ::Native, + value: Self::Native, precision: u8, ) -> Result<(), ArrowError>; } @@ -516,8 +516,6 @@ pub trait DecimalType: pub struct Decimal128Type {} impl DecimalType for Decimal128Type { - type DecimalNative = [u8; 16]; - const BYTE_LENGTH: usize = 16; const MAX_PRECISION: u8 = DECIMAL128_MAX_PRECISION; const MAX_SCALE: u8 = DECIMAL128_MAX_SCALE; @@ -525,8 +523,8 @@ impl DecimalType for Decimal128Type { const DEFAULT_TYPE: DataType = DataType::Decimal128(DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE); - fn to_native(num: i128) -> [u8; 16] { - num.to_le_bytes() + fn format_decimal(value: Self::Native, precision: u8, scale: u8) -> String { + format_decimal_str(&value.to_string(), precision as usize, scale as usize) } fn validate_decimal_precision(num: i128, precision: u8) -> Result<(), ArrowError> { @@ -545,8 +543,6 @@ impl ArrowPrimitiveType for Decimal128Type { pub struct Decimal256Type {} impl DecimalType for Decimal256Type { - type DecimalNative = [u8; 32]; - const BYTE_LENGTH: usize = 32; const MAX_PRECISION: u8 = DECIMAL256_MAX_PRECISION; const MAX_SCALE: u8 = DECIMAL256_MAX_SCALE; @@ -554,8 +550,8 @@ impl DecimalType for Decimal256Type { const DEFAULT_TYPE: DataType = DataType::Decimal256(DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE); - fn to_native(num: i256) -> [u8; 32] { - num.to_le_bytes() + fn format_decimal(value: Self::Native, precision: u8, scale: u8) -> String { + format_decimal_str(&value.to_string(), precision as usize, scale as usize) } fn validate_decimal_precision(num: i256, precision: u8) -> Result<(), ArrowError> { @@ -569,6 +565,26 @@ impl ArrowPrimitiveType for Decimal256Type { const DATA_TYPE: DataType = ::DEFAULT_TYPE; } +fn format_decimal_str(value_str: &str, precision: usize, scale: usize) -> String { + let (sign, rest) = match value_str.strip_prefix('-') { + Some(stripped) => ("-", stripped), + None => ("", value_str), + }; + let bound = precision.min(rest.len()) + sign.len(); + let value_str = &value_str[0..bound]; + + if scale == 0 { + value_str.to_string() + } else if rest.len() > scale { + // Decimal separator is in the middle of the string + let (whole, decimal) = value_str.split_at(value_str.len() - scale); + format!("{}.{}", whole, decimal) + } else { + // String has to be padded + format!("{}0.{:0>width$}", sign, rest, width = scale) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index ab5947b4ecef..123c5e1c6716 100644 --- 
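// Editor's illustration (not part of the patch): a worked example of the new
// `format_decimal_str` helper above, via the `DecimalType::format_decimal` trait
// method it backs. The unscaled value is rendered with the decimal point placed
// according to the scale, padding with zeros when the value has fewer digits.
use arrow_array::types::{Decimal128Type, DecimalType};

fn sketch_format_decimal() {
    // 12345 with precision 6 and scale 3 -> "12.345"
    assert_eq!(Decimal128Type::format_decimal(12_345_i128, 6, 3), "12.345");
    // small negative values are zero padded -> "-0.010"
    assert_eq!(Decimal128Type::format_decimal(-10_i128, 6, 3), "-0.010");
}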
a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -1240,16 +1240,16 @@ mod tests { .downcast_ref::() .unwrap(); - assert_eq!("57.653484", lat.value_as_string(0).unwrap()); - assert_eq!("53.002666", lat.value_as_string(1).unwrap()); - assert_eq!("52.412811", lat.value_as_string(2).unwrap()); - assert_eq!("51.481583", lat.value_as_string(3).unwrap()); - assert_eq!("12.123456", lat.value_as_string(4).unwrap()); - assert_eq!("50.760000", lat.value_as_string(5).unwrap()); - assert_eq!("0.123000", lat.value_as_string(6).unwrap()); - assert_eq!("123.000000", lat.value_as_string(7).unwrap()); - assert_eq!("123.000000", lat.value_as_string(8).unwrap()); - assert_eq!("-50.760000", lat.value_as_string(9).unwrap()); + assert_eq!("57.653484", lat.value_as_string(0)); + assert_eq!("53.002666", lat.value_as_string(1)); + assert_eq!("52.412811", lat.value_as_string(2)); + assert_eq!("51.481583", lat.value_as_string(3)); + assert_eq!("12.123456", lat.value_as_string(4)); + assert_eq!("50.760000", lat.value_as_string(5)); + assert_eq!("0.123000", lat.value_as_string(6)); + assert_eq!("123.000000", lat.value_as_string(7)); + assert_eq!("123.000000", lat.value_as_string(8)); + assert_eq!("-50.760000", lat.value_as_string(9)); } #[test] diff --git a/arrow/src/util/display.rs b/arrow/src/util/display.rs index 8b8db1be5758..7c0b5a28f89e 100644 --- a/arrow/src/util/display.rs +++ b/arrow/src/util/display.rs @@ -274,7 +274,7 @@ pub fn make_string_from_decimal(column: &Arc, row: usize) -> Result() .unwrap(); - array.value_as_string(row) + Ok(array.value_as_string(row)) } fn append_struct_field_string( diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index a20657b58229..f0b9e0076ba1 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -32,5 +32,4 @@ pub mod string_writer; #[cfg(any(test, feature = "test_utils"))] pub mod test_util; -pub use arrow_array::decimal; pub(crate) mod reader_parser; diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 0390b43aaa98..51b09302cdf1 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1176,8 +1176,8 @@ mod tests { let expected = 1..25; - assert_eq!(col.precision().unwrap(), target_precision); - assert_eq!(col.scale().unwrap(), 2); + assert_eq!(col.precision(), target_precision); + assert_eq!(col.scale(), 2); for (i, v) in expected.enumerate() { assert_eq!(col.value(i), v * 100_i128); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 79d9d28095cb..bc68874ebabd 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -583,7 +583,7 @@ fn get_decimal_array_slice( indices: &[usize], ) -> Vec { let mut values = Vec::with_capacity(indices.len()); - let size = decimal_length_from_precision(array.precision().unwrap()); + let size = decimal_length_from_precision(array.precision()); for i in indices { let as_be_bytes = array.value(*i).to_be_bytes(); let resized_value = as_be_bytes[(16 - size)..].to_vec(); From 9c5ba92f144014011b88f8859191708c5641cf1b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 15 Oct 2022 07:49:28 +1300 Subject: [PATCH 0133/1411] Split out arrow-integration-test crate (#2868) * Split out arrow-integration-test crate * Update RAT * Format * Update Markdown format --- .github/workflows/arrow.yml | 3 ++ .github/workflows/dev.yml | 2 +- .github/workflows/integration.yml | 3 +- Cargo.toml | 7 +-- 
arrow-integration-test/Cargo.toml | 48 +++++++++++++++++++ .../data/integration.json | 0 .../src}/datatype.rs | 0 .../src}/field.rs | 2 +- .../src/lib.rs | 8 ++-- .../src}/schema.rs | 2 +- .../Cargo.toml | 1 + .../README.md | 0 .../src/bin/arrow-file-to-stream.rs | 0 .../src/bin/arrow-json-integration-test.rs | 3 +- .../src/bin/arrow-stream-to-file.rs | 0 .../src/bin/flight-test-integration-client.rs | 0 .../src/bin/flight-test-integration-server.rs | 0 .../src/flight_client_scenarios.rs | 0 .../auth_basic_proto.rs | 0 .../integration_test.rs | 0 .../src/flight_client_scenarios/middleware.rs | 0 .../src/flight_server_scenarios.rs | 0 .../auth_basic_proto.rs | 0 .../integration_test.rs | 0 .../src/flight_server_scenarios/middleware.rs | 0 .../src/lib.rs | 4 +- .../tests/ipc_reader.rs | 0 .../tests/ipc_writer.rs | 0 dev/release/rat_exclude_files.txt | 2 +- 29 files changed, 70 insertions(+), 15 deletions(-) create mode 100644 arrow-integration-test/Cargo.toml rename {integration-testing => arrow-integration-test}/data/integration.json (100%) rename {integration-testing/src/util => arrow-integration-test/src}/datatype.rs (100%) rename {integration-testing/src/util => arrow-integration-test/src}/field.rs (99%) rename integration-testing/src/util/mod.rs => arrow-integration-test/src/lib.rs (99%) rename {integration-testing/src/util => arrow-integration-test/src}/schema.rs (99%) rename {integration-testing => arrow-integration-testing}/Cargo.toml (96%) rename {integration-testing => arrow-integration-testing}/README.md (100%) rename {integration-testing => arrow-integration-testing}/src/bin/arrow-file-to-stream.rs (100%) rename {integration-testing => arrow-integration-testing}/src/bin/arrow-json-integration-test.rs (98%) rename {integration-testing => arrow-integration-testing}/src/bin/arrow-stream-to-file.rs (100%) rename {integration-testing => arrow-integration-testing}/src/bin/flight-test-integration-client.rs (100%) rename {integration-testing => arrow-integration-testing}/src/bin/flight-test-integration-server.rs (100%) rename {integration-testing => arrow-integration-testing}/src/flight_client_scenarios.rs (100%) rename {integration-testing => arrow-integration-testing}/src/flight_client_scenarios/auth_basic_proto.rs (100%) rename {integration-testing => arrow-integration-testing}/src/flight_client_scenarios/integration_test.rs (100%) rename {integration-testing => arrow-integration-testing}/src/flight_client_scenarios/middleware.rs (100%) rename {integration-testing => arrow-integration-testing}/src/flight_server_scenarios.rs (100%) rename {integration-testing => arrow-integration-testing}/src/flight_server_scenarios/auth_basic_proto.rs (100%) rename {integration-testing => arrow-integration-testing}/src/flight_server_scenarios/integration_test.rs (100%) rename {integration-testing => arrow-integration-testing}/src/flight_server_scenarios/middleware.rs (100%) rename {integration-testing => arrow-integration-testing}/src/lib.rs (99%) rename {integration-testing => arrow-integration-testing}/tests/ipc_reader.rs (100%) rename {integration-testing => arrow-integration-testing}/tests/ipc_writer.rs (100%) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index c4b975517335..2838a35144c2 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -30,6 +30,7 @@ on: - arrow-buffer/** - arrow-data/** - arrow-schema/** + - arrow-integration-test/** - .github/** jobs: @@ -60,6 +61,8 @@ jobs: run: cargo test -p arrow-schema --all-features - name: Test 
arrow-array with all features run: cargo test -p arrow-array --all-features + - name: Test arrow-integration-test with all features + run: cargo test -p arrow-integration-test --all-features - name: Test arrow run: cargo test -p arrow - name: Test --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,dyn_arith_dict diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 57dc19482761..214a11d5ec80 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -53,5 +53,5 @@ jobs: - name: Prettier check run: | # if you encounter error, run the command below and commit the changes - npx prettier@2.3.2 --write {arrow,arrow-flight,dev,integration-testing,parquet}/**/*.md README.md CODE_OF_CONDUCT.md CONTRIBUTING.md + npx prettier@2.3.2 --write {arrow,arrow-flight,dev,arrow-integration-testing,parquet}/**/*.md README.md CODE_OF_CONDUCT.md CONTRIBUTING.md git diff --exit-code diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 0f183990ed49..bdf576af98d8 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -30,7 +30,8 @@ on: - arrow-data/** - arrow-schema/** - arrow-pyarrow-integration-testing/** - - integration-testing/** + - arrow-integration-test/** + - arrow-integration-testing/** - .github/** jobs: diff --git a/Cargo.toml b/Cargo.toml index 28517265b3c3..e57de7711d71 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,14 +19,15 @@ members = [ "arrow", "arrow-array", - "arrow-data", - "arrow-schema", "arrow-buffer", + "arrow-data", "arrow-flight", + "arrow-integration-test", + "arrow-integration-testing", + "arrow-schema", "parquet", "parquet_derive", "parquet_derive_test", - "integration-testing", "object_store", ] # Enable the version 2 feature resolver, which avoids unifying features for targets that are not being built diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml new file mode 100644 index 000000000000..be54cc4d6239 --- /dev/null +++ b/arrow-integration-test/Cargo.toml @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "arrow-integration-test" +version = "24.0.0" +description = "Support for the Apache Arrow JSON test data format" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_integration_test" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow = { version = "24.0.0", path = "../arrow", default-features = false } +arrow-buffer = { version = "24.0.0", path = "../arrow-buffer" } +hex = { version = "0.4", default-features = false, features = ["std"] } +serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } +serde_json = { version = "1.0", default-features = false, features = ["std"] } +num = { version = "0.4", default-features = false, features = ["std"] } + +[build-dependencies] diff --git a/integration-testing/data/integration.json b/arrow-integration-test/data/integration.json similarity index 100% rename from integration-testing/data/integration.json rename to arrow-integration-test/data/integration.json diff --git a/integration-testing/src/util/datatype.rs b/arrow-integration-test/src/datatype.rs similarity index 100% rename from integration-testing/src/util/datatype.rs rename to arrow-integration-test/src/datatype.rs diff --git a/integration-testing/src/util/field.rs b/arrow-integration-test/src/field.rs similarity index 99% rename from integration-testing/src/util/field.rs rename to arrow-integration-test/src/field.rs index a2becc004d13..9b1a8f5f9ba6 100644 --- a/integration-testing/src/util/field.rs +++ b/arrow-integration-test/src/field.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::util::datatype::{data_type_from_json, data_type_to_json}; +use crate::{data_type_from_json, data_type_to_json}; use arrow::datatypes::{DataType, Field}; use arrow::error::{ArrowError, Result}; use std::collections::BTreeMap; diff --git a/integration-testing/src/util/mod.rs b/arrow-integration-test/src/lib.rs similarity index 99% rename from integration-testing/src/util/mod.rs rename to arrow-integration-test/src/lib.rs index 72ecfaa00f0a..cf7024dc0264 100644 --- a/integration-testing/src/util/mod.rs +++ b/arrow-integration-test/src/lib.rs @@ -15,9 +15,11 @@ // specific language governing permissions and limitations // under the License. -//! Utils for JSON integration testing +//! Support for the [Apache Arrow JSON test data format](https://github.com/apache/arrow/blob/master/docs/source/format/Integration.rst#json-test-data-format) //! //! These utilities define structs that read the integration JSON format for integration testing purposes. +//! +//! 
This is not a canonical format, but provides a human-readable way of verifying language implementations use hex::decode; use num::BigInt; @@ -40,8 +42,8 @@ mod datatype; mod field; mod schema; -use crate::util::datatype::data_type_to_json; -use crate::util::field::field_from_json; +pub use datatype::*; +pub use field::*; pub use schema::*; /// A struct that represents an Arrow file with a schema and record batches diff --git a/integration-testing/src/util/schema.rs b/arrow-integration-test/src/schema.rs similarity index 99% rename from integration-testing/src/util/schema.rs rename to arrow-integration-test/src/schema.rs index 7e3475e6f460..8147589390a3 100644 --- a/integration-testing/src/util/schema.rs +++ b/arrow-integration-test/src/schema.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::util::field::{field_from_json, field_to_json}; +use crate::{field_from_json, field_to_json}; use arrow::datatypes::Schema; use arrow::error::{ArrowError, Result}; use std::collections::HashMap; diff --git a/integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml similarity index 96% rename from integration-testing/Cargo.toml rename to arrow-integration-testing/Cargo.toml index 7b28f399f246..0f43447de4c0 100644 --- a/integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -34,6 +34,7 @@ logging = ["tracing-subscriber"] arrow = { path = "../arrow", default-features = false, features = ["test_utils", "ipc", "ipc_compression", "json"] } arrow-flight = { path = "../arrow-flight", default-features = false } arrow-buffer = { path = "../arrow-buffer", default-features = false } +arrow-integration-test = { path = "../arrow-integration-test", default-features = false } async-trait = { version = "0.1.41", default-features = false } clap = { version = "4", default-features = false, features = ["std", "derive", "help", "error-context", "usage"] } futures = { version = "0.3", default-features = false } diff --git a/integration-testing/README.md b/arrow-integration-testing/README.md similarity index 100% rename from integration-testing/README.md rename to arrow-integration-testing/README.md diff --git a/integration-testing/src/bin/arrow-file-to-stream.rs b/arrow-integration-testing/src/bin/arrow-file-to-stream.rs similarity index 100% rename from integration-testing/src/bin/arrow-file-to-stream.rs rename to arrow-integration-testing/src/bin/arrow-file-to-stream.rs diff --git a/integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs similarity index 98% rename from integration-testing/src/bin/arrow-json-integration-test.rs rename to arrow-integration-testing/src/bin/arrow-json-integration-test.rs index b84680f6f4b3..5eb443b08a85 100644 --- a/integration-testing/src/bin/arrow-json-integration-test.rs +++ b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -20,7 +20,8 @@ use arrow::datatypes::{DataType, Field}; use arrow::error::{ArrowError, Result}; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::FileWriter; -use arrow_integration_testing::{read_json_file, util::*}; +use arrow_integration_test::*; +use arrow_integration_testing::read_json_file; use clap::Parser; use std::fs::File; diff --git a/integration-testing/src/bin/arrow-stream-to-file.rs b/arrow-integration-testing/src/bin/arrow-stream-to-file.rs similarity index 100% rename from integration-testing/src/bin/arrow-stream-to-file.rs rename to 
arrow-integration-testing/src/bin/arrow-stream-to-file.rs diff --git a/integration-testing/src/bin/flight-test-integration-client.rs b/arrow-integration-testing/src/bin/flight-test-integration-client.rs similarity index 100% rename from integration-testing/src/bin/flight-test-integration-client.rs rename to arrow-integration-testing/src/bin/flight-test-integration-client.rs diff --git a/integration-testing/src/bin/flight-test-integration-server.rs b/arrow-integration-testing/src/bin/flight-test-integration-server.rs similarity index 100% rename from integration-testing/src/bin/flight-test-integration-server.rs rename to arrow-integration-testing/src/bin/flight-test-integration-server.rs diff --git a/integration-testing/src/flight_client_scenarios.rs b/arrow-integration-testing/src/flight_client_scenarios.rs similarity index 100% rename from integration-testing/src/flight_client_scenarios.rs rename to arrow-integration-testing/src/flight_client_scenarios.rs diff --git a/integration-testing/src/flight_client_scenarios/auth_basic_proto.rs b/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs similarity index 100% rename from integration-testing/src/flight_client_scenarios/auth_basic_proto.rs rename to arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs diff --git a/integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs similarity index 100% rename from integration-testing/src/flight_client_scenarios/integration_test.rs rename to arrow-integration-testing/src/flight_client_scenarios/integration_test.rs diff --git a/integration-testing/src/flight_client_scenarios/middleware.rs b/arrow-integration-testing/src/flight_client_scenarios/middleware.rs similarity index 100% rename from integration-testing/src/flight_client_scenarios/middleware.rs rename to arrow-integration-testing/src/flight_client_scenarios/middleware.rs diff --git a/integration-testing/src/flight_server_scenarios.rs b/arrow-integration-testing/src/flight_server_scenarios.rs similarity index 100% rename from integration-testing/src/flight_server_scenarios.rs rename to arrow-integration-testing/src/flight_server_scenarios.rs diff --git a/integration-testing/src/flight_server_scenarios/auth_basic_proto.rs b/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs similarity index 100% rename from integration-testing/src/flight_server_scenarios/auth_basic_proto.rs rename to arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs diff --git a/integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs similarity index 100% rename from integration-testing/src/flight_server_scenarios/integration_test.rs rename to arrow-integration-testing/src/flight_server_scenarios/integration_test.rs diff --git a/integration-testing/src/flight_server_scenarios/middleware.rs b/arrow-integration-testing/src/flight_server_scenarios/middleware.rs similarity index 100% rename from integration-testing/src/flight_server_scenarios/middleware.rs rename to arrow-integration-testing/src/flight_server_scenarios/middleware.rs diff --git a/integration-testing/src/lib.rs b/arrow-integration-testing/src/lib.rs similarity index 99% rename from integration-testing/src/lib.rs rename to arrow-integration-testing/src/lib.rs index 2345f1967f24..2edd0ed28389 100644 --- a/integration-testing/src/lib.rs +++ 
b/arrow-integration-testing/src/lib.rs @@ -19,12 +19,11 @@ use serde_json::Value; -use util::*; - use arrow::datatypes::Schema; use arrow::error::Result; use arrow::record_batch::RecordBatch; use arrow::util::test_util::arrow_test_data; +use arrow_integration_test::*; use std::collections::HashMap; use std::fs::File; use std::io::BufReader; @@ -36,7 +35,6 @@ pub const AUTH_PASSWORD: &str = "flight"; pub mod flight_client_scenarios; pub mod flight_server_scenarios; -pub mod util; pub struct ArrowFile { pub schema: Schema, diff --git a/integration-testing/tests/ipc_reader.rs b/arrow-integration-testing/tests/ipc_reader.rs similarity index 100% rename from integration-testing/tests/ipc_reader.rs rename to arrow-integration-testing/tests/ipc_reader.rs diff --git a/integration-testing/tests/ipc_writer.rs b/arrow-integration-testing/tests/ipc_writer.rs similarity index 100% rename from integration-testing/tests/ipc_writer.rs rename to arrow-integration-testing/tests/ipc_writer.rs diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 609a5851cad3..bafee11edb7e 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -4,7 +4,7 @@ target/* dev/release/rat_exclude_files.txt arrow/test/data/* arrow/test/dependency/* -integration-testing/data/* +arrow-integration-test/data/* parquet_derive/test/dependency/* .gitattributes **.gitignore From 1eb19b5394b84eaa0dbb24f65e74018defb3332b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 14 Oct 2022 16:32:01 -0400 Subject: [PATCH 0134/1411] Update version to `25.0.0` and update Changelog (#2877) * Chore: update version to 25.0.0 * Update instructions * Update for changelog * Changelog * Updates * udpates * fix link --- CHANGELOG-old.md | 63 +++++++++++++- CHANGELOG.md | 92 +++++++++++--------- arrow-array/Cargo.toml | 8 +- arrow-buffer/Cargo.toml | 2 +- arrow-data/Cargo.toml | 6 +- arrow-flight/Cargo.toml | 4 +- arrow-flight/README.md | 2 +- arrow-integration-test/Cargo.toml | 6 +- arrow-integration-testing/Cargo.toml | 4 +- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow-schema/Cargo.toml | 2 +- arrow/Cargo.toml | 10 +-- arrow/README.md | 4 +- dev/release/README.md | 8 +- dev/release/update_change_log.sh | 4 +- parquet/Cargo.toml | 6 +- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 19 files changed, 157 insertions(+), 82 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 3305a6cfd2af..3f5c541df903 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,9 +19,68 @@ # Historical Changelog -## [24.0.0](https://github.com/apache/arrow-rs/tree/24.0.0) (2022-09-16) +## [24.0.0](https://github.com/apache/arrow-rs/tree/24.0.0) (2022-09-30) -[Full Changelog](https://github.com/apache/arrow-rs/compare/22.0.0...24.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/23.0.0...24.0.0) + +**Breaking changes:** + +- Cleanup `ArrowNativeType` \(\#1918\) [\#2793](https://github.com/apache/arrow-rs/pull/2793) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove `ArrowNativeType::FromStr` [\#2775](https://github.com/apache/arrow-rs/pull/2775) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out `arrow-array` crate \(\#2594\) [\#2769](https://github.com/apache/arrow-rs/pull/2769) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add `dyn_arith_dict` feature flag [\#2760](https://github.com/apache/arrow-rs/pull/2760) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out `arrow-data` into a separate crate [\#2746](https://github.com/apache/arrow-rs/pull/2746) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-schema \(\#2594\) [\#2711](https://github.com/apache/arrow-rs/pull/2711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Include field name in Parquet PrimitiveTypeBuilder error messages [\#2804](https://github.com/apache/arrow-rs/issues/2804) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add PrimitiveArray::reinterpret\_cast [\#2785](https://github.com/apache/arrow-rs/issues/2785) +- BinaryBuilder and StringBuilder initialization parameters in struct\_builder may be wrong [\#2783](https://github.com/apache/arrow-rs/issues/2783) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add divide scalar dyn kernel which produces null for division by zero [\#2767](https://github.com/apache/arrow-rs/issues/2767) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add divide dyn kernel which produces null for division by zero [\#2763](https://github.com/apache/arrow-rs/issues/2763) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve performance of checked kernels on non-null data [\#2747](https://github.com/apache/arrow-rs/issues/2747) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add overflow-checking variants of arithmetic dyn kernels [\#2739](https://github.com/apache/arrow-rs/issues/2739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- The `binary` function should not panic on unequaled array length. 
[\#2721](https://github.com/apache/arrow-rs/issues/2721) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- min compute kernel is incorrect with sliced buffers in arrow 23 [\#2779](https://github.com/apache/arrow-rs/issues/2779) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `try_unary_dict` should check value type of dictionary array [\#2754](https://github.com/apache/arrow-rs/issues/2754) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Add back JSON import/export for schema [\#2762](https://github.com/apache/arrow-rs/issues/2762) +- null casting and coercion for Decimal128 [\#2761](https://github.com/apache/arrow-rs/issues/2761) +- Json decoder behavior changed from versions 21 to 21 and returns non-sensical num\_rows for RecordBatch [\#2722](https://github.com/apache/arrow-rs/issues/2722) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Release Arrow `23.0.0` \(next release after `22.0.0`\) [\#2665](https://github.com/apache/arrow-rs/issues/2665) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Merged pull requests:** + +- add field name to parquet PrimitiveTypeBuilder error messages [\#2805](https://github.com/apache/arrow-rs/pull/2805) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([andygrove](https://github.com/andygrove)) +- Add struct equality test case \(\#514\) [\#2791](https://github.com/apache/arrow-rs/pull/2791) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move unary kernels to arrow-array \(\#2787\) [\#2789](https://github.com/apache/arrow-rs/pull/2789) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Disable test harness for string\_dictionary\_builder benchmark [\#2788](https://github.com/apache/arrow-rs/pull/2788) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add PrimitiveArray::reinterpret\_cast \(\#2785\) [\#2786](https://github.com/apache/arrow-rs/pull/2786) ([tustvold](https://github.com/tustvold)) +- Fix BinaryBuilder and StringBuilder Capacity Allocation in StructBuilder [\#2784](https://github.com/apache/arrow-rs/pull/2784) ([chunshao90](https://github.com/chunshao90)) +- Fix min/max computation for sliced arrays \(\#2779\) [\#2780](https://github.com/apache/arrow-rs/pull/2780) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix Backwards Compatible Parquet List Encodings \(\#1915\) [\#2774](https://github.com/apache/arrow-rs/pull/2774) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- MINOR: Fix clippy for rust 1.64.0 [\#2772](https://github.com/apache/arrow-rs/pull/2772) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- MINOR: Fix clippy for rust 1.64.0 [\#2771](https://github.com/apache/arrow-rs/pull/2771) ([viirya](https://github.com/viirya)) +- Add divide scalar dyn kernel which produces null for division by zero [\#2768](https://github.com/apache/arrow-rs/pull/2768) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add divide dyn kernel which produces null 
for division by zero [\#2764](https://github.com/apache/arrow-rs/pull/2764) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add value type check in try\_unary\_dict [\#2755](https://github.com/apache/arrow-rs/pull/2755) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix `verify_release_candidate.sh` for new arrow subcrates [\#2752](https://github.com/apache/arrow-rs/pull/2752) ([alamb](https://github.com/alamb)) +- Fix: Issue 2721 : binary function should not panic but return error w… [\#2750](https://github.com/apache/arrow-rs/pull/2750) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([aksharau](https://github.com/aksharau)) +- Speed up checked kernels for non-null data \(~1.4-5x faster\) [\#2749](https://github.com/apache/arrow-rs/pull/2749) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Add overflow-checking variants of arithmetic dyn kernels [\#2740](https://github.com/apache/arrow-rs/pull/2740) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Trim parquet row selection [\#2705](https://github.com/apache/arrow-rs/pull/2705) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) + + + +## [23.0.0](https://github.com/apache/arrow-rs/tree/24.0.0) (2022-09-16) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/22.0.0...23.0.0) **Breaking changes:** diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b334b699816..d69d8705e1c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,62 +19,74 @@ # Changelog -## [24.0.0](https://github.com/apache/arrow-rs/tree/24.0.0) (2022-09-30) +## [25.0.0](https://github.com/apache/arrow-rs/tree/25.0.0) (2022-10-14) -[Full Changelog](https://github.com/apache/arrow-rs/compare/23.0.0...24.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/24.0.0...25.0.0) **Breaking changes:** -- Cleanup `ArrowNativeType` \(\#1918\) [\#2793](https://github.com/apache/arrow-rs/pull/2793) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Remove `ArrowNativeType::FromStr` [\#2775](https://github.com/apache/arrow-rs/pull/2775) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Split out `arrow-array` crate \(\#2594\) [\#2769](https://github.com/apache/arrow-rs/pull/2769) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add `dyn_arith_dict` feature flag [\#2760](https://github.com/apache/arrow-rs/pull/2760) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Split out `arrow-data` into a separate crate [\#2746](https://github.com/apache/arrow-rs/pull/2746) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Split out arrow-schema \(\#2594\) [\#2711](https://github.com/apache/arrow-rs/pull/2711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Make DecimalArray as PrimitiveArray [\#2857](https://github.com/apache/arrow-rs/pull/2857) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- fix timestamp 
parsing while no explicit timezone given [\#2814](https://github.com/apache/arrow-rs/pull/2814) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Support Arbitrary Number of Arrays in downcast\_primitive\_array [\#2809](https://github.com/apache/arrow-rs/pull/2809) ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Include field name in Parquet PrimitiveTypeBuilder error messages [\#2804](https://github.com/apache/arrow-rs/issues/2804) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Add PrimitiveArray::reinterpret\_cast [\#2785](https://github.com/apache/arrow-rs/issues/2785) -- BinaryBuilder and StringBuilder initialization parameters in struct\_builder may be wrong [\#2783](https://github.com/apache/arrow-rs/issues/2783) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add divide scalar dyn kernel which produces null for division by zero [\#2767](https://github.com/apache/arrow-rs/issues/2767) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add divide dyn kernel which produces null for division by zero [\#2763](https://github.com/apache/arrow-rs/issues/2763) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Improve performance of checked kernels on non-null data [\#2747](https://github.com/apache/arrow-rs/issues/2747) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add overflow-checking variants of arithmetic dyn kernels [\#2739](https://github.com/apache/arrow-rs/issues/2739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- The `binary` function should not panic on unequaled array length. [\#2721](https://github.com/apache/arrow-rs/issues/2721) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Restore Integration test JSON schema serialization [\#2876](https://github.com/apache/arrow-rs/issues/2876) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix various invalid\_html\_tags clippy error [\#2861](https://github.com/apache/arrow-rs/issues/2861) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Replace complicated temporal macro with generic functions [\#2851](https://github.com/apache/arrow-rs/issues/2851) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add NaN handling in dyn scalar comparison kernels [\#2829](https://github.com/apache/arrow-rs/issues/2829) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add overflow-checking variant of sum kernel [\#2821](https://github.com/apache/arrow-rs/issues/2821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Update to Clap 4 [\#2817](https://github.com/apache/arrow-rs/issues/2817) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Safe API to Operate on Dictionary Values [\#2797](https://github.com/apache/arrow-rs/issues/2797) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add modulus op into `ArrowNativeTypeOp` [\#2753](https://github.com/apache/arrow-rs/issues/2753) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Allow creating of TimeUnit instances without direct dependency on parquet-format [\#2708](https://github.com/apache/arrow-rs/issues/2708) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Arrow Row Format [\#2677](https://github.com/apache/arrow-rs/issues/2677) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- min compute kernel is incorrect with sliced buffers in arrow 23 [\#2779](https://github.com/apache/arrow-rs/issues/2779) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `try_unary_dict` should check value type of dictionary array [\#2754](https://github.com/apache/arrow-rs/issues/2754) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Don't try to infer nulls in CSV schema inference [\#2859](https://github.com/apache/arrow-rs/issues/2859) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `parquet::arrow::arrow_writer::ArrowWriter` ignores page size properties [\#2853](https://github.com/apache/arrow-rs/issues/2853) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Introducing ArrowNativeTypeOp made it impossible to call kernels from generics [\#2839](https://github.com/apache/arrow-rs/issues/2839) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Unsound ArrayData to Array Conversions [\#2834](https://github.com/apache/arrow-rs/issues/2834) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Regression: `the trait bound for<'de> arrow::datatypes::Schema: serde::de::Deserialize<'de> is not satisfied` [\#2825](https://github.com/apache/arrow-rs/issues/2825) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- convert string to timestamp shouldn't apply local timezone offset if there's no explicit timezone info in the string [\#2813](https://github.com/apache/arrow-rs/issues/2813) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Closed issues:** -- Add back JSON import/export for schema [\#2762](https://github.com/apache/arrow-rs/issues/2762) -- null casting and coercion for Decimal128 [\#2761](https://github.com/apache/arrow-rs/issues/2761) -- Json decoder behavior changed from versions 21 to 21 and returns non-sensical num\_rows for RecordBatch [\#2722](https://github.com/apache/arrow-rs/issues/2722) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Release Arrow `23.0.0` \(next release after `22.0.0`\) [\#2665](https://github.com/apache/arrow-rs/issues/2665) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Add pub api for checking column index is sorted [\#2848](https://github.com/apache/arrow-rs/issues/2848) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Merged pull requests:** -- add field name to parquet PrimitiveTypeBuilder error messages [\#2805](https://github.com/apache/arrow-rs/pull/2805) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([andygrove](https://github.com/andygrove)) -- Add struct equality test case \(\#514\) [\#2791](https://github.com/apache/arrow-rs/pull/2791) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Move unary kernels to arrow-array \(\#2787\) [\#2789](https://github.com/apache/arrow-rs/pull/2789) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Disable test harness for string\_dictionary\_builder benchmark [\#2788](https://github.com/apache/arrow-rs/pull/2788) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add PrimitiveArray::reinterpret\_cast \(\#2785\) 
[\#2786](https://github.com/apache/arrow-rs/pull/2786) ([tustvold](https://github.com/tustvold)) -- Fix BinaryBuilder and StringBuilder Capacity Allocation in StructBuilder [\#2784](https://github.com/apache/arrow-rs/pull/2784) ([chunshao90](https://github.com/chunshao90)) -- Fix min/max computation for sliced arrays \(\#2779\) [\#2780](https://github.com/apache/arrow-rs/pull/2780) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix Backwards Compatible Parquet List Encodings \(\#1915\) [\#2774](https://github.com/apache/arrow-rs/pull/2774) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- MINOR: Fix clippy for rust 1.64.0 [\#2772](https://github.com/apache/arrow-rs/pull/2772) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- MINOR: Fix clippy for rust 1.64.0 [\#2771](https://github.com/apache/arrow-rs/pull/2771) ([viirya](https://github.com/viirya)) -- Add divide scalar dyn kernel which produces null for division by zero [\#2768](https://github.com/apache/arrow-rs/pull/2768) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add divide dyn kernel which produces null for division by zero [\#2764](https://github.com/apache/arrow-rs/pull/2764) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add value type check in try\_unary\_dict [\#2755](https://github.com/apache/arrow-rs/pull/2755) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fix `verify_release_candidate.sh` for new arrow subcrates [\#2752](https://github.com/apache/arrow-rs/pull/2752) ([alamb](https://github.com/alamb)) -- Fix: Issue 2721 : binary function should not panic but return error w… [\#2750](https://github.com/apache/arrow-rs/pull/2750) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([aksharau](https://github.com/aksharau)) -- Speed up checked kernels for non-null data \(~1.4-5x faster\) [\#2749](https://github.com/apache/arrow-rs/pull/2749) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Add overflow-checking variants of arithmetic dyn kernels [\#2740](https://github.com/apache/arrow-rs/pull/2740) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Trim parquet row selection [\#2705](https://github.com/apache/arrow-rs/pull/2705) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Take decimal as primitive \(\#2637\) [\#2869](https://github.com/apache/arrow-rs/pull/2869) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-integration-test crate [\#2868](https://github.com/apache/arrow-rs/pull/2868) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Decimal cleanup \(\#2637\) [\#2865](https://github.com/apache/arrow-rs/pull/2865) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix various invalid\_html\_tags clippy errors [\#2862](https://github.com/apache/arrow-rs/pull/2862) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) +- Don't try to infer nullability in CSV reader [\#2860](https://github.com/apache/arrow-rs/pull/2860) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Fix page size on dictionary fallback [\#2854](https://github.com/apache/arrow-rs/pull/2854) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- Replace complicated temporal macro with generic functions [\#2850](https://github.com/apache/arrow-rs/pull/2850) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- \[feat\] Add pub api for checking column index is sorted. [\#2849](https://github.com/apache/arrow-rs/pull/2849) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- parquet: Add `snap` option to README [\#2847](https://github.com/apache/arrow-rs/pull/2847) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([exyi](https://github.com/exyi)) +- Cleanup cast kernel [\#2846](https://github.com/apache/arrow-rs/pull/2846) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Simplify ArrowNativeType [\#2841](https://github.com/apache/arrow-rs/pull/2841) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Expose ArrowNativeTypeOp trait to make it useful for type bound [\#2840](https://github.com/apache/arrow-rs/pull/2840) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add `interleave` kernel \(\#1523\) [\#2838](https://github.com/apache/arrow-rs/pull/2838) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Handle empty offsets buffer \(\#1824\) [\#2836](https://github.com/apache/arrow-rs/pull/2836) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Validate ArrayData type when converting to Array \(\#2834\) [\#2835](https://github.com/apache/arrow-rs/pull/2835) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Derive ArrowPrimitiveType for Decimal128Type and Decimal256Type \(\#2637\) [\#2833](https://github.com/apache/arrow-rs/pull/2833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add NaN handling in dyn scalar comparison kernels [\#2830](https://github.com/apache/arrow-rs/pull/2830) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Simplify OrderPreservingInterner allocation strategy ~97% faster \(\#2677\) [\#2827](https://github.com/apache/arrow-rs/pull/2827) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Convert rows to arrays \(\#2677\) [\#2826](https://github.com/apache/arrow-rs/pull/2826) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add overflow-checking variant of sum kernel [\#2822](https://github.com/apache/arrow-rs/pull/2822) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update Clap 
dependency to version 4 [\#2819](https://github.com/apache/arrow-rs/pull/2819) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jgoday](https://github.com/jgoday)) +- Fix i256 checked multiplication [\#2818](https://github.com/apache/arrow-rs/pull/2818) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add string\_dictionary benches for row format \(\#2677\) [\#2816](https://github.com/apache/arrow-rs/pull/2816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add OrderPreservingInterner::lookup \(\#2677\) [\#2815](https://github.com/apache/arrow-rs/pull/2815) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Simplify FixedLengthEncoding [\#2812](https://github.com/apache/arrow-rs/pull/2812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement ArrowNumericType for Float16Type [\#2810](https://github.com/apache/arrow-rs/pull/2810) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add DictionaryArray::with\_values to make it easier to operate on dictionary values [\#2798](https://github.com/apache/arrow-rs/pull/2798) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add i256 \(\#2637\) [\#2781](https://github.com/apache/arrow-rs/pull/2781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add modulus ops into `ArrowNativeTypeOp` [\#2756](https://github.com/apache/arrow-rs/pull/2756) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- feat: cast List / LargeList to Utf8 / LargeUtf8 [\#2588](https://github.com/apache/arrow-rs/pull/2588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gandronchik](https://github.com/gandronchik)) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 30fa311b243c..accc1d3e69eb 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-array" -version = "24.0.0" +version = "25.0.0" description = "Array abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,9 +45,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "24.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "24.0.0", path = "../arrow-schema" } -arrow-data = { version = "24.0.0", path = "../arrow-data" } +arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "25.0.0", path = "../arrow-schema" } +arrow-data = { version = "25.0.0", path = "../arrow-data" } chrono = { version = "0.4", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.0", default-features = false } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index af0bd9861dad..f2b4dba509e3 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "24.0.0" +version = "25.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" 
repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index 4dbc5fa3f1c1..be477802622d 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-data" -version = "24.0.0" +version = "25.0.0" description = "Array data abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,8 +45,8 @@ force_validate = [] [dependencies] -arrow-buffer = { version = "24.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "24.0.0", path = "../arrow-schema" } +arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "25.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.0", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 7247679213f5..1f696f5387b5 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "24.0.0" +version = "25.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,7 +27,7 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow = { path = "../arrow", version = "24.0.0", default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "25.0.0", default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 4ebf98c06ce2..f09977263a0a 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "24.0.0" +arrow-flight = "25.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. 
diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index be54cc4d6239..8d8c0fda916e 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-integration-test" -version = "24.0.0" +version = "25.0.0" description = "Support for the Apache Arrow JSON test data format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,8 +38,8 @@ path = "src/lib.rs" bench = false [dependencies] -arrow = { version = "24.0.0", path = "../arrow", default-features = false } -arrow-buffer = { version = "24.0.0", path = "../arrow-buffer" } +arrow = { version = "25.0.0", path = "../arrow", default-features = false } +arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 0f43447de4c0..13088d3dfe6a 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -17,8 +17,8 @@ [package] name = "arrow-integration-testing" -description = "Binaries used in the Arrow integration tests" -version = "24.0.0" +description = "Binaries used in the Arrow integration tests (NOT PUBLISHED TO crates.io)" +version = "25.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 3e32aab98453..955d311a7900 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "24.0.0" +version = "25.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "24.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "25.0.0", features = ["pyarrow"] } pyo3 = { version = "0.17", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index d052c9061600..fae422b77be4 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-schema" -version = "24.0.0" +version = "25.0.0" description = "Defines the logical types for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index a7726b96ce49..134d274a8393 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "24.0.0" +version = "25.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,10 +44,10 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "24.0.0", path = "../arrow-buffer" } -arrow-data = { version 
= "24.0.0", path = "../arrow-data" } -arrow-schema = { version = "24.0.0", path = "../arrow-schema" } -arrow-array = { version = "24.0.0", path = "../arrow-array" } +arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" } +arrow-data = { version = "25.0.0", path = "../arrow-data" } +arrow-schema = { version = "25.0.0", path = "../arrow-schema" } +arrow-array = { version = "25.0.0", path = "../arrow-array" } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } indexmap = { version = "1.9", default-features = false, features = ["std"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } diff --git a/arrow/README.md b/arrow/README.md index ade41311c6c8..c687a205a2ae 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `24.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `25.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. ## Feature Flags @@ -62,7 +62,7 @@ The [Apache Arrow Status](https://arrow.apache.org/docs/status.html) page lists ## Safety -Arrow seeks to uphold the Rust Soundness Pledge as articulated eloquently [here](https://raphlinus.github.io/rust/24.0.01/18/soundness-pledge.html). Specifically: +Arrow seeks to uphold the Rust Soundness Pledge as articulated eloquently [here](https://raphlinus.github.io/rust/2020/01/18/soundness-pledge.html). Specifically: > The intent of this crate is to be free of soundness bugs. The developers will do their best to avoid them, and welcome help in analyzing and fixing them diff --git a/dev/release/README.md b/dev/release/README.md index 18542c5f603d..6392716371e1 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. -sed -i '' -e 's/14.0.0/24.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/25.0.0/g' `find . 
-name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md @@ -78,13 +78,16 @@ git commit -a -m 'Update version' # ensure your github token is available export ARROW_GITHUB_API_TOKEN= + # manully edit ./dev/release/update_change_log.sh to reflect the release version # create the changelog ./dev/release/update_change_log.sh # run automated script to copy labels to issues based on referenced PRs -# (NOTE this must be done by a committer / other who has +# (NOTE 1: this must be done by a committer / other who has # write access to the repository) +# +# NOTE 2: this must be done after creating the initial CHANGELOG file python dev/release/label_issues.py # review change log / edit issues and labels if needed, rerun @@ -257,6 +260,7 @@ Rust Arrow Crates: (cd arrow-flight && cargo publish) (cd parquet && cargo publish) (cd parquet_derive && cargo publish) +(cd arrow-integration-test && cargo publish) ``` `object_store` diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 67f772d37d88..1dc45115678d 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="23.0.0" -FUTURE_RELEASE="24.0.0" +SINCE_TAG="24.0.0" +FUTURE_RELEASE="25.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index f47f556b257f..819f41bca32a 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "24.0.0" +version = "25.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -41,7 +41,7 @@ zstd = { version = "0.11.1", optional = true, default-features = false } chrono = { version = "0.4", default-features = false, features = ["alloc"] } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } -arrow = { path = "../arrow", version = "24.0.0", optional = true, default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "25.0.0", optional = true, default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false, features = ["std"], optional = true } clap = { version = "4", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } @@ -61,7 +61,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.11", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "24.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "25.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } [package.metadata.docs.rs] all-features = true diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index 2fc6f55afbb7..5665038eb200 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "24.0.0" +version = "25.0.0" license = "Apache-2.0" description = 
"Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", default-features = false } -parquet = { path = "../parquet", version = "24.0.0" } +parquet = { path = "../parquet", version = "25.0.0" } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index 907a71432b1a..12ba2d98e130 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "24.0.0" -parquet_derive = "24.0.0" +parquet = "25.0.0" +parquet_derive = "25.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index 78207399fd0e..0c2758fdc290 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "24.0.0" +version = "25.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "24.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "24.0.0", default-features = false } +parquet = { path = "../parquet", version = "25.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "25.0.0", default-features = false } chrono = { version="0.4.19", default-features = false, features = [ "clock" ] } From 4d7d411af340bb768afe3ebf590cd6c425dbc064 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 15 Oct 2022 10:28:51 +1300 Subject: [PATCH 0135/1411] Don't validate decimal precision in ArrayData (#2637) (#2873) * Don't validate decimal precision in ArrayData (#2637) * Format --- arrow-data/src/data.rs | 19 ------------------- arrow/tests/array_validation.rs | 11 ++++++++--- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 37c059748fe7..b53e9f0af4de 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -18,9 +18,6 @@ //! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates //! common attributes and operations for Arrow array. 
-use crate::decimal::{ - validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, -}; use crate::{bit_iterator::BitSliceIterator, bitmap::Bitmap}; use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode}; @@ -1004,22 +1001,6 @@ impl ArrayData { pub fn validate_values(&self) -> Result<(), ArrowError> { match &self.data_type { - DataType::Decimal128(p, _) => { - let values_buffer: &[i128] = self.typed_buffer(0, self.len)?; - for value in values_buffer { - validate_decimal_precision(*value, *p)?; - } - Ok(()) - } - DataType::Decimal256(p, _) => { - let values = self.buffers()[0].as_slice(); - for pos in 0..self.len() { - let offset = pos * 32; - let raw_bytes = &values[offset..offset + 32]; - validate_decimal256_precision_with_lt_bytes(raw_bytes, *p)?; - } - Ok(()) - } DataType::Utf8 => self.validate_utf8::(), DataType::LargeUtf8 => self.validate_utf8::(), DataType::Binary => self.validate_offsets_full::(self.buffers[1].len()), diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index f4dcda2e8de9..16f031a1eb15 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -20,6 +20,7 @@ use arrow::array::{ Int32Array, Int32Builder, Int64Array, StringArray, StructBuilder, UInt64Array, UInt8Builder, }; +use arrow_array::Decimal128Array; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_data::ArrayData; use arrow_schema::{DataType, Field, UnionMode}; @@ -1038,7 +1039,6 @@ fn test_string_data_from_foreign() { } #[test] -#[cfg(not(feature = "force_validate"))] fn test_decimal_full_validation() { let values_builder = UInt8Builder::with_capacity(10); let byte_width = 16; @@ -1055,8 +1055,13 @@ fn test_decimal_full_validation() { .len(fixed_size_array.len()) .add_buffer(fixed_size_array.data_ref().child_data()[0].buffers()[0].clone()); let array_data = unsafe { builder.build_unchecked() }; - let validation_result = array_data.validate_full(); - let error = validation_result.unwrap_err(); + array_data.validate_full().unwrap(); + + let array = Decimal128Array::from(array_data); + let error = array + .validate_decimal_precision(array.precision()) + .unwrap_err(); + assert_eq!( "Invalid argument error: 123456 is too large to store in a Decimal128 of precision 5. 
Max is 99999", error.to_string() From c7f7606361d10299f68385e70555257f3503f1cb Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 15 Oct 2022 12:15:09 -0700 Subject: [PATCH 0136/1411] Fix compilation error under `chrono-tz` feature (#2879) * Fix compilation error * Add compilation check --- .github/workflows/arrow.yml | 5 ++++- arrow/src/compute/kernels/cast.rs | 9 +++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 2838a35144c2..613f52f870f5 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -111,9 +111,12 @@ jobs: - name: Check compilation --no-default-features --all-targets --features test_utils run: | cargo check -p arrow --no-default-features --all-targets --features test_utils - - name: Check compilation --no-default-features --all-targets --features --ffi + - name: Check compilation --no-default-features --all-targets --features ffi run: | cargo check -p arrow --no-default-features --all-targets --features ffi + - name: Check compilation --no-default-features --all-targets --features chrono-tz + run: | + cargo check -p arrow --no-default-features --all-targets --features chrono-tz # test the --features "simd" of the arrow crate. This requires nightly Rust. linux-test-simd: diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index b05e4c4ba7f4..66a04e91ed30 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -5640,10 +5640,9 @@ mod tests { /// Creates a dictionary with primitive dictionary values, and keys of type K #[cfg(feature = "chrono-tz")] fn make_dictionary_primitive() -> ArrayRef { - let keys_builder = PrimitiveBuilder::::new(); // Pick Int32 arbitrarily for dictionary values - let values_builder = PrimitiveBuilder::::new(); - let mut b = PrimitiveDictionaryBuilder::new(keys_builder, values_builder); + let mut b: PrimitiveDictionaryBuilder = + PrimitiveDictionaryBuilder::new(); b.append(1).unwrap(); b.append(2).unwrap(); Arc::new(b.finish()) @@ -5652,10 +5651,8 @@ mod tests { /// Creates a dictionary with utf8 values, and keys of type K #[cfg(feature = "chrono-tz")] fn make_dictionary_utf8() -> ArrayRef { - let keys_builder = PrimitiveBuilder::::new(); // Pick Int32 arbitrarily for dictionary values - let values_builder = StringBuilder::new(); - let mut b = StringDictionaryBuilder::new(keys_builder, values_builder); + let mut b: StringDictionaryBuilder = StringDictionaryBuilder::new(); b.append("foo").unwrap(); b.append("bar").unwrap(); Arc::new(b.finish()) From f055f51c9d80799c54b68f790cf6656ce5ad090a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 16 Oct 2022 08:54:14 +1300 Subject: [PATCH 0137/1411] Validate decimal IPC read (#2387) (#2880) --- arrow/src/ipc/reader.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs index a784f54e20ca..63c587455d5a 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow/src/ipc/reader.rs @@ -486,16 +486,12 @@ fn create_primitive_array( .unwrap(), Decimal128(_, _) | Decimal256(_, _) => { // read 2 buffers: null buffer (optional) and data buffer - let builder = ArrayData::builder(data_type.clone()) + ArrayData::builder(data_type.clone()) .len(length) .add_buffer(buffers[1].clone()) - .null_bit_buffer(null_buffer); - - // Don't validate the decimal array so far, - // becasue validating decimal is some what complicated - // and 
there is no conclusion on whether we should do it. - // For more infomation, please look at https://github.com/apache/arrow-rs/issues/2387 - unsafe { builder.build_unchecked() } + .null_bit_buffer(null_buffer) + .build() + .unwrap() } t => unreachable!("Data type {:?} either unsupported or not primitive", t), }; From ede36d7ab1c144ac27a0c78a99668a1af2ae6413 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 16 Oct 2022 09:52:03 +1300 Subject: [PATCH 0138/1411] Filter DecimalArray as PrimitiveArray ~80% Faster (#2637) (#2870) * Filter DecimalArray as PrimitiveArray (#2637) * Add decimal filter benches * Format --- arrow/benches/filter_kernels.rs | 24 ++++++++++++++++++++++++ arrow/src/compute/kernels/filter.rs | 10 ++++++++++ 2 files changed, 34 insertions(+) diff --git a/arrow/benches/filter_kernels.rs b/arrow/benches/filter_kernels.rs index bd6129946630..9dd3e7ebba09 100644 --- a/arrow/benches/filter_kernels.rs +++ b/arrow/benches/filter_kernels.rs @@ -26,6 +26,7 @@ use arrow::array::*; use arrow::compute::filter; use arrow::datatypes::{Field, Float32Type, Int32Type, Schema, UInt8Type}; +use arrow_array::types::Decimal128Type; use criterion::{criterion_group, criterion_main, Criterion}; fn bench_filter(data_array: &dyn Array, filter_array: &BooleanArray) { @@ -143,6 +144,29 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_built_filter(&sparse_filter, &data_array)) }); + let data_array = create_primitive_array::(size, 0.0); + c.bench_function("filter decimal128 (kept 1/2)", |b| { + b.iter(|| bench_filter(&data_array, &filter_array)) + }); + c.bench_function("filter decimal128 high selectivity (kept 1023/1024)", |b| { + b.iter(|| bench_filter(&data_array, &dense_filter_array)) + }); + c.bench_function("filter decimal128 low selectivity (kept 1/1024)", |b| { + b.iter(|| bench_filter(&data_array, &sparse_filter_array)) + }); + + c.bench_function("filter context decimal128 (kept 1/2)", |b| { + b.iter(|| bench_built_filter(&filter, &data_array)) + }); + c.bench_function( + "filter context decimal128 high selectivity (kept 1023/1024)", + |b| b.iter(|| bench_built_filter(&dense_filter, &data_array)), + ); + c.bench_function( + "filter context decimal128 low selectivity (kept 1/1024)", + |b| b.iter(|| bench_built_filter(&sparse_filter, &data_array)), + ); + let data_array = create_string_array::(size, 0.5); c.bench_function("filter context string (kept 1/2)", |b| { b.iter(|| bench_built_filter(&filter, &data_array)) diff --git a/arrow/src/compute/kernels/filter.rs b/arrow/src/compute/kernels/filter.rs index d1e2ad17593d..150253b1c0de 100644 --- a/arrow/src/compute/kernels/filter.rs +++ b/arrow/src/compute/kernels/filter.rs @@ -338,6 +338,16 @@ fn filter_array(values: &dyn Array, predicate: &FilterPredicate) -> Result downcast_primitive_array! 
{ values => Ok(Arc::new(filter_primitive(values, predicate))), + DataType::Decimal128(p, s) => { + let values = values.as_any().downcast_ref::().unwrap(); + let filtered = filter_primitive(values, predicate); + Ok(Arc::new(filtered.with_precision_and_scale(*p, *s).unwrap())) + } + DataType::Decimal256(p, s) => { + let values = values.as_any().downcast_ref::().unwrap(); + let filtered = filter_primitive(values, predicate); + Ok(Arc::new(filtered.with_precision_and_scale(*p, *s).unwrap())) + } DataType::Boolean => { let values = values.as_any().downcast_ref::().unwrap(); Ok(Arc::new(filter_boolean(values, predicate))) From a3effc19cc13d5612ffcca5c04c44dee0995dd46 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 16 Oct 2022 19:14:30 +1300 Subject: [PATCH 0139/1411] Increase default IPC alignment to 64 (#2883) (#2884) * Increase default IPC alignment to 64 (#2883) * Update test --- arrow/src/ipc/writer.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arrow/src/ipc/writer.rs b/arrow/src/ipc/writer.rs index 63f1520a5e9c..4f40574ab12e 100644 --- a/arrow/src/ipc/writer.rs +++ b/arrow/src/ipc/writer.rs @@ -46,7 +46,7 @@ use ipc::CONTINUATION_MARKER; #[derive(Debug, Clone)] pub struct IpcWriteOptions { /// Write padding after memory buffers to this multiple of bytes. - /// Generally 8 or 64, defaults to 8 + /// Generally 8 or 64, defaults to 64 alignment: usize, /// The legacy format is for releases before 0.15.0, and uses metadata V4 write_legacy_ipc_format: bool, @@ -132,7 +132,7 @@ impl IpcWriteOptions { impl Default for IpcWriteOptions { fn default() -> Self { Self { - alignment: 8, + alignment: 64, write_legacy_ipc_format: false, metadata_version: ipc::MetadataVersion::V5, batch_compression_type: None, @@ -788,7 +788,8 @@ impl StreamWriter { /// /// ``` /// # use arrow::datatypes::Schema; - /// # use arrow::ipc::writer::StreamWriter; + /// # use arrow::ipc::writer::{StreamWriter, IpcWriteOptions}; + /// # use arrow::ipc::MetadataVersion; /// # use arrow::error::ArrowError; /// # fn main() -> Result<(), ArrowError> { /// // The result we expect from an empty schema @@ -807,7 +808,8 @@ impl StreamWriter { /// /// let schema = Schema::new(vec![]); /// let buffer: Vec = Vec::new(); - /// let stream_writer = StreamWriter::try_new(buffer, &schema)?; + /// let options = IpcWriteOptions::try_new(8, false, MetadataVersion::V5)?; + /// let stream_writer = StreamWriter::try_new_with_options(buffer, &schema, options)?; /// /// assert_eq!(stream_writer.into_inner()?, expected); /// # Ok(()) From bfd87bd8d26d3d3d4851ea0e93035de28e59681e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 16 Oct 2022 12:35:01 -0700 Subject: [PATCH 0140/1411] Copying inappropriately aligned buffer in ipc reader (#2883) * Fix ptr alignment error. * Rewrite test * Add values * Copy buffer if it is not aligned properly * Move to a function * Cover IntervalMonthDayNanoType too * Remove unnecessary change * For review * Make it generic for i256 * Lift into parent match and use minimum length. --- arrow/src/ipc/reader.rs | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs index 63c587455d5a..cc45b22373de 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow/src/ipc/reader.rs @@ -20,6 +20,7 @@ //! The `FileReader` and `StreamReader` have similar interfaces, //! 
however the `FileReader` expects a reader that supports `Seek`ing +use arrow_buffer::i256; use std::collections::HashMap; use std::fmt; use std::io::{BufReader, Read, Seek, SeekFrom}; @@ -477,18 +478,30 @@ fn create_primitive_array( | Timestamp(_, _) | Date64 | Duration(_) - | Interval(IntervalUnit::DayTime) - | Interval(IntervalUnit::MonthDayNano) => ArrayData::builder(data_type.clone()) + | Interval(IntervalUnit::DayTime) => ArrayData::builder(data_type.clone()) .len(length) .add_buffer(buffers[1].clone()) .null_bit_buffer(null_buffer) .build() .unwrap(), - Decimal128(_, _) | Decimal256(_, _) => { + Interval(IntervalUnit::MonthDayNano) | Decimal128(_, _) => { + let buffer = get_aligned_buffer::(&buffers[1], length); + // read 2 buffers: null buffer (optional) and data buffer ArrayData::builder(data_type.clone()) .len(length) - .add_buffer(buffers[1].clone()) + .add_buffer(buffer) + .null_bit_buffer(null_buffer) + .build() + .unwrap() + } + Decimal256(_, _) => { + let buffer = get_aligned_buffer::(&buffers[1], length); + + // read 2 buffers: null buffer (optional) and data buffer + ArrayData::builder(data_type.clone()) + .len(length) + .add_buffer(buffer) .null_bit_buffer(null_buffer) .build() .unwrap() @@ -499,6 +512,24 @@ fn create_primitive_array( make_array(array_data) } +/// Checks if given `Buffer` is properly aligned with `T`. +/// If not, copying the data and padded it for alignment. +fn get_aligned_buffer(buffer: &Buffer, length: usize) -> Buffer { + let ptr = buffer.as_ptr(); + let align_req = std::mem::align_of::(); + let align_offset = ptr.align_offset(align_req); + // The buffer is not aligned properly. The writer might use a smaller alignment + // e.g. 8 bytes, but on some platform (e.g. ARM) i128 requires 16 bytes alignment. + // We need to copy the buffer as fallback. + if align_offset != 0 { + let len_in_bytes = (length * std::mem::size_of::()).min(buffer.len()); + let slice = &buffer.as_slice()[0..len_in_bytes]; + Buffer::from_slice_ref(&slice) + } else { + buffer.clone() + } +} + /// Reads the correct number of buffers based on list type and null_count, and creates a /// list array ref fn create_list_array( From 17d1aade3572d1609cf6ed0e3db15f3d68511460 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 18 Oct 2022 07:40:19 +1300 Subject: [PATCH 0141/1411] Improve row format docs (#2888) * Improve row format docs * Format --- arrow/src/row/mod.rs | 85 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 19 deletions(-) diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index f604f65706d5..8d6732054412 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -16,6 +16,67 @@ // under the License. //! A comparable row-oriented representation of a collection of [`Array`] +//! +//! As [`Row`] are [normalized for sorting], they can be very efficiently [compared](PartialOrd), +//! using [`memcmp`] under the hood, or used in [non-comparison sorts] such as [radix sort]. This +//! makes the row format ideal for implementing efficient multi-column sorting, +//! grouping, aggregation, windowing and more. +//! +//! _Comparing [`Rows`] generated by different [`RowConverter`] is not guaranteed to +//! yield a meaningful ordering_ +//! ``` +//! # use std::sync::Arc; +//! # use arrow::row::{RowConverter, SortField}; +//! # use arrow_array::{ArrayRef, Int32Array, StringArray}; +//! # use arrow_array::cast::{as_primitive_array, as_string_array}; +//! # use arrow_array::types::Int32Type; +//! 
# use arrow_schema::DataType; +//! +//! let a1 = Arc::new(Int32Array::from_iter_values([-1, -1, 0, 3, 3])) as ArrayRef; +//! let a2 = Arc::new(StringArray::from_iter_values(["a", "b", "c", "d", "d"])) as ArrayRef; +//! let arrays = vec![a1, a2]; +//! +//! // Convert arrays to rows +//! let mut converter = RowConverter::new(vec![ +//! SortField::new(DataType::Int32), +//! SortField::new(DataType::Utf8), +//! ]); +//! let rows = converter.convert_columns(&arrays).unwrap(); +//! +//! // Compare rows +//! for i in 0..4 { +//! assert!(rows.row(i) <= rows.row(i + 1)); +//! } +//! assert_eq!(rows.row(3), rows.row(4)); +//! +//! // Convert rows back to arrays +//! let converted = converter.convert_rows(&rows).unwrap(); +//! assert_eq!(arrays, converted); +//! +//! // Compare rows from different arrays +//! let a1 = Arc::new(Int32Array::from_iter_values([3, 4])) as ArrayRef; +//! let a2 = Arc::new(StringArray::from_iter_values(["e", "f"])) as ArrayRef; +//! let arrays = vec![a1, a2]; +//! let rows2 = converter.convert_columns(&arrays).unwrap(); +//! +//! assert!(rows.row(4) < rows2.row(0)); +//! assert!(rows.row(4) < rows2.row(1)); +//! +//! // Convert selection of rows back to arrays +//! let selection = [rows.row(0), rows2.row(1), rows.row(2), rows2.row(0)]; +//! let converted = converter.convert_rows(selection).unwrap(); +//! let c1 = as_primitive_array::(converted[0].as_ref()); +//! assert_eq!(c1.values(), &[-1, 4, 0, 3]); +//! +//! let c2 = as_string_array(converted[1].as_ref()); +//! let c2_values: Vec<_> = c2.iter().flatten().collect(); +//! assert_eq!(&c2_values, &["a", "f", "c", "e"]); +//! ``` +//! +//! [non-comparison sorts]:[https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts] +//! [radix sort]:[https://en.wikipedia.org/wiki/Radix_sort] +//! [normalized for sorting]:[https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf] +//! [`memcmp`]:[https://www.man7.org/linux/man-pages/man3/memcmp.3.html] use std::cmp::Ordering; use std::hash::{Hash, Hasher}; @@ -43,14 +104,7 @@ mod fixed; mod interner; mod variable; -/// Converts [`ArrayRef`] columns into a row-oriented format that are [normalized for sorting]. -/// -/// In particular, a byte-wise comparison of the rows, e.g. [`memcmp`], is sufficient -/// to establish the ordering of two rows, allowing for extremely fast comparisons, -/// and permitting the use of [non-comparison sorts] such as [radix sort] -/// -/// Comparing [`Rows`] generated by different [`RowConverter`] is not guaranteed to -/// yield a meaningful ordering +/// Converts [`ArrayRef`] columns into a row-oriented format. /// /// # Format /// @@ -130,17 +184,6 @@ mod variable; /// /// The order of a given column can be reversed by negating the encoded bytes of non-null values /// -/// ## Reconstruction -/// -/// Given a schema it would theoretically be possible to reconstruct the columnar data from -/// the row format, however, this is currently not implemented. 
It is recommended that the row -/// format is instead used to obtain a sorted list of row indices, which can then be used -/// with [`take`](crate::compute::take) to obtain a sorted [`Array`] -/// -/// [non-comparison sorts]:[https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts] -/// [radix sort]:[https://en.wikipedia.org/wiki/Radix_sort] -/// [normalized for sorting]:[https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf] -/// [`memcmp`]:[https://www.man7.org/linux/man-pages/man3/memcmp.3.html] /// [COBS]:[https://en.wikipedia.org/wiki/Consistent_Overhead_Byte_Stuffing] /// [byte stuffing]:[https://en.wikipedia.org/wiki/High-Level_Data_Link_Control#Asynchronous_framing] #[derive(Debug)] @@ -307,6 +350,10 @@ impl Rows { pub fn num_rows(&self) -> usize { self.offsets.len() - 1 } + + pub fn iter(&self) -> RowsIter<'_> { + self.into_iter() + } } impl<'a> IntoIterator for &'a Rows { From 02cab5443c0703ce5fa86647b834f184bba172ba Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 18 Oct 2022 15:49:01 +1300 Subject: [PATCH 0142/1411] Add downcast_integer and downcast_primitive (#2872) * Add downcast_integer and downcast_primitive * Fix doc * Review feedback --- arrow-array/src/cast.rs | 408 ++++++++++++++++-------------------- arrow/src/row/dictionary.rs | 67 +----- arrow/src/row/mod.rs | 81 +------ 3 files changed, 204 insertions(+), 352 deletions(-) diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 6eb5407966f1..e4e290501443 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -24,221 +24,221 @@ use crate::types::*; #[doc(hidden)] #[macro_export] macro_rules! repeat_pat { - ($e:pat, $v_:ident) => { + ($e:pat, $v_:expr) => { $e }; - ($e:pat, $v_:ident $(, $tail:ident)+) => { + ($e:pat, $v_:expr $(, $tail:expr)+) => { ($e, $crate::repeat_pat!($e $(, $tail)+)) } } -/// Downcast an [`Array`] to a [`PrimitiveArray`] based on its [`DataType`] -/// accepts a number of subsequent patterns to match the data type +/// Given one or more expressions evaluating to an integer [`DataType`] invokes the provided macro +/// `m` with the corresponding integer [`ArrowPrimitiveType`], followed by any additional arguments /// /// ``` -/// # use arrow_array::{Array, downcast_primitive_array, cast::as_string_array}; +/// # use arrow_array::{downcast_primitive, ArrowPrimitiveType, downcast_integer}; /// # use arrow_schema::DataType; /// -/// fn print_primitive(array: &dyn Array) { -/// downcast_primitive_array!( -/// array => { -/// for v in array { -/// println!("{:?}", v); -/// } -/// } -/// DataType::Utf8 => { -/// for v in as_string_array(array) { -/// println!("{:?}", v); -/// } -/// } -/// t => println!("Unsupported datatype {}", t) -/// ) +/// macro_rules! dictionary_key_size_helper { +/// ($t:ty, $o:ty) => { +/// std::mem::size_of::<<$t as ArrowPrimitiveType>::Native>() as $o +/// }; +/// } +/// +/// fn dictionary_key_size(t: &DataType) -> u8 { +/// match t { +/// DataType::Dictionary(k, _) => downcast_integer! 
{ +/// k.as_ref() => (dictionary_key_size_helper, u8), +/// _ => unreachable!(), +/// }, +/// _ => u8::MAX, +/// } /// } +/// +/// assert_eq!(dictionary_key_size(&DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))), 4); +/// assert_eq!(dictionary_key_size(&DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8))), 8); +/// assert_eq!(dictionary_key_size(&DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8))), 2); /// ``` /// /// [`DataType`]: arrow_schema::DataType #[macro_export] -macro_rules! downcast_primitive_array { - ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { - downcast_primitive_array!($values => {$e} $($p => $fallback)*) - }; - (($($values:ident),+) => $e:block $($($p:pat),+ => $fallback:expr $(,)*)*) => { - $crate::downcast_primitive_array!($($values),+ => $e $($($p),+ => $fallback)*) - }; - (($($values:ident),+) => $e:block $(($($p:pat),+) => $fallback:expr $(,)*)*) => { - $crate::downcast_primitive_array!($($values),+ => $e $($($p),+ => $fallback)*) - }; - ($($values:ident),+ => $e:block $($($p:pat),+ => $fallback:expr $(,)*)*) => { - match ($($values.data_type()),+) { - $crate::repeat_pat!(arrow_schema::DataType::Int8, $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::Int8Type, - >($values);)+ - $e +macro_rules! downcast_integer { + ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($($p:pat),+ => $fallback:expr $(,)*)*) => { + match ($($data_type),+) { + $crate::repeat_pat!(arrow_schema::DataType::Int8, $($data_type),+) => { + $m!($crate::types::Int8Type $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Int16, $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::Int16Type, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Int16, $($data_type),+) => { + $m!($crate::types::Int16Type $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Int32, $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::Int32Type, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Int32, $($data_type),+) => { + $m!($crate::types::Int32Type $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Int64, $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::Int64Type, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Int64, $($data_type),+) => { + $m!($crate::types::Int64Type $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::UInt8, $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::UInt8Type, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::UInt8, $($data_type),+) => { + $m!($crate::types::UInt8Type $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::UInt16, $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::UInt16Type, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::UInt16, $($data_type),+) => { + $m!($crate::types::UInt16Type $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::UInt32, $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::UInt32Type, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::UInt32, $($data_type),+) => { + $m!($crate::types::UInt32Type $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::UInt64, $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - 
$crate::types::UInt64Type, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::UInt64, $($data_type),+) => { + $m!($crate::types::UInt64Type $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Float16, $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::Float16Type, - >($values);)+ - $e + $(($($p),+) => $fallback,)* + } + }; +} + +/// Given one or more expressions evaluating to primitive [`DataType`] invokes the provided macro +/// `m` with the corresponding [`ArrowPrimitiveType`], followed by any additional arguments +/// +/// ``` +/// # use arrow_array::{downcast_primitive, ArrowPrimitiveType}; +/// # use arrow_schema::DataType; +/// +/// macro_rules! primitive_size_helper { +/// ($t:ty, $o:ty) => { +/// std::mem::size_of::<<$t as ArrowPrimitiveType>::Native>() as $o +/// }; +/// } +/// +/// fn primitive_size(t: &DataType) -> u8 { +/// downcast_primitive! { +/// t => (primitive_size_helper, u8), +/// _ => u8::MAX +/// } +/// } +/// +/// assert_eq!(primitive_size(&DataType::Int32), 4); +/// assert_eq!(primitive_size(&DataType::Int64), 8); +/// assert_eq!(primitive_size(&DataType::Float16), 2); +/// ``` +/// +/// [`DataType`]: arrow_schema::DataType +#[macro_export] +macro_rules! downcast_primitive { + ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($($p:pat),+ => $fallback:expr $(,)*)*) => { + $crate::downcast_integer! { + $($data_type),+ => ($m $(, $args)*), + $crate::repeat_pat!(arrow_schema::DataType::Float16, $($data_type),+) => { + $m!($crate::types::Float16Type $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Float32, $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::Float32Type, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Float32, $($data_type),+) => { + $m!($crate::types::Float32Type $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Float64, $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::Float64Type, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Float64, $($data_type),+) => { + $m!($crate::types::Float64Type $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Date32, $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::Date32Type, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Date32, $($data_type),+) => { + $m!($crate::types::Date32Type $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Date64, $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::Date64Type, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Date64, $($data_type),+) => { + $m!($crate::types::Date64Type $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::Time32SecondType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second), $($data_type),+) => { + $m!($crate::types::Time32SecondType $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::Time32MillisecondType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond), $($data_type),+) => { + $m!($crate::types::Time32MillisecondType $(, $args)*) } 
- $crate::repeat_pat!(arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::Time64MicrosecondType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond), $($data_type),+) => { + $m!($crate::types::Time64MicrosecondType $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::Time64NanosecondType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond), $($data_type),+) => { + $m!($crate::types::Time64NanosecondType $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::TimestampSecondType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _), $($data_type),+) => { + $m!($crate::types::TimestampSecondType $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::TimestampMillisecondType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _), $($data_type),+) => { + $m!($crate::types::TimestampMillisecondType $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::TimestampMicrosecondType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _), $($data_type),+) => { + $m!($crate::types::TimestampMicrosecondType $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::TimestampNanosecondType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _), $($data_type),+) => { + $m!($crate::types::TimestampNanosecondType $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::IntervalYearMonthType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth), $($data_type),+) => { + $m!($crate::types::IntervalYearMonthType $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::IntervalDayTimeType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime), $($data_type),+) => { + $m!($crate::types::IntervalDayTimeType $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::IntervalMonthDayNanoType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano), 
$($data_type),+) => { + $m!($crate::types::IntervalMonthDayNanoType $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Second), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::DurationSecondType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Second), $($data_type),+) => { + $m!($crate::types::DurationSecondType $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Millisecond), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::DurationMillisecondType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Millisecond), $($data_type),+) => { + $m!($crate::types::DurationMillisecondType $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Microsecond), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::DurationMicrosecondType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Microsecond), $($data_type),+) => { + $m!($crate::types::DurationMicrosecondType $(, $args)*) } - $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond), $($values),+) => { - $(let $values = $crate::cast::as_primitive_array::< - $crate::types::DurationNanosecondType, - >($values);)+ - $e + $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond), $($data_type),+) => { + $m!($crate::types::DurationNanosecondType $(, $args)*) } - $(($($p),+) => $fallback,)* + $($($p),+ => $fallback,)* + } + }; +} + +#[macro_export] +#[doc(hidden)] +macro_rules! downcast_primitive_array_helper { + ($t:ty, $($values:ident),+, $e:block) => {{ + $(let $values = $crate::cast::as_primitive_array::<$t>($values);)+ + $e + }}; +} + +/// Downcast an [`Array`] to a [`PrimitiveArray`] based on its [`DataType`] +/// accepts a number of subsequent patterns to match the data type +/// +/// ``` +/// # use arrow_array::{Array, downcast_primitive_array, cast::as_string_array}; +/// # use arrow_schema::DataType; +/// +/// fn print_primitive(array: &dyn Array) { +/// downcast_primitive_array!( +/// array => { +/// for v in array { +/// println!("{:?}", v); +/// } +/// } +/// DataType::Utf8 => { +/// for v in as_string_array(array) { +/// println!("{:?}", v); +/// } +/// } +/// t => println!("Unsupported datatype {}", t) +/// ) +/// } +/// ``` +/// +/// [`DataType`]: arrow_schema::DataType +#[macro_export] +macro_rules! downcast_primitive_array { + ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { + $crate::downcast_primitive_array!($values => {$e} $($p => $fallback)*) + }; + (($($values:ident),+) => $e:block $($($p:pat),+ => $fallback:expr $(,)*)*) => { + $crate::downcast_primitive_array!($($values),+ => $e $($($p),+ => $fallback)*) + }; + (($($values:ident),+) => $e:block $(($($p:pat),+) => $fallback:expr $(,)*)*) => { + $crate::downcast_primitive_array!($($values),+ => $e $($($p),+ => $fallback)*) + }; + ($($values:ident),+ => $e:block $($($p:pat),+ => $fallback:expr $(,)*)*) => { + $crate::downcast_primitive!{ + $($values.data_type()),+ => ($crate::downcast_primitive_array_helper, $($values),+, $e), + $($($p),+ => $fallback,)* } }; } @@ -278,6 +278,15 @@ where .expect("Unable to downcast to primitive array") } +#[macro_export] +#[doc(hidden)] +macro_rules! 
downcast_dictionary_array_helper { + ($t:ty, $($values:ident),+, $e:block) => {{ + $(let $values = $crate::cast::as_dictionary_array::<$t>($values);)+ + $e + }}; +} + /// Downcast an [`Array`] to a [`DictionaryArray`] based on its [`DataType`], accepts /// a number of subsequent patterns to match the data type /// @@ -314,56 +323,11 @@ macro_rules! downcast_dictionary_array { ($values:ident => $e:block $($p:pat => $fallback:expr $(,)*)*) => { match $values.data_type() { - arrow_schema::DataType::Dictionary(k, _) => match k.as_ref() { - arrow_schema::DataType::Int8 => { - let $values = $crate::cast::as_dictionary_array::< - $crate::types::Int8Type, - >($values); - $e - }, - arrow_schema::DataType::Int16 => { - let $values = $crate::cast::as_dictionary_array::< - $crate::types::Int16Type, - >($values); - $e - }, - arrow_schema::DataType::Int32 => { - let $values = $crate::cast::as_dictionary_array::< - $crate::types::Int32Type, - >($values); - $e - }, - arrow_schema::DataType::Int64 => { - let $values = $crate::cast::as_dictionary_array::< - $crate::types::Int64Type, - >($values); - $e - }, - arrow_schema::DataType::UInt8 => { - let $values = $crate::cast::as_dictionary_array::< - $crate::types::UInt8Type, - >($values); - $e - }, - arrow_schema::DataType::UInt16 => { - let $values = $crate::cast::as_dictionary_array::< - $crate::types::UInt16Type, - >($values); - $e - }, - arrow_schema::DataType::UInt32 => { - let $values = $crate::cast::as_dictionary_array::< - $crate::types::UInt32Type, - >($values); - $e - }, - arrow_schema::DataType::UInt64 => { - let $values = $crate::cast::as_dictionary_array::< - $crate::types::UInt64Type, - >($values); - $e - }, - k => unreachable!("unsupported dictionary key type: {}", k) + arrow_schema::DataType::Dictionary(k, _) => { + $crate::downcast_integer! { + k.as_ref() => ($crate::downcast_dictionary_array_helper, $values, $e), + k => unreachable!("unsupported dictionary key type: {}", k) + } } $($p => $fallback,)* } diff --git a/arrow/src/row/dictionary.rs b/arrow/src/row/dictionary.rs index 4a048fbce86d..b06688224760 100644 --- a/arrow/src/row/dictionary.rs +++ b/arrow/src/row/dictionary.rs @@ -25,7 +25,7 @@ use arrow_array::types::*; use arrow_array::*; use arrow_buffer::{ArrowNativeType, MutableBuffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; +use arrow_schema::{ArrowError, DataType}; use std::collections::hash_map::Entry; use std::collections::HashMap; @@ -89,6 +89,12 @@ pub fn encode_dictionary( } } +macro_rules! decode_primitive_helper { + ($t:ty, $values: ident) => { + decode_primitive::<$t>(&$values) + }; +} + /// Decodes a string array from `rows` with the provided `options` /// /// # Safety @@ -163,65 +169,10 @@ pub unsafe fn decode_dictionary( null_builder.append(true); } - let child = match &value_type { + let child = downcast_primitive! 
{ + &value_type => (decode_primitive_helper, values), DataType::Null => NullArray::new(values.len()).into_data(), DataType::Boolean => decode_bool(&values), - DataType::Int8 => decode_primitive::(&values), - DataType::Int16 => decode_primitive::(&values), - DataType::Int32 => decode_primitive::(&values), - DataType::Int64 => decode_primitive::(&values), - DataType::UInt8 => decode_primitive::(&values), - DataType::UInt16 => decode_primitive::(&values), - DataType::UInt32 => decode_primitive::(&values), - DataType::UInt64 => decode_primitive::(&values), - DataType::Float16 => decode_primitive::(&values), - DataType::Float32 => decode_primitive::(&values), - DataType::Float64 => decode_primitive::(&values), - DataType::Timestamp(TimeUnit::Second, _) => { - decode_primitive::(&values) - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - decode_primitive::(&values) - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - decode_primitive::(&values) - } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - decode_primitive::(&values) - } - DataType::Date32 => decode_primitive::(&values), - DataType::Date64 => decode_primitive::(&values), - DataType::Time32(t) => match t { - TimeUnit::Second => decode_primitive::(&values), - TimeUnit::Millisecond => decode_primitive::(&values), - _ => unreachable!(), - }, - DataType::Time64(t) => match t { - TimeUnit::Microsecond => decode_primitive::(&values), - TimeUnit::Nanosecond => decode_primitive::(&values), - _ => unreachable!(), - }, - DataType::Duration(TimeUnit::Second) => { - decode_primitive::(&values) - } - DataType::Duration(TimeUnit::Millisecond) => { - decode_primitive::(&values) - } - DataType::Duration(TimeUnit::Microsecond) => { - decode_primitive::(&values) - } - DataType::Duration(TimeUnit::Nanosecond) => { - decode_primitive::(&values) - } - DataType::Interval(IntervalUnit::DayTime) => { - decode_primitive::(&values) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - decode_primitive::(&values) - } - DataType::Interval(IntervalUnit::YearMonth) => { - decode_primitive::(&values) - } DataType::Decimal128(p, s) => { decode_decimal::<16, Decimal128Type>(&values, *p, *s) } diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 8d6732054412..77c70a5fd5ed 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -616,6 +616,12 @@ fn encode_column( } } +macro_rules! decode_primitive_helper { + ($t:ty, $rows: ident, $options:ident) => { + Arc::new(decode_primitive::<$t>($rows, $options)) + }; +} + /// Decodes a the provided `field` from `rows` /// /// # Safety @@ -627,73 +633,10 @@ unsafe fn decode_column( interner: Option<&OrderPreservingInterner>, ) -> Result { let options = field.options; - let array: ArrayRef = match &field.data_type { + let array: ArrayRef = downcast_primitive! 
{ + &field.data_type => (decode_primitive_helper, rows, options), DataType::Null => Arc::new(NullArray::new(rows.len())), DataType::Boolean => Arc::new(decode_bool(rows, options)), - DataType::Int8 => Arc::new(decode_primitive::(rows, options)), - DataType::Int16 => Arc::new(decode_primitive::(rows, options)), - DataType::Int32 => Arc::new(decode_primitive::(rows, options)), - DataType::Int64 => Arc::new(decode_primitive::(rows, options)), - DataType::UInt8 => Arc::new(decode_primitive::(rows, options)), - DataType::UInt16 => Arc::new(decode_primitive::(rows, options)), - DataType::UInt32 => Arc::new(decode_primitive::(rows, options)), - DataType::UInt64 => Arc::new(decode_primitive::(rows, options)), - DataType::Float16 => Arc::new(decode_primitive::(rows, options)), - DataType::Float32 => Arc::new(decode_primitive::(rows, options)), - DataType::Float64 => Arc::new(decode_primitive::(rows, options)), - DataType::Timestamp(TimeUnit::Second, _) => { - Arc::new(decode_primitive::(rows, options)) - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - Arc::new(decode_primitive::(rows, options)) - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - Arc::new(decode_primitive::(rows, options)) - } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(decode_primitive::(rows, options)) - } - DataType::Date32 => Arc::new(decode_primitive::(rows, options)), - DataType::Date64 => Arc::new(decode_primitive::(rows, options)), - DataType::Time32(t) => match t { - TimeUnit::Second => { - Arc::new(decode_primitive::(rows, options)) - } - TimeUnit::Millisecond => { - Arc::new(decode_primitive::(rows, options)) - } - _ => unreachable!(), - }, - DataType::Time64(t) => match t { - TimeUnit::Microsecond => { - Arc::new(decode_primitive::(rows, options)) - } - TimeUnit::Nanosecond => { - Arc::new(decode_primitive::(rows, options)) - } - _ => unreachable!(), - }, - DataType::Duration(TimeUnit::Second) => { - Arc::new(decode_primitive::(rows, options)) - } - DataType::Duration(TimeUnit::Millisecond) => { - Arc::new(decode_primitive::(rows, options)) - } - DataType::Duration(TimeUnit::Microsecond) => { - Arc::new(decode_primitive::(rows, options)) - } - DataType::Duration(TimeUnit::Nanosecond) => { - Arc::new(decode_primitive::(rows, options)) - } - DataType::Interval(IntervalUnit::DayTime) => { - Arc::new(decode_primitive::(rows, options)) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - Arc::new(decode_primitive::(rows, options)) - } - DataType::Interval(IntervalUnit::YearMonth) => { - Arc::new(decode_primitive::(rows, options)) - } DataType::Binary => Arc::new(decode_binary::(rows, options)), DataType::LargeBinary => Arc::new(decode_binary::(rows, options)), DataType::Utf8 => Arc::new(decode_string::(rows, options)), @@ -760,13 +703,7 @@ unsafe fn decode_column( ))); } }, - DataType::FixedSizeBinary(_) - | DataType::List(_) - | DataType::FixedSizeList(_, _) - | DataType::LargeList(_) - | DataType::Struct(_) - | DataType::Union(_, _, _) - | DataType::Map(_, _) => { + _ => { return Err(ArrowError::NotYetImplemented(format!( "converting {} row is not supported", field.data_type From bf64ab528234bf27c3e09985d35cda0e2718fd45 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 19 Oct 2022 07:49:54 +1300 Subject: [PATCH 0143/1411] Simplify ListArray::from_iter_primitive (#2886) --- arrow-array/src/array/list_array.rs | 63 ++++++++++------------------- 1 file changed, 21 insertions(+), 42 deletions(-) diff --git 
a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index cdc7531d99fb..0db40a796964 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -16,12 +16,12 @@ // under the License. use crate::array::make_array; +use crate::builder::{GenericListBuilder, PrimitiveBuilder}; use crate::{ - builder::BooleanBufferBuilder, iterator::GenericListArrayIter, print_long_array, - raw_pointer::RawPtrBox, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, - PrimitiveArray, + iterator::GenericListArrayIter, print_long_array, raw_pointer::RawPtrBox, Array, + ArrayAccessor, ArrayRef, ArrowPrimitiveType, }; -use arrow_buffer::{ArrowNativeType, MutableBuffer}; +use arrow_buffer::ArrowNativeType; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; use num::Integer; @@ -157,47 +157,26 @@ impl GenericListArray { pub fn from_iter_primitive(iter: I) -> Self where T: ArrowPrimitiveType, - P: AsRef<[Option<::Native>]> - + IntoIterator::Native>>, + P: IntoIterator::Native>>, I: IntoIterator>, { - let iterator = iter.into_iter(); - let (lower, _) = iterator.size_hint(); - - let mut offsets = - MutableBuffer::new((lower + 1) * std::mem::size_of::()); - let mut length_so_far = OffsetSize::zero(); - offsets.push(length_so_far); - - let mut null_buf = BooleanBufferBuilder::new(lower); - - let values: PrimitiveArray = iterator - .filter_map(|maybe_slice| { - // regardless of whether the item is Some, the offsets and null buffers must be updated. - match &maybe_slice { - Some(x) => { - length_so_far += - OffsetSize::from_usize(x.as_ref().len()).unwrap(); - null_buf.append(true); + let iter = iter.into_iter(); + let size_hint = iter.size_hint().0; + let mut builder = + GenericListBuilder::with_capacity(PrimitiveBuilder::::new(), size_hint); + + for i in iter { + match i { + Some(p) => { + for t in p { + builder.values().append_option(t); } - None => null_buf.append(false), - }; - offsets.push(length_so_far); - maybe_slice - }) - .flatten() - .collect(); - - let field = Box::new(Field::new("item", T::DATA_TYPE, true)); - let data_type = Self::DATA_TYPE_CONSTRUCTOR(field); - let array_data = ArrayData::builder(data_type) - .len(null_buf.len()) - .add_buffer(offsets.into()) - .add_child_data(values.into_data()) - .null_bit_buffer(Some(null_buf.into())); - let array_data = unsafe { array_data.build_unchecked() }; - - Self::from(array_data) + builder.append(true); + } + None => builder.append(false), + } + } + builder.finish() } } From 4aeb64e0c6a28dd787d64717bb192c87d6bf58cc Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 19 Oct 2022 08:09:36 +1300 Subject: [PATCH 0144/1411] Add FixedSizeList::from_iter_primitive (#2887) --- .../src/array/fixed_size_list_array.rs | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index a10c1d28dab4..c536a422e82f 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -15,7 +15,10 @@ // specific language governing permissions and limitations // under the License. 
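For context, a minimal usage sketch of the `from_iter_primitive` constructor that the change above rewrites on top of `GenericListBuilder`; the nested-`Option` shape mirrors the signature in the diff, and the snippet is illustrative rather than taken from the patch:

```rust
use arrow_array::{Array, ListArray};
use arrow_array::types::Int32Type;

// Each outer Option is one list slot (None => null list);
// each inner Option is one element of that list.
let data = vec![
    Some(vec![Some(0), Some(1), Some(2)]),
    None,
    Some(vec![Some(3), None, Some(5)]),
];
let list = ListArray::from_iter_primitive::<Int32Type, _, _>(data);
assert_eq!(list.len(), 3);
assert!(list.is_null(1));
```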
-use crate::{make_array, print_long_array, Array, ArrayAccessor, ArrayRef}; +use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder}; +use crate::{ + make_array, print_long_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, +}; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; @@ -100,6 +103,53 @@ impl FixedSizeListArray { const fn value_offset_at(&self, i: usize) -> i32 { i as i32 * self.length } + + /// Creates a [`FixedSizeListArray`] from an iterator of primitive values + /// # Example + /// ``` + /// # use arrow_array::FixedSizeListArray; + /// # use arrow_array::types::Int32Type; + /// + /// let data = vec![ + /// Some(vec![Some(0), Some(1), Some(2)]), + /// None, + /// Some(vec![Some(3), None, Some(5)]), + /// Some(vec![Some(6), Some(7), Some(45)]), + /// ]; + /// let list_array = FixedSizeListArray::from_iter_primitive::(data, 3); + /// println!("{:?}", list_array); + /// ``` + pub fn from_iter_primitive(iter: I, length: i32) -> Self + where + T: ArrowPrimitiveType, + P: IntoIterator::Native>>, + I: IntoIterator>, + { + let l = length as usize; + let iter = iter.into_iter(); + let size_hint = iter.size_hint().0; + let mut builder = FixedSizeListBuilder::with_capacity( + PrimitiveBuilder::::with_capacity(size_hint * l), + length, + size_hint, + ); + + for i in iter { + match i { + Some(p) => { + for t in p { + builder.values().append_option(t); + } + builder.append(true); + } + None => { + builder.values().append_nulls(l); + builder.append(false) + } + } + } + builder.finish() + } } impl From for FixedSizeListArray { From 07024f6a16b870fda81cba5779b8817b20386ebf Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 19 Oct 2022 08:10:58 +1300 Subject: [PATCH 0145/1411] Treat DecimalArray as PrimitiveArray in row format (#2866) --- arrow-buffer/src/bigint.rs | 20 ++++++++ arrow/src/row/dictionary.rs | 21 +++----- arrow/src/row/fixed.rs | 47 +---------------- arrow/src/row/mod.rs | 100 ++++++++++++++++++++++++++++-------- 4 files changed, 108 insertions(+), 80 deletions(-) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index 7873064b45fb..3518b85e4eb8 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -86,6 +86,15 @@ impl i256 { } } + /// Create an integer value from its representation as a byte array in little-endian. + #[inline] + pub fn from_be_bytes(b: [u8; 32]) -> Self { + Self { + high: i128::from_be_bytes(b[0..16].try_into().unwrap()), + low: u128::from_be_bytes(b[16..32].try_into().unwrap()), + } + } + pub fn from_i128(v: i128) -> Self { let mut bytes = if num::Signed::is_negative(&v) { [255_u8; 32] @@ -130,6 +139,17 @@ impl i256 { t } + /// Return the memory representation of this integer as a byte array in big-endian byte order. + #[inline] + pub fn to_be_bytes(self) -> [u8; 32] { + let mut t = [0; 32]; + let t_low: &mut [u8; 16] = (&mut t[0..16]).try_into().unwrap(); + *t_low = self.high.to_be_bytes(); + let t_high: &mut [u8; 16] = (&mut t[16..32]).try_into().unwrap(); + *t_high = self.low.to_be_bytes(); + t + } + /// Create an i256 from the provided [`BigInt`] returning a bool indicating /// if overflow occurred fn from_bigint_with_overflow(v: BigInt) -> (Self, bool) { diff --git a/arrow/src/row/dictionary.rs b/arrow/src/row/dictionary.rs index b06688224760..1ec7c2a2145c 100644 --- a/arrow/src/row/dictionary.rs +++ b/arrow/src/row/dictionary.rs @@ -16,7 +16,7 @@ // under the License. 
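As a side note, a small illustrative sketch of the round trip the new `i256::to_be_bytes`/`i256::from_be_bytes` pair is meant to support (the row format encodes fixed-width values in big-endian order so that byte-wise comparison matches numeric order); the concrete value here is arbitrary:

```rust
use arrow_buffer::i256;

// Round-trip a value through the big-endian representation added in this patch.
let v = i256::from_i128(-42);
let bytes = v.to_be_bytes();
assert_eq!(i256::from_be_bytes(bytes), v);
```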
use crate::compute::SortOptions; -use crate::row::fixed::{FixedLengthEncoding, FromSlice, RawDecimal}; +use crate::row::fixed::{FixedLengthEncoding, FromSlice}; use crate::row::interner::{Interned, OrderPreservingInterner}; use crate::row::{null_sentinel, Rows}; use arrow_array::builder::*; @@ -173,12 +173,8 @@ pub unsafe fn decode_dictionary( &value_type => (decode_primitive_helper, values), DataType::Null => NullArray::new(values.len()).into_data(), DataType::Boolean => decode_bool(&values), - DataType::Decimal128(p, s) => { - decode_decimal::<16, Decimal128Type>(&values, *p, *s) - } - DataType::Decimal256(p, s) => { - decode_decimal::<32, Decimal256Type>(&values, *p, *s) - } + DataType::Decimal128(p, s) => decode_decimal::(&values, *p, *s), + DataType::Decimal256(p, s) => decode_decimal::(&values, *p, *s), DataType::Utf8 => decode_string::(&values), DataType::LargeUtf8 => decode_string::(&values), DataType::Binary => decode_binary::(&values), @@ -279,10 +275,9 @@ where } /// Decodes a `DecimalArray` from dictionary values -fn decode_decimal( - values: &[&[u8]], - precision: u8, - scale: u8, -) -> ArrayData { - decode_fixed::>(values, T::TYPE_CONSTRUCTOR(precision, scale)) +fn decode_decimal(values: &[&[u8]], precision: u8, scale: u8) -> ArrayData +where + T::Native: FixedLengthEncoding, +{ + decode_fixed::(values, T::TYPE_CONSTRUCTOR(precision, scale)) } diff --git a/arrow/src/row/fixed.rs b/arrow/src/row/fixed.rs index ec7afd8e30aa..d5935cfb6472 100644 --- a/arrow/src/row/fixed.rs +++ b/arrow/src/row/fixed.rs @@ -19,9 +19,8 @@ use crate::array::PrimitiveArray; use crate::compute::SortOptions; use crate::datatypes::ArrowPrimitiveType; use crate::row::{null_sentinel, Rows}; -use arrow_array::types::DecimalType; use arrow_array::BooleanArray; -use arrow_buffer::{bit_util, MutableBuffer, ToByteSlice}; +use arrow_buffer::{bit_util, i256, MutableBuffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use half::f16; @@ -91,6 +90,7 @@ encode_signed!(2, i16); encode_signed!(4, i32); encode_signed!(8, i64); encode_signed!(16, i128); +encode_signed!(32, i256); macro_rules! 
encode_unsigned { ($n:expr, $t:ty) => { @@ -164,38 +164,6 @@ impl FixedLengthEncoding for f64 { } } -pub type RawDecimal128 = RawDecimal<16>; -pub type RawDecimal256 = RawDecimal<32>; - -/// The raw bytes of a decimal -#[derive(Copy, Clone)] -pub struct RawDecimal(pub [u8; N]); - -impl ToByteSlice for RawDecimal { - fn to_byte_slice(&self) -> &[u8] { - &self.0 - } -} - -impl FixedLengthEncoding for RawDecimal { - type Encoded = [u8; N]; - - fn encode(self) -> [u8; N] { - let mut val = self.0; - // Convert to big endian representation - val.reverse(); - // Toggle top "sign" bit to ensure consistent sort order - val[0] ^= 0x80; - val - } - - fn decode(mut encoded: Self::Encoded) -> Self { - encoded[0] ^= 0x80; - encoded.reverse(); - Self(encoded) - } -} - /// Returns the total encoded length (including null byte) for a value of type `T::Native` pub const fn encoded_len(_col: &PrimitiveArray) -> usize where @@ -354,17 +322,6 @@ fn decode_fixed( unsafe { builder.build_unchecked() } } -/// Decodes a `DecimalArray` from rows -pub fn decode_decimal( - rows: &mut [&[u8]], - options: SortOptions, - precision: u8, - scale: u8, -) -> PrimitiveArray { - decode_fixed::>(rows, T::TYPE_CONSTRUCTOR(precision, scale), options) - .into() -} - /// Decodes a `PrimitiveArray` from rows pub fn decode_primitive( rows: &mut [&[u8]], diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 77c70a5fd5ed..c3aa9ea4c5a7 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -84,6 +84,7 @@ use std::sync::Arc; use arrow_array::cast::*; use arrow_array::*; +use arrow_buffer::i256; use crate::compute::SortOptions; use crate::datatypes::*; @@ -91,10 +92,7 @@ use crate::error::{ArrowError, Result}; use crate::row::dictionary::{ compute_dictionary_mapping, decode_dictionary, encode_dictionary, }; -use crate::row::fixed::{ - decode_bool, decode_decimal, decode_primitive, RawDecimal, RawDecimal128, - RawDecimal256, -}; +use crate::row::fixed::{decode_bool, decode_primitive}; use crate::row::interner::OrderPreservingInterner; use crate::row::variable::{decode_binary, decode_string}; use crate::{downcast_dictionary_array, downcast_primitive_array}; @@ -488,8 +486,8 @@ fn new_empty_rows( array => lengths.iter_mut().for_each(|x| *x += fixed::encoded_len(array)), DataType::Null => {}, DataType::Boolean => lengths.iter_mut().for_each(|x| *x += bool::ENCODED_LEN), - DataType::Decimal128(_, _) => lengths.iter_mut().for_each(|x| *x += RawDecimal128::ENCODED_LEN), - DataType::Decimal256(_, _) => lengths.iter_mut().for_each(|x| *x += RawDecimal256::ENCODED_LEN), + DataType::Decimal128(_, _) => lengths.iter_mut().for_each(|x| *x += i128::ENCODED_LEN), + DataType::Decimal256(_, _) => lengths.iter_mut().for_each(|x| *x += i256::ENCODED_LEN), DataType::Binary => as_generic_binary_array::(array) .iter() .zip(lengths.iter_mut()) @@ -571,24 +569,20 @@ fn encode_column( DataType::Null => {} DataType::Boolean => fixed::encode(out, as_boolean_array(column), opts), DataType::Decimal128(_, _) => { - let iter = column + let column = column .as_any() .downcast_ref::() - .unwrap() - .into_iter() - .map(|x| x.map(|x| RawDecimal(x.to_le_bytes()))); + .unwrap(); - fixed::encode(out, iter, opts) + fixed::encode(out, column, opts) }, DataType::Decimal256(_, _) => { - let iter = column + let column = column .as_any() .downcast_ref::() - .unwrap() - .into_iter() - .map(|x| x.map(|x| RawDecimal(x.to_le_bytes()))); + .unwrap(); - fixed::encode(out, iter, opts) + fixed::encode(out, column, opts) }, DataType::Binary => { variable::encode(out, 
as_generic_binary_array::(column).iter(), opts) @@ -641,12 +635,16 @@ unsafe fn decode_column( DataType::LargeBinary => Arc::new(decode_binary::(rows, options)), DataType::Utf8 => Arc::new(decode_string::(rows, options)), DataType::LargeUtf8 => Arc::new(decode_string::(rows, options)), - DataType::Decimal128(p, s) => { - Arc::new(decode_decimal::<16, Decimal128Type>(rows, options, *p, *s)) - } - DataType::Decimal256(p, s) => { - Arc::new(decode_decimal::<32, Decimal256Type>(rows, options, *p, *s)) - } + DataType::Decimal128(p, s) => Arc::new( + decode_primitive::(rows, options) + .with_precision_and_scale(*p, *s) + .unwrap(), + ), + DataType::Decimal256(p, s) => Arc::new( + decode_primitive::(rows, options) + .with_precision_and_scale(*p, *s) + .unwrap(), + ), DataType::Dictionary(k, v) => match k.as_ref() { DataType::Int8 => Arc::new(decode_dictionary::( interner.unwrap(), @@ -795,6 +793,64 @@ mod tests { } } + #[test] + fn test_decimal128() { + let mut converter = RowConverter::new(vec![SortField::new( + DataType::Decimal128(DECIMAL128_MAX_PRECISION, 7), + )]); + let col = Arc::new( + Decimal128Array::from_iter([ + None, + Some(i128::MIN), + Some(-13), + Some(46_i128), + Some(5456_i128), + Some(i128::MAX), + ]) + .with_precision_and_scale(38, 7) + .unwrap(), + ) as ArrayRef; + + let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); + for i in 0..rows.num_rows() - 1 { + assert!(rows.row(i) < rows.row(i + 1)); + } + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + assert_eq!(col.as_ref(), back[0].as_ref()) + } + + #[test] + fn test_decimal256() { + let mut converter = RowConverter::new(vec![SortField::new( + DataType::Decimal256(DECIMAL256_MAX_PRECISION, 7), + )]); + let col = Arc::new( + Decimal256Array::from_iter([ + None, + Some(i256::MIN), + Some(i256::from_parts(0, -1)), + Some(i256::from_parts(u128::MAX, -1)), + Some(i256::from_parts(u128::MAX, 0)), + Some(i256::from_parts(0, 46_i128)), + Some(i256::from_parts(5, 46_i128)), + Some(i256::MAX), + ]) + .with_precision_and_scale(DECIMAL256_MAX_PRECISION, 7) + .unwrap(), + ) as ArrayRef; + + let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); + for i in 0..rows.num_rows() - 1 { + assert!(rows.row(i) < rows.row(i + 1)); + } + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + assert_eq!(col.as_ref(), back[0].as_ref()) + } + #[test] fn test_bool() { let mut converter = RowConverter::new(vec![SortField::new(DataType::Boolean)]); From e118ae2d3b3ed9945bd0721a10d001a829d41854 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 18 Oct 2022 14:58:55 -0700 Subject: [PATCH 0146/1411] Support overflow-checking variant of negate kernel (#2893) --- arrow/src/compute/kernels/arithmetic.rs | 32 +++++++++++++++++++++---- arrow/src/datatypes/native.rs | 22 +++++++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index a73ee7eee151..e7fcb50cc254 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -22,8 +22,6 @@ //! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. -use std::ops::Neg; - use crate::array::*; #[cfg(feature = "simd")] use crate::buffer::MutableBuffer; @@ -1116,12 +1114,27 @@ where } /// Perform `-` operation on an array. If value is null then the result is also null. 
+/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `negate_checked` instead. pub fn negate(array: &PrimitiveArray) -> Result> where T: ArrowNumericType, - T::Native: Neg, + T::Native: ArrowNativeTypeOp, +{ + Ok(unary(array, |x| x.neg_wrapping())) +} + +/// Perform `-` operation on an array. If value is null then the result is also null. +/// +/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, +/// use `negate` instead. +pub fn negate_checked(array: &PrimitiveArray) -> Result> +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { - Ok(unary(array, |x| -x)) + try_unary(array, |value| value.neg_checked()) } /// Raise array with floating point values to the power of a scalar. @@ -2567,6 +2580,17 @@ mod tests { assert_eq!(expected, actual); } + #[test] + fn test_primitive_array_negate_checked_overflow() { + let a = Int32Array::from(vec![i32::MIN]); + let actual = negate(&a).unwrap(); + let expected = Int32Array::from(vec![i32::MIN]); + assert_eq!(expected, actual); + + let err = negate_checked(&a); + err.expect_err("negate_checked should detect overflow"); + } + #[test] fn test_arithmetic_kernel_should_not_rely_on_padding() { let a: UInt8Array = (0..128_u8).into_iter().map(Some).collect(); diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 444ba39e0b6d..2643025f1573 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -64,6 +64,10 @@ pub trait ArrowNativeTypeOp: ArrowNativeType { fn mod_wrapping(self, rhs: Self) -> Self; + fn neg_checked(self) -> Result; + + fn neg_wrapping(self) -> Self; + fn is_zero(self) -> bool; fn is_eq(self, rhs: Self) -> bool; @@ -158,6 +162,16 @@ macro_rules! native_type_op { self.wrapping_rem(rhs) } + fn neg_checked(self) -> Result { + self.checked_neg().ok_or_else(|| { + ArrowError::ComputeError(format!("Overflow happened on: {:?}", self)) + }) + } + + fn neg_wrapping(self) -> Self { + self.wrapping_neg() + } + fn is_zero(self) -> bool { self == 0 } @@ -253,6 +267,14 @@ macro_rules! 
native_type_float_op { self % rhs } + fn neg_checked(self) -> Result { + Ok(-self) + } + + fn neg_wrapping(self) -> Self { + -self + } + fn is_zero(self) -> bool { self == $zero } From a881083c425dfbc1a87fb510bc32644bca16fbf3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 19 Oct 2022 19:03:52 +1300 Subject: [PATCH 0147/1411] Update flatbuffers (#2895) * Update flatbuffers * Format --- arrow/Cargo.toml | 2 +- arrow/src/ipc/gen/File.rs | 206 ++-- arrow/src/ipc/gen/Message.rs | 505 ++++++---- arrow/src/ipc/gen/Schema.rs | 1474 ++++++++++++++++++----------- arrow/src/ipc/gen/SparseTensor.rs | 634 ++++++++----- arrow/src/ipc/gen/Tensor.rs | 246 +++-- arrow/src/ipc/reader.rs | 93 +- 7 files changed, 1964 insertions(+), 1196 deletions(-) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 134d274a8393..88ed493965ca 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -62,7 +62,7 @@ lz4 = { version = "1.23", default-features = false, optional = true } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } chrono = { version = "0.4", default-features = false, features = ["clock"] } chrono-tz = { version = "0.6", default-features = false, optional = true } -flatbuffers = { version = "2.1.2", default-features = false, features = ["thiserror"], optional = true } +flatbuffers = { version = "22.9.2", default-features = false, features = ["thiserror"], optional = true } comfy-table = { version = "6.0", optional = true, default-features = false } pyo3 = { version = "0.17", default-features = false, optional = true } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } diff --git a/arrow/src/ipc/gen/File.rs b/arrow/src/ipc/gen/File.rs index 04cbc6441377..9aafe910ba2c 100644 --- a/arrow/src/ipc/gen/File.rs +++ b/arrow/src/ipc/gen/File.rs @@ -27,8 +27,13 @@ use std::{cmp::Ordering, mem}; #[repr(transparent)] #[derive(Clone, Copy, PartialEq)] pub struct Block(pub [u8; 24]); -impl std::fmt::Debug for Block { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl Default for Block { + fn default() -> Self { + Self([0; 24]) + } +} +impl core::fmt::Debug for Block { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { f.debug_struct("Block") .field("offset", &self.offset()) .field("metaDataLength", &self.metaDataLength()) @@ -38,39 +43,28 @@ impl std::fmt::Debug for Block { } impl flatbuffers::SimpleToVerifyInSlice for Block {} -impl flatbuffers::SafeSliceAccess for Block {} impl<'a> flatbuffers::Follow<'a> for Block { type Inner = &'a Block; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { <&'a Block>::follow(buf, loc) } } impl<'a> flatbuffers::Follow<'a> for &'a Block { type Inner = &'a Block; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { flatbuffers::follow_cast_ref::(buf, loc) } } impl<'b> flatbuffers::Push for Block { type Output = Block; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - let src = unsafe { - ::std::slice::from_raw_parts(self as *const Block as *const u8, Self::size()) - }; - dst.copy_from_slice(src); - } -} -impl<'b> flatbuffers::Push for &'b Block { - type Output = Block; - - #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - let src = unsafe { - 
::std::slice::from_raw_parts(*self as *const Block as *const u8, Self::size()) - }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + let src = ::core::slice::from_raw_parts( + self as *const Block as *const u8, + Self::size(), + ); dst.copy_from_slice(src); } } @@ -85,7 +79,8 @@ impl<'a> flatbuffers::Verifiable for Block { v.in_buffer::(pos) } } -impl Block { + +impl<'a> Block { #[allow(clippy::too_many_arguments)] pub fn new(offset: i64, metaDataLength: i32, bodyLength: i64) -> Self { let mut s = Self([0; 24]); @@ -97,50 +92,60 @@ impl Block { /// Index to the start of the RecordBlock (note this is past the Message header) pub fn offset(&self) -> i64 { - let mut mem = core::mem::MaybeUninit::::uninit(); - unsafe { + let mut mem = core::mem::MaybeUninit::<::Scalar>::uninit(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot + EndianScalar::from_little_endian(unsafe { core::ptr::copy_nonoverlapping( self.0[0..].as_ptr(), mem.as_mut_ptr() as *mut u8, - core::mem::size_of::(), + core::mem::size_of::<::Scalar>(), ); mem.assume_init() - } - .from_little_endian() + }) } pub fn set_offset(&mut self, x: i64) { let x_le = x.to_little_endian(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot unsafe { core::ptr::copy_nonoverlapping( - &x_le as *const i64 as *const u8, + &x_le as *const _ as *const u8, self.0[0..].as_mut_ptr(), - core::mem::size_of::(), + core::mem::size_of::<::Scalar>(), ); } } /// Length of the metadata pub fn metaDataLength(&self) -> i32 { - let mut mem = core::mem::MaybeUninit::::uninit(); - unsafe { + let mut mem = core::mem::MaybeUninit::<::Scalar>::uninit(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot + EndianScalar::from_little_endian(unsafe { core::ptr::copy_nonoverlapping( self.0[8..].as_ptr(), mem.as_mut_ptr() as *mut u8, - core::mem::size_of::(), + core::mem::size_of::<::Scalar>(), ); mem.assume_init() - } - .from_little_endian() + }) } pub fn set_metaDataLength(&mut self, x: i32) { let x_le = x.to_little_endian(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot unsafe { core::ptr::copy_nonoverlapping( - &x_le as *const i32 as *const u8, + &x_le as *const _ as *const u8, self.0[8..].as_mut_ptr(), - core::mem::size_of::(), + core::mem::size_of::<::Scalar>(), ); } } @@ -148,25 +153,30 @@ impl Block { /// Length of the data (this is aligned so there can be a gap between this and /// the metadata). 
pub fn bodyLength(&self) -> i64 { - let mut mem = core::mem::MaybeUninit::::uninit(); - unsafe { + let mut mem = core::mem::MaybeUninit::<::Scalar>::uninit(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot + EndianScalar::from_little_endian(unsafe { core::ptr::copy_nonoverlapping( self.0[16..].as_ptr(), mem.as_mut_ptr() as *mut u8, - core::mem::size_of::(), + core::mem::size_of::<::Scalar>(), ); mem.assume_init() - } - .from_little_endian() + }) } pub fn set_bodyLength(&mut self, x: i64) { let x_le = x.to_little_endian(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot unsafe { core::ptr::copy_nonoverlapping( - &x_le as *const i64 as *const u8, + &x_le as *const _ as *const u8, self.0[16..].as_mut_ptr(), - core::mem::size_of::(), + core::mem::size_of::<::Scalar>(), ); } } @@ -185,16 +195,22 @@ pub struct Footer<'a> { impl<'a> flatbuffers::Follow<'a> for Footer<'a> { type Inner = Footer<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Footer<'a> { + pub const VT_VERSION: flatbuffers::VOffsetT = 4; + pub const VT_SCHEMA: flatbuffers::VOffsetT = 6; + pub const VT_DICTIONARIES: flatbuffers::VOffsetT = 8; + pub const VT_RECORDBATCHES: flatbuffers::VOffsetT = 10; + pub const VT_CUSTOM_METADATA: flatbuffers::VOffsetT = 12; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Footer { _tab: table } } #[allow(unused_mut)] @@ -219,49 +235,66 @@ impl<'a> Footer<'a> { builder.finish() } - pub const VT_VERSION: flatbuffers::VOffsetT = 4; - pub const VT_SCHEMA: flatbuffers::VOffsetT = 6; - pub const VT_DICTIONARIES: flatbuffers::VOffsetT = 8; - pub const VT_RECORDBATCHES: flatbuffers::VOffsetT = 10; - pub const VT_CUSTOM_METADATA: flatbuffers::VOffsetT = 12; - #[inline] pub fn version(&self) -> MetadataVersion { - self._tab - .get::(Footer::VT_VERSION, Some(MetadataVersion::V1)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Footer::VT_VERSION, Some(MetadataVersion::V1)) + .unwrap() + } } #[inline] pub fn schema(&self) -> Option> { - self._tab - .get::>(Footer::VT_SCHEMA, None) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>(Footer::VT_SCHEMA, None) + } } #[inline] - pub fn dictionaries(&self) -> Option<&'a [Block]> { - self._tab - .get::>>( - Footer::VT_DICTIONARIES, - None, - ) - .map(|v| v.safe_slice()) + pub fn dictionaries(&self) -> Option> { + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + Footer::VT_DICTIONARIES, + None, + ) + } } #[inline] - pub fn recordBatches(&self) -> Option<&'a [Block]> { - self._tab - .get::>>( - Footer::VT_RECORDBATCHES, - None, - ) - .map(|v| v.safe_slice()) + pub fn recordBatches(&self) -> Option> { + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + Footer::VT_RECORDBATCHES, + None, + ) + } } /// User-defined metadata #[inline] pub fn custom_metadata( &self, ) -> Option>>> { - self._tab.get::>, 
- >>(Footer::VT_CUSTOM_METADATA, None) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab.get::>, + >>(Footer::VT_CUSTOM_METADATA, None) + } } } @@ -273,25 +306,25 @@ impl flatbuffers::Verifiable for Footer<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"version", Self::VT_VERSION, false)? + .visit_field::("version", Self::VT_VERSION, false)? .visit_field::>( - &"schema", + "schema", Self::VT_SCHEMA, false, )? .visit_field::>>( - &"dictionaries", + "dictionaries", Self::VT_DICTIONARIES, false, )? .visit_field::>>( - &"recordBatches", + "recordBatches", Self::VT_RECORDBATCHES, false, )? .visit_field::>, - >>(&"custom_metadata", Self::VT_CUSTOM_METADATA, false)? + >>("custom_metadata", Self::VT_CUSTOM_METADATA, false)? .finish(); Ok(()) } @@ -319,6 +352,7 @@ impl<'a> Default for FooterArgs<'a> { } } } + pub struct FooterBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -389,8 +423,8 @@ impl<'a: 'b, 'b> FooterBuilder<'a, 'b> { } } -impl std::fmt::Debug for Footer<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Footer<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Footer"); ds.field("version", &self.version()); ds.field("schema", &self.schema()); @@ -400,18 +434,6 @@ impl std::fmt::Debug for Footer<'_> { ds.finish() } } -#[inline] -#[deprecated(since = "2.0.0", note = "Deprecated in favor of `root_as...` methods.")] -pub fn get_root_as_footer<'a>(buf: &'a [u8]) -> Footer<'a> { - unsafe { flatbuffers::root_unchecked::>(buf) } -} - -#[inline] -#[deprecated(since = "2.0.0", note = "Deprecated in favor of `root_as...` methods.")] -pub fn get_size_prefixed_root_as_footer<'a>(buf: &'a [u8]) -> Footer<'a> { - unsafe { flatbuffers::size_prefixed_root_unchecked::>(buf) } -} - #[inline] /// Verifies that a buffer of bytes contains a `Footer` /// and returns it. 
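[Reviewer note, not part of the generated diff] With this change the generated File.rs accessors that previously returned plain slices (e.g. `dictionaries()` / `recordBatches()` returning `Option<&[Block]>`) now return `Option<flatbuffers::Vector<Block>>`, and the raw `Follow::follow` / `init_from_table` entry points are `unsafe`, while the deprecated `get_root_as_footer` helpers are removed in favor of the verified `root_as...` functions. A minimal caller-side sketch of how downstream code might adapt; the `root_as_footer` name and the `arrow::ipc::gen::File` module path are assumptions inferred from the deprecation notes above, not something this patch defines:

    // Sketch only: reading a Footer after this change.
    use arrow::ipc::gen::File::root_as_footer; // assumed public path

    fn count_record_batches(buf: &[u8]) -> Result<usize, flatbuffers::InvalidFlatbuffer> {
        // The verified entry point replaces the removed, deprecated
        // `get_root_as_footer`, so no `unsafe` is needed at the call site.
        let footer = root_as_footer(buf)?;
        // `recordBatches()` now returns Option<flatbuffers::Vector<Block>>
        // instead of Option<&[Block]>, so use the Vector API (len/iter).
        Ok(footer.recordBatches().map(|v| v.len()).unwrap_or(0))
    }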
diff --git a/arrow/src/ipc/gen/Message.rs b/arrow/src/ipc/gen/Message.rs index 707c62c0f76e..d4b3a57f164e 100644 --- a/arrow/src/ipc/gen/Message.rs +++ b/arrow/src/ipc/gen/Message.rs @@ -43,7 +43,7 @@ pub const ENUM_MAX_COMPRESSION_TYPE: i8 = 1; pub const ENUM_VALUES_COMPRESSION_TYPE: [CompressionType; 2] = [CompressionType::LZ4_FRAME, CompressionType::ZSTD]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct CompressionType(pub i8); #[allow(non_upper_case_globals)] @@ -63,8 +63,8 @@ impl CompressionType { } } } -impl std::fmt::Debug for CompressionType { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for CompressionType { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -75,8 +75,8 @@ impl std::fmt::Debug for CompressionType { impl<'a> flatbuffers::Follow<'a> for CompressionType { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -84,20 +84,21 @@ impl<'a> flatbuffers::Follow<'a> for CompressionType { impl flatbuffers::Push for CompressionType { type Output = CompressionType; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for CompressionType { + type Scalar = i8; #[inline] - fn to_little_endian(self) -> Self { - let b = i8::to_le(self.0); - Self(b) + fn to_little_endian(self) -> i8 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = i8::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: i8) -> Self { + let b = i8::from_le(v); Self(b) } } @@ -135,7 +136,7 @@ pub const ENUM_VALUES_BODY_COMPRESSION_METHOD: [BodyCompressionMethod; 1] = /// Provided for forward compatibility in case we need to support different /// strategies for compressing the IPC message body (like whole-body /// compression rather than buffer-level) in the future -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct BodyCompressionMethod(pub i8); #[allow(non_upper_case_globals)] @@ -160,8 +161,8 @@ impl BodyCompressionMethod { } } } -impl std::fmt::Debug for BodyCompressionMethod { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for BodyCompressionMethod { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -172,8 +173,8 @@ impl std::fmt::Debug for BodyCompressionMethod { impl<'a> flatbuffers::Follow<'a> for BodyCompressionMethod { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -181,20 +182,21 @@ impl<'a> flatbuffers::Follow<'a> for BodyCompressionMethod { impl flatbuffers::Push for BodyCompressionMethod { type Output = 
BodyCompressionMethod; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for BodyCompressionMethod { + type Scalar = i8; #[inline] - fn to_little_endian(self) -> Self { - let b = i8::to_le(self.0); - Self(b) + fn to_little_endian(self) -> i8 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = i8::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: i8) -> Self { + let b = i8::from_le(v); Self(b) } } @@ -243,7 +245,7 @@ pub const ENUM_VALUES_MESSAGE_HEADER: [MessageHeader; 6] = [ /// Arrow implementations do not need to implement all of the message types, /// which may include experimental metadata types. For maximum compatibility, /// it is best to send data using RecordBatch -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct MessageHeader(pub u8); #[allow(non_upper_case_globals)] @@ -278,8 +280,8 @@ impl MessageHeader { } } } -impl std::fmt::Debug for MessageHeader { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for MessageHeader { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -287,12 +289,11 @@ impl std::fmt::Debug for MessageHeader { } } } -pub struct MessageHeaderUnionTableOffset {} impl<'a> flatbuffers::Follow<'a> for MessageHeader { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -300,20 +301,21 @@ impl<'a> flatbuffers::Follow<'a> for MessageHeader { impl flatbuffers::Push for MessageHeader { type Output = MessageHeader; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for MessageHeader { + type Scalar = u8; #[inline] - fn to_little_endian(self) -> Self { - let b = u8::to_le(self.0); - Self(b) + fn to_little_endian(self) -> u8 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = u8::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: u8) -> Self { + let b = u8::from_le(v); Self(b) } } @@ -330,6 +332,8 @@ impl<'a> flatbuffers::Verifiable for MessageHeader { } impl flatbuffers::SimpleToVerifyInSlice for MessageHeader {} +pub struct MessageHeaderUnionTableOffset {} + /// ---------------------------------------------------------------------- /// Data structures for describing a table row batch (a collection of /// equal-length Arrow arrays) @@ -343,8 +347,13 @@ impl flatbuffers::SimpleToVerifyInSlice for MessageHeader {} #[repr(transparent)] #[derive(Clone, Copy, PartialEq)] pub struct FieldNode(pub [u8; 16]); -impl std::fmt::Debug for FieldNode { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl Default for FieldNode { + fn default() -> Self { + Self([0; 16]) + } +} +impl core::fmt::Debug for FieldNode { + fn fmt(&self, f: &mut core::fmt::Formatter) -> 
core::fmt::Result { f.debug_struct("FieldNode") .field("length", &self.length()) .field("null_count", &self.null_count()) @@ -353,45 +362,28 @@ impl std::fmt::Debug for FieldNode { } impl flatbuffers::SimpleToVerifyInSlice for FieldNode {} -impl flatbuffers::SafeSliceAccess for FieldNode {} impl<'a> flatbuffers::Follow<'a> for FieldNode { type Inner = &'a FieldNode; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { <&'a FieldNode>::follow(buf, loc) } } impl<'a> flatbuffers::Follow<'a> for &'a FieldNode { type Inner = &'a FieldNode; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { flatbuffers::follow_cast_ref::(buf, loc) } } impl<'b> flatbuffers::Push for FieldNode { type Output = FieldNode; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - let src = unsafe { - ::std::slice::from_raw_parts( - self as *const FieldNode as *const u8, - Self::size(), - ) - }; - dst.copy_from_slice(src); - } -} -impl<'b> flatbuffers::Push for &'b FieldNode { - type Output = FieldNode; - - #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - let src = unsafe { - ::std::slice::from_raw_parts( - *self as *const FieldNode as *const u8, - Self::size(), - ) - }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + let src = ::core::slice::from_raw_parts( + self as *const FieldNode as *const u8, + Self::size(), + ); dst.copy_from_slice(src); } } @@ -406,7 +398,8 @@ impl<'a> flatbuffers::Verifiable for FieldNode { v.in_buffer::(pos) } } -impl FieldNode { + +impl<'a> FieldNode { #[allow(clippy::too_many_arguments)] pub fn new(length: i64, null_count: i64) -> Self { let mut s = Self([0; 16]); @@ -418,25 +411,30 @@ impl FieldNode { /// The number of value slots in the Arrow array at this level of a nested /// tree pub fn length(&self) -> i64 { - let mut mem = core::mem::MaybeUninit::::uninit(); - unsafe { + let mut mem = core::mem::MaybeUninit::<::Scalar>::uninit(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot + EndianScalar::from_little_endian(unsafe { core::ptr::copy_nonoverlapping( self.0[0..].as_ptr(), mem.as_mut_ptr() as *mut u8, - core::mem::size_of::(), + core::mem::size_of::<::Scalar>(), ); mem.assume_init() - } - .from_little_endian() + }) } pub fn set_length(&mut self, x: i64) { let x_le = x.to_little_endian(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot unsafe { core::ptr::copy_nonoverlapping( - &x_le as *const i64 as *const u8, + &x_le as *const _ as *const u8, self.0[0..].as_mut_ptr(), - core::mem::size_of::(), + core::mem::size_of::<::Scalar>(), ); } } @@ -445,25 +443,30 @@ impl FieldNode { /// to write their physical validity bitmap out as a materialized buffer, /// instead setting the length of the bitmap buffer to 0. 
pub fn null_count(&self) -> i64 { - let mut mem = core::mem::MaybeUninit::::uninit(); - unsafe { + let mut mem = core::mem::MaybeUninit::<::Scalar>::uninit(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot + EndianScalar::from_little_endian(unsafe { core::ptr::copy_nonoverlapping( self.0[8..].as_ptr(), mem.as_mut_ptr() as *mut u8, - core::mem::size_of::(), + core::mem::size_of::<::Scalar>(), ); mem.assume_init() - } - .from_little_endian() + }) } pub fn set_null_count(&mut self, x: i64) { let x_le = x.to_little_endian(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot unsafe { core::ptr::copy_nonoverlapping( - &x_le as *const i64 as *const u8, + &x_le as *const _ as *const u8, self.0[8..].as_mut_ptr(), - core::mem::size_of::(), + core::mem::size_of::<::Scalar>(), ); } } @@ -482,16 +485,19 @@ pub struct BodyCompression<'a> { impl<'a> flatbuffers::Follow<'a> for BodyCompression<'a> { type Inner = BodyCompression<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> BodyCompression<'a> { + pub const VT_CODEC: flatbuffers::VOffsetT = 4; + pub const VT_METHOD: flatbuffers::VOffsetT = 6; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { BodyCompression { _tab: table } } #[allow(unused_mut)] @@ -505,28 +511,36 @@ impl<'a> BodyCompression<'a> { builder.finish() } - pub const VT_CODEC: flatbuffers::VOffsetT = 4; - pub const VT_METHOD: flatbuffers::VOffsetT = 6; - - /// Compressor library + /// Compressor library. + /// For LZ4_FRAME, each compressed buffer must consist of a single frame. #[inline] pub fn codec(&self) -> CompressionType { - self._tab - .get::( - BodyCompression::VT_CODEC, - Some(CompressionType::LZ4_FRAME), - ) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::( + BodyCompression::VT_CODEC, + Some(CompressionType::LZ4_FRAME), + ) + .unwrap() + } } /// Indicates the way the record batch body was compressed #[inline] pub fn method(&self) -> BodyCompressionMethod { - self._tab - .get::( - BodyCompression::VT_METHOD, - Some(BodyCompressionMethod::BUFFER), - ) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::( + BodyCompression::VT_METHOD, + Some(BodyCompressionMethod::BUFFER), + ) + .unwrap() + } } } @@ -538,8 +552,8 @@ impl flatbuffers::Verifiable for BodyCompression<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"codec", Self::VT_CODEC, false)? - .visit_field::(&"method", Self::VT_METHOD, false)? + .visit_field::("codec", Self::VT_CODEC, false)? + .visit_field::("method", Self::VT_METHOD, false)? 
.finish(); Ok(()) } @@ -557,6 +571,7 @@ impl<'a> Default for BodyCompressionArgs { } } } + pub struct BodyCompressionBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -595,8 +610,8 @@ impl<'a: 'b, 'b> BodyCompressionBuilder<'a, 'b> { } } -impl std::fmt::Debug for BodyCompression<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for BodyCompression<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("BodyCompression"); ds.field("codec", &self.codec()); ds.field("method", &self.method()); @@ -616,16 +631,21 @@ pub struct RecordBatch<'a> { impl<'a> flatbuffers::Follow<'a> for RecordBatch<'a> { type Inner = RecordBatch<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> RecordBatch<'a> { + pub const VT_LENGTH: flatbuffers::VOffsetT = 4; + pub const VT_NODES: flatbuffers::VOffsetT = 6; + pub const VT_BUFFERS: flatbuffers::VOffsetT = 8; + pub const VT_COMPRESSION: flatbuffers::VOffsetT = 10; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { RecordBatch { _tab: table } } #[allow(unused_mut)] @@ -647,28 +667,32 @@ impl<'a> RecordBatch<'a> { builder.finish() } - pub const VT_LENGTH: flatbuffers::VOffsetT = 4; - pub const VT_NODES: flatbuffers::VOffsetT = 6; - pub const VT_BUFFERS: flatbuffers::VOffsetT = 8; - pub const VT_COMPRESSION: flatbuffers::VOffsetT = 10; - /// number of records / rows. The arrays in the batch should all have this /// length #[inline] pub fn length(&self) -> i64 { - self._tab - .get::(RecordBatch::VT_LENGTH, Some(0)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(RecordBatch::VT_LENGTH, Some(0)) + .unwrap() + } } /// Nodes correspond to the pre-ordered flattened logical schema #[inline] - pub fn nodes(&self) -> Option<&'a [FieldNode]> { - self._tab - .get::>>( - RecordBatch::VT_NODES, - None, - ) - .map(|v| v.safe_slice()) + pub fn nodes(&self) -> Option> { + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + RecordBatch::VT_NODES, + None, + ) + } } /// Buffers correspond to the pre-ordered flattened buffer tree /// @@ -677,22 +701,31 @@ impl<'a> RecordBatch<'a> { /// bitmap and 1 for the values. 
For struct arrays, there will only be a /// single buffer for the validity (nulls) bitmap #[inline] - pub fn buffers(&self) -> Option<&'a [Buffer]> { - self._tab - .get::>>( - RecordBatch::VT_BUFFERS, - None, - ) - .map(|v| v.safe_slice()) + pub fn buffers(&self) -> Option> { + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + RecordBatch::VT_BUFFERS, + None, + ) + } } /// Optional compression of the message body #[inline] pub fn compression(&self) -> Option> { - self._tab - .get::>( - RecordBatch::VT_COMPRESSION, - None, - ) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>( + RecordBatch::VT_COMPRESSION, + None, + ) + } } } @@ -704,10 +737,10 @@ impl flatbuffers::Verifiable for RecordBatch<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"length", Self::VT_LENGTH, false)? - .visit_field::>>(&"nodes", Self::VT_NODES, false)? - .visit_field::>>(&"buffers", Self::VT_BUFFERS, false)? - .visit_field::>(&"compression", Self::VT_COMPRESSION, false)? + .visit_field::("length", Self::VT_LENGTH, false)? + .visit_field::>>("nodes", Self::VT_NODES, false)? + .visit_field::>>("buffers", Self::VT_BUFFERS, false)? + .visit_field::>("compression", Self::VT_COMPRESSION, false)? .finish(); Ok(()) } @@ -729,6 +762,7 @@ impl<'a> Default for RecordBatchArgs<'a> { } } } + pub struct RecordBatchBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -785,8 +819,8 @@ impl<'a: 'b, 'b> RecordBatchBuilder<'a, 'b> { } } -impl std::fmt::Debug for RecordBatch<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for RecordBatch<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("RecordBatch"); ds.field("length", &self.length()); ds.field("nodes", &self.nodes()); @@ -811,16 +845,20 @@ pub struct DictionaryBatch<'a> { impl<'a> flatbuffers::Follow<'a> for DictionaryBatch<'a> { type Inner = DictionaryBatch<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> DictionaryBatch<'a> { + pub const VT_ID: flatbuffers::VOffsetT = 4; + pub const VT_DATA: flatbuffers::VOffsetT = 6; + pub const VT_ISDELTA: flatbuffers::VOffsetT = 8; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { DictionaryBatch { _tab: table } } #[allow(unused_mut)] @@ -837,31 +875,42 @@ impl<'a> DictionaryBatch<'a> { builder.finish() } - pub const VT_ID: flatbuffers::VOffsetT = 4; - pub const VT_DATA: flatbuffers::VOffsetT = 6; - pub const VT_ISDELTA: flatbuffers::VOffsetT = 8; - #[inline] pub fn id(&self) -> i64 { - self._tab - .get::(DictionaryBatch::VT_ID, Some(0)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(DictionaryBatch::VT_ID, Some(0)) + .unwrap() + } } #[inline] pub fn data(&self) -> Option> { - self._tab.get::>( - DictionaryBatch::VT_DATA, - None, - ) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { 
+ self._tab.get::>( + DictionaryBatch::VT_DATA, + None, + ) + } } /// If isDelta is true the values in the dictionary are to be appended to a /// dictionary with the indicated id. If isDelta is false this dictionary /// should replace the existing dictionary. #[inline] pub fn isDelta(&self) -> bool { - self._tab - .get::(DictionaryBatch::VT_ISDELTA, Some(false)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(DictionaryBatch::VT_ISDELTA, Some(false)) + .unwrap() + } } } @@ -873,13 +922,13 @@ impl flatbuffers::Verifiable for DictionaryBatch<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"id", Self::VT_ID, false)? + .visit_field::("id", Self::VT_ID, false)? .visit_field::>( - &"data", + "data", Self::VT_DATA, false, )? - .visit_field::(&"isDelta", Self::VT_ISDELTA, false)? + .visit_field::("isDelta", Self::VT_ISDELTA, false)? .finish(); Ok(()) } @@ -899,6 +948,7 @@ impl<'a> Default for DictionaryBatchArgs<'a> { } } } + pub struct DictionaryBatchBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -938,8 +988,8 @@ impl<'a: 'b, 'b> DictionaryBatchBuilder<'a, 'b> { } } -impl std::fmt::Debug for DictionaryBatch<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for DictionaryBatch<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("DictionaryBatch"); ds.field("id", &self.id()); ds.field("data", &self.data()); @@ -957,16 +1007,22 @@ pub struct Message<'a> { impl<'a> flatbuffers::Follow<'a> for Message<'a> { type Inner = Message<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Message<'a> { + pub const VT_VERSION: flatbuffers::VOffsetT = 4; + pub const VT_HEADER_TYPE: flatbuffers::VOffsetT = 6; + pub const VT_HEADER: flatbuffers::VOffsetT = 8; + pub const VT_BODYLENGTH: flatbuffers::VOffsetT = 10; + pub const VT_CUSTOM_METADATA: flatbuffers::VOffsetT = 12; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Message { _tab: table } } #[allow(unused_mut)] @@ -987,51 +1043,75 @@ impl<'a> Message<'a> { builder.finish() } - pub const VT_VERSION: flatbuffers::VOffsetT = 4; - pub const VT_HEADER_TYPE: flatbuffers::VOffsetT = 6; - pub const VT_HEADER: flatbuffers::VOffsetT = 8; - pub const VT_BODYLENGTH: flatbuffers::VOffsetT = 10; - pub const VT_CUSTOM_METADATA: flatbuffers::VOffsetT = 12; - #[inline] pub fn version(&self) -> MetadataVersion { - self._tab - .get::(Message::VT_VERSION, Some(MetadataVersion::V1)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Message::VT_VERSION, Some(MetadataVersion::V1)) + .unwrap() + } } #[inline] pub fn header_type(&self) -> MessageHeader { - self._tab - .get::(Message::VT_HEADER_TYPE, Some(MessageHeader::NONE)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Message::VT_HEADER_TYPE, Some(MessageHeader::NONE)) + .unwrap() + } } #[inline] pub fn 
header(&self) -> Option> { - self._tab - .get::>>( - Message::VT_HEADER, - None, - ) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + Message::VT_HEADER, + None, + ) + } } #[inline] pub fn bodyLength(&self) -> i64 { - self._tab - .get::(Message::VT_BODYLENGTH, Some(0)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Message::VT_BODYLENGTH, Some(0)) + .unwrap() + } } #[inline] pub fn custom_metadata( &self, ) -> Option>>> { - self._tab.get::>, - >>(Message::VT_CUSTOM_METADATA, None) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab.get::>, + >>(Message::VT_CUSTOM_METADATA, None) + } } #[inline] #[allow(non_snake_case)] pub fn header_as_schema(&self) -> Option> { if self.header_type() == MessageHeader::Schema { - self.header().map(Schema::init_from_table) + self.header().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Schema::init_from_table(t) } + }) } else { None } @@ -1041,7 +1121,12 @@ impl<'a> Message<'a> { #[allow(non_snake_case)] pub fn header_as_dictionary_batch(&self) -> Option> { if self.header_type() == MessageHeader::DictionaryBatch { - self.header().map(DictionaryBatch::init_from_table) + self.header().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { DictionaryBatch::init_from_table(t) } + }) } else { None } @@ -1051,7 +1136,12 @@ impl<'a> Message<'a> { #[allow(non_snake_case)] pub fn header_as_record_batch(&self) -> Option> { if self.header_type() == MessageHeader::RecordBatch { - self.header().map(RecordBatch::init_from_table) + self.header().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { RecordBatch::init_from_table(t) } + }) } else { None } @@ -1061,7 +1151,12 @@ impl<'a> Message<'a> { #[allow(non_snake_case)] pub fn header_as_tensor(&self) -> Option> { if self.header_type() == MessageHeader::Tensor { - self.header().map(Tensor::init_from_table) + self.header().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Tensor::init_from_table(t) } + }) } else { None } @@ -1071,7 +1166,12 @@ impl<'a> Message<'a> { #[allow(non_snake_case)] pub fn header_as_sparse_tensor(&self) -> Option> { if self.header_type() == MessageHeader::SparseTensor { - self.header().map(SparseTensor::init_from_table) + self.header().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { SparseTensor::init_from_table(t) } + }) } else { None } @@ -1086,8 +1186,8 @@ impl flatbuffers::Verifiable for Message<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"version", Self::VT_VERSION, false)? - .visit_union::(&"header_type", Self::VT_HEADER_TYPE, &"header", Self::VT_HEADER, false, |key, v, pos| { + .visit_field::("version", Self::VT_VERSION, false)? 
+ .visit_union::("header_type", Self::VT_HEADER_TYPE, "header", Self::VT_HEADER, false, |key, v, pos| { match key { MessageHeader::Schema => v.verify_union_variant::>("MessageHeader::Schema", pos), MessageHeader::DictionaryBatch => v.verify_union_variant::>("MessageHeader::DictionaryBatch", pos), @@ -1097,8 +1197,8 @@ impl flatbuffers::Verifiable for Message<'_> { _ => Ok(()), } })? - .visit_field::(&"bodyLength", Self::VT_BODYLENGTH, false)? - .visit_field::>>>(&"custom_metadata", Self::VT_CUSTOM_METADATA, false)? + .visit_field::("bodyLength", Self::VT_BODYLENGTH, false)? + .visit_field::>>>("custom_metadata", Self::VT_CUSTOM_METADATA, false)? .finish(); Ok(()) } @@ -1126,6 +1226,7 @@ impl<'a> Default for MessageArgs<'a> { } } } + pub struct MessageBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -1189,8 +1290,8 @@ impl<'a: 'b, 'b> MessageBuilder<'a, 'b> { } } -impl std::fmt::Debug for Message<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Message<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Message"); ds.field("version", &self.version()); ds.field("header_type", &self.header_type()); @@ -1255,18 +1356,6 @@ impl std::fmt::Debug for Message<'_> { ds.finish() } } -#[inline] -#[deprecated(since = "2.0.0", note = "Deprecated in favor of `root_as...` methods.")] -pub fn get_root_as_message<'a>(buf: &'a [u8]) -> Message<'a> { - unsafe { flatbuffers::root_unchecked::>(buf) } -} - -#[inline] -#[deprecated(since = "2.0.0", note = "Deprecated in favor of `root_as...` methods.")] -pub fn get_size_prefixed_root_as_message<'a>(buf: &'a [u8]) -> Message<'a> { - unsafe { flatbuffers::size_prefixed_root_unchecked::>(buf) } -} - #[inline] /// Verifies that a buffer of bytes contains a `Message` /// and returns it. 
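[Reviewer note, not part of the generated diff] The Message.rs changes follow the same pattern: the deprecated `get_root_as_message` helpers are dropped, the `header_as_*` accessors now perform the `unsafe` union downcast internally, and `EndianScalar` gains an associated `Scalar` type. A minimal sketch of reading and dispatching on a message header under this API; the `root_as_message` name and the `arrow::ipc::gen::Message` module path are assumptions inferred from the surrounding deprecation notes, not guaranteed by this patch:

    // Sketch only: dispatching on a Message header after this change.
    use arrow::ipc::gen::Message::{root_as_message, MessageHeader}; // assumed public path

    fn describe_message(buf: &[u8]) -> Result<String, flatbuffers::InvalidFlatbuffer> {
        // Verified root access; no unsafe needed by the caller.
        let message = root_as_message(buf)?;
        // `header_as_*` now wraps the unsafe `init_from_table` union cast,
        // so this match stays entirely in safe code.
        let summary = match message.header_type() {
            MessageHeader::RecordBatch => {
                let batch = message.header_as_record_batch().expect("checked header type");
                format!("record batch with {} rows", batch.length())
            }
            MessageHeader::DictionaryBatch => {
                let dict = message.header_as_dictionary_batch().expect("checked header type");
                format!("dictionary batch id {}, delta: {}", dict.id(), dict.isDelta())
            }
            other => format!("other header type: {:?}", other),
        };
        Ok(summary)
    }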
diff --git a/arrow/src/ipc/gen/Schema.rs b/arrow/src/ipc/gen/Schema.rs index dd204e0704df..6479bece7213 100644 --- a/arrow/src/ipc/gen/Schema.rs +++ b/arrow/src/ipc/gen/Schema.rs @@ -45,7 +45,7 @@ pub const ENUM_VALUES_METADATA_VERSION: [MetadataVersion; 5] = [ MetadataVersion::V5, ]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct MetadataVersion(pub i16); #[allow(non_upper_case_globals)] @@ -83,8 +83,8 @@ impl MetadataVersion { } } } -impl std::fmt::Debug for MetadataVersion { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for MetadataVersion { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -95,8 +95,8 @@ impl std::fmt::Debug for MetadataVersion { impl<'a> flatbuffers::Follow<'a> for MetadataVersion { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -104,20 +104,21 @@ impl<'a> flatbuffers::Follow<'a> for MetadataVersion { impl flatbuffers::Push for MetadataVersion { type Output = MetadataVersion; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for MetadataVersion { + type Scalar = i16; #[inline] - fn to_little_endian(self) -> Self { - let b = i16::to_le(self.0); - Self(b) + fn to_little_endian(self) -> i16 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = i16::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: i16) -> Self { + let b = i16::from_le(v); Self(b) } } @@ -172,7 +173,7 @@ pub const ENUM_VALUES_FEATURE: [Feature; 3] = [ /// Enums added to this list should be assigned power-of-two values /// to facilitate exchanging and comparing bitmaps for supported /// features. 
-#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct Feature(pub i64); #[allow(non_upper_case_globals)] @@ -204,8 +205,8 @@ impl Feature { } } } -impl std::fmt::Debug for Feature { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for Feature { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -216,8 +217,8 @@ impl std::fmt::Debug for Feature { impl<'a> flatbuffers::Follow<'a> for Feature { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -225,20 +226,21 @@ impl<'a> flatbuffers::Follow<'a> for Feature { impl flatbuffers::Push for Feature { type Output = Feature; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for Feature { + type Scalar = i64; #[inline] - fn to_little_endian(self) -> Self { - let b = i64::to_le(self.0); - Self(b) + fn to_little_endian(self) -> i64 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = i64::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: i64) -> Self { + let b = i64::from_le(v); Self(b) } } @@ -272,7 +274,7 @@ pub const ENUM_MAX_UNION_MODE: i16 = 1; #[allow(non_camel_case_types)] pub const ENUM_VALUES_UNION_MODE: [UnionMode; 2] = [UnionMode::Sparse, UnionMode::Dense]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct UnionMode(pub i16); #[allow(non_upper_case_globals)] @@ -292,8 +294,8 @@ impl UnionMode { } } } -impl std::fmt::Debug for UnionMode { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for UnionMode { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -304,8 +306,8 @@ impl std::fmt::Debug for UnionMode { impl<'a> flatbuffers::Follow<'a> for UnionMode { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -313,20 +315,21 @@ impl<'a> flatbuffers::Follow<'a> for UnionMode { impl flatbuffers::Push for UnionMode { type Output = UnionMode; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for UnionMode { + type Scalar = i16; #[inline] - fn to_little_endian(self) -> Self { - let b = i16::to_le(self.0); - Self(b) + fn to_little_endian(self) -> i16 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = i16::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: i16) -> Self { + let b = i16::from_le(v); 
Self(b) } } @@ -361,7 +364,7 @@ pub const ENUM_MAX_PRECISION: i16 = 2; pub const ENUM_VALUES_PRECISION: [Precision; 3] = [Precision::HALF, Precision::SINGLE, Precision::DOUBLE]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct Precision(pub i16); #[allow(non_upper_case_globals)] @@ -383,8 +386,8 @@ impl Precision { } } } -impl std::fmt::Debug for Precision { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for Precision { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -395,8 +398,8 @@ impl std::fmt::Debug for Precision { impl<'a> flatbuffers::Follow<'a> for Precision { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -404,20 +407,21 @@ impl<'a> flatbuffers::Follow<'a> for Precision { impl flatbuffers::Push for Precision { type Output = Precision; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for Precision { + type Scalar = i16; #[inline] - fn to_little_endian(self) -> Self { - let b = i16::to_le(self.0); - Self(b) + fn to_little_endian(self) -> i16 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = i16::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: i16) -> Self { + let b = i16::from_le(v); Self(b) } } @@ -451,7 +455,7 @@ pub const ENUM_MAX_DATE_UNIT: i16 = 1; #[allow(non_camel_case_types)] pub const ENUM_VALUES_DATE_UNIT: [DateUnit; 2] = [DateUnit::DAY, DateUnit::MILLISECOND]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct DateUnit(pub i16); #[allow(non_upper_case_globals)] @@ -471,8 +475,8 @@ impl DateUnit { } } } -impl std::fmt::Debug for DateUnit { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for DateUnit { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -483,8 +487,8 @@ impl std::fmt::Debug for DateUnit { impl<'a> flatbuffers::Follow<'a> for DateUnit { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -492,20 +496,21 @@ impl<'a> flatbuffers::Follow<'a> for DateUnit { impl flatbuffers::Push for DateUnit { type Output = DateUnit; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for DateUnit { + type Scalar = i16; #[inline] - fn to_little_endian(self) -> Self { - let b = i16::to_le(self.0); - Self(b) + fn to_little_endian(self) -> i16 { + self.0.to_le() } 
#[inline] - fn from_little_endian(self) -> Self { - let b = i16::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: i16) -> Self { + let b = i16::from_le(v); Self(b) } } @@ -544,7 +549,7 @@ pub const ENUM_VALUES_TIME_UNIT: [TimeUnit; 4] = [ TimeUnit::NANOSECOND, ]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct TimeUnit(pub i16); #[allow(non_upper_case_globals)] @@ -573,8 +578,8 @@ impl TimeUnit { } } } -impl std::fmt::Debug for TimeUnit { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for TimeUnit { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -585,8 +590,8 @@ impl std::fmt::Debug for TimeUnit { impl<'a> flatbuffers::Follow<'a> for TimeUnit { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -594,20 +599,21 @@ impl<'a> flatbuffers::Follow<'a> for TimeUnit { impl flatbuffers::Push for TimeUnit { type Output = TimeUnit; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for TimeUnit { + type Scalar = i16; #[inline] - fn to_little_endian(self) -> Self { - let b = i16::to_le(self.0); - Self(b) + fn to_little_endian(self) -> i16 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = i16::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: i16) -> Self { + let b = i16::from_le(v); Self(b) } } @@ -633,7 +639,7 @@ pub const ENUM_MIN_INTERVAL_UNIT: i16 = 0; since = "2.0.0", note = "Use associated constants instead. This will no longer be generated in 2021." )] -pub const ENUM_MAX_INTERVAL_UNIT: i16 = 1; +pub const ENUM_MAX_INTERVAL_UNIT: i16 = 2; #[deprecated( since = "2.0.0", note = "Use associated constants instead. This will no longer be generated in 2021." 
@@ -645,7 +651,7 @@ pub const ENUM_VALUES_INTERVAL_UNIT: [IntervalUnit; 3] = [ IntervalUnit::MONTH_DAY_NANO, ]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct IntervalUnit(pub i16); #[allow(non_upper_case_globals)] @@ -668,8 +674,8 @@ impl IntervalUnit { } } } -impl std::fmt::Debug for IntervalUnit { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for IntervalUnit { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -680,8 +686,8 @@ impl std::fmt::Debug for IntervalUnit { impl<'a> flatbuffers::Follow<'a> for IntervalUnit { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -689,20 +695,21 @@ impl<'a> flatbuffers::Follow<'a> for IntervalUnit { impl flatbuffers::Push for IntervalUnit { type Output = IntervalUnit; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for IntervalUnit { + type Scalar = i16; #[inline] - fn to_little_endian(self) -> Self { - let b = i16::to_le(self.0); - Self(b) + fn to_little_endian(self) -> i16 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = i16::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: i16) -> Self { + let b = i16::from_le(v); Self(b) } } @@ -762,7 +769,7 @@ pub const ENUM_VALUES_TYPE: [Type; 22] = [ /// ---------------------------------------------------------------------- /// Top-level Type value, enabling extensible type-specific metadata. 
We can /// add new logical types to Type without breaking backwards compatibility -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct Type(pub u8); #[allow(non_upper_case_globals)] @@ -845,8 +852,8 @@ impl Type { } } } -impl std::fmt::Debug for Type { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for Type { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -854,12 +861,11 @@ impl std::fmt::Debug for Type { } } } -pub struct TypeUnionTableOffset {} impl<'a> flatbuffers::Follow<'a> for Type { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -867,20 +873,21 @@ impl<'a> flatbuffers::Follow<'a> for Type { impl flatbuffers::Push for Type { type Output = Type; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for Type { + type Scalar = u8; #[inline] - fn to_little_endian(self) -> Self { - let b = u8::to_le(self.0); - Self(b) + fn to_little_endian(self) -> u8 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = u8::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: u8) -> Self { + let b = u8::from_le(v); Self(b) } } @@ -897,6 +904,8 @@ impl<'a> flatbuffers::Verifiable for Type { } impl flatbuffers::SimpleToVerifyInSlice for Type {} +pub struct TypeUnionTableOffset {} + #[deprecated( since = "2.0.0", note = "Use associated constants instead. This will no longer be generated in 2021." 
@@ -919,7 +928,7 @@ pub const ENUM_VALUES_DICTIONARY_KIND: [DictionaryKind; 1] = [DictionaryKind::De /// Maintained for forwards compatibility, in the future /// Dictionaries might be explicit maps between integers and values /// allowing for non-contiguous index values -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct DictionaryKind(pub i16); #[allow(non_upper_case_globals)] @@ -937,8 +946,8 @@ impl DictionaryKind { } } } -impl std::fmt::Debug for DictionaryKind { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for DictionaryKind { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -949,8 +958,8 @@ impl std::fmt::Debug for DictionaryKind { impl<'a> flatbuffers::Follow<'a> for DictionaryKind { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -958,20 +967,21 @@ impl<'a> flatbuffers::Follow<'a> for DictionaryKind { impl flatbuffers::Push for DictionaryKind { type Output = DictionaryKind; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for DictionaryKind { + type Scalar = i16; #[inline] - fn to_little_endian(self) -> Self { - let b = i16::to_le(self.0); - Self(b) + fn to_little_endian(self) -> i16 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = i16::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: i16) -> Self { + let b = i16::from_le(v); Self(b) } } @@ -1007,7 +1017,7 @@ pub const ENUM_VALUES_ENDIANNESS: [Endianness; 2] = [Endianness::Little, Endiann /// ---------------------------------------------------------------------- /// Endianness of the platform producing the data -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct Endianness(pub i16); #[allow(non_upper_case_globals)] @@ -1027,8 +1037,8 @@ impl Endianness { } } } -impl std::fmt::Debug for Endianness { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for Endianness { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -1039,8 +1049,8 @@ impl std::fmt::Debug for Endianness { impl<'a> flatbuffers::Follow<'a> for Endianness { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -1048,20 +1058,21 @@ impl<'a> flatbuffers::Follow<'a> for Endianness { impl flatbuffers::Push for Endianness { type Output = Endianness; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } 
impl flatbuffers::EndianScalar for Endianness { + type Scalar = i16; #[inline] - fn to_little_endian(self) -> Self { - let b = i16::to_le(self.0); - Self(b) + fn to_little_endian(self) -> i16 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = i16::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: i16) -> Self { + let b = i16::from_le(v); Self(b) } } @@ -1084,8 +1095,13 @@ impl flatbuffers::SimpleToVerifyInSlice for Endianness {} #[repr(transparent)] #[derive(Clone, Copy, PartialEq)] pub struct Buffer(pub [u8; 16]); -impl std::fmt::Debug for Buffer { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl Default for Buffer { + fn default() -> Self { + Self([0; 16]) + } +} +impl core::fmt::Debug for Buffer { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { f.debug_struct("Buffer") .field("offset", &self.offset()) .field("length", &self.length()) @@ -1094,42 +1110,28 @@ impl std::fmt::Debug for Buffer { } impl flatbuffers::SimpleToVerifyInSlice for Buffer {} -impl flatbuffers::SafeSliceAccess for Buffer {} impl<'a> flatbuffers::Follow<'a> for Buffer { type Inner = &'a Buffer; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { <&'a Buffer>::follow(buf, loc) } } impl<'a> flatbuffers::Follow<'a> for &'a Buffer { type Inner = &'a Buffer; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { flatbuffers::follow_cast_ref::(buf, loc) } } impl<'b> flatbuffers::Push for Buffer { type Output = Buffer; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - let src = unsafe { - ::std::slice::from_raw_parts(self as *const Buffer as *const u8, Self::size()) - }; - dst.copy_from_slice(src); - } -} -impl<'b> flatbuffers::Push for &'b Buffer { - type Output = Buffer; - - #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - let src = unsafe { - ::std::slice::from_raw_parts( - *self as *const Buffer as *const u8, - Self::size(), - ) - }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + let src = ::core::slice::from_raw_parts( + self as *const Buffer as *const u8, + Self::size(), + ); dst.copy_from_slice(src); } } @@ -1144,7 +1146,8 @@ impl<'a> flatbuffers::Verifiable for Buffer { v.in_buffer::(pos) } } -impl Buffer { + +impl<'a> Buffer { #[allow(clippy::too_many_arguments)] pub fn new(offset: i64, length: i64) -> Self { let mut s = Self([0; 16]); @@ -1156,25 +1159,30 @@ impl Buffer { /// The relative offset into the shared memory page where the bytes for this /// buffer starts pub fn offset(&self) -> i64 { - let mut mem = core::mem::MaybeUninit::::uninit(); - unsafe { + let mut mem = core::mem::MaybeUninit::<::Scalar>::uninit(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot + EndianScalar::from_little_endian(unsafe { core::ptr::copy_nonoverlapping( self.0[0..].as_ptr(), mem.as_mut_ptr() as *mut u8, - core::mem::size_of::(), + core::mem::size_of::<::Scalar>(), ); mem.assume_init() - } - .from_little_endian() + }) } pub fn set_offset(&mut self, x: i64) { let x_le = x.to_little_endian(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot unsafe { core::ptr::copy_nonoverlapping( - &x_le as *const i64 as *const u8, + &x_le as *const _ as *const u8, self.0[0..].as_mut_ptr(), - core::mem::size_of::(), + 
core::mem::size_of::<::Scalar>(), ); } } @@ -1185,25 +1193,30 @@ impl Buffer { /// after a buffer, but such padding bytes do not need to be accounted for in /// the size here. pub fn length(&self) -> i64 { - let mut mem = core::mem::MaybeUninit::::uninit(); - unsafe { + let mut mem = core::mem::MaybeUninit::<::Scalar>::uninit(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot + EndianScalar::from_little_endian(unsafe { core::ptr::copy_nonoverlapping( self.0[8..].as_ptr(), mem.as_mut_ptr() as *mut u8, - core::mem::size_of::(), + core::mem::size_of::<::Scalar>(), ); mem.assume_init() - } - .from_little_endian() + }) } pub fn set_length(&mut self, x: i64) { let x_le = x.to_little_endian(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid value in this slot unsafe { core::ptr::copy_nonoverlapping( - &x_le as *const i64 as *const u8, + &x_le as *const _ as *const u8, self.0[8..].as_mut_ptr(), - core::mem::size_of::(), + core::mem::size_of::<::Scalar>(), ); } } @@ -1220,16 +1233,16 @@ pub struct Null<'a> { impl<'a> flatbuffers::Follow<'a> for Null<'a> { type Inner = Null<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Null<'a> { #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Null { _tab: table } } #[allow(unused_mut)] @@ -1260,6 +1273,7 @@ impl<'a> Default for NullArgs { NullArgs {} } } + pub struct NullBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -1280,8 +1294,8 @@ impl<'a: 'b, 'b> NullBuilder<'a, 'b> { } } -impl std::fmt::Debug for Null<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Null<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Null"); ds.finish() } @@ -1299,16 +1313,16 @@ pub struct Struct_<'a> { impl<'a> flatbuffers::Follow<'a> for Struct_<'a> { type Inner = Struct_<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Struct_<'a> { #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Struct_ { _tab: table } } #[allow(unused_mut)] @@ -1339,6 +1353,7 @@ impl<'a> Default for Struct_Args { Struct_Args {} } } + pub struct Struct_Builder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -1361,8 +1376,8 @@ impl<'a: 'b, 'b> Struct_Builder<'a, 'b> { } } -impl std::fmt::Debug for Struct_<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Struct_<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Struct_"); ds.finish() } @@ -1377,16 +1392,16 @@ pub struct List<'a> { impl<'a> flatbuffers::Follow<'a> for List<'a> { type Inner = List<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { 
buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> List<'a> { #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { List { _tab: table } } #[allow(unused_mut)] @@ -1417,6 +1432,7 @@ impl<'a> Default for ListArgs { ListArgs {} } } + pub struct ListBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -1437,8 +1453,8 @@ impl<'a: 'b, 'b> ListBuilder<'a, 'b> { } } -impl std::fmt::Debug for List<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for List<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("List"); ds.finish() } @@ -1455,16 +1471,16 @@ pub struct LargeList<'a> { impl<'a> flatbuffers::Follow<'a> for LargeList<'a> { type Inner = LargeList<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> LargeList<'a> { #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { LargeList { _tab: table } } #[allow(unused_mut)] @@ -1495,6 +1511,7 @@ impl<'a> Default for LargeListArgs { LargeListArgs {} } } + pub struct LargeListBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -1517,8 +1534,8 @@ impl<'a: 'b, 'b> LargeListBuilder<'a, 'b> { } } -impl std::fmt::Debug for LargeList<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for LargeList<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("LargeList"); ds.finish() } @@ -1533,16 +1550,18 @@ pub struct FixedSizeList<'a> { impl<'a> flatbuffers::Follow<'a> for FixedSizeList<'a> { type Inner = FixedSizeList<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> FixedSizeList<'a> { + pub const VT_LISTSIZE: flatbuffers::VOffsetT = 4; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { FixedSizeList { _tab: table } } #[allow(unused_mut)] @@ -1555,14 +1574,17 @@ impl<'a> FixedSizeList<'a> { builder.finish() } - pub const VT_LISTSIZE: flatbuffers::VOffsetT = 4; - /// Number of list items per value #[inline] pub fn listSize(&self) -> i32 { - self._tab - .get::(FixedSizeList::VT_LISTSIZE, Some(0)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(FixedSizeList::VT_LISTSIZE, Some(0)) + .unwrap() + } } } @@ -1574,7 +1596,7 @@ impl flatbuffers::Verifiable for FixedSizeList<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"listSize", Self::VT_LISTSIZE, false)? + .visit_field::("listSize", Self::VT_LISTSIZE, false)? 
.finish(); Ok(()) } @@ -1588,6 +1610,7 @@ impl<'a> Default for FixedSizeListArgs { FixedSizeListArgs { listSize: 0 } } } + pub struct FixedSizeListBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -1615,8 +1638,8 @@ impl<'a: 'b, 'b> FixedSizeListBuilder<'a, 'b> { } } -impl std::fmt::Debug for FixedSizeList<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for FixedSizeList<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("FixedSizeList"); ds.field("listSize", &self.listSize()); ds.finish() @@ -1657,16 +1680,18 @@ pub struct Map<'a> { impl<'a> flatbuffers::Follow<'a> for Map<'a> { type Inner = Map<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Map<'a> { + pub const VT_KEYSSORTED: flatbuffers::VOffsetT = 4; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Map { _tab: table } } #[allow(unused_mut)] @@ -1679,14 +1704,17 @@ impl<'a> Map<'a> { builder.finish() } - pub const VT_KEYSSORTED: flatbuffers::VOffsetT = 4; - /// Set to true if the keys within each value are sorted #[inline] pub fn keysSorted(&self) -> bool { - self._tab - .get::(Map::VT_KEYSSORTED, Some(false)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Map::VT_KEYSSORTED, Some(false)) + .unwrap() + } } } @@ -1698,7 +1726,7 @@ impl flatbuffers::Verifiable for Map<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"keysSorted", Self::VT_KEYSSORTED, false)? + .visit_field::("keysSorted", Self::VT_KEYSSORTED, false)? 
.finish(); Ok(()) } @@ -1712,6 +1740,7 @@ impl<'a> Default for MapArgs { MapArgs { keysSorted: false } } } + pub struct MapBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -1737,8 +1766,8 @@ impl<'a: 'b, 'b> MapBuilder<'a, 'b> { } } -impl std::fmt::Debug for Map<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Map<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Map"); ds.field("keysSorted", &self.keysSorted()); ds.finish() @@ -1758,16 +1787,19 @@ pub struct Union<'a> { impl<'a> flatbuffers::Follow<'a> for Union<'a> { type Inner = Union<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Union<'a> { + pub const VT_MODE: flatbuffers::VOffsetT = 4; + pub const VT_TYPEIDS: flatbuffers::VOffsetT = 6; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Union { _tab: table } } #[allow(unused_mut)] @@ -1783,22 +1815,29 @@ impl<'a> Union<'a> { builder.finish() } - pub const VT_MODE: flatbuffers::VOffsetT = 4; - pub const VT_TYPEIDS: flatbuffers::VOffsetT = 6; - #[inline] pub fn mode(&self) -> UnionMode { - self._tab - .get::(Union::VT_MODE, Some(UnionMode::Sparse)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Union::VT_MODE, Some(UnionMode::Sparse)) + .unwrap() + } } #[inline] pub fn typeIds(&self) -> Option> { - self._tab - .get::>>( - Union::VT_TYPEIDS, - None, - ) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + Union::VT_TYPEIDS, + None, + ) + } } } @@ -1810,9 +1849,9 @@ impl flatbuffers::Verifiable for Union<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"mode", Self::VT_MODE, false)? + .visit_field::("mode", Self::VT_MODE, false)? .visit_field::>>( - &"typeIds", + "typeIds", Self::VT_TYPEIDS, false, )? 
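
The accessors changed above (Buffer::length, Buffer::set_length, FixedSizeList::listSize, Map::keysSorted) all follow the same pattern: the raw flatbuffers table read moves inside an `unsafe` block carrying an explicit safety comment, while the public method itself stays safe. Below is a minimal standalone sketch of the little-endian read that Buffer::length performs with MaybeUninit and copy_nonoverlapping; it assumes nothing from the flatbuffers crate, i64::from_le stands in for EndianScalar::from_little_endian, and the function name read_i64_le is purely illustrative.

    /// Read an i64 stored little-endian at the start of `bytes`.
    fn read_i64_le(bytes: &[u8]) -> i64 {
        assert!(bytes.len() >= 8, "need at least 8 bytes");
        let mut mem = core::mem::MaybeUninit::<i64>::uninit();
        // Safety: both regions are exactly 8 bytes, they cannot overlap, and
        // `bytes` was bounds-checked above, so the copy and `assume_init` are sound.
        let raw = unsafe {
            core::ptr::copy_nonoverlapping(
                bytes.as_ptr(),
                mem.as_mut_ptr() as *mut u8,
                core::mem::size_of::<i64>(),
            );
            mem.assume_init()
        };
        i64::from_le(raw)
    }

    fn main() {
        assert_eq!(read_i64_le(&42i64.to_le_bytes()), 42);
    }

Keeping the `unsafe` inside the generated accessor means downstream users of the IPC readers never have to reason about these invariants themselves.
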
@@ -1833,6 +1872,7 @@ impl<'a> Default for UnionArgs<'a> { } } } + pub struct UnionBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -1866,8 +1906,8 @@ impl<'a: 'b, 'b> UnionBuilder<'a, 'b> { } } -impl std::fmt::Debug for Union<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Union<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Union"); ds.field("mode", &self.mode()); ds.field("typeIds", &self.typeIds()); @@ -1884,16 +1924,19 @@ pub struct Int<'a> { impl<'a> flatbuffers::Follow<'a> for Int<'a> { type Inner = Int<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Int<'a> { + pub const VT_BITWIDTH: flatbuffers::VOffsetT = 4; + pub const VT_IS_SIGNED: flatbuffers::VOffsetT = 6; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Int { _tab: table } } #[allow(unused_mut)] @@ -1907,18 +1950,23 @@ impl<'a> Int<'a> { builder.finish() } - pub const VT_BITWIDTH: flatbuffers::VOffsetT = 4; - pub const VT_IS_SIGNED: flatbuffers::VOffsetT = 6; - #[inline] pub fn bitWidth(&self) -> i32 { - self._tab.get::(Int::VT_BITWIDTH, Some(0)).unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { self._tab.get::(Int::VT_BITWIDTH, Some(0)).unwrap() } } #[inline] pub fn is_signed(&self) -> bool { - self._tab - .get::(Int::VT_IS_SIGNED, Some(false)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Int::VT_IS_SIGNED, Some(false)) + .unwrap() + } } } @@ -1930,8 +1978,8 @@ impl flatbuffers::Verifiable for Int<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"bitWidth", Self::VT_BITWIDTH, false)? - .visit_field::(&"is_signed", Self::VT_IS_SIGNED, false)? + .visit_field::("bitWidth", Self::VT_BITWIDTH, false)? + .visit_field::("is_signed", Self::VT_IS_SIGNED, false)? 
.finish(); Ok(()) } @@ -1949,6 +1997,7 @@ impl<'a> Default for IntArgs { } } } + pub struct IntBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -1978,8 +2027,8 @@ impl<'a: 'b, 'b> IntBuilder<'a, 'b> { } } -impl std::fmt::Debug for Int<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Int<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Int"); ds.field("bitWidth", &self.bitWidth()); ds.field("is_signed", &self.is_signed()); @@ -1996,16 +2045,18 @@ pub struct FloatingPoint<'a> { impl<'a> flatbuffers::Follow<'a> for FloatingPoint<'a> { type Inner = FloatingPoint<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> FloatingPoint<'a> { + pub const VT_PRECISION: flatbuffers::VOffsetT = 4; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { FloatingPoint { _tab: table } } #[allow(unused_mut)] @@ -2018,13 +2069,16 @@ impl<'a> FloatingPoint<'a> { builder.finish() } - pub const VT_PRECISION: flatbuffers::VOffsetT = 4; - #[inline] pub fn precision(&self) -> Precision { - self._tab - .get::(FloatingPoint::VT_PRECISION, Some(Precision::HALF)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(FloatingPoint::VT_PRECISION, Some(Precision::HALF)) + .unwrap() + } } } @@ -2036,7 +2090,7 @@ impl flatbuffers::Verifiable for FloatingPoint<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"precision", Self::VT_PRECISION, false)? + .visit_field::("precision", Self::VT_PRECISION, false)? 
.finish(); Ok(()) } @@ -2052,6 +2106,7 @@ impl<'a> Default for FloatingPointArgs { } } } + pub struct FloatingPointBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -2082,8 +2137,8 @@ impl<'a: 'b, 'b> FloatingPointBuilder<'a, 'b> { } } -impl std::fmt::Debug for FloatingPoint<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for FloatingPoint<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("FloatingPoint"); ds.field("precision", &self.precision()); ds.finish() @@ -2100,16 +2155,16 @@ pub struct Utf8<'a> { impl<'a> flatbuffers::Follow<'a> for Utf8<'a> { type Inner = Utf8<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Utf8<'a> { #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Utf8 { _tab: table } } #[allow(unused_mut)] @@ -2140,6 +2195,7 @@ impl<'a> Default for Utf8Args { Utf8Args {} } } + pub struct Utf8Builder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -2160,8 +2216,8 @@ impl<'a: 'b, 'b> Utf8Builder<'a, 'b> { } } -impl std::fmt::Debug for Utf8<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Utf8<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Utf8"); ds.finish() } @@ -2177,16 +2233,16 @@ pub struct Binary<'a> { impl<'a> flatbuffers::Follow<'a> for Binary<'a> { type Inner = Binary<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Binary<'a> { #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Binary { _tab: table } } #[allow(unused_mut)] @@ -2217,6 +2273,7 @@ impl<'a> Default for BinaryArgs { BinaryArgs {} } } + pub struct BinaryBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -2239,8 +2296,8 @@ impl<'a: 'b, 'b> BinaryBuilder<'a, 'b> { } } -impl std::fmt::Debug for Binary<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Binary<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Binary"); ds.finish() } @@ -2257,16 +2314,16 @@ pub struct LargeUtf8<'a> { impl<'a> flatbuffers::Follow<'a> for LargeUtf8<'a> { type Inner = LargeUtf8<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> LargeUtf8<'a> { #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { LargeUtf8 { _tab: table } } #[allow(unused_mut)] @@ -2297,6 +2354,7 @@ impl<'a> Default for LargeUtf8Args { LargeUtf8Args {} } } + pub struct LargeUtf8Builder<'a: 'b, 'b> { fbb_: &'b mut 
flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -2319,8 +2377,8 @@ impl<'a: 'b, 'b> LargeUtf8Builder<'a, 'b> { } } -impl std::fmt::Debug for LargeUtf8<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for LargeUtf8<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("LargeUtf8"); ds.finish() } @@ -2337,16 +2395,16 @@ pub struct LargeBinary<'a> { impl<'a> flatbuffers::Follow<'a> for LargeBinary<'a> { type Inner = LargeBinary<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> LargeBinary<'a> { #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { LargeBinary { _tab: table } } #[allow(unused_mut)] @@ -2377,6 +2435,7 @@ impl<'a> Default for LargeBinaryArgs { LargeBinaryArgs {} } } + pub struct LargeBinaryBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -2399,8 +2458,8 @@ impl<'a: 'b, 'b> LargeBinaryBuilder<'a, 'b> { } } -impl std::fmt::Debug for LargeBinary<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for LargeBinary<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("LargeBinary"); ds.finish() } @@ -2415,16 +2474,18 @@ pub struct FixedSizeBinary<'a> { impl<'a> flatbuffers::Follow<'a> for FixedSizeBinary<'a> { type Inner = FixedSizeBinary<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> FixedSizeBinary<'a> { + pub const VT_BYTEWIDTH: flatbuffers::VOffsetT = 4; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { FixedSizeBinary { _tab: table } } #[allow(unused_mut)] @@ -2437,14 +2498,17 @@ impl<'a> FixedSizeBinary<'a> { builder.finish() } - pub const VT_BYTEWIDTH: flatbuffers::VOffsetT = 4; - /// Number of bytes per value #[inline] pub fn byteWidth(&self) -> i32 { - self._tab - .get::(FixedSizeBinary::VT_BYTEWIDTH, Some(0)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(FixedSizeBinary::VT_BYTEWIDTH, Some(0)) + .unwrap() + } } } @@ -2456,7 +2520,7 @@ impl flatbuffers::Verifiable for FixedSizeBinary<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"byteWidth", Self::VT_BYTEWIDTH, false)? + .visit_field::("byteWidth", Self::VT_BYTEWIDTH, false)? 
.finish(); Ok(()) } @@ -2470,6 +2534,7 @@ impl<'a> Default for FixedSizeBinaryArgs { FixedSizeBinaryArgs { byteWidth: 0 } } } + pub struct FixedSizeBinaryBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -2497,8 +2562,8 @@ impl<'a: 'b, 'b> FixedSizeBinaryBuilder<'a, 'b> { } } -impl std::fmt::Debug for FixedSizeBinary<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for FixedSizeBinary<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("FixedSizeBinary"); ds.field("byteWidth", &self.byteWidth()); ds.finish() @@ -2514,16 +2579,16 @@ pub struct Bool<'a> { impl<'a> flatbuffers::Follow<'a> for Bool<'a> { type Inner = Bool<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Bool<'a> { #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Bool { _tab: table } } #[allow(unused_mut)] @@ -2554,6 +2619,7 @@ impl<'a> Default for BoolArgs { BoolArgs {} } } + pub struct BoolBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -2574,8 +2640,8 @@ impl<'a: 'b, 'b> BoolBuilder<'a, 'b> { } } -impl std::fmt::Debug for Bool<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Bool<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Bool"); ds.finish() } @@ -2594,16 +2660,20 @@ pub struct Decimal<'a> { impl<'a> flatbuffers::Follow<'a> for Decimal<'a> { type Inner = Decimal<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Decimal<'a> { + pub const VT_PRECISION: flatbuffers::VOffsetT = 4; + pub const VT_SCALE: flatbuffers::VOffsetT = 6; + pub const VT_BITWIDTH: flatbuffers::VOffsetT = 8; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Decimal { _tab: table } } #[allow(unused_mut)] @@ -2618,29 +2688,38 @@ impl<'a> Decimal<'a> { builder.finish() } - pub const VT_PRECISION: flatbuffers::VOffsetT = 4; - pub const VT_SCALE: flatbuffers::VOffsetT = 6; - pub const VT_BITWIDTH: flatbuffers::VOffsetT = 8; - /// Total number of decimal digits #[inline] pub fn precision(&self) -> i32 { - self._tab - .get::(Decimal::VT_PRECISION, Some(0)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Decimal::VT_PRECISION, Some(0)) + .unwrap() + } } /// Number of digits after the decimal point "." #[inline] pub fn scale(&self) -> i32 { - self._tab.get::(Decimal::VT_SCALE, Some(0)).unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { self._tab.get::(Decimal::VT_SCALE, Some(0)).unwrap() } } /// Number of bits per value. The only accepted widths are 128 and 256. /// We use bitWidth for consistency with Int::bitWidth. 
#[inline] pub fn bitWidth(&self) -> i32 { - self._tab - .get::(Decimal::VT_BITWIDTH, Some(128)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Decimal::VT_BITWIDTH, Some(128)) + .unwrap() + } } } @@ -2652,9 +2731,9 @@ impl flatbuffers::Verifiable for Decimal<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"precision", Self::VT_PRECISION, false)? - .visit_field::(&"scale", Self::VT_SCALE, false)? - .visit_field::(&"bitWidth", Self::VT_BITWIDTH, false)? + .visit_field::("precision", Self::VT_PRECISION, false)? + .visit_field::("scale", Self::VT_SCALE, false)? + .visit_field::("bitWidth", Self::VT_BITWIDTH, false)? .finish(); Ok(()) } @@ -2674,6 +2753,7 @@ impl<'a> Default for DecimalArgs { } } } + pub struct DecimalBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -2710,8 +2790,8 @@ impl<'a: 'b, 'b> DecimalBuilder<'a, 'b> { } } -impl std::fmt::Debug for Decimal<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Decimal<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Decimal"); ds.field("precision", &self.precision()); ds.field("scale", &self.scale()); @@ -2722,8 +2802,8 @@ impl std::fmt::Debug for Decimal<'_> { pub enum DateOffset {} #[derive(Copy, Clone, PartialEq)] -/// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX -/// epoch (1970-01-01), stored in either of two units: +/// Date is either a 32-bit or 64-bit signed integer type representing an +/// elapsed time since UNIX epoch (1970-01-01), stored in either of two units: /// /// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no /// leap seconds), where the values are evenly divisible by 86400000 @@ -2735,16 +2815,18 @@ pub struct Date<'a> { impl<'a> flatbuffers::Follow<'a> for Date<'a> { type Inner = Date<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Date<'a> { + pub const VT_UNIT: flatbuffers::VOffsetT = 4; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Date { _tab: table } } #[allow(unused_mut)] @@ -2757,13 +2839,16 @@ impl<'a> Date<'a> { builder.finish() } - pub const VT_UNIT: flatbuffers::VOffsetT = 4; - #[inline] pub fn unit(&self) -> DateUnit { - self._tab - .get::(Date::VT_UNIT, Some(DateUnit::MILLISECOND)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Date::VT_UNIT, Some(DateUnit::MILLISECOND)) + .unwrap() + } } } @@ -2775,7 +2860,7 @@ impl flatbuffers::Verifiable for Date<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"unit", Self::VT_UNIT, false)? + .visit_field::("unit", Self::VT_UNIT, false)? 
.finish(); Ok(()) } @@ -2791,6 +2876,7 @@ impl<'a> Default for DateArgs { } } } + pub struct DateBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -2816,8 +2902,8 @@ impl<'a: 'b, 'b> DateBuilder<'a, 'b> { } } -impl std::fmt::Debug for Date<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Date<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Date"); ds.field("unit", &self.unit()); ds.finish() @@ -2826,9 +2912,20 @@ impl std::fmt::Debug for Date<'_> { pub enum TimeOffset {} #[derive(Copy, Clone, PartialEq)] -/// Time type. The physical storage type depends on the unit -/// - SECOND and MILLISECOND: 32 bits -/// - MICROSECOND and NANOSECOND: 64 bits +/// Time is either a 32-bit or 64-bit signed integer type representing an +/// elapsed time since midnight, stored in either of four units: seconds, +/// milliseconds, microseconds or nanoseconds. +/// +/// The integer `bitWidth` depends on the `unit` and must be one of the following: +/// * SECOND and MILLISECOND: 32 bits +/// * MICROSECOND and NANOSECOND: 64 bits +/// +/// The allowed values are between 0 (inclusive) and 86400 (=24*60*60) seconds +/// (exclusive), adjusted for the time unit (for example, up to 86400000 +/// exclusive for the MILLISECOND unit). +/// This definition doesn't allow for leap seconds. Time values from +/// measurements with leap seconds will need to be corrected when ingesting +/// into Arrow (for example by replacing the value 86400 with 86399). pub struct Time<'a> { pub _tab: flatbuffers::Table<'a>, } @@ -2836,16 +2933,19 @@ pub struct Time<'a> { impl<'a> flatbuffers::Follow<'a> for Time<'a> { type Inner = Time<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Time<'a> { + pub const VT_UNIT: flatbuffers::VOffsetT = 4; + pub const VT_BITWIDTH: flatbuffers::VOffsetT = 6; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Time { _tab: table } } #[allow(unused_mut)] @@ -2859,18 +2959,23 @@ impl<'a> Time<'a> { builder.finish() } - pub const VT_UNIT: flatbuffers::VOffsetT = 4; - pub const VT_BITWIDTH: flatbuffers::VOffsetT = 6; - #[inline] pub fn unit(&self) -> TimeUnit { - self._tab - .get::(Time::VT_UNIT, Some(TimeUnit::MILLISECOND)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Time::VT_UNIT, Some(TimeUnit::MILLISECOND)) + .unwrap() + } } #[inline] pub fn bitWidth(&self) -> i32 { - self._tab.get::(Time::VT_BITWIDTH, Some(32)).unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { self._tab.get::(Time::VT_BITWIDTH, Some(32)).unwrap() } } } @@ -2882,8 +2987,8 @@ impl flatbuffers::Verifiable for Time<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"unit", Self::VT_UNIT, false)? - .visit_field::(&"bitWidth", Self::VT_BITWIDTH, false)? + .visit_field::("unit", Self::VT_UNIT, false)? + .visit_field::("bitWidth", Self::VT_BITWIDTH, false)? 
.finish(); Ok(()) } @@ -2901,6 +3006,7 @@ impl<'a> Default for TimeArgs { } } } + pub struct TimeBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -2930,8 +3036,8 @@ impl<'a: 'b, 'b> TimeBuilder<'a, 'b> { } } -impl std::fmt::Debug for Time<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Time<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Time"); ds.field("unit", &self.unit()); ds.field("bitWidth", &self.bitWidth()); @@ -2941,12 +3047,111 @@ impl std::fmt::Debug for Time<'_> { pub enum TimestampOffset {} #[derive(Copy, Clone, PartialEq)] -/// Time elapsed from the Unix epoch, 00:00:00.000 on 1 January 1970, excluding -/// leap seconds, as a 64-bit integer. Note that UNIX time does not include -/// leap seconds. +/// Timestamp is a 64-bit signed integer representing an elapsed time since a +/// fixed epoch, stored in either of four units: seconds, milliseconds, +/// microseconds or nanoseconds, and is optionally annotated with a timezone. +/// +/// Timestamp values do not include any leap seconds (in other words, all +/// days are considered 86400 seconds long). +/// +/// Timestamps with a non-empty timezone +/// ------------------------------------ +/// +/// If a Timestamp column has a non-empty timezone value, its epoch is +/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone +/// (the Unix epoch), regardless of the Timestamp's own timezone. +/// +/// Therefore, timestamp values with a non-empty timezone correspond to +/// physical points in time together with some additional information about +/// how the data was obtained and/or how to display it (the timezone). +/// +/// For example, the timestamp value 0 with the timezone string "Europe/Paris" +/// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the +/// application may prefer to display it as "January 1st 1970, 01h00" in +/// the Europe/Paris timezone (which is the same physical point in time). +/// +/// One consequence is that timestamp values with a non-empty timezone +/// can be compared and ordered directly, since they all share the same +/// well-known point of reference (the Unix epoch). +/// +/// Timestamps with an unset / empty timezone +/// ----------------------------------------- +/// +/// If a Timestamp column has no timezone value, its epoch is +/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone. +/// +/// Therefore, timestamp values without a timezone cannot be meaningfully +/// interpreted as physical points in time, but only as calendar / clock +/// indications ("wall clock time") in an unspecified timezone. +/// +/// For example, the timestamp value 0 with an empty timezone string +/// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there +/// is not enough information to interpret it as a well-defined physical +/// point in time. +/// +/// One consequence is that timestamp values without a timezone cannot +/// be reliably compared or ordered, since they may have different points of +/// reference. In particular, it is *not* possible to interpret an unset +/// or empty timezone as the same as "UTC". +/// +/// Conversion between timezones +/// ---------------------------- /// -/// The Timestamp metadata supports both "time zone naive" and "time zone -/// aware" timestamps. 
Read about the timezone attribute for more detail +/// If a Timestamp column has a non-empty timezone, changing the timezone +/// to a different non-empty value is a metadata-only operation: +/// the timestamp values need not change as their point of reference remains +/// the same (the Unix epoch). +/// +/// However, if a Timestamp column has no timezone value, changing it to a +/// non-empty value requires to think about the desired semantics. +/// One possibility is to assume that the original timestamp values are +/// relative to the epoch of the timezone being set; timestamp values should +/// then adjusted to the Unix epoch (for example, changing the timezone from +/// empty to "Europe/Paris" would require converting the timestamp values +/// from "Europe/Paris" to "UTC", which seems counter-intuitive but is +/// nevertheless correct). +/// +/// Guidelines for encoding data from external libraries +/// ---------------------------------------------------- +/// +/// Date & time libraries often have multiple different data types for temporal +/// data. In order to ease interoperability between different implementations the +/// Arrow project has some recommendations for encoding these types into a Timestamp +/// column. +/// +/// An "instant" represents a physical point in time that has no relevant timezone +/// (for example, astronomical data). To encode an instant, use a Timestamp with +/// the timezone string set to "UTC", and make sure the Timestamp values +/// are relative to the UTC epoch (January 1st 1970, midnight). +/// +/// A "zoned date-time" represents a physical point in time annotated with an +/// informative timezone (for example, the timezone in which the data was +/// recorded). To encode a zoned date-time, use a Timestamp with the timezone +/// string set to the name of the timezone, and make sure the Timestamp values +/// are relative to the UTC epoch (January 1st 1970, midnight). +/// +/// (There is some ambiguity between an instant and a zoned date-time with the +/// UTC timezone. Both of these are stored the same in Arrow. Typically, +/// this distinction does not matter. If it does, then an application should +/// use custom metadata or an extension type to distinguish between the two cases.) +/// +/// An "offset date-time" represents a physical point in time combined with an +/// explicit offset from UTC. To encode an offset date-time, use a Timestamp +/// with the timezone string set to the numeric timezone offset string +/// (e.g. "+03:00"), and make sure the Timestamp values are relative to +/// the UTC epoch (January 1st 1970, midnight). +/// +/// A "naive date-time" (also called "local date-time" in some libraries) +/// represents a wall clock time combined with a calendar date, but with +/// no indication of how to map this information to a physical point in time. +/// Naive date-times must be handled with care because of this missing +/// information, and also because daylight saving time (DST) may make +/// some values ambiguous or non-existent. A naive date-time may be +/// stored as a struct with Date and Time fields. However, it may also be +/// encoded into a Timestamp column with an empty timezone. The timestamp +/// values should be computed "as if" the timezone of the date-time values +/// was UTC; for example, the naive date-time "January 1st 1970, 00h00" would +/// be encoded as timestamp value 0. 
pub struct Timestamp<'a> { pub _tab: flatbuffers::Table<'a>, } @@ -2954,16 +3159,19 @@ pub struct Timestamp<'a> { impl<'a> flatbuffers::Follow<'a> for Timestamp<'a> { type Inner = Timestamp<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Timestamp<'a> { + pub const VT_UNIT: flatbuffers::VOffsetT = 4; + pub const VT_TIMEZONE: flatbuffers::VOffsetT = 6; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Timestamp { _tab: table } } #[allow(unused_mut)] @@ -2979,39 +3187,36 @@ impl<'a> Timestamp<'a> { builder.finish() } - pub const VT_UNIT: flatbuffers::VOffsetT = 4; - pub const VT_TIMEZONE: flatbuffers::VOffsetT = 6; - #[inline] pub fn unit(&self) -> TimeUnit { - self._tab - .get::(Timestamp::VT_UNIT, Some(TimeUnit::SECOND)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Timestamp::VT_UNIT, Some(TimeUnit::SECOND)) + .unwrap() + } } - /// The time zone is a string indicating the name of a time zone, one of: + /// The timezone is an optional string indicating the name of a timezone, + /// one of: /// - /// * As used in the Olson time zone database (the "tz database" or - /// "tzdata"), such as "America/New_York" - /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + /// * As used in the Olson timezone database (the "tz database" or + /// "tzdata"), such as "America/New_York". + /// * An absolute timezone offset of the form "+XX:XX" or "-XX:XX", + /// such as "+07:30". /// /// Whether a timezone string is present indicates different semantics about - /// the data: - /// - /// * If the time zone is null or equal to an empty string, the data is "time - /// zone naive" and shall be displayed *as is* to the user, not localized - /// to the locale of the user. This data can be though of as UTC but - /// without having "UTC" as the time zone, it is not considered to be - /// localized to any time zone - /// - /// * If the time zone is set to a valid value, values can be displayed as - /// "localized" to that time zone, even though the underlying 64-bit - /// integers are identical to the same data stored in UTC. Converting - /// between time zones is a metadata-only operation and does not change the - /// underlying values + /// the data (see above). #[inline] pub fn timezone(&self) -> Option<&'a str> { - self._tab - .get::>(Timestamp::VT_TIMEZONE, None) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>(Timestamp::VT_TIMEZONE, None) + } } } @@ -3023,9 +3228,9 @@ impl flatbuffers::Verifiable for Timestamp<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"unit", Self::VT_UNIT, false)? + .visit_field::("unit", Self::VT_UNIT, false)? .visit_field::>( - &"timezone", + "timezone", Self::VT_TIMEZONE, false, )? 
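
The expanded Timestamp documentation above distinguishes instants and zoned date-times (values relative to the UTC epoch, timezone string present) from naive date-times (no timezone, wall-clock readings only). A minimal sketch of what that distinction looks like at the arrow crate level, assuming the crate's DataType::Timestamp(TimeUnit, Option<String>) variant; the field names and the "Europe/Paris" zone are only illustrative:

    use arrow::datatypes::{DataType, Field, TimeUnit};

    fn main() {
        // Zoned date-time / instant: values count from the UTC epoch, and the
        // timezone only says how to display them, so re-zoning is metadata-only.
        let zoned = Field::new(
            "recorded_at",
            DataType::Timestamp(TimeUnit::Millisecond, Some("Europe/Paris".to_string())),
            false,
        );

        // Naive date-time: same 64-bit layout, but with no timezone the values are
        // wall-clock readings and cannot be reliably compared across columns.
        let naive = Field::new(
            "local_time",
            DataType::Timestamp(TimeUnit::Millisecond, None),
            false,
        );

        println!("{:?}\n{:?}", zoned, naive);
    }
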
@@ -3046,6 +3251,7 @@ impl<'a> Default for TimestampArgs<'a> { } } } + pub struct TimestampBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -3080,8 +3286,8 @@ impl<'a: 'b, 'b> TimestampBuilder<'a, 'b> { } } -impl std::fmt::Debug for Timestamp<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Timestamp<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Timestamp"); ds.field("unit", &self.unit()); ds.field("timezone", &self.timezone()); @@ -3098,16 +3304,18 @@ pub struct Interval<'a> { impl<'a> flatbuffers::Follow<'a> for Interval<'a> { type Inner = Interval<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Interval<'a> { + pub const VT_UNIT: flatbuffers::VOffsetT = 4; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Interval { _tab: table } } #[allow(unused_mut)] @@ -3120,13 +3328,16 @@ impl<'a> Interval<'a> { builder.finish() } - pub const VT_UNIT: flatbuffers::VOffsetT = 4; - #[inline] pub fn unit(&self) -> IntervalUnit { - self._tab - .get::(Interval::VT_UNIT, Some(IntervalUnit::YEAR_MONTH)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Interval::VT_UNIT, Some(IntervalUnit::YEAR_MONTH)) + .unwrap() + } } } @@ -3138,7 +3349,7 @@ impl flatbuffers::Verifiable for Interval<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"unit", Self::VT_UNIT, false)? + .visit_field::("unit", Self::VT_UNIT, false)? 
.finish(); Ok(()) } @@ -3154,6 +3365,7 @@ impl<'a> Default for IntervalArgs { } } } + pub struct IntervalBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -3184,8 +3396,8 @@ impl<'a: 'b, 'b> IntervalBuilder<'a, 'b> { } } -impl std::fmt::Debug for Interval<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Interval<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Interval"); ds.field("unit", &self.unit()); ds.finish() @@ -3201,16 +3413,18 @@ pub struct Duration<'a> { impl<'a> flatbuffers::Follow<'a> for Duration<'a> { type Inner = Duration<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Duration<'a> { + pub const VT_UNIT: flatbuffers::VOffsetT = 4; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Duration { _tab: table } } #[allow(unused_mut)] @@ -3223,13 +3437,16 @@ impl<'a> Duration<'a> { builder.finish() } - pub const VT_UNIT: flatbuffers::VOffsetT = 4; - #[inline] pub fn unit(&self) -> TimeUnit { - self._tab - .get::(Duration::VT_UNIT, Some(TimeUnit::MILLISECOND)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Duration::VT_UNIT, Some(TimeUnit::MILLISECOND)) + .unwrap() + } } } @@ -3241,7 +3458,7 @@ impl flatbuffers::Verifiable for Duration<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"unit", Self::VT_UNIT, false)? + .visit_field::("unit", Self::VT_UNIT, false)? 
.finish(); Ok(()) } @@ -3257,6 +3474,7 @@ impl<'a> Default for DurationArgs { } } } + pub struct DurationBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -3284,8 +3502,8 @@ impl<'a: 'b, 'b> DurationBuilder<'a, 'b> { } } -impl std::fmt::Debug for Duration<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Duration<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Duration"); ds.field("unit", &self.unit()); ds.finish() @@ -3304,16 +3522,19 @@ pub struct KeyValue<'a> { impl<'a> flatbuffers::Follow<'a> for KeyValue<'a> { type Inner = KeyValue<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> KeyValue<'a> { + pub const VT_KEY: flatbuffers::VOffsetT = 4; + pub const VT_VALUE: flatbuffers::VOffsetT = 6; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { KeyValue { _tab: table } } #[allow(unused_mut)] @@ -3331,18 +3552,25 @@ impl<'a> KeyValue<'a> { builder.finish() } - pub const VT_KEY: flatbuffers::VOffsetT = 4; - pub const VT_VALUE: flatbuffers::VOffsetT = 6; - #[inline] pub fn key(&self) -> Option<&'a str> { - self._tab - .get::>(KeyValue::VT_KEY, None) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>(KeyValue::VT_KEY, None) + } } #[inline] pub fn value(&self) -> Option<&'a str> { - self._tab - .get::>(KeyValue::VT_VALUE, None) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>(KeyValue::VT_VALUE, None) + } } } @@ -3355,12 +3583,12 @@ impl flatbuffers::Verifiable for KeyValue<'_> { use flatbuffers::Verifiable; v.visit_table(pos)? .visit_field::>( - &"key", + "key", Self::VT_KEY, false, )? .visit_field::>( - &"value", + "value", Self::VT_VALUE, false, )? 
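
KeyValue above is the generated table that carries user-defined metadata as plain string pairs on schemas and fields. A small sketch of how such pairs are typically attached at the arrow crate level, assuming Schema::new_with_metadata; the field name and metadata key are illustrative:

    use std::collections::HashMap;
    use arrow::datatypes::{DataType, Field, Schema};

    fn main() {
        let fields = vec![Field::new("id", DataType::Int64, false)];
        // These string pairs are what end up serialized as KeyValue tables
        // in the IPC schema message.
        let mut metadata = HashMap::new();
        metadata.insert("origin".to_string(), "example-service".to_string());
        let schema = Schema::new_with_metadata(fields, metadata);
        println!("{:?}", schema.metadata());
    }
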
@@ -3381,6 +3609,7 @@ impl<'a> Default for KeyValueArgs<'a> { } } } + pub struct KeyValueBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -3413,8 +3642,8 @@ impl<'a: 'b, 'b> KeyValueBuilder<'a, 'b> { } } -impl std::fmt::Debug for KeyValue<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for KeyValue<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("KeyValue"); ds.field("key", &self.key()); ds.field("value", &self.value()); @@ -3431,16 +3660,21 @@ pub struct DictionaryEncoding<'a> { impl<'a> flatbuffers::Follow<'a> for DictionaryEncoding<'a> { type Inner = DictionaryEncoding<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> DictionaryEncoding<'a> { + pub const VT_ID: flatbuffers::VOffsetT = 4; + pub const VT_INDEXTYPE: flatbuffers::VOffsetT = 6; + pub const VT_ISORDERED: flatbuffers::VOffsetT = 8; + pub const VT_DICTIONARYKIND: flatbuffers::VOffsetT = 10; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { DictionaryEncoding { _tab: table } } #[allow(unused_mut)] @@ -3458,19 +3692,19 @@ impl<'a> DictionaryEncoding<'a> { builder.finish() } - pub const VT_ID: flatbuffers::VOffsetT = 4; - pub const VT_INDEXTYPE: flatbuffers::VOffsetT = 6; - pub const VT_ISORDERED: flatbuffers::VOffsetT = 8; - pub const VT_DICTIONARYKIND: flatbuffers::VOffsetT = 10; - /// The known dictionary id in the application where this data is used. In /// the file or streaming formats, the dictionary ids are found in the /// DictionaryBatch messages #[inline] pub fn id(&self) -> i64 { - self._tab - .get::(DictionaryEncoding::VT_ID, Some(0)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(DictionaryEncoding::VT_ID, Some(0)) + .unwrap() + } } /// The dictionary indices are constrained to be non-negative integers. If /// this field is null, the indices must be signed int32. To maximize @@ -3479,10 +3713,15 @@ impl<'a> DictionaryEncoding<'a> { /// and to avoid uint64 indices unless they are required by an application. #[inline] pub fn indexType(&self) -> Option> { - self._tab.get::>( - DictionaryEncoding::VT_INDEXTYPE, - None, - ) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab.get::>( + DictionaryEncoding::VT_INDEXTYPE, + None, + ) + } } /// By default, dictionaries are not ordered, or the order does not have /// semantic meaning. 
In some statistical, applications, dictionary-encoding @@ -3490,18 +3729,28 @@ impl<'a> DictionaryEncoding<'a> { /// preserve that metadata here #[inline] pub fn isOrdered(&self) -> bool { - self._tab - .get::(DictionaryEncoding::VT_ISORDERED, Some(false)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(DictionaryEncoding::VT_ISORDERED, Some(false)) + .unwrap() + } } #[inline] pub fn dictionaryKind(&self) -> DictionaryKind { - self._tab - .get::( - DictionaryEncoding::VT_DICTIONARYKIND, - Some(DictionaryKind::DenseArray), - ) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::( + DictionaryEncoding::VT_DICTIONARYKIND, + Some(DictionaryKind::DenseArray), + ) + .unwrap() + } } } @@ -3513,15 +3762,15 @@ impl flatbuffers::Verifiable for DictionaryEncoding<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"id", Self::VT_ID, false)? + .visit_field::("id", Self::VT_ID, false)? .visit_field::>( - &"indexType", + "indexType", Self::VT_INDEXTYPE, false, )? - .visit_field::(&"isOrdered", Self::VT_ISORDERED, false)? + .visit_field::("isOrdered", Self::VT_ISORDERED, false)? .visit_field::( - &"dictionaryKind", + "dictionaryKind", Self::VT_DICTIONARYKIND, false, )? @@ -3546,6 +3795,7 @@ impl<'a> Default for DictionaryEncodingArgs<'a> { } } } + pub struct DictionaryEncodingBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -3592,8 +3842,8 @@ impl<'a: 'b, 'b> DictionaryEncodingBuilder<'a, 'b> { } } -impl std::fmt::Debug for DictionaryEncoding<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for DictionaryEncoding<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("DictionaryEncoding"); ds.field("id", &self.id()); ds.field("indexType", &self.indexType()); @@ -3615,16 +3865,24 @@ pub struct Field<'a> { impl<'a> flatbuffers::Follow<'a> for Field<'a> { type Inner = Field<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Field<'a> { + pub const VT_NAME: flatbuffers::VOffsetT = 4; + pub const VT_NULLABLE: flatbuffers::VOffsetT = 6; + pub const VT_TYPE_TYPE: flatbuffers::VOffsetT = 8; + pub const VT_TYPE_: flatbuffers::VOffsetT = 10; + pub const VT_DICTIONARY: flatbuffers::VOffsetT = 12; + pub const VT_CHILDREN: flatbuffers::VOffsetT = 14; + pub const VT_CUSTOM_METADATA: flatbuffers::VOffsetT = 16; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Field { _tab: table } } #[allow(unused_mut)] @@ -3653,50 +3911,67 @@ impl<'a> Field<'a> { builder.finish() } - pub const VT_NAME: flatbuffers::VOffsetT = 4; - pub const VT_NULLABLE: flatbuffers::VOffsetT = 6; - pub const VT_TYPE_TYPE: flatbuffers::VOffsetT = 8; - pub const VT_TYPE_: flatbuffers::VOffsetT = 10; - pub const VT_DICTIONARY: flatbuffers::VOffsetT = 12; - pub const VT_CHILDREN: flatbuffers::VOffsetT = 14; - pub const VT_CUSTOM_METADATA: flatbuffers::VOffsetT = 16; - /// Name is not required, in i.e. 
a List #[inline] pub fn name(&self) -> Option<&'a str> { - self._tab - .get::>(Field::VT_NAME, None) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>(Field::VT_NAME, None) + } } /// Whether or not this field can contain nulls. Should be true in general. #[inline] pub fn nullable(&self) -> bool { - self._tab - .get::(Field::VT_NULLABLE, Some(false)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Field::VT_NULLABLE, Some(false)) + .unwrap() + } } #[inline] pub fn type_type(&self) -> Type { - self._tab - .get::(Field::VT_TYPE_TYPE, Some(Type::NONE)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Field::VT_TYPE_TYPE, Some(Type::NONE)) + .unwrap() + } } /// This is the type of the decoded value if the field is dictionary encoded. #[inline] pub fn type_(&self) -> Option> { - self._tab - .get::>>( - Field::VT_TYPE_, - None, - ) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + Field::VT_TYPE_, + None, + ) + } } /// Present only if the field is dictionary encoded. #[inline] pub fn dictionary(&self) -> Option> { - self._tab - .get::>( - Field::VT_DICTIONARY, - None, - ) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>( + Field::VT_DICTIONARY, + None, + ) + } } /// children apply only to nested data types like Struct, List and Union. For /// primitive types children will have length 0. @@ -3704,24 +3979,39 @@ impl<'a> Field<'a> { pub fn children( &self, ) -> Option>>> { - self._tab.get::>, - >>(Field::VT_CHILDREN, None) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab.get::>, + >>(Field::VT_CHILDREN, None) + } } /// User-defined metadata #[inline] pub fn custom_metadata( &self, ) -> Option>>> { - self._tab.get::>, - >>(Field::VT_CUSTOM_METADATA, None) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab.get::>, + >>(Field::VT_CUSTOM_METADATA, None) + } } #[inline] #[allow(non_snake_case)] pub fn type_as_null(&self) -> Option> { if self.type_type() == Type::Null { - self.type_().map(Null::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Null::init_from_table(t) } + }) } else { None } @@ -3731,7 +4021,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_int(&self) -> Option> { if self.type_type() == Type::Int { - self.type_().map(Int::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Int::init_from_table(t) } + }) } else { None } @@ -3741,7 +4036,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_floating_point(&self) -> Option> { if self.type_type() == Type::FloatingPoint { - self.type_().map(FloatingPoint::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { FloatingPoint::init_from_table(t) } + }) } else { None } @@ 
-3751,7 +4051,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_binary(&self) -> Option> { if self.type_type() == Type::Binary { - self.type_().map(Binary::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Binary::init_from_table(t) } + }) } else { None } @@ -3761,7 +4066,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_utf_8(&self) -> Option> { if self.type_type() == Type::Utf8 { - self.type_().map(Utf8::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Utf8::init_from_table(t) } + }) } else { None } @@ -3771,7 +4081,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_bool(&self) -> Option> { if self.type_type() == Type::Bool { - self.type_().map(Bool::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Bool::init_from_table(t) } + }) } else { None } @@ -3781,7 +4096,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_decimal(&self) -> Option> { if self.type_type() == Type::Decimal { - self.type_().map(Decimal::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Decimal::init_from_table(t) } + }) } else { None } @@ -3791,7 +4111,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_date(&self) -> Option> { if self.type_type() == Type::Date { - self.type_().map(Date::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Date::init_from_table(t) } + }) } else { None } @@ -3801,7 +4126,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_time(&self) -> Option> { if self.type_type() == Type::Time { - self.type_().map(Time::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Time::init_from_table(t) } + }) } else { None } @@ -3811,7 +4141,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_timestamp(&self) -> Option> { if self.type_type() == Type::Timestamp { - self.type_().map(Timestamp::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Timestamp::init_from_table(t) } + }) } else { None } @@ -3821,7 +4156,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_interval(&self) -> Option> { if self.type_type() == Type::Interval { - self.type_().map(Interval::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Interval::init_from_table(t) } + }) } else { None } @@ -3831,7 +4171,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_list(&self) -> Option> { if self.type_type() == Type::List { - self.type_().map(List::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { List::init_from_table(t) } + }) } else { None } @@ -3841,7 +4186,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn 
type_as_struct_(&self) -> Option> { if self.type_type() == Type::Struct_ { - self.type_().map(Struct_::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Struct_::init_from_table(t) } + }) } else { None } @@ -3851,7 +4201,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_union(&self) -> Option> { if self.type_type() == Type::Union { - self.type_().map(Union::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Union::init_from_table(t) } + }) } else { None } @@ -3861,7 +4216,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_fixed_size_binary(&self) -> Option> { if self.type_type() == Type::FixedSizeBinary { - self.type_().map(FixedSizeBinary::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { FixedSizeBinary::init_from_table(t) } + }) } else { None } @@ -3871,7 +4231,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_fixed_size_list(&self) -> Option> { if self.type_type() == Type::FixedSizeList { - self.type_().map(FixedSizeList::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { FixedSizeList::init_from_table(t) } + }) } else { None } @@ -3881,7 +4246,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_map(&self) -> Option> { if self.type_type() == Type::Map { - self.type_().map(Map::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Map::init_from_table(t) } + }) } else { None } @@ -3891,7 +4261,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_duration(&self) -> Option> { if self.type_type() == Type::Duration { - self.type_().map(Duration::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { Duration::init_from_table(t) } + }) } else { None } @@ -3901,7 +4276,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_large_binary(&self) -> Option> { if self.type_type() == Type::LargeBinary { - self.type_().map(LargeBinary::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { LargeBinary::init_from_table(t) } + }) } else { None } @@ -3911,7 +4291,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_large_utf_8(&self) -> Option> { if self.type_type() == Type::LargeUtf8 { - self.type_().map(LargeUtf8::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { LargeUtf8::init_from_table(t) } + }) } else { None } @@ -3921,7 +4306,12 @@ impl<'a> Field<'a> { #[allow(non_snake_case)] pub fn type_as_large_list(&self) -> Option> { if self.type_type() == Type::LargeList { - self.type_().map(LargeList::init_from_table) + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { LargeList::init_from_table(t) } + }) } else { None } @@ -3936,9 +4326,9 @@ impl 
flatbuffers::Verifiable for Field<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::>(&"name", Self::VT_NAME, false)? - .visit_field::(&"nullable", Self::VT_NULLABLE, false)? - .visit_union::(&"type_type", Self::VT_TYPE_TYPE, &"type_", Self::VT_TYPE_, false, |key, v, pos| { + .visit_field::>("name", Self::VT_NAME, false)? + .visit_field::("nullable", Self::VT_NULLABLE, false)? + .visit_union::("type_type", Self::VT_TYPE_TYPE, "type_", Self::VT_TYPE_, false, |key, v, pos| { match key { Type::Null => v.verify_union_variant::>("Type::Null", pos), Type::Int => v.verify_union_variant::>("Type::Int", pos), @@ -3964,9 +4354,9 @@ impl flatbuffers::Verifiable for Field<'_> { _ => Ok(()), } })? - .visit_field::>(&"dictionary", Self::VT_DICTIONARY, false)? - .visit_field::>>>(&"children", Self::VT_CHILDREN, false)? - .visit_field::>>>(&"custom_metadata", Self::VT_CUSTOM_METADATA, false)? + .visit_field::>("dictionary", Self::VT_DICTIONARY, false)? + .visit_field::>>>("children", Self::VT_CHILDREN, false)? + .visit_field::>>>("custom_metadata", Self::VT_CUSTOM_METADATA, false)? .finish(); Ok(()) } @@ -4002,6 +4392,7 @@ impl<'a> Default for FieldArgs<'a> { } } } + pub struct FieldBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -4078,8 +4469,8 @@ impl<'a: 'b, 'b> FieldBuilder<'a, 'b> { } } -impl std::fmt::Debug for Field<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Field<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Field"); ds.field("name", &self.name()); ds.field("nullable", &self.nullable()); @@ -4318,16 +4709,21 @@ pub struct Schema<'a> { impl<'a> flatbuffers::Follow<'a> for Schema<'a> { type Inner = Schema<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Schema<'a> { + pub const VT_ENDIANNESS: flatbuffers::VOffsetT = 4; + pub const VT_FIELDS: flatbuffers::VOffsetT = 6; + pub const VT_CUSTOM_METADATA: flatbuffers::VOffsetT = 8; + pub const VT_FEATURES: flatbuffers::VOffsetT = 10; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Schema { _tab: table } } #[allow(unused_mut)] @@ -4349,44 +4745,59 @@ impl<'a> Schema<'a> { builder.finish() } - pub const VT_ENDIANNESS: flatbuffers::VOffsetT = 4; - pub const VT_FIELDS: flatbuffers::VOffsetT = 6; - pub const VT_CUSTOM_METADATA: flatbuffers::VOffsetT = 8; - pub const VT_FEATURES: flatbuffers::VOffsetT = 10; - /// endianness of the buffer /// it is Little Endian by default /// if endianness doesn't match the underlying system then the vectors need to be converted #[inline] pub fn endianness(&self) -> Endianness { - self._tab - .get::(Schema::VT_ENDIANNESS, Some(Endianness::Little)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Schema::VT_ENDIANNESS, Some(Endianness::Little)) + .unwrap() + } } #[inline] pub fn fields( &self, ) -> Option>>> { - self._tab.get::>, - >>(Schema::VT_FIELDS, None) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab.get::>, 
+ >>(Schema::VT_FIELDS, None) + } } #[inline] pub fn custom_metadata( &self, ) -> Option>>> { - self._tab.get::>, - >>(Schema::VT_CUSTOM_METADATA, None) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab.get::>, + >>(Schema::VT_CUSTOM_METADATA, None) + } } /// Features used in the stream/file. #[inline] pub fn features(&self) -> Option> { - self._tab - .get::>>( - Schema::VT_FEATURES, - None, - ) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + Schema::VT_FEATURES, + None, + ) + } } } @@ -4398,10 +4809,10 @@ impl flatbuffers::Verifiable for Schema<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"endianness", Self::VT_ENDIANNESS, false)? - .visit_field::>>>(&"fields", Self::VT_FIELDS, false)? - .visit_field::>>>(&"custom_metadata", Self::VT_CUSTOM_METADATA, false)? - .visit_field::>>(&"features", Self::VT_FEATURES, false)? + .visit_field::("endianness", Self::VT_ENDIANNESS, false)? + .visit_field::>>>("fields", Self::VT_FIELDS, false)? + .visit_field::>>>("custom_metadata", Self::VT_CUSTOM_METADATA, false)? + .visit_field::>>("features", Self::VT_FEATURES, false)? .finish(); Ok(()) } @@ -4431,6 +4842,7 @@ impl<'a> Default for SchemaArgs<'a> { } } } + pub struct SchemaBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -4491,8 +4903,8 @@ impl<'a: 'b, 'b> SchemaBuilder<'a, 'b> { } } -impl std::fmt::Debug for Schema<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Schema<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Schema"); ds.field("endianness", &self.endianness()); ds.field("fields", &self.fields()); @@ -4501,18 +4913,6 @@ impl std::fmt::Debug for Schema<'_> { ds.finish() } } -#[inline] -#[deprecated(since = "2.0.0", note = "Deprecated in favor of `root_as...` methods.")] -pub fn get_root_as_schema<'a>(buf: &'a [u8]) -> Schema<'a> { - unsafe { flatbuffers::root_unchecked::>(buf) } -} - -#[inline] -#[deprecated(since = "2.0.0", note = "Deprecated in favor of `root_as...` methods.")] -pub fn get_size_prefixed_root_as_schema<'a>(buf: &'a [u8]) -> Schema<'a> { - unsafe { flatbuffers::size_prefixed_root_unchecked::>(buf) } -} - #[inline] /// Verifies that a buffer of bytes contains a `Schema` /// and returns it. 
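The regenerated accessors above all follow one pattern: the public method stays safe, the raw table read moves into an `unsafe` block, and a `// Safety:` comment records the invariant that justifies it (the `Table` was created for this object, so the slot holds a valid value). As a rough standalone illustration of that pattern only — not part of the patch, and using a made-up `RawTable` in place of the real `flatbuffers::Table` API — a sketch could look like this:

// A stand-in for a flatbuffer table: a byte buffer plus a base offset.
struct RawTable<'a> {
    buf: &'a [u8],
    loc: usize,
}

impl<'a> RawTable<'a> {
    // Unsafe primitive: the caller must guarantee `loc + off + 4 <= buf.len()`.
    unsafe fn get_u32(&self, off: usize) -> u32 {
        let p = self.buf.as_ptr().add(self.loc + off);
        u32::from_le(core::ptr::read_unaligned(p as *const u32))
    }
}

struct FieldLike<'a> {
    _tab: RawTable<'a>,
}

impl<'a> FieldLike<'a> {
    // Safe constructor: this bounds check is the invariant the accessors rely on.
    fn new(buf: &'a [u8], loc: usize) -> Option<Self> {
        (loc + 4 <= buf.len()).then(|| FieldLike {
            _tab: RawTable { buf, loc },
        })
    }

    // Safe accessor wrapping the unsafe read, mirroring the generated style.
    fn id(&self) -> u32 {
        // Safety:
        // `new` verified that 4 bytes starting at `loc` are in bounds.
        unsafe { self._tab.get_u32(0) }
    }
}

fn main() {
    let bytes = [1u8, 0, 0, 0];
    let field = FieldLike::new(&bytes, 0).unwrap();
    assert_eq!(field.id(), 1);
}

The soundness argument lives at the constructor, which is why every generated accessor can repeat the same short Safety note instead of re-checking bounds on each read.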
diff --git a/arrow/src/ipc/gen/SparseTensor.rs b/arrow/src/ipc/gen/SparseTensor.rs index 986d21cda84f..317831c59ef0 100644 --- a/arrow/src/ipc/gen/SparseTensor.rs +++ b/arrow/src/ipc/gen/SparseTensor.rs @@ -44,7 +44,7 @@ pub const ENUM_VALUES_SPARSE_MATRIX_COMPRESSED_AXIS: [SparseMatrixCompressedAxis SparseMatrixCompressedAxis::Column, ]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct SparseMatrixCompressedAxis(pub i16); #[allow(non_upper_case_globals)] @@ -64,8 +64,8 @@ impl SparseMatrixCompressedAxis { } } } -impl std::fmt::Debug for SparseMatrixCompressedAxis { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for SparseMatrixCompressedAxis { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -76,8 +76,8 @@ impl std::fmt::Debug for SparseMatrixCompressedAxis { impl<'a> flatbuffers::Follow<'a> for SparseMatrixCompressedAxis { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -85,20 +85,21 @@ impl<'a> flatbuffers::Follow<'a> for SparseMatrixCompressedAxis { impl flatbuffers::Push for SparseMatrixCompressedAxis { type Output = SparseMatrixCompressedAxis; #[inline] - fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for SparseMatrixCompressedAxis { + type Scalar = i16; #[inline] - fn to_little_endian(self) -> Self { - let b = i16::to_le(self.0); - Self(b) + fn to_little_endian(self) -> i16 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = i16::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: i16) -> Self { + let b = i16::from_le(v); Self(b) } } @@ -137,7 +138,7 @@ pub const ENUM_VALUES_SPARSE_TENSOR_INDEX: [SparseTensorIndex; 4] = [ SparseTensorIndex::SparseTensorIndexCSF, ]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct SparseTensorIndex(pub u8); #[allow(non_upper_case_globals)] @@ -166,8 +167,8 @@ impl SparseTensorIndex { } } } -impl std::fmt::Debug for SparseTensorIndex { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl core::fmt::Debug for SparseTensorIndex { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { if let Some(name) = self.variant_name() { f.write_str(name) } else { @@ -175,12 +176,11 @@ impl std::fmt::Debug for SparseTensorIndex { } } } -pub struct SparseTensorIndexUnionTableOffset {} impl<'a> flatbuffers::Follow<'a> for SparseTensorIndex { type Inner = Self; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { - let b = unsafe { flatbuffers::read_scalar_at::(buf, loc) }; + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + let b = flatbuffers::read_scalar_at::(buf, loc); Self(b) } } @@ -188,20 +188,21 @@ impl<'a> flatbuffers::Follow<'a> for SparseTensorIndex { impl flatbuffers::Push for SparseTensorIndex { type Output = SparseTensorIndex; #[inline] - fn push(&self, 
dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { + flatbuffers::emplace_scalar::(dst, self.0); } } impl flatbuffers::EndianScalar for SparseTensorIndex { + type Scalar = u8; #[inline] - fn to_little_endian(self) -> Self { - let b = u8::to_le(self.0); - Self(b) + fn to_little_endian(self) -> u8 { + self.0.to_le() } #[inline] - fn from_little_endian(self) -> Self { - let b = u8::from_le(self.0); + #[allow(clippy::wrong_self_convention)] + fn from_little_endian(v: u8) -> Self { + let b = u8::from_le(v); Self(b) } } @@ -218,6 +219,8 @@ impl<'a> flatbuffers::Verifiable for SparseTensorIndex { } impl flatbuffers::SimpleToVerifyInSlice for SparseTensorIndex {} +pub struct SparseTensorIndexUnionTableOffset {} + pub enum SparseTensorIndexCOOOffset {} #[derive(Copy, Clone, PartialEq)] @@ -260,16 +263,21 @@ pub struct SparseTensorIndexCOO<'a> { impl<'a> flatbuffers::Follow<'a> for SparseTensorIndexCOO<'a> { type Inner = SparseTensorIndexCOO<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> SparseTensorIndexCOO<'a> { + pub const VT_INDICESTYPE: flatbuffers::VOffsetT = 4; + pub const VT_INDICESSTRIDES: flatbuffers::VOffsetT = 6; + pub const VT_INDICESBUFFER: flatbuffers::VOffsetT = 8; + pub const VT_ISCANONICAL: flatbuffers::VOffsetT = 10; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { SparseTensorIndexCOO { _tab: table } } #[allow(unused_mut)] @@ -291,37 +299,47 @@ impl<'a> SparseTensorIndexCOO<'a> { builder.finish() } - pub const VT_INDICESTYPE: flatbuffers::VOffsetT = 4; - pub const VT_INDICESSTRIDES: flatbuffers::VOffsetT = 6; - pub const VT_INDICESBUFFER: flatbuffers::VOffsetT = 8; - pub const VT_ISCANONICAL: flatbuffers::VOffsetT = 10; - /// The type of values in indicesBuffer #[inline] pub fn indicesType(&self) -> Int<'a> { - self._tab - .get::>( - SparseTensorIndexCOO::VT_INDICESTYPE, - None, - ) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>( + SparseTensorIndexCOO::VT_INDICESTYPE, + None, + ) + .unwrap() + } } /// Non-negative byte offsets to advance one value cell along each dimension /// If omitted, default to row-major order (C-like). #[inline] pub fn indicesStrides(&self) -> Option> { - self._tab - .get::>>( - SparseTensorIndexCOO::VT_INDICESSTRIDES, - None, - ) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + SparseTensorIndexCOO::VT_INDICESSTRIDES, + None, + ) + } } /// The location and size of the indices matrix's data #[inline] pub fn indicesBuffer(&self) -> &'a Buffer { - self._tab - .get::(SparseTensorIndexCOO::VT_INDICESBUFFER, None) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(SparseTensorIndexCOO::VT_INDICESBUFFER, None) + .unwrap() + } } /// This flag is true if and only if the indices matrix is sorted in /// row-major order, and does not have duplicated entries. @@ -330,9 +348,14 @@ impl<'a> SparseTensorIndexCOO<'a> { /// (SciPy employs column-major order for its coo_matrix). 
#[inline] pub fn isCanonical(&self) -> bool { - self._tab - .get::(SparseTensorIndexCOO::VT_ISCANONICAL, Some(false)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(SparseTensorIndexCOO::VT_ISCANONICAL, Some(false)) + .unwrap() + } } } @@ -345,17 +368,17 @@ impl flatbuffers::Verifiable for SparseTensorIndexCOO<'_> { use flatbuffers::Verifiable; v.visit_table(pos)? .visit_field::>( - &"indicesType", + "indicesType", Self::VT_INDICESTYPE, true, )? .visit_field::>>( - &"indicesStrides", + "indicesStrides", Self::VT_INDICESSTRIDES, false, )? - .visit_field::(&"indicesBuffer", Self::VT_INDICESBUFFER, true)? - .visit_field::(&"isCanonical", Self::VT_ISCANONICAL, false)? + .visit_field::("indicesBuffer", Self::VT_INDICESBUFFER, true)? + .visit_field::("isCanonical", Self::VT_ISCANONICAL, false)? .finish(); Ok(()) } @@ -377,6 +400,7 @@ impl<'a> Default for SparseTensorIndexCOOArgs<'a> { } } } + pub struct SparseTensorIndexCOOBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -435,8 +459,8 @@ impl<'a: 'b, 'b> SparseTensorIndexCOOBuilder<'a, 'b> { } } -impl std::fmt::Debug for SparseTensorIndexCOO<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for SparseTensorIndexCOO<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("SparseTensorIndexCOO"); ds.field("indicesType", &self.indicesType()); ds.field("indicesStrides", &self.indicesStrides()); @@ -456,16 +480,22 @@ pub struct SparseMatrixIndexCSX<'a> { impl<'a> flatbuffers::Follow<'a> for SparseMatrixIndexCSX<'a> { type Inner = SparseMatrixIndexCSX<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> SparseMatrixIndexCSX<'a> { + pub const VT_COMPRESSEDAXIS: flatbuffers::VOffsetT = 4; + pub const VT_INDPTRTYPE: flatbuffers::VOffsetT = 6; + pub const VT_INDPTRBUFFER: flatbuffers::VOffsetT = 8; + pub const VT_INDICESTYPE: flatbuffers::VOffsetT = 10; + pub const VT_INDICESBUFFER: flatbuffers::VOffsetT = 12; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { SparseMatrixIndexCSX { _tab: table } } #[allow(unused_mut)] @@ -490,31 +520,35 @@ impl<'a> SparseMatrixIndexCSX<'a> { builder.finish() } - pub const VT_COMPRESSEDAXIS: flatbuffers::VOffsetT = 4; - pub const VT_INDPTRTYPE: flatbuffers::VOffsetT = 6; - pub const VT_INDPTRBUFFER: flatbuffers::VOffsetT = 8; - pub const VT_INDICESTYPE: flatbuffers::VOffsetT = 10; - pub const VT_INDICESBUFFER: flatbuffers::VOffsetT = 12; - /// Which axis, row or column, is compressed #[inline] pub fn compressedAxis(&self) -> SparseMatrixCompressedAxis { - self._tab - .get::( - SparseMatrixIndexCSX::VT_COMPRESSEDAXIS, - Some(SparseMatrixCompressedAxis::Row), - ) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::( + SparseMatrixIndexCSX::VT_COMPRESSEDAXIS, + Some(SparseMatrixCompressedAxis::Row), + ) + .unwrap() + } } /// The type of values in indptrBuffer #[inline] pub fn indptrType(&self) -> Int<'a> { - self._tab - .get::>( - SparseMatrixIndexCSX::VT_INDPTRTYPE, - None, - ) - .unwrap() 
+ // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>( + SparseMatrixIndexCSX::VT_INDPTRTYPE, + None, + ) + .unwrap() + } } /// indptrBuffer stores the location and size of indptr array that /// represents the range of the rows. @@ -541,19 +575,29 @@ impl<'a> SparseMatrixIndexCSX<'a> { /// ``` #[inline] pub fn indptrBuffer(&self) -> &'a Buffer { - self._tab - .get::(SparseMatrixIndexCSX::VT_INDPTRBUFFER, None) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(SparseMatrixIndexCSX::VT_INDPTRBUFFER, None) + .unwrap() + } } /// The type of values in indicesBuffer #[inline] pub fn indicesType(&self) -> Int<'a> { - self._tab - .get::>( - SparseMatrixIndexCSX::VT_INDICESTYPE, - None, - ) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>( + SparseMatrixIndexCSX::VT_INDICESTYPE, + None, + ) + .unwrap() + } } /// indicesBuffer stores the location and size of the array that /// contains the column indices of the corresponding non-zero values. @@ -566,9 +610,14 @@ impl<'a> SparseMatrixIndexCSX<'a> { /// Note that the indices are sorted in lexicographical order for each row. #[inline] pub fn indicesBuffer(&self) -> &'a Buffer { - self._tab - .get::(SparseMatrixIndexCSX::VT_INDICESBUFFER, None) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(SparseMatrixIndexCSX::VT_INDICESBUFFER, None) + .unwrap() + } } } @@ -581,22 +630,22 @@ impl flatbuffers::Verifiable for SparseMatrixIndexCSX<'_> { use flatbuffers::Verifiable; v.visit_table(pos)? .visit_field::( - &"compressedAxis", + "compressedAxis", Self::VT_COMPRESSEDAXIS, false, )? .visit_field::>( - &"indptrType", + "indptrType", Self::VT_INDPTRTYPE, true, )? - .visit_field::(&"indptrBuffer", Self::VT_INDPTRBUFFER, true)? + .visit_field::("indptrBuffer", Self::VT_INDPTRBUFFER, true)? .visit_field::>( - &"indicesType", + "indicesType", Self::VT_INDICESTYPE, true, )? - .visit_field::(&"indicesBuffer", Self::VT_INDICESBUFFER, true)? + .visit_field::("indicesBuffer", Self::VT_INDICESBUFFER, true)? 
.finish(); Ok(()) } @@ -620,6 +669,7 @@ impl<'a> Default for SparseMatrixIndexCSXArgs<'a> { } } } + pub struct SparseMatrixIndexCSXBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -686,8 +736,8 @@ impl<'a: 'b, 'b> SparseMatrixIndexCSXBuilder<'a, 'b> { } } -impl std::fmt::Debug for SparseMatrixIndexCSX<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for SparseMatrixIndexCSX<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("SparseMatrixIndexCSX"); ds.field("compressedAxis", &self.compressedAxis()); ds.field("indptrType", &self.indptrType()); @@ -708,16 +758,22 @@ pub struct SparseTensorIndexCSF<'a> { impl<'a> flatbuffers::Follow<'a> for SparseTensorIndexCSF<'a> { type Inner = SparseTensorIndexCSF<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> SparseTensorIndexCSF<'a> { + pub const VT_INDPTRTYPE: flatbuffers::VOffsetT = 4; + pub const VT_INDPTRBUFFERS: flatbuffers::VOffsetT = 6; + pub const VT_INDICESTYPE: flatbuffers::VOffsetT = 8; + pub const VT_INDICESBUFFERS: flatbuffers::VOffsetT = 10; + pub const VT_AXISORDER: flatbuffers::VOffsetT = 12; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { SparseTensorIndexCSF { _tab: table } } #[allow(unused_mut)] @@ -744,12 +800,6 @@ impl<'a> SparseTensorIndexCSF<'a> { builder.finish() } - pub const VT_INDPTRTYPE: flatbuffers::VOffsetT = 4; - pub const VT_INDPTRBUFFERS: flatbuffers::VOffsetT = 6; - pub const VT_INDICESTYPE: flatbuffers::VOffsetT = 8; - pub const VT_INDICESBUFFERS: flatbuffers::VOffsetT = 10; - pub const VT_AXISORDER: flatbuffers::VOffsetT = 12; - /// CSF is a generalization of compressed sparse row (CSR) index. /// See [smith2017knl](http://shaden.io/pub-files/smith2017knl.pdf) /// @@ -783,12 +833,17 @@ impl<'a> SparseTensorIndexCSF<'a> { /// The type of values in indptrBuffers #[inline] pub fn indptrType(&self) -> Int<'a> { - self._tab - .get::>( - SparseTensorIndexCSF::VT_INDPTRTYPE, - None, - ) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>( + SparseTensorIndexCSF::VT_INDPTRTYPE, + None, + ) + .unwrap() + } } /// indptrBuffers stores the sparsity structure. /// Each two consecutive dimensions in a tensor correspond to a buffer in @@ -805,24 +860,33 @@ impl<'a> SparseTensorIndexCSF<'a> { /// ]. 
/// ``` #[inline] - pub fn indptrBuffers(&self) -> &'a [Buffer] { - self._tab - .get::>>( - SparseTensorIndexCSF::VT_INDPTRBUFFERS, - None, - ) - .map(|v| v.safe_slice()) - .unwrap() + pub fn indptrBuffers(&self) -> flatbuffers::Vector<'a, Buffer> { + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + SparseTensorIndexCSF::VT_INDPTRBUFFERS, + None, + ) + .unwrap() + } } /// The type of values in indicesBuffers #[inline] pub fn indicesType(&self) -> Int<'a> { - self._tab - .get::>( - SparseTensorIndexCSF::VT_INDICESTYPE, - None, - ) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>( + SparseTensorIndexCSF::VT_INDICESTYPE, + None, + ) + .unwrap() + } } /// indicesBuffers stores values of nodes. /// Each tensor dimension corresponds to a buffer in indicesBuffers. @@ -836,14 +900,18 @@ impl<'a> SparseTensorIndexCSF<'a> { /// ]. /// ``` #[inline] - pub fn indicesBuffers(&self) -> &'a [Buffer] { - self._tab - .get::>>( - SparseTensorIndexCSF::VT_INDICESBUFFERS, - None, - ) - .map(|v| v.safe_slice()) - .unwrap() + pub fn indicesBuffers(&self) -> flatbuffers::Vector<'a, Buffer> { + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + SparseTensorIndexCSF::VT_INDICESBUFFERS, + None, + ) + .unwrap() + } } /// axisOrder stores the sequence in which dimensions were traversed to /// produce the prefix tree. @@ -853,12 +921,17 @@ impl<'a> SparseTensorIndexCSF<'a> { /// ``` #[inline] pub fn axisOrder(&self) -> flatbuffers::Vector<'a, i32> { - self._tab - .get::>>( - SparseTensorIndexCSF::VT_AXISORDER, - None, - ) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + SparseTensorIndexCSF::VT_AXISORDER, + None, + ) + .unwrap() + } } } @@ -871,27 +944,27 @@ impl flatbuffers::Verifiable for SparseTensorIndexCSF<'_> { use flatbuffers::Verifiable; v.visit_table(pos)? .visit_field::>( - &"indptrType", + "indptrType", Self::VT_INDPTRTYPE, true, )? .visit_field::>>( - &"indptrBuffers", + "indptrBuffers", Self::VT_INDPTRBUFFERS, true, )? .visit_field::>( - &"indicesType", + "indicesType", Self::VT_INDICESTYPE, true, )? .visit_field::>>( - &"indicesBuffers", + "indicesBuffers", Self::VT_INDICESBUFFERS, true, )? .visit_field::>>( - &"axisOrder", + "axisOrder", Self::VT_AXISORDER, true, )? 
@@ -918,6 +991,7 @@ impl<'a> Default for SparseTensorIndexCSFArgs<'a> { } } } + pub struct SparseTensorIndexCSFBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -994,8 +1068,8 @@ impl<'a: 'b, 'b> SparseTensorIndexCSFBuilder<'a, 'b> { } } -impl std::fmt::Debug for SparseTensorIndexCSF<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for SparseTensorIndexCSF<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("SparseTensorIndexCSF"); ds.field("indptrType", &self.indptrType()); ds.field("indptrBuffers", &self.indptrBuffers()); @@ -1015,16 +1089,24 @@ pub struct SparseTensor<'a> { impl<'a> flatbuffers::Follow<'a> for SparseTensor<'a> { type Inner = SparseTensor<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> SparseTensor<'a> { + pub const VT_TYPE_TYPE: flatbuffers::VOffsetT = 4; + pub const VT_TYPE_: flatbuffers::VOffsetT = 6; + pub const VT_SHAPE: flatbuffers::VOffsetT = 8; + pub const VT_NON_ZERO_LENGTH: flatbuffers::VOffsetT = 10; + pub const VT_SPARSEINDEX_TYPE: flatbuffers::VOffsetT = 12; + pub const VT_SPARSEINDEX: flatbuffers::VOffsetT = 14; + pub const VT_DATA: flatbuffers::VOffsetT = 16; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { SparseTensor { _tab: table } } #[allow(unused_mut)] @@ -1051,82 +1133,112 @@ impl<'a> SparseTensor<'a> { builder.finish() } - pub const VT_TYPE_TYPE: flatbuffers::VOffsetT = 4; - pub const VT_TYPE_: flatbuffers::VOffsetT = 6; - pub const VT_SHAPE: flatbuffers::VOffsetT = 8; - pub const VT_NON_ZERO_LENGTH: flatbuffers::VOffsetT = 10; - pub const VT_SPARSEINDEX_TYPE: flatbuffers::VOffsetT = 12; - pub const VT_SPARSEINDEX: flatbuffers::VOffsetT = 14; - pub const VT_DATA: flatbuffers::VOffsetT = 16; - #[inline] pub fn type_type(&self) -> Type { - self._tab - .get::(SparseTensor::VT_TYPE_TYPE, Some(Type::NONE)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(SparseTensor::VT_TYPE_TYPE, Some(Type::NONE)) + .unwrap() + } } /// The type of data contained in a value cell. /// Currently only fixed-width value types are supported, /// no strings or nested types. #[inline] pub fn type_(&self) -> flatbuffers::Table<'a> { - self._tab - .get::>>( - SparseTensor::VT_TYPE_, - None, - ) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + SparseTensor::VT_TYPE_, + None, + ) + .unwrap() + } } /// The dimensions of the tensor, optionally named. #[inline] pub fn shape( &self, ) -> flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>> { - self._tab - .get::>, - >>(SparseTensor::VT_SHAPE, None) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>, + >>(SparseTensor::VT_SHAPE, None) + .unwrap() + } } /// The number of non-zero values in a sparse tensor. 
#[inline] pub fn non_zero_length(&self) -> i64 { - self._tab - .get::(SparseTensor::VT_NON_ZERO_LENGTH, Some(0)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(SparseTensor::VT_NON_ZERO_LENGTH, Some(0)) + .unwrap() + } } #[inline] pub fn sparseIndex_type(&self) -> SparseTensorIndex { - self._tab - .get::( - SparseTensor::VT_SPARSEINDEX_TYPE, - Some(SparseTensorIndex::NONE), - ) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::( + SparseTensor::VT_SPARSEINDEX_TYPE, + Some(SparseTensorIndex::NONE), + ) + .unwrap() + } } /// Sparse tensor index #[inline] pub fn sparseIndex(&self) -> flatbuffers::Table<'a> { - self._tab - .get::>>( - SparseTensor::VT_SPARSEINDEX, - None, - ) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + SparseTensor::VT_SPARSEINDEX, + None, + ) + .unwrap() + } } /// The location and size of the tensor's data #[inline] pub fn data(&self) -> &'a Buffer { - self._tab - .get::(SparseTensor::VT_DATA, None) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(SparseTensor::VT_DATA, None) + .unwrap() + } } #[inline] #[allow(non_snake_case)] pub fn type_as_null(&self) -> Option> { if self.type_type() == Type::Null { let u = self.type_(); - Some(Null::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Null::init_from_table(u) }) } else { None } @@ -1137,7 +1249,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_int(&self) -> Option> { if self.type_type() == Type::Int { let u = self.type_(); - Some(Int::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Int::init_from_table(u) }) } else { None } @@ -1148,7 +1263,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_floating_point(&self) -> Option> { if self.type_type() == Type::FloatingPoint { let u = self.type_(); - Some(FloatingPoint::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { FloatingPoint::init_from_table(u) }) } else { None } @@ -1159,7 +1277,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_binary(&self) -> Option> { if self.type_type() == Type::Binary { let u = self.type_(); - Some(Binary::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Binary::init_from_table(u) }) } else { None } @@ -1170,7 +1291,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_utf_8(&self) -> Option> { if self.type_type() == Type::Utf8 { let u = self.type_(); - Some(Utf8::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Utf8::init_from_table(u) }) } else { None } @@ -1181,7 +1305,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_bool(&self) -> Option> { if self.type_type() == Type::Bool { let u = self.type_(); - Some(Bool::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Bool::init_from_table(u) }) } else 
{ None } @@ -1192,7 +1319,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_decimal(&self) -> Option> { if self.type_type() == Type::Decimal { let u = self.type_(); - Some(Decimal::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Decimal::init_from_table(u) }) } else { None } @@ -1203,7 +1333,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_date(&self) -> Option> { if self.type_type() == Type::Date { let u = self.type_(); - Some(Date::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Date::init_from_table(u) }) } else { None } @@ -1214,7 +1347,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_time(&self) -> Option> { if self.type_type() == Type::Time { let u = self.type_(); - Some(Time::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Time::init_from_table(u) }) } else { None } @@ -1225,7 +1361,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_timestamp(&self) -> Option> { if self.type_type() == Type::Timestamp { let u = self.type_(); - Some(Timestamp::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Timestamp::init_from_table(u) }) } else { None } @@ -1236,7 +1375,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_interval(&self) -> Option> { if self.type_type() == Type::Interval { let u = self.type_(); - Some(Interval::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Interval::init_from_table(u) }) } else { None } @@ -1247,7 +1389,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_list(&self) -> Option> { if self.type_type() == Type::List { let u = self.type_(); - Some(List::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { List::init_from_table(u) }) } else { None } @@ -1258,7 +1403,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_struct_(&self) -> Option> { if self.type_type() == Type::Struct_ { let u = self.type_(); - Some(Struct_::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Struct_::init_from_table(u) }) } else { None } @@ -1269,7 +1417,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_union(&self) -> Option> { if self.type_type() == Type::Union { let u = self.type_(); - Some(Union::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Union::init_from_table(u) }) } else { None } @@ -1280,7 +1431,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_fixed_size_binary(&self) -> Option> { if self.type_type() == Type::FixedSizeBinary { let u = self.type_(); - Some(FixedSizeBinary::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { FixedSizeBinary::init_from_table(u) }) } else { None } @@ -1291,7 +1445,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_fixed_size_list(&self) -> Option> { if self.type_type() == Type::FixedSizeList { let u = self.type_(); - Some(FixedSizeList::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which 
contains a valid union in this slot + Some(unsafe { FixedSizeList::init_from_table(u) }) } else { None } @@ -1302,7 +1459,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_map(&self) -> Option> { if self.type_type() == Type::Map { let u = self.type_(); - Some(Map::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Map::init_from_table(u) }) } else { None } @@ -1313,7 +1473,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_duration(&self) -> Option> { if self.type_type() == Type::Duration { let u = self.type_(); - Some(Duration::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Duration::init_from_table(u) }) } else { None } @@ -1324,7 +1487,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_large_binary(&self) -> Option> { if self.type_type() == Type::LargeBinary { let u = self.type_(); - Some(LargeBinary::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { LargeBinary::init_from_table(u) }) } else { None } @@ -1335,7 +1501,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_large_utf_8(&self) -> Option> { if self.type_type() == Type::LargeUtf8 { let u = self.type_(); - Some(LargeUtf8::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { LargeUtf8::init_from_table(u) }) } else { None } @@ -1346,7 +1515,10 @@ impl<'a> SparseTensor<'a> { pub fn type_as_large_list(&self) -> Option> { if self.type_type() == Type::LargeList { let u = self.type_(); - Some(LargeList::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { LargeList::init_from_table(u) }) } else { None } @@ -1359,7 +1531,10 @@ impl<'a> SparseTensor<'a> { ) -> Option> { if self.sparseIndex_type() == SparseTensorIndex::SparseTensorIndexCOO { let u = self.sparseIndex(); - Some(SparseTensorIndexCOO::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { SparseTensorIndexCOO::init_from_table(u) }) } else { None } @@ -1372,7 +1547,10 @@ impl<'a> SparseTensor<'a> { ) -> Option> { if self.sparseIndex_type() == SparseTensorIndex::SparseMatrixIndexCSX { let u = self.sparseIndex(); - Some(SparseMatrixIndexCSX::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { SparseMatrixIndexCSX::init_from_table(u) }) } else { None } @@ -1385,7 +1563,10 @@ impl<'a> SparseTensor<'a> { ) -> Option> { if self.sparseIndex_type() == SparseTensorIndex::SparseTensorIndexCSF { let u = self.sparseIndex(); - Some(SparseTensorIndexCSF::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { SparseTensorIndexCSF::init_from_table(u) }) } else { None } @@ -1400,7 +1581,7 @@ impl flatbuffers::Verifiable for SparseTensor<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? 
- .visit_union::(&"type_type", Self::VT_TYPE_TYPE, &"type_", Self::VT_TYPE_, true, |key, v, pos| { + .visit_union::("type_type", Self::VT_TYPE_TYPE, "type_", Self::VT_TYPE_, true, |key, v, pos| { match key { Type::Null => v.verify_union_variant::>("Type::Null", pos), Type::Int => v.verify_union_variant::>("Type::Int", pos), @@ -1426,9 +1607,9 @@ impl flatbuffers::Verifiable for SparseTensor<'_> { _ => Ok(()), } })? - .visit_field::>>>(&"shape", Self::VT_SHAPE, true)? - .visit_field::(&"non_zero_length", Self::VT_NON_ZERO_LENGTH, false)? - .visit_union::(&"sparseIndex_type", Self::VT_SPARSEINDEX_TYPE, &"sparseIndex", Self::VT_SPARSEINDEX, true, |key, v, pos| { + .visit_field::>>>("shape", Self::VT_SHAPE, true)? + .visit_field::("non_zero_length", Self::VT_NON_ZERO_LENGTH, false)? + .visit_union::("sparseIndex_type", Self::VT_SPARSEINDEX_TYPE, "sparseIndex", Self::VT_SPARSEINDEX, true, |key, v, pos| { match key { SparseTensorIndex::SparseTensorIndexCOO => v.verify_union_variant::>("SparseTensorIndex::SparseTensorIndexCOO", pos), SparseTensorIndex::SparseMatrixIndexCSX => v.verify_union_variant::>("SparseTensorIndex::SparseMatrixIndexCSX", pos), @@ -1436,7 +1617,7 @@ impl flatbuffers::Verifiable for SparseTensor<'_> { _ => Ok(()), } })? - .visit_field::(&"data", Self::VT_DATA, true)? + .visit_field::("data", Self::VT_DATA, true)? .finish(); Ok(()) } @@ -1468,6 +1649,7 @@ impl<'a> Default for SparseTensorArgs<'a> { } } } + pub struct SparseTensorBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -1546,8 +1728,8 @@ impl<'a: 'b, 'b> SparseTensorBuilder<'a, 'b> { } } -impl std::fmt::Debug for SparseTensor<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for SparseTensor<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("SparseTensor"); ds.field("type_type", &self.type_type()); match self.type_type() { @@ -1809,18 +1991,6 @@ impl std::fmt::Debug for SparseTensor<'_> { ds.finish() } } -#[inline] -#[deprecated(since = "2.0.0", note = "Deprecated in favor of `root_as...` methods.")] -pub fn get_root_as_sparse_tensor<'a>(buf: &'a [u8]) -> SparseTensor<'a> { - unsafe { flatbuffers::root_unchecked::>(buf) } -} - -#[inline] -#[deprecated(since = "2.0.0", note = "Deprecated in favor of `root_as...` methods.")] -pub fn get_size_prefixed_root_as_sparse_tensor<'a>(buf: &'a [u8]) -> SparseTensor<'a> { - unsafe { flatbuffers::size_prefixed_root_unchecked::>(buf) } -} - #[inline] /// Verifies that a buffer of bytes contains a `SparseTensor` /// and returns it. 
diff --git a/arrow/src/ipc/gen/Tensor.rs b/arrow/src/ipc/gen/Tensor.rs index 120636eaf1f5..f22ff23c98b7 100644 --- a/arrow/src/ipc/gen/Tensor.rs +++ b/arrow/src/ipc/gen/Tensor.rs @@ -36,16 +36,19 @@ pub struct TensorDim<'a> { impl<'a> flatbuffers::Follow<'a> for TensorDim<'a> { type Inner = TensorDim<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> TensorDim<'a> { + pub const VT_SIZE_: flatbuffers::VOffsetT = 4; + pub const VT_NAME: flatbuffers::VOffsetT = 6; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { TensorDim { _tab: table } } #[allow(unused_mut)] @@ -61,19 +64,24 @@ impl<'a> TensorDim<'a> { builder.finish() } - pub const VT_SIZE_: flatbuffers::VOffsetT = 4; - pub const VT_NAME: flatbuffers::VOffsetT = 6; - /// Length of dimension #[inline] pub fn size_(&self) -> i64 { - self._tab.get::(TensorDim::VT_SIZE_, Some(0)).unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { self._tab.get::(TensorDim::VT_SIZE_, Some(0)).unwrap() } } /// Name of the dimension, optional #[inline] pub fn name(&self) -> Option<&'a str> { - self._tab - .get::>(TensorDim::VT_NAME, None) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>(TensorDim::VT_NAME, None) + } } } @@ -85,9 +93,9 @@ impl flatbuffers::Verifiable for TensorDim<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::(&"size_", Self::VT_SIZE_, false)? + .visit_field::("size_", Self::VT_SIZE_, false)? .visit_field::>( - &"name", + "name", Self::VT_NAME, false, )? 
@@ -108,6 +116,7 @@ impl<'a> Default for TensorDimArgs<'a> { } } } + pub struct TensorDimBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -139,8 +148,8 @@ impl<'a: 'b, 'b> TensorDimBuilder<'a, 'b> { } } -impl std::fmt::Debug for TensorDim<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for TensorDim<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("TensorDim"); ds.field("size_", &self.size_()); ds.field("name", &self.name()); @@ -157,16 +166,22 @@ pub struct Tensor<'a> { impl<'a> flatbuffers::Follow<'a> for Tensor<'a> { type Inner = Tensor<'a>; #[inline] - fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { Self { - _tab: flatbuffers::Table { buf, loc }, + _tab: flatbuffers::Table::new(buf, loc), } } } impl<'a> Tensor<'a> { + pub const VT_TYPE_TYPE: flatbuffers::VOffsetT = 4; + pub const VT_TYPE_: flatbuffers::VOffsetT = 6; + pub const VT_SHAPE: flatbuffers::VOffsetT = 8; + pub const VT_STRIDES: flatbuffers::VOffsetT = 10; + pub const VT_DATA: flatbuffers::VOffsetT = 12; + #[inline] - pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { Tensor { _tab: table } } #[allow(unused_mut)] @@ -191,61 +206,81 @@ impl<'a> Tensor<'a> { builder.finish() } - pub const VT_TYPE_TYPE: flatbuffers::VOffsetT = 4; - pub const VT_TYPE_: flatbuffers::VOffsetT = 6; - pub const VT_SHAPE: flatbuffers::VOffsetT = 8; - pub const VT_STRIDES: flatbuffers::VOffsetT = 10; - pub const VT_DATA: flatbuffers::VOffsetT = 12; - #[inline] pub fn type_type(&self) -> Type { - self._tab - .get::(Tensor::VT_TYPE_TYPE, Some(Type::NONE)) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::(Tensor::VT_TYPE_TYPE, Some(Type::NONE)) + .unwrap() + } } /// The type of data contained in a value cell. Currently only fixed-width /// value types are supported, no strings or nested types #[inline] pub fn type_(&self) -> flatbuffers::Table<'a> { - self._tab - .get::>>( - Tensor::VT_TYPE_, - None, - ) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + Tensor::VT_TYPE_, + None, + ) + .unwrap() + } } /// The dimensions of the tensor, optionally named #[inline] pub fn shape( &self, ) -> flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>> { - self._tab - .get::>, - >>(Tensor::VT_SHAPE, None) - .unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>, + >>(Tensor::VT_SHAPE, None) + .unwrap() + } } /// Non-negative byte offsets to advance one value cell along each dimension /// If omitted, default to row-major order (C-like). 
#[inline] pub fn strides(&self) -> Option> { - self._tab - .get::>>( - Tensor::VT_STRIDES, - None, - ) + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { + self._tab + .get::>>( + Tensor::VT_STRIDES, + None, + ) + } } /// The location and size of the tensor's data #[inline] pub fn data(&self) -> &'a Buffer { - self._tab.get::(Tensor::VT_DATA, None).unwrap() + // Safety: + // Created from valid Table for this object + // which contains a valid value in this slot + unsafe { self._tab.get::(Tensor::VT_DATA, None).unwrap() } } #[inline] #[allow(non_snake_case)] pub fn type_as_null(&self) -> Option> { if self.type_type() == Type::Null { let u = self.type_(); - Some(Null::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Null::init_from_table(u) }) } else { None } @@ -256,7 +291,10 @@ impl<'a> Tensor<'a> { pub fn type_as_int(&self) -> Option> { if self.type_type() == Type::Int { let u = self.type_(); - Some(Int::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Int::init_from_table(u) }) } else { None } @@ -267,7 +305,10 @@ impl<'a> Tensor<'a> { pub fn type_as_floating_point(&self) -> Option> { if self.type_type() == Type::FloatingPoint { let u = self.type_(); - Some(FloatingPoint::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { FloatingPoint::init_from_table(u) }) } else { None } @@ -278,7 +319,10 @@ impl<'a> Tensor<'a> { pub fn type_as_binary(&self) -> Option> { if self.type_type() == Type::Binary { let u = self.type_(); - Some(Binary::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Binary::init_from_table(u) }) } else { None } @@ -289,7 +333,10 @@ impl<'a> Tensor<'a> { pub fn type_as_utf_8(&self) -> Option> { if self.type_type() == Type::Utf8 { let u = self.type_(); - Some(Utf8::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Utf8::init_from_table(u) }) } else { None } @@ -300,7 +347,10 @@ impl<'a> Tensor<'a> { pub fn type_as_bool(&self) -> Option> { if self.type_type() == Type::Bool { let u = self.type_(); - Some(Bool::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Bool::init_from_table(u) }) } else { None } @@ -311,7 +361,10 @@ impl<'a> Tensor<'a> { pub fn type_as_decimal(&self) -> Option> { if self.type_type() == Type::Decimal { let u = self.type_(); - Some(Decimal::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Decimal::init_from_table(u) }) } else { None } @@ -322,7 +375,10 @@ impl<'a> Tensor<'a> { pub fn type_as_date(&self) -> Option> { if self.type_type() == Type::Date { let u = self.type_(); - Some(Date::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Date::init_from_table(u) }) } else { None } @@ -333,7 +389,10 @@ impl<'a> Tensor<'a> { pub fn type_as_time(&self) -> Option> { if self.type_type() == Type::Time { let u = self.type_(); - Some(Time::init_from_table(u)) + 
// Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Time::init_from_table(u) }) } else { None } @@ -344,7 +403,10 @@ impl<'a> Tensor<'a> { pub fn type_as_timestamp(&self) -> Option> { if self.type_type() == Type::Timestamp { let u = self.type_(); - Some(Timestamp::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Timestamp::init_from_table(u) }) } else { None } @@ -355,7 +417,10 @@ impl<'a> Tensor<'a> { pub fn type_as_interval(&self) -> Option> { if self.type_type() == Type::Interval { let u = self.type_(); - Some(Interval::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Interval::init_from_table(u) }) } else { None } @@ -366,7 +431,10 @@ impl<'a> Tensor<'a> { pub fn type_as_list(&self) -> Option> { if self.type_type() == Type::List { let u = self.type_(); - Some(List::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { List::init_from_table(u) }) } else { None } @@ -377,7 +445,10 @@ impl<'a> Tensor<'a> { pub fn type_as_struct_(&self) -> Option> { if self.type_type() == Type::Struct_ { let u = self.type_(); - Some(Struct_::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Struct_::init_from_table(u) }) } else { None } @@ -388,7 +459,10 @@ impl<'a> Tensor<'a> { pub fn type_as_union(&self) -> Option> { if self.type_type() == Type::Union { let u = self.type_(); - Some(Union::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Union::init_from_table(u) }) } else { None } @@ -399,7 +473,10 @@ impl<'a> Tensor<'a> { pub fn type_as_fixed_size_binary(&self) -> Option> { if self.type_type() == Type::FixedSizeBinary { let u = self.type_(); - Some(FixedSizeBinary::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { FixedSizeBinary::init_from_table(u) }) } else { None } @@ -410,7 +487,10 @@ impl<'a> Tensor<'a> { pub fn type_as_fixed_size_list(&self) -> Option> { if self.type_type() == Type::FixedSizeList { let u = self.type_(); - Some(FixedSizeList::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { FixedSizeList::init_from_table(u) }) } else { None } @@ -421,7 +501,10 @@ impl<'a> Tensor<'a> { pub fn type_as_map(&self) -> Option> { if self.type_type() == Type::Map { let u = self.type_(); - Some(Map::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Map::init_from_table(u) }) } else { None } @@ -432,7 +515,10 @@ impl<'a> Tensor<'a> { pub fn type_as_duration(&self) -> Option> { if self.type_type() == Type::Duration { let u = self.type_(); - Some(Duration::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { Duration::init_from_table(u) }) } else { None } @@ -443,7 +529,10 @@ impl<'a> Tensor<'a> { pub fn type_as_large_binary(&self) -> Option> { if self.type_type() == Type::LargeBinary { let u = self.type_(); - 
Some(LargeBinary::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { LargeBinary::init_from_table(u) }) } else { None } @@ -454,7 +543,10 @@ impl<'a> Tensor<'a> { pub fn type_as_large_utf_8(&self) -> Option> { if self.type_type() == Type::LargeUtf8 { let u = self.type_(); - Some(LargeUtf8::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { LargeUtf8::init_from_table(u) }) } else { None } @@ -465,7 +557,10 @@ impl<'a> Tensor<'a> { pub fn type_as_large_list(&self) -> Option> { if self.type_type() == Type::LargeList { let u = self.type_(); - Some(LargeList::init_from_table(u)) + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { LargeList::init_from_table(u) }) } else { None } @@ -480,7 +575,7 @@ impl flatbuffers::Verifiable for Tensor<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_union::(&"type_type", Self::VT_TYPE_TYPE, &"type_", Self::VT_TYPE_, true, |key, v, pos| { + .visit_union::("type_type", Self::VT_TYPE_TYPE, "type_", Self::VT_TYPE_, true, |key, v, pos| { match key { Type::Null => v.verify_union_variant::>("Type::Null", pos), Type::Int => v.verify_union_variant::>("Type::Int", pos), @@ -506,9 +601,9 @@ impl flatbuffers::Verifiable for Tensor<'_> { _ => Ok(()), } })? - .visit_field::>>>(&"shape", Self::VT_SHAPE, true)? - .visit_field::>>(&"strides", Self::VT_STRIDES, false)? - .visit_field::(&"data", Self::VT_DATA, true)? + .visit_field::>>>("shape", Self::VT_SHAPE, true)? + .visit_field::>>("strides", Self::VT_STRIDES, false)? + .visit_field::("data", Self::VT_DATA, true)? .finish(); Ok(()) } @@ -536,6 +631,7 @@ impl<'a> Default for TensorArgs<'a> { } } } + pub struct TensorBuilder<'a: 'b, 'b> { fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, start_: flatbuffers::WIPOffset, @@ -596,8 +692,8 @@ impl<'a: 'b, 'b> TensorBuilder<'a, 'b> { } } -impl std::fmt::Debug for Tensor<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Tensor<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut ds = f.debug_struct("Tensor"); ds.field("type_type", &self.type_type()); match self.type_type() { @@ -822,18 +918,6 @@ impl std::fmt::Debug for Tensor<'_> { ds.finish() } } -#[inline] -#[deprecated(since = "2.0.0", note = "Deprecated in favor of `root_as...` methods.")] -pub fn get_root_as_tensor<'a>(buf: &'a [u8]) -> Tensor<'a> { - unsafe { flatbuffers::root_unchecked::>(buf) } -} - -#[inline] -#[deprecated(since = "2.0.0", note = "Deprecated in favor of `root_as...` methods.")] -pub fn get_size_prefixed_root_as_tensor<'a>(buf: &'a [u8]) -> Tensor<'a> { - unsafe { flatbuffers::size_prefixed_root_unchecked::>(buf) } -} - #[inline] /// Verifies that a buffer of bytes contains a `Tensor` /// and returns it. 
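One more change repeated across all three regenerated files is the switch from `std::fmt` to `core::fmt`. The two expose the same `Debug`/`Formatter` machinery, but `core::fmt` is available without the standard library, so the generated code can also build in `no_std` configurations. A minimal sketch with a hypothetical struct (not taken from the generated code):

// A hypothetical struct standing in for the generated tables;
// `core::fmt` gives the same Debug machinery without requiring `std`.
struct TensorDimLike {
    size_: i64,
    name: Option<&'static str>,
}

impl core::fmt::Debug for TensorDimLike {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let mut ds = f.debug_struct("TensorDimLike");
        ds.field("size_", &self.size_);
        ds.field("name", &self.name);
        ds.finish()
    }
}

fn main() {
    let dim = TensorDimLike { size_: 3, name: Some("rows") };
    println!("{:?}", dim);
}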
diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs index cc45b22373de..1f2824b343af 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow/src/ipc/reader.rs @@ -72,10 +72,10 @@ fn read_buffer( /// - cast the 64-bit array to the appropriate data type #[allow(clippy::too_many_arguments)] fn create_array( - nodes: &[ipc::FieldNode], + nodes: flatbuffers::Vector<'_, ipc::FieldNode>, field: &Field, data: &Buffer, - buffers: &[ipc::Buffer], + buffers: flatbuffers::Vector<'_, ipc::Buffer>, dictionaries_by_id: &HashMap, mut node_index: usize, mut buffer_index: usize, @@ -86,12 +86,13 @@ fn create_array( let array = match data_type { Utf8 | Binary | LargeBinary | LargeUtf8 => { let array = create_primitive_array( - &nodes[node_index], + nodes.get(node_index), data_type, - &buffers[buffer_index..buffer_index + 3] - .iter() - .map(|buf| read_buffer(buf, data, compression_codec)) - .collect::>>()?, + &[ + read_buffer(buffers.get(buffer_index), data, compression_codec)?, + read_buffer(buffers.get(buffer_index + 1), data, compression_codec)?, + read_buffer(buffers.get(buffer_index + 2), data, compression_codec)?, + ], ); node_index += 1; buffer_index += 3; @@ -99,23 +100,23 @@ fn create_array( } FixedSizeBinary(_) => { let array = create_primitive_array( - &nodes[node_index], + nodes.get(node_index), data_type, - &buffers[buffer_index..buffer_index + 2] - .iter() - .map(|buf| read_buffer(buf, data, compression_codec)) - .collect::>>()?, + &[ + read_buffer(buffers.get(buffer_index), data, compression_codec)?, + read_buffer(buffers.get(buffer_index + 1), data, compression_codec)?, + ], ); node_index += 1; buffer_index += 2; array } List(ref list_field) | LargeList(ref list_field) | Map(ref list_field, _) => { - let list_node = &nodes[node_index]; - let list_buffers: Vec = buffers[buffer_index..buffer_index + 2] - .iter() - .map(|buf| read_buffer(buf, data, compression_codec)) - .collect::>()?; + let list_node = nodes.get(node_index); + let list_buffers = [ + read_buffer(buffers.get(buffer_index), data, compression_codec)?, + read_buffer(buffers.get(buffer_index + 1), data, compression_codec)?, + ]; node_index += 1; buffer_index += 2; let triple = create_array( @@ -132,14 +133,15 @@ fn create_array( node_index = triple.1; buffer_index = triple.2; - create_list_array(list_node, data_type, &list_buffers[..], triple.0) + create_list_array(list_node, data_type, &list_buffers, triple.0) } FixedSizeList(ref list_field, _) => { - let list_node = &nodes[node_index]; - let list_buffers: Vec = buffers[buffer_index..=buffer_index] - .iter() - .map(|buf| read_buffer(buf, data, compression_codec)) - .collect::>()?; + let list_node = nodes.get(node_index); + let list_buffers = [read_buffer( + buffers.get(buffer_index), + data, + compression_codec, + )?]; node_index += 1; buffer_index += 1; let triple = create_array( @@ -156,12 +158,12 @@ fn create_array( node_index = triple.1; buffer_index = triple.2; - create_list_array(list_node, data_type, &list_buffers[..], triple.0) + create_list_array(list_node, data_type, &list_buffers, triple.0) } Struct(struct_fields) => { - let struct_node = &nodes[node_index]; - let null_buffer: Buffer = - read_buffer(&buffers[buffer_index], data, compression_codec)?; + let struct_node = nodes.get(node_index); + let null_buffer = + read_buffer(buffers.get(buffer_index), data, compression_codec)?; node_index += 1; buffer_index += 1; @@ -196,11 +198,11 @@ fn create_array( } // Create dictionary array from RecordBatch Dictionary(_, _) => { - let index_node = &nodes[node_index]; - let 
index_buffers: Vec = buffers[buffer_index..buffer_index + 2] - .iter() - .map(|buf| read_buffer(buf, data, compression_codec)) - .collect::>()?; + let index_node = nodes.get(node_index); + let index_buffers = [ + read_buffer(buffers.get(buffer_index), data, compression_codec)?, + read_buffer(buffers.get(buffer_index + 1), data, compression_codec)?, + ]; let dict_id = field.dict_id().ok_or_else(|| { ArrowError::IoError(format!("Field {} does not have dict id", field)) @@ -218,12 +220,12 @@ fn create_array( create_dictionary_array( index_node, data_type, - &index_buffers[..], + &index_buffers, value_array.clone(), ) } Union(fields, field_type_ids, mode) => { - let union_node = nodes[node_index]; + let union_node = nodes.get(node_index); node_index += 1; let len = union_node.length() as usize; @@ -231,12 +233,12 @@ fn create_array( // In V4, union types has validity bitmap // In V5 and later, union types have no validity bitmap if metadata < &ipc::MetadataVersion::V5 { - read_buffer(&buffers[buffer_index], data, compression_codec)?; + read_buffer(buffers.get(buffer_index), data, compression_codec)?; buffer_index += 1; } let type_ids: Buffer = - read_buffer(&buffers[buffer_index], data, compression_codec)?[..len] + read_buffer(buffers.get(buffer_index), data, compression_codec)?[..len] .into(); buffer_index += 1; @@ -244,7 +246,7 @@ fn create_array( let value_offsets = match mode { UnionMode::Dense => { let buffer = - read_buffer(&buffers[buffer_index], data, compression_codec)?; + read_buffer(buffers.get(buffer_index), data, compression_codec)?; buffer_index += 1; Some(buffer[..len * 4].into()) } @@ -277,8 +279,9 @@ fn create_array( Arc::new(array) } Null => { - let length = nodes[node_index].length(); - let null_count = nodes[node_index].null_count(); + let node = nodes.get(node_index); + let length = node.length(); + let null_count = node.null_count(); if length != null_count { return Err(ArrowError::IoError(format!( @@ -298,12 +301,12 @@ fn create_array( } _ => { let array = create_primitive_array( - &nodes[node_index], + nodes.get(node_index), data_type, - &buffers[buffer_index..buffer_index + 2] - .iter() - .map(|buf| read_buffer(buf, data, compression_codec)) - .collect::>>()?, + &[ + read_buffer(buffers.get(buffer_index), data, compression_codec)?, + read_buffer(buffers.get(buffer_index + 1), data, compression_codec)?, + ], ); node_index += 1; buffer_index += 2; @@ -875,7 +878,7 @@ impl FileReader { Ok(Self { reader, schema: Arc::new(schema), - blocks: blocks.to_vec(), + blocks: blocks.iter().copied().collect(), current_block: 0, total_blocks, dictionaries_by_id, From 23eb1acf1e10dcea9728014add1fbc01826686ab Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 21 Oct 2022 08:39:32 +1300 Subject: [PATCH 0148/1411] Update chrono-tz requirement from 0.6 to 0.7 (#2903) Updates the requirements on [chrono-tz](https://github.com/chronotope/chrono-tz) to permit the latest version. - [Release notes](https://github.com/chronotope/chrono-tz/releases) - [Changelog](https://github.com/chronotope/chrono-tz/blob/main/CHANGELOG.md) - [Commits](https://github.com/chronotope/chrono-tz/commits) --- updated-dependencies: - dependency-name: chrono-tz dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 88ed493965ca..1a282c07658c 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -61,7 +61,7 @@ lazy_static = { version = "1.4", default-features = false } lz4 = { version = "1.23", default-features = false, optional = true } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } chrono = { version = "0.4", default-features = false, features = ["clock"] } -chrono-tz = { version = "0.6", default-features = false, optional = true } +chrono-tz = { version = "0.7", default-features = false, optional = true } flatbuffers = { version = "22.9.2", default-features = false, features = ["thiserror"], optional = true } comfy-table = { version = "6.0", optional = true, default-features = false } pyo3 = { version = "0.17", default-features = false, optional = true } From 8be8209958dfd3ee51c28da9bc3e4daf4c40a146 Mon Sep 17 00:00:00 2001 From: Anthony Poncet Date: Fri, 21 Oct 2022 01:18:32 +0200 Subject: [PATCH 0149/1411] Parquet record api support timestamp before epoch (#2899) * Parquet record api support timestamp before epoch * Revert submodule changes --- parquet/src/record/api.rs | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index 7e1c484bf881..22b8a79780ab 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -799,13 +799,21 @@ fn convert_date_to_string(value: u32) -> String { format!("{}", dt.format("%Y-%m-%d %:z")) } +/// Helper method to convert Parquet timestamp into a string. +/// Input `value` is a number of seconds since the epoch in UTC. +/// Datetime is displayed in local timezone. +#[inline] +fn convert_timestamp_secs_to_string(value: i64) -> String { + let dt = Utc.timestamp(value, 0); + format!("{}", dt.format("%Y-%m-%d %H:%M:%S %:z")) +} + /// Helper method to convert Parquet timestamp into a string. /// Input `value` is a number of milliseconds since the epoch in UTC. /// Datetime is displayed in local timezone. #[inline] fn convert_timestamp_millis_to_string(value: u64) -> String { - let dt = Utc.timestamp((value / 1000) as i64, 0); - format!("{}", dt.format("%Y-%m-%d %H:%M:%S %:z")) + convert_timestamp_secs_to_string(value as i64 / 1000) } /// Helper method to convert Parquet timestamp into a string. @@ -813,7 +821,7 @@ fn convert_timestamp_millis_to_string(value: u64) -> String { /// Datetime is displayed in local timezone. #[inline] fn convert_timestamp_micros_to_string(value: u64) -> String { - convert_timestamp_millis_to_string(value / 1000) + convert_timestamp_secs_to_string(value as i64 / 1000000) } /// Helper method to convert Parquet decimal into a string. 
@@ -1082,7 +1090,7 @@ mod tests { } #[test] - fn test_convert_timestamp_to_string() { + fn test_convert_timestamp_millis_to_string() { fn check_datetime_conversion(y: u32, m: u32, d: u32, h: u32, mi: u32, s: u32) { let datetime = chrono::NaiveDate::from_ymd(y as i32, m, d).and_hms(h, mi, s); let dt = Utc.from_utc_datetime(&datetime); @@ -1091,6 +1099,25 @@ mod tests { assert_eq!(res, exp); } + check_datetime_conversion(1969, 9, 10, 1, 2, 3); + check_datetime_conversion(2010, 1, 2, 13, 12, 54); + check_datetime_conversion(2011, 1, 3, 8, 23, 1); + check_datetime_conversion(2012, 4, 5, 11, 6, 32); + check_datetime_conversion(2013, 5, 12, 16, 38, 0); + check_datetime_conversion(2014, 11, 28, 21, 15, 12); + } + + #[test] + fn test_convert_timestamp_micros_to_string() { + fn check_datetime_conversion(y: u32, m: u32, d: u32, h: u32, mi: u32, s: u32) { + let datetime = chrono::NaiveDate::from_ymd(y as i32, m, d).and_hms(h, mi, s); + let dt = Utc.from_utc_datetime(&datetime); + let res = convert_timestamp_micros_to_string(dt.timestamp_micros() as u64); + let exp = format!("{}", dt.format("%Y-%m-%d %H:%M:%S %:z")); + assert_eq!(res, exp); + } + + check_datetime_conversion(1969, 9, 10, 1, 2, 3); check_datetime_conversion(2010, 1, 2, 13, 12, 54); check_datetime_conversion(2011, 1, 3, 8, 23, 1); check_datetime_conversion(2012, 4, 5, 11, 6, 32); From 9a8b04d779fa44fc9878c7cc33ad1e66a4e6a3f6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 21 Oct 2022 12:19:05 +1300 Subject: [PATCH 0150/1411] Split out value selection kernels into arrow-select (#2594) (#2885) * Split out arrow-select (#2594) * Fix doc * Clippy --- .github/workflows/arrow.yml | 13 +- .github/workflows/arrow_flight.yml | 1 + .github/workflows/dev_pr/labeler.yml | 1 + .github/workflows/integration.yml | 1 + .github/workflows/miri.yaml | 1 + .github/workflows/parquet.yml | 1 + Cargo.toml | 1 + arrow-select/Cargo.toml | 51 ++++ .../kernels => arrow-select/src}/filter.rs | 101 +++---- .../src}/interleave.rs | 2 +- arrow-select/src/lib.rs | 22 ++ .../kernels => arrow-select/src}/take.rs | 279 +++++++++++++----- arrow/Cargo.toml | 3 +- arrow/src/compute/kernels/mod.rs | 5 +- arrow/src/compute/kernels/sort.rs | 25 +- arrow/src/compute/util.rs | 253 +--------------- 16 files changed, 353 insertions(+), 407 deletions(-) create mode 100644 arrow-select/Cargo.toml rename {arrow/src/compute/kernels => arrow-select/src}/filter.rs (95%) rename {arrow/src/compute/kernels => arrow-select/src}/interleave.rs (98%) create mode 100644 arrow-select/src/lib.rs rename {arrow/src/compute/kernels => arrow-select/src}/take.rs (88%) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 613f52f870f5..3c73f9d5c7cc 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -30,6 +30,7 @@ on: - arrow-buffer/** - arrow-data/** - arrow-schema/** + - arrow-select/** - arrow-integration-test/** - .github/** @@ -61,6 +62,8 @@ jobs: run: cargo test -p arrow-schema --all-features - name: Test arrow-array with all features run: cargo test -p arrow-array --all-features + - name: Test arrow-select with all features + run: cargo test -p arrow-select --all-features - name: Test arrow-integration-test with all features run: cargo test -p arrow-integration-test --all-features - name: Test arrow @@ -193,13 +196,15 @@ jobs: run: | rustup component add clippy - name: Clippy arrow-buffer with all features - run: cargo clippy -p arrow-buffer --all-features + run: cargo clippy -p 
arrow-buffer --all-targets --all-features - name: Clippy arrow-data with all features - run: cargo clippy -p arrow-data --all-features + run: cargo clippy -p arrow-data --all-targets --all-features - name: Clippy arrow-schema with all features - run: cargo clippy -p arrow-schema --all-features + run: cargo clippy -p arrow-schema --all-targets --all-features - name: Clippy arrow-array with all features - run: cargo clippy -p arrow-array --all-features + run: cargo clippy -p arrow-array --all-targets --all-features + - name: Clippy arrow-select with all features + run: cargo clippy -p arrow-select --all-targets --all-features - name: Clippy arrow run: | cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict --all-targets -- -D warnings diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index 3e785f056dc3..686dee9ff042 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -32,6 +32,7 @@ on: - arrow-buffer/** - arrow-data/** - arrow-schema/** + - arrow-select/** - arrow-flight/** - .github/** diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 42ab6a639c68..e44f5f8038ee 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -21,6 +21,7 @@ arrow: - arrow-buffer/**/* - arrow-data/**/* - arrow-schema/**/* + - arrow-select/**/* arrow-flight: - arrow-flight/**/* diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index bdf576af98d8..7c1d2972f452 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -29,6 +29,7 @@ on: - arrow-buffer/** - arrow-data/** - arrow-schema/** + - arrow-select/** - arrow-pyarrow-integration-testing/** - arrow-integration-test/** - arrow-integration-testing/** diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index bb75fcbbbb2a..92d6f2af2a9c 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -29,6 +29,7 @@ on: - arrow-buffer/** - arrow-data/** - arrow-schema/** + - arrow-select/** - .github/** jobs: diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 0d5dc63a7da2..3c5b2eab7d19 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -32,6 +32,7 @@ on: - arrow-buffer/** - arrow-data/** - arrow-schema/** + - arrow-select/** - parquet/** - .github/** diff --git a/Cargo.toml b/Cargo.toml index e57de7711d71..6f61b0e456d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ members = [ "arrow-integration-test", "arrow-integration-testing", "arrow-schema", + "arrow-select", "parquet", "parquet_derive", "parquet_derive_test", diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml new file mode 100644 index 000000000000..a10f9862fc40 --- /dev/null +++ b/arrow-select/Cargo.toml @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "arrow-select" +version = "25.0.0" +description = "Selection kernels for arrow arrays" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_select" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" } +arrow-data = { version = "25.0.0", path = "../arrow-data" } +arrow-schema = { version = "25.0.0", path = "../arrow-schema" } +arrow-array = { version = "25.0.0", path = "../arrow-array" } +num = { version = "0.4", default-features = false, features = ["std"] } + +[features] +default = [] + +[dev-dependencies] +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow/src/compute/kernels/filter.rs b/arrow-select/src/filter.rs similarity index 95% rename from arrow/src/compute/kernels/filter.rs rename to arrow-select/src/filter.rs index 150253b1c0de..3226c54180a3 100644 --- a/arrow/src/compute/kernels/filter.rs +++ b/arrow-select/src/filter.rs @@ -15,21 +15,21 @@ // specific language governing permissions and limitations // under the License. -//! Defines miscellaneous array kernels. +//! Defines filter kernels use std::ops::AddAssign; use std::sync::Arc; use num::Zero; -use crate::array::*; -use crate::buffer::{buffer_bin_and, Buffer, MutableBuffer}; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::record_batch::RecordBatch; -use crate::util::bit_iterator::{BitIndexIterator, BitSliceIterator}; -use crate::util::bit_util; -use crate::{downcast_dictionary_array, downcast_primitive_array}; +use arrow_array::builder::BooleanBufferBuilder; +use arrow_array::*; +use arrow_buffer::bit_util; +use arrow_buffer::{buffer::buffer_bin_and, Buffer, MutableBuffer}; +use arrow_data::bit_iterator::{BitIndexIterator, BitSliceIterator}; +use arrow_data::transform::MutableArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::*; /// If the filter selects more than this fraction of rows, use /// [`SlicesIterator`] to copy ranges of values. 
Otherwise iterate @@ -130,7 +130,7 @@ pub type Filter<'a> = Box ArrayData + 'a>; /// Deprecated: Use [`FilterBuilder`] instead #[deprecated] #[allow(deprecated)] -pub fn build_filter(filter: &BooleanArray) -> Result { +pub fn build_filter(filter: &BooleanArray) -> Result { let iter = SlicesIterator::new(filter); let filter_count = filter_count(filter); let chunks = iter.collect::>(); @@ -173,19 +173,18 @@ pub fn prep_null_mask_filter(filter: &BooleanArray) -> BooleanArray { /// /// # Example /// ```rust -/// # use arrow::array::{Int32Array, BooleanArray}; -/// # use arrow::error::Result; -/// # use arrow::compute::kernels::filter::filter; -/// # fn main() -> Result<()> { +/// # use arrow_array::{Int32Array, BooleanArray}; +/// # use arrow_select::filter::filter; /// let array = Int32Array::from(vec![5, 6, 7, 8, 9]); /// let filter_array = BooleanArray::from(vec![true, false, false, true, false]); -/// let c = filter(&array, &filter_array)?; +/// let c = filter(&array, &filter_array).unwrap(); /// let c = c.as_any().downcast_ref::().unwrap(); /// assert_eq!(c, &Int32Array::from(vec![5, 8])); -/// # Ok(()) -/// # } /// ``` -pub fn filter(values: &dyn Array, predicate: &BooleanArray) -> Result { +pub fn filter( + values: &dyn Array, + predicate: &BooleanArray, +) -> Result { let predicate = FilterBuilder::new(predicate).build(); filter_array(values, &predicate) } @@ -194,7 +193,7 @@ pub fn filter(values: &dyn Array, predicate: &BooleanArray) -> Result pub fn filter_record_batch( record_batch: &RecordBatch, predicate: &BooleanArray, -) -> Result { +) -> Result { let mut filter_builder = FilterBuilder::new(predicate); if record_batch.num_columns() > 1 { // Only optimize if filtering more than one column @@ -206,7 +205,7 @@ pub fn filter_record_batch( .columns() .iter() .map(|a| filter_array(a, &filter)) - .collect::>>()?; + .collect::, _>>()?; RecordBatch::try_new(record_batch.schema(), filtered_arrays) } @@ -318,12 +317,15 @@ pub struct FilterPredicate { impl FilterPredicate { /// Selects rows from `values` based on this [`FilterPredicate`] - pub fn filter(&self, values: &dyn Array) -> Result { + pub fn filter(&self, values: &dyn Array) -> Result { filter_array(values, self) } } -fn filter_array(values: &dyn Array, predicate: &FilterPredicate) -> Result { +fn filter_array( + values: &dyn Array, + predicate: &FilterPredicate, +) -> Result { if predicate.filter.len() > values.len() { return Err(ArrowError::InvalidArgumentError(format!( "Filter predicate of length {} is larger than target array of length {}", @@ -683,15 +685,11 @@ where #[cfg(test)] mod tests { + use arrow_array::builder::*; + use arrow_array::types::*; use rand::distributions::{Alphanumeric, Standard}; use rand::prelude::*; - use crate::datatypes::Int64Type; - use crate::{ - buffer::Buffer, - datatypes::{DataType, Field}, - }; - use super::*; macro_rules! 
def_temporal_test { @@ -922,24 +920,6 @@ mod tests { assert_eq!("world", values.value(d.keys().value(1) as usize)); } - #[test] - fn test_filter_string_array_with_negated_boolean_array() { - let a = StringArray::from(vec!["hello", " ", "world", "!"]); - let mut bb = BooleanBuilder::with_capacity(2); - bb.append_value(false); - bb.append_value(true); - bb.append_value(false); - bb.append_value(true); - let b = bb.finish(); - let b = crate::compute::not(&b).unwrap(); - - let c = filter(&a, &b).unwrap(); - let d = c.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(2, d.len()); - assert_eq!("hello", d.value(0)); - assert_eq!("world", d.value(1)); - } - #[test] fn test_filter_list_array() { let value_data = ArrayData::builder(DataType::Int32) @@ -1027,36 +1007,22 @@ mod tests { } #[test] - fn test_null_mask() -> Result<()> { - use crate::compute::kernels::comparison; - let a: PrimitiveArray = - PrimitiveArray::from(vec![Some(1), Some(2), None]); - let mask0 = comparison::eq(&a, &a)?; - let out0 = filter(&a, &mask0)?; - let out_arr0 = out0 - .as_any() - .downcast_ref::>() - .unwrap(); + fn test_null_mask() { + let a = Int64Array::from(vec![Some(1), Some(2), None]); let mask1 = BooleanArray::from(vec![Some(true), Some(true), None]); - let out1 = filter(&a, &mask1)?; - let out_arr1 = out1 - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!(mask0, mask1); - assert_eq!(out_arr0, out_arr1); - Ok(()) + let out = filter(&a, &mask1).unwrap(); + assert_eq!(&out, &a.slice(0, 2)); } #[test] - fn test_fast_path() -> Result<()> { + fn test_fast_path() { let a: PrimitiveArray = PrimitiveArray::from(vec![Some(1), Some(2), None]); // all true let mask = BooleanArray::from(vec![true, true, true]); - let out = filter(&a, &mask)?; + let out = filter(&a, &mask).unwrap(); let b = out .as_any() .downcast_ref::>() @@ -1065,10 +1031,9 @@ mod tests { // all false let mask = BooleanArray::from(vec![false, false, false]); - let out = filter(&a, &mask)?; + let out = filter(&a, &mask).unwrap(); assert_eq!(out.len(), 0); assert_eq!(out.data_type(), &DataType::Int64); - Ok(()) } #[test] diff --git a/arrow/src/compute/kernels/interleave.rs b/arrow-select/src/interleave.rs similarity index 98% rename from arrow/src/compute/kernels/interleave.rs rename to arrow-select/src/interleave.rs index 01ac0fc8fe36..537075f1f308 100644 --- a/arrow/src/compute/kernels/interleave.rs +++ b/arrow-select/src/interleave.rs @@ -45,7 +45,7 @@ use arrow_schema::ArrowError; /// values array 1 /// ``` /// -/// For selecting values by index from a single array see [compute::take](crate::compute::take) +/// For selecting values by index from a single array see [`crate::interleave`] pub fn interleave( values: &[&dyn Array], indices: &[(usize, usize)], diff --git a/arrow-select/src/lib.rs b/arrow-select/src/lib.rs new file mode 100644 index 000000000000..159c9b0ffdea --- /dev/null +++ b/arrow-select/src/lib.rs @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Arrow selection kernels + +pub mod filter; +pub mod interleave; +pub mod take; diff --git a/arrow/src/compute/kernels/take.rs b/arrow-select/src/take.rs similarity index 88% rename from arrow/src/compute/kernels/take.rs rename to arrow-select/src/take.rs index 0ef2025cf382..d52ec37b9b29 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow-select/src/take.rs @@ -19,16 +19,13 @@ use std::{ops::AddAssign, sync::Arc}; -use crate::buffer::{Buffer, MutableBuffer}; -use crate::compute::util::{ - take_value_indices_from_fixed_size_list, take_value_indices_from_list, -}; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::util::bit_util; -use crate::{ - array::*, buffer::buffer_bin_and, downcast_dictionary_array, downcast_primitive_array, +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::{ + bit_util, buffer::buffer_bin_and, ArrowNativeType, Buffer, MutableBuffer, }; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType, Field}; use num::{ToPrimitive, Zero}; @@ -49,7 +46,7 @@ use num::{ToPrimitive, Zero}; /// values array indices array result /// ``` /// -/// For selecting values by index from multiple arrays see [compute::interleave](crate::compute::interleave) +/// For selecting values by index from multiple arrays see [`crate::interleave`] /// /// # Errors /// This function errors whenever: @@ -62,28 +59,24 @@ use num::{ToPrimitive, Zero}; /// /// # Examples /// ``` -/// use arrow::array::{StringArray, UInt32Array}; -/// use arrow::error::Result; -/// use arrow::compute::take; -/// # fn main() -> Result<()> { +/// # use arrow_array::{StringArray, UInt32Array}; +/// # use arrow_select::take::take; /// let values = StringArray::from(vec!["zero", "one", "two"]); /// /// // Take items at index 2, and 1: /// let indices = UInt32Array::from(vec![2, 1]); -/// let taken = take(&values, &indices, None)?; +/// let taken = take(&values, &indices, None).unwrap(); /// let taken = taken.as_any().downcast_ref::().unwrap(); /// /// assert_eq!(*taken, StringArray::from(vec!["two", "one"])); -/// # Ok(()) -/// # } /// ``` pub fn take( values: &dyn Array, indices: &PrimitiveArray, options: Option, -) -> Result +) -> Result where - IndexType: ArrowNumericType, + IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, { take_impl(values, indices, options) @@ -93,9 +86,9 @@ fn take_impl( values: &dyn Array, indices: &PrimitiveArray, options: Option, -) -> Result +) -> Result where - IndexType: ArrowNumericType, + IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, { let options = options.unwrap_or_default(); @@ -190,7 +183,7 @@ where DataType::Struct(fields) => { let struct_: &StructArray = values.as_any().downcast_ref::().unwrap(); - let arrays: Result> = struct_ + let arrays: Result, _> = struct_ .columns() .iter() .map(|a| take_impl(a.as_ref(), indices, Some(options.clone()))) @@ -263,21 +256,24 @@ pub struct TakeOptions { } #[inline(always)] -fn maybe_usize(index: I) -> Result { +fn maybe_usize(index: I) -> Result { index .to_usize() .ok_or_else(|| ArrowError::ComputeError("Cast to usize 
failed".to_string())) } // take implementation when neither values nor indices contain nulls -fn take_no_nulls(values: &[T], indices: &[I]) -> Result<(Buffer, Option)> +fn take_no_nulls( + values: &[T], + indices: &[I], +) -> Result<(Buffer, Option), ArrowError> where T: ArrowNativeType, I: ArrowNativeType, { let values = indices .iter() - .map(|index| Result::Ok(values[maybe_usize::(*index)?])); + .map(|index| Result::<_, ArrowError>::Ok(values[maybe_usize::(*index)?])); // Soundness: `slice.map` is `TrustedLen`. let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; @@ -288,7 +284,7 @@ where fn take_values_nulls( values: &PrimitiveArray, indices: &[I], -) -> Result<(Buffer, Option)> +) -> Result<(Buffer, Option), ArrowError> where T: ArrowPrimitiveType, I: ArrowNativeType, @@ -300,7 +296,7 @@ fn take_values_nulls_inner( values_data: &ArrayData, values: &[T], indices: &[I], -) -> Result<(Buffer, Option)> +) -> Result<(Buffer, Option), ArrowError> where T: ArrowNativeType, I: ArrowNativeType, @@ -316,7 +312,7 @@ where null_count += 1; bit_util::unset_bit(null_slice, i); } - Result::Ok(values[index]) + Result::<_, ArrowError>::Ok(values[index]) }); // Soundness: `slice.map` is `TrustedLen`. let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; @@ -335,10 +331,10 @@ where fn take_indices_nulls( values: &[T], indices: &PrimitiveArray, -) -> Result<(Buffer, Option)> +) -> Result<(Buffer, Option), ArrowError> where T: ArrowNativeType, - I: ArrowNumericType, + I: ArrowPrimitiveType, I::Native: ToPrimitive, { take_indices_nulls_inner(values, indices.values(), indices.data()) @@ -348,14 +344,14 @@ fn take_indices_nulls_inner( values: &[T], indices: &[I], indices_data: &ArrayData, -) -> Result<(Buffer, Option)> +) -> Result<(Buffer, Option), ArrowError> where T: ArrowNativeType, I: ArrowNativeType, { let values = indices.iter().map(|index| { let index = maybe_usize::(*index)?; - Result::Ok(match values.get(index) { + Result::<_, ArrowError>::Ok(match values.get(index) { Some(value) => *value, None => { if indices_data.is_null(index) { @@ -382,10 +378,10 @@ where fn take_values_indices_nulls( values: &PrimitiveArray, indices: &PrimitiveArray, -) -> Result<(Buffer, Option)> +) -> Result<(Buffer, Option), ArrowError> where T: ArrowPrimitiveType, - I: ArrowNumericType, + I: ArrowPrimitiveType, I::Native: ToPrimitive, { take_values_indices_nulls_inner( @@ -401,7 +397,7 @@ fn take_values_indices_nulls_inner( values_data: &ArrayData, indices: &[I], indices_data: &ArrayData, -) -> Result<(Buffer, Option)> +) -> Result<(Buffer, Option), ArrowError> where T: ArrowNativeType, I: ArrowNativeType, @@ -422,7 +418,7 @@ where null_count += 1; bit_util::unset_bit(null_slice, i); } - Result::Ok(values[index]) + Result::<_, ArrowError>::Ok(values[index]) } }); // Soundness: `slice.map` is `TrustedLen`. 
@@ -450,10 +446,10 @@ where fn take_primitive( values: &PrimitiveArray, indices: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowPrimitiveType, - I: ArrowNumericType, + I: ArrowPrimitiveType, I::Native: ToPrimitive, { let indices_has_nulls = indices.null_count() > 0; @@ -502,9 +498,9 @@ fn take_bits( values: &Buffer, values_offset: usize, indices: &PrimitiveArray, -) -> Result +) -> Result where - IndexType: ArrowNumericType, + IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, { let len = indices.len(); @@ -518,7 +514,7 @@ where indices .iter() .enumerate() - .try_for_each::<_, Result<()>>(|(i, index)| { + .try_for_each::<_, Result<(), ArrowError>>(|(i, index)| { if let Some(index) = index { let index = ToPrimitive::to_usize(&index).ok_or_else(|| { ArrowError::ComputeError("Cast to usize failed".to_string()) @@ -536,7 +532,7 @@ where .values() .iter() .enumerate() - .try_for_each::<_, Result<()>>(|(i, index)| { + .try_for_each::<_, Result<(), ArrowError>>(|(i, index)| { let index = ToPrimitive::to_usize(index).ok_or_else(|| { ArrowError::ComputeError("Cast to usize failed".to_string()) })?; @@ -554,9 +550,9 @@ where fn take_boolean( values: &BooleanArray, indices: &PrimitiveArray, -) -> Result +) -> Result where - IndexType: ArrowNumericType, + IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, { let val_buf = take_bits(values.values(), values.offset(), indices)?; @@ -588,10 +584,10 @@ where fn take_string( array: &GenericStringArray, indices: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where OffsetSize: Zero + AddAssign + OffsetSizeTrait, - IndexType: ArrowNumericType, + IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, { let data_len = indices.len(); @@ -706,11 +702,11 @@ where fn take_list( values: &GenericListArray, indices: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where - IndexType: ArrowNumericType, + IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, - OffsetType: ArrowNumericType, + OffsetType: ArrowPrimitiveType, OffsetType::Native: ToPrimitive + OffsetSizeTrait, PrimitiveArray: From>>, { @@ -759,9 +755,9 @@ fn take_fixed_size_list( values: &FixedSizeListArray, indices: &PrimitiveArray, length: ::Native, -) -> Result +) -> Result where - IndexType: ArrowNumericType, + IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, { let list_indices = take_value_indices_from_fixed_size_list(values, indices, length)?; @@ -795,10 +791,10 @@ where fn take_binary( values: &GenericBinaryArray, indices: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where OffsetType: OffsetSizeTrait, - IndexType: ArrowNumericType, + IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, { let data_ref = values.data_ref(); @@ -813,7 +809,7 @@ where Ok(None) } }) - .collect::>>()? + .collect::, ArrowError>>()? .into_iter(); Ok(array_iter.collect::>()) @@ -822,9 +818,9 @@ where fn take_fixed_size_binary( values: &FixedSizeBinaryArray, indices: &PrimitiveArray, -) -> Result +) -> Result where - IndexType: ArrowNumericType, + IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, { let data_ref = values.data_ref(); @@ -839,7 +835,7 @@ where Ok(None) } }) - .collect::>>()? + .collect::, ArrowError>>()? 
.into_iter(); FixedSizeBinaryArray::try_from_sparse_iter(array_iter) @@ -852,11 +848,11 @@ where fn take_dict( values: &DictionaryArray, indices: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowPrimitiveType, T::Native: num::Num, - I: ArrowNumericType, + I: ArrowPrimitiveType, I::Native: ToPrimitive, { let new_keys = take_primitive::(values.keys(), indices)?; @@ -877,10 +873,88 @@ where Ok(DictionaryArray::::from(data)) } +/// Takes/filters a list array's inner data using the offsets of the list array. +/// +/// Where a list array has indices `[0,2,5,10]`, taking indices of `[2,0]` returns +/// an array of the indices `[5..10, 0..2]` and offsets `[0,5,7]` (5 elements and 2 +/// elements) +fn take_value_indices_from_list( + list: &GenericListArray, + indices: &PrimitiveArray, +) -> Result<(PrimitiveArray, Vec), ArrowError> +where + IndexType: ArrowPrimitiveType, + IndexType::Native: ToPrimitive, + OffsetType: ArrowPrimitiveType, + OffsetType::Native: OffsetSizeTrait + std::ops::Add + num::Zero + num::One, + PrimitiveArray: From>>, +{ + // TODO: benchmark this function, there might be a faster unsafe alternative + let offsets: &[OffsetType::Native] = list.value_offsets(); + + let mut new_offsets = Vec::with_capacity(indices.len()); + let mut values = Vec::new(); + let mut current_offset = OffsetType::Native::zero(); + // add first offset + new_offsets.push(OffsetType::Native::zero()); + // compute the value indices, and set offsets accordingly + for i in 0..indices.len() { + if indices.is_valid(i) { + let ix = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { + ArrowError::ComputeError("Cast to usize failed".to_string()) + })?; + let start = offsets[ix]; + let end = offsets[ix + 1]; + current_offset += end - start; + new_offsets.push(current_offset); + + let mut curr = start; + + // if start == end, this slot is empty + while curr < end { + values.push(Some(curr)); + curr += num::One::one(); + } + } else { + new_offsets.push(current_offset); + } + } + + Ok((PrimitiveArray::::from(values), new_offsets)) +} + +/// Takes/filters a fixed size list array's inner data using the offsets of the list array. 
+fn take_value_indices_from_fixed_size_list( + list: &FixedSizeListArray, + indices: &PrimitiveArray, + length: ::Native, +) -> Result, ArrowError> +where + IndexType: ArrowPrimitiveType, + IndexType::Native: ToPrimitive, +{ + let mut values = vec![]; + + for i in 0..indices.len() { + if indices.is_valid(i) { + let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { + ArrowError::ComputeError("Cast to usize failed".to_string()) + })?; + let start = + list.value_offset(index) as ::Native; + + values.extend(start..start + length); + } + } + + Ok(PrimitiveArray::::from(values)) +} + #[cfg(test)] mod tests { use super::*; - use crate::compute::util::tests::build_fixed_size_list_nullable; + use arrow_array::builder::*; + use arrow_schema::TimeUnit; fn test_take_decimal_arrays( data: Vec>, @@ -889,7 +963,7 @@ mod tests { expected_data: Vec>, precision: &u8, scale: &u8, - ) -> Result<()> { + ) -> Result<(), ArrowError> { let output = data .into_iter() .collect::() @@ -925,7 +999,7 @@ mod tests { index: &UInt32Array, options: Option, expected_data: Vec>, - ) -> Result<()> + ) -> Result<(), ArrowError> where T: ArrowPrimitiveType, PrimitiveArray: From>>, @@ -942,7 +1016,7 @@ mod tests { index: &UInt32Array, options: Option, expected_data: Vec>, - ) -> Result<()> + ) -> Result<(), ArrowError> where T: ArrowPrimitiveType, PrimitiveArray: From>, @@ -963,7 +1037,7 @@ mod tests { ) where T: ArrowPrimitiveType, PrimitiveArray: From>>, - I: ArrowNumericType, + I: ArrowPrimitiveType, I::Native: ToPrimitive, { let output = PrimitiveArray::::from(data); @@ -1697,11 +1771,13 @@ mod tests { { let indices = UInt32Array::from(indices); - let input_array = build_fixed_size_list_nullable::(input_data, length); + let input_array = + FixedSizeListArray::from_iter_primitive::(input_data, length); let output = take_fixed_size_list(&input_array, &indices, length as u32).unwrap(); - let expected = build_fixed_size_list_nullable::(expected_data, length); + let expected = + FixedSizeListArray::from_iter_primitive::(expected_data, length); assert_eq!(&output, &expected) } @@ -1988,4 +2064,75 @@ mod tests { ]); assert_eq!(result.keys(), &expected_keys); } + + fn build_generic_list(data: Vec>>) -> GenericListArray + where + S: OffsetSizeTrait + 'static, + T: ArrowPrimitiveType, + PrimitiveArray: From>>, + { + GenericListArray::from_iter_primitive::( + data.iter() + .map(|x| x.as_ref().map(|x| x.iter().map(|x| Some(*x)))), + ) + } + + #[test] + fn test_take_value_index_from_list() { + let list = build_generic_list::(vec![ + Some(vec![0, 1]), + Some(vec![2, 3, 4]), + Some(vec![5, 6, 7, 8, 9]), + ]); + let indices = UInt32Array::from(vec![2, 0]); + + let (indexed, offsets) = take_value_indices_from_list(&list, &indices).unwrap(); + + assert_eq!(indexed, Int32Array::from(vec![5, 6, 7, 8, 9, 0, 1])); + assert_eq!(offsets, vec![0, 5, 7]); + } + + #[test] + fn test_take_value_index_from_large_list() { + let list = build_generic_list::(vec![ + Some(vec![0, 1]), + Some(vec![2, 3, 4]), + Some(vec![5, 6, 7, 8, 9]), + ]); + let indices = UInt32Array::from(vec![2, 0]); + + let (indexed, offsets) = + take_value_indices_from_list::<_, Int64Type>(&list, &indices).unwrap(); + + assert_eq!(indexed, Int64Array::from(vec![5, 6, 7, 8, 9, 0, 1])); + assert_eq!(offsets, vec![0, 5, 7]); + } + + #[test] + fn test_take_value_index_from_fixed_list() { + let list = FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(1), Some(2), None]), + Some(vec![Some(4), None, Some(6)]), + None, + Some(vec![None, Some(8), Some(9)]), + ], 
+ 3, + ); + + let indices = UInt32Array::from(vec![2, 1, 0]); + let indexed = + take_value_indices_from_fixed_size_list(&list, &indices, 3).unwrap(); + + assert_eq!(indexed, UInt32Array::from(vec![6, 7, 8, 3, 4, 5, 0, 1, 2])); + + let indices = UInt32Array::from(vec![3, 2, 1, 2, 0]); + let indexed = + take_value_indices_from_fixed_size_list(&list, &indices, 3).unwrap(); + + assert_eq!( + indexed, + UInt32Array::from(vec![9, 10, 11, 6, 7, 8, 3, 4, 5, 6, 7, 8, 0, 1, 2]) + ); + } } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 1a282c07658c..d066cbce956b 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -48,11 +48,12 @@ arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" } arrow-data = { version = "25.0.0", path = "../arrow-data" } arrow-schema = { version = "25.0.0", path = "../arrow-schema" } arrow-array = { version = "25.0.0", path = "../arrow-array" } +arrow-select = { version = "25.0.0", path = "../arrow-select" } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } indexmap = { version = "1.9", default-features = false, features = ["std"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } -half = { version = "2.0", default-features = false, features = ["num-traits"]} +half = { version = "2.0", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.12", default-features = false } csv_crate = { version = "1.1", default-features = false, optional = true, package = "csv" } regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs index 8301f69bbf8b..68ae2439f2db 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels/mod.rs @@ -27,15 +27,14 @@ pub mod cast_utils; pub mod comparison; pub mod concat; pub mod concat_elements; -pub mod filter; -pub mod interleave; pub mod length; pub mod limit; pub mod partition; pub mod regexp; pub mod sort; pub mod substring; -pub mod take; pub mod temporal; pub mod window; pub mod zip; + +pub use arrow_select::{filter, interleave, take}; diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index ef423fcbf428..71bc9464ef50 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -1106,9 +1106,6 @@ fn sort_valids_array( #[cfg(test)] mod tests { use super::*; - use crate::compute::util::tests::{ - build_fixed_size_list_nullable, build_generic_list_nullable, - }; use rand::rngs::StdRng; use rand::{Rng, RngCore, SeedableRng}; use std::convert::TryFrom; @@ -1356,12 +1353,15 @@ mod tests { { // for FixedSizedList if let Some(length) = fixed_length { - let input = Arc::new(build_fixed_size_list_nullable(data.clone(), length)); + let input = Arc::new(FixedSizeListArray::from_iter_primitive::( + data.clone(), + length, + )); let sorted = match limit { Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), _ => sort(&(input as ArrayRef), options).unwrap(), }; - let expected = Arc::new(build_fixed_size_list_nullable( + let expected = Arc::new(FixedSizeListArray::from_iter_primitive::( expected_data.clone(), length, )) as ArrayRef; @@ -1370,25 +1370,26 @@ mod tests { } // for List - let input = Arc::new(build_generic_list_nullable::(data.clone())); + let input = Arc::new(ListArray::from_iter_primitive::(data.clone())); let sorted = match limit { 
Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), _ => sort(&(input as ArrayRef), options).unwrap(), }; - let expected = - Arc::new(build_generic_list_nullable::(expected_data.clone())) - as ArrayRef; + let expected = Arc::new(ListArray::from_iter_primitive::( + expected_data.clone(), + )) as ArrayRef; assert_eq!(&sorted, &expected); // for LargeList - let input = Arc::new(build_generic_list_nullable::(data)); + let input = Arc::new(LargeListArray::from_iter_primitive::(data)); let sorted = match limit { Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), _ => sort(&(input as ArrayRef), options).unwrap(), }; - let expected = - Arc::new(build_generic_list_nullable::(expected_data)) as ArrayRef; + let expected = Arc::new(LargeListArray::from_iter_primitive::( + expected_data, + )) as ArrayRef; assert_eq!(&sorted, &expected); } diff --git a/arrow/src/compute/util.rs b/arrow/src/compute/util.rs index 974af9593e36..9ddc535017ff 100644 --- a/arrow/src/compute/util.rs +++ b/arrow/src/compute/util.rs @@ -19,10 +19,7 @@ use crate::array::*; use crate::buffer::{buffer_bin_and, Buffer}; -use crate::datatypes::*; use crate::error::{ArrowError, Result}; -use num::{One, ToPrimitive, Zero}; -use std::ops::Add; /// Combines the null bitmaps of multiple arrays using a bitwise `and` operation. /// @@ -61,93 +58,15 @@ pub(super) fn combine_option_bitmap( ) } -/// Takes/filters a list array's inner data using the offsets of the list array. -/// -/// Where a list array has indices `[0,2,5,10]`, taking indices of `[2,0]` returns -/// an array of the indices `[5..10, 0..2]` and offsets `[0,5,7]` (5 elements and 2 -/// elements) -pub(super) fn take_value_indices_from_list( - list: &GenericListArray, - indices: &PrimitiveArray, -) -> Result<(PrimitiveArray, Vec)> -where - IndexType: ArrowNumericType, - IndexType::Native: ToPrimitive, - OffsetType: ArrowNumericType, - OffsetType::Native: OffsetSizeTrait + Add + Zero + One, - PrimitiveArray: From>>, -{ - // TODO: benchmark this function, there might be a faster unsafe alternative - let offsets: &[OffsetType::Native] = list.value_offsets(); - - let mut new_offsets = Vec::with_capacity(indices.len()); - let mut values = Vec::new(); - let mut current_offset = OffsetType::Native::zero(); - // add first offset - new_offsets.push(OffsetType::Native::zero()); - // compute the value indices, and set offsets accordingly - for i in 0..indices.len() { - if indices.is_valid(i) { - let ix = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - let start = offsets[ix]; - let end = offsets[ix + 1]; - current_offset += end - start; - new_offsets.push(current_offset); - - let mut curr = start; - - // if start == end, this slot is empty - while curr < end { - values.push(Some(curr)); - curr += OffsetType::Native::one(); - } - } else { - new_offsets.push(current_offset); - } - } - - Ok((PrimitiveArray::::from(values), new_offsets)) -} - -/// Takes/filters a fixed size list array's inner data using the offsets of the list array. 
-pub(super) fn take_value_indices_from_fixed_size_list( - list: &FixedSizeListArray, - indices: &PrimitiveArray, - length: ::Native, -) -> Result> -where - IndexType: ArrowNumericType, - IndexType::Native: ToPrimitive, -{ - let mut values = vec![]; - - for i in 0..indices.len() { - if indices.is_valid(i) { - let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - let start = - list.value_offset(index) as ::Native; - - values.extend(start..start + length); - } - } - - Ok(PrimitiveArray::::from(values)) -} - #[cfg(test)] pub(super) mod tests { use super::*; use std::sync::Arc; + use crate::array::ArrayData; use crate::buffer::buffer_bin_or; use crate::datatypes::DataType; - use crate::util::bit_util; - use crate::{array::ArrayData, buffer::MutableBuffer}; /// Compares the null bitmaps of two arrays using a bitwise `or` operation. /// @@ -321,174 +240,4 @@ pub(super) mod tests { compare_option_bitmap(&some_bitmap, &inverse_bitmap, 8,).unwrap() ); } - - pub(crate) fn build_generic_list( - data: Vec>>, - ) -> GenericListArray - where - S: OffsetSizeTrait + 'static, - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - let data = data - .into_iter() - .map(|subarray| { - subarray.map(|item| { - item.into_iter() - .map(Some) - .collect::>>() - }) - }) - .collect(); - build_generic_list_nullable(data) - } - - pub(crate) fn build_generic_list_nullable( - data: Vec>>>, - ) -> GenericListArray - where - S: OffsetSizeTrait + 'static, - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - let mut offset = vec![S::zero()]; - let mut values = vec![]; - - let list_len = data.len(); - let num_bytes = bit_util::ceil(list_len, 8); - let mut list_null_count = 0; - let mut list_bitmap = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - for (idx, array) in data.into_iter().enumerate() { - if let Some(mut array) = array { - values.append(&mut array); - } else { - list_null_count += 1; - bit_util::unset_bit(list_bitmap.as_slice_mut(), idx); - } - offset.push(S::from_usize(values.len()).unwrap()); - } - - let value_data = PrimitiveArray::::from(values).into_data(); - let (list_data_type, value_offsets) = ( - GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new(Field::new( - "item", - T::DATA_TYPE, - list_null_count == 0, - ))), - Buffer::from_slice_ref(&offset), - ); - - let list_data = ArrayData::builder(list_data_type) - .len(list_len) - .null_bit_buffer(Some(list_bitmap.into())) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build() - .unwrap(); - - GenericListArray::::from(list_data) - } - - pub(crate) fn build_fixed_size_list_nullable( - list_values: Vec>>>, - length: ::Native, - ) -> FixedSizeListArray - where - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - let mut values = vec![]; - let mut list_null_count = 0; - let list_len = list_values.len(); - - let num_bytes = bit_util::ceil(list_len, 8); - let mut list_bitmap = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - for (idx, list_element) in list_values.into_iter().enumerate() { - if let Some(items) = list_element { - // every sub-array should have the same length - debug_assert_eq!(length as usize, items.len()); - - values.extend(items.into_iter()); - } else { - list_null_count += 1; - bit_util::unset_bit(list_bitmap.as_slice_mut(), idx); - values.extend(vec![None; length as usize].into_iter()); - } - } - - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", T::DATA_TYPE, list_null_count == 0)), - 
length, - ); - - let child_data = PrimitiveArray::::from(values).into_data(); - - let list_data = ArrayData::builder(list_data_type) - .len(list_len) - .null_bit_buffer(Some(list_bitmap.into())) - .add_child_data(child_data) - .build() - .unwrap(); - - FixedSizeListArray::from(list_data) - } - - #[test] - fn test_take_value_index_from_list() { - let list = build_generic_list::(vec![ - Some(vec![0, 1]), - Some(vec![2, 3, 4]), - Some(vec![5, 6, 7, 8, 9]), - ]); - let indices = UInt32Array::from(vec![2, 0]); - - let (indexed, offsets) = take_value_indices_from_list(&list, &indices).unwrap(); - - assert_eq!(indexed, Int32Array::from(vec![5, 6, 7, 8, 9, 0, 1])); - assert_eq!(offsets, vec![0, 5, 7]); - } - - #[test] - fn test_take_value_index_from_large_list() { - let list = build_generic_list::(vec![ - Some(vec![0, 1]), - Some(vec![2, 3, 4]), - Some(vec![5, 6, 7, 8, 9]), - ]); - let indices = UInt32Array::from(vec![2, 0]); - - let (indexed, offsets) = - take_value_indices_from_list::<_, Int64Type>(&list, &indices).unwrap(); - - assert_eq!(indexed, Int64Array::from(vec![5, 6, 7, 8, 9, 0, 1])); - assert_eq!(offsets, vec![0, 5, 7]); - } - - #[test] - fn test_take_value_index_from_fixed_list() { - let list = build_fixed_size_list_nullable::( - vec![ - Some(vec![Some(1), Some(2), None]), - Some(vec![Some(4), None, Some(6)]), - None, - Some(vec![None, Some(8), Some(9)]), - ], - 3, - ); - - let indices = UInt32Array::from(vec![2, 1, 0]); - let indexed = - take_value_indices_from_fixed_size_list(&list, &indices, 3).unwrap(); - - assert_eq!(indexed, UInt32Array::from(vec![6, 7, 8, 3, 4, 5, 0, 1, 2])); - - let indices = UInt32Array::from(vec![3, 2, 1, 2, 0]); - let indexed = - take_value_indices_from_fixed_size_list(&list, &indices, 3).unwrap(); - - assert_eq!( - indexed, - UInt32Array::from(vec![9, 10, 11, 6, 7, 8, 3, 4, 5, 6, 7, 8, 0, 1, 2]) - ); - } } From 5de555e3396225ff427185f81788481a2c11543f Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Fri, 21 Oct 2022 09:14:56 +0800 Subject: [PATCH 0151/1411] replace from_timestamp by from_timestamp_opt (#2894) --- arrow-array/src/array/primitive_array.rs | 19 +++++- arrow-array/src/temporal_conversions.rs | 76 ++++++++++++------------ 2 files changed, 54 insertions(+), 41 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 4722cec67c65..5a9ffd34cdb4 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1362,11 +1362,24 @@ mod tests { } #[test] - #[should_panic(expected = "invalid time")] fn test_time32second_invalid_neg() { - // The panic should come from chrono, not from arrow + // chrono::NaiveDatetime::from_timestamp_opt returns None while input is invalid let arr: PrimitiveArray = vec![-7201, -60054].into(); - println!("{:?}", arr); + assert_eq!( + "PrimitiveArray\n[\n null,\n null,\n]", + format!("{:?}", arr) + ) + } + + #[test] + fn test_timestamp_micros_out_of_range() { + // replicate the issue from https://github.com/apache/arrow-datafusion/issues/3832 + let arr: PrimitiveArray = + vec![9065525203050843594].into(); + assert_eq!( + "PrimitiveArray\n[\n null,\n]", + format!("{:?}", arr) + ) } #[test] diff --git a/arrow-array/src/temporal_conversions.rs b/arrow-array/src/temporal_conversions.rs index 4a371fc788e9..9aae83c8ad69 100644 --- a/arrow-array/src/temporal_conversions.rs +++ b/arrow-array/src/temporal_conversions.rs @@ -37,16 +37,16 @@ pub const EPOCH_DAYS_FROM_CE: i32 = 719_163; /// converts a `i32` representing a `date32` to 
[`NaiveDateTime`] #[inline] -pub fn date32_to_datetime(v: i32) -> NaiveDateTime { - NaiveDateTime::from_timestamp(v as i64 * SECONDS_IN_DAY, 0) +pub fn date32_to_datetime(v: i32) -> Option { + NaiveDateTime::from_timestamp_opt(v as i64 * SECONDS_IN_DAY, 0) } /// converts a `i64` representing a `date64` to [`NaiveDateTime`] #[inline] -pub fn date64_to_datetime(v: i64) -> NaiveDateTime { +pub fn date64_to_datetime(v: i64) -> Option { let (sec, milli_sec) = split_second(v, MILLISECONDS); - NaiveDateTime::from_timestamp( + NaiveDateTime::from_timestamp_opt( // extract seconds from milliseconds sec, // discard extracted seconds and convert milliseconds to nanoseconds @@ -56,15 +56,15 @@ pub fn date64_to_datetime(v: i64) -> NaiveDateTime { /// converts a `i32` representing a `time32(s)` to [`NaiveDateTime`] #[inline] -pub fn time32s_to_time(v: i32) -> NaiveTime { - NaiveTime::from_num_seconds_from_midnight(v as u32, 0) +pub fn time32s_to_time(v: i32) -> Option { + NaiveTime::from_num_seconds_from_midnight_opt(v as u32, 0) } /// converts a `i32` representing a `time32(ms)` to [`NaiveDateTime`] #[inline] -pub fn time32ms_to_time(v: i32) -> NaiveTime { +pub fn time32ms_to_time(v: i32) -> Option { let v = v as i64; - NaiveTime::from_num_seconds_from_midnight( + NaiveTime::from_num_seconds_from_midnight_opt( // extract seconds from milliseconds (v / MILLISECONDS) as u32, // discard extracted seconds and convert milliseconds to @@ -75,8 +75,8 @@ pub fn time32ms_to_time(v: i32) -> NaiveTime { /// converts a `i64` representing a `time64(us)` to [`NaiveDateTime`] #[inline] -pub fn time64us_to_time(v: i64) -> NaiveTime { - NaiveTime::from_num_seconds_from_midnight( +pub fn time64us_to_time(v: i64) -> Option { + NaiveTime::from_num_seconds_from_midnight_opt( // extract seconds from microseconds (v / MICROSECONDS) as u32, // discard extracted seconds and convert microseconds to @@ -87,8 +87,8 @@ pub fn time64us_to_time(v: i64) -> NaiveTime { /// converts a `i64` representing a `time64(ns)` to [`NaiveDateTime`] #[inline] -pub fn time64ns_to_time(v: i64) -> NaiveTime { - NaiveTime::from_num_seconds_from_midnight( +pub fn time64ns_to_time(v: i64) -> Option { + NaiveTime::from_num_seconds_from_midnight_opt( // extract seconds from nanoseconds (v / NANOSECONDS) as u32, // discard extracted seconds @@ -98,16 +98,16 @@ pub fn time64ns_to_time(v: i64) -> NaiveTime { /// converts a `i64` representing a `timestamp(s)` to [`NaiveDateTime`] #[inline] -pub fn timestamp_s_to_datetime(v: i64) -> NaiveDateTime { - NaiveDateTime::from_timestamp(v, 0) +pub fn timestamp_s_to_datetime(v: i64) -> Option { + NaiveDateTime::from_timestamp_opt(v, 0) } /// converts a `i64` representing a `timestamp(ms)` to [`NaiveDateTime`] #[inline] -pub fn timestamp_ms_to_datetime(v: i64) -> NaiveDateTime { +pub fn timestamp_ms_to_datetime(v: i64) -> Option { let (sec, milli_sec) = split_second(v, MILLISECONDS); - NaiveDateTime::from_timestamp( + NaiveDateTime::from_timestamp_opt( // extract seconds from milliseconds sec, // discard extracted seconds and convert milliseconds to nanoseconds @@ -117,10 +117,10 @@ pub fn timestamp_ms_to_datetime(v: i64) -> NaiveDateTime { /// converts a `i64` representing a `timestamp(us)` to [`NaiveDateTime`] #[inline] -pub fn timestamp_us_to_datetime(v: i64) -> NaiveDateTime { +pub fn timestamp_us_to_datetime(v: i64) -> Option { let (sec, micro_sec) = split_second(v, MICROSECONDS); - NaiveDateTime::from_timestamp( + NaiveDateTime::from_timestamp_opt( // extract seconds from microseconds sec, // discard extracted 
seconds and convert microseconds to nanoseconds @@ -130,10 +130,10 @@ pub fn timestamp_us_to_datetime(v: i64) -> NaiveDateTime { /// converts a `i64` representing a `timestamp(ns)` to [`NaiveDateTime`] #[inline] -pub fn timestamp_ns_to_datetime(v: i64) -> NaiveDateTime { +pub fn timestamp_ns_to_datetime(v: i64) -> Option { let (sec, nano_sec) = split_second(v, NANOSECONDS); - NaiveDateTime::from_timestamp( + NaiveDateTime::from_timestamp_opt( // extract seconds from nanoseconds sec, // discard extracted seconds nano_sec, @@ -172,14 +172,14 @@ pub fn duration_ns_to_duration(v: i64) -> Duration { /// Converts an [`ArrowPrimitiveType`] to [`NaiveDateTime`] pub fn as_datetime(v: i64) -> Option { match T::DATA_TYPE { - DataType::Date32 => Some(date32_to_datetime(v as i32)), - DataType::Date64 => Some(date64_to_datetime(v)), + DataType::Date32 => date32_to_datetime(v as i32), + DataType::Date64 => date64_to_datetime(v), DataType::Time32(_) | DataType::Time64(_) => None, DataType::Timestamp(unit, _) => match unit { - TimeUnit::Second => Some(timestamp_s_to_datetime(v)), - TimeUnit::Millisecond => Some(timestamp_ms_to_datetime(v)), - TimeUnit::Microsecond => Some(timestamp_us_to_datetime(v)), - TimeUnit::Nanosecond => Some(timestamp_ns_to_datetime(v)), + TimeUnit::Second => timestamp_s_to_datetime(v), + TimeUnit::Millisecond => timestamp_ms_to_datetime(v), + TimeUnit::Microsecond => timestamp_us_to_datetime(v), + TimeUnit::Nanosecond => timestamp_ns_to_datetime(v), }, // interval is not yet fully documented [ARROW-3097] DataType::Interval(_) => None, @@ -199,14 +199,14 @@ pub fn as_time(v: i64) -> Option { // safe to immediately cast to u32 as `self.value(i)` is positive i32 let v = v as u32; match unit { - TimeUnit::Second => Some(time32s_to_time(v as i32)), - TimeUnit::Millisecond => Some(time32ms_to_time(v as i32)), + TimeUnit::Second => time32s_to_time(v as i32), + TimeUnit::Millisecond => time32ms_to_time(v as i32), _ => None, } } DataType::Time64(unit) => match unit { - TimeUnit::Microsecond => Some(time64us_to_time(v)), - TimeUnit::Nanosecond => Some(time64ns_to_time(v)), + TimeUnit::Microsecond => time64us_to_time(v), + TimeUnit::Nanosecond => time64ns_to_time(v), _ => None, }, DataType::Timestamp(_, _) => as_datetime::(v).map(|datetime| datetime.time()), @@ -241,12 +241,12 @@ mod tests { fn negative_input_timestamp_ns_to_datetime() { assert_eq!( timestamp_ns_to_datetime(-1), - NaiveDateTime::from_timestamp(-1, 999_999_999) + NaiveDateTime::from_timestamp_opt(-1, 999_999_999) ); assert_eq!( timestamp_ns_to_datetime(-1_000_000_001), - NaiveDateTime::from_timestamp(-2, 999_999_999) + NaiveDateTime::from_timestamp_opt(-2, 999_999_999) ); } @@ -254,12 +254,12 @@ mod tests { fn negative_input_timestamp_us_to_datetime() { assert_eq!( timestamp_us_to_datetime(-1), - NaiveDateTime::from_timestamp(-1, 999_999_000) + NaiveDateTime::from_timestamp_opt(-1, 999_999_000) ); assert_eq!( timestamp_us_to_datetime(-1_000_001), - NaiveDateTime::from_timestamp(-2, 999_999_000) + NaiveDateTime::from_timestamp_opt(-2, 999_999_000) ); } @@ -267,12 +267,12 @@ mod tests { fn negative_input_timestamp_ms_to_datetime() { assert_eq!( timestamp_ms_to_datetime(-1), - NaiveDateTime::from_timestamp(-1, 999_000_000) + NaiveDateTime::from_timestamp_opt(-1, 999_000_000) ); assert_eq!( timestamp_ms_to_datetime(-1_001), - NaiveDateTime::from_timestamp(-2, 999_000_000) + NaiveDateTime::from_timestamp_opt(-2, 999_000_000) ); } @@ -280,12 +280,12 @@ mod tests { fn negative_input_date64_to_datetime() { assert_eq!( 
date64_to_datetime(-1), - NaiveDateTime::from_timestamp(-1, 999_000_000) + NaiveDateTime::from_timestamp_opt(-1, 999_000_000) ); assert_eq!( date64_to_datetime(-1_001), - NaiveDateTime::from_timestamp(-2, 999_000_000) + NaiveDateTime::from_timestamp_opt(-2, 999_000_000) ); } From be483777092cb1007ced0323a1c659c2634b1a5c Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 20 Oct 2022 23:43:23 -0700 Subject: [PATCH 0152/1411] Cleanup decimal sort function (#2908) --- arrow/src/compute/kernels/sort.rs | 35 +++---------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 71bc9464ef50..6720a0c5c704 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -146,7 +146,9 @@ pub fn sort_to_indices( let (v, n) = partition_validity(values); Ok(match values.data_type() { - DataType::Decimal128(_, _) => sort_decimal(values, v, n, cmp, &options, limit), + DataType::Decimal128(_, _) => { + sort_primitive::(values, v, n, cmp, &options, limit) + } DataType::Boolean => sort_boolean(values, v, n, &options, limit), DataType::Int8 => { sort_primitive::(values, v, n, cmp, &options, limit) @@ -474,37 +476,6 @@ fn sort_boolean( UInt32Array::from(result_data) } -/// Sort Decimal array -fn sort_decimal( - decimal_values: &ArrayRef, - value_indices: Vec, - null_indices: Vec, - cmp: F, - options: &SortOptions, - limit: Option, -) -> UInt32Array -where - F: Fn(i128, i128) -> std::cmp::Ordering, -{ - // downcast to decimal array - let decimal_array = decimal_values - .as_any() - .downcast_ref::() - .expect("Unable to downcast to decimal array"); - let valids = value_indices - .into_iter() - .map(|index| (index, decimal_array.value(index as usize))) - .collect::>(); - sort_primitive_inner( - decimal_values.len(), - null_indices, - cmp, - options, - limit, - valids, - ) -} - /// Sort primitive values fn sort_primitive( values: &ArrayRef, From f629a2ebe08033e7b78585d82e98c50a4439e7a2 Mon Sep 17 00:00:00 2001 From: Max Burke Date: Fri, 21 Oct 2022 12:06:46 -0700 Subject: [PATCH 0153/1411] Implement ord for FixedSizeBinary types (#2905) * implement ord for FixedSizeBinary types * add ord test --- arrow/src/array/ord.rs | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/arrow/src/array/ord.rs b/arrow/src/array/ord.rs index 3fc62f807bef..305d41cc0167 100644 --- a/arrow/src/array/ord.rs +++ b/arrow/src/array/ord.rs @@ -295,6 +295,14 @@ pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result { + let left: FixedSizeBinaryArray = + FixedSizeBinaryArray::from(left.data().clone()); + let right: FixedSizeBinaryArray = + FixedSizeBinaryArray::from(right.data().clone()); + + Box::new(move |i, j| left.value(i).cmp(right.value(j))) + } (lhs, _) => { return Err(ArrowError::InvalidArgumentError(format!( "The data type type {:?} has no natural order", @@ -307,10 +315,34 @@ pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result Result<()> { + let items = vec![vec![1u8], vec![2u8]]; + let array = FixedSizeBinaryArray::try_from_iter(items.into_iter()).unwrap(); + + let cmp = build_compare(&array, &array)?; + + assert_eq!(Ordering::Less, (cmp)(0, 1)); + Ok(()) + } + + #[test] + fn test_fixed_size_binary_fixed_size_binary() -> Result<()> { + let items = vec![vec![1u8]]; + let array1 = FixedSizeBinaryArray::try_from_iter(items.into_iter()).unwrap(); + let items = vec![vec![2u8]]; + let array2 = 
FixedSizeBinaryArray::try_from_iter(items.into_iter()).unwrap(); + + let cmp = build_compare(&array1, &array2)?; + + assert_eq!(Ordering::Less, (cmp)(0, 0)); + Ok(()) + } + #[test] fn test_i32() -> Result<()> { let array = Int32Array::from(vec![1, 2]); From 9e5e47716898ade5e6ffcff1f77551f82d55a1b8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 22 Oct 2022 08:26:56 +1300 Subject: [PATCH 0154/1411] Add specialized interleave implementation for primitives (#2898) --- arrow-select/src/interleave.rs | 63 ++++++++++++++++++-- arrow/Cargo.toml | 5 ++ arrow/benches/interleave_kernels.rs | 91 +++++++++++++++++++++++++++++ 3 files changed, 155 insertions(+), 4 deletions(-) create mode 100644 arrow/benches/interleave_kernels.rs diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs index 537075f1f308..29f75894dcb9 100644 --- a/arrow-select/src/interleave.rs +++ b/arrow-select/src/interleave.rs @@ -15,9 +15,22 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::{make_array, new_empty_array, Array, ArrayRef}; +use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; +use arrow_array::cast::as_primitive_array; +use arrow_array::{ + downcast_primitive, make_array, new_empty_array, Array, ArrayRef, ArrowPrimitiveType, + PrimitiveArray, +}; use arrow_data::transform::MutableArrayData; -use arrow_schema::ArrowError; +use arrow_data::ArrayDataBuilder; +use arrow_schema::{ArrowError, DataType}; +use std::sync::Arc; + +macro_rules! primitive_helper { + ($t:ty, $values:ident, $indices:ident, $data_type:ident) => { + interleave_primitive::<$t>($values, $indices, $data_type) + }; +} /// /// Takes elements by index from a list of [`Array`], creating a new [`Array`] from those values. @@ -70,9 +83,51 @@ pub fn interleave( return Ok(new_empty_array(data_type)); } - // TODO: Add specialized implementations (#2864) + downcast_primitive! 
{ + data_type => (primitive_helper, values, indices, data_type), + _ => interleave_fallback(values, indices) + } +} + +fn interleave_primitive( + values: &[&dyn Array], + indices: &[(usize, usize)], + data_type: &DataType, +) -> Result { + let mut has_nulls = false; + let cast: Vec<_> = values + .iter() + .map(|x| { + has_nulls = has_nulls || x.null_count() != 0; + as_primitive_array::(*x) + }) + .collect(); + + let mut values = BufferBuilder::::new(indices.len()); + for (a, b) in indices { + let v = cast[*a].value(*b); + values.append(v) + } + + let mut null_count = 0; + let nulls = has_nulls.then(|| { + let mut builder = BooleanBufferBuilder::new(indices.len()); + for (a, b) in indices { + let v = cast[*a].is_valid(*b); + null_count += !v as usize; + builder.append(v) + } + builder.finish() + }); + + let builder = ArrayDataBuilder::new(data_type.clone()) + .len(indices.len()) + .add_buffer(values.finish()) + .null_bit_buffer(nulls) + .null_count(null_count); - interleave_fallback(values, indices) + let data = unsafe { builder.build_unchecked() }; + Ok(Arc::new(PrimitiveArray::::from(data))) } /// Fallback implementation of interleave using [`MutableArrayData`] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index d066cbce956b..7a933360c0c3 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -176,6 +176,11 @@ name = "take_kernels" harness = false required-features = ["test_utils"] +[[bench]] +name = "interleave_kernels" +harness = false +required-features = ["test_utils"] + [[bench]] name = "length_kernel" harness = false diff --git a/arrow/benches/interleave_kernels.rs b/arrow/benches/interleave_kernels.rs new file mode 100644 index 000000000000..6cf56eb98950 --- /dev/null +++ b/arrow/benches/interleave_kernels.rs @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
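// Editorial sketch, not part of the patch above: a minimal example of how the
// specialised interleave kernel might be called. Imports mirror the benchmark
// in this file; treating them as available here is an assumption of the sketch.
fn interleave_usage_sketch() {
    use arrow::array::{Array, Int32Array};
    use arrow_select::interleave::interleave;

    let a = Int32Array::from(vec![1, 2, 3]);
    let b = Int32Array::from(vec![10, 20]);
    let values: Vec<&dyn Array> = vec![&a, &b];
    // Each index is an (array index, value index) pair into `values`,
    // so this picks b[1], a[0], b[0].
    let out = interleave(&values, &[(1, 1), (0, 0), (1, 0)]).unwrap();
    assert_eq!(out.len(), 3);
}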
+ +#[macro_use] +extern crate criterion; + +use criterion::Criterion; +use std::ops::Range; + +use rand::Rng; + +extern crate arrow; + +use arrow::datatypes::*; +use arrow::util::test_util::seedable_rng; +use arrow::{array::*, util::bench_util::*}; +use arrow_select::interleave::interleave; + +fn do_bench( + c: &mut Criterion, + prefix: &str, + len: usize, + base: &dyn Array, + slices: &[Range], +) { + let mut rng = seedable_rng(); + + let arrays: Vec<_> = slices + .iter() + .map(|r| base.slice(r.start, r.end - r.start)) + .collect(); + let values: Vec<_> = arrays.iter().map(|x| x.as_ref()).collect(); + + let indices: Vec<_> = (0..len) + .map(|_| { + let array_idx = rng.gen_range(0..values.len()); + let value_idx = rng.gen_range(0..values[array_idx].len()); + (array_idx, value_idx) + }) + .collect(); + + c.bench_function( + &format!("interleave {} {} {:?}", prefix, len, slices), + |b| b.iter(|| criterion::black_box(interleave(&values, &indices).unwrap())), + ); +} + +fn add_benchmark(c: &mut Criterion) { + let a = create_primitive_array::(1024, 0.); + + do_bench(c, "i32(0.0)", 100, &a, &[0..100, 100..230, 450..1000]); + do_bench(c, "i32(0.0)", 400, &a, &[0..100, 100..230, 450..1000]); + do_bench(c, "i32(0.0)", 1024, &a, &[0..100, 100..230, 450..1000]); + do_bench( + c, + "i32(0.0)", + 1024, + &a, + &[0..100, 100..230, 450..1000, 0..1000], + ); + + let a = create_primitive_array::(1024, 0.5); + + do_bench(c, "i32(0.5)", 100, &a, &[0..100, 100..230, 450..1000]); + do_bench(c, "i32(0.5)", 400, &a, &[0..100, 100..230, 450..1000]); + do_bench(c, "i32(0.5)", 1024, &a, &[0..100, 100..230, 450..1000]); + do_bench( + c, + "i32(0.5)", + 1024, + &a, + &[0..100, 100..230, 450..1000, 0..1000], + ); +} + +criterion_group!(benches, add_benchmark); +criterion_main!(benches); From 22e742bc76b5db333ab2384a92c362662cca3879 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 23 Oct 2022 00:08:05 -0700 Subject: [PATCH 0155/1411] Support decimal256 array in sort kernels (#2912) * Support decimal256 array in sort kernels * Add i256::MAX and i256::MIN cases --- arrow/src/compute/kernels/sort.rs | 465 ++++++++++++++++++++++++++++-- 1 file changed, 443 insertions(+), 22 deletions(-) diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 6720a0c5c704..e2e20e756065 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -149,6 +149,9 @@ pub fn sort_to_indices( DataType::Decimal128(_, _) => { sort_primitive::(values, v, n, cmp, &options, limit) } + DataType::Decimal256(_, _) => { + sort_primitive::(values, v, n, cmp, &options, limit) + } DataType::Boolean => sort_boolean(values, v, n, &options, limit), DataType::Int8 => { sort_primitive::(values, v, n, cmp, &options, limit) @@ -1077,39 +1080,77 @@ fn sort_valids_array( #[cfg(test)] mod tests { use super::*; + use arrow_buffer::i256; use rand::rngs::StdRng; use rand::{Rng, RngCore, SeedableRng}; use std::convert::TryFrom; use std::sync::Arc; - fn create_decimal_array(data: Vec>) -> Decimal128Array { + fn create_decimal128_array(data: Vec>) -> Decimal128Array { data.into_iter() .collect::() .with_precision_and_scale(23, 6) .unwrap() } - fn test_sort_to_indices_decimal_array( + fn create_decimal256_array(data: Vec>) -> Decimal256Array { + data.into_iter() + .collect::() + .with_precision_and_scale(53, 6) + .unwrap() + } + + fn test_sort_to_indices_decimal128_array( data: Vec>, options: Option, limit: Option, expected_data: Vec, ) { - let output = create_decimal_array(data); + let output = 
create_decimal128_array(data); + let expected = UInt32Array::from(expected_data); + let output = + sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); + assert_eq!(output, expected) + } + + fn test_sort_to_indices_decimal256_array( + data: Vec>, + options: Option, + limit: Option, + expected_data: Vec, + ) { + let output = create_decimal256_array(data); let expected = UInt32Array::from(expected_data); let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); assert_eq!(output, expected) } - fn test_sort_decimal_array( + fn test_sort_decimal128_array( data: Vec>, options: Option, limit: Option, expected_data: Vec>, ) { - let output = create_decimal_array(data); - let expected = Arc::new(create_decimal_array(expected_data)) as ArrayRef; + let output = create_decimal128_array(data); + let expected = Arc::new(create_decimal128_array(expected_data)) as ArrayRef; + let output = match limit { + Some(_) => { + sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap() + } + _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), + }; + assert_eq!(&output, &expected) + } + + fn test_sort_decimal256_array( + data: Vec>, + options: Option, + limit: Option, + expected_data: Vec>, + ) { + let output = create_decimal256_array(data); + let expected = Arc::new(create_decimal256_array(expected_data)) as ArrayRef; let output = match limit { Some(_) => { sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap() @@ -1758,14 +1799,14 @@ mod tests { #[test] fn test_sort_indices_decimal128() { // decimal default - test_sort_to_indices_decimal_array( + test_sort_to_indices_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], None, None, vec![0, 6, 4, 2, 3, 5, 1], ); // decimal descending - test_sort_to_indices_decimal_array( + test_sort_to_indices_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: true, @@ -1775,7 +1816,7 @@ mod tests { vec![1, 5, 3, 2, 4, 6, 0], ); // decimal null_first and descending - test_sort_to_indices_decimal_array( + test_sort_to_indices_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: true, @@ -1785,7 +1826,7 @@ mod tests { vec![6, 0, 1, 5, 3, 2, 4], ); // decimal null_first - test_sort_to_indices_decimal_array( + test_sort_to_indices_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: false, @@ -1795,14 +1836,14 @@ mod tests { vec![0, 6, 4, 2, 3, 5, 1], ); // limit - test_sort_to_indices_decimal_array( + test_sort_to_indices_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], None, Some(3), vec![0, 6, 4], ); // limit descending - test_sort_to_indices_decimal_array( + test_sort_to_indices_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: true, @@ -1812,7 +1853,7 @@ mod tests { vec![1, 5, 3], ); // limit descending null_first - test_sort_to_indices_decimal_array( + test_sort_to_indices_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: true, @@ -1822,7 +1863,7 @@ mod tests { vec![6, 0, 1], ); // limit null_first - test_sort_to_indices_decimal_array( + test_sort_to_indices_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: false, @@ -1833,17 +1874,186 @@ mod tests { ); } + #[test] + fn 
test_sort_indices_decimal256() { + // decimal default + test_sort_to_indices_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + None, + None, + vec![0, 6, 4, 2, 3, 5, 1], + ); + // decimal descending + test_sort_to_indices_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + Some(SortOptions { + descending: true, + nulls_first: false, + }), + None, + vec![1, 5, 3, 2, 4, 6, 0], + ); + // decimal null_first and descending + test_sort_to_indices_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + Some(SortOptions { + descending: true, + nulls_first: true, + }), + None, + vec![6, 0, 1, 5, 3, 2, 4], + ); + // decimal null_first + test_sort_to_indices_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + Some(SortOptions { + descending: false, + nulls_first: true, + }), + None, + vec![0, 6, 4, 2, 3, 5, 1], + ); + // limit + test_sort_to_indices_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + None, + Some(3), + vec![0, 6, 4], + ); + // limit descending + test_sort_to_indices_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + Some(SortOptions { + descending: true, + nulls_first: false, + }), + Some(3), + vec![1, 5, 3], + ); + // limit descending null_first + test_sort_to_indices_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + Some(SortOptions { + descending: true, + nulls_first: true, + }), + Some(3), + vec![6, 0, 1], + ); + // limit null_first + test_sort_to_indices_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + Some(SortOptions { + descending: false, + nulls_first: true, + }), + Some(3), + vec![0, 6, 4], + ); + } + + #[test] + fn test_sort_indices_decimal256_max_min() { + test_sort_to_indices_decimal256_array( + vec![ + None, + Some(i256::MIN), + Some(i256::from_i128(1)), + Some(i256::MAX), + Some(i256::from_i128(-1)), + ], + Some(SortOptions { + descending: false, + nulls_first: true, + }), + None, + vec![0, 1, 4, 2, 3], + ); + + test_sort_to_indices_decimal256_array( + vec![ + None, + Some(i256::MIN), + Some(i256::from_i128(1)), + Some(i256::MAX), + Some(i256::from_i128(-1)), + ], + Some(SortOptions { + descending: true, + nulls_first: true, + }), + None, + vec![0, 3, 2, 4, 1], + ); + + test_sort_to_indices_decimal256_array( + vec![ + None, + Some(i256::MIN), + Some(i256::from_i128(1)), + Some(i256::MAX), + Some(i256::from_i128(-1)), + ], + Some(SortOptions { + descending: false, + nulls_first: true, + }), + Some(4), + vec![0, 1, 4, 2], + ); + + test_sort_to_indices_decimal256_array( + vec![ + None, + Some(i256::MIN), + Some(i256::from_i128(1)), + Some(i256::MAX), + Some(i256::from_i128(-1)), + ], + Some(SortOptions { + descending: true, + nulls_first: true, + }), + Some(4), + vec![0, 3, 2, 4], + ); + } + #[test] fn test_sort_decimal128() { // decimal default - test_sort_decimal_array( + test_sort_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), 
Some(4), None], None, None, vec![None, None, Some(1), Some(2), Some(3), Some(4), Some(5)], ); // decimal descending - test_sort_decimal_array( + test_sort_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: true, @@ -1853,7 +2063,7 @@ mod tests { vec![Some(5), Some(4), Some(3), Some(2), Some(1), None, None], ); // decimal null_first and descending - test_sort_decimal_array( + test_sort_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: true, @@ -1863,7 +2073,7 @@ mod tests { vec![None, None, Some(5), Some(4), Some(3), Some(2), Some(1)], ); // decimal null_first - test_sort_decimal_array( + test_sort_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: false, @@ -1873,14 +2083,14 @@ mod tests { vec![None, None, Some(1), Some(2), Some(3), Some(4), Some(5)], ); // limit - test_sort_decimal_array( + test_sort_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], None, Some(3), vec![None, None, Some(1)], ); // limit descending - test_sort_decimal_array( + test_sort_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: true, @@ -1890,7 +2100,7 @@ mod tests { vec![Some(5), Some(4), Some(3)], ); // limit descending null_first - test_sort_decimal_array( + test_sort_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: true, @@ -1900,7 +2110,7 @@ mod tests { vec![None, None, Some(5)], ); // limit null_first - test_sort_decimal_array( + test_sort_decimal128_array( vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None], Some(SortOptions { descending: false, @@ -1911,6 +2121,217 @@ mod tests { ); } + #[test] + fn test_sort_decimal256() { + // decimal default + test_sort_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + None, + None, + vec![None, None, Some(1), Some(2), Some(3), Some(4), Some(5)] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + ); + // decimal descending + test_sort_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + Some(SortOptions { + descending: true, + nulls_first: false, + }), + None, + vec![Some(5), Some(4), Some(3), Some(2), Some(1), None, None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + ); + // decimal null_first and descending + test_sort_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + Some(SortOptions { + descending: true, + nulls_first: true, + }), + None, + vec![None, None, Some(5), Some(4), Some(3), Some(2), Some(1)] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + ); + // decimal null_first + test_sort_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + Some(SortOptions { + descending: false, + nulls_first: true, + }), + None, + vec![None, None, Some(1), Some(2), Some(3), Some(4), Some(5)] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + ); + // limit + test_sort_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + None, + Some(3), + vec![None, 
None, Some(1)] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + ); + // limit descending + test_sort_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + Some(SortOptions { + descending: true, + nulls_first: false, + }), + Some(3), + vec![Some(5), Some(4), Some(3)] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + ); + // limit descending null_first + test_sort_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + Some(SortOptions { + descending: true, + nulls_first: true, + }), + Some(3), + vec![None, None, Some(5)] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + ); + // limit null_first + test_sort_decimal256_array( + vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + Some(SortOptions { + descending: false, + nulls_first: true, + }), + Some(3), + vec![None, None, Some(1)] + .iter() + .map(|v| v.map(i256::from_i128)) + .collect(), + ); + } + + #[test] + fn test_sort_decimal256_max_min() { + test_sort_decimal256_array( + vec![ + None, + Some(i256::MIN), + Some(i256::from_i128(1)), + Some(i256::MAX), + Some(i256::from_i128(-1)), + None, + ], + Some(SortOptions { + descending: false, + nulls_first: true, + }), + None, + vec![ + None, + None, + Some(i256::MIN), + Some(i256::from_i128(-1)), + Some(i256::from_i128(1)), + Some(i256::MAX), + ], + ); + + test_sort_decimal256_array( + vec![ + None, + Some(i256::MIN), + Some(i256::from_i128(1)), + Some(i256::MAX), + Some(i256::from_i128(-1)), + None, + ], + Some(SortOptions { + descending: true, + nulls_first: true, + }), + None, + vec![ + None, + None, + Some(i256::MAX), + Some(i256::from_i128(1)), + Some(i256::from_i128(-1)), + Some(i256::MIN), + ], + ); + + test_sort_decimal256_array( + vec![ + None, + Some(i256::MIN), + Some(i256::from_i128(1)), + Some(i256::MAX), + Some(i256::from_i128(-1)), + None, + ], + Some(SortOptions { + descending: false, + nulls_first: true, + }), + Some(4), + vec![None, None, Some(i256::MIN), Some(i256::from_i128(-1))], + ); + + test_sort_decimal256_array( + vec![ + None, + Some(i256::MIN), + Some(i256::from_i128(1)), + Some(i256::MAX), + Some(i256::from_i128(-1)), + None, + ], + Some(SortOptions { + descending: true, + nulls_first: true, + }), + Some(4), + vec![None, None, Some(i256::MAX), Some(i256::from_i128(1))], + ); + } + #[test] fn test_sort_primitives() { // default case From e859f30c52fe70e132ca57ac708989282a331d79 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 25 Oct 2022 07:25:10 +1300 Subject: [PATCH 0156/1411] Update quick-xml requirement from 0.25.0 to 0.26.0 (#2918) Updates the requirements on [quick-xml](https://github.com/tafia/quick-xml) to permit the latest version. - [Release notes](https://github.com/tafia/quick-xml/releases) - [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md) - [Commits](https://github.com/tafia/quick-xml/compare/v0.25.0...v0.26.0) --- updated-dependencies: - dependency-name: quick-xml dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- object_store/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 6abb390fc800..e52137383959 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -44,7 +44,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.13", default-features = false, optional = true } -quick-xml = { version = "0.25.0", features = ["serialize"], optional = true } +quick-xml = { version = "0.26.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From 7e5d4a1b3e506246cab6bbbf0fe16122e4defa6e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 25 Oct 2022 07:32:12 +1300 Subject: [PATCH 0157/1411] Respect Page Size Limits in ArrowWriter (#2853) (#2890) * Respect Page Size Limits in ArrowWriter (#2853) * Update tests * Add test required features * Fix strings * Review feedback --- parquet/Cargo.toml | 4 + parquet/src/arrow/arrow_writer/byte_array.rs | 17 +- parquet/src/column/writer/encoder.rs | 1 + parquet/src/column/writer/mod.rs | 12 +- .../src/encodings/encoding/dict_encoder.rs | 3 +- parquet/src/encodings/encoding/mod.rs | 2 +- parquet/src/encodings/levels.rs | 9 +- parquet/src/encodings/rle.rs | 46 +- parquet/tests/arrow_writer_layout.rs | 472 ++++++++++++++++++ 9 files changed, 514 insertions(+), 52 deletions(-) create mode 100644 parquet/tests/arrow_writer_layout.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 819f41bca32a..9c7da94f9dd7 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -81,6 +81,10 @@ experimental = [] # Enable async APIs async = ["futures", "tokio"] +[[test]] +name = "arrow_writer_layout" +required-features = ["arrow"] + [[bin]] name = "parquet-read" required-features = ["cli"] diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index 9ea3767a28ed..7070cecacf2b 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -379,8 +379,7 @@ impl DictEncoder { fn estimated_data_page_size(&self) -> usize { let bit_width = self.bit_width(); - 1 + RleEncoder::min_buffer_size(bit_width) - + RleEncoder::max_buffer_size(bit_width, self.indices.len()) + 1 + RleEncoder::max_buffer_size(bit_width, self.indices.len()) } fn estimated_dict_page_size(&self) -> usize { @@ -427,7 +426,6 @@ impl DictEncoder { struct ByteArrayEncoder { fallback: FallbackEncoder, dict_encoder: Option, - num_values: usize, min_value: Option, max_value: Option, } @@ -466,7 +464,6 @@ impl ColumnValueEncoder for ByteArrayEncoder { Ok(Self { fallback, dict_encoder: dictionary, - num_values: 0, min_value: None, max_value: None, }) @@ -487,7 +484,10 @@ impl ColumnValueEncoder for ByteArrayEncoder { } fn num_values(&self) -> usize { - self.num_values + match &self.dict_encoder { + Some(encoder) => encoder.indices.len(), + None => self.fallback.num_values, + } } fn has_dictionary(&self) -> bool { @@ -508,7 +508,7 @@ impl ColumnValueEncoder for ByteArrayEncoder { fn flush_dict_page(&mut self) -> Result> { match self.dict_encoder.take() { 
Some(encoder) => { - if self.num_values != 0 { + if !encoder.indices.is_empty() { return Err(general_err!( "Must flush data pages before flushing dictionary" )); @@ -551,10 +551,7 @@ where match &mut encoder.dict_encoder { Some(dict_encoder) => dict_encoder.encode(values, indices), - None => { - encoder.num_values += indices.len(); - encoder.fallback.encode(values, indices) - } + None => encoder.fallback.encode(values, indices), } } diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index 4fb4f210e146..9227c4ba1ce8 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -201,6 +201,7 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { } fn write_gather(&mut self, values: &Self::Values, indices: &[usize]) -> Result<()> { + self.num_values += indices.len(); let slice: Vec<_> = indices.iter().map(|idx| values[*idx].clone()).collect(); self.write_slice(&slice) } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 55e667043d35..0f96b6fd78e5 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1825,7 +1825,7 @@ mod tests { let page_writer = Box::new(SerializedPageWriter::new(&mut writer)); let props = Arc::new( WriterProperties::builder() - .set_data_pagesize_limit(15) // actually each page will have size 15-18 bytes + .set_data_pagesize_limit(10) .set_write_batch_size(3) // write 3 values at a time .build(), ); @@ -1846,16 +1846,14 @@ mod tests { ); let mut res = Vec::new(); while let Some(page) = page_reader.get_next_page().unwrap() { - res.push((page.page_type(), page.num_values())); + res.push((page.page_type(), page.num_values(), page.buffer().len())); } assert_eq!( res, vec![ - (PageType::DICTIONARY_PAGE, 10), - (PageType::DATA_PAGE, 3), - (PageType::DATA_PAGE, 3), - (PageType::DATA_PAGE, 3), - (PageType::DATA_PAGE, 1) + (PageType::DICTIONARY_PAGE, 10, 40), + (PageType::DATA_PAGE, 9, 10), + (PageType::DATA_PAGE, 1, 3), ] ); } diff --git a/parquet/src/encodings/encoding/dict_encoder.rs b/parquet/src/encodings/encoding/dict_encoder.rs index 18deba65e687..1b516452083c 100644 --- a/parquet/src/encodings/encoding/dict_encoder.rs +++ b/parquet/src/encodings/encoding/dict_encoder.rs @@ -162,8 +162,7 @@ impl Encoder for DictEncoder { fn estimated_data_encoded_size(&self) -> usize { let bit_width = self.bit_width(); - 1 + RleEncoder::min_buffer_size(bit_width) - + RleEncoder::max_buffer_size(bit_width, self.indices.len()) + RleEncoder::max_buffer_size(bit_width, self.indices.len()) } fn flush_buffer(&mut self) -> Result { diff --git a/parquet/src/encodings/encoding/mod.rs b/parquet/src/encodings/encoding/mod.rs index 34d3bb3d4c75..78f4a8b97b33 100644 --- a/parquet/src/encodings/encoding/mod.rs +++ b/parquet/src/encodings/encoding/mod.rs @@ -888,7 +888,7 @@ mod tests { // DICTIONARY // NOTE: The final size is almost the same because the dictionary entries are // preserved after encoded values have been written. 
- run_test::(Encoding::RLE_DICTIONARY, -1, &[123, 1024], 11, 68, 66); + run_test::(Encoding::RLE_DICTIONARY, -1, &[123, 1024], 0, 2, 0); // DELTA_BINARY_PACKED run_test::(Encoding::DELTA_BINARY_PACKED, -1, &[123; 1024], 0, 35, 0); diff --git a/parquet/src/encodings/levels.rs b/parquet/src/encodings/levels.rs index 95384926ddba..cf1da20b6842 100644 --- a/parquet/src/encodings/levels.rs +++ b/parquet/src/encodings/levels.rs @@ -38,13 +38,8 @@ pub fn max_buffer_size( ) -> usize { let bit_width = num_required_bits(max_level as u64); match encoding { - Encoding::RLE => { - RleEncoder::max_buffer_size(bit_width, num_buffered_values) - + RleEncoder::min_buffer_size(bit_width) - } - Encoding::BIT_PACKED => { - ceil((num_buffered_values * bit_width as usize) as i64, 8) as usize - } + Encoding::RLE => RleEncoder::max_buffer_size(bit_width, num_buffered_values), + Encoding::BIT_PACKED => ceil(num_buffered_values * bit_width as usize, 8), _ => panic!("Unsupported encoding type {}", encoding), } } diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 93dd4ab565ca..9475275cb625 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -42,9 +42,8 @@ use crate::util::{ /// repeated-value := value that is repeated, using a fixed-width of /// round-up-to-next-byte(bit-width) -/// Maximum groups per bit-packed run. Current value is 64. +/// Maximum groups of 8 values per bit-packed run. Current value is 64. const MAX_GROUPS_PER_BIT_PACKED_RUN: usize = 1 << 6; -const MAX_VALUES_PER_BIT_PACKED_RUN: usize = MAX_GROUPS_PER_BIT_PACKED_RUN * 8; /// A RLE/Bit-Packing hybrid encoder. // TODO: tracking memory usage @@ -99,31 +98,28 @@ impl RleEncoder { } } - /// Returns the minimum buffer size needed to use the encoder for `bit_width`. - /// This is the maximum length of a single run for `bit_width`. - pub fn min_buffer_size(bit_width: u8) -> usize { - let max_bit_packed_run_size = 1 + bit_util::ceil( - (MAX_VALUES_PER_BIT_PACKED_RUN * bit_width as usize) as i64, - 8, - ); - let max_rle_run_size = - bit_util::MAX_VLQ_BYTE_LEN + bit_util::ceil(bit_width as i64, 8) as usize; - std::cmp::max(max_bit_packed_run_size as usize, max_rle_run_size) - } - - /// Returns the maximum buffer size takes to encode `num_values` values with + /// Returns the maximum buffer size to encode `num_values` values with /// `bit_width`. 
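// Editorial worked example, not part of the patch: with the formula below,
// bit_width = 1 and num_values = 10 give
//   num_runs            = ceil(10 / 8)    = 2
//   bit_packed_max_size = 2 + 2 * 1       = 4 bytes
//   min_rle_run_size    = 1 + ceil(1 / 8) = 2 bytes
//   rle_max_size        = 2 * 2           = 4 bytes
// so max_buffer_size(1, 10) returns max(4, 4) = 4; the updated
// test_rle_specific_roundtrip below now sizes its buffer this way instead of
// calling the removed min_buffer_size.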
pub fn max_buffer_size(bit_width: u8, num_values: usize) -> usize { - // First the maximum size for bit-packed run - let bytes_per_run = bit_width; - let num_runs = bit_util::ceil(num_values as i64, 8) as usize; - let bit_packed_max_size = num_runs + num_runs * bytes_per_run as usize; + // The maximum size occurs with the shortest possible runs of 8 + let num_runs = bit_util::ceil(num_values, 8); + + // The number of bytes in a run of 8 + let bytes_per_run = bit_width as usize; + + // The maximum size if stored as shortest possible bit packed runs of 8 + let bit_packed_max_size = num_runs + num_runs * bytes_per_run; + + // The length of `8` VLQ encoded + let rle_len_prefix = 1; + + // The length of an RLE run of 8 + let min_rle_run_size = rle_len_prefix + bit_util::ceil(bit_width as usize, 8); + + // The maximum size if stored as shortest possible RLE runs of 8 + let rle_max_size = num_runs * min_rle_run_size; - // Second the maximum size for RLE run - let min_rle_run_size = 1 + bit_util::ceil(bit_width as i64, 8) as usize; - let rle_max_size = - bit_util::ceil(num_values as i64, 8) as usize * min_rle_run_size; - std::cmp::max(bit_packed_max_size, rle_max_size) as usize + bit_packed_max_size.max(rle_max_size) } /// Encodes `value`, which must be representable with `bit_width` bits. @@ -905,8 +901,8 @@ mod tests { #[test] fn test_rle_specific_roundtrip() { let bit_width = 1; - let buffer_len = RleEncoder::min_buffer_size(bit_width); let values: Vec = vec![0, 1, 1, 1, 1, 0, 0, 0, 0, 1]; + let buffer_len = RleEncoder::max_buffer_size(bit_width, values.len()); let mut encoder = RleEncoder::new(bit_width, buffer_len); for v in &values { encoder.put(*v as u64) diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs new file mode 100644 index 000000000000..40076add325a --- /dev/null +++ b/parquet/tests/arrow_writer_layout.rs @@ -0,0 +1,472 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Tests that the ArrowWriter correctly lays out values into multiple pages + +use arrow::array::{Int32Array, StringArray}; +use arrow::record_batch::RecordBatch; +use bytes::Bytes; +use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder}; +use parquet::arrow::ArrowWriter; +use parquet::basic::{Encoding, PageType}; +use parquet::file::metadata::ParquetMetaData; +use parquet::file::properties::WriterProperties; +use parquet::file::reader::SerializedPageReader; +use std::sync::Arc; + +struct Layout { + row_groups: Vec, +} + +struct RowGroup { + columns: Vec, +} + +struct ColumnChunk { + pages: Vec, + dictionary_page: Option, +} + +struct Page { + rows: usize, + compressed_size: usize, + page_header_size: usize, + encoding: Encoding, + page_type: PageType, +} + +struct LayoutTest { + props: WriterProperties, + batches: Vec, + layout: Layout, +} + +fn do_test(test: LayoutTest) { + let mut buf = Vec::with_capacity(1024); + + let mut writer = + ArrowWriter::try_new(&mut buf, test.batches[0].schema(), Some(test.props)) + .unwrap(); + for batch in test.batches { + writer.write(&batch).unwrap(); + } + writer.close().unwrap(); + let b = Bytes::from(buf); + + // Re-read file to decode column index + let read_options = ArrowReaderOptions::new().with_page_index(true); + let reader = + ParquetRecordBatchReaderBuilder::try_new_with_options(b.clone(), read_options) + .unwrap(); + + assert_layout(&b, reader.metadata().as_ref(), &test.layout); +} + +fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { + assert_eq!(meta.row_groups().len(), layout.row_groups.len()); + for (row_group, row_group_layout) in meta.row_groups().iter().zip(&layout.row_groups) + { + // Check against offset index + let offset_index = row_group.page_offset_index().as_ref().unwrap(); + assert_eq!(offset_index.len(), row_group_layout.columns.len()); + + for (column_index, column_layout) in + offset_index.iter().zip(&row_group_layout.columns) + { + assert_eq!( + column_index.len(), + column_layout.pages.len(), + "index page count mismatch" + ); + for (idx, (page, page_layout)) in + column_index.iter().zip(&column_layout.pages).enumerate() + { + assert_eq!( + page.compressed_page_size as usize, + page_layout.compressed_size + page_layout.page_header_size, + "index page {} size mismatch", + idx + ); + let next_first_row_index = column_index + .get(idx + 1) + .map(|x| x.first_row_index) + .unwrap_or_else(|| row_group.num_rows()); + + let num_rows = next_first_row_index - page.first_row_index; + assert_eq!( + num_rows as usize, page_layout.rows, + "index page {} row count", + idx + ); + } + } + + // Check against page data + assert_eq!( + row_group.columns().len(), + row_group_layout.columns.len(), + "column count mismatch" + ); + + let iter = row_group + .columns() + .iter() + .zip(&row_group_layout.columns) + .enumerate(); + + for (idx, (column, column_layout)) in iter { + let page_reader = SerializedPageReader::new( + Arc::new(file_reader.clone()), + column, + row_group.num_rows() as usize, + None, + ) + .unwrap(); + + let pages = page_reader.collect::, _>>().unwrap(); + assert_eq!( + pages.len(), + column_layout.pages.len() + + column_layout.dictionary_page.is_some() as usize, + "page {} count mismatch", + idx + ); + + let page_layouts = column_layout + .dictionary_page + .iter() + .chain(&column_layout.pages); + + for (page, page_layout) in pages.iter().zip(page_layouts) { + assert_eq!(page.encoding(), page_layout.encoding); + assert_eq!( + page.buffer().len(), + 
page_layout.compressed_size, + "page {} size mismatch", + idx + ); + assert_eq!(page.page_type(), page_layout.page_type); + } + } + } +} + +#[test] +fn test_primitive() { + let array = Arc::new(Int32Array::from_iter_values(0..2000)) as _; + let batch = RecordBatch::try_from_iter([("col", array)]).unwrap(); + let props = WriterProperties::builder() + .set_dictionary_enabled(false) + .set_data_pagesize_limit(1000) + .set_write_batch_size(10) + .build(); + + // Test spill plain encoding pages + do_test(LayoutTest { + props, + batches: vec![batch.clone()], + layout: Layout { + row_groups: vec![RowGroup { + columns: vec![ColumnChunk { + pages: (0..8) + .map(|_| Page { + rows: 250, + page_header_size: 34, + compressed_size: 1000, + encoding: Encoding::PLAIN, + page_type: PageType::DATA_PAGE, + }) + .collect(), + dictionary_page: None, + }], + }], + }, + }); + + // Test spill dictionary + let props = WriterProperties::builder() + .set_dictionary_enabled(true) + .set_dictionary_pagesize_limit(1000) + .set_data_pagesize_limit(10000) + .set_write_batch_size(10) + .build(); + + do_test(LayoutTest { + props, + batches: vec![batch.clone()], + layout: Layout { + row_groups: vec![RowGroup { + columns: vec![ColumnChunk { + pages: vec![ + Page { + rows: 250, + page_header_size: 34, + compressed_size: 258, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + Page { + rows: 1750, + page_header_size: 34, + compressed_size: 7000, + encoding: Encoding::PLAIN, + page_type: PageType::DATA_PAGE, + }, + ], + dictionary_page: Some(Page { + rows: 250, + page_header_size: 34, + compressed_size: 1000, + encoding: Encoding::PLAIN, + page_type: PageType::DICTIONARY_PAGE, + }), + }], + }], + }, + }); + + // Test spill dictionary encoded pages + let props = WriterProperties::builder() + .set_dictionary_enabled(true) + .set_dictionary_pagesize_limit(10000) + .set_data_pagesize_limit(500) + .set_write_batch_size(10) + .build(); + + do_test(LayoutTest { + props, + batches: vec![batch], + layout: Layout { + row_groups: vec![RowGroup { + columns: vec![ColumnChunk { + pages: vec![ + Page { + rows: 400, + page_header_size: 34, + compressed_size: 452, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + Page { + rows: 370, + page_header_size: 34, + compressed_size: 472, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + Page { + rows: 330, + page_header_size: 34, + compressed_size: 464, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + Page { + rows: 330, + page_header_size: 34, + compressed_size: 464, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + Page { + rows: 330, + page_header_size: 34, + compressed_size: 464, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + Page { + rows: 240, + page_header_size: 34, + compressed_size: 332, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + ], + dictionary_page: Some(Page { + rows: 2000, + page_header_size: 34, + compressed_size: 8000, + encoding: Encoding::PLAIN, + page_type: PageType::DICTIONARY_PAGE, + }), + }], + }], + }, + }); +} + +#[test] +fn test_string() { + let array = Arc::new(StringArray::from_iter_values( + (0..2000).map(|x| format!("{:04}", x)), + )) as _; + let batch = RecordBatch::try_from_iter([("col", array)]).unwrap(); + let props = WriterProperties::builder() + .set_dictionary_enabled(false) + .set_data_pagesize_limit(1000) + .set_write_batch_size(10) + .build(); + + // Test 
spill plain encoding pages + do_test(LayoutTest { + props, + batches: vec![batch.clone()], + layout: Layout { + row_groups: vec![RowGroup { + columns: vec![ColumnChunk { + pages: (0..15) + .map(|_| Page { + rows: 130, + page_header_size: 34, + compressed_size: 1040, + encoding: Encoding::PLAIN, + page_type: PageType::DATA_PAGE, + }) + .chain(std::iter::once(Page { + rows: 50, + page_header_size: 33, + compressed_size: 400, + encoding: Encoding::PLAIN, + page_type: PageType::DATA_PAGE, + })) + .collect(), + dictionary_page: None, + }], + }], + }, + }); + + // Test spill dictionary + let props = WriterProperties::builder() + .set_dictionary_enabled(true) + .set_dictionary_pagesize_limit(1000) + .set_data_pagesize_limit(10000) + .set_write_batch_size(10) + .build(); + + do_test(LayoutTest { + props, + batches: vec![batch.clone()], + layout: Layout { + row_groups: vec![RowGroup { + columns: vec![ColumnChunk { + pages: vec![ + Page { + rows: 130, + page_header_size: 34, + compressed_size: 138, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + Page { + rows: 1250, + page_header_size: 36, + compressed_size: 10000, + encoding: Encoding::PLAIN, + page_type: PageType::DATA_PAGE, + }, + Page { + rows: 620, + page_header_size: 34, + compressed_size: 4960, + encoding: Encoding::PLAIN, + page_type: PageType::DATA_PAGE, + }, + ], + dictionary_page: Some(Page { + rows: 130, + page_header_size: 34, + compressed_size: 1040, + encoding: Encoding::PLAIN, + page_type: PageType::DICTIONARY_PAGE, + }), + }], + }], + }, + }); + + // Test spill dictionary encoded pages + let props = WriterProperties::builder() + .set_dictionary_enabled(true) + .set_dictionary_pagesize_limit(20000) + .set_data_pagesize_limit(500) + .set_write_batch_size(10) + .build(); + + do_test(LayoutTest { + props, + batches: vec![batch], + layout: Layout { + row_groups: vec![RowGroup { + columns: vec![ColumnChunk { + pages: vec![ + Page { + rows: 400, + page_header_size: 34, + compressed_size: 452, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + Page { + rows: 370, + page_header_size: 34, + compressed_size: 472, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + Page { + rows: 330, + page_header_size: 34, + compressed_size: 464, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + Page { + rows: 330, + page_header_size: 34, + compressed_size: 464, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + Page { + rows: 330, + page_header_size: 34, + compressed_size: 464, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + Page { + rows: 240, + page_header_size: 34, + compressed_size: 332, + encoding: Encoding::RLE_DICTIONARY, + page_type: PageType::DATA_PAGE, + }, + ], + dictionary_page: Some(Page { + rows: 2000, + page_header_size: 34, + compressed_size: 16000, + encoding: Encoding::PLAIN, + page_type: PageType::DICTIONARY_PAGE, + }), + }], + }], + }, + }); +} From d9fd1d5373a1b19d806d9faeeca36796bd517e4d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 25 Oct 2022 09:21:09 +1300 Subject: [PATCH 0158/1411] Add timezone abstraction (#2909) * Add timezone abstraction * Deprecate instead of remove * Disallow missing colon in timezone * RAT --- arrow-array/Cargo.toml | 1 + arrow-array/src/array/primitive_array.rs | 17 -- arrow-array/src/lib.rs | 1 + arrow-array/src/temporal_conversions.rs | 12 +- arrow-array/src/timezone.rs | 325 
+++++++++++++++++++++++ arrow/Cargo.toml | 2 +- arrow/src/compute/kernels/cast.rs | 69 ++--- arrow/src/compute/kernels/temporal.rs | 251 +++++------------ arrow/src/csv/writer.rs | 67 +---- 9 files changed, 437 insertions(+), 308 deletions(-) create mode 100644 arrow-array/src/timezone.rs diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index accc1d3e69eb..45765d99fcb7 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -49,6 +49,7 @@ arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" } arrow-schema = { version = "25.0.0", path = "../arrow-schema" } arrow-data = { version = "25.0.0", path = "../arrow-data" } chrono = { version = "0.4", default-features = false, features = ["clock"] } +chrono-tz = { version = "0.7", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.0", default-features = false } hashbrown = { version = "0.12", default-features = false } diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 5a9ffd34cdb4..d979d0d93e67 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1483,23 +1483,6 @@ mod tests { assert_eq!(array1, array2); } - #[cfg(feature = "chrono-tz")] - #[test] - fn test_with_timezone() { - use crate::compute::hour; - let a: TimestampMicrosecondArray = vec![37800000000, 86339000000].into(); - - let b = hour(&a).unwrap(); - assert_eq!(10, b.value(0)); - assert_eq!(23, b.value(1)); - - let a = a.with_timezone(String::from("America/Los_Angeles")); - - let b = hour(&a).unwrap(); - assert_eq!(2, b.value(0)); - assert_eq!(15, b.value(1)); - } - #[test] #[should_panic( expected = "Trying to access an element at index 4 from a PrimitiveArray of length 3" diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index cc963925d653..4f015793dda8 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -170,6 +170,7 @@ mod delta; pub mod iterator; mod raw_pointer; pub mod temporal_conversions; +pub mod timezone; mod trusted_len; pub mod types; diff --git a/arrow-array/src/temporal_conversions.rs b/arrow-array/src/temporal_conversions.rs index 9aae83c8ad69..8b1064115dbb 100644 --- a/arrow-array/src/temporal_conversions.rs +++ b/arrow-array/src/temporal_conversions.rs @@ -17,9 +17,10 @@ //! Conversion methods for dates and times. +use crate::timezone::Tz; use crate::ArrowPrimitiveType; use arrow_schema::{DataType, TimeUnit}; -use chrono::{Duration, NaiveDate, NaiveDateTime, NaiveTime}; +use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc}; /// Number of seconds in a day pub const SECONDS_IN_DAY: i64 = 86_400; @@ -187,6 +188,15 @@ pub fn as_datetime(v: i64) -> Option { } } +/// Converts an [`ArrowPrimitiveType`] to [`DateTime`] +pub fn as_datetime_with_timezone( + v: i64, + tz: Tz, +) -> Option> { + let naive = as_datetime::(v)?; + Some(Utc.from_utc_datetime(&naive).with_timezone(&tz)) +} + /// Converts an [`ArrowPrimitiveType`] to [`NaiveDate`] pub fn as_date(v: i64) -> Option { as_datetime::(v).map(|datetime| datetime.date()) diff --git a/arrow-array/src/timezone.rs b/arrow-array/src/timezone.rs new file mode 100644 index 000000000000..4e60c0c422b6 --- /dev/null +++ b/arrow-array/src/timezone.rs @@ -0,0 +1,325 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Timezone for timestamp arrays + +use arrow_schema::ArrowError; +use chrono::format::{parse, Parsed, StrftimeItems}; +use chrono::FixedOffset; +pub use private::{Tz, TzOffset}; + +/// Parses a fixed offset of the form "+09:00" +fn parse_fixed_offset(tz: &str) -> Result { + if tz.len() != 6 { + return Err(ArrowError::ParseError(format!( + "Invalid timezone \"{}\": Expected format [+-]XX:XX", + tz + ))); + } + + let mut parsed = Parsed::new(); + parse(&mut parsed, tz, StrftimeItems::new("%:z")) + .and_then(|_| parsed.to_fixed_offset()) + .map_err(|e| { + ArrowError::ParseError(format!("Invalid timezone \"{}\": {}", tz, e)) + }) +} + +#[cfg(feature = "chrono-tz")] +mod private { + use super::*; + use chrono::offset::TimeZone; + use chrono::{LocalResult, NaiveDate, NaiveDateTime, Offset}; + use std::str::FromStr; + + /// An [`Offset`] for [`Tz`] + #[derive(Debug, Copy, Clone)] + pub struct TzOffset { + tz: Tz, + offset: FixedOffset, + } + + impl std::fmt::Display for TzOffset { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.offset.fmt(f) + } + } + + impl Offset for TzOffset { + fn fix(&self) -> FixedOffset { + self.offset + } + } + + /// An Arrow [`TimeZone`] + #[derive(Debug, Copy, Clone)] + pub struct Tz(TzInner); + + #[derive(Debug, Copy, Clone)] + enum TzInner { + Timezone(chrono_tz::Tz), + Offset(FixedOffset), + } + + impl FromStr for Tz { + type Err = ArrowError; + + fn from_str(tz: &str) -> Result { + if tz.starts_with('+') || tz.starts_with('-') { + Ok(Self(TzInner::Offset(parse_fixed_offset(tz)?))) + } else { + Ok(Self(TzInner::Timezone(tz.parse().map_err(|e| { + ArrowError::ParseError(format!("Invalid timezone \"{}\": {}", tz, e)) + })?))) + } + } + } + + macro_rules! 
tz { + ($s:ident, $tz:ident, $b:block) => { + match $s.0 { + TzInner::Timezone($tz) => $b, + TzInner::Offset($tz) => $b, + } + }; + } + + impl TimeZone for Tz { + type Offset = TzOffset; + + fn from_offset(offset: &Self::Offset) -> Self { + offset.tz + } + + fn offset_from_local_date(&self, local: &NaiveDate) -> LocalResult { + tz!(self, tz, { + tz.offset_from_local_date(local).map(|x| TzOffset { + tz: *self, + offset: x.fix(), + }) + }) + } + + fn offset_from_local_datetime( + &self, + local: &NaiveDateTime, + ) -> LocalResult { + tz!(self, tz, { + tz.offset_from_local_datetime(local).map(|x| TzOffset { + tz: *self, + offset: x.fix(), + }) + }) + } + + fn offset_from_utc_date(&self, utc: &NaiveDate) -> Self::Offset { + tz!(self, tz, { + TzOffset { + tz: *self, + offset: tz.offset_from_utc_date(utc).fix(), + } + }) + } + + fn offset_from_utc_datetime(&self, utc: &NaiveDateTime) -> Self::Offset { + tz!(self, tz, { + TzOffset { + tz: *self, + offset: tz.offset_from_utc_datetime(utc).fix(), + } + }) + } + } + + #[cfg(test)] + mod tests { + use super::*; + use chrono::{Timelike, Utc}; + + #[test] + fn test_with_timezone() { + let vals = [ + Utc.timestamp_millis(37800000), + Utc.timestamp_millis(86339000), + ]; + + assert_eq!(10, vals[0].hour()); + assert_eq!(23, vals[1].hour()); + + let tz: Tz = "America/Los_Angeles".parse().unwrap(); + + assert_eq!(2, vals[0].with_timezone(&tz).hour()); + assert_eq!(15, vals[1].with_timezone(&tz).hour()); + } + + #[test] + fn test_using_chrono_tz_and_utc_naive_date_time() { + let sydney_tz = "Australia/Sydney".to_string(); + let tz: Tz = sydney_tz.parse().unwrap(); + let sydney_offset_without_dst = FixedOffset::east(10 * 60 * 60); + let sydney_offset_with_dst = FixedOffset::east(11 * 60 * 60); + // Daylight savings ends + // When local daylight time was about to reach + // Sunday, 4 April 2021, 3:00:00 am clocks were turned backward 1 hour to + // Sunday, 4 April 2021, 2:00:00 am local standard time instead. + + // Daylight savings starts + // When local standard time was about to reach + // Sunday, 3 October 2021, 2:00:00 am clocks were turned forward 1 hour to + // Sunday, 3 October 2021, 3:00:00 am local daylight time instead. 
+ + // Sydney 2021-04-04T02:30:00+11:00 is 2021-04-03T15:30:00Z + let utc_just_before_sydney_dst_ends = + NaiveDate::from_ymd(2021, 4, 3).and_hms_nano(15, 30, 0, 0); + assert_eq!( + tz.offset_from_utc_datetime(&utc_just_before_sydney_dst_ends) + .fix(), + sydney_offset_with_dst + ); + // Sydney 2021-04-04T02:30:00+10:00 is 2021-04-03T16:30:00Z + let utc_just_after_sydney_dst_ends = + NaiveDate::from_ymd(2021, 4, 3).and_hms_nano(16, 30, 0, 0); + assert_eq!( + tz.offset_from_utc_datetime(&utc_just_after_sydney_dst_ends) + .fix(), + sydney_offset_without_dst + ); + // Sydney 2021-10-03T01:30:00+10:00 is 2021-10-02T15:30:00Z + let utc_just_before_sydney_dst_starts = + NaiveDate::from_ymd(2021, 10, 2).and_hms_nano(15, 30, 0, 0); + assert_eq!( + tz.offset_from_utc_datetime(&utc_just_before_sydney_dst_starts) + .fix(), + sydney_offset_without_dst + ); + // Sydney 2021-04-04T03:30:00+11:00 is 2021-10-02T16:30:00Z + let utc_just_after_sydney_dst_starts = + NaiveDate::from_ymd(2022, 10, 2).and_hms_nano(16, 30, 0, 0); + assert_eq!( + tz.offset_from_utc_datetime(&utc_just_after_sydney_dst_starts) + .fix(), + sydney_offset_with_dst + ); + } + } +} + +#[cfg(not(feature = "chrono-tz"))] +mod private { + use super::*; + use chrono::offset::TimeZone; + use chrono::{FixedOffset, LocalResult, NaiveDate, NaiveDateTime, Offset}; + use std::str::FromStr; + + /// An [`Offset`] for [`Tz`] + #[derive(Debug, Copy, Clone)] + pub struct TzOffset(FixedOffset); + + impl std::fmt::Display for TzOffset { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } + } + + impl Offset for TzOffset { + fn fix(&self) -> FixedOffset { + self.0 + } + } + + /// An Arrow [`TimeZone`] + #[derive(Debug, Copy, Clone)] + pub struct Tz(FixedOffset); + + impl FromStr for Tz { + type Err = ArrowError; + + fn from_str(tz: &str) -> Result { + if tz.starts_with('+') || tz.starts_with('-') { + Ok(Self(parse_fixed_offset(tz)?)) + } else { + Err(ArrowError::ParseError(format!( + "Invalid timezone \"{}\": only offset based timezones supported without chrono-tz feature", + tz + ))) + } + } + } + + impl TimeZone for Tz { + type Offset = TzOffset; + + fn from_offset(offset: &Self::Offset) -> Self { + Self(offset.0) + } + + fn offset_from_local_date(&self, local: &NaiveDate) -> LocalResult { + self.0.offset_from_local_date(local).map(TzOffset) + } + + fn offset_from_local_datetime( + &self, + local: &NaiveDateTime, + ) -> LocalResult { + self.0.offset_from_local_datetime(local).map(TzOffset) + } + + fn offset_from_utc_date(&self, utc: &NaiveDate) -> Self::Offset { + TzOffset(self.0.offset_from_utc_date(utc).fix()) + } + + fn offset_from_utc_datetime(&self, utc: &NaiveDateTime) -> Self::Offset { + TzOffset(self.0.offset_from_utc_datetime(utc).fix()) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use chrono::{NaiveDate, Offset, TimeZone}; + + #[test] + fn test_with_offset() { + let t = NaiveDate::from_ymd(2000, 1, 1); + + let tz: Tz = "-00:00".parse().unwrap(); + assert_eq!(tz.offset_from_utc_date(&t).fix().local_minus_utc(), 0); + let tz: Tz = "+00:00".parse().unwrap(); + assert_eq!(tz.offset_from_utc_date(&t).fix().local_minus_utc(), 0); + + let tz: Tz = "-10:00".parse().unwrap(); + assert_eq!( + tz.offset_from_utc_date(&t).fix().local_minus_utc(), + -10 * 60 * 60 + ); + let tz: Tz = "+09:00".parse().unwrap(); + assert_eq!( + tz.offset_from_utc_date(&t).fix().local_minus_utc(), + 9 * 60 * 60 + ); + + let err = "+9:00".parse::().unwrap_err().to_string(); + assert!(err.contains("Invalid timezone"), "{}", 
err); + + let err = "+09".parse::().unwrap_err().to_string(); + assert!(err.contains("Invalid timezone"), "{}", err); + + let err = "+0900".parse::().unwrap_err().to_string(); + assert!(err.contains("Invalid timezone"), "{}", err); + } +} diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 7a933360c0c3..4a1668cc0fdd 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -62,7 +62,6 @@ lazy_static = { version = "1.4", default-features = false } lz4 = { version = "1.23", default-features = false, optional = true } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } chrono = { version = "0.4", default-features = false, features = ["clock"] } -chrono-tz = { version = "0.7", default-features = false, optional = true } flatbuffers = { version = "22.9.2", default-features = false, features = ["thiserror"], optional = true } comfy-table = { version = "6.0", optional = true, default-features = false } pyo3 = { version = "0.17", default-features = false, optional = true } @@ -100,6 +99,7 @@ dyn_cmp_dict = [] # Enable dyn-arithmetic kernels for dictionary arrays # Note: this does not impact arithmetic with scalars dyn_arith_dict = [] +chrono-tz = ["arrow-array/chrono-tz"] [dev-dependencies] rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 66a04e91ed30..090195c16b81 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -35,17 +35,14 @@ //! assert_eq!(7.0, c.value(2)); //! ``` -use chrono::format::strftime::StrftimeItems; -use chrono::format::{parse, Parsed}; -use chrono::{NaiveDateTime, Timelike}; +use chrono::{DateTime, NaiveDateTime, Timelike}; use std::str; use std::sync::Arc; use crate::buffer::MutableBuffer; use crate::compute::kernels::cast_utils::string_to_timestamp_nanos; -use crate::compute::kernels::temporal::return_compute_error_with; +use crate::compute::try_unary; use crate::compute::{divide_scalar, multiply_scalar}; -use crate::compute::{try_unary, using_chrono_tz_and_utc_naive_date_time}; use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::temporal_conversions::{ @@ -57,6 +54,8 @@ use crate::{ buffer::Buffer, util::display::array_value_to_string, util::serialization::lexical_to_string, }; +use arrow_array::temporal_conversions::as_datetime_with_timezone; +use arrow_array::timezone::Tz; use arrow_buffer::i256; use num::cast::AsPrimitive; use num::{NumCast, ToPrimitive}; @@ -1669,7 +1668,7 @@ where Arc::new(builder.finish()) } -fn extract_component_from_datatime_array< +fn extract_component_from_datetime_array< A: ArrayAccessor, OffsetSize, T: ArrowTemporalType, @@ -1678,55 +1677,25 @@ fn extract_component_from_datatime_array< iter: ArrayIter, mut builder: GenericStringBuilder, tz: &str, - mut parsed: Parsed, op: F, ) -> Result where OffsetSize: OffsetSizeTrait, - F: Fn(NaiveDateTime) -> String, + F: Fn(DateTime) -> String, i64: From, { - if (tz.starts_with('+') || tz.starts_with('-')) && !tz.contains(':') { - return_compute_error_with!( - "Invalid timezone", - "Expected format [+-]XX:XX".to_string() - ) - } else { - let tz_parse_result = parse(&mut parsed, tz, StrftimeItems::new("%z")); - let fixed_offset_from_parsed = match tz_parse_result { - Ok(_) => match parsed.to_fixed_offset() { - Ok(fo) => Some(fo), - err => return_compute_error_with!("Invalid timezone", err), - }, - _ => None, - }; - - for value in iter { - if let Some(value) = value { - match 
as_datetime::(>::from(value)) { - Some(utc) => { - let fixed_offset = match fixed_offset_from_parsed { - Some(fo) => fo, - None => { - match using_chrono_tz_and_utc_naive_date_time(tz, utc) { - Some(fo) => fo, - err => return_compute_error_with!( - "Unable to parse timezone", - err - ), - } - } - }; - builder.append_value(op(utc + fixed_offset)); - } - err => return_compute_error_with!( - "Unable to read value as datetime", - err - ), + let tz: Tz = tz.parse()?; + for value in iter { + match value { + Some(value) => match as_datetime_with_timezone::(value.into(), tz) { + Some(time) => builder.append_value(op(time)), + _ => { + return Err(ArrowError::ComputeError( + "Unable to read value as datetime".to_string(), + )) } - } else { - builder.append_null(); - } + }, + None => builder.append_null(), } } Ok(Arc::new(builder.finish())) @@ -1747,16 +1716,14 @@ where let builder = GenericStringBuilder::::new(); if let Some(tz) = tz { - let scratch = Parsed::new(); // The macro calls `as_datetime` on timestamp values of the array. // After applying timezone offset on the datatime, calling `to_string` to get // the strings. let iter = ArrayIter::new(array); - extract_component_from_datatime_array::<_, OffsetSize, T, _>( + extract_component_from_datetime_array::<_, OffsetSize, T, _>( iter, builder, tz, - scratch, |t| t.to_string(), ) } else { diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index 220b7dadcc56..54799a32630e 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -17,16 +17,16 @@ //! Defines temporal kernels for time and date related functions. -use chrono::{Datelike, NaiveDateTime, NaiveTime, Timelike}; +use chrono::{DateTime, Datelike, NaiveDateTime, NaiveTime, Offset, Timelike}; use crate::array::*; use crate::datatypes::*; use crate::error::{ArrowError, Result}; -use arrow_array::temporal_conversions::{as_datetime, as_time}; +use arrow_array::temporal_conversions::{ + as_datetime, as_datetime_with_timezone, as_time, +}; -use chrono::format::strftime::StrftimeItems; -use chrono::format::{parse, Parsed}; -use chrono::FixedOffset; +use arrow_array::timezone::Tz; /// This function takes an `ArrayIter` of input array and an extractor `op` which takes /// an input `NaiveTime` and returns time component (e.g. hour) as `i32` value. @@ -87,7 +87,7 @@ where /// object used to parse timezone string. `op` is the extractor closure which takes /// data time object of `NaiveDateTime` type and returns `i32` value of extracted /// component. 
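// Editor's sketch (not part of the patch): at the kernel level the rewrite below means
// the timezone string is parsed into `Tz` once and then applied before the component is
// extracted. A minimal illustration, mirroring the existing `hour` test for a
// fixed-offset zone:
#[cfg(test)]
fn editor_example_hour_with_offset_timezone() {
    use arrow_array::TimestampSecondArray;
    // 10:00:00 UTC viewed at +01:00 is 11:00:00.
    let a = TimestampSecondArray::from_vec(vec![60 * 60 * 10], Some("+01:00".to_string()));
    assert_eq!(11, hour(&a).unwrap().value(0));
}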
-fn extract_component_from_datatime_array< +fn extract_component_from_datetime_array< A: ArrayAccessor, T: ArrowTemporalType, F, @@ -95,54 +95,24 @@ fn extract_component_from_datatime_array< iter: ArrayIter, mut builder: PrimitiveBuilder, tz: &str, - mut parsed: Parsed, op: F, ) -> Result where - F: Fn(NaiveDateTime) -> i32, + F: Fn(DateTime) -> i32, i64: From, { - if (tz.starts_with('+') || tz.starts_with('-')) && !tz.contains(':') { - return_compute_error_with!( - "Invalid timezone", - "Expected format [+-]XX:XX".to_string() - ) - } else { - let tz_parse_result = parse(&mut parsed, tz, StrftimeItems::new("%z")); - let fixed_offset_from_parsed = match tz_parse_result { - Ok(_) => match parsed.to_fixed_offset() { - Ok(fo) => Some(fo), - err => return_compute_error_with!("Invalid timezone", err), - }, - _ => None, - }; - - for value in iter { - if let Some(value) = value { - match as_datetime::(i64::from(value)) { - Some(utc) => { - let fixed_offset = match fixed_offset_from_parsed { - Some(fo) => fo, - None => { - match using_chrono_tz_and_utc_naive_date_time(tz, utc) { - Some(fo) => fo, - err => return_compute_error_with!( - "Unable to parse timezone", - err - ), - } - } - }; - builder.append_value(op(utc + fixed_offset)); - } - err => return_compute_error_with!( - "Unable to read value as datetime", - err - ), + let tz: Tz = tz.parse()?; + for value in iter { + match value { + Some(value) => match as_datetime_with_timezone::(value.into(), tz) { + Some(time) => builder.append_value(op(time)), + _ => { + return Err(ArrowError::ComputeError( + "Unable to read value as datetime".to_string(), + )) } - } else { - builder.append_null(); - } + }, + None => builder.append_null(), } } Ok(builder.finish()) @@ -189,27 +159,18 @@ impl ChronoDateExt for T { } } -#[cfg(not(feature = "chrono-tz"))] -pub fn using_chrono_tz_and_utc_naive_date_time( - _tz: &str, - _utc: chrono::NaiveDateTime, -) -> Option { - None -} - /// Parse the given string into a string representing fixed-offset that is correct as of the given /// UTC NaiveDateTime. /// Note that the offset is function of time and can vary depending on whether daylight savings is /// in effect or not. e.g. Australia/Sydney is +10:00 or +11:00 depending on DST. 
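// Editor's sketch (not part of the patch): the helper deprecated just below points to
// `arrow_array::timezone::Tz` as its replacement. A minimal use of that type, assuming
// the `chrono-tz` feature so an IANA name parses; without it only fixed offsets such as
// "+10:00" are accepted.
#[cfg(all(test, feature = "chrono-tz"))]
fn editor_example_offset_via_tz() {
    use arrow_array::timezone::Tz;
    use chrono::{NaiveDate, Offset, TimeZone};

    let tz: Tz = "Australia/Sydney".parse().unwrap();
    // 2021-10-02T16:30:00Z is just after Sydney's 2021 DST start, hence +11:00.
    let utc = NaiveDate::from_ymd(2021, 10, 2).and_hms(16, 30, 0);
    let offset = tz.offset_from_utc_datetime(&utc).fix();
    assert_eq!(11 * 60 * 60, offset.local_minus_utc());
}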
-#[cfg(feature = "chrono-tz")] +#[deprecated(note = "Use arrow_array::timezone::Tz instead")] pub fn using_chrono_tz_and_utc_naive_date_time( tz: &str, - utc: chrono::NaiveDateTime, -) -> Option { - use chrono::{Offset, TimeZone}; - tz.parse::() - .map(|tz| tz.offset_from_utc_datetime(&utc).fix()) - .ok() + utc: NaiveDateTime, +) -> Option { + use chrono::TimeZone; + let tz: Tz = tz.parse().ok()?; + Some(tz.offset_from_utc_datetime(&utc).fix()) } /// Extracts the hours of a given temporal primitive array as an array of integers within @@ -217,7 +178,7 @@ pub fn using_chrono_tz_and_utc_naive_date_time( pub fn hour(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { hour_generic::(array) } @@ -227,7 +188,7 @@ where pub fn hour_generic>(array: A) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { match array.data_type().clone() { DataType::Dictionary(_, value_type) => { @@ -244,7 +205,7 @@ fn hour_internal>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { let b = Int32Builder::with_capacity(array.len()); match dt { @@ -257,9 +218,8 @@ where Ok(as_datetime_with_op::(iter, b, |t| t.hour() as i32)) } DataType::Timestamp(_, Some(tz)) => { - let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + extract_component_from_datetime_array::(iter, b, tz, |t| { t.hour() as i32 }) } @@ -271,7 +231,7 @@ where pub fn year(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { year_generic::(array) } @@ -280,7 +240,7 @@ where pub fn year_generic>(array: A) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { match array.data_type().clone() { DataType::Dictionary(_, value_type) => { @@ -297,7 +257,7 @@ fn year_internal>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { match dt { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _) => { @@ -314,7 +274,7 @@ where pub fn quarter(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { quarter_generic::(array) } @@ -326,7 +286,7 @@ pub fn quarter_generic>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { match array.data_type().clone() { DataType::Dictionary(_, value_type) => { @@ -343,7 +303,7 @@ fn quarter_internal>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { let b = Int32Builder::with_capacity(array.len()); match dt { @@ -354,9 +314,8 @@ where })) } DataType::Timestamp(_, Some(tz)) => { - let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + extract_component_from_datetime_array::(iter, b, tz, |t| { t.quarter() as i32 }) } @@ -369,7 +328,7 @@ where pub fn month(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { month_generic::(array) } @@ -380,7 +339,7 @@ pub fn month_generic>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { match array.data_type().clone() { DataType::Dictionary(_, value_type) => { @@ -397,7 +356,7 @@ fn month_internal>( ) -> Result where T: 
ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { let b = Int32Builder::with_capacity(array.len()); match dt { @@ -408,9 +367,8 @@ where })) } DataType::Timestamp(_, Some(tz)) => { - let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + extract_component_from_datetime_array::(iter, b, tz, |t| { t.month() as i32 }) } @@ -427,7 +385,7 @@ where pub fn num_days_from_monday(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { num_days_from_monday_generic::(array) } @@ -443,7 +401,7 @@ pub fn num_days_from_monday_generic>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { match array.data_type().clone() { DataType::Dictionary(_, value_type) => { @@ -465,7 +423,7 @@ fn num_days_from_monday_internal>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { let b = Int32Builder::with_capacity(array.len()); match dt { @@ -476,9 +434,8 @@ where })) } DataType::Timestamp(_, Some(tz)) => { - let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + extract_component_from_datetime_array::(iter, b, tz, |t| { t.num_days_from_monday() }) } @@ -495,7 +452,7 @@ where pub fn num_days_from_sunday(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { num_days_from_sunday_generic::(array) } @@ -511,7 +468,7 @@ pub fn num_days_from_sunday_generic>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { match array.data_type().clone() { DataType::Dictionary(_, value_type) => { @@ -533,7 +490,7 @@ fn num_days_from_sunday_internal>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { let b = Int32Builder::with_capacity(array.len()); match dt { @@ -544,9 +501,8 @@ where })) } DataType::Timestamp(_, Some(tz)) => { - let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + extract_component_from_datetime_array::(iter, b, tz, |t| { t.num_days_from_sunday() }) } @@ -561,7 +517,7 @@ where pub fn day(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { day_generic::(array) } @@ -570,7 +526,7 @@ where pub fn day_generic>(array: A) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { match array.data_type().clone() { DataType::Dictionary(_, value_type) => { @@ -587,7 +543,7 @@ fn day_internal>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { let b = Int32Builder::with_capacity(array.len()); match dt { @@ -596,9 +552,8 @@ where Ok(as_datetime_with_op::(iter, b, |t| t.day() as i32)) } DataType::Timestamp(_, Some(ref tz)) => { - let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + extract_component_from_datetime_array::(iter, b, tz, |t| { t.day() as i32 }) } @@ -611,7 +566,7 @@ where pub fn doy(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { doy_generic::(array) } @@ -621,7 +576,7 @@ where pub fn doy_generic>(array: A) -> 
Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { match array.data_type().clone() { DataType::Dictionary(_, value_type) => { @@ -640,7 +595,7 @@ fn doy_internal>( where T: ArrowTemporalType + ArrowNumericType, T::Native: ArrowNativeType, - i64: std::convert::From, + i64: From, { let b = Int32Builder::with_capacity(array.len()); match dt { @@ -651,9 +606,8 @@ where })) } DataType::Timestamp(_, Some(ref tz)) => { - let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + extract_component_from_datetime_array::(iter, b, tz, |t| { t.ordinal() as i32 }) } @@ -665,7 +619,7 @@ where pub fn minute(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { minute_generic::(array) } @@ -676,7 +630,7 @@ pub fn minute_generic>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { match array.data_type().clone() { DataType::Dictionary(_, value_type) => { @@ -693,7 +647,7 @@ fn minute_internal>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { let b = Int32Builder::with_capacity(array.len()); match dt { @@ -704,9 +658,8 @@ where })) } DataType::Timestamp(_, Some(tz)) => { - let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + extract_component_from_datetime_array::(iter, b, tz, |t| { t.minute() as i32 }) } @@ -718,7 +671,7 @@ where pub fn week(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { week_generic::(array) } @@ -727,7 +680,7 @@ where pub fn week_generic>(array: A) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { match array.data_type().clone() { DataType::Dictionary(_, value_type) => { @@ -744,7 +697,7 @@ fn week_internal>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { match dt { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { @@ -762,7 +715,7 @@ where pub fn second(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { second_generic::(array) } @@ -773,7 +726,7 @@ pub fn second_generic>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { match array.data_type().clone() { DataType::Dictionary(_, value_type) => { @@ -790,7 +743,7 @@ fn second_internal>( ) -> Result where T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, + i64: From, { let b = Int32Builder::with_capacity(array.len()); match dt { @@ -801,9 +754,8 @@ where })) } DataType::Timestamp(_, Some(tz)) => { - let scratch = Parsed::new(); let iter = ArrayIter::new(array); - extract_component_from_datatime_array::(iter, b, tz, scratch, |t| { + extract_component_from_datetime_array::(iter, b, tz, |t| { t.second() as i32 }) } @@ -814,8 +766,6 @@ where #[cfg(test)] mod tests { use super::*; - #[cfg(feature = "chrono-tz")] - use chrono::NaiveDate; #[test] fn test_temporal_array_date64_hour() { @@ -1180,21 +1130,24 @@ mod tests { fn test_temporal_array_timestamp_hour_with_timezone_without_colon() { let a = TimestampSecondArray::from_vec(vec![60 * 60 * 10], Some("+0100".to_string())); - assert!(matches!(hour(&a), Err(ArrowError::ComputeError(_)))) + let 
err = hour(&a).unwrap_err().to_string(); + assert!(err.contains("Invalid timezone"), "{}", err); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_without_initial_sign() { let a = TimestampSecondArray::from_vec(vec![60 * 60 * 10], Some("0100".to_string())); - assert!(matches!(hour(&a), Err(ArrowError::ComputeError(_)))) + let err = hour(&a).unwrap_err().to_string(); + assert!(err.contains("Invalid timezone"), "{}", err); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_with_only_colon() { let a = TimestampSecondArray::from_vec(vec![60 * 60 * 10], Some("01:00".to_string())); - assert!(matches!(hour(&a), Err(ArrowError::ComputeError(_)))) + let err = hour(&a).unwrap_err().to_string(); + assert!(err.contains("Invalid timezone"), "{}", err); } #[cfg(feature = "chrono-tz")] @@ -1231,65 +1184,7 @@ mod tests { vec![60 * 60 * 10], Some("Asia/Kolkatta".to_string()), ); - assert!(matches!(hour(&a), Err(ArrowError::ComputeError(_)))) - } - - #[cfg(feature = "chrono-tz")] - #[test] - fn test_using_chrono_tz_and_utc_naive_date_time() { - let sydney_tz = "Australia/Sydney".to_string(); - let sydney_offset_without_dst = FixedOffset::east(10 * 60 * 60); - let sydney_offset_with_dst = FixedOffset::east(11 * 60 * 60); - // Daylight savings ends - // When local daylight time was about to reach - // Sunday, 4 April 2021, 3:00:00 am clocks were turned backward 1 hour to - // Sunday, 4 April 2021, 2:00:00 am local standard time instead. - - // Daylight savings starts - // When local standard time was about to reach - // Sunday, 3 October 2021, 2:00:00 am clocks were turned forward 1 hour to - // Sunday, 3 October 2021, 3:00:00 am local daylight time instead. - - // Sydney 2021-04-04T02:30:00+11:00 is 2021-04-03T15:30:00Z - let utc_just_before_sydney_dst_ends = - NaiveDate::from_ymd(2021, 4, 3).and_hms_nano(15, 30, 0, 0); - assert_eq!( - using_chrono_tz_and_utc_naive_date_time( - &sydney_tz, - utc_just_before_sydney_dst_ends - ), - Some(sydney_offset_with_dst) - ); - // Sydney 2021-04-04T02:30:00+10:00 is 2021-04-03T16:30:00Z - let utc_just_after_sydney_dst_ends = - NaiveDate::from_ymd(2021, 4, 3).and_hms_nano(16, 30, 0, 0); - assert_eq!( - using_chrono_tz_and_utc_naive_date_time( - &sydney_tz, - utc_just_after_sydney_dst_ends - ), - Some(sydney_offset_without_dst) - ); - // Sydney 2021-10-03T01:30:00+10:00 is 2021-10-02T15:30:00Z - let utc_just_before_sydney_dst_starts = - NaiveDate::from_ymd(2021, 10, 2).and_hms_nano(15, 30, 0, 0); - assert_eq!( - using_chrono_tz_and_utc_naive_date_time( - &sydney_tz, - utc_just_before_sydney_dst_starts - ), - Some(sydney_offset_without_dst) - ); - // Sydney 2021-04-04T03:30:00+11:00 is 2021-10-02T16:30:00Z - let utc_just_after_sydney_dst_starts = - NaiveDate::from_ymd(2022, 10, 2).and_hms_nano(16, 30, 0, 0); - assert_eq!( - using_chrono_tz_and_utc_naive_date_time( - &sydney_tz, - utc_just_after_sydney_dst_starts - ), - Some(sydney_offset_with_dst) - ); + assert!(matches!(hour(&a), Err(ArrowError::ParseError(_)))) } #[test] diff --git a/arrow/src/csv/writer.rs b/arrow/src/csv/writer.rs index eb7a8fd5be88..e92e68e5f483 100644 --- a/arrow/src/csv/writer.rs +++ b/arrow/src/csv/writer.rs @@ -63,12 +63,9 @@ //! } //! 
``` -use std::io::Write; - -#[cfg(feature = "chrono-tz")] -use crate::compute::kernels::temporal::using_chrono_tz_and_utc_naive_date_time; -#[cfg(feature = "chrono-tz")] +use arrow_array::timezone::Tz; use chrono::{DateTime, Utc}; +use std::io::Write; use crate::csv::map_csv_error; use crate::datatypes::*; @@ -239,45 +236,6 @@ impl Writer { Ok(()) } - #[cfg(not(feature = "chrono-tz"))] - fn handle_timestamp( - &self, - time_unit: &TimeUnit, - _time_zone: Option<&String>, - row_index: usize, - col: &ArrayRef, - ) -> Result { - use TimeUnit::*; - let datetime = match time_unit { - Second => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - Millisecond => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - Microsecond => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - Nanosecond => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - }; - Ok(format!("{}", datetime.format(&self.timestamp_format))) - } - - #[cfg(feature = "chrono-tz")] fn handle_timestamp( &self, time_unit: &TimeUnit, @@ -286,7 +244,6 @@ impl Writer { col: &ArrayRef, ) -> Result { use TimeUnit::*; - let datetime = match time_unit { Second => col .as_any() @@ -313,25 +270,15 @@ impl Writer { .value_as_datetime(row_index) .unwrap(), }; - let tzs = match time_zone { - None => "UTC".to_string(), - Some(tzs) => tzs.to_string(), - }; - match using_chrono_tz_and_utc_naive_date_time(&tzs, datetime) { + let tz: Option = time_zone.map(|x| x.parse()).transpose()?; + match tz { Some(tz) => { let utc_time = DateTime::::from_utc(datetime, Utc); - Ok(format!( - "{}", - utc_time - .with_timezone(&tz) - .format(&self.timestamp_tz_format) - )) + let local_time = utc_time.with_timezone(&tz); + Ok(local_time.format(&self.timestamp_tz_format).to_string()) } - err => Err(ArrowError::ComputeError(format!( - "{}: {:?}", - "Unable to parse timezone", err - ))), + None => Ok(datetime.format(&self.timestamp_format).to_string()), } } From 642054ccb100464e780ae35c6ab9694c739e4242 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 25 Oct 2022 09:21:24 +1300 Subject: [PATCH 0159/1411] Simplify TimestampArray from_vec with timezone (#2906) --- arrow-array/src/array/primitive_array.rs | 52 +++++++----------------- 1 file changed, 14 insertions(+), 38 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index d979d0d93e67..66aa825094f0 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -22,7 +22,7 @@ use crate::temporal_conversions::{as_date, as_datetime, as_duration, as_time}; use crate::trusted_len::trusted_len_unzip; use crate::types::*; use crate::{print_long_array, Array, ArrayAccessor}; -use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; +use arrow_buffer::{i256, ArrowNativeType, Buffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; @@ -761,13 +761,19 @@ def_numeric_from_vec!(TimestampNanosecondType); impl PrimitiveArray { /// Construct a timestamp array from a vec of i64 values and an optional timezone - pub fn from_vec(data: Vec, timezone: Option) -> Self { - let array_data = - ArrayData::builder(DataType::Timestamp(T::get_time_unit(), timezone)) - .len(data.len()) - .add_buffer(Buffer::from_slice_ref(&data)); - 
let array_data = unsafe { array_data.build_unchecked() }; - PrimitiveArray::from(array_data) + pub fn from_vec(data: Vec, timezone: Option) -> Self + where + Self: From>, + { + Self::from(data).with_timezone_opt(timezone) + } + + /// Construct a timestamp array from a vec of `Option` values and an optional timezone + pub fn from_opt_vec(data: Vec>, timezone: Option) -> Self + where + Self: From>>, + { + Self::from(data).with_timezone_opt(timezone) } /// Construct a timestamp array with new timezone @@ -788,36 +794,6 @@ impl PrimitiveArray { } } -impl PrimitiveArray { - /// Construct a timestamp array from a vec of `Option` values and an optional timezone - pub fn from_opt_vec(data: Vec>, timezone: Option) -> Self { - // TODO: duplicated from def_numeric_from_vec! macro, it looks possible to convert to generic - let data_len = data.len(); - let mut null_buf = MutableBuffer::new_null(data_len); - let mut val_buf = MutableBuffer::new(data_len * std::mem::size_of::()); - - { - let null_slice = null_buf.as_slice_mut(); - for (i, v) in data.iter().enumerate() { - if let Some(n) = v { - bit_util::set_bit(null_slice, i); - val_buf.push(*n); - } else { - val_buf.push(0i64); - } - } - } - - let array_data = - ArrayData::builder(DataType::Timestamp(T::get_time_unit(), timezone)) - .len(data_len) - .add_buffer(val_buf.into()) - .null_bit_buffer(Some(null_buf.into())); - let array_data = unsafe { array_data.build_unchecked() }; - PrimitiveArray::from(array_data) - } -} - /// Constructs a `PrimitiveArray` from an array data reference. impl From for PrimitiveArray { fn from(data: ArrayData) -> Self { From 28d6152f95a1802c8d0722843970f53436472720 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 25 Oct 2022 10:38:07 +1300 Subject: [PATCH 0160/1411] Deprecate TimestampArray from_vec and from_opt_vec (#2919) --- arrow-array/src/array/primitive_array.rs | 31 +++++---- arrow-integration-test/src/lib.rs | 46 ++++++------- arrow-select/src/filter.rs | 8 +-- arrow-select/src/take.rs | 9 +-- arrow/src/compute/kernels/cast.rs | 84 +++++++++++++----------- arrow/src/compute/kernels/temporal.rs | 71 ++++++++++---------- arrow/src/csv/writer.rs | 7 +- arrow/src/json/writer.rs | 12 ++-- parquet/src/arrow/arrow_writer/mod.rs | 8 +-- 9 files changed, 140 insertions(+), 136 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 66aa825094f0..3105cc6a964d 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -118,9 +118,9 @@ pub type Float64Array = PrimitiveArray; /// # use arrow_array::TimestampSecondArray; /// use chrono::FixedOffset; /// // Corresponds to single element array with entry 1970-05-09T14:25:11+0:00 -/// let arr = TimestampSecondArray::from_vec(vec![11111111], None); +/// let arr = TimestampSecondArray::from(vec![11111111]); /// // OR -/// let arr = TimestampSecondArray::from_opt_vec(vec![Some(11111111)], None); +/// let arr = TimestampSecondArray::from(vec![Some(11111111)]); /// let utc_offset = FixedOffset::east(0); /// /// assert_eq!(arr.value_as_datetime_with_tz(0, utc_offset).map(|v| v.to_string()).unwrap(), "1970-05-09 14:25:11") @@ -131,9 +131,9 @@ pub type Float64Array = PrimitiveArray; /// # use arrow_array::TimestampSecondArray; /// use chrono::FixedOffset; /// // Corresponds to single element array with entry 1969-08-25T09:34:49+0:00 -/// let arr = TimestampSecondArray::from_vec(vec![-11111111], None); +/// let arr = 
TimestampSecondArray::from(vec![-11111111]); /// // OR -/// let arr = TimestampSecondArray::from_opt_vec(vec![Some(-11111111)], None); +/// let arr = TimestampSecondArray::from(vec![Some(-11111111)]); /// let utc_offset = FixedOffset::east(0); /// /// assert_eq!(arr.value_as_datetime_with_tz(0, utc_offset).map(|v| v.to_string()).unwrap(), "1969-08-25 09:34:49") @@ -144,9 +144,9 @@ pub type Float64Array = PrimitiveArray; /// # use arrow_array::TimestampSecondArray; /// use chrono::FixedOffset; /// // Corresponds to single element array with entry 1970-05-10T00:25:11+10:00 -/// let arr = TimestampSecondArray::from_vec(vec![11111111], Some("+10:00".to_string())); +/// let arr = TimestampSecondArray::from(vec![11111111]).with_timezone("+10:00".to_string()); /// // OR -/// let arr = TimestampSecondArray::from_opt_vec(vec![Some(11111111)], Some("+10:00".to_string())); +/// let arr = TimestampSecondArray::from(vec![Some(11111111)]).with_timezone("+10:00".to_string()); /// let sydney_offset = FixedOffset::east(10 * 60 * 60); /// /// assert_eq!(arr.value_as_datetime_with_tz(0, sydney_offset).map(|v| v.to_string()).unwrap(), "1970-05-10 00:25:11") @@ -761,6 +761,7 @@ def_numeric_from_vec!(TimestampNanosecondType); impl PrimitiveArray { /// Construct a timestamp array from a vec of i64 values and an optional timezone + #[deprecated(note = "Use with_timezone_opt instead")] pub fn from_vec(data: Vec, timezone: Option) -> Self where Self: From>, @@ -769,6 +770,7 @@ impl PrimitiveArray { } /// Construct a timestamp array from a vec of `Option` values and an optional timezone + #[deprecated(note = "Use with_timezone_opt instead")] pub fn from_opt_vec(data: Vec>, timezone: Option) -> Self where Self: From>>, @@ -1150,7 +1152,7 @@ mod tests { #[test] fn test_timestamp_array_from_vec() { - let arr = TimestampSecondArray::from_vec(vec![1, -5], None); + let arr = TimestampSecondArray::from(vec![1, -5]); assert_eq!(2, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); @@ -1158,7 +1160,7 @@ mod tests { assert_eq!(-5, arr.value(1)); assert_eq!(&[1, -5], arr.values()); - let arr = TimestampMillisecondArray::from_vec(vec![1, -5], None); + let arr = TimestampMillisecondArray::from(vec![1, -5]); assert_eq!(2, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); @@ -1166,7 +1168,7 @@ mod tests { assert_eq!(-5, arr.value(1)); assert_eq!(&[1, -5], arr.values()); - let arr = TimestampMicrosecondArray::from_vec(vec![1, -5], None); + let arr = TimestampMicrosecondArray::from(vec![1, -5]); assert_eq!(2, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); @@ -1174,7 +1176,7 @@ mod tests { assert_eq!(-5, arr.value(1)); assert_eq!(&[1, -5], arr.values()); - let arr = TimestampNanosecondArray::from_vec(vec![1, -5], None); + let arr = TimestampNanosecondArray::from(vec![1, -5]); assert_eq!(2, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); @@ -1309,10 +1311,11 @@ mod tests { #[test] fn test_timestamp_fmt_debug() { let arr: PrimitiveArray = - TimestampMillisecondArray::from_vec( - vec![1546214400000, 1546214400000, -1546214400000], - None, - ); + TimestampMillisecondArray::from(vec![ + 1546214400000, + 1546214400000, + -1546214400000, + ]); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T00:00:00,\n 2018-12-31T00:00:00,\n 1921-01-02T00:00:00,\n]", format!("{:?}", arr) diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index cf7024dc0264..d0db4b4b9ec1 100644 --- a/arrow-integration-test/src/lib.rs +++ 
b/arrow-integration-test/src/lib.rs @@ -1240,34 +1240,28 @@ mod tests { None, Some(16584393546415), ]); - let ts_secs = TimestampSecondArray::from_opt_vec( - vec![None, Some(193438817552), None], + let ts_secs = TimestampSecondArray::from(vec![None, Some(193438817552), None]); + let ts_millis = TimestampMillisecondArray::from(vec![ None, - ); - let ts_millis = TimestampMillisecondArray::from_opt_vec( - vec![None, Some(38606916383008), Some(58113709376587)], - None, - ); - let ts_micros = - TimestampMicrosecondArray::from_opt_vec(vec![None, None, None], None); - let ts_nanos = TimestampNanosecondArray::from_opt_vec( - vec![None, None, Some(-6473623571954960143)], + Some(38606916383008), + Some(58113709376587), + ]); + let ts_micros = TimestampMicrosecondArray::from(vec![None, None, None]); + let ts_nanos = + TimestampNanosecondArray::from(vec![None, None, Some(-6473623571954960143)]); + let ts_secs_tz = TimestampSecondArray::from(vec![None, Some(193438817552), None]) + .with_timezone_opt(secs_tz); + let ts_millis_tz = TimestampMillisecondArray::from(vec![ None, - ); - let ts_secs_tz = TimestampSecondArray::from_opt_vec( - vec![None, Some(193438817552), None], - secs_tz, - ); - let ts_millis_tz = TimestampMillisecondArray::from_opt_vec( - vec![None, Some(38606916383008), Some(58113709376587)], - millis_tz, - ); - let ts_micros_tz = - TimestampMicrosecondArray::from_opt_vec(vec![None, None, None], micros_tz); - let ts_nanos_tz = TimestampNanosecondArray::from_opt_vec( - vec![None, None, Some(-6473623571954960143)], - nanos_tz, - ); + Some(38606916383008), + Some(58113709376587), + ]) + .with_timezone_opt(millis_tz); + let ts_micros_tz = TimestampMicrosecondArray::from(vec![None, None, None]) + .with_timezone_opt(micros_tz); + let ts_nanos_tz = + TimestampNanosecondArray::from(vec![None, None, Some(-6473623571954960143)]) + .with_timezone_opt(nanos_tz); let utf8s = StringArray::from(vec![Some("aa"), None, Some("bbb")]); let value_data = Int32Array::from(vec![None, Some(2), None, None]); diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 3226c54180a3..71175ca5788d 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -760,22 +760,22 @@ mod tests { def_temporal_test!( test_filter_timestamp_second, TimestampSecondArray, - TimestampSecondArray::from_vec(vec![1, 2, 3, 4], None) + TimestampSecondArray::from(vec![1, 2, 3, 4]) ); def_temporal_test!( test_filter_timestamp_millisecond, TimestampMillisecondArray, - TimestampMillisecondArray::from_vec(vec![1, 2, 3, 4], None) + TimestampMillisecondArray::from(vec![1, 2, 3, 4]) ); def_temporal_test!( test_filter_timestamp_microsecond, TimestampMicrosecondArray, - TimestampMicrosecondArray::from_vec(vec![1, 2, 3, 4], None) + TimestampMicrosecondArray::from(vec![1, 2, 3, 4]) ); def_temporal_test!( test_filter_timestamp_nanosecond, TimestampNanosecondArray, - TimestampNanosecondArray::from_vec(vec![1, 2, 3, 4], None) + TimestampNanosecondArray::from(vec![1, 2, 3, 4]) ); #[test] diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index d52ec37b9b29..77a1147ad6fc 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -1348,10 +1348,11 @@ mod tests { fn test_take_preserve_timezone() { let index = Int64Array::from(vec![Some(0), None]); - let input = TimestampNanosecondArray::from_vec( - vec![1_639_715_368_000_000_000, 1_639_715_368_000_000_000], - Some("UTC".to_owned()), - ); + let input = TimestampNanosecondArray::from(vec![ + 1_639_715_368_000_000_000, + 1_639_715_368_000_000_000, + ]) + 
.with_timezone("UTC".to_string()); let result = take_impl(&input, &index, None).unwrap(); match result.data_type() { DataType::Timestamp(TimeUnit::Nanosecond, tz) => { diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 090195c16b81..c0b08ecc57d9 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -3793,10 +3793,12 @@ mod tests { #[test] fn test_cast_timestamp_to_date32() { - let a = TimestampMillisecondArray::from_opt_vec( - vec![Some(864000000005), Some(1545696000001), None], - Some(String::from("UTC")), - ); + let a = TimestampMillisecondArray::from(vec![ + Some(864000000005), + Some(1545696000001), + None, + ]) + .with_timezone("UTC".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Date32).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); @@ -3807,10 +3809,11 @@ mod tests { #[test] fn test_cast_timestamp_to_date64() { - let a = TimestampMillisecondArray::from_opt_vec( - vec![Some(864000000005), Some(1545696000001), None], + let a = TimestampMillisecondArray::from(vec![ + Some(864000000005), + Some(1545696000001), None, - ); + ]); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Date64).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); @@ -3821,10 +3824,12 @@ mod tests { #[test] fn test_cast_timestamp_to_i64() { - let a = TimestampMillisecondArray::from_opt_vec( - vec![Some(864000000005), Some(1545696000001), None], - Some("UTC".to_string()), - ); + let a = TimestampMillisecondArray::from(vec![ + Some(864000000005), + Some(1545696000001), + None, + ]) + .with_timezone("UTC".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Int64).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); @@ -3837,10 +3842,12 @@ mod tests { #[test] #[cfg(feature = "chrono-tz")] fn test_cast_timestamp_to_string() { - let a = TimestampMillisecondArray::from_opt_vec( - vec![Some(864000000005), Some(1545696000001), None], - Some("UTC".to_string()), - ); + let a = TimestampMillisecondArray::from(vec![ + Some(864000000005), + Some(1545696000001), + None, + ]) + .with_timezone("UTC".to_string()); let array = Arc::new(a) as ArrayRef; dbg!(&array); let b = cast(&array, &DataType::Utf8).unwrap(); @@ -3875,10 +3882,11 @@ mod tests { #[test] fn test_cast_between_timestamps() { - let a = TimestampMillisecondArray::from_opt_vec( - vec![Some(864000003005), Some(1545696002001), None], + let a = TimestampMillisecondArray::from(vec![ + Some(864000003005), + Some(1545696002001), None, - ); + ]); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); @@ -5474,26 +5482,26 @@ mod tests { Arc::new(UInt64Array::from(vec![1, 2])), Arc::new(Float32Array::from(vec![1.0, 2.0])), Arc::new(Float64Array::from(vec![1.0, 2.0])), - Arc::new(TimestampSecondArray::from_vec(vec![1000, 2000], None)), - Arc::new(TimestampMillisecondArray::from_vec(vec![1000, 2000], None)), - Arc::new(TimestampMicrosecondArray::from_vec(vec![1000, 2000], None)), - Arc::new(TimestampNanosecondArray::from_vec(vec![1000, 2000], None)), - Arc::new(TimestampSecondArray::from_vec( - vec![1000, 2000], - Some(tz_name.clone()), - )), - Arc::new(TimestampMillisecondArray::from_vec( - vec![1000, 2000], - Some(tz_name.clone()), - )), - Arc::new(TimestampMicrosecondArray::from_vec( - vec![1000, 2000], - Some(tz_name.clone()), - )), - Arc::new(TimestampNanosecondArray::from_vec( - vec![1000, 2000], - 
Some(tz_name), - )), + Arc::new(TimestampSecondArray::from(vec![1000, 2000])), + Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])), + Arc::new(TimestampMicrosecondArray::from(vec![1000, 2000])), + Arc::new(TimestampNanosecondArray::from(vec![1000, 2000])), + Arc::new( + TimestampSecondArray::from(vec![1000, 2000]) + .with_timezone(tz_name.clone()), + ), + Arc::new( + TimestampMillisecondArray::from(vec![1000, 2000]) + .with_timezone(tz_name.clone()), + ), + Arc::new( + TimestampMicrosecondArray::from(vec![1000, 2000]) + .with_timezone(tz_name.clone()), + ), + Arc::new( + TimestampNanosecondArray::from(vec![1000, 2000]) + .with_timezone(tz_name.clone()), + ), Arc::new(Date32Array::from(vec![1000, 2000])), Arc::new(Date64Array::from(vec![1000, 2000])), Arc::new(Time32SecondArray::from(vec![1000, 2000])), diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index 54799a32630e..ad1bab77388c 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -863,12 +863,12 @@ mod tests { #[test] fn test_temporal_array_timestamp_quarter_with_timezone() { // 24 * 60 * 60 = 86400 - let a = - TimestampSecondArray::from_vec(vec![86400 * 90], Some("+00:00".to_string())); + let a = TimestampSecondArray::from(vec![86400 * 90]) + .with_timezone("+00:00".to_string()); let b = quarter(&a).unwrap(); assert_eq!(2, b.value(0)); - let a = - TimestampSecondArray::from_vec(vec![86400 * 90], Some("-10:00".to_string())); + let a = TimestampSecondArray::from(vec![86400 * 90]) + .with_timezone("-10:00".to_string()); let b = quarter(&a).unwrap(); assert_eq!(1, b.value(0)); } @@ -899,12 +899,12 @@ mod tests { #[test] fn test_temporal_array_timestamp_month_with_timezone() { // 24 * 60 * 60 = 86400 - let a = - TimestampSecondArray::from_vec(vec![86400 * 31], Some("+00:00".to_string())); + let a = TimestampSecondArray::from(vec![86400 * 31]) + .with_timezone("+00:00".to_string()); let b = month(&a).unwrap(); assert_eq!(2, b.value(0)); - let a = - TimestampSecondArray::from_vec(vec![86400 * 31], Some("-10:00".to_string())); + let a = TimestampSecondArray::from(vec![86400 * 31]) + .with_timezone("-10:00".to_string()); let b = month(&a).unwrap(); assert_eq!(1, b.value(0)); } @@ -912,10 +912,12 @@ mod tests { #[test] fn test_temporal_array_timestamp_day_with_timezone() { // 24 * 60 * 60 = 86400 - let a = TimestampSecondArray::from_vec(vec![86400], Some("+00:00".to_string())); + let a = + TimestampSecondArray::from(vec![86400]).with_timezone("+00:00".to_string()); let b = day(&a).unwrap(); assert_eq!(2, b.value(0)); - let a = TimestampSecondArray::from_vec(vec![86400], Some("-10:00".to_string())); + let a = + TimestampSecondArray::from(vec![86400]).with_timezone("-10:00".to_string()); let b = day(&a).unwrap(); assert_eq!(1, b.value(0)); } @@ -1095,7 +1097,8 @@ mod tests { #[test] fn test_temporal_array_timestamp_second_with_timezone() { - let a = TimestampSecondArray::from_vec(vec![10, 20], Some("+00:00".to_string())); + let a = + TimestampSecondArray::from(vec![10, 20]).with_timezone("+00:00".to_string()); let b = second(&a).unwrap(); assert_eq!(10, b.value(0)); assert_eq!(20, b.value(1)); @@ -1103,7 +1106,8 @@ mod tests { #[test] fn test_temporal_array_timestamp_minute_with_timezone() { - let a = TimestampSecondArray::from_vec(vec![0, 60], Some("+00:50".to_string())); + let a = + TimestampSecondArray::from(vec![0, 60]).with_timezone("+00:50".to_string()); let b = minute(&a).unwrap(); assert_eq!(50, b.value(0)); assert_eq!(51, b.value(1)); @@ 
-1111,41 +1115,40 @@ mod tests { #[test] fn test_temporal_array_timestamp_minute_with_negative_timezone() { - let a = TimestampSecondArray::from_vec(vec![60 * 55], Some("-00:50".to_string())); + let a = + TimestampSecondArray::from(vec![60 * 55]).with_timezone("-00:50".to_string()); let b = minute(&a).unwrap(); assert_eq!(5, b.value(0)); } #[test] fn test_temporal_array_timestamp_hour_with_timezone() { - let a = TimestampSecondArray::from_vec( - vec![60 * 60 * 10], - Some("+01:00".to_string()), - ); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]) + .with_timezone("+01:00".to_string()); let b = hour(&a).unwrap(); assert_eq!(11, b.value(0)); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_without_colon() { - let a = - TimestampSecondArray::from_vec(vec![60 * 60 * 10], Some("+0100".to_string())); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]) + .with_timezone("+0100".to_string()); let err = hour(&a).unwrap_err().to_string(); assert!(err.contains("Invalid timezone"), "{}", err); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_without_initial_sign() { - let a = - TimestampSecondArray::from_vec(vec![60 * 60 * 10], Some("0100".to_string())); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]) + .with_timezone("0100".to_string()); let err = hour(&a).unwrap_err().to_string(); assert!(err.contains("Invalid timezone"), "{}", err); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_with_only_colon() { - let a = - TimestampSecondArray::from_vec(vec![60 * 60 * 10], Some("01:00".to_string())); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]) + .with_timezone("01:00".to_string()); let err = hour(&a).unwrap_err().to_string(); assert!(err.contains("Invalid timezone"), "{}", err); } @@ -1153,10 +1156,8 @@ mod tests { #[cfg(feature = "chrono-tz")] #[test] fn test_temporal_array_timestamp_hour_with_timezone_using_chrono_tz() { - let a = TimestampSecondArray::from_vec( - vec![60 * 60 * 10], - Some("Asia/Kolkata".to_string()), - ); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]) + .with_timezone("Asia/Kolkata".to_string()); let b = hour(&a).unwrap(); assert_eq!(15, b.value(0)); } @@ -1180,19 +1181,19 @@ mod tests { #[cfg(not(feature = "chrono-tz"))] #[test] fn test_temporal_array_timestamp_hour_with_timezone_using_chrono_tz() { - let a = TimestampSecondArray::from_vec( - vec![60 * 60 * 10], - Some("Asia/Kolkatta".to_string()), - ); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]) + .with_timezone("Asia/Kolkatta".to_string()); assert!(matches!(hour(&a), Err(ArrowError::ParseError(_)))) } #[test] fn test_hour_minute_second_dictionary_array() { - let a = TimestampSecondArray::from_vec( - vec![60 * 60 * 10 + 61, 60 * 60 * 20 + 122, 60 * 60 * 30 + 183], - Some("+01:00".to_string()), - ); + let a = TimestampSecondArray::from(vec![ + 60 * 60 * 10 + 61, + 60 * 60 * 20 + 122, + 60 * 60 * 30 + 183, + ]) + .with_timezone("+01:00".to_string()); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 1]); let dict = DictionaryArray::try_new(&keys, &a).unwrap(); diff --git a/arrow/src/csv/writer.rs b/arrow/src/csv/writer.rs index e92e68e5f483..7ab0ae24a40b 100644 --- a/arrow/src/csv/writer.rs +++ b/arrow/src/csv/writer.rs @@ -489,10 +489,11 @@ mod tests { ]); let c3 = PrimitiveArray::::from(vec![3, 2, 1]); let c4 = BooleanArray::from(vec![Some(true), Some(false), None]); - let c5 = TimestampMillisecondArray::from_opt_vec( - vec![None, Some(1555584887378), Some(1555555555555)], + let c5 = TimestampMillisecondArray::from(vec![ None, - ); + 
Some(1555584887378), + Some(1555555555555), + ]); let c6 = Time32SecondArray::from(vec![1234, 24680, 85563]); let c7: DictionaryArray = vec!["cupcakes", "cupcakes", "foo"].into_iter().collect(); diff --git a/arrow/src/json/writer.rs b/arrow/src/json/writer.rs index beee02582ff8..f622b0cce77f 100644 --- a/arrow/src/json/writer.rs +++ b/arrow/src/json/writer.rs @@ -892,14 +892,10 @@ mod tests { let ts_millis = ts_micros / 1000; let ts_secs = ts_millis / 1000; - let arr_nanos = - TimestampNanosecondArray::from_opt_vec(vec![Some(ts_nanos), None], None); - let arr_micros = - TimestampMicrosecondArray::from_opt_vec(vec![Some(ts_micros), None], None); - let arr_millis = - TimestampMillisecondArray::from_opt_vec(vec![Some(ts_millis), None], None); - let arr_secs = - TimestampSecondArray::from_opt_vec(vec![Some(ts_secs), None], None); + let arr_nanos = TimestampNanosecondArray::from(vec![Some(ts_nanos), None]); + let arr_micros = TimestampMicrosecondArray::from(vec![Some(ts_micros), None]); + let arr_millis = TimestampMillisecondArray::from(vec![Some(ts_millis), None]); + let arr_secs = TimestampSecondArray::from(vec![Some(ts_secs), None]); let arr_names = StringArray::from(vec![Some("a"), Some("b")]); let schema = Schema::new(vec![ diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index bc68874ebabd..52f55a91baa9 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1417,7 +1417,7 @@ mod tests { #[test] fn timestamp_second_single_column() { let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect(); - let values = Arc::new(TimestampSecondArray::from_vec(raw_values, None)); + let values = Arc::new(TimestampSecondArray::from(raw_values)); one_column_roundtrip(values, false); } @@ -1425,7 +1425,7 @@ mod tests { #[test] fn timestamp_millisecond_single_column() { let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect(); - let values = Arc::new(TimestampMillisecondArray::from_vec(raw_values, None)); + let values = Arc::new(TimestampMillisecondArray::from(raw_values)); one_column_roundtrip(values, false); } @@ -1433,7 +1433,7 @@ mod tests { #[test] fn timestamp_microsecond_single_column() { let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect(); - let values = Arc::new(TimestampMicrosecondArray::from_vec(raw_values, None)); + let values = Arc::new(TimestampMicrosecondArray::from(raw_values)); one_column_roundtrip(values, false); } @@ -1441,7 +1441,7 @@ mod tests { #[test] fn timestamp_nanosecond_single_column() { let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect(); - let values = Arc::new(TimestampNanosecondArray::from_vec(raw_values, None)); + let values = Arc::new(TimestampNanosecondArray::from(raw_values)); one_column_roundtrip(values, false); } From bca84454bdfb13d39a31a1bbf8fdc14940604cbd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 25 Oct 2022 11:52:18 +1300 Subject: [PATCH 0161/1411] Document crate topology (#2594) (#2913) * Document crate topology (#2594) * Review feedback --- arrow-array/src/lib.rs | 2 +- arrow-data/src/lib.rs | 2 +- arrow/src/array/mod.rs | 4 +++- arrow/src/lib.rs | 37 ++++++++++++++++++++++++++++++++++--- 4 files changed, 39 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 4f015793dda8..e616099ccc89 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -16,7 +16,7 @@ // under the License. //! 
The central type in Apache Arrow are arrays, which are a known-length sequence of values -//! all having the same type. This module provides concrete implementations of each type, as +//! all having the same type. This crate provides concrete implementations of each type, as //! well as an [`Array`] trait that can be used for type-erasure. //! //! # Downcasting an Array diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs index 9b7e307db360..58571e181176 100644 --- a/arrow-data/src/lib.rs +++ b/arrow-data/src/lib.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Buffer abstractions for [Apache Arrow](https://docs.rs/arrow) +//! Array data abstractions for [Apache Arrow](https://docs.rs/arrow) mod bitmap; pub use bitmap::Bitmap; diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 10009f5abde9..af774de0a263 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. -//! Re-exports APIs from [arrow_array] +//! Statically typed implementations of Arrow Arrays +//! +//! **See [arrow_array] for examples and usage instructions** #[cfg(feature = "ffi")] mod ffi; diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 324803cb1a90..9cf66d5460e2 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -21,10 +21,34 @@ //! Please see the [arrow crates.io](https://crates.io/crates/arrow) //! page for feature flags and tips to improve performance. //! +//! # Crate Topology +//! +//! The [`arrow`] project is implemented as multiple sub-crates, which are then re-exported by +//! this top-level crate. +//! +//! Crate authors can choose to depend on this top-level crate, or just +//! the sub-crates they need. +//! +//! The current list of sub-crates is: +//! +//! * [`arrow-array`][arrow_array] - type-safe arrow array abstractions +//! * [`arrow-buffer`][arrow_buffer] - buffer abstractions for arrow arrays +//! * [`arrow-data`][arrow_data] - the underlying data of arrow arrays +//! * [`arrow-schema`][arrow_schema] - the logical types for arrow arrays +//! * [`arrow-select`][arrow_select] - selection kernels for arrow arrays +//! +//! _This list is likely to grow as further functionality is split out from the top-level crate_ +//! +//! Some functionality is also distributed independently of this crate: +//! +//! * [`arrow-flight`] - support for [Arrow Flight RPC] +//! * [`arrow-integration-test`] - support for [Arrow JSON Test Format] +//! * [`parquet`](https://docs.rs/parquet/latest/parquet/) - support for [Apache Parquet] +//! //! # Columnar Format //! -//! The [`array`] module provides statically typed implementations of all the array -//! types as defined by the [Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html). +//! The [`array`] module provides statically typed implementations of all the array types as defined +//! by the [Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html) //! //! For example, an [`Int32Array`](array::Int32Array) represents a nullable array of `i32` //! @@ -77,7 +101,7 @@ //! assert_eq!(min(&StringArray::from(vec!["b", "a", "c"])), Some("a")); //! ``` //! -//! For more examples, consult the [`array`] docs. +//! For more examples, consult the [arrow_array] docs. //! //! # Type Erasure / Trait Objects //! @@ -235,6 +259,7 @@ //! orchestrates the primitives exported by this crate into an embeddable query engine, with //! 
SQL and DataFrame frontends, and heavily influences this crate's roadmap. //! +//! [`arrow`]: https://github.com/apache/arrow-rs //! [`array`]: mod@array //! [`Array`]: array::Array //! [`ArrayRef`]: array::ArrayRef @@ -242,6 +267,12 @@ //! [`make_array`]: array::make_array //! [`Buffer`]: buffer::Buffer //! [`RecordBatch`]: record_batch::RecordBatch +//! [`arrow-flight`]: https://docs.rs/arrow-flight/latest/arrow_flight/ +//! [`arrow-integration-test`]: https://docs.rs/arrow-integration-test/latest/arrow_integration_test/ +//! [`parquet`]: https://docs.rs/parquet/latest/parquet/ +//! [Arrow Flight RPC]: https://arrow.apache.org/docs/format/Flight.html +//! [Arrow JSON Test Format]: https://github.com/apache/arrow/blob/master/docs/source/format/Integration.rst#json-test-data-format +//! [Apache Parquet]: https://parquet.apache.org/ //! [DataFusion]: https://github.com/apache/arrow-datafusion //! [issue tracker]: https://github.com/apache/arrow-rs/issues //! From 4620abf1ee39b154bc462ec71f338b8f1eb017fd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 25 Oct 2022 14:13:28 -0400 Subject: [PATCH 0162/1411] Update required half from 2.0 --> 2.1 (#2927) --- arrow-array/Cargo.toml | 2 +- arrow-buffer/Cargo.toml | 2 +- arrow-data/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 45765d99fcb7..06bff9aced2d 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -51,7 +51,7 @@ arrow-data = { version = "25.0.0", path = "../arrow-data" } chrono = { version = "0.4", default-features = false, features = ["clock"] } chrono-tz = { version = "0.7", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } -half = { version = "2.0", default-features = false } +half = { version = "2.1", default-features = false } hashbrown = { version = "0.12", default-features = false } [dev-dependencies] diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index f2b4dba509e3..bf3f3cacc6c9 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -39,7 +39,7 @@ bench = false [dependencies] num = { version = "0.4", default-features = false, features = ["std"] } -half = { version = "2.0", default-features = false } +half = { version = "2.1", default-features = false } [dev-dependencies] rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index be477802622d..8fe0054f87cd 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -49,7 +49,7 @@ arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" } arrow-schema = { version = "25.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } -half = { version = "2.0", default-features = false } +half = { version = "2.1", default-features = false } [dev-dependencies] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 4a1668cc0fdd..2f80d6a27260 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -53,7 +53,7 @@ serde_json = { version = "1.0", default-features = false, features = ["std"], op indexmap = { version = "1.9", default-features = false, features = ["std"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } -half = { version = "2.0", default-features = false, features = ["num-traits"] } +half = { version = "2.1", 
default-features = false, features = ["num-traits"] } hashbrown = { version = "0.12", default-features = false } csv_crate = { version = "1.1", default-features = false, optional = true, package = "csv" } regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } From 9c315ce2eacb1d8c6591c5186747c04a045a53a7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 26 Oct 2022 09:04:33 +1300 Subject: [PATCH 0163/1411] Add experimental AWS_PROFILE support (#2178) (#2891) * Add experimental AWS_PROFILE support (#2178) * Add docs * Include region --- .github/workflows/object_store.yml | 2 + object_store/Cargo.toml | 7 ++ object_store/src/aws/client.rs | 2 +- object_store/src/aws/credential.rs | 152 ++++++++++++++++++++--------- object_store/src/aws/mod.rs | 103 ++++++++++++++----- 5 files changed, 191 insertions(+), 75 deletions(-) diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 6996aa706636..2afcb4344371 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -50,6 +50,8 @@ jobs: run: cargo clippy -p object_store -- -D warnings - name: Run clippy with aws feature run: cargo clippy -p object_store --features aws -- -D warnings + - name: Run clippy with aws_profile feature + run: cargo clippy -p object_store --features aws_profile -- -D warnings - name: Run clippy with gcp feature run: cargo clippy -p object_store --features gcp -- -D warnings - name: Run clippy with azure feature diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index e52137383959..fc2af7e51419 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -52,12 +52,19 @@ reqwest = { version = "0.11", default-features = false, features = ["rustls-tls" ring = { version = "0.16", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } +# AWS Profile support +aws-types = { version = "0.49", optional = true } +aws-config = { version = "0.49", optional = true } + [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] +# Experimental support for AWS_PROFILE +aws_profile = ["aws", "aws-config", "aws-types"] + [dev-dependencies] # In alphabetical order dotenv = "0.15.0" tempfile = "3.1.0" diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 29621626c8b6..a07cdb3c6a82 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -198,7 +198,7 @@ pub struct S3Config { pub endpoint: String, pub bucket: String, pub bucket_endpoint: String, - pub credentials: CredentialProvider, + pub credentials: Box, pub retry_config: RetryConfig, pub allow_http: bool, } diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index ada855b4848a..32430d7f9668 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -22,6 +22,7 @@ use crate::util::hmac_sha256; use crate::{Result, RetryConfig}; use bytes::Buf; use chrono::{DateTime, Utc}; +use futures::future::BoxFuture; use futures::TryFutureExt; use percent_encoding::utf8_percent_encode; use reqwest::header::{HeaderMap, HeaderValue}; @@ -289,21 +290,8 @@ fn canonicalize_headers(header_map: &HeaderMap) -> (String, String) { } /// Provides credentials for use when signing 
requests -#[derive(Debug)] -pub enum CredentialProvider { - Static(StaticCredentialProvider), - Instance(InstanceCredentialProvider), - WebIdentity(WebIdentityProvider), -} - -impl CredentialProvider { - pub async fn get_credential(&self) -> Result> { - match self { - Self::Static(s) => Ok(Arc::clone(&s.credential)), - Self::Instance(c) => c.get_credential().await, - Self::WebIdentity(c) => c.get_credential().await, - } - } +pub trait CredentialProvider: std::fmt::Debug + Send + Sync { + fn get_credential(&self) -> BoxFuture<'_, Result>>; } /// A static set of credentials @@ -312,6 +300,12 @@ pub struct StaticCredentialProvider { pub credential: Arc, } +impl CredentialProvider for StaticCredentialProvider { + fn get_credential(&self) -> BoxFuture<'_, Result>> { + Box::pin(futures::future::ready(Ok(Arc::clone(&self.credential)))) + } +} + /// Credentials sourced from the instance metadata service /// /// @@ -324,22 +318,20 @@ pub struct InstanceCredentialProvider { pub metadata_endpoint: String, } -impl InstanceCredentialProvider { - async fn get_credential(&self) -> Result> { - self.cache - .get_or_insert_with(|| { - instance_creds( - &self.client, - &self.retry_config, - &self.metadata_endpoint, - self.imdsv1_fallback, - ) - .map_err(|source| crate::Error::Generic { - store: "S3", - source, - }) +impl CredentialProvider for InstanceCredentialProvider { + fn get_credential(&self) -> BoxFuture<'_, Result>> { + Box::pin(self.cache.get_or_insert_with(|| { + instance_creds( + &self.client, + &self.retry_config, + &self.metadata_endpoint, + self.imdsv1_fallback, + ) + .map_err(|source| crate::Error::Generic { + store: "S3", + source, }) - .await + })) } } @@ -357,24 +349,22 @@ pub struct WebIdentityProvider { pub retry_config: RetryConfig, } -impl WebIdentityProvider { - async fn get_credential(&self) -> Result> { - self.cache - .get_or_insert_with(|| { - web_identity( - &self.client, - &self.retry_config, - &self.token, - &self.role_arn, - &self.session_name, - &self.endpoint, - ) - .map_err(|source| crate::Error::Generic { - store: "S3", - source, - }) +impl CredentialProvider for WebIdentityProvider { + fn get_credential(&self) -> BoxFuture<'_, Result>> { + Box::pin(self.cache.get_or_insert_with(|| { + web_identity( + &self.client, + &self.retry_config, + &self.token, + &self.role_arn, + &self.session_name, + &self.endpoint, + ) + .map_err(|source| crate::Error::Generic { + store: "S3", + source, }) - .await + })) } } @@ -520,6 +510,74 @@ async fn web_identity( }) } +#[cfg(feature = "aws_profile")] +mod profile { + use super::*; + use aws_config::profile::ProfileFileCredentialsProvider; + use aws_config::provider_config::ProviderConfig; + use aws_types::credentials::ProvideCredentials; + use aws_types::region::Region; + use std::time::SystemTime; + + #[derive(Debug)] + pub struct ProfileProvider { + cache: TokenCache>, + credentials: ProfileFileCredentialsProvider, + } + + impl ProfileProvider { + pub fn new(name: String, region: String) -> Self { + let config = ProviderConfig::default().with_region(Some(Region::new(region))); + + Self { + cache: Default::default(), + credentials: ProfileFileCredentialsProvider::builder() + .configure(&config) + .profile_name(name) + .build(), + } + } + } + + impl CredentialProvider for ProfileProvider { + fn get_credential(&self) -> BoxFuture<'_, Result>> { + Box::pin(self.cache.get_or_insert_with(move || async move { + let c = + self.credentials + .provide_credentials() + .await + .map_err(|source| crate::Error::Generic { + store: "S3", + source: 
Box::new(source), + })?; + + let t_now = SystemTime::now(); + let expiry = match c.expiry().and_then(|e| e.duration_since(t_now).ok()) { + Some(ttl) => Instant::now() + ttl, + None => { + return Err(crate::Error::Generic { + store: "S3", + source: "Invalid expiry".into(), + }) + } + }; + + Ok(TemporaryToken { + token: Arc::new(AwsCredential { + key_id: c.access_key_id().to_string(), + secret_key: c.secret_access_key().to_string(), + token: c.session_token().map(ToString::to_string), + }), + expiry, + }) + })) + } + } +} + +#[cfg(feature = "aws_profile")] +pub use profile::ProfileProvider; + #[cfg(test)] mod tests { use super::*; diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index c08a6353fa82..4a810658c03f 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -109,6 +109,9 @@ enum Error { #[snafu(display("Missing SecretAccessKey"))] MissingSecretAccessKey, + #[snafu(display("Profile support requires aws_profile feature"))] + MissingProfileFeature, + #[snafu(display("ETag Header missing from response"))] MissingEtag, @@ -359,6 +362,7 @@ pub struct AmazonS3Builder { imdsv1_fallback: bool, virtual_hosted_style_request: bool, metadata_endpoint: Option, + profile: Option, } impl AmazonS3Builder { @@ -370,13 +374,14 @@ impl AmazonS3Builder { /// Fill the [`AmazonS3Builder`] with regular AWS environment variables /// /// Variables extracted from environment: - /// * AWS_ACCESS_KEY_ID -> access_key_id - /// * AWS_SECRET_ACCESS_KEY -> secret_access_key - /// * AWS_DEFAULT_REGION -> region - /// * AWS_ENDPOINT -> endpoint - /// * AWS_SESSION_TOKEN -> token - /// * AWS_CONTAINER_CREDENTIALS_RELATIVE_URI -> - /// * AWS_ALLOW_HTTP -> set to "true" to permit HTTP connections without TLS + /// * `AWS_ACCESS_KEY_ID` -> access_key_id + /// * `AWS_SECRET_ACCESS_KEY` -> secret_access_key + /// * `AWS_DEFAULT_REGION` -> region + /// * `AWS_ENDPOINT` -> endpoint + /// * `AWS_SESSION_TOKEN` -> token + /// * `AWS_CONTAINER_CREDENTIALS_RELATIVE_URI` -> + /// * `AWS_ALLOW_HTTP` -> set to "true" to permit HTTP connections without TLS + /// * `AWS_PROFILE` -> set profile name, requires `aws_profile` feature enabled /// # Example /// ``` /// use object_store::aws::AmazonS3Builder; @@ -408,6 +413,10 @@ impl AmazonS3Builder { builder.token = Some(token); } + if let Ok(profile) = std::env::var("AWS_PROFILE") { + builder.profile = Some(profile); + } + // This env var is set in ECS // https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-iam-roles.html if let Ok(metadata_relative_uri) = @@ -528,6 +537,24 @@ impl AmazonS3Builder { self } + /// Set the AWS profile name, see + /// + /// This makes use of [aws-config] to provide credentials and therefore requires + /// the `aws-profile` feature to be enabled + /// + /// It is strongly encouraged that users instead make use of a credential manager + /// such as [aws-vault] not only to avoid the significant additional dependencies, + /// but also to avoid storing credentials in [plain text on disk] + /// + /// [aws-config]: https://docs.rs/aws-config + /// [aws-vault]: https://github.com/99designs/aws-vault + /// [plain text on disk]: https://99designs.com.au/blog/engineering/aws-vault/ + #[cfg(feature = "aws_profile")] + pub fn with_profile(mut self, profile: impl Into) -> Self { + self.profile = Some(profile.into()); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. 
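Putting the pieces above together, profile-based credentials can be selected either through the `AWS_PROFILE` environment variable or explicitly on the builder. A minimal usage sketch follows; the region, bucket and profile names are placeholders rather than values from this patch, and the crate must be built with the `aws` and `aws_profile` features:

```
use object_store::aws::AmazonS3Builder;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // `from_env` also picks up AWS_PROFILE when the aws_profile feature is enabled.
    let s3 = AmazonS3Builder::from_env()
        .with_region("ap-southeast-2")      // placeholder region, needed to resolve the profile
        .with_bucket_name("example-bucket") // placeholder bucket name
        .with_profile("dev")                // placeholder profile name, overrides the env var
        .build()?;
    let _ = s3;
    Ok(())
}
```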
pub fn build(self) -> Result { @@ -537,13 +564,13 @@ impl AmazonS3Builder { let credentials = match (self.access_key_id, self.secret_access_key, self.token) { (Some(key_id), Some(secret_key), token) => { info!("Using Static credential provider"); - CredentialProvider::Static(StaticCredentialProvider { + Box::new(StaticCredentialProvider { credential: Arc::new(AwsCredential { key_id, secret_key, token, }), - }) + }) as _ } (None, Some(_), _) => return Err(Error::MissingAccessKeyId.into()), (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), @@ -565,7 +592,7 @@ impl AmazonS3Builder { // Disallow non-HTTPs requests let client = Client::builder().https_only(true).build().unwrap(); - CredentialProvider::WebIdentity(WebIdentityProvider { + Box::new(WebIdentityProvider { cache: Default::default(), token, session_name, @@ -573,24 +600,30 @@ impl AmazonS3Builder { endpoint, client, retry_config: self.retry_config.clone(), - }) - } - _ => { - info!("Using Instance credential provider"); - - // The instance metadata endpoint is access over HTTP - let client = Client::builder().https_only(false).build().unwrap(); - - CredentialProvider::Instance(InstanceCredentialProvider { - cache: Default::default(), - client, - retry_config: self.retry_config.clone(), - imdsv1_fallback: self.imdsv1_fallback, - metadata_endpoint: self - .metadata_endpoint - .unwrap_or_else(|| METADATA_ENDPOINT.into()), - }) + }) as _ } + _ => match self.profile { + Some(profile) => { + info!("Using profile \"{}\" credential provider", profile); + profile_credentials(profile, region.clone())? + } + None => { + info!("Using Instance credential provider"); + + // The instance metadata endpoint is access over HTTP + let client = Client::builder().https_only(false).build().unwrap(); + + Box::new(InstanceCredentialProvider { + cache: Default::default(), + client, + retry_config: self.retry_config.clone(), + imdsv1_fallback: self.imdsv1_fallback, + metadata_endpoint: self + .metadata_endpoint + .unwrap_or_else(|| METADATA_ENDPOINT.into()), + }) as _ + } + }, }, }; @@ -628,6 +661,22 @@ impl AmazonS3Builder { } } +#[cfg(feature = "aws_profile")] +fn profile_credentials( + profile: String, + region: String, +) -> Result> { + Ok(Box::new(credential::ProfileProvider::new(profile, region))) +} + +#[cfg(not(feature = "aws_profile"))] +fn profile_credentials( + _profile: String, + _region: String, +) -> Result> { + Err(Error::MissingProfileFeature.into()) +} + #[cfg(test)] mod tests { use super::*; From 4d5d10d81287039140a3b6f4439e2de43e201a8a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 26 Oct 2022 11:07:29 +1300 Subject: [PATCH 0164/1411] Cleanup generated proto code (#2921) * Cleanup generated proto code * Add workspace clean check --- .gitattributes | 2 ++ .github/workflows/arrow_flight.yml | 2 ++ arrow-flight/build.rs | 8 ++------ 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitattributes b/.gitattributes index 51008d2e3b4a..b7b0d51ff478 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,3 @@ parquet/src/format.rs linguist-generated +arrow-flight/src/arrow.flight.protocol.rs linguist-generated +arrow-flight/src/sql/arrow.flight.protocol.sql.rs linguist-generated diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index 686dee9ff042..d40c9b6ecee7 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -61,6 +61,8 @@ jobs: - name: Test --all-features run: | cargo test -p 
arrow-flight --all-features + - name: Verify workspace clean + run: git diff --exit-code clippy: name: Clippy diff --git a/arrow-flight/build.rs b/arrow-flight/build.rs index 25f034ac191b..4ceb298359db 100644 --- a/arrow-flight/build.rs +++ b/arrow-flight/build.rs @@ -16,16 +16,12 @@ // under the License. use std::{ - env, fs::OpenOptions, io::{Read, Write}, path::Path, }; fn main() -> Result<(), Box> { - // override the build location, in order to check in the changes to proto files - env::set_var("OUT_DIR", "src"); - // The current working directory can vary depending on how the project is being // built or released so we build an absolute path to the proto file let path = Path::new("../format/Flight.proto"); @@ -39,6 +35,7 @@ fn main() -> Result<(), Box> { tonic_build::configure() // protoc in unbuntu builder needs this option .protoc_arg("--experimental_allow_proto3_optional") + .out_dir("src") .compile(&[proto_path], &[proto_dir])?; // read file contents to string @@ -56,8 +53,6 @@ fn main() -> Result<(), Box> { file.write_all(buffer.as_bytes())?; } - // override the build location, in order to check in the changes to proto files - env::set_var("OUT_DIR", "src/sql"); // The current working directory can vary depending on how the project is being // built or released so we build an absolute path to the proto file let path = Path::new("../format/FlightSql.proto"); @@ -71,6 +66,7 @@ fn main() -> Result<(), Box> { tonic_build::configure() // protoc in unbuntu builder needs this option .protoc_arg("--experimental_allow_proto3_optional") + .out_dir("src/sql") .compile(&[proto_path], &[proto_dir])?; // read file contents to string From f812d2cda1379933fca22d9979e3df5be19792fa Mon Sep 17 00:00:00 2001 From: John Hughes Date: Wed, 26 Oct 2022 05:34:18 +0200 Subject: [PATCH 0165/1411] Support building `object_store` and `parquet` on wasm32-unknown-unknown target (#2896) * Support building object_store on wasm32-unknown-unknown target * Added cargo check step to parquet workflow for wasm32-unknown-unknown * Added compile-time warning for unsupported cloud features when compiling with wasm32 * Added cargo check features to the parquet github workflow. 
* Added a section to the README.md for parquet * * Added wasm32-unknown-unknown section to the object_store README.md --- .github/actions/setup-builder/action.yaml | 1 + .github/workflows/parquet.yml | 3 +++ object_store/Cargo.toml | 6 ++++-- object_store/README.md | 9 ++++++++- object_store/src/lib.rs | 20 +++++++++++++++++--- object_store/src/path/mod.rs | 6 ++++++ object_store/src/util.rs | 1 + parquet/Cargo.toml | 7 ++++--- parquet/README.md | 8 ++++++++ 9 files changed, 52 insertions(+), 9 deletions(-) diff --git a/.github/actions/setup-builder/action.yaml b/.github/actions/setup-builder/action.yaml index 0ef6532da477..a4d4d392191f 100644 --- a/.github/actions/setup-builder/action.yaml +++ b/.github/actions/setup-builder/action.yaml @@ -53,4 +53,5 @@ runs: echo "Installing ${{ inputs.rust-version }}" rustup toolchain install ${{ inputs.rust-version }} rustup default ${{ inputs.rust-version }} + rustup target add wasm32-unknown-unknown echo "CARGO_TARGET_DIR=/github/home/target" >> $GITHUB_ENV diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 3c5b2eab7d19..550b590737ab 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -119,6 +119,9 @@ jobs: - name: Check compilation --all-targets --no-default-features --features json run: | cargo check -p parquet --all-targets --no-default-features --features json + - name: Check compilation wasm32-unknown-unknown + run: | + cargo check -p parquet --no-default-features --features cli,snap,flate2,brotli --target wasm32-unknown-unknown clippy: name: Clippy diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index fc2af7e51419..f5eb1115d895 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -37,7 +37,7 @@ itertools = "0.10.1" parking_lot = { version = "0.12" } percent-encoding = "2.1" snafu = "0.7" -tokio = { version = "1.18", features = ["sync", "macros", "parking_lot", "rt-multi-thread", "time", "io-util"] } +tokio = { version = "1.18", features = ["sync", "macros", "rt", "time", "io-util"] } tracing = { version = "0.1" } url = "2.2" walkdir = "2" @@ -51,13 +51,15 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } ring = { version = "0.16", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } +# Fix for wasm32-unknown-unknown (see https://docs.rs/getrandom/latest/getrandom/#webassembly-support) +getrandom = { version = "0.2", features = ["js"], optional = true } # AWS Profile support aws-types = { version = "0.49", optional = true } aws-config = { version = "0.49", optional = true } [features] -cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] +cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring", "getrandom"] azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] diff --git a/object_store/README.md b/object_store/README.md index fd10414a9285..5b47a65c124f 100644 --- a/object_store/README.md +++ b/object_store/README.md @@ -33,7 +33,14 @@ change. 
Supported object stores include: * Memory * Custom implementations - Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to [Apache Arrow](https://arrow.apache.org/). See [docs.rs](https://docs.rs/object_store) for usage instructions + +## Support for `wasm32-unknown-unknown` target + +It's possible to build `object_store` for the `wasm32-unknown-unknown` target, however the cloud storage features `aws`, `azure`, and `gcp` are not supported. + +``` +cargo build -p object_store --target wasm32-unknown-unknown +``` \ No newline at end of file diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 5eaaabaf2944..6278d827b0c7 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -153,6 +153,12 @@ //! ``` //! +#[cfg(all( + target_arch = "wasm32", + any(feature = "gcp", feature = "aws", feature = "azure",) +))] +compile_error!("Features 'gcp', 'aws', 'azure' are not supported on wasm."); + #[cfg(feature = "aws")] pub mod aws; #[cfg(feature = "azure")] @@ -160,6 +166,7 @@ pub mod azure; #[cfg(feature = "gcp")] pub mod gcp; pub mod limit; +#[cfg(not(target_arch = "wasm32"))] pub mod local; pub mod memory; pub mod path; @@ -176,15 +183,16 @@ mod multipart; mod util; use crate::path::Path; -use crate::util::{ - coalesce_ranges, collect_bytes, maybe_spawn_blocking, OBJECT_STORE_COALESCE_DEFAULT, -}; +#[cfg(not(target_arch = "wasm32"))] +use crate::util::maybe_spawn_blocking; +use crate::util::{coalesce_ranges, collect_bytes, OBJECT_STORE_COALESCE_DEFAULT}; use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt}; use snafu::Snafu; use std::fmt::{Debug, Formatter}; +#[cfg(not(target_arch = "wasm32"))] use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; use tokio::io::AsyncWrite; @@ -351,6 +359,7 @@ impl GetResult { /// Collects the data into a [`Bytes`] pub async fn bytes(self) -> Result { match self { + #[cfg(not(target_arch = "wasm32"))] Self::File(mut file, path) => { maybe_spawn_blocking(move || { let len = file.seek(SeekFrom::End(0)).map_err(|source| { @@ -377,6 +386,8 @@ impl GetResult { .await } Self::Stream(s) => collect_bytes(s, None).await, + #[cfg(target_arch = "wasm32")] + _ => unimplemented!("File IO not implemented on wasm32."), } } @@ -396,6 +407,7 @@ impl GetResult { /// no additional complexity or overheads pub fn into_stream(self) -> BoxStream<'static, Result> { match self { + #[cfg(not(target_arch = "wasm32"))] Self::File(file, path) => { const CHUNK_SIZE: usize = 8 * 1024; @@ -424,6 +436,8 @@ impl GetResult { .boxed() } Self::Stream(s) => s, + #[cfg(target_arch = "wasm32")] + _ => unimplemented!("File IO not implemented on wasm32."), } } } diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index 80e0f792aa55..59ad471c671e 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -18,9 +18,11 @@ //! Path abstraction for Object Storage use itertools::Itertools; +#[cfg(not(target_arch = "wasm32"))] use percent_encoding::percent_decode; use snafu::{ensure, ResultExt, Snafu}; use std::fmt::Formatter; +#[cfg(not(target_arch = "wasm32"))] use url::Url; /// The delimiter to separate object namespaces, creating a directory structure. 
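The pattern used throughout this commit is to reject unsupported feature combinations at compile time and to gate filesystem-dependent items on the target architecture. A condensed sketch of that pattern, simplified from the hunks above:

```
// Cloud features cannot work on wasm32, so fail the build early.
#[cfg(all(
    target_arch = "wasm32",
    any(feature = "gcp", feature = "aws", feature = "azure")
))]
compile_error!("Features 'gcp', 'aws', 'azure' are not supported on wasm.");

// Filesystem-backed functionality only exists off wasm32...
#[cfg(not(target_arch = "wasm32"))]
pub mod local;

// ...and code paths that would need file IO fall back to unimplemented!() on wasm32.
```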
@@ -160,6 +162,7 @@ impl Path { }) } + #[cfg(not(target_arch = "wasm32"))] /// Convert a filesystem path to a [`Path`] relative to the filesystem root /// /// This will return an error if the path contains illegal character sequences @@ -176,6 +179,7 @@ impl Path { Self::from_absolute_path(absolute) } + #[cfg(not(target_arch = "wasm32"))] /// Convert an absolute filesystem path to a [`Path`] relative to the filesystem root /// /// This will return an error if the path contains illegal character sequences @@ -184,6 +188,7 @@ impl Path { Self::from_absolute_path_with_base(path, None) } + #[cfg(not(target_arch = "wasm32"))] /// Convert a filesystem path to a [`Path`] relative to the provided base /// /// This will return an error if the path contains illegal character sequences @@ -308,6 +313,7 @@ where } } +#[cfg(not(target_arch = "wasm32"))] /// Given an absolute filesystem path convert it to a URL representation without canonicalization pub(crate) fn absolute_path_to_url( path: impl AsRef, diff --git a/object_store/src/util.rs b/object_store/src/util.rs index 2814ca244c39..41c72d012b5a 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -69,6 +69,7 @@ where } } +#[cfg(not(target_arch = "wasm32"))] /// Takes a function and spawns it to a tokio blocking pool if available pub async fn maybe_spawn_blocking(f: F) -> Result where diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 9c7da94f9dd7..d2c215d461fb 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -30,7 +30,7 @@ edition = "2021" rust-version = "1.62" [dependencies] -ahash = "0.8" +ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } bytes = { version = "1.1", default-features = false, features = ["std"] } thrift = { version = "0.16", default-features = false } snap = { version = "1.0", default-features = false, optional = true } @@ -46,9 +46,8 @@ base64 = { version = "0.13", default-features = false, features = ["std"], optio clap = { version = "4", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } seq-macro = { version = "0.3", default-features = false } -rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } futures = { version = "0.3", default-features = false, features = ["std"], optional = true } -tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "fs", "rt", "io-util"] } +tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "rt", "io-util"] } hashbrown = { version = "0.12", default-features = false } [dev-dependencies] @@ -62,6 +61,8 @@ lz4 = { version = "1.23", default-features = false } zstd = { version = "0.11", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } arrow = { path = "../arrow", version = "25.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } [package.metadata.docs.rs] all-features = true diff --git a/parquet/README.md b/parquet/README.md index 96a34d7c2881..cd642317a12e 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -69,6 +69,14 @@ The `parquet` crate provides the following features which may be 
enabled in your - [ ] Predicate pushdown - [x] Parquet format 4.0.0 support +## Support for `wasm32-unknown-unknown` target + +It's possible to build `parquet` for the `wasm32-unknown-unknown` target, however not all the compression features are currently unsupported due to issues with the upstream crates. In particular, the `zstd` and `lz4` features may have compilation issues. See issue [#180](https://github.com/apache/arrow-rs/issues/180). + +``` +cargo build -p parquet --target wasm32-unknown-unknown --no-default-features --features cli,snap,flate2,brotli +``` + ## License Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0. From ed5843ecea4591e667f0cb1562c11a9eaea22769 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Thu, 27 Oct 2022 02:10:19 +0800 Subject: [PATCH 0166/1411] fix testcases while chrono-tz enabled (#2932) * fix testcases * add arrow test with all features * add chrono-tz in features * remove chrono-tz * remove duplicated blocks --- .github/workflows/arrow.yml | 4 ++-- arrow/src/compute/kernels/cast.rs | 8 ++++---- arrow/src/csv/writer.rs | 19 ++++--------------- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 3c73f9d5c7cc..4c395cf64c26 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -68,9 +68,9 @@ jobs: run: cargo test -p arrow-integration-test --all-features - name: Test arrow run: cargo test -p arrow - - name: Test --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,dyn_arith_dict + - name: Test --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,dyn_arith_dict,chrono-tz run: | - cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,dyn_arith_dict + cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,dyn_arith_dict,chrono-tz - name: Run examples run: | # Test arrow examples diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index c0b08ecc57d9..2380aa166912 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -3853,8 +3853,8 @@ mod tests { let b = cast(&array, &DataType::Utf8).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(&DataType::Utf8, c.data_type()); - assert_eq!("1997-05-19 00:00:00.005", c.value(0)); - assert_eq!("2018-12-25 00:00:00.001", c.value(1)); + assert_eq!("1997-05-19 00:00:00.005 +00:00", c.value(0)); + assert_eq!("2018-12-25 00:00:00.001 +00:00", c.value(1)); assert!(c.is_null(2)); } @@ -5754,9 +5754,9 @@ mod tests { let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); let expected = StringArray::from(vec![ - Some("1970-01-01 20:30:00"), + Some("1970-01-01 20:30:00 +10:00"), None, - Some("1970-01-02 09:58:59"), + Some("1970-01-02 09:58:59 +10:00"), ]); assert_eq!( diff --git a/arrow/src/csv/writer.rs b/arrow/src/csv/writer.rs index 7ab0ae24a40b..330959096a04 100644 --- a/arrow/src/csv/writer.rs +++ b/arrow/src/csv/writer.rs @@ -524,25 +524,14 @@ mod tests { let mut buffer: Vec = vec![]; file.read_to_end(&mut buffer).unwrap(); - let expected = if cfg!(feature = "chrono-tz") { - r#"c1,c2,c3,c4,c5,c6,c7 -Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000+00:00,06:51:20,cupcakes -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000+00:00,23:46:03,foo -Lorem ipsum dolor sit 
amet,123.564532,3,true,,00:20:34,cupcakes -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000+00:00,06:51:20,cupcakes -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000+00:00,23:46:03,foo -"# - } else { - r#"c1,c2,c3,c4,c5,c6,c7 + let expected = r#"c1,c2,c3,c4,c5,c6,c7 Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo -"# - }; +"#; assert_eq!(expected.to_string(), String::from_utf8(buffer).unwrap()); } @@ -646,8 +635,8 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo } let left = "c1,c2 -2019-04-18T20:54:47.378000000+10:00,2019-04-18T10:54:47.378000000+00:00 -2021-10-30T17:59:07.000000000+11:00,2021-10-30T06:59:07.000000000+00:00\n"; +2019-04-18T20:54:47.378000000+10:00,2019-04-18T10:54:47.378000000 +2021-10-30T17:59:07.000000000+11:00,2021-10-30T06:59:07.000000000\n"; let right = writer.writer.into_inner().map(|s| s.to_string()); assert_eq!(Some(left.to_string()), right.ok()); } From 51d35684507a5a1a818bfb69497011f4e4593b9d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 27 Oct 2022 09:10:52 +1300 Subject: [PATCH 0167/1411] Improve panic messages for RowSelection::and_then (#2925) (#2928) * Improve panic messages for RowSelection::and_then (#2925) * Review feedback --- parquet/src/arrow/arrow_reader/selection.rs | 40 +++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 6a965dc9bc56..f1270926bf4e 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -223,6 +223,11 @@ impl RowSelection { /// returned: NNNNNNNNNNNNYYYYYNNNNYYYYYYYYYYYYYNNNYYNNN /// /// + /// # Panics + /// + /// Panics if `other` does not have a length equal to the number of rows selected + /// by this RowSelection + /// pub fn and_then(&self, other: &Self) -> Self { let mut selectors = vec![]; let mut first = self.selectors.iter().cloned().peekable(); @@ -230,7 +235,9 @@ impl RowSelection { let mut to_skip = 0; while let Some(b) = second.peek_mut() { - let a = first.peek_mut().unwrap(); + let a = first + .peek_mut() + .expect("selection exceeds the number of selected rows"); if b.row_count == 0 { second.next().unwrap(); @@ -269,7 +276,10 @@ impl RowSelection { for v in first { if v.row_count != 0 { - assert!(v.skip); + assert!( + v.skip, + "selection contains less than the number of selected rows" + ); to_skip += v.row_count } } @@ -460,6 +470,32 @@ mod tests { ); } + #[test] + #[should_panic(expected = "selection exceeds the number of selected rows")] + fn test_and_longer() { + let a = RowSelection::from(vec![ + RowSelector::select(3), + RowSelector::skip(33), + RowSelector::select(3), + RowSelector::skip(33), + ]); + let b = RowSelection::from(vec![RowSelector::select(36)]); + a.and_then(&b); + } + + #[test] + #[should_panic(expected = "selection contains less than the number of selected rows")] + fn test_and_shorter() { + let a = RowSelection::from(vec![ + RowSelector::select(3), + RowSelector::skip(33), + 
RowSelector::select(3), + RowSelector::skip(33), + ]); + let b = RowSelection::from(vec![RowSelector::select(3)]); + a.and_then(&b); + } + #[test] fn test_and_fuzz() { let mut rand = thread_rng(); From 1d36bdf82853b344bab45dfda84f47ee7e3cfb3f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 27 Oct 2022 10:15:14 +1300 Subject: [PATCH 0168/1411] Add lexsort benchmark (#2871) (#2929) * Add lexsort benchmark (#2871) * Format * Apply suggestions from code review Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- arrow/Cargo.toml | 5 + arrow/benches/lexsort.rs | 171 ++++++++++++++++++++++++++++++ arrow/src/compute/kernels/sort.rs | 14 ++- arrow/src/row/mod.rs | 18 ++++ arrow/src/util/bench_util.rs | 38 +++++++ 5 files changed, 241 insertions(+), 5 deletions(-) create mode 100644 arrow/benches/lexsort.rs diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 2f80d6a27260..9e0f93768fa1 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -259,3 +259,8 @@ required-features = ["test_utils"] name = "bitwise_kernel" harness = false required-features = ["test_utils"] + +[[bench]] +name = "lexsort" +harness = false +required-features = ["test_utils"] diff --git a/arrow/benches/lexsort.rs b/arrow/benches/lexsort.rs new file mode 100644 index 000000000000..3820007231ab --- /dev/null +++ b/arrow/benches/lexsort.rs @@ -0,0 +1,171 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
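For reference, the invariant enforced by the new `and_then` assertions above is that `other` must describe exactly as many rows as this selection selects. An illustrative sketch, where the selector values are made up and only the shapes matter:

```
use parquet::arrow::arrow_reader::{RowSelection, RowSelector};

fn main() {
    // `a` selects 3 + 3 = 6 of its 72 rows, so `b` must describe exactly 6 rows.
    let a = RowSelection::from(vec![
        RowSelector::select(3),
        RowSelector::skip(33),
        RowSelector::select(3),
        RowSelector::skip(33),
    ]);
    let b = RowSelection::from(vec![RowSelector::select(2), RowSelector::skip(4)]);
    // Keeps the first 2 selected rows and skips everything else; a `b` covering
    // more or fewer than 6 rows would now panic with a clear message.
    let _combined = a.and_then(&b);
}
```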
+ +use arrow::compute::{lexsort_to_indices, SortColumn}; +use arrow::row::{RowConverter, SortField}; +use arrow::util::bench_util::{ + create_dict_from_values, create_primitive_array, create_string_array_with_len, +}; +use arrow_array::types::Int32Type; +use arrow_array::{Array, ArrayRef, UInt32Array}; +use criterion::{criterion_group, criterion_main, Criterion}; +use std::sync::Arc; + +#[derive(Copy, Clone)] +enum Column { + RequiredI32, + OptionalI32, + Required16CharString, + Optional16CharString, + Optional50CharString, + Optional100Value50CharStringDict, +} + +impl std::fmt::Debug for Column { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + Column::RequiredI32 => "i32", + Column::OptionalI32 => "i32_opt", + Column::Required16CharString => "str(16)", + Column::Optional16CharString => "str_opt(16)", + Column::Optional50CharString => "str_opt(50)", + Column::Optional100Value50CharStringDict => "dict(100,str_opt(50))", + }; + f.write_str(s) + } +} + +impl Column { + fn generate(self, size: usize) -> ArrayRef { + match self { + Column::RequiredI32 => { + Arc::new(create_primitive_array::(size, 0.)) + } + Column::OptionalI32 => { + Arc::new(create_primitive_array::(size, 0.2)) + } + Column::Required16CharString => { + Arc::new(create_string_array_with_len::(size, 0., 16)) + } + Column::Optional16CharString => { + Arc::new(create_string_array_with_len::(size, 0.2, 16)) + } + Column::Optional50CharString => { + Arc::new(create_string_array_with_len::(size, 0., 50)) + } + Column::Optional100Value50CharStringDict => { + Arc::new(create_dict_from_values::( + size, + 0.1, + &create_string_array_with_len::(100, 0., 50), + )) + } + } + } +} + +fn do_bench(c: &mut Criterion, columns: &[Column], len: usize) { + let arrays: Vec<_> = columns.iter().map(|x| x.generate(len)).collect(); + let sort_columns: Vec<_> = arrays + .iter() + .cloned() + .map(|values| SortColumn { + values, + options: None, + }) + .collect(); + + c.bench_function( + &format!("lexsort_to_indices({:?}): {}", columns, len), + |b| { + b.iter(|| { + criterion::black_box(lexsort_to_indices(&sort_columns, None).unwrap()) + }) + }, + ); + + c.bench_function(&format!("lexsort_rows({:?}): {}", columns, len), |b| { + b.iter(|| { + criterion::black_box({ + let fields = arrays + .iter() + .map(|a| SortField::new(a.data_type().clone())) + .collect(); + let mut converter = RowConverter::new(fields); + let rows = converter.convert_columns(&arrays).unwrap(); + let mut sort: Vec<_> = rows.iter().enumerate().collect(); + sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); + UInt32Array::from_iter_values(sort.iter().map(|(i, _)| *i as u32)) + }) + }) + }); +} + +fn add_benchmark(c: &mut Criterion) { + let cases: &[&[Column]] = &[ + &[Column::RequiredI32, Column::OptionalI32], + &[Column::RequiredI32, Column::Optional16CharString], + &[Column::RequiredI32, Column::Required16CharString], + &[Column::Optional16CharString, Column::Required16CharString], + &[ + Column::Optional16CharString, + Column::Optional50CharString, + Column::Required16CharString, + ], + &[ + Column::Optional16CharString, + Column::Required16CharString, + Column::Optional16CharString, + Column::Optional16CharString, + Column::Optional16CharString, + ], + &[ + Column::OptionalI32, + Column::Optional100Value50CharStringDict, + ], + &[ + Column::Optional100Value50CharStringDict, + Column::Optional100Value50CharStringDict, + ], + &[ + Column::Optional100Value50CharStringDict, + Column::Optional100Value50CharStringDict, + 
Column::Optional100Value50CharStringDict, + Column::Required16CharString, + ], + &[ + Column::Optional100Value50CharStringDict, + Column::Optional100Value50CharStringDict, + Column::Optional100Value50CharStringDict, + Column::Optional50CharString, + ], + &[ + Column::Optional100Value50CharStringDict, + Column::Optional100Value50CharStringDict, + Column::Optional100Value50CharStringDict, + Column::Optional50CharString, + ], + ]; + + for case in cases { + do_bench(c, *case, 4096); + do_bench(c, *case, 4096 * 8); + } +} + +criterion_group!(benches, add_benchmark); +criterion_main!(benches); diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index e2e20e756065..b297622647e7 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -897,6 +897,10 @@ pub struct SortColumn { /// assert_eq!(as_primitive_array::(&sorted_columns[0]).value(1), -64); /// assert!(sorted_columns[0].is_null(0)); /// ``` +/// +/// Note: for multi-column sorts without a limit, using the [row format][crate::row] +/// may be significantly faster +/// pub fn lexsort(columns: &[SortColumn], limit: Option) -> Result> { let indices = lexsort_to_indices(columns, limit)?; columns @@ -907,6 +911,9 @@ pub fn lexsort(columns: &[SortColumn], limit: Option) -> Result, @@ -942,11 +949,8 @@ pub fn lexsort_to_indices( lexicographical_comparator.compare(a, b) }); - Ok(UInt32Array::from( - (&value_indices)[0..len] - .iter() - .map(|i| *i as u32) - .collect::>(), + Ok(UInt32Array::from_iter_values( + value_indices.iter().map(|i| *i as u32), )) } diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index c3aa9ea4c5a7..8af642240e7e 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -73,6 +73,24 @@ //! assert_eq!(&c2_values, &["a", "f", "c", "e"]); //! ``` //! +//! It can also be used to implement a fast multi-column / lexicographic sort +//! +//! ``` +//! # use arrow::row::{RowConverter, SortField}; +//! # use arrow_array::{ArrayRef, UInt32Array}; +//! fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array { +//! let fields = arrays +//! .iter() +//! .map(|a| SortField::new(a.data_type().clone())) +//! .collect(); +//! let mut converter = RowConverter::new(fields); +//! let rows = converter.convert_columns(&arrays).unwrap(); +//! let mut sort: Vec<_> = rows.iter().enumerate().collect(); +//! sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); +//! UInt32Array::from_iter_values(sort.iter().map(|(i, _)| *i as u32)) +//! } +//! ``` +//! //! [non-comparison sorts]:[https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts] //! [radix sort]:[https://en.wikipedia.org/wiki/Radix_sort] //! 
[normalized for sorting]:[https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf] diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 3b89e7982a6b..d07443301c16 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -20,6 +20,8 @@ use crate::array::*; use crate::datatypes::*; use crate::util::test_util::seedable_rng; +use arrow_buffer::Buffer; +use rand::distributions::uniform::SampleUniform; use rand::Rng; use rand::SeedableRng; use rand::{ @@ -187,3 +189,39 @@ pub fn create_fsb_array( })) .unwrap() } + +/// Creates a random (but fixed-seeded) dictionary array of a given size and null density +/// with the provided values array +pub fn create_dict_from_values( + size: usize, + null_density: f32, + values: &dyn Array, +) -> DictionaryArray +where + K: ArrowDictionaryKeyType, + Standard: Distribution, + K::Native: SampleUniform, +{ + let mut rng = seedable_rng(); + let data_type = DataType::Dictionary( + Box::new(K::DATA_TYPE), + Box::new(values.data_type().clone()), + ); + + let min_key = K::Native::from_usize(0).unwrap(); + let max_key = K::Native::from_usize(values.len()).unwrap(); + let keys: Buffer = (0..size).map(|_| rng.gen_range(min_key..max_key)).collect(); + + let nulls: Option = (null_density != 0.) + .then(|| (0..size).map(|_| rng.gen_bool(null_density as _)).collect()); + + let data = ArrayDataBuilder::new(data_type) + .len(size) + .null_bit_buffer(nulls) + .add_buffer(keys) + .add_child_data(values.data().clone()) + .build() + .unwrap(); + + DictionaryArray::from(data) +} From c0d0ac0623170b8c97f1fa40b987ffbbfe6aee03 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Thu, 27 Oct 2022 05:15:37 +0800 Subject: [PATCH 0169/1411] support more fixedoffset tz format (#2936) --- arrow-array/src/timezone.rs | 41 +++++++++++++++++---------- arrow/src/compute/kernels/temporal.rs | 12 ++++++-- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/arrow-array/src/timezone.rs b/arrow-array/src/timezone.rs index 4e60c0c422b6..7bd597904737 100644 --- a/arrow-array/src/timezone.rs +++ b/arrow-array/src/timezone.rs @@ -24,19 +24,24 @@ pub use private::{Tz, TzOffset}; /// Parses a fixed offset of the form "+09:00" fn parse_fixed_offset(tz: &str) -> Result { - if tz.len() != 6 { - return Err(ArrowError::ParseError(format!( - "Invalid timezone \"{}\": Expected format [+-]XX:XX", - tz - ))); + let mut parsed = Parsed::new(); + + if let Ok(fixed_offset) = parse(&mut parsed, tz, StrftimeItems::new("%:z")) + .and_then(|_| parsed.to_fixed_offset()) + { + return Ok(fixed_offset); } - let mut parsed = Parsed::new(); - parse(&mut parsed, tz, StrftimeItems::new("%:z")) + if let Ok(fixed_offset) = parse(&mut parsed, tz, StrftimeItems::new("%#z")) .and_then(|_| parsed.to_fixed_offset()) - .map_err(|e| { - ArrowError::ParseError(format!("Invalid timezone \"{}\": {}", tz, e)) - }) + { + return Ok(fixed_offset); + } + + Err(ArrowError::ParseError(format!( + "Invalid timezone \"{}\": Expected format [+-]XX:XX, [+-]XX, or [+-]XXXX", + tz + ))) } #[cfg(feature = "chrono-tz")] @@ -313,13 +318,19 @@ mod tests { 9 * 60 * 60 ); - let err = "+9:00".parse::().unwrap_err().to_string(); - assert!(err.contains("Invalid timezone"), "{}", err); + let tz = "+09".parse::().unwrap(); + assert_eq!( + tz.offset_from_utc_date(&t).fix().local_minus_utc(), + 9 * 60 * 60 + ); - let err = "+09".parse::().unwrap_err().to_string(); - assert!(err.contains("Invalid timezone"), "{}", err); + let tz = "+0900".parse::().unwrap(); + assert_eq!( + 
tz.offset_from_utc_date(&t).fix().local_minus_utc(), + 9 * 60 * 60 + ); - let err = "+0900".parse::().unwrap_err().to_string(); + let err = "+9:00".parse::().unwrap_err().to_string(); assert!(err.contains("Invalid timezone"), "{}", err); } } diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index ad1bab77388c..abb8b40c2c48 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -1133,8 +1133,16 @@ mod tests { fn test_temporal_array_timestamp_hour_with_timezone_without_colon() { let a = TimestampSecondArray::from(vec![60 * 60 * 10]) .with_timezone("+0100".to_string()); - let err = hour(&a).unwrap_err().to_string(); - assert!(err.contains("Invalid timezone"), "{}", err); + let b = hour(&a).unwrap(); + assert_eq!(11, b.value(0)); + } + + #[test] + fn test_temporal_array_timestamp_hour_with_timezone_without_minutes() { + let a = TimestampSecondArray::from(vec![60 * 60 * 10]) + .with_timezone("+01".to_string()); + let b = hour(&a).unwrap(); + assert_eq!(11, b.value(0)); } #[test] From a9f632c1bd04410c2528543d74151a91f78643cf Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 27 Oct 2022 10:35:17 +1300 Subject: [PATCH 0170/1411] Cleanup orphaned doc comments (#2935) (#2938) --- parquet/src/file/writer.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 8cb6df974e4a..dbbc38461677 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -173,12 +173,6 @@ impl SerializedFileWriter { } /// Closes and finalises file writer, returning the file metadata. - /// - /// All row groups must be appended before this method is called. - /// No writes are allowed after this point. - /// - /// Can be called multiple times. It is up to implementation to either result in - /// no-op, or return an `Err` for subsequent calls. pub fn close(mut self) -> Result { self.assert_previous_writer_closed()?; let metadata = self.write_metadata()?; @@ -431,10 +425,6 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { } /// Closes this row group writer and returns row group metadata. - /// After calling this method row group writer must not be used. - /// - /// Can be called multiple times. In subsequent calls will result in no-op and return - /// already created row group metadata. 
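The net effect of the timezone changes earlier in this series (#2936) is that fixed offsets may be spelled with or without a colon or minutes. A small sketch using the public array API, with values chosen to mirror the tests above:

```
use arrow::array::TimestampSecondArray;
use arrow::compute::kernels::temporal::hour;

fn main() {
    // "+01:00", "+0100" and "+01" all resolve to the same fixed offset.
    for tz in ["+01:00", "+0100", "+01"] {
        let a = TimestampSecondArray::from(vec![60 * 60 * 10])
            .with_timezone(tz.to_string());
        assert_eq!(hour(&a).unwrap().value(0), 11);
    }
}
```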
pub fn close(mut self) -> Result { if self.row_group_metadata.is_none() { self.assert_previous_writer_closed()?; From 994be0589c88fa5137e8db7302e51f9b5a1bc2c2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 27 Oct 2022 11:25:09 +1300 Subject: [PATCH 0171/1411] Benchmark with prepared row converter (#2930) * Benchmark with prepared row converter * Update arrow/benches/row_format.rs Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- arrow/benches/row_format.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs index ff505781a0a1..48bb013116b6 100644 --- a/arrow/benches/row_format.rs +++ b/arrow/benches/row_format.rs @@ -45,6 +45,10 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { let mut converter = RowConverter::new(fields); let rows = converter.convert_columns(&cols).unwrap(); + // using a pre-prepared row converter should be faster than the first time + c.bench_function(&format!("convert_columns_prepared {}", name), |b| { + b.iter(|| black_box(converter.convert_columns(&cols).unwrap())); + }); c.bench_function(&format!("convert_rows {}", name), |b| { b.iter(|| black_box(converter.convert_rows(&rows).unwrap())); From afa8e27ce3aac2720a7684522d84b30811e61c4c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 27 Oct 2022 14:37:53 +1300 Subject: [PATCH 0172/1411] Remove NativeDecimalType (#2945) --- arrow-array/src/types.rs | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 70c43a2a4948..edf6d40f3ae0 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -472,17 +472,6 @@ mod private { impl DecimalTypeSealed for Decimal256Type {} } -/// Trait representing the in-memory layout of a decimal type -pub trait NativeDecimalType: Send + Sync + Copy + AsRef<[u8]> { - fn from_slice(slice: &[u8]) -> Self; -} - -impl NativeDecimalType for [u8; N] { - fn from_slice(slice: &[u8]) -> Self { - slice.try_into().unwrap() - } -} - /// A trait over the decimal types, used by [`DecimalArray`] to provide a generic /// implementation across the various decimal types /// From 4cef58e2becd3e532e0b4c1b672e2a484dfd601e Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Thu, 27 Oct 2022 12:49:22 +0800 Subject: [PATCH 0173/1411] fix datatype for timestamptz debug fmt (#2948) --- arrow-array/src/array/primitive_array.rs | 35 ++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 3105cc6a964d..016e5306cf8f 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -535,8 +535,9 @@ where impl std::fmt::Debug for PrimitiveArray { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "PrimitiveArray<{:?}>\n[\n", T::DATA_TYPE)?; - print_long_array(self, f, |array, index, f| match T::DATA_TYPE { + let data_type = self.data_type(); + write!(f, "PrimitiveArray<{:?}>\n[\n", data_type)?; + print_long_array(self, f, |array, index, f| match data_type { DataType::Date32 | DataType::Date64 => { let v = self.value(index).to_isize().unwrap() as i64; match as_date::(v) { @@ -1322,6 +1323,36 @@ mod tests { ); } + #[test] + fn test_timestamp_with_tz_fmt_debug() { + let arr: PrimitiveArray = + TimestampMillisecondArray::from(vec![ + 1546214400000, + 1546214400000, + 
-1546214400000, + ]) + .with_timezone("Asia/Taipei".to_string()); + assert_eq!( + "PrimitiveArray\n[\n 2018-12-31T00:00:00,\n 2018-12-31T00:00:00,\n 1921-01-02T00:00:00,\n]", + format!("{:?}", arr) + ); + } + + #[test] + fn test_timestamp_with_fixed_offset_tz_fmt_debug() { + let arr: PrimitiveArray = + TimestampMillisecondArray::from(vec![ + 1546214400000, + 1546214400000, + -1546214400000, + ]) + .with_timezone("+08:00".to_string()); + assert_eq!( + "PrimitiveArray\n[\n 2018-12-31T00:00:00,\n 2018-12-31T00:00:00,\n 1921-01-02T00:00:00,\n]", + format!("{:?}", arr) + ); + } + #[test] fn test_date32_fmt_debug() { let arr: PrimitiveArray = vec![12356, 13548, -365].into(); From d625f0adecbc33c9bd23896ef61bbdcf7cf58d53 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Thu, 27 Oct 2022 13:05:18 +0800 Subject: [PATCH 0174/1411] Fix chrono-rs clippy (#2949) * fix chrono-rs-clippy * enable chrono-tz in clippy --- .github/workflows/arrow.yml | 2 +- arrow/src/compute/kernels/cast.rs | 3 +-- arrow/src/compute/kernels/temporal.rs | 6 ++---- arrow/src/csv/writer.rs | 18 +++++++++--------- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 4c395cf64c26..6651b394a2f0 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -207,4 +207,4 @@ jobs: run: cargo clippy -p arrow-select --all-targets --all-features - name: Clippy arrow run: | - cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict --all-targets -- -D warnings + cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 2380aa166912..d354a95f0a2d 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -5499,8 +5499,7 @@ mod tests { .with_timezone(tz_name.clone()), ), Arc::new( - TimestampNanosecondArray::from(vec![1000, 2000]) - .with_timezone(tz_name.clone()), + TimestampNanosecondArray::from(vec![1000, 2000]).with_timezone(tz_name), ), Arc::new(Date32Array::from(vec![1000, 2000])), Arc::new(Date64Array::from(vec![1000, 2000])), diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index abb8b40c2c48..412adb9a9fe9 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -1178,10 +1178,8 @@ mod tests { // The offset (difference to UTC) is +11:00. Note that daylight savings is in effect on 2021-10-30. // When daylight savings is not in effect, Australia/Sydney has an offset difference of +10:00. - let a = TimestampMillisecondArray::from_opt_vec( - vec![Some(1635577147000)], - Some("Australia/Sydney".to_string()), - ); + let a = TimestampMillisecondArray::from(vec![Some(1635577147000)]) + .with_timezone("Australia/Sydney".to_string()); let b = hour(&a).unwrap(); assert_eq!(17, b.value(0)); } diff --git a/arrow/src/csv/writer.rs b/arrow/src/csv/writer.rs index 330959096a04..fb3348d944f3 100644 --- a/arrow/src/csv/writer.rs +++ b/arrow/src/csv/writer.rs @@ -610,19 +610,19 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None), true), ]); - let c1 = TimestampMillisecondArray::from_opt_vec( + let c1 = TimestampMillisecondArray::from( // 1555584887 converts to 2019-04-18, 20:54:47 in time zone Australia/Sydney (AEST). 
// The offset (difference to UTC) is +10:00. // 1635577147 converts to 2021-10-30 17:59:07 in time zone Australia/Sydney (AEDT) // The offset (difference to UTC) is +11:00. Note that daylight savings is in effect on 2021-10-30. // vec![Some(1555584887378), Some(1635577147000)], - Some("Australia/Sydney".to_string()), - ); - let c2 = TimestampMillisecondArray::from_opt_vec( - vec![Some(1555584887378), Some(1635577147000)], - None, - ); + ) + .with_timezone("Australia/Sydney".to_string()); + let c2 = TimestampMillisecondArray::from(vec![ + Some(1555584887378), + Some(1635577147000), + ]); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]) .unwrap(); @@ -711,7 +711,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo ]; let c1 = Date32Array::from(vec![3, 2, 1]); let c2 = Date64Array::from(vec![3, 2, 1]); - let c3 = TimestampNanosecondArray::from_vec(nanoseconds.clone(), None); + let c3 = TimestampNanosecondArray::from(nanoseconds.clone()); let batch = RecordBatch::try_new( Arc::new(schema.clone()), @@ -756,7 +756,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo let expected = vec![Some(3), Some(2), Some(1)]; assert_eq!(actual, expected); let actual = c3.into_iter().collect::>(); - let expected = nanoseconds.into_iter().map(|x| Some(x)).collect::>(); + let expected = nanoseconds.into_iter().map(Some).collect::>(); assert_eq!(actual, expected); } } From 66ea66bde115a3479efa879eaf8b437d8a11bfc2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 28 Oct 2022 08:38:33 +1300 Subject: [PATCH 0175/1411] Specialize interleave string ~2-3x faster (#2944) * Add interleave string benchmark * Specialize interleave strings (#2864) * Review feedback --- arrow-array/src/array/string_array.rs | 10 +-- arrow-select/src/interleave.rs | 107 ++++++++++++++++++++------ arrow/benches/interleave_kernels.rs | 43 +++++------ 3 files changed, 108 insertions(+), 52 deletions(-) diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 7e2ed3667e22..0cf45a448593 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -90,8 +90,8 @@ impl GenericStringArray { /// caller is responsible for ensuring that index is within the array bounds #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &str { - let end = self.value_offsets().get_unchecked(i + 1); - let start = self.value_offsets().get_unchecked(i); + let end = self.value_offsets().get_unchecked(i + 1).as_usize(); + let start = self.value_offsets().get_unchecked(i).as_usize(); // Soundness // pointer alignment & location is ensured by RawPtrBox @@ -103,10 +103,8 @@ impl GenericStringArray { // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait, // both of which should cleanly cast to isize on an architecture that supports // 32/64-bit offsets - let slice = std::slice::from_raw_parts( - self.value_data.as_ptr().offset(start.to_isize().unwrap()), - (*end - *start).to_usize().unwrap(), - ); + let slice = + std::slice::from_raw_parts(self.value_data.as_ptr().add(start), end - start); std::str::from_utf8_unchecked(slice) } diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs index 29f75894dcb9..9b3de8501326 100644 --- a/arrow-select/src/interleave.rs +++ b/arrow-select/src/interleave.rs @@ -16,11 +16,11 @@ // under the License. 
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; -use arrow_array::cast::as_primitive_array; use arrow_array::{ downcast_primitive, make_array, new_empty_array, Array, ArrayRef, ArrowPrimitiveType, - PrimitiveArray, + GenericStringArray, OffsetSizeTrait, PrimitiveArray, }; +use arrow_buffer::{Buffer, MutableBuffer}; use arrow_data::transform::MutableArrayData; use arrow_data::ArrayDataBuilder; use arrow_schema::{ArrowError, DataType}; @@ -85,51 +85,110 @@ pub fn interleave( downcast_primitive! { data_type => (primitive_helper, values, indices, data_type), + DataType::Utf8 => interleave_string::(values, indices, data_type), + DataType::LargeUtf8 => interleave_string::(values, indices, data_type), _ => interleave_fallback(values, indices) } } +/// Common functionality for interleaving arrays +/// +/// T is the concrete Array type +struct Interleave<'a, T> { + /// The input arrays downcast to T + arrays: Vec<&'a T>, + /// The number of nulls in the interleaved output + null_count: usize, + /// The null buffer of the interleaved output + nulls: Option, +} + +impl<'a, T: Array + 'static> Interleave<'a, T> { + fn new(values: &[&'a dyn Array], indices: &'a [(usize, usize)]) -> Self { + let mut has_nulls = false; + let arrays: Vec<&T> = values + .iter() + .map(|x| { + has_nulls = has_nulls || x.null_count() != 0; + x.as_any().downcast_ref().unwrap() + }) + .collect(); + + let mut null_count = 0; + let nulls = has_nulls.then(|| { + let mut builder = BooleanBufferBuilder::new(indices.len()); + for (a, b) in indices { + let v = arrays[*a].is_valid(*b); + null_count += !v as usize; + builder.append(v) + } + builder.finish() + }); + + Self { + arrays, + null_count, + nulls, + } + } +} + fn interleave_primitive( values: &[&dyn Array], indices: &[(usize, usize)], data_type: &DataType, ) -> Result { - let mut has_nulls = false; - let cast: Vec<_> = values - .iter() - .map(|x| { - has_nulls = has_nulls || x.null_count() != 0; - as_primitive_array::(*x) - }) - .collect(); + let interleaved = Interleave::<'_, PrimitiveArray>::new(values, indices); let mut values = BufferBuilder::::new(indices.len()); for (a, b) in indices { - let v = cast[*a].value(*b); + let v = interleaved.arrays[*a].value(*b); values.append(v) } - let mut null_count = 0; - let nulls = has_nulls.then(|| { - let mut builder = BooleanBufferBuilder::new(indices.len()); - for (a, b) in indices { - let v = cast[*a].is_valid(*b); - null_count += !v as usize; - builder.append(v) - } - builder.finish() - }); - let builder = ArrayDataBuilder::new(data_type.clone()) .len(indices.len()) .add_buffer(values.finish()) - .null_bit_buffer(nulls) - .null_count(null_count); + .null_bit_buffer(interleaved.nulls) + .null_count(interleaved.null_count); let data = unsafe { builder.build_unchecked() }; Ok(Arc::new(PrimitiveArray::::from(data))) } +fn interleave_string( + values: &[&dyn Array], + indices: &[(usize, usize)], + data_type: &DataType, +) -> Result { + let interleaved = Interleave::<'_, GenericStringArray>::new(values, indices); + + let mut capacity = 0; + let mut offsets = BufferBuilder::::new(indices.len() + 1); + offsets.append(O::from_usize(0).unwrap()); + for (a, b) in indices { + let o = interleaved.arrays[*a].value_offsets(); + let element_len = o[*b + 1].as_usize() - o[*b].as_usize(); + capacity += element_len; + offsets.append(O::from_usize(capacity).expect("overflow")); + } + + let mut values = MutableBuffer::new(capacity); + for (a, b) in indices { + values.extend_from_slice(interleaved.arrays[*a].value(*b).as_bytes()); + } 
+ + let builder = ArrayDataBuilder::new(data_type.clone()) + .len(indices.len()) + .add_buffer(offsets.finish()) + .add_buffer(values.into()) + .null_bit_buffer(interleaved.nulls) + .null_count(interleaved.null_count); + + let data = unsafe { builder.build_unchecked() }; + Ok(Arc::new(GenericStringArray::::from(data))) +} + /// Fallback implementation of interleave using [`MutableArrayData`] fn interleave_fallback( values: &[&dyn Array], diff --git a/arrow/benches/interleave_kernels.rs b/arrow/benches/interleave_kernels.rs index 6cf56eb98950..0c3eec60c0ce 100644 --- a/arrow/benches/interleave_kernels.rs +++ b/arrow/benches/interleave_kernels.rs @@ -60,31 +60,30 @@ fn do_bench( } fn add_benchmark(c: &mut Criterion) { - let a = create_primitive_array::(1024, 0.); + let i32 = create_primitive_array::(1024, 0.); + let i32_opt = create_primitive_array::(1024, 0.5); + let string = create_string_array_with_len::(1024, 0., 20); + let string_opt = create_string_array_with_len::(1024, 0.5, 20); - do_bench(c, "i32(0.0)", 100, &a, &[0..100, 100..230, 450..1000]); - do_bench(c, "i32(0.0)", 400, &a, &[0..100, 100..230, 450..1000]); - do_bench(c, "i32(0.0)", 1024, &a, &[0..100, 100..230, 450..1000]); - do_bench( - c, - "i32(0.0)", - 1024, - &a, - &[0..100, 100..230, 450..1000, 0..1000], - ); + let cases: &[(&str, &dyn Array)] = &[ + ("i32(0.0)", &i32), + ("i32(0.5)", &i32_opt), + ("str(20, 0.0)", &string), + ("str(20, 0.5)", &string_opt), + ]; - let a = create_primitive_array::(1024, 0.5); + for (prefix, base) in cases { + let slices: &[(usize, &[_])] = &[ + (100, &[0..100, 100..230, 450..1000]), + (400, &[0..100, 100..230, 450..1000]), + (1024, &[0..100, 100..230, 450..1000]), + (1024, &[0..100, 100..230, 450..1000, 0..1000]), + ]; - do_bench(c, "i32(0.5)", 100, &a, &[0..100, 100..230, 450..1000]); - do_bench(c, "i32(0.5)", 400, &a, &[0..100, 100..230, 450..1000]); - do_bench(c, "i32(0.5)", 1024, &a, &[0..100, 100..230, 450..1000]); - do_bench( - c, - "i32(0.5)", - 1024, - &a, - &[0..100, 100..230, 450..1000, 0..1000], - ); + for (len, slice) in slices { + do_bench(c, prefix, *len, *base, slice); + } + } } criterion_group!(benches, add_benchmark); From 880c4d98e6d570db701fb013f3abf5e5e6f42e32 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 28 Oct 2022 08:38:48 +1300 Subject: [PATCH 0176/1411] Add optional page row count limit for parquet `WriterProperties` (#2941) (#2942) * Add page row count limit (#2941) * Apply suggestions from code review Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- parquet/src/column/writer/mod.rs | 6 ++-- parquet/src/file/properties.rs | 43 ++++++++++++++++++++++++++-- parquet/tests/arrow_writer_layout.rs | 30 ++++++++++++++++++- 3 files changed, 73 insertions(+), 6 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 0f96b6fd78e5..f9b429f5bc72 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -569,11 +569,13 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // // In such a scenario the dictionary decoder may return an estimated encoded // size in excess of the page size limit, even when there are no buffered values - if self.encoder.num_values() == 0 { + if self.page_metrics.num_buffered_values == 0 { return false; } - self.encoder.estimated_data_page_size() >= self.props.data_pagesize_limit() + self.page_metrics.num_buffered_rows as usize + >= self.props.data_page_row_count_limit() + || 
self.encoder.estimated_data_page_size() >= self.props.data_pagesize_limit() } /// Performs dictionary fallback. diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 57dae323d892..11fb13b4bd68 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -96,6 +96,7 @@ pub type WriterPropertiesPtr = Arc; pub struct WriterProperties { data_pagesize_limit: usize, dictionary_pagesize_limit: usize, + data_page_row_count_limit: usize, write_batch_size: usize, max_row_group_size: usize, writer_version: WriterVersion, @@ -112,15 +113,29 @@ impl WriterProperties { } /// Returns data page size limit. + /// + /// Note: this is a best effort limit based on the write batch size pub fn data_pagesize_limit(&self) -> usize { self.data_pagesize_limit } /// Returns dictionary page size limit. + /// + /// Note: this is a best effort limit based on the write batch size pub fn dictionary_pagesize_limit(&self) -> usize { self.dictionary_pagesize_limit } + /// Returns the maximum page row count + /// + /// This can be used to limit the number of rows within a page to + /// yield better page pruning + /// + /// Note: this is a best effort limit based on the write batch size + pub fn data_page_row_count_limit(&self) -> usize { + self.data_page_row_count_limit + } + /// Returns configured batch size for writes. /// /// When writing a batch of data, this setting allows to split it internally into @@ -222,6 +237,7 @@ impl WriterProperties { pub struct WriterPropertiesBuilder { data_pagesize_limit: usize, dictionary_pagesize_limit: usize, + data_page_row_count_limit: usize, write_batch_size: usize, max_row_group_size: usize, writer_version: WriterVersion, @@ -237,6 +253,7 @@ impl WriterPropertiesBuilder { Self { data_pagesize_limit: DEFAULT_PAGE_SIZE, dictionary_pagesize_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT, + data_page_row_count_limit: usize::MAX, write_batch_size: DEFAULT_WRITE_BATCH_SIZE, max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE, writer_version: DEFAULT_WRITER_VERSION, @@ -252,6 +269,7 @@ impl WriterPropertiesBuilder { WriterProperties { data_pagesize_limit: self.data_pagesize_limit, dictionary_pagesize_limit: self.dictionary_pagesize_limit, + data_page_row_count_limit: self.data_page_row_count_limit, write_batch_size: self.write_batch_size, max_row_group_size: self.max_row_group_size, writer_version: self.writer_version, @@ -271,19 +289,38 @@ impl WriterPropertiesBuilder { self } - /// Sets data page size limit. + /// Sets best effort maximum size of a data page in bytes + /// + /// Note: this is a best effort limit based on the write batch size pub fn set_data_pagesize_limit(mut self, value: usize) -> Self { self.data_pagesize_limit = value; self } - /// Sets dictionary page size limit. + /// Sets best effort maximum number of rows in a data page + /// + /// + /// This can be used to limit the number of rows within a page to + /// yield better page pruning + /// + /// Note: this is a best effort limit based on the write batch size + pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self { + self.data_page_row_count_limit = value; + self + } + + /// Sets best effort maximum dictionary page size, in bytes + /// + /// Note: this is a best effort limit based on the write batch size pub fn set_dictionary_pagesize_limit(mut self, value: usize) -> Self { self.dictionary_pagesize_limit = value; self } - /// Sets write batch size. 
+ /// Sets write batch size + /// + /// Data is written in batches of this size, acting as an upper-bound on + /// the enforcement granularity of page limits pub fn set_write_batch_size(mut self, value: usize) -> Self { self.write_batch_size = value; self diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index 40076add325a..e43456eb6f40 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -249,7 +249,7 @@ fn test_primitive() { do_test(LayoutTest { props, - batches: vec![batch], + batches: vec![batch.clone()], layout: Layout { row_groups: vec![RowGroup { columns: vec![ColumnChunk { @@ -308,6 +308,34 @@ fn test_primitive() { }], }, }); + + // Test row count limit + let props = WriterProperties::builder() + .set_dictionary_enabled(false) + .set_data_page_row_count_limit(100) + .set_write_batch_size(100) + .build(); + + do_test(LayoutTest { + props, + batches: vec![batch], + layout: Layout { + row_groups: vec![RowGroup { + columns: vec![ColumnChunk { + pages: (0..20) + .map(|_| Page { + rows: 100, + page_header_size: 34, + compressed_size: 400, + encoding: Encoding::PLAIN, + page_type: PageType::DATA_PAGE, + }) + .collect(), + dictionary_page: None, + }], + }], + }, + }); } #[test] From 4e1247e8c03f36940a912256e2d94f49a1b581df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Gallego=20Castellanos?= Date: Thu, 27 Oct 2022 22:59:25 +0200 Subject: [PATCH 0177/1411] Added support for LZ4_RAW compression. (#1604) (#2943) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added support for LZ4_RAW compression. (#1604) * This adds the implementation of LZ4_RAW codec by using lz4 block compression algorithm. (#1604) * This commit uses https://stackoverflow.com/questions/25740471/lz4-library-decompressed-data-upper-bound-size-estimation formula to estime the size of the uncompressed size. As it said in thread this algorithm over-estimates the size, but it is probably the best we can get with the current decompress API. As the size of a arrow LZ4_RAW block is not prepended to the block. * Other option would be to take the C++ approach to bypass the API (https://github.com/apache/arrow/blob/master/cpp/src/arrow/util/compression_lz4.cc#L343). This approach consists on relaying on the output_buffer capacity to guess the uncompress_size. This works as `serialized_reader.rs` already knows the uncompressed_size, as it reads it from the page header, and allocates the output_buffer with a capacity equal to the uncompress_size (https://github.com/marioloko/arrow-rs/blob/master/parquet/src/file/serialized_reader.rs#L417). I did not follow this approach because: 1. It is too hacky. 2. It will limit the use cases of the `decompress` API, as the caller will need to know to allocate the right uncompressed_size. 3. It is not compatible with the current set of tests. However, new test can be created. 
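An illustrative sketch (not part of the patch) of the over-allocate-then-truncate approach described above, using the same `lz4` crate block API this patch adds to parquet/src/compression.rs; the standalone function name `decompress_lz4_raw` is hypothetical and shown only to make the flow concrete:

    // Worst-case bound from the linked thread; mirrors max_uncompressed_size()
    // in the new LZ4RawCodec. It over-estimates, but an LZ4_RAW block does not
    // record its own uncompressed length, so a safe upper bound is needed.
    fn max_uncompressed_size(compressed_size: usize) -> usize {
        (compressed_size << 8) - compressed_size - 2526
    }

    // Hypothetical free function sketching the decompress flow of the new codec:
    // grow the output buffer to the bound, decompress into it, then truncate to
    // the number of bytes actually written.
    fn decompress_lz4_raw(input: &[u8], output: &mut Vec<u8>) -> std::io::Result<usize> {
        let offset = output.len();
        let bound = max_uncompressed_size(input.len());
        output.resize(offset + bound, 0);
        let n = lz4::block::decompress_to_buffer(
            input,
            Some(bound.try_into().unwrap()),
            &mut output[offset..],
        )?;
        output.truncate(offset + n);
        Ok(n)
    }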
* Clippy * Add integration test Co-authored-by: Adrián Gallego Castellanos Co-authored-by: Raphael Taylor-Davies --- parquet/src/arrow/arrow_reader/mod.rs | 31 +++++++++++++ parquet/src/basic.rs | 4 ++ parquet/src/compression.rs | 64 +++++++++++++++++++++++++++ 3 files changed, 99 insertions(+) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 51b09302cdf1..7f68b07eb487 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -2390,4 +2390,35 @@ mod tests { assert_eq!(full.column(idx), projected.column(0)); } } + + #[test] + fn test_read_lz4_raw() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{}/lz4_raw_compressed.parquet", testdata); + let file = File::open(&path).unwrap(); + + let batches = ParquetRecordBatchReader::try_new(file, 1024) + .unwrap() + .collect::>>() + .unwrap(); + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + + assert_eq!(batch.num_columns(), 3); + assert_eq!(batch.num_rows(), 4); + + // https://github.com/apache/parquet-testing/pull/18 + let a: &Int64Array = batch.column(0).as_any().downcast_ref().unwrap(); + assert_eq!( + a.values(), + &[1593604800, 1593604800, 1593604801, 1593604801] + ); + + let a: &BinaryArray = batch.column(1).as_any().downcast_ref().unwrap(); + let a: Vec<_> = a.iter().flatten().collect(); + assert_eq!(a, &[b"abc", b"def", b"abc", b"def"]); + + let a: &Float64Array = batch.column(2).as_any().downcast_ref().unwrap(); + assert_eq!(a.values(), &[42.000000, 7.700000, 42.125000, 7.700000]); + } } diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index b0f591c7a9f7..96cdd537dbeb 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -282,6 +282,7 @@ pub enum Encoding { /// Supported compression algorithms. 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(non_camel_case_types)] pub enum Compression { UNCOMPRESSED, SNAPPY, @@ -290,6 +291,7 @@ pub enum Compression { BROTLI, LZ4, ZSTD, + LZ4_RAW, } // ---------------------------------------------------------------------- @@ -826,6 +828,7 @@ impl TryFrom for Compression { parquet::CompressionCodec::BROTLI => Compression::BROTLI, parquet::CompressionCodec::LZ4 => Compression::LZ4, parquet::CompressionCodec::ZSTD => Compression::ZSTD, + parquet::CompressionCodec::LZ4_RAW => Compression::LZ4_RAW, _ => { return Err(general_err!( "unexpected parquet compression codec: {}", @@ -846,6 +849,7 @@ impl From for parquet::CompressionCodec { Compression::BROTLI => parquet::CompressionCodec::BROTLI, Compression::LZ4 => parquet::CompressionCodec::LZ4, Compression::ZSTD => parquet::CompressionCodec::ZSTD, + Compression::LZ4_RAW => parquet::CompressionCodec::LZ4_RAW, } } } diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index ee5141cbe140..f110e3d8272a 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -77,6 +77,8 @@ pub fn create_codec(codec: CodecType) -> Result>> { CodecType::LZ4 => Ok(Some(Box::new(LZ4Codec::new()))), #[cfg(any(feature = "zstd", test))] CodecType::ZSTD => Ok(Some(Box::new(ZSTDCodec::new()))), + #[cfg(any(feature = "lz4", test))] + CodecType::LZ4_RAW => Ok(Some(Box::new(LZ4RawCodec::new()))), CodecType::UNCOMPRESSED => Ok(None), _ => Err(nyi_err!("The codec type {} is not supported yet", codec)), } @@ -325,6 +327,63 @@ mod zstd_codec { #[cfg(any(feature = "zstd", test))] pub use zstd_codec::*; +#[cfg(any(feature = "lz4", test))] +mod lz4_raw_codec { + use crate::compression::Codec; + use crate::errors::Result; + + /// Codec for LZ4 Raw compression algorithm. + pub struct LZ4RawCodec {} + + impl LZ4RawCodec { + /// Creates new LZ4 Raw compression codec. + pub(crate) fn new() -> Self { + Self {} + } + } + + // Compute max LZ4 uncompress size. 
+ // Check https://stackoverflow.com/questions/25740471/lz4-library-decompressed-data-upper-bound-size-estimation + fn max_uncompressed_size(compressed_size: usize) -> usize { + (compressed_size << 8) - compressed_size - 2526 + } + + impl Codec for LZ4RawCodec { + fn decompress( + &mut self, + input_buf: &[u8], + output_buf: &mut Vec, + ) -> Result { + let offset = output_buf.len(); + let required_len = max_uncompressed_size(input_buf.len()); + output_buf.resize(offset + required_len, 0); + let required_len: i32 = required_len.try_into().unwrap(); + match lz4::block::decompress_to_buffer(input_buf, Some(required_len), &mut output_buf[offset..]) { + Ok(n) => { + output_buf.truncate(offset + n); + Ok(n) + }, + Err(e) => Err(e.into()), + } + } + + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { + let offset = output_buf.len(); + let required_len = lz4::block::compress_bound(input_buf.len())?; + output_buf.resize(offset + required_len, 0); + match lz4::block::compress_to_buffer(input_buf, None, false, &mut output_buf[offset..]) { + Ok(n) => { + output_buf.truncate(offset + n); + Ok(()) + }, + Err(e) => Err(e.into()), + } + } + } +} +#[cfg(any(feature = "lz4", test))] +pub use lz4_raw_codec::*; + #[cfg(test)] mod tests { use super::*; @@ -416,4 +475,9 @@ mod tests { fn test_codec_zstd() { test_codec(CodecType::ZSTD); } + + #[test] + fn test_codec_lz4_raw() { + test_codec(CodecType::LZ4_RAW); + } } From 63417b1f36461d4abf7c0aedc6f49e740a92e687 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 27 Oct 2022 14:46:02 -0700 Subject: [PATCH 0178/1411] Add pow to i256 (#2955) --- arrow-buffer/src/bigint.rs | 82 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index 3518b85e4eb8..fe135b329b58 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -305,6 +305,55 @@ impl i256 { let (val, overflow) = Self::from_bigint_with_overflow(l % r); (!overflow).then_some(val) } + + /// Performs checked exponentiation + #[inline] + pub fn checked_pow(self, mut exp: u32) -> Option { + if exp == 0 { + return Some(i256::from_i128(1)); + } + + let mut base = self; + let mut acc: Self = i256::from_i128(1); + + while exp > 1 { + if (exp & 1) == 1 { + acc = acc.checked_mul(base)?; + } + exp /= 2; + base = base.checked_mul(base)?; + } + // since exp!=0, finally the exp must be 1. + // Deal with the final bit of the exponent separately, since + // squaring the base afterwards is not necessary and may cause a + // needless overflow. + acc.checked_mul(base) + } + + /// Performs wrapping exponentiation + #[inline] + pub fn wrapping_pow(self, mut exp: u32) -> Self { + if exp == 0 { + return i256::from_i128(1); + } + + let mut base = self; + let mut acc: Self = i256::from_i128(1); + + while exp > 1 { + if (exp & 1) == 1 { + acc = acc.wrapping_mul(base); + } + exp /= 2; + base = base.wrapping_mul(base); + } + + // since exp!=0, finally the exp must be 1. + // Deal with the final bit of the exponent separately, since + // squaring the base afterwards is not necessary and may cause a + // needless overflow. 
+ acc.wrapping_mul(base) + } } /// Performs an unsigned multiplication of `a * b` returning a tuple of @@ -455,6 +504,39 @@ mod tests { expected ), } + + // Exponentiation + for exp in vec![0, 1, 3, 8, 100].into_iter() { + let actual = il.wrapping_pow(exp); + let (expected, overflow) = + i256::from_bigint_with_overflow(bl.clone().pow(exp)); + assert_eq!(actual.to_string(), expected.to_string()); + + let checked = il.checked_pow(exp); + match overflow { + true => assert!( + checked.is_none(), + "{} ^ {} = {} vs {} * {} = {}", + il, + exp, + actual, + bl, + exp, + expected + ), + false => assert_eq!( + checked.unwrap(), + actual, + "{} ^ {} = {} vs {} * {} = {}", + il, + exp, + actual, + bl, + exp, + expected + ), + } + } } #[test] From b4872b7daeaf4c58be021e3880fef3801f541bb7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 28 Oct 2022 10:57:08 +1300 Subject: [PATCH 0179/1411] Cleanup CI (#2933) * Cleanup CI * Fix object_store emulator tests * Update .github/actions/setup-builder/action.yaml Co-authored-by: Andrew Lamb * Fix formatting Co-authored-by: Andrew Lamb --- .github/actions/setup-builder/action.yaml | 18 +++- .github/workflows/arrow.yml | 100 +++++++--------------- .github/workflows/arrow_flight.yml | 14 +-- .github/workflows/docs.yml | 3 +- .github/workflows/miri.yaml | 3 +- .github/workflows/object_store.yml | 34 +++++--- .github/workflows/parquet.yml | 75 +++++++--------- .github/workflows/parquet_derive.yml | 17 +--- .github/workflows/rust.yml | 12 ++- 9 files changed, 111 insertions(+), 165 deletions(-) diff --git a/.github/actions/setup-builder/action.yaml b/.github/actions/setup-builder/action.yaml index a4d4d392191f..865ff66b9d09 100644 --- a/.github/actions/setup-builder/action.yaml +++ b/.github/actions/setup-builder/action.yaml @@ -20,8 +20,12 @@ description: 'Prepare Rust Build Environment' inputs: rust-version: description: 'version of rust to install (e.g. stable)' - required: true + required: false default: 'stable' + target: + description: 'target architecture(s)' + required: false + default: 'x86_64-unknown-linux-gnu' runs: using: "composite" steps: @@ -51,7 +55,13 @@ runs: shell: bash run: | echo "Installing ${{ inputs.rust-version }}" - rustup toolchain install ${{ inputs.rust-version }} + rustup toolchain install ${{ inputs.rust-version }} --target ${{ inputs.target }} rustup default ${{ inputs.rust-version }} - rustup target add wasm32-unknown-unknown - echo "CARGO_TARGET_DIR=/github/home/target" >> $GITHUB_ENV + - name: Disable debuginfo generation + # Disable full debug symbol generation to speed up CI build and keep memory down + # "1" means line tables only, which is useful for panic tracebacks. + shell: bash + run: echo "RUSTFLAGS=-C debuginfo=1" >> $GITHUB_ENV + - name: Enable backtraces + shell: bash + run: echo "RUST_BACKTRACE=1" >> $GITHUB_ENV diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 6651b394a2f0..868741c33cfa 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -42,18 +42,12 @@ jobs: runs-on: ubuntu-latest container: image: amd64/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" steps: - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - with: - rust-version: stable - name: Test arrow-buffer with all features run: cargo test -p arrow-buffer --all-features - name: Test arrow-data with all features @@ -66,11 +60,10 @@ jobs: run: cargo test -p arrow-select --all-features - name: Test arrow-integration-test with all features run: cargo test -p arrow-integration-test --all-features - - name: Test arrow + - name: Test arrow with default features run: cargo test -p arrow - - name: Test --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,dyn_arith_dict,chrono-tz - run: | - cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,dyn_arith_dict,chrono-tz + - name: Test arrow with all features apart from simd + run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,dyn_arith_dict,chrono-tz - name: Run examples run: | # Test arrow examples @@ -87,39 +80,26 @@ jobs: runs-on: ubuntu-latest container: image: amd64/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" steps: - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - with: - rust-version: stable - name: Check compilation - run: | - cargo check -p arrow + run: cargo check -p arrow - name: Check compilation --no-default-features - run: | - cargo check -p arrow --no-default-features + run: cargo check -p arrow --no-default-features - name: Check compilation --all-targets - run: | - cargo check -p arrow --all-targets + run: cargo check -p arrow --all-targets - name: Check compilation --no-default-features --all-targets - run: | - cargo check -p arrow --no-default-features --all-targets + run: cargo check -p arrow --no-default-features --all-targets - name: Check compilation --no-default-features --all-targets --features test_utils - run: | - cargo check -p arrow --no-default-features --all-targets --features test_utils + run: cargo check -p arrow --no-default-features --all-targets --features test_utils - name: Check compilation --no-default-features --all-targets --features ffi - run: | - cargo check -p arrow --no-default-features --all-targets --features ffi + run: cargo check -p arrow --no-default-features --all-targets --features ffi - name: Check compilation --no-default-features --all-targets --features chrono-tz - run: | - cargo check -p arrow --no-default-features --all-targets --features chrono-tz + run: cargo check -p arrow --no-default-features --all-targets --features chrono-tz # test the --features "simd" of the arrow crate. This requires nightly Rust. linux-test-simd: @@ -127,10 +107,6 @@ jobs: runs-on: ubuntu-latest container: image: amd64/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" steps: - uses: actions/checkout@v3 with: @@ -140,46 +116,32 @@ jobs: with: rust-version: nightly - name: Run tests --features "simd" - run: | - cargo test -p arrow --features "simd" + run: cargo test -p arrow --features "simd" - name: Check compilation --features "simd" - run: | - cargo check -p arrow --features simd + run: cargo check -p arrow --features simd - name: Check compilation --features simd --all-targets - run: | - cargo check -p arrow --features simd --all-targets + run: cargo check -p arrow --features simd --all-targets - # test the arrow crate builds against wasm32 in stable rust + # test the arrow crate builds against wasm32 in nightly rust wasm32-build: name: Build wasm32 runs-on: ubuntu-latest container: image: amd64/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" steps: - uses: actions/checkout@v3 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder with: - path: /github/home/.cargo - key: cargo-wasm32-cache3- - - name: Setup Rust toolchain for WASM - run: | - rustup toolchain install nightly - rustup override set nightly - rustup target add wasm32-unknown-unknown - rustup target add wasm32-wasi - - name: Build - run: | - cd arrow - cargo build --no-default-features --features=json,csv,ipc,simd,ffi --target wasm32-unknown-unknown - cargo build --no-default-features --features=json,csv,ipc,simd,ffi --target wasm32-wasi + rust-version: nightly + target: wasm32-unknown-unknown,wasm32-wasi + - name: Build wasm32-unknown-unknown + run: cargo build -p arrow --no-default-features --features=json,csv,ipc,simd,ffi --target wasm32-unknown-unknown + - name: Build wasm32-wasi + run: cargo build -p arrow --no-default-features --features=json,csv,ipc,simd,ffi --target wasm32-wasi clippy: name: Clippy @@ -190,21 +152,17 @@ jobs: - uses: actions/checkout@v3 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - with: - rust-version: stable - name: Setup Clippy - run: | - rustup component add clippy + run: rustup component add clippy - name: Clippy arrow-buffer with all features - run: cargo clippy -p arrow-buffer --all-targets --all-features + run: cargo clippy -p arrow-buffer --all-targets --all-features -- -D warnings - name: Clippy arrow-data with all features - run: cargo clippy -p arrow-data --all-targets --all-features + run: cargo clippy -p arrow-data --all-targets --all-features -- -D warnings - name: Clippy arrow-schema with all features - run: cargo clippy -p arrow-schema --all-targets --all-features + run: cargo clippy -p arrow-schema --all-targets --all-features -- -D warnings - name: Clippy arrow-array with all features - run: cargo clippy -p arrow-array --all-targets --all-features + run: cargo clippy -p arrow-array --all-targets --all-features -- -D warnings - name: Clippy arrow-select with all features - run: cargo clippy -p arrow-select --all-targets --all-features + run: cargo clippy -p arrow-select --all-targets --all-features -- -D warnings - name: Clippy arrow - run: | - cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings + run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings diff --git 
a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index d40c9b6ecee7..548caeb2ab75 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -43,18 +43,12 @@ jobs: runs-on: ubuntu-latest container: image: amd64/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" steps: - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - with: - rust-version: stable - name: Test run: | cargo test -p arrow-flight @@ -73,11 +67,7 @@ jobs: - uses: actions/checkout@v3 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - with: - rust-version: stable - name: Setup Clippy - run: | - rustup component add clippy + run: rustup component add clippy - name: Run clippy - run: | - cargo clippy -p arrow-flight --all-features -- -D warnings + run: cargo clippy -p arrow-flight --all-features -- -D warnings diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 5e82d76febe6..e780226b6e27 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -51,5 +51,4 @@ jobs: with: rust-version: ${{ matrix.rust }} - name: Run cargo doc - run: | - cargo doc --document-private-items --no-deps --workspace --all-features + run: cargo doc --document-private-items --no-deps --workspace --all-features diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index 92d6f2af2a9c..435582347e47 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -49,5 +49,4 @@ jobs: env: RUST_BACKTRACE: full RUST_LOG: "trace" - run: | - bash .github/workflows/miri.sh + run: bash .github/workflows/miri.sh diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 2afcb4344371..370c1ced380f 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -37,11 +37,10 @@ jobs: image: amd64/rust steps: - uses: actions/checkout@v3 - - name: Setup Rust toolchain with clippy - run: | - rustup toolchain install stable - rustup default stable - rustup component add clippy + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + - name: Setup Clippy + run: rustup component add clippy # Run different tests for the library on its own as well as # all targets to ensure that it still works in the absence of # features that might be enabled by dev-dependencies of other @@ -71,8 +70,6 @@ jobs: # Disable full debug symbol generation to speed up CI build and keep memory down # "1" means line tables only, which is useful for panic tracebacks. 
RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" RUST_BACKTRACE: "1" # Run integration tests TEST_INTEGRATION: 1 @@ -121,6 +118,23 @@ jobs: OBJECT_STORE_AWS_ACCESS_KEY_ID: test OBJECT_STORE_AWS_SECRET_ACCESS_KEY: test OBJECT_STORE_AWS_ENDPOINT: http://localhost:4566 - run: | - # run tests - cargo test -p object_store --features=aws,azure,gcp + run: cargo test -p object_store --features=aws,azure,gcp + + # test the object_store crate builds against wasm32 in stable rust + wasm32-build: + name: Build wasm32 + runs-on: ubuntu-latest + container: + image: amd64/rust + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + with: + target: wasm32-unknown-unknown,wasm32-wasi + - name: Build wasm32-unknown-unknown + run: cargo build -p object_store --target wasm32-unknown-unknown + - name: Build wasm32-wasi + run: cargo build -p object_store --target wasm32-wasi \ No newline at end of file diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 550b590737ab..dd1a782c4654 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -43,24 +43,16 @@ jobs: runs-on: ubuntu-latest container: image: amd64/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" steps: - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - with: - rust-version: stable - name: Test - run: | - cargo test -p parquet + run: cargo test -p parquet - name: Test --all-features - run: | - cargo test -p parquet --all-features + run: cargo test -p parquet --all-features # test compilation @@ -69,18 +61,12 @@ jobs: runs-on: ubuntu-latest container: image: amd64/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" steps: - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - with: - rust-version: stable # Run different tests for the library on its own as well as # all targets to ensure that it still works in the absence of @@ -93,35 +79,42 @@ jobs: # 3. compiles with just arrow feature # 3. 
compiles with all features - name: Check compilation - run: | - cargo check -p parquet + run: cargo check -p parquet - name: Check compilation --no-default-features - run: | - cargo check -p parquet --no-default-features + run: cargo check -p parquet --no-default-features - name: Check compilation --no-default-features --features arrow - run: | - cargo check -p parquet --no-default-features --features arrow + run: cargo check -p parquet --no-default-features --features arrow - name: Check compilation --no-default-features --all-features - run: | - cargo check -p parquet --all-features + run: cargo check -p parquet --all-features - name: Check compilation --all-targets - run: | - cargo check -p parquet --all-targets + run: cargo check -p parquet --all-targets - name: Check compilation --all-targets --no-default-features - run: | - cargo check -p parquet --all-targets --no-default-features + run: cargo check -p parquet --all-targets --no-default-features - name: Check compilation --all-targets --no-default-features --features arrow - run: | - cargo check -p parquet --all-targets --no-default-features --features arrow + run: cargo check -p parquet --all-targets --no-default-features --features arrow - name: Check compilation --all-targets --all-features - run: | - cargo check -p parquet --all-targets --all-features + run: cargo check -p parquet --all-targets --all-features - name: Check compilation --all-targets --no-default-features --features json - run: | - cargo check -p parquet --all-targets --no-default-features --features json - - name: Check compilation wasm32-unknown-unknown - run: | - cargo check -p parquet --no-default-features --features cli,snap,flate2,brotli --target wasm32-unknown-unknown + run: cargo check -p parquet --all-targets --no-default-features --features json + + # test the parquet crate builds against wasm32 in stable rust + wasm32-build: + name: Build wasm32 + runs-on: ubuntu-latest + container: + image: amd64/rust + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + with: + target: wasm32-unknown-unknown,wasm32-wasi + - name: Build wasm32-unknown-unknown + run: cargo build -p parquet --no-default-features --features cli,snap,flate2,brotli --target wasm32-unknown-unknown + - name: Build wasm32-wasi + run: cargo build -p parquet --no-default-features --features cli,snap,flate2,brotli --target wasm32-wasi clippy: name: Clippy @@ -132,11 +125,7 @@ jobs: - uses: actions/checkout@v3 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - with: - rust-version: stable - name: Setup Clippy - run: | - rustup component add clippy + run: rustup component add clippy - name: Run clippy - run: | - cargo clippy -p parquet --all-targets --all-features -- -D warnings + run: cargo clippy -p parquet --all-targets --all-features -- -D warnings diff --git a/.github/workflows/parquet_derive.yml b/.github/workflows/parquet_derive.yml index bd70fc30d1c5..e5620769bb3c 100644 --- a/.github/workflows/parquet_derive.yml +++ b/.github/workflows/parquet_derive.yml @@ -39,21 +39,14 @@ jobs: runs-on: ubuntu-latest container: image: amd64/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" steps: - uses: actions/checkout@v3 with: submodules: true - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - with: - rust-version: stable - name: Test - run: | - cargo test -p parquet_derive + run: cargo test -p parquet_derive clippy: name: Clippy @@ -64,11 +57,7 @@ jobs: - uses: actions/checkout@v3 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - with: - rust-version: stable - name: Setup Clippy - run: | - rustup component add clippy + run: rustup component add clippy - name: Run clippy - run: | - cargo clippy -p parquet_derive --all-features -- -D warnings + run: cargo clippy -p parquet_derive --all-features -- -D warnings diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index c04d5643b49a..f4c98c5abad7 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -37,8 +37,7 @@ jobs: with: submodules: true - name: Install protoc with brew - run: | - brew install protobuf + run: brew install protobuf - name: Setup Rust toolchain run: | rustup toolchain install stable --no-self-update @@ -91,10 +90,9 @@ jobs: image: amd64/rust steps: - uses: actions/checkout@v3 - - name: Setup toolchain - run: | - rustup toolchain install stable - rustup default stable - rustup component add rustfmt + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + - name: Setup rustfmt + run: rustup component add rustfmt - name: Run run: cargo fmt --all -- --check From 73416f8e67efe1d0d8a8529c96c099429ab1b366 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 27 Oct 2022 16:53:14 -0700 Subject: [PATCH 0180/1411] Cast numeric to decimal256 (#2923) * Cast numeric to decimal256 * For review * Check scale overflow * Fix clippy --- arrow-buffer/src/bigint.rs | 16 ++ arrow/src/compute/kernels/cast.rs | 243 +++++++++++++++++++++++++++--- 2 files changed, 239 insertions(+), 20 deletions(-) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index fe135b329b58..892c6c99d216 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use num::cast::AsPrimitive; use num::BigInt; use std::cmp::Ordering; @@ -395,6 +396,21 @@ fn mulx(a: u128, b: u128) -> (u128, u128) { (low, high) } +macro_rules! 
define_as_primitive { + ($native_ty:ty) => { + impl AsPrimitive for $native_ty { + fn as_(self) -> i256 { + i256::from_i128(self as i128) + } + } + }; +} + +define_as_primitive!(i8); +define_as_primitive!(i16); +define_as_primitive!(i32); +define_as_primitive!(i64); + #[cfg(test)] mod tests { use super::*; diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index d354a95f0a2d..73868dd98c06 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -88,6 +88,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Decimal256(_, _), Decimal128(_, _)) => true, // signed numeric to decimal (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal128(_, _)) | + (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal256(_, _)) | // decimal to signed numeric (Decimal128(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) | ( @@ -305,8 +306,8 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { cast_with_options(array, to_type, &DEFAULT_CAST_OPTIONS) } -/// Cast the primitive array to defined decimal data type array -fn cast_primitive_to_decimal( +/// Cast the primitive array to defined decimal128 data type array +fn cast_primitive_to_decimal128( array: T, op: F, precision: u8, @@ -324,7 +325,26 @@ where Ok(Arc::new(decimal_array)) } -fn cast_integer_to_decimal( +/// Cast the primitive array to defined decimal256 data type array +fn cast_primitive_to_decimal256( + array: T, + op: F, + precision: u8, + scale: u8, +) -> Result +where + F: Fn(T::Item) -> i256, +{ + #[allow(clippy::redundant_closure)] + let decimal_array = ArrayIter::new(array) + .map(|v| v.map(|v| op(v))) + .collect::() + .with_precision_and_scale(precision, scale)?; + + Ok(Arc::new(decimal_array)) +} + +fn cast_integer_to_decimal128( array: &PrimitiveArray, precision: u8, scale: u8, @@ -334,12 +354,30 @@ where { let mul: i128 = 10_i128.pow(scale as u32); - // with_precision_and_scale validates the - // value is within range for the output precision - cast_primitive_to_decimal(array, |v| v.as_() * mul, precision, scale) + cast_primitive_to_decimal128(array, |v| v.as_() * mul, precision, scale) +} + +fn cast_integer_to_decimal256( + array: &PrimitiveArray, + precision: u8, + scale: u8, +) -> Result +where + ::Native: AsPrimitive, +{ + let mul: i256 = i256::from_i128(10_i128) + .checked_pow(scale as u32) + .ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to Decimal256({}, {}). 
The scale causes overflow.", + precision, scale + )) + })?; + + cast_primitive_to_decimal256(array, |v| v.as_().wrapping_mul(mul), precision, scale) } -fn cast_floating_point_to_decimal( +fn cast_floating_point_to_decimal128( array: &PrimitiveArray, precision: u8, scale: u8, @@ -349,13 +387,22 @@ where { let mul = 10_f64.powi(scale as i32); - cast_primitive_to_decimal( + cast_primitive_to_decimal128(array, |v| (v.as_() * mul) as i128, precision, scale) +} + +fn cast_floating_point_to_decimal256( + array: &PrimitiveArray, + precision: u8, + scale: u8, +) -> Result +where + ::Native: AsPrimitive, +{ + let mul = 10_f64.powi(scale as i32); + + cast_primitive_to_decimal256( array, - |v| { - // with_precision_and_scale validates the - // value is within range for the output precision - (v.as_() * mul) as i128 - }, + |v| i256::from_i128((v.as_() * mul) as i128), precision, scale, ) @@ -545,32 +592,73 @@ pub fn cast_with_options( // cast data to decimal match from_type { // TODO now just support signed numeric to decimal, support decimal to numeric later - Int8 => cast_integer_to_decimal( + Int8 => cast_integer_to_decimal128( + as_primitive_array::(array), + *precision, + *scale, + ), + Int16 => cast_integer_to_decimal128( + as_primitive_array::(array), + *precision, + *scale, + ), + Int32 => cast_integer_to_decimal128( + as_primitive_array::(array), + *precision, + *scale, + ), + Int64 => cast_integer_to_decimal128( + as_primitive_array::(array), + *precision, + *scale, + ), + Float32 => cast_floating_point_to_decimal128( + as_primitive_array::(array), + *precision, + *scale, + ), + Float64 => cast_floating_point_to_decimal128( + as_primitive_array::(array), + *precision, + *scale, + ), + Null => Ok(new_null_array(to_type, array.len())), + _ => Err(ArrowError::CastError(format!( + "Casting from {:?} to {:?} not supported", + from_type, to_type + ))), + } + } + (_, Decimal256(precision, scale)) => { + // cast data to decimal + match from_type { + // TODO now just support signed numeric to decimal, support decimal to numeric later + Int8 => cast_integer_to_decimal256( as_primitive_array::(array), *precision, *scale, ), - Int16 => cast_integer_to_decimal( + Int16 => cast_integer_to_decimal256( as_primitive_array::(array), *precision, *scale, ), - Int32 => cast_integer_to_decimal( + Int32 => cast_integer_to_decimal256( as_primitive_array::(array), *precision, *scale, ), - Int64 => cast_integer_to_decimal( + Int64 => cast_integer_to_decimal256( as_primitive_array::(array), *precision, *scale, ), - Float32 => cast_floating_point_to_decimal( + Float32 => cast_floating_point_to_decimal256( as_primitive_array::(array), *precision, *scale, ), - Float64 => cast_floating_point_to_decimal( + Float64 => cast_floating_point_to_decimal256( as_primitive_array::(array), *precision, *scale, @@ -3071,7 +3159,7 @@ mod tests { #[test] #[cfg(not(feature = "force_validate"))] - fn test_cast_numeric_to_decimal() { + fn test_cast_numeric_to_decimal128() { // test negative cast type let decimal_type = DataType::Decimal128(38, 6); assert!(!can_cast_types(&DataType::UInt64, &decimal_type)); @@ -3184,6 +3272,121 @@ mod tests { ); } + #[test] + #[cfg(not(feature = "force_validate"))] + fn test_cast_numeric_to_decimal256() { + // test negative cast type + let decimal_type = DataType::Decimal256(58, 6); + assert!(!can_cast_types(&DataType::UInt64, &decimal_type)); + + // i8, i16, i32, i64 + let input_datas = vec![ + Arc::new(Int8Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // i8 + 
Arc::new(Int16Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // i16 + Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // i32 + Arc::new(Int64Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // i64 + ]; + for array in input_datas { + generate_cast_test_case!( + &array, + Decimal256Array, + &decimal_type, + vec![ + Some(i256::from_i128(1000000_i128)), + Some(i256::from_i128(2000000_i128)), + Some(i256::from_i128(3000000_i128)), + None, + Some(i256::from_i128(5000000_i128)) + ] + ); + } + + // test i8 to decimal type with overflow the result type + // the 100 will be converted to 1000_i128, but it is out of range for max value in the precision 3. + let array = Int8Array::from(vec![1, 2, 3, 4, 100]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast(&array, &DataType::Decimal256(3, 1)); + assert!(casted_array.is_ok()); + let array = casted_array.unwrap(); + let array: &Decimal256Array = as_primitive_array(&array); + let err = array.validate_decimal_precision(3); + assert_eq!("Invalid argument error: 1000 is too large to store in a Decimal256 of precision 3. Max is 999", err.unwrap_err().to_string()); + + // test f32 to decimal type + let array = Float32Array::from(vec![ + Some(1.1), + Some(2.2), + Some(4.4), + None, + Some(1.123_456_7), + Some(1.123_456_7), + ]); + let array = Arc::new(array) as ArrayRef; + generate_cast_test_case!( + &array, + Decimal256Array, + &decimal_type, + vec![ + Some(i256::from_i128(1100000_i128)), + Some(i256::from_i128(2200000_i128)), + Some(i256::from_i128(4400000_i128)), + None, + Some(i256::from_i128(1123456_i128)), + Some(i256::from_i128(1123456_i128)), + ] + ); + + // test f64 to decimal type + let array = Float64Array::from(vec![ + Some(1.1), + Some(2.2), + Some(4.4), + None, + Some(1.123_456_789_123_4), + Some(1.123_456_789_012_345_6), + Some(1.123_456_789_012_345_6), + ]); + let array = Arc::new(array) as ArrayRef; + generate_cast_test_case!( + &array, + Decimal256Array, + &decimal_type, + vec![ + Some(i256::from_i128(1100000_i128)), + Some(i256::from_i128(2200000_i128)), + Some(i256::from_i128(4400000_i128)), + None, + Some(i256::from_i128(1123456_i128)), + Some(i256::from_i128(1123456_i128)), + Some(i256::from_i128(1123456_i128)), + ] + ); + } + #[test] fn test_cast_i32_to_f64() { let a = Int32Array::from(vec![5, 6, 7, 8, 9]); From b6f08a87e02144277bb0a7aa3708e42f6faf7a26 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 28 Oct 2022 16:04:24 +1300 Subject: [PATCH 0181/1411] Add GenericByteArray (#2946) (#2947) * Add GenericByteArray (#2946) * Lint * Review feedback * Review feedback --- arrow-array/src/array/binary_array.rs | 191 +---------------------- arrow-array/src/array/byte_array.rs | 208 ++++++++++++++++++++++++++ arrow-array/src/array/mod.rs | 3 + arrow-array/src/array/string_array.rs | 178 +--------------------- arrow-array/src/types.rs | 90 ++++++++++- 5 files changed, 309 insertions(+), 361 deletions(-) create mode 100644 arrow-array/src/array/byte_array.rs diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index c8407b252ef1..259d949d42a5 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -15,118 +15,23 @@ // specific language governing permissions and limitations // under the License. 
-use crate::iterator::GenericBinaryIter; -use crate::raw_pointer::RawPtrBox; -use crate::{ - empty_offsets, print_long_array, Array, ArrayAccessor, GenericListArray, - OffsetSizeTrait, -}; +use crate::types::GenericBinaryType; +use crate::{Array, GenericByteArray, GenericListArray, OffsetSizeTrait}; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; -use std::any::Any; /// See [`BinaryArray`] and [`LargeBinaryArray`] for storing /// binary data. -pub struct GenericBinaryArray { - data: ArrayData, - value_offsets: RawPtrBox, - value_data: RawPtrBox, -} +pub type GenericBinaryArray = GenericByteArray>; impl GenericBinaryArray { - /// Data type of the array. - pub const DATA_TYPE: DataType = if OffsetSize::IS_LARGE { - DataType::LargeBinary - } else { - DataType::Binary - }; - /// Get the data type of the array. #[deprecated(note = "please use `Self::DATA_TYPE` instead")] pub const fn get_data_type() -> DataType { Self::DATA_TYPE } - /// Returns the length for value at index `i`. - #[inline] - pub fn value_length(&self, i: usize) -> OffsetSize { - let offsets = self.value_offsets(); - offsets[i + 1] - offsets[i] - } - - /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.data.buffers()[1].clone() - } - - /// Returns the offset values in the offsets buffer - #[inline] - pub fn value_offsets(&self) -> &[OffsetSize] { - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the ArrayData instance. - unsafe { - std::slice::from_raw_parts( - self.value_offsets.as_ptr().add(self.data.offset()), - self.len() + 1, - ) - } - } - - /// Returns the element at index `i` as bytes slice - /// # Safety - /// Caller is responsible for ensuring that the index is within the bounds of the array - pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { - let end = *self.value_offsets().get_unchecked(i + 1); - let start = *self.value_offsets().get_unchecked(i); - - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the value_offset invariants - - // Safety of `to_isize().unwrap()` - // `start` and `end` are &OffsetSize, which is a generic type that implements the - // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait, - // both of which should cleanly cast to isize on an architecture that supports - // 32/64-bit offsets - std::slice::from_raw_parts( - self.value_data.as_ptr().offset(start.to_isize().unwrap()), - (end - start).to_usize().unwrap(), - ) - } - - /// Returns the element at index `i` as bytes slice - /// # Panics - /// Panics if index `i` is out of bounds. - pub fn value(&self, i: usize) -> &[u8] { - assert!( - i < self.data.len(), - "Trying to access an element at index {} from a BinaryArray of length {}", - i, - self.len() - ); - //Soundness: length checked above, offset buffer length is 1 larger than logical array length - let end = unsafe { self.value_offsets().get_unchecked(i + 1) }; - let start = unsafe { self.value_offsets().get_unchecked(i) }; - - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the value_offset invariants - - // Safety of `to_isize().unwrap()` - // `start` and `end` are &OffsetSize, which is a generic type that implements the - // OffsetSizeTrait. 
Currently, only i32 and i64 implement OffsetSizeTrait, - // both of which should cleanly cast to isize on an architecture that supports - // 32/64-bit offsets - unsafe { - std::slice::from_raw_parts( - self.value_data.as_ptr().offset(start.to_isize().unwrap()), - (*end - *start).to_usize().unwrap(), - ) - } - } - /// Creates a [GenericBinaryArray] from a vector of byte slices /// /// See also [`Self::from_iter_values`] @@ -230,85 +135,6 @@ impl GenericBinaryArray { ) -> impl Iterator> + 'a { indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index))) } - - /// constructs a new iterator - pub fn iter(&self) -> GenericBinaryIter<'_, OffsetSize> { - GenericBinaryIter::<'_, OffsetSize>::new(self) - } -} - -impl std::fmt::Debug for GenericBinaryArray { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let prefix = OffsetSize::PREFIX; - - write!(f, "{}BinaryArray\n[\n", prefix)?; - print_long_array(self, f, |array, index, f| { - std::fmt::Debug::fmt(&array.value(index), f) - })?; - write!(f, "]") - } -} - -impl Array for GenericBinaryArray { - fn as_any(&self) -> &dyn Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - fn into_data(self) -> ArrayData { - self.into() - } -} - -impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor - for &'a GenericBinaryArray -{ - type Item = &'a [u8]; - - fn value(&self, index: usize) -> Self::Item { - GenericBinaryArray::value(self, index) - } - - unsafe fn value_unchecked(&self, index: usize) -> Self::Item { - GenericBinaryArray::value_unchecked(self, index) - } -} - -impl From for GenericBinaryArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.data_type(), - &Self::DATA_TYPE, - "[Large]BinaryArray expects Datatype::[Large]Binary" - ); - assert_eq!( - data.buffers().len(), - 2, - "BinaryArray data should contain 2 buffers only (offsets and values)" - ); - // Handle case of empty offsets - let offsets = match data.is_empty() && data.buffers()[0].is_empty() { - true => empty_offsets::().as_ptr() as *const _, - false => data.buffers()[0].as_ptr(), - }; - let values = data.buffers()[1].as_ptr(); - Self { - data, - // SAFETY: - // ArrayData must be valid, and validated data type above - value_offsets: unsafe { RawPtrBox::new(offsets) }, - value_data: unsafe { RawPtrBox::new(values) }, - } - } -} - -impl From> for ArrayData { - fn from(array: GenericBinaryArray) -> Self { - array.data - } } impl From>> @@ -374,15 +200,6 @@ where } } -impl<'a, T: OffsetSizeTrait> IntoIterator for &'a GenericBinaryArray { - type Item = Option<&'a [u8]>; - type IntoIter = GenericBinaryIter<'a, T>; - - fn into_iter(self) -> Self::IntoIter { - GenericBinaryIter::<'a, T>::new(self) - } -} - /// An array where each element contains 0 or more bytes. /// The byte length of each element is represented by an i32. /// @@ -836,7 +653,7 @@ mod tests { } #[test] - #[should_panic(expected = "[Large]BinaryArray expects Datatype::[Large]Binary")] + #[should_panic(expected = "LargeBinaryArray expects DataType::LargeBinary")] fn test_binary_array_validation() { let array = BinaryArray::from_iter_values(&[&[1, 2]]); let _ = LargeBinaryArray::from(array.into_data()); diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs new file mode 100644 index 000000000000..8dd206bd2639 --- /dev/null +++ b/arrow-array/src/array/byte_array.rs @@ -0,0 +1,208 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::array::{empty_offsets, print_long_array}; +use crate::iterator::ArrayIter; +use crate::raw_pointer::RawPtrBox; +use crate::types::bytes::ByteArrayNativeType; +use crate::types::ByteArrayType; +use crate::{Array, ArrayAccessor, OffsetSizeTrait}; +use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_data::ArrayData; +use arrow_schema::DataType; +use std::any::Any; + +/// Generic struct for variable-size byte arrays +/// +/// See [`StringArray`] and [`LargeStringArray`] for storing utf8 encoded string data +/// +/// See [`BinaryArray`] and [`LargeBinaryArray`] for storing arbitrary bytes +/// +/// [`StringArray`]: crate::StringArray +/// [`LargeStringArray`]: crate::LargeStringArray +/// [`BinaryArray`]: crate::BinaryArray +/// [`LargeBinaryArray`]: crate::LargeBinaryArray +pub struct GenericByteArray { + data: ArrayData, + value_offsets: RawPtrBox, + value_data: RawPtrBox, +} + +impl GenericByteArray { + /// Data type of the array. + pub const DATA_TYPE: DataType = T::DATA_TYPE; + + /// Returns the length for value at index `i`. + /// # Panics + /// Panics if index `i` is out of bounds. + #[inline] + pub fn value_length(&self, i: usize) -> T::Offset { + let offsets = self.value_offsets(); + offsets[i + 1] - offsets[i] + } + + /// Returns a clone of the value data buffer + pub fn value_data(&self) -> Buffer { + self.data.buffers()[1].clone() + } + + /// Returns the offset values in the offsets buffer + #[inline] + pub fn value_offsets(&self) -> &[T::Offset] { + // Soundness + // pointer alignment & location is ensured by RawPtrBox + // buffer bounds/offset is ensured by the ArrayData instance. + unsafe { + std::slice::from_raw_parts( + self.value_offsets.as_ptr().add(self.data.offset()), + self.len() + 1, + ) + } + } + + /// Returns the element at index `i` + /// # Safety + /// Caller is responsible for ensuring that the index is within the bounds of the array + pub unsafe fn value_unchecked(&self, i: usize) -> &T::Native { + let end = *self.value_offsets().get_unchecked(i + 1); + let start = *self.value_offsets().get_unchecked(i); + + // Soundness + // pointer alignment & location is ensured by RawPtrBox + // buffer bounds/offset is ensured by the value_offset invariants + + // Safety of `to_isize().unwrap()` + // `start` and `end` are &OffsetSize, which is a generic type that implements the + // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait, + // both of which should cleanly cast to isize on an architecture that supports + // 32/64-bit offsets + let b = std::slice::from_raw_parts( + self.value_data.as_ptr().offset(start.to_isize().unwrap()), + (end - start).to_usize().unwrap(), + ); + + // SAFETY: + // ArrayData is valid + T::Native::from_bytes_unchecked(b) + } + + /// Returns the element at index `i` + /// # Panics + /// Panics if index `i` is out of bounds. 
+ pub fn value(&self, i: usize) -> &T::Native { + assert!( + i < self.data.len(), + "Trying to access an element at index {} from a {}{}Array of length {}", + i, + T::Offset::PREFIX, + T::PREFIX, + self.len() + ); + // SAFETY: + // Verified length above + unsafe { self.value_unchecked(i) } + } + + /// constructs a new iterator + pub fn iter(&self) -> ArrayIter<&Self> { + ArrayIter::new(self) + } +} + +impl std::fmt::Debug for GenericByteArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}{}Array\n[\n", T::Offset::PREFIX, T::PREFIX)?; + print_long_array(self, f, |array, index, f| { + std::fmt::Debug::fmt(&array.value(index), f) + })?; + write!(f, "]") + } +} + +impl Array for GenericByteArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn data(&self) -> &ArrayData { + &self.data + } + + fn into_data(self) -> ArrayData { + self.into() + } +} + +impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray { + type Item = &'a T::Native; + + fn value(&self, index: usize) -> Self::Item { + GenericByteArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + GenericByteArray::value_unchecked(self, index) + } +} + +impl From for GenericByteArray { + fn from(data: ArrayData) -> Self { + assert_eq!( + data.data_type(), + &Self::DATA_TYPE, + "{}{}Array expects DataType::{}", + T::Offset::PREFIX, + T::PREFIX, + Self::DATA_TYPE + ); + assert_eq!( + data.buffers().len(), + 2, + "{}{}Array data should contain 2 buffers only (offsets and values)", + T::Offset::PREFIX, + T::PREFIX, + ); + // Handle case of empty offsets + let offsets = match data.is_empty() && data.buffers()[0].is_empty() { + true => empty_offsets::().as_ptr() as *const _, + false => data.buffers()[0].as_ptr(), + }; + let values = data.buffers()[1].as_ptr(); + Self { + data, + // SAFETY: + // ArrayData must be valid, and validated data type above + value_offsets: unsafe { RawPtrBox::new(offsets) }, + value_data: unsafe { RawPtrBox::new(values) }, + } + } +} + +impl From> for ArrayData { + fn from(array: GenericByteArray) -> Self { + array.data + } +} + +impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray { + type Item = Option<&'a T::Native>; + type IntoIter = ArrayIter; + + fn into_iter(self) -> Self::IntoIter { + ArrayIter::new(self) + } +} diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 1613e4a69b86..41aa438c9fb3 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -31,6 +31,9 @@ pub use binary_array::*; mod boolean_array; pub use boolean_array::*; +mod byte_array; +pub use byte_array::*; + mod dictionary_array; pub use dictionary_array::*; diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 0cf45a448593..94fcbae02e5d 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -15,67 +15,27 @@ // specific language governing permissions and limitations // under the License. 
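The new `GenericByteArray<T>` added above factors the shared offset/value-buffer handling out of the concrete binary and string implementations. As an illustrative sketch (not part of this patch, and assuming only the `arrow_array` exports shown in the surrounding hunks), code written against the old concrete `GenericBinaryArray` keeps compiling unchanged, since the type is now an alias for `GenericByteArray<GenericBinaryType<OffsetSize>>` with the same accessor surface:

```rust
use arrow_array::{Array, BinaryArray};

fn main() {
    // BinaryArray is now an alias for GenericByteArray<GenericBinaryType<i32>>,
    // but value/value_length/iter behave exactly as before the refactor.
    let values: Vec<&[u8]> = vec![b"one", b"two", b"three"];
    let array = BinaryArray::from_iter_values(values.clone());

    assert_eq!(array.len(), 3);
    assert_eq!(array.value(1), b"two");
    assert_eq!(array.value_length(2), 5); // "three" is 5 bytes

    // Iteration still yields Option<&[u8]>.
    let round_trip: Vec<&[u8]> = array.iter().flatten().collect();
    assert_eq!(round_trip, values);
}
```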
-use crate::iterator::GenericStringIter; -use crate::raw_pointer::RawPtrBox; +use crate::types::GenericStringType; use crate::{ - empty_offsets, print_long_array, Array, ArrayAccessor, GenericBinaryArray, - GenericListArray, OffsetSizeTrait, + Array, GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait, }; -use arrow_buffer::{bit_util, Buffer, MutableBuffer}; +use arrow_buffer::{bit_util, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; -use std::any::Any; /// Generic struct for \[Large\]StringArray /// /// See [`StringArray`] and [`LargeStringArray`] for storing /// specific string data. -pub struct GenericStringArray { - data: ArrayData, - value_offsets: RawPtrBox, - value_data: RawPtrBox, -} +pub type GenericStringArray = GenericByteArray>; impl GenericStringArray { - /// Data type of the array. - pub const DATA_TYPE: DataType = if OffsetSize::IS_LARGE { - DataType::LargeUtf8 - } else { - DataType::Utf8 - }; - /// Get the data type of the array. #[deprecated(note = "please use `Self::DATA_TYPE` instead")] pub const fn get_data_type() -> DataType { Self::DATA_TYPE } - /// Returns the length for the element at index `i`. - #[inline] - pub fn value_length(&self, i: usize) -> OffsetSize { - let offsets = self.value_offsets(); - offsets[i + 1] - offsets[i] - } - - /// Returns the offset values in the offsets buffer - #[inline] - pub fn value_offsets(&self) -> &[OffsetSize] { - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the ArrayData instance. - unsafe { - std::slice::from_raw_parts( - self.value_offsets.as_ptr().add(self.data.offset()), - self.len() + 1, - ) - } - } - - /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.data.buffers()[1].clone() - } - /// Returns the number of `Unicode Scalar Value` in the string at index `i`. /// # Performance /// This function has `O(n)` time complexity where `n` is the string length. @@ -85,45 +45,6 @@ impl GenericStringArray { self.value(i).chars().count() } - /// Returns the element at index - /// # Safety - /// caller is responsible for ensuring that index is within the array bounds - #[inline] - pub unsafe fn value_unchecked(&self, i: usize) -> &str { - let end = self.value_offsets().get_unchecked(i + 1).as_usize(); - let start = self.value_offsets().get_unchecked(i).as_usize(); - - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the value_offset invariants - // ISSUE: utf-8 well formedness is not checked - - // Safety of `to_isize().unwrap()` - // `start` and `end` are &OffsetSize, which is a generic type that implements the - // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait, - // both of which should cleanly cast to isize on an architecture that supports - // 32/64-bit offsets - let slice = - std::slice::from_raw_parts(self.value_data.as_ptr().add(start), end - start); - std::str::from_utf8_unchecked(slice) - } - - /// Returns the element at index `i` as &str - /// # Panics - /// Panics if index `i` is out of bounds. - #[inline] - pub fn value(&self, i: usize) -> &str { - assert!( - i < self.data.len(), - "Trying to access an element at index {} from a StringArray of length {}", - i, - self.len() - ); - // Safety: - // `i < self.data.len() - unsafe { self.value_unchecked(i) } - } - /// Convert a list array to a string array. 
/// /// Note: this performs potentially expensive UTF-8 validation, consider using @@ -283,62 +204,6 @@ where } } -impl<'a, T: OffsetSizeTrait> IntoIterator for &'a GenericStringArray { - type Item = Option<&'a str>; - type IntoIter = GenericStringIter<'a, T>; - - fn into_iter(self) -> Self::IntoIter { - GenericStringIter::<'a, T>::new(self) - } -} - -impl<'a, T: OffsetSizeTrait> GenericStringArray { - /// constructs a new iterator - pub fn iter(&'a self) -> GenericStringIter<'a, T> { - GenericStringIter::<'a, T>::new(self) - } -} - -impl std::fmt::Debug for GenericStringArray { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let prefix = OffsetSize::PREFIX; - - write!(f, "{}StringArray\n[\n", prefix)?; - print_long_array(self, f, |array, index, f| { - std::fmt::Debug::fmt(&array.value(index), f) - })?; - write!(f, "]") - } -} - -impl Array for GenericStringArray { - fn as_any(&self) -> &dyn Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - fn into_data(self) -> ArrayData { - self.into() - } -} - -impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor - for &'a GenericStringArray -{ - type Item = &'a str; - - fn value(&self, index: usize) -> Self::Item { - GenericStringArray::value(self, index) - } - - unsafe fn value_unchecked(&self, index: usize) -> Self::Item { - GenericStringArray::value_unchecked(self, index) - } -} - impl From> for GenericStringArray { @@ -356,32 +221,6 @@ impl From> } } -impl From for GenericStringArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.data_type(), - &Self::DATA_TYPE, - "[Large]StringArray expects Datatype::[Large]Utf8" - ); - assert_eq!( - data.buffers().len(), - 2, - "StringArray data should contain 2 buffers only (offsets and values)" - ); - // Handle case of empty offsets - let offsets = match data.is_empty() && data.buffers()[0].is_empty() { - true => empty_offsets::().as_ptr() as *const _, - false => data.buffers()[0].as_ptr(), - }; - let values = data.buffers()[1].as_ptr(); - Self { - data, - value_offsets: unsafe { RawPtrBox::new(offsets) }, - value_data: unsafe { RawPtrBox::new(values) }, - } - } -} - impl From>> for GenericStringArray { @@ -402,12 +241,6 @@ impl From> for GenericStringArray From> for ArrayData { - fn from(array: GenericStringArray) -> Self { - array.data - } -} - /// An array where each element is a variable-sized sequence of bytes representing a string /// whose maximum length (in bytes) is represented by a i32. 
/// @@ -436,6 +269,7 @@ pub type LargeStringArray = GenericStringArray; mod tests { use super::*; use crate::builder::{ListBuilder, StringBuilder}; + use arrow_buffer::Buffer; use arrow_schema::Field; #[test] @@ -464,7 +298,7 @@ mod tests { } #[test] - #[should_panic(expected = "[Large]StringArray expects Datatype::[Large]Utf8")] + #[should_panic(expected = "StringArray expects DataType::Utf8")] fn test_string_array_from_int() { let array = LargeStringArray::from(vec!["a", "b"]); drop(StringArray::from(array.into_data())); diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index edf6d40f3ae0..e6197eed19cf 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -19,6 +19,7 @@ use crate::array::ArrowPrimitiveType; use crate::delta::shift_months; +use crate::OffsetSizeTrait; use arrow_buffer::i256; use arrow_data::decimal::{ validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, @@ -28,6 +29,7 @@ use arrow_data::decimal::{ use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; use chrono::{Duration, NaiveDate}; use half::f16; +use std::marker::PhantomData; use std::ops::{Add, Sub}; // BooleanType is special: its bit-width is not the size of the primitive type, and its `index` @@ -464,7 +466,10 @@ impl Date64Type { } } -mod private { +/// Crate private types for Decimal Arrays +/// +/// Not intended to be used outside this crate +mod decimal { use super::*; pub trait DecimalTypeSealed {} @@ -482,7 +487,7 @@ mod private { /// [`Decimal128Array`]: [crate::array::Decimal128Array] /// [`Decimal256Array`]: [crate::array::Decimal256Array] pub trait DecimalType: - 'static + Send + Sync + ArrowPrimitiveType + private::DecimalTypeSealed + 'static + Send + Sync + ArrowPrimitiveType + decimal::DecimalTypeSealed { const BYTE_LENGTH: usize; const MAX_PRECISION: u8; @@ -574,6 +579,87 @@ fn format_decimal_str(value_str: &str, precision: usize, scale: usize) -> String } } +/// Crate private types for Byte Arrays +/// +/// Not intended to be used outside this crate +pub(crate) mod bytes { + use super::*; + + pub trait ByteArrayTypeSealed {} + impl ByteArrayTypeSealed for GenericStringType {} + impl ByteArrayTypeSealed for GenericBinaryType {} + + pub trait ByteArrayNativeType: std::fmt::Debug + Send + Sync { + /// # Safety + /// + /// `b` must be a valid byte sequence for `Self` + unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self; + } + + impl ByteArrayNativeType for [u8] { + unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { + b + } + } + + impl ByteArrayNativeType for str { + unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { + std::str::from_utf8_unchecked(b) + } + } +} + +/// A trait over the variable-size byte array types +/// +/// See [Variable Size Binary Layout](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) +pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed { + type Offset: OffsetSizeTrait; + type Native: bytes::ByteArrayNativeType + AsRef<[u8]> + ?Sized; + /// "Binary" or "String", for use in error messages + const PREFIX: &'static str; + const DATA_TYPE: DataType; +} + +/// [`ByteArrayType`] for string arrays +pub struct GenericStringType { + phantom: PhantomData, +} + +impl ByteArrayType for GenericStringType { + type Offset = O; + type Native = str; + const PREFIX: &'static str = "String"; + + const DATA_TYPE: DataType = if O::IS_LARGE { + DataType::LargeUtf8 + } else { + DataType::Utf8 + }; +} + +pub type Utf8Type = GenericStringType; +pub type LargeUtf8Type = GenericStringType; 
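The trait and aliases defined above make it possible to write a kernel once and instantiate it for every variable-size byte layout. The helper below is hypothetical (it is not an API added by this change) and is sketched under the assumption that `ByteArrayType` and `GenericByteArray` are re-exported as shown in the earlier hunks; the same function body then serves `StringArray`, `LargeStringArray`, `BinaryArray` and `LargeBinaryArray`:

```rust
use arrow_array::types::ByteArrayType;
use arrow_array::{Array, BinaryArray, GenericByteArray, StringArray};

/// Sum of the value lengths (in bytes) of all non-null elements,
/// written once against the ByteArrayType abstraction.
fn total_byte_len<T: ByteArrayType>(array: &GenericByteArray<T>) -> usize {
    (0..array.len())
        .filter(|i| array.is_valid(*i))
        // `T::Native: AsRef<[u8]>`, so both &str and &[u8] values expose their bytes.
        .map(|i| array.value(i).as_ref().len())
        .sum()
}

fn main() {
    let strings = StringArray::from(vec![Some("arrow"), None, Some("rs")]);
    let bytes = BinaryArray::from_iter_values([b"arrow".as_ref(), b"rs".as_ref()]);

    assert_eq!(total_byte_len(&strings), 7);
    assert_eq!(total_byte_len(&bytes), 7);
}
```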
+ +/// [`ByteArrayType`] for binary arrays +pub struct GenericBinaryType { + phantom: PhantomData, +} + +impl ByteArrayType for GenericBinaryType { + type Offset = O; + type Native = [u8]; + const PREFIX: &'static str = "Binary"; + + const DATA_TYPE: DataType = if O::IS_LARGE { + DataType::LargeBinary + } else { + DataType::Binary + }; +} + +pub type BinaryType = GenericBinaryType; +pub type LargeBinaryType = GenericBinaryType; + #[cfg(test)] mod tests { use super::*; From 0a115d6e270690ed076aeffc35d30dc369174f09 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 28 Oct 2022 19:09:27 +1300 Subject: [PATCH 0182/1411] Fix GenericListArray::try_new_from_array_data error message (#526) (#2961) --- arrow-array/src/array/list_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 0db40a796964..17691bb324ae 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -201,7 +201,7 @@ impl GenericListArray { if data.buffers().len() != 1 { return Err(ArrowError::InvalidArgumentError( format!("ListArray data should contain a single buffer only (value offsets), had {}", - data.len()))); + data.buffers().len()))); } if data.child_data().len() != 1 { From 87ac05bcafd343d3d8ad3b519631d83090afeb1c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 28 Oct 2022 19:09:43 +1300 Subject: [PATCH 0183/1411] Fix take string on sliced indices (#2960) --- arrow-select/src/take.rs | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 77a1147ad6fc..ad1cfe4da321 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -21,9 +21,7 @@ use std::{ops::AddAssign, sync::Arc}; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::{ - bit_util, buffer::buffer_bin_and, ArrowNativeType, Buffer, MutableBuffer, -}; +use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; @@ -675,12 +673,7 @@ where *offset = length_so_far; } - nulls = match indices.data_ref().null_buffer() { - Some(buffer) => { - Some(buffer_bin_and(buffer, 0, &null_buf.into(), 0, data_len)) - } - None => Some(null_buf.into()), - }; + nulls = Some(null_buf.into()) } let array_data = ArrayData::builder(GenericStringArray::::DATA_TYPE) @@ -1547,6 +1540,23 @@ mod tests { _test_take_string::() } + #[test] + fn test_take_slice_string() { + let strings = + StringArray::from(vec![Some("hello"), None, Some("world"), None, Some("hi")]); + let indices = Int32Array::from(vec![Some(0), Some(1), None, Some(0), Some(2)]); + let indices_slice = indices.slice(1, 4); + let indices_slice = indices_slice + .as_ref() + .as_any() + .downcast_ref::() + .unwrap(); + + let expected = StringArray::from(vec![None, None, Some("hello"), Some("world")]); + let result = take(&strings, indices_slice, None).unwrap(); + assert_eq!(result.as_ref(), &expected); + } + macro_rules! 
test_take_list { ($offset_type:ty, $list_data_type:ident, $list_array_type:ident) => {{ // Construct a value array, [[0,0,0], [-1,-2,-1], [2,3]] From 843a2e5699a36537ff962d921d1c9aa0712a8d7c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 29 Oct 2022 07:09:00 +1300 Subject: [PATCH 0184/1411] Add BooleanArray::true_count and BooleanArray::false_count (#2957) * Add BooleanArray::true_count and BooleanArray::false_count * Review feedback --- arrow-array/src/array/boolean_array.rs | 55 ++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index c7a44c7d5f9e..31dde3a3dda7 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -103,6 +103,33 @@ impl BooleanArray { &self.data.buffers()[0] } + /// Returns the number of non null, true values within this array + pub fn true_count(&self) -> usize { + match self.data.null_buffer() { + Some(nulls) => { + let null_chunks = nulls.bit_chunks(self.offset(), self.len()); + let value_chunks = self.values().bit_chunks(self.offset(), self.len()); + null_chunks + .iter() + .zip(value_chunks.iter()) + .chain(std::iter::once(( + null_chunks.remainder_bits(), + value_chunks.remainder_bits(), + ))) + .map(|(a, b)| (a & b).count_ones() as usize) + .sum() + } + None => self + .values() + .count_set_bits_offset(self.offset(), self.len()), + } + } + + /// Returns the number of non null, false values within this array + pub fn false_count(&self) -> usize { + self.len() - self.null_count() - self.true_count() + } + /// Returns the boolean value at index `i`. /// /// # Safety @@ -285,6 +312,7 @@ impl>> FromIterator for BooleanArray #[cfg(test)] mod tests { use super::*; + use rand::{thread_rng, Rng}; #[test] fn test_boolean_fmt_debug() { @@ -431,4 +459,31 @@ mod tests { fn test_from_array_data_validation() { let _ = BooleanArray::from(ArrayData::new_empty(&DataType::Int32)); } + + #[test] + fn test_true_false_count() { + let mut rng = thread_rng(); + + for _ in 0..10 { + // No nulls + let d: Vec<_> = (0..2000).map(|_| rng.gen_bool(0.5)).collect(); + let b = BooleanArray::from(d.clone()); + + let expected_true = d.iter().filter(|x| **x).count(); + assert_eq!(b.true_count(), expected_true); + assert_eq!(b.false_count(), d.len() - expected_true); + + // With nulls + let d: Vec<_> = (0..2000) + .map(|_| rng.gen_bool(0.5).then(|| rng.gen_bool(0.5))) + .collect(); + let b = BooleanArray::from(d.clone()); + + let expected_true = d.iter().filter(|x| matches!(x, Some(true))).count(); + assert_eq!(b.true_count(), expected_true); + + let expected_false = d.iter().filter(|x| matches!(x, Some(false))).count(); + assert_eq!(b.false_count(), expected_false); + } + } } From cbee739ddaebc3596b91754fbab9e26904d9622c Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Sat, 29 Oct 2022 02:41:57 +0800 Subject: [PATCH 0185/1411] Format Timestamps as RFC3339 (#2939) * standarize-tz-display * only test named timezone while chrono-tz enabled * fix docs * fix doc --- arrow-array/src/array/primitive_array.rs | 33 +++--- arrow/src/util/display.rs | 80 ++++++++++++-- arrow/src/util/pretty.rs | 129 ++++++++++++++++++++++- 3 files changed, 212 insertions(+), 30 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 016e5306cf8f..eb3618f7c307 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ 
-18,7 +18,10 @@ use crate::builder::{BooleanBufferBuilder, BufferBuilder, PrimitiveBuilder}; use crate::iterator::PrimitiveIter; use crate::raw_pointer::RawPtrBox; -use crate::temporal_conversions::{as_date, as_datetime, as_duration, as_time}; +use crate::temporal_conversions::{ + as_date, as_datetime, as_datetime_with_timezone, as_duration, as_time, +}; +use crate::timezone::Tz; use crate::trusted_len::trusted_len_unzip; use crate::types::*; use crate::{print_long_array, Array, ArrayAccessor}; @@ -26,7 +29,7 @@ use arrow_buffer::{i256, ArrowNativeType, Buffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; -use chrono::{Duration, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime}; +use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime}; use half::f16; use std::any::Any; @@ -116,40 +119,40 @@ pub type Float64Array = PrimitiveArray; /// # Example: UTC timestamps post epoch /// ``` /// # use arrow_array::TimestampSecondArray; -/// use chrono::FixedOffset; +/// use arrow_array::timezone::Tz; /// // Corresponds to single element array with entry 1970-05-09T14:25:11+0:00 /// let arr = TimestampSecondArray::from(vec![11111111]); /// // OR /// let arr = TimestampSecondArray::from(vec![Some(11111111)]); -/// let utc_offset = FixedOffset::east(0); +/// let utc_tz: Tz = "+00:00".parse().unwrap(); /// -/// assert_eq!(arr.value_as_datetime_with_tz(0, utc_offset).map(|v| v.to_string()).unwrap(), "1970-05-09 14:25:11") +/// assert_eq!(arr.value_as_datetime_with_tz(0, utc_tz).map(|v| v.to_string()).unwrap(), "1970-05-09 14:25:11 +00:00") /// ``` /// /// # Example: UTC timestamps pre epoch /// ``` /// # use arrow_array::TimestampSecondArray; -/// use chrono::FixedOffset; +/// use arrow_array::timezone::Tz; /// // Corresponds to single element array with entry 1969-08-25T09:34:49+0:00 /// let arr = TimestampSecondArray::from(vec![-11111111]); /// // OR /// let arr = TimestampSecondArray::from(vec![Some(-11111111)]); -/// let utc_offset = FixedOffset::east(0); +/// let utc_tz: Tz = "+00:00".parse().unwrap(); /// -/// assert_eq!(arr.value_as_datetime_with_tz(0, utc_offset).map(|v| v.to_string()).unwrap(), "1969-08-25 09:34:49") +/// assert_eq!(arr.value_as_datetime_with_tz(0, utc_tz).map(|v| v.to_string()).unwrap(), "1969-08-25 09:34:49 +00:00") /// ``` /// /// # Example: With timezone specified /// ``` /// # use arrow_array::TimestampSecondArray; -/// use chrono::FixedOffset; +/// use arrow_array::timezone::Tz; /// // Corresponds to single element array with entry 1970-05-10T00:25:11+10:00 /// let arr = TimestampSecondArray::from(vec![11111111]).with_timezone("+10:00".to_string()); /// // OR /// let arr = TimestampSecondArray::from(vec![Some(11111111)]).with_timezone("+10:00".to_string()); -/// let sydney_offset = FixedOffset::east(10 * 60 * 60); +/// let sydney_tz: Tz = "+10:00".parse().unwrap(); /// -/// assert_eq!(arr.value_as_datetime_with_tz(0, sydney_offset).map(|v| v.to_string()).unwrap(), "1970-05-10 00:25:11") +/// assert_eq!(arr.value_as_datetime_with_tz(0, sydney_tz).map(|v| v.to_string()).unwrap(), "1970-05-10 00:25:11 +10:00") /// ``` /// pub type TimestampSecondArray = PrimitiveArray; @@ -503,12 +506,8 @@ where /// /// functionally it is same as `value_as_datetime`, however it adds /// the passed tz to the to-be-returned NaiveDateTime - pub fn value_as_datetime_with_tz( - &self, - i: usize, - tz: FixedOffset, - ) -> Option { - as_datetime::(i64::from(self.value(i))).map(|datetime| datetime + tz) + pub fn 
value_as_datetime_with_tz(&self, i: usize, tz: Tz) -> Option> { + as_datetime_with_timezone::(i64::from(self.value(i)), tz) } /// Returns value as a chrono `NaiveDate` by using `Self::datetime()` diff --git a/arrow/src/util/display.rs b/arrow/src/util/display.rs index 7c0b5a28f89e..f5bef1605ef8 100644 --- a/arrow/src/util/display.rs +++ b/arrow/src/util/display.rs @@ -33,6 +33,7 @@ use crate::{array, datatypes::IntervalUnit}; use array::DictionaryArray; use crate::error::{ArrowError, Result}; +use arrow_array::timezone::Tz; macro_rules! make_string { ($array_type:ty, $column: ident, $row: ident) => {{ @@ -190,7 +191,7 @@ macro_rules! make_string_datetime { } else { array .value_as_datetime($row) - .map(|d| d.to_string()) + .map(|d| format!("{:?}", d)) .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()) }; @@ -198,6 +199,29 @@ macro_rules! make_string_datetime { }}; } +macro_rules! make_string_datetime_with_tz { + ($array_type:ty, $tz_string: ident, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + match $tz_string.parse::() { + Ok(tz) => array + .value_as_datetime_with_tz($row, tz) + .map(|d| format!("{}", d.to_rfc3339())) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), + Err(_) => array + .value_as_datetime($row) + .map(|d| format!("{:?} (Unknown Time Zone '{}')", d, $tz_string)) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), + } + }; + + Ok(s) + }}; +} + // It's not possible to do array.value($row).to_string() for &[u8], let's format it as hex macro_rules! make_string_hex { ($array_type:ty, $column: ident, $row: ident) => {{ @@ -334,17 +358,55 @@ pub fn array_value_to_string(column: &array::ArrayRef, row: usize) -> Result make_string!(array::Float32Array, column, row), DataType::Float64 => make_string!(array::Float64Array, column, row), DataType::Decimal128(..) 
=> make_string_from_decimal(column, row), - DataType::Timestamp(unit, _) if *unit == TimeUnit::Second => { - make_string_datetime!(array::TimestampSecondArray, column, row) + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second => { + match tz_string_opt { + Some(tz_string) => make_string_datetime_with_tz!( + array::TimestampSecondArray, + tz_string, + column, + row + ), + None => make_string_datetime!(array::TimestampSecondArray, column, row), + } } - DataType::Timestamp(unit, _) if *unit == TimeUnit::Millisecond => { - make_string_datetime!(array::TimestampMillisecondArray, column, row) + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Millisecond => { + match tz_string_opt { + Some(tz_string) => make_string_datetime_with_tz!( + array::TimestampMillisecondArray, + tz_string, + column, + row + ), + None => { + make_string_datetime!(array::TimestampMillisecondArray, column, row) + } + } } - DataType::Timestamp(unit, _) if *unit == TimeUnit::Microsecond => { - make_string_datetime!(array::TimestampMicrosecondArray, column, row) + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Microsecond => { + match tz_string_opt { + Some(tz_string) => make_string_datetime_with_tz!( + array::TimestampMicrosecondArray, + tz_string, + column, + row + ), + None => { + make_string_datetime!(array::TimestampMicrosecondArray, column, row) + } + } } - DataType::Timestamp(unit, _) if *unit == TimeUnit::Nanosecond => { - make_string_datetime!(array::TimestampNanosecondArray, column, row) + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Nanosecond => { + match tz_string_opt { + Some(tz_string) => make_string_datetime_with_tz!( + array::TimestampNanosecondArray, + tz_string, + column, + row + ), + None => { + make_string_datetime!(array::TimestampNanosecondArray, column, row) + } + } } DataType::Date32 => make_string_date!(array::Date32Array, column, row), DataType::Date64 => make_string_date!(array::Date64Array, column, row), diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs index f819e389f96e..8d811223cbb5 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow/src/util/pretty.rs @@ -370,13 +370,134 @@ mod tests { }; } + /// Generate an array with type $ARRAYTYPE with a numeric value of + /// $VALUE, and compare $EXPECTED_RESULT to the output of + /// formatting that array with `pretty_format_batches` + macro_rules! 
check_datetime_with_timezone { + ($ARRAYTYPE:ident, $VALUE:expr, $TZ_STRING:expr, $EXPECTED_RESULT:expr) => { + let mut builder = $ARRAYTYPE::builder(10); + builder.append_value($VALUE); + builder.append_null(); + let array = builder.finish(); + let array = array.with_timezone($TZ_STRING); + + let schema = Arc::new(Schema::new(vec![Field::new( + "f", + array.data_type().clone(), + true, + )])); + let batch = RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap(); + + let table = pretty_format_batches(&[batch]) + .expect("formatting batches") + .to_string(); + + let expected = $EXPECTED_RESULT; + let actual: Vec<&str> = table.lines().collect(); + + assert_eq!(expected, actual, "Actual result:\n\n{:#?}\n\n", actual); + }; + } + + #[test] + #[cfg(features = "chrono-tz")] + fn test_pretty_format_timestamp_second_with_utc_timezone() { + let expected = vec![ + "+---------------------------+", + "| f |", + "+---------------------------+", + "| 1970-05-09T14:25:11+00:00 |", + "| |", + "+---------------------------+", + ]; + check_datetime_with_timezone!( + TimestampSecondArray, + 11111111, + "UTC".to_string(), + expected + ); + } + + #[test] + #[cfg(features = "chrono-tz")] + fn test_pretty_format_timestamp_second_with_non_utc_timezone() { + let expected = vec![ + "+---------------------------+", + "| f |", + "+---------------------------+", + "| 1970-05-09T22:25:11+08:00 |", + "| |", + "+---------------------------+", + ]; + check_datetime_with_timezone!( + TimestampSecondArray, + 11111111, + "Asia/Taipei".to_string(), + expected + ); + } + + #[test] + fn test_pretty_format_timestamp_second_with_fixed_offset_timezone() { + let expected = vec![ + "+---------------------------+", + "| f |", + "+---------------------------+", + "| 1970-05-09T22:25:11+08:00 |", + "| |", + "+---------------------------+", + ]; + check_datetime_with_timezone!( + TimestampSecondArray, + 11111111, + "+08:00".to_string(), + expected + ); + } + + #[test] + fn test_pretty_format_timestamp_second_with_incorrect_fixed_offset_timezone() { + let expected = vec![ + "+-------------------------------------------------+", + "| f |", + "+-------------------------------------------------+", + "| 1970-05-09T14:25:11 (Unknown Time Zone '08:00') |", + "| |", + "+-------------------------------------------------+", + ]; + check_datetime_with_timezone!( + TimestampSecondArray, + 11111111, + "08:00".to_string(), + expected + ); + } + + #[test] + fn test_pretty_format_timestamp_second_with_unknown_timezone() { + let expected = vec![ + "+---------------------------------------------------+", + "| f |", + "+---------------------------------------------------+", + "| 1970-05-09T14:25:11 (Unknown Time Zone 'Unknown') |", + "| |", + "+---------------------------------------------------+", + ]; + check_datetime_with_timezone!( + TimestampSecondArray, + 11111111, + "Unknown".to_string(), + expected + ); + } + #[test] fn test_pretty_format_timestamp_second() { let expected = vec![ "+---------------------+", "| f |", "+---------------------+", - "| 1970-05-09 14:25:11 |", + "| 1970-05-09T14:25:11 |", "| |", "+---------------------+", ]; @@ -389,7 +510,7 @@ mod tests { "+-------------------------+", "| f |", "+-------------------------+", - "| 1970-01-01 03:05:11.111 |", + "| 1970-01-01T03:05:11.111 |", "| |", "+-------------------------+", ]; @@ -402,7 +523,7 @@ mod tests { "+----------------------------+", "| f |", "+----------------------------+", - "| 1970-01-01 00:00:11.111111 |", + "| 1970-01-01T00:00:11.111111 |", "| |", 
"+----------------------------+", ]; @@ -415,7 +536,7 @@ mod tests { "+-------------------------------+", "| f |", "+-------------------------------+", - "| 1970-01-01 00:00:00.011111111 |", + "| 1970-01-01T00:00:00.011111111 |", "| |", "+-------------------------------+", ]; From 779804317d9c9d80e72a955deb8594eb45a8308a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 Oct 2022 15:43:43 -0400 Subject: [PATCH 0186/1411] Update version to 26.0.0 and add Changelog (#2962) * Update version to 26.0.0 * Initial changelog * Update chagnelog with labels * final updates --- CHANGELOG-old.md | 68 ++++++++++ CHANGELOG.md | 128 +++++++++++-------- arrow-array/Cargo.toml | 8 +- arrow-buffer/Cargo.toml | 2 +- arrow-data/Cargo.toml | 6 +- arrow-flight/Cargo.toml | 4 +- arrow-flight/README.md | 2 +- arrow-integration-test/Cargo.toml | 6 +- arrow-integration-testing/Cargo.toml | 2 +- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow-schema/Cargo.toml | 2 +- arrow-select/Cargo.toml | 10 +- arrow/Cargo.toml | 12 +- arrow/README.md | 2 +- dev/release/README.md | 2 +- dev/release/update_change_log.sh | 4 +- parquet/Cargo.toml | 6 +- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 20 files changed, 188 insertions(+), 94 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 3f5c541df903..b7f4a7fadc84 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,74 @@ # Historical Changelog +## [25.0.0](https://github.com/apache/arrow-rs/tree/25.0.0) (2022-10-14) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/24.0.0...25.0.0) + +**Breaking changes:** + +- Make DecimalArray as PrimitiveArray [\#2857](https://github.com/apache/arrow-rs/pull/2857) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- fix timestamp parsing while no explicit timezone given [\#2814](https://github.com/apache/arrow-rs/pull/2814) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Support Arbitrary Number of Arrays in downcast\_primitive\_array [\#2809](https://github.com/apache/arrow-rs/pull/2809) ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Restore Integration test JSON schema serialization [\#2876](https://github.com/apache/arrow-rs/issues/2876) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix various invalid\_html\_tags clippy error [\#2861](https://github.com/apache/arrow-rs/issues/2861) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Replace complicated temporal macro with generic functions [\#2851](https://github.com/apache/arrow-rs/issues/2851) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add NaN handling in dyn scalar comparison kernels [\#2829](https://github.com/apache/arrow-rs/issues/2829) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add overflow-checking variant of sum kernel [\#2821](https://github.com/apache/arrow-rs/issues/2821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Update to Clap 4 [\#2817](https://github.com/apache/arrow-rs/issues/2817) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Safe API to Operate on Dictionary Values [\#2797](https://github.com/apache/arrow-rs/issues/2797) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add modulus op into `ArrowNativeTypeOp` [\#2753](https://github.com/apache/arrow-rs/issues/2753) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Allow creating of TimeUnit instances without direct dependency on parquet-format [\#2708](https://github.com/apache/arrow-rs/issues/2708) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Arrow Row Format [\#2677](https://github.com/apache/arrow-rs/issues/2677) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Don't try to infer nulls in CSV schema inference [\#2859](https://github.com/apache/arrow-rs/issues/2859) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `parquet::arrow::arrow_writer::ArrowWriter` ignores page size properties [\#2853](https://github.com/apache/arrow-rs/issues/2853) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Introducing ArrowNativeTypeOp made it impossible to call kernels from generics [\#2839](https://github.com/apache/arrow-rs/issues/2839) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Unsound ArrayData to Array Conversions [\#2834](https://github.com/apache/arrow-rs/issues/2834) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Regression: `the trait bound for<'de> arrow::datatypes::Schema: serde::de::Deserialize<'de> is not satisfied` [\#2825](https://github.com/apache/arrow-rs/issues/2825) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- convert string to timestamp shouldn't apply local timezone offset if there's no explicit timezone info in the string [\#2813](https://github.com/apache/arrow-rs/issues/2813) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Add pub api for checking column index is sorted [\#2848](https://github.com/apache/arrow-rs/issues/2848) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Merged pull requests:** + +- Take decimal as primitive \(\#2637\) [\#2869](https://github.com/apache/arrow-rs/pull/2869) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-integration-test crate [\#2868](https://github.com/apache/arrow-rs/pull/2868) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Decimal cleanup \(\#2637\) [\#2865](https://github.com/apache/arrow-rs/pull/2865) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix various invalid\_html\_tags clippy errors [\#2862](https://github.com/apache/arrow-rs/pull/2862) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) +- Don't try to infer nullability in CSV reader [\#2860](https://github.com/apache/arrow-rs/pull/2860) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Fix page size on dictionary fallback [\#2854](https://github.com/apache/arrow-rs/pull/2854) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- Replace complicated temporal macro with generic functions 
[\#2850](https://github.com/apache/arrow-rs/pull/2850) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- \[feat\] Add pub api for checking column index is sorted. [\#2849](https://github.com/apache/arrow-rs/pull/2849) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- parquet: Add `snap` option to README [\#2847](https://github.com/apache/arrow-rs/pull/2847) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([exyi](https://github.com/exyi)) +- Cleanup cast kernel [\#2846](https://github.com/apache/arrow-rs/pull/2846) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Simplify ArrowNativeType [\#2841](https://github.com/apache/arrow-rs/pull/2841) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Expose ArrowNativeTypeOp trait to make it useful for type bound [\#2840](https://github.com/apache/arrow-rs/pull/2840) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add `interleave` kernel \(\#1523\) [\#2838](https://github.com/apache/arrow-rs/pull/2838) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Handle empty offsets buffer \(\#1824\) [\#2836](https://github.com/apache/arrow-rs/pull/2836) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Validate ArrayData type when converting to Array \(\#2834\) [\#2835](https://github.com/apache/arrow-rs/pull/2835) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Derive ArrowPrimitiveType for Decimal128Type and Decimal256Type \(\#2637\) [\#2833](https://github.com/apache/arrow-rs/pull/2833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add NaN handling in dyn scalar comparison kernels [\#2830](https://github.com/apache/arrow-rs/pull/2830) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Simplify OrderPreservingInterner allocation strategy ~97% faster \(\#2677\) [\#2827](https://github.com/apache/arrow-rs/pull/2827) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Convert rows to arrays \(\#2677\) [\#2826](https://github.com/apache/arrow-rs/pull/2826) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add overflow-checking variant of sum kernel [\#2822](https://github.com/apache/arrow-rs/pull/2822) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update Clap dependency to version 4 [\#2819](https://github.com/apache/arrow-rs/pull/2819) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jgoday](https://github.com/jgoday)) +- Fix i256 checked multiplication [\#2818](https://github.com/apache/arrow-rs/pull/2818) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add string\_dictionary benches for row format \(\#2677\) [\#2816](https://github.com/apache/arrow-rs/pull/2816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add OrderPreservingInterner::lookup \(\#2677\) 
[\#2815](https://github.com/apache/arrow-rs/pull/2815) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Simplify FixedLengthEncoding [\#2812](https://github.com/apache/arrow-rs/pull/2812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement ArrowNumericType for Float16Type [\#2810](https://github.com/apache/arrow-rs/pull/2810) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add DictionaryArray::with\_values to make it easier to operate on dictionary values [\#2798](https://github.com/apache/arrow-rs/pull/2798) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add i256 \(\#2637\) [\#2781](https://github.com/apache/arrow-rs/pull/2781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add modulus ops into `ArrowNativeTypeOp` [\#2756](https://github.com/apache/arrow-rs/pull/2756) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- feat: cast List / LargeList to Utf8 / LargeUtf8 [\#2588](https://github.com/apache/arrow-rs/pull/2588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gandronchik](https://github.com/gandronchik)) ## [24.0.0](https://github.com/apache/arrow-rs/tree/24.0.0) (2022-09-30) [Full Changelog](https://github.com/apache/arrow-rs/compare/23.0.0...24.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index d69d8705e1c1..600e96b1d7ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,74 +19,100 @@ # Changelog -## [25.0.0](https://github.com/apache/arrow-rs/tree/25.0.0) (2022-10-14) +## [26.0.0](https://github.com/apache/arrow-rs/tree/26.0.0) (2022-10-28) -[Full Changelog](https://github.com/apache/arrow-rs/compare/24.0.0...25.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/25.0.0...26.0.0) **Breaking changes:** -- Make DecimalArray as PrimitiveArray [\#2857](https://github.com/apache/arrow-rs/pull/2857) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- fix timestamp parsing while no explicit timezone given [\#2814](https://github.com/apache/arrow-rs/pull/2814) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) -- Support Arbitrary Number of Arrays in downcast\_primitive\_array [\#2809](https://github.com/apache/arrow-rs/pull/2809) ([tustvold](https://github.com/tustvold)) +- Cast Timestamps to RFC3339 strings [\#2934](https://github.com/apache/arrow-rs/issues/2934) +- Remove Unused NativeDecimalType [\#2945](https://github.com/apache/arrow-rs/pull/2945) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Format Timestamps as RFC3339 [\#2939](https://github.com/apache/arrow-rs/pull/2939) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Update flatbuffers to resolve RUSTSEC-2021-0122 [\#2895](https://github.com/apache/arrow-rs/pull/2895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- replace `from_timestamp` by `from_timestamp_opt` [\#2894](https://github.com/apache/arrow-rs/pull/2894) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) 
**Implemented enhancements:** -- Restore Integration test JSON schema serialization [\#2876](https://github.com/apache/arrow-rs/issues/2876) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Fix various invalid\_html\_tags clippy error [\#2861](https://github.com/apache/arrow-rs/issues/2861) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Replace complicated temporal macro with generic functions [\#2851](https://github.com/apache/arrow-rs/issues/2851) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add NaN handling in dyn scalar comparison kernels [\#2829](https://github.com/apache/arrow-rs/issues/2829) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add overflow-checking variant of sum kernel [\#2821](https://github.com/apache/arrow-rs/issues/2821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Update to Clap 4 [\#2817](https://github.com/apache/arrow-rs/issues/2817) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Safe API to Operate on Dictionary Values [\#2797](https://github.com/apache/arrow-rs/issues/2797) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add modulus op into `ArrowNativeTypeOp` [\#2753](https://github.com/apache/arrow-rs/issues/2753) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Allow creating of TimeUnit instances without direct dependency on parquet-format [\#2708](https://github.com/apache/arrow-rs/issues/2708) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Arrow Row Format [\#2677](https://github.com/apache/arrow-rs/issues/2677) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Optimized way to count the numbers of `true` and `false` values in a BooleanArray [\#2963](https://github.com/apache/arrow-rs/issues/2963) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add pow to i256 [\#2954](https://github.com/apache/arrow-rs/issues/2954) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Write Generic Code over \[Large\]BinaryArray and \[Large\]StringArray [\#2946](https://github.com/apache/arrow-rs/issues/2946) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add Page Row Count Limit [\#2941](https://github.com/apache/arrow-rs/issues/2941) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- prettyprint to show timezone offset for timestamp with timezone [\#2937](https://github.com/apache/arrow-rs/issues/2937) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cast numeric to decimal256 [\#2922](https://github.com/apache/arrow-rs/issues/2922) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `freeze_with_dictionary` API to `MutableArrayData` [\#2914](https://github.com/apache/arrow-rs/issues/2914) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support decimal256 array in sort kernels [\#2911](https://github.com/apache/arrow-rs/issues/2911) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- support `[+/-]hhmm` and `[+/-]hh` as fixedoffset timezone format [\#2910](https://github.com/apache/arrow-rs/issues/2910) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cleanup decimal sort function [\#2907](https://github.com/apache/arrow-rs/issues/2907) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- replace `from_timestamp` by `from_timestamp_opt` 
[\#2892](https://github.com/apache/arrow-rs/issues/2892) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Move Primitive arity kernels to arrow-array [\#2787](https://github.com/apache/arrow-rs/issues/2787) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- add overflow-checking for negative arithmetic kernel [\#2662](https://github.com/apache/arrow-rs/issues/2662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- Don't try to infer nulls in CSV schema inference [\#2859](https://github.com/apache/arrow-rs/issues/2859) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Subtle compatibility issue with serve\_arrow [\#2952](https://github.com/apache/arrow-rs/issues/2952) +- error\[E0599\]: no method named `total_cmp` found for struct `f16` in the current scope [\#2926](https://github.com/apache/arrow-rs/issues/2926) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fail at rowSelection `and_then` method [\#2925](https://github.com/apache/arrow-rs/issues/2925) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Ordering not implemented for FixedSizeBinary types [\#2904](https://github.com/apache/arrow-rs/issues/2904) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet API: Could not convert timestamp before unix epoch to string/json [\#2897](https://github.com/apache/arrow-rs/issues/2897) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Overly Pessimistic RLE Size Estimation [\#2889](https://github.com/apache/arrow-rs/issues/2889) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Memory alignment error in `RawPtrBox::new` [\#2882](https://github.com/apache/arrow-rs/issues/2882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Compilation error under chrono-tz feature [\#2878](https://github.com/apache/arrow-rs/issues/2878) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- AHash Statically Allocates 64 bytes [\#2875](https://github.com/apache/arrow-rs/issues/2875) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - `parquet::arrow::arrow_writer::ArrowWriter` ignores page size properties [\#2853](https://github.com/apache/arrow-rs/issues/2853) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Introducing ArrowNativeTypeOp made it impossible to call kernels from generics [\#2839](https://github.com/apache/arrow-rs/issues/2839) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Unsound ArrayData to Array Conversions [\#2834](https://github.com/apache/arrow-rs/issues/2834) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Regression: `the trait bound for<'de> arrow::datatypes::Schema: serde::de::Deserialize<'de> is not satisfied` [\#2825](https://github.com/apache/arrow-rs/issues/2825) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- convert string to timestamp shouldn't apply local timezone offset if there's no explicit timezone info in the string [\#2813](https://github.com/apache/arrow-rs/issues/2813) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Document crate topology \(\#2594\) [\#2913](https://github.com/apache/arrow-rs/pull/2913) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) **Closed issues:** -- Add pub api for checking column index is sorted [\#2848](https://github.com/apache/arrow-rs/issues/2848) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- SerializedFileWriter comments about multiple call on consumed self [\#2935](https://github.com/apache/arrow-rs/issues/2935) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Pointer freed error when deallocating ArrayData with shared memory buffer [\#2874](https://github.com/apache/arrow-rs/issues/2874) +- Release Arrow `25.0.0` \(next release after `24.0.0`\) [\#2820](https://github.com/apache/arrow-rs/issues/2820) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Replace DecimalArray with PrimitiveArray [\#2637](https://github.com/apache/arrow-rs/issues/2637) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Take decimal as primitive \(\#2637\) [\#2869](https://github.com/apache/arrow-rs/pull/2869) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Split out arrow-integration-test crate [\#2868](https://github.com/apache/arrow-rs/pull/2868) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Decimal cleanup \(\#2637\) [\#2865](https://github.com/apache/arrow-rs/pull/2865) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix various invalid\_html\_tags clippy errors [\#2862](https://github.com/apache/arrow-rs/pull/2862) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) -- Don't try to infer nullability in CSV reader [\#2860](https://github.com/apache/arrow-rs/pull/2860) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Fix page size on dictionary fallback [\#2854](https://github.com/apache/arrow-rs/pull/2854) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) -- Replace complicated temporal macro with generic functions [\#2850](https://github.com/apache/arrow-rs/pull/2850) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- \[feat\] Add pub api for checking column index is sorted. 
[\#2849](https://github.com/apache/arrow-rs/pull/2849) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- parquet: Add `snap` option to README [\#2847](https://github.com/apache/arrow-rs/pull/2847) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([exyi](https://github.com/exyi)) -- Cleanup cast kernel [\#2846](https://github.com/apache/arrow-rs/pull/2846) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Simplify ArrowNativeType [\#2841](https://github.com/apache/arrow-rs/pull/2841) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Expose ArrowNativeTypeOp trait to make it useful for type bound [\#2840](https://github.com/apache/arrow-rs/pull/2840) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add `interleave` kernel \(\#1523\) [\#2838](https://github.com/apache/arrow-rs/pull/2838) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Handle empty offsets buffer \(\#1824\) [\#2836](https://github.com/apache/arrow-rs/pull/2836) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Validate ArrayData type when converting to Array \(\#2834\) [\#2835](https://github.com/apache/arrow-rs/pull/2835) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Derive ArrowPrimitiveType for Decimal128Type and Decimal256Type \(\#2637\) [\#2833](https://github.com/apache/arrow-rs/pull/2833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add NaN handling in dyn scalar comparison kernels [\#2830](https://github.com/apache/arrow-rs/pull/2830) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Simplify OrderPreservingInterner allocation strategy ~97% faster \(\#2677\) [\#2827](https://github.com/apache/arrow-rs/pull/2827) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Convert rows to arrays \(\#2677\) [\#2826](https://github.com/apache/arrow-rs/pull/2826) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add overflow-checking variant of sum kernel [\#2822](https://github.com/apache/arrow-rs/pull/2822) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Update Clap dependency to version 4 [\#2819](https://github.com/apache/arrow-rs/pull/2819) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jgoday](https://github.com/jgoday)) -- Fix i256 checked multiplication [\#2818](https://github.com/apache/arrow-rs/pull/2818) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add string\_dictionary benches for row format \(\#2677\) [\#2816](https://github.com/apache/arrow-rs/pull/2816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add OrderPreservingInterner::lookup \(\#2677\) [\#2815](https://github.com/apache/arrow-rs/pull/2815) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Simplify FixedLengthEncoding 
[\#2812](https://github.com/apache/arrow-rs/pull/2812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Implement ArrowNumericType for Float16Type [\#2810](https://github.com/apache/arrow-rs/pull/2810) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add DictionaryArray::with\_values to make it easier to operate on dictionary values [\#2798](https://github.com/apache/arrow-rs/pull/2798) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add i256 \(\#2637\) [\#2781](https://github.com/apache/arrow-rs/pull/2781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add modulus ops into `ArrowNativeTypeOp` [\#2756](https://github.com/apache/arrow-rs/pull/2756) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- feat: cast List / LargeList to Utf8 / LargeUtf8 [\#2588](https://github.com/apache/arrow-rs/pull/2588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gandronchik](https://github.com/gandronchik)) +- Fix GenericListArray::try\_new\_from\_array\_data error message \(\#526\) [\#2961](https://github.com/apache/arrow-rs/pull/2961) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix take string on sliced indices [\#2960](https://github.com/apache/arrow-rs/pull/2960) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add BooleanArray::true\_count and BooleanArray::false\_count [\#2957](https://github.com/apache/arrow-rs/pull/2957) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add pow to i256 [\#2955](https://github.com/apache/arrow-rs/pull/2955) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- fix datatype for timestamptz debug fmt [\#2948](https://github.com/apache/arrow-rs/pull/2948) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Add GenericByteArray \(\#2946\) [\#2947](https://github.com/apache/arrow-rs/pull/2947) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Specialize interleave string ~2-3x faster [\#2944](https://github.com/apache/arrow-rs/pull/2944) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Added support for LZ4\_RAW compression. 
\(\#1604\) [\#2943](https://github.com/apache/arrow-rs/pull/2943) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([marioloko](https://github.com/marioloko)) +- Add optional page row count limit for parquet `WriterProperties` \(\#2941\) [\#2942](https://github.com/apache/arrow-rs/pull/2942) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Cleanup orphaned doc comments \(\#2935\) [\#2938](https://github.com/apache/arrow-rs/pull/2938) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- support more fixedoffset tz format [\#2936](https://github.com/apache/arrow-rs/pull/2936) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Benchmark with prepared row converter [\#2930](https://github.com/apache/arrow-rs/pull/2930) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add lexsort benchmark \(\#2871\) [\#2929](https://github.com/apache/arrow-rs/pull/2929) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve panic messages for RowSelection::and\_then \(\#2925\) [\#2928](https://github.com/apache/arrow-rs/pull/2928) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update required half from 2.0 --\> 2.1 [\#2927](https://github.com/apache/arrow-rs/pull/2927) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Cast numeric to decimal256 [\#2923](https://github.com/apache/arrow-rs/pull/2923) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Cleanup generated proto code [\#2921](https://github.com/apache/arrow-rs/pull/2921) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Deprecate TimestampArray from\_vec and from\_opt\_vec [\#2919](https://github.com/apache/arrow-rs/pull/2919) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support decimal256 array in sort kernels [\#2912](https://github.com/apache/arrow-rs/pull/2912) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add timezone abstraction [\#2909](https://github.com/apache/arrow-rs/pull/2909) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup decimal sort function [\#2908](https://github.com/apache/arrow-rs/pull/2908) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Simplify TimestampArray from\_vec with timezone [\#2906](https://github.com/apache/arrow-rs/pull/2906) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement ord for FixedSizeBinary types [\#2905](https://github.com/apache/arrow-rs/pull/2905) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([maxburke](https://github.com/maxburke)) +- Update chrono-tz requirement from 0.6 to 0.7 [\#2903](https://github.com/apache/arrow-rs/pull/2903) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Parquet record api support timestamp before epoch 
[\#2899](https://github.com/apache/arrow-rs/pull/2899) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([AnthonyPoncet](https://github.com/AnthonyPoncet)) +- Specialize interleave integer [\#2898](https://github.com/apache/arrow-rs/pull/2898) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support overflow-checking variant of negate kernel [\#2893](https://github.com/apache/arrow-rs/pull/2893) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Respect Page Size Limits in ArrowWriter \(\#2853\) [\#2890](https://github.com/apache/arrow-rs/pull/2890) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Improve row format docs [\#2888](https://github.com/apache/arrow-rs/pull/2888) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add FixedSizeList::from\_iter\_primitive [\#2887](https://github.com/apache/arrow-rs/pull/2887) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Simplify ListArray::from\_iter\_primitive [\#2886](https://github.com/apache/arrow-rs/pull/2886) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out value selection kernels into arrow-select \(\#2594\) [\#2885](https://github.com/apache/arrow-rs/pull/2885) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Increase default IPC alignment to 64 \(\#2883\) [\#2884](https://github.com/apache/arrow-rs/pull/2884) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Copying inappropriately aligned buffer in ipc reader [\#2883](https://github.com/apache/arrow-rs/pull/2883) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Validate decimal IPC read \(\#2387\) [\#2880](https://github.com/apache/arrow-rs/pull/2880) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix compilation error under `chrono-tz` feature [\#2879](https://github.com/apache/arrow-rs/pull/2879) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Don't validate decimal precision in ArrayData \(\#2637\) [\#2873](https://github.com/apache/arrow-rs/pull/2873) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add downcast\_integer and downcast\_primitive [\#2872](https://github.com/apache/arrow-rs/pull/2872) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Filter DecimalArray as PrimitiveArray ~5x Faster \(\#2637\) [\#2870](https://github.com/apache/arrow-rs/pull/2870) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Treat DecimalArray as PrimitiveArray in row format [\#2866](https://github.com/apache/arrow-rs/pull/2866) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 06bff9aced2d..f39899c70942 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-array" -version = "25.0.0" +version = "26.0.0" description = "Array abstractions for Apache Arrow" 
homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,9 +45,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "25.0.0", path = "../arrow-schema" } -arrow-data = { version = "25.0.0", path = "../arrow-data" } +arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "26.0.0", path = "../arrow-schema" } +arrow-data = { version = "26.0.0", path = "../arrow-data" } chrono = { version = "0.4", default-features = false, features = ["clock"] } chrono-tz = { version = "0.7", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index bf3f3cacc6c9..610a35015e23 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "25.0.0" +version = "26.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index 8fe0054f87cd..c94bdfd9919a 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-data" -version = "25.0.0" +version = "26.0.0" description = "Array data abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,8 +45,8 @@ force_validate = [] [dependencies] -arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "25.0.0", path = "../arrow-schema" } +arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "26.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 1f696f5387b5..b4fe03b4fd70 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "25.0.0" +version = "26.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,7 +27,7 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow = { path = "../arrow", version = "25.0.0", default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "26.0.0", default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index f09977263a0a..a515e007f0ab 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "25.0.0" +arrow-flight = "26.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. 
diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index 8d8c0fda916e..f46223996644 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-integration-test" -version = "25.0.0" +version = "26.0.0" description = "Support for the Apache Arrow JSON test data format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,8 +38,8 @@ path = "src/lib.rs" bench = false [dependencies] -arrow = { version = "25.0.0", path = "../arrow", default-features = false } -arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" } +arrow = { version = "26.0.0", path = "../arrow", default-features = false } +arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 13088d3dfe6a..4562759b2ddf 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests (NOT PUBLISHED TO crates.io)" -version = "25.0.0" +version = "26.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 955d311a7900..03118160280a 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "25.0.0" +version = "26.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "25.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "26.0.0", features = ["pyarrow"] } pyo3 = { version = "0.17", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index fae422b77be4..b248c34fa864 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-schema" -version = "25.0.0" +version = "26.0.0" description = "Defines the logical types for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml index a10f9862fc40..e895bbcdd78b 100644 --- a/arrow-select/Cargo.toml +++ b/arrow-select/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-select" -version = "25.0.0" +version = "26.0.0" description = "Selection kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" } -arrow-data = { version = "25.0.0", path = "../arrow-data" } -arrow-schema = { version = "25.0.0", path = "../arrow-schema" } -arrow-array = { version = "25.0.0", path = 
"../arrow-array" } +arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } +arrow-data = { version = "26.0.0", path = "../arrow-data" } +arrow-schema = { version = "26.0.0", path = "../arrow-schema" } +arrow-array = { version = "26.0.0", path = "../arrow-array" } num = { version = "0.4", default-features = false, features = ["std"] } [features] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 9e0f93768fa1..5b2639b7fdb0 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "25.0.0" +version = "26.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,11 +44,11 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" } -arrow-data = { version = "25.0.0", path = "../arrow-data" } -arrow-schema = { version = "25.0.0", path = "../arrow-schema" } -arrow-array = { version = "25.0.0", path = "../arrow-array" } -arrow-select = { version = "25.0.0", path = "../arrow-select" } +arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } +arrow-data = { version = "26.0.0", path = "../arrow-data" } +arrow-schema = { version = "26.0.0", path = "../arrow-schema" } +arrow-array = { version = "26.0.0", path = "../arrow-array" } +arrow-select = { version = "26.0.0", path = "../arrow-select" } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } indexmap = { version = "1.9", default-features = false, features = ["std"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } diff --git a/arrow/README.md b/arrow/README.md index c687a205a2ae..7bfaad4751d3 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `25.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `26.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. ## Feature Flags diff --git a/dev/release/README.md b/dev/release/README.md index 6392716371e1..b3fa546b5f64 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. -sed -i '' -e 's/14.0.0/25.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/26.0.0/g' `find . 
-name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 1dc45115678d..6790ef6fde6f 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="24.0.0" -FUTURE_RELEASE="25.0.0" +SINCE_TAG="25.0.0" +FUTURE_RELEASE="26.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index d2c215d461fb..70320ba65901 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "25.0.0" +version = "26.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -41,7 +41,7 @@ zstd = { version = "0.11.1", optional = true, default-features = false } chrono = { version = "0.4", default-features = false, features = ["alloc"] } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } -arrow = { path = "../arrow", version = "25.0.0", optional = true, default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "26.0.0", optional = true, default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false, features = ["std"], optional = true } clap = { version = "4", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } @@ -60,7 +60,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.11", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "25.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "26.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index 5665038eb200..cf068d2f4e1c 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "25.0.0" +version = "26.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", default-features = false } -parquet = { path = "../parquet", version = "25.0.0" } +parquet = { path = "../parquet", version = "26.0.0" } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index 12ba2d98e130..dfaba7def7a9 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "25.0.0" -parquet_derive = "25.0.0" +parquet = "26.0.0" 
+parquet_derive = "26.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index 0c2758fdc290..83204ae7413e 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "25.0.0" +version = "26.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "25.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "25.0.0", default-features = false } +parquet = { path = "../parquet", version = "26.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "26.0.0", default-features = false } chrono = { version="0.4.19", default-features = false, features = [ "clock" ] } From dbe518cd083bc0f584d285fa4ec6fc8cf6ce563a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 29 Oct 2022 12:00:59 +1300 Subject: [PATCH 0187/1411] Combine take_utf8 and take_binary (#2969) (#2970) --- arrow-select/src/take.rs | 92 +++++++++++----------------------------- 1 file changed, 25 insertions(+), 67 deletions(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index ad1cfe4da321..d34a88ba53ce 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -17,7 +17,7 @@ //! Defines take kernel for [Array] -use std::{ops::AddAssign, sync::Arc}; +use std::sync::Arc; use arrow_array::types::*; use arrow_array::*; @@ -25,6 +25,7 @@ use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; +use arrow_array::cast::{as_generic_binary_array, as_largestring_array, as_string_array}; use num::{ToPrimitive, Zero}; /// Take elements by index from [Array], creating a new [Array] from those indexes. 
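[Editor's note] The refactor in this patch routes Utf8, LargeUtf8, Binary, and LargeBinary through a single `take_bytes` path built on `GenericByteArray`. As a rough illustration of the caller-facing behaviour (which is unchanged by the patch), here is a minimal sketch using the public `take` kernel; the index values and the use of the `arrow::compute::take` re-export rather than `arrow_select::take::take` are assumptions for the example, not part of this diff.

```rust
use arrow::array::{Array, StringArray, UInt32Array};
use arrow::compute::take;

fn main() {
    // Utf8 values; Binary/LargeBinary now flow through the same take_bytes path.
    let values = StringArray::from(vec![Some("hello"), None, Some("world")]);
    let indices = UInt32Array::from(vec![2, 0, 1]); // illustrative indices only

    // `take` gathers values[index] for each index, preserving nulls.
    let taken = take(&values, &indices, None).unwrap();
    let taken = taken.as_any().downcast_ref::<StringArray>().unwrap();

    assert_eq!(taken.value(0), "world");
    assert_eq!(taken.value(1), "hello");
    assert!(taken.is_null(2)); // values[1] is None
}
```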
@@ -140,18 +141,10 @@ where Ok(Arc::new(array)) } DataType::Utf8 => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(take_string::(values, indices)?)) + Ok(Arc::new(take_bytes(as_string_array(values), indices)?)) } DataType::LargeUtf8 => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(take_string::(values, indices)?)) + Ok(Arc::new(take_bytes(as_largestring_array(values), indices)?)) } DataType::List(_) => { let values = values @@ -209,18 +202,10 @@ where t => unimplemented!("Take not supported for dictionary type {:?}", t) } DataType::Binary => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(take_binary(values, indices)?)) + Ok(Arc::new(take_bytes(as_generic_binary_array::(values), indices)?)) } DataType::LargeBinary => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(take_binary(values, indices)?)) + Ok(Arc::new(take_bytes(as_generic_binary_array::(values), indices)?)) } DataType::FixedSizeBinary(_) => { let values = values @@ -579,23 +564,23 @@ where } /// `take` implementation for string arrays -fn take_string( - array: &GenericStringArray, +fn take_bytes( + array: &GenericByteArray, indices: &PrimitiveArray, -) -> Result, ArrowError> +) -> Result, ArrowError> where - OffsetSize: Zero + AddAssign + OffsetSizeTrait, + T: ByteArrayType, IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, { let data_len = indices.len(); - let bytes_offset = (data_len + 1) * std::mem::size_of::(); + let bytes_offset = (data_len + 1) * std::mem::size_of::(); let mut offsets_buffer = MutableBuffer::from_len_zeroed(bytes_offset); let offsets = offsets_buffer.typed_data_mut(); let mut values = MutableBuffer::new(0); - let mut length_so_far = OffsetSize::zero(); + let mut length_so_far = T::Offset::from_usize(0).unwrap(); offsets[0] = length_so_far; let nulls; @@ -607,8 +592,8 @@ where let s = array.value(index); - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - values.extend_from_slice(s.as_bytes()); + length_so_far += T::Offset::from_usize(s.as_ref().len()).unwrap(); + values.extend_from_slice(s.as_ref()); *offset = length_so_far; } nulls = None @@ -624,10 +609,10 @@ where })?; if array.is_valid(index) { - let s = array.value(index); + let s = array.value(index).as_ref(); - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - values.extend_from_slice(s.as_bytes()); + length_so_far += T::Offset::from_usize(s.len()).unwrap(); + values.extend_from_slice(s.as_ref()); } else { bit_util::unset_bit(null_slice, i); } @@ -642,10 +627,10 @@ where ArrowError::ComputeError("Cast to usize failed".to_string()) })?; - let s = array.value(index); + let s = array.value(index).as_ref(); - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - values.extend_from_slice(s.as_bytes()); + length_so_far += T::Offset::from_usize(s.len()).unwrap(); + values.extend_from_slice(s); } *offset = length_so_far; } @@ -662,10 +647,10 @@ where })?; if array.is_valid(index) && indices.is_valid(i) { - let s = array.value(index); + let s = array.value(index).as_ref(); - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - values.extend_from_slice(s.as_bytes()); + length_so_far += T::Offset::from_usize(s.len()).unwrap(); + values.extend_from_slice(s); } else { // set null bit bit_util::unset_bit(null_slice, i); @@ -676,7 +661,7 @@ where nulls = Some(null_buf.into()) } - let array_data = ArrayData::builder(GenericStringArray::::DATA_TYPE) + let array_data = 
ArrayData::builder(T::DATA_TYPE) .len(data_len) .add_buffer(offsets_buffer.into()) .add_buffer(values.into()) @@ -684,7 +669,7 @@ where let array_data = unsafe { array_data.build_unchecked() }; - Ok(GenericStringArray::::from(array_data)) + Ok(GenericByteArray::from(array_data)) } /// `take` implementation for list arrays @@ -781,33 +766,6 @@ where Ok(FixedSizeListArray::from(list_data)) } -fn take_binary( - values: &GenericBinaryArray, - indices: &PrimitiveArray, -) -> Result, ArrowError> -where - OffsetType: OffsetSizeTrait, - IndexType: ArrowPrimitiveType, - IndexType::Native: ToPrimitive, -{ - let data_ref = values.data_ref(); - let array_iter = indices - .values() - .iter() - .map(|idx| { - let idx = maybe_usize::(*idx)?; - if data_ref.is_valid(idx) { - Ok(Some(values.value(idx))) - } else { - Ok(None) - } - }) - .collect::, ArrowError>>()? - .into_iter(); - - Ok(array_iter.collect::>()) -} - fn take_fixed_size_binary( values: &FixedSizeBinaryArray, indices: &PrimitiveArray, From 94a7f4b69901754126186f4e18d08d59af76088e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 29 Oct 2022 12:34:01 +1300 Subject: [PATCH 0188/1411] Faster unpack_dict_comparison (#2968) --- arrow/src/compute/kernels/comparison.rs | 33 +++++++++---------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 143050ea97f3..94e7f9660bdd 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -27,18 +27,19 @@ use crate::array::*; use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer}; use crate::compute::util::combine_option_bitmap; use crate::datatypes::{ - ArrowNativeType, ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, - Date64Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, - IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, - Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, - TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, - TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, - UInt8Type, + ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, Float32Type, + Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalDayTimeType, + IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, Time32MillisecondType, + Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimeUnit, + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; #[allow(unused_imports)] use crate::downcast_dictionary_array; use crate::error::{ArrowError, Result}; use crate::util::bit_util; +use arrow_select::take::take; +use num::ToPrimitive; use regex::Regex; use std::collections::HashMap; @@ -1815,21 +1816,11 @@ fn unpack_dict_comparison( ) -> Result where K: ArrowNumericType, + K::Native: ToPrimitive, { - assert_eq!(dict_comparison.len(), dict.values().len()); - - let result: BooleanArray = dict - .keys() - .iter() - .map(|key| { - key.map(|key| unsafe { - let key = key.as_usize(); - dict_comparison.value_unchecked(key) - }) - }) - .collect(); - - Ok(result) + // TODO: Use take_boolean (#2967) + let array = take(&dict_comparison, dict.keys(), None)?; + Ok(BooleanArray::from(array.data().clone())) } /// Helper function to perform boolean lambda function 
on values from two arrays using From 344c552d701374582ac1aff198e62acb9907afb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Gallego=20Castellanos?= Date: Sat, 29 Oct 2022 05:24:26 +0200 Subject: [PATCH 0189/1411] Pass decompressed size to parquet Codec::decompress (#2956) (#2959) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Pass decompressed size to parquet Codec::decompress (#2956) Added optional argument uncompressed_size to Coded::decompress to do a better estimation of the required uncompress size. * snappy: Probably no much improvement as `decompress_len` is already accurate. * gzip: No improvement. Ignores the size hint. * brotli: Probably no much improvement. The buffer size will be equal to the uncompressed_size size. * lz4: No improvement. As the buffer is located at the stack there are no extra allocations. Then it probably is better to keep it working as it is. * zstd: No improvement. Ignores the size hint. * lz4_raw: Improvement. The estimation method over-estimates, so knowin the uncompressed size reduces allocations. * Do not include header size in uncompressed_size. A page may contain header, uncompressed size includes the header size. The `decompress` method expects to receive the `uncompress_size` for the compress block, that is without the page headers. Co-authored-by: Adrián Gallego Castellanos --- parquet/src/compression.rs | 67 ++++++++++++++++++++------- parquet/src/file/serialized_reader.rs | 6 ++- 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index f110e3d8272a..310dbd34f1f6 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -38,7 +38,7 @@ let mut compressed = vec![]; codec.compress(&data[..], &mut compressed).unwrap(); let mut output = vec![]; -codec.decompress(&compressed[..], &mut output).unwrap(); +codec.decompress(&compressed[..], &mut output, None).unwrap(); assert_eq!(output, data); ``` @@ -57,9 +57,18 @@ pub trait Codec: Send { fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()>; /// Decompresses data stored in slice `input_buf` and appends output to `output_buf`. + /// + /// If the uncompress_size is provided it will allocate the exact amount of memory. + /// Otherwise, it will estimate the uncompressed size, allocating an amount of memory + /// greater or equal to the real uncompress_size. + /// /// Returns the total number of bytes written. 
- fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) - -> Result; + fn decompress( + &mut self, + input_buf: &[u8], + output_buf: &mut Vec, + uncompress_size: Option, + ) -> Result; } /// Given the compression type `codec`, returns a codec used to compress and decompress @@ -112,8 +121,12 @@ mod snappy_codec { &mut self, input_buf: &[u8], output_buf: &mut Vec, + uncompress_size: Option, ) -> Result { - let len = decompress_len(input_buf)?; + let len = match uncompress_size { + Some(size) => size, + None => decompress_len(input_buf)?, + }; let offset = output_buf.len(); output_buf.resize(offset + len, 0); self.decoder @@ -161,6 +174,7 @@ mod gzip_codec { &mut self, input_buf: &[u8], output_buf: &mut Vec, + _uncompress_size: Option, ) -> Result { let mut decoder = read::GzDecoder::new(input_buf); decoder.read_to_end(output_buf).map_err(|e| e.into()) @@ -203,8 +217,10 @@ mod brotli_codec { &mut self, input_buf: &[u8], output_buf: &mut Vec, + uncompress_size: Option, ) -> Result { - brotli::Decompressor::new(input_buf, BROTLI_DEFAULT_BUFFER_SIZE) + let buffer_size = uncompress_size.unwrap_or(BROTLI_DEFAULT_BUFFER_SIZE); + brotli::Decompressor::new(input_buf, buffer_size) .read_to_end(output_buf) .map_err(|e| e.into()) } @@ -248,6 +264,7 @@ mod lz4_codec { &mut self, input_buf: &[u8], output_buf: &mut Vec, + _uncompress_size: Option, ) -> Result { let mut decoder = lz4::Decoder::new(input_buf)?; let mut buffer: [u8; LZ4_BUFFER_SIZE] = [0; LZ4_BUFFER_SIZE]; @@ -306,6 +323,7 @@ mod zstd_codec { &mut self, input_buf: &[u8], output_buf: &mut Vec, + _uncompress_size: Option, ) -> Result { let mut decoder = zstd::Decoder::new(input_buf)?; match io::copy(&mut decoder, output_buf) { @@ -353,16 +371,23 @@ mod lz4_raw_codec { &mut self, input_buf: &[u8], output_buf: &mut Vec, + uncompress_size: Option, ) -> Result { let offset = output_buf.len(); - let required_len = max_uncompressed_size(input_buf.len()); + let required_len = + uncompress_size.unwrap_or_else(|| max_uncompressed_size(input_buf.len())); output_buf.resize(offset + required_len, 0); - let required_len: i32 = required_len.try_into().unwrap(); - match lz4::block::decompress_to_buffer(input_buf, Some(required_len), &mut output_buf[offset..]) { + match lz4::block::decompress_to_buffer( + input_buf, + Some(required_len.try_into().unwrap()), + &mut output_buf[offset..], + ) { Ok(n) => { - output_buf.truncate(offset + n); - Ok(n) - }, + if n < required_len { + output_buf.truncate(offset + n); + } + Ok(n) + } Err(e) => Err(e.into()), } } @@ -371,11 +396,16 @@ mod lz4_raw_codec { let offset = output_buf.len(); let required_len = lz4::block::compress_bound(input_buf.len())?; output_buf.resize(offset + required_len, 0); - match lz4::block::compress_to_buffer(input_buf, None, false, &mut output_buf[offset..]) { + match lz4::block::compress_to_buffer( + input_buf, + None, + false, + &mut output_buf[offset..], + ) { Ok(n) => { output_buf.truncate(offset + n); Ok(()) - }, + } Err(e) => Err(e.into()), } } @@ -390,7 +420,7 @@ mod tests { use crate::util::test_common::rand_gen::random_bytes; - fn test_roundtrip(c: CodecType, data: &[u8]) { + fn test_roundtrip(c: CodecType, data: &[u8], uncompress_size: Option) { let mut c1 = create_codec(c).unwrap().unwrap(); let mut c2 = create_codec(c).unwrap().unwrap(); @@ -402,7 +432,7 @@ mod tests { // Decompress with c2 let decompressed_size = c2 - .decompress(compressed.as_slice(), &mut decompressed) + .decompress(compressed.as_slice(), &mut decompressed, uncompress_size) .expect("Error when 
decompressing"); assert_eq!(data.len(), decompressed_size); assert_eq!(data, decompressed.as_slice()); @@ -416,7 +446,7 @@ mod tests { // Decompress with c1 let decompressed_size = c1 - .decompress(compressed.as_slice(), &mut decompressed) + .decompress(compressed.as_slice(), &mut decompressed, uncompress_size) .expect("Error when decompressing"); assert_eq!(data.len(), decompressed_size); assert_eq!(data, decompressed.as_slice()); @@ -435,7 +465,7 @@ mod tests { assert_eq!(&compressed[..4], prefix); let decompressed_size = c2 - .decompress(&compressed[4..], &mut decompressed) + .decompress(&compressed[4..], &mut decompressed, uncompress_size) .expect("Error when decompressing"); assert_eq!(data.len(), decompressed_size); @@ -447,7 +477,8 @@ mod tests { let sizes = vec![100, 10000, 100000]; for size in sizes { let data = random_bytes(size); - test_roundtrip(c, &data); + test_roundtrip(c, &data, None); + test_roundtrip(c, &data, Some(data.len())); } } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 6b416e34dc65..854ae1ef6d34 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -420,7 +420,11 @@ pub(crate) fn decode_page( let mut decompressed = Vec::with_capacity(uncompressed_size); let compressed = &buffer.as_ref()[offset..]; decompressed.extend_from_slice(&buffer.as_ref()[..offset]); - decompressor.decompress(compressed, &mut decompressed)?; + decompressor.decompress( + compressed, + &mut decompressed, + Some(uncompressed_size - offset), + )?; if decompressed.len() != uncompressed_size { return Err(general_err!( From 04e224cd3762539b078bc28cdad2f73767c328ff Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Fri, 28 Oct 2022 21:24:55 -0600 Subject: [PATCH 0190/1411] Move `byte_size` from datafusion::physical_expr (#2965) * Move `byte_size` from datafusion::physical_expr * lint * PR feedback --- arrow-array/src/record_batch.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 58462449ea31..e613a38bb15b 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -377,6 +377,14 @@ impl RecordBatch { let schema = Arc::new(Schema::new(fields)); RecordBatch::try_new(schema, columns) } + + /// Returns the total number of bytes of memory occupied physically by this batch. + pub fn get_array_memory_size(&self) -> usize { + self.columns() + .iter() + .map(|array| array.get_array_memory_size()) + .sum() + } } /// Options that control the behaviour used when creating a [`RecordBatch`]. 
@@ -471,6 +479,22 @@ mod tests { check_batch(record_batch, 5) } + #[test] + fn byte_size_should_not_regress() { + let schema = Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Utf8, false), + ]); + + let a = Int32Array::from(vec![1, 2, 3, 4, 5]); + let b = StringArray::from(vec!["a", "b", "c", "d", "e"]); + + let record_batch = + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) + .unwrap(); + assert_eq!(record_batch.get_array_memory_size(), 592); + } + fn check_batch(record_batch: RecordBatch, num_rows: usize) { assert_eq!(num_rows, record_batch.num_rows()); assert_eq!(2, record_batch.num_columns()); From 87c9db437a530e246528e72d4af902b78ce2623e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 28 Oct 2022 21:06:47 -0700 Subject: [PATCH 0191/1411] Use unary (#2973) --- arrow/src/compute/kernels/cast.rs | 61 +++++++------------------------ 1 file changed, 13 insertions(+), 48 deletions(-) diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 73868dd98c06..3e55791502c2 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -41,8 +41,8 @@ use std::sync::Arc; use crate::buffer::MutableBuffer; use crate::compute::kernels::cast_utils::string_to_timestamp_nanos; -use crate::compute::try_unary; use crate::compute::{divide_scalar, multiply_scalar}; +use crate::compute::{try_unary, unary}; use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::temporal_conversions::{ @@ -306,44 +306,6 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { cast_with_options(array, to_type, &DEFAULT_CAST_OPTIONS) } -/// Cast the primitive array to defined decimal128 data type array -fn cast_primitive_to_decimal128( - array: T, - op: F, - precision: u8, - scale: u8, -) -> Result -where - F: Fn(T::Item) -> i128, -{ - #[allow(clippy::redundant_closure)] - let decimal_array = ArrayIter::new(array) - .map(|v| v.map(|v| op(v))) - .collect::() - .with_precision_and_scale(precision, scale)?; - - Ok(Arc::new(decimal_array)) -} - -/// Cast the primitive array to defined decimal256 data type array -fn cast_primitive_to_decimal256( - array: T, - op: F, - precision: u8, - scale: u8, -) -> Result -where - F: Fn(T::Item) -> i256, -{ - #[allow(clippy::redundant_closure)] - let decimal_array = ArrayIter::new(array) - .map(|v| v.map(|v| op(v))) - .collect::() - .with_precision_and_scale(precision, scale)?; - - Ok(Arc::new(decimal_array)) -} - fn cast_integer_to_decimal128( array: &PrimitiveArray, precision: u8, @@ -354,7 +316,9 @@ where { let mul: i128 = 10_i128.pow(scale as u32); - cast_primitive_to_decimal128(array, |v| v.as_() * mul, precision, scale) + unary::(array, |v| v.as_() * mul) + .with_precision_and_scale(precision, scale) + .map(|a| Arc::new(a) as ArrayRef) } fn cast_integer_to_decimal256( @@ -374,7 +338,9 @@ where )) })?; - cast_primitive_to_decimal256(array, |v| v.as_().wrapping_mul(mul), precision, scale) + unary::(array, |v| v.as_().wrapping_mul(mul)) + .with_precision_and_scale(precision, scale) + .map(|a| Arc::new(a) as ArrayRef) } fn cast_floating_point_to_decimal128( @@ -387,7 +353,9 @@ where { let mul = 10_f64.powi(scale as i32); - cast_primitive_to_decimal128(array, |v| (v.as_() * mul) as i128, precision, scale) + unary::(array, |v| (v.as_() * mul) as i128) + .with_precision_and_scale(precision, scale) + .map(|a| Arc::new(a) as ArrayRef) } fn cast_floating_point_to_decimal256( @@ -400,12 +368,9 @@ where { let mul = 10_f64.powi(scale as i32); - 
cast_primitive_to_decimal256( - array, - |v| i256::from_i128((v.as_() * mul) as i128), - precision, - scale, - ) + unary::(array, |v| i256::from_i128((v.as_() * mul) as i128)) + .with_precision_and_scale(precision, scale) + .map(|a| Arc::new(a) as ArrayRef) } /// Cast the primitive array using [`PrimitiveArray::reinterpret_cast`] From e6eb8341cb20460847ba12949ad7a937a62d76b7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 29 Oct 2022 17:08:54 +1300 Subject: [PATCH 0192/1411] Add Decimal Arithmetic (#2881) * Add Decimal Arithmetic * Derive operations * Add neg * Clippy --- arrow-buffer/src/bigint.rs | 47 +++++++++++++ arrow/src/compute/kernels/arithmetic.rs | 63 +++++++++++++++++ arrow/src/datatypes/native.rs | 12 ++-- arrow/src/datatypes/numeric.rs | 93 +++++++++++++++++++++++++ 4 files changed, 211 insertions(+), 4 deletions(-) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index 892c6c99d216..463c63729adc 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -189,6 +189,18 @@ impl i256 { (self != Self::MIN).then(|| self.wrapping_abs()) } + /// Negates this i256 + #[inline] + pub fn wrapping_neg(self) -> Self { + Self::from_parts(!self.low, !self.high).wrapping_add(i256::ONE) + } + + /// Negates this i256 returning `None` if `Self == Self::MIN` + #[inline] + pub fn checked_neg(self) -> Option { + (self != Self::MIN).then(|| self.wrapping_neg()) + } + /// Performs wrapping addition #[inline] pub fn wrapping_add(self, other: Self) -> Self { @@ -396,6 +408,30 @@ fn mulx(a: u128, b: u128) -> (u128, u128) { (low, high) } +macro_rules! derive_op { + ($t:ident, $op:ident, $wrapping:ident, $checked:ident) => { + impl std::ops::$t for i256 { + type Output = i256; + + #[cfg(debug_assertions)] + fn $op(self, rhs: Self) -> Self::Output { + self.$checked(rhs).expect("i256 overflow") + } + + #[cfg(not(debug_assertions))] + fn $op(self, rhs: Self) -> Self::Output { + self.$wrapping(rhs) + } + } + }; +} + +derive_op!(Add, add, wrapping_add, checked_add); +derive_op!(Sub, sub, wrapping_sub, checked_sub); +derive_op!(Mul, mul, wrapping_mul, checked_mul); +derive_op!(Div, div, wrapping_div, checked_div); +derive_op!(Rem, rem, wrapping_rem, checked_rem); + macro_rules! 
define_as_primitive { ($native_ty:ty) => { impl AsPrimitive for $native_ty { @@ -416,6 +452,7 @@ mod tests { use super::*; use num::{BigInt, FromPrimitive, Signed, ToPrimitive}; use rand::{thread_rng, Rng}; + use std::ops::Neg; #[test] fn test_signed_cmp() { @@ -466,6 +503,16 @@ mod tests { assert_eq!(ir.wrapping_abs(), abs); assert_eq!(ir.checked_abs().is_none(), overflow); + // Negation + let (neg, overflow) = i256::from_bigint_with_overflow(bl.clone().neg()); + assert_eq!(il.wrapping_neg(), neg); + assert_eq!(il.checked_neg().is_none(), overflow); + + // Negation + let (neg, overflow) = i256::from_bigint_with_overflow(br.clone().neg()); + assert_eq!(ir.wrapping_neg(), neg); + assert_eq!(ir.checked_neg().is_none(), overflow); + // Addition let actual = il.wrapping_add(ir); let (expected, overflow) = diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index e7fcb50cc254..d12a0c1964fd 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -1625,6 +1625,7 @@ mod tests { use super::*; use crate::array::Int32Array; use crate::datatypes::{Date64Type, Int32Type, Int8Type}; + use arrow_buffer::i256; use chrono::NaiveDate; use half::f16; @@ -2898,6 +2899,68 @@ mod tests { overflow.expect_err("overflow should be detected"); } + #[test] + fn test_decimal128() { + let a = Decimal128Array::from_iter_values([1, 2, 4, 5]); + let b = Decimal128Array::from_iter_values([7, -3, 6, 3]); + let e = Decimal128Array::from_iter_values([8, -1, 10, 8]); + let r = add(&a, &b).unwrap(); + assert_eq!(e, r); + + let e = Decimal128Array::from_iter_values([-6, 5, -2, 2]); + let r = subtract(&a, &b).unwrap(); + assert_eq!(e, r); + + let e = Decimal128Array::from_iter_values([7, -6, 24, 15]); + let r = multiply(&a, &b).unwrap(); + assert_eq!(e, r); + + let a = Decimal128Array::from_iter_values([23, 56, 32, 55]); + let b = Decimal128Array::from_iter_values([1, -2, 4, 5]); + let e = Decimal128Array::from_iter_values([23, -28, 8, 11]); + let r = divide(&a, &b).unwrap(); + assert_eq!(e, r); + } + + #[test] + fn test_decimal256() { + let a = Decimal256Array::from_iter_values( + [1, 2, 4, 5].into_iter().map(i256::from_i128), + ); + let b = Decimal256Array::from_iter_values( + [7, -3, 6, 3].into_iter().map(i256::from_i128), + ); + let e = Decimal256Array::from_iter_values( + [8, -1, 10, 8].into_iter().map(i256::from_i128), + ); + let r = add(&a, &b).unwrap(); + assert_eq!(e, r); + + let e = Decimal256Array::from_iter_values( + [-6, 5, -2, 2].into_iter().map(i256::from_i128), + ); + let r = subtract(&a, &b).unwrap(); + assert_eq!(e, r); + + let e = Decimal256Array::from_iter_values( + [7, -6, 24, 15].into_iter().map(i256::from_i128), + ); + let r = multiply(&a, &b).unwrap(); + assert_eq!(e, r); + + let a = Decimal256Array::from_iter_values( + [23, 56, 32, 55].into_iter().map(i256::from_i128), + ); + let b = Decimal256Array::from_iter_values( + [1, -2, 4, 5].into_iter().map(i256::from_i128), + ); + let e = Decimal256Array::from_iter_values( + [23, -28, 8, 11].into_iter().map(i256::from_i128), + ); + let r = divide(&a, &b).unwrap(); + assert_eq!(e, r); + } + #[test] #[cfg(feature = "dyn_arith_dict")] fn test_dictionary_div_dyn_wrapping_overflow() { diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 2643025f1573..bbdec14b44a0 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -17,7 +17,7 @@ use crate::error::{ArrowError, Result}; pub use arrow_array::ArrowPrimitiveType; -pub use 
arrow_buffer::{ArrowNativeType, ToByteSlice}; +pub use arrow_buffer::{i256, ArrowNativeType, ToByteSlice}; use half::f16; /// Trait for [`ArrowNativeType`] that adds checked and unchecked arithmetic operations, @@ -85,9 +85,12 @@ pub trait ArrowNativeTypeOp: ArrowNativeType { macro_rules! native_type_op { ($t:tt) => { + native_type_op!($t, 0, 1); + }; + ($t:tt, $zero:expr, $one: expr) => { impl ArrowNativeTypeOp for $t { - const ZERO: Self = 0; - const ONE: Self = 1; + const ZERO: Self = $zero; + const ONE: Self = $one; fn add_checked(self, rhs: Self) -> Result { self.checked_add(rhs).ok_or_else(|| { @@ -173,7 +176,7 @@ macro_rules! native_type_op { } fn is_zero(self) -> bool { - self == 0 + self == Self::ZERO } fn is_eq(self, rhs: Self) -> bool { @@ -212,6 +215,7 @@ native_type_op!(u8); native_type_op!(u16); native_type_op!(u32); native_type_op!(u64); +native_type_op!(i256, i256::ZERO, i256::ONE); macro_rules! native_type_float_op { ($t:tt, $zero:expr, $one:expr) => { diff --git a/arrow/src/datatypes/numeric.rs b/arrow/src/datatypes/numeric.rs index e74764d4c0ea..61fd05d52f90 100644 --- a/arrow/src/datatypes/numeric.rs +++ b/arrow/src/datatypes/numeric.rs @@ -365,6 +365,7 @@ make_numeric_type!(DurationSecondType, i64, i64x8, m64x8); make_numeric_type!(DurationMillisecondType, i64, i64x8, m64x8); make_numeric_type!(DurationMicrosecondType, i64, i64x8, m64x8); make_numeric_type!(DurationNanosecondType, i64, i64x8, m64x8); +make_numeric_type!(Decimal128Type, i128, i128x4, m128x4); #[cfg(not(feature = "simd"))] impl ArrowNumericType for Float16Type {} @@ -462,6 +463,98 @@ impl ArrowNumericType for Float16Type { } } +#[cfg(not(feature = "simd"))] +impl ArrowNumericType for Decimal256Type {} + +#[cfg(feature = "simd")] +impl ArrowNumericType for Decimal256Type { + type Simd = i256; + type SimdMask = bool; + + fn lanes() -> usize { + 1 + } + + fn init(value: Self::Native) -> Self::Simd { + value + } + + fn load(slice: &[Self::Native]) -> Self::Simd { + slice[0] + } + + fn mask_init(value: bool) -> Self::SimdMask { + value + } + + fn mask_from_u64(mask: u64) -> Self::SimdMask { + mask != 0 + } + + fn mask_to_u64(mask: &Self::SimdMask) -> u64 { + *mask as u64 + } + + fn mask_get(mask: &Self::SimdMask, _idx: usize) -> bool { + *mask + } + + fn mask_set(_mask: Self::SimdMask, _idx: usize, value: bool) -> Self::SimdMask { + value + } + + fn mask_select(mask: Self::SimdMask, a: Self::Simd, b: Self::Simd) -> Self::Simd { + match mask { + true => a, + false => b, + } + } + + fn mask_any(mask: Self::SimdMask) -> bool { + mask + } + + fn bin_op Self::Simd>( + left: Self::Simd, + right: Self::Simd, + op: F, + ) -> Self::Simd { + op(left, right) + } + + fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { + left.eq(&right) + } + + fn ne(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { + left.ne(&right) + } + + fn lt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { + left.lt(&right) + } + + fn le(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { + left.le(&right) + } + + fn gt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { + left.gt(&right) + } + + fn ge(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { + left.ge(&right) + } + + fn write(simd_result: Self::Simd, slice: &mut [Self::Native]) { + slice[0] = simd_result + } + + fn unary_op Self::Simd>(a: Self::Simd, op: F) -> Self::Simd { + op(a) + } +} + #[cfg(feature = "simd")] pub trait ArrowFloatNumericType: ArrowNumericType { fn pow(base: Self::Simd, raise: Self::Simd) -> Self::Simd; From 
37f16b54d8861a0ab22b9376873d4fa554e06d6a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 29 Oct 2022 17:20:30 +1300 Subject: [PATCH 0193/1411] Update AWS SDK (#2974) --- object_store/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index f5eb1115d895..fc80cb5774c7 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -55,8 +55,8 @@ rustls-pemfile = { version = "1.0", default-features = false, optional = true } getrandom = { version = "0.2", features = ["js"], optional = true } # AWS Profile support -aws-types = { version = "0.49", optional = true } -aws-config = { version = "0.49", optional = true } +aws-types = { version = "0.51", optional = true } +aws-config = { version = "0.51", optional = true } [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring", "getrandom"] From 999a6ae28524aedf57a0efb91640bc17bb7a5c7c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 30 Oct 2022 06:52:48 +1300 Subject: [PATCH 0194/1411] Move concat kernel to arrow-select (#2594) (#2976) --- .../kernels => arrow-select/src}/concat.rs | 90 ++++++++----------- arrow-select/src/lib.rs | 1 + arrow/src/compute/kernels/mod.rs | 3 +- 3 files changed, 40 insertions(+), 54 deletions(-) rename {arrow/src/compute/kernels => arrow-select/src}/concat.rs (93%) diff --git a/arrow/src/compute/kernels/concat.rs b/arrow-select/src/concat.rs similarity index 93% rename from arrow/src/compute/kernels/concat.rs rename to arrow-select/src/concat.rs index b6edf8c991cf..a1bb64be514d 100644 --- a/arrow/src/compute/kernels/concat.rs +++ b/arrow-select/src/concat.rs @@ -20,8 +20,8 @@ //! Example: //! //! ``` -//! use arrow::array::{ArrayRef, StringArray}; -//! use arrow::compute::concat; +//! use arrow_array::{ArrayRef, StringArray}; +//! use arrow_select::concat::concat; //! //! let arr = concat(&[ //! &StringArray::from(vec!["hello", "world"]), @@ -30,10 +30,10 @@ //! assert_eq!(arr.len(), 3); //! ``` -use crate::array::*; -use crate::datatypes::{DataType, SchemaRef}; -use crate::error::{ArrowError, Result}; -use crate::record_batch::RecordBatch; +use arrow_array::*; +use arrow_data::transform::{Capacities, MutableArrayData}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, SchemaRef}; fn compute_str_values_length(arrays: &[&ArrayData]) -> usize { arrays @@ -51,7 +51,7 @@ fn compute_str_values_length(arrays: &[&ArrayData]) -> } /// Concatenate multiple [Array] of the same type into a single [ArrayRef].
-pub fn concat(arrays: &[&dyn Array]) -> Result { +pub fn concat(arrays: &[&dyn Array]) -> Result { if arrays.is_empty() { return Err(ArrowError::ComputeError( "concat requires input of at least one array".to_string(), @@ -107,7 +107,7 @@ pub fn concat(arrays: &[&dyn Array]) -> Result { pub fn concat_batches( schema: &SchemaRef, batches: &[RecordBatch], -) -> Result { +) -> Result { if batches.is_empty() { return Ok(RecordBatch::new_empty(schema.clone())); } @@ -138,7 +138,8 @@ pub fn concat_batches( #[cfg(test)] mod tests { use super::*; - use crate::datatypes::*; + use arrow_array::types::*; + use arrow_schema::{Field, Schema}; use std::sync::Arc; #[test] @@ -148,18 +149,17 @@ mod tests { } #[test] - fn test_concat_one_element_vec() -> Result<()> { + fn test_concat_one_element_vec() { let arr = Arc::new(PrimitiveArray::::from(vec![ Some(-1), Some(2), None, ])) as ArrayRef; - let result = concat(&[arr.as_ref()])?; + let result = concat(&[arr.as_ref()]).unwrap(); assert_eq!( &arr, &result, "concatenating single element array gives back the same result" ); - Ok(()) } #[test] @@ -172,12 +172,13 @@ mod tests { } #[test] - fn test_concat_string_arrays() -> Result<()> { + fn test_concat_string_arrays() { let arr = concat(&[ &StringArray::from(vec!["hello", "world"]), &StringArray::from(vec!["2", "3", "4"]), &StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]), - ])?; + ]) + .unwrap(); let expected_output = Arc::new(StringArray::from(vec![ Some("hello"), @@ -192,12 +193,10 @@ mod tests { ])) as ArrayRef; assert_eq!(&arr, &expected_output); - - Ok(()) } #[test] - fn test_concat_primitive_arrays() -> Result<()> { + fn test_concat_primitive_arrays() { let arr = concat(&[ &PrimitiveArray::::from(vec![ Some(-1), @@ -213,7 +212,8 @@ mod tests { None, ]), &PrimitiveArray::::from(vec![Some(256), Some(512), Some(1024)]), - ])?; + ]) + .unwrap(); let expected_output = Arc::new(PrimitiveArray::::from(vec![ Some(-1), @@ -231,12 +231,10 @@ mod tests { ])) as ArrayRef; assert_eq!(&arr, &expected_output); - - Ok(()) } #[test] - fn test_concat_primitive_array_slices() -> Result<()> { + fn test_concat_primitive_array_slices() { let input_1 = PrimitiveArray::::from(vec![ Some(-1), Some(-1), @@ -253,7 +251,7 @@ mod tests { None, ]) .slice(1, 3); - let arr = concat(&[input_1.as_ref(), input_2.as_ref()])?; + let arr = concat(&[input_1.as_ref(), input_2.as_ref()]).unwrap(); let expected_output = Arc::new(PrimitiveArray::::from(vec![ Some(-1), @@ -265,12 +263,10 @@ mod tests { ])) as ArrayRef; assert_eq!(&arr, &expected_output); - - Ok(()) } #[test] - fn test_concat_boolean_primitive_arrays() -> Result<()> { + fn test_concat_boolean_primitive_arrays() { let arr = concat(&[ &BooleanArray::from(vec![ Some(true), @@ -281,7 +277,8 @@ mod tests { Some(false), ]), &BooleanArray::from(vec![None, Some(false), Some(true), Some(false)]), - ])?; + ]) + .unwrap(); let expected_output = Arc::new(BooleanArray::from(vec![ Some(true), @@ -297,12 +294,10 @@ mod tests { ])) as ArrayRef; assert_eq!(&arr, &expected_output); - - Ok(()) } #[test] - fn test_concat_primitive_list_arrays() -> Result<()> { + fn test_concat_primitive_list_arrays() { let list1 = vec![ Some(vec![Some(-1), Some(-1), Some(2), None, None]), Some(vec![]), @@ -324,7 +319,7 @@ mod tests { let list3_array = ListArray::from_iter_primitive::(list3.clone()); - let array_result = concat(&[&list1_array, &list2_array, &list3_array])?; + let array_result = concat(&[&list1_array, &list2_array, &list3_array]).unwrap(); let expected = list1 .into_iter() @@ 
-333,12 +328,10 @@ mod tests { let array_expected = ListArray::from_iter_primitive::(expected); assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); - - Ok(()) } #[test] - fn test_concat_struct_arrays() -> Result<()> { + fn test_concat_struct_arrays() { let field = Field::new("field", DataType::Int64, true); let input_primitive_1: ArrayRef = Arc::new(PrimitiveArray::::from(vec![ @@ -367,7 +360,7 @@ mod tests { ])); let input_struct_3 = StructArray::from(vec![(field, input_primitive_3)]); - let arr = concat(&[&input_struct_1, &input_struct_2, &input_struct_3])?; + let arr = concat(&[&input_struct_1, &input_struct_2, &input_struct_3]).unwrap(); let expected_primitive_output = Arc::new(PrimitiveArray::::from(vec![ Some(-1), @@ -390,12 +383,10 @@ mod tests { .unwrap() .column(0); assert_eq!(actual_primitive, &expected_primitive_output); - - Ok(()) } #[test] - fn test_concat_struct_array_slices() -> Result<()> { + fn test_concat_struct_array_slices() { let field = Field::new("field", DataType::Int64, true); let input_primitive_1: ArrayRef = Arc::new(PrimitiveArray::::from(vec![ @@ -419,7 +410,8 @@ mod tests { let arr = concat(&[ input_struct_1.slice(1, 3).as_ref(), input_struct_2.slice(1, 2).as_ref(), - ])?; + ]) + .unwrap(); let expected_primitive_output = Arc::new(PrimitiveArray::::from(vec![ Some(-1), @@ -435,39 +427,35 @@ mod tests { .unwrap() .column(0); assert_eq!(actual_primitive, &expected_primitive_output); - - Ok(()) } #[test] - fn test_string_array_slices() -> Result<()> { + fn test_string_array_slices() { let input_1 = StringArray::from(vec!["hello", "A", "B", "C"]); let input_2 = StringArray::from(vec!["world", "D", "E", "Z"]); - let arr = concat(&[input_1.slice(1, 3).as_ref(), input_2.slice(1, 2).as_ref()])?; + let arr = concat(&[input_1.slice(1, 3).as_ref(), input_2.slice(1, 2).as_ref()]) + .unwrap(); let expected_output = StringArray::from(vec!["A", "B", "C", "D", "E"]); let actual_output = arr.as_any().downcast_ref::().unwrap(); assert_eq!(actual_output, &expected_output); - - Ok(()) } #[test] - fn test_string_array_with_null_slices() -> Result<()> { + fn test_string_array_with_null_slices() { let input_1 = StringArray::from(vec![Some("hello"), None, Some("A"), Some("C")]); let input_2 = StringArray::from(vec![None, Some("world"), Some("D"), None]); - let arr = concat(&[input_1.slice(1, 3).as_ref(), input_2.slice(1, 2).as_ref()])?; + let arr = concat(&[input_1.slice(1, 3).as_ref(), input_2.slice(1, 2).as_ref()]) + .unwrap(); let expected_output = StringArray::from(vec![None, Some("A"), Some("C"), Some("world"), Some("D")]); let actual_output = arr.as_any().downcast_ref::().unwrap(); assert_eq!(actual_output, &expected_output); - - Ok(()) } fn collect_string_dictionary( @@ -539,7 +527,7 @@ mod tests { } #[test] - fn test_concat_string_sizes() -> Result<()> { + fn test_concat_string_sizes() { let a: LargeStringArray = ((0..150).map(|_| Some("foo"))).collect(); let b: LargeStringArray = ((0..150).map(|_| Some("foo"))).collect(); let c = LargeStringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]); @@ -550,11 +538,9 @@ mod tests { // 909 // closest 64 byte aligned cap = 960 - let arr = concat(&[&a, &b, &c])?; + let arr = concat(&[&a, &b, &c]).unwrap(); // this would have been 1280 if we did not precompute the value lengths. 
assert_eq!(arr.data().buffers()[1].capacity(), 960); - - Ok(()) } #[test] diff --git a/arrow-select/src/lib.rs b/arrow-select/src/lib.rs index 159c9b0ffdea..5249b5c4c323 100644 --- a/arrow-select/src/lib.rs +++ b/arrow-select/src/lib.rs @@ -17,6 +17,7 @@ //! Arrow selection kernels +pub mod concat; pub mod filter; pub mod interleave; pub mod take; diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs index 68ae2439f2db..a772f5bcc429 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels/mod.rs @@ -25,7 +25,6 @@ pub mod boolean; pub mod cast; pub mod cast_utils; pub mod comparison; -pub mod concat; pub mod concat_elements; pub mod length; pub mod limit; @@ -37,4 +36,4 @@ pub mod temporal; pub mod window; pub mod zip; -pub use arrow_select::{filter, interleave, take}; +pub use arrow_select::{concat, filter, interleave, take}; From bcce9dd4fcf211cb8e0355f3e32bd67931b6c9fa Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 29 Oct 2022 18:09:13 -0700 Subject: [PATCH 0195/1411] Add decimal comparison kernel support (#2978) --- arrow-array/src/cast.rs | 15 ++++ arrow/src/compute/kernels/comparison.rs | 104 ++++++++++++++++++++++-- 2 files changed, 113 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index e4e290501443..4436dc77c2f0 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -461,6 +461,7 @@ array_downcast_fn!(as_decimal_array, Decimal128Array); #[cfg(test)] mod tests { + use arrow_buffer::i256; use std::sync::Arc; use super::*; @@ -496,4 +497,18 @@ mod tests { let array: ArrayRef = Arc::new(array); assert!(!as_string_array(&array).is_empty()) } + + #[test] + fn test_decimal128array() { + let a = Decimal128Array::from_iter_values([1, 2, 4, 5]); + assert!(!as_primitive_array::(&a).is_empty()); + } + + #[test] + fn test_decimal256array() { + let a = Decimal256Array::from_iter_values( + [1, 2, 4, 5].into_iter().map(i256::from_i128), + ); + assert!(!as_primitive_array::(&a).is_empty()); + } } diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 94e7f9660bdd..4d8248a8dcf6 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -27,12 +27,13 @@ use crate::array::*; use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer}; use crate::compute::util::combine_option_bitmap; use crate::datatypes::{ - ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, Float32Type, - Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalDayTimeType, - IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, Time32MillisecondType, - Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimeUnit, - TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, - TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, + ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, + Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type, + Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, + IntervalYearMonthType, Time32MillisecondType, Time32SecondType, + Time64MicrosecondType, Time64NanosecondType, TimeUnit, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, + UInt32Type, UInt64Type, UInt8Type, }; #[allow(unused_imports)] use crate::downcast_dictionary_array; @@ -2257,6 +2258,12 @@ macro_rules! 
typed_compares { (DataType::Float64, DataType::Float64) => { cmp_primitive_array::($LEFT, $RIGHT, $OP_FLOAT) } + (DataType::Decimal128(_, s1), DataType::Decimal128(_, s2)) if s1 == s2 => { + cmp_primitive_array::($LEFT, $RIGHT, $OP) + } + (DataType::Decimal256(_, s1), DataType::Decimal256(_, s2)) if s1 == s2 => { + cmp_primitive_array::($LEFT, $RIGHT, $OP) + } (DataType::Utf8, DataType::Utf8) => { compare_op(as_string_array($LEFT), as_string_array($RIGHT), $OP) } @@ -3348,6 +3355,7 @@ fn new_all_set_buffer(len: usize) -> Buffer { #[rustfmt::skip::macros(vec)] #[cfg(test)] mod tests { + use arrow_buffer::i256; use std::sync::Arc; use super::*; @@ -6644,4 +6652,88 @@ mod tests { BooleanArray::from(vec![Some(true), None, None, Some(true)]) ); } + + #[test] + fn test_decimal128() { + let a = Decimal128Array::from_iter_values([1, 2, 4, 5]); + let b = Decimal128Array::from_iter_values([7, -3, 4, 3]); + let e = BooleanArray::from(vec![false, false, true, false]); + let r = eq(&a, &b).unwrap(); + assert_eq!(e, r); + + let r = eq_dyn(&a, &b).unwrap(); + assert_eq!(e, r); + + let e = BooleanArray::from(vec![true, false, false, false]); + let r = lt(&a, &b).unwrap(); + assert_eq!(e, r); + + let r = lt_dyn(&a, &b).unwrap(); + assert_eq!(e, r); + + let e = BooleanArray::from(vec![true, false, true, false]); + let r = lt_eq(&a, &b).unwrap(); + assert_eq!(e, r); + + let r = lt_eq_dyn(&a, &b).unwrap(); + assert_eq!(e, r); + + let e = BooleanArray::from(vec![false, true, false, true]); + let r = gt(&a, &b).unwrap(); + assert_eq!(e, r); + + let r = gt_dyn(&a, &b).unwrap(); + assert_eq!(e, r); + + let e = BooleanArray::from(vec![false, true, true, true]); + let r = gt_eq(&a, &b).unwrap(); + assert_eq!(e, r); + + let r = gt_eq_dyn(&a, &b).unwrap(); + assert_eq!(e, r); + } + + #[test] + fn test_decimal256() { + let a = Decimal256Array::from_iter_values( + [1, 2, 4, 5].into_iter().map(i256::from_i128), + ); + let b = Decimal256Array::from_iter_values( + [7, -3, 4, 3].into_iter().map(i256::from_i128), + ); + let e = BooleanArray::from(vec![false, false, true, false]); + let r = eq(&a, &b).unwrap(); + assert_eq!(e, r); + + let r = eq_dyn(&a, &b).unwrap(); + assert_eq!(e, r); + + let e = BooleanArray::from(vec![true, false, false, false]); + let r = lt(&a, &b).unwrap(); + assert_eq!(e, r); + + let r = lt_dyn(&a, &b).unwrap(); + assert_eq!(e, r); + + let e = BooleanArray::from(vec![true, false, true, false]); + let r = lt_eq(&a, &b).unwrap(); + assert_eq!(e, r); + + let r = lt_eq_dyn(&a, &b).unwrap(); + assert_eq!(e, r); + + let e = BooleanArray::from(vec![false, true, false, true]); + let r = gt(&a, &b).unwrap(); + assert_eq!(e, r); + + let r = gt_dyn(&a, &b).unwrap(); + assert_eq!(e, r); + + let e = BooleanArray::from(vec![false, true, true, true]); + let r = gt_eq(&a, &b).unwrap(); + assert_eq!(e, r); + + let r = gt_eq_dyn(&a, &b).unwrap(); + assert_eq!(e, r); + } } From 4311904b7e56d5fd5fdf0e675da1eb84e24c8137 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 30 Oct 2022 00:24:30 -0700 Subject: [PATCH 0196/1411] Compare dictionary and non-dictionary decimal arrays (#2980) --- arrow/src/compute/kernels/comparison.rs | 82 +++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 4d8248a8dcf6..1074aaf31546 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -26,6 +26,7 @@ use crate::array::*; use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer}; use 
crate::compute::util::combine_option_bitmap; +#[allow(unused_imports)] use crate::datatypes::{ ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type, @@ -2167,6 +2168,12 @@ macro_rules! typed_cmp_dict_non_dict { (DataType::Float64, DataType::Float64) => { typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Float64Type, $OP_BOOL, $OP_FLOAT) } + (DataType::Decimal128(_, s1), DataType::Decimal128(_, s2)) if s1 == s2 => { + typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Decimal128Type, $OP_BOOL, $OP) + } + (DataType::Decimal256(_, s1), DataType::Decimal256(_, s2)) if s1 == s2 => { + typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Decimal256Type, $OP_BOOL, $OP) + } (DataType::Utf8, DataType::Utf8) => { typed_dict_string_array_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), i32, $OP) } @@ -6653,6 +6660,81 @@ mod tests { ); } + #[test] + #[cfg(feature = "dyn_cmp_dict")] + fn test_cmp_dict_non_dict_decimal128() { + let array1: Decimal128Array = + Decimal128Array::from_iter_values([1, 2, 5, 4, 3, 0]); + + let values = Decimal128Array::from_iter_values([7, -3, 4, 3, 5]); + let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); + let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(true), Some(true), Some(false)], + ); + assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(false), Some(false), Some(true)], + ); + assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(true), Some(true), Some(true)], + ); + assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(false), Some(false), Some(false)], + ); + assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(true), Some(true), Some(false)], + ); + assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); + } + + #[test] + #[cfg(feature = "dyn_cmp_dict")] + fn test_cmp_dict_non_dict_decimal256() { + let array1: Decimal256Array = Decimal256Array::from_iter_values( + [1, 2, 5, 4, 3, 0].into_iter().map(i256::from_i128), + ); + + let values = Decimal256Array::from_iter_values( + [7, -3, 4, 3, 5].into_iter().map(i256::from_i128), + ); + let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); + let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(true), Some(true), Some(false)], + ); + assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(false), Some(false), Some(true)], + ); + assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(true), Some(true), Some(true)], + ); + assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(false), Some(false), Some(false)], + ); + assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(false), 
Some(false), Some(true), Some(true), Some(true), Some(false)], + ); + assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); + } + #[test] fn test_decimal128() { let a = Decimal128Array::from_iter_values([1, 2, 4, 5]); From 3c1f3230e15b0bbf9c0e719a97c8d4d01c3e1d22 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Sun, 30 Oct 2022 17:19:18 -0400 Subject: [PATCH 0197/1411] add clone and equal functions for CastOptions (#2985) Co-authored-by: askoa --- arrow/src/compute/kernels/cast.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 3e55791502c2..4c724b6401b9 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -61,7 +61,7 @@ use num::cast::AsPrimitive; use num::{NumCast, ToPrimitive}; /// CastOptions provides a way to override the default cast behaviors -#[derive(Debug)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct CastOptions { /// how to handle cast failures, either return NULL (safe=true) or return ERR (safe=false) pub safe: bool, From 99e205fcfd0533e1b524f5a0a20c6872eb6cda84 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 30 Oct 2022 18:06:13 -0700 Subject: [PATCH 0198/1411] Compare dictionary decimal arrays (#2982) * Compare dictionary decimal arrays * Use wildcard import --- arrow/src/compute/kernels/comparison.rs | 95 ++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 10 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 1074aaf31546..1806c447b1b9 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -26,16 +26,7 @@ use crate::array::*; use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer}; use crate::compute::util::combine_option_bitmap; -#[allow(unused_imports)] -use crate::datatypes::{ - ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, - Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type, - Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, - IntervalYearMonthType, Time32MillisecondType, Time32SecondType, - Time64MicrosecondType, Time64NanosecondType, TimeUnit, TimestampMicrosecondType, - TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, -}; +use crate::datatypes::*; #[allow(unused_imports)] use crate::downcast_dictionary_array; use crate::error::{ArrowError, Result}; @@ -2388,6 +2379,12 @@ macro_rules! 
typed_dict_cmp { (DataType::Float64, DataType::Float64) => { cmp_dict::<$KT, Float64Type, _>($LEFT, $RIGHT, $OP_FLOAT) } + (DataType::Decimal128(_, s1), DataType::Decimal128(_, s2)) if s1 == s2 => { + cmp_dict::<$KT, Decimal128Type, _>($LEFT, $RIGHT, $OP) + } + (DataType::Decimal256(_, s1), DataType::Decimal256(_, s2)) if s1 == s2 => { + cmp_dict::<$KT, Decimal256Type, _>($LEFT, $RIGHT, $OP) + } (DataType::Utf8, DataType::Utf8) => { cmp_dict_utf8::<$KT, i32, _>($LEFT, $RIGHT, $OP) } @@ -6660,6 +6657,43 @@ mod tests { ); } + #[test] + #[cfg(feature = "dyn_cmp_dict")] + fn test_cmp_dict_decimal128() { + let values = Decimal128Array::from_iter_values([0, 1, 2, 3, 4, 5]); + let keys = Int8Array::from_iter_values([1_i8, 2, 5, 4, 3, 0]); + let array1 = DictionaryArray::try_new(&keys, &values).unwrap(); + + let values = Decimal128Array::from_iter_values([7, -3, 4, 3, 5]); + let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); + let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(true), Some(true), Some(false)], + ); + assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(false), Some(false), Some(true)], + ); + assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(true), Some(true), Some(true)], + ); + assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(false), Some(false), Some(false)], + ); + assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(true), Some(true), Some(false)], + ); + assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); + } + #[test] #[cfg(feature = "dyn_cmp_dict")] fn test_cmp_dict_non_dict_decimal128() { @@ -6696,6 +6730,47 @@ mod tests { assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); } + #[test] + #[cfg(feature = "dyn_cmp_dict")] + fn test_cmp_dict_decimal256() { + let values = Decimal256Array::from_iter_values( + [0, 1, 2, 3, 4, 5].into_iter().map(i256::from_i128), + ); + let keys = Int8Array::from_iter_values([1_i8, 2, 5, 4, 3, 0]); + let array1 = DictionaryArray::try_new(&keys, &values).unwrap(); + + let values = Decimal256Array::from_iter_values( + [7, -3, 4, 3, 5].into_iter().map(i256::from_i128), + ); + let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); + let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(true), Some(true), Some(false)], + ); + assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(false), Some(false), Some(true)], + ); + assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(true), Some(true), Some(true)], + ); + assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(false), Some(false), Some(false)], + ); + assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(true), Some(true), 
Some(false)], + ); + assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); + } + #[test] #[cfg(feature = "dyn_cmp_dict")] fn test_cmp_dict_non_dict_decimal256() { From 3a90654f4cb98ac7fe278991a6cbc11384664e2e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 31 Oct 2022 16:39:21 +1300 Subject: [PATCH 0199/1411] Accept any &dyn Array in nullif kernel (#2940) * Accept any &dyn Array in nullif kernel * Clippy * Update docs --- arrow-buffer/src/buffer/ops.rs | 8 +- arrow/src/compute/kernels/boolean.rs | 411 ++++++++++++++++++++++----- 2 files changed, 346 insertions(+), 73 deletions(-) diff --git a/arrow-buffer/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs index c1295ad9ab7e..87dc5c003fb2 100644 --- a/arrow-buffer/src/buffer/ops.rs +++ b/arrow-buffer/src/buffer/ops.rs @@ -66,10 +66,10 @@ pub fn bitwise_bin_op_helper( right: &Buffer, right_offset_in_bits: usize, len_in_bits: usize, - op: F, + mut op: F, ) -> Buffer where - F: Fn(u64, u64) -> u64, + F: FnMut(u64, u64) -> u64, { let left_chunks = left.bit_chunks(left_offset_in_bits, len_in_bits); let right_chunks = right.bit_chunks(right_offset_in_bits, len_in_bits); @@ -97,10 +97,10 @@ pub fn bitwise_unary_op_helper( left: &Buffer, offset_in_bits: usize, len_in_bits: usize, - op: F, + mut op: F, ) -> Buffer where - F: Fn(u64) -> u64, + F: FnMut(u64) -> u64, { // reserve capacity and set length so we can get a typed view of u64 chunks let mut result = diff --git a/arrow/src/compute/kernels/boolean.rs b/arrow/src/compute/kernels/boolean.rs index 34921ca97eec..ea3b49e8cc03 100644 --- a/arrow/src/compute/kernels/boolean.rs +++ b/arrow/src/compute/kernels/boolean.rs @@ -22,15 +22,18 @@ //! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. -use crate::array::{Array, ArrayData, BooleanArray, PrimitiveArray}; +use crate::array::{Array, ArrayData, BooleanArray}; use crate::buffer::{ bitwise_bin_op_helper, bitwise_quaternary_op_helper, buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer, }; use crate::compute::util::combine_option_bitmap; -use crate::datatypes::{ArrowNumericType, DataType}; +use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; use crate::util::bit_util::ceil; +use arrow_array::builder::BooleanBufferBuilder; +use arrow_array::{make_array, ArrayRef}; +use arrow_buffer::buffer::bitwise_unary_op_helper; /// Updates null buffer based on data buffer and null buffer of the operand at other side /// in boolean AND kernel with Kleene logic. In short, because for AND kernel, null AND false @@ -468,29 +471,23 @@ pub fn is_not_null(input: &dyn Array) -> Result { Ok(BooleanArray::from(data)) } -/// Copies original array, setting null bit to true if a secondary comparison boolean array is set to true. +/// Copies original array, setting validity bit to false if a secondary comparison +/// boolean array is set to true or null +/// /// Typically used to implement NULLIF. -// NOTE: For now this only supports Primitive Arrays. Although the code could be made generic, the issue -// is that currently the bitmap operations result in a final bitmap which is aligned to bit 0, and thus -// the left array's data needs to be sliced to a new offset, and for non-primitive arrays shifting the -// data might be too complicated. 
In the future, to avoid shifting left array's data, we could instead -// shift the final bitbuffer to the right, prepending with 0's instead. -pub fn nullif( - left: &PrimitiveArray, - right: &BooleanArray, -) -> Result> -where - T: ArrowNumericType, -{ - if left.len() != right.len() { +pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { + let left_data = left.data(); + let right_data = right.data(); + + if left_data.len() != right_data.len() { return Err(ArrowError::ComputeError( "Cannot perform comparison operation on arrays of different length" .to_string(), )); } - let left_data = left.data(); + let len = left_data.len(); + let left_offset = left_data.offset(); - // If left has no bitmap, create a new one with all values set for nullity op later // left=0 (null) right=null output bitmap=null // left=0 right=1 output bitmap=null // left=1 (set) right=null output bitmap=set (passthrough) @@ -499,69 +496,72 @@ where // // Thus: result = left null bitmap & (!right_values | !right_bitmap) // OR left null bitmap & !(right_values & right_bitmap) - // - // Do the right expression !(right_values & right_bitmap) first since there are two steps - // TRICK: convert BooleanArray buffer as a bitmap for faster operation - let rcb = match right.data().null_bitmap() { - Some(right_bitmap) => { - let and = buffer_bin_and( - right.values(), - right.offset(), - right_bitmap.buffer(), - right.offset(), - right.len(), - ); - buffer_unary_not(&and, 0, right.len()) - } - None => buffer_unary_not(right.values(), right.offset(), right.len()), - }; - // AND of original left null bitmap with right expression - // Here we take care of the possible offsets of the left and right arrays all at once. - let modified_null_buffer = match left_data.null_bitmap() { - Some(left_null_bitmap) => buffer_bin_and( - left_null_bitmap.buffer(), - left_data.offset(), - &rcb, + // Compute right_values & right_bitmap + let (right, right_offset) = match right_data.null_buffer() { + Some(buffer) => ( + buffer_bin_and( + &right_data.buffers()[0], + right_data.offset(), + buffer, + right_data.offset(), + len, + ), 0, - left_data.len(), ), - None => rcb, + None => (right_data.buffers()[0].clone(), right_data.offset()), }; - // Align/shift left data on offset as needed, since new bitmaps are shifted and aligned to 0 already - // NOTE: this probably only works for primitive arrays. - let data_buffers = if left.offset() == 0 { - left_data.buffers().to_vec() - } else { - // Shift each data buffer by type's bit_width * offset. 
- left_data - .buffers() - .iter() - .map(|buf| buf.slice(left.offset() * T::get_byte_width())) - .collect::>() + // Compute left null bitmap & !right + let mut valid_count = 0; + let combined = match left_data.null_buffer() { + Some(left) => { + bitwise_bin_op_helper(left, left_offset, &right, right_offset, len, |l, r| { + let t = l & !r; + valid_count += t.count_ones() as usize; + t + }) + } + None => bitwise_unary_op_helper(&right, right_offset, len, |b| { + let t = !b; + valid_count += t.count_ones() as usize; + t + }), }; - // Construct new array with same values but modified null bitmap - // TODO: shift data buffer as needed - let data = unsafe { - ArrayData::new_unchecked( - T::DATA_TYPE, - left.len(), - None, // force new to compute the number of null bits - Some(modified_null_buffer), - 0, // No need for offset since left data has been shifted - data_buffers, - left_data.child_data().to_vec(), - ) + // Need to construct null buffer with offset of left + let null_buffer = match left_data.offset() { + 0 => combined, + _ => { + let mut builder = BooleanBufferBuilder::new(len + left_offset); + // Pad with 0s up to offset + builder.resize(left_offset); + builder.append_packed_range(0..len, &combined); + builder.finish() + } }; - Ok(PrimitiveArray::::from(data)) + + let null_count = len - valid_count; + let data = left_data + .clone() + .into_builder() + .null_bit_buffer(Some(null_buffer)) + .null_count(null_count); + + // SAFETY: + // Only altered null mask + Ok(make_array(unsafe { data.build_unchecked() })) } #[cfg(test)] mod tests { use super::*; use crate::array::{ArrayRef, Int32Array}; + use arrow_array::builder::{BooleanBuilder, Int32Builder, StructBuilder}; + use arrow_array::cast::{as_boolean_array, as_primitive_array, as_string_array}; + use arrow_array::types::Int32Type; + use arrow_array::{StringArray, StructArray}; + use arrow_schema::Field; use std::sync::Arc; #[test] @@ -1110,7 +1110,8 @@ mod tests { Some(9), ]); - assert_eq!(expected, res); + let res = as_primitive_array::(&res); + assert_eq!(&expected, res); } #[test] @@ -1136,6 +1137,278 @@ mod tests { Some(8), // None => keep it None, // true => None ]); - assert_eq!(&expected, &res) + let res = as_primitive_array::(&res); + assert_eq!(&expected, res) + } + + #[test] + fn test_nullif_string() { + let s = StringArray::from_iter([ + Some("hello"), + None, + Some("world"), + Some("a"), + Some("b"), + None, + None, + ]); + let select = BooleanArray::from_iter([ + Some(true), + Some(true), + Some(false), + Some(true), + Some(false), + Some(false), + None, + ]); + + let a = nullif(&s, &select).unwrap(); + let r: Vec<_> = as_string_array(&a).iter().collect(); + assert_eq!( + r, + vec![None, None, Some("world"), None, Some("b"), None, None] + ); + + let s = s.slice(2, 3); + let select = select.slice(1, 3); + let select = as_boolean_array(select.as_ref()); + let a = nullif(s.as_ref(), select).unwrap(); + let r: Vec<_> = as_string_array(&a).iter().collect(); + assert_eq!(r, vec![None, Some("a"), None]); + } + + #[test] + fn test_nullif_int_large_left_offset() { + let a = Int32Array::from(vec![ + Some(-1), // 0 + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), // 8 + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + None, // 16 + Some(15), // 17 + Some(8), + Some(1), + Some(9), + ]); + let a = a.slice(17, 3); // Some(15), Some(8), Some(1) + + let comp = BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + None, + Some(true), + 
Some(false), + None, + ]); + let comp = comp.slice(2, 3); // Some(false), None, Some(true) + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(&a, comp).unwrap(); + let res = res.as_any().downcast_ref::().unwrap(); + + let expected = Int32Array::from(vec![ + Some(15), // False => keep it + Some(8), // None => keep it + None, // true => None + ]); + assert_eq!(&expected, res) + } + + #[test] + fn test_nullif_int_large_right_offset() { + let a = Int32Array::from(vec![ + None, // 0 + Some(15), // 1 + Some(8), + Some(1), + Some(9), + ]); + let a = a.slice(1, 3); // Some(15), Some(8), Some(1) + + let comp = BooleanArray::from(vec![ + Some(false), // 0 + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), // 8 + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), // 16 + Some(false), // 17 + Some(false), // 18 + None, + Some(true), + Some(false), + None, + ]); + let comp = comp.slice(18, 3); // Some(false), None, Some(true) + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(&a, comp).unwrap(); + let res = res.as_any().downcast_ref::().unwrap(); + + let expected = Int32Array::from(vec![ + Some(15), // False => keep it + Some(8), // None => keep it + None, // true => None + ]); + assert_eq!(&expected, res) + } + + #[test] + fn test_nullif_boolean_offset() { + let a = BooleanArray::from(vec![ + None, // 0 + Some(true), // 1 + Some(false), + Some(true), + Some(true), + ]); + let a = a.slice(1, 3); // Some(true), Some(false), Some(true) + + let comp = BooleanArray::from(vec![ + Some(false), // 0 + Some(false), // 1 + Some(false), // 2 + None, + Some(true), + Some(false), + None, + ]); + let comp = comp.slice(2, 3); // Some(false), None, Some(true) + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(&a, comp).unwrap(); + let res = res.as_any().downcast_ref::().unwrap(); + + let expected = BooleanArray::from(vec![ + Some(true), // False => keep it + Some(false), // None => keep it + None, // true => None + ]); + assert_eq!(&expected, res) + } + + struct Foo { + a: Option, + b: Option, + /// Whether the entry should be valid. + is_valid: bool, + } + + impl Foo { + fn new_valid(a: i32, b: bool) -> Foo { + Self { + a: Some(a), + b: Some(b), + is_valid: true, + } + } + + fn new_null() -> Foo { + Self { + a: None, + b: None, + is_valid: false, + } + } + } + + /// Struct Array equality is a bit weird -- we need to have the *child values* + /// correct even if the enclosing struct indicates it is null. But we + /// also need the top level is_valid bits to be correct. 
+ fn create_foo_struct(values: Vec) -> StructArray { + let mut struct_array = StructBuilder::new( + vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Boolean, true), + ], + vec![ + Box::new(Int32Builder::with_capacity(values.len())), + Box::new(BooleanBuilder::with_capacity(values.len())), + ], + ); + + for value in values { + struct_array + .field_builder::(0) + .unwrap() + .append_option(value.a); + struct_array + .field_builder::(1) + .unwrap() + .append_option(value.b); + struct_array.append(value.is_valid); + } + + struct_array.finish() + } + + #[test] + fn test_nullif_struct_slices() { + let struct_array = create_foo_struct(vec![ + Foo::new_valid(7, true), + Foo::new_valid(15, false), + Foo::new_valid(8, true), + Foo::new_valid(12, false), + Foo::new_null(), + Foo::new_null(), + Foo::new_valid(42, true), + ]); + + // Some({a: 15, b: false}), Some({a: 8, b: true}), Some({a: 12, b: false}), + // None, None + let struct_array = struct_array.slice(1, 5); + let comp = BooleanArray::from(vec![ + Some(false), // 0 + Some(false), // 1 + Some(false), // 2 + None, + Some(true), + Some(false), + None, + ]); + let comp = comp.slice(2, 5); // Some(false), None, Some(true), Some(false), None + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(&struct_array, comp).unwrap(); + let res = res.as_any().downcast_ref::().unwrap(); + + let expected = create_foo_struct(vec![ + // Some(false) -> keep + Foo::new_valid(15, false), + // None -> keep + Foo::new_valid(8, true), + // Some(true) -> null out. But child values are still there. + Foo { + a: Some(12), + b: Some(false), + is_valid: false, + }, + // Some(false) -> keep, but was null + Foo::new_null(), + // None -> keep, but was null + Foo::new_null(), + ]); + + assert_eq!(&expected, res); } } From 3db7f427ec6b41c910958dc0333cc298405628eb Mon Sep 17 00:00:00 2001 From: jakevin Date: Mon, 31 Oct 2022 12:35:55 +0800 Subject: [PATCH 0200/1411] minor: remove redundant prefix (#2983) * minor: remove redundant prefix * typo --- arrow-buffer/src/alloc/alignment.rs | 2 +- arrow-data/src/data.rs | 4 ++-- arrow-flight/build.rs | 2 +- arrow-flight/src/lib.rs | 2 +- arrow-flight/src/sql/mod.rs | 2 +- arrow-flight/src/sql/server.rs | 22 ++++++++++------------ arrow-schema/src/error.rs | 6 +++--- 7 files changed, 19 insertions(+), 21 deletions(-) diff --git a/arrow-buffer/src/alloc/alignment.rs b/arrow-buffer/src/alloc/alignment.rs index 1bd15c54b990..7978baa2bbd8 100644 --- a/arrow-buffer/src/alloc/alignment.rs +++ b/arrow-buffer/src/alloc/alignment.rs @@ -18,7 +18,7 @@ // NOTE: Below code is written for spatial/temporal prefetcher optimizations. Memory allocation // should align well with usage pattern of cache access and block sizes on layers of storage levels from // registers to non-volatile memory. These alignments are all cache aware alignments incorporated -// from [cuneiform](https://crates.io/crates/cuneiform) crate. This approach mimicks Intel TBB's +// from [cuneiform](https://crates.io/crates/cuneiform) crate. This approach mimics Intel TBB's // cache_aligned_allocator which exploits cache locality and minimizes prefetch signals // resulting in less round trip time between the layers of storage.
// For further info: https://software.intel.com/en-us/node/506094 diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index b53e9f0af4de..902bfbf67239 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -751,7 +751,7 @@ impl ArrayData { ) -> Result<&[T], ArrowError> { let buffer = &self.buffers[idx]; - let required_len = (len + self.offset) * std::mem::size_of::(); + let required_len = (len + self.offset) * mem::size_of::(); if buffer.len() < required_len { return Err(ArrowError::InvalidArgumentError(format!( @@ -1170,7 +1170,7 @@ impl ArrayData { // This should have been checked as part of `validate()` prior // to calling `validate_full()` but double check to be sure - assert!(buffer.len() / std::mem::size_of::() >= required_len); + assert!(buffer.len() / mem::size_of::() >= required_len); // Justification: buffer size was validated above let indexes: &[T] = diff --git a/arrow-flight/build.rs b/arrow-flight/build.rs index 4ceb298359db..bc20100ab37f 100644 --- a/arrow-flight/build.rs +++ b/arrow-flight/build.rs @@ -64,7 +64,7 @@ fn main() -> Result<(), Box> { let proto_path = Path::new("../format/FlightSql.proto"); tonic_build::configure() - // protoc in unbuntu builder needs this option + // protoc in ubuntu builder needs this option .protoc_arg("--experimental_allow_proto3_optional") .out_dir("src/sql") .compile(&[proto_path], &[proto_dir])?; diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 054981707085..1f4bcc6c434c 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -292,7 +292,7 @@ fn schema_to_ipc_format(schema_ipc: SchemaAsIpc) -> ArrowResult { let encoded_data = flight_schema_as_encoded_data(pair.0, pair.1); let mut schema = vec![]; - arrow::ipc::writer::write_message(&mut schema, encoded_data, pair.1)?; + writer::write_message(&mut schema, encoded_data, pair.1)?; Ok(IpcMessage(schema)) } diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index cd198a1401d1..30bdcb5604ff 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -137,7 +137,7 @@ impl ProstAnyExt for prost_types::Any { if !self.is::() { return Ok(None); } - let m = prost::Message::decode(&*self.value).map_err(|err| { + let m = Message::decode(&*self.value).map_err(|err| { ArrowError::ParseError(format!("Unable to decode Any value: {}", err)) })?; Ok(Some(m)) diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index f3208d376497..525c721aa2b5 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -41,9 +41,7 @@ static CLOSE_PREPARED_STATEMENT: &str = "ClosePreparedStatement"; /// Implements FlightSqlService to handle the flight sql protocol #[tonic::async_trait] -pub trait FlightSqlService: - std::marker::Sync + std::marker::Send + std::marker::Sized + 'static -{ +pub trait FlightSqlService: Sync + Send + Sized + 'static { /// When impl FlightSqlService, you can always set FlightService to Self type FlightService: FlightService; @@ -276,7 +274,7 @@ pub trait FlightSqlService: #[tonic::async_trait] impl FlightService for T where - T: FlightSqlService + std::marker::Send, + T: FlightSqlService + Send, { type HandshakeStream = Pin> + Send + 'static>>; @@ -413,7 +411,7 @@ where &self, request: Request, ) -> Result, Status> { - let msg: prost_types::Any = prost::Message::decode(&*request.get_ref().ticket) + let msg: prost_types::Any = Message::decode(&*request.get_ref().ticket) .map_err(decode_error_to_status)?; fn unpack(msg: prost_types::Any) -> Result { @@ -465,7 
+463,7 @@ where ) -> Result, Status> { let cmd = request.get_mut().message().await?.unwrap(); let message: prost_types::Any = - prost::Message::decode(&*cmd.flight_descriptor.unwrap().cmd) + Message::decode(&*cmd.flight_descriptor.unwrap().cmd) .map_err(decode_error_to_status)?; if message.is::() { let token = message @@ -474,7 +472,7 @@ where .expect("unreachable"); let record_count = self.do_put_statement_update(token, request).await?; let result = DoPutUpdateResult { record_count }; - let output = futures::stream::iter(vec![Ok(super::super::gen::PutResult { + let output = futures::stream::iter(vec![Ok(PutResult { app_metadata: result.encode_to_vec(), })]); return Ok(Response::new(Box::pin(output))); @@ -495,7 +493,7 @@ where .do_put_prepared_statement_update(handle, request) .await?; let result = DoPutUpdateResult { record_count }; - let output = futures::stream::iter(vec![Ok(super::super::gen::PutResult { + let output = futures::stream::iter(vec![Ok(PutResult { app_metadata: result.encode_to_vec(), })]); return Ok(Response::new(Box::pin(output))); @@ -587,10 +585,10 @@ where } } -fn decode_error_to_status(err: prost::DecodeError) -> tonic::Status { - tonic::Status::invalid_argument(format!("{:?}", err)) +fn decode_error_to_status(err: prost::DecodeError) -> Status { + Status::invalid_argument(format!("{:?}", err)) } -fn arrow_error_to_status(err: arrow::error::ArrowError) -> tonic::Status { - tonic::Status::internal(format!("{:?}", err)) +fn arrow_error_to_status(err: arrow::error::ArrowError) -> Status { + Status::internal(format!("{:?}", err)) } diff --git a/arrow-schema/src/error.rs b/arrow-schema/src/error.rs index 105d4d5e21f0..0d7a35a9dee2 100644 --- a/arrow-schema/src/error.rs +++ b/arrow-schema/src/error.rs @@ -50,19 +50,19 @@ impl ArrowError { } } -impl From<::std::io::Error> for ArrowError { +impl From for ArrowError { fn from(error: std::io::Error) -> Self { ArrowError::IoError(error.to_string()) } } -impl From<::std::string::FromUtf8Error> for ArrowError { +impl From for ArrowError { fn from(error: std::string::FromUtf8Error) -> Self { ArrowError::ParseError(error.to_string()) } } -impl From<::std::io::IntoInnerError> for ArrowError { +impl From> for ArrowError { fn from(error: std::io::IntoInnerError) -> Self { ArrowError::IoError(error.to_string()) } From 40d61ec07ad56e355205fe8dbcb074563e30c09e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 1 Nov 2022 03:03:52 +1300 Subject: [PATCH 0201/1411] Mark parquet predicate pushdown as complete (#2987) --- parquet/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/README.md b/parquet/README.md index cd642317a12e..d904fc64e744 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -66,7 +66,7 @@ The `parquet` crate provides the following features which may be enabled in your - [ ] Row record writer - [x] Arrow record writer - [ ] Async support -- [ ] Predicate pushdown +- [x] Predicate pushdown - [x] Parquet format 4.0.0 support ## Support for `wasm32-unknown-unknown` target From 66c9636742162f832b434a513769e158f9723e67 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 31 Oct 2022 13:26:15 -0400 Subject: [PATCH 0202/1411] Fix ignored limit on `lexsort_to_indices` (#2991) * Fix ignored limit on lexsort_to_indices * Update comments * Update arrow/src/compute/kernels/sort.rs Co-authored-by: Batuhan Taskaya Co-authored-by: Batuhan Taskaya --- arrow/src/compute/kernels/sort.rs | 40 +++++++++++++++++++++++++------ 1 file changed, 33 
insertions(+), 7 deletions(-) diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index b297622647e7..a10e674ac9d1 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -950,7 +950,7 @@ pub fn lexsort_to_indices( }); Ok(UInt32Array::from_iter_values( - value_indices.iter().map(|i| *i as u32), + value_indices.iter().take(len).map(|i| *i as u32), )) } @@ -1422,6 +1422,18 @@ mod tests { } } + /// slice all arrays in expected_output to offset/length + fn slice_arrays( + expected_output: Vec, + offset: usize, + length: usize, + ) -> Vec { + expected_output + .into_iter() + .map(|array| array.slice(offset, length)) + .collect() + } + fn test_sort_binary_arrays( data: Vec>>, options: Option, @@ -3439,8 +3451,10 @@ mod tests { Some(2), Some(17), ])) as ArrayRef]; - test_lex_sort_arrays(input.clone(), expected, None); + test_lex_sort_arrays(input.clone(), expected.clone(), None); + test_lex_sort_arrays(input.clone(), slice_arrays(expected, 0, 2), Some(2)); + // Explicitly test a limit on the sort as a demonstration let expected = vec![Arc::new(PrimitiveArray::::from(vec![ Some(-1), Some(0), @@ -3519,7 +3533,8 @@ mod tests { Some(-2), ])) as ArrayRef, ]; - test_lex_sort_arrays(input, expected, None); + test_lex_sort_arrays(input.clone(), expected.clone(), None); + test_lex_sort_arrays(input, slice_arrays(expected, 0, 2), Some(2)); // test mix of string and in64 with option let input = vec![ @@ -3562,7 +3577,8 @@ mod tests { Some("7"), ])) as ArrayRef, ]; - test_lex_sort_arrays(input, expected, None); + test_lex_sort_arrays(input.clone(), expected.clone(), None); + test_lex_sort_arrays(input, slice_arrays(expected, 0, 3), Some(3)); // test sort with nulls first let input = vec![ @@ -3605,7 +3621,8 @@ mod tests { Some("world"), ])) as ArrayRef, ]; - test_lex_sort_arrays(input, expected, None); + test_lex_sort_arrays(input.clone(), expected.clone(), None); + test_lex_sort_arrays(input, slice_arrays(expected, 0, 1), Some(1)); // test sort with nulls last let input = vec![ @@ -3648,7 +3665,8 @@ mod tests { None, ])) as ArrayRef, ]; - test_lex_sort_arrays(input, expected, None); + test_lex_sort_arrays(input.clone(), expected.clone(), None); + test_lex_sort_arrays(input, slice_arrays(expected, 0, 2), Some(2)); // test sort with opposite options let input = vec![ @@ -3695,7 +3713,15 @@ mod tests { Some("foo"), ])) as ArrayRef, ]; - test_lex_sort_arrays(input, expected, None); + test_lex_sort_arrays(input.clone(), expected.clone(), None); + test_lex_sort_arrays( + input.clone(), + slice_arrays(expected.clone(), 0, 5), + Some(5), + ); + + // Limiting by more rows than present is ok + test_lex_sort_arrays(input, slice_arrays(expected, 0, 5), Some(10)); } #[test] From 363d3fad2d279a8f641414a86f9c89d7c37193cf Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 1 Nov 2022 11:28:31 +1300 Subject: [PATCH 0203/1411] Faster dictionary sort (#2993) --- arrow/src/compute/kernels/sort.rs | 51 ++++++++++++++----------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index a10e674ac9d1..97b0758e5dc7 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -24,7 +24,6 @@ use crate::datatypes::*; use crate::downcast_dictionary_array; use crate::error::{ArrowError, Result}; use std::cmp::Ordering; -use std::collections::HashMap; use TimeUnit::*; /// Sort the `ArrayRef` using 
`SortOptions`. @@ -114,7 +113,7 @@ where } } -fn cmp(l: T, r: T) -> std::cmp::Ordering +fn cmp(l: T, r: T) -> Ordering where T: Ord, { @@ -340,13 +339,13 @@ pub fn sort_to_indices( dt if DataType::is_primitive(dt) => { let dict_values = values.values(); let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; - let value_indices_map = prepare_indices_map(&sorted_value_indices); + let value_indices_map = sorted_rank(&sorted_value_indices); sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp) }, DataType::Utf8 => { let dict_values = values.values(); let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; - let value_indices_map = prepare_indices_map(&sorted_value_indices); + let value_indices_map = sorted_rank(&sorted_value_indices); sort_string_dictionary::<_>(values, &value_indices_map, v, n, &options, limit) }, t => return Err(ArrowError::ComputeError(format!( @@ -490,8 +489,8 @@ fn sort_primitive( ) -> UInt32Array where T: ArrowPrimitiveType, - T::Native: std::cmp::PartialOrd, - F: Fn(T::Native, T::Native) -> std::cmp::Ordering, + T::Native: PartialOrd, + F: Fn(T::Native, T::Native) -> Ordering, { // create tuples that are used for sorting let valids = { @@ -504,24 +503,22 @@ where sort_primitive_inner(values.len(), null_indices, cmp, options, limit, valids) } -/// A helper function used to convert sorted value indices to a map that we can look up sorted order -/// for a value index later. -fn prepare_indices_map(sorted_value_indices: &UInt32Array) -> HashMap { - sorted_value_indices - .into_iter() - .enumerate() - .map(|(idx, index)| { - // Indices don't have None value - let index = index.unwrap(); - (index as usize, idx as u32) - }) - .collect::>() +/// Given a list of indices that yield a sorted order, returns the ordered +/// rank of each index +/// +/// e.g. 
[2, 4, 3, 1, 0] -> [4, 3, 0, 2, 1] +fn sorted_rank(sorted_value_indices: &UInt32Array) -> Vec { + assert_eq!(sorted_value_indices.null_count(), 0); + let sorted_indices = sorted_value_indices.values(); + let mut out: Vec<_> = (0..sorted_indices.len() as u32).collect(); + out.sort_unstable_by_key(|x| sorted_indices[*x as usize]); + out } /// Sort dictionary encoded primitive values fn sort_primitive_dictionary( values: &DictionaryArray, - value_indices_map: &HashMap, + value_indices_map: &[u32], value_indices: Vec, null_indices: Vec, options: SortOptions, @@ -530,7 +527,7 @@ fn sort_primitive_dictionary( ) -> UInt32Array where K: ArrowDictionaryKeyType, - F: Fn(u32, u32) -> std::cmp::Ordering, + F: Fn(u32, u32) -> Ordering, { let keys: &PrimitiveArray = values.keys(); @@ -539,8 +536,7 @@ where .into_iter() .map(|index| { let key: K::Native = keys.value(index as usize); - let value_order = value_indices_map.get(&key.as_usize()).unwrap(); - (index, *value_order) + (index, value_indices_map[key.as_usize()]) }) .collect::>(); @@ -558,8 +554,8 @@ fn sort_primitive_inner( ) -> UInt32Array where T: ArrowNativeType, - T: std::cmp::PartialOrd, - F: Fn(T, T) -> std::cmp::Ordering, + T: PartialOrd, + F: Fn(T, T) -> Ordering, { let mut nulls = null_indices; @@ -651,7 +647,7 @@ fn sort_string( /// Sort dictionary encoded strings fn sort_string_dictionary( values: &DictionaryArray, - value_indices_map: &HashMap, + value_indices_map: &[u32], value_indices: Vec, null_indices: Vec, options: &SortOptions, @@ -664,8 +660,7 @@ fn sort_string_dictionary( .into_iter() .map(|index| { let key: T::Native = keys.value(index as usize); - let value_order = value_indices_map.get(&key.as_usize()).unwrap(); - (index, *value_order) + (index, value_indices_map[key.as_usize()]) }) .collect::>(); @@ -723,7 +718,7 @@ fn sort_list( where S: OffsetSizeTrait, T: ArrowPrimitiveType, - T::Native: std::cmp::PartialOrd, + T::Native: PartialOrd, { sort_list_inner::(values, value_indices, null_indices, options, limit) } From 4980c3540cd1897d4dfcf2554b8b6e4e80e2bc5d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 31 Oct 2022 18:47:03 -0400 Subject: [PATCH 0204/1411] Add entry to changelog for 26.0.0 RC2 fix (#2992) --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 600e96b1d7ca..42cd59975921 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -73,6 +73,7 @@ **Merged pull requests:** +- Fix ignored limit on lexsort\_to\_indices (#2991) [\#2991](https://github.com/apache/arrow-rs/pull/2991) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) - Fix GenericListArray::try\_new\_from\_array\_data error message \(\#526\) [\#2961](https://github.com/apache/arrow-rs/pull/2961) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Fix take string on sliced indices [\#2960](https://github.com/apache/arrow-rs/pull/2960) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Add BooleanArray::true\_count and BooleanArray::false\_count [\#2957](https://github.com/apache/arrow-rs/pull/2957) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) From c7f97c295fdddc8e3eb149ed86b7c0588528a6be Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 2 Nov 2022 07:00:37 +1300 Subject: [PATCH 0205/1411] Specialize interleave for byte arrays (#2864) (#2975) --- 
arrow-select/src/interleave.rs | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs index 9b3de8501326..95b694aba732 100644 --- a/arrow-select/src/interleave.rs +++ b/arrow-select/src/interleave.rs @@ -16,11 +16,9 @@ // under the License. use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; -use arrow_array::{ - downcast_primitive, make_array, new_empty_array, Array, ArrayRef, ArrowPrimitiveType, - GenericStringArray, OffsetSizeTrait, PrimitiveArray, -}; -use arrow_buffer::{Buffer, MutableBuffer}; +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::transform::MutableArrayData; use arrow_data::ArrayDataBuilder; use arrow_schema::{ArrowError, DataType}; @@ -85,8 +83,10 @@ pub fn interleave( downcast_primitive! { data_type => (primitive_helper, values, indices, data_type), - DataType::Utf8 => interleave_string::(values, indices, data_type), - DataType::LargeUtf8 => interleave_string::(values, indices, data_type), + DataType::Utf8 => interleave_bytes::(values, indices), + DataType::LargeUtf8 => interleave_bytes::(values, indices), + DataType::Binary => interleave_bytes::(values, indices), + DataType::LargeBinary => interleave_bytes::(values, indices), _ => interleave_fallback(values, indices) } } @@ -156,29 +156,28 @@ fn interleave_primitive( Ok(Arc::new(PrimitiveArray::::from(data))) } -fn interleave_string( +fn interleave_bytes( values: &[&dyn Array], indices: &[(usize, usize)], - data_type: &DataType, ) -> Result { - let interleaved = Interleave::<'_, GenericStringArray>::new(values, indices); + let interleaved = Interleave::<'_, GenericByteArray>::new(values, indices); let mut capacity = 0; - let mut offsets = BufferBuilder::::new(indices.len() + 1); - offsets.append(O::from_usize(0).unwrap()); + let mut offsets = BufferBuilder::::new(indices.len() + 1); + offsets.append(T::Offset::from_usize(0).unwrap()); for (a, b) in indices { let o = interleaved.arrays[*a].value_offsets(); let element_len = o[*b + 1].as_usize() - o[*b].as_usize(); capacity += element_len; - offsets.append(O::from_usize(capacity).expect("overflow")); + offsets.append(T::Offset::from_usize(capacity).expect("overflow")); } let mut values = MutableBuffer::new(capacity); for (a, b) in indices { - values.extend_from_slice(interleaved.arrays[*a].value(*b).as_bytes()); + values.extend_from_slice(interleaved.arrays[*a].value(*b).as_ref()); } - let builder = ArrayDataBuilder::new(data_type.clone()) + let builder = ArrayDataBuilder::new(T::DATA_TYPE) .len(indices.len()) .add_buffer(offsets.finish()) .add_buffer(values.into()) @@ -186,7 +185,7 @@ fn interleave_string( .null_count(interleaved.null_count); let data = unsafe { builder.build_unchecked() }; - Ok(Arc::new(GenericStringArray::::from(data))) + Ok(Arc::new(GenericByteArray::::from(data))) } /// Fallback implementation of interleave using [`MutableArrayData`] From 62e878e12229c7bc911e3096390fd72a8e20bda2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 2 Nov 2022 07:53:17 +1300 Subject: [PATCH 0206/1411] Specialize filter kernel for binary arrays (#2969) (#2971) * Generalize filter byte array (#2969) * Fix doc * Update comment --- arrow-select/src/filter.rs | 56 +++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 
71175ca5788d..4596afc8791f 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -17,12 +17,11 @@ //! Defines filter kernels -use std::ops::AddAssign; use std::sync::Arc; -use num::Zero; - use arrow_array::builder::BooleanBufferBuilder; +use arrow_array::cast::{as_generic_binary_array, as_largestring_array, as_string_array}; +use arrow_array::types::ByteArrayType; use arrow_array::*; use arrow_buffer::bit_util; use arrow_buffer::{buffer::buffer_bin_and, Buffer, MutableBuffer}; @@ -355,18 +354,16 @@ fn filter_array( Ok(Arc::new(filter_boolean(values, predicate))) } DataType::Utf8 => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(filter_string::(values, predicate))) + Ok(Arc::new(filter_bytes(as_string_array(values), predicate))) } DataType::LargeUtf8 => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(filter_string::(values, predicate))) + Ok(Arc::new(filter_bytes(as_largestring_array(values), predicate))) + } + DataType::Binary => { + Ok(Arc::new(filter_bytes(as_generic_binary_array::(values), predicate))) + } + DataType::LargeBinary => { + Ok(Arc::new(filter_bytes(as_generic_binary_array::(values), predicate))) } DataType::Dictionary(_, _) => downcast_dictionary_array! { values => Ok(Arc::new(filter_dict(values, predicate))), @@ -545,11 +542,11 @@ where PrimitiveArray::from(data) } -/// [`FilterString`] is created from a source [`GenericStringArray`] and can be -/// used to build a new [`GenericStringArray`] by copying values from the source +/// [`FilterBytes`] is created from a source [`GenericByteArray`] and can be +/// used to build a new [`GenericByteArray`] by copying values from the source /// /// TODO(raphael): Could this be used for the take kernel as well? -struct FilterString<'a, OffsetSize> { +struct FilterBytes<'a, OffsetSize> { src_offsets: &'a [OffsetSize], src_values: &'a [u8], dst_offsets: MutableBuffer, @@ -557,15 +554,18 @@ struct FilterString<'a, OffsetSize> { cur_offset: OffsetSize, } -impl<'a, OffsetSize> FilterString<'a, OffsetSize> +impl<'a, OffsetSize> FilterBytes<'a, OffsetSize> where - OffsetSize: Zero + AddAssign + OffsetSizeTrait, + OffsetSize: OffsetSizeTrait, { - fn new(capacity: usize, array: &'a GenericStringArray) -> Self { + fn new(capacity: usize, array: &'a GenericByteArray) -> Self + where + T: ByteArrayType, + { let num_offsets_bytes = (capacity + 1) * std::mem::size_of::(); let mut dst_offsets = MutableBuffer::new(num_offsets_bytes); let dst_values = MutableBuffer::new(0); - let cur_offset = OffsetSize::zero(); + let cur_offset = OffsetSize::from_usize(0).unwrap(); dst_offsets.push(cur_offset); Self { @@ -622,21 +622,21 @@ where } } -/// `filter` implementation for string arrays +/// `filter` implementation for byte arrays /// /// Note: NULLs with a non-zero slot length in `array` will have the corresponding /// data copied across. 
This allows handling the null mask separately from the data -fn filter_string( - array: &GenericStringArray, +fn filter_bytes( + array: &GenericByteArray, predicate: &FilterPredicate, -) -> GenericStringArray +) -> GenericByteArray where - OffsetSize: Zero + AddAssign + OffsetSizeTrait, + T: ByteArrayType, { let data = array.data(); assert_eq!(data.buffers().len(), 2); assert_eq!(data.child_data().len(), 0); - let mut filter = FilterString::new(predicate.count, array); + let mut filter = FilterBytes::new(predicate.count, array); match &predicate.strategy { IterationStrategy::SlicesIterator => { @@ -650,7 +650,7 @@ where IterationStrategy::All | IterationStrategy::None => unreachable!(), } - let mut builder = ArrayDataBuilder::new(data.data_type().clone()) + let mut builder = ArrayDataBuilder::new(T::DATA_TYPE) .len(predicate.count) .add_buffer(filter.dst_offsets.into()) .add_buffer(filter.dst_values.into()); @@ -660,7 +660,7 @@ where } let data = unsafe { builder.build_unchecked() }; - GenericStringArray::from(data) + GenericByteArray::from(data) } /// `filter` implementation for dictionaries From f11372cb8ff4d6fecbe1bd7b5ef3d66cba719c83 Mon Sep 17 00:00:00 2001 From: Yang Jiang Date: Thu, 3 Nov 2022 00:09:13 +0800 Subject: [PATCH 0207/1411] Add `RowSelection::from_selectors_and_combine` to merge RowSelectors (#2994) * Support merge RowSelectors when creating RowSelection * remove useless * change it to default from * Update parquet/src/arrow/arrow_reader/selection.rs Co-authored-by: Andrew Lamb * fix comment Co-authored-by: Andrew Lamb --- parquet/src/arrow/arrow_reader/selection.rs | 124 +++++++++++++++++++- 1 file changed, 123 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index f1270926bf4e..2328c4501598 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -117,6 +117,38 @@ impl RowSelection { Self { selectors } } + /// Creates a [`RowSelection`] from a slice of uncombined `RowSelector`: + /// Like [skip(5),skip(5),read(10)]. + /// After combine will return [skip(10),read(10)] + /// # Note + /// [`RowSelection`] must be combined prior to use within offset_index or else the code will panic. 
+ fn from_selectors_and_combine(selectors: &[RowSelector]) -> Self { + if selectors.len() < 2 { + return Self { + selectors: Vec::from(selectors), + }; + } + let first = selectors.first().unwrap(); + let mut sum_rows = first.row_count; + let mut skip = first.skip; + let mut combined_result = vec![]; + + for s in selectors.iter().skip(1) { + if s.skip == skip { + sum_rows += s.row_count + } else { + add_selector(skip, sum_rows, &mut combined_result); + sum_rows = s.row_count; + skip = s.skip; + } + } + add_selector(skip, sum_rows, &mut combined_result); + + Self { + selectors: combined_result, + } + } + /// Given an offset index, return the offset ranges for all data pages selected by `self` #[cfg(any(test, feature = "async"))] pub(crate) fn scan_ranges( @@ -307,7 +339,7 @@ impl RowSelection { impl From> for RowSelection { fn from(selectors: Vec) -> Self { - Self { selectors } + Self::from_selectors_and_combine(selectors.as_slice()) } } @@ -317,6 +349,15 @@ impl From for VecDeque { } } +fn add_selector(skip: bool, sum_row: usize, combined_result: &mut Vec) { + let selector = if skip { + RowSelector::skip(sum_row) + } else { + RowSelector::select(sum_row) + }; + combined_result.push(selector); +} + #[cfg(test)] mod tests { use super::*; @@ -470,6 +511,87 @@ mod tests { ); } + #[test] + fn test_combine() { + let a = vec![ + RowSelector::skip(3), + RowSelector::skip(3), + RowSelector::select(10), + RowSelector::skip(4), + ]; + + let b = vec![ + RowSelector::skip(3), + RowSelector::skip(3), + RowSelector::select(10), + RowSelector::skip(4), + RowSelector::skip(0), + ]; + + let c = vec![ + RowSelector::skip(2), + RowSelector::skip(4), + RowSelector::select(3), + RowSelector::select(3), + RowSelector::select(4), + RowSelector::skip(3), + RowSelector::skip(1), + RowSelector::skip(0), + ]; + + let expected = RowSelection::from(vec![ + RowSelector::skip(6), + RowSelector::select(10), + RowSelector::skip(4), + ]); + + assert_eq!(RowSelection::from_selectors_and_combine(&a), expected); + assert_eq!(RowSelection::from_selectors_and_combine(&b), expected); + assert_eq!(RowSelection::from_selectors_and_combine(&c), expected); + } + + #[test] + fn test_combine_2elements() { + let a = vec![RowSelector::select(10), RowSelector::select(5)]; + let a_expect = vec![RowSelector::select(15)]; + assert_eq!( + RowSelection::from_selectors_and_combine(&a).selectors, + a_expect + ); + + let b = vec![RowSelector::select(10), RowSelector::skip(5)]; + let b_expect = vec![RowSelector::select(10), RowSelector::skip(5)]; + assert_eq!( + RowSelection::from_selectors_and_combine(&b).selectors, + b_expect + ); + + let c = vec![RowSelector::skip(10), RowSelector::select(5)]; + let c_expect = vec![RowSelector::skip(10), RowSelector::select(5)]; + assert_eq!( + RowSelection::from_selectors_and_combine(&c).selectors, + c_expect + ); + + let d = vec![RowSelector::skip(10), RowSelector::skip(5)]; + let d_expect = vec![RowSelector::skip(15)]; + assert_eq!( + RowSelection::from_selectors_and_combine(&d).selectors, + d_expect + ); + } + + #[test] + fn test_from_one_and_empty() { + let a = vec![RowSelector::select(10)]; + let selection1 = RowSelection::from(a.clone()); + assert_eq!(selection1.selectors, a); + + let b = vec![]; + let selection1 = RowSelection::from(b.clone()); + assert_eq!(selection1.selectors, b) + } + #[test] #[should_panic(expected = "selection exceeds the number of selected rows")] fn test_and_longer() { From b1050b7dce07a299c5f206e612c5eaa27277c563 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 3 Nov 2022 08:03:26 +1300 Subject: [PATCH 0208/1411] Update chrono-tz requirement from 0.7 to 0.8 (#3001) Updates the requirements on [chrono-tz](https://github.com/chronotope/chrono-tz) to permit the latest version. - [Release notes](https://github.com/chronotope/chrono-tz/releases) - [Changelog](https://github.com/chronotope/chrono-tz/blob/main/CHANGELOG.md) - [Commits](https://github.com/chronotope/chrono-tz/commits) --- updated-dependencies: - dependency-name: chrono-tz dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-array/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index f39899c70942..21f66c87ebd0 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -49,7 +49,7 @@ arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } arrow-schema = { version = "26.0.0", path = "../arrow-schema" } arrow-data = { version = "26.0.0", path = "../arrow-data" } chrono = { version = "0.4", default-features = false, features = ["clock"] } -chrono-tz = { version = "0.7", optional = true } +chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } hashbrown = { version = "0.12", default-features = false } From a2c6647d4790b64c538361d02bdbe34640ee75d2 Mon Sep 17 00:00:00 2001 From: comphead Date: Wed, 2 Nov 2022 18:24:13 -0700 Subject: [PATCH 0209/1411] `arrow::compute::kernels::temporal` should support nanoseconds (#2996) * temporal nano seconds support * renamed functions * returned some public methods for downstream projects * moved tests parts --- arrow/src/compute/kernels/temporal.rs | 150 ++++++++++++++++---------- 1 file changed, 92 insertions(+), 58 deletions(-) diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index 412adb9a9fe9..8e42b04b9fbc 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -621,50 +621,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - minute_generic::(array) -} - -/// Extracts the minutes of a given temporal array as an array of integers -pub fn minute_generic>( - array: A, -) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - match array.data_type().clone() { - DataType::Dictionary(_, value_type) => { - minute_internal::(array, value_type.as_ref()) - } - dt => minute_internal::(array, &dt), - } -} - -/// Extracts the minutes of a given temporal array as an array of integers -fn minute_internal>( - array: A, - dt: &DataType, -) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - let b = Int32Builder::with_capacity(array.len()); - match dt { - DataType::Date64 | DataType::Timestamp(_, None) => { - let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::(iter, b, |t| { - t.minute() as i32 - })) - } - DataType::Timestamp(_, Some(tz)) => { - let iter = ArrayIter::new(array); - extract_component_from_datetime_array::(iter, b, tz, |t| { - t.minute() as i32 - }) - } - _ => return_compute_error_with!("minute does not support", array.data_type()), - } + time_fraction_generic::(array, "minute", |t| t.minute() as i32) } /// Extracts the week of a given temporal primitive array as an array of integers @@ -717,31 
+674,46 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - second_generic::(array) + time_fraction_generic::(array, "second", |t| t.second() as i32) } -/// Extracts the seconds of a given temporal array as an array of integers -pub fn second_generic>( +/// Extracts the nanoseconds of a given temporal primitive array as an array of integers +pub fn nanosecond(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + time_fraction_generic::(array, "nanosecond", |t| t.nanosecond() as i32) +} + +/// Extracts the time fraction of a given temporal array as an array of integers +fn time_fraction_generic, F>( array: A, + name: &str, + op: F, ) -> Result where + F: Fn(NaiveDateTime) -> i32, T: ArrowTemporalType + ArrowNumericType, i64: From, { match array.data_type().clone() { DataType::Dictionary(_, value_type) => { - second_internal::(array, value_type.as_ref()) + time_fraction_internal::(array, value_type.as_ref(), name, op) } - dt => second_internal::(array, &dt), + dt => time_fraction_internal::(array, &dt, name, op), } } -/// Extracts the seconds of a given temporal array as an array of integers -fn second_internal>( +/// Extracts the time fraction of a given temporal array as an array of integers +fn time_fraction_internal, F>( array: A, dt: &DataType, + name: &str, + op: F, ) -> Result where + F: Fn(NaiveDateTime) -> i32, T: ArrowTemporalType + ArrowNumericType, i64: From, { @@ -749,20 +721,41 @@ where match dt { DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::(iter, b, |t| { - t.second() as i32 - })) + Ok(as_datetime_with_op::(iter, b, op)) } DataType::Timestamp(_, Some(tz)) => { let iter = ArrayIter::new(array); extract_component_from_datetime_array::(iter, b, tz, |t| { - t.second() as i32 + op(t.naive_local()) }) } - _ => return_compute_error_with!("second does not support", array.data_type()), + _ => return_compute_error_with!( + format!("{} does not support", name), + array.data_type() + ), } } +pub fn minute_generic>( + array: A, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + time_fraction_generic::(array, "minute", |t| t.minute() as i32) +} + +pub fn second_generic>( + array: A, +) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + time_fraction_generic::(array, "second", |t| t.second() as i32) +} + #[cfg(test)] mod tests { use super::*; @@ -1212,21 +1205,47 @@ mod tests { let expected = Int32Array::from(vec![11, 11, 21, 7, 21]); assert_eq!(expected, b); - let b = minute_generic::( + let b = time_fraction_generic::( + dict.downcast_dict::().unwrap(), + "minute", + |t| t.minute() as i32, + ) + .unwrap(); + + let b_old = minute_generic::( dict.downcast_dict::().unwrap(), ) .unwrap(); let expected = Int32Array::from(vec![1, 1, 2, 3, 2]); assert_eq!(expected, b); + assert_eq!(expected, b_old); - let b = second_generic::( + let b = time_fraction_generic::( + dict.downcast_dict::().unwrap(), + "second", + |t| t.second() as i32, + ) + .unwrap(); + + let b_old = second_generic::( dict.downcast_dict::().unwrap(), ) .unwrap(); let expected = Int32Array::from(vec![1, 1, 2, 3, 2]); assert_eq!(expected, b); + assert_eq!(expected, b_old); + + let b = time_fraction_generic::( + dict.downcast_dict::().unwrap(), + "nanosecond", + |t| t.nanosecond() as i32, + ) + .unwrap(); + + let expected = Int32Array::from(vec![0, 0, 0, 0, 0]); + assert_eq!(expected, b); } #[test] @@ -1313,4 +1332,19 @@ mod tests { let expected = 
Int32Array::from(vec![Some(1), Some(8), Some(8), Some(1), None]); assert_eq!(expected, b); } + + #[test] + fn test_temporal_array_date64_nanosecond() { + // new Date(1667328721453) + // Tue Nov 01 2022 11:52:01 GMT-0700 (Pacific Daylight Time) + // + // new Date(1667328721453).getMilliseconds() + // 453 + + let a: PrimitiveArray = vec![None, Some(1667328721453)].into(); + + let b = nanosecond(&a).unwrap(); + assert!(!b.is_valid(0)); + assert_eq!(453_000_000, b.value(1)); + } } From 24afac49063d4ee92c0a2be84fd7c305d6088e14 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 3 Nov 2022 01:28:48 -0700 Subject: [PATCH 0210/1411] Add macro downcast_temporal_array (#3007) * Add macro downcast_temporal_array * Fix TimeUnit import * Use downcast_temporal in downcast_primitive * Fix typo --- arrow-array/src/cast.rs | 137 ++++++++++++++++++++++++++++++++-------- 1 file changed, 110 insertions(+), 27 deletions(-) diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 4436dc77c2f0..7bc62713733e 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -98,48 +98,31 @@ macro_rules! downcast_integer { /// `m` with the corresponding [`ArrowPrimitiveType`], followed by any additional arguments /// /// ``` -/// # use arrow_array::{downcast_primitive, ArrowPrimitiveType}; +/// # use arrow_array::{downcast_temporal, ArrowPrimitiveType}; /// # use arrow_schema::DataType; /// -/// macro_rules! primitive_size_helper { +/// macro_rules! temporal_size_helper { /// ($t:ty, $o:ty) => { /// std::mem::size_of::<<$t as ArrowPrimitiveType>::Native>() as $o /// }; /// } /// -/// fn primitive_size(t: &DataType) -> u8 { -/// downcast_primitive! { -/// t => (primitive_size_helper, u8), +/// fn temporal_size(t: &DataType) -> u8 { +/// downcast_temporal! { +/// t => (temporal_size_helper, u8), /// _ => u8::MAX /// } /// } /// -/// assert_eq!(primitive_size(&DataType::Int32), 4); -/// assert_eq!(primitive_size(&DataType::Int64), 8); -/// assert_eq!(primitive_size(&DataType::Float16), 2); +/// assert_eq!(temporal_size(&DataType::Date32), 4); +/// assert_eq!(temporal_size(&DataType::Date64), 8); /// ``` /// /// [`DataType`]: arrow_schema::DataType #[macro_export] -macro_rules! downcast_primitive { +macro_rules! downcast_temporal { ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($($p:pat),+ => $fallback:expr $(,)*)*) => { - $crate::downcast_integer! { - $($data_type),+ => ($m $(, $args)*), - $crate::repeat_pat!(arrow_schema::DataType::Float16, $($data_type),+) => { - $m!($crate::types::Float16Type $(, $args)*) - } - $crate::repeat_pat!(arrow_schema::DataType::Float32, $($data_type),+) => { - $m!($crate::types::Float32Type $(, $args)*) - } - $crate::repeat_pat!(arrow_schema::DataType::Float64, $($data_type),+) => { - $m!($crate::types::Float64Type $(, $args)*) - } - $crate::repeat_pat!(arrow_schema::DataType::Date32, $($data_type),+) => { - $m!($crate::types::Date32Type $(, $args)*) - } - $crate::repeat_pat!(arrow_schema::DataType::Date64, $($data_type),+) => { - $m!($crate::types::Date64Type $(, $args)*) - } + match ($($data_type),+) { $crate::repeat_pat!(arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second), $($data_type),+) => { $m!($crate::types::Time32SecondType $(, $args)*) } @@ -152,6 +135,12 @@ macro_rules! 
downcast_primitive { $crate::repeat_pat!(arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond), $($data_type),+) => { $m!($crate::types::Time64NanosecondType $(, $args)*) } + $crate::repeat_pat!(arrow_schema::DataType::Date32, $($data_type),+) => { + $m!($crate::types::Date32Type $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Date64, $($data_type),+) => { + $m!($crate::types::Date64Type $(, $args)*) + } $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _), $($data_type),+) => { $m!($crate::types::TimestampSecondType $(, $args)*) } @@ -164,6 +153,95 @@ macro_rules! downcast_primitive { $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _), $($data_type),+) => { $m!($crate::types::TimestampNanosecondType $(, $args)*) } + $(($($p),+) => $fallback,)* + } + }; +} + +/// Downcast an [`Array`] to a temporal [`PrimitiveArray`] based on its [`DataType`] +/// accepts a number of subsequent patterns to match the data type +/// +/// ``` +/// # use arrow_array::{Array, downcast_temporal_array, cast::as_string_array}; +/// # use arrow_schema::DataType; +/// +/// fn print_temporal(array: &dyn Array) { +/// downcast_temporal_array!( +/// array => { +/// for v in array { +/// println!("{:?}", v); +/// } +/// } +/// DataType::Utf8 => { +/// for v in as_string_array(array) { +/// println!("{:?}", v); +/// } +/// } +/// t => println!("Unsupported datatype {}", t) +/// ) +/// } +/// ``` +/// +/// [`DataType`]: arrow_schema::DataType +#[macro_export] +macro_rules! downcast_temporal_array { + ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { + $crate::downcast_temporal_array!($values => {$e} $($p => $fallback)*) + }; + (($($values:ident),+) => $e:block $($($p:pat),+ => $fallback:expr $(,)*)*) => { + $crate::downcast_temporal_array!($($values),+ => $e $($($p),+ => $fallback)*) + }; + (($($values:ident),+) => $e:block $(($($p:pat),+) => $fallback:expr $(,)*)*) => { + $crate::downcast_temporal_array!($($values),+ => $e $($($p),+ => $fallback)*) + }; + ($($values:ident),+ => $e:block $($($p:pat),+ => $fallback:expr $(,)*)*) => { + $crate::downcast_temporal!{ + $($values.data_type()),+ => ($crate::downcast_primitive_array_helper, $($values),+, $e), + $($($p),+ => $fallback,)* + } + }; +} + +/// Given one or more expressions evaluating to primitive [`DataType`] invokes the provided macro +/// `m` with the corresponding [`ArrowPrimitiveType`], followed by any additional arguments +/// +/// ``` +/// # use arrow_array::{downcast_primitive, ArrowPrimitiveType}; +/// # use arrow_schema::DataType; +/// +/// macro_rules! primitive_size_helper { +/// ($t:ty, $o:ty) => { +/// std::mem::size_of::<<$t as ArrowPrimitiveType>::Native>() as $o +/// }; +/// } +/// +/// fn primitive_size(t: &DataType) -> u8 { +/// downcast_primitive! { +/// t => (primitive_size_helper, u8), +/// _ => u8::MAX +/// } +/// } +/// +/// assert_eq!(primitive_size(&DataType::Int32), 4); +/// assert_eq!(primitive_size(&DataType::Int64), 8); +/// assert_eq!(primitive_size(&DataType::Float16), 2); +/// ``` +/// +/// [`DataType`]: arrow_schema::DataType +#[macro_export] +macro_rules! downcast_primitive { + ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($($p:pat),+ => $fallback:expr $(,)*)*) => { + $crate::downcast_integer! 
{ + $($data_type),+ => ($m $(, $args)*), + $crate::repeat_pat!(arrow_schema::DataType::Float16, $($data_type),+) => { + $m!($crate::types::Float16Type $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Float32, $($data_type),+) => { + $m!($crate::types::Float32Type $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Float64, $($data_type),+) => { + $m!($crate::types::Float64Type $(, $args)*) + } $crate::repeat_pat!(arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth), $($data_type),+) => { $m!($crate::types::IntervalYearMonthType $(, $args)*) } @@ -185,7 +263,12 @@ macro_rules! downcast_primitive { $crate::repeat_pat!(arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond), $($data_type),+) => { $m!($crate::types::DurationNanosecondType $(, $args)*) } - $($($p),+ => $fallback,)* + _ => { + $crate::downcast_temporal! { + $($data_type),+ => ($m $(, $args)*), + $($($p),+ => $fallback,)* + } + } } }; } From 61cf6f75c4e03ca950f750cb2fdba4adee534372 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Fri, 4 Nov 2022 06:19:42 +0800 Subject: [PATCH 0211/1411] Round instead of Truncate while casting float to decimal (#3000) * add .round() before casting to integer * add more test cases * update test cases * add doc * Format Co-authored-by: Raphael Taylor-Davies --- arrow/src/compute/kernels/cast.rs | 103 +++++++++++++++++++++++------- 1 file changed, 79 insertions(+), 24 deletions(-) diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 4c724b6401b9..4ad8dd99e73e 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -297,6 +297,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { /// * Time32 and Time64: precision lost when going to higher interval /// * Timestamp and Date{32|64}: precision lost when going to higher interval /// * Temporal to/from backing primitive: zero-copy with data type change +/// * Casting from `float32/float64` to `Decimal(precision, scale)` rounds to the `scale` decimals +/// (i.e. casting 6.4999 to Decimal(10, 1) becomes 6.5). This is the breaking change from `26.0.0`. +/// It used to truncate it instead of round (i.e. 
outputs 6.4 instead) /// /// Unsupported Casts /// * To or from `StructArray` @@ -353,7 +356,7 @@ where { let mul = 10_f64.powi(scale as i32); - unary::(array, |v| (v.as_() * mul) as i128) + unary::(array, |v| (v.as_() * mul).round() as i128) .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } @@ -368,9 +371,11 @@ where { let mul = 10_f64.powi(scale as i32); - unary::(array, |v| i256::from_i128((v.as_() * mul) as i128)) - .with_precision_and_scale(precision, scale) - .map(|a| Arc::new(a) as ArrayRef) + unary::(array, |v| { + i256::from_i128((v.as_() * mul).round() as i128) + }) + .with_precision_and_scale(precision, scale) + .map(|a| Arc::new(a) as ArrayRef) } /// Cast the primitive array using [`PrimitiveArray::reinterpret_cast`] @@ -3192,8 +3197,8 @@ mod tests { Some(2.2), Some(4.4), None, - Some(1.123_456_7), - Some(1.123_456_7), + Some(1.123_456_4), // round down + Some(1.123_456_7), // round up ]); let array = Arc::new(array) as ArrayRef; generate_cast_test_case!( @@ -3205,8 +3210,8 @@ mod tests { Some(2200000_i128), Some(4400000_i128), None, - Some(1123456_i128), - Some(1123456_i128), + Some(1123456_i128), // round down + Some(1123457_i128), // round up ] ); @@ -3216,9 +3221,10 @@ mod tests { Some(2.2), Some(4.4), None, - Some(1.123_456_789_123_4), - Some(1.123_456_789_012_345_6), - Some(1.123_456_789_012_345_6), + Some(1.123_456_489_123_4), // round up + Some(1.123_456_789_123_4), // round up + Some(1.123_456_489_012_345_6), // round down + Some(1.123_456_789_012_345_6), // round up ]); let array = Arc::new(array) as ArrayRef; generate_cast_test_case!( @@ -3230,9 +3236,10 @@ mod tests { Some(2200000_i128), Some(4400000_i128), None, - Some(1123456_i128), - Some(1123456_i128), - Some(1123456_i128), + Some(1123456_i128), // round down + Some(1123457_i128), // round up + Some(1123456_i128), // round down + Some(1123457_i128), // round up ] ); } @@ -3307,8 +3314,8 @@ mod tests { Some(2.2), Some(4.4), None, - Some(1.123_456_7), - Some(1.123_456_7), + Some(1.123_456_4), // round down + Some(1.123_456_7), // round up ]); let array = Arc::new(array) as ArrayRef; generate_cast_test_case!( @@ -3320,8 +3327,8 @@ mod tests { Some(i256::from_i128(2200000_i128)), Some(i256::from_i128(4400000_i128)), None, - Some(i256::from_i128(1123456_i128)), - Some(i256::from_i128(1123456_i128)), + Some(i256::from_i128(1123456_i128)), // round down + Some(i256::from_i128(1123457_i128)), // round up ] ); @@ -3331,9 +3338,10 @@ mod tests { Some(2.2), Some(4.4), None, - Some(1.123_456_789_123_4), - Some(1.123_456_789_012_345_6), - Some(1.123_456_789_012_345_6), + Some(1.123_456_489_123_4), // round down + Some(1.123_456_789_123_4), // round up + Some(1.123_456_489_012_345_6), // round down + Some(1.123_456_789_012_345_6), // round up ]); let array = Arc::new(array) as ArrayRef; generate_cast_test_case!( @@ -3345,9 +3353,10 @@ mod tests { Some(i256::from_i128(2200000_i128)), Some(i256::from_i128(4400000_i128)), None, - Some(i256::from_i128(1123456_i128)), - Some(i256::from_i128(1123456_i128)), - Some(i256::from_i128(1123456_i128)), + Some(i256::from_i128(1123456_i128)), // round down + Some(i256::from_i128(1123457_i128)), // round up + Some(i256::from_i128(1123456_i128)), // round down + Some(i256::from_i128(1123457_i128)), // round up ] ); } @@ -5994,4 +6003,50 @@ mod tests { .collect::>(); assert_eq!(&out, &vec!["[0, 1, 2]", "[3, 4, 5]", "[6, 7]"]); } + + #[test] + #[cfg(not(feature = "force_validate"))] + fn test_cast_f64_to_decimal128() { + // to reproduce 
https://github.com/apache/arrow-rs/issues/2997 + + let decimal_type = DataType::Decimal128(18, 2); + let array = Float64Array::from(vec![ + Some(0.0699999999), + Some(0.0659999999), + Some(0.0650000000), + Some(0.0649999999), + ]); + let array = Arc::new(array) as ArrayRef; + generate_cast_test_case!( + &array, + Decimal128Array, + &decimal_type, + vec![ + Some(7_i128), // round up + Some(7_i128), // round up + Some(7_i128), // round up + Some(6_i128), // round down + ] + ); + + let decimal_type = DataType::Decimal128(18, 3); + let array = Float64Array::from(vec![ + Some(0.0699999999), + Some(0.0659999999), + Some(0.0650000000), + Some(0.0649999999), + ]); + let array = Arc::new(array) as ArrayRef; + generate_cast_test_case!( + &array, + Decimal128Array, + &decimal_type, + vec![ + Some(70_i128), // round up + Some(66_i128), // round up + Some(65_i128), // round down + Some(65_i128), // round up + ] + ); + } } From 01ea8c70c6dd57670e0533edd784ce903b594316 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 3 Nov 2022 15:37:36 -0700 Subject: [PATCH 0212/1411] Fix some clippy errors after updating rust toolchain (#3010) * Fix clippy * More * More --- arrow-array/src/array/binary_array.rs | 2 +- arrow-schema/src/field.rs | 6 +-- arrow/benches/lexsort.rs | 4 +- arrow/src/compute/kernels/partition.rs | 28 ++++--------- arrow/src/csv/reader.rs | 2 +- arrow/src/ffi.rs | 6 +-- arrow/src/ipc/convert.rs | 5 +-- arrow/src/json/reader.rs | 6 +-- arrow/src/tensor.rs | 4 +- arrow/src/util/pretty.rs | 6 +-- arrow/tests/array_equal.rs | 56 +++++++++++++------------- parquet/src/bin/parquet-read.rs | 2 +- parquet/src/bin/parquet-schema.rs | 2 +- parquet/src/record/api.rs | 2 +- parquet/src/schema/types.rs | 2 +- parquet/tests/boolean_writer.rs | 6 +-- parquet_derive/src/parquet_field.rs | 7 ++-- 17 files changed, 64 insertions(+), 82 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 259d949d42a5..2ca8a061a6fa 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -655,7 +655,7 @@ mod tests { #[test] #[should_panic(expected = "LargeBinaryArray expects DataType::LargeBinary")] fn test_binary_array_validation() { - let array = BinaryArray::from_iter_values(&[&[1, 2]]); + let array = BinaryArray::from_iter_values([&[1, 2]]); let _ = LargeBinaryArray::from(array.into_data()); } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index adafbfa9b72c..e414d2834275 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -200,12 +200,12 @@ impl Field { /// within `self` contained within this field (including `self`) pub(crate) fn fields(&self) -> Vec<&Field> { let mut collected_fields = vec![self]; - collected_fields.append(&mut self._fields(&self.data_type)); + collected_fields.append(&mut Field::_fields(&self.data_type)); collected_fields } - fn _fields<'a>(&'a self, dt: &'a DataType) -> Vec<&Field> { + fn _fields(dt: &DataType) -> Vec<&Field> { match dt { DataType::Struct(fields) | DataType::Union(fields, _, _) => { fields.iter().flat_map(|f| f.fields()).collect() @@ -214,7 +214,7 @@ impl Field { | DataType::LargeList(field) | DataType::FixedSizeList(field, _) | DataType::Map(field, _) => field.fields(), - DataType::Dictionary(_, value_field) => self._fields(value_field.as_ref()), + DataType::Dictionary(_, value_field) => Field::_fields(value_field.as_ref()), _ => vec![], } } diff --git a/arrow/benches/lexsort.rs b/arrow/benches/lexsort.rs index 3820007231ab..aebb588cf9cc 
100644 --- a/arrow/benches/lexsort.rs +++ b/arrow/benches/lexsort.rs @@ -162,8 +162,8 @@ fn add_benchmark(c: &mut Criterion) { ]; for case in cases { - do_bench(c, *case, 4096); - do_bench(c, *case, 4096 * 8); + do_bench(c, case, 4096); + do_bench(c, case, 4096 * 8); } } diff --git a/arrow/src/compute/kernels/partition.rs b/arrow/src/compute/kernels/partition.rs index e3a1497b8d27..0e48e627e655 100644 --- a/arrow/src/compute/kernels/partition.rs +++ b/arrow/src/compute/kernels/partition.rs @@ -174,33 +174,24 @@ mod tests { let median = input[input.len() / 2]; assert_eq!( 9, - partition_point( - 0, - input.len(), - &(|i: usize| input[i].cmp(&median) != Ordering::Greater) - ) + partition_point(0, input.len(), |i: usize| input[i].cmp(&median) + != Ordering::Greater) ); } { let search = input[9]; assert_eq!( 12, - partition_point( - 9, - input.len(), - &(|i: usize| input[i].cmp(&search) != Ordering::Greater) - ) + partition_point(9, input.len(), |i: usize| input[i].cmp(&search) + != Ordering::Greater) ); } { let search = input[0]; assert_eq!( 3, - partition_point( - 0, - 9, - &(|i: usize| input[i].cmp(&search) != Ordering::Greater) - ) + partition_point(0, 9, |i: usize| input[i].cmp(&search) + != Ordering::Greater) ); } let input = &[1, 2, 2, 2, 2, 2, 2, 2, 9]; @@ -208,11 +199,8 @@ mod tests { let search = input[5]; assert_eq!( 8, - partition_point( - 5, - 9, - &(|i: usize| input[i].cmp(&search) != Ordering::Greater) - ) + partition_point(5, 9, |i: usize| input[i].cmp(&search) + != Ordering::Greater) ); } } diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index 123c5e1c6716..ff6df5514983 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -1690,7 +1690,7 @@ mod tests { let actual = result.unwrap_err().to_string(); assert!( - actual.contains(&expected), + actual.contains(expected), "actual: '{}', expected: '{}'", actual, expected diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 77d277afa300..95e6dce3c5fd 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -474,11 +474,7 @@ impl FFI_ArrowArray { // If the layout has a null buffer by Arrow spec. // Note that even the array doesn't have a null buffer because it has // no null value, we still need to count 1 here to follow the spec. - if data_layout.can_contain_null_mask { - 1 - } else { - 0 - } + usize::from(data_layout.can_contain_null_mask) } } as i64; diff --git a/arrow/src/ipc/convert.rs b/arrow/src/ipc/convert.rs index 9f6cda37c650..0f5d246bcce5 100644 --- a/arrow/src/ipc/convert.rs +++ b/arrow/src/ipc/convert.rs @@ -437,7 +437,7 @@ pub(crate) fn build_field<'a>( }; let fb_field_name = fbb.create_string(field.name().as_str()); - let field_type = get_fb_field_type(field.data_type(), field.is_nullable(), fbb); + let field_type = get_fb_field_type(field.data_type(), fbb); let fb_dictionary = if let Dictionary(index_type, _) = field.data_type() { Some(get_fb_dictionary( @@ -477,7 +477,6 @@ pub(crate) fn build_field<'a>( /// Get the IPC type of a data type pub(crate) fn get_fb_field_type<'a>( data_type: &DataType, - is_nullable: bool, fbb: &mut FlatBufferBuilder<'a>, ) -> FBFieldType<'a> { // some IPC implementations expect an empty list for child data, instead of a null value. @@ -717,7 +716,7 @@ pub(crate) fn get_fb_field_type<'a>( // In this library, the dictionary "type" is a logical construct. 
Here we // pass through to the value type, as we've already captured the index // type in the DictionaryEncoding metadata in the parent field - get_fb_field_type(value_type, is_nullable, fbb) + get_fb_field_type(value_type, fbb) } Decimal128(precision, scale) => { let mut builder = ipc::DecimalBuilder::new(fbb); diff --git a/arrow/src/json/reader.rs b/arrow/src/json/reader.rs index d15894024809..a7382128e1c8 100644 --- a/arrow/src/json/reader.rs +++ b/arrow/src/json/reader.rs @@ -962,7 +962,7 @@ impl Decoder { fn build_boolean_array(&self, rows: &[Value], col_name: &str) -> Result { let mut builder = BooleanBuilder::with_capacity(rows.len()); for row in rows { - if let Some(value) = row.get(&col_name) { + if let Some(value) = row.get(col_name) { if let Some(boolean) = value.as_bool() { builder.append_value(boolean); } else { @@ -993,7 +993,7 @@ impl Decoder { Ok(Arc::new( rows.iter() .map(|row| { - row.get(&col_name).and_then(|value| { + row.get(col_name).and_then(|value| { if value.is_i64() { value.as_i64().and_then(num::cast::cast) } else if value.is_u64() { @@ -1496,7 +1496,7 @@ impl Decoder { let mut builder: StringDictionaryBuilder = self.build_string_dictionary_builder(rows.len()); for row in rows { - if let Some(value) = row.get(&col_name) { + if let Some(value) = row.get(col_name) { if let Some(str_v) = value.as_str() { builder.append(str_v).map(drop)? } else { diff --git a/arrow/src/tensor.rs b/arrow/src/tensor.rs index b8d07f83fb90..a46a1d08df85 100644 --- a/arrow/src/tensor.rs +++ b/arrow/src/tensor.rs @@ -113,13 +113,13 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { )); } - if strides != None { + if strides.is_some() { return Err(ArrowError::InvalidArgumentError( "expected None strides for tensor with no shape".to_string(), )); } - if names != None { + if names.is_some() { return Err(ArrowError::InvalidArgumentError( "expected None names for tensor with no shape".to_string(), )); diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs index 8d811223cbb5..63d5977e21c5 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow/src/util/pretty.rs @@ -64,7 +64,7 @@ fn create_table(results: &[RecordBatch]) -> Result { let mut header = Vec::new(); for field in schema.fields() { - header.push(Cell::new(&field.name())); + header.push(Cell::new(field.name())); } table.set_header(header); @@ -317,9 +317,9 @@ mod tests { let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 3); - builder.append_value(&[1, 2, 3]).unwrap(); + builder.append_value([1, 2, 3]).unwrap(); builder.append_null(); - builder.append_value(&[7, 8, 9]).unwrap(); + builder.append_value([7, 8, 9]).unwrap(); let array = Arc::new(builder.finish()); diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index a5f3f42a1dfd..d24a24e2ea48 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -386,11 +386,11 @@ fn create_list_array, T: AsRef<[Option]>>(data: T) -> ArrayDa #[test] fn test_list_equal() { - let a = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); - let b = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); + let a = create_list_array([Some(&[1, 2, 3]), Some(&[4, 5, 6])]); + let b = create_list_array([Some(&[1, 2, 3]), Some(&[4, 5, 6])]); test_equal(&a, &b, true); - let b = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 7])]); + let b = create_list_array([Some(&[1, 2, 3]), Some(&[4, 5, 7])]); test_equal(&a, &b, false); } @@ -448,11 +448,11 @@ fn test_empty_offsets_list_equal() { // Test the case where null_count > 0 #[test] fn test_list_null() { - let 
a = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); - let b = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); + let a = create_list_array([Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); + let b = create_list_array([Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); test_equal(&a, &b, true); - let b = create_list_array(&[ + let b = create_list_array([ Some(&[1, 2]), None, Some(&[5, 6]), @@ -462,7 +462,7 @@ fn test_list_null() { ]); test_equal(&a, &b, false); - let b = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); + let b = create_list_array([Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); test_equal(&a, &b, false); // a list where the nullness of values is determined by the list's bitmap @@ -506,8 +506,8 @@ fn test_list_null() { // Test the case where offset != 0 #[test] fn test_list_offsets() { - let a = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); - let b = create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); + let a = create_list_array([Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); + let b = create_list_array([Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); let a_slice = a.slice(0, 3); let b_slice = b.slice(0, 3); @@ -539,32 +539,32 @@ fn create_fixed_size_binary_array, T: AsRef<[Option]>>( #[test] fn test_fixed_size_binary_equal() { - let a = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world")]); - let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world")]); + let a = create_fixed_size_binary_array([Some(b"hello"), Some(b"world")]); + let b = create_fixed_size_binary_array([Some(b"hello"), Some(b"world")]); test_equal(&a, &b, true); - let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"arrow")]); + let b = create_fixed_size_binary_array([Some(b"hello"), Some(b"arrow")]); test_equal(&a, &b, false); } // Test the case where null_count > 0 #[test] fn test_fixed_size_binary_null() { - let a = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"world")]); - let b = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"world")]); + let a = create_fixed_size_binary_array([Some(b"hello"), None, Some(b"world")]); + let b = create_fixed_size_binary_array([Some(b"hello"), None, Some(b"world")]); test_equal(&a, &b, true); - let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world"), None]); + let b = create_fixed_size_binary_array([Some(b"hello"), Some(b"world"), None]); test_equal(&a, &b, false); - let b = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"arrow")]); + let b = create_fixed_size_binary_array([Some(b"hello"), None, Some(b"arrow")]); test_equal(&a, &b, false); } #[test] fn test_fixed_size_binary_offsets() { // Test the case where offset != 0 - let a = create_fixed_size_binary_array(&[ + let a = create_fixed_size_binary_array([ Some(b"hello"), None, None, @@ -572,7 +572,7 @@ fn test_fixed_size_binary_offsets() { None, None, ]); - let b = create_fixed_size_binary_array(&[ + let b = create_fixed_size_binary_array([ Some(b"hello"), None, None, @@ -706,18 +706,18 @@ fn create_fixed_size_list_array, T: AsRef<[Option]>>( #[test] fn test_fixed_size_list_equal() { - let a = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); - let b = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); + let a = create_fixed_size_list_array([Some(&[1, 2, 3]), Some(&[4, 5, 6])]); + let b = 
create_fixed_size_list_array([Some(&[1, 2, 3]), Some(&[4, 5, 6])]); test_equal(&a, &b, true); - let b = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 7])]); + let b = create_fixed_size_list_array([Some(&[1, 2, 3]), Some(&[4, 5, 7])]); test_equal(&a, &b, false); } // Test the case where null_count > 0 #[test] fn test_fixed_list_null() { - let a = create_fixed_size_list_array(&[ + let a = create_fixed_size_list_array([ Some(&[1, 2, 3]), None, None, @@ -725,7 +725,7 @@ fn test_fixed_list_null() { None, None, ]); - let b = create_fixed_size_list_array(&[ + let b = create_fixed_size_list_array([ Some(&[1, 2, 3]), None, None, @@ -735,7 +735,7 @@ fn test_fixed_list_null() { ]); test_equal(&a, &b, true); - let b = create_fixed_size_list_array(&[ + let b = create_fixed_size_list_array([ Some(&[1, 2, 3]), None, Some(&[7, 8, 9]), @@ -745,7 +745,7 @@ fn test_fixed_list_null() { ]); test_equal(&a, &b, false); - let b = create_fixed_size_list_array(&[ + let b = create_fixed_size_list_array([ Some(&[1, 2, 3]), None, None, @@ -755,7 +755,7 @@ fn test_fixed_list_null() { ]); test_equal(&a, &b, false); - let b = create_fixed_size_list_array(&[None, Some(&[4, 5, 6]), None, None]); + let b = create_fixed_size_list_array([None, Some(&[4, 5, 6]), None, None]); test_equal(&a.slice(2, 4), &b, true); test_equal(&a.slice(3, 3), &b.slice(1, 3), true); @@ -764,7 +764,7 @@ fn test_fixed_list_null() { #[test] fn test_fixed_list_offsets() { // Test the case where offset != 0 - let a = create_fixed_size_list_array(&[ + let a = create_fixed_size_list_array([ Some(&[1, 2, 3]), None, None, @@ -772,7 +772,7 @@ fn test_fixed_list_offsets() { None, None, ]); - let b = create_fixed_size_list_array(&[ + let b = create_fixed_size_list_array([ Some(&[1, 2, 3]), None, None, diff --git a/parquet/src/bin/parquet-read.rs b/parquet/src/bin/parquet-read.rs index 733e56173aa2..cf8009956e2e 100644 --- a/parquet/src/bin/parquet-read.rs +++ b/parquet/src/bin/parquet-read.rs @@ -78,7 +78,7 @@ fn main() { ) } else { let path = Path::new(&filename); - let file = File::open(&path).expect("Unable to open file"); + let file = File::open(path).expect("Unable to open file"); Box::new(SerializedFileReader::new(file).expect("Failed to create reader")) }; diff --git a/parquet/src/bin/parquet-schema.rs b/parquet/src/bin/parquet-schema.rs index 68c52def7c44..cd8e7692203d 100644 --- a/parquet/src/bin/parquet-schema.rs +++ b/parquet/src/bin/parquet-schema.rs @@ -57,7 +57,7 @@ fn main() { let args = Args::parse(); let filename = args.file_path; let path = Path::new(&filename); - let file = File::open(&path).expect("Unable to open file"); + let file = File::open(path).expect("Unable to open file"); let verbose = args.verbose; match SerializedFileReader::new(file) { diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index 22b8a79780ab..d7e1e7550f00 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -835,7 +835,7 @@ fn convert_decimal_to_string(decimal: &Decimal) -> String { let num = BigInt::from_signed_bytes_be(decimal.data()); // Offset of the first digit in a string. 
- let negative = if num.sign() == Sign::Minus { 1 } else { 0 }; + let negative = i32::from(num.sign() == Sign::Minus); let mut num_str = num.to_string(); let mut point = num_str.len() as i32 - decimal.scale() - negative; diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index da6419424490..9f8023c91262 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -2075,7 +2075,7 @@ mod tests { let mut thrift_schema = to_thrift(&expected_schema).unwrap(); // Change all of None to Some(0) for mut elem in &mut thrift_schema[..] { - if elem.num_children == None { + if elem.num_children.is_none() { elem.num_children = Some(0); } } diff --git a/parquet/tests/boolean_writer.rs b/parquet/tests/boolean_writer.rs index dc2eccfbf3c3..8c3d50d8fde8 100644 --- a/parquet/tests/boolean_writer.rs +++ b/parquet/tests/boolean_writer.rs @@ -38,7 +38,7 @@ fn it_writes_data_without_hanging() { "; let schema = Arc::new(parse_message_type(message_type).expect("parse schema")); let props = Arc::new(WriterProperties::builder().build()); - let file = fs::File::create(&path).expect("create file"); + let file = fs::File::create(path).expect("create file"); let mut writer = SerializedFileWriter::new(file, schema, props).expect("create parquet writer"); for _group in 0..1 { @@ -64,14 +64,14 @@ fn it_writes_data_without_hanging() { } writer.close().expect("close writer"); - let bytes = fs::read(&path).expect("read file"); + let bytes = fs::read(path).expect("read file"); assert_eq!(&bytes[0..4], &[b'P', b'A', b'R', b'1']); // Now that we have written our data and are happy with it, make // sure we can read it back in < 5 seconds... let (sender, receiver) = mpsc::channel(); let _t = thread::spawn(move || { - let file = fs::File::open(&Path::new("it_writes_data_without_hanging.parquet")) + let file = fs::File::open(Path::new("it_writes_data_without_hanging.parquet")) .expect("open file"); let reader = SerializedFileReader::new(file).expect("get serialized reader"); let iter = reader.get_row_iter(None).expect("get iterator"); diff --git a/parquet_derive/src/parquet_field.rs b/parquet_derive/src/parquet_field.rs index 82e3b5112fe0..06bcc0aca924 100644 --- a/parquet_derive/src/parquet_field.rs +++ b/parquet_derive/src/parquet_field.rs @@ -362,21 +362,20 @@ impl Type { /// Useful in determining the physical type of a field and the /// definition levels. fn leaf_type_recursive(&self) -> &Type { - self.leaf_type_recursive_helper(self, None) + Type::leaf_type_recursive_helper(self, None) } fn leaf_type_recursive_helper<'a>( - &'a self, ty: &'a Type, parent_ty: Option<&'a Type>, - ) -> &Type { + ) -> &'a Type { match ty { Type::TypePath(_) => parent_ty.unwrap_or(ty), Type::Option(ref first_type) | Type::Vec(ref first_type) | Type::Array(ref first_type) | Type::Reference(_, ref first_type) => { - self.leaf_type_recursive_helper(first_type, Some(ty)) + Type::leaf_type_recursive_helper(first_type, Some(ty)) } } } From 8ca4e65a7311e694e5d314aead7c970e75fbf270 Mon Sep 17 00:00:00 2001 From: Pier-Olivier Thibault <23230+pier-oliviert@users.noreply.github.com> Date: Thu, 3 Nov 2022 22:08:42 -0400 Subject: [PATCH 0213/1411] Parquet Writer: Make column descriptor getter on GenericColumnWriter (#3002) This is so that it's possible to gather information from the column we're about to write to. That information was already present in the column but not publicly available. The getter returns a reference of `ColumnDescPtr`. 
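For context, a rough sketch of how a caller could reach the new getter through a typed column writer; the schema, file name and values below are hypothetical and not part of this change:

    use std::{fs::File, sync::Arc};
    use parquet::data_type::Int32Type;
    use parquet::file::properties::WriterProperties;
    use parquet::file::writer::SerializedFileWriter;
    use parquet::schema::parser::parse_message_type;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let schema = Arc::new(parse_message_type("message schema { REQUIRED INT32 id; }")?);
        let props = Arc::new(WriterProperties::builder().build());
        let mut writer =
            SerializedFileWriter::new(File::create("example.parquet")?, schema, props)?;

        let mut row_group = writer.next_row_group()?;
        while let Some(mut col) = row_group.next_column()? {
            let typed = col.typed::<Int32Type>();
            // The new getter exposes the descriptor of the column being written.
            println!("writing column {:?}", typed.get_descriptor().path());
            typed.write_batch(&[1, 2, 3], None, None)?;
            col.close()?;
        }
        row_group.close()?;
        writer.close()?;
        Ok(())
    }
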
Co-authored-by: Raphael Taylor-Davies --- parquet/src/column/writer/mod.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index f9b429f5bc72..7415d9aad0a7 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -427,6 +427,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { self.column_metrics.total_rows_written } + /// Returns a reference to a [`ColumnDescPtr`] + pub fn get_descriptor(&self) -> &ColumnDescPtr { + &self.descr + } + /// Finalises writes and closes the column writer. /// Returns total bytes written, total rows written and column chunk metadata. pub fn close(mut self) -> Result { From 97c881dc42fdbc59a057769d4a2ba28e332de4b0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 3 Nov 2022 22:31:46 -0400 Subject: [PATCH 0214/1411] Add arrow-select to list of crates to publish (#3012) --- dev/release/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/release/README.md b/dev/release/README.md index b3fa546b5f64..a12e07f8ed34 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -256,6 +256,7 @@ Rust Arrow Crates: (cd arrow-schema && cargo publish) (cd arrow-data && cargo publish) (cd arrow-array && cargo publish) +(cd arrow-select && cargo publish) (cd arrow && cargo publish) (cd arrow-flight && cargo publish) (cd parquet && cargo publish) From b4365ebb99b443a676241c69852a61fa7ecabcbc Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:46:27 +1300 Subject: [PATCH 0215/1411] Fix more clippy lints (#3015) --- object_store/src/local.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/object_store/src/local.rs b/object_store/src/local.rs index fd3c3592ab56..f7b7ad7dd625 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -803,16 +803,16 @@ fn open_file(path: &PathBuf) -> Result { } fn open_writable_file(path: &PathBuf) -> Result { - match File::create(&path) { + match File::create(path) { Ok(f) => Ok(f), Err(err) if err.kind() == std::io::ErrorKind::NotFound => { let parent = path .parent() .context(UnableToCreateFileSnafu { path: &path, err })?; - std::fs::create_dir_all(&parent) + std::fs::create_dir_all(parent) .context(UnableToCreateDirSnafu { path: parent })?; - match File::create(&path) { + match File::create(path) { Ok(f) => Ok(f), Err(err) => Err(Error::UnableToCreateFile { path: path.to_path_buf(), From 8400b09d2d3f5243d99eba19f85447922dd29575 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 4 Nov 2022 01:06:32 -0700 Subject: [PATCH 0216/1411] Replace hour_generic with hour_dyn (#3006) * Replace hour_generic with hour_dyn * Add Time32MillisecondType --- arrow/src/compute/kernels/temporal.rs | 88 +++++++++++++++++++++------ 1 file changed, 71 insertions(+), 17 deletions(-) diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index 8e42b04b9fbc..75196f37c075 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -17,7 +17,9 @@ //! Defines temporal kernels for time and date related functions. 
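Side note on the additions below: hour_dyn is the type-erased replacement for hour_generic, taking &dyn Array (including dictionary arrays) and returning an ArrayRef of Int32 hours. A minimal sketch of calling it, with an illustrative timestamp value that is not part of the patch:

    use arrow::array::{Array, Int32Array, TimestampMicrosecondArray};
    use arrow::compute::kernels::temporal::hour_dyn;

    fn main() {
        // 2022-11-01T18:52:01.453Z expressed as microseconds since the epoch
        let ts = TimestampMicrosecondArray::from(vec![1_667_328_721_453_000]);
        let hours = hour_dyn(&ts).unwrap();
        let hours = hours.as_any().downcast_ref::<Int32Array>().unwrap();
        assert_eq!(hours.value(0), 18);
    }
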
+use arrow_array::downcast_dictionary_array; use chrono::{DateTime, Datelike, NaiveDateTime, NaiveTime, Offset, Timelike}; +use std::sync::Arc; use crate::array::*; use crate::datatypes::*; @@ -180,21 +182,74 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - hour_generic::(array) + hour_internal::(array, array.data_type()) } -/// Extracts the hours of a given temporal array as an array of integers within -/// the range of [0, 23]. -pub fn hour_generic>(array: A) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ +/// Extracts the hours of a given array as an array of integers within +/// the range of [0, 23]. If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. +pub fn hour_dyn(array: &dyn Array) -> Result { match array.data_type().clone() { - DataType::Dictionary(_, value_type) => { - hour_internal::(array, value_type.as_ref()) + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + array => { + let hour_values = hour_dyn(array.values())?; + Ok(Arc::new(array.with_values(&hour_values))) + } + dt => return_compute_error_with!("hour does not support", dt), + ) + } + DataType::Time32(TimeUnit::Second) => { + let array = as_primitive_array::(array); + hour_internal::(array, array.data_type()) + .map(|a| Arc::new(a) as ArrayRef) + } + DataType::Time32(TimeUnit::Microsecond) => { + let array = as_primitive_array::(array); + hour_internal::(array, array.data_type()) + .map(|a| Arc::new(a) as ArrayRef) + } + DataType::Time64(TimeUnit::Microsecond) => { + let array = as_primitive_array::(array); + hour_internal::(array, array.data_type()) + .map(|a| Arc::new(a) as ArrayRef) + } + DataType::Time64(TimeUnit::Nanosecond) => { + let array = as_primitive_array::(array); + hour_internal::(array, array.data_type()) + .map(|a| Arc::new(a) as ArrayRef) } - dt => hour_internal::(array, &dt), + DataType::Date32 => { + let array = as_primitive_array::(array); + hour_internal::(array, array.data_type()) + .map(|a| Arc::new(a) as ArrayRef) + } + DataType::Date64 => { + let array = as_primitive_array::(array); + hour_internal::(array, array.data_type()) + .map(|a| Arc::new(a) as ArrayRef) + } + DataType::Timestamp(TimeUnit::Second, _) => { + let array = as_primitive_array::(array); + hour_internal::(array, array.data_type()) + .map(|a| Arc::new(a) as ArrayRef) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + let array = as_primitive_array::(array); + hour_internal::(array, array.data_type()) + .map(|a| Arc::new(a) as ArrayRef) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + let array = as_primitive_array::(array); + hour_internal::(array, array.data_type()) + .map(|a| Arc::new(a) as ArrayRef) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + let array = as_primitive_array::(array); + hour_internal::(array, array.data_type()) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("hour does not support", dt), } } @@ -1197,13 +1252,12 @@ mod tests { let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 1]); let dict = DictionaryArray::try_new(&keys, &a).unwrap(); - let b = hour_generic::( - dict.downcast_dict::().unwrap(), - ) - .unwrap(); + let b = hour_dyn(&dict).unwrap(); - let expected = Int32Array::from(vec![11, 11, 21, 7, 21]); - assert_eq!(expected, b); + let expected_dict = + DictionaryArray::try_new(&keys, &Int32Array::from(vec![11, 21, 7])).unwrap(); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); let b = 
time_fraction_generic::( dict.downcast_dict::().unwrap(), From 766f69f715faa619077cc5458aef955b627af715 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 4 Nov 2022 01:07:08 -0700 Subject: [PATCH 0217/1411] Check overflow when casting integer to decimal (#3009) * Check overflow when casting integer to decimal * Trigger Build * Combine cast_integer_to_decimal functions of decimal128 and decimal256 * Fix clippy * Trigger Build * Use PREFIX way. --- arrow-array/src/types.rs | 5 ++ arrow/src/compute/kernels/cast.rs | 130 +++++++++++++++++++++--------- arrow/src/datatypes/native.rs | 23 ++++++ 3 files changed, 122 insertions(+), 36 deletions(-) diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index e6197eed19cf..7c7a5c811550 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -495,6 +495,9 @@ pub trait DecimalType: const TYPE_CONSTRUCTOR: fn(u8, u8) -> DataType; const DEFAULT_TYPE: DataType; + /// "Decimal128" or "Decimal256", for use in error messages + const PREFIX: &'static str; + /// Formats the decimal value with the provided precision and scale fn format_decimal(value: Self::Native, precision: u8, scale: u8) -> String; @@ -516,6 +519,7 @@ impl DecimalType for Decimal128Type { const TYPE_CONSTRUCTOR: fn(u8, u8) -> DataType = DataType::Decimal128; const DEFAULT_TYPE: DataType = DataType::Decimal128(DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE); + const PREFIX: &'static str = "Decimal128"; fn format_decimal(value: Self::Native, precision: u8, scale: u8) -> String { format_decimal_str(&value.to_string(), precision as usize, scale as usize) @@ -543,6 +547,7 @@ impl DecimalType for Decimal256Type { const TYPE_CONSTRUCTOR: fn(u8, u8) -> DataType = DataType::Decimal256; const DEFAULT_TYPE: DataType = DataType::Decimal256(DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE); + const PREFIX: &'static str = "Decimal256"; fn format_decimal(value: Self::Native, precision: u8, scale: u8) -> String { format_decimal_str(&value.to_string(), precision as usize, scale as usize) diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 4ad8dd99e73e..b1e744d26824 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -309,41 +309,43 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { cast_with_options(array, to_type, &DEFAULT_CAST_OPTIONS) } -fn cast_integer_to_decimal128( - array: &PrimitiveArray, - precision: u8, - scale: u8, -) -> Result -where - ::Native: AsPrimitive, -{ - let mul: i128 = 10_i128.pow(scale as u32); - - unary::(array, |v| v.as_() * mul) - .with_precision_and_scale(precision, scale) - .map(|a| Arc::new(a) as ArrayRef) -} - -fn cast_integer_to_decimal256( +fn cast_integer_to_decimal< + T: ArrowNumericType, + D: DecimalType + ArrowPrimitiveType, + M, +>( array: &PrimitiveArray, precision: u8, scale: u8, + base: M, + cast_options: &CastOptions, ) -> Result where - ::Native: AsPrimitive, + ::Native: AsPrimitive, + M: ArrowNativeTypeOp, { - let mul: i256 = i256::from_i128(10_i128) - .checked_pow(scale as u32) - .ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to Decimal256({}, {}). The scale causes overflow.", - precision, scale - )) - })?; + let mul: M = base.pow_checked(scale as u32).map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast to {:?}({}, {}). 
The scale causes overflow.", + D::PREFIX, + precision, + scale, + )) + })?; - unary::(array, |v| v.as_().wrapping_mul(mul)) - .with_precision_and_scale(precision, scale) - .map(|a| Arc::new(a) as ArrayRef) + if cast_options.safe { + let iter = array + .iter() + .map(|v| v.and_then(|v| v.as_().mul_checked(mul).ok())); + let casted_array = unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }; + casted_array + .with_precision_and_scale(precision, scale) + .map(|a| Arc::new(a) as ArrayRef) + } else { + try_unary::(array, |v| v.as_().mul_checked(mul)) + .and_then(|a| a.with_precision_and_scale(precision, scale)) + .map(|a| Arc::new(a) as ArrayRef) + } } fn cast_floating_point_to_decimal128( @@ -562,25 +564,33 @@ pub fn cast_with_options( // cast data to decimal match from_type { // TODO now just support signed numeric to decimal, support decimal to numeric later - Int8 => cast_integer_to_decimal128( + Int8 => cast_integer_to_decimal::<_, Decimal128Type, _>( as_primitive_array::(array), *precision, *scale, + 10_i128, + cast_options, ), - Int16 => cast_integer_to_decimal128( + Int16 => cast_integer_to_decimal::<_, Decimal128Type, _>( as_primitive_array::(array), *precision, *scale, + 10_i128, + cast_options, ), - Int32 => cast_integer_to_decimal128( + Int32 => cast_integer_to_decimal::<_, Decimal128Type, _>( as_primitive_array::(array), *precision, *scale, + 10_i128, + cast_options, ), - Int64 => cast_integer_to_decimal128( + Int64 => cast_integer_to_decimal::<_, Decimal128Type, _>( as_primitive_array::(array), *precision, *scale, + 10_i128, + cast_options, ), Float32 => cast_floating_point_to_decimal128( as_primitive_array::(array), @@ -603,25 +613,33 @@ pub fn cast_with_options( // cast data to decimal match from_type { // TODO now just support signed numeric to decimal, support decimal to numeric later - Int8 => cast_integer_to_decimal256( + Int8 => cast_integer_to_decimal::<_, Decimal256Type, _>( as_primitive_array::(array), *precision, *scale, + i256::from_i128(10_i128), + cast_options, ), - Int16 => cast_integer_to_decimal256( + Int16 => cast_integer_to_decimal::<_, Decimal256Type, _>( as_primitive_array::(array), *precision, *scale, + i256::from_i128(10_i128), + cast_options, ), - Int32 => cast_integer_to_decimal256( + Int32 => cast_integer_to_decimal::<_, Decimal256Type, _>( as_primitive_array::(array), *precision, *scale, + i256::from_i128(10_i128), + cast_options, ), - Int64 => cast_integer_to_decimal256( + Int64 => cast_integer_to_decimal::<_, Decimal256Type, _>( as_primitive_array::(array), *precision, *scale, + i256::from_i128(10_i128), + cast_options, ), Float32 => cast_floating_point_to_decimal256( as_primitive_array::(array), @@ -6049,4 +6067,44 @@ mod tests { ] ); } + + #[test] + fn test_cast_numeric_to_decimal128_overflow() { + let array = Int64Array::from(vec![i64::MAX]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal128(38, 30), + &CastOptions { safe: true }, + ); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + + let casted_array = cast_with_options( + &array, + &DataType::Decimal128(38, 30), + &CastOptions { safe: false }, + ); + assert!(casted_array.is_err()); + } + + #[test] + fn test_cast_numeric_to_decimal256_overflow() { + let array = Int64Array::from(vec![i64::MAX]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal256(76, 76), + &CastOptions { safe: true }, + ); + assert!(casted_array.is_ok()); + 
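For contrast with the overflow tests in this commit, a small sketch of the same kernel's happy path (illustrative only, not part of the patch; the values are hypothetical): when the rescaled value fits the target precision, the integer is simply multiplied by 10^scale, using the cast_with_options and CastOptions API exercised above.

use std::sync::Arc;
use arrow::array::{ArrayRef, Decimal128Array, Int64Array};
use arrow::compute::{cast_with_options, CastOptions};
use arrow::datatypes::DataType;

let ints: ArrayRef = Arc::new(Int64Array::from(vec![123_i64]));
let opts = CastOptions { safe: false };
let dec = cast_with_options(&ints, &DataType::Decimal128(10, 2), &opts).unwrap();
let dec = dec.as_any().downcast_ref::<Decimal128Array>().unwrap();
// 123 rescaled by 10^2 renders as "123.00".
assert_eq!(dec.value_as_string(0), "123.00");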
assert!(casted_array.unwrap().is_null(0)); + + let casted_array = cast_with_options( + &array, + &DataType::Decimal256(76, 76), + &CastOptions { safe: false }, + ); + assert!(casted_array.is_err()); + } } diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index bbdec14b44a0..28ef877a2fd3 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -19,6 +19,7 @@ use crate::error::{ArrowError, Result}; pub use arrow_array::ArrowPrimitiveType; pub use arrow_buffer::{i256, ArrowNativeType, ToByteSlice}; use half::f16; +use num::complex::ComplexFloat; /// Trait for [`ArrowNativeType`] that adds checked and unchecked arithmetic operations, /// and totally ordered comparison operations @@ -68,6 +69,10 @@ pub trait ArrowNativeTypeOp: ArrowNativeType { fn neg_wrapping(self) -> Self; + fn pow_checked(self, exp: u32) -> Result; + + fn pow_wrapping(self, exp: u32) -> Self; + fn is_zero(self) -> bool; fn is_eq(self, rhs: Self) -> bool; @@ -171,6 +176,16 @@ macro_rules! native_type_op { }) } + fn pow_checked(self, exp: u32) -> Result { + self.checked_pow(exp).ok_or_else(|| { + ArrowError::ComputeError(format!("Overflow happened on: {:?}", self)) + }) + } + + fn pow_wrapping(self, exp: u32) -> Self { + self.wrapping_pow(exp) + } + fn neg_wrapping(self) -> Self { self.wrapping_neg() } @@ -279,6 +294,14 @@ macro_rules! native_type_float_op { -self } + fn pow_checked(self, exp: u32) -> Result { + Ok(self.powi(exp as i32)) + } + + fn pow_wrapping(self, exp: u32) -> Self { + self.powi(exp as i32) + } + fn is_zero(self) -> bool { self == $zero } From 282e7b4c69a5d81cbbdd6d01d03064dd2fc6af99 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 4 Nov 2022 22:40:32 +1300 Subject: [PATCH 0218/1411] Add filter example (#3014) --- arrow/src/lib.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 9cf66d5460e2..7089c7152ee4 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -197,6 +197,20 @@ //! * [`sort`](compute::kernels::sort::sort) //! * some string operators such as [`substring`](compute::kernels::substring::substring) and [`length`](compute::kernels::length::length) //! +//! ``` +//! # use arrow::compute::gt_scalar; +//! # use arrow_array::cast::as_primitive_array; +//! # use arrow_array::Int32Array; +//! # use arrow_array::types::Int32Type; +//! # use arrow_select::filter::filter; +//! let array = Int32Array::from_iter(0..100); +//! let predicate = gt_scalar(&array, 60).unwrap(); +//! let filtered = filter(&array, &predicate).unwrap(); +//! +//! let expected = Int32Array::from_iter(61..100); +//! assert_eq!(&expected, as_primitive_array::(&filtered)); +//! ``` +//! //! As well as some horizontal operations, such as: //! //! 
* [`min`](compute::kernels::aggregate::min) and [`max`](compute::kernels::aggregate::max) From 29b3fef8ac9b3aa6da4372fd92bd1d230e285720 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 5 Nov 2022 10:32:31 +1300 Subject: [PATCH 0219/1411] Move ArrowNativeTypeOp to arrow-array (#2594) (#3018) * Move ArrowNativeTypeOp to arrow-array (#2594) * Fix features --- arrow-array/Cargo.toml | 2 +- .../src/arithmetic.rs | 47 +++++++++---------- arrow-array/src/lib.rs | 3 ++ arrow/src/datatypes/mod.rs | 4 +- 4 files changed, 29 insertions(+), 27 deletions(-) rename arrow/src/datatypes/native.rs => arrow-array/src/arithmetic.rs (85%) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 21f66c87ebd0..186e88ff147c 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -51,7 +51,7 @@ arrow-data = { version = "26.0.0", path = "../arrow-data" } chrono = { version = "0.4", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } -half = { version = "2.1", default-features = false } +half = { version = "2.1", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.12", default-features = false } [dev-dependencies] diff --git a/arrow/src/datatypes/native.rs b/arrow-array/src/arithmetic.rs similarity index 85% rename from arrow/src/datatypes/native.rs rename to arrow-array/src/arithmetic.rs index 28ef877a2fd3..e596c0064369 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow-array/src/arithmetic.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::error::{ArrowError, Result}; -pub use arrow_array::ArrowPrimitiveType; -pub use arrow_buffer::{i256, ArrowNativeType, ToByteSlice}; +use arrow_buffer::{i256, ArrowNativeType}; +use arrow_schema::ArrowError; use half::f16; use num::complex::ComplexFloat; @@ -45,31 +44,31 @@ pub trait ArrowNativeTypeOp: ArrowNativeType { /// The multiplicative identity const ONE: Self; - fn add_checked(self, rhs: Self) -> Result; + fn add_checked(self, rhs: Self) -> Result; fn add_wrapping(self, rhs: Self) -> Self; - fn sub_checked(self, rhs: Self) -> Result; + fn sub_checked(self, rhs: Self) -> Result; fn sub_wrapping(self, rhs: Self) -> Self; - fn mul_checked(self, rhs: Self) -> Result; + fn mul_checked(self, rhs: Self) -> Result; fn mul_wrapping(self, rhs: Self) -> Self; - fn div_checked(self, rhs: Self) -> Result; + fn div_checked(self, rhs: Self) -> Result; fn div_wrapping(self, rhs: Self) -> Self; - fn mod_checked(self, rhs: Self) -> Result; + fn mod_checked(self, rhs: Self) -> Result; fn mod_wrapping(self, rhs: Self) -> Self; - fn neg_checked(self) -> Result; + fn neg_checked(self) -> Result; fn neg_wrapping(self) -> Self; - fn pow_checked(self, exp: u32) -> Result; + fn pow_checked(self, exp: u32) -> Result; fn pow_wrapping(self, exp: u32) -> Self; @@ -97,7 +96,7 @@ macro_rules! native_type_op { const ZERO: Self = $zero; const ONE: Self = $one; - fn add_checked(self, rhs: Self) -> Result { + fn add_checked(self, rhs: Self) -> Result { self.checked_add(rhs).ok_or_else(|| { ArrowError::ComputeError(format!( "Overflow happened on: {:?} + {:?}", @@ -110,7 +109,7 @@ macro_rules! 
native_type_op { self.wrapping_add(rhs) } - fn sub_checked(self, rhs: Self) -> Result { + fn sub_checked(self, rhs: Self) -> Result { self.checked_sub(rhs).ok_or_else(|| { ArrowError::ComputeError(format!( "Overflow happened on: {:?} - {:?}", @@ -123,7 +122,7 @@ macro_rules! native_type_op { self.wrapping_sub(rhs) } - fn mul_checked(self, rhs: Self) -> Result { + fn mul_checked(self, rhs: Self) -> Result { self.checked_mul(rhs).ok_or_else(|| { ArrowError::ComputeError(format!( "Overflow happened on: {:?} * {:?}", @@ -136,7 +135,7 @@ macro_rules! native_type_op { self.wrapping_mul(rhs) } - fn div_checked(self, rhs: Self) -> Result { + fn div_checked(self, rhs: Self) -> Result { if rhs.is_zero() { Err(ArrowError::DivideByZero) } else { @@ -153,7 +152,7 @@ macro_rules! native_type_op { self.wrapping_div(rhs) } - fn mod_checked(self, rhs: Self) -> Result { + fn mod_checked(self, rhs: Self) -> Result { if rhs.is_zero() { Err(ArrowError::DivideByZero) } else { @@ -170,13 +169,13 @@ macro_rules! native_type_op { self.wrapping_rem(rhs) } - fn neg_checked(self) -> Result { + fn neg_checked(self) -> Result { self.checked_neg().ok_or_else(|| { ArrowError::ComputeError(format!("Overflow happened on: {:?}", self)) }) } - fn pow_checked(self, exp: u32) -> Result { + fn pow_checked(self, exp: u32) -> Result { self.checked_pow(exp).ok_or_else(|| { ArrowError::ComputeError(format!("Overflow happened on: {:?}", self)) }) @@ -238,7 +237,7 @@ macro_rules! native_type_float_op { const ZERO: Self = $zero; const ONE: Self = $one; - fn add_checked(self, rhs: Self) -> Result { + fn add_checked(self, rhs: Self) -> Result { Ok(self + rhs) } @@ -246,7 +245,7 @@ macro_rules! native_type_float_op { self + rhs } - fn sub_checked(self, rhs: Self) -> Result { + fn sub_checked(self, rhs: Self) -> Result { Ok(self - rhs) } @@ -254,7 +253,7 @@ macro_rules! native_type_float_op { self - rhs } - fn mul_checked(self, rhs: Self) -> Result { + fn mul_checked(self, rhs: Self) -> Result { Ok(self * rhs) } @@ -262,7 +261,7 @@ macro_rules! native_type_float_op { self * rhs } - fn div_checked(self, rhs: Self) -> Result { + fn div_checked(self, rhs: Self) -> Result { if rhs.is_zero() { Err(ArrowError::DivideByZero) } else { @@ -274,7 +273,7 @@ macro_rules! native_type_float_op { self / rhs } - fn mod_checked(self, rhs: Self) -> Result { + fn mod_checked(self, rhs: Self) -> Result { if rhs.is_zero() { Err(ArrowError::DivideByZero) } else { @@ -286,7 +285,7 @@ macro_rules! native_type_float_op { self % rhs } - fn neg_checked(self) -> Result { + fn neg_checked(self) -> Result { Ok(-self) } @@ -294,7 +293,7 @@ macro_rules! native_type_float_op { -self } - fn pow_checked(self, exp: u32) -> Result { + fn pow_checked(self, exp: u32) -> Result { Ok(self.powi(exp as i32)) } diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index e616099ccc89..5c86978dc94d 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -164,6 +164,9 @@ pub use array::*; mod record_batch; pub use record_batch::{RecordBatch, RecordBatchOptions}; +mod arithmetic; +pub use arithmetic::ArrowNativeTypeOp; + pub mod builder; pub mod cast; mod delta; diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 01462aeca96f..5d625a051fd0 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -22,12 +22,12 @@ //! * [`Field`](crate::datatypes::Field) to describe one field within a schema. //! * [`DataType`](crate::datatypes::DataType) to describe the type of a field. 
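As a rough illustration of the trait being relocated here (a sketch, not part of the patch), the checked methods surface overflow as an ArrowError while the wrapping variants mirror the standard library's wrapping_* behaviour:

use arrow_array::ArrowNativeTypeOp;

// Wrapping arithmetic silently wraps around...
assert_eq!(i32::MAX.add_wrapping(1), i32::MIN);
// ...whereas the checked variants return Err on overflow.
assert!(i32::MAX.add_checked(1).is_err());
assert_eq!(2_i64.pow_checked(10).unwrap(), 1024);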
-mod native; -pub use native::*; mod numeric; pub use numeric::*; pub use arrow_array::types::*; +pub use arrow_array::{ArrowNativeTypeOp, ArrowPrimitiveType}; +pub use arrow_buffer::{i256, ArrowNativeType, ToByteSlice}; pub use arrow_data::decimal::*; pub use arrow_schema::{ DataType, Field, IntervalUnit, Schema, SchemaRef, TimeUnit, UnionMode, From cdc8d0e58d51d1770d0e5363ba2985bbd417daa9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 5 Nov 2022 11:17:50 +1300 Subject: [PATCH 0220/1411] Split out arrow-cast (#2594) (#2998) * Split out arrow-cast (#2594) * Format --- .github/workflows/arrow.yml | 5 + .github/workflows/arrow_flight.yml | 1 + .github/workflows/dev_pr/labeler.yml | 1 + .github/workflows/integration.yml | 1 + .github/workflows/miri.yaml | 1 + .github/workflows/parquet.yml | 1 + Cargo.toml | 1 + arrow-cast/Cargo.toml | 52 ++++ .../kernels => arrow-cast/src}/cast.rs | 233 +++++++++--------- {arrow/src/util => arrow-cast/src}/display.rs | 72 +++--- .../serialization.rs => arrow-cast/src/lib.rs | 22 +- .../cast_utils.rs => arrow-cast/src/parse.rs | 52 ++-- arrow/Cargo.toml | 1 + arrow/src/compute/kernels/mod.rs | 4 +- arrow/src/csv/writer.rs | 4 +- arrow/src/lib.rs | 1 + arrow/src/util/mod.rs | 3 +- arrow/src/util/reader_parser.rs | 5 +- 18 files changed, 259 insertions(+), 201 deletions(-) create mode 100644 arrow-cast/Cargo.toml rename {arrow/src/compute/kernels => arrow-cast/src}/cast.rs (97%) rename {arrow/src/util => arrow-cast/src}/display.rs (92%) rename arrow/src/util/serialization.rs => arrow-cast/src/lib.rs (52%) rename arrow/src/compute/kernels/cast_utils.rs => arrow-cast/src/parse.rs (87%) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 868741c33cfa..9ae72dd009a3 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -28,6 +28,7 @@ on: - arrow/** - arrow-array/** - arrow-buffer/** + - arrow-cast/** - arrow-data/** - arrow-schema/** - arrow-select/** @@ -58,6 +59,8 @@ jobs: run: cargo test -p arrow-array --all-features - name: Test arrow-select with all features run: cargo test -p arrow-select --all-features + - name: Test arrow-cast with all features + run: cargo test -p arrow-cast --all-features - name: Test arrow-integration-test with all features run: cargo test -p arrow-integration-test --all-features - name: Test arrow with default features @@ -164,5 +167,7 @@ jobs: run: cargo clippy -p arrow-array --all-targets --all-features -- -D warnings - name: Clippy arrow-select with all features run: cargo clippy -p arrow-select --all-targets --all-features -- -D warnings + - name: Clippy arrow-cast with all features + run: cargo clippy -p arrow-cast --all-targets --all-features -- -D warnings - name: Clippy arrow run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index 548caeb2ab75..9621c9e69ddc 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -30,6 +30,7 @@ on: - arrow/** - arrow-array/** - arrow-buffer/** + - arrow-cast/** - arrow-data/** - arrow-schema/** - arrow-select/** diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index e44f5f8038ee..3a0073004996 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -19,6 +19,7 @@ arrow: - arrow/**/* - 
arrow-array/**/* - arrow-buffer/**/* + - arrow-cast/**/* - arrow-data/**/* - arrow-schema/**/* - arrow-select/**/* diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 7c1d2972f452..c2c0a79e63ba 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -27,6 +27,7 @@ on: - arrow/** - arrow-array/** - arrow-buffer/** + - arrow-cast/** - arrow-data/** - arrow-schema/** - arrow-select/** diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index 435582347e47..241b4f0b4a8d 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -27,6 +27,7 @@ on: - arrow/** - arrow-array/** - arrow-buffer/** + - arrow-cast/** - arrow-data/** - arrow-schema/** - arrow-select/** diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index dd1a782c4654..5a7beadfd71c 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -30,6 +30,7 @@ on: - arrow/** - arrow-array/** - arrow-buffer/** + - arrow-cast/** - arrow-data/** - arrow-schema/** - arrow-select/** diff --git a/Cargo.toml b/Cargo.toml index 6f61b0e456d6..d8fa5b9236e0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ members = [ "arrow", "arrow-array", "arrow-buffer", + "arrow-cast", "arrow-data", "arrow-flight", "arrow-integration-test", diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml new file mode 100644 index 000000000000..714ea0b480dd --- /dev/null +++ b/arrow-cast/Cargo.toml @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
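The manifest above introduces arrow-cast as a standalone crate. A short sketch of hypothetical downstream code that depends on it directly (not part of the patch), assuming the cast re-export declared in arrow-cast/src/lib.rs later in this commit:

use std::sync::Arc;
use arrow_array::{ArrayRef, Int32Array, StringArray};
use arrow_cast::cast;
use arrow_schema::DataType;

// The cast kernel no longer requires the full `arrow` facade crate.
let ints: ArrayRef = Arc::new(Int32Array::from(vec![5, 6, 7]));
let strings = cast(&ints, &DataType::Utf8).unwrap();
let strings = strings.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!(strings.value(0), "5");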
+ +[package] +name = "arrow-cast" +version = "26.0.0" +description = "Cast kernel and utilities for Apache Arrow" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_cast" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-array = { version = "26.0.0", path = "../arrow-array" } +arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } +arrow-data = { version = "26.0.0", path = "../arrow-data" } +arrow-schema = { version = "26.0.0", path = "../arrow-schema" } +arrow-select = { version = "26.0.0", path = "../arrow-select" } +chrono = { version = "0.4", default-features = false, features = ["clock"] } +num = { version = "0.4", default-features = false, features = ["std"] } +lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } + +[dev-dependencies] + +[build-dependencies] diff --git a/arrow/src/compute/kernels/cast.rs b/arrow-cast/src/cast.rs similarity index 97% rename from arrow/src/compute/kernels/cast.rs rename to arrow-cast/src/cast.rs index b1e744d26824..a3abe545d529 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow-cast/src/cast.rs @@ -21,9 +21,9 @@ //! Example: //! //! ``` -//! use arrow::array::*; -//! use arrow::compute::cast; -//! use arrow::datatypes::DataType; +//! use arrow_array::*; +//! use arrow_cast::cast; +//! use arrow_schema::DataType; //! use std::sync::Arc; //! //! let a = Int32Array::from(vec![5, 6, 7]); @@ -36,27 +36,18 @@ //! ``` use chrono::{DateTime, NaiveDateTime, Timelike}; -use std::str; use std::sync::Arc; -use crate::buffer::MutableBuffer; -use crate::compute::kernels::cast_utils::string_to_timestamp_nanos; -use crate::compute::{divide_scalar, multiply_scalar}; -use crate::compute::{try_unary, unary}; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::temporal_conversions::{ - as_datetime, EPOCH_DAYS_FROM_CE, MICROSECONDS, MILLISECONDS, MILLISECONDS_IN_DAY, - NANOSECONDS, SECONDS_IN_DAY, +use crate::display::{array_value_to_string, lexical_to_string}; +use crate::parse::string_to_timestamp_nanos; +use arrow_array::{ + builder::*, cast::*, iterator::ArrayIter, temporal_conversions::*, timezone::Tz, + types::*, *, }; -use crate::{array::*, compute::take}; -use crate::{ - buffer::Buffer, util::display::array_value_to_string, - util::serialization::lexical_to_string, -}; -use arrow_array::temporal_conversions::as_datetime_with_timezone; -use arrow_array::timezone::Tz; -use arrow_buffer::i256; +use arrow_buffer::{i256, ArrowNativeType, Buffer, MutableBuffer}; +use arrow_data::ArrayData; +use arrow_schema::*; +use arrow_select::take::take; use num::cast::AsPrimitive; use num::{NumCast, ToPrimitive}; @@ -305,12 +296,12 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { /// * To or from `StructArray` /// * List to primitive /// * Interval and duration -pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { +pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { cast_with_options(array, to_type, &DEFAULT_CAST_OPTIONS) } fn cast_integer_to_decimal< - T: ArrowNumericType, + T: ArrowPrimitiveType, D: DecimalType + ArrowPrimitiveType, M, >( @@ -319,7 +310,7 @@ fn cast_integer_to_decimal< scale: u8, base: M, cast_options: &CastOptions, -) -> 
Result +) -> Result where ::Native: AsPrimitive, M: ArrowNativeTypeOp, @@ -342,42 +333,43 @@ where .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } else { - try_unary::(array, |v| v.as_().mul_checked(mul)) + array + .try_unary::<_, D, _>(|v| v.as_().mul_checked(mul)) .and_then(|a| a.with_precision_and_scale(precision, scale)) .map(|a| Arc::new(a) as ArrayRef) } } -fn cast_floating_point_to_decimal128( +fn cast_floating_point_to_decimal128( array: &PrimitiveArray, precision: u8, scale: u8, -) -> Result +) -> Result where ::Native: AsPrimitive, { let mul = 10_f64.powi(scale as i32); - unary::(array, |v| (v.as_() * mul).round() as i128) + array + .unary::<_, Decimal128Type>(|v| (v.as_() * mul).round() as i128) .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } -fn cast_floating_point_to_decimal256( +fn cast_floating_point_to_decimal256( array: &PrimitiveArray, precision: u8, scale: u8, -) -> Result +) -> Result where ::Native: AsPrimitive, { let mul = 10_f64.powi(scale as i32); - unary::(array, |v| { - i256::from_i128((v.as_() * mul).round() as i128) - }) - .with_precision_and_scale(precision, scale) - .map(|a| Arc::new(a) as ArrayRef) + array + .unary::<_, Decimal256Type>(|v| i256::from_i128((v.as_() * mul).round() as i128)) + .with_precision_and_scale(precision, scale) + .map(|a| Arc::new(a) as ArrayRef) } /// Cast the primitive array using [`PrimitiveArray::reinterpret_cast`] @@ -386,7 +378,7 @@ fn cast_reinterpret_arrays< O: ArrowPrimitiveType, >( array: &dyn Array, -) -> Result { +) -> Result { Ok(Arc::new( as_primitive_array::(array).reinterpret_cast::(), )) @@ -511,7 +503,7 @@ pub fn cast_with_options( array: &ArrayRef, to_type: &DataType, cast_options: &CastOptions, -) -> Result { +) -> Result { use DataType::*; let from_type = array.data_type(); @@ -869,7 +861,7 @@ pub fn cast_with_options( .iter() .map(|maybe_value| match maybe_value { Some(value) => { - let result = str::from_utf8(value); + let result = std::str::from_utf8(value); if cast_options.safe { Ok(result.ok()) } else { @@ -883,7 +875,7 @@ pub fn cast_with_options( } None => Ok(None), }) - .collect::>()?, + .collect::>()?, )) } _ => Err(ArrowError::CastError(format!( @@ -923,7 +915,7 @@ pub fn cast_with_options( .iter() .map(|maybe_value| match maybe_value { Some(value) => { - let result = str::from_utf8(value); + let result = std::str::from_utf8(value); if cast_options.safe { Ok(result.ok()) } else { @@ -937,7 +929,7 @@ pub fn cast_with_options( } None => Ok(None), }) - .collect::>()?, + .collect::>()?, )) } _ => Err(ArrowError::CastError(format!( @@ -1394,9 +1386,11 @@ pub fn cast_with_options( // we either divide or multiply, depending on size of each unit // units are never the same when the types are the same let converted = if from_size >= to_size { - divide_scalar(time_array, from_size / to_size)? + let divisor = from_size / to_size; + time_array.unary::<_, Int64Type>(|o| o / divisor) } else { - multiply_scalar(time_array, to_size / from_size)? 
+ let mul = to_size / from_size; + time_array.unary::<_, Int64Type>(|o| o * mul) }; Ok(make_timestamp_array( &converted, @@ -1484,7 +1478,7 @@ pub fn cast_with_options( } /// Cast to string array to binary array -fn cast_string_to_binary(array: &ArrayRef) -> Result { +fn cast_string_to_binary(array: &ArrayRef) -> Result { let from_type = array.data_type(); match *from_type { DataType::Utf8 => { @@ -1534,7 +1528,7 @@ fn cast_decimal_to_decimal( input_scale: &u8, output_precision: &u8, output_scale: &u8, -) -> Result { +) -> Result { if input_scale > output_scale { // For example, input_scale is 4 and output_scale is 3; // Original value is 11234_i128, and will be cast to 1123_i128. @@ -1575,7 +1569,7 @@ fn cast_decimal_to_decimal( .map(Some) } }) - .collect::>>()?; + .collect::, _>>()?; let output_array = values .into_iter() @@ -1631,7 +1625,7 @@ fn cast_decimal_to_decimal( .map(Some) } }) - .collect::>>()?; + .collect::, _>>()?; let output_array = values .into_iter() @@ -1654,10 +1648,10 @@ fn cast_decimal_to_decimal( fn cast_numeric_arrays( from: &ArrayRef, cast_options: &CastOptions, -) -> Result +) -> Result where - FROM: ArrowNumericType, - TO: ArrowNumericType, + FROM: ArrowPrimitiveType, + TO: ArrowPrimitiveType, FROM::Native: NumCast, TO::Native: NumCast, { @@ -1680,14 +1674,16 @@ where // Natural cast between numeric types // If the value of T can't be casted to R, will throw error -fn try_numeric_cast(from: &PrimitiveArray) -> Result> +fn try_numeric_cast( + from: &PrimitiveArray, +) -> Result, ArrowError> where - T: ArrowNumericType, - R: ArrowNumericType, + T: ArrowPrimitiveType, + R: ArrowPrimitiveType, T::Native: NumCast, R::Native: NumCast, { - try_unary(from, |value| { + from.try_unary(|value| { num::cast::cast::(value).ok_or_else(|| { ArrowError::CastError(format!( "Can't cast value {:?} to type {}", @@ -1702,8 +1698,8 @@ where // If the value of T can't be casted to R, it will be converted to null fn numeric_cast(from: &PrimitiveArray) -> PrimitiveArray where - T: ArrowNumericType, - R: ArrowNumericType, + T: ArrowPrimitiveType, + R: ArrowPrimitiveType, T::Native: NumCast, R::Native: NumCast, { @@ -1754,7 +1750,7 @@ fn extract_component_from_datetime_array< mut builder: GenericStringBuilder, tz: &str, op: F, -) -> Result +) -> Result where OffsetSize: OffsetSizeTrait, F: Fn(DateTime) -> String, @@ -1781,9 +1777,9 @@ where fn cast_timestamp_to_string( array: &ArrayRef, tz: &Option, -) -> Result +) -> Result where - T: ArrowTemporalType + ArrowNumericType, + T: ArrowTemporalType + ArrowPrimitiveType, i64: From<::Native>, OffsetSize: OffsetSizeTrait, { @@ -1816,7 +1812,7 @@ where /// Cast date32 types to Utf8/LargeUtf8 fn cast_date32_to_string( array: &ArrayRef, -) -> Result { +) -> Result { let array = array.as_any().downcast_ref::().unwrap(); Ok(Arc::new( @@ -1835,7 +1831,7 @@ fn cast_date32_to_string( /// Cast date64 types to Utf8/LargeUtf8 fn cast_date64_to_string( array: &ArrayRef, -) -> Result { +) -> Result { let array = array.as_any().downcast_ref::().unwrap(); Ok(Arc::new( @@ -1852,9 +1848,11 @@ fn cast_date64_to_string( } /// Cast numeric types to Utf8 -fn cast_numeric_to_string(array: &ArrayRef) -> Result +fn cast_numeric_to_string( + array: &ArrayRef, +) -> Result where - FROM: ArrowNumericType, + FROM: ArrowPrimitiveType, FROM::Native: lexical_core::ToLexical, OffsetSize: OffsetSizeTrait, { @@ -1870,7 +1868,7 @@ fn numeric_to_string_cast( from: &PrimitiveArray, ) -> GenericStringArray where - T: ArrowPrimitiveType + ArrowNumericType, + T: ArrowPrimitiveType + 
ArrowPrimitiveType, T::Native: lexical_core::ToLexical, OffsetSize: OffsetSizeTrait, { @@ -1883,9 +1881,9 @@ where fn cast_string_to_numeric( from: &ArrayRef, cast_options: &CastOptions, -) -> Result +) -> Result where - T: ArrowNumericType, + T: ArrowPrimitiveType, ::Native: lexical_core::FromLexical, { Ok(Arc::new(string_to_numeric_cast::( @@ -1899,9 +1897,9 @@ where fn string_to_numeric_cast( from: &GenericStringArray, cast_options: &CastOptions, -) -> Result> +) -> Result, ArrowError> where - T: ArrowNumericType, + T: ArrowPrimitiveType, ::Native: lexical_core::FromLexical, { if cast_options.safe { @@ -1928,7 +1926,7 @@ where }) .transpose() }) - .collect::>>()?; + .collect::, _>>()?; // Benefit: // 20% performance improvement // Soundness: @@ -1941,7 +1939,7 @@ where fn cast_string_to_date32( array: &dyn Array, cast_options: &CastOptions, -) -> Result { +) -> Result { use chrono::Datelike; let string_array = array .as_any() @@ -1979,7 +1977,7 @@ fn cast_string_to_date32( }) .transpose() }) - .collect::>>>()?; + .collect::>, _>>()?; // Benefit: // 20% performance improvement @@ -1995,7 +1993,7 @@ fn cast_string_to_date32( fn cast_string_to_date64( array: &dyn Array, cast_options: &CastOptions, -) -> Result { +) -> Result { let string_array = array .as_any() .downcast_ref::>() @@ -2032,7 +2030,7 @@ fn cast_string_to_date64( }) .transpose() }) - .collect::>>>()?; + .collect::>, _>>()?; // Benefit: // 20% performance improvement @@ -2048,7 +2046,7 @@ fn cast_string_to_date64( fn cast_string_to_time32second( array: &dyn Array, cast_options: &CastOptions, -) -> Result { +) -> Result { /// The number of nanoseconds per millisecond. const NANOS_PER_SEC: u32 = 1_000_000_000; @@ -2096,7 +2094,7 @@ fn cast_string_to_time32second( }) .transpose() }) - .collect::>>>()?; + .collect::>, _>>()?; // Benefit: // 20% performance improvement @@ -2112,7 +2110,7 @@ fn cast_string_to_time32second( fn cast_string_to_time32millisecond( array: &dyn Array, cast_options: &CastOptions, -) -> Result { +) -> Result { /// The number of nanoseconds per millisecond. const NANOS_PER_MILLI: u32 = 1_000_000; /// The number of milliseconds per second. @@ -2162,7 +2160,7 @@ fn cast_string_to_time32millisecond( }) .transpose() }) - .collect::>>>()?; + .collect::>, _>>()?; // Benefit: // 20% performance improvement @@ -2178,7 +2176,7 @@ fn cast_string_to_time32millisecond( fn cast_string_to_time64microsecond( array: &dyn Array, cast_options: &CastOptions, -) -> Result { +) -> Result { /// The number of nanoseconds per microsecond. const NANOS_PER_MICRO: i64 = 1_000; /// The number of microseconds per second. @@ -2226,7 +2224,7 @@ fn cast_string_to_time64microsecond( }) .transpose() }) - .collect::>>>()?; + .collect::>, _>>()?; // Benefit: // 20% performance improvement @@ -2242,7 +2240,7 @@ fn cast_string_to_time64microsecond( fn cast_string_to_time64nanosecond( array: &dyn Array, cast_options: &CastOptions, -) -> Result { +) -> Result { /// The number of nanoseconds per second. 
const NANOS_PER_SEC: i64 = 1_000_000_000; @@ -2288,7 +2286,7 @@ fn cast_string_to_time64nanosecond( }) .transpose() }) - .collect::>>>()?; + .collect::>, _>>()?; // Benefit: // 20% performance improvement @@ -2304,7 +2302,7 @@ fn cast_string_to_time64nanosecond( fn cast_string_to_timestamp_ns( array: &dyn Array, cast_options: &CastOptions, -) -> Result { +) -> Result { let string_array = array .as_any() .downcast_ref::>() @@ -2323,7 +2321,7 @@ fn cast_string_to_timestamp_ns( let vec = string_array .iter() .map(|v| v.map(string_to_timestamp_nanos).transpose()) - .collect::>>>()?; + .collect::>, _>>()?; // Benefit: // 20% performance improvement @@ -2336,7 +2334,10 @@ fn cast_string_to_timestamp_ns( } /// Casts Utf8 to Boolean -fn cast_utf8_to_boolean(from: &ArrayRef, cast_options: &CastOptions) -> Result { +fn cast_utf8_to_boolean( + from: &ArrayRef, + cast_options: &CastOptions, +) -> Result { let array = as_string_array(from); let output_array = array @@ -2358,7 +2359,7 @@ fn cast_utf8_to_boolean(from: &ArrayRef, cast_options: &CastOptions) -> Result Ok(None), }) - .collect::>()?; + .collect::>()?; Ok(Arc::new(output_array)) } @@ -2366,9 +2367,9 @@ fn cast_utf8_to_boolean(from: &ArrayRef, cast_options: &CastOptions) -> Result(from: &ArrayRef) -> Result +fn cast_numeric_to_bool(from: &ArrayRef) -> Result where - FROM: ArrowNumericType, + FROM: ArrowPrimitiveType, { numeric_to_bool_cast::( from.as_any() @@ -2378,9 +2379,9 @@ where .map(|to| Arc::new(to) as ArrayRef) } -fn numeric_to_bool_cast(from: &PrimitiveArray) -> Result +fn numeric_to_bool_cast(from: &PrimitiveArray) -> Result where - T: ArrowPrimitiveType + ArrowNumericType, + T: ArrowPrimitiveType + ArrowPrimitiveType, { let mut b = BooleanBuilder::with_capacity(from.len()); @@ -2403,9 +2404,9 @@ where fn cast_bool_to_numeric( from: &ArrayRef, cast_options: &CastOptions, -) -> Result +) -> Result where - TO: ArrowNumericType, + TO: ArrowPrimitiveType, TO::Native: num::cast::NumCast, { Ok(Arc::new(bool_to_numeric_cast::( @@ -2419,7 +2420,7 @@ fn bool_to_numeric_cast( _cast_options: &CastOptions, ) -> PrimitiveArray where - T: ArrowNumericType, + T: ArrowPrimitiveType, T::Native: num::NumCast, { let iter = (0..from.len()).map(|i| { @@ -2447,7 +2448,7 @@ fn dictionary_cast( array: &ArrayRef, to_type: &DataType, cast_options: &CastOptions, -) -> Result { +) -> Result { use DataType::*; match to_type { @@ -2525,7 +2526,7 @@ fn unpack_dictionary( array: &ArrayRef, to_type: &DataType, cast_options: &CastOptions, -) -> Result +) -> Result where K: ArrowDictionaryKeyType, { @@ -2567,7 +2568,7 @@ fn cast_to_dictionary( array: &ArrayRef, dict_value_type: &DataType, cast_options: &CastOptions, -) -> Result { +) -> Result { use DataType::*; match *dict_value_type { @@ -2625,10 +2626,10 @@ fn pack_numeric_to_dictionary( array: &ArrayRef, dict_value_type: &DataType, cast_options: &CastOptions, -) -> Result +) -> Result where K: ArrowDictionaryKeyType, - V: ArrowNumericType, + V: ArrowPrimitiveType, { // attempt to cast the source array values to the target value type (the dictionary values type) let cast_values = cast_with_options(array, dict_value_type, cast_options)?; @@ -2656,7 +2657,7 @@ where fn pack_string_to_dictionary( array: &ArrayRef, cast_options: &CastOptions, -) -> Result +) -> Result where K: ArrowDictionaryKeyType, { @@ -2681,7 +2682,7 @@ fn cast_primitive_to_list( to: &Field, to_type: &DataType, cast_options: &CastOptions, -) -> Result { +) -> Result { // cast primitive to list's primitive let cast_array = cast_with_options(array, 
to.data_type(), cast_options)?; // create offsets, where if array.len() = 2, we have [0,1,2] @@ -2721,7 +2722,7 @@ fn cast_list_inner( to: &Field, to_type: &DataType, cast_options: &CastOptions, -) -> Result { +) -> Result { let data = array.data_ref(); let underlying_array = make_array(data.child_data()[0].clone()); let cast_array = cast_with_options(&underlying_array, to.data_type(), cast_options)?; @@ -2745,7 +2746,9 @@ fn cast_list_inner( /// Helper function to cast from `Utf8` to `LargeUtf8` and vice versa. If the `LargeUtf8` is too large for /// a `Utf8` array it will return an Error. -fn cast_str_container(array: &dyn Array) -> Result +fn cast_str_container( + array: &dyn Array, +) -> Result where OffsetSizeFrom: OffsetSizeTrait + ToPrimitive, OffsetSizeTo: OffsetSizeTrait + NumCast + ArrowNativeType, @@ -2760,15 +2763,17 @@ where let offsets = list_data.buffers()[0].typed_data::(); let mut offset_builder = BufferBuilder::::new(offsets.len()); - offsets.iter().try_for_each::<_, Result<_>>(|offset| { - let offset = OffsetSizeTo::from(*offset).ok_or_else(|| { - ArrowError::ComputeError( - "large-utf8 array too large to cast to utf8-array".into(), - ) + offsets + .iter() + .try_for_each::<_, Result<_, ArrowError>>(|offset| { + let offset = OffsetSizeTo::from(*offset).ok_or_else(|| { + ArrowError::ComputeError( + "large-utf8 array too large to cast to utf8-array".into(), + ) + })?; + offset_builder.append(offset); + Ok(()) })?; - offset_builder.append(offset); - Ok(()) - })?; let offset_buffer = offset_builder.finish(); @@ -2797,7 +2802,7 @@ where fn cast_list_container( array: &dyn Array, _cast_options: &CastOptions, -) -> Result +) -> Result where OffsetSizeFrom: OffsetSizeTrait + ToPrimitive, OffsetSizeTo: OffsetSizeTrait + NumCast, @@ -2869,8 +2874,6 @@ where #[cfg(test)] mod tests { use super::*; - use crate::datatypes::TimeUnit; - use crate::{buffer::Buffer, util::display::array_value_to_string}; macro_rules! generate_cast_test_case { ($INPUT_ARRAY: expr, $OUTPUT_TYPE_ARRAY: ident, $OUTPUT_TYPE: expr, $OUTPUT_VALUES: expr) => { @@ -2901,7 +2904,7 @@ mod tests { array: Vec>, precision: u8, scale: u8, - ) -> Result { + ) -> Result { array .into_iter() .collect::() @@ -2912,7 +2915,7 @@ mod tests { array: Vec>, precision: u8, scale: u8, - ) -> Result { + ) -> Result { array .into_iter() .collect::() @@ -5169,7 +5172,7 @@ mod tests { /// Convert `array` into a vector of strings by casting to data type dt fn get_cast_values(array: &ArrayRef, dt: &DataType) -> Vec where - T: ArrowNumericType, + T: ArrowPrimitiveType, { let c = cast(array, dt).unwrap(); let a = c.as_any().downcast_ref::>().unwrap(); diff --git a/arrow/src/util/display.rs b/arrow-cast/src/display.rs similarity index 92% rename from arrow/src/util/display.rs rename to arrow-cast/src/display.rs index f5bef1605ef8..b29f844fb677 100644 --- a/arrow/src/util/display.rs +++ b/arrow-cast/src/display.rs @@ -22,18 +22,11 @@ use std::fmt::Write; use std::sync::Arc; -use crate::array::Array; -use crate::datatypes::{ - ArrowNativeType, ArrowPrimitiveType, DataType, Field, Int16Type, Int32Type, - Int64Type, Int8Type, TimeUnit, UInt16Type, UInt32Type, UInt64Type, UInt8Type, - UnionMode, -}; -use crate::{array, datatypes::IntervalUnit}; - -use array::DictionaryArray; - -use crate::error::{ArrowError, Result}; use arrow_array::timezone::Tz; +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::ArrowNativeType; +use arrow_schema::*; macro_rules! 
make_string { ($array_type:ty, $column: ident, $row: ident) => {{ @@ -254,7 +247,7 @@ macro_rules! make_string_from_list { .value($row); let string_values = (0..list.len()) .map(|i| array_value_to_string(&list.clone(), i)) - .collect::>>()?; + .collect::, _>>()?; Ok(format!("[{}]", string_values.join(", "))) }}; } @@ -270,7 +263,7 @@ macro_rules! make_string_from_large_list { .value($row); let string_values = (0..list.len()) .map(|i| array_value_to_string(&list, i)) - .collect::>>()?; + .collect::, _>>()?; Ok(format!("[{}]", string_values.join(", "))) }}; } @@ -286,17 +279,17 @@ macro_rules! make_string_from_fixed_size_list { .value($row); let string_values = (0..list.len()) .map(|i| array_value_to_string(&list.clone(), i)) - .collect::>>()?; + .collect::, _>>()?; Ok(format!("[{}]", string_values.join(", "))) }}; } #[inline(always)] -pub fn make_string_from_decimal(column: &Arc, row: usize) -> Result { - let array = column - .as_any() - .downcast_ref::() - .unwrap(); +pub fn make_string_from_decimal( + column: &Arc, + row: usize, +) -> Result { + let array = column.as_any().downcast_ref::().unwrap(); Ok(array.value_as_string(row)) } @@ -306,7 +299,7 @@ fn append_struct_field_string( name: &str, field_col: &Arc, row: usize, -) -> Result<()> { +) -> Result<(), ArrowError> { target.push('"'); target.push_str(name); target.push_str("\": "); @@ -333,7 +326,10 @@ fn append_struct_field_string( /// /// Note this function is quite inefficient and is unlikely to be /// suitable for converting large arrays or record batches. -pub fn array_value_to_string(column: &array::ArrayRef, row: usize) -> Result { +pub fn array_value_to_string( + column: &ArrayRef, + row: usize, +) -> Result { if column.is_null(row) { return Ok("".to_string()); } @@ -487,12 +483,12 @@ pub fn array_value_to_string(column: &array::ArrayRef, row: usize) -> Result Result { +) -> Result { let list = column .as_any() .downcast_ref::() @@ -522,9 +518,9 @@ fn union_to_string( } /// Converts the value of the dictionary array at `row` to a String fn dict_array_value_to_string( - colum: &array::ArrayRef, + colum: &ArrayRef, row: usize, -) -> Result { +) -> Result { let dict_array = colum.as_any().downcast_ref::>().unwrap(); let keys_array = dict_array.keys(); @@ -533,13 +529,23 @@ fn dict_array_value_to_string( return Ok(String::from("")); } - let dict_index = keys_array.value(row).to_usize().ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "Can not convert value {:?} at index {:?} to usize for string conversion.", - keys_array.value(row), - row - )) - })?; - + let dict_index = keys_array.value(row).as_usize(); array_value_to_string(dict_array.values(), dict_index) } + +/// Converts numeric type to a `String` +pub fn lexical_to_string(n: N) -> String { + let mut buf = Vec::::with_capacity(N::FORMATTED_SIZE_DECIMAL); + unsafe { + // JUSTIFICATION + // Benefit + // Allows using the faster serializer lexical core and convert to string + // Soundness + // Length of buf is set as written length afterwards. lexical_core + // creates a valid string, so doesn't need to be checked. 
+ let slice = std::slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.capacity()); + let len = lexical_core::write(n, slice).len(); + buf.set_len(len); + String::from_utf8_unchecked(buf) + } +} diff --git a/arrow/src/util/serialization.rs b/arrow-cast/src/lib.rs similarity index 52% rename from arrow/src/util/serialization.rs rename to arrow-cast/src/lib.rs index 14d67ca117c4..397e5667e6ea 100644 --- a/arrow/src/util/serialization.rs +++ b/arrow-cast/src/lib.rs @@ -15,19 +15,9 @@ // specific language governing permissions and limitations // under the License. -/// Converts numeric type to a `String` -pub fn lexical_to_string(n: N) -> String { - let mut buf = Vec::::with_capacity(N::FORMATTED_SIZE_DECIMAL); - unsafe { - // JUSTIFICATION - // Benefit - // Allows using the faster serializer lexical core and convert to string - // Soundness - // Length of buf is set as written length afterwards. lexical_core - // creates a valid string, so doesn't need to be checked. - let slice = std::slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.capacity()); - let len = lexical_core::write(n, slice).len(); - buf.set_len(len); - String::from_utf8_unchecked(buf) - } -} +//! Cast kernel for [Apache Arrow](https://docs.rs/arrow) + +pub mod cast; +pub use cast::*; +pub mod display; +pub mod parse; diff --git a/arrow/src/compute/kernels/cast_utils.rs b/arrow-cast/src/parse.rs similarity index 87% rename from arrow/src/compute/kernels/cast_utils.rs rename to arrow-cast/src/parse.rs index 718ea5ac64a3..8a9d34b4c637 100644 --- a/arrow/src/compute/kernels/cast_utils.rs +++ b/arrow-cast/src/parse.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::error::{ArrowError, Result}; +use arrow_schema::ArrowError; use chrono::prelude::*; /// Accepts a string in RFC3339 / ISO8601 standard format and some @@ -66,7 +66,7 @@ use chrono::prelude::*; /// timestamp will be interpreted as though it were /// `1997-01-31T09:26:56.123-05:00` #[inline] -pub fn string_to_timestamp_nanos(s: &str) -> Result { +pub fn string_to_timestamp_nanos(s: &str) -> Result { // Fast path: RFC3339 timestamp (with a T) // Example: 2020-09-08T13:42:29.190855Z if let Ok(ts) = DateTime::parse_from_rfc3339(s) { @@ -135,52 +135,50 @@ mod tests { use super::*; #[test] - fn string_to_timestamp_timezone() -> Result<()> { + fn string_to_timestamp_timezone() { // Explicit timezone assert_eq!( 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855+00:00")? + parse_timestamp("2020-09-08T13:42:29.190855+00:00").unwrap() ); assert_eq!( 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855Z")? + parse_timestamp("2020-09-08T13:42:29.190855Z").unwrap() ); assert_eq!( 1599572549000000000, - parse_timestamp("2020-09-08T13:42:29Z")? + parse_timestamp("2020-09-08T13:42:29Z").unwrap() ); // no fractional part assert_eq!( 1599590549190855000, - parse_timestamp("2020-09-08T13:42:29.190855-05:00")? + parse_timestamp("2020-09-08T13:42:29.190855-05:00").unwrap() ); - Ok(()) } #[test] - fn string_to_timestamp_timezone_space() -> Result<()> { + fn string_to_timestamp_timezone_space() { // Ensure space rather than T between time and date is accepted assert_eq!( 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855+00:00")? + parse_timestamp("2020-09-08 13:42:29.190855+00:00").unwrap() ); assert_eq!( 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855Z")? 
+ parse_timestamp("2020-09-08 13:42:29.190855Z").unwrap() ); assert_eq!( 1599572549000000000, - parse_timestamp("2020-09-08 13:42:29Z")? + parse_timestamp("2020-09-08 13:42:29Z").unwrap() ); // no fractional part assert_eq!( 1599590549190855000, - parse_timestamp("2020-09-08 13:42:29.190855-05:00")? + parse_timestamp("2020-09-08 13:42:29.190855-05:00").unwrap() ); - Ok(()) } #[test] #[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function: mktime - fn string_to_timestamp_no_timezone() -> Result<()> { + fn string_to_timestamp_no_timezone() { // This test is designed to succeed in regardless of the local // timezone the test machine is running. Thus it is still // somewhat susceptible to bugs in the use of chrono @@ -192,12 +190,12 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29.190855")? + parse_timestamp("2020-09-08T13:42:29.190855").unwrap() ); assert_eq!( naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29.190855")? + parse_timestamp("2020-09-08 13:42:29.190855").unwrap() ); // Also ensure that parsing timestamps with no fractional @@ -210,15 +208,13 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( naive_datetime_whole_secs.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29")? + parse_timestamp("2020-09-08T13:42:29").unwrap() ); assert_eq!( naive_datetime_whole_secs.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29")? + parse_timestamp("2020-09-08 13:42:29").unwrap() ); - - Ok(()) } #[test] @@ -235,7 +231,7 @@ mod tests { } // Parse a timestamp to timestamp int with a useful human readable error message - fn parse_timestamp(s: &str) -> Result { + fn parse_timestamp(s: &str) -> Result { let result = string_to_timestamp_nanos(s); if let Err(e) = &result { eprintln!("Error parsing timestamp '{}': {:?}", s, e); @@ -258,7 +254,7 @@ mod tests { } #[test] - fn string_without_timezone_to_timestamp() -> Result<()> { + fn string_without_timezone_to_timestamp() { // string without timezone should always output the same regardless the local or session timezone let naive_datetime = NaiveDateTime::new( @@ -269,12 +265,12 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29.190855")? + parse_timestamp("2020-09-08T13:42:29.190855").unwrap() ); assert_eq!( naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29.190855")? + parse_timestamp("2020-09-08 13:42:29.190855").unwrap() ); let naive_datetime = NaiveDateTime::new( @@ -285,14 +281,12 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29")? + parse_timestamp("2020-09-08T13:42:29").unwrap() ); assert_eq!( naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29")? 
+ parse_timestamp("2020-09-08 13:42:29").unwrap() ); - - Ok(()) } } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 5b2639b7fdb0..5749f6799874 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -45,6 +45,7 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] [dependencies] arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "26.0.0", path = "../arrow-cast" } arrow-data = { version = "26.0.0", path = "../arrow-data" } arrow-schema = { version = "26.0.0", path = "../arrow-schema" } arrow-array = { version = "26.0.0", path = "../arrow-array" } diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs index a772f5bcc429..9ffa53eb2db7 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels/mod.rs @@ -22,8 +22,6 @@ pub mod arithmetic; pub mod arity; pub mod bitwise; pub mod boolean; -pub mod cast; -pub mod cast_utils; pub mod comparison; pub mod concat_elements; pub mod length; @@ -36,4 +34,6 @@ pub mod temporal; pub mod window; pub mod zip; +pub use arrow_cast::cast; +pub use arrow_cast::parse as cast_utils; pub use arrow_select::{concat, filter, interleave, take}; diff --git a/arrow/src/csv/writer.rs b/arrow/src/csv/writer.rs index fb3348d944f3..b2d02fe84947 100644 --- a/arrow/src/csv/writer.rs +++ b/arrow/src/csv/writer.rs @@ -67,12 +67,12 @@ use arrow_array::timezone::Tz; use chrono::{DateTime, Utc}; use std::io::Write; +use crate::array::*; use crate::csv::map_csv_error; use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::record_batch::RecordBatch; -use crate::util::display::make_string_from_decimal; -use crate::{array::*, util::serialization::lexical_to_string}; +use crate::util::display::{lexical_to_string, make_string_from_decimal}; const DEFAULT_DATE_FORMAT: &str = "%F"; const DEFAULT_TIME_FORMAT: &str = "%T"; diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 7089c7152ee4..0081856f3d68 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -33,6 +33,7 @@ //! //! * [`arrow-array`][arrow_array] - type-safe arrow array abstractions //! * [`arrow-buffer`][arrow_buffer] - buffer abstractions for arrow arrays +//! * [`arrow-cast`][arrow_cast] - cast kernels for arrow arrays //! * [`arrow-data`][arrow_data] - the underlying data of arrow arrays //! * [`arrow-schema`][arrow_schema] - the logical types for arrow arrays //! * [`arrow-select`][arrow_select] - selection kernels for arrow arrays diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index f0b9e0076ba1..9a0ca852a114 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -24,12 +24,11 @@ pub use arrow_data::bit_mask; pub mod bench_util; #[cfg(feature = "test_utils")] pub mod data_gen; -pub mod display; #[cfg(feature = "prettyprint")] pub mod pretty; -pub(crate) mod serialization; pub mod string_writer; #[cfg(any(test, feature = "test_utils"))] pub mod test_util; +pub use arrow_cast::display; pub(crate) mod reader_parser; diff --git a/arrow/src/util/reader_parser.rs b/arrow/src/util/reader_parser.rs index 60082e8dd551..efee629056df 100644 --- a/arrow/src/util/reader_parser.rs +++ b/arrow/src/util/reader_parser.rs @@ -15,8 +15,9 @@ // specific language governing permissions and limitations // under the License. 
-use crate::compute::kernels::cast_utils::string_to_timestamp_nanos; -use crate::datatypes::*; +use arrow_array::types::*; +use arrow_array::*; +use arrow_cast::parse::string_to_timestamp_nanos; /// Specialized parsing implementations /// used by csv and json reader From fc58036e77510de71b51c0190acfd629738afff8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 5 Nov 2022 14:12:31 +1300 Subject: [PATCH 0221/1411] Update arrow-flight (#3019) --- arrow-flight/Cargo.toml | 3 +- arrow-flight/src/arrow.flight.protocol.rs | 75 +++-- .../src/sql/arrow.flight.protocol.sql.rs | 312 ++++++++++++------ 3 files changed, 252 insertions(+), 138 deletions(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index b4fe03b4fd70..394fb98c3b2c 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -47,7 +47,8 @@ flight-sql-experimental = ["prost-types"] tonic-build = { version = "0.8", default-features = false, features = ["transport", "prost"] } # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = ">1.0.30", default-features = false } +proc-macro2 = { version = "=1.0.47", default-features = false } +prost-build = { version = "=0.11.2", default-features = false } [[example]] name = "flight_sql_server" diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs index d9e4200030fa..10ab82a87fc1 100644 --- a/arrow-flight/src/arrow.flight.protocol.rs +++ b/arrow-flight/src/arrow.flight.protocol.rs @@ -6,44 +6,43 @@ pub struct HandshakeRequest { /// /// A defined protocol version - #[prost(uint64, tag="1")] + #[prost(uint64, tag = "1")] pub protocol_version: u64, /// /// Arbitrary auth/handshake info. - #[prost(bytes="vec", tag="2")] + #[prost(bytes = "vec", tag = "2")] pub payload: ::prost::alloc::vec::Vec, } #[derive(Clone, PartialEq, ::prost::Message)] pub struct HandshakeResponse { /// /// A defined protocol version - #[prost(uint64, tag="1")] + #[prost(uint64, tag = "1")] pub protocol_version: u64, /// /// Arbitrary auth/handshake info. - #[prost(bytes="vec", tag="2")] + #[prost(bytes = "vec", tag = "2")] pub payload: ::prost::alloc::vec::Vec, } /// /// A message for doing simple auth. #[derive(Clone, PartialEq, ::prost::Message)] pub struct BasicAuth { - #[prost(string, tag="2")] + #[prost(string, tag = "2")] pub username: ::prost::alloc::string::String, - #[prost(string, tag="3")] + #[prost(string, tag = "3")] pub password: ::prost::alloc::string::String, } #[derive(Clone, PartialEq, ::prost::Message)] -pub struct Empty { -} +pub struct Empty {} /// /// Describes an available action, including both the name used for execution /// along with a short description of the purpose of the action. #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionType { - #[prost(string, tag="1")] + #[prost(string, tag = "1")] pub r#type: ::prost::alloc::string::String, - #[prost(string, tag="2")] + #[prost(string, tag = "2")] pub description: ::prost::alloc::string::String, } /// @@ -51,23 +50,23 @@ pub struct ActionType { /// of available Arrow Flight streams. #[derive(Clone, PartialEq, ::prost::Message)] pub struct Criteria { - #[prost(bytes="vec", tag="1")] + #[prost(bytes = "vec", tag = "1")] pub expression: ::prost::alloc::vec::Vec, } /// /// An opaque action specific for the service. 
#[derive(Clone, PartialEq, ::prost::Message)] pub struct Action { - #[prost(string, tag="1")] + #[prost(string, tag = "1")] pub r#type: ::prost::alloc::string::String, - #[prost(bytes="vec", tag="2")] + #[prost(bytes = "vec", tag = "2")] pub body: ::prost::alloc::vec::Vec, } /// /// An opaque result returned after executing an action. #[derive(Clone, PartialEq, ::prost::Message)] pub struct Result { - #[prost(bytes="vec", tag="1")] + #[prost(bytes = "vec", tag = "1")] pub body: ::prost::alloc::vec::Vec, } /// @@ -78,7 +77,7 @@ pub struct SchemaResult { /// 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix /// 4 bytes - the byte length of the payload /// a flatbuffer Message whose header is the Schema - #[prost(bytes="vec", tag="1")] + #[prost(bytes = "vec", tag = "1")] pub schema: ::prost::alloc::vec::Vec, } /// @@ -86,24 +85,34 @@ pub struct SchemaResult { /// a flight or be used to expose a set of previously defined flights. #[derive(Clone, PartialEq, ::prost::Message)] pub struct FlightDescriptor { - #[prost(enumeration="flight_descriptor::DescriptorType", tag="1")] + #[prost(enumeration = "flight_descriptor::DescriptorType", tag = "1")] pub r#type: i32, /// /// Opaque value used to express a command. Should only be defined when /// type = CMD. - #[prost(bytes="vec", tag="2")] + #[prost(bytes = "vec", tag = "2")] pub cmd: ::prost::alloc::vec::Vec, /// /// List of strings identifying a particular dataset. Should only be defined /// when type = PATH. - #[prost(string, repeated, tag="3")] + #[prost(string, repeated, tag = "3")] pub path: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, } /// Nested message and enum types in `FlightDescriptor`. pub mod flight_descriptor { /// /// Describes what type of descriptor is defined. - #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] + #[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + ::prost::Enumeration + )] #[repr(i32)] pub enum DescriptorType { /// Protobuf pattern, not used. @@ -140,11 +149,11 @@ pub struct FlightInfo { /// 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix /// 4 bytes - the byte length of the payload /// a flatbuffer Message whose header is the Schema - #[prost(bytes="vec", tag="1")] + #[prost(bytes = "vec", tag = "1")] pub schema: ::prost::alloc::vec::Vec, /// /// The descriptor associated with this info. - #[prost(message, optional, tag="2")] + #[prost(message, optional, tag = "2")] pub flight_descriptor: ::core::option::Option, /// /// A list of endpoints associated with the flight. To consume the @@ -156,12 +165,12 @@ pub struct FlightInfo { /// /// There is no ordering defined on endpoints. Hence, if the returned /// data has an ordering, it should be returned in a single endpoint. - #[prost(message, repeated, tag="3")] + #[prost(message, repeated, tag = "3")] pub endpoint: ::prost::alloc::vec::Vec, /// Set these to -1 if unknown. - #[prost(int64, tag="4")] + #[prost(int64, tag = "4")] pub total_records: i64, - #[prost(int64, tag="5")] + #[prost(int64, tag = "5")] pub total_bytes: i64, } /// @@ -170,7 +179,7 @@ pub struct FlightInfo { pub struct FlightEndpoint { /// /// Token used to retrieve this stream. - #[prost(message, optional, tag="1")] + #[prost(message, optional, tag = "1")] pub ticket: ::core::option::Option, /// /// A list of URIs where this ticket can be redeemed via DoGet(). 
@@ -187,7 +196,7 @@ pub struct FlightEndpoint { /// /// In other words, an application can use multiple locations to /// represent redundant and/or load balanced services. - #[prost(message, repeated, tag="2")] + #[prost(message, repeated, tag = "2")] pub location: ::prost::alloc::vec::Vec, } /// @@ -195,7 +204,7 @@ pub struct FlightEndpoint { /// stream given a ticket. #[derive(Clone, PartialEq, ::prost::Message)] pub struct Location { - #[prost(string, tag="1")] + #[prost(string, tag = "1")] pub uri: ::prost::alloc::string::String, } /// @@ -206,7 +215,7 @@ pub struct Location { /// behavior to reuse a ticket. #[derive(Clone, PartialEq, ::prost::Message)] pub struct Ticket { - #[prost(bytes="vec", tag="1")] + #[prost(bytes = "vec", tag = "1")] pub ticket: ::prost::alloc::vec::Vec, } /// @@ -216,29 +225,29 @@ pub struct FlightData { /// /// The descriptor of the data. This is only relevant when a client is /// starting a new DoPut stream. - #[prost(message, optional, tag="1")] + #[prost(message, optional, tag = "1")] pub flight_descriptor: ::core::option::Option, /// /// Header for message data as described in Message.fbs::Message. - #[prost(bytes="vec", tag="2")] + #[prost(bytes = "vec", tag = "2")] pub data_header: ::prost::alloc::vec::Vec, /// /// Application-defined metadata. - #[prost(bytes="vec", tag="3")] + #[prost(bytes = "vec", tag = "3")] pub app_metadata: ::prost::alloc::vec::Vec, /// /// The actual batch of Arrow data. Preferably handled with minimal-copies /// coming last in the definition to help with sidecar patterns (it is /// expected that some implementations will fetch this field off the wire /// with specialized code to avoid extra memory copies). - #[prost(bytes="vec", tag="1000")] + #[prost(bytes = "vec", tag = "1000")] pub data_body: ::prost::alloc::vec::Vec, } /// * /// The response message associated with the submission of a DoPut. #[derive(Clone, PartialEq, ::prost::Message)] pub struct PutResult { - #[prost(bytes="vec", tag="1")] + #[prost(bytes = "vec", tag = "1")] pub app_metadata: ::prost::alloc::vec::Vec, } /// Generated client implementations. diff --git a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs index 284f6a15c526..0fd003e1154d 100644 --- a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs +++ b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs @@ -38,7 +38,7 @@ pub struct CommandGetSqlInfo { /// Flight SQL Servers may choose to include additional metadata above and beyond the specified set, however they must /// at least return the specified set. IDs ranging from 0 to 10,000 (exclusive) are reserved for future use. /// If additional metadata is included, the metadata IDs should start from 10,000. - #[prost(uint32, repeated, tag="1")] + #[prost(uint32, repeated, tag = "1")] pub info: ::prost::alloc::vec::Vec, } /// @@ -102,7 +102,7 @@ pub struct CommandGetSqlInfo { pub struct CommandGetXdbcTypeInfo { /// /// Specifies the data type to search for the info. - #[prost(int32, optional, tag="1")] + #[prost(int32, optional, tag = "1")] pub data_type: ::core::option::Option, } /// @@ -118,8 +118,7 @@ pub struct CommandGetXdbcTypeInfo { /// > /// The returned data should be ordered by catalog_name. #[derive(Clone, PartialEq, ::prost::Message)] -pub struct CommandGetCatalogs { -} +pub struct CommandGetCatalogs {} /// /// Represents a request to retrieve the list of database schemas on a Flight SQL enabled backend. /// The definition of a database schema depends on vendor/implementation. 
It is usually a collection of tables. @@ -139,7 +138,7 @@ pub struct CommandGetDbSchemas { /// Specifies the Catalog to search for the tables. /// An empty string retrieves those without a catalog. /// If omitted the catalog name should not be used to narrow the search. - #[prost(string, optional, tag="1")] + #[prost(string, optional, tag = "1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// /// Specifies a filter pattern for schemas to search for. @@ -147,7 +146,7 @@ pub struct CommandGetDbSchemas { /// In the pattern string, two special characters can be used to denote matching rules: /// - "%" means to match any substring with 0 or more characters. /// - "_" means to match any one character. - #[prost(string, optional, tag="2")] + #[prost(string, optional, tag = "2")] pub db_schema_filter_pattern: ::core::option::Option<::prost::alloc::string::String>, } /// @@ -183,7 +182,7 @@ pub struct CommandGetTables { /// Specifies the Catalog to search for the tables. /// An empty string retrieves those without a catalog. /// If omitted the catalog name should not be used to narrow the search. - #[prost(string, optional, tag="1")] + #[prost(string, optional, tag = "1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// /// Specifies a filter pattern for schemas to search for. @@ -191,7 +190,7 @@ pub struct CommandGetTables { /// In the pattern string, two special characters can be used to denote matching rules: /// - "%" means to match any substring with 0 or more characters. /// - "_" means to match any one character. - #[prost(string, optional, tag="2")] + #[prost(string, optional, tag = "2")] pub db_schema_filter_pattern: ::core::option::Option<::prost::alloc::string::String>, /// /// Specifies a filter pattern for tables to search for. @@ -199,16 +198,18 @@ pub struct CommandGetTables { /// In the pattern string, two special characters can be used to denote matching rules: /// - "%" means to match any substring with 0 or more characters. /// - "_" means to match any one character. - #[prost(string, optional, tag="3")] - pub table_name_filter_pattern: ::core::option::Option<::prost::alloc::string::String>, + #[prost(string, optional, tag = "3")] + pub table_name_filter_pattern: ::core::option::Option< + ::prost::alloc::string::String, + >, /// /// Specifies a filter of table types which must match. /// The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. /// TABLE, VIEW, and SYSTEM TABLE are commonly supported. - #[prost(string, repeated, tag="4")] + #[prost(string, repeated, tag = "4")] pub table_types: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, /// Specifies if the Arrow schema should be returned for found tables. - #[prost(bool, tag="5")] + #[prost(bool, tag = "5")] pub include_schema: bool, } /// @@ -225,8 +226,7 @@ pub struct CommandGetTables { /// > /// The returned data should be ordered by table_type. #[derive(Clone, PartialEq, ::prost::Message)] -pub struct CommandGetTableTypes { -} +pub struct CommandGetTableTypes {} /// /// Represents a request to retrieve the primary keys of a table on a Flight SQL enabled backend. /// Used in the command member of FlightDescriptor for the following RPC calls: @@ -249,16 +249,16 @@ pub struct CommandGetPrimaryKeys { /// Specifies the catalog to search for the table. /// An empty string retrieves those without a catalog. /// If omitted the catalog name should not be used to narrow the search. 
- #[prost(string, optional, tag="1")] + #[prost(string, optional, tag = "1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// /// Specifies the schema to search for the table. /// An empty string retrieves those without a schema. /// If omitted the schema name should not be used to narrow the search. - #[prost(string, optional, tag="2")] + #[prost(string, optional, tag = "2")] pub db_schema: ::core::option::Option<::prost::alloc::string::String>, /// Specifies the table to get the primary keys for. - #[prost(string, tag="3")] + #[prost(string, tag = "3")] pub table: ::prost::alloc::string::String, } /// @@ -292,16 +292,16 @@ pub struct CommandGetExportedKeys { /// Specifies the catalog to search for the foreign key table. /// An empty string retrieves those without a catalog. /// If omitted the catalog name should not be used to narrow the search. - #[prost(string, optional, tag="1")] + #[prost(string, optional, tag = "1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// /// Specifies the schema to search for the foreign key table. /// An empty string retrieves those without a schema. /// If omitted the schema name should not be used to narrow the search. - #[prost(string, optional, tag="2")] + #[prost(string, optional, tag = "2")] pub db_schema: ::core::option::Option<::prost::alloc::string::String>, /// Specifies the foreign key table to get the foreign keys for. - #[prost(string, tag="3")] + #[prost(string, tag = "3")] pub table: ::prost::alloc::string::String, } /// @@ -339,16 +339,16 @@ pub struct CommandGetImportedKeys { /// Specifies the catalog to search for the primary key table. /// An empty string retrieves those without a catalog. /// If omitted the catalog name should not be used to narrow the search. - #[prost(string, optional, tag="1")] + #[prost(string, optional, tag = "1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// /// Specifies the schema to search for the primary key table. /// An empty string retrieves those without a schema. /// If omitted the schema name should not be used to narrow the search. - #[prost(string, optional, tag="2")] + #[prost(string, optional, tag = "2")] pub db_schema: ::core::option::Option<::prost::alloc::string::String>, /// Specifies the primary key table to get the foreign keys for. - #[prost(string, tag="3")] + #[prost(string, tag = "3")] pub table: ::prost::alloc::string::String, } /// @@ -388,43 +388,41 @@ pub struct CommandGetCrossReference { /// The catalog name where the parent table is. /// An empty string retrieves those without a catalog. /// If omitted the catalog name should not be used to narrow the search. - #[prost(string, optional, tag="1")] + #[prost(string, optional, tag = "1")] pub pk_catalog: ::core::option::Option<::prost::alloc::string::String>, /// * /// The Schema name where the parent table is. /// An empty string retrieves those without a schema. /// If omitted the schema name should not be used to narrow the search. - #[prost(string, optional, tag="2")] + #[prost(string, optional, tag = "2")] pub pk_db_schema: ::core::option::Option<::prost::alloc::string::String>, /// * /// The parent table name. It cannot be null. - #[prost(string, tag="3")] + #[prost(string, tag = "3")] pub pk_table: ::prost::alloc::string::String, /// * /// The catalog name where the foreign table is. /// An empty string retrieves those without a catalog. /// If omitted the catalog name should not be used to narrow the search. 
- #[prost(string, optional, tag="4")] + #[prost(string, optional, tag = "4")] pub fk_catalog: ::core::option::Option<::prost::alloc::string::String>, /// * /// The schema name where the foreign table is. /// An empty string retrieves those without a schema. /// If omitted the schema name should not be used to narrow the search. - #[prost(string, optional, tag="5")] + #[prost(string, optional, tag = "5")] pub fk_db_schema: ::core::option::Option<::prost::alloc::string::String>, /// * /// The foreign table name. It cannot be null. - #[prost(string, tag="6")] + #[prost(string, tag = "6")] pub fk_table: ::prost::alloc::string::String, } -// SQL Execution Action Messages - /// /// Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionCreatePreparedStatementRequest { /// The valid SQL string to create a prepared statement for. - #[prost(string, tag="1")] + #[prost(string, tag = "1")] pub query: ::prost::alloc::string::String, } /// @@ -436,15 +434,15 @@ pub struct ActionCreatePreparedStatementRequest { #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionCreatePreparedStatementResult { /// Opaque handle for the prepared statement on the server. - #[prost(bytes="vec", tag="1")] + #[prost(bytes = "vec", tag = "1")] pub prepared_statement_handle: ::prost::alloc::vec::Vec, /// If a result set generating query was provided, dataset_schema contains the /// schema of the dataset as described in Schema.fbs::Schema, it is serialized as an IPC message. - #[prost(bytes="vec", tag="2")] + #[prost(bytes = "vec", tag = "2")] pub dataset_schema: ::prost::alloc::vec::Vec, /// If the query provided contained parameters, parameter_schema contains the /// schema of the expected parameters as described in Schema.fbs::Schema, it is serialized as an IPC message. - #[prost(bytes="vec", tag="3")] + #[prost(bytes = "vec", tag = "3")] pub parameter_schema: ::prost::alloc::vec::Vec, } /// @@ -453,11 +451,9 @@ pub struct ActionCreatePreparedStatementResult { #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionClosePreparedStatementRequest { /// Opaque handle for the prepared statement on the server. - #[prost(bytes="vec", tag="1")] + #[prost(bytes = "vec", tag = "1")] pub prepared_statement_handle: ::prost::alloc::vec::Vec, } -// SQL Execution Messages. - /// /// Represents a SQL query. Used in the command member of FlightDescriptor /// for the following RPC calls: @@ -477,7 +473,7 @@ pub struct ActionClosePreparedStatementRequest { #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandStatementQuery { /// The SQL syntax. - #[prost(string, tag="1")] + #[prost(string, tag = "1")] pub query: ::prost::alloc::string::String, } /// * @@ -486,7 +482,7 @@ pub struct CommandStatementQuery { #[derive(Clone, PartialEq, ::prost::Message)] pub struct TicketStatementQuery { /// Unique identifier for the instance of the statement to execute. - #[prost(bytes="vec", tag="1")] + #[prost(bytes = "vec", tag = "1")] pub statement_handle: ::prost::alloc::vec::Vec, } /// @@ -509,7 +505,7 @@ pub struct TicketStatementQuery { #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandPreparedStatementQuery { /// Opaque handle for the prepared statement on the server. 
- #[prost(bytes="vec", tag="1")] + #[prost(bytes = "vec", tag = "1")] pub prepared_statement_handle: ::prost::alloc::vec::Vec, } /// @@ -518,7 +514,7 @@ pub struct CommandPreparedStatementQuery { #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandStatementUpdate { /// The SQL syntax. - #[prost(string, tag="1")] + #[prost(string, tag = "1")] pub query: ::prost::alloc::string::String, } /// @@ -528,7 +524,7 @@ pub struct CommandStatementUpdate { #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandPreparedStatementUpdate { /// Opaque handle for the prepared statement on the server. - #[prost(bytes="vec", tag="1")] + #[prost(bytes = "vec", tag = "1")] pub prepared_statement_handle: ::prost::alloc::vec::Vec, } /// @@ -539,15 +535,13 @@ pub struct CommandPreparedStatementUpdate { pub struct DoPutUpdateResult { /// The number of records updated. A return value of -1 represents /// an unknown updated record count. - #[prost(int64, tag="1")] + #[prost(int64, tag = "1")] pub record_count: i64, } /// Options for CommandGetSqlInfo. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlInfo { - // Server Information [0-500): Provides basic information about the Flight SQL Server. - /// Retrieves a UTF-8 string with the name of the Flight SQL Server. FlightSqlServerName = 0, /// Retrieves a UTF-8 string with the native version of the Flight SQL Server. @@ -561,8 +555,6 @@ pub enum SqlInfo { /// - false: if read-write /// - true: if read only FlightSqlServerReadOnly = 3, - // SQL Syntax Information [500-1000): provides information about SQL syntax supported by the Flight SQL Server. - /// /// Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of catalogs. 
/// @@ -1132,16 +1124,24 @@ impl SqlInfo { SqlInfo::SqlSupportsColumnAliasing => "SQL_SUPPORTS_COLUMN_ALIASING", SqlInfo::SqlNullPlusNullIsNull => "SQL_NULL_PLUS_NULL_IS_NULL", SqlInfo::SqlSupportsConvert => "SQL_SUPPORTS_CONVERT", - SqlInfo::SqlSupportsTableCorrelationNames => "SQL_SUPPORTS_TABLE_CORRELATION_NAMES", - SqlInfo::SqlSupportsDifferentTableCorrelationNames => "SQL_SUPPORTS_DIFFERENT_TABLE_CORRELATION_NAMES", - SqlInfo::SqlSupportsExpressionsInOrderBy => "SQL_SUPPORTS_EXPRESSIONS_IN_ORDER_BY", + SqlInfo::SqlSupportsTableCorrelationNames => { + "SQL_SUPPORTS_TABLE_CORRELATION_NAMES" + } + SqlInfo::SqlSupportsDifferentTableCorrelationNames => { + "SQL_SUPPORTS_DIFFERENT_TABLE_CORRELATION_NAMES" + } + SqlInfo::SqlSupportsExpressionsInOrderBy => { + "SQL_SUPPORTS_EXPRESSIONS_IN_ORDER_BY" + } SqlInfo::SqlSupportsOrderByUnrelated => "SQL_SUPPORTS_ORDER_BY_UNRELATED", SqlInfo::SqlSupportedGroupBy => "SQL_SUPPORTED_GROUP_BY", SqlInfo::SqlSupportsLikeEscapeClause => "SQL_SUPPORTS_LIKE_ESCAPE_CLAUSE", SqlInfo::SqlSupportsNonNullableColumns => "SQL_SUPPORTS_NON_NULLABLE_COLUMNS", SqlInfo::SqlSupportedGrammar => "SQL_SUPPORTED_GRAMMAR", SqlInfo::SqlAnsi92SupportedLevel => "SQL_ANSI92_SUPPORTED_LEVEL", - SqlInfo::SqlSupportsIntegrityEnhancementFacility => "SQL_SUPPORTS_INTEGRITY_ENHANCEMENT_FACILITY", + SqlInfo::SqlSupportsIntegrityEnhancementFacility => { + "SQL_SUPPORTS_INTEGRITY_ENHANCEMENT_FACILITY" + } SqlInfo::SqlOuterJoinsSupportLevel => "SQL_OUTER_JOINS_SUPPORT_LEVEL", SqlInfo::SqlSchemaTerm => "SQL_SCHEMA_TERM", SqlInfo::SqlProcedureTerm => "SQL_PROCEDURE_TERM", @@ -1149,11 +1149,15 @@ impl SqlInfo { SqlInfo::SqlCatalogAtStart => "SQL_CATALOG_AT_START", SqlInfo::SqlSchemasSupportedActions => "SQL_SCHEMAS_SUPPORTED_ACTIONS", SqlInfo::SqlCatalogsSupportedActions => "SQL_CATALOGS_SUPPORTED_ACTIONS", - SqlInfo::SqlSupportedPositionedCommands => "SQL_SUPPORTED_POSITIONED_COMMANDS", + SqlInfo::SqlSupportedPositionedCommands => { + "SQL_SUPPORTED_POSITIONED_COMMANDS" + } SqlInfo::SqlSelectForUpdateSupported => "SQL_SELECT_FOR_UPDATE_SUPPORTED", SqlInfo::SqlStoredProceduresSupported => "SQL_STORED_PROCEDURES_SUPPORTED", SqlInfo::SqlSupportedSubqueries => "SQL_SUPPORTED_SUBQUERIES", - SqlInfo::SqlCorrelatedSubqueriesSupported => "SQL_CORRELATED_SUBQUERIES_SUPPORTED", + SqlInfo::SqlCorrelatedSubqueriesSupported => { + "SQL_CORRELATED_SUBQUERIES_SUPPORTED" + } SqlInfo::SqlSupportedUnions => "SQL_SUPPORTED_UNIONS", SqlInfo::SqlMaxBinaryLiteralLength => "SQL_MAX_BINARY_LITERAL_LENGTH", SqlInfo::SqlMaxCharLiteralLength => "SQL_MAX_CHAR_LITERAL_LENGTH", @@ -1176,21 +1180,39 @@ impl SqlInfo { SqlInfo::SqlMaxTableNameLength => "SQL_MAX_TABLE_NAME_LENGTH", SqlInfo::SqlMaxTablesInSelect => "SQL_MAX_TABLES_IN_SELECT", SqlInfo::SqlMaxUsernameLength => "SQL_MAX_USERNAME_LENGTH", - SqlInfo::SqlDefaultTransactionIsolation => "SQL_DEFAULT_TRANSACTION_ISOLATION", + SqlInfo::SqlDefaultTransactionIsolation => { + "SQL_DEFAULT_TRANSACTION_ISOLATION" + } SqlInfo::SqlTransactionsSupported => "SQL_TRANSACTIONS_SUPPORTED", - SqlInfo::SqlSupportedTransactionsIsolationLevels => "SQL_SUPPORTED_TRANSACTIONS_ISOLATION_LEVELS", - SqlInfo::SqlDataDefinitionCausesTransactionCommit => "SQL_DATA_DEFINITION_CAUSES_TRANSACTION_COMMIT", - SqlInfo::SqlDataDefinitionsInTransactionsIgnored => "SQL_DATA_DEFINITIONS_IN_TRANSACTIONS_IGNORED", + SqlInfo::SqlSupportedTransactionsIsolationLevels => { + "SQL_SUPPORTED_TRANSACTIONS_ISOLATION_LEVELS" + } + SqlInfo::SqlDataDefinitionCausesTransactionCommit => { + 
"SQL_DATA_DEFINITION_CAUSES_TRANSACTION_COMMIT" + } + SqlInfo::SqlDataDefinitionsInTransactionsIgnored => { + "SQL_DATA_DEFINITIONS_IN_TRANSACTIONS_IGNORED" + } SqlInfo::SqlSupportedResultSetTypes => "SQL_SUPPORTED_RESULT_SET_TYPES", - SqlInfo::SqlSupportedConcurrenciesForResultSetUnspecified => "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_UNSPECIFIED", - SqlInfo::SqlSupportedConcurrenciesForResultSetForwardOnly => "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_FORWARD_ONLY", - SqlInfo::SqlSupportedConcurrenciesForResultSetScrollSensitive => "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_SENSITIVE", - SqlInfo::SqlSupportedConcurrenciesForResultSetScrollInsensitive => "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_INSENSITIVE", + SqlInfo::SqlSupportedConcurrenciesForResultSetUnspecified => { + "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_UNSPECIFIED" + } + SqlInfo::SqlSupportedConcurrenciesForResultSetForwardOnly => { + "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_FORWARD_ONLY" + } + SqlInfo::SqlSupportedConcurrenciesForResultSetScrollSensitive => { + "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_SENSITIVE" + } + SqlInfo::SqlSupportedConcurrenciesForResultSetScrollInsensitive => { + "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_INSENSITIVE" + } SqlInfo::SqlBatchUpdatesSupported => "SQL_BATCH_UPDATES_SUPPORTED", SqlInfo::SqlSavepointsSupported => "SQL_SAVEPOINTS_SUPPORTED", SqlInfo::SqlNamedParametersSupported => "SQL_NAMED_PARAMETERS_SUPPORTED", SqlInfo::SqlLocatorsUpdateCopy => "SQL_LOCATORS_UPDATE_COPY", - SqlInfo::SqlStoredFunctionsUsingCallSyntaxSupported => "SQL_STORED_FUNCTIONS_USING_CALL_SYNTAX_SUPPORTED", + SqlInfo::SqlStoredFunctionsUsingCallSyntaxSupported => { + "SQL_STORED_FUNCTIONS_USING_CALL_SYNTAX_SUPPORTED" + } } } } @@ -1209,10 +1231,18 @@ impl SqlSupportedCaseSensitivity { /// (if the ProtoBuf definition does not change) and safe for programmatic use. pub fn as_str_name(&self) -> &'static str { match self { - SqlSupportedCaseSensitivity::SqlCaseSensitivityUnknown => "SQL_CASE_SENSITIVITY_UNKNOWN", - SqlSupportedCaseSensitivity::SqlCaseSensitivityCaseInsensitive => "SQL_CASE_SENSITIVITY_CASE_INSENSITIVE", - SqlSupportedCaseSensitivity::SqlCaseSensitivityUppercase => "SQL_CASE_SENSITIVITY_UPPERCASE", - SqlSupportedCaseSensitivity::SqlCaseSensitivityLowercase => "SQL_CASE_SENSITIVITY_LOWERCASE", + SqlSupportedCaseSensitivity::SqlCaseSensitivityUnknown => { + "SQL_CASE_SENSITIVITY_UNKNOWN" + } + SqlSupportedCaseSensitivity::SqlCaseSensitivityCaseInsensitive => { + "SQL_CASE_SENSITIVITY_CASE_INSENSITIVE" + } + SqlSupportedCaseSensitivity::SqlCaseSensitivityUppercase => { + "SQL_CASE_SENSITIVITY_UPPERCASE" + } + SqlSupportedCaseSensitivity::SqlCaseSensitivityLowercase => { + "SQL_CASE_SENSITIVITY_LOWERCASE" + } } } } @@ -1273,7 +1303,9 @@ impl SupportedAnsi92SqlGrammarLevel { pub fn as_str_name(&self) -> &'static str { match self { SupportedAnsi92SqlGrammarLevel::Ansi92EntrySql => "ANSI92_ENTRY_SQL", - SupportedAnsi92SqlGrammarLevel::Ansi92IntermediateSql => "ANSI92_INTERMEDIATE_SQL", + SupportedAnsi92SqlGrammarLevel::Ansi92IntermediateSql => { + "ANSI92_INTERMEDIATE_SQL" + } SupportedAnsi92SqlGrammarLevel::Ansi92FullSql => "ANSI92_FULL_SQL", } } @@ -1330,9 +1362,15 @@ impl SqlSupportedElementActions { /// (if the ProtoBuf definition does not change) and safe for programmatic use. 
pub fn as_str_name(&self) -> &'static str { match self { - SqlSupportedElementActions::SqlElementInProcedureCalls => "SQL_ELEMENT_IN_PROCEDURE_CALLS", - SqlSupportedElementActions::SqlElementInIndexDefinitions => "SQL_ELEMENT_IN_INDEX_DEFINITIONS", - SqlSupportedElementActions::SqlElementInPrivilegeDefinitions => "SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS", + SqlSupportedElementActions::SqlElementInProcedureCalls => { + "SQL_ELEMENT_IN_PROCEDURE_CALLS" + } + SqlSupportedElementActions::SqlElementInIndexDefinitions => { + "SQL_ELEMENT_IN_INDEX_DEFINITIONS" + } + SqlSupportedElementActions::SqlElementInPrivilegeDefinitions => { + "SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS" + } } } } @@ -1349,8 +1387,12 @@ impl SqlSupportedPositionedCommands { /// (if the ProtoBuf definition does not change) and safe for programmatic use. pub fn as_str_name(&self) -> &'static str { match self { - SqlSupportedPositionedCommands::SqlPositionedDelete => "SQL_POSITIONED_DELETE", - SqlSupportedPositionedCommands::SqlPositionedUpdate => "SQL_POSITIONED_UPDATE", + SqlSupportedPositionedCommands::SqlPositionedDelete => { + "SQL_POSITIONED_DELETE" + } + SqlSupportedPositionedCommands::SqlPositionedUpdate => { + "SQL_POSITIONED_UPDATE" + } } } } @@ -1369,10 +1411,14 @@ impl SqlSupportedSubqueries { /// (if the ProtoBuf definition does not change) and safe for programmatic use. pub fn as_str_name(&self) -> &'static str { match self { - SqlSupportedSubqueries::SqlSubqueriesInComparisons => "SQL_SUBQUERIES_IN_COMPARISONS", + SqlSupportedSubqueries::SqlSubqueriesInComparisons => { + "SQL_SUBQUERIES_IN_COMPARISONS" + } SqlSupportedSubqueries::SqlSubqueriesInExists => "SQL_SUBQUERIES_IN_EXISTS", SqlSupportedSubqueries::SqlSubqueriesInIns => "SQL_SUBQUERIES_IN_INS", - SqlSupportedSubqueries::SqlSubqueriesInQuantifieds => "SQL_SUBQUERIES_IN_QUANTIFIEDS", + SqlSupportedSubqueries::SqlSubqueriesInQuantifieds => { + "SQL_SUBQUERIES_IN_QUANTIFIEDS" + } } } } @@ -1411,10 +1457,18 @@ impl SqlTransactionIsolationLevel { pub fn as_str_name(&self) -> &'static str { match self { SqlTransactionIsolationLevel::SqlTransactionNone => "SQL_TRANSACTION_NONE", - SqlTransactionIsolationLevel::SqlTransactionReadUncommitted => "SQL_TRANSACTION_READ_UNCOMMITTED", - SqlTransactionIsolationLevel::SqlTransactionReadCommitted => "SQL_TRANSACTION_READ_COMMITTED", - SqlTransactionIsolationLevel::SqlTransactionRepeatableRead => "SQL_TRANSACTION_REPEATABLE_READ", - SqlTransactionIsolationLevel::SqlTransactionSerializable => "SQL_TRANSACTION_SERIALIZABLE", + SqlTransactionIsolationLevel::SqlTransactionReadUncommitted => { + "SQL_TRANSACTION_READ_UNCOMMITTED" + } + SqlTransactionIsolationLevel::SqlTransactionReadCommitted => { + "SQL_TRANSACTION_READ_COMMITTED" + } + SqlTransactionIsolationLevel::SqlTransactionRepeatableRead => { + "SQL_TRANSACTION_REPEATABLE_READ" + } + SqlTransactionIsolationLevel::SqlTransactionSerializable => { + "SQL_TRANSACTION_SERIALIZABLE" + } } } } @@ -1432,9 +1486,15 @@ impl SqlSupportedTransactions { /// (if the ProtoBuf definition does not change) and safe for programmatic use. 
pub fn as_str_name(&self) -> &'static str { match self { - SqlSupportedTransactions::SqlTransactionUnspecified => "SQL_TRANSACTION_UNSPECIFIED", - SqlSupportedTransactions::SqlDataDefinitionTransactions => "SQL_DATA_DEFINITION_TRANSACTIONS", - SqlSupportedTransactions::SqlDataManipulationTransactions => "SQL_DATA_MANIPULATION_TRANSACTIONS", + SqlSupportedTransactions::SqlTransactionUnspecified => { + "SQL_TRANSACTION_UNSPECIFIED" + } + SqlSupportedTransactions::SqlDataDefinitionTransactions => { + "SQL_DATA_DEFINITION_TRANSACTIONS" + } + SqlSupportedTransactions::SqlDataManipulationTransactions => { + "SQL_DATA_MANIPULATION_TRANSACTIONS" + } } } } @@ -1453,10 +1513,18 @@ impl SqlSupportedResultSetType { /// (if the ProtoBuf definition does not change) and safe for programmatic use. pub fn as_str_name(&self) -> &'static str { match self { - SqlSupportedResultSetType::SqlResultSetTypeUnspecified => "SQL_RESULT_SET_TYPE_UNSPECIFIED", - SqlSupportedResultSetType::SqlResultSetTypeForwardOnly => "SQL_RESULT_SET_TYPE_FORWARD_ONLY", - SqlSupportedResultSetType::SqlResultSetTypeScrollInsensitive => "SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE", - SqlSupportedResultSetType::SqlResultSetTypeScrollSensitive => "SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE", + SqlSupportedResultSetType::SqlResultSetTypeUnspecified => { + "SQL_RESULT_SET_TYPE_UNSPECIFIED" + } + SqlSupportedResultSetType::SqlResultSetTypeForwardOnly => { + "SQL_RESULT_SET_TYPE_FORWARD_ONLY" + } + SqlSupportedResultSetType::SqlResultSetTypeScrollInsensitive => { + "SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE" + } + SqlSupportedResultSetType::SqlResultSetTypeScrollSensitive => { + "SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE" + } } } } @@ -1474,9 +1542,15 @@ impl SqlSupportedResultSetConcurrency { /// (if the ProtoBuf definition does not change) and safe for programmatic use. 
pub fn as_str_name(&self) -> &'static str { match self { - SqlSupportedResultSetConcurrency::SqlResultSetConcurrencyUnspecified => "SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED", - SqlSupportedResultSetConcurrency::SqlResultSetConcurrencyReadOnly => "SQL_RESULT_SET_CONCURRENCY_READ_ONLY", - SqlSupportedResultSetConcurrency::SqlResultSetConcurrencyUpdatable => "SQL_RESULT_SET_CONCURRENCY_UPDATABLE", + SqlSupportedResultSetConcurrency::SqlResultSetConcurrencyUnspecified => { + "SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED" + } + SqlSupportedResultSetConcurrency::SqlResultSetConcurrencyReadOnly => { + "SQL_RESULT_SET_CONCURRENCY_READ_ONLY" + } + SqlSupportedResultSetConcurrency::SqlResultSetConcurrencyUpdatable => { + "SQL_RESULT_SET_CONCURRENCY_UPDATABLE" + } } } } @@ -1519,8 +1593,12 @@ impl SqlSupportsConvert { SqlSupportsConvert::SqlConvertDecimal => "SQL_CONVERT_DECIMAL", SqlSupportsConvert::SqlConvertFloat => "SQL_CONVERT_FLOAT", SqlSupportsConvert::SqlConvertInteger => "SQL_CONVERT_INTEGER", - SqlSupportsConvert::SqlConvertIntervalDayTime => "SQL_CONVERT_INTERVAL_DAY_TIME", - SqlSupportsConvert::SqlConvertIntervalYearMonth => "SQL_CONVERT_INTERVAL_YEAR_MONTH", + SqlSupportsConvert::SqlConvertIntervalDayTime => { + "SQL_CONVERT_INTERVAL_DAY_TIME" + } + SqlSupportsConvert::SqlConvertIntervalYearMonth => { + "SQL_CONVERT_INTERVAL_YEAR_MONTH" + } SqlSupportsConvert::SqlConvertLongvarbinary => "SQL_CONVERT_LONGVARBINARY", SqlSupportsConvert::SqlConvertLongvarchar => "SQL_CONVERT_LONGVARCHAR", SqlSupportsConvert::SqlConvertNumeric => "SQL_CONVERT_NUMERIC", @@ -1643,8 +1721,12 @@ impl XdbcDatetimeSubcode { XdbcDatetimeSubcode::XdbcSubcodeYear => "XDBC_SUBCODE_YEAR", XdbcDatetimeSubcode::XdbcSubcodeTime => "XDBC_SUBCODE_TIME", XdbcDatetimeSubcode::XdbcSubcodeTimestamp => "XDBC_SUBCODE_TIMESTAMP", - XdbcDatetimeSubcode::XdbcSubcodeTimeWithTimezone => "XDBC_SUBCODE_TIME_WITH_TIMEZONE", - XdbcDatetimeSubcode::XdbcSubcodeTimestampWithTimezone => "XDBC_SUBCODE_TIMESTAMP_WITH_TIMEZONE", + XdbcDatetimeSubcode::XdbcSubcodeTimeWithTimezone => { + "XDBC_SUBCODE_TIME_WITH_TIMEZONE" + } + XdbcDatetimeSubcode::XdbcSubcodeTimestampWithTimezone => { + "XDBC_SUBCODE_TIMESTAMP_WITH_TIMEZONE" + } XdbcDatetimeSubcode::XdbcSubcodeSecond => "XDBC_SUBCODE_SECOND", XdbcDatetimeSubcode::XdbcSubcodeYearToMonth => "XDBC_SUBCODE_YEAR_TO_MONTH", XdbcDatetimeSubcode::XdbcSubcodeDayToHour => "XDBC_SUBCODE_DAY_TO_HOUR", @@ -1652,20 +1734,42 @@ impl XdbcDatetimeSubcode { XdbcDatetimeSubcode::XdbcSubcodeDayToSecond => "XDBC_SUBCODE_DAY_TO_SECOND", XdbcDatetimeSubcode::XdbcSubcodeHourToMinute => "XDBC_SUBCODE_HOUR_TO_MINUTE", XdbcDatetimeSubcode::XdbcSubcodeHourToSecond => "XDBC_SUBCODE_HOUR_TO_SECOND", - XdbcDatetimeSubcode::XdbcSubcodeMinuteToSecond => "XDBC_SUBCODE_MINUTE_TO_SECOND", + XdbcDatetimeSubcode::XdbcSubcodeMinuteToSecond => { + "XDBC_SUBCODE_MINUTE_TO_SECOND" + } XdbcDatetimeSubcode::XdbcSubcodeIntervalYear => "XDBC_SUBCODE_INTERVAL_YEAR", - XdbcDatetimeSubcode::XdbcSubcodeIntervalMonth => "XDBC_SUBCODE_INTERVAL_MONTH", + XdbcDatetimeSubcode::XdbcSubcodeIntervalMonth => { + "XDBC_SUBCODE_INTERVAL_MONTH" + } XdbcDatetimeSubcode::XdbcSubcodeIntervalDay => "XDBC_SUBCODE_INTERVAL_DAY", XdbcDatetimeSubcode::XdbcSubcodeIntervalHour => "XDBC_SUBCODE_INTERVAL_HOUR", - XdbcDatetimeSubcode::XdbcSubcodeIntervalMinute => "XDBC_SUBCODE_INTERVAL_MINUTE", - XdbcDatetimeSubcode::XdbcSubcodeIntervalSecond => "XDBC_SUBCODE_INTERVAL_SECOND", - XdbcDatetimeSubcode::XdbcSubcodeIntervalYearToMonth => "XDBC_SUBCODE_INTERVAL_YEAR_TO_MONTH", - 
XdbcDatetimeSubcode::XdbcSubcodeIntervalDayToHour => "XDBC_SUBCODE_INTERVAL_DAY_TO_HOUR", - XdbcDatetimeSubcode::XdbcSubcodeIntervalDayToMinute => "XDBC_SUBCODE_INTERVAL_DAY_TO_MINUTE", - XdbcDatetimeSubcode::XdbcSubcodeIntervalDayToSecond => "XDBC_SUBCODE_INTERVAL_DAY_TO_SECOND", - XdbcDatetimeSubcode::XdbcSubcodeIntervalHourToMinute => "XDBC_SUBCODE_INTERVAL_HOUR_TO_MINUTE", - XdbcDatetimeSubcode::XdbcSubcodeIntervalHourToSecond => "XDBC_SUBCODE_INTERVAL_HOUR_TO_SECOND", - XdbcDatetimeSubcode::XdbcSubcodeIntervalMinuteToSecond => "XDBC_SUBCODE_INTERVAL_MINUTE_TO_SECOND", + XdbcDatetimeSubcode::XdbcSubcodeIntervalMinute => { + "XDBC_SUBCODE_INTERVAL_MINUTE" + } + XdbcDatetimeSubcode::XdbcSubcodeIntervalSecond => { + "XDBC_SUBCODE_INTERVAL_SECOND" + } + XdbcDatetimeSubcode::XdbcSubcodeIntervalYearToMonth => { + "XDBC_SUBCODE_INTERVAL_YEAR_TO_MONTH" + } + XdbcDatetimeSubcode::XdbcSubcodeIntervalDayToHour => { + "XDBC_SUBCODE_INTERVAL_DAY_TO_HOUR" + } + XdbcDatetimeSubcode::XdbcSubcodeIntervalDayToMinute => { + "XDBC_SUBCODE_INTERVAL_DAY_TO_MINUTE" + } + XdbcDatetimeSubcode::XdbcSubcodeIntervalDayToSecond => { + "XDBC_SUBCODE_INTERVAL_DAY_TO_SECOND" + } + XdbcDatetimeSubcode::XdbcSubcodeIntervalHourToMinute => { + "XDBC_SUBCODE_INTERVAL_HOUR_TO_MINUTE" + } + XdbcDatetimeSubcode::XdbcSubcodeIntervalHourToSecond => { + "XDBC_SUBCODE_INTERVAL_HOUR_TO_SECOND" + } + XdbcDatetimeSubcode::XdbcSubcodeIntervalMinuteToSecond => { + "XDBC_SUBCODE_INTERVAL_MINUTE_TO_SECOND" + } } } } From e2c419953504bfc09d99436dc271ac446d216ac8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 6 Nov 2022 10:13:41 +1300 Subject: [PATCH 0222/1411] Support Predicate Pushdown for Parquet Lists (#2108) (#2999) * Add buffer to ColumnLevelDecoderImpl (#2108) * Implement skip_rep_levels * Add integration test * Clippy --- parquet/src/arrow/arrow_reader/mod.rs | 28 ++++ parquet/src/column/reader/decoder.rs | 221 ++++++++++++++++++++++---- parquet/src/encodings/rle.rs | 1 + 3 files changed, 218 insertions(+), 32 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 7f68b07eb487..eea271306e25 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -2421,4 +2421,32 @@ mod tests { let a: &Float64Array = batch.column(2).as_any().downcast_ref().unwrap(); assert_eq!(a.values(), &[42.000000, 7.700000, 42.125000, 7.700000]); } + + #[test] + #[cfg(feature = "snap")] + fn test_read_nested_lists() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{}/nested_lists.snappy.parquet", testdata); + let file = File::open(&path).unwrap(); + + let f = file.try_clone().unwrap(); + let mut reader = ParquetRecordBatchReader::try_new(f, 60).unwrap(); + let expected = reader.next().unwrap().unwrap(); + assert_eq!(expected.num_rows(), 3); + + let selection = RowSelection::from(vec![ + RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(1), + ]); + let mut reader = ParquetRecordBatchReaderBuilder::try_new(file) + .unwrap() + .with_row_selection(selection) + .build() + .unwrap(); + + let actual = reader.next().unwrap().unwrap(); + assert_eq!(actual.num_rows(), 1); + assert_eq!(actual.column(0), &expected.column(0).slice(1, 1)); + } } diff --git a/parquet/src/column/reader/decoder.rs b/parquet/src/column/reader/decoder.rs index b95b24a21c4b..da7fa78fe485 100644 --- a/parquet/src/column/reader/decoder.rs +++ b/parquet/src/column/reader/decoder.rs @@ 
-264,9 +264,13 @@ impl ColumnValueDecoder for ColumnValueDecoderImpl { } } +const SKIP_BUFFER_SIZE: usize = 1024; + /// An implementation of [`ColumnLevelDecoder`] for `[i16]` pub struct ColumnLevelDecoderImpl { decoder: Option, + /// Temporary buffer populated when skipping values + buffer: Vec, bit_width: u8, } @@ -275,9 +279,36 @@ impl ColumnLevelDecoderImpl { let bit_width = num_required_bits(max_level as u64); Self { decoder: None, + buffer: vec![], bit_width, } } + + /// Drops the first `len` values from the internal buffer + fn split_off_buffer(&mut self, len: usize) { + match self.buffer.len() == len { + true => self.buffer.clear(), + false => { + // Move to_read elements to end of slice + self.buffer.rotate_left(len); + // Truncate buffer + self.buffer.truncate(self.buffer.len() - len); + } + } + } + + /// Reads up to `to_read` values to the internal buffer + fn read_to_buffer(&mut self, to_read: usize) -> Result<()> { + let mut buf = std::mem::take(&mut self.buffer); + + // Repopulate buffer + buf.resize(to_read, 0); + let actual = self.read(&mut buf, 0..to_read)?; + buf.truncate(actual); + + self.buffer = buf; + Ok(()) + } } enum LevelDecoderInner { @@ -289,6 +320,7 @@ impl ColumnLevelDecoder for ColumnLevelDecoderImpl { type Slice = [i16]; fn set_data(&mut self, encoding: Encoding, data: ByteBufferPtr) { + self.buffer.clear(); match encoding { Encoding::RLE => { let mut decoder = RleDecoder::new(self.bit_width); @@ -305,12 +337,25 @@ impl ColumnLevelDecoder for ColumnLevelDecoderImpl { } } - fn read(&mut self, out: &mut Self::Slice, range: Range) -> Result { + fn read(&mut self, out: &mut Self::Slice, mut range: Range) -> Result { + let read_from_buffer = match self.buffer.is_empty() { + true => 0, + false => { + let read_from_buffer = self.buffer.len().min(range.end - range.start); + out[range.start..range.start + read_from_buffer] + .copy_from_slice(&self.buffer[0..read_from_buffer]); + self.split_off_buffer(read_from_buffer); + read_from_buffer + } + }; + range.start += read_from_buffer; + match self.decoder.as_mut().unwrap() { - LevelDecoderInner::Packed(reader, bit_width) => { - Ok(reader.get_batch::(&mut out[range], *bit_width as usize)) + LevelDecoderInner::Packed(reader, bit_width) => Ok(read_from_buffer + + reader.get_batch::(&mut out[range], *bit_width as usize)), + LevelDecoderInner::Rle(reader) => { + Ok(read_from_buffer + reader.get_batch(&mut out[range])?) 
} - LevelDecoderInner::Rle(reader) => reader.get_batch(&mut out[range]), } } } @@ -323,41 +368,153 @@ impl DefinitionLevelDecoder for ColumnLevelDecoderImpl { ) -> Result<(usize, usize)> { let mut level_skip = 0; let mut value_skip = 0; - match self.decoder.as_mut().unwrap() { - LevelDecoderInner::Packed(reader, bit_width) => { - for _ in 0..num_levels { - // Values are delimited by max_def_level - if max_def_level - == reader - .get_value::(*bit_width as usize) - .expect("Not enough values in Packed ColumnLevelDecoderImpl.") - { - value_skip += 1; - } - level_skip += 1; - } - } - LevelDecoderInner::Rle(reader) => { - for _ in 0..num_levels { - if let Some(level) = reader - .get::() - .expect("Not enough values in Rle ColumnLevelDecoderImpl.") - { - // Values are delimited by max_def_level - if level == max_def_level { - value_skip += 1; - } - } - level_skip += 1; + while level_skip < num_levels { + let remaining_levels = num_levels - level_skip; + + if self.buffer.is_empty() { + // Only read number of needed values + self.read_to_buffer(remaining_levels.min(SKIP_BUFFER_SIZE))?; + if self.buffer.is_empty() { + // Reached end of page + break; } } + let to_read = self.buffer.len().min(remaining_levels); + + level_skip += to_read; + value_skip += self.buffer[..to_read] + .iter() + .filter(|x| **x == max_def_level) + .count(); + + self.split_off_buffer(to_read) } + Ok((value_skip, level_skip)) } } impl RepetitionLevelDecoder for ColumnLevelDecoderImpl { - fn skip_rep_levels(&mut self, _num_records: usize) -> Result<(usize, usize)> { - Err(nyi_err!("https://github.com/apache/arrow-rs/issues/1792")) + fn skip_rep_levels(&mut self, num_records: usize) -> Result<(usize, usize)> { + let mut level_skip = 0; + let mut record_skip = 0; + + loop { + if self.buffer.is_empty() { + // Read SKIP_BUFFER_SIZE as we don't know how many to read + self.read_to_buffer(SKIP_BUFFER_SIZE)?; + if self.buffer.is_empty() { + // Reached end of page + break; + } + } + + let mut to_skip = 0; + while to_skip < self.buffer.len() && record_skip != num_records { + if self.buffer[to_skip] == 0 { + record_skip += 1; + } + to_skip += 1; + } + + // Find end of record + while to_skip < self.buffer.len() && self.buffer[to_skip] != 0 { + to_skip += 1; + } + + level_skip += to_skip; + if to_skip >= self.buffer.len() { + // Need to to read more values + self.buffer.clear(); + continue; + } + + self.split_off_buffer(to_skip); + break; + } + + Ok((record_skip, level_skip)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::encodings::rle::RleEncoder; + use rand::prelude::*; + + fn test_skip_levels(encoded: &[i16], data: ByteBufferPtr, skip: F) + where + F: Fn(&mut ColumnLevelDecoderImpl, &mut usize, usize), + { + let mut rng = thread_rng(); + let mut decoder = ColumnLevelDecoderImpl::new(5); + decoder.set_data(Encoding::RLE, data); + + let mut read = 0; + let mut decoded = vec![]; + let mut expected = vec![]; + while read < encoded.len() { + let to_read = rng.gen_range(0..(encoded.len() - read).min(100)) + 1; + + if rng.gen_bool(0.5) { + skip(&mut decoder, &mut read, to_read) + } else { + let start = decoded.len(); + let end = decoded.len() + to_read; + decoded.resize(end, 0); + let actual_read = decoder.read(&mut decoded, start..end).unwrap(); + assert_eq!(actual_read, to_read); + expected.extend_from_slice(&encoded[read..read + to_read]); + read += to_read; + } + } + assert_eq!(decoded, expected); + } + + #[test] + fn test_skip() { + let mut rng = thread_rng(); + let total_len = 10000; + let encoded: Vec = 
(0..total_len).map(|_| rng.gen_range(0..5)).collect(); + let mut encoder = RleEncoder::new(3, 1024); + for v in &encoded { + encoder.put(*v as _) + } + let data = ByteBufferPtr::new(encoder.consume()); + + for _ in 0..10 { + test_skip_levels(&encoded, data.clone(), |decoder, read, to_read| { + let (values_skipped, levels_skipped) = + decoder.skip_def_levels(to_read, 5).unwrap(); + assert_eq!(levels_skipped, to_read); + + let expected = &encoded[*read..*read + to_read]; + let expected_values_skipped = + expected.iter().filter(|x| **x == 5).count(); + assert_eq!(values_skipped, expected_values_skipped); + *read += to_read; + }); + + test_skip_levels(&encoded, data.clone(), |decoder, read, to_read| { + let (records_skipped, levels_skipped) = + decoder.skip_rep_levels(to_read).unwrap(); + + // If not run out of values + if levels_skipped + *read != encoded.len() { + // Should have read correct number of records + assert_eq!(records_skipped, to_read); + // Next value should be start of record + assert_eq!(encoded[levels_skipped + *read], 0); + } + + let expected = &encoded[*read..*read + levels_skipped]; + let expected_records_skipped = + expected.iter().filter(|x| **x == 0).count(); + assert_eq!(records_skipped, expected_records_skipped); + + *read += levels_skipped; + }); + } } } diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 9475275cb625..b0ae5af07d7f 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -338,6 +338,7 @@ impl RleDecoder { // These functions inline badly, they tend to inline and then create very large loop unrolls // that damage L1d-cache occupancy. This results in a ~18% performance drop #[inline(never)] + #[allow(unused)] pub fn get(&mut self) -> Result> { assert!(size_of::() <= 8); From 53b7f64d7c78d3173055fbf870c67294b80f81ef Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 6 Nov 2022 10:38:47 +1300 Subject: [PATCH 0223/1411] Make various i256 methods const (#3026) --- arrow-buffer/src/bigint.rs | 76 +++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index 463c63729adc..e87c05826fe2 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -80,40 +80,36 @@ impl i256 { /// Create an integer value from its representation as a byte array in little-endian. #[inline] - pub fn from_le_bytes(b: [u8; 32]) -> Self { + pub const fn from_le_bytes(b: [u8; 32]) -> Self { + let (low, high) = split_array(b); Self { - high: i128::from_le_bytes(b[16..32].try_into().unwrap()), - low: u128::from_le_bytes(b[0..16].try_into().unwrap()), + high: i128::from_le_bytes(high), + low: u128::from_le_bytes(low), } } /// Create an integer value from its representation as a byte array in little-endian. 
#[inline] - pub fn from_be_bytes(b: [u8; 32]) -> Self { + pub const fn from_be_bytes(b: [u8; 32]) -> Self { + let (high, low) = split_array(b); Self { - high: i128::from_be_bytes(b[0..16].try_into().unwrap()), - low: u128::from_be_bytes(b[16..32].try_into().unwrap()), + high: i128::from_be_bytes(high), + low: u128::from_be_bytes(low), } } - pub fn from_i128(v: i128) -> Self { - let mut bytes = if num::Signed::is_negative(&v) { - [255_u8; 32] - } else { - [0; 32] - }; - bytes[0..16].copy_from_slice(&v.to_le_bytes()); - Self::from_le_bytes(bytes) + pub const fn from_i128(v: i128) -> Self { + Self::from_parts(v as u128, v >> 127) } /// Create an i256 from the provided low u128 and high i128 #[inline] - pub fn from_parts(low: u128, high: i128) -> Self { + pub const fn from_parts(low: u128, high: i128) -> Self { Self { low, high } } /// Returns this `i256` as a low u128 and high i128 - pub fn to_parts(self) -> (u128, i128) { + pub const fn to_parts(self) -> (u128, i128) { (self.low, self.high) } @@ -131,23 +127,31 @@ impl i256 { /// Return the memory representation of this integer as a byte array in little-endian byte order. #[inline] - pub fn to_le_bytes(self) -> [u8; 32] { + pub const fn to_le_bytes(self) -> [u8; 32] { + let low = self.low.to_le_bytes(); + let high = self.high.to_le_bytes(); let mut t = [0; 32]; - let t_low: &mut [u8; 16] = (&mut t[0..16]).try_into().unwrap(); - *t_low = self.low.to_le_bytes(); - let t_high: &mut [u8; 16] = (&mut t[16..32]).try_into().unwrap(); - *t_high = self.high.to_le_bytes(); + let mut i = 0; + while i != 16 { + t[i] = low[i]; + t[i + 16] = high[i]; + i += 1; + } t } /// Return the memory representation of this integer as a byte array in big-endian byte order. #[inline] - pub fn to_be_bytes(self) -> [u8; 32] { + pub const fn to_be_bytes(self) -> [u8; 32] { + let low = self.low.to_be_bytes(); + let high = self.high.to_be_bytes(); let mut t = [0; 32]; - let t_low: &mut [u8; 16] = (&mut t[0..16]).try_into().unwrap(); - *t_low = self.high.to_be_bytes(); - let t_high: &mut [u8; 16] = (&mut t[16..32]).try_into().unwrap(); - *t_high = self.low.to_be_bytes(); + let mut i = 0; + while i != 16 { + t[i] = high[i]; + t[i + 16] = low[i]; + i += 1; + } t } @@ -369,6 +373,20 @@ impl i256 { } } +/// Temporary workaround due to lack of stable const array slicing +/// See +const fn split_array(vals: [u8; 32]) -> ([u8; 16], [u8; 16]) { + let mut a = [0; 16]; + let mut b = [0; 16]; + let mut i = 0; + while i != 16 { + a[i] = vals[i]; + b[i] = vals[i + 16]; + i += 1; + } + (a, b) +} + /// Performs an unsigned multiplication of `a * b` returning a tuple of /// `(low, high)` where `low` contains the lower 128-bits of the result /// and `high` the higher 128-bits @@ -490,6 +508,12 @@ mod tests { // Comparison assert_eq!(il.cmp(&ir), bl.cmp(&br), "{} cmp {}", bl, br); + // Conversions + assert_eq!(i256::from_le_bytes(il.to_le_bytes()), il); + assert_eq!(i256::from_be_bytes(il.to_be_bytes()), il); + assert_eq!(i256::from_le_bytes(ir.to_le_bytes()), ir); + assert_eq!(i256::from_be_bytes(ir.to_be_bytes()), ir); + // To i128 assert_eq!(il.to_i128(), bl.to_i128(), "{}", bl); assert_eq!(ir.to_i128(), br.to_i128(), "{}", br); From 488eff0e44480ddc017480144617ba271d4d3e23 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 5 Nov 2022 17:50:52 -0700 Subject: [PATCH 0224/1411] Validate decimal256 with i256 directly (#3025) * Validate decimal256 with i256 * Use from_le_bytes * Trigger Build --- arrow-array/src/types.rs | 8 +- arrow-data/src/decimal.rs | 677 
++++++++++++++++++-------------------- 2 files changed, 319 insertions(+), 366 deletions(-) diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 7c7a5c811550..03ecef361b04 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -22,9 +22,9 @@ use crate::delta::shift_months; use crate::OffsetSizeTrait; use arrow_buffer::i256; use arrow_data::decimal::{ - validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, - DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, - DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, + validate_decimal256_precision, validate_decimal_precision, DECIMAL128_MAX_PRECISION, + DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, + DECIMAL_DEFAULT_SCALE, }; use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; use chrono::{Duration, NaiveDate}; @@ -554,7 +554,7 @@ impl DecimalType for Decimal256Type { } fn validate_decimal_precision(num: i256, precision: u8) -> Result<(), ArrowError> { - validate_decimal256_precision_with_lt_bytes(&num.to_le_bytes(), precision) + validate_decimal256_precision(num, precision) } } diff --git a/arrow-data/src/decimal.rs b/arrow-data/src/decimal.rs index 592a461ad5cd..a6a08774941e 100644 --- a/arrow-data/src/decimal.rs +++ b/arrow-data/src/decimal.rs @@ -15,628 +15,627 @@ // specific language governing permissions and limitations // under the License. +use arrow_buffer::i256; use arrow_schema::ArrowError; -use num::BigInt; -use std::cmp::Ordering; // MAX decimal256 value of little-endian format for each precision. // Each element is the max value of signed 256-bit integer for the specified precision which // is encoded to the 32-byte width format of little-endian. -pub(crate) const MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [[u8; 32]; 76] = [ - [ +pub(crate) const MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [i256; 76] = [ + i256::from_le_bytes([ 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 231, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 15, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 159, 134, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 63, 66, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 127, 150, 152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 224, 245, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 201, 154, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 227, 11, 84, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 231, 118, 72, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 15, 165, 212, 232, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 159, 114, 78, 24, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 63, 122, 16, 243, 90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 127, 198, 164, 126, 141, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 192, 111, 242, 134, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 137, 93, 120, 69, 99, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 99, 167, 179, 182, 224, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 231, 137, 4, 35, 199, 138, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 15, 99, 45, 94, 199, 107, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 159, 222, 197, 173, 201, 53, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 63, 178, 186, 201, 224, 25, 30, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 127, 246, 74, 225, 199, 2, 45, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 160, 237, 204, 206, 27, 194, 211, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 73, 72, 1, 20, 22, 149, 69, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 227, 210, 12, 200, 220, 210, 183, 82, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 231, 60, 128, 208, 159, 60, 46, 59, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 15, 97, 2, 37, 62, 94, 206, 79, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 159, 202, 23, 114, 109, 174, 15, 30, 67, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 63, 234, 237, 116, 70, 208, 156, 44, 159, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 127, 38, 75, 145, 192, 34, 32, 190, 55, 126, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 128, 239, 172, 133, 91, 65, 109, 45, 238, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 9, 91, 193, 56, 147, 141, 68, 198, 77, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 99, 142, 141, 55, 192, 135, 173, 190, 9, 237, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 231, 143, 135, 43, 130, 77, 199, 114, 97, 66, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 
15, 159, 75, 179, 21, 7, 201, 123, 206, 151, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 159, 54, 244, 0, 217, 70, 218, 213, 16, 238, 133, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 63, 34, 138, 9, 122, 196, 134, 90, 168, 76, 59, 75, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 127, 86, 101, 95, 196, 172, 67, 137, 147, 254, 80, 240, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 96, 245, 185, 171, 191, 164, 92, 195, 241, 41, 99, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 201, 149, 67, 181, 124, 111, 158, 161, 113, 163, 223, 37, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 227, 217, 163, 20, 223, 90, 48, 80, 112, 98, 188, 122, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 231, 130, 102, 206, 182, 140, 227, 33, 99, 216, 91, 203, 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 15, 29, 1, 16, 36, 127, 227, 82, 223, 115, 150, 241, 123, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 159, 34, 11, 160, 104, 247, 226, 60, 185, 134, 224, 111, 215, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 63, 90, 111, 64, 22, 170, 221, 96, 60, 67, 197, 94, 106, 192, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 127, 134, 89, 132, 222, 164, 168, 200, 91, 160, 180, 179, 39, 132, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 64, 127, 43, 177, 112, 150, 214, 149, 67, 14, 5, 141, 41, 175, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 137, 248, 178, 235, 102, 224, 97, 218, 163, 142, 50, 130, 159, 215, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 99, 181, 253, 52, 5, 196, 210, 135, 102, 146, 249, 21, 59, 108, 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 231, 21, 233, 17, 52, 168, 59, 78, 1, 184, 191, 219, 78, 58, 172, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 15, 219, 26, 179, 8, 146, 84, 14, 13, 48, 125, 149, 20, 71, 186, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 159, 142, 12, 255, 86, 180, 77, 143, 130, 224, 227, 214, 205, 198, 70, 11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 63, 146, 125, 246, 101, 11, 9, 153, 25, 197, 230, 100, 10, 196, 195, 112, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 127, 182, 231, 160, 251, 113, 90, 250, 255, 178, 3, 241, 103, 168, 165, 103, 104, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 32, 13, 73, 212, 115, 136, 199, 255, 253, 36, 106, 15, 148, 120, 12, 20, 4, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 73, 131, 218, 74, 
134, 84, 203, 253, 235, 113, 37, 154, 200, 181, 124, 200, 40, 0, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 227, 32, 137, 236, 62, 77, 241, 233, 55, 115, 118, 5, 214, 25, 223, 212, 151, 1, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 231, 72, 91, 61, 117, 4, 109, 35, 47, 128, 160, 54, 92, 2, 183, 80, 238, 15, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 15, 217, 144, 101, 148, 44, 66, 98, 215, 1, 69, 34, 154, 23, 38, 39, 79, 159, 0, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 159, 122, 168, 247, 203, 189, 149, 214, 105, 18, 178, 86, 5, 236, 124, 135, 23, 57, 6, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 63, 202, 148, 172, 247, 105, 217, 97, 34, 184, 244, 98, 53, 56, 225, 74, 235, 58, 62, 0, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 127, 230, 207, 189, 172, 35, 126, 210, 87, 49, 143, 221, 21, 50, 204, 236, 48, 77, 110, 2, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 255, 0, 31, 106, 191, 100, 237, 56, 110, 237, 151, 167, 218, 244, 249, 63, 233, 3, 79, 24, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 255, 9, 54, 37, 122, 239, 69, 57, 78, 70, 239, 139, 138, 144, 195, 127, 28, 39, 22, 243, 0, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 255, 99, 28, 116, 197, 90, 187, 60, 14, 191, 88, 119, 105, 165, 163, 253, 28, 135, 221, 126, 9, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 255, 231, 27, 137, 182, 139, 81, 95, 142, 118, 119, 169, 30, 118, 100, 232, 33, 71, 167, 244, 94, 0, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 255, 15, 23, 91, 33, 117, 47, 185, 143, 161, 170, 158, 50, 157, 236, 19, 83, 199, 136, 142, 181, 3, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 255, 159, 230, 142, 77, 147, 218, 59, 157, 79, 170, 50, 250, 35, 62, 199, 62, 201, 87, 145, 23, 37, 0, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 255, 63, 2, 149, 7, 193, 137, 86, 36, 28, 167, 250, 197, 103, 109, 200, 115, 220, 109, 173, 235, 114, 1, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 255, 127, 22, 210, 75, 138, 97, 97, 107, 25, 135, 202, 187, 13, 70, 212, 133, 156, 74, 198, 52, 125, 14, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 255, 255, 224, 52, 246, 102, 207, 205, 49, 254, 70, 233, 85, 137, 188, 74, 58, 29, 234, 190, 15, 228, 144, 0, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 255, 255, 201, 16, 158, 5, 26, 10, 242, 237, 197, 28, 91, 93, 93, 235, 70, 36, 37, 117, 157, 232, 168, 5, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 255, 255, 227, 167, 44, 56, 4, 101, 116, 75, 187, 31, 143, 165, 165, 49, 197, 106, 115, 147, 38, 22, 153, 56, 0, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 255, 255, 231, 142, 190, 49, 42, 242, 139, 242, 80, 61, 151, 119, 120, 240, 179, 43, 130, 194, 129, 221, 250, 53, 2, - ], - [ + ]), + i256::from_le_bytes([ 255, 255, 255, 255, 255, 255, 255, 255, 255, 15, 149, 113, 241, 165, 117, 119, 121, 41, 101, 232, 171, 180, 100, 7, 181, 21, 153, 17, 167, 204, 27, 22, - ], + ]), 
]; // MIN decimal256 value of little-endian format for each precision. // Each element is the min value of signed 256-bit integer for the specified precision which // is encoded to the 76-byte width format of little-endian. -pub(crate) const MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [[u8; 32]; 76] = [ - [ +pub(crate) const MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [i256; 76] = [ + i256::from_le_bytes([ 247, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 157, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 25, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 241, 216, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 97, 121, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 193, 189, 240, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 129, 105, 103, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 31, 10, 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 54, 101, 196, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 28, 244, 171, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 24, 137, 183, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 240, 90, 43, 23, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 96, 141, 177, 231, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 192, 133, 239, 12, 165, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 128, 57, 91, 129, 114, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 63, 144, 13, 121, 220, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + 
i256::from_le_bytes([ 1, 0, 118, 162, 135, 186, 156, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 156, 88, 76, 73, 31, 242, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 24, 118, 251, 220, 56, 117, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 240, 156, 210, 161, 56, 148, 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 96, 33, 58, 82, 54, 202, 201, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 192, 77, 69, 54, 31, 230, 225, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 128, 9, 181, 30, 56, 253, 210, 234, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 95, 18, 51, 49, 228, 61, 44, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 182, 183, 254, 235, 233, 106, 186, 247, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 28, 45, 243, 55, 35, 45, 72, 173, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 24, 195, 127, 47, 96, 195, 209, 196, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 240, 158, 253, 218, 193, 161, 49, 176, 223, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 96, 53, 232, 141, 146, 81, 240, 225, 188, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 192, 21, 18, 139, 185, 47, 99, 211, 96, 243, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 128, 217, 180, 110, 63, 221, 223, 65, 200, 129, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 127, 16, 83, 122, 164, 190, 146, 210, 17, 251, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 246, 164, 62, 199, 108, 114, 187, 57, 178, 206, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 156, 113, 114, 200, 63, 120, 82, 65, 246, 18, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 24, 112, 120, 212, 125, 178, 56, 141, 158, 189, 236, 255, 255, 255, 255, 255, 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 240, 96, 180, 76, 234, 248, 54, 132, 49, 104, 63, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 96, 201, 11, 255, 38, 185, 37, 42, 239, 17, 122, 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 192, 221, 117, 246, 133, 59, 121, 165, 87, 179, 196, 180, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 128, 169, 154, 160, 59, 83, 188, 118, 108, 1, 175, 15, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 159, 10, 70, 84, 64, 91, 163, 60, 14, 214, 156, 226, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 54, 106, 188, 74, 131, 144, 97, 94, 142, 92, 32, 218, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 28, 38, 92, 235, 32, 165, 207, 175, 143, 157, 67, 133, 244, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 24, 125, 153, 49, 73, 115, 28, 222, 156, 39, 164, 52, 141, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 240, 226, 254, 239, 219, 128, 28, 173, 32, 140, 105, 14, 132, 251, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 96, 221, 244, 95, 151, 8, 29, 195, 70, 121, 31, 144, 40, 211, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 192, 165, 144, 191, 233, 85, 34, 159, 195, 188, 58, 161, 149, 63, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 128, 121, 166, 123, 33, 91, 87, 55, 164, 95, 75, 76, 216, 123, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 191, 128, 212, 78, 143, 105, 41, 106, 188, 241, 250, 114, 214, 80, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 118, 7, 77, 20, 153, 31, 158, 37, 92, 113, 205, 125, 96, 40, 249, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 156, 74, 2, 203, 250, 59, 45, 120, 153, 109, 6, 234, 196, 147, 187, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 24, 234, 22, 238, 203, 87, 196, 177, 254, 71, 64, 36, 177, 197, 83, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 240, 36, 229, 76, 247, 109, 171, 241, 242, 207, 130, 106, 235, 184, 69, 229, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 96, 113, 243, 0, 169, 75, 178, 112, 125, 31, 28, 41, 50, 57, 185, 244, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 192, 109, 130, 9, 154, 244, 246, 102, 230, 58, 25, 155, 245, 59, 60, 143, 245, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + 
i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 128, 73, 24, 95, 4, 142, 165, 5, 0, 77, 252, 14, 152, 87, 90, 152, 151, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 223, 242, 182, 43, 140, 119, 56, 0, 2, 219, 149, 240, 107, 135, 243, 235, 251, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 182, 124, 37, 181, 121, 171, 52, 2, 20, 142, 218, 101, 55, 74, 131, 55, 215, 255, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 28, 223, 118, 19, 193, 178, 14, 22, 200, 140, 137, 250, 41, 230, 32, 43, 104, 254, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 24, 183, 164, 194, 138, 251, 146, 220, 208, 127, 95, 201, 163, 253, 72, 175, 17, 240, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 240, 38, 111, 154, 107, 211, 189, 157, 40, 254, 186, 221, 101, 232, 217, 216, 176, 96, 255, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 96, 133, 87, 8, 52, 66, 106, 41, 150, 237, 77, 169, 250, 19, 131, 120, 232, 198, 249, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 192, 53, 107, 83, 8, 150, 38, 158, 221, 71, 11, 157, 202, 199, 30, 181, 20, 197, 193, 255, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 128, 25, 48, 66, 83, 220, 129, 45, 168, 206, 112, 34, 234, 205, 51, 19, 207, 178, 145, 253, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 0, 255, 224, 149, 64, 155, 18, 199, 145, 18, 104, 88, 37, 11, 6, 192, 22, 252, 176, 231, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 0, 246, 201, 218, 133, 16, 186, 198, 177, 185, 16, 116, 117, 111, 60, 128, 227, 216, 233, 12, 255, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 0, 156, 227, 139, 58, 165, 68, 195, 241, 64, 167, 136, 150, 90, 92, 2, 227, 120, 34, 129, 246, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 0, 24, 228, 118, 73, 116, 174, 160, 113, 137, 136, 86, 225, 137, 155, 23, 222, 184, 88, 11, 161, 255, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 0, 240, 232, 164, 222, 138, 208, 70, 112, 94, 85, 97, 205, 98, 19, 236, 172, 56, 119, 113, 74, 252, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 0, 96, 25, 113, 178, 108, 37, 196, 98, 176, 85, 205, 5, 220, 193, 56, 193, 54, 168, 110, 232, 218, 255, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 0, 192, 253, 106, 248, 62, 118, 169, 219, 227, 88, 5, 58, 152, 146, 55, 140, 35, 146, 82, 20, 141, 254, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 0, 128, 233, 45, 180, 117, 158, 158, 148, 230, 120, 53, 68, 242, 185, 43, 122, 99, 181, 57, 203, 130, 241, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 0, 0, 31, 203, 9, 153, 48, 50, 206, 1, 185, 22, 170, 118, 67, 181, 197, 226, 21, 65, 240, 27, 111, 255, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 0, 0, 54, 239, 97, 250, 229, 245, 13, 18, 58, 227, 164, 162, 162, 20, 185, 219, 218, 138, 98, 23, 87, 250, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 0, 0, 28, 88, 211, 199, 251, 154, 139, 180, 68, 224, 112, 90, 90, 206, 58, 149, 140, 108, 217, 233, 102, 199, 255, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 
0, 0, 0, 0, 0, 24, 113, 65, 206, 213, 13, 116, 13, 175, 194, 104, 136, 135, 15, 76, 212, 125, 61, 126, 34, 5, 202, 253, - ], - [ + ]), + i256::from_le_bytes([ 1, 0, 0, 0, 0, 0, 0, 0, 0, 240, 106, 142, 14, 90, 138, 136, 134, 214, 154, 23, 84, 75, 155, 248, 74, 234, 102, 238, 88, 51, 228, 233, - ], + ]), ]; /// `MAX_DECIMAL_FOR_EACH_PRECISION[p]` holds the maximum `i128` value @@ -770,11 +769,11 @@ pub fn validate_decimal_precision(value: i128, precision: u8) -> Result<(), Arro } } -/// Validates that the specified `byte_array` of little-endian format -/// value can be properly interpreted as a Decimal256 number with precision `precision` +/// Validates that the specified `i256` of value can be properly +/// interpreted as a Decimal256 number with precision `precision` #[inline] -pub fn validate_decimal256_precision_with_lt_bytes( - lt_value: &[u8], +pub fn validate_decimal256_precision( + value: i256, precision: u8, ) -> Result<(), ArrowError> { if precision > DECIMAL256_MAX_PRECISION { @@ -786,63 +785,17 @@ pub fn validate_decimal256_precision_with_lt_bytes( let max = MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[usize::from(precision) - 1]; let min = MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[usize::from(precision) - 1]; - if singed_cmp_le_bytes(lt_value, &max) == Ordering::Greater { + if value > max { Err(ArrowError::InvalidArgumentError(format!( "{:?} is too large to store in a Decimal256 of precision {}. Max is {:?}", - BigInt::from_signed_bytes_le(lt_value), - precision, - BigInt::from_signed_bytes_le(&max) + value, precision, max ))) - } else if singed_cmp_le_bytes(lt_value, &min) == Ordering::Less { + } else if value < min { Err(ArrowError::InvalidArgumentError(format!( "{:?} is too small to store in a Decimal256 of precision {}. Min is {:?}", - BigInt::from_signed_bytes_le(lt_value), - precision, - BigInt::from_signed_bytes_le(&min) + value, precision, min ))) } else { Ok(()) } } - -// compare two signed integer which are encoded with little endian. -// left bytes and right bytes must have the same length. -#[inline] -pub fn singed_cmp_le_bytes(left: &[u8], right: &[u8]) -> Ordering { - assert_eq!( - left.len(), - right.len(), - "Can't compare bytes array with different len: {}, {}", - left.len(), - right.len() - ); - assert_ne!(left.len(), 0, "Can't compare bytes array of length 0"); - let len = left.len(); - // the sign bit is 1, the value is negative - let left_negative = left[len - 1] >= 0x80_u8; - let right_negative = right[len - 1] >= 0x80_u8; - if left_negative != right_negative { - return match left_negative { - true => { - // left is negative value - // right is positive value - Ordering::Less - } - false => Ordering::Greater, - }; - } - for i in 0..len { - let l_byte = left[len - 1 - i]; - let r_byte = right[len - 1 - i]; - match l_byte.cmp(&r_byte) { - Ordering::Less => { - return Ordering::Less; - } - Ordering::Greater => { - return Ordering::Greater; - } - Ordering::Equal => {} - } - } - Ordering::Equal -} From 4f525fe1daa1058dfa90b3cd72cb6cc957f2ea7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Gallego=20Castellanos?= Date: Sun, 6 Nov 2022 05:14:42 +0100 Subject: [PATCH 0225/1411] Hadoop LZ4 Support for LZ4 Codec (#3013) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added tests for hadoop_lz4_compress_large.parquet * Changed interface to be able to receive CodecOptions. * Added `CodecOptions` struct to hold `Codec` configuration. * Added `backward_compatible_lz4` option in `CodecOptions`. 
* Added `CodecOptions` to `ReadOptions` to be able to configure `SerializedFileReader`. * Added `SerializedRowGroupReaderOptionsBuilder` with `CodecOptions` to be able to configure `SerializedRowGroupReader`, with extensible interface. * Added `SerializedPageReaderOptionsBuilder` with `CodecOptions` to be able to configure `SerializedPageReader`, with extensible interface. * Added `new_with_config` to `SerializedPageReader` API to be able to configure `SerializedFileReader` without breaking `new` API. * `CodecOptions` implements `CopyTrait` as it is composed by `Copy` types. If in the future it contains a non `Copy` type, maybe is better to create `CodecOptionsPtr = Arc`. * `CodecOptions` is only added in the read path, in the write path the default values are taken, as the options currently only affect the read path and have no effect on write path. If required to add to write path maybe it will be nice to add into `WriteProperties`. * Added support for LZ4_HADOOP compression codec. * Added compression and decompression for LZ4_HADOOP. * Added tests for LZ4 fallback. * Added a test for two parquet files with the same content, both with LZ4 CompressionCodec, but one using the LZ4_HADOOP (no-fallback) algorithm and the other LZ4_RAW algorithm (fallback to last level). * Refactor `LZ4HadoopCodec::compress` function to make it easier to understand. * Fixed documentation tests. * Changed interface to make `CodecOptions` private to the crate. This commits hides `CodecOptions` from the public API. The changes are the following: - Added a new structs to public API `ReaderProperties`, `ReaderPropertiesBuilder` and `ReaderPropertiesPtr` to store inmutable reader config, as it is the case of `CodecOptions`. - Removed `SerializedRowGroupReaderOptions`, `SerializedRowGroupReaderOptionsBuilder`, `SerializedPageReaderOptionsBuilder` and `SerializedPageReaderOptions`. They are not required anymore as `SerializedRowGroupReader` and `SerializedRowGroupReaderOptions` use `ReaderPropertiesPtr` for configuration. - `SerializedRowGroupReader::new_with_options` renamed to `SerializedRowGroupReader::new_with_properties`. - `SerializedPageReader::new_with_options` renamed to `SerializedPageReader::new_with_properties`. - Test added for `ReaderPropertiesBuilder`. * Removed incorrect cfg macro for `try_hadoop_decompress` function. Co-authored-by: Adrián Gallego Castellanos --- parquet/src/arrow/arrow_reader/mod.rs | 70 ++++++ parquet/src/column/writer/mod.rs | 32 ++- parquet/src/compression.rs | 304 ++++++++++++++++++++++++-- parquet/src/file/properties.rs | 101 +++++++++ parquet/src/file/serialized_reader.rs | 63 +++++- parquet/src/file/writer.rs | 15 +- parquet/tests/arrow_writer_layout.rs | 8 +- 7 files changed, 550 insertions(+), 43 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index eea271306e25..19c877dffc2c 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -2422,6 +2422,76 @@ mod tests { assert_eq!(a.values(), &[42.000000, 7.700000, 42.125000, 7.700000]); } + // This test is to ensure backward compatibility, it test 2 files containing the LZ4 CompressionCodec + // but different algorithms: LZ4_HADOOP and LZ4_RAW. + // 1. hadoop_lz4_compressed.parquet -> It is a file with LZ4 CompressionCodec which uses + // LZ4_HADOOP algorithm for compression. + // 2. non_hadoop_lz4_compressed.parquet -> It is a file with LZ4 CompressionCodec which uses + // LZ4_RAW algorithm for compression. 
This fallback is done to keep backward compatibility with + // older parquet-cpp versions. + // + // For more information, check: https://github.com/apache/arrow-rs/issues/2988 + #[test] + fn test_read_lz4_hadoop_fallback() { + for file in [ + "hadoop_lz4_compressed.parquet", + "non_hadoop_lz4_compressed.parquet", + ] { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{}/{}", testdata, file); + let file = File::open(&path).unwrap(); + let expected_rows = 4; + + let batches = ParquetRecordBatchReader::try_new(file, expected_rows) + .unwrap() + .collect::>>() + .unwrap(); + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + + assert_eq!(batch.num_columns(), 3); + assert_eq!(batch.num_rows(), expected_rows); + + let a: &Int64Array = batch.column(0).as_any().downcast_ref().unwrap(); + assert_eq!( + a.values(), + &[1593604800, 1593604800, 1593604801, 1593604801] + ); + + let b: &BinaryArray = batch.column(1).as_any().downcast_ref().unwrap(); + let b: Vec<_> = b.iter().flatten().collect(); + assert_eq!(b, &[b"abc", b"def", b"abc", b"def"]); + + let c: &Float64Array = batch.column(2).as_any().downcast_ref().unwrap(); + assert_eq!(c.values(), &[42.0, 7.7, 42.125, 7.7]); + } + } + + #[test] + fn test_read_lz4_hadoop_large() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{}/hadoop_lz4_compressed_larger.parquet", testdata); + let file = File::open(&path).unwrap(); + let expected_rows = 10000; + + let batches = ParquetRecordBatchReader::try_new(file, expected_rows) + .unwrap() + .collect::>>() + .unwrap(); + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + + assert_eq!(batch.num_columns(), 1); + assert_eq!(batch.num_rows(), expected_rows); + + let a: &StringArray = batch.column(0).as_any().downcast_ref().unwrap(); + let a: Vec<_> = a.iter().flatten().collect(); + assert_eq!(a[0], "c7ce6bef-d5b0-4863-b199-8ea8c7fb117b"); + assert_eq!(a[1], "e8fb9197-cb9f-4118-b67f-fbfa65f61843"); + assert_eq!(a[expected_rows - 2], "ab52a0cc-c6bb-4d61-8a8f-166dc4b8b13c"); + assert_eq!(a[expected_rows - 1], "85440778-460a-41ac-aa2e-ac3ee41696bf"); + } + #[test] #[cfg(feature = "snap")] fn test_read_nested_lists() { diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 7415d9aad0a7..3cdf04f5494c 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -24,7 +24,7 @@ use crate::column::page::{CompressedPage, Page, PageWriteSpec, PageWriter}; use crate::column::writer::encoder::{ ColumnValueEncoder, ColumnValueEncoderImpl, ColumnValues, }; -use crate::compression::{create_codec, Codec}; +use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; use crate::data_type::private::ParquetValueType; use crate::data_type::*; use crate::encodings::levels::LevelEncoder; @@ -221,7 +221,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { page_writer: Box, ) -> Self { let codec = props.compression(descr.path()); - let compressor = create_codec(codec).unwrap(); + let codec_options = CodecOptionsBuilder::default().build(); + let compressor = create_codec(codec, &codec_options).unwrap(); let encoder = E::try_new(&descr, props.as_ref()).unwrap(); let statistics_enabled = props.statistics_enabled(descr.path()); @@ -1107,7 +1108,8 @@ mod tests { }; use crate::file::writer::TrackedWrite; use crate::file::{ - properties::WriterProperties, reader::SerializedPageReader, + properties::{ReaderProperties, WriterProperties}, + reader::SerializedPageReader, 
writer::SerializedPageWriter, }; use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; @@ -1674,11 +1676,15 @@ mod tests { assert_eq!(stats.null_count(), 0); assert!(stats.distinct_count().is_none()); - let reader = SerializedPageReader::new( + let props = ReaderProperties::builder() + .set_backward_compatible_lz4(false) + .build(); + let reader = SerializedPageReader::new_with_properties( Arc::new(Bytes::from(buf)), &r.metadata, r.rows_written as usize, None, + Arc::new(props), ) .unwrap(); @@ -1714,11 +1720,15 @@ mod tests { let r = writer.close().unwrap(); assert!(r.metadata.statistics().is_none()); - let reader = SerializedPageReader::new( + let props = ReaderProperties::builder() + .set_backward_compatible_lz4(false) + .build(); + let reader = SerializedPageReader::new_with_properties( Arc::new(Bytes::from(buf)), &r.metadata, r.rows_written as usize, None, + Arc::new(props), ) .unwrap(); @@ -1842,12 +1852,16 @@ mod tests { let r = writer.close().unwrap(); // Read pages and check the sequence + let props = ReaderProperties::builder() + .set_backward_compatible_lz4(false) + .build(); let mut page_reader = Box::new( - SerializedPageReader::new( + SerializedPageReader::new_with_properties( Arc::new(file), &r.metadata, r.rows_written as usize, None, + Arc::new(props), ) .unwrap(), ); @@ -2210,12 +2224,16 @@ mod tests { assert_eq!(values_written, values.len()); let result = writer.close().unwrap(); + let props = ReaderProperties::builder() + .set_backward_compatible_lz4(false) + .build(); let page_reader = Box::new( - SerializedPageReader::new( + SerializedPageReader::new_with_properties( Arc::new(file), &result.metadata, result.rows_written as usize, None, + Arc::new(props), ) .unwrap(), ); diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index 310dbd34f1f6..bba14f94e2eb 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -26,9 +26,12 @@ # Example ```no_run -use parquet::{basic::Compression, compression::create_codec}; +use parquet::{basic::Compression, compression::{create_codec, CodecOptionsBuilder}}; -let mut codec = match create_codec(Compression::SNAPPY) { +let codec_options = CodecOptionsBuilder::default() + .set_backward_compatible_lz4(false) + .build(); +let mut codec = match create_codec(Compression::SNAPPY, &codec_options) { Ok(Some(codec)) => codec, _ => panic!(), }; @@ -71,10 +74,60 @@ pub trait Codec: Send { ) -> Result; } +/// Struct to hold `Codec` creation options. +#[derive(Debug, PartialEq, Eq)] +pub struct CodecOptions { + /// Whether or not to fallback to other LZ4 older implementations on error in LZ4_HADOOP. + backward_compatible_lz4: bool, +} + +impl Default for CodecOptions { + fn default() -> Self { + CodecOptionsBuilder::default().build() + } +} + +pub struct CodecOptionsBuilder { + /// Whether or not to fallback to other LZ4 older implementations on error in LZ4_HADOOP. + backward_compatible_lz4: bool, +} + +impl Default for CodecOptionsBuilder { + fn default() -> Self { + Self { + backward_compatible_lz4: true, + } + } +} + +impl CodecOptionsBuilder { + /// Enable/disable backward compatible LZ4. + /// + /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback + /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility + /// with files generated by older versions of this library, and LZ4_RAW, for backward + /// compatibility with files generated by older versions of parquet-cpp. 
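    /// The option takes effect when a codec is created; a sketch with the fallback
    /// disabled (uses only `create_codec` and `CodecOptionsBuilder` from this module,
    /// error handling elided):
    ///
    /// ```no_run
    /// # use parquet::basic::Compression;
    /// # use parquet::compression::{create_codec, CodecOptionsBuilder};
    /// let options = CodecOptionsBuilder::default()
    ///     .set_backward_compatible_lz4(false)
    ///     .build();
    /// let codec = create_codec(Compression::LZ4, &options)
    ///     .unwrap()
    ///     .expect("LZ4 support enabled");
    /// ```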
+ /// + /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error. + pub fn set_backward_compatible_lz4(mut self, value: bool) -> CodecOptionsBuilder { + self.backward_compatible_lz4 = value; + self + } + + pub fn build(self) -> CodecOptions { + CodecOptions { + backward_compatible_lz4: self.backward_compatible_lz4, + } + } +} + /// Given the compression type `codec`, returns a codec used to compress and decompress /// bytes for the compression type. /// This returns `None` if the codec type is `UNCOMPRESSED`. -pub fn create_codec(codec: CodecType) -> Result>> { +pub fn create_codec( + codec: CodecType, + options: &CodecOptions, +) -> Result>> { match codec { #[cfg(any(feature = "brotli", test))] CodecType::BROTLI => Ok(Some(Box::new(BrotliCodec::new()))), @@ -83,7 +136,9 @@ pub fn create_codec(codec: CodecType) -> Result>> { #[cfg(any(feature = "snap", test))] CodecType::SNAPPY => Ok(Some(Box::new(SnappyCodec::new()))), #[cfg(any(feature = "lz4", test))] - CodecType::LZ4 => Ok(Some(Box::new(LZ4Codec::new()))), + CodecType::LZ4 => Ok(Some(Box::new(LZ4HadoopCodec::new( + options.backward_compatible_lz4, + )))), #[cfg(any(feature = "zstd", test))] CodecType::ZSTD => Ok(Some(Box::new(ZSTDCodec::new()))), #[cfg(any(feature = "lz4", test))] @@ -348,6 +403,7 @@ pub use zstd_codec::*; #[cfg(any(feature = "lz4", test))] mod lz4_raw_codec { use crate::compression::Codec; + use crate::errors::ParquetError; use crate::errors::Result; /// Codec for LZ4 Raw compression algorithm. @@ -360,12 +416,6 @@ mod lz4_raw_codec { } } - // Compute max LZ4 uncompress size. - // Check https://stackoverflow.com/questions/25740471/lz4-library-decompressed-data-upper-bound-size-estimation - fn max_uncompressed_size(compressed_size: usize) -> usize { - (compressed_size << 8) - compressed_size - 2526 - } - impl Codec for LZ4RawCodec { fn decompress( &mut self, @@ -374,8 +424,14 @@ mod lz4_raw_codec { uncompress_size: Option, ) -> Result { let offset = output_buf.len(); - let required_len = - uncompress_size.unwrap_or_else(|| max_uncompressed_size(input_buf.len())); + let required_len = match uncompress_size { + Some(uncompress_size) => uncompress_size, + None => { + return Err(ParquetError::General( + "LZ4RawCodec unsupported without uncompress_size".into(), + )) + } + }; output_buf.resize(offset + required_len, 0); match lz4::block::decompress_to_buffer( input_buf, @@ -383,8 +439,10 @@ mod lz4_raw_codec { &mut output_buf[offset..], ) { Ok(n) => { - if n < required_len { - output_buf.truncate(offset + n); + if n != required_len { + return Err(ParquetError::General( + "LZ4RawCodec uncompress_size is not the expected one".into(), + )); } Ok(n) } @@ -414,6 +472,190 @@ mod lz4_raw_codec { #[cfg(any(feature = "lz4", test))] pub use lz4_raw_codec::*; +#[cfg(any(feature = "lz4", test))] +mod lz4_hadoop_codec { + use crate::compression::lz4_codec::LZ4Codec; + use crate::compression::lz4_raw_codec::LZ4RawCodec; + use crate::compression::Codec; + use crate::errors::{ParquetError, Result}; + use std::io; + + /// Size of u32 type. + const SIZE_U32: usize = std::mem::size_of::(); + + /// Length of the LZ4_HADOOP prefix. + const PREFIX_LEN: usize = SIZE_U32 * 2; + + /// Codec for LZ4 Hadoop compression algorithm. + pub struct LZ4HadoopCodec { + /// Whether or not to fallback to other LZ4 implementations on error. + /// Fallback is done to be backward compatible with older versions of this + /// library and older versions parquet-cpp. 
+ backward_compatible_lz4: bool, + } + + impl LZ4HadoopCodec { + /// Creates new LZ4 Hadoop compression codec. + pub(crate) fn new(backward_compatible_lz4: bool) -> Self { + Self { + backward_compatible_lz4, + } + } + } + + /// Try to decompress the buffer as if it was compressed with the Hadoop Lz4Codec. + /// Adapted from pola-rs [compression.rs:try_decompress_hadoop](https://pola-rs.github.io/polars/src/parquet2/compression.rs.html#225) + /// Translated from the apache arrow c++ function [TryDecompressHadoop](https://github.com/apache/arrow/blob/bf18e6e4b5bb6180706b1ba0d597a65a4ce5ca48/cpp/src/arrow/util/compression_lz4.cc#L474). + /// Returns error if decompression failed. + fn try_decompress_hadoop( + input_buf: &[u8], + output_buf: &mut [u8], + ) -> io::Result { + // Parquet files written with the Hadoop Lz4Codec use their own framing. + // The input buffer can contain an arbitrary number of "frames", each + // with the following structure: + // - bytes 0..3: big-endian uint32_t representing the frame decompressed size + // - bytes 4..7: big-endian uint32_t representing the frame compressed size + // - bytes 8...: frame compressed data + // + // The Hadoop Lz4Codec source code can be found here: + // https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/Lz4Codec.cc + let mut input_len = input_buf.len(); + let mut input = input_buf; + let mut read_bytes = 0; + let mut output_len = output_buf.len(); + let mut output: &mut [u8] = output_buf; + while input_len >= PREFIX_LEN { + let mut bytes = [0; SIZE_U32]; + bytes.copy_from_slice(&input[0..4]); + let expected_decompressed_size = u32::from_be_bytes(bytes); + let mut bytes = [0; SIZE_U32]; + bytes.copy_from_slice(&input[4..8]); + let expected_compressed_size = u32::from_be_bytes(bytes); + input = &input[PREFIX_LEN..]; + input_len -= PREFIX_LEN; + + if input_len < expected_compressed_size as usize { + return Err(io::Error::new( + io::ErrorKind::Other, + "Not enough bytes for Hadoop frame", + )); + } + + if output_len < expected_decompressed_size as usize { + return Err(io::Error::new( + io::ErrorKind::Other, + "Not enough bytes to hold advertised output", + )); + } + let decompressed_size = lz4::block::decompress_to_buffer( + &input[..expected_compressed_size as usize], + Some(output_len as i32), + output, + )?; + if decompressed_size != expected_decompressed_size as usize { + return Err(io::Error::new( + io::ErrorKind::Other, + "Unexpected decompressed size", + )); + } + input_len -= expected_compressed_size as usize; + output_len -= expected_decompressed_size as usize; + read_bytes += expected_decompressed_size as usize; + if input_len > expected_compressed_size as usize { + input = &input[expected_compressed_size as usize..]; + output = &mut output[expected_decompressed_size as usize..]; + } else { + break; + } + } + if input_len == 0 { + Ok(read_bytes) + } else { + Err(io::Error::new( + io::ErrorKind::Other, + "Not all input are consumed", + )) + } + } + + impl Codec for LZ4HadoopCodec { + fn decompress( + &mut self, + input_buf: &[u8], + output_buf: &mut Vec, + uncompress_size: Option, + ) -> Result { + let output_len = output_buf.len(); + let required_len = match uncompress_size { + Some(n) => n, + None => { + return Err(ParquetError::General( + "LZ4HadoopCodec unsupported without uncompress_size".into(), + )) + } + }; + output_buf.resize(output_len + required_len, 0); + match try_decompress_hadoop(input_buf, &mut 
output_buf[output_len..]) { + Ok(n) => { + if n != required_len { + return Err(ParquetError::General( + "LZ4HadoopCodec uncompress_size is not the expected one" + .into(), + )); + } + Ok(n) + } + Err(e) if !self.backward_compatible_lz4 => Err(e.into()), + // Fallback done to be backward compatible with older versions of this + // libray and older versions of parquet-cpp. + Err(_) => { + // Truncate any inserted element before tryingg next algorithm. + output_buf.truncate(output_len); + match LZ4Codec::new().decompress( + input_buf, + output_buf, + uncompress_size, + ) { + Ok(n) => Ok(n), + Err(_) => { + // Truncate any inserted element before tryingg next algorithm. + output_buf.truncate(output_len); + LZ4RawCodec::new().decompress( + input_buf, + output_buf, + uncompress_size, + ) + } + } + } + } + } + + fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { + // Allocate memory to store the LZ4_HADOOP prefix. + let offset = output_buf.len(); + output_buf.resize(offset + PREFIX_LEN, 0); + + // Append LZ4_RAW compressed bytes after prefix. + LZ4RawCodec::new().compress(input_buf, output_buf)?; + + // Prepend decompressed size and compressed size in big endian to be compatible + // with LZ4_HADOOP. + let output_buf = &mut output_buf[offset..]; + let compressed_size = output_buf.len() - PREFIX_LEN; + let compressed_size = compressed_size as u32; + let uncompressed_size = input_buf.len() as u32; + output_buf[..SIZE_U32].copy_from_slice(&uncompressed_size.to_be_bytes()); + output_buf[SIZE_U32..PREFIX_LEN].copy_from_slice(&compressed_size.to_be_bytes()); + + Ok(()) + } + } +} +#[cfg(any(feature = "lz4", test))] +pub use lz4_hadoop_codec::*; + #[cfg(test)] mod tests { use super::*; @@ -421,8 +663,11 @@ mod tests { use crate::util::test_common::rand_gen::random_bytes; fn test_roundtrip(c: CodecType, data: &[u8], uncompress_size: Option) { - let mut c1 = create_codec(c).unwrap().unwrap(); - let mut c2 = create_codec(c).unwrap().unwrap(); + let codec_options = CodecOptionsBuilder::default() + .set_backward_compatible_lz4(false) + .build(); + let mut c1 = create_codec(c, &codec_options).unwrap().unwrap(); + let mut c2 = create_codec(c, &codec_options).unwrap().unwrap(); // Compress with c1 let mut compressed = Vec::new(); @@ -473,42 +718,53 @@ mod tests { assert_eq!(&decompressed[..4], prefix); } - fn test_codec(c: CodecType) { + fn test_codec_with_size(c: CodecType) { let sizes = vec![100, 10000, 100000]; for size in sizes { let data = random_bytes(size); - test_roundtrip(c, &data, None); test_roundtrip(c, &data, Some(data.len())); } } + fn test_codec_without_size(c: CodecType) { + let sizes = vec![100, 10000, 100000]; + for size in sizes { + let data = random_bytes(size); + test_roundtrip(c, &data, None); + } + } + #[test] fn test_codec_snappy() { - test_codec(CodecType::SNAPPY); + test_codec_with_size(CodecType::SNAPPY); + test_codec_without_size(CodecType::SNAPPY); } #[test] fn test_codec_gzip() { - test_codec(CodecType::GZIP); + test_codec_with_size(CodecType::GZIP); + test_codec_without_size(CodecType::GZIP); } #[test] fn test_codec_brotli() { - test_codec(CodecType::BROTLI); + test_codec_with_size(CodecType::BROTLI); + test_codec_without_size(CodecType::BROTLI); } #[test] fn test_codec_lz4() { - test_codec(CodecType::LZ4); + test_codec_with_size(CodecType::LZ4); } #[test] fn test_codec_zstd() { - test_codec(CodecType::ZSTD); + test_codec_with_size(CodecType::ZSTD); + test_codec_without_size(CodecType::ZSTD); } #[test] fn test_codec_lz4_raw() { - 
test_codec(CodecType::LZ4_RAW); + test_codec_with_size(CodecType::LZ4_RAW); } } diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 11fb13b4bd68..dc9feb4ce185 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -47,10 +47,27 @@ //! Some(Encoding::PLAIN) //! ); //! ``` +//! +//! Reader properties. +//! +//! # Usage +//! +//! ```rust +//! use parquet::file::properties::ReaderProperties; +//! +//! // Create properties with default configuration. +//! let props = ReaderProperties::builder().build(); +//! +//! // Use properties builder to set certain options and assemble the configuration. +//! let props = ReaderProperties::builder() +//! .set_backward_compatible_lz4(false) +//! .build(); +//! ``` use std::{collections::HashMap, sync::Arc}; use crate::basic::{Compression, Encoding}; +use crate::compression::{CodecOptions, CodecOptionsBuilder}; use crate::file::metadata::KeyValue; use crate::schema::types::ColumnPath; @@ -560,6 +577,66 @@ impl ColumnProperties { } } +/// Reference counted reader properties. +pub type ReaderPropertiesPtr = Arc; + +/// Reader properties. +/// +/// All properties are immutable and `Send` + `Sync`. +/// Use [`ReaderPropertiesBuilder`] to assemble these properties. +pub struct ReaderProperties { + codec_options: CodecOptions, +} + +impl ReaderProperties { + /// Returns builder for reader properties with default values. + pub fn builder() -> ReaderPropertiesBuilder { + ReaderPropertiesBuilder::with_defaults() + } + + /// Returns codec options. + pub(crate) fn codec_options(&self) -> &CodecOptions { + &self.codec_options + } +} + +/// Reader properties builder. +pub struct ReaderPropertiesBuilder { + codec_options_builder: CodecOptionsBuilder, +} + +/// Reader properties builder. +impl ReaderPropertiesBuilder { + /// Returns default state of the builder. + fn with_defaults() -> Self { + Self { + codec_options_builder: CodecOptionsBuilder::default(), + } + } + + /// Finalizes the configuration and returns immutable reader properties struct. + pub fn build(self) -> ReaderProperties { + ReaderProperties { + codec_options: self.codec_options_builder.build(), + } + } + + /// Enable/disable backward compatible LZ4. + /// + /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback + /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility + /// with files generated by older versions of this library, and LZ4_RAW, for backward + /// compatibility with files generated by older versions of parquet-cpp. + /// + /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error. 
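    /// These reader properties can be threaded through the file-reading path via
    /// `ReadOptionsBuilder::with_reader_properties` (added in this change). A sketch,
    /// with the file path and error handling assumed for illustration:
    ///
    /// ```no_run
    /// # use std::fs::File;
    /// # use parquet::file::properties::ReaderProperties;
    /// # use parquet::file::serialized_reader::{ReadOptionsBuilder, SerializedFileReader};
    /// let props = ReaderProperties::builder()
    ///     .set_backward_compatible_lz4(false)
    ///     .build();
    /// let options = ReadOptionsBuilder::new()
    ///     .with_reader_properties(props)
    ///     .build();
    /// let file = File::open("data.parquet").unwrap();
    /// let reader = SerializedFileReader::new_with_options(file, options).unwrap();
    /// ```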
+ pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self { + self.codec_options_builder = self + .codec_options_builder + .set_backward_compatible_lz4(value); + self + } +} + #[cfg(test)] mod tests { use super::*; @@ -747,4 +824,28 @@ mod tests { DEFAULT_DICTIONARY_ENABLED ); } + + #[test] + fn test_reader_properties_default_settings() { + let props = ReaderProperties::builder().build(); + + let codec_options = CodecOptionsBuilder::default() + .set_backward_compatible_lz4(true) + .build(); + + assert_eq!(props.codec_options(), &codec_options); + } + + #[test] + fn test_reader_properties_builder() { + let props = ReaderProperties::builder() + .set_backward_compatible_lz4(false) + .build(); + + let codec_options = CodecOptionsBuilder::default() + .set_backward_compatible_lz4(false) + .build(); + + assert_eq!(props.codec_options(), &codec_options); + } } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 854ae1ef6d34..2b3c7d139148 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -31,7 +31,13 @@ use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; use crate::errors::{ParquetError, Result}; use crate::file::page_index::index_reader; -use crate::file::{footer, metadata::*, reader::*, statistics}; +use crate::file::{ + footer, + metadata::*, + properties::{ReaderProperties, ReaderPropertiesPtr}, + reader::*, + statistics, +}; use crate::record::reader::RowIter; use crate::record::Row; use crate::schema::types::Type as SchemaType; @@ -139,6 +145,7 @@ impl IntoIterator for SerializedFileReader { pub struct SerializedFileReader { chunk_reader: Arc, metadata: Arc, + props: ReaderPropertiesPtr, } /// A predicate for filtering row groups, invoked with the metadata and index @@ -153,6 +160,7 @@ pub type ReadGroupPredicate = Box bool>; pub struct ReadOptionsBuilder { predicates: Vec, enable_page_index: bool, + props: Option, } impl ReadOptionsBuilder { @@ -186,11 +194,21 @@ impl ReadOptionsBuilder { self } + /// Set the `ReaderProperties` configuration. + pub fn with_reader_properties(mut self, properties: ReaderProperties) -> Self { + self.props = Some(properties); + self + } + /// Seal the builder and return the read options pub fn build(self) -> ReadOptions { + let props = self + .props + .unwrap_or_else(|| ReaderProperties::builder().build()); ReadOptions { predicates: self.predicates, enable_page_index: self.enable_page_index, + props, } } } @@ -202,6 +220,7 @@ impl ReadOptionsBuilder { pub struct ReadOptions { predicates: Vec, enable_page_index: bool, + props: ReaderProperties, } impl SerializedFileReader { @@ -209,9 +228,11 @@ impl SerializedFileReader { /// Returns error if Parquet file does not exist or is corrupt. 
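    /// Note: this constructor uses default `ReaderProperties`; to customize them
    /// (for example to disable the backward compatible LZ4 fallback) build a
    /// `ReadOptions` via `ReadOptionsBuilder::with_reader_properties` and use
    /// `new_with_options` instead.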
pub fn new(chunk_reader: R) -> Result { let metadata = footer::parse_metadata(&chunk_reader)?; + let props = Arc::new(ReaderProperties::builder().build()); Ok(Self { chunk_reader: Arc::new(chunk_reader), metadata: Arc::new(metadata), + props, }) } @@ -257,6 +278,7 @@ impl SerializedFileReader { Some(columns_indexes), Some(offset_indexes), )), + props: Arc::new(options.props), }) } else { Ok(Self { @@ -265,6 +287,7 @@ impl SerializedFileReader { metadata.file_metadata().clone(), filtered_row_groups, )), + props: Arc::new(options.props), }) } } @@ -298,10 +321,12 @@ impl FileReader for SerializedFileReader { fn get_row_group(&self, i: usize) -> Result> { let row_group_metadata = self.metadata.row_group(i); // Row groups should be processed sequentially. + let props = Arc::clone(&self.props); let f = Arc::clone(&self.chunk_reader); - Ok(Box::new(SerializedRowGroupReader::new( + Ok(Box::new(SerializedRowGroupReader::new_with_properties( f, row_group_metadata, + props, ))) } @@ -314,14 +339,20 @@ impl FileReader for SerializedFileReader { pub struct SerializedRowGroupReader<'a, R: ChunkReader> { chunk_reader: Arc, metadata: &'a RowGroupMetaData, + props: ReaderPropertiesPtr, } impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { - /// Creates new row group reader from a file and row group metadata. - fn new(chunk_reader: Arc, metadata: &'a RowGroupMetaData) -> Self { + /// Creates new row group reader from a file, row group metadata and custom config. + fn new_with_properties( + chunk_reader: Arc, + metadata: &'a RowGroupMetaData, + props: ReaderPropertiesPtr, + ) -> Self { Self { chunk_reader, metadata, + props, } } } @@ -345,11 +376,13 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<' .as_ref() .map(|x| x[i].clone()); - Ok(Box::new(SerializedPageReader::new( + let props = Arc::clone(&self.props); + Ok(Box::new(SerializedPageReader::new_with_properties( Arc::clone(&self.chunk_reader), col, self.metadata.num_rows() as usize, page_locations, + props, )?)) } @@ -531,7 +564,25 @@ impl SerializedPageReader { total_rows: usize, page_locations: Option>, ) -> Result { - let decompressor = create_codec(meta.compression())?; + let props = Arc::new(ReaderProperties::builder().build()); + SerializedPageReader::new_with_properties( + reader, + meta, + total_rows, + page_locations, + props, + ) + } + + /// Creates a new serialized page with custom options. 
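    /// A usage sketch mirroring the tests in this change; `file`, `meta` and
    /// `total_rows` are placeholders rather than values defined here:
    ///
    /// ```ignore
    /// let props = ReaderProperties::builder()
    ///     .set_backward_compatible_lz4(false)
    ///     .build();
    /// let page_reader = SerializedPageReader::new_with_properties(
    ///     Arc::new(file),
    ///     &meta,
    ///     total_rows,
    ///     None,
    ///     Arc::new(props),
    /// )?;
    /// ```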
+ pub fn new_with_properties( + reader: Arc, + meta: &ColumnChunkMetaData, + total_rows: usize, + page_locations: Option>, + props: ReaderPropertiesPtr, + ) -> Result { + let decompressor = create_codec(meta.compression(), props.codec_options())?; let (start, len) = meta.byte_range(); let state = match page_locations { diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index dbbc38461677..528f72494190 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -646,10 +646,10 @@ mod tests { use crate::basic::{Compression, Encoding, LogicalType, Repetition, Type}; use crate::column::page::PageReader; - use crate::compression::{create_codec, Codec}; + use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; use crate::data_type::Int32Type; use crate::file::{ - properties::{WriterProperties, WriterVersion}, + properties::{ReaderProperties, WriterProperties, WriterVersion}, reader::{FileReader, SerializedFileReader, SerializedPageReader}, statistics::{from_thrift, to_thrift, Statistics}, }; @@ -947,7 +947,10 @@ mod tests { fn test_page_roundtrip(pages: &[Page], codec: Compression, physical_type: Type) { let mut compressed_pages = vec![]; let mut total_num_values = 0i64; - let mut compressor = create_codec(codec).unwrap(); + let codec_options = CodecOptionsBuilder::default() + .set_backward_compatible_lz4(false) + .build(); + let mut compressor = create_codec(codec, &codec_options).unwrap(); for page in pages { let uncompressed_len = page.buffer().len(); @@ -1056,11 +1059,15 @@ mod tests { .build() .unwrap(); - let mut page_reader = SerializedPageReader::new( + let props = ReaderProperties::builder() + .set_backward_compatible_lz4(false) + .build(); + let mut page_reader = SerializedPageReader::new_with_properties( Arc::new(reader), &meta, total_num_values as usize, None, + Arc::new(props), ) .unwrap(); diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index e43456eb6f40..5744de35e337 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -24,7 +24,7 @@ use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderB use parquet::arrow::ArrowWriter; use parquet::basic::{Encoding, PageType}; use parquet::file::metadata::ParquetMetaData; -use parquet::file::properties::WriterProperties; +use parquet::file::properties::{ReaderProperties, WriterProperties}; use parquet::file::reader::SerializedPageReader; use std::sync::Arc; @@ -129,11 +129,15 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { .enumerate(); for (idx, (column, column_layout)) in iter { - let page_reader = SerializedPageReader::new( + let properties = ReaderProperties::builder() + .set_backward_compatible_lz4(false) + .build(); + let page_reader = SerializedPageReader::new_with_properties( Arc::new(file_reader.clone()), column, row_group.num_rows() as usize, None, + Arc::new(properties), ) .unwrap(); From 108e7d276a83bfd9c3144005e0a000e8331fdfaa Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 5 Nov 2022 22:46:52 -0700 Subject: [PATCH 0226/1411] Check overflow while casting floating point value to decimal128 (#3021) * Check overflow while casting floating point value to decimal128 * Don't validate with precision * Return error when saturating * Use to_i128 * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Fix format Co-authored-by: Raphael Taylor-Davies 
<1781103+tustvold@users.noreply.github.com> --- arrow-cast/src/cast.rs | 64 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index a3abe545d529..3e23a059bf3e 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -344,16 +344,43 @@ fn cast_floating_point_to_decimal128( array: &PrimitiveArray, precision: u8, scale: u8, + cast_options: &CastOptions, ) -> Result where ::Native: AsPrimitive, { let mul = 10_f64.powi(scale as i32); - array - .unary::<_, Decimal128Type>(|v| (v.as_() * mul).round() as i128) - .with_precision_and_scale(precision, scale) - .map(|a| Arc::new(a) as ArrayRef) + if cast_options.safe { + let iter = array + .iter() + .map(|v| v.and_then(|v| (mul * v.as_()).round().to_i128())); + let casted_array = + unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }; + casted_array + .with_precision_and_scale(precision, scale) + .map(|a| Arc::new(a) as ArrayRef) + } else { + array + .try_unary::<_, Decimal128Type, _>(|v| { + mul.mul_checked(v.as_()).and_then(|value| { + let mul_v = value.round(); + let integer: i128 = mul_v.to_i128().ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {}({}, {}). Overflowing on {:?}", + Decimal128Type::PREFIX, + precision, + scale, + v + )) + })?; + + Ok(integer) + }) + }) + .and_then(|a| a.with_precision_and_scale(precision, scale)) + .map(|a| Arc::new(a) as ArrayRef) + } } fn cast_floating_point_to_decimal256( @@ -588,11 +615,13 @@ pub fn cast_with_options( as_primitive_array::(array), *precision, *scale, + cast_options, ), Float64 => cast_floating_point_to_decimal128( as_primitive_array::(array), *precision, *scale, + cast_options, ), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( @@ -6110,4 +6139,31 @@ mod tests { ); assert!(casted_array.is_err()); } + + #[test] + fn test_cast_floating_point_to_decimal128_overflow() { + let array = Float64Array::from(vec![f64::MAX]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal128(38, 30), + &CastOptions { safe: true }, + ); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + + let casted_array = cast_with_options( + &array, + &DataType::Decimal128(38, 30), + &CastOptions { safe: false }, + ); + let err = casted_array.unwrap_err().to_string(); + let expected_error = "Cast error: Cannot cast to Decimal128(38, 30)"; + assert!( + err.contains(expected_error), + "did not find expected error '{}' in actual error '{}'", + expected_error, + err + ); + } } From deb64554f9d25afa044248293b31ab7c26f0e42f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 6 Nov 2022 21:31:22 +1300 Subject: [PATCH 0227/1411] Split out arrow-ipc (#3022) * Split out arrow-ipc * RAT * Fix doc * Tweak required-features * Clippy * Fix feature flags --- .github/workflows/arrow.yml | 5 + .github/workflows/arrow_flight.yml | 1 + .github/workflows/dev_pr/labeler.yml | 1 + .github/workflows/integration.yml | 1 + .github/workflows/miri.yaml | 1 + .github/workflows/parquet.yml | 1 + Cargo.toml | 1 + arrow-array/src/lib.rs | 2 +- arrow-array/src/record_batch.rs | 18 + arrow-ipc/CONTRIBUTING.md | 37 +++ arrow-ipc/Cargo.toml | 51 +++ {arrow => arrow-ipc}/regen.sh | 0 .../codec.rs => arrow-ipc/src/compression.rs | 133 +++++--- {arrow/src/ipc => arrow-ipc/src}/convert.rs | 294 +++++++++-------- {arrow/src/ipc => 
arrow-ipc/src}/gen/File.rs | 2 +- .../src/ipc => arrow-ipc/src}/gen/Message.rs | 8 +- .../src/ipc => arrow-ipc/src}/gen/Schema.rs | 0 .../ipc => arrow-ipc/src}/gen/SparseTensor.rs | 4 +- .../src/ipc => arrow-ipc/src}/gen/Tensor.rs | 2 +- {arrow/src/ipc => arrow-ipc/src}/gen/mod.rs | 0 arrow/src/ipc/mod.rs => arrow-ipc/src/lib.rs | 2 + {arrow/src/ipc => arrow-ipc/src}/reader.rs | 160 ++++----- {arrow/src/ipc => arrow-ipc/src}/writer.rs | 310 ++++++++---------- arrow/CONTRIBUTING.md | 17 - arrow/Cargo.toml | 12 +- arrow/src/ipc/compression/mod.rs | 26 -- arrow/src/ipc/compression/stub.rs | 63 ---- arrow/src/lib.rs | 25 +- arrow/tests/ipc_integration.rs | 61 ++++ dev/release/README.md | 2 + 30 files changed, 648 insertions(+), 592 deletions(-) create mode 100644 arrow-ipc/CONTRIBUTING.md create mode 100644 arrow-ipc/Cargo.toml rename {arrow => arrow-ipc}/regen.sh (100%) rename arrow/src/ipc/compression/codec.rs => arrow-ipc/src/compression.rs (67%) rename {arrow/src/ipc => arrow-ipc/src}/convert.rs (81%) rename {arrow/src/ipc => arrow-ipc/src}/gen/File.rs (99%) rename {arrow/src/ipc => arrow-ipc/src}/gen/Message.rs (99%) rename {arrow/src/ipc => arrow-ipc/src}/gen/Schema.rs (100%) rename {arrow/src/ipc => arrow-ipc/src}/gen/SparseTensor.rs (99%) rename {arrow/src/ipc => arrow-ipc/src}/gen/Tensor.rs (99%) rename {arrow/src/ipc => arrow-ipc/src}/gen/mod.rs (100%) rename arrow/src/ipc/mod.rs => arrow-ipc/src/lib.rs (97%) rename {arrow/src/ipc => arrow-ipc/src}/reader.rs (93%) rename {arrow/src/ipc => arrow-ipc/src}/writer.rs (87%) delete mode 100644 arrow/src/ipc/compression/mod.rs delete mode 100644 arrow/src/ipc/compression/stub.rs create mode 100644 arrow/tests/ipc_integration.rs diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 9ae72dd009a3..d930086ef56a 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -33,6 +33,7 @@ on: - arrow-schema/** - arrow-select/** - arrow-integration-test/** + - arrow-ipc/** - .github/** jobs: @@ -61,6 +62,8 @@ jobs: run: cargo test -p arrow-select --all-features - name: Test arrow-cast with all features run: cargo test -p arrow-cast --all-features + - name: Test arrow-ipc with all features + run: cargo test -p arrow-ipc --all-features - name: Test arrow-integration-test with all features run: cargo test -p arrow-integration-test --all-features - name: Test arrow with default features @@ -169,5 +172,7 @@ jobs: run: cargo clippy -p arrow-select --all-targets --all-features -- -D warnings - name: Clippy arrow-cast with all features run: cargo clippy -p arrow-cast --all-targets --all-features -- -D warnings + - name: Clippy arrow-ipc with all features + run: cargo clippy -p arrow-ipc --all-targets --all-features -- -D warnings - name: Clippy arrow run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index 9621c9e69ddc..ded4f5a67915 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -35,6 +35,7 @@ on: - arrow-schema/** - arrow-select/** - arrow-flight/** + - arrow-ipc/** - .github/** jobs: diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 3a0073004996..17ebf54de732 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -23,6 +23,7 @@ arrow: - arrow-data/**/* - arrow-schema/**/* - arrow-select/**/* + - 
arrow-ipc/**/* arrow-flight: - arrow-flight/**/* diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index c2c0a79e63ba..8566230ea0b9 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -31,6 +31,7 @@ on: - arrow-data/** - arrow-schema/** - arrow-select/** + - arrow-ipc/** - arrow-pyarrow-integration-testing/** - arrow-integration-test/** - arrow-integration-testing/** diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index 241b4f0b4a8d..2e85c9dd95a5 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -31,6 +31,7 @@ on: - arrow-data/** - arrow-schema/** - arrow-select/** + - arrow-ipc/** - .github/** jobs: diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 5a7beadfd71c..b369ef69bfd9 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -34,6 +34,7 @@ on: - arrow-data/** - arrow-schema/** - arrow-select/** + - arrow-ipc/** - parquet/** - .github/** diff --git a/Cargo.toml b/Cargo.toml index d8fa5b9236e0..0ab4853c6e10 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ members = [ "arrow-flight", "arrow-integration-test", "arrow-integration-testing", + "arrow-ipc", "arrow-schema", "arrow-select", "parquet", diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 5c86978dc94d..15267d3080e6 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -162,7 +162,7 @@ pub mod array; pub use array::*; mod record_batch; -pub use record_batch::{RecordBatch, RecordBatchOptions}; +pub use record_batch::{RecordBatch, RecordBatchOptions, RecordBatchReader}; mod arithmetic; pub use arithmetic::ArrowNativeTypeOp; diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index e613a38bb15b..6f2385fa9b4a 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -22,6 +22,24 @@ use crate::{new_empty_array, Array, ArrayRef, StructArray}; use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef}; use std::sync::Arc; +/// Trait for types that can read `RecordBatch`'s. +pub trait RecordBatchReader: Iterator> { + /// Returns the schema of this `RecordBatchReader`. + /// + /// Implementation of this trait should guarantee that all `RecordBatch`'s returned by this + /// reader should have the same schema as returned from this method. + fn schema(&self) -> SchemaRef; + + /// Reads the next `RecordBatch`. + #[deprecated( + since = "2.0.0", + note = "This method is deprecated in favour of `next` from the trait Iterator." + )] + fn next_batch(&mut self) -> Result, ArrowError> { + self.next().transpose() + } +} + /// A two-dimensional batch of column-oriented data with a defined /// [schema](arrow_schema::Schema). /// diff --git a/arrow-ipc/CONTRIBUTING.md b/arrow-ipc/CONTRIBUTING.md new file mode 100644 index 000000000000..5e14760f19df --- /dev/null +++ b/arrow-ipc/CONTRIBUTING.md @@ -0,0 +1,37 @@ + + +## Developer's guide + +# IPC + +The expected flatc version is 1.12.0+, built from [flatbuffers](https://github.com/google/flatbuffers) +master at fixed commit ID, by regen.sh. 
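
The `RecordBatchReader` trait added to `arrow-array` above only asks for a schema accessor on top of the usual `Iterator` contract. A minimal sketch of a hypothetical in-memory implementor (the `VecReader` name and its fields are illustrative, not part of the crate):

```rust
use arrow_array::{RecordBatch, RecordBatchReader};
use arrow_schema::{ArrowError, SchemaRef};

/// Hypothetical reader that replays batches already held in memory.
struct VecReader {
    schema: SchemaRef,
    batches: std::vec::IntoIter<RecordBatch>,
}

impl Iterator for VecReader {
    type Item = Result<RecordBatch, ArrowError>;

    fn next(&mut self) -> Option<Self::Item> {
        self.batches.next().map(Ok)
    }
}

impl RecordBatchReader for VecReader {
    fn schema(&self) -> SchemaRef {
        // Every batch yielded by `next` is expected to match this schema.
        self.schema.clone()
    }
}
```
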
+ +The IPC flatbuffer code was generated by running this command from the root of the project: + +```bash +./regen.sh +``` + +The above script will run the `flatc` compiler and perform some adjustments to the source code: + +- Replace `type__` with `type_` +- Remove `org::apache::arrow::flatbuffers` namespace +- Add includes to each generated file diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml new file mode 100644 index 000000000000..52ad5fe2e659 --- /dev/null +++ b/arrow-ipc/Cargo.toml @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "arrow-ipc" +version = "26.0.0" +description = "Support for the Arrow IPC format" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_ipc" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-array = { version = "26.0.0", path = "../arrow-array" } +arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "26.0.0", path = "../arrow-cast" } +arrow-data = { version = "26.0.0", path = "../arrow-data" } +arrow-schema = { version = "26.0.0", path = "../arrow-schema" } +flatbuffers = { version = "22.9.2", default-features = false, features = ["thiserror"] } +lz4 = { version = "1.23", default-features = false, optional = true } +zstd = { version = "0.11.1", default-features = false, optional = true } + +[dev-dependencies] +tempfile = "3.3" diff --git a/arrow/regen.sh b/arrow-ipc/regen.sh similarity index 100% rename from arrow/regen.sh rename to arrow-ipc/regen.sh diff --git a/arrow/src/ipc/compression/codec.rs b/arrow-ipc/src/compression.rs similarity index 67% rename from arrow/src/ipc/compression/codec.rs rename to arrow-ipc/src/compression.rs index 58ba8cb86585..6349ac232431 100644 --- a/arrow/src/ipc/compression/codec.rs +++ b/arrow-ipc/src/compression.rs @@ -15,10 +15,9 @@ // specific language governing permissions and limitations // under the License. 
-use crate::buffer::Buffer; -use crate::error::{ArrowError, Result}; -use crate::ipc::CompressionType; -use std::io::{Read, Write}; +use crate::CompressionType; +use arrow_buffer::Buffer; +use arrow_schema::ArrowError; const LENGTH_NO_COMPRESSED_DATA: i64 = -1; const LENGTH_OF_PREFIX_DATA: i64 = 8; @@ -33,7 +32,7 @@ pub enum CompressionCodec { impl TryFrom for CompressionCodec { type Error = ArrowError; - fn try_from(compression_type: CompressionType) -> Result { + fn try_from(compression_type: CompressionType) -> Result { match compression_type { CompressionType::ZSTD => Ok(CompressionCodec::Zstd), CompressionType::LZ4_FRAME => Ok(CompressionCodec::Lz4Frame), @@ -60,7 +59,7 @@ impl CompressionCodec { &self, input: &[u8], output: &mut Vec, - ) -> Result { + ) -> Result { let uncompressed_data_len = input.len(); let original_output_len = output.len(); @@ -92,7 +91,10 @@ impl CompressionCodec { /// [8 bytes]: uncompressed length /// [remaining bytes]: compressed data stream /// ``` - pub(crate) fn decompress_to_buffer(&self, input: &Buffer) -> Result { + pub(crate) fn decompress_to_buffer( + &self, + input: &Buffer, + ) -> Result { // read the first 8 bytes to determine if the data is // compressed let decompressed_length = read_uncompressed_size(input); @@ -115,50 +117,89 @@ impl CompressionCodec { /// Compress the data in input buffer and write to output buffer /// using the specified compression - fn compress(&self, input: &[u8], output: &mut Vec) -> Result<()> { + fn compress(&self, input: &[u8], output: &mut Vec) -> Result<(), ArrowError> { match self { - CompressionCodec::Lz4Frame => { - let mut encoder = lz4::EncoderBuilder::new().build(output)?; - encoder.write_all(input)?; - match encoder.finish().1 { - Ok(_) => Ok(()), - Err(e) => Err(e.into()), - } - } - CompressionCodec::Zstd => { - let mut encoder = zstd::Encoder::new(output, 0)?; - encoder.write_all(input)?; - match encoder.finish() { - Ok(_) => Ok(()), - Err(e) => Err(e.into()), - } - } + CompressionCodec::Lz4Frame => compress_lz4(input, output), + CompressionCodec::Zstd => compress_zstd(input, output), } } /// Decompress the data in input buffer and write to output buffer /// using the specified compression - fn decompress(&self, input: &[u8], output: &mut Vec) -> Result { - let result: Result = match self { - CompressionCodec::Lz4Frame => { - let mut decoder = lz4::Decoder::new(input)?; - match decoder.read_to_end(output) { - Ok(size) => Ok(size), - Err(e) => Err(e.into()), - } - } - CompressionCodec::Zstd => { - let mut decoder = zstd::Decoder::new(input)?; - match decoder.read_to_end(output) { - Ok(size) => Ok(size), - Err(e) => Err(e.into()), - } - } - }; - result + fn decompress( + &self, + input: &[u8], + output: &mut Vec, + ) -> Result { + match self { + CompressionCodec::Lz4Frame => decompress_lz4(input, output), + CompressionCodec::Zstd => decompress_zstd(input, output), + } } } +#[cfg(feature = "lz4")] +fn compress_lz4(input: &[u8], output: &mut Vec) -> Result<(), ArrowError> { + use std::io::Write; + let mut encoder = lz4::EncoderBuilder::new().build(output)?; + encoder.write_all(input)?; + encoder.finish().1?; + Ok(()) +} + +#[cfg(not(feature = "lz4"))] +#[allow(clippy::ptr_arg)] +fn compress_lz4(_input: &[u8], _output: &mut Vec) -> Result<(), ArrowError> { + Err(ArrowError::InvalidArgumentError( + "lz4 IPC compression requires the lz4 feature".to_string(), + )) +} + +#[cfg(feature = "lz4")] +fn decompress_lz4(input: &[u8], output: &mut Vec) -> Result { + use std::io::Read; + 
Ok(lz4::Decoder::new(input)?.read_to_end(output)?) +} + +#[cfg(not(feature = "lz4"))] +#[allow(clippy::ptr_arg)] +fn decompress_lz4(_input: &[u8], _output: &mut Vec) -> Result { + Err(ArrowError::InvalidArgumentError( + "lz4 IPC decompression requires the lz4 feature".to_string(), + )) +} + +#[cfg(feature = "zstd")] +fn compress_zstd(input: &[u8], output: &mut Vec) -> Result<(), ArrowError> { + use std::io::Write; + let mut encoder = zstd::Encoder::new(output, 0)?; + encoder.write_all(input)?; + encoder.finish()?; + Ok(()) +} + +#[cfg(not(feature = "zstd"))] +#[allow(clippy::ptr_arg)] +fn compress_zstd(_input: &[u8], _output: &mut Vec) -> Result<(), ArrowError> { + Err(ArrowError::InvalidArgumentError( + "zstd IPC compression requires the zstd feature".to_string(), + )) +} + +#[cfg(feature = "zstd")] +fn decompress_zstd(input: &[u8], output: &mut Vec) -> Result { + use std::io::Read; + Ok(zstd::Decoder::new(input)?.read_to_end(output)?) +} + +#[cfg(not(feature = "zstd"))] +#[allow(clippy::ptr_arg)] +fn decompress_zstd(_input: &[u8], _output: &mut Vec) -> Result { + Err(ArrowError::InvalidArgumentError( + "zstd IPC decompression requires the zstd feature".to_string(), + )) +} + /// Get the uncompressed length /// Notes: /// LENGTH_NO_COMPRESSED_DATA: indicate that the data that follows is not compressed @@ -173,12 +214,11 @@ fn read_uncompressed_size(buffer: &[u8]) -> i64 { #[cfg(test)] mod tests { - use super::*; - #[test] + #[cfg(feature = "lz4")] fn test_lz4_compression() { let input_bytes = "hello lz4".as_bytes(); - let codec: CompressionCodec = CompressionCodec::Lz4Frame; + let codec = super::CompressionCodec::Lz4Frame; let mut output_bytes: Vec = Vec::new(); codec.compress(input_bytes, &mut output_bytes).unwrap(); let mut result_output_bytes: Vec = Vec::new(); @@ -189,9 +229,10 @@ mod tests { } #[test] + #[cfg(feature = "zstd")] fn test_zstd_compression() { let input_bytes = "hello zstd".as_bytes(); - let codec: CompressionCodec = CompressionCodec::Zstd; + let codec = super::CompressionCodec::Zstd; let mut output_bytes: Vec = Vec::new(); codec.compress(input_bytes, &mut output_bytes).unwrap(); let mut result_output_bytes: Vec = Vec::new(); diff --git a/arrow/src/ipc/convert.rs b/arrow-ipc/src/convert.rs similarity index 81% rename from arrow/src/ipc/convert.rs rename to arrow-ipc/src/convert.rs index 0f5d246bcce5..8d01c58b6ae3 100644 --- a/arrow/src/ipc/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -17,16 +17,13 @@ //! 
Utilities for converting between IPC types and native Arrow types -use crate::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit, UnionMode}; -use crate::error::{ArrowError, Result}; -use crate::ipc; - +use arrow_schema::*; use flatbuffers::{ FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset, }; use std::collections::{BTreeMap, HashMap}; -use crate::ipc::{size_prefixed_root_as_message, CONTINUATION_MARKER}; +use crate::{size_prefixed_root_as_message, CONTINUATION_MARKER}; use DataType::*; /// Serialize a schema in IPC format @@ -43,7 +40,7 @@ pub fn schema_to_fb(schema: &Schema) -> FlatBufferBuilder { pub fn schema_to_fb_offset<'a>( fbb: &mut FlatBufferBuilder<'a>, schema: &Schema, -) -> WIPOffset> { +) -> WIPOffset> { let mut fields = vec![]; for field in schema.fields() { let fb_field = build_field(fbb, field); @@ -55,7 +52,7 @@ pub fn schema_to_fb_offset<'a>( let fb_key_name = fbb.create_string(k.as_str()); let fb_val_name = fbb.create_string(v.as_str()); - let mut kv_builder = ipc::KeyValueBuilder::new(fbb); + let mut kv_builder = crate::KeyValueBuilder::new(fbb); kv_builder.add_key(fb_key_name); kv_builder.add_value(fb_val_name); custom_metadata.push(kv_builder.finish()); @@ -64,15 +61,15 @@ pub fn schema_to_fb_offset<'a>( let fb_field_list = fbb.create_vector(&fields); let fb_metadata_list = fbb.create_vector(&custom_metadata); - let mut builder = ipc::SchemaBuilder::new(fbb); + let mut builder = crate::SchemaBuilder::new(fbb); builder.add_fields(fb_field_list); builder.add_custom_metadata(fb_metadata_list); builder.finish() } /// Convert an IPC Field to Arrow Field -impl<'a> From> for Field { - fn from(field: ipc::Field) -> Field { +impl<'a> From> for Field { + fn from(field: crate::Field) -> Field { let arrow_field = if let Some(dictionary) = field.dictionary() { Field::new_dict( field.name().unwrap(), @@ -105,14 +102,14 @@ impl<'a> From> for Field { } /// Deserialize a Schema table from flat buffer format to Schema data type -pub fn fb_to_schema(fb: ipc::Schema) -> Schema { +pub fn fb_to_schema(fb: crate::Schema) -> Schema { let mut fields: Vec = vec![]; let c_fields = fb.fields().unwrap(); let len = c_fields.len(); for i in 0..len { - let c_field: ipc::Field = c_fields.get(i); + let c_field: crate::Field = c_fields.get(i); match c_field.type_type() { - ipc::Type::Decimal if fb.endianness() == ipc::Endianness::Big => { + crate::Type::Decimal if fb.endianness() == crate::Endianness::Big => { unimplemented!("Big Endian is not supported for Decimal!") } _ => (), @@ -138,8 +135,8 @@ pub fn fb_to_schema(fb: ipc::Schema) -> Schema { } /// Try deserialize flat buffer format bytes into a schema -pub fn try_schema_from_flatbuffer_bytes(bytes: &[u8]) -> Result { - if let Ok(ipc) = ipc::root_as_message(bytes) { +pub fn try_schema_from_flatbuffer_bytes(bytes: &[u8]) -> Result { + if let Ok(ipc) = crate::root_as_message(bytes) { if let Some(schema) = ipc.header_as_schema().map(fb_to_schema) { Ok(schema) } else { @@ -155,7 +152,7 @@ pub fn try_schema_from_flatbuffer_bytes(bytes: &[u8]) -> Result { } /// Try deserialize the IPC format bytes into a schema -pub fn try_schema_from_ipc_buffer(buffer: &[u8]) -> Result { +pub fn try_schema_from_ipc_buffer(buffer: &[u8]) -> Result { // There are two protocol types: https://issues.apache.org/jira/browse/ARROW-6313 // The original protocal is: // 4 bytes - the byte length of the payload @@ -200,7 +197,7 @@ pub fn try_schema_from_ipc_buffer(buffer: &[u8]) -> Result { } /// Get the Arrow data type from the flatbuffer Field 
table -pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataType { +pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> DataType { if let Some(dictionary) = field.dictionary() { if may_be_dictionary { let int = dictionary.indexType().unwrap(); @@ -223,9 +220,9 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT } match field.type_type() { - ipc::Type::Null => DataType::Null, - ipc::Type::Bool => DataType::Boolean, - ipc::Type::Int => { + crate::Type::Null => DataType::Null, + crate::Type::Bool => DataType::Boolean, + crate::Type::Int => { let int = field.type_as_int().unwrap(); match (int.bitWidth(), int.is_signed()) { (8, true) => DataType::Int8, @@ -242,103 +239,109 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT ), } } - ipc::Type::Binary => DataType::Binary, - ipc::Type::LargeBinary => DataType::LargeBinary, - ipc::Type::Utf8 => DataType::Utf8, - ipc::Type::LargeUtf8 => DataType::LargeUtf8, - ipc::Type::FixedSizeBinary => { + crate::Type::Binary => DataType::Binary, + crate::Type::LargeBinary => DataType::LargeBinary, + crate::Type::Utf8 => DataType::Utf8, + crate::Type::LargeUtf8 => DataType::LargeUtf8, + crate::Type::FixedSizeBinary => { let fsb = field.type_as_fixed_size_binary().unwrap(); DataType::FixedSizeBinary(fsb.byteWidth()) } - ipc::Type::FloatingPoint => { + crate::Type::FloatingPoint => { let float = field.type_as_floating_point().unwrap(); match float.precision() { - ipc::Precision::HALF => DataType::Float16, - ipc::Precision::SINGLE => DataType::Float32, - ipc::Precision::DOUBLE => DataType::Float64, + crate::Precision::HALF => DataType::Float16, + crate::Precision::SINGLE => DataType::Float32, + crate::Precision::DOUBLE => DataType::Float64, z => panic!("FloatingPoint type with precision of {:?} not supported", z), } } - ipc::Type::Date => { + crate::Type::Date => { let date = field.type_as_date().unwrap(); match date.unit() { - ipc::DateUnit::DAY => DataType::Date32, - ipc::DateUnit::MILLISECOND => DataType::Date64, + crate::DateUnit::DAY => DataType::Date32, + crate::DateUnit::MILLISECOND => DataType::Date64, z => panic!("Date type with unit of {:?} not supported", z), } } - ipc::Type::Time => { + crate::Type::Time => { let time = field.type_as_time().unwrap(); match (time.bitWidth(), time.unit()) { - (32, ipc::TimeUnit::SECOND) => DataType::Time32(TimeUnit::Second), - (32, ipc::TimeUnit::MILLISECOND) => { + (32, crate::TimeUnit::SECOND) => DataType::Time32(TimeUnit::Second), + (32, crate::TimeUnit::MILLISECOND) => { DataType::Time32(TimeUnit::Millisecond) } - (64, ipc::TimeUnit::MICROSECOND) => { + (64, crate::TimeUnit::MICROSECOND) => { DataType::Time64(TimeUnit::Microsecond) } - (64, ipc::TimeUnit::NANOSECOND) => DataType::Time64(TimeUnit::Nanosecond), + (64, crate::TimeUnit::NANOSECOND) => { + DataType::Time64(TimeUnit::Nanosecond) + } z => panic!( "Time type with bit width of {} and unit of {:?} not supported", z.0, z.1 ), } } - ipc::Type::Timestamp => { + crate::Type::Timestamp => { let timestamp = field.type_as_timestamp().unwrap(); let timezone: Option = timestamp.timezone().map(|tz| tz.to_string()); match timestamp.unit() { - ipc::TimeUnit::SECOND => DataType::Timestamp(TimeUnit::Second, timezone), - ipc::TimeUnit::MILLISECOND => { + crate::TimeUnit::SECOND => { + DataType::Timestamp(TimeUnit::Second, timezone) + } + crate::TimeUnit::MILLISECOND => { DataType::Timestamp(TimeUnit::Millisecond, timezone) } - ipc::TimeUnit::MICROSECOND => { + 
crate::TimeUnit::MICROSECOND => { DataType::Timestamp(TimeUnit::Microsecond, timezone) } - ipc::TimeUnit::NANOSECOND => { + crate::TimeUnit::NANOSECOND => { DataType::Timestamp(TimeUnit::Nanosecond, timezone) } z => panic!("Timestamp type with unit of {:?} not supported", z), } } - ipc::Type::Interval => { + crate::Type::Interval => { let interval = field.type_as_interval().unwrap(); match interval.unit() { - ipc::IntervalUnit::YEAR_MONTH => { + crate::IntervalUnit::YEAR_MONTH => { DataType::Interval(IntervalUnit::YearMonth) } - ipc::IntervalUnit::DAY_TIME => DataType::Interval(IntervalUnit::DayTime), - ipc::IntervalUnit::MONTH_DAY_NANO => { + crate::IntervalUnit::DAY_TIME => { + DataType::Interval(IntervalUnit::DayTime) + } + crate::IntervalUnit::MONTH_DAY_NANO => { DataType::Interval(IntervalUnit::MonthDayNano) } z => panic!("Interval type with unit of {:?} unsupported", z), } } - ipc::Type::Duration => { + crate::Type::Duration => { let duration = field.type_as_duration().unwrap(); match duration.unit() { - ipc::TimeUnit::SECOND => DataType::Duration(TimeUnit::Second), - ipc::TimeUnit::MILLISECOND => DataType::Duration(TimeUnit::Millisecond), - ipc::TimeUnit::MICROSECOND => DataType::Duration(TimeUnit::Microsecond), - ipc::TimeUnit::NANOSECOND => DataType::Duration(TimeUnit::Nanosecond), + crate::TimeUnit::SECOND => DataType::Duration(TimeUnit::Second), + crate::TimeUnit::MILLISECOND => DataType::Duration(TimeUnit::Millisecond), + crate::TimeUnit::MICROSECOND => DataType::Duration(TimeUnit::Microsecond), + crate::TimeUnit::NANOSECOND => DataType::Duration(TimeUnit::Nanosecond), z => panic!("Duration type with unit of {:?} unsupported", z), } } - ipc::Type::List => { + crate::Type::List => { let children = field.children().unwrap(); if children.len() != 1 { panic!("expect a list to have one child") } DataType::List(Box::new(children.get(0).into())) } - ipc::Type::LargeList => { + crate::Type::LargeList => { let children = field.children().unwrap(); if children.len() != 1 { panic!("expect a large list to have one child") } DataType::LargeList(Box::new(children.get(0).into())) } - ipc::Type::FixedSizeList => { + crate::Type::FixedSizeList => { let children = field.children().unwrap(); if children.len() != 1 { panic!("expect a list to have one child") @@ -346,7 +349,7 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT let fsl = field.type_as_fixed_size_list().unwrap(); DataType::FixedSizeList(Box::new(children.get(0).into()), fsl.listSize()) } - ipc::Type::Struct_ => { + crate::Type::Struct_ => { let mut fields = vec![]; if let Some(children) = field.children() { for i in 0..children.len() { @@ -356,7 +359,7 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT DataType::Struct(fields) } - ipc::Type::Map => { + crate::Type::Map => { let map = field.type_as_map().unwrap(); let children = field.children().unwrap(); if children.len() != 1 { @@ -364,7 +367,7 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT } DataType::Map(Box::new(children.get(0).into()), map.keysSorted()) } - ipc::Type::Decimal => { + crate::Type::Decimal => { let fsb = field.type_as_decimal().unwrap(); let bit_width = fsb.bitWidth(); if bit_width == 128 { @@ -381,12 +384,12 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT panic!("Unexpected decimal bit width {}", bit_width) } } - ipc::Type::Union => { + crate::Type::Union => { let union = field.type_as_union().unwrap(); let union_mode = match 
union.mode() { - ipc::UnionMode::Dense => UnionMode::Dense, - ipc::UnionMode::Sparse => UnionMode::Sparse, + crate::UnionMode::Dense => UnionMode::Dense, + crate::UnionMode::Sparse => UnionMode::Sparse, mode => panic!("Unexpected union mode: {:?}", mode), }; @@ -409,27 +412,27 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT } pub(crate) struct FBFieldType<'b> { - pub(crate) type_type: ipc::Type, + pub(crate) type_type: crate::Type, pub(crate) type_: WIPOffset, - pub(crate) children: Option>>>>, + pub(crate) children: Option>>>>, } /// Create an IPC Field from an Arrow Field pub(crate) fn build_field<'a>( fbb: &mut FlatBufferBuilder<'a>, field: &Field, -) -> WIPOffset> { +) -> WIPOffset> { // Optional custom metadata. let mut fb_metadata = None; if let Some(metadata) = field.metadata() { if !metadata.is_empty() { let mut kv_vec = vec![]; for (k, v) in metadata { - let kv_args = ipc::KeyValueArgs { + let kv_args = crate::KeyValueArgs { key: Some(fbb.create_string(k.as_str())), value: Some(fbb.create_string(v.as_str())), }; - let kv_offset = ipc::KeyValue::create(fbb, &kv_args); + let kv_offset = crate::KeyValue::create(fbb, &kv_args); kv_vec.push(kv_offset); } fb_metadata = Some(fbb.create_vector(&kv_vec)); @@ -454,7 +457,7 @@ pub(crate) fn build_field<'a>( None }; - let mut field_builder = ipc::FieldBuilder::new(fbb); + let mut field_builder = crate::FieldBuilder::new(fbb); field_builder.add_name(fb_field_name); if let Some(dictionary) = fb_dictionary { field_builder.add_dictionary(dictionary) @@ -481,21 +484,21 @@ pub(crate) fn get_fb_field_type<'a>( ) -> FBFieldType<'a> { // some IPC implementations expect an empty list for child data, instead of a null value. // An empty field list is thus returned for primitive types - let empty_fields: Vec> = vec![]; + let empty_fields: Vec> = vec![]; match data_type { Null => FBFieldType { - type_type: ipc::Type::Null, - type_: ipc::NullBuilder::new(fbb).finish().as_union_value(), + type_type: crate::Type::Null, + type_: crate::NullBuilder::new(fbb).finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), }, Boolean => FBFieldType { - type_type: ipc::Type::Bool, - type_: ipc::BoolBuilder::new(fbb).finish().as_union_value(), + type_type: crate::Type::Bool, + type_: crate::BoolBuilder::new(fbb).finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), }, UInt8 | UInt16 | UInt32 | UInt64 => { let children = fbb.create_vector(&empty_fields[..]); - let mut builder = ipc::IntBuilder::new(fbb); + let mut builder = crate::IntBuilder::new(fbb); builder.add_is_signed(false); match data_type { UInt8 => builder.add_bitWidth(8), @@ -505,14 +508,14 @@ pub(crate) fn get_fb_field_type<'a>( _ => {} }; FBFieldType { - type_type: ipc::Type::Int, + type_type: crate::Type::Int, type_: builder.finish().as_union_value(), children: Some(children), } } Int8 | Int16 | Int32 | Int64 => { let children = fbb.create_vector(&empty_fields[..]); - let mut builder = ipc::IntBuilder::new(fbb); + let mut builder = crate::IntBuilder::new(fbb); builder.add_is_signed(true); match data_type { Int8 => builder.add_bitWidth(8), @@ -522,95 +525,97 @@ pub(crate) fn get_fb_field_type<'a>( _ => {} }; FBFieldType { - type_type: ipc::Type::Int, + type_type: crate::Type::Int, type_: builder.finish().as_union_value(), children: Some(children), } } Float16 | Float32 | Float64 => { let children = fbb.create_vector(&empty_fields[..]); - let mut builder = ipc::FloatingPointBuilder::new(fbb); + let mut builder = 
crate::FloatingPointBuilder::new(fbb); match data_type { - Float16 => builder.add_precision(ipc::Precision::HALF), - Float32 => builder.add_precision(ipc::Precision::SINGLE), - Float64 => builder.add_precision(ipc::Precision::DOUBLE), + Float16 => builder.add_precision(crate::Precision::HALF), + Float32 => builder.add_precision(crate::Precision::SINGLE), + Float64 => builder.add_precision(crate::Precision::DOUBLE), _ => {} }; FBFieldType { - type_type: ipc::Type::FloatingPoint, + type_type: crate::Type::FloatingPoint, type_: builder.finish().as_union_value(), children: Some(children), } } Binary => FBFieldType { - type_type: ipc::Type::Binary, - type_: ipc::BinaryBuilder::new(fbb).finish().as_union_value(), + type_type: crate::Type::Binary, + type_: crate::BinaryBuilder::new(fbb).finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), }, LargeBinary => FBFieldType { - type_type: ipc::Type::LargeBinary, - type_: ipc::LargeBinaryBuilder::new(fbb).finish().as_union_value(), + type_type: crate::Type::LargeBinary, + type_: crate::LargeBinaryBuilder::new(fbb) + .finish() + .as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), }, Utf8 => FBFieldType { - type_type: ipc::Type::Utf8, - type_: ipc::Utf8Builder::new(fbb).finish().as_union_value(), + type_type: crate::Type::Utf8, + type_: crate::Utf8Builder::new(fbb).finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), }, LargeUtf8 => FBFieldType { - type_type: ipc::Type::LargeUtf8, - type_: ipc::LargeUtf8Builder::new(fbb).finish().as_union_value(), + type_type: crate::Type::LargeUtf8, + type_: crate::LargeUtf8Builder::new(fbb).finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), }, FixedSizeBinary(len) => { - let mut builder = ipc::FixedSizeBinaryBuilder::new(fbb); + let mut builder = crate::FixedSizeBinaryBuilder::new(fbb); builder.add_byteWidth(*len as i32); FBFieldType { - type_type: ipc::Type::FixedSizeBinary, + type_type: crate::Type::FixedSizeBinary, type_: builder.finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), } } Date32 => { - let mut builder = ipc::DateBuilder::new(fbb); - builder.add_unit(ipc::DateUnit::DAY); + let mut builder = crate::DateBuilder::new(fbb); + builder.add_unit(crate::DateUnit::DAY); FBFieldType { - type_type: ipc::Type::Date, + type_type: crate::Type::Date, type_: builder.finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), } } Date64 => { - let mut builder = ipc::DateBuilder::new(fbb); - builder.add_unit(ipc::DateUnit::MILLISECOND); + let mut builder = crate::DateBuilder::new(fbb); + builder.add_unit(crate::DateUnit::MILLISECOND); FBFieldType { - type_type: ipc::Type::Date, + type_type: crate::Type::Date, type_: builder.finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), } } Time32(unit) | Time64(unit) => { - let mut builder = ipc::TimeBuilder::new(fbb); + let mut builder = crate::TimeBuilder::new(fbb); match unit { TimeUnit::Second => { builder.add_bitWidth(32); - builder.add_unit(ipc::TimeUnit::SECOND); + builder.add_unit(crate::TimeUnit::SECOND); } TimeUnit::Millisecond => { builder.add_bitWidth(32); - builder.add_unit(ipc::TimeUnit::MILLISECOND); + builder.add_unit(crate::TimeUnit::MILLISECOND); } TimeUnit::Microsecond => { builder.add_bitWidth(64); - builder.add_unit(ipc::TimeUnit::MICROSECOND); + builder.add_unit(crate::TimeUnit::MICROSECOND); } TimeUnit::Nanosecond => { builder.add_bitWidth(64); - 
builder.add_unit(ipc::TimeUnit::NANOSECOND); + builder.add_unit(crate::TimeUnit::NANOSECOND); } } FBFieldType { - type_type: ipc::Type::Time, + type_type: crate::Type::Time, type_: builder.finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), } @@ -618,48 +623,48 @@ pub(crate) fn get_fb_field_type<'a>( Timestamp(unit, tz) => { let tz = tz.clone().unwrap_or_default(); let tz_str = fbb.create_string(tz.as_str()); - let mut builder = ipc::TimestampBuilder::new(fbb); + let mut builder = crate::TimestampBuilder::new(fbb); let time_unit = match unit { - TimeUnit::Second => ipc::TimeUnit::SECOND, - TimeUnit::Millisecond => ipc::TimeUnit::MILLISECOND, - TimeUnit::Microsecond => ipc::TimeUnit::MICROSECOND, - TimeUnit::Nanosecond => ipc::TimeUnit::NANOSECOND, + TimeUnit::Second => crate::TimeUnit::SECOND, + TimeUnit::Millisecond => crate::TimeUnit::MILLISECOND, + TimeUnit::Microsecond => crate::TimeUnit::MICROSECOND, + TimeUnit::Nanosecond => crate::TimeUnit::NANOSECOND, }; builder.add_unit(time_unit); if !tz.is_empty() { builder.add_timezone(tz_str); } FBFieldType { - type_type: ipc::Type::Timestamp, + type_type: crate::Type::Timestamp, type_: builder.finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), } } Interval(unit) => { - let mut builder = ipc::IntervalBuilder::new(fbb); + let mut builder = crate::IntervalBuilder::new(fbb); let interval_unit = match unit { - IntervalUnit::YearMonth => ipc::IntervalUnit::YEAR_MONTH, - IntervalUnit::DayTime => ipc::IntervalUnit::DAY_TIME, - IntervalUnit::MonthDayNano => ipc::IntervalUnit::MONTH_DAY_NANO, + IntervalUnit::YearMonth => crate::IntervalUnit::YEAR_MONTH, + IntervalUnit::DayTime => crate::IntervalUnit::DAY_TIME, + IntervalUnit::MonthDayNano => crate::IntervalUnit::MONTH_DAY_NANO, }; builder.add_unit(interval_unit); FBFieldType { - type_type: ipc::Type::Interval, + type_type: crate::Type::Interval, type_: builder.finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), } } Duration(unit) => { - let mut builder = ipc::DurationBuilder::new(fbb); + let mut builder = crate::DurationBuilder::new(fbb); let time_unit = match unit { - TimeUnit::Second => ipc::TimeUnit::SECOND, - TimeUnit::Millisecond => ipc::TimeUnit::MILLISECOND, - TimeUnit::Microsecond => ipc::TimeUnit::MICROSECOND, - TimeUnit::Nanosecond => ipc::TimeUnit::NANOSECOND, + TimeUnit::Second => crate::TimeUnit::SECOND, + TimeUnit::Millisecond => crate::TimeUnit::MILLISECOND, + TimeUnit::Microsecond => crate::TimeUnit::MICROSECOND, + TimeUnit::Nanosecond => crate::TimeUnit::NANOSECOND, }; builder.add_unit(time_unit); FBFieldType { - type_type: ipc::Type::Duration, + type_type: crate::Type::Duration, type_: builder.finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), } @@ -667,25 +672,25 @@ pub(crate) fn get_fb_field_type<'a>( List(ref list_type) => { let child = build_field(fbb, list_type); FBFieldType { - type_type: ipc::Type::List, - type_: ipc::ListBuilder::new(fbb).finish().as_union_value(), + type_type: crate::Type::List, + type_: crate::ListBuilder::new(fbb).finish().as_union_value(), children: Some(fbb.create_vector(&[child])), } } LargeList(ref list_type) => { let child = build_field(fbb, list_type); FBFieldType { - type_type: ipc::Type::LargeList, - type_: ipc::LargeListBuilder::new(fbb).finish().as_union_value(), + type_type: crate::Type::LargeList, + type_: crate::LargeListBuilder::new(fbb).finish().as_union_value(), children: Some(fbb.create_vector(&[child])), } } 
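
As an aside on the public surface of this module: the schema conversion helpers round-trip through the generated flatbuffer types. A minimal sketch, assuming `convert` remains a public module of the new crate (as it was under `arrow::ipc`) and that the generated `root_as_schema` stays re-exported at the crate root:

```rust
use arrow_ipc::convert::{fb_to_schema, schema_to_fb};
use arrow_ipc::root_as_schema;
use arrow_schema::{DataType, Field, Schema};

fn schema_round_trip() {
    let schema = Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("name", DataType::Utf8, true),
    ]);

    // Serialize the schema to a flatbuffer, then parse it back.
    let fbb = schema_to_fb(&schema);
    let ipc_schema = root_as_schema(fbb.finished_data()).unwrap();
    assert_eq!(fb_to_schema(ipc_schema), schema);
}
```
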
FixedSizeList(ref list_type, len) => { let child = build_field(fbb, list_type); - let mut builder = ipc::FixedSizeListBuilder::new(fbb); + let mut builder = crate::FixedSizeListBuilder::new(fbb); builder.add_listSize(*len as i32); FBFieldType { - type_type: ipc::Type::FixedSizeList, + type_type: crate::Type::FixedSizeList, type_: builder.finish().as_union_value(), children: Some(fbb.create_vector(&[child])), } @@ -697,17 +702,17 @@ pub(crate) fn get_fb_field_type<'a>( children.push(build_field(fbb, field)); } FBFieldType { - type_type: ipc::Type::Struct_, - type_: ipc::Struct_Builder::new(fbb).finish().as_union_value(), + type_type: crate::Type::Struct_, + type_: crate::Struct_Builder::new(fbb).finish().as_union_value(), children: Some(fbb.create_vector(&children[..])), } } Map(map_field, keys_sorted) => { let child = build_field(fbb, map_field); - let mut field_type = ipc::MapBuilder::new(fbb); + let mut field_type = crate::MapBuilder::new(fbb); field_type.add_keysSorted(*keys_sorted); FBFieldType { - type_type: ipc::Type::Map, + type_type: crate::Type::Map, type_: field_type.finish().as_union_value(), children: Some(fbb.create_vector(&[child])), } @@ -719,23 +724,23 @@ pub(crate) fn get_fb_field_type<'a>( get_fb_field_type(value_type, fbb) } Decimal128(precision, scale) => { - let mut builder = ipc::DecimalBuilder::new(fbb); + let mut builder = crate::DecimalBuilder::new(fbb); builder.add_precision(*precision as i32); builder.add_scale(*scale as i32); builder.add_bitWidth(128); FBFieldType { - type_type: ipc::Type::Decimal, + type_type: crate::Type::Decimal, type_: builder.finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), } } Decimal256(precision, scale) => { - let mut builder = ipc::DecimalBuilder::new(fbb); + let mut builder = crate::DecimalBuilder::new(fbb); builder.add_precision(*precision as i32); builder.add_scale(*scale as i32); builder.add_bitWidth(256); FBFieldType { - type_type: ipc::Type::Decimal, + type_type: crate::Type::Decimal, type_: builder.finish().as_union_value(), children: Some(fbb.create_vector(&empty_fields[..])), } @@ -747,18 +752,18 @@ pub(crate) fn get_fb_field_type<'a>( } let union_mode = match mode { - UnionMode::Sparse => ipc::UnionMode::Sparse, - UnionMode::Dense => ipc::UnionMode::Dense, + UnionMode::Sparse => crate::UnionMode::Sparse, + UnionMode::Dense => crate::UnionMode::Dense, }; let fbb_type_ids = fbb .create_vector(&type_ids.iter().map(|t| *t as i32).collect::>()); - let mut builder = ipc::UnionBuilder::new(fbb); + let mut builder = crate::UnionBuilder::new(fbb); builder.add_mode(union_mode); builder.add_typeIds(fbb_type_ids); FBFieldType { - type_type: ipc::Type::Union, + type_type: crate::Type::Union, type_: builder.finish().as_union_value(), children: Some(fbb.create_vector(&children[..])), } @@ -772,10 +777,10 @@ pub(crate) fn get_fb_dictionary<'a>( dict_id: i64, dict_is_ordered: bool, fbb: &mut FlatBufferBuilder<'a>, -) -> WIPOffset> { +) -> WIPOffset> { // We assume that the dictionary index type (as an integer) has already been // validated elsewhere, and can safely assume we are dealing with integers - let mut index_builder = ipc::IntBuilder::new(fbb); + let mut index_builder = crate::IntBuilder::new(fbb); match *index_type { Int8 | Int16 | Int32 | Int64 => index_builder.add_is_signed(true), @@ -793,7 +798,7 @@ pub(crate) fn get_fb_dictionary<'a>( let index_builder = index_builder.finish(); - let mut builder = ipc::DictionaryEncodingBuilder::new(fbb); + let mut builder = 
crate::DictionaryEncodingBuilder::new(fbb); builder.add_id(dict_id); builder.add_indexType(index_builder); builder.add_isOrdered(dict_is_ordered); @@ -804,7 +809,6 @@ pub(crate) fn get_fb_dictionary<'a>( #[cfg(test)] mod tests { use super::*; - use crate::datatypes::{DataType, Field, Schema, UnionMode}; #[test] fn convert_schema_round_trip() { @@ -1024,14 +1028,14 @@ mod tests { let fb = schema_to_fb(&schema); // read back fields - let ipc = ipc::root_as_schema(fb.finished_data()).unwrap(); + let ipc = crate::root_as_schema(fb.finished_data()).unwrap(); let schema2 = fb_to_schema(ipc); assert_eq!(schema, schema2); } #[test] fn schema_from_bytes() { - // bytes of a schema generated from python (0.14.0), saved as an `ipc::Message`. + // bytes of a schema generated from python (0.14.0), saved as an `crate::Message`. // the schema is: Field("field1", DataType::UInt32, false) let bytes: Vec = vec![ 16, 0, 0, 0, 0, 0, 10, 0, 12, 0, 6, 0, 5, 0, 8, 0, 10, 0, 0, 0, 0, 1, 3, 0, @@ -1041,7 +1045,7 @@ mod tests { 4, 0, 6, 0, 0, 0, 32, 0, 0, 0, 6, 0, 0, 0, 102, 105, 101, 108, 100, 49, 0, 0, 0, 0, 0, 0, ]; - let ipc = ipc::root_as_message(&bytes[..]).unwrap(); + let ipc = crate::root_as_message(&bytes[..]).unwrap(); let schema = ipc.header_as_schema().unwrap(); // a message generated from Rust, same as the Python one @@ -1053,7 +1057,7 @@ mod tests { 8, 0, 4, 0, 6, 0, 0, 0, 32, 0, 0, 0, 6, 0, 0, 0, 102, 105, 101, 108, 100, 49, 0, 0, ]; - let ipc2 = ipc::root_as_message(&bytes[..]).unwrap(); + let ipc2 = crate::root_as_message(&bytes[..]).unwrap(); let schema2 = ipc.header_as_schema().unwrap(); assert_eq!(schema, schema2); diff --git a/arrow/src/ipc/gen/File.rs b/arrow-ipc/src/gen/File.rs similarity index 99% rename from arrow/src/ipc/gen/File.rs rename to arrow-ipc/src/gen/File.rs index 9aafe910ba2c..0e9427813788 100644 --- a/arrow/src/ipc/gen/File.rs +++ b/arrow-ipc/src/gen/File.rs @@ -18,7 +18,7 @@ #![allow(dead_code)] #![allow(unused_imports)] -use crate::ipc::gen::Schema::*; +use crate::gen::Schema::*; use flatbuffers::EndianScalar; use std::{cmp::Ordering, mem}; // automatically generated by the FlatBuffers compiler, do not modify diff --git a/arrow/src/ipc/gen/Message.rs b/arrow-ipc/src/gen/Message.rs similarity index 99% rename from arrow/src/ipc/gen/Message.rs rename to arrow-ipc/src/gen/Message.rs index d4b3a57f164e..2b9f79766e31 100644 --- a/arrow/src/ipc/gen/Message.rs +++ b/arrow-ipc/src/gen/Message.rs @@ -18,9 +18,9 @@ #![allow(dead_code)] #![allow(unused_imports)] -use crate::ipc::gen::Schema::*; -use crate::ipc::gen::SparseTensor::*; -use crate::ipc::gen::Tensor::*; +use crate::gen::Schema::*; +use crate::gen::SparseTensor::*; +use crate::gen::Tensor::*; use flatbuffers::EndianScalar; use std::{cmp::Ordering, mem}; // automatically generated by the FlatBuffers compiler, do not modify @@ -340,7 +340,7 @@ pub struct MessageHeaderUnionTableOffset {} /// Metadata about a field at some level of a nested type tree (but not /// its children). 
/// -/// For example, a List with values `[[1, 2, 3], null, [4], [5, 6], null]` +/// For example, a `List` with values `[[1, 2, 3], null, [4], [5, 6], null]` /// would have {length: 5, null_count: 2} for its List node, and {length: 6, /// null_count: 0} for its Int16 node, as separate FieldNode structs // struct FieldNode, aligned to 8 diff --git a/arrow/src/ipc/gen/Schema.rs b/arrow-ipc/src/gen/Schema.rs similarity index 100% rename from arrow/src/ipc/gen/Schema.rs rename to arrow-ipc/src/gen/Schema.rs diff --git a/arrow/src/ipc/gen/SparseTensor.rs b/arrow-ipc/src/gen/SparseTensor.rs similarity index 99% rename from arrow/src/ipc/gen/SparseTensor.rs rename to arrow-ipc/src/gen/SparseTensor.rs index 317831c59ef0..c5e06c30e03e 100644 --- a/arrow/src/ipc/gen/SparseTensor.rs +++ b/arrow-ipc/src/gen/SparseTensor.rs @@ -18,8 +18,8 @@ #![allow(dead_code)] #![allow(unused_imports)] -use crate::ipc::gen::Schema::*; -use crate::ipc::gen::Tensor::*; +use crate::gen::Schema::*; +use crate::gen::Tensor::*; use flatbuffers::EndianScalar; use std::{cmp::Ordering, mem}; // automatically generated by the FlatBuffers compiler, do not modify diff --git a/arrow/src/ipc/gen/Tensor.rs b/arrow-ipc/src/gen/Tensor.rs similarity index 99% rename from arrow/src/ipc/gen/Tensor.rs rename to arrow-ipc/src/gen/Tensor.rs index f22ff23c98b7..954ebd29012b 100644 --- a/arrow/src/ipc/gen/Tensor.rs +++ b/arrow-ipc/src/gen/Tensor.rs @@ -18,7 +18,7 @@ #![allow(dead_code)] #![allow(unused_imports)] -use crate::ipc::gen::Schema::*; +use crate::gen::Schema::*; use flatbuffers::EndianScalar; use std::{cmp::Ordering, mem}; // automatically generated by the FlatBuffers compiler, do not modify diff --git a/arrow/src/ipc/gen/mod.rs b/arrow-ipc/src/gen/mod.rs similarity index 100% rename from arrow/src/ipc/gen/mod.rs rename to arrow-ipc/src/gen/mod.rs diff --git a/arrow/src/ipc/mod.rs b/arrow-ipc/src/lib.rs similarity index 97% rename from arrow/src/ipc/mod.rs rename to arrow-ipc/src/lib.rs index 2b30e72206c3..38217957dd87 100644 --- a/arrow/src/ipc/mod.rs +++ b/arrow-ipc/src/lib.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Support for the Arrow IPC format + // TODO: (vcq): Protobuf codegen is not generating Debug impls. #![allow(missing_debug_implementations)] diff --git a/arrow/src/ipc/reader.rs b/arrow-ipc/src/reader.rs similarity index 93% rename from arrow/src/ipc/reader.rs rename to arrow-ipc/src/reader.rs index 1f2824b343af..0165c775d5a3 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -26,16 +26,14 @@ use std::fmt; use std::io::{BufReader, Read, Seek, SeekFrom}; use std::sync::Arc; -use crate::array::*; -use crate::buffer::{Buffer, MutableBuffer}; -use crate::compute::cast; -use crate::datatypes::{DataType, Field, IntervalUnit, Schema, SchemaRef, UnionMode}; -use crate::error::{ArrowError, Result}; -use crate::ipc; -use crate::record_batch::{RecordBatch, RecordBatchOptions, RecordBatchReader}; - -use crate::ipc::compression::CompressionCodec; -use ipc::CONTINUATION_MARKER; +use arrow_array::*; +use arrow_buffer::{Buffer, MutableBuffer}; +use arrow_cast::cast; +use arrow_data::ArrayData; +use arrow_schema::*; + +use crate::compression::CompressionCodec; +use crate::CONTINUATION_MARKER; use DataType::*; /// Read a buffer based on offset and length @@ -48,10 +46,10 @@ use DataType::*; /// follows is not compressed, which can be useful for cases where /// compression does not yield appreciable savings. 
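
Since the readers and writers now live in their own crate, a minimal end-to-end sketch of the public API may help: an in-memory write/read round trip through the IPC file format, along the lines of the tests further down. Crate paths assume the new `arrow-ipc` layout; the schema and data are illustrative.

```rust
use std::io::Cursor;
use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use arrow_ipc::reader::FileReader;
use arrow_ipc::writer::FileWriter;
use arrow_schema::{ArrowError, DataType, Field, Schema};

fn ipc_file_round_trip() -> Result<(), ArrowError> {
    let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
    let column: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![column])?;

    // Write one batch into an in-memory IPC file...
    let mut buf = Vec::new();
    {
        let mut writer = FileWriter::try_new(&mut buf, &schema)?;
        writer.write(&batch)?;
        writer.finish()?;
    }

    // ...and read it back, checking the round trip.
    let reader = FileReader::try_new(Cursor::new(buf), None)?;
    for maybe_batch in reader {
        assert_eq!(maybe_batch?, batch);
    }
    Ok(())
}
```

The streaming format follows the same pattern with `writer::StreamWriter` and `reader::StreamReader`, as the stream round-trip test shown below in this patch demonstrates.
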
fn read_buffer( - buf: &ipc::Buffer, + buf: &crate::Buffer, a_data: &Buffer, compression_codec: &Option, -) -> Result { +) -> Result { let start_offset = buf.offset() as usize; let buf_data = a_data.slice_with_length(start_offset, buf.length() as usize); // corner case: empty buffer @@ -72,16 +70,16 @@ fn read_buffer( /// - cast the 64-bit array to the appropriate data type #[allow(clippy::too_many_arguments)] fn create_array( - nodes: flatbuffers::Vector<'_, ipc::FieldNode>, + nodes: flatbuffers::Vector<'_, crate::FieldNode>, field: &Field, data: &Buffer, - buffers: flatbuffers::Vector<'_, ipc::Buffer>, + buffers: flatbuffers::Vector<'_, crate::Buffer>, dictionaries_by_id: &HashMap, mut node_index: usize, mut buffer_index: usize, compression_codec: &Option, - metadata: &ipc::MetadataVersion, -) -> Result<(ArrayRef, usize, usize)> { + metadata: &crate::MetadataVersion, +) -> Result<(ArrayRef, usize, usize), ArrowError> { let data_type = field.data_type(); let array = match data_type { Utf8 | Binary | LargeBinary | LargeUtf8 => { @@ -232,7 +230,7 @@ fn create_array( // In V4, union types has validity bitmap // In V5 and later, union types have no validity bitmap - if metadata < &ipc::MetadataVersion::V5 { + if metadata < &crate::MetadataVersion::V5 { read_buffer(buffers.get(buffer_index), data, compression_codec)?; buffer_index += 1; } @@ -323,7 +321,7 @@ fn skip_field( data_type: &DataType, mut node_index: usize, mut buffer_index: usize, -) -> Result<(usize, usize)> { +) -> Result<(usize, usize), ArrowError> { match data_type { Utf8 | Binary | LargeBinary | LargeUtf8 => { node_index += 1; @@ -396,7 +394,7 @@ fn skip_field( /// Reads the correct number of buffers based on data type and null_count, and creates a /// primitive array ref fn create_primitive_array( - field_node: &ipc::FieldNode, + field_node: &crate::FieldNode, data_type: &DataType, buffers: &[Buffer], ) -> ArrayRef { @@ -536,7 +534,7 @@ fn get_aligned_buffer(buffer: &Buffer, length: usize) -> Buffer { /// Reads the correct number of buffers based on list type and null_count, and creates a /// list array ref fn create_list_array( - field_node: &ipc::FieldNode, + field_node: &crate::FieldNode, data_type: &DataType, buffers: &[Buffer], child_array: ArrayRef, @@ -564,7 +562,7 @@ fn create_list_array( /// Reads the correct number of buffers based on list type and null_count, and creates a /// list array ref fn create_dictionary_array( - field_node: &ipc::FieldNode, + field_node: &crate::FieldNode, data_type: &DataType, buffers: &[Buffer], value_array: ArrayRef, @@ -583,15 +581,15 @@ fn create_dictionary_array( } } -/// Creates a record batch from binary data using the `ipc::RecordBatch` indexes and the `Schema` +/// Creates a record batch from binary data using the `crate::RecordBatch` indexes and the `Schema` pub fn read_record_batch( buf: &Buffer, - batch: ipc::RecordBatch, + batch: crate::RecordBatch, schema: SchemaRef, dictionaries_by_id: &HashMap, projection: Option<&[usize]>, - metadata: &ipc::MetadataVersion, -) -> Result { + metadata: &crate::MetadataVersion, +) -> Result { let buffers = batch.buffers().ok_or_else(|| { ArrowError::IoError("Unable to get buffers from IPC RecordBatch".to_string()) })?; @@ -669,11 +667,11 @@ pub fn read_record_batch( /// updating the `dictionaries_by_id` with the resulting dictionary pub fn read_dictionary( buf: &Buffer, - batch: ipc::DictionaryBatch, + batch: crate::DictionaryBatch, schema: &Schema, dictionaries_by_id: &mut HashMap, - metadata: &ipc::MetadataVersion, -) -> Result<()> { + 
metadata: &crate::MetadataVersion, +) -> Result<(), ArrowError> { if batch.isDelta() { return Err(ArrowError::IoError( "delta dictionary batches not supported".to_string(), @@ -732,7 +730,7 @@ pub struct FileReader { /// The blocks in the file /// /// A block indicates the regions in the file to read to get data - blocks: Vec, + blocks: Vec, /// A counter to keep track of the current block that should be read current_block: usize, @@ -746,7 +744,7 @@ pub struct FileReader { dictionaries_by_id: HashMap, /// Metadata version - metadata_version: ipc::MetadataVersion, + metadata_version: crate::MetadataVersion, /// Optional projection and projected_schema projection: Option<(Vec, Schema)>, @@ -772,7 +770,10 @@ impl FileReader { /// /// Returns errors if the file does not meet the Arrow Format header and footer /// requirements - pub fn try_new(reader: R, projection: Option>) -> Result { + pub fn try_new( + reader: R, + projection: Option>, + ) -> Result { let mut reader = BufReader::new(reader); // check if header and footer contain correct magic bytes let mut magic_buffer: [u8; 6] = [0; 6]; @@ -800,7 +801,7 @@ impl FileReader { reader.seek(SeekFrom::End(-10 - footer_len as i64))?; reader.read_exact(&mut footer_data)?; - let footer = ipc::root_as_footer(&footer_data[..]).map_err(|err| { + let footer = crate::root_as_footer(&footer_data[..]).map_err(|err| { ArrowError::IoError(format!("Unable to get root as footer: {:?}", err)) })?; @@ -813,7 +814,7 @@ impl FileReader { let total_blocks = blocks.len(); let ipc_schema = footer.schema().unwrap(); - let schema = ipc::convert::fb_to_schema(ipc_schema); + let schema = crate::convert::fb_to_schema(ipc_schema); // Create an array of optional dictionary value arrays, one per field. let mut dictionaries_by_id = HashMap::new(); @@ -831,7 +832,7 @@ impl FileReader { reader.read_exact(&mut block_data)?; - let message = ipc::root_as_message(&block_data[..]).map_err(|err| { + let message = crate::root_as_message(&block_data[..]).map_err(|err| { ArrowError::IoError(format!( "Unable to get root as message: {:?}", err @@ -839,7 +840,7 @@ impl FileReader { })?; match message.header_type() { - ipc::MessageHeader::DictionaryBatch => { + crate::MessageHeader::DictionaryBatch => { let batch = message.header_as_dictionary_batch().unwrap(); // read the block that makes up the dictionary batch into a buffer @@ -900,7 +901,7 @@ impl FileReader { /// Read a specific record batch /// /// Sets the current block to the index, allowing random reads - pub fn set_index(&mut self, index: usize) -> Result<()> { + pub fn set_index(&mut self, index: usize) -> Result<(), ArrowError> { if index >= self.total_blocks { Err(ArrowError::IoError(format!( "Cannot set batch to index {} from {} total batches", @@ -912,7 +913,7 @@ impl FileReader { } } - fn maybe_next(&mut self) -> Result> { + fn maybe_next(&mut self) -> Result, ArrowError> { let block = self.blocks[self.current_block]; self.current_block += 1; @@ -928,12 +929,12 @@ impl FileReader { let mut block_data = vec![0; meta_len as usize]; self.reader.read_exact(&mut block_data)?; - let message = ipc::root_as_message(&block_data[..]).map_err(|err| { + let message = crate::root_as_message(&block_data[..]).map_err(|err| { ArrowError::IoError(format!("Unable to get root as footer: {:?}", err)) })?; // some old test data's footer metadata is not set, so we account for that - if self.metadata_version != ipc::MetadataVersion::V1 + if self.metadata_version != crate::MetadataVersion::V1 && message.version() != self.metadata_version { return 
Err(ArrowError::IoError( @@ -942,10 +943,10 @@ impl FileReader { } match message.header_type() { - ipc::MessageHeader::Schema => Err(ArrowError::IoError( + crate::MessageHeader::Schema => Err(ArrowError::IoError( "Not expecting a schema when messages are read".to_string(), )), - ipc::MessageHeader::RecordBatch => { + crate::MessageHeader::RecordBatch => { let batch = message.header_as_record_batch().ok_or_else(|| { ArrowError::IoError( "Unable to read IPC message as record batch".to_string(), @@ -968,7 +969,7 @@ impl FileReader { ).map(Some) } - ipc::MessageHeader::NONE => { + crate::MessageHeader::NONE => { Ok(None) } t => Err(ArrowError::IoError(format!( @@ -979,7 +980,7 @@ impl FileReader { } impl Iterator for FileReader { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { // get current block @@ -1037,7 +1038,10 @@ impl StreamReader { /// The first message in the stream is the schema, the reader will fail if it does not /// encounter a schema. /// To check if the reader is done, use `is_finished(self)` - pub fn try_new(reader: R, projection: Option>) -> Result { + pub fn try_new( + reader: R, + projection: Option>, + ) -> Result { let mut reader = BufReader::new(reader); // determine metadata length let mut meta_size: [u8; 4] = [0; 4]; @@ -1054,14 +1058,14 @@ impl StreamReader { let mut meta_buffer = vec![0; meta_len as usize]; reader.read_exact(&mut meta_buffer)?; - let message = ipc::root_as_message(meta_buffer.as_slice()).map_err(|err| { + let message = crate::root_as_message(meta_buffer.as_slice()).map_err(|err| { ArrowError::IoError(format!("Unable to get root as message: {:?}", err)) })?; // message header is a Schema, so read it - let ipc_schema: ipc::Schema = message.header_as_schema().ok_or_else(|| { + let ipc_schema: crate::Schema = message.header_as_schema().ok_or_else(|| { ArrowError::IoError("Unable to read IPC message as schema".to_string()) })?; - let schema = ipc::convert::fb_to_schema(ipc_schema); + let schema = crate::convert::fb_to_schema(ipc_schema); // Create an array of optional dictionary value arrays, one per field. 
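For reference, the reworked `FileReader::try_new(reader, projection)` signature above can be exercised with an in-memory round trip; a minimal sketch against the relocated `arrow_ipc` crate (schema, column names and values are illustrative only):

```rust
use std::io::Cursor;
use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use arrow_ipc::reader::FileReader;
use arrow_ipc::writer::FileWriter;
use arrow_schema::{ArrowError, DataType, Field, Schema};

fn main() -> Result<(), ArrowError> {
    let schema = Schema::new(vec![
        Field::new("a", DataType::Int32, false),
        Field::new("b", DataType::Int32, false),
    ]);
    let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let b: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6]));
    let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![a, b])?;

    // Write a single batch to an in-memory IPC file
    let mut buf = Vec::new();
    {
        let mut writer = FileWriter::try_new(&mut buf, &schema)?;
        writer.write(&batch)?;
        writer.finish()?;
    }

    // Read it back keeping only column 0; returned batches use the projected schema
    let reader = FileReader::try_new(Cursor::new(buf), Some(vec![0]))?;
    for batch in reader {
        assert_eq!(batch?.num_columns(), 1);
    }
    Ok(())
}
```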
let dictionaries_by_id = HashMap::new(); @@ -1092,7 +1096,7 @@ impl StreamReader { self.finished } - fn maybe_next(&mut self) -> Result> { + fn maybe_next(&mut self) -> Result, ArrowError> { if self.finished { return Ok(None); } @@ -1133,15 +1137,15 @@ impl StreamReader { self.reader.read_exact(&mut meta_buffer)?; let vecs = &meta_buffer.to_vec(); - let message = ipc::root_as_message(vecs).map_err(|err| { + let message = crate::root_as_message(vecs).map_err(|err| { ArrowError::IoError(format!("Unable to get root as message: {:?}", err)) })?; match message.header_type() { - ipc::MessageHeader::Schema => Err(ArrowError::IoError( + crate::MessageHeader::Schema => Err(ArrowError::IoError( "Not expecting a schema when messages are read".to_string(), )), - ipc::MessageHeader::RecordBatch => { + crate::MessageHeader::RecordBatch => { let batch = message.header_as_record_batch().ok_or_else(|| { ArrowError::IoError( "Unable to read IPC message as record batch".to_string(), @@ -1153,7 +1157,7 @@ impl StreamReader { read_record_batch(&buf.into(), batch, self.schema(), &self.dictionaries_by_id, self.projection.as_ref().map(|x| x.0.as_ref()), &message.version()).map(Some) } - ipc::MessageHeader::DictionaryBatch => { + crate::MessageHeader::DictionaryBatch => { let batch = message.header_as_dictionary_batch().ok_or_else(|| { ArrowError::IoError( "Unable to read IPC message as dictionary batch".to_string(), @@ -1170,7 +1174,7 @@ impl StreamReader { // read the next message until we encounter a RecordBatch self.maybe_next() } - ipc::MessageHeader::NONE => { + crate::MessageHeader::NONE => { Ok(None) } t => Err(ArrowError::IoError( @@ -1181,7 +1185,7 @@ impl StreamReader { } impl Iterator for StreamReader { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { self.maybe_next().transpose() @@ -1198,10 +1202,10 @@ impl RecordBatchReader for StreamReader { mod tests { use super::*; - use std::fs::File; - - use crate::datatypes; - use crate::datatypes::{ArrowNativeType, Float64Type, Int32Type, Int8Type}; + use arrow_array::builder::UnionBuilder; + use arrow_array::types::*; + use arrow_buffer::ArrowNativeType; + use arrow_data::ArrayDataBuilder; fn create_test_projection_schema() -> Schema { // define field types @@ -1347,7 +1351,8 @@ mod tests { // write record batch in IPC format let mut buf = Vec::new(); { - let mut writer = ipc::writer::FileWriter::try_new(&mut buf, &schema).unwrap(); + let mut writer = + crate::writer::FileWriter::try_new(&mut buf, &schema).unwrap(); writer.write(&batch).unwrap(); writer.finish().unwrap(); } @@ -1382,15 +1387,18 @@ mod tests { ]; let batch = RecordBatch::try_new(Arc::new(schema.clone()), arrays).unwrap(); // create stream writer - let file = File::create("target/debug/testdata/float.stream").unwrap(); + let mut file = tempfile::tempfile().unwrap(); let mut stream_writer = - crate::ipc::writer::StreamWriter::try_new(file, &schema).unwrap(); + crate::writer::StreamWriter::try_new(&mut file, &schema).unwrap(); stream_writer.write(&batch).unwrap(); stream_writer.finish().unwrap(); + drop(stream_writer); + + file.rewind().unwrap(); + // read stream back - let file = File::open("target/debug/testdata/float.stream").unwrap(); - let reader = StreamReader::try_new(file, None).unwrap(); + let reader = StreamReader::try_new(&mut file, None).unwrap(); reader.for_each(|batch| { let batch = batch.unwrap(); @@ -1414,7 +1422,7 @@ mod tests { ); }); - let file = File::open("target/debug/testdata/float.stream").unwrap(); + file.rewind().unwrap(); // Read with 
projection let reader = StreamReader::try_new(file, Some(vec![0, 3])).unwrap(); @@ -1430,33 +1438,33 @@ mod tests { fn roundtrip_ipc(rb: &RecordBatch) -> RecordBatch { let mut buf = Vec::new(); let mut writer = - ipc::writer::FileWriter::try_new(&mut buf, &rb.schema()).unwrap(); + crate::writer::FileWriter::try_new(&mut buf, &rb.schema()).unwrap(); writer.write(rb).unwrap(); writer.finish().unwrap(); drop(writer); let mut reader = - ipc::reader::FileReader::try_new(std::io::Cursor::new(buf), None).unwrap(); + crate::reader::FileReader::try_new(std::io::Cursor::new(buf), None).unwrap(); reader.next().unwrap().unwrap() } fn roundtrip_ipc_stream(rb: &RecordBatch) -> RecordBatch { let mut buf = Vec::new(); let mut writer = - ipc::writer::StreamWriter::try_new(&mut buf, &rb.schema()).unwrap(); + crate::writer::StreamWriter::try_new(&mut buf, &rb.schema()).unwrap(); writer.write(rb).unwrap(); writer.finish().unwrap(); drop(writer); let mut reader = - ipc::reader::StreamReader::try_new(std::io::Cursor::new(buf), None).unwrap(); + crate::reader::StreamReader::try_new(std::io::Cursor::new(buf), None) + .unwrap(); reader.next().unwrap().unwrap() } #[test] fn test_roundtrip_nested_dict() { - let inner: DictionaryArray = - vec!["a", "b", "a"].into_iter().collect(); + let inner: DictionaryArray = vec!["a", "b", "a"].into_iter().collect(); let array = Arc::new(inner) as ArrayRef; @@ -1477,11 +1485,11 @@ mod tests { } fn check_union_with_builder(mut builder: UnionBuilder) { - builder.append::("a", 1).unwrap(); - builder.append_null::("a").unwrap(); - builder.append::("c", 3.0).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append::("d", 11).unwrap(); + builder.append::("a", 1).unwrap(); + builder.append_null::("a").unwrap(); + builder.append::("c", 3.0).unwrap(); + builder.append::("a", 4).unwrap(); + builder.append::("d", 11).unwrap(); let union = builder.build().unwrap(); let schema = Arc::new(Schema::new(vec![Field::new( @@ -1521,7 +1529,7 @@ mod tests { let dict = Arc::new( xs.clone() .into_iter() - .collect::>(), + .collect::>(), ); let string_array: ArrayRef = Arc::new(StringArray::from(xs.clone())); let struct_array = StructArray::from(vec![ diff --git a/arrow/src/ipc/writer.rs b/arrow-ipc/src/writer.rs similarity index 87% rename from arrow/src/ipc/writer.rs rename to arrow-ipc/src/writer.rs index 4f40574ab12e..44f32f0cbcf1 100644 --- a/arrow/src/ipc/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -26,21 +26,16 @@ use std::io::{BufWriter, Write}; use flatbuffers::FlatBufferBuilder; -use crate::array::{ - as_large_list_array, as_list_array, as_map_array, as_struct_array, as_union_array, - layout, make_array, Array, ArrayData, ArrayRef, BinaryArray, BufferBuilder, - BufferSpec, FixedSizeListArray, GenericBinaryArray, GenericStringArray, - LargeBinaryArray, LargeStringArray, OffsetSizeTrait, StringArray, -}; -use crate::buffer::{Buffer, MutableBuffer}; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::ipc; -use crate::record_batch::RecordBatch; -use crate::util::bit_util; - -use crate::ipc::compression::CompressionCodec; -use ipc::CONTINUATION_MARKER; +use arrow_array::builder::BufferBuilder; +use arrow_array::cast::*; +use arrow_array::*; +use arrow_buffer::bit_util; +use arrow_buffer::{Buffer, MutableBuffer}; +use arrow_data::{layout, ArrayData, BufferSpec}; +use arrow_schema::*; + +use crate::compression::CompressionCodec; +use crate::CONTINUATION_MARKER; /// IPC write options used to control the behaviour of the writer #[derive(Debug, Clone)] @@ -58,24 +53,25 @@ 
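The `IpcWriteOptions` hunk that follows turns compression support into a runtime concern rather than a compile-time one; a minimal sketch of requesting LZ4 compression (assumes the `lz4` feature of `arrow-ipc`, or `ipc_compression` on the `arrow` facade, is enabled when the batch is actually written):

```rust
use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use arrow_ipc::writer::{FileWriter, IpcWriteOptions};
use arrow_ipc::{CompressionType, MetadataVersion};
use arrow_schema::{ArrowError, DataType, Field, Schema};

fn main() -> Result<(), ArrowError> {
    // Compression needs IPC metadata V5 or later; the builder enforces this.
    // Writing will error at runtime if the corresponding codec feature is missing.
    let options = IpcWriteOptions::try_new(8, false, MetadataVersion::V5)?
        .try_with_compression(Some(CompressionType::LZ4_FRAME))?;

    let schema = Schema::new(vec![Field::new("field1", DataType::Int32, true)]);
    let values: ArrayRef = Arc::new(Int32Array::from(vec![Some(12), Some(1), None]));
    let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![values])?;

    let mut buf = Vec::new();
    let mut writer = FileWriter::try_new_with_options(&mut buf, &schema, options)?;
    writer.write(&batch)?;
    writer.finish()?;
    Ok(())
}
```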
pub struct IpcWriteOptions { /// /// version 2.0.0: V4, with legacy format enabled /// version 4.0.0: V5 - metadata_version: ipc::MetadataVersion, - /// Compression, if desired. Only supported when `ipc_compression` - /// feature is enabled - batch_compression_type: Option, + metadata_version: crate::MetadataVersion, + /// Compression, if desired. Will result in a runtime error + /// if the corresponding feature is not enabled + batch_compression_type: Option, } impl IpcWriteOptions { - /// Configures compression when writing IPC files. Requires the - /// `ipc_compression` feature of the crate to be activated. - #[cfg(feature = "ipc_compression")] + /// Configures compression when writing IPC files. + /// + /// Will result in a runtime error if the corresponding feature + /// is not enabled pub fn try_with_compression( mut self, - batch_compression_type: Option, - ) -> Result { + batch_compression_type: Option, + ) -> Result { self.batch_compression_type = batch_compression_type; if self.batch_compression_type.is_some() - && self.metadata_version < ipc::MetadataVersion::V5 + && self.metadata_version < crate::MetadataVersion::V5 { return Err(ArrowError::InvalidArgumentError( "Compression only supported in metadata v5 and above".to_string(), @@ -87,26 +83,26 @@ impl IpcWriteOptions { pub fn try_new( alignment: usize, write_legacy_ipc_format: bool, - metadata_version: ipc::MetadataVersion, - ) -> Result { + metadata_version: crate::MetadataVersion, + ) -> Result { if alignment == 0 || alignment % 8 != 0 { return Err(ArrowError::InvalidArgumentError( "Alignment should be greater than 0 and be a multiple of 8".to_string(), )); } match metadata_version { - ipc::MetadataVersion::V1 - | ipc::MetadataVersion::V2 - | ipc::MetadataVersion::V3 => Err(ArrowError::InvalidArgumentError( + crate::MetadataVersion::V1 + | crate::MetadataVersion::V2 + | crate::MetadataVersion::V3 => Err(ArrowError::InvalidArgumentError( "Writing IPC metadata version 3 and lower not supported".to_string(), )), - ipc::MetadataVersion::V4 => Ok(Self { + crate::MetadataVersion::V4 => Ok(Self { alignment, write_legacy_ipc_format, metadata_version, batch_compression_type: None, }), - ipc::MetadataVersion::V5 => { + crate::MetadataVersion::V5 => { if write_legacy_ipc_format { Err(ArrowError::InvalidArgumentError( "Legacy IPC format only supported on metadata version 4" @@ -122,7 +118,7 @@ impl IpcWriteOptions { } } z => Err(ArrowError::InvalidArgumentError(format!( - "Unsupported ipc::MetadataVersion {:?}", + "Unsupported crate::MetadataVersion {:?}", z ))), } @@ -134,7 +130,7 @@ impl Default for IpcWriteOptions { Self { alignment: 64, write_legacy_ipc_format: false, - metadata_version: ipc::MetadataVersion::V5, + metadata_version: crate::MetadataVersion::V5, batch_compression_type: None, } } @@ -151,13 +147,13 @@ impl IpcDataGenerator { ) -> EncodedData { let mut fbb = FlatBufferBuilder::new(); let schema = { - let fb = ipc::convert::schema_to_fb_offset(&mut fbb, schema); + let fb = crate::convert::schema_to_fb_offset(&mut fbb, schema); fb.as_union_value() }; - let mut message = ipc::MessageBuilder::new(&mut fbb); + let mut message = crate::MessageBuilder::new(&mut fbb); message.add_version(write_options.metadata_version); - message.add_header_type(ipc::MessageHeader::Schema); + message.add_header_type(crate::MessageHeader::Schema); message.add_bodyLength(0); message.add_header(schema); // TODO: custom metadata @@ -177,7 +173,7 @@ impl IpcDataGenerator { encoded_dictionaries: &mut Vec, dictionary_tracker: &mut DictionaryTracker, 
write_options: &IpcWriteOptions, - ) -> Result<()> { + ) -> Result<(), ArrowError> { match column.data_type() { DataType::Struct(fields) => { let s = as_struct_array(column); @@ -281,7 +277,7 @@ impl IpcDataGenerator { encoded_dictionaries: &mut Vec, dictionary_tracker: &mut DictionaryTracker, write_options: &IpcWriteOptions, - ) -> Result<()> { + ) -> Result<(), ArrowError> { match column.data_type() { DataType::Dictionary(_key_type, _value_type) => { let dict_id = field @@ -325,7 +321,7 @@ impl IpcDataGenerator { batch: &RecordBatch, dictionary_tracker: &mut DictionaryTracker, write_options: &IpcWriteOptions, - ) -> Result<(Vec, EncodedData)> { + ) -> Result<(Vec, EncodedData), ArrowError> { let schema = batch.schema(); let mut encoded_dictionaries = Vec::with_capacity(schema.all_fields().len()); @@ -344,17 +340,17 @@ impl IpcDataGenerator { Ok((encoded_dictionaries, encoded_message)) } - /// Write a `RecordBatch` into two sets of bytes, one for the header (ipc::Message) and the + /// Write a `RecordBatch` into two sets of bytes, one for the header (crate::Message) and the /// other for the batch's data fn record_batch_to_bytes( &self, batch: &RecordBatch, write_options: &IpcWriteOptions, - ) -> Result { + ) -> Result { let mut fbb = FlatBufferBuilder::new(); - let mut nodes: Vec = vec![]; - let mut buffers: Vec = vec![]; + let mut nodes: Vec = vec![]; + let mut buffers: Vec = vec![]; let mut arrow_data: Vec = vec![]; let mut offset = 0; @@ -362,8 +358,8 @@ impl IpcDataGenerator { let batch_compression_type = write_options.batch_compression_type; let compression = batch_compression_type.map(|batch_compression_type| { - let mut c = ipc::BodyCompressionBuilder::new(&mut fbb); - c.add_method(ipc::BodyCompressionMethod::BUFFER); + let mut c = crate::BodyCompressionBuilder::new(&mut fbb); + c.add_method(crate::BodyCompressionMethod::BUFFER); c.add_codec(batch_compression_type); c.finish() }); @@ -394,7 +390,7 @@ impl IpcDataGenerator { let buffers = fbb.create_vector(&buffers); let nodes = fbb.create_vector(&nodes); let root = { - let mut batch_builder = ipc::RecordBatchBuilder::new(&mut fbb); + let mut batch_builder = crate::RecordBatchBuilder::new(&mut fbb); batch_builder.add_length(batch.num_rows() as i64); batch_builder.add_nodes(nodes); batch_builder.add_buffers(buffers); @@ -404,10 +400,10 @@ impl IpcDataGenerator { let b = batch_builder.finish(); b.as_union_value() }; - // create an ipc::Message - let mut message = ipc::MessageBuilder::new(&mut fbb); + // create an crate::Message + let mut message = crate::MessageBuilder::new(&mut fbb); message.add_version(write_options.metadata_version); - message.add_header_type(ipc::MessageHeader::RecordBatch); + message.add_header_type(crate::MessageHeader::RecordBatch); message.add_bodyLength(arrow_data.len() as i64); message.add_header(root); let root = message.finish(); @@ -420,26 +416,26 @@ impl IpcDataGenerator { }) } - /// Write dictionary values into two sets of bytes, one for the header (ipc::Message) and the + /// Write dictionary values into two sets of bytes, one for the header (crate::Message) and the /// other for the data fn dictionary_batch_to_bytes( &self, dict_id: i64, array_data: &ArrayData, write_options: &IpcWriteOptions, - ) -> Result { + ) -> Result { let mut fbb = FlatBufferBuilder::new(); - let mut nodes: Vec = vec![]; - let mut buffers: Vec = vec![]; + let mut nodes: Vec = vec![]; + let mut buffers: Vec = vec![]; let mut arrow_data: Vec = vec![]; // get the type of compression let batch_compression_type = 
write_options.batch_compression_type; let compression = batch_compression_type.map(|batch_compression_type| { - let mut c = ipc::BodyCompressionBuilder::new(&mut fbb); - c.add_method(ipc::BodyCompressionMethod::BUFFER); + let mut c = crate::BodyCompressionBuilder::new(&mut fbb); + c.add_method(crate::BodyCompressionMethod::BUFFER); c.add_codec(batch_compression_type); c.finish() }); @@ -470,7 +466,7 @@ impl IpcDataGenerator { let nodes = fbb.create_vector(&nodes); let root = { - let mut batch_builder = ipc::RecordBatchBuilder::new(&mut fbb); + let mut batch_builder = crate::RecordBatchBuilder::new(&mut fbb); batch_builder.add_length(array_data.len() as i64); batch_builder.add_nodes(nodes); batch_builder.add_buffers(buffers); @@ -481,16 +477,16 @@ impl IpcDataGenerator { }; let root = { - let mut batch_builder = ipc::DictionaryBatchBuilder::new(&mut fbb); + let mut batch_builder = crate::DictionaryBatchBuilder::new(&mut fbb); batch_builder.add_id(dict_id); batch_builder.add_data(root); batch_builder.finish().as_union_value() }; let root = { - let mut message_builder = ipc::MessageBuilder::new(&mut fbb); + let mut message_builder = crate::MessageBuilder::new(&mut fbb); message_builder.add_version(write_options.metadata_version); - message_builder.add_header_type(ipc::MessageHeader::DictionaryBatch); + message_builder.add_header_type(crate::MessageHeader::DictionaryBatch); message_builder.add_bodyLength(arrow_data.len() as i64); message_builder.add_header(root); message_builder.finish() @@ -531,7 +527,11 @@ impl DictionaryTracker { /// * If the tracker has not been configured to error on replacement or this dictionary /// has never been seen before, return `Ok(true)` to indicate that the dictionary was just /// inserted. - pub fn insert(&mut self, dict_id: i64, column: &ArrayRef) -> Result { + pub fn insert( + &mut self, + dict_id: i64, + column: &ArrayRef, + ) -> Result { let dict_data = column.data(); let dict_values = &dict_data.child_data()[0]; @@ -565,9 +565,9 @@ pub struct FileWriter { /// The number of bytes between each block of bytes, as an offset for random access block_offsets: usize, /// Dictionary blocks that will be written as part of the IPC footer - dictionary_blocks: Vec, + dictionary_blocks: Vec, /// Record blocks that will be written as part of the IPC footer - record_blocks: Vec, + record_blocks: Vec, /// Whether the writer footer has been written, and the writer is finished finished: bool, /// Keeps track of dictionaries that have been written @@ -578,7 +578,7 @@ pub struct FileWriter { impl FileWriter { /// Try create a new writer, with the schema written as part of the header - pub fn try_new(writer: W, schema: &Schema) -> Result { + pub fn try_new(writer: W, schema: &Schema) -> Result { let write_options = IpcWriteOptions::default(); Self::try_new_with_options(writer, schema, write_options) } @@ -588,7 +588,7 @@ impl FileWriter { writer: W, schema: &Schema, write_options: IpcWriteOptions, - ) -> Result { + ) -> Result { let data_gen = IpcDataGenerator::default(); let mut writer = BufWriter::new(writer); // write magic to header aligned on 8 byte boundary @@ -613,7 +613,7 @@ impl FileWriter { } /// Write a record batch to the file - pub fn write(&mut self, batch: &RecordBatch) -> Result<()> { + pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { if self.finished { return Err(ArrowError::IoError( "Cannot write record batch to file writer as it is closed".to_string(), @@ -631,7 +631,7 @@ impl FileWriter { write_message(&mut self.writer, 
encoded_dictionary, &self.write_options)?; let block = - ipc::Block::new(self.block_offsets as i64, meta as i32, data as i64); + crate::Block::new(self.block_offsets as i64, meta as i32, data as i64); self.dictionary_blocks.push(block); self.block_offsets += meta + data; } @@ -639,7 +639,7 @@ impl FileWriter { let (meta, data) = write_message(&mut self.writer, encoded_message, &self.write_options)?; // add a record block for the footer - let block = ipc::Block::new( + let block = crate::Block::new( self.block_offsets as i64, meta as i32, // TODO: is this still applicable? data as i64, @@ -650,7 +650,7 @@ impl FileWriter { } /// Write footer and closing tag, then mark the writer as done - pub fn finish(&mut self) -> Result<()> { + pub fn finish(&mut self) -> Result<(), ArrowError> { if self.finished { return Err(ArrowError::IoError( "Cannot write footer to file writer as it is closed".to_string(), @@ -663,10 +663,10 @@ impl FileWriter { let mut fbb = FlatBufferBuilder::new(); let dictionaries = fbb.create_vector(&self.dictionary_blocks); let record_batches = fbb.create_vector(&self.record_blocks); - let schema = ipc::convert::schema_to_fb_offset(&mut fbb, &self.schema); + let schema = crate::convert::schema_to_fb_offset(&mut fbb, &self.schema); let root = { - let mut footer_builder = ipc::FooterBuilder::new(&mut fbb); + let mut footer_builder = crate::FooterBuilder::new(&mut fbb); footer_builder.add_version(self.write_options.metadata_version); footer_builder.add_schema(schema); footer_builder.add_dictionaries(dictionaries); @@ -690,7 +690,7 @@ impl FileWriter { /// /// The buffer is flushed and the FileWriter is finished before returning the /// writer. - pub fn into_inner(mut self) -> Result { + pub fn into_inner(mut self) -> Result { if !self.finished { self.finish()?; } @@ -713,7 +713,7 @@ pub struct StreamWriter { impl StreamWriter { /// Try create a new writer, with the schema written as part of the header - pub fn try_new(writer: W, schema: &Schema) -> Result { + pub fn try_new(writer: W, schema: &Schema) -> Result { let write_options = IpcWriteOptions::default(); Self::try_new_with_options(writer, schema, write_options) } @@ -722,7 +722,7 @@ impl StreamWriter { writer: W, schema: &Schema, write_options: IpcWriteOptions, - ) -> Result { + ) -> Result { let data_gen = IpcDataGenerator::default(); let mut writer = BufWriter::new(writer); // write the schema, set the written bytes to the schema @@ -738,7 +738,7 @@ impl StreamWriter { } /// Write a record batch to the stream - pub fn write(&mut self, batch: &RecordBatch) -> Result<()> { + pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { if self.finished { return Err(ArrowError::IoError( "Cannot write record batch to stream writer as it is closed".to_string(), @@ -759,7 +759,7 @@ impl StreamWriter { } /// Write continuation bytes, and mark the stream as done - pub fn finish(&mut self) -> Result<()> { + pub fn finish(&mut self) -> Result<(), ArrowError> { if self.finished { return Err(ArrowError::IoError( "Cannot write footer to stream writer as it is closed".to_string(), @@ -787,10 +787,9 @@ impl StreamWriter { /// # Example /// /// ``` - /// # use arrow::datatypes::Schema; - /// # use arrow::ipc::writer::{StreamWriter, IpcWriteOptions}; - /// # use arrow::ipc::MetadataVersion; - /// # use arrow::error::ArrowError; + /// # use arrow_ipc::writer::{StreamWriter, IpcWriteOptions}; + /// # use arrow_ipc::MetadataVersion; + /// # use arrow_schema::{ArrowError, Schema}; /// # fn main() -> Result<(), ArrowError> { /// // 
The result we expect from an empty schema /// let expected = vec![ @@ -815,7 +814,7 @@ impl StreamWriter { /// # Ok(()) /// # } /// ``` - pub fn into_inner(mut self) -> Result { + pub fn into_inner(mut self) -> Result { if !self.finished { self.finish()?; } @@ -823,9 +822,9 @@ impl StreamWriter { } } -/// Stores the encoded data, which is an ipc::Message, and optional Arrow data +/// Stores the encoded data, which is an crate::Message, and optional Arrow data pub struct EncodedData { - /// An encoded ipc::Message + /// An encoded crate::Message pub ipc_message: Vec, /// Arrow buffers to be written, should be an empty vec for schema messages pub arrow_data: Vec, @@ -835,7 +834,7 @@ pub fn write_message( mut writer: W, encoded: EncodedData, write_options: &IpcWriteOptions, -) -> Result<(usize, usize)> { +) -> Result<(usize, usize), ArrowError> { let arrow_data_len = encoded.arrow_data.len(); if arrow_data_len % 8 != 0 { return Err(ArrowError::MemoryError( @@ -877,7 +876,7 @@ pub fn write_message( Ok((aligned_size, body_len)) } -fn write_body_buffers(mut writer: W, data: &[u8]) -> Result { +fn write_body_buffers(mut writer: W, data: &[u8]) -> Result { let len = data.len() as u32; let pad_len = pad_to_8(len) as u32; let total_len = len + pad_len; @@ -898,17 +897,17 @@ fn write_continuation( mut writer: W, write_options: &IpcWriteOptions, total_len: i32, -) -> Result { +) -> Result { let mut written = 8; // the version of the writer determines whether continuation markers should be added match write_options.metadata_version { - ipc::MetadataVersion::V1 - | ipc::MetadataVersion::V2 - | ipc::MetadataVersion::V3 => { + crate::MetadataVersion::V1 + | crate::MetadataVersion::V2 + | crate::MetadataVersion::V3 => { unreachable!("Options with the metadata version cannot be created") } - ipc::MetadataVersion::V4 => { + crate::MetadataVersion::V4 => { if !write_options.write_legacy_ipc_format { // v0.15.0 format writer.write_all(&CONTINUATION_MARKER)?; @@ -916,12 +915,12 @@ fn write_continuation( } writer.write_all(&total_len.to_le_bytes()[..])?; } - ipc::MetadataVersion::V5 => { + crate::MetadataVersion::V5 => { // write continuation marker and message length writer.write_all(&CONTINUATION_MARKER)?; writer.write_all(&total_len.to_le_bytes()[..])?; } - z => panic!("Unsupported ipc::MetadataVersion {:?}", z), + z => panic!("Unsupported crate::MetadataVersion {:?}", z), }; writer.flush()?; @@ -932,7 +931,7 @@ fn write_continuation( /// In V4, null types have no validity bitmap /// In V5 and later, null and union types have no validity bitmap fn has_validity_bitmap(data_type: &DataType, write_options: &IpcWriteOptions) -> bool { - if write_options.metadata_version < ipc::MetadataVersion::V5 { + if write_options.metadata_version < crate::MetadataVersion::V5 { !matches!(data_type, DataType::Null) } else { !matches!(data_type, DataType::Null | DataType::Union(_, _, _)) @@ -1053,22 +1052,22 @@ fn get_buffer_offset(array_data: &ArrayData) -> Off #[allow(clippy::too_many_arguments)] fn write_array_data( array_data: &ArrayData, - buffers: &mut Vec, + buffers: &mut Vec, arrow_data: &mut Vec, - nodes: &mut Vec, + nodes: &mut Vec, offset: i64, num_rows: usize, null_count: usize, compression_codec: &Option, write_options: &IpcWriteOptions, -) -> Result { +) -> Result { let mut offset = offset; if !matches!(array_data.data_type(), DataType::Null) { - nodes.push(ipc::FieldNode::new(num_rows as i64, null_count as i64)); + nodes.push(crate::FieldNode::new(num_rows as i64, null_count as i64)); } else { // NullArray's 
null_count equals to len, but the `null_count` passed in is from ArrayData // where null_count is always 0. - nodes.push(ipc::FieldNode::new(num_rows as i64, num_rows as i64)); + nodes.push(crate::FieldNode::new(num_rows as i64, num_rows as i64)); } if has_validity_bitmap(array_data.data_type(), write_options) { // write null buffer if exists @@ -1219,7 +1218,7 @@ fn write_array_data( } /// Write a buffer into `arrow_data`, a vector of bytes, and adds its -/// [`ipc::Buffer`] to `buffers`. Returns the new offset in `arrow_data` +/// [`crate::Buffer`] to `buffers`. Returns the new offset in `arrow_data` /// /// /// From @@ -1231,12 +1230,12 @@ fn write_array_data( /// follows is not compressed, which can be useful for cases where /// compression does not yield appreciable savings. fn write_buffer( - buffer: &[u8], // input - buffers: &mut Vec, // output buffer descriptors - arrow_data: &mut Vec, // output stream - offset: i64, // current output stream offset + buffer: &[u8], // input + buffers: &mut Vec, // output buffer descriptors + arrow_data: &mut Vec, // output stream + offset: i64, // current output stream offset compression_codec: &Option, -) -> Result { +) -> Result { let len: i64 = match compression_codec { Some(compressor) => compressor.compress_to_vec(buffer, arrow_data)?, None => { @@ -1253,7 +1252,7 @@ fn write_buffer( })?; // make new index entry - buffers.push(ipc::Buffer::new(offset, len)); + buffers.push(crate::Buffer::new(offset, len)); // padding and make offset 8 bytes aligned let pad_len = pad_to_8(len as u32) as i64; arrow_data.extend_from_slice(&vec![0u8; pad_len as usize][..]); @@ -1271,18 +1270,18 @@ fn pad_to_8(len: u32) -> usize { mod tests { use super::*; - use std::fs::File; use std::io::Seek; use std::sync::Arc; - use ipc::MetadataVersion; + use crate::MetadataVersion; - use crate::array::*; - use crate::datatypes::Field; - use crate::ipc::reader::*; + use crate::reader::*; + use arrow_array::builder::UnionBuilder; + use arrow_array::types::*; + use arrow_schema::DataType; #[test] - #[cfg(feature = "ipc_compression")] + #[cfg(feature = "lz4")] fn test_write_empty_record_batch_lz4_compression() { let schema = Schema::new(vec![Field::new("field1", DataType::Int32, true)]); let values: Vec> = vec![]; @@ -1295,9 +1294,9 @@ mod tests { { let write_option = - IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5) + IpcWriteOptions::try_new(8, false, crate::MetadataVersion::V5) .unwrap() - .try_with_compression(Some(ipc::CompressionType::LZ4_FRAME)) + .try_with_compression(Some(crate::CompressionType::LZ4_FRAME)) .unwrap(); let mut writer = @@ -1335,7 +1334,7 @@ mod tests { } #[test] - #[cfg(feature = "ipc_compression")] + #[cfg(feature = "lz4")] fn test_write_file_with_lz4_compression() { let schema = Schema::new(vec![Field::new("field1", DataType::Int32, true)]); let values: Vec> = vec![Some(12), Some(1)]; @@ -1347,9 +1346,9 @@ mod tests { let mut file = tempfile::tempfile().unwrap(); { let write_option = - IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5) + IpcWriteOptions::try_new(8, false, crate::MetadataVersion::V5) .unwrap() - .try_with_compression(Some(ipc::CompressionType::LZ4_FRAME)) + .try_with_compression(Some(crate::CompressionType::LZ4_FRAME)) .unwrap(); let mut writer = @@ -1387,7 +1386,7 @@ mod tests { } #[test] - #[cfg(feature = "ipc_compression")] + #[cfg(feature = "zstd")] fn test_write_file_with_zstd_compression() { let schema = Schema::new(vec![Field::new("field1", DataType::Int32, true)]); let values: Vec> = vec![Some(12), 
Some(1)]; @@ -1398,9 +1397,9 @@ mod tests { let mut file = tempfile::tempfile().unwrap(); { let write_option = - IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5) + IpcWriteOptions::try_new(8, false, crate::MetadataVersion::V5) .unwrap() - .try_with_compression(Some(ipc::CompressionType::ZSTD)) + .try_with_compression(Some(crate::CompressionType::ZSTD)) .unwrap(); let mut writer = @@ -1482,7 +1481,7 @@ mod tests { } } - fn write_null_file(options: IpcWriteOptions, suffix: &str) { + fn write_null_file(options: IpcWriteOptions) { let schema = Schema::new(vec![ Field::new("nulls", DataType::Null, true), Field::new("int32s", DataType::Int32, false), @@ -1503,18 +1502,18 @@ mod tests { ], ) .unwrap(); - let file_name = format!("target/debug/testdata/nulls_{}.arrow_file", suffix); + let mut file = tempfile::tempfile().unwrap(); { - let file = File::create(&file_name).unwrap(); let mut writer = - FileWriter::try_new_with_options(file, &schema, options).unwrap(); + FileWriter::try_new_with_options(&mut file, &schema, options).unwrap(); writer.write(&batch).unwrap(); writer.finish().unwrap(); } + file.rewind().unwrap(); + { - let file = File::open(&file_name).unwrap(); let reader = FileReader::try_new(file, None).unwrap(); reader.for_each(|maybe_batch| { maybe_batch @@ -1532,33 +1531,19 @@ mod tests { } #[test] fn test_write_null_file_v4() { - write_null_file( - IpcWriteOptions::try_new(8, false, MetadataVersion::V4).unwrap(), - "v4_a8", - ); - write_null_file( - IpcWriteOptions::try_new(8, true, MetadataVersion::V4).unwrap(), - "v4_a8l", - ); + write_null_file(IpcWriteOptions::try_new(8, false, MetadataVersion::V4).unwrap()); + write_null_file(IpcWriteOptions::try_new(8, true, MetadataVersion::V4).unwrap()); write_null_file( IpcWriteOptions::try_new(64, false, MetadataVersion::V4).unwrap(), - "v4_a64", - ); - write_null_file( - IpcWriteOptions::try_new(64, true, MetadataVersion::V4).unwrap(), - "v4_a64l", ); + write_null_file(IpcWriteOptions::try_new(64, true, MetadataVersion::V4).unwrap()); } #[test] fn test_write_null_file_v5() { - write_null_file( - IpcWriteOptions::try_new(8, false, MetadataVersion::V5).unwrap(), - "v5_a8", - ); + write_null_file(IpcWriteOptions::try_new(8, false, MetadataVersion::V5).unwrap()); write_null_file( IpcWriteOptions::try_new(64, false, MetadataVersion::V5).unwrap(), - "v5_a64", ); } @@ -1626,45 +1611,6 @@ mod tests { assert!(dict_tracker.written.contains_key(&2)); } - #[test] - fn read_union_017() { - let testdata = crate::util::test_util::arrow_test_data(); - let data_file = File::open(format!( - "{}/arrow-ipc-stream/integration/0.17.1/generated_union.stream", - testdata, - )) - .unwrap(); - - let reader = StreamReader::try_new(data_file, None).unwrap(); - - let mut file = tempfile::tempfile().unwrap(); - // read and rewrite the stream to a temp location - { - let mut writer = StreamWriter::try_new(&mut file, &reader.schema()).unwrap(); - reader.for_each(|batch| { - writer.write(&batch.unwrap()).unwrap(); - }); - writer.finish().unwrap(); - } - file.rewind().unwrap(); - - // Compare original file and rewrote file - let rewrite_reader = StreamReader::try_new(file, None).unwrap(); - - let data_file = File::open(format!( - "{}/arrow-ipc-stream/integration/0.17.1/generated_union.stream", - testdata, - )) - .unwrap(); - let reader = StreamReader::try_new(data_file, None).unwrap(); - - reader.into_iter().zip(rewrite_reader.into_iter()).for_each( - |(batch1, batch2)| { - assert_eq!(batch1.unwrap(), batch2.unwrap()); - }, - ); - } - fn write_union_file(options: 
IpcWriteOptions) { let schema = Schema::new(vec![Field::new( "union", @@ -1739,7 +1685,7 @@ mod tests { fn deserialize(bytes: Vec) -> RecordBatch { let mut stream_reader = - ipc::reader::StreamReader::try_new(std::io::Cursor::new(bytes), None) + crate::reader::StreamReader::try_new(std::io::Cursor::new(bytes), None) .unwrap(); stream_reader.next().unwrap().unwrap() } diff --git a/arrow/CONTRIBUTING.md b/arrow/CONTRIBUTING.md index bbf309d4d225..5b84bc2d3bdb 100644 --- a/arrow/CONTRIBUTING.md +++ b/arrow/CONTRIBUTING.md @@ -26,23 +26,6 @@ Rust [README.md](../README.md). Please refer to [lib.rs](src/lib.rs) for an introduction to this specific crate and its current functionality. -## IPC - -The expected flatc version is 1.12.0+, built from [flatbuffers](https://github.com/google/flatbuffers) -master at fixed commit ID, by regen.sh. - -The IPC flatbuffer code was generated by running this command from the root of the project: - -```bash -./regen.sh -``` - -The above script will run the `flatc` compiler and perform some adjustments to the source code: - -- Replace `type__` with `type_` -- Remove `org::apache::arrow::flatbuffers` namespace -- Add includes to each generated file - ## Guidelines in usage of `unsafe` [`unsafe`](https://doc.rust-lang.org/book/ch19-01-unsafe-rust.html) has a high maintenance cost because debugging and testing it is difficult, time consuming, often requires external tools (e.g. `valgrind`), and requires a higher-than-usual attention to details. Undefined behavior is particularly difficult to identify and test, and usage of `unsafe` is the [primary cause of undefined behavior](https://doc.rust-lang.org/reference/behavior-considered-undefined.html) in a program written in Rust. diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 5749f6799874..6c30df6bd27d 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -50,6 +50,7 @@ arrow-data = { version = "26.0.0", path = "../arrow-data" } arrow-schema = { version = "26.0.0", path = "../arrow-schema" } arrow-array = { version = "26.0.0", path = "../arrow-array" } arrow-select = { version = "26.0.0", path = "../arrow-select" } +arrow-ipc = { version = "26.0.0", path = "../arrow-ipc", optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } indexmap = { version = "1.9", default-features = false, features = ["std"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } @@ -60,25 +61,22 @@ csv_crate = { version = "1.1", default-features = false, optional = true, packag regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } lazy_static = { version = "1.4", default-features = false } -lz4 = { version = "1.23", default-features = false, optional = true } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } chrono = { version = "0.4", default-features = false, features = ["clock"] } -flatbuffers = { version = "22.9.2", default-features = false, features = ["thiserror"], optional = true } comfy-table = { version = "6.0", optional = true, default-features = false } pyo3 = { version = "0.17", default-features = false, optional = true } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } multiversion = { version = "0.6.1", default-features = false } bitflags = { version = 
"1.2.1", default-features = false } -zstd = { version = "0.11.1", default-features = false, optional = true } [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "dyn_cmp_dict", "ffi", "pyarrow"] [features] default = ["csv", "ipc", "json"] -ipc_compression = ["ipc", "zstd", "lz4"] +ipc_compression = ["ipc", "arrow-ipc/lz4", "arrow-ipc/zstd"] csv = ["csv_crate"] -ipc = ["flatbuffers"] +ipc = ["arrow-ipc"] json = ["serde_json"] simd = ["packed_simd"] prettyprint = ["comfy-table"] @@ -265,3 +263,7 @@ required-features = ["test_utils"] name = "lexsort" harness = false required-features = ["test_utils"] + +[[test]] +name = "ipc_integration" +required-features = ["test_utils", "ipc"] diff --git a/arrow/src/ipc/compression/mod.rs b/arrow/src/ipc/compression/mod.rs deleted file mode 100644 index 666fa6d86a27..000000000000 --- a/arrow/src/ipc/compression/mod.rs +++ /dev/null @@ -1,26 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#[cfg(feature = "ipc_compression")] -mod codec; -#[cfg(feature = "ipc_compression")] -pub(crate) use codec::CompressionCodec; - -#[cfg(not(feature = "ipc_compression"))] -mod stub; -#[cfg(not(feature = "ipc_compression"))] -pub(crate) use stub::CompressionCodec; diff --git a/arrow/src/ipc/compression/stub.rs b/arrow/src/ipc/compression/stub.rs deleted file mode 100644 index 6240f084be3f..000000000000 --- a/arrow/src/ipc/compression/stub.rs +++ /dev/null @@ -1,63 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Stubs that implement the same interface as the ipc_compression -//! codec module, but always errors. 
- -use crate::buffer::Buffer; -use crate::error::{ArrowError, Result}; -use crate::ipc::CompressionType; - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum CompressionCodec {} - -impl TryFrom for CompressionType { - type Error = ArrowError; - fn try_from(codec: CompressionCodec) -> Result { - Err(ArrowError::InvalidArgumentError( - format!("codec type {:?} not supported because arrow was not compiled with the ipc_compression feature", codec))) - } -} - -impl TryFrom for CompressionCodec { - type Error = ArrowError; - - fn try_from(compression_type: CompressionType) -> Result { - Err(ArrowError::InvalidArgumentError( - format!("compression type {:?} not supported because arrow was not compiled with the ipc_compression feature", compression_type)) - ) - } -} - -impl CompressionCodec { - #[allow(clippy::ptr_arg)] - pub(crate) fn compress_to_vec( - &self, - _input: &[u8], - _output: &mut Vec, - ) -> Result { - Err(ArrowError::InvalidArgumentError( - "compression not supported because arrow was not compiled with the ipc_compression feature".to_string() - )) - } - - pub(crate) fn decompress_to_buffer(&self, _input: &[u8]) -> Result { - Err(ArrowError::InvalidArgumentError( - "decompression not supported because arrow was not compiled with the ipc_compression feature".to_string() - )) - } -} diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 0081856f3d68..b2fa30d26d53 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -314,35 +314,14 @@ pub mod ffi; #[cfg(feature = "ffi")] pub mod ffi_stream; #[cfg(feature = "ipc")] -pub mod ipc; +pub use arrow_ipc as ipc; #[cfg(feature = "serde_json")] pub mod json; #[cfg(feature = "pyarrow")] pub mod pyarrow; pub mod record_batch { - pub use arrow_array::{RecordBatch, RecordBatchOptions}; - use arrow_schema::{ArrowError, SchemaRef}; - - /// Trait for types that can read `RecordBatch`'s. - pub trait RecordBatchReader: - Iterator> - { - /// Returns the schema of this `RecordBatchReader`. - /// - /// Implementation of this trait should guarantee that all `RecordBatch`'s returned by this - /// reader should have the same schema as returned from this method. - fn schema(&self) -> SchemaRef; - - /// Reads the next `RecordBatch`. - #[deprecated( - since = "2.0.0", - note = "This method is deprecated in favour of `next` from the trait Iterator." - )] - fn next_batch(&mut self) -> Result, ArrowError> { - self.next().transpose() - } - } + pub use arrow_array::{RecordBatch, RecordBatchOptions, RecordBatchReader}; } pub mod row; pub use arrow_array::temporal_conversions; diff --git a/arrow/tests/ipc_integration.rs b/arrow/tests/ipc_integration.rs new file mode 100644 index 000000000000..abaa238ba5c6 --- /dev/null +++ b/arrow/tests/ipc_integration.rs @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_ipc::reader::StreamReader; +use arrow_ipc::writer::StreamWriter; +use std::fs::File; +use std::io::Seek; + +#[test] +fn read_union_017() { + let testdata = arrow::util::test_util::arrow_test_data(); + let data_file = File::open(format!( + "{}/arrow-ipc-stream/integration/0.17.1/generated_union.stream", + testdata, + )) + .unwrap(); + + let reader = StreamReader::try_new(data_file, None).unwrap(); + + let mut file = tempfile::tempfile().unwrap(); + // read and rewrite the stream to a temp location + { + let mut writer = StreamWriter::try_new(&mut file, &reader.schema()).unwrap(); + reader.for_each(|batch| { + writer.write(&batch.unwrap()).unwrap(); + }); + writer.finish().unwrap(); + } + file.rewind().unwrap(); + + // Compare original file and rewrote file + let rewrite_reader = StreamReader::try_new(file, None).unwrap(); + + let data_file = File::open(format!( + "{}/arrow-ipc-stream/integration/0.17.1/generated_union.stream", + testdata, + )) + .unwrap(); + let reader = StreamReader::try_new(data_file, None).unwrap(); + + reader + .into_iter() + .zip(rewrite_reader.into_iter()) + .for_each(|(batch1, batch2)| { + assert_eq!(batch1.unwrap(), batch2.unwrap()); + }); +} diff --git a/dev/release/README.md b/dev/release/README.md index a12e07f8ed34..093e1c4c29f3 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -257,6 +257,8 @@ Rust Arrow Crates: (cd arrow-data && cargo publish) (cd arrow-array && cargo publish) (cd arrow-select && cargo publish) +(cd arrow-cast && cargo publish) +(cd arrow-ipc && cargo publish) (cd arrow && cargo publish) (cd arrow-flight && cargo publish) (cd parquet && cargo publish) From 12f0ef4ac424e035e480a140088914e631607cae Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 7 Nov 2022 11:56:10 +1300 Subject: [PATCH 0228/1411] Fix nullif when existing array has no nulls (#3034) --- arrow/src/compute/kernels/boolean.rs | 32 ++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/arrow/src/compute/kernels/boolean.rs b/arrow/src/compute/kernels/boolean.rs index ea3b49e8cc03..dee5d0d1b3ba 100644 --- a/arrow/src/compute/kernels/boolean.rs +++ b/arrow/src/compute/kernels/boolean.rs @@ -472,7 +472,7 @@ pub fn is_not_null(input: &dyn Array) -> Result { } /// Copies original array, setting validity bit to false if a secondary comparison -/// boolean array is set to true or null +/// boolean array is set to true /// /// Typically used to implement NULLIF. 
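A standalone sketch of the corrected behaviour, mirroring the `test_nullif_no_nulls` test added at the end of this hunk (values are illustrative only):

```rust
use arrow::array::{as_primitive_array, BooleanArray, Int32Array};
use arrow::compute::kernels::boolean::nullif;
use arrow::datatypes::Int32Type;

fn main() {
    // `a` has no nulls; only entries of `comp` that are true become null in the output
    let a = Int32Array::from(vec![Some(15), Some(7), Some(8), Some(1), Some(9)]);
    let comp =
        BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]);

    let result = nullif(&a, &comp).unwrap();
    let result = as_primitive_array::<Int32Type>(result.as_ref());

    let expected = Int32Array::from(vec![Some(15), Some(7), None, Some(1), Some(9)]);
    assert_eq!(result, &expected);
}
```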
pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { @@ -522,11 +522,19 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { t }) } - None => bitwise_unary_op_helper(&right, right_offset, len, |b| { - let t = !b; - valid_count += t.count_ones() as usize; - t - }), + None => { + let buffer = bitwise_unary_op_helper(&right, right_offset, len, |b| { + let t = !b; + valid_count += t.count_ones() as usize; + t + }); + // We need to compensate for the additional bits read from the end + let remainder_len = len % 64; + if remainder_len != 0 { + valid_count -= 64 - remainder_len + } + buffer + } }; // Need to construct null buffer with offset of left @@ -1411,4 +1419,16 @@ mod tests { assert_eq!(&expected, res); } + + #[test] + fn test_nullif_no_nulls() { + let a = Int32Array::from(vec![Some(15), Some(7), Some(8), Some(1), Some(9)]); + let comp = + BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); + let res = nullif(&a, &comp).unwrap(); + let res = as_primitive_array::(res.as_ref()); + + let expected = Int32Array::from(vec![Some(15), Some(7), None, Some(1), Some(9)]); + assert_eq!(res, &expected); + } } From 6dd9dae1cea7618a7e136285e7927e4d802ec058 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 6 Nov 2022 16:05:16 -0800 Subject: [PATCH 0229/1411] Check overflow when casting floating point value to decimal256 (#3033) * Check overflow when casting floating point value to decimal256 * Add from_f64 --- arrow-buffer/src/bigint.rs | 15 +++++++++- arrow-cast/src/cast.rs | 59 +++++++++++++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index e87c05826fe2..8dd57d2c4646 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -16,7 +16,7 @@ // under the License. use num::cast::AsPrimitive; -use num::BigInt; +use num::{BigInt, FromPrimitive}; use std::cmp::Ordering; /// A signed 256-bit integer @@ -102,6 +102,19 @@ impl i256 { Self::from_parts(v as u128, v >> 127) } + /// Create an optional i256 from the provided `f64`. Returning `None` + /// if overflow occurred + pub fn from_f64(v: f64) -> Option { + BigInt::from_f64(v).and_then(|i| { + let (integer, overflow) = i256::from_bigint_with_overflow(i); + if overflow { + None + } else { + Some(integer) + } + }) + } + /// Create an i256 from the provided low u128 and high i128 #[inline] pub const fn from_parts(low: u128, high: i128) -> Self { diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 3e23a059bf3e..5bf8c19c5baf 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -387,16 +387,38 @@ fn cast_floating_point_to_decimal256( array: &PrimitiveArray, precision: u8, scale: u8, + cast_options: &CastOptions, ) -> Result where ::Native: AsPrimitive, { let mul = 10_f64.powi(scale as i32); - array - .unary::<_, Decimal256Type>(|v| i256::from_i128((v.as_() * mul).round() as i128)) - .with_precision_and_scale(precision, scale) - .map(|a| Arc::new(a) as ArrayRef) + if cast_options.safe { + let iter = array + .iter() + .map(|v| v.and_then(|v| i256::from_f64((v.as_() * mul).round()))); + let casted_array = + unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }; + casted_array + .with_precision_and_scale(precision, scale) + .map(|a| Arc::new(a) as ArrayRef) + } else { + array + .try_unary::<_, Decimal256Type, _>(|v| { + i256::from_f64((v.as_() * mul).round()).ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {}({}, {}). 
Overflowing on {:?}", + Decimal256Type::PREFIX, + precision, + scale, + v + )) + }) + }) + .and_then(|a| a.with_precision_and_scale(precision, scale)) + .map(|a| Arc::new(a) as ArrayRef) + } } /// Cast the primitive array using [`PrimitiveArray::reinterpret_cast`] @@ -666,11 +688,13 @@ pub fn cast_with_options( as_primitive_array::(array), *precision, *scale, + cast_options, ), Float64 => cast_floating_point_to_decimal256( as_primitive_array::(array), *precision, *scale, + cast_options, ), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( @@ -6166,4 +6190,31 @@ mod tests { err ); } + + #[test] + fn test_cast_floating_point_to_decimal256_overflow() { + let array = Float64Array::from(vec![f64::MAX]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal256(76, 50), + &CastOptions { safe: true }, + ); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + + let casted_array = cast_with_options( + &array, + &DataType::Decimal256(76, 50), + &CastOptions { safe: false }, + ); + let err = casted_array.unwrap_err().to_string(); + let expected_error = "Cast error: Cannot cast to Decimal256(76, 50)"; + assert!( + err.contains(expected_error), + "did not find expected error '{}' in actual error '{}'", + expected_error, + err + ); + } } From 4df1cc43e7010fa66f38db42abf3d7129b31c539 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 7 Nov 2022 11:11:42 -0800 Subject: [PATCH 0230/1411] Replace year_generic with year_dyn (#3041) --- arrow/src/compute/kernels/temporal.rs | 146 +++++++++++--------------- 1 file changed, 63 insertions(+), 83 deletions(-) diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index 75196f37c075..307f79606886 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -17,7 +17,7 @@ //! Defines temporal kernels for time and date related functions. 
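The hunks below replace the generic `year_generic` helper with a type-erased `year_dyn` that mirrors the existing `hour_dyn`; a usage sketch on a plain `Date64Array` (timestamps are illustrative only):

```rust
use arrow::array::{as_primitive_array, Date64Array, Int32Array};
use arrow::compute::kernels::temporal::{hour_dyn, year_dyn};
use arrow::datatypes::Int32Type;

fn main() {
    // 2018-01-01T00:00:00Z and 2019-02-20T04:23:45Z as milliseconds since the epoch
    let dates = Date64Array::from(vec![1514764800000, 1550636625000]);

    let years = year_dyn(&dates).unwrap();
    assert_eq!(
        as_primitive_array::<Int32Type>(&years),
        &Int32Array::from(vec![2018, 2019])
    );

    let hours = hour_dyn(&dates).unwrap();
    assert_eq!(
        as_primitive_array::<Int32Type>(&hours),
        &Int32Array::from(vec![0, 4])
    );
}
```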
-use arrow_array::downcast_dictionary_array; +use arrow_array::{downcast_dictionary_array, downcast_temporal_array}; use chrono::{DateTime, Datelike, NaiveDateTime, NaiveTime, Offset, Timelike}; use std::sync::Arc; @@ -182,7 +182,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - hour_internal::(array, array.data_type()) + hour_internal(array) } /// Extracts the hours of a given array as an array of integers within @@ -199,84 +199,48 @@ pub fn hour_dyn(array: &dyn Array) -> Result { dt => return_compute_error_with!("hour does not support", dt), ) } - DataType::Time32(TimeUnit::Second) => { - let array = as_primitive_array::(array); - hour_internal::(array, array.data_type()) - .map(|a| Arc::new(a) as ArrayRef) - } - DataType::Time32(TimeUnit::Microsecond) => { - let array = as_primitive_array::(array); - hour_internal::(array, array.data_type()) - .map(|a| Arc::new(a) as ArrayRef) - } - DataType::Time64(TimeUnit::Microsecond) => { - let array = as_primitive_array::(array); - hour_internal::(array, array.data_type()) - .map(|a| Arc::new(a) as ArrayRef) - } - DataType::Time64(TimeUnit::Nanosecond) => { - let array = as_primitive_array::(array); - hour_internal::(array, array.data_type()) - .map(|a| Arc::new(a) as ArrayRef) - } - DataType::Date32 => { - let array = as_primitive_array::(array); - hour_internal::(array, array.data_type()) - .map(|a| Arc::new(a) as ArrayRef) - } - DataType::Date64 => { - let array = as_primitive_array::(array); - hour_internal::(array, array.data_type()) - .map(|a| Arc::new(a) as ArrayRef) - } - DataType::Timestamp(TimeUnit::Second, _) => { - let array = as_primitive_array::(array); - hour_internal::(array, array.data_type()) - .map(|a| Arc::new(a) as ArrayRef) - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - let array = as_primitive_array::(array); - hour_internal::(array, array.data_type()) - .map(|a| Arc::new(a) as ArrayRef) - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - let array = as_primitive_array::(array); - hour_internal::(array, array.data_type()) - .map(|a| Arc::new(a) as ArrayRef) - } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - let array = as_primitive_array::(array); - hour_internal::(array, array.data_type()) - .map(|a| Arc::new(a) as ArrayRef) + _ => { + downcast_temporal_array!( + array => { + hour_internal(array) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("hour does not support", dt), + ) } - dt => return_compute_error_with!("hour does not support", dt), } } /// Extracts the hours of a given temporal array as an array of integers -fn hour_internal>( - array: A, - dt: &DataType, -) -> Result +fn hour_internal(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, { let b = Int32Builder::with_capacity(array.len()); - match dt { + match array.data_type() { DataType::Time32(_) | DataType::Time64(_) => { let iter = ArrayIter::new(array); - Ok(as_time_with_op::(iter, b, |t| t.hour() as i32)) + Ok(as_time_with_op::<&PrimitiveArray, T, _>(iter, b, |t| { + t.hour() as i32 + })) } DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::(iter, b, |t| t.hour() as i32)) + Ok(as_datetime_with_op::<&PrimitiveArray, T, _>( + iter, + b, + |t| t.hour() as i32, + )) } DataType::Timestamp(_, Some(tz)) => { let iter = ArrayIter::new(array); - extract_component_from_datetime_array::(iter, b, tz, |t| { - t.hour() as i32 - }) + 
extract_component_from_datetime_array::<&PrimitiveArray, T, _>( + iter, + b, + tz, + |t| t.hour() as i32, + ) } _ => return_compute_error_with!("hour does not support", array.data_type()), } @@ -288,37 +252,50 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - year_generic::(array) + year_internal(array) } -/// Extracts the years of a given temporal array as an array of integers -pub fn year_generic>(array: A) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ +/// Extracts the years of a given temporal array as an array of integers. +/// If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. +pub fn year_dyn(array: &dyn Array) -> Result { match array.data_type().clone() { - DataType::Dictionary(_, value_type) => { - year_internal::(array, value_type.as_ref()) + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + array => { + let year_values = year_dyn(array.values())?; + Ok(Arc::new(array.with_values(&year_values))) + } + dt => return_compute_error_with!("year does not support", dt), + ) + } + _ => { + downcast_temporal_array!( + array => { + year_internal(array) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("year does not support", dt), + ) } - dt => year_internal::(array, &dt), } } /// Extracts the years of a given temporal array as an array of integers -fn year_internal>( - array: A, - dt: &DataType, -) -> Result +fn year_internal(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, { - match dt { + match array.data_type() { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _) => { let b = Int32Builder::with_capacity(array.len()); let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::(iter, b, |t| t.year())) + Ok(as_datetime_with_op::<&PrimitiveArray, T, _>( + iter, + b, + |t| t.year(), + )) } _t => return_compute_error_with!("year does not support", array.data_type()), } @@ -1310,12 +1287,15 @@ mod tests { let keys = Int8Array::from_iter_values([0_i8, 1, 1, 0]); let dict = DictionaryArray::try_new(&keys, &a).unwrap(); - let b = - year_generic::(dict.downcast_dict::().unwrap()) - .unwrap(); + let b = year_dyn(&dict).unwrap(); - let expected = Int32Array::from(vec![2018, 2019, 2019, 2018]); - assert_eq!(expected, b); + let expected_dict = DictionaryArray::try_new( + &keys, + &Int32Array::from(vec![2018, 2019, 2019, 2018]), + ) + .unwrap(); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); } #[test] From 951caed784876d15a9e712a5981de31cee4e3085 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Mon, 7 Nov 2022 20:12:08 +0100 Subject: [PATCH 0231/1411] Enable casting from Date64 to Timestamp (#3038) --- arrow-cast/src/cast.rs | 74 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 5bf8c19c5baf..e394426bd682 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -244,7 +244,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Timestamp(_, _), Int64) => true, (Int64, Timestamp(_, _)) => true, (Timestamp(_, _), Timestamp(_, _) | Date32 | Date64) => true, - // date64 to timestamp might not make sense, + (Date64, Timestamp(_, None)) => true, (Int64, Duration(_)) => true, (Duration(_), Int64) => true, (Interval(from_type), Int64) => { @@ -1484,7 +1484,24 @@ pub fn cast_with_options( .unary::<_, Date64Type>(|x| x / (NANOSECONDS / 
MILLISECONDS)), )), - // date64 to timestamp might not make sense, + (Date64, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, TimestampSecondType>(|x| x / MILLISECONDS), + )), + (Date64, Timestamp(TimeUnit::Millisecond, None)) => { + cast_reinterpret_arrays::(array) + } + (Date64, Timestamp(TimeUnit::Microsecond, None)) => Ok(Arc::new( + as_primitive_array::(array).unary::<_, TimestampMicrosecondType>( + |x| x * (MICROSECONDS / MILLISECONDS), + ), + )), + (Date64, Timestamp(TimeUnit::Nanosecond, None)) => Ok(Arc::new( + as_primitive_array::(array).unary::<_, TimestampNanosecondType>( + |x| x * (NANOSECONDS / MILLISECONDS), + ), + )), + (Int64, Duration(TimeUnit::Second)) => { cast_reinterpret_arrays::(array) } @@ -4073,6 +4090,59 @@ mod tests { assert!(c.is_null(2)); } + #[test] + fn test_cast_date64_to_timestamp() { + let a = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(864000000, c.value(0)); + assert_eq!(1545696000, c.value(1)); + assert!(c.is_null(2)); + } + + #[test] + fn test_cast_date64_to_timestamp_ms() { + let a = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Timestamp(TimeUnit::Millisecond, None)).unwrap(); + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(864000000005, c.value(0)); + assert_eq!(1545696000001, c.value(1)); + assert!(c.is_null(2)); + } + + #[test] + fn test_cast_date64_to_timestamp_us() { + let a = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(864000000005000, c.value(0)); + assert_eq!(1545696000001000, c.value(1)); + assert!(c.is_null(2)); + } + + #[test] + fn test_cast_date64_to_timestamp_ns() { + let a = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap(); + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(864000000005000000, c.value(0)); + assert_eq!(1545696000001000000, c.value(1)); + assert!(c.is_null(2)); + } + #[test] fn test_cast_timestamp_to_i64() { let a = TimestampMillisecondArray::from(vec![ From b7bc79bf2cbf593fafa0dc552cc2bb16b084d132 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 8 Nov 2022 09:44:02 +1300 Subject: [PATCH 0232/1411] Move reader_parser to arrow-cast (#3022) (#3043) * Move reader_parser to arrow-cast (#3022) * Format --- arrow-cast/src/parse.rs | 122 +++++++++++++++++++++++++++ arrow/src/csv/reader.rs | 2 +- arrow/src/json/reader.rs | 2 +- arrow/src/util/mod.rs | 1 - arrow/src/util/reader_parser.rs | 142 -------------------------------- 5 files changed, 124 insertions(+), 145 deletions(-) delete mode 100644 arrow/src/util/reader_parser.rs diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 8a9d34b4c637..126beb902a55 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
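// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the relocated `Parser` trait is
// what the CSV and JSON readers use to turn a string cell into a native
// value. The function name `parser_example` is hypothetical; the trait and
// the `Int32Type`/`Date32Type` impls it exercises are defined in the hunk
// below.
fn parser_example() {
    use arrow_array::types::{Date32Type, Int32Type};
    // Plain integers go through `str::parse`
    assert_eq!(Int32Type::parse("42"), Some(42));
    assert_eq!(Int32Type::parse("not a number"), None);
    // Dates become days since the Unix epoch (1970-01-01)
    assert_eq!(Date32Type::parse("1970-01-02"), Some(1));
    // A caller-supplied strftime pattern goes through `parse_formatted`
    assert_eq!(Date32Type::parse_formatted("02/01/1970", "%d/%m/%Y"), Some(1));
}
// ---------------------------------------------------------------------------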
+use arrow_array::types::*; +use arrow_array::ArrowPrimitiveType; use arrow_schema::ArrowError; use chrono::prelude::*; @@ -130,6 +132,126 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { ))) } +/// Specialized parsing implementations +/// used by csv and json reader +pub trait Parser: ArrowPrimitiveType { + fn parse(string: &str) -> Option; + + fn parse_formatted(string: &str, _format: &str) -> Option { + Self::parse(string) + } +} + +impl Parser for Float32Type { + fn parse(string: &str) -> Option { + lexical_core::parse(string.as_bytes()).ok() + } +} + +impl Parser for Float64Type { + fn parse(string: &str) -> Option { + lexical_core::parse(string.as_bytes()).ok() + } +} + +macro_rules! parser_primitive { + ($t:ty) => { + impl Parser for $t { + fn parse(string: &str) -> Option { + string.parse::().ok() + } + } + }; +} +parser_primitive!(UInt64Type); +parser_primitive!(UInt32Type); +parser_primitive!(UInt16Type); +parser_primitive!(UInt8Type); +parser_primitive!(Int64Type); +parser_primitive!(Int32Type); +parser_primitive!(Int16Type); +parser_primitive!(Int8Type); + +impl Parser for TimestampNanosecondType { + fn parse(string: &str) -> Option { + string_to_timestamp_nanos(string).ok() + } +} + +impl Parser for TimestampMicrosecondType { + fn parse(string: &str) -> Option { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1000) + } +} + +impl Parser for TimestampMillisecondType { + fn parse(string: &str) -> Option { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1_000_000) + } +} + +impl Parser for TimestampSecondType { + fn parse(string: &str) -> Option { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1_000_000_000) + } +} + +parser_primitive!(Time64NanosecondType); +parser_primitive!(Time64MicrosecondType); +parser_primitive!(Time32MillisecondType); +parser_primitive!(Time32SecondType); + +/// Number of days between 0001-01-01 and 1970-01-01 +const EPOCH_DAYS_FROM_CE: i32 = 719_163; + +impl Parser for Date32Type { + fn parse(string: &str) -> Option { + let date = string.parse::().ok()?; + Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let date = chrono::NaiveDate::parse_from_str(string, format).ok()?; + Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + } +} + +impl Parser for Date64Type { + fn parse(string: &str) -> Option { + let date_time = string.parse::().ok()?; + Some(date_time.timestamp_millis()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + use chrono::format::Fixed; + use chrono::format::StrftimeItems; + let fmt = StrftimeItems::new(format); + let has_zone = fmt.into_iter().any(|item| match item { + chrono::format::Item::Fixed(fixed_item) => matches!( + fixed_item, + Fixed::RFC2822 + | Fixed::RFC3339 + | Fixed::TimezoneName + | Fixed::TimezoneOffsetColon + | Fixed::TimezoneOffsetColonZ + | Fixed::TimezoneOffset + | Fixed::TimezoneOffsetZ + ), + _ => false, + }); + if has_zone { + let date_time = chrono::DateTime::parse_from_str(string, format).ok()?; + Some(date_time.timestamp_millis()) + } else { + let date_time = NaiveDateTime::parse_from_str(string, format).ok()?; + Some(date_time.timestamp_millis()) + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index ff6df5514983..404f37e9309a 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -56,7 +56,7 @@ use crate::array::{ use crate::datatypes::*; use 
crate::error::{ArrowError, Result}; use crate::record_batch::{RecordBatch, RecordBatchOptions}; -use crate::util::reader_parser::Parser; +use arrow_cast::parse::Parser; use crate::csv::map_csv_error; use csv_crate::{ByteRecord, StringRecord}; diff --git a/arrow/src/json/reader.rs b/arrow/src/json/reader.rs index a7382128e1c8..78c51559a7dd 100644 --- a/arrow/src/json/reader.rs +++ b/arrow/src/json/reader.rs @@ -60,8 +60,8 @@ use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::record_batch::{RecordBatch, RecordBatchOptions}; use crate::util::bit_util; -use crate::util::reader_parser::Parser; use crate::{array::*, buffer::Buffer}; +use arrow_cast::parse::Parser; #[derive(Debug, Clone)] enum InferredType { diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index 9a0ca852a114..4369ebe7dd45 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -31,4 +31,3 @@ pub mod string_writer; pub mod test_util; pub use arrow_cast::display; -pub(crate) mod reader_parser; diff --git a/arrow/src/util/reader_parser.rs b/arrow/src/util/reader_parser.rs deleted file mode 100644 index efee629056df..000000000000 --- a/arrow/src/util/reader_parser.rs +++ /dev/null @@ -1,142 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow_array::types::*; -use arrow_array::*; -use arrow_cast::parse::string_to_timestamp_nanos; - -/// Specialized parsing implementations -/// used by csv and json reader -pub(crate) trait Parser: ArrowPrimitiveType { - fn parse(string: &str) -> Option; - - fn parse_formatted(string: &str, _format: &str) -> Option { - Self::parse(string) - } -} - -impl Parser for Float32Type { - fn parse(string: &str) -> Option { - lexical_core::parse(string.as_bytes()).ok() - } -} - -impl Parser for Float64Type { - fn parse(string: &str) -> Option { - lexical_core::parse(string.as_bytes()).ok() - } -} - -macro_rules! 
parser_primitive { - ($t:ty) => { - impl Parser for $t { - fn parse(string: &str) -> Option { - string.parse::().ok() - } - } - }; -} -parser_primitive!(UInt64Type); -parser_primitive!(UInt32Type); -parser_primitive!(UInt16Type); -parser_primitive!(UInt8Type); -parser_primitive!(Int64Type); -parser_primitive!(Int32Type); -parser_primitive!(Int16Type); -parser_primitive!(Int8Type); - -impl Parser for TimestampNanosecondType { - fn parse(string: &str) -> Option { - string_to_timestamp_nanos(string).ok() - } -} - -impl Parser for TimestampMicrosecondType { - fn parse(string: &str) -> Option { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1000) - } -} - -impl Parser for TimestampMillisecondType { - fn parse(string: &str) -> Option { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1_000_000) - } -} - -impl Parser for TimestampSecondType { - fn parse(string: &str) -> Option { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1_000_000_000) - } -} - -parser_primitive!(Time64NanosecondType); -parser_primitive!(Time64MicrosecondType); -parser_primitive!(Time32MillisecondType); -parser_primitive!(Time32SecondType); - -/// Number of days between 0001-01-01 and 1970-01-01 -const EPOCH_DAYS_FROM_CE: i32 = 719_163; - -impl Parser for Date32Type { - fn parse(string: &str) -> Option { - use chrono::Datelike; - let date = string.parse::().ok()?; - Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - use chrono::Datelike; - let date = chrono::NaiveDate::parse_from_str(string, format).ok()?; - Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - } -} - -impl Parser for Date64Type { - fn parse(string: &str) -> Option { - let date_time = string.parse::().ok()?; - Some(date_time.timestamp_millis()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - use chrono::format::Fixed; - use chrono::format::StrftimeItems; - let fmt = StrftimeItems::new(format); - let has_zone = fmt.into_iter().any(|item| match item { - chrono::format::Item::Fixed(fixed_item) => matches!( - fixed_item, - Fixed::RFC2822 - | Fixed::RFC3339 - | Fixed::TimezoneName - | Fixed::TimezoneOffsetColon - | Fixed::TimezoneOffsetColonZ - | Fixed::TimezoneOffset - | Fixed::TimezoneOffsetZ - ), - _ => false, - }); - if has_zone { - let date_time = chrono::DateTime::parse_from_str(string, format).ok()?; - Some(date_time.timestamp_millis()) - } else { - let date_time = chrono::NaiveDateTime::parse_from_str(string, format).ok()?; - Some(date_time.timestamp_millis()) - } - } -} From 879b461af8c1259d48fbb1bc67d50fa2a38bea68 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 8 Nov 2022 11:49:38 +1300 Subject: [PATCH 0233/1411] Fix decoding long and/or padded RLE data (#3029) (#3035) (#3036) --- parquet/src/encodings/rle.rs | 95 +++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index b0ae5af07d7f..25c3c81a72dc 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -473,13 +473,18 @@ impl RleDecoder { let bit_reader = self.bit_reader.as_mut().expect("bit_reader should be set"); - let mut num_values = - cmp::min(max_values - values_read, self.bit_packed_left as usize); - - num_values = cmp::min(num_values, index_buf.len()); loop { - num_values = bit_reader.get_batch::( - &mut index_buf[..num_values], + let to_read = 
index_buf + .len() + .min(max_values - values_read) + .min(self.bit_packed_left as usize); + + if to_read == 0 { + break; + } + + let num_values = bit_reader.get_batch::( + &mut index_buf[..to_read], self.bit_width as usize, ); if num_values == 0 { @@ -492,7 +497,7 @@ impl RleDecoder { } self.bit_packed_left -= num_values as u32; values_read += num_values; - if num_values < index_buf.len() { + if num_values < to_read { break; } } @@ -509,6 +514,12 @@ impl RleDecoder { let bit_reader = self.bit_reader.as_mut().expect("bit_reader should be set"); if let Some(indicator_value) = bit_reader.get_vlq_int() { + // fastparquet adds padding to the end of pages. This is not spec-compliant + // but is handled by the C++ implementation + // + if indicator_value == 0 { + return false; + } if indicator_value & 1 == 1 { self.bit_packed_left = ((indicator_value >> 1) * 8) as u32; } else { @@ -528,6 +539,7 @@ impl RleDecoder { mod tests { use super::*; + use crate::util::bit_util::ceil; use rand::{self, distributions::Standard, thread_rng, Rng, SeedableRng}; use crate::util::memory::ByteBufferPtr; @@ -899,6 +911,75 @@ mod tests { assert!(output.iter().take(20).all(|x| *x == 255)); } + #[test] + fn test_rle_padded() { + let values: Vec = vec![0, 1, 1, 3, 1, 0]; + let bit_width = 2; + let buffer_len = RleEncoder::max_buffer_size(bit_width, values.len()); + let mut encoder = RleEncoder::new(bit_width, buffer_len + 1); + for v in &values { + encoder.put(*v as u64) + } + + let mut buffer = encoder.consume(); + buffer.push(0); + + let mut decoder = RleDecoder::new(bit_width); + decoder.set_data(ByteBufferPtr::new(buffer)); + + // We don't always reliably know how many non-null values are contained in a page + // and so the decoder must work correctly without a precise value count + let mut actual_values: Vec = vec![0; 12]; + let r = decoder + .get_batch(&mut actual_values) + .expect("get_batch() should be OK"); + + // Should decode 8 values despite only encoding 6 as length of + // bit packed run is always multiple of 8 + assert_eq!(r, 8); + assert_eq!(actual_values[..6], values); + assert_eq!(actual_values[6], 0); + assert_eq!(actual_values[7], 0); + } + + #[test] + fn test_long_run() { + // This writer does not write runs longer than 504 values as this allows + // encoding the run header as a single byte + // + // This tests that the decoder correctly handles longer runs + + let mut writer = BitWriter::new(1024); + let bit_width = 1; + + // Choose a non-multiple of 8 larger than 1024 so that the length + // of the run is ambiguous, as the encoding only stores `num_values / 8` + let num_values = 2002; + + // bit-packed header + let run_bytes = ceil(num_values * bit_width, 8) as u64; + writer.put_vlq_int(run_bytes << 1 | 1); + for _ in 0..run_bytes { + writer.put_aligned(0xFF_u8, 1); + } + let buffer = ByteBufferPtr::new(writer.consume()); + + let mut decoder = RleDecoder::new(1); + decoder.set_data(buffer.clone()); + + let mut decoded: Vec = vec![0; num_values]; + let r = decoder.get_batch(&mut decoded).unwrap(); + assert_eq!(r, num_values); + assert_eq!(vec![1; num_values], decoded); + + decoder.set_data(buffer); + let r = decoder + .get_batch_with_dict(&[0, 23], &mut decoded, num_values) + .unwrap(); + assert_eq!(r, num_values); + assert_eq!(vec![23; num_values], decoded); + } + #[test] fn test_rle_specific_roundtrip() { let bit_width = 1; From a950b52ec83e5ac14e147f9605f871ba6bd06ee0 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 8 Nov 2022 00:58:34 -0800 Subject: [PATCH 0234/1411] Cast decimal256 
to signed integer (#3040) * Cast decimal256 to signed integer * Use ToPrimitive * Add CastOptions --- arrow-buffer/src/bigint.rs | 87 +++++++++++++-- arrow-cast/src/cast.rs | 216 ++++++++++++++++++++++++++++++------- 2 files changed, 261 insertions(+), 42 deletions(-) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index 8dd57d2c4646..be02c2857db1 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -16,7 +16,7 @@ // under the License. use num::cast::AsPrimitive; -use num::{BigInt, FromPrimitive}; +use num::{BigInt, FromPrimitive, ToPrimitive}; use std::cmp::Ordering; /// A signed 256-bit integer @@ -388,13 +388,15 @@ impl i256 { /// Temporary workaround due to lack of stable const array slicing /// See -const fn split_array(vals: [u8; 32]) -> ([u8; 16], [u8; 16]) { - let mut a = [0; 16]; - let mut b = [0; 16]; +const fn split_array( + vals: [u8; N], +) -> ([u8; M], [u8; M]) { + let mut a = [0; M]; + let mut b = [0; M]; let mut i = 0; - while i != 16 { + while i != M { a[i] = vals[i]; - b[i] = vals[i + 16]; + b[i] = vals[i + M]; i += 1; } (a, b) @@ -478,6 +480,44 @@ define_as_primitive!(i16); define_as_primitive!(i32); define_as_primitive!(i64); +impl ToPrimitive for i256 { + fn to_i64(&self) -> Option { + let as_i128 = self.low as i128; + + let high_negative = self.high < 0; + let low_negative = as_i128 < 0; + let high_valid = self.high == -1 || self.high == 0; + + if high_negative == low_negative && high_valid { + let (low_bytes, high_bytes) = split_array(u128::to_le_bytes(self.low)); + let high = i64::from_le_bytes(high_bytes); + let low = i64::from_le_bytes(low_bytes); + + let high_negative = high < 0; + let low_negative = low < 0; + let high_valid = self.high == -1 || self.high == 0; + + (high_negative == low_negative && high_valid).then_some(low) + } else { + None + } + } + + fn to_u64(&self) -> Option { + let as_i128 = self.low as i128; + + let high_negative = self.high < 0; + let low_negative = as_i128 < 0; + let high_valid = self.high == -1 || self.high == 0; + + if high_negative == low_negative && high_valid { + self.low.to_u64() + } else { + None + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -676,4 +716,39 @@ mod tests { test_ops(i256::from_le_bytes(l), i256::from_le_bytes(r)) } } + + #[test] + fn test_i256_to_primitive() { + let a = i256::MAX; + assert!(a.to_i64().is_none()); + assert!(a.to_u64().is_none()); + + let a = i256::from_i128(i128::MAX); + assert!(a.to_i64().is_none()); + assert!(a.to_u64().is_none()); + + let a = i256::from_i128(i64::MAX as i128); + assert_eq!(a.to_i64().unwrap(), i64::MAX); + assert_eq!(a.to_u64().unwrap(), i64::MAX as u64); + + let a = i256::from_i128(i64::MAX as i128 + 1); + assert!(a.to_i64().is_none()); + assert_eq!(a.to_u64().unwrap(), i64::MAX as u64 + 1); + + let a = i256::MIN; + assert!(a.to_i64().is_none()); + assert!(a.to_u64().is_none()); + + let a = i256::from_i128(i128::MIN); + assert!(a.to_i64().is_none()); + assert!(a.to_u64().is_none()); + + let a = i256::from_i128(i64::MIN as i128); + assert_eq!(a.to_i64().unwrap(), i64::MIN); + assert!(a.to_u64().is_none()); + + let a = i256::from_i128(i64::MIN as i128 - 1); + assert!(a.to_i64().is_none()); + assert!(a.to_u64().is_none()); + } } diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index e394426bd682..1cc814730850 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -81,7 +81,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Null | Int8 | Int16 | Int32 | Int64 | Float32 | 
Float64, Decimal128(_, _)) | (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal256(_, _)) | // decimal to signed numeric - (Decimal128(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) + (Decimal128(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) | + (Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64 ) | ( Null, Boolean @@ -433,34 +434,65 @@ fn cast_reinterpret_arrays< )) } -// cast the decimal array to integer array -macro_rules! cast_decimal_to_integer { - ($ARRAY:expr, $SCALE : ident, $VALUE_BUILDER: ident, $NATIVE_TYPE : ident, $DATA_TYPE : expr) => {{ - let array = $ARRAY.as_any().downcast_ref::().unwrap(); - let mut value_builder = $VALUE_BUILDER::with_capacity(array.len()); - let div: i128 = 10_i128.pow(*$SCALE as u32); - let min_bound = ($NATIVE_TYPE::MIN) as i128; - let max_bound = ($NATIVE_TYPE::MAX) as i128; +fn cast_decimal_to_integer( + array: &ArrayRef, + base: D::Native, + scale: u8, + cast_options: &CastOptions, +) -> Result +where + T: ArrowPrimitiveType, + ::Native: NumCast, + D: DecimalType + ArrowPrimitiveType, + ::Native: ArrowNativeTypeOp + ToPrimitive, +{ + let array = array.as_any().downcast_ref::>().unwrap(); + + let div: D::Native = base.pow_checked(scale as u32).map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast to {:?}. The scale {} causes overflow.", + D::PREFIX, + scale, + )) + })?; + + let mut value_builder = PrimitiveBuilder::::with_capacity(array.len()); + + if cast_options.safe { for i in 0..array.len() { if array.is_null(i) { value_builder.append_null(); } else { - let v = array.value(i) / div; - // check the overflow - // For example: Decimal(128,10,0) as i8 - // 128 is out of range i8 - if v <= max_bound && v >= min_bound { - value_builder.append_value(v as $NATIVE_TYPE); - } else { - return Err(ArrowError::CastError(format!( - "value of {} is out of range {}", - v, $DATA_TYPE - ))); - } + let v = array + .value(i) + .div_checked(div) + .ok() + .and_then(::from::); + + value_builder.append_option(v); } } - Ok(Arc::new(value_builder.finish())) - }}; + } else { + for i in 0..array.len() { + if array.is_null(i) { + value_builder.append_null(); + } else { + let v = array.value(i).div_checked(div)?; + + let value = + ::from::(v).ok_or_else(|| { + ArrowError::CastError(format!( + "value of {:?} is out of range {}", + v, + T::DATA_TYPE + )) + })?; + + value_builder.append_value(value); + } + } + } + Ok(Arc::new(value_builder.finish())) } // cast the decimal array to floating-point array @@ -576,18 +608,30 @@ pub fn cast_with_options( (Decimal128(_, scale), _) => { // cast decimal to other type match to_type { - Int8 => { - cast_decimal_to_integer!(array, scale, Int8Builder, i8, Int8) - } - Int16 => { - cast_decimal_to_integer!(array, scale, Int16Builder, i16, Int16) - } - Int32 => { - cast_decimal_to_integer!(array, scale, Int32Builder, i32, Int32) - } - Int64 => { - cast_decimal_to_integer!(array, scale, Int64Builder, i64, Int64) - } + Int8 => cast_decimal_to_integer::( + array, + 10_i128, + *scale, + cast_options, + ), + Int16 => cast_decimal_to_integer::( + array, + 10_i128, + *scale, + cast_options, + ), + Int32 => cast_decimal_to_integer::( + array, + 10_i128, + *scale, + cast_options, + ), + Int64 => cast_decimal_to_integer::( + array, + 10_i128, + *scale, + cast_options, + ), Float32 => { cast_decimal_to_float!(array, scale, Float32Builder, f32) } @@ -601,6 +645,40 @@ pub fn cast_with_options( ))), } } + (Decimal256(_, scale), _) => { + // cast decimal to other type + match to_type { + Int8 => 
cast_decimal_to_integer::( + array, + i256::from_i128(10_i128), + *scale, + cast_options, + ), + Int16 => cast_decimal_to_integer::( + array, + i256::from_i128(10_i128), + *scale, + cast_options, + ), + Int32 => cast_decimal_to_integer::( + array, + i256::from_i128(10_i128), + *scale, + cast_options, + ), + Int64 => cast_decimal_to_integer::( + array, + i256::from_i128(10_i128), + *scale, + cast_options, + ), + Null => Ok(new_null_array(to_type, array.len())), + _ => Err(ArrowError::CastError(format!( + "Casting from {:?} to {:?} not supported", + from_type, to_type + ))), + } + } (_, Decimal128(precision, scale)) => { // cast data to decimal match from_type { @@ -3154,12 +3232,18 @@ mod tests { let value_array: Vec> = vec![Some(24400)]; let decimal_array = create_decimal_array(value_array, 38, 2).unwrap(); let array = Arc::new(decimal_array) as ArrayRef; - let casted_array = cast(&array, &DataType::Int8); + let casted_array = + cast_with_options(&array, &DataType::Int8, &CastOptions { safe: false }); assert_eq!( "Cast error: value of 244 is out of range Int8".to_string(), casted_array.unwrap_err().to_string() ); + let casted_array = + cast_with_options(&array, &DataType::Int8, &CastOptions { safe: true }); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + // loss the precision: convert decimal to f32、f64 // f32 // 112345678_f32 and 112345679_f32 are same, so the 112345679_f32 will lose precision. @@ -3218,6 +3302,66 @@ mod tests { ); } + #[test] + fn test_cast_decimal256_to_numeric() { + let decimal_type = DataType::Decimal256(38, 2); + // negative test + assert!(!can_cast_types(&decimal_type, &DataType::UInt8)); + let value_array: Vec> = vec![ + Some(i256::from_i128(125)), + Some(i256::from_i128(225)), + Some(i256::from_i128(325)), + None, + Some(i256::from_i128(525)), + ]; + let decimal_array = create_decimal256_array(value_array, 38, 2).unwrap(); + let array = Arc::new(decimal_array) as ArrayRef; + // i8 + generate_cast_test_case!( + &array, + Int8Array, + &DataType::Int8, + vec![Some(1_i8), Some(2_i8), Some(3_i8), None, Some(5_i8)] + ); + // i16 + generate_cast_test_case!( + &array, + Int16Array, + &DataType::Int16, + vec![Some(1_i16), Some(2_i16), Some(3_i16), None, Some(5_i16)] + ); + // i32 + generate_cast_test_case!( + &array, + Int32Array, + &DataType::Int32, + vec![Some(1_i32), Some(2_i32), Some(3_i32), None, Some(5_i32)] + ); + // i64 + generate_cast_test_case!( + &array, + Int64Array, + &DataType::Int64, + vec![Some(1_i64), Some(2_i64), Some(3_i64), None, Some(5_i64)] + ); + + // overflow test: out of range of max i8 + let value_array: Vec> = vec![Some(i256::from_i128(24400))]; + let decimal_array = create_decimal256_array(value_array, 38, 2).unwrap(); + let array = Arc::new(decimal_array) as ArrayRef; + let casted_array = + cast_with_options(&array, &DataType::Int8, &CastOptions { safe: false }); + assert_eq!( + "Cast error: value of 244 is out of range Int8".to_string(), + casted_array.unwrap_err().to_string() + ); + + let casted_array = + cast_with_options(&array, &DataType::Int8, &CastOptions { safe: true }); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + } + #[test] #[cfg(not(feature = "force_validate"))] fn test_cast_numeric_to_decimal128() { From fe3318bba24abfe572fa037a0b8805a15bdf5c45 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 9 Nov 2022 09:17:14 +1300 Subject: [PATCH 0235/1411] Split out arrow-csv (#2594) (#3044) * Split out arrow-csv 
(#2594) * Fix doc * Update arrow-csv/Cargo.toml Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- .github/workflows/arrow.yml | 5 + .github/workflows/arrow_flight.yml | 1 + .github/workflows/dev_pr/labeler.yml | 1 + .github/workflows/integration.yml | 1 + .github/workflows/miri.yaml | 1 + .github/workflows/parquet.yml | 1 + Cargo.toml | 1 + arrow-csv/Cargo.toml | 53 ++ arrow/src/csv/mod.rs => arrow-csv/src/lib.rs | 8 +- {arrow/src/csv => arrow-csv/src}/reader.rs | 546 ++----------------- {arrow/src/csv => arrow-csv/src}/writer.rs | 172 ++---- arrow/Cargo.toml | 11 +- arrow/src/lib.rs | 2 +- arrow/tests/csv.rs | 486 +++++++++++++++++ arrow/tests/{ipc_integration.rs => ipc.rs} | 0 15 files changed, 656 insertions(+), 633 deletions(-) create mode 100644 arrow-csv/Cargo.toml rename arrow/src/csv/mod.rs => arrow-csv/src/lib.rs (85%) rename {arrow/src/csv => arrow-csv/src}/reader.rs (75%) rename {arrow/src/csv => arrow-csv/src}/writer.rs (81%) create mode 100644 arrow/tests/csv.rs rename arrow/tests/{ipc_integration.rs => ipc.rs} (100%) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index d930086ef56a..461e7e87ea56 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -34,6 +34,7 @@ on: - arrow-select/** - arrow-integration-test/** - arrow-ipc/** + - arrow-csv/** - .github/** jobs: @@ -64,6 +65,8 @@ jobs: run: cargo test -p arrow-cast --all-features - name: Test arrow-ipc with all features run: cargo test -p arrow-ipc --all-features + - name: Test arrow-csv with all features + run: cargo test -p arrow-csv --all-features - name: Test arrow-integration-test with all features run: cargo test -p arrow-integration-test --all-features - name: Test arrow with default features @@ -174,5 +177,7 @@ jobs: run: cargo clippy -p arrow-cast --all-targets --all-features -- -D warnings - name: Clippy arrow-ipc with all features run: cargo clippy -p arrow-ipc --all-targets --all-features -- -D warnings + - name: Clippy arrow-csv with all features + run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings - name: Clippy arrow run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index ded4f5a67915..1f830ccf2b26 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -36,6 +36,7 @@ on: - arrow-select/** - arrow-flight/** - arrow-ipc/** + - arrow-csv/** - .github/** jobs: diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 17ebf54de732..04c7c080e019 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -24,6 +24,7 @@ arrow: - arrow-schema/**/* - arrow-select/**/* - arrow-ipc/**/* + - arrow-csv/**/* arrow-flight: - arrow-flight/**/* diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 8566230ea0b9..9418b9042835 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -32,6 +32,7 @@ on: - arrow-schema/** - arrow-select/** - arrow-ipc/** + - arrow-csv/** - arrow-pyarrow-integration-testing/** - arrow-integration-test/** - arrow-integration-testing/** diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index 2e85c9dd95a5..e58ebdb35695 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -32,6 +32,7 @@ on: - arrow-schema/** - 
arrow-select/** - arrow-ipc/** + - arrow-csv/** - .github/** jobs: diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index b369ef69bfd9..4f3cf5f8005c 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -35,6 +35,7 @@ on: - arrow-schema/** - arrow-select/** - arrow-ipc/** + - arrow-csv/** - parquet/** - .github/** diff --git a/Cargo.toml b/Cargo.toml index 0ab4853c6e10..18497d043794 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ members = [ "arrow-array", "arrow-buffer", "arrow-cast", + "arrow-csv", "arrow-data", "arrow-flight", "arrow-integration-test", diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml new file mode 100644 index 000000000000..d40cef0db112 --- /dev/null +++ b/arrow-csv/Cargo.toml @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "arrow-csv" +version = "26.0.0" +description = "Support for parsing CSV format into the Arrow format" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_csv" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-array = { version = "26.0.0", path = "../arrow-array" } +arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "26.0.0", path = "../arrow-cast" } +arrow-data = { version = "26.0.0", path = "../arrow-data" } +arrow-schema = { version = "26.0.0", path = "../arrow-schema" } +chrono = { version = "0.4", default-features = false, features = ["clock"] } +csv = { version = "1.1", default-features = false } +lazy_static = { version = "1.4", default-features = false } +lexical-core = { version = "^0.8", default-features = false } +regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } + +[dev-dependencies] +tempfile = "3.3" diff --git a/arrow/src/csv/mod.rs b/arrow-csv/src/lib.rs similarity index 85% rename from arrow/src/csv/mod.rs rename to arrow-csv/src/lib.rs index 46ba7d71e200..a45cf082d714 100644 --- a/arrow/src/csv/mod.rs +++ b/arrow-csv/src/lib.rs @@ -27,14 +27,14 @@ pub use self::writer::Writer; pub use self::writer::WriterBuilder; use arrow_schema::ArrowError; -fn map_csv_error(error: csv_crate::Error) -> ArrowError { +fn map_csv_error(error: csv::Error) -> ArrowError { match error.kind() { - csv_crate::ErrorKind::Io(error) => ArrowError::CsvError(error.to_string()), - csv_crate::ErrorKind::Utf8 { pos: _, err } => ArrowError::CsvError(format!( + csv::ErrorKind::Io(error) => ArrowError::CsvError(error.to_string()), + csv::ErrorKind::Utf8 { pos: _, err } => 
ArrowError::CsvError(format!( "Encountered UTF-8 error while reading CSV file: {}", err )), - csv_crate::ErrorKind::UnequalLengths { + csv::ErrorKind::UnequalLengths { expected_len, len, .. } => ArrowError::CsvError(format!( "Encountered unequal lengths between records on CSV file. Expected {} \ diff --git a/arrow/src/csv/reader.rs b/arrow-csv/src/reader.rs similarity index 75% rename from arrow/src/csv/reader.rs rename to arrow-csv/src/reader.rs index 404f37e9309a..459c23ad2616 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow-csv/src/reader.rs @@ -22,11 +22,11 @@ //! //! Example: //! -//! ``` -//! use arrow::csv; -//! use arrow::datatypes::{DataType, Field, Schema}; -//! use std::fs::File; -//! use std::sync::Arc; +//! ```no_run +//! # use arrow_schema::*; +//! # use arrow_csv::Reader; +//! # use std::fs::File; +//! # use std::sync::Arc; //! //! let schema = Schema::new(vec![ //! Field::new("city", DataType::Utf8, false), @@ -36,7 +36,7 @@ //! //! let file = File::open("test/data/uk_cities.csv").unwrap(); //! -//! let mut csv = csv::Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None); +//! let mut csv = Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None); //! let batch = csv.next().unwrap().unwrap(); //! ``` @@ -49,17 +49,15 @@ use std::fs::File; use std::io::{Read, Seek, SeekFrom}; use std::sync::Arc; -use crate::array::{ - ArrayRef, BooleanArray, Decimal128Builder, DictionaryArray, PrimitiveArray, - StringArray, -}; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::record_batch::{RecordBatch, RecordBatchOptions}; +use arrow_array::builder::Decimal128Builder; +use arrow_array::types::*; +use arrow_array::*; use arrow_cast::parse::Parser; +use arrow_schema::*; -use crate::csv::map_csv_error; -use csv_crate::{ByteRecord, StringRecord}; +use crate::map_csv_error; +use arrow_data::decimal::validate_decimal_precision; +use csv::{ByteRecord, StringRecord}; use std::ops::Neg; lazy_static! 
{ @@ -128,7 +126,7 @@ pub fn infer_file_schema( delimiter: u8, max_read_records: Option, has_header: bool, -) -> Result<(Schema, usize)> { +) -> Result<(Schema, usize), ArrowError> { let roptions = ReaderOptions { delimiter: Some(delimiter), max_read_records, @@ -142,7 +140,7 @@ pub fn infer_file_schema( fn infer_file_schema_with_csv_options( mut reader: R, roptions: ReaderOptions, -) -> Result<(Schema, usize)> { +) -> Result<(Schema, usize), ArrowError> { let saved_offset = reader.seek(SeekFrom::Current(0))?; let (schema, records_count) = @@ -164,7 +162,7 @@ pub fn infer_reader_schema( delimiter: u8, max_read_records: Option, has_header: bool, -) -> Result<(Schema, usize)> { +) -> Result<(Schema, usize), ArrowError> { let roptions = ReaderOptions { delimiter: Some(delimiter), max_read_records, @@ -177,7 +175,7 @@ pub fn infer_reader_schema( fn infer_reader_schema_with_csv_options( reader: R, roptions: ReaderOptions, -) -> Result<(Schema, usize)> { +) -> Result<(Schema, usize), ArrowError> { let mut csv_reader = Reader::build_csv_reader( reader, roptions.has_header, @@ -268,7 +266,7 @@ pub fn infer_schema_from_files( delimiter: u8, max_read_records: Option, has_header: bool, -) -> Result { +) -> Result { let mut schemas = vec![]; let mut records_to_read = max_read_records.unwrap_or(usize::MAX); @@ -302,7 +300,7 @@ pub struct Reader { /// Optional projection for which columns to load (zero-based column indices) projection: Option>, /// File reader - reader: csv_crate::Reader, + reader: csv::Reader, /// Current line number line_number: usize, /// Maximum number of rows to read @@ -410,8 +408,8 @@ impl Reader { escape: Option, quote: Option, terminator: Option, - ) -> csv_crate::Reader { - let mut reader_builder = csv_crate::ReaderBuilder::new(); + ) -> csv::Reader { + let mut reader_builder = csv::ReaderBuilder::new(); reader_builder.has_headers(has_header); if let Some(c) = delimiter { @@ -422,13 +420,13 @@ impl Reader { reader_builder.quote(c); } if let Some(t) = terminator { - reader_builder.terminator(csv_crate::Terminator::Any(t)); + reader_builder.terminator(csv::Terminator::Any(t)); } reader_builder.from_reader(reader) } fn from_csv_reader( - mut csv_reader: csv_crate::Reader, + mut csv_reader: csv::Reader, schema: SchemaRef, has_header: bool, batch_size: usize, @@ -474,7 +472,7 @@ impl Reader { } impl Iterator for Reader { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { let remaining = self.end - self.line_number; @@ -522,8 +520,8 @@ impl Iterator for Reader { } } -/// parses a slice of [csv_crate::StringRecord] into a -/// [RecordBatch](crate::record_batch::RecordBatch). 
+/// parses a slice of [csv::StringRecord] into a +/// [RecordBatch] fn parse( rows: &[StringRecord], fields: &[Field], @@ -531,13 +529,13 @@ fn parse( projection: Option<&Vec>, line_number: usize, datetime_format: Option<&str>, -) -> Result { +) -> Result { let projection: Vec = match projection { Some(v) => v.clone(), None => fields.iter().enumerate().map(|(i, _)| i).collect(), }; - let arrays: Result> = projection + let arrays: Result, _> = projection .iter() .map(|i| { let i = *i; @@ -706,7 +704,7 @@ fn build_decimal_array( col_idx: usize, precision: u8, scale: u8, -) -> Result { +) -> Result { let mut decimal_builder = Decimal128Builder::with_capacity(rows.len()); for row in rows { let col_s = row.get(col_idx); @@ -720,7 +718,7 @@ fn build_decimal_array( // append null decimal_builder.append_null(); } else { - let decimal_value: Result = + let decimal_value: Result = parse_decimal_with_parameter(s, precision, scale); match decimal_value { Ok(v) => { @@ -743,7 +741,11 @@ fn build_decimal_array( // Parse the string format decimal value to i128 format and checking the precision and scale. // The result i128 value can't be out of bounds. -fn parse_decimal_with_parameter(s: &str, precision: u8, scale: u8) -> Result { +fn parse_decimal_with_parameter( + s: &str, + precision: u8, + scale: u8, +) -> Result { if PARSE_DECIMAL_RE.is_match(s) { let mut offset = s.len(); let len = s.len(); @@ -808,7 +810,7 @@ fn parse_decimal_with_parameter(s: &str, precision: u8, scale: u8) -> Result Result { +fn parse_decimal(s: &str) -> Result { if PARSE_DECIMAL_RE.is_match(s) { let mut offset = s.len(); // each byte is digit、'-' or '.' @@ -856,7 +858,7 @@ fn build_primitive_array( rows: &[StringRecord], col_idx: usize, format: Option<&str>, -) -> Result { +) -> Result { rows.iter() .enumerate() .map(|(row_index, row)| { @@ -884,7 +886,7 @@ fn build_primitive_array( None => Ok(None), } }) - .collect::>>() + .collect::, ArrowError>>() .map(|e| Arc::new(e) as ArrayRef) } @@ -893,7 +895,7 @@ fn build_boolean_array( line_number: usize, rows: &[StringRecord], col_idx: usize, -) -> Result { +) -> Result { rows.iter() .enumerate() .map(|(row_index, row)| { @@ -918,7 +920,7 @@ fn build_boolean_array( None => Ok(None), } }) - .collect::>() + .collect::>() .map(|e| Arc::new(e) as ArrayRef) } @@ -988,16 +990,14 @@ impl ReaderBuilder { /// # Example /// /// ``` - /// extern crate arrow; - /// - /// use arrow::csv; + /// use arrow_csv::{Reader, ReaderBuilder}; /// use std::fs::File; /// - /// fn example() -> csv::Reader { + /// fn example() -> Reader { /// let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); /// /// // create a builder, inferring the schema with the first 100 records - /// let builder = csv::ReaderBuilder::new().infer_schema(Some(100)); + /// let builder = ReaderBuilder::new().infer_schema(Some(100)); /// /// let reader = builder.build(file).unwrap(); /// @@ -1086,7 +1086,7 @@ impl ReaderBuilder { } /// Create a new `Reader` from the `ReaderBuilder` - pub fn build(self, mut reader: R) -> Result> { + pub fn build(self, mut reader: R) -> Result, ArrowError> { // check if schema should be inferred let delimiter = self.delimiter.unwrap_or(b','); let schema = match self.schema { @@ -1131,436 +1131,11 @@ impl ReaderBuilder { mod tests { use super::*; - use std::fs::File; - use std::io::{Cursor, Write}; + use std::io::Write; use tempfile::NamedTempFile; - use crate::array::*; - use crate::compute::cast; - use crate::datatypes::Field; use chrono::prelude::*; - #[test] - fn test_csv() { - let _: 
Vec<()> = vec![None, Some("%Y-%m-%dT%H:%M:%S%.f%:z".to_string())] - .into_iter() - .map(|format| { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - let mut csv = Reader::new( - file, - Arc::new(schema.clone()), - false, - None, - 1024, - None, - None, - format, - ); - assert_eq!(Arc::new(schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(57.653484, lat.value(0)); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); - }) - .collect(); - } - - #[test] - fn test_csv_schema_metadata() { - let mut metadata = std::collections::HashMap::new(); - metadata.insert("foo".to_owned(), "bar".to_owned()); - let schema = Schema::new_with_metadata( - vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ], - metadata.clone(), - ); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let mut csv = Reader::new( - file, - Arc::new(schema.clone()), - false, - None, - 1024, - None, - None, - None, - ); - assert_eq!(Arc::new(schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - assert_eq!(&metadata, batch.schema().metadata()); - } - - #[test] - fn test_csv_reader_with_decimal() { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Decimal128(38, 6), false), - Field::new("lng", DataType::Decimal128(38, 6), false), - ]); - - let file = File::open("test/data/decimal_test.csv").unwrap(); - - let mut csv = - Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None); - let batch = csv.next().unwrap().unwrap(); - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("57.653484", lat.value_as_string(0)); - assert_eq!("53.002666", lat.value_as_string(1)); - assert_eq!("52.412811", lat.value_as_string(2)); - assert_eq!("51.481583", lat.value_as_string(3)); - assert_eq!("12.123456", lat.value_as_string(4)); - assert_eq!("50.760000", lat.value_as_string(5)); - assert_eq!("0.123000", lat.value_as_string(6)); - assert_eq!("123.000000", lat.value_as_string(7)); - assert_eq!("123.000000", lat.value_as_string(8)); - assert_eq!("-50.760000", lat.value_as_string(9)); - } - - #[test] - fn test_csv_from_buf_reader() { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file_with_headers = - File::open("test/data/uk_cities_with_headers.csv").unwrap(); - let file_without_headers = File::open("test/data/uk_cities.csv").unwrap(); - let both_files = file_with_headers - .chain(Cursor::new("\n".to_string())) - .chain(file_without_headers); - let mut csv = Reader::from_reader( - both_files, - Arc::new(schema), - true, - None, - 1024, - None, - None, - None, - ); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(74, 
batch.num_rows()); - assert_eq!(3, batch.num_columns()); - } - - #[test] - fn test_csv_with_schema_inference() { - let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); - - let builder = ReaderBuilder::new().has_header(true).infer_schema(None); - - let mut csv = builder.build(file).unwrap(); - let expected_schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, true), - Field::new("lat", DataType::Float64, true), - Field::new("lng", DataType::Float64, true), - ]); - assert_eq!(Arc::new(expected_schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(57.653484, lat.value(0)); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); - } - - #[test] - fn test_csv_with_schema_inference_no_headers() { - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let builder = ReaderBuilder::new().infer_schema(None); - - let mut csv = builder.build(file).unwrap(); - - // csv field names should be 'column_{number}' - let schema = csv.schema(); - assert_eq!("column_1", schema.field(0).name()); - assert_eq!("column_2", schema.field(1).name()); - assert_eq!("column_3", schema.field(2).name()); - let batch = csv.next().unwrap().unwrap(); - let batch_schema = batch.schema(); - - assert_eq!(schema, batch_schema); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(57.653484, lat.value(0)); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); - } - - #[test] - fn test_csv_builder_with_bounds() { - let file = File::open("test/data/uk_cities.csv").unwrap(); - - // Set the bounds to the lines 0, 1 and 2. - let mut csv = ReaderBuilder::new().with_bounds(0, 2).build(file).unwrap(); - let batch = csv.next().unwrap().unwrap(); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - // The value on line 0 is within the bounds - assert_eq!("Elgin, Scotland, the UK", city.value(0)); - - // The value on line 13 is outside of the bounds. Therefore - // the call to .value() will panic. 
- let result = std::panic::catch_unwind(|| city.value(13)); - assert!(result.is_err()); - } - - #[test] - fn test_csv_with_projection() { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let mut csv = Reader::new( - file, - Arc::new(schema), - false, - None, - 1024, - None, - Some(vec![0, 1]), - None, - ); - let projected_schema = Arc::new(Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - ])); - assert_eq!(projected_schema, csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(projected_schema, batch.schema()); - assert_eq!(37, batch.num_rows()); - assert_eq!(2, batch.num_columns()); - } - - #[test] - fn test_csv_with_dictionary() { - let schema = Schema::new(vec![ - Field::new( - "city", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, - ), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let mut csv = Reader::new( - file, - Arc::new(schema), - false, - None, - 1024, - None, - Some(vec![0, 1]), - None, - ); - let projected_schema = Arc::new(Schema::new(vec![ - Field::new( - "city", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, - ), - Field::new("lat", DataType::Float64, false), - ])); - assert_eq!(projected_schema, csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(projected_schema, batch.schema()); - assert_eq!(37, batch.num_rows()); - assert_eq!(2, batch.num_columns()); - - let strings = cast(batch.column(0), &DataType::Utf8).unwrap(); - let strings = strings.as_any().downcast_ref::().unwrap(); - - assert_eq!(strings.value(0), "Elgin, Scotland, the UK"); - assert_eq!(strings.value(4), "Eastbourne, East Sussex, UK"); - assert_eq!(strings.value(29), "Uckfield, East Sussex, UK"); - } - - #[test] - fn test_nulls() { - let schema = Schema::new(vec![ - Field::new("c_int", DataType::UInt64, false), - Field::new("c_float", DataType::Float32, true), - Field::new("c_string", DataType::Utf8, false), - ]); - - let file = File::open("test/data/null_test.csv").unwrap(); - - let mut csv = - Reader::new(file, Arc::new(schema), true, None, 1024, None, None, None); - let batch = csv.next().unwrap().unwrap(); - - assert!(!batch.column(1).is_null(0)); - assert!(!batch.column(1).is_null(1)); - assert!(batch.column(1).is_null(2)); - assert!(!batch.column(1).is_null(3)); - assert!(!batch.column(1).is_null(4)); - } - - #[test] - fn test_nulls_with_inference() { - let file = File::open("test/data/various_types.csv").unwrap(); - - let builder = ReaderBuilder::new() - .infer_schema(None) - .has_header(true) - .with_delimiter(b'|') - .with_batch_size(512) - .with_projection(vec![0, 1, 2, 3, 4, 5]); - - let mut csv = builder.build(file).unwrap(); - let batch = csv.next().unwrap().unwrap(); - - assert_eq!(7, batch.num_rows()); - assert_eq!(6, batch.num_columns()); - - let schema = batch.schema(); - - assert_eq!(&DataType::Int64, schema.field(0).data_type()); - assert_eq!(&DataType::Float64, schema.field(1).data_type()); - assert_eq!(&DataType::Float64, schema.field(2).data_type()); - assert_eq!(&DataType::Boolean, schema.field(3).data_type()); - assert_eq!(&DataType::Date32, schema.field(4).data_type()); - 
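// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the inference tests removed in
// this hunk are relocated (see the diffstat) to `arrow/tests/csv.rs`, while
// `infer_reader_schema` itself remains public in this file. A minimal,
// hypothetical use of it on in-memory data:
fn infer_example() -> Result<(), ArrowError> {
    use std::io::Cursor;
    let data = "c1,c2,c3\n1,foo,0.5\n3,bar,1.5\n";
    let (schema, _rows_read) =
        infer_reader_schema(Cursor::new(data), b',', None, true)?;
    assert_eq!(schema.field(0).data_type(), &DataType::Int64);
    assert_eq!(schema.field(1).data_type(), &DataType::Utf8);
    assert_eq!(schema.field(2).data_type(), &DataType::Float64);
    Ok(())
}
// ---------------------------------------------------------------------------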
assert_eq!(&DataType::Date64, schema.field(5).data_type()); - - let names: Vec<&str> = - schema.fields().iter().map(|x| x.name().as_str()).collect(); - assert_eq!( - names, - vec![ - "c_int", - "c_float", - "c_string", - "c_bool", - "c_date", - "c_datetime" - ] - ); - - assert!(schema.field(0).is_nullable()); - assert!(schema.field(1).is_nullable()); - assert!(schema.field(2).is_nullable()); - assert!(schema.field(3).is_nullable()); - assert!(schema.field(4).is_nullable()); - assert!(schema.field(5).is_nullable()); - - assert!(!batch.column(1).is_null(0)); - assert!(!batch.column(1).is_null(1)); - assert!(batch.column(1).is_null(2)); - assert!(!batch.column(1).is_null(3)); - assert!(!batch.column(1).is_null(4)); - } - - #[test] - fn test_parse_invalid_csv() { - let file = File::open("test/data/various_types_invalid.csv").unwrap(); - - let schema = Schema::new(vec![ - Field::new("c_int", DataType::UInt64, false), - Field::new("c_float", DataType::Float32, false), - Field::new("c_string", DataType::Utf8, false), - Field::new("c_bool", DataType::Boolean, false), - ]); - - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .has_header(true) - .with_delimiter(b'|') - .with_batch_size(512) - .with_projection(vec![0, 1, 2, 3]); - - let mut csv = builder.build(file).unwrap(); - match csv.next() { - Some(e) => match e { - Err(e) => assert_eq!( - "ParseError(\"Error while parsing value 4.x4 for column 1 at line 4\")", - format!("{:?}", e) - ), - Ok(_) => panic!("should have failed"), - }, - None => panic!("should have failed"), - } - } - #[test] fn test_infer_field_schema() { assert_eq!(infer_field_schema("A", None), DataType::Utf8); @@ -1771,21 +1346,21 @@ mod tests { } #[test] - fn test_infer_schema_from_multiple_files() -> Result<()> { - let mut csv1 = NamedTempFile::new()?; - let mut csv2 = NamedTempFile::new()?; - let csv3 = NamedTempFile::new()?; // empty csv file should be skipped - let mut csv4 = NamedTempFile::new()?; - writeln!(csv1, "c1,c2,c3")?; - writeln!(csv1, "1,\"foo\",0.5")?; - writeln!(csv1, "3,\"bar\",1")?; - writeln!(csv1, "3,\"bar\",2e-06")?; + fn test_infer_schema_from_multiple_files() { + let mut csv1 = NamedTempFile::new().unwrap(); + let mut csv2 = NamedTempFile::new().unwrap(); + let csv3 = NamedTempFile::new().unwrap(); // empty csv file should be skipped + let mut csv4 = NamedTempFile::new().unwrap(); + writeln!(csv1, "c1,c2,c3").unwrap(); + writeln!(csv1, "1,\"foo\",0.5").unwrap(); + writeln!(csv1, "3,\"bar\",1").unwrap(); + writeln!(csv1, "3,\"bar\",2e-06").unwrap(); // reading csv2 will set c2 to optional - writeln!(csv2, "c1,c2,c3,c4")?; - writeln!(csv2, "10,,3.14,true")?; + writeln!(csv2, "c1,c2,c3,c4").unwrap(); + writeln!(csv2, "10,,3.14,true").unwrap(); // reading csv4 will set c3 to optional - writeln!(csv4, "c1,c2,c3")?; - writeln!(csv4, "10,\"foo\",")?; + writeln!(csv4, "c1,c2,c3").unwrap(); + writeln!(csv4, "10,\"foo\",").unwrap(); let schema = infer_schema_from_files( &[ @@ -1797,7 +1372,8 @@ mod tests { b',', Some(4), // only csv1 and csv2 should be read true, - )?; + ) + .unwrap(); assert_eq!(schema.fields().len(), 4); assert!(schema.field(0).is_nullable()); @@ -1809,8 +1385,6 @@ mod tests { assert_eq!(&DataType::Utf8, schema.field(1).data_type()); assert_eq!(&DataType::Float64, schema.field(2).data_type()); assert_eq!(&DataType::Boolean, schema.field(3).data_type()); - - Ok(()) } #[test] diff --git a/arrow/src/csv/writer.rs b/arrow-csv/src/writer.rs similarity index 81% rename from arrow/src/csv/writer.rs rename to 
arrow-csv/src/writer.rs index b2d02fe84947..674b333698bd 100644 --- a/arrow/src/csv/writer.rs +++ b/arrow-csv/src/writer.rs @@ -23,11 +23,11 @@ //! Example: //! //! ``` -//! use arrow::array::*; -//! use arrow::csv; -//! use arrow::datatypes::*; -//! use arrow::record_batch::RecordBatch; -//! use std::sync::Arc; +//! # use arrow_array::*; +//! # use arrow_array::types::*; +//! # use arrow_csv::Writer; +//! # use arrow_schema::*; +//! # use std::sync::Arc; //! //! let schema = Schema::new(vec![ //! Field::new("c1", DataType::Utf8, false), @@ -56,7 +56,7 @@ //! //! let mut output = Vec::with_capacity(1024); //! -//! let mut writer = csv::Writer::new(&mut output); +//! let mut writer = Writer::new(&mut output); //! let batches = vec![&batch, &batch]; //! for batch in batches { //! writer.write(batch).unwrap(); @@ -64,15 +64,14 @@ //! ``` use arrow_array::timezone::Tz; +use arrow_array::types::*; +use arrow_array::*; +use arrow_cast::display::{lexical_to_string, make_string_from_decimal}; +use arrow_schema::*; use chrono::{DateTime, Utc}; use std::io::Write; -use crate::array::*; -use crate::csv::map_csv_error; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::record_batch::RecordBatch; -use crate::util::display::{lexical_to_string, make_string_from_decimal}; +use crate::map_csv_error; const DEFAULT_DATE_FORMAT: &str = "%F"; const DEFAULT_TIME_FORMAT: &str = "%T"; @@ -81,7 +80,7 @@ const DEFAULT_TIMESTAMP_TZ_FORMAT: &str = "%FT%H:%M:%S.%9f%:z"; fn write_primitive_value(array: &ArrayRef, i: usize) -> String where - T: ArrowNumericType, + T: ArrowPrimitiveType, T::Native: lexical_core::ToLexical, { let c = array.as_any().downcast_ref::>().unwrap(); @@ -92,7 +91,7 @@ where #[derive(Debug)] pub struct Writer { /// The object to write to - writer: csv_crate::Writer, + writer: csv::Writer, /// Whether file should be written with headers. 
Defaults to `true` has_headers: bool, /// The date format for date arrays @@ -115,7 +114,7 @@ impl Writer { /// Create a new CsvWriter from a writable object, with default options pub fn new(writer: W) -> Self { let delimiter = b','; - let mut builder = csv_crate::WriterBuilder::new(); + let mut builder = csv::WriterBuilder::new(); let writer = builder.delimiter(delimiter).from_writer(writer); Writer { writer, @@ -135,7 +134,7 @@ impl Writer { batch: &[ArrayRef], row_index: usize, buffer: &mut [String], - ) -> Result<()> { + ) -> Result<(), ArrowError> { // TODO: it'd be more efficient if we could create `record: Vec<&[u8]> for (col_index, item) in buffer.iter_mut().enumerate() { let col = &batch[col_index]; @@ -242,7 +241,7 @@ impl Writer { time_zone: Option<&String>, row_index: usize, col: &ArrayRef, - ) -> Result { + ) -> Result { use TimeUnit::*; let datetime = match time_unit { Second => col @@ -283,7 +282,7 @@ impl Writer { } /// Write a vector of record batches to a writable object - pub fn write(&mut self, batch: &RecordBatch) -> Result<()> { + pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { let num_columns = batch.num_columns(); if self.beginning { if self.has_headers { @@ -305,7 +304,7 @@ impl Writer { .iter() .map(|array| match array.data_type() { DataType::Dictionary(_, value_type) => { - crate::compute::kernels::cast::cast(array, value_type) + arrow_cast::cast(array, value_type) .expect("cannot cast dictionary to underlying values") } _ => array.clone(), @@ -365,16 +364,14 @@ impl WriterBuilder { /// # Example /// /// ``` - /// extern crate arrow; + /// # use arrow_csv::{Writer, WriterBuilder}; + /// # use std::fs::File; /// - /// use arrow::csv; - /// use std::fs::File; - /// - /// fn example() -> csv::Writer { + /// fn example() -> Writer { /// let file = File::create("target/out.csv").unwrap(); /// /// // create a builder that doesn't write headers - /// let builder = csv::WriterBuilder::new().has_headers(false); + /// let builder = WriterBuilder::new().has_headers(false); /// let writer = builder.build(file); /// /// writer @@ -423,7 +420,7 @@ impl WriterBuilder { /// Create a new `Writer` pub fn build(self, writer: W) -> Writer { let delimiter = self.delimiter.unwrap_or(b','); - let mut builder = csv_crate::WriterBuilder::new(); + let mut builder = csv::WriterBuilder::new(); let writer = builder.delimiter(delimiter).from_writer(writer); Writer { writer, @@ -452,13 +449,8 @@ impl WriterBuilder { mod tests { use super::*; - use crate::csv::Reader; - use crate::datatypes::{Field, Schema}; - #[cfg(feature = "chrono-tz")] - use crate::util::string_writer::StringWriter; - use crate::util::test_util::get_temp_file; - use std::fs::File; - use std::io::{Cursor, Read}; + use crate::Reader; + use std::io::{Cursor, Read, Seek}; use std::sync::Arc; #[test] @@ -512,15 +504,17 @@ mod tests { ) .unwrap(); - let file = get_temp_file("columns.csv", &[]); + let mut file = tempfile::tempfile().unwrap(); - let mut writer = Writer::new(file); + let mut writer = Writer::new(&mut file); let batches = vec![&batch, &batch]; for batch in batches { writer.write(batch).unwrap(); } + drop(writer); + // check that file was written successfully - let mut file = File::open("target/debug/testdata/columns.csv").unwrap(); + file.rewind().unwrap(); let mut buffer: Vec = vec![]; file.read_to_end(&mut buffer).unwrap(); @@ -571,20 +565,21 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo ) .unwrap(); - let file = get_temp_file("custom_options.csv", &[]); 
+ let mut file = tempfile::tempfile().unwrap(); let builder = WriterBuilder::new() .has_headers(false) .with_delimiter(b'|') .with_time_format("%r".to_string()); - let mut writer = builder.build(file); + let mut writer = builder.build(&mut file); let batches = vec![&batch]; for batch in batches { writer.write(batch).unwrap(); } + drop(writer); // check that file was written successfully - let mut file = File::open("target/debug/testdata/custom_options.csv").unwrap(); + file.rewind().unwrap(); let mut buffer: Vec = vec![]; file.read_to_end(&mut buffer).unwrap(); @@ -595,105 +590,6 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo ); } - #[cfg(feature = "chrono-tz")] - #[test] - fn test_export_csv_timestamps() { - let schema = Schema::new(vec![ - Field::new( - "c1", - DataType::Timestamp( - TimeUnit::Millisecond, - Some("Australia/Sydney".to_string()), - ), - true, - ), - Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None), true), - ]); - - let c1 = TimestampMillisecondArray::from( - // 1555584887 converts to 2019-04-18, 20:54:47 in time zone Australia/Sydney (AEST). - // The offset (difference to UTC) is +10:00. - // 1635577147 converts to 2021-10-30 17:59:07 in time zone Australia/Sydney (AEDT) - // The offset (difference to UTC) is +11:00. Note that daylight savings is in effect on 2021-10-30. - // - vec![Some(1555584887378), Some(1635577147000)], - ) - .with_timezone("Australia/Sydney".to_string()); - let c2 = TimestampMillisecondArray::from(vec![ - Some(1555584887378), - Some(1635577147000), - ]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]) - .unwrap(); - - let sw = StringWriter::new(); - let mut writer = Writer::new(sw); - let batches = vec![&batch]; - for batch in batches { - writer.write(batch).unwrap(); - } - - let left = "c1,c2 -2019-04-18T20:54:47.378000000+10:00,2019-04-18T10:54:47.378000000 -2021-10-30T17:59:07.000000000+11:00,2021-10-30T06:59:07.000000000\n"; - let right = writer.writer.into_inner().map(|s| s.to_string()); - assert_eq!(Some(left.to_string()), right.ok()); - } - - #[cfg(not(feature = "chrono-tz"))] - #[test] - fn test_conversion_consistency() { - // test if we can serialize and deserialize whilst retaining the same type information/ precision - - let schema = Schema::new(vec![ - Field::new("c1", DataType::Date32, false), - Field::new("c2", DataType::Date64, false), - ]); - - let c1 = Date32Array::from(vec![3, 2, 1]); - let c2 = Date64Array::from(vec![3, 2, 1]); - - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![Arc::new(c1), Arc::new(c2)], - ) - .unwrap(); - - let builder = WriterBuilder::new().has_headers(false); - - let mut buf: Cursor> = Default::default(); - // drop the writer early to release the borrow. - { - let mut writer = builder.build(&mut buf); - writer.write(&batch).unwrap(); - } - buf.set_position(0); - - let mut reader = Reader::new( - buf, - Arc::new(schema), - false, - None, - 3, - // starting at row 2 and up to row 6. 
- None, - None, - None, - ); - let rb = reader.next().unwrap().unwrap(); - let c1 = rb.column(0).as_any().downcast_ref::().unwrap(); - let c2 = rb.column(1).as_any().downcast_ref::().unwrap(); - - let actual = c1.into_iter().collect::>(); - let expected = vec![Some(3), Some(2), Some(1)]; - assert_eq!(actual, expected); - let actual = c2.into_iter().collect::>(); - let expected = vec![Some(3), Some(2), Some(1)]; - assert_eq!(actual, expected); - } - - #[cfg(feature = "chrono-tz")] #[test] fn test_conversion_consistency() { // test if we can serialize and deserialize whilst retaining the same type information/ precision diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 6c30df6bd27d..cc9421de710d 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -46,6 +46,7 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] [dependencies] arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } arrow-cast = { version = "26.0.0", path = "../arrow-cast" } +arrow-csv = { version = "26.0.0", path = "../arrow-csv", optional = true } arrow-data = { version = "26.0.0", path = "../arrow-data" } arrow-schema = { version = "26.0.0", path = "../arrow-schema" } arrow-array = { version = "26.0.0", path = "../arrow-array" } @@ -57,10 +58,8 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.12", default-features = false } -csv_crate = { version = "1.1", default-features = false, optional = true, package = "csv" } regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } -lazy_static = { version = "1.4", default-features = false } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } chrono = { version = "0.4", default-features = false, features = ["clock"] } comfy-table = { version = "6.0", optional = true, default-features = false } @@ -75,7 +74,7 @@ features = ["prettyprint", "ipc_compression", "dyn_cmp_dict", "ffi", "pyarrow"] [features] default = ["csv", "ipc", "json"] ipc_compression = ["ipc", "arrow-ipc/lz4", "arrow-ipc/zstd"] -csv = ["csv_crate"] +csv = ["arrow-csv"] ipc = ["arrow-ipc"] json = ["serde_json"] simd = ["packed_simd"] @@ -265,5 +264,9 @@ harness = false required-features = ["test_utils"] [[test]] -name = "ipc_integration" +name = "ipc" required-features = ["test_utils", "ipc"] + +[[test]] +name = "csv" +required-features = ["csv"] diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index b2fa30d26d53..d1e0095840a5 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -306,7 +306,7 @@ pub mod bitmap { pub mod array; pub mod compute; #[cfg(feature = "csv")] -pub mod csv; +pub use arrow_csv as csv; pub mod datatypes; pub mod error; #[cfg(feature = "ffi")] diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs new file mode 100644 index 000000000000..11e1b30e1488 --- /dev/null +++ b/arrow/tests/csv.rs @@ -0,0 +1,486 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::fs::File; +use std::io::{Cursor, Read}; +use std::sync::Arc; + +use arrow_array::*; +use arrow_csv::{Reader, ReaderBuilder}; +use arrow_schema::*; + +#[test] +#[cfg(feature = "chrono-tz")] +fn test_export_csv_timestamps() { + let schema = Schema::new(vec![ + Field::new( + "c1", + DataType::Timestamp( + TimeUnit::Millisecond, + Some("Australia/Sydney".to_string()), + ), + true, + ), + Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None), true), + ]); + + let c1 = TimestampMillisecondArray::from( + // 1555584887 converts to 2019-04-18, 20:54:47 in time zone Australia/Sydney (AEST). + // The offset (difference to UTC) is +10:00. + // 1635577147 converts to 2021-10-30 17:59:07 in time zone Australia/Sydney (AEDT) + // The offset (difference to UTC) is +11:00. Note that daylight savings is in effect on 2021-10-30. + // + vec![Some(1555584887378), Some(1635577147000)], + ) + .with_timezone("Australia/Sydney".to_string()); + let c2 = + TimestampMillisecondArray::from(vec![Some(1555584887378), Some(1635577147000)]); + let batch = + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); + + let mut sw = Vec::new(); + let mut writer = arrow_csv::Writer::new(&mut sw); + let batches = vec![&batch]; + for batch in batches { + writer.write(batch).unwrap(); + } + drop(writer); + + let left = "c1,c2 +2019-04-18T20:54:47.378000000+10:00,2019-04-18T10:54:47.378000000 +2021-10-30T17:59:07.000000000+11:00,2021-10-30T06:59:07.000000000\n"; + let right = String::from_utf8(sw).unwrap(); + assert_eq!(left, right); +} + +#[test] +fn test_csv() { + let _: Vec<()> = vec![None, Some("%Y-%m-%dT%H:%M:%S%.f%:z".to_string())] + .into_iter() + .map(|format| { + let schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ]); + + let file = File::open("test/data/uk_cities.csv").unwrap(); + let mut csv = Reader::new( + file, + Arc::new(schema.clone()), + false, + None, + 1024, + None, + None, + format, + ); + assert_eq!(Arc::new(schema), csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(57.653484, lat.value(0)); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); + }) + .collect(); +} + +#[test] +fn test_csv_schema_metadata() { + let mut metadata = std::collections::HashMap::new(); + metadata.insert("foo".to_owned(), "bar".to_owned()); + let schema = Schema::new_with_metadata( + vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ], + 
metadata.clone(), + ); + + let file = File::open("test/data/uk_cities.csv").unwrap(); + + let mut csv = Reader::new( + file, + Arc::new(schema.clone()), + false, + None, + 1024, + None, + None, + None, + ); + assert_eq!(Arc::new(schema), csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + assert_eq!(&metadata, batch.schema().metadata()); +} + +#[test] +fn test_csv_reader_with_decimal() { + let schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Decimal128(38, 6), false), + Field::new("lng", DataType::Decimal128(38, 6), false), + ]); + + let file = File::open("test/data/decimal_test.csv").unwrap(); + + let mut csv = + Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None); + let batch = csv.next().unwrap().unwrap(); + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("57.653484", lat.value_as_string(0)); + assert_eq!("53.002666", lat.value_as_string(1)); + assert_eq!("52.412811", lat.value_as_string(2)); + assert_eq!("51.481583", lat.value_as_string(3)); + assert_eq!("12.123456", lat.value_as_string(4)); + assert_eq!("50.760000", lat.value_as_string(5)); + assert_eq!("0.123000", lat.value_as_string(6)); + assert_eq!("123.000000", lat.value_as_string(7)); + assert_eq!("123.000000", lat.value_as_string(8)); + assert_eq!("-50.760000", lat.value_as_string(9)); +} + +#[test] +fn test_csv_from_buf_reader() { + let schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ]); + + let file_with_headers = File::open("test/data/uk_cities_with_headers.csv").unwrap(); + let file_without_headers = File::open("test/data/uk_cities.csv").unwrap(); + let both_files = file_with_headers + .chain(Cursor::new("\n".to_string())) + .chain(file_without_headers); + let mut csv = Reader::from_reader( + both_files, + Arc::new(schema), + true, + None, + 1024, + None, + None, + None, + ); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(74, batch.num_rows()); + assert_eq!(3, batch.num_columns()); +} + +#[test] +fn test_csv_with_schema_inference() { + let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); + + let builder = ReaderBuilder::new().has_header(true).infer_schema(None); + + let mut csv = builder.build(file).unwrap(); + let expected_schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, true), + Field::new("lat", DataType::Float64, true), + Field::new("lng", DataType::Float64, true), + ]); + assert_eq!(Arc::new(expected_schema), csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(57.653484, lat.value(0)); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); +} + +#[test] +fn test_csv_with_schema_inference_no_headers() { + let file = File::open("test/data/uk_cities.csv").unwrap(); + + let builder = ReaderBuilder::new().infer_schema(None); + + let mut csv = builder.build(file).unwrap(); + + // csv field names should be 'column_{number}' + let schema = csv.schema(); + assert_eq!("column_1", 
schema.field(0).name()); + assert_eq!("column_2", schema.field(1).name()); + assert_eq!("column_3", schema.field(2).name()); + let batch = csv.next().unwrap().unwrap(); + let batch_schema = batch.schema(); + + assert_eq!(schema, batch_schema); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(57.653484, lat.value(0)); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); +} + +#[test] +fn test_csv_builder_with_bounds() { + let file = File::open("test/data/uk_cities.csv").unwrap(); + + // Set the bounds to the lines 0, 1 and 2. + let mut csv = ReaderBuilder::new().with_bounds(0, 2).build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + // The value on line 0 is within the bounds + assert_eq!("Elgin, Scotland, the UK", city.value(0)); + + // The value on line 13 is outside of the bounds. Therefore + // the call to .value() will panic. + let result = std::panic::catch_unwind(|| city.value(13)); + assert!(result.is_err()); +} + +#[test] +fn test_csv_with_projection() { + let schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ]); + + let file = File::open("test/data/uk_cities.csv").unwrap(); + + let mut csv = Reader::new( + file, + Arc::new(schema), + false, + None, + 1024, + None, + Some(vec![0, 1]), + None, + ); + let projected_schema = Arc::new(Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + ])); + assert_eq!(projected_schema, csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(projected_schema, batch.schema()); + assert_eq!(37, batch.num_rows()); + assert_eq!(2, batch.num_columns()); +} + +#[test] +fn test_csv_with_dictionary() { + let schema = Schema::new(vec![ + Field::new( + "city", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ]); + + let file = File::open("test/data/uk_cities.csv").unwrap(); + + let mut csv = Reader::new( + file, + Arc::new(schema), + false, + None, + 1024, + None, + Some(vec![0, 1]), + None, + ); + let projected_schema = Arc::new(Schema::new(vec![ + Field::new( + "city", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new("lat", DataType::Float64, false), + ])); + assert_eq!(projected_schema, csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(projected_schema, batch.schema()); + assert_eq!(37, batch.num_rows()); + assert_eq!(2, batch.num_columns()); + + let strings = arrow_cast::cast(batch.column(0), &DataType::Utf8).unwrap(); + let strings = strings.as_any().downcast_ref::().unwrap(); + + assert_eq!(strings.value(0), "Elgin, Scotland, the UK"); + assert_eq!(strings.value(4), "Eastbourne, East Sussex, UK"); + assert_eq!(strings.value(29), "Uckfield, East Sussex, UK"); +} + +#[test] +fn test_nulls() { + let schema = Schema::new(vec![ + Field::new("c_int", DataType::UInt64, false), + Field::new("c_float", DataType::Float32, true), + 
Field::new("c_string", DataType::Utf8, false), + ]); + + let file = File::open("test/data/null_test.csv").unwrap(); + + let mut csv = Reader::new(file, Arc::new(schema), true, None, 1024, None, None, None); + let batch = csv.next().unwrap().unwrap(); + + assert!(!batch.column(1).is_null(0)); + assert!(!batch.column(1).is_null(1)); + assert!(batch.column(1).is_null(2)); + assert!(!batch.column(1).is_null(3)); + assert!(!batch.column(1).is_null(4)); +} + +#[test] +fn test_nulls_with_inference() { + let file = File::open("test/data/various_types.csv").unwrap(); + + let builder = ReaderBuilder::new() + .infer_schema(None) + .has_header(true) + .with_delimiter(b'|') + .with_batch_size(512) + .with_projection(vec![0, 1, 2, 3, 4, 5]); + + let mut csv = builder.build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + + assert_eq!(7, batch.num_rows()); + assert_eq!(6, batch.num_columns()); + + let schema = batch.schema(); + + assert_eq!(&DataType::Int64, schema.field(0).data_type()); + assert_eq!(&DataType::Float64, schema.field(1).data_type()); + assert_eq!(&DataType::Float64, schema.field(2).data_type()); + assert_eq!(&DataType::Boolean, schema.field(3).data_type()); + assert_eq!(&DataType::Date32, schema.field(4).data_type()); + assert_eq!(&DataType::Date64, schema.field(5).data_type()); + + let names: Vec<&str> = schema.fields().iter().map(|x| x.name().as_str()).collect(); + assert_eq!( + names, + vec![ + "c_int", + "c_float", + "c_string", + "c_bool", + "c_date", + "c_datetime" + ] + ); + + assert!(schema.field(0).is_nullable()); + assert!(schema.field(1).is_nullable()); + assert!(schema.field(2).is_nullable()); + assert!(schema.field(3).is_nullable()); + assert!(schema.field(4).is_nullable()); + assert!(schema.field(5).is_nullable()); + + assert!(!batch.column(1).is_null(0)); + assert!(!batch.column(1).is_null(1)); + assert!(batch.column(1).is_null(2)); + assert!(!batch.column(1).is_null(3)); + assert!(!batch.column(1).is_null(4)); +} + +#[test] +fn test_parse_invalid_csv() { + let file = File::open("test/data/various_types_invalid.csv").unwrap(); + + let schema = Schema::new(vec![ + Field::new("c_int", DataType::UInt64, false), + Field::new("c_float", DataType::Float32, false), + Field::new("c_string", DataType::Utf8, false), + Field::new("c_bool", DataType::Boolean, false), + ]); + + let builder = ReaderBuilder::new() + .with_schema(Arc::new(schema)) + .has_header(true) + .with_delimiter(b'|') + .with_batch_size(512) + .with_projection(vec![0, 1, 2, 3]); + + let mut csv = builder.build(file).unwrap(); + match csv.next() { + Some(e) => match e { + Err(e) => assert_eq!( + "ParseError(\"Error while parsing value 4.x4 for column 1 at line 4\")", + format!("{:?}", e) + ), + Ok(_) => panic!("should have failed"), + }, + None => panic!("should have failed"), + } +} diff --git a/arrow/tests/ipc_integration.rs b/arrow/tests/ipc.rs similarity index 100% rename from arrow/tests/ipc_integration.rs rename to arrow/tests/ipc.rs From 8d75101e3773e5d74c8d5cda356a9eaba34acf90 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 9 Nov 2022 18:10:24 +1300 Subject: [PATCH 0236/1411] Split out arrow-json (#3044) (#3049) * Split out arrow-json (#3044) * RAT * Fix feature * Revert no_run * RAT --- .github/workflows/arrow.yml | 5 + .github/workflows/arrow_flight.yml | 1 + .github/workflows/dev_pr/labeler.yml | 1 + .github/workflows/integration.yml | 1 + .github/workflows/miri.yaml | 1 + .github/workflows/parquet.yml | 1 + Cargo.toml | 3 +- 
arrow-json/Cargo.toml | 54 +++++++ .../src/json/mod.rs => arrow-json/src/lib.rs | 0 {arrow/src/json => arrow-json/src}/reader.rs | 136 +++++++++--------- {arrow/src/json => arrow-json/src}/writer.rs | 98 +++++++------ {arrow => arrow-json}/test/data/arrays.json | 0 {arrow => arrow-json}/test/data/basic.json | 0 .../test/data/basic_nulls.json | 0 .../test/data/list_string_dict_nested.json | 0 .../data/list_string_dict_nested_nulls.json | 0 .../test/data/mixed_arrays.json | 0 .../test/data/mixed_arrays.json.gz | Bin .../test/data/nested_structs.json | 0 arrow/Cargo.toml | 18 ++- arrow/benches/json_reader.rs | 5 +- arrow/src/lib.rs | 6 +- dev/release/rat_exclude_files.txt | 1 + 23 files changed, 194 insertions(+), 137 deletions(-) create mode 100644 arrow-json/Cargo.toml rename arrow/src/json/mod.rs => arrow-json/src/lib.rs (100%) rename {arrow/src/json => arrow-json/src}/reader.rs (97%) rename {arrow/src/json => arrow-json/src}/writer.rs (95%) rename {arrow => arrow-json}/test/data/arrays.json (100%) rename {arrow => arrow-json}/test/data/basic.json (100%) rename {arrow => arrow-json}/test/data/basic_nulls.json (100%) rename {arrow => arrow-json}/test/data/list_string_dict_nested.json (100%) rename {arrow => arrow-json}/test/data/list_string_dict_nested_nulls.json (100%) rename {arrow => arrow-json}/test/data/mixed_arrays.json (100%) rename {arrow => arrow-json}/test/data/mixed_arrays.json.gz (100%) rename {arrow => arrow-json}/test/data/nested_structs.json (100%) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 461e7e87ea56..2e1c64ebe3a0 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -35,6 +35,7 @@ on: - arrow-integration-test/** - arrow-ipc/** - arrow-csv/** + - arrow-json/** - .github/** jobs: @@ -67,6 +68,8 @@ jobs: run: cargo test -p arrow-ipc --all-features - name: Test arrow-csv with all features run: cargo test -p arrow-csv --all-features + - name: Test arrow-json with all features + run: cargo test -p arrow-json --all-features - name: Test arrow-integration-test with all features run: cargo test -p arrow-integration-test --all-features - name: Test arrow with default features @@ -179,5 +182,7 @@ jobs: run: cargo clippy -p arrow-ipc --all-targets --all-features -- -D warnings - name: Clippy arrow-csv with all features run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings + - name: Clippy arrow-json with all features + run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings - name: Clippy arrow run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index 1f830ccf2b26..2825d2400f1f 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -37,6 +37,7 @@ on: - arrow-flight/** - arrow-ipc/** - arrow-csv/** + - arrow-json/** - .github/** jobs: diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 04c7c080e019..d93932cd2334 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -25,6 +25,7 @@ arrow: - arrow-select/**/* - arrow-ipc/**/* - arrow-csv/**/* + - arrow-json/**/* arrow-flight: - arrow-flight/**/* diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 9418b9042835..3ece06b29238 100644 --- a/.github/workflows/integration.yml +++ 
b/.github/workflows/integration.yml @@ -33,6 +33,7 @@ on: - arrow-select/** - arrow-ipc/** - arrow-csv/** + - arrow-json/** - arrow-pyarrow-integration-testing/** - arrow-integration-test/** - arrow-integration-testing/** diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index e58ebdb35695..b1f5d85fc581 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -33,6 +33,7 @@ on: - arrow-select/** - arrow-ipc/** - arrow-csv/** + - arrow-json/** - .github/** jobs: diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 4f3cf5f8005c..5b0cc87440e9 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -36,6 +36,7 @@ on: - arrow-select/** - arrow-ipc/** - arrow-csv/** + - arrow-json/** - parquet/** - .github/** diff --git a/Cargo.toml b/Cargo.toml index 18497d043794..16b4cb7f89e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,12 +27,13 @@ members = [ "arrow-integration-test", "arrow-integration-testing", "arrow-ipc", + "arrow-json", "arrow-schema", "arrow-select", + "object_store", "parquet", "parquet_derive", "parquet_derive_test", - "object_store", ] # Enable the version 2 feature resolver, which avoids unifying features for targets that are not being built # diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml new file mode 100644 index 000000000000..0d8c91092103 --- /dev/null +++ b/arrow-json/Cargo.toml @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "arrow-json" +version = "26.0.0" +description = "Support for parsing JSON format into the Arrow format" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_json" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-array = { version = "26.0.0", path = "../arrow-array" } +arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "26.0.0", path = "../arrow-cast" } +arrow-data = { version = "26.0.0", path = "../arrow-data" } +arrow-schema = { version = "26.0.0", path = "../arrow-schema" } +half = { version = "2.1", default-features = false } +indexmap = { version = "1.9", default-features = false, features = ["std"] } +num = { version = "0.4", default-features = false, features = ["std"] } +serde_json = { version = "1.0", default-features = false, features = ["std"] } +chrono = { version = "0.4", default-features = false, features = ["clock"] } + +[dev-dependencies] +tempfile = "3.3" +flate2 = { version = "1", default-features = false, features = ["rust_backend"] } diff --git a/arrow/src/json/mod.rs b/arrow-json/src/lib.rs similarity index 100% rename from arrow/src/json/mod.rs rename to arrow-json/src/lib.rs diff --git a/arrow/src/json/reader.rs b/arrow-json/src/reader.rs similarity index 97% rename from arrow/src/json/reader.rs rename to arrow-json/src/reader.rs index 78c51559a7dd..b3af909ef46f 100644 --- a/arrow/src/json/reader.rs +++ b/arrow-json/src/reader.rs @@ -24,11 +24,10 @@ //! Example: //! //! ``` -//! use arrow::datatypes::{DataType, Field, Schema}; -//! use arrow::json; -//! use std::fs::File; -//! use std::io::BufReader; -//! use std::sync::Arc; +//! # use arrow_schema::*; +//! # use std::fs::File; +//! # use std::io::BufReader; +//! # use std::sync::Arc; //! //! let schema = Schema::new(vec![ //! Field::new("a", DataType::Float64, false), @@ -38,10 +37,10 @@ //! //! let file = File::open("test/data/basic.json").unwrap(); //! -//! let mut json = json::Reader::new( +//! let mut json = arrow_json::Reader::new( //! BufReader::new(file), //! Arc::new(schema), -//! json::reader::DecoderOptions::new(), +//! arrow_json::reader::DecoderOptions::new(), //! ); //! //! 
let batch = json.next().unwrap().unwrap(); @@ -55,13 +54,13 @@ use indexmap::set::IndexSet as HashSet; use serde_json::json; use serde_json::{map::Map as JsonMap, Value}; -use crate::buffer::MutableBuffer; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::record_batch::{RecordBatch, RecordBatchOptions}; -use crate::util::bit_util; -use crate::{array::*, buffer::Buffer}; +use arrow_array::builder::*; +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_cast::parse::Parser; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::*; #[derive(Debug, Clone)] enum InferredType { @@ -72,7 +71,7 @@ enum InferredType { } impl InferredType { - fn merge(&mut self, other: InferredType) -> Result<()> { + fn merge(&mut self, other: InferredType) -> Result<(), ArrowError> { match (self, other) { (InferredType::Array(s), InferredType::Array(o)) => { s.merge(*o)?; @@ -147,7 +146,7 @@ fn coerce_data_type(dt: Vec<&DataType>) -> DataType { }) } -fn generate_datatype(t: &InferredType) -> Result { +fn generate_datatype(t: &InferredType) -> Result { Ok(match t { InferredType::Scalar(hs) => coerce_data_type(hs.iter().collect()), InferredType::Object(spec) => DataType::Struct(generate_fields(spec)?), @@ -160,14 +159,16 @@ fn generate_datatype(t: &InferredType) -> Result { }) } -fn generate_fields(spec: &HashMap) -> Result> { +fn generate_fields( + spec: &HashMap, +) -> Result, ArrowError> { spec.iter() .map(|(k, types)| Ok(Field::new(k, generate_datatype(types)?, true))) .collect() } /// Generate schema from JSON field names and inferred data types -fn generate_schema(spec: HashMap) -> Result { +fn generate_schema(spec: HashMap) -> Result { Ok(Schema::new(generate_fields(&spec)?)) } @@ -178,7 +179,7 @@ fn generate_schema(spec: HashMap) -> Result { /// ``` /// use std::fs::File; /// use std::io::BufReader; -/// use arrow::json::reader::ValueIter; +/// use arrow_json::reader::ValueIter; /// /// let mut reader = /// BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); @@ -208,7 +209,7 @@ impl<'a, R: Read> ValueIter<'a, R> { } impl<'a, R: Read> Iterator for ValueIter<'a, R> { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { if let Some(max) = self.max_read_records { @@ -259,7 +260,7 @@ impl<'a, R: Read> Iterator for ValueIter<'a, R> { /// ``` /// use std::fs::File; /// use std::io::BufReader; -/// use arrow::json::reader::infer_json_schema_from_seekable; +/// use arrow_json::reader::infer_json_schema_from_seekable; /// /// let file = File::open("test/data/mixed_arrays.json").unwrap(); /// // file's cursor's offset at 0 @@ -270,7 +271,7 @@ impl<'a, R: Read> Iterator for ValueIter<'a, R> { pub fn infer_json_schema_from_seekable( reader: &mut BufReader, max_read_records: Option, -) -> Result { +) -> Result { let schema = infer_json_schema(reader, max_read_records); // return the reader seek back to the start reader.seek(SeekFrom::Start(0))?; @@ -292,7 +293,7 @@ pub fn infer_json_schema_from_seekable( /// use std::fs::File; /// use std::io::{BufReader, SeekFrom, Seek}; /// use flate2::read::GzDecoder; -/// use arrow::json::reader::infer_json_schema; +/// use arrow_json::reader::infer_json_schema; /// /// let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap(); /// @@ -307,7 +308,7 @@ pub fn infer_json_schema_from_seekable( pub fn infer_json_schema( reader: &mut BufReader, max_read_records: Option, -) -> Result { +) -> Result { 
infer_json_schema_from_iterator(ValueIter::new(reader, max_read_records)) } @@ -315,7 +316,7 @@ fn set_object_scalar_field_type( field_types: &mut HashMap, key: &str, ftype: DataType, -) -> Result<()> { +) -> Result<(), ArrowError> { if !field_types.contains_key(key) { field_types.insert(key.to_string(), InferredType::Scalar(HashSet::new())); } @@ -340,7 +341,7 @@ fn set_object_scalar_field_type( } } -fn infer_scalar_array_type(array: &[Value]) -> Result { +fn infer_scalar_array_type(array: &[Value]) -> Result { let mut hs = HashSet::new(); for v in array { @@ -371,7 +372,7 @@ fn infer_scalar_array_type(array: &[Value]) -> Result { Ok(InferredType::Scalar(hs)) } -fn infer_nested_array_type(array: &[Value]) -> Result { +fn infer_nested_array_type(array: &[Value]) -> Result { let mut inner_ele_type = InferredType::Any; for v in array { @@ -391,7 +392,7 @@ fn infer_nested_array_type(array: &[Value]) -> Result { Ok(InferredType::Array(Box::new(inner_ele_type))) } -fn infer_struct_array_type(array: &[Value]) -> Result { +fn infer_struct_array_type(array: &[Value]) -> Result { let mut field_types = HashMap::new(); for v in array { @@ -411,7 +412,7 @@ fn infer_struct_array_type(array: &[Value]) -> Result { Ok(InferredType::Object(field_types)) } -fn infer_array_element_type(array: &[Value]) -> Result { +fn infer_array_element_type(array: &[Value]) -> Result { match array.iter().take(1).next() { None => Ok(InferredType::Any), // empty array, return any type that can be updated later Some(a) => match a { @@ -425,7 +426,7 @@ fn infer_array_element_type(array: &[Value]) -> Result { fn collect_field_types_from_object( field_types: &mut HashMap, map: &JsonMap, -) -> Result<()> { +) -> Result<(), ArrowError> { for (k, v) in map { match v { Value::Array(array) => { @@ -532,9 +533,9 @@ fn collect_field_types_from_object( /// The reason we diverge here is because we don't have utilities to deal with JSON data once it's /// interpreted as Strings. We should match Spark's behavior once we added more JSON parsing /// kernels in the future. -pub fn infer_json_schema_from_iterator(value_iter: I) -> Result +pub fn infer_json_schema_from_iterator(value_iter: I) -> Result where - I: Iterator>, + I: Iterator>, { let mut field_types: HashMap = HashMap::new(); @@ -563,7 +564,7 @@ where /// /// # Examples /// ``` -/// use arrow::json::reader::{Decoder, DecoderOptions, ValueIter, infer_json_schema}; +/// use arrow_json::reader::{Decoder, DecoderOptions, ValueIter, infer_json_schema}; /// use std::fs::File; /// use std::io::{BufReader, Seek, SeekFrom}; /// use std::sync::Arc; @@ -673,9 +674,12 @@ impl Decoder { /// interator into a [`RecordBatch`]. /// /// Returns `None` if the input iterator is exhausted. 
- pub fn next_batch(&self, value_iter: &mut I) -> Result> + pub fn next_batch( + &self, + value_iter: &mut I, + ) -> Result, ArrowError> where - I: Iterator>, + I: Iterator>, { let batch_size = self.options.batch_size; let mut rows: Vec = Vec::with_capacity(batch_size); @@ -732,7 +736,7 @@ impl Decoder { rows: &[Value], col_name: &str, key_type: &DataType, - ) -> Result { + ) -> Result { match *key_type { DataType::Int8 => { let dtype = DataType::Dictionary( @@ -803,7 +807,7 @@ impl Decoder { data_type: &DataType, col_name: &str, rows: &[Value], - ) -> Result + ) -> Result where DT: ArrowPrimitiveType + ArrowDictionaryKeyType, { @@ -923,7 +927,7 @@ impl Decoder { col_name: &str, key_type: &DataType, value_type: &DataType, - ) -> Result { + ) -> Result { if let DataType::Utf8 = *value_type { match *key_type { DataType::Int8 => self.build_dictionary_array::(rows, col_name), @@ -959,7 +963,11 @@ impl Decoder { } } - fn build_boolean_array(&self, rows: &[Value], col_name: &str) -> Result { + fn build_boolean_array( + &self, + rows: &[Value], + col_name: &str, + ) -> Result { let mut builder = BooleanBuilder::with_capacity(rows.len()); for row in rows { if let Some(value) = row.get(col_name) { @@ -980,9 +988,9 @@ impl Decoder { &self, rows: &[Value], col_name: &str, - ) -> Result + ) -> Result where - T: ArrowNumericType, + T: ArrowPrimitiveType, T::Native: num::NumCast, { let format_string = self @@ -1019,7 +1027,7 @@ impl Decoder { &self, rows: &[Value], list_field: &Field, - ) -> Result { + ) -> Result { // build list offsets let mut cur_offset = OffsetSize::zero(); let list_len = rows.len(); @@ -1188,8 +1196,8 @@ impl Decoder { rows: &[Value], struct_fields: &[Field], projection: &Option>, - ) -> Result> { - let arrays: Result> = struct_fields + ) -> Result, ArrowError> { + let arrays: Result, ArrowError> = struct_fields .iter() .filter(|field| { projection @@ -1393,7 +1401,7 @@ impl Decoder { field_name: &str, map_type: &DataType, struct_field: &Field, - ) -> Result { + ) -> Result { // A map has the format {"key": "value"} where key is most commonly a string, // but could be a string, number or boolean (🤷🏾‍♂️) (e.g. {1: "value"}). 
// A map is also represented as a flattened contiguous array, with the number @@ -1488,7 +1496,7 @@ impl Decoder { &self, rows: &[Value], col_name: &str, - ) -> Result + ) -> Result where T::Native: num::NumCast, T: ArrowPrimitiveType + ArrowDictionaryKeyType, @@ -1512,7 +1520,7 @@ impl Decoder { /// Read the primitive list's values into ArrayData fn read_primitive_list_values(&self, rows: &[Value]) -> ArrayData where - T: ArrowPrimitiveType + ArrowNumericType, + T: ArrowPrimitiveType, T::Native: num::NumCast, { let values = rows @@ -1637,7 +1645,7 @@ impl Reader { /// Read the next batch of records #[allow(clippy::should_implement_trait)] - pub fn next(&mut self) -> Result> { + pub fn next(&mut self) -> Result, ArrowError> { self.decoder .next_batch(&mut ValueIter::new(&mut self.reader, None)) } @@ -1667,16 +1675,13 @@ impl ReaderBuilder { /// # Example /// /// ``` - /// extern crate arrow; - /// - /// use arrow::json; - /// use std::fs::File; + /// # use std::fs::File; /// - /// fn example() -> json::Reader { + /// fn example() -> arrow_json::Reader { /// let file = File::open("test/data/basic.json").unwrap(); /// /// // create a builder, inferring the schema with the first 100 records - /// let builder = json::ReaderBuilder::new().infer_schema(Some(100)); + /// let builder = arrow_json::ReaderBuilder::new().infer_schema(Some(100)); /// /// let reader = builder.build::(file).unwrap(); /// @@ -1723,7 +1728,7 @@ impl ReaderBuilder { } /// Create a new `Reader` from the `ReaderBuilder` - pub fn build(self, source: R) -> Result> + pub fn build(self, source: R) -> Result, ArrowError> where R: Read + Seek, { @@ -1743,7 +1748,7 @@ impl ReaderBuilder { } impl Iterator for Reader { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { self.next().transpose() @@ -1752,12 +1757,9 @@ impl Iterator for Reader { #[cfg(test)] mod tests { - use crate::{ - buffer::Buffer, - datatypes::DataType::{Dictionary, List}, - }; - use super::*; + use arrow_buffer::ToByteSlice; + use arrow_schema::DataType::{Dictionary, List}; use flate2::read::GzDecoder; use std::fs::File; use std::io::Cursor; @@ -2076,12 +2078,8 @@ mod tests { #[test] fn test_invalid_json_infer_schema() { - let re = infer_json_schema_from_seekable( - &mut BufReader::new( - File::open("test/data/uk_cities_with_headers.csv").unwrap(), - ), - None, - ); + let re = + infer_json_schema_from_seekable(&mut BufReader::new(Cursor::new(b"}")), None); assert_eq!( re.err().unwrap().to_string(), "Json error: Not valid JSON: expected value at line 1 column 1", @@ -2096,9 +2094,7 @@ mod tests { true, )])); let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/uk_cities_with_headers.csv").unwrap()) - .unwrap(); + let mut reader = builder.build(Cursor::new(b"}")).unwrap(); assert_eq!( reader.next().err().unwrap().to_string(), "Json error: Not valid JSON: expected value at line 1 column 1", @@ -2107,7 +2103,7 @@ mod tests { #[test] fn test_coersion_scalar_and_list() { - use crate::datatypes::DataType::*; + use arrow_schema::DataType::*; assert_eq!( List(Box::new(Field::new("item", Float64, true))), diff --git a/arrow/src/json/writer.rs b/arrow-json/src/writer.rs similarity index 95% rename from arrow/src/json/writer.rs rename to arrow-json/src/writer.rs index f622b0cce77f..69f626600392 100644 --- a/arrow/src/json/writer.rs +++ b/arrow-json/src/writer.rs @@ -27,18 +27,15 @@ //! [`record_batches_to_json_rows`]: //! //! ``` -//! use std::sync::Arc; -//! -//! 
use arrow::array::Int32Array; -//! use arrow::datatypes::{DataType, Field, Schema}; -//! use arrow::json; -//! use arrow::record_batch::RecordBatch; +//! # use std::sync::Arc; +//! # use arrow_array::{Int32Array, RecordBatch}; +//! # use arrow_schema::{DataType, Field, Schema}; //! //! let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); //! let a = Int32Array::from(vec![1, 2, 3]); //! let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap(); //! -//! let json_rows = json::writer::record_batches_to_json_rows(&[batch]).unwrap(); +//! let json_rows = arrow_json::writer::record_batches_to_json_rows(&[batch]).unwrap(); //! assert_eq!( //! serde_json::Value::Object(json_rows[1].clone()), //! serde_json::json!({"a": 2}), @@ -51,12 +48,9 @@ //! [`LineDelimitedWriter`]: //! //! ``` -//! use std::sync::Arc; -//! -//! use arrow::array::Int32Array; -//! use arrow::datatypes::{DataType, Field, Schema}; -//! use arrow::json; -//! use arrow::record_batch::RecordBatch; +//! # use std::sync::Arc; +//! # use arrow_array::{Int32Array, RecordBatch}; +//! # use arrow_schema::{DataType, Field, Schema}; //! //! let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); //! let a = Int32Array::from(vec![1, 2, 3]); @@ -64,7 +58,7 @@ //! //! // Write the record batch out as JSON //! let buf = Vec::new(); -//! let mut writer = json::LineDelimitedWriter::new(buf); +//! let mut writer = arrow_json::LineDelimitedWriter::new(buf); //! writer.write_batches(&vec![batch]).unwrap(); //! writer.finish().unwrap(); //! @@ -80,12 +74,9 @@ //! [`ArrayWriter`]: //! //! ``` -//! use std::sync::Arc; -//! -//! use arrow::array::Int32Array; -//! use arrow::datatypes::{DataType, Field, Schema}; -//! use arrow::json; -//! use arrow::record_batch::RecordBatch; +//! # use std::sync::Arc; +//! # use arrow_array::{Int32Array, RecordBatch}; +//! use arrow_schema::{DataType, Field, Schema}; //! //! let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); //! let a = Int32Array::from(vec![1, 2, 3]); @@ -93,7 +84,7 @@ //! //! // Write the record batch out as a JSON array //! let buf = Vec::new(); -//! let mut writer = json::ArrayWriter::new(buf); +//! let mut writer = arrow_json::ArrayWriter::new(buf); //! writer.write_batches(&vec![batch]).unwrap(); //! writer.finish().unwrap(); //! 
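A hedged illustration, not part of the diff: the doc examples above write the same three-row batch with both writers, and the only difference is the framing of the output. LineDelimitedWriter emits newline-delimited JSON objects, while ArrayWriter wraps the same rows in a single JSON array:

    {"a":1}
    {"a":2}
    {"a":3}

versus

    [{"a":1},{"a":2},{"a":3}]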
@@ -108,13 +99,13 @@ use std::{fmt::Debug, io::Write}; use serde_json::map::Map as JsonMap; use serde_json::Value; -use crate::array::*; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::json::JsonSerializable; -use crate::record_batch::RecordBatch; +use crate::JsonSerializable; +use arrow_array::cast::*; +use arrow_array::types::*; +use arrow_array::*; +use arrow_schema::*; -fn primitive_array_to_json(array: &ArrayRef) -> Result> +fn primitive_array_to_json(array: &ArrayRef) -> Result, ArrowError> where T: ArrowPrimitiveType, T::Native: JsonSerializable, @@ -131,7 +122,7 @@ where fn struct_array_to_jsonmap_array( array: &StructArray, row_count: usize, -) -> Result>> { +) -> Result>, ArrowError> { let inner_col_names = array.column_names(); let mut inner_objs = iter::repeat(JsonMap::new()) @@ -150,7 +141,7 @@ fn struct_array_to_jsonmap_array( } /// Converts an arrow [`ArrayRef`] into a `Vec` of Serde JSON [`serde_json::Value`]'s -pub fn array_to_json_array(array: &ArrayRef) -> Result> { +pub fn array_to_json_array(array: &ArrayRef) -> Result, ArrowError> { match array.data_type() { DataType::Null => Ok(iter::repeat(Value::Null).take(array.len()).collect()), DataType::Boolean => Ok(as_boolean_array(array) @@ -269,7 +260,7 @@ fn set_column_for_json_rows( row_count: usize, array: &ArrayRef, col_name: &str, -) -> Result<()> { +) -> Result<(), ArrowError> { match array.data_type() { DataType::Int8 => { set_column_by_primitive_type::(rows, row_count, array, col_name); @@ -474,7 +465,7 @@ fn set_column_for_json_rows( rows.iter_mut() .zip(listarr.iter()) .take(row_count) - .try_for_each(|(row, maybe_value)| -> Result<()> { + .try_for_each(|(row, maybe_value)| -> Result<(), ArrowError> { if let Some(v) = maybe_value { row.insert( col_name.to_string(), @@ -489,7 +480,7 @@ fn set_column_for_json_rows( rows.iter_mut() .zip(listarr.iter()) .take(row_count) - .try_for_each(|(row, maybe_value)| -> Result<()> { + .try_for_each(|(row, maybe_value)| -> Result<(), ArrowError> { if let Some(v) = maybe_value { let val = array_to_json_array(&v)?; row.insert(col_name.to_string(), Value::Array(val)); @@ -499,7 +490,7 @@ fn set_column_for_json_rows( } DataType::Dictionary(_, value_type) => { let slice = array.slice(0, row_count); - let hydrated = crate::compute::kernels::cast::cast(&slice, value_type) + let hydrated = arrow_cast::cast::cast(&slice, value_type) .expect("cannot cast dictionary to underlying values"); set_column_for_json_rows(rows, row_count, &hydrated, col_name)?; } @@ -555,7 +546,7 @@ fn set_column_for_json_rows( /// [`JsonMap`]s (objects) pub fn record_batches_to_json_rows( batches: &[RecordBatch], -) -> Result>> { +) -> Result>, ArrowError> { let mut rows: Vec> = iter::repeat(JsonMap::new()) .take(batches.iter().map(|b| b.num_rows()).sum()) .collect(); @@ -581,24 +572,28 @@ pub fn record_batches_to_json_rows( pub trait JsonFormat: Debug + Default { #[inline] /// write any bytes needed at the start of the file to the writer - fn start_stream(&self, _writer: &mut W) -> Result<()> { + fn start_stream(&self, _writer: &mut W) -> Result<(), ArrowError> { Ok(()) } #[inline] /// write any bytes needed for the start of each row - fn start_row(&self, _writer: &mut W, _is_first_row: bool) -> Result<()> { + fn start_row( + &self, + _writer: &mut W, + _is_first_row: bool, + ) -> Result<(), ArrowError> { Ok(()) } #[inline] /// write any bytes needed for the end of each row - fn end_row(&self, _writer: &mut W) -> Result<()> { + fn end_row(&self, _writer: &mut W) -> Result<(), 
ArrowError> { Ok(()) } /// write any bytes needed for the start of each row - fn end_stream(&self, _writer: &mut W) -> Result<()> { + fn end_stream(&self, _writer: &mut W) -> Result<(), ArrowError> { Ok(()) } } @@ -614,7 +609,7 @@ pub trait JsonFormat: Debug + Default { pub struct LineDelimited {} impl JsonFormat for LineDelimited { - fn end_row(&self, writer: &mut W) -> Result<()> { + fn end_row(&self, writer: &mut W) -> Result<(), ArrowError> { writer.write_all(b"\n")?; Ok(()) } @@ -629,19 +624,23 @@ impl JsonFormat for LineDelimited { pub struct JsonArray {} impl JsonFormat for JsonArray { - fn start_stream(&self, writer: &mut W) -> Result<()> { + fn start_stream(&self, writer: &mut W) -> Result<(), ArrowError> { writer.write_all(b"[")?; Ok(()) } - fn start_row(&self, writer: &mut W, is_first_row: bool) -> Result<()> { + fn start_row( + &self, + writer: &mut W, + is_first_row: bool, + ) -> Result<(), ArrowError> { if !is_first_row { writer.write_all(b",")?; } Ok(()) } - fn end_stream(&self, writer: &mut W) -> Result<()> { + fn end_stream(&self, writer: &mut W) -> Result<(), ArrowError> { writer.write_all(b"]")?; Ok(()) } @@ -692,7 +691,7 @@ where } /// Write a single JSON row to the output writer - pub fn write_row(&mut self, row: &Value) -> Result<()> { + pub fn write_row(&mut self, row: &Value) -> Result<(), ArrowError> { let is_first_row = !self.started; if !self.started { self.format.start_stream(&mut self.writer)?; @@ -709,7 +708,7 @@ where } /// Convert the `RecordBatch` into JSON rows, and write them to the output - pub fn write(&mut self, batch: RecordBatch) -> Result<()> { + pub fn write(&mut self, batch: RecordBatch) -> Result<(), ArrowError> { for row in record_batches_to_json_rows(&[batch])? { self.write_row(&Value::Object(row))?; } @@ -717,7 +716,7 @@ where } /// Convert the [`RecordBatch`] into JSON rows, and write them to the output - pub fn write_batches(&mut self, batches: &[RecordBatch]) -> Result<()> { + pub fn write_batches(&mut self, batches: &[RecordBatch]) -> Result<(), ArrowError> { for row in record_batches_to_json_rows(batches)? { self.write_row(&Value::Object(row))?; } @@ -727,7 +726,7 @@ where /// Finishes the output stream. This function must be called after /// all record batches have been produced. (e.g. producing the final `']'` if writing /// arrays. 
- pub fn finish(&mut self) -> Result<()> { + pub fn finish(&mut self) -> Result<(), ArrowError> { if self.started && !self.finished { self.format.end_stream(&mut self.writer)?; self.finished = true; @@ -743,15 +742,14 @@ where #[cfg(test)] mod tests { - use std::convert::TryFrom; use std::fs::{read_to_string, File}; use std::sync::Arc; + use crate::reader::*; + use arrow_buffer::{Buffer, ToByteSlice}; + use arrow_data::ArrayData; use serde_json::json; - use crate::buffer::*; - use crate::json::reader::*; - use super::*; /// Asserts that the NDJSON `input` is semantically identical to `expected` diff --git a/arrow/test/data/arrays.json b/arrow-json/test/data/arrays.json similarity index 100% rename from arrow/test/data/arrays.json rename to arrow-json/test/data/arrays.json diff --git a/arrow/test/data/basic.json b/arrow-json/test/data/basic.json similarity index 100% rename from arrow/test/data/basic.json rename to arrow-json/test/data/basic.json diff --git a/arrow/test/data/basic_nulls.json b/arrow-json/test/data/basic_nulls.json similarity index 100% rename from arrow/test/data/basic_nulls.json rename to arrow-json/test/data/basic_nulls.json diff --git a/arrow/test/data/list_string_dict_nested.json b/arrow-json/test/data/list_string_dict_nested.json similarity index 100% rename from arrow/test/data/list_string_dict_nested.json rename to arrow-json/test/data/list_string_dict_nested.json diff --git a/arrow/test/data/list_string_dict_nested_nulls.json b/arrow-json/test/data/list_string_dict_nested_nulls.json similarity index 100% rename from arrow/test/data/list_string_dict_nested_nulls.json rename to arrow-json/test/data/list_string_dict_nested_nulls.json diff --git a/arrow/test/data/mixed_arrays.json b/arrow-json/test/data/mixed_arrays.json similarity index 100% rename from arrow/test/data/mixed_arrays.json rename to arrow-json/test/data/mixed_arrays.json diff --git a/arrow/test/data/mixed_arrays.json.gz b/arrow-json/test/data/mixed_arrays.json.gz similarity index 100% rename from arrow/test/data/mixed_arrays.json.gz rename to arrow-json/test/data/mixed_arrays.json.gz diff --git a/arrow/test/data/nested_structs.json b/arrow-json/test/data/nested_structs.json similarity index 100% rename from arrow/test/data/nested_structs.json rename to arrow-json/test/data/nested_structs.json diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index cc9421de710d..d5392673e299 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -44,16 +44,15 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] +arrow-array = { version = "26.0.0", path = "../arrow-array" } arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } arrow-cast = { version = "26.0.0", path = "../arrow-cast" } arrow-csv = { version = "26.0.0", path = "../arrow-csv", optional = true } arrow-data = { version = "26.0.0", path = "../arrow-data" } +arrow-ipc = { version = "26.0.0", path = "../arrow-ipc", optional = true } +arrow-json = { version = "26.0.0", path = "../arrow-json", optional = true } arrow-schema = { version = "26.0.0", path = "../arrow-schema" } -arrow-array = { version = "26.0.0", path = "../arrow-array" } arrow-select = { version = "26.0.0", path = "../arrow-select" } -arrow-ipc = { version = "26.0.0", path = "../arrow-ipc", optional = true } -serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } -indexmap = { version = "1.9", default-features = false, 
features = ["std"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } @@ -64,9 +63,8 @@ packed_simd = { version = "0.3", default-features = false, optional = true, pack chrono = { version = "0.4", default-features = false, features = ["clock"] } comfy-table = { version = "6.0", optional = true, default-features = false } pyo3 = { version = "0.17", default-features = false, optional = true } -lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } multiversion = { version = "0.6.1", default-features = false } -bitflags = { version = "1.2.1", default-features = false } +bitflags = { version = "1.2.1", default-features = false, optional = true } [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "dyn_cmp_dict", "ffi", "pyarrow"] @@ -76,7 +74,7 @@ default = ["csv", "ipc", "json"] ipc_compression = ["ipc", "arrow-ipc/lz4", "arrow-ipc/zstd"] csv = ["arrow-csv"] ipc = ["arrow-ipc"] -json = ["serde_json"] +json = ["arrow-json"] simd = ["packed_simd"] prettyprint = ["comfy-table"] # The test utils feature enables code used in benchmarks and tests but @@ -90,7 +88,7 @@ pyarrow = ["pyo3", "ffi"] # but is run as part of our CI checks force_validate = ["arrow-data/force_validate"] # Enable ffi support -ffi = [] +ffi = ["bitflags"] # Enable dyn-comparison of dictionary arrays with other arrays # Note: this does not impact comparison against scalars dyn_cmp_dict = [] @@ -100,9 +98,9 @@ dyn_arith_dict = [] chrono-tz = ["arrow-array/chrono-tz"] [dev-dependencies] -rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } +chrono = { version = "0.4", default-features = false, features = ["clock"] } criterion = { version = "0.4", default-features = false } -flate2 = { version = "1", default-features = false, features = ["rust_backend"] } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } tempfile = { version = "3", default-features = false } [build-dependencies] diff --git a/arrow/benches/json_reader.rs b/arrow/benches/json_reader.rs index ef3ddf0537bb..7bc3f4179fef 100644 --- a/arrow/benches/json_reader.rs +++ b/arrow/benches/json_reader.rs @@ -15,13 +15,10 @@ // specific language governing permissions and limitations // under the License. -extern crate arrow; -extern crate criterion; - use criterion::*; use arrow::datatypes::*; -use arrow::json::ReaderBuilder; +use arrow_json::ReaderBuilder; use std::io::Cursor; use std::sync::Arc; diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index d1e0095840a5..1b2ff0684a66 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -34,7 +34,9 @@ //! * [`arrow-array`][arrow_array] - type-safe arrow array abstractions //! * [`arrow-buffer`][arrow_buffer] - buffer abstractions for arrow arrays //! * [`arrow-cast`][arrow_cast] - cast kernels for arrow arrays +//! * [`arrow-csv`][arrow_csv] - read/write CSV to arrow format //! * [`arrow-data`][arrow_data] - the underlying data of arrow arrays +//! * [`arrow-json`][arrow_json] - read/write JSON to arrow format //! * [`arrow-schema`][arrow_schema] - the logical types for arrow arrays //! * [`arrow-select`][arrow_select] - selection kernels for arrow arrays //! 
@@ -315,8 +317,8 @@ pub mod ffi; pub mod ffi_stream; #[cfg(feature = "ipc")] pub use arrow_ipc as ipc; -#[cfg(feature = "serde_json")] -pub mod json; +#[cfg(feature = "json")] +pub use arrow_json as json; #[cfg(feature = "pyarrow")] pub mod pyarrow; diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index bafee11edb7e..0ca2ab91a5e8 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -3,6 +3,7 @@ testing/* target/* dev/release/rat_exclude_files.txt arrow/test/data/* +arrow-json/test/data/* arrow/test/dependency/* arrow-integration-test/data/* parquet_derive/test/dependency/* From 6b3a0a2877db25bc39818d555a8f014f9b63b890 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 9 Nov 2022 22:00:37 +1300 Subject: [PATCH 0237/1411] Faster f64 equality (#3060) --- arrow-array/src/arithmetic.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/arithmetic.rs b/arrow-array/src/arithmetic.rs index e596c0064369..2a8db2dd1e38 100644 --- a/arrow-array/src/arithmetic.rs +++ b/arrow-array/src/arithmetic.rs @@ -306,7 +306,10 @@ macro_rules! native_type_float_op { } fn is_eq(self, rhs: Self) -> bool { - self.total_cmp(&rhs).is_eq() + // Equivalent to `self.total_cmp(&rhs).is_eq()` + // but LLVM isn't able to realise this is bitwise equality + // https://rust.godbolt.org/z/347nWGxoW + self.to_bits() == rhs.to_bits() } fn is_ne(self, rhs: Self) -> bool { From 6057cf7ead11d66c7eeb15aed965b632afff07c4 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 9 Nov 2022 02:11:00 -0800 Subject: [PATCH 0238/1411] Fix null_count computation in binary (#3062) --- arrow/src/compute/kernels/arithmetic.rs | 57 +++++++++++++++++++++++++ arrow/src/compute/kernels/arity.rs | 4 +- 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index d12a0c1964fd..b310d4fbf8cd 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -3044,4 +3044,61 @@ mod tests { let c = add(&a, &b).unwrap(); assert_eq!(c, expected); } + + #[test] + fn test_resize_builder() { + let mut null_buffer_builder = BooleanBufferBuilder::new(16); + null_buffer_builder.append_slice(&[ + false, false, false, false, false, false, false, false, false, false, false, + false, false, true, true, true, + ]); + // `resize` resizes the buffer length to the ceil of byte numbers. + // So the underlying buffer is not changed. + null_buffer_builder.resize(13); + assert_eq!(null_buffer_builder.len(), 13); + + let null_buffer = null_buffer_builder.finish(); + + // `count_set_bits` counts 1-bits in entire buffer. Because above `resize` doesn't + // actually truncate the buffer, `count_set_bits` still return 3. + assert_eq!(null_buffer.count_set_bits(), 3); + // `count_set_bits_offset` takes len in bits as parameter. 
+ assert_eq!(null_buffer.count_set_bits_offset(0, 13), 0); + + let mut data_buffer_builder = BufferBuilder::::new(13); + data_buffer_builder.append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); + let data_buffer = data_buffer_builder.finish(); + + let arg1: Int32Array = ArrayDataBuilder::new(DataType::Int32) + .len(13) + .null_count(13) + .buffers(vec![data_buffer]) + .null_bit_buffer(Some(null_buffer)) + .build() + .unwrap() + .into(); + + assert_eq!(arg1.null_count(), 13); + + let mut data_buffer_builder = BufferBuilder::::new(13); + data_buffer_builder.append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); + let data_buffer = data_buffer_builder.finish(); + + let arg2: Int32Array = ArrayDataBuilder::new(DataType::Int32) + .len(13) + .null_count(0) + .buffers(vec![data_buffer]) + .null_bit_buffer(None) + .build() + .unwrap() + .into(); + + assert_eq!(arg2.null_count(), 0); + + let result_dyn = add_dyn(&arg1, &arg2).unwrap(); + let result = result_dyn.as_any().downcast_ref::().unwrap(); + + assert_eq!(result.len(), 13); + assert_eq!(result.null_count(), 13); + } } diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index 11ae5a204c5c..c99d2b727b8d 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -191,7 +191,7 @@ where let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); let null_count = null_buffer .as_ref() - .map(|x| len - x.count_set_bits()) + .map(|x| len - x.count_set_bits_offset(0, len)) .unwrap_or_default(); let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r)); @@ -241,7 +241,7 @@ where let null_count = null_buffer .as_ref() - .map(|x| len - x.count_set_bits()) + .map(|x| len - x.count_set_bits_offset(0, len)) .unwrap_or_default(); let mut buffer = BufferBuilder::::new(len); From 232fccc69ede31bc263f91d077a482ad38574304 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 9 Nov 2022 02:12:01 -0800 Subject: [PATCH 0239/1411] Replace temporal generic funections with dyn functions (#3046) --- arrow/src/compute/kernels/temporal.rs | 534 +++++++++++++++----------- 1 file changed, 307 insertions(+), 227 deletions(-) diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index 307f79606886..94ddc95c7590 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -308,46 +308,52 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - quarter_generic::(array) + quarter_internal(array) } /// Extracts the quarter of a given temporal array as an array of integersa within -/// the range of [1, 4]. -pub fn quarter_generic>( - array: A, -) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ +/// the range of [1, 4]. If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. 
+pub fn quarter_dyn(array: &dyn Array) -> Result { match array.data_type().clone() { - DataType::Dictionary(_, value_type) => { - quarter_internal::(array, value_type.as_ref()) + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + array => { + let quarter_values = quarter_dyn(array.values())?; + Ok(Arc::new(array.with_values(&quarter_values))) + } + dt => return_compute_error_with!("quarter does not support", dt), + ) + } + _ => { + downcast_temporal_array!( + array => { + quarter_internal(array) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("quarter does not support", dt), + ) } - dt => quarter_internal::(array, &dt), } } /// Extracts the quarter of a given temporal array as an array of integers -fn quarter_internal>( - array: A, - dt: &DataType, -) -> Result +fn quarter_internal(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, { let b = Int32Builder::with_capacity(array.len()); - match dt { + match array.data_type() { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::(iter, b, |t| { + Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| { t.quarter() as i32 })) } DataType::Timestamp(_, Some(tz)) => { let iter = ArrayIter::new(array); - extract_component_from_datetime_array::(iter, b, tz, |t| { + extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { t.quarter() as i32 }) } @@ -362,45 +368,52 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - month_generic::(array) + month_internal(array) } -/// Extracts the month of a given temporal array as an array of integers -pub fn month_generic>( - array: A, -) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ +/// Extracts the month of a given temporal array as an array of integers. +/// If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. 
+pub fn month_dyn(array: &dyn Array) -> Result { match array.data_type().clone() { - DataType::Dictionary(_, value_type) => { - month_internal::(array, value_type.as_ref()) + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + array => { + let month_values = month_dyn(array.values())?; + Ok(Arc::new(array.with_values(&month_values))) + } + dt => return_compute_error_with!("month does not support", dt), + ) + } + _ => { + downcast_temporal_array!( + array => { + month_internal(array) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("month does not support", dt), + ) } - dt => month_internal::(array, &dt), } } /// Extracts the month of a given temporal array as an array of integers -fn month_internal>( - array: A, - dt: &DataType, -) -> Result +fn month_internal(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, { let b = Int32Builder::with_capacity(array.len()); - match dt { + match array.data_type() { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::(iter, b, |t| { + Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| { t.month() as i32 })) } DataType::Timestamp(_, Some(tz)) => { let iter = ArrayIter::new(array); - extract_component_from_datetime_array::(iter, b, tz, |t| { + extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { t.month() as i32 }) } @@ -419,7 +432,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - num_days_from_monday_generic::(array) + num_days_from_monday_internal(array) } /// Extracts the day of week of a given temporal array as an array of @@ -428,18 +441,29 @@ where /// Monday is encoded as `0`, Tuesday as `1`, etc. /// /// See also [`num_days_from_sunday`] which starts at Sunday. -pub fn num_days_from_monday_generic>( - array: A, -) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ +/// +/// If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. +pub fn num_days_from_monday_dyn(array: &dyn Array) -> Result { match array.data_type().clone() { - DataType::Dictionary(_, value_type) => { - num_days_from_monday_internal::(array, value_type.as_ref()) + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + array => { + let values = num_days_from_monday_dyn(array.values())?; + Ok(Arc::new(array.with_values(&values))) + } + dt => return_compute_error_with!("num_days_from_monday does not support", dt), + ) + } + _ => { + downcast_temporal_array!( + array => { + num_days_from_monday_internal(array) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("num_days_from_monday does not support", dt), + ) } - dt => num_days_from_monday_internal::(array, &dt), } } @@ -449,25 +473,22 @@ where /// Monday is encoded as `0`, Tuesday as `1`, etc. /// /// See also [`num_days_from_sunday`] which starts at Sunday. 
-fn num_days_from_monday_internal>( - array: A, - dt: &DataType, -) -> Result +fn num_days_from_monday_internal(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, { let b = Int32Builder::with_capacity(array.len()); - match dt { + match array.data_type() { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::(iter, b, |t| { + Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| { t.num_days_from_monday() })) } DataType::Timestamp(_, Some(tz)) => { let iter = ArrayIter::new(array); - extract_component_from_datetime_array::(iter, b, tz, |t| { + extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { t.num_days_from_monday() }) } @@ -486,7 +507,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - num_days_from_sunday_generic::(array) + num_days_from_sunday_internal(array) } /// Extracts the day of week of a given temporal array as an array of @@ -495,18 +516,29 @@ where /// Sunday is encoded as `0`, Monday as `1`, etc. /// /// See also [`num_days_from_monday`] which starts at Monday. -pub fn num_days_from_sunday_generic>( - array: A, -) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ +/// +/// If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. +pub fn num_days_from_sunday_dyn(array: &dyn Array) -> Result { match array.data_type().clone() { - DataType::Dictionary(_, value_type) => { - num_days_from_sunday_internal::(array, value_type.as_ref()) + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + array => { + let values = num_days_from_sunday_dyn(array.values())?; + Ok(Arc::new(array.with_values(&values))) + } + dt => return_compute_error_with!("num_days_from_sunday does not support", dt), + ) + } + _ => { + downcast_temporal_array!( + array => { + num_days_from_sunday_internal(array) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("num_days_from_sunday does not support", dt), + ) } - dt => num_days_from_sunday_internal::(array, &dt), } } @@ -516,25 +548,22 @@ where /// Sunday is encoded as `0`, Monday as `1`, etc. /// /// See also [`num_days_from_monday`] which starts at Monday. -fn num_days_from_sunday_internal>( - array: A, - dt: &DataType, -) -> Result +fn num_days_from_sunday_internal(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, { let b = Int32Builder::with_capacity(array.len()); - match dt { + match array.data_type() { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::(iter, b, |t| { + Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| { t.num_days_from_sunday() })) } DataType::Timestamp(_, Some(tz)) => { let iter = ArrayIter::new(array); - extract_component_from_datetime_array::(iter, b, tz, |t| { + extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { t.num_days_from_sunday() }) } @@ -551,41 +580,50 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - day_generic::(array) + day_internal(array) } -/// Extracts the day of a given temporal array as an array of integers -pub fn day_generic>(array: A) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ +/// Extracts the day of a given temporal array as an array of integers. +/// If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. 
+pub fn day_dyn(array: &dyn Array) -> Result { match array.data_type().clone() { - DataType::Dictionary(_, value_type) => { - day_internal::(array, value_type.as_ref()) + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + array => { + let values = day_dyn(array.values())?; + Ok(Arc::new(array.with_values(&values))) + } + dt => return_compute_error_with!("day does not support", dt), + ) + } + _ => { + downcast_temporal_array!( + array => { + day_internal(array) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("day does not support", dt), + ) } - dt => day_internal::(array, &dt), } } /// Extracts the day of a given temporal array as an array of integers -fn day_internal>( - array: A, - dt: &DataType, -) -> Result +fn day_internal(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, { let b = Int32Builder::with_capacity(array.len()); - match dt { + match array.data_type() { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::(iter, b, |t| t.day() as i32)) + Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| t.day() as i32)) } DataType::Timestamp(_, Some(ref tz)) => { let iter = ArrayIter::new(array); - extract_component_from_datetime_array::(iter, b, tz, |t| { + extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { t.day() as i32 }) } @@ -600,46 +638,55 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - doy_generic::(array) + doy_internal(array) } /// Extracts the day of year of a given temporal array as an array of integers -/// The day of year that ranges from 1 to 366 -pub fn doy_generic>(array: A) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ +/// The day of year that ranges from 1 to 366. +/// If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. 
+pub fn doy_dyn(array: &dyn Array) -> Result { match array.data_type().clone() { - DataType::Dictionary(_, value_type) => { - doy_internal::(array, value_type.as_ref()) + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + array => { + let values = doy_dyn(array.values())?; + Ok(Arc::new(array.with_values(&values))) + } + dt => return_compute_error_with!("doy does not support", dt), + ) + } + _ => { + downcast_temporal_array!( + array => { + doy_internal(array) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("doy does not support", dt), + ) } - dt => doy_internal::(array, &dt), } } /// Extracts the day of year of a given temporal array as an array of integers /// The day of year that ranges from 1 to 366 -fn doy_internal>( - array: A, - dt: &DataType, -) -> Result +fn doy_internal(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, T::Native: ArrowNativeType, i64: From, { let b = Int32Builder::with_capacity(array.len()); - match dt { + match array.data_type() { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::(iter, b, |t| { + Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| { t.ordinal() as i32 })) } DataType::Timestamp(_, Some(ref tz)) => { let iter = ArrayIter::new(array); - extract_component_from_datetime_array::(iter, b, tz, |t| { + extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { t.ordinal() as i32 }) } @@ -653,7 +700,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_generic::(array, "minute", |t| t.minute() as i32) + time_fraction_internal(array, "minute", |t| t.minute() as i32) } /// Extracts the week of a given temporal primitive array as an array of integers @@ -662,37 +709,46 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - week_generic::(array) + week_internal(array) } -/// Extracts the week of a given temporal array as an array of integers -pub fn week_generic>(array: A) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ +/// Extracts the week of a given temporal array as an array of integers. +/// If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. 
+pub fn week_dyn(array: &dyn Array) -> Result { match array.data_type().clone() { - DataType::Dictionary(_, value_type) => { - week_internal::(array, value_type.as_ref()) + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + array => { + let values = week_dyn(array.values())?; + Ok(Arc::new(array.with_values(&values))) + } + dt => return_compute_error_with!("week does not support", dt), + ) + } + _ => { + downcast_temporal_array!( + array => { + week_internal(array) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!("week does not support", dt), + ) } - dt => week_internal::(array, &dt), } } /// Extracts the week of a given temporal array as an array of integers -fn week_internal>( - array: A, - dt: &DataType, -) -> Result +fn week_internal(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, { - match dt { + match array.data_type() { DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let b = Int32Builder::with_capacity(array.len()); let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::(iter, b, |t| { + Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| { t.iso_week().week() as i32 })) } @@ -706,7 +762,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_generic::(array, "second", |t| t.second() as i32) + time_fraction_internal(array, "second", |t| t.second() as i32) } /// Extracts the nanoseconds of a given temporal primitive array as an array of integers @@ -715,32 +771,46 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_generic::(array, "nanosecond", |t| t.nanosecond() as i32) + time_fraction_internal(array, "nanosecond", |t| t.nanosecond() as i32) +} + +/// Extracts the nanoseconds of a given temporal primitive array as an array of integers. +/// If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. 
+pub fn nanosecond_dyn(array: &dyn Array) -> Result { + time_fraction_dyn(array, "nanosecond", |t| t.nanosecond() as i32) } /// Extracts the time fraction of a given temporal array as an array of integers -fn time_fraction_generic, F>( - array: A, - name: &str, - op: F, -) -> Result +fn time_fraction_dyn(array: &dyn Array, name: &str, op: F) -> Result where F: Fn(NaiveDateTime) -> i32, - T: ArrowTemporalType + ArrowNumericType, - i64: From, { match array.data_type().clone() { - DataType::Dictionary(_, value_type) => { - time_fraction_internal::(array, value_type.as_ref(), name, op) + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + array => { + let values = time_fraction_dyn(array.values(), name, op)?; + Ok(Arc::new(array.with_values(&values))) + } + dt => return_compute_error_with!(format!("{} does not support", name), dt), + ) + } + _ => { + downcast_temporal_array!( + array => { + time_fraction_internal(array, name, op) + .map(|a| Arc::new(a) as ArrayRef) + } + dt => return_compute_error_with!(format!("{} does not support", name), dt), + ) } - dt => time_fraction_internal::(array, &dt, name, op), } } /// Extracts the time fraction of a given temporal array as an array of integers -fn time_fraction_internal, F>( - array: A, - dt: &DataType, +fn time_fraction_internal( + array: &PrimitiveArray, name: &str, op: F, ) -> Result @@ -750,14 +820,14 @@ where i64: From, { let b = Int32Builder::with_capacity(array.len()); - match dt { + match array.data_type() { DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::(iter, b, op)) + Ok(as_datetime_with_op::<_, T, _>(iter, b, op)) } DataType::Timestamp(_, Some(tz)) => { let iter = ArrayIter::new(array); - extract_component_from_datetime_array::(iter, b, tz, |t| { + extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { op(t.naive_local()) }) } @@ -768,24 +838,18 @@ where } } -pub fn minute_generic>( - array: A, -) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - time_fraction_generic::(array, "minute", |t| t.minute() as i32) +/// Extracts the minutes of a given temporal array as an array of integers. +/// If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. +pub fn minute_dyn(array: &dyn Array) -> Result { + time_fraction_dyn(array, "minute", |t| t.minute() as i32) } -pub fn second_generic>( - array: A, -) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - time_fraction_generic::(array, "second", |t| t.second() as i32) +/// Extracts the seconds of a given temporal array as an array of integers. +/// If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. 
+pub fn second_dyn(array: &dyn Array) -> Result { + time_fraction_dyn(array, "second", |t| t.second() as i32) } #[cfg(test)] @@ -1236,47 +1300,34 @@ mod tests { let expected = Arc::new(expected_dict) as ArrayRef; assert_eq!(&expected, &b); - let b = time_fraction_generic::( - dict.downcast_dict::().unwrap(), - "minute", - |t| t.minute() as i32, - ) - .unwrap(); + let b = time_fraction_dyn(&dict, "minute", |t| t.minute() as i32).unwrap(); - let b_old = minute_generic::( - dict.downcast_dict::().unwrap(), - ) - .unwrap(); + let b_old = minute_dyn(&dict).unwrap(); - let expected = Int32Array::from(vec![1, 1, 2, 3, 2]); - assert_eq!(expected, b); - assert_eq!(expected, b_old); + let expected_dict = + DictionaryArray::try_new(&keys, &Int32Array::from(vec![1, 2, 3])).unwrap(); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); + assert_eq!(&expected, &b_old); - let b = time_fraction_generic::( - dict.downcast_dict::().unwrap(), - "second", - |t| t.second() as i32, - ) - .unwrap(); + let b = time_fraction_dyn(&dict, "second", |t| t.second() as i32).unwrap(); - let b_old = second_generic::( - dict.downcast_dict::().unwrap(), - ) - .unwrap(); + let b_old = second_dyn(&dict).unwrap(); - let expected = Int32Array::from(vec![1, 1, 2, 3, 2]); - assert_eq!(expected, b); - assert_eq!(expected, b_old); + let expected_dict = + DictionaryArray::try_new(&keys, &Int32Array::from(vec![1, 2, 3])).unwrap(); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); + assert_eq!(&expected, &b_old); - let b = time_fraction_generic::( - dict.downcast_dict::().unwrap(), - "nanosecond", - |t| t.nanosecond() as i32, - ) - .unwrap(); + let b = + time_fraction_dyn(&dict, "nanosecond", |t| t.nanosecond() as i32).unwrap(); - let expected = Int32Array::from(vec![0, 0, 0, 0, 0]); - assert_eq!(expected, b); + let expected_dict = + DictionaryArray::try_new(&keys, &Int32Array::from(vec![0, 0, 0, 0, 0])) + .unwrap(); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); } #[test] @@ -1308,20 +1359,19 @@ mod tests { let keys = Int8Array::from_iter_values([0_i8, 1, 1, 0]); let dict = DictionaryArray::try_new(&keys, &a).unwrap(); - let b = quarter_generic::( - dict.downcast_dict::().unwrap(), - ) - .unwrap(); + let b = quarter_dyn(&dict).unwrap(); - let expected = Int32Array::from(vec![1, 3, 3, 1]); - assert_eq!(expected, b); + let expected_dict = + DictionaryArray::try_new(&keys, &Int32Array::from(vec![1, 3, 3, 1])).unwrap(); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); - let b = - month_generic::(dict.downcast_dict::().unwrap()) - .unwrap(); + let b = month_dyn(&dict).unwrap(); - let expected = Int32Array::from(vec![1, 8, 8, 1]); - assert_eq!(expected, b); + let expected_dict = + DictionaryArray::try_new(&keys, &Int32Array::from(vec![1, 8, 8, 1])).unwrap(); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); } #[test] @@ -1334,37 +1384,55 @@ mod tests { let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(1), Some(0), None]); let dict = DictionaryArray::try_new(&keys, &a).unwrap(); - let b = num_days_from_monday_generic::( - dict.downcast_dict::().unwrap(), + let b = num_days_from_monday_dyn(&dict).unwrap(); + + let expected_dict = DictionaryArray::try_new( + &keys, + &Int32Array::from(vec![Some(0), Some(2), Some(2), Some(0), None]), ) .unwrap(); - let expected = Int32Array::from(vec![Some(0), Some(2), Some(2), Some(0), None]); - assert_eq!(expected, b); + let expected = 
Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); + + let b = num_days_from_sunday_dyn(&dict).unwrap(); - let b = num_days_from_sunday_generic::( - dict.downcast_dict::().unwrap(), + let expected_dict = DictionaryArray::try_new( + &keys, + &Int32Array::from(vec![Some(1), Some(3), Some(3), Some(1), None]), ) .unwrap(); - let expected = Int32Array::from(vec![Some(1), Some(3), Some(3), Some(1), None]); - assert_eq!(expected, b); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); - let b = - day_generic::(dict.downcast_dict::().unwrap()) - .unwrap(); - let expected = Int32Array::from(vec![Some(1), Some(20), Some(20), Some(1), None]); - assert_eq!(expected, b); + let b = day_dyn(&dict).unwrap(); - let b = - doy_generic::(dict.downcast_dict::().unwrap()) - .unwrap(); - let expected = Int32Array::from(vec![Some(1), Some(51), Some(51), Some(1), None]); - assert_eq!(expected, b); + let expected_dict = DictionaryArray::try_new( + &keys, + &Int32Array::from(vec![Some(1), Some(20), Some(20), Some(1), None]), + ) + .unwrap(); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); - let b = - week_generic::(dict.downcast_dict::().unwrap()) - .unwrap(); - let expected = Int32Array::from(vec![Some(1), Some(8), Some(8), Some(1), None]); - assert_eq!(expected, b); + let b = doy_dyn(&dict).unwrap(); + + let expected_dict = DictionaryArray::try_new( + &keys, + &Int32Array::from(vec![Some(1), Some(51), Some(51), Some(1), None]), + ) + .unwrap(); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); + + let b = week_dyn(&dict).unwrap(); + + let expected_dict = DictionaryArray::try_new( + &keys, + &Int32Array::from(vec![Some(1), Some(8), Some(8), Some(1), None]), + ) + .unwrap(); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); } #[test] @@ -1380,5 +1448,17 @@ mod tests { let b = nanosecond(&a).unwrap(); assert!(!b.is_valid(0)); assert_eq!(453_000_000, b.value(1)); + + let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(1)]); + let dict = DictionaryArray::try_new(&keys, &a).unwrap(); + let b = nanosecond_dyn(&dict).unwrap(); + + let expected_dict = DictionaryArray::try_new( + &keys, + &Int32Array::from(vec![None, Some(453_000_000)]), + ) + .unwrap(); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); } } From af5f1e4f0a8bd14240153953700fe16b072315be Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 9 Nov 2022 23:53:47 +1300 Subject: [PATCH 0240/1411] Faster f64 inequality (#3065) --- arrow-array/src/arithmetic.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/arithmetic.rs b/arrow-array/src/arithmetic.rs index 2a8db2dd1e38..769c013d9fd2 100644 --- a/arrow-array/src/arithmetic.rs +++ b/arrow-array/src/arithmetic.rs @@ -313,7 +313,7 @@ macro_rules! 
native_type_float_op { } fn is_ne(self, rhs: Self) -> bool { - self.total_cmp(&rhs).is_ne() + !self.is_eq(rhs) } fn is_lt(self, rhs: Self) -> bool { From 5a3ecc2ea270af7f9aba4c1a162072acf9541fb8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 10 Nov 2022 06:38:16 +1300 Subject: [PATCH 0241/1411] Fix row format decode loses timezone (#3063) (#3064) --- arrow/src/row/dictionary.rs | 36 +++++++++++++--------- arrow/src/row/fixed.rs | 17 +++++++++-- arrow/src/row/mod.rs | 61 +++++++++++++++++++++++++++++-------- 3 files changed, 83 insertions(+), 31 deletions(-) diff --git a/arrow/src/row/dictionary.rs b/arrow/src/row/dictionary.rs index 1ec7c2a2145c..950a7d8972f8 100644 --- a/arrow/src/row/dictionary.rs +++ b/arrow/src/row/dictionary.rs @@ -90,8 +90,8 @@ pub fn encode_dictionary( } macro_rules! decode_primitive_helper { - ($t:ty, $values: ident) => { - decode_primitive::<$t>(&$values) + ($t:ty, $values: ident, $data_type:ident) => { + decode_primitive::<$t>(&$values, $data_type.clone()) }; } @@ -170,11 +170,11 @@ pub unsafe fn decode_dictionary( } let child = downcast_primitive! { - &value_type => (decode_primitive_helper, values), + value_type => (decode_primitive_helper, values, value_type), DataType::Null => NullArray::new(values.len()).into_data(), DataType::Boolean => decode_bool(&values), - DataType::Decimal128(p, s) => decode_decimal::(&values, *p, *s), - DataType::Decimal256(p, s) => decode_decimal::(&values, *p, *s), + DataType::Decimal128(_, _) => decode_primitive_helper!(Decimal128Type, values, value_type), + DataType::Decimal256(_, _) => decode_primitive_helper!(Decimal256Type, values, value_type), DataType::Utf8 => decode_string::(&values), DataType::LargeUtf8 => decode_string::(&values), DataType::Binary => decode_binary::(&values), @@ -247,7 +247,11 @@ fn decode_bool(values: &[&[u8]]) -> ArrayData { } /// Decodes a fixed length type array from dictionary values -fn decode_fixed( +/// +/// # Safety +/// +/// `data_type` must be appropriate native type for `T` +unsafe fn decode_fixed( values: &[&[u8]], data_type: DataType, ) -> ArrayData { @@ -267,17 +271,19 @@ fn decode_fixed( } /// Decodes a `PrimitiveArray` from dictionary values -fn decode_primitive(values: &[&[u8]]) -> ArrayData +fn decode_primitive( + values: &[&[u8]], + data_type: DataType, +) -> ArrayData where T::Native: FixedLengthEncoding, { - decode_fixed::(values, T::DATA_TYPE) -} + assert_eq!( + std::mem::discriminant(&T::DATA_TYPE), + std::mem::discriminant(&data_type), + ); -/// Decodes a `DecimalArray` from dictionary values -fn decode_decimal(values: &[&[u8]], precision: u8, scale: u8) -> ArrayData -where - T::Native: FixedLengthEncoding, -{ - decode_fixed::(values, T::TYPE_CONSTRUCTOR(precision, scale)) + // SAFETY: + // Validated data type above + unsafe { decode_fixed::(values, data_type) } } diff --git a/arrow/src/row/fixed.rs b/arrow/src/row/fixed.rs index d5935cfb6472..76bf358e7e03 100644 --- a/arrow/src/row/fixed.rs +++ b/arrow/src/row/fixed.rs @@ -267,7 +267,11 @@ pub fn decode_bool(rows: &mut [&[u8]], options: SortOptions) -> BooleanArray { } /// Decodes a `ArrayData` from rows based on the provided `FixedLengthEncoding` `T` -fn decode_fixed( +/// +/// # Safety +/// +/// `data_type` must be appropriate native type for `T` +unsafe fn decode_fixed( rows: &mut [&[u8]], data_type: DataType, options: SortOptions, @@ -319,16 +323,23 @@ fn decode_fixed( .null_bit_buffer(Some(nulls.into())); // SAFETY: Buffers correct length - unsafe { 
builder.build_unchecked() } + builder.build_unchecked() } /// Decodes a `PrimitiveArray` from rows pub fn decode_primitive( rows: &mut [&[u8]], + data_type: DataType, options: SortOptions, ) -> PrimitiveArray where T::Native: FixedLengthEncoding + ToByteSlice, { - decode_fixed::(rows, T::DATA_TYPE, options).into() + assert_eq!( + std::mem::discriminant(&T::DATA_TYPE), + std::mem::discriminant(&data_type), + ); + // SAFETY: + // Validated data type above + unsafe { decode_fixed::(rows, data_type, options).into() } } diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 8af642240e7e..15fe5dc427b9 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -629,8 +629,8 @@ fn encode_column( } macro_rules! decode_primitive_helper { - ($t:ty, $rows: ident, $options:ident) => { - Arc::new(decode_primitive::<$t>($rows, $options)) + ($t:ty, $rows: ident, $data_type:ident, $options:ident) => { + Arc::new(decode_primitive::<$t>($rows, $data_type, $options)) }; } @@ -645,24 +645,17 @@ unsafe fn decode_column( interner: Option<&OrderPreservingInterner>, ) -> Result { let options = field.options; + let data_type = field.data_type.clone(); let array: ArrayRef = downcast_primitive! { - &field.data_type => (decode_primitive_helper, rows, options), + data_type => (decode_primitive_helper, rows, data_type, options), DataType::Null => Arc::new(NullArray::new(rows.len())), DataType::Boolean => Arc::new(decode_bool(rows, options)), DataType::Binary => Arc::new(decode_binary::(rows, options)), DataType::LargeBinary => Arc::new(decode_binary::(rows, options)), DataType::Utf8 => Arc::new(decode_string::(rows, options)), DataType::LargeUtf8 => Arc::new(decode_string::(rows, options)), - DataType::Decimal128(p, s) => Arc::new( - decode_primitive::(rows, options) - .with_precision_and_scale(*p, *s) - .unwrap(), - ), - DataType::Decimal256(p, s) => Arc::new( - decode_primitive::(rows, options) - .with_precision_and_scale(*p, *s) - .unwrap(), - ), + DataType::Decimal128(_, _) => decode_primitive_helper!(Decimal128Type, rows, data_type, options), + DataType::Decimal256(_, _) => decode_primitive_helper!(Decimal256Type, rows, data_type, options), DataType::Dictionary(k, v) => match k.as_ref() { DataType::Int8 => Arc::new(decode_dictionary::( interner.unwrap(), @@ -900,6 +893,48 @@ mod tests { assert_eq!(&cols[0], &col); } + #[test] + fn test_timezone() { + let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]) + .with_timezone("+01:00".to_string()); + let d = a.data_type().clone(); + + let mut converter = + RowConverter::new(vec![SortField::new(a.data_type().clone())]); + let rows = converter.convert_columns(&[Arc::new(a) as _]).unwrap(); + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + assert_eq!(back[0].data_type(), &d); + + // Test dictionary + let mut a = + PrimitiveDictionaryBuilder::::new(); + a.append(34).unwrap(); + a.append_null(); + a.append(345).unwrap(); + + // Construct dictionary with a timezone + let dict = a.finish(); + let values = TimestampNanosecondArray::from(dict.values().data().clone()); + let dict_with_tz = dict.with_values(&values.with_timezone("+02:00".to_string())); + let d = DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Timestamp( + TimeUnit::Nanosecond, + Some("+02:00".to_string()), + )), + ); + + assert_eq!(dict_with_tz.data_type(), &d); + let mut converter = RowConverter::new(vec![SortField::new(d.clone())]); + let rows = converter + .convert_columns(&[Arc::new(dict_with_tz) as _]) + .unwrap(); + let back = 
converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + assert_eq!(back[0].data_type(), &d); + } + #[test] fn test_null_encoding() { let col = Arc::new(NullArray::new(10)); From e4e15f8e7efc31db6469198852c3e7719577411d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 9 Nov 2022 12:54:17 -0500 Subject: [PATCH 0242/1411] Minor: Improve docstrings on WriterPropertiesBuilder (#3068) --- parquet/src/file/properties.rs | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index dc9feb4ce185..cf821df2110f 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -306,38 +306,46 @@ impl WriterPropertiesBuilder { self } - /// Sets best effort maximum size of a data page in bytes + /// Sets best effort maximum size of a data page in bytes. /// - /// Note: this is a best effort limit based on the write batch size + /// Note: this is a best effort limit based on value of + /// [`set_write_batch_size`](Self::set_write_batch_size). pub fn set_data_pagesize_limit(mut self, value: usize) -> Self { self.data_pagesize_limit = value; self } - /// Sets best effort maximum number of rows in a data page + /// Sets best effort maximum number of rows in a data page. /// /// /// This can be used to limit the number of rows within a page to - /// yield better page pruning + /// yield better page pruning. /// - /// Note: this is a best effort limit based on the write batch size + /// Note: this is a best effort limit based on value of + /// [`set_write_batch_size`](Self::set_write_batch_size). pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self { self.data_page_row_count_limit = value; self } - /// Sets best effort maximum dictionary page size, in bytes + /// Sets best effort maximum dictionary page size, in bytes. /// - /// Note: this is a best effort limit based on the write batch size + /// Note: this is a best effort limit based on value of + /// [`set_write_batch_size`](Self::set_write_batch_size). pub fn set_dictionary_pagesize_limit(mut self, value: usize) -> Self { self.dictionary_pagesize_limit = value; self } - /// Sets write batch size + /// Sets write batch size. + /// + /// For performance reasons, data for each column is written in + /// batches of this size. /// - /// Data is written in batches of this size, acting as an upper-bound on - /// the enforcement granularity of page limits + /// Additional limits such as such as + /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit) + /// are checked between batches, and thus the write batch size value acts as an + /// upper-bound on the enforcement granularity of other limits. pub fn set_write_batch_size(mut self, value: usize) -> Self { self.write_batch_size = value; self From f5962092c005c2ae3bfa10c050d75f36fab75d46 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 9 Nov 2022 14:54:10 -0800 Subject: [PATCH 0243/1411] Add Decimal128 and Decimal256 to downcast_primitive (#3056) * Add Decimal128 and Decimal256 to downcast_primitive * Add doc test * Fix test * For review --- arrow-array/src/cast.rs | 8 ++++++++ arrow/src/row/mod.rs | 21 --------------------- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 7bc62713733e..4569c36812bf 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -225,6 +225,8 @@ macro_rules! 
downcast_temporal_array { /// assert_eq!(primitive_size(&DataType::Int32), 4); /// assert_eq!(primitive_size(&DataType::Int64), 8); /// assert_eq!(primitive_size(&DataType::Float16), 2); +/// assert_eq!(primitive_size(&DataType::Decimal128(38, 10)), 16); +/// assert_eq!(primitive_size(&DataType::Decimal256(76, 20)), 32); /// ``` /// /// [`DataType`]: arrow_schema::DataType @@ -242,6 +244,12 @@ macro_rules! downcast_primitive { $crate::repeat_pat!(arrow_schema::DataType::Float64, $($data_type),+) => { $m!($crate::types::Float64Type $(, $args)*) } + $crate::repeat_pat!(arrow_schema::DataType::Decimal128(_, _), $($data_type),+) => { + $m!($crate::types::Decimal128Type $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Decimal256(_, _), $($data_type),+) => { + $m!($crate::types::Decimal256Type $(, $args)*) + } $crate::repeat_pat!(arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth), $($data_type),+) => { $m!($crate::types::IntervalYearMonthType $(, $args)*) } diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 15fe5dc427b9..e0312be1f8da 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -102,7 +102,6 @@ use std::sync::Arc; use arrow_array::cast::*; use arrow_array::*; -use arrow_buffer::i256; use crate::compute::SortOptions; use crate::datatypes::*; @@ -504,8 +503,6 @@ fn new_empty_rows( array => lengths.iter_mut().for_each(|x| *x += fixed::encoded_len(array)), DataType::Null => {}, DataType::Boolean => lengths.iter_mut().for_each(|x| *x += bool::ENCODED_LEN), - DataType::Decimal128(_, _) => lengths.iter_mut().for_each(|x| *x += i128::ENCODED_LEN), - DataType::Decimal256(_, _) => lengths.iter_mut().for_each(|x| *x += i256::ENCODED_LEN), DataType::Binary => as_generic_binary_array::(array) .iter() .zip(lengths.iter_mut()) @@ -586,22 +583,6 @@ fn encode_column( column => fixed::encode(out, column, opts), DataType::Null => {} DataType::Boolean => fixed::encode(out, as_boolean_array(column), opts), - DataType::Decimal128(_, _) => { - let column = column - .as_any() - .downcast_ref::() - .unwrap(); - - fixed::encode(out, column, opts) - }, - DataType::Decimal256(_, _) => { - let column = column - .as_any() - .downcast_ref::() - .unwrap(); - - fixed::encode(out, column, opts) - }, DataType::Binary => { variable::encode(out, as_generic_binary_array::(column).iter(), opts) } @@ -654,8 +635,6 @@ unsafe fn decode_column( DataType::LargeBinary => Arc::new(decode_binary::(rows, options)), DataType::Utf8 => Arc::new(decode_string::(rows, options)), DataType::LargeUtf8 => Arc::new(decode_string::(rows, options)), - DataType::Decimal128(_, _) => decode_primitive_helper!(Decimal128Type, rows, data_type, options), - DataType::Decimal256(_, _) => decode_primitive_helper!(Decimal256Type, rows, data_type, options), DataType::Dictionary(k, v) => match k.as_ref() { DataType::Int8 => Arc::new(decode_dictionary::( interner.unwrap(), From d76a0d634521bf051d3bb2774af2006006bab999 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=93=87=E5=91=9C=E5=93=87=E5=91=9C=E5=91=80=E5=92=A6?= =?UTF-8?q?=E8=80=B6?= Date: Thu, 10 Nov 2022 08:15:34 +0800 Subject: [PATCH 0244/1411] Support cast timestamp to time (#3016) * Support cast timestamp to time * support timestamp to time and add more test cases * organize imports * format code * comment test * support timezone * code format * support both no timezone and a timezone --- arrow-array/src/temporal_conversions.rs | 34 ++- arrow-cast/src/cast.rs | 359 +++++++++++++++++++++++- 2 files changed, 390 insertions(+), 3 deletions(-) diff 
--git a/arrow-array/src/temporal_conversions.rs b/arrow-array/src/temporal_conversions.rs index 8b1064115dbb..a4d910cc8bc1 100644 --- a/arrow-array/src/temporal_conversions.rs +++ b/arrow-array/src/temporal_conversions.rs @@ -20,7 +20,9 @@ use crate::timezone::Tz; use crate::ArrowPrimitiveType; use arrow_schema::{DataType, TimeUnit}; -use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc}; +use chrono::{ + DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Timelike, Utc, +}; /// Number of seconds in a day pub const SECONDS_IN_DAY: i64 = 86_400; @@ -33,6 +35,10 @@ pub const NANOSECONDS: i64 = 1_000_000_000; /// Number of milliseconds in a day pub const MILLISECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MILLISECONDS; +/// Number of microseconds in a day +pub const MICROSECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MICROSECONDS; +/// Number of nanoseconds in a day +pub const NANOSECONDS_IN_DAY: i64 = SECONDS_IN_DAY * NANOSECONDS; /// Number of days between 0001-01-01 and 1970-01-01 pub const EPOCH_DAYS_FROM_CE: i32 = 719_163; @@ -97,6 +103,32 @@ pub fn time64ns_to_time(v: i64) -> Option { ) } +/// converts [`NaiveTime`] to a `i32` representing a `time32(s)` +#[inline] +pub fn time_to_time32s(v: NaiveTime) -> i32 { + v.num_seconds_from_midnight() as i32 +} + +/// converts [`NaiveTime`] to a `i32` representing a `time32(ms)` +#[inline] +pub fn time_to_time32ms(v: NaiveTime) -> i32 { + (v.num_seconds_from_midnight() as i64 * MILLISECONDS + + v.nanosecond() as i64 * MILLISECONDS / NANOSECONDS) as i32 +} + +/// converts [`NaiveTime`] to a `i64` representing a `time64(us)` +#[inline] +pub fn time_to_time64us(v: NaiveTime) -> i64 { + v.num_seconds_from_midnight() as i64 * MICROSECONDS + + v.nanosecond() as i64 * MICROSECONDS / NANOSECONDS +} + +/// converts [`NaiveTime`] to a `i64` representing a `time64(ns)` +#[inline] +pub fn time_to_time64ns(v: NaiveTime) -> i64 { + v.num_seconds_from_midnight() as i64 * NANOSECONDS + v.nanosecond() as i64 +} + /// converts a `i64` representing a `timestamp(s)` to [`NaiveDateTime`] #[inline] pub fn timestamp_s_to_datetime(v: i64) -> Option { diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 1cc814730850..bbd38fbc0267 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -35,7 +35,7 @@ //! assert_eq!(7.0, c.value(2)); //! 
``` -use chrono::{DateTime, NaiveDateTime, Timelike}; +use chrono::{DateTime, NaiveDateTime, NaiveTime, Timelike}; use std::sync::Arc; use crate::display::{array_value_to_string, lexical_to_string}; @@ -244,8 +244,15 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } (Timestamp(_, _), Int64) => true, (Int64, Timestamp(_, _)) => true, - (Timestamp(_, _), Timestamp(_, _) | Date32 | Date64) => true, (Date64, Timestamp(_, None)) => true, + (Timestamp(_, _), + Timestamp(_, _) + | Date32 + | Date64 + | Time32(TimeUnit::Second) + | Time32(TimeUnit::Millisecond) + | Time64(TimeUnit::Microsecond) + | Time64(TimeUnit::Nanosecond)) => true, (Int64, Duration(_)) => true, (Duration(_), Int64) => true, (Interval(from_type), Int64) => { @@ -559,6 +566,24 @@ fn make_timestamp_array( } } +fn as_time_res_with_timezone( + v: i64, + tz: Option, +) -> Result { + let time = match tz { + Some(tz) => as_datetime_with_timezone::(v, tz).map(|d| d.time()), + None => as_datetime::(v).map(|d| d.time()), + }; + + time.ok_or_else(|| { + ArrowError::CastError(format!( + "Failed to create naive time with {} {}", + std::any::type_name::(), + v + )) + }) +} + /// Cast `array` to the provided data type and return a new Array with /// type `to_type`, if possible. It accepts `CastOptions` to allow consumers /// to configure cast behavior. @@ -1561,6 +1586,182 @@ pub fn cast_with_options( as_primitive_array::(array) .unary::<_, Date64Type>(|x| x / (NANOSECONDS / MILLISECONDS)), )), + (Timestamp(TimeUnit::Second, tz), Time64(TimeUnit::Microsecond)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { + Ok(time_to_time64us(as_time_res_with_timezone::< + TimestampSecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Second, tz), Time64(TimeUnit::Nanosecond)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { + Ok(time_to_time64ns(as_time_res_with_timezone::< + TimestampSecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Millisecond, tz), Time64(TimeUnit::Microsecond)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { + Ok(time_to_time64us(as_time_res_with_timezone::< + TimestampMillisecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Millisecond, tz), Time64(TimeUnit::Nanosecond)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { + Ok(time_to_time64ns(as_time_res_with_timezone::< + TimestampMillisecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Microsecond, tz), Time64(TimeUnit::Microsecond)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { + Ok(time_to_time64us(as_time_res_with_timezone::< + TimestampMicrosecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Microsecond, tz), Time64(TimeUnit::Nanosecond)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { + Ok(time_to_time64ns(as_time_res_with_timezone::< + TimestampMicrosecondType, + >(x, tz)?)) + })?, + )) 
+ } + (Timestamp(TimeUnit::Nanosecond, tz), Time64(TimeUnit::Microsecond)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { + Ok(time_to_time64us(as_time_res_with_timezone::< + TimestampNanosecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Nanosecond, tz), Time64(TimeUnit::Nanosecond)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { + Ok(time_to_time64ns(as_time_res_with_timezone::< + TimestampNanosecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Second, tz), Time32(TimeUnit::Second)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time32SecondType, ArrowError>(|x| { + Ok(time_to_time32s(as_time_res_with_timezone::< + TimestampSecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Second, tz), Time32(TimeUnit::Millisecond)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { + Ok(time_to_time32ms(as_time_res_with_timezone::< + TimestampSecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Millisecond, tz), Time32(TimeUnit::Second)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time32SecondType, ArrowError>(|x| { + Ok(time_to_time32s(as_time_res_with_timezone::< + TimestampMillisecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Millisecond, tz), Time32(TimeUnit::Millisecond)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { + Ok(time_to_time32ms(as_time_res_with_timezone::< + TimestampMillisecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Microsecond, tz), Time32(TimeUnit::Second)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time32SecondType, ArrowError>(|x| { + Ok(time_to_time32s(as_time_res_with_timezone::< + TimestampMicrosecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Microsecond, tz), Time32(TimeUnit::Millisecond)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { + Ok(time_to_time32ms(as_time_res_with_timezone::< + TimestampMicrosecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Nanosecond, tz), Time32(TimeUnit::Second)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time32SecondType, ArrowError>(|x| { + Ok(time_to_time32s(as_time_res_with_timezone::< + TimestampNanosecondType, + >(x, tz)?)) + })?, + )) + } + (Timestamp(TimeUnit::Nanosecond, tz), Time32(TimeUnit::Millisecond)) => { + let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; + Ok(Arc::new( + as_primitive_array::(array) + .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { + Ok(time_to_time32ms(as_time_res_with_timezone::< + TimestampNanosecondType, + >(x, tz)?)) + })?, + )) + } (Date64, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new( as_primitive_array::(array) @@ -4234,6 +4435,160 @@ mod tests { 
assert!(c.is_null(2)); } + #[test] + fn test_cast_timestamp_to_time64() { + // test timestamp secs + let a = TimestampSecondArray::from(vec![Some(86405), Some(1), None]) + .with_timezone("+01:00".to_string()); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605000000, c.value(0)); + assert_eq!(3601000000, c.value(1)); + assert!(c.is_null(2)); + let b = cast(&array, &DataType::Time64(TimeUnit::Nanosecond)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605000000000, c.value(0)); + assert_eq!(3601000000000, c.value(1)); + assert!(c.is_null(2)); + + // test timestamp milliseconds + let a = TimestampMillisecondArray::from(vec![Some(86405000), Some(1000), None]) + .with_timezone("+01:00".to_string()); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605000000, c.value(0)); + assert_eq!(3601000000, c.value(1)); + assert!(c.is_null(2)); + let b = cast(&array, &DataType::Time64(TimeUnit::Nanosecond)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605000000000, c.value(0)); + assert_eq!(3601000000000, c.value(1)); + assert!(c.is_null(2)); + + // test timestamp microseconds + let a = + TimestampMicrosecondArray::from(vec![Some(86405000000), Some(1000000), None]) + .with_timezone("+01:00".to_string()); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605000000, c.value(0)); + assert_eq!(3601000000, c.value(1)); + assert!(c.is_null(2)); + let b = cast(&array, &DataType::Time64(TimeUnit::Nanosecond)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605000000000, c.value(0)); + assert_eq!(3601000000000, c.value(1)); + assert!(c.is_null(2)); + + // test timestamp nanoseconds + let a = TimestampNanosecondArray::from(vec![ + Some(86405000000000), + Some(1000000000), + None, + ]) + .with_timezone("+01:00".to_string()); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605000000, c.value(0)); + assert_eq!(3601000000, c.value(1)); + assert!(c.is_null(2)); + let b = cast(&array, &DataType::Time64(TimeUnit::Nanosecond)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605000000000, c.value(0)); + assert_eq!(3601000000000, c.value(1)); + assert!(c.is_null(2)); + + // test overflow + let a = TimestampSecondArray::from(vec![Some(i64::MAX)]) + .with_timezone("+01:00".to_string()); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)); + assert!(b.is_err()); + let b = cast(&array, &DataType::Time64(TimeUnit::Nanosecond)); + assert!(b.is_err()); + } + + #[test] + fn test_cast_timestamp_to_time32() { + // test timestamp secs + let a = TimestampSecondArray::from(vec![Some(86405), Some(1), None]) + .with_timezone("+01:00".to_string()); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Time32(TimeUnit::Second)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605, c.value(0)); + assert_eq!(3601, c.value(1)); + assert!(c.is_null(2)); + let b = cast(&array, &DataType::Time32(TimeUnit::Millisecond)).unwrap(); + let c = 
b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605000, c.value(0)); + assert_eq!(3601000, c.value(1)); + assert!(c.is_null(2)); + + // test timestamp milliseconds + let a = TimestampMillisecondArray::from(vec![Some(86405000), Some(1000), None]) + .with_timezone("+01:00".to_string()); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Time32(TimeUnit::Second)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605, c.value(0)); + assert_eq!(3601, c.value(1)); + assert!(c.is_null(2)); + let b = cast(&array, &DataType::Time32(TimeUnit::Millisecond)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605000, c.value(0)); + assert_eq!(3601000, c.value(1)); + assert!(c.is_null(2)); + + // test timestamp microseconds + let a = + TimestampMicrosecondArray::from(vec![Some(86405000000), Some(1000000), None]) + .with_timezone("+01:00".to_string()); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Time32(TimeUnit::Second)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605, c.value(0)); + assert_eq!(3601, c.value(1)); + assert!(c.is_null(2)); + let b = cast(&array, &DataType::Time32(TimeUnit::Millisecond)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605000, c.value(0)); + assert_eq!(3601000, c.value(1)); + assert!(c.is_null(2)); + + // test timestamp nanoseconds + let a = TimestampNanosecondArray::from(vec![ + Some(86405000000000), + Some(1000000000), + None, + ]) + .with_timezone("+01:00".to_string()); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Time32(TimeUnit::Second)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605, c.value(0)); + assert_eq!(3601, c.value(1)); + assert!(c.is_null(2)); + let b = cast(&array, &DataType::Time32(TimeUnit::Millisecond)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(3605000, c.value(0)); + assert_eq!(3601000, c.value(1)); + assert!(c.is_null(2)); + + // test overflow + let a = TimestampSecondArray::from(vec![Some(i64::MAX)]) + .with_timezone("+01:00".to_string()); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Time32(TimeUnit::Second)); + assert!(b.is_err()); + let b = cast(&array, &DataType::Time32(TimeUnit::Millisecond)); + assert!(b.is_err()); + } + #[test] fn test_cast_date64_to_timestamp() { let a = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); From a0fb44a3ce6e766bc5721542ccea8e62577d565a Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Thu, 10 Nov 2022 10:35:47 +0800 Subject: [PATCH 0245/1411] add tz in debug information (#3072) --- arrow-array/src/array/primitive_array.rs | 88 ++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index eb3618f7c307..b13ea5681e87 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -551,11 +551,33 @@ impl std::fmt::Debug for PrimitiveArray { None => write!(f, "null"), } } - DataType::Timestamp(_, _) => { + DataType::Timestamp(_, tz_string_opt) => { let v = self.value(index).to_isize().unwrap() as i64; - match as_datetime::(v) { - Some(datetime) => write!(f, "{:?}", datetime), - None => write!(f, "null"), + match tz_string_opt { + // for Timestamp with TimeZone + Some(tz_string) => { + match tz_string.parse::() { + // if the time zone is valid, construct a DateTime and format it as rfc3339 
+ Ok(tz) => match as_datetime_with_timezone::(v, tz) { + Some(datetime) => write!(f, "{}", datetime.to_rfc3339()), + None => write!(f, "null"), + }, + // if the time zone is invalid, shows NaiveDateTime with an error message + Err(_) => match as_datetime::(v) { + Some(datetime) => write!( + f, + "{:?} (Unknown Time Zone '{}')", + datetime, tz_string + ), + None => write!(f, "null"), + }, + } + } + // for Timestamp without TimeZone + None => match as_datetime::(v) { + Some(datetime) => write!(f, "{:?}", datetime), + None => write!(f, "null"), + }, } } _ => std::fmt::Debug::fmt(&array.value(index), f), @@ -1323,7 +1345,8 @@ mod tests { } #[test] - fn test_timestamp_with_tz_fmt_debug() { + #[cfg(feature = "chrono-tz")] + fn test_timestamp_with_named_tz_fmt_debug() { let arr: PrimitiveArray = TimestampMillisecondArray::from(vec![ 1546214400000, @@ -1332,7 +1355,26 @@ mod tests { ]) .with_timezone("Asia/Taipei".to_string()); assert_eq!( - "PrimitiveArray\n[\n 2018-12-31T00:00:00,\n 2018-12-31T00:00:00,\n 1921-01-02T00:00:00,\n]", + "PrimitiveArray\n[\n 2018-12-31T08:00:00+08:00,\n 2018-12-31T08:00:00+08:00,\n 1921-01-02T08:00:00+08:00,\n]", + format!("{:?}", arr) + ); + } + + #[test] + #[cfg(not(feature = "chrono-tz"))] + fn test_timestamp_with_named_tz_fmt_debug() { + let arr: PrimitiveArray = + TimestampMillisecondArray::from(vec![ + 1546214400000, + 1546214400000, + -1546214400000, + ]) + .with_timezone("Asia/Taipei".to_string()); + + println!("{:?}", arr); + + assert_eq!( + "PrimitiveArray\n[\n 2018-12-31T00:00:00 (Unknown Time Zone 'Asia/Taipei'),\n 2018-12-31T00:00:00 (Unknown Time Zone 'Asia/Taipei'),\n 1921-01-02T00:00:00 (Unknown Time Zone 'Asia/Taipei'),\n]", format!("{:?}", arr) ); } @@ -1347,7 +1389,39 @@ mod tests { ]) .with_timezone("+08:00".to_string()); assert_eq!( - "PrimitiveArray\n[\n 2018-12-31T00:00:00,\n 2018-12-31T00:00:00,\n 1921-01-02T00:00:00,\n]", + "PrimitiveArray\n[\n 2018-12-31T08:00:00+08:00,\n 2018-12-31T08:00:00+08:00,\n 1921-01-02T08:00:00+08:00,\n]", + format!("{:?}", arr) + ); + } + + #[test] + fn test_timestamp_with_incorrect_tz_fmt_debug() { + let arr: PrimitiveArray = + TimestampMillisecondArray::from(vec![ + 1546214400000, + 1546214400000, + -1546214400000, + ]) + .with_timezone("xxx".to_string()); + assert_eq!( + "PrimitiveArray\n[\n 2018-12-31T00:00:00 (Unknown Time Zone 'xxx'),\n 2018-12-31T00:00:00 (Unknown Time Zone 'xxx'),\n 1921-01-02T00:00:00 (Unknown Time Zone 'xxx'),\n]", + format!("{:?}", arr) + ); + } + + #[test] + #[cfg(feature = "chrono-tz")] + fn test_timestamp_with_tz_with_daylight_saving_fmt_debug() { + let arr: PrimitiveArray = + TimestampMillisecondArray::from(vec![ + 1647161999000, + 1647162000000, + 1667717999000, + 1667718000000, + ]) + .with_timezone("America/Denver".to_string()); + assert_eq!( + "PrimitiveArray\n[\n 2022-03-13T01:59:59-07:00,\n 2022-03-13T03:00:00-06:00,\n 2022-11-06T00:59:59-06:00,\n 2022-11-06T01:00:00-06:00,\n]", format!("{:?}", arr) ); } From 0e97338ba4d1e58ed88927dc230d5eb5fead38c5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 10 Nov 2022 15:40:13 +1300 Subject: [PATCH 0246/1411] Add missing inline to ArrowNativeTypeOp (#3073) --- arrow-array/src/arithmetic.rs | 42 +++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/arrow-array/src/arithmetic.rs b/arrow-array/src/arithmetic.rs index 769c013d9fd2..f21532364f65 100644 --- a/arrow-array/src/arithmetic.rs +++ b/arrow-array/src/arithmetic.rs @@ -96,6 +96,7 @@ macro_rules! 
native_type_op { const ZERO: Self = $zero; const ONE: Self = $one; + #[inline] fn add_checked(self, rhs: Self) -> Result { self.checked_add(rhs).ok_or_else(|| { ArrowError::ComputeError(format!( @@ -105,10 +106,12 @@ macro_rules! native_type_op { }) } + #[inline] fn add_wrapping(self, rhs: Self) -> Self { self.wrapping_add(rhs) } + #[inline] fn sub_checked(self, rhs: Self) -> Result { self.checked_sub(rhs).ok_or_else(|| { ArrowError::ComputeError(format!( @@ -118,10 +121,12 @@ macro_rules! native_type_op { }) } + #[inline] fn sub_wrapping(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } + #[inline] fn mul_checked(self, rhs: Self) -> Result { self.checked_mul(rhs).ok_or_else(|| { ArrowError::ComputeError(format!( @@ -131,10 +136,12 @@ macro_rules! native_type_op { }) } + #[inline] fn mul_wrapping(self, rhs: Self) -> Self { self.wrapping_mul(rhs) } + #[inline] fn div_checked(self, rhs: Self) -> Result { if rhs.is_zero() { Err(ArrowError::DivideByZero) @@ -148,10 +155,12 @@ macro_rules! native_type_op { } } + #[inline] fn div_wrapping(self, rhs: Self) -> Self { self.wrapping_div(rhs) } + #[inline] fn mod_checked(self, rhs: Self) -> Result { if rhs.is_zero() { Err(ArrowError::DivideByZero) @@ -165,54 +174,66 @@ macro_rules! native_type_op { } } + #[inline] fn mod_wrapping(self, rhs: Self) -> Self { self.wrapping_rem(rhs) } + #[inline] fn neg_checked(self) -> Result { self.checked_neg().ok_or_else(|| { ArrowError::ComputeError(format!("Overflow happened on: {:?}", self)) }) } + #[inline] fn pow_checked(self, exp: u32) -> Result { self.checked_pow(exp).ok_or_else(|| { ArrowError::ComputeError(format!("Overflow happened on: {:?}", self)) }) } + #[inline] fn pow_wrapping(self, exp: u32) -> Self { self.wrapping_pow(exp) } + #[inline] fn neg_wrapping(self) -> Self { self.wrapping_neg() } + #[inline] fn is_zero(self) -> bool { self == Self::ZERO } + #[inline] fn is_eq(self, rhs: Self) -> bool { self == rhs } + #[inline] fn is_ne(self, rhs: Self) -> bool { self != rhs } + #[inline] fn is_lt(self, rhs: Self) -> bool { self < rhs } + #[inline] fn is_le(self, rhs: Self) -> bool { self <= rhs } + #[inline] fn is_gt(self, rhs: Self) -> bool { self > rhs } + #[inline] fn is_ge(self, rhs: Self) -> bool { self >= rhs } @@ -237,30 +258,37 @@ macro_rules! native_type_float_op { const ZERO: Self = $zero; const ONE: Self = $one; + #[inline] fn add_checked(self, rhs: Self) -> Result { Ok(self + rhs) } + #[inline] fn add_wrapping(self, rhs: Self) -> Self { self + rhs } + #[inline] fn sub_checked(self, rhs: Self) -> Result { Ok(self - rhs) } + #[inline] fn sub_wrapping(self, rhs: Self) -> Self { self - rhs } + #[inline] fn mul_checked(self, rhs: Self) -> Result { Ok(self * rhs) } + #[inline] fn mul_wrapping(self, rhs: Self) -> Self { self * rhs } + #[inline] fn div_checked(self, rhs: Self) -> Result { if rhs.is_zero() { Err(ArrowError::DivideByZero) @@ -269,10 +297,12 @@ macro_rules! native_type_float_op { } } + #[inline] fn div_wrapping(self, rhs: Self) -> Self { self / rhs } + #[inline] fn mod_checked(self, rhs: Self) -> Result { if rhs.is_zero() { Err(ArrowError::DivideByZero) @@ -281,30 +311,37 @@ macro_rules! 
native_type_float_op { } } + #[inline] fn mod_wrapping(self, rhs: Self) -> Self { self % rhs } + #[inline] fn neg_checked(self) -> Result { Ok(-self) } + #[inline] fn neg_wrapping(self) -> Self { -self } + #[inline] fn pow_checked(self, exp: u32) -> Result { Ok(self.powi(exp as i32)) } + #[inline] fn pow_wrapping(self, exp: u32) -> Self { self.powi(exp as i32) } + #[inline] fn is_zero(self) -> bool { self == $zero } + #[inline] fn is_eq(self, rhs: Self) -> bool { // Equivalent to `self.total_cmp(&rhs).is_eq()` // but LLVM isn't able to realise this is bitwise equality @@ -312,22 +349,27 @@ macro_rules! native_type_float_op { self.to_bits() == rhs.to_bits() } + #[inline] fn is_ne(self, rhs: Self) -> bool { !self.is_eq(rhs) } + #[inline] fn is_lt(self, rhs: Self) -> bool { self.total_cmp(&rhs).is_lt() } + #[inline] fn is_le(self, rhs: Self) -> bool { self.total_cmp(&rhs).is_le() } + #[inline] fn is_gt(self, rhs: Self) -> bool { self.total_cmp(&rhs).is_gt() } + #[inline] fn is_ge(self, rhs: Self) -> bool { self.total_cmp(&rhs).is_ge() } From c027c70b7646f5442f4ed8812919421a0ebfed2e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 10 Nov 2022 15:47:50 +1300 Subject: [PATCH 0247/1411] Deprecate Buffer::count_set_bits (#3067) (#3071) * Deprecate Buffer::count_set_bits (#3067) * Format --- arrow-buffer/src/buffer/immutable.rs | 27 ++++++++++++++++--------- arrow-select/src/filter.rs | 2 +- arrow/src/compute/kernels/arithmetic.rs | 3 --- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index c60d28afc782..94bc98678a61 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -215,6 +215,7 @@ impl Buffer { } /// Returns the number of 1-bits in this buffer. 
+ #[deprecated(note = "use count_set_bits_offset instead")] pub fn count_set_bits(&self) -> usize { let len_in_bits = self.len() * 8; // self.offset is already taken into consideration by the bit_chunks implementation @@ -466,11 +467,17 @@ mod tests { #[test] fn test_count_bits() { - assert_eq!(0, Buffer::from(&[0b00000000]).count_set_bits()); - assert_eq!(8, Buffer::from(&[0b11111111]).count_set_bits()); - assert_eq!(3, Buffer::from(&[0b00001101]).count_set_bits()); - assert_eq!(6, Buffer::from(&[0b01001001, 0b01010010]).count_set_bits()); - assert_eq!(16, Buffer::from(&[0b11111111, 0b11111111]).count_set_bits()); + assert_eq!(0, Buffer::from(&[0b00000000]).count_set_bits_offset(0, 8)); + assert_eq!(8, Buffer::from(&[0b11111111]).count_set_bits_offset(0, 8)); + assert_eq!(3, Buffer::from(&[0b00001101]).count_set_bits_offset(0, 8)); + assert_eq!( + 6, + Buffer::from(&[0b01001001, 0b01010010]).count_set_bits_offset(0, 16) + ); + assert_eq!( + 16, + Buffer::from(&[0b11111111, 0b11111111]).count_set_bits_offset(0, 16) + ); } #[test] @@ -479,31 +486,31 @@ mod tests { 0, Buffer::from(&[0b11111111, 0b00000000]) .slice(1) - .count_set_bits() + .count_set_bits_offset(0, 8) ); assert_eq!( 8, Buffer::from(&[0b11111111, 0b11111111]) .slice_with_length(1, 1) - .count_set_bits() + .count_set_bits_offset(0, 8) ); assert_eq!( 3, Buffer::from(&[0b11111111, 0b11111111, 0b00001101]) .slice(2) - .count_set_bits() + .count_set_bits_offset(0, 8) ); assert_eq!( 6, Buffer::from(&[0b11111111, 0b01001001, 0b01010010]) .slice_with_length(1, 2) - .count_set_bits() + .count_set_bits_offset(0, 16) ); assert_eq!( 16, Buffer::from(&[0b11111111, 0b11111111, 0b11111111, 0b11111111]) .slice(2) - .count_set_bits() + .count_set_bits_offset(0, 16) ); } diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 4596afc8791f..f454397647c3 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -413,7 +413,7 @@ fn filter_null_mask( let nulls = filter_bits(data.null_buffer()?, data.offset(), predicate); // The filtered `nulls` has a length of `predicate.count` bits and // therefore the null count is this minus the number of valid bits - let null_count = predicate.count - nulls.count_set_bits(); + let null_count = predicate.count - nulls.count_set_bits_offset(0, predicate.count); if null_count == 0 { return None; diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index b310d4fbf8cd..328ce02e4f5d 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -3059,9 +3059,6 @@ mod tests { let null_buffer = null_buffer_builder.finish(); - // `count_set_bits` counts 1-bits in entire buffer. Because above `resize` doesn't - // actually truncate the buffer, `count_set_bits` still return 3. - assert_eq!(null_buffer.count_set_bits(), 3); // `count_set_bits_offset` takes len in bits as parameter. 
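        // For illustration, a minimal sketch of the replacement API (the buffer
        // contents here are arbitrary): `count_set_bits_offset(offset, len)` counts
        // 1-bits in an explicit bit range rather than in the whole buffer.
        use arrow_buffer::Buffer;
        let buf = Buffer::from(&[0b01001001u8, 0b01010010]);
        assert_eq!(buf.count_set_bits_offset(0, 8), 3); // first byte only
        assert_eq!(buf.count_set_bits_offset(0, 16), 6); // both bytes together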
assert_eq!(null_buffer.count_set_bits_offset(0, 13), 0); From 5fb3033eb882784499667d0ba20792f71aebd980 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 9 Nov 2022 19:05:55 -0800 Subject: [PATCH 0248/1411] Minor: Remove cloning ArrayData in with_precision_and_scale (#3050) * Use reference in with_precision_and_scale * Remove clone --- arrow-array/src/array/primitive_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index b13ea5681e87..195e0009c0cc 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -866,7 +866,7 @@ impl PrimitiveArray { // safety: self.data is valid DataType::Decimal as checked above let new_data_type = T::TYPE_CONSTRUCTOR(precision, scale); - let data = self.data().clone().into_builder().data_type(new_data_type); + let data = self.data.into_builder().data_type(new_data_type); // SAFETY // Validated data above From e44cb5b478257751916d1674292123eaf5d80a7c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 10 Nov 2022 16:56:20 +1300 Subject: [PATCH 0249/1411] Add compare to ArrowNativeTypeOp (#3070) * Add total_cmp to ArrowNativeTypeOp * Format --- arrow-array/src/arithmetic.rs | 94 ++++++++++++++--------------------- 1 file changed, 36 insertions(+), 58 deletions(-) diff --git a/arrow-array/src/arithmetic.rs b/arrow-array/src/arithmetic.rs index f21532364f65..566f3742e93d 100644 --- a/arrow-array/src/arithmetic.rs +++ b/arrow-array/src/arithmetic.rs @@ -19,6 +19,7 @@ use arrow_buffer::{i256, ArrowNativeType}; use arrow_schema::ArrowError; use half::f16; use num::complex::ComplexFloat; +use std::cmp::Ordering; /// Trait for [`ArrowNativeType`] that adds checked and unchecked arithmetic operations, /// and totally ordered comparison operations @@ -74,17 +75,34 @@ pub trait ArrowNativeTypeOp: ArrowNativeType { fn is_zero(self) -> bool; - fn is_eq(self, rhs: Self) -> bool; - - fn is_ne(self, rhs: Self) -> bool; - - fn is_lt(self, rhs: Self) -> bool; + fn compare(self, rhs: Self) -> Ordering; - fn is_le(self, rhs: Self) -> bool; - - fn is_gt(self, rhs: Self) -> bool; + fn is_eq(self, rhs: Self) -> bool; - fn is_ge(self, rhs: Self) -> bool; + #[inline] + fn is_ne(self, rhs: Self) -> bool { + !self.is_eq(rhs) + } + + #[inline] + fn is_lt(self, rhs: Self) -> bool { + self.compare(rhs).is_lt() + } + + #[inline] + fn is_le(self, rhs: Self) -> bool { + self.compare(rhs).is_le() + } + + #[inline] + fn is_gt(self, rhs: Self) -> bool { + self.compare(rhs).is_gt() + } + + #[inline] + fn is_ge(self, rhs: Self) -> bool { + self.compare(rhs).is_ge() + } } macro_rules! native_type_op { @@ -209,33 +227,13 @@ macro_rules! native_type_op { } #[inline] - fn is_eq(self, rhs: Self) -> bool { - self == rhs - } - - #[inline] - fn is_ne(self, rhs: Self) -> bool { - self != rhs + fn compare(self, rhs: Self) -> Ordering { + self.cmp(&rhs) } #[inline] - fn is_lt(self, rhs: Self) -> bool { - self < rhs - } - - #[inline] - fn is_le(self, rhs: Self) -> bool { - self <= rhs - } - - #[inline] - fn is_gt(self, rhs: Self) -> bool { - self > rhs - } - - #[inline] - fn is_ge(self, rhs: Self) -> bool { - self >= rhs + fn is_eq(self, rhs: Self) -> bool { + self == rhs } } }; @@ -341,6 +339,11 @@ macro_rules! 
native_type_float_op { self == $zero } + #[inline] + fn compare(self, rhs: Self) -> Ordering { + <$t>::total_cmp(&self, &rhs) + } + #[inline] fn is_eq(self, rhs: Self) -> bool { // Equivalent to `self.total_cmp(&rhs).is_eq()` @@ -348,31 +351,6 @@ macro_rules! native_type_float_op { // https://rust.godbolt.org/z/347nWGxoW self.to_bits() == rhs.to_bits() } - - #[inline] - fn is_ne(self, rhs: Self) -> bool { - !self.is_eq(rhs) - } - - #[inline] - fn is_lt(self, rhs: Self) -> bool { - self.total_cmp(&rhs).is_lt() - } - - #[inline] - fn is_le(self, rhs: Self) -> bool { - self.total_cmp(&rhs).is_le() - } - - #[inline] - fn is_gt(self, rhs: Self) -> bool { - self.total_cmp(&rhs).is_gt() - } - - #[inline] - fn is_ge(self, rhs: Self) -> bool { - self.total_cmp(&rhs).is_ge() - } } }; } From 1cb9a4414f521fa8ab7b2e535c9669190448541b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 11 Nov 2022 08:33:59 +1300 Subject: [PATCH 0250/1411] Update hashbrown requirement from 0.12 to 0.13 (#3081) Updates the requirements on [hashbrown](https://github.com/rust-lang/hashbrown) to permit the latest version. - [Release notes](https://github.com/rust-lang/hashbrown/releases) - [Changelog](https://github.com/rust-lang/hashbrown/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-lang/hashbrown/compare/v0.12.0...v0.13.1) --- updated-dependencies: - dependency-name: hashbrown dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-array/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- parquet/Cargo.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 186e88ff147c..1e94ca64d99f 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -52,7 +52,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } -hashbrown = { version = "0.12", default-features = false } +hashbrown = { version = "0.13", default-features = false } [dev-dependencies] rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index d5392673e299..2e33014dbdea 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -56,7 +56,7 @@ arrow-select = { version = "26.0.0", path = "../arrow-select" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } -hashbrown = { version = "0.12", default-features = false } +hashbrown = { version = "0.13", default-features = false } regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 70320ba65901..a414c16665af 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -48,7 +48,7 @@ serde_json = { version = "1.0", default-features = false, features = ["std"], op 
seq-macro = { version = "0.3", default-features = false } futures = { version = "0.3", default-features = false, features = ["std"], optional = true } tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "rt", "io-util"] } -hashbrown = { version = "0.12", default-features = false } +hashbrown = { version = "0.13", default-features = false } [dev-dependencies] base64 = { version = "0.13", default-features = false, features = ["std"] } From ce5e26f395d6a51895d67345bbb5cafba1c56f5e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 10 Nov 2022 11:36:20 -0800 Subject: [PATCH 0251/1411] Use ArrowNativeTypeOp on non-scalar comparison kernels (#3075) * Use ArrowNativeTypeOp on non-scalar comparison kernels * Fix clippy --- arrow/benches/comparison_kernels.rs | 8 ++- arrow/src/compute/kernels/comparison.rs | 84 +++++++++++++++++++++++-- 2 files changed, 85 insertions(+), 7 deletions(-) diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index e2afa99fb2ce..6599e3725aab 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -22,13 +22,14 @@ use criterion::Criterion; extern crate arrow; use arrow::compute::*; -use arrow::datatypes::{ArrowNumericType, IntervalMonthDayNanoType}; +use arrow::datatypes::{ArrowNativeTypeOp, ArrowNumericType, IntervalMonthDayNanoType}; use arrow::util::bench_util::*; use arrow::{array::*, datatypes::Float32Type, datatypes::Int32Type}; fn bench_eq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) where T: ArrowNumericType, + ::Native: ArrowNativeTypeOp, { eq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); } @@ -36,6 +37,7 @@ where fn bench_neq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) where T: ArrowNumericType, + ::Native: ArrowNativeTypeOp, { neq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); } @@ -43,6 +45,7 @@ where fn bench_lt(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) where T: ArrowNumericType, + ::Native: ArrowNativeTypeOp, { lt(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); } @@ -50,6 +53,7 @@ where fn bench_lt_eq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) where T: ArrowNumericType, + ::Native: ArrowNativeTypeOp, { lt_eq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); } @@ -57,6 +61,7 @@ where fn bench_gt(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) where T: ArrowNumericType, + ::Native: ArrowNativeTypeOp, { gt(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); } @@ -64,6 +69,7 @@ where fn bench_gt_eq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) where T: ArrowNumericType, + ::Native: ArrowNativeTypeOp, { gt_eq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); } diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 1806c447b1b9..9d89287eebf1 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -3038,14 +3038,20 @@ pub fn gt_eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { } /// Perform `left == right` operation on two [`PrimitiveArray`]s. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. 
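/// For example, a minimal sketch (imports as re-exported by the `arrow` crate; without
/// the `simd` feature, NaN compares equal to NaN under the total order):
///
/// ```
/// # use arrow::array::{BooleanArray, Float64Array};
/// # use arrow::compute::eq;
/// let a = Float64Array::from(vec![f64::NAN, 1.0]);
/// let b = Float64Array::from(vec![f64::NAN, 2.0]);
/// assert_eq!(eq(&a, &b).unwrap(), BooleanArray::from(vec![true, false]));
/// ```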
pub fn eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] return simd_compare_op(left, right, T::eq, |a, b| a == b); #[cfg(not(feature = "simd"))] - return compare_op(left, right, |a, b| a == b); + return compare_op(left, right, |a, b| a.is_eq(b)); } /// Perform `left == right` operation on a [`PrimitiveArray`] and a scalar value. @@ -3075,14 +3081,20 @@ where } /// Perform `left != right` operation on two [`PrimitiveArray`]s. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn neq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] return simd_compare_op(left, right, T::ne, |a, b| a != b); #[cfg(not(feature = "simd"))] - return compare_op(left, right, |a, b| a != b); + return compare_op(left, right, |a, b| a.is_ne(b)); } /// Perform `left != right` operation on a [`PrimitiveArray`] and a scalar value. @@ -3104,14 +3116,20 @@ where /// Perform `left < right` operation on two [`PrimitiveArray`]s. Null values are less than non-null /// values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn lt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] return simd_compare_op(left, right, T::lt, |a, b| a < b); #[cfg(not(feature = "simd"))] - return compare_op(left, right, |a, b| a < b); + return compare_op(left, right, |a, b| a.is_lt(b)); } /// Perform `left < right` operation on a [`PrimitiveArray`] and a scalar value. @@ -3134,17 +3152,23 @@ where /// Perform `left <= right` operation on two [`PrimitiveArray`]s. Null values are less than non-null /// values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn lt_eq( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result where T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] return simd_compare_op(left, right, T::le, |a, b| a <= b); #[cfg(not(feature = "simd"))] - return compare_op(left, right, |a, b| a <= b); + return compare_op(left, right, |a, b| a.is_le(b)); } /// Perform `left <= right` operation on a [`PrimitiveArray`] and a scalar value. @@ -3167,14 +3191,20 @@ where /// Perform `left > right` operation on two [`PrimitiveArray`]s. Non-null values are greater than null /// values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. 
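/// For example, a minimal sketch (imports as re-exported by the `arrow` crate; without
/// the `simd` feature, NaN is ordered above every other value, including infinity):
///
/// ```
/// # use arrow::array::{BooleanArray, Float64Array};
/// # use arrow::compute::gt;
/// let a = Float64Array::from(vec![f64::NAN]);
/// let b = Float64Array::from(vec![f64::INFINITY]);
/// assert_eq!(gt(&a, &b).unwrap(), BooleanArray::from(vec![true]));
/// ```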
pub fn gt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result where T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] return simd_compare_op(left, right, T::gt, |a, b| a > b); #[cfg(not(feature = "simd"))] - return compare_op(left, right, |a, b| a > b); + return compare_op(left, right, |a, b| a.is_gt(b)); } /// Perform `left > right` operation on a [`PrimitiveArray`] and a scalar value. @@ -3197,17 +3227,23 @@ where /// Perform `left >= right` operation on two [`PrimitiveArray`]s. Non-null values are greater than null /// values. +/// +/// If `simd` feature flag is not enabled: +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn gt_eq( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result where T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, { #[cfg(feature = "simd")] return simd_compare_op(left, right, T::ge, |a, b| a >= b); #[cfg(not(feature = "simd"))] - return compare_op(left, right, |a, b| a >= b); + return compare_op(left, right, |a, b| a.is_ge(b)); } /// Perform `left >= right` operation on a [`PrimitiveArray`] and a scalar value. @@ -5805,11 +5841,17 @@ mod tests { ); assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); + #[cfg(not(feature = "simd"))] + assert_eq!(eq(&array1, &array2).unwrap(), expected); + let expected = BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(false), Some(false)], ); assert_eq!(neq_dyn(&array1, &array2).unwrap(), expected); + #[cfg(not(feature = "simd"))] + assert_eq!(neq(&array1, &array2).unwrap(), expected); + let array1: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 10.0] .into_iter() .map(Some) @@ -5824,10 +5866,16 @@ mod tests { ); assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); + #[cfg(not(feature = "simd"))] + assert_eq!(eq(&array1, &array2).unwrap(), expected); + let expected = BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(false), Some(false)], ); assert_eq!(neq_dyn(&array1, &array2).unwrap(), expected); + + #[cfg(not(feature = "simd"))] + assert_eq!(neq(&array1, &array2).unwrap(), expected); } #[test] @@ -5846,11 +5894,17 @@ mod tests { ); assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); + #[cfg(not(feature = "simd"))] + assert_eq!(lt(&array1, &array2).unwrap(), expected); + let expected = BooleanArray::from( vec![Some(true), Some(true), Some(true), Some(true), Some(false), Some(false)], ); assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); + #[cfg(not(feature = "simd"))] + assert_eq!(lt_eq(&array1, &array2).unwrap(), expected); + let array1: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 11.0, f64::NAN] .into_iter() .map(Some) @@ -5865,10 +5919,16 @@ mod tests { ); assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); + #[cfg(not(feature = "simd"))] + assert_eq!(lt(&array1, &array2).unwrap(), expected); + let expected = BooleanArray::from( vec![Some(true), Some(true), Some(true), Some(true), Some(false), Some(false)], ); assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); + + #[cfg(not(feature = "simd"))] + assert_eq!(lt_eq(&array1, &array2).unwrap(), expected); } #[test] @@ -5887,11 +5947,17 @@ mod tests { ); assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); + #[cfg(not(feature = "simd"))] + assert_eq!(gt(&array1, &array2).unwrap(), expected); + let expected = BooleanArray::from( vec![Some(true), Some(false), 
Some(true), Some(false), Some(true), Some(true)], ); assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); + #[cfg(not(feature = "simd"))] + assert_eq!(gt_eq(&array1, &array2).unwrap(), expected); + let array1: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 11.0, f64::NAN] .into_iter() .map(Some) @@ -5906,10 +5972,16 @@ mod tests { ); assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); + #[cfg(not(feature = "simd"))] + assert_eq!(gt(&array1, &array2).unwrap(), expected); + let expected = BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(false), Some(true), Some(true)], ); assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); + + #[cfg(not(feature = "simd"))] + assert_eq!(gt_eq(&array1, &array2).unwrap(), expected); } #[test] From ed20bf1431de784c6193cf2e21bcc6d178aa5de1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 11 Nov 2022 09:41:46 +1300 Subject: [PATCH 0252/1411] Recurse into Dictionary value type in DataType::is_nested (#3083) --- arrow-schema/src/datatype.rs | 47 +++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 9037f7c9a53c..759fc39646c1 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -328,18 +328,20 @@ impl DataType { ) } - /// Returns true if this type is nested (List, FixedSizeList, LargeList, Struct, Union, or Map) + /// Returns true if this type is nested (List, FixedSizeList, LargeList, Struct, Union, + /// or Map), or a dictionary of a nested type pub fn is_nested(t: &DataType) -> bool { use DataType::*; - matches!( - t, + match t { + Dictionary(_, v) => DataType::is_nested(v.as_ref()), List(_) - | FixedSizeList(_, _) - | LargeList(_) - | Struct(_) - | Union(_, _, _) - | Map(_, _) - ) + | FixedSizeList(_, _) + | LargeList(_) + | Struct(_) + | Union(_, _, _) + | Map(_, _) => true, + _ => false, + } } /// Compares the datatype with another, ignoring nested field names @@ -489,4 +491,31 @@ mod tests { ), ]); } + + #[test] + fn test_nested() { + let list = DataType::List(Box::new(Field::new("foo", DataType::Utf8, true))); + + assert!(!DataType::is_nested(&DataType::Boolean)); + assert!(!DataType::is_nested(&DataType::Int32)); + assert!(!DataType::is_nested(&DataType::Utf8)); + assert!(DataType::is_nested(&list)); + + assert!(!DataType::is_nested(&DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Boolean) + ))); + assert!(!DataType::is_nested(&DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Int64) + ))); + assert!(!DataType::is_nested(&DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::LargeUtf8) + ))); + assert!(DataType::is_nested(&DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(list) + ))); + } } From 8d364fe430c39d99ed68665c8c4223e02f54ab56 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Thu, 10 Nov 2022 21:24:27 +0000 Subject: [PATCH 0253/1411] early type checks in `RowConverter` (#3080) * refactor: remove duplicate code Decimal types are already handled by `downcast_primitive`. * refactor: check supported types when creating `RowConverter` Check supported row format types when creating the converter instead of during conversion. Also add an additional method `RowConverter::supports_fields` to check types w/o relying on an error. Closes #3077. 
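A minimal sketch of the resulting API (import paths as re-exported by the `arrow`
crate; the list field name below is illustrative):

    use arrow::datatypes::{DataType, Field};
    use arrow::row::{RowConverter, SortField};

    // Flat types are supported by the row format, so construction succeeds
    assert!(RowConverter::supports_fields(&[SortField::new(DataType::Int32)]));
    assert!(RowConverter::new(vec![SortField::new(DataType::Int32)]).is_ok());

    // Nested types such as lists are rejected up front instead of failing later
    let nested = SortField::new(DataType::List(Box::new(
        Field::new("item", DataType::Int32, true),
    )));
    assert!(!RowConverter::supports_fields(std::slice::from_ref(&nested)));
    assert!(RowConverter::new(vec![nested]).is_err());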
* Simplify Co-authored-by: Raphael Taylor-Davies --- arrow/benches/lexsort.rs | 2 +- arrow/benches/row_format.rs | 4 +- arrow/src/row/dictionary.rs | 17 +++----- arrow/src/row/mod.rs | 83 +++++++++++++++++++++++-------------- 4 files changed, 61 insertions(+), 45 deletions(-) diff --git a/arrow/benches/lexsort.rs b/arrow/benches/lexsort.rs index aebb588cf9cc..5c161ec8df0f 100644 --- a/arrow/benches/lexsort.rs +++ b/arrow/benches/lexsort.rs @@ -105,7 +105,7 @@ fn do_bench(c: &mut Criterion, columns: &[Column], len: usize) { .iter() .map(|a| SortField::new(a.data_type().clone())) .collect(); - let mut converter = RowConverter::new(fields); + let mut converter = RowConverter::new(fields).unwrap(); let rows = converter.convert_columns(&arrays).unwrap(); let mut sort: Vec<_> = rows.iter().enumerate().collect(); sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs index 48bb013116b6..ac9f3106f7e7 100644 --- a/arrow/benches/row_format.rs +++ b/arrow/benches/row_format.rs @@ -38,12 +38,12 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { c.bench_function(&format!("convert_columns {}", name), |b| { b.iter(|| { - let mut converter = RowConverter::new(fields.clone()); + let mut converter = RowConverter::new(fields.clone()).unwrap(); black_box(converter.convert_columns(&cols).unwrap()) }); }); - let mut converter = RowConverter::new(fields); + let mut converter = RowConverter::new(fields).unwrap(); let rows = converter.convert_columns(&cols).unwrap(); // using a pre-prepared row converter should be faster than the first time c.bench_function(&format!("convert_columns_prepared {}", name), |b| { diff --git a/arrow/src/row/dictionary.rs b/arrow/src/row/dictionary.rs index 950a7d8972f8..d8426ad0c3e6 100644 --- a/arrow/src/row/dictionary.rs +++ b/arrow/src/row/dictionary.rs @@ -33,8 +33,8 @@ use std::collections::HashMap; pub fn compute_dictionary_mapping( interner: &mut OrderPreservingInterner, values: &ArrayRef, -) -> Result>, ArrowError> { - Ok(downcast_primitive_array! { +) -> Vec> { + downcast_primitive_array! { values => interner .intern(values.iter().map(|x| x.map(|x| x.encode()))), DataType::Binary => { @@ -53,8 +53,8 @@ pub fn compute_dictionary_mapping( let iter = as_largestring_array(values).iter().map(|x| x.map(|x| x.as_bytes())); interner.intern(iter) } - t => return Err(ArrowError::NotYetImplemented(format!("dictionary value {} is not supported", t))), - }) + _ => unreachable!(), + } } /// Dictionary types are encoded as @@ -173,18 +173,11 @@ pub unsafe fn decode_dictionary( value_type => (decode_primitive_helper, values, value_type), DataType::Null => NullArray::new(values.len()).into_data(), DataType::Boolean => decode_bool(&values), - DataType::Decimal128(_, _) => decode_primitive_helper!(Decimal128Type, values, value_type), - DataType::Decimal256(_, _) => decode_primitive_helper!(Decimal256Type, values, value_type), DataType::Utf8 => decode_string::(&values), DataType::LargeUtf8 => decode_string::(&values), DataType::Binary => decode_binary::(&values), DataType::LargeBinary => decode_binary::(&values), - _ => { - return Err(ArrowError::NotYetImplemented(format!( - "decoding dictionary values of {}", - value_type - ))) - } + _ => unreachable!(), }; let data_type = diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index e0312be1f8da..4fbaa3931b08 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -40,7 +40,7 @@ //! let mut converter = RowConverter::new(vec![ //! 
SortField::new(DataType::Int32), //! SortField::new(DataType::Utf8), -//! ]); +//! ]).unwrap(); //! let rows = converter.convert_columns(&arrays).unwrap(); //! //! // Compare rows @@ -83,7 +83,7 @@ //! .iter() //! .map(|a| SortField::new(a.data_type().clone())) //! .collect(); -//! let mut converter = RowConverter::new(fields); +//! let mut converter = RowConverter::new(fields).unwrap(); //! let rows = converter.convert_columns(&arrays).unwrap(); //! let mut sort: Vec<_> = rows.iter().enumerate().collect(); //! sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); @@ -231,12 +231,24 @@ impl SortField { impl RowConverter { /// Create a new [`RowConverter`] with the provided schema - pub fn new(fields: Vec) -> Self { + pub fn new(fields: Vec) -> Result { + if !Self::supports_fields(&fields) { + return Err(ArrowError::NotYetImplemented(format!( + "not yet implemented: {:?}", + fields + ))); + } + let interners = (0..fields.len()).map(|_| None).collect(); - Self { + Ok(Self { fields: fields.into(), interners, - } + }) + } + + /// Check if the given fields are supported by the row format. + pub fn supports_fields(fields: &[SortField]) -> bool { + fields.iter().all(|x| !DataType::is_nested(&x.data_type)) } /// Convert [`ArrayRef`] columns into [`Rows`] @@ -275,7 +287,7 @@ impl RowConverter { let interner = interner.get_or_insert_with(Default::default); - let mapping: Vec<_> = compute_dictionary_mapping(interner, values)? + let mapping: Vec<_> = compute_dictionary_mapping(interner, values) .into_iter() .map(|maybe_interned| { maybe_interned.map(|interned| interner.normalized_key(interned)) @@ -286,7 +298,7 @@ impl RowConverter { }) .collect::>>()?; - let mut rows = new_empty_rows(columns, &dictionaries, Arc::clone(&self.fields))?; + let mut rows = new_empty_rows(columns, &dictionaries, Arc::clone(&self.fields)); for ((column, field), dictionary) in columns.iter().zip(self.fields.iter()).zip(dictionaries) @@ -492,7 +504,7 @@ fn new_empty_rows( cols: &[ArrayRef], dictionaries: &[Option>>], fields: Arc<[SortField]>, -) -> Result { +) -> Rows { use fixed::FixedLengthEncoding; let num_rows = cols.first().map(|x| x.len()).unwrap_or(0); @@ -535,7 +547,7 @@ fn new_empty_rows( } _ => unreachable!(), } - t => return Err(ArrowError::NotYetImplemented(format!("not yet implemented: {}", t))) + _ => unreachable!(), } } @@ -565,11 +577,11 @@ fn new_empty_rows( let buffer = vec![0_u8; cur_offset]; - Ok(Rows { + Rows { buffer: buffer.into(), offsets: offsets.into(), fields, - }) + } } /// Encodes a column to the provided [`Rows`] incrementing the offsets as it progresses @@ -605,7 +617,7 @@ fn encode_column( column => encode_dictionary(out, column, dictionary.unwrap(), opts), _ => unreachable!() } - t => unimplemented!("not yet implemented: {}", t) + _ => unreachable!(), } } @@ -747,7 +759,8 @@ mod tests { let mut converter = RowConverter::new(vec![ SortField::new(DataType::Int16), SortField::new(DataType::Float32), - ]); + ]) + .unwrap(); let rows = converter.convert_columns(&cols).unwrap(); assert_eq!(rows.offsets.as_ref(), &[0, 8, 16, 24, 32, 40, 48, 56]); @@ -787,7 +800,8 @@ mod tests { fn test_decimal128() { let mut converter = RowConverter::new(vec![SortField::new( DataType::Decimal128(DECIMAL128_MAX_PRECISION, 7), - )]); + )]) + .unwrap(); let col = Arc::new( Decimal128Array::from_iter([ None, @@ -815,7 +829,8 @@ mod tests { fn test_decimal256() { let mut converter = RowConverter::new(vec![SortField::new( DataType::Decimal256(DECIMAL256_MAX_PRECISION, 7), - )]); + )]) + .unwrap(); let col = Arc::new( 
Decimal256Array::from_iter([ None, @@ -843,7 +858,8 @@ mod tests { #[test] fn test_bool() { - let mut converter = RowConverter::new(vec![SortField::new(DataType::Boolean)]); + let mut converter = + RowConverter::new(vec![SortField::new(DataType::Boolean)]).unwrap(); let col = Arc::new(BooleanArray::from_iter([None, Some(false), Some(true)])) as ArrayRef; @@ -862,7 +878,8 @@ mod tests { descending: true, nulls_first: false, }, - )]); + )]) + .unwrap(); let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); assert!(rows.row(2) < rows.row(1)); @@ -879,7 +896,7 @@ mod tests { let d = a.data_type().clone(); let mut converter = - RowConverter::new(vec![SortField::new(a.data_type().clone())]); + RowConverter::new(vec![SortField::new(a.data_type().clone())]).unwrap(); let rows = converter.convert_columns(&[Arc::new(a) as _]).unwrap(); let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); @@ -905,7 +922,7 @@ mod tests { ); assert_eq!(dict_with_tz.data_type(), &d); - let mut converter = RowConverter::new(vec![SortField::new(d.clone())]); + let mut converter = RowConverter::new(vec![SortField::new(d.clone())]).unwrap(); let rows = converter .convert_columns(&[Arc::new(dict_with_tz) as _]) .unwrap(); @@ -917,7 +934,8 @@ mod tests { #[test] fn test_null_encoding() { let col = Arc::new(NullArray::new(10)); - let mut converter = RowConverter::new(vec![SortField::new(DataType::Null)]); + let mut converter = + RowConverter::new(vec![SortField::new(DataType::Null)]).unwrap(); let rows = converter.convert_columns(&[col]).unwrap(); assert_eq!(rows.num_rows(), 10); assert_eq!(rows.row(1).data.len(), 0); @@ -933,7 +951,8 @@ mod tests { Some(""), ])) as ArrayRef; - let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]); + let mut converter = + RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap(); let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); assert!(rows.row(1) < rows.row(0)); @@ -958,7 +977,8 @@ mod tests { Some(vec![0xFF_u8; variable::BLOCK_SIZE + 1]), ])) as ArrayRef; - let mut converter = RowConverter::new(vec![SortField::new(DataType::Binary)]); + let mut converter = + RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap(); let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); for i in 0..rows.num_rows() { @@ -983,7 +1003,8 @@ mod tests { descending: true, nulls_first: false, }, - )]); + )]) + .unwrap(); let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); for i in 0..rows.num_rows() { @@ -1017,7 +1038,7 @@ mod tests { ])) as ArrayRef; let mut converter = - RowConverter::new(vec![SortField::new(a.data_type().clone())]); + RowConverter::new(vec![SortField::new(a.data_type().clone())]).unwrap(); let rows_a = converter.convert_columns(&[Arc::clone(&a)]).unwrap(); assert!(rows_a.row(3) < rows_a.row(5)); @@ -1052,7 +1073,8 @@ mod tests { descending: true, nulls_first: false, }, - )]); + )]) + .unwrap(); let rows_c = converter.convert_columns(&[Arc::clone(&a)]).unwrap(); assert!(rows_c.row(3) > rows_c.row(5)); @@ -1078,7 +1100,7 @@ mod tests { let a = builder.finish(); let mut converter = - RowConverter::new(vec![SortField::new(a.data_type().clone())]); + RowConverter::new(vec![SortField::new(a.data_type().clone())]).unwrap(); let rows = converter.convert_columns(&[Arc::new(a)]).unwrap(); assert!(rows.row(0) < rows.row(1)); assert!(rows.row(2) < rows.row(0)); @@ -1104,7 +1126,7 @@ mod tests { .build() .unwrap(); - let mut converter = 
RowConverter::new(vec![SortField::new(data_type)]); + let mut converter = RowConverter::new(vec![SortField::new(data_type)]).unwrap(); let rows = converter .convert_columns(&[Arc::new(DictionaryArray::::from(data))]) .unwrap(); @@ -1119,10 +1141,11 @@ mod tests { #[should_panic(expected = "rows were not produced by this RowConverter")] fn test_different_converter() { let values = Arc::new(Int32Array::from_iter([Some(1), Some(-1)])); - let mut converter = RowConverter::new(vec![SortField::new(DataType::Int32)]); + let mut converter = + RowConverter::new(vec![SortField::new(DataType::Int32)]).unwrap(); let rows = converter.convert_columns(&[values]).unwrap(); - let converter = RowConverter::new(vec![SortField::new(DataType::Int32)]); + let converter = RowConverter::new(vec![SortField::new(DataType::Int32)]).unwrap(); let _ = converter.convert_rows(&rows); } @@ -1266,7 +1289,7 @@ mod tests { .map(|(o, a)| SortField::new_with_options(a.data_type().clone(), o)) .collect(); - let mut converter = RowConverter::new(columns); + let mut converter = RowConverter::new(columns).unwrap(); let rows = converter.convert_columns(&arrays).unwrap(); for i in 0..len { From 132152cb8db5085163ee0f21d24fc867716ba6d5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 11 Nov 2022 10:28:09 +1300 Subject: [PATCH 0254/1411] Update arrow-flight subcrates (#3044) (#3052) --- .github/workflows/arrow_flight.yml | 3 --- arrow-flight/Cargo.toml | 5 ++++- arrow-flight/src/lib.rs | 13 +++++++------ arrow-flight/src/sql/mod.rs | 17 ++++++++--------- arrow-flight/src/sql/server.rs | 2 +- arrow-flight/src/utils.rs | 19 ++++++++----------- 6 files changed, 28 insertions(+), 31 deletions(-) diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index 2825d2400f1f..ab7030b05e3c 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -27,7 +27,6 @@ on: - master pull_request: paths: - - arrow/** - arrow-array/** - arrow-buffer/** - arrow-cast/** @@ -36,8 +35,6 @@ on: - arrow-select/** - arrow-flight/** - arrow-ipc/** - - arrow-csv/** - - arrow-json/** - .github/** jobs: diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 394fb98c3b2c..085c8c50613e 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -27,7 +27,10 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow = { path = "../arrow", version = "26.0.0", default-features = false, features = ["ipc"] } +arrow-array = { version = "26.0.0", path = "../arrow-array" } +arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } +arrow-ipc = { version = "26.0.0", path = "../arrow-ipc" } +arrow-schema = { version = "26.0.0", path = "../arrow-schema" } base64 = { version = "0.13", default-features = false } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 1f4bcc6c434c..e742dbbe1a72 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -17,17 +17,18 @@ #![allow(rustdoc::invalid_html_tags)] -use arrow::datatypes::Schema; -use arrow::error::{ArrowError, Result as ArrowResult}; -use arrow::ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; +use arrow_ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; +use arrow_schema::{ArrowError, Schema}; -use 
arrow::ipc::convert::try_schema_from_ipc_buffer; +use arrow_ipc::convert::try_schema_from_ipc_buffer; use std::{ convert::{TryFrom, TryInto}, fmt, ops::Deref, }; +type ArrowResult = std::result::Result; + #[allow(clippy::derive_partial_eq_without_eq)] mod gen { @@ -399,8 +400,8 @@ impl<'a> SchemaAsIpc<'a> { #[cfg(test)] mod tests { use super::*; - use arrow::datatypes::{DataType, Field, TimeUnit}; - use arrow::ipc::MetadataVersion; + use arrow_ipc::MetadataVersion; + use arrow_schema::{DataType, Field, TimeUnit}; struct TestVector(Vec, usize); diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index 30bdcb5604ff..a5d4c4c3436c 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::error::{ArrowError, Result as ArrowResult}; +use arrow_schema::ArrowError; use prost::Message; mod gen { @@ -122,10 +122,10 @@ pub trait ProstAnyExt { /// /// * `Ok(None)` when message type mismatch /// * `Err` when parse failed - fn unpack(&self) -> ArrowResult>; + fn unpack(&self) -> Result, ArrowError>; /// Pack any message into `prost_types::Any` value. - fn pack(message: &M) -> ArrowResult; + fn pack(message: &M) -> Result; } impl ProstAnyExt for prost_types::Any { @@ -133,7 +133,7 @@ impl ProstAnyExt for prost_types::Any { M::type_url() == self.type_url } - fn unpack(&self) -> ArrowResult> { + fn unpack(&self) -> Result, ArrowError> { if !self.is::() { return Ok(None); } @@ -143,7 +143,7 @@ impl ProstAnyExt for prost_types::Any { Ok(Some(m)) } - fn pack(message: &M) -> ArrowResult { + fn pack(message: &M) -> Result { Ok(message.as_any()) } } @@ -165,14 +165,13 @@ mod tests { } #[test] - fn test_prost_any_pack_unpack() -> ArrowResult<()> { + fn test_prost_any_pack_unpack() { let query = CommandStatementQuery { query: "select 1".to_string(), }; - let any = prost_types::Any::pack(&query)?; + let any = prost_types::Any::pack(&query).unwrap(); assert!(any.is::()); - let unpack_query: CommandStatementQuery = any.unpack()?.unwrap(); + let unpack_query: CommandStatementQuery = any.unpack().unwrap().unwrap(); assert_eq!(query, unpack_query); - Ok(()) } } diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index 525c721aa2b5..d78474849af0 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -589,6 +589,6 @@ fn decode_error_to_status(err: prost::DecodeError) -> Status { Status::invalid_argument(format!("{:?}", err)) } -fn arrow_error_to_status(err: arrow::error::ArrowError) -> Status { +fn arrow_error_to_status(err: arrow_schema::ArrowError) -> Status { Status::internal(format!("{:?}", err)) } diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 4a30b2d5aef8..49f9c47db6d0 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -20,13 +20,10 @@ use crate::{FlightData, IpcMessage, SchemaAsIpc, SchemaResult}; use std::collections::HashMap; -use arrow::array::ArrayRef; -use arrow::buffer::Buffer; -use arrow::datatypes::{Schema, SchemaRef}; -use arrow::error::{ArrowError, Result}; -use arrow::ipc::{reader, writer, writer::IpcWriteOptions}; -use arrow::record_batch::RecordBatch; -use std::convert::TryInto; +use arrow_array::{ArrayRef, RecordBatch}; +use arrow_buffer::Buffer; +use arrow_ipc::{reader, writer, writer::IpcWriteOptions}; +use arrow_schema::{ArrowError, Schema, SchemaRef}; /// Convert a `RecordBatch` to a vector of `FlightData` representing the bytes of the dictionaries /// 
and a `FlightData` representing the bytes of the batch's values @@ -52,9 +49,9 @@ pub fn flight_data_to_arrow_batch( data: &FlightData, schema: SchemaRef, dictionaries_by_id: &HashMap, -) -> Result { +) -> Result { // check that the data_header is a record batch message - let message = arrow::ipc::root_as_message(&data.data_header[..]).map_err(|err| { + let message = arrow_ipc::root_as_message(&data.data_header[..]).map_err(|err| { ArrowError::ParseError(format!("Unable to get root as message: {:?}", err)) })?; @@ -85,7 +82,7 @@ pub fn flight_data_to_arrow_batch( pub fn flight_schema_from_arrow_schema( schema: &Schema, options: &IpcWriteOptions, -) -> Result { +) -> Result { SchemaAsIpc::new(schema, options).try_into() } @@ -109,7 +106,7 @@ pub fn flight_data_from_arrow_schema( pub fn ipc_message_from_arrow_schema( schema: &Schema, options: &IpcWriteOptions, -) -> Result> { +) -> Result, ArrowError> { let message = SchemaAsIpc::new(schema, options).try_into()?; let IpcMessage(vals) = message; Ok(vals) From 4dd7fea13dd2ac62f179fc7fc245037e7036afc9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 11 Nov 2022 10:28:42 +1300 Subject: [PATCH 0255/1411] Update parquet to depend on arrow subcrates (#3028) * Update parquet to depend on arrow subcrates (#3044) * Fix parquet_derive * Fix parquet_derive * Fix no-default-features test compilation * Fix parquet-fromcsv * Clippy --- parquet/Cargo.toml | 16 +++++-- parquet/src/arrow/array_reader/builder.rs | 2 +- parquet/src/arrow/array_reader/byte_array.rs | 8 ++-- .../array_reader/byte_array_dictionary.rs | 25 +++------- parquet/src/arrow/array_reader/empty_array.rs | 5 +- .../array_reader/fixed_len_byte_array.rs | 15 +++--- parquet/src/arrow/array_reader/list_array.rs | 16 ++++--- parquet/src/arrow/array_reader/map_array.rs | 16 +++---- parquet/src/arrow/array_reader/mod.rs | 4 +- parquet/src/arrow/array_reader/null_array.rs | 8 ++-- .../src/arrow/array_reader/primitive_array.rs | 21 +++++---- .../src/arrow/array_reader/struct_array.rs | 9 ++-- parquet/src/arrow/array_reader/test_util.rs | 4 +- parquet/src/arrow/arrow_reader/filter.rs | 13 +++-- parquet/src/arrow/arrow_reader/mod.rs | 43 +++++++++-------- parquet/src/arrow/arrow_reader/selection.rs | 4 +- parquet/src/arrow/arrow_writer/byte_array.rs | 6 +-- parquet/src/arrow/arrow_writer/levels.rs | 47 +++++++++---------- parquet/src/arrow/arrow_writer/mod.rs | 21 ++++----- parquet/src/arrow/async_reader.rs | 8 ++-- parquet/src/arrow/buffer/bit_util.rs | 4 +- parquet/src/arrow/buffer/dictionary_buffer.rs | 11 +++-- parquet/src/arrow/buffer/offset_buffer.rs | 9 ++-- parquet/src/arrow/mod.rs | 12 ++--- parquet/src/arrow/record_reader/buffer.rs | 3 +- .../arrow/record_reader/definition_levels.rs | 8 ++-- parquet/src/arrow/record_reader/mod.rs | 6 +-- parquet/src/arrow/schema.rs | 10 ++-- parquet/src/arrow/schema/complex.rs | 2 +- parquet/src/arrow/schema/primitive.rs | 2 +- parquet/src/bin/parquet-fromcsv.rs | 3 +- parquet/src/column/reader.rs | 1 + parquet/src/column/writer/encoder.rs | 6 +-- parquet/src/compression.rs | 4 +- parquet/src/errors.rs | 16 +++---- parquet/src/file/serialized_reader.rs | 1 + parquet/src/lib.rs | 2 +- parquet/src/util/interner.rs | 1 + parquet_derive/Cargo.toml | 6 +-- 39 files changed, 199 insertions(+), 199 deletions(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index a414c16665af..65c4009d3c19 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -30,6 +30,15 @@ edition = "2021" rust-version = 
"1.62" [dependencies] +arrow-array = { version = "26.0.0", path = "../arrow-array", default-features = false, optional = true } +arrow-buffer = { version = "26.0.0", path = "../arrow-buffer", default-features = false, optional = true } +arrow-cast = { version = "26.0.0", path = "../arrow-cast", default-features = false, optional = true } +arrow-csv = { version = "26.0.0", path = "../arrow-csv", default-features = false, optional = true } +arrow-data = { version = "26.0.0", path = "../arrow-data", default-features = false, optional = true } +arrow-schema = { version = "26.0.0", path = "../arrow-schema", default-features = false, optional = true } +arrow-select = { version = "26.0.0", path = "../arrow-select", default-features = false, optional = true } +arrow-ipc = { version = "26.0.0", path = "../arrow-ipc", default-features = false, optional = true } + ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } bytes = { version = "1.1", default-features = false, features = ["std"] } thrift = { version = "0.16", default-features = false } @@ -41,7 +50,6 @@ zstd = { version = "0.11.1", optional = true, default-features = false } chrono = { version = "0.4", default-features = false, features = ["alloc"] } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } -arrow = { path = "../arrow", version = "26.0.0", optional = true, default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false, features = ["std"], optional = true } clap = { version = "4", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } @@ -70,9 +78,9 @@ all-features = true [features] default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"] # Enable arrow reader/writer APIs -arrow = ["dep:arrow", "base64"] +arrow = ["base64", "arrow-array", "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", "arrow-select", "arrow-ipc"] # Enable CLI tools -cli = ["json", "base64", "clap", "arrow/csv"] +cli = ["json", "base64", "clap", "arrow-csv"] # Enable JSON APIs json = ["serde_json", "base64"] # Enable internal testing APIs @@ -100,7 +108,7 @@ required-features = ["cli"] [[bin]] name = "parquet-fromcsv" -required-features = ["cli"] +required-features = ["arrow", "cli"] [[bench]] name = "arrow_writer" diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs index c0216466d489..246bccfece4e 100644 --- a/parquet/src/arrow/array_reader/builder.rs +++ b/parquet/src/arrow/array_reader/builder.rs @@ -17,7 +17,7 @@ use std::sync::Arc; -use arrow::datatypes::DataType; +use arrow_schema::DataType; use crate::arrow::array_reader::empty_array::make_empty_array_reader; use crate::arrow::array_reader::fixed_len_byte_array::make_fixed_len_byte_array_reader; diff --git a/parquet/src/arrow/array_reader/byte_array.rs b/parquet/src/arrow/array_reader/byte_array.rs index 4bf4dee0d0b2..22fa0ab45a20 100644 --- a/parquet/src/arrow/array_reader/byte_array.rs +++ b/parquet/src/arrow/array_reader/byte_array.rs @@ -30,9 +30,9 @@ use crate::encodings::decoding::{Decoder, DeltaBitPackDecoder}; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use crate::util::memory::ByteBufferPtr; -use arrow::array::{Array, ArrayRef, BinaryArray, Decimal128Array, OffsetSizeTrait}; -use arrow::buffer::Buffer; 
-use arrow::datatypes::DataType as ArrowType; +use arrow_array::{Array, ArrayRef, BinaryArray, Decimal128Array, OffsetSizeTrait}; +use arrow_buffer::Buffer; +use arrow_schema::DataType as ArrowType; use std::any::Any; use std::ops::Range; use std::sync::Arc; @@ -587,7 +587,7 @@ mod tests { use super::*; use crate::arrow::array_reader::test_util::{byte_array_all_encodings, utf8_column}; use crate::arrow::record_reader::buffer::ValuesBuffer; - use arrow::array::{Array, StringArray}; + use arrow_array::{Array, StringArray}; #[test] fn test_byte_array_decoder() { diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs index 0a5d94fa6ae8..c4ed7e9070cc 100644 --- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs +++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs @@ -20,9 +20,9 @@ use std::marker::PhantomData; use std::ops::Range; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; -use arrow::buffer::Buffer; -use arrow::datatypes::{ArrowNativeType, DataType as ArrowType}; +use arrow_array::{Array, ArrayRef, OffsetSizeTrait}; +use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_schema::DataType as ArrowType; use crate::arrow::array_reader::byte_array::{ByteArrayDecoder, ByteArrayDecoderPlain}; use crate::arrow::array_reader::{read_records, skip_records, ArrayReader}; @@ -188,15 +188,11 @@ where } fn get_def_levels(&self) -> Option<&[i16]> { - self.def_levels_buffer - .as_ref() - .map(|buf| buf.typed_data()) + self.def_levels_buffer.as_ref().map(|buf| buf.typed_data()) } fn get_rep_levels(&self) -> Option<&[i16]> { - self.rep_levels_buffer - .as_ref() - .map(|buf| buf.typed_data()) + self.rep_levels_buffer.as_ref().map(|buf| buf.typed_data()) } } @@ -395,7 +391,7 @@ where #[cfg(test)] mod tests { - use arrow::array::{Array, StringArray}; + use arrow_array::{Array, StringArray}; use arrow::compute::cast; use crate::arrow::array_reader::test_util::{ @@ -528,13 +524,7 @@ mod tests { assert_eq!( strings.iter().collect::>(), - vec![ - Some("0"), - Some("1"), - Some("1"), - Some("2"), - Some("2"), - ] + vec![Some("0"), Some("1"), Some("1"), Some("2"), Some("2"),] ) } @@ -625,7 +615,6 @@ mod tests { } } - #[test] fn test_too_large_dictionary() { let data: Vec<_> = (0..128) diff --git a/parquet/src/arrow/array_reader/empty_array.rs b/parquet/src/arrow/array_reader/empty_array.rs index abe839b9dc29..2a3711fa0309 100644 --- a/parquet/src/arrow/array_reader/empty_array.rs +++ b/parquet/src/arrow/array_reader/empty_array.rs @@ -17,8 +17,9 @@ use crate::arrow::array_reader::ArrayReader; use crate::errors::Result; -use arrow::array::{ArrayDataBuilder, ArrayRef, StructArray}; -use arrow::datatypes::DataType as ArrowType; +use arrow_schema::DataType as ArrowType; +use arrow_array::{ArrayRef, StructArray}; +use arrow_data::ArrayDataBuilder; use std::any::Any; use std::sync::Arc; diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index ba3a02c4f6b7..e8d426d3a850 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -27,12 +27,13 @@ use crate::column::reader::decoder::{ColumnValueDecoder, ValuesBufferSlice}; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use crate::util::memory::ByteBufferPtr; -use arrow::array::{ - ArrayDataBuilder, ArrayRef, Decimal128Array, FixedSizeBinaryArray, - IntervalDayTimeArray, 
IntervalYearMonthArray, +use arrow_array::{ + ArrayRef, Decimal128Array, FixedSizeBinaryArray, IntervalDayTimeArray, + IntervalYearMonthArray, }; -use arrow::buffer::Buffer; -use arrow::datatypes::{DataType as ArrowType, IntervalUnit}; +use arrow_buffer::Buffer; +use arrow_data::ArrayDataBuilder; +use arrow_schema::{DataType as ArrowType, IntervalUnit}; use std::any::Any; use std::ops::Range; use std::sync::Arc; @@ -427,10 +428,10 @@ mod tests { use super::*; use crate::arrow::arrow_reader::ParquetRecordBatchReader; use crate::arrow::ArrowWriter; - use arrow::array::{Array, Decimal128Array, ListArray}; + use arrow_array::{Array, Decimal128Array, ListArray}; use arrow::datatypes::Field; use arrow::error::Result as ArrowResult; - use arrow::record_batch::RecordBatch; + use arrow_array::RecordBatch; use bytes::Bytes; use std::sync::Arc; diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index f0b5092e1ad4..965142f3840b 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -18,13 +18,14 @@ use crate::arrow::array_reader::ArrayReader; use crate::errors::ParquetError; use crate::errors::Result; -use arrow::array::{ - new_empty_array, Array, ArrayData, ArrayRef, BooleanBufferBuilder, GenericListArray, - MutableArrayData, OffsetSizeTrait, +use arrow_array::{ + builder::BooleanBufferBuilder, new_empty_array, Array, ArrayRef, GenericListArray, + OffsetSizeTrait, }; -use arrow::buffer::Buffer; -use arrow::datatypes::DataType as ArrowType; -use arrow::datatypes::ToByteSlice; +use arrow_buffer::Buffer; +use arrow_buffer::ToByteSlice; +use arrow_data::{transform::MutableArrayData, ArrayData}; +use arrow_schema::DataType as ArrowType; use std::any::Any; use std::cmp::Ordering; use std::marker::PhantomData; @@ -257,8 +258,9 @@ mod tests { use crate::file::reader::{FileReader, SerializedFileReader}; use crate::schema::parser::parse_message_type; use crate::schema::types::SchemaDescriptor; - use arrow::array::{Array, ArrayDataBuilder, PrimitiveArray}; use arrow::datatypes::{Field, Int32Type as ArrowInt32, Int32Type}; + use arrow_array::{Array, PrimitiveArray}; + use arrow_data::ArrayDataBuilder; use std::sync::Arc; fn list_type( diff --git a/parquet/src/arrow/array_reader/map_array.rs b/parquet/src/arrow/array_reader/map_array.rs index bb80fdbdc5f7..cd1a76e86388 100644 --- a/parquet/src/arrow/array_reader/map_array.rs +++ b/parquet/src/arrow/array_reader/map_array.rs @@ -17,8 +17,8 @@ use crate::arrow::array_reader::{ArrayReader, ListArrayReader, StructArrayReader}; use crate::errors::Result; -use arrow::array::{Array, ArrayRef, MapArray}; -use arrow::datatypes::DataType as ArrowType; +use arrow_array::{Array, ArrayRef, MapArray}; +use arrow_schema::DataType as ArrowType; use std::any::Any; use std::sync::Arc; @@ -125,10 +125,10 @@ mod tests { use super::*; use crate::arrow::arrow_reader::ParquetRecordBatchReader; use crate::arrow::ArrowWriter; - use arrow::array; - use arrow::array::{MapBuilder, PrimitiveBuilder, StringBuilder}; use arrow::datatypes::{Field, Int32Type, Schema}; - use arrow::record_batch::RecordBatch; + use arrow_array::builder::{MapBuilder, PrimitiveBuilder, StringBuilder}; + use arrow_array::cast::*; + use arrow_array::RecordBatch; use bytes::Bytes; #[test] @@ -203,9 +203,9 @@ mod tests { let col = record_batch.column(0); assert!(col.is_null(0)); assert!(col.is_null(1)); - let map_entry = array::as_map_array(col).value(2); - let struct_col = array::as_struct_array(&map_entry); - 
let key_col = array::as_string_array(struct_col.column(0)); // Key column + let map_entry = as_map_array(col).value(2); + let struct_col = as_struct_array(&map_entry); + let key_col = as_string_array(struct_col.column(0)); // Key column assert_eq!(key_col.value(0), "three"); assert_eq!(key_col.value(1), "four"); assert_eq!(key_col.value(2), "five"); diff --git a/parquet/src/arrow/array_reader/mod.rs b/parquet/src/arrow/array_reader/mod.rs index 3740f0faea69..aede5e86c693 100644 --- a/parquet/src/arrow/array_reader/mod.rs +++ b/parquet/src/arrow/array_reader/mod.rs @@ -18,8 +18,8 @@ //! Logic for reading into arrow arrays use crate::errors::Result; -use arrow::array::ArrayRef; -use arrow::datatypes::DataType as ArrowType; +use arrow_array::ArrayRef; +use arrow_schema::DataType as ArrowType; use std::any::Any; use std::sync::Arc; diff --git a/parquet/src/arrow/array_reader/null_array.rs b/parquet/src/arrow/array_reader/null_array.rs index 405633f0a823..4ad6c97e2f66 100644 --- a/parquet/src/arrow/array_reader/null_array.rs +++ b/parquet/src/arrow/array_reader/null_array.rs @@ -22,9 +22,9 @@ use crate::column::page::PageIterator; use crate::data_type::DataType; use crate::errors::Result; use crate::schema::types::ColumnDescPtr; -use arrow::array::ArrayRef; -use arrow::buffer::Buffer; -use arrow::datatypes::DataType as ArrowType; +use arrow_array::ArrayRef; +use arrow_buffer::Buffer; +use arrow_schema::DataType as ArrowType; use std::any::Any; use std::sync::Arc; @@ -82,7 +82,7 @@ where fn consume_batch(&mut self) -> Result { // convert to arrays - let array = arrow::array::NullArray::new(self.record_reader.num_values()); + let array = arrow_array::NullArray::new(self.record_reader.num_values()); // save definition and repetition buffers self.def_levels_buffer = self.record_reader.consume_def_levels(); diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index 5fc5e639de92..012cad5c4c69 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -24,13 +24,14 @@ use crate::column::page::PageIterator; use crate::data_type::{DataType, Int96}; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; -use arrow::array::{ - ArrayDataBuilder, ArrayRef, BooleanArray, BooleanBufferBuilder, Decimal128Array, - Float32Array, Float64Array, Int32Array, Int64Array, TimestampNanosecondArray, - TimestampNanosecondBufferBuilder, UInt32Array, UInt64Array, +use arrow_array::{ + builder::{BooleanBufferBuilder, TimestampNanosecondBufferBuilder}, + ArrayRef, BooleanArray, Decimal128Array, Float32Array, Float64Array, Int32Array, + Int64Array, TimestampNanosecondArray, UInt32Array, UInt64Array, }; -use arrow::buffer::Buffer; -use arrow::datatypes::{DataType as ArrowType, TimeUnit}; +use arrow_buffer::Buffer; +use arrow_data::ArrayDataBuilder; +use arrow_schema::{DataType as ArrowType, TimeUnit}; use std::any::Any; use std::sync::Arc; @@ -205,8 +206,8 @@ where let array = match target_type { ArrowType::Date64 => { // this is cheap as it internally reinterprets the data - let a = arrow::compute::cast(&array, &ArrowType::Date32)?; - arrow::compute::cast(&a, target_type)? + let a = arrow_cast::cast(&array, &ArrowType::Date32)?; + arrow_cast::cast(&a, target_type)? 
} ArrowType::Decimal128(p, s) => { let array = match array.data_type() { @@ -236,7 +237,7 @@ where Arc::new(array) as ArrayRef } - _ => arrow::compute::cast(&array, target_type)?, + _ => arrow_cast::cast(&array, target_type)?, }; // save definition and repetition buffers @@ -270,8 +271,8 @@ mod tests { use crate::schema::types::SchemaDescriptor; use crate::util::test_common::rand_gen::make_pages; use crate::util::InMemoryPageIterator; - use arrow::array::{Array, PrimitiveArray}; use arrow::datatypes::ArrowPrimitiveType; + use arrow_array::{Array, PrimitiveArray}; use arrow::datatypes::DataType::Decimal128; use rand::distributions::uniform::SampleUniform; diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index f682f146c721..b470be5ad408 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -17,10 +17,9 @@ use crate::arrow::array_reader::ArrayReader; use crate::errors::{ParquetError, Result}; -use arrow::array::{ - ArrayData, ArrayDataBuilder, ArrayRef, BooleanBufferBuilder, StructArray, -}; -use arrow::datatypes::DataType as ArrowType; +use arrow_array::{builder::BooleanBufferBuilder, ArrayRef, StructArray}; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::DataType as ArrowType; use std::any::Any; use std::sync::Arc; @@ -216,9 +215,9 @@ mod tests { use super::*; use crate::arrow::array_reader::test_util::InMemoryArrayReader; use crate::arrow::array_reader::ListArrayReader; - use arrow::array::{Array, Int32Array, ListArray}; use arrow::buffer::Buffer; use arrow::datatypes::Field; + use arrow_array::{Array, Int32Array, ListArray}; #[test] fn test_struct_array_reader() { diff --git a/parquet/src/arrow/array_reader/test_util.rs b/parquet/src/arrow/array_reader/test_util.rs index ca1aabfd4aa1..6585d46146e2 100644 --- a/parquet/src/arrow/array_reader/test_util.rs +++ b/parquet/src/arrow/array_reader/test_util.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{Array, ArrayRef}; -use arrow::datatypes::DataType as ArrowType; +use arrow_array::{Array, ArrayRef}; +use arrow_schema::DataType as ArrowType; use std::any::Any; use std::sync::Arc; diff --git a/parquet/src/arrow/arrow_reader/filter.rs b/parquet/src/arrow/arrow_reader/filter.rs index 8945ccde4248..cbded9a6f420 100644 --- a/parquet/src/arrow/arrow_reader/filter.rs +++ b/parquet/src/arrow/arrow_reader/filter.rs @@ -16,9 +16,8 @@ // under the License. use crate::arrow::ProjectionMask; -use arrow::array::BooleanArray; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; +use arrow_array::{BooleanArray, RecordBatch}; +use arrow_schema::ArrowError; /// A predicate operating on [`RecordBatch`] pub trait ArrowPredicate: Send + 'static { @@ -32,7 +31,7 @@ pub trait ArrowPredicate: Send + 'static { /// /// Rows that are `true` in the returned [`BooleanArray`] will be returned by the /// parquet reader, whereas rows that are `false` or `Null` will not be - fn evaluate(&mut self, batch: RecordBatch) -> ArrowResult; + fn evaluate(&mut self, batch: RecordBatch) -> Result; } /// An [`ArrowPredicate`] created from an [`FnMut`] @@ -43,7 +42,7 @@ pub struct ArrowPredicateFn { impl ArrowPredicateFn where - F: FnMut(RecordBatch) -> ArrowResult + Send + 'static, + F: FnMut(RecordBatch) -> Result + Send + 'static, { /// Create a new [`ArrowPredicateFn`]. 
`f` will be passed batches
     /// that contains the columns specified in `projection`
@@ -56,13 +55,13 @@ where
 impl<F> ArrowPredicate for ArrowPredicateFn<F>
 where
-    F: FnMut(RecordBatch) -> ArrowResult<BooleanArray> + Send + 'static,
+    F: FnMut(RecordBatch) -> Result<BooleanArray, ArrowError> + Send + 'static,
 {
     fn projection(&self) -> &ProjectionMask {
         &self.projection
     }
-    fn evaluate(&mut self, batch: RecordBatch) -> ArrowResult<BooleanArray> {
+    fn evaluate(&mut self, batch: RecordBatch) -> Result<BooleanArray, ArrowError> {
         (self.f)(batch)
     }
 }
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 19c877dffc2c..35b70a0485cd 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -20,12 +20,10 @@
 use std::collections::VecDeque;
 use std::sync::Arc;
-use arrow::array::Array;
-use arrow::compute::prep_null_mask_filter;
-use arrow::datatypes::{DataType as ArrowType, Schema, SchemaRef};
-use arrow::error::Result as ArrowResult;
-use arrow::record_batch::{RecordBatch, RecordBatchReader};
-use arrow::{array::StructArray, error::ArrowError};
+use arrow_array::{Array, StructArray};
+use arrow_array::{RecordBatch, RecordBatchReader};
+use arrow_schema::{ArrowError, DataType as ArrowType, Schema, SchemaRef};
+use arrow_select::filter::prep_null_mask_filter;
 use crate::arrow::array_reader::{
     build_array_reader, ArrayReader, FileReaderRowGroupCollection, RowGroupCollection,
@@ -473,7 +471,7 @@ pub struct ParquetRecordBatchReader {
 }
 impl Iterator for ParquetRecordBatchReader {
-    type Item = ArrowResult<RecordBatch>;
+    type Item = Result<RecordBatch, ArrowError>;
     fn next(&mut self) -> Option<Self::Item> {
         let mut read_records = 0;
@@ -638,11 +636,12 @@ mod tests {
     use rand::{thread_rng, Rng, RngCore};
     use tempfile::tempfile;
-    use arrow::array::*;
-    use arrow::buffer::Buffer;
-    use arrow::datatypes::{DataType as ArrowDataType, Field, Schema};
-    use arrow::error::Result as ArrowResult;
-    use arrow::record_batch::{RecordBatch, RecordBatchReader};
+    use arrow_array::builder::*;
+    use arrow_array::*;
+    use arrow_array::{RecordBatch, RecordBatchReader};
+    use arrow_buffer::Buffer;
+    use arrow_data::ArrayDataBuilder;
+    use arrow_schema::{DataType as ArrowDataType, Field, Schema};
     use crate::arrow::arrow_reader::{
         ArrowPredicateFn, ArrowReaderOptions, ParquetRecordBatchReader,
@@ -714,7 +713,7 @@ mod tests {
         file.rewind().unwrap();
         let record_reader = ParquetRecordBatchReader::try_new(file, 2).unwrap();
-        let batches = record_reader.collect::<ArrowResult<Vec<_>>>().unwrap();
+        let batches = record_reader.collect::<Result<Vec<_>, _>>().unwrap();
         assert_eq!(batches.len(), 4);
         for batch in &batches[0..3] {
@@ -1067,7 +1066,7 @@ mod tests {
         let read = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 3)
             .unwrap()
-            .collect::<ArrowResult<Vec<_>>>()
+            .collect::<Result<Vec<_>, _>>()
             .unwrap();
         assert_eq!(&written.slice(0, 3), &read[0]);
@@ -1103,7 +1102,7 @@ mod tests {
         let read = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 3)
             .unwrap()
-            .collect::<ArrowResult<Vec<_>>>()
+            .collect::<Result<Vec<_>, _>>()
             .unwrap();
         assert_eq!(&written.slice(0, 3), &read[0]);
@@ -1143,7 +1142,7 @@ mod tests {
         let read = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 3)
             .unwrap()
-            .collect::<ArrowResult<Vec<_>>>()
+            .collect::<Result<Vec<_>, _>>()
             .unwrap();
         assert_eq!(&written.slice(0, 3), &read[0]);
@@ -1153,7 +1152,7 @@ mod tests {
     #[test]
     fn test_read_decimal_file() {
-        use arrow::array::Decimal128Array;
+        use arrow_array::Decimal128Array;
         let testdata = arrow::util::test_util::parquet_test_data();
         let file_variants = vec![
             ("byte_array", 4),
@@ -1936,7 +1935,7 @@ mod tests {
         let record_reader = ParquetRecordBatchReader::try_new(file, 3).unwrap();
         let batches = record_reader -
.collect::>>() + .collect::, _>>() .unwrap(); assert_eq!(batches.len(), 6); @@ -2271,7 +2270,7 @@ mod tests { let expected = get_expected_batches(&data, &selections, batch_size); let skip_reader = create_skip_reader(&test_file, batch_size, selections); assert_eq!( - skip_reader.collect::>>().unwrap(), + skip_reader.collect::, _>>().unwrap(), expected, "batch_size: {}, selection_len: {}, skip_first: {}", batch_size, @@ -2399,7 +2398,7 @@ mod tests { let batches = ParquetRecordBatchReader::try_new(file, 1024) .unwrap() - .collect::>>() + .collect::, _>>() .unwrap(); assert_eq!(batches.len(), 1); let batch = &batches[0]; @@ -2444,7 +2443,7 @@ mod tests { let batches = ParquetRecordBatchReader::try_new(file, expected_rows) .unwrap() - .collect::>>() + .collect::, _>>() .unwrap(); assert_eq!(batches.len(), 1); let batch = &batches[0]; @@ -2476,7 +2475,7 @@ mod tests { let batches = ParquetRecordBatchReader::try_new(file, expected_rows) .unwrap() - .collect::>>() + .collect::, _>>() .unwrap(); assert_eq!(batches.len(), 1); let batch = &batches[0]; diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 2328c4501598..357960906c81 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{Array, BooleanArray}; -use arrow::compute::SlicesIterator; +use arrow_array::{Array, BooleanArray}; +use arrow_select::filter::SlicesIterator; use std::cmp::Ordering; use std::collections::VecDeque; use std::ops::Range; diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index 7070cecacf2b..d52317852805 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -31,17 +31,17 @@ use crate::file::writer::OnCloseColumnChunk; use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::num_required_bits; use crate::util::interner::{Interner, Storage}; -use arrow::array::{ +use arrow_array::{ Array, ArrayAccessor, ArrayRef, BinaryArray, DictionaryArray, LargeBinaryArray, LargeStringArray, StringArray, }; -use arrow::datatypes::DataType; +use arrow_schema::DataType; macro_rules! downcast_dict_impl { ($array:ident, $key:ident, $val:ident, $op:expr $(, $arg:expr)*) => {{ $op($array .as_any() - .downcast_ref::>() + .downcast_ref::>() .unwrap() .downcast_dict::<$val>() .unwrap()$(, $arg)*) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 5736f05fdcfe..e2a8a8c50e9c 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -41,11 +41,11 @@ //! 
\[1\] [parquet-format#nested-encoding](https://github.com/apache/parquet-format#nested-encoding) use crate::errors::{ParquetError, Result}; -use arrow::array::{ - make_array, Array, ArrayData, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait, - StructArray, +use arrow_array::{ + make_array, Array, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait, StructArray, }; -use arrow::datatypes::{DataType, Field}; +use arrow_data::ArrayData; +use arrow_schema::{DataType, Field}; use std::ops::Range; /// Performs a depth-first scan of the children of `array`, constructing [`LevelInfo`] @@ -482,11 +482,13 @@ mod tests { use std::sync::Arc; - use arrow::array::*; - use arrow::buffer::Buffer; - use arrow::datatypes::{Int32Type, Schema, ToByteSlice}; - use arrow::record_batch::RecordBatch; - use arrow::util::pretty::pretty_format_columns; + use arrow_array::builder::*; + use arrow_array::types::Int32Type; + use arrow_array::*; + use arrow_buffer::{Buffer, ToByteSlice}; + use arrow_cast::display::array_value_to_string; + use arrow_data::ArrayDataBuilder; + use arrow_schema::Schema; #[test] fn test_calculate_array_levels_twitter_example() { @@ -1355,21 +1357,18 @@ mod tests { let list_field = Field::new("col", list_type, true); let expected = vec![ - r#"+-------------------------------------+"#, - r#"| col |"#, - r#"+-------------------------------------+"#, - r#"| |"#, - r#"| |"#, - r#"| [] |"#, - r#"| [{"list": [3, ], "integers": null}] |"#, - r#"| [, {"list": null, "integers": 5}] |"#, - r#"| [] |"#, - r#"+-------------------------------------+"#, - ] - .join("\n"); - - let pretty = pretty_format_columns(list_field.name(), &[list.clone()]).unwrap(); - assert_eq!(pretty.to_string(), expected); + r#""#.to_string(), + r#""#.to_string(), + r#"[]"#.to_string(), + r#"[{"list": [3, ], "integers": null}]"#.to_string(), + r#"[, {"list": null, "integers": 5}]"#.to_string(), + r#"[]"#.to_string(), + ]; + + let actual: Vec<_> = (0..6) + .map(|x| array_value_to_string(&list, x).unwrap()) + .collect(); + assert_eq!(actual, expected); let levels = calculate_array_levels(&list, &list_field).unwrap(); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 52f55a91baa9..ecb59e93e2f9 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -21,11 +21,8 @@ use std::collections::VecDeque; use std::io::Write; use std::sync::Arc; -use arrow::array as arrow_array; -use arrow::array::ArrayRef; -use arrow::datatypes::{DataType as ArrowDataType, IntervalUnit, SchemaRef}; -use arrow::record_batch::RecordBatch; -use arrow_array::Array; +use arrow_array::{Array, ArrayRef, RecordBatch}; +use arrow_schema::{DataType as ArrowDataType, IntervalUnit, SchemaRef}; use super::schema::{ add_encoded_arrow_schema_to_metadata, arrow_to_parquet_schema, @@ -54,8 +51,8 @@ mod levels; /// ``` /// # use std::sync::Arc; /// # use bytes::Bytes; -/// # use arrow::array::{ArrayRef, Int64Array}; -/// # use arrow::record_batch::RecordBatch; +/// # use arrow_array::{ArrayRef, Int64Array}; +/// # use arrow_array::RecordBatch; /// # use parquet::arrow::{ArrowReader, ArrowWriter, ParquetFileArrowReader}; /// let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef; /// let to_write = RecordBatch::try_from_iter([("col", col)]).unwrap(); @@ -376,8 +373,8 @@ fn write_leaf( match column.data_type() { ArrowDataType::Date64 => { // If the column is a Date64, we cast it to a Date32, and then interpret that as Int32 - let array = arrow::compute::cast(column, 
&ArrowDataType::Date32)?; - let array = arrow::compute::cast(&array, &ArrowDataType::Int32)?; + let array = arrow_cast::cast(column, &ArrowDataType::Date32)?; + let array = arrow_cast::cast(&array, &ArrowDataType::Int32)?; let array = array .as_any() @@ -394,7 +391,7 @@ fn write_leaf( write_primitive(typed, &array[offset..offset + data.len()], levels)? } _ => { - let array = arrow::compute::cast(column, &ArrowDataType::Int32)?; + let array = arrow_cast::cast(column, &ArrowDataType::Int32)?; let array = array .as_any() .downcast_ref::() @@ -432,7 +429,7 @@ fn write_leaf( write_primitive(typed, &array[offset..offset + data.len()], levels)? } _ => { - let array = arrow::compute::cast(column, &ArrowDataType::Int64)?; + let array = arrow_cast::cast(column, &ArrowDataType::Int64)?; let array = array .as_any() .downcast_ref::() @@ -618,9 +615,9 @@ mod tests { use arrow::datatypes::ToByteSlice; use arrow::datatypes::{DataType, Field, Schema, UInt32Type, UInt8Type}; use arrow::error::Result as ArrowResult; - use arrow::record_batch::RecordBatch; use arrow::util::pretty::pretty_format_batches; use arrow::{array::*, buffer::Buffer}; + use arrow_array::RecordBatch; use crate::basic::Encoding; use crate::file::metadata::ParquetMetaData; diff --git a/parquet/src/arrow/async_reader.rs b/parquet/src/arrow/async_reader.rs index b6b5d7ff7de6..d52fa0406bfa 100644 --- a/parquet/src/arrow/async_reader.rs +++ b/parquet/src/arrow/async_reader.rs @@ -22,7 +22,7 @@ //! # #[tokio::main(flavor="current_thread")] //! # async fn main() { //! # -//! use arrow::record_batch::RecordBatch; +//! use arrow_array::RecordBatch; //! use arrow::util::pretty::pretty_format_batches; //! use futures::TryStreamExt; //! use tokio::fs::File; @@ -93,8 +93,8 @@ use thrift::protocol::TCompactInputProtocol; use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; -use arrow::datatypes::SchemaRef; -use arrow::record_batch::RecordBatch; +use arrow_array::RecordBatch; +use arrow_schema::SchemaRef; use crate::arrow::array_reader::{build_array_reader, RowGroupCollection}; use crate::arrow::arrow_reader::{ @@ -797,8 +797,8 @@ mod tests { use crate::arrow::ArrowWriter; use crate::file::footer::parse_metadata; use crate::file::page_index::index_reader; - use arrow::array::{Array, ArrayRef, Int32Array, StringArray}; use arrow::error::Result as ArrowResult; + use arrow_array::{Array, ArrayRef, Int32Array, StringArray}; use futures::TryStreamExt; use rand::{thread_rng, Rng}; use std::sync::Mutex; diff --git a/parquet/src/arrow/buffer/bit_util.rs b/parquet/src/arrow/buffer/bit_util.rs index 04704237c458..34a0a4b83e8d 100644 --- a/parquet/src/arrow/buffer/bit_util.rs +++ b/parquet/src/arrow/buffer/bit_util.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use arrow::util::bit_chunk_iterator::UnalignedBitChunk; +use arrow_buffer::bit_chunk_iterator::UnalignedBitChunk; use std::ops::Range; /// Counts the number of set bits in the provided range @@ -65,7 +65,7 @@ pub fn sign_extend_be(b: &[u8]) -> [u8; N] { #[cfg(test)] mod tests { use super::*; - use arrow::array::BooleanBufferBuilder; + use arrow_array::builder::BooleanBufferBuilder; use rand::prelude::*; #[test] diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs index ae9e3590de3f..23ebea57b5b2 100644 --- a/parquet/src/arrow/buffer/dictionary_buffer.rs +++ b/parquet/src/arrow/buffer/dictionary_buffer.rs @@ -21,9 +21,10 @@ use crate::arrow::record_reader::buffer::{ }; use crate::column::reader::decoder::ValuesBufferSlice; use crate::errors::{ParquetError, Result}; -use arrow::array::{make_array, Array, ArrayDataBuilder, ArrayRef, OffsetSizeTrait}; -use arrow::buffer::Buffer; -use arrow::datatypes::{ArrowNativeType, DataType as ArrowType}; +use arrow_array::{make_array, Array, ArrayRef, OffsetSizeTrait}; +use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_data::ArrayDataBuilder; +use arrow_schema::DataType as ArrowType; use std::sync::Arc; /// An array of variable length byte arrays that are potentially dictionary encoded @@ -179,7 +180,7 @@ impl }; // This will compute a new dictionary - let array = arrow::compute::cast( + let array = arrow_cast::cast( &values.into_array(null_buffer, value_type), data_type, ) @@ -252,8 +253,8 @@ impl BufferQueue #[cfg(test)] mod tests { use super::*; - use arrow::array::{Array, StringArray}; use arrow::compute::cast; + use arrow_array::{Array, StringArray}; #[test] fn test_dictionary_buffer() { diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs index 48eb70137392..df96996e3cbc 100644 --- a/parquet/src/arrow/buffer/offset_buffer.rs +++ b/parquet/src/arrow/buffer/offset_buffer.rs @@ -21,9 +21,10 @@ use crate::arrow::record_reader::buffer::{ }; use crate::column::reader::decoder::ValuesBufferSlice; use crate::errors::{ParquetError, Result}; -use arrow::array::{make_array, ArrayDataBuilder, ArrayRef, OffsetSizeTrait}; -use arrow::buffer::Buffer; -use arrow::datatypes::{ArrowNativeType, DataType as ArrowType}; +use arrow_array::{make_array, ArrayRef, OffsetSizeTrait}; +use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_data::ArrayDataBuilder; +use arrow_schema::DataType as ArrowType; /// A buffer of variable-sized byte arrays that can be converted into /// a corresponding [`ArrayRef`] @@ -238,7 +239,7 @@ impl ValuesBufferSlice for OffsetBuffer { #[cfg(test)] mod tests { use super::*; - use arrow::array::{Array, LargeStringArray, StringArray}; + use arrow_array::{Array, LargeStringArray, StringArray}; #[test] fn test_offset_buffer_empty() { diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index c5fe0fa2a627..97d0c25e2b4f 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -16,8 +16,8 @@ // under the License. //! Provides API for reading/writing Arrow -//! [RecordBatch](arrow::record_batch::RecordBatch)es and -//! [Array](arrow::array::Array)s to/from Parquet Files. +//! [RecordBatch](arrow_array::RecordBatch)es and +//! [Array](arrow_array::Array)s to/from Parquet Files. //! //! [Apache Arrow](http://arrow.apache.org/) is a cross-language development platform for //! in-memory data. @@ -25,8 +25,8 @@ //!# Example of writing Arrow record batch to Parquet file //! //!```rust -//! 
use arrow::array::{Int32Array, ArrayRef}; -//! use arrow::record_batch::RecordBatch; +//! use arrow_array::{Int32Array, ArrayRef}; +//! use arrow_array::RecordBatch; //! use parquet::arrow::arrow_writer::ArrowWriter; //! use parquet::file::properties::WriterProperties; //! use std::fs::File; @@ -70,9 +70,9 @@ //! use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; //! //! # use std::sync::Arc; -//! # use arrow::array::Int32Array; +//! # use arrow_array::Int32Array; //! # use arrow::datatypes::{DataType, Field, Schema}; -//! # use arrow::record_batch::RecordBatch; +//! # use arrow_array::RecordBatch; //! # use parquet::arrow::arrow_writer::ArrowWriter; //! # //! # let ids = Int32Array::from(vec![1, 2, 3, 4]); diff --git a/parquet/src/arrow/record_reader/buffer.rs b/parquet/src/arrow/record_reader/buffer.rs index 64ea38f801d9..404989493883 100644 --- a/parquet/src/arrow/record_reader/buffer.rs +++ b/parquet/src/arrow/record_reader/buffer.rs @@ -19,8 +19,7 @@ use std::marker::PhantomData; use crate::arrow::buffer::bit_util::iter_set_bits_rev; use crate::data_type::Int96; -use arrow::buffer::{Buffer, MutableBuffer}; -use arrow::datatypes::ArrowNativeType; +use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; /// A buffer that supports writing new data to the end, and removing data from the front /// diff --git a/parquet/src/arrow/record_reader/definition_levels.rs b/parquet/src/arrow/record_reader/definition_levels.rs index 2d65db77fa69..84b7ab94cebb 100644 --- a/parquet/src/arrow/record_reader/definition_levels.rs +++ b/parquet/src/arrow/record_reader/definition_levels.rs @@ -17,10 +17,10 @@ use std::ops::Range; -use arrow::array::BooleanBufferBuilder; -use arrow::bitmap::Bitmap; -use arrow::buffer::Buffer; -use arrow::util::bit_chunk_iterator::UnalignedBitChunk; +use arrow_array::builder::BooleanBufferBuilder; +use arrow_buffer::bit_chunk_iterator::UnalignedBitChunk; +use arrow_buffer::Buffer; +use arrow_data::Bitmap; use crate::arrow::buffer::bit_util::count_set_bits; use crate::arrow::record_reader::buffer::BufferQueue; diff --git a/parquet/src/arrow/record_reader/mod.rs b/parquet/src/arrow/record_reader/mod.rs index b7318af9e85a..ef17b8d0e6f4 100644 --- a/parquet/src/arrow/record_reader/mod.rs +++ b/parquet/src/arrow/record_reader/mod.rs @@ -17,8 +17,8 @@ use std::cmp::{max, min}; -use arrow::bitmap::Bitmap; -use arrow::buffer::Buffer; +use arrow_buffer::Buffer; +use arrow_data::Bitmap; use crate::arrow::record_reader::{ buffer::{BufferQueue, ScalarBuffer, ValuesBuffer}, @@ -409,9 +409,9 @@ fn packed_null_mask(descr: &ColumnDescPtr) -> bool { mod tests { use std::sync::Arc; - use arrow::array::{Int16BufferBuilder, Int32BufferBuilder}; use arrow::bitmap::Bitmap; use arrow::buffer::Buffer; + use arrow_array::builder::{Int16BufferBuilder, Int32BufferBuilder}; use crate::basic::Encoding; use crate::data_type::Int32Type; diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs index 7803385e7f01..395c4aac1500 100644 --- a/parquet/src/arrow/schema.rs +++ b/parquet/src/arrow/schema.rs @@ -26,8 +26,8 @@ use std::collections::HashMap; use std::sync::Arc; -use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; -use arrow::ipc::writer; +use arrow_schema::{DataType, Field, Schema, TimeUnit}; +use arrow_ipc::writer; use crate::basic::{ ConvertedType, LogicalType, Repetition, TimeUnit as ParquetTimeUnit, @@ -108,10 +108,10 @@ fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Result { } else { bytes.as_slice() }; - match arrow::ipc::root_as_message(slice) { 
+ match arrow_ipc::root_as_message(slice) { Ok(message) => message .header_as_schema() - .map(arrow::ipc::convert::fb_to_schema) + .map(arrow_ipc::convert::fb_to_schema) .ok_or_else(|| arrow_err!("the message is not Arrow Schema")), Err(err) => { // The flatbuffers implementation returns an error on verification error. @@ -137,7 +137,7 @@ fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Result { /// Encodes the Arrow schema into the IPC format, and base64 encodes it fn encode_arrow_schema(schema: &Schema) -> String { let options = writer::IpcWriteOptions::default(); - let data_gen = arrow::ipc::writer::IpcDataGenerator::default(); + let data_gen = writer::IpcDataGenerator::default(); let mut serialized_schema = data_gen.schema_to_bytes(schema, &options); // manually prepending the length to the schema as arrow uses the legacy IPC format diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index d63ab5606b03..2334a5601b4c 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -21,7 +21,7 @@ use crate::basic::{ConvertedType, Repetition}; use crate::errors::ParquetError; use crate::errors::Result; use crate::schema::types::{SchemaDescriptor, Type, TypePtr}; -use arrow::datatypes::{DataType, Field, Schema}; +use arrow_schema::{DataType, Field, Schema}; fn get_repetition(t: &Type) -> Repetition { let info = t.get_basic_info(); diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index 87edd75b0b8d..e5bab9ac96c2 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -20,7 +20,7 @@ use crate::basic::{ }; use crate::errors::{ParquetError, Result}; use crate::schema::types::{BasicTypeInfo, Type}; -use arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; +use arrow_schema::{DataType, IntervalUnit, TimeUnit}; /// Converts [`Type`] to [`DataType`] with an optional `arrow_type_hint` /// provided by the arrow schema diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index 8c62241e34f5..5fdece7cc8a3 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -71,7 +71,8 @@ use std::{ sync::Arc, }; -use arrow::{csv::ReaderBuilder, datatypes::Schema, error::ArrowError}; +use arrow_csv::ReaderBuilder; +use arrow_schema::{ArrowError, Schema}; use clap::{Parser, ValueEnum}; use parquet::{ arrow::{parquet_to_arrow_schema, ArrowWriter}, diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 09254999bdd3..f63b1e60a03e 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -515,6 +515,7 @@ where /// If the current page is fully decoded, this will NOT load the next page /// into the buffer #[inline] + #[cfg(feature = "arrow")] pub(crate) fn peek_next(&mut self) -> Result { if self.num_buffered_values == 0 || self.num_buffered_values == self.num_decoded_values diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index 9227c4ba1ce8..22cc71f6cd5e 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -34,10 +34,10 @@ pub trait ColumnValues { fn len(&self) -> usize; } -#[cfg(any(feature = "arrow", test))] -impl ColumnValues for T { +#[cfg(feature = "arrow")] +impl ColumnValues for T { fn len(&self) -> usize { - arrow::array::Array::len(self) + arrow_array::Array::len(self) } } diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index 
bba14f94e2eb..4ee321609e04 100644
--- a/parquet/src/compression.rs
+++ b/parquet/src/compression.rs
@@ -126,7 +126,7 @@ impl CodecOptionsBuilder {
 /// This returns `None` if the codec type is `UNCOMPRESSED`.
 pub fn create_codec(
     codec: CodecType,
-    options: &CodecOptions,
+    _options: &CodecOptions,
 ) -> Result<Option<Box<dyn Codec>>> {
     match codec {
         #[cfg(any(feature = "brotli", test))]
@@ -137,7 +137,7 @@ pub fn create_codec(
         CodecType::SNAPPY => Ok(Some(Box::new(SnappyCodec::new()))),
         #[cfg(any(feature = "lz4", test))]
         CodecType::LZ4 => Ok(Some(Box::new(LZ4HadoopCodec::new(
-            options.backward_compatible_lz4,
+            _options.backward_compatible_lz4,
         )))),
         #[cfg(any(feature = "zstd", test))]
         CodecType::ZSTD => Ok(Some(Box::new(ZSTDCodec::new()))),
diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs
index c4f5faaaacae..cbbd2405353f 100644
--- a/parquet/src/errors.rs
+++ b/parquet/src/errors.rs
@@ -19,8 +19,8 @@
 use std::{cell, io, result, str};
-#[cfg(any(feature = "arrow", test))]
-use arrow::error::ArrowError;
+#[cfg(feature = "arrow")]
+use arrow_schema::ArrowError;
 #[derive(Debug, PartialEq, Clone, Eq)]
 pub enum ParquetError {
@@ -34,7 +34,7 @@ pub enum ParquetError {
     /// Returned when IO related failures occur, e.g. when there are not enough bytes to
     /// decode.
     EOF(String),
-    #[cfg(any(feature = "arrow", test))]
+    #[cfg(feature = "arrow")]
     /// Arrow error.
     /// Returned when reading into arrow or writing from arrow.
     ArrowError(String),
@@ -49,7 +49,7 @@ impl std::fmt::Display for ParquetError {
             }
             ParquetError::NYI(ref message) => write!(fmt, "NYI: {}", message),
             ParquetError::EOF(ref message) => write!(fmt, "EOF: {}", message),
-            #[cfg(any(feature = "arrow", test))]
+            #[cfg(feature = "arrow")]
             ParquetError::ArrowError(ref message) => write!(fmt, "Arrow: {}", message),
             ParquetError::IndexOutOfBound(ref index, ref bound) => {
                 write!(fmt, "Index {} out of bound: {}", index, bound)
@@ -95,7 +95,7 @@ impl From for ParquetError {
     }
 }
-#[cfg(any(feature = "arrow", test))]
+#[cfg(feature = "arrow")]
 impl From<ArrowError> for ParquetError {
     fn from(e: ArrowError) -> ParquetError {
         ParquetError::ArrowError(format!("underlying Arrow error: {}", e))
@@ -103,7 +103,7 @@ impl From for ParquetError {
 }
 /// A specialized `Result` for Parquet errors.
-pub type Result<T> = result::Result<T, ParquetError>;
+pub type Result<T, E = ParquetError> = result::Result<T, E>;
 // ----------------------------------------------------------------------
 // Conversion from `ParquetError` to other types of `Error`s
@@ -135,7 +135,7 @@ macro_rules! eof_err {
     ($fmt:expr, $($args:expr),*) => (ParquetError::EOF(format!($fmt, $($args),*)));
 }
-#[cfg(any(feature = "arrow", test))]
+#[cfg(feature = "arrow")]
 macro_rules! arrow_err {
     ($fmt:expr) => (ParquetError::ArrowError($fmt.to_owned()));
     ($fmt:expr, $($args:expr),*) => (ParquetError::ArrowError(format!($fmt, $($args),*)));
@@ -147,7 +147,7 @@ macro_rules!
arrow_err { // ---------------------------------------------------------------------- // Convert parquet error into other errors -#[cfg(any(feature = "arrow", test))] +#[cfg(feature = "arrow")] impl From for ArrowError { fn from(p: ParquetError) -> Self { Self::ParquetError(format!("{}", p)) diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 2b3c7d139148..a400d4dabcb1 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -292,6 +292,7 @@ impl SerializedFileReader { } } + #[cfg(feature = "arrow")] pub(crate) fn metadata_ref(&self) -> &Arc { &self.metadata } diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index b34d9aa8ae83..07cddfc3f448 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -79,7 +79,7 @@ pub use self::encodings::{decoding, encoding}; pub use self::util::memory; experimental!(#[macro_use] mod util); -#[cfg(any(feature = "arrow", test))] +#[cfg(feature = "arrow")] pub mod arrow; pub mod column; experimental!(mod compression); diff --git a/parquet/src/util/interner.rs b/parquet/src/util/interner.rs index e638237e06c5..a59ab8e7a31c 100644 --- a/parquet/src/util/interner.rs +++ b/parquet/src/util/interner.rs @@ -88,6 +88,7 @@ impl Interner { } /// Unwraps the inner storage + #[cfg(feature = "arrow")] pub fn into_inner(self) -> S { self.storage } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index cf068d2f4e1c..a0b2b6ea1447 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -23,7 +23,7 @@ description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] -keywords = [ "parquet" ] +keywords = ["parquet"] readme = "README.md" edition = "2021" rust-version = "1.62" @@ -34,5 +34,5 @@ proc-macro = true [dependencies] proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } -syn = { version = "1.0", default-features = false } -parquet = { path = "../parquet", version = "26.0.0" } +syn = { version = "1.0", features = ["extra-traits"] } +parquet = { path = "../parquet", version = "26.0.0", default-features = false } From 9f14683313bd87e72344cdeb6b35201943fdbcb4 Mon Sep 17 00:00:00 2001 From: Yang Jiang Date: Fri, 11 Nov 2022 05:34:29 +0800 Subject: [PATCH 0256/1411] Move `intersect_row_selections` from datafusion to arrow-rs. (#3047) * Add `RowSelection::intersect_row_selections` from datafusion. * fix pub issue --- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/arrow/arrow_reader/selection.rs | 128 ++++++++++++++++++++ 2 files changed, 129 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 35b70a0485cd..1f841a0ee175 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -41,7 +41,7 @@ mod filter; mod selection; pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter}; -pub use selection::{RowSelection, RowSelector}; +pub use selection::{intersect_row_selections, RowSelection, RowSelector}; /// A generic builder for constructing sync or async arrow parquet readers. 
This is not intended /// to be used directly, instead you should use the specialization for the type of reader diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 357960906c81..e01c584b6e63 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -349,6 +349,66 @@ impl From for VecDeque { } } +// Combine two lists of `RowSelection` return the intersection of them +// For example: +// self: NNYYYYNNYYNYN +// other: NYNNNNNNY +// +// returned: NNNNNNNNYYNYN +pub fn intersect_row_selections( + left: Vec, + right: Vec, +) -> Vec { + let mut res = Vec::with_capacity(left.len()); + let mut l_iter = left.into_iter().peekable(); + let mut r_iter = right.into_iter().peekable(); + + while let (Some(a), Some(b)) = (l_iter.peek_mut(), r_iter.peek_mut()) { + if a.row_count == 0 { + l_iter.next().unwrap(); + continue; + } + if b.row_count == 0 { + r_iter.next().unwrap(); + continue; + } + match (a.skip, b.skip) { + // Keep both ranges + (false, false) => { + if a.row_count < b.row_count { + res.push(RowSelector::select(a.row_count)); + b.row_count -= a.row_count; + l_iter.next().unwrap(); + } else { + res.push(RowSelector::select(b.row_count)); + a.row_count -= b.row_count; + r_iter.next().unwrap(); + } + } + // skip at least one + _ => { + if a.row_count < b.row_count { + res.push(RowSelector::skip(a.row_count)); + b.row_count -= a.row_count; + l_iter.next().unwrap(); + } else { + res.push(RowSelector::skip(b.row_count)); + a.row_count -= b.row_count; + r_iter.next().unwrap(); + } + } + } + } + + if l_iter.peek().is_some() { + res.extend(l_iter); + } + if r_iter.peek().is_some() { + res.extend(r_iter); + } + res +} + fn add_selector(skip: bool, sum_row: usize, combined_result: &mut Vec) { let selector = if skip { RowSelector::skip(sum_row) @@ -618,6 +678,74 @@ mod tests { a.and_then(&b); } + #[test] + fn test_intersect_row_selection_and_combine() { + // a size equal b size + let a = vec![ + RowSelector::select(5), + RowSelector::skip(4), + RowSelector::select(1), + ]; + let b = vec![ + RowSelector::select(8), + RowSelector::skip(1), + RowSelector::select(1), + ]; + + let res = intersect_row_selections(a, b); + assert_eq!( + RowSelection::from_selectors_and_combine(&res).selectors, + vec![ + RowSelector::select(5), + RowSelector::skip(4), + RowSelector::select(1), + ], + ); + + // a size larger than b size + let a = vec![ + RowSelector::select(3), + RowSelector::skip(33), + RowSelector::select(3), + RowSelector::skip(33), + ]; + let b = vec![RowSelector::select(36), RowSelector::skip(36)]; + let res = intersect_row_selections(a, b); + assert_eq!( + RowSelection::from_selectors_and_combine(&res).selectors, + vec![RowSelector::select(3), RowSelector::skip(69)] + ); + + // a size less than b size + let a = vec![RowSelector::select(3), RowSelector::skip(7)]; + let b = vec![ + RowSelector::select(2), + RowSelector::skip(2), + RowSelector::select(2), + RowSelector::skip(2), + RowSelector::select(2), + ]; + let res = intersect_row_selections(a, b); + assert_eq!( + RowSelection::from_selectors_and_combine(&res).selectors, + vec![RowSelector::select(2), RowSelector::skip(8)] + ); + + let a = vec![RowSelector::select(3), RowSelector::skip(7)]; + let b = vec![ + RowSelector::select(2), + RowSelector::skip(2), + RowSelector::select(2), + RowSelector::skip(2), + RowSelector::select(2), + ]; + let res = intersect_row_selections(a, b); + assert_eq!( + RowSelection::from_selectors_and_combine(&res).selectors, + 
vec![RowSelector::select(2), RowSelector::skip(8)] + ); + } + #[test] fn test_and_fuzz() { let mut rand = thread_rng(); From 885a3618fc8739aaa7c1f9ff328a09a39d4c3af6 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Thu, 10 Nov 2022 21:46:02 +0000 Subject: [PATCH 0257/1411] feat: add `OwnedRow` (#3079) Closes #3078. --- arrow/src/row/mod.rs | 70 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 4fbaa3931b08..4dd2a33c0bdc 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -451,6 +451,16 @@ pub struct Row<'a> { fields: &'a Arc<[SortField]>, } +impl<'a> Row<'a> { + /// Create owned version of the row to detach it from the shared [`Rows`]. + pub fn owned(&self) -> OwnedRow { + OwnedRow { + data: self.data.to_vec(), + fields: Arc::clone(self.fields), + } + } +} + // Manually derive these as don't wish to include `fields` impl<'a> PartialEq for Row<'a> { @@ -490,6 +500,66 @@ impl<'a> AsRef<[u8]> for Row<'a> { } } +/// Owned version of a [`Row`] that can be moved/cloned freely. +/// +/// This contains the data for the one specific row (not the entire buffer of all rows). +#[derive(Debug, Clone)] +pub struct OwnedRow { + data: Vec, + fields: Arc<[SortField]>, +} + +impl OwnedRow { + /// Get borrowed [`Row`] from owned version. + /// + /// This is helpful if you want to compare an [`OwnedRow`] with a [`Row`]. + pub fn row(&self) -> Row<'_> { + Row { + data: &self.data, + fields: &self.fields, + } + } +} + +// Manually derive these as don't wish to include `fields`. Also we just want to use the same `Row` implementations here. + +impl PartialEq for OwnedRow { + #[inline] + fn eq(&self, other: &Self) -> bool { + self.row().eq(&other.row()) + } +} + +impl Eq for OwnedRow {} + +impl PartialOrd for OwnedRow { + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + self.row().partial_cmp(&other.row()) + } +} + +impl Ord for OwnedRow { + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + self.row().cmp(&other.row()) + } +} + +impl Hash for OwnedRow { + #[inline] + fn hash(&self, state: &mut H) { + self.row().hash(state) + } +} + +impl AsRef<[u8]> for OwnedRow { + #[inline] + fn as_ref(&self) -> &[u8] { + &self.data + } +} + /// Returns the null sentinel, negated if `invert` is true #[inline] fn null_sentinel(options: SortOptions) -> u8 { From 522625814cbb5f22e0b4f60227a397aff71098b5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 11 Nov 2022 13:23:34 +1300 Subject: [PATCH 0258/1411] Make RowSelection::intersection a member function (#3084) --- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/arrow/arrow_reader/selection.rs | 42 +++++++++++++-------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 1f841a0ee175..35b70a0485cd 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -41,7 +41,7 @@ mod filter; mod selection; pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter}; -pub use selection::{intersect_row_selections, RowSelection, RowSelector}; +pub use selection::{RowSelection, RowSelector}; /// A generic builder for constructing sync or async arrow parquet readers. 
This is not intended /// to be used directly, instead you should use the specialization for the type of reader diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index e01c584b6e63..d5c4ce5ea450 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -323,6 +323,18 @@ impl RowSelection { Self { selectors } } + /// Compute the intersection of two [`RowSelection`] + /// For example: + /// self: NNYYYYNNYYNYN + /// other: NYNNNNNNY + /// + /// returned: NNNNNNNNYYNYN + pub fn intersection(&self, other: &Self) -> Self { + Self { + selectors: intersect_row_selections(&self.selectors, &other.selectors), + } + } + /// Returns `true` if this [`RowSelection`] selects any rows pub fn selects_any(&self) -> bool { self.selectors.iter().any(|x| !x.skip) @@ -349,19 +361,19 @@ impl From for VecDeque { } } -// Combine two lists of `RowSelection` return the intersection of them -// For example: -// self: NNYYYYNNYYNYN -// other: NYNNNNNNY -// -// returned: NNNNNNNNYYNYN -pub fn intersect_row_selections( - left: Vec, - right: Vec, +/// Combine two lists of `RowSelection` return the intersection of them +/// For example: +/// self: NNYYYYNNYYNYN +/// other: NYNNNNNNY +/// +/// returned: NNNNNNNNYYNYN +fn intersect_row_selections( + left: &[RowSelector], + right: &[RowSelector], ) -> Vec { let mut res = Vec::with_capacity(left.len()); - let mut l_iter = left.into_iter().peekable(); - let mut r_iter = right.into_iter().peekable(); + let mut l_iter = left.iter().copied().peekable(); + let mut r_iter = right.iter().copied().peekable(); while let (Some(a), Some(b)) = (l_iter.peek_mut(), r_iter.peek_mut()) { if a.row_count == 0 { @@ -692,7 +704,7 @@ mod tests { RowSelector::select(1), ]; - let res = intersect_row_selections(a, b); + let res = intersect_row_selections(&a, &b); assert_eq!( RowSelection::from_selectors_and_combine(&res).selectors, vec![ @@ -710,7 +722,7 @@ mod tests { RowSelector::skip(33), ]; let b = vec![RowSelector::select(36), RowSelector::skip(36)]; - let res = intersect_row_selections(a, b); + let res = intersect_row_selections(&a, &b); assert_eq!( RowSelection::from_selectors_and_combine(&res).selectors, vec![RowSelector::select(3), RowSelector::skip(69)] @@ -725,7 +737,7 @@ mod tests { RowSelector::skip(2), RowSelector::select(2), ]; - let res = intersect_row_selections(a, b); + let res = intersect_row_selections(&a, &b); assert_eq!( RowSelection::from_selectors_and_combine(&res).selectors, vec![RowSelector::select(2), RowSelector::skip(8)] @@ -739,7 +751,7 @@ mod tests { RowSelector::skip(2), RowSelector::select(2), ]; - let res = intersect_row_selections(a, b); + let res = intersect_row_selections(&a, &b); assert_eq!( RowSelection::from_selectors_and_combine(&res).selectors, vec![RowSelector::select(2), RowSelector::skip(8)] From 01396822eb68a90565cf8b177aab4b0ce8af40e1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 11 Nov 2022 14:42:47 +1300 Subject: [PATCH 0259/1411] Remove unused range module (#3085) --- parquet/src/file/page_index/mod.rs | 3 - parquet/src/file/page_index/range.rs | 475 --------------------------- 2 files changed, 478 deletions(-) delete mode 100644 parquet/src/file/page_index/range.rs diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs index bb7808f16487..dcc1120fc4e3 100644 --- a/parquet/src/file/page_index/mod.rs +++ b/parquet/src/file/page_index/mod.rs @@ -17,6 +17,3 @@ 
pub mod index; pub mod index_reader; - -#[cfg(test)] -pub(crate) mod range; diff --git a/parquet/src/file/page_index/range.rs b/parquet/src/file/page_index/range.rs deleted file mode 100644 index 816ea4025f20..000000000000 --- a/parquet/src/file/page_index/range.rs +++ /dev/null @@ -1,475 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -use crate::errors::ParquetError; -use crate::format::PageLocation; -use std::cmp::Ordering; -use std::collections::VecDeque; -use std::ops::RangeInclusive; - -type Range = RangeInclusive; - -pub trait RangeOps { - fn is_before(&self, other: &Self) -> bool; - - fn is_after(&self, other: &Self) -> bool; - - fn count(&self) -> usize; - - fn union(left: &Range, right: &Range) -> Option; - - fn intersection(left: &Range, right: &Range) -> Option; -} - -impl RangeOps for Range { - fn is_before(&self, other: &Range) -> bool { - self.end() < other.start() - } - - fn is_after(&self, other: &Range) -> bool { - self.start() > other.end() - } - - fn count(&self) -> usize { - self.end() + 1 - self.start() - } - - /// Return the union of the two ranges, - /// Return `None` if there are hole between them. - fn union(left: &Range, right: &Range) -> Option { - if left.start() <= right.start() { - if left.end() + 1 >= *right.start() { - return Some(Range::new( - *left.start(), - std::cmp::max(*left.end(), *right.end()), - )); - } - } else if right.end() + 1 >= *left.start() { - return Some(Range::new( - *right.start(), - std::cmp::max(*left.end(), *right.end()), - )); - } - None - } - - /// Returns the intersection of the two ranges, - /// return null if they are not overlapped. - fn intersection(left: &Range, right: &Range) -> Option { - if left.start() <= right.start() { - if left.end() >= right.start() { - return Some(Range::new( - *right.start(), - std::cmp::min(*left.end(), *right.end()), - )); - } - } else if right.end() >= left.start() { - return Some(Range::new( - *left.start(), - std::cmp::min(*left.end(), *right.end()), - )); - } - None - } -} - -/// Struct representing row ranges in a row-group. These row ranges are calculated as a result of using -/// the column index on the filtering. 
-#[derive(Debug, Clone)] -pub struct RowRanges { - pub ranges: VecDeque, -} - -impl RowRanges { - //create an empty RowRanges - pub fn new_empty() -> Self { - RowRanges { - ranges: VecDeque::new(), - } - } - - pub fn count(&self) -> usize { - self.ranges.len() - } - - pub fn filter_with_mask(&self, mask: &[bool]) -> Result { - if self.ranges.len() != mask.len() { - return Err(ParquetError::General(format!( - "Mask size {} is not equal to number of pages {}", - mask.len(), - self.count() - ))); - } - let vec_range = mask - .iter() - .zip(self.ranges.clone()) - .filter_map(|(&f, r)| if f { Some(r) } else { None }) - .collect(); - Ok(RowRanges { ranges: vec_range }) - } - - /// Add a range to the end of the list of ranges. It maintains the disjunctive ascending order of the ranges by - /// trying to union the specified range to the last ranges in the list. The specified range shall be larger than - /// the last one or might be overlapped with some of the last ones. - /// [a, b] < [c, d] if b < c - pub fn add(&mut self, mut range: Range) { - let count = self.count(); - if count > 0 { - for i in 1..(count + 1) { - let index = count - i; - let last = self.ranges.get(index).unwrap(); - assert!(!last.is_after(&range), "Must add range in ascending!"); - // try to merge range - match Range::union(last, &range) { - None => { - break; - } - Some(r) => { - range = r; - self.ranges.remove(index); - } - } - } - } - self.ranges.push_back(range); - } - - /// Calculates the union of the two specified RowRanges object. The union of two range is calculated if there are no - /// elements between them. Otherwise, the two disjunctive ranges are stored separately. - /// For example: - /// [113, 241] ∪ [221, 340] = [113, 330] - /// [113, 230] ∪ [231, 340] = [113, 340] - /// while - /// [113, 230] ∪ [232, 340] = [113, 230], [232, 340] - /// - /// The result RowRanges object will contain all the row indexes that were contained in one of the specified objects. - pub fn union(mut left: RowRanges, mut right: RowRanges) -> RowRanges { - let v1 = &mut left.ranges; - let v2 = &mut right.ranges; - let mut result = RowRanges::new_empty(); - if v2.is_empty() { - left.clone() - } else { - let mut range2 = v2.pop_front().unwrap(); - while !v1.is_empty() { - let range1 = v1.pop_front().unwrap(); - if range1.is_after(&range2) { - result.add(range2); - range2 = range1; - std::mem::swap(v1, v2); - } else { - result.add(range1); - } - } - - result.add(range2); - while !v2.is_empty() { - result.add(v2.pop_front().unwrap()) - } - - result - } - } - - /// Calculates the intersection of the two specified RowRanges object. Two ranges intersect if they have common - /// elements otherwise the result is empty. 
- /// For example: - /// [113, 241] ∩ [221, 340] = [221, 241] - /// while - /// [113, 230] ∩ [231, 340] = - /// - /// The result RowRanges object will contain all the row indexes there were contained in both of the specified objects - #[allow(clippy::mut_range_bound)] - pub fn intersection(left: RowRanges, right: RowRanges) -> RowRanges { - let mut result = RowRanges::new_empty(); - let mut right_index = 0; - for l in left.ranges.iter() { - for i in right_index..right.ranges.len() { - let r = right.ranges.get(i).unwrap(); - if l.is_before(r) { - break; - } else if l.is_after(r) { - right_index = i + 1; - continue; - } - if let Some(ra) = Range::intersection(l, r) { - result.add(ra); - } - } - } - result - } - - #[allow(unused)] - pub fn row_count(&self) -> usize { - self.ranges.iter().map(|x| x.count()).sum() - } - - pub fn is_overlapping(&self, x: &Range) -> bool { - self.ranges - .binary_search_by(|y| -> Ordering { - if y.is_before(x) { - Ordering::Less - } else if y.is_after(x) { - Ordering::Greater - } else { - Ordering::Equal - } - }) - .is_ok() - } -} - -/// Takes an array of [`PageLocation`], and a total number of rows, and based on the provided `page_mask` -/// returns the corresponding [`RowRanges`] to scan -pub fn compute_row_ranges( - page_mask: &[bool], - locations: &[PageLocation], - total_rows: usize, -) -> Result { - if page_mask.len() != locations.len() { - return Err(ParquetError::General(format!( - "Page_mask size {} is not equal to number of locations {}", - page_mask.len(), - locations.len(), - ))); - } - let row_ranges = page_locations_to_row_ranges(locations, total_rows)?; - row_ranges.filter_with_mask(page_mask) -} - -fn page_locations_to_row_ranges( - locations: &[PageLocation], - total_rows: usize, -) -> Result { - if locations.is_empty() || total_rows == 0 { - return Ok(RowRanges::new_empty()); - } - - // If we read directly from parquet pageIndex to construct locations, - // the location index should be continuous - let mut vec_range: VecDeque = locations - .windows(2) - .map(|x| { - let start = x[0].first_row_index as usize; - let end = (x[1].first_row_index - 1) as usize; - Range::new(start, end) - }) - .collect(); - - let last = Range::new( - locations.last().unwrap().first_row_index as usize, - total_rows - 1, - ); - vec_range.push_back(last); - - Ok(RowRanges { ranges: vec_range }) -} - -#[cfg(test)] -mod tests { - use crate::basic::Type::INT32; - use crate::file::page_index::index::{NativeIndex, PageIndex}; - use crate::file::page_index::range::{compute_row_ranges, Range, RowRanges}; - use crate::format::{BoundaryOrder, PageLocation}; - - #[test] - fn test_binary_search_overlap() { - let mut ranges = RowRanges::new_empty(); - ranges.add(Range::new(1, 3)); - ranges.add(Range::new(6, 7)); - - assert!(ranges.is_overlapping(&Range::new(1, 2))); - // include both [start, end] - assert!(ranges.is_overlapping(&Range::new(0, 1))); - assert!(ranges.is_overlapping(&Range::new(0, 3))); - - assert!(ranges.is_overlapping(&Range::new(0, 7))); - assert!(ranges.is_overlapping(&Range::new(2, 7))); - - assert!(!ranges.is_overlapping(&Range::new(4, 5))); - } - - #[test] - fn test_add_func_ascending_disjunctive() { - let mut ranges_1 = RowRanges::new_empty(); - ranges_1.add(Range::new(1, 3)); - ranges_1.add(Range::new(5, 6)); - ranges_1.add(Range::new(8, 9)); - assert_eq!(ranges_1.count(), 3); - } - - #[test] - fn test_add_func_ascending_merge() { - let mut ranges_1 = RowRanges::new_empty(); - ranges_1.add(Range::new(1, 3)); - ranges_1.add(Range::new(4, 5)); - 
ranges_1.add(Range::new(6, 7)); - assert_eq!(ranges_1.count(), 1); - } - - #[test] - #[should_panic(expected = "Must add range in ascending!")] - fn test_add_func_not_ascending() { - let mut ranges_1 = RowRanges::new_empty(); - ranges_1.add(Range::new(6, 7)); - ranges_1.add(Range::new(1, 3)); - ranges_1.add(Range::new(4, 5)); - assert_eq!(ranges_1.count(), 1); - } - - #[test] - fn test_union_func() { - let mut ranges_1 = RowRanges::new_empty(); - ranges_1.add(Range::new(1, 2)); - ranges_1.add(Range::new(3, 4)); - ranges_1.add(Range::new(5, 6)); - - let mut ranges_2 = RowRanges::new_empty(); - ranges_2.add(Range::new(2, 3)); - ranges_2.add(Range::new(4, 5)); - ranges_2.add(Range::new(6, 7)); - - let ranges = RowRanges::union(ranges_1, ranges_2); - assert_eq!(ranges.count(), 1); - let range = ranges.ranges.get(0).unwrap(); - assert_eq!(*range.start(), 1); - assert_eq!(*range.end(), 7); - - let mut ranges_a = RowRanges::new_empty(); - ranges_a.add(Range::new(1, 3)); - ranges_a.add(Range::new(5, 8)); - ranges_a.add(Range::new(11, 12)); - - let mut ranges_b = RowRanges::new_empty(); - ranges_b.add(Range::new(0, 2)); - ranges_b.add(Range::new(6, 7)); - ranges_b.add(Range::new(10, 11)); - - let ranges = RowRanges::union(ranges_a, ranges_b); - assert_eq!(ranges.count(), 3); - - let range_1 = ranges.ranges.get(0).unwrap(); - assert_eq!(*range_1.start(), 0); - assert_eq!(*range_1.end(), 3); - let range_2 = ranges.ranges.get(1).unwrap(); - assert_eq!(*range_2.start(), 5); - assert_eq!(*range_2.end(), 8); - let range_3 = ranges.ranges.get(2).unwrap(); - assert_eq!(*range_3.start(), 10); - assert_eq!(*range_3.end(), 12); - } - - #[test] - fn test_intersection_func() { - let mut ranges_1 = RowRanges::new_empty(); - ranges_1.add(Range::new(1, 2)); - ranges_1.add(Range::new(3, 4)); - ranges_1.add(Range::new(5, 6)); - - let mut ranges_2 = RowRanges::new_empty(); - ranges_2.add(Range::new(2, 3)); - ranges_2.add(Range::new(4, 5)); - ranges_2.add(Range::new(6, 7)); - - let ranges = RowRanges::intersection(ranges_1, ranges_2); - assert_eq!(ranges.count(), 1); - let range = ranges.ranges.get(0).unwrap(); - assert_eq!(*range.start(), 2); - assert_eq!(*range.end(), 6); - - let mut ranges_a = RowRanges::new_empty(); - ranges_a.add(Range::new(1, 3)); - ranges_a.add(Range::new(5, 8)); - ranges_a.add(Range::new(11, 12)); - - let mut ranges_b = RowRanges::new_empty(); - ranges_b.add(Range::new(0, 2)); - ranges_b.add(Range::new(6, 7)); - ranges_b.add(Range::new(10, 11)); - - let ranges = RowRanges::intersection(ranges_a, ranges_b); - assert_eq!(ranges.count(), 3); - - let range_1 = ranges.ranges.get(0).unwrap(); - assert_eq!(*range_1.start(), 1); - assert_eq!(*range_1.end(), 2); - let range_2 = ranges.ranges.get(1).unwrap(); - assert_eq!(*range_2.start(), 6); - assert_eq!(*range_2.end(), 7); - let range_3 = ranges.ranges.get(2).unwrap(); - assert_eq!(*range_3.start(), 11); - assert_eq!(*range_3.end(), 11); - } - - #[test] - fn test_compute_one() { - let locations = &[PageLocation { - offset: 50, - compressed_page_size: 10, - first_row_index: 0, - }]; - let total_rows = 10; - - let row_ranges = compute_row_ranges(&[true], locations, total_rows).unwrap(); - assert_eq!(row_ranges.count(), 1); - assert_eq!(row_ranges.ranges.get(0).unwrap(), &Range::new(0, 9)); - } - - #[test] - fn test_compute_multi() { - let index: NativeIndex = NativeIndex { - physical_type: INT32, - indexes: vec![ - PageIndex { - min: Some(0), - max: Some(10), - null_count: Some(0), - }, - PageIndex { - min: Some(15), - max: Some(20), - null_count: 
Some(0), - }, - ], - boundary_order: BoundaryOrder::ASCENDING, - }; - let locations = &[ - PageLocation { - offset: 100, - compressed_page_size: 10, - first_row_index: 0, - }, - PageLocation { - offset: 200, - compressed_page_size: 20, - first_row_index: 11, - }, - ]; - let total_rows = 20; - - //filter `x < 11` - let filter = - |page: &PageIndex| page.max.as_ref().map(|&x| x < 11).unwrap_or(false); - - let mask = index.indexes.iter().map(filter).collect::>(); - - let row_ranges = compute_row_ranges(&mask, locations, total_rows).unwrap(); - - assert_eq!(row_ranges.count(), 1); - assert_eq!(row_ranges.ranges.get(0).unwrap(), &Range::new(0, 10)); - } -} From 02a3f5cd24ef586cdf57af1f06cad662a094a9af Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 12 Nov 2022 07:18:52 +1300 Subject: [PATCH 0260/1411] Move CSV test data (#3044) (#3051) * Move CSV test data (#3044) * Format --- arrow-csv/src/reader.rs | 425 +++++++++++++++++- .../test/data/decimal_test.csv | 0 {arrow => arrow-csv}/test/data/null_test.csv | 0 {arrow => arrow-csv}/test/data/uk_cities.csv | 0 .../test/data/uk_cities_with_headers.csv | 0 .../test/data/various_types.csv | 0 .../test/data/various_types_invalid.csv | 0 arrow/Cargo.toml | 2 +- arrow/examples/read_csv.rs | 5 +- arrow/examples/read_csv_infer_schema.rs | 2 +- arrow/tests/csv.rs | 422 ----------------- dev/release/rat_exclude_files.txt | 1 + 12 files changed, 430 insertions(+), 427 deletions(-) rename {arrow => arrow-csv}/test/data/decimal_test.csv (100%) rename {arrow => arrow-csv}/test/data/null_test.csv (100%) rename {arrow => arrow-csv}/test/data/uk_cities.csv (100%) rename {arrow => arrow-csv}/test/data/uk_cities_with_headers.csv (100%) rename {arrow => arrow-csv}/test/data/various_types.csv (100%) rename {arrow => arrow-csv}/test/data/various_types_invalid.csv (100%) diff --git a/arrow-csv/src/reader.rs b/arrow-csv/src/reader.rs index 459c23ad2616..2fb6493e1be6 100644 --- a/arrow-csv/src/reader.rs +++ b/arrow-csv/src/reader.rs @@ -22,7 +22,7 @@ //! //! Example: //! -//! ```no_run +//! ``` //! # use arrow_schema::*; //! # use arrow_csv::Reader; //! 
# use std::fs::File; @@ -1131,11 +1131,432 @@ impl ReaderBuilder { mod tests { use super::*; - use std::io::Write; + use std::io::{Cursor, Write}; use tempfile::NamedTempFile; use chrono::prelude::*; + #[test] + fn test_csv() { + let _: Vec<()> = vec![None, Some("%Y-%m-%dT%H:%M:%S%.f%:z".to_string())] + .into_iter() + .map(|format| { + let schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ]); + + let file = File::open("test/data/uk_cities.csv").unwrap(); + let mut csv = Reader::new( + file, + Arc::new(schema.clone()), + false, + None, + 1024, + None, + None, + format, + ); + assert_eq!(Arc::new(schema), csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(57.653484, lat.value(0)); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); + }) + .collect(); + } + + #[test] + fn test_csv_schema_metadata() { + let mut metadata = std::collections::HashMap::new(); + metadata.insert("foo".to_owned(), "bar".to_owned()); + let schema = Schema::new_with_metadata( + vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ], + metadata.clone(), + ); + + let file = File::open("test/data/uk_cities.csv").unwrap(); + + let mut csv = Reader::new( + file, + Arc::new(schema.clone()), + false, + None, + 1024, + None, + None, + None, + ); + assert_eq!(Arc::new(schema), csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + assert_eq!(&metadata, batch.schema().metadata()); + } + + #[test] + fn test_csv_reader_with_decimal() { + let schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Decimal128(38, 6), false), + Field::new("lng", DataType::Decimal128(38, 6), false), + ]); + + let file = File::open("test/data/decimal_test.csv").unwrap(); + + let mut csv = + Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None); + let batch = csv.next().unwrap().unwrap(); + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("57.653484", lat.value_as_string(0)); + assert_eq!("53.002666", lat.value_as_string(1)); + assert_eq!("52.412811", lat.value_as_string(2)); + assert_eq!("51.481583", lat.value_as_string(3)); + assert_eq!("12.123456", lat.value_as_string(4)); + assert_eq!("50.760000", lat.value_as_string(5)); + assert_eq!("0.123000", lat.value_as_string(6)); + assert_eq!("123.000000", lat.value_as_string(7)); + assert_eq!("123.000000", lat.value_as_string(8)); + assert_eq!("-50.760000", lat.value_as_string(9)); + } + + #[test] + fn test_csv_from_buf_reader() { + let schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ]); + + let file_with_headers = + File::open("test/data/uk_cities_with_headers.csv").unwrap(); + let file_without_headers = File::open("test/data/uk_cities.csv").unwrap(); + let both_files = file_with_headers + 
.chain(Cursor::new("\n".to_string())) + .chain(file_without_headers); + let mut csv = Reader::from_reader( + both_files, + Arc::new(schema), + true, + None, + 1024, + None, + None, + None, + ); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(74, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + } + + #[test] + fn test_csv_with_schema_inference() { + let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); + + let builder = ReaderBuilder::new().has_header(true).infer_schema(None); + + let mut csv = builder.build(file).unwrap(); + let expected_schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, true), + Field::new("lat", DataType::Float64, true), + Field::new("lng", DataType::Float64, true), + ]); + assert_eq!(Arc::new(expected_schema), csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(57.653484, lat.value(0)); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); + } + + #[test] + fn test_csv_with_schema_inference_no_headers() { + let file = File::open("test/data/uk_cities.csv").unwrap(); + + let builder = ReaderBuilder::new().infer_schema(None); + + let mut csv = builder.build(file).unwrap(); + + // csv field names should be 'column_{number}' + let schema = csv.schema(); + assert_eq!("column_1", schema.field(0).name()); + assert_eq!("column_2", schema.field(1).name()); + assert_eq!("column_3", schema.field(2).name()); + let batch = csv.next().unwrap().unwrap(); + let batch_schema = batch.schema(); + + assert_eq!(schema, batch_schema); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(57.653484, lat.value(0)); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); + } + + #[test] + fn test_csv_builder_with_bounds() { + let file = File::open("test/data/uk_cities.csv").unwrap(); + + // Set the bounds to the lines 0, 1 and 2. + let mut csv = ReaderBuilder::new().with_bounds(0, 2).build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + // The value on line 0 is within the bounds + assert_eq!("Elgin, Scotland, the UK", city.value(0)); + + // The value on line 13 is outside of the bounds. Therefore + // the call to .value() will panic. 
+ let result = std::panic::catch_unwind(|| city.value(13)); + assert!(result.is_err()); + } + + #[test] + fn test_csv_with_projection() { + let schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ]); + + let file = File::open("test/data/uk_cities.csv").unwrap(); + + let mut csv = Reader::new( + file, + Arc::new(schema), + false, + None, + 1024, + None, + Some(vec![0, 1]), + None, + ); + let projected_schema = Arc::new(Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + ])); + assert_eq!(projected_schema, csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(projected_schema, batch.schema()); + assert_eq!(37, batch.num_rows()); + assert_eq!(2, batch.num_columns()); + } + + #[test] + fn test_csv_with_dictionary() { + let schema = Schema::new(vec![ + Field::new( + "city", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ]); + + let file = File::open("test/data/uk_cities.csv").unwrap(); + + let mut csv = Reader::new( + file, + Arc::new(schema), + false, + None, + 1024, + None, + Some(vec![0, 1]), + None, + ); + let projected_schema = Arc::new(Schema::new(vec![ + Field::new( + "city", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new("lat", DataType::Float64, false), + ])); + assert_eq!(projected_schema, csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(projected_schema, batch.schema()); + assert_eq!(37, batch.num_rows()); + assert_eq!(2, batch.num_columns()); + + let strings = arrow_cast::cast(batch.column(0), &DataType::Utf8).unwrap(); + let strings = strings.as_any().downcast_ref::().unwrap(); + + assert_eq!(strings.value(0), "Elgin, Scotland, the UK"); + assert_eq!(strings.value(4), "Eastbourne, East Sussex, UK"); + assert_eq!(strings.value(29), "Uckfield, East Sussex, UK"); + } + + #[test] + fn test_nulls() { + let schema = Schema::new(vec![ + Field::new("c_int", DataType::UInt64, false), + Field::new("c_float", DataType::Float32, true), + Field::new("c_string", DataType::Utf8, false), + ]); + + let file = File::open("test/data/null_test.csv").unwrap(); + + let mut csv = + Reader::new(file, Arc::new(schema), true, None, 1024, None, None, None); + let batch = csv.next().unwrap().unwrap(); + + assert!(!batch.column(1).is_null(0)); + assert!(!batch.column(1).is_null(1)); + assert!(batch.column(1).is_null(2)); + assert!(!batch.column(1).is_null(3)); + assert!(!batch.column(1).is_null(4)); + } + + #[test] + fn test_nulls_with_inference() { + let file = File::open("test/data/various_types.csv").unwrap(); + + let builder = ReaderBuilder::new() + .infer_schema(None) + .has_header(true) + .with_delimiter(b'|') + .with_batch_size(512) + .with_projection(vec![0, 1, 2, 3, 4, 5]); + + let mut csv = builder.build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + + assert_eq!(7, batch.num_rows()); + assert_eq!(6, batch.num_columns()); + + let schema = batch.schema(); + + assert_eq!(&DataType::Int64, schema.field(0).data_type()); + assert_eq!(&DataType::Float64, schema.field(1).data_type()); + assert_eq!(&DataType::Float64, schema.field(2).data_type()); + assert_eq!(&DataType::Boolean, schema.field(3).data_type()); + assert_eq!(&DataType::Date32, schema.field(4).data_type()); + 
assert_eq!(&DataType::Date64, schema.field(5).data_type()); + + let names: Vec<&str> = + schema.fields().iter().map(|x| x.name().as_str()).collect(); + assert_eq!( + names, + vec![ + "c_int", + "c_float", + "c_string", + "c_bool", + "c_date", + "c_datetime" + ] + ); + + assert!(schema.field(0).is_nullable()); + assert!(schema.field(1).is_nullable()); + assert!(schema.field(2).is_nullable()); + assert!(schema.field(3).is_nullable()); + assert!(schema.field(4).is_nullable()); + assert!(schema.field(5).is_nullable()); + + assert!(!batch.column(1).is_null(0)); + assert!(!batch.column(1).is_null(1)); + assert!(batch.column(1).is_null(2)); + assert!(!batch.column(1).is_null(3)); + assert!(!batch.column(1).is_null(4)); + } + + #[test] + fn test_parse_invalid_csv() { + let file = File::open("test/data/various_types_invalid.csv").unwrap(); + + let schema = Schema::new(vec![ + Field::new("c_int", DataType::UInt64, false), + Field::new("c_float", DataType::Float32, false), + Field::new("c_string", DataType::Utf8, false), + Field::new("c_bool", DataType::Boolean, false), + ]); + + let builder = ReaderBuilder::new() + .with_schema(Arc::new(schema)) + .has_header(true) + .with_delimiter(b'|') + .with_batch_size(512) + .with_projection(vec![0, 1, 2, 3]); + + let mut csv = builder.build(file).unwrap(); + match csv.next() { + Some(e) => match e { + Err(e) => assert_eq!( + "ParseError(\"Error while parsing value 4.x4 for column 1 at line 4\")", + format!("{:?}", e) + ), + Ok(_) => panic!("should have failed"), + }, + None => panic!("should have failed"), + } + } + #[test] fn test_infer_field_schema() { assert_eq!(infer_field_schema("A", None), DataType::Utf8); diff --git a/arrow/test/data/decimal_test.csv b/arrow-csv/test/data/decimal_test.csv similarity index 100% rename from arrow/test/data/decimal_test.csv rename to arrow-csv/test/data/decimal_test.csv diff --git a/arrow/test/data/null_test.csv b/arrow-csv/test/data/null_test.csv similarity index 100% rename from arrow/test/data/null_test.csv rename to arrow-csv/test/data/null_test.csv diff --git a/arrow/test/data/uk_cities.csv b/arrow-csv/test/data/uk_cities.csv similarity index 100% rename from arrow/test/data/uk_cities.csv rename to arrow-csv/test/data/uk_cities.csv diff --git a/arrow/test/data/uk_cities_with_headers.csv b/arrow-csv/test/data/uk_cities_with_headers.csv similarity index 100% rename from arrow/test/data/uk_cities_with_headers.csv rename to arrow-csv/test/data/uk_cities_with_headers.csv diff --git a/arrow/test/data/various_types.csv b/arrow-csv/test/data/various_types.csv similarity index 100% rename from arrow/test/data/various_types.csv rename to arrow-csv/test/data/various_types.csv diff --git a/arrow/test/data/various_types_invalid.csv b/arrow-csv/test/data/various_types_invalid.csv similarity index 100% rename from arrow/test/data/various_types_invalid.csv rename to arrow-csv/test/data/various_types_invalid.csv diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 2e33014dbdea..452cc4bbd2a6 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -267,4 +267,4 @@ required-features = ["test_utils", "ipc"] [[test]] name = "csv" -required-features = ["csv"] +required-features = ["csv", "chrono-tz"] diff --git a/arrow/examples/read_csv.rs b/arrow/examples/read_csv.rs index a1a592134eba..efb55c6d2876 100644 --- a/arrow/examples/read_csv.rs +++ b/arrow/examples/read_csv.rs @@ -31,7 +31,10 @@ fn main() { Field::new("lng", DataType::Float64, false), ]); - let path = format!("{}/test/data/uk_cities.csv", env!("CARGO_MANIFEST_DIR")); + let 
path = format!( + "{}/../arrow-csv/test/data/uk_cities.csv", + env!("CARGO_MANIFEST_DIR") + ); let file = File::open(path).unwrap(); let mut csv = diff --git a/arrow/examples/read_csv_infer_schema.rs b/arrow/examples/read_csv_infer_schema.rs index 120a7b81910b..2a713ba6109c 100644 --- a/arrow/examples/read_csv_infer_schema.rs +++ b/arrow/examples/read_csv_infer_schema.rs @@ -23,7 +23,7 @@ use std::fs::File; fn main() { let path = format!( - "{}/test/data/uk_cities_with_headers.csv", + "{}/../arrow-csv/test/data/uk_cities_with_headers.csv", env!("CARGO_MANIFEST_DIR") ); let file = File::open(path).unwrap(); diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs index 11e1b30e1488..83a279ce4794 100644 --- a/arrow/tests/csv.rs +++ b/arrow/tests/csv.rs @@ -15,16 +15,12 @@ // specific language governing permissions and limitations // under the License. -use std::fs::File; -use std::io::{Cursor, Read}; use std::sync::Arc; use arrow_array::*; -use arrow_csv::{Reader, ReaderBuilder}; use arrow_schema::*; #[test] -#[cfg(feature = "chrono-tz")] fn test_export_csv_timestamps() { let schema = Schema::new(vec![ Field::new( @@ -66,421 +62,3 @@ fn test_export_csv_timestamps() { let right = String::from_utf8(sw).unwrap(); assert_eq!(left, right); } - -#[test] -fn test_csv() { - let _: Vec<()> = vec![None, Some("%Y-%m-%dT%H:%M:%S%.f%:z".to_string())] - .into_iter() - .map(|format| { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - let mut csv = Reader::new( - file, - Arc::new(schema.clone()), - false, - None, - 1024, - None, - None, - format, - ); - assert_eq!(Arc::new(schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(57.653484, lat.value(0)); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); - }) - .collect(); -} - -#[test] -fn test_csv_schema_metadata() { - let mut metadata = std::collections::HashMap::new(); - metadata.insert("foo".to_owned(), "bar".to_owned()); - let schema = Schema::new_with_metadata( - vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ], - metadata.clone(), - ); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let mut csv = Reader::new( - file, - Arc::new(schema.clone()), - false, - None, - 1024, - None, - None, - None, - ); - assert_eq!(Arc::new(schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - assert_eq!(&metadata, batch.schema().metadata()); -} - -#[test] -fn test_csv_reader_with_decimal() { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Decimal128(38, 6), false), - Field::new("lng", DataType::Decimal128(38, 6), false), - ]); - - let file = File::open("test/data/decimal_test.csv").unwrap(); - - let mut csv = - Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None); - let batch = csv.next().unwrap().unwrap(); - // access data from a primitive array - 
let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("57.653484", lat.value_as_string(0)); - assert_eq!("53.002666", lat.value_as_string(1)); - assert_eq!("52.412811", lat.value_as_string(2)); - assert_eq!("51.481583", lat.value_as_string(3)); - assert_eq!("12.123456", lat.value_as_string(4)); - assert_eq!("50.760000", lat.value_as_string(5)); - assert_eq!("0.123000", lat.value_as_string(6)); - assert_eq!("123.000000", lat.value_as_string(7)); - assert_eq!("123.000000", lat.value_as_string(8)); - assert_eq!("-50.760000", lat.value_as_string(9)); -} - -#[test] -fn test_csv_from_buf_reader() { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file_with_headers = File::open("test/data/uk_cities_with_headers.csv").unwrap(); - let file_without_headers = File::open("test/data/uk_cities.csv").unwrap(); - let both_files = file_with_headers - .chain(Cursor::new("\n".to_string())) - .chain(file_without_headers); - let mut csv = Reader::from_reader( - both_files, - Arc::new(schema), - true, - None, - 1024, - None, - None, - None, - ); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(74, batch.num_rows()); - assert_eq!(3, batch.num_columns()); -} - -#[test] -fn test_csv_with_schema_inference() { - let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); - - let builder = ReaderBuilder::new().has_header(true).infer_schema(None); - - let mut csv = builder.build(file).unwrap(); - let expected_schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, true), - Field::new("lat", DataType::Float64, true), - Field::new("lng", DataType::Float64, true), - ]); - assert_eq!(Arc::new(expected_schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(57.653484, lat.value(0)); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); -} - -#[test] -fn test_csv_with_schema_inference_no_headers() { - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let builder = ReaderBuilder::new().infer_schema(None); - - let mut csv = builder.build(file).unwrap(); - - // csv field names should be 'column_{number}' - let schema = csv.schema(); - assert_eq!("column_1", schema.field(0).name()); - assert_eq!("column_2", schema.field(1).name()); - assert_eq!("column_3", schema.field(2).name()); - let batch = csv.next().unwrap().unwrap(); - let batch_schema = batch.schema(); - - assert_eq!(schema, batch_schema); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(57.653484, lat.value(0)); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); -} - -#[test] -fn test_csv_builder_with_bounds() { - let file = File::open("test/data/uk_cities.csv").unwrap(); - - // Set the bounds to the lines 0, 1 and 2. 
- let mut csv = ReaderBuilder::new().with_bounds(0, 2).build(file).unwrap(); - let batch = csv.next().unwrap().unwrap(); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - // The value on line 0 is within the bounds - assert_eq!("Elgin, Scotland, the UK", city.value(0)); - - // The value on line 13 is outside of the bounds. Therefore - // the call to .value() will panic. - let result = std::panic::catch_unwind(|| city.value(13)); - assert!(result.is_err()); -} - -#[test] -fn test_csv_with_projection() { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let mut csv = Reader::new( - file, - Arc::new(schema), - false, - None, - 1024, - None, - Some(vec![0, 1]), - None, - ); - let projected_schema = Arc::new(Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - ])); - assert_eq!(projected_schema, csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(projected_schema, batch.schema()); - assert_eq!(37, batch.num_rows()); - assert_eq!(2, batch.num_columns()); -} - -#[test] -fn test_csv_with_dictionary() { - let schema = Schema::new(vec![ - Field::new( - "city", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, - ), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let mut csv = Reader::new( - file, - Arc::new(schema), - false, - None, - 1024, - None, - Some(vec![0, 1]), - None, - ); - let projected_schema = Arc::new(Schema::new(vec![ - Field::new( - "city", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, - ), - Field::new("lat", DataType::Float64, false), - ])); - assert_eq!(projected_schema, csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(projected_schema, batch.schema()); - assert_eq!(37, batch.num_rows()); - assert_eq!(2, batch.num_columns()); - - let strings = arrow_cast::cast(batch.column(0), &DataType::Utf8).unwrap(); - let strings = strings.as_any().downcast_ref::().unwrap(); - - assert_eq!(strings.value(0), "Elgin, Scotland, the UK"); - assert_eq!(strings.value(4), "Eastbourne, East Sussex, UK"); - assert_eq!(strings.value(29), "Uckfield, East Sussex, UK"); -} - -#[test] -fn test_nulls() { - let schema = Schema::new(vec![ - Field::new("c_int", DataType::UInt64, false), - Field::new("c_float", DataType::Float32, true), - Field::new("c_string", DataType::Utf8, false), - ]); - - let file = File::open("test/data/null_test.csv").unwrap(); - - let mut csv = Reader::new(file, Arc::new(schema), true, None, 1024, None, None, None); - let batch = csv.next().unwrap().unwrap(); - - assert!(!batch.column(1).is_null(0)); - assert!(!batch.column(1).is_null(1)); - assert!(batch.column(1).is_null(2)); - assert!(!batch.column(1).is_null(3)); - assert!(!batch.column(1).is_null(4)); -} - -#[test] -fn test_nulls_with_inference() { - let file = File::open("test/data/various_types.csv").unwrap(); - - let builder = ReaderBuilder::new() - .infer_schema(None) - .has_header(true) - .with_delimiter(b'|') - .with_batch_size(512) - .with_projection(vec![0, 1, 2, 3, 4, 5]); - - let mut csv = builder.build(file).unwrap(); - let batch = 
csv.next().unwrap().unwrap(); - - assert_eq!(7, batch.num_rows()); - assert_eq!(6, batch.num_columns()); - - let schema = batch.schema(); - - assert_eq!(&DataType::Int64, schema.field(0).data_type()); - assert_eq!(&DataType::Float64, schema.field(1).data_type()); - assert_eq!(&DataType::Float64, schema.field(2).data_type()); - assert_eq!(&DataType::Boolean, schema.field(3).data_type()); - assert_eq!(&DataType::Date32, schema.field(4).data_type()); - assert_eq!(&DataType::Date64, schema.field(5).data_type()); - - let names: Vec<&str> = schema.fields().iter().map(|x| x.name().as_str()).collect(); - assert_eq!( - names, - vec![ - "c_int", - "c_float", - "c_string", - "c_bool", - "c_date", - "c_datetime" - ] - ); - - assert!(schema.field(0).is_nullable()); - assert!(schema.field(1).is_nullable()); - assert!(schema.field(2).is_nullable()); - assert!(schema.field(3).is_nullable()); - assert!(schema.field(4).is_nullable()); - assert!(schema.field(5).is_nullable()); - - assert!(!batch.column(1).is_null(0)); - assert!(!batch.column(1).is_null(1)); - assert!(batch.column(1).is_null(2)); - assert!(!batch.column(1).is_null(3)); - assert!(!batch.column(1).is_null(4)); -} - -#[test] -fn test_parse_invalid_csv() { - let file = File::open("test/data/various_types_invalid.csv").unwrap(); - - let schema = Schema::new(vec![ - Field::new("c_int", DataType::UInt64, false), - Field::new("c_float", DataType::Float32, false), - Field::new("c_string", DataType::Utf8, false), - Field::new("c_bool", DataType::Boolean, false), - ]); - - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .has_header(true) - .with_delimiter(b'|') - .with_batch_size(512) - .with_projection(vec![0, 1, 2, 3]); - - let mut csv = builder.build(file).unwrap(); - match csv.next() { - Some(e) => match e { - Err(e) => assert_eq!( - "ParseError(\"Error while parsing value 4.x4 for column 1 at line 4\")", - format!("{:?}", e) - ), - Ok(_) => panic!("should have failed"), - }, - None => panic!("should have failed"), - } -} diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 0ca2ab91a5e8..fad1a5a7d1dd 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -3,6 +3,7 @@ testing/* target/* dev/release/rat_exclude_files.txt arrow/test/data/* +arrow-csv/test/data/* arrow-json/test/data/* arrow/test/dependency/* arrow-integration-test/data/* From 561f63a232843fb36486f139faa9249652a5053c Mon Sep 17 00:00:00 2001 From: Christian Salvati <81280761+src255@users.noreply.github.com> Date: Fri, 11 Nov 2022 14:06:14 -0500 Subject: [PATCH 0261/1411] Improved UX of creating `TimestampNanosecondArray` with timezones (#3088) * Make with_timezone more flexible Make with_timezone method accept both &str and String values. * Add alias for UTC Add a method to PrimitiveArray for using UTC as the timezone. 
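For illustration, a minimal usage sketch of the API added below; the timestamp value and the "+08:00" offset are arbitrary sample data, not taken from the patch.

```rust
use arrow_array::TimestampNanosecondArray;

fn main() {
    let array = TimestampNanosecondArray::from(vec![1_546_214_400_000_000_000]);

    // `with_timezone` now takes `impl Into<String>`, so a plain &str works
    // without an explicit `.to_string()` at every call site.
    let with_offset = array.with_timezone("+08:00");

    // `with_timezone_utc` is shorthand for `with_timezone("+00:00")`.
    let with_utc = with_offset.with_timezone_utc();
    println!("{:?}", with_utc);
}
```

Accepting `impl Into<String>` keeps existing `String` call sites compiling while letting string literals be passed directly.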
Co-authored-by: Raphael Taylor-Davies --- arrow-array/src/array/primitive_array.rs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 195e0009c0cc..34abfeb0a3de 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -801,8 +801,13 @@ impl PrimitiveArray { } /// Construct a timestamp array with new timezone - pub fn with_timezone(&self, timezone: String) -> Self { - self.with_timezone_opt(Some(timezone)) + pub fn with_timezone(&self, timezone: impl Into) -> Self { + self.with_timezone_opt(Some(timezone.into())) + } + + /// Construct a timestamp array with UTC + pub fn with_timezone_utc(&self) -> Self { + self.with_timezone("+00:00") } /// Construct a timestamp array with an optional timezone @@ -1344,6 +1349,21 @@ mod tests { ); } + #[test] + fn test_timestamp_utc_fmt_debug() { + let arr: PrimitiveArray = + TimestampMillisecondArray::from(vec![ + 1546214400000, + 1546214400000, + -1546214400000, + ]) + .with_timezone_utc(); + assert_eq!( + "PrimitiveArray\n[\n 2018-12-31T00:00:00+00:00,\n 2018-12-31T00:00:00+00:00,\n 1921-01-02T00:00:00+00:00,\n]", + format!("{:?}", arr) + ); + } + #[test] #[cfg(feature = "chrono-tz")] fn test_timestamp_with_named_tz_fmt_debug() { From 94565bca99b5d9932a3e9a8e094aaf4e4384b1e5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 11 Nov 2022 14:59:46 -0500 Subject: [PATCH 0262/1411] Update version to 27.0.0 and add changelog (#3089) * Update version to 27.0.0 * Initial Changelog * Updates after tags * Update CHANGELOG * updates --- CHANGELOG-old.md | 95 ++++++++++ CHANGELOG.md | 179 +++++++++++-------- arrow-array/Cargo.toml | 8 +- arrow-buffer/Cargo.toml | 2 +- arrow-cast/Cargo.toml | 12 +- arrow-csv/Cargo.toml | 12 +- arrow-data/Cargo.toml | 6 +- arrow-flight/Cargo.toml | 10 +- arrow-flight/README.md | 2 +- arrow-integration-test/Cargo.toml | 6 +- arrow-integration-testing/Cargo.toml | 2 +- arrow-ipc/Cargo.toml | 12 +- arrow-json/Cargo.toml | 12 +- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow-schema/Cargo.toml | 2 +- arrow-select/Cargo.toml | 10 +- arrow/Cargo.toml | 20 +-- arrow/README.md | 2 +- dev/release/README.md | 2 +- dev/release/update_change_log.sh | 4 +- parquet/Cargo.toml | 20 +-- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 24 files changed, 276 insertions(+), 160 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index b7f4a7fadc84..946958f1a636 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,101 @@ # Historical Changelog +## [26.0.0](https://github.com/apache/arrow-rs/tree/26.0.0) (2022-10-28) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/25.0.0...26.0.0) + +**Breaking changes:** + +- Cast Timestamps to RFC3339 strings [\#2934](https://github.com/apache/arrow-rs/issues/2934) +- Remove Unused NativeDecimalType [\#2945](https://github.com/apache/arrow-rs/pull/2945) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Format Timestamps as RFC3339 [\#2939](https://github.com/apache/arrow-rs/pull/2939) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Update flatbuffers to resolve RUSTSEC-2021-0122 [\#2895](https://github.com/apache/arrow-rs/pull/2895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) +- replace `from_timestamp` by `from_timestamp_opt` [\#2894](https://github.com/apache/arrow-rs/pull/2894) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) + +**Implemented enhancements:** + +- Optimized way to count the numbers of `true` and `false` values in a BooleanArray [\#2963](https://github.com/apache/arrow-rs/issues/2963) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add pow to i256 [\#2954](https://github.com/apache/arrow-rs/issues/2954) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Write Generic Code over \[Large\]BinaryArray and \[Large\]StringArray [\#2946](https://github.com/apache/arrow-rs/issues/2946) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add Page Row Count Limit [\#2941](https://github.com/apache/arrow-rs/issues/2941) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- prettyprint to show timezone offset for timestamp with timezone [\#2937](https://github.com/apache/arrow-rs/issues/2937) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cast numeric to decimal256 [\#2922](https://github.com/apache/arrow-rs/issues/2922) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `freeze_with_dictionary` API to `MutableArrayData` [\#2914](https://github.com/apache/arrow-rs/issues/2914) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support decimal256 array in sort kernels [\#2911](https://github.com/apache/arrow-rs/issues/2911) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- support `[+/-]hhmm` and `[+/-]hh` as fixedoffset timezone format [\#2910](https://github.com/apache/arrow-rs/issues/2910) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cleanup decimal sort function [\#2907](https://github.com/apache/arrow-rs/issues/2907) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- replace `from_timestamp` by `from_timestamp_opt` [\#2892](https://github.com/apache/arrow-rs/issues/2892) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Move Primitive arity kernels to arrow-array [\#2787](https://github.com/apache/arrow-rs/issues/2787) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- add overflow-checking for negative arithmetic kernel [\#2662](https://github.com/apache/arrow-rs/issues/2662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Subtle compatibility issue with serve\_arrow [\#2952](https://github.com/apache/arrow-rs/issues/2952) +- error\[E0599\]: no method named `total_cmp` found for struct `f16` in the current scope [\#2926](https://github.com/apache/arrow-rs/issues/2926) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fail at rowSelection `and_then` method [\#2925](https://github.com/apache/arrow-rs/issues/2925) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Ordering not implemented for FixedSizeBinary types [\#2904](https://github.com/apache/arrow-rs/issues/2904) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet API: Could not convert timestamp before unix epoch to string/json [\#2897](https://github.com/apache/arrow-rs/issues/2897) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Overly Pessimistic RLE Size Estimation [\#2889](https://github.com/apache/arrow-rs/issues/2889) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Memory alignment error in `RawPtrBox::new` 
[\#2882](https://github.com/apache/arrow-rs/issues/2882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Compilation error under chrono-tz feature [\#2878](https://github.com/apache/arrow-rs/issues/2878) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- AHash Statically Allocates 64 bytes [\#2875](https://github.com/apache/arrow-rs/issues/2875) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `parquet::arrow::arrow_writer::ArrowWriter` ignores page size properties [\#2853](https://github.com/apache/arrow-rs/issues/2853) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Documentation updates:** + +- Document crate topology \(\#2594\) [\#2913](https://github.com/apache/arrow-rs/pull/2913) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Closed issues:** + +- SerializedFileWriter comments about multiple call on consumed self [\#2935](https://github.com/apache/arrow-rs/issues/2935) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Pointer freed error when deallocating ArrayData with shared memory buffer [\#2874](https://github.com/apache/arrow-rs/issues/2874) +- Release Arrow `25.0.0` \(next release after `24.0.0`\) [\#2820](https://github.com/apache/arrow-rs/issues/2820) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Replace DecimalArray with PrimitiveArray [\#2637](https://github.com/apache/arrow-rs/issues/2637) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Fix ignored limit on lexsort\_to\_indices (#2991) [\#2991](https://github.com/apache/arrow-rs/pull/2991) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Fix GenericListArray::try\_new\_from\_array\_data error message \(\#526\) [\#2961](https://github.com/apache/arrow-rs/pull/2961) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix take string on sliced indices [\#2960](https://github.com/apache/arrow-rs/pull/2960) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add BooleanArray::true\_count and BooleanArray::false\_count [\#2957](https://github.com/apache/arrow-rs/pull/2957) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add pow to i256 [\#2955](https://github.com/apache/arrow-rs/pull/2955) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- fix datatype for timestamptz debug fmt [\#2948](https://github.com/apache/arrow-rs/pull/2948) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Add GenericByteArray \(\#2946\) [\#2947](https://github.com/apache/arrow-rs/pull/2947) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Specialize interleave string ~2-3x faster [\#2944](https://github.com/apache/arrow-rs/pull/2944) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Added support for LZ4\_RAW compression. 
\(\#1604\) [\#2943](https://github.com/apache/arrow-rs/pull/2943) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([marioloko](https://github.com/marioloko)) +- Add optional page row count limit for parquet `WriterProperties` \(\#2941\) [\#2942](https://github.com/apache/arrow-rs/pull/2942) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Cleanup orphaned doc comments \(\#2935\) [\#2938](https://github.com/apache/arrow-rs/pull/2938) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- support more fixedoffset tz format [\#2936](https://github.com/apache/arrow-rs/pull/2936) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Benchmark with prepared row converter [\#2930](https://github.com/apache/arrow-rs/pull/2930) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add lexsort benchmark \(\#2871\) [\#2929](https://github.com/apache/arrow-rs/pull/2929) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve panic messages for RowSelection::and\_then \(\#2925\) [\#2928](https://github.com/apache/arrow-rs/pull/2928) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update required half from 2.0 --\> 2.1 [\#2927](https://github.com/apache/arrow-rs/pull/2927) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Cast numeric to decimal256 [\#2923](https://github.com/apache/arrow-rs/pull/2923) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Cleanup generated proto code [\#2921](https://github.com/apache/arrow-rs/pull/2921) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Deprecate TimestampArray from\_vec and from\_opt\_vec [\#2919](https://github.com/apache/arrow-rs/pull/2919) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support decimal256 array in sort kernels [\#2912](https://github.com/apache/arrow-rs/pull/2912) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add timezone abstraction [\#2909](https://github.com/apache/arrow-rs/pull/2909) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup decimal sort function [\#2908](https://github.com/apache/arrow-rs/pull/2908) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Simplify TimestampArray from\_vec with timezone [\#2906](https://github.com/apache/arrow-rs/pull/2906) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement ord for FixedSizeBinary types [\#2905](https://github.com/apache/arrow-rs/pull/2905) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([maxburke](https://github.com/maxburke)) +- Update chrono-tz requirement from 0.6 to 0.7 [\#2903](https://github.com/apache/arrow-rs/pull/2903) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Parquet record api support timestamp before epoch 
[\#2899](https://github.com/apache/arrow-rs/pull/2899) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([AnthonyPoncet](https://github.com/AnthonyPoncet)) +- Specialize interleave integer [\#2898](https://github.com/apache/arrow-rs/pull/2898) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support overflow-checking variant of negate kernel [\#2893](https://github.com/apache/arrow-rs/pull/2893) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Respect Page Size Limits in ArrowWriter \(\#2853\) [\#2890](https://github.com/apache/arrow-rs/pull/2890) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Improve row format docs [\#2888](https://github.com/apache/arrow-rs/pull/2888) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add FixedSizeList::from\_iter\_primitive [\#2887](https://github.com/apache/arrow-rs/pull/2887) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Simplify ListArray::from\_iter\_primitive [\#2886](https://github.com/apache/arrow-rs/pull/2886) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out value selection kernels into arrow-select \(\#2594\) [\#2885](https://github.com/apache/arrow-rs/pull/2885) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Increase default IPC alignment to 64 \(\#2883\) [\#2884](https://github.com/apache/arrow-rs/pull/2884) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Copying inappropriately aligned buffer in ipc reader [\#2883](https://github.com/apache/arrow-rs/pull/2883) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Validate decimal IPC read \(\#2387\) [\#2880](https://github.com/apache/arrow-rs/pull/2880) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix compilation error under `chrono-tz` feature [\#2879](https://github.com/apache/arrow-rs/pull/2879) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Don't validate decimal precision in ArrayData \(\#2637\) [\#2873](https://github.com/apache/arrow-rs/pull/2873) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add downcast\_integer and downcast\_primitive [\#2872](https://github.com/apache/arrow-rs/pull/2872) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Filter DecimalArray as PrimitiveArray ~5x Faster \(\#2637\) [\#2870](https://github.com/apache/arrow-rs/pull/2870) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Treat DecimalArray as PrimitiveArray in row format [\#2866](https://github.com/apache/arrow-rs/pull/2866) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) ## [25.0.0](https://github.com/apache/arrow-rs/tree/25.0.0) (2022-10-14) [Full Changelog](https://github.com/apache/arrow-rs/compare/24.0.0...25.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 42cd59975921..00f6876855f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,101 +19,122 @@ # Changelog 
-## [26.0.0](https://github.com/apache/arrow-rs/tree/26.0.0) (2022-10-28) +## [27.0.0](https://github.com/apache/arrow-rs/tree/27.0.0) (2022-11-11) -[Full Changelog](https://github.com/apache/arrow-rs/compare/25.0.0...26.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/26.0.0...27.0.0) **Breaking changes:** -- Cast Timestamps to RFC3339 strings [\#2934](https://github.com/apache/arrow-rs/issues/2934) -- Remove Unused NativeDecimalType [\#2945](https://github.com/apache/arrow-rs/pull/2945) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Format Timestamps as RFC3339 [\#2939](https://github.com/apache/arrow-rs/pull/2939) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) -- Update flatbuffers to resolve RUSTSEC-2021-0122 [\#2895](https://github.com/apache/arrow-rs/pull/2895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- replace `from_timestamp` by `from_timestamp_opt` [\#2894](https://github.com/apache/arrow-rs/pull/2894) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Recurse into Dictionary value type in DataType::is\_nested [\#3083](https://github.com/apache/arrow-rs/pull/3083) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- early type checks in `RowConverter` [\#3080](https://github.com/apache/arrow-rs/pull/3080) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Add Decimal128 and Decimal256 to downcast\_primitive [\#3056](https://github.com/apache/arrow-rs/pull/3056) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Replace remaining \_generic temporal kernels with \_dyn kernels [\#3046](https://github.com/apache/arrow-rs/pull/3046) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Replace year\_generic with year\_dyn [\#3041](https://github.com/apache/arrow-rs/pull/3041) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Validate decimal256 with i256 directly [\#3025](https://github.com/apache/arrow-rs/pull/3025) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Hadoop LZ4 Support for LZ4 Codec [\#3013](https://github.com/apache/arrow-rs/pull/3013) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([marioloko](https://github.com/marioloko)) +- Replace hour\_generic with hour\_dyn [\#3006](https://github.com/apache/arrow-rs/pull/3006) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Accept any &dyn Array in nullif kernel [\#2940](https://github.com/apache/arrow-rs/pull/2940) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Optimized way to count the numbers of `true` and `false` values in a BooleanArray [\#2963](https://github.com/apache/arrow-rs/issues/2963) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add pow to i256 [\#2954](https://github.com/apache/arrow-rs/issues/2954) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Write Generic Code over \[Large\]BinaryArray and \[Large\]StringArray [\#2946](https://github.com/apache/arrow-rs/issues/2946) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add Page Row Count Limit [\#2941](https://github.com/apache/arrow-rs/issues/2941) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- prettyprint to show timezone offset for timestamp with timezone [\#2937](https://github.com/apache/arrow-rs/issues/2937) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Cast numeric to decimal256 [\#2922](https://github.com/apache/arrow-rs/issues/2922) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add `freeze_with_dictionary` API to `MutableArrayData` [\#2914](https://github.com/apache/arrow-rs/issues/2914) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support decimal256 array in sort kernels [\#2911](https://github.com/apache/arrow-rs/issues/2911) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- support `[+/-]hhmm` and `[+/-]hh` as fixedoffset timezone format [\#2910](https://github.com/apache/arrow-rs/issues/2910) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Cleanup decimal sort function [\#2907](https://github.com/apache/arrow-rs/issues/2907) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- replace `from_timestamp` by `from_timestamp_opt` [\#2892](https://github.com/apache/arrow-rs/issues/2892) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Move Primitive arity kernels to arrow-array [\#2787](https://github.com/apache/arrow-rs/issues/2787) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- add overflow-checking for negative arithmetic kernel [\#2662](https://github.com/apache/arrow-rs/issues/2662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Row Format: Option to detach/own a row [\#3078](https://github.com/apache/arrow-rs/issues/3078) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Row Format: API to check if datatypes are supported [\#3077](https://github.com/apache/arrow-rs/issues/3077) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Deprecate Buffer::count\_set\_bits [\#3067](https://github.com/apache/arrow-rs/issues/3067) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add Decimal128 and Decimal256 to downcast\_primitive [\#3055](https://github.com/apache/arrow-rs/issues/3055) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improved UX of creating `TimestampNanosecondArray` with timezones [\#3042](https://github.com/apache/arrow-rs/issues/3042) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cast decimal256 to signed integer [\#3039](https://github.com/apache/arrow-rs/issues/3039) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support casting Date64 to Timestamp [\#3037](https://github.com/apache/arrow-rs/issues/3037) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Check overflow when casting floating point value to decimal256 [\#3032](https://github.com/apache/arrow-rs/issues/3032) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Compare i256 in validate\_decimal256\_precision [\#3024](https://github.com/apache/arrow-rs/issues/3024) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Check overflow when casting floating point value to decimal128 [\#3020](https://github.com/apache/arrow-rs/issues/3020) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add macro downcast\_temporal\_array [\#3008](https://github.com/apache/arrow-rs/issues/3008) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Replace 
hour\_generic with hour\_dyn [\#3005](https://github.com/apache/arrow-rs/issues/3005) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Replace temporal \_generic kernels with dyn [\#3004](https://github.com/apache/arrow-rs/issues/3004) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `RowSelection::intersection` [\#3003](https://github.com/apache/arrow-rs/issues/3003) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- I would like to round rather than truncate when casting f64 to decimal [\#2997](https://github.com/apache/arrow-rs/issues/2997) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow::compute::kernels::temporal should support nanoseconds [\#2995](https://github.com/apache/arrow-rs/issues/2995) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Release Arrow `26.0.0` \(next release after `25.0.0`\) [\#2953](https://github.com/apache/arrow-rs/issues/2953) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Add timezone offset for debug format of Timestamp with Timezone [\#2917](https://github.com/apache/arrow-rs/issues/2917) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support merge RowSelectors when creating RowSelection [\#2858](https://github.com/apache/arrow-rs/issues/2858) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Fixed bugs:** -- Subtle compatibility issue with serve\_arrow [\#2952](https://github.com/apache/arrow-rs/issues/2952) -- error\[E0599\]: no method named `total_cmp` found for struct `f16` in the current scope [\#2926](https://github.com/apache/arrow-rs/issues/2926) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Fail at rowSelection `and_then` method [\#2925](https://github.com/apache/arrow-rs/issues/2925) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Ordering not implemented for FixedSizeBinary types [\#2904](https://github.com/apache/arrow-rs/issues/2904) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet API: Could not convert timestamp before unix epoch to string/json [\#2897](https://github.com/apache/arrow-rs/issues/2897) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Overly Pessimistic RLE Size Estimation [\#2889](https://github.com/apache/arrow-rs/issues/2889) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Memory alignment error in `RawPtrBox::new` [\#2882](https://github.com/apache/arrow-rs/issues/2882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Compilation error under chrono-tz feature [\#2878](https://github.com/apache/arrow-rs/issues/2878) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- AHash Statically Allocates 64 bytes [\#2875](https://github.com/apache/arrow-rs/issues/2875) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `parquet::arrow::arrow_writer::ArrowWriter` ignores page size properties [\#2853](https://github.com/apache/arrow-rs/issues/2853) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Inconsistent Nan Handling Between Scalar and Non-Scalar Comparison Kernels [\#3074](https://github.com/apache/arrow-rs/issues/3074) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Debug format for timestamp ignores timezone [\#3069](https://github.com/apache/arrow-rs/issues/3069) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Row format decode loses timezone [\#3063](https://github.com/apache/arrow-rs/issues/3063) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- binary operator produces incorrect result on arrays with resized null buffer [\#3061](https://github.com/apache/arrow-rs/issues/3061) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RLEDecoder Panics on Null Padded Pages [\#3035](https://github.com/apache/arrow-rs/issues/3035) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Nullif with incorrect valid\_count [\#3031](https://github.com/apache/arrow-rs/issues/3031) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RLEDecoder::get\_batch\_with\_dict may panic on bit-packed runs longer than 1024 [\#3029](https://github.com/apache/arrow-rs/issues/3029) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Converted type is None according to Parquet Tools then utilizing logical types [\#3017](https://github.com/apache/arrow-rs/issues/3017) +- CompressionCodec LZ4 incompatible with C++ implementation [\#2988](https://github.com/apache/arrow-rs/issues/2988) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Documentation updates:** -- Document crate topology \(\#2594\) [\#2913](https://github.com/apache/arrow-rs/pull/2913) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - -**Closed issues:** - -- SerializedFileWriter comments about multiple call on consumed self [\#2935](https://github.com/apache/arrow-rs/issues/2935) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Pointer freed error when deallocating ArrayData with shared memory buffer [\#2874](https://github.com/apache/arrow-rs/issues/2874) -- Release Arrow `25.0.0` \(next release after `24.0.0`\) [\#2820](https://github.com/apache/arrow-rs/issues/2820) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Replace DecimalArray with PrimitiveArray [\#2637](https://github.com/apache/arrow-rs/issues/2637) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Mark parquet predicate pushdown as complete [\#2987](https://github.com/apache/arrow-rs/pull/2987) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) **Merged pull requests:** -- Fix ignored limit on lexsort\_to\_indices (#2991) [\#2991](https://github.com/apache/arrow-rs/pull/2991) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Fix GenericListArray::try\_new\_from\_array\_data error message \(\#526\) [\#2961](https://github.com/apache/arrow-rs/pull/2961) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix take string on sliced indices [\#2960](https://github.com/apache/arrow-rs/pull/2960) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add BooleanArray::true\_count and BooleanArray::false\_count [\#2957](https://github.com/apache/arrow-rs/pull/2957) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add pow to i256 [\#2955](https://github.com/apache/arrow-rs/pull/2955) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- fix datatype for timestamptz debug fmt [\#2948](https://github.com/apache/arrow-rs/pull/2948) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) -- Add GenericByteArray \(\#2946\) [\#2947](https://github.com/apache/arrow-rs/pull/2947) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Specialize interleave string ~2-3x faster [\#2944](https://github.com/apache/arrow-rs/pull/2944) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Added support for LZ4\_RAW compression. \(\#1604\) [\#2943](https://github.com/apache/arrow-rs/pull/2943) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([marioloko](https://github.com/marioloko)) -- Add optional page row count limit for parquet `WriterProperties` \(\#2941\) [\#2942](https://github.com/apache/arrow-rs/pull/2942) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Cleanup orphaned doc comments \(\#2935\) [\#2938](https://github.com/apache/arrow-rs/pull/2938) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- support more fixedoffset tz format [\#2936](https://github.com/apache/arrow-rs/pull/2936) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) -- Benchmark with prepared row converter [\#2930](https://github.com/apache/arrow-rs/pull/2930) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add lexsort benchmark \(\#2871\) [\#2929](https://github.com/apache/arrow-rs/pull/2929) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Improve panic messages for RowSelection::and\_then \(\#2925\) [\#2928](https://github.com/apache/arrow-rs/pull/2928) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Update required half from 2.0 --\> 2.1 [\#2927](https://github.com/apache/arrow-rs/pull/2927) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Cast numeric to decimal256 [\#2923](https://github.com/apache/arrow-rs/pull/2923) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Cleanup generated proto code [\#2921](https://github.com/apache/arrow-rs/pull/2921) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Deprecate TimestampArray from\_vec and from\_opt\_vec [\#2919](https://github.com/apache/arrow-rs/pull/2919) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support decimal256 array in sort kernels [\#2912](https://github.com/apache/arrow-rs/pull/2912) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add timezone abstraction [\#2909](https://github.com/apache/arrow-rs/pull/2909) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Cleanup decimal sort function [\#2908](https://github.com/apache/arrow-rs/pull/2908) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([viirya](https://github.com/viirya)) -- Simplify TimestampArray from\_vec with timezone [\#2906](https://github.com/apache/arrow-rs/pull/2906) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Implement ord for FixedSizeBinary types [\#2905](https://github.com/apache/arrow-rs/pull/2905) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([maxburke](https://github.com/maxburke)) -- Update chrono-tz requirement from 0.6 to 0.7 [\#2903](https://github.com/apache/arrow-rs/pull/2903) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Parquet record api support timestamp before epoch [\#2899](https://github.com/apache/arrow-rs/pull/2899) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([AnthonyPoncet](https://github.com/AnthonyPoncet)) -- Specialize interleave integer [\#2898](https://github.com/apache/arrow-rs/pull/2898) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support overflow-checking variant of negate kernel [\#2893](https://github.com/apache/arrow-rs/pull/2893) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Respect Page Size Limits in ArrowWriter \(\#2853\) [\#2890](https://github.com/apache/arrow-rs/pull/2890) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Improve row format docs [\#2888](https://github.com/apache/arrow-rs/pull/2888) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add FixedSizeList::from\_iter\_primitive [\#2887](https://github.com/apache/arrow-rs/pull/2887) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Simplify ListArray::from\_iter\_primitive [\#2886](https://github.com/apache/arrow-rs/pull/2886) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Split out value selection kernels into arrow-select \(\#2594\) [\#2885](https://github.com/apache/arrow-rs/pull/2885) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Increase default IPC alignment to 64 \(\#2883\) [\#2884](https://github.com/apache/arrow-rs/pull/2884) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Copying inappropriately aligned buffer in ipc reader [\#2883](https://github.com/apache/arrow-rs/pull/2883) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Validate decimal IPC read \(\#2387\) [\#2880](https://github.com/apache/arrow-rs/pull/2880) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix compilation error under `chrono-tz` feature [\#2879](https://github.com/apache/arrow-rs/pull/2879) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Don't validate decimal precision in ArrayData \(\#2637\) [\#2873](https://github.com/apache/arrow-rs/pull/2873) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add downcast\_integer and downcast\_primitive [\#2872](https://github.com/apache/arrow-rs/pull/2872) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Filter 
DecimalArray as PrimitiveArray ~5x Faster \(\#2637\) [\#2870](https://github.com/apache/arrow-rs/pull/2870) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Treat DecimalArray as PrimitiveArray in row format [\#2866](https://github.com/apache/arrow-rs/pull/2866) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improved UX of creating `TimestampNanosecondArray` with timezones [\#3088](https://github.com/apache/arrow-rs/pull/3088) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([src255](https://github.com/src255)) +- Remove unused range module [\#3085](https://github.com/apache/arrow-rs/pull/3085) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Make intersect\_row\_selections a member function [\#3084](https://github.com/apache/arrow-rs/pull/3084) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update hashbrown requirement from 0.12 to 0.13 [\#3081](https://github.com/apache/arrow-rs/pull/3081) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat: add `OwnedRow` [\#3079](https://github.com/apache/arrow-rs/pull/3079) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Use ArrowNativeTypeOp on non-scalar comparison kernels [\#3075](https://github.com/apache/arrow-rs/pull/3075) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add missing inline to ArrowNativeTypeOp [\#3073](https://github.com/apache/arrow-rs/pull/3073) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- fix debug information for Timestamp with Timezone [\#3072](https://github.com/apache/arrow-rs/pull/3072) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Deprecate Buffer::count\_set\_bits \(\#3067\) [\#3071](https://github.com/apache/arrow-rs/pull/3071) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add compare to ArrowNativeTypeOp [\#3070](https://github.com/apache/arrow-rs/pull/3070) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: Improve docstrings on WriterPropertiesBuilder [\#3068](https://github.com/apache/arrow-rs/pull/3068) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Faster f64 inequality [\#3065](https://github.com/apache/arrow-rs/pull/3065) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix row format decode loses timezone \(\#3063\) [\#3064](https://github.com/apache/arrow-rs/pull/3064) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix null\_count computation in binary [\#3062](https://github.com/apache/arrow-rs/pull/3062) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Faster f64 equality [\#3060](https://github.com/apache/arrow-rs/pull/3060) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update arrow-flight subcrates \(\#3044\) 
[\#3052](https://github.com/apache/arrow-rs/pull/3052) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Minor: Remove cloning ArrayData in with\_precision\_and\_scale [\#3050](https://github.com/apache/arrow-rs/pull/3050) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Split out arrow-json \(\#3044\) [\#3049](https://github.com/apache/arrow-rs/pull/3049) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move `intersect_row_selections` from datafusion to arrow-rs. [\#3047](https://github.com/apache/arrow-rs/pull/3047) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Split out arrow-csv \(\#2594\) [\#3044](https://github.com/apache/arrow-rs/pull/3044) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move reader\_parser to arrow-cast \(\#3022\) [\#3043](https://github.com/apache/arrow-rs/pull/3043) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cast decimal256 to signed integer [\#3040](https://github.com/apache/arrow-rs/pull/3040) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Enable casting from Date64 to Timestamp [\#3038](https://github.com/apache/arrow-rs/pull/3038) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) +- Fix decoding long and/or padded RLE data \(\#3029\) \(\#3035\) [\#3036](https://github.com/apache/arrow-rs/pull/3036) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix nullif when existing array has no nulls [\#3034](https://github.com/apache/arrow-rs/pull/3034) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Check overflow when casting floating point value to decimal256 [\#3033](https://github.com/apache/arrow-rs/pull/3033) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update parquet to depend on arrow subcrates [\#3028](https://github.com/apache/arrow-rs/pull/3028) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Make various i256 methods const [\#3026](https://github.com/apache/arrow-rs/pull/3026) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-ipc [\#3022](https://github.com/apache/arrow-rs/pull/3022) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Check overflow while casting floating point value to decimal128 [\#3021](https://github.com/apache/arrow-rs/pull/3021) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update arrow-flight [\#3019](https://github.com/apache/arrow-rs/pull/3019) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Move ArrowNativeTypeOp to arrow-array \(\#2594\) [\#3018](https://github.com/apache/arrow-rs/pull/3018) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support cast timestamp to time [\#3016](https://github.com/apache/arrow-rs/pull/3016) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([naosense](https://github.com/naosense)) +- Add filter example [\#3014](https://github.com/apache/arrow-rs/pull/3014) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Check overflow when casting integer to decimal [\#3009](https://github.com/apache/arrow-rs/pull/3009) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add macro downcast\_temporal\_array [\#3007](https://github.com/apache/arrow-rs/pull/3007) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Parquet Writer: Make column descriptor public on the writer [\#3002](https://github.com/apache/arrow-rs/pull/3002) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([pier-oliviert](https://github.com/pier-oliviert)) +- Update chrono-tz requirement from 0.7 to 0.8 [\#3001](https://github.com/apache/arrow-rs/pull/3001) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Round instead of Truncate while casting float to decimal [\#3000](https://github.com/apache/arrow-rs/pull/3000) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Support Predicate Pushdown for Parquet Lists \(\#2108\) [\#2999](https://github.com/apache/arrow-rs/pull/2999) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-cast \(\#2594\) [\#2998](https://github.com/apache/arrow-rs/pull/2998) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- `arrow::compute::kernels::temporal` should support nanoseconds [\#2996](https://github.com/apache/arrow-rs/pull/2996) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Add `RowSelection::from_selectors_and_combine` to merge RowSelectors [\#2994](https://github.com/apache/arrow-rs/pull/2994) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Simplify Single-Column Dictionary Sort [\#2993](https://github.com/apache/arrow-rs/pull/2993) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: Add entry to changelog for 26.0.0 RC2 fix [\#2992](https://github.com/apache/arrow-rs/pull/2992) ([alamb](https://github.com/alamb)) +- Fix ignored limit on `lexsort_to_indices` [\#2991](https://github.com/apache/arrow-rs/pull/2991) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add clone and equal functions for CastOptions [\#2985](https://github.com/apache/arrow-rs/pull/2985) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- minor: remove redundant prefix [\#2983](https://github.com/apache/arrow-rs/pull/2983) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([jackwener](https://github.com/jackwener)) +- Compare dictionary decimal arrays [\#2982](https://github.com/apache/arrow-rs/pull/2982) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Compare dictionary and non-dictionary decimal arrays [\#2980](https://github.com/apache/arrow-rs/pull/2980) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add decimal comparison kernel support [\#2978](https://github.com/apache/arrow-rs/pull/2978) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Move concat kernel to arrow-select \(\#2594\) [\#2976](https://github.com/apache/arrow-rs/pull/2976) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Specialize interleave for byte arrays \(\#2864\) [\#2975](https://github.com/apache/arrow-rs/pull/2975) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use unary function for numeric to decimal cast [\#2973](https://github.com/apache/arrow-rs/pull/2973) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Specialize filter kernel for binary arrays \(\#2969\) [\#2971](https://github.com/apache/arrow-rs/pull/2971) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Combine take\_utf8 and take\_binary \(\#2969\) [\#2970](https://github.com/apache/arrow-rs/pull/2970) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster Scalar Dictionary Comparison ~10% [\#2968](https://github.com/apache/arrow-rs/pull/2968) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move `byte_size` from datafusion::physical\_expr [\#2965](https://github.com/apache/arrow-rs/pull/2965) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Pass decompressed size to parquet Codec::decompress \(\#2956\) [\#2959](https://github.com/apache/arrow-rs/pull/2959) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([marioloko](https://github.com/marioloko)) +- Add Decimal Arithmetic [\#2881](https://github.com/apache/arrow-rs/pull/2881) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 1e94ca64d99f..816843d31ab7 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-array" -version = "26.0.0" +version = "27.0.0" description = "Array abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,9 +45,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "26.0.0", path = "../arrow-schema" } -arrow-data = { version = "26.0.0", path = "../arrow-data" } +arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "27.0.0", path = "../arrow-schema" } +arrow-data = { version = "27.0.0", path = "../arrow-data" } chrono = { version = "0.4", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index 610a35015e23..9ed4d91d21d2 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = 
"26.0.0" +version = "27.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 714ea0b480dd..fe3f5e257668 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-cast" -version = "26.0.0" +version = "27.0.0" description = "Cast kernel and utilities for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "26.0.0", path = "../arrow-array" } -arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } -arrow-data = { version = "26.0.0", path = "../arrow-data" } -arrow-schema = { version = "26.0.0", path = "../arrow-schema" } -arrow-select = { version = "26.0.0", path = "../arrow-select" } +arrow-array = { version = "27.0.0", path = "../arrow-array" } +arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } +arrow-data = { version = "27.0.0", path = "../arrow-data" } +arrow-schema = { version = "27.0.0", path = "../arrow-schema" } +arrow-select = { version = "27.0.0", path = "../arrow-select" } chrono = { version = "0.4", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index d40cef0db112..81c97c68484e 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-csv" -version = "26.0.0" +version = "27.0.0" description = "Support for parsing CSV format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "26.0.0", path = "../arrow-array" } -arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "26.0.0", path = "../arrow-cast" } -arrow-data = { version = "26.0.0", path = "../arrow-data" } -arrow-schema = { version = "26.0.0", path = "../arrow-schema" } +arrow-array = { version = "27.0.0", path = "../arrow-array" } +arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "27.0.0", path = "../arrow-cast" } +arrow-data = { version = "27.0.0", path = "../arrow-data" } +arrow-schema = { version = "27.0.0", path = "../arrow-schema" } chrono = { version = "0.4", default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } lazy_static = { version = "1.4", default-features = false } diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index c94bdfd9919a..179bf7a032ed 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-data" -version = "26.0.0" +version = "27.0.0" description = "Array data abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,8 +45,8 @@ force_validate = [] [dependencies] -arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "26.0.0", path = "../arrow-schema" } +arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } +arrow-schema = { version = 
"27.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 085c8c50613e..e243f45f3161 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "26.0.0" +version = "27.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,10 +27,10 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow-array = { version = "26.0.0", path = "../arrow-array" } -arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } -arrow-ipc = { version = "26.0.0", path = "../arrow-ipc" } -arrow-schema = { version = "26.0.0", path = "../arrow-schema" } +arrow-array = { version = "27.0.0", path = "../arrow-array" } +arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } +arrow-ipc = { version = "27.0.0", path = "../arrow-ipc" } +arrow-schema = { version = "27.0.0", path = "../arrow-schema" } base64 = { version = "0.13", default-features = false } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index a515e007f0ab..56560d6710c1 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "26.0.0" +arrow-flight = "27.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. 
diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index f46223996644..79e6825a18b7 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-integration-test" -version = "26.0.0" +version = "27.0.0" description = "Support for the Apache Arrow JSON test data format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,8 +38,8 @@ path = "src/lib.rs" bench = false [dependencies] -arrow = { version = "26.0.0", path = "../arrow", default-features = false } -arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } +arrow = { version = "27.0.0", path = "../arrow", default-features = false } +arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 4562759b2ddf..015a8b7a953b 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests (NOT PUBLISHED TO crates.io)" -version = "26.0.0" +version = "27.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index 52ad5fe2e659..e3205e7a8153 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ipc" -version = "26.0.0" +version = "27.0.0" description = "Support for the Arrow IPC format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "26.0.0", path = "../arrow-array" } -arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "26.0.0", path = "../arrow-cast" } -arrow-data = { version = "26.0.0", path = "../arrow-data" } -arrow-schema = { version = "26.0.0", path = "../arrow-schema" } +arrow-array = { version = "27.0.0", path = "../arrow-array" } +arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "27.0.0", path = "../arrow-cast" } +arrow-data = { version = "27.0.0", path = "../arrow-data" } +arrow-schema = { version = "27.0.0", path = "../arrow-schema" } flatbuffers = { version = "22.9.2", default-features = false, features = ["thiserror"] } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.11.1", default-features = false, optional = true } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 0d8c91092103..dd7064946b57 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-json" -version = "26.0.0" +version = "27.0.0" description = "Support for parsing JSON format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "26.0.0", path = "../arrow-array" } -arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "26.0.0", path = 
"../arrow-cast" } -arrow-data = { version = "26.0.0", path = "../arrow-data" } -arrow-schema = { version = "26.0.0", path = "../arrow-schema" } +arrow-array = { version = "27.0.0", path = "../arrow-array" } +arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "27.0.0", path = "../arrow-cast" } +arrow-data = { version = "27.0.0", path = "../arrow-data" } +arrow-schema = { version = "27.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 03118160280a..5f54f5781160 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "26.0.0" +version = "27.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "26.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "27.0.0", features = ["pyarrow"] } pyo3 = { version = "0.17", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index b248c34fa864..3b809f23ed4f 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-schema" -version = "26.0.0" +version = "27.0.0" description = "Defines the logical types for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml index e895bbcdd78b..07c376e55ddd 100644 --- a/arrow-select/Cargo.toml +++ b/arrow-select/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-select" -version = "26.0.0" +version = "27.0.0" description = "Selection kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } -arrow-data = { version = "26.0.0", path = "../arrow-data" } -arrow-schema = { version = "26.0.0", path = "../arrow-schema" } -arrow-array = { version = "26.0.0", path = "../arrow-array" } +arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } +arrow-data = { version = "27.0.0", path = "../arrow-data" } +arrow-schema = { version = "27.0.0", path = "../arrow-schema" } +arrow-array = { version = "27.0.0", path = "../arrow-array" } num = { version = "0.4", default-features = false, features = ["std"] } [features] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 452cc4bbd2a6..2acad2c17bde 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "26.0.0" +version = "27.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,15 +44,15 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { 
version = "26.0.0", path = "../arrow-array" } -arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "26.0.0", path = "../arrow-cast" } -arrow-csv = { version = "26.0.0", path = "../arrow-csv", optional = true } -arrow-data = { version = "26.0.0", path = "../arrow-data" } -arrow-ipc = { version = "26.0.0", path = "../arrow-ipc", optional = true } -arrow-json = { version = "26.0.0", path = "../arrow-json", optional = true } -arrow-schema = { version = "26.0.0", path = "../arrow-schema" } -arrow-select = { version = "26.0.0", path = "../arrow-select" } +arrow-array = { version = "27.0.0", path = "../arrow-array" } +arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "27.0.0", path = "../arrow-cast" } +arrow-csv = { version = "27.0.0", path = "../arrow-csv", optional = true } +arrow-data = { version = "27.0.0", path = "../arrow-data" } +arrow-ipc = { version = "27.0.0", path = "../arrow-ipc", optional = true } +arrow-json = { version = "27.0.0", path = "../arrow-json", optional = true } +arrow-schema = { version = "27.0.0", path = "../arrow-schema" } +arrow-select = { version = "27.0.0", path = "../arrow-select" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } diff --git a/arrow/README.md b/arrow/README.md index 7bfaad4751d3..c5cd588e87a4 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `26.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `27.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. ## Feature Flags diff --git a/dev/release/README.md b/dev/release/README.md index 093e1c4c29f3..8b7c934b20ee 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. -sed -i '' -e 's/14.0.0/26.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/27.0.0/g' `find . 
-name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 6790ef6fde6f..4f28a073f7bd 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="25.0.0" -FUTURE_RELEASE="26.0.0" +SINCE_TAG="26.0.0" +FUTURE_RELEASE="27.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 65c4009d3c19..b400b01a7d1d 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "26.0.0" +version = "27.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -30,14 +30,14 @@ edition = "2021" rust-version = "1.62" [dependencies] -arrow-array = { version = "26.0.0", path = "../arrow-array", default-features = false, optional = true } -arrow-buffer = { version = "26.0.0", path = "../arrow-buffer", default-features = false, optional = true } -arrow-cast = { version = "26.0.0", path = "../arrow-cast", default-features = false, optional = true } -arrow-csv = { version = "26.0.0", path = "../arrow-csv", default-features = false, optional = true } -arrow-data = { version = "26.0.0", path = "../arrow-data", default-features = false, optional = true } -arrow-schema = { version = "26.0.0", path = "../arrow-schema", default-features = false, optional = true } -arrow-select = { version = "26.0.0", path = "../arrow-select", default-features = false, optional = true } -arrow-ipc = { version = "26.0.0", path = "../arrow-ipc", default-features = false, optional = true } +arrow-array = { version = "27.0.0", path = "../arrow-array", default-features = false, optional = true } +arrow-buffer = { version = "27.0.0", path = "../arrow-buffer", default-features = false, optional = true } +arrow-cast = { version = "27.0.0", path = "../arrow-cast", default-features = false, optional = true } +arrow-csv = { version = "27.0.0", path = "../arrow-csv", default-features = false, optional = true } +arrow-data = { version = "27.0.0", path = "../arrow-data", default-features = false, optional = true } +arrow-schema = { version = "27.0.0", path = "../arrow-schema", default-features = false, optional = true } +arrow-select = { version = "27.0.0", path = "../arrow-select", default-features = false, optional = true } +arrow-ipc = { version = "27.0.0", path = "../arrow-ipc", default-features = false, optional = true } ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } bytes = { version = "1.1", default-features = false, features = ["std"] } @@ -68,7 +68,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.11", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "26.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "27.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", 
default-features = false, features = ["std", "std_rng"] } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index a0b2b6ea1447..c300fb3e5b3d 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "26.0.0" +version = "27.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", features = ["extra-traits"] } -parquet = { path = "../parquet", version = "26.0.0", default-features = false } +parquet = { path = "../parquet", version = "27.0.0", default-features = false } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index dfaba7def7a9..c8fefc72c609 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "26.0.0" -parquet_derive = "26.0.0" +parquet = "27.0.0" +parquet_derive = "27.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index 83204ae7413e..a10d34e86892 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "26.0.0" +version = "27.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "26.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "26.0.0", default-features = false } +parquet = { path = "../parquet", version = "27.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "27.0.0", default-features = false } chrono = { version="0.4.19", default-features = false, features = [ "clock" ] } From ccc44170acadf1efb01b3a536ad48ae03328b3f1 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 12 Nov 2022 21:48:50 -0800 Subject: [PATCH 0263/1411] Fix clippy by avoiding deprecated functions in chrono (#3096) * Fix clippy * Fix test * Trigger Build --- arrow-array/src/delta.rs | 207 ++++++++++++++++++------ arrow-array/src/temporal_conversions.rs | 2 +- arrow-array/src/timezone.rs | 34 ++-- arrow-array/src/types.rs | 8 +- arrow-cast/src/cast.rs | 16 +- arrow-cast/src/parse.rs | 16 +- arrow-csv/src/reader.rs | 24 +-- arrow/benches/cast_kernels.rs | 6 +- arrow/src/compute/kernels/arithmetic.rs | 24 +-- parquet/src/record/api.rs | 21 ++- 10 files changed, 248 insertions(+), 110 deletions(-) diff --git a/arrow-array/src/delta.rs b/arrow-array/src/delta.rs index b7efdab0a48d..b9b7a11e2d44 100644 --- a/arrow-array/src/delta.rs +++ b/arrow-array/src/delta.rs @@ -105,75 +105,186 @@ mod tests { #[test] fn test_shift_months() { - let base = NaiveDate::from_ymd(2020, 1, 31); - - assert_eq!(shift_months(base, 0), NaiveDate::from_ymd(2020, 1, 31)); - assert_eq!(shift_months(base, 1), NaiveDate::from_ymd(2020, 2, 29)); - assert_eq!(shift_months(base, 2), NaiveDate::from_ymd(2020, 3, 31)); - assert_eq!(shift_months(base, 3), NaiveDate::from_ymd(2020, 4, 30)); - assert_eq!(shift_months(base, 4), NaiveDate::from_ymd(2020, 5, 31)); - assert_eq!(shift_months(base, 5), NaiveDate::from_ymd(2020, 6, 30)); - 
assert_eq!(shift_months(base, 6), NaiveDate::from_ymd(2020, 7, 31)); - assert_eq!(shift_months(base, 7), NaiveDate::from_ymd(2020, 8, 31)); - assert_eq!(shift_months(base, 8), NaiveDate::from_ymd(2020, 9, 30)); - assert_eq!(shift_months(base, 9), NaiveDate::from_ymd(2020, 10, 31)); - assert_eq!(shift_months(base, 10), NaiveDate::from_ymd(2020, 11, 30)); - assert_eq!(shift_months(base, 11), NaiveDate::from_ymd(2020, 12, 31)); - assert_eq!(shift_months(base, 12), NaiveDate::from_ymd(2021, 1, 31)); - assert_eq!(shift_months(base, 13), NaiveDate::from_ymd(2021, 2, 28)); - - assert_eq!(shift_months(base, -1), NaiveDate::from_ymd(2019, 12, 31)); - assert_eq!(shift_months(base, -2), NaiveDate::from_ymd(2019, 11, 30)); - assert_eq!(shift_months(base, -3), NaiveDate::from_ymd(2019, 10, 31)); - assert_eq!(shift_months(base, -4), NaiveDate::from_ymd(2019, 9, 30)); - assert_eq!(shift_months(base, -5), NaiveDate::from_ymd(2019, 8, 31)); - assert_eq!(shift_months(base, -6), NaiveDate::from_ymd(2019, 7, 31)); - assert_eq!(shift_months(base, -7), NaiveDate::from_ymd(2019, 6, 30)); - assert_eq!(shift_months(base, -8), NaiveDate::from_ymd(2019, 5, 31)); - assert_eq!(shift_months(base, -9), NaiveDate::from_ymd(2019, 4, 30)); - assert_eq!(shift_months(base, -10), NaiveDate::from_ymd(2019, 3, 31)); - assert_eq!(shift_months(base, -11), NaiveDate::from_ymd(2019, 2, 28)); - assert_eq!(shift_months(base, -12), NaiveDate::from_ymd(2019, 1, 31)); - assert_eq!(shift_months(base, -13), NaiveDate::from_ymd(2018, 12, 31)); - - assert_eq!(shift_months(base, 1265), NaiveDate::from_ymd(2125, 6, 30)); + let base = NaiveDate::from_ymd_opt(2020, 1, 31).unwrap(); + + assert_eq!( + shift_months(base, 0), + NaiveDate::from_ymd_opt(2020, 1, 31).unwrap() + ); + assert_eq!( + shift_months(base, 1), + NaiveDate::from_ymd_opt(2020, 2, 29).unwrap() + ); + assert_eq!( + shift_months(base, 2), + NaiveDate::from_ymd_opt(2020, 3, 31).unwrap() + ); + assert_eq!( + shift_months(base, 3), + NaiveDate::from_ymd_opt(2020, 4, 30).unwrap() + ); + assert_eq!( + shift_months(base, 4), + NaiveDate::from_ymd_opt(2020, 5, 31).unwrap() + ); + assert_eq!( + shift_months(base, 5), + NaiveDate::from_ymd_opt(2020, 6, 30).unwrap() + ); + assert_eq!( + shift_months(base, 6), + NaiveDate::from_ymd_opt(2020, 7, 31).unwrap() + ); + assert_eq!( + shift_months(base, 7), + NaiveDate::from_ymd_opt(2020, 8, 31).unwrap() + ); + assert_eq!( + shift_months(base, 8), + NaiveDate::from_ymd_opt(2020, 9, 30).unwrap() + ); + assert_eq!( + shift_months(base, 9), + NaiveDate::from_ymd_opt(2020, 10, 31).unwrap() + ); + assert_eq!( + shift_months(base, 10), + NaiveDate::from_ymd_opt(2020, 11, 30).unwrap() + ); + assert_eq!( + shift_months(base, 11), + NaiveDate::from_ymd_opt(2020, 12, 31).unwrap() + ); + assert_eq!( + shift_months(base, 12), + NaiveDate::from_ymd_opt(2021, 1, 31).unwrap() + ); + assert_eq!( + shift_months(base, 13), + NaiveDate::from_ymd_opt(2021, 2, 28).unwrap() + ); + + assert_eq!( + shift_months(base, -1), + NaiveDate::from_ymd_opt(2019, 12, 31).unwrap() + ); + assert_eq!( + shift_months(base, -2), + NaiveDate::from_ymd_opt(2019, 11, 30).unwrap() + ); + assert_eq!( + shift_months(base, -3), + NaiveDate::from_ymd_opt(2019, 10, 31).unwrap() + ); + assert_eq!( + shift_months(base, -4), + NaiveDate::from_ymd_opt(2019, 9, 30).unwrap() + ); + assert_eq!( + shift_months(base, -5), + NaiveDate::from_ymd_opt(2019, 8, 31).unwrap() + ); + assert_eq!( + shift_months(base, -6), + NaiveDate::from_ymd_opt(2019, 7, 31).unwrap() + ); + assert_eq!( + shift_months(base, -7), 
+ NaiveDate::from_ymd_opt(2019, 6, 30).unwrap() + ); + assert_eq!( + shift_months(base, -8), + NaiveDate::from_ymd_opt(2019, 5, 31).unwrap() + ); + assert_eq!( + shift_months(base, -9), + NaiveDate::from_ymd_opt(2019, 4, 30).unwrap() + ); + assert_eq!( + shift_months(base, -10), + NaiveDate::from_ymd_opt(2019, 3, 31).unwrap() + ); + assert_eq!( + shift_months(base, -11), + NaiveDate::from_ymd_opt(2019, 2, 28).unwrap() + ); + assert_eq!( + shift_months(base, -12), + NaiveDate::from_ymd_opt(2019, 1, 31).unwrap() + ); + assert_eq!( + shift_months(base, -13), + NaiveDate::from_ymd_opt(2018, 12, 31).unwrap() + ); + + assert_eq!( + shift_months(base, 1265), + NaiveDate::from_ymd_opt(2125, 6, 30).unwrap() + ); } #[test] fn test_shift_months_with_overflow() { - let base = NaiveDate::from_ymd(2020, 12, 31); + let base = NaiveDate::from_ymd_opt(2020, 12, 31).unwrap(); assert_eq!(shift_months(base, 0), base); - assert_eq!(shift_months(base, 1), NaiveDate::from_ymd(2021, 1, 31)); - assert_eq!(shift_months(base, 2), NaiveDate::from_ymd(2021, 2, 28)); - assert_eq!(shift_months(base, 12), NaiveDate::from_ymd(2021, 12, 31)); - assert_eq!(shift_months(base, 18), NaiveDate::from_ymd(2022, 6, 30)); - - assert_eq!(shift_months(base, -1), NaiveDate::from_ymd(2020, 11, 30)); - assert_eq!(shift_months(base, -2), NaiveDate::from_ymd(2020, 10, 31)); - assert_eq!(shift_months(base, -10), NaiveDate::from_ymd(2020, 2, 29)); - assert_eq!(shift_months(base, -12), NaiveDate::from_ymd(2019, 12, 31)); - assert_eq!(shift_months(base, -18), NaiveDate::from_ymd(2019, 6, 30)); + assert_eq!( + shift_months(base, 1), + NaiveDate::from_ymd_opt(2021, 1, 31).unwrap() + ); + assert_eq!( + shift_months(base, 2), + NaiveDate::from_ymd_opt(2021, 2, 28).unwrap() + ); + assert_eq!( + shift_months(base, 12), + NaiveDate::from_ymd_opt(2021, 12, 31).unwrap() + ); + assert_eq!( + shift_months(base, 18), + NaiveDate::from_ymd_opt(2022, 6, 30).unwrap() + ); + + assert_eq!( + shift_months(base, -1), + NaiveDate::from_ymd_opt(2020, 11, 30).unwrap() + ); + assert_eq!( + shift_months(base, -2), + NaiveDate::from_ymd_opt(2020, 10, 31).unwrap() + ); + assert_eq!( + shift_months(base, -10), + NaiveDate::from_ymd_opt(2020, 2, 29).unwrap() + ); + assert_eq!( + shift_months(base, -12), + NaiveDate::from_ymd_opt(2019, 12, 31).unwrap() + ); + assert_eq!( + shift_months(base, -18), + NaiveDate::from_ymd_opt(2019, 6, 30).unwrap() + ); } #[test] fn test_shift_months_datetime() { - let date = NaiveDate::from_ymd(2020, 1, 31); - let o_clock = NaiveTime::from_hms(1, 2, 3); + let date = NaiveDate::from_ymd_opt(2020, 1, 31).unwrap(); + let o_clock = NaiveTime::from_hms_opt(1, 2, 3).unwrap(); let base = NaiveDateTime::new(date, o_clock); assert_eq!( shift_months(base, 0).date(), - NaiveDate::from_ymd(2020, 1, 31) + NaiveDate::from_ymd_opt(2020, 1, 31).unwrap() ); assert_eq!( shift_months(base, 1).date(), - NaiveDate::from_ymd(2020, 2, 29) + NaiveDate::from_ymd_opt(2020, 2, 29).unwrap() ); assert_eq!( shift_months(base, 2).date(), - NaiveDate::from_ymd(2020, 3, 31) + NaiveDate::from_ymd_opt(2020, 3, 31).unwrap() ); assert_eq!(shift_months(base, 0).time(), o_clock); assert_eq!(shift_months(base, 1).time(), o_clock); diff --git a/arrow-array/src/temporal_conversions.rs b/arrow-array/src/temporal_conversions.rs index a4d910cc8bc1..f1f3f36d3c61 100644 --- a/arrow-array/src/temporal_conversions.rs +++ b/arrow-array/src/temporal_conversions.rs @@ -252,7 +252,7 @@ pub fn as_time(v: i64) -> Option { _ => None, }, DataType::Timestamp(_, _) => 
as_datetime::(v).map(|datetime| datetime.time()), - DataType::Date32 | DataType::Date64 => Some(NaiveTime::from_hms(0, 0, 0)), + DataType::Date32 | DataType::Date64 => NaiveTime::from_hms_opt(0, 0, 0), DataType::Interval(_) => None, _ => None, } diff --git a/arrow-array/src/timezone.rs b/arrow-array/src/timezone.rs index 7bd597904737..fd8c099c2091 100644 --- a/arrow-array/src/timezone.rs +++ b/arrow-array/src/timezone.rs @@ -158,8 +158,8 @@ mod private { #[test] fn test_with_timezone() { let vals = [ - Utc.timestamp_millis(37800000), - Utc.timestamp_millis(86339000), + Utc.timestamp_millis_opt(37800000).unwrap(), + Utc.timestamp_millis_opt(86339000).unwrap(), ]; assert_eq!(10, vals[0].hour()); @@ -175,8 +175,8 @@ mod private { fn test_using_chrono_tz_and_utc_naive_date_time() { let sydney_tz = "Australia/Sydney".to_string(); let tz: Tz = sydney_tz.parse().unwrap(); - let sydney_offset_without_dst = FixedOffset::east(10 * 60 * 60); - let sydney_offset_with_dst = FixedOffset::east(11 * 60 * 60); + let sydney_offset_without_dst = FixedOffset::east_opt(10 * 60 * 60).unwrap(); + let sydney_offset_with_dst = FixedOffset::east_opt(11 * 60 * 60).unwrap(); // Daylight savings ends // When local daylight time was about to reach // Sunday, 4 April 2021, 3:00:00 am clocks were turned backward 1 hour to @@ -188,32 +188,40 @@ mod private { // Sunday, 3 October 2021, 3:00:00 am local daylight time instead. // Sydney 2021-04-04T02:30:00+11:00 is 2021-04-03T15:30:00Z - let utc_just_before_sydney_dst_ends = - NaiveDate::from_ymd(2021, 4, 3).and_hms_nano(15, 30, 0, 0); + let utc_just_before_sydney_dst_ends = NaiveDate::from_ymd_opt(2021, 4, 3) + .unwrap() + .and_hms_nano_opt(15, 30, 0, 0) + .unwrap(); assert_eq!( tz.offset_from_utc_datetime(&utc_just_before_sydney_dst_ends) .fix(), sydney_offset_with_dst ); // Sydney 2021-04-04T02:30:00+10:00 is 2021-04-03T16:30:00Z - let utc_just_after_sydney_dst_ends = - NaiveDate::from_ymd(2021, 4, 3).and_hms_nano(16, 30, 0, 0); + let utc_just_after_sydney_dst_ends = NaiveDate::from_ymd_opt(2021, 4, 3) + .unwrap() + .and_hms_nano_opt(16, 30, 0, 0) + .unwrap(); assert_eq!( tz.offset_from_utc_datetime(&utc_just_after_sydney_dst_ends) .fix(), sydney_offset_without_dst ); // Sydney 2021-10-03T01:30:00+10:00 is 2021-10-02T15:30:00Z - let utc_just_before_sydney_dst_starts = - NaiveDate::from_ymd(2021, 10, 2).and_hms_nano(15, 30, 0, 0); + let utc_just_before_sydney_dst_starts = NaiveDate::from_ymd_opt(2021, 10, 2) + .unwrap() + .and_hms_nano_opt(15, 30, 0, 0) + .unwrap(); assert_eq!( tz.offset_from_utc_datetime(&utc_just_before_sydney_dst_starts) .fix(), sydney_offset_without_dst ); // Sydney 2021-04-04T03:30:00+11:00 is 2021-10-02T16:30:00Z - let utc_just_after_sydney_dst_starts = - NaiveDate::from_ymd(2022, 10, 2).and_hms_nano(16, 30, 0, 0); + let utc_just_after_sydney_dst_starts = NaiveDate::from_ymd_opt(2022, 10, 2) + .unwrap() + .and_hms_nano_opt(16, 30, 0, 0) + .unwrap(); assert_eq!( tz.offset_from_utc_datetime(&utc_just_after_sydney_dst_starts) .fix(), @@ -300,7 +308,7 @@ mod tests { #[test] fn test_with_offset() { - let t = NaiveDate::from_ymd(2000, 1, 1); + let t = NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(); let tz: Tz = "-00:00".parse().unwrap(); assert_eq!(tz.offset_from_utc_date(&t).fix().local_minus_utc(), 0); diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 03ecef361b04..dd4d1ba4292b 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -327,7 +327,7 @@ impl Date32Type { /// /// * `i` - The Date32Type to convert pub fn 
to_naive_date(i: ::Native) -> NaiveDate { - let epoch = NaiveDate::from_ymd(1970, 1, 1); + let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); epoch.add(Duration::days(i as i64)) } @@ -337,7 +337,7 @@ impl Date32Type { /// /// * `d` - The NaiveDate to convert pub fn from_naive_date(d: NaiveDate) -> ::Native { - let epoch = NaiveDate::from_ymd(1970, 1, 1); + let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); d.sub(epoch).num_days() as ::Native } @@ -400,7 +400,7 @@ impl Date64Type { /// /// * `i` - The Date64Type to convert pub fn to_naive_date(i: ::Native) -> NaiveDate { - let epoch = NaiveDate::from_ymd(1970, 1, 1); + let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); epoch.add(Duration::milliseconds(i as i64)) } @@ -410,7 +410,7 @@ impl Date64Type { /// /// * `d` - The NaiveDate to convert pub fn from_naive_date(d: NaiveDate) -> ::Native { - let epoch = NaiveDate::from_ymd(1970, 1, 1); + let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); d.sub(epoch).num_milliseconds() as ::Native } diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index bbd38fbc0267..b3c0aaa82031 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -6120,7 +6120,7 @@ mod tests { #[test] fn test_cast_utf8_to_date32() { use chrono::NaiveDate; - let from_ymd = chrono::NaiveDate::from_ymd; + let from_ymd = chrono::NaiveDate::from_ymd_opt; let since = chrono::NaiveDate::signed_duration_since; let a = StringArray::from(vec![ @@ -6135,13 +6135,19 @@ mod tests { let c = b.as_any().downcast_ref::().unwrap(); // test valid inputs - let date_value = since(NaiveDate::from_ymd(2000, 1, 1), from_ymd(1970, 1, 1)) - .num_days() as i32; + let date_value = since( + NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(), + from_ymd(1970, 1, 1).unwrap(), + ) + .num_days() as i32; assert!(c.is_valid(0)); // "2000-01-01" assert_eq!(date_value, c.value(0)); - let date_value = since(NaiveDate::from_ymd(2000, 2, 2), from_ymd(1970, 1, 1)) - .num_days() as i32; + let date_value = since( + NaiveDate::from_ymd_opt(2000, 2, 2).unwrap(), + from_ymd(1970, 1, 1).unwrap(), + ) + .num_days() as i32; assert!(c.is_valid(1)); // "2000-2-2" assert_eq!(date_value, c.value(1)); diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 126beb902a55..b93d6c800240 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -305,8 +305,8 @@ mod tests { // timezone the test machine is running. 
Thus it is still // somewhat susceptible to bugs in the use of chrono let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd(2020, 9, 8), - NaiveTime::from_hms_nano(13, 42, 29, 190855000), + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), ); // Ensure both T and ' ' variants work @@ -323,8 +323,8 @@ mod tests { // Also ensure that parsing timestamps with no fractional // second part works as well let naive_datetime_whole_secs = NaiveDateTime::new( - NaiveDate::from_ymd(2020, 9, 8), - NaiveTime::from_hms(13, 42, 29), + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_opt(13, 42, 29).unwrap(), ); // Ensure both T and ' ' variants work @@ -380,8 +380,8 @@ mod tests { // string without timezone should always output the same regardless the local or session timezone let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd(2020, 9, 8), - NaiveTime::from_hms_nano(13, 42, 29, 190855000), + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), ); // Ensure both T and ' ' variants work @@ -396,8 +396,8 @@ mod tests { ); let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd(2020, 9, 8), - NaiveTime::from_hms_nano(13, 42, 29, 0), + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_nano_opt(13, 42, 29, 0).unwrap(), ); // Ensure both T and ' ' variants work diff --git a/arrow-csv/src/reader.rs b/arrow-csv/src/reader.rs index 2fb6493e1be6..0bf05960a37d 100644 --- a/arrow-csv/src/reader.rs +++ b/arrow-csv/src/reader.rs @@ -1701,8 +1701,8 @@ mod tests { 0 ); let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd(2018, 11, 13), - NaiveTime::from_hms_nano(17, 11, 10, 0), + NaiveDate::from_ymd_opt(2018, 11, 13).unwrap(), + NaiveTime::from_hms_nano_opt(17, 11, 10, 0).unwrap(), ); assert_eq!( parse_item::("2018-11-13T17:11:10").unwrap(), @@ -1713,16 +1713,16 @@ mod tests { naive_datetime.timestamp_nanos() / 1000 ); let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd(2018, 11, 13), - NaiveTime::from_hms_nano(17, 11, 10, 11000000), + NaiveDate::from_ymd_opt(2018, 11, 13).unwrap(), + NaiveTime::from_hms_nano_opt(17, 11, 10, 11000000).unwrap(), ); assert_eq!( parse_item::("2018-11-13T17:11:10.011").unwrap(), naive_datetime.timestamp_nanos() / 1000 ); let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd(1900, 2, 28), - NaiveTime::from_hms_nano(12, 34, 56, 0), + NaiveDate::from_ymd_opt(1900, 2, 28).unwrap(), + NaiveTime::from_hms_nano_opt(12, 34, 56, 0).unwrap(), ); assert_eq!( parse_item::("1900-02-28T12:34:56").unwrap(), @@ -1737,8 +1737,8 @@ mod tests { 0 ); let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd(2018, 11, 13), - NaiveTime::from_hms_nano(17, 11, 10, 0), + NaiveDate::from_ymd_opt(2018, 11, 13).unwrap(), + NaiveTime::from_hms_nano_opt(17, 11, 10, 0).unwrap(), ); assert_eq!( parse_item::("2018-11-13T17:11:10").unwrap(), @@ -1749,16 +1749,16 @@ mod tests { naive_datetime.timestamp_nanos() ); let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd(2018, 11, 13), - NaiveTime::from_hms_nano(17, 11, 10, 11000000), + NaiveDate::from_ymd_opt(2018, 11, 13).unwrap(), + NaiveTime::from_hms_nano_opt(17, 11, 10, 11000000).unwrap(), ); assert_eq!( parse_item::("2018-11-13T17:11:10.011").unwrap(), naive_datetime.timestamp_nanos() ); let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd(1900, 2, 28), - NaiveTime::from_hms_nano(12, 34, 56, 0), + NaiveDate::from_ymd_opt(1900, 2, 28).unwrap(), 
+ NaiveTime::from_hms_nano_opt(12, 34, 56, 0).unwrap(), ); assert_eq!( parse_item::("1900-02-28T12:34:56").unwrap(), diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index 2c3d8cd1678a..e93c7860885c 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -52,7 +52,8 @@ fn build_utf8_date_array(size: usize, with_nulls: bool) -> ArrayRef { if with_nulls && rng.gen::() > 0.8 { builder.append_null(); } else { - let string = NaiveDate::from_num_days_from_ce(rng.sample(range)) + let string = NaiveDate::from_num_days_from_ce_opt(rng.sample(range)) + .unwrap() .format("%Y-%m-%d") .to_string(); builder.append_value(&string); @@ -73,7 +74,8 @@ fn build_utf8_date_time_array(size: usize, with_nulls: bool) -> ArrayRef { if with_nulls && rng.gen::() > 0.8 { builder.append_null(); } else { - let string = NaiveDateTime::from_timestamp(rng.sample(range), 0) + let string = NaiveDateTime::from_timestamp_opt(rng.sample(range), 0) + .unwrap() .format("%Y-%m-%dT%H:%M:%S") .to_string(); builder.append_value(&string); diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index 328ce02e4f5d..a99a90204b7f 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -1644,7 +1644,7 @@ mod tests { #[test] fn test_date32_month_add() { let a = Date32Array::from(vec![Date32Type::from_naive_date( - NaiveDate::from_ymd(2000, 1, 1), + NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(), )]); let b = IntervalYearMonthArray::from(vec![IntervalYearMonthType::make_value(1, 2)]); @@ -1652,28 +1652,28 @@ mod tests { let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( c.value(0), - Date32Type::from_naive_date(NaiveDate::from_ymd(2001, 3, 1)) + Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2001, 3, 1).unwrap()) ); } #[test] fn test_date32_day_time_add() { let a = Date32Array::from(vec![Date32Type::from_naive_date( - NaiveDate::from_ymd(2000, 1, 1), + NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(), )]); let b = IntervalDayTimeArray::from(vec![IntervalDayTimeType::make_value(1, 2)]); let c = add_dyn(&a, &b).unwrap(); let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( c.value(0), - Date32Type::from_naive_date(NaiveDate::from_ymd(2000, 1, 2)) + Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) ); } #[test] fn test_date32_month_day_nano_add() { let a = Date32Array::from(vec![Date32Type::from_naive_date( - NaiveDate::from_ymd(2000, 1, 1), + NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(), )]); let b = IntervalMonthDayNanoArray::from(vec![IntervalMonthDayNanoType::make_value( @@ -1683,14 +1683,14 @@ mod tests { let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( c.value(0), - Date32Type::from_naive_date(NaiveDate::from_ymd(2000, 2, 3)) + Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) ); } #[test] fn test_date64_month_add() { let a = Date64Array::from(vec![Date64Type::from_naive_date( - NaiveDate::from_ymd(2000, 1, 1), + NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(), )]); let b = IntervalYearMonthArray::from(vec![IntervalYearMonthType::make_value(1, 2)]); @@ -1698,28 +1698,28 @@ mod tests { let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( c.value(0), - Date64Type::from_naive_date(NaiveDate::from_ymd(2001, 3, 1)) + Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2001, 3, 1).unwrap()) ); } #[test] fn test_date64_day_time_add() { let a = Date64Array::from(vec![Date64Type::from_naive_date( - NaiveDate::from_ymd(2000, 1, 1), + 
NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(), )]); let b = IntervalDayTimeArray::from(vec![IntervalDayTimeType::make_value(1, 2)]); let c = add_dyn(&a, &b).unwrap(); let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( c.value(0), - Date64Type::from_naive_date(NaiveDate::from_ymd(2000, 1, 2)) + Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) ); } #[test] fn test_date64_month_day_nano_add() { let a = Date64Array::from(vec![Date64Type::from_naive_date( - NaiveDate::from_ymd(2000, 1, 1), + NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(), )]); let b = IntervalMonthDayNanoArray::from(vec![IntervalMonthDayNanoType::make_value( @@ -1729,7 +1729,7 @@ mod tests { let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( c.value(0), - Date64Type::from_naive_date(NaiveDate::from_ymd(2000, 2, 3)) + Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) ); } diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index d7e1e7550f00..02cb94765cf6 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -795,7 +795,9 @@ impl fmt::Display for Field { #[inline] fn convert_date_to_string(value: u32) -> String { static NUM_SECONDS_IN_DAY: i64 = 60 * 60 * 24; - let dt = Utc.timestamp(value as i64 * NUM_SECONDS_IN_DAY, 0).date(); + let dt = Utc + .timestamp_opt(value as i64 * NUM_SECONDS_IN_DAY, 0) + .unwrap(); format!("{}", dt.format("%Y-%m-%d %:z")) } @@ -804,7 +806,7 @@ fn convert_date_to_string(value: u32) -> String { /// Datetime is displayed in local timezone. #[inline] fn convert_timestamp_secs_to_string(value: i64) -> String { - let dt = Utc.timestamp(value, 0); + let dt = Utc.timestamp_opt(value, 0).unwrap(); format!("{}", dt.format("%Y-%m-%d %H:%M:%S %:z")) } @@ -1075,7 +1077,10 @@ mod tests { #[test] fn test_convert_date_to_string() { fn check_date_conversion(y: u32, m: u32, d: u32) { - let datetime = chrono::NaiveDate::from_ymd(y as i32, m, d).and_hms(0, 0, 0); + let datetime = chrono::NaiveDate::from_ymd_opt(y as i32, m, d) + .unwrap() + .and_hms_opt(0, 0, 0) + .unwrap(); let dt = Utc.from_utc_datetime(&datetime); let res = convert_date_to_string((dt.timestamp() / 60 / 60 / 24) as u32); let exp = format!("{}", dt.format("%Y-%m-%d %:z")); @@ -1092,7 +1097,10 @@ mod tests { #[test] fn test_convert_timestamp_millis_to_string() { fn check_datetime_conversion(y: u32, m: u32, d: u32, h: u32, mi: u32, s: u32) { - let datetime = chrono::NaiveDate::from_ymd(y as i32, m, d).and_hms(h, mi, s); + let datetime = chrono::NaiveDate::from_ymd_opt(y as i32, m, d) + .unwrap() + .and_hms_opt(h, mi, s) + .unwrap(); let dt = Utc.from_utc_datetime(&datetime); let res = convert_timestamp_millis_to_string(dt.timestamp_millis() as u64); let exp = format!("{}", dt.format("%Y-%m-%d %H:%M:%S %:z")); @@ -1110,7 +1118,10 @@ mod tests { #[test] fn test_convert_timestamp_micros_to_string() { fn check_datetime_conversion(y: u32, m: u32, d: u32, h: u32, mi: u32, s: u32) { - let datetime = chrono::NaiveDate::from_ymd(y as i32, m, d).and_hms(h, mi, s); + let datetime = chrono::NaiveDate::from_ymd_opt(y as i32, m, d) + .unwrap() + .and_hms_opt(h, mi, s) + .unwrap(); let dt = Utc.from_utc_datetime(&datetime); let res = convert_timestamp_micros_to_string(dt.timestamp_micros() as u64); let exp = format!("{}", dt.format("%Y-%m-%d %H:%M:%S %:z")); From aaf030f79416cc407535bc15d2893f4706268fe3 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Sun, 13 Nov 2022 17:14:25 +1100 Subject: [PATCH 0264/1411] Fix prettyprint for 
Interval second fractions (#3093) Co-authored-by: Raphael Taylor-Davies --- arrow-cast/src/display.rs | 4 +- arrow/src/util/pretty.rs | 84 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index b29f844fb677..ae1c799a4ef8 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -89,7 +89,7 @@ macro_rules! make_string_interval_day_time { let mins = mins - (hours * 60); format!( - "0 years 0 mons {} days {} hours {} mins {}.{:02} secs", + "0 years 0 mons {} days {} hours {} mins {}.{:03} secs", days_parts, hours, mins, @@ -127,7 +127,7 @@ macro_rules! make_string_interval_month_day_nano { let mins = mins - (hours * 60); format!( - "0 years {} mons {} days {} hours {} mins {}.{:02} secs", + "0 years {} mons {} days {} hours {} mins {}.{:09} secs", months_part, days_part, hours, diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs index 63d5977e21c5..c98c8a649cb5 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow/src/util/pretty.rs @@ -987,4 +987,88 @@ mod tests { Ok(()) } + + #[test] + fn test_pretty_format_interval_day_time() -> Result<()> { + let arr = Arc::new(arrow_array::IntervalDayTimeArray::from(vec![ + Some(1), + Some(10), + Some(100), + ])); + + let schema = Arc::new(Schema::new(vec![Field::new( + "IntervalDayTime", + arr.data_type().clone(), + true, + )])); + + let batch = RecordBatch::try_new(schema, vec![arr])?; + + let table = pretty_format_batches(&[batch])?.to_string(); + + let expected = vec![ + "+-------------------------------------------------+", + "| IntervalDayTime |", + "+-------------------------------------------------+", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.001 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.010 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.100 secs |", + "+-------------------------------------------------+", + ]; + + let actual: Vec<&str> = table.lines().collect(); + + assert_eq!(expected, actual, "Actual result:\n{}", table); + + Ok(()) + } + + #[test] + fn test_pretty_format_interval_month_day_nano_array() -> Result<()> { + let arr = Arc::new(arrow_array::IntervalMonthDayNanoArray::from(vec![ + Some(1), + Some(10), + Some(100), + Some(1_000), + Some(10_000), + Some(100_000), + Some(1_000_000), + Some(10_000_000), + Some(100_000_000), + Some(1_000_000_000), + ])); + + let schema = Arc::new(Schema::new(vec![Field::new( + "IntervalMonthDayNano", + arr.data_type().clone(), + true, + )])); + + let batch = RecordBatch::try_new(schema, vec![arr])?; + + let table = pretty_format_batches(&[batch])?.to_string(); + + let expected = vec![ + "+-------------------------------------------------------+", + "| IntervalMonthDayNano |", + "+-------------------------------------------------------+", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000001 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000010 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000100 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.000001000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.000010000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.000100000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.001000000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.010000000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.100000000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 1.000000000 secs |", + "+-------------------------------------------------------+", + ]; + + let actual: Vec<&str> = 
table.lines().collect(); + + assert_eq!(expected, actual, "Actual result:\n{}", table); + + Ok(()) + } } From c7210ce2b5190eba83afe42d078b5aac0cfbd7cf Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 13 Nov 2022 01:14:42 -0500 Subject: [PATCH 0265/1411] Minor: Add diagrams and documentation to row format (#3094) Co-authored-by: Raphael Taylor-Davies --- arrow/src/row/mod.rs | 191 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 162 insertions(+), 29 deletions(-) diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 4dd2a33c0bdc..1d0a58d954bf 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -15,15 +15,42 @@ // specific language governing permissions and limitations // under the License. -//! A comparable row-oriented representation of a collection of [`Array`] +//! A comparable row-oriented representation of a collection of [`Array`]. //! -//! As [`Row`] are [normalized for sorting], they can be very efficiently [compared](PartialOrd), +//! [`Row`]s are [normalized for sorting], and can be very efficiently [compared], //! using [`memcmp`] under the hood, or used in [non-comparison sorts] such as [radix sort]. This //! makes the row format ideal for implementing efficient multi-column sorting, //! grouping, aggregation, windowing and more. //! -//! _Comparing [`Rows`] generated by different [`RowConverter`] is not guaranteed to -//! yield a meaningful ordering_ +//! The format is described in more detail on [`RowConverter`] as well as the +//! [Fast and Memory Efficient Multi-Column Sorts in Apache Arrow Rust](https://arrow.apache.org/blog/2022/11/07/multi-column-sorts-in-arrow-rust-part-1/) article. +//! +//! _[`Rows`] generated by different [`RowConverter`] are arbitrarily +//! ordered. The same [`RowConverter`] must be used for the comparison +//! to be well defined._ +//! +//! For example, given three input [`Array`]s, this code creates byte +//! sequences that [compare] the same as when using [`lexsort`]. +//! +//! ```text +//! ┌─────┐ ┌─────┐ ┌─────┐ +//! │ │ │ │ │ │ +//! ├─────┤ ┌ ┼─────┼ ─ ┼─────┼ ┐ ┏━━━━━━━━━━━━━┓ +//! │ │ │ │ │ │ ─────────────▶┃ ┃ +//! ├─────┤ └ ┼─────┼ ─ ┼─────┼ ┘ ┗━━━━━━━━━━━━━┛ +//! │ │ │ │ │ │ +//! └─────┘ └─────┘ └─────┘ +//! ... +//! ┌─────┐ ┌ ┬─────┬ ─ ┬─────┬ ┐ ┏━━━━━━━━┓ +//! │ │ │ │ │ │ ─────────────▶┃ ┃ +//! └─────┘ └ ┴─────┴ ─ ┴─────┴ ┘ ┗━━━━━━━━┛ +//! UInt64 Utf8 F64 +//! +//! Input Arrays Row Format +//! (Columns) +//! ``` +//! +//! # Basic Example //! ``` //! # use std::sync::Arc; //! # use arrow::row::{RowConverter, SortField}; @@ -73,7 +100,9 @@ //! assert_eq!(&c2_values, &["a", "f", "c", "e"]); //! ``` //! -//! It can also be used to implement a fast multi-column / lexicographic sort +//! # Lexsort +//! +//! The row format can also be used to implement a fast multi-column / lexicographic sort //! //! ``` //! # use arrow::row::{RowConverter, SortField}; @@ -95,6 +124,9 @@ //! [radix sort]:[https://en.wikipedia.org/wiki/Radix_sort] //! [normalized for sorting]:[https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf] //! [`memcmp`]:[https://www.man7.org/linux/man-pages/man3/memcmp.3.html] +//! [`lexsort`]: crate::compute::kernels::sort::lexsort +//! [compared]: PartialOrd +//! [compare]: PartialOrd use std::cmp::Ordering; use std::hash::{Hash, Hasher}; @@ -119,38 +151,75 @@ mod fixed; mod interner; mod variable; -/// Converts [`ArrayRef`] columns into a row-oriented format. +/// Converts [`ArrayRef`] columns into a [row-oriented](self) format. 
+/// +/// *Note: The encoding of the row format may change from release to release.* +/// +/// ## Overview /// -/// # Format +/// The row format is a variable length byte sequence created by +/// concatenating the encoded form of each column. The encoding for +/// each column depends on its datatype (and sort options). /// -/// The encoding of the row format should not be considered stable, but is documented here -/// for reference. +/// The encoding is carefully designed in such a way that escaping is +/// unnecessary: it is never ambiguous as to whether a byte is part of +/// a sentinel (e.g. null) or a value. /// /// ## Unsigned Integer Encoding /// /// A null integer is encoded as a `0_u8`, followed by a zero-ed number of bytes corresponding -/// to the integer's length +/// to the integer's length. /// /// A valid integer is encoded as `1_u8`, followed by the big-endian representation of the -/// integer +/// integer. +/// +/// ```text +/// ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┬──┐ +/// 3 │03│00│00│00│ │01│00│00│00│03│ +/// └──┴──┴──┴──┘ └──┴──┴──┴──┴──┘ +/// ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┬──┐ +/// 258 │02│01│00│00│ │01│00│00│01│02│ +/// └──┴──┴──┴──┘ └──┴──┴──┴──┴──┘ +/// ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┬──┐ +/// 23423 │7F│5B│00│00│ │01│00│00│5B│7F│ +/// └──┴──┴──┴──┘ └──┴──┴──┴──┴──┘ +/// ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┬──┐ +/// NULL │??│??│??│??│ │00│00│00│00│00│ +/// └──┴──┴──┴──┘ └──┴──┴──┴──┴──┘ +/// +/// 32-bit (4 bytes) Row Format +/// Value Little Endian +/// ``` /// /// ## Signed Integer Encoding /// /// Signed integers have their most significant sign bit flipped, and are then encoded in the -/// same manner as an unsigned integer +/// same manner as an unsigned integer. +/// +/// ```text +/// ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┬──┐ +/// 5 │05│00│00│00│ │05│00│00│80│ │01│80│00│00│05│ +/// └──┴──┴──┴──┘ └──┴──┴──┴──┘ └──┴──┴──┴──┴──┘ +/// ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┐ ┌──┬──┬──┬──┬──┐ +/// -5 │FB│FF│FF│FF│ │FB│FF│FF│7F│ │01│7F│FF│FF│FB│ +/// └──┴──┴──┴──┘ └──┴──┴──┴──┘ └──┴──┴──┴──┴──┘ +/// +/// Value 32-bit (4 bytes) High bit flipped Row Format +/// Little Endian +/// ``` /// /// ## Float Encoding /// /// Floats are converted from IEEE 754 representation to a signed integer representation /// by flipping all bar the sign bit if they are negative. /// -/// They are then encoded in the same manner as a signed integer +/// They are then encoded in the same manner as a signed integer. /// -/// ## Variable Length Bytes Encoding +/// ## Variable Length Bytes (including Strings) Encoding /// -/// A null is encoded as a `0_u8` +/// A null is encoded as a `0_u8`. /// -/// An empty byte array is encoded as `1_u8` +/// An empty byte array is encoded as `1_u8`. /// /// A non-null, non-empty byte array is encoded as `2_u8` followed by the byte array /// encoded using a block based scheme described below. @@ -158,9 +227,38 @@ mod variable; /// The byte array is broken up into 32-byte blocks, each block is written in turn /// to the output, followed by `0xFF_u8`. The final block is padded to 32-bytes /// with `0_u8` and written to the output, followed by the un-padded length in bytes -/// of this final block as a `u8` +/// of this final block as a `u8`. 
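A minimal sketch of the block scheme just described, using a 4-byte block for brevity (the real format uses 32-byte blocks). The `encode_bytes` helper below is illustrative only, not part of the crate's API; it simply follows the sentinel and block rules stated above, and its output for "MEEP" matches the diagram that follows.

const BLOCK: usize = 4; // the actual row format uses 32-byte blocks

// Encode an optional byte slice using the sentinel + block layout described above.
fn encode_bytes(value: Option<&[u8]>, out: &mut Vec<u8>) {
    match value {
        None => out.push(0),     // null
        Some([]) => out.push(1), // empty byte array
        Some(v) => {
            out.push(2); // non-null, non-empty
            let mut chunks = v.chunks(BLOCK).peekable();
            while let Some(chunk) = chunks.next() {
                out.extend_from_slice(chunk);
                if chunks.peek().is_some() {
                    out.push(0xFF); // a full block, with more blocks to follow
                } else {
                    // pad the final block and record its un-padded length
                    out.resize(out.len() + BLOCK - chunk.len(), 0);
                    out.push(chunk.len() as u8);
                }
            }
        }
    }
}

fn main() {
    let mut row = Vec::new();
    encode_bytes(Some(&b"MEEP"[..]), &mut row);
    assert_eq!(row, [0x02, b'M', b'E', b'E', b'P', 0x04]); // matches the "MEEP" diagram below
}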
+/// +/// Note the following example encodings use a block size of 4 bytes, +/// as opposed to 32 bytes for brevity: +/// +/// ```text +/// ┌───┬───┬───┬───┬───┬───┐ +/// "MEEP" │02 │'M'│'E'│'E'│'P'│04 │ +/// └───┴───┴───┴───┴───┴───┘ +/// +/// ┌───┐ +/// "" │01 | +/// └───┘ +/// +/// NULL ┌───┐ +/// │00 │ +/// └───┘ +/// +/// "Defenestration" ┌───┬───┬───┬───┬───┬───┐ +/// │02 │'D'│'e'│'f'│'e'│FF │ +/// └───┼───┼───┼───┼───┼───┤ +/// │'n'│'e'│'s'│'t'│FF │ +/// ├───┼───┼───┼───┼───┤ +/// │'r'│'a'│'t'│'r'│FF │ +/// ├───┼───┼───┼───┼───┤ +/// │'a'│'t'│'i'│'o'│FF │ +/// ├───┼───┼───┼───┼───┤ +/// │'n'│00 │00 │00 │01 │ +/// └───┴───┴───┴───┴───┘ +/// ``` /// -/// This is loosely inspired by [COBS] encoding, and chosen over more traditional +/// This approach is loosely inspired by [COBS] encoding, and chosen over more traditional /// [byte stuffing] as it is more amenable to vectorisation, in particular AVX-256. /// /// ## Dictionary Encoding @@ -170,15 +268,48 @@ mod variable; /// the dictionary encoding, and encode the array values directly, however, this would lose /// the benefits of dictionary encoding to reduce memory and CPU consumption. /// -/// As such the [`RowConverter`] maintains an order-preserving dictionary encoding for each -/// dictionary encoded column. As this is a variable-length encoding, new dictionary values -/// can be added whilst preserving the sort order. +/// As such the [`RowConverter`] creates an order-preserving mapping +/// for each dictionary encoded column, which allows new dictionary +/// values to be added whilst preserving the sort order. /// /// A null dictionary value is encoded as `0_u8`. /// /// A non-null dictionary value is encoded as `1_u8` followed by a null-terminated byte array /// key determined by the order-preserving dictionary encoding /// +/// ```text +/// ┌──────────┐ ┌─────┐ +/// │ "Bar" │ ───────────────▶│ 01 │ +/// └──────────┘ └─────┘ +/// ┌──────────┐ ┌─────┬─────┐ +/// │"Fabulous"│ ───────────────▶│ 01 │ 02 │ +/// └──────────┘ └─────┴─────┘ +/// ┌──────────┐ ┌─────┐ +/// │ "Soup" │ ───────────────▶│ 05 │ +/// └──────────┘ └─────┘ +/// ┌──────────┐ ┌─────┐ +/// │ "ZZ" │ ───────────────▶│ 07 │ +/// └──────────┘ └─────┘ +/// +/// Example Order Preserving Mapping +/// ``` +/// Using the map above, the corresponding row format will be +/// +/// ```text +/// ┌─────┬─────┬─────┬─────┐ +/// "Fabulous" │ 01 │ 03 │ 05 │ 00 │ +/// └─────┴─────┴─────┴─────┘ +/// +/// ┌─────┬─────┬─────┐ +/// "ZZ" │ 01 │ 07 │ 00 │ +/// └─────┴─────┴─────┘ +/// +/// ┌─────┐ +/// NULL │ 00 │ +/// └─────┘ +/// +/// Input Row Format +/// ``` /// # Ordering /// /// ## Float Ordering @@ -199,8 +330,8 @@ mod variable; /// /// The order of a given column can be reversed by negating the encoded bytes of non-null values /// -/// [COBS]:[https://en.wikipedia.org/wiki/Consistent_Overhead_Byte_Stuffing] -/// [byte stuffing]:[https://en.wikipedia.org/wiki/High-Level_Data_Link_Control#Asynchronous_framing] +/// [COBS]: https://en.wikipedia.org/wiki/Consistent_Overhead_Byte_Stuffing +/// [byte stuffing]: https://en.wikipedia.org/wiki/High-Level_Data_Link_Control#Asynchronous_framing #[derive(Debug)] pub struct RowConverter { fields: Arc<[SortField]>, @@ -351,9 +482,9 @@ impl RowConverter { } } -/// A row-oriented representation of arrow data, that is normalized for comparison +/// A row-oriented representation of arrow data, that is normalized for comparison. /// -/// See [`RowConverter`] +/// See the [module level documentation](self) and [`RowConverter`] for more details. 
#[derive(Debug)] pub struct Rows { /// Underlying row bytes @@ -439,12 +570,14 @@ impl<'a> DoubleEndedIterator for RowsIter<'a> { } } -/// A comparable representation of a row +/// A comparable representation of a row. /// -/// Two [`Row`] can be compared if they both belong to [`Rows`] returned by calls to -/// [`RowConverter::convert_columns`] on the same [`RowConverter`] +/// See the [module level documentation](self) for more details. /// -/// Otherwise any ordering established by comparing the [`Row`] is arbitrary +/// Two [`Row`] can only be compared if they both belong to [`Rows`] +/// returned by calls to [`RowConverter::convert_columns`] on the same +/// [`RowConverter`]. If different [`RowConverter`]s are used, any +/// ordering established by comparing the [`Row`] is arbitrary. #[derive(Debug, Copy, Clone)] pub struct Row<'a> { data: &'a [u8], From 3084ee258122910cc491d85a8bf9729b7bed95dc Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 12 Nov 2022 23:39:24 -0800 Subject: [PATCH 0266/1411] Use ArrowNativeTypeOp instead of total_cmp directly (#3087) --- arrow/src/compute/kernels/comparison.rs | 112 +++++++----------------- 1 file changed, 32 insertions(+), 80 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 9d89287eebf1..a286eedd190b 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -2748,30 +2748,22 @@ pub fn eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { DataType::Dictionary(_, _) if matches!(right.data_type(), DataType::Dictionary(_, _)) => { - typed_dict_compares!( - left, - right, - |a, b| a == b, - |a, b| a.total_cmp(&b).is_eq(), - |a, b| a == b - ) + typed_dict_compares!(left, right, |a, b| a == b, |a, b| a.is_eq(b), |a, b| a + == b) } DataType::Dictionary(_, _) if !matches!(right.data_type(), DataType::Dictionary(_, _)) => { typed_cmp_dict_non_dict!(left, right, |a, b| a == b, |a, b| a == b, |a, b| a - .total_cmp(&b) - .is_eq()) + .is_eq(b)) } _ if matches!(right.data_type(), DataType::Dictionary(_, _)) => { - typed_cmp_dict_non_dict!(right, left, |a, b| a == b, |a, b| a == b, |a, b| a - .total_cmp(&b) - .is_eq()) + typed_cmp_dict_non_dict!(right, left, |a, b| a == b, |a, b| a == b, |a, b| b + .is_eq(a)) } _ => { typed_compares!(left, right, |a, b| !(a ^ b), |a, b| a == b, |a, b| a - .total_cmp(&b) - .is_eq()) + .is_eq(b)) } } } @@ -2801,30 +2793,22 @@ pub fn neq_dyn(left: &dyn Array, right: &dyn Array) -> Result { DataType::Dictionary(_, _) if matches!(right.data_type(), DataType::Dictionary(_, _)) => { - typed_dict_compares!( - left, - right, - |a, b| a != b, - |a, b| a.total_cmp(&b).is_ne(), - |a, b| a != b - ) + typed_dict_compares!(left, right, |a, b| a != b, |a, b| a.is_ne(b), |a, b| a + != b) } DataType::Dictionary(_, _) if !matches!(right.data_type(), DataType::Dictionary(_, _)) => { typed_cmp_dict_non_dict!(left, right, |a, b| a != b, |a, b| a != b, |a, b| a - .total_cmp(&b) - .is_ne()) + .is_ne(b)) } _ if matches!(right.data_type(), DataType::Dictionary(_, _)) => { - typed_cmp_dict_non_dict!(right, left, |a, b| a != b, |a, b| a != b, |a, b| a - .total_cmp(&b) - .is_ne()) + typed_cmp_dict_non_dict!(right, left, |a, b| a != b, |a, b| a != b, |a, b| b + .is_ne(a)) } _ => { typed_compares!(left, right, |a, b| (a ^ b), |a, b| a != b, |a, b| a - .total_cmp(&b) - .is_ne()) + .is_ne(b)) } } } @@ -2854,30 +2838,22 @@ pub fn lt_dyn(left: &dyn Array, right: &dyn Array) -> Result { DataType::Dictionary(_, _) if matches!(right.data_type(), 
DataType::Dictionary(_, _)) => { - typed_dict_compares!( - left, - right, - |a, b| a < b, - |a, b| a.total_cmp(&b).is_lt(), - |a, b| a < b - ) + typed_dict_compares!(left, right, |a, b| a < b, |a, b| a.is_lt(b), |a, b| a + < b) } DataType::Dictionary(_, _) if !matches!(right.data_type(), DataType::Dictionary(_, _)) => { typed_cmp_dict_non_dict!(left, right, |a, b| a < b, |a, b| a < b, |a, b| a - .total_cmp(&b) - .is_lt()) + .is_lt(b)) } _ if matches!(right.data_type(), DataType::Dictionary(_, _)) => { typed_cmp_dict_non_dict!(right, left, |a, b| a > b, |a, b| a > b, |a, b| b - .total_cmp(&a) - .is_lt()) + .is_lt(a)) } _ => { typed_compares!(left, right, |a, b| ((!a) & b), |a, b| a < b, |a, b| a - .total_cmp(&b) - .is_lt()) + .is_lt(b)) } } } @@ -2906,30 +2882,22 @@ pub fn lt_eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { DataType::Dictionary(_, _) if matches!(right.data_type(), DataType::Dictionary(_, _)) => { - typed_dict_compares!( - left, - right, - |a, b| a <= b, - |a, b| a.total_cmp(&b).is_le(), - |a, b| a <= b - ) + typed_dict_compares!(left, right, |a, b| a <= b, |a, b| a.is_le(b), |a, b| a + <= b) } DataType::Dictionary(_, _) if !matches!(right.data_type(), DataType::Dictionary(_, _)) => { typed_cmp_dict_non_dict!(left, right, |a, b| a <= b, |a, b| a <= b, |a, b| a - .total_cmp(&b) - .is_le()) + .is_le(b)) } _ if matches!(right.data_type(), DataType::Dictionary(_, _)) => { typed_cmp_dict_non_dict!(right, left, |a, b| a >= b, |a, b| a >= b, |a, b| b - .total_cmp(&a) - .is_le()) + .is_le(a)) } _ => { typed_compares!(left, right, |a, b| !(a & (!b)), |a, b| a <= b, |a, b| a - .total_cmp(&b) - .is_le()) + .is_le(b)) } } } @@ -2958,30 +2926,22 @@ pub fn gt_dyn(left: &dyn Array, right: &dyn Array) -> Result { DataType::Dictionary(_, _) if matches!(right.data_type(), DataType::Dictionary(_, _)) => { - typed_dict_compares!( - left, - right, - |a, b| a > b, - |a, b| a.total_cmp(&b).is_gt(), - |a, b| a > b - ) + typed_dict_compares!(left, right, |a, b| a > b, |a, b| a.is_gt(b), |a, b| a + > b) } DataType::Dictionary(_, _) if !matches!(right.data_type(), DataType::Dictionary(_, _)) => { typed_cmp_dict_non_dict!(left, right, |a, b| a > b, |a, b| a > b, |a, b| a - .total_cmp(&b) - .is_gt()) + .is_gt(b)) } _ if matches!(right.data_type(), DataType::Dictionary(_, _)) => { typed_cmp_dict_non_dict!(right, left, |a, b| a < b, |a, b| a < b, |a, b| b - .total_cmp(&a) - .is_gt()) + .is_gt(a)) } _ => { typed_compares!(left, right, |a, b| (a & (!b)), |a, b| a > b, |a, b| a - .total_cmp(&b) - .is_gt()) + .is_gt(b)) } } } @@ -3009,30 +2969,22 @@ pub fn gt_eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { DataType::Dictionary(_, _) if matches!(right.data_type(), DataType::Dictionary(_, _)) => { - typed_dict_compares!( - left, - right, - |a, b| a >= b, - |a, b| a.total_cmp(&b).is_ge(), - |a, b| a >= b - ) + typed_dict_compares!(left, right, |a, b| a >= b, |a, b| a.is_ge(b), |a, b| a + >= b) } DataType::Dictionary(_, _) if !matches!(right.data_type(), DataType::Dictionary(_, _)) => { typed_cmp_dict_non_dict!(left, right, |a, b| a >= b, |a, b| a >= b, |a, b| a - .total_cmp(&b) - .is_ge()) + .is_ge(b)) } _ if matches!(right.data_type(), DataType::Dictionary(_, _)) => { typed_cmp_dict_non_dict!(right, left, |a, b| a <= b, |a, b| a <= b, |a, b| b - .total_cmp(&a) - .is_ge()) + .is_ge(a)) } _ => { typed_compares!(left, right, |a, b| !((!a) & b), |a, b| a >= b, |a, b| a - .total_cmp(&b) - .is_ge()) + .is_ge(b)) } } } From b7af85cb8dfe6887bb3fd43d1d76f659473b6927 Mon Sep 17 00:00:00 2001 From: Jiayu 
Liu Date: Sun, 13 Nov 2022 21:07:11 +0800 Subject: [PATCH 0267/1411] add bloom filter implementation based on split block (sbbf) spec (#3057) * add bloom filter implementation based on split block spec * format and also revist index method * bloom filter reader * create new function to facilitate fixture test * fix clippy * Update parquet/src/bloom_filter/mod.rs Co-authored-by: Andrew Lamb * Update parquet/src/bloom_filter/mod.rs Co-authored-by: Andrew Lamb * Update parquet/src/bloom_filter/mod.rs Co-authored-by: Andrew Lamb * Update parquet/src/bloom_filter/mod.rs Co-authored-by: Andrew Lamb * Update parquet/src/bloom_filter/mod.rs * Update parquet/src/bloom_filter/mod.rs Co-authored-by: Liang-Chi Hsieh * fix clippy Co-authored-by: Andrew Lamb Co-authored-by: Liang-Chi Hsieh --- parquet/Cargo.toml | 5 +- parquet/src/bloom_filter/mod.rs | 217 ++++++++++++++++++++++++++++++++ parquet/src/lib.rs | 2 + 3 files changed, 223 insertions(+), 1 deletion(-) create mode 100644 parquet/src/bloom_filter/mod.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index b400b01a7d1d..dda0518f94f1 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -57,6 +57,7 @@ seq-macro = { version = "0.3", default-features = false } futures = { version = "0.3", default-features = false, features = ["std"], optional = true } tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "rt", "io-util"] } hashbrown = { version = "0.13", default-features = false } +twox-hash = { version = "1.6", optional = true } [dev-dependencies] base64 = { version = "0.13", default-features = false, features = ["std"] } @@ -76,7 +77,7 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" all-features = true [features] -default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"] +default = ["arrow", "bloom", "snap", "brotli", "flate2", "lz4", "zstd", "base64"] # Enable arrow reader/writer APIs arrow = ["base64", "arrow-array", "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", "arrow-select", "arrow-ipc"] # Enable CLI tools @@ -89,6 +90,8 @@ test_common = ["arrow/test_utils"] experimental = [] # Enable async APIs async = ["futures", "tokio"] +# Bloomfilter +bloom = ["twox-hash"] [[test]] name = "arrow_writer_layout" diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs new file mode 100644 index 000000000000..770fb53e8d28 --- /dev/null +++ b/parquet/src/bloom_filter/mod.rs @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Bloom filter implementation specific to Parquet, as described +//! 
in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md) + +use crate::errors::ParquetError; +use crate::file::metadata::ColumnChunkMetaData; +use crate::format::{ + BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash, BloomFilterHeader, +}; +use std::hash::Hasher; +use std::io::{Read, Seek, SeekFrom}; +use thrift::protocol::TCompactInputProtocol; +use twox_hash::XxHash64; + +/// Salt as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach) +const SALT: [u32; 8] = [ + 0x47b6137b_u32, + 0x44974d91_u32, + 0x8824ad5b_u32, + 0xa2b7289d_u32, + 0x705495c7_u32, + 0x2df1424b_u32, + 0x9efc4947_u32, + 0x5c6bfb31_u32, +]; + +/// Each block is 256 bits, broken up into eight contiguous "words", each consisting of 32 bits. +/// Each word is thought of as an array of bits; each bit is either "set" or "not set". +type Block = [u32; 8]; + +/// takes as its argument a single unsigned 32-bit integer and returns a block in which each +/// word has exactly one bit set. +fn mask(x: u32) -> Block { + let mut result = [0_u32; 8]; + for i in 0..8 { + // wrapping instead of checking for overflow + let y = x.wrapping_mul(SALT[i]); + let y = y >> 27; + result[i] = 1 << y; + } + result +} + +/// setting every bit in the block that was also set in the result from mask +fn block_insert(block: &mut Block, hash: u32) { + let mask = mask(hash); + for i in 0..8 { + block[i] |= mask[i]; + } +} + +/// returns true when every bit that is set in the result of mask is also set in the block. +fn block_check(block: &Block, hash: u32) -> bool { + let mask = mask(hash); + for i in 0..8 { + if block[i] & mask[i] == 0 { + return false; + } + } + true +} + +/// A split block Bloom filter +pub struct Sbbf(Vec); + +impl Sbbf { + fn new(bitset: &[u8]) -> Self { + let data = bitset + .chunks_exact(4 * 8) + .map(|chunk| { + let mut block = [0_u32; 8]; + for (i, word) in chunk.chunks_exact(4).enumerate() { + block[i] = u32::from_le_bytes(word.try_into().unwrap()); + } + block + }) + .collect::>(); + Self(data) + } + + pub fn read_from_column_chunk( + column_metadata: &ColumnChunkMetaData, + mut reader: &mut R, + ) -> Result { + let offset = column_metadata.bloom_filter_offset().ok_or_else(|| { + ParquetError::General("Bloom filter offset is not set".to_string()) + })? 
as u64; + reader.seek(SeekFrom::Start(offset))?; + // deserialize header + let mut prot = TCompactInputProtocol::new(&mut reader); + let header = BloomFilterHeader::read_from_in_protocol(&mut prot)?; + + match header.algorithm { + BloomFilterAlgorithm::BLOCK(_) => { + // this match exists to future proof the singleton algorithm enum + } + } + match header.compression { + BloomFilterCompression::UNCOMPRESSED(_) => { + // this match exists to future proof the singleton compression enum + } + } + match header.hash { + BloomFilterHash::XXHASH(_) => { + // this match exists to future proof the singleton hash enum + } + } + // length in bytes + let length: usize = header.num_bytes.try_into().map_err(|_| { + ParquetError::General("Bloom filter length is invalid".to_string()) + })?; + let mut buffer = vec![0_u8; length]; + reader.read_exact(&mut buffer).map_err(|e| { + ParquetError::General(format!("Could not read bloom filter: {}", e)) + })?; + Ok(Self::new(&buffer)) + } + + #[inline] + fn hash_to_block_index(&self, hash: u64) -> usize { + // unchecked_mul is unstable, but in reality this is safe, we'd just use saturating mul + // but it will not saturate + (((hash >> 32).saturating_mul(self.0.len() as u64)) >> 32) as usize + } + + /// Insert a hash into the filter + pub fn insert(&mut self, hash: u64) { + let block_index = self.hash_to_block_index(hash); + let block = &mut self.0[block_index]; + block_insert(block, hash as u32); + } + + /// Check if a hash is in the filter. May return + /// true for values that was never inserted ("false positive") + /// but will always return false if a hash has not been inserted. + pub fn check(&self, hash: u64) -> bool { + let block_index = self.hash_to_block_index(hash); + let block = &self.0[block_index]; + block_check(block, hash as u32) + } +} + +// per spec we use xxHash with seed=0 +const SEED: u64 = 0; + +pub fn hash_bytes>(value: A) -> u64 { + let mut hasher = XxHash64::with_seed(SEED); + hasher.write(value.as_ref()); + hasher.finish() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_hash_bytes() { + assert_eq!(hash_bytes(b""), 17241709254077376921); + } + + #[test] + fn test_mask_set_quick_check() { + for i in 0..1_000_000 { + let result = mask(i); + assert!(result.iter().all(|&x| x.count_ones() == 1)); + } + } + + #[test] + fn test_block_insert_and_check() { + for i in 0..1_000_000 { + let mut block = [0_u32; 8]; + block_insert(&mut block, i); + assert!(block_check(&block, i)); + } + } + + #[test] + fn test_sbbf_insert_and_check() { + let mut sbbf = Sbbf(vec![[0_u32; 8]; 1_000]); + for i in 0..1_000_000 { + sbbf.insert(i); + assert!(sbbf.check(i)); + } + } + + #[test] + fn test_with_fixture() { + // bloom filter produced by parquet-mr/spark for a column of i64 f"a{i}" for i in 0..10 + let bitset: &[u8] = &[ + 200, 1, 80, 20, 64, 68, 8, 109, 6, 37, 4, 67, 144, 80, 96, 32, 8, 132, 43, + 33, 0, 5, 99, 65, 2, 0, 224, 44, 64, 78, 96, 4, + ]; + let sbbf = Sbbf::new(bitset); + for a in 0..10i64 { + let value = format!("a{}", a); + let hash = hash_bytes(value); + assert!(sbbf.check(hash)); + } + } +} diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 07cddfc3f448..cd29d02f808e 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -84,6 +84,8 @@ pub mod arrow; pub mod column; experimental!(mod compression); experimental!(mod encodings); +#[cfg(feature = "bloom")] +pub mod bloom_filter; pub mod file; pub mod record; pub mod schema; From 20d81f5784e5609666fa3d0599f2f95e5e5a40c6 Mon Sep 17 00:00:00 2001 From: Max Burke Date: 
Sun, 13 Nov 2022 10:23:29 -0800 Subject: [PATCH 0268/1411] Add FixedSizeBinaryArray::try_from_sparse_iter_with_size (#3054) * implement ord for FixedSizeBinary types * add ord test * add compare for fixed size binary arrays * FixedSizeBinaryArray::try_from_sparse_iter has fails to generate a valid array when the iterator only produces None values * Tweak try_from_sparse_iter_with_size to take an Option; pass tests * simplify try_from_sparse_iter_with_size, make size parameter non-optional * add test for fixed size binary comparisons * move tests to use FixedSizeBinaryArray::from_sparse_iter_with_size, add docstring * format + fix failing tests * fix build --- .../src/array/fixed_size_binary_array.rs | 121 +++++++++++++++++- arrow-select/src/take.rs | 7 +- arrow/src/array/ffi.rs | 3 +- arrow/src/compute/kernels/comparison.rs | 41 ++++++ arrow/src/compute/kernels/sort.rs | 15 ++- arrow/src/compute/kernels/substring.rs | 3 +- arrow/src/ffi.rs | 8 +- arrow/src/util/bench_util.rs | 25 ++-- arrow/tests/array_transform.rs | 9 +- 9 files changed, 198 insertions(+), 34 deletions(-) diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index f37d1e3e5c38..9bac49810301 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -129,6 +129,9 @@ impl FixedSizeBinaryArray { /// # Errors /// /// Returns error if argument has length zero, or sizes of nested slices don't match. + #[deprecated( + note = "This function will fail if the iterator produces only None values; prefer `try_from_sparse_iter_with_size`" + )] pub fn try_from_sparse_iter(mut iter: T) -> Result where T: Iterator>, @@ -196,6 +199,86 @@ impl FixedSizeBinaryArray { Ok(FixedSizeBinaryArray::from(array_data)) } + /// Create an array from an iterable argument of sparse byte slices. + /// Sparsity means that items returned by the iterator are optional, i.e input argument can + /// contain `None` items. In cases where the iterator returns only `None` values, this + /// also takes a size parameter to ensure that the a valid FixedSizeBinaryArray is still + /// created. + /// + /// # Examples + /// + /// ``` + /// use arrow_array::FixedSizeBinaryArray; + /// let input_arg = vec![ + /// None, + /// Some(vec![7, 8]), + /// Some(vec![9, 10]), + /// None, + /// Some(vec![13, 14]), + /// None, + /// ]; + /// let array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(), 2).unwrap(); + /// ``` + /// + /// # Errors + /// + /// Returns error if argument has length zero, or sizes of nested slices don't match. 
+ pub fn try_from_sparse_iter_with_size( + mut iter: T, + size: i32, + ) -> Result + where + T: Iterator>, + U: AsRef<[u8]>, + { + let mut len = 0; + let mut byte = 0; + let mut null_buf = MutableBuffer::from_len_zeroed(0); + let mut buffer = MutableBuffer::from_len_zeroed(0); + + iter.try_for_each(|item| -> Result<(), ArrowError> { + // extend null bitmask by one byte per each 8 items + if byte == 0 { + null_buf.push(0u8); + byte = 8; + } + byte -= 1; + + if let Some(slice) = item { + let slice = slice.as_ref(); + if size as usize != slice.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Nested array size mismatch: one is {}, and the other is {}", + size, + slice.len() + ))); + } + + bit_util::set_bit(null_buf.as_slice_mut(), len); + buffer.extend_from_slice(slice); + } else { + buffer.extend_zeros(size as usize); + } + + len += 1; + + Ok(()) + })?; + + let array_data = unsafe { + ArrayData::new_unchecked( + DataType::FixedSizeBinary(size), + len, + None, + Some(null_buf.into()), + 0, + vec![buffer.into()], + vec![], + ) + }; + Ok(FixedSizeBinaryArray::from(array_data)) + } + /// Create an array from an iterable argument of byte slices. /// /// # Examples @@ -333,6 +416,7 @@ impl From for FixedSizeBinaryArray { impl From>> for FixedSizeBinaryArray { fn from(v: Vec>) -> Self { + #[allow(deprecated)] Self::try_from_sparse_iter(v.into_iter()).unwrap() } } @@ -561,6 +645,7 @@ mod tests { fn test_all_none_fixed_size_binary_array_from_sparse_iter() { let none_option: Option<[u8; 32]> = None; let input_arg = vec![none_option, none_option, none_option]; + #[allow(deprecated)] let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); assert_eq!(0, arr.value_length()); @@ -576,9 +661,31 @@ mod tests { None, Some(vec![13, 14]), ]; - let arr = - FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); + #[allow(deprecated)] + let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.iter().cloned()) + .unwrap(); assert_eq!(2, arr.value_length()); + assert_eq!(5, arr.len()); + + let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size( + input_arg.into_iter(), + 2, + ) + .unwrap(); + assert_eq!(2, arr.value_length()); + assert_eq!(5, arr.len()); + } + + #[test] + fn test_fixed_size_binary_array_from_sparse_iter_with_size_all_none() { + let input_arg = vec![None, None, None, None, None] as Vec>>; + + let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size( + input_arg.into_iter(), + 16, + ) + .unwrap(); + assert_eq!(16, arr.value_length()); assert_eq!(5, arr.len()) } @@ -643,7 +750,9 @@ mod tests { #[test] fn fixed_size_binary_array_all_null() { let data = vec![None] as Vec>; - let array = FixedSizeBinaryArray::try_from_sparse_iter(data.into_iter()).unwrap(); + let array = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.into_iter(), 0) + .unwrap(); array .data() .validate_full() @@ -652,16 +761,14 @@ mod tests { #[test] // Test for https://github.com/apache/arrow-rs/issues/1390 - #[should_panic( - expected = "column types must match schema types, expected FixedSizeBinary(2) but found FixedSizeBinary(0) at column index 0" - )] fn fixed_size_binary_array_all_null_in_batch_with_schema() { let schema = Schema::new(vec![Field::new("a", DataType::FixedSizeBinary(2), true)]); let none_option: Option<[u8; 2]> = None; - let item = FixedSizeBinaryArray::try_from_sparse_iter( + let item = FixedSizeBinaryArray::try_from_sparse_iter_with_size( vec![none_option, none_option, none_option].into_iter(), + 2, ) .unwrap(); diff 
--git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index d34a88ba53ce..4af876a79dcc 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -207,12 +207,12 @@ where DataType::LargeBinary => { Ok(Arc::new(take_bytes(as_generic_binary_array::(values), indices)?)) } - DataType::FixedSizeBinary(_) => { + DataType::FixedSizeBinary(size) => { let values = values .as_any() .downcast_ref::() .unwrap(); - Ok(Arc::new(take_fixed_size_binary(values, indices)?)) + Ok(Arc::new(take_fixed_size_binary(values, indices, *size)?)) } DataType::Null => { // Take applied to a null array produces a null array. @@ -769,6 +769,7 @@ where fn take_fixed_size_binary( values: &FixedSizeBinaryArray, indices: &PrimitiveArray, + size: i32, ) -> Result where IndexType: ArrowPrimitiveType, @@ -789,7 +790,7 @@ where .collect::, ArrowError>>()? .into_iter(); - FixedSizeBinaryArray::try_from_sparse_iter(array_iter) + FixedSizeBinaryArray::try_from_sparse_iter_with_size(array_iter, size) } /// `take` implementation for dictionary arrays diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index 72030f900a4e..a18f408a4566 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -212,7 +212,8 @@ mod tests { Some(vec![30, 30, 30]), None, ]; - let array = FixedSizeBinaryArray::try_from_sparse_iter(values.into_iter())?; + let array = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?; let data = array.data(); test_round_trip(data) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index a286eedd190b..4566b4969295 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -2270,6 +2270,18 @@ macro_rules! typed_compares { as_largestring_array($RIGHT), $OP, ), + (DataType::FixedSizeBinary(_), DataType::FixedSizeBinary(_)) => { + let lhs = $LEFT + .as_any() + .downcast_ref::() + .unwrap(); + let rhs = $RIGHT + .as_any() + .downcast_ref::() + .unwrap(); + + compare_op(lhs, rhs, $OP) + } (DataType::Binary, DataType::Binary) => compare_op( as_generic_binary_array::($LEFT), as_generic_binary_array::($RIGHT), @@ -5449,6 +5461,35 @@ mod tests { ); } + #[test] + fn test_eq_dyn_neq_dyn_fixed_size_binary() { + use crate::array::FixedSizeBinaryArray; + + let values1: Vec> = + vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x01])]; + let values2: Vec> = + vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x00])]; + + let array1 = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(values1.into_iter(), 2) + .unwrap(); + let array2 = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(values2.into_iter(), 2) + .unwrap(); + + let result = eq_dyn(&array1, &array2).unwrap(); + assert_eq!( + BooleanArray::from(vec![Some(true), None, Some(false)]), + result + ); + + let result = neq_dyn(&array1, &array2).unwrap(); + assert_eq!( + BooleanArray::from(vec![Some(false), None, Some(true)]), + result + ); + } + #[test] #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_i8_array() { diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 97b0758e5dc7..81895760e588 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -1437,17 +1437,24 @@ mod tests { fixed_length: Option, ) { // Fixed size binary array - if fixed_length.is_some() { + if let Some(length) = fixed_length { let input = Arc::new( - FixedSizeBinaryArray::try_from_sparse_iter(data.iter().cloned()).unwrap(), + FixedSizeBinaryArray::try_from_sparse_iter_with_size( 
+ data.iter().cloned(), + length, + ) + .unwrap(), ); let sorted = match limit { Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), None => sort(&(input as ArrayRef), options).unwrap(), }; let expected = Arc::new( - FixedSizeBinaryArray::try_from_sparse_iter(expected_data.iter().cloned()) - .unwrap(), + FixedSizeBinaryArray::try_from_sparse_iter_with_size( + expected_data.iter().cloned(), + length, + ) + .unwrap(), ) as ArrayRef; assert_eq!(&sorted, &expected); diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs index f52ddb3bc30b..76568ae0dac0 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow/src/compute/kernels/substring.rs @@ -713,8 +713,9 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - let expected = FixedSizeBinaryArray::try_from_sparse_iter( + let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size( vec![None, Some(b"rrow")].into_iter(), + 4, ) .unwrap(); assert_eq!(result, &expected); diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 95e6dce3c5fd..03c265318185 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -1231,7 +1231,8 @@ mod tests { Some(vec![30, 30, 30]), None, ]; - let array = FixedSizeBinaryArray::try_from_sparse_iter(values.into_iter())?; + let array = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?; // export it let array = ArrowArray::try_from(array.into_data())?; @@ -1250,7 +1251,7 @@ mod tests { // verify assert_eq!( array, - &FixedSizeBinaryArray::try_from_sparse_iter( + &FixedSizeBinaryArray::try_from_sparse_iter_with_size( vec![ None, Some(vec![10, 10, 10]), @@ -1265,7 +1266,8 @@ mod tests { Some(vec![30, 30, 30]), None, ] - .into_iter() + .into_iter(), + 3 )? ); diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index d07443301c16..6420b6346feb 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -176,17 +176,20 @@ pub fn create_fsb_array( ) -> FixedSizeBinaryArray { let rng = &mut seedable_rng(); - FixedSizeBinaryArray::try_from_sparse_iter((0..size).map(|_| { - if rng.gen::() < null_density { - None - } else { - let value = rng - .sample_iter::(Standard) - .take(value_len) - .collect::>(); - Some(value) - } - })) + FixedSizeBinaryArray::try_from_sparse_iter_with_size( + (0..size).map(|_| { + if rng.gen::() < null_density { + None + } else { + let value = rng + .sample_iter::(Standard) + .take(value_len) + .collect::>(); + Some(value) + } + }), + value_len as i32, + ) .unwrap() } diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 3619abacdc9d..03942be10e01 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -868,7 +868,7 @@ fn test_list_of_strings_append() { #[test] fn test_fixed_size_binary_append() { let a = vec![Some(vec![1, 2]), Some(vec![3, 4]), Some(vec![5, 6])]; - let a = FixedSizeBinaryArray::try_from_sparse_iter(a.into_iter()) + let a = FixedSizeBinaryArray::try_from_sparse_iter_with_size(a.into_iter(), 2) .expect("Failed to create FixedSizeBinaryArray from iterable"); let b = vec![ @@ -879,7 +879,7 @@ fn test_fixed_size_binary_append() { Some(vec![13, 14]), None, ]; - let b = FixedSizeBinaryArray::try_from_sparse_iter(b.into_iter()) + let b = FixedSizeBinaryArray::try_from_sparse_iter_with_size(b.into_iter(), 2) .expect("Failed to create FixedSizeBinaryArray from iterable"); let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10); @@ -911,8 +911,9 @@ fn test_fixed_size_binary_append() { 
Some(vec![9, 10]), // b[4..4] ]; - let expected = FixedSizeBinaryArray::try_from_sparse_iter(expected.into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); + let expected = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(expected.into_iter(), 2) + .expect("Failed to create FixedSizeBinaryArray from iterable"); assert_eq!(&result, expected.data()); } From 46da6064255d6cda5ef5617528f0202a57a400e2 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 13 Nov 2022 11:43:26 -0800 Subject: [PATCH 0269/1411] Cleanup temporal _internal functions (#3099) --- arrow/src/compute/kernels/temporal.rs | 152 +++++--------------------- 1 file changed, 30 insertions(+), 122 deletions(-) diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index 94ddc95c7590..c94e21a1b2e5 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -175,16 +175,6 @@ pub fn using_chrono_tz_and_utc_naive_date_time( Some(tz.offset_from_utc_datetime(&utc).fix()) } -/// Extracts the hours of a given temporal primitive array as an array of integers within -/// the range of [0, 23]. -pub fn hour(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - hour_internal(array) -} - /// Extracts the hours of a given array as an array of integers within /// the range of [0, 23]. If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. @@ -202,7 +192,7 @@ pub fn hour_dyn(array: &dyn Array) -> Result { _ => { downcast_temporal_array!( array => { - hour_internal(array) + hour(array) .map(|a| Arc::new(a) as ArrayRef) } dt => return_compute_error_with!("hour does not support", dt), @@ -211,8 +201,9 @@ pub fn hour_dyn(array: &dyn Array) -> Result { } } -/// Extracts the hours of a given temporal array as an array of integers -fn hour_internal(array: &PrimitiveArray) -> Result +/// Extracts the hours of a given temporal primitive array as an array of integers within +/// the range of [0, 23]. +pub fn hour(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -246,15 +237,6 @@ where } } -/// Extracts the years of a given temporal primitive array as an array of integers -pub fn year(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - year_internal(array) -} - /// Extracts the years of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. @@ -272,7 +254,7 @@ pub fn year_dyn(array: &dyn Array) -> Result { _ => { downcast_temporal_array!( array => { - year_internal(array) + year(array) .map(|a| Arc::new(a) as ArrayRef) } dt => return_compute_error_with!("year does not support", dt), @@ -281,8 +263,8 @@ pub fn year_dyn(array: &dyn Array) -> Result { } } -/// Extracts the years of a given temporal array as an array of integers -fn year_internal(array: &PrimitiveArray) -> Result +/// Extracts the years of a given temporal primitive array as an array of integers +pub fn year(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -301,16 +283,6 @@ where } } -/// Extracts the quarter of a given temporal primitive array as an array of integers within -/// the range of [1, 4]. 
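The FixedSizeBinary changes above all migrate from `try_from_sparse_iter` to `try_from_sparse_iter_with_size`, which takes the element width explicitly so the resulting array stays well-formed even when the iterator yields no non-null value to infer it from. A minimal sketch of that constructor, assuming the arrow-27-era API and illustrative values:

use arrow::array::{Array, FixedSizeBinaryArray};

#[test]
fn all_null_fixed_size_binary_keeps_width() {
    // The width cannot be inferred from an all-null iterator, so it is passed explicitly.
    let values: Vec<Option<Vec<u8>>> = vec![None, None, None];
    let array =
        FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 2).unwrap();

    assert_eq!(array.len(), 3);
    assert_eq!(array.value_length(), 2);
    assert!(array.is_null(0));
}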
-pub fn quarter(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - quarter_internal(array) -} - /// Extracts the quarter of a given temporal array as an array of integersa within /// the range of [1, 4]. If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. @@ -328,7 +300,7 @@ pub fn quarter_dyn(array: &dyn Array) -> Result { _ => { downcast_temporal_array!( array => { - quarter_internal(array) + quarter(array) .map(|a| Arc::new(a) as ArrayRef) } dt => return_compute_error_with!("quarter does not support", dt), @@ -337,8 +309,9 @@ pub fn quarter_dyn(array: &dyn Array) -> Result { } } -/// Extracts the quarter of a given temporal array as an array of integers -fn quarter_internal(array: &PrimitiveArray) -> Result +/// Extracts the quarter of a given temporal primitive array as an array of integers within +/// the range of [1, 4]. +pub fn quarter(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -361,16 +334,6 @@ where } } -/// Extracts the month of a given temporal primitive array as an array of integers within -/// the range of [1, 12]. -pub fn month(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - month_internal(array) -} - /// Extracts the month of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. @@ -388,7 +351,7 @@ pub fn month_dyn(array: &dyn Array) -> Result { _ => { downcast_temporal_array!( array => { - month_internal(array) + month(array) .map(|a| Arc::new(a) as ArrayRef) } dt => return_compute_error_with!("month does not support", dt), @@ -397,8 +360,9 @@ pub fn month_dyn(array: &dyn Array) -> Result { } } -/// Extracts the month of a given temporal array as an array of integers -fn month_internal(array: &PrimitiveArray) -> Result +/// Extracts the month of a given temporal primitive array as an array of integers within +/// the range of [1, 12]. +pub fn month(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -421,20 +385,6 @@ where } } -/// Extracts the day of week of a given temporal primitive array as an array of -/// integers. -/// -/// Monday is encoded as `0`, Tuesday as `1`, etc. -/// -/// See also [`num_days_from_sunday`] which starts at Sunday. -pub fn num_days_from_monday(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - num_days_from_monday_internal(array) -} - /// Extracts the day of week of a given temporal array as an array of /// integers. /// @@ -458,7 +408,7 @@ pub fn num_days_from_monday_dyn(array: &dyn Array) -> Result { _ => { downcast_temporal_array!( array => { - num_days_from_monday_internal(array) + num_days_from_monday(array) .map(|a| Arc::new(a) as ArrayRef) } dt => return_compute_error_with!("num_days_from_monday does not support", dt), @@ -467,13 +417,13 @@ pub fn num_days_from_monday_dyn(array: &dyn Array) -> Result { } } -/// Extracts the day of week of a given temporal array as an array of +/// Extracts the day of week of a given temporal primitive array as an array of /// integers. /// /// Monday is encoded as `0`, Tuesday as `1`, etc. /// /// See also [`num_days_from_sunday`] which starts at Sunday. 
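The cleanup above folds each `*_internal` helper back into its public primitive kernel, with the `*_dyn` wrapper dispatching to it (including through dictionary arrays). A rough usage sketch of the two entry points, assuming the `arrow::compute::kernels::temporal` API of this release:

use arrow::array::{Array, Int32Array, TimestampSecondArray};
use arrow::compute::kernels::temporal::{hour, hour_dyn};

#[test]
fn hour_on_primitive_and_dyn_arrays() {
    // 1970-01-01T01:01:01 and 1970-01-01T02:02:02 as seconds since the epoch.
    let ts = TimestampSecondArray::from(vec![3661, 7322]);

    // Typed kernel on a primitive array.
    assert_eq!(hour(&ts).unwrap(), Int32Array::from(vec![1, 2]));

    // Type-erased kernel on `&dyn Array`; it also accepts dictionary-encoded input.
    let hours_dyn = hour_dyn(&ts).unwrap();
    assert_eq!(
        hours_dyn.as_any().downcast_ref::<Int32Array>().unwrap(),
        &Int32Array::from(vec![1, 2])
    );
}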
-fn num_days_from_monday_internal(array: &PrimitiveArray) -> Result +pub fn num_days_from_monday(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -496,20 +446,6 @@ where } } -/// Extracts the day of week of a given temporal primitive array as an array of -/// integers, starting at Sunday. -/// -/// Sunday is encoded as `0`, Monday as `1`, etc. -/// -/// See also [`num_days_from_monday`] which starts at Monday. -pub fn num_days_from_sunday(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - num_days_from_sunday_internal(array) -} - /// Extracts the day of week of a given temporal array as an array of /// integers, starting at Sunday. /// @@ -533,7 +469,7 @@ pub fn num_days_from_sunday_dyn(array: &dyn Array) -> Result { _ => { downcast_temporal_array!( array => { - num_days_from_sunday_internal(array) + num_days_from_sunday(array) .map(|a| Arc::new(a) as ArrayRef) } dt => return_compute_error_with!("num_days_from_sunday does not support", dt), @@ -542,13 +478,13 @@ pub fn num_days_from_sunday_dyn(array: &dyn Array) -> Result { } } -/// Extracts the day of week of a given temporal array as an array of +/// Extracts the day of week of a given temporal primitive array as an array of /// integers, starting at Sunday. /// /// Sunday is encoded as `0`, Monday as `1`, etc. /// /// See also [`num_days_from_monday`] which starts at Monday. -fn num_days_from_sunday_internal(array: &PrimitiveArray) -> Result +pub fn num_days_from_sunday(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -574,15 +510,6 @@ where } } -/// Extracts the day of a given temporal primitive array as an array of integers -pub fn day(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - day_internal(array) -} - /// Extracts the day of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. @@ -600,7 +527,7 @@ pub fn day_dyn(array: &dyn Array) -> Result { _ => { downcast_temporal_array!( array => { - day_internal(array) + day(array) .map(|a| Arc::new(a) as ArrayRef) } dt => return_compute_error_with!("day does not support", dt), @@ -609,8 +536,8 @@ pub fn day_dyn(array: &dyn Array) -> Result { } } -/// Extracts the day of a given temporal array as an array of integers -fn day_internal(array: &PrimitiveArray) -> Result +/// Extracts the day of a given temporal primitive array as an array of integers +pub fn day(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -631,16 +558,6 @@ where } } -/// Extracts the day of year of a given temporal primitive array as an array of integers -/// The day of year that ranges from 1 to 366 -pub fn doy(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - doy_internal(array) -} - /// Extracts the day of year of a given temporal array as an array of integers /// The day of year that ranges from 1 to 366. 
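The same pattern applies to the day-oriented kernels above. A small worked example of the two day-of-week encodings plus `day` and `doy`, under the same API assumptions (Date32 counts days since 1970-01-01, which was a Thursday):

use arrow::array::{Date32Array, Int32Array};
use arrow::compute::kernels::temporal::{day, doy, num_days_from_monday, num_days_from_sunday};

#[test]
fn day_of_week_and_day_of_year() {
    // Date32 value 31 is 1970-02-01, a Sunday.
    let dates = Date32Array::from(vec![0, 31]);

    assert_eq!(day(&dates).unwrap(), Int32Array::from(vec![1, 1]));
    assert_eq!(doy(&dates).unwrap(), Int32Array::from(vec![1, 32]));
    // Monday = 0 encoding: Thursday is 3, Sunday is 6.
    assert_eq!(num_days_from_monday(&dates).unwrap(), Int32Array::from(vec![3, 6]));
    // Sunday = 0 encoding: Thursday is 4, Sunday is 0.
    assert_eq!(num_days_from_sunday(&dates).unwrap(), Int32Array::from(vec![4, 0]));
}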
/// If the given array isn't temporal primitive or dictionary array, @@ -659,7 +576,7 @@ pub fn doy_dyn(array: &dyn Array) -> Result { _ => { downcast_temporal_array!( array => { - doy_internal(array) + doy(array) .map(|a| Arc::new(a) as ArrayRef) } dt => return_compute_error_with!("doy does not support", dt), @@ -668,9 +585,9 @@ pub fn doy_dyn(array: &dyn Array) -> Result { } } -/// Extracts the day of year of a given temporal array as an array of integers +/// Extracts the day of year of a given temporal primitive array as an array of integers /// The day of year that ranges from 1 to 366 -fn doy_internal(array: &PrimitiveArray) -> Result +pub fn doy(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, T::Native: ArrowNativeType, @@ -703,15 +620,6 @@ where time_fraction_internal(array, "minute", |t| t.minute() as i32) } -/// Extracts the week of a given temporal primitive array as an array of integers -pub fn week(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: From, -{ - week_internal(array) -} - /// Extracts the week of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. @@ -729,7 +637,7 @@ pub fn week_dyn(array: &dyn Array) -> Result { _ => { downcast_temporal_array!( array => { - week_internal(array) + week(array) .map(|a| Arc::new(a) as ArrayRef) } dt => return_compute_error_with!("week does not support", dt), @@ -738,8 +646,8 @@ pub fn week_dyn(array: &dyn Array) -> Result { } } -/// Extracts the week of a given temporal array as an array of integers -fn week_internal(array: &PrimitiveArray) -> Result +/// Extracts the week of a given temporal primitive array as an array of integers +pub fn week(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, From 430eb84d0d64b15e2f8b13dfa0f3bd014d3e50c9 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Sun, 13 Nov 2022 18:04:53 -0500 Subject: [PATCH 0270/1411] Improve schema mismatch error message (#3098) * Improve schema mismatch error message * fix formatting issues * trivial change to trigger build * fix PR comments * Empty commit Co-authored-by: askoa --- arrow-schema/src/field.rs | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index e414d2834275..4d13f523fb96 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -266,15 +266,16 @@ impl Field { /// ``` pub fn try_merge(&mut self, from: &Field) -> Result<(), ArrowError> { if from.dict_id != self.dict_id { - return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting dict_id".to_string(), - )); + return Err(ArrowError::SchemaError(format!( + "Fail to merge schema field because from dict_id = {} does not match {}", + from.dict_id, self.dict_id + ))); } if from.dict_is_ordered != self.dict_is_ordered { - return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting dict_is_ordered" - .to_string(), - )); + return Err(ArrowError::SchemaError(format!( + "Fail to merge schema field because from dict_is_ordered = {} does not match {}", + from.dict_is_ordered, self.dict_is_ordered + ))); } // merge metadata match (self.metadata(), from.metadata()) { @@ -284,7 +285,8 @@ impl Field { if let Some(self_value) = self_metadata.get(key) { if self_value != from_value { return 
Err(ArrowError::SchemaError(format!( - "Fail to merge field due to conflicting metadata data value for key {}", key), + "Fail to merge field due to conflicting metadata data value for key {}. + From value = {} does not match {}", key, from_value, self_value), )); } } else { @@ -313,10 +315,9 @@ impl Field { } _ => { return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting datatype" - .to_string(), - )); - } + format!("Fail to merge schema field because the from data_type = {} is not DataType::Struct", + from.data_type) + ))} }, DataType::Union(nested_fields, type_ids, _) => match &from.data_type { DataType::Union(from_nested_fields, from_type_ids, _) => { @@ -333,8 +334,8 @@ impl Field { // type id. if self_type_id != field_type_id { return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting type ids in union datatype" - .to_string(), + format!("Fail to merge schema field because the self_type_id = {} does not equal field_type_id = {}", + self_type_id, field_type_id) )); } @@ -351,8 +352,8 @@ impl Field { } _ => { return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting datatype" - .to_string(), + format!("Fail to merge schema field because the from data_type = {} is not DataType::Union", + from.data_type) )); } }, @@ -390,8 +391,8 @@ impl Field { | DataType::Decimal256(_, _) => { if self.data_type != from.data_type { return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting datatype" - .to_string(), + format!("Fail to merge schema field because the from data_type = {} does not equal {}", + from.data_type, self.data_type) )); } } From 0900be27859974b8717185d65422c36d7e735b4e Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 14 Nov 2022 16:59:32 +0800 Subject: [PATCH 0271/1411] Upgrade to thrift 0.17 and fix issues (#3104) * test with thrift 0.17 and fix issues * rebase * remove databend prefix * fix async reader * fix doc err * fix more doc items --- arrow/src/row/dictionary.rs | 2 +- parquet/Cargo.toml | 2 +- parquet/src/arrow/async_reader.rs | 2 +- parquet/src/bloom_filter/mod.rs | 2 +- parquet/src/file/footer.rs | 2 +- parquet/src/file/page_index/index_reader.rs | 2 +- parquet/src/file/serialized_reader.rs | 2 +- parquet/src/file/writer.rs | 2 +- parquet/src/format.rs | 494 +++++++++++++------- 9 files changed, 330 insertions(+), 180 deletions(-) diff --git a/arrow/src/row/dictionary.rs b/arrow/src/row/dictionary.rs index d8426ad0c3e6..82169a37d359 100644 --- a/arrow/src/row/dictionary.rs +++ b/arrow/src/row/dictionary.rs @@ -260,7 +260,7 @@ unsafe fn decode_fixed( .add_buffer(buffer.into()); // SAFETY: Buffers correct length - unsafe { builder.build_unchecked() } + builder.build_unchecked() } /// Decodes a `PrimitiveArray` from dictionary values diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index dda0518f94f1..a5d43bf54bfa 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -41,7 +41,7 @@ arrow-ipc = { version = "27.0.0", path = "../arrow-ipc", default-features = fals ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } bytes = { version = "1.1", default-features = false, features = ["std"] } -thrift = { version = "0.16", default-features = false } +thrift = { version = "0.17", default-features = false } snap = { version = "1.0", default-features = false, optional = true } brotli = { version = "3.3", default-features = false, features = ["std"], optional = true } flate2 = { version = "1.0", default-features = false, features = 
["rust_backend"], optional = true } diff --git a/parquet/src/arrow/async_reader.rs b/parquet/src/arrow/async_reader.rs index d52fa0406bfa..e182cccbcea3 100644 --- a/parquet/src/arrow/async_reader.rs +++ b/parquet/src/arrow/async_reader.rs @@ -89,7 +89,7 @@ use bytes::{Buf, Bytes}; use futures::future::{BoxFuture, FutureExt}; use futures::ready; use futures::stream::Stream; -use thrift::protocol::TCompactInputProtocol; +use thrift::protocol::{TCompactInputProtocol, TSerializable}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index 770fb53e8d28..adfd87307ac6 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -25,7 +25,7 @@ use crate::format::{ }; use std::hash::Hasher; use std::io::{Read, Seek, SeekFrom}; -use thrift::protocol::TCompactInputProtocol; +use thrift::protocol::{TCompactInputProtocol, TSerializable}; use twox_hash::XxHash64; /// Salt as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach) diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index e8a114db75b4..27c07b78d7cf 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -18,7 +18,7 @@ use std::{io::Read, sync::Arc}; use crate::format::{ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData}; -use thrift::protocol::TCompactInputProtocol; +use thrift::protocol::{TCompactInputProtocol, TSerializable}; use crate::basic::ColumnOrder; diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 99877a92105a..af23c0bd9f01 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -23,7 +23,7 @@ use crate::file::page_index::index::{BooleanIndex, ByteArrayIndex, Index, Native use crate::file::reader::ChunkReader; use crate::format::{ColumnIndex, OffsetIndex, PageLocation}; use std::io::{Cursor, Read}; -use thrift::protocol::TCompactInputProtocol; +use thrift::protocol::{TCompactInputProtocol, TSerializable}; /// Read on row group's all columns indexes and change into [`Index`] /// If not the format not available return an empty vector. 
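With thrift 0.17 the generated read/write methods move from inherent functions onto the `TSerializable` trait, which is why the readers above now import `TSerializable` next to `TCompactInputProtocol`. A sketch of the resulting decode pattern, assuming the vendored generated code is exposed as `parquet::format`; the buffer handling is illustrative, not the crate's exact footer code:

use parquet::format::FileMetaData;
use thrift::protocol::{TCompactInputProtocol, TSerializable};

/// Decode a compact-Thrift-encoded parquet footer payload from raw metadata bytes.
fn decode_footer_metadata(buf: &[u8]) -> thrift::Result<FileMetaData> {
    // `TSerializable` must be in scope for `read_from_in_protocol` to resolve.
    let mut prot = TCompactInputProtocol::new(buf);
    FileMetaData::read_from_in_protocol(&mut prot)
}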
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index a400d4dabcb1..ebe87aca6d5e 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -24,7 +24,7 @@ use std::{convert::TryFrom, fs::File, io::Read, path::Path, sync::Arc}; use crate::format::{PageHeader, PageLocation, PageType}; use bytes::{Buf, Bytes}; -use thrift::protocol::TCompactInputProtocol; +use thrift::protocol::{TCompactInputProtocol, TSerializable}; use crate::basic::{Encoding, Type}; use crate::column::page::{Page, PageMetadata, PageReader}; diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 528f72494190..2efaf7cafc2e 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -22,7 +22,7 @@ use std::{io::Write, sync::Arc}; use crate::format as parquet; use crate::format::{ColumnIndex, OffsetIndex, RowGroup}; -use thrift::protocol::{TCompactOutputProtocol, TOutputProtocol}; +use thrift::protocol::{TCompactOutputProtocol, TOutputProtocol, TSerializable}; use crate::basic::PageType; use crate::column::writer::{ diff --git a/parquet/src/format.rs b/parquet/src/format.rs index 6fb2e32ebcfc..0851b2287fba 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -1,4 +1,4 @@ -// Autogenerated by Thrift Compiler (0.16.0) +// Autogenerated by Thrift Compiler (0.17.0) // DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING #![allow(unused_imports)] @@ -17,7 +17,7 @@ use std::rc::Rc; use thrift::OrderedFloat; use thrift::{ApplicationError, ApplicationErrorKind, ProtocolError, ProtocolErrorKind, TThriftClient}; -use thrift::protocol::{TFieldIdentifier, TListIdentifier, TMapIdentifier, TMessageIdentifier, TMessageType, TInputProtocol, TOutputProtocol, TSetIdentifier, TStructIdentifier, TType}; +use thrift::protocol::{TFieldIdentifier, TListIdentifier, TMapIdentifier, TMessageIdentifier, TMessageType, TInputProtocol, TOutputProtocol, TSerializable, TSetIdentifier, TStructIdentifier, TType}; use thrift::protocol::field_id; use thrift::protocol::verify_expected_message_type; use thrift::protocol::verify_expected_sequence_number; @@ -50,11 +50,14 @@ impl Type { Self::BYTE_ARRAY, Self::FIXED_LEN_BYTE_ARRAY, ]; +} + +impl TSerializable for Type { #[allow(clippy::trivially_copy_pass_by_ref)] - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { o_prot.write_i32(self.0) } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(Type::from(enum_value)) } @@ -96,7 +99,7 @@ impl From<&Type> for i32 { /// DEPRECATED: Common types used by frameworks(e.g. hive, pig) using parquet. /// ConvertedType is superseded by LogicalType. This enum should not be extended. -/// +/// /// See LogicalTypes.md for conversion between ConvertedType and LogicalType. #[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct ConvertedType(pub i32); @@ -114,12 +117,12 @@ impl ConvertedType { /// an enum is converted into a binary field pub const ENUM: ConvertedType = ConvertedType(4); /// A decimal value. - /// + /// /// This may be used to annotate binary or fixed primitive types. 
The /// underlying byte array stores the unscaled value encoded as two's /// complement using big-endian byte order (the most significant byte is the /// zeroth element). The value of the decimal is the value * 10^{-scale}. - /// + /// /// This must be accompanied by a (maximum) precision and a scale in the /// SchemaElement. The precision specifies the number of digits in the decimal /// and the scale stores the location of the decimal point. For example 1.23 @@ -127,62 +130,62 @@ impl ConvertedType { /// 2 digits over). pub const DECIMAL: ConvertedType = ConvertedType(5); /// A Date - /// + /// /// Stored as days since Unix epoch, encoded as the INT32 physical type. - /// + /// pub const DATE: ConvertedType = ConvertedType(6); /// A time - /// + /// /// The total number of milliseconds since midnight. The value is stored /// as an INT32 physical type. pub const TIME_MILLIS: ConvertedType = ConvertedType(7); /// A time. - /// + /// /// The total number of microseconds since midnight. The value is stored as /// an INT64 physical type. pub const TIME_MICROS: ConvertedType = ConvertedType(8); /// A date/time combination - /// + /// /// Date and time recorded as milliseconds since the Unix epoch. Recorded as /// a physical type of INT64. pub const TIMESTAMP_MILLIS: ConvertedType = ConvertedType(9); /// A date/time combination - /// + /// /// Date and time recorded as microseconds since the Unix epoch. The value is /// stored as an INT64 physical type. pub const TIMESTAMP_MICROS: ConvertedType = ConvertedType(10); /// An unsigned integer value. - /// + /// /// The number describes the maximum number of meaningful data bits in /// the stored value. 8, 16 and 32 bit values are stored using the /// INT32 physical type. 64 bit values are stored using the INT64 /// physical type. - /// + /// pub const UINT_8: ConvertedType = ConvertedType(11); pub const UINT_16: ConvertedType = ConvertedType(12); pub const UINT_32: ConvertedType = ConvertedType(13); pub const UINT_64: ConvertedType = ConvertedType(14); /// A signed integer value. - /// + /// /// The number describes the maximum number of meaningful data bits in /// the stored value. 8, 16 and 32 bit values are stored using the /// INT32 physical type. 64 bit values are stored using the INT64 /// physical type. - /// + /// pub const INT_8: ConvertedType = ConvertedType(15); pub const INT_16: ConvertedType = ConvertedType(16); pub const INT_32: ConvertedType = ConvertedType(17); pub const INT_64: ConvertedType = ConvertedType(18); /// An embedded JSON document - /// + /// /// A JSON document embedded within a single UTF8 column. pub const JSON: ConvertedType = ConvertedType(19); /// An embedded BSON document - /// + /// /// A BSON document embedded within a single BINARY column. pub const BSON: ConvertedType = ConvertedType(20); /// An interval of time - /// + /// /// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12 /// This data is composed of three separate little endian unsigned /// integers. Each stores a component of a duration of time. 
The first @@ -216,11 +219,14 @@ impl ConvertedType { Self::BSON, Self::INTERVAL, ]; +} + +impl TSerializable for ConvertedType { #[allow(clippy::trivially_copy_pass_by_ref)] - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { o_prot.write_i32(self.0) } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(ConvertedType::from(enum_value)) } @@ -290,11 +296,14 @@ impl FieldRepetitionType { Self::OPTIONAL, Self::REPEATED, ]; +} + +impl TSerializable for FieldRepetitionType { #[allow(clippy::trivially_copy_pass_by_ref)] - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { o_prot.write_i32(self.0) } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(FieldRepetitionType::from(enum_value)) } @@ -385,11 +394,14 @@ impl Encoding { Self::RLE_DICTIONARY, Self::BYTE_STREAM_SPLIT, ]; +} + +impl TSerializable for Encoding { #[allow(clippy::trivially_copy_pass_by_ref)] - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { o_prot.write_i32(self.0) } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(Encoding::from(enum_value)) } @@ -431,11 +443,11 @@ impl From<&Encoding> for i32 { } /// Supported compression algorithms. -/// +/// /// Codecs added in format version X.Y can be read by readers based on X.Y and later. /// Codec support may vary between readers based on the format version and /// libraries available at runtime. -/// +/// /// See Compression.md for a detailed specification of these algorithms. 
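The DECIMAL annotation described above stores only an unscaled two's-complement integer, so the logical value is `unscaled * 10^(-scale)` with precision and scale carried in the schema. A tiny illustration of that rule; the helper below is hypothetical and not part of the generated code:

#[test]
fn decimal_unscaled_value_rule() {
    // Hypothetical helper: render an INT32-backed DECIMAL as text without going through floats.
    fn decimal_i32_to_string(unscaled: i32, scale: u32) -> String {
        let factor = 10i64.pow(scale);
        let v = i64::from(unscaled);
        format!("{}.{:0width$}", v / factor, (v % factor).abs(), width = scale as usize)
    }

    // value = unscaled * 10^(-scale): 123 with scale 2 is 1.23.
    assert_eq!(decimal_i32_to_string(123, 2), "1.23");
}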
#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct CompressionCodec(pub i32); @@ -459,11 +471,14 @@ impl CompressionCodec { Self::ZSTD, Self::LZ4_RAW, ]; +} + +impl TSerializable for CompressionCodec { #[allow(clippy::trivially_copy_pass_by_ref)] - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { o_prot.write_i32(self.0) } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(CompressionCodec::from(enum_value)) } @@ -517,11 +532,14 @@ impl PageType { Self::DICTIONARY_PAGE, Self::DATA_PAGE_V2, ]; +} + +impl TSerializable for PageType { #[allow(clippy::trivially_copy_pass_by_ref)] - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { o_prot.write_i32(self.0) } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(PageType::from(enum_value)) } @@ -571,11 +589,14 @@ impl BoundaryOrder { Self::ASCENDING, Self::DESCENDING, ]; +} + +impl TSerializable for BoundaryOrder { #[allow(clippy::trivially_copy_pass_by_ref)] - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { o_prot.write_i32(self.0) } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(BoundaryOrder::from(enum_value)) } @@ -619,14 +640,14 @@ impl From<&BoundaryOrder> for i32 { #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct Statistics { /// DEPRECATED: min and max value of the column. Use min_value and max_value. - /// + /// /// Values are encoded using PLAIN encoding, except that variable-length byte /// arrays do not include a length prefix. - /// + /// /// These fields encode min and max values determined by signed comparison /// only. New files should use the correct order for a column's logical type /// and store the values in the min_value and max_value fields. - /// + /// /// To support older readers, these may be set when the column order is /// signed. pub max: Option>, @@ -636,7 +657,7 @@ pub struct Statistics { /// count of distinct values occurring pub distinct_count: Option, /// Min and max values for the column, determined by its ColumnOrder. - /// + /// /// Values are encoded using PLAIN encoding, except that variable-length byte /// arrays do not include a length prefix. 
pub max_value: Option>, @@ -654,7 +675,10 @@ impl Statistics { min_value: min_value.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for Statistics { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; let mut f_2: Option> = None; @@ -710,7 +734,7 @@ impl Statistics { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("Statistics"); o_prot.write_struct_begin(&struct_ident)?; if let Some(ref fld_var) = self.max { @@ -774,7 +798,10 @@ impl StringType { pub fn new() -> StringType { StringType {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for StringType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -793,7 +820,7 @@ impl StringType { let ret = StringType {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("StringType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -819,7 +846,10 @@ impl UUIDType { pub fn new() -> UUIDType { UUIDType {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for UUIDType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -838,7 +868,7 @@ impl UUIDType { let ret = UUIDType {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("UUIDType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -864,7 +894,10 @@ impl MapType { pub fn new() -> MapType { MapType {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for MapType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -883,7 +916,7 @@ impl MapType { let ret = MapType {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("MapType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -909,7 +942,10 @@ impl ListType { pub fn new() -> ListType { ListType {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for ListType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -928,7 +964,7 @@ impl ListType { let ret = ListType {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn 
write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("ListType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -954,7 +990,10 @@ impl EnumType { pub fn new() -> EnumType { EnumType {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for EnumType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -973,7 +1012,7 @@ impl EnumType { let ret = EnumType {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("EnumType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -999,7 +1038,10 @@ impl DateType { pub fn new() -> DateType { DateType {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for DateType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -1018,7 +1060,7 @@ impl DateType { let ret = DateType {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("DateType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1037,7 +1079,7 @@ impl Default for DateType { // /// Logical type to annotate a column that is always null. -/// +/// /// Sometimes when discovering the schema of existing data, values are always /// null and the physical type can't be determined. This annotation signals /// the case where the physical type was guessed from all null values. @@ -1049,7 +1091,10 @@ impl NullType { pub fn new() -> NullType { NullType {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for NullType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -1068,7 +1113,7 @@ impl NullType { let ret = NullType {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("NullType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1087,10 +1132,10 @@ impl Default for NullType { // /// Decimal logical type annotation -/// +/// /// To maintain forward-compatibility in v1, implementations using this logical /// type must also set scale and precision on the annotated SchemaElement. 
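As the DecimalType comment above notes, a writer that uses the DECIMAL logical type and wants v1 forward-compatibility should also populate `scale`, `precision`, and the DECIMAL converted type on the `SchemaElement`. A hedged sketch of building the logical type with the generated structs (field values are illustrative):

use parquet::format::{DecimalType, LogicalType};

#[test]
fn decimal_logical_type_fields() {
    // DECIMAL(precision = 9, scale = 2).
    let logical = LogicalType::DECIMAL(DecimalType { scale: 2, precision: 9 });

    match logical {
        LogicalType::DECIMAL(d) => {
            assert_eq!(d.scale, 2);
            assert_eq!(d.precision, 9);
        }
        _ => unreachable!(),
    }
}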
-/// +/// /// Allowed for physical types: INT32, INT64, FIXED, and BINARY #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DecimalType { @@ -1105,7 +1150,10 @@ impl DecimalType { precision, } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for DecimalType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -1139,7 +1187,7 @@ impl DecimalType { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("DecimalType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("scale", TType::I32, 1))?; @@ -1166,7 +1214,10 @@ impl MilliSeconds { pub fn new() -> MilliSeconds { MilliSeconds {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for MilliSeconds { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -1185,7 +1236,7 @@ impl MilliSeconds { let ret = MilliSeconds {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("MilliSeconds"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1211,7 +1262,10 @@ impl MicroSeconds { pub fn new() -> MicroSeconds { MicroSeconds {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for MicroSeconds { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -1230,7 +1284,7 @@ impl MicroSeconds { let ret = MicroSeconds {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("MicroSeconds"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1256,7 +1310,10 @@ impl NanoSeconds { pub fn new() -> NanoSeconds { NanoSeconds {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for NanoSeconds { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -1275,7 +1332,7 @@ impl NanoSeconds { let ret = NanoSeconds {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("NanoSeconds"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1300,8 +1357,8 @@ pub enum TimeUnit { NANOS(NanoSeconds), } -impl TimeUnit { - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl TSerializable for TimeUnit { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let mut ret: 
Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -1363,7 +1420,7 @@ impl TimeUnit { Ok(ret.expect("return value should have been constructed")) } } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("TimeUnit"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -1393,7 +1450,7 @@ impl TimeUnit { // /// Timestamp logical type annotation -/// +/// /// Allowed for physical types: INT64 #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct TimestampType { @@ -1408,7 +1465,10 @@ impl TimestampType { unit, } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for TimestampType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -1442,7 +1502,7 @@ impl TimestampType { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("TimestampType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("isAdjustedToUTC", TType::Bool, 1))?; @@ -1461,7 +1521,7 @@ impl TimestampType { // /// Time logical type annotation -/// +/// /// Allowed for physical types: INT32 (millis), INT64 (micros, nanos) #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct TimeType { @@ -1476,7 +1536,10 @@ impl TimeType { unit, } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for TimeType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -1510,7 +1573,7 @@ impl TimeType { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("TimeType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("isAdjustedToUTC", TType::Bool, 1))?; @@ -1529,9 +1592,9 @@ impl TimeType { // /// Integer logical type annotation -/// +/// /// bitWidth must be 8, 16, 32, or 64. 
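The `TimeUnit` thrift union above maps to a plain Rust enum, so a timestamp logical type is built by choosing one of its variants. A sketch under the assumption that the generated field keeps the snake_cased name `is_adjusted_to_u_t_c`:

use parquet::format::{LogicalType, MicroSeconds, TimeUnit, TimestampType};

#[test]
fn timestamp_logical_type_with_micros_unit() {
    // TIMESTAMP(isAdjustedToUTC = true, unit = MICROS): an absolute instant in microseconds.
    let logical = LogicalType::TIMESTAMP(TimestampType {
        is_adjusted_to_u_t_c: true,
        unit: TimeUnit::MICROS(MicroSeconds {}),
    });

    assert!(matches!(
        logical,
        LogicalType::TIMESTAMP(TimestampType {
            unit: TimeUnit::MICROS(_),
            ..
        })
    ));
}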
-/// +/// /// Allowed for physical types: INT32, INT64 #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct IntType { @@ -1546,7 +1609,10 @@ impl IntType { is_signed, } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for IntType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -1580,7 +1646,7 @@ impl IntType { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("IntType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("bitWidth", TType::I08, 1))?; @@ -1599,7 +1665,7 @@ impl IntType { // /// Embedded JSON logical type annotation -/// +/// /// Allowed for physical types: BINARY #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct JsonType { @@ -1609,7 +1675,10 @@ impl JsonType { pub fn new() -> JsonType { JsonType {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for JsonType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -1628,7 +1697,7 @@ impl JsonType { let ret = JsonType {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("JsonType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1647,7 +1716,7 @@ impl Default for JsonType { // /// Embedded BSON logical type annotation -/// +/// /// Allowed for physical types: BINARY #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct BsonType { @@ -1657,7 +1726,10 @@ impl BsonType { pub fn new() -> BsonType { BsonType {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for BsonType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -1676,7 +1748,7 @@ impl BsonType { let ret = BsonType {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("BsonType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1711,8 +1783,8 @@ pub enum LogicalType { UUID(UUIDType), } -impl LogicalType { - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl TSerializable for LogicalType { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -1844,7 +1916,7 @@ impl LogicalType { Ok(ret.expect("return value should have been constructed")) } } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = 
TStructIdentifier::new("LogicalType"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -1948,12 +2020,12 @@ pub struct SchemaElement { pub num_children: Option, /// DEPRECATED: When the schema is the result of a conversion from another model. /// Used to record the original type to help with cross conversion. - /// + /// /// This is superseded by logicalType. pub converted_type: Option, /// DEPRECATED: Used when this column contains decimal data. /// See the DECIMAL converted type for more details. - /// + /// /// This is superseded by using the DecimalType annotation in logicalType. pub scale: Option, pub precision: Option, @@ -1961,7 +2033,7 @@ pub struct SchemaElement { /// original field id in the parquet schema pub field_id: Option, /// The logical type of this SchemaElement - /// + /// /// LogicalType replaces ConvertedType, but ConvertedType is still required /// for some logical types to ensure forward-compatibility in format v1. pub logical_type: Option, @@ -1982,7 +2054,10 @@ impl SchemaElement { logical_type: logical_type.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for SchemaElement { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -2063,7 +2138,7 @@ impl SchemaElement { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("SchemaElement"); o_prot.write_struct_begin(&struct_ident)?; if let Some(ref fld_var) = self.type_ { @@ -2148,7 +2223,10 @@ impl DataPageHeader { statistics: statistics.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for DataPageHeader { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -2202,7 +2280,7 @@ impl DataPageHeader { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("DataPageHeader"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("num_values", TType::I32, 1))?; @@ -2239,7 +2317,10 @@ impl IndexPageHeader { pub fn new() -> IndexPageHeader { IndexPageHeader {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for IndexPageHeader { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -2258,7 +2339,7 @@ impl IndexPageHeader { let ret = IndexPageHeader {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("IndexPageHeader"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -2294,7 +2375,10 @@ impl DictionaryPageHeader { is_sorted: is_sorted.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for 
DictionaryPageHeader { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -2334,7 +2418,7 @@ impl DictionaryPageHeader { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("DictionaryPageHeader"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("num_values", TType::I32, 1))?; @@ -2360,7 +2444,7 @@ impl DictionaryPageHeader { /// New page format allowing reading levels without decompressing the data /// Repetition and definition levels are uncompressed /// The remaining section containing the data is compressed if is_compressed is true -/// +/// #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DataPageHeaderV2 { /// Number of values, including NULLs, in this data page. * @@ -2399,7 +2483,10 @@ impl DataPageHeaderV2 { statistics: statistics.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for DataPageHeaderV2 { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -2473,7 +2560,7 @@ impl DataPageHeaderV2 { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("DataPageHeaderV2"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("num_values", TType::I32, 1))?; @@ -2522,7 +2609,10 @@ impl SplitBlockAlgorithm { pub fn new() -> SplitBlockAlgorithm { SplitBlockAlgorithm {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for SplitBlockAlgorithm { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -2541,7 +2631,7 @@ impl SplitBlockAlgorithm { let ret = SplitBlockAlgorithm {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("SplitBlockAlgorithm"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -2564,8 +2654,8 @@ pub enum BloomFilterAlgorithm { BLOCK(SplitBlockAlgorithm), } -impl BloomFilterAlgorithm { - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl TSerializable for BloomFilterAlgorithm { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -2613,7 +2703,7 @@ impl BloomFilterAlgorithm { Ok(ret.expect("return value should have been constructed")) } } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("BloomFilterAlgorithm"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -2634,7 +2724,7 
@@ impl BloomFilterAlgorithm { /// Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash /// algorithm. It uses 64 bits version of xxHash. -/// +/// #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct XxHash { } @@ -2643,7 +2733,10 @@ impl XxHash { pub fn new() -> XxHash { XxHash {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for XxHash { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -2662,7 +2755,7 @@ impl XxHash { let ret = XxHash {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("XxHash"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -2685,8 +2778,8 @@ pub enum BloomFilterHash { XXHASH(XxHash), } -impl BloomFilterHash { - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl TSerializable for BloomFilterHash { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -2734,7 +2827,7 @@ impl BloomFilterHash { Ok(ret.expect("return value should have been constructed")) } } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("BloomFilterHash"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -2754,7 +2847,7 @@ impl BloomFilterHash { // /// The compression used in the Bloom filter. 
-/// +/// #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct Uncompressed { } @@ -2763,7 +2856,10 @@ impl Uncompressed { pub fn new() -> Uncompressed { Uncompressed {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for Uncompressed { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -2782,7 +2878,7 @@ impl Uncompressed { let ret = Uncompressed {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("Uncompressed"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -2805,8 +2901,8 @@ pub enum BloomFilterCompression { UNCOMPRESSED(Uncompressed), } -impl BloomFilterCompression { - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl TSerializable for BloomFilterCompression { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -2854,7 +2950,7 @@ impl BloomFilterCompression { Ok(ret.expect("return value should have been constructed")) } } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("BloomFilterCompression"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -2875,7 +2971,7 @@ impl BloomFilterCompression { /// Bloom filter header is stored at beginning of Bloom filter data of each column /// and followed by its bitset. -/// +/// #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct BloomFilterHeader { /// The size of bitset in bytes * @@ -2897,7 +2993,10 @@ impl BloomFilterHeader { compression, } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for BloomFilterHeader { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -2945,7 +3044,7 @@ impl BloomFilterHeader { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("BloomFilterHeader"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("numBytes", TType::I32, 1))?; @@ -3002,7 +3101,7 @@ pub struct PageHeader { /// encryption itself is performed after page compression (if compressed) /// If enabled, this allows for disabling checksumming in HDFS if only a few /// pages need to be read. 
- /// + /// pub crc: Option, pub data_page_header: Option, pub index_page_header: Option, @@ -3023,7 +3122,10 @@ impl PageHeader { data_page_header_v2: data_page_header_v2.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for PageHeader { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -3094,7 +3196,7 @@ impl PageHeader { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("PageHeader"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("type", TType::I32, 1))?; @@ -3154,7 +3256,10 @@ impl KeyValue { value: value.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for KeyValue { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -3187,7 +3292,7 @@ impl KeyValue { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("KeyValue"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("key", TType::String, 1))?; @@ -3227,7 +3332,10 @@ impl SortingColumn { nulls_first, } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for SortingColumn { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -3268,7 +3376,7 @@ impl SortingColumn { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("SortingColumn"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("column_idx", TType::I32, 1))?; @@ -3308,7 +3416,10 @@ impl PageEncodingStats { count, } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for PageEncodingStats { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -3349,7 +3460,7 @@ impl PageEncodingStats { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("PageEncodingStats"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("page_type", TType::I32, 1))?; @@ -3426,7 +3537,10 @@ impl ColumnMetaData { bloom_filter_offset: bloom_filter_offset.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for ColumnMetaData { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { 
i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option> = None; @@ -3562,7 +3676,7 @@ impl ColumnMetaData { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("ColumnMetaData"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("type", TType::I32, 1))?; @@ -3652,7 +3766,10 @@ impl EncryptionWithFooterKey { pub fn new() -> EncryptionWithFooterKey { EncryptionWithFooterKey {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for EncryptionWithFooterKey { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -3671,7 +3788,7 @@ impl EncryptionWithFooterKey { let ret = EncryptionWithFooterKey {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("EncryptionWithFooterKey"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -3704,7 +3821,10 @@ impl EncryptionWithColumnKey { key_metadata: key_metadata.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for EncryptionWithColumnKey { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; let mut f_2: Option> = None; @@ -3743,7 +3863,7 @@ impl EncryptionWithColumnKey { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("EncryptionWithColumnKey"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("path_in_schema", TType::List, 1))?; @@ -3773,8 +3893,8 @@ pub enum ColumnCryptoMetaData { ENCRYPTIONWITHCOLUMNKEY(EncryptionWithColumnKey), } -impl ColumnCryptoMetaData { - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl TSerializable for ColumnCryptoMetaData { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -3829,7 +3949,7 @@ impl ColumnCryptoMetaData { Ok(ret.expect("return value should have been constructed")) } } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("ColumnCryptoMetaData"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -3857,14 +3977,14 @@ impl ColumnCryptoMetaData { pub struct ColumnChunk { /// File where column data is stored. If not set, assumed to be same file as /// metadata. This path is relative to the current file. - /// + /// pub file_path: Option, /// Byte offset in file_path to the ColumnMetaData * pub file_offset: i64, /// Column metadata for this chunk. This is the same content as what is at /// file_path/file_offset. 
Having it here has it replicated in the file /// metadata. - /// + /// pub meta_data: Option, /// File offset of ColumnChunk's OffsetIndex * pub offset_index_offset: Option, @@ -3894,7 +4014,10 @@ impl ColumnChunk { encrypted_column_metadata: encrypted_column_metadata.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for ColumnChunk { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -3969,7 +4092,7 @@ impl ColumnChunk { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("ColumnChunk"); o_prot.write_struct_begin(&struct_ident)?; if let Some(ref fld_var) = self.file_path { @@ -4028,7 +4151,7 @@ impl ColumnChunk { pub struct RowGroup { /// Metadata for each column chunk in this row group. /// This list must have the same order as the SchemaElement list in FileMetaData. - /// + /// pub columns: Vec, /// Total byte size of all the uncompressed column data in this row group * pub total_byte_size: i64, @@ -4059,7 +4182,10 @@ impl RowGroup { ordinal: ordinal.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for RowGroup { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; let mut f_2: Option = None; @@ -4136,7 +4262,7 @@ impl RowGroup { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("RowGroup"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("columns", TType::List, 1))?; @@ -4194,7 +4320,10 @@ impl TypeDefinedOrder { pub fn new() -> TypeDefinedOrder { TypeDefinedOrder {} } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for TypeDefinedOrder { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -4213,7 +4342,7 @@ impl TypeDefinedOrder { let ret = TypeDefinedOrder {}; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("TypeDefinedOrder"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -4236,8 +4365,8 @@ pub enum ColumnOrder { TYPEORDER(TypeDefinedOrder), } -impl ColumnOrder { - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl TSerializable for ColumnOrder { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -4285,7 +4414,7 @@ impl ColumnOrder { Ok(ret.expect("return value should have been constructed")) } } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { 
let struct_ident = TStructIdentifier::new("ColumnOrder"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -4324,7 +4453,10 @@ impl PageLocation { first_row_index, } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for PageLocation { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -4365,7 +4497,7 @@ impl PageLocation { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("PageLocation"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("offset", TType::I64, 1))?; @@ -4399,7 +4531,10 @@ impl OffsetIndex { page_locations, } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for OffsetIndex { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; loop { @@ -4432,7 +4567,7 @@ impl OffsetIndex { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("OffsetIndex"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("page_locations", TType::List, 1))?; @@ -4470,7 +4605,7 @@ pub struct ColumnIndex { /// that list entries are populated before using them by inspecting null_pages. pub min_values: Vec>, pub max_values: Vec>, - /// Stores whether both min_values and max_values are orderd and if so, in + /// Stores whether both min_values and max_values are ordered and if so, in /// which direction. This allows readers to perform binary searches in both /// lists. Readers cannot assume that max_values\[i\] <= min_values\[i+1\], even /// if the lists are ordered. 
@@ -4489,7 +4624,10 @@ impl ColumnIndex { null_counts: null_counts.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for ColumnIndex { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; let mut f_2: Option>> = None; @@ -4567,7 +4705,7 @@ impl ColumnIndex { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("ColumnIndex"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("null_pages", TType::List, 1))?; @@ -4631,7 +4769,10 @@ impl AesGcmV1 { supply_aad_prefix: supply_aad_prefix.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for AesGcmV1 { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; let mut f_2: Option> = None; @@ -4669,7 +4810,7 @@ impl AesGcmV1 { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("AesGcmV1"); o_prot.write_struct_begin(&struct_ident)?; if let Some(ref fld_var) = self.aad_prefix { @@ -4725,7 +4866,10 @@ impl AesGcmCtrV1 { supply_aad_prefix: supply_aad_prefix.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for AesGcmCtrV1 { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; let mut f_2: Option> = None; @@ -4763,7 +4907,7 @@ impl AesGcmCtrV1 { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("AesGcmCtrV1"); o_prot.write_struct_begin(&struct_ident)?; if let Some(ref fld_var) = self.aad_prefix { @@ -4806,8 +4950,8 @@ pub enum EncryptionAlgorithm { AESGCMCTRV1(AesGcmCtrV1), } -impl EncryptionAlgorithm { - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl TSerializable for EncryptionAlgorithm { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -4862,7 +5006,7 @@ impl EncryptionAlgorithm { Ok(ret.expect("return value should have been constructed")) } } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("EncryptionAlgorithm"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -4907,17 +5051,17 @@ pub struct FileMetaData { /// String for application that wrote this file. This should be in the format /// `` version `` (build ``). /// e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) - /// + /// pub created_by: Option, /// Sort order used for the min_value and max_value fields of each column in /// this file. 
Sort orders are listed in the order matching the columns in the /// schema. The indexes are not necessary the same though, because only leaf /// nodes of the schema are represented in the list of sort orders. - /// + /// /// Without column_orders, the meaning of the min_value and max_value fields is /// undefined. To ensure well-defined behaviour, if min_value and max_value are /// written to a Parquet file, column_orders must be written as well. - /// + /// /// The obsolete min and max fields are always sorted by signed comparison /// regardless of column_orders. pub column_orders: Option>, @@ -4944,7 +5088,10 @@ impl FileMetaData { footer_signing_key_metadata: footer_signing_key_metadata.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for FileMetaData { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option> = None; @@ -5046,7 +5193,7 @@ impl FileMetaData { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("FileMetaData"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("version", TType::I32, 1))?; @@ -5130,7 +5277,10 @@ impl FileCryptoMetaData { key_metadata: key_metadata.into(), } } - pub fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +} + +impl TSerializable for FileCryptoMetaData { + fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option> = None; @@ -5163,7 +5313,7 @@ impl FileCryptoMetaData { }; Ok(ret) } - pub fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("FileCryptoMetaData"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("encryption_algorithm", TType::Struct, 1))?; From fc06c84f43b743b38db19ef4a8977a58f61c34d7 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 14 Nov 2022 11:55:01 -0800 Subject: [PATCH 0272/1411] Implements more temporal kernels using time_fraction_dyn (#3107) --- arrow/src/compute/kernels/temporal.rs | 305 +++----------------------- 1 file changed, 25 insertions(+), 280 deletions(-) diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index c94e21a1b2e5..9ade79969988 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -241,26 +241,7 @@ where /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. 
pub fn year_dyn(array: &dyn Array) -> Result { - match array.data_type().clone() { - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - array => { - let year_values = year_dyn(array.values())?; - Ok(Arc::new(array.with_values(&year_values))) - } - dt => return_compute_error_with!("year does not support", dt), - ) - } - _ => { - downcast_temporal_array!( - array => { - year(array) - .map(|a| Arc::new(a) as ArrayRef) - } - dt => return_compute_error_with!("year does not support", dt), - ) - } - } + time_fraction_dyn(array, "year", |t| t.year() as i32) } /// Extracts the years of a given temporal primitive array as an array of integers @@ -269,44 +250,14 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - match array.data_type() { - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _) => { - let b = Int32Builder::with_capacity(array.len()); - let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::<&PrimitiveArray, T, _>( - iter, - b, - |t| t.year(), - )) - } - _t => return_compute_error_with!("year does not support", array.data_type()), - } + time_fraction_internal(array, "year", |t| t.year() as i32) } /// Extracts the quarter of a given temporal array as an array of integersa within /// the range of [1, 4]. If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn quarter_dyn(array: &dyn Array) -> Result { - match array.data_type().clone() { - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - array => { - let quarter_values = quarter_dyn(array.values())?; - Ok(Arc::new(array.with_values(&quarter_values))) - } - dt => return_compute_error_with!("quarter does not support", dt), - ) - } - _ => { - downcast_temporal_array!( - array => { - quarter(array) - .map(|a| Arc::new(a) as ArrayRef) - } - dt => return_compute_error_with!("quarter does not support", dt), - ) - } - } + time_fraction_dyn(array, "quarter", |t| t.quarter() as i32) } /// Extracts the quarter of a given temporal primitive array as an array of integers within @@ -316,48 +267,14 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - let b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { - let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| { - t.quarter() as i32 - })) - } - DataType::Timestamp(_, Some(tz)) => { - let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { - t.quarter() as i32 - }) - } - _ => return_compute_error_with!("quarter does not support", array.data_type()), - } + time_fraction_internal(array, "quarter", |t| t.quarter() as i32) } /// Extracts the month of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. 
pub fn month_dyn(array: &dyn Array) -> Result { - match array.data_type().clone() { - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - array => { - let month_values = month_dyn(array.values())?; - Ok(Arc::new(array.with_values(&month_values))) - } - dt => return_compute_error_with!("month does not support", dt), - ) - } - _ => { - downcast_temporal_array!( - array => { - month(array) - .map(|a| Arc::new(a) as ArrayRef) - } - dt => return_compute_error_with!("month does not support", dt), - ) - } - } + time_fraction_dyn(array, "month", |t| t.month() as i32) } /// Extracts the month of a given temporal primitive array as an array of integers within @@ -367,22 +284,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - let b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { - let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| { - t.month() as i32 - })) - } - DataType::Timestamp(_, Some(tz)) => { - let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { - t.month() as i32 - }) - } - _ => return_compute_error_with!("month does not support", array.data_type()), - } + time_fraction_internal(array, "month", |t| t.month() as i32) } /// Extracts the day of week of a given temporal array as an array of @@ -395,26 +297,9 @@ where /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn num_days_from_monday_dyn(array: &dyn Array) -> Result { - match array.data_type().clone() { - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - array => { - let values = num_days_from_monday_dyn(array.values())?; - Ok(Arc::new(array.with_values(&values))) - } - dt => return_compute_error_with!("num_days_from_monday does not support", dt), - ) - } - _ => { - downcast_temporal_array!( - array => { - num_days_from_monday(array) - .map(|a| Arc::new(a) as ArrayRef) - } - dt => return_compute_error_with!("num_days_from_monday does not support", dt), - ) - } - } + time_fraction_dyn(array, "num_days_from_monday", |t| { + t.num_days_from_monday() as i32 + }) } /// Extracts the day of week of a given temporal primitive array as an array of @@ -428,22 +313,9 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - let b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { - let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| { - t.num_days_from_monday() - })) - } - DataType::Timestamp(_, Some(tz)) => { - let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { - t.num_days_from_monday() - }) - } - _ => return_compute_error_with!("weekday does not support", array.data_type()), - } + time_fraction_internal(array, "num_days_from_monday", |t| { + t.num_days_from_monday() as i32 + }) } /// Extracts the day of week of a given temporal array as an array of @@ -456,26 +328,9 @@ where /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. 
pub fn num_days_from_sunday_dyn(array: &dyn Array) -> Result { - match array.data_type().clone() { - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - array => { - let values = num_days_from_sunday_dyn(array.values())?; - Ok(Arc::new(array.with_values(&values))) - } - dt => return_compute_error_with!("num_days_from_sunday does not support", dt), - ) - } - _ => { - downcast_temporal_array!( - array => { - num_days_from_sunday(array) - .map(|a| Arc::new(a) as ArrayRef) - } - dt => return_compute_error_with!("num_days_from_sunday does not support", dt), - ) - } - } + time_fraction_dyn(array, "num_days_from_sunday", |t| { + t.num_days_from_sunday() as i32 + }) } /// Extracts the day of week of a given temporal primitive array as an array of @@ -489,51 +344,16 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - let b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { - let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| { - t.num_days_from_sunday() - })) - } - DataType::Timestamp(_, Some(tz)) => { - let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { - t.num_days_from_sunday() - }) - } - _ => return_compute_error_with!( - "num_days_from_sunday does not support", - array.data_type() - ), - } + time_fraction_internal(array, "num_days_from_sunday", |t| { + t.num_days_from_sunday() as i32 + }) } /// Extracts the day of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn day_dyn(array: &dyn Array) -> Result { - match array.data_type().clone() { - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - array => { - let values = day_dyn(array.values())?; - Ok(Arc::new(array.with_values(&values))) - } - dt => return_compute_error_with!("day does not support", dt), - ) - } - _ => { - downcast_temporal_array!( - array => { - day(array) - .map(|a| Arc::new(a) as ArrayRef) - } - dt => return_compute_error_with!("day does not support", dt), - ) - } - } + time_fraction_dyn(array, "day", |t| t.day() as i32) } /// Extracts the day of a given temporal primitive array as an array of integers @@ -542,20 +362,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - let b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { - let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| t.day() as i32)) - } - DataType::Timestamp(_, Some(ref tz)) => { - let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { - t.day() as i32 - }) - } - _ => return_compute_error_with!("day does not support", array.data_type()), - } + time_fraction_internal(array, "day", |t| t.day() as i32) } /// Extracts the day of year of a given temporal array as an array of integers @@ -563,26 +370,7 @@ where /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. 
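Each of these `*_dyn` kernels is now a one-line delegation to `time_fraction_dyn`, and the matching primitive kernels delegate to `time_fraction_internal`, so the dictionary and timezone dispatch that the removed match arms duplicated lives in one place. A minimal usage sketch, assuming the usual `arrow::array` and `arrow::compute::kernels::temporal` re-exports:

    use arrow::array::{Array, Date32Array, Int32Array};
    use arrow::compute::kernels::temporal::{day_dyn, month_dyn};

    // Date32 stores days since the Unix epoch: 0 = 1970-01-01, 31 = 1970-02-01.
    let dates = Date32Array::from(vec![Some(0), Some(31), None]);

    let days = day_dyn(&dates).unwrap();
    let days = days.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(days.value(0), 1); // 1970-01-01 -> day-of-month 1
    assert_eq!(days.value(1), 1); // 1970-02-01 -> day-of-month 1
    assert!(days.is_null(2));     // nulls propagate

    let months = month_dyn(&dates).unwrap();
    let months = months.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(months.value(0), 1); // January
    assert_eq!(months.value(1), 2); // February

For dictionary inputs the kernels are still expected to return a dictionary whose values are re-encoded, as the removed per-kernel arms did; that path now sits inside `time_fraction_dyn` and is not shown in these hunks.
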
pub fn doy_dyn(array: &dyn Array) -> Result { - match array.data_type().clone() { - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - array => { - let values = doy_dyn(array.values())?; - Ok(Arc::new(array.with_values(&values))) - } - dt => return_compute_error_with!("doy does not support", dt), - ) - } - _ => { - downcast_temporal_array!( - array => { - doy(array) - .map(|a| Arc::new(a) as ArrayRef) - } - dt => return_compute_error_with!("doy does not support", dt), - ) - } - } + time_fraction_dyn(array, "doy", |t| t.ordinal() as i32) } /// Extracts the day of year of a given temporal primitive array as an array of integers @@ -593,22 +381,7 @@ where T::Native: ArrowNativeType, i64: From, { - let b = Int32Builder::with_capacity(array.len()); - match array.data_type() { - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { - let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| { - t.ordinal() as i32 - })) - } - DataType::Timestamp(_, Some(ref tz)) => { - let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { - t.ordinal() as i32 - }) - } - _ => return_compute_error_with!("doy does not support", array.data_type()), - } + time_fraction_internal(array, "doy", |t| t.ordinal() as i32) } /// Extracts the minutes of a given temporal primitive array as an array of integers @@ -624,26 +397,7 @@ where /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn week_dyn(array: &dyn Array) -> Result { - match array.data_type().clone() { - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - array => { - let values = week_dyn(array.values())?; - Ok(Arc::new(array.with_values(&values))) - } - dt => return_compute_error_with!("week does not support", dt), - ) - } - _ => { - downcast_temporal_array!( - array => { - week(array) - .map(|a| Arc::new(a) as ArrayRef) - } - dt => return_compute_error_with!("week does not support", dt), - ) - } - } + time_fraction_dyn(array, "week", |t| t.iso_week().week() as i32) } /// Extracts the week of a given temporal primitive array as an array of integers @@ -652,16 +406,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - match array.data_type() { - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { - let b = Int32Builder::with_capacity(array.len()); - let iter = ArrayIter::new(array); - Ok(as_datetime_with_op::<_, T, _>(iter, b, |t| { - t.iso_week().week() as i32 - })) - } - _ => return_compute_error_with!("week does not support", array.data_type()), - } + time_fraction_internal(array, "week", |t| t.iso_week().week() as i32) } /// Extracts the seconds of a given temporal primitive array as an array of integers @@ -729,7 +474,7 @@ where { let b = Int32Builder::with_capacity(array.len()); match array.data_type() { - DataType::Date64 | DataType::Timestamp(_, None) => { + DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, None) => { let iter = ArrayIter::new(array); Ok(as_datetime_with_op::<_, T, _>(iter, b, op)) } From 19f372d82315e0df29c9b545bb011b0cbd45ceda Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Tue, 15 Nov 2022 05:27:28 +0800 Subject: [PATCH 0273/1411] cast: unsigned numeric type with decimal (#3106) --- arrow-cast/src/cast.rs | 169 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 160 insertions(+), 9 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index b3c0aaa82031..8504a8167b38 100644 --- a/arrow-cast/src/cast.rs 
+++ b/arrow-cast/src/cast.rs @@ -71,18 +71,22 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } match (from_type, to_type) { - // TODO UTF8/unsigned numeric to decimal + // TODO UTF8 to decimal // cast one decimal type to another decimal type (Decimal128(_, _), Decimal128(_, _)) => true, (Decimal256(_, _), Decimal256(_, _)) => true, (Decimal128(_, _), Decimal256(_, _)) => true, (Decimal256(_, _), Decimal128(_, _)) => true, + // unsigned integer to decimal + (UInt8 | UInt16 | UInt32 | UInt64, Decimal128(_, _)) | // signed numeric to decimal (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal128(_, _)) | (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal256(_, _)) | + // decimal to unsigned numeric + (Decimal128(_, _), UInt8 | UInt16 | UInt32 | UInt64) | // decimal to signed numeric (Decimal128(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) | - (Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64 ) + (Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64) | ( Null, Boolean @@ -633,6 +637,30 @@ pub fn cast_with_options( (Decimal128(_, scale), _) => { // cast decimal to other type match to_type { + UInt8 => cast_decimal_to_integer::( + array, + 10_i128, + *scale, + cast_options, + ), + UInt16 => cast_decimal_to_integer::( + array, + 10_i128, + *scale, + cast_options, + ), + UInt32 => cast_decimal_to_integer::( + array, + 10_i128, + *scale, + cast_options, + ), + UInt64 => cast_decimal_to_integer::( + array, + 10_i128, + *scale, + cast_options, + ), Int8 => cast_decimal_to_integer::( array, 10_i128, @@ -707,7 +735,34 @@ pub fn cast_with_options( (_, Decimal128(precision, scale)) => { // cast data to decimal match from_type { - // TODO now just support signed numeric to decimal, support decimal to numeric later + UInt8 => cast_integer_to_decimal::<_, Decimal128Type, _>( + as_primitive_array::(array), + *precision, + *scale, + 10_i128, + cast_options, + ), + UInt16 => cast_integer_to_decimal::<_, Decimal128Type, _>( + as_primitive_array::(array), + *precision, + *scale, + 10_i128, + cast_options, + ), + UInt32 => cast_integer_to_decimal::<_, Decimal128Type, _>( + as_primitive_array::(array), + *precision, + *scale, + 10_i128, + cast_options, + ), + UInt64 => cast_integer_to_decimal::<_, Decimal128Type, _>( + as_primitive_array::(array), + *precision, + *scale, + 10_i128, + cast_options, + ), Int8 => cast_integer_to_decimal::<_, Decimal128Type, _>( as_primitive_array::(array), *precision, @@ -2113,7 +2168,7 @@ where _ => { return Err(ArrowError::ComputeError( "Unable to read value as datetime".to_string(), - )) + )); } }, None => builder.append_null(), @@ -3379,13 +3434,38 @@ mod tests { #[test] fn test_cast_decimal_to_numeric() { - let decimal_type = DataType::Decimal128(38, 2); - // negative test - assert!(!can_cast_types(&decimal_type, &DataType::UInt8)); let value_array: Vec> = vec![Some(125), Some(225), Some(325), None, Some(525)]; let decimal_array = create_decimal_array(value_array, 38, 2).unwrap(); let array = Arc::new(decimal_array) as ArrayRef; + // u8 + generate_cast_test_case!( + &array, + UInt8Array, + &DataType::UInt8, + vec![Some(1_u8), Some(2_u8), Some(3_u8), None, Some(5_u8)] + ); + // u16 + generate_cast_test_case!( + &array, + UInt16Array, + &DataType::UInt16, + vec![Some(1_u16), Some(2_u16), Some(3_u16), None, Some(5_u16)] + ); + // u32 + generate_cast_test_case!( + &array, + UInt32Array, + &DataType::UInt32, + vec![Some(1_u32), Some(2_u32), Some(3_u32), None, Some(5_u32)] + ); + // u64 + 
generate_cast_test_case!( + &array, + UInt64Array, + &DataType::UInt64, + vec![Some(1_u64), Some(2_u64), Some(3_u64), None, Some(5_u64)] + ); // i8 generate_cast_test_case!( &array, @@ -3429,6 +3509,22 @@ mod tests { vec![Some(1_i64), Some(2_i64), Some(3_i64), None, Some(5_i64)] ); + // overflow test: out of range of max u8 + let value_array: Vec> = vec![Some(51300)]; + let decimal_array = create_decimal_array(value_array, 38, 2).unwrap(); + let array = Arc::new(decimal_array) as ArrayRef; + let casted_array = + cast_with_options(&array, &DataType::UInt8, &CastOptions { safe: false }); + assert_eq!( + "Cast error: value of 513 is out of range UInt8".to_string(), + casted_array.unwrap_err().to_string() + ); + + let casted_array = + cast_with_options(&array, &DataType::UInt8, &CastOptions { safe: true }); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + // overflow test: out of range of max i8 let value_array: Vec> = vec![Some(24400)]; let decimal_array = create_decimal_array(value_array, 38, 2).unwrap(); @@ -3566,9 +3662,53 @@ mod tests { #[test] #[cfg(not(feature = "force_validate"))] fn test_cast_numeric_to_decimal128() { - // test negative cast type let decimal_type = DataType::Decimal128(38, 6); - assert!(!can_cast_types(&DataType::UInt64, &decimal_type)); + // u8, u16, u32, u64 + let input_datas = vec![ + Arc::new(UInt8Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // u8 + Arc::new(UInt16Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // u16 + Arc::new(UInt32Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // u32 + Arc::new(UInt64Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // u64 + ]; + + for array in input_datas { + generate_cast_test_case!( + &array, + Decimal128Array, + &decimal_type, + vec![ + Some(1000000_i128), + Some(2000000_i128), + Some(3000000_i128), + None, + Some(5000000_i128) + ] + ); + } // i8, i16, i32, i64 let input_datas = vec![ @@ -3616,6 +3756,17 @@ mod tests { ); } + // test u8 to decimal type with overflow the result type + // the 100 will be converted to 1000_i128, but it is out of range for max value in the precision 3. + let array = UInt8Array::from(vec![1, 2, 3, 4, 100]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast(&array, &DataType::Decimal128(3, 1)); + assert!(casted_array.is_ok()); + let array = casted_array.unwrap(); + let array: &Decimal128Array = as_primitive_array(&array); + let err = array.validate_decimal_precision(3); + assert_eq!("Invalid argument error: 1000 is too large to store in a Decimal128 of precision 3. Max is 999", err.unwrap_err().to_string()); + // test i8 to decimal type with overflow the result type // the 100 will be converted to 1000_i128, but it is out of range for max value in the precision 3. 
let array = Int8Array::from(vec![1, 2, 3, 4, 100]); From 81ce601bef0b4e0c40a129c98e12e3acfae58c58 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 14 Nov 2022 19:52:05 -0500 Subject: [PATCH 0274/1411] Update instructions for new crates (#3111) --- dev/release/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev/release/README.md b/dev/release/README.md index 8b7c934b20ee..61d8af55d12e 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -259,6 +259,8 @@ Rust Arrow Crates: (cd arrow-select && cargo publish) (cd arrow-cast && cargo publish) (cd arrow-ipc && cargo publish) +(cd arrow-csv && cargo publish) +(cd arrow-json && cargo publish) (cd arrow && cargo publish) (cd arrow-flight && cargo publish) (cd parquet && cargo publish) From b0b5d8b4f57049d72403a749e109829ff35a6b64 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 15 Nov 2022 14:47:27 +1300 Subject: [PATCH 0275/1411] Add PrimitiveArray::unary_opt (#3110) * Add PrimitiveArray::unary_opt * Format * Clippy --- arrow-array/src/array/primitive_array.rs | 66 ++++++++++++++++++++++++ arrow-cast/src/cast.rs | 30 +++-------- 2 files changed, 73 insertions(+), 23 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 34abfeb0a3de..7cf7de721611 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -438,6 +438,57 @@ impl PrimitiveArray { build_primitive_array(len, buffer.finish(), null_count, null_buffer) }) } + + /// Applies a unary and nullable function to all valid values in a primitive array + /// + /// This is unlike [`Self::unary`] which will apply an infallible function to all rows + /// regardless of validity, in many cases this will be significantly faster and should + /// be preferred if `op` is infallible. 
+ /// + /// Note: LLVM is currently unable to effectively vectorize fallible operations + pub fn unary_opt(&self, op: F) -> PrimitiveArray + where + O: ArrowPrimitiveType, + F: Fn(T::Native) -> Option, + { + let data = self.data(); + let len = data.len(); + let offset = data.offset(); + let null_count = data.null_count(); + let nulls = data.null_buffer().map(|x| x.as_slice()); + + let mut null_builder = BooleanBufferBuilder::new(len); + match nulls { + Some(b) => null_builder.append_packed_range(offset..offset + len, b), + None => null_builder.append_n(len, true), + } + + let mut buffer = BufferBuilder::::new(len); + buffer.append_n_zeroed(len); + let slice = buffer.as_slice_mut(); + + let mut out_null_count = null_count; + + let _ = try_for_each_valid_idx(len, offset, null_count, nulls, |idx| { + match op(unsafe { self.value_unchecked(idx) }) { + Some(v) => unsafe { *slice.get_unchecked_mut(idx) = v }, + None => { + out_null_count += 1; + null_builder.set_bit(idx, false); + } + } + Ok::<_, ()>(()) + }); + + unsafe { + build_primitive_array( + len, + buffer.finish(), + out_null_count, + Some(null_builder.finish()), + ) + } + } } #[inline] @@ -1864,6 +1915,21 @@ mod tests { assert!(!array.is_null(2)); } + #[test] + fn test_unary_opt() { + let array = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7]); + let r = array.unary_opt::<_, Int32Type>(|x| (x % 2 != 0).then_some(x)); + + let expected = + Int32Array::from(vec![Some(1), None, Some(3), None, Some(5), None, Some(7)]); + assert_eq!(r, expected); + + let r = expected.unary_opt::<_, Int32Type>(|x| (x % 3 != 0).then_some(x)); + let expected = + Int32Array::from(vec![Some(1), None, None, None, Some(5), None, Some(7)]); + assert_eq!(r, expected); + } + #[test] #[should_panic( expected = "Trying to access an element at index 4 from a PrimitiveArray of length 3" diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 8504a8167b38..d6dbf3061bbe 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -337,11 +337,8 @@ where })?; if cast_options.safe { - let iter = array - .iter() - .map(|v| v.and_then(|v| v.as_().mul_checked(mul).ok())); - let casted_array = unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }; - casted_array + array + .unary_opt::<_, D>(|v| v.as_().mul_checked(mul).ok()) .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } else { @@ -364,12 +361,8 @@ where let mul = 10_f64.powi(scale as i32); if cast_options.safe { - let iter = array - .iter() - .map(|v| v.and_then(|v| (mul * v.as_()).round().to_i128())); - let casted_array = - unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }; - casted_array + array + .unary_opt::<_, Decimal128Type>(|v| (mul * v.as_()).round().to_i128()) .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } else { @@ -407,12 +400,8 @@ where let mul = 10_f64.powi(scale as i32); if cast_options.safe { - let iter = array - .iter() - .map(|v| v.and_then(|v| i256::from_f64((v.as_() * mul).round()))); - let casted_array = - unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }; - casted_array + array + .unary_opt::<_, Decimal256Type>(|v| i256::from_f64((v.as_() * mul).round())) .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } else { @@ -2107,12 +2096,7 @@ where T::Native: NumCast, R::Native: NumCast, { - let iter = from - .iter() - .map(|v| v.and_then(num::cast::cast::)); - // Soundness: - // The iterator is trustedLen because it comes from an `PrimitiveArray`. 
- unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } + from.unary_opt::<_, R>(num::cast::cast::) } fn as_time_with_string_op< From 5c2801d08edbe1573d6c23c55b7333f0aedbdc3c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 15 Nov 2022 18:20:30 +1300 Subject: [PATCH 0276/1411] Add downcast_array (#2901) (#3117) --- arrow-array/src/cast.rs | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 4569c36812bf..02d5432c168f 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -19,6 +19,7 @@ use crate::array::*; use crate::types::*; +use arrow_data::ArrayData; /// Repeats the provided pattern based on the number of comma separated identifiers #[doc(hidden)] @@ -550,6 +551,38 @@ array_downcast_fn!(as_union_array, UnionArray); array_downcast_fn!(as_map_array, MapArray); array_downcast_fn!(as_decimal_array, Decimal128Array); +/// Downcasts a `dyn Array` to a concrete type +/// +/// ``` +/// # use arrow_array::{BooleanArray, Int32Array, RecordBatch, StringArray}; +/// # use arrow_array::cast::downcast_array; +/// struct ConcreteBatch { +/// col1: Int32Array, +/// col2: BooleanArray, +/// col3: StringArray, +/// } +/// +/// impl ConcreteBatch { +/// fn new(batch: &RecordBatch) -> Self { +/// Self { +/// col1: downcast_array(batch.column(0).as_ref()), +/// col2: downcast_array(batch.column(1).as_ref()), +/// col3: downcast_array(batch.column(2).as_ref()), +/// } +/// } +/// } +/// ``` +/// +/// # Panics +/// +/// Panics if array is not of the correct data type +pub fn downcast_array(array: &dyn Array) -> T +where + T: From, +{ + T::from(array.data().clone()) +} + #[cfg(test)] mod tests { use arrow_buffer::i256; From 7d41e1c194b24238010e1a26c4864f535a4899eb Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 15 Nov 2022 00:13:29 -0800 Subject: [PATCH 0277/1411] Check overflow while casting between decimal types (#3076) --- arrow-cast/src/cast.rs | 439 +++++++++++++++++++++++++++++++++-------- 1 file changed, 359 insertions(+), 80 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index d6dbf3061bbe..79c23bfac897 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -612,16 +612,16 @@ pub fn cast_with_options( } match (from_type, to_type) { (Decimal128(_, s1), Decimal128(p2, s2)) => { - cast_decimal_to_decimal::<16, 16>(array, s1, p2, s2) + cast_decimal_to_decimal_with_option::<16, 16>(array, s1, p2, s2, cast_options) } (Decimal256(_, s1), Decimal256(p2, s2)) => { - cast_decimal_to_decimal::<32, 32>(array, s1, p2, s2) + cast_decimal_to_decimal_with_option::<32, 32>(array, s1, p2, s2, cast_options) } (Decimal128(_, s1), Decimal256(p2, s2)) => { - cast_decimal_to_decimal::<16, 32>(array, s1, p2, s2) + cast_decimal_to_decimal_with_option::<16, 32>(array, s1, p2, s2, cast_options) } (Decimal256(_, s1), Decimal128(p2, s2)) => { - cast_decimal_to_decimal::<32, 16>(array, s1, p2, s2) + cast_decimal_to_decimal_with_option::<32, 16>(array, s1, p2, s2, cast_options) } (Decimal128(_, scale), _) => { // cast decimal to other type @@ -1916,7 +1916,36 @@ const fn time_unit_multiple(unit: &TimeUnit) -> i64 { } /// Cast one type of decimal array to another type of decimal array -fn cast_decimal_to_decimal( +fn cast_decimal_to_decimal_with_option< + const BYTE_WIDTH1: usize, + const BYTE_WIDTH2: usize, +>( + array: &ArrayRef, + input_scale: &u8, + output_precision: &u8, + output_scale: &u8, + cast_options: 
&CastOptions, +) -> Result { + if cast_options.safe { + cast_decimal_to_decimal_safe::( + array, + input_scale, + output_precision, + output_scale, + ) + } else { + cast_decimal_to_decimal::( + array, + input_scale, + output_precision, + output_scale, + ) + } +} + +/// Cast one type of decimal array to another type of decimal array. Returning NULLs for +/// the array values when cast failures happen. +fn cast_decimal_to_decimal_safe( array: &ArrayRef, input_scale: &u8, output_precision: &u8, @@ -1928,54 +1957,50 @@ fn cast_decimal_to_decimal( let div = 10_i128.pow((input_scale - output_scale) as u32); if BYTE_WIDTH1 == 16 { let array = array.as_any().downcast_ref::().unwrap(); - let iter = array.iter().map(|v| v.map(|v| v.wrapping_div(div))); if BYTE_WIDTH2 == 16 { - let output_array = iter - .collect::() - .with_precision_and_scale(*output_precision, *output_scale)?; - - Ok(Arc::new(output_array)) + let iter = array + .iter() + .map(|v| v.and_then(|v| v.div_checked(div).ok())); + let casted_array = unsafe { + PrimitiveArray::::from_trusted_len_iter(iter) + }; + casted_array + .with_precision_and_scale(*output_precision, *output_scale) + .map(|a| Arc::new(a) as ArrayRef) } else { - let output_array = iter - .map(|v| v.map(i256::from_i128)) - .collect::() - .with_precision_and_scale(*output_precision, *output_scale)?; - - Ok(Arc::new(output_array)) + let iter = array.iter().map(|v| { + v.and_then(|v| v.div_checked(div).ok().map(i256::from_i128)) + }); + let casted_array = unsafe { + PrimitiveArray::::from_trusted_len_iter(iter) + }; + casted_array + .with_precision_and_scale(*output_precision, *output_scale) + .map(|a| Arc::new(a) as ArrayRef) } } else { let array = array.as_any().downcast_ref::().unwrap(); let div = i256::from_i128(div); - let iter = array.iter().map(|v| v.map(|v| v.wrapping_div(div))); if BYTE_WIDTH2 == 16 { - let values = iter - .map(|v| { - if v.is_none() { - Ok(None) - } else { - v.as_ref().and_then(|v| v.to_i128()) - .ok_or_else(|| { - ArrowError::InvalidArgumentError( - format!("{:?} cannot be casted to 128-bit integer for Decimal128", v), - ) - }) - .map(Some) - } - }) - .collect::, _>>()?; - - let output_array = values - .into_iter() - .collect::() - .with_precision_and_scale(*output_precision, *output_scale)?; - - Ok(Arc::new(output_array)) + let iter = array.iter().map(|v| { + v.and_then(|v| v.div_checked(div).ok().and_then(|v| v.to_i128())) + }); + let casted_array = unsafe { + PrimitiveArray::::from_trusted_len_iter(iter) + }; + casted_array + .with_precision_and_scale(*output_precision, *output_scale) + .map(|a| Arc::new(a) as ArrayRef) } else { - let output_array = iter - .collect::() - .with_precision_and_scale(*output_precision, *output_scale)?; - - Ok(Arc::new(output_array)) + let iter = array + .iter() + .map(|v| v.and_then(|v| v.div_checked(div).ok())); + let casted_array = unsafe { + PrimitiveArray::::from_trusted_len_iter(iter) + }; + casted_array + .with_precision_and_scale(*output_precision, *output_scale) + .map(|a| Arc::new(a) as ArrayRef) } } } else { @@ -1984,54 +2009,278 @@ fn cast_decimal_to_decimal( let mul = 10_i128.pow((output_scale - input_scale) as u32); if BYTE_WIDTH1 == 16 { let array = array.as_any().downcast_ref::().unwrap(); - let iter = array.iter().map(|v| v.map(|v| v.wrapping_mul(mul))); if BYTE_WIDTH2 == 16 { - let output_array = iter - .collect::() - .with_precision_and_scale(*output_precision, *output_scale)?; + let iter = array + .iter() + .map(|v| v.and_then(|v| v.mul_checked(mul).ok())); + let casted_array = unsafe { + 
PrimitiveArray::::from_trusted_len_iter(iter) + }; + casted_array + .with_precision_and_scale(*output_precision, *output_scale) + .map(|a| Arc::new(a) as ArrayRef) + } else { + let iter = array.iter().map(|v| { + v.and_then(|v| v.mul_checked(mul).ok().map(i256::from_i128)) + }); + let casted_array = unsafe { + PrimitiveArray::::from_trusted_len_iter(iter) + }; + casted_array + .with_precision_and_scale(*output_precision, *output_scale) + .map(|a| Arc::new(a) as ArrayRef) + } + } else { + let array = array.as_any().downcast_ref::().unwrap(); + let mul = i256::from_i128(mul); + if BYTE_WIDTH2 == 16 { + let iter = array.iter().map(|v| { + v.and_then(|v| v.mul_checked(mul).ok().and_then(|v| v.to_i128())) + }); + let casted_array = unsafe { + PrimitiveArray::::from_trusted_len_iter(iter) + }; + casted_array + .with_precision_and_scale(*output_precision, *output_scale) + .map(|a| Arc::new(a) as ArrayRef) + } else { + let iter = array + .iter() + .map(|v| v.and_then(|v| v.mul_checked(mul).ok())); + let casted_array = unsafe { + PrimitiveArray::::from_trusted_len_iter(iter) + }; + casted_array + .with_precision_and_scale(*output_precision, *output_scale) + .map(|a| Arc::new(a) as ArrayRef) + } + } + } +} + +/// Cast one type of decimal array to another type of decimal array. Returning `Err` if +/// cast failure happens. +fn cast_decimal_to_decimal( + array: &ArrayRef, + input_scale: &u8, + output_precision: &u8, + output_scale: &u8, +) -> Result { + if input_scale > output_scale { + // For example, input_scale is 4 and output_scale is 3; + // Original value is 11234_i128, and will be cast to 1123_i128. + let array = array.as_any().downcast_ref::().unwrap(); + if BYTE_WIDTH1 == 16 { + if BYTE_WIDTH2 == 16 { + let div = 10_i128 + .pow_checked((input_scale - output_scale) as u32) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast. The scale {} causes overflow.", + *output_scale, + )) + })?; - Ok(Arc::new(output_array)) + array + .try_unary::<_, Decimal128Type, _>(|v| { + v.checked_div(div).ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {:?}({}, {}). Overflowing on {:?}", + Decimal128Type::PREFIX, + *output_precision, + *output_scale, + v + )) + }) + }) + .and_then(|a| { + a.with_precision_and_scale(*output_precision, *output_scale) + }) + .map(|a| Arc::new(a) as ArrayRef) } else { - let output_array = iter - .map(|v| v.map(i256::from_i128)) - .collect::() - .with_precision_and_scale(*output_precision, *output_scale)?; + let div = i256::from_i128(10_i128) + .pow_checked((input_scale - output_scale) as u32) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast. The scale {} causes overflow.", + *output_scale, + )) + })?; - Ok(Arc::new(output_array)) + array + .try_unary::<_, Decimal256Type, _>(|v| { + i256::from_i128(v).checked_div(div).ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {:?}({}, {}). Overflowing on {:?}", + Decimal256Type::PREFIX, + *output_precision, + *output_scale, + v + )) + }) + }) + .and_then(|a| { + a.with_precision_and_scale(*output_precision, *output_scale) + }) + .map(|a| Arc::new(a) as ArrayRef) } } else { let array = array.as_any().downcast_ref::().unwrap(); - let mul = i256::from_i128(mul); - let iter = array.iter().map(|v| v.map(|v| v.wrapping_mul(mul))); + let div = i256::from_i128(10_i128) + .pow_checked((input_scale - output_scale) as u32) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast. 
The scale {} causes overflow.", + *output_scale, + )) + })?; if BYTE_WIDTH2 == 16 { - let values = iter - .map(|v| { - if v.is_none() { - Ok(None) - } else { - v.as_ref().and_then(|v| v.to_i128()) - .ok_or_else(|| { - ArrowError::InvalidArgumentError( - format!("{:?} cannot be casted to 128-bit integer for Decimal128", v), - ) - }) - .map(Some) - } + array + .try_unary::<_, Decimal128Type, _>(|v| { + v.checked_div(div).ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {:?}({}, {}). Overflowing on {:?}", + Decimal128Type::PREFIX, + *output_precision, + *output_scale, + v + )) + }).and_then(|v| v.to_i128().ok_or_else(|| { + ArrowError::InvalidArgumentError( + format!("{:?} cannot be casted to 128-bit integer for Decimal128", v), + ) + })) + }) + .and_then(|a| { + a.with_precision_and_scale(*output_precision, *output_scale) }) - .collect::, _>>()?; + .map(|a| Arc::new(a) as ArrayRef) + } else { + array + .try_unary::<_, Decimal256Type, _>(|v| { + v.checked_div(div).ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {:?}({}, {}). Overflowing on {:?}", + Decimal256Type::PREFIX, + *output_precision, + *output_scale, + v + )) + }) + }) + .and_then(|a| { + a.with_precision_and_scale(*output_precision, *output_scale) + }) + .map(|a| Arc::new(a) as ArrayRef) + } + } + } else { + // For example, input_scale is 3 and output_scale is 4; + // Original value is 1123_i128, and will be cast to 11230_i128. + if BYTE_WIDTH1 == 16 { + let array = array.as_any().downcast_ref::().unwrap(); - let output_array = values - .into_iter() - .collect::() - .with_precision_and_scale(*output_precision, *output_scale)?; + if BYTE_WIDTH2 == 16 { + let mul = 10_i128 + .pow_checked((output_scale - input_scale) as u32) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast. The scale {} causes overflow.", + *output_scale, + )) + })?; - Ok(Arc::new(output_array)) + array + .try_unary::<_, Decimal128Type, _>(|v| { + v.checked_mul(mul).ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {:?}({}, {}). Overflowing on {:?}", + Decimal128Type::PREFIX, + *output_precision, + *output_scale, + v + )) + }) + }) + .and_then(|a| { + a.with_precision_and_scale(*output_precision, *output_scale) + }) + .map(|a| Arc::new(a) as ArrayRef) } else { - let output_array = iter - .collect::() - .with_precision_and_scale(*output_precision, *output_scale)?; + let mul = i256::from_i128(10_i128) + .pow_checked((output_scale - input_scale) as u32) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast. The scale {} causes overflow.", + *output_scale, + )) + })?; - Ok(Arc::new(output_array)) + array + .try_unary::<_, Decimal256Type, _>(|v| { + i256::from_i128(v).checked_mul(mul).ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {:?}({}, {}). Overflowing on {:?}", + Decimal256Type::PREFIX, + *output_precision, + *output_scale, + v + )) + }) + }) + .and_then(|a| { + a.with_precision_and_scale(*output_precision, *output_scale) + }) + .map(|a| Arc::new(a) as ArrayRef) + } + } else { + let array = array.as_any().downcast_ref::().unwrap(); + let mul = i256::from_i128(10_i128) + .pow_checked((output_scale - input_scale) as u32) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast. The scale {} causes overflow.", + *output_scale, + )) + })?; + if BYTE_WIDTH2 == 16 { + array + .try_unary::<_, Decimal128Type, _>(|v| { + v.checked_mul(mul).ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {:?}({}, {}). 
Overflowing on {:?}", + Decimal128Type::PREFIX, + *output_precision, + *output_scale, + v + )) + }).and_then(|v| v.to_i128().ok_or_else(|| { + ArrowError::InvalidArgumentError( + format!("{:?} cannot be casted to 128-bit integer for Decimal128", v), + ) + })) + }) + .and_then(|a| { + a.with_precision_and_scale(*output_precision, *output_scale) + }) + .map(|a| Arc::new(a) as ArrayRef) + } else { + array + .try_unary::<_, Decimal256Type, _>(|v| { + v.checked_mul(mul).ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {:?}({}, {}). Overflowing on {:?}", + Decimal256Type::PREFIX, + *output_precision, + *output_scale, + v + )) + }) + }) + .and_then(|a| { + a.with_precision_and_scale(*output_precision, *output_scale) + }) + .map(|a| Arc::new(a) as ArrayRef) } } } @@ -3343,6 +3592,36 @@ mod tests { err.unwrap_err().to_string()); } + #[test] + fn test_cast_decimal128_to_decimal128_overflow() { + let input_type = DataType::Decimal128(38, 3); + let output_type = DataType::Decimal128(38, 38); + assert!(can_cast_types(&input_type, &output_type)); + + let array = vec![Some(i128::MAX)]; + let input_decimal_array = create_decimal_array(array, 38, 3).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + let result = + cast_with_options(&array, &output_type, &CastOptions { safe: false }); + assert_eq!("Cast error: Cannot cast to \"Decimal128\"(38, 38). Overflowing on 170141183460469231731687303715884105727", + result.unwrap_err().to_string()); + } + + #[test] + fn test_cast_decimal128_to_decimal256_overflow() { + let input_type = DataType::Decimal128(38, 3); + let output_type = DataType::Decimal256(76, 76); + assert!(can_cast_types(&input_type, &output_type)); + + let array = vec![Some(i128::MAX)]; + let input_decimal_array = create_decimal_array(array, 38, 3).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + let result = + cast_with_options(&array, &output_type, &CastOptions { safe: false }); + assert_eq!("Cast error: Cannot cast to \"Decimal256\"(76, 76). 
Overflowing on 170141183460469231731687303715884105727", + result.unwrap_err().to_string()); + } + #[test] fn test_cast_decimal128_to_decimal256() { let input_type = DataType::Decimal128(20, 3); From 8bb2917ee7c22c71cd71368cbe4dec4335e7d8f5 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Tue, 15 Nov 2022 14:33:10 -0500 Subject: [PATCH 0278/1411] Remove Option from `Field::metadata` (#3091) * Remove Option from field metadata * fix test issues * fix clippy warnings * fix test issues * fix test issues * use default for BTreeMap initialization * empty commit Co-authored-by: askoa --- arrow-array/src/builder/struct_builder.rs | 2 +- arrow-integration-test/src/field.rs | 10 +- arrow-integration-test/src/lib.rs | 171 +++++++++++----------- arrow-ipc/src/convert.rs | 30 ++-- arrow-schema/src/datatype.rs | 6 +- arrow-schema/src/field.rs | 75 +++++----- arrow-schema/src/schema.rs | 62 ++++---- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/arrow/schema/complex.rs | 10 +- 9 files changed, 176 insertions(+), 192 deletions(-) diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 69c092c0368d..1cb04aa6f786 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -396,7 +396,7 @@ mod tests { #[test] #[should_panic( - expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }) is not currently supported" + expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) is not currently supported" )] fn test_struct_array_builder_from_schema_unsupported_type() { let mut fields = vec![Field::new("f1", DataType::Int16, false)]; diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index 9b1a8f5f9ba6..5b5863557098 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -53,7 +53,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz let metadata = match map.get("metadata") { Some(&Value::Array(ref values)) => { - let mut res: BTreeMap = BTreeMap::new(); + let mut res: BTreeMap = BTreeMap::default(); for value in values { match value.as_object() { Some(map) => { @@ -87,12 +87,12 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { } } } - Some(res) + res } // We also support map format, because Schema's metadata supports this. 
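+        // (Illustrative example only: the map form handled below looks like
+        // `"metadata": {"k": "v"}`, whereas the array form handled above carries
+        // one small JSON object per entry.)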
// See https://github.com/apache/arrow/pull/5907 Some(&Value::Object(ref values)) => { - let mut res: BTreeMap = BTreeMap::new(); + let mut res: BTreeMap = BTreeMap::default(); for (k, v) in values { if let Some(str_value) = v.as_str() { res.insert(k.clone(), str_value.to_string().clone()); @@ -103,14 +103,14 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { ))); } } - Some(res) + res } Some(_) => { return Err(ArrowError::ParseError( "Field `metadata` is not json array".to_string(), )); } - _ => None, + _ => BTreeMap::default(), }; // if data_type is a struct or list, get its children diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index d0db4b4b9ec1..75b76af1e6fc 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -83,10 +83,10 @@ pub struct ArrowJsonField { impl From<&Field> for ArrowJsonField { fn from(field: &Field) -> Self { - let metadata_value = match field.metadata() { - Some(kv_list) => { + let metadata_value = match field.metadata().is_empty() { + false => { let mut array = Vec::new(); - for (k, v) in kv_list { + for (k, v) in field.metadata() { let mut kv_map = SJMap::new(); kv_map.insert(k.clone(), Value::String(v.clone())); array.push(Value::Object(kv_map)); @@ -1120,90 +1120,87 @@ mod tests { let micros_tz = Some("UTC".to_string()); let nanos_tz = Some("Africa/Johannesburg".to_string()); - let schema = - Schema::new(vec![ - Field::new("bools-with-metadata-map", DataType::Boolean, true) - .with_metadata(Some( - [("k".to_string(), "v".to_string())] - .iter() - .cloned() - .collect(), - )), - Field::new("bools-with-metadata-vec", DataType::Boolean, true) - .with_metadata(Some( - [("k2".to_string(), "v2".to_string())] - .iter() - .cloned() - .collect(), - )), - Field::new("bools", DataType::Boolean, true), - Field::new("int8s", DataType::Int8, true), - Field::new("int16s", DataType::Int16, true), - Field::new("int32s", DataType::Int32, true), - Field::new("int64s", DataType::Int64, true), - Field::new("uint8s", DataType::UInt8, true), - Field::new("uint16s", DataType::UInt16, true), - Field::new("uint32s", DataType::UInt32, true), - Field::new("uint64s", DataType::UInt64, true), - Field::new("float32s", DataType::Float32, true), - Field::new("float64s", DataType::Float64, true), - Field::new("date_days", DataType::Date32, true), - Field::new("date_millis", DataType::Date64, true), - Field::new("time_secs", DataType::Time32(TimeUnit::Second), true), - Field::new("time_millis", DataType::Time32(TimeUnit::Millisecond), true), - Field::new("time_micros", DataType::Time64(TimeUnit::Microsecond), true), - Field::new("time_nanos", DataType::Time64(TimeUnit::Nanosecond), true), - Field::new("ts_secs", DataType::Timestamp(TimeUnit::Second, None), true), - Field::new( - "ts_millis", - DataType::Timestamp(TimeUnit::Millisecond, None), - true, - ), - Field::new( - "ts_micros", - DataType::Timestamp(TimeUnit::Microsecond, None), - true, - ), - Field::new( - "ts_nanos", - DataType::Timestamp(TimeUnit::Nanosecond, None), - true, - ), - Field::new( - "ts_secs_tz", - DataType::Timestamp(TimeUnit::Second, secs_tz.clone()), - true, - ), - Field::new( - "ts_millis_tz", - DataType::Timestamp(TimeUnit::Millisecond, millis_tz.clone()), - true, - ), - Field::new( - "ts_micros_tz", - DataType::Timestamp(TimeUnit::Microsecond, micros_tz.clone()), - true, - ), - Field::new( - "ts_nanos_tz", - DataType::Timestamp(TimeUnit::Nanosecond, nanos_tz.clone()), - true, - ), - Field::new("utf8s", DataType::Utf8, true), - 
Field::new( - "lists", - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), - true, - ), - Field::new( - "structs", - DataType::Struct(vec![ - Field::new("int32s", DataType::Int32, true), - Field::new("utf8s", DataType::Utf8, true), - ]), - true, - ), - ]); + let schema = Schema::new(vec![ + Field::new("bools-with-metadata-map", DataType::Boolean, true).with_metadata( + [("k".to_string(), "v".to_string())] + .iter() + .cloned() + .collect(), + ), + Field::new("bools-with-metadata-vec", DataType::Boolean, true).with_metadata( + [("k2".to_string(), "v2".to_string())] + .iter() + .cloned() + .collect(), + ), + Field::new("bools", DataType::Boolean, true), + Field::new("int8s", DataType::Int8, true), + Field::new("int16s", DataType::Int16, true), + Field::new("int32s", DataType::Int32, true), + Field::new("int64s", DataType::Int64, true), + Field::new("uint8s", DataType::UInt8, true), + Field::new("uint16s", DataType::UInt16, true), + Field::new("uint32s", DataType::UInt32, true), + Field::new("uint64s", DataType::UInt64, true), + Field::new("float32s", DataType::Float32, true), + Field::new("float64s", DataType::Float64, true), + Field::new("date_days", DataType::Date32, true), + Field::new("date_millis", DataType::Date64, true), + Field::new("time_secs", DataType::Time32(TimeUnit::Second), true), + Field::new("time_millis", DataType::Time32(TimeUnit::Millisecond), true), + Field::new("time_micros", DataType::Time64(TimeUnit::Microsecond), true), + Field::new("time_nanos", DataType::Time64(TimeUnit::Nanosecond), true), + Field::new("ts_secs", DataType::Timestamp(TimeUnit::Second, None), true), + Field::new( + "ts_millis", + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + ), + Field::new( + "ts_micros", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + ), + Field::new( + "ts_nanos", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ), + Field::new( + "ts_secs_tz", + DataType::Timestamp(TimeUnit::Second, secs_tz.clone()), + true, + ), + Field::new( + "ts_millis_tz", + DataType::Timestamp(TimeUnit::Millisecond, millis_tz.clone()), + true, + ), + Field::new( + "ts_micros_tz", + DataType::Timestamp(TimeUnit::Microsecond, micros_tz.clone()), + true, + ), + Field::new( + "ts_nanos_tz", + DataType::Timestamp(TimeUnit::Nanosecond, nanos_tz.clone()), + true, + ), + Field::new("utf8s", DataType::Utf8, true), + Field::new( + "lists", + DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + true, + ), + Field::new( + "structs", + DataType::Struct(vec![ + Field::new("int32s", DataType::Int32, true), + Field::new("utf8s", DataType::Utf8, true), + ]), + true, + ), + ]); let bools_with_metadata_map = BooleanArray::from(vec![Some(true), None, Some(false)]); diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 8d01c58b6ae3..a9dda6f2a1f1 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -86,18 +86,16 @@ impl<'a> From> for Field { ) }; - let mut metadata = None; + let mut metadata_map = BTreeMap::default(); if let Some(list) = field.custom_metadata() { - let mut metadata_map = BTreeMap::default(); for kv in list { if let (Some(k), Some(v)) = (kv.key(), kv.value()) { metadata_map.insert(k.to_string(), v.to_string()); } } - metadata = Some(metadata_map); } - arrow_field.with_metadata(metadata) + arrow_field.with_metadata(metadata_map) } } @@ -424,19 +422,17 @@ pub(crate) fn build_field<'a>( ) -> WIPOffset> { // Optional custom metadata. 
let mut fb_metadata = None; - if let Some(metadata) = field.metadata() { - if !metadata.is_empty() { - let mut kv_vec = vec![]; - for (k, v) in metadata { - let kv_args = crate::KeyValueArgs { - key: Some(fbb.create_string(k.as_str())), - value: Some(fbb.create_string(v.as_str())), - }; - let kv_offset = crate::KeyValue::create(fbb, &kv_args); - kv_vec.push(kv_offset); - } - fb_metadata = Some(fbb.create_vector(&kv_vec)); + if !field.metadata().is_empty() { + let mut kv_vec = vec![]; + for (k, v) in field.metadata() { + let kv_args = crate::KeyValueArgs { + key: Some(fbb.create_string(k.as_str())), + value: Some(fbb.create_string(v.as_str())), + }; + let kv_offset = crate::KeyValue::create(fbb, &kv_args); + kv_vec.push(kv_offset); } + fb_metadata = Some(fbb.create_vector(&kv_vec)); }; let fb_field_name = fbb.create_string(field.name().as_str()); @@ -822,7 +818,7 @@ mod tests { .collect(); let schema = Schema::new_with_metadata( vec![ - Field::new("uint8", DataType::UInt8, false).with_metadata(Some(field_md)), + Field::new("uint8", DataType::UInt8, false).with_metadata(field_md), Field::new("uint16", DataType::UInt16, true), Field::new("uint32", DataType::UInt32, false), Field::new("uint64", DataType::UInt64, true), diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 759fc39646c1..90ae429422c6 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -387,12 +387,12 @@ mod tests { let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); // Non-empty map: should be converted as JSON obj { ... } - let first_name = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(field_metadata)); + let first_name = + Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata); // Empty map: should be omitted. let last_name = Field::new("last_name", DataType::Utf8, false) - .with_metadata(Some(BTreeMap::default())); + .with_metadata(BTreeMap::default()); let person = DataType::Struct(vec![ first_name, diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 4d13f523fb96..ee6ece862da5 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -35,8 +35,11 @@ pub struct Field { dict_id: i64, dict_is_ordered: bool, /// A map of key-value pairs containing additional custom meta data. - #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))] - metadata: Option>, + #[cfg_attr( + feature = "serde", + serde(skip_serializing_if = "BTreeMap::is_empty", default) + )] + metadata: BTreeMap, } // Auto-derive `PartialEq` traits will pull `dict_id` and `dict_is_ordered` @@ -89,7 +92,7 @@ impl Field { nullable, dict_id: 0, dict_is_ordered: false, - metadata: None, + metadata: BTreeMap::default(), } } @@ -107,33 +110,30 @@ impl Field { nullable, dict_id, dict_is_ordered, - metadata: None, + metadata: BTreeMap::default(), } } /// Sets the `Field`'s optional custom metadata. /// The metadata is set as `None` for empty map. #[inline] - pub fn set_metadata(&mut self, metadata: Option>) { - // To make serde happy, convert Some(empty_map) to None. 
- self.metadata = None; - if let Some(v) = metadata { - if !v.is_empty() { - self.metadata = Some(v); - } + pub fn set_metadata(&mut self, metadata: BTreeMap) { + self.metadata = BTreeMap::default(); + if !metadata.is_empty() { + self.metadata = metadata; } } /// Sets the metadata of this `Field` to be `metadata` and returns self - pub fn with_metadata(mut self, metadata: Option>) -> Self { + pub fn with_metadata(mut self, metadata: BTreeMap) -> Self { self.set_metadata(metadata); self } /// Returns the immutable reference to the `Field`'s optional custom metadata. #[inline] - pub const fn metadata(&self) -> Option<&BTreeMap> { - self.metadata.as_ref() + pub const fn metadata(&self) -> &BTreeMap { + &self.metadata } /// Returns an immutable reference to the `Field`'s name. @@ -278,11 +278,11 @@ impl Field { ))); } // merge metadata - match (self.metadata(), from.metadata()) { - (Some(self_metadata), Some(from_metadata)) => { - let mut merged = self_metadata.clone(); - for (key, from_value) in from_metadata { - if let Some(self_value) = self_metadata.get(key) { + match (self.metadata().is_empty(), from.metadata().is_empty()) { + (false, false) => { + let mut merged = self.metadata().clone(); + for (key, from_value) in from.metadata() { + if let Some(self_value) = self.metadata.get(key) { if self_value != from_value { return Err(ArrowError::SchemaError(format!( "Fail to merge field due to conflicting metadata data value for key {}. @@ -293,10 +293,10 @@ impl Field { merged.insert(key.clone(), from_value.clone()); } } - self.set_metadata(Some(merged)); + self.set_metadata(merged); } - (None, Some(from_metadata)) => { - self.set_metadata(Some(from_metadata.clone())); + (true, false) => { + self.set_metadata(from.metadata().clone()); } _ => {} } @@ -415,12 +415,12 @@ impl Field { // self need to be nullable or both of them are not nullable && (self.nullable || !other.nullable) // make sure self.metadata is a superset of other.metadata - && match (&self.metadata, &other.metadata) { - (_, None) => true, - (None, Some(_)) => false, - (Some(self_meta), Some(other_meta)) => { - other_meta.iter().all(|(k, v)| { - match self_meta.get(k) { + && match (&self.metadata.is_empty(), &other.metadata.is_empty()) { + (_, true) => true, + (true, false) => false, + (false, false) => { + other.metadata().iter().all(|(k, v)| { + match self.metadata().get(k) { Some(s) => s == v, None => false } @@ -538,10 +538,10 @@ mod test { #[test] fn test_contains_reflexivity() { let mut field = Field::new("field1", DataType::Float16, false); - field.set_metadata(Some(BTreeMap::from([ + field.set_metadata(BTreeMap::from([ (String::from("k0"), String::from("v0")), (String::from("k1"), String::from("v1")), - ]))); + ])); assert!(field.contains(&field)) } @@ -550,23 +550,14 @@ mod test { let child_field = Field::new("child1", DataType::Float16, false); let mut field1 = Field::new("field1", DataType::Struct(vec![child_field]), false); - field1.set_metadata(Some(BTreeMap::from([( - String::from("k1"), - String::from("v1"), - )]))); + field1.set_metadata(BTreeMap::from([(String::from("k1"), String::from("v1"))])); let mut field2 = Field::new("field1", DataType::Struct(vec![]), true); - field2.set_metadata(Some(BTreeMap::from([( - String::from("k2"), - String::from("v2"), - )]))); + field2.set_metadata(BTreeMap::from([(String::from("k2"), String::from("v2"))])); field2.try_merge(&field1).unwrap(); let mut field3 = Field::new("field1", DataType::Struct(vec![]), false); - field3.set_metadata(Some(BTreeMap::from([( - String::from("k3"), 
- String::from("v3"), - )]))); + field3.set_metadata(BTreeMap::from([(String::from("k3"), String::from("v3"))])); field3.try_merge(&field2).unwrap(); assert!(field2.contains(&field1)); diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 60fe3c6ca9a0..519a8e089aef 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -419,12 +419,12 @@ mod tests { assert_ne!(schema2, schema4); assert_ne!(schema3, schema4); - let f = Field::new("c1", DataType::Utf8, false).with_metadata(Some( + let f = Field::new("c1", DataType::Utf8, false).with_metadata( [("foo".to_string(), "bar".to_string())] .iter() .cloned() .collect(), - )); + ); let schema5 = Schema::new(vec![ f, Field::new("c2", DataType::Float64, true), @@ -437,13 +437,13 @@ mod tests { fn create_schema_string() { let schema = person_schema(); assert_eq!(schema.to_string(), - "Field { name: \"first_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: Some({\"k\": \"v\"}) }, \ - Field { name: \"last_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ + "Field { name: \"first_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {\"k\": \"v\"} }, \ + Field { name: \"last_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ Field { name: \"address\", data_type: Struct([\ - Field { name: \"street\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ - Field { name: \"zip\", data_type: UInt16, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }\ - ]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ - Field { name: \"interests\", data_type: Dictionary(Int32, Utf8), nullable: true, dict_id: 123, dict_is_ordered: true, metadata: None }") + Field { name: \"street\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ + Field { name: \"zip\", data_type: UInt16, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }\ + ]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ + Field { name: \"interests\", data_type: Dictionary(Int32, Utf8), nullable: true, dict_id: 123, dict_is_ordered: true, metadata: {} }") } #[test] @@ -462,8 +462,8 @@ mod tests { assert_eq!(first_name.dict_is_ordered(), None); let metadata = first_name.metadata(); - assert!(metadata.is_some()); - let md = metadata.as_ref().unwrap(); + assert!(!metadata.is_empty()); + let md = &metadata; assert_eq!(md.len(), 1); let key = md.get("k"); assert!(key.is_some()); @@ -524,8 +524,8 @@ mod tests { fn person_schema() -> Schema { let kv_array = [("k".to_string(), "v".to_string())]; let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); - let first_name = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(field_metadata)); + let first_name = + Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata); Schema::new(vec![ first_name, @@ -556,16 +556,14 @@ mod tests { .iter() .cloned() .collect(); - let f1 = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(metadata1)); + let f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata1); let metadata2: BTreeMap = [("foo".to_string(), "baz".to_string())] .iter() .cloned() .collect(); - let f2 = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(metadata2)); + let f2 = Field::new("first_name", 
DataType::Utf8, false).with_metadata(metadata2); assert!( Schema::try_merge(vec![Schema::new(vec![f1]), Schema::new(vec![f2])]) @@ -579,34 +577,30 @@ mod tests { .iter() .cloned() .collect(); - let f2 = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(metadata2)); + let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata2); assert!(f1.try_merge(&f2).is_ok()); - assert!(f1.metadata().is_some()); - assert_eq!( - f1.metadata().as_ref().unwrap(), - f2.metadata().as_ref().unwrap() - ); + assert!(!f1.metadata().is_empty()); + assert_eq!(f1.metadata(), f2.metadata()); // 3. Some + Some - let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( + let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata( [("foo".to_string(), "bar".to_string())] .iter() .cloned() .collect(), - )); - let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( + ); + let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata( [("foo2".to_string(), "bar2".to_string())] .iter() .cloned() .collect(), - )); + ); assert!(f1.try_merge(&f2).is_ok()); - assert!(f1.metadata().is_some()); + assert!(!f1.metadata().is_empty()); assert_eq!( - f1.metadata().cloned().unwrap(), + f1.metadata().clone(), [ ("foo".to_string(), "bar".to_string()), ("foo2".to_string(), "bar2".to_string()) @@ -617,17 +611,17 @@ mod tests { ); // 4. Some + None. - let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( + let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata( [("foo".to_string(), "bar".to_string())] .iter() .cloned() .collect(), - )); + ); let f2 = Field::new("first_name", DataType::Utf8, false); assert!(f1.try_merge(&f2).is_ok()); - assert!(f1.metadata().is_some()); + assert!(!f1.metadata().is_empty()); assert_eq!( - f1.metadata().cloned().unwrap(), + f1.metadata().clone(), [("foo".to_string(), "bar".to_string())] .iter() .cloned() @@ -638,7 +632,7 @@ mod tests { let mut f1 = Field::new("first_name", DataType::Utf8, false); let f2 = Field::new("first_name", DataType::Utf8, false); assert!(f1.try_merge(&f2).is_ok()); - assert!(f1.metadata().is_none()); + assert!(f1.metadata().is_empty()); } #[test] diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 35b70a0485cd..a720d439cc91 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -2021,7 +2021,7 @@ mod tests { .collect(); let schema_with_metadata = - Arc::new(Schema::new(vec![field.with_metadata(Some(metadata))])); + Arc::new(Schema::new(vec![field.with_metadata(metadata)])); assert_ne!(schema_with_metadata, schema_without_metadata); diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 2334a5601b4c..4ff9c7a39566 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
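+// Illustrative sketch only, not part of this patch: with `Field::metadata` now a
+// plain `BTreeMap<String, String>` instead of an `Option`, callers attach metadata
+// directly and an empty map stands in for the old `None`. The field name `col` is
+// made up for the example.
+//
+//     use std::collections::BTreeMap;
+//     let md: BTreeMap<String, String> =
+//         [("k".to_string(), "v".to_string())].into_iter().collect();
+//     let field = Field::new("col", DataType::Utf8, false).with_metadata(md);
+//     assert!(!field.metadata().is_empty());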
+use std::collections::BTreeMap; + use crate::arrow::schema::primitive::convert_primitive; use crate::arrow::ProjectionMask; use crate::basic::{ConvertedType, Repetition}; @@ -343,13 +345,17 @@ impl Visitor { (Some(key), Some(value)) => { let key_field = convert_field(map_key, &key, arrow_key); let value_field = convert_field(map_value, &value, arrow_value); + let field_metadata = match arrow_map { + Some(field) => field.metadata().clone(), + _ => BTreeMap::default(), + }; let map_field = Field::new( map_key_value.name(), DataType::Struct(vec![key_field, value_field]), false, // The inner map field is always non-nullable (#1697) ) - .with_metadata(arrow_map.and_then(|f| f.metadata().cloned())); + .with_metadata(field_metadata); Ok(Some(ParquetField { rep_level, @@ -539,7 +545,7 @@ fn convert_field( _ => Field::new(name, data_type, nullable), }; - field.with_metadata(hint.metadata().cloned()) + field.with_metadata(hint.metadata().clone()) } None => Field::new(name, data_type, nullable), } From 371ec57e370e76e8b690d404d9ca7a86c07a73e4 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Tue, 15 Nov 2022 14:34:37 -0500 Subject: [PATCH 0279/1411] Expose `SortingColumn` in parquet files (#3103) * Expose SortColumn from parquet file * fix formatting issues * empty commit * fix PR comments * formatting fix * add parquet round trip test * fix clippy error * update the test based on PR comment Co-authored-by: askoa --- parquet/src/file/metadata.rs | 21 ++++++++++-- parquet/src/file/properties.rs | 16 +++++++++ parquet/src/file/writer.rs | 61 ++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 0804890c22a0..895776a8a421 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -37,7 +37,7 @@ use std::sync::Arc; use crate::format::{ BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, - RowGroup, + RowGroup, SortingColumn, }; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; @@ -229,6 +229,7 @@ pub type RowGroupMetaDataPtr = Arc; pub struct RowGroupMetaData { columns: Vec, num_rows: i64, + sorting_columns: Option>, total_byte_size: i64, schema_descr: SchemaDescPtr, page_offset_index: Option>>, @@ -260,6 +261,11 @@ impl RowGroupMetaData { self.num_rows } + /// Returns the sort ordering of the rows in this RowGroup if any + pub fn sorting_columns(&self) -> Option<&Vec> { + self.sorting_columns.as_ref() + } + /// Total byte size of all uncompressed column data in this row group. 
pub fn total_byte_size(&self) -> i64 { self.total_byte_size @@ -303,9 +309,11 @@ impl RowGroupMetaData { let cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?; columns.push(cc); } + let sorting_columns = rg.sorting_columns; Ok(RowGroupMetaData { columns, num_rows, + sorting_columns, total_byte_size, schema_descr, page_offset_index: None, @@ -318,7 +326,7 @@ impl RowGroupMetaData { columns: self.columns().iter().map(|v| v.to_thrift()).collect(), total_byte_size: self.total_byte_size, num_rows: self.num_rows, - sorting_columns: None, + sorting_columns: self.sorting_columns().cloned(), file_offset: None, total_compressed_size: None, ordinal: None, @@ -331,6 +339,7 @@ pub struct RowGroupMetaDataBuilder { columns: Vec, schema_descr: SchemaDescPtr, num_rows: i64, + sorting_columns: Option>, total_byte_size: i64, page_offset_index: Option>>, } @@ -342,6 +351,7 @@ impl RowGroupMetaDataBuilder { columns: Vec::with_capacity(schema_descr.num_columns()), schema_descr, num_rows: 0, + sorting_columns: None, total_byte_size: 0, page_offset_index: None, } @@ -353,6 +363,12 @@ impl RowGroupMetaDataBuilder { self } + /// Sets the sorting order for columns + pub fn set_sorting_columns(mut self, value: Option>) -> Self { + self.sorting_columns = value; + self + } + /// Sets total size in bytes for this row group. pub fn set_total_byte_size(mut self, value: i64) -> Self { self.total_byte_size = value; @@ -384,6 +400,7 @@ impl RowGroupMetaDataBuilder { Ok(RowGroupMetaData { columns: self.columns, num_rows: self.num_rows, + sorting_columns: self.sorting_columns, total_byte_size: self.total_byte_size, schema_descr: self.schema_descr, page_offset_index: self.page_offset_index, diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index cf821df2110f..c65ba8035ee6 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -69,6 +69,7 @@ use std::{collections::HashMap, sync::Arc}; use crate::basic::{Compression, Encoding}; use crate::compression::{CodecOptions, CodecOptionsBuilder}; use crate::file::metadata::KeyValue; +use crate::format::SortingColumn; use crate::schema::types::ColumnPath; const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; @@ -121,6 +122,7 @@ pub struct WriterProperties { pub(crate) key_value_metadata: Option>, default_column_properties: ColumnProperties, column_properties: HashMap, + sorting_columns: Option>, } impl WriterProperties { @@ -182,6 +184,11 @@ impl WriterProperties { self.key_value_metadata.as_ref() } + /// Returns sorting columns. + pub fn sorting_columns(&self) -> Option<&Vec> { + self.sorting_columns.as_ref() + } + /// Returns encoding for a data page, when dictionary encoding is enabled. /// This is not configurable. 
#[inline] @@ -262,6 +269,7 @@ pub struct WriterPropertiesBuilder { key_value_metadata: Option>, default_column_properties: ColumnProperties, column_properties: HashMap, + sorting_columns: Option>, } impl WriterPropertiesBuilder { @@ -278,6 +286,7 @@ impl WriterPropertiesBuilder { key_value_metadata: None, default_column_properties: ColumnProperties::new(), column_properties: HashMap::new(), + sorting_columns: None, } } @@ -294,6 +303,7 @@ impl WriterPropertiesBuilder { key_value_metadata: self.key_value_metadata, default_column_properties: self.default_column_properties, column_properties: self.column_properties, + sorting_columns: self.sorting_columns, } } @@ -370,6 +380,12 @@ impl WriterPropertiesBuilder { self } + /// Sets sorting order of rows in the row group if any + pub fn set_sorting_columns(mut self, value: Option>) -> Self { + self.sorting_columns = value; + self + } + // ---------------------------------------------------------------------- // Setters for any column (global) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 2efaf7cafc2e..b67bdccfe39d 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -434,6 +434,7 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { .set_column_metadata(column_chunks) .set_total_byte_size(self.total_bytes_written as i64) .set_num_rows(self.total_rows_written.unwrap_or(0) as i64) + .set_sorting_columns(self.props.sorting_columns().cloned()) .build()?; let metadata = Arc::new(row_group_metadata); @@ -653,6 +654,7 @@ mod tests { reader::{FileReader, SerializedFileReader, SerializedPageReader}, statistics::{from_thrift, to_thrift, Statistics}, }; + use crate::format::SortingColumn; use crate::record::RowAccessor; use crate::schema::types::{ColumnDescriptor, ColumnPath}; use crate::util::memory::ByteBufferPtr; @@ -844,6 +846,65 @@ mod tests { assert_eq!(read_field, &field); } + #[test] + fn test_file_writer_with_sorting_columns_metadata() { + let file = tempfile::tempfile().unwrap(); + + let schema = Arc::new( + types::Type::group_type_builder("schema") + .with_fields(&mut vec![ + Arc::new( + types::Type::primitive_type_builder("col1", Type::INT32) + .build() + .unwrap(), + ), + Arc::new( + types::Type::primitive_type_builder("col2", Type::INT32) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(), + ); + let expected_result = Some(vec![SortingColumn { + column_idx: 0, + descending: false, + nulls_first: true, + }]); + let props = Arc::new( + WriterProperties::builder() + .set_key_value_metadata(Some(vec![KeyValue::new( + "key".to_string(), + "value".to_string(), + )])) + .set_sorting_columns(expected_result.clone()) + .build(), + ); + let mut writer = + SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().expect("get row group writer"); + + let col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer.close().unwrap(); + + let col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer.close().unwrap(); + + row_group_writer.close().unwrap(); + writer.close().unwrap(); + + let reader = SerializedFileReader::new(file).unwrap(); + let result: Vec>> = reader + .metadata() + .row_groups() + .iter() + .map(|f| f.sorting_columns()) + .collect(); + // validate the sorting column read match the one written above + assert_eq!(expected_result.as_ref(), result[0]); + } + #[test] fn test_file_writer_empty_row_groups() { let file = tempfile::tempfile().unwrap(); From 
c99d2f333f9c1acda1aeaa646b04405c3eae5044 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 15 Nov 2022 15:09:58 -0700 Subject: [PATCH 0280/1411] Include field name in merge error message (#3113) * Include field name in merge error message * Lint Co-authored-by: Raphael Taylor-Davies --- arrow-schema/src/field.rs | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index ee6ece862da5..b1de65e557ff 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -267,14 +267,14 @@ impl Field { pub fn try_merge(&mut self, from: &Field) -> Result<(), ArrowError> { if from.dict_id != self.dict_id { return Err(ArrowError::SchemaError(format!( - "Fail to merge schema field because from dict_id = {} does not match {}", - from.dict_id, self.dict_id + "Fail to merge schema field '{}' because from dict_id = {} does not match {}", + self.name, from.dict_id, self.dict_id ))); } if from.dict_is_ordered != self.dict_is_ordered { return Err(ArrowError::SchemaError(format!( - "Fail to merge schema field because from dict_is_ordered = {} does not match {}", - from.dict_is_ordered, self.dict_is_ordered + "Fail to merge schema field '{}' because from dict_is_ordered = {} does not match {}", + self.name, from.dict_is_ordered, self.dict_is_ordered ))); } // merge metadata @@ -285,8 +285,8 @@ impl Field { if let Some(self_value) = self.metadata.get(key) { if self_value != from_value { return Err(ArrowError::SchemaError(format!( - "Fail to merge field due to conflicting metadata data value for key {}. - From value = {} does not match {}", key, from_value, self_value), + "Fail to merge field '{}' due to conflicting metadata data value for key {}. + From value = {} does not match {}", self.name, key, from_value, self_value), )); } } else { @@ -315,8 +315,8 @@ impl Field { } _ => { return Err(ArrowError::SchemaError( - format!("Fail to merge schema field because the from data_type = {} is not DataType::Struct", - from.data_type) + format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::Struct", + self.name, from.data_type) ))} }, DataType::Union(nested_fields, type_ids, _) => match &from.data_type { @@ -334,8 +334,8 @@ impl Field { // type id. 
if self_type_id != field_type_id { return Err(ArrowError::SchemaError( - format!("Fail to merge schema field because the self_type_id = {} does not equal field_type_id = {}", - self_type_id, field_type_id) + format!("Fail to merge schema field '{}' because the self_type_id = {} does not equal field_type_id = {}", + self.name, self_type_id, field_type_id) )); } @@ -352,8 +352,8 @@ impl Field { } _ => { return Err(ArrowError::SchemaError( - format!("Fail to merge schema field because the from data_type = {} is not DataType::Union", - from.data_type) + format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::Union", + self.name, from.data_type) )); } }, @@ -391,8 +391,8 @@ impl Field { | DataType::Decimal256(_, _) => { if self.data_type != from.data_type { return Err(ArrowError::SchemaError( - format!("Fail to merge schema field because the from data_type = {} does not equal {}", - from.data_type, self.data_type) + format!("Fail to merge schema field '{}' because the from data_type = {} does not equal {}", + self.name, from.data_type, self.data_type) )); } } @@ -443,6 +443,16 @@ mod test { use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; + #[test] + fn test_merge_incompatible_types() { + let mut field = Field::new("c1", DataType::Int64, false); + let result = field + .try_merge(&Field::new("c1", DataType::Float32, true)) + .expect_err("should fail") + .to_string(); + assert_eq!("Schema error: Fail to merge schema field 'c1' because the from data_type = Float32 does not equal Int64", result); + } + #[test] fn test_fields_with_dict_id() { let dict1 = Field::new_dict( From c95eb4c80a532653bc91e04e78814f1282c8d005 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Wed, 16 Nov 2022 09:10:36 +1100 Subject: [PATCH 0281/1411] Parse Time32/Time64 from formatted string (#3101) * Parse Time32/Time64 from formatted string * PR comments * PR comments refactoring --- arrow-cast/src/parse.rs | 420 +++++++++++++++++++++++++++++++++++++++- arrow-csv/src/reader.rs | 35 ++++ 2 files changed, 451 insertions(+), 4 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index b93d6c800240..6de336351426 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -132,6 +132,97 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { ))) } +/// Accepts a string in ISO8601 standard format and some +/// variants and converts it to nanoseconds since midnight. +/// +/// Examples of accepted inputs: +/// * `09:26:56.123 AM` +/// * `23:59:59` +/// * `6:00 pm` +// +/// Internally, this function uses the `chrono` library for the +/// time parsing +/// +/// ## Timezone / Offset Handling +/// +/// This function does not support parsing strings with a timezone +/// or offset specified, as it considers only time since midnight. +pub fn string_to_time_nanoseconds(s: &str) -> Result { + // colon count, presence of decimal, presence of whitespace + fn preprocess_time_string(string: &str) -> (usize, bool, bool) { + string + .as_bytes() + .iter() + .fold((0, false, false), |tup, char| match char { + b':' => (tup.0 + 1, tup.1, tup.2), + b'.' 
=> (tup.0, true, tup.2), + b' ' => (tup.0, tup.1, true), + _ => tup, + }) + } + + // Do a preprocess pass of the string to prune which formats to attempt parsing for + let formats: &[&str] = match preprocess_time_string(s.trim()) { + // 24-hour clock, with hour, minutes, seconds and fractions of a second specified + // Examples: + // * 09:50:12.123456789 + // * 9:50:12.123456789 + (2, true, false) => &["%H:%M:%S%.f", "%k:%M:%S%.f"], + + // 12-hour clock, with hour, minutes, seconds and fractions of a second specified + // Examples: + // * 09:50:12.123456789 PM + // * 09:50:12.123456789 pm + // * 9:50:12.123456789 AM + // * 9:50:12.123456789 am + (2, true, true) => &[ + "%I:%M:%S%.f %P", + "%I:%M:%S%.f %p", + "%l:%M:%S%.f %P", + "%l:%M:%S%.f %p", + ], + + // 24-hour clock, with hour, minutes and seconds specified + // Examples: + // * 09:50:12 + // * 9:50:12 + (2, false, false) => &["%H:%M:%S", "%k:%M:%S"], + + // 12-hour clock, with hour, minutes and seconds specified + // Examples: + // * 09:50:12 PM + // * 09:50:12 pm + // * 9:50:12 AM + // * 9:50:12 am + (2, false, true) => &["%I:%M:%S %P", "%I:%M:%S %p", "%l:%M:%S %P", "%l:%M:%S %p"], + + // 24-hour clock, with hour and minutes specified + // Examples: + // * 09:50 + // * 9:50 + (1, false, false) => &["%H:%M", "%k:%M"], + + // 12-hour clock, with hour and minutes specified + // Examples: + // * 09:50 PM + // * 09:50 pm + // * 9:50 AM + // * 9:50 am + (1, false, true) => &["%I:%M %P", "%I:%M %p", "%l:%M %P", "%l:%M %p"], + + _ => &[], + }; + + formats + .iter() + .find_map(|f| NaiveTime::parse_from_str(s, f).ok()) + .map(|nt| { + nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64 + }) + // Return generic error if failed to parse as unknown which format user intended for the string + .ok_or_else(|| ArrowError::CastError(format!("Error parsing '{}' as time", s))) +} + /// Specialized parsing implementations /// used by csv and json reader pub trait Parser: ArrowPrimitiveType { @@ -199,10 +290,76 @@ impl Parser for TimestampSecondType { } } -parser_primitive!(Time64NanosecondType); -parser_primitive!(Time64MicrosecondType); -parser_primitive!(Time32MillisecondType); -parser_primitive!(Time32SecondType); +impl Parser for Time64NanosecondType { + // Will truncate any fractions of a nanosecond + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + + nt.nanosecond() as i64, + ) + } +} + +impl Parser for Time64MicrosecondType { + // Will truncate any fractions of a microsecond + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .map(|nanos| nanos / 1_000) + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i64 * 1_000_000 + + nt.nanosecond() as i64 / 1_000, + ) + } +} + +impl Parser for Time32MillisecondType { + // Will truncate any fractions of a millisecond + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .map(|nanos| (nanos / 1_000_000) as i32) + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + 
nt.num_seconds_from_midnight() as i32 * 1_000 + + nt.nanosecond() as i32 / 1_000_000, + ) + } +} + +impl Parser for Time32SecondType { + // Will truncate any fractions of a second + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .map(|nanos| (nanos / 1_000_000_000) as i32) + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i32 + + nt.nanosecond() as i32 / 1_000_000_000, + ) + } +} /// Number of days between 0001-01-01 and 1970-01-01 const EPOCH_DAYS_FROM_CE: i32 = 719_163; @@ -411,4 +568,259 @@ mod tests { parse_timestamp("2020-09-08 13:42:29").unwrap() ); } + + #[test] + fn parse_time64_nanos() { + assert_eq!( + Time64NanosecondType::parse("02:10:01.1234567899999999"), + Some(7_801_123_456_789) + ); + assert_eq!( + Time64NanosecondType::parse("02:10:01.1234567"), + Some(7_801_123_456_700) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01.1234567"), + Some(7_801_123_456_700) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01.123456789 AM"), + Some(601_123_456_789) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01.123456789 am"), + Some(601_123_456_789) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01.12345678 PM"), + Some(51_001_123_456_780) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01.12345678 pm"), + Some(51_001_123_456_780) + ); + assert_eq!( + Time64NanosecondType::parse("02:10:01"), + Some(7_801_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01"), + Some(7_801_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01 AM"), + Some(601_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01 am"), + Some(601_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01 PM"), + Some(51_001_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01 pm"), + Some(51_001_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("02:10"), + Some(7_800_000_000_000) + ); + assert_eq!(Time64NanosecondType::parse("2:10"), Some(7_800_000_000_000)); + assert_eq!( + Time64NanosecondType::parse("12:10 AM"), + Some(600_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("12:10 am"), + Some(600_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10 PM"), + Some(51_000_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10 pm"), + Some(51_000_000_000_000) + ); + + // parse directly as nanoseconds + assert_eq!(Time64NanosecondType::parse("1"), Some(1)); + + // leap second + assert_eq!( + Time64NanosecondType::parse("23:59:60"), + Some(86_400_000_000_000) + ); + + // custom format + assert_eq!( + Time64NanosecondType::parse_formatted( + "02 - 10 - 01 - .1234567", + "%H - %M - %S - %.f" + ), + Some(7_801_123_456_700) + ); + } + + #[test] + fn parse_time64_micros() { + // expected formats + assert_eq!( + Time64MicrosecondType::parse("02:10:01.1234"), + Some(7_801_123_400) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01.1234"), + Some(7_801_123_400) + ); + assert_eq!( + Time64MicrosecondType::parse("12:10:01.123456 AM"), + Some(601_123_456) + ); + assert_eq!( + Time64MicrosecondType::parse("12:10:01.123456 am"), + Some(601_123_456) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01.12345 PM"), + Some(51_001_123_450) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01.12345 pm"), + Some(51_001_123_450) + ); + assert_eq!( + 
Time64MicrosecondType::parse("02:10:01"), + Some(7_801_000_000) + ); + assert_eq!(Time64MicrosecondType::parse("2:10:01"), Some(7_801_000_000)); + assert_eq!( + Time64MicrosecondType::parse("12:10:01 AM"), + Some(601_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("12:10:01 am"), + Some(601_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01 PM"), + Some(51_001_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01 pm"), + Some(51_001_000_000) + ); + assert_eq!(Time64MicrosecondType::parse("02:10"), Some(7_800_000_000)); + assert_eq!(Time64MicrosecondType::parse("2:10"), Some(7_800_000_000)); + assert_eq!(Time64MicrosecondType::parse("12:10 AM"), Some(600_000_000)); + assert_eq!(Time64MicrosecondType::parse("12:10 am"), Some(600_000_000)); + assert_eq!( + Time64MicrosecondType::parse("2:10 PM"), + Some(51_000_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10 pm"), + Some(51_000_000_000) + ); + + // parse directly as microseconds + assert_eq!(Time64MicrosecondType::parse("1"), Some(1)); + + // leap second + assert_eq!( + Time64MicrosecondType::parse("23:59:60"), + Some(86_400_000_000) + ); + + // custom format + assert_eq!( + Time64MicrosecondType::parse_formatted( + "02 - 10 - 01 - .1234", + "%H - %M - %S - %.f" + ), + Some(7_801_123_400) + ); + } + + #[test] + fn parse_time32_millis() { + // expected formats + assert_eq!(Time32MillisecondType::parse("02:10:01.1"), Some(7_801_100)); + assert_eq!(Time32MillisecondType::parse("2:10:01.1"), Some(7_801_100)); + assert_eq!( + Time32MillisecondType::parse("12:10:01.123 AM"), + Some(601_123) + ); + assert_eq!( + Time32MillisecondType::parse("12:10:01.123 am"), + Some(601_123) + ); + assert_eq!( + Time32MillisecondType::parse("2:10:01.12 PM"), + Some(51_001_120) + ); + assert_eq!( + Time32MillisecondType::parse("2:10:01.12 pm"), + Some(51_001_120) + ); + assert_eq!(Time32MillisecondType::parse("02:10:01"), Some(7_801_000)); + assert_eq!(Time32MillisecondType::parse("2:10:01"), Some(7_801_000)); + assert_eq!(Time32MillisecondType::parse("12:10:01 AM"), Some(601_000)); + assert_eq!(Time32MillisecondType::parse("12:10:01 am"), Some(601_000)); + assert_eq!(Time32MillisecondType::parse("2:10:01 PM"), Some(51_001_000)); + assert_eq!(Time32MillisecondType::parse("2:10:01 pm"), Some(51_001_000)); + assert_eq!(Time32MillisecondType::parse("02:10"), Some(7_800_000)); + assert_eq!(Time32MillisecondType::parse("2:10"), Some(7_800_000)); + assert_eq!(Time32MillisecondType::parse("12:10 AM"), Some(600_000)); + assert_eq!(Time32MillisecondType::parse("12:10 am"), Some(600_000)); + assert_eq!(Time32MillisecondType::parse("2:10 PM"), Some(51_000_000)); + assert_eq!(Time32MillisecondType::parse("2:10 pm"), Some(51_000_000)); + + // parse directly as milliseconds + assert_eq!(Time32MillisecondType::parse("1"), Some(1)); + + // leap second + assert_eq!(Time32MillisecondType::parse("23:59:60"), Some(86_400_000)); + + // custom format + assert_eq!( + Time32MillisecondType::parse_formatted( + "02 - 10 - 01 - .1", + "%H - %M - %S - %.f" + ), + Some(7_801_100) + ); + } + + #[test] + fn parse_time32_secs() { + // expected formats + assert_eq!(Time32SecondType::parse("02:10:01.1"), Some(7_801)); + assert_eq!(Time32SecondType::parse("02:10:01"), Some(7_801)); + assert_eq!(Time32SecondType::parse("2:10:01"), Some(7_801)); + assert_eq!(Time32SecondType::parse("12:10:01 AM"), Some(601)); + assert_eq!(Time32SecondType::parse("12:10:01 am"), Some(601)); + assert_eq!(Time32SecondType::parse("2:10:01 PM"), Some(51_001)); + 
assert_eq!(Time32SecondType::parse("2:10:01 pm"), Some(51_001)); + assert_eq!(Time32SecondType::parse("02:10"), Some(7_800)); + assert_eq!(Time32SecondType::parse("2:10"), Some(7_800)); + assert_eq!(Time32SecondType::parse("12:10 AM"), Some(600)); + assert_eq!(Time32SecondType::parse("12:10 am"), Some(600)); + assert_eq!(Time32SecondType::parse("2:10 PM"), Some(51_000)); + assert_eq!(Time32SecondType::parse("2:10 pm"), Some(51_000)); + + // parse directly as seconds + assert_eq!(Time32SecondType::parse("1"), Some(1)); + + // leap second + assert_eq!(Time32SecondType::parse("23:59:60"), Some(86400)); + + // custom format + assert_eq!( + Time32SecondType::parse_formatted("02 - 10 - 01", "%H - %M - %S"), + Some(7_801) + ); + } } diff --git a/arrow-csv/src/reader.rs b/arrow-csv/src/reader.rs index 0bf05960a37d..4200e9329c54 100644 --- a/arrow-csv/src/reader.rs +++ b/arrow-csv/src/reader.rs @@ -584,6 +584,24 @@ fn parse( i, datetime_format, ), + DataType::Time32(TimeUnit::Second) => { + build_primitive_array::(line_number, rows, i, None) + } + DataType::Time32(TimeUnit::Millisecond) => build_primitive_array::< + Time32MillisecondType, + >( + line_number, rows, i, None + ), + DataType::Time64(TimeUnit::Microsecond) => build_primitive_array::< + Time64MicrosecondType, + >( + line_number, rows, i, None + ), + DataType::Time64(TimeUnit::Nanosecond) => build_primitive_array::< + Time64NanosecondType, + >( + line_number, rows, i, None + ), DataType::Timestamp(TimeUnit::Microsecond, _) => { build_primitive_array::( line_number, @@ -1593,6 +1611,23 @@ mod tests { assert_eq!(parse_item::("1945-05-08").unwrap(), -9004); } + #[test] + fn parse_time() { + assert_eq!( + parse_item::("12:10:01.123456789 AM"), + Some(601_123_456_789) + ); + assert_eq!( + parse_item::("12:10:01.123456 am"), + Some(601_123_456) + ); + assert_eq!( + parse_item::("2:10:01.12 PM"), + Some(51_001_120) + ); + assert_eq!(parse_item::("2:10:01 pm"), Some(51_001)); + } + #[test] fn parse_date64() { assert_eq!(parse_item::("1970-01-01T00:00:00").unwrap(), 0); From 73d66d837c20e3b80a77fdad5018f7872de4ef9d Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 16 Nov 2022 11:04:40 +0800 Subject: [PATCH 0282/1411] parquet bloom filter part II: read sbbf bitset from row group reader, update API, and add cli demo (#3102) * add feature flag * add api * fix reading with chunk reader * refactor * add a binary to demo * add bin * remove unused * fix clippy * adjust byte size * update read method * parquet-show-bloom-filter with bloom feature required * remove extern crate * get rid of loop read * refactor to test * rework api * remove unused trait * update help --- parquet/Cargo.toml | 4 + parquet/README.md | 1 + parquet/src/bin/parquet-read.rs | 2 - parquet/src/bin/parquet-rowcount.rs | 1 - parquet/src/bin/parquet-schema.rs | 1 - parquet/src/bin/parquet-show-bloom-filter.rs | 110 ++++++++++++++++ parquet/src/bloom_filter/mod.rs | 124 +++++++++++++++---- parquet/src/file/reader.rs | 6 + parquet/src/file/serialized_reader.rs | 18 ++- 9 files changed, 235 insertions(+), 32 deletions(-) create mode 100644 parquet/src/bin/parquet-show-bloom-filter.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index a5d43bf54bfa..fc7c8218ad02 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -113,6 +113,10 @@ required-features = ["cli"] name = "parquet-fromcsv" required-features = ["arrow", "cli"] +[[bin]] +name = "parquet-show-bloom-filter" +required-features = ["cli", "bloom"] + [[bench]] name = "arrow_writer" required-features = ["arrow"] diff 
--git a/parquet/README.md b/parquet/README.md index d904fc64e744..c9245b082119 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -41,6 +41,7 @@ However, for historical reasons, this crate uses versions with major numbers gre The `parquet` crate provides the following features which may be enabled in your `Cargo.toml`: - `arrow` (default) - support for reading / writing [`arrow`](https://crates.io/crates/arrow) arrays to / from parquet +- `bloom` (default) - support for [split block bloom filter](https://github.com/apache/parquet-format/blob/master/BloomFilter.md) for reading from / writing to parquet - `async` - support `async` APIs for reading parquet - `json` - support for reading / writing `json` data to / from parquet - `brotli` (default) - support for parquet using `brotli` compression diff --git a/parquet/src/bin/parquet-read.rs b/parquet/src/bin/parquet-read.rs index cf8009956e2e..117f9ee0b17a 100644 --- a/parquet/src/bin/parquet-read.rs +++ b/parquet/src/bin/parquet-read.rs @@ -36,8 +36,6 @@ //! Note that `parquet-read` reads full file schema, no projection or filtering is //! applied. -extern crate parquet; - use clap::Parser; use parquet::file::reader::{FileReader, SerializedFileReader}; use parquet::record::Row; diff --git a/parquet/src/bin/parquet-rowcount.rs b/parquet/src/bin/parquet-rowcount.rs index 491f582c5103..5069d4b2543b 100644 --- a/parquet/src/bin/parquet-rowcount.rs +++ b/parquet/src/bin/parquet-rowcount.rs @@ -36,7 +36,6 @@ //! Note that `parquet-rowcount` reads full file schema, no projection or filtering is //! applied. -extern crate parquet; use clap::Parser; use parquet::file::reader::{FileReader, SerializedFileReader}; use std::{fs::File, path::Path}; diff --git a/parquet/src/bin/parquet-schema.rs b/parquet/src/bin/parquet-schema.rs index cd8e7692203d..ff7798a91cd3 100644 --- a/parquet/src/bin/parquet-schema.rs +++ b/parquet/src/bin/parquet-schema.rs @@ -36,7 +36,6 @@ //! Note that `verbose` is an optional boolean flag that allows to print schema only, //! when not provided or print full file metadata when provided. -extern crate parquet; use clap::Parser; use parquet::{ file::reader::{FileReader, SerializedFileReader}, diff --git a/parquet/src/bin/parquet-show-bloom-filter.rs b/parquet/src/bin/parquet-show-bloom-filter.rs new file mode 100644 index 000000000000..a4dbdbe67de8 --- /dev/null +++ b/parquet/src/bin/parquet-show-bloom-filter.rs @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Binary file to read bloom filter data from a Parquet file. +//! +//! # Install +//! +//! `parquet-show-bloom-filter` can be installed using `cargo`: +//! ``` +//! cargo install parquet --features=cli +//! ``` +//! After this `parquet-show-bloom-filter` should be available: +//! ``` +//! 
parquet-show-bloom-filter --file-name XYZ.parquet --column id --values a +//! ``` +//! +//! The binary can also be built from the source code and run as follows: +//! ``` +//! cargo run --features=cli --bin parquet-show-bloom-filter -- --file-name XYZ.parquet --column id --values a +//! ``` + +use clap::Parser; +use parquet::file::reader::{FileReader, SerializedFileReader}; +use std::{fs::File, path::Path}; + +#[derive(Debug, Parser)] +#[clap(author, version, about("Binary file to read bloom filter data from a Parquet file"), long_about = None)] +struct Args { + #[clap(short, long, help("Path to the parquet file"))] + file_name: String, + #[clap( + short, + long, + help("Check the bloom filter indexes for the given column") + )] + column: String, + #[clap( + short, + long, + help("Check if the given values match bloom filter, the values will be evaluated as strings"), + required = true + )] + values: Vec, +} + +fn main() { + let args = Args::parse(); + let file_name = args.file_name; + let path = Path::new(&file_name); + let file = File::open(path).expect("Unable to open file"); + + let file_reader = + SerializedFileReader::new(file).expect("Unable to open file as Parquet"); + let metadata = file_reader.metadata(); + for (ri, row_group) in metadata.row_groups().iter().enumerate() { + println!("Row group #{}", ri); + println!("{}", "=".repeat(80)); + if let Some((column_index, _)) = row_group + .columns() + .iter() + .enumerate() + .find(|(_, column)| column.column_path().string() == args.column) + { + let row_group_reader = file_reader + .get_row_group(ri) + .expect("Unable to read row group"); + if let Some(sbbf) = row_group_reader + .get_column_bloom_filter(column_index) + .expect("Failed to parse bloom filter") + { + args.values.iter().for_each(|value| { + println!( + "Value {} is {} in bloom filter", + value, + if sbbf.check(value.as_str()) { + "present" + } else { + "absent" + } + ) + }); + } + } else { + println!( + "No column named {} found, candidate columns are: {}", + args.column, + row_group + .columns() + .iter() + .map(|c| c.column_path().string()) + .collect::>() + .join(", ") + ); + } + } +} diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index adfd87307ac6..4944a93f8484 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -18,13 +18,16 @@ //! Bloom filter implementation specific to Parquet, as described //! in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md) +use crate::data_type::AsBytes; use crate::errors::ParquetError; use crate::file::metadata::ColumnChunkMetaData; +use crate::file::reader::ChunkReader; use crate::format::{ BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash, BloomFilterHeader, }; +use bytes::{Buf, Bytes}; use std::hash::Hasher; -use std::io::{Read, Seek, SeekFrom}; +use std::sync::Arc; use thrift::protocol::{TCompactInputProtocol, TSerializable}; use twox_hash::XxHash64; @@ -79,6 +82,37 @@ fn block_check(block: &Block, hash: u32) -> bool { /// A split block Bloom filter pub struct Sbbf(Vec); +const SBBF_HEADER_SIZE_ESTIMATE: usize = 20; + +/// given an initial offset, and a [ChunkReader], try to read out a bloom filter header and return +/// both the header and the offset after it (for bitset). 
+fn chunk_read_bloom_filter_header_and_offset( + offset: u64, + reader: Arc, +) -> Result<(BloomFilterHeader, u64), ParquetError> { + let buffer = reader.get_bytes(offset as u64, SBBF_HEADER_SIZE_ESTIMATE)?; + let (header, length) = read_bloom_filter_header_and_length(buffer)?; + Ok((header, offset + length)) +} + +/// given a [Bytes] buffer, try to read out a bloom filter header and return both the header and +/// length of the header. +#[inline] +fn read_bloom_filter_header_and_length( + buffer: Bytes, +) -> Result<(BloomFilterHeader, u64), ParquetError> { + let total_length = buffer.len(); + let mut buf_reader = buffer.reader(); + let mut prot = TCompactInputProtocol::new(&mut buf_reader); + let header = BloomFilterHeader::read_from_in_protocol(&mut prot).map_err(|e| { + ParquetError::General(format!("Could not read bloom filter header: {}", e)) + })?; + Ok(( + header, + (total_length - buf_reader.into_inner().remaining()) as u64, + )) +} + impl Sbbf { fn new(bitset: &[u8]) -> Self { let data = bitset @@ -94,17 +128,20 @@ impl Sbbf { Self(data) } - pub fn read_from_column_chunk( + pub fn read_from_column_chunk( column_metadata: &ColumnChunkMetaData, - mut reader: &mut R, - ) -> Result { - let offset = column_metadata.bloom_filter_offset().ok_or_else(|| { - ParquetError::General("Bloom filter offset is not set".to_string()) - })? as u64; - reader.seek(SeekFrom::Start(offset))?; - // deserialize header - let mut prot = TCompactInputProtocol::new(&mut reader); - let header = BloomFilterHeader::read_from_in_protocol(&mut prot)?; + reader: Arc, + ) -> Result, ParquetError> { + let offset: u64 = if let Some(offset) = column_metadata.bloom_filter_offset() { + offset.try_into().map_err(|_| { + ParquetError::General("Bloom filter offset is invalid".to_string()) + })? + } else { + return Ok(None); + }; + + let (header, bitset_offset) = + chunk_read_bloom_filter_header_and_offset(offset, reader.clone())?; match header.algorithm { BloomFilterAlgorithm::BLOCK(_) => { @@ -125,11 +162,8 @@ impl Sbbf { let length: usize = header.num_bytes.try_into().map_err(|_| { ParquetError::General("Bloom filter length is invalid".to_string()) })?; - let mut buffer = vec![0_u8; length]; - reader.read_exact(&mut buffer).map_err(|e| { - ParquetError::General(format!("Could not read bloom filter: {}", e)) - })?; - Ok(Self::new(&buffer)) + let bitset = reader.get_bytes(bitset_offset, length)?; + Ok(Some(Self::new(&bitset))) } #[inline] @@ -139,17 +173,27 @@ impl Sbbf { (((hash >> 32).saturating_mul(self.0.len() as u64)) >> 32) as usize } + /// Insert an [AsBytes] value into the filter + pub fn insert(&mut self, value: T) { + self.insert_hash(hash_as_bytes(value)); + } + /// Insert a hash into the filter - pub fn insert(&mut self, hash: u64) { + fn insert_hash(&mut self, hash: u64) { let block_index = self.hash_to_block_index(hash); let block = &mut self.0[block_index]; block_insert(block, hash as u32); } + /// Check if an [AsBytes] value is probably present or definitely absent in the filter + pub fn check(&self, value: T) -> bool { + self.check_hash(hash_as_bytes(value)) + } + /// Check if a hash is in the filter. May return /// true for values that was never inserted ("false positive") /// but will always return false if a hash has not been inserted. 
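    // A rough sketch of the lookup path, assuming a filter of `n` blocks (the variable
    // names below are illustrative, not part of the public API):
    //
    //   let hash = hash_as_bytes(value);               // xxHash64 with seed 0, see below
    //   let block = ((hash >> 32) * n as u64) >> 32;   // multiply-shift maps the upper
    //                                                  // 32 bits onto the range 0..n
    //   // block_check then derives one bit position in each of the block's eight
    //   // 32-bit words from the lower 32 bits of the hash; the value is reported as
    //   // "probably present" only if every one of those bits is set.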
- pub fn check(&self, hash: u64) -> bool { + fn check_hash(&self, hash: u64) -> bool { let block_index = self.hash_to_block_index(hash); let block = &self.0[block_index]; block_check(block, hash as u32) @@ -159,19 +203,24 @@ impl Sbbf { // per spec we use xxHash with seed=0 const SEED: u64 = 0; -pub fn hash_bytes>(value: A) -> u64 { +#[inline] +fn hash_as_bytes(value: A) -> u64 { let mut hasher = XxHash64::with_seed(SEED); - hasher.write(value.as_ref()); + hasher.write(value.as_bytes()); hasher.finish() } #[cfg(test)] mod tests { use super::*; + use crate::format::{ + BloomFilterAlgorithm, BloomFilterCompression, SplitBlockAlgorithm, Uncompressed, + XxHash, + }; #[test] fn test_hash_bytes() { - assert_eq!(hash_bytes(b""), 17241709254077376921); + assert_eq!(hash_as_bytes(""), 17241709254077376921); } #[test] @@ -210,8 +259,37 @@ mod tests { let sbbf = Sbbf::new(bitset); for a in 0..10i64 { let value = format!("a{}", a); - let hash = hash_bytes(value); - assert!(sbbf.check(hash)); + assert!(sbbf.check(value.as_str())); } } + + /// test the assumption that bloom filter header size should not exceed SBBF_HEADER_SIZE_ESTIMATE + /// essentially we are testing that the struct is packed with 4 i32 fields, each can be 1-5 bytes + /// so altogether it'll be 20 bytes at most. + #[test] + fn test_bloom_filter_header_size_assumption() { + let buffer: &[u8; 16] = + &[21, 64, 28, 28, 0, 0, 28, 28, 0, 0, 28, 28, 0, 0, 0, 99]; + let ( + BloomFilterHeader { + algorithm, + compression, + hash, + num_bytes, + }, + read_length, + ) = read_bloom_filter_header_and_length(Bytes::copy_from_slice(buffer)).unwrap(); + assert_eq!(read_length, 15); + assert_eq!( + algorithm, + BloomFilterAlgorithm::BLOCK(SplitBlockAlgorithm {}) + ); + assert_eq!( + compression, + BloomFilterCompression::UNCOMPRESSED(Uncompressed {}) + ); + assert_eq!(hash, BloomFilterHash::XXHASH(XxHash {})); + assert_eq!(num_bytes, 32_i32); + assert_eq!(20, SBBF_HEADER_SIZE_ESTIMATE); + } } diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs index 70ff37a41e15..325944c2168b 100644 --- a/parquet/src/file/reader.rs +++ b/parquet/src/file/reader.rs @@ -21,6 +21,8 @@ use bytes::Bytes; use std::{boxed::Box, io::Read, sync::Arc}; +#[cfg(feature = "bloom")] +use crate::bloom_filter::Sbbf; use crate::column::page::PageIterator; use crate::column::{page::PageReader, reader::ColumnReader}; use crate::errors::{ParquetError, Result}; @@ -143,6 +145,10 @@ pub trait RowGroupReader: Send + Sync { Ok(col_reader) } + #[cfg(feature = "bloom")] + /// Get bloom filter for the `i`th column chunk, if present. + fn get_column_bloom_filter(&self, i: usize) -> Result>; + /// Get iterator of `Row`s from this row group. 
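    // A minimal usage sketch for the new accessor (assumes the `bloom` feature is
    // enabled and the file was written with bloom filters; mirrors the CLI demo above):
    //
    //   let reader = SerializedFileReader::new(file)?;
    //   if let Some(sbbf) = reader.get_row_group(0)?.get_column_bloom_filter(0)? {
    //       let probably_present = sbbf.check("some value");
    //   }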
/// /// Projected schema can be a subset of or equal to the file schema, when it is None, diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index ebe87aca6d5e..cb39dd194872 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -22,11 +22,9 @@ use std::collections::VecDeque; use std::io::Cursor; use std::{convert::TryFrom, fs::File, io::Read, path::Path, sync::Arc}; -use crate::format::{PageHeader, PageLocation, PageType}; -use bytes::{Buf, Bytes}; -use thrift::protocol::{TCompactInputProtocol, TSerializable}; - use crate::basic::{Encoding, Type}; +#[cfg(feature = "bloom")] +use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; use crate::errors::{ParquetError, Result}; @@ -38,11 +36,14 @@ use crate::file::{ reader::*, statistics, }; +use crate::format::{PageHeader, PageLocation, PageType}; use crate::record::reader::RowIter; use crate::record::Row; use crate::schema::types::Type as SchemaType; use crate::util::{io::TryClone, memory::ByteBufferPtr}; -// export `SliceableCursor` and `FileSource` publically so clients can +use bytes::{Buf, Bytes}; +use thrift::protocol::{TCompactInputProtocol, TSerializable}; +// export `SliceableCursor` and `FileSource` publicly so clients can // re-use the logic in their own ParquetFileWriter wrappers pub use crate::util::io::FileSource; @@ -387,6 +388,13 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<' )?)) } + #[cfg(feature = "bloom")] + /// get bloom filter for the `i`th column + fn get_column_bloom_filter(&self, i: usize) -> Result> { + let col = self.metadata.column(i); + Sbbf::read_from_column_chunk(col, self.chunk_reader.clone()) + } + fn get_row_iter(&self, projection: Option) -> Result { RowIter::from_row_group(projection, self) } From e55b95e8db04bd792e2f4702b926bbb0cfc8a3df Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Wed, 16 Nov 2022 23:58:15 +0530 Subject: [PATCH 0283/1411] Clippy parquet fixes (#3124) * Minor paruqet code cleanup * Minor arrow-json code cleanup --- arrow-json/src/reader.rs | 2 -- parquet/src/data_type.rs | 24 ------------------------ parquet/src/encodings/decoding.rs | 15 ++++++++------- parquet/src/record/api.rs | 11 ++++++----- 4 files changed, 14 insertions(+), 38 deletions(-) diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index b3af909ef46f..860e6b58c4ac 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -909,7 +909,6 @@ impl Decoder { } #[inline(always)] - #[allow(clippy::unnecessary_wraps)] fn build_string_dictionary_builder( &self, row_len: usize, @@ -983,7 +982,6 @@ impl Decoder { Ok(Arc::new(builder.finish())) } - #[allow(clippy::unnecessary_wraps)] fn build_primitive_array( &self, rows: &[Value], diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 9cd36cf43dc8..3e423a41562a 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -1291,35 +1291,11 @@ macro_rules! 
ensure_phys_ty { } #[cfg(test)] -#[allow(clippy::float_cmp, clippy::approx_constant)] mod tests { use super::*; #[test] - #[allow(clippy::string_lit_as_bytes)] fn test_as_bytes() { - assert_eq!(false.as_bytes(), &[0]); - assert_eq!(true.as_bytes(), &[1]); - assert_eq!(7_i32.as_bytes(), &[7, 0, 0, 0]); - assert_eq!(555_i32.as_bytes(), &[43, 2, 0, 0]); - assert_eq!(555_u32.as_bytes(), &[43, 2, 0, 0]); - assert_eq!(i32::max_value().as_bytes(), &[255, 255, 255, 127]); - assert_eq!(i32::min_value().as_bytes(), &[0, 0, 0, 128]); - assert_eq!(7_i64.as_bytes(), &[7, 0, 0, 0, 0, 0, 0, 0]); - assert_eq!(555_i64.as_bytes(), &[43, 2, 0, 0, 0, 0, 0, 0]); - assert_eq!( - (i64::max_value()).as_bytes(), - &[255, 255, 255, 255, 255, 255, 255, 127] - ); - assert_eq!((i64::min_value()).as_bytes(), &[0, 0, 0, 0, 0, 0, 0, 128]); - assert_eq!(3.14_f32.as_bytes(), &[195, 245, 72, 64]); - assert_eq!(3.14_f64.as_bytes(), &[31, 133, 235, 81, 184, 30, 9, 64]); - assert_eq!("hello".as_bytes(), &[b'h', b'e', b'l', b'l', b'o']); - assert_eq!( - Vec::from("hello".as_bytes()).as_bytes(), - &[b'h', b'e', b'l', b'l', b'o'] - ); - // Test Int96 let i96 = Int96::from(vec![1, 2, 3]); assert_eq!(i96.as_bytes(), &[1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0]); diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 86941ffe0eeb..bbc119c361d8 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -1069,10 +1069,11 @@ impl Decoder for DeltaByteArrayDecoder { } #[cfg(test)] -#[allow(clippy::approx_constant)] mod tests { use super::{super::encoding::*, *}; + use std::f32::consts::PI as PI_f32; + use std::f64::consts::PI as PI_f64; use std::sync::Arc; use crate::schema::types::{ @@ -1214,7 +1215,7 @@ mod tests { #[test] fn test_plain_decode_float() { - let data = vec![3.14, 2.414, 12.51]; + let data = vec![PI_f32, 2.414, 12.51]; let data_bytes = FloatType::to_byte_array(&data[..]); let mut buffer = vec![0.0; 3]; test_plain_decode::( @@ -1228,7 +1229,7 @@ mod tests { #[test] fn test_plain_skip_float() { - let data = vec![3.14, 2.414, 12.51]; + let data = vec![PI_f32, 2.414, 12.51]; let data_bytes = FloatType::to_byte_array(&data[..]); test_plain_skip::( ByteBufferPtr::new(data_bytes), @@ -1241,14 +1242,14 @@ mod tests { #[test] fn test_plain_skip_all_float() { - let data = vec![3.14, 2.414, 12.51]; + let data = vec![PI_f32, 2.414, 12.51]; let data_bytes = FloatType::to_byte_array(&data[..]); test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 4, -1, &[]); } #[test] fn test_plain_skip_double() { - let data = vec![3.14f64, 2.414f64, 12.51f64]; + let data = vec![PI_f64, 2.414f64, 12.51f64]; let data_bytes = DoubleType::to_byte_array(&data[..]); test_plain_skip::( ByteBufferPtr::new(data_bytes), @@ -1261,14 +1262,14 @@ mod tests { #[test] fn test_plain_skip_all_double() { - let data = vec![3.14f64, 2.414f64, 12.51f64]; + let data = vec![PI_f64, 2.414f64, 12.51f64]; let data_bytes = DoubleType::to_byte_array(&data[..]); test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 5, -1, &[]); } #[test] fn test_plain_decode_double() { - let data = vec![3.14f64, 2.414f64, 12.51f64]; + let data = vec![PI_f64, 2.414f64, 12.51f64]; let data_bytes = DoubleType::to_byte_array(&data[..]); let mut buffer = vec![0.0f64; 3]; test_plain_decode::( diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index 02cb94765cf6..b64ff51eea84 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -859,10 +859,11 @@ fn convert_decimal_to_string(decimal: &Decimal) -> String { } 
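// These hunks replace approximate constants because clippy::approx_constant flags
// literals that look like truncated versions of well-known constants; a minimal sketch
// of the before/after (illustrative values only):
//
//   let bad = 3.1415_f64;               // triggers clippy::approx_constant
//   let good = std::f64::consts::PI;    // use the std constant instead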
#[cfg(test)] -#[allow(clippy::approx_constant, clippy::many_single_char_names)] +#[allow(clippy::many_single_char_names)] mod tests { use super::*; + use std::f64::consts::PI; use std::sync::Arc; use crate::schema::types::{ColumnDescriptor, ColumnPath, PrimitiveTypeBuilder}; @@ -1581,8 +1582,8 @@ mod tests { ]); assert!((10.3 - list.get_float(2).unwrap()).abs() < f32::EPSILON); - let list = make_list(vec![Field::Double(3.1415)]); - assert!((3.1415 - list.get_double(0).unwrap()).abs() < f64::EPSILON); + let list = make_list(vec![Field::Double(PI)]); + assert!((PI - list.get_double(0).unwrap()).abs() < f64::EPSILON); let list = make_list(vec![Field::Str("abc".to_string())]); assert_eq!(&"abc".to_string(), list.get_string(0).unwrap()); @@ -1631,7 +1632,7 @@ mod tests { ]); assert!(list.get_double(2).is_err()); - let list = make_list(vec![Field::Double(3.1415)]); + let list = make_list(vec![Field::Double(PI)]); assert!(list.get_string(0).is_err()); let list = make_list(vec![Field::Str("abc".to_string())]); @@ -1832,7 +1833,7 @@ mod tests { } #[cfg(test)] -#[allow(clippy::approx_constant, clippy::many_single_char_names)] +#[allow(clippy::many_single_char_names)] mod api_tests { use super::{make_list, make_map, make_row}; use crate::record::Field; From 2a065bee362cdb27a472cb4e665dd3e1d1a9a500 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 18 Nov 2022 08:02:47 +1300 Subject: [PATCH 0284/1411] Bump actions/labeler from 4.0.2 to 4.1.0 (#3129) Bumps [actions/labeler](https://github.com/actions/labeler) from 4.0.2 to 4.1.0. - [Release notes](https://github.com/actions/labeler/releases) - [Commits](https://github.com/actions/labeler/compare/v4.0.2...4.1.0) --- updated-dependencies: - dependency-name: actions/labeler dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dev_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 2f13b726bcfa..5f84affbc52d 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -40,7 +40,7 @@ jobs: github.event_name == 'pull_request_target' && (github.event.action == 'opened' || github.event.action == 'synchronize') - uses: actions/labeler@v4.0.2 + uses: actions/labeler@4.1.0 with: repo-token: ${{ secrets.GITHUB_TOKEN }} configuration-path: .github/workflows/dev_pr/labeler.yml From 5bce1044f6ae3d64117b2f692a427af7e9d06029 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 17 Nov 2022 12:44:17 -0800 Subject: [PATCH 0285/1411] Add COW conversion for Buffer and PrimitiveArray and unary_mut (#3115) * Add some APIs for copy-on-write support * Update * Add unary_mut as an example * For review * For review * For review * Fix test and more for review * Add test on sliced array * Address an overlooking review. 
* For review --- arrow-array/src/array/mod.rs | 12 ++ arrow-array/src/array/primitive_array.rs | 161 +++++++++++++++++- .../src/builder/boolean_buffer_builder.rs | 5 + arrow-array/src/builder/buffer_builder.rs | 9 + .../src/builder/null_buffer_builder.rs | 25 ++- arrow-array/src/builder/primitive_builder.rs | 24 +++ arrow-buffer/src/buffer/immutable.rs | 19 +++ arrow-buffer/src/buffer/mutable.rs | 19 +++ arrow-buffer/src/bytes.rs | 5 + 9 files changed, 277 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 41aa438c9fb3..307753a7117e 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -862,6 +862,7 @@ where #[cfg(test)] mod tests { use super::*; + use crate::cast::downcast_array; use arrow_schema::Field; #[test] @@ -1113,4 +1114,15 @@ mod tests { assert!(compute_my_thing(&arr)); assert!(compute_my_thing(arr.as_ref())); } + + #[test] + fn test_downcast_array() { + let array: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect(); + + let boxed: ArrayRef = Arc::new(array); + let array: Int32Array = downcast_array(&boxed); + + let expected: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect(); + assert_eq!(array, expected); + } } diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 7cf7de721611..195e2dc19a1a 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -397,6 +397,36 @@ impl PrimitiveArray { unsafe { build_primitive_array(len, buffer, null_count, null_buffer) } } + /// Applies an unary and infallible function to a mutable primitive array. + /// Mutable primitive array means that the buffer is not shared with other arrays. + /// As a result, this mutates the buffer directly without allocating new buffer. + /// + /// # Implementation + /// + /// This will apply the function for all values, including those on null slots. + /// This implies that the operation must be infallible for any value of the corresponding type + /// or this function may panic. + /// # Example + /// ```rust + /// # use arrow_array::{Int32Array, types::Int32Type}; + /// # fn main() { + /// let array = Int32Array::from(vec![Some(5), Some(7), None]); + /// let c = array.unary_mut(|x| x * 2 + 1).unwrap(); + /// assert_eq!(c, Int32Array::from(vec![Some(11), Some(15), None])); + /// # } + /// ``` + pub fn unary_mut(self, op: F) -> Result, PrimitiveArray> + where + F: Fn(T::Native) -> T::Native, + { + let mut builder = self.into_builder()?; + builder + .values_slice_mut() + .iter_mut() + .for_each(|v| *v = op(*v)); + Ok(builder.finish()) + } + /// Applies a unary and fallible function to all valid values in a primitive array /// /// This is unlike [`Self::unary`] which will apply an infallible function to all rows @@ -489,6 +519,66 @@ impl PrimitiveArray { ) } } + + /// Returns `PrimitiveBuilder` of this primitive array for mutating its values if the underlying + /// data buffer is not shared by others. 
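    // A sketch of the ownership rule `into_builder` relies on (mirrors the tests added
    // below; values are illustrative): the value and null buffers are Arc-backed, so the
    // conversion succeeds only when no other array still references them.
    //
    //   let array = Int32Array::from(vec![1, 2, 3]);
    //   let mut builder = array.into_builder().unwrap();       // sole owner -> Ok
    //   builder.values_slice_mut()[0] = 42;
    //
    //   let array = Int32Array::from(vec![1, 2, 3]);
    //   let _shared = Int32Array::from(array.data().clone());  // buffers now shared
    //   assert!(array.into_builder().is_err());                // original handed back in Err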
+ pub fn into_builder(self) -> Result, Self> { + let len = self.len(); + let null_bit_buffer = self + .data + .null_buffer() + .map(|b| b.bit_slice(self.data.offset(), len)); + + let element_len = std::mem::size_of::(); + let buffer = self.data.buffers()[0] + .slice_with_length(self.data.offset() * element_len, len * element_len); + + drop(self.data); + + let try_mutable_null_buffer = match null_bit_buffer { + None => Ok(None), + Some(null_buffer) => { + // Null buffer exists, tries to make it mutable + null_buffer.into_mutable().map(Some) + } + }; + + let try_mutable_buffers = match try_mutable_null_buffer { + Ok(mutable_null_buffer) => { + // Got mutable null buffer, tries to get mutable value buffer + let try_mutable_buffer = buffer.into_mutable(); + + // try_mutable_buffer.map(...).map_err(...) doesn't work as the compiler complains + // mutable_null_buffer is moved into map closure. + match try_mutable_buffer { + Ok(mutable_buffer) => Ok(PrimitiveBuilder::::new_from_buffer( + mutable_buffer, + mutable_null_buffer, + )), + Err(buffer) => Err((buffer, mutable_null_buffer.map(|b| b.into()))), + } + } + Err(mutable_null_buffer) => { + // Unable to get mutable null buffer + Err((buffer, Some(mutable_null_buffer))) + } + }; + + match try_mutable_buffers { + Ok(builder) => Ok(builder), + Err((buffer, null_bit_buffer)) => { + let builder = ArrayData::builder(T::DATA_TYPE) + .len(len) + .add_buffer(buffer) + .null_bit_buffer(null_bit_buffer); + + let array_data = unsafe { builder.build_unchecked() }; + let array = PrimitiveArray::::from(array_data); + + Err(array) + } + } + } } #[inline] @@ -1036,7 +1126,9 @@ impl PrimitiveArray { mod tests { use super::*; use crate::builder::{Decimal128Builder, Decimal256Builder}; - use crate::BooleanArray; + use crate::cast::downcast_array; + use crate::{ArrayRef, BooleanArray}; + use std::sync::Arc; #[test] fn test_primitive_array_from_vec() { @@ -1939,4 +2031,71 @@ mod tests { array.value(4); } + + #[test] + fn test_into_builder() { + let array: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect(); + + let boxed: ArrayRef = Arc::new(array); + let col: Int32Array = downcast_array(&boxed); + drop(boxed); + + let mut builder = col.into_builder().unwrap(); + + let slice = builder.values_slice_mut(); + assert_eq!(slice, &[1, 2, 3]); + + slice[0] = 4; + slice[1] = 2; + slice[2] = 1; + + let expected: Int32Array = vec![Some(4), Some(2), Some(1)].into_iter().collect(); + + let new_array = builder.finish(); + assert_eq!(expected, new_array); + } + + #[test] + fn test_into_builder_cloned_array() { + let array: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect(); + + let boxed: ArrayRef = Arc::new(array); + + let col: Int32Array = PrimitiveArray::::from(boxed.data().clone()); + let err = col.into_builder(); + + match err { + Ok(_) => panic!("Should not get builder from cloned array"), + Err(returned) => { + let expected: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect(); + assert_eq!(expected, returned) + } + } + } + + #[test] + fn test_into_builder_on_sliced_array() { + let array: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect(); + let slice = array.slice(1, 2); + let col: Int32Array = downcast_array(&slice); + + drop(slice); + + col.into_builder() + .expect_err("Should not build builder from sliced array"); + } + + #[test] + fn test_unary_mut() { + let array: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect(); + + let c = array.unary_mut(|x| x * 2 + 1).unwrap(); + let expected: Int32Array = vec![3, 5, 
7].into_iter().map(Some).collect(); + + assert_eq!(expected, c); + + let array: Int32Array = Int32Array::from(vec![Some(5), Some(7), None]); + let c = array.unary_mut(|x| x * 2 + 1).unwrap(); + assert_eq!(c, Int32Array::from(vec![Some(11), Some(15), None])); + } } diff --git a/arrow-array/src/builder/boolean_buffer_builder.rs b/arrow-array/src/builder/boolean_buffer_builder.rs index 16c6750d1d9f..2ab01ccfe40b 100644 --- a/arrow-array/src/builder/boolean_buffer_builder.rs +++ b/arrow-array/src/builder/boolean_buffer_builder.rs @@ -33,6 +33,11 @@ impl BooleanBufferBuilder { Self { buffer, len: 0 } } + pub fn new_from_buffer(buffer: MutableBuffer, len: usize) -> Self { + assert!(len <= buffer.len() * 8); + Self { buffer, len } + } + #[inline] pub fn len(&self) -> usize { self.len diff --git a/arrow-array/src/builder/buffer_builder.rs b/arrow-array/src/builder/buffer_builder.rs index 2da11cb23203..d3146366d512 100644 --- a/arrow-array/src/builder/buffer_builder.rs +++ b/arrow-array/src/builder/buffer_builder.rs @@ -124,6 +124,15 @@ impl BufferBuilder { } } + pub fn new_from_buffer(buffer: MutableBuffer) -> Self { + let buffer_len = buffer.len(); + Self { + buffer, + len: buffer_len / std::mem::size_of::(), + _marker: PhantomData, + } + } + /// Returns the current number of array elements in the internal buffer. /// /// # Example: diff --git a/arrow-array/src/builder/null_buffer_builder.rs b/arrow-array/src/builder/null_buffer_builder.rs index b2aa622ca7a4..fef7214d5aa7 100644 --- a/arrow-array/src/builder/null_buffer_builder.rs +++ b/arrow-array/src/builder/null_buffer_builder.rs @@ -16,7 +16,7 @@ // under the License. use crate::builder::BooleanBufferBuilder; -use arrow_buffer::Buffer; +use arrow_buffer::{Buffer, MutableBuffer}; /// Builder for creating the null bit buffer. /// This builder only materializes the buffer when we append `false`. @@ -42,6 +42,29 @@ impl NullBufferBuilder { } } + /// Creates a new builder with given length. + pub fn new_with_len(len: usize) -> Self { + Self { + bitmap_builder: None, + len, + capacity: len, + } + } + + /// Creates a new builder from a `MutableBuffer`. + pub fn new_from_buffer(buffer: MutableBuffer, len: usize) -> Self { + let capacity = buffer.len() * 8; + + assert!(len < capacity); + + let bitmap_builder = Some(BooleanBufferBuilder::new_from_buffer(buffer, len)); + Self { + bitmap_builder, + len, + capacity, + } + } + /// Appends `n` `true`s into the builder /// to indicate that these `n` items are not nulls. 
#[inline] diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index ed3594c60df9..55d8bac0189f 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -19,6 +19,7 @@ use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BufferBuilder}; use crate::types::*; use crate::{ArrayRef, ArrowPrimitiveType, PrimitiveArray}; +use arrow_buffer::MutableBuffer; use arrow_data::ArrayData; use std::any::Any; use std::sync::Arc; @@ -114,6 +115,24 @@ impl PrimitiveBuilder { } } + pub fn new_from_buffer( + values_buffer: MutableBuffer, + null_buffer: Option, + ) -> Self { + let values_builder = BufferBuilder::::new_from_buffer(values_buffer); + + let null_buffer_builder = null_buffer + .map(|buffer| { + NullBufferBuilder::new_from_buffer(buffer, values_builder.len()) + }) + .unwrap_or_else(|| NullBufferBuilder::new_with_len(values_builder.len())); + + Self { + values_builder, + null_buffer_builder, + } + } + /// Returns the capacity of this builder measured in slots of type `T` pub fn capacity(&self) -> usize { self.values_builder.capacity() @@ -204,6 +223,11 @@ impl PrimitiveBuilder { pub fn values_slice(&self) -> &[T::Native] { self.values_builder.as_slice() } + + /// Returns the current values buffer as a mutable slice + pub fn values_slice_mut(&mut self) -> &mut [T::Native] { + self.values_builder.as_slice_mut() + } } #[cfg(test)] diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 94bc98678a61..d5d7cd8ef8c7 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -227,6 +227,25 @@ impl Buffer { pub fn count_set_bits_offset(&self, offset: usize, len: usize) -> usize { UnalignedBitChunk::new(self.as_slice(), offset, len).count_ones() } + + /// Returns `MutableBuffer` for mutating the buffer if this buffer is not shared. + /// Returns `Err` if this is shared or its allocation is from an external source. + pub fn into_mutable(self) -> Result { + let offset_ptr = self.as_ptr(); + let offset = self.offset; + let length = self.length; + Arc::try_unwrap(self.data) + .and_then(|bytes| { + // The pointer of underlying buffer should not be offset. + assert_eq!(offset_ptr, bytes.ptr().as_ptr()); + MutableBuffer::from_bytes(bytes).map_err(Arc::new) + }) + .map_err(|bytes| Buffer { + data: bytes, + offset, + length, + }) + } } /// Creating a `Buffer` instance by copying the memory from a `AsRef<[u8]>` into a newly diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index bd139466ae92..b70a74e84249 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -23,6 +23,7 @@ use crate::{ native::{ArrowNativeType, ToByteSlice}, util::bit_util, }; +use std::mem; use std::ptr::NonNull; /// A [`MutableBuffer`] is Arrow's interface to build a [`Buffer`] out of items or slices of items. @@ -92,6 +93,24 @@ impl MutableBuffer { } } + /// Allocates a new [MutableBuffer] from given `Bytes`. + pub(crate) fn from_bytes(bytes: Bytes) -> Result { + if !matches!(bytes.deallocation(), Deallocation::Arrow(_)) { + return Err(bytes); + } + + let len = bytes.len(); + let capacity = bytes.capacity(); + let ptr = bytes.ptr(); + mem::forget(bytes); + + Ok(Self { + data: ptr, + len, + capacity, + }) + } + /// creates a new [MutableBuffer] with capacity and length capable of holding `len` bits. /// This is useful to create a buffer for packed bitmaps. 
pub fn new_null(len: usize) -> Self { diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs index 20bf5a474b47..fea04ad0d50b 100644 --- a/arrow-buffer/src/bytes.rs +++ b/arrow-buffer/src/bytes.rs @@ -99,6 +99,11 @@ impl Bytes { Deallocation::Custom(_) => 0, } } + + #[inline] + pub(crate) fn deallocation(&self) -> &Deallocation { + &self.deallocation + } } // Deallocation is Send + Sync, repeating the bound here makes that refactoring safe From 475e079170e3b9a62d3e6a01cdd55f68cd91e4db Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Sun, 20 Nov 2022 15:16:48 +0800 Subject: [PATCH 0286/1411] comparison: decimal array with scalar (#3141) --- arrow/src/compute/kernels/comparison.rs | 138 ++++++++++++++++++------ 1 file changed, 105 insertions(+), 33 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 4566b4969295..6438acc3b117 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -1223,6 +1223,11 @@ macro_rules! dyn_compare_scalar { let left = as_primitive_array::($LEFT); $OP::(left, right) } + DataType::Decimal128(_, _) => { + let right = try_to_type!($RIGHT, to_i128)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } _ => Err(ArrowError::ComputeError(format!( "Unsupported data type {:?} for comparison {} with {:?}", $LEFT.data_type(), @@ -3562,7 +3567,7 @@ mod tests { vec![Some(true), Some(false), Some(false), Some(true), Some(true), None] .into(); let b: BooleanArray = - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] + vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] .into(); let res: Vec> = eq_bool(&a, &b).unwrap().iter().collect(); @@ -4976,6 +4981,7 @@ mod tests { ) ); } + #[test] fn test_lt_eq_dyn_scalar_with_dict() { let mut builder = @@ -5265,6 +5271,7 @@ mod tests { ) ); } + #[test] fn test_lt_dyn_utf8_scalar() { let array = StringArray::from(vec!["abc", "def", "xyz"]); @@ -5274,6 +5281,7 @@ mod tests { BooleanArray::from(vec![Some(true), Some(true), Some(false)]) ); } + #[test] fn test_lt_dyn_utf8_scalar_with_dict() { let mut builder = StringDictionaryBuilder::::new(); @@ -5301,6 +5309,7 @@ mod tests { BooleanArray::from(vec![Some(true), Some(true), Some(false)]) ); } + #[test] fn test_lt_eq_dyn_utf8_scalar_with_dict() { let mut builder = StringDictionaryBuilder::::new(); @@ -5328,6 +5337,7 @@ mod tests { BooleanArray::from(vec![Some(false), Some(true), Some(true)]) ); } + #[test] fn test_gt_eq_dyn_utf8_scalar_with_dict() { let mut builder = StringDictionaryBuilder::::new(); @@ -5383,6 +5393,7 @@ mod tests { BooleanArray::from(vec![Some(true), Some(true), Some(false)]) ); } + #[test] fn test_neq_dyn_utf8_scalar_with_dict() { let mut builder = StringDictionaryBuilder::::new(); @@ -5883,16 +5894,16 @@ mod tests { .collect(); let expected = BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], - ); + vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], + ); assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); #[cfg(not(feature = "simd"))] assert_eq!(lt(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(false), Some(false)], - ); + vec![Some(true), Some(true), Some(true), Some(true), Some(false), Some(false)], + ); assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); #[cfg(not(feature = "simd"))] @@ -5908,16 +5919,16 @@ mod tests { 
.collect(); let expected = BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], - ); + vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], + ); assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); #[cfg(not(feature = "simd"))] assert_eq!(lt(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(false), Some(false)], - ); + vec![Some(true), Some(true), Some(true), Some(true), Some(false), Some(false)], + ); assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); #[cfg(not(feature = "simd"))] @@ -5936,16 +5947,16 @@ mod tests { .collect(); let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], - ); + vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], + ); assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); #[cfg(not(feature = "simd"))] assert_eq!(gt(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), Some(true), Some(true)], - ); + vec![Some(true), Some(false), Some(true), Some(false), Some(true), Some(true)], + ); assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); #[cfg(not(feature = "simd"))] @@ -5961,16 +5972,16 @@ mod tests { .collect(); let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], - ); + vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], + ); assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); #[cfg(not(feature = "simd"))] assert_eq!(gt(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), Some(true), Some(true)], - ); + vec![Some(true), Some(false), Some(true), Some(false), Some(true), Some(true)], + ); assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); #[cfg(not(feature = "simd"))] @@ -6556,13 +6567,13 @@ mod tests { let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); let expected = BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], - ); + vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], + ); assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(false), Some(false)], - ); + vec![Some(true), Some(true), Some(true), Some(true), Some(false), Some(false)], + ); assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); let array1: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 11.0, f64::NAN] @@ -6574,13 +6585,13 @@ mod tests { let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); let expected = BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], - ); + vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], + ); assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(false), Some(false)], - ); + vec![Some(true), Some(true), Some(true), Some(true), Some(false), Some(false)], + ); assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); } @@ -6596,13 +6607,13 @@ mod tests { let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); let expected = BooleanArray::from( - 
vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], - ); + vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], + ); assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), Some(true), Some(true)], - ); + vec![Some(true), Some(false), Some(true), Some(false), Some(true), Some(true)], + ); assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); let array1: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 11.0, f64::NAN] @@ -6614,13 +6625,13 @@ mod tests { let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], - ); + vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], + ); assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), Some(true), Some(true)], - ); + vec![Some(true), Some(false), Some(true), Some(false), Some(true), Some(true)], + ); assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); } @@ -6915,6 +6926,67 @@ mod tests { assert_eq!(e, r); } + #[test] + fn test_decimal128_scalar() { + let a = Decimal128Array::from( + vec![Some(1), Some(2), Some(3), None, Some(4), Some(5)], + ); + let b = 3_i128; + // array eq scalar + let e = BooleanArray::from( + vec![Some(false), Some(false), Some(true), None, Some(false), Some(false)], + ); + let r = eq_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = eq_dyn_scalar(&a, b).unwrap(); + assert_eq!(e, r); + + // array neq scalar + let e = BooleanArray::from( + vec![Some(true), Some(true), Some(false), None, Some(true), Some(true)], + ); + let r = neq_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = neq_dyn_scalar(&a, b).unwrap(); + assert_eq!(e, r); + + // array lt scalar + let e = BooleanArray::from( + vec![Some(true), Some(true), Some(false), None, Some(false), Some(false)], + ); + let r = lt_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = lt_dyn_scalar(&a, b).unwrap(); + assert_eq!(e, r); + + // array lt_eq scalar + let e = BooleanArray::from( + vec![Some(true), Some(true), Some(true), None, Some(false), Some(false)], + ); + let r = lt_eq_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = lt_eq_dyn_scalar(&a, b).unwrap(); + assert_eq!(e, r); + + // array gt scalar + let e = BooleanArray::from( + vec![Some(false), Some(false), Some(false), None, Some(true), Some(true)], + ); + let r = gt_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = gt_dyn_scalar(&a, b).unwrap(); + assert_eq!(e, r); + + // array gt_eq scalar + let e = BooleanArray::from( + vec![Some(false), Some(false), Some(true), None, Some(true), Some(true)], + ); + let r = gt_eq_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = gt_eq_dyn_scalar(&a, b).unwrap(); + assert_eq!(e, r); + } + #[test] fn test_decimal256() { let a = Decimal256Array::from_iter_values( From e1b5657eb1206ce67eb079f6e72615982a70480a Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Mon, 21 Nov 2022 22:15:38 +0800 Subject: [PATCH 0287/1411] use chrono add/sub months (#3132) * use cargo add/sub months * update all chrono versions * clippy --- arrow-array/Cargo.toml | 2 +- arrow-array/src/delta.rs | 82 ++++++---------------------------- arrow-cast/Cargo.toml | 2 +- arrow-csv/Cargo.toml | 2 +- arrow-json/Cargo.toml | 2 +- arrow/Cargo.toml | 4 +- object_store/Cargo.toml | 2 +- parquet/Cargo.toml 
| 2 +- parquet_derive_test/Cargo.toml | 2 +- 9 files changed, 23 insertions(+), 77 deletions(-) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 816843d31ab7..d0c556a00674 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -48,7 +48,7 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } arrow-schema = { version = "27.0.0", path = "../arrow-schema" } arrow-data = { version = "27.0.0", path = "../arrow-data" } -chrono = { version = "0.4", default-features = false, features = ["clock"] } +chrono = { version = "0.4.23", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } diff --git a/arrow-array/src/delta.rs b/arrow-array/src/delta.rs index b9b7a11e2d44..029168242b90 100644 --- a/arrow-array/src/delta.rs +++ b/arrow-array/src/delta.rs @@ -23,86 +23,32 @@ // Copied from chronoutil crate //! Contains utility functions for shifting Date objects. -use chrono::Datelike; - -/// Returns true if the year is a leap-year, as naively defined in the Gregorian calendar. -#[inline] -pub(crate) fn is_leap_year(year: i32) -> bool { - year % 4 == 0 && (year % 100 != 0 || year % 400 == 0) -} - -// If the day lies within the month, this function has no effect. Otherwise, it shifts -// day backwards to the final day of the month. -// XXX: No attempt is made to handle days outside the 1-31 range. -#[inline] -fn normalise_day(year: i32, month: u32, day: u32) -> u32 { - if day <= 28 { - day - } else if month == 2 { - 28 + is_leap_year(year) as u32 - } else if day == 31 && (month == 4 || month == 6 || month == 9 || month == 11) { - 30 - } else { - day - } -} +use chrono::{Datelike, Months}; +use std::cmp::Ordering; /// Shift a date by the given number of months. -/// Ambiguous month-ends are shifted backwards as necessary. 
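// The replacement below delegates month-end handling to chrono's `Months` type. A small
// sketch of the behaviour being preserved (assumes chrono >= 0.4.23, whose
// `Add<Months>`/`Sub<Months>` impls clamp to the last valid day of the target month):
//
//   use chrono::{Months, NaiveDate};
//
//   let d = NaiveDate::from_ymd_opt(2020, 1, 31).unwrap();
//   assert_eq!(d + Months::new(1), NaiveDate::from_ymd_opt(2020, 2, 29).unwrap());
//   assert_eq!(d - Months::new(2), NaiveDate::from_ymd_opt(2019, 11, 30).unwrap());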
-pub(crate) fn shift_months(date: D, months: i32) -> D { - let mut year = date.year() + (date.month() as i32 + months) / 12; - let mut month = (date.month() as i32 + months) % 12; - let mut day = date.day(); - - if month < 1 { - year -= 1; - month += 12; - } - - day = normalise_day(year, month as u32, day); - - // This is slow but guaranteed to succeed (short of interger overflow) - if day <= 28 { - date.with_day(day) - .unwrap() - .with_month(month as u32) - .unwrap() - .with_year(year) - .unwrap() - } else { - date.with_day(1) - .unwrap() - .with_month(month as u32) - .unwrap() - .with_year(year) - .unwrap() - .with_day(day) - .unwrap() +pub(crate) fn shift_months< + D: Datelike + + std::ops::Add + + std::ops::Sub, +>( + date: D, + months: i32, +) -> D { + match months.cmp(&0) { + Ordering::Equal => date, + Ordering::Greater => date + Months::new(months as u32), + Ordering::Less => date - Months::new(-months as u32), } } #[cfg(test)] mod tests { - use std::collections::HashSet; use chrono::naive::{NaiveDate, NaiveDateTime, NaiveTime}; use super::*; - #[test] - fn test_leap_year_cases() { - let _leap_years: Vec = vec![ - 1904, 1908, 1912, 1916, 1920, 1924, 1928, 1932, 1936, 1940, 1944, 1948, 1952, - 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1996, 2000, 2004, - 2008, 2012, 2016, 2020, - ]; - let leap_years_1900_to_2020: HashSet = _leap_years.into_iter().collect(); - - for year in 1900..2021 { - assert_eq!(is_leap_year(year), leap_years_1900_to_2020.contains(&year)) - } - } - #[test] fn test_shift_months() { let base = NaiveDate::from_ymd_opt(2020, 1, 31).unwrap(); diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index fe3f5e257668..5f52a3283f97 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -43,7 +43,7 @@ arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } arrow-data = { version = "27.0.0", path = "../arrow-data" } arrow-schema = { version = "27.0.0", path = "../arrow-schema" } arrow-select = { version = "27.0.0", path = "../arrow-select" } -chrono = { version = "0.4", default-features = false, features = ["clock"] } +chrono = { version = "0.4.23", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 81c97c68484e..fc4c177bd043 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -43,7 +43,7 @@ arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } arrow-cast = { version = "27.0.0", path = "../arrow-cast" } arrow-data = { version = "27.0.0", path = "../arrow-data" } arrow-schema = { version = "27.0.0", path = "../arrow-schema" } -chrono = { version = "0.4", default-features = false, features = ["clock"] } +chrono = { version = "0.4.23", default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } lazy_static = { version = "1.4", default-features = false } lexical-core = { version = "^0.8", default-features = false } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index dd7064946b57..3454b4c1dbe5 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -47,7 +47,7 @@ half = { version = "2.1", default-features = false } indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } serde_json = { 
version = "1.0", default-features = false, features = ["std"] } -chrono = { version = "0.4", default-features = false, features = ["clock"] } +chrono = { version = "0.4.23", default-features = false, features = ["clock"] } [dev-dependencies] tempfile = "3.3" diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 2acad2c17bde..ab8963b9c300 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -60,7 +60,7 @@ hashbrown = { version = "0.13", default-features = false } regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } -chrono = { version = "0.4", default-features = false, features = ["clock"] } +chrono = { version = "0.4.23", default-features = false, features = ["clock"] } comfy-table = { version = "6.0", optional = true, default-features = false } pyo3 = { version = "0.17", default-features = false, optional = true } multiversion = { version = "0.6.1", default-features = false } @@ -98,7 +98,7 @@ dyn_arith_dict = [] chrono-tz = ["arrow-array/chrono-tz"] [dev-dependencies] -chrono = { version = "0.4", default-features = false, features = ["clock"] } +chrono = { version = "0.4.23", default-features = false, features = ["clock"] } criterion = { version = "0.4", default-features = false } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } tempfile = { version = "3", default-features = false } diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index fc80cb5774c7..fd7442f9e84a 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -31,7 +31,7 @@ all-features = true [dependencies] # In alphabetical order async-trait = "0.1.53" bytes = "1.0" -chrono = { version = "0.4", default-features = false, features = ["clock"] } +chrono = { version = "0.4.23", default-features = false, features = ["clock"] } futures = "0.3" itertools = "0.10.1" parking_lot = { version = "0.12" } diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index fc7c8218ad02..515da585ec26 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -47,7 +47,7 @@ brotli = { version = "3.3", default-features = false, features = ["std"], option flate2 = { version = "1.0", default-features = false, features = ["rust_backend"], optional = true } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.11.1", optional = true, default-features = false } -chrono = { version = "0.4", default-features = false, features = ["alloc"] } +chrono = { version = "0.4.23", default-features = false, features = ["alloc"] } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } base64 = { version = "0.13", default-features = false, features = ["std"], optional = true } diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index a10d34e86892..047e0196c704 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -31,4 +31,4 @@ rust-version = "1.62" [dependencies] parquet = { path = "../parquet", version = "27.0.0", default-features = false } parquet_derive = { path = "../parquet_derive", version = "27.0.0", default-features = false } -chrono = { version="0.4.19", default-features = false, features = [ "clock" ] } +chrono = { version="0.4.23", default-features = false, features = [ "clock" ] } From 
6f8187fb34effc79b20a03cb1d0d164448fb5ba8 Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Mon, 21 Nov 2022 21:22:14 +0530 Subject: [PATCH 0288/1411] Fix Panic on Reading Corrupt Parquet Schema (#2855) (#3130) * Adding len checks * Adding test case --- parquet/src/arrow/schema.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs index 395c4aac1500..07afccdb20bf 100644 --- a/parquet/src/arrow/schema.rs +++ b/parquet/src/arrow/schema.rs @@ -26,8 +26,8 @@ use std::collections::HashMap; use std::sync::Arc; -use arrow_schema::{DataType, Field, Schema, TimeUnit}; use arrow_ipc::writer; +use arrow_schema::{DataType, Field, Schema, TimeUnit}; use crate::basic::{ ConvertedType, LogicalType, Repetition, TimeUnit as ParquetTimeUnit, @@ -103,7 +103,7 @@ fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Result { let decoded = base64::decode(encoded_meta); match decoded { Ok(bytes) => { - let slice = if bytes[0..4] == [255u8; 4] { + let slice = if bytes.len() > 8 && bytes[0..4] == [255u8; 4] { &bytes[8..] } else { bytes.as_slice() @@ -1769,4 +1769,9 @@ mod tests { assert_eq!(&schema, read_schema.as_ref()); Ok(()) } + + #[test] + fn test_get_arrow_schema_from_metadata() { + assert!(get_arrow_schema_from_metadata("").is_err()); + } } From 57f91f2e82184d6e04aab296448d1fe7352a7822 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Mon, 21 Nov 2022 16:59:14 +0000 Subject: [PATCH 0289/1411] refactor: convert `Field::metadata` to `HashMap` (#3148) * refactor: convert `Field::metadata` to `HashMap` Closes #2262. * refactor: code formatting Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-integration-test/src/field.rs | 8 +-- arrow-ipc/src/convert.rs | 6 +- arrow-schema/src/datatype.rs | 6 +- arrow-schema/src/field.rs | 85 +++++++++++++++++++++++------ arrow-schema/src/schema.rs | 23 ++++---- parquet/src/arrow/schema/complex.rs | 4 +- 6 files changed, 90 insertions(+), 42 deletions(-) diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index 5b5863557098..4bfbf8e99129 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -18,7 +18,7 @@ use crate::{data_type_from_json, data_type_to_json}; use arrow::datatypes::{DataType, Field}; use arrow::error::{ArrowError, Result}; -use std::collections::BTreeMap; +use std::collections::HashMap; /// Parse a `Field` definition from a JSON representation. pub fn field_from_json(json: &serde_json::Value) -> Result { @@ -53,7 +53,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz let metadata = match map.get("metadata") { Some(&Value::Array(ref values)) => { - let mut res: BTreeMap = BTreeMap::default(); + let mut res: HashMap = HashMap::default(); for value in values { match value.as_object() { Some(map) => { @@ -92,7 +92,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { // We also support map format, because Schema's metadata supports this. 
// See https://github.com/apache/arrow/pull/5907 Some(&Value::Object(ref values)) => { - let mut res: BTreeMap = BTreeMap::default(); + let mut res: HashMap = HashMap::default(); for (k, v) in values { if let Some(str_value) = v.as_str() { res.insert(k.clone(), str_value.to_string().clone()); @@ -110,7 +110,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { "Field `metadata` is not json array".to_string(), )); } - _ => BTreeMap::default(), + _ => HashMap::default(), }; // if data_type is a struct or list, get its children diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index a9dda6f2a1f1..e11d64a473d4 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -21,7 +21,7 @@ use arrow_schema::*; use flatbuffers::{ FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset, }; -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use crate::{size_prefixed_root_as_message, CONTINUATION_MARKER}; use DataType::*; @@ -86,7 +86,7 @@ impl<'a> From> for Field { ) }; - let mut metadata_map = BTreeMap::default(); + let mut metadata_map = HashMap::default(); if let Some(list) = field.custom_metadata() { for kv in list { if let (Some(k), Some(v)) = (kv.key(), kv.value()) { @@ -812,7 +812,7 @@ mod tests { .iter() .cloned() .collect(); - let field_md: BTreeMap = [("k".to_string(), "v".to_string())] + let field_md: HashMap = [("k".to_string(), "v".to_string())] .iter() .cloned() .collect(); diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 90ae429422c6..acf3691450ae 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -381,10 +381,10 @@ mod tests { #[test] #[cfg(feature = "serde")] fn serde_struct_type() { - use std::collections::BTreeMap; + use std::collections::HashMap; let kv_array = [("k".to_string(), "v".to_string())]; - let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); + let field_metadata: HashMap = kv_array.iter().cloned().collect(); // Non-empty map: should be converted as JSON obj { ... } let first_name = @@ -392,7 +392,7 @@ mod tests { // Empty map: should be omitted. let last_name = Field::new("last_name", DataType::Utf8, false) - .with_metadata(BTreeMap::default()); + .with_metadata(HashMap::default()); let person = DataType::Struct(vec![ first_name, diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index b1de65e557ff..cd9024747b41 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -17,7 +17,7 @@ use crate::error::ArrowError; use std::cmp::Ordering; -use std::collections::BTreeMap; +use std::collections::HashMap; use std::hash::{Hash, Hasher}; use crate::datatype::DataType; @@ -37,9 +37,9 @@ pub struct Field { /// A map of key-value pairs containing additional custom meta data. 
#[cfg_attr( feature = "serde", - serde(skip_serializing_if = "BTreeMap::is_empty", default) + serde(skip_serializing_if = "HashMap::is_empty", default) )] - metadata: BTreeMap, + metadata: HashMap, } // Auto-derive `PartialEq` traits will pull `dict_id` and `dict_is_ordered` @@ -68,9 +68,33 @@ impl Ord for Field { fn cmp(&self, other: &Self) -> Ordering { self.name .cmp(other.name()) - .then(self.data_type.cmp(other.data_type())) - .then(self.nullable.cmp(&other.nullable)) - .then(self.metadata.cmp(&other.metadata)) + .then_with(|| self.data_type.cmp(other.data_type())) + .then_with(|| self.nullable.cmp(&other.nullable)) + .then_with(|| { + // ensure deterministic key order + let mut keys: Vec<&String> = + self.metadata.keys().chain(other.metadata.keys()).collect(); + keys.sort(); + for k in keys { + match (self.metadata.get(k), other.metadata.get(k)) { + (None, None) => {} + (Some(_), None) => { + return Ordering::Less; + } + (None, Some(_)) => { + return Ordering::Greater; + } + (Some(v1), Some(v2)) => match v1.cmp(v2) { + Ordering::Equal => {} + other => { + return other; + } + }, + } + } + + Ordering::Equal + }) } } @@ -79,7 +103,14 @@ impl Hash for Field { self.name.hash(state); self.data_type.hash(state); self.nullable.hash(state); - self.metadata.hash(state); + + // ensure deterministic key order + let mut keys: Vec<&String> = self.metadata.keys().collect(); + keys.sort(); + for k in keys { + k.hash(state); + self.metadata.get(k).expect("key valid").hash(state); + } } } @@ -92,7 +123,7 @@ impl Field { nullable, dict_id: 0, dict_is_ordered: false, - metadata: BTreeMap::default(), + metadata: HashMap::default(), } } @@ -110,29 +141,29 @@ impl Field { nullable, dict_id, dict_is_ordered, - metadata: BTreeMap::default(), + metadata: HashMap::default(), } } /// Sets the `Field`'s optional custom metadata. /// The metadata is set as `None` for empty map. #[inline] - pub fn set_metadata(&mut self, metadata: BTreeMap) { - self.metadata = BTreeMap::default(); + pub fn set_metadata(&mut self, metadata: HashMap) { + self.metadata = HashMap::default(); if !metadata.is_empty() { self.metadata = metadata; } } /// Sets the metadata of this `Field` to be `metadata` and returns self - pub fn with_metadata(mut self, metadata: BTreeMap) -> Self { + pub fn with_metadata(mut self, metadata: HashMap) -> Self { self.set_metadata(metadata); self } /// Returns the immutable reference to the `Field`'s optional custom metadata. 
#[inline] - pub const fn metadata(&self) -> &BTreeMap { + pub const fn metadata(&self) -> &HashMap { &self.metadata } @@ -545,10 +576,30 @@ mod test { assert_ne!(get_field_hash(&dict1), get_field_hash(&dict2)); } + #[test] + fn test_field_comparison_metadata() { + let f1 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([ + (String::from("k1"), String::from("v1")), + (String::from("k2"), String::from("v2")), + ])); + let f2 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([ + (String::from("k1"), String::from("v1")), + (String::from("k3"), String::from("v3")), + ])); + let f3 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([ + (String::from("k1"), String::from("v1")), + (String::from("k3"), String::from("v4")), + ])); + + assert!(f1.cmp(&f2).is_lt()); + assert!(f2.cmp(&f3).is_lt()); + assert!(f1.cmp(&f3).is_lt()); + } + #[test] fn test_contains_reflexivity() { let mut field = Field::new("field1", DataType::Float16, false); - field.set_metadata(BTreeMap::from([ + field.set_metadata(HashMap::from([ (String::from("k0"), String::from("v0")), (String::from("k1"), String::from("v1")), ])); @@ -560,14 +611,14 @@ mod test { let child_field = Field::new("child1", DataType::Float16, false); let mut field1 = Field::new("field1", DataType::Struct(vec![child_field]), false); - field1.set_metadata(BTreeMap::from([(String::from("k1"), String::from("v1"))])); + field1.set_metadata(HashMap::from([(String::from("k1"), String::from("v1"))])); let mut field2 = Field::new("field1", DataType::Struct(vec![]), true); - field2.set_metadata(BTreeMap::from([(String::from("k2"), String::from("v2"))])); + field2.set_metadata(HashMap::from([(String::from("k2"), String::from("v2"))])); field2.try_merge(&field1).unwrap(); let mut field3 = Field::new("field1", DataType::Struct(vec![]), false); - field3.set_metadata(BTreeMap::from([(String::from("k3"), String::from("v3"))])); + field3.set_metadata(HashMap::from([(String::from("k3"), String::from("v3"))])); field3.try_merge(&field2).unwrap(); assert!(field2.contains(&field1)); diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 519a8e089aef..8ff40866d518 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -290,7 +290,6 @@ mod tests { use super::*; use crate::datatype::DataType; use crate::{TimeUnit, UnionMode}; - use std::collections::BTreeMap; #[test] #[cfg(feature = "serde")] @@ -523,7 +522,7 @@ mod tests { fn person_schema() -> Schema { let kv_array = [("k".to_string(), "v".to_string())]; - let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); + let field_metadata: HashMap = kv_array.iter().cloned().collect(); let first_name = Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata); @@ -551,18 +550,16 @@ mod tests { #[test] fn test_try_merge_field_with_metadata() { // 1. Different values for the same key should cause error. 
- let metadata1: BTreeMap = - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect(); + let metadata1: HashMap = [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect(); let f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata1); - let metadata2: BTreeMap = - [("foo".to_string(), "baz".to_string())] - .iter() - .cloned() - .collect(); + let metadata2: HashMap = [("foo".to_string(), "baz".to_string())] + .iter() + .cloned() + .collect(); let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata2); assert!( @@ -572,7 +569,7 @@ mod tests { // 2. None + Some let mut f1 = Field::new("first_name", DataType::Utf8, false); - let metadata2: BTreeMap = + let metadata2: HashMap = [("missing".to_string(), "value".to_string())] .iter() .cloned() diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 4ff9c7a39566..70cee9ef9ab4 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::collections::BTreeMap; +use std::collections::HashMap; use crate::arrow::schema::primitive::convert_primitive; use crate::arrow::ProjectionMask; @@ -347,7 +347,7 @@ impl Visitor { let value_field = convert_field(map_value, &value, arrow_value); let field_metadata = match arrow_map { Some(field) => field.metadata().clone(), - _ => BTreeMap::default(), + _ => HashMap::default(), }; let map_field = Field::new( From 4c06f48de6b835be68f45dffa4b58510cd07bb78 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 21 Nov 2022 18:05:36 +0000 Subject: [PATCH 0290/1411] Implement Neg for i256 (#3151) --- arrow-buffer/src/bigint.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index be02c2857db1..23400b4a3f6e 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -465,6 +465,20 @@ derive_op!(Mul, mul, wrapping_mul, checked_mul); derive_op!(Div, div, wrapping_div, checked_div); derive_op!(Rem, rem, wrapping_rem, checked_rem); +impl std::ops::Neg for i256 { + type Output = i256; + + #[cfg(debug_assertions)] + fn neg(self) -> Self::Output { + self.checked_neg().expect("i256 overflow") + } + + #[cfg(not(debug_assertions))] + fn neg(self) -> Self::Output { + self.wrapping_neg() + } +} + macro_rules! 
define_as_primitive { ($native_ty:ty) => { impl AsPrimitive for $native_ty { From b3dbe7011ace86c7ab46b1a15388f128bffb5f6d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 21 Nov 2022 19:03:35 +0000 Subject: [PATCH 0291/1411] Add GenericByteBuilder (#2969) (#3122) * Add GenericByteBuilder (#2969) * RAT * Apply suggestions from code review Co-authored-by: Liang-Chi Hsieh Co-authored-by: Liang-Chi Hsieh --- ...ry_builder.rs => generic_bytes_builder.rs} | 171 +++++++++++---- .../src/builder/generic_string_builder.rs | 204 ------------------ arrow-array/src/builder/mod.rs | 6 +- 3 files changed, 135 insertions(+), 246 deletions(-) rename arrow-array/src/builder/{generic_binary_builder.rs => generic_bytes_builder.rs} (58%) delete mode 100644 arrow-array/src/builder/generic_string_builder.rs diff --git a/arrow-array/src/builder/generic_binary_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs similarity index 58% rename from arrow-array/src/builder/generic_binary_builder.rs rename to arrow-array/src/builder/generic_bytes_builder.rs index c806bebf9a0b..fa0a31ad79e1 100644 --- a/arrow-array/src/builder/generic_binary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -17,34 +17,35 @@ use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder}; -use crate::{ArrayRef, GenericBinaryArray, OffsetSizeTrait}; +use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType}; +use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait}; +use arrow_buffer::ArrowNativeType; use arrow_data::ArrayDataBuilder; use std::any::Any; use std::sync::Arc; -/// Array builder for [`GenericBinaryArray`] -#[derive(Debug)] -pub struct GenericBinaryBuilder { +/// Array builder for [`GenericByteArray`] +pub struct GenericByteBuilder { value_builder: UInt8BufferBuilder, - offsets_builder: BufferBuilder, + offsets_builder: BufferBuilder, null_buffer_builder: NullBufferBuilder, } -impl GenericBinaryBuilder { - /// Creates a new [`GenericBinaryBuilder`]. +impl GenericByteBuilder { + /// Creates a new [`GenericByteBuilder`]. pub fn new() -> Self { Self::with_capacity(1024, 1024) } - /// Creates a new [`GenericBinaryBuilder`]. + /// Creates a new [`GenericByteBuilder`]. /// /// - `item_capacity` is the number of items to pre-allocate. /// The size of the preallocated buffer of offsets is the number of items plus one. - /// - `data_capacity` is the total number of bytes of string data to pre-allocate + /// - `data_capacity` is the total number of bytes of data to pre-allocate /// (for all items, not per item). pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { - let mut offsets_builder = BufferBuilder::::new(item_capacity + 1); - offsets_builder.append(OffsetSize::zero()); + let mut offsets_builder = BufferBuilder::::new(item_capacity + 1); + offsets_builder.append(T::Offset::from_usize(0).unwrap()); Self { value_builder: UInt8BufferBuilder::new(data_capacity), offsets_builder, @@ -52,13 +53,22 @@ impl GenericBinaryBuilder { } } - /// Appends a byte slice into the builder. + /// Appends a value into the builder. 
#[inline] - pub fn append_value(&mut self, value: impl AsRef<[u8]>) { - self.value_builder.append_slice(value.as_ref()); + pub fn append_value(&mut self, value: impl AsRef) { + self.value_builder.append_slice(value.as_ref().as_ref()); self.null_buffer_builder.append(true); self.offsets_builder - .append(OffsetSize::from_usize(self.value_builder.len()).unwrap()); + .append(T::Offset::from_usize(self.value_builder.len()).unwrap()); + } + + /// Append an `Option` value into the builder. + #[inline] + pub fn append_option(&mut self, value: Option>) { + match value { + None => self.append_null(), + Some(v) => self.append_value(v), + }; } /// Append a null value into the builder. @@ -66,21 +76,22 @@ impl GenericBinaryBuilder { pub fn append_null(&mut self) { self.null_buffer_builder.append(false); self.offsets_builder - .append(OffsetSize::from_usize(self.value_builder.len()).unwrap()); + .append(T::Offset::from_usize(self.value_builder.len()).unwrap()); } - /// Builds the [`GenericBinaryArray`] and reset this builder. - pub fn finish(&mut self) -> GenericBinaryArray { - let array_type = GenericBinaryArray::::DATA_TYPE; + /// Builds the [`GenericByteArray`] and reset this builder. + pub fn finish(&mut self) -> GenericByteArray { + let array_type = T::DATA_TYPE; let array_builder = ArrayDataBuilder::new(array_type) .len(self.len()) .add_buffer(self.offsets_builder.finish()) .add_buffer(self.value_builder.finish()) .null_bit_buffer(self.null_buffer_builder.finish()); - self.offsets_builder.append(OffsetSize::zero()); + self.offsets_builder + .append(T::Offset::from_usize(0).unwrap()); let array_data = unsafe { array_builder.build_unchecked() }; - GenericBinaryArray::::from(array_data) + GenericByteArray::from(array_data) } /// Returns the current values buffer as a slice @@ -89,18 +100,44 @@ impl GenericBinaryBuilder { } /// Returns the current offsets buffer as a slice - pub fn offsets_slice(&self) -> &[OffsetSize] { + pub fn offsets_slice(&self) -> &[T::Offset] { self.offsets_builder.as_slice() } } -impl Default for GenericBinaryBuilder { +impl std::fmt::Debug for GenericByteBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?; + f.debug_struct("") + .field("value_builder", &self.value_builder) + .field("offsets_builder", &self.offsets_builder) + .field("null_buffer_builder", &self.null_buffer_builder) + .finish() + } +} + +impl Default for GenericByteBuilder { fn default() -> Self { Self::new() } } -impl ArrayBuilder for GenericBinaryBuilder { +impl ArrayBuilder for GenericByteBuilder { + /// Returns the number of binary slots in the builder + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + /// Returns whether the number of binary slots is zero + fn is_empty(&self) -> bool { + self.null_buffer_builder.is_empty() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + /// Returns the builder as a non-mutable `Any` reference. 
fn as_any(&self) -> &dyn Any { self @@ -115,27 +152,19 @@ impl ArrayBuilder for GenericBinaryBuilder) -> Box { self } +} - /// Returns the number of binary slots in the builder - fn len(&self) -> usize { - self.null_buffer_builder.len() - } - - /// Returns whether the number of binary slots is zero - fn is_empty(&self) -> bool { - self.null_buffer_builder.is_empty() - } +/// Array builder for [`GenericStringArray`][crate::GenericStringArray] +pub type GenericStringBuilder = GenericByteBuilder>; - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} +/// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray] +pub type GenericBinaryBuilder = GenericByteBuilder>; #[cfg(test)] mod tests { use super::*; use crate::array::{Array, OffsetSizeTrait}; + use crate::GenericStringArray; fn _test_generic_binary_builder() { let mut builder = GenericBinaryBuilder::::new(); @@ -230,4 +259,70 @@ mod tests { fn test_large_binary_builder_reset() { _test_generic_binary_builder_reset::() } + + fn _test_generic_string_array_builder() { + let mut builder = GenericStringBuilder::::new(); + let owned = "arrow".to_owned(); + + builder.append_value("hello"); + builder.append_value(""); + builder.append_value(&owned); + builder.append_null(); + builder.append_option(Some("rust")); + builder.append_option(None::<&str>); + builder.append_option(None::); + assert_eq!(7, builder.len()); + + assert_eq!( + GenericStringArray::::from(vec![ + Some("hello"), + Some(""), + Some("arrow"), + None, + Some("rust"), + None, + None + ]), + builder.finish() + ); + } + + #[test] + fn test_string_array_builder() { + _test_generic_string_array_builder::() + } + + #[test] + fn test_large_string_array_builder() { + _test_generic_string_array_builder::() + } + + fn _test_generic_string_array_builder_finish() { + let mut builder = GenericStringBuilder::::with_capacity(3, 11); + + builder.append_value("hello"); + builder.append_value("rust"); + builder.append_null(); + + builder.finish(); + assert!(builder.is_empty()); + assert_eq!(&[O::zero()], builder.offsets_slice()); + + builder.append_value("arrow"); + builder.append_value("parquet"); + let arr = builder.finish(); + // array should not have null buffer because there is not `null` value. + assert_eq!(None, arr.data().null_buffer()); + assert_eq!(GenericStringArray::::from(vec!["arrow", "parquet"]), arr,) + } + + #[test] + fn test_string_array_builder_finish() { + _test_generic_string_array_builder_finish::() + } + + #[test] + fn test_large_string_array_builder_finish() { + _test_generic_string_array_builder_finish::() + } } diff --git a/arrow-array/src/builder/generic_string_builder.rs b/arrow-array/src/builder/generic_string_builder.rs deleted file mode 100644 index f766b6f55f2a..000000000000 --- a/arrow-array/src/builder/generic_string_builder.rs +++ /dev/null @@ -1,204 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::builder::{ArrayBuilder, GenericBinaryBuilder}; -use crate::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait}; -use std::any::Any; -use std::sync::Arc; - -/// Array builder for [`GenericStringArray`] -#[derive(Debug)] -pub struct GenericStringBuilder { - builder: GenericBinaryBuilder, -} - -impl GenericStringBuilder { - /// Creates a new [`GenericStringBuilder`]. - pub fn new() -> Self { - Self { - builder: GenericBinaryBuilder::new(), - } - } - - /// Creates a new [`GenericStringBuilder`]. - /// - /// - `item_capacity` is the number of items to pre-allocate. - /// The size of the preallocated buffer of offsets is the number of items plus one. - /// - `data_capacity` is the total number of bytes of string data to pre-allocate - /// (for all items, not per item). - pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { - Self { - builder: GenericBinaryBuilder::with_capacity(item_capacity, data_capacity), - } - } - - /// Appends a string into the builder. - #[inline] - pub fn append_value(&mut self, value: impl AsRef) { - self.builder.append_value(value.as_ref().as_bytes()); - } - - /// Append a null value into the builder. - #[inline] - pub fn append_null(&mut self) { - self.builder.append_null() - } - - /// Append an `Option` value into the builder. - #[inline] - pub fn append_option(&mut self, value: Option>) { - match value { - None => self.append_null(), - Some(v) => self.append_value(v), - }; - } - - /// Builds the [`GenericStringArray`] and reset this builder. - pub fn finish(&mut self) -> GenericStringArray { - let t = GenericStringArray::::DATA_TYPE; - let v = self.builder.finish(); - let builder = v.into_data().into_builder().data_type(t); - - // SAFETY: - // Data must be UTF-8 as only support writing `str` - // Offsets must be valid as guaranteed by `GenericBinaryBuilder` - let data = unsafe { builder.build_unchecked() }; - data.into() - } - - /// Returns the current values buffer as a slice. - pub fn values_slice(&self) -> &[u8] { - self.builder.values_slice() - } - - /// Returns the current offsets buffer as a slice. - pub fn offsets_slice(&self) -> &[OffsetSize] { - self.builder.offsets_slice() - } -} - -impl Default for GenericStringBuilder { - fn default() -> Self { - Self::new() - } -} - -impl ArrayBuilder for GenericStringBuilder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &dyn Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.builder.is_empty() - } - - /// Builds the array and reset this builder. 
- fn finish(&mut self) -> ArrayRef { - let a = GenericStringBuilder::::finish(self); - Arc::new(a) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::array::{Array, OffsetSizeTrait}; - use crate::builder::ArrayBuilder; - - fn _test_generic_string_array_builder() { - let mut builder = GenericStringBuilder::::new(); - let owned = "arrow".to_owned(); - - builder.append_value("hello"); - builder.append_value(""); - builder.append_value(&owned); - builder.append_null(); - builder.append_option(Some("rust")); - builder.append_option(None::<&str>); - builder.append_option(None::); - assert_eq!(7, builder.len()); - - assert_eq!( - GenericStringArray::::from(vec![ - Some("hello"), - Some(""), - Some("arrow"), - None, - Some("rust"), - None, - None - ]), - builder.finish() - ); - } - - #[test] - fn test_string_array_builder() { - _test_generic_string_array_builder::() - } - - #[test] - fn test_large_string_array_builder() { - _test_generic_string_array_builder::() - } - - fn _test_generic_string_array_builder_finish() { - let mut builder = GenericStringBuilder::::with_capacity(3, 11); - - builder.append_value("hello"); - builder.append_value("rust"); - builder.append_null(); - - builder.finish(); - assert!(builder.is_empty()); - assert_eq!(&[O::zero()], builder.offsets_slice()); - - builder.append_value("arrow"); - builder.append_value("parquet"); - let arr = builder.finish(); - // array should not have null buffer because there is not `null` value. - assert_eq!(None, arr.data().null_buffer()); - assert_eq!(GenericStringArray::::from(vec!["arrow", "parquet"]), arr,) - } - - #[test] - fn test_string_array_builder_finish() { - _test_generic_string_array_builder_finish::() - } - - #[test] - fn test_large_string_array_builder_finish() { - _test_generic_string_array_builder_finish::() - } -} diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 5edf011d7bf6..a5c1e3d4b2fd 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -28,12 +28,10 @@ mod fixed_size_binary_builder; pub use fixed_size_binary_builder::*; mod fixed_size_list_builder; pub use fixed_size_list_builder::*; -mod generic_binary_builder; -pub use generic_binary_builder::*; +mod generic_bytes_builder; +pub use generic_bytes_builder::*; mod generic_list_builder; pub use generic_list_builder::*; -mod generic_string_builder; -pub use generic_string_builder::*; mod map_builder; pub use map_builder::*; mod null_buffer_builder; From 870f0fa7d8975247df7bc734ce08e5f807027f53 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 21 Nov 2022 13:50:17 -0800 Subject: [PATCH 0292/1411] Add collect.rs example (#3153) * Add collect.rs example * Remove unnecessary extern crate. 
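As a quick companion sketch (not drawn from the example file below, and assuming the usual
`FromIterator` impls in `arrow_array`), string arrays can be collected the same way as the
primitive arrays the new example demonstrates:

    use arrow_array::{Array, StringArray};

    // `None` entries become null slots, just like in the numeric cases.
    let strings: StringArray = vec![Some("arrow"), None, Some("parquet")]
        .into_iter()
        .collect();
    assert_eq!(strings.len(), 3);
    assert!(strings.is_null(1));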
--- arrow/examples/README.md | 2 +- arrow/examples/collect.rs | 86 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 arrow/examples/collect.rs diff --git a/arrow/examples/README.md b/arrow/examples/README.md index 41ffd823357d..314ce9c620f1 100644 --- a/arrow/examples/README.md +++ b/arrow/examples/README.md @@ -20,7 +20,7 @@ # Examples - [`builders.rs`](builders.rs): Using the Builder API -- `collect` (TODO): Using the `FromIter` API +- [`collect.rs`](collect.rs): Using the `FromIter` API - [`dynamic_types.rs`](dynamic_types.rs): - [`read_csv.rs`](read_csv.rs): Reading CSV files with explict schema, pretty printing Arrays - [`read_csv_infer_schema.rs`](read_csv_infer_schema.rs): Reading CSV files, pretty printing Arrays diff --git a/arrow/examples/collect.rs b/arrow/examples/collect.rs new file mode 100644 index 000000000000..d523a8036a2f --- /dev/null +++ b/arrow/examples/collect.rs @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +///! `FromIterator` API is implemented for different array types to easily create them +/// from values. +use arrow::array::Array; +use arrow_array::types::Int32Type; +use arrow_array::{Float32Array, Int32Array, Int8Array, ListArray}; + +fn main() { + // Primitive Arrays + // + // Primitive arrays are arrays of fixed-width primitive types (u8, u16, u32, + // u64, i8, i16, i32, i64, f32, f64, etc.) + + // Create an Int8Array with 4 values + let array: Int8Array = vec![1, 2, 3, 4].into_iter().collect(); + println!("{:?}", array); + + // Arrays can also be built from `Vec>`. `None` + // represents a null value in the array. + let array: Int8Array = vec![Some(1_i8), Some(2), None, Some(3)] + .into_iter() + .collect(); + println!("{:?}", array); + assert!(array.is_null(2)); + + let array: Float32Array = [Some(1.0_f32), Some(2.3), None].into_iter().collect(); + println!("{:?}", array); + assert_eq!(array.value(0), 1.0_f32); + assert_eq!(array.value(1), 2.3_f32); + assert!(array.is_null(2)); + + // Although not implementing `FromIterator`, ListArrays provides `from_iter_primitive` + // function to create ListArrays from `Vec>>>`. The outer `None` + // represents a null list, the inner `None` represents a null value in a list. 
+ let data = vec![ + Some(vec![]), + None, + Some(vec![Some(3), None, Some(5), Some(19)]), + Some(vec![Some(6), Some(7)]), + ]; + let list_array = ListArray::from_iter_primitive::(data); + + assert!(!list_array.is_valid(1)); + + let list0 = list_array.value(0); + let list2 = list_array.value(2); + let list3 = list_array.value(3); + + assert_eq!( + &[] as &[i32], + list0 + .as_any() + .downcast_ref::() + .unwrap() + .values() + ); + assert!(!list2 + .as_any() + .downcast_ref::() + .unwrap() + .is_valid(1)); + assert_eq!( + &[6, 7], + list3 + .as_any() + .downcast_ref::() + .unwrap() + .values() + ); +} From 528a0728359946436454678625630ad55e6499fb Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Mon, 21 Nov 2022 17:48:08 -0500 Subject: [PATCH 0293/1411] Don't Skip Serializing Empty Metadata (#3082) (#3126) * remove skip_serializing_if to please postcard * add serde postcard roundtrip tests * fix formatting issues * use bincode for serialization tests Co-authored-by: askoa --- arrow-schema/Cargo.toml | 1 + arrow-schema/src/datatype.rs | 8 ++++---- arrow-schema/src/field.rs | 37 ++++++++++++++++++++++++++++++++---- arrow-schema/src/schema.rs | 4 ---- 4 files changed, 38 insertions(+), 12 deletions(-) diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 3b809f23ed4f..d88632d1040d 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -45,3 +45,4 @@ default = [] [dev-dependencies] serde_json = "1.0" +bincode = { version = "1.3.3", default-features = false } diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index acf3691450ae..572d6f67da66 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -415,11 +415,11 @@ mod tests { assert_eq!( "{\"Struct\":[\ {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\ - {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ + {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\ {\"name\":\"address\",\"data_type\":{\"Struct\":\ - [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}\ - ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}]}", + [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}},\ + {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}\ + ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{}}]}", serialized ); diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index cd9024747b41..9eed03ed24e3 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -35,10 +35,6 @@ pub struct Field { dict_id: i64, dict_is_ordered: bool, /// A map of key-value pairs containing additional custom meta data. 
- #[cfg_attr( - feature = "serde", - serde(skip_serializing_if = "HashMap::is_empty", default) - )] metadata: HashMap, } @@ -654,4 +650,37 @@ mod test { assert!(!field1.contains(&field2)); assert!(!field2.contains(&field1)); } + + #[cfg(feature = "serde")] + fn assert_binary_serde_round_trip(field: Field) { + let serialized = bincode::serialize(&field).unwrap(); + let deserialized: Field = bincode::deserialize(&serialized).unwrap(); + assert_eq!(field, deserialized) + } + + #[cfg(feature = "serde")] + #[test] + fn test_field_without_metadata_serde() { + let field = Field::new("name", DataType::Boolean, true); + assert_binary_serde_round_trip(field) + } + + #[cfg(feature = "serde")] + #[test] + fn test_field_with_empty_metadata_serde() { + let field = + Field::new("name", DataType::Boolean, false).with_metadata(HashMap::new()); + + assert_binary_serde_round_trip(field) + } + + #[cfg(feature = "serde")] + #[test] + fn test_field_with_nonempty_metadata_serde() { + let mut metadata = HashMap::new(); + metadata.insert("hi".to_owned(), "".to_owned()); + let field = Field::new("name", DataType::Boolean, false).with_metadata(metadata); + + assert_binary_serde_round_trip(field) + } } diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 8ff40866d518..e45cedfb6769 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -34,10 +34,6 @@ pub type SchemaRef = std::sync::Arc; pub struct Schema { pub fields: Vec, /// A map of key-value pairs containing additional meta data. - #[cfg_attr( - feature = "serde", - serde(skip_serializing_if = "HashMap::is_empty", default) - )] pub metadata: HashMap, } From de05308c68f091271ffb1e96bf0744698082aedc Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 22 Nov 2022 00:42:46 -0800 Subject: [PATCH 0294/1411] Add like_utf8_scalar_dyn kernel (#3146) --- arrow/src/compute/kernels/comparison.rs | 155 ++++++++++++++++++++++++ 1 file changed, 155 insertions(+) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 6438acc3b117..05c8b7aa6156 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -278,6 +278,39 @@ fn like_scalar<'a, L: ArrayAccessor>( like_scalar_op(left, right, |x| x) } +/// Perform SQL `left LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn like_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + like_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + like_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + like_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "like_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + /// Perform SQL `left LIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. 
/// @@ -4471,6 +4504,14 @@ mod tests { vec![true, true, false, false] ); + test_utf8_scalar!( + test_utf8_array_like_scalar_dyn_escape_testing, + vec!["varchar(255)", "int(255)", "varchar", "int"], + "%(%)%", + like_utf8_scalar_dyn, + vec![true, true, false, false] + ); + test_utf8_scalar!( test_utf8_array_like_scalar_escape_regex, vec![".*", "a", "*"], @@ -4479,6 +4520,14 @@ mod tests { vec![true, false, false] ); + test_utf8_scalar!( + test_utf8_array_like_scalar_dyn_escape_regex, + vec![".*", "a", "*"], + ".*", + like_utf8_scalar_dyn, + vec![true, false, false] + ); + test_utf8_scalar!( test_utf8_array_like_scalar_escape_regex_dot, vec![".", "a", "*"], @@ -4487,6 +4536,14 @@ mod tests { vec![true, false, false] ); + test_utf8_scalar!( + test_utf8_array_like_scalar_dyn_escape_regex_dot, + vec![".", "a", "*"], + ".", + like_utf8_scalar_dyn, + vec![true, false, false] + ); + test_utf8_scalar!( test_utf8_array_like_scalar, vec!["arrow", "parquet", "datafusion", "flight"], @@ -4494,6 +4551,15 @@ mod tests { like_utf8_scalar, vec![true, true, false, false] ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_dyn, + vec!["arrow", "parquet", "datafusion", "flight"], + "%ar%", + like_utf8_scalar_dyn, + vec![true, true, false, false] + ); + test_utf8_scalar!( test_utf8_array_like_scalar_start, vec!["arrow", "parrow", "arrows", "arr"], @@ -4502,6 +4568,14 @@ mod tests { vec![true, false, true, false] ); + test_utf8_scalar!( + test_utf8_array_like_scalar_dyn_start, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow%", + like_utf8_scalar_dyn, + vec![true, false, true, false] + ); + test_utf8_scalar!( test_utf8_array_like_scalar_end, vec!["arrow", "parrow", "arrows", "arr"], @@ -4510,6 +4584,14 @@ mod tests { vec![true, true, false, false] ); + test_utf8_scalar!( + test_utf8_array_like_scalar_dyn_end, + vec!["arrow", "parrow", "arrows", "arr"], + "%arrow", + like_utf8_scalar_dyn, + vec![true, true, false, false] + ); + test_utf8_scalar!( test_utf8_array_like_scalar_equals, vec!["arrow", "parrow", "arrows", "arr"], @@ -4518,6 +4600,14 @@ mod tests { vec![true, false, false, false] ); + test_utf8_scalar!( + test_utf8_array_like_scalar_dyn_equals, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow", + like_utf8_scalar_dyn, + vec![true, false, false, false] + ); + test_utf8_scalar!( test_utf8_array_like_scalar_one, vec!["arrow", "arrows", "parrow", "arr"], @@ -4526,6 +4616,14 @@ mod tests { vec![false, true, false, false] ); + test_utf8_scalar!( + test_utf8_array_like_scalar_dyn_one, + vec!["arrow", "arrows", "parrow", "arr"], + "arrow_", + like_utf8_scalar_dyn, + vec![false, true, false, false] + ); + test_utf8_scalar!( test_utf8_scalar_like_escape, vec!["a%", "a\\x"], @@ -4534,6 +4632,14 @@ mod tests { vec![true, false] ); + test_utf8_scalar!( + test_utf8_scalar_like_dyn_escape, + vec!["a%", "a\\x"], + "a\\%", + like_utf8_scalar_dyn, + vec![true, false] + ); + test_utf8_scalar!( test_utf8_scalar_like_escape_contains, vec!["ba%", "ba\\x"], @@ -4542,6 +4648,14 @@ mod tests { vec![true, false] ); + test_utf8_scalar!( + test_utf8_scalar_like_dyn_escape_contains, + vec!["ba%", "ba\\x"], + "%a\\%", + like_utf8_scalar_dyn, + vec![true, false] + ); + test_utf8!( test_utf8_scalar_ilike_regex, vec!["%%%"], @@ -6138,6 +6252,12 @@ mod tests { let dict_array: DictionaryArray = data.into_iter().collect(); + let data = + vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")]; + + let dict_arrayref: DictionaryArray = data.into_iter().collect(); + let dict_arrayref = 
Arc::new(dict_arrayref) as ArrayRef; + assert_eq!( like_dict_scalar(&dict_array, "Air").unwrap(), BooleanArray::from( @@ -6145,6 +6265,13 @@ mod tests { ), ); + assert_eq!( + like_utf8_scalar_dyn(&dict_arrayref, "Air").unwrap(), + BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)] + ), + ); + assert_eq!( like_dict_scalar(&dict_array, "Wa%").unwrap(), BooleanArray::from( @@ -6152,6 +6279,13 @@ mod tests { ), ); + assert_eq!( + like_utf8_scalar_dyn(&dict_arrayref, "Wa%").unwrap(), + BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)] + ), + ); + assert_eq!( like_dict_scalar(&dict_array, "%r").unwrap(), BooleanArray::from( @@ -6159,6 +6293,13 @@ mod tests { ), ); + assert_eq!( + like_utf8_scalar_dyn(&dict_arrayref, "%r").unwrap(), + BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)] + ), + ); + assert_eq!( like_dict_scalar(&dict_array, "%i%").unwrap(), BooleanArray::from( @@ -6166,12 +6307,26 @@ mod tests { ), ); + assert_eq!( + like_utf8_scalar_dyn(&dict_arrayref, "%i%").unwrap(), + BooleanArray::from( + vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] + ), + ); + assert_eq!( like_dict_scalar(&dict_array, "%a%r%").unwrap(), BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] ), ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_arrayref, "%a%r%").unwrap(), + BooleanArray::from( + vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] + ), + ); } #[test] From 004a151e8df711292062236f8a94c09b6e18ef47 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 22 Nov 2022 09:27:32 +0000 Subject: [PATCH 0295/1411] Cleanup parquet tests (#3116) --- parquet/src/file/writer.rs | 117 +++++++++++++------------------- parquet/tests/boolean_writer.rs | 89 ------------------------ 2 files changed, 49 insertions(+), 157 deletions(-) delete mode 100644 parquet/tests/boolean_writer.rs diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index b67bdccfe39d..2fe0b26e7f65 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -648,14 +648,15 @@ mod tests { use crate::basic::{Compression, Encoding, LogicalType, Repetition, Type}; use crate::column::page::PageReader; use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; - use crate::data_type::Int32Type; + use crate::data_type::{BoolType, Int32Type}; + use crate::file::reader::ChunkReader; use crate::file::{ properties::{ReaderProperties, WriterProperties, WriterVersion}, reader::{FileReader, SerializedFileReader, SerializedPageReader}, statistics::{from_thrift, to_thrift, Statistics}, }; use crate::format::SortingColumn; - use crate::record::RowAccessor; + use crate::record::{Row, RowAccessor}; use crate::schema::types::{ColumnDescriptor, ColumnPath}; use crate::util::memory::ByteBufferPtr; @@ -1163,16 +1164,35 @@ mod tests { assert_eq!(to_thrift(left.statistics()), to_thrift(right.statistics())); } - /// File write-read roundtrip. - /// `data` consists of arrays of values for each row group. 
- fn test_file_roundtrip( - file: File, + /// Tests roundtrip of i32 data written using `W` and read using `R` + fn test_roundtrip_i32( + file: W, data: Vec>, - ) -> crate::format::FileMetaData { + ) -> crate::format::FileMetaData + where + W: Write, + R: ChunkReader + From + 'static, + { + test_roundtrip::(file, data, |r| r.get_int(0).unwrap()) + } + + /// Tests roundtrip of data of type `D` written using `W` and read using `R` + /// and the provided `values` function + fn test_roundtrip( + mut file: W, + data: Vec>, + value: F, + ) -> crate::format::FileMetaData + where + W: Write, + R: ChunkReader + From + 'static, + D: DataType, + F: Fn(Row) -> D::T, + { let schema = Arc::new( types::Type::group_type_builder("schema") .with_fields(&mut vec![Arc::new( - types::Type::primitive_type_builder("col1", Type::INT32) + types::Type::primitive_type_builder("col1", D::get_physical_type()) .with_repetition(Repetition::REQUIRED) .build() .unwrap(), @@ -1181,16 +1201,15 @@ mod tests { .unwrap(), ); let props = Arc::new(WriterProperties::builder().build()); - let mut file_writer = assert_send( - SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(), - ); + let mut file_writer = + SerializedFileWriter::new(&mut file, schema, props).unwrap(); let mut rows: i64 = 0; for (idx, subset) in data.iter().enumerate() { let mut row_group_writer = file_writer.next_row_group().unwrap(); if let Some(mut writer) = row_group_writer.next_column().unwrap() { rows += writer - .typed::() + .typed::() .write_batch(&subset[..], None, None) .unwrap() as i64; writer.close().unwrap(); @@ -1202,7 +1221,7 @@ mod tests { } let file_metadata = file_writer.close().unwrap(); - let reader = assert_send(SerializedFileReader::new(file).unwrap()); + let reader = SerializedFileReader::new(R::from(file)).unwrap(); assert_eq!(reader.num_row_groups(), data.len()); assert_eq!( reader.metadata().file_metadata().num_rows(), @@ -1212,16 +1231,19 @@ mod tests { for (i, item) in data.iter().enumerate().take(reader.num_row_groups()) { let row_group_reader = reader.get_row_group(i).unwrap(); let iter = row_group_reader.get_row_iter(None).unwrap(); - let res = iter - .map(|elem| elem.get_int(0).unwrap()) - .collect::>(); + let res: Vec<_> = iter.map(&value).collect(); assert_eq!(res, *item); } file_metadata } - fn assert_send(t: T) -> T { - t + /// File write-read roundtrip. + /// `data` consists of arrays of values for each row group. 
+ fn test_file_roundtrip( + file: File, + data: Vec>, + ) -> crate::format::FileMetaData { + test_roundtrip_i32::(file, data) } #[test] @@ -1245,58 +1267,17 @@ mod tests { } fn test_bytes_roundtrip(data: Vec>) { - let mut buffer = vec![]; - - let schema = Arc::new( - types::Type::group_type_builder("schema") - .with_fields(&mut vec![Arc::new( - types::Type::primitive_type_builder("col1", Type::INT32) - .with_repetition(Repetition::REQUIRED) - .build() - .unwrap(), - )]) - .build() - .unwrap(), - ); - - let mut rows: i64 = 0; - { - let props = Arc::new(WriterProperties::builder().build()); - let mut writer = - SerializedFileWriter::new(&mut buffer, schema, props).unwrap(); - - for subset in &data { - let mut row_group_writer = writer.next_row_group().unwrap(); - if let Some(mut writer) = row_group_writer.next_column().unwrap() { - rows += writer - .typed::() - .write_batch(&subset[..], None, None) - .unwrap() as i64; - - writer.close().unwrap(); - } - row_group_writer.close().unwrap(); - } - writer.close().unwrap(); - } - - let reading_cursor = Bytes::from(buffer); - let reader = SerializedFileReader::new(reading_cursor).unwrap(); + test_roundtrip_i32::, Bytes>(Vec::with_capacity(1024), data); + } - assert_eq!(reader.num_row_groups(), data.len()); - assert_eq!( - reader.metadata().file_metadata().num_rows(), - rows, - "row count in metadata not equal to number of rows written" + #[test] + fn test_boolean_roundtrip() { + let my_bool_values: Vec<_> = (0..2049).map(|idx| idx % 2 == 0).collect(); + test_roundtrip::, Bytes, BoolType, _>( + Vec::with_capacity(1024), + vec![my_bool_values], + |r| r.get_bool(0).unwrap(), ); - for (i, item) in data.iter().enumerate().take(reader.num_row_groups()) { - let row_group_reader = reader.get_row_group(i).unwrap(); - let iter = row_group_reader.get_row_iter(None).unwrap(); - let res = iter - .map(|elem| elem.get_int(0).unwrap()) - .collect::>(); - assert_eq!(res, *item); - } } #[test] diff --git a/parquet/tests/boolean_writer.rs b/parquet/tests/boolean_writer.rs deleted file mode 100644 index 8c3d50d8fde8..000000000000 --- a/parquet/tests/boolean_writer.rs +++ /dev/null @@ -1,89 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use parquet::data_type::BoolType; -use parquet::file::properties::WriterProperties; -use parquet::file::reader::FileReader; -use parquet::file::serialized_reader::SerializedFileReader; -use parquet::file::writer::SerializedFileWriter; -use parquet::schema::parser::parse_message_type; -use std::fs; -use std::path::Path; -use std::sync::{mpsc, Arc}; -use std::thread; -use std::time::Duration; - -#[test] -fn it_writes_data_without_hanging() { - let path = Path::new("it_writes_data_without_hanging.parquet"); - - let message_type = " - message BooleanType { - REQUIRED BOOLEAN DIM0; - } -"; - let schema = Arc::new(parse_message_type(message_type).expect("parse schema")); - let props = Arc::new(WriterProperties::builder().build()); - let file = fs::File::create(path).expect("create file"); - let mut writer = - SerializedFileWriter::new(file, schema, props).expect("create parquet writer"); - for _group in 0..1 { - let mut row_group_writer = writer.next_row_group().expect("get row group writer"); - let values: Vec = vec![0; 2049]; - let my_bool_values: Vec = values - .iter() - .enumerate() - .map(|(count, _x)| count % 2 == 0) - .collect(); - while let Some(mut col_writer) = - row_group_writer.next_column().expect("next column") - { - col_writer - .typed::() - .write_batch(&my_bool_values, None, None) - .expect("writing bool column"); - - col_writer.close().expect("close column"); - } - let rg_md = row_group_writer.close().expect("close row group"); - println!("total rows written: {}", rg_md.num_rows()); - } - writer.close().expect("close writer"); - - let bytes = fs::read(path).expect("read file"); - assert_eq!(&bytes[0..4], &[b'P', b'A', b'R', b'1']); - - // Now that we have written our data and are happy with it, make - // sure we can read it back in < 5 seconds... 
- let (sender, receiver) = mpsc::channel(); - let _t = thread::spawn(move || { - let file = fs::File::open(Path::new("it_writes_data_without_hanging.parquet")) - .expect("open file"); - let reader = SerializedFileReader::new(file).expect("get serialized reader"); - let iter = reader.get_row_iter(None).expect("get iterator"); - for record in iter { - println!("reading: {}", record); - } - println!("finished reading"); - if let Ok(()) = sender.send(true) {} - }); - assert_ne!( - Err(mpsc::RecvTimeoutError::Timeout), - receiver.recv_timeout(Duration::from_millis(5000)) - ); - fs::remove_file("it_writes_data_without_hanging.parquet").expect("remove file"); -} From e214ccccc702d0295fbf59258a6a817cd09ac4ea Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 22 Nov 2022 20:06:57 +0800 Subject: [PATCH 0296/1411] parquet bloom filter part III: add sbbf writer, remove `bloom` default feature, add reader properties (#3119) * bloom filter part III - add reader properties - add writer properties - remove `bloom` feature * update row group vec * fix clippy * fix clippy * remove default feature for twox * incorporate ndv and fpp * fix doc * add unit test * fix clippy * Apply suggestions from code review Co-authored-by: Andrew Lamb * remove underflow logic * refactor write Co-authored-by: Andrew Lamb --- parquet/Cargo.toml | 9 +- parquet/src/bin/parquet-show-bloom-filter.rs | 5 +- parquet/src/bloom_filter/mod.rs | 125 ++++++++++++++- parquet/src/column/writer/mod.rs | 22 +++ parquet/src/file/metadata.rs | 2 +- parquet/src/file/properties.rs | 151 +++++++++++++++++-- parquet/src/file/reader.rs | 7 +- parquet/src/file/serialized_reader.rs | 27 ++-- parquet/src/file/writer.rs | 46 +++++- parquet/src/lib.rs | 1 - 10 files changed, 353 insertions(+), 42 deletions(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 515da585ec26..7a150c94963d 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -57,7 +57,8 @@ seq-macro = { version = "0.3", default-features = false } futures = { version = "0.3", default-features = false, features = ["std"], optional = true } tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "rt", "io-util"] } hashbrown = { version = "0.13", default-features = false } -twox-hash = { version = "1.6", optional = true } +twox-hash = { version = "1.6", default-features = false } +paste = { version = "1.0" } [dev-dependencies] base64 = { version = "0.13", default-features = false, features = ["std"] } @@ -77,7 +78,7 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" all-features = true [features] -default = ["arrow", "bloom", "snap", "brotli", "flate2", "lz4", "zstd", "base64"] +default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"] # Enable arrow reader/writer APIs arrow = ["base64", "arrow-array", "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", "arrow-select", "arrow-ipc"] # Enable CLI tools @@ -90,8 +91,6 @@ test_common = ["arrow/test_utils"] experimental = [] # Enable async APIs async = ["futures", "tokio"] -# Bloomfilter -bloom = ["twox-hash"] [[test]] name = "arrow_writer_layout" @@ -115,7 +114,7 @@ required-features = ["arrow", "cli"] [[bin]] name = "parquet-show-bloom-filter" -required-features = ["cli", "bloom"] +required-features = ["cli"] [[bench]] name = "arrow_writer" diff --git a/parquet/src/bin/parquet-show-bloom-filter.rs b/parquet/src/bin/parquet-show-bloom-filter.rs index a4dbdbe67de8..28493a94c490 100644 --- a/parquet/src/bin/parquet-show-bloom-filter.rs +++ 
b/parquet/src/bin/parquet-show-bloom-filter.rs @@ -78,10 +78,7 @@ fn main() { let row_group_reader = file_reader .get_row_group(ri) .expect("Unable to read row group"); - if let Some(sbbf) = row_group_reader - .get_column_bloom_filter(column_index) - .expect("Failed to parse bloom filter") - { + if let Some(sbbf) = row_group_reader.get_column_bloom_filter(column_index) { args.values.iter().for_each(|value| { println!( "Value {} is {} in bloom filter", diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index 4944a93f8484..4efba3834ded 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -24,11 +24,15 @@ use crate::file::metadata::ColumnChunkMetaData; use crate::file::reader::ChunkReader; use crate::format::{ BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash, BloomFilterHeader, + SplitBlockAlgorithm, Uncompressed, XxHash, }; use bytes::{Buf, Bytes}; use std::hash::Hasher; +use std::io::Write; use std::sync::Arc; -use thrift::protocol::{TCompactInputProtocol, TSerializable}; +use thrift::protocol::{ + TCompactInputProtocol, TCompactOutputProtocol, TOutputProtocol, TSerializable, +}; use twox_hash::XxHash64; /// Salt as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach) @@ -80,6 +84,7 @@ fn block_check(block: &Block, hash: u32) -> bool { } /// A split block Bloom filter +#[derive(Debug, Clone)] pub struct Sbbf(Vec); const SBBF_HEADER_SIZE_ESTIMATE: usize = 20; @@ -113,7 +118,43 @@ fn read_bloom_filter_header_and_length( )) } +const BITSET_MIN_LENGTH: usize = 32; +const BITSET_MAX_LENGTH: usize = 128 * 1024 * 1024; + +#[inline] +fn optimal_num_of_bytes(num_bytes: usize) -> usize { + let num_bytes = num_bytes.min(BITSET_MAX_LENGTH); + let num_bytes = num_bytes.max(BITSET_MIN_LENGTH); + num_bytes.next_power_of_two() +} + +// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf +// given fpp = (1 - e^(-k * n / m)) ^ k +// we have m = - k * n / ln(1 - fpp ^ (1 / k)) +// where k = number of hash functions, m = number of bits, n = number of distinct values +#[inline] +fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize { + let num_bits = -8.0 * ndv as f64 / (1.0 - fpp.powf(1.0 / 8.0)).ln(); + num_bits as usize +} + impl Sbbf { + /// Create a new [Sbbf] with given number of distinct values and false positive probability. + /// Will panic if `fpp` is greater than 1.0 or less than 0.0. + pub fn new_with_ndv_fpp(ndv: u64, fpp: f64) -> Self { + assert!((0.0..-1.0).contains(&fpp), "invalid fpp: {}", fpp); + let num_bits = num_of_bits_from_ndv_fpp(ndv, fpp); + Self::new_with_num_of_bytes(num_bits / 8) + } + + /// Create a new [Sbbf] with given number of bytes, the exact number of bytes will be adjusted + /// to the next power of two bounded by `BITSET_MIN_LENGTH` and `BITSET_MAX_LENGTH`. 
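// Worked instance of the sizing formula above (illustrative arithmetic only, not code
// emitted by this patch): for ndv = 1000 and fpp = 0.01, fpp^(1/8) ≈ 0.5623, so
// num_bits = -8 * 1000 / ln(1 - 0.5623) ≈ 9681, matching the (0.01, 1000, 9681) row in
// test_num_of_bits_from_ndv_fpp below; num_bits / 8 = 1210 bytes, which
// optimal_num_of_bytes then rounds up to the next power of two, 2048 bytes.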
+ pub fn new_with_num_of_bytes(num_bytes: usize) -> Self { + let num_bytes = optimal_num_of_bytes(num_bytes); + let bitset = vec![0_u8; num_bytes]; + Self::new(&bitset) + } + fn new(bitset: &[u8]) -> Self { let data = bitset .chunks_exact(4 * 8) @@ -128,6 +169,45 @@ impl Sbbf { Self(data) } + /// Write the bloom filter data (header and then bitset) to the output + pub fn write(&self, mut writer: W) -> Result<(), ParquetError> { + let mut protocol = TCompactOutputProtocol::new(&mut writer); + let header = self.header(); + header.write_to_out_protocol(&mut protocol).map_err(|e| { + ParquetError::General(format!("Could not write bloom filter header: {}", e)) + })?; + protocol.flush()?; + self.write_bitset(&mut writer)?; + Ok(()) + } + + /// Write the bitset in serialized form to the writer. + fn write_bitset(&self, mut writer: W) -> Result<(), ParquetError> { + for block in &self.0 { + for word in block { + writer.write_all(&word.to_le_bytes()).map_err(|e| { + ParquetError::General(format!( + "Could not write bloom filter bit set: {}", + e + )) + })?; + } + } + Ok(()) + } + + /// Create and populate [`BloomFilterHeader`] from this bitset for writing to serialized form + fn header(&self) -> BloomFilterHeader { + BloomFilterHeader { + // 8 i32 per block, 4 bytes per i32 + num_bytes: self.0.len() as i32 * 4 * 8, + algorithm: BloomFilterAlgorithm::BLOCK(SplitBlockAlgorithm {}), + hash: BloomFilterHash::XXHASH(XxHash {}), + compression: BloomFilterCompression::UNCOMPRESSED(Uncompressed {}), + } + } + + /// Read a new bloom filter from the given offset in the given reader. pub fn read_from_column_chunk( column_metadata: &ColumnChunkMetaData, reader: Arc, @@ -292,4 +372,47 @@ mod tests { assert_eq!(num_bytes, 32_i32); assert_eq!(20, SBBF_HEADER_SIZE_ESTIMATE); } + + #[test] + fn test_optimal_num_of_bytes() { + for (input, expected) in &[ + (0, 32), + (9, 32), + (31, 32), + (32, 32), + (33, 64), + (99, 128), + (1024, 1024), + (999_000_000, 128 * 1024 * 1024), + ] { + assert_eq!(*expected, optimal_num_of_bytes(*input)); + } + } + + #[test] + fn test_num_of_bits_from_ndv_fpp() { + for (fpp, ndv, num_bits) in &[ + (0.1, 10, 57), + (0.01, 10, 96), + (0.001, 10, 146), + (0.1, 100, 577), + (0.01, 100, 968), + (0.001, 100, 1460), + (0.1, 1000, 5772), + (0.01, 1000, 9681), + (0.001, 1000, 14607), + (0.1, 10000, 57725), + (0.01, 10000, 96815), + (0.001, 10000, 146076), + (0.1, 100000, 577254), + (0.01, 100000, 968152), + (0.001, 100000, 1460769), + (0.1, 1000000, 5772541), + (0.01, 1000000, 9681526), + (0.001, 1000000, 14607697), + (1e-50, 1_000_000_000_000, 14226231280773240832), + ] { + assert_eq!(*num_bits, num_of_bits_from_ndv_fpp(*ndv, *fpp) as u64); + } + } } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 3cdf04f5494c..ae7920e22839 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -16,6 +16,8 @@ // under the License. //! Contains column writer API. 
+ +use crate::bloom_filter::Sbbf; use crate::format::{ColumnIndex, OffsetIndex}; use std::collections::{BTreeSet, VecDeque}; @@ -154,6 +156,8 @@ pub struct ColumnCloseResult { pub rows_written: u64, /// Metadata for this column chunk pub metadata: ColumnChunkMetaData, + /// Optional bloom filter for this column + pub bloom_filter: Option, /// Optional column index, for filtering pub column_index: Option, /// Optional offset index, identifying page locations @@ -209,6 +213,9 @@ pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> { rep_levels_sink: Vec, data_pages: VecDeque, + // bloom filter + bloom_filter: Option, + // column index and offset index column_index_builder: ColumnIndexBuilder, offset_index_builder: OffsetIndexBuilder, @@ -231,6 +238,19 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // Used for level information encodings.insert(Encoding::RLE); + let bloom_filter_enabled = props.bloom_filter_enabled(descr.path()); + let bloom_filter = if bloom_filter_enabled { + if let Some(ndv) = props.bloom_filter_ndv(descr.path()) { + let fpp = props.bloom_filter_fpp(descr.path()); + Some(Sbbf::new_with_ndv_fpp(ndv, fpp)) + } else { + let max_bytes = props.bloom_filter_max_bytes(descr.path()); + Some(Sbbf::new_with_num_of_bytes(max_bytes as usize)) + } + } else { + None + }; + Self { descr, props, @@ -260,6 +280,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { num_column_nulls: 0, column_distinct_count: None, }, + bloom_filter, column_index_builder: ColumnIndexBuilder::new(), offset_index_builder: OffsetIndexBuilder::new(), encodings, @@ -458,6 +479,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Ok(ColumnCloseResult { bytes_written: self.column_metrics.total_bytes_written, rows_written: self.column_metrics.total_rows_written, + bloom_filter: self.bloom_filter, metadata, column_index, offset_index, diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 895776a8a421..2ba50fa31a1e 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -236,7 +236,7 @@ pub struct RowGroupMetaData { } impl RowGroupMetaData { - /// Returns builer for row group metadata. + /// Returns builder for row group metadata. pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder { RowGroupMetaDataBuilder::new(schema_descr) } diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index c65ba8035ee6..03117d4cb077 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -64,6 +64,7 @@ //! .build(); //! ``` +use paste::paste; use std::{collections::HashMap, sync::Arc}; use crate::basic::{Compression, Encoding}; @@ -82,6 +83,9 @@ const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page; const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; const DEFAULT_CREATED_BY: &str = env!("PARQUET_CREATED_BY"); +const DEFAULT_BLOOM_FILTER_ENABLED: bool = false; +const DEFAULT_BLOOM_FILTER_MAX_BYTES: u32 = 1024 * 1024; +const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.01; /// Parquet writer version. /// @@ -125,6 +129,26 @@ pub struct WriterProperties { sorting_columns: Option>, } +macro_rules! 
def_col_property_getter { + ($field:ident, $field_type:ty) => { + pub fn $field(&self, col: &ColumnPath) -> Option<$field_type> { + self.column_properties + .get(col) + .and_then(|c| c.$field()) + .or_else(|| self.default_column_properties.$field()) + } + }; + ($field:ident, $field_type:ty, $default_val:expr) => { + pub fn $field(&self, col: &ColumnPath) -> $field_type { + self.column_properties + .get(col) + .and_then(|c| c.$field()) + .or_else(|| self.default_column_properties.$field()) + .unwrap_or($default_val) + } + }; +} + impl WriterProperties { /// Returns builder for writer properties with default values. pub fn builder() -> WriterPropertiesBuilder { @@ -255,6 +279,11 @@ impl WriterProperties { .or_else(|| self.default_column_properties.max_statistics_size()) .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE) } + + def_col_property_getter!(bloom_filter_enabled, bool, DEFAULT_BLOOM_FILTER_ENABLED); + def_col_property_getter!(bloom_filter_fpp, f64, DEFAULT_BLOOM_FILTER_FPP); + def_col_property_getter!(bloom_filter_ndv, u64); + def_col_property_getter!(bloom_filter_max_bytes, u32, DEFAULT_BLOOM_FILTER_MAX_BYTES); } /// Writer properties builder. @@ -272,6 +301,52 @@ pub struct WriterPropertiesBuilder { sorting_columns: Option>, } +macro_rules! def_opt_field_setter { + ($field: ident, $type: ty) => { + paste! { + pub fn [](&mut self, value: $type) -> &mut Self { + self.$field = Some(value); + self + } + } + }; + ($field: ident, $type: ty, $min_value:expr, $max_value:expr) => { + paste! { + pub fn [](&mut self, value: $type) -> &mut Self { + if ($min_value..=$max_value).contains(&value) { + self.$field = Some(value); + } else { + self.$field = None + } + self + } + } + }; +} + +macro_rules! def_opt_field_getter { + ($field: ident, $type: ty) => { + paste! { + #[doc = "Returns " $field " if set."] + pub fn $field(&self) -> Option<$type> { + self.$field + } + } + }; +} + +macro_rules! def_per_col_setter { + ($field:ident, $field_type:ty) => { + paste! { + #[doc = "Sets " $field " for a column. Takes precedence over globally defined settings."] + pub fn [](mut self, col: ColumnPath, value: $field_type) -> Self { + self.get_mut_props(col).[](value); + self + } + } + } +} + impl WriterPropertiesBuilder { /// Returns default state of the builder. fn with_defaults() -> Self { @@ -284,7 +359,7 @@ impl WriterPropertiesBuilder { writer_version: DEFAULT_WRITER_VERSION, created_by: DEFAULT_CREATED_BY.to_string(), key_value_metadata: None, - default_column_properties: ColumnProperties::new(), + default_column_properties: Default::default(), column_properties: HashMap::new(), sorting_columns: None, } @@ -439,7 +514,7 @@ impl WriterPropertiesBuilder { fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties { self.column_properties .entry(col) - .or_insert_with(ColumnProperties::new) + .or_insert_with(Default::default) } /// Sets encoding for a column. @@ -492,6 +567,11 @@ impl WriterPropertiesBuilder { self.get_mut_props(col).set_max_statistics_size(value); self } + + def_per_col_setter!(bloom_filter_enabled, bool); + def_per_col_setter!(bloom_filter_fpp, f64); + def_per_col_setter!(bloom_filter_max_bytes, u32); + def_per_col_setter!(bloom_filter_ndv, u64); } /// Controls the level of statistics to be computed by the writer @@ -515,27 +595,24 @@ impl Default for EnabledStatistics { /// /// If a field is `None`, it means that no specific value has been set for this column, /// so some subsequent or default value must be used. 
-#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, Default, PartialEq)] struct ColumnProperties { encoding: Option, codec: Option, dictionary_enabled: Option, statistics_enabled: Option, max_statistics_size: Option, + /// bloom filter enabled + bloom_filter_enabled: Option, + /// bloom filter expected number of distinct values + bloom_filter_ndv: Option, + /// bloom filter false positive probability + bloom_filter_fpp: Option, + /// bloom filter max number of bytes + bloom_filter_max_bytes: Option, } impl ColumnProperties { - /// Initialise column properties with default values. - fn new() -> Self { - Self { - encoding: None, - codec: None, - dictionary_enabled: None, - statistics_enabled: None, - max_statistics_size: None, - } - } - /// Sets encoding for this column. /// /// If dictionary is not enabled, this is treated as a primary encoding for a column. @@ -572,6 +649,11 @@ impl ColumnProperties { self.max_statistics_size = Some(value); } + def_opt_field_setter!(bloom_filter_enabled, bool); + def_opt_field_setter!(bloom_filter_fpp, f64, 0.0, 1.0); + def_opt_field_setter!(bloom_filter_max_bytes, u32); + def_opt_field_setter!(bloom_filter_ndv, u64); + /// Returns optional encoding for this column. fn encoding(&self) -> Option { self.encoding @@ -599,17 +681,25 @@ impl ColumnProperties { fn max_statistics_size(&self) -> Option { self.max_statistics_size } + + def_opt_field_getter!(bloom_filter_enabled, bool); + def_opt_field_getter!(bloom_filter_fpp, f64); + def_opt_field_getter!(bloom_filter_max_bytes, u32); + def_opt_field_getter!(bloom_filter_ndv, u64); } /// Reference counted reader properties. pub type ReaderPropertiesPtr = Arc; +const DEFAULT_READ_BLOOM_FILTER: bool = false; + /// Reader properties. /// /// All properties are immutable and `Send` + `Sync`. /// Use [`ReaderPropertiesBuilder`] to assemble these properties. pub struct ReaderProperties { codec_options: CodecOptions, + read_bloom_filter: bool, } impl ReaderProperties { @@ -622,11 +712,17 @@ impl ReaderProperties { pub(crate) fn codec_options(&self) -> &CodecOptions { &self.codec_options } + + /// Returns whether to read bloom filter + pub(crate) fn read_bloom_filter(&self) -> bool { + self.read_bloom_filter + } } /// Reader properties builder. pub struct ReaderPropertiesBuilder { codec_options_builder: CodecOptionsBuilder, + read_bloom_filter: Option, } /// Reader properties builder. @@ -635,6 +731,7 @@ impl ReaderPropertiesBuilder { fn with_defaults() -> Self { Self { codec_options_builder: CodecOptionsBuilder::default(), + read_bloom_filter: None, } } @@ -642,6 +739,9 @@ impl ReaderPropertiesBuilder { pub fn build(self) -> ReaderProperties { ReaderProperties { codec_options: self.codec_options_builder.build(), + read_bloom_filter: self + .read_bloom_filter + .unwrap_or(DEFAULT_READ_BLOOM_FILTER), } } @@ -659,6 +759,17 @@ impl ReaderPropertiesBuilder { .set_backward_compatible_lz4(value); self } + + /// Enable/disable reading bloom filter + /// + /// If reading bloom filter is enabled, bloom filter will be read from the file. + /// If reading bloom filter is disabled, bloom filter will not be read from the file. + /// + /// By default bloom filter is not read.
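// Read-side sketch (builder entry point assumed from the surrounding crate, not
// shown in this hunk): bloom filters are only loaded when the reader properties
// opt in via set_read_bloom_filter, and are then available per column from the
// row group reader without further I/O at lookup time.
use parquet::file::properties::ReaderProperties;
use parquet::file::reader::RowGroupReader;

fn reading_props() -> ReaderProperties {
    ReaderProperties::builder()
        .set_read_bloom_filter(true)
        .build()
}

fn column_has_bloom_filter(rg: &dyn RowGroupReader, column: usize) -> bool {
    rg.get_column_bloom_filter(column).is_some()
}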
+ pub fn set_read_bloom_filter(mut self, value: bool) -> Self { + self.read_bloom_filter = Some(value); + self + } } #[cfg(test)] @@ -701,6 +812,13 @@ mod tests { props.max_statistics_size(&ColumnPath::from("col")), DEFAULT_MAX_STATISTICS_SIZE ); + assert!(!props.bloom_filter_enabled(&ColumnPath::from("col"))); + assert_eq!(props.bloom_filter_fpp(&ColumnPath::from("col")), 0.01); + assert_eq!(props.bloom_filter_ndv(&ColumnPath::from("col")), None); + assert_eq!( + props.bloom_filter_max_bytes(&ColumnPath::from("col")), + 1024 * 1024 + ); } #[test] @@ -784,6 +902,10 @@ mod tests { EnabledStatistics::Chunk, ) .set_column_max_statistics_size(ColumnPath::from("col"), 123) + .set_column_bloom_filter_enabled(ColumnPath::from("col"), true) + .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100) + .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1) + .set_column_bloom_filter_max_bytes(ColumnPath::from("col"), 1000) .build(); assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0); @@ -858,6 +980,7 @@ mod tests { .build(); assert_eq!(props.codec_options(), &codec_options); + assert!(!props.read_bloom_filter()); } #[test] diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs index 325944c2168b..bb82f229927d 100644 --- a/parquet/src/file/reader.rs +++ b/parquet/src/file/reader.rs @@ -21,7 +21,6 @@ use bytes::Bytes; use std::{boxed::Box, io::Read, sync::Arc}; -#[cfg(feature = "bloom")] use crate::bloom_filter::Sbbf; use crate::column::page::PageIterator; use crate::column::{page::PageReader, reader::ColumnReader}; @@ -145,9 +144,9 @@ pub trait RowGroupReader: Send + Sync { Ok(col_reader) } - #[cfg(feature = "bloom")] - /// Get bloom filter for the `i`th column chunk, if present. - fn get_column_bloom_filter(&self, i: usize) -> Result>; + /// Get bloom filter for the `i`th column chunk, if present and the reader was configured + /// to read bloom filters. + fn get_column_bloom_filter(&self, i: usize) -> Option<&Sbbf>; /// Get iterator of `Row`s from this row group. /// diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index cb39dd194872..84768aa23c88 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -20,10 +20,10 @@ use std::collections::VecDeque; use std::io::Cursor; +use std::iter; use std::{convert::TryFrom, fs::File, io::Read, path::Path, sync::Arc}; use crate::basic::{Encoding, Type}; -#[cfg(feature = "bloom")] use crate::bloom_filter::Sbbf; use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; @@ -329,7 +329,7 @@ impl FileReader for SerializedFileReader { f, row_group_metadata, props, - ))) + )?)) } fn get_row_iter(&self, projection: Option) -> Result { @@ -342,6 +342,7 @@ pub struct SerializedRowGroupReader<'a, R: ChunkReader> { chunk_reader: Arc, metadata: &'a RowGroupMetaData, props: ReaderPropertiesPtr, + bloom_filters: Vec>, } impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { @@ -350,12 +351,22 @@ impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { chunk_reader: Arc, metadata: &'a RowGroupMetaData, props: ReaderPropertiesPtr, - ) -> Self { - Self { + ) -> Result { + let bloom_filters = if props.read_bloom_filter() { + metadata + .columns() + .iter() + .map(|col| Sbbf::read_from_column_chunk(col, chunk_reader.clone())) + .collect::>>()? 
+ } else { + iter::repeat(None).take(metadata.columns().len()).collect() + }; + Ok(Self { chunk_reader, metadata, props, - } + bloom_filters, + }) } } @@ -388,11 +399,9 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<' )?)) } - #[cfg(feature = "bloom")] /// get bloom filter for the `i`th column - fn get_column_bloom_filter(&self, i: usize) -> Result> { - let col = self.metadata.column(i); - Sbbf::read_from_column_chunk(col, self.chunk_reader.clone()) + fn get_column_bloom_filter(&self, i: usize) -> Option<&Sbbf> { + self.bloom_filters[i].as_ref() } fn get_row_iter(&self, projection: Option) -> Result { diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 2fe0b26e7f65..3f1731687e2c 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -18,10 +18,10 @@ //! Contains file writer API, and provides methods to write row groups and columns by //! using row group writers and column writers respectively. -use std::{io::Write, sync::Arc}; - +use crate::bloom_filter::Sbbf; use crate::format as parquet; use crate::format::{ColumnIndex, OffsetIndex, RowGroup}; +use std::{io::Write, sync::Arc}; use thrift::protocol::{TCompactOutputProtocol, TOutputProtocol, TSerializable}; use crate::basic::PageType; @@ -92,6 +92,7 @@ pub type OnCloseColumnChunk<'a> = Box Result<() pub type OnCloseRowGroup<'a> = Box< dyn FnOnce( RowGroupMetaDataPtr, + Vec>, Vec>, Vec>, ) -> Result<()> @@ -116,6 +117,7 @@ pub struct SerializedFileWriter { descr: SchemaDescPtr, props: WriterPropertiesPtr, row_groups: Vec, + bloom_filters: Vec>>, column_indexes: Vec>>, offset_indexes: Vec>>, row_group_index: usize, @@ -132,6 +134,7 @@ impl SerializedFileWriter { descr: Arc::new(SchemaDescriptor::new(schema)), props: properties, row_groups: vec![], + bloom_filters: vec![], column_indexes: Vec::new(), offset_indexes: Vec::new(), row_group_index: 0, @@ -149,10 +152,15 @@ impl SerializedFileWriter { self.row_group_index += 1; let row_groups = &mut self.row_groups; + let row_bloom_filters = &mut self.bloom_filters; let row_column_indexes = &mut self.column_indexes; let row_offset_indexes = &mut self.offset_indexes; - let on_close = |metadata, row_group_column_index, row_group_offset_index| { + let on_close = |metadata, + row_group_bloom_filter, + row_group_column_index, + row_group_offset_index| { row_groups.push(metadata); + row_bloom_filters.push(row_group_bloom_filter); row_column_indexes.push(row_group_column_index); row_offset_indexes.push(row_group_offset_index); Ok(()) @@ -212,6 +220,31 @@ impl SerializedFileWriter { Ok(()) } + /// Serialize all the bloom filter to the file + fn write_bloom_filters(&mut self, row_groups: &mut [RowGroup]) -> Result<()> { + // iter row group + // iter each column + // write bloom filter to the file + for (row_group_idx, row_group) in row_groups.iter_mut().enumerate() { + for (column_idx, column_chunk) in row_group.columns.iter_mut().enumerate() { + match &self.bloom_filters[row_group_idx][column_idx] { + Some(bloom_filter) => { + let start_offset = self.buf.bytes_written(); + bloom_filter.write(&mut self.buf)?; + // set offset and index for bloom filter + column_chunk + .meta_data + .as_mut() + .expect("can't have bloom filter without column metadata") + .bloom_filter_offset = Some(start_offset as i64); + } + None => {} + } + } + } + Ok(()) + } + /// Serialize all the column index to the file fn write_column_indexes(&mut self, row_groups: &mut [RowGroup]) -> Result<()> { // iter row group @@ -250,6 +283,7 @@ impl 
SerializedFileWriter { .map(|v| v.to_thrift()) .collect::>(); + self.write_bloom_filters(&mut row_groups)?; // Write column indexes and offset indexes self.write_column_indexes(&mut row_groups)?; self.write_offset_indexes(&mut row_groups)?; @@ -320,6 +354,7 @@ pub struct SerializedRowGroupWriter<'a, W: Write> { column_index: usize, row_group_metadata: Option, column_chunks: Vec, + bloom_filters: Vec>, column_indexes: Vec>, offset_indexes: Vec>, on_close: Option>, @@ -348,6 +383,7 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { column_index: 0, row_group_metadata: None, column_chunks: Vec::with_capacity(num_columns), + bloom_filters: Vec::with_capacity(num_columns), column_indexes: Vec::with_capacity(num_columns), offset_indexes: Vec::with_capacity(num_columns), total_bytes_written: 0, @@ -380,11 +416,13 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { let column_chunks = &mut self.column_chunks; let column_indexes = &mut self.column_indexes; let offset_indexes = &mut self.offset_indexes; + let bloom_filters = &mut self.bloom_filters; let on_close = |r: ColumnCloseResult| { // Update row group writer metrics *total_bytes_written += r.bytes_written; column_chunks.push(r.metadata); + bloom_filters.push(r.bloom_filter); column_indexes.push(r.column_index); offset_indexes.push(r.offset_index); @@ -443,6 +481,7 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { if let Some(on_close) = self.on_close.take() { on_close( metadata, + self.bloom_filters.clone(), self.column_indexes.clone(), self.offset_indexes.clone(), )? @@ -623,6 +662,7 @@ impl<'a, W: Write> PageWriter for SerializedPageWriter<'a, W> { Ok(spec) } + fn write_metadata(&mut self, metadata: &ColumnChunkMetaData) -> Result<()> { let mut protocol = TCompactOutputProtocol::new(&mut self.sink); metadata diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index cd29d02f808e..4cdba1dc55ee 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -84,7 +84,6 @@ pub mod arrow; pub mod column; experimental!(mod compression); experimental!(mod encodings); -#[cfg(feature = "bloom")] pub mod bloom_filter; pub mod file; pub mod record; From f091cbbf0a1f212fbe1bd4d63fa018e7a5c82ccc Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 22 Nov 2022 15:27:14 +0000 Subject: [PATCH 0297/1411] Check overflow in MutableArrayData extend offsets (#3123) (#3157) * Check overflow in MutableArrayData extend offsets (#3123) * Update arrow-data/src/transform/utils.rs Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- arrow-data/src/transform/list.rs | 6 ++++-- arrow-data/src/transform/utils.rs | 22 +++++++++++++++++++--- arrow-data/src/transform/variable_size.rs | 6 ++++-- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/arrow-data/src/transform/list.rs b/arrow-data/src/transform/list.rs index 2f14f2fb514a..76a845958da8 100644 --- a/arrow-data/src/transform/list.rs +++ b/arrow-data/src/transform/list.rs @@ -21,9 +21,11 @@ use super::{ }; use crate::ArrayData; use arrow_buffer::ArrowNativeType; -use num::Integer; +use num::{CheckedAdd, Integer}; -pub(super) fn build_extend(array: &ArrayData) -> Extend { +pub(super) fn build_extend( + array: &ArrayData, +) -> Extend { let offsets = array.buffer::(0); if array.null_count() == 0 { // fast case where we can copy regions without nullability checks diff --git a/arrow-data/src/transform/utils.rs b/arrow-data/src/transform/utils.rs index 6a4c240c9ae3..b1e3388ba84e 100644 --- a/arrow-data/src/transform/utils.rs +++ 
b/arrow-data/src/transform/utils.rs @@ -16,7 +16,7 @@ // under the License. use arrow_buffer::{bit_util, ArrowNativeType, MutableBuffer}; -use num::Integer; +use num::{CheckedAdd, Integer}; /// extends the `buffer` to be able to hold `len` bits, setting all bits of the new size to zero. #[inline] @@ -27,7 +27,7 @@ pub(super) fn resize_for_bits(buffer: &mut MutableBuffer, len: usize) { } } -pub(super) fn extend_offsets( +pub(super) fn extend_offsets( buffer: &mut MutableBuffer, mut last_offset: T, offsets: &[T], @@ -36,7 +36,10 @@ pub(super) fn extend_offsets( offsets.windows(2).for_each(|offsets| { // compute the new offset let length = offsets[1] - offsets[0]; - last_offset = last_offset + length; + // if you hit this appending to a StringArray / BinaryArray it is because you + // are trying to add more data than can fit into that type. Try breaking your data into + // smaller batches or using LargeStringArray / LargeBinaryArray + last_offset = last_offset.checked_add(&length).expect("offset overflow"); buffer.push(last_offset); }); } @@ -55,3 +58,16 @@ pub(super) unsafe fn get_last_offset( debug_assert!(prefix.is_empty() && suffix.is_empty()); *offsets.get_unchecked(offsets.len() - 1) } + +#[cfg(test)] +mod tests { + use crate::transform::utils::extend_offsets; + use arrow_buffer::MutableBuffer; + + #[test] + #[should_panic(expected = "offset overflow")] + fn test_overflow() { + let mut buffer = MutableBuffer::new(10); + extend_offsets(&mut buffer, i32::MAX - 4, &[0, 5]); + } +} diff --git a/arrow-data/src/transform/variable_size.rs b/arrow-data/src/transform/variable_size.rs index 73c4783189dc..ce62459aef09 100644 --- a/arrow-data/src/transform/variable_size.rs +++ b/arrow-data/src/transform/variable_size.rs @@ -18,7 +18,7 @@ use crate::ArrayData; use arrow_buffer::{ArrowNativeType, MutableBuffer}; use num::traits::AsPrimitive; -use num::Integer; +use num::{CheckedAdd, Integer}; use super::{ Extend, _MutableArrayData, @@ -39,7 +39,9 @@ fn extend_offset_values>( buffer.extend_from_slice(new_values); } -pub(super) fn build_extend>( +pub(super) fn build_extend< + T: ArrowNativeType + Integer + CheckedAdd + AsPrimitive, +>( array: &ArrayData, ) -> Extend { let offsets = array.buffer::(0); From a110004b3d9f30358c22ac917fcad3745ea2460c Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Tue, 22 Nov 2022 15:58:51 +0000 Subject: [PATCH 0298/1411] feat: `{Field,DataType}::size` (#3149) Add a way to calculate in-memory size of `Field` and `DataType`. Closes #3147. --- arrow-schema/src/datatype.rs | 50 ++++++++++++++++++++++++++++++++++++ arrow-schema/src/field.rs | 15 +++++++++++ 2 files changed, 65 insertions(+) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 572d6f67da66..b9be4bec79d8 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -372,6 +372,56 @@ impl DataType { _ => self == other, } } + + /// Return size of this instance in bytes. + /// + /// Includes the size of `Self`. 
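// Illustrative check of the new size() accounting (a sketch, not from the patch):
// a flat type reports only its own stack size, while a nested type also adds the
// heap footprint of its child fields.
use arrow_schema::{DataType, Field};

fn nested_type_size() {
    let int = DataType::Int32;
    assert_eq!(int.size(), std::mem::size_of::<DataType>());
    let list = DataType::List(Box::new(Field::new("item", DataType::Int32, true)));
    assert!(list.size() > std::mem::size_of::<DataType>());
}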
+ pub fn size(&self) -> usize { + std::mem::size_of_val(self) + + match self { + DataType::Null + | DataType::Boolean + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Float16 + | DataType::Float32 + | DataType::Float64 + | DataType::Date32 + | DataType::Date64 + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Duration(_) + | DataType::Interval(_) + | DataType::Binary + | DataType::FixedSizeBinary(_) + | DataType::LargeBinary + | DataType::Utf8 + | DataType::LargeUtf8 + | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) => 0, + DataType::Timestamp(_, s) => { + s.as_ref().map(|s| s.capacity()).unwrap_or_default() + } + DataType::List(field) + | DataType::FixedSizeList(field, _) + | DataType::LargeList(field) + | DataType::Map(field, _) => field.size(), + DataType::Struct(fields) | DataType::Union(fields, _, _) => { + fields + .iter() + .map(|field| field.size() - std::mem::size_of_val(field)) + .sum::() + + (std::mem::size_of::() * fields.capacity()) + } + DataType::Dictionary(dt1, dt2) => dt1.size() + dt2.size(), + } + } } #[cfg(test)] diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 9eed03ed24e3..5813902ddd77 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -455,6 +455,21 @@ impl Field { } } } + + /// Return size of this instance in bytes. + /// + /// Includes the size of `Self`. + pub fn size(&self) -> usize { + std::mem::size_of_val(self) - std::mem::size_of_val(&self.data_type) + + self.data_type.size() + + self.name.capacity() + + (std::mem::size_of::<(String, String)>() * self.metadata.capacity()) + + self + .metadata + .iter() + .map(|(k, v)| k.capacity() + v.capacity()) + .sum::() + } } // TODO: improve display with crate https://crates.io/crates/derive_more ? From 6455e340168595e8c69f8d4bae59487e651bd513 Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Tue, 22 Nov 2022 22:25:36 +0530 Subject: [PATCH 0299/1411] Prevent precision=0 for decimal type (#3162) * Adding decimal precision checks * Doc edits --- arrow-array/src/array/primitive_array.rs | 27 ++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 195e2dc19a1a..487fc2a17de5 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -993,12 +993,13 @@ impl From for PrimitiveArray { impl PrimitiveArray { /// Returns a Decimal array with the same data as self, with the - /// specified precision. + /// specified precision and scale. /// /// Returns an Error if: - /// 1. `precision` is larger than `T:MAX_PRECISION` - /// 2. `scale` is larger than `T::MAX_SCALE` - /// 3. 
`scale` is > `precision` + /// - `precision` is zero + /// - `precision` is larger than `T:MAX_PRECISION` + /// - `scale` is larger than `T::MAX_SCALE` + /// - `scale` is > `precision` pub fn with_precision_and_scale( self, precision: u8, @@ -1025,18 +1026,24 @@ impl PrimitiveArray { precision: u8, scale: u8, ) -> Result<(), ArrowError> { + if precision == 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "precision cannot be 0, has to be between [1, {}]", + T::MAX_PRECISION + ))); + } if precision > T::MAX_PRECISION { return Err(ArrowError::InvalidArgumentError(format!( "precision {} is greater than max {}", precision, - Decimal128Type::MAX_PRECISION + T::MAX_PRECISION ))); } if scale > T::MAX_SCALE { return Err(ArrowError::InvalidArgumentError(format!( "scale {} is greater than max {}", scale, - Decimal128Type::MAX_SCALE + T::MAX_SCALE ))); } if scale > precision { @@ -1934,6 +1941,14 @@ mod tests { arr.validate_decimal_precision(5).unwrap(); } + #[test] + #[should_panic(expected = "precision cannot be 0, has to be between [1, 38]")] + fn test_decimal_array_with_precision_zero() { + Decimal128Array::from_iter_values([12345, 456]) + .with_precision_and_scale(0, 2) + .unwrap(); + } + #[test] #[should_panic(expected = "precision 40 is greater than max 38")] fn test_decimal_array_with_precision_and_scale_invalid_precision() { From ed1d74b718ed9a7e99de452d7fd5794f549273b6 Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Wed, 23 Nov 2022 00:28:36 +0530 Subject: [PATCH 0300/1411] Fix parquet decimal precision (#3164) --- parquet/src/arrow/arrow_reader/mod.rs | 34 +++++++++++++++++++++++++++ parquet/src/arrow/schema.rs | 9 ++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index a720d439cc91..da4b56237e14 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -2518,4 +2518,38 @@ mod tests { assert_eq!(actual.num_rows(), 1); assert_eq!(actual.column(0), &expected.column(0).slice(1, 1)); } + + #[test] + fn test_arbitary_decimal() { + let values = [1, 2, 3, 4, 5, 6, 7, 8]; + let decimals_19_0 = Decimal128Array::from_iter_values(values) + .with_precision_and_scale(19, 0) + .unwrap(); + let decimals_12_0 = Decimal128Array::from_iter_values(values) + .with_precision_and_scale(12, 0) + .unwrap(); + let decimals_17_10 = Decimal128Array::from_iter_values(values) + .with_precision_and_scale(17, 10) + .unwrap(); + + let written = RecordBatch::try_from_iter([ + ("decimal_values_19_0", Arc::new(decimals_19_0) as ArrayRef), + ("decimal_values_12_0", Arc::new(decimals_12_0) as ArrayRef), + ("decimal_values_17_10", Arc::new(decimals_17_10) as ArrayRef), + ]) + .unwrap(); + + let mut buffer = Vec::with_capacity(1024); + let mut writer = + ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); + writer.write(&written).unwrap(); + writer.close().unwrap(); + + let read = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 8) + .unwrap() + .collect::, _>>() + .unwrap(); + + assert_eq!(&written.slice(0, 8), &read[0]); + } } diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs index 07afccdb20bf..464b86d0c67d 100644 --- a/parquet/src/arrow/schema.rs +++ b/parquet/src/arrow/schema.rs @@ -233,7 +233,14 @@ pub fn parquet_to_arrow_field(parquet_column: &ColumnDescriptor) -> Result usize { - (10.0_f64.powi(precision as i32).log2() / 8.0).ceil() as usize + // digits = floor(log_10(2^(8*n - 1) - 1)) // definition in parquet's logical types + // 
ceil(digits) = log10(2^(8*n - 1) - 1) + // 10^ceil(digits) = 2^(8*n - 1) - 1 + // 10^ceil(digits) + 1 = 2^(8*n - 1) + // log2(10^ceil(digits) + 1) = (8*n - 1) + // log2(10^ceil(digits) + 1) + 1 = 8*n + // (log2(10^ceil(a) + 1) + 1) / 8 = n + (((10.0_f64.powi(precision as i32) + 1.0).log2() + 1.0) / 8.0).ceil() as usize } /// Convert an arrow field to a parquet `Type` From 00e5542b9863cd6826936365b52f35ff804c6a22 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 23 Nov 2022 05:46:46 +0000 Subject: [PATCH 0301/1411] Extend Decimal256 as Primitive (#3156) --- arrow-data/src/transform/fixed_binary.rs | 2 -- arrow-data/src/transform/mod.rs | 16 +++++++--------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/arrow-data/src/transform/fixed_binary.rs b/arrow-data/src/transform/fixed_binary.rs index fe21a6bc382d..a20901014c5d 100644 --- a/arrow-data/src/transform/fixed_binary.rs +++ b/arrow-data/src/transform/fixed_binary.rs @@ -22,7 +22,6 @@ use arrow_schema::DataType; pub(super) fn build_extend(array: &ArrayData) -> Extend { let size = match array.data_type() { DataType::FixedSizeBinary(i) => *i as usize, - DataType::Decimal256(_, _) => 32, _ => unreachable!(), }; @@ -58,7 +57,6 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { let size = match mutable.data_type { DataType::FixedSizeBinary(i) => i as usize, - DataType::Decimal256(_, _) => 32, _ => unreachable!(), }; diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index c34376aaba29..6a8c89d25a22 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -20,7 +20,7 @@ use super::{ ArrayData, ArrayDataBuilder, }; use crate::bit_mask::set_bits; -use arrow_buffer::{bit_util, ArrowNativeType, MutableBuffer}; +use arrow_buffer::{bit_util, i256, ArrowNativeType, MutableBuffer}; use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode}; use half::f16; use num::Integer; @@ -186,7 +186,6 @@ fn build_extend_dictionary( fn build_extend(array: &ArrayData) -> Extend { match array.data_type() { - DataType::Decimal128(_, _) => primitive::build_extend::(array), DataType::Null => null::build_extend(array), DataType::Boolean => boolean::build_extend(array), DataType::UInt8 => primitive::build_extend::(array), @@ -214,6 +213,8 @@ fn build_extend(array: &ArrayData) -> Extend { DataType::Interval(IntervalUnit::MonthDayNano) => { primitive::build_extend::(array) } + DataType::Decimal128(_, _) => primitive::build_extend::(array), + DataType::Decimal256(_, _) => primitive::build_extend::(array), DataType::Utf8 | DataType::Binary => variable_size::build_extend::(array), DataType::LargeUtf8 | DataType::LargeBinary => { variable_size::build_extend::(array) @@ -222,9 +223,7 @@ fn build_extend(array: &ArrayData) -> Extend { DataType::LargeList(_) => list::build_extend::(array), DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"), DataType::Struct(_) => structure::build_extend(array), - DataType::FixedSizeBinary(_) | DataType::Decimal256(_, _) => { - fixed_binary::build_extend(array) - } + DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array), DataType::Float16 => primitive::build_extend::(array), DataType::FixedSizeList(_, _) => fixed_size_list::build_extend(array), DataType::Union(_, _, mode) => match mode { @@ -236,7 +235,6 @@ fn build_extend(array: &ArrayData) -> Extend { fn build_extend_nulls(data_type: 
&DataType) -> ExtendNulls { Box::new(match data_type { - DataType::Decimal128(_, _) => primitive::extend_nulls::, DataType::Null => null::extend_nulls, DataType::Boolean => boolean::extend_nulls, DataType::UInt8 => primitive::extend_nulls::, @@ -258,6 +256,8 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { | DataType::Duration(_) | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::, DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::, + DataType::Decimal128(_, _) => primitive::extend_nulls::, + DataType::Decimal256(_, _) => primitive::extend_nulls::, DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::, DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::, @@ -274,9 +274,7 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { _ => unreachable!(), }, DataType::Struct(_) => structure::extend_nulls, - DataType::FixedSizeBinary(_) | DataType::Decimal256(_, _) => { - fixed_binary::extend_nulls - } + DataType::FixedSizeBinary(_) => fixed_binary::extend_nulls, DataType::Float16 => primitive::extend_nulls::, DataType::FixedSizeList(_, _) => fixed_size_list::extend_nulls, DataType::Union(_, _, mode) => match mode { From ca92306f801ca39372a4b4c8d9b4d430ead38f64 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 23 Nov 2022 05:47:33 +0000 Subject: [PATCH 0302/1411] Add Decimal128, Decimal256, Float16 to DataType::is_numeric (#3121) --- arrow-schema/src/datatype.rs | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index b9be4bec79d8..cf85902e4ce7 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -263,30 +263,13 @@ impl fmt::Display for DataType { impl DataType { /// Returns true if the type is primitive: (numeric, temporal). + #[inline] pub fn is_primitive(t: &DataType) -> bool { - use DataType::*; - matches!( - t, - Int8 | Int16 - | Int32 - | Int64 - | UInt8 - | UInt16 - | UInt32 - | UInt64 - | Float32 - | Float64 - | Date32 - | Date64 - | Time32(_) - | Time64(_) - | Timestamp(_, _) - | Interval(_) - | Duration(_) - ) + Self::is_numeric(t) || Self::is_temporal(t) } - /// Returns true if this type is numeric: (UInt*, Int*, or Float*). + /// Returns true if this type is numeric: (UInt*, Int*, Float*, Decimal*). + #[inline] pub fn is_numeric(t: &DataType) -> bool { use DataType::*; matches!( @@ -299,12 +282,16 @@ impl DataType { | Int16 | Int32 | Int64 + | Float16 | Float32 | Float64 + | Decimal128(_, _) + | Decimal256(_, _) ) } /// Returns true if this type is temporal: (Date*, Time*, Duration, or Interval). 
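// Quick illustration of the classification change above (a sketch, not from the
// patch): decimals and Float16 now count as numeric, and any numeric or temporal
// type counts as primitive.
use arrow_schema::DataType;

fn classification() {
    assert!(DataType::is_numeric(&DataType::Decimal128(10, 2)));
    assert!(DataType::is_numeric(&DataType::Float16));
    assert!(DataType::is_primitive(&DataType::Decimal256(76, 10)));
    assert!(!DataType::is_numeric(&DataType::Utf8));
}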
+ #[inline] pub fn is_temporal(t: &DataType) -> bool { use DataType::*; matches!( @@ -320,6 +307,7 @@ impl DataType { } /// Returns true if this type is valid as a dictionary key + #[inline] pub fn is_dictionary_key_type(t: &DataType) -> bool { use DataType::*; matches!( From a6f140f79377eb02b4bec08177d940a3918c65d8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 23 Nov 2022 07:58:52 +0000 Subject: [PATCH 0303/1411] Remove unnecessary Buffer::from_slice_ref reference (#3161) * Remove unnecessary Buffer::from_slice_ref reference * Clippy * More clippy --- arrow-array/src/array/binary_array.rs | 38 +++++----- .../src/array/fixed_size_binary_array.rs | 10 +-- .../src/array/fixed_size_list_array.rs | 8 +- arrow-array/src/array/list_array.rs | 30 ++++---- arrow-array/src/array/primitive_array.rs | 6 +- arrow-array/src/array/string_array.rs | 14 ++-- arrow-array/src/array/union_array.rs | 20 ++--- .../src/builder/generic_list_builder.rs | 10 +-- arrow-array/src/builder/map_builder.rs | 6 +- arrow-array/src/builder/struct_builder.rs | 6 +- arrow-buffer/src/buffer/immutable.rs | 2 +- arrow-cast/src/cast.rs | 14 ++-- arrow-data/src/data.rs | 6 +- arrow-integration-test/src/lib.rs | 2 +- arrow-ipc/src/reader.rs | 8 +- arrow-ipc/src/writer.rs | 4 +- arrow-json/src/reader.rs | 8 +- arrow-select/src/filter.rs | 12 +-- arrow-select/src/take.rs | 2 +- arrow/src/compute/kernels/comparison.rs | 2 +- arrow/src/compute/kernels/limit.rs | 6 +- arrow/src/ffi.rs | 2 +- arrow/src/util/pretty.rs | 2 +- arrow/tests/array_transform.rs | 9 +-- arrow/tests/array_validation.rs | 76 +++++++++---------- 25 files changed, 150 insertions(+), 153 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 2ca8a061a6fa..0b526ecb3dee 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -296,8 +296,8 @@ mod tests { // Array data: ["hello", "", "parquet"] let array_data = ArrayData::builder(DataType::Binary) .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) + .add_buffer(Buffer::from_slice_ref(offsets)) + .add_buffer(Buffer::from_slice_ref(values)) .build() .unwrap(); let binary_array = BinaryArray::from(array_data); @@ -335,8 +335,8 @@ mod tests { let array_data = ArrayData::builder(DataType::Binary) .len(2) .offset(1) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) + .add_buffer(Buffer::from_slice_ref(offsets)) + .add_buffer(Buffer::from_slice_ref(values)) .build() .unwrap(); let binary_array = BinaryArray::from(array_data); @@ -360,8 +360,8 @@ mod tests { // Array data: ["hello", "", "parquet"] let array_data = ArrayData::builder(DataType::LargeBinary) .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) + .add_buffer(Buffer::from_slice_ref(offsets)) + .add_buffer(Buffer::from_slice_ref(values)) .build() .unwrap(); let binary_array = LargeBinaryArray::from(array_data); @@ -399,8 +399,8 @@ mod tests { let array_data = ArrayData::builder(DataType::LargeBinary) .len(2) .offset(1) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) + .add_buffer(Buffer::from_slice_ref(offsets)) + .add_buffer(Buffer::from_slice_ref(values)) .build() .unwrap(); let binary_array = LargeBinaryArray::from(array_data); @@ -429,8 +429,8 @@ mod tests { // Array data: ["hello", "", "parquet"] let array_data1 = 
ArrayData::builder(GenericBinaryArray::::DATA_TYPE) .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) + .add_buffer(Buffer::from_slice_ref(offsets)) + .add_buffer(Buffer::from_slice_ref(values)) .build() .unwrap(); let binary_array1 = GenericBinaryArray::::from(array_data1); @@ -441,7 +441,7 @@ mod tests { let array_data2 = ArrayData::builder(data_type) .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) + .add_buffer(Buffer::from_slice_ref(offsets)) .add_child_data(child_data) .build() .unwrap(); @@ -484,7 +484,7 @@ mod tests { .unwrap(); let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); - let null_buffer = Buffer::from_slice_ref(&[0b101]); + let null_buffer = Buffer::from_slice_ref([0b101]); let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( Field::new("item", DataType::UInt8, false), )); @@ -493,7 +493,7 @@ mod tests { let array_data = ArrayData::builder(data_type) .len(2) .offset(1) - .add_buffer(Buffer::from_slice_ref(&offsets)) + .add_buffer(Buffer::from_slice_ref(offsets)) .null_bit_buffer(Some(null_buffer)) .add_child_data(child_data) .build() @@ -525,7 +525,7 @@ mod tests { let child_data = ArrayData::builder(DataType::UInt8) .len(10) .add_buffer(Buffer::from(&values[..])) - .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010]))) + .null_bit_buffer(Some(Buffer::from_slice_ref([0b1010101010]))) .build() .unwrap(); @@ -537,7 +537,7 @@ mod tests { // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) .len(2) - .add_buffer(Buffer::from_slice_ref(&offsets)) + .add_buffer(Buffer::from_slice_ref(offsets)) .add_child_data(child_data) .build() .unwrap(); @@ -617,7 +617,7 @@ mod tests { let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; let values_data = ArrayData::builder(DataType::UInt32) .len(12) - .add_buffer(Buffer::from_slice_ref(&values)) + .add_buffer(Buffer::from_slice_ref(values)) .build() .unwrap(); let offsets: [i32; 4] = [0, 5, 5, 12]; @@ -626,7 +626,7 @@ mod tests { DataType::List(Box::new(Field::new("item", DataType::UInt32, false))); let array_data = ArrayData::builder(data_type) .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) + .add_buffer(Buffer::from_slice_ref(offsets)) .add_child_data(values_data) .build() .unwrap(); @@ -644,8 +644,8 @@ mod tests { let offsets: [i32; 4] = [0, 5, 5, 12]; let array_data = ArrayData::builder(DataType::Binary) .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) + .add_buffer(Buffer::from_slice_ref(offsets)) + .add_buffer(Buffer::from_slice_ref(values)) .build() .unwrap(); let binary_array = BinaryArray::from(array_data); diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 9bac49810301..245cf522810d 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -539,7 +539,7 @@ mod tests { let values_data = ArrayData::builder(DataType::UInt8) .len(12) .offset(2) - .add_buffer(Buffer::from_slice_ref(&values)) + .add_buffer(Buffer::from_slice_ref(values)) .build() .unwrap(); // [null, [10, 11, 12, 13]] @@ -551,7 +551,7 @@ mod tests { .len(2) .offset(1) .add_child_data(values_data) - .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b101]))) + .null_bit_buffer(Some(Buffer::from_slice_ref([0b101]))) .build_unchecked() }; let list_array = FixedSizeListArray::from(array_data); @@ -575,7 +575,7 @@ mod tests { let values: [u32; 12] = [0, 1, 2, 
3, 4, 5, 6, 7, 8, 9, 10, 11]; let values_data = ArrayData::builder(DataType::UInt32) .len(12) - .add_buffer(Buffer::from_slice_ref(&values)) + .add_buffer(Buffer::from_slice_ref(values)) .build() .unwrap(); @@ -598,8 +598,8 @@ mod tests { let values = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; let values_data = ArrayData::builder(DataType::UInt8) .len(12) - .add_buffer(Buffer::from_slice_ref(&values)) - .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b101010101010]))) + .add_buffer(Buffer::from_slice_ref(values)) + .null_bit_buffer(Some(Buffer::from_slice_ref([0b101010101010]))) .build() .unwrap(); diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index c536a422e82f..ca1dee35c41e 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -245,7 +245,7 @@ mod tests { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(9) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8])) .build() .unwrap(); @@ -320,7 +320,7 @@ mod tests { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build() .unwrap(); @@ -343,7 +343,7 @@ mod tests { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) .build() .unwrap(); @@ -405,7 +405,7 @@ mod tests { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) .build() .unwrap(); diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 17691bb324ae..54699749f2ff 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -416,13 +416,13 @@ mod tests { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build() .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] - let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 8]); + let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two let list_data_type = @@ -506,13 +506,13 @@ mod tests { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build() .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] - let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 6, 8]); + let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 8]); // Construct a list array from the above two let list_data_type = @@ -596,13 +596,13 @@ mod tests { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 
9])) .build() .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] - let value_offsets = Buffer::from_slice_ref(&[0, 2, 2, 2, 4, 6, 6, 9, 9, 10]); + let value_offsets = Buffer::from_slice_ref([0, 2, 2, 2, 4, 6, 6, 9, 9, 10]); // 01011001 00000001 let mut null_bits: [u8; 2] = [0; 2]; bit_util::set_bit(&mut null_bits, 0); @@ -660,13 +660,13 @@ mod tests { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) .build() .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] - let value_offsets = Buffer::from_slice_ref(&[0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]); + let value_offsets = Buffer::from_slice_ref([0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]); // 01011001 00000001 let mut null_bits: [u8; 2] = [0; 2]; bit_util::set_bit(&mut null_bits, 0); @@ -727,13 +727,13 @@ mod tests { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) .build() .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] - let value_offsets = Buffer::from_slice_ref(&[0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]); + let value_offsets = Buffer::from_slice_ref([0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]); // 01011001 00000001 let mut null_bits: [u8; 2] = [0; 2]; bit_util::set_bit(&mut null_bits, 0); @@ -768,7 +768,7 @@ mod tests { let value_data = unsafe { ArrayData::builder(DataType::Int32) .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build_unchecked() }; let list_data_type = @@ -790,7 +790,7 @@ mod tests { // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] fn test_list_array_invalid_child_array_len() { - let value_offsets = Buffer::from_slice_ref(&[0, 2, 5, 7]); + let value_offsets = Buffer::from_slice_ref([0, 2, 5, 7]); let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { @@ -818,11 +818,11 @@ mod tests { fn test_list_array_offsets_need_not_start_at_zero() { let value_data = ArrayData::builder(DataType::Int32) .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build() .unwrap(); - let value_offsets = Buffer::from_slice_ref(&[2, 2, 5, 7]); + let value_offsets = Buffer::from_slice_ref([2, 2, 5, 7]); let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); @@ -865,7 +865,7 @@ mod tests { let values: [i32; 8] = [0; 8]; let value_data = unsafe { ArrayData::builder(DataType::Int32) - .add_buffer(Buffer::from_slice_ref(&values)) + .add_buffer(Buffer::from_slice_ref(values)) .build_unchecked() }; diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 487fc2a17de5..f34c899e2265 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1139,7 +1139,7 @@ mod tests { #[test] fn test_primitive_array_from_vec() { - let buf = Buffer::from_slice_ref(&[0, 1, 2, 3, 
4]); + let buf = Buffer::from_slice_ref([0, 1, 2, 3, 4]); let arr = Int32Array::from(vec![0, 1, 2, 3, 4]); assert_eq!(buf, arr.data.buffers()[0]); assert_eq!(5, arr.len()); @@ -1638,7 +1638,7 @@ mod tests { #[test] fn test_primitive_array_builder() { // Test building a primitive array with ArrayData builder and offset - let buf = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4, 5, 6]); + let buf = Buffer::from_slice_ref([0i32, 1, 2, 3, 4, 5, 6]); let buf2 = buf.clone(); let data = ArrayData::builder(DataType::Int32) .len(5) @@ -1707,7 +1707,7 @@ mod tests { // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] fn test_primitive_array_invalid_buffer_len() { - let buffer = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4]); + let buffer = Buffer::from_slice_ref([0i32, 1, 2, 3, 4]); let data = unsafe { ArrayData::builder(DataType::Int32) .add_buffer(buffer.clone()) diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 94fcbae02e5d..8d92093f5ce8 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -371,8 +371,8 @@ mod tests { let offsets: [i32; 4] = [0, 5, 5, 12]; let array_data = ArrayData::builder(DataType::Utf8) .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) + .add_buffer(Buffer::from_slice_ref(offsets)) + .add_buffer(Buffer::from_slice_ref(values)) .build() .unwrap(); let string_array = StringArray::from(array_data); @@ -548,7 +548,7 @@ mod tests { .unwrap(); let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); - let null_buffer = Buffer::from_slice_ref(&[0b101]); + let null_buffer = Buffer::from_slice_ref([0b101]); let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( Field::new("item", DataType::UInt8, false), )); @@ -557,7 +557,7 @@ mod tests { let array_data = ArrayData::builder(data_type) .len(2) .offset(1) - .add_buffer(Buffer::from_slice_ref(&offsets)) + .add_buffer(Buffer::from_slice_ref(offsets)) .null_bit_buffer(Some(null_buffer)) .add_child_data(child_data) .build() @@ -589,7 +589,7 @@ mod tests { let child_data = ArrayData::builder(DataType::UInt8) .len(10) .add_buffer(Buffer::from(&values[..])) - .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010]))) + .null_bit_buffer(Some(Buffer::from_slice_ref([0b1010101010]))) .build() .unwrap(); @@ -601,7 +601,7 @@ mod tests { // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) .len(2) - .add_buffer(Buffer::from_slice_ref(&offsets)) + .add_buffer(Buffer::from_slice_ref(offsets)) .add_child_data(child_data) .build() .unwrap(); @@ -636,7 +636,7 @@ mod tests { let array_data = ArrayData::builder(data_type) .len(2) - .add_buffer(Buffer::from_slice_ref(&offsets)) + .add_buffer(Buffer::from_slice_ref(offsets)) .add_child_data(child_data) .build() .unwrap(); diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index f62a84cf03ce..c8ccfdc073f2 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -418,15 +418,15 @@ mod tests { // Check data assert_eq!( union.data().child_data()[0].buffers()[0], - Buffer::from_slice_ref(&[1_i32, 4, 6]) + Buffer::from_slice_ref([1_i32, 4, 6]) ); assert_eq!( union.data().child_data()[1].buffers()[0], - Buffer::from_slice_ref(&[2_i32, 7]) + Buffer::from_slice_ref([2_i32, 7]) ); assert_eq!( union.data().child_data()[2].buffers()[0], - Buffer::from_slice_ref(&[3_i32, 5]), + Buffer::from_slice_ref([3_i32, 5]), ); 
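// Sketch of what the relaxed signature allows (not part of the diff): the helper
// now takes any `T: AsRef<[U]>` by value, so arrays can be passed directly while
// the old by-reference call sites keep compiling.
use arrow_buffer::Buffer;

fn buffers_compare_equal() {
    let by_value = Buffer::from_slice_ref([1_i32, 2, 3]);
    let by_ref = Buffer::from_slice_ref(&[1_i32, 2, 3]);
    assert_eq!(by_value, by_ref);
}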
assert_eq!(expected_array_values.len(), union.len()); @@ -627,8 +627,8 @@ mod tests { let type_ids = [1_i8, 0, 0, 2, 0, 1]; let value_offsets = [0_i32, 0, 1, 0, 2, 1]; - let type_id_buffer = Buffer::from_slice_ref(&type_ids); - let value_offsets_buffer = Buffer::from_slice_ref(&value_offsets); + let type_id_buffer = Buffer::from_slice_ref(type_ids); + let value_offsets_buffer = Buffer::from_slice_ref(value_offsets); let children: Vec<(Field, Arc)> = vec![ ( @@ -650,14 +650,14 @@ mod tests { .unwrap(); // Check type ids - assert_eq!(Buffer::from_slice_ref(&type_ids), array.data().buffers()[0]); + assert_eq!(Buffer::from_slice_ref(type_ids), array.data().buffers()[0]); for (i, id) in type_ids.iter().enumerate() { assert_eq!(id, &array.type_id(i)); } // Check offsets assert_eq!( - Buffer::from_slice_ref(&value_offsets), + Buffer::from_slice_ref(value_offsets), array.data().buffers()[1] ); for (i, id) in value_offsets.iter().enumerate() { @@ -738,14 +738,14 @@ mod tests { // Check data assert_eq!( union.data().child_data()[0].buffers()[0], - Buffer::from_slice_ref(&[1_i32, 0, 0, 4, 0, 6, 0]), + Buffer::from_slice_ref([1_i32, 0, 0, 4, 0, 6, 0]), ); assert_eq!( - Buffer::from_slice_ref(&[0_i32, 2_i32, 0, 0, 0, 0, 7]), + Buffer::from_slice_ref([0_i32, 2_i32, 0, 0, 0, 0, 7]), union.data().child_data()[1].buffers()[0] ); assert_eq!( - Buffer::from_slice_ref(&[0_i32, 0, 3_i32, 0, 5, 0, 0]), + Buffer::from_slice_ref([0_i32, 0, 3_i32, 0, 5, 0, 0]), union.data().child_data()[2].buffers()[0] ); diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 3f5892ff037d..f0775797128a 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -176,9 +176,9 @@ mod tests { let list_array = builder.finish(); let values = list_array.values().data().buffers()[0].clone(); - assert_eq!(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]), values); + assert_eq!(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7]), values); assert_eq!( - Buffer::from_slice_ref(&[0, 3, 6, 8].map(|n| O::from_usize(n).unwrap())), + Buffer::from_slice_ref([0, 3, 6, 8].map(|n| O::from_usize(n).unwrap())), list_array.data().buffers()[0].clone() ); assert_eq!(DataType::Int32, list_array.value_type()); @@ -296,21 +296,21 @@ mod tests { assert_eq!(4, list_array.len()); assert_eq!(1, list_array.null_count()); assert_eq!( - Buffer::from_slice_ref(&[0, 2, 5, 5, 6]), + Buffer::from_slice_ref([0, 2, 5, 5, 6]), list_array.data().buffers()[0].clone() ); assert_eq!(6, list_array.values().data().len()); assert_eq!(1, list_array.values().data().null_count()); assert_eq!( - Buffer::from_slice_ref(&[0, 2, 4, 7, 7, 8, 10]), + Buffer::from_slice_ref([0, 2, 4, 7, 7, 8, 10]), list_array.values().data().buffers()[0].clone() ); assert_eq!(10, list_array.values().data().child_data()[0].len()); assert_eq!(0, list_array.values().data().child_data()[0].null_count()); assert_eq!( - Buffer::from_slice_ref(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + Buffer::from_slice_ref([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), list_array.values().data().child_data()[0].buffers()[0].clone() ); } diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 78f49550071a..71ca8480a2a9 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -228,15 +228,15 @@ mod tests { let expected_string_data = ArrayData::builder(DataType::Utf8) .len(4) .null_bit_buffer(Some(Buffer::from(&[9_u8]))) - 
.add_buffer(Buffer::from_slice_ref(&[0, 3, 3, 3, 7])) + .add_buffer(Buffer::from_slice_ref([0, 3, 3, 3, 7])) .add_buffer(Buffer::from_slice_ref(b"joemark")) .build() .unwrap(); let expected_int_data = ArrayData::builder(DataType::Int32) .len(4) - .null_bit_buffer(Some(Buffer::from_slice_ref(&[11_u8]))) - .add_buffer(Buffer::from_slice_ref(&[1, 2, 0, 4])) + .null_bit_buffer(Some(Buffer::from_slice_ref([11_u8]))) + .add_buffer(Buffer::from_slice_ref([1, 2, 0, 4])) .build() .unwrap(); diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 1cb04aa6f786..f00f81d1a5c0 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -300,15 +300,15 @@ mod tests { let expected_string_data = ArrayData::builder(DataType::Utf8) .len(4) .null_bit_buffer(Some(Buffer::from(&[9_u8]))) - .add_buffer(Buffer::from_slice_ref(&[0, 3, 3, 3, 7])) + .add_buffer(Buffer::from_slice_ref([0, 3, 3, 3, 7])) .add_buffer(Buffer::from_slice_ref(b"joemark")) .build() .unwrap(); let expected_int_data = ArrayData::builder(DataType::Int32) .len(4) - .null_bit_buffer(Some(Buffer::from_slice_ref(&[11_u8]))) - .add_buffer(Buffer::from_slice_ref(&[1, 2, 0, 4])) + .null_bit_buffer(Some(Buffer::from_slice_ref([11_u8]))) + .add_buffer(Buffer::from_slice_ref([1, 2, 0, 4])) .build() .unwrap(); diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index d5d7cd8ef8c7..4048787c6a1f 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -55,7 +55,7 @@ impl Buffer { } /// Initializes a [Buffer] from a slice of items. - pub fn from_slice_ref>(items: &T) -> Self { + pub fn from_slice_ref>(items: T) -> Self { let slice = items.as_ref(); let capacity = slice.len() * std::mem::size_of::(); let mut buffer = MutableBuffer::with_capacity(capacity); diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 79c23bfac897..3bf97cf7ade4 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -4460,7 +4460,7 @@ mod tests { .data() .clone(); - let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 8]); + let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two // [[0,0,0], [-1, -2, -1], [2, 100000000]] @@ -4525,7 +4525,7 @@ mod tests { .data() .clone(); - let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 9]); + let value_offsets = Buffer::from_slice_ref([0, 3, 6, 9]); // Construct a list array from the above two let list_data_type = @@ -6765,13 +6765,13 @@ mod tests { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build() .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] - let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 8]); + let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two let list_data_type = @@ -6789,13 +6789,13 @@ mod tests { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build() .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] - let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 6, 8]); + let 
value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 8]); // Construct a list array from the above two let list_data_type = @@ -7007,7 +7007,7 @@ mod tests { #[test] fn test_list_to_string() { let str_array = StringArray::from(vec!["a", "b", "c", "d", "e", "f", "g", "h"]); - let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 8]); + let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); let value_data = ArrayData::builder(DataType::Utf8) .len(str_array.len()) .buffers(str_array.data().buffers().to_vec()) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 902bfbf67239..811696e4dd17 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -1575,7 +1575,7 @@ mod tests { 5, None, 0, - vec![Buffer::from_slice_ref(&[1i32, 2, 3, 4, 5])], + vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])], vec![], ) .unwrap(); @@ -1690,8 +1690,8 @@ mod tests { assert!(!int_data.ptr_eq(&int_data_slice)); assert!(!int_data_slice.ptr_eq(&int_data)); - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[0_i32, 2_i32, 2_i32, 5_i32]); + let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]); let string_data = ArrayData::try_new( DataType::Utf8, 3, diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index 75b76af1e6fc..a0510edd94b6 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -1262,7 +1262,7 @@ mod tests { let utf8s = StringArray::from(vec![Some("aa"), None, Some("bbb")]); let value_data = Int32Array::from(vec![None, Some(2), None, None]); - let value_offsets = Buffer::from_slice_ref(&[0, 3, 4, 4]); + let value_offsets = Buffer::from_slice_ref([0, 3, 4, 4]); let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 0165c775d5a3..e697a89d01aa 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -525,7 +525,7 @@ fn get_aligned_buffer(buffer: &Buffer, length: usize) -> Buffer { if align_offset != 0 { let len_in_bytes = (length * std::mem::size_of::()).min(buffer.len()); let slice = &buffer.as_slice()[0..len_in_bytes]; - Buffer::from_slice_ref(&slice) + Buffer::from_slice_ref(slice) } else { buffer.clone() } @@ -1282,9 +1282,7 @@ mod tests { let array8_values = ArrayData::builder(DataType::Int32) .len(9) - .add_buffer(Buffer::from_slice_ref(&[ - 40, 41, 42, 43, 44, 45, 46, 47, 48, - ])) + .add_buffer(Buffer::from_slice_ref([40, 41, 42, 43, 44, 45, 46, 47, 48])) .build() .unwrap(); let array8_data = ArrayData::builder(schema.field(8).data_type().clone()) @@ -1593,7 +1591,7 @@ mod tests { false, ); - let entry_offsets = Buffer::from_slice_ref(&[0, 2, 4, 6]); + let entry_offsets = Buffer::from_slice_ref([0, 2, 4, 6]); let map_data = ArrayData::builder(map_data_type) .len(3) .add_buffer(entry_offsets) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 44f32f0cbcf1..dec44de177f3 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1557,8 +1557,8 @@ mod tests { let dctfield = Field::new_dict("dict", array.data_type().clone(), false, 2, false); - let types = Buffer::from_slice_ref(&[0_i8, 0, 0]); - let offsets = Buffer::from_slice_ref(&[0_i32, 1, 2]); + let types = Buffer::from_slice_ref([0_i8, 0, 0]); + let offsets = Buffer::from_slice_ref([0_i32, 1, 2]); let union = 
UnionArray::try_new(&[0], types, Some(offsets), vec![(dctfield, array)]) diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index 860e6b58c4ac..646d9c0d1975 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -2194,7 +2194,7 @@ mod tests { // test that the list offsets are correct assert_eq!( cc.data().buffers()[0], - Buffer::from_slice_ref(&[0i32, 2, 2, 4, 5]) + Buffer::from_slice_ref([0i32, 2, 2, 4, 5]) ); let cc = cc.values(); let cc = cc.as_any().downcast_ref::().unwrap(); @@ -2215,7 +2215,7 @@ mod tests { // test that the list offsets are correct assert_eq!( dd.data().buffers()[0], - Buffer::from_slice_ref(&[0i32, 1, 1, 2, 6]) + Buffer::from_slice_ref([0i32, 1, 1, 2, 6]) ); let dd = dd.values(); let dd = dd.as_any().downcast_ref::().unwrap(); @@ -2343,7 +2343,7 @@ mod tests { .unwrap(); let a_list = ArrayDataBuilder::new(a_field.data_type().clone()) .len(6) - .add_buffer(Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6, 7])) + .add_buffer(Buffer::from_slice_ref([0i32, 2, 3, 6, 6, 6, 7])) .add_child_data(a) .null_bit_buffer(Some(Buffer::from(vec![0b00110111]))) .build() @@ -2359,7 +2359,7 @@ mod tests { let expected = expected.as_any().downcast_ref::().unwrap(); assert_eq!( read.data().buffers()[0], - Buffer::from_slice_ref(&[0i32, 2, 3, 6, 6, 6, 7]) + Buffer::from_slice_ref([0i32, 2, 3, 6, 6, 6, 7]) ); // compare list null buffers assert_eq!(read.data().null_buffer(), expected.data().null_buffer()); diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index f454397647c3..41d93aefa31b 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -924,11 +924,11 @@ mod tests { fn test_filter_list_array() { let value_data = ArrayData::builder(DataType::Int32) .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build() .unwrap(); - let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 6, 8, 8]); + let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 8, 8]); let list_data_type = DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); @@ -948,11 +948,11 @@ mod tests { // expected: [[3, 4, 5], null] let value_data = ArrayData::builder(DataType::Int32) .len(3) - .add_buffer(Buffer::from_slice_ref(&[3, 4, 5])) + .add_buffer(Buffer::from_slice_ref([3, 4, 5])) .build() .unwrap(); - let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 3]); + let value_offsets = Buffer::from_slice_ref([0i64, 3, 3]); let list_data_type = DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); @@ -1305,7 +1305,7 @@ mod tests { fn test_filter_fixed_size_list_arrays() { let value_data = ArrayData::builder(DataType::Int32) .len(9) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8])) .build() .unwrap(); let list_data_type = DataType::FixedSizeList( @@ -1355,7 +1355,7 @@ mod tests { fn test_filter_fixed_size_list_arrays_with_null() { let value_data = ArrayData::builder(DataType::Int32) .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) .build() .unwrap(); diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 4af876a79dcc..d498ae487c3e 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -1846,7 +1846,7 @@ mod tests { .data() .clone(); // Construct offsets - let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 8]); + let 
value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 05c8b7aa6156..7423b13bc07c 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -4026,7 +4026,7 @@ mod tests { ]) .data() .clone(); - let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 6, 6, 9]); + let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 6, 9]); let list_data_type = DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) diff --git a/arrow/src/compute/kernels/limit.rs b/arrow/src/compute/kernels/limit.rs index 07cf727b09d4..7b8f519cf6ac 100644 --- a/arrow/src/compute/kernels/limit.rs +++ b/arrow/src/compute/kernels/limit.rs @@ -91,13 +91,13 @@ mod tests { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) .build() .unwrap(); // Construct a buffer for value offsets, for the nested array: // [[0, 1], null, [2, 3], null, [4, 5], null, [6, 7, 8], null, [9]] - let value_offsets = Buffer::from_slice_ref(&[0, 2, 2, 4, 4, 6, 6, 9, 9, 10]); + let value_offsets = Buffer::from_slice_ref([0, 2, 2, 4, 4, 6, 6, 9, 9, 10]); // 01010101 00000001 let mut null_bits: [u8; 2] = [0; 2]; bit_util::set_bit(&mut null_bits, 0); @@ -150,7 +150,7 @@ mod tests { .unwrap(); let int_data = ArrayData::builder(DataType::Int32) .len(5) - .add_buffer(Buffer::from_slice_ref(&[0, 28, 42, 0, 0])) + .add_buffer(Buffer::from_slice_ref([0, 28, 42, 0, 0])) .null_bit_buffer(Some(Buffer::from([0b00000110]))) .build() .unwrap(); diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 03c265318185..fc8dc654af0c 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -1017,7 +1017,7 @@ mod tests { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build() .unwrap(); diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs index c98c8a649cb5..b1a07dfee8bc 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow/src/util/pretty.rs @@ -874,7 +874,7 @@ mod tests { // Can't use UnionBuilder with non-primitive types, so manually build outer UnionArray let a_array = Int32Array::from(vec![None, None, None, Some(1234), Some(23)]); - let type_ids = Buffer::from_slice_ref(&[1_i8, 1, 0, 0, 1]); + let type_ids = Buffer::from_slice_ref([1_i8, 1, 0, 0, 1]); let children: Vec<(Field, Arc)> = vec![ (Field::new("a", DataType::Int32, true), Arc::new(a_array)), diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 03942be10e01..42f9ab277d40 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -557,8 +557,7 @@ fn test_list_append() { Some(14), Some(15), ]); - let list_value_offsets = - Buffer::from_slice_ref(&[0i32, 3, 5, 11, 13, 13, 15, 15, 17]); + let list_value_offsets = Buffer::from_slice_ref([0i32, 3, 5, 11, 13, 13, 15, 15, 17]); let expected_list_data = ArrayData::try_new( DataType::List(Box::new(Field::new("item", DataType::Int64, true))), 8, @@ -637,7 +636,7 @@ fn test_list_nulls_append() { Some(15), ]); let list_value_offsets = 
- Buffer::from_slice_ref(&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); + Buffer::from_slice_ref([0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); let expected_list_data = ArrayData::try_new( DataType::List(Box::new(Field::new("item", DataType::Int64, true))), 12, @@ -772,7 +771,7 @@ fn test_map_nulls_append() { ]); let map_offsets = - Buffer::from_slice_ref(&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); + Buffer::from_slice_ref([0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); let expected_list_data = ArrayData::try_new( DataType::Map( @@ -852,7 +851,7 @@ fn test_list_of_strings_append() { None, // extend b[0..0] ]); - let list_value_offsets = Buffer::from_slice_ref(&[0, 3, 5, 6, 9, 10, 13]); + let list_value_offsets = Buffer::from_slice_ref([0, 3, 5, 6, 9, 10, 13]); let expected_list_data = ArrayData::try_new( DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), 6, diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 16f031a1eb15..4faf69658e6a 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -32,7 +32,7 @@ use std::sync::Arc; expected = "Need at least 80 bytes in buffers[0] in array of type Int64, but got 8" )] fn test_buffer_too_small() { - let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + let buffer = Buffer::from_slice_ref([0i32, 2i32]); // should fail as the declared size (10*8 = 80) is larger than the underlying bfufer (8) ArrayData::try_new(DataType::Int64, 10, None, 0, vec![buffer], vec![]).unwrap(); } @@ -42,7 +42,7 @@ fn test_buffer_too_small() { expected = "Need at least 16 bytes in buffers[0] in array of type Int64, but got 8" )] fn test_buffer_too_small_offset() { - let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + let buffer = Buffer::from_slice_ref([0i32, 2i32]); // should fail -- size is ok, but also has offset ArrayData::try_new(DataType::Int64, 1, None, 1, vec![buffer], vec![]).unwrap(); } @@ -50,8 +50,8 @@ fn test_buffer_too_small_offset() { #[test] #[should_panic(expected = "Expected 1 buffers in array of type Int64, got 2")] fn test_bad_number_of_buffers() { - let buffer1 = Buffer::from_slice_ref(&[0i32, 2i32]); - let buffer2 = Buffer::from_slice_ref(&[0i32, 2i32]); + let buffer1 = Buffer::from_slice_ref([0i32, 2i32]); + let buffer2 = Buffer::from_slice_ref([0i32, 2i32]); ArrayData::try_new(DataType::Int64, 1, None, 0, vec![buffer1, buffer2], vec![]) .unwrap(); } @@ -59,7 +59,7 @@ fn test_bad_number_of_buffers() { #[test] #[should_panic(expected = "integer overflow computing min buffer size")] fn test_fixed_width_overflow() { - let buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + let buffer = Buffer::from_slice_ref([0i32, 2i32]); ArrayData::try_new(DataType::Int64, usize::MAX, None, 0, vec![buffer], vec![]) .unwrap(); } @@ -85,7 +85,7 @@ fn test_bitmap_too_small() { #[test] #[should_panic(expected = "Dictionary key type must be integer, but was Utf8")] fn test_non_int_dictionary() { - let i32_buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + let i32_buffer = Buffer::from_slice_ref([0i32, 2i32]); let data_type = DataType::Dictionary(Box::new(DataType::Utf8), Box::new(DataType::Int32)); let child_data = ArrayData::try_new( @@ -113,7 +113,7 @@ fn test_non_int_dictionary() { fn test_mismatched_dictionary_types() { // test w/ dictionary created with a child array data that has type different than declared let string_array: StringArray = vec![Some("foo"), Some("bar")].into_iter().collect(); - let i32_buffer = Buffer::from_slice_ref(&[0i32, 1i32]); + let i32_buffer = 
Buffer::from_slice_ref([0i32, 1i32]); // Dict says LargeUtf8 but array is Utf8 let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::LargeUtf8)); @@ -140,7 +140,7 @@ fn test_empty_utf8_array_with_empty_offsets_buffer() { #[test] fn test_empty_utf8_array_with_single_zero_offset() { let data_buffer = Buffer::from(&[]); - let offsets_buffer = Buffer::from_slice_ref(&[0i32]); + let offsets_buffer = Buffer::from_slice_ref([0i32]); ArrayData::try_new( DataType::Utf8, 0, @@ -156,7 +156,7 @@ fn test_empty_utf8_array_with_single_zero_offset() { #[should_panic(expected = "First offset 1 of Utf8 is larger than values length 0")] fn test_empty_utf8_array_with_invalid_offset() { let data_buffer = Buffer::from(&[]); - let offsets_buffer = Buffer::from_slice_ref(&[1i32]); + let offsets_buffer = Buffer::from_slice_ref([1i32]); ArrayData::try_new( DataType::Utf8, 0, @@ -170,8 +170,8 @@ fn test_empty_utf8_array_with_invalid_offset() { #[test] fn test_empty_utf8_array_with_non_zero_offset() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2, 6, 0]); + let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref([0i32, 2, 6, 0]); ArrayData::try_new( DataType::Utf8, 0, @@ -189,7 +189,7 @@ fn test_empty_utf8_array_with_non_zero_offset() { )] fn test_empty_large_utf8_array_with_wrong_type_offsets() { let data_buffer = Buffer::from(&[]); - let offsets_buffer = Buffer::from_slice_ref(&[0i32]); + let offsets_buffer = Buffer::from_slice_ref([0i32]); ArrayData::try_new( DataType::LargeUtf8, 0, @@ -204,8 +204,8 @@ fn test_empty_large_utf8_array_with_wrong_type_offsets() { #[test] #[should_panic(expected = "Buffer 0 of Utf8 isn't large enough. Expected 12 bytes got 8")] fn test_validate_offsets_i32() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32]); + let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref([0i32, 2i32]); ArrayData::try_new( DataType::Utf8, 2, @@ -222,8 +222,8 @@ fn test_validate_offsets_i32() { expected = "Buffer 0 of LargeUtf8 isn't large enough. 
Expected 24 bytes got 16" )] fn test_validate_offsets_i64() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[0i64, 2i64]); + let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref([0i64, 2i64]); ArrayData::try_new( DataType::LargeUtf8, 2, @@ -238,8 +238,8 @@ fn test_validate_offsets_i64() { #[test] #[should_panic(expected = "Error converting offset[0] (-2) to usize for Utf8")] fn test_validate_offsets_negative_first_i32() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[-2i32, 1i32, 3i32]); + let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref([-2i32, 1i32, 3i32]); ArrayData::try_new( DataType::Utf8, 2, @@ -254,8 +254,8 @@ fn test_validate_offsets_negative_first_i32() { #[test] #[should_panic(expected = "Error converting offset[2] (-3) to usize for Utf8")] fn test_validate_offsets_negative_last_i32() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); - let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, -3i32]); + let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref([0i32, 2i32, -3i32]); ArrayData::try_new( DataType::Utf8, 2, @@ -270,9 +270,9 @@ fn test_validate_offsets_negative_last_i32() { #[test] #[should_panic(expected = "First offset 4 in Utf8 is smaller than last offset 3")] fn test_validate_offsets_range_too_small() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); // start offset is larger than end - let offsets_buffer = Buffer::from_slice_ref(&[4i32, 2i32, 3i32]); + let offsets_buffer = Buffer::from_slice_ref([4i32, 2i32, 3i32]); ArrayData::try_new( DataType::Utf8, 2, @@ -287,9 +287,9 @@ fn test_validate_offsets_range_too_small() { #[test] #[should_panic(expected = "Last offset 10 of Utf8 is larger than values length 6")] fn test_validate_offsets_range_too_large() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); // 10 is off the end of the buffer - let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, 10i32]); + let offsets_buffer = Buffer::from_slice_ref([0i32, 2i32, 10i32]); ArrayData::try_new( DataType::Utf8, 2, @@ -304,9 +304,9 @@ fn test_validate_offsets_range_too_large() { #[test] #[should_panic(expected = "First offset 10 of Utf8 is larger than values length 6")] fn test_validate_offsets_first_too_large() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); // 10 is off the end of the buffer - let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 10i32]); + let offsets_buffer = Buffer::from_slice_ref([10i32, 2i32, 10i32]); ArrayData::try_new( DataType::Utf8, 2, @@ -320,9 +320,9 @@ fn test_validate_offsets_first_too_large() { #[test] fn test_validate_offsets_first_too_large_skipped() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); // 10 is off the end of the buffer, but offset starts at 1 so it is skipped - let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 3i32, 4i32]); + let offsets_buffer = Buffer::from_slice_ref([10i32, 2i32, 3i32, 4i32]); let data = ArrayData::try_new( DataType::Utf8, 2, @@ -340,9 +340,9 @@ fn 
test_validate_offsets_first_too_large_skipped() { #[test] #[should_panic(expected = "Last offset 8 of Utf8 is larger than values length 6")] fn test_validate_offsets_last_too_large() { - let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); // 10 is off the end of the buffer - let offsets_buffer = Buffer::from_slice_ref(&[5i32, 7i32, 8i32]); + let offsets_buffer = Buffer::from_slice_ref([5i32, 7i32, 8i32]); ArrayData::try_new( DataType::Utf8, 2, @@ -421,7 +421,7 @@ fn test_validate_struct_child_length() { /// Test that the array of type `data_type` that has invalid utf8 data errors fn check_utf8_validation(data_type: DataType) { // 0x80 is a utf8 continuation sequence and is not a valid utf8 sequence itself - let data_buffer = Buffer::from_slice_ref(&[b'a', b'a', 0x80, 0x00]); + let data_buffer = Buffer::from_slice_ref([b'a', b'a', 0x80, 0x00]); let offsets: Vec = [0, 2, 3] .iter() .map(|&v| T::from_usize(v).unwrap()) @@ -485,7 +485,7 @@ fn test_validate_large_utf8_char_boundary() { /// Test that the array of type `data_type` that has invalid indexes (out of bounds) fn check_index_out_of_bounds_validation(data_type: DataType) { - let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']); + let data_buffer = Buffer::from_slice_ref([b'a', b'b', b'c', b'd']); // First two offsets are fine, then 5 is out of bounds let offsets: Vec = [0, 1, 2, 5, 2] .iter() @@ -538,7 +538,7 @@ fn test_validate_large_binary_out_of_bounds() { // validate that indexes don't go bacwards check indexes that go backwards fn check_index_backwards_validation(data_type: DataType) { - let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']); + let data_buffer = Buffer::from_slice_ref([b'a', b'b', b'c', b'd']); // First three offsets are fine, then 1 goes backwards let offsets: Vec = [0, 1, 2, 2, 1] .iter() @@ -799,7 +799,7 @@ fn test_validate_union_different_types() { let field2 = vec![Some(1), Some(2)].into_iter().collect::(); - let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); + let type_ids = Buffer::from_slice_ref([0i8, 1i8]); ArrayData::try_new( DataType::Union( @@ -830,7 +830,7 @@ fn test_validate_union_sparse_different_child_len() { // field 2 only has 1 item but array should have 2 let field2 = vec![Some(1)].into_iter().collect::(); - let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); + let type_ids = Buffer::from_slice_ref([0i8, 1i8]); ArrayData::try_new( DataType::Union( @@ -857,7 +857,7 @@ fn test_validate_union_dense_without_offsets() { let field2 = vec![Some(1)].into_iter().collect::(); - let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); + let type_ids = Buffer::from_slice_ref([0i8, 1i8]); ArrayData::try_new( DataType::Union( @@ -884,8 +884,8 @@ fn test_validate_union_dense_with_bad_len() { let field2 = vec![Some(1)].into_iter().collect::(); - let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); - let offsets = Buffer::from_slice_ref(&[0i32]); // should have 2 offsets, but only have 1 + let type_ids = Buffer::from_slice_ref([0i8, 1i8]); + let offsets = Buffer::from_slice_ref([0i32]); // should have 2 offsets, but only have 1 ArrayData::try_new( DataType::Union( From fa513333abf25ee6938d1df98554538324a7069b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 23 Nov 2022 08:53:23 +0000 Subject: [PATCH 0304/1411] Remove unnecessary downcasts in builders (#3166) --- arrow-array/src/builder/fixed_size_list_builder.rs | 11 +++-------- 
arrow-array/src/builder/generic_list_builder.rs | 7 +------ arrow-array/src/builder/map_builder.rs | 14 ++------------ 3 files changed, 6 insertions(+), 26 deletions(-) diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index e15708ed6c33..f6388d7899b7 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -111,16 +111,11 @@ where /// Builds the [`FixedSizeListBuilder`] and reset this builder. pub fn finish(&mut self) -> FixedSizeListArray { let len = self.len(); - let values_arr = self - .values_builder - .as_any_mut() - .downcast_mut::() - .unwrap() - .finish(); + let values_arr = self.values_builder.finish(); let values_data = values_arr.data(); - assert!( - values_data.len() == len * self.list_len as usize, + assert_eq!( + values_data.len(), len * self.list_len as usize, "Length of the child array ({}) must be the multiple of the value length ({}) and the array length ({}).", values_data.len(), self.list_len, diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index f0775797128a..11656786454a 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -115,12 +115,7 @@ where /// Builds the [`GenericListArray`] and reset this builder. pub fn finish(&mut self) -> GenericListArray { let len = self.len(); - let values_arr = self - .values_builder - .as_any_mut() - .downcast_mut::() - .unwrap() - .finish(); + let values_arr = self.values_builder.finish(); let values_data = values_arr.data(); let offset_buffer = self.offsets_builder.finish(); diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 71ca8480a2a9..4b75972482be 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -107,18 +107,8 @@ impl MapBuilder { let len = self.len(); // Build the keys - let keys_arr = self - .key_builder - .as_any_mut() - .downcast_mut::() - .unwrap() - .finish(); - let values_arr = self - .value_builder - .as_any_mut() - .downcast_mut::() - .unwrap() - .finish(); + let keys_arr = self.key_builder.finish(); + let values_arr = self.value_builder.finish(); let keys_field = Field::new( self.field_names.key.as_str(), From 12a67b9bc7e1538f5af1a189cc0a78c14d551897 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 23 Nov 2022 13:15:39 +0000 Subject: [PATCH 0305/1411] Add Row size methods (#3160) (#3163) * Add Row size methods (#3160) * Fix copypasta * Fix --- arrow/src/row/interner.rs | 21 +++++++++++++++++++++ arrow/src/row/mod.rs | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/arrow/src/row/interner.rs b/arrow/src/row/interner.rs index e6c8f0972417..1c71b6a55217 100644 --- a/arrow/src/row/interner.rs +++ b/arrow/src/row/interner.rs @@ -157,6 +157,15 @@ impl OrderPreservingInterner { pub fn value(&self, key: Interned) -> &[u8] { self.values.index(key) } + + /// Returns the size of this instance in bytes including self + pub fn size(&self) -> usize { + std::mem::size_of::() + + self.keys.buffer_size() + + self.values.buffer_size() + + self.bucket.size() + + self.lookup.capacity() * std::mem::size_of::() + } } /// A buffer of `[u8]` indexed by `[Interned]` @@ -192,6 +201,11 @@ impl InternBuffer { self.offsets.push(self.values.len()); key } + + /// Returns the byte size of the associated 
buffers + fn buffer_size(&self) -> usize { + self.values.capacity() + self.offsets.capacity() * std::mem::size_of::() + } } impl Index for InternBuffer { @@ -324,6 +338,13 @@ impl Bucket { } } } + + /// Returns the size of this instance in bytes + fn size(&self) -> usize { + std::mem::size_of::() + + self.slots.capacity() * std::mem::size_of::() + + self.next.as_ref().map(|x| x.size()).unwrap_or_default() + } } #[cfg(test)] diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 1d0a58d954bf..c57fd41ebc02 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -358,6 +358,14 @@ impl SortField { pub fn new_with_options(data_type: DataType, options: SortOptions) -> Self { Self { options, data_type } } + + /// Return size of this instance in bytes. + /// + /// Includes the size of `Self`. + pub fn size(&self) -> usize { + self.data_type.size() + std::mem::size_of::() + - std::mem::size_of::() + } } impl RowConverter { @@ -480,6 +488,21 @@ impl RowConverter { }) .collect() } + + /// Returns the size of this instance in bytes + /// + /// Includes the size of `Self`. + pub fn size(&self) -> usize { + std::mem::size_of::() + + self.fields.iter().map(|x| x.size()).sum::() + + self.interners.capacity() + * std::mem::size_of::>>() + + self + .interners + .iter() + .filter_map(|x| x.as_ref().map(|x| x.size())) + .sum::() + } } /// A row-oriented representation of arrow data, that is normalized for comparison. @@ -512,6 +535,16 @@ impl Rows { pub fn iter(&self) -> RowsIter<'_> { self.into_iter() } + + /// Returns the size of this instance in bytes + /// + /// Includes the size of `Self`. + pub fn size(&self) -> usize { + // Size of fields is accounted for as part of RowConverter + std::mem::size_of::() + + self.buffer.len() + + self.offsets.len() * std::mem::size_of::() + } } impl<'a> IntoIterator for &'a Rows { From 6c466afe3b0b3a4c7b90c99c27eefade62011c31 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Wed, 23 Nov 2022 11:12:58 -0500 Subject: [PATCH 0306/1411] Add finish_cloned to ArrayBuilder (#3158) * add finish_cloned to PrimitiveBuilder * Add finish_cloned on array builders * incorporate PR comments and other PR merges * remove build_clone from union builder Co-authored-by: askoa --- arrow-array/src/builder/boolean_builder.rs | 45 ++++++++++ .../src/builder/fixed_size_binary_builder.rs | 53 +++++++++++ .../src/builder/fixed_size_list_builder.rs | 79 +++++++++++++++++ .../src/builder/generic_bytes_builder.rs | 56 +++++++++++- .../src/builder/generic_list_builder.rs | 55 ++++++++++++ arrow-array/src/builder/map_builder.rs | 48 ++++++++++ arrow-array/src/builder/mod.rs | 3 + .../src/builder/null_buffer_builder.rs | 6 +- arrow-array/src/builder/primitive_builder.rs | 46 +++++++++- .../builder/primitive_dictionary_builder.rs | 22 +++++ .../src/builder/string_dictionary_builder.rs | 73 +++++++++++++++ arrow-array/src/builder/struct_builder.rs | 88 +++++++++++++++++++ 12 files changed, 571 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index 96711dd1f6f6..96f436253c5a 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -18,6 +18,7 @@ use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BooleanBufferBuilder}; use crate::{ArrayRef, BooleanArray}; +use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use std::any::Any; @@ -154,6 
+155,23 @@ impl BooleanBuilder { let array_data = unsafe { builder.build_unchecked() }; BooleanArray::from(array_data) } + + /// Builds the [BooleanArray] without resetting the builder. + pub fn finish_cloned(&self) -> BooleanArray { + let len = self.len(); + let null_bit_buffer = self + .null_buffer_builder + .as_slice() + .map(Buffer::from_slice_ref); + let value_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); + let builder = ArrayData::builder(DataType::Boolean) + .len(len) + .add_buffer(value_buffer) + .null_bit_buffer(null_bit_buffer); + + let array_data = unsafe { builder.build_unchecked() }; + BooleanArray::from(array_data) + } } impl ArrayBuilder for BooleanBuilder { @@ -186,6 +204,11 @@ impl ArrayBuilder for BooleanBuilder { fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } } #[cfg(test)] @@ -259,4 +282,26 @@ mod tests { assert_eq!(0, array.null_count()); assert!(array.data().null_buffer().is_none()); } + + #[test] + fn test_boolean_array_builder_finish_cloned() { + let mut builder = BooleanArray::builder(16); + builder.append_option(Some(true)); + builder.append_value(false); + builder.append_slice(&[true, false, true]); + let mut array = builder.finish_cloned(); + assert_eq!(3, array.true_count()); + assert_eq!(2, array.false_count()); + + builder + .append_values(&[false, false, true], &[true, true, true]) + .unwrap(); + + array = builder.finish(); + assert_eq!(4, array.true_count()); + assert_eq!(4, array.false_count()); + + assert_eq!(0, array.null_count()); + assert!(array.data().null_buffer().is_none()); + } } diff --git a/arrow-array/src/builder/fixed_size_binary_builder.rs b/arrow-array/src/builder/fixed_size_binary_builder.rs index 15b840d0a95d..e9581922ccaa 100644 --- a/arrow-array/src/builder/fixed_size_binary_builder.rs +++ b/arrow-array/src/builder/fixed_size_binary_builder.rs @@ -18,6 +18,7 @@ use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, UInt8BufferBuilder}; use crate::{ArrayRef, FixedSizeBinaryArray}; +use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use std::any::Any; @@ -87,6 +88,23 @@ impl FixedSizeBinaryBuilder { let array_data = unsafe { array_data_builder.build_unchecked() }; FixedSizeBinaryArray::from(array_data) } + + /// Builds the [`FixedSizeBinaryArray`] without resetting the builder. + pub fn finish_cloned(&self) -> FixedSizeBinaryArray { + let array_length = self.len(); + let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); + let array_data_builder = + ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) + .add_buffer(values_buffer) + .null_bit_buffer( + self.null_buffer_builder + .as_slice() + .map(Buffer::from_slice_ref), + ) + .len(array_length); + let array_data = unsafe { array_data_builder.build_unchecked() }; + FixedSizeBinaryArray::from(array_data) + } } impl ArrayBuilder for FixedSizeBinaryBuilder { @@ -119,6 +137,11 @@ impl ArrayBuilder for FixedSizeBinaryBuilder { fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) } + + /// Builds the array without resetting the builder. 
+ fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } } #[cfg(test)] @@ -146,6 +169,36 @@ mod tests { assert_eq!(5, array.value_length()); } + #[test] + fn test_fixed_size_binary_builder_finish_cloned() { + let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5); + + // [b"hello", null, "arrow"] + builder.append_value(b"hello").unwrap(); + builder.append_null(); + builder.append_value(b"arrow").unwrap(); + let mut array: FixedSizeBinaryArray = builder.finish_cloned(); + + assert_eq!(&DataType::FixedSizeBinary(5), array.data_type()); + assert_eq!(3, array.len()); + assert_eq!(1, array.null_count()); + assert_eq!(10, array.value_offset(2)); + assert_eq!(5, array.value_length()); + + // [b"finis", null, "clone"] + builder.append_value(b"finis").unwrap(); + builder.append_null(); + builder.append_value(b"clone").unwrap(); + + array = builder.finish(); + + assert_eq!(&DataType::FixedSizeBinary(5), array.data_type()); + assert_eq!(6, array.len()); + assert_eq!(2, array.null_count()); + assert_eq!(25, array.value_offset(5)); + assert_eq!(5, array.value_length()); + } + #[test] fn test_fixed_size_binary_builder_with_zero_value_length() { let mut builder = FixedSizeBinaryBuilder::new(0); diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index f6388d7899b7..516c22925786 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -18,6 +18,7 @@ use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::ArrayBuilder; use crate::{ArrayRef, FixedSizeListArray}; +use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::{DataType, Field}; use std::any::Any; @@ -84,6 +85,11 @@ where fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } } impl FixedSizeListBuilder @@ -135,6 +141,37 @@ where FixedSizeListArray::from(array_data) } + + /// Builds the [`FixedSizeListBuilder`] without resetting the builder. 
+ pub fn finish_cloned(&self) -> FixedSizeListArray { + let len = self.len(); + let values_arr = self.values_builder.finish_cloned(); + let values_data = values_arr.data(); + + assert_eq!( + values_data.len(), len * self.list_len as usize, + "Length of the child array ({}) must be the multiple of the value length ({}) and the array length ({}).", + values_data.len(), + self.list_len, + len, + ); + + let null_bit_buffer = self + .null_buffer_builder + .as_slice() + .map(Buffer::from_slice_ref); + let array_data = ArrayData::builder(DataType::FixedSizeList( + Box::new(Field::new("item", values_data.data_type().clone(), true)), + self.list_len, + )) + .len(len) + .add_child_data(values_data.clone()) + .null_bit_buffer(null_bit_buffer); + + let array_data = unsafe { array_data.build_unchecked() }; + + FixedSizeListArray::from(array_data) + } } #[cfg(test)] @@ -176,6 +213,48 @@ mod tests { assert_eq!(3, list_array.value_length()); } + #[test] + fn test_fixed_size_list_array_builder_finish_cloned() { + let values_builder = Int32Builder::new(); + let mut builder = FixedSizeListBuilder::new(values_builder, 3); + + // [[0, 1, 2], null, [3, null, 5], [6, 7, null]] + builder.values().append_value(0); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true); + builder.values().append_null(); + builder.values().append_null(); + builder.values().append_null(); + builder.append(false); + builder.values().append_value(3); + builder.values().append_null(); + builder.values().append_value(5); + builder.append(true); + let mut list_array = builder.finish_cloned(); + + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(1, list_array.null_count()); + assert_eq!(3, list_array.value_length()); + + builder.values().append_value(6); + builder.values().append_value(7); + builder.values().append_null(); + builder.append(true); + builder.values().append_null(); + builder.values().append_null(); + builder.values().append_null(); + builder.append(false); + list_array = builder.finish(); + + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(5, list_array.len()); + assert_eq!(2, list_array.null_count()); + assert_eq!(6, list_array.value_offset(2)); + assert_eq!(3, list_array.value_length()); + } + #[test] fn test_fixed_size_list_array_builder_empty() { let values_builder = Int32Array::builder(5); diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index fa0a31ad79e1..9f9078c708c8 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -19,7 +19,7 @@ use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder}; use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType}; use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait}; -use arrow_buffer::ArrowNativeType; +use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_data::ArrayDataBuilder; use std::any::Any; use std::sync::Arc; @@ -94,6 +94,25 @@ impl GenericByteBuilder { GenericByteArray::from(array_data) } + /// Builds the [`GenericByteArray`] without resetting the builder. 
+ pub fn finish_cloned(&self) -> GenericByteArray { + let array_type = T::DATA_TYPE; + let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); + let value_buffer = Buffer::from_slice_ref(self.value_builder.as_slice()); + let array_builder = ArrayDataBuilder::new(array_type) + .len(self.len()) + .add_buffer(offset_buffer) + .add_buffer(value_buffer) + .null_bit_buffer( + self.null_buffer_builder + .as_slice() + .map(Buffer::from_slice_ref), + ); + + let array_data = unsafe { array_builder.build_unchecked() }; + GenericByteArray::from(array_data) + } + /// Returns the current values buffer as a slice pub fn values_slice(&self) -> &[u8] { self.value_builder.as_slice() @@ -138,6 +157,11 @@ impl ArrayBuilder for GenericByteBuilder { Arc::new(self.finish()) } + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } + /// Returns the builder as a non-mutable `Any` reference. fn as_any(&self) -> &dyn Any { self @@ -325,4 +349,34 @@ mod tests { fn test_large_string_array_builder_finish() { _test_generic_string_array_builder_finish::() } + + fn _test_generic_string_array_builder_finish_cloned() { + let mut builder = GenericStringBuilder::::with_capacity(3, 11); + + builder.append_value("hello"); + builder.append_value("rust"); + builder.append_null(); + + let mut arr = builder.finish_cloned(); + assert!(!builder.is_empty()); + assert_eq!(3, arr.len()); + + builder.append_value("arrow"); + builder.append_value("parquet"); + arr = builder.finish(); + + assert!(arr.data().null_buffer().is_some()); + assert_eq!(&[O::zero()], builder.offsets_slice()); + assert_eq!(5, arr.len()); + } + + #[test] + fn test_string_array_builder_finish_cloned() { + _test_generic_string_array_builder_finish_cloned::() + } + + #[test] + fn test_large_string_array_builder_finish_cloned() { + _test_generic_string_array_builder_finish_cloned::() + } } diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 11656786454a..8f3f881c4b32 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -18,6 +18,7 @@ use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BufferBuilder}; use crate::{ArrayRef, GenericListArray, OffsetSizeTrait}; +use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::Field; use std::any::Any; @@ -85,6 +86,11 @@ where fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } } impl GenericListBuilder @@ -138,6 +144,34 @@ where GenericListArray::::from(array_data) } + /// Builds the [`GenericListArray`] without resetting the builder. 
+ pub fn finish_cloned(&self) -> GenericListArray { + let len = self.len(); + let values_arr = self.values_builder.finish_cloned(); + let values_data = values_arr.data(); + + let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); + let null_bit_buffer = self + .null_buffer_builder + .as_slice() + .map(Buffer::from_slice_ref); + let field = Box::new(Field::new( + "item", + values_data.data_type().clone(), + true, // TODO: find a consistent way of getting this + )); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(field); + let array_data_builder = ArrayData::builder(data_type) + .len(len) + .add_buffer(offset_buffer) + .add_child_data(values_data.clone()) + .null_bit_buffer(null_bit_buffer); + + let array_data = unsafe { array_data_builder.build_unchecked() }; + + GenericListArray::::from(array_data) + } + /// Returns the current offsets buffer as a slice pub fn offsets_slice(&self) -> &[OffsetSize] { self.offsets_builder.as_slice() @@ -255,6 +289,27 @@ mod tests { assert!(builder.is_empty()); } + #[test] + fn test_list_array_builder_finish_cloned() { + let values_builder = Int32Array::builder(5); + let mut builder = ListBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true); + + let mut arr = builder.finish_cloned(); + assert_eq!(2, arr.len()); + assert!(!builder.is_empty()); + + builder.values().append_slice(&[7, 8, 9]); + builder.append(true); + arr = builder.finish(); + assert_eq!(3, arr.len()); + assert!(builder.is_empty()); + } + #[test] fn test_list_list_array_builder() { let primitive_builder = Int32Builder::with_capacity(10); diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 4b75972482be..5602f88636c3 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -18,6 +18,7 @@ use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BufferBuilder}; use crate::{Array, ArrayRef, MapArray, StructArray}; +use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; @@ -142,6 +143,48 @@ impl MapBuilder { MapArray::from(array_data) } + + pub fn finish_cloned(&self) -> MapArray { + let len = self.len(); + + // Build the keys + let keys_arr = self.key_builder.finish_cloned(); + let values_arr = self.value_builder.finish_cloned(); + + let keys_field = Field::new( + self.field_names.key.as_str(), + keys_arr.data_type().clone(), + false, // always nullable + ); + let values_field = Field::new( + self.field_names.value.as_str(), + values_arr.data_type().clone(), + true, + ); + + let struct_array = + StructArray::from(vec![(keys_field, keys_arr), (values_field, values_arr)]); + + let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); + let null_bit_buffer = self + .null_buffer_builder + .as_slice() + .map(Buffer::from_slice_ref); + let map_field = Box::new(Field::new( + self.field_names.entry.as_str(), + struct_array.data_type().clone(), + false, // always non-nullable + )); + let array_data = ArrayData::builder(DataType::Map(map_field, false)) // TODO: support sorted keys + .len(len) + .add_buffer(offset_buffer) + .add_child_data(struct_array.into_data()) + .null_bit_buffer(null_bit_buffer); + + let array_data = unsafe { array_data.build_unchecked() }; + + MapArray::from(array_data) + } } impl ArrayBuilder for MapBuilder { @@ -157,6 +200,11 @@ 
impl ArrayBuilder for MapBuilder { Arc::new(self.finish()) } + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } + fn as_any(&self) -> &dyn Any { self } diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index a5c1e3d4b2fd..eaf8243973b8 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -107,6 +107,9 @@ pub trait ArrayBuilder: Any + Send { /// Builds the array fn finish(&mut self) -> ArrayRef; + /// Builds the array without resetting the underlying builder. + fn finish_cloned(&self) -> ArrayRef; + /// Returns the builder as a non-mutable `Any` reference. /// /// This is most useful when one wants to call non-mutable APIs on a specific builder diff --git a/arrow-array/src/builder/null_buffer_builder.rs b/arrow-array/src/builder/null_buffer_builder.rs index fef7214d5aa7..b3c788fe5993 100644 --- a/arrow-array/src/builder/null_buffer_builder.rs +++ b/arrow-array/src/builder/null_buffer_builder.rs @@ -135,7 +135,11 @@ impl NullBufferBuilder { buf } - #[inline] + /// Returns the inner bitmap builder as slice + pub fn as_slice(&self) -> Option<&[u8]> { + Some(self.bitmap_builder.as_ref()?.as_slice()) + } + fn materialize_if_needed(&mut self) { if self.bitmap_builder.is_none() { self.materialize() diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 55d8bac0189f..7a1fbafc76ff 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -19,7 +19,7 @@ use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BufferBuilder}; use crate::types::*; use crate::{ArrayRef, ArrowPrimitiveType, PrimitiveArray}; -use arrow_buffer::MutableBuffer; +use arrow_buffer::{Buffer, MutableBuffer}; use arrow_data::ArrayData; use std::any::Any; use std::sync::Arc; @@ -93,6 +93,11 @@ impl ArrayBuilder for PrimitiveBuilder { fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } } impl Default for PrimitiveBuilder { @@ -219,6 +224,23 @@ impl PrimitiveBuilder { PrimitiveArray::::from(array_data) } + /// Builds the [`PrimitiveArray`] without resetting the builder. 
+ pub fn finish_cloned(&self) -> PrimitiveArray { + let len = self.len(); + let null_bit_buffer = self + .null_buffer_builder + .as_slice() + .map(Buffer::from_slice_ref); + let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); + let builder = ArrayData::builder(T::DATA_TYPE) + .len(len) + .add_buffer(values_buffer) + .null_bit_buffer(null_bit_buffer); + + let array_data = unsafe { builder.build_unchecked() }; + PrimitiveArray::::from(array_data) + } + /// Returns the current values buffer as a slice pub fn values_slice(&self) -> &[T::Native] { self.values_builder.as_slice() @@ -431,4 +453,26 @@ mod tests { assert_eq!(5, arr.len()); assert_eq!(0, builder.len()); } + + #[test] + fn test_primitive_array_builder_finish_cloned() { + let mut builder = Int32Builder::new(); + builder.append_value(23); + builder.append_value(45); + let result = builder.finish_cloned(); + assert_eq!(result, Int32Array::from(vec![23, 45])); + builder.append_value(56); + assert_eq!(builder.finish_cloned(), Int32Array::from(vec![23, 45, 56])); + + builder.append_slice(&[2, 4, 6, 8]); + let mut arr = builder.finish(); + assert_eq!(7, arr.len()); + assert_eq!(arr, Int32Array::from(vec![23, 45, 56, 2, 4, 6, 8])); + assert_eq!(0, builder.len()); + + builder.append_slice(&[1, 3, 5, 7, 9]); + arr = builder.finish(); + assert_eq!(5, arr.len()); + assert_eq!(0, builder.len()); + } } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index c43416e5af30..5b8a7283528a 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -160,6 +160,11 @@ where fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } } impl PrimitiveDictionaryBuilder @@ -210,6 +215,23 @@ where DictionaryArray::from(unsafe { builder.build_unchecked() }) } + + /// Builds the `DictionaryArray` without resetting the builder. + pub fn finish_cloned(&self) -> DictionaryArray { + let values = self.values_builder.finish_cloned(); + let keys = self.keys_builder.finish_cloned(); + + let data_type = + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE)); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + } } #[cfg(test)] diff --git a/arrow-array/src/builder/string_dictionary_builder.rs b/arrow-array/src/builder/string_dictionary_builder.rs index e41086c872f1..f44756b6bcc5 100644 --- a/arrow-array/src/builder/string_dictionary_builder.rs +++ b/arrow-array/src/builder/string_dictionary_builder.rs @@ -222,6 +222,11 @@ where fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } } impl StringDictionaryBuilder @@ -287,6 +292,23 @@ where DictionaryArray::from(unsafe { builder.build_unchecked() }) } + + /// Builds the `DictionaryArray` without resetting the builder. 
+ pub fn finish_cloned(&self) -> DictionaryArray { + let values = self.values_builder.finish_cloned(); + let keys = self.keys_builder.finish_cloned(); + + let data_type = + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(DataType::Utf8)); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + } } fn get_bytes<'a, K: ArrowNativeType>(values: &'a StringBuilder, key: &K) -> &'a [u8] { @@ -331,6 +353,57 @@ mod tests { assert_eq!(ava.value(1), "def"); } + #[test] + fn test_string_dictionary_builder_finish_cloned() { + let mut builder = StringDictionaryBuilder::::new(); + builder.append("abc").unwrap(); + builder.append_null(); + builder.append("def").unwrap(); + builder.append("def").unwrap(); + builder.append("abc").unwrap(); + let mut array = builder.finish_cloned(); + + assert_eq!( + array.keys(), + &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) + ); + + // Values are polymorphic and so require a downcast. + let av = array.values(); + let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); + + assert_eq!(ava.value(0), "abc"); + assert_eq!(ava.value(1), "def"); + + builder.append("abc").unwrap(); + builder.append("ghi").unwrap(); + builder.append("def").unwrap(); + + array = builder.finish(); + + assert_eq!( + array.keys(), + &Int8Array::from(vec![ + Some(0), + None, + Some(1), + Some(1), + Some(0), + Some(0), + Some(2), + Some(1) + ]) + ); + + // Values are polymorphic and so require a downcast. + let av2 = array.values(); + let ava2: &StringArray = av2.as_any().downcast_ref::().unwrap(); + + assert_eq!(ava2.value(0), "abc"); + assert_eq!(ava2.value(1), "def"); + assert_eq!(ava2.value(2), "ghi"); + } + #[test] fn test_string_dictionary_builder_with_existing_dictionary() { let dictionary = StringArray::from(vec![None, Some("def"), Some("abc")]); diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index f00f81d1a5c0..98d0e1a1d275 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -18,6 +18,7 @@ use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::*; use crate::{Array, ArrayRef, StructArray}; +use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::{DataType, Field, IntervalUnit, TimeUnit}; use std::any::Any; @@ -63,6 +64,11 @@ impl ArrayBuilder for StructBuilder { Arc::new(self.finish()) } + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } + /// Returns the builder as a non-mutable `Any` reference. /// /// This is most useful when one wants to call non-mutable APIs on a specific builder @@ -230,6 +236,30 @@ impl StructBuilder { StructArray::from(array_data) } + /// Builds the `StructArray` without resetting the builder. 
+ pub fn finish_cloned(&self) -> StructArray { + self.validate_content(); + + let mut child_data = Vec::with_capacity(self.field_builders.len()); + for f in &self.field_builders { + let arr = f.finish_cloned(); + child_data.push(arr.data().clone()); + } + let length = self.len(); + let null_bit_buffer = self + .null_buffer_builder + .as_slice() + .map(Buffer::from_slice_ref); + + let builder = ArrayData::builder(DataType::Struct(self.fields.clone())) + .len(length) + .child_data(child_data) + .null_bit_buffer(null_bit_buffer); + + let array_data = unsafe { builder.build_unchecked() }; + StructArray::from(array_data) + } + /// Constructs and validates contents in the builder to ensure that /// - fields and field_builders are of equal length /// - the number of items in individual field_builders are equal to self.len() @@ -374,6 +404,64 @@ mod tests { assert_eq!(0, builder.len()); } + #[test] + fn test_struct_array_builder_finish_cloned() { + let int_builder = Int32Builder::new(); + let bool_builder = BooleanBuilder::new(); + + let mut fields = Vec::new(); + let mut field_builders = Vec::new(); + fields.push(Field::new("f1", DataType::Int32, false)); + field_builders.push(Box::new(int_builder) as Box); + fields.push(Field::new("f2", DataType::Boolean, false)); + field_builders.push(Box::new(bool_builder) as Box); + + let mut builder = StructBuilder::new(fields, field_builders); + builder + .field_builder::(0) + .unwrap() + .append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + builder + .field_builder::(1) + .unwrap() + .append_slice(&[ + false, true, false, true, false, true, false, true, false, true, + ]); + + // Append slot values - all are valid. + for _ in 0..10 { + builder.append(true); + } + + assert_eq!(10, builder.len()); + + let mut arr = builder.finish_cloned(); + + assert_eq!(10, arr.len()); + assert_eq!(10, builder.len()); + + builder + .field_builder::(0) + .unwrap() + .append_slice(&[1, 3, 5, 7, 9]); + builder + .field_builder::(1) + .unwrap() + .append_slice(&[false, true, false, true, false]); + + // Append slot values - all are valid. 
+ for _ in 0..5 { + builder.append(true); + } + + assert_eq!(15, builder.len()); + + arr = builder.finish(); + + assert_eq!(15, arr.len()); + assert_eq!(0, builder.len()); + } + #[test] fn test_struct_array_builder_from_schema() { let mut fields = vec![ From 78ab0ef3f6f422fd4b79a29504f0274220aaf74b Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 23 Nov 2022 10:33:47 -0800 Subject: [PATCH 0307/1411] Support decimal negative scale (#3152) * Support decimal negative scale * Fix casting from numeric to negative scale decimal * Fix clippy --- arrow-array/src/array/primitive_array.rs | 15 +++- arrow-array/src/types.rs | 33 ++++---- arrow-cast/src/cast.rs | 102 +++++++++++++++++++---- arrow-csv/src/reader.rs | 6 +- arrow-data/src/decimal.rs | 6 +- arrow-schema/src/datatype.rs | 4 +- arrow-select/src/take.rs | 6 +- arrow/benches/cast_kernels.rs | 4 +- arrow/src/datatypes/ffi.rs | 4 +- arrow/tests/array_transform.rs | 2 +- 10 files changed, 131 insertions(+), 51 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index f34c899e2265..bd68b9698ce9 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1003,7 +1003,7 @@ impl PrimitiveArray { pub fn with_precision_and_scale( self, precision: u8, - scale: u8, + scale: i8, ) -> Result where Self: Sized, @@ -1024,7 +1024,7 @@ impl PrimitiveArray { fn validate_precision_scale( &self, precision: u8, - scale: u8, + scale: i8, ) -> Result<(), ArrowError> { if precision == 0 { return Err(ArrowError::InvalidArgumentError(format!( @@ -1046,7 +1046,14 @@ impl PrimitiveArray { T::MAX_SCALE ))); } - if scale > precision { + if scale < -T::MAX_SCALE { + return Err(ArrowError::InvalidArgumentError(format!( + "scale {} is smaller than min {}", + scale, + -Decimal128Type::MAX_SCALE + ))); + } + if scale > 0 && scale as u8 > precision { return Err(ArrowError::InvalidArgumentError(format!( "scale {} is greater than precision {}", scale, precision @@ -1102,7 +1109,7 @@ impl PrimitiveArray { } /// Returns the decimal scale of this array - pub fn scale(&self) -> u8 { + pub fn scale(&self) -> i8 { match T::BYTE_LENGTH { 16 => { if let DataType::Decimal128(_, s) = self.data().data_type() { diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index dd4d1ba4292b..40d262e8ed72 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -491,15 +491,15 @@ pub trait DecimalType: { const BYTE_LENGTH: usize; const MAX_PRECISION: u8; - const MAX_SCALE: u8; - const TYPE_CONSTRUCTOR: fn(u8, u8) -> DataType; + const MAX_SCALE: i8; + const TYPE_CONSTRUCTOR: fn(u8, i8) -> DataType; const DEFAULT_TYPE: DataType; /// "Decimal128" or "Decimal256", for use in error messages const PREFIX: &'static str; /// Formats the decimal value with the provided precision and scale - fn format_decimal(value: Self::Native, precision: u8, scale: u8) -> String; + fn format_decimal(value: Self::Native, precision: u8, scale: i8) -> String; /// Validates that `value` contains no more than `precision` decimal digits fn validate_decimal_precision( @@ -515,14 +515,14 @@ pub struct Decimal128Type {} impl DecimalType for Decimal128Type { const BYTE_LENGTH: usize = 16; const MAX_PRECISION: u8 = DECIMAL128_MAX_PRECISION; - const MAX_SCALE: u8 = DECIMAL128_MAX_SCALE; - const TYPE_CONSTRUCTOR: fn(u8, u8) -> DataType = DataType::Decimal128; + const MAX_SCALE: i8 = DECIMAL128_MAX_SCALE; + const TYPE_CONSTRUCTOR: fn(u8, i8) -> DataType = DataType::Decimal128; const DEFAULT_TYPE: 
DataType = DataType::Decimal128(DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE); const PREFIX: &'static str = "Decimal128"; - fn format_decimal(value: Self::Native, precision: u8, scale: u8) -> String { - format_decimal_str(&value.to_string(), precision as usize, scale as usize) + fn format_decimal(value: Self::Native, precision: u8, scale: i8) -> String { + format_decimal_str(&value.to_string(), precision as usize, scale) } fn validate_decimal_precision(num: i128, precision: u8) -> Result<(), ArrowError> { @@ -543,14 +543,14 @@ pub struct Decimal256Type {} impl DecimalType for Decimal256Type { const BYTE_LENGTH: usize = 32; const MAX_PRECISION: u8 = DECIMAL256_MAX_PRECISION; - const MAX_SCALE: u8 = DECIMAL256_MAX_SCALE; - const TYPE_CONSTRUCTOR: fn(u8, u8) -> DataType = DataType::Decimal256; + const MAX_SCALE: i8 = DECIMAL256_MAX_SCALE; + const TYPE_CONSTRUCTOR: fn(u8, i8) -> DataType = DataType::Decimal256; const DEFAULT_TYPE: DataType = DataType::Decimal256(DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE); const PREFIX: &'static str = "Decimal256"; - fn format_decimal(value: Self::Native, precision: u8, scale: u8) -> String { - format_decimal_str(&value.to_string(), precision as usize, scale as usize) + fn format_decimal(value: Self::Native, precision: u8, scale: i8) -> String { + format_decimal_str(&value.to_string(), precision as usize, scale) } fn validate_decimal_precision(num: i256, precision: u8) -> Result<(), ArrowError> { @@ -564,7 +564,7 @@ impl ArrowPrimitiveType for Decimal256Type { const DATA_TYPE: DataType = ::DEFAULT_TYPE; } -fn format_decimal_str(value_str: &str, precision: usize, scale: usize) -> String { +fn format_decimal_str(value_str: &str, precision: usize, scale: i8) -> String { let (sign, rest) = match value_str.strip_prefix('-') { Some(stripped) => ("-", stripped), None => ("", value_str), @@ -574,13 +574,16 @@ fn format_decimal_str(value_str: &str, precision: usize, scale: usize) -> String if scale == 0 { value_str.to_string() - } else if rest.len() > scale { + } else if scale < 0 { + let padding = value_str.len() + scale.unsigned_abs() as usize; + format!("{:0 scale as usize { // Decimal separator is in the middle of the string - let (whole, decimal) = value_str.split_at(value_str.len() - scale); + let (whole, decimal) = value_str.split_at(value_str.len() - scale as usize); format!("{}.{}", whole, decimal) } else { // String has to be padded - format!("{}0.{:0>width$}", sign, rest, width = scale) + format!("{}0.{:0>width$}", sign, rest, width = scale as usize) } } diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 3bf97cf7ade4..61be2171b7c1 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -319,7 +319,7 @@ fn cast_integer_to_decimal< >( array: &PrimitiveArray, precision: u8, - scale: u8, + scale: i8, base: M, cast_options: &CastOptions, ) -> Result @@ -327,7 +327,7 @@ where ::Native: AsPrimitive, M: ArrowNativeTypeOp, { - let mul: M = base.pow_checked(scale as u32).map_err(|_| { + let mul_or_div: M = base.pow_checked(scale.unsigned_abs() as u32).map_err(|_| { ArrowError::CastError(format!( "Cannot cast to {:?}({}, {}). 
The scale causes overflow.", D::PREFIX, @@ -336,14 +336,26 @@ where )) })?; - if cast_options.safe { + if scale < 0 { + if cast_options.safe { + array + .unary_opt::<_, D>(|v| v.as_().div_checked(mul_or_div).ok()) + .with_precision_and_scale(precision, scale) + .map(|a| Arc::new(a) as ArrayRef) + } else { + array + .try_unary::<_, D, _>(|v| v.as_().div_checked(mul_or_div)) + .and_then(|a| a.with_precision_and_scale(precision, scale)) + .map(|a| Arc::new(a) as ArrayRef) + } + } else if cast_options.safe { array - .unary_opt::<_, D>(|v| v.as_().mul_checked(mul).ok()) + .unary_opt::<_, D>(|v| v.as_().mul_checked(mul_or_div).ok()) .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } else { array - .try_unary::<_, D, _>(|v| v.as_().mul_checked(mul)) + .try_unary::<_, D, _>(|v| v.as_().mul_checked(mul_or_div)) .and_then(|a| a.with_precision_and_scale(precision, scale)) .map(|a| Arc::new(a) as ArrayRef) } @@ -352,7 +364,7 @@ where fn cast_floating_point_to_decimal128( array: &PrimitiveArray, precision: u8, - scale: u8, + scale: i8, cast_options: &CastOptions, ) -> Result where @@ -391,7 +403,7 @@ where fn cast_floating_point_to_decimal256( array: &PrimitiveArray, precision: u8, - scale: u8, + scale: i8, cast_options: &CastOptions, ) -> Result where @@ -437,7 +449,7 @@ fn cast_reinterpret_arrays< fn cast_decimal_to_integer( array: &ArrayRef, base: D::Native, - scale: u8, + scale: i8, cast_options: &CastOptions, ) -> Result where @@ -1921,9 +1933,9 @@ fn cast_decimal_to_decimal_with_option< const BYTE_WIDTH2: usize, >( array: &ArrayRef, - input_scale: &u8, + input_scale: &i8, output_precision: &u8, - output_scale: &u8, + output_scale: &i8, cast_options: &CastOptions, ) -> Result { if cast_options.safe { @@ -1947,9 +1959,9 @@ fn cast_decimal_to_decimal_with_option< /// the array values when cast failures happen. 
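As a sketch of what the negative-scale branch above means for callers, here is the same cast exercised through the public `arrow::compute::cast` facade (the facade paths are assumed re-exports of `arrow-cast`; the values mirror the tests added later in this patch):

```
use arrow::array::{Array, ArrayRef, Decimal128Array, Int32Array};
use arrow::compute::cast;
use arrow::datatypes::DataType;
use std::sync::Arc;

fn main() {
    // A negative scale means the stored integer is scaled up by 10^|scale| when read,
    // so the cast divides (truncating) by 10 instead of multiplying
    let ints: ArrayRef = Arc::new(Int32Array::from(vec![1_123_456]));
    let decimals = cast(&ints, &DataType::Decimal128(38, -1)).unwrap();
    let decimals = decimals.as_any().downcast_ref::<Decimal128Array>().unwrap();

    assert_eq!(decimals.value(0), 112_345); // raw stored value
    assert_eq!(decimals.value_as_string(0), "1123450"); // logical value
}
```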
fn cast_decimal_to_decimal_safe( array: &ArrayRef, - input_scale: &u8, + input_scale: &i8, output_precision: &u8, - output_scale: &u8, + output_scale: &i8, ) -> Result { if input_scale > output_scale { // For example, input_scale is 4 and output_scale is 3; @@ -2062,9 +2074,9 @@ fn cast_decimal_to_decimal_safe( array: &ArrayRef, - input_scale: &u8, + input_scale: &i8, output_precision: &u8, - output_scale: &u8, + output_scale: &i8, ) -> Result { if input_scale > output_scale { // For example, input_scale is 4 and output_scale is 3; @@ -3540,7 +3552,7 @@ mod tests { fn create_decimal_array( array: Vec>, precision: u8, - scale: u8, + scale: i8, ) -> Result { array .into_iter() @@ -3551,7 +3563,7 @@ mod tests { fn create_decimal256_array( array: Vec>, precision: u8, - scale: u8, + scale: i8, ) -> Result { array .into_iter() @@ -7206,4 +7218,62 @@ mod tests { err ); } + + #[test] + fn test_cast_decimal128_to_decimal128_negative_scale() { + let input_type = DataType::Decimal128(20, 0); + let output_type = DataType::Decimal128(20, -1); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; + let input_decimal_array = create_decimal_array(array, 20, 0).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + generate_cast_test_case!( + &array, + Decimal128Array, + &output_type, + vec![ + Some(112345_i128), + Some(212345_i128), + Some(312345_i128), + None + ] + ); + + let casted_array = cast(&array, &output_type).unwrap(); + let decimal_arr = as_primitive_array::(&casted_array); + + assert_eq!("1123450", decimal_arr.value_as_string(0)); + assert_eq!("2123450", decimal_arr.value_as_string(1)); + assert_eq!("3123450", decimal_arr.value_as_string(2)); + } + + #[test] + fn test_cast_numeric_to_decimal128_negative() { + let decimal_type = DataType::Decimal128(38, -1); + let array = Arc::new(Int32Array::from(vec![ + Some(1123456), + Some(2123456), + Some(3123456), + ])) as ArrayRef; + + let casted_array = cast(&array, &decimal_type).unwrap(); + let decimal_arr = as_primitive_array::(&casted_array); + + assert_eq!("1123450", decimal_arr.value_as_string(0)); + assert_eq!("2123450", decimal_arr.value_as_string(1)); + assert_eq!("3123450", decimal_arr.value_as_string(2)); + + let array = Arc::new(Float32Array::from(vec![ + Some(1123.456), + Some(2123.456), + Some(3123.456), + ])) as ArrayRef; + + let casted_array = cast(&array, &decimal_type).unwrap(); + let decimal_arr = as_primitive_array::(&casted_array); + + assert_eq!("1120", decimal_arr.value_as_string(0)); + assert_eq!("2120", decimal_arr.value_as_string(1)); + assert_eq!("3120", decimal_arr.value_as_string(2)); + } } diff --git a/arrow-csv/src/reader.rs b/arrow-csv/src/reader.rs index 4200e9329c54..6432fb1b8017 100644 --- a/arrow-csv/src/reader.rs +++ b/arrow-csv/src/reader.rs @@ -721,7 +721,7 @@ fn build_decimal_array( rows: &[StringRecord], col_idx: usize, precision: u8, - scale: u8, + scale: i8, ) -> Result { let mut decimal_builder = Decimal128Builder::with_capacity(rows.len()); for row in rows { @@ -762,13 +762,13 @@ fn build_decimal_array( fn parse_decimal_with_parameter( s: &str, precision: u8, - scale: u8, + scale: i8, ) -> Result { if PARSE_DECIMAL_RE.is_match(s) { let mut offset = s.len(); let len = s.len(); let mut base = 1; - let scale_usize = usize::from(scale); + let scale_usize = usize::from(scale as u8); // handle the value after the '.' 
and meet the scale let delimiter_position = s.find('.'); diff --git a/arrow-data/src/decimal.rs b/arrow-data/src/decimal.rs index a6a08774941e..7011c40858c2 100644 --- a/arrow-data/src/decimal.rs +++ b/arrow-data/src/decimal.rs @@ -728,17 +728,17 @@ pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ pub const DECIMAL128_MAX_PRECISION: u8 = 38; /// The maximum scale for [arrow_schema::DataType::Decimal128] values -pub const DECIMAL128_MAX_SCALE: u8 = 38; +pub const DECIMAL128_MAX_SCALE: i8 = 38; /// The maximum precision for [arrow_schema::DataType::Decimal256] values pub const DECIMAL256_MAX_PRECISION: u8 = 76; /// The maximum scale for [arrow_schema::DataType::Decimal256] values -pub const DECIMAL256_MAX_SCALE: u8 = 76; +pub const DECIMAL256_MAX_SCALE: i8 = 76; /// The default scale for [arrow_schema::DataType::Decimal128] and /// [arrow_schema::DataType::Decimal256] values -pub const DECIMAL_DEFAULT_SCALE: u8 = 10; +pub const DECIMAL_DEFAULT_SCALE: i8 = 10; /// Validates that the specified `i128` value can be properly /// interpreted as a Decimal number with precision `precision` diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index cf85902e4ce7..f74e2a24b04f 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -190,14 +190,14 @@ pub enum DataType { /// * scale is the number of digits past the decimal /// /// For example the number 123.45 has precision 5 and scale 2. - Decimal128(u8, u8), + Decimal128(u8, i8), /// Exact 256-bit width decimal value with precision and scale /// /// * precision is the total number of digits /// * scale is the number of digits past the decimal /// /// For example the number 123.45 has precision 5 and scale 2. - Decimal256(u8, u8), + Decimal256(u8, i8), /// A Map is a logical nested type that is represented as /// /// `List>` diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index d498ae487c3e..857b6e3231ba 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -914,7 +914,7 @@ mod tests { options: Option, expected_data: Vec>, precision: &u8, - scale: &u8, + scale: &i8, ) -> Result<(), ArrowError> { let output = data .into_iter() @@ -1032,7 +1032,7 @@ mod tests { fn test_take_decimal128_non_null_indices() { let index = UInt32Array::from(vec![0, 5, 3, 1, 4, 2]); let precision: u8 = 10; - let scale: u8 = 5; + let scale: i8 = 5; test_take_decimal_arrays( vec![None, Some(3), Some(5), Some(2), Some(3), None], &index, @@ -1048,7 +1048,7 @@ mod tests { fn test_take_decimal128() { let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(2)]); let precision: u8 = 10; - let scale: u8 = 5; + let scale: i8 = 5; test_take_decimal_arrays( vec![Some(0), Some(1), Some(2), Some(3), Some(4)], &index, diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index e93c7860885c..7ef4d1d7e74a 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -84,7 +84,7 @@ fn build_utf8_date_time_array(size: usize, with_nulls: bool) -> ArrayRef { Arc::new(builder.finish()) } -fn build_decimal128_array(size: usize, precision: u8, scale: u8) -> ArrayRef { +fn build_decimal128_array(size: usize, precision: u8, scale: i8) -> ArrayRef { let mut rng = seedable_rng(); let mut builder = Decimal128Builder::with_capacity(size); @@ -99,7 +99,7 @@ fn build_decimal128_array(size: usize, precision: u8, scale: u8) -> ArrayRef { ) } -fn build_decimal256_array(size: usize, precision: u8, scale: u8) -> ArrayRef { +fn build_decimal256_array(size: usize, 
precision: u8, scale: i8) -> ArrayRef { let mut rng = seedable_rng(); let mut builder = Decimal256Builder::with_capacity(size); let mut bytes = [0; 32]; diff --git a/arrow/src/datatypes/ffi.rs b/arrow/src/datatypes/ffi.rs index ef303dfdd1ff..41addf24fbc2 100644 --- a/arrow/src/datatypes/ffi.rs +++ b/arrow/src/datatypes/ffi.rs @@ -103,7 +103,7 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { "The decimal type requires an integer precision".to_string(), ) })?; - let parsed_scale = scale.parse::().map_err(|_| { + let parsed_scale = scale.parse::().map_err(|_| { ArrowError::CDataInterface( "The decimal type requires an integer scale".to_string(), ) @@ -119,7 +119,7 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { "The decimal type requires an integer precision".to_string(), ) })?; - let parsed_scale = scale.parse::().map_err(|_| { + let parsed_scale = scale.parse::().map_err(|_| { ArrowError::CDataInterface( "The decimal type requires an integer scale".to_string(), ) diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 42f9ab277d40..3c08a592dd2c 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -31,7 +31,7 @@ use std::sync::Arc; fn create_decimal_array( array: Vec>, precision: u8, - scale: u8, + scale: i8, ) -> Decimal128Array { array .into_iter() From f749e1d9f19a5da9249b8e1d2429b10acde97805 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 23 Nov 2022 18:48:53 +0000 Subject: [PATCH 0308/1411] Return slice from GenericByteArray::value_data (#3171) --- arrow-array/src/array/byte_array.rs | 8 +++---- arrow-cast/src/cast.rs | 23 ++++++++------------ arrow/src/compute/kernels/concat_elements.rs | 13 +++-------- arrow/src/compute/kernels/substring.rs | 6 ++--- 4 files changed, 18 insertions(+), 32 deletions(-) diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 8dd206bd2639..8c2616624c0c 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -21,7 +21,7 @@ use crate::raw_pointer::RawPtrBox; use crate::types::bytes::ByteArrayNativeType; use crate::types::ByteArrayType; use crate::{Array, ArrayAccessor, OffsetSizeTrait}; -use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_buffer::ArrowNativeType; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; @@ -55,9 +55,9 @@ impl GenericByteArray { offsets[i + 1] - offsets[i] } - /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.data.buffers()[1].clone() + /// Returns the raw value data + pub fn value_data(&self) -> &[u8] { + self.data.buffers()[1].as_slice() } /// Returns the offset values in the offsets buffer diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 61be2171b7c1..3f17758255c7 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -3402,14 +3402,13 @@ where OffsetSizeFrom: OffsetSizeTrait + ToPrimitive, OffsetSizeTo: OffsetSizeTrait + NumCast + ArrowNativeType, { - let str_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - let list_data = array.data(); - let str_values_buf = str_array.value_data(); - - let offsets = list_data.buffers()[0].typed_data::(); + let data = array.data(); + assert_eq!( + data.data_type(), + &GenericStringArray::::DATA_TYPE + ); + let str_values_buf = data.buffers()[1].clone(); + let offsets = data.buffers()[0].typed_data::(); let mut offset_builder = BufferBuilder::::new(offsets.len()); offsets @@ -3426,18 +3425,14 @@ 
where let offset_buffer = offset_builder.finish(); - let dtype = if matches!(std::mem::size_of::(), 8) { - DataType::LargeUtf8 - } else { - DataType::Utf8 - }; + let dtype = GenericStringArray::::DATA_TYPE; let builder = ArrayData::builder(dtype) .offset(array.offset()) .len(array.len()) .add_buffer(offset_buffer) .add_buffer(str_values_buf) - .null_bit_buffer(list_data.null_buffer().cloned()); + .null_bit_buffer(data.null_buffer().cloned()); let array_data = unsafe { builder.build_unchecked() }; diff --git a/arrow/src/compute/kernels/concat_elements.rs b/arrow/src/compute/kernels/concat_elements.rs index 1c0a0925df74..a908ba9ab5d8 100644 --- a/arrow/src/compute/kernels/concat_elements.rs +++ b/arrow/src/compute/kernels/concat_elements.rs @@ -50,10 +50,8 @@ pub fn concat_elements_utf8( let left_offsets = left.value_offsets(); let right_offsets = right.value_offsets(); - let left_buffer = left.value_data(); - let right_buffer = right.value_data(); - let left_values = left_buffer.as_slice(); - let right_values = right_buffer.as_slice(); + let left_values = left.value_data(); + let right_values = right.value_data(); let mut output_values = BufferBuilder::::new( left_values.len() + right_values.len() @@ -115,16 +113,11 @@ pub fn concat_elements_utf8_many( size, )?; - let data_buffers = arrays + let data_values = arrays .iter() .map(|array| array.value_data()) .collect::>(); - let data_values = data_buffers - .iter() - .map(|buffer| buffer.as_slice()) - .collect::>(); - let mut offsets = arrays .iter() .map(|a| a.value_offsets().iter().peekable()) diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs index 76568ae0dac0..23cb2c19fddf 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow/src/compute/kernels/substring.rs @@ -253,8 +253,7 @@ fn binary_substring( length: Option, ) -> Result { let offsets = array.value_offsets(); - let values = array.value_data(); - let data = values.as_slice(); + let data = array.value_data(); let zero = OffsetSize::zero(); // start and end offsets of all substrings @@ -364,8 +363,7 @@ fn utf8_substring( length: Option, ) -> Result { let offsets = array.value_offsets(); - let values = array.value_data(); - let data = values.as_slice(); + let data = array.value_data(); let zero = OffsetSize::zero(); // Check if `offset` is at a valid char boundary. 
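For the `value_data` change above, a short sketch of the slice-returning API from a caller's point of view (the array contents are illustrative):

```
use arrow::array::StringArray;

fn main() {
    let array = StringArray::from(vec!["hello", "arrow"]);

    // value_data now borrows the raw value bytes directly as a slice,
    // where it previously returned a cloned Buffer that had to be sliced
    let bytes: &[u8] = array.value_data();
    assert_eq!(bytes, b"helloarrow");

    // the offsets still delimit the individual values inside that slice
    let offsets = array.value_offsets();
    assert_eq!(&bytes[offsets[1] as usize..offsets[2] as usize], b"arrow");
}
```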
From cea5146b69b3413a1d5caa946e0774ec8d834e95 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 23 Nov 2022 13:56:07 -0500 Subject: [PATCH 0309/1411] Add RowSelection::iter(), Into> and example (#3173) --- parquet/src/arrow/arrow_reader/selection.rs | 54 ++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index d5c4ce5ea450..487e95fcd831 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -59,6 +59,30 @@ impl RowSelector { /// A typical use-case would be using the [`PageIndex`] to filter out rows /// that don't satisfy a predicate /// +/// # Example +/// ``` +/// use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; +/// +/// let selectors = vec![ +/// RowSelector { row_count: 5, skip: true }, +/// RowSelector { row_count: 5, skip: false }, +/// RowSelector { row_count: 5, skip: false }, +/// RowSelector { row_count: 5, skip: true }, +/// ]; +/// +/// // Creating a selection will combine adjacent selectors +/// let selection: RowSelection = selectors.into(); +/// +/// let expected = vec![ +/// RowSelector { row_count: 5, skip: true }, +/// RowSelector { row_count: 10, skip: false }, +/// RowSelector { row_count: 5, skip: true }, +/// ]; +/// +/// let actual: Vec = selection.into(); +/// assert_eq!(actual, expected); +/// ``` +/// /// [`PageIndex`]: [crate::file::page_index::index::PageIndex] #[derive(Debug, Clone, Default, Eq, PartialEq)] pub struct RowSelection { @@ -243,7 +267,6 @@ impl RowSelection { selectors: remaining, } } - /// Given a [`RowSelection`] computed under `self`, returns the [`RowSelection`] /// representing their conjunction /// @@ -347,6 +370,12 @@ impl RowSelection { } self } + + /// Returns an iterator over the [`RowSelector`]s for this + /// [`RowSelection`]. 
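The selector combination described above, together with the new `iter()` and `Into<Vec<RowSelector>>`, in a short sketch (the skip/select counts are arbitrary):

```
use parquet::arrow::arrow_reader::{RowSelection, RowSelector};

fn main() {
    // skip the first 100 rows of a row group, then read the next 50
    let selection = RowSelection::from(vec![
        RowSelector::skip(100),
        RowSelector::select(50),
    ]);

    // iter() borrows the (possibly combined) selectors for inspection
    let selected: usize = selection
        .iter()
        .filter(|s| !s.skip)
        .map(|s| s.row_count)
        .sum();
    assert_eq!(selected, 50);

    // Into<Vec<RowSelector>> hands the selectors back by value
    let selectors: Vec<RowSelector> = selection.into();
    assert_eq!(selectors.len(), 2);
}
```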
+ pub fn iter(&self) -> impl Iterator { + self.selectors.iter() + } } impl From> for RowSelection { @@ -355,6 +384,12 @@ impl From> for RowSelection { } } +impl From for Vec { + fn from(r: RowSelection) -> Self { + r.selectors + } +} + impl From for VecDeque { fn from(r: RowSelection) -> Self { r.selectors.into() @@ -789,6 +824,23 @@ mod tests { } } + #[test] + fn test_iter() { + // use the iter() API to show it does what is expected and + // avoid accidental deletion + let selectors = vec![ + RowSelector::select(3), + RowSelector::skip(33), + RowSelector::select(4), + ]; + + let round_tripped = RowSelection::from(selectors.clone()) + .iter() + .cloned() + .collect::>(); + assert_eq!(selectors, round_tripped); + } + #[test] fn test_scan_ranges() { let index = vec![ From 1d22fe3c23cc6ea1fb1df560c35f73cfdad96612 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 24 Nov 2022 08:59:37 +0000 Subject: [PATCH 0310/1411] Add RowParser (#3174) --- arrow/src/row/mod.rs | 97 +++++++++++++++++++++++++++++++++------ arrow/src/row/variable.rs | 14 +++--- 2 files changed, 90 insertions(+), 21 deletions(-) diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index c57fd41ebc02..058c35869d20 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -437,7 +437,12 @@ impl RowConverter { }) .collect::>>()?; - let mut rows = new_empty_rows(columns, &dictionaries, Arc::clone(&self.fields)); + let config = RowConfig { + fields: Arc::clone(&self.fields), + // Don't need to validate UTF-8 as came from arrow array + validate_utf8: false, + }; + let mut rows = new_empty_rows(columns, &dictionaries, config); for ((column, field), dictionary) in columns.iter().zip(self.fields.iter()).zip(dictionaries) @@ -465,14 +470,15 @@ impl RowConverter { where I: IntoIterator>, { + let mut validate_utf8 = false; let mut rows: Vec<_> = rows .into_iter() .map(|row| { assert!( - Arc::ptr_eq(row.fields, &self.fields), + Arc::ptr_eq(&row.config.fields, &self.fields), "rows were not produced by this RowConverter" ); - + validate_utf8 |= row.config.validate_utf8; row.data }) .collect(); @@ -484,11 +490,18 @@ impl RowConverter { // SAFETY // We have validated that the rows came from this [`RowConverter`] // and therefore must be valid - unsafe { decode_column(field, &mut rows, interner.as_deref()) } + unsafe { + decode_column(field, &mut rows, interner.as_deref(), validate_utf8) + } }) .collect() } + /// Returns a [`RowParser`] that can be used to parse [`Row`] from bytes + pub fn parser(&self) -> RowParser { + RowParser::new(Arc::clone(&self.fields)) + } + /// Returns the size of this instance in bytes /// /// Includes the size of `Self`. @@ -505,6 +518,43 @@ impl RowConverter { } } +/// A [`RowParser`] can be created from a [`RowConverter`] and used to parse bytes to [`Row`] +#[derive(Debug)] +pub struct RowParser { + config: RowConfig, +} + +impl RowParser { + fn new(fields: Arc<[SortField]>) -> Self { + Self { + config: RowConfig { + fields, + validate_utf8: true, + }, + } + } + + /// Creates a [`Row`] from the provided `bytes`. 
+ /// + /// `bytes` must be a [`Row`] produced by the [`RowConverter`] associated with + /// this [`RowParser`], otherwise subsequent operations with the produced [`Row`] may panic + pub fn parse<'a>(&'a self, bytes: &'a [u8]) -> Row<'a> { + Row { + data: bytes, + config: &self.config, + } + } +} + +/// The config of a given set of [`Row`] +#[derive(Debug, Clone)] +struct RowConfig { + /// The schema for these rows + fields: Arc<[SortField]>, + /// Whether to run UTF-8 validation when converting to arrow arrays + validate_utf8: bool, +} + /// A row-oriented representation of arrow data, that is normalized for comparison. /// /// See the [module level documentation](self) and [`RowConverter`] for more details. @@ -514,8 +564,8 @@ pub struct Rows { buffer: Box<[u8]>, /// Row `i` has data `&buffer[offsets[i]..offsets[i+1]]` offsets: Box<[usize]>, - /// The schema for these rows - fields: Arc<[SortField]>, + /// The config for these rows + config: RowConfig, } impl Rows { @@ -524,7 +574,7 @@ impl Rows { let start = self.offsets[row]; Row { data: &self.buffer[start..end], - fields: &self.fields, + config: &self.config, } } @@ -614,7 +664,7 @@ impl<'a> DoubleEndedIterator for RowsIter<'a> { #[derive(Debug, Copy, Clone)] pub struct Row<'a> { data: &'a [u8], - fields: &'a Arc<[SortField]>, + config: &'a RowConfig, } impl<'a> Row<'a> { @@ -622,7 +672,7 @@ impl<'a> Row<'a> { pub fn owned(&self) -> OwnedRow { OwnedRow { data: self.data.to_vec(), - fields: Arc::clone(self.fields), + config: self.config.clone(), } } } @@ -672,7 +722,7 @@ impl<'a> AsRef<[u8]> for Row<'a> { #[derive(Debug, Clone)] pub struct OwnedRow { data: Vec, - fields: Arc<[SortField]>, + config: RowConfig, } impl OwnedRow { @@ -682,7 +732,7 @@ impl OwnedRow { pub fn row(&self) -> Row<'_> { Row { data: &self.data, - fields: &self.fields, + config: &self.config, } } } @@ -739,7 +789,7 @@ fn null_sentinel(options: SortOptions) -> u8 { fn new_empty_rows( cols: &[ArrayRef], dictionaries: &[Option>>], - fields: Arc<[SortField]>, + config: RowConfig, ) -> Rows { use fixed::FixedLengthEncoding; @@ -816,7 +866,7 @@ fn new_empty_rows( Rows { buffer: buffer.into(), offsets: offsets.into(), - fields, + config, } } @@ -872,6 +922,7 @@ unsafe fn decode_column( field: &SortField, rows: &mut [&[u8]], interner: Option<&OrderPreservingInterner>, + validate_utf8: bool, ) -> Result { let options = field.options; let data_type = field.data_type.clone(); @@ -881,8 +932,8 @@ unsafe fn decode_column( DataType::Boolean => Arc::new(decode_bool(rows, options)), DataType::Binary => Arc::new(decode_binary::(rows, options)), DataType::LargeBinary => Arc::new(decode_binary::(rows, options)), - DataType::Utf8 => Arc::new(decode_string::(rows, options)), - DataType::LargeUtf8 => Arc::new(decode_string::(rows, options)), + DataType::Utf8 => Arc::new(decode_string::(rows, options, validate_utf8)), + DataType::LargeUtf8 => Arc::new(decode_string::(rows, options, validate_utf8)), DataType::Dictionary(k, v) => match k.as_ref() { DataType::Int8 => Arc::new(decode_dictionary::( interner.unwrap(), @@ -1373,6 +1424,22 @@ mod tests { assert!(rows.row(3) < rows.row(0)); } + #[test] + #[should_panic(expected = "Invalid UTF8 sequence at string")] + fn test_invalid_utf8() { + let mut converter = + RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap(); + let array = Arc::new(BinaryArray::from_iter_values([&[0xFF]])) as _; + let rows = converter.convert_columns(&[array]).unwrap(); + let binary_row = rows.row(0); + + let converter = 
RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap(); + let parser = converter.parser(); + let utf8_row = parser.parse(binary_row.as_ref()); + + converter.convert_rows(std::iter::once(utf8_row)).unwrap(); + } + #[test] #[should_panic(expected = "rows were not produced by this RowConverter")] fn test_different_converter() { diff --git a/arrow/src/row/variable.rs b/arrow/src/row/variable.rs index 36f337e658b6..3aa0b4839435 100644 --- a/arrow/src/row/variable.rs +++ b/arrow/src/row/variable.rs @@ -214,16 +214,18 @@ pub fn decode_binary( pub unsafe fn decode_string( rows: &mut [&[u8]], options: SortOptions, + validate_utf8: bool, ) -> GenericStringArray { - let d = match I::IS_LARGE { - true => DataType::LargeUtf8, - false => DataType::Utf8, - }; + let decoded = decode_binary::(rows, options); + + if validate_utf8 { + return GenericStringArray::from(decoded); + } - let builder = decode_binary::(rows, options) + let builder = decoded .into_data() .into_builder() - .data_type(d); + .data_type(GenericStringArray::::DATA_TYPE); // SAFETY: // Row data must have come from a valid UTF-8 array From 8ba78427ef2fea52ffabe91104b74b17906b3772 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 24 Nov 2022 09:47:21 +0000 Subject: [PATCH 0311/1411] Faster BinaryArray to StringArray conversion (#3168) * Faster ByteArray to StringArray conversion * Add benchmark * Fix logical conflict --- arrow-array/src/array/string_array.rs | 16 +++++++++++++++- arrow/benches/array_data_validate.rs | 6 ++++++ arrow/src/row/mod.rs | 2 +- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 8d92093f5ce8..fb3bb23179b5 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -216,8 +216,22 @@ impl From> for GenericStringArray { fn from(v: GenericBinaryArray) -> Self { + let offsets = v.value_offsets(); + let values = v.value_data(); + + // We only need to validate that all values are valid UTF-8 + let validated = std::str::from_utf8(values).expect("Invalid UTF-8 sequence"); + for offset in offsets.iter() { + assert!( + validated.is_char_boundary(offset.as_usize()), + "Invalid UTF-8 sequence" + ) + } + let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE); - Self::from(builder.build().unwrap()) + // SAFETY: + // Validated UTF-8 above + Self::from(unsafe { builder.build_unchecked() }) } } diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs index 3cd13c09c58a..3b0fdbe63c97 100644 --- a/arrow/benches/array_data_validate.rs +++ b/arrow/benches/array_data_validate.rs @@ -52,6 +52,12 @@ fn validate_benchmark(c: &mut Criterion) { c.bench_function("validate_utf8_array_data 20000", |b| { b.iter(|| validate_utf8_array(&str_arr)) }); + + let byte_array = + BinaryArray::from_iter_values(std::iter::repeat(b"test").take(20000)); + c.bench_function("byte_array_to_string_array 20000", |b| { + b.iter(|| StringArray::from(BinaryArray::from(byte_array.data().clone()))) + }); } criterion_group!(benches, validate_benchmark); diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 058c35869d20..6ce9f2b12c25 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -1425,7 +1425,7 @@ mod tests { } #[test] - #[should_panic(expected = "Invalid UTF8 sequence at string")] + #[should_panic(expected = "Invalid UTF-8 sequence")] fn test_invalid_utf8() { let mut converter = 
RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap(); From 5640a5b0c7d456a68c7c0cc562425ccf5494ecec Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 24 Nov 2022 18:16:18 +0800 Subject: [PATCH 0312/1411] bloom filter part IV: adjust writer properties, bloom filter properties, and incorporate into column encoder (#3165) * rework bloom filter 1. update number of properties 2. push down hashing to encoder level 3. add more docs * move bloom filter * update prompt * remove unused updates --- parquet/src/arrow/array_reader/mod.rs | 2 +- parquet/src/arrow/arrow_writer/byte_array.rs | 6 + parquet/src/bin/parquet-fromcsv-help.txt | 9 +- parquet/src/bin/parquet-fromcsv.rs | 22 +- parquet/src/bin/parquet-show-bloom-filter.rs | 2 +- parquet/src/bloom_filter/mod.rs | 43 +-- parquet/src/column/writer/encoder.rs | 32 ++ parquet/src/column/writer/mod.rs | 22 +- parquet/src/file/properties.rs | 297 ++++++++++++------- 9 files changed, 284 insertions(+), 151 deletions(-) diff --git a/parquet/src/arrow/array_reader/mod.rs b/parquet/src/arrow/array_reader/mod.rs index aede5e86c693..f46f6073a714 100644 --- a/parquet/src/arrow/array_reader/mod.rs +++ b/parquet/src/arrow/array_reader/mod.rs @@ -102,7 +102,7 @@ pub trait RowGroupCollection { /// Get schema of parquet file. fn schema(&self) -> SchemaDescPtr; - /// Get the numer of rows in this collection + /// Get the number of rows in this collection fn num_rows(&self) -> usize; /// Returns an iterator over the column chunks for particular column diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index d52317852805..d870ac54fe4d 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -17,6 +17,7 @@ use crate::arrow::arrow_writer::levels::LevelInfo; use crate::basic::Encoding; +use crate::bloom_filter::Sbbf; use crate::column::page::PageWriter; use crate::column::writer::encoder::{ ColumnValueEncoder, DataPageValues, DictionaryPage, @@ -451,6 +452,11 @@ impl ColumnValueEncoder for ByteArrayEncoder { } } + fn flush_bloom_filter(&mut self) -> Option { + // TODO FIX ME need to handle bloom filter in arrow writer + None + } + fn try_new(descr: &ColumnDescPtr, props: &WriterProperties) -> Result where Self: Sized, diff --git a/parquet/src/bin/parquet-fromcsv-help.txt b/parquet/src/bin/parquet-fromcsv-help.txt index f599a13f0f18..ec7eb0cc13f1 100644 --- a/parquet/src/bin/parquet-fromcsv-help.txt +++ b/parquet/src/bin/parquet-fromcsv-help.txt @@ -37,10 +37,10 @@ Options: [possible values: lf, crlf, cr] -e, --escape-char - escape charactor + escape character -q, --quote-char - quate charactor + quote character -D, --double-quote double quote @@ -58,6 +58,11 @@ Options: -m, --max-row-group-size max row group size + --enable-bloom-filter + whether to enable bloom filter writing + + [possible values: true, false] + --help display usage help diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index 5fdece7cc8a3..b11f3406cb34 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -57,11 +57,11 @@ //! //! - `-i`, `--input-file` : Path to input CSV file //! - `-f`, `--input-format` : Dialect for input file, `csv` or `tsv`. -//! - `-d`, `--delimiter : Field delimitor for CSV file, default depends `--input-format` -//! - `-e`, `--escape` : Escape charactor for input file +//! - `-d`, `--delimiter : Field delimiter for CSV file, default depends `--input-format` +//! 
- `-e`, `--escape` : Escape character for input file //! - `-h`, `--has-header` : Input has header -//! - `-r`, `--record-terminator` : Record terminator charactor for input. default is CRLF -//! - `-q`, `--quote-char` : Input quoting charactor +//! - `-r`, `--record-terminator` : Record terminator character for input. default is CRLF +//! - `-q`, `--quote-char` : Input quoting character //! use std::{ @@ -182,9 +182,9 @@ struct Args { delimiter: Option, #[clap(value_enum, short, long, help("record terminator"))] record_terminator: Option, - #[clap(short, long, help("escape charactor"))] + #[clap(short, long, help("escape character"))] escape_char: Option, - #[clap(short, long, help("quate charactor"))] + #[clap(short, long, help("quote character"))] quote_char: Option, #[clap(short('D'), long, help("double quote"))] double_quote: Option, @@ -197,6 +197,8 @@ struct Args { writer_version: Option, #[clap(short, long, help("max row group size"))] max_row_group_size: Option, + #[clap(long, help("whether to enable bloom filter writing"))] + enable_bloom_filter: Option, #[clap(long, action=clap::ArgAction::Help, help("display usage help"))] help: Option, @@ -290,6 +292,10 @@ fn configure_writer_properties(args: &Args) -> WriterProperties { properties_builder = properties_builder.set_max_row_group_size(max_row_group_size); } + if let Some(enable_bloom_filter) = args.enable_bloom_filter { + properties_builder = + properties_builder.set_bloom_filter_enabled(enable_bloom_filter); + } properties_builder.build() } @@ -548,6 +554,7 @@ mod tests { parquet_compression: Compression::SNAPPY, writer_version: None, max_row_group_size: None, + enable_bloom_filter: None, help: None, }; let arrow_schema = Arc::new(Schema::new(vec![ @@ -582,6 +589,7 @@ mod tests { parquet_compression: Compression::SNAPPY, writer_version: None, max_row_group_size: None, + enable_bloom_filter: None, help: None, }; let arrow_schema = Arc::new(Schema::new(vec![ @@ -636,6 +644,8 @@ mod tests { parquet_compression: Compression::SNAPPY, writer_version: None, max_row_group_size: None, + // by default we shall test bloom filter writing + enable_bloom_filter: Some(true), help: None, }; convert_csv_to_parquet(&args).unwrap(); diff --git a/parquet/src/bin/parquet-show-bloom-filter.rs b/parquet/src/bin/parquet-show-bloom-filter.rs index 28493a94c490..55ecb2abf134 100644 --- a/parquet/src/bin/parquet-show-bloom-filter.rs +++ b/parquet/src/bin/parquet-show-bloom-filter.rs @@ -83,7 +83,7 @@ fn main() { println!( "Value {} is {} in bloom filter", value, - if sbbf.check(value.as_str()) { + if sbbf.check(&value.as_str()) { "present" } else { "absent" diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index 4efba3834ded..15c38cf5915b 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -16,7 +16,7 @@ // under the License. //! Bloom filter implementation specific to Parquet, as described -//! in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md) +//! in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md). use crate::data_type::AsBytes; use crate::errors::ParquetError; @@ -35,7 +35,7 @@ use thrift::protocol::{ }; use twox_hash::XxHash64; -/// Salt as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach) +/// Salt as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach). 
const SALT: [u32; 8] = [ 0x47b6137b_u32, 0x44974d91_u32, @@ -83,7 +83,9 @@ fn block_check(block: &Block, hash: u32) -> bool { true } -/// A split block Bloom filter +/// A split block Bloom filter. The creation of this structure is based on the +/// [`crate::file::properties::BloomFilterProperties`] struct set via [`crate::file::properties::WriterProperties`] and +/// is thus hidden by default. #[derive(Debug, Clone)] pub struct Sbbf(Vec); @@ -118,8 +120,8 @@ fn read_bloom_filter_header_and_length( )) } -const BITSET_MIN_LENGTH: usize = 32; -const BITSET_MAX_LENGTH: usize = 128 * 1024 * 1024; +pub(crate) const BITSET_MIN_LENGTH: usize = 32; +pub(crate) const BITSET_MAX_LENGTH: usize = 128 * 1024 * 1024; #[inline] fn optimal_num_of_bytes(num_bytes: usize) -> usize { @@ -141,15 +143,20 @@ fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize { impl Sbbf { /// Create a new [Sbbf] with given number of distinct values and false positive probability. /// Will panic if `fpp` is greater than 1.0 or less than 0.0. - pub fn new_with_ndv_fpp(ndv: u64, fpp: f64) -> Self { - assert!((0.0..-1.0).contains(&fpp), "invalid fpp: {}", fpp); + pub(crate) fn new_with_ndv_fpp(ndv: u64, fpp: f64) -> Result { + if !(0.0..1.0).contains(&fpp) { + return Err(ParquetError::General(format!( + "False positive probability must be between 0.0 and 1.0, got {}", + fpp + ))); + } let num_bits = num_of_bits_from_ndv_fpp(ndv, fpp); - Self::new_with_num_of_bytes(num_bits / 8) + Ok(Self::new_with_num_of_bytes(num_bits / 8)) } /// Create a new [Sbbf] with given number of bytes, the exact number of bytes will be adjusted - /// to the next power of two bounded by `BITSET_MIN_LENGTH` and `BITSET_MAX_LENGTH`. - pub fn new_with_num_of_bytes(num_bytes: usize) -> Self { + /// to the next power of two bounded by [BITSET_MIN_LENGTH] and [BITSET_MAX_LENGTH]. + pub(crate) fn new_with_num_of_bytes(num_bytes: usize) -> Self { let num_bytes = optimal_num_of_bytes(num_bytes); let bitset = vec![0_u8; num_bytes]; Self::new(&bitset) @@ -170,7 +177,7 @@ impl Sbbf { } /// Write the bloom filter data (header and then bitset) to the output - pub fn write(&self, mut writer: W) -> Result<(), ParquetError> { + pub(crate) fn write(&self, mut writer: W) -> Result<(), ParquetError> { let mut protocol = TCompactOutputProtocol::new(&mut writer); let header = self.header(); header.write_to_out_protocol(&mut protocol).map_err(|e| { @@ -208,7 +215,7 @@ impl Sbbf { } /// Read a new bloom filter from the given offset in the given reader. 
- pub fn read_from_column_chunk( + pub(crate) fn read_from_column_chunk( column_metadata: &ColumnChunkMetaData, reader: Arc, ) -> Result, ParquetError> { @@ -254,7 +261,7 @@ impl Sbbf { } /// Insert an [AsBytes] value into the filter - pub fn insert(&mut self, value: T) { + pub fn insert(&mut self, value: &T) { self.insert_hash(hash_as_bytes(value)); } @@ -266,7 +273,7 @@ impl Sbbf { } /// Check if an [AsBytes] value is probably present or definitely absent in the filter - pub fn check(&self, value: T) -> bool { + pub fn check(&self, value: &T) -> bool { self.check_hash(hash_as_bytes(value)) } @@ -284,7 +291,7 @@ impl Sbbf { const SEED: u64 = 0; #[inline] -fn hash_as_bytes(value: A) -> u64 { +fn hash_as_bytes(value: &A) -> u64 { let mut hasher = XxHash64::with_seed(SEED); hasher.write(value.as_bytes()); hasher.finish() @@ -324,8 +331,8 @@ mod tests { fn test_sbbf_insert_and_check() { let mut sbbf = Sbbf(vec![[0_u32; 8]; 1_000]); for i in 0..1_000_000 { - sbbf.insert(i); - assert!(sbbf.check(i)); + sbbf.insert(&i); + assert!(sbbf.check(&i)); } } @@ -339,7 +346,7 @@ mod tests { let sbbf = Sbbf::new(bitset); for a in 0..10i64 { let value = format!("a{}", a); - assert!(sbbf.check(value.as_str())); + assert!(sbbf.check(&value.as_str())); } } diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index 22cc71f6cd5e..0d0716d7a7d5 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -16,6 +16,7 @@ // under the License. use crate::basic::Encoding; +use crate::bloom_filter::Sbbf; use crate::column::writer::{ compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, update_min, @@ -24,6 +25,7 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::DataType; use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder}; use crate::errors::{ParquetError, Result}; +use crate::file::properties::BloomFilterProperties; use crate::file::properties::{EnabledStatistics, WriterProperties}; use crate::schema::types::{ColumnDescPtr, ColumnDescriptor}; use crate::util::memory::ByteBufferPtr; @@ -115,6 +117,11 @@ pub trait ColumnValueEncoder { /// Flush the next data page for this column chunk fn flush_data_page(&mut self) -> Result>; + + /// Flushes bloom filter if enabled and returns it, otherwise returns `None`. Subsequent writes + /// will *not* be tracked by the bloom filter as it is empty since. This should be called once + /// near the end of encoding. 
+ fn flush_bloom_filter(&mut self) -> Option; } pub struct ColumnValueEncoderImpl { @@ -125,6 +132,7 @@ pub struct ColumnValueEncoderImpl { statistics_enabled: EnabledStatistics, min_value: Option, max_value: Option, + bloom_filter: Option, } impl ColumnValueEncoderImpl { @@ -136,6 +144,13 @@ impl ColumnValueEncoderImpl { } } + // encode the values into bloom filter if enabled + if let Some(bloom_filter) = &mut self.bloom_filter { + for value in slice { + bloom_filter.insert(value); + } + } + match &mut self.dict_encoder { Some(encoder) => encoder.put(slice), _ => self.encoder.put(slice), @@ -161,6 +176,10 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { } } + fn flush_bloom_filter(&mut self) -> Option { + self.bloom_filter.take() + } + fn try_new(descr: &ColumnDescPtr, props: &WriterProperties) -> Result { let dict_supported = props.dictionary_enabled(descr.path()) && has_dictionary_support(T::get_physical_type(), props); @@ -175,12 +194,25 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { let statistics_enabled = props.statistics_enabled(descr.path()); + let bloom_filter_enabled = props.bloom_filter_enabled(descr.path()); + let bloom_filter = + if let Some(BloomFilterProperties { ndv, fpp }) = bloom_filter_enabled { + Sbbf::new_with_ndv_fpp(ndv, fpp) + .map_err(|e| { + eprintln!("invalid bloom filter properties: {}", e); + }) + .ok() + } else { + None + }; + Ok(Self { encoder, dict_encoder, descr: descr.clone(), num_values: 0, statistics_enabled, + bloom_filter, min_value: None, max_value: None, }) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index ae7920e22839..40f8c99403f0 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -212,10 +212,6 @@ pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> { def_levels_sink: Vec, rep_levels_sink: Vec, data_pages: VecDeque, - - // bloom filter - bloom_filter: Option, - // column index and offset index column_index_builder: ColumnIndexBuilder, offset_index_builder: OffsetIndexBuilder, @@ -238,19 +234,6 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // Used for level information encodings.insert(Encoding::RLE); - let bloom_filter_enabled = props.bloom_filter_enabled(descr.path()); - let bloom_filter = if bloom_filter_enabled { - if let Some(ndv) = props.bloom_filter_ndv(descr.path()) { - let fpp = props.bloom_filter_fpp(descr.path()); - Some(Sbbf::new_with_ndv_fpp(ndv, fpp)) - } else { - let max_bytes = props.bloom_filter_max_bytes(descr.path()); - Some(Sbbf::new_with_num_of_bytes(max_bytes as usize)) - } - } else { - None - }; - Self { descr, props, @@ -280,7 +263,6 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { num_column_nulls: 0, column_distinct_count: None, }, - bloom_filter, column_index_builder: ColumnIndexBuilder::new(), offset_index_builder: OffsetIndexBuilder::new(), encodings, @@ -454,7 +436,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { &self.descr } - /// Finalises writes and closes the column writer. + /// Finalizes writes and closes the column writer. /// Returns total bytes written, total rows written and column chunk metadata. 
pub fn close(mut self) -> Result { if self.page_metrics.num_buffered_values > 0 { @@ -479,7 +461,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Ok(ColumnCloseResult { bytes_written: self.column_metrics.total_bytes_written, rows_written: self.column_metrics.total_rows_written, - bloom_filter: self.bloom_filter, + bloom_filter: self.encoder.flush_bloom_filter(), metadata, column_index, offset_index, diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 03117d4cb077..6d30be2e4baf 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -64,7 +64,6 @@ //! .build(); //! ``` -use paste::paste; use std::{collections::HashMap, sync::Arc}; use crate::basic::{Compression, Encoding}; @@ -83,9 +82,10 @@ const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page; const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; const DEFAULT_CREATED_BY: &str = env!("PARQUET_CREATED_BY"); -const DEFAULT_BLOOM_FILTER_ENABLED: bool = false; -const DEFAULT_BLOOM_FILTER_MAX_BYTES: u32 = 1024 * 1024; -const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.01; +/// default value for the false positive probability used in a bloom filter. +pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05; +/// default value for the expected number of distinct values used in a bloom filter. +pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64; /// Parquet writer version. /// @@ -129,26 +129,6 @@ pub struct WriterProperties { sorting_columns: Option>, } -macro_rules! def_col_property_getter { - ($field:ident, $field_type:ty) => { - pub fn $field(&self, col: &ColumnPath) -> Option<$field_type> { - self.column_properties - .get(col) - .and_then(|c| c.$field()) - .or_else(|| self.default_column_properties.$field()) - } - }; - ($field:ident, $field_type:ty, $default_val:expr) => { - pub fn $field(&self, col: &ColumnPath) -> $field_type { - self.column_properties - .get(col) - .and_then(|c| c.$field()) - .or_else(|| self.default_column_properties.$field()) - .unwrap_or($default_val) - } - }; -} - impl WriterProperties { /// Returns builder for writer properties with default values. pub fn builder() -> WriterPropertiesBuilder { @@ -280,10 +260,17 @@ impl WriterProperties { .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE) } - def_col_property_getter!(bloom_filter_enabled, bool, DEFAULT_BLOOM_FILTER_ENABLED); - def_col_property_getter!(bloom_filter_fpp, f64, DEFAULT_BLOOM_FILTER_FPP); - def_col_property_getter!(bloom_filter_ndv, u64); - def_col_property_getter!(bloom_filter_max_bytes, u32, DEFAULT_BLOOM_FILTER_MAX_BYTES); + /// Returns whether bloom filter is enabled for a given column. Bloom filter can be enabled over + /// all or for a specific column, and is by default set to be disabled. + pub fn bloom_filter_enabled( + &self, + col: &ColumnPath, + ) -> Option { + self.column_properties + .get(col) + .and_then(|c| c.bloom_filter_enabled()) + .or_else(|| self.default_column_properties.bloom_filter_enabled()) + } } /// Writer properties builder. @@ -301,52 +288,6 @@ pub struct WriterPropertiesBuilder { sorting_columns: Option>, } -macro_rules! def_opt_field_setter { - ($field: ident, $type: ty) => { - paste! { - pub fn [](&mut self, value: $type) -> &mut Self { - self.$field = Some(value); - self - } - } - }; - ($field: ident, $type: ty, $min_value:expr, $max_value:expr) => { - paste! 
{ - pub fn [](&mut self, value: $type) -> &mut Self { - if ($min_value..=$max_value).contains(&value) { - self.$field = Some(value); - } else { - self.$field = None - } - self - } - } - }; -} - -macro_rules! def_opt_field_getter { - ($field: ident, $type: ty) => { - paste! { - #[doc = "Returns " $field " if set."] - pub fn $field(&self) -> Option<$type> { - self.$field - } - } - }; -} - -macro_rules! def_per_col_setter { - ($field:ident, $field_type:ty) => { - paste! { - #[doc = "Sets " $field " for a column. Takes precedence over globally defined settings."] - pub fn [](mut self, col: ColumnPath, value: $field_type) -> Self { - self.get_mut_props(col).[](value); - self - } - } - } -} - impl WriterPropertiesBuilder { /// Returns default state of the builder. fn with_defaults() -> Self { @@ -506,6 +447,30 @@ impl WriterPropertiesBuilder { self } + /// Sets whether bloom filter is enabled for any column. + /// If the bloom filter is enabled previously then it is a no-op. + /// If the bloom filter is not yet enabled, a default set of ndv and fpp value will be used. + /// You can use [`set_bloom_filter_ndv`](Self::set_bloom_filter_ndv) and [`set_bloom_filter_fpp`](Self::set_bloom_filter_fpp) to further adjust the ndv and fpp. + pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self { + self.default_column_properties + .set_bloom_filter_enabled(value); + self + } + + /// Sets bloom filter false positive probability (fpp) for any column. + /// Implicitly [`set_bloom_filter_enabled`](Self::set_bloom_filter_enabled). + pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self { + self.default_column_properties.set_bloom_filter_fpp(value); + self + } + + /// Sets number of distinct values (ndv) for bloom filter for any column. + /// Implicitly [`set_bloom_filter_enabled`](Self::set_bloom_filter_enabled). + pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self { + self.default_column_properties.set_bloom_filter_ndv(value); + self + } + // ---------------------------------------------------------------------- // Setters for a specific column @@ -568,10 +533,33 @@ impl WriterPropertiesBuilder { self } - def_per_col_setter!(bloom_filter_enabled, bool); - def_per_col_setter!(bloom_filter_fpp, f64); - def_per_col_setter!(bloom_filter_max_bytes, u32); - def_per_col_setter!(bloom_filter_ndv, u64); + /// Sets whether a bloom filter should be created for a specific column. + /// The behavior is similar to [`set_bloom_filter_enabled`](Self::set_bloom_filter_enabled). + /// Takes precedence over globally defined settings. + pub fn set_column_bloom_filter_enabled( + mut self, + col: ColumnPath, + value: bool, + ) -> Self { + self.get_mut_props(col).set_bloom_filter_enabled(value); + self + } + + /// Sets the false positive probability for bloom filter for a specific column. + /// The behavior is similar to [`set_bloom_filter_fpp`](Self::set_bloom_filter_fpp) but will + /// override the default. + pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self { + self.get_mut_props(col).set_bloom_filter_fpp(value); + self + } + + /// Sets the number of distinct values for bloom filter for a specific column. + /// The behavior is similar to [`set_bloom_filter_ndv`](Self::set_bloom_filter_ndv) but will + /// override the default. 
+ pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self { + self.get_mut_props(col).set_bloom_filter_ndv(value); + self + } } /// Controls the level of statistics to be computed by the writer @@ -591,6 +579,43 @@ impl Default for EnabledStatistics { } } +/// Controls the bloom filter to be computed by the writer. +#[derive(Debug, Clone, PartialEq)] +pub struct BloomFilterProperties { + /// False positive probability, should be always between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`]. + /// + /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`]. + /// + /// The bloom filter data structure is a trade of between disk and memory space versus fpp, the + /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value + /// e.g. 0.1, 0.05, or 0.001 is recommended. + /// + /// Setting to very small number diminishes the value of the filter itself, as the bitset size is + /// even larger than just storing the whole value. You are also expected to set `ndv` if it can + /// be known in advance in order to largely reduce space usage. + pub fpp: f64, + /// Number of distinct values, should be non-negative to be meaningful. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`]. + /// + /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`]. + /// + /// Usage of bloom filter is most beneficial for columns with large cardinality, so a good heuristic + /// is to set ndv to number of rows. However it can reduce disk size if you know in advance a smaller + /// number of distinct values. For very small ndv value it is probably not worth it to use bloom filter + /// anyway. + /// + /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size. + pub ndv: u64, +} + +impl Default for BloomFilterProperties { + fn default() -> Self { + BloomFilterProperties { + fpp: DEFAULT_BLOOM_FILTER_FPP, + ndv: DEFAULT_BLOOM_FILTER_NDV, + } + } +} + /// Container for column properties that can be changed as part of writer. /// /// If a field is `None`, it means that no specific value has been set for this column, @@ -602,14 +627,8 @@ struct ColumnProperties { dictionary_enabled: Option, statistics_enabled: Option, max_statistics_size: Option, - /// bloom filter enabled - bloom_filter_enabled: Option, - /// bloom filter expected number of distinct values - bloom_filter_ndv: Option, - /// bloom filter false positive probability - bloom_filter_fpp: Option, - /// bloom filter max number of bytes - bloom_filter_max_bytes: Option, + /// bloom filter related properties + bloom_filter_enabled: Option, } impl ColumnProperties { @@ -649,10 +668,45 @@ impl ColumnProperties { self.max_statistics_size = Some(value); } - def_opt_field_setter!(bloom_filter_enabled, bool); - def_opt_field_setter!(bloom_filter_fpp, f64, 0.0, 1.0); - def_opt_field_setter!(bloom_filter_max_bytes, u32); - def_opt_field_setter!(bloom_filter_ndv, u64); + /// If `value` is `true`, sets bloom filter properties to default values if not previously set, + /// otherwise it is a no-op. + /// If `value` is `false`, resets bloom filter properties to `None`. 
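For orientation, here is a minimal sketch (editorial, not part of the patch) of how the global and per-column bloom filter setters introduced above combine; the column names and the ndv override are illustrative, and the expected values follow the defaults defined in this patch (fpp = 0.05, ndv = 1_000_000):

```rust
use parquet::file::properties::{BloomFilterProperties, WriterProperties};
use parquet::schema::types::ColumnPath;

fn main() {
    // Enable bloom filters for every column, then override ndv for one column.
    let props = WriterProperties::builder()
        .set_bloom_filter_enabled(true)
        .set_column_bloom_filter_ndv(ColumnPath::from("id"), 10_000)
        .build();

    // Columns without an override fall back to the global default.
    assert_eq!(
        props.bloom_filter_enabled(&ColumnPath::from("name")),
        Some(BloomFilterProperties { fpp: 0.05, ndv: 1_000_000 })
    );

    // The per-column setting takes precedence over the global one.
    assert_eq!(
        props.bloom_filter_enabled(&ColumnPath::from("id")),
        Some(BloomFilterProperties { fpp: 0.05, ndv: 10_000 })
    );
}
```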
+ fn set_bloom_filter_enabled(&mut self, value: bool) { + if value { + self.bloom_filter_enabled = self + .bloom_filter_enabled() + .or_else(|| Some(Default::default())); + } else { + self.bloom_filter_enabled = None; + } + } + + /// Sets the false positive probability for bloom filter for this column, and implicitly enables + /// bloom filter if not previously enabled. If the `value` is not between 0 and 1 exclusive, it is + /// discarded as no-op. + fn set_bloom_filter_fpp(&mut self, value: f64) { + if (0.0..1.0).contains(&value) { + self.bloom_filter_enabled = self + .bloom_filter_enabled() + .or_else(|| Some(Default::default())) + .map(|BloomFilterProperties { ndv, .. }| BloomFilterProperties { + ndv, + fpp: value, + }); + } + } + + /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly + /// enables bloom filter if not previously enabled. + fn set_bloom_filter_ndv(&mut self, value: u64) { + self.bloom_filter_enabled = self + .bloom_filter_enabled() + .or_else(|| Some(Default::default())) + .map(|BloomFilterProperties { fpp, .. }| BloomFilterProperties { + ndv: value, + fpp, + }); + } /// Returns optional encoding for this column. fn encoding(&self) -> Option { @@ -682,10 +736,10 @@ impl ColumnProperties { self.max_statistics_size } - def_opt_field_getter!(bloom_filter_enabled, bool); - def_opt_field_getter!(bloom_filter_fpp, f64); - def_opt_field_getter!(bloom_filter_max_bytes, u32); - def_opt_field_getter!(bloom_filter_ndv, u64); + /// Returns bloom filter properties if set. + fn bloom_filter_enabled(&self) -> Option { + self.bloom_filter_enabled.clone() + } } /// Reference counted reader properties. @@ -812,13 +866,9 @@ mod tests { props.max_statistics_size(&ColumnPath::from("col")), DEFAULT_MAX_STATISTICS_SIZE ); - assert!(!props.bloom_filter_enabled(&ColumnPath::from("col"))); - assert_eq!(props.bloom_filter_fpp(&ColumnPath::from("col")), 0.01); - assert_eq!(props.bloom_filter_ndv(&ColumnPath::from("col")), None); - assert_eq!( - props.bloom_filter_max_bytes(&ColumnPath::from("col")), - 1024 * 1024 - ); + assert!(props + .bloom_filter_enabled(&ColumnPath::from("col")) + .is_none()); } #[test] @@ -903,9 +953,8 @@ mod tests { ) .set_column_max_statistics_size(ColumnPath::from("col"), 123) .set_column_bloom_filter_enabled(ColumnPath::from("col"), true) - .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100) + .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64) .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1) - .set_column_bloom_filter_max_bytes(ColumnPath::from("col"), 1000) .build(); assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0); @@ -947,6 +996,10 @@ mod tests { EnabledStatistics::Chunk ); assert_eq!(props.max_statistics_size(&ColumnPath::from("col")), 123); + assert_eq!( + props.bloom_filter_enabled(&ColumnPath::from("col")), + Some(BloomFilterProperties { fpp: 0.1, ndv: 100 }) + ); } #[test] @@ -954,6 +1007,7 @@ mod tests { let props = WriterProperties::builder() .set_encoding(Encoding::DELTA_BINARY_PACKED) .set_compression(Compression::GZIP) + .set_bloom_filter_enabled(true) .set_column_encoding(ColumnPath::from("col"), Encoding::RLE) .build(); @@ -969,6 +1023,43 @@ mod tests { props.dictionary_enabled(&ColumnPath::from("col")), DEFAULT_DICTIONARY_ENABLED ); + assert_eq!( + props.bloom_filter_enabled(&ColumnPath::from("col")), + Some(BloomFilterProperties { + fpp: 0.05, + ndv: 1_000_000_u64 + }) + ); + } + + #[test] + fn test_writer_properties_bloom_filter_ndv_fpp_set() { + assert_eq!( + 
WriterProperties::builder() + .build() + .bloom_filter_enabled(&ColumnPath::from("col")), + None + ); + assert_eq!( + WriterProperties::builder() + .set_bloom_filter_ndv(100) + .build() + .bloom_filter_enabled(&ColumnPath::from("col")), + Some(BloomFilterProperties { + fpp: 0.05, + ndv: 100 + }) + ); + assert_eq!( + WriterProperties::builder() + .set_bloom_filter_fpp(0.1) + .build() + .bloom_filter_enabled(&ColumnPath::from("col")), + Some(BloomFilterProperties { + fpp: 0.1, + ndv: 1_000_000_u64 + }) + ); } #[test] From 4b9e3fee2878401b141c17a7ac3767cc3fa6c06f Mon Sep 17 00:00:00 2001 From: "xudong.w" Date: Thu, 24 Nov 2022 19:33:39 +0800 Subject: [PATCH 0313/1411] Add read parquet examples (#3170) * Add read parquet examples * address comments * add real row filter * Update parquet/examples/async_read_parquet.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- parquet/Cargo.toml | 10 ++++ parquet/examples/async_read_parquet.rs | 66 ++++++++++++++++++++++++++ parquet/examples/read_parquet.rs | 43 +++++++++++++++++ 3 files changed, 119 insertions(+) create mode 100644 parquet/examples/async_read_parquet.rs create mode 100644 parquet/examples/read_parquet.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 7a150c94963d..73c778c4a851 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -92,6 +92,16 @@ experimental = [] # Enable async APIs async = ["futures", "tokio"] +[[example]] +name = "read_parquet" +required-features = ["arrow"] +path = "./examples/read_parquet.rs" + +[[example]] +name = "async_read_parquet" +required-features = ["arrow", "async"] +path = "./examples/async_read_parquet.rs" + [[test]] name = "arrow_writer_layout" required-features = ["arrow"] diff --git a/parquet/examples/async_read_parquet.rs b/parquet/examples/async_read_parquet.rs new file mode 100644 index 000000000000..9b4b6d4ffac6 --- /dev/null +++ b/parquet/examples/async_read_parquet.rs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::util::pretty::print_batches; +use futures::TryStreamExt; +use parquet::arrow::arrow_reader::{ArrowPredicateFn, RowFilter}; +use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask}; +use parquet::errors::Result; +use std::time::SystemTime; +use tokio::fs::File; + +#[tokio::main(flavor = "current_thread")] +async fn main() -> Result<()> { + // Create parquet file that will be read. + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{}/alltypes_plain.parquet", testdata); + let file = File::open(path).await.unwrap(); + + // Create a async parquet reader builder with batch_size. 
+ // batch_size is the number of rows to read up to buffer once from pages, defaults to 1024 + let mut builder = ParquetRecordBatchStreamBuilder::new(file) + .await + .unwrap() + .with_batch_size(8192); + + let file_metadata = builder.metadata().file_metadata().clone(); + let mask = ProjectionMask::roots(file_metadata.schema_descr(), [0, 1, 2]); + // Set projection mask to read only root columns 1 and 2. + builder = builder.with_projection(mask); + + // Highlight: set `RowFilter`, it'll push down filter predicates to skip IO and decode. + // For more specific usage: please refer to https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/src/physical_plan/file_format/parquet/row_filter.rs. + let filter = ArrowPredicateFn::new( + ProjectionMask::roots(file_metadata.schema_descr(), [0]), + |record_batch| arrow::compute::eq_dyn_scalar(record_batch.column(0), 1), + ); + let row_filter = RowFilter::new(vec![Box::new(filter)]); + builder = builder.with_row_filter(row_filter); + + // Build a async parquet reader. + let stream = builder.build().unwrap(); + + let start = SystemTime::now(); + + let result = stream.try_collect::>().await?; + + println!("took: {} ms", start.elapsed().unwrap().as_millis()); + + print_batches(&result).unwrap(); + + Ok(()) +} diff --git a/parquet/examples/read_parquet.rs b/parquet/examples/read_parquet.rs new file mode 100644 index 000000000000..3d6d70aeed20 --- /dev/null +++ b/parquet/examples/read_parquet.rs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::util::pretty::print_batches; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use parquet::errors::Result; +use std::fs::File; + +fn main() -> Result<()> { + // Create parquet file that will be read. + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{}/alltypes_plain.parquet", testdata); + let file = File::open(path).unwrap(); + + // Create a sync parquet reader with batch_size. + // batch_size is the number of rows to read up to buffer once from pages, defaults to 1024 + let parquet_reader = ParquetRecordBatchReaderBuilder::try_new(file)? 
+ .with_batch_size(8192) + .build()?; + + let mut batches = Vec::new(); + + for batch in parquet_reader { + batches.push(batch?); + } + + print_batches(&batches).unwrap(); + Ok(()) +} From eefbdce229eb355bbdfd5ccbc24247be882fda15 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 24 Nov 2022 13:40:28 +0000 Subject: [PATCH 0314/1411] Bloom filter config tweaks (#3023) (#3175) * Bloom filter config tweaks (#3023) * Further tweaks --- parquet/src/column/writer/encoder.rs | 16 ++---- parquet/src/file/properties.rs | 86 ++++++++++++++-------------- 2 files changed, 46 insertions(+), 56 deletions(-) diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index 0d0716d7a7d5..c343f1d6c824 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -25,7 +25,6 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::DataType; use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder}; use crate::errors::{ParquetError, Result}; -use crate::file::properties::BloomFilterProperties; use crate::file::properties::{EnabledStatistics, WriterProperties}; use crate::schema::types::{ColumnDescPtr, ColumnDescriptor}; use crate::util::memory::ByteBufferPtr; @@ -194,17 +193,10 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { let statistics_enabled = props.statistics_enabled(descr.path()); - let bloom_filter_enabled = props.bloom_filter_enabled(descr.path()); - let bloom_filter = - if let Some(BloomFilterProperties { ndv, fpp }) = bloom_filter_enabled { - Sbbf::new_with_ndv_fpp(ndv, fpp) - .map_err(|e| { - eprintln!("invalid bloom filter properties: {}", e); - }) - .ok() - } else { - None - }; + let bloom_filter = props + .bloom_filter_properties(descr.path()) + .map(|props| Sbbf::new_with_ndv_fpp(props.ndv, props.fpp)) + .transpose()?; Ok(Self { encoder, diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 6d30be2e4baf..c8083fcf30fa 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -260,16 +260,17 @@ impl WriterProperties { .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE) } - /// Returns whether bloom filter is enabled for a given column. Bloom filter can be enabled over - /// all or for a specific column, and is by default set to be disabled. - pub fn bloom_filter_enabled( + /// Returns the [`BloomFilterProperties`] for the given column + /// + /// Returns `None` if bloom filter is disabled + pub fn bloom_filter_properties( &self, col: &ColumnPath, - ) -> Option { + ) -> Option<&BloomFilterProperties> { self.column_properties .get(col) - .and_then(|c| c.bloom_filter_enabled()) - .or_else(|| self.default_column_properties.bloom_filter_enabled()) + .and_then(|c| c.bloom_filter_properties()) + .or_else(|| self.default_column_properties.bloom_filter_properties()) } } @@ -628,7 +629,7 @@ struct ColumnProperties { statistics_enabled: Option, max_statistics_size: Option, /// bloom filter related properties - bloom_filter_enabled: Option, + bloom_filter_properies: Option, } impl ColumnProperties { @@ -672,40 +673,37 @@ impl ColumnProperties { /// otherwise it is a no-op. /// If `value` is `false`, resets bloom filter properties to `None`. 
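To summarize the behavioral changes this tweak patch makes, a condensed editorial sketch (the column name is illustrative) of the renamed accessor and the implicit-enable behavior shown in the hunks below:

```rust
use parquet::file::properties::{BloomFilterProperties, WriterProperties};
use parquet::schema::types::ColumnPath;

fn main() {
    // Setting fpp (or ndv) still implicitly enables the filter, but fpp must
    // now lie strictly between 0 and 1, otherwise the setter panics.
    let props = WriterProperties::builder()
        .set_bloom_filter_fpp(0.1)
        .build();

    // The getter is renamed to `bloom_filter_properties` and now returns a
    // reference rather than a cloned value.
    assert_eq!(
        props.bloom_filter_properties(&ColumnPath::from("id")),
        Some(&BloomFilterProperties { fpp: 0.1, ndv: 1_000_000 })
    );
}
```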
fn set_bloom_filter_enabled(&mut self, value: bool) { - if value { - self.bloom_filter_enabled = self - .bloom_filter_enabled() - .or_else(|| Some(Default::default())); - } else { - self.bloom_filter_enabled = None; + if value && self.bloom_filter_properies.is_none() { + self.bloom_filter_properies = Some(Default::default()) + } else if !value { + self.bloom_filter_properies = None } } /// Sets the false positive probability for bloom filter for this column, and implicitly enables - /// bloom filter if not previously enabled. If the `value` is not between 0 and 1 exclusive, it is - /// discarded as no-op. + /// bloom filter if not previously enabled. + /// + /// # Panics + /// + /// Panics if the `value` is not between 0 and 1 exclusive fn set_bloom_filter_fpp(&mut self, value: f64) { - if (0.0..1.0).contains(&value) { - self.bloom_filter_enabled = self - .bloom_filter_enabled() - .or_else(|| Some(Default::default())) - .map(|BloomFilterProperties { ndv, .. }| BloomFilterProperties { - ndv, - fpp: value, - }); - } + assert!( + value > 0. && value < 1.0, + "fpp must be between 0 and 1 exclusive, got {}", + value + ); + + self.bloom_filter_properies + .get_or_insert_with(Default::default) + .fpp = value; } /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly /// enables bloom filter if not previously enabled. fn set_bloom_filter_ndv(&mut self, value: u64) { - self.bloom_filter_enabled = self - .bloom_filter_enabled() - .or_else(|| Some(Default::default())) - .map(|BloomFilterProperties { fpp, .. }| BloomFilterProperties { - ndv: value, - fpp, - }); + self.bloom_filter_properies + .get_or_insert_with(Default::default) + .ndv = value; } /// Returns optional encoding for this column. @@ -736,9 +734,9 @@ impl ColumnProperties { self.max_statistics_size } - /// Returns bloom filter properties if set. 
- fn bloom_filter_enabled(&self) -> Option { - self.bloom_filter_enabled.clone() + /// Returns the bloom filter properties, or `None` if not enabled + fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> { + self.bloom_filter_properies.as_ref() } } @@ -867,7 +865,7 @@ mod tests { DEFAULT_MAX_STATISTICS_SIZE ); assert!(props - .bloom_filter_enabled(&ColumnPath::from("col")) + .bloom_filter_properties(&ColumnPath::from("col")) .is_none()); } @@ -997,8 +995,8 @@ mod tests { ); assert_eq!(props.max_statistics_size(&ColumnPath::from("col")), 123); assert_eq!( - props.bloom_filter_enabled(&ColumnPath::from("col")), - Some(BloomFilterProperties { fpp: 0.1, ndv: 100 }) + props.bloom_filter_properties(&ColumnPath::from("col")), + Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 }) ); } @@ -1024,8 +1022,8 @@ mod tests { DEFAULT_DICTIONARY_ENABLED ); assert_eq!( - props.bloom_filter_enabled(&ColumnPath::from("col")), - Some(BloomFilterProperties { + props.bloom_filter_properties(&ColumnPath::from("col")), + Some(&BloomFilterProperties { fpp: 0.05, ndv: 1_000_000_u64 }) @@ -1037,15 +1035,15 @@ mod tests { assert_eq!( WriterProperties::builder() .build() - .bloom_filter_enabled(&ColumnPath::from("col")), + .bloom_filter_properties(&ColumnPath::from("col")), None ); assert_eq!( WriterProperties::builder() .set_bloom_filter_ndv(100) .build() - .bloom_filter_enabled(&ColumnPath::from("col")), - Some(BloomFilterProperties { + .bloom_filter_properties(&ColumnPath::from("col")), + Some(&BloomFilterProperties { fpp: 0.05, ndv: 100 }) @@ -1054,8 +1052,8 @@ mod tests { WriterProperties::builder() .set_bloom_filter_fpp(0.1) .build() - .bloom_filter_enabled(&ColumnPath::from("col")), - Some(BloomFilterProperties { + .bloom_filter_properties(&ColumnPath::from("col")), + Some(&BloomFilterProperties { fpp: 0.1, ndv: 1_000_000_u64 }) From 3e2d39ed89ba786dcb88ddbb2a73253cdf680903 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 24 Nov 2022 13:52:40 +0000 Subject: [PATCH 0315/1411] Update zstd requirement from 0.11.1 to 0.12.0 (#3178) Updates the requirements on [zstd](https://github.com/gyscos/zstd-rs) to permit the latest version. - [Release notes](https://github.com/gyscos/zstd-rs/releases) - [Commits](https://github.com/gyscos/zstd-rs/commits) --- updated-dependencies: - dependency-name: zstd dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-ipc/Cargo.toml | 2 +- parquet/Cargo.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index e3205e7a8153..838cde8fa252 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -45,7 +45,7 @@ arrow-data = { version = "27.0.0", path = "../arrow-data" } arrow-schema = { version = "27.0.0", path = "../arrow-schema" } flatbuffers = { version = "22.9.2", default-features = false, features = ["thiserror"] } lz4 = { version = "1.23", default-features = false, optional = true } -zstd = { version = "0.11.1", default-features = false, optional = true } +zstd = { version = "0.12.0", default-features = false, optional = true } [dev-dependencies] tempfile = "3.3" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 73c778c4a851..88f6eff23053 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -46,7 +46,7 @@ snap = { version = "1.0", default-features = false, optional = true } brotli = { version = "3.3", default-features = false, features = ["std"], optional = true } flate2 = { version = "1.0", default-features = false, features = ["rust_backend"], optional = true } lz4 = { version = "1.23", default-features = false, optional = true } -zstd = { version = "0.11.1", optional = true, default-features = false } +zstd = { version = "0.12.0", optional = true, default-features = false } chrono = { version = "0.4.23", default-features = false, features = ["alloc"] } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } @@ -68,7 +68,7 @@ tempfile = { version = "3.0", default-features = false } brotli = { version = "3.3", default-features = false, features = ["std"] } flate2 = { version = "1.0", default-features = false, features = ["rust_backend"] } lz4 = { version = "1.23", default-features = false } -zstd = { version = "0.11", default-features = false } +zstd = { version = "0.12", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } arrow = { path = "../arrow", version = "27.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } From 22deab00696f4fac2a587c59bf9b0ce2c2ec3ac6 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Thu, 24 Nov 2022 19:18:49 +0100 Subject: [PATCH 0316/1411] Move decimal constants from `arrow-data` to `arrow-schema` crate (#3177) * Move decimal constants from `arrow-data` to `arrow-schema` crate * Remove `arrow-schema` crate prefix from intra doc links * Avoid breaking change and move non-schema constants back to `arrow-data` * Update arrow/src/row/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-array/src/types.rs | 6 +++--- arrow-data/src/decimal.rs | 29 +++++++++-------------------- arrow-schema/src/datatype.rs | 16 ++++++++++++++++ 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 40d262e8ed72..13194d61f015 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -21,12 +21,12 @@ use crate::array::ArrowPrimitiveType; use crate::delta::shift_months; use crate::OffsetSizeTrait; use 
arrow_buffer::i256; -use arrow_data::decimal::{ - validate_decimal256_precision, validate_decimal_precision, DECIMAL128_MAX_PRECISION, +use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision}; +use arrow_schema::{ + ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, }; -use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; use chrono::{Duration, NaiveDate}; use half::f16; use std::marker::PhantomData; diff --git a/arrow-data/src/decimal.rs b/arrow-data/src/decimal.rs index 7011c40858c2..9367d4ec2546 100644 --- a/arrow-data/src/decimal.rs +++ b/arrow-data/src/decimal.rs @@ -18,6 +18,11 @@ use arrow_buffer::i256; use arrow_schema::ArrowError; +pub use arrow_schema::{ + DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, + DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, +}; + // MAX decimal256 value of little-endian format for each precision. // Each element is the max value of signed 256-bit integer for the specified precision which // is encoded to the 32-byte width format of little-endian. @@ -638,8 +643,8 @@ pub(crate) const MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [i256; 76] = [ ]), ]; -/// `MAX_DECIMAL_FOR_EACH_PRECISION[p]` holds the maximum `i128` value -/// that can be stored in [arrow_schema::DataType::Decimal128] value of precision `p` +/// `MAX_DECIMAL_FOR_EACH_PRECISION[p]` holds the maximum `i128` value that can +/// be stored in [arrow_schema::DataType::Decimal128] value of precision `p` pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ 9, 99, @@ -681,8 +686,8 @@ pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ 99999999999999999999999999999999999999, ]; -/// `MIN_DECIMAL_FOR_EACH_PRECISION[p]` holds the minimum `i128` value -/// that can be stored in a [arrow_schema::DataType::Decimal128] value of precision `p` +/// `MIN_DECIMAL_FOR_EACH_PRECISION[p]` holds the minimum `i128` value that can +/// be stored in a [arrow_schema::DataType::Decimal128] value of precision `p` pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ -9, -99, @@ -724,22 +729,6 @@ pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ -99999999999999999999999999999999999999, ]; -/// The maximum precision for [arrow_schema::DataType::Decimal128] values -pub const DECIMAL128_MAX_PRECISION: u8 = 38; - -/// The maximum scale for [arrow_schema::DataType::Decimal128] values -pub const DECIMAL128_MAX_SCALE: i8 = 38; - -/// The maximum precision for [arrow_schema::DataType::Decimal256] values -pub const DECIMAL256_MAX_PRECISION: u8 = 76; - -/// The maximum scale for [arrow_schema::DataType::Decimal256] values -pub const DECIMAL256_MAX_SCALE: i8 = 76; - -/// The default scale for [arrow_schema::DataType::Decimal128] and -/// [arrow_schema::DataType::Decimal256] values -pub const DECIMAL_DEFAULT_SCALE: i8 = 10; - /// Validates that the specified `i128` value can be properly /// interpreted as a Decimal number with precision `precision` #[inline] diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index f74e2a24b04f..6e0f626ef94d 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -412,6 +412,22 @@ impl DataType { } } +/// The maximum precision for [DataType::Decimal128] values +pub const DECIMAL128_MAX_PRECISION: u8 = 38; + +/// The maximum scale for [DataType::Decimal128] values +pub const DECIMAL128_MAX_SCALE: i8 = 38; + +/// The maximum precision for 
[DataType::Decimal256] values +pub const DECIMAL256_MAX_PRECISION: u8 = 76; + +/// The maximum scale for [DataType::Decimal256] values +pub const DECIMAL256_MAX_SCALE: i8 = 76; + +/// The default scale for [DataType::Decimal128] and [DataType::Decimal256] +/// values +pub const DECIMAL_DEFAULT_SCALE: i8 = 10; + #[cfg(test)] mod tests { use super::*; From 007fb4c56ffce7f8b6be7928196da79ee5eff75a Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Fri, 25 Nov 2022 03:20:35 +0800 Subject: [PATCH 0317/1411] add an integration with pytest against pyspark (#3176) --- .github/workflows/parquet.yml | 34 ++++++- .gitignore | 3 + parquet/README.md | 1 - parquet/pytest/pyspark_integration_test.py | 65 ++++++++++++ parquet/pytest/requirements.in | 20 ++++ parquet/pytest/requirements.txt | 102 +++++++++++++++++++ parquet/src/bin/parquet-show-bloom-filter.rs | 19 +++- 7 files changed, 238 insertions(+), 6 deletions(-) create mode 100755 parquet/pytest/pyspark_integration_test.py create mode 100644 parquet/pytest/requirements.in create mode 100644 parquet/pytest/requirements.txt diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 5b0cc87440e9..c5c7aac053f0 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -19,7 +19,6 @@ # tests for parquet crate name: "parquet" - # trigger for all PRs that touch certain files and changes to master on: push: @@ -58,7 +57,6 @@ jobs: - name: Test --all-features run: cargo test -p parquet --all-features - # test compilation linux-features: name: Check Compilation @@ -120,6 +118,38 @@ jobs: - name: Build wasm32-wasi run: cargo build -p parquet --no-default-features --features cli,snap,flate2,brotli --target wasm32-wasi + pyspark-integration-test: + name: PySpark Integration Test + runs-on: ubuntu-latest + strategy: + matrix: + rust: [stable] + steps: + - uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + cache: "pip" + - name: Install Python dependencies + run: | + cd parquet/pytest + pip install -r requirements.txt + - name: Black check the test files + run: | + cd parquet/pytest + black --check *.py --verbose + - name: Setup Rust toolchain + run: | + rustup toolchain install ${{ matrix.rust }} + rustup default ${{ matrix.rust }} + - name: Install binary for checking + run: cargo install --path parquet --bin parquet-show-bloom-filter --features=arrow,cli + - name: Run pytest + run: | + cd parquet/pytest + pytest -v + clippy: name: Clippy runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index b8506ea06cb0..52ad19cb077d 100644 --- a/.gitignore +++ b/.gitignore @@ -92,3 +92,6 @@ $RECYCLE.BIN/ # Windows shortcuts *.lnk +# Python virtual env in parquet crate +parquet/pytest/venv/ +__pycache__/ diff --git a/parquet/README.md b/parquet/README.md index c9245b082119..d904fc64e744 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -41,7 +41,6 @@ However, for historical reasons, this crate uses versions with major numbers gre The `parquet` crate provides the following features which may be enabled in your `Cargo.toml`: - `arrow` (default) - support for reading / writing [`arrow`](https://crates.io/crates/arrow) arrays to / from parquet -- `bloom` (default) - support for [split block bloom filter](https://github.com/apache/parquet-format/blob/master/BloomFilter.md) for reading from / writing to parquet - `async` - support `async` APIs for reading parquet - `json` - support for reading / writing `json` data to / from parquet - `brotli` (default) - 
support for parquet using `brotli` compression diff --git a/parquet/pytest/pyspark_integration_test.py b/parquet/pytest/pyspark_integration_test.py new file mode 100755 index 000000000000..0a0b881e3e9b --- /dev/null +++ b/parquet/pytest/pyspark_integration_test.py @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pyspark.sql +import tempfile +import subprocess +import pathlib + + +def create_data_and_df(): + spark = pyspark.sql.SparkSession.builder.getOrCreate() + spark.conf.set("parquet.bloom.filter.enabled", True) + spark.conf.set("parquet.bloom.filter.expected.ndv", 10) + spark.conf.set("parquet.bloom.filter.max.bytes", 32) + data = [(f"id-{i % 10}", f"name-{i%10}") for i in range(100)] + df = spark.createDataFrame(data, ["id", "name"]).repartition(1) + return data, df + + +def get_expected_output(data): + expected = ["Row group #0", "=" * 80] + for v in data: + expected.append(f"Value {v[0]} is present in bloom filter") + for v in data: + expected.append(f"Value {v[1]} is absent in bloom filter") + expected = "\n".join(expected) + "\n" + return expected.encode("utf-8") + + +def get_cli_output(output_dir, data, col_name="id"): + # take the first (and only) parquet file + parquet_file = sorted(pathlib.Path(output_dir).glob("*.parquet"))[0] + args = [ + "parquet-show-bloom-filter", + "--file-name", + parquet_file, + "--column", + col_name, + ] + for v in data: + args.extend(["--values", v[0]]) + for v in data: + args.extend(["--values", v[1]]) + return subprocess.check_output(args) + + +def test_pyspark_bloom_filter(): + data, df = create_data_and_df() + with tempfile.TemporaryDirectory() as output_dir: + df.write.parquet(output_dir, mode="overwrite") + cli_output = get_cli_output(output_dir, data) + assert cli_output == get_expected_output(data) diff --git a/parquet/pytest/requirements.in b/parquet/pytest/requirements.in new file mode 100644 index 000000000000..a0b30b867625 --- /dev/null +++ b/parquet/pytest/requirements.in @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+pytest +pyspark +black + diff --git a/parquet/pytest/requirements.txt b/parquet/pytest/requirements.txt new file mode 100644 index 000000000000..fb6f8fb6dd96 --- /dev/null +++ b/parquet/pytest/requirements.txt @@ -0,0 +1,102 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This file is autogenerated by pip-compile with python 3.10 +# To update, run: +# +# pip-compile --generate-hashes --resolver=backtracking +# +attrs==22.1.0 \ + --hash=sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6 \ + --hash=sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c + # via pytest +black==22.10.0 \ + --hash=sha256:14ff67aec0a47c424bc99b71005202045dc09270da44a27848d534600ac64fc7 \ + --hash=sha256:197df8509263b0b8614e1df1756b1dd41be6738eed2ba9e9769f3880c2b9d7b6 \ + --hash=sha256:1e464456d24e23d11fced2bc8c47ef66d471f845c7b7a42f3bd77bf3d1789650 \ + --hash=sha256:2039230db3c6c639bd84efe3292ec7b06e9214a2992cd9beb293d639c6402edb \ + --hash=sha256:21199526696b8f09c3997e2b4db8d0b108d801a348414264d2eb8eb2532e540d \ + --hash=sha256:2644b5d63633702bc2c5f3754b1b475378fbbfb481f62319388235d0cd104c2d \ + --hash=sha256:432247333090c8c5366e69627ccb363bc58514ae3e63f7fc75c54b1ea80fa7de \ + --hash=sha256:444ebfb4e441254e87bad00c661fe32df9969b2bf224373a448d8aca2132b395 \ + --hash=sha256:5b9b29da4f564ba8787c119f37d174f2b69cdfdf9015b7d8c5c16121ddc054ae \ + --hash=sha256:5cc42ca67989e9c3cf859e84c2bf014f6633db63d1cbdf8fdb666dcd9e77e3fa \ + --hash=sha256:5d8f74030e67087b219b032aa33a919fae8806d49c867846bfacde57f43972ef \ + --hash=sha256:72ef3925f30e12a184889aac03d77d031056860ccae8a1e519f6cbb742736383 \ + --hash=sha256:819dc789f4498ecc91438a7de64427c73b45035e2e3680c92e18795a839ebb66 \ + --hash=sha256:915ace4ff03fdfff953962fa672d44be269deb2eaf88499a0f8805221bc68c87 \ + --hash=sha256:9311e99228ae10023300ecac05be5a296f60d2fd10fff31cf5c1fa4ca4b1988d \ + --hash=sha256:974308c58d057a651d182208a484ce80a26dac0caef2895836a92dd6ebd725e0 \ + --hash=sha256:b8b49776299fece66bffaafe357d929ca9451450f5466e997a7285ab0fe28e3b \ + --hash=sha256:c957b2b4ea88587b46cf49d1dc17681c1e672864fd7af32fc1e9664d572b3458 \ + --hash=sha256:e41a86c6c650bcecc6633ee3180d80a025db041a8e2398dcc059b3afa8382cd4 \ + --hash=sha256:f513588da599943e0cde4e32cc9879e825d58720d6557062d1098c5ad80080e1 \ + --hash=sha256:fba8a281e570adafb79f7755ac8721b6cf1bbf691186a287e990c7929c7692ff + # via -r requirements.in +click==8.1.3 \ + --hash=sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e \ + --hash=sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48 + # via black +exceptiongroup==1.0.4 \ + --hash=sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828 \ + --hash=sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec + # via pytest 
+iniconfig==1.1.1 \ + --hash=sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3 \ + --hash=sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32 + # via pytest +mypy-extensions==0.4.3 \ + --hash=sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d \ + --hash=sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8 + # via black +packaging==21.3 \ + --hash=sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb \ + --hash=sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522 + # via pytest +pathspec==0.10.2 \ + --hash=sha256:88c2606f2c1e818b978540f73ecc908e13999c6c3a383daf3705652ae79807a5 \ + --hash=sha256:8f6bf73e5758fd365ef5d58ce09ac7c27d2833a8d7da51712eac6e27e35141b0 + # via black +platformdirs==2.5.4 \ + --hash=sha256:1006647646d80f16130f052404c6b901e80ee4ed6bef6792e1f238a8969106f7 \ + --hash=sha256:af0276409f9a02373d540bf8480021a048711d572745aef4b7842dad245eba10 + # via black +pluggy==1.0.0 \ + --hash=sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159 \ + --hash=sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3 + # via pytest +py4j==0.10.9.5 \ + --hash=sha256:276a4a3c5a2154df1860ef3303a927460e02e97b047dc0a47c1c3fb8cce34db6 \ + --hash=sha256:52d171a6a2b031d8a5d1de6efe451cf4f5baff1a2819aabc3741c8406539ba04 + # via pyspark +pyparsing==3.0.9 \ + --hash=sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb \ + --hash=sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc + # via packaging +pyspark==3.3.1 \ + --hash=sha256:e99fa7de92be406884bfd831c32b9306a3a99de44cfc39a2eefb6ed07445d5fa + # via -r requirements.in +pytest==7.2.0 \ + --hash=sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71 \ + --hash=sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59 + # via -r requirements.in +tomli==2.0.1 \ + --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ + --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f + # via + # black + # pytest diff --git a/parquet/src/bin/parquet-show-bloom-filter.rs b/parquet/src/bin/parquet-show-bloom-filter.rs index 55ecb2abf134..f9462327f831 100644 --- a/parquet/src/bin/parquet-show-bloom-filter.rs +++ b/parquet/src/bin/parquet-show-bloom-filter.rs @@ -34,7 +34,11 @@ //! 
``` use clap::Parser; -use parquet::file::reader::{FileReader, SerializedFileReader}; +use parquet::file::{ + properties::ReaderProperties, + reader::{FileReader, SerializedFileReader}, + serialized_reader::ReadOptionsBuilder, +}; use std::{fs::File, path::Path}; #[derive(Debug, Parser)] @@ -63,8 +67,17 @@ fn main() { let path = Path::new(&file_name); let file = File::open(path).expect("Unable to open file"); - let file_reader = - SerializedFileReader::new(file).expect("Unable to open file as Parquet"); + let file_reader = SerializedFileReader::new_with_options( + file, + ReadOptionsBuilder::new() + .with_reader_properties( + ReaderProperties::builder() + .set_read_bloom_filter(true) + .build(), + ) + .build(), + ) + .expect("Unable to open file as Parquet"); let metadata = file_reader.metadata(); for (ri, row_group) in metadata.row_groups().iter().enumerate() { println!("Row group #{}", ri); From 2460c7b5da2ba22c7fb0ef0df6ac84984e3aed12 Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Fri, 25 Nov 2022 00:51:11 +0530 Subject: [PATCH 0318/1411] Doc improvements (#3155) * Improving arrow-json docs * Improving arrow-array docs * Fix tests * Fix typos * Incorporate review comments * Improve doc for fixed_size_list_builder * Fix doc comments --- arrow-array/src/arithmetic.rs | 22 +++ arrow-array/src/array/boolean_array.rs | 2 +- arrow-array/src/array/list_array.rs | 2 + arrow-array/src/array/mod.rs | 1 + arrow-array/src/array/primitive_array.rs | 30 +++- .../src/builder/boolean_buffer_builder.rs | 13 ++ arrow-array/src/builder/buffer_builder.rs | 32 ++++ .../src/builder/fixed_size_binary_builder.rs | 16 ++ .../src/builder/fixed_size_list_builder.rs | 39 +++++ arrow-array/src/builder/map_builder.rs | 47 ++++++ arrow-array/src/builder/mod.rs | 6 + arrow-array/src/builder/primitive_builder.rs | 35 ++++ .../builder/primitive_dictionary_builder.rs | 1 + .../src/builder/string_dictionary_builder.rs | 1 + arrow-array/src/builder/struct_builder.rs | 2 + arrow-array/src/iterator.rs | 5 + arrow-array/src/lib.rs | 3 + arrow-array/src/record_batch.rs | 1 + arrow-array/src/types.rs | 154 ++++++++++++++---- arrow-json/src/lib.rs | 4 + arrow-json/src/reader.rs | 2 + 21 files changed, 387 insertions(+), 31 deletions(-) diff --git a/arrow-array/src/arithmetic.rs b/arrow-array/src/arithmetic.rs index 566f3742e93d..dcb6a1be7241 100644 --- a/arrow-array/src/arithmetic.rs +++ b/arrow-array/src/arithmetic.rs @@ -45,60 +45,82 @@ pub trait ArrowNativeTypeOp: ArrowNativeType { /// The multiplicative identity const ONE: Self; + /// Checked addition operation fn add_checked(self, rhs: Self) -> Result; + /// Wrapping addition operation fn add_wrapping(self, rhs: Self) -> Self; + /// Checked subtraction operation fn sub_checked(self, rhs: Self) -> Result; + /// Wrapping subtraction operation fn sub_wrapping(self, rhs: Self) -> Self; + /// Checked multiplication operation fn mul_checked(self, rhs: Self) -> Result; + /// Wrapping multiplication operation fn mul_wrapping(self, rhs: Self) -> Self; + /// Checked division operation fn div_checked(self, rhs: Self) -> Result; + /// Wrapping division operation fn div_wrapping(self, rhs: Self) -> Self; + /// Checked remainder operation fn mod_checked(self, rhs: Self) -> Result; + /// Wrapping remainder operation fn mod_wrapping(self, rhs: Self) -> Self; + /// Checked negation operation fn neg_checked(self) -> Result; + /// Wrapping negation operation fn neg_wrapping(self) -> Self; + /// Checked exponentiation operation fn pow_checked(self, exp: u32) -> Result; + /// Wrapping 
exponentiation operation fn pow_wrapping(self, exp: u32) -> Self; + /// Returns true if zero else false fn is_zero(self) -> bool; + /// Compare operation fn compare(self, rhs: Self) -> Ordering; + /// Equality operation fn is_eq(self, rhs: Self) -> bool; + /// Not equal operation #[inline] fn is_ne(self, rhs: Self) -> bool { !self.is_eq(rhs) } + /// Less than operation #[inline] fn is_lt(self, rhs: Self) -> bool { self.compare(rhs).is_lt() } + /// Less than equals operation #[inline] fn is_le(self, rhs: Self) -> bool { self.compare(rhs).is_le() } + /// Greater than operation #[inline] fn is_gt(self, rhs: Self) -> bool { self.compare(rhs).is_gt() } + /// Greater than equals operation #[inline] fn is_ge(self, rhs: Self) -> bool { self.compare(rhs).is_ge() diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 31dde3a3dda7..83af9760da71 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -91,7 +91,7 @@ impl BooleanArray { self.data.is_empty() } - // Returns a new boolean array builder + /// Returns a new boolean array builder pub fn builder(capacity: usize) -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) } diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 54699749f2ff..204a36c32337 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -29,7 +29,9 @@ use std::any::Any; /// trait declaring an offset size, relevant for i32 vs i64 array types. pub trait OffsetSizeTrait: ArrowNativeType + std::ops::AddAssign + Integer { + /// True for 64 bit offset size and false for 32 bit offset size const IS_LARGE: bool; + /// Prefix for the offset size const PREFIX: &'static str; } diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 307753a7117e..5fc44d8965e4 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -382,6 +382,7 @@ impl<'a, T: Array> Array for &'a T { /// The value at null indexes is unspecified, and implementations must not rely on a specific /// value such as [`Default::default`] being returned, however, it must not be undefined pub trait ArrayAccessor: Array { + /// The Arrow type of the element being accessed. type Item: Send + Sync; /// Returns the element at index `i` diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index bd68b9698ce9..42d183238eac 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -165,21 +165,48 @@ pub type TimestampMicrosecondArray = PrimitiveArray; /// A primitive array where each element is of type `TimestampNanosecondType.` /// See examples for [`TimestampSecondArray.`](crate::array::TimestampSecondArray) pub type TimestampNanosecondArray = PrimitiveArray; + +// TODO: give examples for the below types + +/// A primitive array where each element is of 32-bit date type. pub type Date32Array = PrimitiveArray; +/// A primitive array where each element is of 64-bit date type. pub type Date64Array = PrimitiveArray; + +/// An array where each element is of 32-bit type representing time elapsed in seconds +/// since midnight. pub type Time32SecondArray = PrimitiveArray; +/// An array where each element is of 32-bit type representing time elapsed in milliseconds +/// since midnight. 
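The patch itself leaves a `TODO: give examples for the below types`; as a stopgap, a minimal editorial sketch of what two of these aliases hold (the values are illustrative):

```rust
use arrow_array::{Array, Date32Array, Time32SecondArray};

fn main() {
    // Date32 stores the number of days since the UNIX epoch (1970-01-01).
    let dates = Date32Array::from(vec![Some(0), None, Some(18_628)]);
    assert_eq!(dates.len(), 3);
    assert!(dates.is_null(1));

    // Time32 with second unit stores seconds elapsed since midnight.
    let times = Time32SecondArray::from(vec![0, 60, 3_600]);
    assert_eq!(times.value(2), 3_600);
}
```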
pub type Time32MillisecondArray = PrimitiveArray; +/// An array where each element is of 64-bit type representing time elapsed in microseconds +/// since midnight. pub type Time64MicrosecondArray = PrimitiveArray; +/// An array where each element is of 64-bit type representing time elapsed in nanoseconds +/// since midnight. pub type Time64NanosecondArray = PrimitiveArray; + +/// An array where each element is a “calendar” interval in months. pub type IntervalYearMonthArray = PrimitiveArray; +/// An array where each element is a “calendar” interval days and milliseconds. pub type IntervalDayTimeArray = PrimitiveArray; +/// An array where each element is a “calendar” interval in months, days, and nanoseconds. pub type IntervalMonthDayNanoArray = PrimitiveArray; + +/// An array where each element is an elapsed time type in seconds. pub type DurationSecondArray = PrimitiveArray; +/// An array where each element is an elapsed time type in milliseconds. pub type DurationMillisecondArray = PrimitiveArray; +/// An array where each element is an elapsed time type in microseconds. pub type DurationMicrosecondArray = PrimitiveArray; +/// An array where each element is an elapsed time type in nanoseconds. pub type DurationNanosecondArray = PrimitiveArray; +/// An array where each element is a 128-bits decimal with precision in [1, 38] and +/// scale in [-38, 38]. pub type Decimal128Array = PrimitiveArray; +/// An array where each element is a 256-bits decimal with precision in [1, 76] and +/// scale in [-76, 76]. pub type Decimal256Array = PrimitiveArray; /// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the @@ -256,7 +283,7 @@ impl PrimitiveArray { } } - // Returns a new primitive array builder + /// Returns a new primitive array builder pub fn builder(capacity: usize) -> PrimitiveBuilder { PrimitiveBuilder::::with_capacity(capacity) } @@ -749,6 +776,7 @@ impl<'a, T: ArrowPrimitiveType> PrimitiveArray { /// the type can be collected to `PrimitiveArray`. 
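A short editorial sketch of the `FromIterator` behavior the adapter documented above enables (values are illustrative):

```rust
use arrow_array::{Array, Int32Array};

fn main() {
    // Iterators of Option<T::Native> collect straight into a PrimitiveArray.
    let array: Int32Array = vec![Some(1), None, Some(3)].into_iter().collect();
    assert_eq!(array.len(), 3);
    assert!(array.is_null(1));
    assert_eq!(array.value(2), 3);

    // Plain native values work as well and yield an array with no nulls.
    let dense: Int32Array = (0..5).collect();
    assert_eq!(dense.null_count(), 0);
    assert_eq!(dense.value(4), 4);
}
```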
#[derive(Debug)] pub struct NativeAdapter { + /// Corresponding Rust native type if available pub native: Option, } diff --git a/arrow-array/src/builder/boolean_buffer_builder.rs b/arrow-array/src/builder/boolean_buffer_builder.rs index 2ab01ccfe40b..4f8638ee789c 100644 --- a/arrow-array/src/builder/boolean_buffer_builder.rs +++ b/arrow-array/src/builder/boolean_buffer_builder.rs @@ -19,6 +19,7 @@ use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::bit_mask; use std::ops::Range; +/// A builder for creating a boolean [`Buffer`] #[derive(Debug)] pub struct BooleanBufferBuilder { buffer: MutableBuffer, @@ -26,6 +27,7 @@ pub struct BooleanBufferBuilder { } impl BooleanBufferBuilder { + /// Creates a new `BooleanBufferBuilder` #[inline] pub fn new(capacity: usize) -> Self { let byte_capacity = bit_util::ceil(capacity, 8); @@ -33,16 +35,19 @@ impl BooleanBufferBuilder { Self { buffer, len: 0 } } + /// Creates a new `BooleanBufferBuilder` from [`MutableBuffer`] of `len` pub fn new_from_buffer(buffer: MutableBuffer, len: usize) -> Self { assert!(len <= buffer.len() * 8); Self { buffer, len } } + /// Returns the length of the buffer #[inline] pub fn len(&self) -> usize { self.len } + /// Sets a bit in the buffer at `index` #[inline] pub fn set_bit(&mut self, index: usize, v: bool) { if v { @@ -52,21 +57,25 @@ impl BooleanBufferBuilder { } } + /// Gets a bit in the buffer at `index` #[inline] pub fn get_bit(&self, index: usize) -> bool { bit_util::get_bit(self.buffer.as_slice(), index) } + /// Returns true if empty #[inline] pub fn is_empty(&self) -> bool { self.len == 0 } + /// Returns the capacity of the buffer #[inline] pub fn capacity(&self) -> usize { self.buffer.capacity() * 8 } + /// Advances the buffer by `additional` bits #[inline] pub fn advance(&mut self, additional: usize) { let new_len = self.len + additional; @@ -99,6 +108,7 @@ impl BooleanBufferBuilder { self.len = len; } + /// Appends a boolean `v` into the buffer #[inline] pub fn append(&mut self, v: bool) { self.advance(1); @@ -107,6 +117,7 @@ impl BooleanBufferBuilder { } } + /// Appends n `additional` bits of value `v` into the buffer #[inline] pub fn append_n(&mut self, additional: usize, v: bool) { self.advance(additional); @@ -118,6 +129,7 @@ impl BooleanBufferBuilder { } } + /// Appends a slice of booleans into the buffer #[inline] pub fn append_slice(&mut self, slice: &[bool]) { let additional = slice.len(); @@ -156,6 +168,7 @@ impl BooleanBufferBuilder { self.buffer.as_slice() } + /// Creates a [`Buffer`] #[inline] pub fn finish(&mut self) -> Buffer { let buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); diff --git a/arrow-array/src/builder/buffer_builder.rs b/arrow-array/src/builder/buffer_builder.rs index d3146366d512..d4eed0de9de7 100644 --- a/arrow-array/src/builder/buffer_builder.rs +++ b/arrow-array/src/builder/buffer_builder.rs @@ -21,47 +21,78 @@ use std::marker::PhantomData; use crate::types::*; +/// Buffer builder for signed 8-bit integer type. pub type Int8BufferBuilder = BufferBuilder; +/// Buffer builder for signed 16-bit integer type. pub type Int16BufferBuilder = BufferBuilder; +/// Buffer builder for signed 32-bit integer type. pub type Int32BufferBuilder = BufferBuilder; +/// Buffer builder for signed 64-bit integer type. pub type Int64BufferBuilder = BufferBuilder; +/// Buffer builder for usigned 8-bit integer type. pub type UInt8BufferBuilder = BufferBuilder; +/// Buffer builder for usigned 16-bit integer type. 
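A compact editorial sketch exercising the `BooleanBufferBuilder` methods documented above:

```rust
use arrow_array::builder::BooleanBufferBuilder;

fn main() {
    let mut builder = BooleanBufferBuilder::new(8);
    builder.append(true);
    builder.append_n(2, false);
    builder.append_slice(&[true, true]);
    assert_eq!(builder.len(), 5);
    assert!(builder.get_bit(0));
    assert!(!builder.get_bit(1));

    // `finish` packs the bits into an arrow_buffer::Buffer; 5 bits fit in one byte.
    let buffer = builder.finish();
    assert_eq!(buffer.len(), 1);
}
```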
pub type UInt16BufferBuilder = BufferBuilder; +/// Buffer builder for usigned 32-bit integer type. pub type UInt32BufferBuilder = BufferBuilder; +/// Buffer builder for usigned 64-bit integer type. pub type UInt64BufferBuilder = BufferBuilder; +/// Buffer builder for 32-bit floating point type. pub type Float32BufferBuilder = BufferBuilder; +/// Buffer builder for 64-bit floating point type. pub type Float64BufferBuilder = BufferBuilder; +/// Buffer builder for timestamp type of second unit. pub type TimestampSecondBufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for timestamp type of millisecond unit. pub type TimestampMillisecondBufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for timestamp type of microsecond unit. pub type TimestampMicrosecondBufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for timestamp type of nanosecond unit. pub type TimestampNanosecondBufferBuilder = BufferBuilder<::Native>; + +/// Buffer builder for 32-bit date type. pub type Date32BufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for 64-bit date type. pub type Date64BufferBuilder = BufferBuilder<::Native>; + +/// Buffer builder for 32-bit elaspsed time since midnight of second unit. pub type Time32SecondBufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for 32-bit elaspsed time since midnight of millisecond unit. pub type Time32MillisecondBufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for 64-bit elaspsed time since midnight of microsecond unit. pub type Time64MicrosecondBufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for 64-bit elaspsed time since midnight of nanosecond unit. pub type Time64NanosecondBufferBuilder = BufferBuilder<::Native>; + +/// Buffer builder for “calendar” interval in months. pub type IntervalYearMonthBufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for “calendar” interval in days and milliseconds. pub type IntervalDayTimeBufferBuilder = BufferBuilder<::Native>; +/// Buffer builder “calendar” interval in months, days, and nanoseconds. pub type IntervalMonthDayNanoBufferBuilder = BufferBuilder<::Native>; + +/// Buffer builder for elaspsed time of second unit. pub type DurationSecondBufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for elaspsed time of milliseconds unit. pub type DurationMillisecondBufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for elaspsed time of microseconds unit. pub type DurationMicrosecondBufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for elaspsed time of nanoseconds unit. 
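The aliases documented above are thin wrappers over `BufferBuilder<T>` for the matching native type; a minimal editorial usage sketch:

```rust
use arrow_array::builder::Int32BufferBuilder;

fn main() {
    let mut builder = Int32BufferBuilder::new(16);
    builder.append(1);
    builder.append_slice(&[2, 3, 4]);
    assert_eq!(builder.len(), 4);

    // The resulting Buffer reports its length in bytes.
    let buffer = builder.finish();
    assert_eq!(buffer.len(), 4 * std::mem::size_of::<i32>());
}
```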
pub type DurationNanosecondBufferBuilder = BufferBuilder<::Native>; @@ -124,6 +155,7 @@ impl BufferBuilder { } } + /// Creates a new builder from a [`MutableBuffer`] pub fn new_from_buffer(buffer: MutableBuffer) -> Self { let buffer_len = buffer.len(); Self { diff --git a/arrow-array/src/builder/fixed_size_binary_builder.rs b/arrow-array/src/builder/fixed_size_binary_builder.rs index e9581922ccaa..4c8225adf153 100644 --- a/arrow-array/src/builder/fixed_size_binary_builder.rs +++ b/arrow-array/src/builder/fixed_size_binary_builder.rs @@ -24,6 +24,22 @@ use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; +/// A fixed size binary array builder +/// ``` +/// use arrow_array::builder::FixedSizeBinaryBuilder; +/// use arrow_array::Array; +/// +/// let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5); +/// // [b"hello", null, b"arrow"] +/// builder.append_value(b"hello").unwrap(); +/// builder.append_null(); +/// builder.append_value(b"arrow").unwrap(); +/// +/// let array = builder.finish(); +/// assert_eq!(array.value(0), b"hello"); +/// assert!(array.is_null(1)); +/// assert_eq!(array.value(2), b"arrow"); +/// ``` #[derive(Debug)] pub struct FixedSizeBinaryBuilder { values_builder: UInt8BufferBuilder, diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index 516c22925786..bc4ce466ac39 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -25,6 +25,44 @@ use std::any::Any; use std::sync::Arc; /// Array builder for [`FixedSizeListArray`] +/// ``` +/// use arrow_array::{builder::{Int32Builder, FixedSizeListBuilder}, Array, Int32Array}; +/// let values_builder = Int32Builder::new(); +/// let mut builder = FixedSizeListBuilder::new(values_builder, 3); +/// +/// // [[0, 1, 2], null, [3, null, 5], [6, 7, null]] +/// builder.values().append_value(0); +/// builder.values().append_value(1); +/// builder.values().append_value(2); +/// builder.append(true); +/// builder.values().append_null(); +/// builder.values().append_null(); +/// builder.values().append_null(); +/// builder.append(false); +/// builder.values().append_value(3); +/// builder.values().append_null(); +/// builder.values().append_value(5); +/// builder.append(true); +/// builder.values().append_value(6); +/// builder.values().append_value(7); +/// builder.values().append_null(); +/// builder.append(true); +/// let list_array = builder.finish(); +/// assert_eq!( +/// *list_array.value(0), +/// Int32Array::from(vec![Some(0), Some(1), Some(2)]) +/// ); +/// assert!(list_array.is_null(1)); +/// assert_eq!( +/// *list_array.value(2), +/// Int32Array::from(vec![Some(3), None, Some(5)]) +/// ); +/// assert_eq!( +/// *list_array.value(3), +/// Int32Array::from(vec![Some(6), Some(7), None]) +/// ) +/// ``` +/// #[derive(Debug)] pub struct FixedSizeListBuilder { null_buffer_builder: NullBufferBuilder, @@ -104,6 +142,7 @@ where &mut self.values_builder } + /// Returns the length of the list pub fn value_length(&self) -> i32 { self.list_len } diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 5602f88636c3..737b4fa72de1 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -24,6 +24,43 @@ use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; use std::sync::Arc; +/// Creates a new `MapBuilder` +/// ``` +/// use arrow_array::builder::{MapBuilder, Int32Builder, StringBuilder}; 
+/// use arrow_array::{StringArray, Int32Array}; +/// use std::sync::Arc; +/// +/// let string_builder = StringBuilder::new(); +/// let int_builder = Int32Builder::with_capacity(4); +/// +/// let mut builder = MapBuilder::new(None, string_builder, int_builder); +/// +/// let string_builder = builder.keys(); +/// string_builder.append_value("joe"); +/// string_builder.append_null(); +/// string_builder.append_null(); +/// string_builder.append_value("mark"); +/// +/// let int_builder = builder.values(); +/// int_builder.append_value(1); +/// int_builder.append_value(2); +/// int_builder.append_null(); +/// int_builder.append_value(4); +/// +/// builder.append(true).unwrap(); +/// builder.append(false).unwrap(); +/// builder.append(true).unwrap(); +/// +/// let arr = builder.finish(); +/// assert_eq!( +/// *arr.values(), +/// Int32Array::from(vec![Some(1), Some(2), None, Some(4)]) +/// ); +/// assert_eq!( +/// *arr.keys(), +/// StringArray::from(vec![Some("joe"), None, None, Some("mark")]) +/// ); +/// ``` #[derive(Debug)] pub struct MapBuilder { offsets_builder: BufferBuilder, @@ -33,10 +70,14 @@ pub struct MapBuilder { value_builder: V, } +/// Contains details of the mapping #[derive(Debug, Clone)] pub struct MapFieldNames { + /// [`Field`] name for map entries pub entry: String, + /// [`Field`] name for map key pub key: String, + /// [`Field`] name for map value pub value: String, } @@ -52,6 +93,7 @@ impl Default for MapFieldNames { #[allow(dead_code)] impl MapBuilder { + /// Creates a new `MapBuilder` pub fn new( field_names: Option, key_builder: K, @@ -61,6 +103,7 @@ impl MapBuilder { Self::with_capacity(field_names, key_builder, value_builder, capacity) } + /// Creates a new `MapBuilder` with capacity pub fn with_capacity( field_names: Option, key_builder: K, @@ -79,10 +122,12 @@ impl MapBuilder { } } + /// Returns the key array builder of the map pub fn keys(&mut self) -> &mut K { &mut self.key_builder } + /// Returns the value array builder of the map pub fn values(&mut self) -> &mut V { &mut self.value_builder } @@ -104,6 +149,7 @@ impl MapBuilder { Ok(()) } + /// Builds the [`MapArray`] pub fn finish(&mut self) -> MapArray { let len = self.len(); @@ -144,6 +190,7 @@ impl MapBuilder { MapArray::from(array_data) } + /// Builds the [`MapArray`] without resetting the builder. 
pub fn finish_cloned(&self) -> MapArray { let len = self.len(); diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index eaf8243973b8..3486e396b671 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -128,11 +128,17 @@ pub trait ArrayBuilder: Any + Send { fn into_box_any(self: Box) -> Box; } +/// A list array builder with i32 offsets pub type ListBuilder = GenericListBuilder; +/// A list array builder with i64 offsets pub type LargeListBuilder = GenericListBuilder; +/// A binary array builder with i32 offsets pub type BinaryBuilder = GenericBinaryBuilder; +/// A binary array builder with i64 offsets pub type LargeBinaryBuilder = GenericBinaryBuilder; +/// A string array builder with i32 offsets pub type StringBuilder = GenericStringBuilder; +/// A string array builder with i64 offsets pub type LargeStringBuilder = GenericStringBuilder; diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 7a1fbafc76ff..ef420dcbc295 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -24,36 +24,69 @@ use arrow_data::ArrayData; use std::any::Any; use std::sync::Arc; +/// A signed 8-bit integer array builder. pub type Int8Builder = PrimitiveBuilder; +/// A signed 16-bit integer array builder. pub type Int16Builder = PrimitiveBuilder; +/// A signed 32-bit integer array builder. pub type Int32Builder = PrimitiveBuilder; +/// A signed 64-bit integer array builder. pub type Int64Builder = PrimitiveBuilder; +/// An unsigned 8-bit integer array builder. pub type UInt8Builder = PrimitiveBuilder; +/// An unsigned 16-bit integer array builder. pub type UInt16Builder = PrimitiveBuilder; +/// An unsigned 32-bit integer array builder. pub type UInt32Builder = PrimitiveBuilder; +/// An unsigned 64-bit integer array builder. pub type UInt64Builder = PrimitiveBuilder; +/// A 32-bit floating point array builder. pub type Float32Builder = PrimitiveBuilder; +/// A 64-bit floating point array builder. pub type Float64Builder = PrimitiveBuilder; +/// A timestamp second array builder. pub type TimestampSecondBuilder = PrimitiveBuilder; +/// A timestamp millisecond array builder. pub type TimestampMillisecondBuilder = PrimitiveBuilder; +/// A timestamp microsecond array builder. pub type TimestampMicrosecondBuilder = PrimitiveBuilder; +/// A timestamp nanosecond array builder. pub type TimestampNanosecondBuilder = PrimitiveBuilder; + +/// A 32-bit date array builder. pub type Date32Builder = PrimitiveBuilder; +/// A 64-bit date array builder. pub type Date64Builder = PrimitiveBuilder; + +/// A 32-bit elapsed time in seconds array builder. pub type Time32SecondBuilder = PrimitiveBuilder; +/// A 32-bit elapsed time in milliseconds array builder. pub type Time32MillisecondBuilder = PrimitiveBuilder; +/// A 64-bit elapsed time in microseconds array builder. pub type Time64MicrosecondBuilder = PrimitiveBuilder; +/// A 64-bit elapsed time in nanoseconds array builder. pub type Time64NanosecondBuilder = PrimitiveBuilder; + +/// A “calendar” interval in months array builder. pub type IntervalYearMonthBuilder = PrimitiveBuilder; +/// A “calendar” interval in days and milliseconds array builder. pub type IntervalDayTimeBuilder = PrimitiveBuilder; +/// A “calendar” interval in months, days, and nanoseconds array builder. pub type IntervalMonthDayNanoBuilder = PrimitiveBuilder; + +/// An elapsed time in seconds array builder.
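+///
+/// # Example
+///
+/// An illustrative usage sketch added in editing (not part of the upstream
+/// commit); it assumes only the `PrimitiveBuilder` API documented above:
+///
+/// ```
+/// use arrow_array::builder::DurationSecondBuilder;
+/// use arrow_array::Array;
+///
+/// let mut builder = DurationSecondBuilder::new();
+/// builder.append_value(60); // one minute
+/// builder.append_null();
+///
+/// let array = builder.finish();
+/// assert_eq!(array.len(), 2);
+/// assert_eq!(array.value(0), 60);
+/// assert!(array.is_null(1));
+/// ```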
pub type DurationSecondBuilder = PrimitiveBuilder; +/// An elapsed time in milliseconds array builder. pub type DurationMillisecondBuilder = PrimitiveBuilder; +/// An elapsed time in microseconds array builder. pub type DurationMicrosecondBuilder = PrimitiveBuilder; +/// An elapsed time in nanoseconds array builder. pub type DurationNanosecondBuilder = PrimitiveBuilder; +/// A decimal 128 array builder pub type Decimal128Builder = PrimitiveBuilder; +/// A decimal 256 array builder pub type Decimal256Builder = PrimitiveBuilder; /// Array builder for fixed-width primitive types @@ -120,6 +153,7 @@ impl PrimitiveBuilder { } } + /// Creates a new primitive array builder from buffers pub fn new_from_buffer( values_buffer: MutableBuffer, null_buffer: Option, @@ -157,6 +191,7 @@ impl PrimitiveBuilder { self.values_builder.advance(1); } + /// Appends `n` nulls into the builder #[inline] pub fn append_nulls(&mut self, n: usize) { self.null_buffer_builder.append_n_nulls(n); diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 5b8a7283528a..4640902d870f 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -193,6 +193,7 @@ where Ok(key) } + /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { self.keys_builder.append_null() diff --git a/arrow-array/src/builder/string_dictionary_builder.rs b/arrow-array/src/builder/string_dictionary_builder.rs index f44756b6bcc5..878cfc727631 100644 --- a/arrow-array/src/builder/string_dictionary_builder.rs +++ b/arrow-array/src/builder/string_dictionary_builder.rs @@ -270,6 +270,7 @@ where Ok(key) } + /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { self.keys_builder.append_null() diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 98d0e1a1d275..12bcaf0944ef 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -174,6 +174,7 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box, field_builders: Vec>) -> Self { Self { fields, @@ -182,6 +183,7 @@ impl StructBuilder { } } + /// Creates a new `StructBuilder` from a vector of [`Field`] with `capacity` pub fn from_fields(fields: Vec, capacity: usize) -> Self { let mut builders = Vec::with_capacity(fields.len()); for field in &fields { diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index 351f90bacfc6..e7c5e8367e23 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -116,10 +116,15 @@ impl ExactSizeIterator for ArrayIter {} /// an iterator that returns Some(T) or None, that can be used on any PrimitiveArray pub type PrimitiveIter<'a, T> = ArrayIter<&'a PrimitiveArray>; +/// an iterator that returns Some(T) or None, that can be used on any BooleanArray pub type BooleanIter<'a> = ArrayIter<&'a BooleanArray>; +/// an iterator that returns Some(T) or None, that can be used on any Utf8Array pub type GenericStringIter<'a, T> = ArrayIter<&'a GenericStringArray>; +/// an iterator that returns Some(T) or None, that can be used on any BinaryArray pub type GenericBinaryIter<'a, T> = ArrayIter<&'a GenericBinaryArray>; +/// an iterator that returns Some(T) or None, that can be used on any FixedSizeBinaryArray pub type FixedSizeBinaryIter<'a> = ArrayIter<&'a FixedSizeBinaryArray>; +/// an iterator that returns Some(T) or None, that can be used on
any ListArray pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray>; #[cfg(test)] diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 15267d3080e6..5fcd1f33d480 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -158,6 +158,9 @@ //! [`json`]: https://docs.rs/arrow/latest/arrow/json/index.html //! [`csv`]: https://docs.rs/arrow/latest/arrow/csv/index.html +#![deny(rustdoc::broken_intra_doc_links)] +#![warn(missing_docs)] + pub mod array; pub use array::*; diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 6f2385fa9b4a..ea0eb385358a 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -417,6 +417,7 @@ pub struct RecordBatchOptions { } impl RecordBatchOptions { + /// Creates a new `RecordBatchOptions` pub fn new() -> Self { Self { match_field_names: true, diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 13194d61f015..0646a7f29daf 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -34,16 +34,19 @@ use std::ops::{Add, Sub}; // BooleanType is special: its bit-width is not the size of the primitive type, and its `index` // operation assumes bit-packing. +/// A boolean datatype #[derive(Debug)] pub struct BooleanType {} impl BooleanType { + /// The corresponding arrow [`DataType`] for this type pub const DATA_TYPE: DataType = DataType::Boolean; } macro_rules! make_type { - ($name:ident, $native_ty:ty, $data_ty:expr) => { + ($name:ident, $native_ty:ty, $data_ty:expr, $doc_string: literal) => { #[derive(Debug)] + #[doc = $doc_string] pub struct $name {} impl ArrowPrimitiveType for $name { @@ -53,89 +56,168 @@ macro_rules! make_type { }; } -make_type!(Int8Type, i8, DataType::Int8); -make_type!(Int16Type, i16, DataType::Int16); -make_type!(Int32Type, i32, DataType::Int32); -make_type!(Int64Type, i64, DataType::Int64); -make_type!(UInt8Type, u8, DataType::UInt8); -make_type!(UInt16Type, u16, DataType::UInt16); -make_type!(UInt32Type, u32, DataType::UInt32); -make_type!(UInt64Type, u64, DataType::UInt64); -make_type!(Float16Type, f16, DataType::Float16); -make_type!(Float32Type, f32, DataType::Float32); -make_type!(Float64Type, f64, DataType::Float64); +make_type!(Int8Type, i8, DataType::Int8, "A signed 8-bit integer type."); +make_type!( + Int16Type, + i16, + DataType::Int16, + "A signed 16-bit integer type." +); +make_type!( + Int32Type, + i32, + DataType::Int32, + "A signed 32-bit integer type." +); +make_type!( + Int64Type, + i64, + DataType::Int64, + "A signed 64-bit integer type." +); +make_type!( + UInt8Type, + u8, + DataType::UInt8, + "An unsigned 8-bit integer type." +); +make_type!( + UInt16Type, + u16, + DataType::UInt16, + "An unsigned 16-bit integer type." +); +make_type!( + UInt32Type, + u32, + DataType::UInt32, + "An unsigned 32-bit integer type." +); +make_type!( + UInt64Type, + u64, + DataType::UInt64, + "An unsigned 64-bit integer type." +); +make_type!( + Float16Type, + f16, + DataType::Float16, + "A 16-bit floating point number type." +); +make_type!( + Float32Type, + f32, + DataType::Float32, + "A 32-bit floating point number type." +); +make_type!( + Float64Type, + f64, + DataType::Float64, + "A 64-bit floating point number type." +); make_type!( TimestampSecondType, i64, - DataType::Timestamp(TimeUnit::Second, None) + DataType::Timestamp(TimeUnit::Second, None), + "A timestamp second type with an optional timezone."
); make_type!( TimestampMillisecondType, i64, - DataType::Timestamp(TimeUnit::Millisecond, None) + DataType::Timestamp(TimeUnit::Millisecond, None), + "A timestamp millisecond type with an optional timezone." ); make_type!( TimestampMicrosecondType, i64, - DataType::Timestamp(TimeUnit::Microsecond, None) + DataType::Timestamp(TimeUnit::Microsecond, None), + "A timestamp microsecond type with an optional timezone." ); make_type!( TimestampNanosecondType, i64, - DataType::Timestamp(TimeUnit::Nanosecond, None) + DataType::Timestamp(TimeUnit::Nanosecond, None), + "A timestamp nanosecond type with an optional timezone." +); +make_type!( + Date32Type, + i32, + DataType::Date32, + "A 32-bit date type representing the elapsed time since the UNIX epoch in days." +); +make_type!( + Date64Type, + i64, + DataType::Date64, + "A 64-bit date type representing the elapsed time since the UNIX epoch in milliseconds." +); +make_type!( + Time32SecondType, + i32, + DataType::Time32(TimeUnit::Second), + "A 32-bit time type representing the elapsed time since midnight in seconds." ); -make_type!(Date32Type, i32, DataType::Date32); -make_type!(Date64Type, i64, DataType::Date64); -make_type!(Time32SecondType, i32, DataType::Time32(TimeUnit::Second)); make_type!( Time32MillisecondType, i32, - DataType::Time32(TimeUnit::Millisecond) + DataType::Time32(TimeUnit::Millisecond), + "A 32-bit time type representing the elapsed time since midnight in milliseconds." ); make_type!( Time64MicrosecondType, i64, - DataType::Time64(TimeUnit::Microsecond) + DataType::Time64(TimeUnit::Microsecond), + "A 64-bit time type representing the elapsed time since midnight in microseconds." ); make_type!( Time64NanosecondType, i64, - DataType::Time64(TimeUnit::Nanosecond) + DataType::Time64(TimeUnit::Nanosecond), + "A 64-bit time type representing the elapsed time since midnight in nanoseconds." ); make_type!( IntervalYearMonthType, i32, - DataType::Interval(IntervalUnit::YearMonth) + DataType::Interval(IntervalUnit::YearMonth), + "A “calendar” interval type in months." ); make_type!( IntervalDayTimeType, i64, - DataType::Interval(IntervalUnit::DayTime) + DataType::Interval(IntervalUnit::DayTime), + "A “calendar” interval type in days and milliseconds." ); make_type!( IntervalMonthDayNanoType, i128, - DataType::Interval(IntervalUnit::MonthDayNano) + DataType::Interval(IntervalUnit::MonthDayNano), + "A “calendar” interval type in months, days, and nanoseconds." ); make_type!( DurationSecondType, i64, - DataType::Duration(TimeUnit::Second) + DataType::Duration(TimeUnit::Second), + "An elapsed time type in seconds." ); make_type!( DurationMillisecondType, i64, - DataType::Duration(TimeUnit::Millisecond) + DataType::Duration(TimeUnit::Millisecond), + "An elapsed time type in milliseconds." ); make_type!( DurationMicrosecondType, i64, - DataType::Duration(TimeUnit::Microsecond) + DataType::Duration(TimeUnit::Microsecond), + "An elapsed time type in microseconds." ); make_type!( DurationNanosecondType, i64, - DataType::Duration(TimeUnit::Nanosecond) + DataType::Duration(TimeUnit::Nanosecond), + "An elapsed time type in nanoseconds." ); /// A subtype of primitive type that represents legal dictionary keys.
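// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch above): the make_type! change relies
// on forwarding a literal macro parameter into a `#[doc = ...]` attribute so
// that every generated type carries its own docstring. A minimal, standalone
// version of that pattern, with hypothetical names:

macro_rules! make_documented_struct {
    ($name:ident, $doc:literal) => {
        // The literal passed to the macro becomes the item's rustdoc text.
        #[doc = $doc]
        #[derive(Debug)]
        pub struct $name {}
    };
}

// Expands to `pub struct ExampleType {}`, rendered by rustdoc with the
// documentation "An example generated type." — exactly how the strings passed
// to make_type! document the arrow primitive type markers.
make_documented_struct!(ExampleType, "An example generated type.");
// ---------------------------------------------------------------------------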
@@ -489,10 +571,15 @@ mod decimal { pub trait DecimalType: 'static + Send + Sync + ArrowPrimitiveType + decimal::DecimalTypeSealed { + /// Width of the type const BYTE_LENGTH: usize; + /// Maximum number of significant digits const MAX_PRECISION: u8; + /// Maximum number of digits after the decimal point (note the scale can be negative) const MAX_SCALE: i8; + /// Function to create its [`DataType`] const TYPE_CONSTRUCTOR: fn(u8, i8) -> DataType; + /// The default [`DataType`] const DEFAULT_TYPE: DataType; /// "Decimal128" or "Decimal256", for use in error messages @@ -621,10 +708,15 @@ pub(crate) mod bytes { /// /// See [Variable Size Binary Layout](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed { + /// Type of offset, i.e. i32 or i64 type Offset: OffsetSizeTrait; + /// Type representing its equivalent Rust type, e.g. + /// Utf8Array has native type &str and + /// BinaryArray has native type [u8] type Native: bytes::ByteArrayNativeType + AsRef<[u8]> + ?Sized; /// "Binary" or "String", for use in error messages const PREFIX: &'static str; + /// Datatype of array elements const DATA_TYPE: DataType; } @@ -645,7 +737,9 @@ impl ByteArrayType for GenericStringType { }; } +/// An arrow utf8 array with i32 offsets pub type Utf8Type = GenericStringType; +/// An arrow utf8 array with i64 offsets pub type LargeUtf8Type = GenericStringType; /// [`ByteArrayType`] for binary arrays @@ -665,7 +759,9 @@ impl ByteArrayType for GenericBinaryType { }; } +/// An arrow binary array with i32 offsets pub type BinaryType = GenericBinaryType; +/// An arrow binary array with i64 offsets pub type LargeBinaryType = GenericBinaryType; #[cfg(test)] diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs index 21f96d90a5d0..0f1c0064f5a2 100644 --- a/arrow-json/src/lib.rs +++ b/arrow-json/src/lib.rs @@ -19,6 +19,9 @@ //! line-delimited records. See the module level documentation for the //! [`reader`] and [`writer`] for usage examples. +#![deny(rustdoc::broken_intra_doc_links)] +#![warn(missing_docs)] + pub mod reader; pub mod writer; @@ -30,6 +33,7 @@ use serde_json::{Number, Value}; /// Trait declaring any type that is serializable to JSON. This includes all primitive types (bool, i32, etc.).
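+///
+/// # Example
+///
+/// An illustrative sketch added in editing (not part of the upstream change);
+/// it assumes the `JsonSerializable` implementations arrow-json provides for
+/// the Rust primitive types:
+///
+/// ```
+/// use arrow_json::JsonSerializable;
+///
+/// // Integers convert to their serde_json representation...
+/// assert_eq!(1_i32.into_json_value(), Some(serde_json::Value::from(1)));
+/// // ...while a NaN float has no JSON form and yields None.
+/// assert!(f64::NAN.into_json_value().is_none());
+/// ```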
pub trait JsonSerializable: 'static { + /// Converts self into a JSON value if possible fn into_json_value(self) -> Option; } diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index 646d9c0d1975..0d3148c5a055 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -198,6 +198,7 @@ pub struct ValueIter<'a, R: Read> { } impl<'a, R: Read> ValueIter<'a, R> { + /// Creates a new `ValueIter` pub fn new(reader: &'a mut BufReader, max_read_records: Option) -> Self { Self { reader, @@ -613,6 +614,7 @@ impl Default for DecoderOptions { } impl DecoderOptions { + /// Creates a new `DecoderOptions` pub fn new() -> Self { Default::default() } From 2c86895f3672af9a0d835204ccc03108d342361e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 25 Nov 2022 05:49:19 +0000 Subject: [PATCH 0319/1411] Derive clone for arrays (#3184) * Derive clone for arrays * Also derive DictionaryArray * Sidestep derive trait constraints * Clippy --- arrow-array/src/array/boolean_array.rs | 1 + arrow-array/src/array/byte_array.rs | 10 ++++++++++ arrow-array/src/array/dictionary_array.rs | 11 +++++++++++ arrow-array/src/array/fixed_size_binary_array.rs | 1 + arrow-array/src/array/fixed_size_list_array.rs | 1 + arrow-array/src/array/list_array.rs | 10 ++++++++++ arrow-array/src/array/map_array.rs | 1 + arrow-array/src/array/null_array.rs | 1 + arrow-array/src/array/primitive_array.rs | 9 +++++++++ arrow-array/src/array/struct_array.rs | 1 + arrow-array/src/array/union_array.rs | 1 + arrow-array/src/raw_pointer.rs | 8 ++++++++ 12 files changed, 55 insertions(+) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 83af9760da71..e166f467a70c 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -63,6 +63,7 @@ use std::any::Any; /// assert!(arr.is_valid(3)); /// assert_eq!(true, arr.value(3)); /// ``` +#[derive(Clone)] pub struct BooleanArray { data: ArrayData, /// Pointer to the value array. The lifetime of this must be <= to the value buffer diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 8c2616624c0c..f846499eefbf 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -42,6 +42,16 @@ pub struct GenericByteArray { value_offsets: RawPtrBox, value_data: RawPtrBox, } +impl Clone for GenericByteArray { + fn clone(&self) -> Self { + Self { + data: self.data.clone(), + value_offsets: self.value_offsets, + value_data: self.value_data, + } + } +} + impl GenericByteArray { /// Data type of the array.
pub const DATA_TYPE: DataType = T::DATA_TYPE; diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 002ee6f47820..6cff5bfdc9f6 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -222,6 +222,17 @@ pub struct DictionaryArray { is_ordered: bool, } +impl Clone for DictionaryArray { + fn clone(&self) -> Self { + Self { + data: self.data.clone(), + keys: self.keys.clone(), + values: self.values.clone(), + is_ordered: self.is_ordered, + } + } +} + impl DictionaryArray { /// Attempt to create a new DictionaryArray with a specified keys /// (indexes into the dictionary) and values (dictionary) diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 245cf522810d..0d63fdded136 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -47,6 +47,7 @@ use std::any::Any; /// /// ``` /// +#[derive(Clone)] pub struct FixedSizeBinaryArray { data: ArrayData, value_data: RawPtrBox, diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index ca1dee35c41e..e9ceb556c642 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -60,6 +60,7 @@ use std::any::Any; /// /// For non generic lists, you may wish to consider using /// [crate::array::FixedSizeBinaryArray] +#[derive(Clone)] pub struct FixedSizeListArray { data: ArrayData, values: ArrayRef, diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 204a36c32337..3f581a88699e 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -68,6 +68,16 @@ pub struct GenericListArray { value_offsets: RawPtrBox, } +impl Clone for GenericListArray { + fn clone(&self) -> Self { + Self { + data: self.data.clone(), + values: self.values.clone(), + value_offsets: self.value_offsets, + } + } +} + impl GenericListArray { /// The data type constructor of list array. /// The input is the schema of the child array and diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 0f3ae2e689a2..c3e6cf82248c 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -28,6 +28,7 @@ use std::sync::Arc; /// /// [MapArray] is physically a [crate::array::ListArray] that has a /// [crate::array::StructArray] with 2 child fields. +#[derive(Clone)] pub struct MapArray { data: ArrayData, values: ArrayRef, diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index d796324f663f..a5ba953c2201 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -36,6 +36,7 @@ use std::any::Any; /// assert_eq!(array.len(), 10); /// assert_eq!(array.null_count(), 10); /// ``` +#[derive(Clone)] pub struct NullArray { data: ArrayData, } diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 42d183238eac..e3d14e79ded0 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -257,6 +257,15 @@ pub struct PrimitiveArray { raw_values: RawPtrBox, } +impl Clone for PrimitiveArray { + fn clone(&self) -> Self { + Self { + data: self.data.clone(), + raw_values: self.raw_values, + } + } +} + impl PrimitiveArray { /// Returns the length of this array. 
#[inline] diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 841d3235f64b..fcbda600f680 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -50,6 +50,7 @@ use std::any::Any; /// assert_eq!(0, struct_array.null_count()); /// assert_eq!(0, struct_array.offset()); /// ``` +#[derive(Clone)] pub struct StructArray { data: ArrayData, pub(crate) boxed_fields: Vec, diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index c8ccfdc073f2..092f538bf459 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -104,6 +104,7 @@ use std::any::Any; /// let value = array.value(2).as_any().downcast_ref::().unwrap().value(0); /// assert_eq!(34, value); /// ``` +#[derive(Clone)] pub struct UnionArray { data: ArrayData, boxed_fields: Vec, diff --git a/arrow-array/src/raw_pointer.rs b/arrow-array/src/raw_pointer.rs index 3e4233ea1b24..0fea8c186d4c 100644 --- a/arrow-array/src/raw_pointer.rs +++ b/arrow-array/src/raw_pointer.rs @@ -25,6 +25,14 @@ pub(super) struct RawPtrBox { ptr: NonNull, } +impl Clone for RawPtrBox { + fn clone(&self) -> Self { + Self { ptr: self.ptr } + } +} + +impl Copy for RawPtrBox {} + impl RawPtrBox { /// # Safety /// The user must guarantee that: From 187bf619dfafccdb21cea6b2cecabd29daffc1e4 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Fri, 25 Nov 2022 15:06:12 +0800 Subject: [PATCH 0320/1411] fix: cast decimal to decimal should be round the result (#3139) Co-authored-by: Raphael Taylor-Davies --- arrow-cast/src/cast.rs | 192 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 179 insertions(+), 13 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 3f17758255c7..07c7d6a3ac55 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -1967,12 +1967,26 @@ fn cast_decimal_to_decimal_safe().unwrap(); if BYTE_WIDTH2 == 16 { - let iter = array - .iter() - .map(|v| v.and_then(|v| v.div_checked(div).ok())); + // rounding the result + let iter = array.iter().map(|v| { + v.map(|v| { + // the div must be gt_eq 10, we don't need to check the overflow for the `div`/`mod` operation + let d = v.wrapping_div(div); + let r = v.wrapping_rem(div); + if v >= 0 && r >= half { + d.wrapping_add(1) + } else if v < 0 && r <= neg_half { + d.wrapping_sub(1) + } else { + d + } + }) + }); let casted_array = unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }; @@ -1981,7 +1995,17 @@ fn cast_decimal_to_decimal_safe= 0 && r >= half { + d.wrapping_add(1) + } else if v < 0 && r <= neg_half { + d.wrapping_sub(1) + } else { + d + }) + }) }); let casted_array = unsafe { PrimitiveArray::::from_trusted_len_iter(iter) @@ -1993,9 +2017,22 @@ fn cast_decimal_to_decimal_safe().unwrap(); let div = i256::from_i128(div); + let half = div / i256::from_i128(2); + let neg_half = half.wrapping_neg(); if BYTE_WIDTH2 == 16 { let iter = array.iter().map(|v| { - v.and_then(|v| v.div_checked(div).ok().and_then(|v| v.to_i128())) + v.and_then(|v| { + let d = v.wrapping_div(div); + let r = v.wrapping_rem(div); + if v >= i256::ZERO && r >= half { + d.wrapping_add(i256::ONE) + } else if v < i256::ZERO && r <= neg_half { + d.wrapping_sub(i256::ONE) + } else { + d + } + .to_i128() + }) }); let casted_array = unsafe { PrimitiveArray::::from_trusted_len_iter(iter) @@ -2004,9 +2041,19 @@ fn cast_decimal_to_decimal_safe= i256::ZERO && r >= half { + d.wrapping_add(i256::ONE) + } else if v < i256::ZERO && r <= neg_half { + 
d.wrapping_sub(i256::ONE) + } else { + d + } + }) + }); let casted_array = unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }; @@ -3566,6 +3613,125 @@ mod tests { .with_precision_and_scale(precision, scale) } + #[test] + #[cfg(not(feature = "force_validate"))] + fn test_cast_decimal_to_decimal_round() { + let array = vec![ + Some(1123454), + Some(2123456), + Some(-3123453), + Some(-3123456), + None, + ]; + let input_decimal_array = create_decimal_array(array, 20, 4).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + // decimal128 to decimal128 + let input_type = DataType::Decimal128(20, 4); + let output_type = DataType::Decimal128(20, 3); + assert!(can_cast_types(&input_type, &output_type)); + generate_cast_test_case!( + &array, + Decimal128Array, + &output_type, + vec![ + Some(112345_i128), + Some(212346_i128), + Some(-312345_i128), + Some(-312346_i128), + None + ] + ); + + // decimal128 to decimal256 + let input_type = DataType::Decimal128(20, 4); + let output_type = DataType::Decimal256(20, 3); + assert!(can_cast_types(&input_type, &output_type)); + generate_cast_test_case!( + &array, + Decimal256Array, + &output_type, + vec![ + Some(i256::from_i128(112345_i128)), + Some(i256::from_i128(212346_i128)), + Some(i256::from_i128(-312345_i128)), + Some(i256::from_i128(-312346_i128)), + None + ] + ); + + // decimal256 + let array = vec![ + Some(i256::from_i128(1123454)), + Some(i256::from_i128(2123456)), + Some(i256::from_i128(-3123453)), + Some(i256::from_i128(-3123456)), + None, + ]; + let input_decimal_array = create_decimal256_array(array, 20, 4).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + + // decimal256 to decimal256 + let input_type = DataType::Decimal256(20, 4); + let output_type = DataType::Decimal256(20, 3); + assert!(can_cast_types(&input_type, &output_type)); + generate_cast_test_case!( + &array, + Decimal256Array, + &output_type, + vec![ + Some(i256::from_i128(112345_i128)), + Some(i256::from_i128(212346_i128)), + Some(i256::from_i128(-312345_i128)), + Some(i256::from_i128(-312346_i128)), + None + ] + ); + // decimal256 to decimal128 + let input_type = DataType::Decimal256(20, 4); + let output_type = DataType::Decimal128(20, 3); + assert!(can_cast_types(&input_type, &output_type)); + generate_cast_test_case!( + &array, + Decimal128Array, + &output_type, + vec![ + Some(112345_i128), + Some(212346_i128), + Some(-312345_i128), + Some(-312346_i128), + None + ] + ); + + // decimal256 to decimal128 overflow + let array = vec![ + Some(i256::from_i128(1123454)), + Some(i256::from_i128(2123456)), + Some(i256::from_i128(-3123453)), + Some(i256::from_i128(-3123456)), + None, + Some(i256::MAX), + Some(i256::MIN), + ]; + let input_decimal_array = create_decimal256_array(array, 76, 4).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + assert!(can_cast_types(&input_type, &output_type)); + generate_cast_test_case!( + &array, + Decimal128Array, + &output_type, + vec![ + Some(112345_i128), + Some(212346_i128), + Some(-312345_i128), + Some(-312346_i128), + None, + None, + None + ] + ); + } + #[test] #[cfg(not(feature = "force_validate"))] fn test_cast_decimal128_to_decimal128() { @@ -7219,7 +7385,7 @@ mod tests { let input_type = DataType::Decimal128(20, 0); let output_type = DataType::Decimal128(20, -1); assert!(can_cast_types(&input_type, &output_type)); - let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; + let array = vec![Some(1123450), Some(2123455), Some(3123456), None]; let input_decimal_array = 
create_decimal_array(array, 20, 0).unwrap(); let array = Arc::new(input_decimal_array) as ArrayRef; generate_cast_test_case!( @@ -7228,8 +7394,8 @@ mod tests { &output_type, vec![ Some(112345_i128), - Some(212345_i128), - Some(312345_i128), + Some(212346_i128), + Some(312346_i128), None ] ); @@ -7238,8 +7404,8 @@ mod tests { let decimal_arr = as_primitive_array::(&casted_array); assert_eq!("1123450", decimal_arr.value_as_string(0)); - assert_eq!("2123450", decimal_arr.value_as_string(1)); - assert_eq!("3123450", decimal_arr.value_as_string(2)); + assert_eq!("2123460", decimal_arr.value_as_string(1)); + assert_eq!("3123460", decimal_arr.value_as_string(2)); } #[test] From d74c48e0541aba2941daf6ea2ce8dce84619bda5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 25 Nov 2022 07:06:30 +0000 Subject: [PATCH 0321/1411] Row decode cleanups (#3180) * Row decode cleanups * Clippy --- arrow/src/row/fixed.rs | 65 ++++++++++++++------------------------ arrow/src/row/mod.rs | 71 +++++++++--------------------------------- 2 files changed, 39 insertions(+), 97 deletions(-) diff --git a/arrow/src/row/fixed.rs b/arrow/src/row/fixed.rs index 76bf358e7e03..0bad033d9bd8 100644 --- a/arrow/src/row/fixed.rs +++ b/arrow/src/row/fixed.rs @@ -19,8 +19,9 @@ use crate::array::PrimitiveArray; use crate::compute::SortOptions; use crate::datatypes::ArrowPrimitiveType; use crate::row::{null_sentinel, Rows}; +use arrow_array::builder::BufferBuilder; use arrow_array::BooleanArray; -use arrow_buffer::{bit_util, i256, MutableBuffer, ToByteSlice}; +use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use half::f16; @@ -266,61 +267,43 @@ pub fn decode_bool(rows: &mut [&[u8]], options: SortOptions) -> BooleanArray { unsafe { BooleanArray::from(builder.build_unchecked()) } } +fn decode_nulls(rows: &[&[u8]]) -> (usize, Buffer) { + let mut null_count = 0; + let buffer = MutableBuffer::collect_bool(rows.len(), |idx| { + let valid = rows[idx][0] == 1; + null_count += !valid as usize; + valid + }) + .into(); + (null_count, buffer) +} + /// Decodes a `ArrayData` from rows based on the provided `FixedLengthEncoding` `T` /// /// # Safety /// /// `data_type` must be appropriate native type for `T` -unsafe fn decode_fixed( +unsafe fn decode_fixed( rows: &mut [&[u8]], data_type: DataType, options: SortOptions, ) -> ArrayData { let len = rows.len(); - let mut null_count = 0; - let mut nulls = MutableBuffer::new(bit_util::ceil(len, 64) * 8); - let mut values = MutableBuffer::new(std::mem::size_of::() * len); + let mut values = BufferBuilder::::new(len); + let (null_count, nulls) = decode_nulls(rows); - let chunks = len / 64; - let remainder = len % 64; - for chunk in 0..chunks { - let mut null_packed = 0; - - for bit_idx in 0..64 { - let i = split_off(&mut rows[bit_idx + chunk * 64], T::ENCODED_LEN); - let null = i[0] == 1; - null_count += !null as usize; - null_packed |= (null as u64) << bit_idx; - - let value = T::Encoded::from_slice(&i[1..], options.descending); - values.push(T::decode(value)); - } - - nulls.push(null_packed); - } - - if remainder != 0 { - let mut null_packed = 0; - - for bit_idx in 0..remainder { - let i = split_off(&mut rows[bit_idx + chunks * 64], T::ENCODED_LEN); - let null = i[0] == 1; - null_count += !null as usize; - null_packed |= (null as u64) << bit_idx; - - let value = T::Encoded::from_slice(&i[1..], options.descending); - values.push(T::decode(value)); - } 
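// ---------------------------------------------------------------------------
// Editor's sketch (not part of either patch shown here) illustrating the
// half-away-from-zero rounding applied by the decimal-to-decimal cast fix
// above ("cast decimal to decimal should be round the result") when a cast
// reduces the scale. The helper name `round_div` is hypothetical:

fn round_div(v: i128, div: i128) -> i128 {
    let half = div / 2;
    let d = v / div; // truncating division toward zero
    let r = v % div; // remainder carries the sign of `v`
    if v >= 0 && r >= half {
        d + 1
    } else if v < 0 && r <= -half {
        d - 1
    } else {
        d
    }
}

fn main() {
    // Reducing scale 4 -> 3 divides by 10 and rounds the dropped digit,
    // matching the expectations in the test vectors added above.
    assert_eq!(round_div(1_123_454, 10), 112_345); // .4 rounds down
    assert_eq!(round_div(2_123_456, 10), 212_346); // .6 rounds up
    assert_eq!(round_div(-3_123_453, 10), -312_345); // -.3 rounds toward zero
    assert_eq!(round_div(-3_123_456, 10), -312_346); // -.6 rounds away from zero
}
// ---------------------------------------------------------------------------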
- - nulls.push(null_packed); + for row in rows { + let i = split_off(row, T::ENCODED_LEN); + let value = T::Encoded::from_slice(&i[1..], options.descending); + values.append(T::decode(value)); } let builder = ArrayDataBuilder::new(data_type) - .len(rows.len()) + .len(len) .null_count(null_count) - .add_buffer(values.into()) - .null_bit_buffer(Some(nulls.into())); + .add_buffer(values.finish()) + .null_bit_buffer(Some(nulls)); // SAFETY: Buffers correct length builder.build_unchecked() @@ -333,7 +316,7 @@ pub fn decode_primitive( options: SortOptions, ) -> PrimitiveArray where - T::Native: FixedLengthEncoding + ToByteSlice, + T::Native: FixedLengthEncoding, { assert_eq!( std::mem::discriminant(&T::DATA_TYPE), diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 6ce9f2b12c25..4f48b46cb2a6 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -908,11 +908,22 @@ fn encode_column( } macro_rules! decode_primitive_helper { - ($t:ty, $rows: ident, $data_type:ident, $options:ident) => { + ($t:ty, $rows:ident, $data_type:ident, $options:ident) => { Arc::new(decode_primitive::<$t>($rows, $data_type, $options)) }; } +macro_rules! decode_dictionary_helper { + ($t:ty, $interner:ident, $v:ident, $options:ident, $rows:ident) => { + Arc::new(decode_dictionary::<$t>( + $interner.unwrap(), + $v.as_ref(), + $options, + $rows, + )?) + }; +} + /// Decodes a the provided `field` from `rows` /// /// # Safety @@ -934,61 +945,9 @@ unsafe fn decode_column( DataType::LargeBinary => Arc::new(decode_binary::(rows, options)), DataType::Utf8 => Arc::new(decode_string::(rows, options, validate_utf8)), DataType::LargeUtf8 => Arc::new(decode_string::(rows, options, validate_utf8)), - DataType::Dictionary(k, v) => match k.as_ref() { - DataType::Int8 => Arc::new(decode_dictionary::( - interner.unwrap(), - v.as_ref(), - options, - rows, - )?), - DataType::Int16 => Arc::new(decode_dictionary::( - interner.unwrap(), - v.as_ref(), - options, - rows, - )?), - DataType::Int32 => Arc::new(decode_dictionary::( - interner.unwrap(), - v.as_ref(), - options, - rows, - )?), - DataType::Int64 => Arc::new(decode_dictionary::( - interner.unwrap(), - v.as_ref(), - options, - rows, - )?), - DataType::UInt8 => Arc::new(decode_dictionary::( - interner.unwrap(), - v.as_ref(), - options, - rows, - )?), - DataType::UInt16 => Arc::new(decode_dictionary::( - interner.unwrap(), - v.as_ref(), - options, - rows, - )?), - DataType::UInt32 => Arc::new(decode_dictionary::( - interner.unwrap(), - v.as_ref(), - options, - rows, - )?), - DataType::UInt64 => Arc::new(decode_dictionary::( - interner.unwrap(), - v.as_ref(), - options, - rows, - )?), - _ => { - return Err(ArrowError::InvalidArgumentError(format!( - "{} is not a valid dictionary key type", - field.data_type - ))); - } + DataType::Dictionary(k, v) => downcast_integer! 
{ + k.as_ref() => (decode_dictionary_helper, interner, v, options, rows), + _ => unreachable!() }, _ => { return Err(ArrowError::NotYetImplemented(format!( From 4c37ec24a95a4a0c30f5f5eb4e81fd3647816a6f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 25 Nov 2022 08:29:49 +0000 Subject: [PATCH 0322/1411] StructArray::columns return slice (#3186) --- arrow-array/src/array/struct_array.rs | 7 ++++--- arrow-cast/src/display.rs | 2 +- arrow-ipc/src/writer.rs | 2 +- parquet/src/arrow/arrow_writer/levels.rs | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index fcbda600f680..6c6490e3168f 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -68,13 +68,14 @@ impl StructArray { } /// Returns the fields of the struct array - pub fn columns(&self) -> Vec<&ArrayRef> { - self.boxed_fields.iter().collect() + pub fn columns(&self) -> &[ArrayRef] { + &self.boxed_fields } /// Returns child array refs of the struct array + #[deprecated(note = "Use columns().to_vec()")] pub fn columns_ref(&self) -> Vec { - self.boxed_fields.clone() + self.columns().to_vec() } /// Return field names in this struct array diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index ae1c799a4ef8..434f750afc48 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -459,7 +459,7 @@ pub fn array_value_to_string( let mut s = String::new(); s.push('{'); - let mut kv_iter = st.columns().into_iter().zip(st.column_names().into_iter()); + let mut kv_iter = st.columns().iter().zip(st.column_names()); if let Some((col, name)) = kv_iter.next() { append_struct_field_string(&mut s, name, col, row)?; } diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index dec44de177f3..0497cbe5e47f 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -177,7 +177,7 @@ impl IpcDataGenerator { match column.data_type() { DataType::Struct(fields) => { let s = as_struct_array(column); - for (field, &column) in fields.iter().zip(s.columns().iter()) { + for (field, column) in fields.iter().zip(s.columns()) { self.encode_dictionaries( field, column, diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index e2a8a8c50e9c..182f68c498ff 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -324,7 +324,7 @@ impl LevelInfoBuilder { }; let write_non_null = |children: &mut [LevelInfoBuilder], range: Range| { - for (child_array, child) in array.columns().into_iter().zip(children) { + for (child_array, child) in array.columns().iter().zip(children) { child.write(child_array, range.clone()) } }; From 3998bed87d9eaa95ea1e1e205bac1fa218a9e0f1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 25 Nov 2022 10:48:17 +0000 Subject: [PATCH 0323/1411] Add sleep to object_store CI (#3189) * Add sleep to object_store CI * Update .github/workflows/object_store.yml Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- .github/workflows/object_store.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 370c1ced380f..23c5bab13a32 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -86,6 +86,8 @@ jobs: - name: Configure Fake GCS Server (GCP emulation) run: | 
docker run -d -p 4443:4443 fsouza/fake-gcs-server -scheme http + # Give the container a moment to start up prior to configuring it + sleep 1 curl -v -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > "$GOOGLE_SERVICE_ACCOUNT" From cbe5af071ce68b2a36d9e9881767ebd95bfdac83 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 25 Nov 2022 12:52:44 +0000 Subject: [PATCH 0324/1411] Tweak row format docs (#3191) --- arrow/src/row/mod.rs | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 4f48b46cb2a6..21d8e4df0624 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -17,19 +17,13 @@ //! A comparable row-oriented representation of a collection of [`Array`]. //! -//! [`Row`]s are [normalized for sorting], and can be very efficiently [compared], -//! using [`memcmp`] under the hood, or used in [non-comparison sorts] such as [radix sort]. This -//! makes the row format ideal for implementing efficient multi-column sorting, -//! grouping, aggregation, windowing and more. +//! [`Row`]s are [normalized for sorting], and can therefore be very efficiently [compared], +//! using [`memcmp`] under the hood, or used in [non-comparison sorts] such as [radix sort]. +//! This makes the row format ideal for implementing efficient multi-column sorting, +//! grouping, aggregation, windowing and more, as described in more detail +//! [here](https://arrow.apache.org/blog/2022/11/07/multi-column-sorts-in-arrow-rust-part-1/). //! -//! The format is described in more detail on [`RowConverter`] as well as the -//! [Fast and Memory Efficient Multi-Column Sorts in Apache Arrow Rust](https://arrow.apache.org/blog/2022/11/07/multi-column-sorts-in-arrow-rust-part-1/) article. -//! -//! _[`Rows`] generated by different [`RowConverter`] are arbitrarily -//! ordered. The same [`RowConverter`] must be used for the comparison -//! to be well defined._ -//! -//! For example, given three input [`Array`]s, this code creates byte +//! For example, given three input [`Array`], [`RowConverter`] creates byte //! sequences that [compare] the same as when using [`lexsort`]. //! //! ```text @@ -50,6 +44,9 @@ //! (Columns) //! ``` //! +//! _[`Rows`] must be generated by the same [`RowConverter`] for the comparison +//! to be meaningful._ +//! //! # Basic Example //! ``` //! 
# use std::sync::Arc; From 14e6212198ce75c9c17147edd3deedf126dae452 Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Fri, 25 Nov 2022 22:20:55 +0530 Subject: [PATCH 0325/1411] Improve regex related kernels (#3192) --- arrow-csv/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index fc4c177bd043..5255244a1214 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -47,7 +47,7 @@ chrono = { version = "0.4.23", default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } lazy_static = { version = "1.4", default-features = false } lexical-core = { version = "^0.8", default-features = false } -regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } +regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } [dev-dependencies] tempfile = "3.3" diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index ab8963b9c300..1e90e1e09e17 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -57,7 +57,7 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.13", default-features = false } -regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } +regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } From 426a3d4868dd17065e81774d00b51931c53c37dc Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner <14581281+iajoiner@users.noreply.github.com> Date: Fri, 25 Nov 2022 13:30:03 -0500 Subject: [PATCH 0326/1411] Update version to 28.0.0 and add changelog (#3181) * Update version * Create changelog --- CHANGELOG-old.md | 121 ++++++++++++- CHANGELOG.md | 179 ++++++++----------- arrow-array/Cargo.toml | 8 +- arrow-buffer/Cargo.toml | 2 +- arrow-cast/Cargo.toml | 12 +- arrow-csv/Cargo.toml | 12 +- arrow-data/Cargo.toml | 6 +- arrow-flight/Cargo.toml | 10 +- arrow-flight/README.md | 2 +- arrow-integration-test/Cargo.toml | 6 +- arrow-integration-testing/Cargo.toml | 2 +- arrow-ipc/Cargo.toml | 12 +- arrow-json/Cargo.toml | 12 +- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow-schema/Cargo.toml | 2 +- arrow-select/Cargo.toml | 10 +- arrow/Cargo.toml | 20 +-- arrow/README.md | 2 +- dev/release/README.md | 2 +- dev/release/update_change_log.sh | 4 +- object_store/CONTRIBUTING.md | 4 +- parquet/Cargo.toml | 20 +-- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 25 files changed, 280 insertions(+), 186 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 946958f1a636..5adb12a913a9 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,123 @@ # Historical Changelog +## [27.0.0](https://github.com/apache/arrow-rs/tree/27.0.0) (2022-11-11) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/26.0.0...27.0.0) + +**Breaking changes:** + +- Recurse into Dictionary value type in DataType::is\_nested [\#3083](https://github.com/apache/arrow-rs/pull/3083) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- early type checks in `RowConverter` [\#3080](https://github.com/apache/arrow-rs/pull/3080) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Add Decimal128 and Decimal256 to downcast\_primitive [\#3056](https://github.com/apache/arrow-rs/pull/3056) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Replace remaining \_generic temporal kernels with \_dyn kernels [\#3046](https://github.com/apache/arrow-rs/pull/3046) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Replace year\_generic with year\_dyn [\#3041](https://github.com/apache/arrow-rs/pull/3041) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Validate decimal256 with i256 directly [\#3025](https://github.com/apache/arrow-rs/pull/3025) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Hadoop LZ4 Support for LZ4 Codec [\#3013](https://github.com/apache/arrow-rs/pull/3013) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([marioloko](https://github.com/marioloko)) +- Replace hour\_generic with hour\_dyn [\#3006](https://github.com/apache/arrow-rs/pull/3006) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Accept any &dyn Array in nullif kernel [\#2940](https://github.com/apache/arrow-rs/pull/2940) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Row Format: Option to detach/own a row [\#3078](https://github.com/apache/arrow-rs/issues/3078) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Row Format: API to check if datatypes are supported [\#3077](https://github.com/apache/arrow-rs/issues/3077) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Deprecate Buffer::count\_set\_bits [\#3067](https://github.com/apache/arrow-rs/issues/3067) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add Decimal128 and Decimal256 to downcast\_primitive [\#3055](https://github.com/apache/arrow-rs/issues/3055) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improved UX of creating `TimestampNanosecondArray` with timezones [\#3042](https://github.com/apache/arrow-rs/issues/3042) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cast decimal256 to signed integer [\#3039](https://github.com/apache/arrow-rs/issues/3039) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support casting Date64 to Timestamp [\#3037](https://github.com/apache/arrow-rs/issues/3037) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Check overflow when casting floating point value to decimal256 [\#3032](https://github.com/apache/arrow-rs/issues/3032) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Compare i256 in validate\_decimal256\_precision [\#3024](https://github.com/apache/arrow-rs/issues/3024) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Check overflow when casting floating point value to decimal128 [\#3020](https://github.com/apache/arrow-rs/issues/3020) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add macro downcast\_temporal\_array [\#3008](https://github.com/apache/arrow-rs/issues/3008) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Replace hour\_generic with hour\_dyn [\#3005](https://github.com/apache/arrow-rs/issues/3005) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Replace temporal \_generic kernels with dyn [\#3004](https://github.com/apache/arrow-rs/issues/3004) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `RowSelection::intersection` [\#3003](https://github.com/apache/arrow-rs/issues/3003) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- I would like to round rather than truncate when casting f64 to decimal [\#2997](https://github.com/apache/arrow-rs/issues/2997) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow::compute::kernels::temporal should support nanoseconds [\#2995](https://github.com/apache/arrow-rs/issues/2995) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Release Arrow `26.0.0` \(next release after `25.0.0`\) [\#2953](https://github.com/apache/arrow-rs/issues/2953) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Add timezone offset for debug format of Timestamp with Timezone [\#2917](https://github.com/apache/arrow-rs/issues/2917) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support merge RowSelectors when creating RowSelection [\#2858](https://github.com/apache/arrow-rs/issues/2858) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Fixed bugs:** + +- Inconsistent Nan Handling Between Scalar and Non-Scalar Comparison Kernels [\#3074](https://github.com/apache/arrow-rs/issues/3074) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Debug format for timestamp ignores timezone [\#3069](https://github.com/apache/arrow-rs/issues/3069) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Row format decode loses timezone [\#3063](https://github.com/apache/arrow-rs/issues/3063) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- binary operator produces incorrect result on arrays with resized null buffer [\#3061](https://github.com/apache/arrow-rs/issues/3061) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RLEDecoder Panics on Null Padded Pages [\#3035](https://github.com/apache/arrow-rs/issues/3035) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Nullif with incorrect valid\_count [\#3031](https://github.com/apache/arrow-rs/issues/3031) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RLEDecoder::get\_batch\_with\_dict may panic on bit-packed runs longer than 1024 [\#3029](https://github.com/apache/arrow-rs/issues/3029) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Converted type is None according to Parquet Tools then utilizing logical types [\#3017](https://github.com/apache/arrow-rs/issues/3017) +- CompressionCodec LZ4 incompatible with C++ implementation [\#2988](https://github.com/apache/arrow-rs/issues/2988) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Documentation updates:** + +- Mark parquet predicate pushdown as complete [\#2987](https://github.com/apache/arrow-rs/pull/2987) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) + +**Merged pull requests:** + +- Improved UX of creating `TimestampNanosecondArray` with timezones [\#3088](https://github.com/apache/arrow-rs/pull/3088) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([src255](https://github.com/src255)) +- Remove unused range module [\#3085](https://github.com/apache/arrow-rs/pull/3085) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Make intersect\_row\_selections a member function [\#3084](https://github.com/apache/arrow-rs/pull/3084) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update hashbrown requirement from 0.12 to 0.13 [\#3081](https://github.com/apache/arrow-rs/pull/3081) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat: add `OwnedRow` [\#3079](https://github.com/apache/arrow-rs/pull/3079) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Use ArrowNativeTypeOp on non-scalar comparison kernels [\#3075](https://github.com/apache/arrow-rs/pull/3075) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add missing inline to ArrowNativeTypeOp [\#3073](https://github.com/apache/arrow-rs/pull/3073) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- fix debug information for Timestamp with Timezone [\#3072](https://github.com/apache/arrow-rs/pull/3072) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Deprecate Buffer::count\_set\_bits \(\#3067\) [\#3071](https://github.com/apache/arrow-rs/pull/3071) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add compare to ArrowNativeTypeOp [\#3070](https://github.com/apache/arrow-rs/pull/3070) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: Improve docstrings on WriterPropertiesBuilder [\#3068](https://github.com/apache/arrow-rs/pull/3068) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Faster f64 inequality [\#3065](https://github.com/apache/arrow-rs/pull/3065) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix row format decode loses timezone \(\#3063\) [\#3064](https://github.com/apache/arrow-rs/pull/3064) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix null\_count computation in binary [\#3062](https://github.com/apache/arrow-rs/pull/3062) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Faster f64 equality [\#3060](https://github.com/apache/arrow-rs/pull/3060) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update arrow-flight subcrates \(\#3044\) [\#3052](https://github.com/apache/arrow-rs/pull/3052) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Minor: Remove cloning ArrayData in with\_precision\_and\_scale [\#3050](https://github.com/apache/arrow-rs/pull/3050) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Split out arrow-json \(\#3044\) [\#3049](https://github.com/apache/arrow-rs/pull/3049) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) +- Move `intersect_row_selections` from datafusion to arrow-rs. [\#3047](https://github.com/apache/arrow-rs/pull/3047) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Split out arrow-csv \(\#2594\) [\#3044](https://github.com/apache/arrow-rs/pull/3044) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move reader\_parser to arrow-cast \(\#3022\) [\#3043](https://github.com/apache/arrow-rs/pull/3043) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cast decimal256 to signed integer [\#3040](https://github.com/apache/arrow-rs/pull/3040) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Enable casting from Date64 to Timestamp [\#3038](https://github.com/apache/arrow-rs/pull/3038) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) +- Fix decoding long and/or padded RLE data \(\#3029\) \(\#3035\) [\#3036](https://github.com/apache/arrow-rs/pull/3036) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix nullif when existing array has no nulls [\#3034](https://github.com/apache/arrow-rs/pull/3034) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Check overflow when casting floating point value to decimal256 [\#3033](https://github.com/apache/arrow-rs/pull/3033) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update parquet to depend on arrow subcrates [\#3028](https://github.com/apache/arrow-rs/pull/3028) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Make various i256 methods const [\#3026](https://github.com/apache/arrow-rs/pull/3026) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-ipc [\#3022](https://github.com/apache/arrow-rs/pull/3022) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Check overflow while casting floating point value to decimal128 [\#3021](https://github.com/apache/arrow-rs/pull/3021) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update arrow-flight [\#3019](https://github.com/apache/arrow-rs/pull/3019) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Move ArrowNativeTypeOp to arrow-array \(\#2594\) [\#3018](https://github.com/apache/arrow-rs/pull/3018) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support cast timestamp to time [\#3016](https://github.com/apache/arrow-rs/pull/3016) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([naosense](https://github.com/naosense)) +- Add filter example [\#3014](https://github.com/apache/arrow-rs/pull/3014) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Check overflow when casting integer to decimal [\#3009](https://github.com/apache/arrow-rs/pull/3009) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add macro downcast\_temporal\_array [\#3007](https://github.com/apache/arrow-rs/pull/3007) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Parquet Writer: Make column descriptor public on the writer [\#3002](https://github.com/apache/arrow-rs/pull/3002) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([pier-oliviert](https://github.com/pier-oliviert)) +- Update chrono-tz requirement from 0.7 to 0.8 [\#3001](https://github.com/apache/arrow-rs/pull/3001) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Round instead of Truncate while casting float to decimal [\#3000](https://github.com/apache/arrow-rs/pull/3000) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Support Predicate Pushdown for Parquet Lists \(\#2108\) [\#2999](https://github.com/apache/arrow-rs/pull/2999) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-cast \(\#2594\) [\#2998](https://github.com/apache/arrow-rs/pull/2998) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- `arrow::compute::kernels::temporal` should support nanoseconds [\#2996](https://github.com/apache/arrow-rs/pull/2996) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Add `RowSelection::from_selectors_and_combine` to merge RowSelectors [\#2994](https://github.com/apache/arrow-rs/pull/2994) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Simplify Single-Column Dictionary Sort [\#2993](https://github.com/apache/arrow-rs/pull/2993) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: Add entry to changelog for 26.0.0 RC2 fix [\#2992](https://github.com/apache/arrow-rs/pull/2992) ([alamb](https://github.com/alamb)) +- Fix ignored limit on `lexsort_to_indices` [\#2991](https://github.com/apache/arrow-rs/pull/2991) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add clone and equal functions for CastOptions [\#2985](https://github.com/apache/arrow-rs/pull/2985) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- minor: remove redundant prefix [\#2983](https://github.com/apache/arrow-rs/pull/2983) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([jackwener](https://github.com/jackwener)) +- Compare dictionary decimal arrays [\#2982](https://github.com/apache/arrow-rs/pull/2982) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Compare dictionary and non-dictionary decimal arrays [\#2980](https://github.com/apache/arrow-rs/pull/2980) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add decimal comparison kernel support [\#2978](https://github.com/apache/arrow-rs/pull/2978) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Move concat kernel to arrow-select \(\#2594\) [\#2976](https://github.com/apache/arrow-rs/pull/2976) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Specialize interleave for byte arrays \(\#2864\) [\#2975](https://github.com/apache/arrow-rs/pull/2975) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use unary function for numeric to decimal cast [\#2973](https://github.com/apache/arrow-rs/pull/2973) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Specialize filter kernel for binary arrays \(\#2969\) [\#2971](https://github.com/apache/arrow-rs/pull/2971) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Combine take\_utf8 and take\_binary \(\#2969\) [\#2970](https://github.com/apache/arrow-rs/pull/2970) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster Scalar Dictionary Comparison ~10% [\#2968](https://github.com/apache/arrow-rs/pull/2968) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move `byte_size` from datafusion::physical\_expr [\#2965](https://github.com/apache/arrow-rs/pull/2965) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Pass decompressed size to parquet Codec::decompress \(\#2956\) [\#2959](https://github.com/apache/arrow-rs/pull/2959) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([marioloko](https://github.com/marioloko)) +- Add Decimal Arithmetic [\#2881](https://github.com/apache/arrow-rs/pull/2881) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + ## [26.0.0](https://github.com/apache/arrow-rs/tree/26.0.0) (2022-10-28) [Full Changelog](https://github.com/apache/arrow-rs/compare/25.0.0...26.0.0) @@ -114,6 +231,7 @@ - Add downcast\_integer and downcast\_primitive [\#2872](https://github.com/apache/arrow-rs/pull/2872) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Filter DecimalArray as PrimitiveArray ~5x Faster \(\#2637\) [\#2870](https://github.com/apache/arrow-rs/pull/2870) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Treat DecimalArray as PrimitiveArray in row format [\#2866](https://github.com/apache/arrow-rs/pull/2866) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + ## [25.0.0](https://github.com/apache/arrow-rs/tree/25.0.0) (2022-10-14) [Full Changelog](https://github.com/apache/arrow-rs/compare/24.0.0...25.0.0) @@ -182,6 +300,7 @@ - Add i256 \(\#2637\) [\#2781](https://github.com/apache/arrow-rs/pull/2781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Add modulus ops into `ArrowNativeTypeOp` [\#2756](https://github.com/apache/arrow-rs/pull/2756) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) - feat: cast List / LargeList to Utf8 / LargeUtf8 [\#2588](https://github.com/apache/arrow-rs/pull/2588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gandronchik](https://github.com/gandronchik)) + ## [24.0.0](https://github.com/apache/arrow-rs/tree/24.0.0) (2022-09-30) [Full Changelog](https://github.com/apache/arrow-rs/compare/23.0.0...24.0.0) @@ -239,8 +358,6 @@ - Add overflow-checking variants of arithmetic dyn kernels [\#2740](https://github.com/apache/arrow-rs/pull/2740) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) - Trim parquet row selection 
[\#2705](https://github.com/apache/arrow-rs/pull/2705) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
-
-
 ## [23.0.0](https://github.com/apache/arrow-rs/tree/23.0.0) (2022-09-16)
[Full Changelog](https://github.com/apache/arrow-rs/compare/22.0.0...23.0.0)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 00f6876855f0..c775d33526ce 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,122 +19,99 @@
 # Changelog
-## [27.0.0](https://github.com/apache/arrow-rs/tree/27.0.0) (2022-11-11)
+## [28.0.0](https://github.com/apache/arrow-rs/tree/28.0.0) (2022-11-24)
-[Full Changelog](https://github.com/apache/arrow-rs/compare/26.0.0...27.0.0)
+[Full Changelog](https://github.com/apache/arrow-rs/compare/27.0.0...28.0.0)
**Breaking changes:**
-- Recurse into Dictionary value type in DataType::is\_nested [\#3083](https://github.com/apache/arrow-rs/pull/3083) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
-- early type checks in `RowConverter` [\#3080](https://github.com/apache/arrow-rs/pull/3080) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum))
-- Add Decimal128 and Decimal256 to downcast\_primitive [\#3056](https://github.com/apache/arrow-rs/pull/3056) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
-- Replace remaining \_generic temporal kernels with \_dyn kernels [\#3046](https://github.com/apache/arrow-rs/pull/3046) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
-- Replace year\_generic with year\_dyn [\#3041](https://github.com/apache/arrow-rs/pull/3041) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
-- Validate decimal256 with i256 directly [\#3025](https://github.com/apache/arrow-rs/pull/3025) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
-- Hadoop LZ4 Support for LZ4 Codec [\#3013](https://github.com/apache/arrow-rs/pull/3013) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([marioloko](https://github.com/marioloko))
-- Replace hour\_generic with hour\_dyn [\#3006](https://github.com/apache/arrow-rs/pull/3006) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
-- Accept any &dyn Array in nullif kernel [\#2940](https://github.com/apache/arrow-rs/pull/2940) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
+- Return slice from GenericByteArray::value\_data [\#3171](https://github.com/apache/arrow-rs/pull/3171) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
+- Support decimal negative scale [\#3152](https://github.com/apache/arrow-rs/pull/3152) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- refactor: convert `Field::metadata` to `HashMap` [\#3148](https://github.com/apache/arrow-rs/pull/3148) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum))
+- Don't Skip Serializing Empty Metadata \(\#3082\) [\#3126](https://github.com/apache/arrow-rs/pull/3126) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa))
+- Add Decimal128, Decimal256, Float16 to DataType::is\_numeric
[\#3121](https://github.com/apache/arrow-rs/pull/3121) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Upgrade to thrift 0.17 and fix issues [\#3104](https://github.com/apache/arrow-rs/pull/3104) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) +- Fix prettyprint for Interval second fractions [\#3093](https://github.com/apache/arrow-rs/pull/3093) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Remove Option from `Field::metadata` [\#3091](https://github.com/apache/arrow-rs/pull/3091) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) **Implemented enhancements:** -- Row Format: Option to detach/own a row [\#3078](https://github.com/apache/arrow-rs/issues/3078) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Row Format: API to check if datatypes are supported [\#3077](https://github.com/apache/arrow-rs/issues/3077) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Deprecate Buffer::count\_set\_bits [\#3067](https://github.com/apache/arrow-rs/issues/3067) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add Decimal128 and Decimal256 to downcast\_primitive [\#3055](https://github.com/apache/arrow-rs/issues/3055) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Improved UX of creating `TimestampNanosecondArray` with timezones [\#3042](https://github.com/apache/arrow-rs/issues/3042) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Cast decimal256 to signed integer [\#3039](https://github.com/apache/arrow-rs/issues/3039) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support casting Date64 to Timestamp [\#3037](https://github.com/apache/arrow-rs/issues/3037) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Check overflow when casting floating point value to decimal256 [\#3032](https://github.com/apache/arrow-rs/issues/3032) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Compare i256 in validate\_decimal256\_precision [\#3024](https://github.com/apache/arrow-rs/issues/3024) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Check overflow when casting floating point value to decimal128 [\#3020](https://github.com/apache/arrow-rs/issues/3020) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add macro downcast\_temporal\_array [\#3008](https://github.com/apache/arrow-rs/issues/3008) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Replace hour\_generic with hour\_dyn [\#3005](https://github.com/apache/arrow-rs/issues/3005) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Replace temporal \_generic kernels with dyn [\#3004](https://github.com/apache/arrow-rs/issues/3004) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add `RowSelection::intersection` [\#3003](https://github.com/apache/arrow-rs/issues/3003) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- I would like to round rather than truncate when casting f64 to decimal [\#2997](https://github.com/apache/arrow-rs/issues/2997) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- arrow::compute::kernels::temporal should support nanoseconds [\#2995](https://github.com/apache/arrow-rs/issues/2995) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Release Arrow `26.0.0` \(next release after `25.0.0`\) [\#2953](https://github.com/apache/arrow-rs/issues/2953) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Add timezone offset for debug format of Timestamp with Timezone [\#2917](https://github.com/apache/arrow-rs/issues/2917) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support merge RowSelectors when creating RowSelection [\#2858](https://github.com/apache/arrow-rs/issues/2858) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add iterator to RowSelection [\#3172](https://github.com/apache/arrow-rs/issues/3172) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Row Format Size Tracking [\#3160](https://github.com/apache/arrow-rs/issues/3160) +- Add ArrayBuilder::finish\_cloned\(\) [\#3154](https://github.com/apache/arrow-rs/issues/3154) +- Optimize memory usage of json reader [\#3150](https://github.com/apache/arrow-rs/issues/3150) +- Add `Field::size` and `DataType::size` [\#3147](https://github.com/apache/arrow-rs/issues/3147) +- Add like\_utf8\_scalar\_dyn kernel [\#3145](https://github.com/apache/arrow-rs/issues/3145) +- support comparison for decimal128 array with scalar in kernel [\#3140](https://github.com/apache/arrow-rs/issues/3140) +- Replace custom date/time add/sub months by chrono 0.4.23's new api [\#3131](https://github.com/apache/arrow-rs/issues/3131) +- Upgrade chrono to 0.4.23 [\#3120](https://github.com/apache/arrow-rs/issues/3120) +- Implements more temporal kernels using time\_fraction\_dyn [\#3108](https://github.com/apache/arrow-rs/issues/3108) +- Upgrade to thrift 0.17 [\#3105](https://github.com/apache/arrow-rs/issues/3105) +- Be able to parse time formatted strings [\#3100](https://github.com/apache/arrow-rs/issues/3100) +- Improve "Fail to merge schema" error messages [\#3095](https://github.com/apache/arrow-rs/issues/3095) +- Expose `SortingColumn` when reading and writing parquet metadata [\#3090](https://github.com/apache/arrow-rs/issues/3090) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Change Field::metadata to HashMap [\#3086](https://github.com/apache/arrow-rs/issues/3086) +- API to take back ownership of an ArrayRef [\#2901](https://github.com/apache/arrow-rs/issues/2901) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Specialized Interleave Kernel [\#2864](https://github.com/apache/arrow-rs/issues/2864) **Fixed bugs:** -- Inconsistent Nan Handling Between Scalar and Non-Scalar Comparison Kernels [\#3074](https://github.com/apache/arrow-rs/issues/3074) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Debug format for timestamp ignores timezone [\#3069](https://github.com/apache/arrow-rs/issues/3069) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Row format decode loses timezone [\#3063](https://github.com/apache/arrow-rs/issues/3063) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- binary operator produces incorrect result on arrays with resized null buffer [\#3061](https://github.com/apache/arrow-rs/issues/3061) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- RLEDecoder Panics on Null Padded Pages [\#3035](https://github.com/apache/arrow-rs/issues/3035) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Nullif with incorrect valid\_count 
[\#3031](https://github.com/apache/arrow-rs/issues/3031) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- RLEDecoder::get\_batch\_with\_dict may panic on bit-packed runs longer than 1024 [\#3029](https://github.com/apache/arrow-rs/issues/3029) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Converted type is None according to Parquet Tools then utilizing logical types [\#3017](https://github.com/apache/arrow-rs/issues/3017) -- CompressionCodec LZ4 incompatible with C++ implementation [\#2988](https://github.com/apache/arrow-rs/issues/2988) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- arithmatic overflow leads to segfault in `concat_batches` [\#3123](https://github.com/apache/arrow-rs/issues/3123) +- Clippy failing on master : error: use of deprecated associated function chrono::NaiveDate::from\_ymd: use from\_ymd\_opt\(\) instead [\#3097](https://github.com/apache/arrow-rs/issues/3097) +- Pretty print for interval types has wrong formatting [\#3092](https://github.com/apache/arrow-rs/issues/3092) +- Field is not serializable with binary formats [\#3082](https://github.com/apache/arrow-rs/issues/3082) +- Decimal Casts are Unchecked [\#2986](https://github.com/apache/arrow-rs/issues/2986) +- Reading parquet files with a corrupt ARROW:schema panics [\#2855](https://github.com/apache/arrow-rs/issues/2855) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -**Documentation updates:** +**Closed issues:** -- Mark parquet predicate pushdown as complete [\#2987](https://github.com/apache/arrow-rs/pull/2987) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- audit and create a document for bloom filter configurations [\#3138](https://github.com/apache/arrow-rs/issues/3138) +- Release Arrow `27.0.0` \(next release after `26.0.0`\) [\#3045](https://github.com/apache/arrow-rs/issues/3045) +- Perf about ParquetRecordBatchStream vs ParquetRecordBatchReader [\#2916](https://github.com/apache/arrow-rs/issues/2916) **Merged pull requests:** -- Improved UX of creating `TimestampNanosecondArray` with timezones [\#3088](https://github.com/apache/arrow-rs/pull/3088) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([src255](https://github.com/src255)) -- Remove unused range module [\#3085](https://github.com/apache/arrow-rs/pull/3085) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Make intersect\_row\_selections a member function [\#3084](https://github.com/apache/arrow-rs/pull/3084) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Update hashbrown requirement from 0.12 to 0.13 [\#3081](https://github.com/apache/arrow-rs/pull/3081) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- feat: add `OwnedRow` [\#3079](https://github.com/apache/arrow-rs/pull/3079) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) -- Use ArrowNativeTypeOp on non-scalar comparison kernels [\#3075](https://github.com/apache/arrow-rs/pull/3075) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add missing inline to ArrowNativeTypeOp [\#3073](https://github.com/apache/arrow-rs/pull/3073) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) -- fix debug information for Timestamp with Timezone [\#3072](https://github.com/apache/arrow-rs/pull/3072) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) -- Deprecate Buffer::count\_set\_bits \(\#3067\) [\#3071](https://github.com/apache/arrow-rs/pull/3071) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add compare to ArrowNativeTypeOp [\#3070](https://github.com/apache/arrow-rs/pull/3070) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Minor: Improve docstrings on WriterPropertiesBuilder [\#3068](https://github.com/apache/arrow-rs/pull/3068) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Faster f64 inequality [\#3065](https://github.com/apache/arrow-rs/pull/3065) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix row format decode loses timezone \(\#3063\) [\#3064](https://github.com/apache/arrow-rs/pull/3064) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix null\_count computation in binary [\#3062](https://github.com/apache/arrow-rs/pull/3062) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Faster f64 equality [\#3060](https://github.com/apache/arrow-rs/pull/3060) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update arrow-flight subcrates \(\#3044\) [\#3052](https://github.com/apache/arrow-rs/pull/3052) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Minor: Remove cloning ArrayData in with\_precision\_and\_scale [\#3050](https://github.com/apache/arrow-rs/pull/3050) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Split out arrow-json \(\#3044\) [\#3049](https://github.com/apache/arrow-rs/pull/3049) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Move `intersect_row_selections` from datafusion to arrow-rs. 
[\#3047](https://github.com/apache/arrow-rs/pull/3047) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- Split out arrow-csv \(\#2594\) [\#3044](https://github.com/apache/arrow-rs/pull/3044) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Move reader\_parser to arrow-cast \(\#3022\) [\#3043](https://github.com/apache/arrow-rs/pull/3043) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Cast decimal256 to signed integer [\#3040](https://github.com/apache/arrow-rs/pull/3040) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Enable casting from Date64 to Timestamp [\#3038](https://github.com/apache/arrow-rs/pull/3038) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) -- Fix decoding long and/or padded RLE data \(\#3029\) \(\#3035\) [\#3036](https://github.com/apache/arrow-rs/pull/3036) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Fix nullif when existing array has no nulls [\#3034](https://github.com/apache/arrow-rs/pull/3034) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Check overflow when casting floating point value to decimal256 [\#3033](https://github.com/apache/arrow-rs/pull/3033) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Update parquet to depend on arrow subcrates [\#3028](https://github.com/apache/arrow-rs/pull/3028) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Make various i256 methods const [\#3026](https://github.com/apache/arrow-rs/pull/3026) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Split out arrow-ipc [\#3022](https://github.com/apache/arrow-rs/pull/3022) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Check overflow while casting floating point value to decimal128 [\#3021](https://github.com/apache/arrow-rs/pull/3021) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Update arrow-flight [\#3019](https://github.com/apache/arrow-rs/pull/3019) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Move ArrowNativeTypeOp to arrow-array \(\#2594\) [\#3018](https://github.com/apache/arrow-rs/pull/3018) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support cast timestamp to time [\#3016](https://github.com/apache/arrow-rs/pull/3016) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([naosense](https://github.com/naosense)) -- Add filter example [\#3014](https://github.com/apache/arrow-rs/pull/3014) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Check overflow when casting integer to decimal [\#3009](https://github.com/apache/arrow-rs/pull/3009) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add macro downcast\_temporal\_array [\#3007](https://github.com/apache/arrow-rs/pull/3007) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- 
Parquet Writer: Make column descriptor public on the writer [\#3002](https://github.com/apache/arrow-rs/pull/3002) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([pier-oliviert](https://github.com/pier-oliviert)) -- Update chrono-tz requirement from 0.7 to 0.8 [\#3001](https://github.com/apache/arrow-rs/pull/3001) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Round instead of Truncate while casting float to decimal [\#3000](https://github.com/apache/arrow-rs/pull/3000) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) -- Support Predicate Pushdown for Parquet Lists \(\#2108\) [\#2999](https://github.com/apache/arrow-rs/pull/2999) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Split out arrow-cast \(\#2594\) [\#2998](https://github.com/apache/arrow-rs/pull/2998) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- `arrow::compute::kernels::temporal` should support nanoseconds [\#2996](https://github.com/apache/arrow-rs/pull/2996) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) -- Add `RowSelection::from_selectors_and_combine` to merge RowSelectors [\#2994](https://github.com/apache/arrow-rs/pull/2994) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- Simplify Single-Column Dictionary Sort [\#2993](https://github.com/apache/arrow-rs/pull/2993) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Minor: Add entry to changelog for 26.0.0 RC2 fix [\#2992](https://github.com/apache/arrow-rs/pull/2992) ([alamb](https://github.com/alamb)) -- Fix ignored limit on `lexsort_to_indices` [\#2991](https://github.com/apache/arrow-rs/pull/2991) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Add clone and equal functions for CastOptions [\#2985](https://github.com/apache/arrow-rs/pull/2985) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- minor: remove redundant prefix [\#2983](https://github.com/apache/arrow-rs/pull/2983) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([jackwener](https://github.com/jackwener)) -- Compare dictionary decimal arrays [\#2982](https://github.com/apache/arrow-rs/pull/2982) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Compare dictionary and non-dictionary decimal arrays [\#2980](https://github.com/apache/arrow-rs/pull/2980) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add decimal comparison kernel support [\#2978](https://github.com/apache/arrow-rs/pull/2978) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Move concat kernel to arrow-select \(\#2594\) [\#2976](https://github.com/apache/arrow-rs/pull/2976) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Specialize interleave for byte arrays \(\#2864\) [\#2975](https://github.com/apache/arrow-rs/pull/2975) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use unary 
function for numeric to decimal cast [\#2973](https://github.com/apache/arrow-rs/pull/2973) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Specialize filter kernel for binary arrays \(\#2969\) [\#2971](https://github.com/apache/arrow-rs/pull/2971) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Combine take\_utf8 and take\_binary \(\#2969\) [\#2970](https://github.com/apache/arrow-rs/pull/2970) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Faster Scalar Dictionary Comparison ~10% [\#2968](https://github.com/apache/arrow-rs/pull/2968) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Move `byte_size` from datafusion::physical\_expr [\#2965](https://github.com/apache/arrow-rs/pull/2965) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([avantgardnerio](https://github.com/avantgardnerio)) -- Pass decompressed size to parquet Codec::decompress \(\#2956\) [\#2959](https://github.com/apache/arrow-rs/pull/2959) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([marioloko](https://github.com/marioloko)) -- Add Decimal Arithmetic [\#2881](https://github.com/apache/arrow-rs/pull/2881) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update zstd requirement from 0.11.1 to 0.12.0 [\#3178](https://github.com/apache/arrow-rs/pull/3178) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bloom filter config tweaks \(\#3023\) [\#3175](https://github.com/apache/arrow-rs/pull/3175) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add RowParser [\#3174](https://github.com/apache/arrow-rs/pull/3174) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add `RowSelection::iter()`, `Into>` and example [\#3173](https://github.com/apache/arrow-rs/pull/3173) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Add read parquet examples [\#3170](https://github.com/apache/arrow-rs/pull/3170) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([xudong963](https://github.com/xudong963)) +- Faster BinaryArray to StringArray conversion \(~67%\) [\#3168](https://github.com/apache/arrow-rs/pull/3168) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove unnecessary downcasts in builders [\#3166](https://github.com/apache/arrow-rs/pull/3166) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- bloom filter part IV: adjust writer properties, bloom filter properties, and incorporate into column encoder [\#3165](https://github.com/apache/arrow-rs/pull/3165) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) +- Fix parquet decimal precision [\#3164](https://github.com/apache/arrow-rs/pull/3164) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([psvri](https://github.com/psvri)) +- Add Row size methods \(\#3160\) [\#3163](https://github.com/apache/arrow-rs/pull/3163) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) 
+- Prevent precision=0 for decimal type [\#3162](https://github.com/apache/arrow-rs/pull/3162) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Remove unnecessary Buffer::from\_slice\_ref reference [\#3161](https://github.com/apache/arrow-rs/pull/3161) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add finish\_cloned to ArrayBuilder [\#3158](https://github.com/apache/arrow-rs/pull/3158) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Check overflow in MutableArrayData extend offsets \(\#3123\) [\#3157](https://github.com/apache/arrow-rs/pull/3157) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Extend Decimal256 as Primitive [\#3156](https://github.com/apache/arrow-rs/pull/3156) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add collect.rs example [\#3153](https://github.com/apache/arrow-rs/pull/3153) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Implement Neg for i256 [\#3151](https://github.com/apache/arrow-rs/pull/3151) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: `{Field,DataType}::size` [\#3149](https://github.com/apache/arrow-rs/pull/3149) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Add like\_utf8\_scalar\_dyn kernel [\#3146](https://github.com/apache/arrow-rs/pull/3146) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- comparison op: decimal128 array with scalar [\#3141](https://github.com/apache/arrow-rs/pull/3141) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Fix Panic on Reading Corrupt Parquet Schema \(\#2855\) [\#3130](https://github.com/apache/arrow-rs/pull/3130) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([psvri](https://github.com/psvri)) +- Clippy parquet fixes [\#3124](https://github.com/apache/arrow-rs/pull/3124) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Add GenericByteBuilder \(\#2969\) [\#3122](https://github.com/apache/arrow-rs/pull/3122) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- parquet bloom filter part III: add sbbf writer, remove `bloom` default feature, add reader properties [\#3119](https://github.com/apache/arrow-rs/pull/3119) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) +- Add downcast\_array \(\#2901\) [\#3117](https://github.com/apache/arrow-rs/pull/3117) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add COW conversion for Buffer and PrimitiveArray and unary\_mut [\#3115](https://github.com/apache/arrow-rs/pull/3115) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Include field name in merge error message [\#3113](https://github.com/apache/arrow-rs/pull/3113) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) +- Add PrimitiveArray::unary\_opt 
[\#3110](https://github.com/apache/arrow-rs/pull/3110) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implements more temporal kernels using time\_fraction\_dyn [\#3107](https://github.com/apache/arrow-rs/pull/3107) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- cast: support unsigned numeric type to decimal128 [\#3106](https://github.com/apache/arrow-rs/pull/3106) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Expose `SortingColumn` in parquet files [\#3103](https://github.com/apache/arrow-rs/pull/3103) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([askoa](https://github.com/askoa)) +- parquet bloom filter part II: read sbbf bitset from row group reader, update API, and add cli demo [\#3102](https://github.com/apache/arrow-rs/pull/3102) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) +- Parse Time32/Time64 from formatted string [\#3101](https://github.com/apache/arrow-rs/pull/3101) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Cleanup temporal \_internal functions [\#3099](https://github.com/apache/arrow-rs/pull/3099) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Improve schema mismatch error message [\#3098](https://github.com/apache/arrow-rs/pull/3098) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Fix clippy by avoiding deprecated functions in chrono [\#3096](https://github.com/apache/arrow-rs/pull/3096) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Minor: Add diagrams and documentation to row format [\#3094](https://github.com/apache/arrow-rs/pull/3094) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Minor: Use ArrowNativeTypeOp instead of total\_cmp directly [\#3087](https://github.com/apache/arrow-rs/pull/3087) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Check overflow while casting between decimal types [\#3076](https://github.com/apache/arrow-rs/pull/3076) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- add bloom filter implementation based on split block \(sbbf\) spec [\#3057](https://github.com/apache/arrow-rs/pull/3057) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) +- Add FixedSizeBinaryArray::try\_from\_sparse\_iter\_with\_size [\#3054](https://github.com/apache/arrow-rs/pull/3054) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([maxburke](https://github.com/maxburke)) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index d0c556a00674..37f73c6d1c4b 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-array" -version = "27.0.0" +version = "28.0.0" description = "Array abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,9 +45,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } 
[dependencies] -arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "27.0.0", path = "../arrow-schema" } -arrow-data = { version = "27.0.0", path = "../arrow-data" } +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "28.0.0", path = "../arrow-schema" } +arrow-data = { version = "28.0.0", path = "../arrow-data" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index 9ed4d91d21d2..1959721c9edc 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "27.0.0" +version = "28.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 5f52a3283f97..a5911a0a49e8 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-cast" -version = "27.0.0" +version = "28.0.0" description = "Cast kernel and utilities for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "27.0.0", path = "../arrow-array" } -arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } -arrow-data = { version = "27.0.0", path = "../arrow-data" } -arrow-schema = { version = "27.0.0", path = "../arrow-schema" } -arrow-select = { version = "27.0.0", path = "../arrow-select" } +arrow-array = { version = "28.0.0", path = "../arrow-array" } +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } +arrow-data = { version = "28.0.0", path = "../arrow-data" } +arrow-schema = { version = "28.0.0", path = "../arrow-schema" } +arrow-select = { version = "28.0.0", path = "../arrow-select" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 5255244a1214..8139e0bd11a8 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-csv" -version = "27.0.0" +version = "28.0.0" description = "Support for parsing CSV format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "27.0.0", path = "../arrow-array" } -arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "27.0.0", path = "../arrow-cast" } -arrow-data = { version = "27.0.0", path = "../arrow-data" } -arrow-schema = { version = "27.0.0", path = "../arrow-schema" } +arrow-array = { version = "28.0.0", path = "../arrow-array" } +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "28.0.0", path = "../arrow-cast" } +arrow-data = { version = "28.0.0", path = "../arrow-data" } +arrow-schema = { version = "28.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", 
default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } lazy_static = { version = "1.4", default-features = false } diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index 179bf7a032ed..7b64ebefc8d6 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-data" -version = "27.0.0" +version = "28.0.0" description = "Array data abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,8 +45,8 @@ force_validate = [] [dependencies] -arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "27.0.0", path = "../arrow-schema" } +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "28.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index e243f45f3161..76aceb136af4 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "27.0.0" +version = "28.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,10 +27,10 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow-array = { version = "27.0.0", path = "../arrow-array" } -arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } -arrow-ipc = { version = "27.0.0", path = "../arrow-ipc" } -arrow-schema = { version = "27.0.0", path = "../arrow-schema" } +arrow-array = { version = "28.0.0", path = "../arrow-array" } +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } +arrow-ipc = { version = "28.0.0", path = "../arrow-ipc" } +arrow-schema = { version = "28.0.0", path = "../arrow-schema" } base64 = { version = "0.13", default-features = false } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 56560d6710c1..310df3f8af5f 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "27.0.0" +arrow-flight = "28.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. 
diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index 79e6825a18b7..56575a6e4916 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-integration-test" -version = "27.0.0" +version = "28.0.0" description = "Support for the Apache Arrow JSON test data format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,8 +38,8 @@ path = "src/lib.rs" bench = false [dependencies] -arrow = { version = "27.0.0", path = "../arrow", default-features = false } -arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } +arrow = { version = "28.0.0", path = "../arrow", default-features = false } +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 015a8b7a953b..35f857510258 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests (NOT PUBLISHED TO crates.io)" -version = "27.0.0" +version = "28.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index 838cde8fa252..80cf1ee00eff 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ipc" -version = "27.0.0" +version = "28.0.0" description = "Support for the Arrow IPC format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "27.0.0", path = "../arrow-array" } -arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "27.0.0", path = "../arrow-cast" } -arrow-data = { version = "27.0.0", path = "../arrow-data" } -arrow-schema = { version = "27.0.0", path = "../arrow-schema" } +arrow-array = { version = "28.0.0", path = "../arrow-array" } +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "28.0.0", path = "../arrow-cast" } +arrow-data = { version = "28.0.0", path = "../arrow-data" } +arrow-schema = { version = "28.0.0", path = "../arrow-schema" } flatbuffers = { version = "22.9.2", default-features = false, features = ["thiserror"] } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.12.0", default-features = false, optional = true } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 3454b4c1dbe5..b9eb7c5c4d67 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-json" -version = "27.0.0" +version = "28.0.0" description = "Support for parsing JSON format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "27.0.0", path = "../arrow-array" } -arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "27.0.0", path = 
"../arrow-cast" } -arrow-data = { version = "27.0.0", path = "../arrow-data" } -arrow-schema = { version = "27.0.0", path = "../arrow-schema" } +arrow-array = { version = "28.0.0", path = "../arrow-array" } +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "28.0.0", path = "../arrow-cast" } +arrow-data = { version = "28.0.0", path = "../arrow-data" } +arrow-schema = { version = "28.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 5f54f5781160..aaa595916987 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "27.0.0" +version = "28.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "27.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "28.0.0", features = ["pyarrow"] } pyo3 = { version = "0.17", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index d88632d1040d..7eafb95ef4c8 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-schema" -version = "27.0.0" +version = "28.0.0" description = "Defines the logical types for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml index 07c376e55ddd..36659f91731a 100644 --- a/arrow-select/Cargo.toml +++ b/arrow-select/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-select" -version = "27.0.0" +version = "28.0.0" description = "Selection kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } -arrow-data = { version = "27.0.0", path = "../arrow-data" } -arrow-schema = { version = "27.0.0", path = "../arrow-schema" } -arrow-array = { version = "27.0.0", path = "../arrow-array" } +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } +arrow-data = { version = "28.0.0", path = "../arrow-data" } +arrow-schema = { version = "28.0.0", path = "../arrow-schema" } +arrow-array = { version = "28.0.0", path = "../arrow-array" } num = { version = "0.4", default-features = false, features = ["std"] } [features] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 1e90e1e09e17..b818ad9000be 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "27.0.0" +version = "28.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,15 +44,15 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { 
version = "27.0.0", path = "../arrow-array" } -arrow-buffer = { version = "27.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "27.0.0", path = "../arrow-cast" } -arrow-csv = { version = "27.0.0", path = "../arrow-csv", optional = true } -arrow-data = { version = "27.0.0", path = "../arrow-data" } -arrow-ipc = { version = "27.0.0", path = "../arrow-ipc", optional = true } -arrow-json = { version = "27.0.0", path = "../arrow-json", optional = true } -arrow-schema = { version = "27.0.0", path = "../arrow-schema" } -arrow-select = { version = "27.0.0", path = "../arrow-select" } +arrow-array = { version = "28.0.0", path = "../arrow-array" } +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "28.0.0", path = "../arrow-cast" } +arrow-csv = { version = "28.0.0", path = "../arrow-csv", optional = true } +arrow-data = { version = "28.0.0", path = "../arrow-data" } +arrow-ipc = { version = "28.0.0", path = "../arrow-ipc", optional = true } +arrow-json = { version = "28.0.0", path = "../arrow-json", optional = true } +arrow-schema = { version = "28.0.0", path = "../arrow-schema" } +arrow-select = { version = "28.0.0", path = "../arrow-select" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } diff --git a/arrow/README.md b/arrow/README.md index c5cd588e87a4..71cdad76947f 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `27.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `28.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. ## Feature Flags diff --git a/dev/release/README.md b/dev/release/README.md index 61d8af55d12e..a3d1a8c314a6 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. -sed -i '' -e 's/14.0.0/27.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/28.0.0/g' `find . 
-name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 4f28a073f7bd..057f72c4161b 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="26.0.0" -FUTURE_RELEASE="27.0.0" +SINCE_TAG="27.0.0" +FUTURE_RELEASE="28.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/object_store/CONTRIBUTING.md b/object_store/CONTRIBUTING.md index 7c2832cf7ef1..e780ec5c9b09 100644 --- a/object_store/CONTRIBUTING.md +++ b/object_store/CONTRIBUTING.md @@ -49,7 +49,7 @@ export TEST_INTEGRATION=1 export AWS_DEFAULT_REGION=us-east-1 export AWS_ACCESS_KEY_ID=test export AWS_SECRET_ACCESS_KEY=test -export AWS_ENDPOINT=http://127.0.0.1:4566 +export AWS_ENDPOINT=http://127.0.0.1:4566 export OBJECT_STORE_BUCKET=test-bucket ``` @@ -79,7 +79,7 @@ $ podman run -p 10000:10000 -p 10001:10001 -p 10002:10002 mcr.microsoft.com/azur Create a bucket ``` -$ podman run --net=host mcr.microsoft.com/azure-cli az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' +$ podman run --net=host mcr.microsoft.com/azure-cli az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' ``` Run tests diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 88f6eff23053..b2d878dd5930 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "27.0.0" +version = "28.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -30,14 +30,14 @@ edition = "2021" rust-version = "1.62" [dependencies] -arrow-array = { version = "27.0.0", path = "../arrow-array", default-features = false, optional = true } -arrow-buffer = { version = "27.0.0", path = "../arrow-buffer", default-features = false, optional = true } -arrow-cast = { version = "27.0.0", path = "../arrow-cast", default-features = false, optional = true } -arrow-csv = { version = "27.0.0", path = "../arrow-csv", default-features = false, optional = true } -arrow-data = { version = "27.0.0", path = "../arrow-data", default-features = false, optional = true } -arrow-schema = { version = "27.0.0", path = "../arrow-schema", default-features = false, optional = true } -arrow-select = { version = "27.0.0", path = "../arrow-select", default-features = false, optional = true } -arrow-ipc = { version = "27.0.0", path = "../arrow-ipc", default-features = false, optional = true } +arrow-array = { version = "28.0.0", path = "../arrow-array", default-features = false, optional = true } +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer", default-features = false, optional = true } +arrow-cast = { version = "28.0.0", path = "../arrow-cast", default-features = false, optional = true } +arrow-csv = { 
version = "28.0.0", path = "../arrow-csv", default-features = false, optional = true } +arrow-data = { version = "28.0.0", path = "../arrow-data", default-features = false, optional = true } +arrow-schema = { version = "28.0.0", path = "../arrow-schema", default-features = false, optional = true } +arrow-select = { version = "28.0.0", path = "../arrow-select", default-features = false, optional = true } +arrow-ipc = { version = "28.0.0", path = "../arrow-ipc", default-features = false, optional = true } ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } bytes = { version = "1.1", default-features = false, features = ["std"] } @@ -70,7 +70,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.12", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "27.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "28.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index c300fb3e5b3d..019122586e24 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "27.0.0" +version = "28.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", features = ["extra-traits"] } -parquet = { path = "../parquet", version = "27.0.0", default-features = false } +parquet = { path = "../parquet", version = "28.0.0", default-features = false } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index c8fefc72c609..0e34e498b46c 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "27.0.0" -parquet_derive = "27.0.0" +parquet = "28.0.0" +parquet_derive = "28.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index 047e0196c704..6119ceb6cd58 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "27.0.0" +version = "28.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "27.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "27.0.0", default-features = false } +parquet = { path = "../parquet", version = "28.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "28.0.0", default-features = false } chrono = { version="0.4.23", default-features = false, features = [ "clock" ] } From eb91dac6e99b0bcbcbd8057b03233059089bbf27 Mon Sep 17 00:00:00 2001 From: Andrew 
Lamb Date: Fri, 25 Nov 2022 14:03:38 -0500 Subject: [PATCH 0327/1411] Final 28.0.0 CHANGELOG updates (#3194) * Update Changelog with labels * Update with latest * update --- CHANGELOG.md | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c775d33526ce..accec4491852 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,12 +19,13 @@ # Changelog -## [28.0.0](https://github.com/apache/arrow-rs/tree/28.0.0) (2022-11-24) +## [28.0.0](https://github.com/apache/arrow-rs/tree/28.0.0) (2022-11-25) [Full Changelog](https://github.com/apache/arrow-rs/compare/27.0.0...28.0.0) **Breaking changes:** +- StructArray::columns return slice [\#3186](https://github.com/apache/arrow-rs/pull/3186) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Return slice from GenericByteArray::value\_data [\#3171](https://github.com/apache/arrow-rs/pull/3171) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Support decimal negative scale [\#3152](https://github.com/apache/arrow-rs/pull/3152) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) - refactor: convert `Field::metadata` to `HashMap` [\#3148](https://github.com/apache/arrow-rs/pull/3148) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) @@ -37,41 +38,47 @@ **Implemented enhancements:** - Add iterator to RowSelection [\#3172](https://github.com/apache/arrow-rs/issues/3172) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Row Format Size Tracking [\#3160](https://github.com/apache/arrow-rs/issues/3160) -- Add ArrayBuilder::finish\_cloned\(\) [\#3154](https://github.com/apache/arrow-rs/issues/3154) +- create an integration test set for parquet crate against pyspark for working with bloom filters [\#3167](https://github.com/apache/arrow-rs/issues/3167) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Row Format Size Tracking [\#3160](https://github.com/apache/arrow-rs/issues/3160) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add ArrayBuilder::finish\_cloned\(\) [\#3154](https://github.com/apache/arrow-rs/issues/3154) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Optimize memory usage of json reader [\#3150](https://github.com/apache/arrow-rs/issues/3150) -- Add `Field::size` and `DataType::size` [\#3147](https://github.com/apache/arrow-rs/issues/3147) -- Add like\_utf8\_scalar\_dyn kernel [\#3145](https://github.com/apache/arrow-rs/issues/3145) -- support comparison for decimal128 array with scalar in kernel [\#3140](https://github.com/apache/arrow-rs/issues/3140) -- Replace custom date/time add/sub months by chrono 0.4.23's new api [\#3131](https://github.com/apache/arrow-rs/issues/3131) +- Add `Field::size` and `DataType::size` [\#3147](https://github.com/apache/arrow-rs/issues/3147) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add like\_utf8\_scalar\_dyn kernel [\#3145](https://github.com/apache/arrow-rs/issues/3145) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- support comparison for decimal128 array with scalar in kernel [\#3140](https://github.com/apache/arrow-rs/issues/3140) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- audit and create a document for bloom filter configurations [\#3138](https://github.com/apache/arrow-rs/issues/3138) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Should be the rounding vs truncation when cast decimal to smaller scale [\#3137](https://github.com/apache/arrow-rs/issues/3137) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Upgrade chrono to 0.4.23 [\#3120](https://github.com/apache/arrow-rs/issues/3120) -- Implements more temporal kernels using time\_fraction\_dyn [\#3108](https://github.com/apache/arrow-rs/issues/3108) -- Upgrade to thrift 0.17 [\#3105](https://github.com/apache/arrow-rs/issues/3105) -- Be able to parse time formatted strings [\#3100](https://github.com/apache/arrow-rs/issues/3100) -- Improve "Fail to merge schema" error messages [\#3095](https://github.com/apache/arrow-rs/issues/3095) +- Implements more temporal kernels using time\_fraction\_dyn [\#3108](https://github.com/apache/arrow-rs/issues/3108) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Upgrade to thrift 0.17 [\#3105](https://github.com/apache/arrow-rs/issues/3105) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Be able to parse time formatted strings [\#3100](https://github.com/apache/arrow-rs/issues/3100) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve "Fail to merge schema" error messages [\#3095](https://github.com/apache/arrow-rs/issues/3095) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Expose `SortingColumn` when reading and writing parquet metadata [\#3090](https://github.com/apache/arrow-rs/issues/3090) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Change Field::metadata to HashMap [\#3086](https://github.com/apache/arrow-rs/issues/3086) +- Change Field::metadata to HashMap [\#3086](https://github.com/apache/arrow-rs/issues/3086) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support bloom filter reading and writing for parquet [\#3023](https://github.com/apache/arrow-rs/issues/3023) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - API to take back ownership of an ArrayRef [\#2901](https://github.com/apache/arrow-rs/issues/2901) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Specialized Interleave Kernel [\#2864](https://github.com/apache/arrow-rs/issues/2864) +- Specialized Interleave Kernel [\#2864](https://github.com/apache/arrow-rs/issues/2864) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- arithmatic overflow leads to segfault in `concat_batches` [\#3123](https://github.com/apache/arrow-rs/issues/3123) -- Clippy failing on master : error: use of deprecated associated function chrono::NaiveDate::from\_ymd: use from\_ymd\_opt\(\) instead [\#3097](https://github.com/apache/arrow-rs/issues/3097) -- Pretty print for interval types has wrong formatting [\#3092](https://github.com/apache/arrow-rs/issues/3092) -- Field is not serializable with binary formats [\#3082](https://github.com/apache/arrow-rs/issues/3082) -- Decimal Casts are Unchecked [\#2986](https://github.com/apache/arrow-rs/issues/2986) -- Reading parquet files with a corrupt ARROW:schema panics [\#2855](https://github.com/apache/arrow-rs/issues/2855) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- arithmatic overflow leads to 
segfault in `concat_batches` [\#3123](https://github.com/apache/arrow-rs/issues/3123) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Clippy failing on master : error: use of deprecated associated function chrono::NaiveDate::from\_ymd: use from\_ymd\_opt\(\) instead [\#3097](https://github.com/apache/arrow-rs/issues/3097) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Pretty print for interval types has wrong formatting [\#3092](https://github.com/apache/arrow-rs/issues/3092) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Field is not serializable with binary formats [\#3082](https://github.com/apache/arrow-rs/issues/3082) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Decimal Casts are Unchecked [\#2986](https://github.com/apache/arrow-rs/issues/2986) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Closed issues:** -- audit and create a document for bloom filter configurations [\#3138](https://github.com/apache/arrow-rs/issues/3138) -- Release Arrow `27.0.0` \(next release after `26.0.0`\) [\#3045](https://github.com/apache/arrow-rs/issues/3045) +- Release Arrow `27.0.0` \(next release after `26.0.0`\) [\#3045](https://github.com/apache/arrow-rs/issues/3045) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] - Perf about ParquetRecordBatchStream vs ParquetRecordBatchReader [\#2916](https://github.com/apache/arrow-rs/issues/2916) **Merged pull requests:** +- Improve regex related kernels by upto 85% [\#3192](https://github.com/apache/arrow-rs/pull/3192) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Derive clone for arrays [\#3184](https://github.com/apache/arrow-rs/pull/3184) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Row decode cleanups [\#3180](https://github.com/apache/arrow-rs/pull/3180) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Update zstd requirement from 0.11.1 to 0.12.0 [\#3178](https://github.com/apache/arrow-rs/pull/3178) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Move decimal constants from `arrow-data` to `arrow-schema` crate [\#3177](https://github.com/apache/arrow-rs/pull/3177) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- bloom filter part V: add an integration with pytest against pyspark [\#3176](https://github.com/apache/arrow-rs/pull/3176) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) - Bloom filter config tweaks \(\#3023\) [\#3175](https://github.com/apache/arrow-rs/pull/3175) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) - Add RowParser [\#3174](https://github.com/apache/arrow-rs/pull/3174) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Add `RowSelection::iter()`, `Into>` and example [\#3173](https://github.com/apache/arrow-rs/pull/3173) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) @@ -86,11 +93,13 @@ - Add 
finish\_cloned to ArrayBuilder [\#3158](https://github.com/apache/arrow-rs/pull/3158) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) - Check overflow in MutableArrayData extend offsets \(\#3123\) [\#3157](https://github.com/apache/arrow-rs/pull/3157) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Extend Decimal256 as Primitive [\#3156](https://github.com/apache/arrow-rs/pull/3156) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Doc improvements [\#3155](https://github.com/apache/arrow-rs/pull/3155) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) - Add collect.rs example [\#3153](https://github.com/apache/arrow-rs/pull/3153) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) - Implement Neg for i256 [\#3151](https://github.com/apache/arrow-rs/pull/3151) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - feat: `{Field,DataType}::size` [\#3149](https://github.com/apache/arrow-rs/pull/3149) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) - Add like\_utf8\_scalar\_dyn kernel [\#3146](https://github.com/apache/arrow-rs/pull/3146) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) - comparison op: decimal128 array with scalar [\#3141](https://github.com/apache/arrow-rs/pull/3141) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Cast: should get the round result for decimal to a decimal with smaller scale [\#3139](https://github.com/apache/arrow-rs/pull/3139) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) - Fix Panic on Reading Corrupt Parquet Schema \(\#2855\) [\#3130](https://github.com/apache/arrow-rs/pull/3130) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([psvri](https://github.com/psvri)) - Clippy parquet fixes [\#3124](https://github.com/apache/arrow-rs/pull/3124) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) - Add GenericByteBuilder \(\#2969\) [\#3122](https://github.com/apache/arrow-rs/pull/3122) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) From ddaab1dbab2bf776348dd974a21965e6753bf4c2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 25 Nov 2022 19:54:01 +0000 Subject: [PATCH 0328/1411] Use self capture in DataType (#3190) --- arrow-schema/src/datatype.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 6e0f626ef94d..f1d13aefd279 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -264,16 +264,16 @@ impl fmt::Display for DataType { impl DataType { /// Returns true if the type is primitive: (numeric, temporal). #[inline] - pub fn is_primitive(t: &DataType) -> bool { - Self::is_numeric(t) || Self::is_temporal(t) + pub fn is_primitive(&self) -> bool { + self.is_numeric() || self.is_temporal() } /// Returns true if this type is numeric: (UInt*, Int*, Float*, Decimal*). 
#[inline] - pub fn is_numeric(t: &DataType) -> bool { + pub fn is_numeric(&self) -> bool { use DataType::*; matches!( - t, + self, UInt8 | UInt16 | UInt32 @@ -292,10 +292,10 @@ impl DataType { /// Returns true if this type is temporal: (Date*, Time*, Duration, or Interval). #[inline] - pub fn is_temporal(t: &DataType) -> bool { + pub fn is_temporal(&self) -> bool { use DataType::*; matches!( - t, + self, Date32 | Date64 | Timestamp(_, _) @@ -308,19 +308,19 @@ impl DataType { /// Returns true if this type is valid as a dictionary key #[inline] - pub fn is_dictionary_key_type(t: &DataType) -> bool { + pub fn is_dictionary_key_type(&self) -> bool { use DataType::*; matches!( - t, + self, UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 ) } /// Returns true if this type is nested (List, FixedSizeList, LargeList, Struct, Union, /// or Map), or a dictionary of a nested type - pub fn is_nested(t: &DataType) -> bool { + pub fn is_nested(&self) -> bool { use DataType::*; - match t { + match self { Dictionary(_, v) => DataType::is_nested(v.as_ref()), List(_) | FixedSizeList(_, _) From fd08c31a2cd37342d261f67e999b2be2d5a4ba6b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 26 Nov 2022 03:31:19 -0500 Subject: [PATCH 0329/1411] Include integration tests in arrow crate (#3196) --- arrow/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index b818ad9000be..8172615f2e55 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -27,6 +27,7 @@ keywords = ["arrow"] include = [ "benches/*.rs", "src/**/*.rs", + "tests/*.rs", "Cargo.toml", ] edition = "2021" From befea02c2f277a95d1f80f00aa0e9591942bd723 Mon Sep 17 00:00:00 2001 From: Jie Han <11144133+doki23@users.noreply.github.com> Date: Sat, 26 Nov 2022 21:14:15 +0800 Subject: [PATCH 0330/1411] To pyarrow with schema (#3188) * to pyarrow with schema * only use schema * add test * Run python tests in CI Co-authored-by: Raphael Taylor-Davies --- .github/workflows/integration.yml | 10 ++++---- arrow/Cargo.toml | 4 +++ arrow/src/pyarrow.rs | 9 +++---- arrow/tests/pyarrow.rs | 42 +++++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 10 deletions(-) create mode 100644 arrow/tests/pyarrow.rs diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 3ece06b29238..656e56a652ca 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -149,13 +149,13 @@ jobs: virtualenv venv source venv/bin/activate pip install maturin toml pytest pytz pyarrow>=5.0 + - name: Run Rust tests + run: | + source venv/bin/activate + cargo test -p arrow --test pyarrow --features pyarrow - name: Run tests - env: - CARGO_HOME: "/home/runner/.cargo" - CARGO_TARGET_DIR: "/home/runner/target" run: | source venv/bin/activate - pushd arrow-pyarrow-integration-testing + cd arrow-pyarrow-integration-testing maturin develop pytest -v . 
- popd diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 8172615f2e55..a97ec1ac123f 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -269,3 +269,7 @@ required-features = ["test_utils", "ipc"] [[test]] name = "csv" required-features = ["csv", "chrono-tz"] + +[[test]] +name = "pyarrow" +required-features = ["pyarrow"] diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 7c365a4344a5..5ddc3105a4ad 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -184,20 +184,19 @@ impl PyArrowConvert for RecordBatch { fn to_pyarrow(&self, py: Python) -> PyResult { let mut py_arrays = vec![]; - let mut py_names = vec![]; let schema = self.schema(); - let fields = schema.fields().iter(); let columns = self.columns().iter(); - for (array, field) in columns.zip(fields) { + for array in columns { py_arrays.push(array.data().to_pyarrow(py)?); - py_names.push(field.name()); } + let py_schema = schema.to_pyarrow(py)?; + let module = py.import("pyarrow")?; let class = module.getattr("RecordBatch")?; - let record = class.call_method1("from_arrays", (py_arrays, py_names))?; + let record = class.call_method1("from_arrays", (py_arrays, py_schema))?; Ok(PyObject::from(record)) } diff --git a/arrow/tests/pyarrow.rs b/arrow/tests/pyarrow.rs new file mode 100644 index 000000000000..4b1226c738f5 --- /dev/null +++ b/arrow/tests/pyarrow.rs @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::array::{ArrayRef, Int32Array, StringArray}; +use arrow::pyarrow::PyArrowConvert; +use arrow::record_batch::RecordBatch; +use pyo3::Python; +use std::sync::Arc; + +#[test] +fn test_to_pyarrow() { + pyo3::prepare_freethreaded_python(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2])); + let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b"])); + let input = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).unwrap(); + println!("input: {:?}", input); + + let res = Python::with_gil(|py| { + let py_input = input.to_pyarrow(py)?; + let records = RecordBatch::from_pyarrow(py_input.as_ref(py))?; + let py_records = records.to_pyarrow(py)?; + RecordBatch::from_pyarrow(py_records.as_ref(py)) + }) + .unwrap(); + + assert_eq!(input, res); +} From 8c6e57960f92c0fad9982caba32f226e318313d9 Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Sat, 26 Nov 2022 18:48:24 +0530 Subject: [PATCH 0331/1411] Support Duration in array_value_to_string (#3183) * Improve array_value_to_string * Fix fmt issues * Reverting to safe calls --- arrow-cast/src/display.rs | 252 +++++++++++++++++++------------------- 1 file changed, 128 insertions(+), 124 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 434f750afc48..287065eb6950 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -32,13 +32,7 @@ macro_rules! make_string { ($array_type:ty, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - let s = if array.is_null($row) { - "".to_string() - } else { - array.value($row).to_string() - }; - - Ok(s) + Ok(array.value($row).to_string()) }}; } @@ -49,20 +43,14 @@ macro_rules! make_string_interval_year_month { .downcast_ref::() .unwrap(); - let s = if array.is_null($row) { - "NULL".to_string() - } else { - let interval = array.value($row) as f64; - let years = (interval / 12_f64).floor(); - let month = interval - (years * 12_f64); + let interval = array.value($row) as f64; + let years = (interval / 12_f64).floor(); + let month = interval - (years * 12_f64); - format!( - "{} years {} mons 0 days 0 hours 0 mins 0.00 secs", - years, month, - ) - }; - - Ok(s) + Ok(format!( + "{} years {} mons 0 days 0 hours 0 mins 0.00 secs", + years, month, + )) }}; } @@ -73,32 +61,26 @@ macro_rules! make_string_interval_day_time { .downcast_ref::() .unwrap(); - let s = if array.is_null($row) { - "NULL".to_string() - } else { - let value: u64 = array.value($row) as u64; - - let days_parts: i32 = ((value & 0xFFFFFFFF00000000) >> 32) as i32; - let milliseconds_part: i32 = (value & 0xFFFFFFFF) as i32; + let value: u64 = array.value($row) as u64; - let secs = milliseconds_part / 1000; - let mins = secs / 60; - let hours = mins / 60; + let days_parts: i32 = ((value & 0xFFFFFFFF00000000) >> 32) as i32; + let milliseconds_part: i32 = (value & 0xFFFFFFFF) as i32; - let secs = secs - (mins * 60); - let mins = mins - (hours * 60); + let secs = milliseconds_part / 1000; + let mins = secs / 60; + let hours = mins / 60; - format!( - "0 years 0 mons {} days {} hours {} mins {}.{:03} secs", - days_parts, - hours, - mins, - secs, - (milliseconds_part % 1000), - ) - }; + let secs = secs - (mins * 60); + let mins = mins - (hours * 60); - Ok(s) + Ok(format!( + "0 years 0 mons {} days {} hours {} mins {}.{:03} secs", + days_parts, + hours, + mins, + secs, + (milliseconds_part % 1000), + )) }}; } @@ -109,35 +91,29 @@ macro_rules! 
make_string_interval_month_day_nano { .downcast_ref::() .unwrap(); - let s = if array.is_null($row) { - "NULL".to_string() - } else { - let value: u128 = array.value($row) as u128; - - let months_part: i32 = - ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; - let days_part: i32 = ((value & 0xFFFFFFFF0000000000000000) >> 64) as i32; - let nanoseconds_part: i64 = (value & 0xFFFFFFFFFFFFFFFF) as i64; - - let secs = nanoseconds_part / 1000000000; - let mins = secs / 60; - let hours = mins / 60; - - let secs = secs - (mins * 60); - let mins = mins - (hours * 60); - - format!( - "0 years {} mons {} days {} hours {} mins {}.{:09} secs", - months_part, - days_part, - hours, - mins, - secs, - (nanoseconds_part % 1000000000), - ) - }; + let value: u128 = array.value($row) as u128; - Ok(s) + let months_part: i32 = + ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; + let days_part: i32 = ((value & 0xFFFFFFFF0000000000000000) >> 64) as i32; + let nanoseconds_part: i64 = (value & 0xFFFFFFFFFFFFFFFF) as i64; + + let secs = nanoseconds_part / 1000000000; + let mins = secs / 60; + let hours = mins / 60; + + let secs = secs - (mins * 60); + let mins = mins - (hours * 60); + + Ok(format!( + "0 years {} mons {} days {} hours {} mins {}.{:09} secs", + months_part, + days_part, + hours, + mins, + secs, + (nanoseconds_part % 1000000000), + )) }}; } @@ -145,16 +121,10 @@ macro_rules! make_string_date { ($array_type:ty, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - let s = if array.is_null($row) { - "".to_string() - } else { - array - .value_as_date($row) - .map(|d| d.to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()) - }; - - Ok(s) + Ok(array + .value_as_date($row) + .map(|d| d.to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) }}; } @@ -162,16 +132,10 @@ macro_rules! make_string_time { ($array_type:ty, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - let s = if array.is_null($row) { - "".to_string() - } else { - array - .value_as_time($row) - .map(|d| d.to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()) - }; - - Ok(s) + Ok(array + .value_as_time($row) + .map(|d| d.to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) }}; } @@ -179,16 +143,10 @@ macro_rules! make_string_datetime { ($array_type:ty, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - let s = if array.is_null($row) { - "".to_string() - } else { - array - .value_as_datetime($row) - .map(|d| format!("{:?}", d)) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()) - }; - - Ok(s) + Ok(array + .value_as_datetime($row) + .map(|d| format!("{:?}", d)) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) }}; } @@ -196,19 +154,15 @@ macro_rules! 
make_string_datetime_with_tz { ($array_type:ty, $tz_string: ident, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - let s = if array.is_null($row) { - "".to_string() - } else { - match $tz_string.parse::() { - Ok(tz) => array - .value_as_datetime_with_tz($row, tz) - .map(|d| format!("{}", d.to_rfc3339())) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), - Err(_) => array - .value_as_datetime($row) - .map(|d| format!("{:?} (Unknown Time Zone '{}')", d, $tz_string)) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), - } + let s = match $tz_string.parse::() { + Ok(tz) => array + .value_as_datetime_with_tz($row, tz) + .map(|d| format!("{}", d.to_rfc3339())) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), + Err(_) => array + .value_as_datetime($row) + .map(|d| format!("{:?} (Unknown Time Zone '{}')", d, $tz_string)) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), }; Ok(s) @@ -220,19 +174,13 @@ macro_rules! make_string_hex { ($array_type:ty, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - let s = if array.is_null($row) { - "".to_string() - } else { - let mut tmp = "".to_string(); + let mut tmp = "".to_string(); - for character in array.value($row) { - let _ = write!(tmp, "{:02x}", character); - } - - tmp - }; + for character in array.value($row) { + let _ = write!(tmp, "{:02x}", character); + } - Ok(s) + Ok(tmp) }}; } @@ -284,6 +232,17 @@ macro_rules! make_string_from_fixed_size_list { }}; } +macro_rules! make_string_from_duration { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + Ok(array + .value_as_duration($row) + .map(|d| d.to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) + }}; +} + #[inline(always)] pub fn make_string_from_decimal( column: &Arc, @@ -474,6 +433,20 @@ pub fn array_value_to_string( DataType::Union(field_vec, type_ids, mode) => { union_to_string(column, row, field_vec, type_ids, mode) } + DataType::Duration(unit) => match *unit { + TimeUnit::Second => { + make_string_from_duration!(array::DurationSecondArray, column, row) + } + TimeUnit::Millisecond => { + make_string_from_duration!(array::DurationMillisecondArray, column, row) + } + TimeUnit::Microsecond => { + make_string_from_duration!(array::DurationMicrosecondArray, column, row) + } + TimeUnit::Nanosecond => { + make_string_from_duration!(array::DurationNanosecondArray, column, row) + } + }, _ => Err(ArrowError::InvalidArgumentError(format!( "Pretty printing not implemented for {:?} type", column.data_type() @@ -549,3 +522,34 @@ pub fn lexical_to_string(n: N) -> String { String::from_utf8_unchecked(buf) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_array_value_to_string_duration() { + let ns_array = + Arc::new(DurationNanosecondArray::from(vec![Some(1), None])) as ArrayRef; + assert_eq!( + array_value_to_string(&ns_array, 0).unwrap(), + "PT0.000000001S" + ); + assert_eq!(array_value_to_string(&ns_array, 1).unwrap(), ""); + + let us_array = + Arc::new(DurationMicrosecondArray::from(vec![Some(1), None])) as ArrayRef; + assert_eq!(array_value_to_string(&us_array, 0).unwrap(), "PT0.000001S"); + assert_eq!(array_value_to_string(&us_array, 1).unwrap(), ""); + + let ms_array = + Arc::new(DurationMillisecondArray::from(vec![Some(1), None])) as ArrayRef; + assert_eq!(array_value_to_string(&ms_array, 0).unwrap(), "PT0.001S"); + 
assert_eq!(array_value_to_string(&ms_array, 1).unwrap(), ""); + + let s_array = + Arc::new(DurationSecondArray::from(vec![Some(1), None])) as ArrayRef; + assert_eq!(array_value_to_string(&s_array, 0).unwrap(), "PT1S"); + assert_eq!(array_value_to_string(&s_array, 1).unwrap(), ""); + } +} From 0b12828ddc75112c92541c612e7a75e5dbe44081 Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Sat, 26 Nov 2022 18:49:55 +0530 Subject: [PATCH 0332/1411] Adding scalar nlike_dyn, ilike_dyn, nilike_dyn kernels (#3195) --- arrow/src/compute/kernels/comparison.rs | 344 ++++++++++++++++++------ 1 file changed, 269 insertions(+), 75 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 7423b13bc07c..bca74bee0a1d 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -410,6 +410,39 @@ fn nlike_scalar<'a, L: ArrayAccessor>( like_scalar_op(left, right, |x| !x) } +/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nlike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + nlike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + nlike_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + nlike_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nlike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + /// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. /// @@ -556,6 +589,39 @@ fn ilike_scalar<'a, L: ArrayAccessor>( Ok(BooleanArray::from(data)) } +/// Perform SQL `left ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn ilike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + ilike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + ilike_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + ilike_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "ilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + /// Perform SQL `left ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. /// @@ -702,6 +768,39 @@ fn nilike_scalar<'a, L: ArrayAccessor>( Ok(BooleanArray::from(data)) } +/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. 
+pub fn nilike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + nilike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + nilike_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + nilike_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + /// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. /// @@ -4410,6 +4509,10 @@ mod tests { } } }; + ($test_name:ident, $test_name_dyn:ident, $left:expr, $right:expr, $op:expr, $op_dyn:expr, $expected:expr) => { + test_utf8_scalar!($test_name, $left, $right, $op, $expected); + test_utf8_scalar!($test_name_dyn, $left, $right, $op_dyn, $expected); + }; } macro_rules! test_flag_utf8 { @@ -4498,160 +4601,100 @@ mod tests { test_utf8_scalar!( test_utf8_array_like_scalar_escape_testing, - vec!["varchar(255)", "int(255)", "varchar", "int"], - "%(%)%", - like_utf8_scalar, - vec![true, true, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_escape_testing, vec!["varchar(255)", "int(255)", "varchar", "int"], "%(%)%", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_escape_regex, - vec![".*", "a", "*"], - ".*", - like_utf8_scalar, - vec![true, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_escape_regex, vec![".*", "a", "*"], ".*", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_escape_regex_dot, - vec![".", "a", "*"], - ".", - like_utf8_scalar, - vec![true, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_escape_regex_dot, vec![".", "a", "*"], ".", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar, - vec!["arrow", "parquet", "datafusion", "flight"], - "%ar%", - like_utf8_scalar, - vec![true, true, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn, vec!["arrow", "parquet", "datafusion", "flight"], "%ar%", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_start, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow%", - like_utf8_scalar, - vec![true, false, true, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_start, vec!["arrow", "parrow", "arrows", "arr"], "arrow%", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, false, true, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_end, - vec!["arrow", "parrow", "arrows", "arr"], - "%arrow", - like_utf8_scalar, - vec![true, true, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_end, vec!["arrow", "parrow", "arrows", "arr"], "%arrow", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_equals, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow", - like_utf8_scalar, - vec![true, false, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_equals, vec!["arrow", "parrow", "arrows", "arr"], "arrow", + 
like_utf8_scalar, like_utf8_scalar_dyn, vec![true, false, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_one, - vec!["arrow", "arrows", "parrow", "arr"], - "arrow_", - like_utf8_scalar, - vec![false, true, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_one, vec!["arrow", "arrows", "parrow", "arr"], "arrow_", + like_utf8_scalar, like_utf8_scalar_dyn, vec![false, true, false, false] ); test_utf8_scalar!( test_utf8_scalar_like_escape, - vec!["a%", "a\\x"], - "a\\%", - like_utf8_scalar, - vec![true, false] - ); - - test_utf8_scalar!( test_utf8_scalar_like_dyn_escape, vec!["a%", "a\\x"], "a\\%", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, false] ); test_utf8_scalar!( test_utf8_scalar_like_escape_contains, - vec!["ba%", "ba\\x"], - "%a\\%", - like_utf8_scalar, - vec![true, false] - ); - - test_utf8_scalar!( test_utf8_scalar_like_dyn_escape_contains, vec!["ba%", "ba\\x"], "%a\\%", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, false] ); @@ -4716,64 +4759,80 @@ mod tests { ); test_utf8_scalar!( test_utf8_array_nlike_escape_testing, + test_utf8_array_nlike_escape_dyn_testing_dyn, vec!["varchar(255)", "int(255)", "varchar", "int"], "%(%)%", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, false, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_escape_regex, + test_utf8_array_nlike_scalar_dyn_escape_regex, vec![".*", "a", "*"], ".*", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_escape_regex_dot, + test_utf8_array_nlike_scalar_dyn_escape_regex_dot, vec![".", "a", "*"], ".", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar, + test_utf8_array_nlike_scalar_dyn, vec!["arrow", "parquet", "datafusion", "flight"], "%ar%", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, false, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_start, + test_utf8_array_nlike_scalar_dyn_start, vec!["arrow", "parrow", "arrows", "arr"], "arrow%", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, true, false, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_end, + test_utf8_array_nlike_scalar_dyn_end, vec!["arrow", "parrow", "arrows", "arr"], "%arrow", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, false, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_equals, + test_utf8_array_nlike_scalar_dyn_equals, vec!["arrow", "parrow", "arrows", "arr"], "arrow", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, true, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_one, + test_utf8_array_nlike_scalar_dyn_one, vec!["arrow", "arrows", "parrow", "arr"], "arrow_", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![true, false, true, true] ); @@ -4784,50 +4843,64 @@ mod tests { ilike_utf8, vec![true, true, true, false, false, true, false] ); + test_utf8_scalar!( ilike_utf8_scalar_escape_testing, + ilike_utf8_scalar_escape_dyn_testing, vec!["varchar(255)", "int(255)", "varchar", "int"], "%(%)%", ilike_utf8_scalar, + ilike_utf8_scalar_dyn, vec![true, true, false, false] ); + test_utf8_scalar!( test_utf8_array_ilike_scalar, + test_utf8_array_ilike_dyn_scalar, vec!["arrow", "parquet", "datafusion", "flight"], "%AR%", ilike_utf8_scalar, + ilike_utf8_scalar_dyn, vec![true, true, false, false] ); test_utf8_scalar!( test_utf8_array_ilike_scalar_start, + test_utf8_array_ilike_scalar_dyn_start, vec!["arrow", "parrow", "arrows", "ARR"], "aRRow%", 
ilike_utf8_scalar, + ilike_utf8_scalar_dyn, vec![true, false, true, false] ); test_utf8_scalar!( test_utf8_array_ilike_scalar_end, + test_utf8_array_ilike_scalar_dyn_end, vec!["ArroW", "parrow", "ARRowS", "arr"], "%arrow", ilike_utf8_scalar, + ilike_utf8_scalar_dyn, vec![true, true, false, false] ); test_utf8_scalar!( test_utf8_array_ilike_scalar_equals, + test_utf8_array_ilike_scalar_dyn_equals, vec!["arrow", "parrow", "arrows", "arr"], "Arrow", ilike_utf8_scalar, + ilike_utf8_scalar_dyn, vec![true, false, false, false] ); test_utf8_scalar!( test_utf8_array_ilike_scalar_one, + test_utf8_array_ilike_scalar_dyn_one, vec!["arrow", "arrows", "parrow", "arr"], "arrow_", ilike_utf8_scalar, + ilike_utf8_scalar_dyn, vec![false, true, false, false] ); @@ -4838,50 +4911,64 @@ mod tests { nilike_utf8, vec![false, false, false, true, true, false, true] ); + test_utf8_scalar!( nilike_utf8_scalar_escape_testing, + nilike_utf8_scalar_escape_dyn_testing, vec!["varchar(255)", "int(255)", "varchar", "int"], "%(%)%", nilike_utf8_scalar, + nilike_utf8_scalar_dyn, vec![false, false, true, true] ); + test_utf8_scalar!( test_utf8_array_nilike_scalar, + test_utf8_array_nilike_dyn_scalar, vec!["arrow", "parquet", "datafusion", "flight"], "%AR%", nilike_utf8_scalar, + nilike_utf8_scalar_dyn, vec![false, false, true, true] ); test_utf8_scalar!( test_utf8_array_nilike_scalar_start, + test_utf8_array_nilike_scalar_dyn_start, vec!["arrow", "parrow", "arrows", "ARR"], "aRRow%", nilike_utf8_scalar, + nilike_utf8_scalar_dyn, vec![false, true, false, true] ); test_utf8_scalar!( test_utf8_array_nilike_scalar_end, + test_utf8_array_nilike_scalar_dyn_end, vec!["ArroW", "parrow", "ARRowS", "arr"], "%arrow", nilike_utf8_scalar, + nilike_utf8_scalar_dyn, vec![false, false, true, true] ); test_utf8_scalar!( test_utf8_array_nilike_scalar_equals, + test_utf8_array_nilike_scalar_dyn_equals, vec!["arRow", "parrow", "arrows", "arr"], "Arrow", nilike_utf8_scalar, + nilike_utf8_scalar_dyn, vec![false, true, true, true] ); test_utf8_scalar!( test_utf8_array_nilike_scalar_one, + test_utf8_array_nilike_scalar_dyn_one, vec!["arrow", "arrows", "parrow", "arr"], "arrow_", nilike_utf8_scalar, + nilike_utf8_scalar_dyn, vec![true, false, true, true] ); @@ -6252,11 +6339,7 @@ mod tests { let dict_array: DictionaryArray = data.into_iter().collect(); - let data = - vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")]; - - let dict_arrayref: DictionaryArray = data.into_iter().collect(); - let dict_arrayref = Arc::new(dict_arrayref) as ArrayRef; + let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; assert_eq!( like_dict_scalar(&dict_array, "Air").unwrap(), @@ -6548,6 +6631,8 @@ mod tests { let dict_array: DictionaryArray = data.into_iter().collect(); + let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; + assert_eq!( nlike_dict_scalar(&dict_array, "Air").unwrap(), BooleanArray::from( @@ -6555,6 +6640,13 @@ mod tests { ), ); + assert_eq!( + nlike_utf8_scalar_dyn(&dict_arrayref, "Air").unwrap(), + BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)] + ), + ); + assert_eq!( nlike_dict_scalar(&dict_array, "Wa%").unwrap(), BooleanArray::from( @@ -6562,6 +6654,13 @@ mod tests { ), ); + assert_eq!( + nlike_utf8_scalar_dyn(&dict_arrayref, "Wa%").unwrap(), + BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)] + ), + ); + assert_eq!( nlike_dict_scalar(&dict_array, "%r").unwrap(), BooleanArray::from( @@ -6569,6 +6668,13 @@ mod 
tests { ), ); + assert_eq!( + nlike_utf8_scalar_dyn(&dict_arrayref, "%r").unwrap(), + BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] + ), + ); + assert_eq!( nlike_dict_scalar(&dict_array, "%i%").unwrap(), BooleanArray::from( @@ -6576,12 +6682,26 @@ mod tests { ), ); + assert_eq!( + nlike_utf8_scalar_dyn(&dict_arrayref, "%i%").unwrap(), + BooleanArray::from( + vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] + ), + ); + assert_eq!( nlike_dict_scalar(&dict_array, "%a%r%").unwrap(), BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] ), ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_arrayref, "%a%r%").unwrap(), + BooleanArray::from( + vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] + ), + ); } #[test] @@ -6591,6 +6711,8 @@ mod tests { let dict_array: DictionaryArray = data.into_iter().collect(); + let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; + assert_eq!( ilike_dict_scalar(&dict_array, "air").unwrap(), BooleanArray::from( @@ -6598,6 +6720,13 @@ mod tests { ), ); + assert_eq!( + ilike_utf8_scalar_dyn(&dict_arrayref, "air").unwrap(), + BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)] + ), + ); + assert_eq!( ilike_dict_scalar(&dict_array, "wa%").unwrap(), BooleanArray::from( @@ -6605,6 +6734,13 @@ mod tests { ), ); + assert_eq!( + ilike_utf8_scalar_dyn(&dict_arrayref, "wa%").unwrap(), + BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)] + ), + ); + assert_eq!( ilike_dict_scalar(&dict_array, "%R").unwrap(), BooleanArray::from( @@ -6612,6 +6748,13 @@ mod tests { ), ); + assert_eq!( + ilike_utf8_scalar_dyn(&dict_arrayref, "%R").unwrap(), + BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)] + ), + ); + assert_eq!( ilike_dict_scalar(&dict_array, "%I%").unwrap(), BooleanArray::from( @@ -6619,12 +6762,26 @@ mod tests { ), ); + assert_eq!( + ilike_utf8_scalar_dyn(&dict_arrayref, "%I%").unwrap(), + BooleanArray::from( + vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] + ), + ); + assert_eq!( ilike_dict_scalar(&dict_array, "%A%r%").unwrap(), BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(true), None, Some(true)] ), ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_arrayref, "%A%r%").unwrap(), + BooleanArray::from( + vec![Some(true), Some(false), Some(true), Some(true), None, Some(true)] + ), + ); } #[test] @@ -6634,6 +6791,8 @@ mod tests { let dict_array: DictionaryArray = data.into_iter().collect(); + let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; + assert_eq!( nilike_dict_scalar(&dict_array, "air").unwrap(), BooleanArray::from( @@ -6641,6 +6800,13 @@ mod tests { ), ); + assert_eq!( + nilike_utf8_scalar_dyn(&dict_arrayref, "air").unwrap(), + BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)] + ), + ); + assert_eq!( nilike_dict_scalar(&dict_array, "wa%").unwrap(), BooleanArray::from( @@ -6648,6 +6814,13 @@ mod tests { ), ); + assert_eq!( + nilike_utf8_scalar_dyn(&dict_arrayref, "wa%").unwrap(), + BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)] + ), + ); + assert_eq!( nilike_dict_scalar(&dict_array, "%R").unwrap(), BooleanArray::from( @@ -6655,6 +6828,13 @@ mod tests { ), ); + assert_eq!( + nilike_utf8_scalar_dyn(&dict_arrayref, "%R").unwrap(), + BooleanArray::from( + 
vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] + ), + ); + assert_eq!( nilike_dict_scalar(&dict_array, "%I%").unwrap(), BooleanArray::from( @@ -6662,12 +6842,26 @@ mod tests { ), ); + assert_eq!( + nilike_utf8_scalar_dyn(&dict_arrayref, "%I%").unwrap(), + BooleanArray::from( + vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] + ), + ); + assert_eq!( nilike_dict_scalar(&dict_array, "%A%r%").unwrap(), BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(false), None, Some(false)] ), ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_arrayref, "%A%r%").unwrap(), + BooleanArray::from( + vec![Some(false), Some(true), Some(false), Some(false), None, Some(false)] + ), + ); } #[test] From 2ea47e436d59a576d58d895d5805de1f2fe4c399 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 26 Nov 2022 20:38:28 +0000 Subject: [PATCH 0333/1411] Move zip and shift kernels to arrow-select (#3201) --- arrow-select/src/lib.rs | 2 ++ .../kernels => arrow-select/src}/window.rs | 18 +++++++----------- .../kernels => arrow-select/src}/zip.rs | 9 +++++---- arrow/src/compute/kernels/mod.rs | 4 +--- 4 files changed, 15 insertions(+), 18 deletions(-) rename {arrow/src/compute/kernels => arrow-select/src}/window.rs (94%) rename {arrow/src/compute/kernels => arrow-select/src}/zip.rs (94%) diff --git a/arrow-select/src/lib.rs b/arrow-select/src/lib.rs index 5249b5c4c323..cf887dfca47c 100644 --- a/arrow-select/src/lib.rs +++ b/arrow-select/src/lib.rs @@ -21,3 +21,5 @@ pub mod concat; pub mod filter; pub mod interleave; pub mod take; +pub mod window; +pub mod zip; diff --git a/arrow/src/compute/kernels/window.rs b/arrow-select/src/window.rs similarity index 94% rename from arrow/src/compute/kernels/window.rs rename to arrow-select/src/window.rs index 54b11c3b2747..70ac86857db2 100644 --- a/arrow/src/compute/kernels/window.rs +++ b/arrow-select/src/window.rs @@ -17,12 +17,9 @@ //! Defines windowing functions, like `shift`ing -use crate::array::{Array, ArrayRef}; -use crate::error::Result; -use crate::{ - array::{make_array, new_null_array}, - compute::concat, -}; +use crate::concat::concat; +use arrow_array::{make_array, new_null_array, Array, ArrayRef}; +use arrow_schema::ArrowError; use num::abs; /// Shifts array by defined number of items (to left or right) @@ -30,9 +27,8 @@ use num::abs; /// a negative value shifts the array to the left. 
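/// For example, shifting `[1, 2, 3]` by `1` produces `[null, 1, 2]`, while `-1` produces
/// `[2, 3, null]`; once `offset.abs()` reaches the array length the result is entirely null.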
/// # Examples /// ``` -/// use arrow::array::Int32Array; -/// use arrow::error::Result; -/// use arrow::compute::shift; +/// # use arrow_array::Int32Array; +/// # use arrow_select::window::shift; /// /// let a: Int32Array = vec![Some(1), None, Some(4)].into(); /// @@ -56,7 +52,7 @@ use num::abs; /// let expected: Int32Array = vec![None, None, None].into(); /// assert_eq!(res.as_ref(), &expected); /// ``` -pub fn shift(array: &dyn Array, offset: i64) -> Result { +pub fn shift(array: &dyn Array, offset: i64) -> Result { let value_len = array.len() as i64; if offset == 0 { Ok(make_array(array.data_ref().clone())) @@ -86,7 +82,7 @@ pub fn shift(array: &dyn Array, offset: i64) -> Result { #[cfg(test)] mod tests { use super::*; - use crate::array::{Float64Array, Int32Array, Int32DictionaryArray}; + use arrow_array::{Float64Array, Int32Array, Int32DictionaryArray}; #[test] fn test_shift_neg() { diff --git a/arrow/src/compute/kernels/zip.rs b/arrow-select/src/zip.rs similarity index 94% rename from arrow/src/compute/kernels/zip.rs rename to arrow-select/src/zip.rs index c28529cf6762..e5d0f25e8fdb 100644 --- a/arrow/src/compute/kernels/zip.rs +++ b/arrow-select/src/zip.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::array::*; -use crate::compute::SlicesIterator; -use crate::error::{ArrowError, Result}; +use crate::filter::SlicesIterator; +use arrow_array::*; +use arrow_data::transform::MutableArrayData; +use arrow_schema::ArrowError; /// Zip two arrays by some boolean mask. Where the mask evaluates `true` values of `truthy` /// are taken, where the mask evaluates `false` values of `falsy` are taken. @@ -30,7 +31,7 @@ pub fn zip( mask: &BooleanArray, truthy: &dyn Array, falsy: &dyn Array, -) -> Result { +) -> Result { if truthy.data_type() != falsy.data_type() { return Err(ArrowError::InvalidArgumentError( "arguments need to have the same data type".into(), diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs index 9ffa53eb2db7..0eebb701232a 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels/mod.rs @@ -31,9 +31,7 @@ pub mod regexp; pub mod sort; pub mod substring; pub mod temporal; -pub mod window; -pub mod zip; pub use arrow_cast::cast; pub use arrow_cast::parse as cast_utils; -pub use arrow_select::{concat, filter, interleave, take}; +pub use arrow_select::{concat, filter, interleave, take, window, zip}; From 0ef18481bd44a08fe041aa23c7b97b0c4695a024 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 26 Nov 2022 20:38:41 +0000 Subject: [PATCH 0334/1411] Deprecate limit kernel (#3200) --- arrow/src/compute/kernels/limit.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arrow/src/compute/kernels/limit.rs b/arrow/src/compute/kernels/limit.rs index 7b8f519cf6ac..1f6c6aec5e1f 100644 --- a/arrow/src/compute/kernels/limit.rs +++ b/arrow/src/compute/kernels/limit.rs @@ -25,12 +25,14 @@ use crate::array::ArrayRef; /// where: /// * it performs a bounds-check on the array /// * it slices from offset 0 +#[deprecated(note = "Use Array::slice")] pub fn limit(array: &ArrayRef, num_elements: usize) -> ArrayRef { let lim = num_elements.min(array.len()); array.slice(0, lim) } #[cfg(test)] +#[allow(deprecated)] mod tests { use super::*; use crate::array::*; From 785d9928a6f926a396344dba0cb558e425c44517 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 27 Nov 2022 00:08:05 -0800 Subject: [PATCH 0335/1411] Hide _dict_scalar 
kernels behind _dyn kernels (#3202) --- arrow/src/compute/kernels/comparison.rs | 48 ++++++++++++------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index bca74bee0a1d..10cab4889346 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -326,7 +326,7 @@ pub fn like_utf8_scalar( /// [`StringArray`]/[`LargeStringArray`] and a scalar. /// /// See the documentation on [`like_utf8`] for more details. -pub fn like_dict_scalar( +fn like_dict_scalar( left: &DictionaryArray, right: &str, ) -> Result { @@ -458,7 +458,7 @@ pub fn nlike_utf8_scalar( /// [`StringArray`]/[`LargeStringArray`] and a scalar. /// /// See the documentation on [`like_utf8`] for more details. -pub fn nlike_dict_scalar( +fn nlike_dict_scalar( left: &DictionaryArray, right: &str, ) -> Result { @@ -637,7 +637,7 @@ pub fn ilike_utf8_scalar( /// [`StringArray`]/[`LargeStringArray`] and a scalar. /// /// See the documentation on [`like_utf8`] for more details. -pub fn ilike_dict_scalar( +fn ilike_dict_scalar( left: &DictionaryArray, right: &str, ) -> Result { @@ -816,7 +816,7 @@ pub fn nilike_utf8_scalar( /// [`StringArray`]/[`LargeStringArray`] and a scalar. /// /// See the documentation on [`like_utf8`] for more details. -pub fn nilike_dict_scalar( +fn nilike_dict_scalar( left: &DictionaryArray, right: &str, ) -> Result { @@ -6342,7 +6342,7 @@ mod tests { let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; assert_eq!( - like_dict_scalar(&dict_array, "Air").unwrap(), + like_utf8_scalar_dyn(&dict_array, "Air").unwrap(), BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)] ), @@ -6356,7 +6356,7 @@ mod tests { ); assert_eq!( - like_dict_scalar(&dict_array, "Wa%").unwrap(), + like_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), BooleanArray::from( vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)] ), @@ -6370,7 +6370,7 @@ mod tests { ); assert_eq!( - like_dict_scalar(&dict_array, "%r").unwrap(), + like_utf8_scalar_dyn(&dict_array, "%r").unwrap(), BooleanArray::from( vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)] ), @@ -6384,7 +6384,7 @@ mod tests { ); assert_eq!( - like_dict_scalar(&dict_array, "%i%").unwrap(), + like_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] ), @@ -6398,7 +6398,7 @@ mod tests { ); assert_eq!( - like_dict_scalar(&dict_array, "%a%r%").unwrap(), + like_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] ), @@ -6634,7 +6634,7 @@ mod tests { let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; assert_eq!( - nlike_dict_scalar(&dict_array, "Air").unwrap(), + nlike_utf8_scalar_dyn(&dict_array, "Air").unwrap(), BooleanArray::from( vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)] ), @@ -6648,7 +6648,7 @@ mod tests { ); assert_eq!( - nlike_dict_scalar(&dict_array, "Wa%").unwrap(), + nlike_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), BooleanArray::from( vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)] ), @@ -6662,7 +6662,7 @@ mod tests { ); assert_eq!( - nlike_dict_scalar(&dict_array, "%r").unwrap(), + nlike_utf8_scalar_dyn(&dict_array, "%r").unwrap(), BooleanArray::from( vec![Some(true), Some(true), Some(false), Some(false), None, 
Some(false)] ), @@ -6676,7 +6676,7 @@ mod tests { ); assert_eq!( - nlike_dict_scalar(&dict_array, "%i%").unwrap(), + nlike_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] ), @@ -6690,7 +6690,7 @@ mod tests { ); assert_eq!( - nlike_dict_scalar(&dict_array, "%a%r%").unwrap(), + nlike_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] ), @@ -6714,7 +6714,7 @@ mod tests { let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; assert_eq!( - ilike_dict_scalar(&dict_array, "air").unwrap(), + ilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)] ), @@ -6728,7 +6728,7 @@ mod tests { ); assert_eq!( - ilike_dict_scalar(&dict_array, "wa%").unwrap(), + ilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), BooleanArray::from( vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)] ), @@ -6742,7 +6742,7 @@ mod tests { ); assert_eq!( - ilike_dict_scalar(&dict_array, "%R").unwrap(), + ilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), BooleanArray::from( vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)] ), @@ -6756,7 +6756,7 @@ mod tests { ); assert_eq!( - ilike_dict_scalar(&dict_array, "%I%").unwrap(), + ilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] ), @@ -6770,7 +6770,7 @@ mod tests { ); assert_eq!( - ilike_dict_scalar(&dict_array, "%A%r%").unwrap(), + ilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(true), None, Some(true)] ), @@ -6794,7 +6794,7 @@ mod tests { let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; assert_eq!( - nilike_dict_scalar(&dict_array, "air").unwrap(), + nilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), BooleanArray::from( vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)] ), @@ -6808,7 +6808,7 @@ mod tests { ); assert_eq!( - nilike_dict_scalar(&dict_array, "wa%").unwrap(), + nilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), BooleanArray::from( vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)] ), @@ -6822,7 +6822,7 @@ mod tests { ); assert_eq!( - nilike_dict_scalar(&dict_array, "%R").unwrap(), + nilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), BooleanArray::from( vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] ), @@ -6836,7 +6836,7 @@ mod tests { ); assert_eq!( - nilike_dict_scalar(&dict_array, "%I%").unwrap(), + nilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] ), @@ -6850,7 +6850,7 @@ mod tests { ); assert_eq!( - nilike_dict_scalar(&dict_array, "%A%r%").unwrap(), + nilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(false), None, Some(false)] ), From 8db21eaace15ca10581dc20cd34db220a41a2e0d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 27 Nov 2022 08:53:43 +0000 Subject: [PATCH 0336/1411] Remove special case ArrayData equality for decimals (#3204) --- arrow-data/src/equal/decimal.rs | 73 --------------------------------- arrow-data/src/equal/mod.rs | 12 +++--- 2 files changed, 7 
insertions(+), 78 deletions(-) delete mode 100644 arrow-data/src/equal/decimal.rs diff --git a/arrow-data/src/equal/decimal.rs b/arrow-data/src/equal/decimal.rs deleted file mode 100644 index 15703389cb8a..000000000000 --- a/arrow-data/src/equal/decimal.rs +++ /dev/null @@ -1,73 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::data::{contains_nulls, ArrayData}; -use arrow_buffer::bit_util::get_bit; -use arrow_schema::DataType; - -use super::utils::equal_len; - -pub(super) fn decimal_equal( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - let size = match lhs.data_type() { - DataType::Decimal128(_, _) => 16, - DataType::Decimal256(_, _) => 32, - _ => unreachable!(), - }; - - let lhs_values = &lhs.buffers()[0].as_slice()[lhs.offset() * size..]; - let rhs_values = &rhs.buffers()[0].as_slice()[rhs.offset() * size..]; - - // Only checking one null mask here because by the time the control flow reaches - // this point, the equality of the two masks would have already been verified. - if !contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len) { - equal_len( - lhs_values, - rhs_values, - size * lhs_start, - size * rhs_start, - size * len, - ) - } else { - // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); - // with nulls, we need to compare item by item whenever it is not null - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); - - lhs_is_null - || (lhs_is_null == rhs_is_null) - && equal_len( - lhs_values, - rhs_values, - lhs_pos * size, - rhs_pos * size, - size, // 1 * size since we are comparing a single entry - ) - }) - } -} diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index 063ef64d4d84..85c595cfed1c 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -20,11 +20,11 @@ //! depend on dynamic casting of `Array`. use crate::data::ArrayData; +use arrow_buffer::i256; use arrow_schema::{DataType, IntervalUnit}; use half::f16; mod boolean; -mod decimal; mod dictionary; mod fixed_binary; mod fixed_list; @@ -40,7 +40,6 @@ mod variable_size; // For this reason, they are not exposed and are instead used // to build the generic functions below (`equal_range` and `equal`). 
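// With the decimal-specific equality module removed, Decimal128/Decimal256 data is compared
// through the generic `primitive_equal` path using the `i128`/`i256` native types (see the
// dispatch added below). A minimal sketch of the unchanged user-facing behaviour, assuming
// the `arrow_array` API of this release:
use arrow_array::Decimal128Array;

let a = Decimal128Array::from(vec![Some(123_i128), None])
    .with_precision_and_scale(10, 2)
    .unwrap();
let b = Decimal128Array::from(vec![Some(123_i128), None])
    .with_precision_and_scale(10, 2)
    .unwrap();
// Array equality still holds; it now runs through `primitive_equal::<i128>`.
assert_eq!(a, b);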
use boolean::boolean_equal; -use decimal::decimal_equal; use dictionary::dictionary_equal; use fixed_binary::fixed_binary_equal; use fixed_list::fixed_list_equal; @@ -74,6 +73,12 @@ fn equal_values( DataType::Int64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Decimal128(_, _) => { + primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) + } + DataType::Decimal256(_, _) => { + primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) + } DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { @@ -98,9 +103,6 @@ fn equal_values( DataType::FixedSizeBinary(_) => { fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len) } - DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { - decimal_equal(lhs, rhs, lhs_start, rhs_start, len) - } DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::FixedSizeList(_, _) => { From 1daf7d31ff5a99fc47cc14328e2ab603dbe34679 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 27 Nov 2022 13:18:16 -0800 Subject: [PATCH 0337/1411] Add a cast test case for decimal negative scale (#3203) * Add a cast test case for decimal negative scale * Add one more test --- arrow-cast/src/cast.rs | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 07c7d6a3ac55..1f5359bae1f9 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -7437,4 +7437,40 @@ mod tests { assert_eq!("2120", decimal_arr.value_as_string(1)); assert_eq!("3120", decimal_arr.value_as_string(2)); } + + #[test] + fn test_cast_decimal128_to_decimal128_negative() { + let input_type = DataType::Decimal128(10, -1); + let output_type = DataType::Decimal128(10, -2); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(123)]; + let input_decimal_array = create_decimal_array(array, 10, -1).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + generate_cast_test_case!( + &array, + Decimal128Array, + &output_type, + vec![Some(12_i128),] + ); + + let casted_array = cast(&array, &output_type).unwrap(); + let decimal_arr = as_primitive_array::(&casted_array); + + assert_eq!("1200", decimal_arr.value_as_string(0)); + + let array = vec![Some(125)]; + let input_decimal_array = create_decimal_array(array, 10, -1).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + generate_cast_test_case!( + &array, + Decimal128Array, + &output_type, + vec![Some(13_i128),] + ); + + let casted_array = cast(&array, &output_type).unwrap(); + let decimal_arr = as_primitive_array::(&casted_array); + + assert_eq!("1300", decimal_arr.value_as_string(0)); + } } From f985818012e6d0a56fca49487dbc13c516f4613c Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 27 Nov 2022 13:20:47 -0800 Subject: [PATCH 0338/1411] Use SlicesIterator for ArrayData Equality (#3198) * Use SlicesIterator for ArrayData Equality * Use BitSliceIterator --- arrow-data/src/equal/fixed_binary.rs | 72 ++++++++++++++++++-------- arrow-data/src/equal/primitive.rs | 75 ++++++++++++++++++++-------- arrow-select/src/filter.rs | 2 +- arrow/benches/equal.rs | 3 ++ 4 files changed, 110 insertions(+), 42 deletions(-) diff --git a/arrow-data/src/equal/fixed_binary.rs b/arrow-data/src/equal/fixed_binary.rs index 
d6af208016fa..17e470b5c47c 100644 --- a/arrow-data/src/equal/fixed_binary.rs +++ b/arrow-data/src/equal/fixed_binary.rs @@ -15,7 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::data::{contains_nulls, ArrayData}; +use crate::bit_iterator::BitSliceIterator; +use crate::contains_nulls; +use crate::data::ArrayData; +use crate::equal::primitive::NULL_SLICES_SELECTIVITY_THRESHOLD; use arrow_buffer::bit_util::get_bit; use arrow_schema::DataType; @@ -47,26 +50,55 @@ pub(super) fn fixed_binary_equal( size * len, ) } else { - // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); - // with nulls, we need to compare item by item whenever it is not null - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; + let selectivity_frac = lhs.null_count() as f64 / lhs.len() as f64; - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + if selectivity_frac >= NULL_SLICES_SELECTIVITY_THRESHOLD { + // get a ref of the null buffer bytes, to use in testing for nullness + let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); + let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); + // with nulls, we need to compare item by item whenever it is not null + (0..len).all(|i| { + let lhs_pos = lhs_start + i; + let rhs_pos = rhs_start + i; - lhs_is_null - || (lhs_is_null == rhs_is_null) - && equal_len( - lhs_values, - rhs_values, - lhs_pos * size, - rhs_pos * size, - size, // 1 * size since we are comparing a single entry - ) - }) + let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); + let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + + lhs_is_null + || (lhs_is_null == rhs_is_null) + && equal_len( + lhs_values, + rhs_values, + lhs_pos * size, + rhs_pos * size, + size, // 1 * size since we are comparing a single entry + ) + }) + } else { + let lhs_slices_iter = BitSliceIterator::new( + lhs.null_buffer().as_ref().unwrap(), + lhs_start + lhs.offset(), + len, + ); + let rhs_slices_iter = BitSliceIterator::new( + rhs.null_buffer().as_ref().unwrap(), + rhs_start + rhs.offset(), + len, + ); + + lhs_slices_iter.zip(rhs_slices_iter).all( + |((l_start, l_end), (r_start, r_end))| { + l_start == r_start + && l_end == r_end + && equal_len( + lhs_values, + rhs_values, + (lhs_start + l_start) * size, + (rhs_start + r_start) * size, + (l_end - l_start) * size, + ) + }, + ) + } } } diff --git a/arrow-data/src/equal/primitive.rs b/arrow-data/src/equal/primitive.rs index e619375d5314..f52541e2861c 100644 --- a/arrow-data/src/equal/primitive.rs +++ b/arrow-data/src/equal/primitive.rs @@ -15,13 +15,17 @@ // specific language governing permissions and limitations // under the License. 
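// The intent of the threshold below appears to be: when fewer than ~40% of the values are
// null (`NULL_SLICES_SELECTIVITY_THRESHOLD`), the validity bitmap decomposes into long runs
// of set bits, so comparing whole `(start, end)` runs from `BitSliceIterator` with a single
// slice comparison is cheaper than testing validity bit by bit; heavily-null arrays keep the
// original per-element path.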
+use crate::bit_iterator::BitSliceIterator; +use crate::contains_nulls; +use arrow_buffer::bit_util::get_bit; use std::mem::size_of; -use crate::data::{contains_nulls, ArrayData}; -use arrow_buffer::bit_util::get_bit; +use crate::data::ArrayData; use super::utils::equal_len; +pub(crate) const NULL_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.4; + pub(super) fn primitive_equal( lhs: &ArrayData, rhs: &ArrayData, @@ -45,25 +49,54 @@ pub(super) fn primitive_equal( len * byte_width, ) } else { - // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); - // with nulls, we need to compare item by item whenever it is not null - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + let selectivity_frac = lhs.null_count() as f64 / lhs.len() as f64; + + if selectivity_frac >= NULL_SLICES_SELECTIVITY_THRESHOLD { + // get a ref of the null buffer bytes, to use in testing for nullness + let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); + let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); + // with nulls, we need to compare item by item whenever it is not null + (0..len).all(|i| { + let lhs_pos = lhs_start + i; + let rhs_pos = rhs_start + i; + let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); + let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + + lhs_is_null + || (lhs_is_null == rhs_is_null) + && equal_len( + lhs_values, + rhs_values, + lhs_pos * byte_width, + rhs_pos * byte_width, + byte_width, // 1 * byte_width since we are comparing a single entry + ) + }) + } else { + let lhs_slices_iter = BitSliceIterator::new( + lhs.null_buffer().as_ref().unwrap(), + lhs_start + lhs.offset(), + len, + ); + let rhs_slices_iter = BitSliceIterator::new( + rhs.null_buffer().as_ref().unwrap(), + rhs_start + rhs.offset(), + len, + ); - lhs_is_null - || (lhs_is_null == rhs_is_null) - && equal_len( - lhs_values, - rhs_values, - lhs_pos * byte_width, - rhs_pos * byte_width, - byte_width, // 1 * byte_width since we are comparing a single entry - ) - }) + lhs_slices_iter.zip(rhs_slices_iter).all( + |((l_start, l_end), (r_start, r_end))| { + l_start == r_start + && l_end == r_end + && equal_len( + lhs_values, + rhs_values, + (lhs_start + l_start) * byte_width, + (rhs_start + r_start) * byte_width, + (l_end - l_start) * byte_width, + ) + }, + ) + } } } diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 41d93aefa31b..fde4b41b04cf 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -39,7 +39,7 @@ use arrow_schema::*; const FILTER_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.8; /// An iterator of `(usize, usize)` each representing an interval -/// `[start, end)` whose slots of a [BooleanArray] are true. Each +/// `[start, end)` whose slots of a bitmap [Buffer] are true. Each /// interval corresponds to a contiguous region of memory to be /// "taken" from an array to be filtered. 
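// A small illustrative sketch of the iterator described above, assuming the re-exported
// `arrow::compute::SlicesIterator` and `arrow::array::BooleanArray`:
use arrow::array::BooleanArray;
use arrow::compute::SlicesIterator;

let mask = BooleanArray::from(vec![false, true, true, false, true]);
// Half-open `[start, end)` runs of `true` slots.
let slices: Vec<_> = SlicesIterator::new(&mask).collect();
assert_eq!(slices, vec![(1, 3), (4, 5)]);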
/// diff --git a/arrow/benches/equal.rs b/arrow/benches/equal.rs index f54aff1b5cc7..2f4e2fada9e9 100644 --- a/arrow/benches/equal.rs +++ b/arrow/benches/equal.rs @@ -43,6 +43,9 @@ fn add_benchmark(c: &mut Criterion) { let arr_a_nulls = create_primitive_array::(512, 0.5); c.bench_function("equal_nulls_512", |b| b.iter(|| bench_equal(&arr_a_nulls))); + let arr_a = create_primitive_array::(51200, 0.1); + c.bench_function("equal_51200", |b| b.iter(|| bench_equal(&arr_a))); + let arr_a = create_string_array::(512, 0.0); c.bench_function("equal_string_512", |b| b.iter(|| bench_equal(&arr_a))); From a6daff5fcc360f9c570cc20cae26b53373af8d9b Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 28 Nov 2022 02:52:39 -0800 Subject: [PATCH 0339/1411] Add _dyn kernels of like, ilike, nlike, nilike kernels for dictionary support (#3197) * Add dictionary suppport to like, ilike, nlike, nilike kernels * Add _dyn kernels for dictionary support * Gated by feature dyn_cmp_dict --- arrow/src/compute/kernels/comparison.rs | 385 +++++++++++++++++++++++- 1 file changed, 380 insertions(+), 5 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 10cab4889346..33a24500aabd 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -140,14 +140,13 @@ fn is_like_pattern(c: char) -> bool { /// Evaluate regex `op(left)` matching `right` on [`StringArray`] / [`LargeStringArray`] /// /// If `negate_regex` is true, the regex expression will be negated. (for example, with `not like`) -fn regex_like( - left: &GenericStringArray, - right: &GenericStringArray, +fn regex_like<'a, S: ArrayAccessor, F>( + left: S, + right: S, negate_regex: bool, op: F, ) -> Result where - OffsetSize: OffsetSizeTrait, F: Fn(&str) -> Result, { let mut map = HashMap::new(); @@ -227,6 +226,86 @@ pub fn like_utf8( }) } +/// Perform SQL `left LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn like_dyn(left: &dyn Array, right: &dyn Array) -> Result { + match (left.data_type(), right.data_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = as_string_array(left); + let right = as_string_array(right); + like_utf8(left, right) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = as_largestring_array(left); + let right = as_largestring_array(right); + like_utf8(left, right) + } + #[cfg(feature = "dyn_cmp_dict")] + (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + left => { + let right = as_dictionary_array(right); + like_dict(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "like_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left LIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
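// A hedged usage sketch for the new dictionary path (requires the `dyn_cmp_dict` feature;
// names as re-exported through `arrow::compute` and `arrow::array`). `like_dyn` accepts
// dictionary-encoded string arrays and dispatches to the private `like_dict` helper below:
use arrow::array::{BooleanArray, DictionaryArray};
use arrow::compute::like_dyn;
use arrow::datatypes::Int32Type;

let left: DictionaryArray<Int32Type> = vec!["arrow", "parquet"].into_iter().collect();
let pattern: DictionaryArray<Int32Type> = vec!["arr%", "arr%"].into_iter().collect();
// "arrow" matches `arr%`, "parquet" does not.
assert_eq!(
    like_dyn(&left, &pattern).unwrap(),
    BooleanArray::from(vec![true, false])
);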
+#[cfg(feature = "dyn_cmp_dict")] +fn like_dict( + left: &DictionaryArray, + right: &DictionaryArray, +) -> Result { + match (left.value_type(), right.value_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) + } + _ => Err(ArrowError::ComputeError( + "like_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" + .to_string(), + )), + } +} + #[inline] fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor>( left: L, @@ -402,6 +481,85 @@ pub fn nlike_utf8( }) } +/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nlike_dyn(left: &dyn Array, right: &dyn Array) -> Result { + match (left.data_type(), right.data_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = as_string_array(left); + let right = as_string_array(right); + nlike_utf8(left, right) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = as_largestring_array(left); + let right = as_largestring_array(right); + nlike_utf8(left, right) + } + #[cfg(feature = "dyn_cmp_dict")] + (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + left => { + let right = as_dictionary_array(right); + nlike_dict(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nlike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
+#[cfg(feature = "dyn_cmp_dict")] +fn nlike_dict( + left: &DictionaryArray, + right: &DictionaryArray, +) -> Result { + match (left.value_type(), right.value_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) + } + _ => Err(ArrowError::ComputeError( + "nlike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" + .to_string(), + )), + } +} + #[inline] fn nlike_scalar<'a, L: ArrayAccessor>( left: L, @@ -497,6 +655,85 @@ pub fn ilike_utf8( }) } +/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn ilike_dyn(left: &dyn Array, right: &dyn Array) -> Result { + match (left.data_type(), right.data_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = as_string_array(left); + let right = as_string_array(right); + ilike_utf8(left, right) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = as_largestring_array(left); + let right = as_largestring_array(right); + ilike_utf8(left, right) + } + #[cfg(feature = "dyn_cmp_dict")] + (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + left => { + let right = as_dictionary_array(right); + ilike_dict(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "ilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
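// Implementation note: the case-insensitive variants reuse the same LIKE-to-regex translation
// and simply prefix the anchored pattern with `(?i)`; an ILIKE pattern such as `a%w` is
// expected to become the regex `(?i)^a.*w$` once `%` and `_` are rewritten.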
+#[cfg(feature = "dyn_cmp_dict")] +fn ilike_dict( + left: &DictionaryArray, + right: &DictionaryArray, +) -> Result { + match (left.value_type(), right.value_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) + } + _ => Err(ArrowError::ComputeError( + "ilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" + .to_string(), + )), + } +} + #[inline] fn ilike_scalar<'a, L: ArrayAccessor>( left: L, @@ -616,7 +853,7 @@ pub fn ilike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { Err(ArrowError::ComputeError( - "ilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + "ilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), )) } } @@ -676,6 +913,85 @@ pub fn nilike_utf8( }) } +/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nilike_dyn(left: &dyn Array, right: &dyn Array) -> Result { + match (left.data_type(), right.data_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = as_string_array(left); + let right = as_string_array(right); + nilike_utf8(left, right) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = as_largestring_array(left); + let right = as_largestring_array(right); + nilike_utf8(left, right) + } + #[cfg(feature = "dyn_cmp_dict")] + (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + left => { + let right = as_dictionary_array(right); + nilike_dict(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
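// As with `nlike_dict` above, `nilike_dict` passes `negate_regex = true` to `regex_like`,
// so for non-null rows it returns the complement of `ilike_dict` (null rows stay null).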
+#[cfg(feature = "dyn_cmp_dict")] +fn nilike_dict( + left: &DictionaryArray, + right: &DictionaryArray, +) -> Result { + match (left.value_type(), right.value_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) + } + _ => Err(ArrowError::ComputeError( + "nilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" + .to_string(), + )), + } +} + #[inline] fn nilike_scalar<'a, L: ArrayAccessor>( left: L, @@ -4451,6 +4767,24 @@ mod tests { }; } + macro_rules! test_dict_utf8 { + ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { + #[test] + #[cfg(feature = "dyn_cmp_dict")] + fn $test_name() { + let left: DictionaryArray = $left.into_iter().collect(); + let right: DictionaryArray = $right.into_iter().collect(); + let res = $op(&left, &right).unwrap(); + let expected = $expected; + assert_eq!(expected.len(), res.len()); + for i in 0..res.len() { + let v = res.value(i); + assert_eq!(v, expected[i]); + } + } + }; + } + #[test] fn test_utf8_eq_scalar_on_slice() { let a = StringArray::from( @@ -4599,6 +4933,14 @@ mod tests { vec![true, true, true, false, false, true, false, false] ); + test_dict_utf8!( + test_utf8_array_like_dict, + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"], + vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"], + like_dyn, + vec![true, true, true, false, false, true, false, false] + ); + test_utf8_scalar!( test_utf8_array_like_scalar_escape_testing, test_utf8_array_like_scalar_dyn_escape_testing, @@ -4707,6 +5049,14 @@ mod tests { vec![true] ); + test_dict_utf8!( + test_utf8_scalar_ilike_regex_dict, + vec!["%%%"], + vec![r#"\%_\%"#], + ilike_dyn, + vec![true] + ); + #[test] fn test_replace_like_wildcards() { let a_eq = "_%"; @@ -4757,6 +5107,15 @@ mod tests { nlike_utf8, vec![false, false, false, true, true, false, true] ); + + test_dict_utf8!( + test_utf8_array_nlike_dict, + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], + vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], + nlike_dyn, + vec![false, false, false, true, true, false, true] + ); + test_utf8_scalar!( test_utf8_array_nlike_escape_testing, test_utf8_array_nlike_escape_dyn_testing_dyn, @@ -4844,6 +5203,14 @@ mod tests { vec![true, true, true, false, false, true, false] ); + test_dict_utf8!( + test_utf8_array_ilike_dict, + vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], + ilike_dyn, + vec![true, true, true, false, false, true, false] + ); + test_utf8_scalar!( ilike_utf8_scalar_escape_testing, ilike_utf8_scalar_escape_dyn_testing, @@ -4912,6 +5279,14 @@ mod tests { vec![false, false, false, true, true, false, true] ); + test_dict_utf8!( + test_utf8_array_nilike_dict, + vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + 
vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], + nilike_dyn, + vec![false, false, false, true, true, false, true] + ); + test_utf8_scalar!( nilike_utf8_scalar_escape_testing, nilike_utf8_scalar_escape_dyn_testing, From 6f41b95de4a0ac33319b9e96e179b47fcd71fdbf Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Mon, 28 Nov 2022 21:18:20 +0800 Subject: [PATCH 0340/1411] fix bug: cast decimal256 to other decimal with safe(false) (#3208) --- arrow-cast/src/cast.rs | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 1f5359bae1f9..aa40ad425a5e 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -2128,8 +2128,8 @@ fn cast_decimal_to_decimal( if input_scale > output_scale { // For example, input_scale is 4 and output_scale is 3; // Original value is 11234_i128, and will be cast to 1123_i128. - let array = array.as_any().downcast_ref::().unwrap(); if BYTE_WIDTH1 == 16 { + let array = array.as_any().downcast_ref::().unwrap(); if BYTE_WIDTH2 == 16 { let div = 10_i128 .pow_checked((input_scale - output_scale) as u32) @@ -3816,6 +3816,34 @@ mod tests { ); } + #[test] + fn test_cast_decimal256_to_decimal128_overflow() { + let input_type = DataType::Decimal256(76, 5); + let output_type = DataType::Decimal128(38, 7); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(i256::from_i128(i128::MAX))]; + let input_decimal_array = create_decimal256_array(array, 76, 5).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + let result = + cast_with_options(&array, &output_type, &CastOptions { safe: false }); + assert_eq!("Invalid argument error: 17014118346046923173168730371588410572700 cannot be casted to 128-bit integer for Decimal128", + result.unwrap_err().to_string()); + } + + #[test] + fn test_cast_decimal256_to_decimal256_overflow() { + let input_type = DataType::Decimal256(76, 5); + let output_type = DataType::Decimal256(76, 55); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(i256::from_i128(i128::MAX))]; + let input_decimal_array = create_decimal256_array(array, 76, 5).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + let result = + cast_with_options(&array, &output_type, &CastOptions { safe: false }); + assert_eq!("Cast error: Cannot cast to \"Decimal256\"(76, 55). Overflowing on 170141183460469231731687303715884105727", + result.unwrap_err().to_string()); + } + #[test] fn test_cast_decimal256_to_decimal128() { let input_type = DataType::Decimal256(20, 3); From 5d84746cfdfe3ae9a2678f10b4dbb2e9385dc479 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 28 Nov 2022 13:16:43 -0800 Subject: [PATCH 0341/1411] Add try_unary_mut (#3134) * Add add_scalar_mut and add_scalar_checked_mut * Update slice related functions for completeness. 
* Change result type * Update API doc * Remove _mut arithmetic kernels * For review --- arrow-array/src/array/primitive_array.rs | 38 +++++++++++++++++++ .../src/builder/boolean_buffer_builder.rs | 5 +++ .../src/builder/null_buffer_builder.rs | 4 ++ arrow-array/src/builder/primitive_builder.rs | 18 +++++++++ arrow/src/compute/kernels/arithmetic.rs | 23 +++++++++++ arrow/src/compute/kernels/arity.rs | 27 +++++++++++++ 6 files changed, 115 insertions(+) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index e3d14e79ded0..036ef0cdd52f 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -505,6 +505,44 @@ impl PrimitiveArray { }) } + /// Applies an unary and fallible function to all valid values in a mutable primitive array. + /// Mutable primitive array means that the buffer is not shared with other arrays. + /// As a result, this mutates the buffer directly without allocating new buffer. + /// + /// This is unlike [`Self::unary_mut`] which will apply an infallible function to all rows + /// regardless of validity, in many cases this will be significantly faster and should + /// be preferred if `op` is infallible. + /// + /// This returns an `Err` when the input array is shared buffer with other + /// array. In the case, returned `Err` wraps input array. If the function + /// encounters an error during applying on values. In the case, this returns an `Err` within + /// an `Ok` which wraps the actual error. + /// + /// Note: LLVM is currently unable to effectively vectorize fallible operations + pub fn try_unary_mut( + self, + op: F, + ) -> Result, E>, PrimitiveArray> + where + F: Fn(T::Native) -> Result, + { + let len = self.len(); + let null_count = self.null_count(); + let mut builder = self.into_builder()?; + + let (slice, null_buffer) = builder.slices_mut(); + + match try_for_each_valid_idx(len, 0, null_count, null_buffer.as_deref(), |idx| { + unsafe { *slice.get_unchecked_mut(idx) = op(*slice.get_unchecked(idx))? 
}; + Ok::<_, E>(()) + }) { + Ok(_) => {} + Err(err) => return Ok(Err(err)), + }; + + Ok(Ok(builder.finish())) + } + /// Applies a unary and nullable function to all valid values in a primitive array /// /// This is unlike [`Self::unary`] which will apply an infallible function to all rows diff --git a/arrow-array/src/builder/boolean_buffer_builder.rs b/arrow-array/src/builder/boolean_buffer_builder.rs index 4f8638ee789c..7d86f74f6aae 100644 --- a/arrow-array/src/builder/boolean_buffer_builder.rs +++ b/arrow-array/src/builder/boolean_buffer_builder.rs @@ -168,6 +168,11 @@ impl BooleanBufferBuilder { self.buffer.as_slice() } + /// Returns the packed bits + pub fn as_slice_mut(&mut self) -> &mut [u8] { + self.buffer.as_slice_mut() + } + /// Creates a [`Buffer`] #[inline] pub fn finish(&mut self) -> Buffer { diff --git a/arrow-array/src/builder/null_buffer_builder.rs b/arrow-array/src/builder/null_buffer_builder.rs index b3c788fe5993..0061f70c7ed4 100644 --- a/arrow-array/src/builder/null_buffer_builder.rs +++ b/arrow-array/src/builder/null_buffer_builder.rs @@ -154,6 +154,10 @@ impl NullBufferBuilder { self.bitmap_builder = Some(b); } } + + pub fn as_slice_mut(&mut self) -> Option<&mut [u8]> { + self.bitmap_builder.as_mut().map(|b| b.as_slice_mut()) + } } impl NullBufferBuilder { diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index ef420dcbc295..fa1dc3ad1264 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -285,6 +285,24 @@ impl PrimitiveBuilder { pub fn values_slice_mut(&mut self) -> &mut [T::Native] { self.values_builder.as_slice_mut() } + + /// Returns the current values buffer as a slice + pub fn validity_slice(&self) -> Option<&[u8]> { + self.null_buffer_builder.as_slice() + } + + /// Returns the current values buffer as a mutable slice + pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> { + self.null_buffer_builder.as_slice_mut() + } + + /// Returns the current values buffer and null buffer as a slice + pub fn slices_mut(&mut self) -> (&mut [T::Native], Option<&mut [u8]>) { + ( + self.values_builder.as_slice_mut(), + self.null_buffer_builder.as_slice_mut(), + ) + } } #[cfg(test)] diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index a99a90204b7f..f9deada5389b 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -1624,6 +1624,7 @@ where mod tests { use super::*; use crate::array::Int32Array; + use crate::compute::{try_unary_mut, unary_mut}; use crate::datatypes::{Date64Type, Int32Type, Int8Type}; use arrow_buffer::i256; use chrono::NaiveDate; @@ -3098,4 +3099,26 @@ mod tests { assert_eq!(result.len(), 13); assert_eq!(result.null_count(), 13); } + + #[test] + fn test_primitive_add_scalar_by_unary_mut() { + let a = Int32Array::from(vec![15, 14, 9, 8, 1]); + let b = 3; + let c = unary_mut(a, |value| value.add_wrapping(b)).unwrap(); + let expected = Int32Array::from(vec![18, 17, 12, 11, 4]); + assert_eq!(c, expected); + } + + #[test] + fn test_primitive_add_scalar_overflow_by_try_unary_mut() { + let a = Int32Array::from(vec![i32::MAX, i32::MIN]); + + let wrapped = unary_mut(a, |value| value.add_wrapping(1)).unwrap(); + let expected = Int32Array::from(vec![-2147483648, -2147483647]); + assert_eq!(expected, wrapped); + + let a = Int32Array::from(vec![i32::MAX, i32::MIN]); + let overflow = try_unary_mut(a, |value| value.add_checked(1)); + let _ = 
overflow.unwrap().expect_err("overflow should be detected"); + } } diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index c99d2b727b8d..946d15e9e984 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -58,6 +58,18 @@ where array.unary(op) } +/// See [`PrimitiveArray::unary_mut`] +pub fn unary_mut( + array: PrimitiveArray, + op: F, +) -> std::result::Result, PrimitiveArray> +where + I: ArrowPrimitiveType, + F: Fn(I::Native) -> I::Native, +{ + array.unary_mut(op) +} + /// See [`PrimitiveArray::try_unary`] pub fn try_unary(array: &PrimitiveArray, op: F) -> Result> where @@ -68,6 +80,21 @@ where array.try_unary(op) } +/// See [`PrimitiveArray::try_unary_mut`] +pub fn try_unary_mut( + array: PrimitiveArray, + op: F, +) -> std::result::Result< + std::result::Result, ArrowError>, + PrimitiveArray, +> +where + I: ArrowPrimitiveType, + F: Fn(I::Native) -> Result, +{ + array.try_unary_mut(op) +} + /// A helper function that applies an infallible unary function to a dictionary array with primitive value type. fn unary_dict(array: &DictionaryArray, op: F) -> Result where From 64b466e7864f8b019d3396c45efec81342b46a7f Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 29 Nov 2022 08:17:37 +1100 Subject: [PATCH 0342/1411] Infer timestamps from CSV files (#3209) * Infer timestamps from CSV files * Fix regex patterns --- arrow-csv/src/reader.rs | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/arrow-csv/src/reader.rs b/arrow-csv/src/reader.rs index 6432fb1b8017..f8f9f50a3e2f 100644 --- a/arrow-csv/src/reader.rs +++ b/arrow-csv/src/reader.rs @@ -70,9 +70,11 @@ lazy_static! { .case_insensitive(true) .build() .unwrap(); - static ref DATE_RE: Regex = Regex::new(r"^\d{4}-\d\d-\d\d$").unwrap(); + static ref DATE32_RE: Regex = Regex::new(r"^\d{4}-\d\d-\d\d$").unwrap(); + static ref DATE64_RE: Regex = + Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d$").unwrap(); static ref DATETIME_RE: Regex = - Regex::new(r"^\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d$").unwrap(); + Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,9}$").unwrap(); } /// Infer the data type of a record @@ -90,10 +92,12 @@ fn infer_field_schema(string: &str, datetime_re: Option) -> DataType { DataType::Float64 } else if INTEGER_RE.is_match(string) { DataType::Int64 - } else if datetime_re.is_match(string) { - DataType::Date64 - } else if DATE_RE.is_match(string) { + } else if DATE32_RE.is_match(string) { DataType::Date32 + } else if DATE64_RE.is_match(string) { + DataType::Date64 + } else if datetime_re.is_match(string) { + DataType::Timestamp(TimeUnit::Nanosecond, None) } else { DataType::Utf8 } @@ -1590,10 +1594,9 @@ mod tests { infer_field_schema("2020-11-08T14:20:01", None), DataType::Date64 ); - // to be inferred as a date64 this needs a custom datetime_re assert_eq!( infer_field_schema("2020-11-08 14:20:01", None), - DataType::Utf8 + DataType::Date64 ); let reg = Regex::new(r"^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d$").ok(); assert_eq!( @@ -1602,6 +1605,14 @@ mod tests { ); assert_eq!(infer_field_schema("-5.13", None), DataType::Float64); assert_eq!(infer_field_schema("0.1300", None), DataType::Float64); + assert_eq!( + infer_field_schema("2021-12-19 13:12:30.921", None), + DataType::Timestamp(TimeUnit::Nanosecond, None) + ); + assert_eq!( + infer_field_schema("2021-12-19T13:12:30.123456789", None), + DataType::Timestamp(TimeUnit::Nanosecond, None) + ); } #[test] From 
1eff6fe104a5a8b38bf79c7443faade23cc8c962 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 29 Nov 2022 04:30:06 +0000 Subject: [PATCH 0343/1411] Update tonic-build to 0.8.3 (#3214) --- arrow-flight/Cargo.toml | 2 +- arrow-flight/src/arrow.flight.protocol.rs | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 76aceb136af4..0e09953e7479 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -47,11 +47,11 @@ flight-sql-experimental = ["prost-types"] [dev-dependencies] [build-dependencies] -tonic-build = { version = "0.8", default-features = false, features = ["transport", "prost"] } # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.47", default-features = false } prost-build = { version = "=0.11.2", default-features = false } +tonic-build = { version = "=0.8.3", default-features = false, features = ["transport", "prost"] } [[example]] name = "flight_sql_server" diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs index 10ab82a87fc1..e6754e806e06 100644 --- a/arrow-flight/src/arrow.flight.protocol.rs +++ b/arrow-flight/src/arrow.flight.protocol.rs @@ -579,10 +579,10 @@ pub mod flight_service_client { pub mod flight_service_server { #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] use tonic::codegen::*; - ///Generated trait containing gRPC methods that should be implemented for use with FlightServiceServer. + /// Generated trait containing gRPC methods that should be implemented for use with FlightServiceServer. #[async_trait] pub trait FlightService: Send + Sync + 'static { - ///Server streaming response type for the Handshake method. + /// Server streaming response type for the Handshake method. type HandshakeStream: futures_core::Stream< Item = Result, > @@ -597,7 +597,7 @@ pub mod flight_service_server { &self, request: tonic::Request>, ) -> Result, tonic::Status>; - ///Server streaming response type for the ListFlights method. + /// Server streaming response type for the ListFlights method. type ListFlightsStream: futures_core::Stream< Item = Result, > @@ -638,7 +638,7 @@ pub mod flight_service_server { &self, request: tonic::Request, ) -> Result, tonic::Status>; - ///Server streaming response type for the DoGet method. + /// Server streaming response type for the DoGet method. type DoGetStream: futures_core::Stream< Item = Result, > @@ -653,7 +653,7 @@ pub mod flight_service_server { &self, request: tonic::Request, ) -> Result, tonic::Status>; - ///Server streaming response type for the DoPut method. + /// Server streaming response type for the DoPut method. type DoPutStream: futures_core::Stream< Item = Result, > @@ -670,7 +670,7 @@ pub mod flight_service_server { &self, request: tonic::Request>, ) -> Result, tonic::Status>; - ///Server streaming response type for the DoExchange method. + /// Server streaming response type for the DoExchange method. type DoExchangeStream: futures_core::Stream< Item = Result, > @@ -686,7 +686,7 @@ pub mod flight_service_server { &self, request: tonic::Request>, ) -> Result, tonic::Status>; - ///Server streaming response type for the DoAction method. + /// Server streaming response type for the DoAction method. 
type DoActionStream: futures_core::Stream< Item = Result, > @@ -703,7 +703,7 @@ pub mod flight_service_server { &self, request: tonic::Request, ) -> Result, tonic::Status>; - ///Server streaming response type for the ListActions method. + /// Server streaming response type for the ListActions method. type ListActionsStream: futures_core::Stream< Item = Result, > From 4926bad4d1ae653d01923ebf16d71055eb76da6d Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Tue, 29 Nov 2022 18:19:17 +0800 Subject: [PATCH 0344/1411] add test cases for extracing week with timezone (#3218) --- arrow/src/compute/kernels/temporal.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index 9ade79969988..cea0a6afcd75 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -934,6 +934,31 @@ mod tests { assert!(matches!(hour(&a), Err(ArrowError::ParseError(_)))) } + #[test] + fn test_temporal_array_timestamp_week_without_timezone() { + // 1970-01-01T00:00:00 -> 1970-01-01T00:00:00 Thursday (week 1) + // 1970-01-01T00:00:00 + 4 days -> 1970-01-05T00:00:00 Monday (week 2) + // 1970-01-01T00:00:00 + 4 days - 1 second -> 1970-01-04T23:59:59 Sunday (week 1) + let a = TimestampSecondArray::from(vec![0, 86400 * 4, 86400 * 4 - 1]); + let b = week(&a).unwrap(); + assert_eq!(1, b.value(0)); + assert_eq!(2, b.value(1)); + assert_eq!(1, b.value(2)); + } + + #[test] + fn test_temporal_array_timestamp_week_with_timezone() { + // 1970-01-01T01:00:00+01:00 -> 1970-01-01T01:00:00+01:00 Thursday (week 1) + // 1970-01-01T01:00:00+01:00 + 4 days -> 1970-01-05T01:00:00+01:00 Monday (week 2) + // 1970-01-01T01:00:00+01:00 + 4 days - 1 second -> 1970-01-05T00:59:59+01:00 Monday (week 2) + let a = TimestampSecondArray::from(vec![0, 86400 * 4, 86400 * 4 - 1]) + .with_timezone("+01:00".to_string()); + let b = week(&a).unwrap(); + assert_eq!(1, b.value(0)); + assert_eq!(2, b.value(1)); + assert_eq!(2, b.value(2)); + } + #[test] fn test_hour_minute_second_dictionary_array() { let a = TimestampSecondArray::from(vec![ From 1d6b5ab71eec290ffd9656e15bb06aed2b820148 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 29 Nov 2022 21:21:24 +1100 Subject: [PATCH 0345/1411] Ensure StructArrays check nullability of fields (#3205) --- arrow-array/src/array/mod.rs | 2 +- arrow-array/src/array/struct_array.rs | 107 ++++++++++++++++++------- arrow-array/src/builder/map_builder.rs | 15 ++-- arrow-cast/src/cast.rs | 3 +- arrow-ipc/src/writer.rs | 2 +- arrow-json/src/writer.rs | 8 +- arrow/src/util/pretty.rs | 4 +- 7 files changed, 93 insertions(+), 48 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 5fc44d8965e4..0f9a2ce59291 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -916,7 +916,7 @@ mod tests { #[test] fn test_null_struct() { let struct_type = - DataType::Struct(vec![Field::new("data", DataType::Int64, false)]); + DataType::Struct(vec![Field::new("data", DataType::Int64, true)]); let array = new_null_array(&struct_type, 9); let a = array.as_any().downcast_ref::().unwrap(); diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 6c6490e3168f..7d88cc5c6deb 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -172,7 +172,7 @@ impl TryFrom> for StructArray { child_null_buffer.bit_slice(child_datum_offset, 
child_datum_len) }); } else if null.is_some() { - // when one of the fields has no nulls, them there is no null in the array + // when one of the fields has no nulls, then there is no null in the array null = None; } } @@ -212,20 +212,30 @@ impl From> for StructArray { fn from(v: Vec<(Field, ArrayRef)>) -> Self { let (field_types, field_values): (Vec<_>, Vec<_>) = v.into_iter().unzip(); - // Check the length of the child arrays - let length = field_values[0].len(); - for i in 1..field_values.len() { - assert_eq!( - length, - field_values[i].len(), - "all child arrays of a StructArray must have the same length" - ); - assert_eq!( - field_types[i].data_type(), - field_values[i].data().data_type(), - "the field data types must match the array data in a StructArray" - ) - } + let length = field_values.get(0).map(|a| a.len()).unwrap_or(0); + field_types.iter().zip(field_values.iter()).for_each( + |(field_type, field_value)| { + // Check the length of the child arrays + assert_eq!( + length, + field_value.len(), + "all child arrays of a StructArray must have the same length" + ); + // Check data types of child arrays + assert_eq!( + field_type.data_type(), + field_value.data().data_type(), + "the field data types must match the array data in a StructArray" + ); + // Check nullability of child arrays + if !field_type.is_nullable() { + assert!( + field_value.null_count() == 0, + "non-nullable field cannot have null values" + ); + } + }, + ); let array_data = ArrayData::builder(DataType::Struct(field_types)) .child_data(field_values.into_iter().map(|a| a.into_data()).collect()) @@ -258,20 +268,30 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray { fn from(pair: (Vec<(Field, ArrayRef)>, Buffer)) -> Self { let (field_types, field_values): (Vec<_>, Vec<_>) = pair.0.into_iter().unzip(); - // Check the length of the child arrays - let length = field_values[0].len(); - for i in 1..field_values.len() { - assert_eq!( - length, - field_values[i].len(), - "all child arrays of a StructArray must have the same length" - ); - assert_eq!( - field_types[i].data_type(), - field_values[i].data().data_type(), - "the field data types must match the array data in a StructArray" - ) - } + let length = field_values.get(0).map(|a| a.len()).unwrap_or(0); + field_types.iter().zip(field_values.iter()).for_each( + |(field_type, field_value)| { + // Check the length of the child arrays + assert_eq!( + length, + field_value.len(), + "all child arrays of a StructArray must have the same length" + ); + // Check data types of child arrays + assert_eq!( + field_type.data_type(), + field_value.data().data_type(), + "the field data types must match the array data in a StructArray" + ); + // Check nullability of child arrays + if !field_type.is_nullable() { + assert!( + field_value.null_count() == 0, + "non-nullable field cannot have null values" + ); + } + }, + ); let array_data = ArrayData::builder(DataType::Struct(field_types)) .null_bit_buffer(Some(pair.1)) @@ -408,7 +428,19 @@ mod tests { #[should_panic( expected = "the field data types must match the array data in a StructArray" )] - fn test_struct_array_from_mismatched_types() { + fn test_struct_array_from_mismatched_types_single() { + drop(StructArray::from(vec![( + Field::new("b", DataType::Int16, false), + Arc::new(BooleanArray::from(vec![false, false, true, true])) + as Arc, + )])); + } + + #[test] + #[should_panic( + expected = "the field data types must match the array data in a StructArray" + )] + fn test_struct_array_from_mismatched_types_multiple() { 
drop(StructArray::from(vec![ ( Field::new("b", DataType::Int16, false), @@ -528,4 +560,19 @@ mod tests { ), ])); } + + #[test] + fn test_struct_array_from_empty() { + let sa = StructArray::from(vec![]); + assert!(sa.is_empty()) + } + + #[test] + #[should_panic(expected = "non-nullable field cannot have null values")] + fn test_struct_array_from_mismatched_nullability() { + drop(StructArray::from(vec![( + Field::new("c", DataType::Int32, false), + Arc::new(Int32Array::from(vec![Some(42), None, Some(19)])) as ArrayRef, + )])); + } } diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 737b4fa72de1..831128c29d05 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -37,8 +37,8 @@ use std::sync::Arc; /// /// let string_builder = builder.keys(); /// string_builder.append_value("joe"); -/// string_builder.append_null(); -/// string_builder.append_null(); +/// string_builder.append_value("n1"); +/// string_builder.append_value("n2"); /// string_builder.append_value("mark"); /// /// let int_builder = builder.values(); @@ -58,7 +58,7 @@ use std::sync::Arc; /// ); /// assert_eq!( /// *arr.keys(), -/// StringArray::from(vec![Some("joe"), None, None, Some("mark")]) +/// StringArray::from(vec![Some("joe"), Some("n1"), Some("n2"), Some("mark")]) /// ); /// ``` #[derive(Debug)] @@ -286,8 +286,8 @@ mod tests { let string_builder = builder.keys(); string_builder.append_value("joe"); - string_builder.append_null(); - string_builder.append_null(); + string_builder.append_value("n1"); + string_builder.append_value("n2"); string_builder.append_value("mark"); let int_builder = builder.values(); @@ -312,9 +312,8 @@ mod tests { let expected_string_data = ArrayData::builder(DataType::Utf8) .len(4) - .null_bit_buffer(Some(Buffer::from(&[9_u8]))) - .add_buffer(Buffer::from_slice_ref([0, 3, 3, 3, 7])) - .add_buffer(Buffer::from_slice_ref(b"joemark")) + .add_buffer(Buffer::from_slice_ref([0, 3, 5, 7, 11])) + .add_buffer(Buffer::from_slice_ref(b"joen1n2mark")) .build() .unwrap(); diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index aa40ad425a5e..23be8839593c 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -6714,8 +6714,7 @@ mod tests { cast_from_null_to_other(&data_type); // Cast null from and to struct - let data_type = - DataType::Struct(vec![Field::new("data", DataType::Int64, false)]); + let data_type = DataType::Struct(vec![Field::new("data", DataType::Int64, true)]); cast_from_null_to_other(&data_type); } diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 0497cbe5e47f..032783deed72 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1799,7 +1799,7 @@ mod tests { Arc::new(strings) as ArrayRef, ), ( - Field::new("c", DataType::Int32, false), + Field::new("c", DataType::Int32, true), Arc::new(ints) as ArrayRef, ), ]); diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 69f626600392..16eec79c64ac 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -1069,7 +1069,7 @@ mod tests { Field::new( "c1", DataType::Struct(vec![ - Field::new("c11", DataType::Int32, false), + Field::new("c11", DataType::Int32, true), Field::new( "c12", DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)]), @@ -1083,7 +1083,7 @@ mod tests { let c1 = StructArray::from(vec![ ( - Field::new("c11", DataType::Int32, false), + Field::new("c11", DataType::Int32, true), Arc::new(Int32Array::from(vec![Some(1), None, Some(5)])) as ArrayRef, ), ( 
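The net effect of the nullability check introduced above (#3205) is easiest to see from the constructor's point of view. A small sketch of the new contract, using the same crate paths as the patch (the `main` wrapper is only for illustration):

```rust
use std::sync::Arc;
use arrow_array::{Array, ArrayRef, Int32Array, StructArray};
use arrow_schema::{DataType, Field};

fn main() {
    // A child declared nullable may contain nulls, exactly as before.
    let ok = StructArray::from(vec![(
        Field::new("c", DataType::Int32, true),
        Arc::new(Int32Array::from(vec![Some(42), None, Some(19)])) as ArrayRef,
    )]);
    assert_eq!(ok.column(0).null_count(), 1);

    // With this change the same construction using a non-nullable field,
    //   Field::new("c", DataType::Int32, false),
    // panics with "non-nullable field cannot have null values" instead of
    // quietly producing a StructArray that violates its own schema.
}
```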
@@ -1230,7 +1230,7 @@ mod tests { DataType::List(Box::new(Field::new( "s", DataType::Struct(vec![ - Field::new("c11", DataType::Int32, false), + Field::new("c11", DataType::Int32, true), Field::new( "c12", DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)]), @@ -1246,7 +1246,7 @@ mod tests { let struct_values = StructArray::from(vec![ ( - Field::new("c11", DataType::Int32, false), + Field::new("c11", DataType::Int32, true), Arc::new(Int32Array::from(vec![Some(1), None, Some(5)])) as ArrayRef, ), ( diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs index b1a07dfee8bc..7e8378d15339 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow/src/util/pretty.rs @@ -713,7 +713,7 @@ mod tests { Field::new( "c1", DataType::Struct(vec![ - Field::new("c11", DataType::Int32, false), + Field::new("c11", DataType::Int32, true), Field::new( "c12", DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)]), @@ -727,7 +727,7 @@ mod tests { let c1 = StructArray::from(vec![ ( - Field::new("c11", DataType::Int32, false), + Field::new("c11", DataType::Int32, true), Arc::new(Int32Array::from(vec![Some(1), None, Some(5)])) as ArrayRef, ), ( From b2bfe9c5fb4f09ef9f7b1fd67ec767f6d42cf8d8 Mon Sep 17 00:00:00 2001 From: Sumit Date: Tue, 29 Nov 2022 11:45:02 +0100 Subject: [PATCH 0346/1411] object_store: add support for using proxy_url for connection testing (#3109) --- object_store/src/aws/client.rs | 23 +++++++++---- object_store/src/aws/mod.rs | 58 +++++++++++++++++++++++++++++--- object_store/src/azure/client.rs | 25 ++++++++++---- object_store/src/azure/mod.rs | 11 +++++- object_store/src/gcp/mod.rs | 55 +++++++++++++++++++++++++++++- 5 files changed, 153 insertions(+), 19 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index a07cdb3c6a82..e51fe415cd14 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -88,6 +88,9 @@ pub(crate) enum Error { #[snafu(display("Got invalid multipart response: {}", source))] InvalidMultipartResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Unable to use proxy url: {}", source))] + ProxyUrl { source: reqwest::Error }, } impl From for crate::Error { @@ -201,6 +204,7 @@ pub struct S3Config { pub credentials: Box, pub retry_config: RetryConfig, pub allow_http: bool, + pub proxy_url: Option, } impl S3Config { @@ -216,13 +220,20 @@ pub(crate) struct S3Client { } impl S3Client { - pub fn new(config: S3Config) -> Self { - let client = reqwest::ClientBuilder::new() - .https_only(!config.allow_http) - .build() - .unwrap(); + pub fn new(config: S3Config) -> Result { + let builder = reqwest::ClientBuilder::new().https_only(!config.allow_http); + let client = match &config.proxy_url { + Some(ref url) => { + let pr = reqwest::Proxy::all(url) + .map_err(|source| Error::ProxyUrl { source })?; + builder.proxy(pr) + } + _ => builder, + } + .build() + .unwrap(); - Self { config, client } + Ok(Self { config, client }) } /// Returns the config diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 4a810658c03f..cf7a5542e0c5 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -36,7 +36,7 @@ use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::stream::BoxStream; use futures::TryStreamExt; -use reqwest::Client; +use reqwest::{Client, Proxy}; use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; use std::ops::Range; @@ -120,6 +120,9 @@ enum Error { #[snafu(display("Error reading token file: {}", source))] ReadTokenFile 
{ source: std::io::Error }, + + #[snafu(display("Unable to use proxy url: {}", source))] + ProxyUrl { source: reqwest::Error }, } impl From for super::Error { @@ -363,6 +366,7 @@ pub struct AmazonS3Builder { virtual_hosted_style_request: bool, metadata_endpoint: Option, profile: Option, + proxy_url: Option, } impl AmazonS3Builder { @@ -537,6 +541,12 @@ impl AmazonS3Builder { self } + /// Set the proxy_url to be used by the underlying client + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.proxy_url = Some(proxy_url.into()); + self + } + /// Set the AWS profile name, see /// /// This makes use of [aws-config] to provide credentials and therefore requires @@ -561,6 +571,14 @@ impl AmazonS3Builder { let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; let region = self.region.context(MissingRegionSnafu)?; + let clientbuilder = match self.proxy_url { + Some(ref url) => { + let pr: Proxy = + Proxy::all(url).map_err(|source| Error::ProxyUrl { source })?; + Client::builder().proxy(pr) + } + None => Client::builder(), + }; let credentials = match (self.access_key_id, self.secret_access_key, self.token) { (Some(key_id), Some(secret_key), token) => { info!("Using Static credential provider"); @@ -590,7 +608,7 @@ impl AmazonS3Builder { let endpoint = format!("https://sts.{}.amazonaws.com", region); // Disallow non-HTTPs requests - let client = Client::builder().https_only(true).build().unwrap(); + let client = clientbuilder.https_only(true).build().unwrap(); Box::new(WebIdentityProvider { cache: Default::default(), @@ -611,7 +629,7 @@ impl AmazonS3Builder { info!("Using Instance credential provider"); // The instance metadata endpoint is access over HTTP - let client = Client::builder().https_only(false).build().unwrap(); + let client = clientbuilder.https_only(false).build().unwrap(); Box::new(InstanceCredentialProvider { cache: Default::default(), @@ -653,9 +671,10 @@ impl AmazonS3Builder { credentials, retry_config: self.retry_config, allow_http: self.allow_http, + proxy_url: self.proxy_url, }; - let client = Arc::new(S3Client::new(config)); + let client = Arc::new(S3Client::new(config).unwrap()); Ok(AmazonS3 { client }) } @@ -898,4 +917,35 @@ mod tests { let err = integration.delete(&location).await.unwrap_err(); assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); } + + #[tokio::test] + async fn s3_test_proxy_url() { + let s3 = AmazonS3Builder::new() + .with_access_key_id("access_key_id") + .with_secret_access_key("secret_access_key") + .with_region("region") + .with_bucket_name("bucket_name") + .with_allow_http(true) + .with_proxy_url("https://example.com") + .build(); + + assert!(s3.is_ok()); + + let s3 = AmazonS3Builder::new() + .with_access_key_id("access_key_id") + .with_secret_access_key("secret_access_key") + .with_region("region") + .with_bucket_name("bucket_name") + .with_allow_http(true) + .with_proxy_url("asdf://example.com") + .build(); + + assert!(match s3 { + Err(crate::Error::Generic { source, .. }) => matches!( + source.downcast_ref(), + Some(crate::aws::Error::ProxyUrl { .. 
}) + ), + _ => false, + }) + } } diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index ece07853a1b6..d8cfdd1c759e 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -27,7 +27,7 @@ use chrono::{DateTime, TimeZone, Utc}; use itertools::Itertools; use reqwest::{ header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH, RANGE}, - Client as ReqwestClient, Method, Response, StatusCode, + Client as ReqwestClient, Method, Proxy, Response, StatusCode, }; use serde::{Deserialize, Deserializer, Serialize}; use snafu::{ResultExt, Snafu}; @@ -82,6 +82,9 @@ pub(crate) enum Error { Authorization { source: crate::azure::credential::Error, }, + + #[snafu(display("Unable to use proxy url: {}", source))] + ProxyUrl { source: reqwest::Error }, } impl From for crate::Error { @@ -124,6 +127,7 @@ pub struct AzureConfig { pub allow_http: bool, pub service: Url, pub is_emulator: bool, + pub proxy_url: Option, } impl AzureConfig { @@ -148,13 +152,20 @@ pub(crate) struct AzureClient { impl AzureClient { /// create a new instance of [AzureClient] - pub fn new(config: AzureConfig) -> Self { - let client = reqwest::ClientBuilder::new() - .https_only(!config.allow_http) - .build() - .unwrap(); + pub fn new(config: AzureConfig) -> Result { + let builder = ReqwestClient::builder(); + + let client = if let Some(url) = config.proxy_url.as_ref() { + let pr = Proxy::all(url).map_err(|source| Error::ProxyUrl { source }); + builder.proxy(pr.unwrap()) + } else { + builder + } + .https_only(!config.allow_http) + .build() + .unwrap(); - Self { config, client } + Ok(Self { config, client }) } /// Returns the config diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index f7ca4cf4e8c4..060b4b2d25dd 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -360,6 +360,7 @@ pub struct MicrosoftAzureBuilder { use_emulator: bool, retry_config: RetryConfig, allow_http: bool, + proxy_url: Option, } impl Debug for MicrosoftAzureBuilder { @@ -500,6 +501,12 @@ impl MicrosoftAzureBuilder { self } + /// Set the proxy_url to be used by the underlying client + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.proxy_url = Some(proxy_url.into()); + self + } + /// Configure a connection to container with given name on Microsoft Azure /// Blob store. 
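From the caller's side, the proxy support threaded through the object_store builders in this patch is a single extra builder call. A hedged sketch for the S3 builder — the credentials, region, bucket and proxy endpoint below are placeholders, not values from the patch; the only new API is `with_proxy_url`, which routes the underlying reqwest client through the given proxy:

```rust
use object_store::aws::AmazonS3Builder;

fn main() {
    // Building does not contact the store, so this succeeds even though the
    // values are placeholders.
    let s3 = AmazonS3Builder::new()
        .with_access_key_id("access_key_id")
        .with_secret_access_key("secret_access_key")
        .with_region("us-east-1")
        .with_bucket_name("my-bucket")
        .with_proxy_url("https://proxy.example.com")
        .build();
    assert!(s3.is_ok());

    // A malformed proxy URL (e.g. "asdf://example.com") is reported as
    // Error::ProxyUrl wrapped in a generic object_store error.
}
```

The `MicrosoftAzureBuilder` and `GoogleCloudStorageBuilder` changes in the same patch follow the same pattern.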
pub fn build(self) -> Result { @@ -516,6 +523,7 @@ impl MicrosoftAzureBuilder { retry_config, allow_http, authority_host, + proxy_url, } = self; let container = container_name.ok_or(Error::MissingContainerName {})?; @@ -567,9 +575,10 @@ impl MicrosoftAzureBuilder { container, credentials: auth, is_emulator, + proxy_url, }; - let client = Arc::new(client::AzureClient::new(config)); + let client = Arc::new(client::AzureClient::new(config)?); Ok(MicrosoftAzure { client }) } diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 0ef4d3564b64..0da92fdbe3d1 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -41,6 +41,7 @@ use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; use reqwest::header::RANGE; +use reqwest::Proxy; use reqwest::{header, Client, Method, Response, StatusCode}; use snafu::{ResultExt, Snafu}; use tokio::io::AsyncWrite; @@ -122,6 +123,9 @@ enum Error { #[snafu(display("GCP credential error: {}", source))] Credential { source: credential::Error }, + + #[snafu(display("Unable to use proxy url: {}", source))] + ProxyUrl { source: reqwest::Error }, } impl From for super::Error { @@ -741,6 +745,7 @@ pub struct GoogleCloudStorageBuilder { service_account_path: Option, client: Option, retry_config: RetryConfig, + proxy_url: Option, } impl GoogleCloudStorageBuilder { @@ -782,6 +787,12 @@ impl GoogleCloudStorageBuilder { self } + /// Set proxy url used for connection + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.proxy_url = Some(proxy_url.into()); + self + } + /// Configure a connection to Google Cloud Storage, returning a /// new [`GoogleCloudStorage`] and consuming `self` pub fn build(self) -> Result { @@ -790,12 +801,24 @@ impl GoogleCloudStorageBuilder { service_account_path, client, retry_config, + proxy_url, } = self; let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?; let service_account_path = service_account_path.ok_or(Error::MissingServiceAccountPath)?; - let client = client.unwrap_or_else(Client::new); + + let client = match (proxy_url, client) { + (_, Some(client)) => client, + (Some(url), None) => { + let pr = Proxy::all(&url).map_err(|source| Error::ProxyUrl { source })?; + Client::builder() + .proxy(pr) + .build() + .map_err(|source| Error::ProxyUrl { source })? + } + (None, None) => Client::new(), + }; let credentials = reader_credentials_file(service_account_path)?; @@ -1015,4 +1038,34 @@ mod test { err ) } + + #[tokio::test] + async fn gcs_test_proxy_url() { + use std::io::Write; + use tempfile::NamedTempFile; + let mut tfile = NamedTempFile::new().unwrap(); + let creds = r#"{"private_key": "private_key", "client_email":"client_email", "disable_oauth":true}"#; + write!(tfile, "{}", creds).unwrap(); + let service_account_path = tfile.path(); + let gcs = GoogleCloudStorageBuilder::new() + .with_service_account_path(service_account_path.to_str().unwrap()) + .with_bucket_name("foo") + .with_proxy_url("https://example.com") + .build(); + assert!(dbg!(gcs).is_ok()); + + let gcs = GoogleCloudStorageBuilder::new() + .with_service_account_path(service_account_path.to_str().unwrap()) + .with_bucket_name("foo") + .with_proxy_url("asdf://example.com") + .build(); + + assert!(match gcs { + Err(ObjectStoreError::Generic { source, .. }) => matches!( + source.downcast_ref(), + Some(crate::gcp::Error::ProxyUrl { .. 
}) + ), + _ => false, + }) + } } From 733d32e90b67bbc62bcff6fc4aa1873d43d4e686 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 29 Nov 2022 10:47:07 +0000 Subject: [PATCH 0347/1411] Support StructArray in Row Format (#3159) (#3212) * Extract Codec and Encoder * Add StructArray support to Row format (#3159) * More docs * Review feedback --- arrow/src/row/fixed.rs | 6 +- arrow/src/row/mod.rs | 515 ++++++++++++++++++++++++++++++----------- 2 files changed, 384 insertions(+), 137 deletions(-) diff --git a/arrow/src/row/fixed.rs b/arrow/src/row/fixed.rs index 0bad033d9bd8..9aef83ce2ade 100644 --- a/arrow/src/row/fixed.rs +++ b/arrow/src/row/fixed.rs @@ -267,7 +267,11 @@ pub fn decode_bool(rows: &mut [&[u8]], options: SortOptions) -> BooleanArray { unsafe { BooleanArray::from(builder.build_unchecked()) } } -fn decode_nulls(rows: &[&[u8]]) -> (usize, Buffer) { +/// Decodes a single byte from each row, interpreting `0x01` as a valid value +/// and all other values as a null +/// +/// Returns the null count and null buffer +pub fn decode_nulls(rows: &[&[u8]]) -> (usize, Buffer) { let mut null_count = 0; let buffer = MutableBuffer::collect_bool(rows.len(), |idx| { let valid = rows[idx][0] == 1; diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 21d8e4df0624..8572bf892fdb 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -131,6 +131,7 @@ use std::sync::Arc; use arrow_array::cast::*; use arrow_array::*; +use arrow_data::ArrayDataBuilder; use crate::compute::SortOptions; use crate::datatypes::*; @@ -307,6 +308,31 @@ mod variable; /// /// Input Row Format /// ``` +/// +/// ## Struct Encoding +/// +/// A null is encoded as a `0_u8`. +/// +/// A valid value is encoded as `1_u8` followed by the row encoding of each child. +/// +/// This encoding effectively flattens the schema in a depth-first fashion. 
+/// +/// For example +/// +/// ```text +/// ┌───────┬────────────────────────┬───────┐ +/// │ Int32 │ Struct[Int32, Float32] │ Int32 │ +/// └───────┴────────────────────────┴───────┘ +/// ``` +/// +/// Is encoded as +/// +/// ```text +/// ┌───────┬───────────────┬───────┬─────────┬───────┐ +/// │ Int32 │ Null Sentinel │ Int32 │ Float32 │ Int32 │ +/// └───────┴───────────────┴───────┴─────────┴───────┘ +/// ``` +/// /// # Ordering /// /// ## Float Ordering @@ -332,8 +358,103 @@ mod variable; #[derive(Debug)] pub struct RowConverter { fields: Arc<[SortField]>, - /// interning state for column `i`, if column`i` is a dictionary - interners: Vec>>, + /// State for codecs + codecs: Vec, +} + +#[derive(Debug)] +enum Codec { + /// No additional codec state is necessary + Stateless, + /// The interner used to encode dictionary values + Dictionary(OrderPreservingInterner), + /// A row converter for the child fields + /// and the encoding of a row containing only nulls + Struct(RowConverter, OwnedRow), +} + +impl Codec { + fn new(sort_field: &SortField) -> Result { + match &sort_field.data_type { + DataType::Dictionary(_, _) => Ok(Self::Dictionary(Default::default())), + d if !d.is_nested() => Ok(Self::Stateless), + DataType::Struct(f) => { + let sort_fields = f + .iter() + .map(|x| { + SortField::new_with_options( + x.data_type().clone(), + sort_field.options, + ) + }) + .collect(); + + let mut converter = RowConverter::new(sort_fields)?; + let nulls: Vec<_> = + f.iter().map(|x| new_null_array(x.data_type(), 1)).collect(); + + let nulls = converter.convert_columns(&nulls)?; + let owned = OwnedRow { + data: nulls.buffer, + config: nulls.config, + }; + + Ok(Self::Struct(converter, owned)) + } + _ => Err(ArrowError::NotYetImplemented(format!( + "not yet implemented: {:?}", + sort_field.data_type + ))), + } + } + + fn encoder(&mut self, array: &dyn Array) -> Result> { + match self { + Codec::Stateless => Ok(Encoder::Stateless), + Codec::Dictionary(interner) => { + let values = downcast_dictionary_array! 
{ + array => array.values(), + _ => unreachable!() + }; + + let mapping = compute_dictionary_mapping(interner, values) + .into_iter() + .map(|maybe_interned| { + maybe_interned.map(|interned| interner.normalized_key(interned)) + }) + .collect(); + + Ok(Encoder::Dictionary(mapping)) + } + Codec::Struct(converter, null) => { + let v = as_struct_array(array); + let rows = converter.convert_columns(v.columns())?; + Ok(Encoder::Struct(rows, null.row())) + } + } + } + + fn size(&self) -> usize { + match self { + Codec::Stateless => 0, + Codec::Dictionary(interner) => interner.size(), + Codec::Struct(converter, nulls) => converter.size() + nulls.data.len(), + } + } +} + +#[derive(Debug)] +enum Encoder<'a> { + /// No additional encoder state is necessary + Stateless, + /// The mapping from dictionary keys to normalized keys + Dictionary(Vec>), + /// The row encoding of the child array and the encoding of a null row + /// + /// It is necessary to encode to a temporary [`Rows`] to avoid serializing + /// values that are masked by a null in the parent StructArray, otherwise + /// this would establish an ordering between semantically null values + Struct(Rows, Row<'a>), } /// Configure the data type and sort order for a given column @@ -370,21 +491,31 @@ impl RowConverter { pub fn new(fields: Vec) -> Result { if !Self::supports_fields(&fields) { return Err(ArrowError::NotYetImplemented(format!( - "not yet implemented: {:?}", + "Row format support not yet implemented for: {:?}", fields ))); } - let interners = (0..fields.len()).map(|_| None).collect(); + let codecs = fields.iter().map(Codec::new).collect::>()?; Ok(Self { fields: fields.into(), - interners, + codecs, }) } /// Check if the given fields are supported by the row format. pub fn supports_fields(fields: &[SortField]) -> bool { - fields.iter().all(|x| !DataType::is_nested(&x.data_type)) + fields.iter().all(|x| Self::supports_datatype(&x.data_type)) + } + + fn supports_datatype(d: &DataType) -> bool { + match d { + _ if !d.is_nested() => true, + DataType::Struct(f) => { + f.iter().all(|x| Self::supports_datatype(x.data_type())) + } + _ => false, + } } /// Convert [`ArrayRef`] columns into [`Rows`] @@ -403,11 +534,11 @@ impl RowConverter { ))); } - let dictionaries = columns + let encoders = columns .iter() - .zip(&mut self.interners) + .zip(&mut self.codecs) .zip(self.fields.iter()) - .map(|((column, interner), field)| { + .map(|((column, codec), field)| { if !column.data_type().equals_datatype(&field.data_type) { return Err(ArrowError::InvalidArgumentError(format!( "RowConverter column schema mismatch, expected {} got {}", @@ -415,22 +546,7 @@ impl RowConverter { column.data_type() ))); } - - let values = downcast_dictionary_array! 
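With the `Codec`/`Encoder` machinery above in place, struct columns can pass through the row format like any other supported type. A condensed version of the round trip exercised by `test_struct` later in this patch (the `main` wrapper and field names are illustrative):

```rust
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, Int32Array, StringArray, StructArray};
use arrow::datatypes::{DataType, Field};
use arrow::row::{RowConverter, SortField};

fn main() {
    let ints = Arc::new(Int32Array::from(vec![1, 1, 2, 2])) as ArrayRef;
    let strings = Arc::new(StringArray::from(vec!["a", "b", "c", "d"])) as ArrayRef;
    let structs = Arc::new(StructArray::from(vec![
        (Field::new("int", DataType::Int32, false), ints),
        (Field::new("s", DataType::Utf8, false), strings),
    ])) as ArrayRef;

    // Previously this constructor rejected nested types with a
    // NotYetImplemented error; struct fields are now accepted.
    let mut converter =
        RowConverter::new(vec![SortField::new(structs.data_type().clone())]).unwrap();
    let rows = converter.convert_columns(&[Arc::clone(&structs)]).unwrap();

    // Rows compare in the same order as the (int, s) tuples they encode.
    assert!(rows.row(0) < rows.row(1));
    assert!(rows.row(1) < rows.row(2));

    // The encoding round-trips back to an equivalent StructArray.
    let back = converter.convert_rows(&rows).unwrap();
    assert_eq!(&back[0], &structs);
}
```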
{ - column => column.values(), - _ => return Ok(None) - }; - - let interner = interner.get_or_insert_with(Default::default); - - let mapping: Vec<_> = compute_dictionary_mapping(interner, values) - .into_iter() - .map(|maybe_interned| { - maybe_interned.map(|interned| interner.normalized_key(interned)) - }) - .collect(); - - Ok(Some(mapping)) + codec.encoder(column.as_ref()) }) .collect::>>()?; @@ -439,13 +555,13 @@ impl RowConverter { // Don't need to validate UTF-8 as came from arrow array validate_utf8: false, }; - let mut rows = new_empty_rows(columns, &dictionaries, config); + let mut rows = new_empty_rows(columns, &encoders, config); - for ((column, field), dictionary) in - columns.iter().zip(self.fields.iter()).zip(dictionaries) + for ((column, field), encoder) in + columns.iter().zip(self.fields.iter()).zip(encoders) { // We encode a column at a time to minimise dispatch overheads - encode_column(&mut rows, column, field.options, dictionary.as_deref()) + encode_column(&mut rows, column, field.options, &encoder) } if cfg!(debug_assertions) { @@ -480,17 +596,26 @@ impl RowConverter { }) .collect(); + // SAFETY + // We have validated that the rows came from this [`RowConverter`] + // and therefore must be valid + unsafe { self.convert_raw(&mut rows, validate_utf8) } + } + + /// Convert raw bytes into [`ArrayRef`] + /// + /// # Safety + /// + /// `rows` must contain valid data for this [`RowConverter`] + unsafe fn convert_raw( + &self, + rows: &mut [&[u8]], + validate_utf8: bool, + ) -> Result> { self.fields .iter() - .zip(&self.interners) - .map(|(field, interner)| { - // SAFETY - // We have validated that the rows came from this [`RowConverter`] - // and therefore must be valid - unsafe { - decode_column(field, &mut rows, interner.as_deref(), validate_utf8) - } - }) + .zip(&self.codecs) + .map(|(field, codec)| decode_column(field, rows, codec, validate_utf8)) .collect() } @@ -505,13 +630,8 @@ impl RowConverter { pub fn size(&self) -> usize { std::mem::size_of::() + self.fields.iter().map(|x| x.size()).sum::() - + self.interners.capacity() - * std::mem::size_of::>>() - + self - .interners - .iter() - .filter_map(|x| x.as_ref().map(|x| x.size())) - .sum::() + + self.codecs.capacity() * std::mem::size_of::() + + self.codecs.iter().map(Codec::size).sum::() } } @@ -668,7 +788,7 @@ impl<'a> Row<'a> { /// Create owned version of the row to detach it from the shared [`Rows`]. pub fn owned(&self) -> OwnedRow { OwnedRow { - data: self.data.to_vec(), + data: self.data.into(), config: self.config.clone(), } } @@ -718,7 +838,7 @@ impl<'a> AsRef<[u8]> for Row<'a> { /// This contains the data for the one specific row (not the entire buffer of all rows). #[derive(Debug, Clone)] pub struct OwnedRow { - data: Vec, + data: Box<[u8]>, config: RowConfig, } @@ -783,54 +903,64 @@ fn null_sentinel(options: SortOptions) -> u8 { } /// Computes the length of each encoded [`Rows`] and returns an empty [`Rows`] -fn new_empty_rows( - cols: &[ArrayRef], - dictionaries: &[Option>>], - config: RowConfig, -) -> Rows { +fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig) -> Rows { use fixed::FixedLengthEncoding; let num_rows = cols.first().map(|x| x.len()).unwrap_or(0); let mut lengths = vec![0; num_rows]; - for (array, dict) in cols.iter().zip(dictionaries) { - downcast_primitive_array! 
{ - array => lengths.iter_mut().for_each(|x| *x += fixed::encoded_len(array)), - DataType::Null => {}, - DataType::Boolean => lengths.iter_mut().for_each(|x| *x += bool::ENCODED_LEN), - DataType::Binary => as_generic_binary_array::(array) - .iter() - .zip(lengths.iter_mut()) - .for_each(|(slice, length)| *length += variable::encoded_len(slice)), - DataType::LargeBinary => as_generic_binary_array::(array) - .iter() - .zip(lengths.iter_mut()) - .for_each(|(slice, length)| *length += variable::encoded_len(slice)), - DataType::Utf8 => as_string_array(array) - .iter() - .zip(lengths.iter_mut()) - .for_each(|(slice, length)| { - *length += variable::encoded_len(slice.map(|x| x.as_bytes())) - }), - DataType::LargeUtf8 => as_largestring_array(array) - .iter() - .zip(lengths.iter_mut()) - .for_each(|(slice, length)| { - *length += variable::encoded_len(slice.map(|x| x.as_bytes())) - }), - DataType::Dictionary(_, _) => downcast_dictionary_array! { - array => { - let dict = dict.as_ref().unwrap(); - for (v, length) in array.keys().iter().zip(lengths.iter_mut()) { - match v.and_then(|v| dict[v as usize]) { - Some(k) => *length += k.len() + 1, - None => *length += 1, + for (array, encoder) in cols.iter().zip(encoders) { + match encoder { + Encoder::Stateless => { + downcast_primitive_array! { + array => lengths.iter_mut().for_each(|x| *x += fixed::encoded_len(array)), + DataType::Null => {}, + DataType::Boolean => lengths.iter_mut().for_each(|x| *x += bool::ENCODED_LEN), + DataType::Binary => as_generic_binary_array::(array) + .iter() + .zip(lengths.iter_mut()) + .for_each(|(slice, length)| *length += variable::encoded_len(slice)), + DataType::LargeBinary => as_generic_binary_array::(array) + .iter() + .zip(lengths.iter_mut()) + .for_each(|(slice, length)| *length += variable::encoded_len(slice)), + DataType::Utf8 => as_string_array(array) + .iter() + .zip(lengths.iter_mut()) + .for_each(|(slice, length)| { + *length += variable::encoded_len(slice.map(|x| x.as_bytes())) + }), + DataType::LargeUtf8 => as_largestring_array(array) + .iter() + .zip(lengths.iter_mut()) + .for_each(|(slice, length)| { + *length += variable::encoded_len(slice.map(|x| x.as_bytes())) + }), + _ => unreachable!(), + } + } + Encoder::Dictionary(dict) => { + downcast_dictionary_array! { + array => { + for (v, length) in array.keys().iter().zip(lengths.iter_mut()) { + match v.and_then(|v| dict[v as usize]) { + Some(k) => *length += k.len() + 1, + None => *length += 1, + } } } + _ => unreachable!(), } - _ => unreachable!(), } - _ => unreachable!(), + Encoder::Struct(rows, null) => { + let array = as_struct_array(array); + lengths.iter_mut().enumerate().for_each(|(idx, length)| { + match array.is_valid(idx) { + true => *length += 1 + rows.row(idx).as_ref().len(), + false => *length += 1 + null.data.len(), + } + }); + } } } @@ -872,35 +1002,59 @@ fn encode_column( out: &mut Rows, column: &ArrayRef, opts: SortOptions, - dictionary: Option<&[Option<&[u8]>]>, + encoder: &Encoder<'_>, ) { - downcast_primitive_array! { - column => fixed::encode(out, column, opts), - DataType::Null => {} - DataType::Boolean => fixed::encode(out, as_boolean_array(column), opts), - DataType::Binary => { - variable::encode(out, as_generic_binary_array::(column).iter(), opts) + match encoder { + Encoder::Stateless => { + downcast_primitive_array! 
{ + column => fixed::encode(out, column, opts), + DataType::Null => {} + DataType::Boolean => fixed::encode(out, as_boolean_array(column), opts), + DataType::Binary => { + variable::encode(out, as_generic_binary_array::(column).iter(), opts) + } + DataType::LargeBinary => { + variable::encode(out, as_generic_binary_array::(column).iter(), opts) + } + DataType::Utf8 => variable::encode( + out, + as_string_array(column).iter().map(|x| x.map(|x| x.as_bytes())), + opts, + ), + DataType::LargeUtf8 => variable::encode( + out, + as_largestring_array(column) + .iter() + .map(|x| x.map(|x| x.as_bytes())), + opts, + ), + _ => unreachable!(), + } } - DataType::LargeBinary => { - variable::encode(out, as_generic_binary_array::(column).iter(), opts) + Encoder::Dictionary(dict) => { + downcast_dictionary_array! { + column => encode_dictionary(out, column, dict, opts), + _ => unreachable!() + } } - DataType::Utf8 => variable::encode( - out, - as_string_array(column).iter().map(|x| x.map(|x| x.as_bytes())), - opts, - ), - DataType::LargeUtf8 => variable::encode( - out, - as_largestring_array(column) - .iter() - .map(|x| x.map(|x| x.as_bytes())), - opts, - ), - DataType::Dictionary(_, _) => downcast_dictionary_array! { - column => encode_dictionary(out, column, dictionary.unwrap(), opts), - _ => unreachable!() + Encoder::Struct(rows, null) => { + let array = as_struct_array(column.as_ref()); + let null_sentinel = null_sentinel(opts); + out.offsets + .iter_mut() + .skip(1) + .enumerate() + .for_each(|(idx, offset)| { + let (row, sentinel) = match array.is_valid(idx) { + true => (rows.row(idx), 0x01), + false => (*null, null_sentinel), + }; + let end_offset = *offset + 1 + row.as_ref().len(); + out.buffer[*offset] = sentinel; + out.buffer[*offset + 1..end_offset].copy_from_slice(row.as_ref()); + *offset = end_offset; + }) } - _ => unreachable!(), } } @@ -912,12 +1066,7 @@ macro_rules! decode_primitive_helper { macro_rules! decode_dictionary_helper { ($t:ty, $interner:ident, $v:ident, $options:ident, $rows:ident) => { - Arc::new(decode_dictionary::<$t>( - $interner.unwrap(), - $v.as_ref(), - $options, - $rows, - )?) + Arc::new(decode_dictionary::<$t>($interner, $v, $options, $rows)?) }; } @@ -929,28 +1078,73 @@ macro_rules! decode_dictionary_helper { unsafe fn decode_column( field: &SortField, rows: &mut [&[u8]], - interner: Option<&OrderPreservingInterner>, + codec: &Codec, validate_utf8: bool, ) -> Result { let options = field.options; - let data_type = field.data_type.clone(); - let array: ArrayRef = downcast_primitive! { - data_type => (decode_primitive_helper, rows, data_type, options), - DataType::Null => Arc::new(NullArray::new(rows.len())), - DataType::Boolean => Arc::new(decode_bool(rows, options)), - DataType::Binary => Arc::new(decode_binary::(rows, options)), - DataType::LargeBinary => Arc::new(decode_binary::(rows, options)), - DataType::Utf8 => Arc::new(decode_string::(rows, options, validate_utf8)), - DataType::LargeUtf8 => Arc::new(decode_string::(rows, options, validate_utf8)), - DataType::Dictionary(k, v) => downcast_integer! { - k.as_ref() => (decode_dictionary_helper, interner, v, options, rows), - _ => unreachable!() - }, - _ => { - return Err(ArrowError::NotYetImplemented(format!( - "converting {} row is not supported", - field.data_type - ))) + + let array: ArrayRef = match codec { + Codec::Stateless => { + let data_type = field.data_type.clone(); + downcast_primitive! 
{ + data_type => (decode_primitive_helper, rows, data_type, options), + DataType::Null => Arc::new(NullArray::new(rows.len())), + DataType::Boolean => Arc::new(decode_bool(rows, options)), + DataType::Binary => Arc::new(decode_binary::(rows, options)), + DataType::LargeBinary => Arc::new(decode_binary::(rows, options)), + DataType::Utf8 => Arc::new(decode_string::(rows, options, validate_utf8)), + DataType::LargeUtf8 => Arc::new(decode_string::(rows, options, validate_utf8)), + _ => unreachable!() + } + } + Codec::Dictionary(interner) => { + let (k, v) = match &field.data_type { + DataType::Dictionary(k, v) => (k.as_ref(), v.as_ref()), + _ => unreachable!(), + }; + downcast_integer! { + k => (decode_dictionary_helper, interner, v, options, rows), + _ => unreachable!() + } + } + Codec::Struct(converter, _) => { + let child_fields = match &field.data_type { + DataType::Struct(f) => f, + _ => unreachable!(), + }; + + let (null_count, nulls) = fixed::decode_nulls(rows); + rows.iter_mut().for_each(|row| *row = &row[1..]); + let children = converter.convert_raw(rows, validate_utf8)?; + + let child_data = child_fields + .iter() + .zip(&children) + .map(|(f, c)| { + let data = c.data().clone(); + match f.is_nullable() { + true => data, + false => { + assert_eq!(data.null_count(), null_count); + // Need to strip out null buffer if any as this is created + // as an artifact of the row encoding process that encodes + // nulls from the parent struct array in the children + data.into_builder() + .null_count(0) + .null_bit_buffer(None) + .build_unchecked() + } + } + }) + .collect(); + + let builder = ArrayDataBuilder::new(field.data_type.clone()) + .len(rows.len()) + .null_count(null_count) + .null_bit_buffer(Some(nulls)) + .child_data(child_data); + + Arc::new(StructArray::from(builder.build_unchecked())) } }; Ok(array) @@ -965,6 +1159,7 @@ mod tests { use rand::{thread_rng, Rng}; use arrow_array::NullArray; + use arrow_buffer::Buffer; use crate::array::{ BinaryArray, BooleanArray, DictionaryArray, Float32Array, GenericStringArray, @@ -1329,6 +1524,54 @@ mod tests { assert_eq!(&cols[0], &a); } + #[test] + fn test_struct() { + // Test basic + let a = Arc::new(Int32Array::from(vec![1, 1, 2, 2])) as ArrayRef; + let a_f = Field::new("int", DataType::Int32, false); + let u = Arc::new(StringArray::from(vec!["a", "b", "c", "d"])) as ArrayRef; + let u_f = Field::new("s", DataType::Utf8, false); + let s1 = Arc::new(StructArray::from(vec![(a_f, a), (u_f, u)])) as ArrayRef; + + let sort_fields = vec![SortField::new(s1.data_type().clone())]; + let mut converter = RowConverter::new(sort_fields).unwrap(); + let r1 = converter.convert_columns(&[Arc::clone(&s1)]).unwrap(); + + for (a, b) in r1.iter().zip(r1.iter().skip(1)) { + assert!(a < b); + } + + let back = converter.convert_rows(&r1).unwrap(); + assert_eq!(back.len(), 1); + assert_eq!(&back[0], &s1); + + // Test struct nullability + let data = s1 + .data() + .clone() + .into_builder() + .null_bit_buffer(Some(Buffer::from_slice_ref([0b00001010]))) + .null_count(2) + .build() + .unwrap(); + + let s2 = Arc::new(StructArray::from(data)) as ArrayRef; + let r2 = converter.convert_columns(&[Arc::clone(&s2)]).unwrap(); + assert_eq!(r2.row(0), r2.row(2)); // Nulls equal + assert!(r2.row(0) < r2.row(1)); // Nulls first + assert_ne!(r1.row(0), r2.row(0)); // Value does not equal null + assert_eq!(r1.row(1), r2.row(1)); // Values equal + + let back = converter.convert_rows(&r2).unwrap(); + assert_eq!(back.len(), 1); + assert_eq!(&back[0], &s2); + let back_s = 
as_struct_array(&back[0]); + for c in back_s.columns() { + // Children should not contain nulls + assert_eq!(c.null_count(), 0); + } + } + #[test] fn test_primitive_dictionary() { let mut builder = PrimitiveDictionaryBuilder::::new(); From ab3f384483c4fef645f9d1653f1adda3470594b2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 29 Nov 2022 11:31:13 +0000 Subject: [PATCH 0348/1411] Support `FixedSizeBinary` in Row format (#3182) * Add support for FixedSizeBinary in Row format * Add docs --- arrow/src/row/fixed.rs | 59 +++++++++++++++++++++++++++++++++++++++++- arrow/src/row/mod.rs | 47 +++++++++++++++++++++++++++++++-- 2 files changed, 103 insertions(+), 3 deletions(-) diff --git a/arrow/src/row/fixed.rs b/arrow/src/row/fixed.rs index 9aef83ce2ade..03c53c994794 100644 --- a/arrow/src/row/fixed.rs +++ b/arrow/src/row/fixed.rs @@ -20,7 +20,7 @@ use crate::compute::SortOptions; use crate::datatypes::ArrowPrimitiveType; use crate::row::{null_sentinel, Rows}; use arrow_array::builder::BufferBuilder; -use arrow_array::BooleanArray; +use arrow_array::{BooleanArray, FixedSizeBinaryArray}; use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; @@ -201,6 +201,29 @@ pub fn encode>>( } } +pub fn encode_fixed_size_binary( + out: &mut Rows, + array: &FixedSizeBinaryArray, + opts: SortOptions, +) { + let len = array.value_length() as usize; + for (offset, maybe_val) in out.offsets.iter_mut().skip(1).zip(array.iter()) { + let end_offset = *offset + len + 1; + if let Some(val) = maybe_val { + let to_write = &mut out.buffer[*offset..end_offset]; + to_write[0] = 1; + to_write[1..].copy_from_slice(&val[..len]); + if opts.descending { + // Flip bits to reverse order + to_write[1..1 + len].iter_mut().for_each(|v| *v = !*v) + } + } else { + out.buffer[*offset] = null_sentinel(opts); + } + *offset = end_offset; + } +} + /// Splits `len` bytes from `src` #[inline] fn split_off<'a>(src: &mut &'a [u8], len: usize) -> &'a [u8] { @@ -330,3 +353,37 @@ where // Validated data type above unsafe { decode_fixed::(rows, data_type, options).into() } } + +/// Decodes a `FixedLengthBinary` from rows +pub fn decode_fixed_size_binary( + rows: &mut [&[u8]], + size: i32, + options: SortOptions, +) -> FixedSizeBinaryArray { + let len = rows.len(); + + let mut values = MutableBuffer::new(size as usize * rows.len()); + let (null_count, nulls) = decode_nulls(rows); + + let encoded_len = size as usize + 1; + + for row in rows { + let i = split_off(row, encoded_len); + values.extend_from_slice(&i[1..]); + } + + if options.descending { + for v in values.as_slice_mut() { + *v = !*v; + } + } + + let builder = ArrayDataBuilder::new(DataType::FixedSizeBinary(size)) + .len(len) + .null_count(null_count) + .add_buffer(values.into()) + .null_bit_buffer(Some(nulls)); + + // SAFETY: Buffers correct length + unsafe { builder.build_unchecked().into() } +} diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 8572bf892fdb..cff49740fb15 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -139,7 +139,7 @@ use crate::error::{ArrowError, Result}; use crate::row::dictionary::{ compute_dictionary_mapping, decode_dictionary, encode_dictionary, }; -use crate::row::fixed::{decode_bool, decode_primitive}; +use crate::row::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive}; use crate::row::interner::OrderPreservingInterner; use crate::row::variable::{decode_binary, 
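The `FixedSizeBinary` support being added here (#3182) slots into the same `RowConverter` entry point. A minimal usage sketch — the two-byte values and the `main` wrapper are illustrative, not taken from the patch:

```rust
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, FixedSizeBinaryArray};
use arrow::row::{RowConverter, SortField};

fn main() {
    // try_from_iter infers the fixed width (2 bytes) from the first value.
    let array = FixedSizeBinaryArray::try_from_iter(
        vec![[0x03u8, 0x04], [0x00, 0x00], [0x01, 0x02]].into_iter(),
    )
    .unwrap();

    let mut converter =
        RowConverter::new(vec![SortField::new(array.data_type().clone())]).unwrap();
    let rows = converter
        .convert_columns(&[Arc::new(array) as ArrayRef])
        .unwrap();

    // The row encoding preserves the unsigned lexicographic order of the values.
    assert!(rows.row(1) < rows.row(2));
    assert!(rows.row(2) < rows.row(0));
}
```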
decode_string}; use crate::{downcast_dictionary_array, downcast_primitive_array}; @@ -213,6 +213,16 @@ mod variable; /// /// They are then encoded in the same manner as a signed integer. /// +/// ## Fixed Length Bytes Encoding +/// +/// Fixed length bytes are encoded in the same fashion as primitive types above. +/// +/// For a fixed length array of length `n`: +/// +/// A null is encoded as `0_u8` null sentinel followed by `n` `0_u8` bytes +/// +/// A valid value is encoded as `1_u8` followed by the value bytes +/// /// ## Variable Length Bytes (including Strings) Encoding /// /// A null is encoded as a `0_u8`. @@ -936,6 +946,10 @@ fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig) -> .for_each(|(slice, length)| { *length += variable::encoded_len(slice.map(|x| x.as_bytes())) }), + DataType::FixedSizeBinary(len) => { + let len = len.to_usize().unwrap(); + lengths.iter_mut().for_each(|x| *x += 1 + len) + } _ => unreachable!(), } } @@ -1028,6 +1042,10 @@ fn encode_column( .map(|x| x.map(|x| x.as_bytes())), opts, ), + DataType::FixedSizeBinary(_) => { + let array = column.as_any().downcast_ref().unwrap(); + fixed::encode_fixed_size_binary(out, array, opts) + } _ => unreachable!(), } } @@ -1092,6 +1110,7 @@ unsafe fn decode_column( DataType::Boolean => Arc::new(decode_bool(rows, options)), DataType::Binary => Arc::new(decode_binary::(rows, options)), DataType::LargeBinary => Arc::new(decode_binary::(rows, options)), + DataType::FixedSizeBinary(size) => Arc::new(decode_fixed_size_binary(rows, size, options)), DataType::Utf8 => Arc::new(decode_string::(rows, options, validate_utf8)), DataType::LargeUtf8 => Arc::new(decode_string::(rows, options, validate_utf8)), _ => unreachable!() @@ -1154,6 +1173,7 @@ unsafe fn decode_column( mod tests { use std::sync::Arc; + use arrow_array::builder::FixedSizeBinaryBuilder; use rand::distributions::uniform::SampleUniform; use rand::distributions::{Distribution, Standard}; use rand::{thread_rng, Rng}; @@ -1713,9 +1733,31 @@ mod tests { DictionaryArray::from(data) } + fn generate_fixed_size_binary( + len: usize, + valid_percent: f64, + ) -> FixedSizeBinaryArray { + let mut rng = thread_rng(); + let width = rng.gen_range(0..20); + let mut builder = FixedSizeBinaryBuilder::new(width); + + let mut b = vec![0; width as usize]; + for _ in 0..len { + match rng.gen_bool(valid_percent) { + true => { + b.iter_mut().for_each(|x| *x = rng.gen()); + builder.append_value(&b).unwrap(); + } + false => builder.append_null(), + } + } + + builder.finish() + } + fn generate_column(len: usize) -> ArrayRef { let mut rng = thread_rng(); - match rng.gen_range(0..9) { + match rng.gen_range(0..10) { 0 => Arc::new(generate_primitive_array::(len, 0.8)), 1 => Arc::new(generate_primitive_array::(len, 0.8)), 2 => Arc::new(generate_primitive_array::(len, 0.8)), @@ -1738,6 +1780,7 @@ mod tests { len, 0.8, )), + 9 => Arc::new(generate_fixed_size_binary(len, 0.8)), _ => unreachable!(), } } From bdfe0fdeb127c99ef918af779a3b8404e91e41b1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Nov 2022 13:56:30 +0000 Subject: [PATCH 0349/1411] Update prost-build requirement from =0.11.2 to =0.11.3 (#3225) Updates the requirements on [prost-build](https://github.com/tokio-rs/prost) to permit the latest version. 
- [Release notes](https://github.com/tokio-rs/prost/releases) - [Commits](https://github.com/tokio-rs/prost/compare/v0.11.2...v0.11.3) --- updated-dependencies: - dependency-name: prost-build dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 0e09953e7479..bf62a0dcd065 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -50,7 +50,7 @@ flight-sql-experimental = ["prost-types"] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.47", default-features = false } -prost-build = { version = "=0.11.2", default-features = false } +prost-build = { version = "=0.11.3", default-features = false } tonic-build = { version = "=0.8.3", default-features = false, features = ["transport", "prost"] } [[example]] From 1a8e6ed957e483ec27b88fce54a48b8176be3179 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 29 Nov 2022 12:48:34 -0800 Subject: [PATCH 0350/1411] Fix CI build by upgrading tonic-build to 0.8.4 (#3231) --- arrow-flight/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index bf62a0dcd065..77881a70f708 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -51,7 +51,7 @@ flight-sql-experimental = ["prost-types"] # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.47", default-features = false } prost-build = { version = "=0.11.3", default-features = false } -tonic-build = { version = "=0.8.3", default-features = false, features = ["transport", "prost"] } +tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } [[example]] name = "flight_sql_server" From afa83166bfe517b4f932b9ed0e527396b2023115 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Wed, 30 Nov 2022 04:04:48 -0500 Subject: [PATCH 0351/1411] User RegexSet for matching DataType (#3217) * User RegexSet for matching DataType * fix PR comments Co-authored-by: askoa --- arrow-csv/src/reader.rs | 54 +++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/arrow-csv/src/reader.rs b/arrow-csv/src/reader.rs index f8f9f50a3e2f..c69e1753b71d 100644 --- a/arrow-csv/src/reader.rs +++ b/arrow-csv/src/reader.rs @@ -42,7 +42,7 @@ use core::cmp::min; use lazy_static::lazy_static; -use regex::{Regex, RegexBuilder}; +use regex::{Regex, RegexSet}; use std::collections::HashSet; use std::fmt; use std::fs::File; @@ -61,45 +61,46 @@ use csv::{ByteRecord, StringRecord}; use std::ops::Neg; lazy_static! 
{ + static ref REGEX_SET: RegexSet = RegexSet::new([ + r"(?i)^(true)$|^(false)$(?-i)", //BOOLEAN + r"^-?((\d*\.\d+|\d+\.\d*)([eE]-?\d+)?|\d+([eE]-?\d+))$", //DECIMAL + r"^-?(\d+)$", //INTEGER + r"^\d{4}-\d\d-\d\d$", //DATE32 + r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d$", //DATE64 + ]).unwrap(); + //The order should match with REGEX_SET + static ref MATCH_DATA_TYPE: Vec = vec![ + DataType::Boolean, + DataType::Float64, + DataType::Int64, + DataType::Date32, + DataType::Date64, + ]; static ref PARSE_DECIMAL_RE: Regex = Regex::new(r"^-?(\d+\.?\d*|\d*\.?\d+)$").unwrap(); - static ref DECIMAL_RE: Regex = - Regex::new(r"^-?((\d*\.\d+|\d+\.\d*)([eE]-?\d+)?|\d+([eE]-?\d+))$").unwrap(); - static ref INTEGER_RE: Regex = Regex::new(r"^-?(\d+)$").unwrap(); - static ref BOOLEAN_RE: Regex = RegexBuilder::new(r"^(true)$|^(false)$") - .case_insensitive(true) - .build() - .unwrap(); - static ref DATE32_RE: Regex = Regex::new(r"^\d{4}-\d\d-\d\d$").unwrap(); - static ref DATE64_RE: Regex = - Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d$").unwrap(); static ref DATETIME_RE: Regex = Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,9}$").unwrap(); } /// Infer the data type of a record fn infer_field_schema(string: &str, datetime_re: Option) -> DataType { - let datetime_re = datetime_re.unwrap_or_else(|| DATETIME_RE.clone()); // when quoting is enabled in the reader, these quotes aren't escaped, we default to // Utf8 for them if string.starts_with('"') { return DataType::Utf8; } + let matches = REGEX_SET.matches(string).into_iter().next(); // match regex in a particular order - if BOOLEAN_RE.is_match(string) { - DataType::Boolean - } else if DECIMAL_RE.is_match(string) { - DataType::Float64 - } else if INTEGER_RE.is_match(string) { - DataType::Int64 - } else if DATE32_RE.is_match(string) { - DataType::Date32 - } else if DATE64_RE.is_match(string) { - DataType::Date64 - } else if datetime_re.is_match(string) { - DataType::Timestamp(TimeUnit::Nanosecond, None) - } else { - DataType::Utf8 + match matches { + Some(ix) => MATCH_DATA_TYPE[ix].clone(), + None => { + let datetime_re = datetime_re.unwrap_or_else(|| DATETIME_RE.clone()); + if datetime_re.is_match(string) { + DataType::Timestamp(TimeUnit::Nanosecond, None) + } else { + DataType::Utf8 + } + } } } @@ -1588,6 +1589,7 @@ mod tests { assert_eq!(infer_field_schema(".2", None), DataType::Float64); assert_eq!(infer_field_schema("2.", None), DataType::Float64); assert_eq!(infer_field_schema("true", None), DataType::Boolean); + assert_eq!(infer_field_schema("trUe", None), DataType::Boolean); assert_eq!(infer_field_schema("false", None), DataType::Boolean); assert_eq!(infer_field_schema("2020-11-08", None), DataType::Date32); assert_eq!( From 54587e0dd49690a008d616750f23cdfc5f828c93 Mon Sep 17 00:00:00 2001 From: Aarash Heydari Date: Wed, 30 Nov 2022 04:31:04 -0500 Subject: [PATCH 0352/1411] Remove unwraps from create_primitive_array (#3232) --- arrow-ipc/src/reader.rs | 43 +++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index e697a89d01aa..32f580afbf55 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -91,7 +91,7 @@ fn create_array( read_buffer(buffers.get(buffer_index + 1), data, compression_codec)?, read_buffer(buffers.get(buffer_index + 2), data, compression_codec)?, ], - ); + )?; node_index += 1; buffer_index += 3; array @@ -104,7 +104,7 @@ fn create_array( read_buffer(buffers.get(buffer_index), data, compression_codec)?, 
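For readers unfamiliar with `RegexSet`, the gain in the CSV inference change above (#3217) is that one scan of the candidate value replaces a cascade of `is_match` calls, and the index of the first matching pattern selects the `DataType`. A reduced sketch of that idea using a subset of the same patterns (the subset and the `main` wrapper are illustrative):

```rust
use regex::RegexSet;

fn main() {
    // One compiled set, one scan per value; the lowest matching index wins,
    // so pattern order encodes inference priority just as in the patch.
    let set = RegexSet::new([
        r"(?i)^(true)$|^(false)$(?-i)",                          // Boolean
        r"^-?((\d*\.\d+|\d+\.\d*)([eE]-?\d+)?|\d+([eE]-?\d+))$", // Float64
        r"^-?(\d+)$",                                            // Int64
    ])
    .unwrap();

    assert_eq!(set.matches("trUe").into_iter().next(), Some(0));
    assert_eq!(set.matches("-1.5e3").into_iter().next(), Some(1));
    assert_eq!(set.matches("42").into_iter().next(), Some(2));
    assert_eq!(set.matches("hello").into_iter().next(), None);
}
```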
read_buffer(buffers.get(buffer_index + 1), data, compression_codec)?, ], - ); + )?; node_index += 1; buffer_index += 2; array @@ -305,7 +305,7 @@ fn create_array( read_buffer(buffers.get(buffer_index), data, compression_codec)?, read_buffer(buffers.get(buffer_index + 1), data, compression_codec)?, ], - ); + )?; node_index += 1; buffer_index += 2; array @@ -397,7 +397,7 @@ fn create_primitive_array( field_node: &crate::FieldNode, data_type: &DataType, buffers: &[Buffer], -) -> ArrayRef { +) -> Result { let length = field_node.length() as usize; let null_buffer = (field_node.null_count() > 0).then_some(buffers[0].clone()); let array_data = match data_type { @@ -407,8 +407,7 @@ fn create_primitive_array( .len(length) .buffers(buffers[1..3].to_vec()) .null_bit_buffer(null_buffer) - .build() - .unwrap() + .build()? } FixedSizeBinary(_) => { // read 2 buffers: null buffer (optional) and data buffer @@ -416,8 +415,7 @@ fn create_primitive_array( .len(length) .add_buffer(buffers[1].clone()) .null_bit_buffer(null_buffer) - .build() - .unwrap() + .build()? } Int8 | Int16 @@ -434,19 +432,16 @@ fn create_primitive_array( .len(length) .add_buffer(buffers[1].clone()) .null_bit_buffer(null_buffer) - .build() - .unwrap(); + .build()?; let values = Arc::new(Int64Array::from(data)) as ArrayRef; - // this cast is infallible, the unwrap is safe - let casted = cast(&values, data_type).unwrap(); + let casted = cast(&values, data_type)?; casted.into_data() } else { ArrayData::builder(data_type.clone()) .len(length) .add_buffer(buffers[1].clone()) .null_bit_buffer(null_buffer) - .build() - .unwrap() + .build()? } } Float32 => { @@ -456,19 +451,16 @@ fn create_primitive_array( .len(length) .add_buffer(buffers[1].clone()) .null_bit_buffer(null_buffer) - .build() - .unwrap(); + .build()?; let values = Arc::new(Float64Array::from(data)) as ArrayRef; - // this cast is infallible, the unwrap is safe - let casted = cast(&values, data_type).unwrap(); + let casted = cast(&values, data_type)?; casted.into_data() } else { ArrayData::builder(data_type.clone()) .len(length) .add_buffer(buffers[1].clone()) .null_bit_buffer(null_buffer) - .build() - .unwrap() + .build()? } } Boolean @@ -483,8 +475,7 @@ fn create_primitive_array( .len(length) .add_buffer(buffers[1].clone()) .null_bit_buffer(null_buffer) - .build() - .unwrap(), + .build()?, Interval(IntervalUnit::MonthDayNano) | Decimal128(_, _) => { let buffer = get_aligned_buffer::(&buffers[1], length); @@ -493,8 +484,7 @@ fn create_primitive_array( .len(length) .add_buffer(buffer) .null_bit_buffer(null_buffer) - .build() - .unwrap() + .build()? } Decimal256(_, _) => { let buffer = get_aligned_buffer::(&buffers[1], length); @@ -504,13 +494,12 @@ fn create_primitive_array( .len(length) .add_buffer(buffer) .null_bit_buffer(null_buffer) - .build() - .unwrap() + .build()? } t => unreachable!("Data type {:?} either unsupported or not primitive", t), }; - make_array(array_data) + Ok(make_array(array_data)) } /// Checks if given `Buffer` is properly aligned with `T`. 
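The change to `create_primitive_array` above (#3232) is purely about error handling: builder results are propagated with `?` instead of being unwrapped, so a malformed IPC buffer surfaces as an `ArrowError` rather than a panic. A minimal sketch of the pattern — the helper function is hypothetical, not part of the crate:

```rust
use arrow_buffer::Buffer;
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};

// Hypothetical helper illustrating the pattern: `build()` already returns a
// Result, so propagating it removes a panic path at no cost to the caller.
fn int32_data(values: &[i32]) -> Result<ArrayData, ArrowError> {
    ArrayData::builder(DataType::Int32)
        .len(values.len())
        .add_buffer(Buffer::from_slice_ref(values))
        .build() // previously followed by `.unwrap()`
}

fn main() {
    assert_eq!(int32_data(&[1, 2, 3]).unwrap().len(), 3);
}
```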
From fdc3457b30d64323317f14647a9e462e2fb724b0 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Thu, 1 Dec 2022 02:44:42 +0800 Subject: [PATCH 0353/1411] Add new API to validate the precision for decimal array (#3242) * support new api to validate decimal array: if value is overflow with the specified precision, will be changed to None * Update arrow-array/src/array/primitive_array.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-array/src/array/primitive_array.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 036ef0cdd52f..7c201177f045 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1151,6 +1151,14 @@ impl PrimitiveArray { }) } + /// Validates the Decimal Array, if the value of slot is overflow for the specified precision, and + /// will be casted to Null + pub fn null_if_overflow_precision(&self, precision: u8) -> Self { + self.unary_opt::<_, T>(|v| { + (T::validate_decimal_precision(v, precision).is_ok()).then_some(v) + }) + } + /// Returns [`Self::value`] formatted as a string pub fn value_as_string(&self, row: usize) -> String { T::format_decimal(self.value(row), self.precision(), self.scale()) @@ -2055,6 +2063,15 @@ mod tests { .unwrap(); } + #[test] + fn test_decimal_array_set_null_if_overflow_with_precision() { + let array = + Decimal128Array::from(vec![Some(123456), Some(123), None, Some(123456)]); + let result = array.null_if_overflow_precision(5); + let expected = Decimal128Array::from(vec![None, Some(123), None, None]); + assert_eq!(result, expected); + } + #[test] fn test_decimal256_iter() { let mut builder = Decimal256Builder::with_capacity(30); From 7d4e8d2e276dd6bdfc22da70e85a2183eddce081 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 30 Nov 2022 18:50:19 +0000 Subject: [PATCH 0354/1411] Move nullif to arrow-select (#2594) (#3241) --- arrow-select/src/lib.rs | 1 + arrow-select/src/nullif.rs | 454 +++++++++++++++++++++++++++ arrow/src/compute/kernels/boolean.rs | 431 +------------------------ 3 files changed, 457 insertions(+), 429 deletions(-) create mode 100644 arrow-select/src/nullif.rs diff --git a/arrow-select/src/lib.rs b/arrow-select/src/lib.rs index cf887dfca47c..c468e20a511e 100644 --- a/arrow-select/src/lib.rs +++ b/arrow-select/src/lib.rs @@ -20,6 +20,7 @@ pub mod concat; pub mod filter; pub mod interleave; +pub mod nullif; pub mod take; pub mod window; pub mod zip; diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs new file mode 100644 index 000000000000..a0a1a3a2206b --- /dev/null +++ b/arrow-select/src/nullif.rs @@ -0,0 +1,454 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
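The `null_if_overflow_precision` API added above (#3242) is easiest to grasp from the caller's side: values whose digits do not fit the requested precision are replaced with nulls rather than producing an error. A sketch reusing the values from the patch's test (the `main` wrapper is illustrative):

```rust
use arrow_array::Decimal128Array;

fn main() {
    // 123456 has six significant digits, so it overflows precision 5 and
    // becomes null; 123 fits and is kept; existing nulls stay null.
    let array = Decimal128Array::from(vec![Some(123456), Some(123), None, Some(123456)]);
    let result = array.null_if_overflow_precision(5);
    let expected = Decimal128Array::from(vec![None, Some(123), None, None]);
    assert_eq!(result, expected);
}
```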
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::builder::BooleanBufferBuilder; +use arrow_array::{make_array, Array, ArrayRef, BooleanArray}; +use arrow_buffer::buffer::{ + bitwise_bin_op_helper, bitwise_unary_op_helper, buffer_bin_and, +}; +use arrow_schema::ArrowError; + +/// Copies original array, setting validity bit to false if a secondary comparison +/// boolean array is set to true +/// +/// Typically used to implement NULLIF. +pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { + let left_data = left.data(); + let right_data = right.data(); + + if left_data.len() != right_data.len() { + return Err(ArrowError::ComputeError( + "Cannot perform comparison operation on arrays of different length" + .to_string(), + )); + } + let len = left_data.len(); + let left_offset = left_data.offset(); + + // left=0 (null) right=null output bitmap=null + // left=0 right=1 output bitmap=null + // left=1 (set) right=null output bitmap=set (passthrough) + // left=1 right=1 & comp=true output bitmap=null + // left=1 right=1 & comp=false output bitmap=set + // + // Thus: result = left null bitmap & (!right_values | !right_bitmap) + // OR left null bitmap & !(right_values & right_bitmap) + + // Compute right_values & right_bitmap + let (right, right_offset) = match right_data.null_buffer() { + Some(buffer) => ( + buffer_bin_and( + &right_data.buffers()[0], + right_data.offset(), + buffer, + right_data.offset(), + len, + ), + 0, + ), + None => (right_data.buffers()[0].clone(), right_data.offset()), + }; + + // Compute left null bitmap & !right + let mut valid_count = 0; + let combined = match left_data.null_buffer() { + Some(left) => { + bitwise_bin_op_helper(left, left_offset, &right, right_offset, len, |l, r| { + let t = l & !r; + valid_count += t.count_ones() as usize; + t + }) + } + None => { + let buffer = bitwise_unary_op_helper(&right, right_offset, len, |b| { + let t = !b; + valid_count += t.count_ones() as usize; + t + }); + // We need to compensate for the additional bits read from the end + let remainder_len = len % 64; + if remainder_len != 0 { + valid_count -= 64 - remainder_len + } + buffer + } + }; + + // Need to construct null buffer with offset of left + let null_buffer = match left_data.offset() { + 0 => combined, + _ => { + let mut builder = BooleanBufferBuilder::new(len + left_offset); + // Pad with 0s up to offset + builder.resize(left_offset); + builder.append_packed_range(0..len, &combined); + builder.finish() + } + }; + + let null_count = len - valid_count; + let data = left_data + .clone() + .into_builder() + .null_bit_buffer(Some(null_buffer)) + .null_count(null_count); + + // SAFETY: + // Only altered null mask + Ok(make_array(unsafe { data.build_unchecked() })) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::builder::{BooleanBuilder, Int32Builder, StructBuilder}; + use arrow_array::cast::{as_boolean_array, as_primitive_array, as_string_array}; + use arrow_array::types::Int32Type; + use arrow_array::{Int32Array, StringArray, StructArray}; + use arrow_schema::{DataType, Field}; + + #[test] + fn test_nullif_int_array() { + let a = 
Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9)]); + let comp = + BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); + let res = nullif(&a, &comp).unwrap(); + + let expected = Int32Array::from(vec![ + Some(15), + None, + None, // comp true, slot 2 turned into null + Some(1), + // Even though comp array / right is null, should still pass through original value + // comp true, slot 2 turned into null + Some(9), + ]); + + let res = as_primitive_array::(&res); + assert_eq!(&expected, res); + } + + #[test] + fn test_nullif_int_array_offset() { + let a = Int32Array::from(vec![None, Some(15), Some(8), Some(1), Some(9)]); + let a = a.slice(1, 3); // Some(15), Some(8), Some(1) + let a = a.as_any().downcast_ref::().unwrap(); + let comp = BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + None, + Some(true), + Some(false), + None, + ]); + let comp = comp.slice(2, 3); // Some(false), None, Some(true) + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(a, comp).unwrap(); + + let expected = Int32Array::from(vec![ + Some(15), // False => keep it + Some(8), // None => keep it + None, // true => None + ]); + let res = as_primitive_array::(&res); + assert_eq!(&expected, res) + } + + #[test] + fn test_nullif_string() { + let s = StringArray::from_iter([ + Some("hello"), + None, + Some("world"), + Some("a"), + Some("b"), + None, + None, + ]); + let select = BooleanArray::from_iter([ + Some(true), + Some(true), + Some(false), + Some(true), + Some(false), + Some(false), + None, + ]); + + let a = nullif(&s, &select).unwrap(); + let r: Vec<_> = as_string_array(&a).iter().collect(); + assert_eq!( + r, + vec![None, None, Some("world"), None, Some("b"), None, None] + ); + + let s = s.slice(2, 3); + let select = select.slice(1, 3); + let select = as_boolean_array(select.as_ref()); + let a = nullif(s.as_ref(), select).unwrap(); + let r: Vec<_> = as_string_array(&a).iter().collect(); + assert_eq!(r, vec![None, Some("a"), None]); + } + + #[test] + fn test_nullif_int_large_left_offset() { + let a = Int32Array::from(vec![ + Some(-1), // 0 + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), // 8 + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + None, // 16 + Some(15), // 17 + Some(8), + Some(1), + Some(9), + ]); + let a = a.slice(17, 3); // Some(15), Some(8), Some(1) + + let comp = BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + None, + Some(true), + Some(false), + None, + ]); + let comp = comp.slice(2, 3); // Some(false), None, Some(true) + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(&a, comp).unwrap(); + let res = res.as_any().downcast_ref::().unwrap(); + + let expected = Int32Array::from(vec![ + Some(15), // False => keep it + Some(8), // None => keep it + None, // true => None + ]); + assert_eq!(&expected, res) + } + + #[test] + fn test_nullif_int_large_right_offset() { + let a = Int32Array::from(vec![ + None, // 0 + Some(15), // 1 + Some(8), + Some(1), + Some(9), + ]); + let a = a.slice(1, 3); // Some(15), Some(8), Some(1) + + let comp = BooleanArray::from(vec![ + Some(false), // 0 + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), // 8 + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), // 16 + Some(false), // 17 + Some(false), // 18 + None, + Some(true), + Some(false), + 
None, + ]); + let comp = comp.slice(18, 3); // Some(false), None, Some(true) + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(&a, comp).unwrap(); + let res = res.as_any().downcast_ref::().unwrap(); + + let expected = Int32Array::from(vec![ + Some(15), // False => keep it + Some(8), // None => keep it + None, // true => None + ]); + assert_eq!(&expected, res) + } + + #[test] + fn test_nullif_boolean_offset() { + let a = BooleanArray::from(vec![ + None, // 0 + Some(true), // 1 + Some(false), + Some(true), + Some(true), + ]); + let a = a.slice(1, 3); // Some(true), Some(false), Some(true) + + let comp = BooleanArray::from(vec![ + Some(false), // 0 + Some(false), // 1 + Some(false), // 2 + None, + Some(true), + Some(false), + None, + ]); + let comp = comp.slice(2, 3); // Some(false), None, Some(true) + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(&a, comp).unwrap(); + let res = res.as_any().downcast_ref::().unwrap(); + + let expected = BooleanArray::from(vec![ + Some(true), // False => keep it + Some(false), // None => keep it + None, // true => None + ]); + assert_eq!(&expected, res) + } + + struct Foo { + a: Option, + b: Option, + /// Whether the entry should be valid. + is_valid: bool, + } + + impl Foo { + fn new_valid(a: i32, b: bool) -> Foo { + Self { + a: Some(a), + b: Some(b), + is_valid: true, + } + } + + fn new_null() -> Foo { + Self { + a: None, + b: None, + is_valid: false, + } + } + } + + /// Struct Array equality is a bit weird -- we need to have the *child values* + /// correct even if the enclosing struct indicates it is null. But we + /// also need the top level is_valid bits to be correct. + fn create_foo_struct(values: Vec) -> StructArray { + let mut struct_array = StructBuilder::new( + vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Boolean, true), + ], + vec![ + Box::new(Int32Builder::with_capacity(values.len())), + Box::new(BooleanBuilder::with_capacity(values.len())), + ], + ); + + for value in values { + struct_array + .field_builder::(0) + .unwrap() + .append_option(value.a); + struct_array + .field_builder::(1) + .unwrap() + .append_option(value.b); + struct_array.append(value.is_valid); + } + + struct_array.finish() + } + + #[test] + fn test_nullif_struct_slices() { + let struct_array = create_foo_struct(vec![ + Foo::new_valid(7, true), + Foo::new_valid(15, false), + Foo::new_valid(8, true), + Foo::new_valid(12, false), + Foo::new_null(), + Foo::new_null(), + Foo::new_valid(42, true), + ]); + + // Some({a: 15, b: false}), Some({a: 8, b: true}), Some({a: 12, b: false}), + // None, None + let struct_array = struct_array.slice(1, 5); + let comp = BooleanArray::from(vec![ + Some(false), // 0 + Some(false), // 1 + Some(false), // 2 + None, + Some(true), + Some(false), + None, + ]); + let comp = comp.slice(2, 5); // Some(false), None, Some(true), Some(false), None + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(&struct_array, comp).unwrap(); + let res = res.as_any().downcast_ref::().unwrap(); + + let expected = create_foo_struct(vec![ + // Some(false) -> keep + Foo::new_valid(15, false), + // None -> keep + Foo::new_valid(8, true), + // Some(true) -> null out. But child values are still there. 
+ Foo { + a: Some(12), + b: Some(false), + is_valid: false, + }, + // Some(false) -> keep, but was null + Foo::new_null(), + // None -> keep, but was null + Foo::new_null(), + ]); + + assert_eq!(&expected, res); + } + + #[test] + fn test_nullif_no_nulls() { + let a = Int32Array::from(vec![Some(15), Some(7), Some(8), Some(1), Some(9)]); + let comp = + BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); + let res = nullif(&a, &comp).unwrap(); + let res = as_primitive_array::(res.as_ref()); + + let expected = Int32Array::from(vec![Some(15), Some(7), None, Some(1), Some(9)]); + assert_eq!(res, &expected); + } +} diff --git a/arrow/src/compute/kernels/boolean.rs b/arrow/src/compute/kernels/boolean.rs index dee5d0d1b3ba..1b33fa19ea02 100644 --- a/arrow/src/compute/kernels/boolean.rs +++ b/arrow/src/compute/kernels/boolean.rs @@ -22,6 +22,8 @@ //! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. +pub use arrow_select::nullif; + use crate::array::{Array, ArrayData, BooleanArray}; use crate::buffer::{ bitwise_bin_op_helper, bitwise_quaternary_op_helper, buffer_bin_and, buffer_bin_or, @@ -31,9 +33,6 @@ use crate::compute::util::combine_option_bitmap; use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; use crate::util::bit_util::ceil; -use arrow_array::builder::BooleanBufferBuilder; -use arrow_array::{make_array, ArrayRef}; -use arrow_buffer::buffer::bitwise_unary_op_helper; /// Updates null buffer based on data buffer and null buffer of the operand at other side /// in boolean AND kernel with Kleene logic. In short, because for AND kernel, null AND false @@ -471,105 +470,10 @@ pub fn is_not_null(input: &dyn Array) -> Result { Ok(BooleanArray::from(data)) } -/// Copies original array, setting validity bit to false if a secondary comparison -/// boolean array is set to true -/// -/// Typically used to implement NULLIF. 
-pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { - let left_data = left.data(); - let right_data = right.data(); - - if left_data.len() != right_data.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - let len = left_data.len(); - let left_offset = left_data.offset(); - - // left=0 (null) right=null output bitmap=null - // left=0 right=1 output bitmap=null - // left=1 (set) right=null output bitmap=set (passthrough) - // left=1 right=1 & comp=true output bitmap=null - // left=1 right=1 & comp=false output bitmap=set - // - // Thus: result = left null bitmap & (!right_values | !right_bitmap) - // OR left null bitmap & !(right_values & right_bitmap) - - // Compute right_values & right_bitmap - let (right, right_offset) = match right_data.null_buffer() { - Some(buffer) => ( - buffer_bin_and( - &right_data.buffers()[0], - right_data.offset(), - buffer, - right_data.offset(), - len, - ), - 0, - ), - None => (right_data.buffers()[0].clone(), right_data.offset()), - }; - - // Compute left null bitmap & !right - let mut valid_count = 0; - let combined = match left_data.null_buffer() { - Some(left) => { - bitwise_bin_op_helper(left, left_offset, &right, right_offset, len, |l, r| { - let t = l & !r; - valid_count += t.count_ones() as usize; - t - }) - } - None => { - let buffer = bitwise_unary_op_helper(&right, right_offset, len, |b| { - let t = !b; - valid_count += t.count_ones() as usize; - t - }); - // We need to compensate for the additional bits read from the end - let remainder_len = len % 64; - if remainder_len != 0 { - valid_count -= 64 - remainder_len - } - buffer - } - }; - - // Need to construct null buffer with offset of left - let null_buffer = match left_data.offset() { - 0 => combined, - _ => { - let mut builder = BooleanBufferBuilder::new(len + left_offset); - // Pad with 0s up to offset - builder.resize(left_offset); - builder.append_packed_range(0..len, &combined); - builder.finish() - } - }; - - let null_count = len - valid_count; - let data = left_data - .clone() - .into_builder() - .null_bit_buffer(Some(null_buffer)) - .null_count(null_count); - - // SAFETY: - // Only altered null mask - Ok(make_array(unsafe { data.build_unchecked() })) -} - #[cfg(test)] mod tests { use super::*; use crate::array::{ArrayRef, Int32Array}; - use arrow_array::builder::{BooleanBuilder, Int32Builder, StructBuilder}; - use arrow_array::cast::{as_boolean_array, as_primitive_array, as_string_array}; - use arrow_array::types::Int32Type; - use arrow_array::{StringArray, StructArray}; - use arrow_schema::Field; use std::sync::Arc; #[test] @@ -1100,335 +1004,4 @@ mod tests { assert_eq!(expected, res); assert_eq!(None, res.data_ref().null_bitmap()); } - - #[test] - fn test_nullif_int_array() { - let a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9)]); - let comp = - BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); - let res = nullif(&a, &comp).unwrap(); - - let expected = Int32Array::from(vec![ - Some(15), - None, - None, // comp true, slot 2 turned into null - Some(1), - // Even though comp array / right is null, should still pass through original value - // comp true, slot 2 turned into null - Some(9), - ]); - - let res = as_primitive_array::(&res); - assert_eq!(&expected, res); - } - - #[test] - fn test_nullif_int_array_offset() { - let a = Int32Array::from(vec![None, Some(15), Some(8), Some(1), Some(9)]); - let a = a.slice(1, 3); // Some(15), Some(8), 
Some(1) - let a = a.as_any().downcast_ref::().unwrap(); - let comp = BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - None, - Some(true), - Some(false), - None, - ]); - let comp = comp.slice(2, 3); // Some(false), None, Some(true) - let comp = comp.as_any().downcast_ref::().unwrap(); - let res = nullif(a, comp).unwrap(); - - let expected = Int32Array::from(vec![ - Some(15), // False => keep it - Some(8), // None => keep it - None, // true => None - ]); - let res = as_primitive_array::(&res); - assert_eq!(&expected, res) - } - - #[test] - fn test_nullif_string() { - let s = StringArray::from_iter([ - Some("hello"), - None, - Some("world"), - Some("a"), - Some("b"), - None, - None, - ]); - let select = BooleanArray::from_iter([ - Some(true), - Some(true), - Some(false), - Some(true), - Some(false), - Some(false), - None, - ]); - - let a = nullif(&s, &select).unwrap(); - let r: Vec<_> = as_string_array(&a).iter().collect(); - assert_eq!( - r, - vec![None, None, Some("world"), None, Some("b"), None, None] - ); - - let s = s.slice(2, 3); - let select = select.slice(1, 3); - let select = as_boolean_array(select.as_ref()); - let a = nullif(s.as_ref(), select).unwrap(); - let r: Vec<_> = as_string_array(&a).iter().collect(); - assert_eq!(r, vec![None, Some("a"), None]); - } - - #[test] - fn test_nullif_int_large_left_offset() { - let a = Int32Array::from(vec![ - Some(-1), // 0 - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), // 8 - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), - None, // 16 - Some(15), // 17 - Some(8), - Some(1), - Some(9), - ]); - let a = a.slice(17, 3); // Some(15), Some(8), Some(1) - - let comp = BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - None, - Some(true), - Some(false), - None, - ]); - let comp = comp.slice(2, 3); // Some(false), None, Some(true) - let comp = comp.as_any().downcast_ref::().unwrap(); - let res = nullif(&a, comp).unwrap(); - let res = res.as_any().downcast_ref::().unwrap(); - - let expected = Int32Array::from(vec![ - Some(15), // False => keep it - Some(8), // None => keep it - None, // true => None - ]); - assert_eq!(&expected, res) - } - - #[test] - fn test_nullif_int_large_right_offset() { - let a = Int32Array::from(vec![ - None, // 0 - Some(15), // 1 - Some(8), - Some(1), - Some(9), - ]); - let a = a.slice(1, 3); // Some(15), Some(8), Some(1) - - let comp = BooleanArray::from(vec![ - Some(false), // 0 - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), // 8 - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), // 16 - Some(false), // 17 - Some(false), // 18 - None, - Some(true), - Some(false), - None, - ]); - let comp = comp.slice(18, 3); // Some(false), None, Some(true) - let comp = comp.as_any().downcast_ref::().unwrap(); - let res = nullif(&a, comp).unwrap(); - let res = res.as_any().downcast_ref::().unwrap(); - - let expected = Int32Array::from(vec![ - Some(15), // False => keep it - Some(8), // None => keep it - None, // true => None - ]); - assert_eq!(&expected, res) - } - - #[test] - fn test_nullif_boolean_offset() { - let a = BooleanArray::from(vec![ - None, // 0 - Some(true), // 1 - Some(false), - Some(true), - Some(true), - ]); - let a = a.slice(1, 3); // Some(true), Some(false), Some(true) - - let comp = BooleanArray::from(vec![ - Some(false), // 0 - Some(false), // 1 - 
Some(false), // 2 - None, - Some(true), - Some(false), - None, - ]); - let comp = comp.slice(2, 3); // Some(false), None, Some(true) - let comp = comp.as_any().downcast_ref::().unwrap(); - let res = nullif(&a, comp).unwrap(); - let res = res.as_any().downcast_ref::().unwrap(); - - let expected = BooleanArray::from(vec![ - Some(true), // False => keep it - Some(false), // None => keep it - None, // true => None - ]); - assert_eq!(&expected, res) - } - - struct Foo { - a: Option, - b: Option, - /// Whether the entry should be valid. - is_valid: bool, - } - - impl Foo { - fn new_valid(a: i32, b: bool) -> Foo { - Self { - a: Some(a), - b: Some(b), - is_valid: true, - } - } - - fn new_null() -> Foo { - Self { - a: None, - b: None, - is_valid: false, - } - } - } - - /// Struct Array equality is a bit weird -- we need to have the *child values* - /// correct even if the enclosing struct indicates it is null. But we - /// also need the top level is_valid bits to be correct. - fn create_foo_struct(values: Vec) -> StructArray { - let mut struct_array = StructBuilder::new( - vec![ - Field::new("a", DataType::Int32, true), - Field::new("b", DataType::Boolean, true), - ], - vec![ - Box::new(Int32Builder::with_capacity(values.len())), - Box::new(BooleanBuilder::with_capacity(values.len())), - ], - ); - - for value in values { - struct_array - .field_builder::(0) - .unwrap() - .append_option(value.a); - struct_array - .field_builder::(1) - .unwrap() - .append_option(value.b); - struct_array.append(value.is_valid); - } - - struct_array.finish() - } - - #[test] - fn test_nullif_struct_slices() { - let struct_array = create_foo_struct(vec![ - Foo::new_valid(7, true), - Foo::new_valid(15, false), - Foo::new_valid(8, true), - Foo::new_valid(12, false), - Foo::new_null(), - Foo::new_null(), - Foo::new_valid(42, true), - ]); - - // Some({a: 15, b: false}), Some({a: 8, b: true}), Some({a: 12, b: false}), - // None, None - let struct_array = struct_array.slice(1, 5); - let comp = BooleanArray::from(vec![ - Some(false), // 0 - Some(false), // 1 - Some(false), // 2 - None, - Some(true), - Some(false), - None, - ]); - let comp = comp.slice(2, 5); // Some(false), None, Some(true), Some(false), None - let comp = comp.as_any().downcast_ref::().unwrap(); - let res = nullif(&struct_array, comp).unwrap(); - let res = res.as_any().downcast_ref::().unwrap(); - - let expected = create_foo_struct(vec![ - // Some(false) -> keep - Foo::new_valid(15, false), - // None -> keep - Foo::new_valid(8, true), - // Some(true) -> null out. But child values are still there. 
- Foo { - a: Some(12), - b: Some(false), - is_valid: false, - }, - // Some(false) -> keep, but was null - Foo::new_null(), - // None -> keep, but was null - Foo::new_null(), - ]); - - assert_eq!(&expected, res); - } - - #[test] - fn test_nullif_no_nulls() { - let a = Int32Array::from(vec![Some(15), Some(7), Some(8), Some(1), Some(9)]); - let comp = - BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); - let res = nullif(&a, &comp).unwrap(); - let res = as_primitive_array::(res.as_ref()); - - let expected = Int32Array::from(vec![Some(15), Some(7), None, Some(1), Some(9)]); - assert_eq!(res, &expected); - } } From 335c69afc4ca9d866ac1c7cd1df5a13d691137a4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 30 Nov 2022 18:54:43 +0000 Subject: [PATCH 0355/1411] Fix MapBuilder example (#3246) --- arrow-array/src/builder/map_builder.rs | 105 ++++--------------------- 1 file changed, 16 insertions(+), 89 deletions(-) diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 831128c29d05..0de89e7b73da 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -26,40 +26,32 @@ use std::sync::Arc; /// Creates a new `MapBuilder` /// ``` -/// use arrow_array::builder::{MapBuilder, Int32Builder, StringBuilder}; -/// use arrow_array::{StringArray, Int32Array}; -/// use std::sync::Arc; +/// # use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder}; +/// # use arrow_array::{Int32Array, StringArray}; /// /// let string_builder = StringBuilder::new(); /// let int_builder = Int32Builder::with_capacity(4); /// +/// // Construct `[{"joe": 1}, {"blogs": 2, "foo": 4}, {}, null]` /// let mut builder = MapBuilder::new(None, string_builder, int_builder); /// -/// let string_builder = builder.keys(); -/// string_builder.append_value("joe"); -/// string_builder.append_value("n1"); -/// string_builder.append_value("n2"); -/// string_builder.append_value("mark"); -/// -/// let int_builder = builder.values(); -/// int_builder.append_value(1); -/// int_builder.append_value(2); -/// int_builder.append_null(); -/// int_builder.append_value(4); +/// builder.keys().append_value("joe"); +/// builder.values().append_value(1); +/// builder.append(true).unwrap(); /// +/// builder.keys().append_value("blogs"); +/// builder.values().append_value(2); +/// builder.keys().append_value("foo"); +/// builder.values().append_value(4); /// builder.append(true).unwrap(); -/// builder.append(false).unwrap(); /// builder.append(true).unwrap(); +/// builder.append(false).unwrap(); +/// +/// let array = builder.finish(); +/// assert_eq!(array.value_offsets(), &[0, 1, 3, 3, 3]); +/// assert_eq!(*array.values(), Int32Array::from(vec![1, 2, 4])); +/// assert_eq!(*array.keys(), StringArray::from(vec!["joe", "blogs", "foo"])); /// -/// let arr = builder.finish(); -/// assert_eq!( -/// *arr.values(), -/// Int32Array::from(vec![Some(1), Some(2), None, Some(4)]) -/// ); -/// assert_eq!( -/// *arr.keys(), -/// StringArray::from(vec![Some("joe"), Some("n1"), Some("n2"), Some("mark")]) -/// ); /// ``` #[derive(Debug)] pub struct MapBuilder { @@ -91,7 +83,6 @@ impl Default for MapFieldNames { } } -#[allow(dead_code)] impl MapBuilder { /// Creates a new `MapBuilder` pub fn new( @@ -264,67 +255,3 @@ impl ArrayBuilder for MapBuilder { self } } - -#[cfg(test)] -mod tests { - use super::*; - use arrow_buffer::Buffer; - use arrow_data::Bitmap; - - use crate::builder::{Int32Builder, 
StringBuilder}; - - // TODO: add a test that finishes building, after designing a spec-compliant - // way of inserting values to the map. - // A map's values shouldn't be repeated within a slot - - #[test] - fn test_map_array_builder() { - let string_builder = StringBuilder::new(); - let int_builder = Int32Builder::with_capacity(4); - - let mut builder = MapBuilder::new(None, string_builder, int_builder); - - let string_builder = builder.keys(); - string_builder.append_value("joe"); - string_builder.append_value("n1"); - string_builder.append_value("n2"); - string_builder.append_value("mark"); - - let int_builder = builder.values(); - int_builder.append_value(1); - int_builder.append_value(2); - int_builder.append_null(); - int_builder.append_value(4); - - builder.append(true).unwrap(); - builder.append(false).unwrap(); - builder.append(true).unwrap(); - - let arr = builder.finish(); - - let map_data = arr.data(); - assert_eq!(3, map_data.len()); - assert_eq!(1, map_data.null_count()); - assert_eq!( - Some(&Bitmap::from(Buffer::from(&[5_u8]))), - map_data.null_bitmap() - ); - - let expected_string_data = ArrayData::builder(DataType::Utf8) - .len(4) - .add_buffer(Buffer::from_slice_ref([0, 3, 5, 7, 11])) - .add_buffer(Buffer::from_slice_ref(b"joen1n2mark")) - .build() - .unwrap(); - - let expected_int_data = ArrayData::builder(DataType::Int32) - .len(4) - .null_bit_buffer(Some(Buffer::from_slice_ref([11_u8]))) - .add_buffer(Buffer::from_slice_ref([1, 2, 0, 4])) - .build() - .unwrap(); - - assert_eq!(&expected_string_data, arr.keys().data()); - assert_eq!(&expected_int_data, arr.values().data()); - } -} From 6bd559f10eca5eab0dc2caca8ce7e5c77a985500 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 30 Nov 2022 19:59:08 +0000 Subject: [PATCH 0356/1411] Validate dictionaries read over IPC (#3247) --- arrow-ipc/src/reader.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 32f580afbf55..ef0a49be693b 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -131,7 +131,7 @@ fn create_array( node_index = triple.1; buffer_index = triple.2; - create_list_array(list_node, data_type, &list_buffers, triple.0) + create_list_array(list_node, data_type, &list_buffers, triple.0)? } FixedSizeList(ref list_field, _) => { let list_node = nodes.get(node_index); @@ -156,7 +156,7 @@ fn create_array( node_index = triple.1; buffer_index = triple.2; - create_list_array(list_node, data_type, &list_buffers, triple.0) + create_list_array(list_node, data_type, &list_buffers, triple.0)? } Struct(struct_fields) => { let struct_node = nodes.get(node_index); @@ -220,7 +220,7 @@ fn create_array( data_type, &index_buffers, value_array.clone(), - ) + )? 
} Union(fields, field_type_ids, mode) => { let union_node = nodes.get(node_index); @@ -527,7 +527,7 @@ fn create_list_array( data_type: &DataType, buffers: &[Buffer], child_array: ArrayRef, -) -> ArrayRef { +) -> Result { let null_buffer = (field_node.null_count() > 0).then_some(buffers[0].clone()); let length = field_node.length() as usize; let child_data = child_array.into_data(); @@ -545,7 +545,7 @@ fn create_list_array( _ => unreachable!("Cannot create list or map array from {:?}", data_type), }; - make_array(builder.build().unwrap()) + Ok(make_array(builder.build()?)) } /// Reads the correct number of buffers based on list type and null_count, and creates a @@ -555,7 +555,7 @@ fn create_dictionary_array( data_type: &DataType, buffers: &[Buffer], value_array: ArrayRef, -) -> ArrayRef { +) -> Result { if let Dictionary(_, _) = *data_type { let null_buffer = (field_node.null_count() > 0).then_some(buffers[0].clone()); let builder = ArrayData::builder(data_type.clone()) @@ -564,7 +564,7 @@ fn create_dictionary_array( .add_child_data(value_array.into_data()) .null_bit_buffer(null_buffer); - make_array(unsafe { builder.build_unchecked() }) + Ok(make_array(builder.build()?)) } else { unreachable!("Cannot create dictionary array from {:?}", data_type) } From 9538c265b16d8f6eb3e78218941e5c816a57dbe2 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 30 Nov 2022 12:24:53 -0800 Subject: [PATCH 0357/1411] Move tests which require chrono-tz feature from `arrow-cast` to `arrow` (#3222) * Enable necessary test * Move tests * Trigger Build * Fix clippy * Fix test * Move feature requirement to Cargo.toml --- arrow-cast/src/cast.rs | 329 +----------------------------- arrow/Cargo.toml | 4 + arrow/tests/array_cast.rs | 407 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 416 insertions(+), 324 deletions(-) create mode 100644 arrow/tests/array_cast.rs diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 23be8839593c..ad9f08388326 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -160,7 +160,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time64(TimeUnit::Nanosecond) | Timestamp(TimeUnit::Nanosecond, None) ) => true, - (Utf8, _) => DataType::is_numeric(to_type), + (Utf8, _) => DataType::is_numeric(to_type) && to_type != &Float16, (LargeUtf8, LargeBinary | Date32 @@ -171,11 +171,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time64(TimeUnit::Nanosecond) | Timestamp(TimeUnit::Nanosecond, None) ) => true, - (LargeUtf8, _) => DataType::is_numeric(to_type), + (LargeUtf8, _) => DataType::is_numeric(to_type) && to_type != &Float16, (Timestamp(_, _), Utf8) | (Timestamp(_, _), LargeUtf8) => true, (Date32, Utf8) | (Date32, LargeUtf8) => true, (Date64, Utf8) | (Date64, LargeUtf8) => true, - (_, Utf8 | LargeUtf8) => DataType::is_numeric(from_type) || from_type == &Binary, + (_, Utf8 | LargeUtf8) => (DataType::is_numeric(from_type) && from_type != &Float16) || from_type == &Binary, // start numeric casts ( @@ -972,6 +972,7 @@ pub fn cast_with_options( Int16 => cast_numeric_to_bool::(array), Int32 => cast_numeric_to_bool::(array), Int64 => cast_numeric_to_bool::(array), + Float16 => cast_numeric_to_bool::(array), Float32 => cast_numeric_to_bool::(array), Float64 => cast_numeric_to_bool::(array), Utf8 => cast_utf8_to_boolean(array, cast_options), @@ -989,6 +990,7 @@ pub fn cast_with_options( Int16 => cast_bool_to_numeric::(array, cast_options), Int32 => cast_bool_to_numeric::(array, cast_options), Int64 => 
cast_bool_to_numeric::(array, cast_options), + Float16 => cast_bool_to_numeric::(array, cast_options), Float32 => cast_bool_to_numeric::(array, cast_options), Float64 => cast_bool_to_numeric::(array, cast_options), Utf8 => { @@ -3614,7 +3616,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_decimal_to_decimal_round() { let array = vec![ Some(1123454), @@ -3733,7 +3734,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_decimal128_to_decimal128() { let input_type = DataType::Decimal128(20, 3); let output_type = DataType::Decimal128(20, 4); @@ -4124,7 +4124,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_numeric_to_decimal128() { let decimal_type = DataType::Decimal128(38, 6); // u8, u16, u32, u64 @@ -4296,7 +4295,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_numeric_to_decimal256() { // test negative cast type let decimal_type = DataType::Decimal256(58, 6); @@ -5274,25 +5272,6 @@ mod tests { assert!(c.is_null(2)); } - #[test] - #[cfg(feature = "chrono-tz")] - fn test_cast_timestamp_to_string() { - let a = TimestampMillisecondArray::from(vec![ - Some(864000000005), - Some(1545696000001), - None, - ]) - .with_timezone("UTC".to_string()); - let array = Arc::new(a) as ArrayRef; - dbg!(&array); - let b = cast(&array, &DataType::Utf8).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(&DataType::Utf8, c.data_type()); - assert_eq!("1997-05-19 00:00:00.005 +00:00", c.value(0)); - assert_eq!("2018-12-25 00:00:00.001 +00:00", c.value(1)); - assert!(c.is_null(2)); - } - #[test] fn test_cast_date32_to_string() { let a = Date32Array::from(vec![10000, 17890]); @@ -6799,41 +6778,6 @@ mod tests { assert!(!c.is_valid(5)); // "2000-01-01" } - #[test] - #[cfg_attr(miri, ignore)] // running forever - #[cfg(feature = "chrono-tz")] - fn test_can_cast_types() { - // this function attempts to ensure that can_cast_types stays - // in sync with cast. It simply tries all combinations of - // types and makes sure that if `can_cast_types` returns - // true, so does `cast` - - let all_types = get_all_types(); - - for array in get_arrays_of_all_types() { - for to_type in &all_types { - println!("Test casting {:?} --> {:?}", array.data_type(), to_type); - let cast_result = cast(&array, to_type); - let reported_cast_ability = can_cast_types(array.data_type(), to_type); - - // check for mismatch - match (cast_result, reported_cast_ability) { - (Ok(_), false) => { - panic!("Was able to cast array {:?} from {:?} to {:?} but can_cast_types reported false", - array, array.data_type(), to_type) - } - (Err(e), true) => { - panic!("Was not able to cast array {:?} from {:?} to {:?} but can_cast_types reported true. 
\ - Error was {:?}", - array, array.data_type(), to_type, e) - } - // otherwise it was a match - _ => {} - }; - } - } - } - #[test] fn test_cast_list_containers() { // large-list to list @@ -6868,99 +6812,6 @@ mod tests { assert_eq!(&expected.value(2), &actual.value(2)); } - /// Create instances of arrays with varying types for cast tests - #[cfg(feature = "chrono-tz")] - fn get_arrays_of_all_types() -> Vec { - let tz_name = String::from("America/New_York"); - let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"]; - vec![ - Arc::new(BinaryArray::from(binary_data.clone())), - Arc::new(LargeBinaryArray::from(binary_data.clone())), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - Arc::new(make_list_array()), - Arc::new(make_large_list_array()), - Arc::new(make_fixed_size_list_array()), - Arc::new(make_fixed_size_binary_array()), - Arc::new(StructArray::from(vec![ - ( - Field::new("a", DataType::Boolean, false), - Arc::new(BooleanArray::from(vec![false, false, true, true])) - as Arc, - ), - ( - Field::new("b", DataType::Int32, false), - Arc::new(Int32Array::from(vec![42, 28, 19, 31])), - ), - ])), - Arc::new(make_union_array()), - Arc::new(NullArray::new(10)), - Arc::new(StringArray::from(vec!["foo", "bar"])), - Arc::new(LargeStringArray::from(vec!["foo", "bar"])), - Arc::new(BooleanArray::from(vec![true, false])), - Arc::new(Int8Array::from(vec![1, 2])), - Arc::new(Int16Array::from(vec![1, 2])), - Arc::new(Int32Array::from(vec![1, 2])), - Arc::new(Int64Array::from(vec![1, 2])), - Arc::new(UInt8Array::from(vec![1, 2])), - Arc::new(UInt16Array::from(vec![1, 2])), - Arc::new(UInt32Array::from(vec![1, 2])), - Arc::new(UInt64Array::from(vec![1, 2])), - Arc::new(Float32Array::from(vec![1.0, 2.0])), - Arc::new(Float64Array::from(vec![1.0, 2.0])), - Arc::new(TimestampSecondArray::from(vec![1000, 2000])), - Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])), - Arc::new(TimestampMicrosecondArray::from(vec![1000, 2000])), - Arc::new(TimestampNanosecondArray::from(vec![1000, 2000])), - Arc::new( - TimestampSecondArray::from(vec![1000, 2000]) - .with_timezone(tz_name.clone()), - ), - Arc::new( - TimestampMillisecondArray::from(vec![1000, 2000]) - .with_timezone(tz_name.clone()), - ), - Arc::new( - TimestampMicrosecondArray::from(vec![1000, 2000]) - .with_timezone(tz_name.clone()), - ), - Arc::new( - TimestampNanosecondArray::from(vec![1000, 2000]).with_timezone(tz_name), - ), - Arc::new(Date32Array::from(vec![1000, 2000])), - Arc::new(Date64Array::from(vec![1000, 2000])), - Arc::new(Time32SecondArray::from(vec![1000, 2000])), - Arc::new(Time32MillisecondArray::from(vec![1000, 2000])), - Arc::new(Time64MicrosecondArray::from(vec![1000, 2000])), - Arc::new(Time64NanosecondArray::from(vec![1000, 2000])), - Arc::new(IntervalYearMonthArray::from(vec![1000, 2000])), - Arc::new(IntervalDayTimeArray::from(vec![1000, 2000])), - Arc::new(IntervalMonthDayNanoArray::from(vec![1000, 2000])), - Arc::new(DurationSecondArray::from(vec![1000, 2000])), - Arc::new(DurationMillisecondArray::from(vec![1000, 2000])), - Arc::new(DurationMicrosecondArray::from(vec![1000, 2000])), - 
Arc::new(DurationNanosecondArray::from(vec![1000, 2000])), - Arc::new( - create_decimal_array(vec![Some(1), Some(2), Some(3), None], 38, 0) - .unwrap(), - ), - ] - } - fn make_list_array() -> ListArray { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) @@ -7009,140 +6860,6 @@ mod tests { LargeListArray::from(list_data) } - #[cfg(feature = "chrono-tz")] - fn make_fixed_size_list_array() -> FixedSizeListArray { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build() - .unwrap(); - - // Construct a fixed size list array from the above two - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, true)), - 2, - ); - let list_data = ArrayData::builder(list_data_type) - .len(5) - .add_child_data(value_data) - .build() - .unwrap(); - FixedSizeListArray::from(list_data) - } - - #[cfg(feature = "chrono-tz")] - fn make_fixed_size_binary_array() -> FixedSizeBinaryArray { - let values: [u8; 15] = *b"hellotherearrow"; - - let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) - .len(3) - .add_buffer(Buffer::from(&values[..])) - .build() - .unwrap(); - FixedSizeBinaryArray::from(array_data) - } - - #[cfg(feature = "chrono-tz")] - fn make_union_array() -> UnionArray { - let mut builder = UnionBuilder::with_capacity_dense(7); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.build().unwrap() - } - - /// Creates a dictionary with primitive dictionary values, and keys of type K - #[cfg(feature = "chrono-tz")] - fn make_dictionary_primitive() -> ArrayRef { - // Pick Int32 arbitrarily for dictionary values - let mut b: PrimitiveDictionaryBuilder = - PrimitiveDictionaryBuilder::new(); - b.append(1).unwrap(); - b.append(2).unwrap(); - Arc::new(b.finish()) - } - - /// Creates a dictionary with utf8 values, and keys of type K - #[cfg(feature = "chrono-tz")] - fn make_dictionary_utf8() -> ArrayRef { - // Pick Int32 arbitrarily for dictionary values - let mut b: StringDictionaryBuilder = StringDictionaryBuilder::new(); - b.append("foo").unwrap(); - b.append("bar").unwrap(); - Arc::new(b.finish()) - } - - // Get a selection of datatypes to try and cast to - #[cfg(feature = "chrono-tz")] - fn get_all_types() -> Vec { - use DataType::*; - let tz_name = String::from("America/New_York"); - - vec![ - Null, - Boolean, - Int8, - Int16, - Int32, - UInt64, - UInt8, - UInt16, - UInt32, - UInt64, - Float16, - Float32, - Float64, - Timestamp(TimeUnit::Second, None), - Timestamp(TimeUnit::Millisecond, None), - Timestamp(TimeUnit::Microsecond, None), - Timestamp(TimeUnit::Nanosecond, None), - Timestamp(TimeUnit::Second, Some(tz_name.clone())), - Timestamp(TimeUnit::Millisecond, Some(tz_name.clone())), - Timestamp(TimeUnit::Microsecond, Some(tz_name.clone())), - Timestamp(TimeUnit::Nanosecond, Some(tz_name)), - Date32, - Date64, - Time32(TimeUnit::Second), - Time32(TimeUnit::Millisecond), - Time64(TimeUnit::Microsecond), - Time64(TimeUnit::Nanosecond), - Duration(TimeUnit::Second), - Duration(TimeUnit::Millisecond), - Duration(TimeUnit::Microsecond), - Duration(TimeUnit::Nanosecond), - Interval(IntervalUnit::YearMonth), - Interval(IntervalUnit::DayTime), - Interval(IntervalUnit::MonthDayNano), - Binary, - FixedSizeBinary(10), - LargeBinary, - Utf8, - LargeUtf8, - List(Box::new(Field::new("item", DataType::Int8, true))), - List(Box::new(Field::new("item", DataType::Utf8, true))), - 
FixedSizeList(Box::new(Field::new("item", DataType::Int8, true)), 10), - FixedSizeList(Box::new(Field::new("item", DataType::Utf8, false)), 10), - LargeList(Box::new(Field::new("item", DataType::Int8, true))), - LargeList(Box::new(Field::new("item", DataType::Utf8, false))), - Struct(vec![ - Field::new("f1", DataType::Int32, false), - Field::new("f2", DataType::Utf8, true), - ]), - Union( - vec![ - Field::new("f1", DataType::Int32, false), - Field::new("f2", DataType::Utf8, true), - ], - vec![0, 1], - UnionMode::Dense, - ), - Dictionary(Box::new(DataType::Int8), Box::new(DataType::Int32)), - Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), - Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)), - Decimal128(38, 0), - ] - } - #[test] fn test_utf8_cast_offsets() { // test if offset of the array is taken into account during cast @@ -7169,41 +6886,6 @@ mod tests { assert_eq!(&out1, &out2.slice(1, 2)) } - #[test] - #[cfg(feature = "chrono-tz")] - fn test_timestamp_cast_utf8() { - let array: PrimitiveArray = - vec![Some(37800000000), None, Some(86339000000)].into(); - let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); - - let expected = StringArray::from(vec![ - Some("1970-01-01 10:30:00"), - None, - Some("1970-01-01 23:58:59"), - ]); - - assert_eq!( - out.as_any().downcast_ref::().unwrap(), - &expected - ); - - let array: PrimitiveArray = - vec![Some(37800000000), None, Some(86339000000)].into(); - let array = array.with_timezone("Australia/Sydney".to_string()); - let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); - - let expected = StringArray::from(vec![ - Some("1970-01-01 20:30:00 +10:00"), - None, - Some("1970-01-02 09:58:59 +10:00"), - ]); - - assert_eq!( - out.as_any().downcast_ref::().unwrap(), - &expected - ); - } - #[test] fn test_list_to_string() { let str_array = StringArray::from(vec!["a", "b", "c", "d", "e", "f", "g", "h"]); @@ -7268,7 +6950,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_f64_to_decimal128() { // to reproduce https://github.com/apache/arrow-rs/issues/2997 diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index a97ec1ac123f..876d0d65084e 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -273,3 +273,7 @@ required-features = ["csv", "chrono-tz"] [[test]] name = "pyarrow" required-features = ["pyarrow"] + +[[test]] +name = "array_cast" +required-features = ["chrono-tz"] diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs new file mode 100644 index 000000000000..95fb973289a5 --- /dev/null +++ b/arrow/tests/array_cast.rs @@ -0,0 +1,407 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow_array::builder::{ + PrimitiveDictionaryBuilder, StringDictionaryBuilder, UnionBuilder, +}; +use arrow_array::types::{ + ArrowDictionaryKeyType, Int16Type, Int32Type, Int64Type, Int8Type, + TimestampMicrosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, +}; +use arrow_array::{ + Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, + Decimal128Array, DurationMicrosecondArray, DurationMillisecondArray, + DurationNanosecondArray, DurationSecondArray, FixedSizeBinaryArray, + FixedSizeListArray, Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, + Int64Array, Int8Array, IntervalDayTimeArray, IntervalMonthDayNanoArray, + IntervalYearMonthArray, LargeBinaryArray, LargeListArray, LargeStringArray, + ListArray, NullArray, PrimitiveArray, StringArray, StructArray, + Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, + Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, + UInt64Array, UInt8Array, UnionArray, +}; +use arrow_buffer::Buffer; +use arrow_cast::{can_cast_types, cast}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, Field, IntervalUnit, TimeUnit, UnionMode}; +use half::f16; +use std::sync::Arc; + +#[test] +fn test_cast_timestamp_to_string() { + let a = TimestampMillisecondArray::from(vec![ + Some(864000000005), + Some(1545696000001), + None, + ]) + .with_timezone("UTC".to_string()); + let array = Arc::new(a) as ArrayRef; + dbg!(&array); + let b = cast(&array, &DataType::Utf8).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(&DataType::Utf8, c.data_type()); + assert_eq!("1997-05-19 00:00:00.005 +00:00", c.value(0)); + assert_eq!("2018-12-25 00:00:00.001 +00:00", c.value(1)); + assert!(c.is_null(2)); +} + +#[test] +#[cfg_attr(miri, ignore)] // running forever +fn test_can_cast_types() { + // this function attempts to ensure that can_cast_types stays + // in sync with cast. It simply tries all combinations of + // types and makes sure that if `can_cast_types` returns + // true, so does `cast` + + let all_types = get_all_types(); + + for array in get_arrays_of_all_types() { + for to_type in &all_types { + println!("Test casting {:?} --> {:?}", array.data_type(), to_type); + let cast_result = cast(&array, to_type); + let reported_cast_ability = can_cast_types(array.data_type(), to_type); + + // check for mismatch + match (cast_result, reported_cast_ability) { + (Ok(_), false) => { + panic!("Was able to cast array {:?} from {:?} to {:?} but can_cast_types reported false", + array, array.data_type(), to_type) + } + (Err(e), true) => { + panic!("Was not able to cast array {:?} from {:?} to {:?} but can_cast_types reported true. 
\ + Error was {:?}", + array, array.data_type(), to_type, e) + } + // otherwise it was a match + _ => {} + }; + } + } +} + +/// Create instances of arrays with varying types for cast tests +fn get_arrays_of_all_types() -> Vec { + let tz_name = String::from("America/New_York"); + let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"]; + vec![ + Arc::new(BinaryArray::from(binary_data.clone())), + Arc::new(LargeBinaryArray::from(binary_data.clone())), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + Arc::new(make_list_array()), + Arc::new(make_large_list_array()), + Arc::new(make_fixed_size_list_array()), + Arc::new(make_fixed_size_binary_array()), + Arc::new(StructArray::from(vec![ + ( + Field::new("a", DataType::Boolean, false), + Arc::new(BooleanArray::from(vec![false, false, true, true])) + as Arc, + ), + ( + Field::new("b", DataType::Int32, false), + Arc::new(Int32Array::from(vec![42, 28, 19, 31])), + ), + ])), + Arc::new(make_union_array()), + Arc::new(NullArray::new(10)), + Arc::new(StringArray::from(vec!["foo", "bar"])), + Arc::new(LargeStringArray::from(vec!["foo", "bar"])), + Arc::new(BooleanArray::from(vec![true, false])), + Arc::new(Int8Array::from(vec![1, 2])), + Arc::new(Int16Array::from(vec![1, 2])), + Arc::new(Int32Array::from(vec![1, 2])), + Arc::new(Int64Array::from(vec![1, 2])), + Arc::new(UInt8Array::from(vec![1, 2])), + Arc::new(UInt16Array::from(vec![1, 2])), + Arc::new(UInt32Array::from(vec![1, 2])), + Arc::new(UInt64Array::from(vec![1, 2])), + Arc::new( + [Some(f16::from_f64(1.0)), Some(f16::from_f64(2.0))] + .into_iter() + .collect::(), + ), + Arc::new(Float32Array::from(vec![1.0, 2.0])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + Arc::new(TimestampSecondArray::from(vec![1000, 2000])), + Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])), + Arc::new(TimestampMicrosecondArray::from(vec![1000, 2000])), + Arc::new(TimestampNanosecondArray::from(vec![1000, 2000])), + Arc::new( + TimestampSecondArray::from(vec![1000, 2000]).with_timezone(tz_name.clone()), + ), + Arc::new( + TimestampMillisecondArray::from(vec![1000, 2000]) + .with_timezone(tz_name.clone()), + ), + Arc::new( + TimestampMicrosecondArray::from(vec![1000, 2000]) + .with_timezone(tz_name.clone()), + ), + Arc::new(TimestampNanosecondArray::from(vec![1000, 2000]).with_timezone(tz_name)), + Arc::new(Date32Array::from(vec![1000, 2000])), + Arc::new(Date64Array::from(vec![1000, 2000])), + Arc::new(Time32SecondArray::from(vec![1000, 2000])), + Arc::new(Time32MillisecondArray::from(vec![1000, 2000])), + Arc::new(Time64MicrosecondArray::from(vec![1000, 2000])), + Arc::new(Time64NanosecondArray::from(vec![1000, 2000])), + Arc::new(IntervalYearMonthArray::from(vec![1000, 2000])), + Arc::new(IntervalDayTimeArray::from(vec![1000, 2000])), + Arc::new(IntervalMonthDayNanoArray::from(vec![1000, 2000])), + Arc::new(DurationSecondArray::from(vec![1000, 2000])), + Arc::new(DurationMillisecondArray::from(vec![1000, 2000])), + Arc::new(DurationMicrosecondArray::from(vec![1000, 2000])), + Arc::new(DurationNanosecondArray::from(vec![1000, 2000])), + Arc::new( + 
create_decimal_array(vec![Some(1), Some(2), Some(3), None], 38, 0).unwrap(), + ), + ] +} + +fn make_fixed_size_list_array() -> FixedSizeListArray { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // Construct a fixed size list array from the above two + let list_data_type = + DataType::FixedSizeList(Box::new(Field::new("item", DataType::Int32, true)), 2); + let list_data = ArrayData::builder(list_data_type) + .len(5) + .add_child_data(value_data) + .build() + .unwrap(); + FixedSizeListArray::from(list_data) +} + +fn make_fixed_size_binary_array() -> FixedSizeBinaryArray { + let values: [u8; 15] = *b"hellotherearrow"; + + let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) + .len(3) + .add_buffer(Buffer::from(&values[..])) + .build() + .unwrap(); + FixedSizeBinaryArray::from(array_data) +} + +fn make_list_array() -> ListArray { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); + + // Construct a list array from the above two + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + let list_data = ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .add_child_data(value_data) + .build() + .unwrap(); + ListArray::from(list_data) +} + +fn make_large_list_array() -> LargeListArray { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 8]); + + // Construct a list array from the above two + let list_data_type = + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); + let list_data = ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .add_child_data(value_data) + .build() + .unwrap(); + LargeListArray::from(list_data) +} + +fn make_union_array() -> UnionArray { + let mut builder = UnionBuilder::with_capacity_dense(7); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.build().unwrap() +} + +/// Creates a dictionary with primitive dictionary values, and keys of type K +fn make_dictionary_primitive() -> ArrayRef { + // Pick Int32 arbitrarily for dictionary values + let mut b: PrimitiveDictionaryBuilder = + PrimitiveDictionaryBuilder::new(); + b.append(1).unwrap(); + b.append(2).unwrap(); + Arc::new(b.finish()) +} + +/// Creates a dictionary with utf8 values, and keys of type K +fn make_dictionary_utf8() -> ArrayRef { + // Pick Int32 arbitrarily for dictionary values + let mut b: StringDictionaryBuilder = StringDictionaryBuilder::new(); + b.append("foo").unwrap(); + b.append("bar").unwrap(); + Arc::new(b.finish()) +} + +fn create_decimal_array( + array: Vec>, + precision: u8, + scale: i8, +) -> Result { + array + .into_iter() + .collect::() + .with_precision_and_scale(precision, scale) +} + +// Get a selection of datatypes to try and cast to +fn get_all_types() -> Vec { + use DataType::*; + let tz_name = 
String::from("America/New_York"); + + vec![ + Null, + Boolean, + Int8, + Int16, + Int32, + UInt64, + UInt8, + UInt16, + UInt32, + UInt64, + Float16, + Float32, + Float64, + Timestamp(TimeUnit::Second, None), + Timestamp(TimeUnit::Millisecond, None), + Timestamp(TimeUnit::Microsecond, None), + Timestamp(TimeUnit::Nanosecond, None), + Timestamp(TimeUnit::Second, Some(tz_name.clone())), + Timestamp(TimeUnit::Millisecond, Some(tz_name.clone())), + Timestamp(TimeUnit::Microsecond, Some(tz_name.clone())), + Timestamp(TimeUnit::Nanosecond, Some(tz_name)), + Date32, + Date64, + Time32(TimeUnit::Second), + Time32(TimeUnit::Millisecond), + Time64(TimeUnit::Microsecond), + Time64(TimeUnit::Nanosecond), + Duration(TimeUnit::Second), + Duration(TimeUnit::Millisecond), + Duration(TimeUnit::Microsecond), + Duration(TimeUnit::Nanosecond), + Interval(IntervalUnit::YearMonth), + Interval(IntervalUnit::DayTime), + Interval(IntervalUnit::MonthDayNano), + Binary, + FixedSizeBinary(10), + LargeBinary, + Utf8, + LargeUtf8, + List(Box::new(Field::new("item", DataType::Int8, true))), + List(Box::new(Field::new("item", DataType::Utf8, true))), + FixedSizeList(Box::new(Field::new("item", DataType::Int8, true)), 10), + FixedSizeList(Box::new(Field::new("item", DataType::Utf8, false)), 10), + LargeList(Box::new(Field::new("item", DataType::Int8, true))), + LargeList(Box::new(Field::new("item", DataType::Utf8, false))), + Struct(vec![ + Field::new("f1", DataType::Int32, true), + Field::new("f2", DataType::Utf8, true), + ]), + Union( + vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Utf8, true), + ], + vec![0, 1], + UnionMode::Dense, + ), + Dictionary(Box::new(DataType::Int8), Box::new(DataType::Int32)), + Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), + Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)), + Decimal128(38, 0), + ] +} + +#[test] +fn test_timestamp_cast_utf8() { + let array: PrimitiveArray = + vec![Some(37800000000), None, Some(86339000000)].into(); + let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); + + let expected = StringArray::from(vec![ + Some("1970-01-01 10:30:00"), + None, + Some("1970-01-01 23:58:59"), + ]); + + assert_eq!( + out.as_any().downcast_ref::().unwrap(), + &expected + ); + + let array: PrimitiveArray = + vec![Some(37800000000), None, Some(86339000000)].into(); + let array = array.with_timezone("Australia/Sydney".to_string()); + let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); + + let expected = StringArray::from(vec![ + Some("1970-01-01 20:30:00 +10:00"), + None, + Some("1970-01-02 09:58:59 +10:00"), + ]); + + assert_eq!( + out.as_any().downcast_ref::().unwrap(), + &expected + ); +} From 989ab8d7a28745e76296a27445bf49921ec2d1cd Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 30 Nov 2022 12:35:49 -0800 Subject: [PATCH 0358/1411] Remove negative scale check (#3230) * Remove negative scale check * Update datatype doc for negative scale * Update Decimal128Array and Decimal256Array. 
--- arrow-array/src/array/primitive_array.rs | 11 ++--------- arrow-schema/src/datatype.rs | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 7c201177f045..4ff0ed4d93e6 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -203,10 +203,10 @@ pub type DurationMicrosecondArray = PrimitiveArray; pub type DurationNanosecondArray = PrimitiveArray; /// An array where each element is a 128-bits decimal with precision in [1, 38] and -/// scale in [-38, 38]. +/// scale less or equal to 38. pub type Decimal128Array = PrimitiveArray; /// An array where each element is a 256-bits decimal with precision in [1, 76] and -/// scale in [-76, 76]. +/// scale less or equal to 76. pub type Decimal256Array = PrimitiveArray; /// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the @@ -1121,13 +1121,6 @@ impl PrimitiveArray { T::MAX_SCALE ))); } - if scale < -T::MAX_SCALE { - return Err(ArrowError::InvalidArgumentError(format!( - "scale {} is smaller than min {}", - scale, - -Decimal128Type::MAX_SCALE - ))); - } if scale > 0 && scale as u8 > precision { return Err(ArrowError::InvalidArgumentError(format!( "scale {} is greater than precision {}", diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index f1d13aefd279..4162d41bf1b4 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -190,6 +190,13 @@ pub enum DataType { /// * scale is the number of digits past the decimal /// /// For example the number 123.45 has precision 5 and scale 2. + /// + /// In certain situations, scale could be negative number. For + /// negative scale, it is the number of padding 0 to the right + /// of the digits. + /// + /// For example the number 12300 could be treated as a decimal + /// has precision 3 and scale -2. Decimal128(u8, i8), /// Exact 256-bit width decimal value with precision and scale /// @@ -197,6 +204,13 @@ pub enum DataType { /// * scale is the number of digits past the decimal /// /// For example the number 123.45 has precision 5 and scale 2. + /// + /// In certain situations, scale could be negative number. For + /// negative scale, it is the number of padding 0 to the right + /// of the digits. + /// + /// For example the number 12300 could be treated as a decimal + /// has precision 3 and scale -2. 
Decimal256(u8, i8), /// A Map is a logical nested type that is represented as /// From 961e114af0bd74d31dfcaa30e91f9929a6e6d719 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 30 Nov 2022 14:56:18 -0800 Subject: [PATCH 0359/1411] Add binary_mut and try_binary_mut (#3144) * Add add_mut * Add try_binary_mut * Add test * Change result type * Remove _mut kernels * Fix clippy --- arrow/src/compute/kernels/arithmetic.rs | 31 +++- arrow/src/compute/kernels/arity.rs | 216 ++++++++++++++++++++++++ 2 files changed, 246 insertions(+), 1 deletion(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index f9deada5389b..c57e27095c23 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -1624,7 +1624,7 @@ where mod tests { use super::*; use crate::array::Int32Array; - use crate::compute::{try_unary_mut, unary_mut}; + use crate::compute::{binary_mut, try_binary_mut, try_unary_mut, unary_mut}; use crate::datatypes::{Date64Type, Int32Type, Int8Type}; use arrow_buffer::i256; use chrono::NaiveDate; @@ -3100,6 +3100,35 @@ mod tests { assert_eq!(result.null_count(), 13); } + #[test] + fn test_primitive_array_add_mut_by_binary_mut() { + let a = Int32Array::from(vec![15, 14, 9, 8, 1]); + let b = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]); + + let c = binary_mut(a, &b, |a, b| a.add_wrapping(b)) + .unwrap() + .unwrap(); + let expected = Int32Array::from(vec![Some(16), None, Some(12), None, Some(6)]); + assert_eq!(c, expected); + } + + #[test] + fn test_primitive_add_mut_wrapping_overflow_by_try_binary_mut() { + let a = Int32Array::from(vec![i32::MAX, i32::MIN]); + let b = Int32Array::from(vec![1, 1]); + + let wrapped = binary_mut(a, &b, |a, b| a.add_wrapping(b)) + .unwrap() + .unwrap(); + let expected = Int32Array::from(vec![-2147483648, -2147483647]); + assert_eq!(expected, wrapped); + + let a = Int32Array::from(vec![i32::MAX, i32::MIN]); + let b = Int32Array::from(vec![1, 1]); + let overflow = try_binary_mut(a, &b, |a, b| a.add_checked(b)); + let _ = overflow.unwrap().expect_err("overflow should be detected"); + } + #[test] fn test_primitive_add_scalar_by_unary_mut() { let a = Int32Array::from(vec![15, 14, 9, 8, 1]); diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index 946d15e9e984..d0f18cf5866d 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -232,6 +232,75 @@ where Ok(unsafe { build_primitive_array(len, buffer, null_count, null_buffer) }) } +/// Given two arrays of length `len`, calls `op(a[i], b[i])` for `i` in `0..len`, mutating +/// the mutable [`PrimitiveArray`] `a`. If any index is null in either `a` or `b`, the +/// corresponding index in the result will also be null. +/// +/// Mutable primitive array means that the buffer is not shared with other arrays. +/// As a result, this mutates the buffer directly without allocating new buffer. +/// +/// Like [`unary`] the provided function is evaluated for every index, ignoring validity. This +/// is beneficial when the cost of the operation is low compared to the cost of branching, and +/// especially when the operation can be vectorised, however, requires `op` to be infallible +/// for all possible values of its inputs +/// +/// # Error +/// +/// This function gives error if the arrays have different lengths. +/// This function gives error of original [`PrimitiveArray`] `a` if it is not a mutable +/// primitive array. 
+pub fn binary_mut( + a: PrimitiveArray, + b: &PrimitiveArray, + op: F, +) -> std::result::Result< + std::result::Result, ArrowError>, + PrimitiveArray, +> +where + T: ArrowPrimitiveType, + F: Fn(T::Native, T::Native) -> T::Native, +{ + if a.len() != b.len() { + return Ok(Err(ArrowError::ComputeError( + "Cannot perform binary operation on arrays of different length".to_string(), + ))); + } + + if a.is_empty() { + return Ok(Ok(PrimitiveArray::from(ArrayData::new_empty( + &T::DATA_TYPE, + )))); + } + + let len = a.len(); + + let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); + let null_count = null_buffer + .as_ref() + .map(|x| len - x.count_set_bits_offset(0, len)) + .unwrap_or_default(); + + let mut builder = a.into_builder()?; + + builder + .values_slice_mut() + .iter_mut() + .zip(b.values()) + .for_each(|(l, r)| *l = op(*l, *r)); + + let array_builder = builder + .finish() + .data() + .clone() + .into_builder() + .null_bit_buffer(null_buffer) + .null_count(null_count); + + let array_data = unsafe { array_builder.build_unchecked() }; + Ok(Ok(PrimitiveArray::::from(array_data))) +} + /// Applies the provided fallible binary operation across `a` and `b`, returning any error, /// and collecting the results into a [`PrimitiveArray`]. If any index is null in either `a` /// or `b`, the corresponding index in the result will also be null @@ -289,6 +358,83 @@ where } } +/// Applies the provided fallible binary operation across `a` and `b` by mutating the mutable +/// [`PrimitiveArray`] `a` with the results, returning any error. If any index is null in +/// either `a` or `b`, the corresponding index in the result will also be null +/// +/// Like [`try_unary`] the function is only evaluated for non-null indices +/// +/// Mutable primitive array means that the buffer is not shared with other arrays. +/// As a result, this mutates the buffer directly without allocating new buffer. +/// +/// # Error +/// +/// Return an error if the arrays have different lengths or +/// the operation is under erroneous. +/// This function gives error of original [`PrimitiveArray`] `a` if it is not a mutable +/// primitive array. +pub fn try_binary_mut( + a: PrimitiveArray, + b: &PrimitiveArray, + op: F, +) -> std::result::Result< + std::result::Result, ArrowError>, + PrimitiveArray, +> +where + T: ArrowPrimitiveType, + F: Fn(T::Native, T::Native) -> Result, +{ + if a.len() != b.len() { + return Ok(Err(ArrowError::ComputeError( + "Cannot perform binary operation on arrays of different length".to_string(), + ))); + } + let len = a.len(); + + if a.is_empty() { + return Ok(Ok(PrimitiveArray::from(ArrayData::new_empty( + &T::DATA_TYPE, + )))); + } + + if a.null_count() == 0 && b.null_count() == 0 { + try_binary_no_nulls_mut(len, a, b, op) + } else { + let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); + let null_count = null_buffer + .as_ref() + .map(|x| len - x.count_set_bits_offset(0, len)) + .unwrap_or_default(); + + let mut builder = a.into_builder()?; + + let slice = builder.values_slice_mut(); + + match try_for_each_valid_idx(len, 0, null_count, null_buffer.as_deref(), |idx| { + unsafe { + *slice.get_unchecked_mut(idx) = + op(*slice.get_unchecked(idx), b.value_unchecked(idx))? 
+ }; + Ok::<_, ArrowError>(()) + }) { + Ok(_) => {} + Err(err) => return Ok(Err(err)), + }; + + let array_builder = builder + .finish() + .data() + .clone() + .into_builder() + .null_bit_buffer(null_buffer) + .null_count(null_count); + + let array_data = unsafe { array_builder.build_unchecked() }; + Ok(Ok(PrimitiveArray::::from(array_data))) + } +} + /// This intentional inline(never) attribute helps LLVM optimize the loop. #[inline(never)] fn try_binary_no_nulls( @@ -310,6 +456,35 @@ where Ok(unsafe { build_primitive_array(len, buffer.into(), 0, None) }) } +/// This intentional inline(never) attribute helps LLVM optimize the loop. +#[inline(never)] +fn try_binary_no_nulls_mut( + len: usize, + a: PrimitiveArray, + b: &PrimitiveArray, + op: F, +) -> std::result::Result< + std::result::Result, ArrowError>, + PrimitiveArray, +> +where + T: ArrowPrimitiveType, + F: Fn(T::Native, T::Native) -> Result, +{ + let mut builder = a.into_builder()?; + let slice = builder.values_slice_mut(); + + for idx in 0..len { + unsafe { + match op(*slice.get_unchecked(idx), b.value_unchecked(idx)) { + Ok(value) => *slice.get_unchecked_mut(idx) = value, + Err(err) => return Ok(Err(err)), + }; + }; + } + Ok(Ok(builder.finish())) +} + #[inline(never)] fn try_binary_opt_no_nulls( len: usize, @@ -385,6 +560,7 @@ mod tests { use super::*; use crate::array::{as_primitive_array, Float64Array, PrimitiveDictionaryBuilder}; use crate::datatypes::{Float64Type, Int32Type, Int8Type}; + use arrow_array::Int32Array; #[test] fn test_unary_f64_slice() { @@ -444,4 +620,44 @@ mod tests { &expected ); } + + #[test] + fn test_binary_mut() { + let a = Int32Array::from(vec![15, 14, 9, 8, 1]); + let b = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]); + let c = binary_mut(a, &b, |l, r| l + r).unwrap().unwrap(); + + let expected = Int32Array::from(vec![Some(16), None, Some(12), None, Some(6)]); + assert_eq!(c, expected); + } + + #[test] + fn test_try_binary_mut() { + let a = Int32Array::from(vec![15, 14, 9, 8, 1]); + let b = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]); + let c = try_binary_mut(a, &b, |l, r| Ok(l + r)).unwrap().unwrap(); + + let expected = Int32Array::from(vec![Some(16), None, Some(12), None, Some(6)]); + assert_eq!(c, expected); + + let a = Int32Array::from(vec![15, 14, 9, 8, 1]); + let b = Int32Array::from(vec![1, 2, 3, 4, 5]); + let c = try_binary_mut(a, &b, |l, r| Ok(l + r)).unwrap().unwrap(); + let expected = Int32Array::from(vec![16, 16, 12, 12, 6]); + assert_eq!(c, expected); + + let a = Int32Array::from(vec![15, 14, 9, 8, 1]); + let b = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]); + let _ = try_binary_mut(a, &b, |l, r| { + if l == 1 { + Err(ArrowError::InvalidArgumentError( + "got error".parse().unwrap(), + )) + } else { + Ok(l + r) + } + }) + .unwrap() + .expect_err("should got error"); + } } From c5c34fa43141d01709485d3a008d3df93262c49c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 1 Dec 2022 11:40:58 +0000 Subject: [PATCH 0360/1411] Don't recurse to children in ArrayData::try_new (#3248) * Don't recurse to children in ArrayData::validate_full * Add validate_data and update ArrayData::try_new --- arrow-data/src/data.rs | 81 +++++++++++++++++++++------------ arrow/tests/array_validation.rs | 34 -------------- 2 files changed, 51 insertions(+), 64 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 811696e4dd17..b230dfdb7564 100644 --- a/arrow-data/src/data.rs +++ 
b/arrow-data/src/data.rs @@ -286,7 +286,7 @@ impl ArrayData { /// # Safety /// /// The input values *must* form a valid Arrow array for - /// `data_type`, or undefined behavior can results. + /// `data_type`, or undefined behavior can result. /// /// Note: This is a low level API and most users of the arrow /// crate should create arrays using the methods in the `array` @@ -318,19 +318,20 @@ impl ArrayData { // Provide a force_validate mode #[cfg(feature = "force_validate")] - new_self.validate_full().unwrap(); + new_self.validate_data().unwrap(); new_self } - /// Create a new ArrayData, validating that the provided buffers - /// form a valid Arrow array of the specified data type. + /// Create a new ArrayData, validating that the provided buffers form a valid + /// Arrow array of the specified data type. /// /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer /// is set to `None`. /// - /// Note: This is a low level API and most users of the arrow - /// crate should create arrays using the methods in the `array` - /// module. + /// Internally this calls through to [`Self::validate_data`] + /// + /// Note: This is a low level API and most users of the arrow crate should create + /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array) pub fn try_new( data_type: DataType, len: usize, @@ -366,7 +367,10 @@ impl ArrayData { }; // As the data is not trusted, do a full validation of its contents - new_self.validate_full()?; + // We don't need to validate children as we can assume that the + // [`ArrayData`] in `child_data` have already been validated through + // a call to `ArrayData::try_new` or created using unsafe + new_self.validate_data()?; Ok(new_self) } @@ -617,8 +621,8 @@ impl ArrayData { /// contents of the buffers (e.g. that all offsets for UTF8 arrays /// are within the bounds of the values buffer). /// - /// See [ArrayData::validate_full] to validate fully the offset content - /// and the validitiy of utf8 data + /// See [ArrayData::validate_data] to validate fully the offset content + /// and the validity of utf8 data pub fn validate(&self) -> Result<(), ArrowError> { // Need at least this mich space in each buffer let len_plus_offset = self.len + self.offset; @@ -954,35 +958,34 @@ impl ArrayData { Ok(values_data) } - /// "expensive" validation that ensures: + /// Validate that the data contained within this [`ArrayData`] is valid /// /// 1. Null count is correct /// 2. All offsets are valid /// 3. All String data is valid UTF-8 /// 4. All dictionary offsets are valid /// - /// Does not (yet) check - /// 1. 
Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) - /// Note calls `validate()` internally - pub fn validate_full(&self) -> Result<(), ArrowError> { - // Check all buffer sizes prior to looking at them more deeply in this function + /// Internally this calls: + /// + /// * [`Self::validate`] + /// * [`Self::validate_nulls`] + /// * [`Self::validate_values`] + /// + /// Note: this does not recurse into children, for a recursive variant + /// see [`Self::validate_full`] + pub fn validate_data(&self) -> Result<(), ArrowError> { self.validate()?; - - let null_bitmap_buffer = self - .null_bitmap - .as_ref() - .map(|null_bitmap| null_bitmap.buffer_ref()); - - let actual_null_count = count_nulls(null_bitmap_buffer, self.offset, self.len); - if actual_null_count != self.null_count { - return Err(ArrowError::InvalidArgumentError(format!( - "null_count value ({}) doesn't match actual number of nulls in array ({})", - self.null_count, actual_null_count - ))); - } - + self.validate_nulls()?; self.validate_values()?; + Ok(()) + } + /// Performs a full recursive validation of this [`ArrayData`] and all its children + /// + /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`] + /// and all its children recursively + pub fn validate_full(&self) -> Result<(), ArrowError> { + self.validate_data()?; // validate all children recursively self.child_data .iter() @@ -995,10 +998,28 @@ impl ArrayData { )) }) })?; + Ok(()) + } + /// Validates the the null count is correct + pub fn validate_nulls(&self) -> Result<(), ArrowError> { + let nulls = self.null_buffer(); + + let actual_null_count = count_nulls(nulls, self.offset, self.len); + if actual_null_count != self.null_count { + return Err(ArrowError::InvalidArgumentError(format!( + "null_count value ({}) doesn't match actual number of nulls in array ({})", + self.null_count, actual_null_count + ))); + } Ok(()) } + /// Validates the values stored within this [`ArrayData`] are valid + /// without recursing into child [`ArrayData`] + /// + /// Does not (yet) check + /// 1. 
Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) pub fn validate_values(&self) -> Result<(), ArrowError> { match &self.data_type { DataType::Utf8 => self.validate_utf8::(), diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 4faf69658e6a..64c433a6616a 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -753,40 +753,6 @@ fn test_validate_list_negative_offsets() { .unwrap(); } -#[test] -#[should_panic(expected = "Value at position 1 out of bounds: -1 (should be in [0, 1])")] -/// test that children are validated recursively (aka bugs in child data of struct also are flagged) -fn test_validate_recursive() { - // Form invalid dictionary array - let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); - // -1 is not a valid index - let keys: Int32Array = [Some(1), Some(-1), Some(1)].into_iter().collect(); - - let dict_data_type = DataType::Dictionary( - Box::new(keys.data_type().clone()), - Box::new(values.data_type().clone()), - ); - - // purposely create an invalid child data - let dict_data = unsafe { - ArrayData::new_unchecked( - dict_data_type, - 2, - None, - None, - 0, - vec![keys.data().buffers()[0].clone()], - vec![values.into_data()], - ) - }; - - // Now, try and create a struct with this invalid child data (and expect an error) - let data_type = - DataType::Struct(vec![Field::new("d", dict_data.data_type().clone(), true)]); - - ArrayData::try_new(data_type, 1, None, 0, vec![], vec![dict_data]).unwrap(); -} - /// returns a buffer initialized with some constant value for tests fn make_i32_buffer(n: usize) -> Buffer { Buffer::from_slice_ref(&vec![42i32; n]) From f133621d0f56ebbf23392f9119349d0201bf51cd Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 1 Dec 2022 19:41:25 +0800 Subject: [PATCH 0361/1411] update `&Option` to `Option<&T>` (#3249) * update ref to option to option of ref * change more tests * fix as ref --- arrow-cast/src/cast.rs | 48 +++++++++++++++------------ arrow-ipc/src/compression.rs | 2 +- arrow-ipc/src/writer.rs | 8 ++--- parquet/src/arrow/arrow_reader/mod.rs | 7 ++-- parquet/src/arrow/async_reader.rs | 6 ++-- parquet/src/column/writer/mod.rs | 4 +-- parquet/src/file/metadata.rs | 4 +-- parquet/tests/arrow_writer_layout.rs | 2 +- parquet_derive/src/parquet_field.rs | 4 +-- 9 files changed, 45 insertions(+), 40 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index ad9f08388326..649d5ca90007 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -1054,17 +1054,20 @@ pub fn cast_with_options( Int64 => cast_numeric_to_string::(array), Float32 => cast_numeric_to_string::(array), Float64 => cast_numeric_to_string::(array), - Timestamp(TimeUnit::Nanosecond, tz) => { - cast_timestamp_to_string::(array, tz) - } - Timestamp(TimeUnit::Microsecond, tz) => { - cast_timestamp_to_string::(array, tz) - } - Timestamp(TimeUnit::Millisecond, tz) => { - cast_timestamp_to_string::(array, tz) - } + Timestamp(TimeUnit::Nanosecond, tz) => cast_timestamp_to_string::< + TimestampNanosecondType, + i32, + >(array, tz.as_ref()), + Timestamp(TimeUnit::Microsecond, tz) => cast_timestamp_to_string::< + TimestampMicrosecondType, + i32, + >(array, tz.as_ref()), + Timestamp(TimeUnit::Millisecond, tz) => cast_timestamp_to_string::< + TimestampMillisecondType, + i32, + >(array, tz.as_ref()), Timestamp(TimeUnit::Second, tz) => { - cast_timestamp_to_string::(array, tz) + cast_timestamp_to_string::(array, tz.as_ref()) } Date32 => 
cast_date32_to_string::(array), Date64 => cast_date64_to_string::(array), @@ -1108,17 +1111,20 @@ pub fn cast_with_options( Int64 => cast_numeric_to_string::(array), Float32 => cast_numeric_to_string::(array), Float64 => cast_numeric_to_string::(array), - Timestamp(TimeUnit::Nanosecond, tz) => { - cast_timestamp_to_string::(array, tz) - } - Timestamp(TimeUnit::Microsecond, tz) => { - cast_timestamp_to_string::(array, tz) - } - Timestamp(TimeUnit::Millisecond, tz) => { - cast_timestamp_to_string::(array, tz) - } + Timestamp(TimeUnit::Nanosecond, tz) => cast_timestamp_to_string::< + TimestampNanosecondType, + i64, + >(array, tz.as_ref()), + Timestamp(TimeUnit::Microsecond, tz) => cast_timestamp_to_string::< + TimestampMicrosecondType, + i64, + >(array, tz.as_ref()), + Timestamp(TimeUnit::Millisecond, tz) => cast_timestamp_to_string::< + TimestampMillisecondType, + i64, + >(array, tz.as_ref()), Timestamp(TimeUnit::Second, tz) => { - cast_timestamp_to_string::(array, tz) + cast_timestamp_to_string::(array, tz.as_ref()) } Date32 => cast_date32_to_string::(array), Date64 => cast_date64_to_string::(array), @@ -2474,7 +2480,7 @@ where /// Cast timestamp types to Utf8/LargeUtf8 fn cast_timestamp_to_string( array: &ArrayRef, - tz: &Option, + tz: Option<&String>, ) -> Result where T: ArrowTemporalType + ArrowPrimitiveType, diff --git a/arrow-ipc/src/compression.rs b/arrow-ipc/src/compression.rs index 6349ac232431..f64d14441cb1 100644 --- a/arrow-ipc/src/compression.rs +++ b/arrow-ipc/src/compression.rs @@ -22,8 +22,8 @@ use arrow_schema::ArrowError; const LENGTH_NO_COMPRESSED_DATA: i64 = -1; const LENGTH_OF_PREFIX_DATA: i64 = 8; -#[derive(Debug, Clone, Copy, PartialEq, Eq)] /// Represents compressing a ipc stream using a particular compression algorithm +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum CompressionCodec { Lz4Frame, Zstd, diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 032783deed72..5f188fe1a9fc 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -377,7 +377,7 @@ impl IpcDataGenerator { offset, array.len(), array.null_count(), - &compression_codec, + compression_codec, write_options, )?; } @@ -452,7 +452,7 @@ impl IpcDataGenerator { 0, array_data.len(), array_data.null_count(), - &compression_codec, + compression_codec, write_options, )?; @@ -1058,7 +1058,7 @@ fn write_array_data( offset: i64, num_rows: usize, null_count: usize, - compression_codec: &Option, + compression_codec: Option, write_options: &IpcWriteOptions, ) -> Result { let mut offset = offset; @@ -1234,7 +1234,7 @@ fn write_buffer( buffers: &mut Vec, // output buffer descriptors arrow_data: &mut Vec, // output stream offset: i64, // current output stream offset - compression_codec: &Option, + compression_codec: Option, ) -> Result { let len: i64 = match compression_codec { Some(compressor) => compressor.compress_to_vec(buffer, arrow_data)?, diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index da4b56237e14..e89ddaffe833 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1567,7 +1567,8 @@ mod tests { let expected_data = match opts.row_selections { Some((selections, row_count)) => { - let mut without_skip_data = gen_expected_data::(&def_levels, &values); + let mut without_skip_data = + gen_expected_data::(def_levels.as_ref(), &values); let mut skip_data: Vec> = vec![]; let dequeue: VecDeque = selections.clone().into(); @@ -1585,7 +1586,7 @@ mod tests { } None => { //get flatten table data - 
let expected_data = gen_expected_data::(&def_levels, &values); + let expected_data = gen_expected_data::(def_levels.as_ref(), &values); assert_eq!(expected_data.len(), opts.num_rows * opts.num_row_groups); expected_data } @@ -1654,7 +1655,7 @@ mod tests { } fn gen_expected_data( - def_levels: &Option>>, + def_levels: Option<&Vec>>, values: &[Vec], ) -> Vec> { let data: Vec> = match def_levels { diff --git a/parquet/src/arrow/async_reader.rs b/parquet/src/arrow/async_reader.rs index e182cccbcea3..7602d54a5107 100644 --- a/parquet/src/arrow/async_reader.rs +++ b/parquet/src/arrow/async_reader.rs @@ -903,10 +903,8 @@ mod tests { // Check offset indexes are present for all columns for rg in metadata_with_index.row_groups() { - let page_locations = rg - .page_offset_index() - .as_ref() - .expect("expected page offset index"); + let page_locations = + rg.page_offset_index().expect("expected page offset index"); assert_eq!(page_locations.len(), rg.columns().len()) } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 40f8c99403f0..1010dc156a02 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -601,7 +601,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } /// Update the column index and offset index when adding the data page - fn update_column_offset_index(&mut self, page_statistics: &Option) { + fn update_column_offset_index(&mut self, page_statistics: Option<&Statistics>) { // update the column index let null_page = (self.page_metrics.num_buffered_rows as u64) == self.page_metrics.num_page_nulls; @@ -664,7 +664,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { }; // update column and offset index - self.update_column_offset_index(&page_statistics); + self.update_column_offset_index(page_statistics.as_ref()); let compressed_page = match self.props.writer_version() { WriterVersion::PARQUET_1_0 => { diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 2ba50fa31a1e..51a5264e3cf1 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -277,8 +277,8 @@ impl RowGroupMetaData { } /// Returns reference of page offset index of all column in this row group. - pub fn page_offset_index(&self) -> &Option>> { - &self.page_offset_index + pub fn page_offset_index(&self) -> Option<&Vec>> { + self.page_offset_index.as_ref() } /// Returns reference to a schema descriptor. diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index 5744de35e337..bf24950e99c2 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -81,7 +81,7 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { for (row_group, row_group_layout) in meta.row_groups().iter().zip(&layout.row_groups) { // Check against offset index - let offset_index = row_group.page_offset_index().as_ref().unwrap(); + let offset_index = row_group.page_offset_index().unwrap(); assert_eq!(offset_index.len(), row_group_layout.columns.len()); for (column_index, column_layout) in diff --git a/parquet_derive/src/parquet_field.rs b/parquet_derive/src/parquet_field.rs index 06bcc0aca924..48b6d3ac41b8 100644 --- a/parquet_derive/src/parquet_field.rs +++ b/parquet_derive/src/parquet_field.rs @@ -672,8 +672,8 @@ mod test { let struct_def: proc_macro2::TokenStream = quote! 
{ struct StringBorrower<'a> { optional_str: Option<&'a str>, - optional_string: &Option, - optional_dumb_int: &Option<&i32>, + optional_string: Option<&String>, + optional_dumb_int: Option<&i32>, } }; From 8f5fd9a123bfd790baa392f11f16807468229fff Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 1 Dec 2022 06:23:38 -0800 Subject: [PATCH 0362/1411] fix(object_store,aws,gcp): multipart upload enforce size limit of 5 MiB not 5MB (#3234) * fix: use better minimum part size * test: don't make the test larger than necessary * Further tweaks * Format Co-authored-by: Raphael Taylor-Davies --- object_store/CONTRIBUTING.md | 6 +++--- object_store/src/lib.rs | 3 ++- object_store/src/multipart.rs | 15 ++++++++++----- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/object_store/CONTRIBUTING.md b/object_store/CONTRIBUTING.md index e780ec5c9b09..4e6b3afe3859 100644 --- a/object_store/CONTRIBUTING.md +++ b/object_store/CONTRIBUTING.md @@ -46,9 +46,9 @@ Setup environment ``` export TEST_INTEGRATION=1 -export AWS_DEFAULT_REGION=us-east-1 -export AWS_ACCESS_KEY_ID=test -export AWS_SECRET_ACCESS_KEY=test +export OBJECT_STORE_AWS_DEFAULT_REGION=us-east-1 +export OBJECT_STORE_AWS_ACCESS_KEY_ID=test +export OBJECT_STORE_AWS_SECRET_ACCESS_KEY=test export AWS_ENDPOINT=http://128.0.0.1:4566 export OBJECT_STORE_BUCKET=test-bucket ``` diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 6278d827b0c7..a36bb5fb8de4 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -769,7 +769,8 @@ mod tests { assert_eq!(bytes_expected, bytes_written); // Can overwrite some storage - let data = get_vec_of_bytes(5_000, 5); + // Sizes carefully chosen to exactly hit min limit of 5 MiB + let data = get_vec_of_bytes(242_880, 22); let bytes_expected = data.concat(); let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); for chunk in &data { diff --git a/object_store/src/multipart.rs b/object_store/src/multipart.rs index 102d8bedaa46..de8591462500 100644 --- a/object_store/src/multipart.rs +++ b/object_store/src/multipart.rs @@ -81,7 +81,11 @@ where current_buffer: Vec::new(), // TODO: Should self vary by provider? // TODO: Should we automatically increase then when part index gets large? - min_part_size: 5_000_000, + + // Minimum size of 5 MiB + // https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html + // https://cloud.google.com/storage/quotas#requests + min_part_size: 5_242_880, current_part_idx: 0, completion_task: None, } @@ -113,13 +117,14 @@ where mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, buf: &[u8], - ) -> std::task::Poll> { + ) -> Poll> { // Poll current tasks self.as_mut().poll_tasks(cx)?; // If adding buf to pending buffer would trigger send, check // whether we have capacity for another task. - let enough_to_send = (buf.len() + self.current_buffer.len()) > self.min_part_size; + let enough_to_send = + (buf.len() + self.current_buffer.len()) >= self.min_part_size; if enough_to_send && self.tasks.len() < self.max_concurrency { // If we do, copy into the buffer and submit the task, and return ready. 
self.current_buffer.extend_from_slice(buf); @@ -149,7 +154,7 @@ where fn poll_flush( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { // Poll current tasks self.as_mut().poll_tasks(cx)?; @@ -177,7 +182,7 @@ where fn poll_shutdown( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { // First, poll flush match self.as_mut().poll_flush(cx) { Poll::Pending => return Poll::Pending, From 89a072e50950a857f046451a477bc61fc9b8c5de Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 1 Dec 2022 17:47:56 +0000 Subject: [PATCH 0363/1411] Better document implications of offsets (#3228) (#3243) --- arrow-schema/src/datatype.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 4162d41bf1b4..da1c20ddbd38 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -149,21 +149,37 @@ pub enum DataType { /// days can differ in length during day light savings time transitions). Interval(IntervalUnit), /// Opaque binary data of variable length. + /// + /// A single Binary array can store up to [`i32::MAX`] bytes + /// of binary data in total Binary, /// Opaque binary data of fixed size. /// Enum parameter specifies the number of bytes per value. FixedSizeBinary(i32), /// Opaque binary data of variable length and 64-bit offsets. + /// + /// A single LargeBinary array can store up to [`i64::MAX`] bytes + /// of binary data in total LargeBinary, - /// A variable-length string in Unicode with UTF-8 encoding. + /// A variable-length string in Unicode with UTF-8 encoding + /// + /// A single Utf8 array can store up to [`i32::MAX`] bytes + /// of string data in total Utf8, /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets. + /// + /// A single LargeUtf8 array can store up to [`i64::MAX`] bytes + /// of string data in total LargeUtf8, /// A list of some logical data type with variable length. + /// + /// A single List array can store up to [`i32::MAX`] elements in total List(Box), /// A list of some logical data type with fixed length. FixedSizeList(Box, i32), /// A list of some logical data type with variable length and 64-bit offsets. + /// + /// A single LargeList array can store up to [`i64::MAX`] elements in total LargeList(Box), /// A nested datatype that contains a number of sub-fields. 
Struct(Vec), From 26438feb7a59aa156563ed8c6e8b0e6579b2e028 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 1 Dec 2022 11:08:19 -0800 Subject: [PATCH 0364/1411] Casting from decimal256 to unsigned numeric (#3240) --- arrow-cast/src/cast.rs | 56 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 649d5ca90007..be767f137cd8 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -84,6 +84,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal256(_, _)) | // decimal to unsigned numeric (Decimal128(_, _), UInt8 | UInt16 | UInt32 | UInt64) | + (Decimal256(_, _), UInt8 | UInt16 | UInt32 | UInt64) | // decimal to signed numeric (Decimal128(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) | (Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64) @@ -702,6 +703,30 @@ pub fn cast_with_options( (Decimal256(_, scale), _) => { // cast decimal to other type match to_type { + UInt8 => cast_decimal_to_integer::( + array, + i256::from_i128(10_i128), + *scale, + cast_options, + ), + UInt16 => cast_decimal_to_integer::( + array, + i256::from_i128(10_i128), + *scale, + cast_options, + ), + UInt32 => cast_decimal_to_integer::( + array, + i256::from_i128(10_i128), + *scale, + cast_options, + ), + UInt64 => cast_decimal_to_integer::( + array, + i256::from_i128(10_i128), + *scale, + cast_options, + ), Int8 => cast_decimal_to_integer::( array, i256::from_i128(10_i128), @@ -4071,9 +4096,6 @@ mod tests { #[test] fn test_cast_decimal256_to_numeric() { - let decimal_type = DataType::Decimal256(38, 2); - // negative test - assert!(!can_cast_types(&decimal_type, &DataType::UInt8)); let value_array: Vec> = vec![ Some(i256::from_i128(125)), Some(i256::from_i128(225)), @@ -4083,6 +4105,34 @@ mod tests { ]; let decimal_array = create_decimal256_array(value_array, 38, 2).unwrap(); let array = Arc::new(decimal_array) as ArrayRef; + // u8 + generate_cast_test_case!( + &array, + UInt8Array, + &DataType::UInt8, + vec![Some(1_u8), Some(2_u8), Some(3_u8), None, Some(5_u8)] + ); + // u16 + generate_cast_test_case!( + &array, + UInt16Array, + &DataType::UInt16, + vec![Some(1_u16), Some(2_u16), Some(3_u16), None, Some(5_u16)] + ); + // u32 + generate_cast_test_case!( + &array, + UInt32Array, + &DataType::UInt32, + vec![Some(1_u32), Some(2_u32), Some(3_u32), None, Some(5_u32)] + ); + // u64 + generate_cast_test_case!( + &array, + UInt64Array, + &DataType::UInt64, + vec![Some(1_u64), Some(2_u64), Some(3_u64), None, Some(5_u64)] + ); // i8 generate_cast_test_case!( &array, From 95cbca64e1dc30360304a1522f07c58dc661ef6b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Dec 2022 09:49:02 +0000 Subject: [PATCH 0365/1411] Add ObjectStore ClientConfig (#3252) * Add ObjectStore ClientConfig * Fix default allow HTTP for GCP * Fix tests * Tweak error message --- object_store/src/aws/client.rs | 22 ++--------- object_store/src/aws/mod.rs | 65 +++++++++++++++---------------- object_store/src/azure/client.rs | 26 ++++--------- object_store/src/azure/mod.rs | 32 +++++++++------- object_store/src/client/mod.rs | 50 ++++++++++++++++++++++++ object_store/src/gcp/mod.rs | 66 ++++++++++++++++---------------- object_store/src/lib.rs | 3 ++ 7 files changed, 147 insertions(+), 117 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs 
index e51fe415cd14..ccc0a9c6bbc0 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -23,7 +23,8 @@ use crate::multipart::UploadPart; use crate::path::DELIMITER; use crate::util::{format_http_range, format_prefix}; use crate::{ - BoxStream, ListResult, MultipartId, ObjectMeta, Path, Result, RetryConfig, StreamExt, + BoxStream, ClientOptions, ListResult, MultipartId, ObjectMeta, Path, Result, + RetryConfig, StreamExt, }; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; @@ -88,9 +89,6 @@ pub(crate) enum Error { #[snafu(display("Got invalid multipart response: {}", source))] InvalidMultipartResponse { source: quick_xml::de::DeError }, - - #[snafu(display("Unable to use proxy url: {}", source))] - ProxyUrl { source: reqwest::Error }, } impl From for crate::Error { @@ -203,8 +201,7 @@ pub struct S3Config { pub bucket_endpoint: String, pub credentials: Box, pub retry_config: RetryConfig, - pub allow_http: bool, - pub proxy_url: Option, + pub client_options: ClientOptions, } impl S3Config { @@ -221,18 +218,7 @@ pub(crate) struct S3Client { impl S3Client { pub fn new(config: S3Config) -> Result { - let builder = reqwest::ClientBuilder::new().https_only(!config.allow_http); - let client = match &config.proxy_url { - Some(ref url) => { - let pr = reqwest::Proxy::all(url) - .map_err(|source| Error::ProxyUrl { source })?; - builder.proxy(pr) - } - _ => builder, - } - .build() - .unwrap(); - + let client = config.client_options.client()?; Ok(Self { config, client }) } diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index cf7a5542e0c5..c92b8c29a1ff 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -36,7 +36,6 @@ use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::stream::BoxStream; use futures::TryStreamExt; -use reqwest::{Client, Proxy}; use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; use std::ops::Range; @@ -51,8 +50,8 @@ use crate::aws::credential::{ }; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; use crate::{ - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, - RetryConfig, StreamExt, + ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, + Result, RetryConfig, StreamExt, }; mod client; @@ -120,9 +119,6 @@ enum Error { #[snafu(display("Error reading token file: {}", source))] ReadTokenFile { source: std::io::Error }, - - #[snafu(display("Unable to use proxy url: {}", source))] - ProxyUrl { source: reqwest::Error }, } impl From for super::Error { @@ -361,12 +357,11 @@ pub struct AmazonS3Builder { endpoint: Option, token: Option, retry_config: RetryConfig, - allow_http: bool, imdsv1_fallback: bool, virtual_hosted_style_request: bool, metadata_endpoint: Option, profile: Option, - proxy_url: Option, + client_options: ClientOptions, } impl AmazonS3Builder { @@ -431,7 +426,8 @@ impl AmazonS3Builder { } if let Ok(text) = std::env::var("AWS_ALLOW_HTTP") { - builder.allow_http = text == "true"; + builder.client_options = + builder.client_options.with_allow_http(text == "true"); } builder @@ -487,7 +483,7 @@ impl AmazonS3Builder { /// * false (default): Only HTTPS are allowed /// * true: HTTP and HTTPS are allowed pub fn with_allow_http(mut self, allow_http: bool) -> Self { - self.allow_http = allow_http; + self.client_options = self.client_options.with_allow_http(allow_http); self } @@ -543,7 +539,13 @@ impl AmazonS3Builder { /// Set the proxy_url to be used by the underlying client pub fn 
with_proxy_url(mut self, proxy_url: impl Into) -> Self { - self.proxy_url = Some(proxy_url.into()); + self.client_options = self.client_options.with_proxy_url(proxy_url); + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; self } @@ -571,14 +573,6 @@ impl AmazonS3Builder { let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; let region = self.region.context(MissingRegionSnafu)?; - let clientbuilder = match self.proxy_url { - Some(ref url) => { - let pr: Proxy = - Proxy::all(url).map_err(|source| Error::ProxyUrl { source })?; - Client::builder().proxy(pr) - } - None => Client::builder(), - }; let credentials = match (self.access_key_id, self.secret_access_key, self.token) { (Some(key_id), Some(secret_key), token) => { info!("Using Static credential provider"); @@ -608,7 +602,11 @@ impl AmazonS3Builder { let endpoint = format!("https://sts.{}.amazonaws.com", region); // Disallow non-HTTPs requests - let client = clientbuilder.https_only(true).build().unwrap(); + let client = self + .client_options + .clone() + .with_allow_http(false) + .client()?; Box::new(WebIdentityProvider { cache: Default::default(), @@ -629,11 +627,12 @@ impl AmazonS3Builder { info!("Using Instance credential provider"); // The instance metadata endpoint is access over HTTP - let client = clientbuilder.https_only(false).build().unwrap(); + let client_options = + self.client_options.clone().with_allow_http(true); Box::new(InstanceCredentialProvider { cache: Default::default(), - client, + client: client_options.client()?, retry_config: self.retry_config.clone(), imdsv1_fallback: self.imdsv1_fallback, metadata_endpoint: self @@ -670,11 +669,10 @@ impl AmazonS3Builder { bucket_endpoint, credentials, retry_config: self.retry_config, - allow_http: self.allow_http, - proxy_url: self.proxy_url, + client_options: self.client_options, }; - let client = Arc::new(S3Client::new(config).unwrap()); + let client = Arc::new(S3Client::new(config)?); Ok(AmazonS3 { client }) } @@ -931,21 +929,20 @@ mod tests { assert!(s3.is_ok()); - let s3 = AmazonS3Builder::new() + let err = AmazonS3Builder::new() .with_access_key_id("access_key_id") .with_secret_access_key("secret_access_key") .with_region("region") .with_bucket_name("bucket_name") .with_allow_http(true) .with_proxy_url("asdf://example.com") - .build(); + .build() + .unwrap_err() + .to_string(); - assert!(match s3 { - Err(crate::Error::Generic { source, .. }) => matches!( - source.downcast_ref(), - Some(crate::aws::Error::ProxyUrl { .. 
}) - ), - _ => false, - }) + assert_eq!( + "Generic HTTP client error: builder error: unknown proxy scheme", + err + ); } } diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index d8cfdd1c759e..b537f5edf679 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -21,13 +21,16 @@ use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::path::DELIMITER; use crate::util::{format_http_range, format_prefix}; -use crate::{BoxStream, ListResult, ObjectMeta, Path, Result, RetryConfig, StreamExt}; +use crate::{ + BoxStream, ClientOptions, ListResult, ObjectMeta, Path, Result, RetryConfig, + StreamExt, +}; use bytes::{Buf, Bytes}; use chrono::{DateTime, TimeZone, Utc}; use itertools::Itertools; use reqwest::{ header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH, RANGE}, - Client as ReqwestClient, Method, Proxy, Response, StatusCode, + Client as ReqwestClient, Method, Response, StatusCode, }; use serde::{Deserialize, Deserializer, Serialize}; use snafu::{ResultExt, Snafu}; @@ -82,9 +85,6 @@ pub(crate) enum Error { Authorization { source: crate::azure::credential::Error, }, - - #[snafu(display("Unable to use proxy url: {}", source))] - ProxyUrl { source: reqwest::Error }, } impl From for crate::Error { @@ -124,10 +124,9 @@ pub struct AzureConfig { pub container: String, pub credentials: CredentialProvider, pub retry_config: RetryConfig, - pub allow_http: bool, pub service: Url, pub is_emulator: bool, - pub proxy_url: Option, + pub client_options: ClientOptions, } impl AzureConfig { @@ -153,18 +152,7 @@ pub(crate) struct AzureClient { impl AzureClient { /// create a new instance of [AzureClient] pub fn new(config: AzureConfig) -> Result { - let builder = ReqwestClient::builder(); - - let client = if let Some(url) = config.proxy_url.as_ref() { - let pr = Proxy::all(url).map_err(|source| Error::ProxyUrl { source }); - builder.proxy(pr.unwrap()) - } else { - builder - } - .https_only(!config.allow_http) - .build() - .unwrap(); - + let client = config.client_options.client()?; Ok(Self { config, client }) } diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 060b4b2d25dd..4b7131ea85be 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -30,7 +30,8 @@ use self::client::{BlockId, BlockList}; use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::Path, - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, + ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + RetryConfig, }; use async_trait::async_trait; use bytes::Bytes; @@ -359,8 +360,7 @@ pub struct MicrosoftAzureBuilder { authority_host: Option, use_emulator: bool, retry_config: RetryConfig, - allow_http: bool, - proxy_url: Option, + client_options: ClientOptions, } impl Debug for MicrosoftAzureBuilder { @@ -480,10 +480,10 @@ impl MicrosoftAzureBuilder { } /// Sets what protocol is allowed. 
If `allow_http` is : - /// * false (default): Only HTTPS is allowed + /// * false (default): Only HTTPS are allowed /// * true: HTTP and HTTPS are allowed pub fn with_allow_http(mut self, allow_http: bool) -> Self { - self.allow_http = allow_http; + self.client_options = self.client_options.with_allow_http(allow_http); self } @@ -503,7 +503,13 @@ impl MicrosoftAzureBuilder { /// Set the proxy_url to be used by the underlying client pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { - self.proxy_url = Some(proxy_url.into()); + self.client_options = self.client_options.with_proxy_url(proxy_url); + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; self } @@ -521,14 +527,13 @@ impl MicrosoftAzureBuilder { sas_query_pairs, use_emulator, retry_config, - allow_http, authority_host, - proxy_url, + mut client_options, } = self; let container = container_name.ok_or(Error::MissingContainerName {})?; - let (is_emulator, allow_http, storage_url, auth, account) = if use_emulator { + let (is_emulator, storage_url, auth, account) = if use_emulator { let account_name = account_name.unwrap_or_else(|| EMULATOR_ACCOUNT.to_string()); // Allow overriding defaults. Values taken from @@ -537,7 +542,9 @@ impl MicrosoftAzureBuilder { let account_key = access_key.unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); let credential = credential::CredentialProvider::AccessKey(account_key); - (true, true, url, credential, account_name) + + client_options = client_options.with_allow_http(true); + (true, url, credential, account_name) } else { let account_name = account_name.ok_or(Error::MissingAccount {})?; let account_url = format!("https://{}.blob.core.windows.net", &account_name); @@ -564,18 +571,17 @@ impl MicrosoftAzureBuilder { } else { Err(Error::MissingCredentials {}) }?; - (false, allow_http, url, credential, account_name) + (false, url, credential, account_name) }; let config = client::AzureConfig { account, - allow_http, retry_config, service: storage_url, container, credentials: auth, is_emulator, - proxy_url, + client_options, }; let client = Arc::new(client::AzureClient::new(config)?); diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index c93c68a1faa4..2b58a77f2ce3 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -23,3 +23,53 @@ pub mod mock_server; pub mod pagination; pub mod retry; pub mod token; + +use reqwest::{Client, ClientBuilder, Proxy}; + +fn map_client_error(e: reqwest::Error) -> super::Error { + super::Error::Generic { + store: "HTTP client", + source: Box::new(e), + } +} + +/// HTTP client configuration for remote object stores +#[derive(Debug, Clone, Default)] +pub struct ClientOptions { + proxy_url: Option, + allow_http: bool, +} + +impl ClientOptions { + /// Create a new [`ClientOptions`] with default values + pub fn new() -> Self { + Default::default() + } + + /// Sets what protocol is allowed. 
If `allow_http` is : + /// * false (default): Only HTTPS are allowed + /// * true: HTTP and HTTPS are allowed + pub fn with_allow_http(mut self, allow_http: bool) -> Self { + self.allow_http = allow_http; + self + } + + /// Set an HTTP proxy to use for requests + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.proxy_url = Some(proxy_url.into()); + self + } + + pub(crate) fn client(&self) -> super::Result { + let mut builder = ClientBuilder::new(); + if let Some(proxy) = &self.proxy_url { + let proxy = Proxy::all(proxy).map_err(map_client_error)?; + builder = builder.proxy(proxy); + } + + builder + .https_only(!self.allow_http) + .build() + .map_err(map_client_error) + } +} diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 0da92fdbe3d1..41d6696c1536 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -41,7 +41,6 @@ use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; use reqwest::header::RANGE; -use reqwest::Proxy; use reqwest::{header, Client, Method, Response, StatusCode}; use snafu::{ResultExt, Snafu}; use tokio::io::AsyncWrite; @@ -53,7 +52,8 @@ use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::{Path, DELIMITER}, util::{format_http_range, format_prefix}, - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, + ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + RetryConfig, }; use credential::OAuthProvider; @@ -123,9 +123,6 @@ enum Error { #[snafu(display("GCP credential error: {}", source))] Credential { source: credential::Error }, - - #[snafu(display("Unable to use proxy url: {}", source))] - ProxyUrl { source: reqwest::Error }, } impl From for super::Error { @@ -739,13 +736,23 @@ fn reader_credentials_file( /// .with_bucket_name(BUCKET_NAME) /// .build(); /// ``` -#[derive(Debug, Default)] +#[derive(Debug)] pub struct GoogleCloudStorageBuilder { bucket_name: Option, service_account_path: Option, - client: Option, retry_config: RetryConfig, - proxy_url: Option, + client_options: ClientOptions, +} + +impl Default for GoogleCloudStorageBuilder { + fn default() -> Self { + Self { + bucket_name: None, + service_account_path: None, + retry_config: Default::default(), + client_options: ClientOptions::new().with_allow_http(true), + } + } } impl GoogleCloudStorageBuilder { @@ -787,9 +794,15 @@ impl GoogleCloudStorageBuilder { self } - /// Set proxy url used for connection + /// Set the proxy_url to be used by the underlying client pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { - self.proxy_url = Some(proxy_url.into()); + self.client_options = self.client_options.with_proxy_url(proxy_url); + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; self } @@ -799,27 +812,15 @@ impl GoogleCloudStorageBuilder { let Self { bucket_name, service_account_path, - client, retry_config, - proxy_url, + client_options, } = self; let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?; let service_account_path = service_account_path.ok_or(Error::MissingServiceAccountPath)?; - let client = match (proxy_url, client) { - (_, Some(client)) => client, - (Some(url), None) => { - let pr = Proxy::all(&url).map_err(|source| Error::ProxyUrl { source })?; - Client::builder() - .proxy(pr) - .build() - 
.map_err(|source| Error::ProxyUrl { source })? - } - (None, None) => Client::new(), - }; - + let client = client_options.client()?; let credentials = reader_credentials_file(service_account_path)?; // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes @@ -1054,18 +1055,17 @@ mod test { .build(); assert!(dbg!(gcs).is_ok()); - let gcs = GoogleCloudStorageBuilder::new() + let err = GoogleCloudStorageBuilder::new() .with_service_account_path(service_account_path.to_str().unwrap()) .with_bucket_name("foo") .with_proxy_url("asdf://example.com") - .build(); + .build() + .unwrap_err() + .to_string(); - assert!(match gcs { - Err(ObjectStoreError::Generic { source, .. }) => matches!( - source.downcast_ref(), - Some(crate::gcp::Error::ProxyUrl { .. }) - ), - _ => false, - }) + assert_eq!( + "Generic HTTP client error: builder error: unknown proxy scheme", + err + ); } } diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index a36bb5fb8de4..ec41f381228b 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -197,6 +197,9 @@ use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; use tokio::io::AsyncWrite; +#[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] +pub use client::ClientOptions; + /// An alias for a dynamically dispatched object store implementation. pub type DynObjectStore = dyn ObjectStore; From de3828cd71a17076147b07a796e4b97bc669648d Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 2 Dec 2022 02:28:28 -0800 Subject: [PATCH 0366/1411] fix(object_store,gcp): test copy_if_not_exist (#3236) * fix(object_store,gcp): test copy_if_not_exist * doc: update GCS testing instructions * test: move copy test into non-local branch * Revert CONTENT_LENGTH change Co-authored-by: Raphael Taylor-Davies --- object_store/CONTRIBUTING.md | 6 +++--- object_store/src/gcp/mod.rs | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/object_store/CONTRIBUTING.md b/object_store/CONTRIBUTING.md index 4e6b3afe3859..efcd5fe343db 100644 --- a/object_store/CONTRIBUTING.md +++ b/object_store/CONTRIBUTING.md @@ -95,13 +95,13 @@ To test the GCS integration, we use [Fake GCS Server](https://github.com/fsouza/ Startup the fake server: ```shell -docker run -p 4443:4443 fsouza/fake-gcs-server +docker run -p 4443:4443 fsouza/fake-gcs-server -scheme http ``` Configure the account: ```shell -curl --insecure -v -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "https://localhost:4443/storage/v1/b" -echo '{"gcs_base_url": "https://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > /tmp/gcs.json +curl -v -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" +echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > /tmp/gcs.json ``` Now run the tests: diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 41d6696c1536..f93cbde3d1b4 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -123,6 +123,12 @@ enum Error { #[snafu(display("GCP credential error: {}", source))] Credential { source: credential::Error }, + + #[snafu(display("Already exists: {}", path))] + AlreadyExists { + source: crate::client::retry::Error, + path: String, + }, } impl From for super::Error { @@ -138,6 +144,10 @@ impl From for super::Error { source: Box::new(source), } } + Error::AlreadyExists { source, path } => 
Self::AlreadyExists { + source: Box::new(source), + path, + }, _ => Self::Generic { store: "GCS", source: Box::new(err), @@ -419,8 +429,22 @@ impl GoogleCloudStorageClient { .bearer_auth(token) .send_retry(&self.retry_config) .await - .context(CopyRequestSnafu { - path: from.as_ref(), + .map_err(|err| { + if err + .status() + .map(|status| status == reqwest::StatusCode::PRECONDITION_FAILED) + .unwrap_or_else(|| false) + { + Error::AlreadyExists { + source: err, + path: to.to_string(), + } + } else { + Error::CopyRequest { + source: err, + path: from.to_string(), + } + } })?; Ok(()) @@ -880,8 +904,8 @@ mod test { use crate::{ tests::{ - get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, stream_get, + copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, + list_with_delimiter, put_get_delete_list, rename_and_copy, stream_get, }, Error as ObjectStoreError, ObjectStore, }; @@ -946,6 +970,9 @@ mod test { list_with_delimiter(&integration).await; rename_and_copy(&integration).await; if integration.client.base_url == default_gcs_base_url() { + // Fake GCS server doesn't currently honor ifGenerationMatch + // https://github.com/fsouza/fake-gcs-server/issues/994 + copy_if_not_exists(&integration).await; // Fake GCS server does not yet implement XML Multipart uploads // https://github.com/fsouza/fake-gcs-server/issues/852 stream_get(&integration).await; From 9833288520c2e9ad353442170e3d2a8f27c6672d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Dec 2022 11:59:43 +0000 Subject: [PATCH 0367/1411] Support List and LargeList in Row format (#3159) (#3251) * Support List and LargeList in Row format (#3159) * Clippy * Update arrow/src/row/mod.rs Co-authored-by: Marco Neumann * Update arrow/src/row/list.rs Co-authored-by: Andrew Lamb * More tests * Tweak docs Co-authored-by: Marco Neumann Co-authored-by: Andrew Lamb --- arrow/src/row/list.rs | 178 +++++++++++++++++++ arrow/src/row/mod.rs | 350 +++++++++++++++++++++++++++++++++++++- arrow/src/row/variable.rs | 112 ++++++------ 3 files changed, 584 insertions(+), 56 deletions(-) create mode 100644 arrow/src/row/list.rs diff --git a/arrow/src/row/list.rs b/arrow/src/row/list.rs new file mode 100644 index 000000000000..e5ea5c2a04c4 --- /dev/null +++ b/arrow/src/row/list.rs @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::compute::SortOptions; +use crate::row::{RowConverter, Rows, SortField}; +use arrow_array::builder::BufferBuilder; +use arrow_array::{Array, GenericListArray, OffsetSizeTrait}; +use arrow_data::ArrayDataBuilder; +use arrow_schema::ArrowError; +use std::ops::Range; + +pub fn compute_lengths( + lengths: &mut [usize], + rows: &Rows, + array: &GenericListArray, +) { + let offsets = array.value_offsets().windows(2); + lengths + .iter_mut() + .zip(offsets) + .enumerate() + .for_each(|(idx, (length, offsets))| { + let start = offsets[0].as_usize(); + let end = offsets[1].as_usize(); + let range = array.is_valid(idx).then_some(start..end); + *length += encoded_len(rows, range); + }); +} + +fn encoded_len(rows: &Rows, range: Option>) -> usize { + match range { + None => 1, + Some(range) if range.start == range.end => 1, + Some(range) => { + let element_count = range.end - range.start; + let row_bytes = range.map(|i| rows.row(i).as_ref().len()).sum::(); + let total = (1 + element_count) * std::mem::size_of::() + row_bytes; + super::variable::padded_length(Some(total)) + } + } +} + +/// Encodes the provided `GenericListArray` to `out` with the provided `SortOptions` +/// +/// `rows` should contain the encoded child elements +pub fn encode( + out: &mut Rows, + rows: &Rows, + opts: SortOptions, + array: &GenericListArray, +) { + let mut temporary = vec![]; + let offsets = array.value_offsets().windows(2); + out.offsets + .iter_mut() + .skip(1) + .zip(offsets) + .enumerate() + .for_each(|(idx, (offset, offsets))| { + let start = offsets[0].as_usize(); + let end = offsets[1].as_usize(); + let range = array.is_valid(idx).then_some(start..end); + let out = &mut out.buffer[*offset..]; + *offset += encode_one(out, &mut temporary, rows, range, opts) + }); +} + +#[inline] +fn encode_one( + out: &mut [u8], + temporary: &mut Vec, + rows: &Rows, + range: Option>, + opts: SortOptions, +) -> usize { + temporary.clear(); + + match range { + None => super::variable::encode_one(out, None, opts), + Some(range) if range.start == range.end => { + super::variable::encode_one(out, Some(&[]), opts) + } + Some(range) => { + for row in range.clone().map(|i| rows.row(i)) { + temporary.extend_from_slice(row.as_ref()); + } + for row in range.clone().map(|i| rows.row(i)) { + let len: u32 = row + .as_ref() + .len() + .try_into() + .expect("ListArray or LargeListArray containing a list of more than u32::MAX items is not supported"); + temporary.extend_from_slice(&len.to_be_bytes()); + } + let row_count: u32 = (range.end - range.start) + .try_into() + .expect("lists containing more than u32::MAX elements not supported"); + temporary.extend_from_slice(&row_count.to_be_bytes()); + super::variable::encode_one(out, Some(temporary), opts) + } + } +} + +/// Decodes a string array from `rows` with the provided `options` +/// +/// # Safety +/// +/// `rows` must contain valid data for the provided `converter` +pub unsafe fn decode( + converter: &RowConverter, + rows: &mut [&[u8]], + field: &SortField, + validate_utf8: bool, +) -> Result, ArrowError> { + let canonical = super::variable::decode_binary::(rows, field.options); + + let mut offsets = BufferBuilder::::new(rows.len() + 1); + offsets.append(O::from_usize(0).unwrap()); + let mut current_offset = 0; + + let mut child_rows = Vec::with_capacity(rows.len()); + canonical.value_offsets().windows(2).for_each(|w| { + let start = w[0] as usize; + let end = w[1] as usize; + if start == end { + // Null or empty list + offsets.append(O::from_usize(current_offset).unwrap()); + return; 
+ } + + let row = &canonical.value_data()[start..end]; + let element_count_start = row.len() - 4; + let element_count = + u32::from_be_bytes((&row[element_count_start..]).try_into().unwrap()) + as usize; + + let lengths_start = element_count_start - (element_count * 4); + let mut row_offset = 0; + row[lengths_start..element_count_start] + .chunks_exact(4) + .for_each(|chunk| { + let len = u32::from_be_bytes(chunk.try_into().unwrap()); + let next_row_offset = row_offset + len as usize; + child_rows.push(&row[row_offset..next_row_offset]); + row_offset = next_row_offset; + }); + + current_offset += element_count; + offsets.append(O::from_usize(current_offset).unwrap()); + }); + + let child = converter.convert_raw(&mut child_rows, validate_utf8)?; + assert_eq!(child.len(), 1); + let child_data = child[0].data().clone(); + + let builder = ArrayDataBuilder::new(field.data_type.clone()) + .len(rows.len()) + .null_count(canonical.null_count()) + .null_bit_buffer(canonical.data().null_buffer().cloned()) + .add_buffer(offsets.finish()) + .add_child_data(child_data); + + Ok(GenericListArray::from(unsafe { builder.build_unchecked() })) +} diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index cff49740fb15..abb8039cc398 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -147,6 +147,7 @@ use crate::{downcast_dictionary_array, downcast_primitive_array}; mod dictionary; mod fixed; mod interner; +mod list; mod variable; /// Converts [`ArrayRef`] columns into a [row-oriented](self) format. @@ -343,6 +344,56 @@ mod variable; /// └───────┴───────────────┴───────┴─────────┴───────┘ /// ``` /// +/// ## List Encoding +/// +/// Lists are encoded by first encoding all child elements to the row format. +/// +/// A "canonical byte array" is then constructed by concatenating the row +/// encodings of all their elements into a single binary array, followed +/// by the lengths of each encoded row, and the number of elements, encoded +/// as big endian `u32`. +/// +/// This canonical byte array is then encoded using the variable length byte +/// encoding described above. +/// +/// _The lengths are not strictly necessary but greatly simplify decode, they +/// may be removed in a future iteration_. +/// +/// For example given: +/// +/// ```text +/// [1_u8, 2_u8, 3_u8] +/// [1_u8, null] +/// [] +/// null +/// ``` +/// +/// The elements would be converted to: +/// +/// ```text +/// ┌──┬──┐ ┌──┬──┐ ┌──┬──┐ ┌──┬──┐ ┌──┬──┐ +/// 1 │01│01│ 2 │01│02│ 3 │01│03│ 1 │01│01│ null │00│00│ +/// └──┴──┘ └──┴──┘ └──┴──┘ └──┴──┘ └──┴──┘ +///``` +/// +/// Which would be grouped into the following canonical byte arrays: +/// +/// ```text +/// ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐ +/// [1_u8, 2_u8, 3_u8] │01│01│01│02│01│03│00│00│00│02│00│00│00│02│00│00│00│02│00│00│00│03│ +/// └──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┘ +/// └──── rows ────┘ └───────── row lengths ─────────┘ └─ count ─┘ +/// +/// ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐ +/// [1_u8, null] │01│01│00│00│00│00│00│02│00│00│00│02│00│00│00│02│ +/// └──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┘ +///``` +/// +/// With `[]` represented by an empty byte array, and `null` a null byte array. +/// +/// These byte arrays will then be encoded using the variable length byte encoding +/// described above. 
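For context on the list encoding described above, here is a minimal illustrative sketch of driving it through the public API, mirroring the tests added later in this commit (the `arrow_array::builder` and `arrow::row` import paths are assumptions about the crate layout at this point in the series):

```rust
use std::sync::Arc;

use arrow::row::{RowConverter, SortField};
use arrow_array::builder::{GenericListBuilder, Int32Builder};
use arrow_array::{Array, ArrayRef};

fn main() {
    // Build a ListArray containing [[1, 2], [1], null]
    let mut builder = GenericListBuilder::<i32, _>::new(Int32Builder::new());
    builder.values().append_value(1);
    builder.values().append_value(2);
    builder.append(true);
    builder.values().append_value(1);
    builder.append(true);
    builder.append(false); // null list

    let list = Arc::new(builder.finish()) as ArrayRef;

    // Convert the column to the row format with default options
    // (ascending, nulls first)
    let mut converter =
        RowConverter::new(vec![SortField::new(list.data_type().clone())]).unwrap();
    let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();

    // Encoded rows compare byte-wise according to the list ordering
    assert!(rows.row(0) > rows.row(1)); // [1, 2] > [1]
    assert!(rows.row(2) < rows.row(1)); // null < [1]

    // The rows round-trip back to the original list array
    let back = converter.convert_rows(&rows).unwrap();
    assert_eq!(&back[0], &list);
}
```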
+/// /// # Ordering /// /// ## Float Ordering @@ -381,6 +432,8 @@ enum Codec { /// A row converter for the child fields /// and the encoding of a row containing only nulls Struct(RowConverter, OwnedRow), + /// A row converter for the child field + List(RowConverter), } impl Codec { @@ -388,6 +441,20 @@ impl Codec { match &sort_field.data_type { DataType::Dictionary(_, _) => Ok(Self::Dictionary(Default::default())), d if !d.is_nested() => Ok(Self::Stateless), + DataType::List(f) | DataType::LargeList(f) => { + // The encoded contents will be inverted if descending is set to true + // As such we set `descending` to false and negate nulls first if it + // it set to true + let options = SortOptions { + descending: false, + nulls_first: sort_field.options.nulls_first + != sort_field.options.descending, + }; + + let field = SortField::new_with_options(f.data_type().clone(), options); + let converter = RowConverter::new(vec![field])?; + Ok(Self::List(converter)) + } DataType::Struct(f) => { let sort_fields = f .iter() @@ -441,6 +508,15 @@ impl Codec { let rows = converter.convert_columns(v.columns())?; Ok(Encoder::Struct(rows, null.row())) } + Codec::List(converter) => { + let values = match array.data_type() { + DataType::List(_) => as_list_array(array).values(), + DataType::LargeList(_) => as_large_list_array(array).values(), + _ => unreachable!(), + }; + let rows = converter.convert_columns(&[values])?; + Ok(Encoder::List(rows)) + } } } @@ -449,6 +525,7 @@ impl Codec { Codec::Stateless => 0, Codec::Dictionary(interner) => interner.size(), Codec::Struct(converter, nulls) => converter.size() + nulls.data.len(), + Codec::List(converter) => converter.size(), } } } @@ -459,12 +536,14 @@ enum Encoder<'a> { Stateless, /// The mapping from dictionary keys to normalized keys Dictionary(Vec>), - /// The row encoding of the child array and the encoding of a null row + /// The row encoding of the child arrays and the encoding of a null row /// /// It is necessary to encode to a temporary [`Rows`] to avoid serializing /// values that are masked by a null in the parent StructArray, otherwise /// this would establish an ordering between semantically null values Struct(Rows, Row<'a>), + /// The row encoding of the child array + List(Rows), } /// Configure the data type and sort order for a given column @@ -521,6 +600,9 @@ impl RowConverter { fn supports_datatype(d: &DataType) -> bool { match d { _ if !d.is_nested() => true, + DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => { + Self::supports_datatype(f.data_type()) + } DataType::Struct(f) => { f.iter().all(|x| Self::supports_datatype(x.data_type())) } @@ -571,7 +653,7 @@ impl RowConverter { columns.iter().zip(self.fields.iter()).zip(encoders) { // We encode a column at a time to minimise dispatch overheads - encode_column(&mut rows, column, field.options, &encoder) + encode_column(&mut rows, column.as_ref(), field.options, &encoder) } if cfg!(debug_assertions) { @@ -975,6 +1057,15 @@ fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig) -> } }); } + Encoder::List(rows) => match array.data_type() { + DataType::List(_) => { + list::compute_lengths(&mut lengths, rows, as_list_array(array)) + } + DataType::LargeList(_) => { + list::compute_lengths(&mut lengths, rows, as_large_list_array(array)) + } + _ => unreachable!(), + }, } } @@ -1014,7 +1105,7 @@ fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig) -> /// Encodes a column to the provided [`Rows`] incrementing the offsets as it progresses fn 
encode_column( out: &mut Rows, - column: &ArrayRef, + column: &dyn Array, opts: SortOptions, encoder: &Encoder<'_>, ) { @@ -1056,7 +1147,7 @@ fn encode_column( } } Encoder::Struct(rows, null) => { - let array = as_struct_array(column.as_ref()); + let array = as_struct_array(column); let null_sentinel = null_sentinel(opts); out.offsets .iter_mut() @@ -1073,6 +1164,13 @@ fn encode_column( *offset = end_offset; }) } + Encoder::List(rows) => match column.data_type() { + DataType::List(_) => list::encode(out, rows, opts, as_list_array(column)), + DataType::LargeList(_) => { + list::encode(out, rows, opts, as_large_list_array(column)) + } + _ => unreachable!(), + }, } } @@ -1165,6 +1263,15 @@ unsafe fn decode_column( Arc::new(StructArray::from(builder.build_unchecked())) } + Codec::List(converter) => match &field.data_type { + DataType::List(_) => { + Arc::new(list::decode::(converter, rows, field, validate_utf8)?) + } + DataType::LargeList(_) => { + Arc::new(list::decode::(converter, rows, field, validate_utf8)?) + } + _ => unreachable!(), + }, }; Ok(array) } @@ -1173,7 +1280,9 @@ unsafe fn decode_column( mod tests { use std::sync::Arc; - use arrow_array::builder::FixedSizeBinaryBuilder; + use arrow_array::builder::{ + FixedSizeBinaryBuilder, GenericListBuilder, Int32Builder, + }; use rand::distributions::uniform::SampleUniform; use rand::distributions::{Distribution, Standard}; use rand::{thread_rng, Rng}; @@ -1542,6 +1651,24 @@ mod tests { let cols = converter.convert_rows(&rows_c).unwrap(); assert_eq!(&cols[0], &a); + + let mut converter = RowConverter::new(vec![SortField::new_with_options( + a.data_type().clone(), + SortOptions { + descending: true, + nulls_first: true, + }, + )]) + .unwrap(); + + let rows_c = converter.convert_columns(&[Arc::clone(&a)]).unwrap(); + assert!(rows_c.row(3) < rows_c.row(5)); + assert!(rows_c.row(2) > rows_c.row(1)); + assert!(rows_c.row(0) > rows_c.row(1)); + assert!(rows_c.row(3) < rows_c.row(0)); + + let cols = converter.convert_rows(&rows_c).unwrap(); + assert_eq!(&cols[0], &a); } #[test] @@ -1671,6 +1798,219 @@ mod tests { let _ = converter.convert_rows(&rows); } + fn test_single_list() { + let mut builder = GenericListBuilder::::new(Int32Builder::new()); + builder.values().append_value(32); + builder.values().append_value(52); + builder.values().append_value(32); + builder.append(true); + builder.values().append_value(32); + builder.values().append_value(52); + builder.values().append_value(12); + builder.append(true); + builder.values().append_value(32); + builder.values().append_value(52); + builder.append(true); + builder.values().append_value(32); // MASKED + builder.values().append_value(52); // MASKED + builder.append(false); + builder.values().append_value(32); + builder.values().append_null(); + builder.append(true); + builder.append(true); + + let list = Arc::new(builder.finish()) as ArrayRef; + let d = list.data_type().clone(); + + let mut converter = RowConverter::new(vec![SortField::new(d.clone())]).unwrap(); + + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12] + assert!(rows.row(2) < rows.row(1)); // [32, 42] < [32, 52, 12] + assert!(rows.row(3) < rows.row(2)); // null < [32, 42] + assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 42] + assert!(rows.row(5) < rows.row(2)); // [] < [32, 42] + assert!(rows.row(3) < rows.row(5)); // null < [] + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + 
back[0].data().validate_full().unwrap(); + assert_eq!(&back[0], &list); + + let options = SortOptions { + descending: false, + nulls_first: false, + }; + let field = SortField::new_with_options(d.clone(), options); + let mut converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12] + assert!(rows.row(2) < rows.row(1)); // [32, 42] < [32, 52, 12] + assert!(rows.row(3) > rows.row(2)); // null > [32, 42] + assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 42] + assert!(rows.row(5) < rows.row(2)); // [] < [32, 42] + assert!(rows.row(3) > rows.row(5)); // null > [] + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].data().validate_full().unwrap(); + assert_eq!(&back[0], &list); + + let options = SortOptions { + descending: true, + nulls_first: false, + }; + let field = SortField::new_with_options(d.clone(), options); + let mut converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + assert!(rows.row(0) < rows.row(1)); // [32, 52, 32] < [32, 52, 12] + assert!(rows.row(2) > rows.row(1)); // [32, 42] > [32, 52, 12] + assert!(rows.row(3) > rows.row(2)); // null > [32, 42] + assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 42] + assert!(rows.row(5) > rows.row(2)); // [] > [32, 42] + assert!(rows.row(3) > rows.row(5)); // null > [] + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].data().validate_full().unwrap(); + assert_eq!(&back[0], &list); + + let options = SortOptions { + descending: true, + nulls_first: true, + }; + let field = SortField::new_with_options(d, options); + let mut converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + assert!(rows.row(0) < rows.row(1)); // [32, 52, 32] < [32, 52, 12] + assert!(rows.row(2) > rows.row(1)); // [32, 42] > [32, 52, 12] + assert!(rows.row(3) < rows.row(2)); // null < [32, 42] + assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 42] + assert!(rows.row(5) > rows.row(2)); // [] > [32, 42] + assert!(rows.row(3) < rows.row(5)); // null < [] + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].data().validate_full().unwrap(); + assert_eq!(&back[0], &list); + } + + fn test_nested_list() { + let mut builder = GenericListBuilder::::new( + GenericListBuilder::::new(Int32Builder::new()), + ); + + builder.values().values().append_value(1); + builder.values().values().append_value(2); + builder.values().append(true); + builder.values().values().append_value(1); + builder.values().values().append_null(); + builder.values().append(true); + builder.append(true); + + builder.values().values().append_value(1); + builder.values().values().append_null(); + builder.values().append(true); + builder.values().values().append_value(1); + builder.values().values().append_null(); + builder.values().append(true); + builder.append(true); + + builder.values().values().append_value(1); + builder.values().values().append_null(); + builder.values().append(true); + builder.values().append(false); + builder.append(true); + builder.append(false); + + builder.values().values().append_value(1); + builder.values().values().append_value(2); + builder.values().append(true); + builder.append(true); + + let list = Arc::new(builder.finish()) as ArrayRef; + let d = 
list.data_type().clone(); + + // [ + // [[1, 2], [1, null]], + // [[1, null], [1, null]], + // [[1, null], null] + // null + // [[1, 2]] + // ] + let options = SortOptions { + descending: false, + nulls_first: true, + }; + let field = SortField::new_with_options(d.clone(), options); + let mut converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + assert!(rows.row(0) > rows.row(1)); + assert!(rows.row(1) > rows.row(2)); + assert!(rows.row(2) > rows.row(3)); + assert!(rows.row(4) < rows.row(0)); + assert!(rows.row(4) > rows.row(1)); + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].data().validate_full().unwrap(); + assert_eq!(&back[0], &list); + + let options = SortOptions { + descending: true, + nulls_first: true, + }; + let field = SortField::new_with_options(d.clone(), options); + let mut converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + assert!(rows.row(0) > rows.row(1)); + assert!(rows.row(1) > rows.row(2)); + assert!(rows.row(2) > rows.row(3)); + assert!(rows.row(4) > rows.row(0)); + assert!(rows.row(4) > rows.row(1)); + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].data().validate_full().unwrap(); + assert_eq!(&back[0], &list); + + let options = SortOptions { + descending: true, + nulls_first: false, + }; + let field = SortField::new_with_options(d, options); + let mut converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); + + assert!(rows.row(0) < rows.row(1)); + assert!(rows.row(1) < rows.row(2)); + assert!(rows.row(2) < rows.row(3)); + assert!(rows.row(4) > rows.row(0)); + assert!(rows.row(4) < rows.row(1)); + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 1); + back[0].data().validate_full().unwrap(); + assert_eq!(&back[0], &list); + } + + #[test] + fn test_list() { + test_single_list::(); + test_nested_list::(); + } + + #[test] + fn test_large_list() { + test_single_list::(); + test_nested_list::(); + } + fn generate_primitive_array(len: usize, valid_percent: f64) -> PrimitiveArray where K: ArrowPrimitiveType, diff --git a/arrow/src/row/variable.rs b/arrow/src/row/variable.rs index 3aa0b4839435..9162f2312031 100644 --- a/arrow/src/row/variable.rs +++ b/arrow/src/row/variable.rs @@ -37,9 +37,16 @@ pub const EMPTY_SENTINEL: u8 = 1; pub const NON_EMPTY_SENTINEL: u8 = 2; /// Returns the length of the encoded representation of a byte array, including the null byte +#[inline] pub fn encoded_len(a: Option<&[u8]>) -> usize { + padded_length(a.map(|x| x.len())) +} + +/// Returns the padded length of the encoded length of the given length +#[inline] +pub fn padded_length(a: Option) -> usize { match a { - Some(a) => 1 + ceil(a.len(), BLOCK_SIZE) * (BLOCK_SIZE + 1), + Some(a) => 1 + ceil(a, BLOCK_SIZE) * (BLOCK_SIZE + 1), None => 1, } } @@ -61,59 +68,62 @@ pub fn encode<'a, I: Iterator>>( opts: SortOptions, ) { for (offset, maybe_val) in out.offsets.iter_mut().skip(1).zip(i) { - match maybe_val { - Some(val) if val.is_empty() => { - out.buffer[*offset] = match opts.descending { - true => !EMPTY_SENTINEL, - false => EMPTY_SENTINEL, - }; - *offset += 1; + *offset += encode_one(&mut out.buffer[*offset..], maybe_val, opts); + } +} + +pub fn encode_one(out: &mut [u8], val: Option<&[u8]>, opts: SortOptions) -> usize { + match val { + Some(val) if val.is_empty() 
=> { + out[0] = match opts.descending { + true => !EMPTY_SENTINEL, + false => EMPTY_SENTINEL, + }; + 1 + } + Some(val) => { + let block_count = ceil(val.len(), BLOCK_SIZE); + let end_offset = 1 + block_count * (BLOCK_SIZE + 1); + let to_write = &mut out[..end_offset]; + + // Write `2_u8` to demarcate as non-empty, non-null string + to_write[0] = NON_EMPTY_SENTINEL; + + let chunks = val.chunks_exact(BLOCK_SIZE); + let remainder = chunks.remainder(); + for (input, output) in chunks + .clone() + .zip(to_write[1..].chunks_exact_mut(BLOCK_SIZE + 1)) + { + let input: &[u8; BLOCK_SIZE] = input.try_into().unwrap(); + let out_block: &mut [u8; BLOCK_SIZE] = + (&mut output[..BLOCK_SIZE]).try_into().unwrap(); + + *out_block = *input; + + // Indicate that there are further blocks to follow + output[BLOCK_SIZE] = BLOCK_CONTINUATION; } - Some(val) => { - let block_count = ceil(val.len(), BLOCK_SIZE); - let end_offset = *offset + 1 + block_count * (BLOCK_SIZE + 1); - let to_write = &mut out.buffer[*offset..end_offset]; - - // Write `2_u8` to demarcate as non-empty, non-null string - to_write[0] = NON_EMPTY_SENTINEL; - - let chunks = val.chunks_exact(BLOCK_SIZE); - let remainder = chunks.remainder(); - for (input, output) in chunks - .clone() - .zip(to_write[1..].chunks_exact_mut(BLOCK_SIZE + 1)) - { - let input: &[u8; BLOCK_SIZE] = input.try_into().unwrap(); - let out_block: &mut [u8; BLOCK_SIZE] = - (&mut output[..BLOCK_SIZE]).try_into().unwrap(); - - *out_block = *input; - - // Indicate that there are further blocks to follow - output[BLOCK_SIZE] = BLOCK_CONTINUATION; - } - - if !remainder.is_empty() { - let start_offset = 1 + (block_count - 1) * (BLOCK_SIZE + 1); - to_write[start_offset..start_offset + remainder.len()] - .copy_from_slice(remainder); - *to_write.last_mut().unwrap() = remainder.len() as u8; - } else { - // We must overwrite the continuation marker written by the loop above - *to_write.last_mut().unwrap() = BLOCK_SIZE as u8; - } - - *offset = end_offset; - - if opts.descending { - // Invert bits - to_write.iter_mut().for_each(|v| *v = !*v) - } + + if !remainder.is_empty() { + let start_offset = 1 + (block_count - 1) * (BLOCK_SIZE + 1); + to_write[start_offset..start_offset + remainder.len()] + .copy_from_slice(remainder); + *to_write.last_mut().unwrap() = remainder.len() as u8; + } else { + // We must overwrite the continuation marker written by the loop above + *to_write.last_mut().unwrap() = BLOCK_SIZE as u8; } - None => { - out.buffer[*offset] = null_sentinel(opts); - *offset += 1; + + if opts.descending { + // Invert bits + to_write.iter_mut().for_each(|v| *v = !*v) } + end_offset + } + None => { + out[0] = null_sentinel(opts); + 1 } } } From 2da6aab1b087d121a57567459607e44a8777befe Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Dec 2022 18:42:41 +0000 Subject: [PATCH 0368/1411] Add more ClientConfig Options for Object Store RequestBuilder (#3127) (#3256) * Add more ClientConfig Options (#3127) * Add header support --- object_store/src/client/mod.rs | 151 +++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 2b58a77f2ce3..47e68637b663 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -24,7 +24,9 @@ pub mod pagination; pub mod retry; pub mod token; +use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, ClientBuilder, Proxy}; +use std::time::Duration; fn map_client_error(e: 
reqwest::Error) -> super::Error { super::Error::Generic { @@ -33,11 +35,25 @@ fn map_client_error(e: reqwest::Error) -> super::Error { } } +static DEFAULT_USER_AGENT: &str = + concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"),); + /// HTTP client configuration for remote object stores #[derive(Debug, Clone, Default)] pub struct ClientOptions { + user_agent: Option, + default_headers: Option, proxy_url: Option, allow_http: bool, + timeout: Option, + connect_timeout: Option, + pool_idle_timeout: Option, + pool_max_idle_per_host: Option, + http2_keep_alive_interval: Option, + http2_keep_alive_timeout: Option, + http2_keep_alive_while_idle: bool, + http1_only: bool, + http2_only: bool, } impl ClientOptions { @@ -46,6 +62,20 @@ impl ClientOptions { Default::default() } + /// Sets the User-Agent header to be used by this client + /// + /// Default is based on the version of this crate + pub fn with_user_agent(mut self, agent: HeaderValue) -> Self { + self.user_agent = Some(agent); + self + } + + /// Sets the default headers for every request + pub fn with_default_headers(mut self, headers: HeaderMap) -> Self { + self.default_headers = Some(headers); + self + } + /// Sets what protocol is allowed. If `allow_http` is : /// * false (default): Only HTTPS are allowed /// * true: HTTP and HTTPS are allowed @@ -54,19 +84,140 @@ impl ClientOptions { self } + /// Only use http1 connections + pub fn with_http1_only(mut self) -> Self { + self.http1_only = true; + self + } + + /// Only use http2 connections + pub fn with_http2_only(mut self) -> Self { + self.http2_only = true; + self + } + /// Set an HTTP proxy to use for requests pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { self.proxy_url = Some(proxy_url.into()); self } + /// Set a request timeout + /// + /// The timeout is applied from when the request starts connecting until the + /// response body has finished + pub fn with_timeout(mut self, timeout: Duration) -> Self { + self.timeout = Some(timeout); + self + } + + /// Set a timeout for only the connect phase of a Client + pub fn with_connect_timeout(mut self, timeout: Duration) -> Self { + self.connect_timeout = Some(timeout); + self + } + + /// Set the pool max idle timeout + /// + /// This is the length of time an idle connection will be kept alive + /// + /// Default is 90 seconds + pub fn with_pool_idle_timeout(mut self, timeout: Duration) -> Self { + self.pool_idle_timeout = Some(timeout); + self + } + + /// Set the maximum number of idle connections per host + /// + /// Default is no limit + pub fn with_pool_max_idle_per_host(mut self, max: usize) -> Self { + self.pool_max_idle_per_host = Some(max); + self + } + + /// Sets an interval for HTTP2 Ping frames should be sent to keep a connection alive. + /// + /// Default is disabled + pub fn with_http2_keep_alive_interval(mut self, interval: Duration) -> Self { + self.http2_keep_alive_interval = Some(interval); + self + } + + /// Sets a timeout for receiving an acknowledgement of the keep-alive ping. + /// + /// If the ping is not acknowledged within the timeout, the connection will be closed. + /// Does nothing if http2_keep_alive_interval is disabled. + /// + /// Default is disabled + pub fn with_http2_keep_alive_timeout(mut self, interval: Duration) -> Self { + self.http2_keep_alive_timeout = Some(interval); + self + } + + /// Enable HTTP2 keep alive pings for idle connections + /// + /// If disabled, keep-alive pings are only sent while there are open request/response + /// streams. 
If enabled, pings are also sent when no streams are active + /// + /// Default is disabled + pub fn with_http2_keep_alive_while_idle(mut self) -> Self { + self.http2_keep_alive_while_idle = true; + self + } + pub(crate) fn client(&self) -> super::Result { let mut builder = ClientBuilder::new(); + + match &self.user_agent { + Some(user_agent) => builder = builder.user_agent(user_agent), + None => builder = builder.user_agent(DEFAULT_USER_AGENT), + } + + if let Some(headers) = &self.default_headers { + builder = builder.default_headers(headers.clone()) + } + if let Some(proxy) = &self.proxy_url { let proxy = Proxy::all(proxy).map_err(map_client_error)?; builder = builder.proxy(proxy); } + if let Some(timeout) = self.timeout { + builder = builder.timeout(timeout) + } + + if let Some(timeout) = self.connect_timeout { + builder = builder.connect_timeout(timeout) + } + + if let Some(timeout) = self.pool_idle_timeout { + builder = builder.pool_idle_timeout(timeout) + } + + if let Some(max) = self.pool_max_idle_per_host { + builder = builder.pool_max_idle_per_host(max) + } + + if let Some(interval) = self.http2_keep_alive_interval { + builder = builder.http2_keep_alive_interval(interval) + } + + if let Some(interval) = self.http2_keep_alive_timeout { + builder = builder.http2_keep_alive_timeout(interval) + } + + if self.http2_keep_alive_while_idle { + builder = builder.http2_keep_alive_while_idle(true) + } + + if self.http1_only { + builder = builder.http1_only() + } + + if self.http2_only { + builder = builder.http2_prior_knowledge() + } + builder .https_only(!self.allow_http) .build() From 9abdb5572bcaf4d594514d58410a812f3a490c55 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 2 Dec 2022 10:49:46 -0800 Subject: [PATCH 0369/1411] Make arithmetic kernels supports dictionary of decimal array (#3255) --- arrow/src/compute/kernels/arithmetic.rs | 80 ++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index c57e27095c23..cafd63620571 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -35,8 +35,8 @@ use crate::datatypes::{ }; #[cfg(feature = "dyn_arith_dict")] use crate::datatypes::{ - Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, - UInt32Type, UInt64Type, UInt8Type, + Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type, + Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; use crate::error::{ArrowError, Result}; use crate::{datatypes, downcast_primitive_array}; @@ -461,6 +461,14 @@ macro_rules! 
typed_dict_op { let array = $MATH_OP::<$KT, Float64Type, _>($LEFT, $RIGHT, $OP)?; Ok(Arc::new(array)) } + (DataType::Decimal128(_, s1), DataType::Decimal128(_, s2)) if s1 == s2 => { + let array = $MATH_OP::<$KT, Decimal128Type, _>($LEFT, $RIGHT, $OP)?; + Ok(Arc::new(array)) + } + (DataType::Decimal256(_, s1), DataType::Decimal256(_, s2)) if s1 == s2 => { + let array = $MATH_OP::<$KT, Decimal256Type, _>($LEFT, $RIGHT, $OP)?; + Ok(Arc::new(array)) + } (t1, t2) => Err(ArrowError::CastError(format!( "Cannot perform arithmetic operation on two dictionary arrays of different value types ({} and {})", t1, t2 @@ -3150,4 +3158,72 @@ mod tests { let overflow = try_unary_mut(a, |value| value.add_checked(1)); let _ = overflow.unwrap().expect_err("overflow should be detected"); } + + #[test] + #[cfg(feature = "dyn_arith_dict")] + fn test_dict_decimal() { + let values = Decimal128Array::from_iter_values([0, 1, 2, 3, 4, 5]); + let keys = Int8Array::from_iter_values([1_i8, 2, 5, 4, 3, 0]); + let array1 = DictionaryArray::try_new(&keys, &values).unwrap(); + + let values = Decimal128Array::from_iter_values([7, -3, 4, 3, 5]); + let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); + let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + + let result = add_dyn(&array1, &array2).unwrap(); + let expected = + Arc::new(Decimal128Array::from(vec![8, 9, 2, 8, 6, 5])) as ArrayRef; + assert_eq!(&result, &expected); + + let result = subtract_dyn(&array1, &array2).unwrap(); + let expected = + Arc::new(Decimal128Array::from(vec![-6, -5, 8, 0, 0, -5])) as ArrayRef; + assert_eq!(&result, &expected); + + let values = Decimal256Array::from_iter_values([ + i256::from_i128(0), + i256::from_i128(1), + i256::from_i128(2), + i256::from_i128(3), + i256::from_i128(4), + i256::from_i128(5), + ]); + let keys = + Int8Array::from(vec![Some(1_i8), None, Some(5), Some(4), Some(3), None]); + let array1 = DictionaryArray::try_new(&keys, &values).unwrap(); + + let values = Decimal256Array::from_iter_values([ + i256::from_i128(7), + i256::from_i128(-3), + i256::from_i128(4), + i256::from_i128(3), + i256::from_i128(5), + ]); + let keys = + Int8Array::from(vec![Some(0_i8), Some(0), None, Some(2), Some(3), Some(4)]); + let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + + let result = add_dyn(&array1, &array2).unwrap(); + let expected = Arc::new(Decimal256Array::from(vec![ + Some(i256::from_i128(8)), + None, + None, + Some(i256::from_i128(8)), + Some(i256::from_i128(6)), + None, + ])) as ArrayRef; + + assert_eq!(&result, &expected); + + let result = subtract_dyn(&array1, &array2).unwrap(); + let expected = Arc::new(Decimal256Array::from(vec![ + Some(i256::from_i128(-6)), + None, + None, + Some(i256::from_i128(0)), + Some(i256::from_i128(0)), + None, + ])) as ArrayRef; + assert_eq!(&result, &expected); + } } From ecbb8c23765ff6530ca32b5b3139713d6aaebfed Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Dec 2022 19:30:11 +0000 Subject: [PATCH 0370/1411] Add BooleanArray::from_unary and BooleanArray::from_binary (#3258) * Add BooleanArray::from_unary and BooleanArray::from_binary * Add docs * Tweak signatures * Remove fallibility from combine_option_bitmap * Remove unused compare_option_bitmap * Remove fallibility * Fix doc --- arrow-array/src/array/boolean_array.rs | 87 +++++++ arrow-data/src/bit_mask.rs | 141 +++++++++++ arrow/src/compute/kernels/arithmetic.rs | 8 +- arrow/src/compute/kernels/arity.rs | 10 +- arrow/src/compute/kernels/boolean.rs | 4 +- 
arrow/src/compute/kernels/comparison.rs | 56 +---- arrow/src/compute/kernels/concat_elements.rs | 6 +- arrow/src/compute/mod.rs | 2 - arrow/src/compute/util.rs | 243 ------------------- 9 files changed, 250 insertions(+), 307 deletions(-) delete mode 100644 arrow/src/compute/util.rs diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index e166f467a70c..920fdabc2c71 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -20,6 +20,7 @@ use crate::iterator::BooleanIter; use crate::raw_pointer::RawPtrBox; use crate::{print_long_array, Array, ArrayAccessor}; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; +use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; @@ -173,6 +174,92 @@ impl BooleanArray { ) -> impl Iterator> + 'a { indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index))) } + + /// Create a [`BooleanArray`] by evaluating the operation for + /// each element of the provided array + /// + /// ``` + /// # use arrow_array::{BooleanArray, Int32Array}; + /// + /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); + /// let r = BooleanArray::from_unary(&array, |x| x > 2); + /// assert_eq!(&r, &BooleanArray::from(vec![false, false, true, true, true])); + /// ``` + pub fn from_unary(left: T, mut op: F) -> Self + where + F: FnMut(T::Item) -> bool, + { + let null_bit_buffer = left + .data() + .null_buffer() + .map(|b| b.bit_slice(left.offset(), left.len())); + + let buffer = MutableBuffer::collect_bool(left.len(), |i| unsafe { + // SAFETY: i in range 0..len + op(left.value_unchecked(i)) + }); + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![Buffer::from(buffer)], + vec![], + ) + }; + Self::from(data) + } + + /// Create a [`BooleanArray`] by evaluating the binary operation for + /// each element of the provided arrays + /// + /// ``` + /// # use arrow_array::{BooleanArray, Int32Array}; + /// + /// let a = Int32Array::from(vec![1, 2, 3, 4, 5]); + /// let b = Int32Array::from(vec![1, 2, 0, 2, 5]); + /// let r = BooleanArray::from_binary(&a, &b, |a, b| a == b); + /// assert_eq!(&r, &BooleanArray::from(vec![true, true, false, false, true])); + /// ``` + /// + /// # Panics + /// + /// This function panics if left and right are not the same length + /// + pub fn from_binary( + left: T, + right: S, + mut op: F, + ) -> Self + where + F: FnMut(T::Item, S::Item) -> bool, + { + assert_eq!(left.len(), right.len()); + + let null_bit_buffer = + combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len()); + + let buffer = MutableBuffer::collect_bool(left.len(), |i| unsafe { + // SAFETY: i in range 0..len + op(left.value_unchecked(i), right.value_unchecked(i)) + }); + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![Buffer::from(buffer)], + vec![], + ) + }; + Self::from(data) + } } impl Array for BooleanArray { diff --git a/arrow-data/src/bit_mask.rs b/arrow-data/src/bit_mask.rs index 6a0a46038992..ed8e65257788 100644 --- a/arrow-data/src/bit_mask.rs +++ b/arrow-data/src/bit_mask.rs @@ -17,8 +17,11 @@ //! 
Utils for working with packed bit masks +use crate::ArrayData; use arrow_buffer::bit_chunk_iterator::BitChunks; use arrow_buffer::bit_util::{ceil, get_bit, set_bit}; +use arrow_buffer::buffer::buffer_bin_and; +use arrow_buffer::Buffer; /// Sets all bits on `write_data` in the range `[offset_write..offset_write+len]` to be equal to the /// bits in `data` in the range `[offset_read..offset_read+len]` @@ -62,9 +65,41 @@ pub fn set_bits( null_count as usize } +/// Combines the null bitmaps of multiple arrays using a bitwise `and` operation. +/// +/// This function is useful when implementing operations on higher level arrays. +pub fn combine_option_bitmap( + arrays: &[&ArrayData], + len_in_bits: usize, +) -> Option { + let (buffer, offset) = arrays + .iter() + .map(|array| (array.null_buffer().cloned(), array.offset())) + .reduce(|acc, buffer_and_offset| match (acc, buffer_and_offset) { + ((None, _), (None, _)) => (None, 0), + ((Some(buffer), offset), (None, _)) | ((None, _), (Some(buffer), offset)) => { + (Some(buffer), offset) + } + ((Some(buffer_left), offset_left), (Some(buffer_right), offset_right)) => ( + Some(buffer_bin_and( + &buffer_left, + offset_left, + &buffer_right, + offset_right, + len_in_bits, + )), + 0, + ), + })?; + + Some(buffer?.bit_slice(offset, len_in_bits)) +} + #[cfg(test)] mod tests { use super::*; + use arrow_schema::DataType; + use std::sync::Arc; #[test] fn test_set_bits_aligned() { @@ -187,4 +222,110 @@ mod tests { assert_eq!(destination, expected_data); assert_eq!(result, expected_null_count); } + + fn make_data_with_null_bit_buffer( + len: usize, + offset: usize, + null_bit_buffer: Option, + ) -> Arc { + let buffer = Buffer::from(&vec![11; len + offset]); + + Arc::new( + ArrayData::try_new( + DataType::UInt8, + len, + null_bit_buffer, + offset, + vec![buffer], + vec![], + ) + .unwrap(), + ) + } + + #[test] + fn test_combine_option_bitmap() { + let none_bitmap = make_data_with_null_bit_buffer(8, 0, None); + let some_bitmap = + make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b01001010]))); + let inverse_bitmap = + make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b10110101]))); + let some_other_bitmap = + make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b11010111]))); + assert_eq!(None, combine_option_bitmap(&[], 8)); + assert_eq!( + Some(Buffer::from([0b01001010])), + combine_option_bitmap(&[&some_bitmap], 8) + ); + assert_eq!( + None, + combine_option_bitmap(&[&none_bitmap, &none_bitmap], 8) + ); + assert_eq!( + Some(Buffer::from([0b01001010])), + combine_option_bitmap(&[&some_bitmap, &none_bitmap], 8) + ); + assert_eq!( + Some(Buffer::from([0b11010111])), + combine_option_bitmap(&[&none_bitmap, &some_other_bitmap], 8) + ); + assert_eq!( + Some(Buffer::from([0b01001010])), + combine_option_bitmap(&[&some_bitmap, &some_bitmap], 8,) + ); + assert_eq!( + Some(Buffer::from([0b0])), + combine_option_bitmap(&[&some_bitmap, &inverse_bitmap], 8,) + ); + assert_eq!( + Some(Buffer::from([0b01000010])), + combine_option_bitmap(&[&some_bitmap, &some_other_bitmap, &none_bitmap], 8,) + ); + assert_eq!( + Some(Buffer::from([0b00001001])), + combine_option_bitmap( + &[ + &some_bitmap.slice(3, 5), + &inverse_bitmap.slice(2, 5), + &some_other_bitmap.slice(1, 5) + ], + 5, + ) + ); + } + + #[test] + fn test_combine_option_bitmap_with_offsets() { + let none_bitmap = make_data_with_null_bit_buffer(8, 0, None); + let bitmap0 = + make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b10101010]))); + let bitmap1 = + make_data_with_null_bit_buffer(8, 1, 
Some(Buffer::from([0b01010100, 0b1]))); + let bitmap2 = + make_data_with_null_bit_buffer(8, 2, Some(Buffer::from([0b10101000, 0b10]))); + assert_eq!( + Some(Buffer::from([0b10101010])), + combine_option_bitmap(&[&bitmap1], 8) + ); + assert_eq!( + Some(Buffer::from([0b10101010])), + combine_option_bitmap(&[&bitmap2], 8) + ); + assert_eq!( + Some(Buffer::from([0b10101010])), + combine_option_bitmap(&[&bitmap1, &none_bitmap], 8) + ); + assert_eq!( + Some(Buffer::from([0b10101010])), + combine_option_bitmap(&[&none_bitmap, &bitmap2], 8) + ); + assert_eq!( + Some(Buffer::from([0b10101010])), + combine_option_bitmap(&[&bitmap0, &bitmap1], 8) + ); + assert_eq!( + Some(Buffer::from([0b10101010])), + combine_option_bitmap(&[&bitmap1, &bitmap2], 8) + ); + } } diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index cafd63620571..23cefe48e2c8 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -310,10 +310,10 @@ where } // Create the combined `Bitmap` - let null_bit_buffer = crate::compute::util::combine_option_bitmap( + let null_bit_buffer = arrow_data::bit_mask::combine_option_bitmap( &[left.data_ref(), right.data_ref()], left.len(), - )?; + ); let lanes = T::lanes(); let buffer_size = left.len() * std::mem::size_of::(); @@ -660,10 +660,10 @@ where ))); } - let null_bit_buffer = crate::compute::util::combine_option_bitmap( + let null_bit_buffer = arrow_data::bit_mask::combine_option_bitmap( &[left.data_ref(), right.data_ref()], left.len(), - )?; + ); // Safety justification: Since the inputs are valid Arrow arrays, all values are // valid indexes into the dictionary (which is verified during construction) diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index d0f18cf5866d..6207ab63935d 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -22,12 +22,12 @@ use crate::array::{ PrimitiveArray, }; use crate::buffer::Buffer; -use crate::compute::util::combine_option_bitmap; use crate::datatypes::{ArrowNumericType, ArrowPrimitiveType}; use crate::downcast_dictionary_array; use crate::error::{ArrowError, Result}; use crate::util::bit_iterator::try_for_each_valid_idx; use arrow_buffer::MutableBuffer; +use arrow_data::bit_mask::combine_option_bitmap; use std::sync::Arc; #[inline] @@ -215,7 +215,7 @@ where return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); } - let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); + let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len); let null_count = null_buffer .as_ref() .map(|x| len - x.count_set_bits_offset(0, len)) @@ -275,7 +275,7 @@ where let len = a.len(); - let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); + let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len); let null_count = null_buffer .as_ref() .map(|x| len - x.count_set_bits_offset(0, len)) @@ -333,7 +333,7 @@ where if a.null_count() == 0 && b.null_count() == 0 { try_binary_no_nulls(len, a, b, op) } else { - let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); + let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len); let null_count = null_buffer .as_ref() @@ -401,7 +401,7 @@ where if a.null_count() == 0 && b.null_count() == 0 { try_binary_no_nulls_mut(len, a, b, op) } else { - let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); + let null_buffer = combine_option_bitmap(&[a.data(), 
b.data()], len); let null_count = null_buffer .as_ref() .map(|x| len - x.count_set_bits_offset(0, len)) diff --git a/arrow/src/compute/kernels/boolean.rs b/arrow/src/compute/kernels/boolean.rs index 1b33fa19ea02..aa42f3d20c03 100644 --- a/arrow/src/compute/kernels/boolean.rs +++ b/arrow/src/compute/kernels/boolean.rs @@ -29,10 +29,10 @@ use crate::buffer::{ bitwise_bin_op_helper, bitwise_quaternary_op_helper, buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer, }; -use crate::compute::util::combine_option_bitmap; use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; use crate::util::bit_util::ceil; +use arrow_data::bit_mask::combine_option_bitmap; /// Updates null buffer based on data buffer and null buffer of the operand at other side /// in boolean AND kernel with Kleene logic. In short, because for AND kernel, null AND false @@ -108,7 +108,7 @@ pub(crate) fn build_null_buffer_for_and_or( len_in_bits: usize, ) -> Option { // `arrays` are not empty, so safely do `unwrap` directly. - combine_option_bitmap(&[left_data, right_data], len_in_bits).unwrap() + combine_option_bitmap(&[left_data, right_data], len_in_bits) } /// Updates null buffer based on data buffer and null buffer of the operand at other side diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 33a24500aabd..b672410fec15 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -25,12 +25,12 @@ use crate::array::*; use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer}; -use crate::compute::util::combine_option_bitmap; use crate::datatypes::*; #[allow(unused_imports)] use crate::downcast_dictionary_array; use crate::error::{ArrowError, Result}; use crate::util::bit_util; +use arrow_data::bit_mask::combine_option_bitmap; use arrow_select::take::take; use num::ToPrimitive; use regex::Regex; @@ -53,26 +53,7 @@ where )); } - let null_bit_buffer = - combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len())?; - - let buffer = MutableBuffer::collect_bool(left.len(), |i| unsafe { - // SAFETY: i in range 0..len - op(left.value_unchecked(i), right.value_unchecked(i)) - }); - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![Buffer::from(buffer)], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + Ok(BooleanArray::from_binary(left, right, op)) } /// Helper function to perform boolean lambda function on values from array accessor, this @@ -81,28 +62,7 @@ fn compare_op_scalar(left: T, op: F) -> Result bool, { - let null_bit_buffer = left - .data() - .null_buffer() - .map(|b| b.bit_slice(left.offset(), left.len())); - - let buffer = MutableBuffer::collect_bool(left.len(), |i| unsafe { - // SAFETY: i in range 0..len - op(left.value_unchecked(i)) - }); - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![Buffer::from(buffer)], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + Ok(BooleanArray::from_unary(left, op)) } /// Evaluate `op(left, right)` for [`PrimitiveArray`]s using a specified @@ -158,7 +118,7 @@ where } let null_bit_buffer = - combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len())?; + combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len()); let mut result = BooleanBufferBuilder::new(left.len()); for i in 0..left.len() { @@ -1172,7 +1132,7 @@ pub fn regexp_is_match_utf8( )); } let null_bit_buffer = - 
combine_option_bitmap(&[array.data_ref(), regex_array.data_ref()], array.len())?; + combine_option_bitmap(&[array.data_ref(), regex_array.data_ref()], array.len()); let mut patterns: HashMap = HashMap::new(); let mut result = BooleanBufferBuilder::new(array.len()); @@ -2294,7 +2254,7 @@ where } let null_bit_buffer = - combine_option_bitmap(&[left.data_ref(), right.data_ref()], len)?; + combine_option_bitmap(&[left.data_ref(), right.data_ref()], len); // we process the data in chunks so that each iteration results in one u64 of comparison result bits const CHUNK_SIZE: usize = 64; @@ -3701,7 +3661,7 @@ where let num_bytes = bit_util::ceil(left_len, 8); let not_both_null_bit_buffer = - match combine_option_bitmap(&[left.data_ref(), right.data_ref()], left_len)? { + match combine_option_bitmap(&[left.data_ref(), right.data_ref()], left_len) { Some(buff) => buff, None => new_all_set_buffer(num_bytes), }; @@ -3758,7 +3718,7 @@ where let num_bytes = bit_util::ceil(left_len, 8); let not_both_null_bit_buffer = - match combine_option_bitmap(&[left.data_ref(), right.data_ref()], left_len)? { + match combine_option_bitmap(&[left.data_ref(), right.data_ref()], left_len) { Some(buff) => buff, None => new_all_set_buffer(num_bytes), }; diff --git a/arrow/src/compute/kernels/concat_elements.rs b/arrow/src/compute/kernels/concat_elements.rs index a908ba9ab5d8..25c8f60de3f6 100644 --- a/arrow/src/compute/kernels/concat_elements.rs +++ b/arrow/src/compute/kernels/concat_elements.rs @@ -16,8 +16,8 @@ // under the License. use crate::array::*; -use crate::compute::util::combine_option_bitmap; use crate::error::{ArrowError, Result}; +use arrow_data::bit_mask::combine_option_bitmap; /// Returns the elementwise concatenation of a [`StringArray`]. /// @@ -45,7 +45,7 @@ pub fn concat_elements_utf8( ))); } - let output_bitmap = combine_option_bitmap(&[left.data(), right.data()], left.len())?; + let output_bitmap = combine_option_bitmap(&[left.data(), right.data()], left.len()); let left_offsets = left.value_offsets(); let right_offsets = right.value_offsets(); @@ -111,7 +111,7 @@ pub fn concat_elements_utf8_many( .collect::>() .as_slice(), size, - )?; + ); let data_values = arrays .iter() diff --git a/arrow/src/compute/mod.rs b/arrow/src/compute/mod.rs index 28e5e6b520bc..c0b10afe48a6 100644 --- a/arrow/src/compute/mod.rs +++ b/arrow/src/compute/mod.rs @@ -19,8 +19,6 @@ pub mod kernels; -mod util; - pub use self::kernels::aggregate::*; pub use self::kernels::arithmetic::*; pub use self::kernels::arity::*; diff --git a/arrow/src/compute/util.rs b/arrow/src/compute/util.rs deleted file mode 100644 index 9ddc535017ff..000000000000 --- a/arrow/src/compute/util.rs +++ /dev/null @@ -1,243 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Common utilities for computation kernels. - -use crate::array::*; -use crate::buffer::{buffer_bin_and, Buffer}; -use crate::error::{ArrowError, Result}; - -/// Combines the null bitmaps of multiple arrays using a bitwise `and` operation. -/// -/// This function is useful when implementing operations on higher level arrays. -#[allow(clippy::unnecessary_wraps)] -pub(super) fn combine_option_bitmap( - arrays: &[&ArrayData], - len_in_bits: usize, -) -> Result> { - arrays - .iter() - .map(|array| (array.null_buffer().cloned(), array.offset())) - .reduce(|acc, buffer_and_offset| match (acc, buffer_and_offset) { - ((None, _), (None, _)) => (None, 0), - ((Some(buffer), offset), (None, _)) | ((None, _), (Some(buffer), offset)) => { - (Some(buffer), offset) - } - ((Some(buffer_left), offset_left), (Some(buffer_right), offset_right)) => ( - Some(buffer_bin_and( - &buffer_left, - offset_left, - &buffer_right, - offset_right, - len_in_bits, - )), - 0, - ), - }) - .map_or( - Err(ArrowError::ComputeError( - "Arrays must not be empty".to_string(), - )), - |(buffer, offset)| { - Ok(buffer.map(|buffer| buffer.bit_slice(offset, len_in_bits))) - }, - ) -} - -#[cfg(test)] -pub(super) mod tests { - use super::*; - - use std::sync::Arc; - - use crate::array::ArrayData; - use crate::buffer::buffer_bin_or; - use crate::datatypes::DataType; - - /// Compares the null bitmaps of two arrays using a bitwise `or` operation. - /// - /// This function is useful when implementing operations on higher level arrays. - pub(super) fn compare_option_bitmap( - left_data: &ArrayData, - right_data: &ArrayData, - len_in_bits: usize, - ) -> Result> { - let left_offset_in_bits = left_data.offset(); - let right_offset_in_bits = right_data.offset(); - - let left = left_data.null_buffer(); - let right = right_data.null_buffer(); - - match left { - None => match right { - None => Ok(None), - Some(r) => Ok(Some(r.bit_slice(right_offset_in_bits, len_in_bits))), - }, - Some(l) => match right { - None => Ok(Some(l.bit_slice(left_offset_in_bits, len_in_bits))), - - Some(r) => Ok(Some(buffer_bin_or( - l, - left_offset_in_bits, - r, - right_offset_in_bits, - len_in_bits, - ))), - }, - } - } - - fn make_data_with_null_bit_buffer( - len: usize, - offset: usize, - null_bit_buffer: Option, - ) -> Arc { - let buffer = Buffer::from(&vec![11; len + offset]); - - Arc::new( - ArrayData::try_new( - DataType::UInt8, - len, - null_bit_buffer, - offset, - vec![buffer], - vec![], - ) - .unwrap(), - ) - } - - #[test] - fn test_combine_option_bitmap() { - let none_bitmap = make_data_with_null_bit_buffer(8, 0, None); - let some_bitmap = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b01001010]))); - let inverse_bitmap = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b10110101]))); - let some_other_bitmap = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b11010111]))); - assert_eq!( - combine_option_bitmap(&[], 8).unwrap_err().to_string(), - "Compute error: Arrays must not be empty", - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - combine_option_bitmap(&[&some_bitmap], 8).unwrap() - ); - assert_eq!( - None, - combine_option_bitmap(&[&none_bitmap, &none_bitmap], 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - combine_option_bitmap(&[&some_bitmap, &none_bitmap], 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b11010111])), - combine_option_bitmap(&[&none_bitmap, &some_other_bitmap], 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - combine_option_bitmap(&[&some_bitmap, 
&some_bitmap], 8,).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b0])), - combine_option_bitmap(&[&some_bitmap, &inverse_bitmap], 8,).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b01000010])), - combine_option_bitmap(&[&some_bitmap, &some_other_bitmap, &none_bitmap], 8,) - .unwrap() - ); - assert_eq!( - Some(Buffer::from([0b00001001])), - combine_option_bitmap( - &[ - &some_bitmap.slice(3, 5), - &inverse_bitmap.slice(2, 5), - &some_other_bitmap.slice(1, 5) - ], - 5, - ) - .unwrap() - ); - } - - #[test] - fn test_combine_option_bitmap_with_offsets() { - let none_bitmap = make_data_with_null_bit_buffer(8, 0, None); - let bitmap0 = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b10101010]))); - let bitmap1 = - make_data_with_null_bit_buffer(8, 1, Some(Buffer::from([0b01010100, 0b1]))); - let bitmap2 = - make_data_with_null_bit_buffer(8, 2, Some(Buffer::from([0b10101000, 0b10]))); - assert_eq!( - Some(Buffer::from([0b10101010])), - combine_option_bitmap(&[&bitmap1], 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b10101010])), - combine_option_bitmap(&[&bitmap2], 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b10101010])), - combine_option_bitmap(&[&bitmap1, &none_bitmap], 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b10101010])), - combine_option_bitmap(&[&none_bitmap, &bitmap2], 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b10101010])), - combine_option_bitmap(&[&bitmap0, &bitmap1], 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b10101010])), - combine_option_bitmap(&[&bitmap1, &bitmap2], 8).unwrap() - ); - } - - #[test] - fn test_compare_option_bitmap() { - let none_bitmap = make_data_with_null_bit_buffer(8, 0, None); - let some_bitmap = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b01001010]))); - let inverse_bitmap = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b10110101]))); - assert_eq!( - None, - compare_option_bitmap(&none_bitmap, &none_bitmap, 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - compare_option_bitmap(&some_bitmap, &none_bitmap, 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - compare_option_bitmap(&none_bitmap, &some_bitmap, 8,).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - compare_option_bitmap(&some_bitmap, &some_bitmap, 8,).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b11111111])), - compare_option_bitmap(&some_bitmap, &inverse_bitmap, 8,).unwrap() - ); - } -} From 74b174ca7a6c11d2b410334f45aa399987b35fc0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 2 Dec 2022 15:19:19 -0500 Subject: [PATCH 0371/1411] Update object_store version to 0.5.2 and add CHANGELOG (#3253) * Update object_store crate version to 0.5.2 * Initial changelog * Updates * More update * Update Changlog, semi manually * final updates --- object_store/CHANGELOG-old.md | 29 +++++++++++++++++- object_store/CHANGELOG.md | 30 ++++++++++--------- object_store/Cargo.toml | 2 +- object_store/dev/release/update_change_log.sh | 8 ++--- 4 files changed, 49 insertions(+), 20 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index bf1ef6219fc2..1397d8a8e3d0 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -19,6 +19,34 @@ # Historical Changelog +# Changelog + +## [object_store_0.5.1](https://github.com/apache/arrow-rs/tree/object_store_0.5.1) (2022-10-04) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.0...object_store_0.5.1) + +**Implemented enhancements:** + +- 
Allow HTTP S3 URLs [\#2806](https://github.com/apache/arrow-rs/issues/2806) +- object\_store: support AWS ECS instance credentials [\#2802](https://github.com/apache/arrow-rs/issues/2802) +- Object Store S3 Alibaba Cloud OSS support [\#2777](https://github.com/apache/arrow-rs/issues/2777) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Expose option to use GCS object store in integration tests [\#2627](https://github.com/apache/arrow-rs/issues/2627) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- S3 Signature Error Performing List With Prefix Containing Spaces [\#2800](https://github.com/apache/arrow-rs/issues/2800) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Erratic Behaviour if Incorrect S3 Region Configured [\#2795](https://github.com/apache/arrow-rs/issues/2795) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Support for overriding instance metadata endpoint [\#2811](https://github.com/apache/arrow-rs/pull/2811) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- Allow Configuring non-TLS HTTP Connections in AmazonS3Builder::from\_env [\#2807](https://github.com/apache/arrow-rs/pull/2807) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Fix S3 query canonicalization \(\#2800\) [\#2801](https://github.com/apache/arrow-rs/pull/2801) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Handle incomplete HTTP redirects missing LOCATION \(\#2795\) [\#2796](https://github.com/apache/arrow-rs/pull/2796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Handle S3 virtual host request type [\#2782](https://github.com/apache/arrow-rs/pull/2782) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([askoa](https://github.com/askoa)) +- Fix object\_store multipart uploads on S3 Compatible Stores [\#2731](https://github.com/apache/arrow-rs/pull/2731) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mildbyte](https://github.com/mildbyte)) + + ## [object_store_0.5.0](https://github.com/apache/arrow-rs/tree/object_store_0.5.0) (2022-09-08) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.4.0...object_store_0.5.0) @@ -105,4 +133,3 @@ - Increase upper wait time to reduce flakyness of object store test [\#2142](https://github.com/apache/arrow-rs/pull/2142) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* - diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index 6919111099fd..528d649df5e7 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,30 +19,32 @@ # Changelog -## [object_store_0.5.1](https://github.com/apache/arrow-rs/tree/object_store_0.5.1) (2022-10-04) +## [object_store_0.5.2](https://github.com/apache/arrow-rs/tree/object_store_0.5.2) (2022-12-02) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.0...object_store_0.5.1) +[Full 
Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.1...object_store_0.5.2) **Implemented enhancements:** -- Allow HTTP S3 URLs [\#2806](https://github.com/apache/arrow-rs/issues/2806) -- object\_store: support AWS ECS instance credentials [\#2802](https://github.com/apache/arrow-rs/issues/2802) -- Object Store S3 Alibaba Cloud OSS support [\#2777](https://github.com/apache/arrow-rs/issues/2777) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Expose option to use GCS object store in integration tests [\#2627](https://github.com/apache/arrow-rs/issues/2627) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Object Store: Allow custom reqwest client [\#3127](https://github.com/apache/arrow-rs/issues/3127) +- socks5 proxy support for the object\_store crate [\#2989](https://github.com/apache/arrow-rs/issues/2989) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Cannot query S3 paths containing whitespace [\#2799](https://github.com/apache/arrow-rs/issues/2799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- S3 Signature Error Performing List With Prefix Containing Spaces [\#2800](https://github.com/apache/arrow-rs/issues/2800) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Erratic Behaviour if Incorrect S3 Region Configured [\#2795](https://github.com/apache/arrow-rs/issues/2795) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store\(gcp\): GCP complains about content-length for copy [\#3235](https://github.com/apache/arrow-rs/issues/3235) +- object\_store\(aws\): EntityTooSmall error on multi-part upload [\#3233](https://github.com/apache/arrow-rs/issues/3233) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Support for overriding instance metadata endpoint [\#2811](https://github.com/apache/arrow-rs/pull/2811) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) -- Allow Configuring non-TLS HTTP Connections in AmazonS3Builder::from\_env [\#2807](https://github.com/apache/arrow-rs/pull/2807) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avantgardnerio](https://github.com/avantgardnerio)) -- Fix S3 query canonicalization \(\#2800\) [\#2801](https://github.com/apache/arrow-rs/pull/2801) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Handle incomplete HTTP redirects missing LOCATION \(\#2795\) [\#2796](https://github.com/apache/arrow-rs/pull/2796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Handle S3 virtual host request type [\#2782](https://github.com/apache/arrow-rs/pull/2782) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([askoa](https://github.com/askoa)) -- Fix object\_store multipart uploads on S3 Compatible Stores [\#2731](https://github.com/apache/arrow-rs/pull/2731) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mildbyte](https://github.com/mildbyte)) +- Add more ClientConfig Options for Object Store RequestBuilder \(\#3127\) [\#3256](https://github.com/apache/arrow-rs/pull/3256) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore ClientConfig 
[\#3252](https://github.com/apache/arrow-rs/pull/3252) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- fix\(object\_store,gcp\): test copy\_if\_not\_exist [\#3236](https://github.com/apache/arrow-rs/pull/3236) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- fix\(object\_store,aws,gcp\): multipart upload enforce size limit of 5 MiB not 5MB [\#3234](https://github.com/apache/arrow-rs/pull/3234) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- object\_store: add support for using proxy\_url for connection testing [\#3109](https://github.com/apache/arrow-rs/pull/3109) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sum12](https://github.com/sum12)) +- Update AWS SDK [\#2974](https://github.com/apache/arrow-rs/pull/2974) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update quick-xml requirement from 0.25.0 to 0.26.0 [\#2918](https://github.com/apache/arrow-rs/pull/2918) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support building object_store and parquet on wasm32-unknown-unknown target [\#2896](https://github.com/apache/arrow-rs/pull/2899) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jondo2010](https://github.com/jondo2010)) +- Add experimental AWS\_PROFILE support \(\#2178\) [\#2891](https://github.com/apache/arrow-rs/pull/2891) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index fd7442f9e84a..9b1dee5c54ab 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.1" +version = "0.5.2" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index 865acdeb0286..cf070d3c5dcd 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.0" -FUTURE_RELEASE="object_store_0.5.1" +SINCE_TAG="object_store_0.5.1" +FUTURE_RELEASE="object_store_0.5.2" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" @@ -49,8 +49,8 @@ docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$CHANGELOG_GITHUB_TOKEN" -v "$(pw --cache-file=.githubchangeloggenerator.cache \ --cache-log=.githubchangeloggenerator.cache.log \ --http-cache \ - --max-issues=300 \ - --exclude-tags-regex "^\d+\.\d+\.\d+$" \ + --max-issues=600 \ + --exclude-tags-regex "(^\d+\.\d+\.\d+$)|(rc)" \ --since-tag ${SINCE_TAG} \ --future-release ${FUTURE_RELEASE} From f6bd9b5b9944955c1bf4cb6e4aaf21f8f249ddba Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Dec 2022 22:16:18 +0000 Subject: [PATCH 0372/1411] Minor: Remove parquet build script (#3257) * Remove parquet build script * Format --- parquet/Cargo.toml | 1 - parquet/build.rs | 24 ------------------------ parquet/src/file/properties.rs | 3 ++- 3 files changed, 2 insertions(+), 26 deletions(-) delete mode 100644 parquet/build.rs diff --git 
a/parquet/Cargo.toml b/parquet/Cargo.toml index b2d878dd5930..eecfd55b9098 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -25,7 +25,6 @@ repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] keywords = ["arrow", "parquet", "hadoop"] readme = "README.md" -build = "build.rs" edition = "2021" rust-version = "1.62" diff --git a/parquet/build.rs b/parquet/build.rs deleted file mode 100644 index 8aada1835ce1..000000000000 --- a/parquet/build.rs +++ /dev/null @@ -1,24 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -fn main() { - // Set Parquet version and "created by" string. - let version = env!("CARGO_PKG_VERSION"); - let created_by = format!("parquet-rs version {}", version); - println!("cargo:rustc-env=PARQUET_VERSION={}", version); - println!("cargo:rustc-env=PARQUET_CREATED_BY={}", created_by); -} diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index c8083fcf30fa..ae13eff201bd 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -81,7 +81,8 @@ const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE; const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page; const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; -const DEFAULT_CREATED_BY: &str = env!("PARQUET_CREATED_BY"); +const DEFAULT_CREATED_BY: &str = + concat!("parquet-rs version ", env!("CARGO_PKG_VERSION")); /// default value for the false positive probability used in a bloom filter. pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05; /// default value for the expected number of distinct values used in a bloom filter. 
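The patch above swaps a build-script-provided environment variable for a purely compile-time expression. A minimal, self-contained sketch of that pattern follows; it assumes only what Cargo gives every crate (the CARGO_PKG_VERSION variable set at compile time), and the constant name and main function here are illustrative, not the parquet crate's actual API.

// Sketch only: assemble a "created by" string at compile time without a
// build.rs. `env!("CARGO_PKG_VERSION")` expands to the crate version that
// Cargo sets for every build, and `concat!` joins string literals at
// compile time, so no `cargo:rustc-env` plumbing is needed.
const CREATED_BY: &str =
    concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));

fn main() {
    // Prints e.g. "parquet-rs version 0.1.0", depending on the enclosing crate.
    println!("{}", CREATED_BY);
}

Dropping the build script also means one fewer artifact to compile and run before the crate itself builds, which is why the diff can delete parquet/build.rs outright.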
From bcfbd4604c91797393163c5e941638720fa64533 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 2 Dec 2022 15:38:11 -0800 Subject: [PATCH 0373/1411] Skip aws integration test (#3262) --- object_store/src/aws/credential.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index 32430d7f9668..900af24062ca 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -664,6 +664,7 @@ mod tests { async fn test_instance_metadata() { if env::var("TEST_INTEGRATION").is_err() { eprintln!("skipping AWS integration test"); + return; } // For example https://github.com/aws/amazon-ec2-metadata-mock From cb4170b50a54c466897afc83583f01dca23544c0 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Sat, 3 Dec 2022 23:39:25 +0800 Subject: [PATCH 0374/1411] Get the round result for decimal to a decimal with smaller scale (#3224) * support cast decimal for round when the option is false * fix conflict after merge * fix error case * change to wrapping api --- arrow-cast/src/cast.rs | 143 ++++++++++++++++++++++++++++++++--------- 1 file changed, 111 insertions(+), 32 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index be767f137cd8..8d28a6cc772d 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -2164,6 +2164,7 @@ fn cast_decimal_to_decimal( if BYTE_WIDTH1 == 16 { let array = array.as_any().downcast_ref::().unwrap(); if BYTE_WIDTH2 == 16 { + // the div must be greater or equal than 10 let div = 10_i128 .pow_checked((input_scale - output_scale) as u32) .map_err(|_| { @@ -2172,10 +2173,23 @@ fn cast_decimal_to_decimal( *output_scale, )) })?; + let half = div / 2; + let neg_half = -half; array .try_unary::<_, Decimal128Type, _>(|v| { - v.checked_div(div).ok_or_else(|| { + // cast to smaller scale, need to round the result + // the div must be gt_eq 10, we don't need to check the overflow for the `div`/`mod` operation + let d = v.wrapping_div(div); + let r = v.wrapping_rem(div); + if v >= 0 && r >= half { + d.checked_add(1) + } else if v < 0 && r <= neg_half { + d.checked_sub(1) + } else { + Some(d) + } + .ok_or_else(|| { ArrowError::CastError(format!( "Cannot cast to {:?}({}, {}). Overflowing on {:?}", Decimal128Type::PREFIX, @@ -2199,9 +2213,23 @@ fn cast_decimal_to_decimal( )) })?; + let half = div / i256::from_i128(2_i128); + let neg_half = -half; + array .try_unary::<_, Decimal256Type, _>(|v| { - i256::from_i128(v).checked_div(div).ok_or_else(|| { + // the div must be gt_eq 10, we don't need to check the overflow for the `div`/`mod` operation + let v = i256::from_i128(v); + let d = v.wrapping_div(div); + let r = v.wrapping_rem(div); + if v >= i256::ZERO && r >= half { + d.checked_add(i256::ONE) + } else if v < i256::ZERO && r <= neg_half { + d.checked_sub(i256::ONE) + } else { + Some(d) + } + .ok_or_else(|| { ArrowError::CastError(format!( "Cannot cast to {:?}({}, {}). 
Overflowing on {:?}", Decimal256Type::PREFIX, @@ -2226,10 +2254,21 @@ fn cast_decimal_to_decimal( *output_scale, )) })?; + let half = div / i256::from_i128(2_i128); + let neg_half = -half; if BYTE_WIDTH2 == 16 { array .try_unary::<_, Decimal128Type, _>(|v| { - v.checked_div(div).ok_or_else(|| { + // the div must be gt_eq 10, we don't need to check the overflow for the `div`/`mod` operation + let d = v.wrapping_div(div); + let r = v.wrapping_rem(div); + if v >= i256::ZERO && r >= half { + d.checked_add(i256::ONE) + } else if v < i256::ZERO && r <= neg_half { + d.checked_sub(i256::ONE) + } else { + Some(d) + }.ok_or_else(|| { ArrowError::CastError(format!( "Cannot cast to {:?}({}, {}). Overflowing on {:?}", Decimal128Type::PREFIX, @@ -2250,7 +2289,17 @@ fn cast_decimal_to_decimal( } else { array .try_unary::<_, Decimal256Type, _>(|v| { - v.checked_div(div).ok_or_else(|| { + // the div must be gt_eq 10, we don't need to check the overflow for the `div`/`mod` operation + let d = v.wrapping_div(div); + let r = v.wrapping_rem(div); + if v >= i256::ZERO && r >= half { + d.checked_add(i256::ONE) + } else if v < i256::ZERO && r <= neg_half { + d.checked_sub(i256::ONE) + } else { + Some(d) + } + .ok_or_else(|| { ArrowError::CastError(format!( "Cannot cast to {:?}({}, {}). Overflowing on {:?}", Decimal256Type::PREFIX, @@ -3621,6 +3670,26 @@ mod tests { } } } + + let cast_option = CastOptions { safe: false }; + let casted_array_with_option = + cast_with_options($INPUT_ARRAY, $OUTPUT_TYPE, &cast_option).unwrap(); + let result_array = casted_array_with_option + .as_any() + .downcast_ref::<$OUTPUT_TYPE_ARRAY>() + .unwrap(); + assert_eq!($OUTPUT_TYPE, result_array.data_type()); + assert_eq!(result_array.len(), $OUTPUT_VALUES.len()); + for (i, x) in $OUTPUT_VALUES.iter().enumerate() { + match x { + Some(x) => { + assert_eq!(result_array.value(i), *x); + } + None => { + assert!(result_array.is_null(i)); + } + } + } }; } @@ -3647,6 +3716,44 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] + #[should_panic( + expected = "5789604461865809771178549250434395392663499233282028201972879200395656481997 cannot be casted to 128-bit integer for Decimal128" + )] + fn test_cast_decimal_to_decimal_round_with_error() { + // decimal256 to decimal128 overflow + let array = vec![ + Some(i256::from_i128(1123454)), + Some(i256::from_i128(2123456)), + Some(i256::from_i128(-3123453)), + Some(i256::from_i128(-3123456)), + None, + Some(i256::MAX), + Some(i256::MIN), + ]; + let input_decimal_array = create_decimal256_array(array, 76, 4).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + let input_type = DataType::Decimal256(76, 4); + let output_type = DataType::Decimal128(20, 3); + assert!(can_cast_types(&input_type, &output_type)); + generate_cast_test_case!( + &array, + Decimal128Array, + &output_type, + vec![ + Some(112345_i128), + Some(212346_i128), + Some(-312345_i128), + Some(-312346_i128), + None, + None, + None, + ] + ); + } + + #[test] + #[cfg(not(feature = "force_validate"))] fn test_cast_decimal_to_decimal_round() { let array = vec![ Some(1123454), @@ -3734,34 +3841,6 @@ mod tests { None ] ); - - // decimal256 to decimal128 overflow - let array = vec![ - Some(i256::from_i128(1123454)), - Some(i256::from_i128(2123456)), - Some(i256::from_i128(-3123453)), - Some(i256::from_i128(-3123456)), - None, - Some(i256::MAX), - Some(i256::MIN), - ]; - let input_decimal_array = create_decimal256_array(array, 76, 4).unwrap(); - let array = Arc::new(input_decimal_array) as ArrayRef; - 
assert!(can_cast_types(&input_type, &output_type)); - generate_cast_test_case!( - &array, - Decimal128Array, - &output_type, - vec![ - Some(112345_i128), - Some(212346_i128), - Some(-312345_i128), - Some(-312346_i128), - None, - None, - None - ] - ); } #[test] From 796b670338ce33806a39777ea18cf6fae8fa7ee4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 3 Dec 2022 17:56:33 +0000 Subject: [PATCH 0375/1411] Fix panic on nullif empty array (#3261) (#3263) * Fix panic on nullif empty array (#3261) * Format --- arrow-select/src/nullif.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index a0a1a3a2206b..23a586f63652 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -39,6 +39,10 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result Date: Sat, 3 Dec 2022 15:26:23 -0800 Subject: [PATCH 0376/1411] Enable casting between Dictionary of DecimalArray and DecimalArray (#3238) * Enable casting between Dictionary of DecimalArray and DecimalArray * Add tests and fix more issues * Move Dictionary matches to top --- arrow-cast/src/cast.rs | 261 ++++++++++++++++++++------------------ arrow/tests/array_cast.rs | 86 ++++++++++--- 2 files changed, 206 insertions(+), 141 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 8d28a6cc772d..cddbf0d95c41 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -71,24 +71,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } match (from_type, to_type) { - // TODO UTF8 to decimal - // cast one decimal type to another decimal type - (Decimal128(_, _), Decimal128(_, _)) => true, - (Decimal256(_, _), Decimal256(_, _)) => true, - (Decimal128(_, _), Decimal256(_, _)) => true, - (Decimal256(_, _), Decimal128(_, _)) => true, - // unsigned integer to decimal - (UInt8 | UInt16 | UInt32 | UInt64, Decimal128(_, _)) | - // signed numeric to decimal - (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal128(_, _)) | - (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal256(_, _)) | - // decimal to unsigned numeric - (Decimal128(_, _), UInt8 | UInt16 | UInt32 | UInt64) | - (Decimal256(_, _), UInt8 | UInt16 | UInt32 | UInt64) | - // decimal to signed numeric - (Decimal128(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) | - (Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64) - | ( + ( Null, Boolean | Int8 @@ -120,10 +103,12 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Map(_, _) | Dictionary(_, _) ) => true, - (Decimal128(_, _), _) => false, - (_, Decimal128(_, _)) => false, - (Struct(_), _) => false, - (_, Struct(_)) => false, + // Dictionary/List conditions should be put in front of others + (Dictionary(_, from_value_type), Dictionary(_, to_value_type)) => { + can_cast_types(from_value_type, to_value_type) + } + (Dictionary(_, value_type), _) => can_cast_types(value_type, to_type), + (_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type), (LargeList(list_from), LargeList(list_to)) => { can_cast_types(list_from.data_type(), list_to.data_type()) } @@ -140,12 +125,29 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (List(_), _) => false, (_, List(list_to)) => can_cast_types(from_type, list_to.data_type()), (_, LargeList(list_to)) => can_cast_types(from_type, list_to.data_type()), - (Dictionary(_, from_value_type), Dictionary(_, 
to_value_type)) => { - can_cast_types(from_value_type, to_value_type) - } - (Dictionary(_, value_type), _) => can_cast_types(value_type, to_type), - (_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type), - + // TODO UTF8 to decimal + // cast one decimal type to another decimal type + (Decimal128(_, _), Decimal128(_, _)) => true, + (Decimal256(_, _), Decimal256(_, _)) => true, + (Decimal128(_, _), Decimal256(_, _)) => true, + (Decimal256(_, _), Decimal128(_, _)) => true, + // unsigned integer to decimal + (UInt8 | UInt16 | UInt32 | UInt64, Decimal128(_, _)) | + // signed numeric to decimal + (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal128(_, _)) | + (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal256(_, _)) | + // decimal to unsigned numeric + (Decimal128(_, _), UInt8 | UInt16 | UInt32 | UInt64) | + (Decimal256(_, _), UInt8 | UInt16 | UInt32 | UInt64) | + // decimal to signed numeric + (Decimal128(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) | + (Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64) => true, + (Decimal128(_, _), _) => false, + (_, Decimal128(_, _)) => false, + (Decimal256(_, _), _) => false, + (_, Decimal256(_, _)) => false, + (Struct(_), _) => false, + (_, Struct(_)) => false, (_, Boolean) => DataType::is_numeric(from_type) || from_type == &Utf8, (Boolean, _) => DataType::is_numeric(to_type) || to_type == &Utf8, @@ -624,6 +626,103 @@ pub fn cast_with_options( return Ok(array.clone()); } match (from_type, to_type) { + ( + Null, + Boolean + | Int8 + | UInt8 + | Int16 + | UInt16 + | Int32 + | UInt32 + | Float32 + | Date32 + | Time32(_) + | Int64 + | UInt64 + | Float64 + | Date64 + | Timestamp(_, _) + | Time64(_) + | Duration(_) + | Interval(_) + | FixedSizeBinary(_) + | Binary + | Utf8 + | LargeBinary + | LargeUtf8 + | List(_) + | LargeList(_) + | FixedSizeList(_, _) + | Struct(_) + | Map(_, _) + | Dictionary(_, _), + ) => Ok(new_null_array(to_type, array.len())), + (Dictionary(index_type, _), _) => match **index_type { + Int8 => dictionary_cast::(array, to_type, cast_options), + Int16 => dictionary_cast::(array, to_type, cast_options), + Int32 => dictionary_cast::(array, to_type, cast_options), + Int64 => dictionary_cast::(array, to_type, cast_options), + UInt8 => dictionary_cast::(array, to_type, cast_options), + UInt16 => dictionary_cast::(array, to_type, cast_options), + UInt32 => dictionary_cast::(array, to_type, cast_options), + UInt64 => dictionary_cast::(array, to_type, cast_options), + _ => Err(ArrowError::CastError(format!( + "Casting from dictionary type {:?} to {:?} not supported", + from_type, to_type, + ))), + }, + (_, Dictionary(index_type, value_type)) => match **index_type { + Int8 => cast_to_dictionary::(array, value_type, cast_options), + Int16 => cast_to_dictionary::(array, value_type, cast_options), + Int32 => cast_to_dictionary::(array, value_type, cast_options), + Int64 => cast_to_dictionary::(array, value_type, cast_options), + UInt8 => cast_to_dictionary::(array, value_type, cast_options), + UInt16 => cast_to_dictionary::(array, value_type, cast_options), + UInt32 => cast_to_dictionary::(array, value_type, cast_options), + UInt64 => cast_to_dictionary::(array, value_type, cast_options), + _ => Err(ArrowError::CastError(format!( + "Casting from type {:?} to dictionary type {:?} not supported", + from_type, to_type, + ))), + }, + (List(_), List(ref to)) => { + cast_list_inner::(array, to, to_type, cast_options) + } + (LargeList(_), LargeList(ref to)) => { + 
cast_list_inner::(array, to, to_type, cast_options) + } + (List(list_from), LargeList(list_to)) => { + if list_to.data_type() != list_from.data_type() { + Err(ArrowError::CastError( + "cannot cast list to large-list with different child data".into(), + )) + } else { + cast_list_container::(&**array, cast_options) + } + } + (LargeList(list_from), List(list_to)) => { + if list_to.data_type() != list_from.data_type() { + Err(ArrowError::CastError( + "cannot cast large-list to list with different child data".into(), + )) + } else { + cast_list_container::(&**array, cast_options) + } + } + (List(_) | LargeList(_), _) => match to_type { + Utf8 => cast_list_to_string!(array, i32), + LargeUtf8 => cast_list_to_string!(array, i64), + _ => Err(ArrowError::CastError( + "Cannot cast list to non-list data types".to_string(), + )), + }, + (_, List(ref to)) => { + cast_primitive_to_list::(array, to, to_type, cast_options) + } + (_, LargeList(ref to)) => { + cast_primitive_to_list::(array, to, to_type, cast_options) + } (Decimal128(_, s1), Decimal128(p2, s2)) => { cast_decimal_to_decimal_with_option::<16, 16>(array, s1, p2, s2, cast_options) } @@ -887,107 +986,12 @@ pub fn cast_with_options( ))), } } - ( - Null, - Boolean - | Int8 - | UInt8 - | Int16 - | UInt16 - | Int32 - | UInt32 - | Float32 - | Date32 - | Time32(_) - | Int64 - | UInt64 - | Float64 - | Date64 - | Timestamp(_, _) - | Time64(_) - | Duration(_) - | Interval(_) - | FixedSizeBinary(_) - | Binary - | Utf8 - | LargeBinary - | LargeUtf8 - | List(_) - | LargeList(_) - | FixedSizeList(_, _) - | Struct(_) - | Map(_, _) - | Dictionary(_, _), - ) => Ok(new_null_array(to_type, array.len())), (Struct(_), _) => Err(ArrowError::CastError( "Cannot cast from struct to other types".to_string(), )), (_, Struct(_)) => Err(ArrowError::CastError( "Cannot cast to struct from other types".to_string(), )), - (List(_), List(ref to)) => { - cast_list_inner::(array, to, to_type, cast_options) - } - (LargeList(_), LargeList(ref to)) => { - cast_list_inner::(array, to, to_type, cast_options) - } - (List(list_from), LargeList(list_to)) => { - if list_to.data_type() != list_from.data_type() { - Err(ArrowError::CastError( - "cannot cast list to large-list with different child data".into(), - )) - } else { - cast_list_container::(&**array, cast_options) - } - } - (LargeList(list_from), List(list_to)) => { - if list_to.data_type() != list_from.data_type() { - Err(ArrowError::CastError( - "cannot cast large-list to list with different child data".into(), - )) - } else { - cast_list_container::(&**array, cast_options) - } - } - (List(_) | LargeList(_), Utf8) => cast_list_to_string!(array, i32), - (List(_) | LargeList(_), LargeUtf8) => cast_list_to_string!(array, i64), - (List(_), _) => Err(ArrowError::CastError( - "Cannot cast list to non-list data types".to_string(), - )), - (_, List(ref to)) => { - cast_primitive_to_list::(array, to, to_type, cast_options) - } - (_, LargeList(ref to)) => { - cast_primitive_to_list::(array, to, to_type, cast_options) - } - (Dictionary(index_type, _), _) => match **index_type { - Int8 => dictionary_cast::(array, to_type, cast_options), - Int16 => dictionary_cast::(array, to_type, cast_options), - Int32 => dictionary_cast::(array, to_type, cast_options), - Int64 => dictionary_cast::(array, to_type, cast_options), - UInt8 => dictionary_cast::(array, to_type, cast_options), - UInt16 => dictionary_cast::(array, to_type, cast_options), - UInt32 => dictionary_cast::(array, to_type, cast_options), - UInt64 => dictionary_cast::(array, to_type, 
cast_options), - _ => Err(ArrowError::CastError(format!( - "Casting from dictionary type {:?} to {:?} not supported", - from_type, to_type, - ))), - }, - (_, Dictionary(index_type, value_type)) => match **index_type { - Int8 => cast_to_dictionary::(array, value_type, cast_options), - Int16 => cast_to_dictionary::(array, value_type, cast_options), - Int32 => cast_to_dictionary::(array, value_type, cast_options), - Int64 => cast_to_dictionary::(array, value_type, cast_options), - UInt8 => cast_to_dictionary::(array, value_type, cast_options), - UInt16 => cast_to_dictionary::(array, value_type, cast_options), - UInt32 => cast_to_dictionary::(array, value_type, cast_options), - UInt64 => cast_to_dictionary::(array, value_type, cast_options), - _ => Err(ArrowError::CastError(format!( - "Casting from type {:?} to dictionary type {:?} not supported", - from_type, to_type, - ))), - }, (_, Boolean) => match from_type { UInt8 => cast_numeric_to_bool::(array), UInt16 => cast_numeric_to_bool::(array), @@ -3390,7 +3394,18 @@ fn cast_to_dictionary( dict_value_type, cast_options, ), + Decimal128(_, _) => pack_numeric_to_dictionary::( + array, + dict_value_type, + cast_options, + ), + Decimal256(_, _) => pack_numeric_to_dictionary::( + array, + dict_value_type, + cast_options, + ), Utf8 => pack_string_to_dictionary::(array, cast_options), + LargeUtf8 => pack_string_to_dictionary::(array, cast_options), _ => Err(ArrowError::CastError(format!( "Unsupported output type for dictionary packing: {:?}", dict_value_type diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 95fb973289a5..be37a7636b63 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -19,12 +19,13 @@ use arrow_array::builder::{ PrimitiveDictionaryBuilder, StringDictionaryBuilder, UnionBuilder, }; use arrow_array::types::{ - ArrowDictionaryKeyType, Int16Type, Int32Type, Int64Type, Int8Type, - TimestampMicrosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, + ArrowDictionaryKeyType, Decimal128Type, Decimal256Type, Int16Type, Int32Type, + Int64Type, Int8Type, TimestampMicrosecondType, UInt16Type, UInt32Type, UInt64Type, + UInt8Type, }; use arrow_array::{ - Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, - Decimal128Array, DurationMicrosecondArray, DurationMillisecondArray, + Array, ArrayRef, ArrowPrimitiveType, BinaryArray, BooleanArray, Date32Array, + Date64Array, Decimal128Array, DurationMicrosecondArray, DurationMillisecondArray, DurationNanosecondArray, DurationSecondArray, FixedSizeBinaryArray, FixedSizeListArray, Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray, IntervalMonthDayNanoArray, @@ -35,7 +36,7 @@ use arrow_array::{ TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, UnionArray, }; -use arrow_buffer::Buffer; +use arrow_buffer::{i256, Buffer}; use arrow_cast::{can_cast_types, cast}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field, IntervalUnit, TimeUnit, UnionMode}; @@ -101,14 +102,14 @@ fn get_arrays_of_all_types() -> Vec { vec![ Arc::new(BinaryArray::from(binary_data.clone())), Arc::new(LargeBinaryArray::from(binary_data.clone())), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), + make_dictionary_primitive::(vec![1, 2]), 
+ make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), make_dictionary_utf8::(), make_dictionary_utf8::(), make_dictionary_utf8::(), @@ -184,6 +185,46 @@ fn get_arrays_of_all_types() -> Vec { Arc::new( create_decimal_array(vec![Some(1), Some(2), Some(3), None], 38, 0).unwrap(), ), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![ + i256::from_i128(1), + i256::from_i128(2), + ]), + make_dictionary_primitive::(vec![ + i256::from_i128(1), + i256::from_i128(2), + ]), + make_dictionary_primitive::(vec![ + i256::from_i128(1), + i256::from_i128(2), + ]), + make_dictionary_primitive::(vec![ + i256::from_i128(1), + i256::from_i128(2), + ]), + make_dictionary_primitive::(vec![ + i256::from_i128(1), + i256::from_i128(2), + ]), + make_dictionary_primitive::(vec![ + i256::from_i128(1), + i256::from_i128(2), + ]), + make_dictionary_primitive::(vec![ + i256::from_i128(1), + i256::from_i128(2), + ]), + make_dictionary_primitive::(vec![ + i256::from_i128(1), + i256::from_i128(2), + ]), ] } @@ -273,12 +314,15 @@ fn make_union_array() -> UnionArray { } /// Creates a dictionary with primitive dictionary values, and keys of type K -fn make_dictionary_primitive() -> ArrayRef { +/// and values of type V +fn make_dictionary_primitive( + values: Vec, +) -> ArrayRef { // Pick Int32 arbitrarily for dictionary values - let mut b: PrimitiveDictionaryBuilder = - PrimitiveDictionaryBuilder::new(); - b.append(1).unwrap(); - b.append(2).unwrap(); + let mut b: PrimitiveDictionaryBuilder = PrimitiveDictionaryBuilder::new(); + values.iter().for_each(|v| { + b.append(*v).unwrap(); + }); Arc::new(b.finish()) } @@ -369,6 +413,12 @@ fn get_all_types() -> Vec { Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)), Decimal128(38, 0), + Dictionary(Box::new(DataType::Int8), Box::new(Decimal128(38, 0))), + Dictionary(Box::new(DataType::Int16), Box::new(Decimal128(38, 0))), + Dictionary(Box::new(DataType::UInt32), Box::new(Decimal128(38, 0))), + Dictionary(Box::new(DataType::Int8), Box::new(Decimal256(76, 0))), + Dictionary(Box::new(DataType::Int16), Box::new(Decimal256(76, 0))), + Dictionary(Box::new(DataType::UInt32), Box::new(Decimal256(76, 0))), ] } From 9f64476d837e834b2616a8b6cd19bbd860defd39 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 4 Dec 2022 13:58:13 +0000 Subject: [PATCH 0377/1411] Simplify decimal cast logic (#3264) * Simplify decimal cast logic * Add test --- arrow-cast/src/cast.rs | 668 +++++++++++------------------------------ 1 file changed, 169 insertions(+), 499 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index cddbf0d95c41..372f0bd3c420 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -330,7 +330,7 @@ where ::Native: AsPrimitive, M: ArrowNativeTypeOp, { - let mul_or_div: M = base.pow_checked(scale.unsigned_abs() as u32).map_err(|_| { + let scale_factor = 
base.pow_checked(scale.unsigned_abs() as u32).map_err(|_| { ArrowError::CastError(format!( "Cannot cast to {:?}({}, {}). The scale causes overflow.", D::PREFIX, @@ -339,29 +339,19 @@ where )) })?; - if scale < 0 { - if cast_options.safe { - array - .unary_opt::<_, D>(|v| v.as_().div_checked(mul_or_div).ok()) - .with_precision_and_scale(precision, scale) - .map(|a| Arc::new(a) as ArrayRef) - } else { - array - .try_unary::<_, D, _>(|v| v.as_().div_checked(mul_or_div)) - .and_then(|a| a.with_precision_and_scale(precision, scale)) - .map(|a| Arc::new(a) as ArrayRef) + let array = if scale < 0 { + match cast_options.safe { + true => array.unary_opt::<_, D>(|v| v.as_().div_checked(scale_factor).ok()), + false => array.try_unary::<_, D, _>(|v| v.as_().div_checked(scale_factor))?, } - } else if cast_options.safe { - array - .unary_opt::<_, D>(|v| v.as_().mul_checked(mul_or_div).ok()) - .with_precision_and_scale(precision, scale) - .map(|a| Arc::new(a) as ArrayRef) } else { - array - .try_unary::<_, D, _>(|v| v.as_().mul_checked(mul_or_div)) - .and_then(|a| a.with_precision_and_scale(precision, scale)) - .map(|a| Arc::new(a) as ArrayRef) - } + match cast_options.safe { + true => array.unary_opt::<_, D>(|v| v.as_().mul_checked(scale_factor).ok()), + false => array.try_unary::<_, D, _>(|v| v.as_().mul_checked(scale_factor))?, + } + }; + + Ok(Arc::new(array.with_precision_and_scale(precision, scale)?)) } fn cast_floating_point_to_decimal128( @@ -383,22 +373,17 @@ where } else { array .try_unary::<_, Decimal128Type, _>(|v| { - mul.mul_checked(v.as_()).and_then(|value| { - let mul_v = value.round(); - let integer: i128 = mul_v.to_i128().ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {}({}, {}). Overflowing on {:?}", - Decimal128Type::PREFIX, - precision, - scale, - v - )) - })?; - - Ok(integer) + (mul * v.as_()).round().to_i128().ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {}({}, {}). Overflowing on {:?}", + Decimal128Type::PREFIX, + precision, + scale, + v + )) }) - }) - .and_then(|a| a.with_precision_and_scale(precision, scale)) + })? + .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } } @@ -431,8 +416,8 @@ where v )) }) - }) - .and_then(|a| a.with_precision_and_scale(precision, scale)) + })? 
+ .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } } @@ -724,16 +709,40 @@ pub fn cast_with_options( cast_primitive_to_list::(array, to, to_type, cast_options) } (Decimal128(_, s1), Decimal128(p2, s2)) => { - cast_decimal_to_decimal_with_option::<16, 16>(array, s1, p2, s2, cast_options) + cast_decimal_to_decimal::( + as_primitive_array(array), + *s1, + *p2, + *s2, + cast_options, + ) } (Decimal256(_, s1), Decimal256(p2, s2)) => { - cast_decimal_to_decimal_with_option::<32, 32>(array, s1, p2, s2, cast_options) + cast_decimal_to_decimal::( + as_primitive_array(array), + *s1, + *p2, + *s2, + cast_options, + ) } (Decimal128(_, s1), Decimal256(p2, s2)) => { - cast_decimal_to_decimal_with_option::<16, 32>(array, s1, p2, s2, cast_options) + cast_decimal_to_decimal::( + as_primitive_array(array), + *s1, + *p2, + *s2, + cast_options, + ) } (Decimal256(_, s1), Decimal128(p2, s2)) => { - cast_decimal_to_decimal_with_option::<32, 16>(array, s1, p2, s2, cast_options) + cast_decimal_to_decimal::( + as_primitive_array(array), + *s1, + *p2, + *s2, + cast_options, + ) } (Decimal128(_, scale), _) => { // cast decimal to other type @@ -1964,471 +1973,110 @@ const fn time_unit_multiple(unit: &TimeUnit) -> i64 { } } -/// Cast one type of decimal array to another type of decimal array -fn cast_decimal_to_decimal_with_option< - const BYTE_WIDTH1: usize, - const BYTE_WIDTH2: usize, ->( - array: &ArrayRef, - input_scale: &i8, - output_precision: &u8, - output_scale: &i8, - cast_options: &CastOptions, -) -> Result { - if cast_options.safe { - cast_decimal_to_decimal_safe::( - array, - input_scale, - output_precision, - output_scale, - ) - } else { - cast_decimal_to_decimal::( - array, - input_scale, - output_precision, - output_scale, - ) +/// A utility trait that provides checked conversions between +/// decimal types inspired by [`NumCast`] +trait DecimalCast: Sized { + fn to_i128(self) -> Option; + + fn to_i256(self) -> Option; + + fn from_decimal(n: T) -> Option; +} + +impl DecimalCast for i128 { + fn to_i128(self) -> Option { + Some(self) + } + + fn to_i256(self) -> Option { + Some(i256::from_i128(self)) + } + + fn from_decimal(n: T) -> Option { + n.to_i128() } } -/// Cast one type of decimal array to another type of decimal array. Returning NULLs for -/// the array values when cast failures happen. -fn cast_decimal_to_decimal_safe( - array: &ArrayRef, - input_scale: &i8, - output_precision: &u8, - output_scale: &i8, -) -> Result { - if input_scale > output_scale { - // For example, input_scale is 4 and output_scale is 3; - // Original value is 11234_i128, and will be cast to 1123_i128. 
- let div = 10_i128.pow((input_scale - output_scale) as u32); - let half = div / 2; - let neg_half = half.wrapping_neg(); - if BYTE_WIDTH1 == 16 { - let array = array.as_any().downcast_ref::().unwrap(); - if BYTE_WIDTH2 == 16 { - // rounding the result - let iter = array.iter().map(|v| { - v.map(|v| { - // the div must be gt_eq 10, we don't need to check the overflow for the `div`/`mod` operation - let d = v.wrapping_div(div); - let r = v.wrapping_rem(div); - if v >= 0 && r >= half { - d.wrapping_add(1) - } else if v < 0 && r <= neg_half { - d.wrapping_sub(1) - } else { - d - } - }) - }); - let casted_array = unsafe { - PrimitiveArray::::from_trusted_len_iter(iter) - }; - casted_array - .with_precision_and_scale(*output_precision, *output_scale) - .map(|a| Arc::new(a) as ArrayRef) - } else { - let iter = array.iter().map(|v| { - v.map(|v| { - let d = v.wrapping_div(div); - let r = v.wrapping_rem(div); - i256::from_i128(if v >= 0 && r >= half { - d.wrapping_add(1) - } else if v < 0 && r <= neg_half { - d.wrapping_sub(1) - } else { - d - }) - }) - }); - let casted_array = unsafe { - PrimitiveArray::::from_trusted_len_iter(iter) - }; - casted_array - .with_precision_and_scale(*output_precision, *output_scale) - .map(|a| Arc::new(a) as ArrayRef) - } - } else { - let array = array.as_any().downcast_ref::().unwrap(); - let div = i256::from_i128(div); - let half = div / i256::from_i128(2); - let neg_half = half.wrapping_neg(); - if BYTE_WIDTH2 == 16 { - let iter = array.iter().map(|v| { - v.and_then(|v| { - let d = v.wrapping_div(div); - let r = v.wrapping_rem(div); - if v >= i256::ZERO && r >= half { - d.wrapping_add(i256::ONE) - } else if v < i256::ZERO && r <= neg_half { - d.wrapping_sub(i256::ONE) - } else { - d - } - .to_i128() - }) - }); - let casted_array = unsafe { - PrimitiveArray::::from_trusted_len_iter(iter) - }; - casted_array - .with_precision_and_scale(*output_precision, *output_scale) - .map(|a| Arc::new(a) as ArrayRef) - } else { - let iter = array.iter().map(|v| { - v.map(|v| { - let d = v.wrapping_div(div); - let r = v.wrapping_rem(div); - if v >= i256::ZERO && r >= half { - d.wrapping_add(i256::ONE) - } else if v < i256::ZERO && r <= neg_half { - d.wrapping_sub(i256::ONE) - } else { - d - } - }) - }); - let casted_array = unsafe { - PrimitiveArray::::from_trusted_len_iter(iter) - }; - casted_array - .with_precision_and_scale(*output_precision, *output_scale) - .map(|a| Arc::new(a) as ArrayRef) - } - } - } else { - // For example, input_scale is 3 and output_scale is 4; - // Original value is 1123_i128, and will be cast to 11230_i128. 
- let mul = 10_i128.pow((output_scale - input_scale) as u32); - if BYTE_WIDTH1 == 16 { - let array = array.as_any().downcast_ref::().unwrap(); - if BYTE_WIDTH2 == 16 { - let iter = array - .iter() - .map(|v| v.and_then(|v| v.mul_checked(mul).ok())); - let casted_array = unsafe { - PrimitiveArray::::from_trusted_len_iter(iter) - }; - casted_array - .with_precision_and_scale(*output_precision, *output_scale) - .map(|a| Arc::new(a) as ArrayRef) - } else { - let iter = array.iter().map(|v| { - v.and_then(|v| v.mul_checked(mul).ok().map(i256::from_i128)) - }); - let casted_array = unsafe { - PrimitiveArray::::from_trusted_len_iter(iter) - }; - casted_array - .with_precision_and_scale(*output_precision, *output_scale) - .map(|a| Arc::new(a) as ArrayRef) - } - } else { - let array = array.as_any().downcast_ref::().unwrap(); - let mul = i256::from_i128(mul); - if BYTE_WIDTH2 == 16 { - let iter = array.iter().map(|v| { - v.and_then(|v| v.mul_checked(mul).ok().and_then(|v| v.to_i128())) - }); - let casted_array = unsafe { - PrimitiveArray::::from_trusted_len_iter(iter) - }; - casted_array - .with_precision_and_scale(*output_precision, *output_scale) - .map(|a| Arc::new(a) as ArrayRef) - } else { - let iter = array - .iter() - .map(|v| v.and_then(|v| v.mul_checked(mul).ok())); - let casted_array = unsafe { - PrimitiveArray::::from_trusted_len_iter(iter) - }; - casted_array - .with_precision_and_scale(*output_precision, *output_scale) - .map(|a| Arc::new(a) as ArrayRef) - } - } +impl DecimalCast for i256 { + fn to_i128(self) -> Option { + self.to_i128() + } + + fn to_i256(self) -> Option { + Some(self) + } + + fn from_decimal(n: T) -> Option { + n.to_i256() } } -/// Cast one type of decimal array to another type of decimal array. Returning `Err` if -/// cast failure happens. -fn cast_decimal_to_decimal( - array: &ArrayRef, - input_scale: &i8, - output_precision: &u8, - output_scale: &i8, -) -> Result { - if input_scale > output_scale { - // For example, input_scale is 4 and output_scale is 3; - // Original value is 11234_i128, and will be cast to 1123_i128. - if BYTE_WIDTH1 == 16 { - let array = array.as_any().downcast_ref::().unwrap(); - if BYTE_WIDTH2 == 16 { - // the div must be greater or equal than 10 - let div = 10_i128 - .pow_checked((input_scale - output_scale) as u32) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast. The scale {} causes overflow.", - *output_scale, - )) - })?; - let half = div / 2; - let neg_half = -half; +fn cast_decimal_to_decimal( + array: &PrimitiveArray, + input_scale: i8, + output_precision: u8, + output_scale: i8, + cast_options: &CastOptions, +) -> Result +where + I: DecimalType, + O: DecimalType, + I::Native: DecimalCast + ArrowNativeTypeOp, + O::Native: DecimalCast + ArrowNativeTypeOp, +{ + let error = |x| { + ArrowError::CastError(format!( + "Cannot cast to {}({}, {}). Overflowing on {:?}", + O::PREFIX, + output_precision, + output_scale, + x + )) + }; - array - .try_unary::<_, Decimal128Type, _>(|v| { - // cast to smaller scale, need to round the result - // the div must be gt_eq 10, we don't need to check the overflow for the `div`/`mod` operation - let d = v.wrapping_div(div); - let r = v.wrapping_rem(div); - if v >= 0 && r >= half { - d.checked_add(1) - } else if v < 0 && r <= neg_half { - d.checked_sub(1) - } else { - Some(d) - } - .ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {:?}({}, {}). 
Overflowing on {:?}", - Decimal128Type::PREFIX, - *output_precision, - *output_scale, - v - )) - }) - }) - .and_then(|a| { - a.with_precision_and_scale(*output_precision, *output_scale) - }) - .map(|a| Arc::new(a) as ArrayRef) - } else { - let div = i256::from_i128(10_i128) - .pow_checked((input_scale - output_scale) as u32) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast. The scale {} causes overflow.", - *output_scale, - )) - })?; + let array: PrimitiveArray = if input_scale > output_scale { + let div = I::Native::from_decimal(10_i128) + .unwrap() + .pow_checked((input_scale - output_scale) as u32)?; - let half = div / i256::from_i128(2_i128); - let neg_half = -half; + let half = div.div_wrapping(I::Native::from_usize(2).unwrap()); + let half_neg = half.neg_wrapping(); - array - .try_unary::<_, Decimal256Type, _>(|v| { - // the div must be gt_eq 10, we don't need to check the overflow for the `div`/`mod` operation - let v = i256::from_i128(v); - let d = v.wrapping_div(div); - let r = v.wrapping_rem(div); - if v >= i256::ZERO && r >= half { - d.checked_add(i256::ONE) - } else if v < i256::ZERO && r <= neg_half { - d.checked_sub(i256::ONE) - } else { - Some(d) - } - .ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {:?}({}, {}). Overflowing on {:?}", - Decimal256Type::PREFIX, - *output_precision, - *output_scale, - v - )) - }) - }) - .and_then(|a| { - a.with_precision_and_scale(*output_precision, *output_scale) - }) - .map(|a| Arc::new(a) as ArrayRef) - } - } else { - let array = array.as_any().downcast_ref::().unwrap(); - let div = i256::from_i128(10_i128) - .pow_checked((input_scale - output_scale) as u32) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast. The scale {} causes overflow.", - *output_scale, - )) - })?; - let half = div / i256::from_i128(2_i128); - let neg_half = -half; - if BYTE_WIDTH2 == 16 { - array - .try_unary::<_, Decimal128Type, _>(|v| { - // the div must be gt_eq 10, we don't need to check the overflow for the `div`/`mod` operation - let d = v.wrapping_div(div); - let r = v.wrapping_rem(div); - if v >= i256::ZERO && r >= half { - d.checked_add(i256::ONE) - } else if v < i256::ZERO && r <= neg_half { - d.checked_sub(i256::ONE) - } else { - Some(d) - }.ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {:?}({}, {}). Overflowing on {:?}", - Decimal128Type::PREFIX, - *output_precision, - *output_scale, - v - )) - }).and_then(|v| v.to_i128().ok_or_else(|| { - ArrowError::InvalidArgumentError( - format!("{:?} cannot be casted to 128-bit integer for Decimal128", v), - ) - })) - }) - .and_then(|a| { - a.with_precision_and_scale(*output_precision, *output_scale) - }) - .map(|a| Arc::new(a) as ArrayRef) - } else { - array - .try_unary::<_, Decimal256Type, _>(|v| { - // the div must be gt_eq 10, we don't need to check the overflow for the `div`/`mod` operation - let d = v.wrapping_div(div); - let r = v.wrapping_rem(div); - if v >= i256::ZERO && r >= half { - d.checked_add(i256::ONE) - } else if v < i256::ZERO && r <= neg_half { - d.checked_sub(i256::ONE) - } else { - Some(d) - } - .ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {:?}({}, {}). 
Overflowing on {:?}", - Decimal256Type::PREFIX, - *output_precision, - *output_scale, - v - )) - }) - }) - .and_then(|a| { - a.with_precision_and_scale(*output_precision, *output_scale) - }) - .map(|a| Arc::new(a) as ArrayRef) - } + let f = |x: I::Native| { + // div is >= 10 and so this cannot overflow + let div = x.div_wrapping(div); + let rem = x.mod_wrapping(div); + + // Round result + let adjusted = match x >= I::Native::ZERO { + true if rem >= half => div.add_wrapping(I::Native::ONE), + false if rem <= half_neg => div.sub_wrapping(I::Native::ONE), + _ => div, + }; + O::Native::from_decimal(adjusted) + }; + + match cast_options.safe { + true => array.unary_opt(f), + false => array.try_unary(|x| f(x).ok_or_else(|| error(x)))?, } } else { - // For example, input_scale is 3 and output_scale is 4; - // Original value is 1123_i128, and will be cast to 11230_i128. - if BYTE_WIDTH1 == 16 { - let array = array.as_any().downcast_ref::().unwrap(); - - if BYTE_WIDTH2 == 16 { - let mul = 10_i128 - .pow_checked((output_scale - input_scale) as u32) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast. The scale {} causes overflow.", - *output_scale, - )) - })?; + let mul = O::Native::from_decimal(10_i128) + .unwrap() + .pow_checked((output_scale - input_scale) as u32)?; - array - .try_unary::<_, Decimal128Type, _>(|v| { - v.checked_mul(mul).ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {:?}({}, {}). Overflowing on {:?}", - Decimal128Type::PREFIX, - *output_precision, - *output_scale, - v - )) - }) - }) - .and_then(|a| { - a.with_precision_and_scale(*output_precision, *output_scale) - }) - .map(|a| Arc::new(a) as ArrayRef) - } else { - let mul = i256::from_i128(10_i128) - .pow_checked((output_scale - input_scale) as u32) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast. The scale {} causes overflow.", - *output_scale, - )) - })?; + let f = |x| O::Native::from_decimal(x).and_then(|x| x.mul_checked(mul).ok()); - array - .try_unary::<_, Decimal256Type, _>(|v| { - i256::from_i128(v).checked_mul(mul).ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {:?}({}, {}). Overflowing on {:?}", - Decimal256Type::PREFIX, - *output_precision, - *output_scale, - v - )) - }) - }) - .and_then(|a| { - a.with_precision_and_scale(*output_precision, *output_scale) - }) - .map(|a| Arc::new(a) as ArrayRef) - } - } else { - let array = array.as_any().downcast_ref::().unwrap(); - let mul = i256::from_i128(10_i128) - .pow_checked((output_scale - input_scale) as u32) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast. The scale {} causes overflow.", - *output_scale, - )) - })?; - if BYTE_WIDTH2 == 16 { - array - .try_unary::<_, Decimal128Type, _>(|v| { - v.checked_mul(mul).ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {:?}({}, {}). Overflowing on {:?}", - Decimal128Type::PREFIX, - *output_precision, - *output_scale, - v - )) - }).and_then(|v| v.to_i128().ok_or_else(|| { - ArrowError::InvalidArgumentError( - format!("{:?} cannot be casted to 128-bit integer for Decimal128", v), - ) - })) - }) - .and_then(|a| { - a.with_precision_and_scale(*output_precision, *output_scale) - }) - .map(|a| Arc::new(a) as ArrayRef) - } else { - array - .try_unary::<_, Decimal256Type, _>(|v| { - v.checked_mul(mul).ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {:?}({}, {}). 
Overflowing on {:?}", - Decimal256Type::PREFIX, - *output_precision, - *output_scale, - v - )) - }) - }) - .and_then(|a| { - a.with_precision_and_scale(*output_precision, *output_scale) - }) - .map(|a| Arc::new(a) as ArrayRef) - } + match cast_options.safe { + true => array.unary_opt(f), + false => array.try_unary(|x| f(x).ok_or_else(|| error(x)))?, } - } + }; + + Ok(Arc::new(array.with_precision_and_scale( + output_precision, + output_scale, + )?)) } /// Convert Array into a PrimitiveArray of type, and apply numeric cast @@ -3678,6 +3326,7 @@ mod tests { for (i, x) in $OUTPUT_VALUES.iter().enumerate() { match x { Some(x) => { + assert!(!result_array.is_null(i)); assert_eq!(result_array.value(i), *x); } None => { @@ -3733,7 +3382,7 @@ mod tests { #[test] #[cfg(not(feature = "force_validate"))] #[should_panic( - expected = "5789604461865809771178549250434395392663499233282028201972879200395656481997 cannot be casted to 128-bit integer for Decimal128" + expected = "Cannot cast to Decimal128(20, 3). Overflowing on 57896044618658097711785492504343953926634992332820282019728792003956564819967" )] fn test_cast_decimal_to_decimal_round_with_error() { // decimal256 to decimal128 overflow @@ -3901,7 +3550,7 @@ mod tests { let array = Arc::new(input_decimal_array) as ArrayRef; let result = cast_with_options(&array, &output_type, &CastOptions { safe: false }); - assert_eq!("Cast error: Cannot cast to \"Decimal128\"(38, 38). Overflowing on 170141183460469231731687303715884105727", + assert_eq!("Cast error: Cannot cast to Decimal128(38, 38). Overflowing on 170141183460469231731687303715884105727", result.unwrap_err().to_string()); } @@ -3916,7 +3565,7 @@ mod tests { let array = Arc::new(input_decimal_array) as ArrayRef; let result = cast_with_options(&array, &output_type, &CastOptions { safe: false }); - assert_eq!("Cast error: Cannot cast to \"Decimal256\"(76, 76). Overflowing on 170141183460469231731687303715884105727", + assert_eq!("Cast error: Cannot cast to Decimal256(76, 76). Overflowing on 170141183460469231731687303715884105727", result.unwrap_err().to_string()); } @@ -3951,7 +3600,7 @@ mod tests { let array = Arc::new(input_decimal_array) as ArrayRef; let result = cast_with_options(&array, &output_type, &CastOptions { safe: false }); - assert_eq!("Invalid argument error: 17014118346046923173168730371588410572700 cannot be casted to 128-bit integer for Decimal128", + assert_eq!("Cast error: Cannot cast to Decimal128(38, 7). Overflowing on 170141183460469231731687303715884105727", result.unwrap_err().to_string()); } @@ -3965,7 +3614,7 @@ mod tests { let array = Arc::new(input_decimal_array) as ArrayRef; let result = cast_with_options(&array, &output_type, &CastOptions { safe: false }); - assert_eq!("Cast error: Cannot cast to \"Decimal256\"(76, 55). Overflowing on 170141183460469231731687303715884105727", + assert_eq!("Cast error: Cannot cast to Decimal256(76, 55). 
Overflowing on 170141183460469231731687303715884105727", result.unwrap_err().to_string()); } @@ -7331,4 +6980,25 @@ mod tests { assert_eq!("1300", decimal_arr.value_as_string(0)); } + + #[test] + fn test_cast_decimal128_to_decimal256_negative() { + let input_type = DataType::Decimal128(10, 3); + let output_type = DataType::Decimal256(10, 5); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(i128::MAX), Some(i128::MIN)]; + let input_decimal_array = create_decimal_array(array, 10, 3).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + + let hundred = i256::from_i128(100); + generate_cast_test_case!( + &array, + Decimal256Array, + &output_type, + vec![ + Some(i256::from_i128(i128::MAX).mul_wrapping(hundred)), + Some(i256::from_i128(i128::MIN).mul_wrapping(hundred)) + ] + ); + } } From 1640fd1bf85bc6a51afc0b00ff037f37191da83d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 4 Dec 2022 22:33:58 +0000 Subject: [PATCH 0378/1411] Fix decimal cast typo (#3270) --- arrow-cast/src/cast.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 372f0bd3c420..6d43c996c88a 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -2044,14 +2044,14 @@ where let f = |x: I::Native| { // div is >= 10 and so this cannot overflow - let div = x.div_wrapping(div); - let rem = x.mod_wrapping(div); + let d = x.div_wrapping(div); + let r = x.mod_wrapping(div); // Round result let adjusted = match x >= I::Native::ZERO { - true if rem >= half => div.add_wrapping(I::Native::ONE), - false if rem <= half_neg => div.sub_wrapping(I::Native::ONE), - _ => div, + true if r >= half => d.add_wrapping(I::Native::ONE), + false if r <= half_neg => d.sub_wrapping(I::Native::ONE), + _ => d, }; O::Native::from_decimal(adjusted) }; From 06e1111c21fb56f61b405aafa967f2c6fd321a18 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 5 Dec 2022 00:43:36 -0800 Subject: [PATCH 0379/1411] Support casting from decimal256 to float (#3267) --- arrow-cast/src/cast.rs | 154 ++++++++++++++++++++++++++++++++++------- 1 file changed, 128 insertions(+), 26 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 6d43c996c88a..272a422eb114 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -141,7 +141,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Decimal256(_, _), UInt8 | UInt16 | UInt32 | UInt64) | // decimal to signed numeric (Decimal128(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) | - (Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64) => true, + (Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) => true, (Decimal128(_, _), _) => false, (_, Decimal128(_, _)) => false, (Decimal256(_, _), _) => false, @@ -496,23 +496,16 @@ where } // cast the decimal array to floating-point array -macro_rules! cast_decimal_to_float { - ($ARRAY:expr, $SCALE : ident, $VALUE_BUILDER: ident, $NATIVE_TYPE : ty) => {{ - let array = $ARRAY.as_any().downcast_ref::().unwrap(); - let div = 10_f64.powi(*$SCALE as i32); - let mut value_builder = $VALUE_BUILDER::with_capacity(array.len()); - for i in 0..array.len() { - if array.is_null(i) { - value_builder.append_null(); - } else { - // The range of f32 or f64 is larger than i128, we don't need to check overflow. 
- // cast the i128 to f64 will lose precision, for example the `112345678901234568` will be as `112345678901234560`. - let v = (array.value(i) as f64 / div) as $NATIVE_TYPE; - value_builder.append_value(v); - } - } - Ok(Arc::new(value_builder.finish())) - }}; +fn cast_decimal_to_float( + array: &ArrayRef, + op: F, +) -> Result +where + F: Fn(D::Native) -> T::Native, +{ + let array = array.as_any().downcast_ref::>().unwrap(); + let array = array.unary::<_, T>(op); + Ok(Arc::new(array)) } // cast the List array to Utf8 array @@ -796,10 +789,14 @@ pub fn cast_with_options( cast_options, ), Float32 => { - cast_decimal_to_float!(array, scale, Float32Builder, f32) + cast_decimal_to_float::(array, |x| { + (x as f64 / 10_f64.powi(*scale as i32)) as f32 + }) } Float64 => { - cast_decimal_to_float!(array, scale, Float64Builder, f64) + cast_decimal_to_float::(array, |x| { + (x as f64 / 10_f64.powi(*scale as i32)) as f64 + }) } Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( @@ -859,6 +856,16 @@ pub fn cast_with_options( *scale, cast_options, ), + Float32 => { + cast_decimal_to_float::(array, |x| { + (x.to_f64().unwrap() / 10_f64.powi(*scale as i32)) as f32 + }) + } + Float64 => { + cast_decimal_to_float::(array, |x| { + (x.to_f64().unwrap() / 10_f64.powi(*scale as i32)) as f64 + }) + } Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( "Casting from {:?} to {:?} not supported", @@ -3735,16 +3742,28 @@ mod tests { // f32 generate_cast_test_case!( &array, - Int64Array, - &DataType::Int64, - vec![Some(1_i64), Some(2_i64), Some(3_i64), None, Some(5_i64)] + Float32Array, + &DataType::Float32, + vec![ + Some(1.25_f32), + Some(2.25_f32), + Some(3.25_f32), + None, + Some(5.25_f32) + ] ); // f64 generate_cast_test_case!( &array, - Int64Array, - &DataType::Int64, - vec![Some(1_i64), Some(2_i64), Some(3_i64), None, Some(5_i64)] + Float64Array, + &DataType::Float64, + vec![ + Some(1.25_f64), + Some(2.25_f64), + Some(3.25_f64), + None, + Some(5.25_f64) + ] ); // overflow test: out of range of max u8 @@ -3904,6 +3923,32 @@ mod tests { &DataType::Int64, vec![Some(1_i64), Some(2_i64), Some(3_i64), None, Some(5_i64)] ); + // f32 + generate_cast_test_case!( + &array, + Float32Array, + &DataType::Float32, + vec![ + Some(1.25_f32), + Some(2.25_f32), + Some(3.25_f32), + None, + Some(5.25_f32) + ] + ); + // f64 + generate_cast_test_case!( + &array, + Float64Array, + &DataType::Float64, + vec![ + Some(1.25_f64), + Some(2.25_f64), + Some(3.25_f64), + None, + Some(5.25_f64) + ] + ); // overflow test: out of range of max i8 let value_array: Vec> = vec![Some(i256::from_i128(24400))]; @@ -3920,6 +3965,63 @@ mod tests { cast_with_options(&array, &DataType::Int8, &CastOptions { safe: true }); assert!(casted_array.is_ok()); assert!(casted_array.unwrap().is_null(0)); + + // loss the precision: convert decimal to f32、f64 + // f32 + // 112345678_f32 and 112345679_f32 are same, so the 112345679_f32 will lose precision. 
+ let value_array: Vec> = vec![ + Some(i256::from_i128(125)), + Some(i256::from_i128(225)), + Some(i256::from_i128(325)), + None, + Some(i256::from_i128(525)), + Some(i256::from_i128(112345678)), + Some(i256::from_i128(112345679)), + ]; + let decimal_array = create_decimal256_array(value_array, 76, 2).unwrap(); + let array = Arc::new(decimal_array) as ArrayRef; + generate_cast_test_case!( + &array, + Float32Array, + &DataType::Float32, + vec![ + Some(1.25_f32), + Some(2.25_f32), + Some(3.25_f32), + None, + Some(5.25_f32), + Some(1_123_456.7_f32), + Some(1_123_456.7_f32) + ] + ); + + // f64 + // 112345678901234568_f64 and 112345678901234560_f64 are same, so the 112345678901234568_f64 will lose precision. + let value_array: Vec> = vec![ + Some(i256::from_i128(125)), + Some(i256::from_i128(225)), + Some(i256::from_i128(325)), + None, + Some(i256::from_i128(525)), + Some(i256::from_i128(112345678901234568)), + Some(i256::from_i128(112345678901234560)), + ]; + let decimal_array = create_decimal256_array(value_array, 76, 2).unwrap(); + let array = Arc::new(decimal_array) as ArrayRef; + generate_cast_test_case!( + &array, + Float64Array, + &DataType::Float64, + vec![ + Some(1.25_f64), + Some(2.25_f64), + Some(3.25_f64), + None, + Some(5.25_f64), + Some(1_123_456_789_012_345.6_f64), + Some(1_123_456_789_012_345.6_f64), + ] + ); } #[test] From b155461f770eb2ab8cc5d3296f6123582cf5073d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 5 Dec 2022 10:34:02 +0000 Subject: [PATCH 0380/1411] Loosen nullability restrictions added in #3205 (#3226) (#3244) * Loosen nullability restrictions added in #3205 (#3226) * Fix tests * More test fixes * Yet more incorrect tests * Review feedback --- arrow-array/src/array/binary_array.rs | 2 +- arrow-array/src/array/mod.rs | 4 +- arrow-array/src/array/string_array.rs | 5 +- arrow-array/src/array/struct_array.rs | 30 ++++---- arrow-cast/src/cast.rs | 3 +- arrow-data/src/data.rs | 103 +++++++++++++++++++++++++- arrow-select/src/take.rs | 4 +- arrow/src/compute/kernels/limit.rs | 4 +- arrow/src/row/mod.rs | 35 +-------- 9 files changed, 133 insertions(+), 57 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 0b526ecb3dee..3a30d748ee3a 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -531,7 +531,7 @@ mod tests { let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap()); let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( - Field::new("item", DataType::UInt8, false), + Field::new("item", DataType::UInt8, true), )); // [None, Some(b"Parquet")] diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 0f9a2ce59291..1e17e35d0f6d 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -915,8 +915,10 @@ mod tests { #[test] fn test_null_struct() { + // It is possible to create a null struct containing a non-nullable child + // see https://github.com/apache/arrow-rs/pull/3244 for details let struct_type = - DataType::Struct(vec![Field::new("data", DataType::Int64, true)]); + DataType::Struct(vec![Field::new("data", DataType::Int64, false)]); let array = new_null_array(&struct_type, 9); let a = array.as_any().downcast_ref::().unwrap(); diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index fb3bb23179b5..c8db589e3c28 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ 
-608,8 +608,11 @@ mod tests { .unwrap(); let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap()); + + // It is possible to create a null struct containing a non-nullable child + // see https://github.com/apache/arrow-rs/pull/3244 for details let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( - Field::new("item", DataType::UInt8, false), + Field::new("item", DataType::UInt8, true), )); // [None, Some(b"Parquet")] diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 7d88cc5c6deb..bf6489c1380c 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -227,13 +227,6 @@ impl From> for StructArray { field_value.data().data_type(), "the field data types must match the array data in a StructArray" ); - // Check nullability of child arrays - if !field_type.is_nullable() { - assert!( - field_value.null_count() == 0, - "non-nullable field cannot have null values" - ); - } }, ); @@ -241,6 +234,10 @@ impl From> for StructArray { .child_data(field_values.into_iter().map(|a| a.into_data()).collect()) .len(length); let array_data = unsafe { array_data.build_unchecked() }; + + // We must validate nullability + array_data.validate_nulls().unwrap(); + Self::from(array_data) } } @@ -283,13 +280,6 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray { field_value.data().data_type(), "the field data types must match the array data in a StructArray" ); - // Check nullability of child arrays - if !field_type.is_nullable() { - assert!( - field_value.null_count() == 0, - "non-nullable field cannot have null values" - ); - } }, ); @@ -298,6 +288,10 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray { .child_data(field_values.into_iter().map(|a| a.into_data()).collect()) .len(length); let array_data = unsafe { array_data.build_unchecked() }; + + // We must validate nullability + array_data.validate_nulls().unwrap(); + Self::from(array_data) } } @@ -470,8 +464,8 @@ mod tests { .unwrap(); let field_types = vec![ - Field::new("a", DataType::Boolean, false), - Field::new("b", DataType::Int32, false), + Field::new("a", DataType::Boolean, true), + Field::new("b", DataType::Int32, true), ]; let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) .len(5) @@ -568,7 +562,9 @@ mod tests { } #[test] - #[should_panic(expected = "non-nullable field cannot have null values")] + #[should_panic( + expected = "non-nullable child of type Int32 contains nulls not present in parent Struct" + )] fn test_struct_array_from_mismatched_nullability() { drop(StructArray::from(vec![( Field::new("c", DataType::Int32, false), diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 272a422eb114..7bb3aeb9603f 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -6594,7 +6594,8 @@ mod tests { cast_from_null_to_other(&data_type); // Cast null from and to struct - let data_type = DataType::Struct(vec![Field::new("data", DataType::Int64, true)]); + let data_type = + DataType::Struct(vec![Field::new("data", DataType::Int64, false)]); cast_from_null_to_other(&data_type); } diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index b230dfdb7564..b38321aacf4c 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -19,6 +19,7 @@ //! common attributes and operations for Arrow array. 
use crate::{bit_iterator::BitSliceIterator, bitmap::Bitmap}; +use arrow_buffer::bit_chunk_iterator::BitChunks; use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode}; use half::f16; @@ -975,6 +976,7 @@ impl ArrayData { /// see [`Self::validate_full`] pub fn validate_data(&self) -> Result<(), ArrowError> { self.validate()?; + self.validate_nulls()?; self.validate_values()?; Ok(()) @@ -1001,7 +1003,13 @@ impl ArrayData { Ok(()) } - /// Validates the the null count is correct + /// Validates the values stored within this [`ArrayData`] are valid + /// without recursing into child [`ArrayData`] + /// + /// Does not (yet) check + /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) + /// Validates the the null count is correct and that any + /// nullability requirements of its children are correct pub fn validate_nulls(&self) -> Result<(), ArrowError> { let nulls = self.null_buffer(); @@ -1012,9 +1020,102 @@ impl ArrayData { self.null_count, actual_null_count ))); } + + // In general non-nullable children should not contain nulls, however, for certain + // types, such as StructArray and FixedSizeList, nulls in the parent take up + // space in the child. As such we permit nulls in the children in the corresponding + // positions for such types + match &self.data_type { + DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => { + if !f.is_nullable() { + self.validate_non_nullable(None, 0, &self.child_data[0])? + } + } + DataType::FixedSizeList(field, len) => { + let child = &self.child_data[0]; + if !field.is_nullable() { + match nulls { + Some(nulls) => { + let element_len = *len as usize; + let mut buffer = + MutableBuffer::new_null(element_len * self.len); + + // Expand each bit within `null_mask` into `element_len` + // bits, constructing the implicit mask of the child elements + for i in 0..self.len { + if !bit_util::get_bit(nulls.as_ref(), self.offset + i) { + continue; + } + for j in 0..element_len { + bit_util::set_bit( + buffer.as_mut(), + i * element_len + j, + ) + } + } + let mask = buffer.into(); + self.validate_non_nullable(Some(&mask), 0, child)?; + } + None => self.validate_non_nullable(None, 0, child)?, + } + } + } + DataType::Struct(fields) => { + for (field, child) in fields.iter().zip(&self.child_data) { + if !field.is_nullable() { + self.validate_non_nullable(nulls, self.offset, child)? 
+ } + } + } + _ => {} + } + Ok(()) } + /// Verifies that `child` contains no nulls not present in `mask` + fn validate_non_nullable( + &self, + mask: Option<&Buffer>, + offset: usize, + data: &ArrayData, + ) -> Result<(), ArrowError> { + let mask = match mask { + Some(mask) => mask.as_ref(), + None => return match data.null_count { + 0 => Ok(()), + _ => Err(ArrowError::InvalidArgumentError(format!( + "non-nullable child of type {} contains nulls not present in parent {}", + data.data_type(), + self.data_type + ))), + }, + }; + + match data.null_buffer() { + Some(nulls) => { + let mask = BitChunks::new(mask, offset, data.len); + let nulls = BitChunks::new(nulls.as_ref(), data.offset, data.len); + mask + .iter() + .zip(nulls.iter()) + .chain(std::iter::once(( + mask.remainder_bits(), + nulls.remainder_bits(), + ))).try_for_each(|(m, c)| { + if (m & !c) != 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "non-nullable child of type {} contains nulls not present in parent", + data.data_type() + ))) + } + Ok(()) + }) + } + None => Ok(()), + } + } + /// Validates the values stored within this [`ArrayData`] are valid /// without recursing into child [`ArrayData`] /// diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 857b6e3231ba..0b1d44319493 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -1603,7 +1603,7 @@ mod tests { let list_data_type = DataType::$list_data_type(Box::new(Field::new( "item", DataType::Int32, - false, + true, ))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) @@ -1676,7 +1676,7 @@ mod tests { let list_data_type = DataType::$list_data_type(Box::new(Field::new( "item", DataType::Int32, - false, + true, ))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) diff --git a/arrow/src/compute/kernels/limit.rs b/arrow/src/compute/kernels/limit.rs index 1f6c6aec5e1f..0d92e98cf718 100644 --- a/arrow/src/compute/kernels/limit.rs +++ b/arrow/src/compute/kernels/limit.rs @@ -158,8 +158,8 @@ mod tests { .unwrap(); let field_types = vec![ - Field::new("a", DataType::Boolean, false), - Field::new("b", DataType::Int32, false), + Field::new("a", DataType::Boolean, true), + Field::new("b", DataType::Int32, true), ]; let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) .len(5) diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index abb8039cc398..ea3def6ac831 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -1225,36 +1225,11 @@ unsafe fn decode_column( } } Codec::Struct(converter, _) => { - let child_fields = match &field.data_type { - DataType::Struct(f) => f, - _ => unreachable!(), - }; - let (null_count, nulls) = fixed::decode_nulls(rows); rows.iter_mut().for_each(|row| *row = &row[1..]); let children = converter.convert_raw(rows, validate_utf8)?; - let child_data = child_fields - .iter() - .zip(&children) - .map(|(f, c)| { - let data = c.data().clone(); - match f.is_nullable() { - true => data, - false => { - assert_eq!(data.null_count(), null_count); - // Need to strip out null buffer if any as this is created - // as an artifact of the row encoding process that encodes - // nulls from the parent struct array in the children - data.into_builder() - .null_count(0) - .null_bit_buffer(None) - .build_unchecked() - } - } - }) - .collect(); - + let child_data = children.iter().map(|c| c.data().clone()).collect(); let builder = ArrayDataBuilder::new(field.data_type.clone()) .len(rows.len()) .null_count(null_count) @@ -1712,11 +1687,8 @@ mod tests { let back = 
converter.convert_rows(&r2).unwrap(); assert_eq!(back.len(), 1); assert_eq!(&back[0], &s2); - let back_s = as_struct_array(&back[0]); - for c in back_s.columns() { - // Children should not contain nulls - assert_eq!(c.null_count(), 0); - } + + back[0].data().validate_full().unwrap(); } #[test] @@ -2198,6 +2170,7 @@ mod tests { let back = converter.convert_rows(&rows).unwrap(); for (actual, expected) in back.iter().zip(&arrays) { + actual.data().validate_full().unwrap(); assert_eq!(actual, expected) } } From 94d597edab1cb6dd0c84b94ab223ba89901ccfb6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 5 Dec 2022 22:04:23 +0000 Subject: [PATCH 0381/1411] Add parquet-layout binary (#3269) * Add parquet-layout binary * Docs * Clippy * Add RowGroup row count --- parquet/Cargo.toml | 7 +- parquet/src/bin/parquet-layout.rs | 234 ++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+), 1 deletion(-) create mode 100644 parquet/src/bin/parquet-layout.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index eecfd55b9098..1ea28b7de366 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -51,6 +51,7 @@ num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } base64 = { version = "0.13", default-features = false, features = ["std"], optional = true } clap = { version = "4", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } +serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } seq-macro = { version = "0.3", default-features = false } futures = { version = "0.3", default-features = false, features = ["std"], optional = true } @@ -81,7 +82,7 @@ default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"] # Enable arrow reader/writer APIs arrow = ["base64", "arrow-array", "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", "arrow-select", "arrow-ipc"] # Enable CLI tools -cli = ["json", "base64", "clap", "arrow-csv"] +cli = ["json", "base64", "clap", "arrow-csv", "serde"] # Enable JSON APIs json = ["serde_json", "base64"] # Enable internal testing APIs @@ -125,6 +126,10 @@ required-features = ["arrow", "cli"] name = "parquet-show-bloom-filter" required-features = ["cli"] +[[bin]] +name = "parquet-layout" +required-features = ["cli"] + [[bench]] name = "arrow_writer" required-features = ["arrow"] diff --git a/parquet/src/bin/parquet-layout.rs b/parquet/src/bin/parquet-layout.rs new file mode 100644 index 000000000000..7a685d2069e8 --- /dev/null +++ b/parquet/src/bin/parquet-layout.rs @@ -0,0 +1,234 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Binary that prints the physical layout of a parquet file +//! +//! # Install +//! +//! `parquet-layout` can be installed using `cargo`: +//! ``` +//! cargo install parquet --features=cli +//! ``` +//! After this `parquet-layout` should be available: +//! ``` +//! parquet-layout XYZ.parquet +//! ``` +//! +//! The binary can also be built from the source code and run as follows: +//! ``` +//! cargo run --features=cli --bin parquet-layout XYZ.parquet +//! ``` + +use std::fs::File; +use std::io::Read; + +use clap::Parser; +use serde::Serialize; +use thrift::protocol::{TCompactInputProtocol, TSerializable}; + +use parquet::basic::{Compression, Encoding}; +use parquet::errors::Result; +use parquet::file::reader::ChunkReader; +use parquet::format::PageHeader; + +#[derive(Serialize, Debug)] +struct ParquetFile { + row_groups: Vec, +} + +#[derive(Serialize, Debug)] +struct RowGroup { + columns: Vec, + row_count: i64, +} + +#[derive(Serialize, Debug)] +struct ColumnChunk { + path: String, + has_offset_index: bool, + has_column_index: bool, + has_bloom_filter: bool, + pages: Vec, +} + +#[derive(Serialize, Debug)] +struct Page { + compression: Option<&'static str>, + encoding: &'static str, + page_type: &'static str, + compressed_bytes: i32, + uncompressed_bytes: i32, + header_bytes: i32, + num_values: i32, +} + +fn do_layout(reader: &C) -> Result { + let metadata = parquet::file::footer::parse_metadata(reader)?; + let schema = metadata.file_metadata().schema_descr(); + + let row_groups = (0..metadata.num_row_groups()) + .map(|row_group_idx| { + let row_group = metadata.row_group(row_group_idx); + let columns = row_group + .columns() + .iter() + .zip(schema.columns()) + .map(|(column, column_schema)| { + let compression = compression(column.compression()); + let mut pages = vec![]; + + let mut start = column + .dictionary_page_offset() + .unwrap_or_else(|| column.data_page_offset()) + as u64; + + let end = start + column.compressed_size() as u64; + while start != end { + let (header_len, header) = read_page_header(reader, start)?; + if let Some(dictionary) = header.dictionary_page_header { + pages.push(Page { + compression, + encoding: encoding(dictionary.encoding), + page_type: "dictionary", + compressed_bytes: header.compressed_page_size, + uncompressed_bytes: header.uncompressed_page_size, + header_bytes: header_len as _, + num_values: dictionary.num_values, + }) + } else if let Some(data_page) = header.data_page_header { + pages.push(Page { + compression, + encoding: encoding(data_page.encoding), + page_type: "data_page_v1", + compressed_bytes: header.compressed_page_size, + uncompressed_bytes: header.uncompressed_page_size, + header_bytes: header_len as _, + num_values: data_page.num_values, + }) + } else if let Some(data_page) = header.data_page_header_v2 { + let is_compressed = data_page.is_compressed.unwrap_or(true); + + pages.push(Page { + compression: compression.filter(|_| is_compressed), + encoding: encoding(data_page.encoding), + page_type: "data_page_v2", + compressed_bytes: header.compressed_page_size, + uncompressed_bytes: header.uncompressed_page_size, + header_bytes: header_len as _, + num_values: data_page.num_values, + }) + } + start += header.compressed_page_size as u64 + header_len as u64; + } + + Ok(ColumnChunk { + path: column_schema.path().parts().join("."), + has_offset_index: column.offset_index_offset().is_some(), + has_column_index: 
column.column_index_offset().is_some(), + has_bloom_filter: column.bloom_filter_offset().is_some(), + pages, + }) + }) + .collect::>>()?; + + Ok(RowGroup { + columns, + row_count: row_group.num_rows(), + }) + }) + .collect::>>()?; + + Ok(ParquetFile { row_groups }) +} + +/// Reads the page header at `offset` from `reader`, returning +/// both the `PageHeader` and its length in bytes +fn read_page_header( + reader: &C, + offset: u64, +) -> Result<(usize, PageHeader)> { + struct TrackedRead(R, usize); + + impl Read for TrackedRead { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + let v = self.0.read(buf)?; + self.1 += v; + Ok(v) + } + } + + let len = reader.len().checked_sub(offset).unwrap() as usize; + let input = reader.get_read(offset, len)?; + let mut tracked = TrackedRead(input, 0); + let mut prot = TCompactInputProtocol::new(&mut tracked); + let header = PageHeader::read_from_in_protocol(&mut prot)?; + Ok((tracked.1, header)) +} + +/// Returns a string representation for a given compression +fn compression(compression: Compression) -> Option<&'static str> { + match compression { + Compression::UNCOMPRESSED => None, + Compression::SNAPPY => Some("snappy"), + Compression::GZIP => Some("gzip"), + Compression::LZO => Some("lzo"), + Compression::BROTLI => Some("brotli"), + Compression::LZ4 => Some("lz4"), + Compression::ZSTD => Some("zstd"), + Compression::LZ4_RAW => Some("lz4_raw"), + } +} + +/// Returns a string representation for a given encoding +fn encoding(encoding: parquet::format::Encoding) -> &'static str { + match Encoding::try_from(encoding) { + Ok(Encoding::PLAIN) => "plain", + Ok(Encoding::PLAIN_DICTIONARY) => "plain_dictionary", + Ok(Encoding::RLE) => "rle", + Ok(Encoding::BIT_PACKED) => "bit_packed", + Ok(Encoding::DELTA_BINARY_PACKED) => "delta_binary_packed", + Ok(Encoding::DELTA_LENGTH_BYTE_ARRAY) => "delta_length_byte_array", + Ok(Encoding::DELTA_BYTE_ARRAY) => "delta_byte_array", + Ok(Encoding::RLE_DICTIONARY) => "rle_dictionary", + Ok(Encoding::BYTE_STREAM_SPLIT) => "byte_stream_split", + Err(_) => "unknown", + } +} + +#[derive(Debug, Parser)] +#[clap(author, version, about("Prints the physical layout of a parquet file"), long_about = None)] +struct Args { + #[clap(help("Path to a parquet file"))] + file: String, +} + +impl Args { + fn run(&self) -> Result<()> { + let file = File::open(&self.file)?; + let layout = do_layout(&file)?; + + let out = std::io::stdout(); + let writer = out.lock(); + + serde_json::to_writer_pretty(writer, &layout).unwrap(); + Ok(()) + } +} + +fn main() -> Result<()> { + Args::parse().run() +} From 7b38cb8eae4f1ae6fb451bf288b1b4b6f0642858 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 5 Dec 2022 22:49:03 +0000 Subject: [PATCH 0382/1411] Disable const-random ahash feature on non-WASM (#3271) (#3277) --- parquet/Cargo.toml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 1ea28b7de366..f55f36c8a85b 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -28,6 +28,12 @@ readme = "README.md" edition = "2021" rust-version = "1.62" +[target.'cfg(target_arch = "wasm32")'.dependencies] +ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } + +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } + [dependencies] arrow-array = { version = "28.0.0", path = "../arrow-array", default-features = 
false, optional = true } arrow-buffer = { version = "28.0.0", path = "../arrow-buffer", default-features = false, optional = true } @@ -38,7 +44,6 @@ arrow-schema = { version = "28.0.0", path = "../arrow-schema", default-features arrow-select = { version = "28.0.0", path = "../arrow-select", default-features = false, optional = true } arrow-ipc = { version = "28.0.0", path = "../arrow-ipc", default-features = false, optional = true } -ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } bytes = { version = "1.1", default-features = false, features = ["std"] } thrift = { version = "0.17", default-features = false } snap = { version = "1.0", default-features = false, optional = true } From f5c165acc0e6cc4b34e0eaea006aab7e5bd28d66 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 5 Dec 2022 22:52:20 +0000 Subject: [PATCH 0383/1411] Reload token from AWS_WEB_IDENTITY_TOKEN_FILE (#3274) * Reload token from AWS_WEB_IDENTITY_TOKEN_FILE * Clippy * Update object_store/src/aws/credential.rs Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- object_store/src/aws/credential.rs | 11 +++++++---- object_store/src/aws/mod.rs | 11 +++-------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index 900af24062ca..199899d6f000 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -341,7 +341,7 @@ impl CredentialProvider for InstanceCredentialProvider { #[derive(Debug)] pub struct WebIdentityProvider { pub cache: TokenCache>, - pub token: String, + pub token_path: String, pub role_arn: String, pub session_name: String, pub endpoint: String, @@ -355,7 +355,7 @@ impl CredentialProvider for WebIdentityProvider { web_identity( &self.client, &self.retry_config, - &self.token, + &self.token_path, &self.role_arn, &self.session_name, &self.endpoint, @@ -477,11 +477,14 @@ impl From for AwsCredential { async fn web_identity( client: &Client, retry_config: &RetryConfig, - token: &str, + token_path: &str, role_arn: &str, session_name: &str, endpoint: &str, ) -> Result>, StdError> { + let token = std::fs::read_to_string(token_path) + .map_err(|e| format!("Failed to read token file '{}': {}", token_path, e))?; + let bytes = client .request(Method::POST, endpoint) .query(&[ @@ -490,7 +493,7 @@ async fn web_identity( ("RoleArn", role_arn), ("RoleSessionName", session_name), ("Version", "2011-06-15"), - ("WebIdentityToken", token), + ("WebIdentityToken", &token), ]) .send_retry(retry_config) .await? 
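
The rewritten `web_identity` above reads the token file on every credential refresh instead of capturing its contents once when the builder runs, so a token rotated on disk (for example by the Kubernetes service-account projection) is used the next time credentials expire. A minimal sketch of that refresh pattern follows; the names `refresh_web_identity` and `exchange_token` are hypothetical and not part of the object_store API, and a fake token exchange stands in for the real STS AssumeRoleWithWebIdentity call:

use std::fs;
use std::io;

// Hypothetical stand-in for the STS AssumeRoleWithWebIdentity exchange the
// real provider performs; here it only reports what it would send.
fn exchange_token(token: &str, role_arn: &str) -> String {
    format!("assume {role_arn} using a {}-byte web identity token", token.len())
}

// Re-reads the token file on every refresh, mirroring the patched
// `web_identity`, so a token rotated on disk is picked up automatically.
fn refresh_web_identity(token_path: &str, role_arn: &str) -> io::Result<String> {
    let token = fs::read_to_string(token_path)?;
    Ok(exchange_token(token.trim(), role_arn))
}

fn main() -> io::Result<()> {
    // Illustrative path and ARN only.
    let path = std::env::temp_dir().join("web_identity_token");
    let role = "arn:aws:iam::123456789012:role/demo";

    fs::write(&path, "first-token")?;
    println!("{}", refresh_web_identity(path.to_str().unwrap(), role)?);

    // Rotating the file changes what the next refresh sees, with no rebuild.
    fs::write(&path, "rotated-token")?;
    println!("{}", refresh_web_identity(path.to_str().unwrap(), role)?);
    Ok(())
}

The second refresh picks up the rotated token without reconstructing the provider, which is the behaviour this change introduces for long-running processes.
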
diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index c92b8c29a1ff..aa419d60501a 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -116,9 +116,6 @@ enum Error { #[snafu(display("Received header containing non-ASCII data"))] BadHeader { source: reqwest::header::ToStrError }, - - #[snafu(display("Error reading token file: {}", source))] - ReadTokenFile { source: std::io::Error }, } impl From for super::Error { @@ -588,13 +585,11 @@ impl AmazonS3Builder { (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), // TODO: Replace with `AmazonS3Builder::credentials_from_env` _ => match ( - std::env::var_os("AWS_WEB_IDENTITY_TOKEN_FILE"), + std::env::var("AWS_WEB_IDENTITY_TOKEN_FILE"), std::env::var("AWS_ROLE_ARN"), ) { - (Some(token_file), Ok(role_arn)) => { + (Ok(token_path), Ok(role_arn)) => { info!("Using WebIdentity credential provider"); - let token = std::fs::read_to_string(token_file) - .context(ReadTokenFileSnafu)?; let session_name = std::env::var("AWS_ROLE_SESSION_NAME") .unwrap_or_else(|_| "WebIdentitySession".to_string()); @@ -610,7 +605,7 @@ impl AmazonS3Builder { Box::new(WebIdentityProvider { cache: Default::default(), - token, + token_path, session_name, role_arn, endpoint, From 7c75a66ee30f14c84984e4ee9aabca737f62f440 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 6 Dec 2022 00:26:40 -0800 Subject: [PATCH 0384/1411] Support casting from unsigned numeric to Decimal256 (#3273) --- arrow-buffer/src/bigint.rs | 4 ++ arrow-cast/src/cast.rs | 80 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 80 insertions(+), 4 deletions(-) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index 23400b4a3f6e..cfe14fb39f43 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -493,6 +493,10 @@ define_as_primitive!(i8); define_as_primitive!(i16); define_as_primitive!(i32); define_as_primitive!(i64); +define_as_primitive!(u8); +define_as_primitive!(u16); +define_as_primitive!(u32); +define_as_primitive!(u64); impl ToPrimitive for i256 { fn to_i64(&self) -> Option { diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 7bb3aeb9603f..ebdefb18e9ed 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -133,6 +133,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Decimal256(_, _), Decimal128(_, _)) => true, // unsigned integer to decimal (UInt8 | UInt16 | UInt32 | UInt64, Decimal128(_, _)) | + (UInt8 | UInt16 | UInt32 | UInt64, Decimal256(_, _)) | // signed numeric to decimal (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal128(_, _)) | (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal256(_, _)) | @@ -954,7 +955,34 @@ pub fn cast_with_options( (_, Decimal256(precision, scale)) => { // cast data to decimal match from_type { - // TODO now just support signed numeric to decimal, support decimal to numeric later + UInt8 => cast_integer_to_decimal::<_, Decimal256Type, _>( + as_primitive_array::(array), + *precision, + *scale, + i256::from_i128(10_i128), + cast_options, + ), + UInt16 => cast_integer_to_decimal::<_, Decimal256Type, _>( + as_primitive_array::(array), + *precision, + *scale, + i256::from_i128(10_i128), + cast_options, + ), + UInt32 => cast_integer_to_decimal::<_, Decimal256Type, _>( + as_primitive_array::(array), + *precision, + *scale, + i256::from_i128(10_i128), + cast_options, + ), + UInt64 => cast_integer_to_decimal::<_, Decimal256Type, _>( + as_primitive_array::(array), + *precision, 
+ *scale, + i256::from_i128(10_i128), + cast_options, + ), Int8 => cast_integer_to_decimal::<_, Decimal256Type, _>( as_primitive_array::(array), *precision, @@ -4197,9 +4225,53 @@ mod tests { #[test] fn test_cast_numeric_to_decimal256() { - // test negative cast type - let decimal_type = DataType::Decimal256(58, 6); - assert!(!can_cast_types(&DataType::UInt64, &decimal_type)); + let decimal_type = DataType::Decimal256(76, 6); + // u8, u16, u32, u64 + let input_datas = vec![ + Arc::new(UInt8Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // u8 + Arc::new(UInt16Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // u16 + Arc::new(UInt32Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // u32 + Arc::new(UInt64Array::from(vec![ + Some(1), + Some(2), + Some(3), + None, + Some(5), + ])) as ArrayRef, // u64 + ]; + + for array in input_datas { + generate_cast_test_case!( + &array, + Decimal256Array, + &decimal_type, + vec![ + Some(i256::from_i128(1000000_i128)), + Some(i256::from_i128(2000000_i128)), + Some(i256::from_i128(3000000_i128)), + None, + Some(i256::from_i128(5000000_i128)) + ] + ); + } // i8, i16, i32, i64 let input_datas = vec![ From 99ced481308e870f69792e49cd23a529fa3ccc70 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 6 Dec 2022 17:35:31 +0000 Subject: [PATCH 0385/1411] Disable getrandom object_store (#3278) --- object_store/Cargo.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 9b1dee5c54ab..f37831516b95 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -51,15 +51,13 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } ring = { version = "0.16", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } -# Fix for wasm32-unknown-unknown (see https://docs.rs/getrandom/latest/getrandom/#webassembly-support) -getrandom = { version = "0.2", features = ["js"], optional = true } # AWS Profile support aws-types = { version = "0.51", optional = true } aws-config = { version = "0.51", optional = true } [features] -cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring", "getrandom"] +cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] From 16484a6d841ebe2347dffbaa1b14d06392944cbc Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 8 Dec 2022 05:05:04 -0500 Subject: [PATCH 0386/1411] Minor: Allow Field::new to take existing `String` as well as &str (#3288) --- arrow-schema/src/field.rs | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 5813902ddd77..a3275dcb3355 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -112,9 +112,9 @@ impl Hash for Field { impl Field { /// Creates a new field - pub fn new(name: &str, data_type: DataType, nullable: bool) -> Self { + pub fn new(name: impl Into, data_type: DataType, nullable: bool) -> Self { Field { - name: name.to_string(), 
+ name: name.into(), data_type, nullable, dict_id: 0, @@ -125,14 +125,14 @@ impl Field { /// Creates a new field that has additional dictionary information pub fn new_dict( - name: &str, + name: impl Into, data_type: DataType, nullable: bool, dict_id: i64, dict_is_ordered: bool, ) -> Self { Field { - name: name.to_string(), + name: name.into(), data_type, nullable, dict_id, @@ -485,6 +485,20 @@ mod test { use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; + #[test] + fn test_new_with_string() { + // Fields should allow owned Strings to support reuse + let s = String::from("c1"); + Field::new(s, DataType::Int64, false); + } + + #[test] + fn test_new_dict_with_string() { + // Fields should allow owned Strings to support reuse + let s = String::from("c1"); + Field::new_dict(s, DataType::Int64, false, 4, false); + } + #[test] fn test_merge_incompatible_types() { let mut field = Field::new("c1", DataType::Int64, false); From 2e806b0c634e7c5487811c28329b8ef4c5f1ba77 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 8 Dec 2022 02:07:01 -0800 Subject: [PATCH 0387/1411] fix(ffi): handle null data buffers from empty arrays (#3276) * test(python): validate roundtripping of empty arrays * test: check more types * test: parameterize rust side * fix: also check for zero length buffers * fix: handle null data buffers * fix: only allow zero-length buffers to be null --- arrow-pyarrow-integration-testing/src/lib.rs | 9 ++++++ .../tests/test_sql.py | 32 +++++++++++++++++++ arrow/src/ffi.rs | 25 +++++++++------ 3 files changed, 57 insertions(+), 9 deletions(-) diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index 2e74f0cf66b4..cf94b0dd40af 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -20,6 +20,7 @@ use std::sync::Arc; +use arrow::array::new_empty_array; use pyo3::prelude::*; use pyo3::wrap_pyfunction; @@ -70,6 +71,13 @@ fn double_py(lambda: &PyAny, py: Python) -> PyResult { Ok(array == expected) } +#[pyfunction] +fn make_empty_array(datatype: PyArrowType, py: Python) -> PyResult { + let array = new_empty_array(&datatype.0); + + array.data().to_pyarrow(py) +} + /// Returns the substring #[pyfunction] fn substring( @@ -134,6 +142,7 @@ fn round_trip_record_batch_reader( fn arrow_pyarrow_integration_testing(_py: Python, m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(double))?; m.add_wrapped(wrap_pyfunction!(double_py))?; + m.add_wrapped(wrap_pyfunction!(make_empty_array))?; m.add_wrapped(wrap_pyfunction!(substring))?; m.add_wrapped(wrap_pyfunction!(concatenate))?; m.add_wrapped(wrap_pyfunction!(round_trip_type))?; diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index a19edf0ccd03..5a8bec792273 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -193,6 +193,38 @@ def test_time32_python(): del b del expected + +@pytest.mark.parametrize("datatype", _supported_pyarrow_types, ids=str) +def test_empty_array_python(datatype): + """ + Python -> Rust -> Python + """ + if datatype == pa.float16(): + pytest.skip("Float 16 is not implemented in Rust") + + a = pa.array([], datatype) + b = rust.round_trip_array(a) + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + del a + del b + + +@pytest.mark.parametrize("datatype", _supported_pyarrow_types, ids=str) +def 
test_empty_array_rust(datatype): + """ + Rust -> Python + """ + a = pa.array([], type=datatype) + b = rust.make_empty_array(datatype) + b.validate(full=True) + assert a.to_pylist() == b.to_pylist() + assert a.type == b.type + del a + del b + + def test_binary_array(): """ Python -> Rust -> Python diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index fc8dc654af0c..0c1c1fa54df0 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -123,7 +123,7 @@ use std::{ use bitflags::bitflags; use crate::array::{layout, ArrayData}; -use crate::buffer::Buffer; +use crate::buffer::{Buffer, MutableBuffer}; use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; use crate::util::bit_util; @@ -578,7 +578,7 @@ unsafe fn create_buffer( index: usize, len: usize, ) -> Option { - if array.buffers.is_null() { + if array.buffers.is_null() || array.n_buffers == 0 { return None; } let buffers = array.buffers as *mut *const u8; @@ -657,13 +657,20 @@ pub trait ArrowArrayRef { let len = self.buffer_len(index)?; - unsafe { create_buffer(self.owner().clone(), self.array(), index, len) } - .ok_or_else(|| { - ArrowError::CDataInterface(format!( - "The external buffer at position {} is null.", - index - 1 - )) - }) + match unsafe { + create_buffer(self.owner().clone(), self.array(), index, len) + } { + Some(buf) => Ok(buf), + None if len == 0 => { + // Null data buffer, which Rust doesn't allow. So create + // an empty buffer. + Ok(MutableBuffer::new(0).into()) + } + None => Err(ArrowError::CDataInterface(format!( + "The external buffer at position {} is null.", + index - 1 + ))), + } }) .collect() } From 7b717139d52010e6754e21eb248f036ea9c4361e Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Thu, 8 Dec 2022 23:06:04 +1100 Subject: [PATCH 0388/1411] Fix ipc schema custom_metadata serialization (#3282) * Fix ipc schema custom_metadata serialization * Fix ipc doc test * PR comments --- arrow-ipc/src/convert.rs | 95 ++++++++++++++++++++++++---------------- arrow-ipc/src/writer.rs | 14 +++--- 2 files changed, 64 insertions(+), 45 deletions(-) diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index e11d64a473d4..e5522303df52 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -41,29 +41,37 @@ pub fn schema_to_fb_offset<'a>( fbb: &mut FlatBufferBuilder<'a>, schema: &Schema, ) -> WIPOffset> { - let mut fields = vec![]; - for field in schema.fields() { - let fb_field = build_field(fbb, field); - fields.push(fb_field); - } - - let mut custom_metadata = vec![]; - for (k, v) in schema.metadata() { - let fb_key_name = fbb.create_string(k.as_str()); - let fb_val_name = fbb.create_string(v.as_str()); + let fields = schema + .fields() + .iter() + .map(|field| build_field(fbb, field)) + .collect::>(); + let fb_field_list = fbb.create_vector(&fields); - let mut kv_builder = crate::KeyValueBuilder::new(fbb); - kv_builder.add_key(fb_key_name); - kv_builder.add_value(fb_val_name); - custom_metadata.push(kv_builder.finish()); - } + let fb_metadata_list = if !schema.metadata().is_empty() { + let custom_metadata = schema + .metadata() + .iter() + .map(|(k, v)| { + let fb_key_name = fbb.create_string(k); + let fb_val_name = fbb.create_string(v); - let fb_field_list = fbb.create_vector(&fields); - let fb_metadata_list = fbb.create_vector(&custom_metadata); + let mut kv_builder = crate::KeyValueBuilder::new(fbb); + kv_builder.add_key(fb_key_name); + kv_builder.add_value(fb_val_name); + kv_builder.finish() + }) + .collect::>(); + 
Some(fbb.create_vector(&custom_metadata)) + } else { + None + }; let mut builder = crate::SchemaBuilder::new(fbb); builder.add_fields(fb_field_list); - builder.add_custom_metadata(fb_metadata_list); + if let Some(fb_metadata_list) = fb_metadata_list { + builder.add_custom_metadata(fb_metadata_list); + } builder.finish() } @@ -1031,32 +1039,45 @@ mod tests { #[test] fn schema_from_bytes() { - // bytes of a schema generated from python (0.14.0), saved as an `crate::Message`. - // the schema is: Field("field1", DataType::UInt32, false) + // Bytes of a schema generated via following python code, using pyarrow 10.0.1: + // + // import pyarrow as pa + // schema = pa.schema([pa.field('field1', pa.uint32(), nullable=False)]) + // sink = pa.BufferOutputStream() + // with pa.ipc.new_stream(sink, schema) as writer: + // pass + // # stripping continuation & length prefix & suffix bytes to get only schema bytes + // [x for x in sink.getvalue().to_pybytes()][8:-8] let bytes: Vec = vec![ - 16, 0, 0, 0, 0, 0, 10, 0, 12, 0, 6, 0, 5, 0, 8, 0, 10, 0, 0, 0, 0, 1, 3, 0, + 16, 0, 0, 0, 0, 0, 10, 0, 12, 0, 6, 0, 5, 0, 8, 0, 10, 0, 0, 0, 0, 1, 4, 0, 12, 0, 0, 0, 8, 0, 8, 0, 0, 0, 4, 0, 8, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 20, 0, 0, 0, 16, 0, 20, 0, 8, 0, 0, 0, 7, 0, 12, 0, 0, 0, 16, 0, 16, 0, 0, 0, 0, - 0, 0, 2, 32, 0, 0, 0, 20, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 8, 0, - 4, 0, 6, 0, 0, 0, 32, 0, 0, 0, 6, 0, 0, 0, 102, 105, 101, 108, 100, 49, 0, 0, - 0, 0, 0, 0, + 0, 0, 2, 16, 0, 0, 0, 32, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 102, + 105, 101, 108, 100, 49, 0, 0, 0, 0, 6, 0, 8, 0, 4, 0, 6, 0, 0, 0, 32, 0, 0, + 0, ]; - let ipc = crate::root_as_message(&bytes[..]).unwrap(); + let ipc = crate::root_as_message(&bytes).unwrap(); let schema = ipc.header_as_schema().unwrap(); - // a message generated from Rust, same as the Python one - let bytes: Vec = vec![ - 16, 0, 0, 0, 0, 0, 10, 0, 14, 0, 12, 0, 11, 0, 4, 0, 10, 0, 0, 0, 20, 0, 0, - 0, 0, 0, 0, 1, 3, 0, 10, 0, 12, 0, 0, 0, 8, 0, 4, 0, 10, 0, 0, 0, 8, 0, 0, 0, - 8, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 16, 0, 0, 0, 12, 0, 18, 0, 12, 0, 0, 0, - 11, 0, 4, 0, 12, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0, 2, 20, 0, 0, 0, 0, 0, 6, 0, - 8, 0, 4, 0, 6, 0, 0, 0, 32, 0, 0, 0, 6, 0, 0, 0, 102, 105, 101, 108, 100, 49, - 0, 0, - ]; - let ipc2 = crate::root_as_message(&bytes[..]).unwrap(); - let schema2 = ipc.header_as_schema().unwrap(); + // generate same message with Rust + let data_gen = crate::writer::IpcDataGenerator::default(); + let arrow_schema = + Schema::new(vec![Field::new("field1", DataType::UInt32, false)]); + let bytes = data_gen + .schema_to_bytes(&arrow_schema, &crate::writer::IpcWriteOptions::default()) + .ipc_message; + + let ipc2 = crate::root_as_message(&bytes).unwrap(); + let schema2 = ipc2.header_as_schema().unwrap(); + + // can't compare schema directly as it compares the underlying bytes, which can differ + assert!(schema.custom_metadata().is_none()); + assert!(schema2.custom_metadata().is_none()); + assert_eq!(schema.endianness(), schema2.endianness()); + assert!(schema.features().is_none()); + assert!(schema2.features().is_none()); + assert_eq!(fb_to_schema(schema), fb_to_schema(schema2)); - assert_eq!(schema, schema2); assert_eq!(ipc.version(), ipc2.version()); assert_eq!(ipc.header_type(), ipc2.header_type()); assert_eq!(ipc.bodyLength(), ipc2.bodyLength()); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 5f188fe1a9fc..c407cd12c239 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -793,15 +793,13 @@ impl 
StreamWriter { /// # fn main() -> Result<(), ArrowError> { /// // The result we expect from an empty schema /// let expected = vec![ - /// 255, 255, 255, 255, 64, 0, 0, 0, + /// 255, 255, 255, 255, 48, 0, 0, 0, /// 16, 0, 0, 0, 0, 0, 10, 0, - /// 14, 0, 12, 0, 11, 0, 4, 0, - /// 10, 0, 0, 0, 20, 0, 0, 0, - /// 0, 0, 0, 1, 4, 0, 10, 0, - /// 12, 0, 0, 0, 8, 0, 4, 0, - /// 10, 0, 0, 0, 8, 0, 0, 0, - /// 8, 0, 0, 0, 0, 0, 0, 0, - /// 0, 0, 0, 0, 0, 0, 0, 0, + /// 12, 0, 10, 0, 9, 0, 4, 0, + /// 10, 0, 0, 0, 16, 0, 0, 0, + /// 0, 1, 4, 0, 8, 0, 8, 0, + /// 0, 0, 4, 0, 8, 0, 0, 0, + /// 4, 0, 0, 0, 0, 0, 0, 0, /// 255, 255, 255, 255, 0, 0, 0, 0 /// ]; /// From 96c7c9d06628ef1690035bc0a1096901adaf084c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 8 Dec 2022 15:07:35 +0000 Subject: [PATCH 0389/1411] Split out arrow-string (#2594) (#3295) * Split out arrow-string (#2594) * Doc * Clippy --- .github/workflows/arrow.yml | 4 + .github/workflows/arrow_flight.yml | 5 +- .github/workflows/dev_pr/labeler.yml | 9 +- .github/workflows/integration.yml | 15 +- .github/workflows/miri.yaml | 11 +- .github/workflows/parquet.yml | 3 +- Cargo.toml | 35 +- arrow-string/Cargo.toml | 49 + .../src}/concat_elements.rs | 11 +- .../kernels => arrow-string/src}/length.rs | 184 +- arrow-string/src/lib.rs | 24 + arrow-string/src/like.rs | 2100 +++++++++++++++++ .../kernels => arrow-string/src}/regexp.rs | 152 +- .../kernels => arrow-string/src}/substring.rs | 73 +- arrow/Cargo.toml | 4 +- arrow/src/compute/kernels/comparison.rs | 2041 +--------------- arrow/src/compute/kernels/mod.rs | 5 +- arrow/src/lib.rs | 1 + dev/release/README.md | 1 + 19 files changed, 2563 insertions(+), 2164 deletions(-) create mode 100644 arrow-string/Cargo.toml rename {arrow/src/compute/kernels => arrow-string/src}/concat_elements.rs (97%) rename {arrow/src/compute/kernels => arrow-string/src}/length.rs (84%) create mode 100644 arrow-string/src/lib.rs create mode 100644 arrow-string/src/like.rs rename {arrow/src/compute/kernels => arrow-string/src}/regexp.rs (53%) rename {arrow/src/compute/kernels => arrow-string/src}/substring.rs (95%) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 2e1c64ebe3a0..0b47f02566ce 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -70,6 +70,8 @@ jobs: run: cargo test -p arrow-csv --all-features - name: Test arrow-json with all features run: cargo test -p arrow-json --all-features + - name: Test arrow-string with all features + run: cargo test -p arrow-string --all-features - name: Test arrow-integration-test with all features run: cargo test -p arrow-integration-test --all-features - name: Test arrow with default features @@ -184,5 +186,7 @@ jobs: run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings - name: Clippy arrow-json with all features run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings + - name: Clippy arrow-string with all features + run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings - name: Clippy arrow run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index ab7030b05e3c..356c0fc0a073 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -31,10 +31,11 @@ on: - arrow-buffer/** - arrow-cast/** - 
arrow-data/** - - arrow-schema/** - - arrow-select/** - arrow-flight/** - arrow-ipc/** + - arrow-schema/** + - arrow-select/** + - arrow-string/** - .github/** jobs: diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index d93932cd2334..35f2a873c6a4 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -16,16 +16,17 @@ # under the License. arrow: - - arrow/**/* - arrow-array/**/* - arrow-buffer/**/* - arrow-cast/**/* + - arrow-csv/**/* - arrow-data/**/* - - arrow-schema/**/* - - arrow-select/**/* - arrow-ipc/**/* - - arrow-csv/**/* - arrow-json/**/* + - arrow-schema/**/* + - arrow-select/**/* + - arrow-string/**/* + - arrow/**/* arrow-flight: - arrow-flight/**/* diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 656e56a652ca..d23f4c0717e0 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -24,20 +24,21 @@ on: - master pull_request: paths: - - arrow/** + - .github/** - arrow-array/** - arrow-buffer/** - arrow-cast/** + - arrow-csv/** - arrow-data/** - - arrow-schema/** - - arrow-select/** + - arrow-integration-test/** + - arrow-integration-testing/** - arrow-ipc/** - - arrow-csv/** - arrow-json/** - arrow-pyarrow-integration-testing/** - - arrow-integration-test/** - - arrow-integration-testing/** - - .github/** + - arrow-schema/** + - arrow-select/** + - arrow-string/** + - arrow/** jobs: diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index b1f5d85fc581..f9cc7df79283 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -24,17 +24,18 @@ on: - master pull_request: paths: - - arrow/** + - .github/** - arrow-array/** - arrow-buffer/** - arrow-cast/** + - arrow-csv/** - arrow-data/** - - arrow-schema/** - - arrow-select/** - arrow-ipc/** - - arrow-csv/** - arrow-json/** - - .github/** + - arrow-schema/** + - arrow-select/** + - arrow-string/** + - arrow/** jobs: miri-checks: diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index c5c7aac053f0..f7d94f85783e 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -36,6 +36,7 @@ on: - arrow-ipc/** - arrow-csv/** - arrow-json/** + - arrow-string/** - parquet/** - .github/** @@ -123,7 +124,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - rust: [stable] + rust: [ stable ] steps: - uses: actions/checkout@v3 - name: Setup Python diff --git a/Cargo.toml b/Cargo.toml index 16b4cb7f89e8..556b86a008a1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,23 +17,24 @@ [workspace] members = [ - "arrow", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-csv", - "arrow-data", - "arrow-flight", - "arrow-integration-test", - "arrow-integration-testing", - "arrow-ipc", - "arrow-json", - "arrow-schema", - "arrow-select", - "object_store", - "parquet", - "parquet_derive", - "parquet_derive_test", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-flight", + "arrow-integration-test", + "arrow-integration-testing", + "arrow-ipc", + "arrow-json", + "arrow-schema", + "arrow-select", + "arrow-string", + "object_store", + "parquet", + "parquet_derive", + "parquet_derive_test", ] # Enable the version 2 feature resolver, which avoids unifying features for targets that are not being built # diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml new file mode 100644 index 000000000000..97c4b5ffbf1c --- /dev/null +++ b/arrow-string/Cargo.toml @@ -0,0 +1,49 
@@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "arrow-string" +version = "28.0.0" +description = "String kernels for arrow arrays" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_string" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } +arrow-data = { version = "28.0.0", path = "../arrow-data" } +arrow-schema = { version = "28.0.0", path = "../arrow-schema" } +arrow-array = { version = "28.0.0", path = "../arrow-array" } +regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } +regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } + +[features] +dyn_cmp_dict = [] diff --git a/arrow/src/compute/kernels/concat_elements.rs b/arrow-string/src/concat_elements.rs similarity index 97% rename from arrow/src/compute/kernels/concat_elements.rs rename to arrow-string/src/concat_elements.rs index 25c8f60de3f6..e9219fb2dc09 100644 --- a/arrow/src/compute/kernels/concat_elements.rs +++ b/arrow-string/src/concat_elements.rs @@ -15,9 +15,11 @@ // specific language governing permissions and limitations // under the License. -use crate::array::*; -use crate::error::{ArrowError, Result}; +use arrow_array::builder::BufferBuilder; +use arrow_array::*; use arrow_data::bit_mask::combine_option_bitmap; +use arrow_data::ArrayDataBuilder; +use arrow_schema::ArrowError; /// Returns the elementwise concatenation of a [`StringArray`]. 
/// @@ -36,7 +38,7 @@ use arrow_data::bit_mask::combine_option_bitmap; pub fn concat_elements_utf8( left: &GenericStringArray, right: &GenericStringArray, -) -> Result> { +) -> Result, ArrowError> { if left.len() != right.len() { return Err(ArrowError::ComputeError(format!( "Arrays must have the same length: {} != {}", @@ -89,7 +91,7 @@ pub fn concat_elements_utf8( /// An error will be returned if the [`StringArray`] are of different lengths pub fn concat_elements_utf8_many( arrays: &[&GenericStringArray], -) -> Result> { +) -> Result, ArrowError> { if arrays.is_empty() { return Err(ArrowError::ComputeError( "concat requires input of at least one array".to_string(), @@ -158,6 +160,7 @@ pub fn concat_elements_utf8_many( #[cfg(test)] mod tests { use super::*; + use arrow_array::StringArray; #[test] fn test_string_concat() { let left = [Some("foo"), Some("bar"), None] diff --git a/arrow/src/compute/kernels/length.rs b/arrow-string/src/length.rs similarity index 84% rename from arrow/src/compute/kernels/length.rs rename to arrow-string/src/length.rs index a68aa2bde4eb..f7faa0a61435 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow-string/src/length.rs @@ -17,12 +17,11 @@ //! Defines kernel for length of string arrays and binary arrays -use crate::{array::*, buffer::Buffer, datatypes::ArrowPrimitiveType}; -use crate::{ - datatypes::*, - error::{ArrowError, Result}, -}; - +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::Buffer; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; use std::sync::Arc; macro_rules! unary_offsets { @@ -153,7 +152,7 @@ where /// * this only accepts ListArray/LargeListArray, StringArray/LargeStringArray and BinaryArray/LargeBinaryArray, /// or DictionaryArray with above Arrays as values /// * length of null is null. -pub fn length(array: &dyn Array) -> Result { +pub fn length(array: &dyn Array) -> Result { match array.data_type() { DataType::Dictionary(kt, _) => { kernel_dict!( @@ -189,7 +188,7 @@ pub fn length(array: &dyn Array) -> Result { /// or DictionaryArray with above Arrays as values /// * bit_length of null is null. /// * bit_length is in number of bits -pub fn bit_length(array: &dyn Array) -> Result { +pub fn bit_length(array: &dyn Array) -> Result { match array.data_type() { DataType::Dictionary(kt, _) => { kernel_dict!( @@ -220,6 +219,7 @@ pub fn bit_length(array: &dyn Array) -> Result { #[cfg(test)] mod tests { use super::*; + use arrow_array::cast::as_primitive_array; fn double_vec(v: Vec) -> Vec { [&v[..], &v[..]].concat() @@ -245,11 +245,10 @@ mod tests { macro_rules! 
length_binary_helper { ($offset_ty: ty, $result_ty: ty, $kernel: ident, $value: expr, $expected: expr) => {{ let array = GenericBinaryArray::<$offset_ty>::from($value); - let result = $kernel(&array)?; + let result = $kernel(&array).unwrap(); let result = result.as_any().downcast_ref::<$result_ty>().unwrap(); let expected: $result_ty = $expected.into(); assert_eq!(expected.data(), result.data()); - Ok(()) }}; } @@ -259,64 +258,61 @@ mod tests { GenericListArray::<$offset_ty>::from_iter_primitive::<$element_ty, _, _>( $value, ); - let result = length(&array)?; + let result = length(&array).unwrap(); let result = result.as_any().downcast_ref::<$result_ty>().unwrap(); let expected: $result_ty = $expected.into(); assert_eq!(expected.data(), result.data()); - Ok(()) }}; } #[test] #[cfg_attr(miri, ignore)] // running forever - fn length_test_string() -> Result<()> { + fn length_test_string() { length_cases_string() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = StringArray::from(input); - let result = length(&array)?; + let result = length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); expected.iter().enumerate().for_each(|(i, value)| { assert_eq!(*value, result.value(i)); }); - Ok(()) }) } #[test] #[cfg_attr(miri, ignore)] // running forever - fn length_test_large_string() -> Result<()> { + fn length_test_large_string() { length_cases_string() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = LargeStringArray::from(input); - let result = length(&array)?; + let result = length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); expected.iter().enumerate().for_each(|(i, value)| { assert_eq!(*value as i64, result.value(i)); }); - Ok(()) }) } #[test] - fn length_test_binary() -> Result<()> { + fn length_test_binary() { let value: Vec<&[u8]> = vec![b"zero", b"one", &[0xff, 0xf8]]; let result: Vec = vec![4, 3, 2]; length_binary_helper!(i32, Int32Array, length, value, result) } #[test] - fn length_test_large_binary() -> Result<()> { + fn length_test_large_binary() { let value: Vec<&[u8]> = vec![b"zero", &[0xff, 0xf8], b"two"]; let result: Vec = vec![4, 2, 3]; length_binary_helper!(i64, Int64Array, length, value, result) } #[test] - fn length_test_list() -> Result<()> { + fn length_test_list() { let value = vec![ Some(vec![]), Some(vec![Some(1), Some(2), Some(4)]), @@ -327,7 +323,7 @@ mod tests { } #[test] - fn length_test_large_list() -> Result<()> { + fn length_test_large_list() { let value = vec![ Some(vec![]), Some(vec![Some(1.1), Some(2.2), Some(3.3)]), @@ -348,28 +344,27 @@ mod tests { } #[test] - fn length_null_string() -> Result<()> { + fn length_null_string() { length_null_cases_string() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = StringArray::from(input); - let result = length(&array)?; + let result = length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); let expected: Int32Array = expected.into(); assert_eq!(expected.data(), result.data()); - Ok(()) }) } #[test] - fn length_null_large_string() -> Result<()> { + fn length_null_large_string() { length_null_cases_string() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = LargeStringArray::from(input); - let result = length(&array)?; + let result = 
length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); @@ -380,12 +375,11 @@ mod tests { .collect::>() .into(); assert_eq!(expected.data(), result.data()); - Ok(()) }) } #[test] - fn length_null_binary() -> Result<()> { + fn length_null_binary() { let value: Vec> = vec![Some(b"zero"), None, Some(&[0xff, 0xf8]), Some(b"three")]; let result: Vec> = vec![Some(4), None, Some(2), Some(5)]; @@ -393,7 +387,7 @@ mod tests { } #[test] - fn length_null_large_binary() -> Result<()> { + fn length_null_large_binary() { let value: Vec> = vec![Some(&[0xff, 0xf8]), None, Some(b"two"), Some(b"three")]; let result: Vec> = vec![Some(2), None, Some(3), Some(5)]; @@ -401,7 +395,7 @@ mod tests { } #[test] - fn length_null_list() -> Result<()> { + fn length_null_list() { let value = vec![ Some(vec![]), None, @@ -413,7 +407,7 @@ mod tests { } #[test] - fn length_null_large_list() -> Result<()> { + fn length_null_large_list() { let value = vec![ Some(vec![]), None, @@ -434,31 +428,27 @@ mod tests { /// Tests with an offset #[test] - fn length_offsets_string() -> Result<()> { + fn length_offsets_string() { let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]); let b = a.slice(1, 3); - let result = length(b.as_ref())?; + let result = length(b.as_ref()).unwrap(); let result: &Int32Array = as_primitive_array(&result); let expected = Int32Array::from(vec![Some(1), Some(5), None]); assert_eq!(&expected, result); - - Ok(()) } #[test] - fn length_offsets_binary() -> Result<()> { + fn length_offsets_binary() { let value: Vec> = vec![Some(b"hello"), Some(b" "), Some(&[0xff, 0xf8]), None]; let a = BinaryArray::from(value); let b = a.slice(1, 3); - let result = length(b.as_ref())?; + let result = length(b.as_ref()).unwrap(); let result: &Int32Array = as_primitive_array(&result); let expected = Int32Array::from(vec![Some(1), Some(2), None]); assert_eq!(&expected, result); - - Ok(()) } fn bit_length_cases() -> Vec<(Vec<&'static str>, usize, Vec)> { @@ -480,47 +470,45 @@ mod tests { #[test] #[cfg_attr(miri, ignore)] // error: this test uses too much memory to run on CI - fn bit_length_test_string() -> Result<()> { + fn bit_length_test_string() { bit_length_cases() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = StringArray::from(input); - let result = bit_length(&array)?; + let result = bit_length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); expected.iter().enumerate().for_each(|(i, value)| { assert_eq!(*value, result.value(i)); }); - Ok(()) }) } #[test] #[cfg_attr(miri, ignore)] // error: this test uses too much memory to run on CI - fn bit_length_test_large_string() -> Result<()> { + fn bit_length_test_large_string() { bit_length_cases() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = LargeStringArray::from(input); - let result = bit_length(&array)?; + let result = bit_length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); expected.iter().enumerate().for_each(|(i, value)| { assert_eq!(*value as i64, result.value(i)); }); - Ok(()) }) } #[test] - fn bit_length_binary() -> Result<()> { + fn bit_length_binary() { let value: Vec<&[u8]> = vec![b"one", &[0xff, 0xf8], b"three"]; let expected: Vec = vec![24, 16, 40]; length_binary_helper!(i32, Int32Array, bit_length, value, expected) } #[test] - fn bit_length_large_binary() 
-> Result<()> { + fn bit_length_large_binary() { let value: Vec<&[u8]> = vec![b"zero", b" ", &[0xff, 0xf8]]; let expected: Vec = vec![32, 8, 16]; length_binary_helper!(i64, Int64Array, bit_length, value, expected) @@ -535,28 +523,27 @@ mod tests { } #[test] - fn bit_length_null_string() -> Result<()> { + fn bit_length_null_string() { bit_length_null_cases() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = StringArray::from(input); - let result = bit_length(&array)?; + let result = bit_length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); let expected: Int32Array = expected.into(); assert_eq!(expected.data(), result.data()); - Ok(()) }) } #[test] - fn bit_length_null_large_string() -> Result<()> { + fn bit_length_null_large_string() { bit_length_null_cases() .into_iter() - .try_for_each(|(input, len, expected)| { + .for_each(|(input, len, expected)| { let array = LargeStringArray::from(input); - let result = bit_length(&array)?; + let result = bit_length(&array).unwrap(); assert_eq!(len, result.len()); let result = result.as_any().downcast_ref::().unwrap(); @@ -567,12 +554,11 @@ mod tests { .collect::>() .into(); assert_eq!(expected.data(), result.data()); - Ok(()) }) } #[test] - fn bit_length_null_binary() -> Result<()> { + fn bit_length_null_binary() { let value: Vec> = vec![Some(b"one"), None, Some(b"three"), Some(&[0xff, 0xf8])]; let expected: Vec> = vec![Some(24), None, Some(40), Some(16)]; @@ -580,7 +566,7 @@ mod tests { } #[test] - fn bit_length_null_large_binary() -> Result<()> { + fn bit_length_null_large_binary() { let value: Vec> = vec![Some(b"one"), None, Some(&[0xff, 0xf8]), Some(b"four")]; let expected: Vec> = vec![Some(24), None, Some(16), Some(32)]; @@ -597,47 +583,42 @@ mod tests { /// Tests with an offset #[test] - fn bit_length_offsets_string() -> Result<()> { + fn bit_length_offsets_string() { let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]); let b = a.slice(1, 3); - let result = bit_length(b.as_ref())?; + let result = bit_length(b.as_ref()).unwrap(); let result: &Int32Array = as_primitive_array(&result); let expected = Int32Array::from(vec![Some(8), Some(40), None]); assert_eq!(&expected, result); - - Ok(()) } #[test] - fn bit_length_offsets_binary() -> Result<()> { + fn bit_length_offsets_binary() { let value: Vec> = vec![Some(b"hello"), Some(&[]), Some(b"world"), None]; let a = BinaryArray::from(value); let b = a.slice(1, 3); - let result = bit_length(b.as_ref())?; + let result = bit_length(b.as_ref()).unwrap(); let result: &Int32Array = as_primitive_array(&result); let expected = Int32Array::from(vec![Some(0), Some(40), None]); assert_eq!(&expected, result); - - Ok(()) } #[test] - fn length_dictionary() -> Result<()> { - _length_dictionary::()?; - _length_dictionary::()?; - _length_dictionary::()?; - _length_dictionary::()?; - _length_dictionary::()?; - _length_dictionary::()?; - _length_dictionary::()?; - _length_dictionary::()?; - Ok(()) - } - - fn _length_dictionary() -> Result<()> { + fn length_dictionary() { + _length_dictionary::(); + _length_dictionary::(); + _length_dictionary::(); + _length_dictionary::(); + _length_dictionary::(); + _length_dictionary::(); + _length_dictionary::(); + _length_dictionary::(); + } + + fn _length_dictionary() { const TOTAL: i32 = 100; let v = ["aaaa", "bb", "ccccc", "ddd", "eeeeee"]; @@ -657,7 +638,7 @@ mod tests { let expected: Vec> = data.iter().map(|opt| opt.map(|s| s.len() as 
i32)).collect(); - let res = length(&dict_array)?; + let res = length(&dict_array).unwrap(); let actual = res.as_any().downcast_ref::>().unwrap(); let actual: Vec> = actual .values() @@ -670,24 +651,21 @@ mod tests { for i in 0..TOTAL as usize { assert_eq!(expected[i], actual[i],); } - - Ok(()) } #[test] - fn bit_length_dictionary() -> Result<()> { - _bit_length_dictionary::()?; - _bit_length_dictionary::()?; - _bit_length_dictionary::()?; - _bit_length_dictionary::()?; - _bit_length_dictionary::()?; - _bit_length_dictionary::()?; - _bit_length_dictionary::()?; - _bit_length_dictionary::()?; - Ok(()) - } - - fn _bit_length_dictionary() -> Result<()> { + fn bit_length_dictionary() { + _bit_length_dictionary::(); + _bit_length_dictionary::(); + _bit_length_dictionary::(); + _bit_length_dictionary::(); + _bit_length_dictionary::(); + _bit_length_dictionary::(); + _bit_length_dictionary::(); + _bit_length_dictionary::(); + } + + fn _bit_length_dictionary() { const TOTAL: i32 = 100; let v = ["aaaa", "bb", "ccccc", "ddd", "eeeeee"]; @@ -709,7 +687,7 @@ mod tests { .map(|opt| opt.map(|s| (s.chars().count() * 8) as i32)) .collect(); - let res = bit_length(&dict_array)?; + let res = bit_length(&dict_array).unwrap(); let actual = res.as_any().downcast_ref::>().unwrap(); let actual: Vec> = actual .values() @@ -722,7 +700,5 @@ mod tests { for i in 0..TOTAL as usize { assert_eq!(expected[i], actual[i],); } - - Ok(()) } } diff --git a/arrow-string/src/lib.rs b/arrow-string/src/lib.rs new file mode 100644 index 000000000000..4bd4d282656c --- /dev/null +++ b/arrow-string/src/lib.rs @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Arrow string kernels + +pub mod concat_elements; +pub mod length; +pub mod like; +pub mod regexp; +pub mod substring; diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs new file mode 100644 index 000000000000..11d79676d63c --- /dev/null +++ b/arrow-string/src/like.rs @@ -0,0 +1,2100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
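+// ---------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the upstream patch): how a
+// downstream crate might call the relocated kernels through the new
+// `arrow-string` crate's modules declared in `arrow-string/src/lib.rs` above.
+// The function name `example_arrow_string_usage` is hypothetical.
+#[allow(dead_code)]
+fn example_arrow_string_usage() -> Result<(), arrow_schema::ArrowError> {
+    use arrow_array::cast::as_primitive_array;
+    use arrow_array::{BooleanArray, Int32Array, StringArray};
+    use arrow_string::length::length;
+    use arrow_string::like::like_utf8;
+
+    let strings = StringArray::from(vec!["Arrow", "Ar"]);
+
+    // `length` returns an ArrayRef wrapping an Int32Array of byte lengths.
+    let lens = length(&strings)?;
+    let lens: &Int32Array = as_primitive_array(&lens);
+    assert_eq!(lens.value(0), 5);
+
+    // `like_utf8` evaluates SQL LIKE element-wise against a pattern array.
+    let patterns = StringArray::from(vec!["A%", "A_"]);
+    assert_eq!(
+        like_utf8(&strings, &patterns)?,
+        BooleanArray::from(vec![true, true])
+    );
+    Ok(())
+}
+// ---------------------------------------------------------------------------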
+ +use arrow_array::builder::BooleanBufferBuilder; +use arrow_array::cast::*; +use arrow_array::*; +use arrow_buffer::{bit_util, MutableBuffer}; +use arrow_data::bit_mask::combine_option_bitmap; +use arrow_data::ArrayData; +use arrow_schema::*; +use regex::Regex; +use std::collections::HashMap; + +/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. +/// +/// There are two wildcards supported with the LIKE operator: +/// +/// 1. `%` - The percent sign represents zero, one, or multiple characters +/// 2. `_` - The underscore represents a single character +/// +/// For example: +/// ``` +/// use arrow_array::{StringArray, BooleanArray}; +/// use arrow_string::like::like_utf8; +/// +/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); +/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]); +/// +/// let result = like_utf8(&strings, &patterns).unwrap(); +/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); +/// ``` +pub fn like_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) +} + +/// Perform SQL `left LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn like_dyn(left: &dyn Array, right: &dyn Array) -> Result { + match (left.data_type(), right.data_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = as_string_array(left); + let right = as_string_array(right); + like_utf8(left, right) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = as_largestring_array(left); + let right = as_largestring_array(right); + like_utf8(left, right) + } + #[cfg(feature = "dyn_cmp_dict")] + (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + left => { + let right = as_dictionary_array(right); + like_dict(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "like_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left LIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
+#[cfg(feature = "dyn_cmp_dict")] +fn like_dict( + left: &DictionaryArray, + right: &DictionaryArray, +) -> Result { + match (left.value_type(), right.value_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) + } + _ => Err(ArrowError::ComputeError( + "like_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" + .to_string(), + )), + } +} + +#[inline] +fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor>( + left: L, + right: &str, + op: F, +) -> Result { + if !right.contains(is_like_pattern) { + // fast path, can use equals + Ok(BooleanArray::from_unary(left, |item| op(item == right))) + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use starts_with + let starts_with = &right[..right.len() - 1]; + + Ok(BooleanArray::from_unary(left, |item| { + op(item.starts_with(starts_with)) + })) + } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { + // fast path, can use ends_with + let ends_with = &right[1..]; + + Ok(BooleanArray::from_unary(left, |item| { + op(item.ends_with(ends_with)) + })) + } else if right.starts_with('%') + && right.ends_with('%') + && !right.ends_with("\\%") + && !right[1..right.len() - 1].contains(is_like_pattern) + { + let contains = &right[1..right.len() - 1]; + + Ok(BooleanArray::from_unary(left, |item| { + op(item.contains(contains)) + })) + } else { + let re_pattern = replace_like_wildcards(right)?; + let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + })?; + + Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item)))) + } +} + +#[inline] +fn like_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + like_scalar_op(left, right, |x| x) +} + +/// Perform SQL `left LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. 
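+// ---------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the upstream patch): the scalar
+// fast paths in `like_scalar_op` above avoid compiling a regex for simple
+// patterns. The function name and data are illustrative; the expected result
+// mirrors the tests at the bottom of this file.
+#[allow(dead_code)]
+fn example_like_scalar_fast_path() -> Result<(), ArrowError> {
+    let values = StringArray::from(vec!["arrow", "parrow", "arrows", "arr"]);
+    // "arrow%" has a single trailing `%`, so the kernel takes the
+    // `starts_with` fast path rather than building a regex.
+    let matched = like_utf8_scalar(&values, "arrow%")?;
+    assert_eq!(matched, BooleanArray::from(vec![true, false, true, false]));
+    Ok(())
+}
+// ---------------------------------------------------------------------------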
+pub fn like_utf8_scalar_dyn( + left: &dyn Array, + right: &str, +) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + like_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + like_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + like_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "like_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn like_utf8_scalar( + left: &GenericStringArray, + right: &str, +) -> Result { + like_scalar(left, right) +} + +/// Perform SQL `left LIKE right` operation on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +fn like_dict_scalar( + left: &DictionaryArray, + right: &str, +) -> Result { + match left.value_type() { + DataType::Utf8 => { + let left = left.downcast_dict::>().unwrap(); + like_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = left.downcast_dict::>().unwrap(); + like_scalar(left, right) + } + _ => { + Err(ArrowError::ComputeError( + "like_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: +/// +/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.` +/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.` +/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%` +fn replace_like_wildcards(pattern: &str) -> Result { + let mut result = String::new(); + let pattern = String::from(pattern); + let mut chars_iter = pattern.chars().peekable(); + while let Some(c) = chars_iter.next() { + if c == '\\' { + let next = chars_iter.peek(); + match next { + Some(next) if is_like_pattern(*next) => { + result.push(*next); + // Skipping the next char as it is already appended + chars_iter.next(); + } + _ => { + result.push('\\'); + result.push('\\'); + } + } + } else if regex_syntax::is_meta_character(c) { + result.push('\\'); + result.push(c); + } else if c == '%' { + result.push_str(".*"); + } else if c == '_' { + result.push('.'); + } else { + result.push(c); + } + } + Ok(result) +} + +/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nlike_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) +} + +/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
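+// ---------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the upstream patch): the pattern
+// rewriting performed by `replace_like_wildcards`, mirroring the unit tests
+// at the bottom of this file. The function name is hypothetical.
+#[allow(dead_code)]
+fn example_replace_like_wildcards() -> Result<(), ArrowError> {
+    // LIKE wildcards become their regex equivalents: `_` -> `.`, `%` -> `.*`.
+    assert_eq!(replace_like_wildcards("_%")?, "..*");
+    // Escaped wildcards are matched literally, so the escape is removed.
+    assert_eq!(replace_like_wildcards("\\%\\_")?, "%_");
+    // Regex meta characters are escaped so they only match themselves.
+    assert_eq!(replace_like_wildcards(".")?, "\\.");
+    Ok(())
+}
+// ---------------------------------------------------------------------------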
+pub fn nlike_dyn( + left: &dyn Array, + right: &dyn Array, +) -> Result { + match (left.data_type(), right.data_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = as_string_array(left); + let right = as_string_array(right); + nlike_utf8(left, right) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = as_largestring_array(left); + let right = as_largestring_array(right); + nlike_utf8(left, right) + } + #[cfg(feature = "dyn_cmp_dict")] + (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + left => { + let right = as_dictionary_array(right); + nlike_dict(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nlike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +#[cfg(feature = "dyn_cmp_dict")] +fn nlike_dict( + left: &DictionaryArray, + right: &DictionaryArray, +) -> Result { + match (left.value_type(), right.value_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) + } + _ => Err(ArrowError::ComputeError( + "nlike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" + .to_string(), + )), + } +} + +#[inline] +fn nlike_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + like_scalar_op(left, right, |x| !x) +} + +/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nlike_utf8_scalar_dyn( + left: &dyn Array, + right: &str, +) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + nlike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + nlike_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + nlike_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nlike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. 
+pub fn nlike_utf8_scalar( + left: &GenericStringArray, + right: &str, +) -> Result { + nlike_scalar(left, right) +} + +/// Perform SQL `left NOT LIKE right` operation on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +fn nlike_dict_scalar( + left: &DictionaryArray, + right: &str, +) -> Result { + match left.value_type() { + DataType::Utf8 => { + let left = left.downcast_dict::>().unwrap(); + nlike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = left.downcast_dict::>().unwrap(); + nlike_scalar(left, right) + } + _ => { + Err(ArrowError::ComputeError( + "nlike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn ilike_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) +} + +/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn ilike_dyn( + left: &dyn Array, + right: &dyn Array, +) -> Result { + match (left.data_type(), right.data_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = as_string_array(left); + let right = as_string_array(right); + ilike_utf8(left, right) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = as_largestring_array(left); + let right = as_largestring_array(right); + ilike_utf8(left, right) + } + #[cfg(feature = "dyn_cmp_dict")] + (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + left => { + let right = as_dictionary_array(right); + ilike_dict(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "ilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
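+// ---------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the upstream patch): ILIKE is
+// LIKE evaluated with a `(?i)` case-insensitive regex, so the casing of the
+// values does not affect the result. The function name is hypothetical; the
+// data follows the ILIKE tests at the bottom of this file.
+#[allow(dead_code)]
+fn example_ilike_case_insensitive() -> Result<(), ArrowError> {
+    let values = StringArray::from(vec!["arrow", "ARROW"]);
+    let patterns = StringArray::from(vec!["ar%", "%ro%"]);
+    let matched = ilike_utf8(&values, &patterns)?;
+    assert_eq!(matched, BooleanArray::from(vec![true, true]));
+    Ok(())
+}
+// ---------------------------------------------------------------------------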
+#[cfg(feature = "dyn_cmp_dict")] +fn ilike_dict( + left: &DictionaryArray, + right: &DictionaryArray, +) -> Result { + match (left.value_type(), right.value_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) + } + _ => Err(ArrowError::ComputeError( + "ilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" + .to_string(), + )), + } +} + +#[inline] +fn ilike_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + let null_bit_buffer = left.data().null_buffer().cloned(); + let bytes = bit_util::ceil(left.len(), 8); + let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); + let bool_slice = bool_buf.as_slice_mut(); + + if !right.contains(is_like_pattern) { + // fast path, can use equals + let right_uppercase = right.to_uppercase(); + for i in 0..left.len() { + unsafe { + if left.value_unchecked(i).to_uppercase() == right_uppercase { + bit_util::set_bit(bool_slice, i); + } + } + } + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use starts_with + let start_str = &right[..right.len() - 1].to_uppercase(); + for i in 0..left.len() { + unsafe { + if left + .value_unchecked(i) + .to_uppercase() + .starts_with(start_str) + { + bit_util::set_bit(bool_slice, i); + } + } + } + } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { + // fast path, can use ends_with + let ends_str = &right[1..].to_uppercase(); + + for i in 0..left.len() { + unsafe { + if left.value_unchecked(i).to_uppercase().ends_with(ends_str) { + bit_util::set_bit(bool_slice, i); + } + } + } + } else if right.starts_with('%') + && right.ends_with('%') + && !right[1..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use contains + let contains = &right[1..right.len() - 1].to_uppercase(); + for i in 0..left.len() { + unsafe { + if left.value_unchecked(i).to_uppercase().contains(contains) { + bit_util::set_bit(bool_slice, i); + } + } + } + } else { + let re_pattern = replace_like_wildcards(right)?; + let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + })?; + + for i in 0..left.len() { + let haystack = unsafe { left.value_unchecked(i) }; + if re.is_match(haystack) { + bit_util::set_bit(bool_slice, i); + } + } + }; + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![bool_buf.into()], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} + +/// Perform SQL `left ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. 
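+// ---------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the upstream patch): the scalar
+// ILIKE fast paths in `ilike_scalar` above uppercase both sides and use plain
+// string methods when the pattern only has leading/trailing `%`. The function
+// name is hypothetical; the data mirrors the tests at the bottom of this file.
+#[allow(dead_code)]
+fn example_ilike_scalar_contains_fast_path() -> Result<(), ArrowError> {
+    let values = StringArray::from(vec!["arrow", "parquet", "datafusion", "flight"]);
+    // "%AR%" starts and ends with `%` and has no other wildcard, so the kernel
+    // compares uppercased values with `str::contains` instead of a regex.
+    let matched = ilike_utf8_scalar(&values, "%AR%")?;
+    assert_eq!(matched, BooleanArray::from(vec![true, true, false, false]));
+    Ok(())
+}
+// ---------------------------------------------------------------------------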
+pub fn ilike_utf8_scalar_dyn( + left: &dyn Array, + right: &str, +) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + ilike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + ilike_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + ilike_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "ilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn ilike_utf8_scalar( + left: &GenericStringArray, + right: &str, +) -> Result { + ilike_scalar(left, right) +} + +/// Perform SQL `left ILIKE right` operation on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +fn ilike_dict_scalar( + left: &DictionaryArray, + right: &str, +) -> Result { + match left.value_type() { + DataType::Utf8 => { + let left = left.downcast_dict::>().unwrap(); + ilike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = left.downcast_dict::>().unwrap(); + ilike_scalar(left, right) + } + _ => { + Err(ArrowError::ComputeError( + "ilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nilike_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) +} + +/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nilike_dyn( + left: &dyn Array, + right: &dyn Array, +) -> Result { + match (left.data_type(), right.data_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = as_string_array(left); + let right = as_string_array(right); + nilike_utf8(left, right) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = as_largestring_array(left); + let right = as_largestring_array(right); + nilike_utf8(left, right) + } + #[cfg(feature = "dyn_cmp_dict")] + (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { + downcast_dictionary_array!( + left => { + let right = as_dictionary_array(right); + nilike_dict(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
+#[cfg(feature = "dyn_cmp_dict")] +fn nilike_dict( + left: &DictionaryArray, + right: &DictionaryArray, +) -> Result { + match (left.value_type(), right.value_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = left.downcast_dict::>().unwrap(); + let right = right.downcast_dict::>().unwrap(); + + regex_like(left, right, true, |re_pattern| { + Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + }) + }) + } + _ => Err(ArrowError::ComputeError( + "nilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" + .to_string(), + )), + } +} + +#[inline] +fn nilike_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + let null_bit_buffer = left.data().null_buffer().cloned(); + let bytes = bit_util::ceil(left.len(), 8); + let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); + let bool_slice = bool_buf.as_slice_mut(); + + if !right.contains(is_like_pattern) { + // fast path, can use equals + let right_uppercase = right.to_uppercase(); + for i in 0..left.len() { + unsafe { + if left.value_unchecked(i).to_uppercase() != right_uppercase { + bit_util::set_bit(bool_slice, i); + } + } + } + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use starts_with + let start_str = &right[..right.len() - 1].to_uppercase(); + for i in 0..left.len() { + unsafe { + if !(left + .value_unchecked(i) + .to_uppercase() + .starts_with(start_str)) + { + bit_util::set_bit(bool_slice, i); + } + } + } + } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { + // fast path, can use ends_with + let ends_str = &right[1..].to_uppercase(); + + for i in 0..left.len() { + unsafe { + if !(left.value_unchecked(i).to_uppercase().ends_with(ends_str)) { + bit_util::set_bit(bool_slice, i); + } + } + } + } else if right.starts_with('%') + && right.ends_with('%') + && !right[1..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use contains + let contains = &right[1..right.len() - 1].to_uppercase(); + for i in 0..left.len() { + unsafe { + if !(left.value_unchecked(i).to_uppercase().contains(contains)) { + bit_util::set_bit(bool_slice, i); + } + } + } + } else { + let re_pattern = replace_like_wildcards(right)?; + let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + })?; + + for i in 0..left.len() { + let haystack = unsafe { left.value_unchecked(i) }; + if !re.is_match(haystack) { + bit_util::set_bit(bool_slice, i); + } + } + }; + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![bool_buf.into()], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} + +/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. 
+pub fn nilike_utf8_scalar_dyn( + left: &dyn Array, + right: &str, +) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + nilike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + nilike_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + nilike_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nilike_utf8_scalar( + left: &GenericStringArray, + right: &str, +) -> Result { + nilike_scalar(left, right) +} + +/// Perform SQL `left NOT ILIKE right` operation on [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +fn nilike_dict_scalar( + left: &DictionaryArray, + right: &str, +) -> Result { + match left.value_type() { + DataType::Utf8 => { + let left = left.downcast_dict::>().unwrap(); + nilike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = left.downcast_dict::>().unwrap(); + nilike_scalar(left, right) + } + _ => { + Err(ArrowError::ComputeError( + "nilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + +fn is_like_pattern(c: char) -> bool { + c == '%' || c == '_' +} + +/// Evaluate regex `op(left)` matching `right` on [`StringArray`] / [`LargeStringArray`] +/// +/// If `negate_regex` is true, the regex expression will be negated. (for example, with `not like`) +fn regex_like<'a, S: ArrayAccessor, F>( + left: S, + right: S, + negate_regex: bool, + op: F, +) -> Result +where + F: Fn(&str) -> Result, +{ + let mut map = HashMap::new(); + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform comparison operation on arrays of different length" + .to_string(), + )); + } + + let null_bit_buffer = + combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len()); + + let mut result = BooleanBufferBuilder::new(left.len()); + for i in 0..left.len() { + let haystack = left.value(i); + let pat = right.value(i); + let re = if let Some(ref regex) = map.get(pat) { + regex + } else { + let re_pattern = replace_like_wildcards(pat)?; + let re = op(&re_pattern)?; + map.insert(pat, re); + map.get(pat).unwrap() + }; + + result.append(if negate_regex { + !re.is_match(haystack) + } else { + re.is_match(haystack) + }); + } + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![result.finish()], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::types::Int8Type; + + macro_rules! test_utf8 { + ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { + #[test] + fn $test_name() { + let left = StringArray::from($left); + let right = StringArray::from($right); + let res = $op(&left, &right).unwrap(); + let expected = $expected; + assert_eq!(expected.len(), res.len()); + for i in 0..res.len() { + let v = res.value(i); + assert_eq!(v, expected[i]); + } + } + }; + } + + macro_rules! 
test_dict_utf8 { + ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { + #[test] + #[cfg(feature = "dyn_cmp_dict")] + fn $test_name() { + let left: DictionaryArray = $left.into_iter().collect(); + let right: DictionaryArray = $right.into_iter().collect(); + let res = $op(&left, &right).unwrap(); + let expected = $expected; + assert_eq!(expected.len(), res.len()); + for i in 0..res.len() { + let v = res.value(i); + assert_eq!(v, expected[i]); + } + } + }; + } + + macro_rules! test_utf8_scalar { + ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { + #[test] + fn $test_name() { + let left = StringArray::from($left); + let res = $op(&left, $right).unwrap(); + let expected = $expected; + assert_eq!(expected.len(), res.len()); + for i in 0..res.len() { + let v = res.value(i); + assert_eq!( + v, + expected[i], + "unexpected result when comparing {} at position {} to {} ", + left.value(i), + i, + $right + ); + } + + let left = LargeStringArray::from($left); + let res = $op(&left, $right).unwrap(); + let expected = $expected; + assert_eq!(expected.len(), res.len()); + for i in 0..res.len() { + let v = res.value(i); + assert_eq!( + v, + expected[i], + "unexpected result when comparing {} at position {} to {} ", + left.value(i), + i, + $right + ); + } + } + }; + ($test_name:ident, $test_name_dyn:ident, $left:expr, $right:expr, $op:expr, $op_dyn:expr, $expected:expr) => { + test_utf8_scalar!($test_name, $left, $right, $op, $expected); + test_utf8_scalar!($test_name_dyn, $left, $right, $op_dyn, $expected); + }; + } + + test_utf8!( + test_utf8_array_like, + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"], + vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"], + like_utf8, + vec![true, true, true, false, false, true, false, false] + ); + + test_dict_utf8!( + test_utf8_array_like_dict, + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"], + vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"], + like_dyn, + vec![true, true, true, false, false, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_escape_testing, + test_utf8_array_like_scalar_dyn_escape_testing, + vec!["varchar(255)", "int(255)", "varchar", "int"], + "%(%)%", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_escape_regex, + test_utf8_array_like_scalar_dyn_escape_regex, + vec![".*", "a", "*"], + ".*", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_escape_regex_dot, + test_utf8_array_like_scalar_dyn_escape_regex_dot, + vec![".", "a", "*"], + ".", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar, + test_utf8_array_like_scalar_dyn, + vec!["arrow", "parquet", "datafusion", "flight"], + "%ar%", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_start, + test_utf8_array_like_scalar_dyn_start, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow%", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, false, true, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_end, + test_utf8_array_like_scalar_dyn_end, + vec!["arrow", "parrow", "arrows", "arr"], + "%arrow", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, true, false, false] + ); + + 
test_utf8_scalar!( + test_utf8_array_like_scalar_equals, + test_utf8_array_like_scalar_dyn_equals, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, false, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_like_scalar_one, + test_utf8_array_like_scalar_dyn_one, + vec!["arrow", "arrows", "parrow", "arr"], + "arrow_", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![false, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_scalar_like_escape, + test_utf8_scalar_like_dyn_escape, + vec!["a%", "a\\x"], + "a\\%", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, false] + ); + + test_utf8_scalar!( + test_utf8_scalar_like_escape_contains, + test_utf8_scalar_like_dyn_escape_contains, + vec!["ba%", "ba\\x"], + "%a\\%", + like_utf8_scalar, + like_utf8_scalar_dyn, + vec![true, false] + ); + + test_utf8!( + test_utf8_scalar_ilike_regex, + vec!["%%%"], + vec![r#"\%_\%"#], + ilike_utf8, + vec![true] + ); + + test_dict_utf8!( + test_utf8_scalar_ilike_regex_dict, + vec!["%%%"], + vec![r#"\%_\%"#], + ilike_dyn, + vec![true] + ); + + #[test] + fn test_replace_like_wildcards() { + let a_eq = "_%"; + let expected = "..*"; + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + #[test] + fn test_replace_like_wildcards_leave_like_meta_chars() { + let a_eq = "\\%\\_"; + let expected = "%_"; + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + #[test] + fn test_replace_like_wildcards_with_multiple_escape_chars() { + let a_eq = "\\\\%"; + let expected = "\\\\%"; + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + #[test] + fn test_replace_like_wildcards_escape_regex_meta_char() { + let a_eq = "."; + let expected = "\\."; + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + test_utf8!( + test_utf8_array_nlike, + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], + vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], + nlike_utf8, + vec![false, false, false, true, true, false, true] + ); + + test_dict_utf8!( + test_utf8_array_nlike_dict, + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], + vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], + nlike_dyn, + vec![false, false, false, true, true, false, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_escape_testing, + test_utf8_array_nlike_escape_dyn_testing_dyn, + vec!["varchar(255)", "int(255)", "varchar", "int"], + "%(%)%", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_scalar_escape_regex, + test_utf8_array_nlike_scalar_dyn_escape_regex, + vec![".*", "a", "*"], + ".*", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_scalar_escape_regex_dot, + test_utf8_array_nlike_scalar_dyn_escape_regex_dot, + vec![".", "a", "*"], + ".", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, true, true] + ); + test_utf8_scalar!( + test_utf8_array_nlike_scalar, + test_utf8_array_nlike_scalar_dyn, + vec!["arrow", "parquet", "datafusion", "flight"], + "%ar%", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_scalar_start, + test_utf8_array_nlike_scalar_dyn_start, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow%", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, true, false, true] + ); + + 
test_utf8_scalar!( + test_utf8_array_nlike_scalar_end, + test_utf8_array_nlike_scalar_dyn_end, + vec!["arrow", "parrow", "arrows", "arr"], + "%arrow", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_scalar_equals, + test_utf8_array_nlike_scalar_dyn_equals, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![false, true, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nlike_scalar_one, + test_utf8_array_nlike_scalar_dyn_one, + vec!["arrow", "arrows", "parrow", "arr"], + "arrow_", + nlike_utf8_scalar, + nlike_utf8_scalar_dyn, + vec![true, false, true, true] + ); + + test_utf8!( + test_utf8_array_ilike, + vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], + ilike_utf8, + vec![true, true, true, false, false, true, false] + ); + + test_dict_utf8!( + test_utf8_array_ilike_dict, + vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], + ilike_dyn, + vec![true, true, true, false, false, true, false] + ); + + test_utf8_scalar!( + ilike_utf8_scalar_escape_testing, + ilike_utf8_scalar_escape_dyn_testing, + vec!["varchar(255)", "int(255)", "varchar", "int"], + "%(%)%", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![true, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar, + test_utf8_array_ilike_dyn_scalar, + vec!["arrow", "parquet", "datafusion", "flight"], + "%AR%", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![true, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar_start, + test_utf8_array_ilike_scalar_dyn_start, + vec!["arrow", "parrow", "arrows", "ARR"], + "aRRow%", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![true, false, true, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar_end, + test_utf8_array_ilike_scalar_dyn_end, + vec!["ArroW", "parrow", "ARRowS", "arr"], + "%arrow", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![true, true, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar_equals, + test_utf8_array_ilike_scalar_dyn_equals, + vec!["arrow", "parrow", "arrows", "arr"], + "Arrow", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![true, false, false, false] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_scalar_one, + test_utf8_array_ilike_scalar_dyn_one, + vec!["arrow", "arrows", "parrow", "arr"], + "arrow_", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![false, true, false, false] + ); + + test_utf8!( + test_utf8_array_nilike, + vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], + nilike_utf8, + vec![false, false, false, true, true, false, true] + ); + + test_dict_utf8!( + test_utf8_array_nilike_dict, + vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], + nilike_dyn, + vec![false, false, false, true, true, false, true] + ); + + test_utf8_scalar!( + nilike_utf8_scalar_escape_testing, + nilike_utf8_scalar_escape_dyn_testing, + vec!["varchar(255)", "int(255)", "varchar", "int"], + "%(%)%", + nilike_utf8_scalar, + nilike_utf8_scalar_dyn, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar, + test_utf8_array_nilike_dyn_scalar, + 
vec!["arrow", "parquet", "datafusion", "flight"], + "%AR%", + nilike_utf8_scalar, + nilike_utf8_scalar_dyn, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_start, + test_utf8_array_nilike_scalar_dyn_start, + vec!["arrow", "parrow", "arrows", "ARR"], + "aRRow%", + nilike_utf8_scalar, + nilike_utf8_scalar_dyn, + vec![false, true, false, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_end, + test_utf8_array_nilike_scalar_dyn_end, + vec!["ArroW", "parrow", "ARRowS", "arr"], + "%arrow", + nilike_utf8_scalar, + nilike_utf8_scalar_dyn, + vec![false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_equals, + test_utf8_array_nilike_scalar_dyn_equals, + vec!["arRow", "parrow", "arrows", "arr"], + "Arrow", + nilike_utf8_scalar, + nilike_utf8_scalar_dyn, + vec![false, true, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_nilike_scalar_one, + test_utf8_array_nilike_scalar_dyn_one, + vec!["arrow", "arrows", "parrow", "arr"], + "arrow_", + nilike_utf8_scalar, + nilike_utf8_scalar_dyn, + vec![true, false, true, true] + ); + + #[test] + fn test_dict_like_kernels() { + let data = vec![ + Some("Earth"), + Some("Fire"), + Some("Water"), + Some("Air"), + None, + Some("Air"), + ]; + + let dict_array: DictionaryArray = data.into_iter().collect(); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "Air").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "Air").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "%r").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "%r").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + like_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + } + + #[test] + fn test_dict_nlike_kernels() { + let data = vec![ + Some("Earth"), + Some("Fire"), + Some("Water"), + Some("Air"), + None, + Some("Air"), + ]; + + let dict_array: DictionaryArray = data.into_iter().collect(); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "Air").unwrap(), + BooleanArray::from(vec![ + Some(true), 
+ Some(true), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "Air").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "%r").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "%r").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + } + + #[test] + fn test_dict_ilike_kernels() { + let data = vec![ + Some("Earth"), + Some("Fire"), + Some("Water"), + Some("Air"), + None, + Some("Air"), + ]; + + let dict_array: DictionaryArray = data.into_iter().collect(); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(false), + Some(true), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, 
"%A%r%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(true), + None, + Some(true) + ]), + ); + } + + #[test] + fn test_dict_nilike_kernels() { + let data = vec![ + Some("Earth"), + Some("Fire"), + Some("Water"), + Some("Air"), + None, + Some("Air"), + ]; + + let dict_array: DictionaryArray = data.into_iter().collect(); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(true), + None, + Some(true) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(true), + Some(false), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), + BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(false), + None, + Some(false) + ]), + ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), + BooleanArray::from(vec![ + Some(false), + Some(true), + Some(false), + Some(false), + None, + Some(false) + ]), + ); + } +} diff --git a/arrow/src/compute/kernels/regexp.rs b/arrow-string/src/regexp.rs similarity index 53% rename from arrow/src/compute/kernels/regexp.rs rename to arrow-string/src/regexp.rs index 1c5fa1927756..bb4b2b0a8268 100644 --- a/arrow/src/compute/kernels/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -18,22 +18,154 @@ //! Defines kernel to extract substrings based on a regular //! expression of a \[Large\]StringArray -use crate::array::{ - ArrayRef, GenericStringArray, GenericStringBuilder, ListBuilder, OffsetSizeTrait, -}; -use crate::error::{ArrowError, Result}; +use arrow_array::builder::{BooleanBufferBuilder, GenericStringBuilder, ListBuilder}; +use arrow_array::*; +use arrow_data::bit_mask::combine_option_bitmap; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; +use regex::Regex; use std::collections::HashMap; - use std::sync::Arc; -use regex::Regex; +/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`]. +/// If `regex_array` element has an empty value, the corresponding result value is always true. 
+/// +/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] flag, which allow +/// special search modes, such as case insensitive and multi-line mode. +/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags) +/// for more information. +pub fn regexp_is_match_utf8( + array: &GenericStringArray, + regex_array: &GenericStringArray, + flags_array: Option<&GenericStringArray>, +) -> Result { + if array.len() != regex_array.len() { + return Err(ArrowError::ComputeError( + "Cannot perform comparison operation on arrays of different length" + .to_string(), + )); + } + let null_bit_buffer = + combine_option_bitmap(&[array.data_ref(), regex_array.data_ref()], array.len()); + + let mut patterns: HashMap = HashMap::new(); + let mut result = BooleanBufferBuilder::new(array.len()); + + let complete_pattern = match flags_array { + Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( + |(pattern, flags)| { + pattern.map(|pattern| match flags { + Some(flag) => format!("(?{}){}", flag, pattern), + None => pattern.to_string(), + }) + }, + )) as Box>>, + None => Box::new( + regex_array + .iter() + .map(|pattern| pattern.map(|pattern| pattern.to_string())), + ), + }; + + array + .iter() + .zip(complete_pattern) + .map(|(value, pattern)| { + match (value, pattern) { + // Required for Postgres compatibility: + // SELECT 'foobarbequebaz' ~ ''); = true + (Some(_), Some(pattern)) if pattern == *"" => { + result.append(true); + } + (Some(value), Some(pattern)) => { + let existing_pattern = patterns.get(&pattern); + let re = match existing_pattern { + Some(re) => re.clone(), + None => { + let re = Regex::new(pattern.as_str()).map_err(|e| { + ArrowError::ComputeError(format!( + "Regular expression did not compile: {:?}", + e + )) + })?; + patterns.insert(pattern, re.clone()); + re + } + }; + result.append(re.is_match(value)); + } + _ => result.append(false), + } + Ok(()) + }) + .collect::, ArrowError>>()?; + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + array.len(), + None, + null_bit_buffer, + 0, + vec![result.finish()], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} + +/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`regexp_is_match_utf8`] for more details. +pub fn regexp_is_match_utf8_scalar( + array: &GenericStringArray, + regex: &str, + flag: Option<&str>, +) -> Result { + let null_bit_buffer = array.data().null_buffer().cloned(); + let mut result = BooleanBufferBuilder::new(array.len()); + + let pattern = match flag { + Some(flag) => format!("(?{}){}", flag, regex), + None => regex.to_string(), + }; + if pattern.is_empty() { + result.append_n(array.len(), true); + } else { + let re = Regex::new(pattern.as_str()).map_err(|e| { + ArrowError::ComputeError(format!( + "Regular expression did not compile: {:?}", + e + )) + })?; + for i in 0..array.len() { + let value = array.value(i); + result.append(re.is_match(value)); + } + } + + let buffer = result.finish(); + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + array.len(), + None, + null_bit_buffer, + 0, + vec![buffer], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} /// Extract all groups matched by a regular expression for a given String array. 
pub fn regexp_match( array: &GenericStringArray, regex_array: &GenericStringArray, flags_array: Option<&GenericStringArray>, -) -> Result { +) -> Result { let mut patterns: HashMap = HashMap::new(); let builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); @@ -94,14 +226,14 @@ pub fn regexp_match( } Ok(()) }) - .collect::>>()?; + .collect::, ArrowError>>()?; Ok(Arc::new(list_builder.finish())) } #[cfg(test)] mod tests { use super::*; - use crate::array::{ListArray, StringArray}; + use arrow_array::{ListArray, StringArray}; #[test] fn match_single_group() { @@ -117,7 +249,7 @@ mod tests { let mut pattern_values = vec![r".*-(\d*)-.*"; 4]; pattern_values.push(r"(bar)(bequ1e)"); pattern_values.push(""); - let pattern = StringArray::from(pattern_values); + let pattern = GenericStringArray::::from(pattern_values); let actual = regexp_match(&array, &pattern, None).unwrap(); let elem_builder: GenericStringBuilder = GenericStringBuilder::new(); let mut expected_builder = ListBuilder::new(elem_builder); diff --git a/arrow/src/compute/kernels/substring.rs b/arrow-string/src/substring.rs similarity index 95% rename from arrow/src/compute/kernels/substring.rs rename to arrow-string/src/substring.rs index 23cb2c19fddf..ece367553414 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow-string/src/substring.rs @@ -19,14 +19,12 @@ //! Supported array types: //! [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray], [DictionaryArray] -use crate::array::DictionaryArray; -use crate::buffer::MutableBuffer; -use crate::datatypes::*; -use crate::{array::*, buffer::Buffer}; -use crate::{ - datatypes::DataType, - error::{ArrowError, Result}, -}; +use arrow_array::builder::BufferBuilder; +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; use std::cmp::Ordering; use std::sync::Arc; @@ -45,8 +43,8 @@ use std::sync::Arc; /// /// # Basic usage /// ``` -/// # use arrow::array::StringArray; -/// # use arrow::compute::kernels::substring::substring; +/// # use arrow_array::StringArray; +/// # use arrow_string::substring::substring; /// let array = StringArray::from(vec![Some("arrow"), None, Some("rust")]); /// let result = substring(&array, 1, Some(4)).unwrap(); /// let result = result.as_any().downcast_ref::().unwrap(); @@ -61,13 +59,17 @@ use std::sync::Arc; /// /// ## Example of trying to get an invalid utf-8 format substring /// ``` -/// # use arrow::array::StringArray; -/// # use arrow::compute::kernels::substring::substring; +/// # use arrow_array::StringArray; +/// # use arrow_string::substring::substring; /// let array = StringArray::from(vec![Some("E=mc²")]); /// let error = substring(&array, 0, Some(5)).unwrap_err().to_string(); /// assert!(error.contains("invalid utf-8 boundary")); /// ``` -pub fn substring(array: &dyn Array, start: i64, length: Option) -> Result { +pub fn substring( + array: &dyn Array, + start: i64, + length: Option, +) -> Result { macro_rules! 
substring_dict { ($kt: ident, $($t: ident: $gt: ident), *) => { match $kt.as_ref() { @@ -171,8 +173,8 @@ pub fn substring(array: &dyn Array, start: i64, length: Option) -> Result( array: &GenericStringArray, start: i64, length: Option, -) -> Result> { +) -> Result, ArrowError> { let mut vals = BufferBuilder::::new({ let offsets = array.value_offsets(); (offsets[array.len()] - offsets[0]).to_usize().unwrap() @@ -251,7 +253,7 @@ fn binary_substring( array: &GenericBinaryArray, start: OffsetSize, length: Option, -) -> Result { +) -> Result { let offsets = array.value_offsets(); let data = array.value_data(); let zero = OffsetSize::zero(); @@ -312,7 +314,7 @@ fn fixed_size_binary_substring( old_len: i32, start: i32, length: Option, -) -> Result { +) -> Result { let new_start = if start >= 0 { start.min(old_len) } else { @@ -361,7 +363,7 @@ fn utf8_substring( array: &GenericStringArray, start: OffsetSize, length: Option, -) -> Result { +) -> Result { let offsets = array.value_offsets(); let data = array.value_data(); let zero = OffsetSize::zero(); @@ -391,21 +393,23 @@ fn utf8_substring( let mut len_so_far = zero; new_offsets.push(zero); - offsets.windows(2).try_for_each(|pair| -> Result<()> { - let new_start = match start.cmp(&zero) { - Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?, - Ordering::Equal => pair[0], - Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?, - }; - let new_end = match length { - Some(length) => check_char_boundary((length + new_start).min(pair[1]))?, - None => pair[1], - }; - len_so_far += new_end - new_start; - new_starts_ends.push((new_start, new_end)); - new_offsets.push(len_so_far); - Ok(()) - })?; + offsets + .windows(2) + .try_for_each(|pair| -> Result<(), ArrowError> { + let new_start = match start.cmp(&zero) { + Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?, + Ordering::Equal => pair[0], + Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?, + }; + let new_end = match length { + Some(length) => check_char_boundary((length + new_start).min(pair[1]))?, + None => pair[1], + }; + len_so_far += new_end - new_start; + new_starts_ends.push((new_start, new_end)); + new_offsets.push(len_so_far); + Ok(()) + })?; // concatenate substrings into a buffer let mut new_values = MutableBuffer::new(new_offsets.last().unwrap().as_usize()); @@ -439,7 +443,6 @@ fn utf8_substring( #[cfg(test)] mod tests { use super::*; - use crate::datatypes::*; /// A helper macro to generate test cases. 
/// # Arguments diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 876d0d65084e..17f88c084cbc 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -54,12 +54,12 @@ arrow-ipc = { version = "28.0.0", path = "../arrow-ipc", optional = true } arrow-json = { version = "28.0.0", path = "../arrow-json", optional = true } arrow-schema = { version = "28.0.0", path = "../arrow-schema" } arrow-select = { version = "28.0.0", path = "../arrow-select" } +arrow-string = { version = "28.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.13", default-features = false } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } -regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } comfy-table = { version = "6.0", optional = true, default-features = false } @@ -92,7 +92,7 @@ force_validate = ["arrow-data/force_validate"] ffi = ["bitflags"] # Enable dyn-comparison of dictionary arrays with other arrays # Note: this does not impact comparison against scalars -dyn_cmp_dict = [] +dyn_cmp_dict = ["arrow-string/dyn_cmp_dict"] # Enable dyn-arithmetic kernels for dictionary arrays # Note: this does not impact arithmetic with scalars dyn_arith_dict = [] diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index b672410fec15..6976a68d99af 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -23,1227 +23,75 @@ //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. //! -use crate::array::*; -use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer}; -use crate::datatypes::*; -#[allow(unused_imports)] -use crate::downcast_dictionary_array; -use crate::error::{ArrowError, Result}; -use crate::util::bit_util; -use arrow_data::bit_mask::combine_option_bitmap; -use arrow_select::take::take; -use num::ToPrimitive; -use regex::Regex; -use std::collections::HashMap; - -/// Helper function to perform boolean lambda function on values from two array accessors, this -/// version does not attempt to use SIMD. -fn compare_op( - left: T, - right: S, - op: F, -) -> Result -where - F: Fn(T::Item, S::Item) -> bool, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - Ok(BooleanArray::from_binary(left, right, op)) -} - -/// Helper function to perform boolean lambda function on values from array accessor, this -/// version does not attempt to use SIMD. -fn compare_op_scalar(left: T, op: F) -> Result -where - F: Fn(T::Item) -> bool, -{ - Ok(BooleanArray::from_unary(left, op)) -} - -/// Evaluate `op(left, right)` for [`PrimitiveArray`]s using a specified -/// comparison function. -pub fn no_simd_compare_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - op: F, -) -> Result -where - T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> bool, -{ - compare_op(left, right, op) -} - -/// Evaluate `op(left, right)` for [`PrimitiveArray`] and scalar using -/// a specified comparison function. 
-pub fn no_simd_compare_op_scalar( - left: &PrimitiveArray, - right: T::Native, - op: F, -) -> Result -where - T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> bool, -{ - compare_op_scalar(left, |l| op(l, right)) -} - -fn is_like_pattern(c: char) -> bool { - c == '%' || c == '_' -} - -/// Evaluate regex `op(left)` matching `right` on [`StringArray`] / [`LargeStringArray`] -/// -/// If `negate_regex` is true, the regex expression will be negated. (for example, with `not like`) -fn regex_like<'a, S: ArrayAccessor, F>( - left: S, - right: S, - negate_regex: bool, - op: F, -) -> Result -where - F: Fn(&str) -> Result, -{ - let mut map = HashMap::new(); - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let null_bit_buffer = - combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len()); - - let mut result = BooleanBufferBuilder::new(left.len()); - for i in 0..left.len() { - let haystack = left.value(i); - let pat = right.value(i); - let re = if let Some(ref regex) = map.get(pat) { - regex - } else { - let re_pattern = replace_like_wildcards(pat)?; - let re = op(&re_pattern)?; - map.insert(pat, re); - map.get(pat).unwrap() - }; - - result.append(if negate_regex { - !re.is_match(haystack) - } else { - re.is_match(haystack) - }); - } - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) -} - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. -/// -/// There are two wildcards supported with the LIKE operator: -/// -/// 1. `%` - The percent sign represents zero, one, or multiple characters -/// 2. `_` - The underscore represents a single character -/// -/// For example: -/// ``` -/// use arrow::array::{StringArray, BooleanArray}; -/// use arrow::compute::like_utf8; -/// -/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); -/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]); -/// -/// let result = like_utf8(&strings, &patterns).unwrap(); -/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); -/// ``` -pub fn like_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) -} - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn like_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = as_string_array(left); - let right = as_string_array(right); - like_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = as_largestring_array(left); - let right = as_largestring_array(right); - like_utf8(left, right) - } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - like_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "like_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left LIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -#[cfg(feature = "dyn_cmp_dict")] -fn like_dict( - left: &DictionaryArray, - right: &DictionaryArray, -) -> Result { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) - } - _ => Err(ArrowError::ComputeError( - "like_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" - .to_string(), - )), - } -} - -#[inline] -fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor>( - left: L, - right: &str, - op: F, -) -> Result { - if !right.contains(is_like_pattern) { - // fast path, can use equals - compare_op_scalar(left, |item| op(item == right)) - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let starts_with = &right[..right.len() - 1]; - - compare_op_scalar(left, |item| op(item.starts_with(starts_with))) - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_with = &right[1..]; - - compare_op_scalar(left, |item| op(item.ends_with(ends_with))) - } else if right.starts_with('%') - && right.ends_with('%') - && !right.ends_with("\\%") - && !right[1..right.len() - 1].contains(is_like_pattern) - { - let contains = &right[1..right.len() - 1]; - - compare_op_scalar(left, |item| op(item.contains(contains))) - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - })?; - - compare_op_scalar(left, |item| op(re.is_match(item))) - } -} - -#[inline] -fn like_scalar<'a, L: ArrayAccessor>( - left: L, - right: &str, -) -> Result { - like_scalar_op(left, right, 
|x| x) -} - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn like_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { - match left.data_type() { - DataType::Utf8 => { - let left = as_string_array(left); - like_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = as_largestring_array(left); - like_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - like_dict_scalar(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "like_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn like_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - like_scalar(left, right) -} - -/// Perform SQL `left LIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -fn like_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - like_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - like_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "like_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: -/// -/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.` -/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.` -/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%` -fn replace_like_wildcards(pattern: &str) -> Result { - let mut result = String::new(); - let pattern = String::from(pattern); - let mut chars_iter = pattern.chars().peekable(); - while let Some(c) = chars_iter.next() { - if c == '\\' { - let next = chars_iter.peek(); - match next { - Some(next) if is_like_pattern(*next) => { - result.push(*next); - // Skipping the next char as it is already appended - chars_iter.next(); - } - _ => { - result.push('\\'); - result.push('\\'); - } - } - } else if regex_syntax::is_meta_character(c) { - result.push('\\'); - result.push(c); - } else if c == '%' { - result.push_str(".*"); - } else if c == '_' { - result.push('.'); - } else { - result.push(c); - } - } - Ok(result) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn nlike_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) -} - -/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = as_string_array(left); - let right = as_string_array(right); - nlike_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = as_largestring_array(left); - let right = as_largestring_array(right); - nlike_utf8(left, right) - } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - nlike_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "nlike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -#[cfg(feature = "dyn_cmp_dict")] -fn nlike_dict( - left: &DictionaryArray, - right: &DictionaryArray, -) -> Result { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) - } - _ => Err(ArrowError::ComputeError( - "nlike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" - .to_string(), - )), - } -} - -#[inline] -fn nlike_scalar<'a, L: ArrayAccessor>( - left: L, - right: &str, -) -> Result { - like_scalar_op(left, right, |x| !x) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn nlike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { - match left.data_type() { - DataType::Utf8 => { - let left = as_string_array(left); - nlike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = as_largestring_array(left); - nlike_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - nlike_dict_scalar(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "nlike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - nlike_scalar(left, right) -} - -/// Perform SQL `left NOT LIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -fn nlike_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - nlike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - nlike_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "nlike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn ilike_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) -} - -/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn ilike_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = as_string_array(left); - let right = as_string_array(right); - ilike_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = as_largestring_array(left); - let right = as_largestring_array(right); - ilike_utf8(left, right) - } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - ilike_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "ilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-#[cfg(feature = "dyn_cmp_dict")] -fn ilike_dict( - left: &DictionaryArray, - right: &DictionaryArray, -) -> Result { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) - } - _ => Err(ArrowError::ComputeError( - "ilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" - .to_string(), - )), - } -} - -#[inline] -fn ilike_scalar<'a, L: ArrayAccessor>( - left: L, - right: &str, -) -> Result { - let null_bit_buffer = left.data().null_buffer().cloned(); - let bytes = bit_util::ceil(left.len(), 8); - let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); - let bool_slice = bool_buf.as_slice_mut(); - - if !right.contains(is_like_pattern) { - // fast path, can use equals - let right_uppercase = right.to_uppercase(); - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).to_uppercase() == right_uppercase { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let start_str = &right[..right.len() - 1].to_uppercase(); - for i in 0..left.len() { - unsafe { - if left - .value_unchecked(i) - .to_uppercase() - .starts_with(start_str) - { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_str = &right[1..].to_uppercase(); - - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).to_uppercase().ends_with(ends_str) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.starts_with('%') - && right.ends_with('%') - && !right[1..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use contains - let contains = &right[1..right.len() - 1].to_uppercase(); - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).to_uppercase().contains(contains) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - })?; - - for i in 0..left.len() { - let haystack = unsafe { left.value_unchecked(i) }; - if re.is_match(haystack) { - bit_util::set_bit(bool_slice, i); - } - } - }; - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) -} - -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn ilike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { - match left.data_type() { - DataType::Utf8 => { - let left = as_string_array(left); - ilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = as_largestring_array(left); - ilike_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - ilike_dict_scalar(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "ilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn ilike_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - ilike_scalar(left, right) -} - -/// Perform SQL `left ILIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -fn ilike_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - ilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - ilike_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "ilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nilike_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) -} - -/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nilike_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = as_string_array(left); - let right = as_string_array(right); - nilike_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = as_largestring_array(left); - let right = as_largestring_array(right); - nilike_utf8(left, right) - } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - nilike_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "nilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-#[cfg(feature = "dyn_cmp_dict")] -fn nilike_dict( - left: &DictionaryArray, - right: &DictionaryArray, -) -> Result { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) - } - _ => Err(ArrowError::ComputeError( - "nilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" - .to_string(), - )), - } -} - -#[inline] -fn nilike_scalar<'a, L: ArrayAccessor>( - left: L, - right: &str, -) -> Result { - let null_bit_buffer = left.data().null_buffer().cloned(); - let bytes = bit_util::ceil(left.len(), 8); - let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); - let bool_slice = bool_buf.as_slice_mut(); - - if !right.contains(is_like_pattern) { - // fast path, can use equals - let right_uppercase = right.to_uppercase(); - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).to_uppercase() != right_uppercase { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let start_str = &right[..right.len() - 1].to_uppercase(); - for i in 0..left.len() { - unsafe { - if !(left - .value_unchecked(i) - .to_uppercase() - .starts_with(start_str)) - { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_str = &right[1..].to_uppercase(); - - for i in 0..left.len() { - unsafe { - if !(left.value_unchecked(i).to_uppercase().ends_with(ends_str)) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.starts_with('%') - && right.ends_with('%') - && !right[1..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use contains - let contains = &right[1..right.len() - 1].to_uppercase(); - for i in 0..left.len() { - unsafe { - if !(left.value_unchecked(i).to_uppercase().contains(contains)) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - })?; - - for i in 0..left.len() { - let haystack = unsafe { left.value_unchecked(i) }; - if !re.is_match(haystack) { - bit_util::set_bit(bool_slice, i); - } - } - }; - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) -} - -/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn nilike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { - match left.data_type() { - DataType::Utf8 => { - let left = as_string_array(left); - nilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = as_largestring_array(left); - nilike_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - nilike_dict_scalar(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "nilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nilike_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - nilike_scalar(left, right) -} - -/// Perform SQL `left NOT ILIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -fn nilike_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - nilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - nilike_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "nilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} +pub use arrow_string::like::*; +pub use arrow_string::regexp::{regexp_is_match_utf8, regexp_is_match_utf8_scalar}; -/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`]. -/// If `regex_array` element has an empty value, the corresponding result value is always true. -/// -/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] flag, which allow -/// special search modes, such as case insensitive and multi-line mode. -/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags) -/// for more information. -pub fn regexp_is_match_utf8( - array: &GenericStringArray, - regex_array: &GenericStringArray, - flags_array: Option<&GenericStringArray>, -) -> Result { - if array.len() != regex_array.len() { +use crate::array::*; +use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer}; +use crate::datatypes::*; +#[allow(unused_imports)] +use crate::downcast_dictionary_array; +use crate::error::{ArrowError, Result}; +use crate::util::bit_util; +use arrow_data::bit_mask::combine_option_bitmap; +use arrow_select::take::take; +use num::ToPrimitive; + +/// Helper function to perform boolean lambda function on values from two array accessors, this +/// version does not attempt to use SIMD. 
+fn compare_op( + left: T, + right: S, + op: F, +) -> Result +where + F: Fn(T::Item, S::Item) -> bool, +{ + if left.len() != right.len() { return Err(ArrowError::ComputeError( "Cannot perform comparison operation on arrays of different length" .to_string(), )); } - let null_bit_buffer = - combine_option_bitmap(&[array.data_ref(), regex_array.data_ref()], array.len()); - - let mut patterns: HashMap = HashMap::new(); - let mut result = BooleanBufferBuilder::new(array.len()); - - let complete_pattern = match flags_array { - Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( - |(pattern, flags)| { - pattern.map(|pattern| match flags { - Some(flag) => format!("(?{}){}", flag, pattern), - None => pattern.to_string(), - }) - }, - )) as Box>>, - None => Box::new( - regex_array - .iter() - .map(|pattern| pattern.map(|pattern| pattern.to_string())), - ), - }; - - array - .iter() - .zip(complete_pattern) - .map(|(value, pattern)| { - match (value, pattern) { - // Required for Postgres compatibility: - // SELECT 'foobarbequebaz' ~ ''); = true - (Some(_), Some(pattern)) if pattern == *"" => { - result.append(true); - } - (Some(value), Some(pattern)) => { - let existing_pattern = patterns.get(&pattern); - let re = match existing_pattern { - Some(re) => re.clone(), - None => { - let re = Regex::new(pattern.as_str()).map_err(|e| { - ArrowError::ComputeError(format!( - "Regular expression did not compile: {:?}", - e - )) - })?; - patterns.insert(pattern, re.clone()); - re - } - }; - result.append(re.is_match(value)); - } - _ => result.append(false), - } - Ok(()) - }) - .collect::>>()?; - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - array.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + Ok(BooleanArray::from_binary(left, right, op)) } -/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`regexp_is_match_utf8`] for more details. -pub fn regexp_is_match_utf8_scalar( - array: &GenericStringArray, - regex: &str, - flag: Option<&str>, -) -> Result { - let null_bit_buffer = array.data().null_buffer().cloned(); - let mut result = BooleanBufferBuilder::new(array.len()); +/// Helper function to perform boolean lambda function on values from array accessor, this +/// version does not attempt to use SIMD. +fn compare_op_scalar(left: T, op: F) -> Result +where + F: Fn(T::Item) -> bool, +{ + Ok(BooleanArray::from_unary(left, op)) +} - let pattern = match flag { - Some(flag) => format!("(?{}){}", flag, regex), - None => regex.to_string(), - }; - if pattern.is_empty() { - result.append_n(array.len(), true); - } else { - let re = Regex::new(pattern.as_str()).map_err(|e| { - ArrowError::ComputeError(format!( - "Regular expression did not compile: {:?}", - e - )) - })?; - for i in 0..array.len() { - let value = array.value(i); - result.append(re.is_match(value)); - } - } +/// Evaluate `op(left, right)` for [`PrimitiveArray`]s using a specified +/// comparison function. 
+pub fn no_simd_compare_op( + left: &PrimitiveArray, + right: &PrimitiveArray, + op: F, +) -> Result +where + T: ArrowNumericType, + F: Fn(T::Native, T::Native) -> bool, +{ + compare_op(left, right, op) +} - let buffer = result.finish(); - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - array.len(), - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ) - }; - Ok(BooleanArray::from(data)) +/// Evaluate `op(left, right)` for [`PrimitiveArray`] and scalar using +/// a specified comparison function. +pub fn no_simd_compare_op_scalar( + left: &PrimitiveArray, + right: T::Native, + op: F, +) -> Result +where + T: ArrowNumericType, + F: Fn(T::Native, T::Native) -> bool, +{ + compare_op_scalar(left, |l| op(l, right)) } /// Perform `left == right` operation on [`StringArray`] / [`LargeStringArray`]. @@ -4727,24 +3575,6 @@ mod tests { }; } - macro_rules! test_dict_utf8 { - ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { - #[test] - #[cfg(feature = "dyn_cmp_dict")] - fn $test_name() { - let left: DictionaryArray = $left.into_iter().collect(); - let right: DictionaryArray = $right.into_iter().collect(); - let res = $op(&left, &right).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!(v, expected[i]); - } - } - }; - } - #[test] fn test_utf8_eq_scalar_on_slice() { let a = StringArray::from( @@ -4879,432 +3709,25 @@ mod tests { left.value(i), i, $right - ); - } - } - }; - } - - test_utf8!( - test_utf8_array_like, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"], - like_utf8, - vec![true, true, true, false, false, true, false, false] - ); - - test_dict_utf8!( - test_utf8_array_like_dict, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"], - like_dyn, - vec![true, true, true, false, false, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_escape_testing, - test_utf8_array_like_scalar_dyn_escape_testing, - vec!["varchar(255)", "int(255)", "varchar", "int"], - "%(%)%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_escape_regex, - test_utf8_array_like_scalar_dyn_escape_regex, - vec![".*", "a", "*"], - ".*", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_escape_regex_dot, - test_utf8_array_like_scalar_dyn_escape_regex_dot, - vec![".", "a", "*"], - ".", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar, - test_utf8_array_like_scalar_dyn, - vec!["arrow", "parquet", "datafusion", "flight"], - "%ar%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_start, - test_utf8_array_like_scalar_dyn_start, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false, true, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_end, - test_utf8_array_like_scalar_dyn_end, - vec!["arrow", "parrow", "arrows", "arr"], - "%arrow", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, true, false, false] - ); - - test_utf8_scalar!( - 
test_utf8_array_like_scalar_equals, - test_utf8_array_like_scalar_dyn_equals, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_one, - test_utf8_array_like_scalar_dyn_one, - vec!["arrow", "arrows", "parrow", "arr"], - "arrow_", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![false, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_scalar_like_escape, - test_utf8_scalar_like_dyn_escape, - vec!["a%", "a\\x"], - "a\\%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false] - ); - - test_utf8_scalar!( - test_utf8_scalar_like_escape_contains, - test_utf8_scalar_like_dyn_escape_contains, - vec!["ba%", "ba\\x"], - "%a\\%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false] - ); - - test_utf8!( - test_utf8_scalar_ilike_regex, - vec!["%%%"], - vec![r#"\%_\%"#], - ilike_utf8, - vec![true] - ); - - test_dict_utf8!( - test_utf8_scalar_ilike_regex_dict, - vec!["%%%"], - vec![r#"\%_\%"#], - ilike_dyn, - vec![true] - ); - - #[test] - fn test_replace_like_wildcards() { - let a_eq = "_%"; - let expected = "..*"; - assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); - } - - #[test] - fn test_replace_like_wildcards_leave_like_meta_chars() { - let a_eq = "\\%\\_"; - let expected = "%_"; - assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); - } - - #[test] - fn test_replace_like_wildcards_with_multiple_escape_chars() { - let a_eq = "\\\\%"; - let expected = "\\\\%"; - assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); - } - - #[test] - fn test_replace_like_wildcards_escape_regex_meta_char() { - let a_eq = "."; - let expected = "\\."; - assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); - } - - test_utf8!( - test_utf8_array_eq, - vec!["arrow", "arrow", "arrow", "arrow"], - vec!["arrow", "parquet", "datafusion", "flight"], - eq_utf8, - vec![true, false, false, false] - ); - test_utf8_scalar!( - test_utf8_array_eq_scalar, - vec!["arrow", "parquet", "datafusion", "flight"], - "arrow", - eq_utf8_scalar, - vec![true, false, false, false] - ); - - test_utf8!( - test_utf8_array_nlike, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], - nlike_utf8, - vec![false, false, false, true, true, false, true] - ); - - test_dict_utf8!( - test_utf8_array_nlike_dict, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], - nlike_dyn, - vec![false, false, false, true, true, false, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_escape_testing, - test_utf8_array_nlike_escape_dyn_testing_dyn, - vec!["varchar(255)", "int(255)", "varchar", "int"], - "%(%)%", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_escape_regex, - test_utf8_array_nlike_scalar_dyn_escape_regex, - vec![".*", "a", "*"], - ".*", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_escape_regex_dot, - test_utf8_array_nlike_scalar_dyn_escape_regex_dot, - vec![".", "a", "*"], - ".", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, true, true] - ); - test_utf8_scalar!( - test_utf8_array_nlike_scalar, - test_utf8_array_nlike_scalar_dyn, - vec!["arrow", "parquet", "datafusion", "flight"], - "%ar%", - 
nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_start, - test_utf8_array_nlike_scalar_dyn_start, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow%", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, true, false, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_end, - test_utf8_array_nlike_scalar_dyn_end, - vec!["arrow", "parrow", "arrows", "arr"], - "%arrow", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_equals, - test_utf8_array_nlike_scalar_dyn_equals, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, true, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_one, - test_utf8_array_nlike_scalar_dyn_one, - vec!["arrow", "arrows", "parrow", "arr"], - "arrow_", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![true, false, true, true] - ); - - test_utf8!( - test_utf8_array_ilike, - vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], - vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], - ilike_utf8, - vec![true, true, true, false, false, true, false] - ); - - test_dict_utf8!( - test_utf8_array_ilike_dict, - vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], - vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], - ilike_dyn, - vec![true, true, true, false, false, true, false] - ); - - test_utf8_scalar!( - ilike_utf8_scalar_escape_testing, - ilike_utf8_scalar_escape_dyn_testing, - vec!["varchar(255)", "int(255)", "varchar", "int"], - "%(%)%", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_ilike_scalar, - test_utf8_array_ilike_dyn_scalar, - vec!["arrow", "parquet", "datafusion", "flight"], - "%AR%", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_ilike_scalar_start, - test_utf8_array_ilike_scalar_dyn_start, - vec!["arrow", "parrow", "arrows", "ARR"], - "aRRow%", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, false, true, false] - ); - - test_utf8_scalar!( - test_utf8_array_ilike_scalar_end, - test_utf8_array_ilike_scalar_dyn_end, - vec!["ArroW", "parrow", "ARRowS", "arr"], - "%arrow", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_ilike_scalar_equals, - test_utf8_array_ilike_scalar_dyn_equals, - vec!["arrow", "parrow", "arrows", "arr"], - "Arrow", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, false, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_ilike_scalar_one, - test_utf8_array_ilike_scalar_dyn_one, - vec!["arrow", "arrows", "parrow", "arr"], - "arrow_", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![false, true, false, false] - ); + ); + } + } + }; + } test_utf8!( - test_utf8_array_nilike, - vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], - vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], - nilike_utf8, - vec![false, false, false, true, true, false, true] - ); - - test_dict_utf8!( - test_utf8_array_nilike_dict, - vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], - vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], - nilike_dyn, - vec![false, false, false, true, true, false, true] - ); - - 
test_utf8_scalar!( - nilike_utf8_scalar_escape_testing, - nilike_utf8_scalar_escape_dyn_testing, - vec!["varchar(255)", "int(255)", "varchar", "int"], - "%(%)%", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nilike_scalar, - test_utf8_array_nilike_dyn_scalar, + test_utf8_array_eq, + vec!["arrow", "arrow", "arrow", "arrow"], vec!["arrow", "parquet", "datafusion", "flight"], - "%AR%", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nilike_scalar_start, - test_utf8_array_nilike_scalar_dyn_start, - vec!["arrow", "parrow", "arrows", "ARR"], - "aRRow%", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, true, false, true] - ); - - test_utf8_scalar!( - test_utf8_array_nilike_scalar_end, - test_utf8_array_nilike_scalar_dyn_end, - vec!["ArroW", "parrow", "ARRowS", "arr"], - "%arrow", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nilike_scalar_equals, - test_utf8_array_nilike_scalar_dyn_equals, - vec!["arRow", "parrow", "arrows", "arr"], - "Arrow", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, true, true, true] + eq_utf8, + vec![true, false, false, false] ); - test_utf8_scalar!( - test_utf8_array_nilike_scalar_one, - test_utf8_array_nilike_scalar_dyn_one, - vec!["arrow", "arrows", "parrow", "arr"], - "arrow_", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![true, false, true, true] + test_utf8_array_eq_scalar, + vec!["arrow", "parquet", "datafusion", "flight"], + "arrow", + eq_utf8_scalar, + vec![true, false, false, false] ); test_utf8!( @@ -6667,86 +5090,6 @@ mod tests { assert_eq!(gt_eq_dyn_scalar(&array, f64::NAN).unwrap(), expected); } - #[test] - fn test_dict_like_kernels() { - let data = - vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")]; - - let dict_array: DictionaryArray = data.into_iter().collect(); - - let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "Air").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_arrayref, "Air").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_arrayref, "Wa%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "%r").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_arrayref, "%r").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_arrayref, "%i%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - 
like_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - like_utf8_scalar_dyn(&dict_arrayref, "%a%r%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - } - #[test] #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_to_utf8_array() { @@ -6959,246 +5302,6 @@ mod tests { ); } - #[test] - fn test_dict_nlike_kernels() { - let data = - vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")]; - - let dict_array: DictionaryArray = data.into_iter().collect(); - - let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "Air").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_arrayref, "Air").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_arrayref, "Wa%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "%r").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_arrayref, "%r").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "%i%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_arrayref, "%i%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - nlike_utf8_scalar_dyn(&dict_arrayref, "%a%r%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - } - - #[test] - fn test_dict_ilike_kernels() { - let data = - vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")]; - - let dict_array: DictionaryArray = data.into_iter().collect(); - - let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_arrayref, "air").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_arrayref, "wa%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - 
assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_arrayref, "%R").unwrap(), - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_arrayref, "%I%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - ilike_utf8_scalar_dyn(&dict_arrayref, "%A%r%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(true), None, Some(true)] - ), - ); - } - - #[test] - fn test_dict_nilike_kernels() { - let data = - vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")]; - - let dict_array: DictionaryArray = data.into_iter().collect(); - - let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "air").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_arrayref, "air").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_arrayref, "wa%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_arrayref, "%R").unwrap(), - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_arrayref, "%I%").unwrap(), - BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(false), None, Some(false)] - ), - ); - - assert_eq!( - nilike_utf8_scalar_dyn(&dict_arrayref, "%A%r%").unwrap(), - BooleanArray::from( - vec![Some(false), Some(true), Some(false), Some(false), None, Some(false)] - ), - ); - } - #[test] #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dict_non_dict_float_nan() { diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs index 0eebb701232a..29468861f82a 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels/mod.rs @@ -23,15 +23,12 @@ pub mod arity; pub mod bitwise; pub mod boolean; pub mod comparison; -pub mod 
concat_elements; -pub mod length; pub mod limit; pub mod partition; -pub mod regexp; pub mod sort; -pub mod substring; pub mod temporal; pub use arrow_cast::cast; pub use arrow_cast::parse as cast_utils; pub use arrow_select::{concat, filter, interleave, take, window, zip}; +pub use arrow_string::{concat_elements, length, regexp, substring}; diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 1b2ff0684a66..a27e6b9af44a 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -39,6 +39,7 @@ //! * [`arrow-json`][arrow_json] - read/write JSON to arrow format //! * [`arrow-schema`][arrow_schema] - the logical types for arrow arrays //! * [`arrow-select`][arrow_select] - selection kernels for arrow arrays +//! * [`arrow-string`][arrow_string] - string kernels for arrow arrays //! //! _This list is likely to grow as further functionality is split out from the top-level crate_ //! diff --git a/dev/release/README.md b/dev/release/README.md index a3d1a8c314a6..0e35f80aaf9f 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -258,6 +258,7 @@ Rust Arrow Crates: (cd arrow-array && cargo publish) (cd arrow-select && cargo publish) (cd arrow-cast && cargo publish) +(cd arrow-string && cargo publish) (cd arrow-ipc && cargo publish) (cd arrow-csv && cargo publish) (cd arrow-json && cargo publish) From bf1cccb3fdab5d9d5c6d759448ba6a96c63f89a1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 8 Dec 2022 16:37:14 +0000 Subject: [PATCH 0390/1411] Add more comparison benchmarks (#3298) --- arrow/benches/comparison_kernels.rs | 45 +++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index 6599e3725aab..99229ed0b37b 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -168,6 +168,51 @@ fn add_benchmark(c: &mut Criterion) { }) }); + let arr_a = create_primitive_array_with_seed::(size, 0.0, 42); + let arr_b = create_primitive_array_with_seed::(size, 0.0, 43); + + c.bench_function("eq Int32", |b| b.iter(|| bench_eq(&arr_a, &arr_b))); + c.bench_function("eq scalar Int32", |b| { + b.iter(|| { + eq_scalar(criterion::black_box(&arr_a), criterion::black_box(1)).unwrap() + }) + }); + + c.bench_function("neq Int32", |b| b.iter(|| bench_neq(&arr_a, &arr_b))); + c.bench_function("neq scalar Int32", |b| { + b.iter(|| { + neq_scalar(criterion::black_box(&arr_a), criterion::black_box(1)).unwrap() + }) + }); + + c.bench_function("lt Int32", |b| b.iter(|| bench_lt(&arr_a, &arr_b))); + c.bench_function("lt scalar Int32", |b| { + b.iter(|| { + lt_scalar(criterion::black_box(&arr_a), criterion::black_box(1)).unwrap() + }) + }); + + c.bench_function("lt_eq Int32", |b| b.iter(|| bench_lt_eq(&arr_a, &arr_b))); + c.bench_function("lt_eq scalar Int32", |b| { + b.iter(|| { + lt_eq_scalar(criterion::black_box(&arr_a), criterion::black_box(1)).unwrap() + }) + }); + + c.bench_function("gt Int32", |b| b.iter(|| bench_gt(&arr_a, &arr_b))); + c.bench_function("gt scalar Int32", |b| { + b.iter(|| { + gt_scalar(criterion::black_box(&arr_a), criterion::black_box(1)).unwrap() + }) + }); + + c.bench_function("gt_eq Int32", |b| b.iter(|| bench_gt_eq(&arr_a, &arr_b))); + c.bench_function("gt_eq scalar Int32", |b| { + b.iter(|| { + gt_eq_scalar(criterion::black_box(&arr_a), criterion::black_box(1)).unwrap() + }) + }); + c.bench_function("eq MonthDayNano", |b| { b.iter(|| bench_eq(&arr_month_day_nano_a, &arr_month_day_nano_b)) }); From 
7d2139749029f78b1d88eddf24be664071e12686 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 8 Dec 2022 08:50:13 -0800 Subject: [PATCH 0391/1411] Skip null buffer when importing FFI ArrowArray struct if no null buffer in the spec (#3293) * Fix null buffer import/export behavior * Clippy Co-authored-by: Raphael Taylor-Davies --- arrow/src/ffi.rs | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 0c1c1fa54df0..abb53dff68bd 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -462,12 +462,18 @@ impl FFI_ArrowArray { /// This method releases `buffers`. Consumers of this struct *must* call `release` before /// releasing this struct, or contents in `buffers` leak. pub fn new(data: &ArrayData) -> Self { - // * insert the null buffer at the start - // * make all others `Option`. - let buffers = iter::once(data.null_buffer().cloned()) - .chain(data.buffers().iter().map(|b| Some(b.clone()))) - .collect::>(); let data_layout = layout(data.data_type()); + + let buffers = if data_layout.can_contain_null_mask { + // * insert the null buffer at the start + // * make all others `Option`. + iter::once(data.null_buffer().cloned()) + .chain(data.buffers().iter().map(|b| Some(b.clone()))) + .collect::>() + } else { + data.buffers().iter().map(|b| Some(b.clone())).collect() + }; + // `n_buffers` is the number of buffers by the spec. let n_buffers = { data_layout.buffers.len() + { @@ -616,8 +622,15 @@ pub trait ArrowArrayRef { let len = self.array().len(); let offset = self.array().offset(); let null_count = self.array().null_count(); - let buffers = self.buffers()?; - let null_bit_buffer = self.null_bit_buffer(); + + let data_layout = layout(&data_type); + let buffers = self.buffers(data_layout.can_contain_null_mask)?; + + let null_bit_buffer = if data_layout.can_contain_null_mask { + self.null_bit_buffer() + } else { + None + }; let mut child_data: Vec = (0..self.array().n_children as usize) .map(|i| { @@ -649,11 +662,12 @@ pub trait ArrowArrayRef { } /// returns all buffers, as organized by Rust (i.e. 
null buffer is skipped) - fn buffers(&self) -> Result> { - (0..self.array().n_buffers - 1) + fn buffers(&self, can_contain_null_mask: bool) -> Result> { + // + 1: skip null buffer + let buffer_begin = can_contain_null_mask as i64; + (buffer_begin..self.array().n_buffers) .map(|index| { - // + 1: skip null buffer - let index = (index + 1) as usize; + let index = index as usize; let len = self.buffer_len(index)?; @@ -668,7 +682,7 @@ pub trait ArrowArrayRef { } None => Err(ArrowError::CDataInterface(format!( "The external buffer at position {} is null.", - index - 1 + index ))), } }) From fa1f6112f091d7493f8911a47a893048ce0e4e02 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 8 Dec 2022 08:50:24 -0800 Subject: [PATCH 0392/1411] Set bloom filter on byte array (#3284) * Set bloom filter on byte array * Check positive values * For review * Clippy Co-authored-by: Raphael Taylor-Davies --- parquet/src/arrow/arrow_writer/byte_array.rs | 18 ++- parquet/src/arrow/arrow_writer/mod.rs | 119 ++++++++++++++++++- parquet/src/bloom_filter/mod.rs | 2 +- 3 files changed, 131 insertions(+), 8 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index d870ac54fe4d..c3a9f83d15f3 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -429,6 +429,7 @@ struct ByteArrayEncoder { dict_encoder: Option, min_value: Option, max_value: Option, + bloom_filter: Option, } impl ColumnValueEncoder for ByteArrayEncoder { @@ -453,8 +454,7 @@ impl ColumnValueEncoder for ByteArrayEncoder { } fn flush_bloom_filter(&mut self) -> Option { - // TODO FIX ME need to handle bloom filter in arrow writer - None + self.bloom_filter.take() } fn try_new(descr: &ColumnDescPtr, props: &WriterProperties) -> Result @@ -467,11 +467,17 @@ impl ColumnValueEncoder for ByteArrayEncoder { let fallback = FallbackEncoder::new(descr, props)?; + let bloom_filter = props + .bloom_filter_properties(descr.path()) + .map(|props| Sbbf::new_with_ndv_fpp(props.ndv, props.fpp)) + .transpose()?; + Ok(Self { fallback, dict_encoder: dictionary, min_value: None, max_value: None, + bloom_filter, }) } @@ -555,6 +561,14 @@ where } } + // encode the values into bloom filter if enabled + if let Some(bloom_filter) = &mut encoder.bloom_filter { + let valid = indices.iter().cloned(); + for idx in valid { + bloom_filter.insert(values.value(idx).as_ref()); + } + } + match &mut encoder.dict_encoder { Some(dict_encoder) => dict_encoder.encode(values, indices), None => encoder.fallback.encode(values, indices), diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index ecb59e93e2f9..a609b992a393 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -622,7 +622,8 @@ mod tests { use crate::basic::Encoding; use crate::file::metadata::ParquetMetaData; use crate::file::page_index::index_reader::read_pages_locations; - use crate::file::properties::WriterVersion; + use crate::file::properties::{ReaderProperties, WriterVersion}; + use crate::file::serialized_reader::ReadOptionsBuilder; use crate::file::{ reader::{FileReader, SerializedFileReader}, statistics::Statistics, @@ -1269,6 +1270,7 @@ mod tests { .set_dictionary_enabled(dictionary_size != 0) .set_dictionary_pagesize_limit(dictionary_size.max(1)) .set_encoding(*encoding) + .set_bloom_filter_enabled(true) .build(); files.push(roundtrip_opts(&expected_batch, props)) @@ -1279,17 +1281,17 @@ mod tests { files } - fn 
values_required(iter: I) + fn values_required(iter: I) -> Vec where A: From> + Array + 'static, I: IntoIterator, { let raw_values: Vec<_> = iter.into_iter().collect(); let values = Arc::new(A::from(raw_values)); - one_column_roundtrip(values, false); + one_column_roundtrip(values, false) } - fn values_optional(iter: I) + fn values_optional(iter: I) -> Vec where A: From>> + Array + 'static, I: IntoIterator, @@ -1300,7 +1302,7 @@ mod tests { .map(|(i, v)| if i % 2 == 0 { None } else { Some(v) }) .collect(); let optional_values = Arc::new(A::from(optional_raw_values)); - one_column_roundtrip(optional_values, true); + one_column_roundtrip(optional_values, true) } fn required_and_optional(iter: I) @@ -1312,6 +1314,70 @@ mod tests { values_optional::(iter); } + fn check_bloom_filter( + files: Vec, + file_column: String, + positive_values: Vec, + negative_values: Vec, + ) { + files.into_iter().take(1).for_each(|file| { + let file_reader = SerializedFileReader::new_with_options( + file, + ReadOptionsBuilder::new() + .with_reader_properties( + ReaderProperties::builder() + .set_read_bloom_filter(true) + .build(), + ) + .build(), + ) + .expect("Unable to open file as Parquet"); + let metadata = file_reader.metadata(); + + // Gets bloom filters from all row groups. + let mut bloom_filters: Vec<_> = vec![]; + for (ri, row_group) in metadata.row_groups().iter().enumerate() { + if let Some((column_index, _)) = row_group + .columns() + .iter() + .enumerate() + .find(|(_, column)| column.column_path().string() == file_column) + { + let row_group_reader = file_reader + .get_row_group(ri) + .expect("Unable to read row group"); + if let Some(sbbf) = + row_group_reader.get_column_bloom_filter(column_index) + { + bloom_filters.push(sbbf.clone()); + } else { + panic!("No bloom filter for column named {} found", file_column); + } + } else { + panic!("No column named {} found", file_column); + } + } + + positive_values.iter().for_each(|value| { + let found = bloom_filters.iter().find(|sbbf| sbbf.check(value)); + assert!( + found.is_some(), + "{}", + format!("Value {:?} should be in bloom filter", value.as_bytes()) + ); + }); + + negative_values.iter().for_each(|value| { + let found = bloom_filters.iter().find(|sbbf| sbbf.check(value)); + assert!( + found.is_none(), + "{}", + format!("Value {:?} should not be in bloom filter", value.as_bytes()) + ); + }); + }); + } + #[test] fn all_null_primitive_single_column() { let values = Arc::new(Int32Array::from(vec![None; SMALL_SIZE])); @@ -1528,6 +1594,49 @@ mod tests { values_required::(many_vecs_iter); } + #[test] + fn i32_column_bloom_filter() { + let positive_values: Vec = (0..SMALL_SIZE as i32).collect(); + let files = values_required::(positive_values); + check_bloom_filter( + files, + "col".to_string(), + (0..SMALL_SIZE as i32).collect(), + (SMALL_SIZE as i32 + 1..SMALL_SIZE as i32 + 10).collect(), + ); + } + + #[test] + fn binary_column_bloom_filter() { + let one_vec: Vec = (0..SMALL_SIZE as u8).collect(); + let many_vecs: Vec<_> = std::iter::repeat(one_vec).take(SMALL_SIZE).collect(); + let many_vecs_iter = many_vecs.iter().map(|v| v.as_slice()); + + let files = values_required::(many_vecs_iter); + check_bloom_filter( + files, + "col".to_string(), + many_vecs, + vec![vec![(SMALL_SIZE + 1) as u8]], + ); + } + + #[test] + fn empty_string_null_column_bloom_filter() { + let raw_values: Vec<_> = (0..SMALL_SIZE).map(|i| i.to_string()).collect(); + let raw_strs = raw_values.iter().map(|s| s.as_str()); + + let files = values_optional::(raw_strs); + + let 
optional_raw_values: Vec<_> = raw_values + .iter() + .enumerate() + .filter_map(|(i, v)| if i % 2 == 0 { None } else { Some(v.as_str()) }) + .collect(); + // For null slots, empty string should not be in bloom filter. + check_bloom_filter(files, "col".to_string(), optional_raw_values, vec![""]); + } + #[test] fn large_binary_single_column() { let one_vec: Vec = (0..SMALL_SIZE as u8).collect(); diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index 15c38cf5915b..5bb89bf3f4d2 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -261,7 +261,7 @@ impl Sbbf { } /// Insert an [AsBytes] value into the filter - pub fn insert(&mut self, value: &T) { + pub fn insert(&mut self, value: &T) { self.insert_hash(hash_as_bytes(value)); } From 2db98ee2a7e36c00990a5ff54e7c0847f501148f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 8 Dec 2022 20:17:13 +0000 Subject: [PATCH 0393/1411] Don't use dangling NonNull as sentinel (#3289) * Don't use dangling NonNull as sentinel * Review feedback --- arrow-buffer/src/alloc/mod.rs | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/arrow-buffer/src/alloc/mod.rs b/arrow-buffer/src/alloc/mod.rs index 6b09c4b31b9a..a7ce80600462 100644 --- a/arrow-buffer/src/alloc/mod.rs +++ b/arrow-buffer/src/alloc/mod.rs @@ -28,9 +28,18 @@ mod alignment; pub use alignment::ALIGNMENT; +/// Returns an aligned non null pointer similar to [`NonNull::dangling`] +/// +/// Note that the pointer value may potentially represent a valid pointer, which means +/// this must not be used as a "not yet initialized" sentinel value. +/// +/// Types that lazily allocate must track initialization by some other means. #[inline] -unsafe fn null_pointer() -> NonNull { - NonNull::new_unchecked(ALIGNMENT as *mut u8) +fn dangling_ptr() -> NonNull { + // SAFETY: ALIGNMENT is a non-zero usize which is then casted + // to a *mut T. Therefore, `ptr` is not null and the conditions for + // calling new_unchecked() are respected. + unsafe { NonNull::new_unchecked(ALIGNMENT as *mut u8) } } /// Allocates a cache-aligned memory region of `size` bytes with uninitialized values. 
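For illustration, a minimal self-contained sketch of the pattern the hunk above adopts: zero-sized requests return an aligned, non-null placeholder (in the spirit of `NonNull::dangling`), and the recorded allocation size, not a sentinel pointer value, decides whether memory is actually freed. The 64-byte ALIGNMENT constant and the helper signatures below are assumptions for the sketch, not necessarily the crate's exact code.

    use std::alloc::{alloc, dealloc, Layout};
    use std::ptr::NonNull;

    const ALIGNMENT: usize = 64; // assumed cache-line alignment

    // Aligned, non-null placeholder handed out for zero-sized requests.
    // Its value may coincide with a real allocation, so it must never be
    // compared against to detect "not yet allocated" state.
    fn dangling_ptr() -> NonNull<u8> {
        // SAFETY: ALIGNMENT is non-zero, so the cast pointer is non-null.
        unsafe { NonNull::new_unchecked(ALIGNMENT as *mut u8) }
    }

    fn allocate_aligned(size: usize) -> NonNull<u8> {
        if size == 0 {
            return dangling_ptr();
        }
        let layout = Layout::from_size_align(size, ALIGNMENT).unwrap();
        NonNull::new(unsafe { alloc(layout) }).expect("allocation failed")
    }

    /// # Safety
    /// `size` must be the size originally passed to `allocate_aligned`.
    unsafe fn free_aligned(ptr: NonNull<u8>, size: usize) {
        // The size, not the pointer value, gates deallocation.
        if size != 0 {
            dealloc(
                ptr.as_ptr(),
                Layout::from_size_align_unchecked(size, ALIGNMENT),
            );
        }
    }
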
@@ -39,7 +48,7 @@ unsafe fn null_pointer() -> NonNull { pub fn allocate_aligned(size: usize) -> NonNull { unsafe { if size == 0 { - null_pointer() + dangling_ptr() } else { let layout = Layout::from_size_align_unchecked(size, ALIGNMENT); let raw_ptr = std::alloc::alloc(layout); @@ -54,7 +63,7 @@ pub fn allocate_aligned(size: usize) -> NonNull { pub fn allocate_aligned_zeroed(size: usize) -> NonNull { unsafe { if size == 0 { - null_pointer() + dangling_ptr() } else { let layout = Layout::from_size_align_unchecked(size, ALIGNMENT); let raw_ptr = std::alloc::alloc_zeroed(layout); @@ -72,7 +81,7 @@ pub fn allocate_aligned_zeroed(size: usize) -> NonNull { /// /// * size must be the same size that was used to allocate that block of memory, pub unsafe fn free_aligned(ptr: NonNull, size: usize) { - if ptr != null_pointer() { + if size != 0 { std::alloc::dealloc( ptr.as_ptr() as *mut u8, Layout::from_size_align_unchecked(size, ALIGNMENT), @@ -96,13 +105,13 @@ pub unsafe fn reallocate( old_size: usize, new_size: usize, ) -> NonNull { - if ptr == null_pointer() { + if old_size == 0 { return allocate_aligned(new_size); } if new_size == 0 { free_aligned(ptr, old_size); - return null_pointer(); + return dangling_ptr(); } let raw_ptr = std::alloc::realloc( From 7b3e94fd0ba89090b1fc8daf462ac25dc32ccd12 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 8 Dec 2022 20:36:00 +0000 Subject: [PATCH 0394/1411] Split out arrow-ord (#2594) (#3299) * Split out arrow-ord (#2594) * Make LexicographicalComparator public * Tweak CI * Fix SIMD * Doc --- .github/workflows/arrow.yml | 34 +- .github/workflows/arrow_flight.yml | 1 - .github/workflows/integration.yml | 1 + .github/workflows/parquet.yml | 1 - Cargo.toml | 1 + arrow-array/Cargo.toml | 4 + arrow-array/src/lib.rs | 3 + .../datatypes => arrow-array/src}/numeric.rs | 16 +- arrow-ord/Cargo.toml | 53 ++ .../kernels => arrow-ord/src}/comparison.rs | 554 +++++++++--------- arrow-ord/src/lib.rs | 23 + {arrow/src/array => arrow-ord/src}/ord.rs | 89 ++- .../kernels => arrow-ord/src}/partition.rs | 100 ++-- .../compute/kernels => arrow-ord/src}/sort.rs | 98 ++-- arrow-string/src/regexp.rs | 115 ++++ arrow/Cargo.toml | 6 +- arrow/src/array/mod.rs | 3 +- arrow/src/compute/kernels/mod.rs | 11 +- arrow/src/datatypes/mod.rs | 7 +- arrow/src/lib.rs | 2 + arrow/src/row/mod.rs | 4 +- dev/release/README.md | 1 + 22 files changed, 657 insertions(+), 470 deletions(-) rename {arrow/src/datatypes => arrow-array/src}/numeric.rs (98%) create mode 100644 arrow-ord/Cargo.toml rename {arrow/src/compute/kernels => arrow-ord/src}/comparison.rs (94%) create mode 100644 arrow-ord/src/lib.rs rename {arrow/src/array => arrow-ord/src}/ord.rs (90%) rename {arrow/src/compute/kernels => arrow-ord/src}/partition.rs (82%) rename {arrow/src/compute/kernels => arrow-ord/src}/sort.rs (98%) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 0b47f02566ce..458e0e0a149a 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -25,18 +25,20 @@ on: - master pull_request: paths: - - arrow/** + - .github/** - arrow-array/** - arrow-buffer/** - arrow-cast/** + - arrow-csv/** - arrow-data/** - - arrow-schema/** - - arrow-select/** - arrow-integration-test/** - arrow-ipc/** - - arrow-csv/** - arrow-json/** - - .github/** + - arrow-ord/** + - arrow-schema/** + - arrow-select/** + - arrow-string/** + - arrow/** jobs: @@ -58,8 +60,8 @@ jobs: run: cargo test -p arrow-data --all-features - name: Test arrow-schema 
with all features run: cargo test -p arrow-schema --all-features - - name: Test arrow-array with all features - run: cargo test -p arrow-array --all-features + - name: Test arrow-array with all features except SIMD + run: cargo test -p arrow-array - name: Test arrow-select with all features run: cargo test -p arrow-select --all-features - name: Test arrow-cast with all features @@ -72,6 +74,8 @@ jobs: run: cargo test -p arrow-json --all-features - name: Test arrow-string with all features run: cargo test -p arrow-string --all-features + - name: Test arrow-ord with all features except SIMD + run: cargo test -p arrow-ord --features dyn_cmp_dict - name: Test arrow-integration-test with all features run: cargo test -p arrow-integration-test --all-features - name: Test arrow with default features @@ -129,10 +133,12 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: nightly - - name: Run tests --features "simd" - run: cargo test -p arrow --features "simd" - - name: Check compilation --features "simd" - run: cargo check -p arrow --features simd + - name: Test arrow-array with SIMD + run: cargo test -p arrow-array --features simd + - name: Test arrow-ord with SIMD + run: cargo test -p arrow-ord --features simd + - name: Test arrow with SIMD + run: cargo test -p arrow --features simd - name: Check compilation --features simd --all-targets run: cargo check -p arrow --features simd --all-targets @@ -174,8 +180,8 @@ jobs: run: cargo clippy -p arrow-data --all-targets --all-features -- -D warnings - name: Clippy arrow-schema with all features run: cargo clippy -p arrow-schema --all-targets --all-features -- -D warnings - - name: Clippy arrow-array with all features - run: cargo clippy -p arrow-array --all-targets --all-features -- -D warnings + - name: Clippy arrow-array with all features except SIMD + run: cargo clippy -p arrow-array --all-targets -- -D warnings - name: Clippy arrow-select with all features run: cargo clippy -p arrow-select --all-targets --all-features -- -D warnings - name: Clippy arrow-cast with all features @@ -188,5 +194,7 @@ jobs: run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings - name: Clippy arrow-string with all features run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings + - name: Clippy arrow-ord with all features except SIMD + run: cargo clippy -p arrow-ord --all-targets --features dyn_cmp_dict -- -D warnings - name: Clippy arrow run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index 356c0fc0a073..f12eb4d8beb8 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -35,7 +35,6 @@ on: - arrow-ipc/** - arrow-schema/** - arrow-select/** - - arrow-string/** - .github/** jobs: diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index d23f4c0717e0..526106bfe7c9 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -33,6 +33,7 @@ on: - arrow-integration-test/** - arrow-integration-testing/** - arrow-ipc/** + - arrow-ord/** - arrow-json/** - arrow-pyarrow-integration-testing/** - arrow-schema/** diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index f7d94f85783e..67552af864c1 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -36,7 +36,6 @@ on: - arrow-ipc/** - 
arrow-csv/** - arrow-json/** - - arrow-string/** - parquet/** - .github/** diff --git a/Cargo.toml b/Cargo.toml index 556b86a008a1..c123106c6f75 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ members = [ "arrow-integration-testing", "arrow-ipc", "arrow-json", + "arrow-ord", "arrow-schema", "arrow-select", "arrow-string", diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 37f73c6d1c4b..67f59a6dcd64 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -53,6 +53,10 @@ chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.13", default-features = false } +packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } + +[features] +simd = ["packed_simd"] [dev-dependencies] rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 5fcd1f33d480..d6a9ab30b85b 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -170,6 +170,9 @@ pub use record_batch::{RecordBatch, RecordBatchOptions, RecordBatchReader}; mod arithmetic; pub use arithmetic::ArrowNativeTypeOp; +mod numeric; +pub use numeric::*; + pub mod builder; pub mod cast; mod delta; diff --git a/arrow/src/datatypes/numeric.rs b/arrow-array/src/numeric.rs similarity index 98% rename from arrow/src/datatypes/numeric.rs rename to arrow-array/src/numeric.rs index 61fd05d52f90..9d9048085106 100644 --- a/arrow/src/datatypes/numeric.rs +++ b/arrow-array/src/numeric.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use super::*; +use crate::types::*; +use crate::ArrowPrimitiveType; #[cfg(feature = "simd")] use packed_simd::*; #[cfg(feature = "simd")] @@ -106,9 +107,11 @@ where /// Writes a SIMD result back to a slice fn write(simd_result: Self::Simd, slice: &mut [Self::Native]); + /// Performs a SIMD unary operation fn unary_op Self::Simd>(a: Self::Simd, op: F) -> Self::Simd; } +/// A subtype of primitive type that represents numeric values. 
#[cfg(not(feature = "simd"))] pub trait ArrowNumericType: ArrowPrimitiveType {} @@ -468,7 +471,7 @@ impl ArrowNumericType for Decimal256Type {} #[cfg(feature = "simd")] impl ArrowNumericType for Decimal256Type { - type Simd = i256; + type Simd = arrow_buffer::i256; type SimdMask = bool; fn lanes() -> usize { @@ -555,11 +558,14 @@ impl ArrowNumericType for Decimal256Type { } } +/// A subtype of primitive type that represents numeric float values #[cfg(feature = "simd")] pub trait ArrowFloatNumericType: ArrowNumericType { + /// SIMD version of pow fn pow(base: Self::Simd, raise: Self::Simd) -> Self::Simd; } +/// A subtype of primitive type that represents numeric float values #[cfg(not(feature = "simd"))] pub trait ArrowFloatNumericType: ArrowNumericType {} @@ -583,11 +589,7 @@ make_float_numeric_type!(Float64Type, f64x8); #[cfg(all(test, feature = "simd"))] mod tests { - use crate::datatypes::{ - ArrowNumericType, Float32Type, Float64Type, Int32Type, Int64Type, Int8Type, - IntervalMonthDayNanoType, UInt16Type, - }; - use packed_simd::*; + use super::*; use FromCast; /// calculate the expected mask by iterating over all bits diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml new file mode 100644 index 000000000000..7a9e7ba4a9c5 --- /dev/null +++ b/arrow-ord/Cargo.toml @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "arrow-ord" +version = "28.0.0" +description = "Ordering kernels for arrow arrays" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_ord" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-array = { version = "28.0.0", path = "../arrow-array" } +arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } +arrow-data = { version = "28.0.0", path = "../arrow-data" } +arrow-schema = { version = "28.0.0", path = "../arrow-schema" } +arrow-select = { version = "28.0.0", path = "../arrow-select" } +num = { version = "0.4", default-features = false, features = ["std"] } + +[dev-dependencies] +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } + +[features] +dyn_cmp_dict = [] +simd = ["arrow-array/simd"] diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow-ord/src/comparison.rs similarity index 94% rename from arrow/src/compute/kernels/comparison.rs rename to arrow-ord/src/comparison.rs index 6976a68d99af..196590008248 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -23,19 +23,15 @@ //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. //! 
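For context, a short usage sketch of the relocated kernels under the proposed crate layout (crate and module paths follow this patch; the `arrow` facade is assumed to keep re-exporting them, so existing `arrow::compute::kernels::comparison` imports should continue to work). The kernels now spell out their error type as `Result<BooleanArray, ArrowError>` instead of relying on the facade's `Result` alias:

    use arrow_array::{BooleanArray, StringArray};
    use arrow_ord::comparison::{eq_utf8, eq_utf8_scalar};
    use arrow_schema::ArrowError;

    fn demo() -> Result<(), ArrowError> {
        let left = StringArray::from(vec!["arrow", "parquet", "flight"]);
        let right = StringArray::from(vec!["arrow", "arrow", "flight"]);

        // Element-wise equality; errors if the two arrays differ in length.
        let mask: BooleanArray = eq_utf8(&left, &right)?;
        assert_eq!(mask, BooleanArray::from(vec![true, false, true]));

        // Comparison of every element against a single scalar value.
        let scalar_mask = eq_utf8_scalar(&left, "arrow")?;
        assert_eq!(scalar_mask, BooleanArray::from(vec![true, false, false]));
        Ok(())
    }
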
-pub use arrow_string::like::*; -pub use arrow_string::regexp::{regexp_is_match_utf8, regexp_is_match_utf8_scalar}; - -use crate::array::*; -use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer}; -use crate::datatypes::*; -#[allow(unused_imports)] -use crate::downcast_dictionary_array; -use crate::error::{ArrowError, Result}; -use crate::util::bit_util; +use arrow_array::cast::*; +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::buffer::buffer_unary_not; +use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::bit_mask::combine_option_bitmap; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; use arrow_select::take::take; -use num::ToPrimitive; /// Helper function to perform boolean lambda function on values from two array accessors, this /// version does not attempt to use SIMD. @@ -43,7 +39,7 @@ fn compare_op( left: T, right: S, op: F, -) -> Result +) -> Result where F: Fn(T::Item, S::Item) -> bool, { @@ -59,7 +55,10 @@ where /// Helper function to perform boolean lambda function on values from array accessor, this /// version does not attempt to use SIMD. -fn compare_op_scalar(left: T, op: F) -> Result +fn compare_op_scalar( + left: T, + op: F, +) -> Result where F: Fn(T::Item) -> bool, { @@ -72,9 +71,9 @@ pub fn no_simd_compare_op( left: &PrimitiveArray, right: &PrimitiveArray, op: F, -) -> Result +) -> Result where - T: ArrowNumericType, + T: ArrowPrimitiveType, F: Fn(T::Native, T::Native) -> bool, { compare_op(left, right, op) @@ -86,9 +85,9 @@ pub fn no_simd_compare_op_scalar( left: &PrimitiveArray, right: T::Native, op: F, -) -> Result +) -> Result where - T: ArrowNumericType, + T: ArrowPrimitiveType, F: Fn(T::Native, T::Native) -> bool, { compare_op_scalar(left, |l| op(l, right)) @@ -98,13 +97,13 @@ where pub fn eq_utf8( left: &GenericStringArray, right: &GenericStringArray, -) -> Result { +) -> Result { compare_op(left, right, |a, b| a == b) } fn utf8_empty( left: &GenericStringArray, -) -> Result { +) -> Result { let null_bit_buffer = left .data() .null_buffer() @@ -140,7 +139,7 @@ fn utf8_empty( pub fn eq_utf8_scalar( left: &GenericStringArray, right: &str, -) -> Result { +) -> Result { if right.is_empty() { return utf8_empty::<_, true>(left); } @@ -148,37 +147,58 @@ pub fn eq_utf8_scalar( } /// Perform `left == right` operation on [`BooleanArray`] -pub fn eq_bool(left: &BooleanArray, right: &BooleanArray) -> Result { +pub fn eq_bool( + left: &BooleanArray, + right: &BooleanArray, +) -> Result { compare_op(left, right, |a, b| !(a ^ b)) } /// Perform `left != right` operation on [`BooleanArray`] -pub fn neq_bool(left: &BooleanArray, right: &BooleanArray) -> Result { +pub fn neq_bool( + left: &BooleanArray, + right: &BooleanArray, +) -> Result { compare_op(left, right, |a, b| (a ^ b)) } /// Perform `left < right` operation on [`BooleanArray`] -pub fn lt_bool(left: &BooleanArray, right: &BooleanArray) -> Result { +pub fn lt_bool( + left: &BooleanArray, + right: &BooleanArray, +) -> Result { compare_op(left, right, |a, b| ((!a) & b)) } /// Perform `left <= right` operation on [`BooleanArray`] -pub fn lt_eq_bool(left: &BooleanArray, right: &BooleanArray) -> Result { +pub fn lt_eq_bool( + left: &BooleanArray, + right: &BooleanArray, +) -> Result { compare_op(left, right, |a, b| !(a & (!b))) } /// Perform `left > right` operation on [`BooleanArray`] -pub fn gt_bool(left: &BooleanArray, right: &BooleanArray) -> Result { +pub fn gt_bool( + left: &BooleanArray, + right: &BooleanArray, +) -> Result { 
compare_op(left, right, |a, b| (a & (!b))) } /// Perform `left >= right` operation on [`BooleanArray`] -pub fn gt_eq_bool(left: &BooleanArray, right: &BooleanArray) -> Result { +pub fn gt_eq_bool( + left: &BooleanArray, + right: &BooleanArray, +) -> Result { compare_op(left, right, |a, b| !((!a) & b)) } /// Perform `left == right` operation on [`BooleanArray`] and a scalar -pub fn eq_bool_scalar(left: &BooleanArray, right: bool) -> Result { +pub fn eq_bool_scalar( + left: &BooleanArray, + right: bool, +) -> Result { let len = left.len(); let left_offset = left.offset(); @@ -207,27 +227,42 @@ pub fn eq_bool_scalar(left: &BooleanArray, right: bool) -> Result } /// Perform `left < right` operation on [`BooleanArray`] and a scalar -pub fn lt_bool_scalar(left: &BooleanArray, right: bool) -> Result { +pub fn lt_bool_scalar( + left: &BooleanArray, + right: bool, +) -> Result { compare_op_scalar(left, |a: bool| !a & right) } /// Perform `left <= right` operation on [`BooleanArray`] and a scalar -pub fn lt_eq_bool_scalar(left: &BooleanArray, right: bool) -> Result { +pub fn lt_eq_bool_scalar( + left: &BooleanArray, + right: bool, +) -> Result { compare_op_scalar(left, |a| a <= right) } /// Perform `left > right` operation on [`BooleanArray`] and a scalar -pub fn gt_bool_scalar(left: &BooleanArray, right: bool) -> Result { +pub fn gt_bool_scalar( + left: &BooleanArray, + right: bool, +) -> Result { compare_op_scalar(left, |a: bool| a & !right) } /// Perform `left >= right` operation on [`BooleanArray`] and a scalar -pub fn gt_eq_bool_scalar(left: &BooleanArray, right: bool) -> Result { +pub fn gt_eq_bool_scalar( + left: &BooleanArray, + right: bool, +) -> Result { compare_op_scalar(left, |a| a >= right) } /// Perform `left != right` operation on [`BooleanArray`] and a scalar -pub fn neq_bool_scalar(left: &BooleanArray, right: bool) -> Result { +pub fn neq_bool_scalar( + left: &BooleanArray, + right: bool, +) -> Result { eq_bool_scalar(left, !right) } @@ -235,7 +270,7 @@ pub fn neq_bool_scalar(left: &BooleanArray, right: bool) -> Result pub fn eq_binary( left: &GenericBinaryArray, right: &GenericBinaryArray, -) -> Result { +) -> Result { compare_op(left, right, |a, b| a == b) } @@ -243,7 +278,7 @@ pub fn eq_binary( pub fn eq_binary_scalar( left: &GenericBinaryArray, right: &[u8], -) -> Result { +) -> Result { compare_op_scalar(left, |a| a == right) } @@ -251,7 +286,7 @@ pub fn eq_binary_scalar( pub fn neq_binary( left: &GenericBinaryArray, right: &GenericBinaryArray, -) -> Result { +) -> Result { compare_op(left, right, |a, b| a != b) } @@ -259,7 +294,7 @@ pub fn neq_binary( pub fn neq_binary_scalar( left: &GenericBinaryArray, right: &[u8], -) -> Result { +) -> Result { compare_op_scalar(left, |a| a != right) } @@ -267,7 +302,7 @@ pub fn neq_binary_scalar( pub fn lt_binary( left: &GenericBinaryArray, right: &GenericBinaryArray, -) -> Result { +) -> Result { compare_op(left, right, |a, b| a < b) } @@ -275,7 +310,7 @@ pub fn lt_binary( pub fn lt_binary_scalar( left: &GenericBinaryArray, right: &[u8], -) -> Result { +) -> Result { compare_op_scalar(left, |a| a < right) } @@ -283,7 +318,7 @@ pub fn lt_binary_scalar( pub fn lt_eq_binary( left: &GenericBinaryArray, right: &GenericBinaryArray, -) -> Result { +) -> Result { compare_op(left, right, |a, b| a <= b) } @@ -291,7 +326,7 @@ pub fn lt_eq_binary( pub fn lt_eq_binary_scalar( left: &GenericBinaryArray, right: &[u8], -) -> Result { +) -> Result { compare_op_scalar(left, |a| a <= right) } @@ -299,7 +334,7 @@ pub fn lt_eq_binary_scalar( pub fn gt_binary( 
left: &GenericBinaryArray, right: &GenericBinaryArray, -) -> Result { +) -> Result { compare_op(left, right, |a, b| a > b) } @@ -307,7 +342,7 @@ pub fn gt_binary( pub fn gt_binary_scalar( left: &GenericBinaryArray, right: &[u8], -) -> Result { +) -> Result { compare_op_scalar(left, |a| a > right) } @@ -315,7 +350,7 @@ pub fn gt_binary_scalar( pub fn gt_eq_binary( left: &GenericBinaryArray, right: &GenericBinaryArray, -) -> Result { +) -> Result { compare_op(left, right, |a, b| a >= b) } @@ -323,7 +358,7 @@ pub fn gt_eq_binary( pub fn gt_eq_binary_scalar( left: &GenericBinaryArray, right: &[u8], -) -> Result { +) -> Result { compare_op_scalar(left, |a| a >= right) } @@ -331,7 +366,7 @@ pub fn gt_eq_binary_scalar( pub fn neq_utf8( left: &GenericStringArray, right: &GenericStringArray, -) -> Result { +) -> Result { compare_op(left, right, |a, b| a != b) } @@ -339,7 +374,7 @@ pub fn neq_utf8( pub fn neq_utf8_scalar( left: &GenericStringArray, right: &str, -) -> Result { +) -> Result { if right.is_empty() { return utf8_empty::<_, false>(left); } @@ -350,7 +385,7 @@ pub fn neq_utf8_scalar( pub fn lt_utf8( left: &GenericStringArray, right: &GenericStringArray, -) -> Result { +) -> Result { compare_op(left, right, |a, b| a < b) } @@ -358,7 +393,7 @@ pub fn lt_utf8( pub fn lt_utf8_scalar( left: &GenericStringArray, right: &str, -) -> Result { +) -> Result { compare_op_scalar(left, |a| a < right) } @@ -366,7 +401,7 @@ pub fn lt_utf8_scalar( pub fn lt_eq_utf8( left: &GenericStringArray, right: &GenericStringArray, -) -> Result { +) -> Result { compare_op(left, right, |a, b| a <= b) } @@ -374,7 +409,7 @@ pub fn lt_eq_utf8( pub fn lt_eq_utf8_scalar( left: &GenericStringArray, right: &str, -) -> Result { +) -> Result { compare_op_scalar(left, |a| a <= right) } @@ -382,7 +417,7 @@ pub fn lt_eq_utf8_scalar( pub fn gt_utf8( left: &GenericStringArray, right: &GenericStringArray, -) -> Result { +) -> Result { compare_op(left, right, |a, b| a > b) } @@ -390,7 +425,7 @@ pub fn gt_utf8( pub fn gt_utf8_scalar( left: &GenericStringArray, right: &str, -) -> Result { +) -> Result { compare_op_scalar(left, |a| a > right) } @@ -398,7 +433,7 @@ pub fn gt_utf8_scalar( pub fn gt_eq_utf8( left: &GenericStringArray, right: &GenericStringArray, -) -> Result { +) -> Result { compare_op(left, right, |a, b| a >= b) } @@ -406,12 +441,16 @@ pub fn gt_eq_utf8( pub fn gt_eq_utf8_scalar( left: &GenericStringArray, right: &str, -) -> Result { +) -> Result { compare_op_scalar(left, |a| a >= right) } // Avoids creating a closure for each combination of `$RIGHT` and `$TY` -fn try_to_type_result(value: Option, right: &str, ty: &str) -> Result { +fn try_to_type_result( + value: Option, + right: &str, + ty: &str, +) -> Result { value.ok_or_else(|| { ArrowError::ComputeError(format!("Could not convert {} with {}", right, ty,)) }) @@ -590,7 +629,7 @@ macro_rules! dyn_compare_utf8_scalar { /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn eq_dyn_scalar(left: &dyn Array, right: T) -> Result +pub fn eq_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, { @@ -609,7 +648,7 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. 
/// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn lt_dyn_scalar(left: &dyn Array, right: T) -> Result +pub fn lt_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, { @@ -628,7 +667,7 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn lt_eq_dyn_scalar(left: &dyn Array, right: T) -> Result +pub fn lt_eq_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, { @@ -647,7 +686,7 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn gt_dyn_scalar(left: &dyn Array, right: T) -> Result +pub fn gt_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, { @@ -666,7 +705,7 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn gt_eq_dyn_scalar(left: &dyn Array, right: T) -> Result +pub fn gt_eq_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, { @@ -685,7 +724,7 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn neq_dyn_scalar(left: &dyn Array, right: T) -> Result +pub fn neq_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, { @@ -699,7 +738,10 @@ where /// Perform `left == right` operation on an array and a numeric scalar /// value. Supports BinaryArray and LargeBinaryArray -pub fn eq_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result { +pub fn eq_dyn_binary_scalar( + left: &dyn Array, + right: &[u8], +) -> Result { match left.data_type() { DataType::Binary => { let left = as_generic_binary_array::(left); @@ -717,7 +759,10 @@ pub fn eq_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result Result { +pub fn neq_dyn_binary_scalar( + left: &dyn Array, + right: &[u8], +) -> Result { match left.data_type() { DataType::Binary => { let left = as_generic_binary_array::(left); @@ -736,7 +781,10 @@ pub fn neq_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result Result { +pub fn lt_dyn_binary_scalar( + left: &dyn Array, + right: &[u8], +) -> Result { match left.data_type() { DataType::Binary => { let left = as_generic_binary_array::(left); @@ -754,7 +802,10 @@ pub fn lt_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result Result { +pub fn lt_eq_dyn_binary_scalar( + left: &dyn Array, + right: &[u8], +) -> Result { match left.data_type() { DataType::Binary => { let left = as_generic_binary_array::(left); @@ -773,7 +824,10 @@ pub fn lt_eq_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result right` operation on an array and a numeric scalar /// value. 
Supports BinaryArray and LargeBinaryArray -pub fn gt_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result { +pub fn gt_dyn_binary_scalar( + left: &dyn Array, + right: &[u8], +) -> Result { match left.data_type() { DataType::Binary => { let left = as_generic_binary_array::(left); @@ -791,7 +845,10 @@ pub fn gt_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result= right` operation on an array and a numeric scalar /// value. Supports BinaryArray and LargeBinaryArray -pub fn gt_eq_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result { +pub fn gt_eq_dyn_binary_scalar( + left: &dyn Array, + right: &[u8], +) -> Result { match left.data_type() { DataType::Binary => { let left = as_generic_binary_array::(left); @@ -810,7 +867,10 @@ pub fn gt_eq_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result Result { +pub fn eq_dyn_utf8_scalar( + left: &dyn Array, + right: &str, +) -> Result { let result = match left.data_type() { DataType::Dictionary(key_type, value_type) => match value_type.as_ref() { DataType::Utf8 | DataType::LargeUtf8 => { @@ -837,7 +897,10 @@ pub fn eq_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result /// Perform `left < right` operation on an array and a numeric scalar /// value. Supports StringArrays, and DictionaryArrays that have string values -pub fn lt_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result { +pub fn lt_dyn_utf8_scalar( + left: &dyn Array, + right: &str, +) -> Result { let result = match left.data_type() { DataType::Dictionary(key_type, value_type) => match value_type.as_ref() { DataType::Utf8 | DataType::LargeUtf8 => { @@ -864,7 +927,10 @@ pub fn lt_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result /// Perform `left >= right` operation on an array and a numeric scalar /// value. Supports StringArrays, and DictionaryArrays that have string values -pub fn gt_eq_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result { +pub fn gt_eq_dyn_utf8_scalar( + left: &dyn Array, + right: &str, +) -> Result { let result = match left.data_type() { DataType::Dictionary(key_type, value_type) => match value_type.as_ref() { DataType::Utf8 | DataType::LargeUtf8 => { @@ -891,7 +957,10 @@ pub fn gt_eq_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result Result { +pub fn lt_eq_dyn_utf8_scalar( + left: &dyn Array, + right: &str, +) -> Result { let result = match left.data_type() { DataType::Dictionary(key_type, value_type) => match value_type.as_ref() { DataType::Utf8 | DataType::LargeUtf8 => { @@ -918,7 +987,10 @@ pub fn lt_eq_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result right` operation on an array and a numeric scalar /// value. Supports StringArrays, and DictionaryArrays that have string values -pub fn gt_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result { +pub fn gt_dyn_utf8_scalar( + left: &dyn Array, + right: &str, +) -> Result { let result = match left.data_type() { DataType::Dictionary(key_type, value_type) => match value_type.as_ref() { DataType::Utf8 | DataType::LargeUtf8 => { @@ -945,7 +1017,10 @@ pub fn gt_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result /// Perform `left != right` operation on an array and a numeric scalar /// value. 
Supports StringArrays, and DictionaryArrays that have string values -pub fn neq_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result { +pub fn neq_dyn_utf8_scalar( + left: &dyn Array, + right: &str, +) -> Result { let result = match left.data_type() { DataType::Dictionary(key_type, value_type) => match value_type.as_ref() { DataType::Utf8 | DataType::LargeUtf8 => { @@ -972,7 +1047,10 @@ pub fn neq_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result Result { +pub fn eq_dyn_bool_scalar( + left: &dyn Array, + right: bool, +) -> Result { let result = match left.data_type() { DataType::Boolean => { let left = as_boolean_array(left); @@ -987,7 +1065,10 @@ pub fn eq_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result /// Perform `left < right` operation on an array and a numeric scalar /// value. Supports BooleanArrays. -pub fn lt_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result { +pub fn lt_dyn_bool_scalar( + left: &dyn Array, + right: bool, +) -> Result { let result = match left.data_type() { DataType::Boolean => { let left = as_boolean_array(left); @@ -1002,7 +1083,10 @@ pub fn lt_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result /// Perform `left > right` operation on an array and a numeric scalar /// value. Supports BooleanArrays. -pub fn gt_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result { +pub fn gt_dyn_bool_scalar( + left: &dyn Array, + right: bool, +) -> Result { let result = match left.data_type() { DataType::Boolean => { let left = as_boolean_array(left); @@ -1017,7 +1101,10 @@ pub fn gt_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result /// Perform `left <= right` operation on an array and a numeric scalar /// value. Supports BooleanArrays. -pub fn lt_eq_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result { +pub fn lt_eq_dyn_bool_scalar( + left: &dyn Array, + right: bool, +) -> Result { let result = match left.data_type() { DataType::Boolean => { let left = as_boolean_array(left); @@ -1032,7 +1119,10 @@ pub fn lt_eq_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result= right` operation on an array and a numeric scalar /// value. Supports BooleanArrays. 
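A minimal usage sketch for the boolean scalar kernels above, assuming they are called through the `arrow_ord::comparison` module introduced by this patch (the module paths and the small demo function are illustrative, not part of the patch):

use arrow_array::BooleanArray;
use arrow_ord::comparison::{eq_dyn_bool_scalar, gt_dyn_bool_scalar};

fn bool_scalar_demo() -> Result<(), arrow_schema::ArrowError> {
    let array = BooleanArray::from(vec![Some(true), Some(false), None]);
    // Element-wise `== true`; nulls propagate as nulls.
    assert_eq!(
        eq_dyn_bool_scalar(&array, true)?,
        BooleanArray::from(vec![Some(true), Some(false), None])
    );
    // Element-wise `> false`, i.e. only `true` compares greater.
    assert_eq!(
        gt_dyn_bool_scalar(&array, false)?,
        BooleanArray::from(vec![Some(true), Some(false), None])
    );
    Ok(())
}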
-pub fn gt_eq_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result { +pub fn gt_eq_dyn_bool_scalar( + left: &dyn Array, + right: bool, +) -> Result { let result = match left.data_type() { DataType::Boolean => { let left = as_boolean_array(left); @@ -1047,7 +1137,10 @@ pub fn gt_eq_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result Result { +pub fn neq_dyn_bool_scalar( + left: &dyn Array, + right: bool, +) -> Result { let result = match left.data_type() { DataType::Boolean => { let left = as_boolean_array(left); @@ -1067,10 +1160,10 @@ pub fn neq_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result( dict: &DictionaryArray, dict_comparison: BooleanArray, -) -> Result +) -> Result where - K: ArrowNumericType, - K::Native: ToPrimitive, + K: ArrowPrimitiveType, + K::Native: num::ToPrimitive, { // TODO: Use take_boolean (#2967) let array = take(&dict_comparison, dict.keys(), None)?; @@ -1085,7 +1178,7 @@ fn simd_compare_op( right: &PrimitiveArray, simd_op: SI, scalar_op: SC, -) -> Result +) -> Result where T: ArrowNumericType, SI: Fn(T::Simd, T::Simd) -> T::SimdMask, @@ -1185,7 +1278,7 @@ fn simd_compare_op_scalar( right: T::Native, simd_op: SI, scalar_op: SC, -) -> Result +) -> Result where T: ArrowNumericType, SI: Fn(T::Simd, T::Simd) -> T::SimdMask, @@ -1271,11 +1364,11 @@ where Ok(BooleanArray::from(data)) } -fn cmp_primitive_array( +fn cmp_primitive_array( left: &dyn Array, right: &dyn Array, op: F, -) -> Result +) -> Result where F: Fn(T::Native, T::Native) -> bool, { @@ -1836,10 +1929,10 @@ fn cmp_dict_primitive( left: &DictionaryArray, right: &dyn Array, op: F, -) -> Result +) -> Result where - K: ArrowNumericType, - T: ArrowNumericType + Sync + Send, + K: ArrowPrimitiveType, + T: ArrowPrimitiveType + Sync + Send, F: Fn(T::Native, T::Native) -> bool, { compare_op( @@ -1856,9 +1949,9 @@ fn cmp_dict_string_array( left: &DictionaryArray, right: &dyn Array, op: F, -) -> Result +) -> Result where - K: ArrowNumericType, + K: ArrowPrimitiveType, F: Fn(&str, &str) -> bool, { compare_op( @@ -1879,9 +1972,9 @@ fn cmp_dict_boolean_array( left: &DictionaryArray, right: &dyn Array, op: F, -) -> Result +) -> Result where - K: ArrowNumericType, + K: ArrowPrimitiveType, F: Fn(bool, bool) -> bool, { compare_op( @@ -1898,9 +1991,9 @@ fn cmp_dict_binary_array( left: &DictionaryArray, right: &dyn Array, op: F, -) -> Result +) -> Result where - K: ArrowNumericType, + K: ArrowPrimitiveType, F: Fn(&[u8], &[u8]) -> bool, { compare_op( @@ -1922,10 +2015,10 @@ pub fn cmp_dict( left: &DictionaryArray, right: &DictionaryArray, op: F, -) -> Result +) -> Result where - K: ArrowNumericType, - T: ArrowNumericType + Sync + Send, + K: ArrowPrimitiveType, + T: ArrowPrimitiveType + Sync + Send, F: Fn(T::Native, T::Native) -> bool, { compare_op( @@ -1942,9 +2035,9 @@ pub fn cmp_dict_bool( left: &DictionaryArray, right: &DictionaryArray, op: F, -) -> Result +) -> Result where - K: ArrowNumericType, + K: ArrowPrimitiveType, F: Fn(bool, bool) -> bool, { compare_op( @@ -1961,9 +2054,9 @@ pub fn cmp_dict_utf8( left: &DictionaryArray, right: &DictionaryArray, op: F, -) -> Result +) -> Result where - K: ArrowNumericType, + K: ArrowPrimitiveType, F: Fn(&str, &str) -> bool, { compare_op( @@ -1983,9 +2076,9 @@ pub fn cmp_dict_binary( left: &DictionaryArray, right: &DictionaryArray, op: F, -) -> Result +) -> Result where - K: ArrowNumericType, + K: ArrowPrimitiveType, F: Fn(&[u8], &[u8]) -> bool, { compare_op( @@ -2009,14 +2102,14 @@ where /// /// # Example /// ``` -/// use arrow::array::{StringArray, BooleanArray}; -/// 
use arrow::compute::eq_dyn; +/// use arrow_array::{StringArray, BooleanArray}; +/// use arrow_ord::comparison::eq_dyn; /// let array1 = StringArray::from(vec![Some("foo"), None, Some("bar")]); /// let array2 = StringArray::from(vec![Some("foo"), None, Some("baz")]); /// let result = eq_dyn(&array1, &array2).unwrap(); /// assert_eq!(BooleanArray::from(vec![Some(true), None, Some(false)]), result); /// ``` -pub fn eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { +pub fn eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) if matches!(right.data_type(), DataType::Dictionary(_, _)) => @@ -2052,8 +2145,8 @@ pub fn eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// /// # Example /// ``` -/// use arrow::array::{BinaryArray, BooleanArray}; -/// use arrow::compute::neq_dyn; +/// use arrow_array::{BinaryArray, BooleanArray}; +/// use arrow_ord::comparison::neq_dyn; /// let values1: Vec> = vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36])]; /// let values2: Vec> = vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x00])]; /// let array1 = BinaryArray::from(values1); @@ -2061,7 +2154,7 @@ pub fn eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// let result = neq_dyn(&array1, &array2).unwrap(); /// assert_eq!(BooleanArray::from(vec![Some(false), None, Some(true)]), result); /// ``` -pub fn neq_dyn(left: &dyn Array, right: &dyn Array) -> Result { +pub fn neq_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) if matches!(right.data_type(), DataType::Dictionary(_, _)) => @@ -2097,16 +2190,16 @@ pub fn neq_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// /// # Example /// ``` -/// use arrow::array::{PrimitiveArray, BooleanArray}; -/// use arrow::datatypes::Int32Type; -/// use arrow::compute::lt_dyn; +/// use arrow_array::{PrimitiveArray, BooleanArray}; +/// use arrow_array::types::Int32Type; +/// use arrow_ord::comparison::lt_dyn; /// let array1: PrimitiveArray = PrimitiveArray::from(vec![Some(0), Some(1), Some(2)]); /// let array2: PrimitiveArray = PrimitiveArray::from(vec![Some(1), Some(1), None]); /// let result = lt_dyn(&array1, &array2).unwrap(); /// assert_eq!(BooleanArray::from(vec![Some(true), Some(false), None]), result); /// ``` #[allow(clippy::bool_comparison)] -pub fn lt_dyn(left: &dyn Array, right: &dyn Array) -> Result { +pub fn lt_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) if matches!(right.data_type(), DataType::Dictionary(_, _)) => @@ -2142,15 +2235,18 @@ pub fn lt_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// /// # Example /// ``` -/// use arrow::array::{PrimitiveArray, BooleanArray}; -/// use arrow::datatypes::Date32Type; -/// use arrow::compute::lt_eq_dyn; +/// use arrow_array::{PrimitiveArray, BooleanArray}; +/// use arrow_array::types::Date32Type; +/// use arrow_ord::comparison::lt_eq_dyn; /// let array1: PrimitiveArray = vec![Some(12356), Some(13548), Some(-365), Some(365)].into(); /// let array2: PrimitiveArray = vec![Some(12355), Some(13548), Some(-364), None].into(); /// let result = lt_eq_dyn(&array1, &array2).unwrap(); /// assert_eq!(BooleanArray::from(vec![Some(false), Some(true), Some(true), None]), result); /// ``` -pub fn lt_eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { +pub fn lt_eq_dyn( + left: &dyn Array, + right: &dyn Array, +) -> Result { match left.data_type() { DataType::Dictionary(_, _) if matches!(right.data_type(), DataType::Dictionary(_, _)) => @@ 
-2186,15 +2282,15 @@ pub fn lt_eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// /// # Example /// ``` -/// use arrow::array::BooleanArray; -/// use arrow::compute::gt_dyn; +/// use arrow_array::BooleanArray; +/// use arrow_ord::comparison::gt_dyn; /// let array1 = BooleanArray::from(vec![Some(true), Some(false), None]); /// let array2 = BooleanArray::from(vec![Some(false), Some(true), None]); /// let result = gt_dyn(&array1, &array2).unwrap(); /// assert_eq!(BooleanArray::from(vec![Some(true), Some(false), None]), result); /// ``` #[allow(clippy::bool_comparison)] -pub fn gt_dyn(left: &dyn Array, right: &dyn Array) -> Result { +pub fn gt_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) if matches!(right.data_type(), DataType::Dictionary(_, _)) => @@ -2230,14 +2326,17 @@ pub fn gt_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// /// # Example /// ``` -/// use arrow::array::{BooleanArray, StringArray}; -/// use arrow::compute::gt_eq_dyn; +/// use arrow_array::{BooleanArray, StringArray}; +/// use arrow_ord::comparison::gt_eq_dyn; /// let array1 = StringArray::from(vec![Some(""), Some("aaa"), None]); /// let array2 = StringArray::from(vec![Some(" "), Some("aa"), None]); /// let result = gt_eq_dyn(&array1, &array2).unwrap(); /// assert_eq!(BooleanArray::from(vec![Some(false), Some(true), None]), result); /// ``` -pub fn gt_eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { +pub fn gt_eq_dyn( + left: &dyn Array, + right: &dyn Array, +) -> Result { match left.data_type() { DataType::Dictionary(_, _) if matches!(right.data_type(), DataType::Dictionary(_, _)) => @@ -2268,7 +2367,10 @@ pub fn gt_eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +pub fn eq( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -2285,7 +2387,10 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn eq_scalar(left: &PrimitiveArray, right: T::Native) -> Result +pub fn eq_scalar( + left: &PrimitiveArray, + right: T::Native, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -2297,7 +2402,10 @@ where } /// Applies an unary and infallible comparison function to a primitive array. -pub fn unary_cmp(left: &PrimitiveArray, op: F) -> Result +pub fn unary_cmp( + left: &PrimitiveArray, + op: F, +) -> Result where T: ArrowNumericType, F: Fn(T::Native) -> bool, @@ -2311,7 +2419,10 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. 
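The IEEE 754 totalOrder behaviour referenced in these doc comments can be observed with the standard library alone; a minimal sketch in plain Rust (no Arrow APIs) of the deterministic ordering it gives NaN and signed zeros:

use std::cmp::Ordering;

fn total_order_demo() {
    // totalOrder: -NaN < -inf < ... < -0.0 < +0.0 < ... < +inf < NaN
    assert_eq!((-0.0_f64).total_cmp(&0.0), Ordering::Less);
    assert_eq!(1.0_f64.total_cmp(&f64::NAN), Ordering::Less);
    assert_eq!(f64::NEG_INFINITY.total_cmp(&-1.0), Ordering::Less);
}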
-pub fn neq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +pub fn neq( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -2328,7 +2439,10 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn neq_scalar(left: &PrimitiveArray, right: T::Native) -> Result +pub fn neq_scalar( + left: &PrimitiveArray, + right: T::Native, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -2346,7 +2460,10 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn lt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +pub fn lt( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -2364,7 +2481,10 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn lt_scalar(left: &PrimitiveArray, right: T::Native) -> Result +pub fn lt_scalar( + left: &PrimitiveArray, + right: T::Native, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -2385,7 +2505,7 @@ where pub fn lt_eq( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -2403,7 +2523,10 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn lt_eq_scalar(left: &PrimitiveArray, right: T::Native) -> Result +pub fn lt_eq_scalar( + left: &PrimitiveArray, + right: T::Native, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -2421,7 +2544,10 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn gt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result +pub fn gt( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -2439,7 +2565,10 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. 
-pub fn gt_scalar(left: &PrimitiveArray, right: T::Native) -> Result +pub fn gt_scalar( + left: &PrimitiveArray, + right: T::Native, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -2460,7 +2589,7 @@ where pub fn gt_eq( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -2478,7 +2607,10 @@ where /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. -pub fn gt_eq_scalar(left: &PrimitiveArray, right: T::Native) -> Result +pub fn gt_eq_scalar( + left: &PrimitiveArray, + right: T::Native, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -2493,7 +2625,7 @@ where pub fn contains( left: &PrimitiveArray, right: &GenericListArray, -) -> Result +) -> Result where T: ArrowNumericType, OffsetSize: OffsetSizeTrait, @@ -2551,7 +2683,7 @@ where pub fn contains_utf8( left: &GenericStringArray, right: &ListArray, -) -> Result +) -> Result where OffsetSize: OffsetSizeTrait, { @@ -2620,12 +2752,12 @@ fn new_all_set_buffer(len: usize) -> Buffer { #[rustfmt::skip::macros(vec)] #[cfg(test)] mod tests { - use arrow_buffer::i256; - use std::sync::Arc; - use super::*; - use crate::datatypes::Int8Type; - use crate::{array::Int32Array, array::Int64Array, datatypes::Field}; + use arrow_array::builder::{ + ListBuilder, PrimitiveDictionaryBuilder, StringBuilder, StringDictionaryBuilder, + }; + use arrow_buffer::i256; + use arrow_schema::Field; /// Evaluate `KERNEL` with two vectors as inputs and assert against the expected output. /// `A_VEC` and `B_VEC` can be of type `Vec` or `Vec>` where `T` is the native @@ -3639,82 +3771,6 @@ mod tests { }; } - macro_rules! test_flag_utf8 { - ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { - #[test] - fn $test_name() { - let left = StringArray::from($left); - let right = StringArray::from($right); - let res = $op(&left, &right, None).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!(v, expected[i]); - } - } - }; - ($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr, $expected:expr) => { - #[test] - fn $test_name() { - let left = StringArray::from($left); - let right = StringArray::from($right); - let flag = Some(StringArray::from($flag)); - let res = $op(&left, &right, flag.as_ref()).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!(v, expected[i]); - } - } - }; - } - - macro_rules! 
test_flag_utf8_scalar { - ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { - #[test] - fn $test_name() { - let left = StringArray::from($left); - let res = $op(&left, $right, None).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!( - v, - expected[i], - "unexpected result when comparing {} at position {} to {} ", - left.value(i), - i, - $right - ); - } - } - }; - ($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr, $expected:expr) => { - #[test] - fn $test_name() { - let left = StringArray::from($left); - let flag = Some($flag); - let res = $op(&left, $right, flag).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!( - v, - expected[i], - "unexpected result when comparing {} at position {} to {} ", - left.value(i), - i, - $right - ); - } - } - }; - } - test_utf8!( test_utf8_array_eq, vec!["arrow", "arrow", "arrow", "arrow"], @@ -3804,44 +3860,6 @@ mod tests { gt_eq_utf8_scalar, vec![false, false, true, true] ); - test_flag_utf8!( - test_utf8_array_regexp_is_match, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"], - vec!["^ar", "^AR", "ow$", "OW$", "foo", ""], - regexp_is_match_utf8, - vec![true, false, true, false, false, true] - ); - test_flag_utf8!( - test_utf8_array_regexp_is_match_insensitive, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"], - vec!["^ar", "^AR", "ow$", "OW$", "foo", ""], - vec!["i"; 6], - regexp_is_match_utf8, - vec![true, true, true, true, false, true] - ); - - test_flag_utf8_scalar!( - test_utf8_array_regexp_is_match_scalar, - vec!["arrow", "ARROW", "parquet", "PARQUET"], - "^ar", - regexp_is_match_utf8_scalar, - vec![true, false, false, false] - ); - test_flag_utf8_scalar!( - test_utf8_array_regexp_is_match_empty_scalar, - vec!["arrow", "ARROW", "parquet", "PARQUET"], - "", - regexp_is_match_utf8_scalar, - vec![true, true, true, true] - ); - test_flag_utf8_scalar!( - test_utf8_array_regexp_is_match_insensitive_scalar, - vec!["arrow", "ARROW", "parquet", "PARQUET"], - "^ar", - "i", - regexp_is_match_utf8_scalar, - vec![true, true, false, false] - ); #[test] fn test_eq_dyn_scalar() { @@ -3881,8 +3899,7 @@ mod tests { ); assert_eq!(eq_dyn_scalar(&array, 8).unwrap(), expected); - let array: ArrayRef = Arc::new(array); - let array = crate::compute::cast(&array, &DataType::Float64).unwrap(); + let array = array.unary::<_, Float64Type>(|x| x as f64); assert_eq!(eq_dyn_scalar(&array, 8).unwrap(), expected); } @@ -3924,8 +3941,7 @@ mod tests { ); assert_eq!(lt_dyn_scalar(&array, 8).unwrap(), expected); - let array: ArrayRef = Arc::new(array); - let array = crate::compute::cast(&array, &DataType::Float64).unwrap(); + let array = array.unary::<_, Float64Type>(|x| x as f64); assert_eq!(lt_dyn_scalar(&array, 8).unwrap(), expected); } @@ -3967,8 +3983,7 @@ mod tests { ); assert_eq!(lt_eq_dyn_scalar(&array, 8).unwrap(), expected); - let array: ArrayRef = Arc::new(array); - let array = crate::compute::cast(&array, &DataType::Float64).unwrap(); + let array = array.unary::<_, Float64Type>(|x| x as f64); assert_eq!(lt_eq_dyn_scalar(&array, 8).unwrap(), expected); } @@ -4010,8 +4025,7 @@ mod tests { ); assert_eq!(gt_dyn_scalar(&array, 8).unwrap(), expected); - let array: ArrayRef = Arc::new(array); - let array = crate::compute::cast(&array, &DataType::Float64).unwrap(); + let array = array.unary::<_, Float64Type>(|x| x as f64); 
assert_eq!(gt_dyn_scalar(&array, 8).unwrap(), expected); } @@ -4053,8 +4067,7 @@ mod tests { ); assert_eq!(gt_eq_dyn_scalar(&array, 8).unwrap(), expected); - let array: ArrayRef = Arc::new(array); - let array = crate::compute::cast(&array, &DataType::Float64).unwrap(); + let array = array.unary::<_, Float64Type>(|x| x as f64); assert_eq!(gt_eq_dyn_scalar(&array, 8).unwrap(), expected); } @@ -4096,8 +4109,7 @@ mod tests { ); assert_eq!(neq_dyn_scalar(&array, 8).unwrap(), expected); - let array: ArrayRef = Arc::new(array); - let array = crate::compute::cast(&array, &DataType::Float64).unwrap(); + let array = array.unary::<_, Float64Type>(|x| x as f64); assert_eq!(neq_dyn_scalar(&array, 8).unwrap(), expected); } @@ -4433,8 +4445,6 @@ mod tests { #[test] fn test_eq_dyn_neq_dyn_fixed_size_binary() { - use crate::array::FixedSizeBinaryArray; - let values1: Vec> = vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x01])]; let values2: Vec> = diff --git a/arrow-ord/src/lib.rs b/arrow-ord/src/lib.rs new file mode 100644 index 000000000000..c84db09fd32e --- /dev/null +++ b/arrow-ord/src/lib.rs @@ -0,0 +1,23 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Arrow ordering kernels + +pub mod comparison; +pub mod ord; +pub mod partition; +pub mod sort; diff --git a/arrow/src/array/ord.rs b/arrow-ord/src/ord.rs similarity index 90% rename from arrow/src/array/ord.rs rename to arrow-ord/src/ord.rs index 305d41cc0167..44eb3b183802 100644 --- a/arrow/src/array/ord.rs +++ b/arrow-ord/src/ord.rs @@ -17,14 +17,12 @@ //! Contains functions and function factories to compare arrays. -use std::cmp::Ordering; - -use crate::array::*; -use crate::datatypes::TimeUnit; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; - +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::ArrowNativeType; +use arrow_schema::{ArrowError, DataType}; use num::Float; +use std::cmp::Ordering; /// Compare the values at two arbitrary indices in two arrays. pub type DynComparator = Box Ordering + Send + Sync>; @@ -130,7 +128,7 @@ fn cmp_dict_primitive( key_type: &DataType, left: &dyn Array, right: &dyn Array, -) -> Result +) -> Result where VT: ArrowPrimitiveType, VT::Native: Ord, @@ -160,25 +158,24 @@ where /// The arrays' types must be equal. 
/// # Example /// ``` -/// use arrow::array::{build_compare, Int32Array}; +/// use arrow_array::Int32Array; +/// use arrow_ord::ord::build_compare; /// -/// # fn main() -> arrow::error::Result<()> { /// let array1 = Int32Array::from(vec![1, 2]); /// let array2 = Int32Array::from(vec![3, 4]); /// -/// let cmp = build_compare(&array1, &array2)?; +/// let cmp = build_compare(&array1, &array2).unwrap(); /// /// // 1 (index 0 of array1) is smaller than 4 (index 1 of array2) /// assert_eq!(std::cmp::Ordering::Less, (cmp)(0, 1)); -/// # Ok(()) -/// # } /// ``` // This is a factory of comparisons. // The lifetime 'a enforces that we cannot use the closure beyond any of the array's lifetime. -pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result { - use DataType::*; - use IntervalUnit::*; - use TimeUnit::*; +pub fn build_compare( + left: &dyn Array, + right: &dyn Array, +) -> Result { + use arrow_schema::{DataType::*, IntervalUnit::*, TimeUnit::*}; Ok(match (left.data_type(), right.data_type()) { (a, b) if a != b => { return Err(ArrowError::InvalidArgumentError( @@ -315,130 +312,119 @@ pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result Result<()> { + fn test_fixed_size_binary() { let items = vec![vec![1u8], vec![2u8]]; let array = FixedSizeBinaryArray::try_from_iter(items.into_iter()).unwrap(); - let cmp = build_compare(&array, &array)?; + let cmp = build_compare(&array, &array).unwrap(); assert_eq!(Ordering::Less, (cmp)(0, 1)); - Ok(()) } #[test] - fn test_fixed_size_binary_fixed_size_binary() -> Result<()> { + fn test_fixed_size_binary_fixed_size_binary() { let items = vec![vec![1u8]]; let array1 = FixedSizeBinaryArray::try_from_iter(items.into_iter()).unwrap(); let items = vec![vec![2u8]]; let array2 = FixedSizeBinaryArray::try_from_iter(items.into_iter()).unwrap(); - let cmp = build_compare(&array1, &array2)?; + let cmp = build_compare(&array1, &array2).unwrap(); assert_eq!(Ordering::Less, (cmp)(0, 0)); - Ok(()) } #[test] - fn test_i32() -> Result<()> { + fn test_i32() { let array = Int32Array::from(vec![1, 2]); - let cmp = build_compare(&array, &array)?; + let cmp = build_compare(&array, &array).unwrap(); assert_eq!(Ordering::Less, (cmp)(0, 1)); - Ok(()) } #[test] - fn test_i32_i32() -> Result<()> { + fn test_i32_i32() { let array1 = Int32Array::from(vec![1]); let array2 = Int32Array::from(vec![2]); - let cmp = build_compare(&array1, &array2)?; + let cmp = build_compare(&array1, &array2).unwrap(); assert_eq!(Ordering::Less, (cmp)(0, 0)); - Ok(()) } #[test] - fn test_f64() -> Result<()> { + fn test_f64() { let array = Float64Array::from(vec![1.0, 2.0]); - let cmp = build_compare(&array, &array)?; + let cmp = build_compare(&array, &array).unwrap(); assert_eq!(Ordering::Less, (cmp)(0, 1)); - Ok(()) } #[test] - fn test_f64_nan() -> Result<()> { + fn test_f64_nan() { let array = Float64Array::from(vec![1.0, f64::NAN]); - let cmp = build_compare(&array, &array)?; + let cmp = build_compare(&array, &array).unwrap(); assert_eq!(Ordering::Less, (cmp)(0, 1)); - Ok(()) } #[test] - fn test_f64_zeros() -> Result<()> { + fn test_f64_zeros() { let array = Float64Array::from(vec![-0.0, 0.0]); - let cmp = build_compare(&array, &array)?; + let cmp = build_compare(&array, &array).unwrap(); assert_eq!(Ordering::Equal, (cmp)(0, 1)); assert_eq!(Ordering::Equal, (cmp)(1, 0)); - Ok(()) } #[test] - fn test_decimal() -> Result<()> { + fn test_decimal() { let array = vec![Some(5_i128), Some(2_i128), Some(3_i128)] .into_iter() .collect::() .with_precision_and_scale(23, 6) .unwrap(); - let 
cmp = build_compare(&array, &array)?; + let cmp = build_compare(&array, &array).unwrap(); assert_eq!(Ordering::Less, (cmp)(1, 0)); assert_eq!(Ordering::Greater, (cmp)(0, 2)); - Ok(()) } #[test] - fn test_dict() -> Result<()> { + fn test_dict() { let data = vec!["a", "b", "c", "a", "a", "c", "c"]; let array = data.into_iter().collect::>(); - let cmp = build_compare(&array, &array)?; + let cmp = build_compare(&array, &array).unwrap(); assert_eq!(Ordering::Less, (cmp)(0, 1)); assert_eq!(Ordering::Equal, (cmp)(3, 4)); assert_eq!(Ordering::Greater, (cmp)(2, 3)); - Ok(()) } #[test] - fn test_multiple_dict() -> Result<()> { + fn test_multiple_dict() { let d1 = vec!["a", "b", "c", "d"]; let a1 = d1.into_iter().collect::>(); let d2 = vec!["e", "f", "g", "a"]; let a2 = d2.into_iter().collect::>(); - let cmp = build_compare(&a1, &a2)?; + let cmp = build_compare(&a1, &a2).unwrap(); assert_eq!(Ordering::Less, (cmp)(0, 0)); assert_eq!(Ordering::Equal, (cmp)(0, 3)); assert_eq!(Ordering::Greater, (cmp)(1, 3)); - Ok(()) } #[test] - fn test_primitive_dict() -> Result<()> { + fn test_primitive_dict() { let values = Int32Array::from(vec![1_i32, 0, 2, 5]); let keys = Int8Array::from_iter_values([0, 0, 1, 3]); let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); @@ -447,13 +433,12 @@ pub mod tests { let keys = Int8Array::from_iter_values([0, 1, 1, 3]); let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); - let cmp = build_compare(&array1, &array2)?; + let cmp = build_compare(&array1, &array2).unwrap(); assert_eq!(Ordering::Less, (cmp)(0, 0)); assert_eq!(Ordering::Less, (cmp)(0, 3)); assert_eq!(Ordering::Equal, (cmp)(3, 3)); assert_eq!(Ordering::Greater, (cmp)(3, 1)); assert_eq!(Ordering::Greater, (cmp)(3, 2)); - Ok(()) } } diff --git a/arrow/src/compute/kernels/partition.rs b/arrow-ord/src/partition.rs similarity index 82% rename from arrow/src/compute/kernels/partition.rs rename to arrow-ord/src/partition.rs index 0e48e627e655..26a030beb35e 100644 --- a/arrow/src/compute/kernels/partition.rs +++ b/arrow-ord/src/partition.rs @@ -17,11 +17,9 @@ //! Defines partition kernel for `ArrayRef` -use crate::compute::kernels::sort::LexicographicalComparator; -use crate::compute::SortColumn; -use crate::error::{ArrowError, Result}; +use crate::sort::{LexicographicalComparator, SortColumn}; +use arrow_schema::ArrowError; use std::cmp::Ordering; -use std::iter::Iterator; use std::ops::Range; /// Given a list of already sorted columns, find partition ranges that would partition @@ -35,7 +33,7 @@ use std::ops::Range; /// range. 
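A condensed sketch of driving this partition kernel, adapted from the tests further down in this patch and assuming the new `arrow_ord` crate paths (the demo function itself is illustrative):

use std::sync::Arc;
use arrow_array::{ArrayRef, Int64Array};
use arrow_ord::partition::lexicographical_partition_ranges;
use arrow_ord::sort::{SortColumn, SortOptions};

fn partition_demo() -> Result<(), arrow_schema::ArrowError> {
    // The column must already be sorted consistently with the supplied options.
    let column = SortColumn {
        values: Arc::new(Int64Array::from(vec![1, 2, 2, 9])) as ArrayRef,
        options: Some(SortOptions {
            descending: false,
            nulls_first: true,
        }),
    };
    let ranges: Vec<_> = lexicographical_partition_ranges(&[column])?.collect();
    // Runs of lexicographically equal rows collapse into one range each.
    assert_eq!(
        ranges,
        vec![(0_usize..1_usize), (1_usize..3_usize), (3_usize..4_usize)]
    );
    Ok(())
}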
pub fn lexicographical_partition_ranges( columns: &[SortColumn], -) -> Result> + '_> { +) -> Result> + '_, ArrowError> { LexicographicalPartitionIterator::try_new(columns) } @@ -47,7 +45,9 @@ struct LexicographicalPartitionIterator<'a> { } impl<'a> LexicographicalPartitionIterator<'a> { - fn try_new(columns: &'a [SortColumn]) -> Result { + fn try_new( + columns: &'a [SortColumn], + ) -> Result { if columns.is_empty() { return Err(ArrowError::InvalidArgumentError( "Sort requires at least one column".to_string(), @@ -90,7 +90,7 @@ fn exponential_search_next_partition_point( let target = start; let mut bound = 1; while bound + start < end - && comparator.compare(&(bound + start), &target) != Ordering::Greater + && comparator.compare(bound + start, target) != Ordering::Greater { bound *= 2; } @@ -101,7 +101,7 @@ fn exponential_search_next_partition_point( // note here we have right = min(end, start + bound + 1) because (start + bound) might // actually be considered and must be included. partition_point(start + bound / 2, end.min(start + bound + 1), |idx| { - comparator.compare(&idx, &target) != Ordering::Greater + comparator.compare(idx, target) != Ordering::Greater }) } @@ -162,9 +162,9 @@ impl<'a> Iterator for LexicographicalPartitionIterator<'a> { #[cfg(test)] mod tests { use super::*; - use crate::array::*; - use crate::compute::SortOptions; - use crate::datatypes::DataType; + use crate::sort::SortOptions; + use arrow_array::*; + use arrow_schema::DataType; use std::sync::Arc; #[test] @@ -233,7 +233,7 @@ mod tests { } #[test] - fn test_lexicographical_partition_single_column() -> Result<()> { + fn test_lexicographical_partition_single_column() { let input = vec![SortColumn { values: Arc::new(Int64Array::from(vec![1, 2, 2, 2, 2, 2, 2, 2, 9])) as ArrayRef, @@ -242,18 +242,15 @@ mod tests { nulls_first: true, }), }]; - { - let results = lexicographical_partition_ranges(&input)?; - assert_eq!( - vec![(0_usize..1_usize), (1_usize..8_usize), (8_usize..9_usize)], - results.collect::>() - ); - } - Ok(()) + let results = lexicographical_partition_ranges(&input).unwrap(); + assert_eq!( + vec![(0_usize..1_usize), (1_usize..8_usize), (8_usize..9_usize)], + results.collect::>() + ); } #[test] - fn test_lexicographical_partition_all_equal_values() -> Result<()> { + fn test_lexicographical_partition_all_equal_values() { let input = vec![SortColumn { values: Arc::new(Int64Array::from_value(1, 1000)) as ArrayRef, options: Some(SortOptions { @@ -262,15 +259,12 @@ mod tests { }), }]; - { - let results = lexicographical_partition_ranges(&input)?; - assert_eq!(vec![(0_usize..1000_usize)], results.collect::>()); - } - Ok(()) + let results = lexicographical_partition_ranges(&input).unwrap(); + assert_eq!(vec![(0_usize..1000_usize)], results.collect::>()); } #[test] - fn test_lexicographical_partition_all_null_values() -> Result<()> { + fn test_lexicographical_partition_all_null_values() { let input = vec![ SortColumn { values: new_null_array(&DataType::Int8, 1000), @@ -287,15 +281,12 @@ mod tests { }), }, ]; - { - let results = lexicographical_partition_ranges(&input)?; - assert_eq!(vec![(0_usize..1000_usize)], results.collect::>()); - } - Ok(()) + let results = lexicographical_partition_ranges(&input).unwrap(); + assert_eq!(vec![(0_usize..1000_usize)], results.collect::>()); } #[test] - fn test_lexicographical_partition_unique_column_1() -> Result<()> { + fn test_lexicographical_partition_unique_column_1() { let input = vec![ SortColumn { values: Arc::new(Int64Array::from(vec![None, Some(-1)])) as ArrayRef, @@ 
-313,18 +304,15 @@ mod tests { }), }, ]; - { - let results = lexicographical_partition_ranges(&input)?; - assert_eq!( - vec![(0_usize..1_usize), (1_usize..2_usize)], - results.collect::>() - ); - } - Ok(()) + let results = lexicographical_partition_ranges(&input).unwrap(); + assert_eq!( + vec![(0_usize..1_usize), (1_usize..2_usize)], + results.collect::>() + ); } #[test] - fn test_lexicographical_partition_unique_column_2() -> Result<()> { + fn test_lexicographical_partition_unique_column_2() { let input = vec![ SortColumn { values: Arc::new(Int64Array::from(vec![None, Some(-1), Some(-1)])) @@ -346,18 +334,15 @@ mod tests { }), }, ]; - { - let results = lexicographical_partition_ranges(&input)?; - assert_eq!( - vec![(0_usize..1_usize), (1_usize..2_usize), (2_usize..3_usize),], - results.collect::>() - ); - } - Ok(()) + let results = lexicographical_partition_ranges(&input).unwrap(); + assert_eq!( + vec![(0_usize..1_usize), (1_usize..2_usize), (2_usize..3_usize),], + results.collect::>() + ); } #[test] - fn test_lexicographical_partition_non_unique_column_1() -> Result<()> { + fn test_lexicographical_partition_non_unique_column_1() { let input = vec![ SortColumn { values: Arc::new(Int64Array::from(vec![ @@ -384,13 +369,10 @@ mod tests { }), }, ]; - { - let results = lexicographical_partition_ranges(&input)?; - assert_eq!( - vec![(0_usize..1_usize), (1_usize..3_usize), (3_usize..4_usize),], - results.collect::>() - ); - } - Ok(()) + let results = lexicographical_partition_ranges(&input).unwrap(); + assert_eq!( + vec![(0_usize..1_usize), (1_usize..3_usize), (3_usize..4_usize),], + results.collect::>() + ); } } diff --git a/arrow/src/compute/kernels/sort.rs b/arrow-ord/src/sort.rs similarity index 98% rename from arrow/src/compute/kernels/sort.rs rename to arrow-ord/src/sort.rs index 81895760e588..a2035988fe25 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow-ord/src/sort.rs @@ -17,14 +17,15 @@ //! Defines sort kernel for `ArrayRef` -use crate::array::*; -use crate::buffer::MutableBuffer; -use crate::compute::take; -use crate::datatypes::*; -use crate::downcast_dictionary_array; -use crate::error::{ArrowError, Result}; +use crate::ord::{build_compare, DynComparator}; +use arrow_array::cast::*; +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::{ArrowNativeType, MutableBuffer}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; +use arrow_select::take::take; use std::cmp::Ordering; -use TimeUnit::*; /// Sort the `ArrayRef` using `SortOptions`. 
/// @@ -41,18 +42,17 @@ use TimeUnit::*; /// # Example /// ```rust /// # use std::sync::Arc; -/// # use arrow::array::{Int32Array, ArrayRef}; -/// # use arrow::error::Result; -/// # use arrow::compute::kernels::sort::sort; -/// # fn main() -> Result<()> { +/// # use arrow_array::{Int32Array, ArrayRef}; +/// # use arrow_ord::sort::sort; /// let array: ArrayRef = Arc::new(Int32Array::from(vec![5, 4, 3, 2, 1])); /// let sorted_array = sort(&array, None).unwrap(); /// let sorted_array = sorted_array.as_any().downcast_ref::().unwrap(); /// assert_eq!(sorted_array, &Int32Array::from(vec![1, 2, 3, 4, 5])); -/// # Ok(()) -/// # } /// ``` -pub fn sort(values: &ArrayRef, options: Option) -> Result { +pub fn sort( + values: &ArrayRef, + options: Option, +) -> Result { let indices = sort_to_indices(values, options, None)?; take(values.as_ref(), &indices, None) } @@ -69,10 +69,8 @@ pub fn sort(values: &ArrayRef, options: Option) -> Result /// # Example /// ```rust /// # use std::sync::Arc; -/// # use arrow::array::{Int32Array, ArrayRef}; -/// # use arrow::error::Result; -/// # use arrow::compute::kernels::sort::{sort_limit, SortOptions}; -/// # fn main() -> Result<()> { +/// # use arrow_array::{Int32Array, ArrayRef}; +/// # use arrow_ord::sort::{sort_limit, SortOptions}; /// let array: ArrayRef = Arc::new(Int32Array::from(vec![5, 4, 3, 2, 1])); /// /// // Find the the top 2 items @@ -88,14 +86,12 @@ pub fn sort(values: &ArrayRef, options: Option) -> Result /// let sorted_array = sort_limit(&array, options, Some(2)).unwrap(); /// let sorted_array = sorted_array.as_any().downcast_ref::().unwrap(); /// assert_eq!(sorted_array, &Int32Array::from(vec![5, 4])); -/// # Ok(()) -/// # } /// ``` pub fn sort_limit( values: &ArrayRef, options: Option, limit: Option, -) -> Result { +) -> Result { let indices = sort_to_indices(values, options, limit)?; take(values.as_ref(), &indices, None) } @@ -139,7 +135,7 @@ pub fn sort_to_indices( values: &ArrayRef, options: Option, limit: Option, -) -> Result { +) -> Result { let options = options.unwrap_or_default(); let (v, n) = partition_validity(values); @@ -198,32 +194,32 @@ pub fn sort_to_indices( DataType::Date64 => { sort_primitive::(values, v, n, cmp, &options, limit) } - DataType::Time32(Second) => { + DataType::Time32(TimeUnit::Second) => { sort_primitive::(values, v, n, cmp, &options, limit) } - DataType::Time32(Millisecond) => { + DataType::Time32(TimeUnit::Millisecond) => { sort_primitive::(values, v, n, cmp, &options, limit) } - DataType::Time64(Microsecond) => { + DataType::Time64(TimeUnit::Microsecond) => { sort_primitive::(values, v, n, cmp, &options, limit) } - DataType::Time64(Nanosecond) => { + DataType::Time64(TimeUnit::Nanosecond) => { sort_primitive::(values, v, n, cmp, &options, limit) } - DataType::Timestamp(Second, _) => { + DataType::Timestamp(TimeUnit::Second, _) => { sort_primitive::(values, v, n, cmp, &options, limit) } - DataType::Timestamp(Millisecond, _) => { + DataType::Timestamp(TimeUnit::Millisecond, _) => { sort_primitive::( values, v, n, cmp, &options, limit, ) } - DataType::Timestamp(Microsecond, _) => { + DataType::Timestamp(TimeUnit::Microsecond, _) => { sort_primitive::( values, v, n, cmp, &options, limit, ) } - DataType::Timestamp(Nanosecond, _) => { + DataType::Timestamp(TimeUnit::Nanosecond, _) => { sort_primitive::( values, v, n, cmp, &options, limit, ) @@ -857,11 +853,12 @@ pub struct SortColumn { /// Example: /// /// ``` -/// use std::convert::From; -/// use std::sync::Arc; -/// use arrow::array::{ArrayRef, StringArray, 
PrimitiveArray, as_primitive_array}; -/// use arrow::compute::kernels::sort::{SortColumn, SortOptions, lexsort}; -/// use arrow::datatypes::Int64Type; +/// # use std::convert::From; +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, StringArray, PrimitiveArray}; +/// # use arrow_array::types::Int64Type; +/// # use arrow_array::cast::as_primitive_array; +/// # use arrow_ord::sort::{SortColumn, SortOptions, lexsort}; /// /// let sorted_columns = lexsort(&vec![ /// SortColumn { @@ -893,10 +890,13 @@ pub struct SortColumn { /// assert!(sorted_columns[0].is_null(0)); /// ``` /// -/// Note: for multi-column sorts without a limit, using the [row format][crate::row] +/// Note: for multi-column sorts without a limit, using the [row format](https://docs.rs/arrow/latest/arrow/row/) /// may be significantly faster /// -pub fn lexsort(columns: &[SortColumn], limit: Option) -> Result> { +pub fn lexsort( + columns: &[SortColumn], + limit: Option, +) -> Result, ArrowError> { let indices = lexsort_to_indices(columns, limit)?; columns .iter() @@ -907,12 +907,12 @@ pub fn lexsort(columns: &[SortColumn], limit: Option) -> Result, -) -> Result { +) -> Result { if columns.is_empty() { return Err(ArrowError::InvalidArgumentError( "Sort requires at least one column".to_string(), @@ -941,7 +941,7 @@ pub fn lexsort_to_indices( let lexicographical_comparator = LexicographicalComparator::try_new(columns)?; // uint32 can be sorted unstably sort_unstable_by(&mut value_indices, len, |a, b| { - lexicographical_comparator.compare(a, b) + lexicographical_comparator.compare(*a, *b) }); Ok(UInt32Array::from_iter_values( @@ -966,21 +966,17 @@ type LexicographicalCompareItem<'a> = ( /// A lexicographical comparator that wraps given array data (columns) and can lexicographically compare data /// at given two indices. The lifetime is the same at the data wrapped. -pub(crate) struct LexicographicalComparator<'a> { +pub struct LexicographicalComparator<'a> { compare_items: Vec>, } impl LexicographicalComparator<'_> { /// lexicographically compare values at the wrapped columns with given indices. - pub(crate) fn compare<'a, 'b>( - &'a self, - a_idx: &'b usize, - b_idx: &'b usize, - ) -> Ordering { + pub fn compare(&self, a_idx: usize, b_idx: usize) -> Ordering { for (data, comparator, sort_option) in &self.compare_items { - match (data.is_valid(*a_idx), data.is_valid(*b_idx)) { + match (data.is_valid(a_idx), data.is_valid(b_idx)) { (true, true) => { - match (comparator)(*a_idx, *b_idx) { + match (comparator)(a_idx, b_idx) { // equal, move on to next column Ordering::Equal => continue, order => { @@ -1016,9 +1012,9 @@ impl LexicographicalComparator<'_> { /// Create a new lex comparator that will wrap the given sort columns and give comparison /// results with two indices. - pub(crate) fn try_new( + pub fn try_new( columns: &[SortColumn], - ) -> Result> { + ) -> Result, ArrowError> { let compare_items = columns .iter() .map(|column| { @@ -1032,7 +1028,7 @@ impl LexicographicalComparator<'_> { column.options.unwrap_or_default(), )) }) - .collect::>>()?; + .collect::, ArrowError>>()?; Ok(LexicographicalComparator { compare_items }) } } diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index bb4b2b0a8268..ddb47969cf29 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -286,4 +286,119 @@ mod tests { let result = actual.as_any().downcast_ref::().unwrap(); assert_eq!(&expected, result); } + + macro_rules! 
test_flag_utf8 { + ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { + #[test] + fn $test_name() { + let left = StringArray::from($left); + let right = StringArray::from($right); + let res = $op(&left, &right, None).unwrap(); + let expected = $expected; + assert_eq!(expected.len(), res.len()); + for i in 0..res.len() { + let v = res.value(i); + assert_eq!(v, expected[i]); + } + } + }; + ($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr, $expected:expr) => { + #[test] + fn $test_name() { + let left = StringArray::from($left); + let right = StringArray::from($right); + let flag = Some(StringArray::from($flag)); + let res = $op(&left, &right, flag.as_ref()).unwrap(); + let expected = $expected; + assert_eq!(expected.len(), res.len()); + for i in 0..res.len() { + let v = res.value(i); + assert_eq!(v, expected[i]); + } + } + }; + } + + macro_rules! test_flag_utf8_scalar { + ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { + #[test] + fn $test_name() { + let left = StringArray::from($left); + let res = $op(&left, $right, None).unwrap(); + let expected = $expected; + assert_eq!(expected.len(), res.len()); + for i in 0..res.len() { + let v = res.value(i); + assert_eq!( + v, + expected[i], + "unexpected result when comparing {} at position {} to {} ", + left.value(i), + i, + $right + ); + } + } + }; + ($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr, $expected:expr) => { + #[test] + fn $test_name() { + let left = StringArray::from($left); + let flag = Some($flag); + let res = $op(&left, $right, flag).unwrap(); + let expected = $expected; + assert_eq!(expected.len(), res.len()); + for i in 0..res.len() { + let v = res.value(i); + assert_eq!( + v, + expected[i], + "unexpected result when comparing {} at position {} to {} ", + left.value(i), + i, + $right + ); + } + } + }; + } + + test_flag_utf8!( + test_utf8_array_regexp_is_match, + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"], + vec!["^ar", "^AR", "ow$", "OW$", "foo", ""], + regexp_is_match_utf8, + vec![true, false, true, false, false, true] + ); + test_flag_utf8!( + test_utf8_array_regexp_is_match_insensitive, + vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"], + vec!["^ar", "^AR", "ow$", "OW$", "foo", ""], + vec!["i"; 6], + regexp_is_match_utf8, + vec![true, true, true, true, false, true] + ); + + test_flag_utf8_scalar!( + test_utf8_array_regexp_is_match_scalar, + vec!["arrow", "ARROW", "parquet", "PARQUET"], + "^ar", + regexp_is_match_utf8_scalar, + vec![true, false, false, false] + ); + test_flag_utf8_scalar!( + test_utf8_array_regexp_is_match_empty_scalar, + vec!["arrow", "ARROW", "parquet", "PARQUET"], + "", + regexp_is_match_utf8_scalar, + vec![true, true, true, true] + ); + test_flag_utf8_scalar!( + test_utf8_array_regexp_is_match_insensitive_scalar, + vec!["arrow", "ARROW", "parquet", "PARQUET"], + "^ar", + "i", + regexp_is_match_utf8_scalar, + vec![true, true, false, false] + ); } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 17f88c084cbc..86029cc1a3be 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -52,6 +52,7 @@ arrow-csv = { version = "28.0.0", path = "../arrow-csv", optional = true } arrow-data = { version = "28.0.0", path = "../arrow-data" } arrow-ipc = { version = "28.0.0", path = "../arrow-ipc", optional = true } arrow-json = { version = "28.0.0", path = "../arrow-json", optional = true } +arrow-ord = { version = "28.0.0", path = "../arrow-ord" } arrow-schema = { version = "28.0.0", path = "../arrow-schema" } 
arrow-select = { version = "28.0.0", path = "../arrow-select" } arrow-string = { version = "28.0.0", path = "../arrow-string" } @@ -60,7 +61,6 @@ num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.13", default-features = false } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } -packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } comfy-table = { version = "6.0", optional = true, default-features = false } pyo3 = { version = "0.17", default-features = false, optional = true } @@ -76,7 +76,7 @@ ipc_compression = ["ipc", "arrow-ipc/lz4", "arrow-ipc/zstd"] csv = ["arrow-csv"] ipc = ["arrow-ipc"] json = ["arrow-json"] -simd = ["packed_simd"] +simd = ["arrow-array/simd", "arrow-ord/simd"] prettyprint = ["comfy-table"] # The test utils feature enables code used in benchmarks and tests but # not the core arrow code itself. Be aware that `rand` must be kept as @@ -92,7 +92,7 @@ force_validate = ["arrow-data/force_validate"] ffi = ["bitflags"] # Enable dyn-comparison of dictionary arrays with other arrays # Note: this does not impact comparison against scalars -dyn_cmp_dict = ["arrow-string/dyn_cmp_dict"] +dyn_cmp_dict = ["arrow-string/dyn_cmp_dict", "arrow-ord/dyn_cmp_dict"] # Enable dyn-arithmetic kernels for dictionary arrays # Note: this does not impact arithmetic with scalars dyn_arith_dict = [] diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index af774de0a263..1a10725df678 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -21,7 +21,6 @@ #[cfg(feature = "ffi")] mod ffi; -mod ord; // --------------------- Array & ArrayData --------------------- pub use arrow_array::array::*; @@ -39,4 +38,4 @@ pub use self::ffi::{export_array_into_raw, make_array_from_raw}; // --------------------- Array's values comparison --------------------- -pub use self::ord::{build_compare, DynComparator}; +pub use arrow_ord::ord::{build_compare, DynComparator}; diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs index 29468861f82a..837fb73d56d1 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels/mod.rs @@ -22,13 +22,18 @@ pub mod arithmetic; pub mod arity; pub mod bitwise; pub mod boolean; -pub mod comparison; pub mod limit; -pub mod partition; -pub mod sort; pub mod temporal; pub use arrow_cast::cast; pub use arrow_cast::parse as cast_utils; +pub use arrow_ord::{partition, sort}; pub use arrow_select::{concat, filter, interleave, take, window, zip}; pub use arrow_string::{concat_elements, length, regexp, substring}; + +/// Comparison kernels for `Array`s. +pub mod comparison { + pub use arrow_ord::comparison::*; + pub use arrow_string::like::*; + pub use arrow_string::regexp::{regexp_is_match_utf8, regexp_is_match_utf8_scalar}; +} diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 5d625a051fd0..c25240096812 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -22,11 +22,10 @@ //! * [`Field`](crate::datatypes::Field) to describe one field within a schema. //! * [`DataType`](crate::datatypes::DataType) to describe the type of a field. 
-mod numeric; -pub use numeric::*; - pub use arrow_array::types::*; -pub use arrow_array::{ArrowNativeTypeOp, ArrowPrimitiveType}; +pub use arrow_array::{ + ArrowFloatNumericType, ArrowNativeTypeOp, ArrowNumericType, ArrowPrimitiveType, +}; pub use arrow_buffer::{i256, ArrowNativeType, ToByteSlice}; pub use arrow_data::decimal::*; pub use arrow_schema::{ diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index a27e6b9af44a..d57168dc9ea2 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -36,7 +36,9 @@ //! * [`arrow-cast`][arrow_cast] - cast kernels for arrow arrays //! * [`arrow-csv`][arrow_csv] - read/write CSV to arrow format //! * [`arrow-data`][arrow_data] - the underlying data of arrow arrays +//! * [`arrow-ipc`][arrow_ipc] - read/write IPC to arrow format //! * [`arrow-json`][arrow_json] - read/write JSON to arrow format +//! * [`arrow-ord`][arrow_ord] - ordering kernels for arrow arrays //! * [`arrow-schema`][arrow_schema] - the logical types for arrow arrays //! * [`arrow-select`][arrow_select] - selection kernels for arrow arrays //! * [`arrow-string`][arrow_string] - string kernels for arrow arrays diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index ea3def6ac831..bf58cf2f01ea 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -1264,13 +1264,13 @@ mod tests { use arrow_array::NullArray; use arrow_buffer::Buffer; + use arrow_ord::sort::{LexicographicalComparator, SortColumn, SortOptions}; use crate::array::{ BinaryArray, BooleanArray, DictionaryArray, Float32Array, GenericStringArray, Int16Array, Int32Array, OffsetSizeTrait, PrimitiveArray, PrimitiveDictionaryBuilder, StringArray, }; - use crate::compute::{LexicographicalComparator, SortColumn}; use crate::util::display::array_value_to_string; use super::*; @@ -2154,7 +2154,7 @@ mod tests { let row_i = rows.row(i); let row_j = rows.row(j); let row_cmp = row_i.cmp(&row_j); - let lex_cmp = comparator.compare(&i, &j); + let lex_cmp = comparator.compare(i, j); assert_eq!( row_cmp, lex_cmp, diff --git a/dev/release/README.md b/dev/release/README.md index 0e35f80aaf9f..ae36ef156e36 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -259,6 +259,7 @@ Rust Arrow Crates: (cd arrow-select && cargo publish) (cd arrow-cast && cargo publish) (cd arrow-string && cargo publish) +(cd arrow-ord && cargo publish) (cd arrow-ipc && cargo publish) (cd arrow-csv && cargo publish) (cd arrow-json && cargo publish) From d11da24ad43c27e28d0b2340a810e4733c2162cf Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Fri, 9 Dec 2022 06:47:09 -0500 Subject: [PATCH 0395/1411] refactor: Merge similar functions `ilike_scalar` and `nilike_scalar` (#3303) * merge functions ilike_scalar and nilike_scalar * Use from_unary Co-authored-by: askoa Co-authored-by: Raphael Taylor-Davies --- arrow-string/src/like.rs | 169 ++++++--------------------------------- 1 file changed, 26 insertions(+), 143 deletions(-) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 11d79676d63c..c8a4d37cd7cc 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -18,7 +18,6 @@ use arrow_array::builder::BooleanBufferBuilder; use arrow_array::cast::*; use arrow_array::*; -use arrow_buffer::{bit_util, MutableBuffer}; use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayData; use arrow_schema::*; @@ -584,66 +583,44 @@ fn ilike_dict( } #[inline] -fn ilike_scalar<'a, L: ArrayAccessor>( +fn ilike_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor>( left: L, right: &str, + op: 
F, ) -> Result { - let null_bit_buffer = left.data().null_buffer().cloned(); - let bytes = bit_util::ceil(left.len(), 8); - let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); - let bool_slice = bool_buf.as_slice_mut(); - if !right.contains(is_like_pattern) { // fast path, can use equals let right_uppercase = right.to_uppercase(); - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).to_uppercase() == right_uppercase { - bit_util::set_bit(bool_slice, i); - } - } - } + + Ok(BooleanArray::from_unary(left, |item| { + op(item.to_uppercase() == right_uppercase) + })) } else if right.ends_with('%') && !right.ends_with("\\%") && !right[..right.len() - 1].contains(is_like_pattern) { // fast path, can use starts_with let start_str = &right[..right.len() - 1].to_uppercase(); - for i in 0..left.len() { - unsafe { - if left - .value_unchecked(i) - .to_uppercase() - .starts_with(start_str) - { - bit_util::set_bit(bool_slice, i); - } - } - } + Ok(BooleanArray::from_unary(left, |item| { + op(item.to_uppercase().starts_with(start_str)) + })) } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { // fast path, can use ends_with let ends_str = &right[1..].to_uppercase(); - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).to_uppercase().ends_with(ends_str) { - bit_util::set_bit(bool_slice, i); - } - } - } + Ok(BooleanArray::from_unary(left, |item| { + op(item.to_uppercase().ends_with(ends_str)) + })) } else if right.starts_with('%') && right.ends_with('%') + && !right.ends_with("\\%") && !right[1..right.len() - 1].contains(is_like_pattern) { // fast path, can use contains let contains = &right[1..right.len() - 1].to_uppercase(); - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).to_uppercase().contains(contains) { - bit_util::set_bit(bool_slice, i); - } - } - } + Ok(BooleanArray::from_unary(left, |item| { + op(item.to_uppercase().contains(contains)) + })) } else { let re_pattern = replace_like_wildcards(right)?; let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { @@ -653,26 +630,16 @@ fn ilike_scalar<'a, L: ArrayAccessor>( )) })?; - for i in 0..left.len() { - let haystack = unsafe { left.value_unchecked(i) }; - if re.is_match(haystack) { - bit_util::set_bit(bool_slice, i); - } - } - }; + Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item)))) + } +} - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) +#[inline] +fn ilike_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + ilike_scalar_op(left, right, |x| x) } /// Perform SQL `left ILIKE right` operation on [`StringArray`] / @@ -852,91 +819,7 @@ fn nilike_scalar<'a, L: ArrayAccessor>( left: L, right: &str, ) -> Result { - let null_bit_buffer = left.data().null_buffer().cloned(); - let bytes = bit_util::ceil(left.len(), 8); - let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); - let bool_slice = bool_buf.as_slice_mut(); - - if !right.contains(is_like_pattern) { - // fast path, can use equals - let right_uppercase = right.to_uppercase(); - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).to_uppercase() != right_uppercase { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let start_str = &right[..right.len() - 1].to_uppercase(); - 
for i in 0..left.len() { - unsafe { - if !(left - .value_unchecked(i) - .to_uppercase() - .starts_with(start_str)) - { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_str = &right[1..].to_uppercase(); - - for i in 0..left.len() { - unsafe { - if !(left.value_unchecked(i).to_uppercase().ends_with(ends_str)) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.starts_with('%') - && right.ends_with('%') - && !right[1..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use contains - let contains = &right[1..right.len() - 1].to_uppercase(); - for i in 0..left.len() { - unsafe { - if !(left.value_unchecked(i).to_uppercase().contains(contains)) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - })?; - - for i in 0..left.len() { - let haystack = unsafe { left.value_unchecked(i) }; - if !re.is_match(haystack) { - bit_util::set_bit(bool_slice, i); - } - } - }; - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + ilike_scalar_op(left, right, |x| !x) } /// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / From d18827d28e4149c81a7e3a3c86aae3fdedb87305 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Fri, 9 Dec 2022 07:41:01 -0700 Subject: [PATCH 0396/1411] FlightSQL Client & integration test (#3207) * squash * Undo nightly clippy advice * PR feedback * PR feedback * PR feedback * PR feedback * Formatting --- arrow-flight/Cargo.toml | 4 + arrow-flight/examples/flight_sql_server.rs | 229 ++++++++- arrow-flight/examples/server.rs | 18 +- arrow-flight/src/sql/client.rs | 531 +++++++++++++++++++++ arrow-flight/src/sql/mod.rs | 1 + arrow-flight/src/sql/server.rs | 4 +- arrow-flight/src/utils.rs | 52 +- 7 files changed, 804 insertions(+), 35 deletions(-) create mode 100644 arrow-flight/src/sql/client.rs diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 77881a70f708..35f70669ca0f 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -45,6 +45,10 @@ default = [] flight-sql-experimental = ["prost-types"] [dev-dependencies] +arrow = { version = "28.0.0", path = "../arrow", features = ["prettyprint"] } +tempfile = "3.3" +tokio-stream = { version = "0.1", features = ["net"] } +tower = "0.4.13" [build-dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index aa0d407113d7..29e6c2c37d68 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -15,13 +15,27 @@ // specific language governing permissions and limitations // under the License. 
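// A minimal sketch, assuming `use arrow_flight::sql::ProstAnyExt;` is in scope, of how a
// `do_get` handler could recover the prepared-statement handle that
// `get_flight_info_prepared_statement` below packs into the ticket as a `FetchResults`
// message (defined at the end of this example); the example's `do_get_fallback` ignores
// the message and always returns the fake batch.
fn handle_from_ticket(message: prost_types::Any) -> Result<String, tonic::Status> {
    // Unpack the Any-encoded ticket back into the concrete FetchResults message.
    let fetch: FetchResults = message
        .unpack()
        .map_err(|e| tonic::Status::internal(format!("{}", e)))?
        .ok_or_else(|| {
            tonic::Status::invalid_argument("ticket is not a FetchResults message")
        })?;
    Ok(fetch.handle)
}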
-use arrow_flight::sql::{ActionCreatePreparedStatementResult, SqlInfo}; -use arrow_flight::{Action, FlightData, HandshakeRequest, HandshakeResponse, Ticket}; -use futures::Stream; +use arrow_array::builder::StringBuilder; +use arrow_array::{ArrayRef, RecordBatch}; +use arrow_flight::sql::{ActionCreatePreparedStatementResult, ProstMessageExt, SqlInfo}; +use arrow_flight::{ + Action, FlightData, FlightEndpoint, HandshakeRequest, HandshakeResponse, IpcMessage, + Location, SchemaAsIpc, Ticket, +}; +use futures::{stream, Stream}; +use prost_types::Any; +use std::fs; use std::pin::Pin; -use tonic::transport::Server; +use std::sync::Arc; +use tempfile::NamedTempFile; +use tokio::net::{UnixListener, UnixStream}; +use tokio_stream::wrappers::UnixListenerStream; +use tonic::transport::{Endpoint, Server}; use tonic::{Request, Response, Status, Streaming}; +use arrow_flight::flight_descriptor::DescriptorType; +use arrow_flight::sql::client::FlightSqlServiceClient; +use arrow_flight::utils::batches_to_flight_data; use arrow_flight::{ flight_service_server::FlightService, flight_service_server::FlightServiceServer, @@ -36,10 +50,28 @@ use arrow_flight::{ }, FlightDescriptor, FlightInfo, }; +use arrow_ipc::writer::IpcWriteOptions; +use arrow_schema::{ArrowError, DataType, Field, Schema}; + +macro_rules! status { + ($desc:expr, $err:expr) => { + Status::internal(format!("{}: {} at {}:{}", $desc, $err, file!(), line!())) + }; +} #[derive(Clone)] pub struct FlightSqlServiceImpl {} +impl FlightSqlServiceImpl { + fn fake_result() -> Result { + let schema = Schema::new(vec![Field::new("salutation", DataType::Utf8, false)]); + let mut builder = StringBuilder::new(); + builder.append_value("Hello, FlightSQL!"); + let cols = vec![Arc::new(builder.finish()) as ArrayRef]; + RecordBatch::try_new(Arc::new(schema), cols) + } +} + #[tonic::async_trait] impl FlightSqlService for FlightSqlServiceImpl { type FlightService = FlightSqlServiceImpl; @@ -57,7 +89,7 @@ impl FlightSqlService for FlightSqlServiceImpl { .get("authorization") .ok_or(Status::invalid_argument("authorization field not present"))? .to_str() - .map_err(|_| Status::invalid_argument("authorization not parsable"))?; + .map_err(|e| status!("authorization not parsable", e))?; if !authorization.starts_with(basic) { Err(Status::invalid_argument(format!( "Auth type not implemented: {}", @@ -66,20 +98,20 @@ impl FlightSqlService for FlightSqlServiceImpl { } let base64 = &authorization[basic.len()..]; let bytes = base64::decode(base64) - .map_err(|_| Status::invalid_argument("authorization not parsable"))?; + .map_err(|e| status!("authorization not decodable", e))?; let str = String::from_utf8(bytes) - .map_err(|_| Status::invalid_argument("authorization not parsable"))?; + .map_err(|e| status!("authorization not parsable", e))?; let parts: Vec<_> = str.split(":").collect(); - if parts.len() != 2 { - Err(Status::invalid_argument(format!( - "Invalid authorization header" - )))?; - } - let user = parts[0]; - let pass = parts[1]; - if user != "admin" || pass != "password" { + let (user, pass) = match parts.as_slice() { + [user, pass] => (user, pass), + _ => Err(Status::invalid_argument( + "Invalid authorization header".to_string(), + ))?, + }; + if user != &"admin" || pass != &"password" { Err(Status::unauthenticated("Invalid credentials!"))? 
} + let result = HandshakeResponse { protocol_version: 0, payload: "random_uuid_token".as_bytes().to_vec(), @@ -89,7 +121,26 @@ impl FlightSqlService for FlightSqlServiceImpl { return Ok(Response::new(Box::pin(output))); } - // get_flight_info + async fn do_get_fallback( + &self, + _request: Request, + _message: prost_types::Any, + ) -> Result::DoGetStream>, Status> { + let batch = + Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; + let schema = (*batch.schema()).clone(); + let batches = vec![batch]; + let flight_data = batches_to_flight_data(schema, batches) + .map_err(|e| status!("Could not convert batches", e))? + .into_iter() + .map(Ok); + + let stream: Pin> + Send>> = + Box::pin(stream::iter(flight_data)); + let resp = Response::new(stream); + Ok(resp) + } + async fn get_flight_info_statement( &self, _query: CommandStatementQuery, @@ -102,12 +153,49 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn get_flight_info_prepared_statement( &self, - _query: CommandPreparedStatementQuery, + cmd: CommandPreparedStatementQuery, _request: Request, ) -> Result, Status> { - Err(Status::unimplemented( - "get_flight_info_prepared_statement not implemented", - )) + let handle = String::from_utf8(cmd.prepared_statement_handle) + .map_err(|e| status!("Unable to parse handle", e))?; + let batch = + Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; + let schema = (*batch.schema()).clone(); + let num_rows = batch.num_rows(); + let num_bytes = batch.get_array_memory_size(); + let loc = Location { + uri: "grpc+tcp://127.0.0.1".to_string(), + }; + let fetch = FetchResults { + handle: handle.to_string(), + }; + let buf = ::prost::Message::encode_to_vec(&fetch.as_any()); + let ticket = Ticket { ticket: buf }; + let endpoint = FlightEndpoint { + ticket: Some(ticket), + location: vec![loc], + }; + let endpoints = vec![endpoint]; + + let message = SchemaAsIpc::new(&schema, &IpcWriteOptions::default()) + .try_into() + .map_err(|e| status!("Unable to serialize schema", e))?; + let IpcMessage(schema_bytes) = message; + + let flight_desc = FlightDescriptor { + r#type: DescriptorType::Cmd.into(), + cmd: vec![], + path: vec![], + }; + let info = FlightInfo { + schema: schema_bytes, + flight_descriptor: Some(flight_desc), + endpoint: endpoints, + total_records: num_rows as i64, + total_bytes: num_bytes as i64, + }; + let resp = Response::new(info); + Ok(resp) } async fn get_flight_info_catalogs( @@ -328,20 +416,33 @@ impl FlightSqlService for FlightSqlServiceImpl { )) } - // do_action async fn do_action_create_prepared_statement( &self, _query: ActionCreatePreparedStatementRequest, _request: Request, ) -> Result { - Err(Status::unimplemented("Not yet implemented")) + let handle = "some_uuid"; + let schema = Self::fake_result() + .map_err(|e| status!("Error getting result schema", e))? 
+ .schema(); + let message = SchemaAsIpc::new(&schema, &IpcWriteOptions::default()) + .try_into() + .map_err(|e| status!("Unable to serialize schema", e))?; + let IpcMessage(schema_bytes) = message; + let res = ActionCreatePreparedStatementResult { + prepared_statement_handle: handle.as_bytes().to_vec(), + dataset_schema: schema_bytes, + parameter_schema: vec![], // TODO: parameters + }; + Ok(res) } + async fn do_action_close_prepared_statement( &self, _query: ActionClosePreparedStatementRequest, _request: Request, ) { - unimplemented!("Not yet implemented") + unimplemented!("Implement do_action_close_prepared_statement") } async fn register_sql_info(&self, _id: i32, _result: &SqlInfo) {} @@ -360,3 +461,85 @@ async fn main() -> Result<(), Box> { Ok(()) } + +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct FetchResults { + #[prost(string, tag = "1")] + pub handle: ::prost::alloc::string::String, +} + +impl ProstMessageExt for FetchResults { + fn type_url() -> &'static str { + "type.googleapis.com/arrow.flight.protocol.sql.FetchResults" + } + + fn as_any(&self) -> Any { + prost_types::Any { + type_url: FetchResults::type_url().to_string(), + value: ::prost::Message::encode_to_vec(self), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use futures::TryStreamExt; + + use arrow::util::pretty::pretty_format_batches; + use arrow_flight::utils::flight_data_to_batches; + use tower::service_fn; + + async fn client_with_uds(path: String) -> FlightSqlServiceClient { + let connector = service_fn(move |_| UnixStream::connect(path.clone())); + let channel = Endpoint::try_from("https://example.com") + .unwrap() + .connect_with_connector(connector) + .await + .unwrap(); + FlightSqlServiceClient::new(channel) + } + + #[tokio::test] + async fn test_select_1() { + let file = NamedTempFile::new().unwrap(); + let path = file.into_temp_path().to_str().unwrap().to_string(); + let _ = fs::remove_file(path.clone()); + + let uds = UnixListener::bind(path.clone()).unwrap(); + let stream = UnixListenerStream::new(uds); + + // We would just listen on TCP, but it seems impossible to know when tonic is ready to serve + let service = FlightSqlServiceImpl {}; + let serve_future = Server::builder() + .add_service(FlightServiceServer::new(service)) + .serve_with_incoming(stream); + + let request_future = async { + let mut client = client_with_uds(path).await; + let token = client.handshake("admin", "password").await.unwrap(); + println!("Auth succeeded with token: {:?}", token); + let mut stmt = client.prepare("select 1;".to_string()).await.unwrap(); + let flight_info = stmt.execute().await.unwrap(); + let ticket = flight_info.endpoint[0].ticket.as_ref().unwrap().clone(); + let flight_data = client.do_get(ticket).await.unwrap(); + let flight_data: Vec = flight_data.try_collect().await.unwrap(); + let batches = flight_data_to_batches(&flight_data).unwrap(); + let res = pretty_format_batches(batches.as_slice()).unwrap(); + let expected = r#" ++-------------------+ +| salutation | ++-------------------+ +| Hello, FlightSQL! | ++-------------------+"# + .trim() + .to_string(); + assert_eq!(res.to_string(), expected); + }; + + tokio::select! 
{ + _ = serve_future => panic!("server returned first"), + _ = request_future => println!("Client finished!"), + } + } +} diff --git a/arrow-flight/examples/server.rs b/arrow-flight/examples/server.rs index 75d05378710f..1d473103af8e 100644 --- a/arrow-flight/examples/server.rs +++ b/arrow-flight/examples/server.rs @@ -58,63 +58,63 @@ impl FlightService for FlightServiceImpl { &self, _request: Request>, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("Implement handshake")) } async fn list_flights( &self, _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("Implement list_flights")) } async fn get_flight_info( &self, _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("Implement get_flight_info")) } async fn get_schema( &self, _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("Implement get_schema")) } async fn do_get( &self, _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("Implement do_get")) } async fn do_put( &self, _request: Request>, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("Implement do_put")) } async fn do_action( &self, _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("Implement do_action")) } async fn list_actions( &self, _request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("Implement list_actions")) } async fn do_exchange( &self, _request: Request>, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + Err(Status::unimplemented("Implement do_exchange")) } } diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs new file mode 100644 index 000000000000..fa6691793a17 --- /dev/null +++ b/arrow-flight/src/sql/client.rs @@ -0,0 +1,531 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
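// A minimal usage sketch of the client defined below, assuming a FlightSQL server (such
// as the flight_sql_server example above) is listening on the hypothetical address
// 127.0.0.1:50051 and accepts the admin/password credentials; it mirrors the
// `test_select_1` integration test.
use arrow_flight::sql::client::FlightSqlServiceClient;
use arrow_flight::utils::flight_data_to_batches;
use arrow_flight::FlightData;
use arrow_schema::ArrowError;
use futures::TryStreamExt;

async fn run_select_one() -> Result<(), ArrowError> {
    // Connect over TCP and authenticate with HTTP basic credentials.
    let mut client = FlightSqlServiceClient::new_with_endpoint("127.0.0.1", 50051).await?;
    client.handshake("admin", "password").await?;

    // Prepare and execute a statement, then fetch the data behind the first endpoint.
    let mut stmt = client.prepare("SELECT 1;".to_string()).await?;
    let flight_info = stmt.execute().await?;
    let ticket = flight_info.endpoint[0]
        .ticket
        .clone()
        .expect("endpoint should carry a ticket");
    let flight_data: Vec<FlightData> = client
        .do_get(ticket)
        .await?
        .try_collect()
        .await
        .map_err(|e| ArrowError::IoError(format!("{:?}", e)))?;

    // Reassemble the wire-format FlightData into RecordBatches.
    let batches = flight_data_to_batches(&flight_data)?;
    println!("received {} record batch(es)", batches.len());
    Ok(())
}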
+ +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; + +use crate::flight_service_client::FlightServiceClient; +use crate::sql::server::{CLOSE_PREPARED_STATEMENT, CREATE_PREPARED_STATEMENT}; +use crate::sql::{ + ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, + ActionCreatePreparedStatementResult, CommandGetCatalogs, CommandGetCrossReference, + CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, + CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, + CommandPreparedStatementQuery, CommandStatementQuery, CommandStatementUpdate, + DoPutUpdateResult, ProstAnyExt, ProstMessageExt, SqlInfo, +}; +use crate::{ + Action, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, + HandshakeResponse, IpcMessage, Ticket, +}; +use arrow_array::RecordBatch; +use arrow_buffer::Buffer; +use arrow_ipc::convert::fb_to_schema; +use arrow_ipc::reader::read_record_batch; +use arrow_ipc::{root_as_message, MessageHeader}; +use arrow_schema::{ArrowError, Schema, SchemaRef}; +use futures::{stream, TryStreamExt}; +use prost::Message; +use tokio::sync::{Mutex, MutexGuard}; +use tonic::transport::{Channel, Endpoint}; +use tonic::Streaming; + +/// A FlightSQLServiceClient is an endpoint for retrieving or storing Arrow data +/// by FlightSQL protocol. +#[derive(Debug, Clone)] +pub struct FlightSqlServiceClient { + token: Option, + flight_client: Arc>>, +} + +/// A FlightSql protocol client that can run queries against FlightSql servers +/// This client is in the "experimental" stage. It is not guaranteed to follow the spec in all instances. +/// Github issues are welcomed. +impl FlightSqlServiceClient { + /// Creates a new FlightSql Client that connects via TCP to a server + pub async fn new_with_endpoint(host: &str, port: u16) -> Result { + let addr = format!("http://{}:{}", host, port); + let endpoint = Endpoint::new(addr) + .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? + .connect_timeout(Duration::from_secs(20)) + .timeout(Duration::from_secs(20)) + .tcp_nodelay(true) // Disable Nagle's Algorithm since we don't want packets to wait + .tcp_keepalive(Option::Some(Duration::from_secs(3600))) + .http2_keep_alive_interval(Duration::from_secs(300)) + .keep_alive_timeout(Duration::from_secs(20)) + .keep_alive_while_idle(true); + let channel = endpoint.connect().await.map_err(|e| { + ArrowError::IoError(format!("Cannot connect to endpoint: {}", e)) + })?; + Ok(Self::new(channel)) + } + + /// Creates a new FlightSql client that connects to a server over an arbitrary tonic `Channel` + pub fn new(channel: Channel) -> Self { + let flight_client = FlightServiceClient::new(channel); + FlightSqlServiceClient { + token: None, + flight_client: Arc::new(Mutex::new(flight_client)), + } + } + + fn mut_client( + &mut self, + ) -> Result>, ArrowError> { + self.flight_client + .try_lock() + .map_err(|_| ArrowError::IoError("Unable to lock client".to_string())) + } + + async fn get_flight_info_for_command( + &mut self, + cmd: M, + ) -> Result { + let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); + let fi = self + .mut_client()? + .get_flight_info(descriptor) + .await + .map_err(status_to_arrow_error)? + .into_inner(); + Ok(fi) + } + + /// Execute a query on the server. 
+ pub async fn execute(&mut self, query: String) -> Result { + let cmd = CommandStatementQuery { query }; + self.get_flight_info_for_command(cmd).await + } + + /// Perform a `handshake` with the server, passing credentials and establishing a session + /// Returns arbitrary auth/handshake info binary blob + pub async fn handshake( + &mut self, + username: &str, + password: &str, + ) -> Result, ArrowError> { + let cmd = HandshakeRequest { + protocol_version: 0, + payload: vec![], + }; + let mut req = tonic::Request::new(stream::iter(vec![cmd])); + let val = base64::encode(format!("{}:{}", username, password)); + let val = format!("Basic {}", val) + .parse() + .map_err(|_| ArrowError::ParseError("Cannot parse header".to_string()))?; + req.metadata_mut().insert("authorization", val); + let resp = self + .mut_client()? + .handshake(req) + .await + .map_err(|e| ArrowError::IoError(format!("Can't handshake {}", e)))?; + if let Some(auth) = resp.metadata().get("authorization") { + let auth = auth.to_str().map_err(|_| { + ArrowError::ParseError("Can't read auth header".to_string()) + })?; + let bearer = "Bearer "; + if !auth.starts_with(bearer) { + Err(ArrowError::ParseError("Invalid auth header!".to_string()))?; + } + let auth = auth[bearer.len()..].to_string(); + self.token = Some(auth); + } + let responses: Vec = + resp.into_inner().try_collect().await.map_err(|_| { + ArrowError::ParseError("Can't collect responses".to_string()) + })?; + let resp = match responses.as_slice() { + [resp] => resp, + [] => Err(ArrowError::ParseError("No handshake response".to_string()))?, + _ => Err(ArrowError::ParseError( + "Multiple handshake responses".to_string(), + ))?, + }; + Ok(resp.payload.clone()) + } + + /// Execute a update query on the server, and return the number of records affected + pub async fn execute_update(&mut self, query: String) -> Result { + let cmd = CommandStatementUpdate { query }; + let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); + let mut result = self + .mut_client()? + .do_put(stream::iter(vec![FlightData { + flight_descriptor: Some(descriptor), + ..Default::default() + }])) + .await + .map_err(status_to_arrow_error)? + .into_inner(); + let result = result + .message() + .await + .map_err(status_to_arrow_error)? + .unwrap(); + let any: prost_types::Any = prost::Message::decode(&*result.app_metadata) + .map_err(decode_error_to_arrow_error)?; + let result: DoPutUpdateResult = any.unpack()?.unwrap(); + Ok(result.record_count) + } + + /// Request a list of catalogs as tabular FlightInfo results + pub async fn get_catalogs(&mut self) -> Result { + self.get_flight_info_for_command(CommandGetCatalogs {}) + .await + } + + /// Request a list of database schemas as tabular FlightInfo results + pub async fn get_db_schemas( + &mut self, + request: CommandGetDbSchemas, + ) -> Result { + self.get_flight_info_for_command(request).await + } + + /// Given a flight ticket, request to be sent the stream. Returns record batch stream reader + pub async fn do_get( + &mut self, + ticket: Ticket, + ) -> Result, ArrowError> { + Ok(self + .mut_client()? + .do_get(ticket) + .await + .map_err(status_to_arrow_error)? + .into_inner()) + } + + /// Request a list of tables. + pub async fn get_tables( + &mut self, + request: CommandGetTables, + ) -> Result { + self.get_flight_info_for_command(request).await + } + + /// Request the primary keys for a table. 
+ pub async fn get_primary_keys( + &mut self, + request: CommandGetPrimaryKeys, + ) -> Result { + self.get_flight_info_for_command(request).await + } + + /// Retrieves a description about the foreign key columns that reference the + /// primary key columns of the given table. + pub async fn get_exported_keys( + &mut self, + request: CommandGetExportedKeys, + ) -> Result { + self.get_flight_info_for_command(request).await + } + + /// Retrieves the foreign key columns for the given table. + pub async fn get_imported_keys( + &mut self, + request: CommandGetImportedKeys, + ) -> Result { + self.get_flight_info_for_command(request).await + } + + /// Retrieves a description of the foreign key columns in the given foreign key + /// table that reference the primary key or the columns representing a unique + /// constraint of the parent table (could be the same or a different table). + pub async fn get_cross_reference( + &mut self, + request: CommandGetCrossReference, + ) -> Result { + self.get_flight_info_for_command(request).await + } + + /// Request a list of table types. + pub async fn get_table_types(&mut self) -> Result { + self.get_flight_info_for_command(CommandGetTableTypes {}) + .await + } + + /// Request a list of SQL information. + pub async fn get_sql_info( + &mut self, + sql_infos: Vec, + ) -> Result { + let request = CommandGetSqlInfo { + info: sql_infos.iter().map(|sql_info| *sql_info as u32).collect(), + }; + self.get_flight_info_for_command(request).await + } + + /// Create a prepared statement object. + pub async fn prepare( + &mut self, + query: String, + ) -> Result, ArrowError> { + let cmd = ActionCreatePreparedStatementRequest { query }; + let action = Action { + r#type: CREATE_PREPARED_STATEMENT.to_string(), + body: cmd.as_any().encode_to_vec(), + }; + let mut req = tonic::Request::new(action); + if let Some(token) = &self.token { + let val = format!("Bearer {}", token).parse().map_err(|_| { + ArrowError::IoError("Statement already closed.".to_string()) + })?; + req.metadata_mut().insert("authorization", val); + } + let mut result = self + .mut_client()? + .do_action(req) + .await + .map_err(status_to_arrow_error)? + .into_inner(); + let result = result + .message() + .await + .map_err(status_to_arrow_error)? + .unwrap(); + let any: prost_types::Any = + prost::Message::decode(&*result.body).map_err(decode_error_to_arrow_error)?; + let prepared_result: ActionCreatePreparedStatementResult = any.unpack()?.unwrap(); + let dataset_schema = match prepared_result.dataset_schema.len() { + 0 => Schema::empty(), + _ => Schema::try_from(IpcMessage(prepared_result.dataset_schema))?, + }; + let parameter_schema = match prepared_result.parameter_schema.len() { + 0 => Schema::empty(), + _ => Schema::try_from(IpcMessage(prepared_result.parameter_schema))?, + }; + Ok(PreparedStatement::new( + self.flight_client.clone(), + prepared_result.prepared_statement_handle, + dataset_schema, + parameter_schema, + )) + } + + /// Explicitly shut down and clean up the client. 
+ pub async fn close(&mut self) -> Result<(), ArrowError> { + Ok(()) + } +} + +/// A PreparedStatement +#[derive(Debug, Clone)] +pub struct PreparedStatement { + flight_client: Arc>>, + parameter_binding: Option, + handle: Vec, + dataset_schema: Schema, + parameter_schema: Schema, +} + +impl PreparedStatement { + pub(crate) fn new( + client: Arc>>, + handle: Vec, + dataset_schema: Schema, + parameter_schema: Schema, + ) -> Self { + PreparedStatement { + flight_client: client, + parameter_binding: None, + handle, + dataset_schema, + parameter_schema, + } + } + + /// Executes the prepared statement query on the server. + pub async fn execute(&mut self) -> Result { + let cmd = CommandPreparedStatementQuery { + prepared_statement_handle: self.handle.clone(), + }; + let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); + let result = self + .mut_client()? + .get_flight_info(descriptor) + .await + .map_err(status_to_arrow_error)? + .into_inner(); + Ok(result) + } + + /// Executes the prepared statement update query on the server. + pub async fn execute_update(&mut self) -> Result { + let cmd = CommandPreparedStatementQuery { + prepared_statement_handle: self.handle.clone(), + }; + let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); + let mut result = self + .mut_client()? + .do_put(stream::iter(vec![FlightData { + flight_descriptor: Some(descriptor), + ..Default::default() + }])) + .await + .map_err(status_to_arrow_error)? + .into_inner(); + let result = result + .message() + .await + .map_err(status_to_arrow_error)? + .unwrap(); + let any: prost_types::Any = Message::decode(&*result.app_metadata) + .map_err(decode_error_to_arrow_error)?; + let result: DoPutUpdateResult = any.unpack()?.unwrap(); + Ok(result.record_count) + } + + /// Retrieve the parameter schema from the query. + pub fn parameter_schema(&self) -> Result<&Schema, ArrowError> { + Ok(&self.parameter_schema) + } + + /// Retrieve the ResultSet schema from the query. + pub fn dataset_schema(&self) -> Result<&Schema, ArrowError> { + Ok(&self.dataset_schema) + } + + /// Set a RecordBatch that contains the parameters that will be bind. + pub fn set_parameters( + &mut self, + parameter_binding: RecordBatch, + ) -> Result<(), ArrowError> { + self.parameter_binding = Some(parameter_binding); + Ok(()) + } + + /// Close the prepared statement, so that this PreparedStatement can not used + /// anymore and server can free up any resources. + pub async fn close(mut self) -> Result<(), ArrowError> { + let cmd = ActionClosePreparedStatementRequest { + prepared_statement_handle: self.handle.clone(), + }; + let action = Action { + r#type: CLOSE_PREPARED_STATEMENT.to_string(), + body: cmd.as_any().encode_to_vec(), + }; + let _ = self + .mut_client()? 
+ .do_action(action) + .await + .map_err(status_to_arrow_error)?; + Ok(()) + } + + fn mut_client( + &mut self, + ) -> Result>, ArrowError> { + self.flight_client + .try_lock() + .map_err(|_| ArrowError::IoError("Unable to lock client".to_string())) + } +} + +fn decode_error_to_arrow_error(err: prost::DecodeError) -> ArrowError { + ArrowError::IoError(err.to_string()) +} + +fn status_to_arrow_error(status: tonic::Status) -> ArrowError { + ArrowError::IoError(format!("{:?}", status)) +} + +// A polymorphic structure to natively represent different types of data contained in `FlightData` +pub enum ArrowFlightData { + RecordBatch(RecordBatch), + Schema(Schema), +} + +/// Extract `Schema` or `RecordBatch`es from the `FlightData` wire representation +pub fn arrow_data_from_flight_data( + flight_data: FlightData, + arrow_schema_ref: &SchemaRef, +) -> Result { + let ipc_message = root_as_message(&flight_data.data_header[..]).map_err(|err| { + ArrowError::ParseError(format!("Unable to get root as message: {:?}", err)) + })?; + + match ipc_message.header_type() { + MessageHeader::RecordBatch => { + let ipc_record_batch = + ipc_message.header_as_record_batch().ok_or_else(|| { + ArrowError::ComputeError( + "Unable to convert flight data header to a record batch" + .to_string(), + ) + })?; + + let dictionaries_by_field = HashMap::new(); + let record_batch = read_record_batch( + &Buffer::from(&flight_data.data_body), + ipc_record_batch, + arrow_schema_ref.clone(), + &dictionaries_by_field, + None, + &ipc_message.version(), + )?; + Ok(ArrowFlightData::RecordBatch(record_batch)) + } + MessageHeader::Schema => { + let ipc_schema = ipc_message.header_as_schema().ok_or_else(|| { + ArrowError::ComputeError( + "Unable to convert flight data header to a schema".to_string(), + ) + })?; + + let arrow_schema = fb_to_schema(ipc_schema); + Ok(ArrowFlightData::Schema(arrow_schema)) + } + MessageHeader::DictionaryBatch => { + let _ = ipc_message.header_as_dictionary_batch().ok_or_else(|| { + ArrowError::ComputeError( + "Unable to convert flight data header to a dictionary batch" + .to_string(), + ) + })?; + Err(ArrowError::NotYetImplemented( + "no idea on how to convert an ipc dictionary batch to an arrow type" + .to_string(), + )) + } + MessageHeader::Tensor => { + let _ = ipc_message.header_as_tensor().ok_or_else(|| { + ArrowError::ComputeError( + "Unable to convert flight data header to a tensor".to_string(), + ) + })?; + Err(ArrowError::NotYetImplemented( + "no idea on how to convert an ipc tensor to an arrow type".to_string(), + )) + } + MessageHeader::SparseTensor => { + let _ = ipc_message.header_as_sparse_tensor().ok_or_else(|| { + ArrowError::ComputeError( + "Unable to convert flight data header to a sparse tensor".to_string(), + ) + })?; + Err(ArrowError::NotYetImplemented( + "no idea on how to convert an ipc sparse tensor to an arrow type" + .to_string(), + )) + } + _ => Err(ArrowError::ComputeError(format!( + "Unable to convert message with header_type: '{:?}' to arrow data", + ipc_message.header_type() + ))), + } +} diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index a5d4c4c3436c..0ddc64c554d8 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -58,6 +58,7 @@ pub use gen::SupportedSqlGrammar; pub use gen::TicketStatementQuery; pub use gen::UpdateDeleteRules; +pub mod client; pub mod server; /// ProstMessageExt are useful utility methods for prost::Message types diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index 
d78474849af0..ec48d7cfed31 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -36,8 +36,8 @@ use super::{ TicketStatementQuery, }; -static CREATE_PREPARED_STATEMENT: &str = "CreatePreparedStatement"; -static CLOSE_PREPARED_STATEMENT: &str = "ClosePreparedStatement"; +pub(crate) static CREATE_PREPARED_STATEMENT: &str = "CreatePreparedStatement"; +pub(crate) static CLOSE_PREPARED_STATEMENT: &str = "ClosePreparedStatement"; /// Implements FlightSqlService to handle the flight sql protocol #[tonic::async_trait] diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 49f9c47db6d0..392d41c83ce8 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -19,10 +19,12 @@ use crate::{FlightData, IpcMessage, SchemaAsIpc, SchemaResult}; use std::collections::HashMap; +use std::sync::Arc; use arrow_array::{ArrayRef, RecordBatch}; use arrow_buffer::Buffer; -use arrow_ipc::{reader, writer, writer::IpcWriteOptions}; +use arrow_ipc::convert::fb_to_schema; +use arrow_ipc::{reader, root_as_message, writer, writer::IpcWriteOptions}; use arrow_schema::{ArrowError, Schema, SchemaRef}; /// Convert a `RecordBatch` to a vector of `FlightData` representing the bytes of the dictionaries @@ -44,6 +46,32 @@ pub fn flight_data_from_arrow_batch( (flight_dictionaries, flight_batch) } +/// Convert a slice of wire protocol `FlightData`s into a vector of `RecordBatch`es +pub fn flight_data_to_batches( + flight_data: &[FlightData], +) -> Result, ArrowError> { + let schema = flight_data.get(0).ok_or_else(|| { + ArrowError::CastError("Need at least one FlightData for schema".to_string()) + })?; + let message = root_as_message(&schema.data_header[..]) + .map_err(|_| ArrowError::CastError("Cannot get root as message".to_string()))?; + + let ipc_schema: arrow_ipc::Schema = message.header_as_schema().ok_or_else(|| { + ArrowError::CastError("Cannot get header as Schema".to_string()) + })?; + let schema = fb_to_schema(ipc_schema); + let schema = Arc::new(schema); + + let mut batches = vec![]; + let dictionaries_by_id = HashMap::new(); + for datum in flight_data[1..].iter() { + let batch = + flight_data_to_arrow_batch(datum, schema.clone(), &dictionaries_by_id)?; + batches.push(batch); + } + Ok(batches) +} + /// Convert `FlightData` (with supplied schema and dictionaries) to an arrow `RecordBatch`. 
pub fn flight_data_to_arrow_batch( data: &FlightData, @@ -111,3 +139,25 @@ pub fn ipc_message_from_arrow_schema( let IpcMessage(vals) = message; Ok(vals) } + +/// Convert `RecordBatch`es to wire protocol `FlightData`s +pub fn batches_to_flight_data( + schema: Schema, + batches: Vec, +) -> Result, ArrowError> { + let options = IpcWriteOptions::default(); + let schema_flight_data: FlightData = SchemaAsIpc::new(&schema, &options).into(); + let mut dictionaries = vec![]; + let mut flight_data = vec![]; + for batch in batches.iter() { + let (flight_dictionaries, flight_datum) = + flight_data_from_arrow_batch(batch, &options); + dictionaries.extend(flight_dictionaries); + flight_data.push(flight_datum); + } + let mut stream = vec![schema_flight_data]; + stream.extend(dictionaries.into_iter()); + stream.extend(flight_data.into_iter()); + let flight_data: Vec<_> = stream.into_iter().collect(); + Ok(flight_data) +} From a92804e2412854d18fe79a59116ac21ba14df5d8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 9 Dec 2022 14:21:52 -0500 Subject: [PATCH 0397/1411] Update versions to 29.0.0 and update CHANGELOG (#3315) * Update version to 29.0.0 * Update changelog * Update new crates to 29.0.0 * Update changelog * updates * Apply suggestions from code review Co-authored-by: Ian Alexander Joiner <14581281+iajoiner@users.noreply.github.com> --- CHANGELOG-old.md | 102 +++++++++++ CHANGELOG.md | 170 +++++++++---------- arrow-array/Cargo.toml | 8 +- arrow-buffer/Cargo.toml | 2 +- arrow-cast/Cargo.toml | 12 +- arrow-csv/Cargo.toml | 12 +- arrow-data/Cargo.toml | 6 +- arrow-flight/Cargo.toml | 12 +- arrow-flight/README.md | 2 +- arrow-integration-test/Cargo.toml | 6 +- arrow-integration-testing/Cargo.toml | 2 +- arrow-ipc/Cargo.toml | 12 +- arrow-json/Cargo.toml | 12 +- arrow-ord/Cargo.toml | 12 +- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow-schema/Cargo.toml | 2 +- arrow-select/Cargo.toml | 10 +- arrow-string/Cargo.toml | 10 +- arrow/Cargo.toml | 24 +-- arrow/README.md | 2 +- dev/release/README.md | 2 +- dev/release/file_release_pr.sh | 4 +- dev/release/update_change_log.sh | 4 +- parquet/Cargo.toml | 20 +-- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 27 files changed, 282 insertions(+), 184 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 5adb12a913a9..06bf7297f32e 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,108 @@ # Historical Changelog +## [28.0.0](https://github.com/apache/arrow-rs/tree/28.0.0) (2022-11-25) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/27.0.0...28.0.0) + +**Breaking changes:** + +- StructArray::columns return slice [\#3186](https://github.com/apache/arrow-rs/pull/3186) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Return slice from GenericByteArray::value\_data [\#3171](https://github.com/apache/arrow-rs/pull/3171) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support decimal negative scale [\#3152](https://github.com/apache/arrow-rs/pull/3152) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- refactor: convert `Field::metadata` to `HashMap` [\#3148](https://github.com/apache/arrow-rs/pull/3148) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([crepererum](https://github.com/crepererum)) +- Don't Skip Serializing Empty Metadata \(\#3082\) [\#3126](https://github.com/apache/arrow-rs/pull/3126) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Add Decimal128, Decimal256, Float16 to DataType::is\_numeric [\#3121](https://github.com/apache/arrow-rs/pull/3121) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Upgrade to thrift 0.17 and fix issues [\#3104](https://github.com/apache/arrow-rs/pull/3104) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) +- Fix prettyprint for Interval second fractions [\#3093](https://github.com/apache/arrow-rs/pull/3093) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Remove Option from `Field::metadata` [\#3091](https://github.com/apache/arrow-rs/pull/3091) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) + +**Implemented enhancements:** + +- Add iterator to RowSelection [\#3172](https://github.com/apache/arrow-rs/issues/3172) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- create an integration test set for parquet crate against pyspark for working with bloom filters [\#3167](https://github.com/apache/arrow-rs/issues/3167) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Row Format Size Tracking [\#3160](https://github.com/apache/arrow-rs/issues/3160) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add ArrayBuilder::finish\_cloned\(\) [\#3154](https://github.com/apache/arrow-rs/issues/3154) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Optimize memory usage of json reader [\#3150](https://github.com/apache/arrow-rs/issues/3150) +- Add `Field::size` and `DataType::size` [\#3147](https://github.com/apache/arrow-rs/issues/3147) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add like\_utf8\_scalar\_dyn kernel [\#3145](https://github.com/apache/arrow-rs/issues/3145) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- support comparison for decimal128 array with scalar in kernel [\#3140](https://github.com/apache/arrow-rs/issues/3140) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- audit and create a document for bloom filter configurations [\#3138](https://github.com/apache/arrow-rs/issues/3138) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Should be the rounding vs truncation when cast decimal to smaller scale [\#3137](https://github.com/apache/arrow-rs/issues/3137) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Upgrade chrono to 0.4.23 [\#3120](https://github.com/apache/arrow-rs/issues/3120) +- Implements more temporal kernels using time\_fraction\_dyn [\#3108](https://github.com/apache/arrow-rs/issues/3108) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Upgrade to thrift 0.17 [\#3105](https://github.com/apache/arrow-rs/issues/3105) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Be able to parse time formatted strings [\#3100](https://github.com/apache/arrow-rs/issues/3100) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- 
Improve "Fail to merge schema" error messages [\#3095](https://github.com/apache/arrow-rs/issues/3095) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Expose `SortingColumn` when reading and writing parquet metadata [\#3090](https://github.com/apache/arrow-rs/issues/3090) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Change Field::metadata to HashMap [\#3086](https://github.com/apache/arrow-rs/issues/3086) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support bloom filter reading and writing for parquet [\#3023](https://github.com/apache/arrow-rs/issues/3023) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- API to take back ownership of an ArrayRef [\#2901](https://github.com/apache/arrow-rs/issues/2901) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Specialized Interleave Kernel [\#2864](https://github.com/apache/arrow-rs/issues/2864) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- arithmatic overflow leads to segfault in `concat_batches` [\#3123](https://github.com/apache/arrow-rs/issues/3123) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Clippy failing on master : error: use of deprecated associated function chrono::NaiveDate::from\_ymd: use from\_ymd\_opt\(\) instead [\#3097](https://github.com/apache/arrow-rs/issues/3097) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Pretty print for interval types has wrong formatting [\#3092](https://github.com/apache/arrow-rs/issues/3092) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Field is not serializable with binary formats [\#3082](https://github.com/apache/arrow-rs/issues/3082) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Decimal Casts are Unchecked [\#2986](https://github.com/apache/arrow-rs/issues/2986) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Release Arrow `27.0.0` \(next release after `26.0.0`\) [\#3045](https://github.com/apache/arrow-rs/issues/3045) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Perf about ParquetRecordBatchStream vs ParquetRecordBatchReader [\#2916](https://github.com/apache/arrow-rs/issues/2916) + +**Merged pull requests:** + +- Improve regex related kernels by upto 85% [\#3192](https://github.com/apache/arrow-rs/pull/3192) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Derive clone for arrays [\#3184](https://github.com/apache/arrow-rs/pull/3184) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Row decode cleanups [\#3180](https://github.com/apache/arrow-rs/pull/3180) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update zstd requirement from 0.11.1 to 0.12.0 [\#3178](https://github.com/apache/arrow-rs/pull/3178) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Move decimal constants from `arrow-data` to `arrow-schema` crate [\#3177](https://github.com/apache/arrow-rs/pull/3177) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- bloom filter part V: add an integration with pytest against pyspark [\#3176](https://github.com/apache/arrow-rs/pull/3176) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) +- Bloom filter config tweaks \(\#3023\) [\#3175](https://github.com/apache/arrow-rs/pull/3175) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add RowParser [\#3174](https://github.com/apache/arrow-rs/pull/3174) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add `RowSelection::iter()`, `Into>` and example [\#3173](https://github.com/apache/arrow-rs/pull/3173) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Add read parquet examples [\#3170](https://github.com/apache/arrow-rs/pull/3170) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([xudong963](https://github.com/xudong963)) +- Faster BinaryArray to StringArray conversion \(~67%\) [\#3168](https://github.com/apache/arrow-rs/pull/3168) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove unnecessary downcasts in builders [\#3166](https://github.com/apache/arrow-rs/pull/3166) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- bloom filter part IV: adjust writer properties, bloom filter properties, and incorporate into column encoder [\#3165](https://github.com/apache/arrow-rs/pull/3165) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) +- Fix parquet decimal precision [\#3164](https://github.com/apache/arrow-rs/pull/3164) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([psvri](https://github.com/psvri)) +- Add Row size methods \(\#3160\) [\#3163](https://github.com/apache/arrow-rs/pull/3163) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Prevent precision=0 for decimal type [\#3162](https://github.com/apache/arrow-rs/pull/3162) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Remove unnecessary Buffer::from\_slice\_ref reference [\#3161](https://github.com/apache/arrow-rs/pull/3161) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add finish\_cloned to ArrayBuilder [\#3158](https://github.com/apache/arrow-rs/pull/3158) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Check overflow in MutableArrayData extend offsets \(\#3123\) [\#3157](https://github.com/apache/arrow-rs/pull/3157) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Extend Decimal256 as Primitive [\#3156](https://github.com/apache/arrow-rs/pull/3156) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Doc improvements [\#3155](https://github.com/apache/arrow-rs/pull/3155) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Add collect.rs example [\#3153](https://github.com/apache/arrow-rs/pull/3153) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Implement Neg for i256 
[\#3151](https://github.com/apache/arrow-rs/pull/3151) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: `{Field,DataType}::size` [\#3149](https://github.com/apache/arrow-rs/pull/3149) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Add like\_utf8\_scalar\_dyn kernel [\#3146](https://github.com/apache/arrow-rs/pull/3146) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- comparison op: decimal128 array with scalar [\#3141](https://github.com/apache/arrow-rs/pull/3141) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Cast: should get the round result for decimal to a decimal with smaller scale [\#3139](https://github.com/apache/arrow-rs/pull/3139) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Fix Panic on Reading Corrupt Parquet Schema \(\#2855\) [\#3130](https://github.com/apache/arrow-rs/pull/3130) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([psvri](https://github.com/psvri)) +- Clippy parquet fixes [\#3124](https://github.com/apache/arrow-rs/pull/3124) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Add GenericByteBuilder \(\#2969\) [\#3122](https://github.com/apache/arrow-rs/pull/3122) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- parquet bloom filter part III: add sbbf writer, remove `bloom` default feature, add reader properties [\#3119](https://github.com/apache/arrow-rs/pull/3119) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) +- Add downcast\_array \(\#2901\) [\#3117](https://github.com/apache/arrow-rs/pull/3117) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add COW conversion for Buffer and PrimitiveArray and unary\_mut [\#3115](https://github.com/apache/arrow-rs/pull/3115) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Include field name in merge error message [\#3113](https://github.com/apache/arrow-rs/pull/3113) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) +- Add PrimitiveArray::unary\_opt [\#3110](https://github.com/apache/arrow-rs/pull/3110) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implements more temporal kernels using time\_fraction\_dyn [\#3107](https://github.com/apache/arrow-rs/pull/3107) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- cast: support unsigned numeric type to decimal128 [\#3106](https://github.com/apache/arrow-rs/pull/3106) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Expose `SortingColumn` in parquet files [\#3103](https://github.com/apache/arrow-rs/pull/3103) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([askoa](https://github.com/askoa)) +- parquet bloom filter part II: read sbbf bitset from row group reader, update API, and add cli demo [\#3102](https://github.com/apache/arrow-rs/pull/3102) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] 
([Jimexist](https://github.com/Jimexist)) +- Parse Time32/Time64 from formatted string [\#3101](https://github.com/apache/arrow-rs/pull/3101) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Cleanup temporal \_internal functions [\#3099](https://github.com/apache/arrow-rs/pull/3099) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Improve schema mismatch error message [\#3098](https://github.com/apache/arrow-rs/pull/3098) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Fix clippy by avoiding deprecated functions in chrono [\#3096](https://github.com/apache/arrow-rs/pull/3096) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Minor: Add diagrams and documentation to row format [\#3094](https://github.com/apache/arrow-rs/pull/3094) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Minor: Use ArrowNativeTypeOp instead of total\_cmp directly [\#3087](https://github.com/apache/arrow-rs/pull/3087) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Check overflow while casting between decimal types [\#3076](https://github.com/apache/arrow-rs/pull/3076) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- add bloom filter implementation based on split block \(sbbf\) spec [\#3057](https://github.com/apache/arrow-rs/pull/3057) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) +- Add FixedSizeBinaryArray::try\_from\_sparse\_iter\_with\_size [\#3054](https://github.com/apache/arrow-rs/pull/3054) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([maxburke](https://github.com/maxburke)) ## [27.0.0](https://github.com/apache/arrow-rs/tree/27.0.0) (2022-11-11) [Full Changelog](https://github.com/apache/arrow-rs/compare/26.0.0...27.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index accec4491852..13891a58ccdb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,108 +19,104 @@ # Changelog -## [28.0.0](https://github.com/apache/arrow-rs/tree/28.0.0) (2022-11-25) +## [29.0.0](https://github.com/apache/arrow-rs/tree/29.0.0) (2022-12-09) -[Full Changelog](https://github.com/apache/arrow-rs/compare/27.0.0...28.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/28.0.0...29.0.0) **Breaking changes:** -- StructArray::columns return slice [\#3186](https://github.com/apache/arrow-rs/pull/3186) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Return slice from GenericByteArray::value\_data [\#3171](https://github.com/apache/arrow-rs/pull/3171) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support decimal negative scale [\#3152](https://github.com/apache/arrow-rs/pull/3152) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- refactor: convert `Field::metadata` to `HashMap` [\#3148](https://github.com/apache/arrow-rs/pull/3148) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) -- Don't Skip Serializing 
Empty Metadata \(\#3082\) [\#3126](https://github.com/apache/arrow-rs/pull/3126) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Add Decimal128, Decimal256, Float16 to DataType::is\_numeric [\#3121](https://github.com/apache/arrow-rs/pull/3121) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Upgrade to thrift 0.17 and fix issues [\#3104](https://github.com/apache/arrow-rs/pull/3104) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) -- Fix prettyprint for Interval second fractions [\#3093](https://github.com/apache/arrow-rs/pull/3093) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) -- Remove Option from `Field::metadata` [\#3091](https://github.com/apache/arrow-rs/pull/3091) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Minor: Allow `Field::new` and `Field::new_with_dict` to take existing `String` as well as `&str` [\#3288](https://github.com/apache/arrow-rs/pull/3288) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- update `&Option` to `Option<&T>` [\#3249](https://github.com/apache/arrow-rs/pull/3249) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) +- Hide `*_dict_scalar` kernels behind `*_dyn` kernels [\#3202](https://github.com/apache/arrow-rs/pull/3202) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) **Implemented enhancements:** -- Add iterator to RowSelection [\#3172](https://github.com/apache/arrow-rs/issues/3172) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- create an integration test set for parquet crate against pyspark for working with bloom filters [\#3167](https://github.com/apache/arrow-rs/issues/3167) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Row Format Size Tracking [\#3160](https://github.com/apache/arrow-rs/issues/3160) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add ArrayBuilder::finish\_cloned\(\) [\#3154](https://github.com/apache/arrow-rs/issues/3154) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Optimize memory usage of json reader [\#3150](https://github.com/apache/arrow-rs/issues/3150) -- Add `Field::size` and `DataType::size` [\#3147](https://github.com/apache/arrow-rs/issues/3147) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add like\_utf8\_scalar\_dyn kernel [\#3145](https://github.com/apache/arrow-rs/issues/3145) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- support comparison for decimal128 array with scalar in kernel [\#3140](https://github.com/apache/arrow-rs/issues/3140) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- audit and create a document for bloom filter configurations [\#3138](https://github.com/apache/arrow-rs/issues/3138) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Should be the rounding vs truncation when cast decimal to smaller scale [\#3137](https://github.com/apache/arrow-rs/issues/3137) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
-- Upgrade chrono to 0.4.23 [\#3120](https://github.com/apache/arrow-rs/issues/3120) -- Implements more temporal kernels using time\_fraction\_dyn [\#3108](https://github.com/apache/arrow-rs/issues/3108) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Upgrade to thrift 0.17 [\#3105](https://github.com/apache/arrow-rs/issues/3105) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Be able to parse time formatted strings [\#3100](https://github.com/apache/arrow-rs/issues/3100) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Improve "Fail to merge schema" error messages [\#3095](https://github.com/apache/arrow-rs/issues/3095) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Expose `SortingColumn` when reading and writing parquet metadata [\#3090](https://github.com/apache/arrow-rs/issues/3090) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Change Field::metadata to HashMap [\#3086](https://github.com/apache/arrow-rs/issues/3086) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support bloom filter reading and writing for parquet [\#3023](https://github.com/apache/arrow-rs/issues/3023) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- API to take back ownership of an ArrayRef [\#2901](https://github.com/apache/arrow-rs/issues/2901) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Specialized Interleave Kernel [\#2864](https://github.com/apache/arrow-rs/issues/2864) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support writing BloomFilter in arrow\_writer [\#3275](https://github.com/apache/arrow-rs/issues/3275) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support casting from unsigned numeric to Decimal256 [\#3272](https://github.com/apache/arrow-rs/issues/3272) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support casting from Decimal256 to float types [\#3266](https://github.com/apache/arrow-rs/issues/3266) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make arithmetic kernels supports DictionaryArray of DecimalType [\#3254](https://github.com/apache/arrow-rs/issues/3254) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Casting from Decimal256 to unsigned numeric [\#3239](https://github.com/apache/arrow-rs/issues/3239) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- precision is not considered when cast value to decimal [\#3223](https://github.com/apache/arrow-rs/issues/3223) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use RegexSet in arrow\_csv::infer\_field\_schema [\#3211](https://github.com/apache/arrow-rs/issues/3211) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement FlightSQL Client [\#3206](https://github.com/apache/arrow-rs/issues/3206) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Add binary\_mut and try\_binary\_mut [\#3143](https://github.com/apache/arrow-rs/issues/3143) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add try\_unary\_mut [\#3133](https://github.com/apache/arrow-rs/issues/3133) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- arithmatic overflow leads to segfault in `concat_batches` [\#3123](https://github.com/apache/arrow-rs/issues/3123) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Clippy failing on master : 
error: use of deprecated associated function chrono::NaiveDate::from\_ymd: use from\_ymd\_opt\(\) instead [\#3097](https://github.com/apache/arrow-rs/issues/3097) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Pretty print for interval types has wrong formatting [\#3092](https://github.com/apache/arrow-rs/issues/3092) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Field is not serializable with binary formats [\#3082](https://github.com/apache/arrow-rs/issues/3082) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Decimal Casts are Unchecked [\#2986](https://github.com/apache/arrow-rs/issues/2986) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Skip null buffer when importing FFI ArrowArray struct if no null buffer in the spec [\#3290](https://github.com/apache/arrow-rs/issues/3290) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- using ahash `compile-time-rng` kills reproducible builds [\#3271](https://github.com/apache/arrow-rs/issues/3271) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Decimal128 to Decimal256 Overflows [\#3265](https://github.com/apache/arrow-rs/issues/3265) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `nullif` panics on empty array [\#3261](https://github.com/apache/arrow-rs/issues/3261) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Some more inconsistency between can\_cast\_types and cast\_with\_options [\#3250](https://github.com/apache/arrow-rs/issues/3250) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Enable casting between Dictionary of DecimalArray and DecimalArray [\#3237](https://github.com/apache/arrow-rs/issues/3237) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- new\_null\_array Panics creating StructArray with non-nullable fields [\#3226](https://github.com/apache/arrow-rs/issues/3226) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- bool should cast from/to Float16Type as `can_cast_types` returns true [\#3221](https://github.com/apache/arrow-rs/issues/3221) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Utf8 and LargeUtf8 cannot cast from/to Float16 but can\_cast\_types returns true [\#3220](https://github.com/apache/arrow-rs/issues/3220) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Re-enable some tests in `arrow-cast` crate [\#3219](https://github.com/apache/arrow-rs/issues/3219) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Off-by-one buffer size error triggers Panic when constructing RecordBatch from IPC bytes \(should return an Error\) [\#3215](https://github.com/apache/arrow-rs/issues/3215) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow to and from pyarrow conversion results in changes in schema [\#3136](https://github.com/apache/arrow-rs/issues/3136) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -**Closed issues:** +**Documentation updates:** -- Release Arrow `27.0.0` \(next release after `26.0.0`\) [\#3045](https://github.com/apache/arrow-rs/issues/3045) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Perf about ParquetRecordBatchStream vs ParquetRecordBatchReader [\#2916](https://github.com/apache/arrow-rs/issues/2916) +- better document when we need `LargeUtf8` instead of `Utf8` 
[\#3228](https://github.com/apache/arrow-rs/issues/3228) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Improve regex related kernels by upto 85% [\#3192](https://github.com/apache/arrow-rs/pull/3192) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Derive clone for arrays [\#3184](https://github.com/apache/arrow-rs/pull/3184) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Row decode cleanups [\#3180](https://github.com/apache/arrow-rs/pull/3180) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update zstd requirement from 0.11.1 to 0.12.0 [\#3178](https://github.com/apache/arrow-rs/pull/3178) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Move decimal constants from `arrow-data` to `arrow-schema` crate [\#3177](https://github.com/apache/arrow-rs/pull/3177) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- bloom filter part V: add an integration with pytest against pyspark [\#3176](https://github.com/apache/arrow-rs/pull/3176) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) -- Bloom filter config tweaks \(\#3023\) [\#3175](https://github.com/apache/arrow-rs/pull/3175) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add RowParser [\#3174](https://github.com/apache/arrow-rs/pull/3174) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add `RowSelection::iter()`, `Into>` and example [\#3173](https://github.com/apache/arrow-rs/pull/3173) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Add read parquet examples [\#3170](https://github.com/apache/arrow-rs/pull/3170) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([xudong963](https://github.com/xudong963)) -- Faster BinaryArray to StringArray conversion \(~67%\) [\#3168](https://github.com/apache/arrow-rs/pull/3168) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Remove unnecessary downcasts in builders [\#3166](https://github.com/apache/arrow-rs/pull/3166) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- bloom filter part IV: adjust writer properties, bloom filter properties, and incorporate into column encoder [\#3165](https://github.com/apache/arrow-rs/pull/3165) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) -- Fix parquet decimal precision [\#3164](https://github.com/apache/arrow-rs/pull/3164) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([psvri](https://github.com/psvri)) -- Add Row size methods \(\#3160\) [\#3163](https://github.com/apache/arrow-rs/pull/3163) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Prevent precision=0 for decimal type [\#3162](https://github.com/apache/arrow-rs/pull/3162) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Remove unnecessary Buffer::from\_slice\_ref reference 
[\#3161](https://github.com/apache/arrow-rs/pull/3161) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add finish\_cloned to ArrayBuilder [\#3158](https://github.com/apache/arrow-rs/pull/3158) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Check overflow in MutableArrayData extend offsets \(\#3123\) [\#3157](https://github.com/apache/arrow-rs/pull/3157) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Extend Decimal256 as Primitive [\#3156](https://github.com/apache/arrow-rs/pull/3156) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Doc improvements [\#3155](https://github.com/apache/arrow-rs/pull/3155) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Add collect.rs example [\#3153](https://github.com/apache/arrow-rs/pull/3153) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Implement Neg for i256 [\#3151](https://github.com/apache/arrow-rs/pull/3151) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat: `{Field,DataType}::size` [\#3149](https://github.com/apache/arrow-rs/pull/3149) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) -- Add like\_utf8\_scalar\_dyn kernel [\#3146](https://github.com/apache/arrow-rs/pull/3146) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- comparison op: decimal128 array with scalar [\#3141](https://github.com/apache/arrow-rs/pull/3141) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- Cast: should get the round result for decimal to a decimal with smaller scale [\#3139](https://github.com/apache/arrow-rs/pull/3139) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- Fix Panic on Reading Corrupt Parquet Schema \(\#2855\) [\#3130](https://github.com/apache/arrow-rs/pull/3130) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([psvri](https://github.com/psvri)) -- Clippy parquet fixes [\#3124](https://github.com/apache/arrow-rs/pull/3124) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Add GenericByteBuilder \(\#2969\) [\#3122](https://github.com/apache/arrow-rs/pull/3122) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- parquet bloom filter part III: add sbbf writer, remove `bloom` default feature, add reader properties [\#3119](https://github.com/apache/arrow-rs/pull/3119) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) -- Add downcast\_array \(\#2901\) [\#3117](https://github.com/apache/arrow-rs/pull/3117) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add COW conversion for Buffer and PrimitiveArray and unary\_mut [\#3115](https://github.com/apache/arrow-rs/pull/3115) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Include field name in merge error message [\#3113](https://github.com/apache/arrow-rs/pull/3113) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) -- Add PrimitiveArray::unary\_opt [\#3110](https://github.com/apache/arrow-rs/pull/3110) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Implements more temporal kernels using time\_fraction\_dyn [\#3107](https://github.com/apache/arrow-rs/pull/3107) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- cast: support unsigned numeric type to decimal128 [\#3106](https://github.com/apache/arrow-rs/pull/3106) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- Expose `SortingColumn` in parquet files [\#3103](https://github.com/apache/arrow-rs/pull/3103) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([askoa](https://github.com/askoa)) -- parquet bloom filter part II: read sbbf bitset from row group reader, update API, and add cli demo [\#3102](https://github.com/apache/arrow-rs/pull/3102) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) -- Parse Time32/Time64 from formatted string [\#3101](https://github.com/apache/arrow-rs/pull/3101) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) -- Cleanup temporal \_internal functions [\#3099](https://github.com/apache/arrow-rs/pull/3099) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Improve schema mismatch error message [\#3098](https://github.com/apache/arrow-rs/pull/3098) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Fix clippy by avoiding deprecated functions in chrono [\#3096](https://github.com/apache/arrow-rs/pull/3096) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Minor: Add diagrams and documentation to row format [\#3094](https://github.com/apache/arrow-rs/pull/3094) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Minor: Use ArrowNativeTypeOp instead of total\_cmp directly [\#3087](https://github.com/apache/arrow-rs/pull/3087) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Check overflow while casting between decimal types [\#3076](https://github.com/apache/arrow-rs/pull/3076) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- add bloom filter implementation based on split block \(sbbf\) spec [\#3057](https://github.com/apache/arrow-rs/pull/3057) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) -- Add FixedSizeBinaryArray::try\_from\_sparse\_iter\_with\_size [\#3054](https://github.com/apache/arrow-rs/pull/3054) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([maxburke](https://github.com/maxburke)) +- refactor: Merge similar functions `ilike_scalar` and `nilike_scalar` [\#3303](https://github.com/apache/arrow-rs/pull/3303) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Split out arrow-ord \(\#2594\) [\#3299](https://github.com/apache/arrow-rs/pull/3299) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-string 
\(\#2594\) [\#3295](https://github.com/apache/arrow-rs/pull/3295) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Skip null buffer when importing FFI ArrowArray struct if no null buffer in the spec [\#3293](https://github.com/apache/arrow-rs/pull/3293) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Don't use dangling NonNull as sentinel [\#3289](https://github.com/apache/arrow-rs/pull/3289) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Set bloom filter on byte array [\#3284](https://github.com/apache/arrow-rs/pull/3284) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Fix ipc schema custom\_metadata serialization [\#3282](https://github.com/apache/arrow-rs/pull/3282) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Disable const-random ahash feature on non-WASM \(\#3271\) [\#3277](https://github.com/apache/arrow-rs/pull/3277) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- fix\(ffi\): handle null data buffers from empty arrays [\#3276](https://github.com/apache/arrow-rs/pull/3276) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Support casting from unsigned numeric to Decimal256 [\#3273](https://github.com/apache/arrow-rs/pull/3273) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add parquet-layout binary [\#3269](https://github.com/apache/arrow-rs/pull/3269) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Support casting from Decimal256 to float types [\#3267](https://github.com/apache/arrow-rs/pull/3267) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Simplify decimal cast logic [\#3264](https://github.com/apache/arrow-rs/pull/3264) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix panic on nullif empty array \(\#3261\) [\#3263](https://github.com/apache/arrow-rs/pull/3263) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add BooleanArray::from\_unary and BooleanArray::from\_binary [\#3258](https://github.com/apache/arrow-rs/pull/3258) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: Remove parquet build script [\#3257](https://github.com/apache/arrow-rs/pull/3257) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Make arithmetic kernels supports DictionaryArray of DecimalType [\#3255](https://github.com/apache/arrow-rs/pull/3255) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support List and LargeList in Row format \(\#3159\) [\#3251](https://github.com/apache/arrow-rs/pull/3251) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Don't recurse to children in ArrayData::try\_new [\#3248](https://github.com/apache/arrow-rs/pull/3248) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Validate dictionaries read over IPC 
[\#3247](https://github.com/apache/arrow-rs/pull/3247) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix MapBuilder example [\#3246](https://github.com/apache/arrow-rs/pull/3246) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Loosen nullability restrictions added in \#3205 \(\#3226\) [\#3244](https://github.com/apache/arrow-rs/pull/3244) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Better document implications of offsets \(\#3228\) [\#3243](https://github.com/apache/arrow-rs/pull/3243) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add new API to validate the precision for decimal array [\#3242](https://github.com/apache/arrow-rs/pull/3242) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Move nullif to arrow-select \(\#2594\) [\#3241](https://github.com/apache/arrow-rs/pull/3241) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Casting from Decimal256 to unsigned numeric [\#3240](https://github.com/apache/arrow-rs/pull/3240) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Enable casting between Dictionary of DecimalArray and DecimalArray [\#3238](https://github.com/apache/arrow-rs/pull/3238) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Remove unwraps from 'create\_primitive\_array' [\#3232](https://github.com/apache/arrow-rs/pull/3232) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([aarashy](https://github.com/aarashy)) +- Fix CI build by upgrading tonic-build to 0.8.4 [\#3231](https://github.com/apache/arrow-rs/pull/3231) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) +- Remove negative scale check [\#3230](https://github.com/apache/arrow-rs/pull/3230) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update prost-build requirement from =0.11.2 to =0.11.3 [\#3225](https://github.com/apache/arrow-rs/pull/3225) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Get the round result for decimal to a decimal with smaller scale [\#3224](https://github.com/apache/arrow-rs/pull/3224) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Move tests which require chrono-tz feature from `arrow-cast` to `arrow` [\#3222](https://github.com/apache/arrow-rs/pull/3222) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- add test cases for extracing week with/without timezone [\#3218](https://github.com/apache/arrow-rs/pull/3218) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Use RegexSet for matching DataType [\#3217](https://github.com/apache/arrow-rs/pull/3217) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Update tonic-build to 0.8.3 [\#3214](https://github.com/apache/arrow-rs/pull/3214) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Support StructArray in Row Format 
\(\#3159\) [\#3212](https://github.com/apache/arrow-rs/pull/3212) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Infer timestamps from CSV files [\#3209](https://github.com/apache/arrow-rs/pull/3209) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- fix bug: cast decimal256 to other decimal with no-safe [\#3208](https://github.com/apache/arrow-rs/pull/3208) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- FlightSQL Client & integration test [\#3207](https://github.com/apache/arrow-rs/pull/3207) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Ensure StructArrays check nullability of fields [\#3205](https://github.com/apache/arrow-rs/pull/3205) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Remove special case ArrayData equality for decimals [\#3204](https://github.com/apache/arrow-rs/pull/3204) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add a cast test case for decimal negative scale [\#3203](https://github.com/apache/arrow-rs/pull/3203) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Move zip and shift kernels to arrow-select [\#3201](https://github.com/apache/arrow-rs/pull/3201) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Deprecate limit kernel [\#3200](https://github.com/apache/arrow-rs/pull/3200) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use SlicesIterator for ArrayData Equality [\#3198](https://github.com/apache/arrow-rs/pull/3198) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add \_dyn kernels of like, ilike, nlike, nilike kernels for dictionary support [\#3197](https://github.com/apache/arrow-rs/pull/3197) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Adding scalar nlike\_dyn, ilike\_dyn, nilike\_dyn kernels [\#3195](https://github.com/apache/arrow-rs/pull/3195) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Use self capture in DataType [\#3190](https://github.com/apache/arrow-rs/pull/3190) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- To pyarrow with schema [\#3188](https://github.com/apache/arrow-rs/pull/3188) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([doki23](https://github.com/doki23)) +- Support Duration in array\_value\_to\_string [\#3183](https://github.com/apache/arrow-rs/pull/3183) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Support `FixedSizeBinary` in Row format [\#3182](https://github.com/apache/arrow-rs/pull/3182) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add binary\_mut and try\_binary\_mut [\#3144](https://github.com/apache/arrow-rs/pull/3144) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add try\_unary\_mut [\#3134](https://github.com/apache/arrow-rs/pull/3134) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([viirya](https://github.com/viirya)) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 67f59a6dcd64..e8b2762b4f71 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-array" -version = "28.0.0" +version = "29.0.0" description = "Array abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,9 +45,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "28.0.0", path = "../arrow-schema" } -arrow-data = { version = "28.0.0", path = "../arrow-data" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +arrow-data = { version = "29.0.0", path = "../arrow-data" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index 1959721c9edc..99ecf2a85893 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "28.0.0" +version = "29.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index a5911a0a49e8..e54139d5a9b5 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-cast" -version = "28.0.0" +version = "29.0.0" description = "Cast kernel and utilities for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "28.0.0", path = "../arrow-array" } -arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } -arrow-data = { version = "28.0.0", path = "../arrow-data" } -arrow-schema = { version = "28.0.0", path = "../arrow-schema" } -arrow-select = { version = "28.0.0", path = "../arrow-select" } +arrow-array = { version = "29.0.0", path = "../arrow-array" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-data = { version = "29.0.0", path = "../arrow-data" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +arrow-select = { version = "29.0.0", path = "../arrow-select" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 8139e0bd11a8..0a8a0bec7b7d 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-csv" -version = "28.0.0" +version = "29.0.0" description = "Support for parsing CSV format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "28.0.0", path = 
"../arrow-array" } -arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "28.0.0", path = "../arrow-cast" } -arrow-data = { version = "28.0.0", path = "../arrow-data" } -arrow-schema = { version = "28.0.0", path = "../arrow-schema" } +arrow-array = { version = "29.0.0", path = "../arrow-array" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "29.0.0", path = "../arrow-cast" } +arrow-data = { version = "29.0.0", path = "../arrow-data" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } lazy_static = { version = "1.4", default-features = false } diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index 7b64ebefc8d6..abe7aa63990b 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-data" -version = "28.0.0" +version = "29.0.0" description = "Array data abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,8 +45,8 @@ force_validate = [] [dependencies] -arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "28.0.0", path = "../arrow-schema" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 35f70669ca0f..80ca172e82f2 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "28.0.0" +version = "29.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,10 +27,10 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow-array = { version = "28.0.0", path = "../arrow-array" } -arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } -arrow-ipc = { version = "28.0.0", path = "../arrow-ipc" } -arrow-schema = { version = "28.0.0", path = "../arrow-schema" } +arrow-array = { version = "29.0.0", path = "../arrow-array" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-ipc = { version = "29.0.0", path = "../arrow-ipc" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } base64 = { version = "0.13", default-features = false } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } @@ -45,7 +45,7 @@ default = [] flight-sql-experimental = ["prost-types"] [dev-dependencies] -arrow = { version = "28.0.0", path = "../arrow", features = ["prettyprint"] } +arrow = { version = "29.0.0", path = "../arrow", features = ["prettyprint"] } tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } tower = "0.4.13" diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 310df3f8af5f..4b2940e45c92 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "28.0.0" +arrow-flight = "29.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. 
See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index 56575a6e4916..720b47c2a5d3 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-integration-test" -version = "28.0.0" +version = "29.0.0" description = "Support for the Apache Arrow JSON test data format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,8 +38,8 @@ path = "src/lib.rs" bench = false [dependencies] -arrow = { version = "28.0.0", path = "../arrow", default-features = false } -arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } +arrow = { version = "29.0.0", path = "../arrow", default-features = false } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 35f857510258..fcbe96d73b4d 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests (NOT PUBLISHED TO crates.io)" -version = "28.0.0" +version = "29.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index 80cf1ee00eff..55cc467bbd44 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ipc" -version = "28.0.0" +version = "29.0.0" description = "Support for the Arrow IPC format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "28.0.0", path = "../arrow-array" } -arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "28.0.0", path = "../arrow-cast" } -arrow-data = { version = "28.0.0", path = "../arrow-data" } -arrow-schema = { version = "28.0.0", path = "../arrow-schema" } +arrow-array = { version = "29.0.0", path = "../arrow-array" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "29.0.0", path = "../arrow-cast" } +arrow-data = { version = "29.0.0", path = "../arrow-data" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } flatbuffers = { version = "22.9.2", default-features = false, features = ["thiserror"] } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.12.0", default-features = false, optional = true } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index b9eb7c5c4d67..0c6dea0df7cf 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-json" -version = "28.0.0" +version = "29.0.0" description = "Support for parsing JSON format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false 
[dependencies] -arrow-array = { version = "28.0.0", path = "../arrow-array" } -arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "28.0.0", path = "../arrow-cast" } -arrow-data = { version = "28.0.0", path = "../arrow-data" } -arrow-schema = { version = "28.0.0", path = "../arrow-schema" } +arrow-array = { version = "29.0.0", path = "../arrow-array" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "29.0.0", path = "../arrow-cast" } +arrow-data = { version = "29.0.0", path = "../arrow-data" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index 7a9e7ba4a9c5..10aab03a54e8 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ord" -version = "28.0.0" +version = "29.0.0" description = "Ordering kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "28.0.0", path = "../arrow-array" } -arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } -arrow-data = { version = "28.0.0", path = "../arrow-data" } -arrow-schema = { version = "28.0.0", path = "../arrow-schema" } -arrow-select = { version = "28.0.0", path = "../arrow-select" } +arrow-array = { version = "29.0.0", path = "../arrow-array" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-data = { version = "29.0.0", path = "../arrow-data" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +arrow-select = { version = "29.0.0", path = "../arrow-select" } num = { version = "0.4", default-features = false, features = ["std"] } [dev-dependencies] diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index aaa595916987..3d5a16bfa4d0 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "28.0.0" +version = "29.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "28.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "29.0.0", features = ["pyarrow"] } pyo3 = { version = "0.17", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 7eafb95ef4c8..cb03c97b7e06 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-schema" -version = "28.0.0" +version = "29.0.0" description = "Defines the logical types for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml index 36659f91731a..ec7a90fe6ce0 100644 --- a/arrow-select/Cargo.toml +++ b/arrow-select/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-select" -version = "28.0.0" +version = "29.0.0" description = 
"Selection kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } -arrow-data = { version = "28.0.0", path = "../arrow-data" } -arrow-schema = { version = "28.0.0", path = "../arrow-schema" } -arrow-array = { version = "28.0.0", path = "../arrow-array" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-data = { version = "29.0.0", path = "../arrow-data" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +arrow-array = { version = "29.0.0", path = "../arrow-array" } num = { version = "0.4", default-features = false, features = ["std"] } [features] diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index 97c4b5ffbf1c..fa32ab6dc43e 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-string" -version = "28.0.0" +version = "29.0.0" description = "String kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } -arrow-data = { version = "28.0.0", path = "../arrow-data" } -arrow-schema = { version = "28.0.0", path = "../arrow-schema" } -arrow-array = { version = "28.0.0", path = "../arrow-array" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-data = { version = "29.0.0", path = "../arrow-data" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +arrow-array = { version = "29.0.0", path = "../arrow-array" } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 86029cc1a3be..7f08d38e1ae0 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "28.0.0" +version = "29.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,17 +45,17 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "28.0.0", path = "../arrow-array" } -arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "28.0.0", path = "../arrow-cast" } -arrow-csv = { version = "28.0.0", path = "../arrow-csv", optional = true } -arrow-data = { version = "28.0.0", path = "../arrow-data" } -arrow-ipc = { version = "28.0.0", path = "../arrow-ipc", optional = true } -arrow-json = { version = "28.0.0", path = "../arrow-json", optional = true } -arrow-ord = { version = "28.0.0", path = "../arrow-ord" } -arrow-schema = { version = "28.0.0", path = "../arrow-schema" } -arrow-select = { version = "28.0.0", path = "../arrow-select" } -arrow-string = { version = "28.0.0", path = "../arrow-string" } +arrow-array = { version = "29.0.0", path = "../arrow-array" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "29.0.0", path = "../arrow-cast" } +arrow-csv = { version = "29.0.0", path = "../arrow-csv", optional = true } +arrow-data = { version = "29.0.0", 
path = "../arrow-data" } +arrow-ipc = { version = "29.0.0", path = "../arrow-ipc", optional = true } +arrow-json = { version = "29.0.0", path = "../arrow-json", optional = true } +arrow-ord = { version = "29.0.0", path = "../arrow-ord" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +arrow-select = { version = "29.0.0", path = "../arrow-select" } +arrow-string = { version = "29.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } diff --git a/arrow/README.md b/arrow/README.md index 71cdad76947f..5bcdf0cafce9 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `28.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `29.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. ## Feature Flags diff --git a/dev/release/README.md b/dev/release/README.md index ae36ef156e36..e1bbd24a5dca 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. -sed -i '' -e 's/14.0.0/28.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/29.0.0/g' `find . 
-name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md diff --git a/dev/release/file_release_pr.sh b/dev/release/file_release_pr.sh index 71c7547cd4cc..2db3d7986d3f 100644 --- a/dev/release/file_release_pr.sh +++ b/dev/release/file_release_pr.sh @@ -25,8 +25,8 @@ set -e -FUTURE_RELEASE="23.0.0" -ISSUE_NUMBER=2665 +FUTURE_RELEASE="29.0.0" +ISSUE_NUMBER=3216 TITLE="Update version to \`$FUTURE_RELEASE\` and update \`CHANGELOG\`" BODY="# Which issue does this PR close?\n\nCloses #$ISSUE_NUMBER.\n\n# Rationale for this change\nPrepare for biweekly release\n\n# What changes are included in this PR?\n\n# Are there any user-facing changes?\nYes" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 057f72c4161b..ef7034bbdde0 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="27.0.0" -FUTURE_RELEASE="28.0.0" +SINCE_TAG="28.0.0" +FUTURE_RELEASE="29.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index f55f36c8a85b..b13977eff3a4 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "28.0.0" +version = "29.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -35,14 +35,14 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "28.0.0", path = "../arrow-array", default-features = false, optional = true } -arrow-buffer = { version = "28.0.0", path = "../arrow-buffer", default-features = false, optional = true } -arrow-cast = { version = "28.0.0", path = "../arrow-cast", default-features = false, optional = true } -arrow-csv = { version = "28.0.0", path = "../arrow-csv", default-features = false, optional = true } -arrow-data = { version = "28.0.0", path = "../arrow-data", default-features = false, optional = true } -arrow-schema = { version = "28.0.0", path = "../arrow-schema", default-features = false, optional = true } -arrow-select = { version = "28.0.0", path = "../arrow-select", default-features = false, optional = true } -arrow-ipc = { version = "28.0.0", path = "../arrow-ipc", default-features = false, optional = true } +arrow-array = { version = "29.0.0", path = "../arrow-array", default-features = false, optional = true } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer", default-features = false, optional = true } +arrow-cast = { version = "29.0.0", path = "../arrow-cast", default-features = false, optional = true } +arrow-csv = { version = "29.0.0", path = "../arrow-csv", default-features = false, optional = true } +arrow-data = { version = "29.0.0", path = "../arrow-data", default-features = false, optional = true } +arrow-schema = { version = "29.0.0", path = "../arrow-schema", default-features = false, optional = true } +arrow-select = { version = "29.0.0", path = "../arrow-select", default-features = false, optional = true } +arrow-ipc = { version = "29.0.0", path = "../arrow-ipc", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } thrift = { version = "0.17", default-features = false } @@ 
-75,7 +75,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.12", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "28.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "29.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index 019122586e24..c704b3457769 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "28.0.0" +version = "29.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", features = ["extra-traits"] } -parquet = { path = "../parquet", version = "28.0.0", default-features = false } +parquet = { path = "../parquet", version = "29.0.0", default-features = false } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index 0e34e498b46c..e4debae0bda1 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "28.0.0" -parquet_derive = "28.0.0" +parquet = "29.0.0" +parquet_derive = "29.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index 6119ceb6cd58..1167f128f865 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "28.0.0" +version = "29.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "28.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "28.0.0", default-features = false } +parquet = { path = "../parquet", version = "29.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "29.0.0", default-features = false } chrono = { version="0.4.23", default-features = false, features = [ "clock" ] } From f078aede7a82c5373ebc08ede16307b0268dce89 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 9 Dec 2022 20:09:54 +0000 Subject: [PATCH 0398/1411] Use take for dictionary like comparisons (#3313) * Use take for like comparisons * Fix benchmark name * Format --- arrow-string/Cargo.toml | 1 + arrow-string/src/like.rs | 121 ++++------------------------ arrow/benches/comparison_kernels.rs | 24 +++++- 3 files changed, 39 insertions(+), 107 deletions(-) diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index fa32ab6dc43e..7dd4472f58c9 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -42,6 +42,7 @@ arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } 
arrow-data = { version = "29.0.0", path = "../arrow-data" } arrow-schema = { version = "29.0.0", path = "../arrow-schema" } arrow-array = { version = "29.0.0", path = "../arrow-array" } +arrow-select = { version = "29.0.0", path = "../arrow-select" } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index c8a4d37cd7cc..2e0356e73dbe 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -21,6 +21,7 @@ use arrow_array::*; use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayData; use arrow_schema::*; +use arrow_select::take::take; use regex::Regex; use std::collections::HashMap; @@ -214,7 +215,10 @@ pub fn like_utf8_scalar_dyn( DataType::Dictionary(_, _) => { downcast_dictionary_array!( left => { - like_dict_scalar(left, right) + let dict_comparison = like_utf8_scalar_dyn(left.values().as_ref(), right)?; + // TODO: Use take_boolean (#2967) + let array = take(&dict_comparison, left.keys(), None)?; + Ok(BooleanArray::from(array.data().clone())) } t => Err(ArrowError::ComputeError(format!( "Should be DictionaryArray but got: {}", t @@ -240,31 +244,6 @@ pub fn like_utf8_scalar( like_scalar(left, right) } -/// Perform SQL `left LIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -fn like_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - like_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - like_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "like_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - /// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: /// /// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.` @@ -431,7 +410,10 @@ pub fn nlike_utf8_scalar_dyn( DataType::Dictionary(_, _) => { downcast_dictionary_array!( left => { - nlike_dict_scalar(left, right) + let dict_comparison = nlike_utf8_scalar_dyn(left.values().as_ref(), right)?; + // TODO: Use take_boolean (#2967) + let array = take(&dict_comparison, left.keys(), None)?; + Ok(BooleanArray::from(array.data().clone())) } t => Err(ArrowError::ComputeError(format!( "Should be DictionaryArray but got: {}", t @@ -457,31 +439,6 @@ pub fn nlike_utf8_scalar( nlike_scalar(left, right) } -/// Perform SQL `left NOT LIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -fn nlike_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - nlike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - nlike_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "nlike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - /// Perform SQL `left ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`]. 
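The dictionary branches above no longer walk every row: they evaluate the scalar pattern once against the dictionary's distinct values and then expand that per-value result to per-row results with `take` over the keys, so the pattern match runs O(values) times instead of O(rows). A minimal standalone sketch of that idea, assuming the split `arrow-array` / `arrow-select` / `arrow-string` crates at the 29.0.0 API this patch targets:

```rust
use arrow_array::types::Int32Type;
use arrow_array::{Array, BooleanArray, DictionaryArray, StringArray};
use arrow_select::take::take;
use arrow_string::like::like_utf8_scalar;

fn main() {
    // A dictionary array with many rows but few distinct values.
    let dict: DictionaryArray<Int32Type> =
        vec!["apple", "banana", "apple", "cherry", "banana"]
            .into_iter()
            .collect();

    // 1. Evaluate the pattern once per distinct dictionary value.
    let values = dict.values().as_any().downcast_ref::<StringArray>().unwrap();
    let on_values = like_utf8_scalar(values, "%an%").unwrap();

    // 2. Expand the per-value result to per-row results with `take` on the keys.
    let expanded = take(&on_values, dict.keys(), None).unwrap();
    let result = expanded.as_any().downcast_ref::<BooleanArray>().unwrap();

    let got: Vec<_> = result.iter().collect();
    assert_eq!(
        got,
        vec![Some(false), Some(true), Some(false), Some(false), Some(true)]
    );
}
```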
/// @@ -663,7 +620,10 @@ pub fn ilike_utf8_scalar_dyn( DataType::Dictionary(_, _) => { downcast_dictionary_array!( left => { - ilike_dict_scalar(left, right) + let dict_comparison = ilike_utf8_scalar_dyn(left.values().as_ref(), right)?; + // TODO: Use take_boolean (#2967) + let array = take(&dict_comparison, left.keys(), None)?; + Ok(BooleanArray::from(array.data().clone())) } t => Err(ArrowError::ComputeError(format!( "Should be DictionaryArray but got: {}", t @@ -689,31 +649,6 @@ pub fn ilike_utf8_scalar( ilike_scalar(left, right) } -/// Perform SQL `left ILIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -fn ilike_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - ilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - ilike_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "ilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - /// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`]. /// @@ -843,7 +778,10 @@ pub fn nilike_utf8_scalar_dyn( DataType::Dictionary(_, _) => { downcast_dictionary_array!( left => { - nilike_dict_scalar(left, right) + let dict_comparison = nilike_utf8_scalar_dyn(left.values().as_ref(), right)?; + // TODO: Use take_boolean (#2967) + let array = take(&dict_comparison, left.keys(), None)?; + Ok(BooleanArray::from(array.data().clone())) } t => Err(ArrowError::ComputeError(format!( "Should be DictionaryArray but got: {}", t @@ -869,31 +807,6 @@ pub fn nilike_utf8_scalar( nilike_scalar(left, right) } -/// Perform SQL `left NOT ILIKE right` operation on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. 
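When none of the fast paths apply, the kernels above fall back to a regex compiled from the LIKE pattern, with `%` mapped to `.*` and `_` mapped to `.` as described in the `replace_like_wildcards` docs retained earlier in this diff. A simplified, hypothetical translation (it deliberately ignores the `\%` / `\_` escape handling the real function performs):

```rust
use regex::Regex;

/// Simplified sketch: turn a SQL LIKE pattern into an anchored regex.
/// `%` becomes `.*`, `_` becomes `.`, everything else is matched literally.
fn like_pattern_to_regex(pattern: &str) -> Result<Regex, regex::Error> {
    let mut re = String::with_capacity(pattern.len() + 4);
    re.push('^');
    for c in pattern.chars() {
        match c {
            '%' => re.push_str(".*"),
            '_' => re.push('.'),
            c => re.push_str(&regex::escape(&c.to_string())),
        }
    }
    re.push('$');
    Regex::new(&re)
}

fn main() {
    let re = like_pattern_to_regex("FF__SS%").unwrap();
    assert!(re.is_match("FFooSSdggs"));
    assert!(!re.is_match("FooS"));
}
```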
-fn nilike_dict_scalar( - left: &DictionaryArray, - right: &str, -) -> Result { - match left.value_type() { - DataType::Utf8 => { - let left = left.downcast_dict::>().unwrap(); - nilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.downcast_dict::>().unwrap(); - nilike_scalar(left, right) - } - _ => { - Err(ArrowError::ComputeError( - "nilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - fn is_like_pattern(c: char) -> bool { c == '%' || c == '_' } diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index 99229ed0b37b..7b3b935bcf3a 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -314,12 +314,30 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$")) }); - let dict_arr_a = create_string_dict_array::(size, 0.0, 4); - let dict_arr_b = create_string_dict_array::(size, 0.0, 4); + let strings = create_string_array::(20, 0.); + let dict_arr_a = create_dict_from_values::(size, 0., &strings); + let dict_arr_b = create_dict_from_values::(size, 0., &strings); - c.bench_function("dict eq string", |b| { + c.bench_function("eq dictionary[10] string[4])", |b| { b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b)) }); + + c.bench_function("eq_dyn_utf8_scalar dictionary[10] string[4])", |b| { + b.iter(|| eq_dyn_utf8_scalar(&dict_arr_a, "test")) + }); + + c.bench_function( + "gt_eq_dyn_utf8_scalar scalar dictionary[10] string[4])", + |b| b.iter(|| gt_eq_dyn_utf8_scalar(&dict_arr_a, "test")), + ); + + c.bench_function("like_utf8_scalar_dyn dictionary[10] string[4])", |b| { + b.iter(|| like_utf8_scalar_dyn(&dict_arr_a, "test")) + }); + + c.bench_function("ilike_utf8_scalar_dyn dictionary[10] string[4])", |b| { + b.iter(|| ilike_utf8_scalar_dyn(&dict_arr_a, "test")) + }); } criterion_group!(benches, add_benchmark); From c215f499e15ee7e4bcf62e35f6d6243291a3ee89 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 9 Dec 2022 21:27:23 +0000 Subject: [PATCH 0399/1411] Use BufWriter when writing bloom filters (#3318) (#3319) Disable bloom filters for most tests --- parquet/src/arrow/arrow_writer/mod.rs | 55 +++++++++++++++++++++++---- parquet/src/bloom_filter/mod.rs | 7 +++- parquet/src/file/properties.rs | 14 +++---- 3 files changed, 59 insertions(+), 17 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index a609b992a393..53ca71d28077 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1225,16 +1225,44 @@ mod tests { file } + struct RoundTripOptions { + values: ArrayRef, + schema: SchemaRef, + bloom_filter: bool, + } + + impl RoundTripOptions { + fn new(values: ArrayRef, nullable: bool) -> Self { + let data_type = values.data_type().clone(); + let schema = Schema::new(vec![Field::new("col", data_type, nullable)]); + Self { + values, + schema: Arc::new(schema), + bloom_filter: false, + } + } + } + fn one_column_roundtrip(values: ArrayRef, nullable: bool) -> Vec { - let data_type = values.data_type().clone(); - let schema = Schema::new(vec![Field::new("col", data_type, nullable)]); - one_column_roundtrip_with_schema(values, Arc::new(schema)) + one_column_roundtrip_with_options(RoundTripOptions::new(values, nullable)) } fn one_column_roundtrip_with_schema( values: ArrayRef, schema: SchemaRef, ) -> Vec { + let mut options = RoundTripOptions::new(values, 
false); + options.schema = schema; + one_column_roundtrip_with_options(options) + } + + fn one_column_roundtrip_with_options(options: RoundTripOptions) -> Vec { + let RoundTripOptions { + values, + schema, + bloom_filter, + } = options; + let encodings = match values.data_type() { DataType::Utf8 | DataType::LargeUtf8 @@ -1270,7 +1298,7 @@ mod tests { .set_dictionary_enabled(dictionary_size != 0) .set_dictionary_pagesize_limit(dictionary_size.max(1)) .set_encoding(*encoding) - .set_bloom_filter_enabled(true) + .set_bloom_filter_enabled(bloom_filter) .build(); files.push(roundtrip_opts(&expected_batch, props)) @@ -1596,8 +1624,11 @@ mod tests { #[test] fn i32_column_bloom_filter() { - let positive_values: Vec = (0..SMALL_SIZE as i32).collect(); - let files = values_required::(positive_values); + let array = Arc::new(Int32Array::from_iter(0..SMALL_SIZE as i32)); + let mut options = RoundTripOptions::new(array, false); + options.bloom_filter = true; + + let files = one_column_roundtrip_with_options(options); check_bloom_filter( files, "col".to_string(), @@ -1612,7 +1643,11 @@ mod tests { let many_vecs: Vec<_> = std::iter::repeat(one_vec).take(SMALL_SIZE).collect(); let many_vecs_iter = many_vecs.iter().map(|v| v.as_slice()); - let files = values_required::(many_vecs_iter); + let array = Arc::new(BinaryArray::from_iter_values(many_vecs_iter)); + let mut options = RoundTripOptions::new(array, false); + options.bloom_filter = true; + + let files = one_column_roundtrip_with_options(options); check_bloom_filter( files, "col".to_string(), @@ -1626,7 +1661,11 @@ mod tests { let raw_values: Vec<_> = (0..SMALL_SIZE).map(|i| i.to_string()).collect(); let raw_strs = raw_values.iter().map(|s| s.as_str()); - let files = values_optional::(raw_strs); + let array = Arc::new(StringArray::from_iter_values(raw_strs)); + let mut options = RoundTripOptions::new(array, false); + options.bloom_filter = true; + + let files = one_column_roundtrip_with_options(options); let optional_raw_values: Vec<_> = raw_values .iter() diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index 5bb89bf3f4d2..9334fbd7a05c 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -28,7 +28,7 @@ use crate::format::{ }; use bytes::{Buf, Bytes}; use std::hash::Hasher; -use std::io::Write; +use std::io::{BufWriter, Write}; use std::sync::Arc; use thrift::protocol::{ TCompactInputProtocol, TCompactOutputProtocol, TOutputProtocol, TSerializable, @@ -177,7 +177,9 @@ impl Sbbf { } /// Write the bloom filter data (header and then bitset) to the output - pub(crate) fn write(&self, mut writer: W) -> Result<(), ParquetError> { + pub(crate) fn write(&self, writer: W) -> Result<(), ParquetError> { + // Use a BufWriter to avoid costs of writing individual blocks + let mut writer = BufWriter::new(writer); let mut protocol = TCompactOutputProtocol::new(&mut writer); let header = self.header(); header.write_to_out_protocol(&mut protocol).map_err(|e| { @@ -185,6 +187,7 @@ impl Sbbf { })?; protocol.flush()?; self.write_bitset(&mut writer)?; + writer.flush()?; Ok(()) } diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index ae13eff201bd..7d20b736ea0c 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -630,7 +630,7 @@ struct ColumnProperties { statistics_enabled: Option, max_statistics_size: Option, /// bloom filter related properties - bloom_filter_properies: Option, + bloom_filter_properties: Option, } impl ColumnProperties { @@ -674,10 
+674,10 @@ impl ColumnProperties { /// otherwise it is a no-op. /// If `value` is `false`, resets bloom filter properties to `None`. fn set_bloom_filter_enabled(&mut self, value: bool) { - if value && self.bloom_filter_properies.is_none() { - self.bloom_filter_properies = Some(Default::default()) + if value && self.bloom_filter_properties.is_none() { + self.bloom_filter_properties = Some(Default::default()) } else if !value { - self.bloom_filter_properies = None + self.bloom_filter_properties = None } } @@ -694,7 +694,7 @@ impl ColumnProperties { value ); - self.bloom_filter_properies + self.bloom_filter_properties .get_or_insert_with(Default::default) .fpp = value; } @@ -702,7 +702,7 @@ impl ColumnProperties { /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly /// enables bloom filter if not previously enabled. fn set_bloom_filter_ndv(&mut self, value: u64) { - self.bloom_filter_properies + self.bloom_filter_properties .get_or_insert_with(Default::default) .ndv = value; } @@ -737,7 +737,7 @@ impl ColumnProperties { /// Returns the bloom filter properties, or `None` if not enabled fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> { - self.bloom_filter_properies.as_ref() + self.bloom_filter_properties.as_ref() } } From 4f6c5f004328e2ddb40e76587b8c217ce5ce3645 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 9 Dec 2022 16:41:42 -0500 Subject: [PATCH 0400/1411] Update changelog for late breaking additions (#3321) --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 13891a58ccdb..b346183712c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,6 +63,9 @@ **Merged pull requests:** +- Use BufWriter when writing bloom filters and limit tests \(\#3318\) [\#3319](https://github.com/apache/arrow-rs/pull/3319) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Use take for dictionary like comparisons [\#3313](https://github.com/apache/arrow-rs/pull/3313) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update versions to 29.0.0 and update CHANGELOG [\#3315](https://github.com/apache/arrow-rs/pull/3315) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) - refactor: Merge similar functions `ilike_scalar` and `nilike_scalar` [\#3303](https://github.com/apache/arrow-rs/pull/3303) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) - Split out arrow-ord \(\#2594\) [\#3299](https://github.com/apache/arrow-rs/pull/3299) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Split out arrow-string \(\#2594\) [\#3295](https://github.com/apache/arrow-rs/pull/3295) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) From 9e39f96b121d88b7427295bd326d14bb78d0fb39 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 9 Dec 2022 22:28:46 +0000 Subject: [PATCH 0401/1411] Add ASCII fast path for ILIKE scalar (90% faster) (#3306) * Add ASCII fast path for ILIKE scalar * Update starts_with and ends_with * Avoid allocations * Format * Add more tests * Add failing test * Verify is_ascii --- arrow-array/src/array/byte_array.rs 
| 8 ++ arrow-string/src/like.rs | 207 ++++++++++++++++++++-------- 2 files changed, 158 insertions(+), 57 deletions(-) diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index f846499eefbf..eb528384eace 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -70,6 +70,14 @@ impl GenericByteArray { self.data.buffers()[1].as_slice() } + /// Returns true if all data within this array is ASCII + pub fn is_ascii(&self) -> bool { + let offsets = self.value_offsets(); + let start = offsets.first().unwrap(); + let end = offsets.last().unwrap(); + self.value_data()[start.as_usize()..end.as_usize()].is_ascii() + } + /// Returns the offset values in the offsets buffer #[inline] pub fn value_offsets(&self) -> &[T::Offset] { diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 2e0356e73dbe..e359a80cb24b 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -442,7 +442,10 @@ pub fn nlike_utf8_scalar( /// Perform SQL `left ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`]. /// -/// See the documentation on [`like_utf8`] for more details. +/// Case insensitive version of [`like_utf8`] +/// +/// Note: this only implements loose matching as defined by the Unicode standard. For example, +/// the `ff` ligature is not equivalent to `FF` and `ß` is not equivalent to `SS` pub fn ilike_utf8( left: &GenericStringArray, right: &GenericStringArray, @@ -499,7 +502,7 @@ pub fn ilike_dyn( /// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values /// [`StringArray`]/[`LargeStringArray`]. /// -/// See the documentation on [`like_utf8`] for more details. +/// See the documentation on [`ilike_utf8`] for more details. #[cfg(feature = "dyn_cmp_dict")] fn ilike_dict( left: &DictionaryArray, @@ -540,60 +543,55 @@ fn ilike_dict( } #[inline] -fn ilike_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor>( - left: L, +fn ilike_scalar_op bool>( + left: &GenericStringArray, right: &str, op: F, ) -> Result { - if !right.contains(is_like_pattern) { - // fast path, can use equals - let right_uppercase = right.to_uppercase(); - - Ok(BooleanArray::from_unary(left, |item| { - op(item.to_uppercase() == right_uppercase) - })) - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let start_str = &right[..right.len() - 1].to_uppercase(); - Ok(BooleanArray::from_unary(left, |item| { - op(item.to_uppercase().starts_with(start_str)) - })) - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_str = &right[1..].to_uppercase(); + // If not ASCII faster to use case insensitive regex than using to_uppercase + if right.is_ascii() && left.is_ascii() { + if !right.contains(is_like_pattern) { + return Ok(BooleanArray::from_unary(left, |item| { + op(item.eq_ignore_ascii_case(right)) + })); + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) + { + // fast path, can use starts_with + let start_str = &right[..right.len() - 1]; + return Ok(BooleanArray::from_unary(left, |item| { + let end = item.len().min(start_str.len()); + let result = item.is_char_boundary(end) + && start_str.eq_ignore_ascii_case(&item[..end]); + op(result) + })); + } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { + // fast path, can use ends_with + let ends_str = 
&right[1..]; + return Ok(BooleanArray::from_unary(left, |item| { + let start = item.len().saturating_sub(ends_str.len()); + let result = item.is_char_boundary(start) + && ends_str.eq_ignore_ascii_case(&item[start..]); + op(result) + })); + } + } - Ok(BooleanArray::from_unary(left, |item| { - op(item.to_uppercase().ends_with(ends_str)) - })) - } else if right.starts_with('%') - && right.ends_with('%') - && !right.ends_with("\\%") - && !right[1..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use contains - let contains = &right[1..right.len() - 1].to_uppercase(); - Ok(BooleanArray::from_unary(left, |item| { - op(item.to_uppercase().contains(contains)) - })) - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - })?; + let re_pattern = replace_like_wildcards(right)?; + let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from ILIKE pattern: {}", + e + )) + })?; - Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item)))) - } + Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item)))) } #[inline] -fn ilike_scalar<'a, L: ArrayAccessor>( - left: L, +fn ilike_scalar( + left: &GenericStringArray, right: &str, ) -> Result { ilike_scalar_op(left, right, |x| x) @@ -603,7 +601,7 @@ fn ilike_scalar<'a, L: ArrayAccessor>( /// [`LargeStringArray`], or [`DictionaryArray`] with values /// [`StringArray`]/[`LargeStringArray`] and a scalar. /// -/// See the documentation on [`like_utf8`] for more details. +/// See the documentation on [`ilike_utf8`] for more details. pub fn ilike_utf8_scalar_dyn( left: &dyn Array, right: &str, @@ -641,7 +639,7 @@ pub fn ilike_utf8_scalar_dyn( /// Perform SQL `left ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. /// -/// See the documentation on [`like_utf8`] for more details. +/// See the documentation on [`ilike_utf8`] for more details. pub fn ilike_utf8_scalar( left: &GenericStringArray, right: &str, @@ -652,7 +650,7 @@ pub fn ilike_utf8_scalar( /// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`]. /// -/// See the documentation on [`like_utf8`] for more details. +/// See the documentation on [`ilike_utf8`] for more details. pub fn nilike_utf8( left: &GenericStringArray, right: &GenericStringArray, @@ -670,7 +668,7 @@ pub fn nilike_utf8( /// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values /// [`StringArray`]/[`LargeStringArray`]. /// -/// See the documentation on [`like_utf8`] for more details. +/// See the documentation on [`ilike_utf8`] for more details. pub fn nilike_dyn( left: &dyn Array, right: &dyn Array, @@ -709,7 +707,7 @@ pub fn nilike_dyn( /// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values /// [`StringArray`]/[`LargeStringArray`]. /// -/// See the documentation on [`like_utf8`] for more details. +/// See the documentation on [`ilike_utf8`] for more details. 
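The rewrite above adds an ASCII-only fast path: when both the pattern and the array values are ASCII, case-insensitive prefix and suffix checks can use `eq_ignore_ascii_case` on a byte-sliced window instead of allocating uppercased copies of every value. A freestanding sketch of just the prefix branch, with the same char-boundary guard the patch uses:

```rust
/// Sketch of the ASCII prefix branch above: compare the candidate prefix
/// ignoring ASCII case, and bail out rather than panic if the cut would
/// land inside a multi-byte character.
fn ascii_ilike_starts_with(item: &str, prefix: &str) -> bool {
    let end = item.len().min(prefix.len());
    item.is_char_boundary(end) && prefix.eq_ignore_ascii_case(&item[..end])
}

fn main() {
    assert!(ascii_ilike_starts_with("FFkoSSsdlkdf", "ffkoss"));
    assert!(!ascii_ilike_starts_with("FFkoS", "ffkoss"));
    // Shorter-than-prefix and non-ASCII inputs simply return false.
    assert!(!ascii_ilike_starts_with("ß", "ffkoss"));
}
```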
#[cfg(feature = "dyn_cmp_dict")] fn nilike_dict( left: &DictionaryArray, @@ -750,8 +748,8 @@ fn nilike_dict( } #[inline] -fn nilike_scalar<'a, L: ArrayAccessor>( - left: L, +fn nilike_scalar( + left: &GenericStringArray, right: &str, ) -> Result { ilike_scalar_op(left, right, |x| !x) @@ -761,7 +759,7 @@ fn nilike_scalar<'a, L: ArrayAccessor>( /// [`LargeStringArray`], or [`DictionaryArray`] with values /// [`StringArray`]/[`LargeStringArray`] and a scalar. /// -/// See the documentation on [`like_utf8`] for more details. +/// See the documentation on [`ilike_utf8`] for more details. pub fn nilike_utf8_scalar_dyn( left: &dyn Array, right: &str, @@ -799,7 +797,7 @@ pub fn nilike_utf8_scalar_dyn( /// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. /// -/// See the documentation on [`like_utf8`] for more details. +/// See the documentation on [`ilike_utf8`] for more details. pub fn nilike_utf8_scalar( left: &GenericStringArray, right: &str, @@ -1272,6 +1270,101 @@ mod tests { vec![true, false, false, false] ); + // We only implement loose matching + test_utf8_scalar!( + test_utf8_array_ilike_unicode, + test_utf8_array_ilike_unicode_dyn, + vec![ + "FFkoß", "FFkoSS", "FFkoss", "FFkoS", "FFkos", "ffkoSS", "ffkoß", "FFKoSS" + ], + "FFkoSS", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![false, true, true, false, false, false, false, true] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_unicode_starts, + test_utf8_array_ilike_unicode_start_dyn, + vec![ + "FFkoßsdlkdf", + "FFkoSSsdlkdf", + "FFkosssdlkdf", + "FFkoS", + "FFkos", + "ffkoSS", + "ffkoß", + "FfkosSsdfd", + "FFKoSS", + ], + "FFkoSS%", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![false, true, true, false, false, false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_unicode_ends, + test_utf8_array_ilike_unicode_ends_dyn, + vec![ + "sdlkdfFFkoß", + "sdlkdfFFkoSS", + "sdlkdfFFkoss", + "FFkoS", + "FFkos", + "ffkoSS", + "ffkoß", + "h😃klFfkosS", + "FFKoSS", + ], + "%FFkoSS", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![false, true, true, false, false, false, false, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_unicode_contains, + test_utf8_array_ilike_unicode_contains_dyn, + vec![ + "sdlkdfFkoßsdfs", + "sdlkdfFkoSSdggs", + "sdlkdfFkosssdsd", + "FkoS", + "Fkos", + "ffkoSS", + "ffkoß", + "😃sadlksffkosSsh😃klF", + "😱slgffkosSsh😃klF", + "FFKoSS", + ], + "%FFkoSS%", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![false, true, true, false, false, false, false, true, true, true] + ); + + test_utf8_scalar!( + test_utf8_array_ilike_unicode_complex, + test_utf8_array_ilike_unicode_complex_dyn, + vec![ + "sdlkdfFooßsdfs", + "sdlkdfFooSSdggs", + "sdlkdfFoosssdsd", + "FooS", + "Foos", + "ffooSS", + "ffooß", + "😃sadlksffofsSsh😃klF", + "😱slgffoesSsh😃klF", + "FFKoSS", + ], + "%FF__SS%", + ilike_utf8_scalar, + ilike_utf8_scalar_dyn, + vec![false, true, true, false, false, false, false, true, true, true] + ); + test_utf8_scalar!( test_utf8_array_ilike_scalar_one, test_utf8_array_ilike_scalar_dyn_one, From ad94368a722ca8d78a47f929a90775b669421691 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 10 Dec 2022 09:41:37 -0800 Subject: [PATCH 0402/1411] Add bloom filter benchmark (#3323) --- parquet/benches/arrow_writer.rs | 40 ++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index ddca1e53c6de..676debf5c00c 100644 --- 
a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -26,6 +26,7 @@ use std::sync::Arc; use arrow::datatypes::*; use arrow::{record_batch::RecordBatch, util::data_gen::*}; +use parquet::file::properties::WriterProperties; use parquet::{arrow::ArrowWriter, errors::Result}; fn create_primitive_bench_batch( @@ -294,9 +295,26 @@ fn _create_nested_bench_batch( #[inline] fn write_batch(batch: &RecordBatch) -> Result<()> { + write_batch_with_option(batch, None) +} + +#[inline] +fn write_batch_enable_bloom_filter(batch: &RecordBatch) -> Result<()> { + let option = WriterProperties::builder() + .set_bloom_filter_enabled(true) + .build(); + + write_batch_with_option(batch, Some(option)) +} + +#[inline] +fn write_batch_with_option( + batch: &RecordBatch, + props: Option, +) -> Result<()> { // Write batch to an in-memory writer let buffer = vec![]; - let mut writer = ArrowWriter::try_new(buffer, batch.schema(), None)?; + let mut writer = ArrowWriter::try_new(buffer, batch.schema(), props)?; writer.write(batch)?; writer.close()?; @@ -317,6 +335,10 @@ fn bench_primitive_writer(c: &mut Criterion) { b.iter(|| write_batch(&batch).unwrap()) }); + group.bench_function("4096 values primitive with bloom filter", |b| { + b.iter(|| write_batch_enable_bloom_filter(&batch).unwrap()) + }); + let batch = create_primitive_bench_batch_non_null(4096, 0.25, 0.75).unwrap(); group.throughput(Throughput::Bytes( batch @@ -329,6 +351,10 @@ fn bench_primitive_writer(c: &mut Criterion) { b.iter(|| write_batch(&batch).unwrap()) }); + group.bench_function("4096 values primitive non-null with bloom filter", |b| { + b.iter(|| write_batch_enable_bloom_filter(&batch).unwrap()) + }); + let batch = create_bool_bench_batch(4096, 0.25, 0.75).unwrap(); group.throughput(Throughput::Bytes( batch @@ -365,6 +391,10 @@ fn bench_primitive_writer(c: &mut Criterion) { b.iter(|| write_batch(&batch).unwrap()) }); + group.bench_function("4096 values string with bloom filter", |b| { + b.iter(|| write_batch_enable_bloom_filter(&batch).unwrap()) + }); + let batch = create_string_dictionary_bench_batch(4096, 0.25, 0.75).unwrap(); group.throughput(Throughput::Bytes( batch @@ -377,6 +407,10 @@ fn bench_primitive_writer(c: &mut Criterion) { b.iter(|| write_batch(&batch).unwrap()) }); + group.bench_function("4096 values string dictionary with bloom filter", |b| { + b.iter(|| write_batch_enable_bloom_filter(&batch).unwrap()) + }); + let batch = create_string_bench_batch_non_null(4096, 0.25, 0.75).unwrap(); group.throughput(Throughput::Bytes( batch @@ -389,6 +423,10 @@ fn bench_primitive_writer(c: &mut Criterion) { b.iter(|| write_batch(&batch).unwrap()) }); + group.bench_function("4096 values string non-null with bloom filter", |b| { + b.iter(|| write_batch_enable_bloom_filter(&batch).unwrap()) + }); + group.finish(); } From 225ea9327cc5b18d7a3b88a38362c8c5893a24b7 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Mon, 12 Dec 2022 16:35:03 +0100 Subject: [PATCH 0403/1411] feat(object_store): add PrefixObjectStore (#3329) * feat(object_store): add PrefixObjectStore * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * chore: PR comments * refactor: infallible full_path Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/lib.rs | 1 + object_store/src/prefix.rs | 281 +++++++++++++++++++++++++++++++++++++ 2 files changed, 282 insertions(+) create mode 100644 
object_store/src/prefix.rs diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index ec41f381228b..0cd56612ee45 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -170,6 +170,7 @@ pub mod limit; pub mod local; pub mod memory; pub mod path; +pub mod prefix; pub mod throttle; #[cfg(any(feature = "gcp", feature = "aws", feature = "azure"))] diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs new file mode 100644 index 000000000000..d61fc22271a2 --- /dev/null +++ b/object_store/src/prefix.rs @@ -0,0 +1,281 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An object store wrapper handling a constant path prefix +use bytes::Bytes; +use futures::{stream::BoxStream, StreamExt, TryStreamExt}; +use std::ops::Range; +use tokio::io::AsyncWrite; + +use crate::path::Path; +use crate::{ + GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, + Result as ObjectStoreResult, +}; + +/// Store wrapper that applies a constant prefix to all paths handled by the store. +#[derive(Debug, Clone)] +pub struct PrefixObjectStore { + prefix: Path, + inner: T, +} + +impl std::fmt::Display for PrefixObjectStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "PrefixObjectStore({})", self.prefix.as_ref()) + } +} + +impl PrefixObjectStore { + /// Create a new instance of [`PrefixObjectStore`] + pub fn new(store: T, prefix: impl Into) -> Self { + Self { + prefix: prefix.into(), + inner: store, + } + } + + /// Create the full path from a path relative to prefix + fn full_path(&self, location: &Path) -> Path { + self.prefix.parts().chain(location.parts()).collect() + } + + /// Strip the constant prefix from a given path + fn strip_prefix(&self, path: &Path) -> Option { + Some(path.prefix_match(&self.prefix)?.collect()) + } +} + +#[async_trait::async_trait] +impl ObjectStore for PrefixObjectStore { + /// Save the provided bytes to the specified location. + async fn put(&self, location: &Path, bytes: Bytes) -> ObjectStoreResult<()> { + let full_path = self.full_path(location); + self.inner.put(&full_path, bytes).await + } + + /// Return the bytes that are stored at the specified location. 
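`PrefixObjectStore` rewrites paths in both directions: `full_path` prepends the constant prefix before delegating to the inner store, and `strip_prefix` removes it again from paths the inner store reports back. A small sketch of just that path arithmetic, assuming the `object_store::path::Path` API used by the new module:

```rust
use object_store::path::Path;

fn main() {
    let prefix = Path::from("prefix");
    let location = Path::from("data/file.parquet");

    // full_path: prepend the constant prefix to the caller-supplied location.
    let full: Path = prefix.parts().chain(location.parts()).collect();
    assert_eq!(full.as_ref(), "prefix/data/file.parquet");

    // strip_prefix: map a store-internal path back into the wrapper's namespace.
    let stripped: Option<Path> = full.prefix_match(&prefix).map(|parts| parts.collect());
    assert_eq!(stripped.unwrap().as_ref(), "data/file.parquet");
}
```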
+ async fn get(&self, location: &Path) -> ObjectStoreResult { + let full_path = self.full_path(location); + self.inner.get(&full_path).await + } + + /// Return the bytes that are stored at the specified location + /// in the given byte range + async fn get_range( + &self, + location: &Path, + range: Range, + ) -> ObjectStoreResult { + let full_path = self.full_path(location); + self.inner.get_range(&full_path, range).await + } + + /// Return the metadata for the specified location + async fn head(&self, location: &Path) -> ObjectStoreResult { + let full_path = self.full_path(location); + self.inner.head(&full_path).await.map(|meta| ObjectMeta { + last_modified: meta.last_modified, + size: meta.size, + location: self.strip_prefix(&meta.location).unwrap_or(meta.location), + }) + } + + /// Delete the object at the specified location. + async fn delete(&self, location: &Path) -> ObjectStoreResult<()> { + let full_path = self.full_path(location); + self.inner.delete(&full_path).await + } + + /// List all the objects with the given prefix. + /// + /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of + /// `foo/bar_baz/x`. + async fn list( + &self, + prefix: Option<&Path>, + ) -> ObjectStoreResult>> { + Ok(self + .inner + .list(Some(&self.full_path(prefix.unwrap_or(&Path::from("/"))))) + .await? + .map_ok(|meta| ObjectMeta { + last_modified: meta.last_modified, + size: meta.size, + location: self.strip_prefix(&meta.location).unwrap_or(meta.location), + }) + .boxed()) + } + + /// List objects with the given prefix and an implementation specific + /// delimiter. Returns common prefixes (directories) in addition to object + /// metadata. + /// + /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of + /// `foo/bar_baz/x`. + async fn list_with_delimiter( + &self, + prefix: Option<&Path>, + ) -> ObjectStoreResult { + self.inner + .list_with_delimiter(Some( + &self.full_path(prefix.unwrap_or(&Path::from("/"))), + )) + .await + .map(|lst| ListResult { + common_prefixes: lst + .common_prefixes + .iter() + .filter_map(|p| self.strip_prefix(p)) + .collect(), + objects: lst + .objects + .iter() + .filter_map(|meta| { + Some(ObjectMeta { + last_modified: meta.last_modified, + size: meta.size, + location: self.strip_prefix(&meta.location)?, + }) + }) + .collect(), + }) + } + + /// Copy an object from one path to another in the same object store. + /// + /// If there exists an object at the destination, it will be overwritten. + async fn copy(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + let full_from = self.full_path(from); + let full_to = self.full_path(to); + self.inner.copy(&full_from, &full_to).await + } + + /// Copy an object from one path to another, only if destination is empty. + /// + /// Will return an error if the destination already has an object. + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + let full_from = self.full_path(from); + let full_to = self.full_path(to); + self.inner.copy_if_not_exists(&full_from, &full_to).await + } + + /// Move an object from one path to another in the same object store. + /// + /// Will return an error if the destination already has an object. 
+ async fn rename_if_not_exists( + &self, + from: &Path, + to: &Path, + ) -> ObjectStoreResult<()> { + let full_from = self.full_path(from); + let full_to = self.full_path(to); + self.inner.rename_if_not_exists(&full_from, &full_to).await + } + + async fn put_multipart( + &self, + location: &Path, + ) -> ObjectStoreResult<(MultipartId, Box)> { + let full_path = self.full_path(location); + self.inner.put_multipart(&full_path).await + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> ObjectStoreResult<()> { + let full_path = self.full_path(location); + self.inner.abort_multipart(&full_path, multipart_id).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::local::LocalFileSystem; + use crate::test_util::flatten_list_stream; + use crate::tests::{ + copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list, rename_and_copy, stream_get, + }; + + use tempfile::TempDir; + + #[tokio::test] + async fn prefix_test() { + let root = TempDir::new().unwrap(); + let inner = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + let integration = PrefixObjectStore::new(inner, "prefix"); + + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; + } + + #[tokio::test] + async fn prefix_test_applies_prefix() { + let tmpdir = TempDir::new().unwrap(); + let local = LocalFileSystem::new_with_prefix(tmpdir.path()).unwrap(); + + let location = Path::from("prefix/test_file.json"); + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + + local.put(&location, data).await.unwrap(); + + let prefix = PrefixObjectStore::new(local, "prefix"); + let location_prefix = Path::from("test_file.json"); + + let content_list = flatten_list_stream(&prefix, None).await.unwrap(); + assert_eq!(content_list, &[location_prefix.clone()]); + + let root = Path::from("/"); + let content_list = flatten_list_stream(&prefix, Some(&root)).await.unwrap(); + assert_eq!(content_list, &[location_prefix.clone()]); + + let read_data = prefix + .get(&location_prefix) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + + let target_prefix = Path::from("/test_written.json"); + prefix + .put(&target_prefix, expected_data.clone()) + .await + .unwrap(); + + prefix.delete(&location_prefix).await.unwrap(); + + let local = LocalFileSystem::new_with_prefix(tmpdir.path()).unwrap(); + + let err = local.get(&location).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); + + let location = Path::from("prefix/test_written.json"); + let read_data = local.get(&location).await.unwrap().bytes().await.unwrap(); + assert_eq!(&*read_data, expected_data) + } +} From 75ef138c2397a221311c626eada51fd96a7515c9 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 12 Dec 2022 08:21:48 -0800 Subject: [PATCH 0404/1411] Support UnionArray in ffi (#3305) * Make ffi support UnionArray * Move union to supported types --- .../tests/test_sql.py | 14 +- arrow/src/datatypes/ffi.rs | 56 ++++++ arrow/src/ffi.rs | 166 +++++++++++++++++- 3 files changed, 226 insertions(+), 10 deletions(-) diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 5a8bec792273..196dc7990309 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -87,10 +87,6 @@ def assert_pyarrow_leak(): ), pa.dictionary(pa.int8(), pa.string()), pa.map_(pa.string(), pa.int32()), -] - -_unsupported_pyarrow_types = [ - pa.decimal256(76, 38), pa.union( [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], mode=pa.lib.UnionMode_DENSE, @@ -113,6 +109,10 @@ def assert_pyarrow_leak(): ), ] +_unsupported_pyarrow_types = [ + pa.decimal256(76, 38), +] + @pytest.mark.parametrize("pyarrow_type", _supported_pyarrow_types, ids=str) def test_type_roundtrip(pyarrow_type): @@ -202,6 +202,9 @@ def test_empty_array_python(datatype): if datatype == pa.float16(): pytest.skip("Float 16 is not implemented in Rust") + if type(datatype) is pa.lib.DenseUnionType or type(datatype) is pa.lib.SparseUnionType: + pytest.skip("Union is not implemented in Python") + a = pa.array([], datatype) b = rust.round_trip_array(a) b.validate(full=True) @@ -216,6 +219,9 @@ def test_empty_array_rust(datatype): """ Rust -> Python """ + if type(datatype) is pa.lib.DenseUnionType or type(datatype) is pa.lib.SparseUnionType: + pytest.skip("Union is not implemented in Python") + a = pa.array([], type=datatype) b = rust.make_empty_array(datatype) b.validate(full=True) diff --git a/arrow/src/datatypes/ffi.rs b/arrow/src/datatypes/ffi.rs index 41addf24fbc2..58fc8858ad75 100644 --- a/arrow/src/datatypes/ffi.rs +++ b/arrow/src/datatypes/ffi.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use arrow_schema::UnionMode; use std::convert::TryFrom; use crate::datatypes::DataType::Map; @@ -134,6 +135,50 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { } } } + // DenseUnion + ["+ud", extra] => { + let type_ids = extra.split(',').map(|t| t.parse::().map_err(|_| { + ArrowError::CDataInterface( + "The Union type requires an integer type id".to_string(), + ) + })).collect::>>()?; + let mut fields = Vec::with_capacity(type_ids.len()); + for idx in 0..c_schema.n_children { + let c_child = c_schema.child(idx as usize); + let field = Field::try_from(c_child)?; + fields.push(field); + } + + if fields.len() != type_ids.len() { + return Err(ArrowError::CDataInterface( + "The Union type requires same number of fields and type ids".to_string(), + )); + } + + DataType::Union(fields, type_ids, UnionMode::Dense) + } + // SparseUnion + ["+us", extra] => { + let type_ids = extra.split(',').map(|t| t.parse::().map_err(|_| { + ArrowError::CDataInterface( + "The Union type requires an integer type id".to_string(), + ) + })).collect::>>()?; + let mut fields = Vec::with_capacity(type_ids.len()); + for idx in 0..c_schema.n_children { + let c_child = c_schema.child(idx as usize); + let field = Field::try_from(c_child)?; + fields.push(field); + } + + if fields.len() != type_ids.len() { + return Err(ArrowError::CDataInterface( + "The Union type requires same number of fields and type ids".to_string(), + )); + } + + DataType::Union(fields, type_ids, UnionMode::Sparse) + } // Timestamps in format "tts:" and "tts:America/New_York" for no timezones and timezones resp. ["tss", ""] => DataType::Timestamp(TimeUnit::Second, None), @@ -211,6 +256,10 @@ impl TryFrom<&DataType> for FFI_ArrowSchema { | DataType::Map(child, _) => { vec![FFI_ArrowSchema::try_from(child.as_ref())?] } + DataType::Union(fields, _, _) => fields + .iter() + .map(FFI_ArrowSchema::try_from) + .collect::>>()?, DataType::Struct(fields) => fields .iter() .map(FFI_ArrowSchema::try_from) @@ -279,6 +328,13 @@ fn get_format_string(dtype: &DataType) -> Result { DataType::Struct(_) => Ok("+s".to_string()), DataType::Map(_, _) => Ok("+m".to_string()), DataType::Dictionary(key_data_type, _) => get_format_string(key_data_type), + DataType::Union(_, type_ids, mode) => { + let formats = type_ids.iter().map(|t| t.to_string()).collect::>(); + match mode { + UnionMode::Dense => Ok(format!("{}:{}", "+ud", formats.join(","))), + UnionMode::Sparse => Ok(format!("{}:{}", "+us", formats.join(","))), + } + } other => Err(ArrowError::CDataInterface(format!( "The datatype \"{:?}\" is still not supported in Rust implementation", other diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index abb53dff68bd..5e9b01b5c6b0 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -120,6 +120,7 @@ use std::{ sync::Arc, }; +use arrow_schema::UnionMode; use bitflags::bitflags; use crate::array::{layout, ArrayData}; @@ -310,8 +311,6 @@ impl Drop for FFI_ArrowSchema { #[allow(clippy::manual_bits)] fn bit_width(data_type: &DataType, i: usize) -> Result { Ok(match (data_type, i) { - // the null buffer is bit sized - (_, 0) => 1, // primitive types first buffer's size is given by the native types (DataType::Boolean, 1) => 1, (DataType::UInt8, 1) => size_of::() * 8, @@ -385,6 +384,30 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { data_type, i ))) } + // type ids. UnionArray doesn't have null bitmap so buffer index begins with 0. 
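The conversions above encode union types using the C data interface format-string convention: `+ud:` for dense unions and `+us:` for sparse unions, followed by the comma-separated type ids, with one child schema per union field. A standalone sketch of building and parsing such a format string (a sketch of the convention only, not the crate's implementation):

```rust
/// Build the C data interface format string for a union: "+ud:" (dense) or
/// "+us:" (sparse) followed by the comma-separated type ids.
fn union_format_string(type_ids: &[i8], dense: bool) -> String {
    let ids: Vec<String> = type_ids.iter().map(|t| t.to_string()).collect();
    let prefix = if dense { "+ud" } else { "+us" };
    format!("{}:{}", prefix, ids.join(","))
}

/// Recover the type ids from such a format string; the union's fields travel
/// separately as one child schema each.
fn parse_union_type_ids(format: &str) -> Option<Vec<i8>> {
    let rest = format
        .strip_prefix("+ud:")
        .or_else(|| format.strip_prefix("+us:"))?;
    rest.split(',').map(|t| t.parse::<i8>().ok()).collect()
}

fn main() {
    assert_eq!(union_format_string(&[0, 1], true), "+ud:0,1");
    assert_eq!(parse_union_type_ids("+us:0,1"), Some(vec![0, 1]));
}
```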
+ (DataType::Union(_, _, _), 0) => size_of::() * 8, + // Only DenseUnion has 2nd buffer + (DataType::Union(_, _, UnionMode::Dense), 1) => size_of::() * 8, + (DataType::Union(_, _, UnionMode::Sparse), _) => { + return Err(ArrowError::CDataInterface(format!( + "The datatype \"{:?}\" expects 1 buffer, but requested {}. Please verify that the C data interface is correctly implemented.", + data_type, i + ))) + } + (DataType::Union(_, _, UnionMode::Dense), _) => { + return Err(ArrowError::CDataInterface(format!( + "The datatype \"{:?}\" expects 2 buffer, but requested {}. Please verify that the C data interface is correctly implemented.", + data_type, i + ))) + } + (_, 0) => { + // We don't call this `bit_width` to compute buffer length for null buffer. If any types that don't have null buffer like + // UnionArray, they should be handled above. + return Err(ArrowError::CDataInterface(format!( + "The datatype \"{:?}\" doesn't expect buffer at index 0. Please verify that the C data interface is correctly implemented.", + data_type + ))) + } _ => { return Err(ArrowError::CDataInterface(format!( "The datatype \"{:?}\" is still not supported in Rust implementation", @@ -661,7 +684,8 @@ pub trait ArrowArrayRef { }) } - /// returns all buffers, as organized by Rust (i.e. null buffer is skipped) + /// returns all buffers, as organized by Rust (i.e. null buffer is skipped if it's present + /// in the spec of the type) fn buffers(&self, can_contain_null_mask: bool) -> Result> { // + 1: skip null buffer let buffer_begin = can_contain_null_mask as i64; @@ -690,9 +714,9 @@ pub trait ArrowArrayRef { } /// Returns the length, in bytes, of the buffer `i` (indexed according to the C data interface) - // Rust implementation uses fixed-sized buffers, which require knowledge of their `len`. - // for variable-sized buffers, such as the second buffer of a stringArray, we need - // to fetch offset buffer's len to build the second buffer. + /// Rust implementation uses fixed-sized buffers, which require knowledge of their `len`. + /// for variable-sized buffers, such as the second buffer of a stringArray, we need + /// to fetch offset buffer's len to build the second buffer. fn buffer_len(&self, i: usize) -> Result { // Special handling for dictionary type as we only care about the key type in the case. let t = self.data_type()?; @@ -937,6 +961,9 @@ mod tests { }; use crate::compute::kernels; use crate::datatypes::{Field, Int8Type}; + use arrow_array::builder::UnionBuilder; + use arrow_array::types::{Float64Type, Int32Type}; + use arrow_array::{Float64Array, UnionArray}; use std::convert::TryFrom; #[test] @@ -1500,4 +1527,131 @@ mod tests { Ok(()) } + + #[test] + fn test_union_sparse_array() -> Result<()> { + let mut builder = UnionBuilder::new_sparse(); + builder.append::("a", 1).unwrap(); + builder.append_null::("a").unwrap(); + builder.append::("c", 3.0).unwrap(); + builder.append::("a", 4).unwrap(); + let union = builder.build().unwrap(); + + // export it + let array = ArrowArray::try_from(union.data().clone())?; + + // (simulate consumer) import it + let data = ArrayData::try_from(array)?; + let array = make_array(data); + + let array = array.as_any().downcast_ref::().unwrap(); + + let expected_type_ids = vec![0_i8, 0, 1, 0]; + + // Check type ids + assert_eq!( + Buffer::from_slice_ref(&expected_type_ids), + array.data().buffers()[0] + ); + for (i, id) in expected_type_ids.iter().enumerate() { + assert_eq!(id, &array.type_id(i)); + } + + // Check offsets, sparse union should only have a single buffer, i.e. 
no offsets + assert_eq!(array.data().buffers().len(), 1); + + for i in 0..array.len() { + let slot = array.value(i); + match i { + 0 => { + let slot = slot.as_any().downcast_ref::().unwrap(); + assert!(!slot.is_null(0)); + assert_eq!(slot.len(), 1); + let value = slot.value(0); + assert_eq!(1_i32, value); + } + 1 => assert!(slot.is_null(0)), + 2 => { + let slot = slot.as_any().downcast_ref::().unwrap(); + assert!(!slot.is_null(0)); + assert_eq!(slot.len(), 1); + let value = slot.value(0); + assert_eq!(value, 3_f64); + } + 3 => { + let slot = slot.as_any().downcast_ref::().unwrap(); + assert!(!slot.is_null(0)); + assert_eq!(slot.len(), 1); + let value = slot.value(0); + assert_eq!(4_i32, value); + } + _ => unreachable!(), + } + } + + Ok(()) + } + + #[test] + fn test_union_dense_array() -> Result<()> { + let mut builder = UnionBuilder::new_dense(); + builder.append::("a", 1).unwrap(); + builder.append_null::("a").unwrap(); + builder.append::("c", 3.0).unwrap(); + builder.append::("a", 4).unwrap(); + let union = builder.build().unwrap(); + + // export it + let array = ArrowArray::try_from(union.data().clone())?; + + // (simulate consumer) import it + let data = ArrayData::try_from(array)?; + let array = make_array(data); + + let array = array.as_any().downcast_ref::().unwrap(); + + let expected_type_ids = vec![0_i8, 0, 1, 0]; + + // Check type ids + assert_eq!( + Buffer::from_slice_ref(&expected_type_ids), + array.data().buffers()[0] + ); + for (i, id) in expected_type_ids.iter().enumerate() { + assert_eq!(id, &array.type_id(i)); + } + + assert_eq!(array.data().buffers().len(), 2); + + for i in 0..array.len() { + let slot = array.value(i); + match i { + 0 => { + let slot = slot.as_any().downcast_ref::().unwrap(); + assert!(!slot.is_null(0)); + assert_eq!(slot.len(), 1); + let value = slot.value(0); + assert_eq!(1_i32, value); + } + 1 => assert!(slot.is_null(0)), + 2 => { + let slot = slot.as_any().downcast_ref::().unwrap(); + assert!(!slot.is_null(0)); + assert_eq!(slot.len(), 1); + let value = slot.value(0); + assert_eq!(value, 3_f64); + } + 3 => { + let slot = slot.as_any().downcast_ref::().unwrap(); + assert!(!slot.is_null(0)); + assert_eq!(slot.len(), 1); + let value = slot.value(0); + assert_eq!(4_i32, value); + } + _ => unreachable!(), + } + } + + Ok(()) + } } From 19f8e8cb02d5ece6b64c5fb08d9eeac4b7293911 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Mon, 12 Dec 2022 18:08:51 +0100 Subject: [PATCH 0405/1411] feat(object_store): parse well-known storage urls (#3327) * feat(object_store): add url parsing to azure builder * feat(object_store): add url parsing to aws builder * feat(object_store): add url parsing to gcs builder * feat(object_store): parse gcs service account from env * fix: typo * docs(object_store): fix example / template urls * feat(object_store): parse S3 virtually hosted urls * refactor: raise url parsing errors on build * fix: properly set virtual_hosted_style_request in url parsing --- object_store/src/aws/mod.rs | 97 ++++++++++++++++++++++++++++- object_store/src/azure/mod.rs | 111 +++++++++++++++++++++++++++++++++- object_store/src/gcp/mod.rs | 95 +++++++++++++++++++++++++++++ 3 files changed, 300 insertions(+), 3 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index aa419d60501a..0fcfbaf9c5c2 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -42,6 +42,7 @@ use std::ops::Range; use std::sync::Arc; use tokio::io::AsyncWrite; use tracing::info; +use url::Url; 
use crate::aws::client::{S3Client, S3Config}; use crate::aws::credential::{ @@ -116,6 +117,18 @@ enum Error { #[snafu(display("Received header containing non-ASCII data"))] BadHeader { source: reqwest::header::ToStrError }, + + #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + UnableToParseUrl { + source: url::ParseError, + url: String, + }, + + #[snafu(display( + "Unknown url scheme cannot be parsed into storage location: {}", + scheme + ))] + UnknownUrlScheme { scheme: String }, } impl From for super::Error { @@ -359,6 +372,7 @@ pub struct AmazonS3Builder { metadata_endpoint: Option, profile: Option, client_options: ClientOptions, + url_parse_error: Option, } impl AmazonS3Builder { @@ -430,6 +444,67 @@ impl AmazonS3Builder { builder } + /// Parse available connection info form a well-known storage URL. + /// + /// The supported url schemes are: + /// + /// - `s3:///` + /// - `s3a:///` + /// - `https://s3..amazonaws.com` + /// - `https://.s3..amazonaws.com` + /// + /// Please note that this is a best effort implementation, and will not fail for malformed URLs, + /// but rather warn and ignore the passed url. The url also has no effect on how the + /// storage is accessed - e.g. which driver or protocol is used for reading from the location. + /// + /// # Example + /// ``` + /// use object_store::aws::AmazonS3Builder; + /// + /// let s3 = AmazonS3Builder::from_env() + /// .with_url("s3://bucket/path") + /// .build(); + /// ``` + pub fn with_url(mut self, url: impl AsRef) -> Self { + let maybe_parsed = Url::parse(url.as_ref()); + match maybe_parsed { + Ok(parsed) => match parsed.scheme() { + "s3" | "s3a" => { + self.bucket_name = parsed.host_str().map(|host| host.to_owned()); + } + "https" => { + if let Some(host) = parsed.host_str() { + let parts = host.splitn(4, '.').collect::>(); + if parts.len() == 4 && parts[0] == "s3" && parts[2] == "amazonaws" + { + self.bucket_name = Some(parts[1].to_string()); + } + if parts.len() == 4 + && parts[1] == "s3" + && parts[3] == "amazonaws.com" + { + self.bucket_name = Some(parts[0].to_string()); + self.region = Some(parts[2].to_string()); + self.virtual_hosted_style_request = true; + } + } + } + other => { + self.url_parse_error = Some(Error::UnknownUrlScheme { + scheme: other.into(), + }); + } + }, + Err(err) => { + self.url_parse_error = Some(Error::UnableToParseUrl { + source: err, + url: url.as_ref().into(), + }); + } + }; + self + } + /// Set the AWS Access Key (required) pub fn with_access_key_id(mut self, access_key_id: impl Into) -> Self { self.access_key_id = Some(access_key_id.into()); @@ -567,6 +642,10 @@ impl AmazonS3Builder { /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. pub fn build(self) -> Result { + if let Some(err) = self.url_parse_error { + return Err(err.into()); + } + let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; let region = self.region.context(MissingRegionSnafu)?; @@ -642,8 +721,8 @@ impl AmazonS3Builder { let endpoint: String; let bucket_endpoint: String; - //If `endpoint` is provided then its assumed to be consistent with - // `virutal_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then + // If `endpoint` is provided then its assumed to be consistent with + // `virtual_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then // `endpoint` should have bucket name included. 
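For `https://` URLs the builder above splits the host on `.`: a host of the form `<bucket>.s3.<region>.amazonaws.com` yields both the bucket and the region and switches the builder to virtual-hosted-style requests. A minimal sketch of that host splitting, using the same `url` crate; the bucket and region values below are placeholders for illustration:

```rust
use url::Url;

/// Extract (bucket, region) from a virtual-hosted-style S3 URL,
/// i.e. https://<bucket>.s3.<region>.amazonaws.com
fn parse_virtual_hosted_style(url: &str) -> Option<(String, String)> {
    let parsed = Url::parse(url).ok()?;
    let host = parsed.host_str()?;
    let parts: Vec<&str> = host.splitn(4, '.').collect();
    if parts.len() == 4 && parts[1] == "s3" && parts[3] == "amazonaws.com" {
        Some((parts[0].to_string(), parts[2].to_string()))
    } else {
        None
    }
}

fn main() {
    let (bucket, region) =
        parse_virtual_hosted_style("https://my-bucket.s3.eu-central-1.amazonaws.com").unwrap();
    assert_eq!(bucket, "my-bucket");
    assert_eq!(region, "eu-central-1");
}
```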
if self.virtual_hosted_style_request { endpoint = self.endpoint.unwrap_or_else(|| { @@ -940,4 +1019,18 @@ mod tests { err ); } + + #[test] + fn s3_test_urls() { + let builder = AmazonS3Builder::new().with_url("s3://bucket/path"); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + + let builder = AmazonS3Builder::new().with_url("https://s3.bucket.amazonaws.com"); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + + let builder = + AmazonS3Builder::new().with_url("https://bucket.s3.region.amazonaws.com"); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + assert_eq!(builder.region, Some("region".to_string())) + } } diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 4b7131ea85be..2cc4fe1a43ef 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -114,6 +114,12 @@ enum Error { #[snafu(display("Azure credential error: {}", source), context(false))] Credential { source: credential::Error }, + + #[snafu(display( + "Unknown url scheme cannot be parsed into storage location: {}", + scheme + ))] + UnknownUrlScheme { scheme: String }, } impl From for super::Error { @@ -361,6 +367,7 @@ pub struct MicrosoftAzureBuilder { use_emulator: bool, retry_config: RetryConfig, client_options: ClientOptions, + url_parse_error: Option, } impl Debug for MicrosoftAzureBuilder { @@ -379,7 +386,7 @@ impl MicrosoftAzureBuilder { Default::default() } - /// Create an instance of [MicrosoftAzureBuilder] with values pre-populated from environment variables. + /// Create an instance of [`MicrosoftAzureBuilder`] with values pre-populated from environment variables. /// /// Variables extracted from environment: /// * AZURE_STORAGE_ACCOUNT_NAME: storage account name @@ -424,6 +431,78 @@ impl MicrosoftAzureBuilder { builder } + /// Parse available connection info form a well-known storage URL. + /// + /// The supported url schemes are: + /// + /// - `abfs[s]:///` (according to [fsspec](https://github.com/fsspec/adlfs)) + /// - `abfs[s]://@.dfs.core.windows.net/` + /// - `az:///` (according to [fsspec](https://github.com/fsspec/adlfs)) + /// - `adl:///` (according to [fsspec](https://github.com/fsspec/adlfs)) + /// - `azure:///` (custom) + /// - `https://.dfs.core.windows.net` + /// - `https://.blob.core.windows.net` + /// + /// Please note that this is a best effort implementation, and will not fail for malformed URLs, + /// but rather warn and ignore the passed url. The url also has no effect on how the + /// storage is accessed - e.g. which driver or protocol is used for reading from the location. 
+ /// + /// # Example + /// ``` + /// use object_store::azure::MicrosoftAzureBuilder; + /// + /// let azure = MicrosoftAzureBuilder::from_env() + /// .with_url("abfss://file_system@account.dfs.core.windows.net/") + /// .build(); + /// ``` + pub fn with_url(mut self, url: impl AsRef) -> Self { + let maybe_parsed = Url::parse(url.as_ref()); + match maybe_parsed { + Ok(parsed) => match parsed.scheme() { + "az" | "adl" | "azure" => { + self.container_name = parsed.host_str().map(|host| host.to_owned()); + } + "abfs" | "abfss" => { + // abfs(s) might refer to the fsspec convention abfs:/// + // or the convention for the hadoop driver abfs[s]://@.dfs.core.windows.net/ + if parsed.username().is_empty() { + self.container_name = + parsed.host_str().map(|host| host.to_owned()); + } else if let Some(host) = parsed.host_str() { + let parts = host.splitn(2, '.').collect::>(); + if parts.len() == 2 && parts[1] == "dfs.core.windows.net" { + self.container_name = Some(parsed.username().to_owned()); + self.account_name = Some(parts[0].to_string()); + } + } + } + "https" => { + if let Some(host) = parsed.host_str() { + let parts = host.splitn(2, '.').collect::>(); + if parts.len() == 2 + && (parts[1] == "dfs.core.windows.net" + || parts[1] == "blob.core.windows.net") + { + self.account_name = Some(parts[0].to_string()); + } + } + } + other => { + self.url_parse_error = Some(Error::UnknownUrlScheme { + scheme: other.into(), + }); + } + }, + Err(err) => { + self.url_parse_error = Some(Error::UnableToParseUrl { + source: err, + url: url.as_ref().into(), + }); + } + }; + self + } + /// Set the Azure Account (required) pub fn with_account(mut self, account: impl Into) -> Self { self.account_name = Some(account.into()); @@ -529,8 +608,13 @@ impl MicrosoftAzureBuilder { retry_config, authority_host, mut client_options, + url_parse_error, } = self; + if let Some(err) = url_parse_error { + return Err(err.into()); + } + let container = container_name.ok_or(Error::MissingContainerName {})?; let (is_emulator, storage_url, auth, account) = if use_emulator { @@ -716,4 +800,29 @@ mod tests { copy_if_not_exists(&integration).await; stream_get(&integration).await; } + + #[test] + fn azure_blob_test_urls() { + let builder = MicrosoftAzureBuilder::new() + .with_url("abfss://file_system@account.dfs.core.windows.net/"); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, Some("file_system".to_string())); + + let builder = MicrosoftAzureBuilder::new().with_url("abfs://container/path"); + assert_eq!(builder.container_name, Some("container".to_string())); + + let builder = MicrosoftAzureBuilder::new().with_url("az://container"); + assert_eq!(builder.container_name, Some("container".to_string())); + + let builder = MicrosoftAzureBuilder::new().with_url("az://container/path"); + assert_eq!(builder.container_name, Some("container".to_string())); + + let builder = MicrosoftAzureBuilder::new() + .with_url("https://account.dfs.core.windows.net/"); + assert_eq!(builder.account_name, Some("account".to_string())); + + let builder = MicrosoftAzureBuilder::new() + .with_url("https://account.blob.core.windows.net/"); + assert_eq!(builder.account_name, Some("account".to_string())) + } } diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index f93cbde3d1b4..b3bd57256157 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -44,6 +44,7 @@ use reqwest::header::RANGE; use reqwest::{header, Client, Method, Response, StatusCode}; use snafu::{ResultExt, 
Snafu}; use tokio::io::AsyncWrite; +use url::Url; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; @@ -129,6 +130,18 @@ enum Error { source: crate::client::retry::Error, path: String, }, + + #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + UnableToParseUrl { + source: url::ParseError, + url: String, + }, + + #[snafu(display( + "Unknown url scheme cannot be parsed into storage location: {}", + scheme + ))] + UnknownUrlScheme { scheme: String }, } impl From for super::Error { @@ -766,6 +779,7 @@ pub struct GoogleCloudStorageBuilder { service_account_path: Option, retry_config: RetryConfig, client_options: ClientOptions, + url_parse_error: Option, } impl Default for GoogleCloudStorageBuilder { @@ -775,6 +789,7 @@ impl Default for GoogleCloudStorageBuilder { service_account_path: None, retry_config: Default::default(), client_options: ClientOptions::new().with_allow_http(true), + url_parse_error: None, } } } @@ -785,6 +800,75 @@ impl GoogleCloudStorageBuilder { Default::default() } + /// Create an instance of [`GoogleCloudStorageBuilder`] with values pre-populated from environment variables. + /// + /// Variables extracted from environment: + /// * GOOGLE_SERVICE_ACCOUNT: location of service account file + /// * SERVICE_ACCOUNT: (alias) location of service account file + /// + /// # Example + /// ``` + /// use object_store::gcp::GoogleCloudStorageBuilder; + /// + /// let azure = GoogleCloudStorageBuilder::from_env() + /// .with_bucket_name("foo") + /// .build(); + /// ``` + pub fn from_env() -> Self { + let mut builder = Self::default(); + + if let Ok(service_account_path) = std::env::var("SERVICE_ACCOUNT") { + builder.service_account_path = Some(service_account_path); + } + + if let Ok(service_account_path) = std::env::var("GOOGLE_SERVICE_ACCOUNT") { + builder.service_account_path = Some(service_account_path); + } + + builder + } + + /// Parse available connection info form a well-known storage URL. + /// + /// The supported url schemes are: + /// + /// - `gs:///` + /// + /// Please note that this is a best effort implementation, and will not fail for malformed URLs, + /// but rather warn and ignore the passed url. The url also has no effect on how the + /// storage is accessed - e.g. which driver or protocol is used for reading from the location. 
+ /// + /// # Example + /// ``` + /// use object_store::gcp::GoogleCloudStorageBuilder; + /// + /// let gcs = GoogleCloudStorageBuilder::from_env() + /// .with_url("gs://bucket/path") + /// .build(); + /// ``` + pub fn with_url(mut self, url: impl AsRef) -> Self { + let maybe_parsed = Url::parse(url.as_ref()); + match maybe_parsed { + Ok(parsed) => match parsed.scheme() { + "gs" => { + self.bucket_name = parsed.host_str().map(|host| host.to_owned()); + } + other => { + self.url_parse_error = Some(Error::UnknownUrlScheme { + scheme: other.into(), + }); + } + }, + Err(err) => { + self.url_parse_error = Some(Error::UnableToParseUrl { + source: err, + url: url.as_ref().into(), + }); + } + }; + self + } + /// Set the bucket name (required) pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { self.bucket_name = Some(bucket_name.into()); @@ -838,8 +922,13 @@ impl GoogleCloudStorageBuilder { service_account_path, retry_config, client_options, + url_parse_error, } = self; + if let Some(err) = url_parse_error { + return Err(err.into()); + } + let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?; let service_account_path = service_account_path.ok_or(Error::MissingServiceAccountPath)?; @@ -1095,4 +1184,10 @@ mod test { err ); } + + #[test] + fn gcs_test_urls() { + let builder = GoogleCloudStorageBuilder::new().with_url("gs://bucket/path"); + assert_eq!(builder.bucket_name, Some("bucket".to_string())) + } } From b2a12836d32f9c3579fdf6e0f32f0a7e6eed4d30 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 12 Dec 2022 19:08:01 +0000 Subject: [PATCH 0406/1411] Update base64 to 0.20 (#3335) * Update base64 to 0.20 * Fix object_store --- arrow-flight/Cargo.toml | 2 +- object_store/Cargo.toml | 2 +- object_store/src/gcp/credential.rs | 10 ++++++++-- parquet/Cargo.toml | 4 ++-- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 80ca172e82f2..d622e41490a5 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -31,7 +31,7 @@ arrow-array = { version = "29.0.0", path = "../arrow-array" } arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } arrow-ipc = { version = "29.0.0", path = "../arrow-ipc" } arrow-schema = { version = "29.0.0", path = "../arrow-schema" } -base64 = { version = "0.13", default-features = false } +base64 = { version = "0.20", default-features = false, features = ["std"] } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } prost = { version = "0.11", default-features = false } diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index f37831516b95..a662a810f8b1 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -43,7 +43,7 @@ url = "2.2" walkdir = "2" # Cloud storage support -base64 = { version = "0.13", default-features = false, optional = true } +base64 = { version = "0.20", default-features = false, features = ["std"], optional = true } quick-xml = { version = "0.26.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index 5b8cdb8480b4..a2a98a39be33 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ 
-18,11 +18,17 @@ use crate::client::retry::RetryExt; use crate::client::token::TemporaryToken; use crate::RetryConfig; +use base64::engine::fast_portable::FastPortable; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use snafu::{ResultExt, Snafu}; use std::time::{Duration, Instant}; +const URL_SAFE_NO_PAD: FastPortable = FastPortable::from( + &base64::alphabet::URL_SAFE, + base64::engine::fast_portable::NO_PAD, +); + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("No RSA key found in pem file"))] @@ -166,7 +172,7 @@ impl OAuthProvider { ) .context(SignSnafu)?; - let signature = base64::encode_config(&sig_bytes, base64::URL_SAFE_NO_PAD); + let signature = base64::encode_engine(&sig_bytes, &URL_SAFE_NO_PAD); let jwt = [message, signature].join("."); let body = [ @@ -218,5 +224,5 @@ fn decode_first_rsa_key(private_key_pem: String) -> Result { fn b64_encode_obj(obj: &T) -> Result { let string = serde_json::to_string(obj).context(EncodeSnafu)?; - Ok(base64::encode_config(string, base64::URL_SAFE_NO_PAD)) + Ok(base64::encode_engine(string, &URL_SAFE_NO_PAD)) } diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index b13977eff3a4..cde46b98b214 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -54,7 +54,7 @@ zstd = { version = "0.12.0", optional = true, default-features = false } chrono = { version = "0.4.23", default-features = false, features = ["alloc"] } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } -base64 = { version = "0.13", default-features = false, features = ["std"], optional = true } +base64 = { version = "0.20", default-features = false, features = ["std", ], optional = true } clap = { version = "4", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } @@ -66,7 +66,7 @@ twox-hash = { version = "1.6", default-features = false } paste = { version = "1.0" } [dev-dependencies] -base64 = { version = "0.13", default-features = false, features = ["std"] } +base64 = { version = "0.20", default-features = false, features = ["std"] } criterion = { version = "0.4", default-features = false } snap = { version = "1.0", default-features = false } tempfile = { version = "3.0", default-features = false } From a973d39a66bc04d1cd1928664a91c0cb474e9c03 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 12 Dec 2022 19:08:27 +0000 Subject: [PATCH 0407/1411] Update prost-build 0.11.4 (#3334) --- arrow-flight/Cargo.toml | 2 +- arrow-flight/src/arrow.flight.protocol.rs | 16 ++++++++++++++++ .../src/sql/arrow.flight.protocol.sql.rs | 19 +++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index d622e41490a5..938e889f75e8 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -54,7 +54,7 @@ tower = "0.4.13" # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.47", default-features = false } -prost-build = { version = "=0.11.3", default-features = false } +prost-build = { version = "=0.11.4", default-features = false } tonic-build = { version = "=0.8.4", default-features = false, features = 
["transport", "prost"] } [[example]] diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs index e6754e806e06..dc0c4609b5a3 100644 --- a/arrow-flight/src/arrow.flight.protocol.rs +++ b/arrow-flight/src/arrow.flight.protocol.rs @@ -2,6 +2,7 @@ /// /// The request that a client provides to a server on handshake. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct HandshakeRequest { /// @@ -13,6 +14,7 @@ pub struct HandshakeRequest { #[prost(bytes = "vec", tag = "2")] pub payload: ::prost::alloc::vec::Vec, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct HandshakeResponse { /// @@ -26,6 +28,7 @@ pub struct HandshakeResponse { } /// /// A message for doing simple auth. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct BasicAuth { #[prost(string, tag = "2")] @@ -33,11 +36,13 @@ pub struct BasicAuth { #[prost(string, tag = "3")] pub password: ::prost::alloc::string::String, } +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct Empty {} /// /// Describes an available action, including both the name used for execution /// along with a short description of the purpose of the action. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionType { #[prost(string, tag = "1")] @@ -48,6 +53,7 @@ pub struct ActionType { /// /// A service specific expression that can be used to return a limited set /// of available Arrow Flight streams. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct Criteria { #[prost(bytes = "vec", tag = "1")] @@ -55,6 +61,7 @@ pub struct Criteria { } /// /// An opaque action specific for the service. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct Action { #[prost(string, tag = "1")] @@ -64,6 +71,7 @@ pub struct Action { } /// /// An opaque result returned after executing an action. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct Result { #[prost(bytes = "vec", tag = "1")] @@ -71,6 +79,7 @@ pub struct Result { } /// /// Wrap the result of a getSchema call +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct SchemaResult { /// The schema of the dataset in its IPC form: @@ -83,6 +92,7 @@ pub struct SchemaResult { /// /// The name or tag for a Flight. May be used as a way to retrieve or generate /// a flight or be used to expose a set of previously defined flights. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct FlightDescriptor { #[prost(enumeration = "flight_descriptor::DescriptorType", tag = "1")] @@ -143,6 +153,7 @@ pub mod flight_descriptor { /// /// The access coordinates for retrieval of a dataset. With a FlightInfo, a /// consumer is able to determine how to retrieve a dataset. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct FlightInfo { /// The schema of the dataset in its IPC form: @@ -175,6 +186,7 @@ pub struct FlightInfo { } /// /// A particular stream or split associated with a flight. 
+#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct FlightEndpoint { /// @@ -202,6 +214,7 @@ pub struct FlightEndpoint { /// /// A location where a Flight service will accept retrieval of a particular /// stream given a ticket. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct Location { #[prost(string, tag = "1")] @@ -213,6 +226,7 @@ pub struct Location { /// /// Tickets are meant to be single use. It is an error/application-defined /// behavior to reuse a ticket. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct Ticket { #[prost(bytes = "vec", tag = "1")] @@ -220,6 +234,7 @@ pub struct Ticket { } /// /// A batch of Arrow data as part of a stream of batches. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct FlightData { /// @@ -245,6 +260,7 @@ pub struct FlightData { } /// * /// The response message associated with the submission of a DoPut. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct PutResult { #[prost(bytes = "vec", tag = "1")] diff --git a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs index 0fd003e1154d..5fc091427300 100644 --- a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs +++ b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs @@ -18,6 +18,7 @@ /// int32_to_int32_list_map: map> /// > /// where there is one row per requested piece of metadata information. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetSqlInfo { /// @@ -98,6 +99,7 @@ pub struct CommandGetSqlInfo { /// is only relevant to be used by ODBC). /// > /// The returned data should be ordered by data_type and then by type_name. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetXdbcTypeInfo { /// @@ -117,6 +119,7 @@ pub struct CommandGetXdbcTypeInfo { /// catalog_name: utf8 not null /// > /// The returned data should be ordered by catalog_name. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetCatalogs {} /// @@ -132,6 +135,7 @@ pub struct CommandGetCatalogs {} /// db_schema_name: utf8 not null /// > /// The returned data should be ordered by catalog_name, then db_schema_name. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetDbSchemas { /// @@ -176,6 +180,7 @@ pub struct CommandGetDbSchemas { /// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. /// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. /// The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetTables { /// @@ -225,6 +230,7 @@ pub struct CommandGetTables { /// table_type: utf8 not null /// > /// The returned data should be ordered by table_type. 
+#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetTableTypes {} /// @@ -243,6 +249,7 @@ pub struct CommandGetTableTypes {} /// key_sequence: int not null /// > /// The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetPrimaryKeys { /// @@ -286,6 +293,7 @@ pub struct CommandGetPrimaryKeys { /// > /// The returned data should be ordered by fk_catalog_name, fk_db_schema_name, fk_table_name, fk_key_name, then key_sequence. /// update_rule and delete_rule returns a byte that is equivalent to actions declared on UpdateDeleteRules enum. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetExportedKeys { /// @@ -333,6 +341,7 @@ pub struct CommandGetExportedKeys { /// - 2 = SET NULL /// - 3 = NO ACTION /// - 4 = SET DEFAULT +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetImportedKeys { /// @@ -382,6 +391,7 @@ pub struct CommandGetImportedKeys { /// - 2 = SET NULL /// - 3 = NO ACTION /// - 4 = SET DEFAULT +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetCrossReference { /// * @@ -419,6 +429,7 @@ pub struct CommandGetCrossReference { } /// /// Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionCreatePreparedStatementRequest { /// The valid SQL string to create a prepared statement for. @@ -431,6 +442,7 @@ pub struct ActionCreatePreparedStatementRequest { /// The resultant PreparedStatement can be closed either: /// - Manually, through the "ClosePreparedStatement" action; /// - Automatically, by a server timeout. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionCreatePreparedStatementResult { /// Opaque handle for the prepared statement on the server. @@ -448,6 +460,7 @@ pub struct ActionCreatePreparedStatementResult { /// /// Request message for the "ClosePreparedStatement" action on a Flight SQL enabled backend. /// Closes server resources associated with the prepared statement handle. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionClosePreparedStatementRequest { /// Opaque handle for the prepared statement on the server. @@ -470,6 +483,7 @@ pub struct ActionClosePreparedStatementRequest { /// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. /// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. /// - GetFlightInfo: execute the query. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandStatementQuery { /// The SQL syntax. @@ -479,6 +493,7 @@ pub struct CommandStatementQuery { /// * /// Represents a ticket resulting from GetFlightInfo with a CommandStatementQuery. /// This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this. 
+#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct TicketStatementQuery { /// Unique identifier for the instance of the statement to execute. @@ -502,6 +517,7 @@ pub struct TicketStatementQuery { /// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. /// - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. /// - GetFlightInfo: execute the prepared statement instance. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandPreparedStatementQuery { /// Opaque handle for the prepared statement on the server. @@ -511,6 +527,7 @@ pub struct CommandPreparedStatementQuery { /// /// Represents a SQL update query. Used in the command member of FlightDescriptor /// for the the RPC call DoPut to cause the server to execute the included SQL update. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandStatementUpdate { /// The SQL syntax. @@ -521,6 +538,7 @@ pub struct CommandStatementUpdate { /// Represents a SQL update query. Used in the command member of FlightDescriptor /// for the the RPC call DoPut to cause the server to execute the included /// prepared statement handle as an update. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandPreparedStatementUpdate { /// Opaque handle for the prepared statement on the server. @@ -531,6 +549,7 @@ pub struct CommandPreparedStatementUpdate { /// Returned from the RPC call DoPut when a CommandStatementUpdate /// CommandPreparedStatementUpdate was in the request, containing /// results from the update. +#[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct DoPutUpdateResult { /// The number of records updated. A return value of -1 represents From 2ad0705a52bab77a22190b8c75f1d8662fd49e45 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 12 Dec 2022 13:31:29 -0800 Subject: [PATCH 0408/1411] Support casting from String to Decimal (#3281) * Support casting from string to decimal * Fix clippy * Fix * Trigger Build * Add more test coverage, add Null value. --- arrow-buffer/src/bigint.rs | 15 +- arrow-cast/src/cast.rs | 520 ++++++++++++++++++++++++++++++++++++- 2 files changed, 533 insertions(+), 2 deletions(-) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index cfe14fb39f43..fc360657cb54 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -16,7 +16,7 @@ // under the License. use num::cast::AsPrimitive; -use num::{BigInt, FromPrimitive, ToPrimitive}; +use num::{BigInt, FromPrimitive, Num, ToPrimitive}; use std::cmp::Ordering; /// A signed 256-bit integer @@ -102,6 +102,19 @@ impl i256 { Self::from_parts(v as u128, v >> 127) } + /// Create an integer value from its representation as string. + #[inline] + pub fn from_string(value_str: &str) -> Option { + let numbers = BigInt::from_str_radix(value_str, 10).ok()?; + let (integer, overflow) = Self::from_bigint_with_overflow(numbers); + + if overflow { + None + } else { + Some(integer) + } + } + /// Create an optional i256 from the provided `f64`. 
Returning `None` /// if overflow occurred pub fn from_f64(v: f64) -> Option { diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index ebdefb18e9ed..8bd71245ca7e 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -125,7 +125,6 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (List(_), _) => false, (_, List(list_to)) => can_cast_types(from_type, list_to.data_type()), (_, LargeList(list_to)) => can_cast_types(from_type, list_to.data_type()), - // TODO UTF8 to decimal // cast one decimal type to another decimal type (Decimal128(_, _), Decimal128(_, _)) => true, (Decimal256(_, _), Decimal256(_, _)) => true, @@ -143,6 +142,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { // decimal to signed numeric (Decimal128(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) | (Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) => true, + // Utf8 to decimal + (Utf8 | LargeUtf8, Decimal128(_, _)) => true, + (Utf8 | LargeUtf8, Decimal256(_, _)) => true, (Decimal128(_, _), _) => false, (_, Decimal128(_, _)) => false, (Decimal256(_, _), _) => false, @@ -945,6 +947,18 @@ pub fn cast_with_options( *scale, cast_options, ), + Utf8 => cast_string_to_decimal::( + array, + *precision, + *scale, + cast_options, + ), + LargeUtf8 => cast_string_to_decimal::( + array, + *precision, + *scale, + cast_options, + ), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( "Casting from {:?} to {:?} not supported", @@ -1023,6 +1037,18 @@ pub fn cast_with_options( *scale, cast_options, ), + Utf8 => cast_string_to_decimal::( + array, + *precision, + *scale, + cast_options, + ), + LargeUtf8 => cast_string_to_decimal::( + array, + *precision, + *scale, + cast_options, + ), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( "Casting from {:?} to {:?} not supported", @@ -2829,6 +2855,176 @@ fn cast_utf8_to_boolean( Ok(Arc::new(output_array)) } +/// Parses given string to specified decimal native (i128/i256) based on given +/// scale. Returns an `Err` if it cannot parse given string. 
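// Illustrative sketch (not from the patch): the scale adjustment the function
// below performs, shown on plain `i128` and restricted to non-negative input to
// keep it short. "123.4567" at scale 2 keeps two fractional digits and rounds
// the remainder half-up: 123.4567 -> 12346, i.e. 123.46 once scale is applied.
fn sketch_parse_decimal_i128(value_str: &str, scale: u32) -> Option<i128> {
    let (int_part, frac_part) = match value_str.split_once('.') {
        Some((i, f)) => (i, f),
        None => (value_str, ""),
    };
    let integers: i128 = if int_part.is_empty() { 0 } else { int_part.parse().ok()? };
    let base = 10i128.checked_pow(scale)?;
    if frac_part.len() as u32 <= scale {
        // Pad the fraction on the right up to `scale` digits.
        let padded = format!("{:0<width$}", frac_part, width = scale as usize);
        let fraction: i128 = if padded.is_empty() { 0 } else { padded.parse().ok()? };
        Some(integers * base + fraction)
    } else {
        // Keep `scale` digits of the fraction and round on what is dropped.
        let fraction: i128 = frac_part.parse().ok()?;
        let div = 10i128.checked_pow(frac_part.len() as u32 - scale)?;
        let (d, r) = (fraction / div, fraction % div);
        let rounded = if r * 2 >= div { d + 1 } else { d };
        Some(integers * base + rounded)
    }
}
// sketch_parse_decimal_i128("123.4567", 2) == Some(12346)
// sketch_parse_decimal_i128(".12345", 3)   == Some(123)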
+fn parse_string_to_decimal_native( + value_str: &str, + scale: usize, +) -> Result +where + T::Native: DecimalCast + ArrowNativeTypeOp, +{ + let value_str = value_str.trim(); + let parts: Vec<&str> = value_str.split('.').collect(); + if parts.len() > 2 { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid decimal format: {:?}", + value_str + ))); + } + + let integers = parts[0].trim_start_matches('0'); + let decimals = if parts.len() == 2 { parts[1] } else { "" }; + + // Adjust decimal based on scale + let number_decimals = if decimals.len() > scale { + let decimal_number = i256::from_string(decimals).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Cannot parse decimal format: {}", + value_str + )) + })?; + + let div = + i256::from_i128(10_i128).pow_checked((decimals.len() - scale) as u32)?; + + let half = div.div_wrapping(i256::from_i128(2)); + let half_neg = half.neg_wrapping(); + + let d = decimal_number.div_wrapping(div); + let r = decimal_number.mod_wrapping(div); + + // Round result + let adjusted = match decimal_number >= i256::ZERO { + true if r >= half => d.add_wrapping(i256::ONE), + false if r <= half_neg => d.sub_wrapping(i256::ONE), + _ => d, + }; + + let integers = if !integers.is_empty() { + i256::from_string(integers) + .ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Cannot parse decimal format: {}", + value_str + )) + }) + .map(|v| { + v.mul_wrapping(i256::from_i128(10_i128).pow_wrapping(scale as u32)) + })? + } else { + i256::ZERO + }; + + format!("{}", integers.add_wrapping(adjusted)) + } else { + let padding = if scale > decimals.len() { scale } else { 0 }; + + let decimals = format!("{:0( + from: &GenericStringArray, + precision: u8, + scale: i8, + cast_options: &CastOptions, +) -> Result, ArrowError> +where + T: DecimalType, + T::Native: DecimalCast + ArrowNativeTypeOp, +{ + if cast_options.safe { + let iter = from.iter().map(|v| { + v.and_then(|v| parse_string_to_decimal_native::(v, scale as usize).ok()) + }); + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + Ok(unsafe { + PrimitiveArray::::from_trusted_len_iter(iter) + .with_precision_and_scale(precision, scale)? + }) + } else { + let vec = from + .iter() + .map(|v| { + v.map(|v| { + parse_string_to_decimal_native::(v, scale as usize).map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast string '{}' to value of {:?} type", + v, + T::DATA_TYPE, + )) + }) + }) + .transpose() + }) + .collect::, _>>()?; + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + Ok(unsafe { + PrimitiveArray::::from_trusted_len_iter(vec.iter()) + .with_precision_and_scale(precision, scale)? 
+ }) + } +} + +/// Cast Utf8 to decimal +fn cast_string_to_decimal( + from: &ArrayRef, + precision: u8, + scale: i8, + cast_options: &CastOptions, +) -> Result +where + T: DecimalType, + T::Native: DecimalCast + ArrowNativeTypeOp, +{ + if scale < 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "Cannot cast string to decimal with negative scale {}", + scale + ))); + } + + if scale > T::MAX_SCALE { + return Err(ArrowError::InvalidArgumentError(format!( + "Cannot cast string to decimal greater than maximum scale {}", + T::MAX_SCALE + ))); + } + + Ok(Arc::new(string_to_decimal_cast::( + from.as_any() + .downcast_ref::>() + .unwrap(), + precision, + scale, + cast_options, + )?)) +} + /// Cast numeric types to Boolean /// /// Any zero value returns `false` while non-zero returns `true` @@ -7176,4 +7372,326 @@ mod tests { ] ); } + + #[test] + fn test_parse_string_to_decimal() { + assert_eq!( + Decimal128Type::format_decimal( + parse_string_to_decimal_native::("123.45", 2).unwrap(), + 38, + 2, + ), + "123.45" + ); + assert_eq!( + Decimal128Type::format_decimal( + parse_string_to_decimal_native::("12345", 2).unwrap(), + 38, + 2 + ), + "12345.00" + ); + assert_eq!( + Decimal128Type::format_decimal( + parse_string_to_decimal_native::("0.12345", 2).unwrap(), + 38, + 2 + ), + "0.12" + ); + assert_eq!( + Decimal128Type::format_decimal( + parse_string_to_decimal_native::(".12345", 2).unwrap(), + 38, + 2 + ), + "0.12" + ); + assert_eq!( + Decimal128Type::format_decimal( + parse_string_to_decimal_native::(".1265", 2).unwrap(), + 38, + 2 + ), + "0.13" + ); + assert_eq!( + Decimal128Type::format_decimal( + parse_string_to_decimal_native::(".1265", 2).unwrap(), + 38, + 2 + ), + "0.13" + ); + + assert_eq!( + Decimal256Type::format_decimal( + parse_string_to_decimal_native::("123.45", 3).unwrap(), + 38, + 3 + ), + "123.450" + ); + assert_eq!( + Decimal256Type::format_decimal( + parse_string_to_decimal_native::("12345", 3).unwrap(), + 38, + 3 + ), + "12345.000" + ); + assert_eq!( + Decimal256Type::format_decimal( + parse_string_to_decimal_native::("0.12345", 3).unwrap(), + 38, + 3 + ), + "0.123" + ); + assert_eq!( + Decimal256Type::format_decimal( + parse_string_to_decimal_native::(".12345", 3).unwrap(), + 38, + 3 + ), + "0.123" + ); + assert_eq!( + Decimal256Type::format_decimal( + parse_string_to_decimal_native::(".1265", 3).unwrap(), + 38, + 3 + ), + "0.127" + ); + } + + fn test_cast_string_to_decimal(array: ArrayRef) { + // Decimal128 + let output_type = DataType::Decimal128(38, 2); + assert!(can_cast_types(array.data_type(), &output_type)); + + let casted_array = cast(&array, &output_type).unwrap(); + let decimal_arr = as_primitive_array::(&casted_array); + + assert_eq!("123.45", decimal_arr.value_as_string(0)); + assert_eq!("1.23", decimal_arr.value_as_string(1)); + assert_eq!("0.12", decimal_arr.value_as_string(2)); + assert_eq!("0.13", decimal_arr.value_as_string(3)); + assert_eq!("1.26", decimal_arr.value_as_string(4)); + assert_eq!("12345.00", decimal_arr.value_as_string(5)); + assert_eq!("12345.00", decimal_arr.value_as_string(6)); + assert_eq!("0.12", decimal_arr.value_as_string(7)); + assert_eq!("12.23", decimal_arr.value_as_string(8)); + assert!(decimal_arr.is_null(9)); + assert_eq!("0.00", decimal_arr.value_as_string(10)); + assert_eq!("0.00", decimal_arr.value_as_string(11)); + assert!(decimal_arr.is_null(12)); + + // Decimal256 + let output_type = DataType::Decimal256(76, 3); + assert!(can_cast_types(array.data_type(), &output_type)); + + let casted_array = cast(&array, 
&output_type).unwrap(); + let decimal_arr = as_primitive_array::(&casted_array); + + assert_eq!("123.450", decimal_arr.value_as_string(0)); + assert_eq!("1.235", decimal_arr.value_as_string(1)); + assert_eq!("0.123", decimal_arr.value_as_string(2)); + assert_eq!("0.127", decimal_arr.value_as_string(3)); + assert_eq!("1.263", decimal_arr.value_as_string(4)); + assert_eq!("12345.000", decimal_arr.value_as_string(5)); + assert_eq!("12345.000", decimal_arr.value_as_string(6)); + assert_eq!("0.123", decimal_arr.value_as_string(7)); + assert_eq!("12.234", decimal_arr.value_as_string(8)); + assert!(decimal_arr.is_null(9)); + assert_eq!("0.000", decimal_arr.value_as_string(10)); + assert_eq!("0.000", decimal_arr.value_as_string(11)); + assert!(decimal_arr.is_null(12)); + } + + #[test] + fn test_cast_utf8_to_decimal() { + let str_array = StringArray::from(vec![ + Some("123.45"), + Some("1.2345"), + Some("0.12345"), + Some("0.1267"), + Some("1.263"), + Some("12345.0"), + Some("12345"), + Some("000.123"), + Some("12.234000"), + None, + Some(""), + Some(" "), + None, + ]); + let array = Arc::new(str_array) as ArrayRef; + + test_cast_string_to_decimal(array); + } + + #[test] + fn test_cast_large_utf8_to_decimal() { + let str_array = LargeStringArray::from(vec![ + Some("123.45"), + Some("1.2345"), + Some("0.12345"), + Some("0.1267"), + Some("1.263"), + Some("12345.0"), + Some("12345"), + Some("000.123"), + Some("12.234000"), + None, + Some(""), + Some(" "), + None, + ]); + let array = Arc::new(str_array) as ArrayRef; + + test_cast_string_to_decimal(array); + } + + #[test] + fn test_cast_invalid_utf8_to_decimal() { + let str_array = StringArray::from(vec!["4.4.5", ". 0.123"]); + let array = Arc::new(str_array) as ArrayRef; + + // Safe cast + let output_type = DataType::Decimal128(38, 2); + let casted_array = cast(&array, &output_type).unwrap(); + assert!(casted_array.is_null(0)); + assert!(casted_array.is_null(1)); + + let output_type = DataType::Decimal256(76, 2); + let casted_array = cast(&array, &output_type).unwrap(); + assert!(casted_array.is_null(0)); + assert!(casted_array.is_null(1)); + + // Non-safe cast + let output_type = DataType::Decimal128(38, 2); + let str_array = StringArray::from(vec!["4.4.5"]); + let array = Arc::new(str_array) as ArrayRef; + let option = CastOptions { safe: false }; + let casted_err = cast_with_options(&array, &output_type, &option).unwrap_err(); + assert!(casted_err + .to_string() + .contains("Cannot cast string '4.4.5' to value of Decimal128(38, 10) type")); + + let str_array = StringArray::from(vec![". 0.123"]); + let array = Arc::new(str_array) as ArrayRef; + let casted_err = cast_with_options(&array, &output_type, &option).unwrap_err(); + assert!(casted_err.to_string().contains( + "Cannot cast string '. 
0.123' to value of Decimal128(38, 10) type" + )); + } + + fn test_cast_string_to_decimal128_overflow(overflow_array: ArrayRef) { + let output_type = DataType::Decimal128(38, 2); + let casted_array = cast(&overflow_array, &output_type).unwrap(); + let decimal_arr = as_primitive_array::(&casted_array); + + assert!(decimal_arr.is_null(0)); + assert!(decimal_arr.is_null(1)); + assert!(decimal_arr.is_null(2)); + assert_eq!( + "999999999999999999999999999999999999.99", + decimal_arr.value_as_string(3) + ); + assert_eq!( + "100000000000000000000000000000000000.00", + decimal_arr.value_as_string(4) + ); + } + + #[test] + fn test_cast_utf8_to_decimal128_overflow() { + let overflow_str_array = StringArray::from(vec![ + i128::MAX.to_string(), + i128::MIN.to_string(), + "99999999999999999999999999999999999999".to_string(), + "999999999999999999999999999999999999.99".to_string(), + "99999999999999999999999999999999999.999".to_string(), + ]); + let overflow_array = Arc::new(overflow_str_array) as ArrayRef; + + test_cast_string_to_decimal128_overflow(overflow_array); + } + + #[test] + fn test_cast_large_utf8_to_decimal128_overflow() { + let overflow_str_array = LargeStringArray::from(vec![ + i128::MAX.to_string(), + i128::MIN.to_string(), + "99999999999999999999999999999999999999".to_string(), + "999999999999999999999999999999999999.99".to_string(), + "99999999999999999999999999999999999.999".to_string(), + ]); + let overflow_array = Arc::new(overflow_str_array) as ArrayRef; + + test_cast_string_to_decimal128_overflow(overflow_array); + } + + fn test_cast_string_to_decimal256_overflow(overflow_array: ArrayRef) { + let output_type = DataType::Decimal256(76, 2); + let casted_array = cast(&overflow_array, &output_type).unwrap(); + let decimal_arr = as_primitive_array::(&casted_array); + + assert_eq!( + "170141183460469231731687303715884105727.00", + decimal_arr.value_as_string(0) + ); + assert_eq!( + "-170141183460469231731687303715884105728.00", + decimal_arr.value_as_string(1) + ); + assert_eq!( + "99999999999999999999999999999999999999.00", + decimal_arr.value_as_string(2) + ); + assert_eq!( + "999999999999999999999999999999999999.99", + decimal_arr.value_as_string(3) + ); + assert_eq!( + "100000000000000000000000000000000000.00", + decimal_arr.value_as_string(4) + ); + assert!(decimal_arr.is_null(5)); + assert!(decimal_arr.is_null(6)); + } + + #[test] + fn test_cast_utf8_to_decimal256_overflow() { + let overflow_str_array = StringArray::from(vec![ + i128::MAX.to_string(), + i128::MIN.to_string(), + "99999999999999999999999999999999999999".to_string(), + "999999999999999999999999999999999999.99".to_string(), + "99999999999999999999999999999999999.999".to_string(), + i256::MAX.to_string(), + i256::MIN.to_string(), + ]); + let overflow_array = Arc::new(overflow_str_array) as ArrayRef; + + test_cast_string_to_decimal256_overflow(overflow_array); + } + + #[test] + fn test_cast_large_utf8_to_decimal256_overflow() { + let overflow_str_array = LargeStringArray::from(vec![ + i128::MAX.to_string(), + i128::MIN.to_string(), + "99999999999999999999999999999999999999".to_string(), + "999999999999999999999999999999999999.99".to_string(), + "99999999999999999999999999999999999.999".to_string(), + i256::MAX.to_string(), + i256::MIN.to_string(), + ]); + let overflow_array = Arc::new(overflow_str_array) as ArrayRef; + + test_cast_string_to_decimal256_overflow(overflow_array); + } } From 31d5706ee35d1e3d8fe6e751f75e6b8aeac1e0a4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 13 Dec 2022 03:45:00 -0500 Subject: 
[PATCH 0409/1411] Minor: Update release instructions for new crates (#3337) --- dev/release/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev/release/README.md b/dev/release/README.md index e1bbd24a5dca..75849641d8b5 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -263,6 +263,8 @@ Rust Arrow Crates: (cd arrow-ipc && cargo publish) (cd arrow-csv && cargo publish) (cd arrow-json && cargo publish) +(cd arrow-ord && cargo publish) +(cd arrow-string && cargo publish) (cd arrow && cargo publish) (cd arrow-flight && cargo publish) (cd parquet && cargo publish) From 46b2848596216afc3bb09882b7004dbf1add57ed Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 13 Dec 2022 08:48:00 +0000 Subject: [PATCH 0410/1411] Experiment (#3333) --- parquet/src/bloom_filter/mod.rs | 120 +++++++++++++++++++++----------- 1 file changed, 81 insertions(+), 39 deletions(-) diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index 9334fbd7a05c..e6742aefc3cd 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -49,38 +49,82 @@ const SALT: [u32; 8] = [ /// Each block is 256 bits, broken up into eight contiguous "words", each consisting of 32 bits. /// Each word is thought of as an array of bits; each bit is either "set" or "not set". -type Block = [u32; 8]; - -/// takes as its argument a single unsigned 32-bit integer and returns a block in which each -/// word has exactly one bit set. -fn mask(x: u32) -> Block { - let mut result = [0_u32; 8]; - for i in 0..8 { - // wrapping instead of checking for overflow - let y = x.wrapping_mul(SALT[i]); - let y = y >> 27; - result[i] = 1 << y; +#[derive(Debug, Copy, Clone)] +struct Block([u32; 8]); +impl Block { + const ZERO: Block = Block([0; 8]); + + /// takes as its argument a single unsigned 32-bit integer and returns a block in which each + /// word has exactly one bit set. + fn mask(x: u32) -> Self { + let mut result = [0_u32; 8]; + for i in 0..8 { + // wrapping instead of checking for overflow + let y = x.wrapping_mul(SALT[i]); + let y = y >> 27; + result[i] = 1 << y; + } + Self(result) + } + + #[inline] + #[cfg(target_endian = "little")] + fn to_le_bytes(self) -> [u8; 32] { + self.to_ne_bytes() + } + + #[inline] + #[cfg(not(target_endian = "little"))] + fn to_le_bytes(self) -> [u8; 32] { + self.swap_bytes().to_ne_bytes() + } + + #[inline] + fn to_ne_bytes(self) -> [u8; 32] { + unsafe { std::mem::transmute(self) } + } + + #[inline] + #[cfg(not(target_endian = "little"))] + fn swap_bytes(mut self) -> Self { + self.0.iter_mut().for_each(|x| *x = x.swap_bytes()); + self + } + + /// setting every bit in the block that was also set in the result from mask + fn insert(&mut self, hash: u32) { + let mask = Self::mask(hash); + for i in 0..8 { + self[i] |= mask[i]; + } + } + + /// returns true when every bit that is set in the result of mask is also set in the block. 
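// Illustrative sketch (not from the patch): what `mask` produces. Each of the
// eight SALT multipliers hashes the 32-bit input down to a 5-bit value (the top
// five bits after the wrapping multiply), so every word of the mask has exactly
// one of its 32 bits set; membership is then a per-word AND against the block.
fn sketch_mask_word(x: u32, salt: u32) -> u32 {
    let bit = x.wrapping_mul(salt) >> 27; // in 0..=31
    1 << bit
}
// A block "contains" a hash when block[i] & sketch_mask_word(hash, SALT[i]) != 0
// for every word i, which is why lookups can return false positives but never
// false negatives.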
+ fn check(&self, hash: u32) -> bool { + let mask = Self::mask(hash); + for i in 0..8 { + if self[i] & mask[i] == 0 { + return false; + } + } + true } - result } -/// setting every bit in the block that was also set in the result from mask -fn block_insert(block: &mut Block, hash: u32) { - let mask = mask(hash); - for i in 0..8 { - block[i] |= mask[i]; +impl std::ops::Index for Block { + type Output = u32; + + #[inline] + fn index(&self, index: usize) -> &Self::Output { + self.0.index(index) } } -/// returns true when every bit that is set in the result of mask is also set in the block. -fn block_check(block: &Block, hash: u32) -> bool { - let mask = mask(hash); - for i in 0..8 { - if block[i] & mask[i] == 0 { - return false; - } +impl std::ops::IndexMut for Block { + #[inline] + fn index_mut(&mut self, index: usize) -> &mut Self::Output { + self.0.index_mut(index) } - true } /// A split block Bloom filter. The creation of this structure is based on the @@ -166,7 +210,7 @@ impl Sbbf { let data = bitset .chunks_exact(4 * 8) .map(|chunk| { - let mut block = [0_u32; 8]; + let mut block = Block::ZERO; for (i, word) in chunk.chunks_exact(4).enumerate() { block[i] = u32::from_le_bytes(word.try_into().unwrap()); } @@ -194,14 +238,14 @@ impl Sbbf { /// Write the bitset in serialized form to the writer. fn write_bitset(&self, mut writer: W) -> Result<(), ParquetError> { for block in &self.0 { - for word in block { - writer.write_all(&word.to_le_bytes()).map_err(|e| { + writer + .write_all(block.to_le_bytes().as_slice()) + .map_err(|e| { ParquetError::General(format!( "Could not write bloom filter bit set: {}", e )) })?; - } } Ok(()) } @@ -271,8 +315,7 @@ impl Sbbf { /// Insert a hash into the filter fn insert_hash(&mut self, hash: u64) { let block_index = self.hash_to_block_index(hash); - let block = &mut self.0[block_index]; - block_insert(block, hash as u32); + self.0[block_index].insert(hash as u32) } /// Check if an [AsBytes] value is probably present or definitely absent in the filter @@ -285,8 +328,7 @@ impl Sbbf { /// but will always return false if a hash has not been inserted. 
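// Illustrative sketch (not from the patch): how the 64-bit hash is split, as in
// the Parquet split-block bloom filter spec. The upper 32 bits pick the block
// (scaled into the block count without a modulo) and the lower 32 bits drive
// the per-block mask shown above.
fn sketch_block_index(hash: u64, num_blocks: usize) -> usize {
    (((hash >> 32).wrapping_mul(num_blocks as u64)) >> 32) as usize
}
// sketch_block_index(u64::MAX, 1_000) == 999; sketch_block_index(0, 1_000) == 0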
fn check_hash(&self, hash: u64) -> bool { let block_index = self.hash_to_block_index(hash); - let block = &self.0[block_index]; - block_check(block, hash as u32) + self.0[block_index].check(hash as u32) } } @@ -316,23 +358,23 @@ mod tests { #[test] fn test_mask_set_quick_check() { for i in 0..1_000_000 { - let result = mask(i); - assert!(result.iter().all(|&x| x.count_ones() == 1)); + let result = Block::mask(i); + assert!(result.0.iter().all(|&x| x.count_ones() == 1)); } } #[test] fn test_block_insert_and_check() { for i in 0..1_000_000 { - let mut block = [0_u32; 8]; - block_insert(&mut block, i); - assert!(block_check(&block, i)); + let mut block = Block::ZERO; + block.insert(i); + assert!(block.check(i)); } } #[test] fn test_sbbf_insert_and_check() { - let mut sbbf = Sbbf(vec![[0_u32; 8]; 1_000]); + let mut sbbf = Sbbf(vec![Block::ZERO; 1_000]); for i in 0..1_000_000 { sbbf.insert(&i); assert!(sbbf.check(&i)); From 2749dcca50e6dd0ac72db7fe802552c2db742c3c Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 13 Dec 2022 12:12:17 -0800 Subject: [PATCH 0411/1411] Optimize bulk writing of all blocks of bloom filter (#3340) --- parquet/src/bloom_filter/mod.rs | 15 +++++++++------ parquet/src/file/writer.rs | 9 +++++++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index e6742aefc3cd..1a561bf16a7f 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -28,7 +28,7 @@ use crate::format::{ }; use bytes::{Buf, Bytes}; use std::hash::Hasher; -use std::io::{BufWriter, Write}; +use std::io::Write; use std::sync::Arc; use thrift::protocol::{ TCompactInputProtocol, TCompactOutputProtocol, TOutputProtocol, TSerializable, @@ -220,10 +220,10 @@ impl Sbbf { Self(data) } - /// Write the bloom filter data (header and then bitset) to the output - pub(crate) fn write(&self, writer: W) -> Result<(), ParquetError> { - // Use a BufWriter to avoid costs of writing individual blocks - let mut writer = BufWriter::new(writer); + /// Write the bloom filter data (header and then bitset) to the output. This doesn't + /// flush the writer in order to boost performance of bulk writing all blocks. Caller + /// must remember to flush the writer. 
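// Illustrative sketch (not from the patch): the calling pattern this change
// enables. The caller wraps the sink in one `BufWriter`, writes every filter
// through it, and flushes once at the end instead of once per filter; `filters`
// and `sink` are placeholder names.
use std::io::{BufWriter, Write};

fn sketch_write_all_filters(filters: &[Vec<u8>], sink: impl Write) -> std::io::Result<()> {
    let mut writer = BufWriter::new(sink);
    for filter in filters {
        // stand-in for `Sbbf::write(&mut writer)?`, which no longer flushes
        writer.write_all(filter)?;
    }
    writer.flush() // a single flush for the whole batch
}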
+ pub(crate) fn write(&self, mut writer: W) -> Result<(), ParquetError> { let mut protocol = TCompactOutputProtocol::new(&mut writer); let header = self.header(); header.write_to_out_protocol(&mut protocol).map_err(|e| { @@ -231,7 +231,6 @@ impl Sbbf { })?; protocol.flush()?; self.write_bitset(&mut writer)?; - writer.flush()?; Ok(()) } @@ -330,6 +329,10 @@ impl Sbbf { let block_index = self.hash_to_block_index(hash); self.0[block_index].check(hash as u32) } + + pub(crate) fn block_num(&self) -> usize { + self.0.len() + } } // per spec we use xxHash with seed=0 diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 3f1731687e2c..d92a42a6524e 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -21,6 +21,7 @@ use crate::bloom_filter::Sbbf; use crate::format as parquet; use crate::format::{ColumnIndex, OffsetIndex, RowGroup}; +use std::io::BufWriter; use std::{io::Write, sync::Arc}; use thrift::protocol::{TCompactOutputProtocol, TOutputProtocol, TSerializable}; @@ -225,23 +226,27 @@ impl SerializedFileWriter { // iter row group // iter each column // write bloom filter to the file + let mut start_offset = self.buf.bytes_written(); + let mut writer = BufWriter::new(&mut self.buf); + for (row_group_idx, row_group) in row_groups.iter_mut().enumerate() { for (column_idx, column_chunk) in row_group.columns.iter_mut().enumerate() { match &self.bloom_filters[row_group_idx][column_idx] { Some(bloom_filter) => { - let start_offset = self.buf.bytes_written(); - bloom_filter.write(&mut self.buf)?; + bloom_filter.write(&mut writer)?; // set offset and index for bloom filter column_chunk .meta_data .as_mut() .expect("can't have bloom filter without column metadata") .bloom_filter_offset = Some(start_offset as i64); + start_offset += bloom_filter.block_num() * 32; } None => {} } } } + writer.flush()?; Ok(()) } From a93859b07516b91511ffe3106a423b9af4b69f34 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Tue, 13 Dec 2022 16:07:20 -0500 Subject: [PATCH 0412/1411] add map array to pretty print (#3339) Co-authored-by: askoa --- arrow-cast/src/display.rs | 77 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 287065eb6950..10709994ddae 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -281,6 +281,29 @@ fn append_struct_field_string( Ok(()) } +fn append_map_field_string( + target: &mut String, + field_col: &Arc, + row: usize, +) -> Result<(), ArrowError> { + if field_col.is_null(row) { + target.push_str("null"); + } else { + match field_col.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + target.push('"'); + target.push_str(array_value_to_string(field_col, row)?.as_str()); + target.push('"'); + } + _ => { + target.push_str(array_value_to_string(field_col, row)?.as_str()); + } + } + } + + Ok(()) +} + /// Get the value at the given row in an array as a String. /// /// Note this function is quite inefficient and is unlikely to be @@ -430,6 +453,38 @@ pub fn array_value_to_string( Ok(s) } + DataType::Map(_, _) => { + let map_array = + column.as_any().downcast_ref::().ok_or_else(|| { + ArrowError::InvalidArgumentError( + "Repl error: could not convert column to map array.".to_string(), + ) + })?; + let map_entry = map_array.value(row); + let st = map_entry + .as_any() + .downcast_ref::() + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + "Repl error: could not convert map entry to struct array." 
+ .to_string(), + ) + })?; + let mut s = String::new(); + s.push('{'); + let entries_count = st.column(0).len(); + for i in 0..entries_count { + if i > 0 { + s.push_str(", "); + } + append_map_field_string(&mut s, st.column(0), i)?; + s.push_str(": "); + append_map_field_string(&mut s, st.column(1), i)?; + } + s.push('}'); + + Ok(s) + } DataType::Union(field_vec, type_ids, mode) => { union_to_string(column, row, field_vec, type_ids, mode) } @@ -527,6 +582,28 @@ pub fn lexical_to_string(n: N) -> String { mod tests { use super::*; + #[test] + fn test_map_arry_to_string() { + let keys = vec!["a", "b", "c", "d", "e", "f", "g", "h"]; + let values_data = UInt32Array::from(vec![0u32, 10, 20, 30, 40, 50, 60, 70]); + + // Construct a buffer for value offsets, for the nested array: + // [[a, b, c], [d, e, f], [g, h]] + let entry_offsets = [0, 3, 6, 8]; + + let map_array = MapArray::new_from_strings( + keys.clone().into_iter(), + &values_data, + &entry_offsets, + ) + .unwrap(); + let param = Arc::new(map_array) as ArrayRef; + assert_eq!( + "{\"d\": 30, \"e\": 40, \"f\": 50}", + array_value_to_string(¶m, 1).unwrap() + ); + } + #[test] fn test_array_value_to_string_duration() { let ns_array = From a0a5880665b1836890f6843b6b8772d81c463351 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Thu, 15 Dec 2022 05:21:03 -0500 Subject: [PATCH 0413/1411] feat: configure null value in arrow csv writer (#3342) * feat: arrow_csv writer null value configuration * Update PR comment Co-authored-by: Liang-Chi Hsieh Co-authored-by: askoa Co-authored-by: Liang-Chi Hsieh --- arrow-csv/src/writer.rs | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 674b333698bd..c5eed7f1e3e8 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -77,6 +77,7 @@ const DEFAULT_DATE_FORMAT: &str = "%F"; const DEFAULT_TIME_FORMAT: &str = "%T"; const DEFAULT_TIMESTAMP_FORMAT: &str = "%FT%H:%M:%S.%9f"; const DEFAULT_TIMESTAMP_TZ_FORMAT: &str = "%FT%H:%M:%S.%9f%:z"; +const DEFAULT_NULL_VALUE: &str = ""; fn write_primitive_value(array: &ArrayRef, i: usize) -> String where @@ -108,6 +109,8 @@ pub struct Writer { time_format: String, /// Is the beginning-of-writer beginning: bool, + /// The value to represent null entries + null_value: String, } impl Writer { @@ -125,6 +128,7 @@ impl Writer { timestamp_format: DEFAULT_TIMESTAMP_FORMAT.to_string(), timestamp_tz_format: DEFAULT_TIMESTAMP_TZ_FORMAT.to_string(), beginning: true, + null_value: DEFAULT_NULL_VALUE.to_string(), } } @@ -139,8 +143,8 @@ impl Writer { for (col_index, item) in buffer.iter_mut().enumerate() { let col = &batch[col_index]; if col.is_null(row_index) { - // write an empty value - *item = "".to_string(); + // write the configured null value + *item = self.null_value.clone(); continue; } let string = match col.data_type() { @@ -340,6 +344,8 @@ pub struct WriterBuilder { timestamp_tz_format: Option, /// Optional time format for time arrays time_format: Option, + /// Optional value to represent null + null_value: Option, } impl Default for WriterBuilder { @@ -352,6 +358,7 @@ impl Default for WriterBuilder { time_format: Some(DEFAULT_TIME_FORMAT.to_string()), timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), + null_value: Some(DEFAULT_NULL_VALUE.to_string()), } } } @@ -417,6 +424,12 @@ impl WriterBuilder { self } + /// Set the value to represent null in output + 
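// Illustrative sketch (not from the patch): typical use of the new option. A
// writer configured this way emits the literal `NULL` for null entries instead
// of an empty field; the crate paths assume the split arrow-csv / arrow-array
// crates used elsewhere in this series.
use arrow_array::RecordBatch;
use arrow_csv::WriterBuilder;
use arrow_schema::ArrowError;

fn sketch_csv_with_null(batch: &RecordBatch) -> Result<Vec<u8>, ArrowError> {
    let mut out = Vec::new();
    let mut writer = WriterBuilder::new()
        .has_headers(true)
        .with_null("NULL".to_string())
        .build(&mut out);
    writer.write(batch)?;
    drop(writer); // release the borrow on `out`
    Ok(out)
}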
pub fn with_null(mut self, null_value: String) -> Self { + self.null_value = Some(null_value); + self + } + /// Create a new `Writer` pub fn build(self, writer: W) -> Writer { let delimiter = self.delimiter.unwrap_or(b','); @@ -441,6 +454,9 @@ impl WriterBuilder { .timestamp_tz_format .unwrap_or_else(|| DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), beginning: true, + null_value: self + .null_value + .unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()), } } } @@ -570,6 +586,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo let builder = WriterBuilder::new() .has_headers(false) .with_delimiter(b'|') + .with_null("NULL".to_string()) .with_time_format("%r".to_string()); let mut writer = builder.build(&mut file); let batches = vec![&batch]; @@ -584,7 +601,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo file.read_to_end(&mut buffer).unwrap(); assert_eq!( - "Lorem ipsum dolor sit amet|123.564532|3|true|12:20:34 AM\nconsectetur adipiscing elit||2|false|06:51:20 AM\nsed do eiusmod tempor|-556132.25|1||11:46:03 PM\n" + "Lorem ipsum dolor sit amet|123.564532|3|true|12:20:34 AM\nconsectetur adipiscing elit|NULL|2|false|06:51:20 AM\nsed do eiusmod tempor|-556132.25|1|NULL|11:46:03 PM\n" .to_string(), String::from_utf8(buffer).unwrap() ); From 915115a9f38f82a96896b4c0a9fd4c461a08e866 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 15 Dec 2022 21:29:43 +0000 Subject: [PATCH 0414/1411] Update AWS SDK (#3349) --- object_store/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index a662a810f8b1..8973254c0914 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -53,8 +53,8 @@ ring = { version = "0.16", default-features = false, features = ["std"], optiona rustls-pemfile = { version = "1.0", default-features = false, optional = true } # AWS Profile support -aws-types = { version = "0.51", optional = true } -aws-config = { version = "0.51", optional = true } +aws-types = { version = "0.52", optional = true } +aws-config = { version = "0.52", optional = true } [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] From 3a48242e3880ecfa87b912324a87338661711856 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 15 Dec 2022 13:30:02 -0800 Subject: [PATCH 0415/1411] Add UnionArray test to arrow-pyarrow integration test (#3343) --- .../tests/test_sql.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 196dc7990309..c97dad77ea1d 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -358,6 +358,35 @@ def test_dictionary_python(): del a del b +def test_dense_union_python(): + """ + Python -> Rust -> Python + """ + xs = pa.array([5, 6, 7]) + ys = pa.array([False, True]) + types = pa.array([0, 1, 1, 0, 0], type=pa.int8()) + offsets = pa.array([0, 0, 1, 1, 2], type=pa.int32()) + a = pa.UnionArray.from_dense(types, offsets, [xs, ys]) + + b = rust.round_trip_array(a) + assert a == b + del a + del b + +def test_sparse_union_python(): + """ + Python -> Rust -> Python + """ + xs = pa.array([5, 6, 7]) + ys = pa.array([False, False, True]) + types = pa.array([0, 1, 1], type=pa.int8()) + a = 
pa.UnionArray.from_sparse(types, [xs, ys]) + + b = rust.round_trip_array(a) + assert a == b + del a + del b + def test_record_batch_reader(): """ Python -> Rust -> Python From 5ecb0e075c81ef497b1568d7d36210d56e1d691d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 15 Dec 2022 15:06:54 -0800 Subject: [PATCH 0416/1411] Fix clippy errors (#3352) --- arrow-array/src/builder/union_builder.rs | 2 +- arrow-array/src/types.rs | 2 +- arrow-cast/src/cast.rs | 17 +++++++------- arrow-data/src/data.rs | 4 ++-- arrow-ipc/src/convert.rs | 4 ++-- arrow/src/array/ffi.rs | 8 +++---- arrow/src/compute/kernels/temporal.rs | 20 +++++------------ arrow/src/util/data_gen.rs | 2 +- arrow/src/util/pretty.rs | 4 ++-- arrow/src/util/test_util.rs | 2 +- arrow/tests/array_validation.rs | 14 ++++++------ parquet/src/arrow/arrow_reader/mod.rs | 22 +++++++++---------- parquet/src/arrow/arrow_writer/byte_array.rs | 4 ++-- parquet/src/arrow/async_reader.rs | 2 +- parquet/src/bloom_filter/mod.rs | 2 +- parquet/src/encodings/decoding.rs | 2 +- .../src/encodings/encoding/dict_encoder.rs | 4 ++-- parquet/src/encodings/rle.rs | 2 +- parquet/src/schema/types.rs | 2 +- parquet/src/util/bit_util.rs | 8 +++---- 20 files changed, 59 insertions(+), 68 deletions(-) diff --git a/arrow-array/src/builder/union_builder.rs b/arrow-array/src/builder/union_builder.rs index def1e1eca063..28fb7e5d999a 100644 --- a/arrow-array/src/builder/union_builder.rs +++ b/arrow-array/src/builder/union_builder.rs @@ -296,7 +296,7 @@ impl UnionBuilder { let arr_data_ref = unsafe { arr_data_builder.build_unchecked() }; let array_ref = make_array(arr_data_ref); - children.push((type_id, (Field::new(&name, data_type, false), array_ref))) + children.push((type_id, (Field::new(name, data_type, false), array_ref))) } children.sort_by(|a, b| { diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 0646a7f29daf..e36f850f2e14 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -483,7 +483,7 @@ impl Date64Type { /// * `i` - The Date64Type to convert pub fn to_naive_date(i: ::Native) -> NaiveDate { let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); - epoch.add(Duration::milliseconds(i as i64)) + epoch.add(Duration::milliseconds(i)) } /// Converts a chrono::NaiveDate into an arrow Date64Type diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 8bd71245ca7e..f3dbdb8e06e8 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -798,7 +798,7 @@ pub fn cast_with_options( } Float64 => { cast_decimal_to_float::(array, |x| { - (x as f64 / 10_f64.powi(*scale as i32)) as f64 + x as f64 / 10_f64.powi(*scale as i32) }) } Null => Ok(new_null_array(to_type, array.len())), @@ -866,7 +866,7 @@ pub fn cast_with_options( } Float64 => { cast_decimal_to_float::(array, |x| { - (x.to_f64().unwrap() / 10_f64.powi(*scale as i32)) as f64 + x.to_f64().unwrap() / 10_f64.powi(*scale as i32) }) } Null => Ok(new_null_array(to_type, array.len())), @@ -5946,8 +5946,7 @@ mod tests { #[test] fn test_cast_from_uint32() { - let u32_values: Vec = - vec![0, u8::MAX as u32, u16::MAX as u32, u32::MAX as u32]; + let u32_values: Vec = vec![0, u8::MAX as u32, u16::MAX as u32, u32::MAX]; let u32_array: ArrayRef = Arc::new(UInt32Array::from(u32_values)); let f64_expected = vec!["0.0", "255.0", "65535.0", "4294967295.0"]; @@ -6013,7 +6012,7 @@ mod tests { #[test] fn test_cast_from_uint16() { - let u16_values: Vec = vec![0, u8::MAX as u16, u16::MAX as u16]; + let u16_values: Vec = vec![0, u8::MAX as u16, u16::MAX]; let u16_array: 
ArrayRef = Arc::new(UInt16Array::from(u16_values)); let f64_expected = vec!["0.0", "255.0", "65535.0"]; @@ -6301,13 +6300,13 @@ mod tests { #[test] fn test_cast_from_int32() { let i32_values: Vec = vec![ - i32::MIN as i32, + i32::MIN, i16::MIN as i32, i8::MIN as i32, 0, i8::MAX as i32, i16::MAX as i32, - i32::MAX as i32, + i32::MAX, ]; let i32_array: ArrayRef = Arc::new(Int32Array::from(i32_values)); @@ -6463,13 +6462,13 @@ mod tests { #[test] fn test_cast_from_date32() { let i32_values: Vec = vec![ - i32::MIN as i32, + i32::MIN, i16::MIN as i32, i8::MIN as i32, 0, i8::MAX as i32, i16::MAX as i32, - i32::MAX as i32, + i32::MAX, ]; let date32_array: ArrayRef = Arc::new(Date32Array::from(i32_values)); diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index b38321aacf4c..918ecae847a9 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -1657,12 +1657,12 @@ mod tests { /// returns a buffer initialized with some constant value for tests fn make_i32_buffer(n: usize) -> Buffer { - Buffer::from_slice_ref(&vec![42i32; n]) + Buffer::from_slice_ref(vec![42i32; n]) } /// returns a buffer initialized with some constant value for tests fn make_f32_buffer(n: usize) -> Buffer { - Buffer::from_slice_ref(&vec![42f32; n]) + Buffer::from_slice_ref(vec![42f32; n]) } #[test] diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index e5522303df52..a60a19b866cb 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -573,7 +573,7 @@ pub(crate) fn get_fb_field_type<'a>( }, FixedSizeBinary(len) => { let mut builder = crate::FixedSizeBinaryBuilder::new(fbb); - builder.add_byteWidth(*len as i32); + builder.add_byteWidth(*len); FBFieldType { type_type: crate::Type::FixedSizeBinary, type_: builder.finish().as_union_value(), @@ -692,7 +692,7 @@ pub(crate) fn get_fb_field_type<'a>( FixedSizeList(ref list_type, len) => { let child = build_field(fbb, list_type); let mut builder = crate::FixedSizeListBuilder::new(fbb); - builder.add_listSize(*len as i32); + builder.add_listSize(*len); FBFieldType { type_type: crate::Type::FixedSizeList, type_: builder.finish().as_union_value(), diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index a18f408a4566..fb7771ac620e 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -224,7 +224,7 @@ mod tests { let v: Vec = (0..9).into_iter().collect(); let value_data = ArrayData::builder(DataType::Int64) .len(9) - .add_buffer(Buffer::from_slice_ref(&v)) + .add_buffer(Buffer::from_slice_ref(v)) .build()?; let list_data_type = DataType::FixedSizeList(Box::new(Field::new("f", DataType::Int64, false)), 3); @@ -249,7 +249,7 @@ mod tests { let v: Vec = (0..16).into_iter().collect(); let value_data = ArrayData::builder(DataType::Int16) .len(16) - .add_buffer(Buffer::from_slice_ref(&v)) + .add_buffer(Buffer::from_slice_ref(v)) .build()?; let list_data_type = DataType::FixedSizeList(Box::new(Field::new("f", DataType::Int16, false)), 2); @@ -269,11 +269,11 @@ mod tests { let v: Vec = (0..16).into_iter().collect(); let value_data = ArrayData::builder(DataType::Int32) .len(16) - .add_buffer(Buffer::from_slice_ref(&v)) + .add_buffer(Buffer::from_slice_ref(v)) .build()?; let offsets: Vec = vec![0, 2, 4, 6, 8, 10, 12, 14, 16]; - let value_offsets = Buffer::from_slice_ref(&offsets); + let value_offsets = Buffer::from_slice_ref(offsets); let inner_list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); let inner_list_data = ArrayData::builder(inner_list_data_type.clone()) diff --git 
a/arrow/src/compute/kernels/temporal.rs b/arrow/src/compute/kernels/temporal.rs index cea0a6afcd75..15d56f70308f 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow/src/compute/kernels/temporal.rs @@ -241,7 +241,7 @@ where /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn year_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "year", |t| t.year() as i32) + time_fraction_dyn(array, "year", |t| t.year()) } /// Extracts the years of a given temporal primitive array as an array of integers @@ -250,7 +250,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "year", |t| t.year() as i32) + time_fraction_internal(array, "year", |t| t.year()) } /// Extracts the quarter of a given temporal array as an array of integersa within @@ -297,9 +297,7 @@ where /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn num_days_from_monday_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "num_days_from_monday", |t| { - t.num_days_from_monday() as i32 - }) + time_fraction_dyn(array, "num_days_from_monday", |t| t.num_days_from_monday()) } /// Extracts the day of week of a given temporal primitive array as an array of @@ -313,9 +311,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "num_days_from_monday", |t| { - t.num_days_from_monday() as i32 - }) + time_fraction_internal(array, "num_days_from_monday", |t| t.num_days_from_monday()) } /// Extracts the day of week of a given temporal array as an array of @@ -328,9 +324,7 @@ where /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn num_days_from_sunday_dyn(array: &dyn Array) -> Result { - time_fraction_dyn(array, "num_days_from_sunday", |t| { - t.num_days_from_sunday() as i32 - }) + time_fraction_dyn(array, "num_days_from_sunday", |t| t.num_days_from_sunday()) } /// Extracts the day of week of a given temporal primitive array as an array of @@ -344,9 +338,7 @@ where T: ArrowTemporalType + ArrowNumericType, i64: From, { - time_fraction_internal(array, "num_days_from_sunday", |t| { - t.num_days_from_sunday() as i32 - }) + time_fraction_internal(array, "num_days_from_sunday", |t| t.num_days_from_sunday()) } /// Extracts the day of a given temporal array as an array of integers. diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 5dda410f0087..01f4ef5c7829 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -194,7 +194,7 @@ fn create_random_list_array( // Create list's child data let child_array = - create_random_array(list_field, child_len as usize, null_density, true_density)?; + create_random_array(list_field, child_len, null_density, true_density)?; let child_data = child_array.data(); // Create list's null buffers, if it is nullable let null_buffer = match field.is_nullable() { diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs index 7e8378d15339..859053352384 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow/src/util/pretty.rs @@ -73,7 +73,7 @@ fn create_table(results: &[RecordBatch]) -> Result
{ let mut cells = Vec::new(); for col in 0..batch.num_columns() { let column = batch.column(col); - cells.push(Cell::new(&array_value_to_string(column, row)?)); + cells.push(Cell::new(array_value_to_string(column, row)?)); } table.add_row(cells); } @@ -95,7 +95,7 @@ fn create_column(field: &str, columns: &[ArrayRef]) -> Result
{ for col in columns { for row in 0..col.len() { - let cells = vec![Cell::new(&array_value_to_string(col, row)?)]; + let cells = vec![Cell::new(array_value_to_string(col, row)?)]; table.add_row(cells); } } diff --git a/arrow/src/util/test_util.rs b/arrow/src/util/test_util.rs index 836bda6f98ca..83107aa79239 100644 --- a/arrow/src/util/test_util.rs +++ b/arrow/src/util/test_util.rs @@ -196,7 +196,7 @@ impl Iterator for BadIterator { /// report whatever the iterator says to fn size_hint(&self) -> (usize, Option) { - (0, Some(self.claimed as usize)) + (0, Some(self.claimed)) } } diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 64c433a6616a..3cdec46b59a0 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -427,7 +427,7 @@ fn check_utf8_validation(data_type: DataType) { .map(|&v| T::from_usize(v).unwrap()) .collect(); - let offsets_buffer = Buffer::from_slice_ref(&offsets); + let offsets_buffer = Buffer::from_slice_ref(offsets); ArrayData::try_new( data_type, 2, @@ -459,7 +459,7 @@ fn check_utf8_char_boundary(data_type: DataType) { .map(|&v| T::from_usize(v).unwrap()) .collect(); - let offsets_buffer = Buffer::from_slice_ref(&offsets); + let offsets_buffer = Buffer::from_slice_ref(offsets); ArrayData::try_new( data_type, 2, @@ -492,7 +492,7 @@ fn check_index_out_of_bounds_validation(data_type: DataType) .map(|&v| T::from_usize(v).unwrap()) .collect(); - let offsets_buffer = Buffer::from_slice_ref(&offsets); + let offsets_buffer = Buffer::from_slice_ref(offsets); ArrayData::try_new( data_type, 4, @@ -545,7 +545,7 @@ fn check_index_backwards_validation(data_type: DataType) { .map(|&v| T::from_usize(v).unwrap()) .collect(); - let offsets_buffer = Buffer::from_slice_ref(&offsets); + let offsets_buffer = Buffer::from_slice_ref(offsets); ArrayData::try_new( data_type, 4, @@ -697,7 +697,7 @@ fn check_list_offsets(data_type: DataType) { .iter() .map(|&v| T::from_usize(v).unwrap()) .collect(); - let offsets_buffer = Buffer::from_slice_ref(&offsets); + let offsets_buffer = Buffer::from_slice_ref(offsets); ArrayData::try_new( data_type, @@ -740,7 +740,7 @@ fn test_validate_list_negative_offsets() { // -1 is an invalid offset any way you look at it let offsets: Vec = vec![0, 2, -1, 4]; - let offsets_buffer = Buffer::from_slice_ref(&offsets); + let offsets_buffer = Buffer::from_slice_ref(offsets); ArrayData::try_new( data_type, @@ -755,7 +755,7 @@ fn test_validate_list_negative_offsets() { /// returns a buffer initialized with some constant value for tests fn make_i32_buffer(n: usize) -> Buffer { - Buffer::from_slice_ref(&vec![42i32; n]) + Buffer::from_slice_ref(vec![42i32; n]) } #[test] diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index e89ddaffe833..df38e554f9db 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1162,7 +1162,7 @@ mod tests { ]; for (prefix, target_precision) in file_variants { let path = format!("{}/{}_decimal.parquet", testdata, prefix); - let file = File::open(&path).unwrap(); + let file = File::open(path).unwrap(); let mut record_reader = ParquetRecordBatchReader::try_new(file, 32).unwrap(); let batch = record_reader.next().unwrap().unwrap(); @@ -1777,7 +1777,7 @@ mod tests { fn test_read_maps() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/nested_maps.snappy.parquet", testdata); - let file = File::open(&path).unwrap(); + let file = File::open(path).unwrap(); let record_batch_reader = 
ParquetRecordBatchReader::try_new(file, 60).unwrap(); for batch in record_batch_reader { @@ -1969,7 +1969,7 @@ mod tests { fn test_read_null_list() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/null_list.parquet", testdata); - let file = File::open(&path).unwrap(); + let file = File::open(path).unwrap(); let mut record_batch_reader = ParquetRecordBatchReader::try_new(file, 60).unwrap(); @@ -1994,7 +1994,7 @@ mod tests { fn test_null_schema_inference() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/null_list.parquet", testdata); - let file = File::open(&path).unwrap(); + let file = File::open(path).unwrap(); let arrow_field = Field::new( "emptylist", @@ -2085,7 +2085,7 @@ mod tests { fn test_empty_projection() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/alltypes_plain.parquet", testdata); - let file = File::open(&path).unwrap(); + let file = File::open(path).unwrap(); let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); let file_metadata = builder.metadata().file_metadata(); @@ -2260,7 +2260,7 @@ mod tests { let test_file = File::open(&path).unwrap(); let mut serial_reader = - ParquetRecordBatchReader::try_new(File::open(path).unwrap(), 7300).unwrap(); + ParquetRecordBatchReader::try_new(File::open(&path).unwrap(), 7300).unwrap(); let data = serial_reader.next().unwrap().unwrap(); let do_test = |batch_size: usize, selection_len: usize| { @@ -2316,7 +2316,7 @@ mod tests { let testdata = arrow::util::test_util::parquet_test_data(); // `alltypes_plain.parquet` only have 8 rows let path = format!("{}/alltypes_plain.parquet", testdata); - let test_file = File::open(&path).unwrap(); + let test_file = File::open(path).unwrap(); let builder = ParquetRecordBatchReaderBuilder::try_new(test_file).unwrap(); let num_rows = builder.metadata.file_metadata().num_rows(); @@ -2395,7 +2395,7 @@ mod tests { fn test_read_lz4_raw() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/lz4_raw_compressed.parquet", testdata); - let file = File::open(&path).unwrap(); + let file = File::open(path).unwrap(); let batches = ParquetRecordBatchReader::try_new(file, 1024) .unwrap() @@ -2439,7 +2439,7 @@ mod tests { ] { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/{}", testdata, file); - let file = File::open(&path).unwrap(); + let file = File::open(path).unwrap(); let expected_rows = 4; let batches = ParquetRecordBatchReader::try_new(file, expected_rows) @@ -2471,7 +2471,7 @@ mod tests { fn test_read_lz4_hadoop_large() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/hadoop_lz4_compressed_larger.parquet", testdata); - let file = File::open(&path).unwrap(); + let file = File::open(path).unwrap(); let expected_rows = 10000; let batches = ParquetRecordBatchReader::try_new(file, expected_rows) @@ -2497,7 +2497,7 @@ mod tests { fn test_read_nested_lists() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/nested_lists.snappy.parquet", testdata); - let file = File::open(&path).unwrap(); + let file = File::open(path).unwrap(); let f = file.try_clone().unwrap(); let mut reader = ParquetRecordBatchReader::try_new(f, 60).unwrap(); diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index c3a9f83d15f3..4b9d91334a7f 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ 
b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -405,11 +405,11 @@ impl DictEncoder { let num_values = self.indices.len(); let buffer_len = self.estimated_data_page_size(); let mut buffer = Vec::with_capacity(buffer_len); - buffer.push(self.bit_width() as u8); + buffer.push(self.bit_width()); let mut encoder = RleEncoder::new_from_buf(self.bit_width(), buffer); for index in &self.indices { - encoder.put(*index as u64) + encoder.put(*index) } self.indices.clear(); diff --git a/parquet/src/arrow/async_reader.rs b/parquet/src/arrow/async_reader.rs index 7602d54a5107..4285a1c17ca9 100644 --- a/parquet/src/arrow/async_reader.rs +++ b/parquet/src/arrow/async_reader.rs @@ -1360,6 +1360,6 @@ mod tests { .build() .unwrap(); assert_ne!(1024, file_rows); - assert_eq!(stream.batch_size, file_rows as usize); + assert_eq!(stream.batch_size, file_rows); } } diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index 1a561bf16a7f..a6620fc144ab 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -141,7 +141,7 @@ fn chunk_read_bloom_filter_header_and_offset( offset: u64, reader: Arc, ) -> Result<(BloomFilterHeader, u64), ParquetError> { - let buffer = reader.get_bytes(offset as u64, SBBF_HEADER_SIZE_ESTIMATE)?; + let buffer = reader.get_bytes(offset, SBBF_HEADER_SIZE_ESTIMATE)?; let (header, length) = read_bloom_filter_header_and_length(buffer)?; Ok((header, offset + length)) } diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index bbc119c361d8..7e3058ba7b3f 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -557,7 +557,7 @@ where self.mini_block_bit_widths.clear(); self.bit_reader.get_aligned_bytes( &mut self.mini_block_bit_widths, - self.mini_blocks_per_block as usize, + self.mini_blocks_per_block, ); let mut offset = self.bit_reader.get_byte_offset(); diff --git a/parquet/src/encodings/encoding/dict_encoder.rs b/parquet/src/encodings/encoding/dict_encoder.rs index 1b516452083c..4f4a6ab4f55a 100644 --- a/parquet/src/encodings/encoding/dict_encoder.rs +++ b/parquet/src/encodings/encoding/dict_encoder.rs @@ -123,12 +123,12 @@ impl DictEncoder { pub fn write_indices(&mut self) -> Result { let buffer_len = self.estimated_data_encoded_size(); let mut buffer = Vec::with_capacity(buffer_len); - buffer.push(self.bit_width() as u8); + buffer.push(self.bit_width()); // Write bit width in the first byte let mut encoder = RleEncoder::new_from_buf(self.bit_width(), buffer); for index in &self.indices { - encoder.put(*index as u64) + encoder.put(*index) } self.indices.clear(); Ok(ByteBufferPtr::new(encoder.consume())) diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 25c3c81a72dc..77b76d0e7e53 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -823,7 +823,7 @@ mod tests { values.push(i % 2); } let num_groups = bit_util::ceil(100, 8) as u8; - expected_buffer.push(((num_groups << 1) as u8) | 1); + expected_buffer.push((num_groups << 1) | 1); expected_buffer.resize(expected_buffer.len() + 100 / 8, 0b10101010); // For the last 4 0 and 1's, padded with 0. 
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 9f8023c91262..4501e7e31c1d 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -1103,7 +1103,7 @@ fn from_thrift_helper( let mut fields = vec![]; let mut next_index = index + 1; for _ in 0..n { - let child_result = from_thrift_helper(elements, next_index as usize)?; + let child_result = from_thrift_helper(elements, next_index)?; next_index = child_result.0; fields.push(child_result.1); } diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index 68b2f2b2550d..cfbd521e9a7e 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -918,12 +918,12 @@ mod tests { fn test_put_value_rand_numbers(total: usize, num_bits: usize) { assert!(num_bits < 64); let num_bytes = ceil(num_bits, 8); - let mut writer = BitWriter::new(num_bytes as usize * total); + let mut writer = BitWriter::new(num_bytes * total); let values: Vec = random_numbers::(total) .iter() .map(|v| v & ((1 << num_bits) - 1)) .collect(); - (0..total).for_each(|i| writer.put_value(values[i] as u64, num_bits)); + (0..total).for_each(|i| writer.put_value(values[i], num_bits)); let mut reader = BitReader::from(writer.consume()); (0..total).for_each(|i| { @@ -959,7 +959,7 @@ mod tests { { assert!(num_bits <= 64); let num_bytes = ceil(num_bits, 8); - let mut writer = BitWriter::new(num_bytes as usize * total); + let mut writer = BitWriter::new(num_bytes * total); let mask = match num_bits { 64 => u64::MAX, @@ -975,7 +975,7 @@ mod tests { let expected_values: Vec = values.iter().map(|v| from_ne_slice(v.as_bytes())).collect(); - (0..total).for_each(|i| writer.put_value(values[i] as u64, num_bits)); + (0..total).for_each(|i| writer.put_value(values[i], num_bits)); let buf = writer.consume(); let mut reader = BitReader::from(buf); From 6d5f02439de9af7a944e010bf1bd8a65955515c2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 16 Dec 2022 13:54:24 +0000 Subject: [PATCH 0417/1411] Upstream newline_delimited_stream and ChunkedStore from DataFusion (#3341) * Upstream newline_delimited_stream and ChunkedStore from DataFusion * Clippy --- object_store/src/chunked.rs | 247 +++++++++++++++++++++++++++++++ object_store/src/delimited.rs | 270 ++++++++++++++++++++++++++++++++++ object_store/src/lib.rs | 3 + 3 files changed, 520 insertions(+) create mode 100644 object_store/src/chunked.rs create mode 100644 object_store/src/delimited.rs diff --git a/object_store/src/chunked.rs b/object_store/src/chunked.rs new file mode 100644 index 000000000000..76865ef96701 --- /dev/null +++ b/object_store/src/chunked.rs @@ -0,0 +1,247 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
A [`ChunkedStore`] that can be used to test streaming behaviour + +use std::fmt::{Debug, Display, Formatter}; +use std::io::{BufReader, Read}; +use std::ops::Range; +use std::sync::Arc; + +use async_trait::async_trait; +use bytes::{BufMut, Bytes, BytesMut}; +use futures::stream::BoxStream; +use futures::StreamExt; +use tokio::io::AsyncWrite; + +use crate::path::Path; +use crate::util::maybe_spawn_blocking; +use crate::{GetResult, ListResult, ObjectMeta, ObjectStore}; +use crate::{MultipartId, Result}; + +/// Wraps a [`ObjectStore`] and makes its get response return chunks +/// in a controllable manner. +/// +/// A `ChunkedStore` makes the memory consumption and performance of +/// the wrapped [`ObjectStore`] worse. It is intended for use within +/// tests, to control the chunks in the produced output streams. For +/// example, it is used to verify the delimiting logic in +/// newline_delimited_stream. +#[derive(Debug)] +pub struct ChunkedStore { + inner: Arc, + chunk_size: usize, +} + +impl ChunkedStore { + /// Creates a new [`ChunkedStore`] with the specified chunk_size + pub fn new(inner: Arc, chunk_size: usize) -> Self { + Self { inner, chunk_size } + } +} + +impl Display for ChunkedStore { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "ChunkedStore({})", self.inner) + } +} + +#[async_trait] +impl ObjectStore for ChunkedStore { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.inner.put(location, bytes).await + } + + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + self.inner.put_multipart(location).await + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + self.inner.abort_multipart(location, multipart_id).await + } + + async fn get(&self, location: &Path) -> Result { + match self.inner.get(location).await? { + GetResult::File(std_file, ..) 
=> { + let reader = BufReader::new(std_file); + let chunk_size = self.chunk_size; + Ok(GetResult::Stream( + futures::stream::try_unfold(reader, move |mut reader| async move { + let (r, out, reader) = maybe_spawn_blocking(move || { + let mut out = Vec::with_capacity(chunk_size); + let r = (&mut reader) + .take(chunk_size as u64) + .read_to_end(&mut out) + .map_err(|err| crate::Error::Generic { + store: "ChunkedStore", + source: Box::new(err), + })?; + Ok((r, out, reader)) + }) + .await?; + + match r { + 0 => Ok(None), + _ => Ok(Some((out.into(), reader))), + } + }) + .boxed(), + )) + } + GetResult::Stream(stream) => { + let buffer = BytesMut::new(); + Ok(GetResult::Stream( + futures::stream::unfold( + (stream, buffer, false, self.chunk_size), + |(mut stream, mut buffer, mut exhausted, chunk_size)| async move { + // Keep accumulating bytes until we reach capacity as long as + // the stream can provide them: + if exhausted { + return None; + } + while buffer.len() < chunk_size { + match stream.next().await { + None => { + exhausted = true; + let slice = buffer.split_off(0).freeze(); + return Some(( + Ok(slice), + (stream, buffer, exhausted, chunk_size), + )); + } + Some(Ok(bytes)) => { + buffer.put(bytes); + } + Some(Err(e)) => { + return Some(( + Err(crate::Error::Generic { + store: "ChunkedStore", + source: Box::new(e), + }), + (stream, buffer, exhausted, chunk_size), + )) + } + }; + } + // Return the chunked values as the next value in the stream + let slice = buffer.split_to(chunk_size).freeze(); + Some((Ok(slice), (stream, buffer, exhausted, chunk_size))) + }, + ) + .boxed(), + )) + } + } + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + self.inner.get_range(location, range).await + } + + async fn head(&self, location: &Path) -> Result { + self.inner.head(location).await + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.inner.delete(location).await + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + self.inner.list(prefix).await + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + self.inner.list_with_delimiter(prefix).await + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.inner.copy(from, to).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.inner.copy_if_not_exists(from, to).await + } +} + +#[cfg(test)] +mod tests { + use futures::StreamExt; + + use crate::local::LocalFileSystem; + use crate::memory::InMemory; + use crate::path::Path; + use crate::tests::*; + + use super::*; + + #[tokio::test] + async fn test_chunked_basic() { + let location = Path::parse("test").unwrap(); + let store: Arc = Arc::new(InMemory::new()); + store + .put(&location, Bytes::from(vec![0; 1001])) + .await + .unwrap(); + + for chunk_size in [10, 20, 31] { + let store = ChunkedStore::new(Arc::clone(&store), chunk_size); + let mut s = match store.get(&location).await.unwrap() { + GetResult::Stream(s) => s, + _ => unreachable!(), + }; + + let mut remaining = 1001; + while let Some(next) = s.next().await { + let size = next.unwrap().len(); + let expected = remaining.min(chunk_size); + assert_eq!(size, expected); + remaining -= expected; + } + assert_eq!(remaining, 0); + } + } + + #[tokio::test] + async fn test_chunked() { + let temporary = tempfile::tempdir().unwrap(); + let integrations: &[Arc] = &[ + Arc::new(InMemory::new()), + Arc::new(LocalFileSystem::new_with_prefix(temporary.path()).unwrap()), + ]; + + for integration in integrations { 
+ let integration = ChunkedStore::new(Arc::clone(integration), 100); + + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; + } + } +} diff --git a/object_store/src/delimited.rs b/object_store/src/delimited.rs new file mode 100644 index 000000000000..13214865117a --- /dev/null +++ b/object_store/src/delimited.rs @@ -0,0 +1,270 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utility for streaming newline delimited files from object storage + +use std::collections::VecDeque; + +use bytes::Bytes; +use futures::{Stream, StreamExt}; +use snafu::{ensure, Snafu}; + +use super::Result; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("encountered unterminated string"))] + UnterminatedString, + + #[snafu(display("encountered trailing escape character"))] + TrailingEscape, +} + +impl From for super::Error { + fn from(err: Error) -> Self { + Self::Generic { + store: "LineDelimiter", + source: Box::new(err), + } + } +} + +/// The ASCII encoding of `"` +const QUOTE: u8 = b'"'; + +/// The ASCII encoding of `\n` +const NEWLINE: u8 = b'\n'; + +/// The ASCII encoding of `\` +const ESCAPE: u8 = b'\\'; + +/// [`LineDelimiter`] is provided with a stream of [`Bytes`] and returns an iterator +/// of [`Bytes`] containing a whole number of new line delimited records +#[derive(Debug, Default)] +struct LineDelimiter { + /// Complete chunks of [`Bytes`] + complete: VecDeque, + /// Remainder bytes that form the next record + remainder: Vec, + /// True if the last character was the escape character + is_escape: bool, + /// True if currently processing a quoted string + is_quote: bool, +} + +impl LineDelimiter { + /// Creates a new [`LineDelimiter`] with the provided delimiter + fn new() -> Self { + Self::default() + } + + /// Adds the next set of [`Bytes`] + fn push(&mut self, val: impl Into) { + let val: Bytes = val.into(); + + let is_escape = &mut self.is_escape; + let is_quote = &mut self.is_quote; + let mut record_ends = val.iter().enumerate().filter_map(|(idx, v)| { + if *is_escape { + *is_escape = false; + None + } else if *v == ESCAPE { + *is_escape = true; + None + } else if *v == QUOTE { + *is_quote = !*is_quote; + None + } else if *is_quote { + None + } else { + (*v == NEWLINE).then_some(idx + 1) + } + }); + + let start_offset = match self.remainder.is_empty() { + true => 0, + false => match record_ends.next() { + Some(idx) => { + self.remainder.extend_from_slice(&val[0..idx]); + self.complete + .push_back(Bytes::from(std::mem::take(&mut self.remainder))); + idx + } + None => { + self.remainder.extend_from_slice(&val); + return; + } + }, + 
}; + let end_offset = record_ends.last().unwrap_or(start_offset); + if start_offset != end_offset { + self.complete.push_back(val.slice(start_offset..end_offset)); + } + + if end_offset != val.len() { + self.remainder.extend_from_slice(&val[end_offset..]) + } + } + + /// Marks the end of the stream, delimiting any remaining bytes + /// + /// Returns `true` if there is no remaining data to be read + fn finish(&mut self) -> Result { + if !self.remainder.is_empty() { + ensure!(!self.is_quote, UnterminatedStringSnafu); + ensure!(!self.is_quote, TrailingEscapeSnafu); + + self.complete + .push_back(Bytes::from(std::mem::take(&mut self.remainder))) + } + Ok(self.complete.is_empty()) + } +} + +impl Iterator for LineDelimiter { + type Item = Bytes; + + fn next(&mut self) -> Option { + self.complete.pop_front() + } +} + +/// Given a [`Stream`] of [`Bytes`] returns a [`Stream`] where each +/// yielded [`Bytes`] contains a whole number of new line delimited records +/// accounting for `\` style escapes and `"` quotes +pub fn newline_delimited_stream(s: S) -> impl Stream> +where + S: Stream> + Unpin, +{ + let delimiter = LineDelimiter::new(); + + futures::stream::unfold( + (s, delimiter, false), + |(mut s, mut delimiter, mut exhausted)| async move { + loop { + if let Some(next) = delimiter.next() { + return Some((Ok(next), (s, delimiter, exhausted))); + } else if exhausted { + return None; + } + + match s.next().await { + Some(Ok(bytes)) => delimiter.push(bytes), + Some(Err(e)) => return Some((Err(e), (s, delimiter, exhausted))), + None => { + exhausted = true; + match delimiter.finish() { + Ok(true) => return None, + Ok(false) => continue, + Err(e) => return Some((Err(e), (s, delimiter, exhausted))), + } + } + } + } + }, + ) +} + +#[cfg(test)] +mod tests { + use futures::stream::{BoxStream, TryStreamExt}; + + use super::*; + + #[test] + fn test_delimiter() { + let mut delimiter = LineDelimiter::new(); + delimiter.push("hello\nworld"); + delimiter.push("\n\n"); + + assert_eq!(delimiter.next().unwrap(), Bytes::from("hello\n")); + assert_eq!(delimiter.next().unwrap(), Bytes::from("world\n")); + assert_eq!(delimiter.next().unwrap(), Bytes::from("\n")); + assert!(delimiter.next().is_none()); + } + + #[test] + fn test_delimiter_escaped() { + let mut delimiter = LineDelimiter::new(); + delimiter.push(""); + delimiter.push("fo\\\n\"foo"); + delimiter.push("bo\n\"bar\n"); + delimiter.push("\"he"); + delimiter.push("llo\"\n"); + assert_eq!( + delimiter.next().unwrap(), + Bytes::from("fo\\\n\"foobo\n\"bar\n") + ); + assert_eq!(delimiter.next().unwrap(), Bytes::from("\"hello\"\n")); + assert!(delimiter.next().is_none()); + + // Verify can push further data + delimiter.push("\"foo\nbar\",\"fiz\\\"inner\\\"\"\nhello"); + assert!(!delimiter.finish().unwrap()); + + assert_eq!( + delimiter.next().unwrap(), + Bytes::from("\"foo\nbar\",\"fiz\\\"inner\\\"\"\n") + ); + assert_eq!(delimiter.next().unwrap(), Bytes::from("hello")); + assert!(delimiter.finish().unwrap()); + assert!(delimiter.next().is_none()); + } + + #[tokio::test] + async fn test_delimiter_stream() { + let input = vec!["hello\nworld\nbin", "go\ncup", "cakes"]; + let input_stream = + futures::stream::iter(input.into_iter().map(|s| Ok(Bytes::from(s)))); + let stream = newline_delimited_stream(input_stream); + + let results: Vec<_> = stream.try_collect().await.unwrap(); + assert_eq!( + results, + vec![ + Bytes::from("hello\nworld\n"), + Bytes::from("bingo\n"), + Bytes::from("cupcakes") + ] + ) + } + #[tokio::test] + async fn test_delimiter_unfold_stream() { + 
let input_stream: BoxStream<'static, Result> = futures::stream::unfold( + VecDeque::from(["hello\nworld\nbin", "go\ncup", "cakes"]), + |mut input| async move { + if !input.is_empty() { + Some((Ok(Bytes::from(input.pop_front().unwrap())), input)) + } else { + None + } + }, + ) + .boxed(); + let stream = newline_delimited_stream(input_stream); + + let results: Vec<_> = stream.try_collect().await.unwrap(); + assert_eq!( + results, + vec![ + Bytes::from("hello\nworld\n"), + Bytes::from("bingo\n"), + Bytes::from("cupcakes") + ] + ) + } +} diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 0cd56612ee45..85e8737b7726 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -163,6 +163,9 @@ compile_error!("Features 'gcp', 'aws', 'azure' are not supported on wasm."); pub mod aws; #[cfg(feature = "azure")] pub mod azure; +#[cfg(not(target_arch = "wasm32"))] +pub mod chunked; +pub mod delimited; #[cfg(feature = "gcp")] pub mod gcp; pub mod limit; From 309cf5cd299dbf91235b60876e092c1af3990b84 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 16 Dec 2022 13:54:37 +0000 Subject: [PATCH 0418/1411] Use ArrayData::ptr_eq in DictionaryTracker (#3354) * Use ArrayData::ptr_eq in DictionaryTracker * Fallback to logical comparison if error_on_replacement --- arrow-ipc/src/writer.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index c407cd12c239..006660b6a0e1 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -537,10 +537,16 @@ impl DictionaryTracker { // If a dictionary with this id was already emitted, check if it was the same. if let Some(last) = self.written.get(&dict_id) { - if last.data().child_data()[0] == *dict_values { + if ArrayData::ptr_eq(&last.data().child_data()[0], dict_values) { // Same dictionary values => no need to emit it again return Ok(false); - } else if self.error_on_replacement { + } + if self.error_on_replacement { + // If error on replacement perform a logical comparison + if last.data().child_data()[0] == *dict_values { + // Same dictionary values => no need to emit it again + return Ok(false); + } return Err(ArrowError::InvalidArgumentError( "Dictionary replacement detected when writing IPC file format. 
\ Arrow IPC files only support a single dictionary for a given field \ From 3039633d57b78d20fc4d9ae9165c0d0777e81bb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Fri, 16 Dec 2022 14:55:19 +0100 Subject: [PATCH 0419/1411] Deprecate flight_data_from_arrow_batch (#3353) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Deprecate flight_data_from_arrow_batch * Add docs * Re-implement * Del * fmt Co-authored-by: Daniël Heres --- arrow-flight/src/utils.rs | 17 +++++++++++++---- arrow-ipc/src/writer.rs | 3 +++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 392d41c83ce8..855b333853bf 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -29,6 +29,10 @@ use arrow_schema::{ArrowError, Schema, SchemaRef}; /// Convert a `RecordBatch` to a vector of `FlightData` representing the bytes of the dictionaries /// and a `FlightData` representing the bytes of the batch's values +#[deprecated( + since = "30.0.0", + note = "Use IpcDataGenerator directly with DictionaryTracker to avoid re-sending dictionaries" +)] pub fn flight_data_from_arrow_batch( batch: &RecordBatch, options: &IpcWriteOptions, @@ -149,11 +153,16 @@ pub fn batches_to_flight_data( let schema_flight_data: FlightData = SchemaAsIpc::new(&schema, &options).into(); let mut dictionaries = vec![]; let mut flight_data = vec![]; + + let data_gen = writer::IpcDataGenerator::default(); + let mut dictionary_tracker = writer::DictionaryTracker::new(false); + for batch in batches.iter() { - let (flight_dictionaries, flight_datum) = - flight_data_from_arrow_batch(batch, &options); - dictionaries.extend(flight_dictionaries); - flight_data.push(flight_datum); + let (encoded_dictionaries, encoded_batch) = + data_gen.encoded_batch(batch, &mut dictionary_tracker, &options)?; + + dictionaries.extend(encoded_dictionaries.into_iter().map(Into::into)); + flight_data.push(encoded_batch.into()); } let mut stream = vec![schema_flight_data]; stream.extend(dictionaries.into_iter()); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 006660b6a0e1..106b4e4c9850 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -316,6 +316,9 @@ impl IpcDataGenerator { Ok(()) } + /// Encodes a batch to a number of [EncodedData] items (dictionary batches + the record batch). + /// The [DictionaryTracker] keeps track of dictionaries with new `dict_id`s (so they are only sent once) + /// Make sure the [DictionaryTracker] is initialized at the start of the stream. 
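As a hedged illustration of the pattern this doc comment describes (and that the `batches_to_flight_data` change above now follows), a Flight producer might encode a stream of batches like this; the sketch is not part of the patch and the import paths assume the split arrow-ipc/arrow-flight crates of this release.

```rust
// Sketch: encode RecordBatches into FlightData using IpcDataGenerator directly,
// sharing one DictionaryTracker so each dictionary is only emitted once per stream.
use arrow_array::RecordBatch;
use arrow_flight::FlightData;
use arrow_ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions};
use arrow_schema::ArrowError;

fn encode_stream(batches: &[RecordBatch]) -> Result<Vec<FlightData>, ArrowError> {
    let options = IpcWriteOptions::default();
    let data_gen = IpcDataGenerator::default();
    // Initialized once, at the start of the stream, as the doc comment requires.
    let mut tracker = DictionaryTracker::new(false);

    let mut out: Vec<FlightData> = Vec::new();
    for batch in batches {
        let (dictionaries, batch_data) =
            data_gen.encoded_batch(batch, &mut tracker, &options)?;
        // Dictionary batches must be sent before the record batch that uses them.
        out.extend(dictionaries.into_iter().map(Into::into));
        out.push(batch_data.into());
    }
    Ok(out)
}
```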
pub fn encoded_batch( &self, batch: &RecordBatch, From 89354ca7c3a66e05cefad77bd6350c8fad7b58bd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 16 Dec 2022 19:27:12 +0000 Subject: [PATCH 0420/1411] More clippy lint fixes (#3355) --- object_store/src/aws/client.rs | 2 +- object_store/src/azure/client.rs | 2 +- object_store/src/azure/credential.rs | 2 +- object_store/src/local.rs | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index ccc0a9c6bbc0..d2d2aefa4b7f 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -131,7 +131,7 @@ impl TryFrom for ListResult { let common_prefixes = value .common_prefixes .into_iter() - .map(|x| Ok(Path::parse(&x.prefix)?)) + .map(|x| Ok(Path::parse(x.prefix)?)) .collect::>()?; let objects = value diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index b537f5edf679..fedd85e3dc30 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -407,7 +407,7 @@ impl TryFrom for ListResult { .blob_prefix .unwrap_or_default() .into_iter() - .map(|x| Ok(Path::parse(&x.name)?)) + .map(|x| Ok(Path::parse(x.name)?)) .collect::>()?; let objects = value diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index 721fcaea46f0..38e6e64f1e0f 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -153,7 +153,7 @@ fn generate_authorization( key: &str, ) -> String { let str_to_sign = string_to_sign(h, u, method, account); - let auth = hmac_sha256(base64::decode(key).unwrap(), &str_to_sign); + let auth = hmac_sha256(base64::decode(key).unwrap(), str_to_sign); format!("SharedKey {}:{}", account, base64::encode(auth)) } diff --git a/object_store/src/local.rs b/object_store/src/local.rs index f7b7ad7dd625..2ef87adbb093 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -396,7 +396,7 @@ impl ObjectStore for LocalFileSystem { None => self.config.root.to_file_path().unwrap(), }; - let walkdir = WalkDir::new(&root_path) + let walkdir = WalkDir::new(root_path) // Don't include the root directory itself .min_depth(1) .follow_links(true); @@ -748,7 +748,7 @@ impl AsyncWrite for LocalUpload { self.inner_state = LocalUploadState::Complete; file.sync_all()?; std::mem::drop(file); - std::fs::rename(&staging_path, &self.dest)?; + std::fs::rename(staging_path, &self.dest)?; Poll::Ready(Ok(())) } _ => { From 07284c55142cde412d1a13efd1307570c3ae24ae Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 17 Dec 2022 20:24:46 +0000 Subject: [PATCH 0421/1411] Add CSV reader benchmark (#3338) (#3357) * Add CSV reader benchmark (#3338) * Add floats * Format --- arrow/Cargo.toml | 5 ++ arrow/benches/csv_reader.rs | 115 ++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 arrow/benches/csv_reader.rs diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 7f08d38e1ae0..98d04d5d2635 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -201,6 +201,11 @@ name = "csv_writer" harness = false required-features = ["csv"] +[[bench]] +name = "csv_reader" +harness = false +required-features = ["test_utils", "csv"] + [[bench]] name = "json_reader" harness = false diff --git a/arrow/benches/csv_reader.rs b/arrow/benches/csv_reader.rs new file mode 100644 index 000000000000..f6353fb851f5 --- /dev/null 
+++ b/arrow/benches/csv_reader.rs @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate arrow; +extern crate criterion; + +use criterion::*; + +use arrow::array::*; +use arrow::csv; +use arrow::datatypes::*; +use arrow::record_batch::RecordBatch; +use arrow::util::bench_util::{create_primitive_array, create_string_array_with_len}; +use std::io::Cursor; +use std::sync::Arc; + +fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { + let batch = RecordBatch::try_from_iter(cols.into_iter().map(|a| ("col", a))).unwrap(); + + let mut buf = Vec::with_capacity(1024); + let mut csv = csv::Writer::new(&mut buf); + csv.write(&batch).unwrap(); + drop(csv); + + for batch_size in [128, 1024, 4096] { + c.bench_function(&format!("{} - {}", name, batch_size), |b| { + b.iter(|| { + let cursor = Cursor::new(buf.as_slice()); + let reader = csv::ReaderBuilder::new() + .with_schema(batch.schema()) + .with_batch_size(batch_size) + .has_header(true) + .build(cursor) + .unwrap(); + + for next in reader { + next.unwrap(); + } + }); + }); + } +} + +fn criterion_benchmark(c: &mut Criterion) { + let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; + do_bench(c, "4096 u64(0)", cols); + + let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; + do_bench(c, "4096 i64(0)", cols); + + let cols = + vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; + do_bench(c, "4096 f32(0)", cols); + + let cols = + vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; + do_bench(c, "4096 f64(0)", cols); + + let cols = + vec![Arc::new(create_string_array_with_len::(4096, 0., 10)) as ArrayRef]; + do_bench(c, "4096 string(10, 0)", cols); + + let cols = + vec![Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef]; + do_bench(c, "4096 string(30, 0)", cols); + + let cols = + vec![Arc::new(create_string_array_with_len::(4096, 0., 100)) as ArrayRef]; + do_bench(c, "4096 string(100, 0)", cols); + + let cols = + vec![Arc::new(create_string_array_with_len::(4096, 0.5, 100)) as ArrayRef]; + do_bench(c, "4096 string(100, 0.5)", cols); + + let cols = vec![ + Arc::new(create_string_array_with_len::(4096, 0.5, 20)) as ArrayRef, + Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef, + Arc::new(create_string_array_with_len::(4096, 0., 100)) as ArrayRef, + Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef, + ]; + do_bench( + c, + "4096 string(20, 0.5), string(30, 0), string(100, 0), i64(0)", + cols, + ); + + let cols = vec![ + Arc::new(create_string_array_with_len::(4096, 0.5, 20)) as ArrayRef, + Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef, + Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef, + Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef, 
+ ]; + do_bench( + c, + "4096 string(20, 0.5), string(30, 0), f64(0), i64(0)", + cols, + ); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From 491b0239a81bb3e7e2829d69c5a59799a0d4f6e6 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 18 Dec 2022 14:15:31 -0800 Subject: [PATCH 0422/1411] Fix unary_dyn for decimal scalar arithmetic computation (#3345) * Fix unary for decimal arithmetic computation * Use discriminant --- arrow/src/compute/kernels/arithmetic.rs | 20 +++++++++++++++++++- arrow/src/compute/kernels/arity.rs | 17 +++++++++++------ 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index 23cefe48e2c8..913a2cad6c93 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -1633,7 +1633,7 @@ mod tests { use super::*; use crate::array::Int32Array; use crate::compute::{binary_mut, try_binary_mut, try_unary_mut, unary_mut}; - use crate::datatypes::{Date64Type, Int32Type, Int8Type}; + use crate::datatypes::{Date64Type, Decimal128Type, Int32Type, Int8Type}; use arrow_buffer::i256; use chrono::NaiveDate; use half::f16; @@ -3226,4 +3226,22 @@ mod tests { ])) as ArrayRef; assert_eq!(&result, &expected); } + + #[test] + fn test_decimal_add_scalar_dyn() { + let a = Decimal128Array::from(vec![100, 210, 320]) + .with_precision_and_scale(38, 2) + .unwrap(); + + let result = add_scalar_dyn::(&a, 1).unwrap(); + let result = as_primitive_array::(&result) + .clone() + .with_precision_and_scale(38, 2) + .unwrap(); + let expected = Decimal128Array::from(vec![101, 211, 321]) + .with_precision_and_scale(38, 2) + .unwrap(); + + assert_eq!(&expected, &result); + } } diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index 6207ab63935d..02659a5a7738 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -114,9 +114,12 @@ where T: ArrowPrimitiveType, F: Fn(T::Native) -> Result, { - if array.value_type() != T::DATA_TYPE { + if std::mem::discriminant(&array.value_type()) + != std::mem::discriminant(&T::DATA_TYPE) + { return Err(ArrowError::CastError(format!( - "Cannot perform the unary operation on dictionary array of value type {}", + "Cannot perform the unary operation of type {} on dictionary array of value type {}", + T::DATA_TYPE, array.value_type() ))); } @@ -135,14 +138,15 @@ where downcast_dictionary_array! 
{ array => unary_dict::<_, F, T>(array, op), t => { - if t == &T::DATA_TYPE { + if std::mem::discriminant(t) == std::mem::discriminant(&T::DATA_TYPE) { Ok(Arc::new(unary::( array.as_any().downcast_ref::>().unwrap(), op, ))) } else { Err(ArrowError::NotYetImplemented(format!( - "Cannot perform unary operation on array of type {}", + "Cannot perform unary operation of type {} on array of type {}", + T::DATA_TYPE, t ))) } @@ -166,14 +170,15 @@ where ))) }, t => { - if t == &T::DATA_TYPE { + if std::mem::discriminant(t) == std::mem::discriminant(&T::DATA_TYPE) { Ok(Arc::new(try_unary::( array.as_any().downcast_ref::>().unwrap(), op, )?)) } else { Err(ArrowError::NotYetImplemented(format!( - "Cannot perform unary operation on array of type {}", + "Cannot perform unary operation of type {} on array of type {}", + T::DATA_TYPE, t ))) } From a8c968584e2d19587c48e8c9099c8d8f7ffffba3 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 18 Dec 2022 14:56:55 -0800 Subject: [PATCH 0423/1411] Fix incorrect output string from try_to_type (#3351) * Minor fix of try_to_type * Add test --- arrow-ord/src/comparison.rs | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 196590008248..80c8b6b1c393 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -460,7 +460,7 @@ fn try_to_type_result( /// Type of expression is `Result<.., ArrowError>` macro_rules! try_to_type { ($RIGHT: expr, $TY: ident) => { - try_to_type_result($RIGHT.$TY(), stringify!($RIGHT), stringify!($TYPE)) + try_to_type_result($RIGHT.$TY(), &format!("{:?}", $RIGHT), stringify!($TY)) }; } @@ -5827,4 +5827,22 @@ mod tests { let r = gt_eq_dyn(&a, &b).unwrap(); assert_eq!(e, r); } + + #[derive(Debug)] + struct ToType {} + + impl ToType { + fn to_i128(&self) -> Option { + None + } + } + + #[test] + fn test_try_to_type() { + let a = ToType {}; + let to_type = try_to_type!(a, to_i128).unwrap_err(); + assert!(to_type + .to_string() + .contains("Could not convert ToType with to_i128")); + } } From c3444334c58b651206e7bac946e21e7d2ae48c59 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 19 Dec 2022 08:56:06 +0000 Subject: [PATCH 0424/1411] Add csv-core based reader (#3338) (#3365) * Add csv-core based reader (#3338) * More docs --- arrow-csv/Cargo.toml | 1 + arrow-csv/src/{reader.rs => reader/mod.rs} | 397 ++++++++++----------- arrow-csv/src/reader/records.rs | 266 ++++++++++++++ 3 files changed, 458 insertions(+), 206 deletions(-) rename arrow-csv/src/{reader.rs => reader/mod.rs} (90%) create mode 100644 arrow-csv/src/reader/records.rs diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 0a8a0bec7b7d..d02e599b31e1 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -45,6 +45,7 @@ arrow-data = { version = "29.0.0", path = "../arrow-data" } arrow-schema = { version = "29.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } +csv-core = { version = "0.1"} lazy_static = { version = "1.4", default-features = false } lexical-core = { version = "^0.8", default-features = false } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } diff --git a/arrow-csv/src/reader.rs b/arrow-csv/src/reader/mod.rs similarity index 90% rename from arrow-csv/src/reader.rs rename to arrow-csv/src/reader/mod.rs index 
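A brief aside on the `unary_dyn` fix from the decimal commit above: comparing `DataType`s with `==` is too strict for parameterized types such as decimals, which is why the kernel now compares enum discriminants instead. A small, self-contained illustration of the distinction (not taken from the patch):

```rust
// Illustrative only: Decimal128 data types with different scales are unequal,
// but share the same enum variant, so a discriminant comparison accepts them.
use std::mem::discriminant;
use arrow_schema::DataType;

fn main() {
    let a = DataType::Decimal128(38, 2);
    let b = DataType::Decimal128(38, 10);
    assert_ne!(a, b); // precision and scale take part in equality
    assert_eq!(discriminant(&a), discriminant(&b)); // same variant either way
}
```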
c69e1753b71d..877876b77c9a 100644 --- a/arrow-csv/src/reader.rs +++ b/arrow-csv/src/reader/mod.rs @@ -40,13 +40,14 @@ //! let batch = csv.next().unwrap().unwrap(); //! ``` -use core::cmp::min; +mod records; + use lazy_static::lazy_static; use regex::{Regex, RegexSet}; use std::collections::HashSet; use std::fmt; use std::fs::File; -use std::io::{Read, Seek, SeekFrom}; +use std::io::{BufReader, Read, Seek, SeekFrom}; use std::sync::Arc; use arrow_array::builder::Decimal128Builder; @@ -56,8 +57,9 @@ use arrow_cast::parse::Parser; use arrow_schema::*; use crate::map_csv_error; +use crate::reader::records::{RecordReader, StringRecords}; use arrow_data::decimal::validate_decimal_precision; -use csv::{ByteRecord, StringRecord}; +use csv::StringRecord; use std::ops::Neg; lazy_static! { @@ -107,7 +109,7 @@ fn infer_field_schema(string: &str, datetime_re: Option) -> DataType { /// This is a collection of options for csv reader when the builder pattern cannot be used /// and the parameters need to be passed around #[derive(Debug, Default, Clone)] -pub struct ReaderOptions { +struct ReaderOptions { has_header: bool, delimiter: Option, escape: Option, @@ -177,11 +179,36 @@ pub fn infer_reader_schema( infer_reader_schema_with_csv_options(reader, roptions) } +/// Creates a `csv::Reader` +fn build_csv_reader( + reader: R, + has_header: bool, + delimiter: Option, + escape: Option, + quote: Option, + terminator: Option, +) -> csv::Reader { + let mut reader_builder = csv::ReaderBuilder::new(); + reader_builder.has_headers(has_header); + + if let Some(c) = delimiter { + reader_builder.delimiter(c); + } + reader_builder.escape(escape); + if let Some(c) = quote { + reader_builder.quote(c); + } + if let Some(t) = terminator { + reader_builder.terminator(csv::Terminator::Any(t)); + } + reader_builder.from_reader(reader) +} + fn infer_reader_schema_with_csv_options( reader: R, roptions: ReaderOptions, ) -> Result<(Schema, usize), ArrowError> { - let mut csv_reader = Reader::build_csv_reader( + let mut csv_reader = build_csv_reader( reader, roptions.has_header, roptions.delimiter, @@ -305,15 +332,15 @@ pub struct Reader { /// Optional projection for which columns to load (zero-based column indices) projection: Option>, /// File reader - reader: csv::Reader, + reader: RecordReader>, + /// Rows to skip + to_skip: usize, /// Current line number line_number: usize, - /// Maximum number of rows to read + /// End line number end: usize, /// Number of records per batch batch_size: usize, - /// Vector that can hold the `StringRecord`s of the batches - batch_records: Vec, /// datetime format used to parse datetime values, (format understood by chrono) /// /// For format refer to [chrono docs](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html) @@ -351,16 +378,23 @@ impl Reader { projection: Option>, datetime_format: Option, ) -> Self { - Self::from_reader( - reader, - schema, - has_header, - delimiter, - batch_size, - bounds, - projection, - datetime_format, - ) + let mut builder = ReaderBuilder::new() + .has_header(has_header) + .with_batch_size(batch_size); + + if let Some(delimiter) = delimiter { + builder = builder.with_delimiter(delimiter); + } + if let Some((start, end)) = bounds { + builder = builder.with_bounds(start, end); + } + if let Some(projection) = projection { + builder = builder.with_projection(projection) + } + if let Some(format) = datetime_format { + builder = builder.with_datetime_format(format) + } + builder.build_with_schema(reader, schema) } /// Returns the schema of the reader, useful 
for getting the schema without reading @@ -383,6 +417,7 @@ impl Reader { /// This constructor allows you more flexibility in what records are processed by the /// csv reader. #[allow(clippy::too_many_arguments)] + #[deprecated(note = "Use Reader::new or ReaderBuilder")] pub fn from_reader( reader: R, schema: SchemaRef, @@ -393,142 +428,57 @@ impl Reader { projection: Option>, datetime_format: Option, ) -> Self { - let csv_reader = - Self::build_csv_reader(reader, has_header, delimiter, None, None, None); - Self::from_csv_reader( - csv_reader, + Self::new( + reader, schema, has_header, + delimiter, batch_size, bounds, projection, datetime_format, ) } - - fn build_csv_reader( - reader: R, - has_header: bool, - delimiter: Option, - escape: Option, - quote: Option, - terminator: Option, - ) -> csv::Reader { - let mut reader_builder = csv::ReaderBuilder::new(); - reader_builder.has_headers(has_header); - - if let Some(c) = delimiter { - reader_builder.delimiter(c); - } - reader_builder.escape(escape); - if let Some(c) = quote { - reader_builder.quote(c); - } - if let Some(t) = terminator { - reader_builder.terminator(csv::Terminator::Any(t)); - } - reader_builder.from_reader(reader) - } - - fn from_csv_reader( - mut csv_reader: csv::Reader, - schema: SchemaRef, - has_header: bool, - batch_size: usize, - bounds: Bounds, - projection: Option>, - datetime_format: Option, - ) -> Self { - let (start, end) = match bounds { - None => (0, usize::MAX), - Some((start, end)) => (start, end), - }; - - // First we will skip `start` rows - // note that this skips by iteration. This is because in general it is not possible - // to seek in CSV. However, skipping still saves the burden of creating arrow arrays, - // which is a slow operation that scales with the number of columns - - let mut record = ByteRecord::new(); - // Skip first start items - for _ in 0..start { - let res = csv_reader.read_byte_record(&mut record); - if !res.unwrap_or(false) { - break; - } - } - - // Initialize batch_records with StringRecords so they - // can be reused across batches - let mut batch_records = Vec::with_capacity(batch_size); - batch_records.resize_with(batch_size, Default::default); - - Self { - schema, - projection, - reader: csv_reader, - line_number: if has_header { start + 1 } else { start }, - batch_size, - end, - batch_records, - datetime_format, - } - } } impl Iterator for Reader { type Item = Result; fn next(&mut self) -> Option { - let remaining = self.end - self.line_number; - - let mut read_records = 0; - for i in 0..min(self.batch_size, remaining) { - match self.reader.read_record(&mut self.batch_records[i]) { - Ok(true) => { - read_records += 1; - } - Ok(false) => break, - Err(e) => { - return Some(Err(ArrowError::ParseError(format!( - "Error parsing line {}: {:?}", - self.line_number + i, - e - )))); - } + if self.to_skip != 0 { + if let Err(e) = self.reader.skip(std::mem::take(&mut self.to_skip)) { + return Some(Err(e)); } } - // return early if no data was loaded - if read_records == 0 { - return None; - } + let remaining = self.end - self.line_number; + let to_read = self.batch_size.min(remaining); - let format: Option<&str> = match self.datetime_format { - Some(ref format) => Some(format.as_ref()), - _ => None, + let batch = match self.reader.read(to_read) { + Ok(b) if b.is_empty() => return None, + Ok(b) => b, + Err(e) => return Some(Err(e)), }; // parse the batches into a RecordBatch let result = parse( - &self.batch_records[..read_records], + &batch, self.schema.fields(), 
Some(self.schema.metadata.clone()), self.projection.as_ref(), self.line_number, - format, + self.datetime_format.as_deref(), ); - self.line_number += read_records; + self.line_number += batch.len(); Some(result) } } -/// parses a slice of [csv::StringRecord] into a -/// [RecordBatch] +/// Parses a slice of [`StringRecords`] into a [RecordBatch] fn parse( - rows: &[StringRecord], + rows: &StringRecords<'_>, fields: &[Field], metadata: Option>, projection: Option<&Vec>, @@ -624,7 +574,9 @@ fn parse( ) } DataType::Utf8 => Ok(Arc::new( - rows.iter().map(|row| row.get(i)).collect::(), + rows.iter() + .map(|row| Some(row.get(i))) + .collect::(), ) as ArrayRef), DataType::Dictionary(key_type, value_type) if value_type.as_ref() == &DataType::Utf8 => @@ -723,34 +675,26 @@ fn parse_bool(string: &str) -> Option { // parse the column string to an Arrow Array fn build_decimal_array( _line_number: usize, - rows: &[StringRecord], + rows: &StringRecords<'_>, col_idx: usize, precision: u8, scale: i8, ) -> Result { let mut decimal_builder = Decimal128Builder::with_capacity(rows.len()); - for row in rows { - let col_s = row.get(col_idx); - match col_s { - None => { - // No data for this row - decimal_builder.append_null(); - } - Some(s) => { - if s.is_empty() { - // append null - decimal_builder.append_null(); - } else { - let decimal_value: Result = - parse_decimal_with_parameter(s, precision, scale); - match decimal_value { - Ok(v) => { - decimal_builder.append_value(v); - } - Err(e) => { - return Err(e); - } - } + for row in rows.iter() { + let s = row.get(col_idx); + if s.is_empty() { + // append null + decimal_builder.append_null(); + } else { + let decimal_value: Result = + parse_decimal_with_parameter(s, precision, scale); + match decimal_value { + Ok(v) => { + decimal_builder.append_value(v); + } + Err(e) => { + return Err(e); } } } @@ -878,35 +822,31 @@ fn parse_decimal(s: &str) -> Result { // parses a specific column (col_idx) into an Arrow Array. fn build_primitive_array( line_number: usize, - rows: &[StringRecord], + rows: &StringRecords<'_>, col_idx: usize, format: Option<&str>, ) -> Result { rows.iter() .enumerate() .map(|(row_index, row)| { - match row.get(col_idx) { - Some(s) => { - if s.is_empty() { - return Ok(None); - } + let s = row.get(col_idx); + if s.is_empty() { + return Ok(None); + } - let parsed = match format { - Some(format) => parse_formatted::(s, format), - _ => parse_item::(s), - }; - match parsed { - Some(e) => Ok(Some(e)), - None => Err(ArrowError::ParseError(format!( - // TODO: we should surface the underlying error here. - "Error while parsing value {} for column {} at line {}", - s, - col_idx, - line_number + row_index - ))), - } - } - None => Ok(None), + let parsed = match format { + Some(format) => parse_formatted::(s, format), + _ => parse_item::(s), + }; + match parsed { + Some(e) => Ok(Some(e)), + None => Err(ArrowError::ParseError(format!( + // TODO: we should surface the underlying error here. + "Error while parsing value {} for column {} at line {}", + s, + col_idx, + line_number + row_index + ))), } }) .collect::, ArrowError>>() @@ -916,31 +856,23 @@ fn build_primitive_array( // parses a specific column (col_idx) into an Arrow Array. 
fn build_boolean_array( line_number: usize, - rows: &[StringRecord], + rows: &StringRecords<'_>, col_idx: usize, ) -> Result { rows.iter() .enumerate() .map(|(row_index, row)| { - match row.get(col_idx) { - Some(s) => { - if s.is_empty() { - return Ok(None); - } - - let parsed = parse_bool(s); - match parsed { - Some(e) => Ok(Some(e)), - None => Err(ArrowError::ParseError(format!( - // TODO: we should surface the underlying error here. - "Error while parsing value {} for column {} at line {}", - s, - col_idx, - line_number + row_index - ))), - } - } - None => Ok(None), + let s = row.get(col_idx); + let parsed = parse_bool(s); + match parsed { + Some(e) => Ok(Some(e)), + None => Err(ArrowError::ParseError(format!( + // TODO: we should surface the underlying error here. + "Error while parsing value {} for column {} at line {}", + s, + col_idx, + line_number + row_index + ))), } }) .collect::>() @@ -1109,10 +1041,13 @@ impl ReaderBuilder { } /// Create a new `Reader` from the `ReaderBuilder` - pub fn build(self, mut reader: R) -> Result, ArrowError> { + pub fn build( + mut self, + mut reader: R, + ) -> Result, ArrowError> { // check if schema should be inferred let delimiter = self.delimiter.unwrap_or(b','); - let schema = match self.schema { + let schema = match self.schema.take() { Some(schema) => schema, None => { let roptions = ReaderOptions { @@ -1122,7 +1057,7 @@ impl ReaderBuilder { escape: self.escape, quote: self.quote, terminator: self.terminator, - datetime_re: self.datetime_re, + datetime_re: self.datetime_re.take(), }; let (inferred_schema, _) = infer_file_schema_with_csv_options(&mut reader, roptions)?; @@ -1130,23 +1065,45 @@ impl ReaderBuilder { Arc::new(inferred_schema) } }; - let csv_reader = Reader::build_csv_reader( - reader, - self.has_header, - self.delimiter, - self.escape, - self.quote, - self.terminator, + Ok(self.build_with_schema(reader, schema)) + } + + fn build_with_schema(self, reader: R, schema: SchemaRef) -> Reader { + let mut reader_builder = csv_core::ReaderBuilder::new(); + reader_builder.escape(self.escape); + + if let Some(c) = self.delimiter { + reader_builder.delimiter(c); + } + if let Some(c) = self.quote { + reader_builder.quote(c); + } + if let Some(t) = self.terminator { + reader_builder.terminator(csv_core::Terminator::Any(t)); + } + let reader = RecordReader::new( + BufReader::new(reader), + reader_builder.build(), + schema.fields().len(), ); - Ok(Reader::from_csv_reader( - csv_reader, + + let header = self.has_header as usize; + + let (start, end) = match self.bounds { + Some((start, end)) => (start + header, end + header), + None => (header, usize::MAX), + }; + + Reader { schema, - self.has_header, - self.batch_size, - self.bounds, - self.projection.clone(), - self.datetime_format, - )) + projection: self.projection, + reader, + to_skip: start, + line_number: start, + end, + batch_size: self.batch_size, + datetime_format: self.datetime_format, + } } } @@ -1285,7 +1242,7 @@ mod tests { let both_files = file_with_headers .chain(Cursor::new("\n".to_string())) .chain(file_without_headers); - let mut csv = Reader::from_reader( + let mut csv = Reader::new( both_files, Arc::new(schema), true, @@ -1480,6 +1437,7 @@ mod tests { Field::new("c_int", DataType::UInt64, false), Field::new("c_float", DataType::Float32, true), Field::new("c_string", DataType::Utf8, false), + Field::new("c_bool", DataType::Boolean, false), ]); let file = File::open("test/data/null_test.csv").unwrap(); @@ -2074,4 +2032,31 @@ mod tests { let col1_arr = 
col1.as_any().downcast_ref::().unwrap(); assert_eq!(col1_arr.value(5), "value5"); } + + #[test] + fn test_header_bounds() { + let csv = "a,b\na,b\na,b\na,b\na,b\n"; + let tests = [ + (None, false, 5), + (None, true, 4), + (Some((0, 4)), false, 4), + (Some((1, 4)), false, 3), + (Some((0, 4)), true, 4), + (Some((1, 4)), true, 3), + ]; + + for (idx, (bounds, has_header, expected)) in tests.into_iter().enumerate() { + let mut reader = ReaderBuilder::new().has_header(has_header); + if let Some((start, end)) = bounds { + reader = reader.with_bounds(start, end); + } + let b = reader + .build(Cursor::new(csv.as_bytes())) + .unwrap() + .next() + .unwrap() + .unwrap(); + assert_eq!(b.num_rows(), expected, "{}", idx); + } + } } diff --git a/arrow-csv/src/reader/records.rs b/arrow-csv/src/reader/records.rs new file mode 100644 index 000000000000..711baa15278f --- /dev/null +++ b/arrow-csv/src/reader/records.rs @@ -0,0 +1,266 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_schema::ArrowError; +use csv_core::{ReadRecordResult, Reader}; +use std::io::BufRead; + +/// The estimated length of a field in bytes +const AVERAGE_FIELD_SIZE: usize = 8; + +/// The minimum amount of data in a single read +const MIN_CAPACITY: usize = 1024; + +pub struct RecordReader { + reader: R, + delimiter: Reader, + + num_columns: usize, + + num_rows: usize, + offsets: Vec, + data: Vec, +} + +impl RecordReader { + pub fn new(reader: R, delimiter: Reader, num_columns: usize) -> Self { + Self { + reader, + delimiter, + num_columns, + num_rows: 0, + offsets: vec![], + data: vec![], + } + } + + fn fill_buf(&mut self, to_read: usize) -> Result<(), ArrowError> { + // Reserve sufficient capacity in offsets + self.offsets.resize(to_read * self.num_columns + 1, 0); + self.num_rows = 0; + + if to_read == 0 { + return Ok(()); + } + + // The current offset into `self.data` + let mut output_offset = 0; + // The current offset into `input` + let mut input_offset = 0; + // The current offset into `self.offsets` + let mut field_offset = 1; + // The number of fields read for the current row + let mut field_count = 0; + + 'outer: loop { + let input = self.reader.fill_buf()?; + + 'input: loop { + // Reserve necessary space in output data based on best estimate + let remaining_rows = to_read - self.num_rows; + let capacity = remaining_rows * self.num_columns * AVERAGE_FIELD_SIZE; + let estimated_data = capacity.max(MIN_CAPACITY); + self.data.resize(output_offset + estimated_data, 0); + + loop { + let (result, bytes_read, bytes_written, end_positions) = + self.delimiter.read_record( + &input[input_offset..], + &mut self.data[output_offset..], + &mut self.offsets[field_offset..], + ); + + field_count += end_positions; + field_offset += end_positions; + input_offset += bytes_read; + 
output_offset += bytes_written; + + match result { + ReadRecordResult::End => break 'outer, // Reached end of file + ReadRecordResult::InputEmpty => break 'input, // Input exhausted, need to read more + ReadRecordResult::OutputFull => break, // Need to allocate more capacity + ReadRecordResult::OutputEndsFull => { + return Err(ArrowError::CsvError(format!("incorrect number of fields, expected {} got more than {}", self.num_columns, field_count))) + } + ReadRecordResult::Record => { + if field_count != self.num_columns { + return Err(ArrowError::CsvError(format!("incorrect number of fields, expected {} got {}", self.num_columns, field_count))) + } + self.num_rows += 1; + field_count = 0; + + if self.num_rows == to_read { + break 'outer // Read sufficient rows + } + + if input.len() == input_offset { + // Input exhausted, need to read more + // Without this read_record will interpret the empty input + // byte array as indicating the end of the file + break 'input + } + } + } + } + } + self.reader.consume(input_offset); + input_offset = 0; + } + self.reader.consume(input_offset); + + // csv_core::Reader writes end offsets relative to the start of the row + // Therefore scan through and offset these based on the cumulative row offsets + let mut row_offset = 0; + self.offsets[1..] + .chunks_mut(self.num_columns) + .for_each(|row| { + let offset = row_offset; + row.iter_mut().for_each(|x| { + *x += offset; + row_offset = *x; + }); + }); + + Ok(()) + } + + /// Skips forward `to_skip` rows + pub fn skip(&mut self, mut to_skip: usize) -> Result<(), ArrowError> { + // TODO: This could be done by scanning for unquoted newline delimiters + while to_skip != 0 { + self.fill_buf(to_skip.min(1024))?; + to_skip -= self.num_rows; + } + Ok(()) + } + + /// Reads up to `to_read` rows from the reader + pub fn read(&mut self, to_read: usize) -> Result, ArrowError> { + self.fill_buf(to_read)?; + + // Need to slice fields to the actual number of rows read + // + // We intentionally avoid using `Vec::truncate` to avoid having + // to re-initialize the data again + let num_fields = self.num_rows * self.num_columns; + let last_offset = self.offsets[num_fields]; + + // Need to truncate data to the actual amount of data read + let data = std::str::from_utf8(&self.data[..last_offset]).map_err(|e| { + ArrowError::CsvError(format!("Encountered invalid UTF-8 data: {}", e)) + })?; + + Ok(StringRecords { + num_columns: self.num_columns, + num_rows: self.num_rows, + offsets: &self.offsets[..num_fields + 1], + data, + }) + } +} + +/// A collection of parsed, UTF-8 CSV records +#[derive(Debug)] +pub struct StringRecords<'a> { + num_columns: usize, + num_rows: usize, + offsets: &'a [usize], + data: &'a str, +} + +impl<'a> StringRecords<'a> { + fn get(&self, index: usize) -> StringRecord<'a> { + let field_idx = index * self.num_columns; + StringRecord { + data: self.data, + offsets: &self.offsets[field_idx..field_idx + self.num_columns + 1], + } + } + + pub fn len(&self) -> usize { + self.num_rows + } + + pub fn is_empty(&self) -> bool { + self.num_rows == 0 + } + + pub fn iter(&self) -> impl Iterator> + '_ { + (0..self.num_rows).map(|x| self.get(x)) + } +} + +/// A single parsed, UTF-8 CSV record +#[derive(Debug, Clone, Copy)] +pub struct StringRecord<'a> { + data: &'a str, + offsets: &'a [usize], +} + +impl<'a> StringRecord<'a> { + pub fn get(&self, index: usize) -> &'a str { + let end = self.offsets[index + 1]; + let start = self.offsets[index]; + + // SAFETY: + // Parsing produces offsets at valid byte boundaries + unsafe { 
self.data.get_unchecked(start..end) } + } +} + +#[cfg(test)] +mod tests { + use crate::reader::records::RecordReader; + use csv_core::Reader; + use std::io::Cursor; + + #[test] + fn test_basic() { + let csv = [ + "foo,bar,baz", + "a,b,c", + "12,3,5", + "\"asda\"\"asas\",\"sdffsnsd\", as", + ] + .join("\n"); + + let mut expected = vec![ + vec!["foo", "bar", "baz"], + vec!["a", "b", "c"], + vec!["12", "3", "5"], + vec!["asda\"asas", "sdffsnsd", " as"], + ] + .into_iter(); + + let cursor = Cursor::new(csv.as_bytes()); + let mut reader = RecordReader::new(cursor, Reader::new(), 3); + + loop { + let b = reader.read(3).unwrap(); + if b.is_empty() { + break; + } + + b.iter().zip(&mut expected).for_each(|(record, expected)| { + let actual = (0..3) + .map(|field_idx| record.get(field_idx)) + .collect::>(); + assert_eq!(actual, expected) + }) + } + } +} From 5e4789402b262b4d2847453888762a8a6e1a2d8b Mon Sep 17 00:00:00 2001 From: Your friendly neighborhood geek Date: Mon, 19 Dec 2022 14:38:35 +0530 Subject: [PATCH 0425/1411] add support for content-type in `ClientOptions` (#3358) * add support for content-type in `ClientOptions` - currently only supported for aws & azure * add ClientOptions to GoogleCloudStorageClient - add methods `filename` and `extension` to `Path` --- object_store/src/aws/client.rs | 8 +++++- object_store/src/azure/client.rs | 5 ++++ object_store/src/client/mod.rs | 39 +++++++++++++++++++++++++++ object_store/src/gcp/mod.rs | 16 ++++++++++-- object_store/src/path/mod.rs | 45 ++++++++++++++++++++++++++++++++ 5 files changed, 110 insertions(+), 3 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index d2d2aefa4b7f..0e22bfc97e22 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -29,7 +29,9 @@ use crate::{ use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use percent_encoding::{utf8_percent_encode, PercentEncode}; -use reqwest::{Client as ReqwestClient, Method, Response, StatusCode}; +use reqwest::{ + header::CONTENT_TYPE, Client as ReqwestClient, Method, Response, StatusCode, +}; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::ops::Range; @@ -279,6 +281,10 @@ impl S3Client { builder = builder.body(bytes) } + if let Some(value) = self.config().client_options.get_content_type(path) { + builder = builder.header(CONTENT_TYPE, value); + } + let response = builder .query(query) .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index fedd85e3dc30..440c379743a6 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -28,6 +28,7 @@ use crate::{ use bytes::{Buf, Bytes}; use chrono::{DateTime, TimeZone, Utc}; use itertools::Itertools; +use reqwest::header::CONTENT_TYPE; use reqwest::{ header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH, RANGE}, Client as ReqwestClient, Method, Response, StatusCode, @@ -207,6 +208,10 @@ impl AzureClient { builder = builder.query(query); } + if let Some(value) = self.config().client_options.get_content_type(path) { + builder = builder.header(CONTENT_TYPE, value); + } + if let Some(bytes) = bytes { builder = builder .header(CONTENT_LENGTH, HeaderValue::from(bytes.len())) diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 47e68637b663..9df7b5039da9 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -26,8 +26,11 @@ pub mod token; use reqwest::header::{HeaderMap, 
HeaderValue}; use reqwest::{Client, ClientBuilder, Proxy}; +use std::collections::HashMap; use std::time::Duration; +use crate::path::Path; + fn map_client_error(e: reqwest::Error) -> super::Error { super::Error::Generic { store: "HTTP client", @@ -42,6 +45,8 @@ static DEFAULT_USER_AGENT: &str = #[derive(Debug, Clone, Default)] pub struct ClientOptions { user_agent: Option, + content_type_map: HashMap, + default_content_type: Option, default_headers: Option, proxy_url: Option, allow_http: bool, @@ -70,6 +75,22 @@ impl ClientOptions { self } + /// Set the default CONTENT_TYPE for uploads + pub fn with_default_content_type(mut self, mime: impl Into) -> Self { + self.default_content_type = Some(mime.into()); + self + } + + /// Set the CONTENT_TYPE for a given file extension + pub fn with_content_type_for_suffix( + mut self, + extension: impl Into, + mime: impl Into, + ) -> Self { + self.content_type_map.insert(extension.into(), mime.into()); + self + } + /// Sets the default headers for every request pub fn with_default_headers(mut self, headers: HeaderMap) -> Self { self.default_headers = Some(headers); @@ -165,6 +186,24 @@ impl ClientOptions { self } + /// Get the mime type for the file in `path` to be uploaded + /// + /// Gets the file extension from `path`, and returns the + /// mime type if it was defined initially through + /// `ClientOptions::with_content_type_for_suffix` + /// + /// Otherwise returns the default mime type if it was defined + /// earlier through `ClientOptions::with_default_content_type` + pub fn get_content_type(&self, path: &Path) -> Option<&str> { + match path.extension() { + Some(extension) => match self.content_type_map.get(extension) { + Some(ct) => Some(ct.as_str()), + None => self.default_content_type.as_deref(), + }, + None => self.default_content_type.as_deref(), + } + } + pub(crate) fn client(&self) -> super::Result { let mut builder = ClientBuilder::new(); diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index b3bd57256157..c83ab6493cb9 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -258,6 +258,7 @@ struct GoogleCloudStorageClient { bucket_name_encoded: String, retry_config: RetryConfig, + client_options: ClientOptions, // TODO: Hook this up in tests max_list_results: Option, @@ -328,10 +329,15 @@ impl GoogleCloudStorageClient { self.base_url, self.bucket_name_encoded ); + let content_type = self + .client_options + .get_content_type(path) + .unwrap_or("application/octet-stream"); + self.client .request(Method::POST, url) .bearer_auth(token) - .header(header::CONTENT_TYPE, "application/octet-stream") + .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, payload.len()) .query(&[("uploadType", "media"), ("name", path.as_ref())]) .body(payload) @@ -347,11 +353,16 @@ impl GoogleCloudStorageClient { let token = self.get_token().await?; let url = format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, path); + let content_type = self + .client_options + .get_content_type(path) + .unwrap_or("application/octet-stream"); + let response = self .client .request(Method::POST, &url) .bearer_auth(token) - .header(header::CONTENT_TYPE, "application/octet-stream") + .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, "0") .query(&[("uploads", "")]) .send_retry(&self.retry_config) @@ -967,6 +978,7 @@ impl GoogleCloudStorageBuilder { bucket_name, bucket_name_encoded: encoded_bucket_name, retry_config, + client_options, max_list_results: None, }), }) diff --git 
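A minimal usage sketch for the content-type options introduced above (illustrative only, not taken from the patch; it assumes `ClientOptions` is re-exported at the object_store crate root and uses hypothetical file names):

use object_store::path::Path;
use object_store::ClientOptions;

fn main() {
    // Map ".json" uploads to "application/json" and fall back to a default for
    // every other extension, mirroring the lookup order in get_content_type.
    let options = ClientOptions::default()
        .with_content_type_for_suffix("json", "application/json")
        .with_default_content_type("application/octet-stream");

    assert_eq!(
        options.get_content_type(&Path::from("data/items.json")),
        Some("application/json")
    );
    // An unmapped extension and a missing extension both fall back to the default.
    assert_eq!(
        options.get_content_type(&Path::from("data/items.bin")),
        Some("application/octet-stream")
    );
    assert_eq!(
        options.get_content_type(&Path::from("data/items")),
        Some("application/octet-stream")
    );
}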
a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index 59ad471c671e..020e5f58e096 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -229,6 +229,27 @@ impl Path { } } + /// Returns the last path segment containing the filename stored in this [`Path`] + pub fn filename(&self) -> Option<&str> { + match self.raw.is_empty() { + true => None, + false => self.raw.split(DELIMITER).last(), + } + } + + /// Returns the extension of the file stored in this [`Path`], if any + pub fn extension(&self) -> Option<&str> { + self.filename() + .and_then(|f| f.rsplit_once('.')) + .and_then(|(_, extension)| { + if extension.is_empty() { + None + } else { + Some(extension) + } + }) + } + /// Returns an iterator of the [`PathPart`] of this [`Path`] after `prefix` /// /// Returns `None` if the prefix does not match @@ -551,4 +572,28 @@ mod tests { assert_eq!(a.raw, b.raw); assert_eq!(b.raw, c.raw); } + + #[test] + fn filename_from_path() { + let a = Path::from("foo/bar"); + let b = Path::from("foo/bar.baz"); + let c = Path::from("foo.bar/baz"); + + assert_eq!(a.filename(), Some("bar")); + assert_eq!(b.filename(), Some("bar.baz")); + assert_eq!(c.filename(), Some("baz")); + } + + #[test] + fn file_extension() { + let a = Path::from("foo/bar"); + let b = Path::from("foo/bar.baz"); + let c = Path::from("foo.bar/baz"); + let d = Path::from("foo.bar/baz.qux"); + + assert_eq!(a.extension(), None); + assert_eq!(b.extension(), Some("baz")); + assert_eq!(c.extension(), None); + assert_eq!(d.extension(), Some("qux")); + } } From 2cf4abb0f894aeb143b94025802aeb1f7e395e8a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Dec 2022 17:28:48 +0000 Subject: [PATCH 0426/1411] Update proc-macro2 requirement from =1.0.47 to =1.0.49 (#3369) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.47...1.0.49) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 938e889f75e8..238e03f3c61f 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -53,7 +53,7 @@ tower = "0.4.13" [build-dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.47", default-features = false } +proc-macro2 = { version = "=1.0.49", default-features = false } prost-build = { version = "=0.11.4", default-features = false } tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } From e2abb4bf5e1e0057a0a2e9fb55934a370b52cb3a Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 19 Dec 2022 09:29:52 -0800 Subject: [PATCH 0427/1411] Put BufWriter into TrackedWrite (#3361) * Put BufWriter into TrackedWrite * Update benchmark * Update TrackedWrite doc. 
* Update --- parquet/benches/arrow_writer.rs | 9 ++++--- parquet/src/bloom_filter/mod.rs | 4 --- parquet/src/column/writer/mod.rs | 16 ++++++++--- parquet/src/file/writer.rs | 46 +++++++++++++++++++++----------- 4 files changed, 49 insertions(+), 26 deletions(-) diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index 676debf5c00c..a590ceb5911c 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -17,7 +17,10 @@ #[macro_use] extern crate criterion; + use criterion::{Criterion, Throughput}; +use std::env; +use std::fs::File; extern crate arrow; extern crate parquet; @@ -312,9 +315,9 @@ fn write_batch_with_option( batch: &RecordBatch, props: Option, ) -> Result<()> { - // Write batch to an in-memory writer - let buffer = vec![]; - let mut writer = ArrowWriter::try_new(buffer, batch.schema(), props)?; + let path = env::temp_dir().join("arrow_writer.temp"); + let file = File::create(path).unwrap(); + let mut writer = ArrowWriter::try_new(file, batch.schema(), props)?; writer.write(batch)?; writer.close()?; diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index a6620fc144ab..e255a8dc12da 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -329,10 +329,6 @@ impl Sbbf { let block_index = self.hash_to_block_index(hash); self.0[block_index].check(hash as u32) } - - pub(crate) fn block_num(&self) -> usize { - self.0.len() - } } // per spec we use xxHash with seed=0 diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 1010dc156a02..fb244920236a 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1680,6 +1680,8 @@ mod tests { assert_eq!(stats.null_count(), 0); assert!(stats.distinct_count().is_none()); + drop(write); + let props = ReaderProperties::builder() .set_backward_compatible_lz4(false) .build(); @@ -1724,6 +1726,8 @@ mod tests { let r = writer.close().unwrap(); assert!(r.metadata.statistics().is_none()); + drop(write); + let props = ReaderProperties::builder() .set_backward_compatible_lz4(false) .build(); @@ -1842,8 +1846,8 @@ mod tests { // ARROW-5129: Test verifies that we add data page in case of dictionary encoding // and no fallback occurred so far. 
let mut file = tempfile::tempfile().unwrap(); - let mut writer = TrackedWrite::new(&mut file); - let page_writer = Box::new(SerializedPageWriter::new(&mut writer)); + let mut write = TrackedWrite::new(&mut file); + let page_writer = Box::new(SerializedPageWriter::new(&mut write)); let props = Arc::new( WriterProperties::builder() .set_data_pagesize_limit(10) @@ -1855,6 +1859,8 @@ mod tests { writer.write_batch(data, None, None).unwrap(); let r = writer.close().unwrap(); + drop(write); + // Read pages and check the sequence let props = ReaderProperties::builder() .set_backward_compatible_lz4(false) @@ -2196,8 +2202,8 @@ mod tests { rep_levels: Option<&[i16]>, ) { let mut file = tempfile::tempfile().unwrap(); - let mut writer = TrackedWrite::new(&mut file); - let page_writer = Box::new(SerializedPageWriter::new(&mut writer)); + let mut write = TrackedWrite::new(&mut file); + let page_writer = Box::new(SerializedPageWriter::new(&mut write)); let max_def_level = match def_levels { Some(buf) => *buf.iter().max().unwrap_or(&0i16), @@ -2228,6 +2234,8 @@ mod tests { assert_eq!(values_written, values.len()); let result = writer.close().unwrap(); + drop(write); + let props = ReaderProperties::builder() .set_backward_compatible_lz4(false) .build(); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index d92a42a6524e..a12d5477c0e9 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -21,7 +21,7 @@ use crate::bloom_filter::Sbbf; use crate::format as parquet; use crate::format::{ColumnIndex, OffsetIndex, RowGroup}; -use std::io::BufWriter; +use std::io::{BufWriter, IoSlice}; use std::{io::Write, sync::Arc}; use thrift::protocol::{TCompactOutputProtocol, TOutputProtocol, TSerializable}; @@ -44,17 +44,19 @@ use crate::schema::types::{ }; /// A wrapper around a [`Write`] that keeps track of the number -/// of bytes that have been written -pub struct TrackedWrite { - inner: W, +/// of bytes that have been written. The given [`Write`] is wrapped +/// with a [`BufWriter`] to optimize writing performance. +pub struct TrackedWrite { + inner: BufWriter, bytes_written: usize, } impl TrackedWrite { /// Create a new [`TrackedWrite`] from a [`Write`] pub fn new(inner: W) -> Self { + let buf_write = BufWriter::new(inner); Self { - inner, + inner: buf_write, bytes_written: 0, } } @@ -65,8 +67,13 @@ impl TrackedWrite { } /// Returns the underlying writer. 
- pub fn into_inner(self) -> W { - self.inner + pub fn into_inner(self) -> Result { + self.inner.into_inner().map_err(|err| { + ParquetError::General(format!( + "fail to get inner writer: {:?}", + err.to_string() + )) + }) } } @@ -77,6 +84,19 @@ impl Write for TrackedWrite { Ok(bytes) } + fn write_vectored(&mut self, bufs: &[IoSlice<'_>]) -> std::io::Result { + let bytes = self.inner.write_vectored(bufs)?; + self.bytes_written += bytes; + Ok(bytes) + } + + fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> { + self.inner.write_all(buf)?; + self.bytes_written += buf.len(); + + Ok(()) + } + fn flush(&mut self) -> std::io::Result<()> { self.inner.flush() } @@ -226,27 +246,23 @@ impl SerializedFileWriter { // iter row group // iter each column // write bloom filter to the file - let mut start_offset = self.buf.bytes_written(); - let mut writer = BufWriter::new(&mut self.buf); - for (row_group_idx, row_group) in row_groups.iter_mut().enumerate() { for (column_idx, column_chunk) in row_group.columns.iter_mut().enumerate() { match &self.bloom_filters[row_group_idx][column_idx] { Some(bloom_filter) => { - bloom_filter.write(&mut writer)?; + let start_offset = self.buf.bytes_written(); + bloom_filter.write(&mut self.buf)?; // set offset and index for bloom filter column_chunk .meta_data .as_mut() .expect("can't have bloom filter without column metadata") .bloom_filter_offset = Some(start_offset as i64); - start_offset += bloom_filter.block_num() * 32; } None => {} } } } - writer.flush()?; Ok(()) } @@ -336,7 +352,7 @@ impl SerializedFileWriter { self.assert_previous_writer_closed()?; let _ = self.write_metadata()?; - Ok(self.buf.into_inner()) + self.buf.into_inner() } } @@ -558,7 +574,7 @@ impl<'a> SerializedColumnWriter<'a> { /// Writes and serializes pages and metadata into output stream. /// /// `SerializedPageWriter` should not be used after calling `close()`. -pub struct SerializedPageWriter<'a, W> { +pub struct SerializedPageWriter<'a, W: Write> { sink: &'a mut TrackedWrite, } From e664208b79b638536ae296212223537c3cd37acb Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 19 Dec 2022 21:01:40 +0000 Subject: [PATCH 0428/1411] Add parquet ObjectStore integration (#3370) * Add parquet ObjectStore integration * Apply suggestions from code review Co-authored-by: Andrew Lamb * Add tests * Fix merge conflict Co-authored-by: Andrew Lamb --- object_store/src/lib.rs | 15 +- parquet/Cargo.toml | 3 + parquet/src/arrow/async_reader/metadata.rs | 159 ++++++++++++++++++ .../{async_reader.rs => async_reader/mod.rs} | 8 + parquet/src/arrow/async_reader/store.rs | 158 +++++++++++++++++ 5 files changed, 340 insertions(+), 3 deletions(-) create mode 100644 parquet/src/arrow/async_reader/metadata.rs rename parquet/src/arrow/{async_reader.rs => async_reader/mod.rs} (99%) create mode 100644 parquet/src/arrow/async_reader/store.rs diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 85e8737b7726..6078c1c93cdf 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -33,9 +33,18 @@ //! //! # Create an [`ObjectStore`] implementation: //! -//! * [Google Cloud Storage](https://cloud.google.com/storage/): [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder) -//! * [Amazon S3](https://aws.amazon.com/s3/): [`AmazonS3Builder`](aws::AmazonS3Builder) -//! 
* [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/):: [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder) +#![cfg_attr( + feature = "gcp", + doc = "* [Google Cloud Storage](https://cloud.google.com/storage/): [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)" +)] +#![cfg_attr( + feature = "aws", + doc = "* [Amazon S3](https://aws.amazon.com/s3/): [`AmazonS3Builder`](aws::AmazonS3Builder)" +)] +#![cfg_attr( + feature = "azure", + doc = "* [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/): [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)" +)] //! * In Memory: [`InMemory`](memory::InMemory) //! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem) //! diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index cde46b98b214..22dbc7e22cf9 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -43,6 +43,7 @@ arrow-data = { version = "29.0.0", path = "../arrow-data", default-features = fa arrow-schema = { version = "29.0.0", path = "../arrow-schema", default-features = false, optional = true } arrow-select = { version = "29.0.0", path = "../arrow-select", default-features = false, optional = true } arrow-ipc = { version = "29.0.0", path = "../arrow-ipc", default-features = false, optional = true } +object_store = { version = "0.5", path = "../object_store", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } thrift = { version = "0.17", default-features = false } @@ -96,6 +97,8 @@ test_common = ["arrow/test_utils"] experimental = [] # Enable async APIs async = ["futures", "tokio"] +# Enable object_store integration +object_store = ["dep:object_store", "async"] [[example]] name = "read_parquet" diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs new file mode 100644 index 000000000000..9c96d06502c8 --- /dev/null +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -0,0 +1,159 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::errors::{ParquetError, Result}; +use crate::file::footer::{decode_footer, decode_metadata}; +use crate::file::metadata::ParquetMetaData; +use bytes::{BufMut, Bytes, BytesMut}; +use std::future::Future; +use std::ops::Range; + +/// Fetches parquet metadata +/// +/// Parameters: +/// * fetch: an async function that can fetch byte ranges +/// * file_size: the total size of the parquet file +/// * footer_size_hint: footer prefetch size (see comments below) +/// +/// The length of the parquet footer, which contains file metadata, is not +/// known up front. 
Therefore this function will first issue a request to read +/// the last 8 bytes to determine the footer's precise length, before +/// issuing a second request to fetch the metadata bytes +/// +/// If a hint is set, this method will read the specified number of bytes +/// in the first request, instead of 8, and only issue a second request +/// if additional bytes are needed. This can therefore eliminate a +/// potentially costly additional fetch operation +pub async fn fetch_parquet_metadata( + mut fetch: F, + file_size: usize, + footer_size_hint: Option, +) -> Result +where + F: FnMut(Range) -> Fut, + Fut: Future>, +{ + if file_size < 8 { + return Err(ParquetError::EOF(format!( + "file size of {} is less than footer", + file_size + ))); + } + + // If a size hint is provided, read more than the minimum size + // to try and avoid a second fetch. + let footer_start = if let Some(size_hint) = footer_size_hint { + file_size.saturating_sub(size_hint) + } else { + file_size - 8 + }; + + let suffix = fetch(footer_start..file_size).await?; + let suffix_len = suffix.len(); + + let mut footer = [0; 8]; + footer.copy_from_slice(&suffix[suffix_len - 8..suffix_len]); + + let length = decode_footer(&footer)?; + + if file_size < length + 8 { + return Err(ParquetError::EOF(format!( + "file size of {} is less than footer + metadata {}", + file_size, + length + 8 + ))); + } + + // Did not fetch the entire file metadata in the initial read, need to make a second request + if length > suffix_len - 8 { + let metadata_start = file_size - length - 8; + let remaining_metadata = fetch(metadata_start..footer_start).await?; + + let mut metadata = BytesMut::with_capacity(length); + + metadata.put(remaining_metadata.as_ref()); + metadata.put(&suffix[..suffix_len - 8]); + + Ok(decode_metadata(metadata.as_ref())?) + } else { + let metadata_start = file_size - length - 8; + + Ok(decode_metadata( + &suffix[metadata_start - footer_start..suffix_len - 8], + )?) 
+ } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::file::reader::{FileReader, Length, SerializedFileReader}; + use crate::util::test_common::file_util::get_test_file; + use std::fs::File; + use std::io::{Read, Seek, SeekFrom}; + + fn read_range(file: &mut File, range: Range) -> Result { + file.seek(SeekFrom::Start(range.start as _))?; + let len = range.end - range.start; + let mut buf = Vec::with_capacity(len); + file.take(len as _).read_to_end(&mut buf)?; + Ok(buf.into()) + } + + #[tokio::test] + async fn test_simple() { + let mut file = get_test_file("nulls.snappy.parquet"); + let len = file.len() as usize; + + let reader = SerializedFileReader::new(file.try_clone().unwrap()).unwrap(); + let expected = reader.metadata().file_metadata().schema(); + + let mut fetch = |range| futures::future::ready(read_range(&mut file, range)); + let actual = fetch_parquet_metadata(&mut fetch, len, None).await.unwrap(); + assert_eq!(actual.file_metadata().schema(), expected); + + // Metadata hint too small + let actual = fetch_parquet_metadata(&mut fetch, len, Some(10)) + .await + .unwrap(); + assert_eq!(actual.file_metadata().schema(), expected); + + // Metadata hint too large + let actual = fetch_parquet_metadata(&mut fetch, len, Some(500)) + .await + .unwrap(); + assert_eq!(actual.file_metadata().schema(), expected); + + // Metadata hint exactly correct + let actual = fetch_parquet_metadata(&mut fetch, len, Some(428)) + .await + .unwrap(); + assert_eq!(actual.file_metadata().schema(), expected); + + let err = fetch_parquet_metadata(&mut fetch, 4, None) + .await + .unwrap_err() + .to_string(); + assert_eq!(err, "EOF: file size of 4 is less than footer"); + + let err = fetch_parquet_metadata(&mut fetch, 20, None) + .await + .unwrap_err() + .to_string(); + assert_eq!(err, "Parquet error: Invalid Parquet file. Corrupt footer"); + } +} diff --git a/parquet/src/arrow/async_reader.rs b/parquet/src/arrow/async_reader/mod.rs similarity index 99% rename from parquet/src/arrow/async_reader.rs rename to parquet/src/arrow/async_reader/mod.rs index 4285a1c17ca9..cbaa2bf6b0aa 100644 --- a/parquet/src/arrow/async_reader.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -116,6 +116,14 @@ use crate::file::FOOTER_SIZE; use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; +mod metadata; +pub use metadata::*; + +#[cfg(feature = "object_store")] +mod store; +#[cfg(feature = "object_store")] +pub use store::*; + /// The asynchronous interface used by [`ParquetRecordBatchStream`] to read parquet files pub trait AsyncFileReader: Send { /// Retrieve the bytes in `range` diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs new file mode 100644 index 000000000000..716b641cd00a --- /dev/null +++ b/parquet/src/arrow/async_reader/store.rs @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::ops::Range; +use std::sync::Arc; + +use bytes::Bytes; +use futures::future::BoxFuture; +use futures::{FutureExt, TryFutureExt}; + +use object_store::{ObjectMeta, ObjectStore}; + +use crate::arrow::async_reader::{fetch_parquet_metadata, AsyncFileReader}; +use crate::errors::{ParquetError, Result}; +use crate::file::metadata::ParquetMetaData; + +/// Implements [`AsyncFileReader`] for a parquet file in object storage +pub struct ParquetObjectReader { + store: Arc, + meta: ObjectMeta, + metadata_size_hint: Option, +} + +impl ParquetObjectReader { + /// Creates a new [`ParquetObjectReader`] for the provided [`ObjectStore`] and [`ObjectMeta`] + /// + /// [`ObjectMeta`] can be obtained using [`ObjectStore::list`] or [`ObjectStore::head`] + pub fn new(store: Arc, meta: ObjectMeta) -> Self { + Self { + store, + meta, + metadata_size_hint: None, + } + } + + /// Provide a hint as to the size of the parquet file's footer, see [fetch_parquet_metadata] + pub fn with_footer_size_hint(self, hint: usize) -> Self { + Self { + metadata_size_hint: Some(hint), + ..self + } + } +} + +impl AsyncFileReader for ParquetObjectReader { + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result> { + self.store + .get_range(&self.meta.location, range) + .map_err(|e| { + ParquetError::General(format!("AsyncChunkReader::get_bytes error: {}", e)) + }) + .boxed() + } + + fn get_byte_ranges( + &mut self, + ranges: Vec>, + ) -> BoxFuture<'_, Result>> + where + Self: Send, + { + async move { + self.store + .get_ranges(&self.meta.location, &ranges) + .await + .map_err(|e| { + ParquetError::General(format!( + "ParquetObjectReader::get_byte_ranges error: {}", + e + )) + }) + } + .boxed() + } + + fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { + Box::pin(async move { + let metadata = fetch_parquet_metadata( + |range| { + self.store + .get_range(&self.meta.location, range) + .map_err(|e| { + ParquetError::General(format!( + "ParquetObjectReader::get_metadata error: {}", + e + )) + }) + }, + self.meta.size, + self.metadata_size_hint, + ) + .await?; + Ok(Arc::new(metadata)) + }) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use futures::TryStreamExt; + + use arrow::util::test_util::parquet_test_data; + use object_store::local::LocalFileSystem; + use object_store::path::Path; + use object_store::ObjectStore; + + use crate::arrow::async_reader::ParquetObjectReader; + use crate::arrow::ParquetRecordBatchStreamBuilder; + + #[tokio::test] + async fn test_simple() { + let res = parquet_test_data(); + let store = LocalFileSystem::new_with_prefix(res).unwrap(); + + let mut meta = store + .head(&Path::from("alltypes_plain.parquet")) + .await + .unwrap(); + + let store = Arc::new(store) as Arc; + let object_reader = ParquetObjectReader::new(Arc::clone(&store), meta.clone()); + let builder = ParquetRecordBatchStreamBuilder::new(object_reader) + .await + .unwrap(); + let batches: Vec<_> = builder.build().unwrap().try_collect().await.unwrap(); + + assert_eq!(batches.len(), 1); + assert_eq!(batches[0].num_rows(), 8); + + meta.location = Path::from("I don't exist.parquet"); + + let object_reader = ParquetObjectReader::new(store, meta); + // Cannot use unwrap_err as ParquetRecordBatchStreamBuilder: !Debug + match ParquetRecordBatchStreamBuilder::new(object_reader).await { + Ok(_) => panic!("expected failure"), + Err(e) => { + let err = e.to_string(); + assert!(err.contains("Parquet error: 
ParquetObjectReader::get_metadata error: Object at location") && err.contains("not found: No such file or directory (os error 2)"), "{}", err); + } + } + } +} From 8cab7a2b446a916bb4b6f3152ddecd4b8b5dd61a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 19 Dec 2022 21:02:08 +0000 Subject: [PATCH 0429/1411] Add CSV build_buffered (#3338) (#3368) * Add CSV build_buffered (#3338) * Doc tweaks --- arrow-csv/src/reader/mod.rs | 42 ++++++++++++++++++++++--------------- arrow/benches/csv_reader.rs | 2 +- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 877876b77c9a..bc6b016ec9cf 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -47,7 +47,7 @@ use regex::{Regex, RegexSet}; use std::collections::HashSet; use std::fmt; use std::fs::File; -use std::io::{BufReader, Read, Seek, SeekFrom}; +use std::io::{BufRead, BufReader as StdBufReader, Read, Seek, SeekFrom}; use std::sync::Arc; use arrow_array::builder::Decimal128Builder; @@ -325,14 +325,17 @@ pub fn infer_schema_from_files( // optional bounds of the reader, of the form (min line, max line). type Bounds = Option<(usize, usize)>; +/// CSV file reader using [`std::io::BufReader`] +pub type Reader = BufReader>; + /// CSV file reader -pub struct Reader { +pub struct BufReader { /// Explicit schema for the CSV file schema: SchemaRef, /// Optional projection for which columns to load (zero-based column indices) projection: Option>, /// File reader - reader: RecordReader>, + reader: RecordReader, /// Rows to skip to_skip: usize, /// Current line number @@ -347,9 +350,9 @@ pub struct Reader { datetime_format: Option, } -impl fmt::Debug for Reader +impl fmt::Debug for BufReader where - R: Read, + R: BufRead, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Reader") @@ -394,7 +397,7 @@ impl Reader { if let Some(format) = datetime_format { builder = builder.with_datetime_format(format) } - builder.build_with_schema(reader, schema) + builder.build_with_schema(StdBufReader::new(reader), schema) } /// Returns the schema of the reader, useful for getting the schema without reading @@ -441,7 +444,7 @@ impl Reader { } } -impl Iterator for Reader { +impl Iterator for BufReader { type Item = Result; fn next(&mut self) -> Option { @@ -1040,11 +1043,19 @@ impl ReaderBuilder { self } - /// Create a new `Reader` from the `ReaderBuilder` - pub fn build( + /// Create a new `Reader` from a non-buffered reader + /// + /// If `R: BufRead` consider using [`Self::build_buffered`] to avoid unnecessary additional + /// buffering, as internally this method wraps `reader` in [`std::io::BufReader`] + pub fn build(self, reader: R) -> Result, ArrowError> { + self.build_buffered(StdBufReader::new(reader)) + } + + /// Create a new `BufReader` from a buffered reader + pub fn build_buffered( mut self, mut reader: R, - ) -> Result, ArrowError> { + ) -> Result, ArrowError> { // check if schema should be inferred let delimiter = self.delimiter.unwrap_or(b','); let schema = match self.schema.take() { @@ -1068,7 +1079,7 @@ impl ReaderBuilder { Ok(self.build_with_schema(reader, schema)) } - fn build_with_schema(self, reader: R, schema: SchemaRef) -> Reader { + fn build_with_schema(self, reader: R, schema: SchemaRef) -> BufReader { let mut reader_builder = csv_core::ReaderBuilder::new(); reader_builder.escape(self.escape); @@ -1081,11 +1092,8 @@ impl ReaderBuilder { if let Some(t) = self.terminator { 
reader_builder.terminator(csv_core::Terminator::Any(t)); } - let reader = RecordReader::new( - BufReader::new(reader), - reader_builder.build(), - schema.fields().len(), - ); + let delimiter = reader_builder.build(); + let reader = RecordReader::new(reader, delimiter, schema.fields().len()); let header = self.has_header as usize; @@ -1094,7 +1102,7 @@ impl ReaderBuilder { None => (header, usize::MAX), }; - Reader { + BufReader { schema, projection: self.projection, reader, diff --git a/arrow/benches/csv_reader.rs b/arrow/benches/csv_reader.rs index f6353fb851f5..02c8ca2d2993 100644 --- a/arrow/benches/csv_reader.rs +++ b/arrow/benches/csv_reader.rs @@ -44,7 +44,7 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { .with_schema(batch.schema()) .with_batch_size(batch_size) .has_header(true) - .build(cursor) + .build_buffered(cursor) .unwrap(); for next in reader { From 0f196b8dad7592ae139d17c4a8aa960b0e8731fa Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 19 Dec 2022 21:02:34 +0000 Subject: [PATCH 0430/1411] Use custom Any instead of prost_types (#3360) * Use custom Any instead of prost_types * Remove unnecesary path prefix --- arrow-flight/Cargo.toml | 5 +- arrow-flight/examples/flight_sql_server.rs | 11 ++-- arrow-flight/src/sql/client.rs | 22 +++---- arrow-flight/src/sql/mod.rs | 67 +++++++++++++--------- arrow-flight/src/sql/server.rs | 26 ++++----- 5 files changed, 73 insertions(+), 58 deletions(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 238e03f3c61f..847d77ca58de 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -35,14 +35,13 @@ base64 = { version = "0.20", default-features = false, features = ["std"] } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } prost = { version = "0.11", default-features = false } -prost-types = { version = "0.11.0", default-features = false, optional = true } prost-derive = { version = "0.11", default-features = false } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } -futures = { version = "0.3", default-features = false, features = ["alloc"]} +futures = { version = "0.3", default-features = false, features = ["alloc"] } [features] default = [] -flight-sql-experimental = ["prost-types"] +flight-sql-experimental = [] [dev-dependencies] arrow = { version = "29.0.0", path = "../arrow", features = ["prettyprint"] } diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 29e6c2c37d68..5adb5d59a0ed 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -17,13 +17,14 @@ use arrow_array::builder::StringBuilder; use arrow_array::{ArrayRef, RecordBatch}; -use arrow_flight::sql::{ActionCreatePreparedStatementResult, ProstMessageExt, SqlInfo}; +use arrow_flight::sql::{ + ActionCreatePreparedStatementResult, Any, ProstMessageExt, SqlInfo, +}; use arrow_flight::{ Action, FlightData, FlightEndpoint, HandshakeRequest, HandshakeResponse, IpcMessage, Location, SchemaAsIpc, Ticket, }; use futures::{stream, Stream}; -use prost_types::Any; use std::fs; use std::pin::Pin; use std::sync::Arc; @@ -124,7 +125,7 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_get_fallback( &self, _request: Request, - _message: prost_types::Any, + _message: Any, ) -> Result::DoGetStream>, Status> { let batch = 
Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; @@ -474,9 +475,9 @@ impl ProstMessageExt for FetchResults { } fn as_any(&self) -> Any { - prost_types::Any { + Any { type_url: FetchResults::type_url().to_string(), - value: ::prost::Message::encode_to_vec(self), + value: ::prost::Message::encode_to_vec(self).into(), } } } diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index fa6691793a17..74039027e7cb 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -23,11 +23,12 @@ use crate::flight_service_client::FlightServiceClient; use crate::sql::server::{CLOSE_PREPARED_STATEMENT, CREATE_PREPARED_STATEMENT}; use crate::sql::{ ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, - ActionCreatePreparedStatementResult, CommandGetCatalogs, CommandGetCrossReference, - CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, - CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, - CommandPreparedStatementQuery, CommandStatementQuery, CommandStatementUpdate, - DoPutUpdateResult, ProstAnyExt, ProstMessageExt, SqlInfo, + ActionCreatePreparedStatementResult, Any, CommandGetCatalogs, + CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, + CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, + CommandGetTableTypes, CommandGetTables, CommandPreparedStatementQuery, + CommandStatementQuery, CommandStatementUpdate, DoPutUpdateResult, ProstMessageExt, + SqlInfo, }; use crate::{ Action, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, @@ -177,8 +178,8 @@ impl FlightSqlServiceClient { .await .map_err(status_to_arrow_error)? .unwrap(); - let any: prost_types::Any = prost::Message::decode(&*result.app_metadata) - .map_err(decode_error_to_arrow_error)?; + let any = + Any::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; let result: DoPutUpdateResult = any.unpack()?.unwrap(); Ok(result.record_count) } @@ -298,8 +299,7 @@ impl FlightSqlServiceClient { .await .map_err(status_to_arrow_error)? .unwrap(); - let any: prost_types::Any = - prost::Message::decode(&*result.body).map_err(decode_error_to_arrow_error)?; + let any = Any::decode(&*result.body).map_err(decode_error_to_arrow_error)?; let prepared_result: ActionCreatePreparedStatementResult = any.unpack()?.unwrap(); let dataset_schema = match prepared_result.dataset_schema.len() { 0 => Schema::empty(), @@ -384,8 +384,8 @@ impl PreparedStatement { .await .map_err(status_to_arrow_error)? .unwrap(); - let any: prost_types::Any = Message::decode(&*result.app_metadata) - .map_err(decode_error_to_arrow_error)?; + let any = + Any::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; let result: DoPutUpdateResult = any.unpack()?.unwrap(); Ok(result.record_count) } diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index 0ddc64c554d8..88dc6cde9800 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -16,6 +16,7 @@ // under the License. use arrow_schema::ArrowError; +use bytes::Bytes; use prost::Message; mod gen { @@ -66,8 +67,8 @@ pub trait ProstMessageExt: prost::Message + Default { /// type_url for this Message fn type_url() -> &'static str; - /// Convert this Message to prost_types::Any - fn as_any(&self) -> prost_types::Any; + /// Convert this Message to [`Any`] + fn as_any(&self) -> Any; } macro_rules! prost_message_ext { @@ -78,10 +79,10 @@ macro_rules! 
prost_message_ext { concat!("type.googleapis.com/arrow.flight.protocol.sql.", stringify!($name)) } - fn as_any(&self) -> prost_types::Any { - prost_types::Any { + fn as_any(&self) -> Any { + Any { type_url: <$name>::type_url().to_string(), - value: self.encode_to_vec(), + value: self.encode_to_vec().into(), } } } @@ -111,30 +112,44 @@ prost_message_ext!( TicketStatementQuery, ); -/// ProstAnyExt are useful utility methods for prost_types::Any -/// The API design is inspired by [rust-protobuf](https://github.com/stepancheg/rust-protobuf/blob/master/protobuf/src/well_known_types_util/any.rs) -pub trait ProstAnyExt { - /// Check if `Any` contains a message of given type. - fn is(&self) -> bool; - - /// Extract a message from this `Any`. - /// - /// # Returns - /// - /// * `Ok(None)` when message type mismatch - /// * `Err` when parse failed - fn unpack(&self) -> Result, ArrowError>; - - /// Pack any message into `prost_types::Any` value. - fn pack(message: &M) -> Result; +/// An implementation of the protobuf [`Any`] message type +/// +/// Encoded protobuf messages are not self-describing, nor contain any information +/// on the schema of the encoded payload. Consequently to decode a protobuf a client +/// must know the exact schema of the message. +/// +/// This presents a problem for loosely typed APIs, where the exact message payloads +/// are not enumerable, and therefore cannot be enumerated as variants in a [oneof]. +/// +/// One solution is [`Any`] where the encoded payload is paired with a `type_url` +/// identifying the type of encoded message, and the resulting combination encoded. +/// +/// Clients can then decode the outer [`Any`], inspect the `type_url` and if it is +/// a type they recognise, proceed to decode the embedded message `value` +/// +/// [`Any`]: https://developers.google.com/protocol-buffers/docs/proto3#any +/// [oneof]: https://developers.google.com/protocol-buffers/docs/proto3#oneof +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Any { + /// A URL/resource name that uniquely identifies the type of the serialized + /// protocol buffer message. This string must contain at least + /// one "/" character. The last segment of the URL's path must represent + /// the fully qualified name of the type (as in + /// `path/google.protobuf.Duration`). The name should be in a canonical form + /// (e.g., leading "." is not accepted). + #[prost(string, tag = "1")] + pub type_url: String, + /// Must be a valid serialized protocol buffer of the above specified type. 
+ #[prost(bytes = "bytes", tag = "2")] + pub value: Bytes, } -impl ProstAnyExt for prost_types::Any { - fn is(&self) -> bool { +impl Any { + pub fn is(&self) -> bool { M::type_url() == self.type_url } - fn unpack(&self) -> Result, ArrowError> { + pub fn unpack(&self) -> Result, ArrowError> { if !self.is::() { return Ok(None); } @@ -144,7 +159,7 @@ impl ProstAnyExt for prost_types::Any { Ok(Some(m)) } - fn pack(message: &M) -> Result { + pub fn pack(message: &M) -> Result { Ok(message.as_any()) } } @@ -170,7 +185,7 @@ mod tests { let query = CommandStatementQuery { query: "select 1".to_string(), }; - let any = prost_types::Any::pack(&query).unwrap(); + let any = Any::pack(&query).unwrap(); assert!(any.is::()); let unpack_query: CommandStatementQuery = any.unpack().unwrap().unwrap(); assert_eq!(query, unpack_query); diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index ec48d7cfed31..fdf9c9133b90 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -17,6 +17,7 @@ use std::pin::Pin; +use crate::sql::Any; use futures::Stream; use prost::Message; use tonic::{Request, Response, Status, Streaming}; @@ -32,7 +33,7 @@ use super::{ CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, - CommandStatementUpdate, DoPutUpdateResult, ProstAnyExt, ProstMessageExt, SqlInfo, + CommandStatementUpdate, DoPutUpdateResult, ProstMessageExt, SqlInfo, TicketStatementQuery, }; @@ -63,7 +64,7 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { async fn do_get_fallback( &self, _request: Request, - message: prost_types::Any, + message: Any, ) -> Result::DoGetStream>, Status> { Err(Status::unimplemented(format!( "do_get: The defined request is invalid: {}", @@ -311,8 +312,8 @@ where &self, request: Request, ) -> Result, Status> { - let message: prost_types::Any = - Message::decode(&*request.get_ref().cmd).map_err(decode_error_to_status)?; + let message = + Any::decode(&*request.get_ref().cmd).map_err(decode_error_to_status)?; if message.is::() { let token = message @@ -411,10 +412,10 @@ where &self, request: Request, ) -> Result, Status> { - let msg: prost_types::Any = Message::decode(&*request.get_ref().ticket) + let msg: Any = Message::decode(&*request.get_ref().ticket) .map_err(decode_error_to_status)?; - fn unpack(msg: prost_types::Any) -> Result { + fn unpack(msg: Any) -> Result { msg.unpack() .map_err(arrow_error_to_status)? 
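
To see the switch from `prost_types::Any` to the crate's own `Any` end to end, here is a minimal round-trip sketch (mirroring the unit test updated in this hunk), assuming the `flight-sql-experimental` feature of `arrow-flight` is enabled:

use arrow_flight::sql::{Any, CommandStatementQuery, ProstMessageExt};
use prost::Message;

fn any_round_trip() {
    let cmd = CommandStatementQuery {
        query: "select 1".to_string(),
    };

    // Pack the strongly typed command into the loosely typed wrapper and encode it for the wire
    let encoded = cmd.as_any().encode_to_vec();

    // A receiver decodes the outer Any, checks the type_url with is::<T>(), then unpacks the value
    let any = Any::decode(encoded.as_slice()).unwrap();
    assert!(any.is::<CommandStatementQuery>());
    let unpacked: CommandStatementQuery = any.unpack().unwrap().unwrap();
    assert_eq!(unpacked.query, "select 1");
}
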
.ok_or_else(|| Status::internal("Expected a command, but found none.")) @@ -462,9 +463,8 @@ where mut request: Request>, ) -> Result, Status> { let cmd = request.get_mut().message().await?.unwrap(); - let message: prost_types::Any = - Message::decode(&*cmd.flight_descriptor.unwrap().cmd) - .map_err(decode_error_to_status)?; + let message = Any::decode(&*cmd.flight_descriptor.unwrap().cmd) + .map_err(decode_error_to_status)?; if message.is::() { let token = message .unpack() @@ -536,8 +536,8 @@ where request: Request, ) -> Result, Status> { if request.get_ref().r#type == CREATE_PREPARED_STATEMENT { - let any: prost_types::Any = Message::decode(&*request.get_ref().body) - .map_err(decode_error_to_status)?; + let any = + Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionCreatePreparedStatementRequest = any .unpack() @@ -556,8 +556,8 @@ where return Ok(Response::new(Box::pin(output))); } if request.get_ref().r#type == CLOSE_PREPARED_STATEMENT { - let any: prost_types::Any = Message::decode(&*request.get_ref().body) - .map_err(decode_error_to_status)?; + let any = + Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionClosePreparedStatementRequest = any .unpack() From f521e11dc2da0313cb1a16958d39a67a41a25fa6 Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Wed, 21 Dec 2022 01:01:12 +0800 Subject: [PATCH 0431/1411] feat: add append_key_value_metadata (#3367) * Add update_key_value_metadata * Add comments * Address review * fix clippy * Update parquet/src/arrow/arrow_writer/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Fix reviews * Test and fix Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies --- parquet/src/arrow/arrow_writer/mod.rs | 9 ++- parquet/src/file/writer.rs | 80 ++++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 53ca71d28077..5cf33d125484 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -32,7 +32,7 @@ use super::schema::{ use crate::arrow::arrow_writer::byte_array::ByteArrayWriter; use crate::column::writer::{ColumnWriter, ColumnWriterImpl}; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::RowGroupMetaDataPtr; +use crate::file::metadata::{KeyValue, RowGroupMetaDataPtr}; use crate::file::properties::WriterProperties; use crate::file::writer::SerializedRowGroupWriter; use crate::{data_type::*, file::writer::SerializedFileWriter}; @@ -158,6 +158,13 @@ impl ArrowWriter { self.flush_rows(self.buffered_rows) } + /// Additional [`KeyValue`] metadata to be written in addition to those from [`WriterProperties`] + /// + /// This method provide a way to append kv_metadata after write RecordBatch + pub fn append_key_value_metadata(&mut self, kv_metadata: KeyValue) { + self.writer.append_key_value_metadata(kv_metadata) + } + /// Flushes `num_rows` from the buffer into a new row group fn flush_rows(&mut self, num_rows: usize) -> Result<()> { if num_rows == 0 { diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index a12d5477c0e9..2d879be806c4 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -142,6 +142,8 @@ pub struct SerializedFileWriter { column_indexes: Vec>>, offset_indexes: Vec>>, row_group_index: usize, + // kv_metadatas will be appended to `props` when `write_metadata` + 
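
The `append_key_value_metadata` API added above is for metadata that is only known after the batches have been written (row counts, checksums and the like). A minimal sketch, assuming the `arrow` facade crate alongside `parquet`; the schema, column and key below are illustrative only:

use std::sync::Arc;
use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use parquet::arrow::ArrowWriter;
use parquet::file::metadata::KeyValue;

fn write_with_late_metadata() {
    let schema = Arc::new(Schema::new(vec![Field::new("col1", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )
    .unwrap();

    let mut buffer = Vec::new();
    let mut writer = ArrowWriter::try_new(&mut buffer, schema, None).unwrap();
    writer.write(&batch).unwrap();

    // Entries appended here are merged with any key/value metadata already set on
    // WriterProperties when close() writes the footer
    writer.append_key_value_metadata(KeyValue::new("row_count".to_string(), "3".to_string()));
    writer.close().unwrap();
}
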
kv_metadatas: Vec, } impl SerializedFileWriter { @@ -159,6 +161,7 @@ impl SerializedFileWriter { column_indexes: Vec::new(), offset_indexes: Vec::new(), row_group_index: 0, + kv_metadatas: Vec::new(), }) } @@ -309,12 +312,18 @@ impl SerializedFileWriter { self.write_column_indexes(&mut row_groups)?; self.write_offset_indexes(&mut row_groups)?; + let key_value_metadata = match self.props.key_value_metadata() { + Some(kv) => Some(kv.iter().chain(&self.kv_metadatas).cloned().collect()), + None if self.kv_metadatas.is_empty() => None, + None => Some(self.kv_metadatas.clone()), + }; + let file_metadata = parquet::FileMetaData { num_rows, row_groups, + key_value_metadata, version: self.props.writer_version().as_num(), schema: types::to_thrift(self.schema.as_ref())?, - key_value_metadata: self.props.key_value_metadata().cloned(), created_by: Some(self.props.created_by().to_owned()), column_orders: None, encryption_algorithm: None, @@ -347,6 +356,10 @@ impl SerializedFileWriter { } } + pub fn append_key_value_metadata(&mut self, kv_metadata: KeyValue) { + self.kv_metadatas.push(kv_metadata); + } + /// Writes the file footer and returns the underlying writer. pub fn into_inner(mut self) -> Result { self.assert_previous_writer_closed()?; @@ -1355,4 +1368,69 @@ mod tests { }) }); } + + fn test_kv_metadata( + initial_kv: Option>, + final_kv: Option>, + ) { + let schema = Arc::new( + types::Type::group_type_builder("schema") + .with_fields(&mut vec![Arc::new( + types::Type::primitive_type_builder("col1", Type::INT32) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + )]) + .build() + .unwrap(), + ); + let mut out = Vec::with_capacity(1024); + let props = Arc::new( + WriterProperties::builder() + .set_key_value_metadata(initial_kv.clone()) + .build(), + ); + let mut writer = SerializedFileWriter::new(&mut out, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + let column = row_group_writer.next_column().unwrap().unwrap(); + column.close().unwrap(); + row_group_writer.close().unwrap(); + if let Some(kvs) = &final_kv { + for kv in kvs { + writer.append_key_value_metadata(kv.clone()) + } + } + writer.close().unwrap(); + + let reader = SerializedFileReader::new(Bytes::from(out)).unwrap(); + let metadata = reader.metadata().file_metadata(); + let keys = metadata.key_value_metadata(); + + match (initial_kv, final_kv) { + (Some(a), Some(b)) => { + let keys = keys.unwrap(); + assert_eq!(keys.len(), a.len() + b.len()); + assert_eq!(&keys[..a.len()], a.as_slice()); + assert_eq!(&keys[a.len()..], b.as_slice()); + } + (Some(v), None) => assert_eq!(keys.unwrap(), &v), + (None, Some(v)) if !v.is_empty() => assert_eq!(keys.unwrap(), &v), + _ => assert!(keys.is_none()), + } + } + + #[test] + fn test_append_metadata() { + let kv1 = KeyValue::new("cupcakes".to_string(), "awesome".to_string()); + let kv2 = KeyValue::new("bingo".to_string(), "bongo".to_string()); + + test_kv_metadata(None, None); + test_kv_metadata(Some(vec![kv1.clone()]), None); + test_kv_metadata(None, Some(vec![kv2.clone()])); + test_kv_metadata(Some(vec![kv1.clone()]), Some(vec![kv2.clone()])); + test_kv_metadata(Some(vec![]), Some(vec![kv2])); + test_kv_metadata(Some(vec![]), Some(vec![])); + test_kv_metadata(Some(vec![kv1]), Some(vec![])); + test_kv_metadata(None, Some(vec![])); + } } From 8b84d4d594476f61009676f9ab0cb7fb9295cc64 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Dec 2022 17:02:14 +0000 Subject: [PATCH 0432/1411] Use 
bytes in arrow-flight (#3359) * Use bytes in arrow-flight * Integration test fixes --- .github/workflows/arrow_flight.yml | 2 +- arrow-flight/build.rs | 10 +++- arrow-flight/examples/flight_sql_server.rs | 24 ++++----- arrow-flight/src/arrow.flight.protocol.rs | 52 +++++++++---------- arrow-flight/src/lib.rs | 29 ++++++----- .../src/sql/arrow.flight.protocol.sql.rs | 28 +++++----- arrow-flight/src/sql/client.rs | 15 +++--- arrow-flight/src/sql/server.rs | 6 +-- arrow-flight/src/utils.rs | 3 +- .../auth_basic_proto.rs | 6 +-- .../integration_test.rs | 2 +- .../src/flight_client_scenarios/middleware.rs | 5 +- .../src/flight_server_scenarios.rs | 2 +- .../auth_basic_proto.rs | 4 +- .../integration_test.rs | 4 +- .../src/flight_server_scenarios/middleware.rs | 3 +- 16 files changed, 101 insertions(+), 94 deletions(-) diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index f12eb4d8beb8..fb3e9f577d5a 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -71,4 +71,4 @@ jobs: - name: Setup Clippy run: rustup component add clippy - name: Run clippy - run: cargo clippy -p arrow-flight --all-features -- -D warnings + run: cargo clippy -p arrow-flight --all-targets --all-features -- -D warnings diff --git a/arrow-flight/build.rs b/arrow-flight/build.rs index bc20100ab37f..3f50fa81279f 100644 --- a/arrow-flight/build.rs +++ b/arrow-flight/build.rs @@ -36,7 +36,7 @@ fn main() -> Result<(), Box> { // protoc in unbuntu builder needs this option .protoc_arg("--experimental_allow_proto3_optional") .out_dir("src") - .compile(&[proto_path], &[proto_dir])?; + .compile_with_config(prost_config(), &[proto_path], &[proto_dir])?; // read file contents to string let mut file = OpenOptions::new() @@ -67,7 +67,7 @@ fn main() -> Result<(), Box> { // protoc in ubuntu builder needs this option .protoc_arg("--experimental_allow_proto3_optional") .out_dir("src/sql") - .compile(&[proto_path], &[proto_dir])?; + .compile_with_config(prost_config(), &[proto_path], &[proto_dir])?; // read file contents to string let mut file = OpenOptions::new() @@ -94,3 +94,9 @@ fn main() -> Result<(), Box> { // As the proto file is checked in, the build should not fail if the file is not found Ok(()) } + +fn prost_config() -> prost_build::Config { + let mut config = prost_build::Config::new(); + config.bytes([".arrow"]); + config +} diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 5adb5d59a0ed..0d06aa664ec8 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -25,17 +25,13 @@ use arrow_flight::{ Location, SchemaAsIpc, Ticket, }; use futures::{stream, Stream}; -use std::fs; +use prost::Message; use std::pin::Pin; use std::sync::Arc; -use tempfile::NamedTempFile; -use tokio::net::{UnixListener, UnixStream}; -use tokio_stream::wrappers::UnixListenerStream; -use tonic::transport::{Endpoint, Server}; +use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; use arrow_flight::flight_descriptor::DescriptorType; -use arrow_flight::sql::client::FlightSqlServiceClient; use arrow_flight::utils::batches_to_flight_data; use arrow_flight::{ flight_service_server::FlightService, @@ -88,7 +84,7 @@ impl FlightSqlService for FlightSqlServiceImpl { let authorization = request .metadata() .get("authorization") - .ok_or(Status::invalid_argument("authorization field not present"))? 
+ .ok_or_else(|| Status::invalid_argument("authorization field not present"))? .to_str() .map_err(|e| status!("authorization not parsable", e))?; if !authorization.starts_with(basic) { @@ -102,7 +98,7 @@ impl FlightSqlService for FlightSqlServiceImpl { .map_err(|e| status!("authorization not decodable", e))?; let str = String::from_utf8(bytes) .map_err(|e| status!("authorization not parsable", e))?; - let parts: Vec<_> = str.split(":").collect(); + let parts: Vec<_> = str.split(':').collect(); let (user, pass) = match parts.as_slice() { [user, pass] => (user, pass), _ => Err(Status::invalid_argument( @@ -115,7 +111,7 @@ impl FlightSqlService for FlightSqlServiceImpl { let result = HandshakeResponse { protocol_version: 0, - payload: "random_uuid_token".as_bytes().to_vec(), + payload: "random_uuid_token".into(), }; let result = Ok(result); let output = futures::stream::iter(vec![result]); @@ -157,7 +153,7 @@ impl FlightSqlService for FlightSqlServiceImpl { cmd: CommandPreparedStatementQuery, _request: Request, ) -> Result, Status> { - let handle = String::from_utf8(cmd.prepared_statement_handle) + let handle = std::str::from_utf8(&cmd.prepared_statement_handle) .map_err(|e| status!("Unable to parse handle", e))?; let batch = Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; @@ -170,7 +166,7 @@ impl FlightSqlService for FlightSqlServiceImpl { let fetch = FetchResults { handle: handle.to_string(), }; - let buf = ::prost::Message::encode_to_vec(&fetch.as_any()); + let buf = fetch.as_any().encode_to_vec().into(); let ticket = Ticket { ticket: buf }; let endpoint = FlightEndpoint { ticket: Some(ticket), @@ -185,7 +181,7 @@ impl FlightSqlService for FlightSqlServiceImpl { let flight_desc = FlightDescriptor { r#type: DescriptorType::Cmd.into(), - cmd: vec![], + cmd: Default::default(), path: vec![], }; let info = FlightInfo { @@ -431,9 +427,9 @@ impl FlightSqlService for FlightSqlServiceImpl { .map_err(|e| status!("Unable to serialize schema", e))?; let IpcMessage(schema_bytes) = message; let res = ActionCreatePreparedStatementResult { - prepared_statement_handle: handle.as_bytes().to_vec(), + prepared_statement_handle: handle.into(), dataset_schema: schema_bytes, - parameter_schema: vec![], // TODO: parameters + parameter_schema: Default::default(), // TODO: parameters }; Ok(res) } diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs index dc0c4609b5a3..a61c83d0c146 100644 --- a/arrow-flight/src/arrow.flight.protocol.rs +++ b/arrow-flight/src/arrow.flight.protocol.rs @@ -11,8 +11,8 @@ pub struct HandshakeRequest { pub protocol_version: u64, /// /// Arbitrary auth/handshake info. - #[prost(bytes = "vec", tag = "2")] - pub payload: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "2")] + pub payload: ::prost::bytes::Bytes, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] @@ -23,8 +23,8 @@ pub struct HandshakeResponse { pub protocol_version: u64, /// /// Arbitrary auth/handshake info. - #[prost(bytes = "vec", tag = "2")] - pub payload: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "2")] + pub payload: ::prost::bytes::Bytes, } /// /// A message for doing simple auth. 
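
The handshake above expects a standard HTTP Basic credential in the `authorization` metadata key. A small client-side sketch of building that header with the same `base64` crate this patch pins; the credentials are placeholders, not values from the patch:

fn basic_auth_header(username: &str, password: &str) -> String {
    // base64 0.20 still provides the top-level encode helper used elsewhere in this patch
    let token = base64::encode(format!("{}:{}", username, password));
    format!("Basic {}", token)
}

// For example, attached to a tonic request before calling handshake:
// req.metadata_mut().insert("authorization", basic_auth_header("user", "pass").parse().unwrap());
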
@@ -56,8 +56,8 @@ pub struct ActionType { #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct Criteria { - #[prost(bytes = "vec", tag = "1")] - pub expression: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "1")] + pub expression: ::prost::bytes::Bytes, } /// /// An opaque action specific for the service. @@ -66,16 +66,16 @@ pub struct Criteria { pub struct Action { #[prost(string, tag = "1")] pub r#type: ::prost::alloc::string::String, - #[prost(bytes = "vec", tag = "2")] - pub body: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "2")] + pub body: ::prost::bytes::Bytes, } /// /// An opaque result returned after executing an action. #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct Result { - #[prost(bytes = "vec", tag = "1")] - pub body: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "1")] + pub body: ::prost::bytes::Bytes, } /// /// Wrap the result of a getSchema call @@ -86,8 +86,8 @@ pub struct SchemaResult { /// 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix /// 4 bytes - the byte length of the payload /// a flatbuffer Message whose header is the Schema - #[prost(bytes = "vec", tag = "1")] - pub schema: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "1")] + pub schema: ::prost::bytes::Bytes, } /// /// The name or tag for a Flight. May be used as a way to retrieve or generate @@ -100,8 +100,8 @@ pub struct FlightDescriptor { /// /// Opaque value used to express a command. Should only be defined when /// type = CMD. - #[prost(bytes = "vec", tag = "2")] - pub cmd: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "2")] + pub cmd: ::prost::bytes::Bytes, /// /// List of strings identifying a particular dataset. Should only be defined /// when type = PATH. @@ -160,8 +160,8 @@ pub struct FlightInfo { /// 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix /// 4 bytes - the byte length of the payload /// a flatbuffer Message whose header is the Schema - #[prost(bytes = "vec", tag = "1")] - pub schema: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "1")] + pub schema: ::prost::bytes::Bytes, /// /// The descriptor associated with this info. #[prost(message, optional, tag = "2")] @@ -229,8 +229,8 @@ pub struct Location { #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct Ticket { - #[prost(bytes = "vec", tag = "1")] - pub ticket: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "1")] + pub ticket: ::prost::bytes::Bytes, } /// /// A batch of Arrow data as part of a stream of batches. @@ -244,27 +244,27 @@ pub struct FlightData { pub flight_descriptor: ::core::option::Option, /// /// Header for message data as described in Message.fbs::Message. - #[prost(bytes = "vec", tag = "2")] - pub data_header: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "2")] + pub data_header: ::prost::bytes::Bytes, /// /// Application-defined metadata. - #[prost(bytes = "vec", tag = "3")] - pub app_metadata: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "3")] + pub app_metadata: ::prost::bytes::Bytes, /// /// The actual batch of Arrow data. Preferably handled with minimal-copies /// coming last in the definition to help with sidecar patterns (it is /// expected that some implementations will fetch this field off the wire /// with specialized code to avoid extra memory copies). 
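
With the generated fields switched to `prost::bytes::Bytes`, the opaque payloads above (tickets, commands, action bodies) can be built from static data or shared buffers without copying, and cloning a message only bumps a reference count. A minimal sketch, with illustrative payload values:

use arrow_flight::{Action, FlightDescriptor, Ticket};
use bytes::Bytes;

fn build_messages() {
    // Bytes::from_static keeps pointing at the literal, no allocation or copy
    let ticket = Ticket {
        ticket: Bytes::from_static(b"ticket-1"),
    };

    // new_cmd now takes impl Into<Bytes>, so Vec<u8>, &'static [u8] or Bytes all work
    let descriptor = FlightDescriptor::new_cmd(b"SELECT 1".to_vec());

    let action = Action {
        r#type: "HealthCheck".to_string(),
        body: Bytes::new(),
    };

    // Cloning is cheap: the underlying buffers are shared rather than duplicated
    let _ticket_again = ticket.clone();
    let _ = (descriptor, action);
}
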
- #[prost(bytes = "vec", tag = "1000")] - pub data_body: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "1000")] + pub data_body: ::prost::bytes::Bytes, } /// * /// The response message associated with the submission of a DoPut. #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct PutResult { - #[prost(bytes = "vec", tag = "1")] - pub app_metadata: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "1")] + pub app_metadata: ::prost::bytes::Bytes, } /// Generated client implementations. pub mod flight_service_client { diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index e742dbbe1a72..53ea5d4633e4 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -21,6 +21,7 @@ use arrow_ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; use arrow_schema::{ArrowError, Schema}; use arrow_ipc::convert::try_schema_from_ipc_buffer; +use bytes::Bytes; use std::{ convert::{TryFrom, TryInto}, fmt, @@ -83,7 +84,7 @@ pub struct SchemaAsIpc<'a> { /// IpcMessage represents a `Schema` in the format expected in /// `FlightInfo.schema` #[derive(Debug)] -pub struct IpcMessage(pub Vec); +pub struct IpcMessage(pub Bytes); // Useful conversion functions @@ -97,7 +98,7 @@ fn flight_schema_as_encoded_data( fn flight_schema_as_flatbuffer(schema: &Schema, options: &IpcWriteOptions) -> IpcMessage { let encoded_data = flight_schema_as_encoded_data(schema, options); - IpcMessage(encoded_data.ipc_message) + IpcMessage(encoded_data.ipc_message.into()) } // Implement a bunch of useful traits for various conversions, displays, @@ -106,7 +107,7 @@ fn flight_schema_as_flatbuffer(schema: &Schema, options: &IpcWriteOptions) -> Ip // Deref impl Deref for IpcMessage { - type Target = Vec; + type Target = [u8]; fn deref(&self) -> &Self::Target { &self.0 @@ -239,8 +240,8 @@ impl fmt::Display for Ticket { impl From for FlightData { fn from(data: EncodedData) -> Self { FlightData { - data_header: data.ipc_message, - data_body: data.arrow_data, + data_header: data.ipc_message.into(), + data_body: data.arrow_data.into(), ..Default::default() } } @@ -294,7 +295,7 @@ fn schema_to_ipc_format(schema_ipc: SchemaAsIpc) -> ArrowResult { let mut schema = vec![]; writer::write_message(&mut schema, encoded_data, pair.1)?; - Ok(IpcMessage(schema)) + Ok(IpcMessage(schema.into())) } impl TryFrom<&FlightData> for Schema { @@ -322,14 +323,14 @@ impl TryFrom for Schema { type Error = ArrowError; fn try_from(value: IpcMessage) -> ArrowResult { - try_schema_from_ipc_buffer(value.0.as_slice()) + try_schema_from_ipc_buffer(&value) } } impl TryFrom<&SchemaResult> for Schema { type Error = ArrowError; fn try_from(data: &SchemaResult) -> ArrowResult { - try_schema_from_ipc_buffer(data.schema.as_slice()) + try_schema_from_ipc_buffer(&data.schema) } } @@ -339,24 +340,24 @@ impl FlightData { pub fn new( flight_descriptor: Option, message: IpcMessage, - app_metadata: Vec, - data_body: Vec, + app_metadata: impl Into, + data_body: impl Into, ) -> Self { let IpcMessage(vals) = message; FlightData { flight_descriptor, data_header: vals, - app_metadata, - data_body, + app_metadata: app_metadata.into(), + data_body: data_body.into(), } } } impl FlightDescriptor { - pub fn new_cmd(cmd: Vec) -> Self { + pub fn new_cmd(cmd: impl Into) -> Self { FlightDescriptor { r#type: DescriptorType::Cmd.into(), - cmd, + cmd: cmd.into(), ..Default::default() } } diff --git a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs 
b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs index 5fc091427300..c2eb8d618348 100644 --- a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs +++ b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs @@ -446,16 +446,16 @@ pub struct ActionCreatePreparedStatementRequest { #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionCreatePreparedStatementResult { /// Opaque handle for the prepared statement on the server. - #[prost(bytes = "vec", tag = "1")] - pub prepared_statement_handle: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "1")] + pub prepared_statement_handle: ::prost::bytes::Bytes, /// If a result set generating query was provided, dataset_schema contains the /// schema of the dataset as described in Schema.fbs::Schema, it is serialized as an IPC message. - #[prost(bytes = "vec", tag = "2")] - pub dataset_schema: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "2")] + pub dataset_schema: ::prost::bytes::Bytes, /// If the query provided contained parameters, parameter_schema contains the /// schema of the expected parameters as described in Schema.fbs::Schema, it is serialized as an IPC message. - #[prost(bytes = "vec", tag = "3")] - pub parameter_schema: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "3")] + pub parameter_schema: ::prost::bytes::Bytes, } /// /// Request message for the "ClosePreparedStatement" action on a Flight SQL enabled backend. @@ -464,8 +464,8 @@ pub struct ActionCreatePreparedStatementResult { #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionClosePreparedStatementRequest { /// Opaque handle for the prepared statement on the server. - #[prost(bytes = "vec", tag = "1")] - pub prepared_statement_handle: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "1")] + pub prepared_statement_handle: ::prost::bytes::Bytes, } /// /// Represents a SQL query. Used in the command member of FlightDescriptor @@ -497,8 +497,8 @@ pub struct CommandStatementQuery { #[derive(Clone, PartialEq, ::prost::Message)] pub struct TicketStatementQuery { /// Unique identifier for the instance of the statement to execute. - #[prost(bytes = "vec", tag = "1")] - pub statement_handle: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "1")] + pub statement_handle: ::prost::bytes::Bytes, } /// /// Represents an instance of executing a prepared statement. Used in the command member of FlightDescriptor for @@ -521,8 +521,8 @@ pub struct TicketStatementQuery { #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandPreparedStatementQuery { /// Opaque handle for the prepared statement on the server. - #[prost(bytes = "vec", tag = "1")] - pub prepared_statement_handle: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "1")] + pub prepared_statement_handle: ::prost::bytes::Bytes, } /// /// Represents a SQL update query. Used in the command member of FlightDescriptor @@ -542,8 +542,8 @@ pub struct CommandStatementUpdate { #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandPreparedStatementUpdate { /// Opaque handle for the prepared statement on the server. 
- #[prost(bytes = "vec", tag = "1")] - pub prepared_statement_handle: ::prost::alloc::vec::Vec, + #[prost(bytes = "bytes", tag = "1")] + pub prepared_statement_handle: ::prost::bytes::Bytes, } /// /// Returned from the RPC call DoPut when a CommandStatementUpdate diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index 74039027e7cb..cf71edead3e3 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use bytes::Bytes; use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; @@ -119,10 +120,10 @@ impl FlightSqlServiceClient { &mut self, username: &str, password: &str, - ) -> Result, ArrowError> { + ) -> Result { let cmd = HandshakeRequest { protocol_version: 0, - payload: vec![], + payload: Default::default(), }; let mut req = tonic::Request::new(stream::iter(vec![cmd])); let val = base64::encode(format!("{}:{}", username, password)); @@ -279,7 +280,7 @@ impl FlightSqlServiceClient { let cmd = ActionCreatePreparedStatementRequest { query }; let action = Action { r#type: CREATE_PREPARED_STATEMENT.to_string(), - body: cmd.as_any().encode_to_vec(), + body: cmd.as_any().encode_to_vec().into(), }; let mut req = tonic::Request::new(action); if let Some(token) = &self.token { @@ -328,7 +329,7 @@ impl FlightSqlServiceClient { pub struct PreparedStatement { flight_client: Arc>>, parameter_binding: Option, - handle: Vec, + handle: Bytes, dataset_schema: Schema, parameter_schema: Schema, } @@ -336,14 +337,14 @@ pub struct PreparedStatement { impl PreparedStatement { pub(crate) fn new( client: Arc>>, - handle: Vec, + handle: impl Into, dataset_schema: Schema, parameter_schema: Schema, ) -> Self { PreparedStatement { flight_client: client, parameter_binding: None, - handle, + handle: handle.into(), dataset_schema, parameter_schema, } @@ -417,7 +418,7 @@ impl PreparedStatement { }; let action = Action { r#type: CLOSE_PREPARED_STATEMENT.to_string(), - body: cmd.as_any().encode_to_vec(), + body: cmd.as_any().encode_to_vec().into(), }; let _ = self .mut_client()? diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index fdf9c9133b90..e764e0c51ac7 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -473,7 +473,7 @@ where let record_count = self.do_put_statement_update(token, request).await?; let result = DoPutUpdateResult { record_count }; let output = futures::stream::iter(vec![Ok(PutResult { - app_metadata: result.encode_to_vec(), + app_metadata: result.encode_to_vec().into(), })]); return Ok(Response::new(Box::pin(output))); } @@ -494,7 +494,7 @@ where .await?; let result = DoPutUpdateResult { record_count }; let output = futures::stream::iter(vec![Ok(PutResult { - app_metadata: result.encode_to_vec(), + app_metadata: result.encode_to_vec().into(), })]); return Ok(Response::new(Box::pin(output))); } @@ -551,7 +551,7 @@ where .do_action_create_prepared_statement(cmd, request) .await?; let output = futures::stream::iter(vec![Ok(super::super::gen::Result { - body: stmt.as_any().encode_to_vec(), + body: stmt.as_any().encode_to_vec().into(), })]); return Ok(Response::new(Box::pin(output))); } diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 855b333853bf..266f8eb29241 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -18,6 +18,7 @@ //! 
Utilities to assist with reading and writing Arrow data as Flight messages use crate::{FlightData, IpcMessage, SchemaAsIpc, SchemaResult}; +use bytes::Bytes; use std::collections::HashMap; use std::sync::Arc; @@ -138,7 +139,7 @@ pub fn flight_data_from_arrow_schema( pub fn ipc_message_from_arrow_schema( schema: &Schema, options: &IpcWriteOptions, -) -> Result, ArrowError> { +) -> Result { let message = SchemaAsIpc::new(schema, options).try_into()?; let IpcMessage(vals) = message; Ok(vals) diff --git a/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs b/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs index ab398d3d2e7b..53c6c441271b 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs @@ -74,7 +74,7 @@ pub async fn run_scenario(host: &str, port: u16) -> Result { .expect("No response received") .expect("Invalid response received"); - let body = String::from_utf8(r.body).unwrap(); + let body = std::str::from_utf8(&r.body).unwrap(); assert_eq!(body, AUTH_USERNAME); Ok(()) @@ -94,7 +94,7 @@ async fn authenticate( let req = stream::once(async { HandshakeRequest { - payload, + payload: payload.into(), ..HandshakeRequest::default() } }); @@ -105,5 +105,5 @@ async fn authenticate( let r = rx.next().await.expect("must respond from handshake")?; assert!(rx.next().await.is_none(), "must not respond a second time"); - Ok(String::from_utf8(r.payload).unwrap()) + Ok(std::str::from_utf8(&r.payload).unwrap().into()) } diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index c01baa09a1f7..a40076b3de0a 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -138,7 +138,7 @@ async fn send_batch( .await?; // Only the record batch's FlightData gets app_metadata - batch_flight_data.app_metadata = metadata.to_vec(); + batch_flight_data.app_metadata = metadata.to_vec().into(); upload_tx.send(batch_flight_data).await?; Ok(()) } diff --git a/arrow-integration-testing/src/flight_client_scenarios/middleware.rs b/arrow-integration-testing/src/flight_client_scenarios/middleware.rs index db8c42cc081c..72ef37d3f548 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/middleware.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/middleware.rs @@ -19,6 +19,7 @@ use arrow_flight::{ flight_descriptor::DescriptorType, flight_service_client::FlightServiceClient, FlightDescriptor, }; +use prost::bytes::Bytes; use tonic::{Request, Status}; type Error = Box; @@ -31,7 +32,7 @@ pub async fn run_scenario(host: &str, port: u16) -> Result { let mut descriptor = FlightDescriptor::default(); descriptor.set_type(DescriptorType::Cmd); - descriptor.cmd = b"".to_vec(); + descriptor.cmd = Bytes::from_static(b""); // This call is expected to fail. 
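
`ipc_message_from_arrow_schema` and the `SchemaAsIpc` conversion above now yield `Bytes`, so an encoded schema can be dropped straight into a `SchemaResult` or `FlightInfo` without copying. A minimal sketch; the field name below is illustrative:

use arrow_flight::{IpcMessage, SchemaAsIpc, SchemaResult};
use arrow_ipc::writer::IpcWriteOptions;
use arrow_schema::{ArrowError, DataType, Field, Schema};

fn schema_result(schema: &Schema) -> Result<SchemaResult, ArrowError> {
    let options = IpcWriteOptions::default();
    // The TryFrom impl writes the length-prefixed flatbuffer Message into Bytes
    let message: IpcMessage = SchemaAsIpc::new(schema, &options).try_into()?;
    let IpcMessage(encoded) = message;
    Ok(SchemaResult { schema: encoded })
}

fn example() -> Result<SchemaResult, ArrowError> {
    let schema = Schema::new(vec![Field::new("id", DataType::Int64, false)]);
    schema_result(&schema)
}
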
match client @@ -56,7 +57,7 @@ pub async fn run_scenario(host: &str, port: u16) -> Result { } // This call should succeed - descriptor.cmd = b"success".to_vec(); + descriptor.cmd = Bytes::from_static(b"success"); let resp = client.get_flight_info(Request::new(descriptor)).await?; let headers = resp.metadata(); diff --git a/arrow-integration-testing/src/flight_server_scenarios.rs b/arrow-integration-testing/src/flight_server_scenarios.rs index e56252f1dfbf..6976c1267524 100644 --- a/arrow-integration-testing/src/flight_server_scenarios.rs +++ b/arrow-integration-testing/src/flight_server_scenarios.rs @@ -39,7 +39,7 @@ pub async fn listen_on(port: u16) -> Result { pub fn endpoint(ticket: &str, location_uri: impl Into) -> FlightEndpoint { FlightEndpoint { ticket: Some(Ticket { - ticket: ticket.as_bytes().to_vec(), + ticket: ticket.as_bytes().to_vec().into(), }), location: vec![Location { uri: location_uri.into(), diff --git a/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs b/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs index 68a4a0d3b4ad..72d47b1391ee 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs @@ -149,7 +149,7 @@ impl FlightService for AuthBasicProtoScenarioImpl { && *auth.password == *password { Ok(HandshakeResponse { - payload: username.as_bytes().to_vec(), + payload: username.as_bytes().to_vec().into(), ..HandshakeResponse::default() }) } else { @@ -203,7 +203,7 @@ impl FlightService for AuthBasicProtoScenarioImpl { ) -> Result, Status> { let flight_context = self.check_auth(request.metadata()).await?; // Respond with the authenticated username. - let buf = flight_context.peer_identity().as_bytes().to_vec(); + let buf = flight_context.peer_identity().as_bytes().to_vec().into(); let result = arrow_flight::Result { body: buf }; let output = futures::stream::once(async { Ok(result) }); Ok(Response::new(Box::pin(output) as Self::DoActionStream)) diff --git a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs index dee2fda3be3d..9c6f26befac0 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs @@ -125,7 +125,7 @@ impl FlightService for FlightServiceImpl { arrow_flight::utils::flight_data_from_arrow_batch(batch, &options); // Only the record batch's FlightData gets app_metadata - let metadata = counter.to_string().into_bytes(); + let metadata = counter.to_string().into(); batch_flight_data.app_metadata = metadata; dictionary_flight_data @@ -275,7 +275,7 @@ async fn send_app_metadata( app_metadata: &[u8], ) -> Result<(), Status> { tx.send(Ok(PutResult { - app_metadata: app_metadata.to_vec(), + app_metadata: app_metadata.to_vec().into(), })) .await .map_err(|e| Status::internal(format!("Could not send PutResult: {:?}", e))) diff --git a/arrow-integration-testing/src/flight_server_scenarios/middleware.rs b/arrow-integration-testing/src/flight_server_scenarios/middleware.rs index 5876ac9bfe6d..9b1c84b57119 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/middleware.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/middleware.rs @@ -93,7 +93,8 @@ impl FlightService for MiddlewareScenarioImpl { let descriptor = request.into_inner(); - if descriptor.r#type == DescriptorType::Cmd as i32 
&& descriptor.cmd == b"success" + if descriptor.r#type == DescriptorType::Cmd as i32 + && descriptor.cmd.as_ref() == b"success" { // Return a fake location - the test doesn't read it let endpoint = super::endpoint("foo", "grpc+tcp://localhost:10010"); From c1c97f1344abce3437868e66b6453e1580241c30 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 20 Dec 2022 09:52:17 -0800 Subject: [PATCH 0433/1411] Don't flush in the middle (#3374) --- parquet/src/file/writer.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 2d879be806c4..66b5d8e23a75 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -23,7 +23,7 @@ use crate::format as parquet; use crate::format::{ColumnIndex, OffsetIndex, RowGroup}; use std::io::{BufWriter, IoSlice}; use std::{io::Write, sync::Arc}; -use thrift::protocol::{TCompactOutputProtocol, TOutputProtocol, TSerializable}; +use thrift::protocol::{TCompactOutputProtocol, TSerializable}; use crate::basic::PageType; use crate::column::writer::{ @@ -230,7 +230,6 @@ impl SerializedFileWriter { let start_offset = self.buf.bytes_written(); let mut protocol = TCompactOutputProtocol::new(&mut self.buf); offset_index.write_to_out_protocol(&mut protocol)?; - protocol.flush()?; let end_offset = self.buf.bytes_written(); // set offset and index for offset index column_metadata.offset_index_offset = Some(start_offset as i64); @@ -282,7 +281,6 @@ impl SerializedFileWriter { let start_offset = self.buf.bytes_written(); let mut protocol = TCompactOutputProtocol::new(&mut self.buf); column_index.write_to_out_protocol(&mut protocol)?; - protocol.flush()?; let end_offset = self.buf.bytes_written(); // set offset and index for offset index column_metadata.column_index_offset = Some(start_offset as i64); @@ -335,7 +333,6 @@ impl SerializedFileWriter { { let mut protocol = TCompactOutputProtocol::new(&mut self.buf); file_metadata.write_to_out_protocol(&mut protocol)?; - protocol.flush()?; } let end_pos = self.buf.bytes_written(); @@ -605,7 +602,6 @@ impl<'a, W: Write> SerializedPageWriter<'a, W> { { let mut protocol = TCompactOutputProtocol::new(&mut self.sink); header.write_to_out_protocol(&mut protocol)?; - protocol.flush()?; } Ok(self.sink.bytes_written() - start_pos) } @@ -702,7 +698,6 @@ impl<'a, W: Write> PageWriter for SerializedPageWriter<'a, W> { metadata .to_column_metadata_thrift() .write_to_out_protocol(&mut protocol)?; - protocol.flush()?; Ok(()) } From 9cdc1c1e14a7eaab2683d13efeaed82d4b5f34c2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Dec 2022 18:18:09 +0000 Subject: [PATCH 0434/1411] Document all features (#3377) --- arrow-flight/Cargo.toml | 3 +++ arrow-ord/Cargo.toml | 3 +++ arrow-string/Cargo.toml | 3 +++ arrow/Cargo.toml | 2 +- 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 847d77ca58de..ea02bfed497d 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -39,6 +39,9 @@ prost-derive = { version = "0.11", default-features = false } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } futures = { version = "0.3", default-features = false, features = ["alloc"] } +[package.metadata.docs.rs] +all-features = true + [features] default = [] flight-sql-experimental = [] diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index 10aab03a54e8..c07e6ae38455 100644 --- 
a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -48,6 +48,9 @@ num = { version = "0.4", default-features = false, features = ["std"] } [dev-dependencies] rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } +[package.metadata.docs.rs] +features = ["dyn_cmp_dict"] + [features] dyn_cmp_dict = [] simd = ["arrow-array/simd"] diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index 7dd4472f58c9..0bb23fd8e90f 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -46,5 +46,8 @@ arrow-select = { version = "29.0.0", path = "../arrow-select" } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } +[package.metadata.docs.rs] +all-features = true + [features] dyn_cmp_dict = [] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 98d04d5d2635..0954909a0990 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -68,7 +68,7 @@ multiversion = { version = "0.6.1", default-features = false } bitflags = { version = "1.2.1", default-features = false, optional = true } [package.metadata.docs.rs] -features = ["prettyprint", "ipc_compression", "dyn_cmp_dict", "ffi", "pyarrow"] +features = ["prettyprint", "ipc_compression", "dyn_cmp_dict", "dyn_arith_dict", "ffi", "pyarrow"] [features] default = ["csv", "ipc", "json"] From a8968cd2677d7515915e9d33549b13dfb4a5b2ae Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Dec 2022 23:48:00 +0000 Subject: [PATCH 0435/1411] Infer JSON as UTF-8 (#3376) --- parquet/src/arrow/{schema.rs => schema/mod.rs} | 2 ++ parquet/src/arrow/schema/primitive.rs | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) rename parquet/src/arrow/{schema.rs => schema/mod.rs} (99%) diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema/mod.rs similarity index 99% rename from parquet/src/arrow/schema.rs rename to parquet/src/arrow/schema/mod.rs index 464b86d0c67d..120612822671 100644 --- a/parquet/src/arrow/schema.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -526,6 +526,7 @@ mod tests { OPTIONAL FLOAT float; OPTIONAL BINARY string (UTF8); OPTIONAL BINARY string_2 (STRING); + OPTIONAL BINARY json (JSON); } "; let parquet_group_type = parse_message_type(message_type).unwrap(); @@ -546,6 +547,7 @@ mod tests { Field::new("float", DataType::Float32, true), Field::new("string", DataType::Utf8, true), Field::new("string_2", DataType::Utf8, true), + Field::new("json", DataType::Utf8, true), ]; assert_eq!(&arrow_fields, converted_arrow_schema.fields()); diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index e5bab9ac96c2..bd56583a8f77 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -227,11 +227,11 @@ fn from_int64(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result Result { match (info.logical_type(), info.converted_type()) { (Some(LogicalType::String), _) => Ok(DataType::Utf8), - (Some(LogicalType::Json), _) => Ok(DataType::Binary), + (Some(LogicalType::Json), _) => Ok(DataType::Utf8), (Some(LogicalType::Bson), _) => Ok(DataType::Binary), (Some(LogicalType::Enum), _) => Ok(DataType::Binary), (None, ConvertedType::NONE) => Ok(DataType::Binary), - (None, ConvertedType::JSON) => Ok(DataType::Binary), + (None, ConvertedType::JSON) => Ok(DataType::Utf8), (None, ConvertedType::BSON) => Ok(DataType::Binary), (None, ConvertedType::ENUM) => 
Ok(DataType::Binary), (None, ConvertedType::UTF8) => Ok(DataType::Utf8), From e89b04337c3ef38e79a5ecf6b158718c1cb544c0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Dec 2022 18:52:25 -0500 Subject: [PATCH 0436/1411] minor: Improve arrow-flight docs (#3372) * minor: Improve arrow-flight docs * prettier * Update arrow-flight/src/lib.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-flight/README.md | 11 ++++++++++- arrow-flight/src/lib.rs | 19 +++++++++++++++++++ arrow/README.md | 2 +- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 4b2940e45c92..76b990b0163f 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -32,4 +32,13 @@ arrow-flight = "29.0.0" Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. -This crate provides a Rust implementation of the [Flight.proto](../../format/Flight.proto) gRPC protocol and provides an example that demonstrates how to build a Flight server implemented with Tonic. +This crate provides a Rust implementation of the +[Flight.proto](../../format/Flight.proto) gRPC protocol and +[examples](https://github.com/apache/arrow-rs/tree/master/arrow-flight/examples) +that demonstrate how to build a Flight server implemented with [tonic](https://docs.rs/crate/tonic/latest). + +## Feature Flags + +- `flight-sql-experimental`: Enables experimental support for + [Apache Arrow FlightSQL](https://arrow.apache.org/docs/format/FlightSql.html), + a protocol for interacting with SQL databases. diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 53ea5d4633e4..051509fb16e2 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -15,6 +15,21 @@ // specific language governing permissions and limitations // under the License. +//! A native Rust implementation of [Apache Arrow Flight](https://arrow.apache.org/docs/format/Flight.html) +//! for exchanging [Arrow](https://arrow.apache.org) data between processes. +//! +//! Please see the [arrow-flight crates.io](https://crates.io/crates/arrow-flight) +//! page for feature flags and more information. +//! +//! # Overview +//! +//! This crate contains: +//! +//! 1. Low level [prost] generated structs +//! for Flight gRPC protobuf messages, such as [`FlightData`]. +//! +//! 2. Low level [tonic] generated [`flight_service_client`] and +//! [`flight_service_server`]. #![allow(rustdoc::invalid_html_tags)] use arrow_ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; @@ -36,16 +51,20 @@ mod gen { include!("arrow.flight.protocol.rs"); } +/// Defines a `Flight` for generation or retrieval. pub mod flight_descriptor { use super::gen; pub use gen::flight_descriptor::DescriptorType; } +/// Low Level [tonic] [`FlightServiceClient`](gen::flight_service_client::FlightServiceClient). pub mod flight_service_client { use super::gen; pub use gen::flight_service_client::FlightServiceClient; } +/// Low Level [tonic] [`FlightServiceServer`](gen::flight_service_server::FlightServiceServer) +/// and [`FlightService`](gen::flight_service_server::FlightService). 
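
As the new crate-level docs above note, the generated client is a thin tonic wrapper. A minimal sketch of driving it, assuming a Flight server is reachable at the illustrative address below (tonic's transport feature is enabled by this crate):

use arrow_flight::flight_service_client::FlightServiceClient;
use arrow_flight::Criteria;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut client = FlightServiceClient::connect("http://localhost:50051").await?;

    // An empty Criteria asks the server to list every available flight
    let response = client.list_flights(Criteria::default()).await?;
    let mut stream = response.into_inner();
    while let Some(flight_info) = stream.message().await? {
        println!("{:?}", flight_info);
    }
    Ok(())
}
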
pub mod flight_service_server { use super::gen; pub use gen::flight_service_server::FlightService; diff --git a/arrow/README.md b/arrow/README.md index 5bcdf0cafce9..4d2f8e303b8d 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -44,7 +44,7 @@ The `arrow` crate provides the following features which may be enabled in your ` - `csv` (default) - support for reading and writing Arrow arrays to/from csv files - `json` (default) - support for reading and writing Arrow array to/from json files - `ipc` (default) - support for reading [Arrow IPC Format](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc), also used as the wire protocol in [arrow-flight](https://crates.io/crates/arrow-flight) -- `ipc_compression` - Enables reading and writing compressed IPC streams (also enables `ipc`) +- `ipc_compression` - Enables reading and writing compressed IPC streams (also enables `ipc`) - `prettyprint` - support for formatting record batches as textual columns - `js` - support for building arrow for WebAssembly / JavaScript - `simd` - (_Requires Nightly Rust_) Use alternate hand optimized From 0e4ddbfb4e9f29ad79fa34998fc3c85ba14aca45 Mon Sep 17 00:00:00 2001 From: Kevin Lim Date: Wed, 21 Dec 2022 01:37:48 -0800 Subject: [PATCH 0437/1411] Add derive for Clone and Debug for `ParquetObjectReader` (#3382) --- parquet/src/arrow/async_reader/store.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 716b641cd00a..e5de8eae6238 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -29,6 +29,7 @@ use crate::errors::{ParquetError, Result}; use crate::file::metadata::ParquetMetaData; /// Implements [`AsyncFileReader`] for a parquet file in object storage +#[derive(Clone, Debug)] pub struct ParquetObjectReader { store: Arc, meta: ObjectMeta, From db9084e74cc5869727e9e51ca0ddd5a6d386c271 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 21 Dec 2022 09:38:30 +0000 Subject: [PATCH 0438/1411] Split out arrow-row (#2594) (#3375) * Split out arrow-row (#2594) * Fix CI * Fix doc * More SortOptions to arrow_schema --- .github/workflows/arrow.yml | 5 ++ .github/workflows/integration.yml | 3 +- Cargo.toml | 1 + arrow-ord/src/sort.rs | 21 +------ arrow-row/Cargo.toml | 61 ++++++++++++++++++ .../src/row => arrow-row/src}/dictionary.rs | 9 ++- {arrow/src/row => arrow-row/src}/fixed.rs | 8 +-- {arrow/src/row => arrow-row/src}/interner.rs | 0 arrow/src/row/mod.rs => arrow-row/src/lib.rs | 62 ++++++++----------- {arrow/src/row => arrow-row/src}/list.rs | 5 +- {arrow/src/row => arrow-row/src}/variable.rs | 9 ++- arrow-schema/src/lib.rs | 19 ++++++ arrow/Cargo.toml | 2 + arrow/src/lib.rs | 2 +- dev/release/README.md | 3 +- 15 files changed, 134 insertions(+), 76 deletions(-) create mode 100644 arrow-row/Cargo.toml rename {arrow/src/row => arrow-row/src}/dictionary.rs (97%) rename {arrow/src/row => arrow-row/src}/fixed.rs (98%) rename {arrow/src/row => arrow-row/src}/interner.rs (100%) rename arrow/src/row/mod.rs => arrow-row/src/lib.rs (98%) rename {arrow/src/row => arrow-row/src}/list.rs (98%) rename {arrow/src/row => arrow-row/src}/variable.rs (97%) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 458e0e0a149a..e0db2c08812a 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -35,6 +35,7 @@ on: - arrow-ipc/** - arrow-json/** - arrow-ord/** 
+ - arrow-row/** - arrow-schema/** - arrow-select/** - arrow-string/** @@ -76,6 +77,8 @@ jobs: run: cargo test -p arrow-string --all-features - name: Test arrow-ord with all features except SIMD run: cargo test -p arrow-ord --features dyn_cmp_dict + - name: Test arrow-row with all features + run: cargo test -p arrow-row --all-features - name: Test arrow-integration-test with all features run: cargo test -p arrow-integration-test --all-features - name: Test arrow with default features @@ -196,5 +199,7 @@ jobs: run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings - name: Clippy arrow-ord with all features except SIMD run: cargo clippy -p arrow-ord --all-targets --features dyn_cmp_dict -- -D warnings + - name: Clippy arrow-row with all features + run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings - name: Clippy arrow run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 526106bfe7c9..0975c11d52f8 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -33,11 +33,12 @@ on: - arrow-integration-test/** - arrow-integration-testing/** - arrow-ipc/** - - arrow-ord/** - arrow-json/** + - arrow-ord/** - arrow-pyarrow-integration-testing/** - arrow-schema/** - arrow-select/** + - arrow-sort/** - arrow-string/** - arrow/** diff --git a/Cargo.toml b/Cargo.toml index c123106c6f75..fb072f7d346a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ members = [ "arrow-ipc", "arrow-json", "arrow-ord", + "arrow-row", "arrow-schema", "arrow-select", "arrow-string", diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index a2035988fe25..d13a7a03de94 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -27,6 +27,8 @@ use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; use arrow_select::take::take; use std::cmp::Ordering; +pub use arrow_schema::SortOptions; + /// Sort the `ArrayRef` using `SortOptions`. /// /// Performs a sort on values and indices. Nulls are ordered according @@ -366,25 +368,6 @@ pub fn sort_to_indices( }) } -/// Options that define how sort kernels should behave -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub struct SortOptions { - /// Whether to sort in descending order - pub descending: bool, - /// Whether to sort nulls first - pub nulls_first: bool, -} - -impl Default for SortOptions { - fn default() -> Self { - Self { - descending: false, - // default to nulls first to match spark's behavior - nulls_first: true, - } - } -} - /// Sort boolean values /// /// when a limit is present, the sort is pair-comparison based as k-select might be more efficient, diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml new file mode 100644 index 000000000000..4741c9d5840e --- /dev/null +++ b/arrow-row/Cargo.toml @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "arrow-row" +version = "29.0.0" +description = "Arrow row format" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_row" +path = "src/lib.rs" +bench = false + +[target.'cfg(target_arch = "wasm32")'.dependencies] +ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } + +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } + +[dependencies] +arrow-array = { version = "29.0.0", path = "../arrow-array" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-data = { version = "29.0.0", path = "../arrow-data" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } + +half = { version = "2.1", default-features = false } +hashbrown = { version = "0.13", default-features = false } + +[dev-dependencies] +arrow-cast = { version = "29.0.0", path = "../arrow-cast" } +arrow-ord = { version = "29.0.0", path = "../arrow-ord" } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } + +[features] + diff --git a/arrow/src/row/dictionary.rs b/arrow-row/src/dictionary.rs similarity index 97% rename from arrow/src/row/dictionary.rs rename to arrow-row/src/dictionary.rs index 82169a37d359..0da6c68d1684 100644 --- a/arrow/src/row/dictionary.rs +++ b/arrow-row/src/dictionary.rs @@ -15,17 +15,16 @@ // specific language governing permissions and limitations // under the License. -use crate::compute::SortOptions; -use crate::row::fixed::{FixedLengthEncoding, FromSlice}; -use crate::row::interner::{Interned, OrderPreservingInterner}; -use crate::row::{null_sentinel, Rows}; +use crate::fixed::{FixedLengthEncoding, FromSlice}; +use crate::interner::{Interned, OrderPreservingInterner}; +use crate::{null_sentinel, Rows}; use arrow_array::builder::*; use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::{ArrowNativeType, MutableBuffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{ArrowError, DataType}; +use arrow_schema::{ArrowError, DataType, SortOptions}; use std::collections::hash_map::Entry; use std::collections::HashMap; diff --git a/arrow/src/row/fixed.rs b/arrow-row/src/fixed.rs similarity index 98% rename from arrow/src/row/fixed.rs rename to arrow-row/src/fixed.rs index 03c53c994794..159eba9adf19 100644 --- a/arrow/src/row/fixed.rs +++ b/arrow-row/src/fixed.rs @@ -16,14 +16,12 @@ // under the License. 
use crate::array::PrimitiveArray; -use crate::compute::SortOptions; -use crate::datatypes::ArrowPrimitiveType; -use crate::row::{null_sentinel, Rows}; +use crate::{null_sentinel, Rows}; use arrow_array::builder::BufferBuilder; -use arrow_array::{BooleanArray, FixedSizeBinaryArray}; +use arrow_array::{ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray}; use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::DataType; +use arrow_schema::{DataType, SortOptions}; use half::f16; pub trait FromSlice { diff --git a/arrow/src/row/interner.rs b/arrow-row/src/interner.rs similarity index 100% rename from arrow/src/row/interner.rs rename to arrow-row/src/interner.rs diff --git a/arrow/src/row/mod.rs b/arrow-row/src/lib.rs similarity index 98% rename from arrow/src/row/mod.rs rename to arrow-row/src/lib.rs index bf58cf2f01ea..cf23e6e5c3b0 100644 --- a/arrow/src/row/mod.rs +++ b/arrow-row/src/lib.rs @@ -50,7 +50,7 @@ //! # Basic Example //! ``` //! # use std::sync::Arc; -//! # use arrow::row::{RowConverter, SortField}; +//! # use arrow_row::{RowConverter, SortField}; //! # use arrow_array::{ArrayRef, Int32Array, StringArray}; //! # use arrow_array::cast::{as_primitive_array, as_string_array}; //! # use arrow_array::types::Int32Type; @@ -102,7 +102,7 @@ //! The row format can also be used to implement a fast multi-column / lexicographic sort //! //! ``` -//! # use arrow::row::{RowConverter, SortField}; +//! # use arrow_row::{RowConverter, SortField}; //! # use arrow_array::{ArrayRef, UInt32Array}; //! fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array { //! let fields = arrays @@ -117,11 +117,11 @@ //! } //! ``` //! -//! [non-comparison sorts]:[https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts] -//! [radix sort]:[https://en.wikipedia.org/wiki/Radix_sort] -//! [normalized for sorting]:[https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf] -//! [`memcmp`]:[https://www.man7.org/linux/man-pages/man3/memcmp.3.html] -//! [`lexsort`]: crate::compute::kernels::sort::lexsort +//! [non-comparison sorts]: https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts +//! [radix sort]: https://en.wikipedia.org/wiki/Radix_sort +//! [normalized for sorting]: https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf +//! [`memcmp`]: https://www.man7.org/linux/man-pages/man3/memcmp.3.html +//! [`lexsort`]: https://docs.rs/arrow-ord/latest/arrow_ord/sort/fn.lexsort.html //! [compared]: PartialOrd //! 
[compare]: PartialOrd @@ -131,18 +131,16 @@ use std::sync::Arc; use arrow_array::cast::*; use arrow_array::*; +use arrow_buffer::ArrowNativeType; use arrow_data::ArrayDataBuilder; +use arrow_schema::*; -use crate::compute::SortOptions; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::row::dictionary::{ +use crate::dictionary::{ compute_dictionary_mapping, decode_dictionary, encode_dictionary, }; -use crate::row::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive}; -use crate::row::interner::OrderPreservingInterner; -use crate::row::variable::{decode_binary, decode_string}; -use crate::{downcast_dictionary_array, downcast_primitive_array}; +use crate::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive}; +use crate::interner::OrderPreservingInterner; +use crate::variable::{decode_binary, decode_string}; mod dictionary; mod fixed; @@ -437,7 +435,7 @@ enum Codec { } impl Codec { - fn new(sort_field: &SortField) -> Result { + fn new(sort_field: &SortField) -> Result { match &sort_field.data_type { DataType::Dictionary(_, _) => Ok(Self::Dictionary(Default::default())), d if !d.is_nested() => Ok(Self::Stateless), @@ -485,7 +483,7 @@ impl Codec { } } - fn encoder(&mut self, array: &dyn Array) -> Result> { + fn encoder(&mut self, array: &dyn Array) -> Result, ArrowError> { match self { Codec::Stateless => Ok(Encoder::Stateless), Codec::Dictionary(interner) => { @@ -577,7 +575,7 @@ impl SortField { impl RowConverter { /// Create a new [`RowConverter`] with the provided schema - pub fn new(fields: Vec) -> Result { + pub fn new(fields: Vec) -> Result { if !Self::supports_fields(&fields) { return Err(ArrowError::NotYetImplemented(format!( "Row format support not yet implemented for: {:?}", @@ -585,7 +583,7 @@ impl RowConverter { ))); } - let codecs = fields.iter().map(Codec::new).collect::>()?; + let codecs = fields.iter().map(Codec::new).collect::>()?; Ok(Self { fields: fields.into(), codecs, @@ -617,7 +615,7 @@ impl RowConverter { /// # Panics /// /// Panics if the schema of `columns` does not match that provided to [`RowConverter::new`] - pub fn convert_columns(&mut self, columns: &[ArrayRef]) -> Result { + pub fn convert_columns(&mut self, columns: &[ArrayRef]) -> Result { if columns.len() != self.fields.len() { return Err(ArrowError::InvalidArgumentError(format!( "Incorrect number of arrays provided to RowConverter, expected {} got {}", @@ -640,7 +638,7 @@ impl RowConverter { } codec.encoder(column.as_ref()) }) - .collect::>>()?; + .collect::, _>>()?; let config = RowConfig { fields: Arc::clone(&self.fields), @@ -671,7 +669,7 @@ impl RowConverter { /// # Panics /// /// Panics if the rows were not produced by this [`RowConverter`] - pub fn convert_rows<'a, I>(&self, rows: I) -> Result> + pub fn convert_rows<'a, I>(&self, rows: I) -> Result, ArrowError> where I: IntoIterator>, { @@ -703,7 +701,7 @@ impl RowConverter { &self, rows: &mut [&[u8]], validate_utf8: bool, - ) -> Result> { + ) -> Result, ArrowError> { self.fields .iter() .zip(&self.codecs) @@ -1196,7 +1194,7 @@ unsafe fn decode_column( rows: &mut [&[u8]], codec: &Codec, validate_utf8: bool, -) -> Result { +) -> Result { let options = field.options; let array: ArrayRef = match codec { @@ -1255,24 +1253,18 @@ unsafe fn decode_column( mod tests { use std::sync::Arc; - use arrow_array::builder::{ - FixedSizeBinaryBuilder, GenericListBuilder, Int32Builder, - }; use rand::distributions::uniform::SampleUniform; use rand::distributions::{Distribution, Standard}; use rand::{thread_rng, Rng}; - 
use arrow_array::NullArray; + use arrow_array::builder::*; + use arrow_array::types::*; + use arrow_array::*; + use arrow_buffer::i256; use arrow_buffer::Buffer; + use arrow_cast::display::array_value_to_string; use arrow_ord::sort::{LexicographicalComparator, SortColumn, SortOptions}; - use crate::array::{ - BinaryArray, BooleanArray, DictionaryArray, Float32Array, GenericStringArray, - Int16Array, Int32Array, OffsetSizeTrait, PrimitiveArray, - PrimitiveDictionaryBuilder, StringArray, - }; - use crate::util::display::array_value_to_string; - use super::*; #[test] diff --git a/arrow/src/row/list.rs b/arrow-row/src/list.rs similarity index 98% rename from arrow/src/row/list.rs rename to arrow-row/src/list.rs index e5ea5c2a04c4..dcd247be1a7b 100644 --- a/arrow/src/row/list.rs +++ b/arrow-row/src/list.rs @@ -15,12 +15,11 @@ // specific language governing permissions and limitations // under the License. -use crate::compute::SortOptions; -use crate::row::{RowConverter, Rows, SortField}; +use crate::{RowConverter, Rows, SortField}; use arrow_array::builder::BufferBuilder; use arrow_array::{Array, GenericListArray, OffsetSizeTrait}; use arrow_data::ArrayDataBuilder; -use arrow_schema::ArrowError; +use arrow_schema::{ArrowError, SortOptions}; use std::ops::Range; pub fn compute_lengths( diff --git a/arrow/src/row/variable.rs b/arrow-row/src/variable.rs similarity index 97% rename from arrow/src/row/variable.rs rename to arrow-row/src/variable.rs index 9162f2312031..c927f76963ab 100644 --- a/arrow/src/row/variable.rs +++ b/arrow-row/src/variable.rs @@ -15,14 +15,13 @@ // specific language governing permissions and limitations // under the License. -use crate::compute::SortOptions; -use crate::row::{null_sentinel, Rows}; -use crate::util::bit_util::ceil; +use crate::{null_sentinel, Rows}; use arrow_array::builder::BufferBuilder; -use arrow_array::{Array, GenericBinaryArray, GenericStringArray, OffsetSizeTrait}; +use arrow_array::*; +use arrow_buffer::bit_util::ceil; use arrow_buffer::MutableBuffer; use arrow_data::ArrayDataBuilder; -use arrow_schema::DataType; +use arrow_schema::{DataType, SortOptions}; /// The block size of the variable length encoding pub const BLOCK_SIZE: usize = 32; diff --git a/arrow-schema/src/lib.rs b/arrow-schema/src/lib.rs index 34030f2d356e..c2b1aba3b926 100644 --- a/arrow-schema/src/lib.rs +++ b/arrow-schema/src/lib.rs @@ -25,3 +25,22 @@ mod field; pub use field::*; mod schema; pub use schema::*; + +/// Options that define the sort order of a given column +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct SortOptions { + /// Whether to sort in descending order + pub descending: bool, + /// Whether to sort nulls first + pub nulls_first: bool, +} + +impl Default for SortOptions { + fn default() -> Self { + Self { + descending: false, + // default to nulls first to match spark's behavior + nulls_first: true, + } + } +} diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 0954909a0990..772c1be7745e 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -53,9 +53,11 @@ arrow-data = { version = "29.0.0", path = "../arrow-data" } arrow-ipc = { version = "29.0.0", path = "../arrow-ipc", optional = true } arrow-json = { version = "29.0.0", path = "../arrow-json", optional = true } arrow-ord = { version = "29.0.0", path = "../arrow-ord" } +arrow-row = { version = "29.0.0", path = "../arrow-row" } arrow-schema = { version = "29.0.0", path = "../arrow-schema" } arrow-select = { version = "29.0.0", path = "../arrow-select" } arrow-string = { version = "29.0.0", path = 
"../arrow-string" } + rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index d57168dc9ea2..8611acf52fec 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -328,7 +328,7 @@ pub mod pyarrow; pub mod record_batch { pub use arrow_array::{RecordBatch, RecordBatchOptions, RecordBatchReader}; } -pub mod row; pub use arrow_array::temporal_conversions; +pub use arrow_row as row; pub mod tensor; pub mod util; diff --git a/dev/release/README.md b/dev/release/README.md index 75849641d8b5..a18d8a4992c0 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -258,13 +258,12 @@ Rust Arrow Crates: (cd arrow-array && cargo publish) (cd arrow-select && cargo publish) (cd arrow-cast && cargo publish) -(cd arrow-string && cargo publish) -(cd arrow-ord && cargo publish) (cd arrow-ipc && cargo publish) (cd arrow-csv && cargo publish) (cd arrow-json && cargo publish) (cd arrow-ord && cargo publish) (cd arrow-string && cargo publish) +(cd arrow-row && cargo publish) (cd arrow && cargo publish) (cd arrow-flight && cargo publish) (cd parquet && cargo publish) From 13e0b871ed9f9a6df2b7d46c4589488e60cb2e60 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 21 Dec 2022 18:15:45 +0000 Subject: [PATCH 0439/1411] Update prost-build (#3385) --- arrow-flight/Cargo.toml | 2 +- arrow-flight/src/arrow.flight.protocol.rs | 9 + .../src/sql/arrow.flight.protocol.sql.rs | 405 ++++++++++++++++++ 3 files changed, 415 insertions(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index ea02bfed497d..80710d1fac4f 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -56,7 +56,7 @@ tower = "0.4.13" # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.49", default-features = false } -prost-build = { version = "=0.11.4", default-features = false } +prost-build = { version = "=0.11.5", default-features = false } tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } [[example]] diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs index a61c83d0c146..c79ec65ef921 100644 --- a/arrow-flight/src/arrow.flight.protocol.rs +++ b/arrow-flight/src/arrow.flight.protocol.rs @@ -148,6 +148,15 @@ pub mod flight_descriptor { DescriptorType::Cmd => "CMD", } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "UNKNOWN" => Some(Self::Unknown), + "PATH" => Some(Self::Path), + "CMD" => Some(Self::Cmd), + _ => None, + } + } } } /// diff --git a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs index c2eb8d618348..080156cce88e 100644 --- a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs +++ b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs @@ -1234,6 +1234,125 @@ impl SqlInfo { } } } + /// Creates an enum from field names used in the ProtoBuf definition. 
+ pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "FLIGHT_SQL_SERVER_NAME" => Some(Self::FlightSqlServerName), + "FLIGHT_SQL_SERVER_VERSION" => Some(Self::FlightSqlServerVersion), + "FLIGHT_SQL_SERVER_ARROW_VERSION" => Some(Self::FlightSqlServerArrowVersion), + "FLIGHT_SQL_SERVER_READ_ONLY" => Some(Self::FlightSqlServerReadOnly), + "SQL_DDL_CATALOG" => Some(Self::SqlDdlCatalog), + "SQL_DDL_SCHEMA" => Some(Self::SqlDdlSchema), + "SQL_DDL_TABLE" => Some(Self::SqlDdlTable), + "SQL_IDENTIFIER_CASE" => Some(Self::SqlIdentifierCase), + "SQL_IDENTIFIER_QUOTE_CHAR" => Some(Self::SqlIdentifierQuoteChar), + "SQL_QUOTED_IDENTIFIER_CASE" => Some(Self::SqlQuotedIdentifierCase), + "SQL_ALL_TABLES_ARE_SELECTABLE" => Some(Self::SqlAllTablesAreSelectable), + "SQL_NULL_ORDERING" => Some(Self::SqlNullOrdering), + "SQL_KEYWORDS" => Some(Self::SqlKeywords), + "SQL_NUMERIC_FUNCTIONS" => Some(Self::SqlNumericFunctions), + "SQL_STRING_FUNCTIONS" => Some(Self::SqlStringFunctions), + "SQL_SYSTEM_FUNCTIONS" => Some(Self::SqlSystemFunctions), + "SQL_DATETIME_FUNCTIONS" => Some(Self::SqlDatetimeFunctions), + "SQL_SEARCH_STRING_ESCAPE" => Some(Self::SqlSearchStringEscape), + "SQL_EXTRA_NAME_CHARACTERS" => Some(Self::SqlExtraNameCharacters), + "SQL_SUPPORTS_COLUMN_ALIASING" => Some(Self::SqlSupportsColumnAliasing), + "SQL_NULL_PLUS_NULL_IS_NULL" => Some(Self::SqlNullPlusNullIsNull), + "SQL_SUPPORTS_CONVERT" => Some(Self::SqlSupportsConvert), + "SQL_SUPPORTS_TABLE_CORRELATION_NAMES" => { + Some(Self::SqlSupportsTableCorrelationNames) + } + "SQL_SUPPORTS_DIFFERENT_TABLE_CORRELATION_NAMES" => { + Some(Self::SqlSupportsDifferentTableCorrelationNames) + } + "SQL_SUPPORTS_EXPRESSIONS_IN_ORDER_BY" => { + Some(Self::SqlSupportsExpressionsInOrderBy) + } + "SQL_SUPPORTS_ORDER_BY_UNRELATED" => Some(Self::SqlSupportsOrderByUnrelated), + "SQL_SUPPORTED_GROUP_BY" => Some(Self::SqlSupportedGroupBy), + "SQL_SUPPORTS_LIKE_ESCAPE_CLAUSE" => Some(Self::SqlSupportsLikeEscapeClause), + "SQL_SUPPORTS_NON_NULLABLE_COLUMNS" => { + Some(Self::SqlSupportsNonNullableColumns) + } + "SQL_SUPPORTED_GRAMMAR" => Some(Self::SqlSupportedGrammar), + "SQL_ANSI92_SUPPORTED_LEVEL" => Some(Self::SqlAnsi92SupportedLevel), + "SQL_SUPPORTS_INTEGRITY_ENHANCEMENT_FACILITY" => { + Some(Self::SqlSupportsIntegrityEnhancementFacility) + } + "SQL_OUTER_JOINS_SUPPORT_LEVEL" => Some(Self::SqlOuterJoinsSupportLevel), + "SQL_SCHEMA_TERM" => Some(Self::SqlSchemaTerm), + "SQL_PROCEDURE_TERM" => Some(Self::SqlProcedureTerm), + "SQL_CATALOG_TERM" => Some(Self::SqlCatalogTerm), + "SQL_CATALOG_AT_START" => Some(Self::SqlCatalogAtStart), + "SQL_SCHEMAS_SUPPORTED_ACTIONS" => Some(Self::SqlSchemasSupportedActions), + "SQL_CATALOGS_SUPPORTED_ACTIONS" => Some(Self::SqlCatalogsSupportedActions), + "SQL_SUPPORTED_POSITIONED_COMMANDS" => { + Some(Self::SqlSupportedPositionedCommands) + } + "SQL_SELECT_FOR_UPDATE_SUPPORTED" => Some(Self::SqlSelectForUpdateSupported), + "SQL_STORED_PROCEDURES_SUPPORTED" => Some(Self::SqlStoredProceduresSupported), + "SQL_SUPPORTED_SUBQUERIES" => Some(Self::SqlSupportedSubqueries), + "SQL_CORRELATED_SUBQUERIES_SUPPORTED" => { + Some(Self::SqlCorrelatedSubqueriesSupported) + } + "SQL_SUPPORTED_UNIONS" => Some(Self::SqlSupportedUnions), + "SQL_MAX_BINARY_LITERAL_LENGTH" => Some(Self::SqlMaxBinaryLiteralLength), + "SQL_MAX_CHAR_LITERAL_LENGTH" => Some(Self::SqlMaxCharLiteralLength), + "SQL_MAX_COLUMN_NAME_LENGTH" => Some(Self::SqlMaxColumnNameLength), + "SQL_MAX_COLUMNS_IN_GROUP_BY" => Some(Self::SqlMaxColumnsInGroupBy), + 
"SQL_MAX_COLUMNS_IN_INDEX" => Some(Self::SqlMaxColumnsInIndex), + "SQL_MAX_COLUMNS_IN_ORDER_BY" => Some(Self::SqlMaxColumnsInOrderBy), + "SQL_MAX_COLUMNS_IN_SELECT" => Some(Self::SqlMaxColumnsInSelect), + "SQL_MAX_COLUMNS_IN_TABLE" => Some(Self::SqlMaxColumnsInTable), + "SQL_MAX_CONNECTIONS" => Some(Self::SqlMaxConnections), + "SQL_MAX_CURSOR_NAME_LENGTH" => Some(Self::SqlMaxCursorNameLength), + "SQL_MAX_INDEX_LENGTH" => Some(Self::SqlMaxIndexLength), + "SQL_DB_SCHEMA_NAME_LENGTH" => Some(Self::SqlDbSchemaNameLength), + "SQL_MAX_PROCEDURE_NAME_LENGTH" => Some(Self::SqlMaxProcedureNameLength), + "SQL_MAX_CATALOG_NAME_LENGTH" => Some(Self::SqlMaxCatalogNameLength), + "SQL_MAX_ROW_SIZE" => Some(Self::SqlMaxRowSize), + "SQL_MAX_ROW_SIZE_INCLUDES_BLOBS" => Some(Self::SqlMaxRowSizeIncludesBlobs), + "SQL_MAX_STATEMENT_LENGTH" => Some(Self::SqlMaxStatementLength), + "SQL_MAX_STATEMENTS" => Some(Self::SqlMaxStatements), + "SQL_MAX_TABLE_NAME_LENGTH" => Some(Self::SqlMaxTableNameLength), + "SQL_MAX_TABLES_IN_SELECT" => Some(Self::SqlMaxTablesInSelect), + "SQL_MAX_USERNAME_LENGTH" => Some(Self::SqlMaxUsernameLength), + "SQL_DEFAULT_TRANSACTION_ISOLATION" => { + Some(Self::SqlDefaultTransactionIsolation) + } + "SQL_TRANSACTIONS_SUPPORTED" => Some(Self::SqlTransactionsSupported), + "SQL_SUPPORTED_TRANSACTIONS_ISOLATION_LEVELS" => { + Some(Self::SqlSupportedTransactionsIsolationLevels) + } + "SQL_DATA_DEFINITION_CAUSES_TRANSACTION_COMMIT" => { + Some(Self::SqlDataDefinitionCausesTransactionCommit) + } + "SQL_DATA_DEFINITIONS_IN_TRANSACTIONS_IGNORED" => { + Some(Self::SqlDataDefinitionsInTransactionsIgnored) + } + "SQL_SUPPORTED_RESULT_SET_TYPES" => Some(Self::SqlSupportedResultSetTypes), + "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_UNSPECIFIED" => { + Some(Self::SqlSupportedConcurrenciesForResultSetUnspecified) + } + "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_FORWARD_ONLY" => { + Some(Self::SqlSupportedConcurrenciesForResultSetForwardOnly) + } + "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_SENSITIVE" => { + Some(Self::SqlSupportedConcurrenciesForResultSetScrollSensitive) + } + "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_INSENSITIVE" => { + Some(Self::SqlSupportedConcurrenciesForResultSetScrollInsensitive) + } + "SQL_BATCH_UPDATES_SUPPORTED" => Some(Self::SqlBatchUpdatesSupported), + "SQL_SAVEPOINTS_SUPPORTED" => Some(Self::SqlSavepointsSupported), + "SQL_NAMED_PARAMETERS_SUPPORTED" => Some(Self::SqlNamedParametersSupported), + "SQL_LOCATORS_UPDATE_COPY" => Some(Self::SqlLocatorsUpdateCopy), + "SQL_STORED_FUNCTIONS_USING_CALL_SYNTAX_SUPPORTED" => { + Some(Self::SqlStoredFunctionsUsingCallSyntaxSupported) + } + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1264,6 +1383,18 @@ impl SqlSupportedCaseSensitivity { } } } + /// Creates an enum from field names used in the ProtoBuf definition. 
+ pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_CASE_SENSITIVITY_UNKNOWN" => Some(Self::SqlCaseSensitivityUnknown), + "SQL_CASE_SENSITIVITY_CASE_INSENSITIVE" => { + Some(Self::SqlCaseSensitivityCaseInsensitive) + } + "SQL_CASE_SENSITIVITY_UPPERCASE" => Some(Self::SqlCaseSensitivityUppercase), + "SQL_CASE_SENSITIVITY_LOWERCASE" => Some(Self::SqlCaseSensitivityLowercase), + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1286,6 +1417,16 @@ impl SqlNullOrdering { SqlNullOrdering::SqlNullsSortedAtEnd => "SQL_NULLS_SORTED_AT_END", } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_NULLS_SORTED_HIGH" => Some(Self::SqlNullsSortedHigh), + "SQL_NULLS_SORTED_LOW" => Some(Self::SqlNullsSortedLow), + "SQL_NULLS_SORTED_AT_START" => Some(Self::SqlNullsSortedAtStart), + "SQL_NULLS_SORTED_AT_END" => Some(Self::SqlNullsSortedAtEnd), + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1306,6 +1447,15 @@ impl SupportedSqlGrammar { SupportedSqlGrammar::SqlExtendedGrammar => "SQL_EXTENDED_GRAMMAR", } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_MINIMUM_GRAMMAR" => Some(Self::SqlMinimumGrammar), + "SQL_CORE_GRAMMAR" => Some(Self::SqlCoreGrammar), + "SQL_EXTENDED_GRAMMAR" => Some(Self::SqlExtendedGrammar), + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1328,6 +1478,15 @@ impl SupportedAnsi92SqlGrammarLevel { SupportedAnsi92SqlGrammarLevel::Ansi92FullSql => "ANSI92_FULL_SQL", } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "ANSI92_ENTRY_SQL" => Some(Self::Ansi92EntrySql), + "ANSI92_INTERMEDIATE_SQL" => Some(Self::Ansi92IntermediateSql), + "ANSI92_FULL_SQL" => Some(Self::Ansi92FullSql), + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1348,6 +1507,15 @@ impl SqlOuterJoinsSupportLevel { SqlOuterJoinsSupportLevel::SqlFullOuterJoins => "SQL_FULL_OUTER_JOINS", } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_JOINS_UNSUPPORTED" => Some(Self::SqlJoinsUnsupported), + "SQL_LIMITED_OUTER_JOINS" => Some(Self::SqlLimitedOuterJoins), + "SQL_FULL_OUTER_JOINS" => Some(Self::SqlFullOuterJoins), + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1366,6 +1534,14 @@ impl SqlSupportedGroupBy { SqlSupportedGroupBy::SqlGroupByBeyondSelect => "SQL_GROUP_BY_BEYOND_SELECT", } } + /// Creates an enum from field names used in the ProtoBuf definition. 
+ pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_GROUP_BY_UNRELATED" => Some(Self::SqlGroupByUnrelated), + "SQL_GROUP_BY_BEYOND_SELECT" => Some(Self::SqlGroupByBeyondSelect), + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1392,6 +1568,19 @@ impl SqlSupportedElementActions { } } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_ELEMENT_IN_PROCEDURE_CALLS" => Some(Self::SqlElementInProcedureCalls), + "SQL_ELEMENT_IN_INDEX_DEFINITIONS" => { + Some(Self::SqlElementInIndexDefinitions) + } + "SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS" => { + Some(Self::SqlElementInPrivilegeDefinitions) + } + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1414,6 +1603,14 @@ impl SqlSupportedPositionedCommands { } } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_POSITIONED_DELETE" => Some(Self::SqlPositionedDelete), + "SQL_POSITIONED_UPDATE" => Some(Self::SqlPositionedUpdate), + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1440,6 +1637,16 @@ impl SqlSupportedSubqueries { } } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_SUBQUERIES_IN_COMPARISONS" => Some(Self::SqlSubqueriesInComparisons), + "SQL_SUBQUERIES_IN_EXISTS" => Some(Self::SqlSubqueriesInExists), + "SQL_SUBQUERIES_IN_INS" => Some(Self::SqlSubqueriesInIns), + "SQL_SUBQUERIES_IN_QUANTIFIEDS" => Some(Self::SqlSubqueriesInQuantifieds), + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1458,6 +1665,14 @@ impl SqlSupportedUnions { SqlSupportedUnions::SqlUnionAll => "SQL_UNION_ALL", } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_UNION" => Some(Self::SqlUnion), + "SQL_UNION_ALL" => Some(Self::SqlUnionAll), + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1490,6 +1705,19 @@ impl SqlTransactionIsolationLevel { } } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_TRANSACTION_NONE" => Some(Self::SqlTransactionNone), + "SQL_TRANSACTION_READ_UNCOMMITTED" => { + Some(Self::SqlTransactionReadUncommitted) + } + "SQL_TRANSACTION_READ_COMMITTED" => Some(Self::SqlTransactionReadCommitted), + "SQL_TRANSACTION_REPEATABLE_READ" => Some(Self::SqlTransactionRepeatableRead), + "SQL_TRANSACTION_SERIALIZABLE" => Some(Self::SqlTransactionSerializable), + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1516,6 +1744,19 @@ impl SqlSupportedTransactions { } } } + /// Creates an enum from field names used in the ProtoBuf definition. 
+ pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_TRANSACTION_UNSPECIFIED" => Some(Self::SqlTransactionUnspecified), + "SQL_DATA_DEFINITION_TRANSACTIONS" => { + Some(Self::SqlDataDefinitionTransactions) + } + "SQL_DATA_MANIPULATION_TRANSACTIONS" => { + Some(Self::SqlDataManipulationTransactions) + } + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1546,6 +1787,20 @@ impl SqlSupportedResultSetType { } } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_RESULT_SET_TYPE_UNSPECIFIED" => Some(Self::SqlResultSetTypeUnspecified), + "SQL_RESULT_SET_TYPE_FORWARD_ONLY" => Some(Self::SqlResultSetTypeForwardOnly), + "SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE" => { + Some(Self::SqlResultSetTypeScrollInsensitive) + } + "SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE" => { + Some(Self::SqlResultSetTypeScrollSensitive) + } + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1572,6 +1827,21 @@ impl SqlSupportedResultSetConcurrency { } } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED" => { + Some(Self::SqlResultSetConcurrencyUnspecified) + } + "SQL_RESULT_SET_CONCURRENCY_READ_ONLY" => { + Some(Self::SqlResultSetConcurrencyReadOnly) + } + "SQL_RESULT_SET_CONCURRENCY_UPDATABLE" => { + Some(Self::SqlResultSetConcurrencyUpdatable) + } + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1630,6 +1900,32 @@ impl SqlSupportsConvert { SqlSupportsConvert::SqlConvertVarchar => "SQL_CONVERT_VARCHAR", } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_CONVERT_BIGINT" => Some(Self::SqlConvertBigint), + "SQL_CONVERT_BINARY" => Some(Self::SqlConvertBinary), + "SQL_CONVERT_BIT" => Some(Self::SqlConvertBit), + "SQL_CONVERT_CHAR" => Some(Self::SqlConvertChar), + "SQL_CONVERT_DATE" => Some(Self::SqlConvertDate), + "SQL_CONVERT_DECIMAL" => Some(Self::SqlConvertDecimal), + "SQL_CONVERT_FLOAT" => Some(Self::SqlConvertFloat), + "SQL_CONVERT_INTEGER" => Some(Self::SqlConvertInteger), + "SQL_CONVERT_INTERVAL_DAY_TIME" => Some(Self::SqlConvertIntervalDayTime), + "SQL_CONVERT_INTERVAL_YEAR_MONTH" => Some(Self::SqlConvertIntervalYearMonth), + "SQL_CONVERT_LONGVARBINARY" => Some(Self::SqlConvertLongvarbinary), + "SQL_CONVERT_LONGVARCHAR" => Some(Self::SqlConvertLongvarchar), + "SQL_CONVERT_NUMERIC" => Some(Self::SqlConvertNumeric), + "SQL_CONVERT_REAL" => Some(Self::SqlConvertReal), + "SQL_CONVERT_SMALLINT" => Some(Self::SqlConvertSmallint), + "SQL_CONVERT_TIME" => Some(Self::SqlConvertTime), + "SQL_CONVERT_TIMESTAMP" => Some(Self::SqlConvertTimestamp), + "SQL_CONVERT_TINYINT" => Some(Self::SqlConvertTinyint), + "SQL_CONVERT_VARBINARY" => Some(Self::SqlConvertVarbinary), + "SQL_CONVERT_VARCHAR" => Some(Self::SqlConvertVarchar), + _ => None, + } + } } /// * /// The JDBC/ODBC-defined type of any object. @@ -1695,6 +1991,36 @@ impl XdbcDataType { XdbcDataType::XdbcWvarchar => "XDBC_WVARCHAR", } } + /// Creates an enum from field names used in the ProtoBuf definition. 
+ pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "XDBC_UNKNOWN_TYPE" => Some(Self::XdbcUnknownType), + "XDBC_CHAR" => Some(Self::XdbcChar), + "XDBC_NUMERIC" => Some(Self::XdbcNumeric), + "XDBC_DECIMAL" => Some(Self::XdbcDecimal), + "XDBC_INTEGER" => Some(Self::XdbcInteger), + "XDBC_SMALLINT" => Some(Self::XdbcSmallint), + "XDBC_FLOAT" => Some(Self::XdbcFloat), + "XDBC_REAL" => Some(Self::XdbcReal), + "XDBC_DOUBLE" => Some(Self::XdbcDouble), + "XDBC_DATETIME" => Some(Self::XdbcDatetime), + "XDBC_INTERVAL" => Some(Self::XdbcInterval), + "XDBC_VARCHAR" => Some(Self::XdbcVarchar), + "XDBC_DATE" => Some(Self::XdbcDate), + "XDBC_TIME" => Some(Self::XdbcTime), + "XDBC_TIMESTAMP" => Some(Self::XdbcTimestamp), + "XDBC_LONGVARCHAR" => Some(Self::XdbcLongvarchar), + "XDBC_BINARY" => Some(Self::XdbcBinary), + "XDBC_VARBINARY" => Some(Self::XdbcVarbinary), + "XDBC_LONGVARBINARY" => Some(Self::XdbcLongvarbinary), + "XDBC_BIGINT" => Some(Self::XdbcBigint), + "XDBC_TINYINT" => Some(Self::XdbcTinyint), + "XDBC_BIT" => Some(Self::XdbcBit), + "XDBC_WCHAR" => Some(Self::XdbcWchar), + "XDBC_WVARCHAR" => Some(Self::XdbcWvarchar), + _ => None, + } + } } /// * /// Detailed subtype information for XDBC_TYPE_DATETIME and XDBC_TYPE_INTERVAL. @@ -1791,6 +2117,55 @@ impl XdbcDatetimeSubcode { } } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "XDBC_SUBCODE_UNKNOWN" => Some(Self::XdbcSubcodeUnknown), + "XDBC_SUBCODE_YEAR" => Some(Self::XdbcSubcodeYear), + "XDBC_SUBCODE_TIME" => Some(Self::XdbcSubcodeTime), + "XDBC_SUBCODE_TIMESTAMP" => Some(Self::XdbcSubcodeTimestamp), + "XDBC_SUBCODE_TIME_WITH_TIMEZONE" => Some(Self::XdbcSubcodeTimeWithTimezone), + "XDBC_SUBCODE_TIMESTAMP_WITH_TIMEZONE" => { + Some(Self::XdbcSubcodeTimestampWithTimezone) + } + "XDBC_SUBCODE_SECOND" => Some(Self::XdbcSubcodeSecond), + "XDBC_SUBCODE_YEAR_TO_MONTH" => Some(Self::XdbcSubcodeYearToMonth), + "XDBC_SUBCODE_DAY_TO_HOUR" => Some(Self::XdbcSubcodeDayToHour), + "XDBC_SUBCODE_DAY_TO_MINUTE" => Some(Self::XdbcSubcodeDayToMinute), + "XDBC_SUBCODE_DAY_TO_SECOND" => Some(Self::XdbcSubcodeDayToSecond), + "XDBC_SUBCODE_HOUR_TO_MINUTE" => Some(Self::XdbcSubcodeHourToMinute), + "XDBC_SUBCODE_HOUR_TO_SECOND" => Some(Self::XdbcSubcodeHourToSecond), + "XDBC_SUBCODE_MINUTE_TO_SECOND" => Some(Self::XdbcSubcodeMinuteToSecond), + "XDBC_SUBCODE_INTERVAL_YEAR" => Some(Self::XdbcSubcodeIntervalYear), + "XDBC_SUBCODE_INTERVAL_MONTH" => Some(Self::XdbcSubcodeIntervalMonth), + "XDBC_SUBCODE_INTERVAL_DAY" => Some(Self::XdbcSubcodeIntervalDay), + "XDBC_SUBCODE_INTERVAL_HOUR" => Some(Self::XdbcSubcodeIntervalHour), + "XDBC_SUBCODE_INTERVAL_MINUTE" => Some(Self::XdbcSubcodeIntervalMinute), + "XDBC_SUBCODE_INTERVAL_SECOND" => Some(Self::XdbcSubcodeIntervalSecond), + "XDBC_SUBCODE_INTERVAL_YEAR_TO_MONTH" => { + Some(Self::XdbcSubcodeIntervalYearToMonth) + } + "XDBC_SUBCODE_INTERVAL_DAY_TO_HOUR" => { + Some(Self::XdbcSubcodeIntervalDayToHour) + } + "XDBC_SUBCODE_INTERVAL_DAY_TO_MINUTE" => { + Some(Self::XdbcSubcodeIntervalDayToMinute) + } + "XDBC_SUBCODE_INTERVAL_DAY_TO_SECOND" => { + Some(Self::XdbcSubcodeIntervalDayToSecond) + } + "XDBC_SUBCODE_INTERVAL_HOUR_TO_MINUTE" => { + Some(Self::XdbcSubcodeIntervalHourToMinute) + } + "XDBC_SUBCODE_INTERVAL_HOUR_TO_SECOND" => { + Some(Self::XdbcSubcodeIntervalHourToSecond) + } + "XDBC_SUBCODE_INTERVAL_MINUTE_TO_SECOND" => { + Some(Self::XdbcSubcodeIntervalMinuteToSecond) + } + _ => None, + } 
+ } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1817,6 +2192,15 @@ impl Nullable { Nullable::NullabilityUnknown => "NULLABILITY_UNKNOWN", } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "NULLABILITY_NO_NULLS" => Some(Self::NullabilityNoNulls), + "NULLABILITY_NULLABLE" => Some(Self::NullabilityNullable), + "NULLABILITY_UNKNOWN" => Some(Self::NullabilityUnknown), + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1852,6 +2236,16 @@ impl Searchable { Searchable::Full => "SEARCHABLE_FULL", } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SEARCHABLE_NONE" => Some(Self::None), + "SEARCHABLE_CHAR" => Some(Self::Char), + "SEARCHABLE_BASIC" => Some(Self::Basic), + "SEARCHABLE_FULL" => Some(Self::Full), + _ => None, + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -1876,4 +2270,15 @@ impl UpdateDeleteRules { UpdateDeleteRules::SetDefault => "SET_DEFAULT", } } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "CASCADE" => Some(Self::Cascade), + "RESTRICT" => Some(Self::Restrict), + "SET_NULL" => Some(Self::SetNull), + "NO_ACTION" => Some(Self::NoAction), + "SET_DEFAULT" => Some(Self::SetDefault), + _ => None, + } + } } From e7fc07304119d7c9b94437391b9adb92fe7d8b3f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 21 Dec 2022 21:20:51 +0000 Subject: [PATCH 0440/1411] Split out arrow-arith (#2594) (#3384) * Split out arrow-arith (#2594) * Update CI * Fix clippy * Update docs * Feature flag * Fix CI * Cleanup dependencies --- .github/workflows/arrow.yml | 9 +- Cargo.toml | 1 + arrow-arith/Cargo.toml | 57 +++++ .../kernels => arrow-arith/src}/aggregate.rs | 46 ++-- .../kernels => arrow-arith/src}/arithmetic.rs | 238 +++++++++--------- .../kernels => arrow-arith/src}/arity.rs | 85 +++---- .../kernels => arrow-arith/src}/bitwise.rs | 74 +++--- .../kernels => arrow-arith/src}/boolean.rs | 111 ++++---- arrow-arith/src/lib.rs | 25 ++ .../kernels => arrow-arith/src}/temporal.rs | 73 +++--- arrow/Cargo.toml | 12 +- arrow/src/compute/kernels/mod.rs | 7 +- arrow/src/lib.rs | 2 + dev/release/README.md | 1 + 14 files changed, 398 insertions(+), 343 deletions(-) create mode 100644 arrow-arith/Cargo.toml rename {arrow/src/compute/kernels => arrow-arith/src}/aggregate.rs (98%) rename {arrow/src/compute/kernels => arrow-arith/src}/arithmetic.rs (95%) rename {arrow/src/compute/kernels => arrow-arith/src}/arity.rs (91%) rename {arrow/src/compute/kernels => arrow-arith/src}/bitwise.rs (81%) rename {arrow/src/compute/kernels => arrow-arith/src}/boolean.rs (92%) create mode 100644 arrow-arith/src/lib.rs rename {arrow/src/compute/kernels => arrow-arith/src}/temporal.rs (95%) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index e0db2c08812a..c1e9d600a02a 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -26,6 +26,7 @@ on: pull_request: paths: - .github/** + - arrow-arith/** - arrow-array/** - arrow-buffer/** - arrow-cast/** @@ -77,6 +78,8 @@ jobs: run: cargo test -p arrow-string --all-features 
- name: Test arrow-ord with all features except SIMD run: cargo test -p arrow-ord --features dyn_cmp_dict + - name: Test arrow-arith with all features except SIMD + run: cargo test -p arrow-arith --features dyn_arith_dict - name: Test arrow-row with all features run: cargo test -p arrow-row --all-features - name: Test arrow-integration-test with all features @@ -140,6 +143,8 @@ jobs: run: cargo test -p arrow-array --features simd - name: Test arrow-ord with SIMD run: cargo test -p arrow-ord --features simd + - name: Test arrow-arith with SIMD + run: cargo test -p arrow-arith --features simd - name: Test arrow with SIMD run: cargo test -p arrow --features simd - name: Check compilation --features simd --all-targets @@ -199,7 +204,9 @@ jobs: run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings - name: Clippy arrow-ord with all features except SIMD run: cargo clippy -p arrow-ord --all-targets --features dyn_cmp_dict -- -D warnings + - name: Clippy arrow-arith with all features except SIMD + run: cargo clippy -p arrow-arith --all-targets --features dyn_arith_dict -- -D warnings - name: Clippy arrow-row with all features run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings - - name: Clippy arrow + - name: Clippy arrow with all features except SIMD run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings diff --git a/Cargo.toml b/Cargo.toml index fb072f7d346a..ebecc9eaf078 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ [workspace] members = [ "arrow", + "arrow-arith", "arrow-array", "arrow-buffer", "arrow-cast", diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml new file mode 100644 index 000000000000..854941c25345 --- /dev/null +++ b/arrow-arith/Cargo.toml @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "arrow-arith" +version = "29.0.0" +description = "Arrow arithmetic kernels" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_arith" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-array = { version = "29.0.0", path = "../arrow-array" } +arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow-data = { version = "29.0.0", path = "../arrow-data" } +arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +chrono = { version = "0.4.23", default-features = false } +half = { version = "2.1", default-features = false } +multiversion = { version = "0.6.1", default-features = false } +num = { version = "0.4", default-features = false, features = ["std"] } + +[dev-dependencies] + +[package.metadata.docs.rs] +features = ["dyn_arith_dict"] + +[features] +dyn_arith_dict = [] +simd = ["arrow-array/simd"] diff --git a/arrow/src/compute/kernels/aggregate.rs b/arrow-arith/src/aggregate.rs similarity index 98% rename from arrow/src/compute/kernels/aggregate.rs rename to arrow-arith/src/aggregate.rs index 4e726974f66c..a9503130b0f9 100644 --- a/arrow/src/compute/kernels/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -17,19 +17,16 @@ //! Defines aggregations over Arrow arrays. -use arrow_data::bit_iterator::try_for_each_valid_idx; -use arrow_schema::ArrowError; use multiversion::multiversion; -#[allow(unused_imports)] -use std::ops::{Add, Deref}; -use crate::array::{ - as_primitive_array, Array, ArrayAccessor, ArrayIter, BooleanArray, - GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, -}; -use crate::datatypes::{ArrowNativeType, ArrowNativeTypeOp, ArrowNumericType, DataType}; -use crate::error::Result; -use crate::util::bit_iterator::BitIndexIterator; +use arrow_array::cast::*; +use arrow_array::iterator::ArrayIter; +use arrow_array::*; +use arrow_buffer::ArrowNativeType; +use arrow_data::bit_iterator::try_for_each_valid_idx; +use arrow_data::bit_iterator::BitIndexIterator; +use arrow_schema::ArrowError; +use arrow_schema::*; /// Generic test for NaN, the optimizer should be able to remove this for integer types. #[inline] @@ -63,10 +60,8 @@ where /// Returns the minimum value in the boolean array. /// /// ``` -/// use arrow::{ -/// array::BooleanArray, -/// compute::min_boolean, -/// }; +/// # use arrow_array::BooleanArray; +/// # use arrow_arith::aggregate::min_boolean; /// /// let a = BooleanArray::from(vec![Some(true), None, Some(false)]); /// assert_eq!(min_boolean(&a), Some(false)) @@ -88,10 +83,8 @@ pub fn min_boolean(array: &BooleanArray) -> Option { /// Returns the maximum value in the boolean array /// /// ``` -/// use arrow::{ -/// array::BooleanArray, -/// compute::max_boolean, -/// }; +/// # use arrow_array::BooleanArray; +/// # use arrow_arith::aggregate::max_boolean; /// /// let a = BooleanArray::from(vec![Some(true), None, Some(false)]); /// assert_eq!(max_boolean(&a), Some(true)) @@ -205,7 +198,7 @@ where /// use `sum_array` instead. pub fn sum_array_checked>( array: A, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -345,7 +338,7 @@ where /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `sum` instead. 
-pub fn sum_checked(array: &PrimitiveArray) -> Result> +pub fn sum_checked(array: &PrimitiveArray) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -375,7 +368,7 @@ where array.len(), array.offset(), null_count, - Some(buffer.deref()), + Some(buffer.as_slice()), |idx| { unsafe { sum = sum.add_checked(array.value_unchecked(idx))? }; Ok::<_, ArrowError>(()) @@ -390,8 +383,7 @@ where #[cfg(feature = "simd")] mod simd { use super::is_nan; - use crate::array::{Array, PrimitiveArray}; - use crate::datatypes::{ArrowNativeTypeOp, ArrowNumericType}; + use arrow_array::*; use std::marker::PhantomData; pub(super) trait SimdAggregate { @@ -771,10 +763,8 @@ where #[cfg(test)] mod tests { use super::*; - use crate::array::*; - use crate::compute::add; - use crate::datatypes::{Float32Type, Int32Type, Int8Type}; - use arrow_array::types::Float64Type; + use crate::arithmetic::add; + use arrow_array::types::*; #[test] fn test_primitive_array_sum() { diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow-arith/src/arithmetic.rs similarity index 95% rename from arrow/src/compute/kernels/arithmetic.rs rename to arrow-arith/src/arithmetic.rs index 913a2cad6c93..8a4657d7e668 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -22,29 +22,12 @@ //! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. -use crate::array::*; -#[cfg(feature = "simd")] -use crate::buffer::MutableBuffer; -use crate::compute::kernels::arity::unary; -use crate::compute::{ - binary, binary_opt, try_binary, try_unary, try_unary_dyn, unary_dyn, -}; -use crate::datatypes::{ - ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, - IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, -}; -#[cfg(feature = "dyn_arith_dict")] -use crate::datatypes::{ - Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type, - Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type, -}; -use crate::error::{ArrowError, Result}; -use crate::{datatypes, downcast_primitive_array}; +use crate::arity::*; +use arrow_array::cast::*; +use arrow_array::types::*; +use arrow_array::*; +use arrow_schema::*; use num::traits::Pow; -#[cfg(feature = "simd")] -use std::borrow::BorrowMut; -#[cfg(feature = "simd")] -use std::slice::{ChunksExact, ChunksExactMut}; use std::sync::Arc; /// Helper function to perform math lambda function on values from two arrays. 
If either @@ -58,7 +41,7 @@ pub fn math_op( left: &PrimitiveArray, right: &PrimitiveArray, op: F, -) -> Result> +) -> Result, ArrowError> where LT: ArrowNumericType, RT: ArrowNumericType, @@ -76,11 +59,11 @@ fn math_checked_op( left: &PrimitiveArray, right: &PrimitiveArray, op: F, -) -> Result> +) -> Result, ArrowError> where LT: ArrowNumericType, RT: ArrowNumericType, - F: Fn(LT::Native, RT::Native) -> Result, + F: Fn(LT::Native, RT::Native) -> Result, LT::Native: ArrowNativeTypeOp, RT::Native: ArrowNativeTypeOp, { @@ -99,11 +82,11 @@ fn math_checked_divide_op( left: &PrimitiveArray, right: &PrimitiveArray, op: F, -) -> Result> +) -> Result, ArrowError> where LT: ArrowNumericType, RT: ArrowNumericType, - F: Fn(LT::Native, RT::Native) -> Result, + F: Fn(LT::Native, RT::Native) -> Result, { try_binary(left, right, op) } @@ -122,11 +105,11 @@ fn math_checked_divide_op_on_iters( right: impl Iterator>, op: F, len: usize, - null_bit_buffer: Option, -) -> Result> + null_bit_buffer: Option, +) -> Result, ArrowError> where T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> Result, + F: Fn(T::Native, T::Native) -> Result, { let buffer = if null_bit_buffer.is_some() { let values = left.zip(right).map(|(left, right)| { @@ -137,7 +120,7 @@ where } }); // Safety: Iterator comes from a PrimitiveArray which reports its size correctly - unsafe { crate::buffer::Buffer::try_from_trusted_len_iter(values) } + unsafe { arrow_buffer::Buffer::try_from_trusted_len_iter(values) } } else { // no value is null let values = left @@ -145,11 +128,11 @@ where .zip(right.map(|r| r.unwrap())) .map(|(left, right)| op(left, right)); // Safety: Iterator comes from a PrimitiveArray which reports its size correctly - unsafe { crate::buffer::Buffer::try_from_trusted_len_iter(values) } + unsafe { arrow_buffer::Buffer::try_from_trusted_len_iter(values) } }?; let data = unsafe { - ArrayData::new_unchecked( + arrow_data::ArrayData::new_unchecked( T::DATA_TYPE, len, None, @@ -174,7 +157,7 @@ fn simd_checked_modulus( valid_mask: Option, left: T::Simd, right: T::Simd, -) -> Result +) -> Result where T::Native: ArrowNativeTypeOp, { @@ -211,7 +194,7 @@ fn simd_checked_divide( valid_mask: Option, left: T::Simd, right: T::Simd, -) -> Result +) -> Result where T::Native: ArrowNativeTypeOp, { @@ -247,11 +230,11 @@ where #[inline] fn simd_checked_divide_op_remainder( valid_mask: Option, - left_chunks: ChunksExact, - right_chunks: ChunksExact, - result_chunks: ChunksExactMut, + left_chunks: std::slice::ChunksExact, + right_chunks: std::slice::ChunksExact, + result_chunks: std::slice::ChunksExactMut, op: F, -) -> Result<()> +) -> Result<(), ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -296,11 +279,11 @@ fn simd_checked_divide_op( right: &PrimitiveArray, simd_op: SI, scalar_op: SC, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, - SI: Fn(Option, T::Simd, T::Simd) -> Result, + SI: Fn(Option, T::Simd, T::Simd) -> Result, SC: Fn(T::Native, T::Native) -> T::Native, { if left.len() != right.len() { @@ -317,7 +300,8 @@ where let lanes = T::lanes(); let buffer_size = left.len() * std::mem::size_of::(); - let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); + let mut result = + arrow_buffer::MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); match &null_bit_buffer { Some(b) => { @@ -332,11 +316,7 @@ where valid_chunks .iter() - .zip( - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())), - ) 
+ .zip((&mut result_chunks).zip((&mut left_chunks).zip(&mut right_chunks))) .try_for_each( |(mut mask, (result_slice, (left_slice, right_slice)))| { // split chunks further into slices corresponding to the vector length @@ -345,7 +325,7 @@ where result_slice .chunks_exact_mut(lanes) .zip(left_slice.chunks_exact(lanes).zip(right_slice.chunks_exact(lanes))) - .try_for_each(|(result_slice, (left_slice, right_slice))| -> Result<()> { + .try_for_each(|(result_slice, (left_slice, right_slice))| -> Result<(), ArrowError> { let simd_left = T::load(left_slice); let simd_right = T::load(right_slice); @@ -376,21 +356,20 @@ where let mut left_chunks = left.values().chunks_exact(lanes); let mut right_chunks = right.values().chunks_exact(lanes); - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())) + (&mut result_chunks) + .zip((&mut left_chunks).zip(&mut right_chunks)) .try_for_each( - |(result_slice, (left_slice, right_slice))| -> Result<()> { - let simd_left = T::load(left_slice); - let simd_right = T::load(right_slice); + |(result_slice, (left_slice, right_slice))| -> Result<(), ArrowError> { + let simd_left = T::load(left_slice); + let simd_right = T::load(right_slice); - let simd_result = simd_op(None, simd_left, simd_right)?; + let simd_result = simd_op(None, simd_left, simd_right)?; - T::write(simd_result, result_slice); + T::write(simd_result, result_slice); - Ok(()) - }, - )?; + Ok(()) + }, + )?; simd_checked_divide_op_remainder::( None, @@ -403,7 +382,7 @@ where } let data = unsafe { - ArrayData::new_unchecked( + arrow_data::ArrayData::new_unchecked( T::DATA_TYPE, left.len(), None, @@ -556,7 +535,7 @@ fn math_op_dict( left: &DictionaryArray, right: &DictionaryArray, op: F, -) -> Result> +) -> Result, ArrowError> where K: ArrowNumericType, T: ArrowNumericType, @@ -612,11 +591,11 @@ fn math_checked_op_dict( left: &DictionaryArray, right: &DictionaryArray, op: F, -) -> Result> +) -> Result, ArrowError> where K: ArrowNumericType, T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> Result, + F: Fn(T::Native, T::Native) -> Result, T::Native: ArrowNativeTypeOp, { // left and right's value types are supposed to be same as guaranteed by the caller macro now. @@ -646,11 +625,11 @@ fn math_divide_checked_op_dict( left: &DictionaryArray, right: &DictionaryArray, op: F, -) -> Result> +) -> Result, ArrowError> where K: ArrowNumericType, T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> Result, + F: Fn(T::Native, T::Native) -> Result, { if left.len() != right.len() { return Err(ArrowError::ComputeError(format!( @@ -699,7 +678,7 @@ fn math_divide_safe_op_dict( left: &DictionaryArray, right: &DictionaryArray, op: F, -) -> Result +) -> Result where K: ArrowNumericType, T: ArrowNumericType, @@ -715,7 +694,7 @@ fn math_safe_divide_op( left: &PrimitiveArray, right: &PrimitiveArray, op: F, -) -> Result +) -> Result where LT: ArrowNumericType, RT: ArrowNumericType, @@ -733,7 +712,7 @@ where pub fn add( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -749,7 +728,7 @@ where pub fn add_checked( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -762,7 +741,7 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `add_dyn_checked` instead. 
-pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result { +pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { typed_dict_math_op!(left, right, |a, b| a.add_wrapping(b), math_op_dict) @@ -834,7 +813,10 @@ pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `add_dyn` instead. -pub fn add_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result { +pub fn add_dyn_checked( + left: &dyn Array, + right: &dyn Array, +) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { typed_dict_math_op!( @@ -914,7 +896,7 @@ pub fn add_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result pub fn add_scalar( array: &PrimitiveArray, scalar: T::Native, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -930,7 +912,7 @@ where pub fn add_scalar_checked( array: &PrimitiveArray, scalar: T::Native, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -946,7 +928,10 @@ where /// For an overflow-checking variant, use `add_scalar_checked_dyn` instead. /// /// This returns an `Err` when the input array is not supported for adding operation. -pub fn add_scalar_dyn(array: &dyn Array, scalar: T::Native) -> Result +pub fn add_scalar_dyn( + array: &dyn Array, + scalar: T::Native, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -963,7 +948,10 @@ where /// /// As this kernel has the branching costs and also prevents LLVM from vectorising it correctly, /// it is usually much slower than non-checking variant. -pub fn add_scalar_checked_dyn(array: &dyn Array, scalar: T::Native) -> Result +pub fn add_scalar_checked_dyn( + array: &dyn Array, + scalar: T::Native, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -980,7 +968,7 @@ where pub fn subtract( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -996,7 +984,7 @@ where pub fn subtract_checked( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1009,7 +997,7 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `subtract_dyn_checked` instead. -pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { +pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { typed_dict_math_op!(left, right, |a, b| a.sub_wrapping(b), math_op_dict) @@ -1033,7 +1021,10 @@ pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `subtract_dyn` instead. 
-pub fn subtract_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result { +pub fn subtract_dyn_checked( + left: &dyn Array, + right: &dyn Array, +) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { typed_dict_math_op!( @@ -1065,7 +1056,7 @@ pub fn subtract_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result( array: &PrimitiveArray, scalar: T::Native, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1081,7 +1072,7 @@ where pub fn subtract_scalar_checked( array: &PrimitiveArray, scalar: T::Native, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1095,7 +1086,10 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `subtract_scalar_checked_dyn` instead. -pub fn subtract_scalar_dyn(array: &dyn Array, scalar: T::Native) -> Result +pub fn subtract_scalar_dyn( + array: &dyn Array, + scalar: T::Native, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1112,7 +1106,7 @@ where pub fn subtract_scalar_checked_dyn( array: &dyn Array, scalar: T::Native, -) -> Result +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1125,7 +1119,7 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `negate_checked` instead. -pub fn negate(array: &PrimitiveArray) -> Result> +pub fn negate(array: &PrimitiveArray) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1137,7 +1131,9 @@ where /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `negate` instead. -pub fn negate_checked(array: &PrimitiveArray) -> Result> +pub fn negate_checked( + array: &PrimitiveArray, +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1149,9 +1145,9 @@ where pub fn powf_scalar( array: &PrimitiveArray, raise: T::Native, -) -> Result> +) -> Result, ArrowError> where - T: datatypes::ArrowFloatNumericType, + T: ArrowFloatNumericType, T::Native: Pow, { Ok(unary(array, |x| x.pow(raise))) @@ -1165,7 +1161,7 @@ where pub fn multiply( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1181,7 +1177,7 @@ where pub fn multiply_checked( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1194,7 +1190,7 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `multiply_dyn_checked` instead. -pub fn multiply_dyn(left: &dyn Array, right: &dyn Array) -> Result { +pub fn multiply_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { typed_dict_math_op!(left, right, |a, b| a.mul_wrapping(b), math_op_dict) @@ -1218,7 +1214,10 @@ pub fn multiply_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `multiply_dyn` instead. 
-pub fn multiply_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result { +pub fn multiply_dyn_checked( + left: &dyn Array, + right: &dyn Array, +) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { typed_dict_math_op!( @@ -1250,9 +1249,9 @@ pub fn multiply_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result( array: &PrimitiveArray, scalar: T::Native, -) -> Result> +) -> Result, ArrowError> where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { Ok(unary(array, |value| value.mul_wrapping(scalar))) @@ -1266,7 +1265,7 @@ where pub fn multiply_scalar_checked( array: &PrimitiveArray, scalar: T::Native, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1280,7 +1279,10 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `multiply_scalar_checked_dyn` instead. -pub fn multiply_scalar_dyn(array: &dyn Array, scalar: T::Native) -> Result +pub fn multiply_scalar_dyn( + array: &dyn Array, + scalar: T::Native, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1297,7 +1299,7 @@ where pub fn multiply_scalar_checked_dyn( array: &dyn Array, scalar: T::Native, -) -> Result +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1312,7 +1314,7 @@ where pub fn modulus( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1340,7 +1342,7 @@ where pub fn divide_checked( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1370,7 +1372,7 @@ where pub fn divide_opt( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1390,7 +1392,7 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `divide_dyn_checked` instead. -pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result { +pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { typed_dict_math_op!( @@ -1432,7 +1434,10 @@ pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result { /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `divide_dyn` instead. 
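The `_scalar` kernels above keep the same convention: the plain variant wraps, the `_checked` variant errors. A hedged sketch with `i8` values, where the overflow is easy to see:

```rust
use arrow_arith::arithmetic::{multiply_scalar, multiply_scalar_checked};
use arrow_array::Int8Array;

fn main() {
    let a = Int8Array::from(vec![10, 100]);

    // Wrapping variant: 100 * 2 overflows i8 and wraps to -56.
    let wrapped = multiply_scalar(&a, 2).unwrap();
    assert_eq!(wrapped.value(0), 20);
    assert_eq!(wrapped.value(1), -56);

    // Checked variant: the same overflow is reported as an ArrowError.
    assert!(multiply_scalar_checked(&a, 2).is_err());
}
```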
-pub fn divide_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result { +pub fn divide_dyn_checked( + left: &dyn Array, + right: &dyn Array, +) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { typed_dict_math_op!( @@ -1465,7 +1470,10 @@ pub fn divide_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result Result { +pub fn divide_dyn_opt( + left: &dyn Array, + right: &dyn Array, +) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { typed_dict_math_op!( @@ -1515,7 +1523,7 @@ pub fn divide_dyn_opt(left: &dyn Array, right: &dyn Array) -> Result { pub fn divide( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1531,7 +1539,7 @@ where pub fn modulus_scalar( array: &PrimitiveArray, modulo: T::Native, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1549,7 +1557,7 @@ where pub fn divide_scalar( array: &PrimitiveArray, divisor: T::Native, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1567,7 +1575,10 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `divide_scalar_checked_dyn` instead. -pub fn divide_scalar_dyn(array: &dyn Array, divisor: T::Native) -> Result +pub fn divide_scalar_dyn( + array: &dyn Array, + divisor: T::Native, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1588,7 +1599,7 @@ where pub fn divide_scalar_checked_dyn( array: &dyn Array, divisor: T::Native, -) -> Result +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1611,7 +1622,10 @@ where /// Unlike `divide_scalar_dyn` or `divide_scalar_checked_dyn`, division by zero will get a /// null value instead returning an `Err`, this also doesn't check overflowing, overflowing /// will just wrap the result around. -pub fn divide_scalar_opt_dyn(array: &dyn Array, divisor: T::Native) -> Result +pub fn divide_scalar_opt_dyn( + array: &dyn Array, + divisor: T::Native, +) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1631,10 +1645,11 @@ where #[cfg(test)] mod tests { use super::*; - use crate::array::Int32Array; - use crate::compute::{binary_mut, try_binary_mut, try_unary_mut, unary_mut}; - use crate::datatypes::{Date64Type, Decimal128Type, Int32Type, Int8Type}; + use arrow_array::builder::{ + BooleanBufferBuilder, BufferBuilder, PrimitiveDictionaryBuilder, + }; use arrow_buffer::i256; + use arrow_data::ArrayDataBuilder; use chrono::NaiveDate; use half::f16; @@ -2530,7 +2545,6 @@ mod tests { #[should_panic(expected = "DivideByZero")] #[cfg(feature = "dyn_arith_dict")] fn test_f32_dict_array_divide_dyn_by_zero() { - use crate::datatypes::Float32Type; let mut builder = PrimitiveDictionaryBuilder::::with_capacity(1, 1); builder.append(1.5).unwrap(); diff --git a/arrow/src/compute/kernels/arity.rs b/arrow-arith/src/arity.rs similarity index 91% rename from arrow/src/compute/kernels/arity.rs rename to arrow-arith/src/arity.rs index 02659a5a7738..e89fe7b914a4 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow-arith/src/arity.rs @@ -17,17 +17,14 @@ //! Defines kernels suitable to perform operations to primitive arrays. 
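The divide-by-scalar `_dyn` variants described above differ only in how a zero divisor and overflow are handled. A small sketch of that difference, again assuming the split-crate paths from this patch:

```rust
use arrow_arith::arithmetic::{divide_scalar_dyn, divide_scalar_opt_dyn};
use arrow_array::types::Int32Type;
use arrow_array::{Array, Int32Array};

fn main() {
    let a = Int32Array::from(vec![Some(10), None, Some(7)]);

    // `divide_scalar_dyn` wraps on overflow but still errors on a zero divisor.
    let halved = divide_scalar_dyn::<Int32Type>(&a, 2).unwrap();
    assert_eq!(halved.len(), 3);
    assert!(divide_scalar_dyn::<Int32Type>(&a, 0).is_err());

    // `divide_scalar_opt_dyn` turns division by zero into nulls instead of an Err.
    let zeroed = divide_scalar_opt_dyn::<Int32Type>(&a, 0).unwrap();
    assert_eq!(zeroed.null_count(), zeroed.len());
}
```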
-use crate::array::{ - Array, ArrayAccessor, ArrayData, ArrayIter, ArrayRef, BufferBuilder, DictionaryArray, - PrimitiveArray, -}; -use crate::buffer::Buffer; -use crate::datatypes::{ArrowNumericType, ArrowPrimitiveType}; -use crate::downcast_dictionary_array; -use crate::error::{ArrowError, Result}; -use crate::util::bit_iterator::try_for_each_valid_idx; -use arrow_buffer::MutableBuffer; +use arrow_array::builder::BufferBuilder; +use arrow_array::iterator::ArrayIter; +use arrow_array::*; +use arrow_buffer::{Buffer, MutableBuffer}; +use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::bit_mask::combine_option_bitmap; +use arrow_data::ArrayData; +use arrow_schema::ArrowError; use std::sync::Arc; #[inline] @@ -71,11 +68,14 @@ where } /// See [`PrimitiveArray::try_unary`] -pub fn try_unary(array: &PrimitiveArray, op: F) -> Result> +pub fn try_unary( + array: &PrimitiveArray, + op: F, +) -> Result, ArrowError> where I: ArrowPrimitiveType, O: ArrowPrimitiveType, - F: Fn(I::Native) -> Result, + F: Fn(I::Native) -> Result, { array.try_unary(op) } @@ -84,19 +84,16 @@ where pub fn try_unary_mut( array: PrimitiveArray, op: F, -) -> std::result::Result< - std::result::Result, ArrowError>, - PrimitiveArray, -> +) -> Result, ArrowError>, PrimitiveArray> where I: ArrowPrimitiveType, - F: Fn(I::Native) -> Result, + F: Fn(I::Native) -> Result, { array.try_unary_mut(op) } /// A helper function that applies an infallible unary function to a dictionary array with primitive value type. -fn unary_dict(array: &DictionaryArray, op: F) -> Result +fn unary_dict(array: &DictionaryArray, op: F) -> Result where K: ArrowNumericType, T: ArrowPrimitiveType, @@ -108,11 +105,14 @@ where } /// A helper function that applies a fallible unary function to a dictionary array with primitive value type. -fn try_unary_dict(array: &DictionaryArray, op: F) -> Result +fn try_unary_dict( + array: &DictionaryArray, + op: F, +) -> Result where K: ArrowNumericType, T: ArrowPrimitiveType, - F: Fn(T::Native) -> Result, + F: Fn(T::Native) -> Result, { if std::mem::discriminant(&array.value_type()) != std::mem::discriminant(&T::DATA_TYPE) @@ -130,7 +130,7 @@ where } /// Applies an infallible unary function to an array with primitive values. -pub fn unary_dyn(array: &dyn Array, op: F) -> Result +pub fn unary_dyn(array: &dyn Array, op: F) -> Result where T: ArrowPrimitiveType, F: Fn(T::Native) -> T::Native, @@ -155,10 +155,10 @@ where } /// Applies a fallible unary function to an array with primitive values. -pub fn try_unary_dyn(array: &dyn Array, op: F) -> Result +pub fn try_unary_dyn(array: &dyn Array, op: F) -> Result where T: ArrowPrimitiveType, - F: Fn(T::Native) -> Result, + F: Fn(T::Native) -> Result, { downcast_dictionary_array! 
{ array => if array.values().data_type() == &T::DATA_TYPE { @@ -202,7 +202,7 @@ pub fn binary( a: &PrimitiveArray, b: &PrimitiveArray, op: F, -) -> Result> +) -> Result, ArrowError> where A: ArrowPrimitiveType, B: ArrowPrimitiveType, @@ -258,10 +258,7 @@ pub fn binary_mut( a: PrimitiveArray, b: &PrimitiveArray, op: F, -) -> std::result::Result< - std::result::Result, ArrowError>, - PrimitiveArray, -> +) -> Result, ArrowError>, PrimitiveArray> where T: ArrowPrimitiveType, F: Fn(T::Native, T::Native) -> T::Native, @@ -320,10 +317,10 @@ pub fn try_binary( a: A, b: B, op: F, -) -> Result> +) -> Result, ArrowError> where O: ArrowPrimitiveType, - F: Fn(A::Item, B::Item) -> Result, + F: Fn(A::Item, B::Item) -> Result, { if a.len() != b.len() { return Err(ArrowError::ComputeError( @@ -382,13 +379,10 @@ pub fn try_binary_mut( a: PrimitiveArray, b: &PrimitiveArray, op: F, -) -> std::result::Result< - std::result::Result, ArrowError>, - PrimitiveArray, -> +) -> Result, ArrowError>, PrimitiveArray> where T: ArrowPrimitiveType, - F: Fn(T::Native, T::Native) -> Result, + F: Fn(T::Native, T::Native) -> Result, { if a.len() != b.len() { return Ok(Err(ArrowError::ComputeError( @@ -447,10 +441,10 @@ fn try_binary_no_nulls( a: A, b: B, op: F, -) -> Result> +) -> Result, ArrowError> where O: ArrowPrimitiveType, - F: Fn(A::Item, B::Item) -> Result, + F: Fn(A::Item, B::Item) -> Result, { let mut buffer = MutableBuffer::new(len * O::get_byte_width()); for idx in 0..len { @@ -468,13 +462,10 @@ fn try_binary_no_nulls_mut( a: PrimitiveArray, b: &PrimitiveArray, op: F, -) -> std::result::Result< - std::result::Result, ArrowError>, - PrimitiveArray, -> +) -> Result, ArrowError>, PrimitiveArray> where T: ArrowPrimitiveType, - F: Fn(T::Native, T::Native) -> Result, + F: Fn(T::Native, T::Native) -> Result, { let mut builder = a.into_builder()?; let slice = builder.values_slice_mut(); @@ -496,7 +487,7 @@ fn try_binary_opt_no_nulls( a: A, b: B, op: F, -) -> Result> +) -> Result, ArrowError> where O: ArrowPrimitiveType, F: Fn(A::Item, B::Item) -> Option, @@ -524,7 +515,7 @@ pub(crate) fn binary_opt Result> +) -> Result, ArrowError> where O: ArrowPrimitiveType, F: Fn(A::Item, B::Item) -> Option, @@ -563,9 +554,9 @@ where #[cfg(test)] mod tests { use super::*; - use crate::array::{as_primitive_array, Float64Array, PrimitiveDictionaryBuilder}; - use crate::datatypes::{Float64Type, Int32Type, Int8Type}; - use arrow_array::Int32Array; + use arrow_array::builder::*; + use arrow_array::cast::*; + use arrow_array::types::*; #[test] fn test_unary_f64_slice() { diff --git a/arrow/src/compute/kernels/bitwise.rs b/arrow-arith/src/bitwise.rs similarity index 81% rename from arrow/src/compute/kernels/bitwise.rs rename to arrow-arith/src/bitwise.rs index 0b877b326482..08cc246b351a 100644 --- a/arrow/src/compute/kernels/bitwise.rs +++ b/arrow-arith/src/bitwise.rs @@ -15,10 +15,9 @@ // specific language governing permissions and limitations // under the License. 
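The arity helpers above keep their shape; the only visible change is the explicit `ArrowError` in the signatures. A brief illustrative sketch of `binary` and `try_unary` under the new signatures:

```rust
use arrow_arith::arity::{binary, try_unary};
use arrow_array::Int32Array;
use arrow_schema::ArrowError;

fn main() -> Result<(), ArrowError> {
    let a = Int32Array::from(vec![1, 2, 3]);
    let b = Int32Array::from(vec![10, 20, 30]);

    // Infallible binary kernel: only a length mismatch can produce an error.
    let sum: Int32Array = binary(&a, &b, |x, y| x + y)?;
    assert_eq!(sum.value(2), 33);

    // Fallible unary kernel: the closure itself returns Result<_, ArrowError>.
    let doubled: Int32Array = try_unary(&a, |x| {
        x.checked_mul(2)
            .ok_or_else(|| ArrowError::ComputeError(format!("{x} * 2 overflows")))
    })?;
    assert_eq!(doubled.value(0), 2);
    Ok(())
}
```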
-use crate::array::PrimitiveArray; -use crate::compute::{binary, unary}; -use crate::datatypes::ArrowNumericType; -use crate::error::Result; +use crate::arity::{binary, unary}; +use arrow_array::*; +use arrow_schema::ArrowError; use std::ops::{BitAnd, BitOr, BitXor, Not}; // The helper function for bitwise operation with two array @@ -26,7 +25,7 @@ fn bitwise_op( left: &PrimitiveArray, right: &PrimitiveArray, op: F, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, F: Fn(T::Native, T::Native) -> T::Native, @@ -39,7 +38,7 @@ where pub fn bitwise_and( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: BitAnd, @@ -52,7 +51,7 @@ where pub fn bitwise_or( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: BitOr, @@ -65,7 +64,7 @@ where pub fn bitwise_xor( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: BitXor, @@ -75,7 +74,7 @@ where /// Perform `!array` operation on array. If array value is null /// then the result is also null. -pub fn bitwise_not(array: &PrimitiveArray) -> Result> +pub fn bitwise_not(array: &PrimitiveArray) -> Result, ArrowError> where T: ArrowNumericType, T::Native: Not, @@ -88,7 +87,7 @@ where pub fn bitwise_and_scalar( array: &PrimitiveArray, scalar: T::Native, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: BitAnd, @@ -101,7 +100,7 @@ where pub fn bitwise_or_scalar( array: &PrimitiveArray, scalar: T::Native, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: BitOr, @@ -114,7 +113,7 @@ where pub fn bitwise_xor_scalar( array: &PrimitiveArray, scalar: T::Native, -) -> Result> +) -> Result, ArrowError> where T: ArrowNumericType, T::Native: BitXor, @@ -124,15 +123,10 @@ where #[cfg(test)] mod tests { - use crate::array::{Int32Array, UInt64Array}; - use crate::compute::kernels::bitwise::{ - bitwise_and, bitwise_and_scalar, bitwise_not, bitwise_or, bitwise_or_scalar, - bitwise_xor, bitwise_xor_scalar, - }; - use crate::error::Result; + use super::*; #[test] - fn test_bitwise_and_array() -> Result<()> { + fn test_bitwise_and_array() -> Result<(), ArrowError> { // unsigned value let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]); let right = UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12)]); @@ -150,43 +144,41 @@ mod tests { } #[test] - fn test_bitwise_and_array_scalar() -> Result<()> { + fn test_bitwise_and_array_scalar() { // unsigned value let left = UInt64Array::from(vec![Some(15), Some(2), None, Some(4)]); let scalar = 7; let expected = UInt64Array::from(vec![Some(7), Some(2), None, Some(4)]); - let result = bitwise_and_scalar(&left, scalar)?; + let result = bitwise_and_scalar(&left, scalar).unwrap(); assert_eq!(expected, result); // signed value let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); let scalar = -20; let expected = Int32Array::from(vec![Some(0), Some(0), None, Some(4)]); - let result = bitwise_and_scalar(&left, scalar)?; + let result = bitwise_and_scalar(&left, scalar).unwrap(); assert_eq!(expected, result); - Ok(()) } #[test] - fn test_bitwise_or_array() -> Result<()> { + fn test_bitwise_or_array() { // unsigned value let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]); let right = UInt64Array::from(vec![Some(7), Some(5), Some(8), Some(13)]); let expected = UInt64Array::from(vec![Some(7), Some(7), None, 
Some(13)]); - let result = bitwise_or(&left, &right)?; + let result = bitwise_or(&left, &right).unwrap(); assert_eq!(expected, result); // signed value let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); let right = Int32Array::from(vec![Some(-7), Some(-5), Some(8), Some(13)]); let expected = Int32Array::from(vec![Some(-7), Some(-5), None, Some(13)]); - let result = bitwise_or(&left, &right)?; + let result = bitwise_or(&left, &right).unwrap(); assert_eq!(expected, result); - Ok(()) } #[test] - fn test_bitwise_not_array() -> Result<()> { + fn test_bitwise_not_array() { // unsigned value let array = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]); let expected = UInt64Array::from(vec![ @@ -195,67 +187,63 @@ mod tests { None, Some(18446744073709551611), ]); - let result = bitwise_not(&array)?; + let result = bitwise_not(&array).unwrap(); assert_eq!(expected, result); // signed value let array = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); let expected = Int32Array::from(vec![Some(-2), Some(-3), None, Some(-5)]); - let result = bitwise_not(&array)?; + let result = bitwise_not(&array).unwrap(); assert_eq!(expected, result); - Ok(()) } #[test] - fn test_bitwise_or_array_scalar() -> Result<()> { + fn test_bitwise_or_array_scalar() { // unsigned value let left = UInt64Array::from(vec![Some(15), Some(2), None, Some(4)]); let scalar = 7; let expected = UInt64Array::from(vec![Some(15), Some(7), None, Some(7)]); - let result = bitwise_or_scalar(&left, scalar)?; + let result = bitwise_or_scalar(&left, scalar).unwrap(); assert_eq!(expected, result); // signed value let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); let scalar = 20; let expected = Int32Array::from(vec![Some(21), Some(22), None, Some(20)]); - let result = bitwise_or_scalar(&left, scalar)?; + let result = bitwise_or_scalar(&left, scalar).unwrap(); assert_eq!(expected, result); - Ok(()) } #[test] - fn test_bitwise_xor_array() -> Result<()> { + fn test_bitwise_xor_array() { // unsigned value let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]); let right = UInt64Array::from(vec![Some(7), Some(5), Some(8), Some(13)]); let expected = UInt64Array::from(vec![Some(6), Some(7), None, Some(9)]); - let result = bitwise_xor(&left, &right)?; + let result = bitwise_xor(&left, &right).unwrap(); assert_eq!(expected, result); // signed value let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); let right = Int32Array::from(vec![Some(-7), Some(5), Some(8), Some(-13)]); let expected = Int32Array::from(vec![Some(-8), Some(7), None, Some(-9)]); - let result = bitwise_xor(&left, &right)?; + let result = bitwise_xor(&left, &right).unwrap(); assert_eq!(expected, result); - Ok(()) } #[test] - fn test_bitwise_xor_array_scalar() -> Result<()> { + fn test_bitwise_xor_array_scalar() { // unsigned value let left = UInt64Array::from(vec![Some(15), Some(2), None, Some(4)]); let scalar = 7; let expected = UInt64Array::from(vec![Some(8), Some(5), None, Some(3)]); - let result = bitwise_xor_scalar(&left, scalar)?; + let result = bitwise_xor_scalar(&left, scalar).unwrap(); assert_eq!(expected, result); // signed value let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); let scalar = -20; let expected = Int32Array::from(vec![Some(-19), Some(-18), None, Some(-24)]); - let result = bitwise_xor_scalar(&left, scalar)?; + let result = bitwise_xor_scalar(&left, scalar).unwrap(); assert_eq!(expected, result); - Ok(()) } } diff --git a/arrow/src/compute/kernels/boolean.rs 
b/arrow-arith/src/boolean.rs similarity index 92% rename from arrow/src/compute/kernels/boolean.rs rename to arrow-arith/src/boolean.rs index aa42f3d20c03..4c1a02ad7498 100644 --- a/arrow/src/compute/kernels/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -22,17 +22,16 @@ //! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. -pub use arrow_select::nullif; - -use crate::array::{Array, ArrayData, BooleanArray}; -use crate::buffer::{ +use arrow_array::*; +use arrow_buffer::bit_util::ceil; +use arrow_buffer::buffer::{ bitwise_bin_op_helper, bitwise_quaternary_op_helper, buffer_bin_and, buffer_bin_or, - buffer_unary_not, Buffer, MutableBuffer, + buffer_unary_not, }; -use crate::datatypes::DataType; -use crate::error::{ArrowError, Result}; -use crate::util::bit_util::ceil; +use arrow_buffer::{Buffer, MutableBuffer}; use arrow_data::bit_mask::combine_option_bitmap; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; /// Updates null buffer based on data buffer and null buffer of the operand at other side /// in boolean AND kernel with Kleene logic. In short, because for AND kernel, null AND false @@ -182,7 +181,7 @@ pub(crate) fn binary_boolean_kernel( right: &BooleanArray, op: F, null_op: U, -) -> Result +) -> Result where F: Fn(&Buffer, usize, &Buffer, usize, usize) -> Buffer, U: Fn(&ArrayData, usize, &ArrayData, usize, usize) -> Option, @@ -227,18 +226,17 @@ where /// This function errors when the arrays have different lengths. /// # Example /// ```rust -/// use arrow::array::BooleanArray; -/// use arrow::error::Result; -/// use arrow::compute::kernels::boolean::and; -/// # fn main() -> Result<()> { +/// # use arrow_array::BooleanArray; +/// # use arrow_arith::boolean::and; /// let a = BooleanArray::from(vec![Some(false), Some(true), None]); /// let b = BooleanArray::from(vec![Some(true), Some(true), Some(false)]); -/// let and_ab = and(&a, &b)?; +/// let and_ab = and(&a, &b).unwrap(); /// assert_eq!(and_ab, BooleanArray::from(vec![Some(false), Some(true), None])); -/// # Ok(()) -/// # } /// ``` -pub fn and(left: &BooleanArray, right: &BooleanArray) -> Result { +pub fn and( + left: &BooleanArray, + right: &BooleanArray, +) -> Result { binary_boolean_kernel(left, right, buffer_bin_and, build_null_buffer_for_and_or) } @@ -261,22 +259,21 @@ pub fn and(left: &BooleanArray, right: &BooleanArray) -> Result { /// # Example /// /// ```rust -/// use arrow::array::BooleanArray; -/// use arrow::error::Result; -/// use arrow::compute::kernels::boolean::and_kleene; -/// # fn main() -> Result<()> { +/// # use arrow_array::BooleanArray; +/// # use arrow_arith::boolean::and_kleene; /// let a = BooleanArray::from(vec![Some(true), Some(false), None]); /// let b = BooleanArray::from(vec![None, None, None]); -/// let and_ab = and_kleene(&a, &b)?; +/// let and_ab = and_kleene(&a, &b).unwrap(); /// assert_eq!(and_ab, BooleanArray::from(vec![None, Some(false), None])); -/// # Ok(()) -/// # } /// ``` /// /// # Fails /// /// If the operands have different lengths -pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result { +pub fn and_kleene( + left: &BooleanArray, + right: &BooleanArray, +) -> Result { binary_boolean_kernel( left, right, @@ -291,18 +288,14 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result<()> { +/// # use arrow_array::BooleanArray; +/// # use arrow_arith::boolean::or; /// let a = BooleanArray::from(vec![Some(false), Some(true), None]); /// 
let b = BooleanArray::from(vec![Some(true), Some(true), Some(false)]); -/// let or_ab = or(&a, &b)?; +/// let or_ab = or(&a, &b).unwrap(); /// assert_eq!(or_ab, BooleanArray::from(vec![Some(true), Some(true), None])); -/// # Ok(()) -/// # } /// ``` -pub fn or(left: &BooleanArray, right: &BooleanArray) -> Result { +pub fn or(left: &BooleanArray, right: &BooleanArray) -> Result { binary_boolean_kernel(left, right, buffer_bin_or, build_null_buffer_for_and_or) } @@ -325,22 +318,21 @@ pub fn or(left: &BooleanArray, right: &BooleanArray) -> Result { /// # Example /// /// ```rust -/// use arrow::array::BooleanArray; -/// use arrow::error::Result; -/// use arrow::compute::kernels::boolean::or_kleene; -/// # fn main() -> Result<()> { +/// # use arrow_array::BooleanArray; +/// # use arrow_arith::boolean::or_kleene; /// let a = BooleanArray::from(vec![Some(true), Some(false), None]); /// let b = BooleanArray::from(vec![None, None, None]); -/// let or_ab = or_kleene(&a, &b)?; +/// let or_ab = or_kleene(&a, &b).unwrap(); /// assert_eq!(or_ab, BooleanArray::from(vec![Some(true), None, None])); -/// # Ok(()) -/// # } /// ``` /// /// # Fails /// /// If the operands have different lengths -pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result { +pub fn or_kleene( + left: &BooleanArray, + right: &BooleanArray, +) -> Result { binary_boolean_kernel(left, right, buffer_bin_or, build_null_buffer_for_or_kleene) } @@ -350,17 +342,13 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result<()> { +/// # use arrow_array::BooleanArray; +/// # use arrow_arith::boolean::not; /// let a = BooleanArray::from(vec![Some(false), Some(true), None]); -/// let not_a = not(&a)?; +/// let not_a = not(&a).unwrap(); /// assert_eq!(not_a, BooleanArray::from(vec![Some(true), Some(false), None])); -/// # Ok(()) -/// # } /// ``` -pub fn not(left: &BooleanArray) -> Result { +pub fn not(left: &BooleanArray) -> Result { let left_offset = left.offset(); let len = left.len(); @@ -391,17 +379,13 @@ pub fn not(left: &BooleanArray) -> Result { /// This function never errors. /// # Example /// ```rust -/// # use arrow::error::Result; -/// use arrow::array::BooleanArray; -/// use arrow::compute::kernels::boolean::is_null; -/// # fn main() -> Result<()> { +/// # use arrow_array::BooleanArray; +/// # use arrow_arith::boolean::is_null; /// let a = BooleanArray::from(vec![Some(false), Some(true), None]); -/// let a_is_null = is_null(&a)?; +/// let a_is_null = is_null(&a).unwrap(); /// assert_eq!(a_is_null, BooleanArray::from(vec![false, false, true])); -/// # Ok(()) -/// # } /// ``` -pub fn is_null(input: &dyn Array) -> Result { +pub fn is_null(input: &dyn Array) -> Result { let len = input.len(); let output = match input.data_ref().null_buffer() { @@ -432,17 +416,13 @@ pub fn is_null(input: &dyn Array) -> Result { /// This function never errors. 
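The doc examples above show each kernel on its own; the practical difference is how nulls combine. A side-by-side sketch (not part of the patch) contrasting `or` with `or_kleene`:

```rust
use arrow_arith::boolean::{or, or_kleene};
use arrow_array::BooleanArray;

fn main() {
    let a = BooleanArray::from(vec![Some(true), Some(false), None]);
    let b = BooleanArray::from(vec![None, None, None]);

    // Plain `or`: a null on either side makes the output slot null.
    let plain = or(&a, &b).unwrap();
    assert_eq!(plain, BooleanArray::from(vec![None, None, None]));

    // Kleene `or`: `true OR null` is known to be true, so it stays Some(true).
    let kleene = or_kleene(&a, &b).unwrap();
    assert_eq!(kleene, BooleanArray::from(vec![Some(true), None, None]));
}
```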
/// # Example /// ```rust -/// # use arrow::error::Result; -/// use arrow::array::BooleanArray; -/// use arrow::compute::kernels::boolean::is_not_null; -/// # fn main() -> Result<()> { +/// # use arrow_array::BooleanArray; +/// # use arrow_arith::boolean::is_not_null; /// let a = BooleanArray::from(vec![Some(false), Some(true), None]); -/// let a_is_not_null = is_not_null(&a)?; +/// let a_is_not_null = is_not_null(&a).unwrap(); /// assert_eq!(a_is_not_null, BooleanArray::from(vec![true, true, false])); -/// # Ok(()) -/// # } /// ``` -pub fn is_not_null(input: &dyn Array) -> Result { +pub fn is_not_null(input: &dyn Array) -> Result { let len = input.len(); let output = match input.data_ref().null_buffer() { @@ -473,7 +453,6 @@ pub fn is_not_null(input: &dyn Array) -> Result { #[cfg(test)] mod tests { use super::*; - use crate::array::{ArrayRef, Int32Array}; use std::sync::Arc; #[test] diff --git a/arrow-arith/src/lib.rs b/arrow-arith/src/lib.rs new file mode 100644 index 000000000000..60d31c972b66 --- /dev/null +++ b/arrow-arith/src/lib.rs @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Arrow arithmetic and aggregation kernels + +pub mod aggregate; +pub mod arithmetic; +pub mod arity; +pub mod bitwise; +pub mod boolean; +pub mod temporal; diff --git a/arrow/src/compute/kernels/temporal.rs b/arrow-arith/src/temporal.rs similarity index 95% rename from arrow/src/compute/kernels/temporal.rs rename to arrow-arith/src/temporal.rs index 15d56f70308f..5dcda8758dc9 100644 --- a/arrow/src/compute/kernels/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -17,18 +17,19 @@ //! Defines temporal kernels for time and date related functions. -use arrow_array::{downcast_dictionary_array, downcast_temporal_array}; use chrono::{DateTime, Datelike, NaiveDateTime, NaiveTime, Offset, Timelike}; use std::sync::Arc; -use crate::array::*; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; +use arrow_array::builder::*; +use arrow_array::iterator::ArrayIter; use arrow_array::temporal_conversions::{ as_datetime, as_datetime_with_timezone, as_time, }; - use arrow_array::timezone::Tz; +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::ArrowNativeType; +use arrow_schema::{ArrowError, DataType}; /// This function takes an `ArrayIter` of input array and an extractor `op` which takes /// an input `NaiveTime` and returns time component (e.g. hour) as `i32` value. @@ -98,7 +99,7 @@ fn extract_component_from_datetime_array< mut builder: PrimitiveBuilder, tz: &str, op: F, -) -> Result +) -> Result where F: Fn(DateTime) -> i32, i64: From, @@ -178,7 +179,7 @@ pub fn using_chrono_tz_and_utc_naive_date_time( /// Extracts the hours of a given array as an array of integers within /// the range of [0, 23]. 
If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. -pub fn hour_dyn(array: &dyn Array) -> Result { +pub fn hour_dyn(array: &dyn Array) -> Result { match array.data_type().clone() { DataType::Dictionary(_, _) => { downcast_dictionary_array!( @@ -203,7 +204,7 @@ pub fn hour_dyn(array: &dyn Array) -> Result { /// Extracts the hours of a given temporal primitive array as an array of integers within /// the range of [0, 23]. -pub fn hour(array: &PrimitiveArray) -> Result +pub fn hour(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -240,12 +241,12 @@ where /// Extracts the years of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. -pub fn year_dyn(array: &dyn Array) -> Result { +pub fn year_dyn(array: &dyn Array) -> Result { time_fraction_dyn(array, "year", |t| t.year()) } /// Extracts the years of a given temporal primitive array as an array of integers -pub fn year(array: &PrimitiveArray) -> Result +pub fn year(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -256,13 +257,13 @@ where /// Extracts the quarter of a given temporal array as an array of integersa within /// the range of [1, 4]. If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. -pub fn quarter_dyn(array: &dyn Array) -> Result { +pub fn quarter_dyn(array: &dyn Array) -> Result { time_fraction_dyn(array, "quarter", |t| t.quarter() as i32) } /// Extracts the quarter of a given temporal primitive array as an array of integers within /// the range of [1, 4]. -pub fn quarter(array: &PrimitiveArray) -> Result +pub fn quarter(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -273,13 +274,13 @@ where /// Extracts the month of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. -pub fn month_dyn(array: &dyn Array) -> Result { +pub fn month_dyn(array: &dyn Array) -> Result { time_fraction_dyn(array, "month", |t| t.month() as i32) } /// Extracts the month of a given temporal primitive array as an array of integers within /// the range of [1, 12]. -pub fn month(array: &PrimitiveArray) -> Result +pub fn month(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -296,7 +297,7 @@ where /// /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. -pub fn num_days_from_monday_dyn(array: &dyn Array) -> Result { +pub fn num_days_from_monday_dyn(array: &dyn Array) -> Result { time_fraction_dyn(array, "num_days_from_monday", |t| t.num_days_from_monday()) } @@ -306,7 +307,9 @@ pub fn num_days_from_monday_dyn(array: &dyn Array) -> Result { /// Monday is encoded as `0`, Tuesday as `1`, etc. /// /// See also [`num_days_from_sunday`] which starts at Sunday. -pub fn num_days_from_monday(array: &PrimitiveArray) -> Result +pub fn num_days_from_monday( + array: &PrimitiveArray, +) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -323,7 +326,7 @@ where /// /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. 
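The temporal kernels above are called exactly as before; only the error type in the signature changed. A small sketch extracting year and month from a `Date64Array` (the date literal is just an example value):

```rust
use arrow_arith::temporal::{month, year};
use arrow_array::{Array, Date64Array};

fn main() {
    // 2018-01-01T00:00:00Z expressed as milliseconds since the epoch.
    let dates = Date64Array::from(vec![Some(1_514_764_800_000), None]);

    let years = year(&dates).unwrap();
    assert_eq!(years.value(0), 2018);
    assert!(years.is_null(1)); // nulls propagate

    let months = month(&dates).unwrap();
    assert_eq!(months.value(0), 1);
}
```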
-pub fn num_days_from_sunday_dyn(array: &dyn Array) -> Result { +pub fn num_days_from_sunday_dyn(array: &dyn Array) -> Result { time_fraction_dyn(array, "num_days_from_sunday", |t| t.num_days_from_sunday()) } @@ -333,7 +336,9 @@ pub fn num_days_from_sunday_dyn(array: &dyn Array) -> Result { /// Sunday is encoded as `0`, Monday as `1`, etc. /// /// See also [`num_days_from_monday`] which starts at Monday. -pub fn num_days_from_sunday(array: &PrimitiveArray) -> Result +pub fn num_days_from_sunday( + array: &PrimitiveArray, +) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -344,12 +349,12 @@ where /// Extracts the day of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. -pub fn day_dyn(array: &dyn Array) -> Result { +pub fn day_dyn(array: &dyn Array) -> Result { time_fraction_dyn(array, "day", |t| t.day() as i32) } /// Extracts the day of a given temporal primitive array as an array of integers -pub fn day(array: &PrimitiveArray) -> Result +pub fn day(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -361,13 +366,13 @@ where /// The day of year that ranges from 1 to 366. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. -pub fn doy_dyn(array: &dyn Array) -> Result { +pub fn doy_dyn(array: &dyn Array) -> Result { time_fraction_dyn(array, "doy", |t| t.ordinal() as i32) } /// Extracts the day of year of a given temporal primitive array as an array of integers /// The day of year that ranges from 1 to 366 -pub fn doy(array: &PrimitiveArray) -> Result +pub fn doy(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, T::Native: ArrowNativeType, @@ -377,7 +382,7 @@ where } /// Extracts the minutes of a given temporal primitive array as an array of integers -pub fn minute(array: &PrimitiveArray) -> Result +pub fn minute(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -388,12 +393,12 @@ where /// Extracts the week of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. -pub fn week_dyn(array: &dyn Array) -> Result { +pub fn week_dyn(array: &dyn Array) -> Result { time_fraction_dyn(array, "week", |t| t.iso_week().week() as i32) } /// Extracts the week of a given temporal primitive array as an array of integers -pub fn week(array: &PrimitiveArray) -> Result +pub fn week(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -402,7 +407,7 @@ where } /// Extracts the seconds of a given temporal primitive array as an array of integers -pub fn second(array: &PrimitiveArray) -> Result +pub fn second(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -411,7 +416,7 @@ where } /// Extracts the nanoseconds of a given temporal primitive array as an array of integers -pub fn nanosecond(array: &PrimitiveArray) -> Result +pub fn nanosecond(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -422,12 +427,16 @@ where /// Extracts the nanoseconds of a given temporal primitive array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. 
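As the docs above spell out, the two weekday kernels differ only in which day maps to zero. A short sketch using 2018-01-01 (a Monday) as the example date:

```rust
use arrow_arith::temporal::{num_days_from_monday, num_days_from_sunday};
use arrow_array::Date64Array;

fn main() {
    // 2018-01-01 (a Monday) in milliseconds since the epoch.
    let dates = Date64Array::from(vec![Some(1_514_764_800_000)]);

    // Monday-based numbering: Monday == 0.
    assert_eq!(num_days_from_monday(&dates).unwrap().value(0), 0);

    // Sunday-based numbering: Sunday == 0, so a Monday is 1.
    assert_eq!(num_days_from_sunday(&dates).unwrap().value(0), 1);
}
```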
-pub fn nanosecond_dyn(array: &dyn Array) -> Result { +pub fn nanosecond_dyn(array: &dyn Array) -> Result { time_fraction_dyn(array, "nanosecond", |t| t.nanosecond() as i32) } /// Extracts the time fraction of a given temporal array as an array of integers -fn time_fraction_dyn(array: &dyn Array, name: &str, op: F) -> Result +fn time_fraction_dyn( + array: &dyn Array, + name: &str, + op: F, +) -> Result where F: Fn(NaiveDateTime) -> i32, { @@ -458,7 +467,7 @@ fn time_fraction_internal( array: &PrimitiveArray, name: &str, op: F, -) -> Result +) -> Result where F: Fn(NaiveDateTime) -> i32, T: ArrowTemporalType + ArrowNumericType, @@ -486,14 +495,14 @@ where /// Extracts the minutes of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. -pub fn minute_dyn(array: &dyn Array) -> Result { +pub fn minute_dyn(array: &dyn Array) -> Result { time_fraction_dyn(array, "minute", |t| t.minute() as i32) } /// Extracts the seconds of a given temporal array as an array of integers. /// If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. -pub fn second_dyn(array: &dyn Array) -> Result { +pub fn second_dyn(array: &dyn Array) -> Result { time_fraction_dyn(array, "second", |t| t.second() as i32) } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 772c1be7745e..17b02626fb05 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -45,6 +45,7 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] +arrow-arith = { version = "29.0.0", path = "../arrow-arith" } arrow-array = { version = "29.0.0", path = "../arrow-array" } arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } arrow-cast = { version = "29.0.0", path = "../arrow-cast" } @@ -59,14 +60,8 @@ arrow-select = { version = "29.0.0", path = "../arrow-select" } arrow-string = { version = "29.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } -num = { version = "0.4", default-features = false, features = ["std"] } -half = { version = "2.1", default-features = false, features = ["num-traits"] } -hashbrown = { version = "0.13", default-features = false } -regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } -chrono = { version = "0.4.23", default-features = false, features = ["clock"] } comfy-table = { version = "6.0", optional = true, default-features = false } pyo3 = { version = "0.17", default-features = false, optional = true } -multiversion = { version = "0.6.1", default-features = false } bitflags = { version = "1.2.1", default-features = false, optional = true } [package.metadata.docs.rs] @@ -78,7 +73,7 @@ ipc_compression = ["ipc", "arrow-ipc/lz4", "arrow-ipc/zstd"] csv = ["arrow-csv"] ipc = ["arrow-ipc"] json = ["arrow-json"] -simd = ["arrow-array/simd", "arrow-ord/simd"] +simd = ["arrow-array/simd", "arrow-ord/simd", "arrow-arith/simd"] prettyprint = ["comfy-table"] # The test utils feature enables code used in benchmarks and tests but # not the core arrow code itself. 
Be aware that `rand` must be kept as @@ -97,12 +92,13 @@ ffi = ["bitflags"] dyn_cmp_dict = ["arrow-string/dyn_cmp_dict", "arrow-ord/dyn_cmp_dict"] # Enable dyn-arithmetic kernels for dictionary arrays # Note: this does not impact arithmetic with scalars -dyn_arith_dict = [] +dyn_arith_dict = ["arrow-arith/dyn_arith_dict"] chrono-tz = ["arrow-array/chrono-tz"] [dev-dependencies] chrono = { version = "0.4.23", default-features = false, features = ["clock"] } criterion = { version = "0.4", default-features = false } +half = { version = "2.1", default-features = false, features = ["num-traits"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } tempfile = { version = "3", default-features = false } diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs index 837fb73d56d1..19f3c27a04fa 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels/mod.rs @@ -17,14 +17,9 @@ //! Computation kernels on Arrow Arrays -pub mod aggregate; -pub mod arithmetic; -pub mod arity; -pub mod bitwise; -pub mod boolean; pub mod limit; -pub mod temporal; +pub use arrow_arith::{aggregate, arithmetic, arity, bitwise, boolean, temporal}; pub use arrow_cast::cast; pub use arrow_cast::parse as cast_utils; pub use arrow_ord::{partition, sort}; diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 8611acf52fec..cee4f886cf9c 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -31,6 +31,7 @@ //! //! The current list of sub-crates is: //! +//! * [`arrow-arith][arrow_arith] - arithmetic kernels //! * [`arrow-array`][arrow_array] - type-safe arrow array abstractions //! * [`arrow-buffer`][arrow_buffer] - buffer abstractions for arrow arrays //! * [`arrow-cast`][arrow_cast] - cast kernels for arrow arrays @@ -39,6 +40,7 @@ //! * [`arrow-ipc`][arrow_ipc] - read/write IPC to arrow format //! * [`arrow-json`][arrow_json] - read/write JSON to arrow format //! * [`arrow-ord`][arrow_ord] - ordering kernels for arrow arrays +//! * [`arrow-row`][arrow_row] - comparable row format //! * [`arrow-schema`][arrow_schema] - the logical types for arrow arrays //! * [`arrow-select`][arrow_select] - selection kernels for arrow arrays //! 
* [`arrow-string`][arrow_string] - string kernels for arrow arrays diff --git a/dev/release/README.md b/dev/release/README.md index a18d8a4992c0..81f219034e1c 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -262,6 +262,7 @@ Rust Arrow Crates: (cd arrow-csv && cargo publish) (cd arrow-json && cargo publish) (cd arrow-ord && cargo publish) +(cd arrow-arith && cargo publish) (cd arrow-string && cargo publish) (cd arrow-row && cargo publish) (cd arrow && cargo publish) From 98bba9c4523895caff21fd82fdd58a579fa17041 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 22 Dec 2022 09:13:44 +0000 Subject: [PATCH 0441/1411] Add HttpStore (#3294) (#3380) --- .github/workflows/object_store.yml | 6 +- object_store/Cargo.toml | 1 + object_store/src/azure/client.rs | 18 +- object_store/src/azure/mod.rs | 3 +- object_store/src/client/mod.rs | 2 + object_store/src/http/client.rs | 372 +++++++++++++++++++++++++++++ object_store/src/http/mod.rs | 281 ++++++++++++++++++++++ object_store/src/lib.rs | 14 +- object_store/src/util.rs | 18 +- 9 files changed, 694 insertions(+), 21 deletions(-) create mode 100644 object_store/src/http/client.rs create mode 100644 object_store/src/http/mod.rs diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 23c5bab13a32..4de7b31331b3 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -77,6 +77,7 @@ jobs: AZURE_USE_EMULATOR: "1" AZURITE_BLOB_STORAGE_URL: "http://localhost:10000" AZURITE_QUEUE_STORAGE_URL: "http://localhost:10001" + HTTP_URL: "http://localhost:8080" GOOGLE_SERVICE_ACCOUNT: "/tmp/gcs.json" OBJECT_STORE_BUCKET: test-bucket @@ -91,6 +92,9 @@ jobs: curl -v -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > "$GOOGLE_SERVICE_ACCOUNT" + - name: Setup WebDav + run: docker run -d -p 8080:80 rclone/rclone serve webdav /data --addr :80 + - name: Setup LocalStack (AWS emulation) env: AWS_DEFAULT_REGION: "us-east-1" @@ -120,7 +124,7 @@ jobs: OBJECT_STORE_AWS_ACCESS_KEY_ID: test OBJECT_STORE_AWS_SECRET_ACCESS_KEY: test OBJECT_STORE_AWS_ENDPOINT: http://localhost:4566 - run: cargo test -p object_store --features=aws,azure,gcp + run: cargo test -p object_store --features=aws,azure,gcp,http # test the object_store crate builds against wasm32 in stable rust wasm32-build: diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 8973254c0914..fd033d55d666 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -61,6 +61,7 @@ cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] +http = ["cloud"] # Experimental support for AWS_PROFILE aws_profile = ["aws", "aws-config", "aws-types"] diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 440c379743a6..50f836377add 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -20,20 +20,20 @@ use crate::azure::credential::*; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::path::DELIMITER; -use crate::util::{format_http_range, format_prefix}; +use crate::util::{deserialize_rfc1123, format_http_range, format_prefix}; use crate::{ BoxStream, ClientOptions, ListResult, ObjectMeta, Path, 
Result, RetryConfig, StreamExt, }; use bytes::{Buf, Bytes}; -use chrono::{DateTime, TimeZone, Utc}; +use chrono::{DateTime, Utc}; use itertools::Itertools; use reqwest::header::CONTENT_TYPE; use reqwest::{ header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH, RANGE}, Client as ReqwestClient, Method, Response, StatusCode, }; -use serde::{Deserialize, Deserializer, Serialize}; +use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::collections::HashMap; use std::ops::Range; @@ -479,7 +479,7 @@ impl TryFrom for ObjectMeta { #[derive(Debug, Clone, PartialEq, Eq, Deserialize)] #[serde(rename_all = "PascalCase")] struct BlobProperties { - #[serde(deserialize_with = "deserialize_http_date", rename = "Last-Modified")] + #[serde(deserialize_with = "deserialize_rfc1123", rename = "Last-Modified")] pub last_modified: DateTime, pub etag: String, #[serde(rename = "Content-Length")] @@ -492,16 +492,6 @@ struct BlobProperties { pub content_language: Option, } -// deserialize dates used in Azure payloads according to rfc1123 -fn deserialize_http_date<'de, D>(deserializer: D) -> Result, D::Error> -where - D: Deserializer<'de>, -{ - let s = String::deserialize(deserializer)?; - Utc.datetime_from_str(&s, RFC1123_FMT) - .map_err(serde::de::Error::custom) -} - #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct BlockId(Bytes); diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 2cc4fe1a43ef..4224ae633dcd 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -46,6 +46,7 @@ use std::sync::Arc; use tokio::io::AsyncWrite; use url::Url; +use crate::util::RFC1123_FMT; pub use credential::authority_hosts; mod client; @@ -219,7 +220,7 @@ impl ObjectStore for MicrosoftAzure { .to_str() .context(BadHeaderSnafu)?; let last_modified = Utc - .datetime_from_str(last_modified, credential::RFC1123_FMT) + .datetime_from_str(last_modified, RFC1123_FMT) .context(InvalidLastModifiedSnafu { last_modified })?; let content_length = headers diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 9df7b5039da9..f07377e98995 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -20,8 +20,10 @@ pub mod backoff; #[cfg(test)] pub mod mock_server; +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod pagination; pub mod retry; +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod token; use reqwest::header::{HeaderMap, HeaderValue}; diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs new file mode 100644 index 000000000000..799c5be0c5eb --- /dev/null +++ b/object_store/src/http/client.rs @@ -0,0 +1,372 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::retry::{self, RetryConfig, RetryExt}; +use crate::path::{Path, DELIMITER}; +use crate::util::{deserialize_rfc1123, format_http_range}; +use crate::{ClientOptions, ObjectMeta, Result}; +use bytes::{Buf, Bytes}; +use chrono::{DateTime, Utc}; +use percent_encoding::percent_decode_str; +use reqwest::header::{CONTENT_TYPE, RANGE}; +use reqwest::{Method, Response, StatusCode}; +use serde::Deserialize; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::ops::Range; +use url::Url; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("Request error: {}", source))] + Request { source: retry::Error }, + + #[snafu(display("Request error: {}", source))] + Reqwest { source: reqwest::Error }, + + #[snafu(display("Error decoding PROPFIND response: {}", source))] + InvalidPropFind { source: quick_xml::de::DeError }, + + #[snafu(display("Missing content size for {}", href))] + MissingSize { href: String }, + + #[snafu(display("Error getting properties of \"{}\" got \"{}\"", href, status))] + PropStatus { href: String, status: String }, + + #[snafu(display("Failed to parse href \"{}\": {}", href, source))] + InvalidHref { + href: String, + source: url::ParseError, + }, + + #[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))] + NonUnicode { + path: String, + source: std::str::Utf8Error, + }, + + #[snafu(display("Encountered invalid path \"{}\": {}", path, source))] + InvalidPath { + path: String, + source: crate::path::Error, + }, +} + +impl From for crate::Error { + fn from(err: Error) -> Self { + Self::Generic { + store: "HTTP", + source: Box::new(err), + } + } +} + +/// Internal client for HttpStore +#[derive(Debug)] +pub struct Client { + url: Url, + client: reqwest::Client, + retry_config: RetryConfig, + client_options: ClientOptions, +} + +impl Client { + pub fn new( + url: Url, + client_options: ClientOptions, + retry_config: RetryConfig, + ) -> Result { + let client = client_options.client()?; + Ok(Self { + url, + retry_config, + client_options, + client, + }) + } + + pub fn base_url(&self) -> &Url { + &self.url + } + + fn path_url(&self, location: &Path) -> Url { + let mut url = self.url.clone(); + url.path_segments_mut().unwrap().extend(location.parts()); + url + } + + /// Create a directory with `path` using MKCOL + async fn make_directory(&self, path: &str) -> Result<(), Error> { + let method = Method::from_bytes(b"MKCOL").unwrap(); + let mut url = self.url.clone(); + url.path_segments_mut() + .unwrap() + .extend(path.split(DELIMITER)); + + self.client + .request(method, url) + .send_retry(&self.retry_config) + .await + .context(RequestSnafu)?; + + Ok(()) + } + + /// Recursively create parent directories + async fn create_parent_directories(&self, location: &Path) -> Result<()> { + let mut stack = vec![]; + + // Walk backwards until a request succeeds + let mut last_prefix = location.as_ref(); + while let Some((prefix, _)) = last_prefix.rsplit_once(DELIMITER) { + last_prefix = prefix; + + match self.make_directory(prefix).await { + Ok(_) => break, + Err(Error::Request { source }) + if matches!(source.status(), Some(StatusCode::CONFLICT)) => + { + // Need to create parent + stack.push(prefix) + } + Err(e) => return Err(e.into()), + } + } + + // Retry the failed requests, which should now succeed + for prefix in stack.into_iter().rev() { + self.make_directory(prefix).await?; + } + + Ok(()) + } + + pub async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + let mut retry = false; + loop { + let url = self.path_url(location); + 
let mut builder = self.client.put(url).body(bytes.clone()); + if let Some(value) = self.client_options.get_content_type(location) { + builder = builder.header(CONTENT_TYPE, value); + } + + match builder.send_retry(&self.retry_config).await { + Ok(_) => return Ok(()), + Err(source) => match source.status() { + // Some implementations return 404 instead of 409 + Some(StatusCode::CONFLICT | StatusCode::NOT_FOUND) if !retry => { + retry = true; + self.create_parent_directories(location).await? + } + _ => return Err(Error::Request { source }.into()), + }, + } + } + } + + pub async fn list( + &self, + location: Option<&Path>, + depth: &str, + ) -> Result { + let url = location + .map(|path| self.path_url(path)) + .unwrap_or_else(|| self.url.clone()); + + let method = Method::from_bytes(b"PROPFIND").unwrap(); + let result = self + .client + .request(method, url) + .header("Depth", depth) + .send_retry(&self.retry_config) + .await; + + let response = match result { + Ok(result) => result.bytes().await.context(ReqwestSnafu)?, + Err(e) if matches!(e.status(), Some(StatusCode::NOT_FOUND)) => { + return match depth { + "0" => { + let path = location.map(|x| x.as_ref()).unwrap_or(""); + Err(crate::Error::NotFound { + path: path.to_string(), + source: Box::new(e), + }) + } + _ => { + // If prefix not found, return empty result set + Ok(Default::default()) + } + }; + } + Err(source) => return Err(Error::Request { source }.into()), + }; + + let status = quick_xml::de::from_reader(response.reader()) + .context(InvalidPropFindSnafu)?; + Ok(status) + } + + pub async fn delete(&self, path: &Path) -> Result<()> { + let url = self.path_url(path); + self.client + .delete(url) + .send_retry(&self.retry_config) + .await + .context(RequestSnafu)?; + Ok(()) + } + + pub async fn get( + &self, + location: &Path, + range: Option>, + ) -> Result { + let url = self.path_url(location); + let mut builder = self.client.get(url); + + if let Some(range) = range { + builder = builder.header(RANGE, format_http_range(range)); + } + + builder + .send_retry(&self.retry_config) + .await + .map_err(|source| match source.status() { + Some(StatusCode::NOT_FOUND) => crate::Error::NotFound { + source: Box::new(source), + path: location.to_string(), + }, + _ => Error::Request { source }.into(), + }) + } + + pub async fn copy(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { + let from = self.path_url(from); + let to = self.path_url(to); + let method = Method::from_bytes(b"COPY").unwrap(); + + let mut builder = self + .client + .request(method, from) + .header("Destination", to.as_str()); + + if !overwrite { + builder = builder.header("Overwrite", "F"); + } + + match builder.send_retry(&self.retry_config).await { + Ok(_) => Ok(()), + Err(e) + if !overwrite + && matches!(e.status(), Some(StatusCode::PRECONDITION_FAILED)) => + { + Err(crate::Error::AlreadyExists { + path: to.to_string(), + source: Box::new(e), + }) + } + Err(source) => Err(Error::Request { source }.into()), + } + } +} + +/// The response returned by a PROPFIND request, i.e. 
list +#[derive(Deserialize, Default)] +pub struct MultiStatus { + pub response: Vec, +} + +#[derive(Deserialize)] +pub struct MultiStatusResponse { + href: String, + #[serde(rename = "propstat")] + prop_stat: PropStat, +} + +impl MultiStatusResponse { + /// Returns an error if this response is not OK + pub fn check_ok(&self) -> Result<()> { + match self.prop_stat.status.contains("200 OK") { + true => Ok(()), + false => Err(Error::PropStatus { + href: self.href.clone(), + status: self.prop_stat.status.clone(), + } + .into()), + } + } + + /// Returns the resolved path of this element relative to `base_url` + pub fn path(&self, base_url: &Url) -> Result { + let url = Url::options() + .base_url(Some(base_url)) + .parse(&self.href) + .context(InvalidHrefSnafu { href: &self.href })?; + + // Reverse any percent encoding + let path = percent_decode_str(url.path()) + .decode_utf8() + .context(NonUnicodeSnafu { path: url.path() })?; + + Ok(Path::parse(path.as_ref()).context(InvalidPathSnafu { path })?) + } + + fn size(&self) -> Result { + let size = self + .prop_stat + .prop + .content_length + .context(MissingSizeSnafu { href: &self.href })?; + Ok(size) + } + + /// Returns this objects metadata as [`ObjectMeta`] + pub fn object_meta(&self, base_url: &Url) -> Result { + Ok(ObjectMeta { + location: self.path(base_url)?, + last_modified: self.prop_stat.prop.last_modified, + size: self.size()?, + }) + } + + /// Returns true if this is a directory / collection + pub fn is_dir(&self) -> bool { + self.prop_stat.prop.resource_type.collection.is_some() + } +} + +#[derive(Deserialize)] +pub struct PropStat { + prop: Prop, + status: String, +} + +#[derive(Deserialize)] +pub struct Prop { + #[serde(deserialize_with = "deserialize_rfc1123", rename = "getlastmodified")] + last_modified: DateTime, + + #[serde(rename = "getcontentlength")] + content_length: Option, + + #[serde(rename = "resourcetype")] + resource_type: ResourceType, +} + +#[derive(Deserialize)] +pub struct ResourceType { + collection: Option<()>, +} diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs new file mode 100644 index 000000000000..25997d8924ec --- /dev/null +++ b/object_store/src/http/mod.rs @@ -0,0 +1,281 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An object store implementation for generic HTTP servers +//! +//! This follows [rfc2518] commonly known called [WebDAV] +//! +//! Basic get support will work out of the box with most HTTP servers, +//! even those that don't explicitly support [rfc2518] +//! +//! Other operations such as list, delete, copy, etc... will likely +//! require server-side configuration. A list of HTTP servers with support +//! can be found [here](https://wiki.archlinux.org/title/WebDAV#Server) +//! +//! 
Multipart uploads are not currently supported +//! +//! [rfc2518]: https://datatracker.ietf.org/doc/html/rfc2518 +//! [WebDAV]: https://en.wikipedia.org/wiki/WebDAV + +use std::ops::Range; + +use async_trait::async_trait; +use bytes::Bytes; +use futures::stream::BoxStream; +use futures::{StreamExt, TryStreamExt}; +use snafu::{OptionExt, ResultExt, Snafu}; +use tokio::io::AsyncWrite; +use url::Url; + +use crate::http::client::Client; +use crate::path::Path; +use crate::{ + ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + RetryConfig, +}; + +mod client; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("Must specify a URL"))] + MissingUrl, + + #[snafu(display("Invalid URL: {}", source))] + InvalidUrl { source: reqwest::Error }, + + #[snafu(display("Object is a directory"))] + IsDirectory, + + #[snafu(display("PROPFIND response contained no valid objects"))] + NoObjects, + + #[snafu(display("PROPFIND response contained more than one object"))] + MultipleObjects, + + #[snafu(display("Request error: {}", source))] + Reqwest { source: reqwest::Error }, +} + +impl From for crate::Error { + fn from(err: Error) -> Self { + Self::Generic { + store: "HTTP", + source: Box::new(err), + } + } +} + +/// An [`ObjectStore`] implementation for generic HTTP servers +/// +/// See [`crate::http`] for more information +#[derive(Debug)] +pub struct HttpStore { + client: Client, +} + +impl std::fmt::Display for HttpStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "HttpStore") + } +} + +#[async_trait] +impl ObjectStore for HttpStore { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.client.put(location, bytes).await + } + + async fn put_multipart( + &self, + _location: &Path, + ) -> Result<(MultipartId, Box)> { + Err(super::Error::NotImplemented) + } + + async fn abort_multipart( + &self, + _location: &Path, + _multipart_id: &MultipartId, + ) -> Result<()> { + Err(super::Error::NotImplemented) + } + + async fn get(&self, location: &Path) -> Result { + let response = self.client.get(location, None).await?; + let stream = response + .bytes_stream() + .map_err(|source| Error::Reqwest { source }.into()) + .boxed(); + + Ok(GetResult::Stream(stream)) + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + let bytes = self + .client + .get(location, Some(range)) + .await? 
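+            // Unlike `get`, which returns a streaming body, this buffers the
+            // ranged response fully into memory before returning it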
+ .bytes() + .await + .context(ReqwestSnafu)?; + Ok(bytes) + } + + async fn head(&self, location: &Path) -> Result { + let status = self.client.list(Some(location), "0").await?; + match status.response.len() { + 1 => { + let response = status.response.into_iter().next().unwrap(); + response.check_ok()?; + match response.is_dir() { + true => Err(Error::IsDirectory.into()), + false => response.object_meta(self.client.base_url()), + } + } + 0 => Err(Error::NoObjects.into()), + _ => Err(Error::MultipleObjects.into()), + } + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.client.delete(location).await + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + let status = self.client.list(prefix, "infinity").await?; + Ok(futures::stream::iter( + status + .response + .into_iter() + .filter(|r| !r.is_dir()) + .map(|response| { + response.check_ok()?; + response.object_meta(self.client.base_url()) + }), + ) + .boxed()) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let status = self.client.list(prefix, "1").await?; + let prefix_len = prefix.map(|p| p.as_ref().len()).unwrap_or(0); + + let mut objects: Vec = Vec::with_capacity(status.response.len()); + let mut common_prefixes = Vec::with_capacity(status.response.len()); + for response in status.response { + response.check_ok()?; + match response.is_dir() { + false => objects.push(response.object_meta(self.client.base_url())?), + true => { + let path = response.path(self.client.base_url())?; + // Exclude the current object + if path.as_ref().len() > prefix_len { + common_prefixes.push(path); + } + } + } + } + + Ok(ListResult { + common_prefixes, + objects, + }) + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.client.copy(from, to, true).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.client.copy(from, to, false).await + } +} + +/// Configure a connection to a generic HTTP server +#[derive(Debug, Default)] +pub struct HttpBuilder { + url: Option>, + client_options: ClientOptions, + retry_config: RetryConfig, +} + +impl HttpBuilder { + /// Create a new [`HttpBuilder`] with default values. 
+ pub fn new() -> Self { + Default::default() + } + + /// Set the URL + pub fn with_url(mut self, url: impl reqwest::IntoUrl) -> Self { + self.url = Some(url.into_url().context(InvalidUrlSnafu).map_err(Into::into)); + self + } + + /// Set the retry configuration + pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { + self.retry_config = retry_config; + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; + self + } + + /// Build an [`HttpStore`] with the configured options + pub fn build(self) -> Result { + let url = self.url.context(MissingUrlSnafu)??; + Ok(HttpStore { + client: Client::new(url, self.client_options, self.retry_config)?, + }) + } +} + +#[cfg(test)] +mod tests { + use crate::tests::*; + + use super::*; + + #[tokio::test] + async fn http_test() { + dotenv::dotenv().ok(); + let force = std::env::var("TEST_INTEGRATION"); + if force.is_err() { + eprintln!("skipping HTTP integration test - set TEST_INTEGRATION to run"); + return; + } + let url = std::env::var("HTTP_URL").expect("HTTP_URL must be set"); + let options = ClientOptions::new().with_allow_http(true); + let integration = HttpBuilder::new() + .with_url(url) + .with_client_options(options) + .build() + .unwrap(); + + put_get_delete_list_opts(&integration, false).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + } +} diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 6078c1c93cdf..0c416c26b78d 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -45,6 +45,10 @@ feature = "azure", doc = "* [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/): [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)" )] +#![cfg_attr( + feature = "http", + doc = "* [HTTP Storage](https://datatracker.ietf.org/doc/html/rfc2518): [`HttpBuilder`](http::HttpBuilder)" +)] //! * In Memory: [`InMemory`](memory::InMemory) //! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem) //! @@ -177,6 +181,8 @@ pub mod chunked; pub mod delimited; #[cfg(feature = "gcp")] pub mod gcp; +#[cfg(feature = "http")] +pub mod http; pub mod limit; #[cfg(not(target_arch = "wasm32"))] pub mod local; @@ -185,10 +191,10 @@ pub mod path; pub mod prefix; pub mod throttle; -#[cfg(any(feature = "gcp", feature = "aws", feature = "azure"))] +#[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] mod client; -#[cfg(any(feature = "gcp", feature = "aws", feature = "azure"))] +#[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] pub use client::{backoff::BackoffConfig, retry::RetryConfig}; #[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] @@ -210,7 +216,7 @@ use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; use tokio::io::AsyncWrite; -#[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] +#[cfg(any(feature = "azure", feature = "aws", feature = "gcp", feature = "http"))] pub use client::ClientOptions; /// An alias for a dynamically dispatched object store implementation. 
@@ -1003,7 +1009,7 @@ mod tests { let paths = flatten_list_stream(storage, None).await.unwrap(); for f in &paths { - let _ = storage.delete(f).await; + storage.delete(f).await.unwrap(); } } diff --git a/object_store/src/util.rs b/object_store/src/util.rs index 41c72d012b5a..e592e7b64f2d 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -20,6 +20,22 @@ use super::Result; use bytes::Bytes; use futures::{stream::StreamExt, Stream, TryStreamExt}; +#[cfg(any(feature = "azure", feature = "http"))] +pub static RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; + +// deserialize dates according to rfc1123 +#[cfg(any(feature = "azure", feature = "http"))] +pub fn deserialize_rfc1123<'de, D>( + deserializer: D, +) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + let s: String = serde::Deserialize::deserialize(deserializer)?; + chrono::TimeZone::datetime_from_str(&chrono::Utc, &s, RFC1123_FMT) + .map_err(serde::de::Error::custom) +} + /// Returns the prefix to be passed to an object store #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub fn format_prefix(prefix: Option<&crate::path::Path>) -> Option { @@ -30,7 +46,7 @@ pub fn format_prefix(prefix: Option<&crate::path::Path>) -> Option { /// Returns a formatted HTTP range header as per /// -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure", feature = "http"))] pub fn format_http_range(range: std::ops::Range) -> String { format!("bytes={}-{}", range.start, range.end.saturating_sub(1)) } From af07998aefa765f8bece0bcbb72d06996537cafc Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 22 Dec 2022 10:21:15 -0800 Subject: [PATCH 0442/1411] Fix broken FlightSQL example (#3387) * Fix flight sql example * Add into CI --- .github/workflows/arrow_flight.yml | 3 +++ arrow-flight/examples/flight_sql_server.rs | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index fb3e9f577d5a..a0e9f38b02d1 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -56,6 +56,9 @@ jobs: - name: Test --all-features run: | cargo test -p arrow-flight --all-features + - name: Test --examples + run: | + cargo test -p arrow-flight --features=flight-sql-experimental --examples - name: Verify workspace clean run: git diff --exit-code diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 0d06aa664ec8..ae015001f0a1 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -482,8 +482,14 @@ impl ProstMessageExt for FetchResults { mod tests { use super::*; use futures::TryStreamExt; + use std::fs; + use tempfile::NamedTempFile; + use tokio::net::{UnixListener, UnixStream}; + use tokio_stream::wrappers::UnixListenerStream; + use tonic::transport::Endpoint; use arrow::util::pretty::pretty_format_batches; + use arrow_flight::sql::client::FlightSqlServiceClient; use arrow_flight::utils::flight_data_to_batches; use tower::service_fn; From 17b3210af2ccd190489de9c641fd10f009abd45b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 23 Dec 2022 07:23:37 -0500 Subject: [PATCH 0443/1411] Initial Mid-level `FlightClient` (#3378) * Mid-level FlightClient * cleanup * fixup for use of Bytes * clippy * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * fixup * BoxStream Co-authored-by: Raphael 
Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-flight/src/client.rs | 567 ++++++++++++++++++++++++++++ arrow-flight/src/error.rs | 59 +++ arrow-flight/src/lib.rs | 7 + arrow-flight/tests/client.rs | 309 +++++++++++++++ arrow-flight/tests/common/server.rs | 212 +++++++++++ 5 files changed, 1154 insertions(+) create mode 100644 arrow-flight/src/client.rs create mode 100644 arrow-flight/src/error.rs create mode 100644 arrow-flight/tests/client.rs create mode 100644 arrow-flight/tests/common/server.rs diff --git a/arrow-flight/src/client.rs b/arrow-flight/src/client.rs new file mode 100644 index 000000000000..0e75ac7c0c7f --- /dev/null +++ b/arrow-flight/src/client.rs @@ -0,0 +1,567 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::{ + flight_service_client::FlightServiceClient, utils::flight_data_to_arrow_batch, + FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, Ticket, +}; +use arrow_array::{ArrayRef, RecordBatch}; +use arrow_schema::Schema; +use bytes::Bytes; +use futures::{future::ready, ready, stream, StreamExt}; +use std::{collections::HashMap, convert::TryFrom, pin::Pin, sync::Arc, task::Poll}; +use tonic::{metadata::MetadataMap, transport::Channel, Streaming}; + +use crate::error::{FlightError, Result}; + +/// A "Mid level" [Apache Arrow Flight](https://arrow.apache.org/docs/format/Flight.html) client. +/// +/// [`FlightClient`] is intended as a convenience for interactions +/// with Arrow Flight servers. For more direct control, such as access +/// to the response headers, use [`FlightServiceClient`] directly +/// via methods such as [`Self::inner`] or [`Self::into_inner`]. 
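+///
+/// For example, a minimal sketch of dropping down to the generated client
+/// (assuming a `channel` that is already connected) might look like:
+/// ```no_run
+/// # async fn run() {
+/// # use arrow_flight::{Criteria, FlightClient};
+/// # use tonic::transport::Channel;
+/// # let channel = Channel::from_static("http://localhost:1234")
+/// #     .connect()
+/// #     .await
+/// #     .expect("error connecting");
+/// let mut client = FlightClient::new(channel);
+///
+/// // Call the generated tonic client directly to inspect response headers
+/// let response = client
+///     .inner_mut()
+///     .list_flights(Criteria::default())
+///     .await
+///     .expect("error listing flights");
+/// println!("response headers: {:?}", response.metadata());
+/// # }
+/// ```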
+/// +/// # Example: +/// ```no_run +/// # async fn run() { +/// # use arrow_flight::FlightClient; +/// # use bytes::Bytes; +/// use tonic::transport::Channel; +/// let channel = Channel::from_static("http://localhost:1234") +/// .connect() +/// .await +/// .expect("error connecting"); +/// +/// let mut client = FlightClient::new(channel); +/// +/// // Send 'Hi' bytes as the handshake request to the server +/// let response = client +/// .handshake(Bytes::from("Hi")) +/// .await +/// .expect("error handshaking"); +/// +/// // Expect the server responded with 'Ho' +/// assert_eq!(response, Bytes::from("Ho")); +/// # } +/// ``` +#[derive(Debug)] +pub struct FlightClient { + /// Optional grpc header metadata to include with each request + metadata: MetadataMap, + + /// The inner client + inner: FlightServiceClient, +} + +impl FlightClient { + /// Creates a client client with the provided [`Channel`](tonic::transport::Channel) + pub fn new(channel: Channel) -> Self { + Self::new_from_inner(FlightServiceClient::new(channel)) + } + + /// Creates a new higher level client with the provided lower level client + pub fn new_from_inner(inner: FlightServiceClient) -> Self { + Self { + metadata: MetadataMap::new(), + inner, + } + } + + /// Return a reference to gRPC metadata included with each request + pub fn metadata(&self) -> &MetadataMap { + &self.metadata + } + + /// Return a reference to gRPC metadata included with each request + /// + /// These headers can be used, for example, to include + /// authorization or other application specific headers. + pub fn metadata_mut(&mut self) -> &mut MetadataMap { + &mut self.metadata + } + + /// Add the specified header with value to all subsequent + /// requests. See [`Self::metadata_mut`] for fine grained control. + pub fn add_header(&mut self, key: &str, value: &str) -> Result<()> { + let key = tonic::metadata::MetadataKey::<_>::from_bytes(key.as_bytes()) + .map_err(|e| FlightError::ExternalError(Box::new(e)))?; + + let value = value + .parse() + .map_err(|e| FlightError::ExternalError(Box::new(e)))?; + + // ignore previous value + self.metadata.insert(key, value); + + Ok(()) + } + + /// Return a reference to the underlying tonic + /// [`FlightServiceClient`] + pub fn inner(&self) -> &FlightServiceClient { + &self.inner + } + + /// Return a mutable reference to the underlying tonic + /// [`FlightServiceClient`] + pub fn inner_mut(&mut self) -> &mut FlightServiceClient { + &mut self.inner + } + + /// Consume this client and return the underlying tonic + /// [`FlightServiceClient`] + pub fn into_inner(self) -> FlightServiceClient { + self.inner + } + + /// Perform an Arrow Flight handshake with the server, sending + /// `payload` as the [`HandshakeRequest`] payload and returning + /// the [`HandshakeResponse`](crate::HandshakeResponse) + /// bytes returned from the server + /// + /// See [`FlightClient`] docs for an example. + pub async fn handshake(&mut self, payload: impl Into) -> Result { + let request = HandshakeRequest { + protocol_version: 0, + payload: payload.into(), + }; + + // apply headers, etc + let request = self.make_request(stream::once(ready(request))); + + let mut response_stream = self.inner.handshake(request).await?.into_inner(); + + if let Some(response) = response_stream.next().await.transpose()? 
{ + // check if there is another response + if response_stream.next().await.is_some() { + return Err(FlightError::protocol( + "Got unexpected second response from handshake", + )); + } + + Ok(response.payload) + } else { + Err(FlightError::protocol("No response from handshake")) + } + } + + /// Make a `DoGet` call to the server with the provided ticket, + /// returning a [`FlightRecordBatchStream`] for reading + /// [`RecordBatch`]es. + /// + /// # Example: + /// ```no_run + /// # async fn run() { + /// # use bytes::Bytes; + /// # use arrow_flight::FlightClient; + /// # use arrow_flight::Ticket; + /// # use arrow_array::RecordBatch; + /// # use tonic::transport::Channel; + /// # use futures::stream::TryStreamExt; + /// # let channel = Channel::from_static("http://localhost:1234") + /// # .connect() + /// # .await + /// # .expect("error connecting"); + /// # let ticket = Ticket { ticket: Bytes::from("foo") }; + /// let mut client = FlightClient::new(channel); + /// + /// // Invoke a do_get request on the server with a previously + /// // received Ticket + /// + /// let response = client + /// .do_get(ticket) + /// .await + /// .expect("error invoking do_get"); + /// + /// // Use try_collect to get the RecordBatches from the server + /// let batches: Vec = response + /// .try_collect() + /// .await + /// .expect("no stream errors"); + /// # } + /// ``` + pub async fn do_get(&mut self, ticket: Ticket) -> Result { + let request = self.make_request(ticket); + + let response = self.inner.do_get(request).await?.into_inner(); + + let flight_data_stream = FlightDataStream::new(response); + Ok(FlightRecordBatchStream::new(flight_data_stream)) + } + + /// Make a `GetFlightInfo` call to the server with the provided + /// [`FlightDescriptor`] and return the [`FlightInfo`] from the + /// server. The [`FlightInfo`] can be used with [`Self::do_get`] + /// to retrieve the requested batches. 
+ /// + /// # Example: + /// ```no_run + /// # async fn run() { + /// # use arrow_flight::FlightClient; + /// # use arrow_flight::FlightDescriptor; + /// # use tonic::transport::Channel; + /// # let channel = Channel::from_static("http://localhost:1234") + /// # .connect() + /// # .await + /// # .expect("error connecting"); + /// let mut client = FlightClient::new(channel); + /// + /// // Send a 'CMD' request to the server + /// let request = FlightDescriptor::new_cmd(b"MOAR DATA".to_vec()); + /// let flight_info = client + /// .get_flight_info(request) + /// .await + /// .expect("error handshaking"); + /// + /// // retrieve the first endpoint from the returned flight info + /// let ticket = flight_info + /// .endpoint[0] + /// // Extract the ticket + /// .ticket + /// .clone() + /// .expect("expected ticket"); + /// + /// // Retrieve the corresponding RecordBatch stream with do_get + /// let data = client + /// .do_get(ticket) + /// .await + /// .expect("error fetching data"); + /// # } + /// ``` + pub async fn get_flight_info( + &mut self, + descriptor: FlightDescriptor, + ) -> Result { + let request = self.make_request(descriptor); + + let response = self.inner.get_flight_info(request).await?.into_inner(); + Ok(response) + } + + // TODO other methods + // list_flights + // get_schema + // do_put + // do_action + // list_actions + // do_exchange + + /// return a Request, adding any configured metadata + fn make_request(&self, t: T) -> tonic::Request { + // Pass along metadata + let mut request = tonic::Request::new(t); + *request.metadata_mut() = self.metadata.clone(); + request + } +} + +/// A stream of [`RecordBatch`]es from from an Arrow Flight server. +/// +/// To access the lower level Flight messages directly, consider +/// calling [`Self::into_inner`] and using the [`FlightDataStream`] +/// directly. +#[derive(Debug)] +pub struct FlightRecordBatchStream { + inner: FlightDataStream, + got_schema: bool, +} + +impl FlightRecordBatchStream { + pub fn new(inner: FlightDataStream) -> Self { + Self { + inner, + got_schema: false, + } + } + + /// Has a message defining the schema been received yet? + pub fn got_schema(&self) -> bool { + self.got_schema + } + + /// Consume self and return the wrapped [`FlightDataStream`] + pub fn into_inner(self) -> FlightDataStream { + self.inner + } +} +impl futures::Stream for FlightRecordBatchStream { + type Item = Result; + + /// Returns the next [`RecordBatch`] available in this stream, or `None` if + /// there are no further results available. + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll>> { + loop { + let res = ready!(self.inner.poll_next_unpin(cx)); + match res { + // Inner exhausted + None => { + return Poll::Ready(None); + } + Some(Err(e)) => { + return Poll::Ready(Some(Err(e))); + } + // translate data + Some(Ok(data)) => match data.payload { + DecodedPayload::Schema(_) if self.got_schema => { + return Poll::Ready(Some(Err(FlightError::protocol( + "Unexpectedly saw multiple Schema messages in FlightData stream", + )))); + } + DecodedPayload::Schema(_) => { + self.got_schema = true; + // Need next message, poll inner again + } + DecodedPayload::RecordBatch(batch) => { + return Poll::Ready(Some(Ok(batch))); + } + DecodedPayload::None => { + // Need next message + } + }, + } + } + } +} + +/// Wrapper around a stream of [`FlightData`] that handles the details +/// of decoding low level Flight messages into [`Schema`] and +/// [`RecordBatch`]es, including details such as dictionaries. 
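+///
+/// A minimal sketch of consuming the decoded messages directly (assuming
+/// `response` is a `Streaming<FlightData>` obtained from the lower level
+/// client) might look like:
+/// ```no_run
+/// # use arrow_flight::FlightData;
+/// # use arrow_flight::client::{DecodedPayload, FlightDataStream};
+/// # use futures::StreamExt;
+/// # async fn run(response: tonic::Streaming<FlightData>) {
+/// let mut stream = FlightDataStream::new(response);
+/// while let Some(decoded) = stream.next().await {
+///     match decoded.expect("decode error").payload {
+///         DecodedPayload::Schema(schema) => println!("schema: {schema:?}"),
+///         DecodedPayload::RecordBatch(batch) => println!("{} rows", batch.num_rows()),
+///         DecodedPayload::None => {} // metadata-only message
+///     }
+/// }
+/// # }
+/// ```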
+///
+/// # Protocol Details
+///
+/// The client handles flight messages as follows:
+///
+/// - **None:** This message has no effect. This is useful to
+/// transmit metadata without any actual payload.
+///
+/// - **Schema:** The schema is (re-)set. Dictionaries are cleared and
+/// the decoded schema is returned.
+///
+/// - **Dictionary Batch:** A new dictionary for a given column is registered. An existing
+/// dictionary for the same column will be overwritten. This
+/// message is NOT visible to the caller.
+///
+/// - **Record Batch:** Record batch is created based on the current
+/// schema and dictionaries. This fails if no schema was transmitted
+/// yet.
+///
+/// All other message types (at the time of writing: e.g. tensor and
+/// sparse tensor) lead to an error.
+///
+/// # Example use cases
+///
+/// 1. Using this low level stream it is possible to receive a stream
+/// of RecordBatches in FlightData that have different schemas by
+/// handling multiple schema messages separately.
+#[derive(Debug)]
+pub struct FlightDataStream {
+    /// Underlying data stream
+    response: Streaming<FlightData>,
+    /// Decoding state
+    state: Option<FlightStreamState>,
+    /// Seen the end of the inner stream?
+    done: bool,
+}
+
+impl FlightDataStream {
+    /// Create a new wrapper around the stream of FlightData
+    pub fn new(response: Streaming<FlightData>) -> Self {
+        Self {
+            state: None,
+            response,
+            done: false,
+        }
+    }
+
+    /// Extracts flight data from the next message, updating decoding
+    /// state as necessary.
+    fn extract_message(&mut self, data: FlightData) -> Result<Option<DecodedFlightData>> {
+        use arrow_ipc::MessageHeader;
+        let message = arrow_ipc::root_as_message(&data.data_header[..]).map_err(|e| {
+            FlightError::DecodeError(format!("Error decoding root message: {e}"))
+        })?;
+
+        match message.header_type() {
+            MessageHeader::NONE => Ok(Some(DecodedFlightData::new_none(data))),
+            MessageHeader::Schema => {
+                let schema = Schema::try_from(&data).map_err(|e| {
+                    FlightError::DecodeError(format!("Error decoding schema: {e}"))
+                })?;
+
+                let schema = Arc::new(schema);
+                let dictionaries_by_field = HashMap::new();
+
+                self.state = Some(FlightStreamState {
+                    schema: Arc::clone(&schema),
+                    dictionaries_by_field,
+                });
+                Ok(Some(DecodedFlightData::new_schema(data, schema)))
+            }
+            MessageHeader::DictionaryBatch => {
+                let state = if let Some(state) = self.state.as_mut() {
+                    state
+                } else {
+                    return Err(FlightError::protocol(
+                        "Received DictionaryBatch prior to Schema",
+                    ));
+                };
+
+                let buffer: arrow_buffer::Buffer = data.data_body.into();
+                let dictionary_batch =
+                    message.header_as_dictionary_batch().ok_or_else(|| {
+                        FlightError::protocol(
+                            "Could not get dictionary batch from DictionaryBatch message",
+                        )
+                    })?;
+
+                arrow_ipc::reader::read_dictionary(
+                    &buffer,
+                    dictionary_batch,
+                    &state.schema,
+                    &mut state.dictionaries_by_field,
+                    &message.version(),
+                )
+                .map_err(|e| {
+                    FlightError::DecodeError(format!(
+                        "Error decoding ipc dictionary: {e}"
+                    ))
+                })?;
+
+                // Updated internal state, but no decoded message
+                Ok(None)
+            }
+            MessageHeader::RecordBatch => {
+                let state = if let Some(state) = self.state.as_ref() {
+                    state
+                } else {
+                    return Err(FlightError::protocol(
+                        "Received RecordBatch prior to Schema",
+                    ));
+                };
+
+                let batch = flight_data_to_arrow_batch(
+                    &data,
+                    Arc::clone(&state.schema),
+                    &state.dictionaries_by_field,
+                )
+                .map_err(|e| {
+                    FlightError::DecodeError(format!(
+                        "Error decoding ipc RecordBatch: {e}"
+                    ))
+                })?;
+
+                Ok(Some(DecodedFlightData::new_record_batch(data, batch)))
+            }
+            other => {
+                let name =
other.variant_name().unwrap_or("UNKNOWN"); + Err(FlightError::protocol(format!("Unexpected message: {name}"))) + } + } + } +} + +impl futures::Stream for FlightDataStream { + type Item = Result; + /// Returns the result of decoding the next [`FlightData`] message + /// from the server, or `None` if there are no further results + /// available. + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + if self.done { + return Poll::Ready(None); + } + loop { + let res = ready!(self.response.poll_next_unpin(cx)); + + return Poll::Ready(match res { + None => { + self.done = true; + None // inner is exhausted + } + Some(data) => Some(match data { + Err(e) => Err(FlightError::Tonic(e)), + Ok(data) => match self.extract_message(data) { + Ok(Some(extracted)) => Ok(extracted), + Ok(None) => continue, // Need next input message + Err(e) => Err(e), + }, + }), + }); + } + } +} + +/// tracks the state needed to reconstruct [`RecordBatch`]es from a +/// streaming flight response. +#[derive(Debug)] +struct FlightStreamState { + schema: Arc, + dictionaries_by_field: HashMap, +} + +/// FlightData and the decoded payload (Schema, RecordBatch), if any +#[derive(Debug)] +pub struct DecodedFlightData { + pub inner: FlightData, + pub payload: DecodedPayload, +} + +impl DecodedFlightData { + pub fn new_none(inner: FlightData) -> Self { + Self { + inner, + payload: DecodedPayload::None, + } + } + + pub fn new_schema(inner: FlightData, schema: Arc) -> Self { + Self { + inner, + payload: DecodedPayload::Schema(schema), + } + } + + pub fn new_record_batch(inner: FlightData, batch: RecordBatch) -> Self { + Self { + inner, + payload: DecodedPayload::RecordBatch(batch), + } + } + + /// return the metadata field of the inner flight data + pub fn app_metadata(&self) -> &[u8] { + &self.inner.app_metadata + } +} + +/// The result of decoding [`FlightData`] +#[derive(Debug)] +pub enum DecodedPayload { + /// None (no data was sent in the corresponding FlightData) + None, + + /// A decoded Schema message + Schema(Arc), + + /// A decoded Record batch. + RecordBatch(RecordBatch), +} diff --git a/arrow-flight/src/error.rs b/arrow-flight/src/error.rs new file mode 100644 index 000000000000..fbb9efa44c24 --- /dev/null +++ b/arrow-flight/src/error.rs @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// Errors for the Apache Arrow Flight crate +#[derive(Debug)] +pub enum FlightError { + /// Returned when functionality is not yet available. 
+    NotYetImplemented(String),
+    /// Error from the underlying tonic library
+    Tonic(tonic::Status),
+    /// Some unexpected message was received
+    ProtocolError(String),
+    /// An error occurred during decoding
+    DecodeError(String),
+    /// Some other (opaque) error
+    ExternalError(Box<dyn std::error::Error + Send + Sync>),
+}
+
+impl FlightError {
+    pub fn protocol(message: impl Into<String>) -> Self {
+        Self::ProtocolError(message.into())
+    }
+
+    /// Wraps an external error in a `FlightError`.
+    pub fn from_external_error(error: Box<dyn std::error::Error + Send + Sync>) -> Self {
+        Self::ExternalError(error)
+    }
+}
+
+impl std::fmt::Display for FlightError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // TODO better format / error
+        write!(f, "{:?}", self)
+    }
+}
+
+impl std::error::Error for FlightError {}
+
+impl From<tonic::Status> for FlightError {
+    fn from(status: tonic::Status) -> Self {
+        Self::Tonic(status)
+    }
+}
+
+pub type Result<T> = std::result::Result<T, FlightError>;
diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs
index 051509fb16e2..f30cb54844da 100644
--- a/arrow-flight/src/lib.rs
+++ b/arrow-flight/src/lib.rs
@@ -71,6 +71,13 @@ pub mod flight_service_server {
     pub use gen::flight_service_server::FlightServiceServer;
 }
 
+/// Mid Level [`FlightClient`] for Apache Arrow Flight
+pub mod client;
+pub use client::FlightClient;
+
+/// Common error types
+pub mod error;
+
 pub use gen::Action;
 pub use gen::ActionType;
 pub use gen::BasicAuth;
diff --git a/arrow-flight/tests/client.rs b/arrow-flight/tests/client.rs
new file mode 100644
index 000000000000..5bc1062f046d
--- /dev/null
+++ b/arrow-flight/tests/client.rs
@@ -0,0 +1,309 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//!
Integration test for "mid level" Client + +mod common { + pub mod server; +} +use arrow_flight::{ + error::FlightError, FlightClient, FlightDescriptor, FlightInfo, HandshakeRequest, + HandshakeResponse, +}; +use bytes::Bytes; +use common::server::TestFlightServer; +use futures::Future; +use tokio::{net::TcpListener, task::JoinHandle}; +use tonic::{ + transport::{Channel, Uri}, + Status, +}; + +use std::{net::SocketAddr, time::Duration}; + +const DEFAULT_TIMEOUT_SECONDS: u64 = 30; + +#[tokio::test] +async fn test_handshake() { + do_test(|test_server, mut client| async move { + let request_payload = Bytes::from("foo"); + let response_payload = Bytes::from("Bar"); + + let request = HandshakeRequest { + payload: request_payload.clone(), + protocol_version: 0, + }; + + let response = HandshakeResponse { + payload: response_payload.clone(), + protocol_version: 0, + }; + + test_server.set_handshake_response(Ok(response)); + let response = client.handshake(request_payload).await.unwrap(); + assert_eq!(response, response_payload); + assert_eq!(test_server.take_handshake_request(), Some(request)); + }) + .await; +} + +#[tokio::test] +async fn test_handshake_error() { + do_test(|test_server, mut client| async move { + let request_payload = "foo".to_string().into_bytes(); + let e = Status::unauthenticated("DENIED"); + test_server.set_handshake_response(Err(e)); + + let response = client.handshake(request_payload).await.unwrap_err(); + let e = Status::unauthenticated("DENIED"); + expect_status(response, e); + }) + .await; +} + +#[tokio::test] +async fn test_handshake_metadata() { + do_test(|test_server, mut client| async move { + client.add_header("foo", "bar").unwrap(); + + let request_payload = Bytes::from("Blarg"); + let response_payload = Bytes::from("Bazz"); + + let response = HandshakeResponse { + payload: response_payload.clone(), + protocol_version: 0, + }; + + test_server.set_handshake_response(Ok(response)); + client.handshake(request_payload).await.unwrap(); + ensure_metadata(&client, &test_server); + }) + .await; +} + +/// Verifies that all headers sent from the the client are in the request_metadata +fn ensure_metadata(client: &FlightClient, test_server: &TestFlightServer) { + let client_metadata = client.metadata().clone().into_headers(); + assert!(!client_metadata.is_empty()); + let metadata = test_server + .take_last_request_metadata() + .expect("No headers in server") + .into_headers(); + + for (k, v) in &client_metadata { + assert_eq!( + metadata.get(k).as_ref(), + Some(&v), + "Missing / Mismatched metadata {:?} sent {:?} got {:?}", + k, + client_metadata, + metadata + ); + } +} + +fn test_flight_info(request: &FlightDescriptor) -> FlightInfo { + FlightInfo { + schema: Bytes::new(), + endpoint: vec![], + flight_descriptor: Some(request.clone()), + total_bytes: 123, + total_records: 456, + } +} + +#[tokio::test] +async fn test_get_flight_info() { + do_test(|test_server, mut client| async move { + let request = FlightDescriptor::new_cmd(b"My Command".to_vec()); + + let expected_response = test_flight_info(&request); + test_server.set_get_flight_info_response(Ok(expected_response.clone())); + + let response = client.get_flight_info(request.clone()).await.unwrap(); + + assert_eq!(response, expected_response); + assert_eq!(test_server.take_get_flight_info_request(), Some(request)); + }) + .await; +} + +#[tokio::test] +async fn test_get_flight_info_error() { + do_test(|test_server, mut client| async move { + let request = FlightDescriptor::new_cmd(b"My Command".to_vec()); + + let e = 
Status::unauthenticated("DENIED"); + test_server.set_get_flight_info_response(Err(e)); + + let response = client.get_flight_info(request.clone()).await.unwrap_err(); + let e = Status::unauthenticated("DENIED"); + expect_status(response, e); + }) + .await; +} + +#[tokio::test] +async fn test_get_flight_info_metadata() { + do_test(|test_server, mut client| async move { + client.add_header("foo", "bar").unwrap(); + let request = FlightDescriptor::new_cmd(b"My Command".to_vec()); + + let expected_response = test_flight_info(&request); + test_server.set_get_flight_info_response(Ok(expected_response)); + client.get_flight_info(request.clone()).await.unwrap(); + ensure_metadata(&client, &test_server); + }) + .await; +} + +// TODO more negative tests (like if there are endpoints defined, etc) + +// TODO test for do_get + +/// Runs the future returned by the function, passing it a test server and client +async fn do_test(f: F) +where + F: Fn(TestFlightServer, FlightClient) -> Fut, + Fut: Future, +{ + let test_server = TestFlightServer::new(); + let fixture = TestFixture::new(&test_server).await; + let client = FlightClient::new(fixture.channel().await); + + // run the test function + f(test_server, client).await; + + // cleanly shutdown the test fixture + fixture.shutdown_and_wait().await +} + +fn expect_status(error: FlightError, expected: Status) { + let status = if let FlightError::Tonic(status) = error { + status + } else { + panic!("Expected FlightError::Tonic, got: {:?}", error); + }; + + assert_eq!( + status.code(), + expected.code(), + "Got {:?} want {:?}", + status, + expected + ); + assert_eq!( + status.message(), + expected.message(), + "Got {:?} want {:?}", + status, + expected + ); + assert_eq!( + status.details(), + expected.details(), + "Got {:?} want {:?}", + status, + expected + ); +} + +/// Creates and manages a running TestServer with a background task +struct TestFixture { + /// channel to send shutdown command + shutdown: Option>, + + /// Address the server is listening on + addr: SocketAddr, + + // handle for the server task + handle: Option>>, +} + +impl TestFixture { + /// create a new test fixture from the server + pub async fn new(test_server: &TestFlightServer) -> Self { + // let OS choose a a free port + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + println!("Listening on {addr}"); + + // prepare the shutdown channel + let (tx, rx) = tokio::sync::oneshot::channel(); + + let server_timeout = Duration::from_secs(DEFAULT_TIMEOUT_SECONDS); + + let shutdown_future = async move { + rx.await.ok(); + }; + + let serve_future = tonic::transport::Server::builder() + .timeout(server_timeout) + .add_service(test_server.service()) + .serve_with_incoming_shutdown( + tokio_stream::wrappers::TcpListenerStream::new(listener), + shutdown_future, + ); + + // Run the server in its own background task + let handle = tokio::task::spawn(serve_future); + + Self { + shutdown: Some(tx), + addr, + handle: Some(handle), + } + } + + /// Return a [`Channel`] connected to the TestServer + pub async fn channel(&self) -> Channel { + let url = format!("http://{}", self.addr); + let uri: Uri = url.parse().expect("Valid URI"); + Channel::builder(uri) + .timeout(Duration::from_secs(DEFAULT_TIMEOUT_SECONDS)) + .connect() + .await + .expect("error connecting to server") + } + + /// Stops the test server and waits for the server to shutdown + pub async fn shutdown_and_wait(mut self) { + if let Some(shutdown) = self.shutdown.take() { + 
shutdown.send(()).expect("server quit early"); + } + if let Some(handle) = self.handle.take() { + println!("Waiting on server to finish"); + handle + .await + .expect("task join error (panic?)") + .expect("Server Error found at shutdown"); + } + } +} + +impl Drop for TestFixture { + fn drop(&mut self) { + if let Some(shutdown) = self.shutdown.take() { + shutdown.send(()).ok(); + } + if self.handle.is_some() { + // tests should properly clean up TestFixture + println!("TestFixture::Drop called prior to `shutdown_and_wait`"); + } + } +} diff --git a/arrow-flight/tests/common/server.rs b/arrow-flight/tests/common/server.rs new file mode 100644 index 000000000000..f1cb140b68c7 --- /dev/null +++ b/arrow-flight/tests/common/server.rs @@ -0,0 +1,212 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::{Arc, Mutex}; + +use futures::stream::BoxStream; +use tonic::{metadata::MetadataMap, Request, Response, Status, Streaming}; + +use arrow_flight::{ + flight_service_server::{FlightService, FlightServiceServer}, + Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, + HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, +}; + +#[derive(Debug, Clone)] +/// Flight server for testing, with configurable responses +pub struct TestFlightServer { + /// Shared state to configure responses + state: Arc>, +} + +impl TestFlightServer { + /// Create a `TestFlightServer` + pub fn new() -> Self { + Self { + state: Arc::new(Mutex::new(State::new())), + } + } + + /// Return an [`FlightServiceServer`] that can be used with a + /// [`Server`](tonic::transport::Server) + pub fn service(&self) -> FlightServiceServer { + // wrap up tonic goop + FlightServiceServer::new(self.clone()) + } + + /// Specify the response returned from the next call to handshake + pub fn set_handshake_response(&self, response: Result) { + let mut state = self.state.lock().expect("mutex not poisoned"); + + state.handshake_response.replace(response); + } + + /// Take and return last handshake request send to the server, + pub fn take_handshake_request(&self) -> Option { + self.state + .lock() + .expect("mutex not poisoned") + .handshake_request + .take() + } + + /// Specify the response returned from the next call to handshake + pub fn set_get_flight_info_response(&self, response: Result) { + let mut state = self.state.lock().expect("mutex not poisoned"); + + state.get_flight_info_response.replace(response); + } + + /// Take and return last get_flight_info request send to the server, + pub fn take_get_flight_info_request(&self) -> Option { + self.state + .lock() + .expect("mutex not poisoned") + .get_flight_info_request + .take() + } + + /// Returns the last metadata from a request received by the server + pub fn 
take_last_request_metadata(&self) -> Option { + self.state + .lock() + .expect("mutex not poisoned") + .last_request_metadata + .take() + } + + /// Save the last request's metadatacom + fn save_metadata(&self, request: &Request) { + let metadata = request.metadata().clone(); + let mut state = self.state.lock().expect("mutex not poisoned"); + state.last_request_metadata = Some(metadata); + } +} + +/// mutable state for the TestFlightSwrver +#[derive(Debug, Default)] +struct State { + /// The last handshake request that was received + pub handshake_request: Option, + /// The next response to return from `handshake()` + pub handshake_response: Option>, + /// The last `get_flight_info` request received + pub get_flight_info_request: Option, + /// the next response to return from `get_flight_info` + pub get_flight_info_response: Option>, + /// The last request headers received + pub last_request_metadata: Option, +} + +impl State { + fn new() -> Self { + Default::default() + } +} + +/// Implement the FlightService trait +#[tonic::async_trait] +impl FlightService for TestFlightServer { + type HandshakeStream = BoxStream<'static, Result>; + type ListFlightsStream = BoxStream<'static, Result>; + type DoGetStream = BoxStream<'static, Result>; + type DoPutStream = BoxStream<'static, Result>; + type DoActionStream = BoxStream<'static, Result>; + type ListActionsStream = BoxStream<'static, Result>; + type DoExchangeStream = BoxStream<'static, Result>; + + async fn handshake( + &self, + request: Request>, + ) -> Result, Status> { + self.save_metadata(&request); + let handshake_request = request.into_inner().message().await?.unwrap(); + + let mut state = self.state.lock().expect("mutex not poisoned"); + state.handshake_request = Some(handshake_request); + + let response = state.handshake_response.take().unwrap_or_else(|| { + Err(Status::internal("No handshake response configured")) + })?; + + // turn into a streaming response + let output = futures::stream::iter(std::iter::once(Ok(response))); + Ok(Response::new(Box::pin(output) as Self::HandshakeStream)) + } + + async fn list_flights( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Implement list_flights")) + } + + async fn get_flight_info( + &self, + request: Request, + ) -> Result, Status> { + self.save_metadata(&request); + let mut state = self.state.lock().expect("mutex not poisoned"); + state.get_flight_info_request = Some(request.into_inner()); + let response = state.get_flight_info_response.take().unwrap_or_else(|| { + Err(Status::internal("No get_flight_info response configured")) + })?; + Ok(Response::new(response)) + } + + async fn get_schema( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Implement get_schema")) + } + + async fn do_get( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Implement do_get")) + } + + async fn do_put( + &self, + _request: Request>, + ) -> Result, Status> { + Err(Status::unimplemented("Implement do_put")) + } + + async fn do_action( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Implement do_action")) + } + + async fn list_actions( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Implement list_actions")) + } + + async fn do_exchange( + &self, + _request: Request>, + ) -> Result, Status> { + Err(Status::unimplemented("Implement do_exchange")) + } +} From 733b7e7fd1e8c43a404c3ce40ecf741d493c21b4 Mon Sep 17 00:00:00 2001 From: Liang-Chi 
Hsieh Date: Mon, 26 Dec 2022 10:18:07 -0800 Subject: [PATCH 0444/1411] Add https support (#3388) --- .github/workflows/arrow_flight.yml | 3 + arrow-flight/Cargo.toml | 1 + arrow-flight/examples/data/ca.pem | 28 +++++ arrow-flight/examples/data/client1.key | 28 +++++ arrow-flight/examples/data/client1.pem | 19 ++++ arrow-flight/examples/data/client_ca.pem | 19 ++++ arrow-flight/examples/data/server.key | 28 +++++ arrow-flight/examples/data/server.pem | 27 +++++ arrow-flight/examples/flight_sql_server.rs | 114 ++++++++++++++++++++- arrow-flight/src/sql/client.rs | 40 ++++++++ dev/release/rat_exclude_files.txt | 1 + 11 files changed, 306 insertions(+), 2 deletions(-) create mode 100644 arrow-flight/examples/data/ca.pem create mode 100644 arrow-flight/examples/data/client1.key create mode 100644 arrow-flight/examples/data/client1.pem create mode 100644 arrow-flight/examples/data/client_ca.pem create mode 100644 arrow-flight/examples/data/server.key create mode 100644 arrow-flight/examples/data/server.pem diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index a0e9f38b02d1..d7e8033fe930 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -59,6 +59,9 @@ jobs: - name: Test --examples run: | cargo test -p arrow-flight --features=flight-sql-experimental --examples + - name: Test --examples with TLS + run: | + cargo test -p arrow-flight --features=flight-sql-experimental,tls --examples - name: Verify workspace clean run: git diff --exit-code diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 80710d1fac4f..e4a977b653f6 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -45,6 +45,7 @@ all-features = true [features] default = [] flight-sql-experimental = [] +tls = ["tonic/tls"] [dev-dependencies] arrow = { version = "29.0.0", path = "../arrow", features = ["prettyprint"] } diff --git a/arrow-flight/examples/data/ca.pem b/arrow-flight/examples/data/ca.pem new file mode 100644 index 000000000000..d81956096677 --- /dev/null +++ b/arrow-flight/examples/data/ca.pem @@ -0,0 +1,28 @@ +-----BEGIN CERTIFICATE----- +MIIE3DCCA0SgAwIBAgIRAObeYbJFiVQSGR8yk44dsOYwDQYJKoZIhvcNAQELBQAw +gYUxHjAcBgNVBAoTFW1rY2VydCBkZXZlbG9wbWVudCBDQTEtMCsGA1UECwwkbHVj +aW9ATHVjaW9zLVdvcmstTUJQIChMdWNpbyBGcmFuY28pMTQwMgYDVQQDDCtta2Nl +cnQgbHVjaW9ATHVjaW9zLVdvcmstTUJQIChMdWNpbyBGcmFuY28pMB4XDTE5MDky +OTIzMzUzM1oXDTI5MDkyOTIzMzUzM1owgYUxHjAcBgNVBAoTFW1rY2VydCBkZXZl +bG9wbWVudCBDQTEtMCsGA1UECwwkbHVjaW9ATHVjaW9zLVdvcmstTUJQIChMdWNp +byBGcmFuY28pMTQwMgYDVQQDDCtta2NlcnQgbHVjaW9ATHVjaW9zLVdvcmstTUJQ +IChMdWNpbyBGcmFuY28pMIIBojANBgkqhkiG9w0BAQEFAAOCAY8AMIIBigKCAYEA +y/vE61ItbN/1qMYt13LMf+le1svwfkCCOPsygk7nWeRXmomgUpymqn1LnWiuB0+e +4IdVH2f5E9DknWEpPhKIDMRTCbz4jTwQfHrxCb8EGj3I8oO73pJO5S/xCedM9OrZ +qWcYWwN0GQ8cO/ogazaoZf1uTrRNHyzRyQsKyb412kDBTNEeldJZ2ljKgXXvh4HO +2ZIk9K/ZAaAf6VN8K/89rlJ9/KPgRVNsyAapE+Pb8XXKtpzeFiEcUfuXVYWtkoW+ +xyn/Zu8A1L2CXMQ1sARh7P/42BTMKr5pfraYgcBGxKXLrxoySpxCO9KqeVveKy1q +fPm5FCwFsXDr0koFLrCiR58mcIO/04Q9DKKTV4Z2a+LoqDJRY37KfBSc8sDMPhw5 +k7g3WPoa6QwXRjZTCA5fHWVgLOtcwLsnju5tBE4LDxwF6s+1wPF8NI5yUfufcEjJ +Z6JBwgoWYosVj27Lx7KBNLU/57PX9ryee691zmtswt0tP0WVBAgalhYWg99RXoa3 +AgMBAAGjRTBDMA4GA1UdDwEB/wQEAwICBDASBgNVHRMBAf8ECDAGAQH/AgEAMB0G +A1UdDgQWBBQdvlE4Bdcsjc9oaxjDCRu5FiuZkzANBgkqhkiG9w0BAQsFAAOCAYEA +BP/6o1kPINksMJZSSXgNCPZskDLyGw7auUZBnQ0ocDT3W6gXQvT/27LM1Hxoj9Eh +qU1TYdEt7ppecLQSGvzQ02MExG7H75art75oLiB+A5agDira937YbK4MCjqW481d +bDhw6ixJnY1jIvwjEZxyH6g94YyL927aSPch51fys0kSnjkFzC2RmuzDADScc4XH 
+5P1+/3dnIm3M5yfpeUzoaOrTXNmhn8p0RDIGrZ5kA5eISIGGD3Mm8FDssUNKndtO +g4ojHUsxb14icnAYGeye1NOhGiqN6TEFcgr6MPd0XdFNZ5c0HUaBCfN6bc+JxDV5 +MKZVJdNeJsYYwilgJNHAyZgCi30JC20xeYVtTF7CEEsMrFDGJ70Kz7o/FnRiFsA1 +ZSwVVWhhkHG2VkT4vlo0O3fYeZpenYicvy+wZNTbGK83gzHWqxxNC1z3Etg5+HRJ +F9qeMWPyfA3IHYXygiMcviyLcyNGG/SJ0EhUpYBN/Gg7wI5yFkcsxUDPPzd23O0M +-----END CERTIFICATE----- diff --git a/arrow-flight/examples/data/client1.key b/arrow-flight/examples/data/client1.key new file mode 100644 index 000000000000..f4d8da2758ac --- /dev/null +++ b/arrow-flight/examples/data/client1.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQCiiWrmzpENsI+c +Cz4aBpG+Pl8WOsrByfZx/ZnJdCZHO3MTYE6sCLhYssf0ygAEEGxvmkd4cxmfCfgf +xuT8u+D7Y5zQSoymkbWdU6/9jbNY6Ovtc+a96I1LGXOKROQw6KR3PuqLpUqEOJiB +l03qK+HMU0g56G1n31Od7HkJsDRvtePqy3I3LgpdcRps23sk46tCzZzhyfqIQ7Qf +J5qZx93tA+pfy+Xtb9XIUTIWKIp1/uyfh8Fp8HA0c9zJCSZzJOX2j3GH1TYqkVgP +egI2lhmdXhP5Q8vdhwy0UJaL28RJXA6UAg0tPZeWJe6pux9JiA81sI6My+Krrw8D +yibkGTTbAgMBAAECggEANCQhRym9HsclSsnQgkjZOE6J8nep08EWbjsMurOoE/He +WLjshAPIH6w6uSyUFLmwD51OkDVcYsiv8IG9s9YRtpOeGrPPqx/TQ0U1kAGFJ2CR +Tvt/aizQJudjSVgQXCBFontsgp/j58bAJdKEDDtHlGSjJvCJKGlcSa0ypwj/yVXt +frjROJNYzw9gMM7fN/IKF/cysdXSeLl/Q9RnHVIfC3jOFJutsILCK8+PC51dM8Fl +IOjmPmiZ080yV8RBcMRECwl53vLOE3OOpR3ZijfNCY1KU8zWi1oELJ1o6f4+cBye +7WPgFEoBew5XHXZ+ke8rh8cc0wth7ZTcC+xC/456AQKBgQDQr2EzBwXxYLF8qsN1 +R4zlzXILLdZN8a4bKfrS507/Gi1gDBHzfvbE7HfljeqrAkbKMdKNkbz3iS85SguH +jsM047xUGJg0PAcwBLHUedlSn1xDDcDHW6X8ginpA2Zz1+WAlhNz6XurA1wnjZmS +VcPxopH7QsuFCclqtt14MbBQ6QKBgQDHY3jcAVfQF+yhQ0YyM6GPLN342aTplgyJ +yz4uWVMeXacU4QzqGbf2L2hc9M2L28Xb37RWC3Q/by0vUefiC6qxRt+GJdRsOuQj +2F1uUibeWtAWp249fcfvxjLib276J+Eit18LI0s0mNR3ekK4GcjSe4NwSq5IrU8e +pBreet3dIwKBgQCxVuil4WkGd+I8jC0v5A7zVsR8hYZhlGkdgm45fgHevdMjlP5I +S3PPYxh8hj6O9o9L0k0Yq2nHfdgYujjUCNkQgBuR55iogv6kqsioRKgPE4fnH6/c +eqCy1bZh4tbUyPqqbF65mQfUCzXsEuQXvDSYiku+F0Q2mVuGCUJpmug3yQKBgEd3 +LeCdUp4xlQ0QEd74hpXM3RrO178pmwDgqj7uoU4m/zYKnBhkc3137I406F+SvE5c +1kRpApeh/64QS27IA7xazM9GS+cnDJKUgJiENY5JOoCELo03wiv8/EwQ6NQc6yMI +WrahRdlqVe0lEzjtdP+MacYb3nAKPmubIk5P96nFAoGAFAyrKpFTyXbNYBTw9Rab +TG6q7qkn+YTHN3+k4mo9NGGwZ3pXvmrKMYCIRhLMbqzsmTbFqCPPIxKsrmf8QYLh +xHYQjrCkbZ0wZdcdeV6yFSDsF218nF/12ZPE7CBOQMfZTCKFNWGL97uIVcmR6K5G +ojTkOvaUnwQtSFhNuzyr23I= +-----END PRIVATE KEY----- diff --git a/arrow-flight/examples/data/client1.pem b/arrow-flight/examples/data/client1.pem new file mode 100644 index 000000000000..bb3b82c40c5a --- /dev/null +++ b/arrow-flight/examples/data/client1.pem @@ -0,0 +1,19 @@ +-----BEGIN CERTIFICATE----- +MIIDCTCCAfGgAwIBAgIQYbE9d1Rft5h4ku7FSAvWdzANBgkqhkiG9w0BAQsFADAn +MSUwIwYDVQQDExxUb25pYyBFeGFtcGxlIENsaWVudCBSb290IENBMB4XDTE5MTAx +NDEyMzkzNloXDTI0MTAxMjEyMzkzNlowEjEQMA4GA1UEAxMHY2xpZW50MTCCASIw +DQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAKKJaubOkQ2wj5wLPhoGkb4+XxY6 +ysHJ9nH9mcl0Jkc7cxNgTqwIuFiyx/TKAAQQbG+aR3hzGZ8J+B/G5Py74PtjnNBK +jKaRtZ1Tr/2Ns1jo6+1z5r3ojUsZc4pE5DDopHc+6oulSoQ4mIGXTeor4cxTSDno +bWffU53seQmwNG+14+rLcjcuCl1xGmzbeyTjq0LNnOHJ+ohDtB8nmpnH3e0D6l/L +5e1v1chRMhYoinX+7J+HwWnwcDRz3MkJJnMk5faPcYfVNiqRWA96AjaWGZ1eE/lD +y92HDLRQlovbxElcDpQCDS09l5Yl7qm7H0mIDzWwjozL4quvDwPKJuQZNNsCAwEA +AaNGMEQwEwYDVR0lBAwwCgYIKwYBBQUHAwIwDAYDVR0TAQH/BAIwADAfBgNVHSME +GDAWgBQV1YOR+Jpl1fbujvWLSBEoRvsDhTANBgkqhkiG9w0BAQsFAAOCAQEAfTPu +KeHXmyVTSCUrYQ1X5Mu7VzfZlRbhoytHOw7bYGgwaFwQj+ZhlPt8nFC22/bEk4IV +AoCOli0WyPIB7Lx52dZ+v9JmYOK6ca2Aa/Dkw8Q+M3XA024FQWq3nZ6qANKC32/9 +Nk+xOcb1Qd/11stpTkRf2Oj7F7K4GnlFbY6iMyNW+RFXGKEbL5QAJDTDPIT8vw1x +oYeNPwmC042uEboCZPNXmuctiK9Wt1TAxjZT/cwdIBGGJ+xrW72abfJGs7bUcJfc 
+O4r9V0xVv+X0iKWTW0fwd9qjNfiEP1tFCcZb2XsNQPe/DlQZ+h98P073tZEsWI/G +KJrFspGX8vOuSdIeqw== +-----END CERTIFICATE----- diff --git a/arrow-flight/examples/data/client_ca.pem b/arrow-flight/examples/data/client_ca.pem new file mode 100644 index 000000000000..aa483b931056 --- /dev/null +++ b/arrow-flight/examples/data/client_ca.pem @@ -0,0 +1,19 @@ +-----BEGIN CERTIFICATE----- +MIIDGzCCAgOgAwIBAgIRAMNWpWRu6Q1txEYUyrkyXKEwDQYJKoZIhvcNAQELBQAw +JzElMCMGA1UEAxMcVG9uaWMgRXhhbXBsZSBDbGllbnQgUm9vdCBDQTAeFw0xOTEw +MTQxMjM5MzZaFw0yOTEwMTExMjM5MzZaMCcxJTAjBgNVBAMTHFRvbmljIEV4YW1w +bGUgQ2xpZW50IFJvb3QgQ0EwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIB +AQCv8Nj4XJbMI0wWUvLbmCf7IEvJFnomodGnDurh8Y5AGMPJ8cGdZC1yo2Lgah+D +IhXdsd72Wp7MhdntJAyPrMCDBfDrFiuj6YHDgt3OhPQSYl7EWG7QjFK3B2sp1K5D +h16G5zfwUKDj9Jp3xuPGuqNFQHL02nwbhtDilqHvaTfOJKVjsFCoU8Z77mfwXSwn +sPXpPB7oOO4mWfAtcwU11rTMiHFSGFlFhgbHULU/y90DcpfRQEpEiBoiK13gkyoP +zHT9WAg3Pelwb6K7c7kJ7mp4axhbf7MkwFhDQIjbBWqus2Eu3b0mf86ALfDbAaNC +wBi8xbNH2vWaDjiwLDY5uMZDAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwICBDAPBgNV +HRMBAf8EBTADAQH/MB0GA1UdDgQWBBQV1YOR+Jpl1fbujvWLSBEoRvsDhTANBgkq +hkiG9w0BAQsFAAOCAQEAaXmM29TYkFUzZUsV7TSonAK560BjxDmbg0GJSUgLEFUJ +wpKqa9UKOSapG45LEeR2wwAmVWDJomJplkuvTD/KOabAbZKyPEfp+VMCaBUnILQF +Cxv5m7kQ3wmPS/rEL8FD809UGowW9cYqnZzUy5i/r263rx0k3OPjkkZN66Mh6+3H +ibNdaxf7ITO0JVb/Ohq9vLC9qf7ujiB1atMdJwkOWsZrLJXLygpx/D0/UhBT4fFH +OlyVOmuR27qaMbPgOs2l8DznkJY/QUfnET8iOQhFgb0Dt/Os4PYFhSDRIrgl5dJ7 +L/zZVQfZYpdxlBHJlDC1/NzVQl/1MgDnSgPGStZKPQ== +-----END CERTIFICATE----- diff --git a/arrow-flight/examples/data/server.key b/arrow-flight/examples/data/server.key new file mode 100644 index 000000000000..80984ef9000d --- /dev/null +++ b/arrow-flight/examples/data/server.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDyptbMyYWztgta +t1MXLMzIkaQdeeVbs1Y/qCpAdwZe/Y5ZpbzjGIjCxbB6vNRSnEbYKpytKHPzYfM7 +8d8K8bPvpnqXIiTXFT0JQlw1OHLC1fr4e598GJumAmpMYFrtqv0fbmUFTuQGbHxe +OH2vji0bvr3NKZubMfkEZP3X4sNXXoXIuW2LaS8OMGKoJaeCBvdbszEiSGj/v9Bj +pM0yLTH89NNMX1T+FtTKnuXag5g7pr6lzJj83+MzAGy4nOjseSuUimuiyG90/C5t +A5wC0Qh5RbDnkFYhC44Kxof/i6+jnfateIPNiIIwQV+2f6G/aK1hgjekT10m/eoR +YDTf+e5ZAgMBAAECggEACODt7yRYjhDVLYaTtb9f5t7dYG67Y7WWLFIc6arxQryI +XuNfm/ej2WyeXn9WTYeGWBaHERbv1zH4UnMxNBdP/C7dQXZwXqZaS2JwOUpNeK+X +tUvgtAu6dkKUXSMRcKzXAjVp4N3YHhwOGOx8PNY49FDwZPdmyDD16aFAYIvdle6/ +PSMrj38rB1sbQQdmRob2FjJBSDZ44nsr+/nilrcOFNfNnWv7tQIWYVXNcLfdK/WJ +ZCDFhA8lr/Yon6MEq6ApTj2ZYRRGXPd6UeASJkmTZEUIUbeDcje/MO8cHkREpuRH +wm3pCjR7OdO4vc+/d/QmEvu5ns6wbTauelYnL616YQKBgQD414gJtpCHauNEUlFB +v/R3DzPI5NGp9PAqovOD8nCbI49Mw61gP/ExTIPKiR5uUX/5EL04uspaNkuohXk+ +ys0G5At0NfV7W39lzhvALEaSfleybvYxppbBrc20/q8Gvi/i30NY+1LM3RdtMiEw +hKHjU0SnFhJq0InFg3AO/iCeTQKBgQD5obkbzpOidSsa55aNsUlO2qjiUY9leq9b +irAohIZ8YnuuixYvkOeSeSz1eIrA4tECeAFSgTZxYe1Iz+USru2Xg/0xNte11dJD +rBoH/yMn2gDvBK7xQ6uFMPTeYtKG0vfvpXZYSWZzGntyrHTwFk6UV+xdrt9MBdd1 +XdSn7bwOPQKBgC9VQAko8uDvUf+C8PXiv2uONrl13PPJJY3WpR9qFEVOREnDxszS +HNzVwxPZdTJiykbkCjoqPadfQJDzopZxGQLAifU29lTamKcSx3CMe3gOFDxaovXa +zD5XAxP0hfJwZsdu1G6uj5dsTrJ0oJ+L+wc0pZBqwGIU/L/XOo9/g1DZAoGAUebL +kuH98ik7EUK2VJq8EJERI9/ailLsQb6I+WIxtZGiPqwHhWencpkrNQZtj8dbB9JT +rLwUHrMgZOlAoRafgTyez4zMzS3wJJ/Mkp8U67hM4h7JPwMSvUpIrMYDiJSjIA9L +er/qSw1/Pypx22uWMHmAZWRAgvLPtAQrB0Wqk4kCgYEAr2H1PvfbwZwkSvlMt5o8 +WLnBbxcM3AKglLRbkShxxgiZYdEP71/uOtRMiL26du5XX8evItITN0DsvmXL/kcd +h29LK7LM5uLw7efz0Qxs03G6kEyIHVkacowHi5I5Ul1qI61SoV3yMB1TjIU+bXZt +0ZjC07totO0fqPOLQxonjQg= +-----END PRIVATE KEY----- diff --git a/arrow-flight/examples/data/server.pem b/arrow-flight/examples/data/server.pem new file 
mode 100644 index 000000000000..4cc97bcf4b6d --- /dev/null +++ b/arrow-flight/examples/data/server.pem @@ -0,0 +1,27 @@ +-----BEGIN CERTIFICATE----- +MIIEmDCCAwCgAwIBAgIQVEJFCgU/CZk9JEwTucWPpzANBgkqhkiG9w0BAQsFADCB +hTEeMBwGA1UEChMVbWtjZXJ0IGRldmVsb3BtZW50IENBMS0wKwYDVQQLDCRsdWNp +b0BMdWNpb3MtV29yay1NQlAgKEx1Y2lvIEZyYW5jbykxNDAyBgNVBAMMK21rY2Vy +dCBsdWNpb0BMdWNpb3MtV29yay1NQlAgKEx1Y2lvIEZyYW5jbykwHhcNMTkwNjAx +MDAwMDAwWhcNMjkwOTI5MjMzNTM0WjBYMScwJQYDVQQKEx5ta2NlcnQgZGV2ZWxv +cG1lbnQgY2VydGlmaWNhdGUxLTArBgNVBAsMJGx1Y2lvQEx1Y2lvcy1Xb3JrLU1C +UCAoTHVjaW8gRnJhbmNvKTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEB +APKm1szJhbO2C1q3UxcszMiRpB155VuzVj+oKkB3Bl79jlmlvOMYiMLFsHq81FKc +RtgqnK0oc/Nh8zvx3wrxs++mepciJNcVPQlCXDU4csLV+vh7n3wYm6YCakxgWu2q +/R9uZQVO5AZsfF44fa+OLRu+vc0pm5sx+QRk/dfiw1dehci5bYtpLw4wYqglp4IG +91uzMSJIaP+/0GOkzTItMfz000xfVP4W1Mqe5dqDmDumvqXMmPzf4zMAbLic6Ox5 +K5SKa6LIb3T8Lm0DnALRCHlFsOeQViELjgrGh/+Lr6Od9q14g82IgjBBX7Z/ob9o +rWGCN6RPXSb96hFgNN/57lkCAwEAAaOBrzCBrDAOBgNVHQ8BAf8EBAMCBaAwEwYD +VR0lBAwwCgYIKwYBBQUHAwEwDAYDVR0TAQH/BAIwADAfBgNVHSMEGDAWgBQdvlE4 +Bdcsjc9oaxjDCRu5FiuZkzBWBgNVHREETzBNggtleGFtcGxlLmNvbYINKi5leGFt +cGxlLmNvbYIMZXhhbXBsZS50ZXN0gglsb2NhbGhvc3SHBH8AAAGHEAAAAAAAAAAA +AAAAAAAAAAEwDQYJKoZIhvcNAQELBQADggGBAKb2TJ8l+e1eraNwZWizLw5fccAf +y59J1JAWdLxZyAI/bkiTlVO3DQoPZpw7XwLhefCvILkwKAL4TtIGGVC9yTb5Q5eg +rqGO3FC0yg1fn65Kf1VpVxxUVyoiM5PQ4pFJb4AicAv88rCOLD9FFuE0PKOKU/dm +Tw0WgPStoh9wsJ1RXUuTJYZs1nd1kMBlfv9NbLilnL+cR2sLktS54X5XagsBYVlf +oapRb0JtABOoQhX3U8QMq8UF8yzceRHNTN9yfLOUrW26s9nKtlWVniNhw1uPxZw9 +RHM7w9/4+a9LXtEDYg4IP/1mm0ywBoUqy1O6hA73uId+Yi/kFBks/GyYaGjKgYcO +23B75tkPGYEdGuGZYLzZNHbXg4V0UxFQG3KA1pUiSnD3bN2Rxs+CMpzORnOeK3xi +EooKgAPYsehItoQOMPpccI2xHdSAMWtwUgOKrefUQujkx2Op+KFlspF0+WJ6AZEe +2D4hyWaEZsvvILXapwqHDCuN3/jSUlTIqUoE1w== +-----END CERTIFICATE----- diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index ae015001f0a1..54e19a8cc57d 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -29,6 +29,8 @@ use prost::Message; use std::pin::Pin; use std::sync::Arc; use tonic::transport::Server; +#[cfg(feature = "tls")] +use tonic::transport::{Certificate, Identity, ServerTlsConfig}; use tonic::{Request, Response, Status, Streaming}; use arrow_flight::flight_descriptor::DescriptorType; @@ -447,6 +449,7 @@ impl FlightSqlService for FlightSqlServiceImpl { /// This example shows how to run a FlightSql server #[tokio::main] +#[cfg(not(feature = "tls"))] async fn main() -> Result<(), Box> { let addr = "0.0.0.0:50051".parse()?; @@ -459,6 +462,33 @@ async fn main() -> Result<(), Box> { Ok(()) } +/// This example shows how to run a HTTPs FlightSql server +#[tokio::main] +#[cfg(feature = "tls")] +async fn main() -> Result<(), Box> { + let addr = "0.0.0.0:50051".parse()?; + + let svc = FlightServiceServer::new(FlightSqlServiceImpl {}); + + println!("Listening on {:?}", addr); + + let cert = std::fs::read_to_string("arrow-flight/examples/data/server.pem")?; + let key = std::fs::read_to_string("arrow-flight/examples/data/server.key")?; + let client_ca = std::fs::read_to_string("arrow-flight/examples/data/client_ca.pem")?; + + let tls_config = ServerTlsConfig::new() + .identity(Identity::from_pem(&cert, &key)) + .client_ca_root(Certificate::from_pem(&client_ca)); + + Server::builder() + .tls_config(tls_config)? 
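+        // The client CA root configured above means the server verifies client
+        // certificates against client_ca.pem (mutual TLS)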
+ .add_service(svc) + .serve(addr) + .await?; + + Ok(()) +} + #[derive(Clone, PartialEq, ::prost::Message)] pub struct FetchResults { #[prost(string, tag = "1")] @@ -479,20 +509,29 @@ impl ProstMessageExt for FetchResults { } #[cfg(test)] +#[allow(unused_imports)] mod tests { use super::*; use futures::TryStreamExt; use std::fs; + use std::time::Duration; use tempfile::NamedTempFile; use tokio::net::{UnixListener, UnixStream}; + use tokio::time::sleep; use tokio_stream::wrappers::UnixListenerStream; - use tonic::transport::Endpoint; + use tonic::body::BoxBody; + use tonic::codegen::{http, Body, Service}; + + #[cfg(feature = "tls")] + use tonic::transport::ClientTlsConfig; use arrow::util::pretty::pretty_format_batches; use arrow_flight::sql::client::FlightSqlServiceClient; use arrow_flight::utils::flight_data_to_batches; - use tower::service_fn; + use tonic::transport::{Certificate, Channel, Endpoint}; + use tower::{service_fn, ServiceExt}; + #[cfg(not(feature = "tls"))] async fn client_with_uds(path: String) -> FlightSqlServiceClient { let connector = service_fn(move |_| UnixStream::connect(path.clone())); let channel = Endpoint::try_from("https://example.com") @@ -503,7 +542,78 @@ mod tests { FlightSqlServiceClient::new(channel) } + #[cfg(feature = "tls")] + async fn create_https_server() -> Result<(), tonic::transport::Error> { + let cert = std::fs::read_to_string("examples/data/server.pem").unwrap(); + let key = std::fs::read_to_string("examples/data/server.key").unwrap(); + let client_ca = std::fs::read_to_string("examples/data/client_ca.pem").unwrap(); + + let tls_config = ServerTlsConfig::new() + .identity(Identity::from_pem(&cert, &key)) + .client_ca_root(Certificate::from_pem(&client_ca)); + + let addr = "0.0.0.0:50051".parse().unwrap(); + + let svc = FlightServiceServer::new(FlightSqlServiceImpl {}); + + Server::builder() + .tls_config(tls_config) + .unwrap() + .add_service(svc) + .serve(addr) + .await + } + + #[tokio::test] + #[cfg(feature = "tls")] + async fn test_select_https() { + tokio::spawn(async { + create_https_server().await.unwrap(); + }); + + sleep(Duration::from_millis(2000)).await; + + let request_future = async { + let cert = std::fs::read_to_string("examples/data/client1.pem").unwrap(); + let key = std::fs::read_to_string("examples/data/client1.key").unwrap(); + let server_ca = std::fs::read_to_string("examples/data/ca.pem").unwrap(); + + let mut client = FlightSqlServiceClient::new_with_endpoint( + Identity::from_pem(cert, key), + Certificate::from_pem(&server_ca), + "localhost", + "127.0.0.1", + 50051, + ) + .await + .unwrap(); + let token = client.handshake("admin", "password").await.unwrap(); + println!("Auth succeeded with token: {:?}", token); + let mut stmt = client.prepare("select 1;".to_string()).await.unwrap(); + let flight_info = stmt.execute().await.unwrap(); + let ticket = flight_info.endpoint[0].ticket.as_ref().unwrap().clone(); + let flight_data = client.do_get(ticket).await.unwrap(); + let flight_data: Vec = flight_data.try_collect().await.unwrap(); + let batches = flight_data_to_batches(&flight_data).unwrap(); + let res = pretty_format_batches(batches.as_slice()).unwrap(); + let expected = r#" ++-------------------+ +| salutation | ++-------------------+ +| Hello, FlightSQL! | ++-------------------+"# + .trim() + .to_string(); + assert_eq!(res.to_string(), expected); + }; + + tokio::select! 
{ + _ = request_future => println!("Client finished!"), + } + } + #[tokio::test] + #[cfg(not(feature = "tls"))] async fn test_select_1() { let file = NamedTempFile::new().unwrap(); let path = file.into_temp_path().to_str().unwrap().to_string(); diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index cf71edead3e3..679213af0d86 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -44,6 +44,8 @@ use arrow_schema::{ArrowError, Schema, SchemaRef}; use futures::{stream, TryStreamExt}; use prost::Message; use tokio::sync::{Mutex, MutexGuard}; +#[cfg(feature = "tls")] +use tonic::transport::{Certificate, ClientTlsConfig, Identity}; use tonic::transport::{Channel, Endpoint}; use tonic::Streaming; @@ -60,6 +62,7 @@ pub struct FlightSqlServiceClient { /// Github issues are welcomed. impl FlightSqlServiceClient { /// Creates a new FlightSql Client that connects via TCP to a server + #[cfg(not(feature = "tls"))] pub async fn new_with_endpoint(host: &str, port: u16) -> Result { let addr = format!("http://{}:{}", host, port); let endpoint = Endpoint::new(addr) @@ -71,6 +74,43 @@ impl FlightSqlServiceClient { .http2_keep_alive_interval(Duration::from_secs(300)) .keep_alive_timeout(Duration::from_secs(20)) .keep_alive_while_idle(true); + + let channel = endpoint.connect().await.map_err(|e| { + ArrowError::IoError(format!("Cannot connect to endpoint: {}", e)) + })?; + Ok(Self::new(channel)) + } + + /// Creates a new HTTPs FlightSql Client that connects via TCP to a server + #[cfg(feature = "tls")] + pub async fn new_with_endpoint( + client_ident: Identity, + server_ca: Certificate, + domain: &str, + host: &str, + port: u16, + ) -> Result { + let addr = format!("https://{}:{}", host, port); + + let endpoint = Endpoint::new(addr) + .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? 
+ .connect_timeout(Duration::from_secs(20)) + .timeout(Duration::from_secs(20)) + .tcp_nodelay(true) // Disable Nagle's Algorithm since we don't want packets to wait + .tcp_keepalive(Option::Some(Duration::from_secs(3600))) + .http2_keep_alive_interval(Duration::from_secs(300)) + .keep_alive_timeout(Duration::from_secs(20)) + .keep_alive_while_idle(true); + + let tls_config = ClientTlsConfig::new() + .domain_name(domain) + .ca_certificate(server_ca) + .identity(client_ident); + + let endpoint = endpoint + .tls_config(tls_config) + .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))?; + let channel = endpoint.connect().await.map_err(|e| { ArrowError::IoError(format!("Cannot connect to endpoint: {}", e)) })?; diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index fad1a5a7d1dd..6f9d1b5f302b 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -24,3 +24,4 @@ arrow-flight/src/arrow.flight.protocol.rs arrow-flight/src/sql/arrow.flight.protocol.sql.rs .github/* parquet/src/bin/parquet-fromcsv-help.txt +arrow-flight/examples/data/* From 2444994b62e82bf243d34d1516c071b2f1337191 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 27 Dec 2022 10:22:56 +0000 Subject: [PATCH 0445/1411] Update quick-xml to 0.27 (#3395) * Update quick-xml * Fix Azure --- object_store/Cargo.toml | 2 +- object_store/src/aws/client.rs | 4 ++-- object_store/src/azure/client.rs | 4 ++-- object_store/src/gcp/mod.rs | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index fd033d55d666..a9cc151b985a 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -44,7 +44,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.20", default-features = false, features = ["std"], optional = true } -quick-xml = { version = "0.26.0", features = ["serialize"], optional = true } +quick-xml = { version = "0.27.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 0e22bfc97e22..b40bcbacf99e 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -189,9 +189,9 @@ struct CompleteMultipart { #[derive(Debug, Serialize)] struct MultipartPart { - #[serde(rename = "$unflatten=ETag")] + #[serde(rename = "ETag")] e_tag: String, - #[serde(rename = "$unflatten=PartNumber")] + #[serde(rename = "PartNumber")] part_number: usize, } diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 50f836377add..556a2ad2b292 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -410,7 +410,6 @@ impl TryFrom for ListResult { let common_prefixes = value .blobs .blob_prefix - .unwrap_or_default() .into_iter() .map(|x| Ok(Path::parse(x.name)?)) .collect::>()?; @@ -437,7 +436,8 @@ impl TryFrom for ListResult { #[derive(Debug, Clone, PartialEq, Eq, Deserialize)] #[serde(rename_all = "PascalCase")] struct Blobs { - pub blob_prefix: Option>, + #[serde(default)] + pub blob_prefix: Vec, #[serde(rename = "Blob", default)] pub blobs: Vec, } diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs 
index c83ab6493cb9..c1424d9713c1 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -221,9 +221,9 @@ struct InitiateMultipartUploadResult { #[derive(serde::Serialize, Debug)] #[serde(rename_all = "PascalCase", rename(serialize = "Part"))] struct MultipartPart { - #[serde(rename = "$unflatten=PartNumber")] + #[serde(rename = "PartNumber")] part_number: usize, - #[serde(rename = "$unflatten=ETag")] + #[serde(rename = "ETag")] e_tag: String, } From 9a5073ac86b8daa222f080d5c3850d6bb98a999b Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 27 Dec 2022 02:23:04 -0800 Subject: [PATCH 0446/1411] Upgrade multiversion (#3396) --- arrow-arith/Cargo.toml | 2 +- arrow-arith/src/aggregate.rs | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index 854941c25345..db85c2a6b978 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -44,7 +44,7 @@ arrow-data = { version = "29.0.0", path = "../arrow-data" } arrow-schema = { version = "29.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false } half = { version = "2.1", default-features = false } -multiversion = { version = "0.6.1", default-features = false } +multiversion = { version = "0.7.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } [dev-dependencies] diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index a9503130b0f9..dc3d70bb2831 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -104,8 +104,7 @@ pub fn max_boolean(array: &BooleanArray) -> Option { } /// Helper to compute min/max of [`ArrayAccessor`]. -#[multiversion] -#[clone(target = "x86_64+avx")] +#[multiversion(targets("x86_64+avx"))] fn min_max_helper, F>(array: A, cmp: F) -> Option where F: Fn(&T, &T) -> bool, From 1d0abfafe0da7c28b562fa0ba8c65a10b65a0821 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 27 Dec 2022 19:49:51 +0800 Subject: [PATCH 0447/1411] fix clippy issues (#3398) --- arrow-integration-test/src/field.rs | 2 +- arrow-integration-testing/src/bin/arrow-file-to-stream.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index 4bfbf8e99129..dd0519157f9c 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -253,7 +253,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { }; let mut field = - Field::new_dict(&name, data_type, nullable, dict_id, dict_is_ordered); + Field::new_dict(name, data_type, nullable, dict_id, dict_is_ordered); field.set_metadata(metadata); Ok(field) } diff --git a/arrow-integration-testing/src/bin/arrow-file-to-stream.rs b/arrow-integration-testing/src/bin/arrow-file-to-stream.rs index e939fe4f0bf7..3e027faef91f 100644 --- a/arrow-integration-testing/src/bin/arrow-file-to-stream.rs +++ b/arrow-integration-testing/src/bin/arrow-file-to-stream.rs @@ -30,7 +30,7 @@ struct Args { fn main() -> Result<()> { let args = Args::parse(); - let f = File::open(&args.file_name)?; + let f = File::open(args.file_name)?; let reader = BufReader::new(f); let mut reader = FileReader::try_new(reader, None)?; let schema = reader.schema(); From b903384678341412598010b02f9998ed32a3ad7c Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 27 Dec 2022 17:13:08 -0800 Subject: [PATCH 0448/1411] Make sure integration works on latest version of localstack (#3403) --- 
object_store/CONTRIBUTING.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/object_store/CONTRIBUTING.md b/object_store/CONTRIBUTING.md index efcd5fe343db..550640d931b4 100644 --- a/object_store/CONTRIBUTING.md +++ b/object_store/CONTRIBUTING.md @@ -39,7 +39,7 @@ To test the S3 integration against [localstack](https://localstack.cloud/) First start up a container running localstack ``` -$ podman run --rm -it -p 4566:4566 -p 4510-4559:4510-4559 localstack/localstack +$ podman run --rm -it -e PROVIDER_OVERRIDE_S3=asf -p 4566:4566 -p 4510-4559:4510-4559 localstack/localstack ``` Setup environment @@ -49,7 +49,9 @@ export TEST_INTEGRATION=1 export OBJECT_STORE_AWS_DEFAULT_REGION=us-east-1 export OBJECT_STORE_AWS_ACCESS_KEY_ID=test export OBJECT_STORE_AWS_SECRET_ACCESS_KEY=test -export AWS_ENDPOINT=http://128.0.0.1:4566 +export OBJECT_STORE_AWS_ENDPOINT=http://localhost:4566 +export AWS_ACCESS_KEY_ID=test +export AWS_SECRET_ACCESS_KEY=test export OBJECT_STORE_BUCKET=test-bucket ``` @@ -59,6 +61,12 @@ Create a bucket using the AWS CLI podman run --net=host --env-host amazon/aws-cli --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket ``` +Or directly with: + +``` +aws s3 mb s3://test-bucket --endpoint-url=http://localhost:4566 +``` + Run tests ``` From 8f9969246d13741ead0cbcbb1cb7bab057ca15d2 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Wed, 28 Dec 2022 04:55:25 -0500 Subject: [PATCH 0449/1411] object_store: Flush buffered multipart only during poll_shutdown (#3397) Co-authored-by: askoa --- object_store/src/multipart.rs | 56 +++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/object_store/src/multipart.rs b/object_store/src/multipart.rs index de8591462500..65427d1f2d70 100644 --- a/object_store/src/multipart.rs +++ b/object_store/src/multipart.rs @@ -109,6 +109,43 @@ where } } +impl CloudMultiPartUpload +where + T: CloudMultiPartUploadImpl + Send + Sync, +{ + // The `poll_flush` function will only flush the in-progress tasks. + // The `final_flush` method called during `poll_shutdown` will flush + // the `current_buffer` along with in-progress tasks. + // Please see https://github.com/apache/arrow-rs/issues/3390 for more details. 
+ fn final_flush( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + // Poll current tasks + self.as_mut().poll_tasks(cx)?; + + // If current_buffer is not empty, see if it can be submitted + if !self.current_buffer.is_empty() && self.tasks.len() < self.max_concurrency { + let out_buffer: Vec = std::mem::take(&mut self.current_buffer); + let inner = Arc::clone(&self.inner); + let part_idx = self.current_part_idx; + self.tasks.push(Box::pin(async move { + let upload_part = inner.put_multipart_part(out_buffer, part_idx).await?; + Ok((part_idx, upload_part)) + })); + } + + self.as_mut().poll_tasks(cx)?; + + // If tasks and current_buffer are empty, return Ready + if self.tasks.is_empty() && self.current_buffer.is_empty() { + Poll::Ready(Ok(())) + } else { + Poll::Pending + } + } +} + impl AsyncWrite for CloudMultiPartUpload where T: CloudMultiPartUploadImpl + Send + Sync, @@ -158,21 +195,8 @@ where // Poll current tasks self.as_mut().poll_tasks(cx)?; - // If current_buffer is not empty, see if it can be submitted - if !self.current_buffer.is_empty() && self.tasks.len() < self.max_concurrency { - let out_buffer: Vec = std::mem::take(&mut self.current_buffer); - let inner = Arc::clone(&self.inner); - let part_idx = self.current_part_idx; - self.tasks.push(Box::pin(async move { - let upload_part = inner.put_multipart_part(out_buffer, part_idx).await?; - Ok((part_idx, upload_part)) - })); - } - - self.as_mut().poll_tasks(cx)?; - - // If tasks and current_buffer are empty, return Ready - if self.tasks.is_empty() && self.current_buffer.is_empty() { + // If tasks is empty, return Ready + if self.tasks.is_empty() { Poll::Ready(Ok(())) } else { Poll::Pending @@ -184,7 +208,7 @@ where cx: &mut std::task::Context<'_>, ) -> Poll> { // First, poll flush - match self.as_mut().poll_flush(cx) { + match self.as_mut().final_flush(cx) { Poll::Pending => return Poll::Pending, Poll::Ready(res) => res?, }; From 513d543cf99d0622661c2824d735d4f4bf17c0d3 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 28 Dec 2022 15:25:41 -0800 Subject: [PATCH 0450/1411] Ends ParquetRecordBatchStream when polling on StreamState::Error (#3404) * Remove unnecessary StreamState * Return None for StreamState::Error --- parquet/src/arrow/async_reader/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index cbaa2bf6b0aa..e93c85580ca0 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -568,7 +568,7 @@ where return Poll::Ready(Some(Err(e))); } }, - StreamState::Error => return Poll::Pending, + StreamState::Error => return Poll::Ready(None), // Ends the stream as error happens. 
} } } From 99a20dd872972096e54f26b33450c5958f33d2e5 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 29 Dec 2022 22:27:22 +0800 Subject: [PATCH 0451/1411] add more integration test for parquet bloom filter round trip tests (#3210) * add more integration test * fix doc --- .github/workflows/parquet.yml | 4 +- parquet/pytest/pyspark_integration_test.py | 65 ----------- parquet/pytest/requirements.in | 2 +- parquet/pytest/requirements.txt | 71 ++++++++++++ parquet/pytest/test_parquet_integration.py | 112 +++++++++++++++++++ parquet/src/bin/parquet-fromcsv.rs | 7 +- parquet/src/bin/parquet-show-bloom-filter.rs | 2 + 7 files changed, 195 insertions(+), 68 deletions(-) delete mode 100755 parquet/pytest/pyspark_integration_test.py create mode 100755 parquet/pytest/test_parquet_integration.py diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 67552af864c1..65afa47b1b32 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -144,7 +144,9 @@ jobs: rustup toolchain install ${{ matrix.rust }} rustup default ${{ matrix.rust }} - name: Install binary for checking - run: cargo install --path parquet --bin parquet-show-bloom-filter --features=arrow,cli + run: | + cargo install --path parquet --bin parquet-show-bloom-filter --features=cli + cargo install --path parquet --bin parquet-fromcsv --features=arrow,cli - name: Run pytest run: | cd parquet/pytest diff --git a/parquet/pytest/pyspark_integration_test.py b/parquet/pytest/pyspark_integration_test.py deleted file mode 100755 index 0a0b881e3e9b..000000000000 --- a/parquet/pytest/pyspark_integration_test.py +++ /dev/null @@ -1,65 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import pyspark.sql -import tempfile -import subprocess -import pathlib - - -def create_data_and_df(): - spark = pyspark.sql.SparkSession.builder.getOrCreate() - spark.conf.set("parquet.bloom.filter.enabled", True) - spark.conf.set("parquet.bloom.filter.expected.ndv", 10) - spark.conf.set("parquet.bloom.filter.max.bytes", 32) - data = [(f"id-{i % 10}", f"name-{i%10}") for i in range(100)] - df = spark.createDataFrame(data, ["id", "name"]).repartition(1) - return data, df - - -def get_expected_output(data): - expected = ["Row group #0", "=" * 80] - for v in data: - expected.append(f"Value {v[0]} is present in bloom filter") - for v in data: - expected.append(f"Value {v[1]} is absent in bloom filter") - expected = "\n".join(expected) + "\n" - return expected.encode("utf-8") - - -def get_cli_output(output_dir, data, col_name="id"): - # take the first (and only) parquet file - parquet_file = sorted(pathlib.Path(output_dir).glob("*.parquet"))[0] - args = [ - "parquet-show-bloom-filter", - "--file-name", - parquet_file, - "--column", - col_name, - ] - for v in data: - args.extend(["--values", v[0]]) - for v in data: - args.extend(["--values", v[1]]) - return subprocess.check_output(args) - - -def test_pyspark_bloom_filter(): - data, df = create_data_and_df() - with tempfile.TemporaryDirectory() as output_dir: - df.write.parquet(output_dir, mode="overwrite") - cli_output = get_cli_output(output_dir, data) - assert cli_output == get_expected_output(data) diff --git a/parquet/pytest/requirements.in b/parquet/pytest/requirements.in index a0b30b867625..575fb839a182 100644 --- a/parquet/pytest/requirements.in +++ b/parquet/pytest/requirements.in @@ -17,4 +17,4 @@ pytest pyspark black - +pandas diff --git a/parquet/pytest/requirements.txt b/parquet/pytest/requirements.txt index fb6f8fb6dd96..7462e8ff3b0d 100644 --- a/parquet/pytest/requirements.txt +++ b/parquet/pytest/requirements.txt @@ -63,10 +63,69 @@ mypy-extensions==0.4.3 \ --hash=sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d \ --hash=sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8 # via black +numpy==1.23.5 \ + --hash=sha256:01dd17cbb340bf0fc23981e52e1d18a9d4050792e8fb8363cecbf066a84b827d \ + --hash=sha256:06005a2ef6014e9956c09ba07654f9837d9e26696a0470e42beedadb78c11b07 \ + --hash=sha256:09b7847f7e83ca37c6e627682f145856de331049013853f344f37b0c9690e3df \ + --hash=sha256:0aaee12d8883552fadfc41e96b4c82ee7d794949e2a7c3b3a7201e968c7ecab9 \ + --hash=sha256:0cbe9848fad08baf71de1a39e12d1b6310f1d5b2d0ea4de051058e6e1076852d \ + --hash=sha256:1b1766d6f397c18153d40015ddfc79ddb715cabadc04d2d228d4e5a8bc4ded1a \ + --hash=sha256:33161613d2269025873025b33e879825ec7b1d831317e68f4f2f0f84ed14c719 \ + --hash=sha256:5039f55555e1eab31124a5768898c9e22c25a65c1e0037f4d7c495a45778c9f2 \ + --hash=sha256:522e26bbf6377e4d76403826ed689c295b0b238f46c28a7251ab94716da0b280 \ + --hash=sha256:56e454c7833e94ec9769fa0f86e6ff8e42ee38ce0ce1fa4cbb747ea7e06d56aa \ + --hash=sha256:58f545efd1108e647604a1b5aa809591ccd2540f468a880bedb97247e72db387 \ + --hash=sha256:5e05b1c973a9f858c74367553e236f287e749465f773328c8ef31abe18f691e1 \ + --hash=sha256:7903ba8ab592b82014713c491f6c5d3a1cde5b4a3bf116404e08f5b52f6daf43 \ + --hash=sha256:8969bfd28e85c81f3f94eb4a66bc2cf1dbdc5c18efc320af34bffc54d6b1e38f \ + --hash=sha256:92c8c1e89a1f5028a4c6d9e3ccbe311b6ba53694811269b992c0b224269e2398 \ + --hash=sha256:9c88793f78fca17da0145455f0d7826bcb9f37da4764af27ac945488116efe63 \ + --hash=sha256:a7ac231a08bb37f852849bbb387a20a57574a97cfc7b6cabb488a4fc8be176de \ + 
--hash=sha256:abdde9f795cf292fb9651ed48185503a2ff29be87770c3b8e2a14b0cd7aa16f8 \ + --hash=sha256:af1da88f6bc3d2338ebbf0e22fe487821ea4d8e89053e25fa59d1d79786e7481 \ + --hash=sha256:b2a9ab7c279c91974f756c84c365a669a887efa287365a8e2c418f8b3ba73fb0 \ + --hash=sha256:bf837dc63ba5c06dc8797c398db1e223a466c7ece27a1f7b5232ba3466aafe3d \ + --hash=sha256:ca51fcfcc5f9354c45f400059e88bc09215fb71a48d3768fb80e357f3b457e1e \ + --hash=sha256:ce571367b6dfe60af04e04a1834ca2dc5f46004ac1cc756fb95319f64c095a96 \ + --hash=sha256:d208a0f8729f3fb790ed18a003f3a57895b989b40ea4dce4717e9cf4af62c6bb \ + --hash=sha256:dbee87b469018961d1ad79b1a5d50c0ae850000b639bcb1b694e9981083243b6 \ + --hash=sha256:e9f4c4e51567b616be64e05d517c79a8a22f3606499941d97bb76f2ca59f982d \ + --hash=sha256:f063b69b090c9d918f9df0a12116029e274daf0181df392839661c4c7ec9018a \ + --hash=sha256:f9a909a8bae284d46bbfdefbdd4a262ba19d3bc9921b1e76126b1d21c3c34135 + # via pandas packaging==21.3 \ --hash=sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb \ --hash=sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522 # via pytest +pandas==1.5.2 \ + --hash=sha256:0183cb04a057cc38fde5244909fca9826d5d57c4a5b7390c0cc3fa7acd9fa883 \ + --hash=sha256:1fc87eac0541a7d24648a001d553406f4256e744d92df1df8ebe41829a915028 \ + --hash=sha256:220b98d15cee0b2cd839a6358bd1f273d0356bf964c1a1aeb32d47db0215488b \ + --hash=sha256:2552bffc808641c6eb471e55aa6899fa002ac94e4eebfa9ec058649122db5824 \ + --hash=sha256:315e19a3e5c2ab47a67467fc0362cb36c7c60a93b6457f675d7d9615edad2ebe \ + --hash=sha256:344021ed3e639e017b452aa8f5f6bf38a8806f5852e217a7594417fb9bbfa00e \ + --hash=sha256:375262829c8c700c3e7cbb336810b94367b9c4889818bbd910d0ecb4e45dc261 \ + --hash=sha256:457d8c3d42314ff47cc2d6c54f8fc0d23954b47977b2caed09cd9635cb75388b \ + --hash=sha256:4aed257c7484d01c9a194d9a94758b37d3d751849c05a0050c087a358c41ad1f \ + --hash=sha256:530948945e7b6c95e6fa7aa4be2be25764af53fba93fe76d912e35d1c9ee46f5 \ + --hash=sha256:5ae7e989f12628f41e804847a8cc2943d362440132919a69429d4dea1f164da0 \ + --hash=sha256:71f510b0efe1629bf2f7c0eadb1ff0b9cf611e87b73cd017e6b7d6adb40e2b3a \ + --hash=sha256:73f219fdc1777cf3c45fde7f0708732ec6950dfc598afc50588d0d285fddaefc \ + --hash=sha256:8092a368d3eb7116e270525329a3e5c15ae796ccdf7ccb17839a73b4f5084a39 \ + --hash=sha256:82ae615826da838a8e5d4d630eb70c993ab8636f0eff13cb28aafc4291b632b5 \ + --hash=sha256:9608000a5a45f663be6af5c70c3cbe634fa19243e720eb380c0d378666bc7702 \ + --hash=sha256:a40dd1e9f22e01e66ed534d6a965eb99546b41d4d52dbdb66565608fde48203f \ + --hash=sha256:b4f5a82afa4f1ff482ab8ded2ae8a453a2cdfde2001567b3ca24a4c5c5ca0db3 \ + --hash=sha256:c009a92e81ce836212ce7aa98b219db7961a8b95999b97af566b8dc8c33e9519 \ + --hash=sha256:c218796d59d5abd8780170c937b812c9637e84c32f8271bbf9845970f8c1351f \ + --hash=sha256:cc3cd122bea268998b79adebbb8343b735a5511ec14efb70a39e7acbc11ccbdc \ + --hash=sha256:d0d8fd58df5d17ddb8c72a5075d87cd80d71b542571b5f78178fb067fa4e9c72 \ + --hash=sha256:e18bc3764cbb5e118be139b3b611bc3fbc5d3be42a7e827d1096f46087b395eb \ + --hash=sha256:e2b83abd292194f350bb04e188f9379d36b8dfac24dd445d5c87575f3beaf789 \ + --hash=sha256:e7469271497960b6a781eaa930cba8af400dd59b62ec9ca2f4d31a19f2f91090 \ + --hash=sha256:e9dbacd22555c2d47f262ef96bb4e30880e5956169741400af8b306bbb24a273 \ + --hash=sha256:f6257b314fc14958f8122779e5a1557517b0f8e500cfb2bd53fa1f75a8ad0af2 + # via -r requirements.in pathspec==0.10.2 \ --hash=sha256:88c2606f2c1e818b978540f73ecc908e13999c6c3a383daf3705652ae79807a5 \ 
--hash=sha256:8f6bf73e5758fd365ef5d58ce09ac7c27d2833a8d7da51712eac6e27e35141b0 @@ -94,6 +153,18 @@ pytest==7.2.0 \ --hash=sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71 \ --hash=sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59 # via -r requirements.in +python-dateutil==2.8.2 \ + --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ + --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 + # via pandas +pytz==2022.6 \ + --hash=sha256:222439474e9c98fced559f1709d89e6c9cbf8d79c794ff3eb9f8800064291427 \ + --hash=sha256:e89512406b793ca39f5971bc999cc538ce125c0e51c27941bef4568b460095e2 + # via pandas +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via python-dateutil tomli==2.0.1 \ --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f diff --git a/parquet/pytest/test_parquet_integration.py b/parquet/pytest/test_parquet_integration.py new file mode 100755 index 000000000000..268caa8fab06 --- /dev/null +++ b/parquet/pytest/test_parquet_integration.py @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import pyspark.sql +import pandas as pd +from tempfile import NamedTemporaryFile, TemporaryDirectory +import subprocess +import pathlib +import pytest + + +def create_data_and_spark_df(n): + spark = pyspark.sql.SparkSession.builder.getOrCreate() + spark.conf.set("parquet.bloom.filter.enabled", True) + spark.conf.set("parquet.bloom.filter.expected.ndv", 10) + spark.conf.set("parquet.bloom.filter.max.bytes", 32) + data = [(f"id-{i % 10}", f"name-{i%10}") for i in range(n)] + df = spark.createDataFrame(data, ["id", "name"]).repartition(1) + return data, df + + +def create_data_and_pandas_df(n): + data = [(f"id-{i % 10}", f"name-{i%10}") for i in range(n)] + df = pd.DataFrame(data, columns=["id", "name"]) + return data, df + + +def get_expected_output(data): + expected = ["Row group #0", "=" * 80] + for v in data: + expected.append(f"Value {v[0]} is present in bloom filter") + for v in data: + expected.append(f"Value {v[1]} is absent in bloom filter") + expected = "\n".join(expected) + "\n" + return expected.encode("utf-8") + + +def get_from_csv_cli_output(schema_file, output_file, csv_file): + args = [ + "parquet-fromcsv", + "--schema", + schema_file, + "--enable-bloom-filter", + "true", + "--input-file", + csv_file, + "--output-file", + output_file, + ] + return subprocess.check_output(args) + + +def get_show_filter_cli_output(output_dir, data, col_name="id"): + # take the first (and only) parquet file + (parquet_file,) = sorted(pathlib.Path(output_dir).glob("*.parquet")) + args = [ + "parquet-show-bloom-filter", + "--file-name", + parquet_file, + "--column", + col_name, + ] + for v in data: + args.extend(["--values", v[0]]) + for v in data: + args.extend(["--values", v[1]]) + return subprocess.check_output(args) + + +SCHEMA = b"""message schema { + required binary id (UTF8); + required binary name (UTF8); +}""" + + +@pytest.mark.parametrize("n", [1, 10]) +class TestParquetIntegration: + def test_pyspark_bloom_filter(self, n): + data, df = create_data_and_spark_df(n) + with TemporaryDirectory() as output_dir: + df.write.parquet(output_dir, mode="overwrite") + cli_output = get_show_filter_cli_output(output_dir, data) + assert cli_output == get_expected_output(data) + + def test_bloom_filter_round_trip(self, n): + data, df = create_data_and_pandas_df(n) + with NamedTemporaryFile(suffix=".csv") as csv_file, NamedTemporaryFile( + suffix=".schema" + ) as schema_file, TemporaryDirectory() as output_dir: + schema_file.write(SCHEMA) + schema_file.flush() + df.to_csv(csv_file.name, index=False, header=True) + parquet_file = pathlib.Path(output_dir) / "output.parquet" + cli_output = get_from_csv_cli_output( + schema_file.name, parquet_file, csv_file.name + ) + assert cli_output == b"" + cli_output = get_show_filter_cli_output(output_dir, data) + assert cli_output == get_expected_output(data) diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index b11f3406cb34..53391a6addc8 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -25,7 +25,7 @@ //! cargo install parquet --features=cli //! ``` //! -//! After this `parquet-fromcsv` shoud be available: +//! After this `parquet-fromcsv` should be available: //! //! ```text //! parquet-fromcsv --schema message_schema_for_parquet.txt input.csv output.parquet @@ -46,15 +46,19 @@ //! //! ## Parquet file options //! +//! ```text //! - `-b`, `--batch-size` : Batch size for Parquet //! - `-c`, `--parquet-compression` : Compression option for Parquet, default is SNAPPY //! 
- `-s`, `--schema` : Path to message schema for generated Parquet file //! - `-o`, `--output-file` : Path to output Parquet file //! - `-w`, `--writer-version` : Writer version //! - `-m`, `--max-row-group-size` : Max row group size +//! - `--enable-bloom-filter` : Enable bloom filter during writing +//! ``` //! //! ## Input file options //! +//! ```text //! - `-i`, `--input-file` : Path to input CSV file //! - `-f`, `--input-format` : Dialect for input file, `csv` or `tsv`. //! - `-d`, `--delimiter : Field delimiter for CSV file, default depends `--input-format` @@ -62,6 +66,7 @@ //! - `-h`, `--has-header` : Input has header //! - `-r`, `--record-terminator` : Record terminator character for input. default is CRLF //! - `-q`, `--quote-char` : Input quoting character +//! ``` //! use std::{ diff --git a/parquet/src/bin/parquet-show-bloom-filter.rs b/parquet/src/bin/parquet-show-bloom-filter.rs index f9462327f831..ca8f558a6e00 100644 --- a/parquet/src/bin/parquet-show-bloom-filter.rs +++ b/parquet/src/bin/parquet-show-bloom-filter.rs @@ -103,6 +103,8 @@ fn main() { } ) }); + } else { + println!("No bloom filter found for column {}", args.column); } } else { println!( From bfae10840997a6cadf393bab0ae8b2dacfb74b4b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 29 Dec 2022 11:46:37 -0500 Subject: [PATCH 0452/1411] Version 30.0.0 release notes and changelog (#3406) * Update version to 30.0.0 * Update version in script * Initial changelog * updates --- CHANGELOG-old.md | 101 ++++++++++++++ CHANGELOG.md | 139 +++++++------------ arrow-arith/Cargo.toml | 10 +- arrow-array/Cargo.toml | 8 +- arrow-buffer/Cargo.toml | 2 +- arrow-cast/Cargo.toml | 12 +- arrow-csv/Cargo.toml | 12 +- arrow-data/Cargo.toml | 6 +- arrow-flight/Cargo.toml | 12 +- arrow-flight/README.md | 2 +- arrow-integration-test/Cargo.toml | 6 +- arrow-integration-testing/Cargo.toml | 2 +- arrow-ipc/Cargo.toml | 12 +- arrow-json/Cargo.toml | 12 +- arrow-ord/Cargo.toml | 12 +- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow-row/Cargo.toml | 14 +- arrow-schema/Cargo.toml | 2 +- arrow-select/Cargo.toml | 10 +- arrow-string/Cargo.toml | 12 +- arrow/Cargo.toml | 28 ++-- arrow/README.md | 2 +- dev/release/README.md | 2 +- dev/release/update_change_log.sh | 4 +- parquet/Cargo.toml | 20 +-- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 28 files changed, 265 insertions(+), 195 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 06bf7297f32e..f62b1ee707cc 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,107 @@ # Historical Changelog +## [29.0.0](https://github.com/apache/arrow-rs/tree/29.0.0) (2022-12-09) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/28.0.0...29.0.0) + +**Breaking changes:** + +- Minor: Allow `Field::new` and `Field::new_with_dict` to take existing `String` as well as `&str` [\#3288](https://github.com/apache/arrow-rs/pull/3288) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- update `&Option` to `Option<&T>` [\#3249](https://github.com/apache/arrow-rs/pull/3249) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) +- Hide `*_dict_scalar` kernels behind `*_dyn` kernels [\#3202](https://github.com/apache/arrow-rs/pull/3202) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) + +**Implemented enhancements:** 
+ +- Support writing BloomFilter in arrow\_writer [\#3275](https://github.com/apache/arrow-rs/issues/3275) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support casting from unsigned numeric to Decimal256 [\#3272](https://github.com/apache/arrow-rs/issues/3272) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support casting from Decimal256 to float types [\#3266](https://github.com/apache/arrow-rs/issues/3266) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make arithmetic kernels supports DictionaryArray of DecimalType [\#3254](https://github.com/apache/arrow-rs/issues/3254) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Casting from Decimal256 to unsigned numeric [\#3239](https://github.com/apache/arrow-rs/issues/3239) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- precision is not considered when cast value to decimal [\#3223](https://github.com/apache/arrow-rs/issues/3223) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use RegexSet in arrow\_csv::infer\_field\_schema [\#3211](https://github.com/apache/arrow-rs/issues/3211) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement FlightSQL Client [\#3206](https://github.com/apache/arrow-rs/issues/3206) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Add binary\_mut and try\_binary\_mut [\#3143](https://github.com/apache/arrow-rs/issues/3143) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add try\_unary\_mut [\#3133](https://github.com/apache/arrow-rs/issues/3133) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Skip null buffer when importing FFI ArrowArray struct if no null buffer in the spec [\#3290](https://github.com/apache/arrow-rs/issues/3290) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- using ahash `compile-time-rng` kills reproducible builds [\#3271](https://github.com/apache/arrow-rs/issues/3271) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Decimal128 to Decimal256 Overflows [\#3265](https://github.com/apache/arrow-rs/issues/3265) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `nullif` panics on empty array [\#3261](https://github.com/apache/arrow-rs/issues/3261) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Some more inconsistency between can\_cast\_types and cast\_with\_options [\#3250](https://github.com/apache/arrow-rs/issues/3250) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Enable casting between Dictionary of DecimalArray and DecimalArray [\#3237](https://github.com/apache/arrow-rs/issues/3237) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- new\_null\_array Panics creating StructArray with non-nullable fields [\#3226](https://github.com/apache/arrow-rs/issues/3226) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- bool should cast from/to Float16Type as `can_cast_types` returns true [\#3221](https://github.com/apache/arrow-rs/issues/3221) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Utf8 and LargeUtf8 cannot cast from/to Float16 but can\_cast\_types returns true [\#3220](https://github.com/apache/arrow-rs/issues/3220) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Re-enable some tests in `arrow-cast` crate [\#3219](https://github.com/apache/arrow-rs/issues/3219) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Off-by-one buffer size error triggers Panic when constructing RecordBatch 
from IPC bytes \(should return an Error\) [\#3215](https://github.com/apache/arrow-rs/issues/3215) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow to and from pyarrow conversion results in changes in schema [\#3136](https://github.com/apache/arrow-rs/issues/3136) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- better document when we need `LargeUtf8` instead of `Utf8` [\#3228](https://github.com/apache/arrow-rs/issues/3228) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Use BufWriter when writing bloom filters and limit tests \(\#3318\) [\#3319](https://github.com/apache/arrow-rs/pull/3319) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Use take for dictionary like comparisons [\#3313](https://github.com/apache/arrow-rs/pull/3313) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update versions to 29.0.0 and update CHANGELOG [\#3315](https://github.com/apache/arrow-rs/pull/3315) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- refactor: Merge similar functions `ilike_scalar` and `nilike_scalar` [\#3303](https://github.com/apache/arrow-rs/pull/3303) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Split out arrow-ord \(\#2594\) [\#3299](https://github.com/apache/arrow-rs/pull/3299) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-string \(\#2594\) [\#3295](https://github.com/apache/arrow-rs/pull/3295) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Skip null buffer when importing FFI ArrowArray struct if no null buffer in the spec [\#3293](https://github.com/apache/arrow-rs/pull/3293) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Don't use dangling NonNull as sentinel [\#3289](https://github.com/apache/arrow-rs/pull/3289) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Set bloom filter on byte array [\#3284](https://github.com/apache/arrow-rs/pull/3284) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Fix ipc schema custom\_metadata serialization [\#3282](https://github.com/apache/arrow-rs/pull/3282) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Disable const-random ahash feature on non-WASM \(\#3271\) [\#3277](https://github.com/apache/arrow-rs/pull/3277) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- fix\(ffi\): handle null data buffers from empty arrays [\#3276](https://github.com/apache/arrow-rs/pull/3276) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Support casting from unsigned numeric to Decimal256 [\#3273](https://github.com/apache/arrow-rs/pull/3273) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add parquet-layout binary [\#3269](https://github.com/apache/arrow-rs/pull/3269) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Support casting from Decimal256 to float types [\#3267](https://github.com/apache/arrow-rs/pull/3267) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Simplify decimal cast logic [\#3264](https://github.com/apache/arrow-rs/pull/3264) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix panic on nullif empty array \(\#3261\) [\#3263](https://github.com/apache/arrow-rs/pull/3263) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add BooleanArray::from\_unary and BooleanArray::from\_binary [\#3258](https://github.com/apache/arrow-rs/pull/3258) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: Remove parquet build script [\#3257](https://github.com/apache/arrow-rs/pull/3257) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Make arithmetic kernels supports DictionaryArray of DecimalType [\#3255](https://github.com/apache/arrow-rs/pull/3255) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support List and LargeList in Row format \(\#3159\) [\#3251](https://github.com/apache/arrow-rs/pull/3251) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Don't recurse to children in ArrayData::try\_new [\#3248](https://github.com/apache/arrow-rs/pull/3248) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Validate dictionaries read over IPC [\#3247](https://github.com/apache/arrow-rs/pull/3247) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix MapBuilder example [\#3246](https://github.com/apache/arrow-rs/pull/3246) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Loosen nullability restrictions added in \#3205 \(\#3226\) [\#3244](https://github.com/apache/arrow-rs/pull/3244) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Better document implications of offsets \(\#3228\) [\#3243](https://github.com/apache/arrow-rs/pull/3243) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add new API to validate the precision for decimal array [\#3242](https://github.com/apache/arrow-rs/pull/3242) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Move nullif to arrow-select \(\#2594\) [\#3241](https://github.com/apache/arrow-rs/pull/3241) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Casting from Decimal256 to unsigned numeric [\#3240](https://github.com/apache/arrow-rs/pull/3240) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Enable casting between Dictionary of DecimalArray and DecimalArray [\#3238](https://github.com/apache/arrow-rs/pull/3238) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Remove unwraps from 'create\_primitive\_array' [\#3232](https://github.com/apache/arrow-rs/pull/3232) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([aarashy](https://github.com/aarashy)) +- Fix CI build by upgrading tonic-build to 0.8.4 [\#3231](https://github.com/apache/arrow-rs/pull/3231) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) +- Remove negative scale check [\#3230](https://github.com/apache/arrow-rs/pull/3230) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update prost-build requirement from =0.11.2 to =0.11.3 [\#3225](https://github.com/apache/arrow-rs/pull/3225) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Get the round result for decimal to a decimal with smaller scale [\#3224](https://github.com/apache/arrow-rs/pull/3224) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Move tests which require chrono-tz feature from `arrow-cast` to `arrow` [\#3222](https://github.com/apache/arrow-rs/pull/3222) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- add test cases for extracing week with/without timezone [\#3218](https://github.com/apache/arrow-rs/pull/3218) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- Use RegexSet for matching DataType [\#3217](https://github.com/apache/arrow-rs/pull/3217) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Update tonic-build to 0.8.3 [\#3214](https://github.com/apache/arrow-rs/pull/3214) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Support StructArray in Row Format \(\#3159\) [\#3212](https://github.com/apache/arrow-rs/pull/3212) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Infer timestamps from CSV files [\#3209](https://github.com/apache/arrow-rs/pull/3209) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- fix bug: cast decimal256 to other decimal with no-safe [\#3208](https://github.com/apache/arrow-rs/pull/3208) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- FlightSQL Client & integration test [\#3207](https://github.com/apache/arrow-rs/pull/3207) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Ensure StructArrays check nullability of fields [\#3205](https://github.com/apache/arrow-rs/pull/3205) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Remove special case ArrayData equality for decimals [\#3204](https://github.com/apache/arrow-rs/pull/3204) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add a cast test case for decimal negative scale [\#3203](https://github.com/apache/arrow-rs/pull/3203) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Move zip and shift kernels to arrow-select [\#3201](https://github.com/apache/arrow-rs/pull/3201) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Deprecate limit kernel [\#3200](https://github.com/apache/arrow-rs/pull/3200) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use SlicesIterator for ArrayData Equality [\#3198](https://github.com/apache/arrow-rs/pull/3198) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add \_dyn kernels of like, ilike, nlike, nilike kernels for dictionary support [\#3197](https://github.com/apache/arrow-rs/pull/3197) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Adding scalar nlike\_dyn, ilike\_dyn, nilike\_dyn kernels [\#3195](https://github.com/apache/arrow-rs/pull/3195) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Use self capture in DataType [\#3190](https://github.com/apache/arrow-rs/pull/3190) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- To pyarrow with schema [\#3188](https://github.com/apache/arrow-rs/pull/3188) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([doki23](https://github.com/doki23)) +- Support Duration in array\_value\_to\_string [\#3183](https://github.com/apache/arrow-rs/pull/3183) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Support `FixedSizeBinary` in Row format [\#3182](https://github.com/apache/arrow-rs/pull/3182) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add binary\_mut and try\_binary\_mut [\#3144](https://github.com/apache/arrow-rs/pull/3144) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add try\_unary\_mut [\#3134](https://github.com/apache/arrow-rs/pull/3134) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) ## [28.0.0](https://github.com/apache/arrow-rs/tree/28.0.0) (2022-11-25) [Full Changelog](https://github.com/apache/arrow-rs/compare/27.0.0...28.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index b346183712c1..1278d52c78b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,107 +19,76 @@ # Changelog -## [29.0.0](https://github.com/apache/arrow-rs/tree/29.0.0) (2022-12-09) +## [30.0.0](https://github.com/apache/arrow-rs/tree/30.0.0) (2022-12-29) -[Full Changelog](https://github.com/apache/arrow-rs/compare/28.0.0...29.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/29.0.0...30.0.0) **Breaking changes:** -- Minor: Allow `Field::new` and `Field::new_with_dict` to take existing `String` as well as `&str` [\#3288](https://github.com/apache/arrow-rs/pull/3288) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- update `&Option` to `Option<&T>` [\#3249](https://github.com/apache/arrow-rs/pull/3249) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jimexist](https://github.com/Jimexist)) -- Hide `*_dict_scalar` kernels behind `*_dyn` kernels [\#3202](https://github.com/apache/arrow-rs/pull/3202) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Infer Parquet JSON Logical and Converted Type as UTF-8 [\#3376](https://github.com/apache/arrow-rs/pull/3376) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Use custom Any instead of prost\_types [\#3360](https://github.com/apache/arrow-rs/pull/3360) 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Use bytes in arrow-flight [\#3359](https://github.com/apache/arrow-rs/pull/3359) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Support writing BloomFilter in arrow\_writer [\#3275](https://github.com/apache/arrow-rs/issues/3275) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Support casting from unsigned numeric to Decimal256 [\#3272](https://github.com/apache/arrow-rs/issues/3272) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support casting from Decimal256 to float types [\#3266](https://github.com/apache/arrow-rs/issues/3266) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Make arithmetic kernels supports DictionaryArray of DecimalType [\#3254](https://github.com/apache/arrow-rs/issues/3254) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Casting from Decimal256 to unsigned numeric [\#3239](https://github.com/apache/arrow-rs/issues/3239) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- precision is not considered when cast value to decimal [\#3223](https://github.com/apache/arrow-rs/issues/3223) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use RegexSet in arrow\_csv::infer\_field\_schema [\#3211](https://github.com/apache/arrow-rs/issues/3211) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement FlightSQL Client [\#3206](https://github.com/apache/arrow-rs/issues/3206) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Add binary\_mut and try\_binary\_mut [\#3143](https://github.com/apache/arrow-rs/issues/3143) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add try\_unary\_mut [\#3133](https://github.com/apache/arrow-rs/issues/3133) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add derived implementations of Clone and Debug for `ParquetObjectReader` [\#3381](https://github.com/apache/arrow-rs/issues/3381) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Speed up TrackedWrite [\#3366](https://github.com/apache/arrow-rs/issues/3366) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Is it possible for ArrowWriter to write key\_value\_metadata after write all records [\#3356](https://github.com/apache/arrow-rs/issues/3356) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add UnionArray test to arrow-pyarrow integration test [\#3346](https://github.com/apache/arrow-rs/issues/3346) +- Document / Deprecate arrow\_flight::utils::flight\_data\_from\_arrow\_batch [\#3312](https://github.com/apache/arrow-rs/issues/3312) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- \[FlightSQL\] Support HTTPs [\#3309](https://github.com/apache/arrow-rs/issues/3309) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support UnionArray in ffi [\#3304](https://github.com/apache/arrow-rs/issues/3304) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support for Azure Data Lake Storage Gen2 \(aka: ADLS Gen2\) in Object Store library [\#3283](https://github.com/apache/arrow-rs/issues/3283) +- Support casting from String to Decimal [\#3280](https://github.com/apache/arrow-rs/issues/3280) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Allow ArrowCSV 
writer to control the display of NULL values [\#3268](https://github.com/apache/arrow-rs/issues/3268) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- Skip null buffer when importing FFI ArrowArray struct if no null buffer in the spec [\#3290](https://github.com/apache/arrow-rs/issues/3290) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- using ahash `compile-time-rng` kills reproducible builds [\#3271](https://github.com/apache/arrow-rs/issues/3271) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Decimal128 to Decimal256 Overflows [\#3265](https://github.com/apache/arrow-rs/issues/3265) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `nullif` panics on empty array [\#3261](https://github.com/apache/arrow-rs/issues/3261) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Some more inconsistency between can\_cast\_types and cast\_with\_options [\#3250](https://github.com/apache/arrow-rs/issues/3250) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Enable casting between Dictionary of DecimalArray and DecimalArray [\#3237](https://github.com/apache/arrow-rs/issues/3237) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- new\_null\_array Panics creating StructArray with non-nullable fields [\#3226](https://github.com/apache/arrow-rs/issues/3226) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- bool should cast from/to Float16Type as `can_cast_types` returns true [\#3221](https://github.com/apache/arrow-rs/issues/3221) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Utf8 and LargeUtf8 cannot cast from/to Float16 but can\_cast\_types returns true [\#3220](https://github.com/apache/arrow-rs/issues/3220) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Re-enable some tests in `arrow-cast` crate [\#3219](https://github.com/apache/arrow-rs/issues/3219) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Off-by-one buffer size error triggers Panic when constructing RecordBatch from IPC bytes \(should return an Error\) [\#3215](https://github.com/apache/arrow-rs/issues/3215) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- arrow to and from pyarrow conversion results in changes in schema [\#3136](https://github.com/apache/arrow-rs/issues/3136) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- FlightSQL example is broken [\#3386](https://github.com/apache/arrow-rs/issues/3386) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- CSV Reader Bounds Incorrectly Handles Header [\#3364](https://github.com/apache/arrow-rs/issues/3364) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Incorrect output string from `try_to_type` [\#3350](https://github.com/apache/arrow-rs/issues/3350) +- Decimal arithmetic computation fails to run because decimal type equality [\#3344](https://github.com/apache/arrow-rs/issues/3344) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Pretty print not implemented for Map [\#3322](https://github.com/apache/arrow-rs/issues/3322) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- ILIKE Kernels Inconsistent Case Folding [\#3311](https://github.com/apache/arrow-rs/issues/3311) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Documentation updates:** -- better document when we need `LargeUtf8` instead of `Utf8` [\#3228](https://github.com/apache/arrow-rs/issues/3228) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- minor: 
Improve arrow-flight docs [\#3372](https://github.com/apache/arrow-rs/pull/3372) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) **Merged pull requests:** -- Use BufWriter when writing bloom filters and limit tests \(\#3318\) [\#3319](https://github.com/apache/arrow-rs/pull/3319) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Use take for dictionary like comparisons [\#3313](https://github.com/apache/arrow-rs/pull/3313) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update versions to 29.0.0 and update CHANGELOG [\#3315](https://github.com/apache/arrow-rs/pull/3315) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- refactor: Merge similar functions `ilike_scalar` and `nilike_scalar` [\#3303](https://github.com/apache/arrow-rs/pull/3303) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Split out arrow-ord \(\#2594\) [\#3299](https://github.com/apache/arrow-rs/pull/3299) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Split out arrow-string \(\#2594\) [\#3295](https://github.com/apache/arrow-rs/pull/3295) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Skip null buffer when importing FFI ArrowArray struct if no null buffer in the spec [\#3293](https://github.com/apache/arrow-rs/pull/3293) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Don't use dangling NonNull as sentinel [\#3289](https://github.com/apache/arrow-rs/pull/3289) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Set bloom filter on byte array [\#3284](https://github.com/apache/arrow-rs/pull/3284) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) -- Fix ipc schema custom\_metadata serialization [\#3282](https://github.com/apache/arrow-rs/pull/3282) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) -- Disable const-random ahash feature on non-WASM \(\#3271\) [\#3277](https://github.com/apache/arrow-rs/pull/3277) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- fix\(ffi\): handle null data buffers from empty arrays [\#3276](https://github.com/apache/arrow-rs/pull/3276) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) -- Support casting from unsigned numeric to Decimal256 [\#3273](https://github.com/apache/arrow-rs/pull/3273) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add parquet-layout binary [\#3269](https://github.com/apache/arrow-rs/pull/3269) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Support casting from Decimal256 to float types [\#3267](https://github.com/apache/arrow-rs/pull/3267) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Simplify decimal cast logic 
[\#3264](https://github.com/apache/arrow-rs/pull/3264) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix panic on nullif empty array \(\#3261\) [\#3263](https://github.com/apache/arrow-rs/pull/3263) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add BooleanArray::from\_unary and BooleanArray::from\_binary [\#3258](https://github.com/apache/arrow-rs/pull/3258) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Minor: Remove parquet build script [\#3257](https://github.com/apache/arrow-rs/pull/3257) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Make arithmetic kernels supports DictionaryArray of DecimalType [\#3255](https://github.com/apache/arrow-rs/pull/3255) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Support List and LargeList in Row format \(\#3159\) [\#3251](https://github.com/apache/arrow-rs/pull/3251) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Don't recurse to children in ArrayData::try\_new [\#3248](https://github.com/apache/arrow-rs/pull/3248) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Validate dictionaries read over IPC [\#3247](https://github.com/apache/arrow-rs/pull/3247) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix MapBuilder example [\#3246](https://github.com/apache/arrow-rs/pull/3246) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Loosen nullability restrictions added in \#3205 \(\#3226\) [\#3244](https://github.com/apache/arrow-rs/pull/3244) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Better document implications of offsets \(\#3228\) [\#3243](https://github.com/apache/arrow-rs/pull/3243) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add new API to validate the precision for decimal array [\#3242](https://github.com/apache/arrow-rs/pull/3242) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- Move nullif to arrow-select \(\#2594\) [\#3241](https://github.com/apache/arrow-rs/pull/3241) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Casting from Decimal256 to unsigned numeric [\#3240](https://github.com/apache/arrow-rs/pull/3240) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Enable casting between Dictionary of DecimalArray and DecimalArray [\#3238](https://github.com/apache/arrow-rs/pull/3238) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Remove unwraps from 'create\_primitive\_array' [\#3232](https://github.com/apache/arrow-rs/pull/3232) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([aarashy](https://github.com/aarashy)) -- Fix CI build by upgrading tonic-build to 0.8.4 [\#3231](https://github.com/apache/arrow-rs/pull/3231) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) -- Remove negative scale check 
[\#3230](https://github.com/apache/arrow-rs/pull/3230) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Update prost-build requirement from =0.11.2 to =0.11.3 [\#3225](https://github.com/apache/arrow-rs/pull/3225) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Get the round result for decimal to a decimal with smaller scale [\#3224](https://github.com/apache/arrow-rs/pull/3224) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- Move tests which require chrono-tz feature from `arrow-cast` to `arrow` [\#3222](https://github.com/apache/arrow-rs/pull/3222) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- add test cases for extracing week with/without timezone [\#3218](https://github.com/apache/arrow-rs/pull/3218) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) -- Use RegexSet for matching DataType [\#3217](https://github.com/apache/arrow-rs/pull/3217) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Update tonic-build to 0.8.3 [\#3214](https://github.com/apache/arrow-rs/pull/3214) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Support StructArray in Row Format \(\#3159\) [\#3212](https://github.com/apache/arrow-rs/pull/3212) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Infer timestamps from CSV files [\#3209](https://github.com/apache/arrow-rs/pull/3209) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) -- fix bug: cast decimal256 to other decimal with no-safe [\#3208](https://github.com/apache/arrow-rs/pull/3208) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- FlightSQL Client & integration test [\#3207](https://github.com/apache/arrow-rs/pull/3207) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([avantgardnerio](https://github.com/avantgardnerio)) -- Ensure StructArrays check nullability of fields [\#3205](https://github.com/apache/arrow-rs/pull/3205) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) -- Remove special case ArrayData equality for decimals [\#3204](https://github.com/apache/arrow-rs/pull/3204) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add a cast test case for decimal negative scale [\#3203](https://github.com/apache/arrow-rs/pull/3203) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Move zip and shift kernels to arrow-select [\#3201](https://github.com/apache/arrow-rs/pull/3201) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Deprecate limit kernel [\#3200](https://github.com/apache/arrow-rs/pull/3200) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use SlicesIterator for ArrayData Equality [\#3198](https://github.com/apache/arrow-rs/pull/3198) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add \_dyn kernels of like, ilike, nlike, nilike kernels 
for dictionary support [\#3197](https://github.com/apache/arrow-rs/pull/3197) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Adding scalar nlike\_dyn, ilike\_dyn, nilike\_dyn kernels [\#3195](https://github.com/apache/arrow-rs/pull/3195) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Use self capture in DataType [\#3190](https://github.com/apache/arrow-rs/pull/3190) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- To pyarrow with schema [\#3188](https://github.com/apache/arrow-rs/pull/3188) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([doki23](https://github.com/doki23)) -- Support Duration in array\_value\_to\_string [\#3183](https://github.com/apache/arrow-rs/pull/3183) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Support `FixedSizeBinary` in Row format [\#3182](https://github.com/apache/arrow-rs/pull/3182) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add binary\_mut and try\_binary\_mut [\#3144](https://github.com/apache/arrow-rs/pull/3144) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add try\_unary\_mut [\#3134](https://github.com/apache/arrow-rs/pull/3134) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Ends ParquetRecordBatchStream when polling on StreamState::Error [\#3404](https://github.com/apache/arrow-rs/pull/3404) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- fix clippy issues [\#3398](https://github.com/apache/arrow-rs/pull/3398) ([Jimexist](https://github.com/Jimexist)) +- Upgrade multiversion to 0.7.1 [\#3396](https://github.com/apache/arrow-rs/pull/3396) ([viirya](https://github.com/viirya)) +- Make FlightSQL Support HTTPs [\#3388](https://github.com/apache/arrow-rs/pull/3388) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) +- Fix broken FlightSQL example [\#3387](https://github.com/apache/arrow-rs/pull/3387) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) +- Update prost-build [\#3385](https://github.com/apache/arrow-rs/pull/3385) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-arith \(\#2594\) [\#3384](https://github.com/apache/arrow-rs/pull/3384) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add derive for Clone and Debug for `ParquetObjectReader` [\#3382](https://github.com/apache/arrow-rs/pull/3382) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kszlim](https://github.com/kszlim)) +- Initial Mid-level `FlightClient` [\#3378](https://github.com/apache/arrow-rs/pull/3378) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Document all features on docs.rs [\#3377](https://github.com/apache/arrow-rs/pull/3377) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-row \(\#2594\) [\#3375](https://github.com/apache/arrow-rs/pull/3375) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove unnecessary flush calls on TrackedWrite [\#3374](https://github.com/apache/arrow-rs/pull/3374) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Update proc-macro2 requirement from =1.0.47 to =1.0.49 [\#3369](https://github.com/apache/arrow-rs/pull/3369) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add CSV build\_buffered \(\#3338\) [\#3368](https://github.com/apache/arrow-rs/pull/3368) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: add append\_key\_value\_metadata [\#3367](https://github.com/apache/arrow-rs/pull/3367) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jiacai2050](https://github.com/jiacai2050)) +- Add csv-core based reader \(\#3338\) [\#3365](https://github.com/apache/arrow-rs/pull/3365) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Put BufWriter into TrackedWrite [\#3361](https://github.com/apache/arrow-rs/pull/3361) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Add CSV reader benchmark \(\#3338\) [\#3357](https://github.com/apache/arrow-rs/pull/3357) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use ArrayData::ptr\_eq in DictionaryTracker [\#3354](https://github.com/apache/arrow-rs/pull/3354) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Deprecate flight\_data\_from\_arrow\_batch [\#3353](https://github.com/apache/arrow-rs/pull/3353) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([Dandandan](https://github.com/Dandandan)) +- Fix incorrect output string from try\_to\_type [\#3351](https://github.com/apache/arrow-rs/pull/3351) ([viirya](https://github.com/viirya)) +- Fix unary\_dyn for decimal scalar arithmetic computation [\#3345](https://github.com/apache/arrow-rs/pull/3345) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add UnionArray test to arrow-pyarrow integration test [\#3343](https://github.com/apache/arrow-rs/pull/3343) ([viirya](https://github.com/viirya)) +- feat: configure null value in arrow csv writer [\#3342](https://github.com/apache/arrow-rs/pull/3342) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Optimize bulk writing of all blocks of bloom filter [\#3340](https://github.com/apache/arrow-rs/pull/3340) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Add MapArray to pretty print [\#3339](https://github.com/apache/arrow-rs/pull/3339) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Update prost-build 0.11.4 [\#3334](https://github.com/apache/arrow-rs/pull/3334) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Faster Parquet Bloom Writer [\#3333](https://github.com/apache/arrow-rs/pull/3333) ([tustvold](https://github.com/tustvold)) +- Add bloom filter benchmark for parquet writer [\#3323](https://github.com/apache/arrow-rs/pull/3323) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Add ASCII fast path for ILIKE scalar \(90% faster\) [\#3306](https://github.com/apache/arrow-rs/pull/3306) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support UnionArray in ffi [\#3305](https://github.com/apache/arrow-rs/pull/3305) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support casting from String to Decimal [\#3281](https://github.com/apache/arrow-rs/pull/3281) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index db85c2a6b978..32c2043cdaa9 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-arith" -version = "29.0.0" +version = "30.0.0" description = "Arrow arithmetic kernels" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "29.0.0", path = "../arrow-array" } -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } -arrow-data = { version = "29.0.0", path = "../arrow-data" } -arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +arrow-array = { version = "30.0.0", path = "../arrow-array" } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-data = { version = "30.0.0", path = "../arrow-data" } +arrow-schema = { version = "30.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false } half = { version = "2.1", default-features = false } multiversion = { version = "0.7.1", default-features = false } diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index e8b2762b4f71..06a84cad4e7a 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-array" -version = "29.0.0" +version = "30.0.0" description = "Array abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,9 +45,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "29.0.0", path = "../arrow-schema" } -arrow-data = { version = "29.0.0", path = "../arrow-data" } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "30.0.0", path = "../arrow-schema" } +arrow-data = { version = "30.0.0", path = "../arrow-data" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index 99ecf2a85893..2df41b537f5f 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "29.0.0" +version = "30.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index e54139d5a9b5..c94a5b77a045 100644 --- a/arrow-cast/Cargo.toml +++ 
b/arrow-cast/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-cast" -version = "29.0.0" +version = "30.0.0" description = "Cast kernel and utilities for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "29.0.0", path = "../arrow-array" } -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } -arrow-data = { version = "29.0.0", path = "../arrow-data" } -arrow-schema = { version = "29.0.0", path = "../arrow-schema" } -arrow-select = { version = "29.0.0", path = "../arrow-select" } +arrow-array = { version = "30.0.0", path = "../arrow-array" } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-data = { version = "30.0.0", path = "../arrow-data" } +arrow-schema = { version = "30.0.0", path = "../arrow-schema" } +arrow-select = { version = "30.0.0", path = "../arrow-select" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index d02e599b31e1..0b6e6035c0b5 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-csv" -version = "29.0.0" +version = "30.0.0" description = "Support for parsing CSV format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "29.0.0", path = "../arrow-array" } -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "29.0.0", path = "../arrow-cast" } -arrow-data = { version = "29.0.0", path = "../arrow-data" } -arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +arrow-array = { version = "30.0.0", path = "../arrow-array" } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "30.0.0", path = "../arrow-cast" } +arrow-data = { version = "30.0.0", path = "../arrow-data" } +arrow-schema = { version = "30.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } csv-core = { version = "0.1"} diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index abe7aa63990b..9df8bd0b1027 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-data" -version = "29.0.0" +version = "30.0.0" description = "Array data abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,8 +45,8 @@ force_validate = [] [dependencies] -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "30.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index e4a977b653f6..4fc0e0d91435 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ 
[package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "29.0.0" +version = "30.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,10 +27,10 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow-array = { version = "29.0.0", path = "../arrow-array" } -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } -arrow-ipc = { version = "29.0.0", path = "../arrow-ipc" } -arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +arrow-array = { version = "30.0.0", path = "../arrow-array" } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-ipc = { version = "30.0.0", path = "../arrow-ipc" } +arrow-schema = { version = "30.0.0", path = "../arrow-schema" } base64 = { version = "0.20", default-features = false, features = ["std"] } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } @@ -48,7 +48,7 @@ flight-sql-experimental = [] tls = ["tonic/tls"] [dev-dependencies] -arrow = { version = "29.0.0", path = "../arrow", features = ["prettyprint"] } +arrow = { version = "30.0.0", path = "../arrow", features = ["prettyprint"] } tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } tower = "0.4.13" diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 76b990b0163f..df3a0839532d 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "29.0.0" +arrow-flight = "30.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. 
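For readers following the series, here is a minimal client-side sketch of what "exchanging Arrow data between processes" looks like with this crate. It is illustrative only and not part of the patch: the server address and ticket value are placeholders, it assumes a tokio runtime with the `macros` feature, and it sticks to the generated low-level `FlightServiceClient` and raw `FlightData` stream whose definitions appear earlier in this series.

```rust
use arrow_flight::flight_service_client::FlightServiceClient;
use arrow_flight::Ticket;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Placeholder endpoint; assumes a Flight server is listening here.
    let mut client = FlightServiceClient::connect("http://localhost:50051").await?;

    // An opaque, application-defined ticket identifying the desired data.
    let ticket = Ticket {
        ticket: "example-ticket".into(),
    };

    // DoGet returns a gRPC stream of FlightData messages
    // (typically a schema message followed by record batches).
    let mut stream = client.do_get(ticket).await?.into_inner();
    while let Some(flight_data) = stream.message().await? {
        println!(
            "received FlightData with {} body bytes",
            flight_data.data_body.len()
        );
    }
    Ok(())
}
```

The mid-level `FlightClient::do_get` added in this series performs this same call and feeds the resulting stream through `FlightRecordBatchStream`, so applications receive decoded `RecordBatch`es instead of raw `FlightData` messages.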
diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index 720b47c2a5d3..df28ffce5d4a 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-integration-test" -version = "29.0.0" +version = "30.0.0" description = "Support for the Apache Arrow JSON test data format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,8 +38,8 @@ path = "src/lib.rs" bench = false [dependencies] -arrow = { version = "29.0.0", path = "../arrow", default-features = false } -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } +arrow = { version = "30.0.0", path = "../arrow", default-features = false } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index fcbe96d73b4d..045931867726 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests (NOT PUBLISHED TO crates.io)" -version = "29.0.0" +version = "30.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index 55cc467bbd44..31ae90929a7e 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ipc" -version = "29.0.0" +version = "30.0.0" description = "Support for the Arrow IPC format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "29.0.0", path = "../arrow-array" } -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "29.0.0", path = "../arrow-cast" } -arrow-data = { version = "29.0.0", path = "../arrow-data" } -arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +arrow-array = { version = "30.0.0", path = "../arrow-array" } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "30.0.0", path = "../arrow-cast" } +arrow-data = { version = "30.0.0", path = "../arrow-data" } +arrow-schema = { version = "30.0.0", path = "../arrow-schema" } flatbuffers = { version = "22.9.2", default-features = false, features = ["thiserror"] } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.12.0", default-features = false, optional = true } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 0c6dea0df7cf..88323105d0fe 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-json" -version = "29.0.0" +version = "30.0.0" description = "Support for parsing JSON format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "29.0.0", path = "../arrow-array" } -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "29.0.0", path = 
"../arrow-cast" } -arrow-data = { version = "29.0.0", path = "../arrow-data" } -arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +arrow-array = { version = "30.0.0", path = "../arrow-array" } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "30.0.0", path = "../arrow-cast" } +arrow-data = { version = "30.0.0", path = "../arrow-data" } +arrow-schema = { version = "30.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index c07e6ae38455..3a2096ea5651 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ord" -version = "29.0.0" +version = "30.0.0" description = "Ordering kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "29.0.0", path = "../arrow-array" } -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } -arrow-data = { version = "29.0.0", path = "../arrow-data" } -arrow-schema = { version = "29.0.0", path = "../arrow-schema" } -arrow-select = { version = "29.0.0", path = "../arrow-select" } +arrow-array = { version = "30.0.0", path = "../arrow-array" } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-data = { version = "30.0.0", path = "../arrow-data" } +arrow-schema = { version = "30.0.0", path = "../arrow-schema" } +arrow-select = { version = "30.0.0", path = "../arrow-select" } num = { version = "0.4", default-features = false, features = ["std"] } [dev-dependencies] diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 3d5a16bfa4d0..9a1daad36849 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "29.0.0" +version = "30.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "29.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "30.0.0", features = ["pyarrow"] } pyo3 = { version = "0.17", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 4741c9d5840e..99b1eb150720 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-row" -version = "29.0.0" +version = "30.0.0" description = "Arrow row format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,17 +44,17 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "29.0.0", path = "../arrow-array" } -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } -arrow-data = { version = "29.0.0", path = "../arrow-data" } -arrow-schema = { version = "29.0.0", path = "../arrow-schema" } +arrow-array = { version = "30.0.0", path = 
"../arrow-array" } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-data = { version = "30.0.0", path = "../arrow-data" } +arrow-schema = { version = "30.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } hashbrown = { version = "0.13", default-features = false } [dev-dependencies] -arrow-cast = { version = "29.0.0", path = "../arrow-cast" } -arrow-ord = { version = "29.0.0", path = "../arrow-ord" } +arrow-cast = { version = "30.0.0", path = "../arrow-cast" } +arrow-ord = { version = "30.0.0", path = "../arrow-ord" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } [features] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index cb03c97b7e06..d6fddd1e0916 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-schema" -version = "29.0.0" +version = "30.0.0" description = "Defines the logical types for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml index ec7a90fe6ce0..a609f72a69c0 100644 --- a/arrow-select/Cargo.toml +++ b/arrow-select/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-select" -version = "29.0.0" +version = "30.0.0" description = "Selection kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } -arrow-data = { version = "29.0.0", path = "../arrow-data" } -arrow-schema = { version = "29.0.0", path = "../arrow-schema" } -arrow-array = { version = "29.0.0", path = "../arrow-array" } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-data = { version = "30.0.0", path = "../arrow-data" } +arrow-schema = { version = "30.0.0", path = "../arrow-schema" } +arrow-array = { version = "30.0.0", path = "../arrow-array" } num = { version = "0.4", default-features = false, features = ["std"] } [features] diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index 0bb23fd8e90f..c6cc2f2a32dc 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-string" -version = "29.0.0" +version = "30.0.0" description = "String kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } -arrow-data = { version = "29.0.0", path = "../arrow-data" } -arrow-schema = { version = "29.0.0", path = "../arrow-schema" } -arrow-array = { version = "29.0.0", path = "../arrow-array" } -arrow-select = { version = "29.0.0", path = "../arrow-select" } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-data = { version = "30.0.0", path = "../arrow-data" } +arrow-schema = { version = "30.0.0", path = "../arrow-schema" } +arrow-array = { version = "30.0.0", path = "../arrow-array" } +arrow-select = { version = "30.0.0", path = "../arrow-select" } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 17b02626fb05..202b4c4f40f6 100644 --- a/arrow/Cargo.toml +++ 
b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "29.0.0" +version = "30.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,19 +45,19 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-arith = { version = "29.0.0", path = "../arrow-arith" } -arrow-array = { version = "29.0.0", path = "../arrow-array" } -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "29.0.0", path = "../arrow-cast" } -arrow-csv = { version = "29.0.0", path = "../arrow-csv", optional = true } -arrow-data = { version = "29.0.0", path = "../arrow-data" } -arrow-ipc = { version = "29.0.0", path = "../arrow-ipc", optional = true } -arrow-json = { version = "29.0.0", path = "../arrow-json", optional = true } -arrow-ord = { version = "29.0.0", path = "../arrow-ord" } -arrow-row = { version = "29.0.0", path = "../arrow-row" } -arrow-schema = { version = "29.0.0", path = "../arrow-schema" } -arrow-select = { version = "29.0.0", path = "../arrow-select" } -arrow-string = { version = "29.0.0", path = "../arrow-string" } +arrow-arith = { version = "30.0.0", path = "../arrow-arith" } +arrow-array = { version = "30.0.0", path = "../arrow-array" } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "30.0.0", path = "../arrow-cast" } +arrow-csv = { version = "30.0.0", path = "../arrow-csv", optional = true } +arrow-data = { version = "30.0.0", path = "../arrow-data" } +arrow-ipc = { version = "30.0.0", path = "../arrow-ipc", optional = true } +arrow-json = { version = "30.0.0", path = "../arrow-json", optional = true } +arrow-ord = { version = "30.0.0", path = "../arrow-ord" } +arrow-row = { version = "30.0.0", path = "../arrow-row" } +arrow-schema = { version = "30.0.0", path = "../arrow-schema" } +arrow-select = { version = "30.0.0", path = "../arrow-select" } +arrow-string = { version = "30.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } comfy-table = { version = "6.0", optional = true, default-features = false } diff --git a/arrow/README.md b/arrow/README.md index 4d2f8e303b8d..441e65ac3fc3 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `29.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `30.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. 
## Feature Flags diff --git a/dev/release/README.md b/dev/release/README.md index 81f219034e1c..fd7fa8c82adf 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. -sed -i '' -e 's/14.0.0/29.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/30.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index ef7034bbdde0..ef87d20f4c30 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="28.0.0" -FUTURE_RELEASE="29.0.0" +SINCE_TAG="29.0.0" +FUTURE_RELEASE="30.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 22dbc7e22cf9..6ee83a2c43de 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "29.0.0" +version = "30.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -35,14 +35,14 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "29.0.0", path = "../arrow-array", default-features = false, optional = true } -arrow-buffer = { version = "29.0.0", path = "../arrow-buffer", default-features = false, optional = true } -arrow-cast = { version = "29.0.0", path = "../arrow-cast", default-features = false, optional = true } -arrow-csv = { version = "29.0.0", path = "../arrow-csv", default-features = false, optional = true } -arrow-data = { version = "29.0.0", path = "../arrow-data", default-features = false, optional = true } -arrow-schema = { version = "29.0.0", path = "../arrow-schema", default-features = false, optional = true } -arrow-select = { version = "29.0.0", path = "../arrow-select", default-features = false, optional = true } -arrow-ipc = { version = "29.0.0", path = "../arrow-ipc", default-features = false, optional = true } +arrow-array = { version = "30.0.0", path = "../arrow-array", default-features = false, optional = true } +arrow-buffer = { version = "30.0.0", path = "../arrow-buffer", default-features = false, optional = true } +arrow-cast = { version = "30.0.0", path = "../arrow-cast", default-features = false, optional = true } +arrow-csv = { version = "30.0.0", path = "../arrow-csv", default-features = false, optional = true } +arrow-data = { version = "30.0.0", path = "../arrow-data", default-features = false, optional = true } +arrow-schema = { version = "30.0.0", path = "../arrow-schema", default-features = false, optional = true } +arrow-select = { version = "30.0.0", path = "../arrow-select", default-features = false, optional = true } +arrow-ipc = { version = "30.0.0", path = "../arrow-ipc", default-features = false, optional = true } object_store = { version = "0.5", path = "../object_store", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } @@ -76,7 +76,7 @@ flate2 = { version = "1.0", default-features = false, features = 
["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.12", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "29.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "30.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index c704b3457769..a22503bcf295 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "29.0.0" +version = "30.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", features = ["extra-traits"] } -parquet = { path = "../parquet", version = "29.0.0", default-features = false } +parquet = { path = "../parquet", version = "30.0.0", default-features = false } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index e4debae0bda1..b10672b84a57 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "29.0.0" -parquet_derive = "29.0.0" +parquet = "30.0.0" +parquet_derive = "30.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index 1167f128f865..59cfef593899 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "29.0.0" +version = "30.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "29.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "29.0.0", default-features = false } +parquet = { path = "../parquet", version = "30.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "30.0.0", default-features = false } chrono = { version="0.4.23", default-features = false, features = [ "clock" ] } From 9b5762181f63f0dc77e80414f1761e4819f43287 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 29 Dec 2022 12:23:31 -0500 Subject: [PATCH 0453/1411] Final release ntoes (#3409) --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1278d52c78b8..0d39458d967a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,7 @@ **Merged pull requests:** +- Version 30.0.0 release notes and changelog [\#3406](https://github.com/apache/arrow-rs/pull/3406) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) - Ends ParquetRecordBatchStream when polling on StreamState::Error 
[\#3404](https://github.com/apache/arrow-rs/pull/3404) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) - fix clippy issues [\#3398](https://github.com/apache/arrow-rs/pull/3398) ([Jimexist](https://github.com/Jimexist)) - Upgrade multiversion to 0.7.1 [\#3396](https://github.com/apache/arrow-rs/pull/3396) ([viirya](https://github.com/viirya)) @@ -89,6 +90,7 @@ - Add ASCII fast path for ILIKE scalar \(90% faster\) [\#3306](https://github.com/apache/arrow-rs/pull/3306) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Support UnionArray in ffi [\#3305](https://github.com/apache/arrow-rs/pull/3305) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) - Support casting from String to Decimal [\#3281](https://github.com/apache/arrow-rs/pull/3281) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- add more integration test for parquet bloom filter round trip tests [\#3210](https://github.com/apache/arrow-rs/pull/3210) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) From 9398af67d5d6530616fd5e687ba781299d74bfd4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 30 Dec 2022 07:29:34 -0500 Subject: [PATCH 0454/1411] Update verify_release.sh to work with 30.0.0 (#3413) --- dev/release/verify-release-candidate.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 98c582c2e178..c42391222fce 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -118,6 +118,11 @@ test_source_distribution() { (cd arrow && cargo build && cargo test) (cd arrow-flight && cargo build && cargo test) + # To avoid https://github.com/apache/arrow-rs/issues/3410, + # remove path reference from parquet: + # object_store = { version = "0.5", path = "../object_store", default-features = false, optional = true } + # object_store = { version = "0.5", default-features = false, optional = true } + sed -i -e 's/\(^object_store.*\)\(path = ".*", \)/\1/g' parquet/Cargo.toml (cd parquet && cargo build && cargo test) (cd parquet_derive && cargo build && cargo test) From dc09b0b426cb4d4c9d4bf0a112668256565c25cd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 31 Dec 2022 07:58:01 -0500 Subject: [PATCH 0455/1411] Implement `RecordBatch` <--> `FlightData` encode/decode + tests (#3391) * Implement `RecordBatch` <--> `FlightData` encode/decode + tests * fix comment * Update arrow-flight/src/encode.rs Co-authored-by: Liang-Chi Hsieh * Add test encoding error * Add test for chained streams * Add mismatched schema and data test * Add new test * more tests * Apply suggestions from code review Co-authored-by: Liang-Chi Hsieh Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Add From ArrowError impl for FlightError * Correct make_dictionary_batch and add tests * do not take * Make dictionary massaging non pub * Add comment about memory size and make split function non pub * explicitly return early from encode stream * fix doc link Co-authored-by: Liang-Chi Hsieh Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-flight/Cargo.toml | 2 + arrow-flight/src/client.rs | 325 +----------------- arrow-flight/src/decode.rs | 396 +++++++++++++++++++++ arrow-flight/src/encode.rs | 511 
++++++++++++++++++++++++++++ arrow-flight/src/error.rs | 25 ++ arrow-flight/src/lib.rs | 10 +- arrow-flight/tests/client.rs | 92 ++++- arrow-flight/tests/common/server.rs | 46 ++- arrow-flight/tests/encode_decode.rs | 453 ++++++++++++++++++++++++ arrow-ipc/src/reader.rs | 6 + arrow-ipc/src/writer.rs | 5 +- 11 files changed, 1552 insertions(+), 319 deletions(-) create mode 100644 arrow-flight/src/decode.rs create mode 100644 arrow-flight/src/encode.rs create mode 100644 arrow-flight/tests/encode_decode.rs diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 4fc0e0d91435..1664004bdff3 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -29,6 +29,8 @@ license = "Apache-2.0" [dependencies] arrow-array = { version = "30.0.0", path = "../arrow-array" } arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +# Cast is needed to work around https://github.com/apache/arrow-rs/issues/3389 +arrow-cast = { version = "30.0.0", path = "../arrow-cast" } arrow-ipc = { version = "30.0.0", path = "../arrow-ipc" } arrow-schema = { version = "30.0.0", path = "../arrow-schema" } base64 = { version = "0.20", default-features = false, features = ["std"] } diff --git a/arrow-flight/src/client.rs b/arrow-flight/src/client.rs index 0e75ac7c0c7f..753c40f2a5c1 100644 --- a/arrow-flight/src/client.rs +++ b/arrow-flight/src/client.rs @@ -16,15 +16,12 @@ // under the License. use crate::{ - flight_service_client::FlightServiceClient, utils::flight_data_to_arrow_batch, - FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, Ticket, + decode::FlightRecordBatchStream, flight_service_client::FlightServiceClient, + FlightDescriptor, FlightInfo, HandshakeRequest, Ticket, }; -use arrow_array::{ArrayRef, RecordBatch}; -use arrow_schema::Schema; use bytes::Bytes; -use futures::{future::ready, ready, stream, StreamExt}; -use std::{collections::HashMap, convert::TryFrom, pin::Pin, sync::Arc, task::Poll}; -use tonic::{metadata::MetadataMap, transport::Channel, Streaming}; +use futures::{future::ready, stream, StreamExt, TryStreamExt}; +use tonic::{metadata::MetadataMap, transport::Channel}; use crate::error::{FlightError, Result}; @@ -161,7 +158,7 @@ impl FlightClient { /// Make a `DoGet` call to the server with the provided ticket, /// returning a [`FlightRecordBatchStream`] for reading - /// [`RecordBatch`]es. + /// [`RecordBatch`](arrow_array::RecordBatch)es. /// /// # Example: /// ```no_run @@ -197,10 +194,17 @@ impl FlightClient { pub async fn do_get(&mut self, ticket: Ticket) -> Result { let request = self.make_request(ticket); - let response = self.inner.do_get(request).await?.into_inner(); - - let flight_data_stream = FlightDataStream::new(response); - Ok(FlightRecordBatchStream::new(flight_data_stream)) + let response_stream = self + .inner + .do_get(request) + .await? + .into_inner() + // convert to FlightError + .map_err(|e| e.into()); + + Ok(FlightRecordBatchStream::new_from_flight_data( + response_stream, + )) } /// Make a `GetFlightInfo` call to the server with the provided @@ -268,300 +272,3 @@ impl FlightClient { request } } - -/// A stream of [`RecordBatch`]es from from an Arrow Flight server. -/// -/// To access the lower level Flight messages directly, consider -/// calling [`Self::into_inner`] and using the [`FlightDataStream`] -/// directly. 
-#[derive(Debug)] -pub struct FlightRecordBatchStream { - inner: FlightDataStream, - got_schema: bool, -} - -impl FlightRecordBatchStream { - pub fn new(inner: FlightDataStream) -> Self { - Self { - inner, - got_schema: false, - } - } - - /// Has a message defining the schema been received yet? - pub fn got_schema(&self) -> bool { - self.got_schema - } - - /// Consume self and return the wrapped [`FlightDataStream`] - pub fn into_inner(self) -> FlightDataStream { - self.inner - } -} -impl futures::Stream for FlightRecordBatchStream { - type Item = Result; - - /// Returns the next [`RecordBatch`] available in this stream, or `None` if - /// there are no further results available. - fn poll_next( - mut self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll>> { - loop { - let res = ready!(self.inner.poll_next_unpin(cx)); - match res { - // Inner exhausted - None => { - return Poll::Ready(None); - } - Some(Err(e)) => { - return Poll::Ready(Some(Err(e))); - } - // translate data - Some(Ok(data)) => match data.payload { - DecodedPayload::Schema(_) if self.got_schema => { - return Poll::Ready(Some(Err(FlightError::protocol( - "Unexpectedly saw multiple Schema messages in FlightData stream", - )))); - } - DecodedPayload::Schema(_) => { - self.got_schema = true; - // Need next message, poll inner again - } - DecodedPayload::RecordBatch(batch) => { - return Poll::Ready(Some(Ok(batch))); - } - DecodedPayload::None => { - // Need next message - } - }, - } - } - } -} - -/// Wrapper around a stream of [`FlightData`] that handles the details -/// of decoding low level Flight messages into [`Schema`] and -/// [`RecordBatch`]es, including details such as dictionaries. -/// -/// # Protocol Details -/// -/// The client handles flight messages as followes: -/// -/// - **None:** This message has no effect. This is useful to -/// transmit metadata without any actual payload. -/// -/// - **Schema:** The schema is (re-)set. Dictionaries are cleared and -/// the decoded schema is returned. -/// -/// - **Dictionary Batch:** A new dictionary for a given column is registered. An existing -/// dictionary for the same column will be overwritten. This -/// message is NOT visible. -/// -/// - **Record Batch:** Record batch is created based on the current -/// schema and dictionaries. This fails if no schema was transmitted -/// yet. -/// -/// All other message types (at the time of writing: e.g. tensor and -/// sparse tensor) lead to an error. -/// -/// Example usecases -/// -/// 1. Using this low level stream it is possible to receive a steam -/// of RecordBatches in FlightData that have different schemas by -/// handling multiple schema messages separately. -#[derive(Debug)] -pub struct FlightDataStream { - /// Underlying data stream - response: Streaming, - /// Decoding state - state: Option, - /// seen the end of the inner stream? - done: bool, -} - -impl FlightDataStream { - /// Create a new wrapper around the stream of FlightData - pub fn new(response: Streaming) -> Self { - Self { - state: None, - response, - done: false, - } - } - - /// Extracts flight data from the next message, updating decoding - /// state as necessary. 
- fn extract_message(&mut self, data: FlightData) -> Result> { - use arrow_ipc::MessageHeader; - let message = arrow_ipc::root_as_message(&data.data_header[..]).map_err(|e| { - FlightError::DecodeError(format!("Error decoding root message: {e}")) - })?; - - match message.header_type() { - MessageHeader::NONE => Ok(Some(DecodedFlightData::new_none(data))), - MessageHeader::Schema => { - let schema = Schema::try_from(&data).map_err(|e| { - FlightError::DecodeError(format!("Error decoding schema: {e}")) - })?; - - let schema = Arc::new(schema); - let dictionaries_by_field = HashMap::new(); - - self.state = Some(FlightStreamState { - schema: Arc::clone(&schema), - dictionaries_by_field, - }); - Ok(Some(DecodedFlightData::new_schema(data, schema))) - } - MessageHeader::DictionaryBatch => { - let state = if let Some(state) = self.state.as_mut() { - state - } else { - return Err(FlightError::protocol( - "Received DictionaryBatch prior to Schema", - )); - }; - - let buffer: arrow_buffer::Buffer = data.data_body.into(); - let dictionary_batch = - message.header_as_dictionary_batch().ok_or_else(|| { - FlightError::protocol( - "Could not get dictionary batch from DictionaryBatch message", - ) - })?; - - arrow_ipc::reader::read_dictionary( - &buffer, - dictionary_batch, - &state.schema, - &mut state.dictionaries_by_field, - &message.version(), - ) - .map_err(|e| { - FlightError::DecodeError(format!( - "Error decoding ipc dictionary: {e}" - )) - })?; - - // Updated internal state, but no decoded message - Ok(None) - } - MessageHeader::RecordBatch => { - let state = if let Some(state) = self.state.as_ref() { - state - } else { - return Err(FlightError::protocol( - "Received RecordBatch prior to Schema", - )); - }; - - let batch = flight_data_to_arrow_batch( - &data, - Arc::clone(&state.schema), - &state.dictionaries_by_field, - ) - .map_err(|e| { - FlightError::DecodeError(format!( - "Error decoding ipc RecordBatch: {e}" - )) - })?; - - Ok(Some(DecodedFlightData::new_record_batch(data, batch))) - } - other => { - let name = other.variant_name().unwrap_or("UNKNOWN"); - Err(FlightError::protocol(format!("Unexpected message: {name}"))) - } - } - } -} - -impl futures::Stream for FlightDataStream { - type Item = Result; - /// Returns the result of decoding the next [`FlightData`] message - /// from the server, or `None` if there are no further results - /// available. - fn poll_next( - mut self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - if self.done { - return Poll::Ready(None); - } - loop { - let res = ready!(self.response.poll_next_unpin(cx)); - - return Poll::Ready(match res { - None => { - self.done = true; - None // inner is exhausted - } - Some(data) => Some(match data { - Err(e) => Err(FlightError::Tonic(e)), - Ok(data) => match self.extract_message(data) { - Ok(Some(extracted)) => Ok(extracted), - Ok(None) => continue, // Need next input message - Err(e) => Err(e), - }, - }), - }); - } - } -} - -/// tracks the state needed to reconstruct [`RecordBatch`]es from a -/// streaming flight response. 
-#[derive(Debug)] -struct FlightStreamState { - schema: Arc, - dictionaries_by_field: HashMap, -} - -/// FlightData and the decoded payload (Schema, RecordBatch), if any -#[derive(Debug)] -pub struct DecodedFlightData { - pub inner: FlightData, - pub payload: DecodedPayload, -} - -impl DecodedFlightData { - pub fn new_none(inner: FlightData) -> Self { - Self { - inner, - payload: DecodedPayload::None, - } - } - - pub fn new_schema(inner: FlightData, schema: Arc) -> Self { - Self { - inner, - payload: DecodedPayload::Schema(schema), - } - } - - pub fn new_record_batch(inner: FlightData, batch: RecordBatch) -> Self { - Self { - inner, - payload: DecodedPayload::RecordBatch(batch), - } - } - - /// return the metadata field of the inner flight data - pub fn app_metadata(&self) -> &[u8] { - &self.inner.app_metadata - } -} - -/// The result of decoding [`FlightData`] -#[derive(Debug)] -pub enum DecodedPayload { - /// None (no data was sent in the corresponding FlightData) - None, - - /// A decoded Schema message - Schema(Arc), - - /// A decoded Record batch. - RecordBatch(RecordBatch), -} diff --git a/arrow-flight/src/decode.rs b/arrow-flight/src/decode.rs new file mode 100644 index 000000000000..cab52a434897 --- /dev/null +++ b/arrow-flight/src/decode.rs @@ -0,0 +1,396 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::{utils::flight_data_to_arrow_batch, FlightData}; +use arrow_array::{ArrayRef, RecordBatch}; +use arrow_schema::Schema; +use bytes::Bytes; +use futures::{ready, stream::BoxStream, Stream, StreamExt}; +use std::{ + collections::HashMap, convert::TryFrom, fmt::Debug, pin::Pin, sync::Arc, task::Poll, +}; + +use crate::error::{FlightError, Result}; + +/// Decodes a [Stream] of [`FlightData`] back into +/// [`RecordBatch`]es. This can be used to decode the response from an +/// Arrow Flight server +/// +/// # Note +/// To access the lower level Flight messages (e.g. to access +/// [`FlightData::app_metadata`]), you can call [`Self::into_inner`] +/// and use the [`FlightDataDecoder`] directly. +/// +/// # Example: +/// ```no_run +/// # async fn f() -> Result<(), arrow_flight::error::FlightError>{ +/// # use bytes::Bytes; +/// // make a do_get request +/// use arrow_flight::{ +/// error::Result, +/// decode::FlightRecordBatchStream, +/// Ticket, +/// flight_service_client::FlightServiceClient +/// }; +/// use tonic::transport::Channel; +/// use futures::stream::{StreamExt, TryStreamExt}; +/// +/// let client: FlightServiceClient = // make client.. +/// # unimplemented!(); +/// +/// let request = tonic::Request::new( +/// Ticket { ticket: Bytes::new() } +/// ); +/// +/// // Get a stream of FlightData; +/// let flight_data_stream = client +/// .do_get(request) +/// .await? 
+/// .into_inner(); +/// +/// // Decode stream of FlightData to RecordBatches +/// let record_batch_stream = FlightRecordBatchStream::new_from_flight_data( +/// // convert tonic::Status to FlightError +/// flight_data_stream.map_err(|e| e.into()) +/// ); +/// +/// // Read back RecordBatches +/// while let Some(batch) = record_batch_stream.next().await { +/// match batch { +/// Ok(batch) => { /* process batch */ }, +/// Err(e) => { /* handle error */ }, +/// }; +/// } +/// +/// # Ok(()) +/// # } +/// ``` +#[derive(Debug)] +pub struct FlightRecordBatchStream { + inner: FlightDataDecoder, + got_schema: bool, +} + +impl FlightRecordBatchStream { + /// Create a new [`FlightRecordBatchStream`] from a decoded stream + pub fn new(inner: FlightDataDecoder) -> Self { + Self { + inner, + got_schema: false, + } + } + + /// Create a new [`FlightRecordBatchStream`] from a stream of [`FlightData`] + pub fn new_from_flight_data(inner: S) -> Self + where + S: Stream> + Send + 'static, + { + Self { + inner: FlightDataDecoder::new(inner), + got_schema: false, + } + } + + /// Has a message defining the schema been received yet? + pub fn got_schema(&self) -> bool { + self.got_schema + } + + /// Consume self and return the wrapped [`FlightDataDecoder`] + pub fn into_inner(self) -> FlightDataDecoder { + self.inner + } +} +impl futures::Stream for FlightRecordBatchStream { + type Item = Result; + + /// Returns the next [`RecordBatch`] available in this stream, or `None` if + /// there are no further results available. + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll>> { + loop { + let res = ready!(self.inner.poll_next_unpin(cx)); + match res { + // Inner exhausted + None => { + return Poll::Ready(None); + } + Some(Err(e)) => { + return Poll::Ready(Some(Err(e))); + } + // translate data + Some(Ok(data)) => match data.payload { + DecodedPayload::Schema(_) if self.got_schema => { + return Poll::Ready(Some(Err(FlightError::protocol( + "Unexpectedly saw multiple Schema messages in FlightData stream", + )))); + } + DecodedPayload::Schema(_) => { + self.got_schema = true; + // Need next message, poll inner again + } + DecodedPayload::RecordBatch(batch) => { + return Poll::Ready(Some(Ok(batch))); + } + DecodedPayload::None => { + // Need next message + } + }, + } + } + } +} + +/// Wrapper around a stream of [`FlightData`] that handles the details +/// of decoding low level Flight messages into [`Schema`] and +/// [`RecordBatch`]es, including details such as dictionaries. +/// +/// # Protocol Details +/// +/// The client handles flight messages as followes: +/// +/// - **None:** This message has no effect. This is useful to +/// transmit metadata without any actual payload. +/// +/// - **Schema:** The schema is (re-)set. Dictionaries are cleared and +/// the decoded schema is returned. +/// +/// - **Dictionary Batch:** A new dictionary for a given column is registered. An existing +/// dictionary for the same column will be overwritten. This +/// message is NOT visible. +/// +/// - **Record Batch:** Record batch is created based on the current +/// schema and dictionaries. This fails if no schema was transmitted +/// yet. +/// +/// All other message types (at the time of writing: e.g. tensor and +/// sparse tensor) lead to an error. +/// +/// Example usecases +/// +/// 1. Using this low level stream it is possible to receive a steam +/// of RecordBatches in FlightData that have different schemas by +/// handling multiple schema messages separately. 
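// An illustrative sketch, not part of this patch, of driving `FlightDataDecoder`
// directly; useful when a caller needs per-message `app_metadata` or wants to
// handle multiple Schema messages itself. The input is assumed to be any
// `Stream<Item = Result<FlightData>>`, e.g. a mapped tonic response stream.
async fn inspect_flight_data(
    flight_data_stream: impl futures::Stream<Item = arrow_flight::error::Result<arrow_flight::FlightData>>
        + Send
        + 'static,
) -> arrow_flight::error::Result<()> {
    use arrow_flight::decode::{DecodedPayload, FlightDataDecoder};
    use futures::StreamExt;

    let mut decoder = FlightDataDecoder::new(flight_data_stream);
    while let Some(decoded) = decoder.next().await {
        let decoded = decoded?;
        match decoded.payload {
            // Schema message: (re)sets the decoding state
            DecodedPayload::Schema(schema) => println!("schema: {schema:?}"),
            // RecordBatch decoded against the current schema and dictionaries
            DecodedPayload::RecordBatch(batch) => println!("batch with {} rows", batch.num_rows()),
            // No payload: FlightData that carried only app_metadata
            DecodedPayload::None => println!("metadata: {:?}", decoded.app_metadata()),
        }
    }
    Ok(())
}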
+pub struct FlightDataDecoder { + /// Underlying data stream + response: BoxStream<'static, Result>, + /// Decoding state + state: Option, + /// Seen the end of the inner stream? + done: bool, +} + +impl Debug for FlightDataDecoder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("FlightDataDecoder") + .field("response", &"") + .field("state", &self.state) + .field("done", &self.done) + .finish() + } +} + +impl FlightDataDecoder { + /// Create a new wrapper around the stream of [`FlightData`] + pub fn new(response: S) -> Self + where + S: Stream> + Send + 'static, + { + Self { + state: None, + response: response.boxed(), + done: false, + } + } + + /// Extracts flight data from the next message, updating decoding + /// state as necessary. + fn extract_message(&mut self, data: FlightData) -> Result> { + use arrow_ipc::MessageHeader; + let message = arrow_ipc::root_as_message(&data.data_header[..]).map_err(|e| { + FlightError::DecodeError(format!("Error decoding root message: {e}")) + })?; + + match message.header_type() { + MessageHeader::NONE => Ok(Some(DecodedFlightData::new_none(data))), + MessageHeader::Schema => { + let schema = Schema::try_from(&data).map_err(|e| { + FlightError::DecodeError(format!("Error decoding schema: {e}")) + })?; + + let schema = Arc::new(schema); + let dictionaries_by_field = HashMap::new(); + + self.state = Some(FlightStreamState { + schema: Arc::clone(&schema), + dictionaries_by_field, + }); + Ok(Some(DecodedFlightData::new_schema(data, schema))) + } + MessageHeader::DictionaryBatch => { + let state = if let Some(state) = self.state.as_mut() { + state + } else { + return Err(FlightError::protocol( + "Received DictionaryBatch prior to Schema", + )); + }; + + let buffer: arrow_buffer::Buffer = data.data_body.into(); + let dictionary_batch = + message.header_as_dictionary_batch().ok_or_else(|| { + FlightError::protocol( + "Could not get dictionary batch from DictionaryBatch message", + ) + })?; + + arrow_ipc::reader::read_dictionary( + &buffer, + dictionary_batch, + &state.schema, + &mut state.dictionaries_by_field, + &message.version(), + ) + .map_err(|e| { + FlightError::DecodeError(format!( + "Error decoding ipc dictionary: {e}" + )) + })?; + + // Updated internal state, but no decoded message + Ok(None) + } + MessageHeader::RecordBatch => { + let state = if let Some(state) = self.state.as_ref() { + state + } else { + return Err(FlightError::protocol( + "Received RecordBatch prior to Schema", + )); + }; + + let batch = flight_data_to_arrow_batch( + &data, + Arc::clone(&state.schema), + &state.dictionaries_by_field, + ) + .map_err(|e| { + FlightError::DecodeError(format!( + "Error decoding ipc RecordBatch: {e}" + )) + })?; + + Ok(Some(DecodedFlightData::new_record_batch(data, batch))) + } + other => { + let name = other.variant_name().unwrap_or("UNKNOWN"); + Err(FlightError::protocol(format!("Unexpected message: {name}"))) + } + } + } +} + +impl futures::Stream for FlightDataDecoder { + type Item = Result; + /// Returns the result of decoding the next [`FlightData`] message + /// from the server, or `None` if there are no further results + /// available. 
+ fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + if self.done { + return Poll::Ready(None); + } + loop { + let res = ready!(self.response.poll_next_unpin(cx)); + + return Poll::Ready(match res { + None => { + self.done = true; + None // inner is exhausted + } + Some(data) => Some(match data { + Err(e) => Err(e), + Ok(data) => match self.extract_message(data) { + Ok(Some(extracted)) => Ok(extracted), + Ok(None) => continue, // Need next input message + Err(e) => Err(e), + }, + }), + }); + } + } +} + +/// tracks the state needed to reconstruct [`RecordBatch`]es from a +/// streaming flight response. +#[derive(Debug)] +struct FlightStreamState { + schema: Arc, + dictionaries_by_field: HashMap, +} + +/// FlightData and the decoded payload (Schema, RecordBatch), if any +#[derive(Debug)] +pub struct DecodedFlightData { + pub inner: FlightData, + pub payload: DecodedPayload, +} + +impl DecodedFlightData { + pub fn new_none(inner: FlightData) -> Self { + Self { + inner, + payload: DecodedPayload::None, + } + } + + pub fn new_schema(inner: FlightData, schema: Arc) -> Self { + Self { + inner, + payload: DecodedPayload::Schema(schema), + } + } + + pub fn new_record_batch(inner: FlightData, batch: RecordBatch) -> Self { + Self { + inner, + payload: DecodedPayload::RecordBatch(batch), + } + } + + /// return the metadata field of the inner flight data + pub fn app_metadata(&self) -> Bytes { + self.inner.app_metadata.clone() + } +} + +/// The result of decoding [`FlightData`] +#[derive(Debug)] +pub enum DecodedPayload { + /// None (no data was sent in the corresponding FlightData) + None, + + /// A decoded Schema message + Schema(Arc), + + /// A decoded Record batch. + RecordBatch(RecordBatch), +} diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs new file mode 100644 index 000000000000..7c339b67d488 --- /dev/null +++ b/arrow-flight/src/encode.rs @@ -0,0 +1,511 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{collections::VecDeque, fmt::Debug, pin::Pin, sync::Arc, task::Poll}; + +use crate::{error::Result, FlightData, SchemaAsIpc}; +use arrow_array::{ArrayRef, RecordBatch}; +use arrow_ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use bytes::Bytes; +use futures::{ready, stream::BoxStream, Stream, StreamExt}; + +/// Creates a [`Stream`](futures::Stream) of [`FlightData`]s from a +/// `Stream` of [`Result`]<[`RecordBatch`], [`FlightError`]>. +/// +/// This can be used to implement [`FlightService::do_get`] in an +/// Arrow Flight implementation; +/// +/// # Caveats +/// 1. 
[`DictionaryArray`](arrow_array::array::DictionaryArray)s +/// are converted to their underlying types prior to transport, due to +/// . +/// +/// # Example +/// ```no_run +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, RecordBatch, UInt32Array}; +/// # async fn f() { +/// # let c1 = UInt32Array::from(vec![1, 2, 3, 4, 5, 6]); +/// # let record_batch = RecordBatch::try_from_iter(vec![ +/// # ("a", Arc::new(c1) as ArrayRef) +/// # ]) +/// # .expect("cannot create record batch"); +/// use arrow_flight::encode::FlightDataEncoderBuilder; +/// +/// // Get an input stream of Result +/// let input_stream = futures::stream::iter(vec![Ok(record_batch)]); +/// +/// // Build a stream of `Result` (e.g. to return for do_get) +/// let flight_data_stream = FlightDataEncoderBuilder::new() +/// .build(input_stream); +/// +/// // Create a tonic `Response` that can be returned from a Flight server +/// let response = tonic::Response::new(flight_data_stream); +/// # } +/// ``` +/// +/// [`FlightService::do_get`]: crate::flight_service_server::FlightService::do_get +/// [`FlightError`]: crate::error::FlightError +#[derive(Debug)] +pub struct FlightDataEncoderBuilder { + /// The maximum message size (see details on [`Self::with_max_message_size`]). + max_batch_size: usize, + /// Ipc writer options + options: IpcWriteOptions, + /// Metadata to add to the schema message + app_metadata: Bytes, +} + +/// Default target size for record batches to send. +/// +/// Note this value would normally be 4MB, but the size calculation is +/// somewhat inexact, so we set it to 2MB. +pub const GRPC_TARGET_MAX_BATCH_SIZE: usize = 2097152; + +impl Default for FlightDataEncoderBuilder { + fn default() -> Self { + Self { + max_batch_size: GRPC_TARGET_MAX_BATCH_SIZE, + options: IpcWriteOptions::default(), + app_metadata: Bytes::new(), + } + } +} + +impl FlightDataEncoderBuilder { + pub fn new() -> Self { + Self::default() + } + + /// Set the (approximate) maximum encoded [`RecordBatch`] size to + /// limit the gRPC message size. Defaults to 2MB. + /// + /// The encoder splits up [`RecordBatch`]s (preserving order) to + /// limit individual messages to approximately this size. The size + /// is approximate because there additional encoding overhead on + /// top of the underlying data itself. + /// + pub fn with_max_message_size(mut self, max_batch_size: usize) -> Self { + self.max_batch_size = max_batch_size; + self + } + + /// Specify application specific metadata included in the + /// [`FlightData::app_metadata`] field of the the first Schema + /// message + pub fn with_metadata(mut self, app_metadata: Bytes) -> Self { + self.app_metadata = app_metadata; + self + } + + /// Set the [`IpcWriteOptions`] used to encode the [`RecordBatch`]es for transport. + pub fn with_options(mut self, options: IpcWriteOptions) -> Self { + self.options = options; + self + } + + /// Return a [`Stream`](futures::Stream) of [`FlightData`], + /// consuming self. More details on [`FlightDataEncoder`] + pub fn build(self, input: S) -> FlightDataEncoder + where + S: Stream> + Send + 'static, + { + let Self { + max_batch_size, + options, + app_metadata, + } = self; + + FlightDataEncoder::new(input.boxed(), max_batch_size, options, app_metadata) + } +} + +/// Stream that encodes a stream of record batches to flight data. +/// +/// See [`FlightDataEncoderBuilder`] for details and example. 
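// An illustrative sketch, not part of this patch, showing the builder options
// beyond the defaults: a smaller per-message size target and app_metadata that
// is attached to the initial Schema message. Batch contents and the metadata
// bytes are placeholders.
fn encode_with_options(
    batches: Vec<arrow_array::RecordBatch>,
) -> impl futures::Stream<Item = arrow_flight::error::Result<arrow_flight::FlightData>> {
    use arrow_flight::encode::FlightDataEncoderBuilder;
    use futures::StreamExt;

    let input = futures::stream::iter(batches).map(Ok);
    FlightDataEncoderBuilder::new()
        // split large RecordBatches so each encoded message stays near 1MB
        .with_max_message_size(1024 * 1024)
        // surfaced in FlightData::app_metadata of the first (Schema) message
        .with_metadata(bytes::Bytes::from_static(b"request-id: 42"))
        .build(input)
}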
+pub struct FlightDataEncoder { + /// Input stream + inner: BoxStream<'static, Result>, + /// schema, set after the first batch + schema: Option, + /// Max size of batches to encode + max_batch_size: usize, + /// do the encoding / tracking of dictionaries + encoder: FlightIpcEncoder, + /// optional metadata to add to schema FlightData + app_metadata: Option, + /// data queued up to send but not yet sent + queue: VecDeque, + /// Is this strema done (inner is empty or errored) + done: bool, +} + +impl FlightDataEncoder { + fn new( + inner: BoxStream<'static, Result>, + max_batch_size: usize, + options: IpcWriteOptions, + app_metadata: Bytes, + ) -> Self { + Self { + inner, + schema: None, + max_batch_size, + encoder: FlightIpcEncoder::new(options), + app_metadata: Some(app_metadata), + queue: VecDeque::new(), + done: false, + } + } + + /// Place the `FlightData` in the queue to send + fn queue_message(&mut self, data: FlightData) { + self.queue.push_back(data); + } + + /// Place the `FlightData` in the queue to send + fn queue_messages(&mut self, datas: impl IntoIterator) { + for data in datas { + self.queue_message(data) + } + } + + /// Encodes batch into one or more `FlightData` messages in self.queue + fn encode_batch(&mut self, batch: RecordBatch) -> Result<()> { + let schema = match &self.schema { + Some(schema) => schema.clone(), + None => { + let batch_schema = batch.schema(); + // The first message is the schema message, and all + // batches have the same schema + let schema = Arc::new(prepare_schema_for_flight(&batch_schema)); + let mut schema_flight_data = self.encoder.encode_schema(&schema); + + // attach any metadata requested + if let Some(app_metadata) = self.app_metadata.take() { + schema_flight_data.app_metadata = app_metadata; + } + self.queue_message(schema_flight_data); + // remember schema + self.schema = Some(schema.clone()); + schema + } + }; + + // encode the batch + let batch = prepare_batch_for_flight(&batch, schema)?; + + for batch in split_batch_for_grpc_response(batch, self.max_batch_size) { + let (flight_dictionaries, flight_batch) = + self.encoder.encode_batch(&batch)?; + + self.queue_messages(flight_dictionaries); + self.queue_message(flight_batch); + } + + Ok(()) + } +} + +impl Stream for FlightDataEncoder { + type Item = Result; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + loop { + if self.done && self.queue.is_empty() { + return Poll::Ready(None); + } + + // Any messages queued to send? 
+ if let Some(data) = self.queue.pop_front() { + return Poll::Ready(Some(Ok(data))); + } + + // Get next batch + let batch = ready!(self.inner.poll_next_unpin(cx)); + + match batch { + None => { + // inner is done + self.done = true; + // queue must also be empty so we are done + assert!(self.queue.is_empty()); + return Poll::Ready(None); + } + Some(Err(e)) => { + // error from inner + self.done = true; + self.queue.clear(); + return Poll::Ready(Some(Err(e))); + } + Some(Ok(batch)) => { + // had data, encode into the queue + if let Err(e) = self.encode_batch(batch) { + self.done = true; + self.queue.clear(); + return Poll::Ready(Some(Err(e))); + } + } + } + } + } +} + +/// Prepare an arrow Schema for transport over the Arrow Flight protocol +/// +/// Convert dictionary types to underlying types +/// +/// See hydrate_dictionary for more information +fn prepare_schema_for_flight(schema: &Schema) -> Schema { + let fields = schema + .fields() + .iter() + .map(|field| match field.data_type() { + DataType::Dictionary(_, value_type) => Field::new( + field.name(), + value_type.as_ref().clone(), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()), + _ => field.clone(), + }) + .collect(); + + Schema::new(fields) +} + +/// Split [`RecordBatch`] so it hopefully fits into a gRPC response. +/// +/// Data is zero-copy sliced into batches. +/// +/// Note: this method does not take into account already sliced +/// arrays: +fn split_batch_for_grpc_response( + batch: RecordBatch, + max_batch_size: usize, +) -> Vec { + let size = batch + .columns() + .iter() + .map(|col| col.get_buffer_memory_size()) + .sum::(); + + let n_batches = + (size / max_batch_size + usize::from(size % max_batch_size != 0)).max(1); + let rows_per_batch = (batch.num_rows() / n_batches).max(1); + let mut out = Vec::with_capacity(n_batches + 1); + + let mut offset = 0; + while offset < batch.num_rows() { + let length = (rows_per_batch).min(batch.num_rows() - offset); + out.push(batch.slice(offset, length)); + + offset += length; + } + + out +} + +/// The data needed to encode a stream of flight data, holding on to +/// shared Dictionaries. +/// +/// TODO: at allow dictionaries to be flushed / avoid building them +/// +/// TODO limit on the number of dictionaries??? +struct FlightIpcEncoder { + options: IpcWriteOptions, + data_gen: IpcDataGenerator, + dictionary_tracker: DictionaryTracker, +} + +impl FlightIpcEncoder { + fn new(options: IpcWriteOptions) -> Self { + let error_on_replacement = true; + Self { + options, + data_gen: IpcDataGenerator::default(), + dictionary_tracker: DictionaryTracker::new(error_on_replacement), + } + } + + /// Encode a schema as a FlightData + fn encode_schema(&self, schema: &Schema) -> FlightData { + SchemaAsIpc::new(schema, &self.options).into() + } + + /// Convert a `RecordBatch` to a Vec of `FlightData` representing + /// dictionaries and a `FlightData` representing the batch + fn encode_batch( + &mut self, + batch: &RecordBatch, + ) -> Result<(Vec, FlightData)> { + let (encoded_dictionaries, encoded_batch) = self.data_gen.encoded_batch( + batch, + &mut self.dictionary_tracker, + &self.options, + )?; + + let flight_dictionaries = + encoded_dictionaries.into_iter().map(Into::into).collect(); + let flight_batch = encoded_batch.into(); + + Ok((flight_dictionaries, flight_batch)) + } +} + +/// Prepares a RecordBatch for transport over the Arrow Flight protocol +/// +/// This means: +/// +/// 1. Hydrates any dictionaries to its underlying type. 
See +/// hydrate_dictionary for more information. +/// +fn prepare_batch_for_flight( + batch: &RecordBatch, + schema: SchemaRef, +) -> Result { + let columns = batch + .columns() + .iter() + .map(hydrate_dictionary) + .collect::>>()?; + + Ok(RecordBatch::try_new(schema, columns)?) +} + +/// Hydrates a dictionary to its underlying type +/// +/// An IPC response, streaming or otherwise, defines its schema up front +/// which defines the mapping from dictionary IDs. It then sends these +/// dictionaries over the wire. +/// +/// This requires identifying the different dictionaries in use, assigning +/// them IDs, and sending new dictionaries, delta or otherwise, when needed +/// +/// See also: +/// * +/// +/// For now we just hydrate the dictionaries to their underlying type +fn hydrate_dictionary(array: &ArrayRef) -> Result { + let arr = if let DataType::Dictionary(_, value) = array.data_type() { + arrow_cast::cast(array, value)? + } else { + Arc::clone(array) + }; + Ok(arr) +} + +#[cfg(test)] +mod tests { + use arrow::{ + array::{UInt32Array, UInt8Array}, + compute::concat_batches, + }; + + use super::*; + + #[test] + /// ensure only the batch's used data (not the allocated data) is sent + /// + fn test_encode_flight_data() { + let options = arrow::ipc::writer::IpcWriteOptions::default(); + let c1 = UInt32Array::from(vec![1, 2, 3, 4, 5, 6]); + + let batch = RecordBatch::try_from_iter(vec![("a", Arc::new(c1) as ArrayRef)]) + .expect("cannot create record batch"); + let schema = batch.schema(); + + let (_, baseline_flight_batch) = make_flight_data(&batch, &options); + + let big_batch = batch.slice(0, batch.num_rows() - 1); + let optimized_big_batch = + prepare_batch_for_flight(&big_batch, Arc::clone(&schema)) + .expect("failed to optimize"); + let (_, optimized_big_flight_batch) = + make_flight_data(&optimized_big_batch, &options); + + assert_eq!( + baseline_flight_batch.data_body.len(), + optimized_big_flight_batch.data_body.len() + ); + + let small_batch = batch.slice(0, 1); + let optimized_small_batch = + prepare_batch_for_flight(&small_batch, Arc::clone(&schema)) + .expect("failed to optimize"); + let (_, optimized_small_flight_batch) = + make_flight_data(&optimized_small_batch, &options); + + assert!( + baseline_flight_batch.data_body.len() + > optimized_small_flight_batch.data_body.len() + ); + } + + pub fn make_flight_data( + batch: &RecordBatch, + options: &IpcWriteOptions, + ) -> (Vec, FlightData) { + let data_gen = IpcDataGenerator::default(); + let mut dictionary_tracker = DictionaryTracker::new(false); + + let (encoded_dictionaries, encoded_batch) = data_gen + .encoded_batch(batch, &mut dictionary_tracker, options) + .expect("DictionaryTracker configured above to not error on replacement"); + + let flight_dictionaries = + encoded_dictionaries.into_iter().map(Into::into).collect(); + let flight_batch = encoded_batch.into(); + + (flight_dictionaries, flight_batch) + } + + #[test] + fn test_split_batch_for_grpc_response() { + let max_batch_size = 1024; + + // no split + let c = UInt32Array::from(vec![1, 2, 3, 4, 5, 6]); + let batch = RecordBatch::try_from_iter(vec![("a", Arc::new(c) as ArrayRef)]) + .expect("cannot create record batch"); + let split = split_batch_for_grpc_response(batch.clone(), max_batch_size); + assert_eq!(split.len(), 1); + assert_eq!(batch, split[0]); + + // split once + let n_rows = max_batch_size + 1; + assert!(n_rows % 2 == 1, "should be an odd number"); + let c = + UInt8Array::from((0..n_rows).map(|i| (i % 256) as u8).collect::>()); + let batch = 
RecordBatch::try_from_iter(vec![("a", Arc::new(c) as ArrayRef)]) + .expect("cannot create record batch"); + let split = split_batch_for_grpc_response(batch.clone(), max_batch_size); + assert_eq!(split.len(), 3); + assert_eq!( + split.iter().map(|batch| batch.num_rows()).sum::(), + n_rows + ); + assert_eq!(concat_batches(&batch.schema(), &split).unwrap(), batch); + } + + // test sending record batches + // test sending record batches with multiple different dictionaries +} diff --git a/arrow-flight/src/error.rs b/arrow-flight/src/error.rs index fbb9efa44c24..11e0ae5c9fae 100644 --- a/arrow-flight/src/error.rs +++ b/arrow-flight/src/error.rs @@ -15,9 +15,13 @@ // specific language governing permissions and limitations // under the License. +use arrow_schema::ArrowError; + /// Errors for the Apache Arrow Flight crate #[derive(Debug)] pub enum FlightError { + /// Underlying arrow error + Arrow(ArrowError), /// Returned when functionality is not yet available. NotYetImplemented(String), /// Error from the underlying tonic library @@ -56,4 +60,25 @@ impl From for FlightError { } } +impl From for FlightError { + fn from(value: ArrowError) -> Self { + Self::Arrow(value) + } +} + +// default conversion from FlightError to tonic treats everything +// other than `Status` as an internal error +impl From for tonic::Status { + fn from(value: FlightError) -> Self { + match value { + FlightError::Arrow(e) => tonic::Status::internal(e.to_string()), + FlightError::NotYetImplemented(e) => tonic::Status::internal(e), + FlightError::Tonic(status) => status, + FlightError::ProtocolError(e) => tonic::Status::internal(e), + FlightError::DecodeError(e) => tonic::Status::internal(e), + FlightError::ExternalError(e) => tonic::Status::internal(e.to_string()), + } + } +} + pub type Result = std::result::Result; diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index f30cb54844da..c2da58eb5bb7 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -71,10 +71,18 @@ pub mod flight_service_server { pub use gen::flight_service_server::FlightServiceServer; } -/// Mid Level [`FlightClient`] for +/// Mid Level [`FlightClient`] pub mod client; pub use client::FlightClient; +/// Decoder to create [`RecordBatch`](arrow_array::RecordBatch) streams from [`FlightData`] streams. +/// See [`FlightRecordBatchStream`](decode::FlightRecordBatchStream). +pub mod decode; + +/// Encoder to create [`FlightData`] streams from [`RecordBatch`](arrow_array::RecordBatch) streams. +/// See [`FlightDataEncoderBuilder`](encode::FlightDataEncoderBuilder). 
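// An illustrative sketch, not part of this patch: with the new
// `impl From<FlightError> for tonic::Status`, server-side code can propagate
// `FlightError`s and convert them at the gRPC boundary, where non-`Status`
// variants are reported as internal errors.
fn into_grpc_status(err: arrow_flight::error::FlightError) -> tonic::Status {
    // `FlightError::Tonic(status)` passes through unchanged; Arrow,
    // ProtocolError, DecodeError and ExternalError become `Status::internal`
    tonic::Status::from(err)
}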
+pub mod encode; + /// Common error types pub mod error; diff --git a/arrow-flight/tests/client.rs b/arrow-flight/tests/client.rs index 5bc1062f046d..c471294d7dc4 100644 --- a/arrow-flight/tests/client.rs +++ b/arrow-flight/tests/client.rs @@ -20,20 +20,21 @@ mod common { pub mod server; } +use arrow_array::{RecordBatch, UInt64Array}; use arrow_flight::{ error::FlightError, FlightClient, FlightDescriptor, FlightInfo, HandshakeRequest, - HandshakeResponse, + HandshakeResponse, Ticket, }; use bytes::Bytes; use common::server::TestFlightServer; -use futures::Future; +use futures::{Future, TryStreamExt}; use tokio::{net::TcpListener, task::JoinHandle}; use tonic::{ transport::{Channel, Uri}, Status, }; -use std::{net::SocketAddr, time::Duration}; +use std::{net::SocketAddr, sync::Arc, time::Duration}; const DEFAULT_TIMEOUT_SECONDS: u64 = 30; @@ -173,7 +174,90 @@ async fn test_get_flight_info_metadata() { // TODO more negative tests (like if there are endpoints defined, etc) -// TODO test for do_get +#[tokio::test] +async fn test_do_get() { + do_test(|test_server, mut client| async move { + let ticket = Ticket { + ticket: Bytes::from("my awesome flight ticket"), + }; + + let batch = RecordBatch::try_from_iter(vec![( + "col", + Arc::new(UInt64Array::from_iter([1, 2, 3, 4])) as _, + )]) + .unwrap(); + + let response = vec![Ok(batch.clone())]; + test_server.set_do_get_response(response); + let response_stream = client + .do_get(ticket.clone()) + .await + .expect("error making request"); + + let expected_response = vec![batch]; + let response: Vec<_> = response_stream + .try_collect() + .await + .expect("Error streaming data"); + + assert_eq!(response, expected_response); + assert_eq!(test_server.take_do_get_request(), Some(ticket)); + }) + .await; +} + +#[tokio::test] +async fn test_do_get_error() { + do_test(|test_server, mut client| async move { + client.add_header("foo", "bar").unwrap(); + let ticket = Ticket { + ticket: Bytes::from("my awesome flight ticket"), + }; + + let response = client.do_get(ticket.clone()).await.unwrap_err(); + + let e = Status::internal("No do_get response configured"); + expect_status(response, e); + // server still got the request + assert_eq!(test_server.take_do_get_request(), Some(ticket)); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_do_get_error_in_record_batch_stream() { + do_test(|test_server, mut client| async move { + let ticket = Ticket { + ticket: Bytes::from("my awesome flight ticket"), + }; + + let batch = RecordBatch::try_from_iter(vec![( + "col", + Arc::new(UInt64Array::from_iter([1, 2, 3, 4])) as _, + )]) + .unwrap(); + + let e = Status::data_loss("she's dead jim"); + + let expected_response = vec![Ok(batch), Err(FlightError::Tonic(e.clone()))]; + + test_server.set_do_get_response(expected_response); + + let response_stream = client + .do_get(ticket.clone()) + .await + .expect("error making request"); + + let response: Result, FlightError> = response_stream.try_collect().await; + + let response = response.unwrap_err(); + expect_status(response, e); + // server still got the request + assert_eq!(test_server.take_do_get_request(), Some(ticket)); + }) + .await; +} /// Runs the future returned by the function, passing it a test server and client async fn do_test(f: F) diff --git a/arrow-flight/tests/common/server.rs b/arrow-flight/tests/common/server.rs index f1cb140b68c7..45f81b189e8d 100644 --- a/arrow-flight/tests/common/server.rs +++ b/arrow-flight/tests/common/server.rs @@ -17,10 +17,13 @@ use 
std::sync::{Arc, Mutex}; -use futures::stream::BoxStream; +use arrow_array::RecordBatch; +use futures::{stream::BoxStream, TryStreamExt}; use tonic::{metadata::MetadataMap, Request, Response, Status, Streaming}; use arrow_flight::{ + encode::FlightDataEncoderBuilder, + error::FlightError, flight_service_server::{FlightService, FlightServiceServer}, Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, @@ -80,6 +83,21 @@ impl TestFlightServer { .take() } + /// Specify the response returned from the next call to `do_get` + pub fn set_do_get_response(&self, response: Vec>) { + let mut state = self.state.lock().expect("mutex not poisoned"); + state.do_get_response.replace(response); + } + + /// Take and return last do_get request send to the server, + pub fn take_do_get_request(&self) -> Option { + self.state + .lock() + .expect("mutex not poisoned") + .do_get_request + .take() + } + /// Returns the last metadata from a request received by the server pub fn take_last_request_metadata(&self) -> Option { self.state @@ -97,7 +115,7 @@ impl TestFlightServer { } } -/// mutable state for the TestFlightSwrver +/// mutable state for the TestFlightServer, captures requests and provides responses #[derive(Debug, Default)] struct State { /// The last handshake request that was received @@ -108,6 +126,10 @@ struct State { pub get_flight_info_request: Option, /// the next response to return from `get_flight_info` pub get_flight_info_response: Option>, + /// The last do_get request received + pub do_get_request: Option, + /// The next response returned from `do_get` + pub do_get_response: Option>>, /// The last request headers received pub last_request_metadata: Option, } @@ -177,9 +199,25 @@ impl FlightService for TestFlightServer { async fn do_get( &self, - _request: Request, + request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Implement do_get")) + self.save_metadata(&request); + let mut state = self.state.lock().expect("mutex not poisoned"); + + state.do_get_request = Some(request.into_inner()); + + let batches: Vec<_> = state + .do_get_response + .take() + .ok_or_else(|| Status::internal("No do_get response configured"))?; + + let batch_stream = futures::stream::iter(batches); + + let stream = FlightDataEncoderBuilder::new() + .build(batch_stream) + .map_err(|e| e.into()); + + Ok(Response::new(Box::pin(stream) as _)) } async fn do_put( diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs new file mode 100644 index 000000000000..45b8c0bf5ac9 --- /dev/null +++ b/arrow-flight/tests/encode_decode.rs @@ -0,0 +1,453 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Tests for round trip encoding / decoding + +use std::sync::Arc; + +use arrow::{compute::concat_batches, datatypes::Int32Type}; +use arrow_array::{ArrayRef, DictionaryArray, Float64Array, RecordBatch, UInt8Array}; +use arrow_flight::{ + decode::{DecodedPayload, FlightDataDecoder, FlightRecordBatchStream}, + encode::FlightDataEncoderBuilder, + error::FlightError, +}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use bytes::Bytes; +use futures::{StreamExt, TryStreamExt}; + +#[tokio::test] +async fn test_empty() { + roundtrip(vec![]).await; +} + +#[tokio::test] +async fn test_empty_batch() { + let batch = make_primative_batch(5); + let empty = RecordBatch::new_empty(batch.schema()); + roundtrip(vec![empty]).await; +} + +#[tokio::test] +async fn test_error() { + let input_batch_stream = + futures::stream::iter(vec![Err(FlightError::NotYetImplemented("foo".into()))]); + + let encoder = FlightDataEncoderBuilder::default(); + let encode_stream = encoder.build(input_batch_stream); + + let decode_stream = FlightRecordBatchStream::new_from_flight_data(encode_stream); + let result: Result, _> = decode_stream.try_collect().await; + + let result = result.unwrap_err(); + assert_eq!(result.to_string(), r#"NotYetImplemented("foo")"#); +} + +#[tokio::test] +async fn test_primative_one() { + roundtrip(vec![make_primative_batch(5)]).await; +} + +#[tokio::test] +async fn test_primative_many() { + roundtrip(vec![ + make_primative_batch(1), + make_primative_batch(7), + make_primative_batch(32), + ]) + .await; +} + +#[tokio::test] +async fn test_primative_empty() { + let batch = make_primative_batch(5); + let empty = RecordBatch::new_empty(batch.schema()); + + roundtrip(vec![batch, empty]).await; +} + +#[tokio::test] +async fn test_dictionary_one() { + roundtrip_dictionary(vec![make_dictionary_batch(5)]).await; +} + +#[tokio::test] +async fn test_dictionary_many() { + roundtrip_dictionary(vec![ + make_dictionary_batch(5), + make_dictionary_batch(9), + make_dictionary_batch(5), + make_dictionary_batch(5), + ]) + .await; +} + +#[tokio::test] +async fn test_app_metadata() { + let input_batch_stream = futures::stream::iter(vec![Ok(make_primative_batch(78))]); + + let app_metadata = Bytes::from("My Metadata"); + let encoder = FlightDataEncoderBuilder::default().with_metadata(app_metadata.clone()); + + let encode_stream = encoder.build(input_batch_stream); + + // use lower level stream to get access to app metadata + let decode_stream = + FlightRecordBatchStream::new_from_flight_data(encode_stream).into_inner(); + + let mut messages: Vec<_> = decode_stream.try_collect().await.expect("encode fails"); + + println!("{messages:#?}"); + + // expect that the app metadata made it through on the schema message + assert_eq!(messages.len(), 2); + let message2 = messages.pop().unwrap(); + let message1 = messages.pop().unwrap(); + + assert_eq!(message1.app_metadata(), app_metadata); + assert!(matches!(message1.payload, DecodedPayload::Schema(_))); + + // but not on the data + assert_eq!(message2.app_metadata(), Bytes::new()); + assert!(matches!(message2.payload, DecodedPayload::RecordBatch(_))); +} + +#[tokio::test] +async fn test_max_message_size() { + let input_batch_stream = futures::stream::iter(vec![Ok(make_primative_batch(5))]); + + // 5 input rows, with a very small limit should result in 5 batch messages + let encoder = FlightDataEncoderBuilder::default().with_max_message_size(1); + + let encode_stream = encoder.build(input_batch_stream); + + // use lower level stream to get access to app metadata + let 
decode_stream = + FlightRecordBatchStream::new_from_flight_data(encode_stream).into_inner(); + + let messages: Vec<_> = decode_stream.try_collect().await.expect("encode fails"); + + println!("{messages:#?}"); + + assert_eq!(messages.len(), 6); + assert!(matches!(messages[0].payload, DecodedPayload::Schema(_))); + for message in messages.iter().skip(1) { + assert!(matches!(message.payload, DecodedPayload::RecordBatch(_))); + } +} + +#[tokio::test] +async fn test_max_message_size_fuzz() { + // send through batches of varying sizes with various max + // batch sizes and ensure the data gets through ok + let input = vec![ + make_primative_batch(123), + make_primative_batch(17), + make_primative_batch(201), + make_primative_batch(2), + make_primative_batch(1), + make_primative_batch(11), + make_primative_batch(127), + ]; + + for max_message_size in [10, 1024, 2048, 6400, 3211212] { + let encoder = + FlightDataEncoderBuilder::default().with_max_message_size(max_message_size); + + let input_batch_stream = futures::stream::iter(input.clone()).map(Ok); + + let encode_stream = encoder.build(input_batch_stream); + + let decode_stream = FlightRecordBatchStream::new_from_flight_data(encode_stream); + let output: Vec<_> = decode_stream.try_collect().await.expect("encode / decode"); + + let input_batch = concat_batches(&input[0].schema(), &input).unwrap(); + let output_batch = concat_batches(&output[0].schema(), &output).unwrap(); + assert_eq!(input_batch, output_batch); + } +} + +#[tokio::test] +async fn test_mismatched_record_batch_schema() { + // send 2 batches with different schemas + let input_batch_stream = futures::stream::iter(vec![ + Ok(make_primative_batch(5)), + Ok(make_dictionary_batch(3)), + ]); + + let encoder = FlightDataEncoderBuilder::default(); + let encode_stream = encoder.build(input_batch_stream); + + let result: Result, FlightError> = encode_stream.try_collect().await; + let err = result.unwrap_err(); + assert_eq!( + err.to_string(), + "Arrow(InvalidArgumentError(\"number of columns(1) must match number of fields(2) in schema\"))" + ); +} + +#[tokio::test] +async fn test_chained_streams_batch_decoder() { + let batch1 = make_primative_batch(5); + let batch2 = make_dictionary_batch(3); + + // Model sending two flight streams back to back, with different schemas + let encode_stream1 = FlightDataEncoderBuilder::default() + .build(futures::stream::iter(vec![Ok(batch1.clone())])); + let encode_stream2 = FlightDataEncoderBuilder::default() + .build(futures::stream::iter(vec![Ok(batch2.clone())])); + + // append the two streams (so they will have two different schema messages) + let encode_stream = encode_stream1.chain(encode_stream2); + + // FlightRecordBatchStream errors if the schema changes + let decode_stream = FlightRecordBatchStream::new_from_flight_data(encode_stream); + let result: Result, FlightError> = decode_stream.try_collect().await; + + let err = result.unwrap_err(); + assert_eq!( + err.to_string(), + "ProtocolError(\"Unexpectedly saw multiple Schema messages in FlightData stream\")" + ); +} + +#[tokio::test] +async fn test_chained_streams_data_decoder() { + let batch1 = make_primative_batch(5); + let batch2 = make_dictionary_batch(3); + + // Model sending two flight streams back to back, with different schemas + let encode_stream1 = FlightDataEncoderBuilder::default() + .build(futures::stream::iter(vec![Ok(batch1.clone())])); + let encode_stream2 = FlightDataEncoderBuilder::default() + .build(futures::stream::iter(vec![Ok(batch2.clone())])); + + // append the two streams (so 
they will have two different schema messages) + let encode_stream = encode_stream1.chain(encode_stream2); + + // lower level decode stream can handle multiple schema messages + let decode_stream = FlightDataDecoder::new(encode_stream); + + let decoded_data: Vec<_> = + decode_stream.try_collect().await.expect("encode / decode"); + + println!("decoded data: {decoded_data:#?}"); + + // expect two schema messages with the data + assert_eq!(decoded_data.len(), 4); + assert!(matches!(decoded_data[0].payload, DecodedPayload::Schema(_))); + assert!(matches!( + decoded_data[1].payload, + DecodedPayload::RecordBatch(_) + )); + assert!(matches!(decoded_data[2].payload, DecodedPayload::Schema(_))); + assert!(matches!( + decoded_data[3].payload, + DecodedPayload::RecordBatch(_) + )); +} + +#[tokio::test] +async fn test_mismatched_schema_message() { + // Model sending schema that is mismatched with the data + // and expect an error + async fn do_test(batch1: RecordBatch, batch2: RecordBatch, expected: &str) { + let encode_stream1 = FlightDataEncoderBuilder::default() + .build(futures::stream::iter(vec![Ok(batch1.clone())])) + // take only schema message from first stream + .take(1); + let encode_stream2 = FlightDataEncoderBuilder::default() + .build(futures::stream::iter(vec![Ok(batch2.clone())])) + // take only data message from second + .skip(1); + + // append the two streams + let encode_stream = encode_stream1.chain(encode_stream2); + + // FlightRecordBatchStream errors if the schema changes + let decode_stream = FlightRecordBatchStream::new_from_flight_data(encode_stream); + let result: Result, FlightError> = decode_stream.try_collect().await; + + let err = result.unwrap_err().to_string(); + assert!( + err.contains(expected), + "could not find '{expected}' in '{err}'" + ); + } + + // primitive batch first (has more columns) + do_test( + make_primative_batch(5), + make_dictionary_batch(3), + "Error decoding ipc RecordBatch: Io error: Invalid data for schema", + ) + .await; + + // dictioanry batch first + do_test( + make_dictionary_batch(3), + make_primative_batch(5), + "Error decoding ipc RecordBatch: Invalid argument error", + ) + .await; +} + +/// Make a primtive batch for testing +/// +/// Example: +/// i: 0, 1, None, 3, 4 +/// f: 5.0, 4.0, None, 2.0, 1.0 +fn make_primative_batch(num_rows: usize) -> RecordBatch { + let i: UInt8Array = (0..num_rows) + .map(|i| { + if i == num_rows / 2 { + None + } else { + Some(i.try_into().unwrap()) + } + }) + .collect(); + + let f: Float64Array = (0..num_rows) + .map(|i| { + if i == num_rows / 2 { + None + } else { + Some((num_rows - i) as f64) + } + }) + .collect(); + + RecordBatch::try_from_iter(vec![("i", Arc::new(i) as ArrayRef), ("f", Arc::new(f))]) + .unwrap() +} + +/// Make a dictionary batch for testing +/// +/// Example: +/// a: value0, value1, value2, None, value1, value2 +fn make_dictionary_batch(num_rows: usize) -> RecordBatch { + let values: Vec<_> = (0..num_rows) + .map(|i| { + if i == num_rows / 2 { + None + } else { + // repeat some values for low cardinality + let v = i / 3; + Some(format!("value{v}")) + } + }) + .collect(); + + let a: DictionaryArray = values + .iter() + .map(|s| s.as_ref().map(|s| s.as_str())) + .collect(); + + RecordBatch::try_from_iter(vec![("a", Arc::new(a) as ArrayRef)]).unwrap() +} + +/// Encodes input as a FlightData stream, and then decodes it using +/// FlightRecordBatchStream and valides the decoded record batches +/// match the input. 
+async fn roundtrip(input: Vec) { + let expected_output = input.clone(); + roundtrip_with_encoder(FlightDataEncoderBuilder::default(), input, expected_output) + .await +} + +/// Encodes input as a FlightData stream, and then decodes it using +/// FlightRecordBatchStream and valides the decoded record batches +/// match the expected input. +/// +/// When is resolved, +/// it should be possible to use `roundtrip` +async fn roundtrip_dictionary(input: Vec) { + let schema = Arc::new(prepare_schema_for_flight(&input[0].schema())); + let expected_output: Vec<_> = input + .iter() + .map(|batch| prepare_batch_for_flight(batch, schema.clone()).unwrap()) + .collect(); + roundtrip_with_encoder(FlightDataEncoderBuilder::default(), input, expected_output) + .await +} + +async fn roundtrip_with_encoder( + encoder: FlightDataEncoderBuilder, + input_batches: Vec, + expected_batches: Vec, +) { + println!("Round tripping with encoder:\n{encoder:#?}"); + + let input_batch_stream = futures::stream::iter(input_batches.clone()).map(Ok); + + let encode_stream = encoder.build(input_batch_stream); + + let decode_stream = FlightRecordBatchStream::new_from_flight_data(encode_stream); + let output_batches: Vec<_> = + decode_stream.try_collect().await.expect("encode / decode"); + + // remove any empty batches from input as they are not transmitted + let expected_batches: Vec<_> = expected_batches + .into_iter() + .filter(|b| b.num_rows() > 0) + .collect(); + + assert_eq!(expected_batches, output_batches); +} + +/// Workaround for https://github.com/apache/arrow-rs/issues/1206 +fn prepare_schema_for_flight(schema: &Schema) -> Schema { + let fields = schema + .fields() + .iter() + .map(|field| match field.data_type() { + DataType::Dictionary(_, value_type) => Field::new( + field.name(), + value_type.as_ref().clone(), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()), + _ => field.clone(), + }) + .collect(); + + Schema::new(fields) +} + +/// Workaround for https://github.com/apache/arrow-rs/issues/1206 +fn prepare_batch_for_flight( + batch: &RecordBatch, + schema: SchemaRef, +) -> Result { + let columns = batch + .columns() + .iter() + .map(hydrate_dictionary) + .collect::, _>>()?; + + Ok(RecordBatch::try_new(schema, columns)?) +} + +fn hydrate_dictionary(array: &ArrayRef) -> Result { + let arr = if let DataType::Dictionary(_, value) = array.data_type() { + arrow_cast::cast(array, value)? + } else { + Arc::clone(array) + }; + Ok(arr) +} diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index ef0a49be693b..231f72910174 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -298,6 +298,12 @@ fn create_array( make_array(data) } _ => { + if nodes.len() <= node_index { + return Err(ArrowError::IoError(format!( + "Invalid data for schema. {} refers to node index {} but only {} in schema", + field, node_index, nodes.len() + ))); + } let array = create_primitive_array( nodes.get(node_index), data_type, diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 106b4e4c9850..82cf2c90b852 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -37,7 +37,7 @@ use arrow_schema::*; use crate::compression::CompressionCodec; use crate::CONTINUATION_MARKER; -/// IPC write options used to control the behaviour of the writer +/// IPC write options used to control the behaviour of the [`IpcDataGenerator`] #[derive(Debug, Clone)] pub struct IpcWriteOptions { /// Write padding after memory buffers to this multiple of bytes. 
@@ -514,6 +514,9 @@ pub struct DictionaryTracker { } impl DictionaryTracker { + /// Create a new [`DictionaryTracker`]. If `error_on_replacement` + /// is true, an error will be generated if an update to an + /// existing dictionary is attempted. pub fn new(error_on_replacement: bool) -> Self { Self { written: HashMap::new(), From 808a982ba3ad928719cd4377e2e81fbca53b07c0 Mon Sep 17 00:00:00 2001 From: GeauxEric Date: Sat, 31 Dec 2022 05:06:03 -0800 Subject: [PATCH 0456/1411] Add Put and Multipart Put doc examples (#3420) --- object_store/src/lib.rs | 54 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 0c416c26b78d..425c5cdba1d1 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -61,7 +61,7 @@ //! * Concurrent Request Limit: [`LimitStore`](limit::LimitStore) //! //! -//! # Listing objects: +//! # List objects: //! //! Use the [`ObjectStore::list`] method to iterate over objects in //! remote storage or files in the local filesystem: @@ -114,7 +114,7 @@ //! ... //! ``` //! -//! # Fetching objects +//! # Fetch objects //! //! Use the [`ObjectStore::get`] method to fetch the data bytes //! from remote storage or files in the local filesystem as a stream. @@ -164,7 +164,57 @@ //! ```text //! Num zeros in data/file01.parquet is 657 //! ``` +//! # Put object +//! Use the [`ObjectStore::put`] method to save data in remote storage or local filesystem. //! +//! ``` +//! # use object_store::local::LocalFileSystem; +//! # fn get_object_store() -> LocalFileSystem { +//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # } +//! # async fn put() { +//! use object_store::ObjectStore; +//! use std::sync::Arc; +//! use bytes::Bytes; +//! use object_store::path::Path; +//! +//! let object_store: Arc = Arc::new(get_object_store()); +//! let path: Path = "data/file1".try_into().unwrap(); +//! let bytes = Bytes::from_static(b"hello"); +//! object_store +//! .put(&path, bytes) +//! .await +//! .unwrap(); +//! # } +//! ``` +//! +//! # Multipart put object +//! Use the [`ObjectStore::put_multipart`] method to save large amount of data in chunks. +//! +//! ``` +//! # use object_store::local::LocalFileSystem; +//! # fn get_object_store() -> LocalFileSystem { +//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # } +//! # async fn multi_upload() { +//! use object_store::ObjectStore; +//! use std::sync::Arc; +//! use bytes::Bytes; +//! use tokio::io::AsyncWriteExt; +//! use object_store::path::Path; +//! +//! let object_store: Arc = Arc::new(get_object_store()); +//! let path: Path = "data/large_file".try_into().unwrap(); +//! let (_id, mut writer) = object_store +//! .put_multipart(&path) +//! .await +//! .unwrap(); +//! let bytes = Bytes::from_static(b"hello"); +//! writer.write_all(&bytes).await.unwrap(); +//! writer.flush().await.unwrap(); +//! writer.shutdown().await.unwrap(); +//! # } +//! 
``` #[cfg(all( target_arch = "wasm32", From ec43d6fd5ebdd5f64b3556790d44bf96829e8ae8 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 31 Dec 2022 10:28:21 -0800 Subject: [PATCH 0457/1411] Provide `into_builder` for bytearray (#3326) * Provide into_builder for bytearray * For review * Remove slices_mut * Modify test and remove values_slice_mut --- arrow-array/src/array/byte_array.rs | 86 +++++++++++++++++++ arrow-array/src/array/string_array.rs | 25 ++++++ .../src/builder/generic_bytes_builder.rs | 40 ++++++++- arrow-array/src/builder/primitive_builder.rs | 4 +- 4 files changed, 152 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index eb528384eace..2cb04efb8e89 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -16,6 +16,7 @@ // under the License. use crate::array::{empty_offsets, print_long_array}; +use crate::builder::GenericByteBuilder; use crate::iterator::ArrayIter; use crate::raw_pointer::RawPtrBox; use crate::types::bytes::ByteArrayNativeType; @@ -139,6 +140,91 @@ impl GenericByteArray { pub fn iter(&self) -> ArrayIter<&Self> { ArrayIter::new(self) } + + /// Returns `GenericByteBuilder` of this byte array for mutating its values if the underlying + /// offset and data buffers are not shared by others. + pub fn into_builder(self) -> Result, Self> { + let len = self.len(); + let null_bit_buffer = self + .data + .null_buffer() + .map(|b| b.bit_slice(self.data.offset(), len)); + + let element_len = std::mem::size_of::(); + let offset_buffer = self.data.buffers()[0] + .slice_with_length(self.data.offset() * element_len, (len + 1) * element_len); + + let element_len = std::mem::size_of::(); + let value_len = + T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]); + let value_buffer = self.data.buffers()[1] + .slice_with_length(self.data.offset() * element_len, value_len * element_len); + + drop(self.data); + + let try_mutable_null_buffer = match null_bit_buffer { + None => Ok(None), + Some(null_buffer) => { + // Null buffer exists, tries to make it mutable + null_buffer.into_mutable().map(Some) + } + }; + + let try_mutable_buffers = match try_mutable_null_buffer { + Ok(mutable_null_buffer) => { + // Got mutable null buffer, tries to get mutable value buffer + let try_mutable_offset_buffer = offset_buffer.into_mutable(); + let try_mutable_value_buffer = value_buffer.into_mutable(); + + // try_mutable_offset_buffer.map(...).map_err(...) doesn't work as the compiler complains + // mutable_null_buffer is moved into map closure. 
+ match (try_mutable_offset_buffer, try_mutable_value_buffer) { + (Ok(mutable_offset_buffer), Ok(mutable_value_buffer)) => unsafe { + Ok(GenericByteBuilder::::new_from_buffer( + mutable_offset_buffer, + mutable_value_buffer, + mutable_null_buffer, + )) + }, + (Ok(mutable_offset_buffer), Err(value_buffer)) => Err(( + mutable_offset_buffer.into(), + value_buffer, + mutable_null_buffer.map(|b| b.into()), + )), + (Err(offset_buffer), Ok(mutable_value_buffer)) => Err(( + offset_buffer, + mutable_value_buffer.into(), + mutable_null_buffer.map(|b| b.into()), + )), + (Err(offset_buffer), Err(value_buffer)) => Err(( + offset_buffer, + value_buffer, + mutable_null_buffer.map(|b| b.into()), + )), + } + } + Err(mutable_null_buffer) => { + // Unable to get mutable null buffer + Err((offset_buffer, value_buffer, Some(mutable_null_buffer))) + } + }; + + match try_mutable_buffers { + Ok(builder) => Ok(builder), + Err((offset_buffer, value_buffer, null_bit_buffer)) => { + let builder = ArrayData::builder(T::DATA_TYPE) + .len(len) + .add_buffer(offset_buffer) + .add_buffer(value_buffer) + .null_bit_buffer(null_bit_buffer); + + let array_data = unsafe { builder.build_unchecked() }; + let array = GenericByteArray::::from(array_data); + + Err(array) + } + } + } } impl std::fmt::Debug for GenericByteArray { diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index c8db589e3c28..4a4152adc678 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -697,4 +697,29 @@ mod tests { assert_eq!(string.len(), 0); assert_eq!(string.value_offsets(), &[0]); } + + #[test] + fn test_into_builder() { + let array: StringArray = vec!["hello", "arrow"].into(); + + // Append values + let mut builder = array.into_builder().unwrap(); + + builder.append_value("rust"); + + let expected: StringArray = vec!["hello", "arrow", "rust"].into(); + let array = builder.finish(); + assert_eq!(expected, array); + } + + #[test] + fn test_into_builder_err() { + let array: StringArray = vec!["hello", "arrow"].into(); + + // Clone it, so we cannot get a mutable builder back + let shared_array = array.clone(); + + let err_return = array.into_builder().unwrap_err(); + assert_eq!(&err_return, &shared_array); + } } diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index 9f9078c708c8..195628f4712f 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -19,7 +19,7 @@ use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder}; use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType}; use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait}; -use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::ArrayDataBuilder; use std::any::Any; use std::sync::Arc; @@ -53,6 +53,34 @@ impl GenericByteBuilder { } } + /// Creates a new [`GenericByteBuilder`] from buffers. + /// + /// # Safety + /// This doesn't verify buffer contents as it assumes the buffers are from existing and + /// valid [`GenericByteArray`]. 
+ pub unsafe fn new_from_buffer( + offsets_buffer: MutableBuffer, + value_buffer: MutableBuffer, + null_buffer: Option, + ) -> Self { + let offsets_builder = BufferBuilder::::new_from_buffer(offsets_buffer); + let value_builder = BufferBuilder::::new_from_buffer(value_buffer); + + let null_buffer_builder = null_buffer + .map(|buffer| { + NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1) + }) + .unwrap_or_else(|| { + NullBufferBuilder::new_with_len(offsets_builder.len() - 1) + }); + + Self { + offsets_builder, + value_builder, + null_buffer_builder, + } + } + /// Appends a value into the builder. #[inline] pub fn append_value(&mut self, value: impl AsRef) { @@ -122,6 +150,16 @@ impl GenericByteBuilder { pub fn offsets_slice(&self) -> &[T::Offset] { self.offsets_builder.as_slice() } + + /// Returns the current null buffer as a slice + pub fn validity_slice(&self) -> Option<&[u8]> { + self.null_buffer_builder.as_slice() + } + + /// Returns the current null buffer as a mutable slice + pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> { + self.null_buffer_builder.as_slice_mut() + } } impl std::fmt::Debug for GenericByteBuilder { diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index fa1dc3ad1264..f3f3f3728db9 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -286,12 +286,12 @@ impl PrimitiveBuilder { self.values_builder.as_slice_mut() } - /// Returns the current values buffer as a slice + /// Returns the current null buffer as a slice pub fn validity_slice(&self) -> Option<&[u8]> { self.null_buffer_builder.as_slice() } - /// Returns the current values buffer as a mutable slice + /// Returns the current null buffer as a mutable slice pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> { self.null_buffer_builder.as_slice_mut() } From 2408bb274e82c785cca9b4596cd8f201ccc5d7c6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 1 Jan 2023 12:24:12 -0500 Subject: [PATCH 0458/1411] Minor: Improve docs for arrow-ipc, remove clippy ignore (#3421) * Minor: Add doc example to IpcDataGenerator * Improve docs / cleanup * Update doc links --- arrow-ipc/src/lib.rs | 7 +++---- arrow-ipc/src/writer.rs | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/arrow-ipc/src/lib.rs b/arrow-ipc/src/lib.rs index 38217957dd87..4f35ffb60a9f 100644 --- a/arrow-ipc/src/lib.rs +++ b/arrow-ipc/src/lib.rs @@ -15,10 +15,9 @@ // specific language governing permissions and limitations // under the License. -//! Support for the Arrow IPC format - -// TODO: (vcq): Protobuf codegen is not generating Debug impls. -#![allow(missing_debug_implementations)] +//! Support for the [Arrow IPC Format] +//! +//! [Arrow IPC Format]: https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc pub mod convert; pub mod reader; diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 82cf2c90b852..e4dcab40a148 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -137,6 +137,38 @@ impl Default for IpcWriteOptions { } #[derive(Debug, Default)] +/// Handles low level details of encoding [`Array`] and [`Schema`] into the +/// [Arrow IPC Format]. 
+/// +/// # Example: +/// ``` +/// # fn run() { +/// # use std::sync::Arc; +/// # use arrow_array::UInt64Array; +/// # use arrow_array::RecordBatch; +/// # use arrow_ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions}; +/// +/// // Create a record batch +/// let batch = RecordBatch::try_from_iter(vec![ +/// ("col2", Arc::new(UInt64Array::from_iter([10, 23, 33])) as _) +/// ]).unwrap(); +/// +/// // Error of dictionary ids are replaced. +/// let error_on_replacement = true; +/// let options = IpcWriteOptions::default(); +/// let mut dictionary_tracker = DictionaryTracker::new(error_on_replacement); +/// +/// // encode the batch into zero or more encoded dictionaries +/// // and the data for the actual array. +/// let data_gen = IpcDataGenerator {}; +/// let (encoded_dictionaries, encoded_message) = data_gen +/// .encoded_batch(&batch, &mut dictionary_tracker, &options) +/// .unwrap(); +/// # } +/// ``` +/// +/// [Arrow IPC Format]: https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc + pub struct IpcDataGenerator {} impl IpcDataGenerator { From b371f41f338737b4e214d74b48e18939f5643a84 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Mon, 2 Jan 2023 05:31:04 -0500 Subject: [PATCH 0459/1411] refactor: convert `*like_dyn`, `*like_utf8_scalar_dyn` and `*like_dict` functions to macros (#3411) * like_dyn refactor * name changes to better match existing names * Use declarative macro for `*like_dyn` functions * fix PR comment * empty commit * macros for `*like_utf8_scalar_dyn` and `*like_dict` functions Co-authored-by: askoa --- arrow-string/src/like.rs | 577 +++++++++------------------------------ 1 file changed, 122 insertions(+), 455 deletions(-) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index e359a80cb24b..d8afa8d4c614 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -25,61 +25,31 @@ use arrow_select::take::take; use regex::Regex; use std::collections::HashMap; -/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. -/// -/// There are two wildcards supported with the LIKE operator: -/// -/// 1. `%` - The percent sign represents zero, one, or multiple characters -/// 2. `_` - The underscore represents a single character -/// -/// For example: -/// ``` -/// use arrow_array::{StringArray, BooleanArray}; -/// use arrow_string::like::like_utf8; -/// -/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); -/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]); -/// -/// let result = like_utf8(&strings, &patterns).unwrap(); -/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); -/// ``` -pub fn like_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) -} - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / +macro_rules! dyn_function { + ($sql:tt, $fn_name:tt, $fn_utf8:tt, $fn_dict:tt) => { +#[doc = concat!("Perform SQL `left ", $sql ," right` operation on [`StringArray`] /")] /// [`LargeStringArray`], or [`DictionaryArray`] with values /// [`StringArray`]/[`LargeStringArray`]. /// /// See the documentation on [`like_utf8`] for more details. 
-pub fn like_dyn(left: &dyn Array, right: &dyn Array) -> Result { +pub fn $fn_name(left: &dyn Array, right: &dyn Array) -> Result { match (left.data_type(), right.data_type()) { (DataType::Utf8, DataType::Utf8) => { let left = as_string_array(left); let right = as_string_array(right); - like_utf8(left, right) + $fn_utf8(left, right) } (DataType::LargeUtf8, DataType::LargeUtf8) => { let left = as_largestring_array(left); let right = as_largestring_array(right); - like_utf8(left, right) + $fn_utf8(left, right) } #[cfg(feature = "dyn_cmp_dict")] (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { downcast_dictionary_array!( left => { let right = as_dictionary_array(right); - like_dict(left, right) + $fn_dict(left, right) } t => Err(ArrowError::ComputeError(format!( "Should be DictionaryArray but got: {}", t @@ -87,19 +57,78 @@ pub fn like_dyn(left: &dyn Array, right: &dyn Array) -> Result { - Err(ArrowError::ComputeError( - "like_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) + Err(ArrowError::ComputeError(format!( + "{} only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values", + stringify!($fn_name) + ))) } } } -/// Perform SQL `left LIKE right` operation on on [`DictionaryArray`] with values + } +} +dyn_function!("LIKE", like_dyn, like_utf8, like_dict); +dyn_function!("NOT LIKE", nlike_dyn, nlike_utf8, nlike_dict); +dyn_function!("ILIKE", ilike_dyn, ilike_utf8, ilike_dict); +dyn_function!("NOT ILIKE", nilike_dyn, nilike_utf8, nilike_dict); + +macro_rules! scalar_dyn_function { + ($sql:tt, $fn_name:tt, $fn_scalar:tt) => { +#[doc = concat!("Perform SQL `left ", $sql ," right` operation on [`StringArray`] /")] +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn $fn_name( + left: &dyn Array, + right: &str, +) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + $fn_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + $fn_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + let dict_comparison = $fn_name(left.values().as_ref(), right)?; + // TODO: Use take_boolean (#2967) + let array = take(&dict_comparison, left.keys(), None)?; + Ok(BooleanArray::from(array.data().clone())) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError(format!( + "{} only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values", + stringify!($fn_name) + ))) + } + } +} + } +} +scalar_dyn_function!("LIKE", like_utf8_scalar_dyn, like_scalar); +scalar_dyn_function!("NOT LIKE", nlike_utf8_scalar_dyn, nlike_scalar); +scalar_dyn_function!("ILIKE", ilike_utf8_scalar_dyn, ilike_scalar); +scalar_dyn_function!("NOT ILIKE", nilike_utf8_scalar_dyn, nilike_scalar); + +macro_rules! dict_function { + ($sql:tt, $fn_name:tt, $pat:tt, $neg:expr, $typ:tt) => { + +#[doc = concat!("Perform SQL `left ", $sql ," right` operation on on [`DictionaryArray`] with values")] /// [`StringArray`]/[`LargeStringArray`]. /// /// See the documentation on [`like_utf8`] for more details. 
#[cfg(feature = "dyn_cmp_dict")] -fn like_dict( +fn $fn_name( left: &DictionaryArray, right: &DictionaryArray, ) -> Result { @@ -108,11 +137,11 @@ fn like_dict( let left = left.downcast_dict::>().unwrap(); let right = right.downcast_dict::>().unwrap(); - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + regex_like(left, right, $neg, |re_pattern| { + Regex::new(&format!($pat, re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e + "Unable to build regex from {} pattern: {}", + $typ, e )) }) }) @@ -121,21 +150,61 @@ fn like_dict( let left = left.downcast_dict::>().unwrap(); let right = right.downcast_dict::>().unwrap(); - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + regex_like(left, right, $neg, |re_pattern| { + Regex::new(&format!($pat, re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", + "Unable to build regex from {} pattern: {}", + $typ, e )) }) }) } - _ => Err(ArrowError::ComputeError( - "like_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" - .to_string(), - )), + _ => Err(ArrowError::ComputeError(format!( + "{} only supports DictionaryArray with Utf8 or LargeUtf8 values", + stringify!($fn_name) + ))), } } + } +} + +dict_function!("LIKE", like_dict, "^{}$", false, "LIKE"); +dict_function!("NOT LIKE", nlike_dict, "^{}$", true, "LIKE"); +dict_function!("ILIKE", ilike_dict, "(?i)^{}$", false, "ILIKE"); +dict_function!("NOT ILIKE", nilike_dict, "(?i)^{}$", true, "ILIKE"); + +/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. +/// +/// There are two wildcards supported with the LIKE operator: +/// +/// 1. `%` - The percent sign represents zero, one, or multiple characters +/// 2. `_` - The underscore represents a single character +/// +/// For example: +/// ``` +/// use arrow_array::{StringArray, BooleanArray}; +/// use arrow_string::like::like_utf8; +/// +/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); +/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]); +/// +/// let result = like_utf8(&strings, &patterns).unwrap(); +/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); +/// ``` +pub fn like_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + regex_like(left, right, false, |re_pattern| { + Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + ArrowError::ComputeError(format!( + "Unable to build regex from LIKE pattern: {}", + e + )) + }) + }) +} #[inline] fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor>( @@ -194,45 +263,6 @@ fn like_scalar<'a, L: ArrayAccessor>( like_scalar_op(left, right, |x| x) } -/// Perform SQL `left LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn like_utf8_scalar_dyn( - left: &dyn Array, - right: &str, -) -> Result { - match left.data_type() { - DataType::Utf8 => { - let left = as_string_array(left); - like_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = as_largestring_array(left); - like_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - let dict_comparison = like_utf8_scalar_dyn(left.values().as_ref(), right)?; - // TODO: Use take_boolean (#2967) - let array = take(&dict_comparison, left.keys(), None)?; - Ok(BooleanArray::from(array.data().clone())) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "like_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - /// Perform SQL `left LIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. /// @@ -299,88 +329,6 @@ pub fn nlike_utf8( }) } -/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_dyn( - left: &dyn Array, - right: &dyn Array, -) -> Result { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = as_string_array(left); - let right = as_string_array(right); - nlike_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = as_largestring_array(left); - let right = as_largestring_array(right); - nlike_utf8(left, right) - } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - nlike_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "nlike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-#[cfg(feature = "dyn_cmp_dict")] -fn nlike_dict( - left: &DictionaryArray, - right: &DictionaryArray, -) -> Result { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - }) - }) - } - _ => Err(ArrowError::ComputeError( - "nlike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" - .to_string(), - )), - } -} - #[inline] fn nlike_scalar<'a, L: ArrayAccessor>( left: L, @@ -389,45 +337,6 @@ fn nlike_scalar<'a, L: ArrayAccessor>( like_scalar_op(left, right, |x| !x) } -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_utf8_scalar_dyn( - left: &dyn Array, - right: &str, -) -> Result { - match left.data_type() { - DataType::Utf8 => { - let left = as_string_array(left); - nlike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = as_largestring_array(left); - nlike_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - let dict_comparison = nlike_utf8_scalar_dyn(left.values().as_ref(), right)?; - // TODO: Use take_boolean (#2967) - let array = take(&dict_comparison, left.keys(), None)?; - Ok(BooleanArray::from(array.data().clone())) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "nlike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - /// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. /// @@ -460,88 +369,6 @@ pub fn ilike_utf8( }) } -/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn ilike_dyn( - left: &dyn Array, - right: &dyn Array, -) -> Result { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = as_string_array(left); - let right = as_string_array(right); - ilike_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = as_largestring_array(left); - let right = as_largestring_array(right); - ilike_utf8(left, right) - } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - ilike_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "ilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`ilike_utf8`] for more details. -#[cfg(feature = "dyn_cmp_dict")] -fn ilike_dict( - left: &DictionaryArray, - right: &DictionaryArray, -) -> Result { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) - } - _ => Err(ArrowError::ComputeError( - "ilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" - .to_string(), - )), - } -} - #[inline] fn ilike_scalar_op bool>( left: &GenericStringArray, @@ -597,45 +424,6 @@ fn ilike_scalar( ilike_scalar_op(left, right, |x| x) } -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`ilike_utf8`] for more details. 
-pub fn ilike_utf8_scalar_dyn( - left: &dyn Array, - right: &str, -) -> Result { - match left.data_type() { - DataType::Utf8 => { - let left = as_string_array(left); - ilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = as_largestring_array(left); - ilike_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - let dict_comparison = ilike_utf8_scalar_dyn(left.values().as_ref(), right)?; - // TODO: Use take_boolean (#2967) - let array = take(&dict_comparison, left.keys(), None)?; - Ok(BooleanArray::from(array.data().clone())) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "ilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - /// Perform SQL `left ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. /// @@ -665,88 +453,6 @@ pub fn nilike_utf8( }) } -/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`ilike_utf8`] for more details. -pub fn nilike_dyn( - left: &dyn Array, - right: &dyn Array, -) -> Result { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = as_string_array(left); - let right = as_string_array(right); - nilike_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = as_largestring_array(left); - let right = as_largestring_array(right); - nilike_utf8(left, right) - } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - nilike_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "nilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - -/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`ilike_utf8`] for more details. 
-#[cfg(feature = "dyn_cmp_dict")] -fn nilike_dict( - left: &DictionaryArray, - right: &DictionaryArray, -) -> Result { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) - }) - }) - } - _ => Err(ArrowError::ComputeError( - "nilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values" - .to_string(), - )), - } -} - #[inline] fn nilike_scalar( left: &GenericStringArray, @@ -755,45 +461,6 @@ fn nilike_scalar( ilike_scalar_op(left, right, |x| !x) } -/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`ilike_utf8`] for more details. -pub fn nilike_utf8_scalar_dyn( - left: &dyn Array, - right: &str, -) -> Result { - match left.data_type() { - DataType::Utf8 => { - let left = as_string_array(left); - nilike_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = as_largestring_array(left); - nilike_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - let dict_comparison = nilike_utf8_scalar_dyn(left.values().as_ref(), right)?; - // TODO: Use take_boolean (#2967) - let array = take(&dict_comparison, left.keys(), None)?; - Ok(BooleanArray::from(array.data().clone())) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError( - "nilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )) - } - } -} - /// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. 
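For orientation, a usage sketch of two of the scalar functions stamped out by the `scalar_dyn_function!` macro above; the input strings and patterns are illustrative:

```rust
use arrow_array::{BooleanArray, StringArray};
use arrow_string::like::{ilike_utf8_scalar_dyn, like_utf8_scalar_dyn};

fn main() {
    let strings = StringArray::from(vec!["Arrow", "Parquet"]);

    // `%` matches zero or more characters, `_` matches exactly one.
    let starts_with_upper_a = like_utf8_scalar_dyn(&strings, "A%").unwrap();
    assert_eq!(starts_with_upper_a, BooleanArray::from(vec![true, false]));

    // The ILIKE variant performs the same match case-insensitively.
    let starts_with_arr = ilike_utf8_scalar_dyn(&strings, "arr%").unwrap();
    assert_eq!(starts_with_arr, BooleanArray::from(vec![true, false]));
}
```

Because the macro bodies mirror the hand-written functions they replace, the matching behaviour is intended to be unchanged; only the duplication is removed.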
/// From 1889e33da31218ee2c58ad874036b17b699538b9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 2 Jan 2023 14:42:19 +0000 Subject: [PATCH 0460/1411] Derive Clone for ObjectStore builders and Make URL Parsing Stricter (#3419) (#3424) * Derive Clone for ObjectStore builders (#3419) Make URL parsing more strict * Review feedback --- object_store/src/aws/mod.rs | 112 +++++++++++-------- object_store/src/azure/mod.rs | 196 ++++++++++++++++++---------------- object_store/src/gcp/mod.rs | 95 ++++++++-------- object_store/src/http/mod.rs | 21 ++-- 4 files changed, 228 insertions(+), 196 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 0fcfbaf9c5c2..786ccd20f18a 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -36,6 +36,7 @@ use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::stream::BoxStream; use futures::TryStreamExt; +use itertools::Itertools; use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; use std::ops::Range; @@ -129,6 +130,9 @@ enum Error { scheme ))] UnknownUrlScheme { scheme: String }, + + #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + UrlNotRecognised { url: String }, } impl From for super::Error { @@ -358,7 +362,7 @@ impl CloudMultiPartUploadImpl for S3MultiPartUpload { /// .with_secret_access_key(SECRET_KEY) /// .build(); /// ``` -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub struct AmazonS3Builder { access_key_id: Option, secret_access_key: Option, @@ -366,13 +370,13 @@ pub struct AmazonS3Builder { bucket_name: Option, endpoint: Option, token: Option, + url: Option, retry_config: RetryConfig, imdsv1_fallback: bool, virtual_hosted_style_request: bool, metadata_endpoint: Option, profile: Option, client_options: ClientOptions, - url_parse_error: Option, } impl AmazonS3Builder { @@ -453,9 +457,7 @@ impl AmazonS3Builder { /// - `https://s3..amazonaws.com` /// - `https://.s3..amazonaws.com` /// - /// Please note that this is a best effort implementation, and will not fail for malformed URLs, - /// but rather warn and ignore the passed url. The url also has no effect on how the - /// storage is accessed - e.g. which driver or protocol is used for reading from the location. 
+ /// Note: Settings derived from the URL will override any others set on this builder /// /// # Example /// ``` @@ -465,44 +467,39 @@ impl AmazonS3Builder { /// .with_url("s3://bucket/path") /// .build(); /// ``` - pub fn with_url(mut self, url: impl AsRef) -> Self { - let maybe_parsed = Url::parse(url.as_ref()); - match maybe_parsed { - Ok(parsed) => match parsed.scheme() { - "s3" | "s3a" => { - self.bucket_name = parsed.host_str().map(|host| host.to_owned()); - } - "https" => { - if let Some(host) = parsed.host_str() { - let parts = host.splitn(4, '.').collect::>(); - if parts.len() == 4 && parts[0] == "s3" && parts[2] == "amazonaws" - { - self.bucket_name = Some(parts[1].to_string()); - } - if parts.len() == 4 - && parts[1] == "s3" - && parts[3] == "amazonaws.com" - { - self.bucket_name = Some(parts[0].to_string()); - self.region = Some(parts[2].to_string()); - self.virtual_hosted_style_request = true; - } - } + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); + self + } + + /// Sets properties on this builder based on a URL + /// + /// This is a separate member function to allow fallible computation to + /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] + fn parse_url(&mut self, url: &str) -> Result<()> { + let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; + let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + let validate = |s: &str| match s.contains('.') { + true => Err(UrlNotRecognisedSnafu { url }.build()), + false => Ok(s.to_string()), + }; + + match parsed.scheme() { + "s3" | "s3a" => self.bucket_name = Some(validate(host)?), + "https" => match host.splitn(4, '.').collect_tuple() { + Some(("s3", bucket, "amazonaws", "com")) => { + self.bucket_name = Some(bucket.to_string()); } - other => { - self.url_parse_error = Some(Error::UnknownUrlScheme { - scheme: other.into(), - }); + Some((bucket, "s3", region, "amazonaws.com")) => { + self.bucket_name = Some(bucket.to_string()); + self.region = Some(region.to_string()); + self.virtual_hosted_style_request = true; } + _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), }, - Err(err) => { - self.url_parse_error = Some(Error::UnableToParseUrl { - source: err, - url: url.as_ref().into(), - }); - } + scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), }; - self + Ok(()) } /// Set the AWS Access Key (required) @@ -641,9 +638,9 @@ impl AmazonS3Builder { /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. 
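From the caller's side the reworked builder behaves as sketched below: deriving `Clone` allows a partially configured builder to be reused as a template, and a URL that matches none of the known patterns is now reported as an error from `build()` instead of being silently ignored. Bucket and region names here are placeholders:

```rust
use object_store::aws::AmazonS3Builder;

fn main() {
    // A partially configured builder can be cloned and reused.
    let template = AmazonS3Builder::from_env().with_region("us-east-1");
    let _store_a = template.clone().with_bucket_name("bucket-a").build();
    let _store_b = template.with_bucket_name("bucket-b").build();

    // Unrecognised URLs are rejected when the store is built.
    let result = AmazonS3Builder::new()
        .with_url("https://s3.bucket.mydomain.com")
        .build();
    assert!(result.is_err());
}
```

The Azure and GCS builders below are reworked along the same lines.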
- pub fn build(self) -> Result { - if let Some(err) = self.url_parse_error { - return Err(err.into()); + pub fn build(mut self) -> Result { + if let Some(url) = self.url.take() { + self.parse_url(&url)?; } let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; @@ -1022,15 +1019,36 @@ mod tests { #[test] fn s3_test_urls() { - let builder = AmazonS3Builder::new().with_url("s3://bucket/path"); + let mut builder = AmazonS3Builder::new(); + builder.parse_url("s3://bucket/path").unwrap(); assert_eq!(builder.bucket_name, Some("bucket".to_string())); - let builder = AmazonS3Builder::new().with_url("https://s3.bucket.amazonaws.com"); + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://s3.bucket.amazonaws.com") + .unwrap(); assert_eq!(builder.bucket_name, Some("bucket".to_string())); - let builder = - AmazonS3Builder::new().with_url("https://bucket.s3.region.amazonaws.com"); + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://bucket.s3.region.amazonaws.com") + .unwrap(); assert_eq!(builder.bucket_name, Some("bucket".to_string())); - assert_eq!(builder.region, Some("region".to_string())) + assert_eq!(builder.region, Some("region".to_string())); + assert!(builder.virtual_hosted_style_request); + + let err_cases = [ + "mailto://bucket/path", + "s3://bucket.mydomain/path", + "https://s3.bucket.mydomain.com", + "https://s3.bucket.foo.amazonaws.com", + "https://bucket.mydomain.region.amazonaws.com", + "https://bucket.s3.region.bar.amazonaws.com", + "https://bucket.foo.s3.amazonaws.com", + ]; + let mut builder = AmazonS3Builder::new(); + for case in err_cases { + builder.parse_url(case).unwrap_err(); + } } } diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 4224ae633dcd..7cf369de3b3a 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -37,7 +37,7 @@ use async_trait::async_trait; use bytes::Bytes; use chrono::{TimeZone, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; -use snafu::{ResultExt, Snafu}; +use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; use std::fmt::{Debug, Formatter}; use std::io; @@ -121,6 +121,9 @@ enum Error { scheme ))] UnknownUrlScheme { scheme: String }, + + #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + UrlNotRecognised { url: String }, } impl From for super::Error { @@ -354,7 +357,7 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { /// .with_container_name(BUCKET_NAME) /// .build(); /// ``` -#[derive(Default)] +#[derive(Default, Clone)] pub struct MicrosoftAzureBuilder { account_name: Option, access_key: Option, @@ -365,10 +368,10 @@ pub struct MicrosoftAzureBuilder { tenant_id: Option, sas_query_pairs: Option>, authority_host: Option, + url: Option, use_emulator: bool, retry_config: RetryConfig, client_options: ClientOptions, - url_parse_error: Option, } impl Debug for MicrosoftAzureBuilder { @@ -444,9 +447,7 @@ impl MicrosoftAzureBuilder { /// - `https://.dfs.core.windows.net` /// - `https://.blob.core.windows.net` /// - /// Please note that this is a best effort implementation, and will not fail for malformed URLs, - /// but rather warn and ignore the passed url. The url also has no effect on how the - /// storage is accessed - e.g. which driver or protocol is used for reading from the location. 
+ /// Note: Settings derived from the URL will override any others set on this builder /// /// # Example /// ``` @@ -456,52 +457,48 @@ impl MicrosoftAzureBuilder { /// .with_url("abfss://file_system@account.dfs.core.windows.net/") /// .build(); /// ``` - pub fn with_url(mut self, url: impl AsRef) -> Self { - let maybe_parsed = Url::parse(url.as_ref()); - match maybe_parsed { - Ok(parsed) => match parsed.scheme() { - "az" | "adl" | "azure" => { - self.container_name = parsed.host_str().map(|host| host.to_owned()); - } - "abfs" | "abfss" => { - // abfs(s) might refer to the fsspec convention abfs:/// - // or the convention for the hadoop driver abfs[s]://@.dfs.core.windows.net/ - if parsed.username().is_empty() { - self.container_name = - parsed.host_str().map(|host| host.to_owned()); - } else if let Some(host) = parsed.host_str() { - let parts = host.splitn(2, '.').collect::>(); - if parts.len() == 2 && parts[1] == "dfs.core.windows.net" { - self.container_name = Some(parsed.username().to_owned()); - self.account_name = Some(parts[0].to_string()); - } - } - } - "https" => { - if let Some(host) = parsed.host_str() { - let parts = host.splitn(2, '.').collect::>(); - if parts.len() == 2 - && (parts[1] == "dfs.core.windows.net" - || parts[1] == "blob.core.windows.net") - { - self.account_name = Some(parts[0].to_string()); - } - } + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); + self + } + + /// Sets properties on this builder based on a URL + /// + /// This is a separate member function to allow fallible computation to + /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] + fn parse_url(&mut self, url: &str) -> Result<()> { + let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; + let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + + let validate = |s: &str| match s.contains('.') { + true => Err(UrlNotRecognisedSnafu { url }.build()), + false => Ok(s.to_string()), + }; + + match parsed.scheme() { + "az" | "adl" | "azure" => self.container_name = Some(validate(host)?), + "abfs" | "abfss" => { + // abfs(s) might refer to the fsspec convention abfs:/// + // or the convention for the hadoop driver abfs[s]://@.dfs.core.windows.net/ + if parsed.username().is_empty() { + self.container_name = Some(validate(host)?); + } else if let Some(a) = host.strip_suffix(".dfs.core.windows.net") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + } else { + return Err(UrlNotRecognisedSnafu { url }.build().into()); } - other => { - self.url_parse_error = Some(Error::UnknownUrlScheme { - scheme: other.into(), - }); + } + "https" => match host.split_once('.') { + Some((a, "dfs.core.windows.net")) + | Some((a, "blob.core.windows.net")) => { + self.account_name = Some(validate(a)?); } + _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), }, - Err(err) => { - self.url_parse_error = Some(Error::UnableToParseUrl { - source: err, - url: url.as_ref().into(), - }); - } - }; - self + scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + } + Ok(()) } /// Set the Azure Account (required) @@ -595,63 +592,49 @@ impl MicrosoftAzureBuilder { /// Configure a connection to container with given name on Microsoft Azure /// Blob store. 
- pub fn build(self) -> Result { - let Self { - account_name, - access_key, - container_name, - bearer_token, - client_id, - client_secret, - tenant_id, - sas_query_pairs, - use_emulator, - retry_config, - authority_host, - mut client_options, - url_parse_error, - } = self; - - if let Some(err) = url_parse_error { - return Err(err.into()); + pub fn build(mut self) -> Result { + if let Some(url) = self.url.take() { + self.parse_url(&url)?; } - let container = container_name.ok_or(Error::MissingContainerName {})?; + let container = self.container_name.ok_or(Error::MissingContainerName {})?; - let (is_emulator, storage_url, auth, account) = if use_emulator { - let account_name = - account_name.unwrap_or_else(|| EMULATOR_ACCOUNT.to_string()); + let (is_emulator, storage_url, auth, account) = if self.use_emulator { + let account_name = self + .account_name + .unwrap_or_else(|| EMULATOR_ACCOUNT.to_string()); // Allow overriding defaults. Values taken from // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 let url = url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; - let account_key = - access_key.unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); + let account_key = self + .access_key + .unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); let credential = credential::CredentialProvider::AccessKey(account_key); - client_options = client_options.with_allow_http(true); + self.client_options = self.client_options.with_allow_http(true); (true, url, credential, account_name) } else { - let account_name = account_name.ok_or(Error::MissingAccount {})?; + let account_name = self.account_name.ok_or(Error::MissingAccount {})?; let account_url = format!("https://{}.blob.core.windows.net", &account_name); let url = Url::parse(&account_url) .context(UnableToParseUrlSnafu { url: account_url })?; - let credential = if let Some(bearer_token) = bearer_token { + let credential = if let Some(bearer_token) = self.bearer_token { Ok(credential::CredentialProvider::AccessKey(bearer_token)) - } else if let Some(access_key) = access_key { + } else if let Some(access_key) = self.access_key { Ok(credential::CredentialProvider::AccessKey(access_key)) } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = - (client_id, client_secret, tenant_id) + (self.client_id, self.client_secret, self.tenant_id) { let client_credential = credential::ClientSecretOAuthProvider::new( client_id, client_secret, tenant_id, - authority_host, + self.authority_host, ); Ok(credential::CredentialProvider::ClientSecret( client_credential, )) - } else if let Some(query_pairs) = sas_query_pairs { + } else if let Some(query_pairs) = self.sas_query_pairs { Ok(credential::CredentialProvider::SASToken(query_pairs)) } else { Err(Error::MissingCredentials {}) @@ -661,12 +644,12 @@ impl MicrosoftAzureBuilder { let config = client::AzureConfig { account, - retry_config, - service: storage_url, + is_emulator, container, + retry_config: self.retry_config, + client_options: self.client_options, + service: storage_url, credentials: auth, - is_emulator, - client_options, }; let client = Arc::new(client::AzureClient::new(config)?); @@ -804,26 +787,49 @@ mod tests { #[test] fn azure_blob_test_urls() { - let builder = MicrosoftAzureBuilder::new() - .with_url("abfss://file_system@account.dfs.core.windows.net/"); + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("abfss://file_system@account.dfs.core.windows.net/") + .unwrap(); 
assert_eq!(builder.account_name, Some("account".to_string())); assert_eq!(builder.container_name, Some("file_system".to_string())); - let builder = MicrosoftAzureBuilder::new().with_url("abfs://container/path"); + let mut builder = MicrosoftAzureBuilder::new(); + builder.parse_url("abfs://container/path").unwrap(); assert_eq!(builder.container_name, Some("container".to_string())); - let builder = MicrosoftAzureBuilder::new().with_url("az://container"); + let mut builder = MicrosoftAzureBuilder::new(); + builder.parse_url("az://container").unwrap(); assert_eq!(builder.container_name, Some("container".to_string())); - let builder = MicrosoftAzureBuilder::new().with_url("az://container/path"); + let mut builder = MicrosoftAzureBuilder::new(); + builder.parse_url("az://container/path").unwrap(); assert_eq!(builder.container_name, Some("container".to_string())); - let builder = MicrosoftAzureBuilder::new() - .with_url("https://account.dfs.core.windows.net/"); + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.dfs.core.windows.net/") + .unwrap(); assert_eq!(builder.account_name, Some("account".to_string())); - let builder = MicrosoftAzureBuilder::new() - .with_url("https://account.blob.core.windows.net/"); - assert_eq!(builder.account_name, Some("account".to_string())) + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.blob.core.windows.net/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + + let err_cases = [ + "mailto://account.blob.core.windows.net/", + "az://blob.mydomain/", + "abfs://container.foo/path", + "abfss://file_system@account.foo.dfs.core.windows.net/", + "abfss://file_system.bar@account.dfs.core.windows.net/", + "https://blob.mydomain/", + "https://blob.foo.dfs.core.windows.net/", + ]; + let mut builder = MicrosoftAzureBuilder::new(); + for case in err_cases { + builder.parse_url(case).unwrap_err(); + } } } diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index c1424d9713c1..f2638748f6ca 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -42,7 +42,7 @@ use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; use reqwest::header::RANGE; use reqwest::{header, Client, Method, Response, StatusCode}; -use snafu::{ResultExt, Snafu}; +use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; @@ -142,6 +142,9 @@ enum Error { scheme ))] UnknownUrlScheme { scheme: String }, + + #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + UrlNotRecognised { url: String }, } impl From for super::Error { @@ -784,13 +787,13 @@ fn reader_credentials_file( /// .with_bucket_name(BUCKET_NAME) /// .build(); /// ``` -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct GoogleCloudStorageBuilder { bucket_name: Option, + url: Option, service_account_path: Option, retry_config: RetryConfig, client_options: ClientOptions, - url_parse_error: Option, } impl Default for GoogleCloudStorageBuilder { @@ -800,7 +803,7 @@ impl Default for GoogleCloudStorageBuilder { service_account_path: None, retry_config: Default::default(), client_options: ClientOptions::new().with_allow_http(true), - url_parse_error: None, + url: None, } } } @@ -845,9 +848,7 @@ impl GoogleCloudStorageBuilder { /// /// - `gs:///` /// - /// Please note that this is a best effort implementation, and will not fail for malformed URLs, - /// but rather warn and ignore the passed url. 
The url also has no effect on how the - /// storage is accessed - e.g. which driver or protocol is used for reading from the location. + /// Note: Settings derived from the URL will override any others set on this builder /// /// # Example /// ``` @@ -857,29 +858,31 @@ impl GoogleCloudStorageBuilder { /// .with_url("gs://bucket/path") /// .build(); /// ``` - pub fn with_url(mut self, url: impl AsRef) -> Self { - let maybe_parsed = Url::parse(url.as_ref()); - match maybe_parsed { - Ok(parsed) => match parsed.scheme() { - "gs" => { - self.bucket_name = parsed.host_str().map(|host| host.to_owned()); - } - other => { - self.url_parse_error = Some(Error::UnknownUrlScheme { - scheme: other.into(), - }); - } - }, - Err(err) => { - self.url_parse_error = Some(Error::UnableToParseUrl { - source: err, - url: url.as_ref().into(), - }); - } - }; + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); self } + /// Sets properties on this builder based on a URL + /// + /// This is a separate member function to allow fallible computation to + /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] + fn parse_url(&mut self, url: &str) -> Result<()> { + let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; + let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + + let validate = |s: &str| match s.contains('.') { + true => Err(UrlNotRecognisedSnafu { url }.build()), + false => Ok(s.to_string()), + }; + + match parsed.scheme() { + "gs" => self.bucket_name = Some(validate(host)?), + scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + } + Ok(()) + } + /// Set the bucket name (required) pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { self.bucket_name = Some(bucket_name.into()); @@ -927,24 +930,17 @@ impl GoogleCloudStorageBuilder { /// Configure a connection to Google Cloud Storage, returning a /// new [`GoogleCloudStorage`] and consuming `self` - pub fn build(self) -> Result { - let Self { - bucket_name, - service_account_path, - retry_config, - client_options, - url_parse_error, - } = self; - - if let Some(err) = url_parse_error { - return Err(err.into()); + pub fn build(mut self) -> Result { + if let Some(url) = self.url.take() { + self.parse_url(&url)?; } - let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?; - let service_account_path = - service_account_path.ok_or(Error::MissingServiceAccountPath)?; + let bucket_name = self.bucket_name.ok_or(Error::MissingBucketName {})?; + let service_account_path = self + .service_account_path + .ok_or(Error::MissingServiceAccountPath)?; - let client = client_options.client()?; + let client = self.client_options.client()?; let credentials = reader_credentials_file(service_account_path)?; // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes @@ -977,8 +973,8 @@ impl GoogleCloudStorageBuilder { token_cache: Default::default(), bucket_name, bucket_name_encoded: encoded_bucket_name, - retry_config, - client_options, + retry_config: self.retry_config, + client_options: self.client_options, max_list_results: None, }), }) @@ -1199,7 +1195,14 @@ mod test { #[test] fn gcs_test_urls() { - let builder = GoogleCloudStorageBuilder::new().with_url("gs://bucket/path"); - assert_eq!(builder.bucket_name, Some("bucket".to_string())) + let mut builder = GoogleCloudStorageBuilder::new(); + builder.parse_url("gs://bucket/path").unwrap(); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + + let err_cases = 
["mailto://bucket/path", "gs://bucket.mydomain/path"]; + let mut builder = GoogleCloudStorageBuilder::new(); + for case in err_cases { + builder.parse_url(case).unwrap_err(); + } } } diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index 25997d8924ec..f05e70024b8c 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -55,8 +55,11 @@ enum Error { #[snafu(display("Must specify a URL"))] MissingUrl, - #[snafu(display("Invalid URL: {}", source))] - InvalidUrl { source: reqwest::Error }, + #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + UnableToParseUrl { + source: url::ParseError, + url: String, + }, #[snafu(display("Object is a directory"))] IsDirectory, @@ -210,9 +213,9 @@ impl ObjectStore for HttpStore { } /// Configure a connection to a generic HTTP server -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub struct HttpBuilder { - url: Option>, + url: Option, client_options: ClientOptions, retry_config: RetryConfig, } @@ -224,8 +227,8 @@ impl HttpBuilder { } /// Set the URL - pub fn with_url(mut self, url: impl reqwest::IntoUrl) -> Self { - self.url = Some(url.into_url().context(InvalidUrlSnafu).map_err(Into::into)); + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); self } @@ -243,9 +246,11 @@ impl HttpBuilder { /// Build an [`HttpStore`] with the configured options pub fn build(self) -> Result { - let url = self.url.context(MissingUrlSnafu)??; + let url = self.url.context(MissingUrlSnafu)?; + let parsed = Url::parse(&url).context(UnableToParseUrlSnafu { url })?; + Ok(HttpStore { - client: Client::new(url, self.client_options, self.retry_config)?, + client: Client::new(parsed, self.client_options, self.retry_config)?, }) } } From 6139d8984ca702fa744aaf3b0b2193dd85468cf7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 2 Jan 2023 15:49:24 +0000 Subject: [PATCH 0461/1411] Add parquet-index binary (#3405) * Add parquet-index binary * Improve error message for missing index --- parquet/Cargo.toml | 4 + parquet/src/bin/parquet-index.rs | 173 +++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+) create mode 100644 parquet/src/bin/parquet-index.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 6ee83a2c43de..7a76ff64e519 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -138,6 +138,10 @@ required-features = ["cli"] name = "parquet-layout" required-features = ["cli"] +[[bin]] +name = "parquet-index" +required-features = ["cli"] + [[bench]] name = "arrow_writer" required-features = ["arrow"] diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs new file mode 100644 index 000000000000..6622783e6cf4 --- /dev/null +++ b/parquet/src/bin/parquet-index.rs @@ -0,0 +1,173 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Binary that prints the [page index] of a parquet file +//! +//! # Install +//! +//! `parquet-layout` can be installed using `cargo`: +//! ``` +//! cargo install parquet --features=cli +//! ``` +//! After this `parquet-index` should be available: +//! ``` +//! parquet-index XYZ.parquet COLUMN_NAME +//! ``` +//! +//! The binary can also be built from the source code and run as follows: +//! ``` +//! cargo run --features=cli --bin parquet-index XYZ.parquet COLUMN_NAME +//! +//! [page index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md + +use clap::Parser; +use parquet::errors::{ParquetError, Result}; +use parquet::file::page_index::index::{Index, PageIndex}; +use parquet::file::reader::{FileReader, SerializedFileReader}; +use parquet::file::serialized_reader::ReadOptionsBuilder; +use parquet::format::PageLocation; +use std::fs::File; + +#[derive(Debug, Parser)] +#[clap(author, version, about("Prints the page index of a parquet file"), long_about = None)] +struct Args { + #[clap(help("Path to a parquet file"))] + file: String, + + #[clap(help("Column name to print"))] + column: String, +} + +impl Args { + fn run(&self) -> Result<()> { + let file = File::open(&self.file)?; + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = SerializedFileReader::new_with_options(file, options)?; + + let schema = reader.metadata().file_metadata().schema_descr(); + let column_idx = schema + .columns() + .iter() + .position(|x| x.name() == self.column.as_str()) + .ok_or_else(|| { + ParquetError::General(format!("Failed to find column {}", self.column)) + })?; + + // Column index data for all row groups and columns + let column_index = reader + .metadata() + .page_indexes() + .ok_or_else(|| ParquetError::General("Column index not found".to_string()))?; + + // Offset index data for all row groups and columns + let offset_index = reader + .metadata() + .offset_indexes() + .ok_or_else(|| ParquetError::General("Offset index not found".to_string()))?; + + // Iterate through each row group + for (row_group_idx, ((column_indices, offset_indices), row_group)) in column_index + .iter() + .zip(offset_index) + .zip(reader.metadata().row_groups()) + .enumerate() + { + println!("Row Group: {}", row_group_idx); + let offset_index = offset_indices.get(column_idx).ok_or_else(|| { + ParquetError::General(format!( + "No offset index for row group {} column chunk {}", + row_group_idx, column_idx + )) + })?; + + let row_counts = compute_row_counts(offset_index, row_group.num_rows()); + match &column_indices[column_idx] { + Index::NONE => println!("NO INDEX"), + Index::BOOLEAN(v) => print_index(&v.indexes, offset_index, &row_counts)?, + Index::INT32(v) => print_index(&v.indexes, offset_index, &row_counts)?, + Index::INT64(v) => print_index(&v.indexes, offset_index, &row_counts)?, + Index::INT96(v) => print_index(&v.indexes, offset_index, &row_counts)?, + Index::FLOAT(v) => print_index(&v.indexes, offset_index, &row_counts)?, + Index::DOUBLE(v) => print_index(&v.indexes, offset_index, &row_counts)?, + Index::BYTE_ARRAY(_) => println!("BYTE_ARRAY not 
supported"), + Index::FIXED_LEN_BYTE_ARRAY(_) => { + println!("FIXED_LEN_BYTE_ARRAY not supported") + } + } + } + Ok(()) + } +} + +/// Computes the number of rows in each page within a column chunk +fn compute_row_counts(offset_index: &[PageLocation], rows: i64) -> Vec { + if offset_index.is_empty() { + return vec![]; + } + + let mut last = offset_index[0].first_row_index; + let mut out = Vec::with_capacity(offset_index.len()); + for o in offset_index.iter().skip(1) { + out.push(o.first_row_index - last); + last = o.first_row_index; + } + out.push(rows); + out +} + +/// Prints index information for a single column chunk +fn print_index( + column_index: &[PageIndex], + offset_index: &[PageLocation], + row_counts: &[i64], +) -> Result<()> { + if column_index.len() != offset_index.len() { + return Err(ParquetError::General(format!( + "Index length mismatch, got {} and {}", + column_index.len(), + offset_index.len() + ))); + } + + for (idx, ((c, o), row_count)) in column_index + .iter() + .zip(offset_index) + .zip(row_counts) + .enumerate() + { + print!( + "Page {:>5} at offset {:#010x} with length {:>10} and row count {:>10}", + idx, o.offset, o.compressed_page_size, row_count + ); + match &c.min { + Some(m) => print!(", min {:>10}", m), + None => print!(", min {:>10}", "NONE"), + } + + match &c.max { + Some(m) => print!(", max {:>10}", m), + None => print!(", max {:>10}", "NONE"), + } + println!() + } + + Ok(()) +} + +fn main() -> Result<()> { + Args::parse().run() +} From 08a976f217610cc8b1dea3375fe577027460ecc3 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 3 Jan 2023 03:18:26 -0800 Subject: [PATCH 0462/1411] Generic bytes dictionary builder (#3426) * Add GenericByteBuilder * Update mod * Add tests * Move constraint to ByteArrayType and add type aliases for large string and large binary. * Update arrow-select/src/take.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- ...rs => generic_bytes_dictionary_builder.rs} | 376 +++++++++++++----- arrow-array/src/builder/mod.rs | 4 +- arrow-array/src/types.rs | 2 +- arrow-select/src/take.rs | 11 +- 4 files changed, 278 insertions(+), 115 deletions(-) rename arrow-array/src/builder/{string_dictionary_builder.rs => generic_bytes_dictionary_builder.rs} (55%) diff --git a/arrow-array/src/builder/string_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs similarity index 55% rename from arrow-array/src/builder/string_dictionary_builder.rs rename to arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 878cfc727631..34b736d65861 100644 --- a/arrow-array/src/builder/string_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -15,9 +15,11 @@ // specific language governing permissions and limitations // under the License. 
-use crate::builder::{ArrayBuilder, PrimitiveBuilder, StringBuilder}; -use crate::types::ArrowDictionaryKeyType; -use crate::{Array, ArrayRef, DictionaryArray, StringArray}; +use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; +use crate::types::{ + ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType, +}; +use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray}; use arrow_buffer::ArrowNativeType; use arrow_schema::{ArrowError, DataType}; use hashbrown::hash_map::RawEntryMut; @@ -25,45 +27,15 @@ use hashbrown::HashMap; use std::any::Any; use std::sync::Arc; -/// Array builder for `DictionaryArray` that stores Strings. For example to map a set of byte indices -/// to String values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. -/// -/// ``` -/// // Create a dictionary array indexed by bytes whose values are Strings. -/// // It can thus hold up to 256 distinct string values. -/// -/// # use arrow_array::builder::StringDictionaryBuilder; -/// # use arrow_array::{Int8Array, StringArray}; -/// # use arrow_array::types::Int8Type; -/// -/// let mut builder = StringDictionaryBuilder::::new(); -/// -/// // The builder builds the dictionary value by value -/// builder.append("abc").unwrap(); -/// builder.append_null(); -/// builder.append("def").unwrap(); -/// builder.append("def").unwrap(); -/// builder.append("abc").unwrap(); -/// let array = builder.finish(); -/// -/// assert_eq!( -/// array.keys(), -/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) -/// ); -/// -/// // Values are polymorphic and so require a downcast. -/// let av = array.values(); -/// let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); -/// -/// assert_eq!(ava.value(0), "abc"); -/// assert_eq!(ava.value(1), "def"); -/// -/// ``` +/// Generic array builder for `DictionaryArray` that stores generic byte values. +/// For example to map a set of byte indices to String values. Note that +/// the use of a `HashMap` here will not scale to very large arrays or +/// result in an ordered dictionary. #[derive(Debug)] -pub struct StringDictionaryBuilder +pub struct GenericByteDictionaryBuilder where K: ArrowDictionaryKeyType, + T: ByteArrayType, { state: ahash::RandomState, /// Used to provide a lookup from string value to key type @@ -74,26 +46,28 @@ where dedup: HashMap, keys_builder: PrimitiveBuilder, - values_builder: StringBuilder, + values_builder: GenericByteBuilder, } -impl Default for StringDictionaryBuilder +impl Default for GenericByteDictionaryBuilder where K: ArrowDictionaryKeyType, + T: ByteArrayType, { fn default() -> Self { Self::new() } } -impl StringDictionaryBuilder +impl GenericByteDictionaryBuilder where K: ArrowDictionaryKeyType, + T: ByteArrayType, { - /// Creates a new `StringDictionaryBuilder` + /// Creates a new `GenericByteDictionaryBuilder` pub fn new() -> Self { let keys_builder = PrimitiveBuilder::new(); - let values_builder = StringBuilder::new(); + let values_builder = GenericByteBuilder::::new(); Self { state: Default::default(), dedup: HashMap::with_capacity_and_hasher(keys_builder.capacity(), ()), @@ -102,25 +76,28 @@ where } } - /// Creates a new `StringDictionaryBuilder` with the provided capacities + /// Creates a new `GenericByteDictionaryBuilder` with the provided capacities /// /// `keys_capacity`: the number of keys, i.e. length of array to build /// `value_capacity`: the number of distinct dictionary values, i.e. 
size of dictionary - /// `string_capacity`: the total number of bytes of all distinct strings in the dictionary + /// `data_capacity`: the total number of bytes of all distinct bytes in the dictionary pub fn with_capacity( keys_capacity: usize, value_capacity: usize, - string_capacity: usize, + data_capacity: usize, ) -> Self { Self { state: Default::default(), dedup: Default::default(), keys_builder: PrimitiveBuilder::with_capacity(keys_capacity), - values_builder: StringBuilder::with_capacity(value_capacity, string_capacity), + values_builder: GenericByteBuilder::::with_capacity( + value_capacity, + data_capacity, + ), } } - /// Creates a new `StringDictionaryBuilder` from a keys capacity and a dictionary + /// Creates a new `GenericByteDictionaryBuilder` from a keys capacity and a dictionary /// which is initialized with the given values. /// The indices of those dictionary values are used as keys. /// @@ -145,7 +122,7 @@ where /// ``` pub fn new_with_dictionary( keys_capacity: usize, - dictionary_values: &StringArray, + dictionary_values: &GenericByteArray, ) -> Result { let state = ahash::RandomState::default(); let dict_len = dictionary_values.len(); @@ -153,19 +130,21 @@ where let mut dedup = HashMap::with_capacity_and_hasher(dict_len, ()); let values_len = dictionary_values.value_data().len(); - let mut values_builder = StringBuilder::with_capacity(dict_len, values_len); + let mut values_builder = + GenericByteBuilder::::with_capacity(dict_len, values_len); for (idx, maybe_value) in dictionary_values.iter().enumerate() { match maybe_value { Some(value) => { - let hash = state.hash_one(value.as_bytes()); + let value_bytes: &[u8] = value.as_ref(); + let hash = state.hash_one(value_bytes); let key = K::Native::from_usize(idx) .ok_or(ArrowError::DictionaryKeyOverflowError)?; let entry = dedup.raw_entry_mut().from_hash(hash, |key: &K::Native| { - value.as_bytes() == get_bytes(&values_builder, key) + value_bytes == get_bytes(&values_builder, key) }); if let RawEntryMut::Vacant(v) = entry { @@ -189,9 +168,10 @@ where } } -impl ArrayBuilder for StringDictionaryBuilder +impl ArrayBuilder for GenericByteDictionaryBuilder where K: ArrowDictionaryKeyType, + T: ByteArrayType, { /// Returns the builder as an non-mutable `Any` reference. fn as_any(&self) -> &dyn Any { @@ -229,26 +209,31 @@ where } } -impl StringDictionaryBuilder +impl GenericByteDictionaryBuilder where K: ArrowDictionaryKeyType, + T: ByteArrayType, { /// Append a primitive value to the array. Return an existing index /// if already present in the values array or a new index if the /// value is appended to the values array. /// /// Returns an error if the new index would overflow the key type. 
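As a usage sketch of the generic builder introduced here (via the `BinaryDictionaryBuilder` alias added later in this diff), appending a duplicate value reuses the existing dictionary key rather than growing the values array:

```rust
use arrow_array::builder::BinaryDictionaryBuilder;
use arrow_array::types::Int8Type;
use arrow_array::{BinaryArray, Int8Array};

fn main() {
    let mut builder = BinaryDictionaryBuilder::<Int8Type>::new();
    builder.append(b"abc").unwrap();
    builder.append_null();
    builder.append(b"def").unwrap();
    builder.append(b"abc").unwrap(); // duplicate value reuses key 0
    let array = builder.finish();

    // Four keys (one null) referencing only two distinct dictionary values
    assert_eq!(
        array.keys(),
        &Int8Array::from(vec![Some(0), None, Some(1), Some(0)])
    );
    let values: &BinaryArray = array.values().as_any().downcast_ref().unwrap();
    assert_eq!(values.value(0), b"abc");
    assert_eq!(values.value(1), b"def");
}
```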
- pub fn append(&mut self, value: impl AsRef) -> Result { - let value = value.as_ref(); + pub fn append( + &mut self, + value: impl AsRef, + ) -> Result { + let value_native: &T::Native = value.as_ref(); + let value_bytes: &[u8] = value_native.as_ref(); let state = &self.state; let storage = &mut self.values_builder; - let hash = state.hash_one(value.as_bytes()); + let hash = state.hash_one(value_bytes); let entry = self .dedup .raw_entry_mut() - .from_hash(hash, |key| value.as_bytes() == get_bytes(storage, key)); + .from_hash(hash, |key| value_bytes == get_bytes(storage, key)); let key = match entry { RawEntryMut::Occupied(entry) => *entry.into_key(), @@ -312,7 +297,10 @@ where } } -fn get_bytes<'a, K: ArrowNativeType>(values: &'a StringBuilder, key: &K) -> &'a [u8] { +fn get_bytes<'a, K: ArrowNativeType, T: ByteArrayType>( + values: &'a GenericByteBuilder, + key: &K, +) -> &'a [u8] { let offsets = values.offsets_slice(); let values = values.values_slice(); @@ -323,6 +311,94 @@ fn get_bytes<'a, K: ArrowNativeType>(values: &'a StringBuilder, key: &K) -> &'a &values[start_offset..end_offset] } +/// Array builder for `DictionaryArray` that stores Strings. For example to map a set of byte indices +/// to String values. Note that the use of a `HashMap` here will not scale to very large +/// arrays or result in an ordered dictionary. +/// +/// ``` +/// // Create a dictionary array indexed by bytes whose values are Strings. +/// // It can thus hold up to 256 distinct string values. +/// +/// # use arrow_array::builder::StringDictionaryBuilder; +/// # use arrow_array::{Int8Array, StringArray}; +/// # use arrow_array::types::Int8Type; +/// +/// let mut builder = StringDictionaryBuilder::::new(); +/// +/// // The builder builds the dictionary value by value +/// builder.append("abc").unwrap(); +/// builder.append_null(); +/// builder.append("def").unwrap(); +/// builder.append("def").unwrap(); +/// builder.append("abc").unwrap(); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.keys(), +/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) +/// ); +/// +/// // Values are polymorphic and so require a downcast. +/// let av = array.values(); +/// let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); +/// +/// assert_eq!(ava.value(0), "abc"); +/// assert_eq!(ava.value(1), "def"); +/// +/// ``` +pub type StringDictionaryBuilder = + GenericByteDictionaryBuilder>; + +/// Array builder for `DictionaryArray` that stores large Strings. For example to map a set of byte indices +/// to String values. Note that the use of a `HashMap` here will not scale to very large +/// arrays or result in an ordered dictionary. +pub type LargeStringDictionaryBuilder = + GenericByteDictionaryBuilder>; + +/// Array builder for `DictionaryArray` that stores binary. For example to map a set of byte indices +/// to binary values. Note that the use of a `HashMap` here will not scale to very large +/// arrays or result in an ordered dictionary. +/// +/// ``` +/// // Create a dictionary array indexed by bytes whose values are binary. +/// // It can thus hold up to 256 distinct binary values. 
+/// +/// # use arrow_array::builder::BinaryDictionaryBuilder; +/// # use arrow_array::{BinaryArray, Int8Array}; +/// # use arrow_array::types::Int8Type; +/// +/// let mut builder = BinaryDictionaryBuilder::::new(); +/// +/// // The builder builds the dictionary value by value +/// builder.append(b"abc").unwrap(); +/// builder.append_null(); +/// builder.append(b"def").unwrap(); +/// builder.append(b"def").unwrap(); +/// builder.append(b"abc").unwrap(); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.keys(), +/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) +/// ); +/// +/// // Values are polymorphic and so require a downcast. +/// let av = array.values(); +/// let ava: &BinaryArray = av.as_any().downcast_ref::().unwrap(); +/// +/// assert_eq!(ava.value(0), b"abc"); +/// assert_eq!(ava.value(1), b"def"); +/// +/// ``` +pub type BinaryDictionaryBuilder = + GenericByteDictionaryBuilder>; + +/// Array builder for `DictionaryArray` that stores large binary. For example to map a set of byte indices +/// to binary values. Note that the use of a `HashMap` here will not scale to very large +/// arrays or result in an ordered dictionary. +pub type LargeBinaryDictionaryBuilder = + GenericByteDictionaryBuilder>; + #[cfg(test)] mod tests { use super::*; @@ -330,15 +406,20 @@ mod tests { use crate::array::Array; use crate::array::Int8Array; use crate::types::{Int16Type, Int8Type}; - - #[test] - fn test_string_dictionary_builder() { - let mut builder = StringDictionaryBuilder::::new(); - builder.append("abc").unwrap(); + use crate::{BinaryArray, StringArray}; + + fn test_bytes_dictionary_builder(values: Vec<&T::Native>) + where + T: ByteArrayType, + ::Native: PartialEq, + ::Native: AsRef<::Native>, + { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.append(values[0]).unwrap(); builder.append_null(); - builder.append("def").unwrap(); - builder.append("def").unwrap(); - builder.append("abc").unwrap(); + builder.append(values[1]).unwrap(); + builder.append(values[1]).unwrap(); + builder.append(values[0]).unwrap(); let array = builder.finish(); assert_eq!( @@ -348,20 +429,36 @@ mod tests { // Values are polymorphic and so require a downcast. let av = array.values(); - let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); + let ava: &GenericByteArray = + av.as_any().downcast_ref::>().unwrap(); - assert_eq!(ava.value(0), "abc"); - assert_eq!(ava.value(1), "def"); + assert_eq!(*ava.value(0), *values[0]); + assert_eq!(*ava.value(1), *values[1]); } #[test] - fn test_string_dictionary_builder_finish_cloned() { - let mut builder = StringDictionaryBuilder::::new(); - builder.append("abc").unwrap(); + fn test_string_dictionary_builder() { + test_bytes_dictionary_builder::>(vec!["abc", "def"]); + } + + #[test] + fn test_binary_dictionary_builder() { + test_bytes_dictionary_builder::>(vec![b"abc", b"def"]); + } + + fn test_bytes_dictionary_builder_finish_cloned(values: Vec<&T::Native>) + where + T: ByteArrayType, + ::Native: PartialEq, + ::Native: AsRef<::Native>, + { + let mut builder = GenericByteDictionaryBuilder::::new(); + + builder.append(values[0]).unwrap(); builder.append_null(); - builder.append("def").unwrap(); - builder.append("def").unwrap(); - builder.append("abc").unwrap(); + builder.append(values[1]).unwrap(); + builder.append(values[1]).unwrap(); + builder.append(values[0]).unwrap(); let mut array = builder.finish_cloned(); assert_eq!( @@ -371,14 +468,15 @@ mod tests { // Values are polymorphic and so require a downcast. 
let av = array.values(); - let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); + let ava: &GenericByteArray = + av.as_any().downcast_ref::>().unwrap(); - assert_eq!(ava.value(0), "abc"); - assert_eq!(ava.value(1), "def"); + assert_eq!(ava.value(0), values[0]); + assert_eq!(ava.value(1), values[1]); - builder.append("abc").unwrap(); - builder.append("ghi").unwrap(); - builder.append("def").unwrap(); + builder.append(values[0]).unwrap(); + builder.append(values[2]).unwrap(); + builder.append(values[1]).unwrap(); array = builder.finish(); @@ -398,25 +496,48 @@ mod tests { // Values are polymorphic and so require a downcast. let av2 = array.values(); - let ava2: &StringArray = av2.as_any().downcast_ref::().unwrap(); + let ava2: &GenericByteArray = + av2.as_any().downcast_ref::>().unwrap(); - assert_eq!(ava2.value(0), "abc"); - assert_eq!(ava2.value(1), "def"); - assert_eq!(ava2.value(2), "ghi"); + assert_eq!(ava2.value(0), values[0]); + assert_eq!(ava2.value(1), values[1]); + assert_eq!(ava2.value(2), values[2]); } #[test] - fn test_string_dictionary_builder_with_existing_dictionary() { - let dictionary = StringArray::from(vec![None, Some("def"), Some("abc")]); + fn test_string_dictionary_builder_finish_cloned() { + test_bytes_dictionary_builder_finish_cloned::>(vec![ + "abc", "def", "ghi", + ]); + } + #[test] + fn test_binary_dictionary_builder_finish_cloned() { + test_bytes_dictionary_builder_finish_cloned::>(vec![ + b"abc", b"def", b"ghi", + ]); + } + + fn test_bytes_dictionary_builder_with_existing_dictionary( + dictionary: GenericByteArray, + values: Vec<&T::Native>, + ) where + T: ByteArrayType, + ::Native: PartialEq, + ::Native: AsRef<::Native>, + { let mut builder = - StringDictionaryBuilder::new_with_dictionary(6, &dictionary).unwrap(); - builder.append("abc").unwrap(); + GenericByteDictionaryBuilder::::new_with_dictionary( + 6, + &dictionary, + ) + .unwrap(); + builder.append(values[0]).unwrap(); builder.append_null(); - builder.append("def").unwrap(); - builder.append("def").unwrap(); - builder.append("abc").unwrap(); - builder.append("ghi").unwrap(); + builder.append(values[1]).unwrap(); + builder.append(values[1]).unwrap(); + builder.append(values[0]).unwrap(); + builder.append(values[2]).unwrap(); let array = builder.finish(); assert_eq!( @@ -426,26 +547,50 @@ mod tests { // Values are polymorphic and so require a downcast. 
let av = array.values(); - let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); + let ava: &GenericByteArray = + av.as_any().downcast_ref::>().unwrap(); assert!(!ava.is_valid(0)); - assert_eq!(ava.value(1), "def"); - assert_eq!(ava.value(2), "abc"); - assert_eq!(ava.value(3), "ghi"); + assert_eq!(ava.value(1), values[1]); + assert_eq!(ava.value(2), values[0]); + assert_eq!(ava.value(3), values[2]); } #[test] - fn test_string_dictionary_builder_with_reserved_null_value() { - let dictionary: Vec> = vec![None]; - let dictionary = StringArray::from(dictionary); + fn test_string_dictionary_builder_with_existing_dictionary() { + test_bytes_dictionary_builder_with_existing_dictionary::>( + StringArray::from(vec![None, Some("def"), Some("abc")]), + vec!["abc", "def", "ghi"], + ); + } + + #[test] + fn test_binary_dictionary_builder_with_existing_dictionary() { + let values: Vec> = vec![None, Some(b"def"), Some(b"abc")]; + test_bytes_dictionary_builder_with_existing_dictionary::>( + BinaryArray::from(values), + vec![b"abc", b"def", b"ghi"], + ); + } + fn test_bytes_dictionary_builder_with_reserved_null_value( + dictionary: GenericByteArray, + values: Vec<&T::Native>, + ) where + T: ByteArrayType, + ::Native: PartialEq, + ::Native: AsRef<::Native>, + { let mut builder = - StringDictionaryBuilder::::new_with_dictionary(4, &dictionary) - .unwrap(); - builder.append("abc").unwrap(); + GenericByteDictionaryBuilder::::new_with_dictionary( + 4, + &dictionary, + ) + .unwrap(); + builder.append(values[0]).unwrap(); builder.append_null(); - builder.append("def").unwrap(); - builder.append("abc").unwrap(); + builder.append(values[1]).unwrap(); + builder.append(values[0]).unwrap(); let array = builder.finish(); assert!(array.is_null(1)); @@ -460,4 +605,21 @@ mod tests { assert_eq!(keys.value(2), 2); assert_eq!(keys.value(3), 1); } + + #[test] + fn test_string_dictionary_builder_with_reserved_null_value() { + test_bytes_dictionary_builder_with_reserved_null_value::>( + StringArray::from(vec![None]), + vec!["abc", "def"], + ); + } + + #[test] + fn test_binary_dictionary_builder_with_reserved_null_value() { + let values: Vec> = vec![None]; + test_bytes_dictionary_builder_with_reserved_null_value::>( + BinaryArray::from(values), + vec![b"abc", b"def"], + ); + } } diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 3486e396b671..820ecd23bc5e 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -39,10 +39,10 @@ mod primitive_builder; pub use primitive_builder::*; mod primitive_dictionary_builder; pub use primitive_dictionary_builder::*; -mod string_dictionary_builder; -pub use string_dictionary_builder::*; mod struct_builder; pub use struct_builder::*; +mod generic_bytes_dictionary_builder; +pub use generic_bytes_dictionary_builder::*; mod union_builder; pub use union_builder::*; diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index e36f850f2e14..25c047a11d35 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -713,7 +713,7 @@ pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed { /// Type for representing its equivalent rust type i.e /// Utf8Array will have native type has &str /// BinaryArray will have type as [u8] - type Native: bytes::ByteArrayNativeType + AsRef<[u8]> + ?Sized; + type Native: bytes::ByteArrayNativeType + AsRef + AsRef<[u8]> + ?Sized; /// "Binary" or "String", for use in error messages const PREFIX: &'static str; /// Datatype of array elements diff --git 
a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 0b1d44319493..458fbdb66ef6 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -592,8 +592,9 @@ where let s = array.value(index); - length_so_far += T::Offset::from_usize(s.as_ref().len()).unwrap(); - values.extend_from_slice(s.as_ref()); + let s: &[u8] = s.as_ref(); + length_so_far += T::Offset::from_usize(s.len()).unwrap(); + values.extend_from_slice(s); *offset = length_so_far; } nulls = None @@ -609,7 +610,7 @@ where })?; if array.is_valid(index) { - let s = array.value(index).as_ref(); + let s: &[u8] = array.value(index).as_ref(); length_so_far += T::Offset::from_usize(s.len()).unwrap(); values.extend_from_slice(s.as_ref()); @@ -627,7 +628,7 @@ where ArrowError::ComputeError("Cast to usize failed".to_string()) })?; - let s = array.value(index).as_ref(); + let s: &[u8] = array.value(index).as_ref(); length_so_far += T::Offset::from_usize(s.len()).unwrap(); values.extend_from_slice(s); @@ -647,7 +648,7 @@ where })?; if array.is_valid(index) && indices.is_valid(i) { - let s = array.value(index).as_ref(); + let s: &[u8] = array.value(index).as_ref(); length_so_far += T::Offset::from_usize(s.len()).unwrap(); values.extend_from_slice(s); From 26ea71cefab55990ffb8197707f2a8518e41412d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 3 Jan 2023 19:04:57 +0000 Subject: [PATCH 0463/1411] Verify ArrayData::data_type compatible in PrimitiveArray::from (#3440) --- arrow-arith/src/arity.rs | 8 +++---- arrow-array/src/array/primitive_array.rs | 30 ++++++++++++++++++++---- arrow-row/src/dictionary.rs | 5 +--- arrow-row/src/fixed.rs | 5 +--- 4 files changed, 31 insertions(+), 17 deletions(-) diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index e89fe7b914a4..3e7a81862927 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -114,9 +114,7 @@ where T: ArrowPrimitiveType, F: Fn(T::Native) -> Result, { - if std::mem::discriminant(&array.value_type()) - != std::mem::discriminant(&T::DATA_TYPE) - { + if !PrimitiveArray::::is_compatible(&array.value_type()) { return Err(ArrowError::CastError(format!( "Cannot perform the unary operation of type {} on dictionary array of value type {}", T::DATA_TYPE, @@ -138,7 +136,7 @@ where downcast_dictionary_array! 
{ array => unary_dict::<_, F, T>(array, op), t => { - if std::mem::discriminant(t) == std::mem::discriminant(&T::DATA_TYPE) { + if PrimitiveArray::::is_compatible(t) { Ok(Arc::new(unary::( array.as_any().downcast_ref::>().unwrap(), op, @@ -170,7 +168,7 @@ where ))) }, t => { - if std::mem::discriminant(t) == std::mem::discriminant(&T::DATA_TYPE) { + if PrimitiveArray::::is_compatible(t) { Ok(Arc::new(try_unary::( array.as_any().downcast_ref::>().unwrap(), op, diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 4ff0ed4d93e6..01eda724ba47 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -297,6 +297,21 @@ impl PrimitiveArray { PrimitiveBuilder::::with_capacity(capacity) } + /// Returns if this [`PrimitiveArray`] is compatible with the provided [`DataType`] + /// + /// This is equivalent to `data_type == T::DATA_TYPE`, however ignores timestamp + /// timezones and decimal precision and scale + pub fn is_compatible(data_type: &DataType) -> bool { + match T::DATA_TYPE { + DataType::Timestamp(t1, _) => { + matches!(data_type, DataType::Timestamp(t2, _) if &t1 == t2) + } + DataType::Decimal128(_, _) => matches!(data_type, DataType::Decimal128(_, _)), + DataType::Decimal256(_, _) => matches!(data_type, DataType::Decimal256(_, _)), + _ => T::DATA_TYPE.eq(data_type), + } + } + /// Returns the primitive value at index `i`. /// /// # Safety @@ -1042,10 +1057,8 @@ impl PrimitiveArray { /// Constructs a `PrimitiveArray` from an array data reference. impl From for PrimitiveArray { fn from(data: ArrayData) -> Self { - // Use discriminant to allow for decimals - assert_eq!( - std::mem::discriminant(&T::DATA_TYPE), - std::mem::discriminant(data.data_type()), + assert!( + Self::is_compatible(data.data_type()), "PrimitiveArray expected ArrayData with type {} got {}", T::DATA_TYPE, data.data_type() @@ -2205,4 +2218,13 @@ mod tests { let c = array.unary_mut(|x| x * 2 + 1).unwrap(); assert_eq!(c, Int32Array::from(vec![Some(11), Some(15), None])); } + + #[test] + #[should_panic( + expected = "PrimitiveArray expected ArrayData with type Interval(MonthDayNano) got Interval(DayTime)" + )] + fn test_invalid_interval_type() { + let array = IntervalDayTimeArray::from(vec![1, 2, 3]); + let _ = IntervalMonthDayNanoArray::from(array.into_data()); + } } diff --git a/arrow-row/src/dictionary.rs b/arrow-row/src/dictionary.rs index 0da6c68d1684..e332e11316fd 100644 --- a/arrow-row/src/dictionary.rs +++ b/arrow-row/src/dictionary.rs @@ -270,10 +270,7 @@ fn decode_primitive( where T::Native: FixedLengthEncoding, { - assert_eq!( - std::mem::discriminant(&T::DATA_TYPE), - std::mem::discriminant(&data_type), - ); + assert!(PrimitiveArray::::is_compatible(&data_type)); // SAFETY: // Validated data type above diff --git a/arrow-row/src/fixed.rs b/arrow-row/src/fixed.rs index 159eba9adf19..d4b82c2a3989 100644 --- a/arrow-row/src/fixed.rs +++ b/arrow-row/src/fixed.rs @@ -343,10 +343,7 @@ pub fn decode_primitive( where T::Native: FixedLengthEncoding, { - assert_eq!( - std::mem::discriminant(&T::DATA_TYPE), - std::mem::discriminant(&data_type), - ); + assert!(PrimitiveArray::::is_compatible(&data_type)); // SAFETY: // Validated data type above unsafe { decode_fixed::(rows, data_type, options).into() } From dc91a244e34a7ffe88f5d0676cef59843926c41d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 3 Jan 2023 15:19:25 -0500 Subject: [PATCH 0464/1411] Minor: run clippy on `arrow-integration-testing` (#3428) * Minor run clippy on 
arrow-integration-testing * Fix clippy errors in arrow-integration-testing --- .github/workflows/arrow.yml | 4 ++++ .../flight_client_scenarios/integration_test.rs | 12 ++++++++++-- .../flight_server_scenarios/integration_test.rs | 17 +++++++++++++---- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index c1e9d600a02a..a5f402cbebba 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -208,5 +208,9 @@ jobs: run: cargo clippy -p arrow-arith --all-targets --features dyn_arith_dict -- -D warnings - name: Clippy arrow-row with all features run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings + - name: Clippy arrow-integration-test with all features + run: cargo clippy -p arrow-integration-test --all-targets --all-features -- -D warnings + - name: Clippy arrow-integration-testing with all features + run: cargo clippy -p arrow-integration-testing --all-targets --all-features -- -D warnings - name: Clippy arrow with all features except SIMD run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index a40076b3de0a..1f1b312f9619 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -130,8 +130,16 @@ async fn send_batch( batch: &RecordBatch, options: &writer::IpcWriteOptions, ) -> Result { - let (dictionary_flight_data, mut batch_flight_data) = - arrow_flight::utils::flight_data_from_arrow_batch(batch, options); + let data_gen = writer::IpcDataGenerator::default(); + let mut dictionary_tracker = writer::DictionaryTracker::new(false); + + let (encoded_dictionaries, encoded_batch) = data_gen + .encoded_batch(batch, &mut dictionary_tracker, options) + .expect("DictionaryTracker configured above to not error on replacement"); + + let dictionary_flight_data: Vec = + encoded_dictionaries.into_iter().map(Into::into).collect(); + let mut batch_flight_data: FlightData = encoded_batch.into(); upload_tx .send_all(&mut stream::iter(dictionary_flight_data).map(Ok)) diff --git a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs index 9c6f26befac0..7ad4c676ffab 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs @@ -25,7 +25,7 @@ use arrow::{ buffer::Buffer, datatypes::Schema, datatypes::SchemaRef, - ipc::{self, reader}, + ipc::{self, reader, writer}, record_batch::RecordBatch, }; use arrow_flight::{ @@ -121,15 +121,24 @@ impl FlightService for FlightServiceImpl { .iter() .enumerate() .flat_map(|(counter, batch)| { - let (dictionary_flight_data, mut batch_flight_data) = - arrow_flight::utils::flight_data_from_arrow_batch(batch, &options); + let data_gen = writer::IpcDataGenerator::default(); + let mut dictionary_tracker = writer::DictionaryTracker::new(false); + + let (encoded_dictionaries, encoded_batch) = data_gen + .encoded_batch(batch, &mut dictionary_tracker, &options) + .expect( + "DictionaryTracker configured above to not error on replacement", + ); + + let dictionary_flight_data = + 
encoded_dictionaries.into_iter().map(Into::into); + let mut batch_flight_data: FlightData = encoded_batch.into(); // Only the record batch's FlightData gets app_metadata let metadata = counter.to_string().into(); batch_flight_data.app_metadata = metadata; dictionary_flight_data - .into_iter() .chain(std::iter::once(batch_flight_data)) .map(Ok) }); From b82b35f11ed6c3eabee85e7fd72d003db260f9c0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 3 Jan 2023 19:00:51 -0500 Subject: [PATCH 0465/1411] Minor: run arrow-integration-test{,ing} clippy after arrow clippy (#3445) --- .github/workflows/arrow.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index a5f402cbebba..4ac64005323f 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -208,9 +208,9 @@ jobs: run: cargo clippy -p arrow-arith --all-targets --features dyn_arith_dict -- -D warnings - name: Clippy arrow-row with all features run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings + - name: Clippy arrow with all features except SIMD + run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings - name: Clippy arrow-integration-test with all features run: cargo clippy -p arrow-integration-test --all-targets --all-features -- -D warnings - name: Clippy arrow-integration-testing with all features run: cargo clippy -p arrow-integration-testing --all-targets --all-features -- -D warnings - - name: Clippy arrow with all features except SIMD - run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings From 65ff80e4d80e39d087d61df0ac0a4b2a79833a0c Mon Sep 17 00:00:00 2001 From: Your friendly neighborhood geek Date: Wed, 4 Jan 2023 17:45:19 +0530 Subject: [PATCH 0466/1411] parquet record API: timestamp as signed integer (#3437) - Use signed integers to store 'Date', 'TimestampMillis' and 'TimestampMicros' in 'enum Field' - remove timezone from string representation of Date --- parquet/src/record/api.rs | 55 ++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index b64ff51eea84..2d15e126ff65 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -123,8 +123,8 @@ pub trait RowAccessor { fn get_ulong(&self, i: usize) -> Result; fn get_float(&self, i: usize) -> Result; fn get_double(&self, i: usize) -> Result; - fn get_timestamp_millis(&self, i: usize) -> Result; - fn get_timestamp_micros(&self, i: usize) -> Result; + fn get_timestamp_millis(&self, i: usize) -> Result; + fn get_timestamp_micros(&self, i: usize) -> Result; fn get_decimal(&self, i: usize) -> Result<&Decimal>; fn get_string(&self, i: usize) -> Result<&String>; fn get_bytes(&self, i: usize) -> Result<&ByteArray>; @@ -219,9 +219,9 @@ impl RowAccessor for Row { row_primitive_accessor!(get_double, Double, f64); - row_primitive_accessor!(get_timestamp_millis, TimestampMillis, u64); + row_primitive_accessor!(get_timestamp_millis, TimestampMillis, i64); - row_primitive_accessor!(get_timestamp_micros, TimestampMicros, u64); + row_primitive_accessor!(get_timestamp_micros, TimestampMicros, i64); row_complex_accessor!(get_decimal, Decimal, Decimal); @@ -295,8 +295,8 @@ pub trait ListAccessor { fn get_ulong(&self, i: usize) -> Result; fn get_float(&self, i: 
usize) -> Result; fn get_double(&self, i: usize) -> Result; - fn get_timestamp_millis(&self, i: usize) -> Result; - fn get_timestamp_micros(&self, i: usize) -> Result; + fn get_timestamp_millis(&self, i: usize) -> Result; + fn get_timestamp_micros(&self, i: usize) -> Result; fn get_decimal(&self, i: usize) -> Result<&Decimal>; fn get_string(&self, i: usize) -> Result<&String>; fn get_bytes(&self, i: usize) -> Result<&ByteArray>; @@ -362,9 +362,9 @@ impl ListAccessor for List { list_primitive_accessor!(get_double, Double, f64); - list_primitive_accessor!(get_timestamp_millis, TimestampMillis, u64); + list_primitive_accessor!(get_timestamp_millis, TimestampMillis, i64); - list_primitive_accessor!(get_timestamp_micros, TimestampMicros, u64); + list_primitive_accessor!(get_timestamp_micros, TimestampMicros, i64); list_complex_accessor!(get_decimal, Decimal, Decimal); @@ -453,9 +453,9 @@ impl<'a> ListAccessor for MapList<'a> { map_list_primitive_accessor!(get_double, Double, f64); - map_list_primitive_accessor!(get_timestamp_millis, TimestampMillis, u64); + map_list_primitive_accessor!(get_timestamp_millis, TimestampMillis, i64); - map_list_primitive_accessor!(get_timestamp_micros, TimestampMicros, u64); + map_list_primitive_accessor!(get_timestamp_micros, TimestampMicros, i64); list_complex_accessor!(get_decimal, Decimal, Decimal); @@ -522,11 +522,11 @@ pub enum Field { Bytes(ByteArray), /// Date without a time of day, stores the number of days from the /// Unix epoch, 1 January 1970. - Date(u32), + Date(i32), /// Milliseconds from the Unix epoch, 1 January 1970. - TimestampMillis(u64), + TimestampMillis(i64), /// Microseconds from the Unix epoch, 1 Janiary 1970. - TimestampMicros(u64), + TimestampMicros(i64), // ---------------------------------------------------------------------- // Complex types @@ -590,7 +590,7 @@ impl Field { ConvertedType::UINT_8 => Field::UByte(value as u8), ConvertedType::UINT_16 => Field::UShort(value as u16), ConvertedType::UINT_32 => Field::UInt(value as u32), - ConvertedType::DATE => Field::Date(value as u32), + ConvertedType::DATE => Field::Date(value), ConvertedType::DECIMAL => Field::Decimal(Decimal::from_i32( value, descr.type_precision(), @@ -606,8 +606,8 @@ impl Field { match descr.converted_type() { ConvertedType::INT_64 | ConvertedType::NONE => Field::Long(value), ConvertedType::UINT_64 => Field::ULong(value as u64), - ConvertedType::TIMESTAMP_MILLIS => Field::TimestampMillis(value as u64), - ConvertedType::TIMESTAMP_MICROS => Field::TimestampMicros(value as u64), + ConvertedType::TIMESTAMP_MILLIS => Field::TimestampMillis(value), + ConvertedType::TIMESTAMP_MICROS => Field::TimestampMicros(value), ConvertedType::DECIMAL => Field::Decimal(Decimal::from_i64( value, descr.type_precision(), @@ -621,7 +621,7 @@ impl Field { /// `Timestamp` value. #[inline] pub fn convert_int96(_descr: &ColumnDescPtr, value: Int96) -> Self { - Field::TimestampMillis(value.to_i64() as u64) + Field::TimestampMillis(value.to_i64()) } /// Converts Parquet FLOAT type with logical type into `f32` value. @@ -793,12 +793,12 @@ impl fmt::Display for Field { /// Input `value` is a number of days since the epoch in UTC. /// Date is displayed in local timezone. 
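As a small illustration of why the switch from unsigned to signed storage matters, dates before the Unix epoch now become representable. The following is a minimal sketch of the same conversion performed by `convert_date_to_string`, not part of the patch itself:

```rust
use chrono::{TimeZone, Utc};

// Mirrors convert_date_to_string: days since the epoch, rendered without a
// timezone suffix
fn date_to_string(days_since_epoch: i32) -> String {
    const NUM_SECONDS_IN_DAY: i64 = 60 * 60 * 24;
    let dt = Utc
        .timestamp_opt(days_since_epoch as i64 * NUM_SECONDS_IN_DAY, 0)
        .unwrap();
    dt.format("%Y-%m-%d").to_string()
}

fn main() {
    assert_eq!(date_to_string(0), "1970-01-01");
    // Negative values were previously unrepresentable with Field::Date(u32)
    assert_eq!(date_to_string(-1), "1969-12-31");
}
```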
#[inline] -fn convert_date_to_string(value: u32) -> String { +fn convert_date_to_string(value: i32) -> String { static NUM_SECONDS_IN_DAY: i64 = 60 * 60 * 24; let dt = Utc .timestamp_opt(value as i64 * NUM_SECONDS_IN_DAY, 0) .unwrap(); - format!("{}", dt.format("%Y-%m-%d %:z")) + format!("{}", dt.format("%Y-%m-%d")) } /// Helper method to convert Parquet timestamp into a string. @@ -814,16 +814,16 @@ fn convert_timestamp_secs_to_string(value: i64) -> String { /// Input `value` is a number of milliseconds since the epoch in UTC. /// Datetime is displayed in local timezone. #[inline] -fn convert_timestamp_millis_to_string(value: u64) -> String { - convert_timestamp_secs_to_string(value as i64 / 1000) +fn convert_timestamp_millis_to_string(value: i64) -> String { + convert_timestamp_secs_to_string(value / 1000) } /// Helper method to convert Parquet timestamp into a string. /// Input `value` is a number of microseconds since the epoch in UTC. /// Datetime is displayed in local timezone. #[inline] -fn convert_timestamp_micros_to_string(value: u64) -> String { - convert_timestamp_secs_to_string(value as i64 / 1000000) +fn convert_timestamp_micros_to_string(value: i64) -> String { + convert_timestamp_secs_to_string(value / 1000000) } /// Helper method to convert Parquet decimal into a string. @@ -1083,11 +1083,12 @@ mod tests { .and_hms_opt(0, 0, 0) .unwrap(); let dt = Utc.from_utc_datetime(&datetime); - let res = convert_date_to_string((dt.timestamp() / 60 / 60 / 24) as u32); - let exp = format!("{}", dt.format("%Y-%m-%d %:z")); + let res = convert_date_to_string((dt.timestamp() / 60 / 60 / 24) as i32); + let exp = format!("{}", dt.format("%Y-%m-%d")); assert_eq!(res, exp); } + check_date_conversion(1969, 12, 31); check_date_conversion(2010, 1, 2); check_date_conversion(2014, 5, 1); check_date_conversion(2016, 2, 29); @@ -1103,7 +1104,7 @@ mod tests { .and_hms_opt(h, mi, s) .unwrap(); let dt = Utc.from_utc_datetime(&datetime); - let res = convert_timestamp_millis_to_string(dt.timestamp_millis() as u64); + let res = convert_timestamp_millis_to_string(dt.timestamp_millis()); let exp = format!("{}", dt.format("%Y-%m-%d %H:%M:%S %:z")); assert_eq!(res, exp); } @@ -1124,7 +1125,7 @@ mod tests { .and_hms_opt(h, mi, s) .unwrap(); let dt = Utc.from_utc_datetime(&datetime); - let res = convert_timestamp_micros_to_string(dt.timestamp_micros() as u64); + let res = convert_timestamp_micros_to_string(dt.timestamp_micros()); let exp = format!("{}", dt.format("%Y-%m-%d %H:%M:%S %:z")); assert_eq!(res, exp); } From 61a77a5afc2c141eb530f29ee6e99eafe6756170 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Wed, 4 Jan 2023 08:25:49 -0500 Subject: [PATCH 0467/1411] Parquet writer v2: clear buffer after page flush (#3447) * parquet writer v2: clear buffer after page flush` * fix clippy issue * fmt fix * fixed issue with flush_page * fmt fix * fix clippy errors Co-authored-by: askoa --- parquet/src/arrow/arrow_writer/byte_array.rs | 5 +- parquet/src/arrow/arrow_writer/mod.rs | 60 +++++++++++++++++++- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index 4b9d91334a7f..24dae4f20d64 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -288,13 +288,14 @@ impl FallbackEncoder { let mut out = Vec::with_capacity(lengths.len() + buffer.len()); out.extend_from_slice(lengths.data()); out.extend_from_slice(buffer); + 
buffer.clear(); (out, Encoding::DELTA_LENGTH_BYTE_ARRAY) } FallbackEncoderImpl::Delta { buffer, prefix_lengths, suffix_lengths, - .. + last_value, } => { let prefix_lengths = prefix_lengths.flush_buffer()?; let suffix_lengths = suffix_lengths.flush_buffer()?; @@ -305,6 +306,8 @@ impl FallbackEncoder { out.extend_from_slice(prefix_lengths.data()); out.extend_from_slice(suffix_lengths.data()); out.extend_from_slice(buffer); + buffer.clear(); + last_value.clear(); (out, Encoding::DELTA_BYTE_ARRAY) } }; diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 5cf33d125484..340ab246a38b 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1180,6 +1180,7 @@ mod tests { } const SMALL_SIZE: usize = 7; + const MEDIUM_SIZE: usize = 63; fn roundtrip( expected_batch: RecordBatch, @@ -1199,7 +1200,14 @@ mod tests { files } - fn roundtrip_opts(expected_batch: &RecordBatch, props: WriterProperties) -> File { + fn roundtrip_opts_with_array_validation( + expected_batch: &RecordBatch, + props: WriterProperties, + validate: F, + ) -> File + where + F: Fn(&ArrayData, &ArrayData), + { let file = tempfile::tempfile().unwrap(); let mut writer = ArrowWriter::try_new( @@ -1225,13 +1233,18 @@ mod tests { for i in 0..expected_batch.num_columns() { let expected_data = expected_batch.column(i).data(); let actual_data = actual_batch.column(i).data(); - - assert_eq!(expected_data, actual_data); + validate(expected_data, actual_data); } file } + fn roundtrip_opts(expected_batch: &RecordBatch, props: WriterProperties) -> File { + roundtrip_opts_with_array_validation(expected_batch, props, |a, b| { + assert_eq!(a, b) + }) + } + struct RoundTripOptions { values: ArrayRef, schema: SchemaRef, @@ -1838,6 +1851,47 @@ mod tests { one_column_roundtrip(values, false); } + #[test] + fn fallback_flush_data_page() { + //tests if the Fallback::flush_data_page clears all buffers correctly + let raw_values: Vec<_> = (0..MEDIUM_SIZE).map(|i| i.to_string()).collect(); + let values = Arc::new(StringArray::from(raw_values)); + let encodings = vec![ + Encoding::DELTA_BYTE_ARRAY, + Encoding::DELTA_LENGTH_BYTE_ARRAY, + ]; + let data_type = values.data_type().clone(); + let schema = Arc::new(Schema::new(vec![Field::new("col", data_type, false)])); + let expected_batch = RecordBatch::try_new(schema, vec![values]).unwrap(); + + let row_group_sizes = [1024, SMALL_SIZE, SMALL_SIZE / 2, SMALL_SIZE / 2 + 1, 10]; + let data_pagesize_limit: usize = 32; + let write_batch_size: usize = 16; + + for encoding in &encodings { + for row_group_size in row_group_sizes { + let props = WriterProperties::builder() + .set_writer_version(WriterVersion::PARQUET_2_0) + .set_max_row_group_size(row_group_size) + .set_dictionary_enabled(false) + .set_encoding(*encoding) + .set_data_pagesize_limit(data_pagesize_limit) + .set_write_batch_size(write_batch_size) + .build(); + + roundtrip_opts_with_array_validation(&expected_batch, props, |a, b| { + let string_array_a = StringArray::from(a.clone()); + let string_array_b = StringArray::from(b.clone()); + let vec_a: Vec<&str> = + string_array_a.iter().map(|v| v.unwrap()).collect(); + let vec_b: Vec<&str> = + string_array_b.iter().map(|v| v.unwrap()).collect(); + assert_eq!(vec_a, vec_b, "failed for encoder: {encoding:?} and row_group_size: {row_group_size:?}"); + }); + } + } + } + #[test] fn arrow_writer_string_dictionary() { // define schema From ec2fd47046d341b41b9d2416f0c23b8d2e25008d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies 
<1781103+tustvold@users.noreply.github.com> Date: Wed, 4 Jan 2023 15:48:45 +0000 Subject: [PATCH 0468/1411] Remove multiversion dependency (#3452) --- arrow-arith/Cargo.toml | 1 - arrow-arith/src/aggregate.rs | 3 --- 2 files changed, 4 deletions(-) diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index 32c2043cdaa9..0da34ea15bb0 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -44,7 +44,6 @@ arrow-data = { version = "30.0.0", path = "../arrow-data" } arrow-schema = { version = "30.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false } half = { version = "2.1", default-features = false } -multiversion = { version = "0.7.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } [dev-dependencies] diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index dc3d70bb2831..a1cf8d84954c 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -17,8 +17,6 @@ //! Defines aggregations over Arrow arrays. -use multiversion::multiversion; - use arrow_array::cast::*; use arrow_array::iterator::ArrayIter; use arrow_array::*; @@ -104,7 +102,6 @@ pub fn max_boolean(array: &BooleanArray) -> Option { } /// Helper to compute min/max of [`ArrayAccessor`]. -#[multiversion(targets("x86_64+avx"))] fn min_max_helper, F>(array: A, cmp: F) -> Option where F: Fn(&T, &T) -> bool, From 3213ef53109701b3e73be1a49fa980693103d8db Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 4 Jan 2023 18:21:04 +0000 Subject: [PATCH 0469/1411] Re-export nullif kernel (#3451) --- arrow/src/compute/kernels/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs index 19f3c27a04fa..d9c948c607bd 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels/mod.rs @@ -23,7 +23,7 @@ pub use arrow_arith::{aggregate, arithmetic, arity, bitwise, boolean, temporal}; pub use arrow_cast::cast; pub use arrow_cast::parse as cast_utils; pub use arrow_ord::{partition, sort}; -pub use arrow_select::{concat, filter, interleave, take, window, zip}; +pub use arrow_select::{concat, filter, interleave, nullif, take, window, zip}; pub use arrow_string::{concat_elements, length, regexp, substring}; /// Comparison kernels for `Array`s. 
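To make the behaviour of the `PrimitiveArray::is_compatible` check added in #3440 above concrete, a hedged sketch assuming the arrow-array and arrow-schema crates of this release: the timezone component of a timestamp is ignored, but the time unit and the overall data type family still have to match.

```rust
use arrow_array::types::TimestampMillisecondType;
use arrow_array::PrimitiveArray;
use arrow_schema::{DataType, TimeUnit};

fn main() {
    // Timezone is ignored: only the time unit has to line up
    assert!(PrimitiveArray::<TimestampMillisecondType>::is_compatible(
        &DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".to_string()))
    ));
    // A different unit is rejected, as is an unrelated type
    assert!(!PrimitiveArray::<TimestampMillisecondType>::is_compatible(
        &DataType::Timestamp(TimeUnit::Second, None)
    ));
    assert!(!PrimitiveArray::<TimestampMillisecondType>::is_compatible(
        &DataType::Int64
    ));
}
```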
From 28a04db03cea376991e7efb04c3cf4f71f6d05bf Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Wed, 4 Jan 2023 21:03:45 +0100 Subject: [PATCH 0470/1411] object_store: builder configuration api (#3436) * feat: draf configuration api for azure * feat: add configuration for aws and gcp * fix: clippy * feat: allow passing typed config keys * refactor: implement try_from for config keys * chore: PR feedback * refactor: make options api fallible * fix: docs errors * chore: remove helpers * test: test sas key splitting and un-nit nits --- object_store/src/aws/mod.rs | 348 +++++++++++++++++++++++++++++++--- object_store/src/azure/mod.rs | 339 ++++++++++++++++++++++++++++++--- object_store/src/gcp/mod.rs | 174 ++++++++++++++++- object_store/src/lib.rs | 7 + object_store/src/util.rs | 9 + 5 files changed, 822 insertions(+), 55 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 786ccd20f18a..4b633d9f5d24 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -37,9 +37,11 @@ use chrono::{DateTime, Utc}; use futures::stream::BoxStream; use futures::TryStreamExt; use itertools::Itertools; +use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; use std::ops::Range; +use std::str::FromStr; use std::sync::Arc; use tokio::io::AsyncWrite; use tracing::info; @@ -51,6 +53,7 @@ use crate::aws::credential::{ StaticCredentialProvider, WebIdentityProvider, }; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; +use crate::util::str_is_truthy; use crate::{ ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, RetryConfig, StreamExt, @@ -133,13 +136,21 @@ enum Error { #[snafu(display("URL did not match any known pattern for scheme: {}", url))] UrlNotRecognised { url: String }, + + #[snafu(display("Configuration key: '{}' is not known.", key))] + UnknownConfigurationKey { key: String }, } impl From for super::Error { - fn from(err: Error) -> Self { - Self::Generic { - store: "S3", - source: Box::new(err), + fn from(source: Error) -> Self { + match source { + Error::UnknownConfigurationKey { key } => { + Self::UnknownConfigurationKey { store: "S3", key } + } + _ => Self::Generic { + store: "S3", + source: Box::new(source), + }, } } } @@ -379,6 +390,184 @@ pub struct AmazonS3Builder { client_options: ClientOptions, } +/// Configuration keys for [`AmazonS3Builder`] +/// +/// Configuration via keys can be dome via the [`try_with_option`](AmazonS3Builder::try_with_option) +/// or [`with_options`](AmazonS3Builder::try_with_options) methods on the builder. +/// +/// # Example +/// ``` +/// use std::collections::HashMap; +/// use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; +/// +/// let options = HashMap::from([ +/// ("aws_access_key_id", "my-access-key-id"), +/// ("aws_secret_access_key", "my-secret-access-key"), +/// ]); +/// let typed_options = vec![ +/// (AmazonS3ConfigKey::DefaultRegion, "my-default-region"), +/// ]; +/// let azure = AmazonS3Builder::new() +/// .try_with_options(options) +/// .unwrap() +/// .try_with_options(typed_options) +/// .unwrap() +/// .try_with_option(AmazonS3ConfigKey::Region, "my-region") +/// .unwrap(); +/// ``` +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] +pub enum AmazonS3ConfigKey { + /// AWS Access Key + /// + /// See [`AmazonS3Builder::with_access_key_id`] for details. 
+ /// + /// Supported keys: + /// - `aws_access_key_id` + /// - `access_key_id` + AccessKeyId, + + /// Secret Access Key + /// + /// See [`AmazonS3Builder::with_secret_access_key`] for details. + /// + /// Supported keys: + /// - `aws_secret_access_key` + /// - `secret_access_key` + SecretAccessKey, + + /// Region + /// + /// See [`AmazonS3Builder::with_region`] for details. + /// + /// Supported keys: + /// - `aws_region` + /// - `region` + Region, + + /// Default region + /// + /// See [`AmazonS3Builder::with_region`] for details. + /// + /// Supported keys: + /// - `aws_default_region` + /// - `default_region` + DefaultRegion, + + /// Bucket name + /// + /// See [`AmazonS3Builder::with_bucket_name`] for details. + /// + /// Supported keys: + /// - `aws_bucket` + /// - `aws_bucket_name` + /// - `bucket` + /// - `bucket_name` + Bucket, + + /// Sets custom endpoint for communicating with AWS S3. + /// + /// See [`AmazonS3Builder::with_endpoint`] for details. + /// + /// Supported keys: + /// - `aws_endpoint` + /// - `aws_endpoint_url` + /// - `endpoint` + /// - `endpoint_url` + Endpoint, + + /// Token to use for requests (passed to underlying provider) + /// + /// See [`AmazonS3Builder::with_token`] for details. + /// + /// Supported keys: + /// - `aws_session_token` + /// - `aws_token` + /// - `session_token` + /// - `token` + Token, + + /// Fall back to ImdsV1 + /// + /// See [`AmazonS3Builder::with_imdsv1_fallback`] for details. + /// + /// Supported keys: + /// - `aws_imdsv1_fallback` + /// - `imdsv1_fallback` + ImdsV1Fallback, + + /// If virtual hosted style request has to be used + /// + /// See [`AmazonS3Builder::with_virtual_hosted_style_request`] for details. + /// + /// Supported keys: + /// - `aws_virtual_hosted_style_request` + /// - `virtual_hosted_style_request` + VirtualHostedStyleRequest, + + /// Set the instance metadata endpoint + /// + /// See [`AmazonS3Builder::with_metadata_endpoint`] for details. 
+ /// + /// Supported keys: + /// - `aws_metadata_endpoint` + /// - `metadata_endpoint` + MetadataEndpoint, + + /// AWS profile name + /// + /// Supported keys: + /// - `aws_profile` + /// - `profile` + Profile, +} + +impl AsRef for AmazonS3ConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::AccessKeyId => "aws_access_key_id", + Self::SecretAccessKey => "aws_secret_access_key", + Self::Region => "aws_region", + Self::Bucket => "aws_bucket", + Self::Endpoint => "aws_endpoint", + Self::Token => "aws_session_token", + Self::ImdsV1Fallback => "aws_imdsv1_fallback", + Self::VirtualHostedStyleRequest => "aws_virtual_hosted_style_request", + Self::DefaultRegion => "aws_default_region", + Self::MetadataEndpoint => "aws_metadata_endpoint", + Self::Profile => "aws_profile", + } + } +} + +impl FromStr for AmazonS3ConfigKey { + type Err = super::Error; + + fn from_str(s: &str) -> Result { + match s { + "aws_access_key_id" | "access_key_id" => Ok(Self::AccessKeyId), + "aws_secret_access_key" | "secret_access_key" => Ok(Self::SecretAccessKey), + "aws_default_region" | "default_region" => Ok(Self::DefaultRegion), + "aws_region" | "region" => Ok(Self::Region), + "aws_bucket" | "aws_bucket_name" | "bucket_name" | "bucket" => { + Ok(Self::Bucket) + } + "aws_endpoint_url" | "aws_endpoint" | "endpoint_url" | "endpoint" => { + Ok(Self::Endpoint) + } + "aws_session_token" | "aws_token" | "session_token" | "token" => { + Ok(Self::Token) + } + "aws_virtual_hosted_style_request" | "virtual_hosted_style_request" => { + Ok(Self::VirtualHostedStyleRequest) + } + "aws_profile" | "profile" => Ok(Self::Profile), + "aws_imdsv1_fallback" | "imdsv1_fallback" => Ok(Self::ImdsV1Fallback), + "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), + _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + } + } +} + impl AmazonS3Builder { /// Create a new [`AmazonS3Builder`] with default values. pub fn new() -> Self { @@ -407,28 +596,16 @@ impl AmazonS3Builder { pub fn from_env() -> Self { let mut builder: Self = Default::default(); - if let Ok(access_key_id) = std::env::var("AWS_ACCESS_KEY_ID") { - builder.access_key_id = Some(access_key_id); - } - - if let Ok(secret_access_key) = std::env::var("AWS_SECRET_ACCESS_KEY") { - builder.secret_access_key = Some(secret_access_key); - } - - if let Ok(secret) = std::env::var("AWS_DEFAULT_REGION") { - builder.region = Some(secret); - } - - if let Ok(endpoint) = std::env::var("AWS_ENDPOINT") { - builder.endpoint = Some(endpoint); - } - - if let Ok(token) = std::env::var("AWS_SESSION_TOKEN") { - builder.token = Some(token); - } - - if let Ok(profile) = std::env::var("AWS_PROFILE") { - builder.profile = Some(profile); + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if key.starts_with("AWS_") { + if let Ok(config_key) = + AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()) + { + builder = builder.try_with_option(config_key, value).unwrap(); + } + } + } } // This env var is set in ECS @@ -442,7 +619,7 @@ impl AmazonS3Builder { if let Ok(text) = std::env::var("AWS_ALLOW_HTTP") { builder.client_options = - builder.client_options.with_allow_http(text == "true"); + builder.client_options.with_allow_http(str_is_truthy(&text)); } builder @@ -472,6 +649,55 @@ impl AmazonS3Builder { self } + /// Set an option on the builder via a key - value pair. + /// + /// This method will return an `UnknownConfigKey` error if key cannot be parsed into [`AmazonS3ConfigKey`]. 
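A short sketch of how the string-keyed and typed-key paths introduced here fit together, exercising the `try_with_option` method defined next; the region and bucket values are illustrative placeholders, not part of the patch:

```rust
use std::str::FromStr;

use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Both the canonical name and the short alias parse to the same key
    assert_eq!(
        AmazonS3ConfigKey::from_str("aws_access_key_id")?,
        AmazonS3ConfigKey::from_str("access_key_id")?
    );
    // Unknown keys surface as an error instead of being silently dropped
    assert!(AmazonS3ConfigKey::from_str("not_a_real_key").is_err());

    // String keys and typed keys can be mixed on the same builder
    let _builder = AmazonS3Builder::new()
        .try_with_option("aws_region", "us-east-1")?
        .try_with_option(AmazonS3ConfigKey::Bucket, "my-bucket")?;
    Ok(())
}
```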
+ pub fn try_with_option( + mut self, + key: impl AsRef, + value: impl Into, + ) -> Result { + match AmazonS3ConfigKey::from_str(key.as_ref())? { + AmazonS3ConfigKey::AccessKeyId => self.access_key_id = Some(value.into()), + AmazonS3ConfigKey::SecretAccessKey => { + self.secret_access_key = Some(value.into()) + } + AmazonS3ConfigKey::Region => self.region = Some(value.into()), + AmazonS3ConfigKey::Bucket => self.bucket_name = Some(value.into()), + AmazonS3ConfigKey::Endpoint => self.endpoint = Some(value.into()), + AmazonS3ConfigKey::Token => self.token = Some(value.into()), + AmazonS3ConfigKey::ImdsV1Fallback => { + self.imdsv1_fallback = str_is_truthy(&value.into()) + } + AmazonS3ConfigKey::VirtualHostedStyleRequest => { + self.virtual_hosted_style_request = str_is_truthy(&value.into()) + } + AmazonS3ConfigKey::DefaultRegion => { + self.region = self.region.or_else(|| Some(value.into())) + } + AmazonS3ConfigKey::MetadataEndpoint => { + self.metadata_endpoint = Some(value.into()) + } + AmazonS3ConfigKey::Profile => self.profile = Some(value.into()), + }; + Ok(self) + } + + /// Hydrate builder from key value pairs + /// + /// This method will return an `UnknownConfigKey` error if any key cannot be parsed into [`AmazonS3ConfigKey`]. + pub fn try_with_options< + I: IntoIterator, impl Into)>, + >( + mut self, + options: I, + ) -> Result { + for (key, value) in options { + self = self.try_with_option(key, value)?; + } + Ok(self) + } + /// Sets properties on this builder based on a URL /// /// This is a separate member function to allow fallible computation to @@ -773,6 +999,7 @@ mod tests { put_get_delete_list_opts, rename_and_copy, stream_get, }; use bytes::Bytes; + use std::collections::HashMap; use std::env; const NON_EXISTENT_NAME: &str = "nonexistentname"; @@ -915,6 +1142,73 @@ mod tests { assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); } + #[test] + fn s3_test_config_from_map() { + let aws_access_key_id = "object_store:fake_access_key_id".to_string(); + let aws_secret_access_key = "object_store:fake_secret_key".to_string(); + let aws_default_region = "object_store:fake_default_region".to_string(); + let aws_endpoint = "object_store:fake_endpoint".to_string(); + let aws_session_token = "object_store:fake_session_token".to_string(); + let options = HashMap::from([ + ("aws_access_key_id", aws_access_key_id.clone()), + ("aws_secret_access_key", aws_secret_access_key), + ("aws_default_region", aws_default_region.clone()), + ("aws_endpoint", aws_endpoint.clone()), + ("aws_session_token", aws_session_token.clone()), + ]); + + let builder = AmazonS3Builder::new() + .try_with_options(&options) + .unwrap() + .try_with_option("aws_secret_access_key", "new-secret-key") + .unwrap(); + assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); + assert_eq!(builder.secret_access_key.unwrap(), "new-secret-key"); + assert_eq!(builder.region.unwrap(), aws_default_region); + assert_eq!(builder.endpoint.unwrap(), aws_endpoint); + assert_eq!(builder.token.unwrap(), aws_session_token); + } + + #[test] + fn s3_test_config_from_typed_map() { + let aws_access_key_id = "object_store:fake_access_key_id".to_string(); + let aws_secret_access_key = "object_store:fake_secret_key".to_string(); + let aws_default_region = "object_store:fake_default_region".to_string(); + let aws_endpoint = "object_store:fake_endpoint".to_string(); + let aws_session_token = "object_store:fake_session_token".to_string(); + let options = HashMap::from([ + (AmazonS3ConfigKey::AccessKeyId, 
aws_access_key_id.clone()), + (AmazonS3ConfigKey::SecretAccessKey, aws_secret_access_key), + (AmazonS3ConfigKey::DefaultRegion, aws_default_region.clone()), + (AmazonS3ConfigKey::Endpoint, aws_endpoint.clone()), + (AmazonS3ConfigKey::Token, aws_session_token.clone()), + ]); + + let builder = AmazonS3Builder::new() + .try_with_options(&options) + .unwrap() + .try_with_option(AmazonS3ConfigKey::SecretAccessKey, "new-secret-key") + .unwrap(); + assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); + assert_eq!(builder.secret_access_key.unwrap(), "new-secret-key"); + assert_eq!(builder.region.unwrap(), aws_default_region); + assert_eq!(builder.endpoint.unwrap(), aws_endpoint); + assert_eq!(builder.token.unwrap(), aws_session_token); + } + + #[test] + fn s3_test_config_fallible_options() { + let aws_access_key_id = "object_store:fake_access_key_id".to_string(); + let aws_secret_access_key = "object_store:fake_secret_key".to_string(); + let options = HashMap::from([ + ("aws_access_key_id", aws_access_key_id), + ("invalid-key", aws_secret_access_key), + ]); + + let builder = AmazonS3Builder::new().try_with_options(&options); + assert!(builder.is_err()); + } + #[tokio::test] async fn s3_test() { let config = maybe_skip_integration!(); diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 7cf369de3b3a..416883ac95a2 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -37,16 +37,18 @@ use async_trait::async_trait; use bytes::Bytes; use chrono::{TimeZone, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; +use percent_encoding::percent_decode_str; +use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; -use std::collections::BTreeSet; use std::fmt::{Debug, Formatter}; use std::io; use std::ops::Range; use std::sync::Arc; +use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; use url::Url; -use crate::util::RFC1123_FMT; +use crate::util::{str_is_truthy, RFC1123_FMT}; pub use credential::authority_hosts; mod client; @@ -124,13 +126,28 @@ enum Error { #[snafu(display("URL did not match any known pattern for scheme: {}", url))] UrlNotRecognised { url: String }, + + #[snafu(display("Failed parsing an SAS key"))] + DecodeSasKey { source: std::str::Utf8Error }, + + #[snafu(display("Missing component in SAS query pair"))] + MissingSasComponent {}, + + #[snafu(display("Configuration key: '{}' is not known.", key))] + UnknownConfigurationKey { key: String }, } impl From for super::Error { fn from(source: Error) -> Self { - Self::Generic { - store: "MicrosoftAzure", - source: Box::new(source), + match source { + Error::UnknownConfigurationKey { key } => Self::UnknownConfigurationKey { + store: "MicrosoftAzure", + key, + }, + _ => Self::Generic { + store: "MicrosoftAzure", + source: Box::new(source), + }, } } } @@ -367,6 +384,7 @@ pub struct MicrosoftAzureBuilder { client_secret: Option, tenant_id: Option, sas_query_pairs: Option>, + sas_key: Option, authority_host: Option, url: Option, use_emulator: bool, @@ -374,6 +392,157 @@ pub struct MicrosoftAzureBuilder { client_options: ClientOptions, } +/// Configuration keys for [`MicrosoftAzureBuilder`] +/// +/// Configuration via keys can be dome via the [`try_with_option`](MicrosoftAzureBuilder::try_with_option) +/// or [`with_options`](MicrosoftAzureBuilder::try_with_options) methods on the builder. 
+/// +/// # Example +/// ``` +/// use std::collections::HashMap; +/// use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; +/// +/// let options = HashMap::from([ +/// ("azure_client_id", "my-client-id"), +/// ("azure_client_secret", "my-account-name"), +/// ]); +/// let typed_options = vec![ +/// (AzureConfigKey::AccountName, "my-account-name"), +/// ]; +/// let azure = MicrosoftAzureBuilder::new() +/// .try_with_options(options) +/// .unwrap() +/// .try_with_options(typed_options) +/// .unwrap() +/// .try_with_option(AzureConfigKey::AuthorityId, "my-tenant-id") +/// .unwrap(); +/// ``` +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Deserialize, Serialize)] +pub enum AzureConfigKey { + /// The name of the azure storage account + /// + /// Supported keys: + /// - `azure_storage_account_name` + /// - `account_name` + AccountName, + + /// Master key for accessing storage account + /// + /// Supported keys: + /// - `azure_storage_account_key` + /// - `azure_storage_access_key` + /// - `azure_storage_master_key` + /// - `access_key` + /// - `account_key` + /// - `master_key` + AccessKey, + + /// Service principal client id for authorizing requests + /// + /// Supported keys: + /// - `azure_storage_client_id` + /// - `azure_client_id` + /// - `client_id` + ClientId, + + /// Service principal client secret for authorizing requests + /// + /// Supported keys: + /// - `azure_storage_client_secret` + /// - `azure_client_secret` + /// - `client_secret` + ClientSecret, + + /// Tenant id used in oauth flows + /// + /// Supported keys: + /// - `azure_storage_tenant_id` + /// - `azure_storage_authority_id` + /// - `azure_tenant_id` + /// - `azure_authority_id` + /// - `tenant_id` + /// - `authority_id` + AuthorityId, + + /// Shared access signature. + /// + /// The signature is expected to be percent-encoded, much like they are provided + /// in the azure storage explorer or azure portal. 
+ /// + /// Supported keys: + /// - `azure_storage_sas_key` + /// - `azure_storage_sas_token` + /// - `sas_key` + /// - `sas_token` + SasKey, + + /// Bearer token + /// + /// Supported keys: + /// - `azure_storage_token` + /// - `bearer_token` + /// - `token` + Token, + + /// Use object store with azurite storage emulator + /// + /// Supported keys: + /// - `azure_storage_use_emulator` + /// - `object_store_use_emulator` + /// - `use_emulator` + UseEmulator, +} + +impl AsRef for AzureConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::AccountName => "azure_storage_account_name", + Self::AccessKey => "azure_storage_account_key", + Self::ClientId => "azure_storage_client_id", + Self::ClientSecret => "azure_storage_client_secret", + Self::AuthorityId => "azure_storage_tenant_id", + Self::SasKey => "azure_storage_sas_key", + Self::Token => "azure_storage_token", + Self::UseEmulator => "azure_storage_use_emulator", + } + } +} + +impl FromStr for AzureConfigKey { + type Err = super::Error; + + fn from_str(s: &str) -> Result { + match s { + "azure_storage_account_key" + | "azure_storage_access_key" + | "azure_storage_master_key" + | "master_key" + | "account_key" + | "access_key" => Ok(Self::AccessKey), + "azure_storage_account_name" | "account_name" => Ok(Self::AccountName), + "azure_storage_client_id" | "azure_client_id" | "client_id" => { + Ok(Self::ClientId) + } + "azure_storage_client_secret" | "azure_client_secret" | "client_secret" => { + Ok(Self::ClientSecret) + } + "azure_storage_tenant_id" + | "azure_storage_authority_id" + | "azure_tenant_id" + | "azure_authority_id" + | "tenant_id" + | "authority_id" => Ok(Self::AuthorityId), + "azure_storage_sas_key" + | "azure_storage_sas_token" + | "sas_key" + | "sas_token" => Ok(Self::SasKey), + "azure_storage_token" | "bearer_token" | "token" => Ok(Self::Token), + "azure_storage_use_emulator" | "use_emulator" => Ok(Self::UseEmulator), + _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + } + } +} + impl Debug for MicrosoftAzureBuilder { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( @@ -409,27 +578,21 @@ impl MicrosoftAzureBuilder { /// ``` pub fn from_env() -> Self { let mut builder = Self::default(); - - if let Ok(account_name) = std::env::var("AZURE_STORAGE_ACCOUNT_NAME") { - builder.account_name = Some(account_name); - } - - if let Ok(access_key) = std::env::var("AZURE_STORAGE_ACCOUNT_KEY") { - builder.access_key = Some(access_key); - } else if let Ok(access_key) = std::env::var("AZURE_STORAGE_ACCESS_KEY") { - builder.access_key = Some(access_key); - } - - if let Ok(client_id) = std::env::var("AZURE_STORAGE_CLIENT_ID") { - builder.client_id = Some(client_id); - } - - if let Ok(client_secret) = std::env::var("AZURE_STORAGE_CLIENT_SECRET") { - builder.client_secret = Some(client_secret); + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if key.starts_with("AZURE_") { + if let Ok(config_key) = + AzureConfigKey::from_str(&key.to_ascii_lowercase()) + { + builder = builder.try_with_option(config_key, value).unwrap(); + } + } + } } - if let Ok(tenant_id) = std::env::var("AZURE_STORAGE_TENANT_ID") { - builder.tenant_id = Some(tenant_id); + if let Ok(text) = std::env::var("AZURE_ALLOW_HTTP") { + builder.client_options = + builder.client_options.with_allow_http(str_is_truthy(&text)); } builder @@ -462,6 +625,40 @@ impl MicrosoftAzureBuilder { self } + /// Set an option on the builder via a key - value pair. 
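+    ///
+    /// # Example
+    ///
+    /// A minimal sketch using a typed key; the account name used here is
+    /// illustrative only.
+    ///
+    /// ```
+    /// # use object_store::azure::{AzureConfigKey, MicrosoftAzureBuilder};
+    /// let builder = MicrosoftAzureBuilder::new()
+    ///     .try_with_option(AzureConfigKey::AccountName, "my-account")
+    ///     .unwrap();
+    /// ```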
+ pub fn try_with_option( + mut self, + key: impl AsRef, + value: impl Into, + ) -> Result { + match AzureConfigKey::from_str(key.as_ref())? { + AzureConfigKey::AccessKey => self.access_key = Some(value.into()), + AzureConfigKey::AccountName => self.account_name = Some(value.into()), + AzureConfigKey::ClientId => self.client_id = Some(value.into()), + AzureConfigKey::ClientSecret => self.client_secret = Some(value.into()), + AzureConfigKey::AuthorityId => self.tenant_id = Some(value.into()), + AzureConfigKey::SasKey => self.sas_key = Some(value.into()), + AzureConfigKey::Token => self.bearer_token = Some(value.into()), + AzureConfigKey::UseEmulator => { + self.use_emulator = str_is_truthy(&value.into()) + } + }; + Ok(self) + } + + /// Hydrate builder from key value pairs + pub fn try_with_options< + I: IntoIterator, impl Into)>, + >( + mut self, + options: I, + ) -> Result { + for (key, value) in options { + self = self.try_with_option(key, value)?; + } + Ok(self) + } + /// Sets properties on this builder based on a URL /// /// This is a separate member function to allow fallible computation to @@ -636,6 +833,8 @@ impl MicrosoftAzureBuilder { )) } else if let Some(query_pairs) = self.sas_query_pairs { Ok(credential::CredentialProvider::SASToken(query_pairs)) + } else if let Some(sas) = self.sas_key { + Ok(credential::CredentialProvider::SASToken(split_sas(&sas)?)) } else { Err(Error::MissingCredentials {}) }?; @@ -673,6 +872,25 @@ fn url_from_env(env_name: &str, default_url: &str) -> Result { Ok(url) } +fn split_sas(sas: &str) -> Result, Error> { + let sas = percent_decode_str(sas) + .decode_utf8() + .context(DecodeSasKeySnafu {})?; + let kv_str_pairs = sas + .trim_start_matches('?') + .split('&') + .filter(|s| !s.chars().all(char::is_whitespace)); + let mut pairs = Vec::new(); + for kv_pair_str in kv_str_pairs { + let (k, v) = kv_pair_str + .trim() + .split_once('=') + .ok_or(Error::MissingSasComponent {})?; + pairs.push((k.into(), v.into())) + } + Ok(pairs) +} + #[cfg(test)] mod tests { use super::*; @@ -680,6 +898,7 @@ mod tests { copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, put_get_delete_list, put_get_delete_list_opts, rename_and_copy, stream_get, }; + use std::collections::HashMap; use std::env; // Helper macro to skip tests if TEST_INTEGRATION and the Azure environment @@ -832,4 +1051,76 @@ mod tests { builder.parse_url(case).unwrap_err(); } } + + #[test] + fn azure_test_config_from_map() { + let azure_client_id = "object_store:fake_access_key_id"; + let azure_storage_account_name = "object_store:fake_secret_key"; + let azure_storage_token = "object_store:fake_default_region"; + let options = HashMap::from([ + ("azure_client_id", azure_client_id), + ("azure_storage_account_name", azure_storage_account_name), + ("azure_storage_token", azure_storage_token), + ]); + + let builder = MicrosoftAzureBuilder::new() + .try_with_options(options) + .unwrap(); + assert_eq!(builder.client_id.unwrap(), azure_client_id); + assert_eq!(builder.account_name.unwrap(), azure_storage_account_name); + assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); + } + + #[test] + fn azure_test_config_from_typed_map() { + let azure_client_id = "object_store:fake_access_key_id".to_string(); + let azure_storage_account_name = "object_store:fake_secret_key".to_string(); + let azure_storage_token = "object_store:fake_default_region".to_string(); + let options = HashMap::from([ + (AzureConfigKey::ClientId, azure_client_id.clone()), + ( + AzureConfigKey::AccountName, + 
azure_storage_account_name.clone(), + ), + (AzureConfigKey::Token, azure_storage_token.clone()), + ]); + + let builder = MicrosoftAzureBuilder::new() + .try_with_options(&options) + .unwrap(); + assert_eq!(builder.client_id.unwrap(), azure_client_id); + assert_eq!(builder.account_name.unwrap(), azure_storage_account_name); + assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); + } + + #[test] + fn azure_test_config_fallible_options() { + let azure_client_id = "object_store:fake_access_key_id".to_string(); + let azure_storage_token = "object_store:fake_default_region".to_string(); + let options = HashMap::from([ + ("azure_client_id", azure_client_id), + ("invalid-key", azure_storage_token), + ]); + + let builder = MicrosoftAzureBuilder::new().try_with_options(&options); + assert!(builder.is_err()); + } + + #[test] + fn azure_test_split_sas() { + let raw_sas = "?sv=2021-10-04&st=2023-01-04T17%3A48%3A57Z&se=2023-01-04T18%3A15%3A00Z&sr=c&sp=rcwl&sig=C7%2BZeEOWbrxPA3R0Cw%2Fw1EZz0%2B4KBvQexeKZKe%2BB6h0%3D"; + let expected = vec![ + ("sv".to_string(), "2021-10-04".to_string()), + ("st".to_string(), "2023-01-04T17:48:57Z".to_string()), + ("se".to_string(), "2023-01-04T18:15:00Z".to_string()), + ("sr".to_string(), "c".to_string()), + ("sp".to_string(), "rcwl".to_string()), + ( + "sig".to_string(), + "C7+ZeEOWbrxPA3R0Cw/w1EZz0+4KBvQexeKZKe+B6h0=".to_string(), + ), + ]; + let pairs = split_sas(raw_sas).unwrap(); + assert_eq!(expected, pairs); + } } diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index f2638748f6ca..177812fa8930 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -33,6 +33,7 @@ use std::collections::BTreeSet; use std::fs::File; use std::io::{self, BufReader}; use std::ops::Range; +use std::str::FromStr; use std::sync::Arc; use async_trait::async_trait; @@ -42,6 +43,7 @@ use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; use reqwest::header::RANGE; use reqwest::{header, Client, Method, Response, StatusCode}; +use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; @@ -145,6 +147,9 @@ enum Error { #[snafu(display("URL did not match any known pattern for scheme: {}", url))] UrlNotRecognised { url: String }, + + #[snafu(display("Configuration key: '{}' is not known.", key))] + UnknownConfigurationKey { key: String }, } impl From for super::Error { @@ -164,6 +169,9 @@ impl From for super::Error { source: Box::new(source), path, }, + Error::UnknownConfigurationKey { key } => { + Self::UnknownConfigurationKey { store: "GCS", key } + } _ => Self::Generic { store: "GCS", source: Box::new(err), @@ -796,6 +804,74 @@ pub struct GoogleCloudStorageBuilder { client_options: ClientOptions, } +/// Configuration keys for [`GoogleCloudStorageBuilder`] +/// +/// Configuration via keys can be dome via the [`try_with_option`](GoogleCloudStorageBuilder::try_with_option) +/// or [`with_options`](GoogleCloudStorageBuilder::try_with_options) methods on the builder. 
+/// +/// # Example +/// ``` +/// use std::collections::HashMap; +/// use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; +/// +/// let options = HashMap::from([ +/// ("google_service_account", "my-service-account"), +/// ]); +/// let typed_options = vec![ +/// (GoogleConfigKey::Bucket, "my-bucket"), +/// ]; +/// let azure = GoogleCloudStorageBuilder::new() +/// .try_with_options(options) +/// .unwrap() +/// .try_with_options(typed_options) +/// .unwrap() +/// .try_with_option(GoogleConfigKey::Bucket, "my-new-bucket") +/// .unwrap(); +/// ``` +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] +pub enum GoogleConfigKey { + /// Path to the service account file + /// + /// Supported keys: + /// - `google_service_account` + /// - `service_account` + ServiceAccount, + + /// Bucket name + /// + /// See [`GoogleCloudStorageBuilder::with_bucket_name`] for details. + /// + /// Supported keys: + /// - `google_bucket` + /// - `google_bucket_name` + /// - `bucket` + /// - `bucket_name` + Bucket, +} + +impl AsRef for GoogleConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::ServiceAccount => "google_service_account", + Self::Bucket => "google_bucket", + } + } +} + +impl FromStr for GoogleConfigKey { + type Err = super::Error; + + fn from_str(s: &str) -> Result { + match s { + "google_service_account" | "service_account" => Ok(Self::ServiceAccount), + "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => { + Ok(Self::Bucket) + } + _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + } + } +} + impl Default for GoogleCloudStorageBuilder { fn default() -> Self { Self { @@ -835,8 +911,16 @@ impl GoogleCloudStorageBuilder { builder.service_account_path = Some(service_account_path); } - if let Ok(service_account_path) = std::env::var("GOOGLE_SERVICE_ACCOUNT") { - builder.service_account_path = Some(service_account_path); + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if key.starts_with("GOOGLE_") { + if let Ok(config_key) = + GoogleConfigKey::from_str(&key.to_ascii_lowercase()) + { + builder = builder.try_with_option(config_key, value).unwrap(); + } + } + } } builder @@ -863,6 +947,34 @@ impl GoogleCloudStorageBuilder { self } + /// Set an option on the builder via a key - value pair. + pub fn try_with_option( + mut self, + key: impl AsRef, + value: impl Into, + ) -> Result { + match GoogleConfigKey::from_str(key.as_ref())? 
{ + GoogleConfigKey::ServiceAccount => { + self.service_account_path = Some(value.into()) + } + GoogleConfigKey::Bucket => self.bucket_name = Some(value.into()), + }; + Ok(self) + } + + /// Hydrate builder from key value pairs + pub fn try_with_options< + I: IntoIterator, impl Into)>, + >( + mut self, + options: I, + ) -> Result { + for (key, value) in options { + self = self.try_with_option(key, value)?; + } + Ok(self) + } + /// Sets properties on this builder based on a URL /// /// This is a separate member function to allow fallible computation to @@ -995,9 +1107,9 @@ fn convert_object_meta(object: &Object) -> Result { #[cfg(test)] mod test { - use std::env; - use bytes::Bytes; + use std::collections::HashMap; + use std::env; use crate::{ tests::{ @@ -1205,4 +1317,58 @@ mod test { builder.parse_url(case).unwrap_err(); } } + + #[test] + fn gcs_test_config_from_map() { + let google_service_account = "object_store:fake_service_account".to_string(); + let google_bucket_name = "object_store:fake_bucket".to_string(); + let options = HashMap::from([ + ("google_service_account", google_service_account.clone()), + ("google_bucket_name", google_bucket_name.clone()), + ]); + + let builder = GoogleCloudStorageBuilder::new() + .try_with_options(&options) + .unwrap(); + assert_eq!( + builder.service_account_path.unwrap(), + google_service_account.as_str() + ); + assert_eq!(builder.bucket_name.unwrap(), google_bucket_name.as_str()); + } + + #[test] + fn gcs_test_config_from_typed_map() { + let google_service_account = "object_store:fake_service_account".to_string(); + let google_bucket_name = "object_store:fake_bucket".to_string(); + let options = HashMap::from([ + ( + GoogleConfigKey::ServiceAccount, + google_service_account.clone(), + ), + (GoogleConfigKey::Bucket, google_bucket_name.clone()), + ]); + + let builder = GoogleCloudStorageBuilder::new() + .try_with_options(&options) + .unwrap(); + assert_eq!( + builder.service_account_path.unwrap(), + google_service_account.as_str() + ); + assert_eq!(builder.bucket_name.unwrap(), google_bucket_name.as_str()); + } + + #[test] + fn gcs_test_config_fallible_options() { + let google_service_account = "object_store:fake_service_account".to_string(); + let google_bucket_name = "object_store:fake_bucket".to_string(); + let options = HashMap::from([ + ("google_service_account", google_service_account), + ("invalid-key", google_bucket_name), + ]); + + let builder = GoogleCloudStorageBuilder::new().try_with_options(&options); + assert!(builder.is_err()); + } } diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 425c5cdba1d1..4ec58c387e49 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -555,6 +555,13 @@ pub enum Error { #[snafu(display("Operation not yet implemented."))] NotImplemented, + + #[snafu(display( + "Configuration key: '{}' is not valid for store '{}'.", + key, + store + ))] + UnknownConfigurationKey { store: &'static str, key: String }, } impl From for std::io::Error { diff --git a/object_store/src/util.rs b/object_store/src/util.rs index e592e7b64f2d..08bfd86d9f67 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -185,6 +185,15 @@ fn merge_ranges( ret } +#[allow(dead_code)] +pub(crate) fn str_is_truthy(val: &str) -> bool { + val.eq_ignore_ascii_case("1") + | val.eq_ignore_ascii_case("true") + | val.eq_ignore_ascii_case("on") + | val.eq_ignore_ascii_case("yes") + | val.eq_ignore_ascii_case("y") +} + #[cfg(test)] mod tests { use super::*; From 5dafa4d0267f52a0cfa7b9384a7f77e53cd2cd6e Mon Sep 17 
00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 5 Jan 2023 00:08:06 -0800 Subject: [PATCH 0471/1411] Support Decimal256 in ffi (#3453) --- arrow-pyarrow-integration-testing/tests/test_sql.py | 2 +- arrow/src/datatypes/ffi.rs | 13 ++++++++++--- arrow/src/ffi.rs | 3 +++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index c97dad77ea1d..98564408d937 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -63,6 +63,7 @@ def assert_pyarrow_leak(): pa.float32(), pa.float64(), pa.decimal128(19, 4), + pa.decimal256(76, 38), pa.string(), pa.binary(), pa.binary(10), @@ -110,7 +111,6 @@ def assert_pyarrow_leak(): ] _unsupported_pyarrow_types = [ - pa.decimal256(76, 38), ] diff --git a/arrow/src/datatypes/ffi.rs b/arrow/src/datatypes/ffi.rs index 58fc8858ad75..37fa85fcf5dd 100644 --- a/arrow/src/datatypes/ffi.rs +++ b/arrow/src/datatypes/ffi.rs @@ -112,8 +112,8 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { DataType::Decimal128(parsed_precision, parsed_scale) }, [precision, scale, bits] => { - if *bits != "128" { - return Err(ArrowError::CDataInterface("Only 128 bit wide decimal is supported in the Rust implementation".to_string())); + if *bits != "128" && *bits != "256" { + return Err(ArrowError::CDataInterface("Only 128/256 bit wide decimal is supported in the Rust implementation".to_string())); } let parsed_precision = precision.parse::().map_err(|_| { ArrowError::CDataInterface( @@ -125,7 +125,11 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { "The decimal type requires an integer scale".to_string(), ) })?; - DataType::Decimal128(parsed_precision, parsed_scale) + if *bits == "128" { + DataType::Decimal128(parsed_precision, parsed_scale) + } else { + DataType::Decimal256(parsed_precision, parsed_scale) + } } _ => { return Err(ArrowError::CDataInterface(format!( @@ -305,6 +309,9 @@ fn get_format_string(dtype: &DataType) -> Result { DataType::Decimal128(precision, scale) => { Ok(format!("d:{},{}", precision, scale)) } + DataType::Decimal256(precision, scale) => { + Ok(format!("d:{},{},256", precision, scale)) + } DataType::Date32 => Ok("tdD".to_string()), DataType::Date64 => Ok("tdm".to_string()), DataType::Time32(TimeUnit::Second) => Ok("tts".to_string()), diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 5e9b01b5c6b0..4111b858d050 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -120,6 +120,7 @@ use std::{ sync::Arc, }; +use arrow_buffer::i256; use arrow_schema::UnionMode; use bitflags::bitflags; @@ -324,6 +325,7 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { (DataType::Float32, 1) => size_of::() * 8, (DataType::Float64, 1) => size_of::() * 8, (DataType::Decimal128(..), 1) => size_of::() * 8, + (DataType::Decimal256(..), 1) => size_of::() * 8, (DataType::Timestamp(..), 1) => size_of::() * 8, (DataType::Duration(..), 1) => size_of::() * 8, // primitive types have a single buffer @@ -339,6 +341,7 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { (DataType::Float32, _) | (DataType::Float64, _) | (DataType::Decimal128(..), _) | + (DataType::Decimal256(..), _) | (DataType::Timestamp(..), _) | (DataType::Duration(..), _) => { return Err(ArrowError::CDataInterface(format!( From 4a3b7e993366b9934e704c910c3716ac9cbf8208 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 5 Jan 2023 09:08:21 +0000 Subject: [PATCH 0472/1411] 
Prepare object_store 0.5.3 (#3457) --- object_store/CHANGELOG-old.md | 27 ++++++++++++ object_store/CHANGELOG.md | 42 +++++++++++-------- object_store/Cargo.toml | 2 +- object_store/dev/release/update_change_log.sh | 4 +- 4 files changed, 55 insertions(+), 20 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index 1397d8a8e3d0..2813cfc9df1a 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -21,6 +21,33 @@ # Changelog +## [object_store_0.5.2](https://github.com/apache/arrow-rs/tree/object_store_0.5.2) (2022-12-02) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.1...object_store_0.5.2) + +**Implemented enhancements:** + +- Object Store: Allow custom reqwest client [\#3127](https://github.com/apache/arrow-rs/issues/3127) +- socks5 proxy support for the object\_store crate [\#2989](https://github.com/apache/arrow-rs/issues/2989) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Cannot query S3 paths containing whitespace [\#2799](https://github.com/apache/arrow-rs/issues/2799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- object\_store\(gcp\): GCP complains about content-length for copy [\#3235](https://github.com/apache/arrow-rs/issues/3235) +- object\_store\(aws\): EntityTooSmall error on multi-part upload [\#3233](https://github.com/apache/arrow-rs/issues/3233) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Add more ClientConfig Options for Object Store RequestBuilder \(\#3127\) [\#3256](https://github.com/apache/arrow-rs/pull/3256) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore ClientConfig [\#3252](https://github.com/apache/arrow-rs/pull/3252) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- fix\(object\_store,gcp\): test copy\_if\_not\_exist [\#3236](https://github.com/apache/arrow-rs/pull/3236) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- fix\(object\_store,aws,gcp\): multipart upload enforce size limit of 5 MiB not 5MB [\#3234](https://github.com/apache/arrow-rs/pull/3234) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- object\_store: add support for using proxy\_url for connection testing [\#3109](https://github.com/apache/arrow-rs/pull/3109) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sum12](https://github.com/sum12)) +- Update AWS SDK [\#2974](https://github.com/apache/arrow-rs/pull/2974) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update quick-xml requirement from 0.25.0 to 0.26.0 [\#2918](https://github.com/apache/arrow-rs/pull/2918) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support building object_store and parquet on wasm32-unknown-unknown target [\#2896](https://github.com/apache/arrow-rs/pull/2899) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jondo2010](https://github.com/jondo2010)) +- Add experimental AWS\_PROFILE support \(\#2178\) [\#2891](https://github.com/apache/arrow-rs/pull/2891) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + ## [object_store_0.5.1](https://github.com/apache/arrow-rs/tree/object_store_0.5.1) (2022-10-04) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.0...object_store_0.5.1) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index 528d649df5e7..41b029ccab78 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,32 +19,40 @@ # Changelog -## [object_store_0.5.2](https://github.com/apache/arrow-rs/tree/object_store_0.5.2) (2022-12-02) +## [object_store_0.5.3](https://github.com/apache/arrow-rs/tree/object_store_0.5.3) (2023-01-04) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.1...object_store_0.5.2) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.2...object_store_0.5.3) **Implemented enhancements:** -- Object Store: Allow custom reqwest client [\#3127](https://github.com/apache/arrow-rs/issues/3127) -- socks5 proxy support for the object\_store crate [\#2989](https://github.com/apache/arrow-rs/issues/2989) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Cannot query S3 paths containing whitespace [\#2799](https://github.com/apache/arrow-rs/issues/2799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Derive Clone for the builders in object-store. [\#3419](https://github.com/apache/arrow-rs/issues/3419) +- Add a constant prefix object store wrapper [\#3328](https://github.com/apache/arrow-rs/issues/3328) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add support for content-type while uploading files through ObjectStore API [\#3300](https://github.com/apache/arrow-rs/issues/3300) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add HttpStore [\#3294](https://github.com/apache/arrow-rs/issues/3294) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add support for Azure Data Lake Storage Gen2 \(aka: ADLS Gen2\) in Object Store library [\#3283](https://github.com/apache/arrow-rs/issues/3283) +- object\_store: Add Put and Multipart Upload Doc Examples [\#2863](https://github.com/apache/arrow-rs/issues/2863) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -**Fixed bugs:** +**Closed issues:** -- object\_store\(gcp\): GCP complains about content-length for copy [\#3235](https://github.com/apache/arrow-rs/issues/3235) -- object\_store\(aws\): EntityTooSmall error on multi-part upload [\#3233](https://github.com/apache/arrow-rs/issues/3233) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Only flush buffered multi-part data on poll\_shutdown not on poll\_flush [\#3390](https://github.com/apache/arrow-rs/issues/3390) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Add more ClientConfig Options for Object Store RequestBuilder \(\#3127\) [\#3256](https://github.com/apache/arrow-rs/pull/3256) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add ObjectStore ClientConfig [\#3252](https://github.com/apache/arrow-rs/pull/3252) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- fix\(object\_store,gcp\): test copy\_if\_not\_exist [\#3236](https://github.com/apache/arrow-rs/pull/3236) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) -- fix\(object\_store,aws,gcp\): multipart upload enforce size limit of 5 MiB not 5MB [\#3234](https://github.com/apache/arrow-rs/pull/3234) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) -- object\_store: add support for using proxy\_url for connection testing [\#3109](https://github.com/apache/arrow-rs/pull/3109) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sum12](https://github.com/sum12)) -- Update AWS SDK [\#2974](https://github.com/apache/arrow-rs/pull/2974) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Update quick-xml requirement from 0.25.0 to 0.26.0 [\#2918](https://github.com/apache/arrow-rs/pull/2918) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Support building object_store and parquet on wasm32-unknown-unknown target [\#2896](https://github.com/apache/arrow-rs/pull/2899) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jondo2010](https://github.com/jondo2010)) -- Add experimental AWS\_PROFILE support \(\#2178\) [\#2891](https://github.com/apache/arrow-rs/pull/2891) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: builder configuration api [\#3436](https://github.com/apache/arrow-rs/pull/3436) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Derive Clone for ObjectStore builders and Make URL Parsing Stricter \(\#3419\) [\#3424](https://github.com/apache/arrow-rs/pull/3424) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add Put and Multipart Put doc examples [\#3420](https://github.com/apache/arrow-rs/pull/3420) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([GeauxEric](https://github.com/GeauxEric)) +- object\_store: update localstack instructions [\#3403](https://github.com/apache/arrow-rs/pull/3403) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- object\_store: Flush buffered multipart only during poll\_shutdown [\#3397](https://github.com/apache/arrow-rs/pull/3397) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([askoa](https://github.com/askoa)) +- Update quick-xml to 0.27 [\#3395](https://github.com/apache/arrow-rs/pull/3395) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add HttpStore \(\#3294\) [\#3380](https://github.com/apache/arrow-rs/pull/3380) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- add support for content-type in `ClientOptions` [\#3358](https://github.com/apache/arrow-rs/pull/3358) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- Update AWS SDK [\#3349](https://github.com/apache/arrow-rs/pull/3349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Upstream newline\_delimited\_stream and ChunkedStore from DataFusion 
[\#3341](https://github.com/apache/arrow-rs/pull/3341) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat\(object\_store\): add PrefixObjectStore [\#3329](https://github.com/apache/arrow-rs/pull/3329) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- feat\(object\_store\): parse well-known storage urls [\#3327](https://github.com/apache/arrow-rs/pull/3327) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Disable getrandom object\_store [\#3278](https://github.com/apache/arrow-rs/pull/3278) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Reload token from AWS\_WEB\_IDENTITY\_TOKEN\_FILE [\#3274](https://github.com/apache/arrow-rs/pull/3274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Minor: skip aws integration test if TEST\_INTEGRATION is not set [\#3262](https://github.com/apache/arrow-rs/pull/3262) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index a9cc151b985a..e61a127c9c00 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.2" +version = "0.5.3" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index cf070d3c5dcd..2f6c809a79bf 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.1" -FUTURE_RELEASE="object_store_0.5.2" +SINCE_TAG="object_store_0.5.2" +FUTURE_RELEASE="object_store_0.5.3" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From e256e3de0c902a98dc15a13cfc86dfe4fb142dfa Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 5 Jan 2023 06:24:41 -0500 Subject: [PATCH 0473/1411] Improve arrow flight batch splitting and naming (#3444) * Improve arrow flight batch splitting and naming * Review feedback: rename to max_flight_data_size --- arrow-flight/src/encode.rs | 104 ++++++++++++++++++++-------- arrow-flight/tests/encode_decode.rs | 8 +-- 2 files changed, 80 insertions(+), 32 deletions(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 7c339b67d488..55000bba2fad 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -63,24 +63,25 @@ use futures::{ready, stream::BoxStream, Stream, StreamExt}; /// [`FlightError`]: crate::error::FlightError #[derive(Debug)] pub struct FlightDataEncoderBuilder { - /// The maximum message size (see details on [`Self::with_max_message_size`]). - max_batch_size: usize, + /// The maximum approximate target message size in bytes + /// (see details on [`Self::with_max_flight_data_size`]). + max_flight_data_size: usize, /// Ipc writer options options: IpcWriteOptions, /// Metadata to add to the schema message app_metadata: Bytes, } -/// Default target size for record batches to send. +/// Default target size for encoded [`FlightData`]. /// /// Note this value would normally be 4MB, but the size calculation is /// somewhat inexact, so we set it to 2MB. 
-pub const GRPC_TARGET_MAX_BATCH_SIZE: usize = 2097152; +pub const GRPC_TARGET_MAX_FLIGHT_SIZE_BYTES: usize = 2097152; impl Default for FlightDataEncoderBuilder { fn default() -> Self { Self { - max_batch_size: GRPC_TARGET_MAX_BATCH_SIZE, + max_flight_data_size: GRPC_TARGET_MAX_FLIGHT_SIZE_BYTES, options: IpcWriteOptions::default(), app_metadata: Bytes::new(), } @@ -92,16 +93,18 @@ impl FlightDataEncoderBuilder { Self::default() } - /// Set the (approximate) maximum encoded [`RecordBatch`] size to - /// limit the gRPC message size. Defaults to 2MB. + /// Set the (approximate) maximum size, in bytes, of the + /// [`FlightData`] produced by this encoder. Defaults to 2MB. /// - /// The encoder splits up [`RecordBatch`]s (preserving order) to - /// limit individual messages to approximately this size. The size - /// is approximate because there additional encoding overhead on - /// top of the underlying data itself. + /// Since there is often a maximum message size for gRPC messages + /// (typically around 4MB), this encoder splits up [`RecordBatch`]s + /// (preserving order) into multiple [`FlightData`] objects to + /// limit the size individual messages sent via gRPC. /// - pub fn with_max_message_size(mut self, max_batch_size: usize) -> Self { - self.max_batch_size = max_batch_size; + /// The size is approximate because of the additional encoding + /// overhead on top of the underlying data buffers themselves. + pub fn with_max_flight_data_size(mut self, max_flight_data_size: usize) -> Self { + self.max_flight_data_size = max_flight_data_size; self } @@ -126,12 +129,12 @@ impl FlightDataEncoderBuilder { S: Stream> + Send + 'static, { let Self { - max_batch_size, + max_flight_data_size, options, app_metadata, } = self; - FlightDataEncoder::new(input.boxed(), max_batch_size, options, app_metadata) + FlightDataEncoder::new(input.boxed(), max_flight_data_size, options, app_metadata) } } @@ -143,29 +146,30 @@ pub struct FlightDataEncoder { inner: BoxStream<'static, Result>, /// schema, set after the first batch schema: Option, - /// Max size of batches to encode - max_batch_size: usize, + /// Target maximum size of flight data + /// (see details on [`FlightDataEncoderBuilder::with_max_flight_data_size`]). 
+ max_flight_data_size: usize, /// do the encoding / tracking of dictionaries encoder: FlightIpcEncoder, /// optional metadata to add to schema FlightData app_metadata: Option, /// data queued up to send but not yet sent queue: VecDeque, - /// Is this strema done (inner is empty or errored) + /// Is this stream done (inner is empty or errored) done: bool, } impl FlightDataEncoder { fn new( inner: BoxStream<'static, Result>, - max_batch_size: usize, + max_flight_data_size: usize, options: IpcWriteOptions, app_metadata: Bytes, ) -> Self { Self { inner, schema: None, - max_batch_size, + max_flight_data_size, encoder: FlightIpcEncoder::new(options), app_metadata: Some(app_metadata), queue: VecDeque::new(), @@ -210,7 +214,7 @@ impl FlightDataEncoder { // encode the batch let batch = prepare_batch_for_flight(&batch, schema)?; - for batch in split_batch_for_grpc_response(batch, self.max_batch_size) { + for batch in split_batch_for_grpc_response(batch, self.max_flight_data_size) { let (flight_dictionaries, flight_batch) = self.encoder.encode_batch(&batch)?; @@ -300,7 +304,7 @@ fn prepare_schema_for_flight(schema: &Schema) -> Schema { /// arrays: fn split_batch_for_grpc_response( batch: RecordBatch, - max_batch_size: usize, + max_flight_data_size: usize, ) -> Vec { let size = batch .columns() @@ -308,8 +312,9 @@ fn split_batch_for_grpc_response( .map(|col| col.get_buffer_memory_size()) .sum::(); - let n_batches = - (size / max_batch_size + usize::from(size % max_batch_size != 0)).max(1); + let n_batches = (size / max_flight_data_size + + usize::from(size % max_flight_data_size != 0)) + .max(1); let rows_per_batch = (batch.num_rows() / n_batches).max(1); let mut out = Vec::with_capacity(n_batches + 1); @@ -419,6 +424,7 @@ mod tests { array::{UInt32Array, UInt8Array}, compute::concat_batches, }; + use arrow_array::UInt64Array; use super::*; @@ -480,24 +486,24 @@ mod tests { #[test] fn test_split_batch_for_grpc_response() { - let max_batch_size = 1024; + let max_flight_data_size = 1024; // no split let c = UInt32Array::from(vec![1, 2, 3, 4, 5, 6]); let batch = RecordBatch::try_from_iter(vec![("a", Arc::new(c) as ArrayRef)]) .expect("cannot create record batch"); - let split = split_batch_for_grpc_response(batch.clone(), max_batch_size); + let split = split_batch_for_grpc_response(batch.clone(), max_flight_data_size); assert_eq!(split.len(), 1); assert_eq!(batch, split[0]); // split once - let n_rows = max_batch_size + 1; + let n_rows = max_flight_data_size + 1; assert!(n_rows % 2 == 1, "should be an odd number"); let c = UInt8Array::from((0..n_rows).map(|i| (i % 256) as u8).collect::>()); let batch = RecordBatch::try_from_iter(vec![("a", Arc::new(c) as ArrayRef)]) .expect("cannot create record batch"); - let split = split_batch_for_grpc_response(batch.clone(), max_batch_size); + let split = split_batch_for_grpc_response(batch.clone(), max_flight_data_size); assert_eq!(split.len(), 3); assert_eq!( split.iter().map(|batch| batch.num_rows()).sum::(), @@ -506,6 +512,48 @@ mod tests { assert_eq!(concat_batches(&batch.schema(), &split).unwrap(), batch); } + #[test] + fn test_split_batch_for_grpc_response_sizes() { + // 2000 8 byte entries into 2k pieces: 8 chunks of 250 rows + verify_split(2000, 2 * 1024, vec![250, 250, 250, 250, 250, 250, 250, 250]); + + // 2000 8 byte entries into 4k pieces: 4 chunks of 500 rows + verify_split(2000, 4 * 1024, vec![500, 500, 500, 500]); + + // 2023 8 byte entries into 3k pieces does not divide evenly + verify_split(2023, 3 * 1024, vec![337, 337, 337, 337, 337, 337, 1]); + 
+ // 10 8 byte entries into 1 byte pieces means each rows gets its own + verify_split(10, 1, vec![1, 1, 1, 1, 1, 1, 1, 1, 1, 1]); + + // 10 8 byte entries into 1k byte pieces means one piece + verify_split(10, 1024, vec![10]); + } + + /// Creates a UInt64Array of 8 byte integers with input_rows rows + /// `max_flight_data_size_bytes` pieces and verifies the row counts in + /// those pieces + fn verify_split( + num_input_rows: u64, + max_flight_data_size_bytes: usize, + expected_sizes: Vec, + ) { + let array: UInt64Array = (0..num_input_rows).collect(); + + let batch = RecordBatch::try_from_iter(vec![("a", Arc::new(array) as ArrayRef)]) + .expect("cannot create record batch"); + + let input_rows = batch.num_rows(); + + let split = + split_batch_for_grpc_response(batch.clone(), max_flight_data_size_bytes); + let sizes: Vec<_> = split.iter().map(|batch| batch.num_rows()).collect(); + let output_rows: usize = sizes.iter().sum(); + + assert_eq!(sizes, expected_sizes, "mismatch for {batch:?}"); + assert_eq!(input_rows, output_rows, "mismatch for {batch:?}"); + } + // test sending record batches // test sending record batches with multiple different dictionaries } diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index 45b8c0bf5ac9..0aa98768774e 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -131,7 +131,7 @@ async fn test_max_message_size() { let input_batch_stream = futures::stream::iter(vec![Ok(make_primative_batch(5))]); // 5 input rows, with a very small limit should result in 5 batch messages - let encoder = FlightDataEncoderBuilder::default().with_max_message_size(1); + let encoder = FlightDataEncoderBuilder::default().with_max_flight_data_size(1); let encode_stream = encoder.build(input_batch_stream); @@ -164,9 +164,9 @@ async fn test_max_message_size_fuzz() { make_primative_batch(127), ]; - for max_message_size in [10, 1024, 2048, 6400, 3211212] { - let encoder = - FlightDataEncoderBuilder::default().with_max_message_size(max_message_size); + for max_message_size_bytes in [10, 1024, 2048, 6400, 3211212] { + let encoder = FlightDataEncoderBuilder::default() + .with_max_flight_data_size(max_message_size_bytes); let input_batch_stream = futures::stream::iter(input.clone()).map(Ok); From 81abc1a942cd13a92231f4a828077ad60fdabe36 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 5 Jan 2023 06:37:25 -0500 Subject: [PATCH 0474/1411] Consolidate arrow ipc tests and increase coverage (#3427) * Consolidate arrow ipc tests and increase coverage * fix fmt --- arrow-integration-testing/src/lib.rs | 7 + arrow-integration-testing/tests/ipc_reader.rs | 278 +++++-------- arrow-integration-testing/tests/ipc_writer.rs | 389 ++++++++---------- arrow/Cargo.toml | 4 - arrow/tests/ipc.rs | 61 --- 5 files changed, 286 insertions(+), 453 deletions(-) delete mode 100644 arrow/tests/ipc.rs diff --git a/arrow-integration-testing/src/lib.rs b/arrow-integration-testing/src/lib.rs index 2edd0ed28389..b0c8b85afe2e 100644 --- a/arrow-integration-testing/src/lib.rs +++ b/arrow-integration-testing/src/lib.rs @@ -77,6 +77,13 @@ pub fn read_json_file(json_name: &str) -> Result { } /// Read gzipped JSON test file +/// +/// For example given the input: +/// version = `0.17.1` +/// path = `generated_union` +/// +/// Returns the contents of +/// `arrow-ipc-stream/integration/0.17.1/generated_union.json.gz` pub fn read_gzip_json(version: &str, path: &str) -> ArrowJson { use flate2::read::GzDecoder; use std::io::Read; diff --git 
a/arrow-integration-testing/tests/ipc_reader.rs b/arrow-integration-testing/tests/ipc_reader.rs index 778d1ee77d3f..e185634f0dd4 100644 --- a/arrow-integration-testing/tests/ipc_reader.rs +++ b/arrow-integration-testing/tests/ipc_reader.rs @@ -15,16 +15,18 @@ // specific language governing permissions and limitations // under the License. +//! Tests for reading the content of [`FileReader`] and [`StreamReader`] +//! in `testing/arrow-ipc-stream/integration/...` + use arrow::ipc::reader::{FileReader, StreamReader}; use arrow::util::test_util::arrow_test_data; use arrow_integration_testing::read_gzip_json; use std::fs::File; #[test] -fn read_generated_files_014() { +fn read_0_1_4() { let testdata = arrow_test_data(); let version = "0.14.1"; - // the test is repetitive, thus we can read all supported files at once let paths = vec![ "generated_interval", "generated_datetime", @@ -37,51 +39,42 @@ fn read_generated_files_014() { "generated_decimal", ]; paths.iter().for_each(|path| { - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.arrow_file", - testdata, version, path - )) - .unwrap(); - - let mut reader = FileReader::try_new(file, None).unwrap(); + verify_arrow_file(&testdata, version, path); + verify_arrow_stream(&testdata, version, path); + }); +} - // read expected JSON output - let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader).unwrap()); +#[test] +fn read_0_1_7() { + let testdata = arrow_test_data(); + let version = "0.17.1"; + let paths = vec!["generated_union"]; + paths.iter().for_each(|path| { + verify_arrow_file(&testdata, version, path); + verify_arrow_stream(&testdata, version, path); }); } #[test] #[should_panic(expected = "Big Endian is not supported for Decimal!")] -fn read_decimal_be_file_should_panic() { +fn read_1_0_0_bigendian_decimal_should_panic() { let testdata = arrow_test_data(); - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/1.0.0-bigendian/generated_decimal.arrow_file", - testdata - )) - .unwrap(); - FileReader::try_new(file, None).unwrap(); + verify_arrow_file(&testdata, "1.0.0-bigendian", "generated_decimal"); } #[test] #[should_panic( expected = "Last offset 687865856 of Utf8 is larger than values length 41" )] -fn read_dictionary_be_not_implemented() { +fn read_1_0_0_bigendian_dictionary_should_panic() { // The offsets are not translated for big-endian files // https://github.com/apache/arrow-rs/issues/859 let testdata = arrow_test_data(); - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/1.0.0-bigendian/generated_dictionary.arrow_file", - testdata - )) - .unwrap(); - FileReader::try_new(file, None).unwrap(); + verify_arrow_file(&testdata, "1.0.0-bigendian", "generated_dictionary"); } #[test] -fn read_generated_be_files_should_work() { - // complementary to the previous test +fn read_1_0_0_bigendian() { let testdata = arrow_test_data(); let paths = vec![ "generated_interval", @@ -102,163 +95,119 @@ fn read_generated_be_files_should_work() { .unwrap(); FileReader::try_new(file, None).unwrap(); + + // While the the reader doesn't error but the values are not read correctly + // so verifing the contents fails + //verify_arrow_file(&testdata, "1.0.0-bigendian", path); }); } #[test] -fn projection_should_work() { - // complementary to the previous test +fn read_1_0_0_littleendian() { let testdata = arrow_test_data(); + let version = "1.0.0-littleendian"; let paths = vec![ - "generated_interval", "generated_datetime", + "generated_custom_metadata", + 
"generated_decimal", + "generated_decimal256", + "generated_dictionary", + "generated_dictionary_unsigned", + "generated_duplicate_fieldnames", + "generated_extension", + "generated_interval", "generated_map", + // fails with + // thread 'read_1_0_0_littleendian' panicked at 'assertion failed: `(left == right)` + //"generated_map_non_canonical", "generated_nested", - "generated_null_trivial", + "generated_nested_dictionary", + "generated_nested_large_offsets", "generated_null", + "generated_null_trivial", + "generated_primitive", + "generated_primitive_large_offsets", "generated_primitive_no_batches", "generated_primitive_zerolength", - "generated_primitive", + "generated_recursive_nested", + "generated_union", ]; paths.iter().for_each(|path| { - // We must use littleendian files here. - // The offsets are not translated for big-endian files - // https://github.com/apache/arrow-rs/issues/859 - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/1.0.0-littleendian/{}.arrow_file", - testdata, path - )) - .unwrap(); - - let reader = FileReader::try_new(file, Some(vec![0])).unwrap(); - let datatype_0 = reader.schema().fields()[0].data_type().clone(); - reader.for_each(|batch| { - let batch = batch.unwrap(); - assert_eq!(batch.columns().len(), 1); - assert_eq!(datatype_0, batch.schema().fields()[0].data_type().clone()); - }); + verify_arrow_file(&testdata, version, path); + verify_arrow_stream(&testdata, version, path); }); } #[test] -fn read_generated_streams_014() { +fn read_2_0_0_compression() { let testdata = arrow_test_data(); - let version = "0.14.1"; + let version = "2.0.0-compression"; + // the test is repetitive, thus we can read all supported files at once - let paths = vec![ - "generated_interval", - "generated_datetime", - "generated_dictionary", - "generated_map", - "generated_nested", - "generated_primitive_no_batches", - "generated_primitive_zerolength", - "generated_primitive", - "generated_decimal", - ]; + let paths = vec!["generated_lz4", "generated_zstd"]; paths.iter().for_each(|path| { - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.stream", - testdata, version, path - )) - .unwrap(); - - let mut reader = StreamReader::try_new(file, None).unwrap(); - - // read expected JSON output - let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader).unwrap()); - // the next batch must be empty - assert!(reader.next().is_none()); - // the stream must indicate that it's finished - assert!(reader.is_finished()); + verify_arrow_file(&testdata, version, path); + verify_arrow_stream(&testdata, version, path); }); } -#[test] -fn read_generated_files_100() { - let testdata = arrow_test_data(); - let version = "1.0.0-littleendian"; - // the test is repetitive, thus we can read all supported files at once - let paths = vec![ - "generated_interval", - "generated_datetime", - "generated_dictionary", - "generated_map", - // "generated_map_non_canonical", - "generated_nested", - "generated_null_trivial", - "generated_null", - "generated_primitive_no_batches", - "generated_primitive_zerolength", - "generated_primitive", - ]; - paths.iter().for_each(|path| { - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.arrow_file", - testdata, version, path - )) - .unwrap(); - +/// Verifies the arrow file format integration test +/// +/// Input file: +/// `arrow-ipc-stream/integration//.arrow_file +/// +/// Verification json file +/// `arrow-ipc-stream/integration//.json.gz +fn verify_arrow_file(testdata: &str, 
version: &str, path: &str) { + let filename = format!( + "{}/arrow-ipc-stream/integration/{}/{}.arrow_file", + testdata, version, path + ); + println!("Verifying {filename}"); + + // Compare contents to the expected output format in JSON + { + println!(" verifying content"); + let file = File::open(&filename).unwrap(); let mut reader = FileReader::try_new(file, None).unwrap(); // read expected JSON output let arrow_json = read_gzip_json(version, path); assert!(arrow_json.equals_reader(&mut reader).unwrap()); - }); -} + } -#[test] -fn read_generated_streams_100() { - let testdata = arrow_test_data(); - let version = "1.0.0-littleendian"; - // the test is repetitive, thus we can read all supported files at once - let paths = vec![ - "generated_interval", - "generated_datetime", - "generated_dictionary", - "generated_map", - // "generated_map_non_canonical", - "generated_nested", - "generated_null_trivial", - "generated_null", - "generated_primitive_no_batches", - "generated_primitive_zerolength", - "generated_primitive", - ]; - paths.iter().for_each(|path| { - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.stream", - testdata, version, path - )) - .unwrap(); - - let mut reader = StreamReader::try_new(file, None).unwrap(); - - // read expected JSON output - let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader).unwrap()); - // the next batch must be empty - assert!(reader.next().is_none()); - // the stream must indicate that it's finished - assert!(reader.is_finished()); - }); + // Verify that projection works by selecting the first column + { + println!(" verifying projection"); + let file = File::open(&filename).unwrap(); + let reader = FileReader::try_new(file, Some(vec![0])).unwrap(); + let datatype_0 = reader.schema().fields()[0].data_type().clone(); + reader.for_each(|batch| { + let batch = batch.unwrap(); + assert_eq!(batch.columns().len(), 1); + assert_eq!(datatype_0, batch.schema().fields()[0].data_type().clone()); + }); + } } -#[test] -fn read_generated_streams_200() { - let testdata = arrow_test_data(); - let version = "2.0.0-compression"; - - // the test is repetitive, thus we can read all supported files at once - let paths = vec!["generated_lz4", "generated_zstd"]; - paths.iter().for_each(|path| { - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.stream", - testdata, version, path - )) - .unwrap(); - +/// Verifies the arrow stream integration test +/// +/// Input file: +/// `arrow-ipc-stream/integration//.stream +/// +/// Verification json file +/// `arrow-ipc-stream/integration//.json.gz +fn verify_arrow_stream(testdata: &str, version: &str, path: &str) { + let filename = format!( + "{}/arrow-ipc-stream/integration/{}/{}.stream", + testdata, version, path + ); + println!("Verifying {filename}"); + + // Compare contents to the expected output format in JSON + { + println!(" verifying content"); + let file = File::open(&filename).unwrap(); let mut reader = StreamReader::try_new(file, None).unwrap(); // read expected JSON output @@ -268,26 +217,5 @@ fn read_generated_streams_200() { assert!(reader.next().is_none()); // the stream must indicate that it's finished assert!(reader.is_finished()); - }); -} - -#[test] -fn read_generated_files_200() { - let testdata = arrow_test_data(); - let version = "2.0.0-compression"; - // the test is repetitive, thus we can read all supported files at once - let paths = vec!["generated_lz4", "generated_zstd"]; - paths.iter().for_each(|path| { - let file = 
File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.arrow_file", - testdata, version, path - )) - .unwrap(); - - let mut reader = FileReader::try_new(file, None).unwrap(); - - // read expected JSON output - let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader).unwrap()); - }); + } } diff --git a/arrow-integration-testing/tests/ipc_writer.rs b/arrow-integration-testing/tests/ipc_writer.rs index 0aa17cd05c35..e429b5e5cb39 100644 --- a/arrow-integration-testing/tests/ipc_writer.rs +++ b/arrow-integration-testing/tests/ipc_writer.rs @@ -24,10 +24,9 @@ use std::fs::File; use std::io::Seek; #[test] -fn read_and_rewrite_generated_files_014() { +fn write_0_1_4() { let testdata = arrow_test_data(); let version = "0.14.1"; - // the test is repetitive, thus we can read all supported files at once let paths = vec![ "generated_interval", "generated_datetime", @@ -40,275 +39,239 @@ fn read_and_rewrite_generated_files_014() { "generated_decimal", ]; paths.iter().for_each(|path| { - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.arrow_file", - testdata, version, path - )) - .unwrap(); - - let mut reader = FileReader::try_new(file, None).unwrap(); - - let mut file = tempfile::tempfile().unwrap(); - - // read and rewrite the file to a temp location - { - let mut writer = FileWriter::try_new(&mut file, &reader.schema()).unwrap(); - while let Some(Ok(batch)) = reader.next() { - writer.write(&batch).unwrap(); - } - writer.finish().unwrap(); - } - file.rewind().unwrap(); - - let mut reader = FileReader::try_new(file, None).unwrap(); - - // read expected JSON output - let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader).unwrap()); + roundtrip_arrow_file(&testdata, version, path); + roundtrip_arrow_stream(&testdata, version, path); }); } #[test] -fn read_and_rewrite_generated_streams_014() { +fn write_0_1_7() { let testdata = arrow_test_data(); - let version = "0.14.1"; - // the test is repetitive, thus we can read all supported files at once - let paths = vec![ - "generated_interval", - "generated_datetime", - "generated_dictionary", - "generated_map", - "generated_nested", - "generated_primitive_no_batches", - "generated_primitive_zerolength", - "generated_primitive", - "generated_decimal", - ]; + let version = "0.17.1"; + let paths = vec!["generated_union"]; paths.iter().for_each(|path| { - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.stream", - testdata, version, path - )) - .unwrap(); - - let reader = StreamReader::try_new(file, None).unwrap(); - - let mut file = tempfile::tempfile().unwrap(); - - // read and rewrite the stream to a temp location - { - let mut writer = StreamWriter::try_new(&mut file, &reader.schema()).unwrap(); - reader.for_each(|batch| { - writer.write(&batch.unwrap()).unwrap(); - }); - writer.finish().unwrap(); - } - - file.rewind().unwrap(); - let mut reader = StreamReader::try_new(file, None).unwrap(); - - // read expected JSON output - let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader).unwrap()); + roundtrip_arrow_file(&testdata, version, path); + roundtrip_arrow_stream(&testdata, version, path); }); } #[test] -fn read_and_rewrite_generated_files_100() { +fn write_1_0_0_littleendian() { let testdata = arrow_test_data(); let version = "1.0.0-littleendian"; - // the test is repetitive, thus we can read all supported files at once let paths = vec![ - "generated_custom_metadata", "generated_datetime", 
- "generated_dictionary_unsigned", + "generated_custom_metadata", + "generated_decimal", + "generated_decimal256", "generated_dictionary", - // "generated_duplicate_fieldnames", + "generated_dictionary_unsigned", + "generated_duplicate_fieldnames", + "generated_extension", "generated_interval", "generated_map", + // thread 'write_1_0_0_littleendian' panicked at 'assertion failed: `(left == right)` + // "generated_map_non_canonical", "generated_nested", - // "generated_nested_large_offsets", - "generated_null_trivial", + "generated_nested_dictionary", + "generated_nested_large_offsets", "generated_null", + "generated_null_trivial", + "generated_primitive", "generated_primitive_large_offsets", "generated_primitive_no_batches", "generated_primitive_zerolength", - "generated_primitive", - // "generated_recursive_nested", + "generated_recursive_nested", + "generated_union", ]; paths.iter().for_each(|path| { - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.arrow_file", - testdata, version, path - )) - .unwrap(); - - let mut reader = FileReader::try_new(file, None).unwrap(); - - let mut file = tempfile::tempfile().unwrap(); - - // read and rewrite the file to a temp location - { - // write IPC version 5 - let options = - IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5).unwrap(); - let mut writer = - FileWriter::try_new_with_options(&mut file, &reader.schema(), options) - .unwrap(); - while let Some(Ok(batch)) = reader.next() { - writer.write(&batch).unwrap(); - } - writer.finish().unwrap(); - } - - file.rewind().unwrap(); - let mut reader = FileReader::try_new(file, None).unwrap(); - - // read expected JSON output - let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader).unwrap()); + roundtrip_arrow_file(&testdata, version, path); + roundtrip_arrow_stream(&testdata, version, path); }); } #[test] -fn read_and_rewrite_generated_streams_100() { +fn write_2_0_0_compression() { let testdata = arrow_test_data(); - let version = "1.0.0-littleendian"; - // the test is repetitive, thus we can read all supported files at once - let paths = vec![ - "generated_custom_metadata", - "generated_datetime", - "generated_dictionary_unsigned", - "generated_dictionary", - // "generated_duplicate_fieldnames", - "generated_interval", - "generated_map", - "generated_nested", - // "generated_nested_large_offsets", - "generated_null_trivial", - "generated_null", - "generated_primitive_large_offsets", - "generated_primitive_no_batches", - "generated_primitive_zerolength", - "generated_primitive", - // "generated_recursive_nested", - ]; - paths.iter().for_each(|path| { - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.stream", - testdata, version, path - )) - .unwrap(); - - let reader = StreamReader::try_new(file, None).unwrap(); + let version = "2.0.0-compression"; + let paths = vec!["generated_lz4", "generated_zstd"]; - let mut file = tempfile::tempfile().unwrap(); + // writer options for each compression type + let all_options = vec![ + IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5) + .unwrap() + .try_with_compression(Some(ipc::CompressionType::LZ4_FRAME)) + .unwrap(), + // write IPC version 5 with zstd + IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5) + .unwrap() + .try_with_compression(Some(ipc::CompressionType::ZSTD)) + .unwrap(), + ]; - // read and rewrite the stream to a temp location - { - let options = - IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5).unwrap(); - let mut 
writer = - StreamWriter::try_new_with_options(&mut file, &reader.schema(), options) - .unwrap(); - reader.for_each(|batch| { - writer.write(&batch.unwrap()).unwrap(); - }); - writer.finish().unwrap(); + paths.iter().for_each(|path| { + for options in &all_options { + println!("Using options {options:?}"); + roundtrip_arrow_file_with_options(&testdata, version, path, options.clone()); + roundtrip_arrow_stream_with_options( + &testdata, + version, + path, + options.clone(), + ); } - - file.rewind().unwrap(); - - let mut reader = StreamReader::try_new(file, None).unwrap(); - - // read expected JSON output - let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } -#[test] -fn read_and_rewrite_compression_files_200() { - let testdata = arrow_test_data(); - let version = "2.0.0-compression"; - // the test is repetitive, thus we can read all supported files at once - let paths = vec!["generated_lz4", "generated_zstd"]; - paths.iter().for_each(|path| { - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.arrow_file", - testdata, version, path - )) - .unwrap(); +/// Verifies the arrow file writer by reading the contents of an +/// arrow_file, writing it to a file, and then ensuring the contents +/// match the expected json contents. It also verifies that +/// RecordBatches read from the new file matches the original. +/// +/// Input file: +/// `arrow-ipc-stream/integration//.arrow_file +/// +/// Verification json file +/// `arrow-ipc-stream/integration//.json.gz +fn roundtrip_arrow_file(testdata: &str, version: &str, path: &str) { + roundtrip_arrow_file_with_options(testdata, version, path, IpcWriteOptions::default()) +} +fn roundtrip_arrow_file_with_options( + testdata: &str, + version: &str, + path: &str, + options: IpcWriteOptions, +) { + let filename = format!( + "{}/arrow-ipc-stream/integration/{}/{}.arrow_file", + testdata, version, path + ); + println!("Verifying {filename}"); + + let mut tempfile = tempfile::tempfile().unwrap(); + + { + println!(" writing to tempfile {tempfile:?}"); + let file = File::open(&filename).unwrap(); let mut reader = FileReader::try_new(file, None).unwrap(); - let mut file = tempfile::tempfile().unwrap(); - // read and rewrite the file to a temp location { - // write IPC version 5 - let options = IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5) - .unwrap() - .try_with_compression(Some(ipc::CompressionType::LZ4_FRAME)) - .unwrap(); - - let mut writer = - FileWriter::try_new_with_options(&mut file, &reader.schema(), options) - .unwrap(); + let mut writer = FileWriter::try_new_with_options( + &mut tempfile, + &reader.schema(), + options, + ) + .unwrap(); while let Some(Ok(batch)) = reader.next() { writer.write(&batch).unwrap(); } writer.finish().unwrap(); } + } - file.rewind().unwrap(); - let mut reader = FileReader::try_new(file, None).unwrap(); + { + println!(" checking rewrite to with json"); + tempfile.rewind().unwrap(); + let mut reader = FileReader::try_new(&tempfile, None).unwrap(); - // read expected JSON output let arrow_json = read_gzip_json(version, path); assert!(arrow_json.equals_reader(&mut reader).unwrap()); - }); + } + + { + println!(" checking rewrite with original"); + let file = File::open(&filename).unwrap(); + let reader = FileReader::try_new(file, None).unwrap(); + + tempfile.rewind().unwrap(); + let rewrite_reader = FileReader::try_new(&tempfile, None).unwrap(); + + // Compare to original reader + 
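        // (Descriptive note: `FileReader` implements `Iterator<Item = Result<RecordBatch, ArrowError>>`,
        // so zipping the original and rewritten readers compares the decoded batches pairwise;
        // `zip` stops at the shorter side, so equal batch counts are assumed here.)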
reader.into_iter().zip(rewrite_reader.into_iter()).for_each( + |(batch1, batch2)| { + assert_eq!(batch1.unwrap(), batch2.unwrap()); + }, + ); + } } -#[test] -fn read_and_rewrite_compression_stream_200() { - let testdata = arrow_test_data(); - let version = "2.0.0-compression"; - // the test is repetitive, thus we can read all supported files at once - let paths = vec!["generated_lz4", "generated_zstd"]; - paths.iter().for_each(|path| { - let file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.stream", - testdata, version, path - )) - .unwrap(); - - let reader = StreamReader::try_new(file, None).unwrap(); +/// Verifies the arrow file writer by reading the contents of an +/// arrow_file, writing it to a file, and then ensuring the contents +/// match the expected json contents. It also verifies that +/// RecordBatches read from the new file matches the original. +/// +/// Input file: +/// `arrow-ipc-stream/integration//.stream +/// +/// Verification json file +/// `arrow-ipc-stream/integration//.json.gz +fn roundtrip_arrow_stream(testdata: &str, version: &str, path: &str) { + roundtrip_arrow_stream_with_options( + testdata, + version, + path, + IpcWriteOptions::default(), + ) +} - let mut file = tempfile::tempfile().unwrap(); +fn roundtrip_arrow_stream_with_options( + testdata: &str, + version: &str, + path: &str, + options: IpcWriteOptions, +) { + let filename = format!( + "{}/arrow-ipc-stream/integration/{}/{}.stream", + testdata, version, path + ); + println!("Verifying {filename}"); + + let mut tempfile = tempfile::tempfile().unwrap(); + + { + println!(" writing to tempfile {tempfile:?}"); + let file = File::open(&filename).unwrap(); + let mut reader = StreamReader::try_new(file, None).unwrap(); - // read and rewrite the stream to a temp location + // read and rewrite the file to a temp location { - let options = IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5) - .unwrap() - .try_with_compression(Some(ipc::CompressionType::ZSTD)) - .unwrap(); - - let mut writer = - StreamWriter::try_new_with_options(&mut file, &reader.schema(), options) - .unwrap(); - reader.for_each(|batch| { - writer.write(&batch.unwrap()).unwrap(); - }); + let mut writer = StreamWriter::try_new_with_options( + &mut tempfile, + &reader.schema(), + options, + ) + .unwrap(); + while let Some(Ok(batch)) = reader.next() { + writer.write(&batch).unwrap(); + } writer.finish().unwrap(); } + } - file.rewind().unwrap(); + { + println!(" checking rewrite to with json"); + tempfile.rewind().unwrap(); + let mut reader = StreamReader::try_new(&tempfile, None).unwrap(); - let mut reader = StreamReader::try_new(file, None).unwrap(); - - // read expected JSON output let arrow_json = read_gzip_json(version, path); assert!(arrow_json.equals_reader(&mut reader).unwrap()); - }); + } + + { + println!(" checking rewrite with original"); + let file = File::open(&filename).unwrap(); + let reader = StreamReader::try_new(file, None).unwrap(); + + tempfile.rewind().unwrap(); + let rewrite_reader = StreamReader::try_new(&tempfile, None).unwrap(); + + // Compare to original reader + reader.into_iter().zip(rewrite_reader.into_iter()).for_each( + |(batch1, batch2)| { + assert_eq!(batch1.unwrap(), batch2.unwrap()); + }, + ); + } } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 202b4c4f40f6..d83637cbcea1 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -265,10 +265,6 @@ name = "lexsort" harness = false required-features = ["test_utils"] -[[test]] -name = "ipc" -required-features = ["test_utils", "ipc"] - 
[[test]] name = "csv" required-features = ["csv", "chrono-tz"] diff --git a/arrow/tests/ipc.rs b/arrow/tests/ipc.rs deleted file mode 100644 index abaa238ba5c6..000000000000 --- a/arrow/tests/ipc.rs +++ /dev/null @@ -1,61 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow_ipc::reader::StreamReader; -use arrow_ipc::writer::StreamWriter; -use std::fs::File; -use std::io::Seek; - -#[test] -fn read_union_017() { - let testdata = arrow::util::test_util::arrow_test_data(); - let data_file = File::open(format!( - "{}/arrow-ipc-stream/integration/0.17.1/generated_union.stream", - testdata, - )) - .unwrap(); - - let reader = StreamReader::try_new(data_file, None).unwrap(); - - let mut file = tempfile::tempfile().unwrap(); - // read and rewrite the stream to a temp location - { - let mut writer = StreamWriter::try_new(&mut file, &reader.schema()).unwrap(); - reader.for_each(|batch| { - writer.write(&batch.unwrap()).unwrap(); - }); - writer.finish().unwrap(); - } - file.rewind().unwrap(); - - // Compare original file and rewrote file - let rewrite_reader = StreamReader::try_new(file, None).unwrap(); - - let data_file = File::open(format!( - "{}/arrow-ipc-stream/integration/0.17.1/generated_union.stream", - testdata, - )) - .unwrap(); - let reader = StreamReader::try_new(data_file, None).unwrap(); - - reader - .into_iter() - .zip(rewrite_reader.into_iter()) - .for_each(|(batch1, batch2)| { - assert_eq!(batch1.unwrap(), batch2.unwrap()); - }); -} From 42ffc3f344b338289d5e8e6b12b247f791dd5d8f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 5 Jan 2023 09:39:28 -0500 Subject: [PATCH 0475/1411] Complete mid-level `FlightClient` (#3402) * Implement `FlightClient::do_put` and `FlightClient::do_exchange` * Implement ArrowClient::{list_flights, list_actions, do_action, get_schema} * Apply suggestions from code review Co-authored-by: Liang-Chi Hsieh Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * remove outdated comment * make foo/bar placeholders in test more specific * simplify tests Co-authored-by: Liang-Chi Hsieh Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-flight/src/client.rs | 303 ++++++++++++++++++++++++++-- arrow-flight/src/lib.rs | 26 ++- arrow-flight/tests/client.rs | 297 +++++++++++++++++++++++---- arrow-flight/tests/common/server.rs | 85 +++++++- 4 files changed, 638 insertions(+), 73 deletions(-) diff --git a/arrow-flight/src/client.rs b/arrow-flight/src/client.rs index 753c40f2a5c1..bdd51dda4f9f 100644 --- a/arrow-flight/src/client.rs +++ b/arrow-flight/src/client.rs @@ -16,11 +16,17 @@ // under the License. 
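// Illustrative usage sketch, not part of this diff: with the methods added below, the
// mid-level client can drive an end-to-end exchange. The endpoint address and command
// bytes are placeholders, and `futures::TryStreamExt` is assumed to be in scope.
//
//     let channel = tonic::transport::Channel::from_static("http://localhost:1234")
//         .connect()
//         .await?;
//     let mut client = FlightClient::new(channel);
//     let descriptor = FlightDescriptor::new_cmd(b"SELECT 1".to_vec());
//     let schema = client.get_schema(descriptor.clone()).await?;
//     let info = client.get_flight_info(descriptor).await?;
//     for endpoint in info.endpoint {
//         if let Some(ticket) = endpoint.ticket {
//             let batches: Vec<RecordBatch> = client.do_get(ticket).await?.try_collect().await?;
//         }
//     }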
use crate::{ - decode::FlightRecordBatchStream, flight_service_client::FlightServiceClient, - FlightDescriptor, FlightInfo, HandshakeRequest, Ticket, + decode::FlightRecordBatchStream, flight_service_client::FlightServiceClient, Action, + ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, + HandshakeRequest, PutResult, Ticket, }; +use arrow_schema::Schema; use bytes::Bytes; -use futures::{future::ready, stream, StreamExt, TryStreamExt}; +use futures::{ + future::ready, + stream::{self, BoxStream}, + Stream, StreamExt, TryStreamExt, +}; use tonic::{metadata::MetadataMap, transport::Channel}; use crate::error::{FlightError, Result}; @@ -160,6 +166,11 @@ impl FlightClient { /// returning a [`FlightRecordBatchStream`] for reading /// [`RecordBatch`](arrow_array::RecordBatch)es. /// + /// # Note + /// + /// To access the returned [`FlightData`] use + /// [`FlightRecordBatchStream::into_inner()`] + /// /// # Example: /// ```no_run /// # async fn run() { @@ -167,12 +178,8 @@ impl FlightClient { /// # use arrow_flight::FlightClient; /// # use arrow_flight::Ticket; /// # use arrow_array::RecordBatch; - /// # use tonic::transport::Channel; /// # use futures::stream::TryStreamExt; - /// # let channel = Channel::from_static("http://localhost:1234") - /// # .connect() - /// # .await - /// # .expect("error connecting"); + /// # let channel: tonic::transport::Channel = unimplemented!(); /// # let ticket = Ticket { ticket: Bytes::from("foo") }; /// let mut client = FlightClient::new(channel); /// @@ -199,8 +206,7 @@ impl FlightClient { .do_get(request) .await? .into_inner() - // convert to FlightError - .map_err(|e| e.into()); + .map_err(FlightError::Tonic); Ok(FlightRecordBatchStream::new_from_flight_data( response_stream, @@ -217,11 +223,7 @@ impl FlightClient { /// # async fn run() { /// # use arrow_flight::FlightClient; /// # use arrow_flight::FlightDescriptor; - /// # use tonic::transport::Channel; - /// # let channel = Channel::from_static("http://localhost:1234") - /// # .connect() - /// # .await - /// # .expect("error connecting"); + /// # let channel: tonic::transport::Channel = unimplemented!(); /// let mut client = FlightClient::new(channel); /// /// // Send a 'CMD' request to the server @@ -256,13 +258,270 @@ impl FlightClient { Ok(response) } - // TODO other methods - // list_flights - // get_schema - // do_put - // do_action - // list_actions - // do_exchange + /// Make a `DoPut` call to the server with the provided + /// [`Stream`](futures::Stream) of [`FlightData`] and returning a + /// stream of [`PutResult`]. 
+ /// + /// # Example: + /// ```no_run + /// # async fn run() { + /// # use futures::{TryStreamExt, StreamExt}; + /// # use std::sync::Arc; + /// # use arrow_array::UInt64Array; + /// # use arrow_array::RecordBatch; + /// # use arrow_flight::{FlightClient, FlightDescriptor, PutResult}; + /// # use arrow_flight::encode::FlightDataEncoderBuilder; + /// # let batch = RecordBatch::try_from_iter(vec![ + /// # ("col2", Arc::new(UInt64Array::from_iter([10, 23, 33])) as _) + /// # ]).unwrap(); + /// # let channel: tonic::transport::Channel = unimplemented!(); + /// let mut client = FlightClient::new(channel); + /// + /// // encode the batch as a stream of `FlightData` + /// let flight_data_stream = FlightDataEncoderBuilder::new() + /// .build(futures::stream::iter(vec![Ok(batch)])) + /// // data encoder return Results, but do_put requires FlightData + /// .map(|batch|batch.unwrap()); + /// + /// // send the stream and get the results as `PutResult` + /// let response: Vec= client + /// .do_put(flight_data_stream) + /// .await + /// .unwrap() + /// .try_collect() // use TryStreamExt to collect stream + /// .await + /// .expect("error calling do_put"); + /// # } + /// ``` + pub async fn do_put + Send + 'static>( + &mut self, + request: S, + ) -> Result>> { + let request = self.make_request(request); + + let response = self + .inner + .do_put(request) + .await? + .into_inner() + .map_err(FlightError::Tonic); + + Ok(response.boxed()) + } + + /// Make a `DoExchange` call to the server with the provided + /// [`Stream`](futures::Stream) of [`FlightData`] and returning a + /// stream of [`FlightData`]. + /// + /// # Example: + /// ```no_run + /// # async fn run() { + /// # use futures::{TryStreamExt, StreamExt}; + /// # use std::sync::Arc; + /// # use arrow_array::UInt64Array; + /// # use arrow_array::RecordBatch; + /// # use arrow_flight::{FlightClient, FlightDescriptor, PutResult}; + /// # use arrow_flight::encode::FlightDataEncoderBuilder; + /// # let batch = RecordBatch::try_from_iter(vec![ + /// # ("col2", Arc::new(UInt64Array::from_iter([10, 23, 33])) as _) + /// # ]).unwrap(); + /// # let channel: tonic::transport::Channel = unimplemented!(); + /// let mut client = FlightClient::new(channel); + /// + /// // encode the batch as a stream of `FlightData` + /// let flight_data_stream = FlightDataEncoderBuilder::new() + /// .build(futures::stream::iter(vec![Ok(batch)])) + /// // data encoder return Results, but do_exchange requires FlightData + /// .map(|batch|batch.unwrap()); + /// + /// // send the stream and get the results as `RecordBatches` + /// let response: Vec = client + /// .do_exchange(flight_data_stream) + /// .await + /// .unwrap() + /// .try_collect() // use TryStreamExt to collect stream + /// .await + /// .expect("error calling do_exchange"); + /// # } + /// ``` + pub async fn do_exchange + Send + 'static>( + &mut self, + request: S, + ) -> Result { + let request = self.make_request(request); + + let response = self + .inner + .do_exchange(request) + .await? + .into_inner() + .map_err(FlightError::Tonic); + + Ok(FlightRecordBatchStream::new_from_flight_data(response)) + } + + /// Make a `ListFlights` call to the server with the provided + /// critera and returning a [`Stream`](futures::Stream) of [`FlightInfo`]. 
+ /// + /// # Example: + /// ```no_run + /// # async fn run() { + /// # use futures::TryStreamExt; + /// # use bytes::Bytes; + /// # use arrow_flight::{FlightInfo, FlightClient}; + /// # let channel: tonic::transport::Channel = unimplemented!(); + /// let mut client = FlightClient::new(channel); + /// + /// // Send 'Name=Foo' bytes as the "expression" to the server + /// // and gather the returned FlightInfo + /// let responses: Vec = client + /// .list_flights(Bytes::from("Name=Foo")) + /// .await + /// .expect("error listing flights") + /// .try_collect() // use TryStreamExt to collect stream + /// .await + /// .expect("error gathering flights"); + /// # } + /// ``` + pub async fn list_flights( + &mut self, + expression: impl Into, + ) -> Result>> { + let request = Criteria { + expression: expression.into(), + }; + + let request = self.make_request(request); + + let response = self + .inner + .list_flights(request) + .await? + .into_inner() + .map_err(FlightError::Tonic); + + Ok(response.boxed()) + } + + /// Make a `GetSchema` call to the server with the provided + /// [`FlightDescriptor`] and returning the associated [`Schema`]. + /// + /// # Example: + /// ```no_run + /// # async fn run() { + /// # use bytes::Bytes; + /// # use arrow_flight::{FlightDescriptor, FlightClient}; + /// # use arrow_schema::Schema; + /// # let channel: tonic::transport::Channel = unimplemented!(); + /// let mut client = FlightClient::new(channel); + /// + /// // Request the schema result of a 'CMD' request to the server + /// let request = FlightDescriptor::new_cmd(b"MOAR DATA".to_vec()); + /// + /// let schema: Schema = client + /// .get_schema(request) + /// .await + /// .expect("error making request"); + /// # } + /// ``` + pub async fn get_schema( + &mut self, + flight_descriptor: FlightDescriptor, + ) -> Result { + let request = self.make_request(flight_descriptor); + + let schema_result = self.inner.get_schema(request).await?.into_inner(); + + // attempt decode from IPC + let schema: Schema = schema_result.try_into()?; + + Ok(schema) + } + + /// Make a `ListActions` call to the server and returning a + /// [`Stream`](futures::Stream) of [`ActionType`]. + /// + /// # Example: + /// ```no_run + /// # async fn run() { + /// # use futures::TryStreamExt; + /// # use arrow_flight::{ActionType, FlightClient}; + /// # use arrow_schema::Schema; + /// # let channel: tonic::transport::Channel = unimplemented!(); + /// let mut client = FlightClient::new(channel); + /// + /// // List available actions on the server: + /// let actions: Vec = client + /// .list_actions() + /// .await + /// .expect("error listing actions") + /// .try_collect() // use TryStreamExt to collect stream + /// .await + /// .expect("error gathering actions"); + /// # } + /// ``` + pub async fn list_actions( + &mut self, + ) -> Result>> { + let request = self.make_request(Empty {}); + + let action_stream = self + .inner + .list_actions(request) + .await? + .into_inner() + .map_err(FlightError::Tonic); + + Ok(action_stream.boxed()) + } + + /// Make a `DoAction` call to the server and returning a + /// [`Stream`](futures::Stream) of opaque [`Bytes`]. 
+ /// + /// # Example: + /// ```no_run + /// # async fn run() { + /// # use bytes::Bytes; + /// # use futures::TryStreamExt; + /// # use arrow_flight::{Action, FlightClient}; + /// # use arrow_schema::Schema; + /// # let channel: tonic::transport::Channel = unimplemented!(); + /// let mut client = FlightClient::new(channel); + /// + /// let request = Action::new("my_action", "the body"); + /// + /// // Make a request to run the action on the server + /// let results: Vec = client + /// .do_action(request) + /// .await + /// .expect("error executing acton") + /// .try_collect() // use TryStreamExt to collect stream + /// .await + /// .expect("error gathering action results"); + /// # } + /// ``` + pub async fn do_action( + &mut self, + action: Action, + ) -> Result>> { + let request = self.make_request(action); + + let result_stream = self + .inner + .do_action(request) + .await? + .into_inner() + .map_err(FlightError::Tonic) + .map(|r| { + r.map(|r| { + // unwrap inner bytes + let crate::Result { body } = r; + body + }) + }); + + Ok(result_stream.boxed()) + } /// return a Request, adding any configured metadata fn make_request(&self, t: T) -> tonic::Request { diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index c2da58eb5bb7..87aeba1c1194 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -348,8 +348,7 @@ impl TryFrom for Schema { type Error = ArrowError; fn try_from(value: FlightInfo) -> ArrowResult { - let msg = IpcMessage(value.schema); - msg.try_into() + value.try_decode_schema() } } @@ -368,6 +367,13 @@ impl TryFrom<&SchemaResult> for Schema { } } +impl TryFrom for Schema { + type Error = ArrowError; + fn try_from(data: SchemaResult) -> ArrowResult { + (&data).try_into() + } +} + // FlightData, FlightDescriptor, etc.. 
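// A brief sketch of how the schema conversions added in this change compose (illustrative
// only; `schema_result: SchemaResult` and `flight_info: FlightInfo` are assumed values):
//
//     let schema: Schema = schema_result.try_into()?;         // via the new TryFrom<SchemaResult>
//     let schema: Schema = flight_info.try_decode_schema()?;  // decodes the IPC-encoded schema bytes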
impl FlightData { @@ -422,6 +428,12 @@ impl FlightInfo { total_bytes, } } + + /// Try and convert the data in this `FlightInfo` into a [`Schema`] + pub fn try_decode_schema(self) -> ArrowResult { + let msg = IpcMessage(self.schema); + msg.try_into() + } } impl<'a> SchemaAsIpc<'a> { @@ -432,6 +444,16 @@ impl<'a> SchemaAsIpc<'a> { } } +impl Action { + /// Create a new Action with type and body + pub fn new(action_type: impl Into, body: impl Into) -> Self { + Self { + r#type: action_type.into(), + body: body.into(), + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-flight/tests/client.rs b/arrow-flight/tests/client.rs index c471294d7dc4..7537e46db403 100644 --- a/arrow-flight/tests/client.rs +++ b/arrow-flight/tests/client.rs @@ -22,12 +22,13 @@ mod common { } use arrow_array::{RecordBatch, UInt64Array}; use arrow_flight::{ - error::FlightError, FlightClient, FlightDescriptor, FlightInfo, HandshakeRequest, - HandshakeResponse, Ticket, + decode::FlightRecordBatchStream, encode::FlightDataEncoderBuilder, + error::FlightError, FlightClient, FlightData, FlightDescriptor, FlightInfo, + HandshakeRequest, HandshakeResponse, PutResult, Ticket, }; use bytes::Bytes; use common::server::TestFlightServer; -use futures::{Future, TryStreamExt}; +use futures::{Future, StreamExt, TryStreamExt}; use tokio::{net::TcpListener, task::JoinHandle}; use tonic::{ transport::{Channel, Uri}, @@ -41,8 +42,9 @@ const DEFAULT_TIMEOUT_SECONDS: u64 = 30; #[tokio::test] async fn test_handshake() { do_test(|test_server, mut client| async move { - let request_payload = Bytes::from("foo"); - let response_payload = Bytes::from("Bar"); + client.add_header("foo-header", "bar-header-value").unwrap(); + let request_payload = Bytes::from("foo-request-payload"); + let response_payload = Bytes::from("bar-response-payload"); let request = HandshakeRequest { payload: request_payload.clone(), @@ -58,6 +60,7 @@ async fn test_handshake() { let response = client.handshake(request_payload).await.unwrap(); assert_eq!(response, response_payload); assert_eq!(test_server.take_handshake_request(), Some(request)); + ensure_metadata(&client, &test_server); }) .await; } @@ -65,7 +68,7 @@ async fn test_handshake() { #[tokio::test] async fn test_handshake_error() { do_test(|test_server, mut client| async move { - let request_payload = "foo".to_string().into_bytes(); + let request_payload = "foo-request-payload".to_string().into_bytes(); let e = Status::unauthenticated("DENIED"); test_server.set_handshake_response(Err(e)); @@ -76,26 +79,6 @@ async fn test_handshake_error() { .await; } -#[tokio::test] -async fn test_handshake_metadata() { - do_test(|test_server, mut client| async move { - client.add_header("foo", "bar").unwrap(); - - let request_payload = Bytes::from("Blarg"); - let response_payload = Bytes::from("Bazz"); - - let response = HandshakeResponse { - payload: response_payload.clone(), - protocol_version: 0, - }; - - test_server.set_handshake_response(Ok(response)); - client.handshake(request_payload).await.unwrap(); - ensure_metadata(&client, &test_server); - }) - .await; -} - /// Verifies that all headers sent from the the client are in the request_metadata fn ensure_metadata(client: &FlightClient, test_server: &TestFlightServer) { let client_metadata = client.metadata().clone().into_headers(); @@ -130,6 +113,7 @@ fn test_flight_info(request: &FlightDescriptor) -> FlightInfo { #[tokio::test] async fn test_get_flight_info() { do_test(|test_server, mut client| async move { + client.add_header("foo-header", 
"bar-header-value").unwrap(); let request = FlightDescriptor::new_cmd(b"My Command".to_vec()); let expected_response = test_flight_info(&request); @@ -139,6 +123,7 @@ async fn test_get_flight_info() { assert_eq!(response, expected_response); assert_eq!(test_server.take_get_flight_info_request(), Some(request)); + ensure_metadata(&client, &test_server); }) .await; } @@ -158,25 +143,12 @@ async fn test_get_flight_info_error() { .await; } -#[tokio::test] -async fn test_get_flight_info_metadata() { - do_test(|test_server, mut client| async move { - client.add_header("foo", "bar").unwrap(); - let request = FlightDescriptor::new_cmd(b"My Command".to_vec()); - - let expected_response = test_flight_info(&request); - test_server.set_get_flight_info_response(Ok(expected_response)); - client.get_flight_info(request.clone()).await.unwrap(); - ensure_metadata(&client, &test_server); - }) - .await; -} - // TODO more negative tests (like if there are endpoints defined, etc) #[tokio::test] async fn test_do_get() { do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); let ticket = Ticket { ticket: Bytes::from("my awesome flight ticket"), }; @@ -202,6 +174,7 @@ async fn test_do_get() { assert_eq!(response, expected_response); assert_eq!(test_server.take_do_get_request(), Some(ticket)); + ensure_metadata(&client, &test_server); }) .await; } @@ -209,7 +182,7 @@ async fn test_do_get() { #[tokio::test] async fn test_do_get_error() { do_test(|test_server, mut client| async move { - client.add_header("foo", "bar").unwrap(); + client.add_header("foo-header", "bar-header-value").unwrap(); let ticket = Ticket { ticket: Bytes::from("my awesome flight ticket"), }; @@ -259,6 +232,248 @@ async fn test_do_get_error_in_record_batch_stream() { .await; } +#[tokio::test] +async fn test_do_put() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + // encode the batch as a stream of FlightData + let input_flight_data = test_flight_data().await; + + let expected_response = vec![ + PutResult { + app_metadata: Bytes::from("foo-metadata1"), + }, + PutResult { + app_metadata: Bytes::from("bar-metadata2"), + }, + ]; + + test_server + .set_do_put_response(expected_response.clone().into_iter().map(Ok).collect()); + + let response_stream = client + .do_put(futures::stream::iter(input_flight_data.clone())) + .await + .expect("error making request"); + + let response: Vec<_> = response_stream + .try_collect() + .await + .expect("Error streaming data"); + + assert_eq!(response, expected_response); + assert_eq!(test_server.take_do_put_request(), Some(input_flight_data)); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_do_put_error() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let input_flight_data = test_flight_data().await; + + let response = client + .do_put(futures::stream::iter(input_flight_data.clone())) + .await; + let response = match response { + Ok(_) => panic!("unexpected success"), + Err(e) => e, + }; + + let e = Status::internal("No do_put response configured"); + expect_status(response, e); + // server still got the request + assert_eq!(test_server.take_do_put_request(), Some(input_flight_data)); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_do_put_error_stream() { + do_test(|test_server, mut client| async move { + 
client.add_header("foo-header", "bar-header-value").unwrap(); + + let input_flight_data = test_flight_data().await; + + let response = vec![ + Ok(PutResult { + app_metadata: Bytes::from("foo-metadata"), + }), + Err(FlightError::Tonic(Status::invalid_argument("bad arg"))), + ]; + + test_server.set_do_put_response(response); + + let response_stream = client + .do_put(futures::stream::iter(input_flight_data.clone())) + .await + .expect("error making request"); + + let response: Result, _> = response_stream.try_collect().await; + let response = match response { + Ok(_) => panic!("unexpected success"), + Err(e) => e, + }; + + let e = Status::invalid_argument("bad arg"); + expect_status(response, e); + // server still got the request + assert_eq!(test_server.take_do_put_request(), Some(input_flight_data)); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_do_exchange() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + // encode the batch as a stream of FlightData + let input_flight_data = test_flight_data().await; + let output_flight_data = test_flight_data2().await; + + test_server.set_do_exchange_response( + output_flight_data.clone().into_iter().map(Ok).collect(), + ); + + let response_stream = client + .do_exchange(futures::stream::iter(input_flight_data.clone())) + .await + .expect("error making request"); + + let response: Vec<_> = response_stream + .try_collect() + .await + .expect("Error streaming data"); + + let expected_stream = futures::stream::iter(output_flight_data).map(Ok); + + let expected_batches: Vec<_> = + FlightRecordBatchStream::new_from_flight_data(expected_stream) + .try_collect() + .await + .unwrap(); + + assert_eq!(response, expected_batches); + assert_eq!( + test_server.take_do_exchange_request(), + Some(input_flight_data) + ); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_do_exchange_error() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let input_flight_data = test_flight_data().await; + + let response = client + .do_exchange(futures::stream::iter(input_flight_data.clone())) + .await; + let response = match response { + Ok(_) => panic!("unexpected success"), + Err(e) => e, + }; + + let e = Status::internal("No do_exchange response configured"); + expect_status(response, e); + // server still got the request + assert_eq!( + test_server.take_do_exchange_request(), + Some(input_flight_data) + ); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_do_exchange_error_stream() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let input_flight_data = test_flight_data().await; + + let response = test_flight_data2() + .await + .into_iter() + .enumerate() + .map(|(i, m)| { + if i == 0 { + Ok(m) + } else { + // make all messages after the first an error + let e = tonic::Status::invalid_argument("the error"); + Err(FlightError::Tonic(e)) + } + }) + .collect(); + + test_server.set_do_exchange_response(response); + + let response_stream = client + .do_exchange(futures::stream::iter(input_flight_data.clone())) + .await + .expect("error making request"); + + let response: Result, _> = response_stream.try_collect().await; + let response = match response { + Ok(_) => panic!("unexpected success"), + Err(e) => e, + }; + + let e = 
tonic::Status::invalid_argument("the error"); + expect_status(response, e); + // server still got the request + assert_eq!( + test_server.take_do_exchange_request(), + Some(input_flight_data) + ); + ensure_metadata(&client, &test_server); + }) + .await; +} + +async fn test_flight_data() -> Vec { + let batch = RecordBatch::try_from_iter(vec![( + "col", + Arc::new(UInt64Array::from_iter([1, 2, 3, 4])) as _, + )]) + .unwrap(); + + // encode the batch as a stream of FlightData + FlightDataEncoderBuilder::new() + .build(futures::stream::iter(vec![Ok(batch)])) + .try_collect() + .await + .unwrap() +} + +async fn test_flight_data2() -> Vec { + let batch = RecordBatch::try_from_iter(vec![( + "col2", + Arc::new(UInt64Array::from_iter([10, 23, 33])) as _, + )]) + .unwrap(); + + // encode the batch as a stream of FlightData + FlightDataEncoderBuilder::new() + .build(futures::stream::iter(vec![Ok(batch)])) + .try_collect() + .await + .unwrap() +} + /// Runs the future returned by the function, passing it a test server and client async fn do_test(f: F) where diff --git a/arrow-flight/tests/common/server.rs b/arrow-flight/tests/common/server.rs index 45f81b189e8d..5060d9d0cc89 100644 --- a/arrow-flight/tests/common/server.rs +++ b/arrow-flight/tests/common/server.rs @@ -18,7 +18,7 @@ use std::sync::{Arc, Mutex}; use arrow_array::RecordBatch; -use futures::{stream::BoxStream, TryStreamExt}; +use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use tonic::{metadata::MetadataMap, Request, Response, Status, Streaming}; use arrow_flight::{ @@ -98,6 +98,39 @@ impl TestFlightServer { .take() } + /// Specify the response returned from the next call to `do_put` + pub fn set_do_put_response(&self, response: Vec>) { + let mut state = self.state.lock().expect("mutex not poisoned"); + state.do_put_response.replace(response); + } + + /// Take and return last do_put request send to the server, + pub fn take_do_put_request(&self) -> Option> { + self.state + .lock() + .expect("mutex not poisoned") + .do_put_request + .take() + } + + /// Specify the response returned from the next call to `do_exchange` + pub fn set_do_exchange_response( + &self, + response: Vec>, + ) { + let mut state = self.state.lock().expect("mutex not poisoned"); + state.do_exchange_response.replace(response); + } + + /// Take and return last do_exchange request send to the server, + pub fn take_do_exchange_request(&self) -> Option> { + self.state + .lock() + .expect("mutex not poisoned") + .do_exchange_request + .take() + } + /// Returns the last metadata from a request received by the server pub fn take_last_request_metadata(&self) -> Option { self.state @@ -130,6 +163,14 @@ struct State { pub do_get_request: Option, /// The next response returned from `do_get` pub do_get_response: Option>>, + /// The last do_put request received + pub do_put_request: Option>, + /// The next response returned from `do_put` + pub do_put_response: Option>>, + /// The last do_exchange request received + pub do_exchange_request: Option>, + /// The next response returned from `do_exchange` + pub do_exchange_response: Option>>, /// The last request headers received pub last_request_metadata: Option, } @@ -167,7 +208,7 @@ impl FlightService for TestFlightServer { // turn into a streaming response let output = futures::stream::iter(std::iter::once(Ok(response))); - Ok(Response::new(Box::pin(output) as Self::HandshakeStream)) + Ok(Response::new(output.boxed())) } async fn list_flights( @@ -215,16 +256,30 @@ impl FlightService for TestFlightServer { let stream = 
FlightDataEncoderBuilder::new() .build(batch_stream) - .map_err(|e| e.into()); + .map_err(Into::into); - Ok(Response::new(Box::pin(stream) as _)) + Ok(Response::new(stream.boxed())) } async fn do_put( &self, - _request: Request>, + request: Request>, ) -> Result, Status> { - Err(Status::unimplemented("Implement do_put")) + self.save_metadata(&request); + let do_put_request: Vec<_> = request.into_inner().try_collect().await?; + + let mut state = self.state.lock().expect("mutex not poisoned"); + + state.do_put_request = Some(do_put_request); + + let response = state + .do_put_response + .take() + .ok_or_else(|| Status::internal("No do_put response configured"))?; + + let stream = futures::stream::iter(response).map_err(Into::into); + + Ok(Response::new(stream.boxed())) } async fn do_action( @@ -243,8 +298,22 @@ impl FlightService for TestFlightServer { async fn do_exchange( &self, - _request: Request>, + request: Request>, ) -> Result, Status> { - Err(Status::unimplemented("Implement do_exchange")) + self.save_metadata(&request); + let do_exchange_request: Vec<_> = request.into_inner().try_collect().await?; + + let mut state = self.state.lock().expect("mutex not poisoned"); + + state.do_exchange_request = Some(do_exchange_request); + + let response = state + .do_exchange_response + .take() + .ok_or_else(|| Status::internal("No do_exchange response configured"))?; + + let stream = futures::stream::iter(response).map_err(Into::into); + + Ok(Response::new(stream.boxed())) } } From 4f44c2d3ef2e26ae773852f700e28373462304a7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 5 Jan 2023 11:52:04 -0500 Subject: [PATCH 0476/1411] Minor: add ticket links to failing ipc integration tests (#3461) --- arrow-integration-testing/tests/ipc_reader.rs | 10 ++++++---- arrow-integration-testing/tests/ipc_writer.rs | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arrow-integration-testing/tests/ipc_reader.rs b/arrow-integration-testing/tests/ipc_reader.rs index e185634f0dd4..6d91eeccb19e 100644 --- a/arrow-integration-testing/tests/ipc_reader.rs +++ b/arrow-integration-testing/tests/ipc_reader.rs @@ -96,8 +96,11 @@ fn read_1_0_0_bigendian() { FileReader::try_new(file, None).unwrap(); - // While the the reader doesn't error but the values are not read correctly - // so verifing the contents fails + // While the the reader doesn't error but the values are not + // read correctly on little endian platforms so verifing the + // contents fails + // + // https://github.com/apache/arrow-rs/issues/3459 //verify_arrow_file(&testdata, "1.0.0-bigendian", path); }); } @@ -117,8 +120,7 @@ fn read_1_0_0_littleendian() { "generated_extension", "generated_interval", "generated_map", - // fails with - // thread 'read_1_0_0_littleendian' panicked at 'assertion failed: `(left == right)` + // https://github.com/apache/arrow-rs/issues/3460 //"generated_map_non_canonical", "generated_nested", "generated_nested_dictionary", diff --git a/arrow-integration-testing/tests/ipc_writer.rs b/arrow-integration-testing/tests/ipc_writer.rs index e429b5e5cb39..a521737fa5ea 100644 --- a/arrow-integration-testing/tests/ipc_writer.rs +++ b/arrow-integration-testing/tests/ipc_writer.rs @@ -70,7 +70,7 @@ fn write_1_0_0_littleendian() { "generated_extension", "generated_interval", "generated_map", - // thread 'write_1_0_0_littleendian' panicked at 'assertion failed: `(left == right)` + // https://github.com/apache/arrow-rs/issues/3460 // "generated_map_non_canonical", "generated_nested", "generated_nested_dictionary", From 
2d2d0a3ba72efb5ee82324064f7c7678c2dd8336 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 5 Jan 2023 14:41:28 -0500 Subject: [PATCH 0477/1411] Add tests for `FlightClient::{list_flights, list_actions, do_action, get_schema}` (#3463) --- arrow-flight/src/lib.rs | 7 + arrow-flight/tests/client.rs | 326 +++++++++++++++++++++++++++- arrow-flight/tests/common/server.rs | 165 ++++++++++++-- 3 files changed, 466 insertions(+), 32 deletions(-) diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 87aeba1c1194..3057735a6ad7 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -454,6 +454,13 @@ impl Action { } } +impl Result { + /// Create a new Result with the specified body + pub fn new(body: impl Into) -> Self { + Self { body: body.into() } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-flight/tests/client.rs b/arrow-flight/tests/client.rs index 7537e46db403..032dad04923d 100644 --- a/arrow-flight/tests/client.rs +++ b/arrow-flight/tests/client.rs @@ -23,9 +23,10 @@ mod common { use arrow_array::{RecordBatch, UInt64Array}; use arrow_flight::{ decode::FlightRecordBatchStream, encode::FlightDataEncoderBuilder, - error::FlightError, FlightClient, FlightData, FlightDescriptor, FlightInfo, - HandshakeRequest, HandshakeResponse, PutResult, Ticket, + error::FlightError, Action, ActionType, Criteria, Empty, FlightClient, FlightData, + FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, Ticket, }; +use arrow_schema::{DataType, Field, Schema}; use bytes::Bytes; use common::server::TestFlightServer; use futures::{Future, StreamExt, TryStreamExt}; @@ -70,10 +71,9 @@ async fn test_handshake_error() { do_test(|test_server, mut client| async move { let request_payload = "foo-request-payload".to_string().into_bytes(); let e = Status::unauthenticated("DENIED"); - test_server.set_handshake_response(Err(e)); + test_server.set_handshake_response(Err(e.clone())); let response = client.handshake(request_payload).await.unwrap_err(); - let e = Status::unauthenticated("DENIED"); expect_status(response, e); }) .await; @@ -134,10 +134,9 @@ async fn test_get_flight_info_error() { let request = FlightDescriptor::new_cmd(b"My Command".to_vec()); let e = Status::unauthenticated("DENIED"); - test_server.set_get_flight_info_response(Err(e)); + test_server.set_get_flight_info_response(Err(e.clone())); let response = client.get_flight_info(request.clone()).await.unwrap_err(); - let e = Status::unauthenticated("DENIED"); expect_status(response, e); }) .await; @@ -213,7 +212,7 @@ async fn test_do_get_error_in_record_batch_stream() { let e = Status::data_loss("she's dead jim"); - let expected_response = vec![Ok(batch), Err(FlightError::Tonic(e.clone()))]; + let expected_response = vec![Ok(batch), Err(e.clone())]; test_server.set_do_get_response(expected_response); @@ -300,11 +299,13 @@ async fn test_do_put_error_stream() { let input_flight_data = test_flight_data().await; + let e = Status::invalid_argument("bad arg"); + let response = vec![ Ok(PutResult { app_metadata: Bytes::from("foo-metadata"), }), - Err(FlightError::Tonic(Status::invalid_argument("bad arg"))), + Err(e.clone()), ]; test_server.set_do_put_response(response); @@ -320,7 +321,6 @@ async fn test_do_put_error_stream() { Err(e) => e, }; - let e = Status::invalid_argument("bad arg"); expect_status(response, e); // server still got the request assert_eq!(test_server.take_do_put_request(), Some(input_flight_data)); @@ -404,6 +404,7 @@ async fn test_do_exchange_error_stream() { let 
input_flight_data = test_flight_data().await; + let e = Status::invalid_argument("the error"); let response = test_flight_data2() .await .into_iter() @@ -413,8 +414,7 @@ async fn test_do_exchange_error_stream() { Ok(m) } else { // make all messages after the first an error - let e = tonic::Status::invalid_argument("the error"); - Err(FlightError::Tonic(e)) + Err(e.clone()) } }) .collect(); @@ -432,7 +432,6 @@ async fn test_do_exchange_error_stream() { Err(e) => e, }; - let e = tonic::Status::invalid_argument("the error"); expect_status(response, e); // server still got the request assert_eq!( @@ -444,6 +443,309 @@ async fn test_do_exchange_error_stream() { .await; } +#[tokio::test] +async fn test_get_schema() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let schema = Schema::new(vec![Field::new("foo", DataType::Int64, true)]); + + let request = FlightDescriptor::new_cmd("my command"); + test_server.set_get_schema_response(Ok(schema.clone())); + + let response = client + .get_schema(request.clone()) + .await + .expect("error making request"); + + let expected_schema = schema; + let expected_request = request; + + assert_eq!(response, expected_schema); + assert_eq!( + test_server.take_get_schema_request(), + Some(expected_request) + ); + + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_get_schema_error() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + let request = FlightDescriptor::new_cmd("my command"); + + let e = Status::unauthenticated("DENIED"); + test_server.set_get_schema_response(Err(e.clone())); + + let response = client.get_schema(request).await.unwrap_err(); + expect_status(response, e); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_list_flights() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let infos = vec![ + test_flight_info(&FlightDescriptor::new_cmd("foo")), + test_flight_info(&FlightDescriptor::new_cmd("bar")), + ]; + + let response = infos.iter().map(|i| Ok(i.clone())).collect(); + test_server.set_list_flights_response(response); + + let response_stream = client + .list_flights("query") + .await + .expect("error making request"); + + let expected_response = infos; + let response: Vec<_> = response_stream + .try_collect() + .await + .expect("Error streaming data"); + + let expected_request = Some(Criteria { + expression: "query".into(), + }); + + assert_eq!(response, expected_response); + assert_eq!(test_server.take_list_flights_request(), expected_request); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_list_flights_error() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let response = client.list_flights("query").await; + let response = match response { + Ok(_) => panic!("unexpected success"), + Err(e) => e, + }; + + let e = Status::internal("No list_flights response configured"); + expect_status(response, e); + // server still got the request + let expected_request = Some(Criteria { + expression: "query".into(), + }); + assert_eq!(test_server.take_list_flights_request(), expected_request); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_list_flights_error_in_stream() { + do_test(|test_server, mut client| async 
move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let e = Status::data_loss("she's dead jim"); + + let response = vec![ + Ok(test_flight_info(&FlightDescriptor::new_cmd("foo"))), + Err(e.clone()), + ]; + test_server.set_list_flights_response(response); + + let response_stream = client + .list_flights("other query") + .await + .expect("error making request"); + + let response: Result, FlightError> = response_stream.try_collect().await; + + let response = response.unwrap_err(); + expect_status(response, e); + // server still got the request + let expected_request = Some(Criteria { + expression: "other query".into(), + }); + assert_eq!(test_server.take_list_flights_request(), expected_request); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_list_actions() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let actions = vec![ + ActionType { + r#type: "type 1".into(), + description: "awesomeness".into(), + }, + ActionType { + r#type: "type 2".into(), + description: "more awesomeness".into(), + }, + ]; + + let response = actions.iter().map(|i| Ok(i.clone())).collect(); + test_server.set_list_actions_response(response); + + let response_stream = client.list_actions().await.expect("error making request"); + + let expected_response = actions; + let response: Vec<_> = response_stream + .try_collect() + .await + .expect("Error streaming data"); + + assert_eq!(response, expected_response); + assert_eq!(test_server.take_list_actions_request(), Some(Empty {})); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_list_actions_error() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let response = client.list_actions().await; + let response = match response { + Ok(_) => panic!("unexpected success"), + Err(e) => e, + }; + + let e = Status::internal("No list_actions response configured"); + expect_status(response, e); + // server still got the request + assert_eq!(test_server.take_list_actions_request(), Some(Empty {})); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_list_actions_error_in_stream() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let e = Status::data_loss("she's dead jim"); + + let response = vec![ + Ok(ActionType { + r#type: "type 1".into(), + description: "awesomeness".into(), + }), + Err(e.clone()), + ]; + test_server.set_list_actions_response(response); + + let response_stream = client.list_actions().await.expect("error making request"); + + let response: Result, FlightError> = response_stream.try_collect().await; + + let response = response.unwrap_err(); + expect_status(response, e); + // server still got the request + assert_eq!(test_server.take_list_actions_request(), Some(Empty {})); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_do_action() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let bytes = vec![Bytes::from("foo"), Bytes::from("blarg")]; + + let response = bytes + .iter() + .cloned() + .map(arrow_flight::Result::new) + .map(Ok) + .collect(); + test_server.set_do_action_response(response); + + let request = Action::new("action type", "action body"); + + let response_stream = client + 
.do_action(request.clone()) + .await + .expect("error making request"); + + let expected_response = bytes; + let response: Vec<_> = response_stream + .try_collect() + .await + .expect("Error streaming data"); + + assert_eq!(response, expected_response); + assert_eq!(test_server.take_do_action_request(), Some(request)); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_do_action_error() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let request = Action::new("action type", "action body"); + + let response = client.do_action(request.clone()).await; + let response = match response { + Ok(_) => panic!("unexpected success"), + Err(e) => e, + }; + + let e = Status::internal("No do_action response configured"); + expect_status(response, e); + // server still got the request + assert_eq!(test_server.take_do_action_request(), Some(request)); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_do_action_error_in_stream() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let e = Status::data_loss("she's dead jim"); + + let request = Action::new("action type", "action body"); + + let response = vec![Ok(arrow_flight::Result::new("foo")), Err(e.clone())]; + test_server.set_do_action_response(response); + + let response_stream = client + .do_action(request.clone()) + .await + .expect("error making request"); + + let response: Result, FlightError> = response_stream.try_collect().await; + + let response = response.unwrap_err(); + expect_status(response, e); + // server still got the request + assert_eq!(test_server.take_do_action_request(), Some(request)); + ensure_metadata(&client, &test_server); + }) + .await; +} + async fn test_flight_data() -> Vec { let batch = RecordBatch::try_from_iter(vec![( "col", diff --git a/arrow-flight/tests/common/server.rs b/arrow-flight/tests/common/server.rs index 5060d9d0cc89..b87019d632c4 100644 --- a/arrow-flight/tests/common/server.rs +++ b/arrow-flight/tests/common/server.rs @@ -18,15 +18,15 @@ use std::sync::{Arc, Mutex}; use arrow_array::RecordBatch; +use arrow_schema::Schema; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use tonic::{metadata::MetadataMap, Request, Response, Status, Streaming}; use arrow_flight::{ encode::FlightDataEncoderBuilder, - error::FlightError, flight_service_server::{FlightService, FlightServiceServer}, Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, - HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, + HandshakeRequest, HandshakeResponse, PutResult, SchemaAsIpc, SchemaResult, Ticket, }; #[derive(Debug, Clone)] @@ -84,7 +84,7 @@ impl TestFlightServer { } /// Specify the response returned from the next call to `do_get` - pub fn set_do_get_response(&self, response: Vec>) { + pub fn set_do_get_response(&self, response: Vec>) { let mut state = self.state.lock().expect("mutex not poisoned"); state.do_get_response.replace(response); } @@ -99,7 +99,7 @@ impl TestFlightServer { } /// Specify the response returned from the next call to `do_put` - pub fn set_do_put_response(&self, response: Vec>) { + pub fn set_do_put_response(&self, response: Vec>) { let mut state = self.state.lock().expect("mutex not poisoned"); state.do_put_response.replace(response); } @@ -114,10 +114,7 @@ impl TestFlightServer { } /// Specify the response returned from the next call to 
`do_exchange` - pub fn set_do_exchange_response( - &self, - response: Vec>, - ) { + pub fn set_do_exchange_response(&self, response: Vec>) { let mut state = self.state.lock().expect("mutex not poisoned"); state.do_exchange_response.replace(response); } @@ -131,6 +128,69 @@ impl TestFlightServer { .take() } + /// Specify the response returned from the next call to `list_flights` + pub fn set_list_flights_response(&self, response: Vec>) { + let mut state = self.state.lock().expect("mutex not poisoned"); + state.list_flights_response.replace(response); + } + + /// Take and return last list_flights request send to the server, + pub fn take_list_flights_request(&self) -> Option { + self.state + .lock() + .expect("mutex not poisoned") + .list_flights_request + .take() + } + + /// Specify the response returned from the next call to `get_schema` + pub fn set_get_schema_response(&self, response: Result) { + let mut state = self.state.lock().expect("mutex not poisoned"); + state.get_schema_response.replace(response); + } + + /// Take and return last get_schema request send to the server, + pub fn take_get_schema_request(&self) -> Option { + self.state + .lock() + .expect("mutex not poisoned") + .get_schema_request + .take() + } + + /// Specify the response returned from the next call to `list_actions` + pub fn set_list_actions_response(&self, response: Vec>) { + let mut state = self.state.lock().expect("mutex not poisoned"); + state.list_actions_response.replace(response); + } + + /// Take and return last list_actions request send to the server, + pub fn take_list_actions_request(&self) -> Option { + self.state + .lock() + .expect("mutex not poisoned") + .list_actions_request + .take() + } + + /// Specify the response returned from the next call to `do_action` + pub fn set_do_action_response( + &self, + response: Vec>, + ) { + let mut state = self.state.lock().expect("mutex not poisoned"); + state.do_action_response.replace(response); + } + + /// Take and return last do_action request send to the server, + pub fn take_do_action_request(&self) -> Option { + self.state + .lock() + .expect("mutex not poisoned") + .do_action_request + .take() + } + /// Returns the last metadata from a request received by the server pub fn take_last_request_metadata(&self) -> Option { self.state @@ -162,15 +222,31 @@ struct State { /// The last do_get request received pub do_get_request: Option, /// The next response returned from `do_get` - pub do_get_response: Option>>, + pub do_get_response: Option>>, /// The last do_put request received pub do_put_request: Option>, /// The next response returned from `do_put` - pub do_put_response: Option>>, + pub do_put_response: Option>>, /// The last do_exchange request received pub do_exchange_request: Option>, /// The next response returned from `do_exchange` - pub do_exchange_response: Option>>, + pub do_exchange_response: Option>>, + /// The last list_flights request received + pub list_flights_request: Option, + /// The next response returned from `list_flights` + pub list_flights_response: Option>>, + /// The last get_schema request received + pub get_schema_request: Option, + /// The next response returned from `get_schema` + pub get_schema_response: Option>, + /// The last list_actions request received + pub list_actions_request: Option, + /// The next response returned from `list_actions` + pub list_actions_response: Option>>, + /// The last do_action request received + pub do_action_request: Option, + /// The next response returned from `do_action` + pub 
do_action_response: Option>>, /// The last request headers received pub last_request_metadata: Option, } @@ -213,9 +289,21 @@ impl FlightService for TestFlightServer { async fn list_flights( &self, - _request: Request, + request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Implement list_flights")) + self.save_metadata(&request); + let mut state = self.state.lock().expect("mutex not poisoned"); + + state.list_flights_request = Some(request.into_inner()); + + let flights: Vec<_> = state + .list_flights_response + .take() + .ok_or_else(|| Status::internal("No list_flights response configured"))?; + + let flights_stream = futures::stream::iter(flights); + + Ok(Response::new(flights_stream.boxed())) } async fn get_flight_info( @@ -233,9 +321,22 @@ impl FlightService for TestFlightServer { async fn get_schema( &self, - _request: Request, + request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Implement get_schema")) + self.save_metadata(&request); + let mut state = self.state.lock().expect("mutex not poisoned"); + state.get_schema_request = Some(request.into_inner()); + let schema = state.get_schema_response.take().unwrap_or_else(|| { + Err(Status::internal("No get_schema response configured")) + })?; + + // encode the schema + let options = arrow_ipc::writer::IpcWriteOptions::default(); + let response: SchemaResult = SchemaAsIpc::new(&schema, &options) + .try_into() + .expect("Error encoding schema"); + + Ok(Response::new(response)) } async fn do_get( @@ -252,7 +353,7 @@ impl FlightService for TestFlightServer { .take() .ok_or_else(|| Status::internal("No do_get response configured"))?; - let batch_stream = futures::stream::iter(batches); + let batch_stream = futures::stream::iter(batches).map_err(Into::into); let stream = FlightDataEncoderBuilder::new() .build(batch_stream) @@ -284,16 +385,40 @@ impl FlightService for TestFlightServer { async fn do_action( &self, - _request: Request, + request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Implement do_action")) + self.save_metadata(&request); + let mut state = self.state.lock().expect("mutex not poisoned"); + + state.do_action_request = Some(request.into_inner()); + + let results: Vec<_> = state + .do_action_response + .take() + .ok_or_else(|| Status::internal("No do_action response configured"))?; + + let results_stream = futures::stream::iter(results); + + Ok(Response::new(results_stream.boxed())) } async fn list_actions( &self, - _request: Request, + request: Request, ) -> Result, Status> { - Err(Status::unimplemented("Implement list_actions")) + self.save_metadata(&request); + let mut state = self.state.lock().expect("mutex not poisoned"); + + state.list_actions_request = Some(request.into_inner()); + + let actions: Vec<_> = state + .list_actions_response + .take() + .ok_or_else(|| Status::internal("No list_actions response configured"))?; + + let action_stream = futures::stream::iter(actions); + + Ok(Response::new(action_stream.boxed())) } async fn do_exchange( From acefeef1cb5698a6afe1d3061644f6276d39117c Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Thu, 5 Jan 2023 22:00:50 -0800 Subject: [PATCH 0478/1411] support RFC3339 style timestamps in `arrow-json` (#3449) * Use array_value_to_string in arrow-json * Update tests * Add test to write timestamps with timezone in json * Add use_z=true to to_rfc3339 * Fix the write_timestamps_tz test * Add arrow-json/chrono-tz to top level Cargo.toml * Remove chrono-tz requirements from arrow-json * Fix linting errors --- 
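The diff below switches timezone-aware timestamp rendering in `arrow-cast` from `to_rfc3339()` to `to_rfc3339_opts(SecondsFormat::AutoSi, true)`, so values are written with a trailing `Z` instead of a `+00:00` offset, and `arrow-json` reuses that path through `array_value_to_string`. A minimal standalone sketch of the chrono call involved (only the `chrono` crate is assumed; the constant is the instant asserted by the new `write_timestamps_with_tz` test):

use chrono::{SecondsFormat, TimeZone, Utc};

fn main() {
    // 2018-11-13T17:11:10.011375885 UTC, expressed as nanoseconds since the epoch.
    let dt = Utc.timestamp_nanos(1_542_129_070_011_375_885);
    // `AutoSi` keeps all significant sub-second digits; `use_z = true` renders
    // the zero UTC offset as a trailing `Z`.
    assert_eq!(
        dt.to_rfc3339_opts(SecondsFormat::AutoSi, true),
        "2018-11-13T17:11:10.011375885Z"
    );
}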
arrow-cast/src/display.rs | 3 +- arrow-json/src/writer.rs | 74 +++++++++++++++++++++++++++++++++++---- 2 files changed, 69 insertions(+), 8 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 10709994ddae..5534ebd8134a 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -27,6 +27,7 @@ use arrow_array::types::*; use arrow_array::*; use arrow_buffer::ArrowNativeType; use arrow_schema::*; +use chrono::prelude::SecondsFormat; macro_rules! make_string { ($array_type:ty, $column: ident, $row: ident) => {{ @@ -157,7 +158,7 @@ macro_rules! make_string_datetime_with_tz { let s = match $tz_string.parse::() { Ok(tz) => array .value_as_datetime_with_tz($row, tz) - .map(|d| format!("{}", d.to_rfc3339())) + .map(|d| format!("{}", d.to_rfc3339_opts(SecondsFormat::AutoSi, true))) .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), Err(_) => array .value_as_datetime($row) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 16eec79c64ac..9045bd3a77ee 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -105,6 +105,8 @@ use arrow_array::types::*; use arrow_array::*; use arrow_schema::*; +use arrow_cast::display::array_value_to_string; + fn primitive_array_to_json(array: &ArrayRef) -> Result, ArrowError> where T: ArrowPrimitiveType, @@ -217,17 +219,16 @@ macro_rules! set_column_by_array_type { macro_rules! set_temporal_column_by_array_type { ($array_type:ident, $col_name:ident, $rows:ident, $array:ident, $row_count:ident, $cast_fn:ident) => { - let arr = $array.as_any().downcast_ref::<$array_type>().unwrap(); - $rows .iter_mut() .enumerate() .take($row_count) .for_each(|(i, row)| { - if !arr.is_null(i) { - if let Some(v) = arr.$cast_fn(i) { - row.insert($col_name.to_string(), v.to_string().into()); - } + if !$array.is_null(i) { + row.insert( + $col_name.to_string(), + array_value_to_string($array, i).unwrap().to_string().into(), + ); } }); }; @@ -925,7 +926,66 @@ mod tests { assert_json_eq( &buf, - r#"{"nanos":"2018-11-13 17:11:10.011375885","micros":"2018-11-13 17:11:10.011375","millis":"2018-11-13 17:11:10.011","secs":"2018-11-13 17:11:10","name":"a"} + r#"{"micros":"2018-11-13T17:11:10.011375","millis":"2018-11-13T17:11:10.011","name":"a","nanos":"2018-11-13T17:11:10.011375885","secs":"2018-11-13T17:11:10"} +{"name":"b"} +"#, + ); + } + + #[test] + fn write_timestamps_with_tz() { + let ts_string = "2018-11-13T17:11:10.011375885995"; + let ts_nanos = ts_string + .parse::() + .unwrap() + .timestamp_nanos(); + let ts_micros = ts_nanos / 1000; + let ts_millis = ts_micros / 1000; + let ts_secs = ts_millis / 1000; + + let arr_nanos = TimestampNanosecondArray::from(vec![Some(ts_nanos), None]); + let arr_micros = TimestampMicrosecondArray::from(vec![Some(ts_micros), None]); + let arr_millis = TimestampMillisecondArray::from(vec![Some(ts_millis), None]); + let arr_secs = TimestampSecondArray::from(vec![Some(ts_secs), None]); + let arr_names = StringArray::from(vec![Some("a"), Some("b")]); + + let tz = "+00:00".to_string(); + + let arr_nanos = arr_nanos.with_timezone(&tz); + let arr_micros = arr_micros.with_timezone(&tz); + let arr_millis = arr_millis.with_timezone(&tz); + let arr_secs = arr_secs.with_timezone(&tz); + + let schema = Schema::new(vec![ + Field::new("nanos", arr_nanos.data_type().clone(), true), + Field::new("micros", arr_micros.data_type().clone(), true), + Field::new("millis", arr_millis.data_type().clone(), true), + Field::new("secs", arr_secs.data_type().clone(), true), + Field::new("name", 
arr_names.data_type().clone(), true), + ]); + let schema = Arc::new(schema); + + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(arr_nanos), + Arc::new(arr_micros), + Arc::new(arr_millis), + Arc::new(arr_secs), + Arc::new(arr_names), + ], + ) + .unwrap(); + + let mut buf = Vec::new(); + { + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write_batches(&[batch]).unwrap(); + } + + assert_json_eq( + &buf, + r#"{"micros":"2018-11-13T17:11:10.011375Z","millis":"2018-11-13T17:11:10.011Z","name":"a","nanos":"2018-11-13T17:11:10.011375885Z","secs":"2018-11-13T17:11:10Z"} {"name":"b"} "#, ); From 9403ef5119de1b791418b47f6c704b950bba9dc5 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 6 Jan 2023 09:47:40 -0800 Subject: [PATCH 0479/1411] Add more dictionary value type support to build_compare. (#3466) --- arrow-ord/src/ord.rs | 149 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 146 insertions(+), 3 deletions(-) diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index 44eb3b183802..6122f9cb3f33 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -82,7 +82,7 @@ fn compare_dict_primitive(left: &dyn Array, right: &dyn Array) -> DynCompa where K: ArrowDictionaryKeyType, V: ArrowPrimitiveType, - V::Native: Ord, + V::Native: ArrowNativeTypeOp, { let left = left.as_any().downcast_ref::>().unwrap(); let right = right.as_any().downcast_ref::>().unwrap(); @@ -99,7 +99,7 @@ where let key_right = right_keys.value(j).as_usize(); let left = left_values.value(key_left); let right = right_values.value(key_right); - left.cmp(&right) + left.compare(right) }) } @@ -131,7 +131,7 @@ fn cmp_dict_primitive( ) -> Result where VT: ArrowPrimitiveType, - VT::Native: Ord, + VT::Native: ArrowNativeTypeOp, { use DataType::*; @@ -263,6 +263,73 @@ pub fn build_compare( UInt16 => cmp_dict_primitive::(key_type_lhs, left, right)?, UInt32 => cmp_dict_primitive::(key_type_lhs, left, right)?, UInt64 => cmp_dict_primitive::(key_type_lhs, left, right)?, + Float32 => cmp_dict_primitive::(key_type_lhs, left, right)?, + Float64 => cmp_dict_primitive::(key_type_lhs, left, right)?, + Date32 => cmp_dict_primitive::(key_type_lhs, left, right)?, + Date64 => cmp_dict_primitive::(key_type_lhs, left, right)?, + Time32(Second) => { + cmp_dict_primitive::(key_type_lhs, left, right)? + } + Time32(Millisecond) => cmp_dict_primitive::( + key_type_lhs, + left, + right, + )?, + Time64(Microsecond) => cmp_dict_primitive::( + key_type_lhs, + left, + right, + )?, + Time64(Nanosecond) => { + cmp_dict_primitive::(key_type_lhs, left, right)? + } + Timestamp(Second, _) => { + cmp_dict_primitive::(key_type_lhs, left, right)? + } + Timestamp(Millisecond, _) => cmp_dict_primitive::< + TimestampMillisecondType, + >(key_type_lhs, left, right)?, + Timestamp(Microsecond, _) => cmp_dict_primitive::< + TimestampMicrosecondType, + >(key_type_lhs, left, right)?, + Timestamp(Nanosecond, _) => { + cmp_dict_primitive::( + key_type_lhs, + left, + right, + )? + } + Interval(YearMonth) => cmp_dict_primitive::( + key_type_lhs, + left, + right, + )?, + Interval(DayTime) => { + cmp_dict_primitive::(key_type_lhs, left, right)? + } + Interval(MonthDayNano) => cmp_dict_primitive::( + key_type_lhs, + left, + right, + )?, + Duration(Second) => { + cmp_dict_primitive::(key_type_lhs, left, right)? 
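// The arms that follow repeat this `cmp_dict_primitive` dispatch for the
// remaining Duration units (Millisecond, Microsecond, Nanosecond) before
// reaching the pre-existing Utf8 dictionary comparison via `compare_dict_string`.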
+ } + Duration(Millisecond) => cmp_dict_primitive::( + key_type_lhs, + left, + right, + )?, + Duration(Microsecond) => cmp_dict_primitive::( + key_type_lhs, + left, + right, + )?, + Duration(Nanosecond) => cmp_dict_primitive::( + key_type_lhs, + left, + right, + )?, Utf8 => match key_type_lhs { UInt8 => compare_dict_string::(left, right), UInt16 => compare_dict_string::(left, right), @@ -441,4 +508,80 @@ pub mod tests { assert_eq!(Ordering::Greater, (cmp)(3, 1)); assert_eq!(Ordering::Greater, (cmp)(3, 2)); } + + #[test] + fn test_float_dict() { + let values = Float32Array::from(vec![1.0, 0.5, 2.1, 5.5]); + let keys = Int8Array::from_iter_values([0, 0, 1, 3]); + let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let values = Float32Array::from(vec![1.2, 3.2, 4.0, 5.5]); + let keys = Int8Array::from_iter_values([0, 1, 1, 3]); + let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let cmp = build_compare(&array1, &array2).unwrap(); + + assert_eq!(Ordering::Less, (cmp)(0, 0)); + assert_eq!(Ordering::Less, (cmp)(0, 3)); + assert_eq!(Ordering::Equal, (cmp)(3, 3)); + assert_eq!(Ordering::Greater, (cmp)(3, 1)); + assert_eq!(Ordering::Greater, (cmp)(3, 2)); + } + + #[test] + fn test_timestamp_dict() { + let values = TimestampSecondArray::from(vec![1, 0, 2, 5]); + let keys = Int8Array::from_iter_values([0, 0, 1, 3]); + let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let values = TimestampSecondArray::from(vec![2, 3, 4, 5]); + let keys = Int8Array::from_iter_values([0, 1, 1, 3]); + let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let cmp = build_compare(&array1, &array2).unwrap(); + + assert_eq!(Ordering::Less, (cmp)(0, 0)); + assert_eq!(Ordering::Less, (cmp)(0, 3)); + assert_eq!(Ordering::Equal, (cmp)(3, 3)); + assert_eq!(Ordering::Greater, (cmp)(3, 1)); + assert_eq!(Ordering::Greater, (cmp)(3, 2)); + } + + #[test] + fn test_interval_dict() { + let values = IntervalDayTimeArray::from(vec![1, 0, 2, 5]); + let keys = Int8Array::from_iter_values([0, 0, 1, 3]); + let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let values = IntervalDayTimeArray::from(vec![2, 3, 4, 5]); + let keys = Int8Array::from_iter_values([0, 1, 1, 3]); + let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let cmp = build_compare(&array1, &array2).unwrap(); + + assert_eq!(Ordering::Less, (cmp)(0, 0)); + assert_eq!(Ordering::Less, (cmp)(0, 3)); + assert_eq!(Ordering::Equal, (cmp)(3, 3)); + assert_eq!(Ordering::Greater, (cmp)(3, 1)); + assert_eq!(Ordering::Greater, (cmp)(3, 2)); + } + + #[test] + fn test_duration_dict() { + let values = DurationSecondArray::from(vec![1, 0, 2, 5]); + let keys = Int8Array::from_iter_values([0, 0, 1, 3]); + let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let values = DurationSecondArray::from(vec![2, 3, 4, 5]); + let keys = Int8Array::from_iter_values([0, 1, 1, 3]); + let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let cmp = build_compare(&array1, &array2).unwrap(); + + assert_eq!(Ordering::Less, (cmp)(0, 0)); + assert_eq!(Ordering::Less, (cmp)(0, 3)); + assert_eq!(Ordering::Equal, (cmp)(3, 3)); + assert_eq!(Ordering::Greater, (cmp)(3, 1)); + assert_eq!(Ordering::Greater, (cmp)(3, 2)); + } } From b4d57059e4b375a7b2128e430865390a5aa81510 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 6 Jan 2023 09:47:56 -0800 Subject: [PATCH 0480/1411] Add a test for stream writer for writing sliced array (#3472) --- arrow-ipc/src/writer.rs | 36 
++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index e4dcab40a148..ed5e53a959c0 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1890,4 +1890,40 @@ mod tests { assert!(serialize(&record_batch).len() > serialize(&record_batch_slice).len()); assert_eq!(record_batch_slice, deserialized_batch); } + + #[test] + fn test_stream_writer_writes_array_slice() { + let array = UInt32Array::from(vec![Some(1), Some(2), Some(3)]); + assert_eq!( + vec![Some(1), Some(2), Some(3)], + array.iter().collect::>() + ); + + let sliced = array.slice(1, 2); + let read_sliced: &UInt32Array = as_primitive_array(&sliced); + assert_eq!( + vec![Some(2), Some(3)], + read_sliced.iter().collect::>() + ); + + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("a", DataType::UInt32, true)])), + vec![sliced], + ) + .expect("new batch"); + + let mut writer = + StreamWriter::try_new(vec![], &batch.schema()).expect("new writer"); + writer.write(&batch).expect("write"); + let outbuf = writer.into_inner().expect("inner"); + + let mut reader = StreamReader::try_new(&outbuf[..], None).expect("new reader"); + let read_batch = reader.next().unwrap().expect("read batch"); + + let read_array: &UInt32Array = as_primitive_array(read_batch.column(0)); + assert_eq!( + vec![Some(2), Some(3)], + read_array.iter().collect::>() + ); + } } From 7805a81157152d161c172bbf8ac5ec671568b480 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 6 Jan 2023 14:11:44 -0500 Subject: [PATCH 0481/1411] Add tests for record batch size splitting logic in FlightClient (#3481) * Add tests for record batch size splitting logic in FlightClient * cargo clippy --fix * fix: Improve comments --- arrow-flight/src/encode.rs | 227 ++++++++++++++++++++++++++++++++++++- 1 file changed, 226 insertions(+), 1 deletion(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 55000bba2fad..c130a2d7e8cc 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -423,8 +423,11 @@ mod tests { use arrow::{ array::{UInt32Array, UInt8Array}, compute::concat_batches, + datatypes::Int32Type, + }; + use arrow_array::{ + DictionaryArray, Int16Array, Int32Array, Int64Array, StringArray, UInt64Array, }; - use arrow_array::UInt64Array; use super::*; @@ -556,4 +559,226 @@ mod tests { // test sending record batches // test sending record batches with multiple different dictionaries + + #[tokio::test] + async fn flight_data_size_even() { + let s1 = + StringArray::from_iter_values(std::iter::repeat(".10 bytes.").take(1024)); + let i1 = Int16Array::from_iter_values(0..1024); + let s2 = StringArray::from_iter_values(std::iter::repeat("6bytes").take(1024)); + let i2 = Int64Array::from_iter_values(0..1024); + + let batch = RecordBatch::try_from_iter(vec![ + ("s1", Arc::new(s1) as _), + ("i1", Arc::new(i1) as _), + ("s2", Arc::new(s2) as _), + ("i2", Arc::new(i2) as _), + ]) + .unwrap(); + + verify_encoded_split(batch, 112).await; + } + + #[tokio::test] + async fn flight_data_size_uneven_variable_lengths() { + // each row has a longer string than the last with increasing lengths 0 --> 1024 + let array = StringArray::from_iter_values((0..1024).map(|i| "*".repeat(i))); + let batch = + RecordBatch::try_from_iter(vec![("data", Arc::new(array) as _)]).unwrap(); + + // overage is much higher than ideal + // https://github.com/apache/arrow-rs/issues/3478 + verify_encoded_split(batch, 4304).await; + } + + #[tokio::test] + async fn 
flight_data_size_large_row() { + // batch with individual that can each exceed the batch size + let array1 = StringArray::from_iter_values(vec![ + "*".repeat(500), + "*".repeat(500), + "*".repeat(500), + "*".repeat(500), + ]); + let array2 = StringArray::from_iter_values(vec![ + "*".to_string(), + "*".repeat(1000), + "*".repeat(2000), + "*".repeat(4000), + ]); + + let array3 = StringArray::from_iter_values(vec![ + "*".to_string(), + "*".to_string(), + "*".repeat(1000), + "*".repeat(2000), + ]); + + let batch = RecordBatch::try_from_iter(vec![ + ("a1", Arc::new(array1) as _), + ("a2", Arc::new(array2) as _), + ("a3", Arc::new(array3) as _), + ]) + .unwrap(); + + // 5k over limit (which is 2x larger than limit of 5k) + // overage is much higher than ideal + // https://github.com/apache/arrow-rs/issues/3478 + verify_encoded_split(batch, 5800).await; + } + + #[tokio::test] + async fn flight_data_size_string_dictionary() { + // Small dictionary (only 2 distinct values ==> 2 entries in dictionary) + let array: DictionaryArray = (1..1024) + .map(|i| match i % 3 { + 0 => Some("value0"), + 1 => Some("value1"), + _ => None, + }) + .collect(); + + let batch = + RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); + + verify_encoded_split(batch, 160).await; + } + + #[tokio::test] + async fn flight_data_size_large_dictionary() { + // large dictionary (all distinct values ==> 1024 entries in dictionary) + let values: Vec<_> = (1..1024).map(|i| "**".repeat(i)).collect(); + + let array: DictionaryArray = + values.iter().map(|s| Some(s.as_str())).collect(); + + let batch = + RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); + + // overage is much higher than ideal + // https://github.com/apache/arrow-rs/issues/3478 + verify_encoded_split(batch, 3328).await; + } + + #[tokio::test] + async fn flight_data_size_large_dictionary_repeated_non_uniform() { + // large dictionary (1024 distinct values) that are used throughout the array + let values = StringArray::from_iter_values((0..1024).map(|i| "******".repeat(i))); + let keys = Int32Array::from_iter_values((0..3000).map(|i| (3000 - i) % 1024)); + let array = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let batch = + RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); + + // overage is much higher than ideal + // https://github.com/apache/arrow-rs/issues/3478 + verify_encoded_split(batch, 5280).await; + } + + #[tokio::test] + async fn flight_data_size_multiple_dictionaries() { + // high cardinality + let values1: Vec<_> = (1..1024).map(|i| "**".repeat(i)).collect(); + // highish cardinality + let values2: Vec<_> = (1..1024).map(|i| "**".repeat(i % 10)).collect(); + // medium cardinality + let values3: Vec<_> = (1..1024).map(|i| "**".repeat(i % 100)).collect(); + + let array1: DictionaryArray = + values1.iter().map(|s| Some(s.as_str())).collect(); + let array2: DictionaryArray = + values2.iter().map(|s| Some(s.as_str())).collect(); + let array3: DictionaryArray = + values3.iter().map(|s| Some(s.as_str())).collect(); + + let batch = RecordBatch::try_from_iter(vec![ + ("a1", Arc::new(array1) as _), + ("a2", Arc::new(array2) as _), + ("a3", Arc::new(array3) as _), + ]) + .unwrap(); + + // overage is much higher than ideal + // https://github.com/apache/arrow-rs/issues/3478 + verify_encoded_split(batch, 4128).await; + } + + /// Return size, in memory of flight data + fn flight_data_size(d: &FlightData) -> usize { + let flight_descriptor_size = d + .flight_descriptor + .as_ref() + 
.map(|descriptor| { + let path_len: usize = + descriptor.path.iter().map(|p| p.as_bytes().len()).sum(); + + std::mem::size_of_val(descriptor) + descriptor.cmd.len() + path_len + }) + .unwrap_or(0); + + flight_descriptor_size + + d.app_metadata.len() + + d.data_body.len() + + d.data_header.len() + } + + /// Coverage for + /// + /// Encodes the specified batch using several values of + /// `max_flight_data_size` between 1K to 5K and ensures that the + /// resulting size of the flight data stays within the limit + /// + `allowed_overage` + /// + /// `allowed_overage` is how far off the actual data encoding is + /// from the target limit that was set. It is an improvement when + /// the allowed_overage decreses. + /// + /// Note this overhead will likely always be greater than zero to + /// account for encoding overhead such as IPC headers and padding. + /// + /// + async fn verify_encoded_split(batch: RecordBatch, allowed_overage: usize) { + let num_rows = batch.num_rows(); + + // Track the overall required maximum overage + let mut max_overage_seen = 0; + + for max_flight_data_size in [1024, 2021, 5000] { + println!("Encoding {num_rows} with a maximum size of {max_flight_data_size}"); + + let mut stream = FlightDataEncoderBuilder::new() + .with_max_flight_data_size(max_flight_data_size) + .build(futures::stream::iter([Ok(batch.clone())])); + + let mut i = 0; + while let Some(data) = stream.next().await.transpose().unwrap() { + let actual_data_size = flight_data_size(&data); + + let actual_overage = if actual_data_size > max_flight_data_size { + actual_data_size - max_flight_data_size + } else { + 0 + }; + + assert!( + actual_overage <= allowed_overage, + "encoded data[{i}]: actual size {actual_data_size}, \ + actual_overage: {actual_overage} \ + allowed_overage: {allowed_overage}" + ); + + i += 1; + + max_overage_seen = max_overage_seen.max(actual_overage) + } + } + + // ensure that the specified overage is exactly the maxmium so + // that when the splitting logic improves, the tests must be + // updated to reflect the better logic + assert_eq!( + allowed_overage, max_overage_seen, + "Specified overage was too high" + ); + } } From 39eeeaf62fbee0555c9a5ca925e2f277cba9cbbe Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Sat, 7 Jan 2023 06:18:41 +1100 Subject: [PATCH 0482/1411] Meaningful error message for map builder with null keys (#3450) * Meaningful error message for map builder with null keys * clippy --- arrow-array/src/array/map_array.rs | 11 ++-- arrow-array/src/builder/map_builder.rs | 86 ++++++++++++++------------ 2 files changed, 53 insertions(+), 44 deletions(-) diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index c3e6cf82248c..f2b9a87a21b9 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -110,11 +110,12 @@ impl From for ArrayData { impl MapArray { fn try_new_from_array_data(data: ArrayData) -> Result { - assert!( - matches!(data.data_type(), DataType::Map(_, _)), - "MapArray expected ArrayData with DataType::Map got {}", - data.data_type() - ); + if !matches!(data.data_type(), DataType::Map(_, _)) { + return Err(ArrowError::InvalidArgumentError(format!( + "MapArray expected ArrayData with DataType::Map got {}", + data.data_type() + ))); + } if data.buffers().len() != 1 { return Err(ArrowError::InvalidArgumentError( diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 0de89e7b73da..3c03a486c226 100644 --- 
a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -102,8 +102,7 @@ impl MapBuilder { capacity: usize, ) -> Self { let mut offsets_builder = BufferBuilder::::new(capacity + 1); - let len = 0; - offsets_builder.append(len); + offsets_builder.append(0); Self { offsets_builder, null_buffer_builder: NullBufferBuilder::new(capacity), @@ -143,56 +142,49 @@ impl MapBuilder { /// Builds the [`MapArray`] pub fn finish(&mut self) -> MapArray { let len = self.len(); - // Build the keys let keys_arr = self.key_builder.finish(); let values_arr = self.value_builder.finish(); - - let keys_field = Field::new( - self.field_names.key.as_str(), - keys_arr.data_type().clone(), - false, // always nullable - ); - let values_field = Field::new( - self.field_names.value.as_str(), - values_arr.data_type().clone(), - true, - ); - - let struct_array = - StructArray::from(vec![(keys_field, keys_arr), (values_field, values_arr)]); - let offset_buffer = self.offsets_builder.finish(); - let null_bit_buffer = self.null_buffer_builder.finish(); self.offsets_builder.append(0); - let map_field = Box::new(Field::new( - self.field_names.entry.as_str(), - struct_array.data_type().clone(), - false, // always non-nullable - )); - let array_data = ArrayData::builder(DataType::Map(map_field, false)) // TODO: support sorted keys - .len(len) - .add_buffer(offset_buffer) - .add_child_data(struct_array.into_data()) - .null_bit_buffer(null_bit_buffer); - - let array_data = unsafe { array_data.build_unchecked() }; + let null_bit_buffer = self.null_buffer_builder.finish(); - MapArray::from(array_data) + self.finish_helper(keys_arr, values_arr, offset_buffer, null_bit_buffer, len) } /// Builds the [`MapArray`] without resetting the builder. pub fn finish_cloned(&self) -> MapArray { let len = self.len(); - // Build the keys let keys_arr = self.key_builder.finish_cloned(); let values_arr = self.value_builder.finish_cloned(); + let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); + let null_bit_buffer = self + .null_buffer_builder + .as_slice() + .map(Buffer::from_slice_ref); + + self.finish_helper(keys_arr, values_arr, offset_buffer, null_bit_buffer, len) + } + + fn finish_helper( + &self, + keys_arr: Arc, + values_arr: Arc, + offset_buffer: Buffer, + null_bit_buffer: Option, + len: usize, + ) -> MapArray { + assert!( + keys_arr.null_count() == 0, + "Keys array must have no null values, found {} null value(s)", + keys_arr.null_count() + ); let keys_field = Field::new( self.field_names.key.as_str(), keys_arr.data_type().clone(), - false, // always nullable + false, // always non-nullable ); let values_field = Field::new( self.field_names.value.as_str(), @@ -203,11 +195,6 @@ impl MapBuilder { let struct_array = StructArray::from(vec![(keys_field, keys_arr), (values_field, values_arr)]); - let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); - let null_bit_buffer = self - .null_buffer_builder - .as_slice() - .map(Buffer::from_slice_ref); let map_field = Box::new(Field::new( self.field_names.entry.as_str(), struct_array.data_type().clone(), @@ -255,3 +242,24 @@ impl ArrayBuilder for MapBuilder { self } } + +#[cfg(test)] +mod tests { + use crate::builder::{Int32Builder, StringBuilder}; + + use super::*; + + #[test] + #[should_panic( + expected = "Keys array must have no null values, found 1 null value(s)" + )] + fn test_map_builder_with_null_keys_panics() { + let mut builder = + MapBuilder::new(None, StringBuilder::new(), Int32Builder::new()); + 
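// A null key is appended below, followed by a valid value and a completed
// entry; the final `finish()` call is expected to panic with the new message
// "Keys array must have no null values, found 1 null value(s)".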
builder.keys().append_null(); + builder.values().append_value(42); + builder.append(true).unwrap(); + + builder.finish(); + } +} From b39a20a6171d42e41d0b75c3a16989087880b3e4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 7 Jan 2023 09:07:06 +1300 Subject: [PATCH 0483/1411] Fix CSV infinite loop and improve error messages (#3470) * Fix CSV infinite loop and improve error messages * Doc --- arrow-csv/src/reader/records.rs | 85 +++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 21 deletions(-) diff --git a/arrow-csv/src/reader/records.rs b/arrow-csv/src/reader/records.rs index 711baa15278f..501da408815c 100644 --- a/arrow-csv/src/reader/records.rs +++ b/arrow-csv/src/reader/records.rs @@ -31,7 +31,7 @@ pub struct RecordReader { num_columns: usize, - num_rows: usize, + line_number: usize, offsets: Vec, data: Vec, } @@ -42,19 +42,21 @@ impl RecordReader { reader, delimiter, num_columns, - num_rows: 0, + line_number: 1, offsets: vec![], data: vec![], } } - fn fill_buf(&mut self, to_read: usize) -> Result<(), ArrowError> { + /// Clears and then fills the buffers on this [`RecordReader`] + /// returning the number of records read + fn fill_buf(&mut self, to_read: usize) -> Result { // Reserve sufficient capacity in offsets self.offsets.resize(to_read * self.num_columns + 1, 0); - self.num_rows = 0; + let mut read = 0; if to_read == 0 { - return Ok(()); + return Ok(0); } // The current offset into `self.data` @@ -71,7 +73,7 @@ impl RecordReader { 'input: loop { // Reserve necessary space in output data based on best estimate - let remaining_rows = to_read - self.num_rows; + let remaining_rows = to_read - read; let capacity = remaining_rows * self.num_columns * AVERAGE_FIELD_SIZE; let estimated_data = capacity.max(MIN_CAPACITY); self.data.resize(output_offset + estimated_data, 0); @@ -94,24 +96,26 @@ impl RecordReader { ReadRecordResult::InputEmpty => break 'input, // Input exhausted, need to read more ReadRecordResult::OutputFull => break, // Need to allocate more capacity ReadRecordResult::OutputEndsFull => { - return Err(ArrowError::CsvError(format!("incorrect number of fields, expected {} got more than {}", self.num_columns, field_count))) + let line_number = self.line_number + read; + return Err(ArrowError::CsvError(format!("incorrect number of fields for line {}, expected {} got more than {}", line_number, self.num_columns, field_count))); } ReadRecordResult::Record => { if field_count != self.num_columns { - return Err(ArrowError::CsvError(format!("incorrect number of fields, expected {} got {}", self.num_columns, field_count))) + let line_number = self.line_number + read; + return Err(ArrowError::CsvError(format!("incorrect number of fields for line {}, expected {} got {}", line_number, self.num_columns, field_count))); } - self.num_rows += 1; + read += 1; field_count = 0; - if self.num_rows == to_read { - break 'outer // Read sufficient rows + if read == to_read { + break 'outer; // Read sufficient rows } if input.len() == input_offset { // Input exhausted, need to read more // Without this read_record will interpret the empty input // byte array as indicating the end of the file - break 'input + break 'input; } } } @@ -135,28 +139,38 @@ impl RecordReader { }); }); - Ok(()) + self.line_number += read; + + Ok(read) } - /// Skips forward `to_skip` rows - pub fn skip(&mut self, mut to_skip: usize) -> Result<(), ArrowError> { + /// Skips forward `to_skip` rows, returning an error if insufficient lines in source + pub 
fn skip(&mut self, to_skip: usize) -> Result<(), ArrowError> { // TODO: This could be done by scanning for unquoted newline delimiters - while to_skip != 0 { - self.fill_buf(to_skip.min(1024))?; - to_skip -= self.num_rows; + let mut skipped = 0; + while to_skip > skipped { + let read = self.fill_buf(to_skip.min(1024))?; + if read == 0 { + return Err(ArrowError::CsvError(format!( + "Failed to skip {} rows only found {}", + to_skip, skipped + ))); + } + + skipped += read; } Ok(()) } /// Reads up to `to_read` rows from the reader pub fn read(&mut self, to_read: usize) -> Result, ArrowError> { - self.fill_buf(to_read)?; + let num_rows = self.fill_buf(to_read)?; // Need to slice fields to the actual number of rows read // // We intentionally avoid using `Vec::truncate` to avoid having // to re-initialize the data again - let num_fields = self.num_rows * self.num_columns; + let num_fields = num_rows * self.num_columns; let last_offset = self.offsets[num_fields]; // Need to truncate data to the actual amount of data read @@ -165,8 +179,8 @@ impl RecordReader { })?; Ok(StringRecords { + num_rows, num_columns: self.num_columns, - num_rows: self.num_rows, offsets: &self.offsets[..num_fields + 1], data, }) @@ -263,4 +277,33 @@ mod tests { }) } } + + #[test] + fn test_invalid_fields() { + let csv = "a,b\nb,c\na\n"; + let cursor = Cursor::new(csv.as_bytes()); + let mut reader = RecordReader::new(cursor, Reader::new(), 2); + let err = reader.read(4).unwrap_err().to_string(); + + let expected = + "Csv error: incorrect number of fields for line 3, expected 2 got 1"; + + assert_eq!(err, expected); + + // Test with initial skip + let cursor = Cursor::new(csv.as_bytes()); + let mut reader = RecordReader::new(cursor, Reader::new(), 2); + reader.skip(1).unwrap(); + let err = reader.read(4).unwrap_err().to_string(); + assert_eq!(err, expected); + } + + #[test] + fn test_skip_insufficient_rows() { + let csv = "a\nv\n"; + let cursor = Cursor::new(csv.as_bytes()); + let mut reader = RecordReader::new(cursor, Reader::new(), 1); + let err = reader.skip(3).unwrap_err().to_string(); + assert_eq!(err, "Csv error: Failed to skip 3 rows only found 2"); + } } From d325ad2d0f2b39972d1b299479d06c5fd9a41d42 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Fri, 6 Jan 2023 15:07:26 -0500 Subject: [PATCH 0484/1411] change `concat_batches` parameter to non owned reference (#3480) * change `concat_batches` parameter to non owned reference * provide backward compatibility Co-authored-by: ask --- arrow-select/src/concat.rs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index a1bb64be514d..7e28f1695509 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -104,10 +104,11 @@ pub fn concat(arrays: &[&dyn Array]) -> Result { } /// Concatenates `batches` together into a single record batch. 
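// With the reworked signature below, any iterator of batch references is
// accepted, e.g. `concat_batches(&schema, [&batch1, &batch2])`, while existing
// callers passing slices of owned batches such as `&[batch1, batch2]` keep
// compiling (see the updated tests further down).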
-pub fn concat_batches( +pub fn concat_batches<'a>( schema: &SchemaRef, - batches: &[RecordBatch], + input_batches: impl IntoIterator, ) -> Result { + let batches: Vec<&RecordBatch> = input_batches.into_iter().collect(); if batches.is_empty() { return Ok(RecordBatch::new_empty(schema.clone())); } @@ -611,10 +612,14 @@ mod tests { ], ) .unwrap(); - let new_batch = concat_batches(&schema, &[batch1, batch2]).unwrap(); + let new_batch = concat_batches(&schema, [&batch1, &batch2]).unwrap(); assert_eq!(new_batch.schema().as_ref(), schema.as_ref()); assert_eq!(2, new_batch.num_columns()); assert_eq!(4, new_batch.num_rows()); + let new_batch_owned = concat_batches(&schema, &[batch1, batch2]).unwrap(); + assert_eq!(new_batch_owned.schema().as_ref(), schema.as_ref()); + assert_eq!(2, new_batch_owned.num_columns()); + assert_eq!(4, new_batch_owned.num_rows()); } #[test] @@ -623,7 +628,7 @@ mod tests { Field::new("a", DataType::Int32, false), Field::new("b", DataType::Utf8, false), ])); - let batch = concat_batches(&schema, &[]).unwrap(); + let batch = concat_batches(&schema, []).unwrap(); assert_eq!(batch.schema().as_ref(), schema.as_ref()); assert_eq!(0, batch.num_rows()); } @@ -654,7 +659,7 @@ mod tests { ], ) .unwrap(); - let error = concat_batches(&schema1, &[batch1, batch2]).unwrap_err(); + let error = concat_batches(&schema1, [&batch1, &batch2]).unwrap_err(); assert_eq!( error.to_string(), "Invalid argument error: batches[1] schema is different with argument schema.", From ca7ea599d963a809c687f6aadc5729c452b11a29 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Fri, 6 Jan 2023 23:50:34 -0500 Subject: [PATCH 0485/1411] feat: `column_name` based index access for `RecordBatch` and `StructArray` (#3458) * feat: Add `column_name` based index access for `RecordBatch` and `StructArray` * change to simpler coding Co-authored-by: askoa --- arrow-array/src/array/struct_array.rs | 39 +++++++++++++++++++++++++- arrow-array/src/record_batch.rs | 40 +++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index bf6489c1380c..dc949c8e4269 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -20,7 +20,7 @@ use arrow_buffer::buffer::buffer_bin_or; use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; -use std::any::Any; +use std::{any::Any, ops::Index}; /// A nested array type where each child (called *field*) is represented by a separate /// array. @@ -296,6 +296,23 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray { } } +impl Index<&str> for StructArray { + type Output = ArrayRef; + + /// Get a reference to a column's array by name. + /// + /// Note: A schema can currently have duplicate field names, in which case + /// the first field will always be selected. + /// This issue will be addressed in [ARROW-11178](https://issues.apache.org/jira/browse/ARROW-11178) + /// + /// # Panics + /// + /// Panics if the name is not in the schema. + fn index(&self, name: &str) -> &Self::Output { + self.column_by_name(name).unwrap() + } +} + #[cfg(test)] mod tests { use super::*; @@ -352,6 +369,26 @@ mod tests { assert_eq!(0, struct_array.offset()); } + /// validates that struct can be accessed using `column_name` as index i.e. `struct_array["column_name"]`. 
+ #[test] + fn test_struct_array_index_access() { + let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true])); + let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31])); + + let struct_array = StructArray::from(vec![ + ( + Field::new("b", DataType::Boolean, false), + boolean.clone() as ArrayRef, + ), + ( + Field::new("c", DataType::Int32, false), + int.clone() as ArrayRef, + ), + ]); + assert_eq!(struct_array["b"].as_ref(), boolean.as_ref()); + assert_eq!(struct_array["c"].as_ref(), int.as_ref()); + } + /// validates that the in-memory representation follows [the spec](https://arrow.apache.org/docs/format/Columnar.html#struct-layout) #[test] fn test_struct_array_from_vec() { diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index ea0eb385358a..72b567f75a80 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -20,6 +20,7 @@ use crate::{new_empty_array, Array, ArrayRef, StructArray}; use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef}; +use std::ops::Index; use std::sync::Arc; /// Trait for types that can read `RecordBatch`'s. @@ -288,6 +289,13 @@ impl RecordBatch { &self.columns[index] } + /// Get a reference to a column's array by name. + pub fn column_by_name(&self, name: &str) -> Option<&ArrayRef> { + self.schema() + .column_with_name(name) + .map(|(index, _)| &self.columns[index]) + } + /// Get a reference to all columns in the record batch. pub fn columns(&self) -> &[ArrayRef] { &self.columns[..] @@ -473,6 +481,19 @@ impl From for StructArray { } } +impl Index<&str> for RecordBatch { + type Output = ArrayRef; + + /// Get a reference to a column's array by name. + /// + /// # Panics + /// + /// Panics if the name is not in the schema. + fn index(&self, name: &str) -> &Self::Output { + self.column_by_name(name).unwrap() + } +} + #[cfg(test)] mod tests { use super::*; @@ -746,6 +767,25 @@ mod tests { assert_eq!(batch1, batch2); } + /// validates if the record batch can be accessed using `column_name` as index i.e. `record_batch["column_name"]` + #[test] + fn record_batch_index_access() { + let id_arr = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); + let val_arr = Arc::new(Int32Array::from(vec![5, 6, 7, 8])); + let schema1 = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("val", DataType::Int32, false), + ]); + let record_batch = RecordBatch::try_new( + Arc::new(schema1), + vec![id_arr.clone(), val_arr.clone()], + ) + .unwrap(); + + assert_eq!(record_batch["id"].as_ref(), id_arr.as_ref()); + assert_eq!(record_batch["val"].as_ref(), val_arr.as_ref()); + } + #[test] fn record_batch_vals_ne() { let id_arr1 = Int32Array::from(vec![1, 2, 3, 4]); From c28d69aa5be2cce2d065300c9a79c4063589f300 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sat, 7 Jan 2023 06:09:35 -0500 Subject: [PATCH 0486/1411] Preserve empty list array elements in take kernel (#3473) * Update test_take_list to fail with empty list * Fix null_bit_buffer to match intended array comment says: [[0,null,0], [-1,-2,3], null, [5,null]] which implies null buffer of: 0b11111011 * Fix null_bit_buffer to match intended array comment says: [[0,null,0], [-1,-2,3], [null], [5,null]] which has not null values at the list level, or a null buffer of 0b11111011 * Compute null buffer in take_value_indices_from_list this way we can distinguish empty list elements from null elements. 
* clippy --- arrow-select/src/take.rs | 73 ++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 458fbdb66ef6..9fffa0b5f6de 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -691,27 +691,11 @@ where { // TODO: Some optimizations can be done here such as if it is // taking the whole list or a contiguous sublist - let (list_indices, offsets) = + let (list_indices, offsets, null_buf) = take_value_indices_from_list::(values, indices)?; let taken = take_impl::(values.values().as_ref(), &list_indices, None)?; - // determine null count and null buffer, which are a function of `values` and `indices` - let mut null_count = 0; - let num_bytes = bit_util::ceil(indices.len(), 8); - let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - { - let null_slice = null_buf.as_slice_mut(); - offsets[..].windows(2).enumerate().for_each( - |(i, window): (usize, &[OffsetType::Native])| { - if window[0] == window[1] { - // offsets are equal, slot is null - bit_util::unset_bit(null_slice, i); - null_count += 1; - } - }, - ); - } - let value_offsets = Buffer::from_slice_ref(&offsets); + let value_offsets = Buffer::from_slice_ref(offsets); // create a new list with taken data and computed null information let list_data = ArrayDataBuilder::new(values.data_type().clone()) .len(indices.len()) @@ -831,10 +815,18 @@ where /// Where a list array has indices `[0,2,5,10]`, taking indices of `[2,0]` returns /// an array of the indices `[5..10, 0..2]` and offsets `[0,5,7]` (5 elements and 2 /// elements) +#[allow(clippy::type_complexity)] fn take_value_indices_from_list( list: &GenericListArray, indices: &PrimitiveArray, -) -> Result<(PrimitiveArray, Vec), ArrowError> +) -> Result< + ( + PrimitiveArray, + Vec, + MutableBuffer, + ), + ArrowError, +> where IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, @@ -850,6 +842,12 @@ where let mut current_offset = OffsetType::Native::zero(); // add first offset new_offsets.push(OffsetType::Native::zero()); + + // Initialize null buffer + let num_bytes = bit_util::ceil(indices.len(), 8); + let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); + let null_slice = null_buf.as_slice_mut(); + // compute the value indices, and set offsets accordingly for i in 0..indices.len() { if indices.is_valid(i) { @@ -868,12 +866,20 @@ where values.push(Some(curr)); curr += num::One::one(); } + if !list.is_valid(ix) { + bit_util::unset_bit(null_slice, i); + } } else { + bit_util::unset_bit(null_slice, i); new_offsets.push(current_offset); } } - Ok((PrimitiveArray::::from(values), new_offsets)) + Ok(( + PrimitiveArray::::from(values), + new_offsets, + null_buf, + )) } /// Takes/filters a fixed size list array's inner data using the offsets of the list array. @@ -1519,12 +1525,12 @@ mod tests { macro_rules! 
test_take_list { ($offset_type:ty, $list_data_type:ident, $list_array_type:ident) => {{ - // Construct a value array, [[0,0,0], [-1,-2,-1], [2,3]] + // Construct a value array, [[0,0,0], [-1,-2,-1], [], [2,3]] let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 3]) .data() .clone(); // Construct offsets - let value_offsets: [$offset_type; 4] = [0, 3, 6, 8]; + let value_offsets: [$offset_type; 5] = [0, 3, 6, 6, 8]; let value_offsets = Buffer::from_slice_ref(&value_offsets); // Construct a list array from the above two let list_data_type = DataType::$list_data_type(Box::new(Field::new( @@ -1533,30 +1539,28 @@ mod tests { false, ))); let list_data = ArrayData::builder(list_data_type.clone()) - .len(3) + .len(4) .add_buffer(value_offsets) .add_child_data(value_data) .build() .unwrap(); let list_array = $list_array_type::from(list_data); - // index returns: [[2,3], null, [-1,-2,-1], [2,3], [0,0,0]] - let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(2), Some(0)]); + // index returns: [[2,3], null, [-1,-2,-1], [], [0,0,0]] + let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(2), Some(0)]); let a = take(&list_array, &index, None).unwrap(); let a: &$list_array_type = a.as_any().downcast_ref::<$list_array_type>().unwrap(); // construct a value array with expected results: - // [[2,3], null, [-1,-2,-1], [2,3], [0,0,0]] + // [[2,3], null, [-1,-2,-1], [], [0,0,0]] let expected_data = Int32Array::from(vec![ Some(2), Some(3), Some(-1), Some(-2), Some(-1), - Some(2), - Some(3), Some(0), Some(0), Some(0), @@ -1564,7 +1568,7 @@ mod tests { .data() .clone(); // construct offsets - let expected_offsets: [$offset_type; 6] = [0, 2, 2, 5, 7, 10]; + let expected_offsets: [$offset_type; 6] = [0, 2, 2, 5, 5, 8]; let expected_offsets = Buffer::from_slice_ref(&expected_offsets); // construct list array from the two let expected_list_data = ArrayData::builder(list_data_type) @@ -1609,7 +1613,7 @@ mod tests { let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) - .null_bit_buffer(Some(Buffer::from([0b10111101, 0b00000000]))) + .null_bit_buffer(Some(Buffer::from([0b11111111]))) .add_child_data(value_data) .build() .unwrap(); @@ -1682,7 +1686,7 @@ mod tests { let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) - .null_bit_buffer(Some(Buffer::from([0b01111101]))) + .null_bit_buffer(Some(Buffer::from([0b11111011]))) .add_child_data(value_data) .build() .unwrap(); @@ -2057,10 +2061,12 @@ mod tests { ]); let indices = UInt32Array::from(vec![2, 0]); - let (indexed, offsets) = take_value_indices_from_list(&list, &indices).unwrap(); + let (indexed, offsets, null_buf) = + take_value_indices_from_list(&list, &indices).unwrap(); assert_eq!(indexed, Int32Array::from(vec![5, 6, 7, 8, 9, 0, 1])); assert_eq!(offsets, vec![0, 5, 7]); + assert_eq!(null_buf.as_slice(), &[0b11111111]); } #[test] @@ -2072,11 +2078,12 @@ mod tests { ]); let indices = UInt32Array::from(vec![2, 0]); - let (indexed, offsets) = + let (indexed, offsets, null_buf) = take_value_indices_from_list::<_, Int64Type>(&list, &indices).unwrap(); assert_eq!(indexed, Int64Array::from(vec![5, 6, 7, 8, 9, 0, 1])); assert_eq!(offsets, vec![0, 5, 7]); + assert_eq!(null_buf.as_slice(), &[0b11111111]); } #[test] From 8492c27dfb6840e94843b0b2bb8de484280b6c5d Mon Sep 17 00:00:00 2001 From: "Valeriy V. 
Vorotyntsev" Date: Sat, 7 Jan 2023 18:16:49 +0200 Subject: [PATCH 0487/1411] [doc] Fix broken URLs (#3486) * [doc] Fix broken URLs Use proper syntax when [linking to items by name]. Before: https://docs.rs/arrow-array/latest/arrow_array/iterator/%5Bcrate::PrimitiveArray%5D After: https://docs.rs/arrow-array/latest/arrow_array/array/struct.PrimitiveArray.html [linking to items by name]: https://doc.rust-lang.org/rustdoc/write-documentation/linking-to-items-by-name.html * [doc] Use proper identifiers arrow-array: - `DecimalArray` is undefined. Use `PrimitiveArray` instead. - `arrow` crate is not among `arrow-array`'s dependencies, so its items cannot be referred to using ['intra-doc link'] syntax. ['intra-doc link']: https://doc.rust-lang.org/rustdoc/write-documentation/linking-to-items-by-name.html --- arrow-array/src/iterator.rs | 6 +++--- arrow-array/src/types.rs | 8 ++++---- object_store/src/aws/mod.rs | 4 ++-- parquet/src/arrow/arrow_reader/filter.rs | 2 +- parquet/src/arrow/arrow_reader/mod.rs | 6 +++--- parquet/src/arrow/arrow_reader/selection.rs | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index e7c5e8367e23..ff99233129cf 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -39,9 +39,9 @@ use crate::array::{ /// there are more efficient ways to iterate over just the non-null indices, this functionality /// is provided by [`compute::try_unary`] /// -/// [`PrimitiveArray`]: [crate::PrimitiveArray] -/// [`compute::unary`]: [arrow::compute::unary] -/// [`compute::try_unary`]: [arrow::compute::try_unary] +/// [`PrimitiveArray`]: crate::PrimitiveArray +/// [`compute::unary`]: https://docs.rs/arrow/latest/arrow/compute/fn.unary.html +/// [`compute::try_unary`]: https://docs.rs/arrow/latest/arrow/compute/fn.try_unary.html #[derive(Debug)] pub struct ArrayIter { array: T, diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 25c047a11d35..e7d92d2d08f9 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -559,15 +559,15 @@ mod decimal { impl DecimalTypeSealed for Decimal256Type {} } -/// A trait over the decimal types, used by [`DecimalArray`] to provide a generic +/// A trait over the decimal types, used by [`PrimitiveArray`] to provide a generic /// implementation across the various decimal types /// /// Implemented by [`Decimal128Type`] and [`Decimal256Type`] for [`Decimal128Array`] /// and [`Decimal256Array`] respectively /// -/// [`DecimalArray`]: [crate::array::DecimalArray] -/// [`Decimal128Array`]: [crate::array::Decimal128Array] -/// [`Decimal256Array`]: [crate::array::Decimal256Array] +/// [`PrimitiveArray`]: crate::array::PrimitiveArray +/// [`Decimal128Array`]: crate::array::Decimal128Array +/// [`Decimal256Array`]: crate::array::Decimal256Array pub trait DecimalType: 'static + Send + Sync + ArrowPrimitiveType + decimal::DecimalTypeSealed { diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 4b633d9f5d24..20174692fb5e 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -814,8 +814,8 @@ impl AmazonS3Builder { /// /// This option has no effect if not using instance credentials /// - /// [IMDSv2]: [https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html] - /// [SSRF attack]: [https://aws.amazon.com/blogs/security/defense-in-depth-open-firewalls-reverse-proxies-ssrf-vulnerabilities-ec2-instance-metadata-service/] + /// [IMDSv2]: 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html + /// [SSRF attack]: https://aws.amazon.com/blogs/security/defense-in-depth-open-firewalls-reverse-proxies-ssrf-vulnerabilities-ec2-instance-metadata-service/ /// pub fn with_imdsv1_fallback(mut self) -> Self { self.imdsv1_fallback = true; diff --git a/parquet/src/arrow/arrow_reader/filter.rs b/parquet/src/arrow/arrow_reader/filter.rs index cbded9a6f420..ea529b74f610 100644 --- a/parquet/src/arrow/arrow_reader/filter.rs +++ b/parquet/src/arrow/arrow_reader/filter.rs @@ -94,7 +94,7 @@ where /// leaves 99% of the rows, it may be better to not filter the data from parquet and /// apply the filter after the RecordBatch has been fully decoded. /// -/// [`RowSelection`]: [super::selection::RowSelection] +/// [`RowSelection`]: crate::arrow::arrow_reader::RowSelection pub struct RowFilter { /// A list of [`ArrowPredicate`] pub(crate) predicates: Vec>, diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index df38e554f9db..312f0140769c 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -50,7 +50,7 @@ pub use selection::{RowSelection, RowSelector}; /// * For a synchronous API - [`ParquetRecordBatchReaderBuilder`] /// * For an asynchronous API - [`ParquetRecordBatchStreamBuilder`] /// -/// [`ParquetRecordBatchStreamBuilder`]: [crate::arrow::async_reader::ParquetRecordBatchStreamBuilder] +/// [`ParquetRecordBatchStreamBuilder`]: crate::arrow::async_reader::ParquetRecordBatchStreamBuilder pub struct ArrowReaderBuilder { pub(crate) input: T, @@ -150,7 +150,7 @@ impl ArrowReaderBuilder { /// An example use case of this would be applying a selection determined by /// evaluating predicates against the [`Index`] /// - /// [`Index`]: [parquet::file::page_index::index::Index] + /// [`Index`]: crate::file::page_index::index::Index pub fn with_row_selection(self, selection: RowSelection) -> Self { Self { selection: Some(selection), @@ -238,7 +238,7 @@ impl ArrowReaderOptions { /// Set this true to enable decoding of the [PageIndex] if present. This can be used /// to push down predicates to the parquet scan, potentially eliminating unnecessary IO /// - /// [PageIndex]: [https://github.com/apache/parquet-format/blob/master/PageIndex.md] + /// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub fn with_page_index(self, page_index: bool) -> Self { Self { page_index, ..self } } diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 487e95fcd831..03c7e01e0840 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -83,7 +83,7 @@ impl RowSelector { /// assert_eq!(actual, expected); /// ``` /// -/// [`PageIndex`]: [crate::file::page_index::index::PageIndex] +/// [`PageIndex`]: crate::file::page_index::index::PageIndex #[derive(Debug, Clone, Default, Eq, PartialEq)] pub struct RowSelection { selectors: Vec, From c74665808439cb7020fb1cfb74b376a136c73259 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Sat, 7 Jan 2023 20:08:22 +0200 Subject: [PATCH 0488/1411] Fixes a broken link in the `arrow` lib.rs rustdoc (#3487) --- arrow/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index cee4f886cf9c..64e5d6a2cd3d 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -31,7 +31,7 @@ //! //! The current list of sub-crates is: //! -//! 
* [`arrow-arith][arrow_arith] - arithmetic kernels +//! * [`arrow-arith`][arrow_arith] - arithmetic kernels //! * [`arrow-array`][arrow_array] - type-safe arrow array abstractions //! * [`arrow-buffer`][arrow_buffer] - buffer abstractions for arrow arrays //! * [`arrow-cast`][arrow_cast] - cast kernels for arrow arrays From eae993fd196d0a8df8a90857bc4a7ae8f5a3e845 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Mon, 9 Jan 2023 04:25:29 -0600 Subject: [PATCH 0489/1411] feat: Allow providing a service account key directly for GCS (#3489) * feat: Allow providing a service account key directly for GCP Use case: We're storing service accounts keys external to where the object store client is being created. We do not want to have to write the key to a file before creating the object store client. This change allows for providing the key directly. * Add additional aliases for specifying service account path "google_service_account_path" and "service_account_path" can now be used. * Add test asserting aliases set appropriate config option --- object_store/src/gcp/mod.rs | 144 ++++++++++++++++++++++++++++++++---- 1 file changed, 128 insertions(+), 16 deletions(-) diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 177812fa8930..28972c4a6636 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -121,8 +121,13 @@ enum Error { #[snafu(display("Missing bucket name"))] MissingBucketName {}, - #[snafu(display("Missing service account path"))] - MissingServiceAccountPath, + #[snafu(display("Missing service account path or key"))] + MissingServiceAccountPathOrKey, + + #[snafu(display( + "One of service account path or service account key may be provided." + ))] + ServiceAccountPathAndKeyProvided, #[snafu(display("GCP credential error: {}", source))] Credential { source: credential::Error }, @@ -800,14 +805,15 @@ pub struct GoogleCloudStorageBuilder { bucket_name: Option, url: Option, service_account_path: Option, + service_account_key: Option, retry_config: RetryConfig, client_options: ClientOptions, } /// Configuration keys for [`GoogleCloudStorageBuilder`] /// -/// Configuration via keys can be dome via the [`try_with_option`](GoogleCloudStorageBuilder::try_with_option) -/// or [`with_options`](GoogleCloudStorageBuilder::try_with_options) methods on the builder. +/// Configuration via keys can be done via the [`try_with_option`](GoogleCloudStorageBuilder::try_with_option) +/// or [`try_with_options`](GoogleCloudStorageBuilder::try_with_options) methods on the builder. /// /// # Example /// ``` @@ -835,8 +841,17 @@ pub enum GoogleConfigKey { /// Supported keys: /// - `google_service_account` /// - `service_account` + /// - `google_service_account_path` + /// - `service_account_path` ServiceAccount, + /// The serialized service account key. + /// + /// Supported keys: + /// - `google_service_account_key` + /// - `service_account_key` + ServiceAccountKey, + /// Bucket name /// /// See [`GoogleCloudStorageBuilder::with_bucket_name`] for details. 
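// [editor's note: illustrative sketch, not part of the patch] The builder method
// names below follow the additions in this commit; the bucket name and the `key`
// contents are placeholders. With the new `ServiceAccountKey` option, a serialized
// service account key can be passed directly instead of a path to a key file:

use object_store::gcp::GoogleCloudStorageBuilder;

fn gcs_from_key(key: &str) -> Result<(), Box<dyn std::error::Error>> {
    let _gcs = GoogleCloudStorageBuilder::new()
        .with_service_account_key(key)   // new in this commit
        .with_bucket_name("my-bucket")   // placeholder bucket name
        .build()?;
    Ok(())
}

// Supplying both a key and a path is rejected with the new
// `ServiceAccountPathAndKeyProvided` error, as the `build()` change further down shows.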
@@ -853,6 +868,7 @@ impl AsRef for GoogleConfigKey { fn as_ref(&self) -> &str { match self { Self::ServiceAccount => "google_service_account", + Self::ServiceAccountKey => "google_service_account_key", Self::Bucket => "google_bucket", } } @@ -863,7 +879,13 @@ impl FromStr for GoogleConfigKey { fn from_str(s: &str) -> Result { match s { - "google_service_account" | "service_account" => Ok(Self::ServiceAccount), + "google_service_account" + | "service_account" + | "google_service_account_path" + | "service_account_path" => Ok(Self::ServiceAccount), + "google_service_account_key" | "service_account_key" => { + Ok(Self::ServiceAccountKey) + } "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => { Ok(Self::Bucket) } @@ -877,6 +899,7 @@ impl Default for GoogleCloudStorageBuilder { Self { bucket_name: None, service_account_path: None, + service_account_key: None, retry_config: Default::default(), client_options: ClientOptions::new().with_allow_http(true), url: None, @@ -894,13 +917,17 @@ impl GoogleCloudStorageBuilder { /// /// Variables extracted from environment: /// * GOOGLE_SERVICE_ACCOUNT: location of service account file + /// * GOOGLE_SERVICE_ACCOUNT_PATH: (alias) location of service account file /// * SERVICE_ACCOUNT: (alias) location of service account file + /// * GOOGLE_SERVICE_ACCOUNT_KEY: JSON serialized service account key + /// * GOOGLE_BUCKET: bucket name + /// * GOOGLE_BUCKET_NAME: (alias) bucket name /// /// # Example /// ``` /// use object_store::gcp::GoogleCloudStorageBuilder; /// - /// let azure = GoogleCloudStorageBuilder::from_env() + /// let gcs = GoogleCloudStorageBuilder::from_env() /// .with_bucket_name("foo") /// .build(); /// ``` @@ -957,6 +984,9 @@ impl GoogleCloudStorageBuilder { GoogleConfigKey::ServiceAccount => { self.service_account_path = Some(value.into()) } + GoogleConfigKey::ServiceAccountKey => { + self.service_account_key = Some(value.into()) + } GoogleConfigKey::Bucket => self.bucket_name = Some(value.into()), }; Ok(self) @@ -1001,8 +1031,12 @@ impl GoogleCloudStorageBuilder { self } - /// Set the path to the service account file (required). Example - /// `"/tmp/gcs.json"` + /// Set the path to the service account file. + /// + /// This or [`GoogleCloudStorageBuilder::with_service_account_key`] must be + /// set. + /// + /// Example `"/tmp/gcs.json"`. /// /// Example contents of `gcs.json`: /// @@ -1022,6 +1056,19 @@ impl GoogleCloudStorageBuilder { self } + /// Set the service account key. The service account must be in the JSON + /// format. + /// + /// This or [`GoogleCloudStorageBuilder::with_service_account_path`] must be + /// set. + pub fn with_service_account_key( + mut self, + service_account: impl Into, + ) -> Self { + self.service_account_key = Some(service_account.into()); + self + } + /// Set the retry configuration pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { self.retry_config = retry_config; @@ -1048,12 +1095,19 @@ impl GoogleCloudStorageBuilder { } let bucket_name = self.bucket_name.ok_or(Error::MissingBucketName {})?; - let service_account_path = self - .service_account_path - .ok_or(Error::MissingServiceAccountPath)?; let client = self.client_options.client()?; - let credentials = reader_credentials_file(service_account_path)?; + + let credentials = match (self.service_account_path, self.service_account_key) { + (Some(path), None) => reader_credentials_file(path)?, + (None, Some(key)) => { + serde_json::from_str(&key).context(DecodeCredentialsSnafu)? 
+ } + (None, None) => return Err(Error::MissingServiceAccountPathOrKey.into()), + (Some(_), Some(_)) => { + return Err(Error::ServiceAccountPathAndKeyProvided.into()) + } + }; // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes let scope = "https://www.googleapis.com/auth/devstorage.full_control"; @@ -1110,6 +1164,8 @@ mod test { use bytes::Bytes; use std::collections::HashMap; use std::env; + use std::io::Write; + use tempfile::NamedTempFile; use crate::{ tests::{ @@ -1121,6 +1177,7 @@ mod test { use super::*; + const FAKE_KEY: &str = r#"{"private_key": "private_key", "client_email":"client_email", "disable_oauth":true}"#; const NON_EXISTENT_NAME: &str = "nonexistentname"; // Helper macro to skip tests if TEST_INTEGRATION and the GCP environment variables are not set. @@ -1278,11 +1335,8 @@ mod test { #[tokio::test] async fn gcs_test_proxy_url() { - use std::io::Write; - use tempfile::NamedTempFile; let mut tfile = NamedTempFile::new().unwrap(); - let creds = r#"{"private_key": "private_key", "client_email":"client_email", "disable_oauth":true}"#; - write!(tfile, "{}", creds).unwrap(); + write!(tfile, "{}", FAKE_KEY).unwrap(); let service_account_path = tfile.path(); let gcs = GoogleCloudStorageBuilder::new() .with_service_account_path(service_account_path.to_str().unwrap()) @@ -1318,6 +1372,27 @@ mod test { } } + #[test] + fn gcs_test_service_account_key_only() { + let _ = GoogleCloudStorageBuilder::new() + .with_service_account_key(FAKE_KEY) + .with_bucket_name("foo") + .build() + .unwrap(); + } + + #[test] + fn gcs_test_service_account_key_and_path() { + let mut tfile = NamedTempFile::new().unwrap(); + write!(tfile, "{}", FAKE_KEY).unwrap(); + let _ = GoogleCloudStorageBuilder::new() + .with_service_account_key(FAKE_KEY) + .with_service_account_path(tfile.path().to_str().unwrap()) + .with_bucket_name("foo") + .build() + .unwrap_err(); + } + #[test] fn gcs_test_config_from_map() { let google_service_account = "object_store:fake_service_account".to_string(); @@ -1371,4 +1446,41 @@ mod test { let builder = GoogleCloudStorageBuilder::new().try_with_options(&options); assert!(builder.is_err()); } + + #[test] + fn gcs_test_config_aliases() { + // Service account path + for alias in [ + "google_service_account", + "service_account", + "google_service_account_path", + "service_account_path", + ] { + let builder = GoogleCloudStorageBuilder::new() + .try_with_options([(alias, "/fake/path.json")]) + .unwrap(); + assert_eq!("/fake/path.json", builder.service_account_path.unwrap()); + } + + // Service account key + for alias in ["google_service_account_key", "service_account_key"] { + let builder = GoogleCloudStorageBuilder::new() + .try_with_options([(alias, FAKE_KEY)]) + .unwrap(); + assert_eq!(FAKE_KEY, builder.service_account_key.unwrap()); + } + + // Bucket name + for alias in [ + "google_bucket", + "google_bucket_name", + "bucket", + "bucket_name", + ] { + let builder = GoogleCloudStorageBuilder::new() + .try_with_options([(alias, "fake_bucket")]) + .unwrap(); + assert_eq!("fake_bucket", builder.bucket_name.unwrap()); + } + } } From 592d7a3601b1b7876ab5753abde66113f1a9dc23 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Mon, 9 Jan 2023 11:57:52 +0100 Subject: [PATCH 0490/1411] feat: add `parquet-rewrite` CLI (#3477) * feat: add `parquet-rewrite` CLI Closes #3476. 
* refactor: init ArrowWriter early --- parquet/Cargo.toml | 4 + parquet/src/bin/parquet-rewrite.rs | 293 +++++++++++++++++++++++++++++ 2 files changed, 297 insertions(+) create mode 100644 parquet/src/bin/parquet-rewrite.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 7a76ff64e519..2aa7449787b1 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -118,6 +118,10 @@ required-features = ["arrow"] name = "parquet-read" required-features = ["cli"] +[[bin]] +name = "parquet-rewrite" +required-features = ["arrow", "cli"] + [[bin]] name = "parquet-schema" required-features = ["cli"] diff --git a/parquet/src/bin/parquet-rewrite.rs b/parquet/src/bin/parquet-rewrite.rs new file mode 100644 index 000000000000..cd60225cad84 --- /dev/null +++ b/parquet/src/bin/parquet-rewrite.rs @@ -0,0 +1,293 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Binary file to rewrite parquet files. +//! +//! # Install +//! +//! `parquet-rewrite` can be installed using `cargo`: +//! ``` +//! cargo install parquet --features=cli +//! ``` +//! After this `parquet-rewrite` should be available: +//! ``` +//! parquet-rewrite -i XYZ.parquet -o XYZ2.parquet +//! ``` +//! +//! The binary can also be built from the source code and run as follows: +//! ``` +//! cargo run --features=cli --bin parquet-rewrite -- -i XYZ.parquet -o XYZ2.parquet +//! ``` + +use std::fs::File; + +use arrow_array::RecordBatchReader; +use clap::{builder::PossibleValue, Parser, ValueEnum}; +use parquet::{ + arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter}, + basic::Compression, + file::{ + properties::{EnabledStatistics, WriterProperties, WriterVersion}, + reader::FileReader, + serialized_reader::SerializedFileReader, + }, +}; + +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)] +enum CompressionArgs { + /// No compression. 
+ None, + + /// Snappy + Snappy, + + /// GZip + Gzip, + + /// LZO + Lzo, + + /// Brotli + Brotli, + + /// LZ4 + Lz4, + + /// Zstd + Zstd, + + /// LZ4 Raw + Lz4Raw, +} + +impl From for Compression { + fn from(value: CompressionArgs) -> Self { + match value { + CompressionArgs::None => Self::UNCOMPRESSED, + CompressionArgs::Snappy => Self::SNAPPY, + CompressionArgs::Gzip => Self::GZIP, + CompressionArgs::Lzo => Self::LZO, + CompressionArgs::Brotli => Self::BROTLI, + CompressionArgs::Lz4 => Self::LZ4, + CompressionArgs::Zstd => Self::ZSTD, + CompressionArgs::Lz4Raw => Self::LZ4_RAW, + } + } +} + +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Debug)] +enum EnabledStatisticsArgs { + /// Compute no statistics + None, + + /// Compute chunk-level statistics but not page-level + Chunk, + + /// Compute page-level and chunk-level statistics + Page, +} + +impl From for EnabledStatistics { + fn from(value: EnabledStatisticsArgs) -> Self { + match value { + EnabledStatisticsArgs::None => Self::None, + EnabledStatisticsArgs::Chunk => Self::Chunk, + EnabledStatisticsArgs::Page => Self::Page, + } + } +} + +#[derive(Clone, Copy, Debug)] +enum WriterVersionArgs { + Parquet1_0, + Parquet2_0, +} + +impl ValueEnum for WriterVersionArgs { + fn value_variants<'a>() -> &'a [Self] { + &[Self::Parquet1_0, Self::Parquet2_0] + } + + fn to_possible_value(&self) -> Option { + match self { + WriterVersionArgs::Parquet1_0 => Some(PossibleValue::new("1.0")), + WriterVersionArgs::Parquet2_0 => Some(PossibleValue::new("2.0")), + } + } +} + +impl From for WriterVersion { + fn from(value: WriterVersionArgs) -> Self { + match value { + WriterVersionArgs::Parquet1_0 => Self::PARQUET_1_0, + WriterVersionArgs::Parquet2_0 => Self::PARQUET_2_0, + } + } +} + +#[derive(Debug, Parser)] +#[clap(author, version, about("Read and write parquet file with potentially different settings"), long_about = None)] +struct Args { + /// Path to input parquet file. + #[clap(short, long)] + input: String, + + /// Path to output parquet file. + #[clap(short, long)] + output: String, + + /// Compression used. + #[clap(long, value_enum)] + compression: Option, + + /// Sets maximum number of rows in a row group. + #[clap(long)] + max_row_group_size: Option, + + /// Sets best effort maximum number of rows in a data page. + #[clap(long)] + data_page_row_count_limit: Option, + + /// Sets best effort maximum size of a data page in bytes. + #[clap(long)] + data_pagesize_limit: Option, + + /// Sets max statistics size for any column. + /// + /// Applicable only if statistics are enabled. + #[clap(long)] + max_statistics_size: Option, + + /// Sets best effort maximum dictionary page size, in bytes. + #[clap(long)] + dictionary_pagesize_limit: Option, + + /// Sets whether bloom filter is enabled for any column. + #[clap(long)] + bloom_filter_enabled: Option, + + /// Sets bloom filter false positive probability (fpp) for any column. + #[clap(long)] + bloom_filter_fpp: Option, + + /// Sets number of distinct values (ndv) for bloom filter for any column. + #[clap(long)] + bloom_filter_ndv: Option, + + /// Sets flag to enable/disable dictionary encoding for any column. + #[clap(long)] + dictionary_enabled: Option, + + /// Sets flag to enable/disable statistics for any column. + #[clap(long)] + statistics_enabled: Option, + + /// Sets writer version. 
+ #[clap(long)] + writer_version: Option, +} + +fn main() { + let args = Args::parse(); + + // read key-value metadata + let parquet_reader = SerializedFileReader::new( + File::open(&args.input).expect("Unable to open input file"), + ) + .expect("Failed to create reader"); + let kv_md = parquet_reader + .metadata() + .file_metadata() + .key_value_metadata() + .cloned(); + + // create actual parquet reader + let parquet_reader = ParquetRecordBatchReaderBuilder::try_new( + File::open(args.input).expect("Unable to open input file"), + ) + .expect("parquet open") + .build() + .expect("parquet open"); + + let mut writer_properties_builder = + WriterProperties::builder().set_key_value_metadata(kv_md); + if let Some(value) = args.compression { + writer_properties_builder = + writer_properties_builder.set_compression(value.into()); + } + if let Some(value) = args.max_row_group_size { + writer_properties_builder = + writer_properties_builder.set_max_row_group_size(value); + } + if let Some(value) = args.data_page_row_count_limit { + writer_properties_builder = + writer_properties_builder.set_data_page_row_count_limit(value); + } + if let Some(value) = args.data_pagesize_limit { + writer_properties_builder = + writer_properties_builder.set_data_pagesize_limit(value); + } + if let Some(value) = args.dictionary_pagesize_limit { + writer_properties_builder = + writer_properties_builder.set_dictionary_pagesize_limit(value); + } + if let Some(value) = args.max_statistics_size { + writer_properties_builder = + writer_properties_builder.set_max_statistics_size(value); + } + if let Some(value) = args.bloom_filter_enabled { + writer_properties_builder = + writer_properties_builder.set_bloom_filter_enabled(value); + + if value { + if let Some(value) = args.bloom_filter_fpp { + writer_properties_builder = + writer_properties_builder.set_bloom_filter_fpp(value); + } + if let Some(value) = args.bloom_filter_ndv { + writer_properties_builder = + writer_properties_builder.set_bloom_filter_ndv(value); + } + } + } + if let Some(value) = args.dictionary_enabled { + writer_properties_builder = + writer_properties_builder.set_dictionary_enabled(value); + } + if let Some(value) = args.statistics_enabled { + writer_properties_builder = + writer_properties_builder.set_statistics_enabled(value.into()); + } + if let Some(value) = args.writer_version { + writer_properties_builder = + writer_properties_builder.set_writer_version(value.into()); + } + let writer_properties = writer_properties_builder.build(); + let mut parquet_writer = ArrowWriter::try_new( + File::create(&args.output).expect("Unable to open output file"), + parquet_reader.schema(), + Some(writer_properties), + ) + .expect("create arrow writer"); + + for maybe_batch in parquet_reader { + let batch = maybe_batch.expect("reading batch"); + parquet_writer.write(&batch).expect("writing data"); + } + + parquet_writer.close().expect("finalizing file"); +} From fb36dd980b398deabe5547af114982326b97e078 Mon Sep 17 00:00:00 2001 From: Wenjun L <47608857+csphile@users.noreply.github.com> Date: Mon, 9 Jan 2023 22:59:07 +0100 Subject: [PATCH 0491/1411] Fix: Added support to cast string without time (#3494) * Fix: Added support casting strings without time to timestamp * Fix: Added support casting strings without time to timestamp Co-authored-by: Wenjun Liu --- arrow-cast/src/parse.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 6de336351426..e885ec5b67a8 100644 --- a/arrow-cast/src/parse.rs 
+++ b/arrow-cast/src/parse.rs @@ -37,6 +37,7 @@ use chrono::prelude::*; /// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified /// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset /// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds +/// * `1997-01-31` # close to RCF3339, only date no time // /// Internally, this function uses the `chrono` library for the /// datetime parsing @@ -121,6 +122,14 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { return Ok(ts.timestamp_nanos()); } + // without a timezone specifier as a local time, only date + // Example: 2020-09-08 + if let Ok(dt) = NaiveDate::parse_from_str(s, "%Y-%m-%d") { + if let Some(ts) = dt.and_hms_opt(0, 0, 0) { + return Ok(ts.timestamp_nanos()); + } + } + // Note we don't pass along the error message from the underlying // chrono parsing because we tried several different format // strings and we don't know which the user was trying to @@ -494,6 +503,19 @@ mod tests { naive_datetime_whole_secs.timestamp_nanos(), parse_timestamp("2020-09-08 13:42:29").unwrap() ); + + // ensure without time work + // no time, should be the nano second at + // 2020-09-08 0:0:0 + let naive_datetime_no_time = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_opt(0, 0, 0).unwrap(), + ); + + assert_eq!( + naive_datetime_no_time.timestamp_nanos(), + parse_timestamp("2020-09-08").unwrap() + ) } #[test] From cada9ba33803a48a3145ab333fe1cf6410999d89 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Tue, 10 Jan 2023 13:55:07 +0100 Subject: [PATCH 0492/1411] Fix IPCWriter for Sliced BooleanArray (#3498) * fix: bool IPC Fixes #3496. * refactor: simplify code * refactor: `assert!` -> `assert_eq!` --- arrow-ipc/src/writer.rs | 73 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index ed5e53a959c0..d7cc83aabddb 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1202,7 +1202,7 @@ fn write_array_data( ) { // Truncate values - assert!(array_data.buffers().len() == 1); + assert_eq!(array_data.buffers().len(), 1); let buffer = &array_data.buffers()[0]; let layout = layout(data_type); @@ -1231,6 +1231,14 @@ fn write_array_data( compression_codec, )?; } + } else if matches!(data_type, DataType::Boolean) { + // Bools are special because the payload (= 1 bit) is smaller than the physical container elements (= bytes). + // The array data may not start at the physical boundary of the underlying buffer, so we need to shift bits around. 
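// [editor's note: explanatory comment, not part of the original patch] For example, a
// BooleanArray sliced with offset 3 and length 10 stores its first logical value in
// bit 3 of byte 0; `bit_slice(3, 10)` copies those bits into a fresh buffer that starts
// at bit 0 again, so the written IPC payload no longer depends on the slice offset.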
+ assert_eq!(array_data.buffers().len(), 1); + + let buffer = &array_data.buffers()[0]; + let buffer = buffer.bit_slice(array_data.offset(), array_data.len()); + offset = write_buffer(&buffer, buffers, arrow_data, offset, compression_codec)?; } else { for buffer in array_data.buffers() { offset = @@ -1312,6 +1320,7 @@ fn pad_to_8(len: u32) -> usize { mod tests { use super::*; + use std::io::Cursor; use std::io::Seek; use std::sync::Arc; @@ -1926,4 +1935,66 @@ mod tests { read_array.iter().collect::>() ); } + + #[test] + fn encode_bools_slice() { + // Test case for https://github.com/apache/arrow-rs/issues/3496 + assert_bool_roundtrip([true, false], 1, 1); + + // slice somewhere in the middle + assert_bool_roundtrip( + [ + true, false, true, true, false, false, true, true, true, false, false, + false, true, true, true, true, false, false, false, false, true, true, + true, true, true, false, false, false, false, false, + ], + 13, + 17, + ); + + // start at byte boundary, end in the middle + assert_bool_roundtrip( + [ + true, false, true, true, false, false, true, true, true, false, false, + false, + ], + 8, + 2, + ); + + // start and stop and byte boundary + assert_bool_roundtrip( + [ + true, false, true, true, false, false, true, true, true, false, false, + false, true, true, true, true, true, false, false, false, false, false, + ], + 8, + 8, + ); + } + + fn assert_bool_roundtrip( + bools: [bool; N], + offset: usize, + length: usize, + ) { + let val_bool_field = Field::new("val", DataType::Boolean, false); + + let schema = Arc::new(Schema::new(vec![val_bool_field])); + + let bools = BooleanArray::from(bools.to_vec()); + + let batch = + RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(bools)]).unwrap(); + let batch = batch.slice(offset, length); + + let mut writer = StreamWriter::try_new(Vec::::new(), &schema).unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); + let data = writer.into_inner().unwrap(); + + let mut reader = StreamReader::try_new(Cursor::new(data), None).unwrap(); + let batch2 = reader.next().unwrap().unwrap(); + assert_eq!(batch, batch2); + } } From e8cc351af662515f7ff9e25b6eb1e609f89b6bc8 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 10 Jan 2023 05:59:01 -0800 Subject: [PATCH 0493/1411] Refactoring build_compare for decimal and using downcast_primitive (#3484) * Refactor build_compare for decimal and add dict support * Simplify code using downcast_primitive --- arrow-ord/src/ord.rs | 162 +++++++++++++++++++++---------------------- 1 file changed, 80 insertions(+), 82 deletions(-) diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index 6122f9cb3f33..b7737c6de61f 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -153,6 +153,12 @@ where }) } +macro_rules! cmp_dict_primitive_helper { + ($t:ty, $key_type_lhs:expr, $left:expr, $right:expr) => { + cmp_dict_primitive::<$t>($key_type_lhs, $left, $right)? + }; +} + /// returns a comparison function that compares two values at two different positions /// between the two arrays. /// The arrays' types must be equal. 
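// [editor's note: illustrative sketch, not part of the patch] With the decimal support
// added in the hunks below, `build_compare` can be used directly on Decimal128 data;
// the values here are arbitrary:

use std::cmp::Ordering;
use arrow_array::Decimal128Array;
use arrow_ord::ord::build_compare;

fn main() {
    let a = Decimal128Array::from(vec![1, 0, 2, 5]);
    let b = Decimal128Array::from(vec![2, 3, 4, 5]);
    // cmp(i, j) orders a[i] relative to b[j]
    let cmp = build_compare(&a, &b).unwrap();
    assert_eq!(Ordering::Less, cmp(0, 0)); // 1 < 2
}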
@@ -193,6 +199,12 @@ pub fn build_compare( (Int64, Int64) => compare_primitives::(left, right), (Float32, Float32) => compare_float::(left, right), (Float64, Float64) => compare_float::(left, right), + (Decimal128(_, _), Decimal128(_, _)) => { + compare_primitives::(left, right) + } + (Decimal256(_, _), Decimal256(_, _)) => { + compare_primitives::(left, right) + } (Date32, Date32) => compare_primitives::(left, right), (Date64, Date64) => compare_primitives::(left, right), (Time32(Second), Time32(Second)) => { @@ -253,83 +265,8 @@ pub fn build_compare( } let key_type_lhs = key_type_lhs.as_ref(); - - match value_type_lhs.as_ref() { - Int8 => cmp_dict_primitive::(key_type_lhs, left, right)?, - Int16 => cmp_dict_primitive::(key_type_lhs, left, right)?, - Int32 => cmp_dict_primitive::(key_type_lhs, left, right)?, - Int64 => cmp_dict_primitive::(key_type_lhs, left, right)?, - UInt8 => cmp_dict_primitive::(key_type_lhs, left, right)?, - UInt16 => cmp_dict_primitive::(key_type_lhs, left, right)?, - UInt32 => cmp_dict_primitive::(key_type_lhs, left, right)?, - UInt64 => cmp_dict_primitive::(key_type_lhs, left, right)?, - Float32 => cmp_dict_primitive::(key_type_lhs, left, right)?, - Float64 => cmp_dict_primitive::(key_type_lhs, left, right)?, - Date32 => cmp_dict_primitive::(key_type_lhs, left, right)?, - Date64 => cmp_dict_primitive::(key_type_lhs, left, right)?, - Time32(Second) => { - cmp_dict_primitive::(key_type_lhs, left, right)? - } - Time32(Millisecond) => cmp_dict_primitive::( - key_type_lhs, - left, - right, - )?, - Time64(Microsecond) => cmp_dict_primitive::( - key_type_lhs, - left, - right, - )?, - Time64(Nanosecond) => { - cmp_dict_primitive::(key_type_lhs, left, right)? - } - Timestamp(Second, _) => { - cmp_dict_primitive::(key_type_lhs, left, right)? - } - Timestamp(Millisecond, _) => cmp_dict_primitive::< - TimestampMillisecondType, - >(key_type_lhs, left, right)?, - Timestamp(Microsecond, _) => cmp_dict_primitive::< - TimestampMicrosecondType, - >(key_type_lhs, left, right)?, - Timestamp(Nanosecond, _) => { - cmp_dict_primitive::( - key_type_lhs, - left, - right, - )? - } - Interval(YearMonth) => cmp_dict_primitive::( - key_type_lhs, - left, - right, - )?, - Interval(DayTime) => { - cmp_dict_primitive::(key_type_lhs, left, right)? - } - Interval(MonthDayNano) => cmp_dict_primitive::( - key_type_lhs, - left, - right, - )?, - Duration(Second) => { - cmp_dict_primitive::(key_type_lhs, left, right)? - } - Duration(Millisecond) => cmp_dict_primitive::( - key_type_lhs, - left, - right, - )?, - Duration(Microsecond) => cmp_dict_primitive::( - key_type_lhs, - left, - right, - )?, - Duration(Nanosecond) => cmp_dict_primitive::( - key_type_lhs, - left, - right, - )?, + downcast_primitive! 
{ + value_type_lhs.as_ref() => (cmp_dict_primitive_helper, key_type_lhs, left, right), Utf8 => match key_type_lhs { UInt8 => compare_dict_string::(left, right), UInt16 => compare_dict_string::(left, right), @@ -354,11 +291,6 @@ pub fn build_compare( } } } - (Decimal128(_, _), Decimal128(_, _)) => { - let left: Decimal128Array = Decimal128Array::from(left.data().clone()); - let right: Decimal128Array = Decimal128Array::from(right.data().clone()); - Box::new(move |i, j| left.value(i).cmp(&right.value(j))) - } (FixedSizeBinary(_), FixedSizeBinary(_)) => { let left: FixedSizeBinaryArray = FixedSizeBinaryArray::from(left.data().clone()); @@ -380,6 +312,7 @@ pub fn build_compare( pub mod tests { use super::*; use arrow_array::{FixedSizeBinaryArray, Float64Array, Int32Array}; + use arrow_buffer::i256; use std::cmp::Ordering; #[test] @@ -464,6 +397,23 @@ pub mod tests { assert_eq!(Ordering::Greater, (cmp)(0, 2)); } + #[test] + fn test_decimali256() { + let array = vec![ + Some(i256::from_i128(5_i128)), + Some(i256::from_i128(2_i128)), + Some(i256::from_i128(3_i128)), + ] + .into_iter() + .collect::() + .with_precision_and_scale(53, 6) + .unwrap(); + + let cmp = build_compare(&array, &array).unwrap(); + assert_eq!(Ordering::Less, (cmp)(1, 0)); + assert_eq!(Ordering::Greater, (cmp)(0, 2)); + } + #[test] fn test_dict() { let data = vec!["a", "b", "c", "a", "a", "c", "c"]; @@ -584,4 +534,52 @@ pub mod tests { assert_eq!(Ordering::Greater, (cmp)(3, 1)); assert_eq!(Ordering::Greater, (cmp)(3, 2)); } + + #[test] + fn test_decimal_dict() { + let values = Decimal128Array::from(vec![1, 0, 2, 5]); + let keys = Int8Array::from_iter_values([0, 0, 1, 3]); + let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let values = Decimal128Array::from(vec![2, 3, 4, 5]); + let keys = Int8Array::from_iter_values([0, 1, 1, 3]); + let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let cmp = build_compare(&array1, &array2).unwrap(); + + assert_eq!(Ordering::Less, (cmp)(0, 0)); + assert_eq!(Ordering::Less, (cmp)(0, 3)); + assert_eq!(Ordering::Equal, (cmp)(3, 3)); + assert_eq!(Ordering::Greater, (cmp)(3, 1)); + assert_eq!(Ordering::Greater, (cmp)(3, 2)); + } + + #[test] + fn test_decimal256_dict() { + let values = Decimal256Array::from(vec![ + i256::from_i128(1), + i256::from_i128(0), + i256::from_i128(2), + i256::from_i128(5), + ]); + let keys = Int8Array::from_iter_values([0, 0, 1, 3]); + let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let values = Decimal256Array::from(vec![ + i256::from_i128(2), + i256::from_i128(3), + i256::from_i128(4), + i256::from_i128(5), + ]); + let keys = Int8Array::from_iter_values([0, 1, 1, 3]); + let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let cmp = build_compare(&array1, &array2).unwrap(); + + assert_eq!(Ordering::Less, (cmp)(0, 0)); + assert_eq!(Ordering::Less, (cmp)(0, 3)); + assert_eq!(Ordering::Equal, (cmp)(3, 3)); + assert_eq!(Ordering::Greater, (cmp)(3, 1)); + assert_eq!(Ordering::Greater, (cmp)(3, 2)); + } } From 005b64cf0f4bcbc79f318285564589d73677b557 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 10 Jan 2023 16:10:39 +0100 Subject: [PATCH 0494/1411] Remove azurite exception (#3497) --- object_store/src/azure/mod.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 416883ac95a2..cbd5a35dc13d 100644 --- a/object_store/src/azure/mod.rs +++ 
b/object_store/src/azure/mod.rs @@ -965,10 +965,8 @@ mod tests { #[tokio::test] async fn azure_blob_test() { - let use_emulator = env::var("AZURE_USE_EMULATOR").is_ok(); let integration = maybe_skip_integration!().build().unwrap(); - // Azurite doesn't support listing with spaces - https://github.com/localstack/localstack/issues/6328 - put_get_delete_list_opts(&integration, use_emulator).await; + put_get_delete_list_opts(&integration, false).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; From b4abb750da61b16cad3863bf1078f78b554e7cba Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 10 Jan 2023 19:46:31 +0100 Subject: [PATCH 0495/1411] Preserve DataType metadata in make_builder (#3438) * Preserve DataType metadata in make_builder * Fix doc --- arrow-array/src/builder/primitive_builder.rs | 54 ++++++++++++++++- arrow-array/src/builder/struct_builder.rs | 62 +++++++++++++++----- 2 files changed, 99 insertions(+), 17 deletions(-) diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index f3f3f3728db9..a969e121808b 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -21,6 +21,7 @@ use crate::types::*; use crate::{ArrayRef, ArrowPrimitiveType, PrimitiveArray}; use arrow_buffer::{Buffer, MutableBuffer}; use arrow_data::ArrayData; +use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; @@ -94,6 +95,7 @@ pub type Decimal256Builder = PrimitiveBuilder; pub struct PrimitiveBuilder { values_builder: BufferBuilder, null_buffer_builder: NullBufferBuilder, + data_type: DataType, } impl ArrayBuilder for PrimitiveBuilder { @@ -150,6 +152,7 @@ impl PrimitiveBuilder { Self { values_builder: BufferBuilder::::new(capacity), null_buffer_builder: NullBufferBuilder::new(capacity), + data_type: T::DATA_TYPE, } } @@ -169,9 +172,29 @@ impl PrimitiveBuilder { Self { values_builder, null_buffer_builder, + data_type: T::DATA_TYPE, } } + /// By default [`PrimitiveBuilder`] uses [`ArrowPrimitiveType::DATA_TYPE`] as the + /// data type of the generated array. 
+ /// + /// This method allows overriding the data type, to allow specifying timezones + /// for [`DataType::Timestamp`] or precision and scale for [`DataType::Decimal128`] + /// + /// # Panics + /// + /// This method panics if `data_type` is not [PrimitiveArray::is_compatible] + pub fn with_data_type(self, data_type: DataType) -> Self { + assert!( + PrimitiveArray::::is_compatible(&data_type), + "incompatible data type for builder, expected {} got {}", + T::DATA_TYPE, + data_type + ); + Self { data_type, ..self } + } + /// Returns the capacity of this builder measured in slots of type `T` pub fn capacity(&self) -> usize { self.values_builder.capacity() @@ -250,7 +273,7 @@ impl PrimitiveBuilder { pub fn finish(&mut self) -> PrimitiveArray { let len = self.len(); let null_bit_buffer = self.null_buffer_builder.finish(); - let builder = ArrayData::builder(T::DATA_TYPE) + let builder = ArrayData::builder(self.data_type.clone()) .len(len) .add_buffer(self.values_builder.finish()) .null_bit_buffer(null_bit_buffer); @@ -267,7 +290,7 @@ impl PrimitiveBuilder { .as_slice() .map(Buffer::from_slice_ref); let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); - let builder = ArrayData::builder(T::DATA_TYPE) + let builder = ArrayData::builder(self.data_type.clone()) .len(len) .add_buffer(values_buffer) .null_bit_buffer(null_bit_buffer); @@ -309,6 +332,7 @@ impl PrimitiveBuilder { mod tests { use super::*; use arrow_buffer::Buffer; + use arrow_schema::TimeUnit; use crate::array::Array; use crate::array::BooleanArray; @@ -528,4 +552,30 @@ mod tests { assert_eq!(5, arr.len()); assert_eq!(0, builder.len()); } + + #[test] + fn test_primitive_array_builder_with_data_type() { + let mut builder = + Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); + builder.append_value(1); + let array = builder.finish(); + assert_eq!(array.precision(), 1); + assert_eq!(array.scale(), 2); + + let data_type = + DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".to_string())); + let mut builder = + TimestampNanosecondBuilder::new().with_data_type(data_type.clone()); + builder.append_value(1); + let array = builder.finish(); + assert_eq!(array.data_type(), &data_type); + } + + #[test] + #[should_panic( + expected = "incompatible data type for builder, expected Int32 got Int64" + )] + fn test_invalid_with_data_type() { + Int32Builder::new().with_data_type(DataType::Int64); + } } diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 12bcaf0944ef..ecf9ca4ffea7 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -115,9 +115,10 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { Box::new(FixedSizeBinaryBuilder::with_capacity(capacity, *len)) } - DataType::Decimal128(_precision, _scale) => { - Box::new(Decimal128Builder::with_capacity(capacity)) - } + DataType::Decimal128(p, s) => Box::new( + Decimal128Builder::with_capacity(capacity) + .with_data_type(DataType::Decimal128(*p, *s)), + ), DataType::Utf8 => Box::new(StringBuilder::with_capacity(capacity, 1024)), DataType::Date32 => Box::new(Date32Builder::with_capacity(capacity)), DataType::Date64 => Box::new(Date64Builder::with_capacity(capacity)), @@ -133,18 +134,22 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { Box::new(Time64NanosecondBuilder::with_capacity(capacity)) } - DataType::Timestamp(TimeUnit::Second, _) => { - Box::new(TimestampSecondBuilder::with_capacity(capacity)) - } - 
DataType::Timestamp(TimeUnit::Millisecond, _) => { - Box::new(TimestampMillisecondBuilder::with_capacity(capacity)) - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - Box::new(TimestampMicrosecondBuilder::with_capacity(capacity)) - } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Box::new(TimestampNanosecondBuilder::with_capacity(capacity)) - } + DataType::Timestamp(TimeUnit::Second, tz) => Box::new( + TimestampSecondBuilder::with_capacity(capacity) + .with_data_type(DataType::Timestamp(TimeUnit::Second, tz.clone())), + ), + DataType::Timestamp(TimeUnit::Millisecond, tz) => Box::new( + TimestampMillisecondBuilder::with_capacity(capacity) + .with_data_type(DataType::Timestamp(TimeUnit::Millisecond, tz.clone())), + ), + DataType::Timestamp(TimeUnit::Microsecond, tz) => Box::new( + TimestampMicrosecondBuilder::with_capacity(capacity) + .with_data_type(DataType::Timestamp(TimeUnit::Microsecond, tz.clone())), + ), + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Box::new( + TimestampNanosecondBuilder::with_capacity(capacity) + .with_data_type(DataType::Timestamp(TimeUnit::Nanosecond, tz.clone())), + ), DataType::Interval(IntervalUnit::YearMonth) => { Box::new(IntervalYearMonthBuilder::with_capacity(capacity)) } @@ -484,6 +489,33 @@ mod tests { assert!(builder.field_builder::(2).is_some()); } + #[test] + fn test_datatype_properties() { + let fields = vec![ + Field::new("f1", DataType::Decimal128(1, 2), false), + Field::new( + "f2", + DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".to_string())), + false, + ), + ]; + let mut builder = StructBuilder::from_fields(fields.clone(), 1); + builder + .field_builder::(0) + .unwrap() + .append_value(1); + builder + .field_builder::(1) + .unwrap() + .append_value(1); + builder.append(true); + let array = builder.finish(); + + assert_eq!(array.data_type(), &DataType::Struct(fields.clone())); + assert_eq!(array.column(0).data_type(), fields[0].data_type()); + assert_eq!(array.column(1).data_type(), fields[1].data_type()); + } + #[test] #[should_panic( expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) is not currently supported" From a8276c09b1a4eb31886e288919bf31ea655550b9 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Tue, 10 Jan 2023 13:49:39 -0500 Subject: [PATCH 0496/1411] Add a function to get memory size of array slice (#3501) * Add a function to get memory size of array slice * typo fix * PR comments * Fix error types Co-authored-by: slo --- arrow-data/src/data.rs | 92 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 918ecae847a9..31dad5e82668 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -463,6 +463,62 @@ impl ArrayData { size } + /// Returns the total number of the bytes of memory occupied by the buffers by this slice of [ArrayData] + pub fn get_slice_memory_size(&self) -> Result { + let mut result: usize = 0; + let layout = layout(&self.data_type); + + for spec in layout.buffers.iter() { + match spec { + BufferSpec::FixedWidth { byte_width } => { + let buffer_size = + self.len.checked_mul(*byte_width).ok_or_else(|| { + ArrowError::ComputeError( + "Integer overflow computing buffer size".to_string(), + ) + })?; + result += buffer_size; + } + BufferSpec::VariableWidth => { + let buffer_len: usize; + match self.data_type { + DataType::Utf8 | DataType::Binary => { + let offsets = 
self.typed_offsets::()?; + buffer_len = (offsets[self.len] - offsets[0] ) as usize; + } + DataType::LargeUtf8 | DataType::LargeBinary => { + let offsets = self.typed_offsets::()?; + buffer_len = (offsets[self.len] - offsets[0]) as usize; + } + _ => { + return Err(ArrowError::NotYetImplemented(format!( + "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", + self.data_type + ))) + } + }; + result += buffer_len; + } + BufferSpec::BitMap => { + let buffer_size = bit_util::ceil(self.len, 8); + result += buffer_size; + } + BufferSpec::AlwaysNull => { + // Nothing to do + } + } + } + + if self.null_bitmap().is_some() { + result += bit_util::ceil(self.len, 8); + } + + for child in &self.child_data { + result += child.get_slice_memory_size()?; + } + Ok(result) + } + /// Returns the total number of bytes of memory occupied physically by this [ArrayData]. pub fn get_array_memory_size(&self) -> usize { let mut size = mem::size_of_val(self); @@ -1838,6 +1894,42 @@ mod tests { assert!(!string_data_slice.ptr_eq(&string_data)) } + #[test] + fn test_slice_memory_size() { + let mut bit_v: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut bit_v, 0); + bit_util::set_bit(&mut bit_v, 3); + bit_util::set_bit(&mut bit_v, 10); + let data = ArrayData::builder(DataType::Int32) + .len(16) + .add_buffer(make_i32_buffer(16)) + .null_bit_buffer(Some(Buffer::from(bit_v))) + .build() + .unwrap(); + let new_data = data.slice(1, 14); + assert_eq!( + data.get_slice_memory_size().unwrap() - 8, + new_data.get_slice_memory_size().unwrap() + ); + let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]); + let string_data = ArrayData::try_new( + DataType::Utf8, + 3, + Some(Buffer::from_iter(vec![true, false, true])), + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + let string_data_slice = string_data.slice(1, 2); + //4 bytes of offset and 2 bytes of data reduced by slicing. + assert_eq!( + string_data.get_slice_memory_size().unwrap() - 6, + string_data_slice.get_slice_memory_size().unwrap() + ); + } + #[test] fn test_count_nulls() { let null_buffer = Some(Buffer::from(vec![0b00010110, 0b10011111])); From ccb80e82bbfd19e8b353e107ba41cfe0cbaa029a Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Wed, 11 Jan 2023 10:19:16 +0800 Subject: [PATCH 0497/1411] Support decimal int32/64 for writer (#3431) --- parquet/src/arrow/arrow_writer/mod.rs | 41 +++++++++++---- parquet/src/arrow/schema/mod.rs | 73 +++++++++++++++++++++------ 2 files changed, 90 insertions(+), 24 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 340ab246a38b..311981593718 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -21,7 +21,9 @@ use std::collections::VecDeque; use std::io::Write; use std::sync::Arc; -use arrow_array::{Array, ArrayRef, RecordBatch}; +use arrow_array::cast::as_primitive_array; +use arrow_array::types::Decimal128Type; +use arrow_array::{types, Array, ArrayRef, RecordBatch}; use arrow_schema::{DataType as ArrowDataType, IntervalUnit, SchemaRef}; use super::schema::{ @@ -397,6 +399,12 @@ fn write_leaf( let array: &[i32] = data.buffers()[0].typed_data(); write_primitive(typed, &array[offset..offset + data.len()], levels)? 
} + ArrowDataType::Decimal128(_, _) => { + // use the int32 to represent the decimal with low precision + let array = as_primitive_array::(column) + .unary::<_, types::Int32Type>(|v| v as i32); + write_primitive(typed, array.values(), levels)? + } _ => { let array = arrow_cast::cast(column, &ArrowDataType::Int32)?; let array = array @@ -435,6 +443,12 @@ fn write_leaf( let array: &[i64] = data.buffers()[0].typed_data(); write_primitive(typed, &array[offset..offset + data.len()], levels)? } + ArrowDataType::Decimal128(_, _) => { + // use the int64 to represent the decimal with low precision + let array = as_primitive_array::(column) + .unary::<_, types::Int64Type>(|v| v as i64); + write_primitive(typed, array.values(), levels)? + } _ => { let array = arrow_cast::cast(column, &ArrowDataType::Int64)?; let array = array @@ -840,23 +854,32 @@ mod tests { roundtrip(batch, Some(SMALL_SIZE / 2)); } - #[test] - fn arrow_writer_decimal() { - let decimal_field = Field::new("a", DataType::Decimal128(5, 2), false); + fn get_decimal_batch(precision: u8, scale: i8) -> RecordBatch { + let decimal_field = + Field::new("a", DataType::Decimal128(precision, scale), false); let schema = Schema::new(vec![decimal_field]); let decimal_values = vec![10_000, 50_000, 0, -100] .into_iter() .map(Some) .collect::() - .with_precision_and_scale(5, 2) + .with_precision_and_scale(precision, scale) .unwrap(); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(decimal_values)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(decimal_values)]).unwrap() + } - roundtrip(batch, Some(SMALL_SIZE / 2)); + #[test] + fn arrow_writer_decimal() { + // int32 to store the decimal value + let batch_int32_decimal = get_decimal_batch(5, 2); + roundtrip(batch_int32_decimal, Some(SMALL_SIZE / 2)); + // int64 to store the decimal value + let batch_int64_decimal = get_decimal_batch(12, 2); + roundtrip(batch_int64_decimal, Some(SMALL_SIZE / 2)); + // fixed_length_byte_array to store the decimal value + let batch_fixed_len_byte_array_decimal = get_decimal_batch(30, 2); + roundtrip(batch_fixed_len_byte_array_decimal, Some(SMALL_SIZE / 2)); } #[test] diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 120612822671..f03a6c695801 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -399,21 +399,32 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_length(*length) .build() } - DataType::Decimal128(precision, scale) - | DataType::Decimal256(precision, scale) => { + DataType::Decimal128(precision, scale) => { // Decimal precision determines the Parquet physical type to use. 
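// [editor's note: summary comment, not part of the original patch] The mapping
// introduced in this hunk follows the Parquet logical-types specification:
//   precision 2..=9         -> INT32
//   precision 1 and 10..=18 -> INT64 (precision 1 falls through to INT64 because of the `> 1` guard)
//   precision > 18          -> FIXED_LEN_BYTE_ARRAY(decimal_length_from_precision(precision))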
- // TODO(ARROW-12018): Enable the below after ARROW-10818 Decimal support - // - // let (physical_type, length) = if *precision > 1 && *precision <= 9 { - // (PhysicalType::INT32, -1) - // } else if *precision <= 18 { - // (PhysicalType::INT64, -1) - // } else { - // ( - // PhysicalType::FIXED_LEN_BYTE_ARRAY, - // decimal_length_from_precision(*precision) as i32, - // ) - // }; + // Following the: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal + let (physical_type, length) = if *precision > 1 && *precision <= 9 { + (PhysicalType::INT32, -1) + } else if *precision <= 18 { + (PhysicalType::INT64, -1) + } else { + ( + PhysicalType::FIXED_LEN_BYTE_ARRAY, + decimal_length_from_precision(*precision) as i32, + ) + }; + Type::primitive_type_builder(name, physical_type) + .with_repetition(repetition) + .with_length(length) + .with_logical_type(Some(LogicalType::Decimal { + scale: *scale as i32, + precision: *precision as i32, + })) + .with_precision(*precision as i32) + .with_scale(*scale as i32) + .build() + } + DataType::Decimal256(precision, scale) => { + // For the decimal256, use the fixed length byte array to store the data Type::primitive_type_builder(name, PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(repetition) .with_length(decimal_length_from_precision(*precision) as i32) @@ -627,7 +638,7 @@ mod tests { ProjectionMask::all(), None, ) - .unwrap(); + .unwrap(); assert_eq!(&arrow_fields, converted_arrow_schema.fields()); } @@ -1257,6 +1268,9 @@ mod tests { REPEATED INT32 int_list; REPEATED BINARY byte_list; REPEATED BINARY string_list (UTF8); + REQUIRED INT32 decimal_int32 (DECIMAL(8,2)); + REQUIRED INT64 decimal_int64 (DECIMAL(16,2)); + REQUIRED FIXED_LEN_BYTE_ARRAY (13) decimal_fix_length (DECIMAL(30,2)); } "; let parquet_group_type = parse_message_type(message_type).unwrap(); @@ -1326,6 +1340,20 @@ mod tests { ))), false, ), + Field::new( + "decimal_int32", + DataType::Decimal128(8, 2), + false, + ), + Field::new( + "decimal_int64", + DataType::Decimal128(16, 2), + false, + ), + Field::new( + "decimal_fix_length", + DataType::Decimal128(30, 2), + false, ), ]; assert_eq!(arrow_fields, converted_arrow_fields); @@ -1373,6 +1401,9 @@ mod tests { } } REQUIRED BINARY dictionary_strings (STRING); + REQUIRED INT32 decimal_int32 (DECIMAL(8,2)); + REQUIRED INT64 decimal_int64 (DECIMAL(16,2)); + REQUIRED FIXED_LEN_BYTE_ARRAY (13) decimal_fix_length (DECIMAL(30,2)); } "; let parquet_group_type = parse_message_type(message_type).unwrap(); @@ -1458,6 +1489,18 @@ mod tests { DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), false, ), + Field::new( + "decimal_int32", + DataType::Decimal128(8, 2), + false), + Field::new("decimal_int64", + DataType::Decimal128(16, 2), + false), + Field::new( + "decimal_fix_length", + DataType::Decimal128(30, 2), + false, + ), ]; let arrow_schema = Schema::new(arrow_fields); let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema).unwrap(); From eecd991d044b0e5331867bc611bee42cb752f5c8 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 10 Jan 2023 23:01:34 -0800 Subject: [PATCH 0498/1411] Upgrade base64 to 0.21 (#3500) * Upgrade base64 to 0.21 * Move to function * Use prelude --- arrow-flight/Cargo.toml | 2 +- arrow-flight/examples/flight_sql_server.rs | 5 ++++- arrow-flight/src/lib.rs | 4 +++- arrow-flight/src/sql/client.rs | 4 +++- object_store/Cargo.toml | 2 +- object_store/src/azure/client.rs | 4 +++- object_store/src/azure/credential.rs | 6 ++++-- object_store/src/azure/mod.rs | 7 ++++++- 
object_store/src/gcp/credential.rs | 12 ++++-------- parquet/Cargo.toml | 4 ++-- parquet/src/arrow/schema/mod.rs | 6 ++++-- parquet/src/record/api.rs | 5 ++++- 12 files changed, 39 insertions(+), 22 deletions(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 1664004bdff3..d357e747c42c 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -33,7 +33,7 @@ arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } arrow-cast = { version = "30.0.0", path = "../arrow-cast" } arrow-ipc = { version = "30.0.0", path = "../arrow-ipc" } arrow-schema = { version = "30.0.0", path = "../arrow-schema" } -base64 = { version = "0.20", default-features = false, features = ["std"] } +base64 = { version = "0.21", default-features = false, features = ["std"] } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } prost = { version = "0.11", default-features = false } diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 54e19a8cc57d..5aff347e48d1 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -24,6 +24,8 @@ use arrow_flight::{ Action, FlightData, FlightEndpoint, HandshakeRequest, HandshakeResponse, IpcMessage, Location, SchemaAsIpc, Ticket, }; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; use futures::{stream, Stream}; use prost::Message; use std::pin::Pin; @@ -96,7 +98,8 @@ impl FlightSqlService for FlightSqlServiceImpl { )))?; } let base64 = &authorization[basic.len()..]; - let bytes = base64::decode(base64) + let bytes = BASE64_STANDARD + .decode(base64) .map_err(|e| status!("authorization not decodable", e))?; let str = String::from_utf8(bytes) .map_err(|e| status!("authorization not parsable", e))?; diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 3057735a6ad7..a44b4b06e4c5 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -36,6 +36,8 @@ use arrow_ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; use arrow_schema::{ArrowError, Schema}; use arrow_ipc::convert::try_schema_from_ipc_buffer; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; use bytes::Bytes; use std::{ convert::{TryFrom, TryInto}, @@ -265,7 +267,7 @@ impl fmt::Display for Ticket { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Ticket {{")?; write!(f, " ticket: ")?; - write!(f, "{}", base64::encode(&self.ticket)) + write!(f, "{}", BASE64_STANDARD.encode(&self.ticket)) } } diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index 679213af0d86..ecc121d985a0 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
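// [editor's note: illustrative sketch, not part of the patch] base64 0.21 steers
// callers toward the `Engine` trait instead of the old top-level `base64::encode` /
// `base64::decode` helpers, which is the pattern applied throughout this commit:

use base64::{prelude::BASE64_STANDARD, Engine as _};

fn main() {
    let encoded = BASE64_STANDARD.encode(b"user:password");  // String
    let decoded = BASE64_STANDARD.decode(&encoded).unwrap(); // Vec<u8>
    assert_eq!(decoded, b"user:password".to_vec());
}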
+use base64::prelude::BASE64_STANDARD; +use base64::Engine; use bytes::Bytes; use std::collections::HashMap; use std::sync::Arc; @@ -166,7 +168,7 @@ impl FlightSqlServiceClient { payload: Default::default(), }; let mut req = tonic::Request::new(stream::iter(vec![cmd])); - let val = base64::encode(format!("{}:{}", username, password)); + let val = BASE64_STANDARD.encode(format!("{}:{}", username, password)); let val = format!("Basic {}", val) .parse() .map_err(|_| ArrowError::ParseError("Cannot parse header".to_string()))?; diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index e61a127c9c00..4be6d63fcdea 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -43,7 +43,7 @@ url = "2.2" walkdir = "2" # Cloud storage support -base64 = { version = "0.20", default-features = false, features = ["std"], optional = true } +base64 = { version = "0.21", default-features = false, features = ["std"], optional = true } quick-xml = { version = "0.27.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 556a2ad2b292..426b3b164695 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -25,6 +25,8 @@ use crate::{ BoxStream, ClientOptions, ListResult, ObjectMeta, Path, Result, RetryConfig, StreamExt, }; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use itertools::Itertools; @@ -528,7 +530,7 @@ impl BlockList { for block_id in &self.blocks { let node = format!( "\t{}\n", - base64::encode(block_id) + BASE64_STANDARD.encode(block_id) ); s.push_str(&node); } diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index 38e6e64f1e0f..96ff8ce153a5 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -19,6 +19,8 @@ use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; use crate::util::hmac_sha256; use crate::RetryConfig; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; use chrono::Utc; use reqwest::header::ACCEPT; use reqwest::{ @@ -153,8 +155,8 @@ fn generate_authorization( key: &str, ) -> String { let str_to_sign = string_to_sign(h, u, method, account); - let auth = hmac_sha256(base64::decode(key).unwrap(), str_to_sign); - format!("SharedKey {}:{}", account, base64::encode(auth)) + let auth = hmac_sha256(BASE64_STANDARD.decode(key).unwrap(), str_to_sign); + format!("SharedKey {}:{}", account, BASE64_STANDARD.encode(auth)) } fn add_if_exists<'a>(h: &'a HeaderMap, key: &HeaderName) -> &'a str { diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index cbd5a35dc13d..3bce8e5984b8 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -34,6 +34,8 @@ use crate::{ RetryConfig, }; use async_trait::async_trait; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; use bytes::Bytes; use chrono::{TimeZone, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; @@ -330,7 +332,10 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { &self.location, Some(buf.into()), true, - &[("comp", "block"), ("blockid", &base64::encode(block_id))], + &[ + ("comp", "block"), + ("blockid", &BASE64_STANDARD.encode(block_id)), + ], ) .await?; diff --git 
a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index a2a98a39be33..cc157dd41985 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -18,17 +18,13 @@ use crate::client::retry::RetryExt; use crate::client::token::TemporaryToken; use crate::RetryConfig; -use base64::engine::fast_portable::FastPortable; +use base64::prelude::BASE64_URL_SAFE_NO_PAD; +use base64::Engine; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use snafu::{ResultExt, Snafu}; use std::time::{Duration, Instant}; -const URL_SAFE_NO_PAD: FastPortable = FastPortable::from( - &base64::alphabet::URL_SAFE, - base64::engine::fast_portable::NO_PAD, -); - #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("No RSA key found in pem file"))] @@ -172,7 +168,7 @@ impl OAuthProvider { ) .context(SignSnafu)?; - let signature = base64::encode_engine(&sig_bytes, &URL_SAFE_NO_PAD); + let signature = BASE64_URL_SAFE_NO_PAD.encode(sig_bytes); let jwt = [message, signature].join("."); let body = [ @@ -224,5 +220,5 @@ fn decode_first_rsa_key(private_key_pem: String) -> Result { fn b64_encode_obj(obj: &T) -> Result { let string = serde_json::to_string(obj).context(EncodeSnafu)?; - Ok(base64::encode_engine(string, &URL_SAFE_NO_PAD)) + Ok(BASE64_URL_SAFE_NO_PAD.encode(string)) } diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 2aa7449787b1..ade8d95210f5 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -55,7 +55,7 @@ zstd = { version = "0.12.0", optional = true, default-features = false } chrono = { version = "0.4.23", default-features = false, features = ["alloc"] } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } -base64 = { version = "0.20", default-features = false, features = ["std", ], optional = true } +base64 = { version = "0.21", default-features = false, features = ["std", ], optional = true } clap = { version = "4", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } @@ -67,7 +67,7 @@ twox-hash = { version = "1.6", default-features = false } paste = { version = "1.0" } [dev-dependencies] -base64 = { version = "0.20", default-features = false, features = ["std"] } +base64 = { version = "0.21", default-features = false, features = ["std"] } criterion = { version = "0.4", default-features = false } snap = { version = "1.0", default-features = false } tempfile = { version = "3.0", default-features = false } diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index f03a6c695801..2ca4b7ef8a79 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -23,6 +23,8 @@ //! //! The interfaces for converting arrow schema to parquet schema is coming. 
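In the gcp/credential.rs hunk the hand-rolled `FastPortable` engine is likewise replaced by a prelude constant. A small sketch of URL-safe, unpadded encoding in the 0.21 style, as used for JWT segments; this assumes base64 0.21 and the JSON payload is illustrative only:

use base64::prelude::BASE64_URL_SAFE_NO_PAD;
use base64::Engine;

fn main() {
    // URL-safe alphabet without '=' padding, suitable for JWT header/claim segments.
    let segment = BASE64_URL_SAFE_NO_PAD.encode(r#"{"alg":"RS256","typ":"JWT"}"#);
    assert!(!segment.contains('='));
    assert!(!segment.contains('+') && !segment.contains('/'));
}
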
+use base64::prelude::BASE64_STANDARD; +use base64::Engine; use std::collections::HashMap; use std::sync::Arc; @@ -100,7 +102,7 @@ pub(crate) fn parquet_to_array_schema_and_fields( /// Try to convert Arrow schema metadata into a schema fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Result { - let decoded = base64::decode(encoded_meta); + let decoded = BASE64_STANDARD.decode(encoded_meta); match decoded { Ok(bytes) => { let slice = if bytes.len() > 8 && bytes[0..4] == [255u8; 4] { @@ -148,7 +150,7 @@ fn encode_arrow_schema(schema: &Schema) -> String { len_prefix_schema.append((schema_len as u32).to_le_bytes().to_vec().as_mut()); len_prefix_schema.append(&mut serialized_schema.ipc_message); - base64::encode(&len_prefix_schema) + BASE64_STANDARD.encode(&len_prefix_schema) } /// Mutates writer metadata by storing the encoded Arrow schema. diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index 2d15e126ff65..0880e717981a 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -669,6 +669,9 @@ impl Field { #[cfg(any(feature = "json", test))] pub fn to_json_value(&self) -> Value { + use base64::prelude::BASE64_STANDARD; + use base64::Engine; + match &self { Field::Null => Value::Null, Field::Bool(b) => Value::Bool(*b), @@ -688,7 +691,7 @@ impl Field { .unwrap_or(Value::Null), Field::Decimal(n) => Value::String(convert_decimal_to_string(n)), Field::Str(s) => Value::String(s.to_owned()), - Field::Bytes(b) => Value::String(base64::encode(b.data())), + Field::Bytes(b) => Value::String(BASE64_STANDARD.encode(b.data())), Field::Date(d) => Value::String(convert_date_to_string(*d)), Field::TimestampMillis(ts) => { Value::String(convert_timestamp_millis_to_string(*ts)) From 3788fd20f053ee58f08b4d09cd4dac5bb9b96c06 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Wed, 11 Jan 2023 15:01:55 +0800 Subject: [PATCH 0499/1411] fix comments (#3505) --- arrow-buffer/src/bigint.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index fc360657cb54..c3a05ba061db 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -88,7 +88,7 @@ impl i256 { } } - /// Create an integer value from its representation as a byte array in little-endian. + /// Create an integer value from its representation as a byte array in big-endian. #[inline] pub const fn from_be_bytes(b: [u8; 32]) -> Self { let (high, low) = split_array(b); From 5fb337db04a1a19f7d40da46f19b7b5fd4051593 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Wed, 11 Jan 2023 22:50:26 +1100 Subject: [PATCH 0500/1411] Fix negative interval prettyprint (#3491) * Fix negative interval prettyprint * Simplify check * Empty * Fix edge case --- arrow-cast/src/display.rs | 30 ++++++++++++++++------ arrow/src/util/pretty.rs | 54 ++++++++++++++++++++++++--------------- 2 files changed, 55 insertions(+), 29 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 5534ebd8134a..e603260b072c 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -67,20 +67,29 @@ macro_rules! 
make_string_interval_day_time { let days_parts: i32 = ((value & 0xFFFFFFFF00000000) >> 32) as i32; let milliseconds_part: i32 = (value & 0xFFFFFFFF) as i32; - let secs = milliseconds_part / 1000; + let secs = milliseconds_part / 1_000; let mins = secs / 60; let hours = mins / 60; let secs = secs - (mins * 60); let mins = mins - (hours * 60); + let milliseconds = milliseconds_part % 1_000; + + let secs_sign = if secs < 0 || milliseconds < 0 { + "-" + } else { + "" + }; + Ok(format!( - "0 years 0 mons {} days {} hours {} mins {}.{:03} secs", + "0 years 0 mons {} days {} hours {} mins {}{}.{:03} secs", days_parts, hours, mins, - secs, - (milliseconds_part % 1000), + secs_sign, + secs.abs(), + milliseconds.abs(), )) }}; } @@ -99,21 +108,26 @@ macro_rules! make_string_interval_month_day_nano { let days_part: i32 = ((value & 0xFFFFFFFF0000000000000000) >> 64) as i32; let nanoseconds_part: i64 = (value & 0xFFFFFFFFFFFFFFFF) as i64; - let secs = nanoseconds_part / 1000000000; + let secs = nanoseconds_part / 1_000_000_000; let mins = secs / 60; let hours = mins / 60; let secs = secs - (mins * 60); let mins = mins - (hours * 60); + let nanoseconds = nanoseconds_part % 1_000_000_000; + + let secs_sign = if secs < 0 || nanoseconds < 0 { "-" } else { "" }; + Ok(format!( - "0 years {} mons {} days {} hours {} mins {}.{:09} secs", + "0 years {} mons {} days {} hours {} mins {}{}.{:09} secs", months_part, days_part, hours, mins, - secs, - (nanoseconds_part % 1000000000), + secs_sign, + secs.abs(), + nanoseconds.abs(), )) }}; } diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs index 859053352384..53ae0fddef6f 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow/src/util/pretty.rs @@ -991,6 +991,9 @@ mod tests { #[test] fn test_pretty_format_interval_day_time() -> Result<()> { let arr = Arc::new(arrow_array::IntervalDayTimeArray::from(vec![ + Some(-600000), + Some(4294966295), + Some(4294967295), Some(1), Some(10), Some(100), @@ -1007,13 +1010,16 @@ mod tests { let table = pretty_format_batches(&[batch])?.to_string(); let expected = vec![ - "+-------------------------------------------------+", - "| IntervalDayTime |", - "+-------------------------------------------------+", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.010 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.100 secs |", - "+-------------------------------------------------+", + "+----------------------------------------------------+", + "| IntervalDayTime |", + "+----------------------------------------------------+", + "| 0 years 0 mons -1 days 0 hours -10 mins 0.000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins -1.001 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins -0.001 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.001 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.010 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.100 secs |", + "+----------------------------------------------------+", ]; let actual: Vec<&str> = table.lines().collect(); @@ -1026,6 +1032,9 @@ mod tests { #[test] fn test_pretty_format_interval_month_day_nano_array() -> Result<()> { let arr = Arc::new(arrow_array::IntervalMonthDayNanoArray::from(vec![ + Some(-600000000000), + Some(18446744072709551615), + Some(18446744073709551615), Some(1), Some(10), Some(100), @@ -1049,20 +1058,23 @@ mod tests { let table = pretty_format_batches(&[batch])?.to_string(); let expected = vec![ - "+-------------------------------------------------------+", - "| IntervalMonthDayNano |", - 
"+-------------------------------------------------------+", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000010 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000100 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000001000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000010000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000100000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.001000000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.010000000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.100000000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 1.000000000 secs |", - "+-------------------------------------------------------+", + "+-----------------------------------------------------------+", + "| IntervalMonthDayNano |", + "+-----------------------------------------------------------+", + "| 0 years -1 mons -1 days 0 hours -10 mins 0.000000000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins -1.000000001 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins -0.000000001 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000001 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000010 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000100 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.000001000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.000010000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.000100000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.001000000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.010000000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 0.100000000 secs |", + "| 0 years 0 mons 0 days 0 hours 0 mins 1.000000000 secs |", + "+-----------------------------------------------------------+", ]; let actual: Vec<&str> = table.lines().collect(); From ddba53be76a6ceb2cf232677574b611ab67c33c9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 11 Jan 2023 14:51:04 +0100 Subject: [PATCH 0501/1411] Update prost-build requirement from =0.11.5 to =0.11.6 (#3507) Updates the requirements on [prost-build](https://github.com/tokio-rs/prost) to permit the latest version. - [Release notes](https://github.com/tokio-rs/prost/releases) - [Commits](https://github.com/tokio-rs/prost/compare/v0.11.5...v0.11.6) --- updated-dependencies: - dependency-name: prost-build dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index d357e747c42c..d88880468756 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -59,7 +59,7 @@ tower = "0.4.13" # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.49", default-features = false } -prost-build = { version = "=0.11.5", default-features = false } +prost-build = { version = "=0.11.6", default-features = false } tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } [[example]] From 55c87c114443739ed73ebc28d0ba53bf875ecd9a Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 11 Jan 2023 13:41:07 -0800 Subject: [PATCH 0502/1411] Fix DataTypeLayout for LargeList (#3503) * Fix DataTypeLayout for LargeList * Add datalayout test --- arrow-array/src/types.rs | 44 ++++++++++++++++++++++++++++++++++++++++ arrow-data/src/data.rs | 2 +- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index e7d92d2d08f9..7c41a469e30e 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -767,6 +767,8 @@ pub type LargeBinaryType = GenericBinaryType; #[cfg(test)] mod tests { use super::*; + use arrow_data::{layout, BufferSpec}; + use std::mem::size_of; #[test] fn month_day_nano_should_roundtrip() { @@ -803,4 +805,46 @@ mod tests { let value = IntervalYearMonthType::make_value(-1, -2); assert_eq!(IntervalYearMonthType::to_months(value), -14); } + + fn test_layout() { + let layout = layout(&T::DATA_TYPE); + + assert_eq!(layout.buffers.len(), 1); + + let spec = &layout.buffers[0]; + assert_eq!( + spec, + &BufferSpec::FixedWidth { + byte_width: size_of::() + } + ); + } + + #[test] + fn test_layouts() { + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + } } diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 31dad5e82668..14dbe9387db3 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -1470,7 +1470,7 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::LargeUtf8 => DataTypeLayout::new_binary(size_of::()), DataType::List(_) => DataTypeLayout::new_fixed_width(size_of::()), DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data - DataType::LargeList(_) => DataTypeLayout::new_fixed_width(size_of::()), + DataType::LargeList(_) => DataTypeLayout::new_fixed_width(size_of::()), DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data, DataType::Union(_, _, mode) => { let type_ids = BufferSpec::FixedWidth { From c731b045540a9e38b8eb0a0767b135b57006fb20 Mon Sep 17 00:00:00 2001 From: comphead Date: Wed, 11 Jan 2023 23:57:19 -0800 Subject: [PATCH 0503/1411] Enable cast Date32 to Timestamp (#3508) * Enable cast Date32 to Timestamp * fix test --- arrow-cast/src/cast.rs | 73 
+++++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index f3dbdb8e06e8..8b8244a7c9ac 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -255,6 +255,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Timestamp(_, _), Int64) => true, (Int64, Timestamp(_, _)) => true, (Date64, Timestamp(_, None)) => true, + (Date32, Timestamp(_, None)) => true, (Timestamp(_, _), Timestamp(_, _) | Date32 @@ -1943,7 +1944,24 @@ pub fn cast_with_options( |x| x * (NANOSECONDS / MILLISECONDS), ), )), - + (Date32, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, TimestampSecondType>(|x| (x as i64) * SECONDS_IN_DAY), + )), + (Date32, Timestamp(TimeUnit::Millisecond, None)) => Ok(Arc::new( + as_primitive_array::(array).unary::<_, TimestampMillisecondType>( + |x| (x as i64) * MILLISECONDS_IN_DAY, + ), + )), + (Date32, Timestamp(TimeUnit::Microsecond, None)) => Ok(Arc::new( + as_primitive_array::(array).unary::<_, TimestampMicrosecondType>( + |x| (x as i64) * MICROSECONDS_IN_DAY, + ), + )), + (Date32, Timestamp(TimeUnit::Nanosecond, None)) => Ok(Arc::new( + as_primitive_array::(array) + .unary::<_, TimestampNanosecondType>(|x| (x as i64) * NANOSECONDS_IN_DAY), + )), (Int64, Duration(TimeUnit::Second)) => { cast_reinterpret_arrays::(array) } @@ -7693,4 +7711,57 @@ mod tests { test_cast_string_to_decimal256_overflow(overflow_array); } + + #[test] + fn test_cast_date32_to_timestamp() { + let a = Date32Array::from(vec![Some(18628), Some(18993), None]); // 2021-1-1, 2022-1-1 + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(1609459200, c.value(0)); + assert_eq!(1640995200, c.value(1)); + assert!(c.is_null(2)); + } + + #[test] + fn test_cast_date32_to_timestamp_ms() { + let a = Date32Array::from(vec![Some(18628), Some(18993), None]); // 2021-1-1, 2022-1-1 + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Timestamp(TimeUnit::Millisecond, None)).unwrap(); + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1609459200000, c.value(0)); + assert_eq!(1640995200000, c.value(1)); + assert!(c.is_null(2)); + } + + #[test] + fn test_cast_date32_to_timestamp_us() { + let a = Date32Array::from(vec![Some(18628), Some(18993), None]); // 2021-1-1, 2022-1-1 + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1609459200000000, c.value(0)); + assert_eq!(1640995200000000, c.value(1)); + assert!(c.is_null(2)); + } + + #[test] + fn test_cast_date32_to_timestamp_ns() { + let a = Date32Array::from(vec![Some(18628), Some(18993), None]); // 2021-1-1, 2022-1-1 + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap(); + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1609459200000000000, c.value(0)); + assert_eq!(1640995200000000000, c.value(1)); + assert!(c.is_null(2)); + } } From 9ae0c9bee87da07063eee9849cbdb055bf227543 Mon Sep 17 00:00:00 2001 From: Steve Vaughan Date: Thu, 12 Jan 2023 14:50:45 -0500 Subject: [PATCH 0504/1411] Add string comparisons (starts_with, ends_with, and contains) to kernel (#3502) * Extract Regex implementation from dict function Extract the 
implementation comparing 2 ArrayAccessors from the generated dict function so that it can be used for other string comparisons (i.e. starts_with, ends_with, and contains). The new functions replace the use of the macro parameters pat, neg, and typ. * Provide SQL operation for documenation Provide the entire SQL operation instead of generating it based on assumptions about the syntax of "like"-based operations. This will allow it to be used for other comparison operations. * feat: Implement SQL STARTSWITH, ENDSWITH, and CONTAINS * Add missing documentation for public functions * Remove the dependency on arrow-ord Duplicate compare_op and compare_op_scalar * Fix document duplication source without a link * fix: Helper functions shouldn't be public * Duplication comment was in the wrong file * Remove unused no_simd_compare_op This was accidentally included as part of the duplication of compare_op and compare_op_scalar from arrow_ord::comparison * Add unit tests * fix: Remove typo in documentation * fix: Be consistent with references to more details Co-authored-by: Steve Vaughan Jr --- arrow-string/src/like.rs | 320 +++++++++++++++++++++++++++++++++++---- 1 file changed, 287 insertions(+), 33 deletions(-) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index d8afa8d4c614..c9cdb7bab18d 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -25,9 +25,45 @@ use arrow_select::take::take; use regex::Regex; use std::collections::HashMap; +/// Helper function to perform boolean lambda function on values from two array accessors, this +/// version does not attempt to use SIMD. +/// +/// Duplicated from `arrow_ord::comparison` +fn compare_op( + left: T, + right: S, + op: F, +) -> Result +where + F: Fn(T::Item, S::Item) -> bool, +{ + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform comparison operation on arrays of different length" + .to_string(), + )); + } + + Ok(BooleanArray::from_binary(left, right, op)) +} + +/// Helper function to perform boolean lambda function on values from array accessor, this +/// version does not attempt to use SIMD. +/// +/// Duplicated from `arrow_ord::comparison` +fn compare_op_scalar( + left: T, + op: F, +) -> Result +where + F: Fn(T::Item) -> bool, +{ + Ok(BooleanArray::from_unary(left, op)) +} + macro_rules! dyn_function { ($sql:tt, $fn_name:tt, $fn_utf8:tt, $fn_dict:tt) => { -#[doc = concat!("Perform SQL `left ", $sql ," right` operation on [`StringArray`] /")] +#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")] /// [`LargeStringArray`], or [`DictionaryArray`] with values /// [`StringArray`]/[`LargeStringArray`]. /// @@ -67,14 +103,32 @@ pub fn $fn_name(left: &dyn Array, right: &dyn Array) -> Result { -#[doc = concat!("Perform SQL `left ", $sql ," right` operation on [`StringArray`] /")] +#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")] /// [`LargeStringArray`], or [`DictionaryArray`] with values /// [`StringArray`]/[`LargeStringArray`] and a scalar. 
/// @@ -115,15 +169,34 @@ pub fn $fn_name( } } } -scalar_dyn_function!("LIKE", like_utf8_scalar_dyn, like_scalar); -scalar_dyn_function!("NOT LIKE", nlike_utf8_scalar_dyn, nlike_scalar); -scalar_dyn_function!("ILIKE", ilike_utf8_scalar_dyn, ilike_scalar); -scalar_dyn_function!("NOT ILIKE", nilike_utf8_scalar_dyn, nilike_scalar); +scalar_dyn_function!("left LIKE right", like_utf8_scalar_dyn, like_scalar); +scalar_dyn_function!("left NOT LIKE right", nlike_utf8_scalar_dyn, nlike_scalar); +scalar_dyn_function!("left ILIKE right", ilike_utf8_scalar_dyn, ilike_scalar); +scalar_dyn_function!( + "left NOT ILIKE right", + nilike_utf8_scalar_dyn, + nilike_scalar +); +scalar_dyn_function!( + "STARTSWITH(left, right)", + starts_with_utf8_scalar_dyn, + starts_with_scalar +); +scalar_dyn_function!( + "ENDSWITH(left, right)", + ends_with_utf8_scalar_dyn, + ends_with_scalar +); +scalar_dyn_function!( + "CONTAINS(left, right)", + contains_utf8_scalar_dyn, + contains_scalar +); macro_rules! dict_function { - ($sql:tt, $fn_name:tt, $pat:tt, $neg:expr, $typ:tt) => { + ($sql:tt, $fn_name:tt, $fn_impl:tt) => { -#[doc = concat!("Perform SQL `left ", $sql ," right` operation on on [`DictionaryArray`] with values")] +#[doc = concat!("Perform SQL `", $sql ,"` operation on [`DictionaryArray`] with values")] /// [`StringArray`]/[`LargeStringArray`]. /// /// See the documentation on [`like_utf8`] for more details. @@ -137,28 +210,13 @@ fn $fn_name( let left = left.downcast_dict::>().unwrap(); let right = right.downcast_dict::>().unwrap(); - regex_like(left, right, $neg, |re_pattern| { - Regex::new(&format!($pat, re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from {} pattern: {}", - $typ, e - )) - }) - }) + $fn_impl(left, right) } (DataType::LargeUtf8, DataType::LargeUtf8) => { let left = left.downcast_dict::>().unwrap(); let right = right.downcast_dict::>().unwrap(); - regex_like(left, right, $neg, |re_pattern| { - Regex::new(&format!($pat, re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from {} pattern: {}", - $typ, - e - )) - }) - }) + $fn_impl(left, right) } _ => Err(ArrowError::ComputeError(format!( "{} only supports DictionaryArray with Utf8 or LargeUtf8 values", @@ -169,10 +227,13 @@ fn $fn_name( } } -dict_function!("LIKE", like_dict, "^{}$", false, "LIKE"); -dict_function!("NOT LIKE", nlike_dict, "^{}$", true, "LIKE"); -dict_function!("ILIKE", ilike_dict, "(?i)^{}$", false, "ILIKE"); -dict_function!("NOT ILIKE", nilike_dict, "(?i)^{}$", true, "ILIKE"); +dict_function!("left LIKE right", like_dict, like); +dict_function!("left NOT LIKE right", nlike_dict, nlike); +dict_function!("left ILIKE right", ilike_dict, ilike); +dict_function!("left NOT ILIKE right", nilike_dict, nilike); +dict_function!("STARTSWITH(left, right)", starts_with_dict, starts_with); +dict_function!("ENDSWITH(left, right)", ends_with_dict, ends_with); +dict_function!("CONTAINS(left, right)", contains_dict, contains); /// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. 
/// @@ -195,6 +256,14 @@ dict_function!("NOT ILIKE", nilike_dict, "(?i)^{}$", true, "ILIKE"); pub fn like_utf8( left: &GenericStringArray, right: &GenericStringArray, +) -> Result { + like(left, right) +} + +#[inline] +fn like<'a, S: ArrayAccessor>( + left: S, + right: S, ) -> Result { regex_like(left, right, false, |re_pattern| { Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { @@ -318,6 +387,14 @@ fn replace_like_wildcards(pattern: &str) -> Result { pub fn nlike_utf8( left: &GenericStringArray, right: &GenericStringArray, +) -> Result { + nlike(left, right) +} + +#[inline] +fn nlike<'a, S: ArrayAccessor>( + left: S, + right: S, ) -> Result { regex_like(left, right, true, |re_pattern| { Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { @@ -358,6 +435,14 @@ pub fn nlike_utf8_scalar( pub fn ilike_utf8( left: &GenericStringArray, right: &GenericStringArray, +) -> Result { + ilike(left, right) +} + +#[inline] +fn ilike<'a, S: ArrayAccessor>( + left: S, + right: S, ) -> Result { regex_like(left, right, false, |re_pattern| { Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { @@ -442,6 +527,14 @@ pub fn ilike_utf8_scalar( pub fn nilike_utf8( left: &GenericStringArray, right: &GenericStringArray, +) -> Result { + nilike(left, right) +} + +#[inline] +fn nilike<'a, S: ArrayAccessor>( + left: S, + right: S, ) -> Result { regex_like(left, right, true, |re_pattern| { Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { @@ -533,6 +626,117 @@ where Ok(BooleanArray::from(data)) } +/// Perform SQL `STARTSWITH(left, right)` operation on [`StringArray`] / [`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn starts_with_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + starts_with(left, right) +} + +#[inline] +fn starts_with<'a, S: ArrayAccessor>( + left: S, + right: S, +) -> Result { + compare_op(left, right, |l, r| l.starts_with(r)) +} + +#[inline] +fn starts_with_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + compare_op_scalar(left, |item| item.starts_with(right)) +} + +/// Perform SQL `STARTSWITH(left, right)` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn starts_with_utf8_scalar( + left: &GenericStringArray, + right: &str, +) -> Result { + starts_with_scalar(left, right) +} + +/// Perform SQL `ENDSWITH(left, right)` operation on [`StringArray`] / [`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn ends_with_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + ends_with(left, right) +} + +#[inline] +fn ends_with<'a, S: ArrayAccessor>( + left: S, + right: S, +) -> Result { + compare_op(left, right, |l, r| l.ends_with(r)) +} + +#[inline] +fn ends_with_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + compare_op_scalar(left, |item| item.ends_with(right)) +} + +/// Perform SQL `ENDSWITH(left, right)` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn ends_with_utf8_scalar( + left: &GenericStringArray, + right: &str, +) -> Result { + ends_with_scalar(left, right) +} + +/// Perform SQL `CONTAINS(left, right)` operation on [`StringArray`] / [`LargeStringArray`]. +/// +/// See the documentation on [`like_utf8`] for more details. 
+pub fn contains_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result { + contains(left, right) +} + +#[inline] +fn contains<'a, S: ArrayAccessor>( + left: S, + right: S, +) -> Result { + compare_op(left, right, |l, r| l.contains(r)) +} + +#[inline] +fn contains_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + compare_op_scalar(left, |item| item.contains(right)) +} + +/// Perform SQL `CONTAINS(left, right)` operation on [`StringArray`] / +/// [`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn contains_utf8_scalar( + left: &GenericStringArray, + right: &str, +) -> Result { + contains_scalar(left, right) +} + #[cfg(test)] mod tests { use super::*; @@ -682,6 +886,18 @@ mod tests { vec![true, false, true, false] ); + // Replicates `test_utf8_array_like_scalar_start` `test_utf8_array_like_scalar_dyn_start` to + // demonstrate that `SQL STARTSWITH` works as expected. + test_utf8_scalar!( + test_utf8_array_starts_with_scalar_start, + test_utf8_array_starts_with_scalar_dyn_start, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow", + starts_with_utf8_scalar, + starts_with_utf8_scalar_dyn, + vec![true, false, true, false] + ); + test_utf8_scalar!( test_utf8_array_like_scalar_end, test_utf8_array_like_scalar_dyn_end, @@ -692,6 +908,18 @@ mod tests { vec![true, true, false, false] ); + // Replicates `test_utf8_array_like_scalar_end` `test_utf8_array_like_scalar_dyn_end` to + // demonstrate that `SQL ENDSWITH` works as expected. + test_utf8_scalar!( + test_utf8_array_ends_with_scalar_end, + test_utf8_array_ends_with_scalar_dyn_end, + vec!["arrow", "parrow", "arrows", "arr"], + "arrow", + ends_with_utf8_scalar, + ends_with_utf8_scalar_dyn, + vec![true, true, false, false] + ); + test_utf8_scalar!( test_utf8_array_like_scalar_equals, test_utf8_array_like_scalar_dyn_equals, @@ -1011,6 +1239,32 @@ mod tests { vec![false, true, true, false, false, false, false, true, true, true] ); + // Replicates `test_utf8_array_ilike_unicode_contains` and + // `test_utf8_array_ilike_unicode_contains_dyn` to + // demonstrate that `SQL CONTAINS` works as expected. + // + // NOTE: 5 of the values were changed because the original used a case insensitive `ilike`. 
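A minimal usage sketch of the comparison kernels introduced by this patch, assuming the arrow-array and arrow-string crates at this version; the input values mirror the test data above and are illustrative only:

use arrow_array::{BooleanArray, StringArray};
use arrow_string::like::{contains_utf8_scalar, starts_with_utf8_scalar};

fn main() {
    let names = StringArray::from(vec!["arrow", "parrow", "arrows", "arr"]);

    // SQL STARTSWITH(names, 'arrow')
    let starts = starts_with_utf8_scalar(&names, "arrow").unwrap();
    assert_eq!(starts, BooleanArray::from(vec![true, false, true, false]));

    // SQL CONTAINS(names, 'rro')
    let has = contains_utf8_scalar(&names, "rro").unwrap();
    assert_eq!(has, BooleanArray::from(vec![true, true, true, false]));
}
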
+ test_utf8_scalar!( + test_utf8_array_contains_unicode_contains, + test_utf8_array_contains_unicode_contains_dyn, + vec![ + "sdlkdfFkoßsdfs", + "sdlkdFFkoSSdggs", // Original was case insensitive "sdlkdfFkoSSdggs" + "sdlkdFFkoSSsdsd", // Original was case insensitive "sdlkdfFkosssdsd" + "FkoS", + "Fkos", + "ffkoSS", + "ffkoß", + "😃sadlksFFkoSSsh😃klF", // Original was case insensitive "😃sadlksffkosSsh😃klF" + "😱slgFFkoSSsh😃klF", // Original was case insensitive "😱slgffkosSsh😃klF" + "FFkoSS", // "FFKoSS" + ], + "FFkoSS", + contains_utf8_scalar, + contains_utf8_scalar_dyn, + vec![false, true, true, false, false, false, false, true, true, true] + ); + test_utf8_scalar!( test_utf8_array_ilike_unicode_complex, test_utf8_array_ilike_unicode_complex_dyn, From 79d823f9ad4b6c02d4aa7a6d4a5a178a25fc4363 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 12 Jan 2023 21:54:52 +0100 Subject: [PATCH 0505/1411] Additional nullif re-export (#3515) --- arrow/src/compute/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/arrow/src/compute/mod.rs b/arrow/src/compute/mod.rs index c0b10afe48a6..c9fd525e85a4 100644 --- a/arrow/src/compute/mod.rs +++ b/arrow/src/compute/mod.rs @@ -29,6 +29,7 @@ pub use self::kernels::concat::*; pub use self::kernels::filter::*; pub use self::kernels::interleave::*; pub use self::kernels::limit::*; +pub use self::kernels::nullif::*; pub use self::kernels::partition::*; pub use self::kernels::regexp::*; pub use self::kernels::sort::*; From 8688dba69b925f5be3b6484c19ca7d54da1c0511 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner <14581281+iajoiner@users.noreply.github.com> Date: Fri, 13 Jan 2023 02:32:30 -0500 Subject: [PATCH 0506/1411] Update version to `31.0.0` and add changelog (#3518) * Update version * Update changelog --- CHANGELOG-old.md | 91 +++++++++++++++ CHANGELOG.md | 116 ++++++++++--------- arrow-arith/Cargo.toml | 10 +- arrow-array/Cargo.toml | 8 +- arrow-buffer/Cargo.toml | 2 +- arrow-cast/Cargo.toml | 12 +- arrow-csv/Cargo.toml | 12 +- arrow-data/Cargo.toml | 6 +- arrow-flight/Cargo.toml | 14 +-- arrow-flight/README.md | 2 +- arrow-integration-test/Cargo.toml | 6 +- arrow-integration-testing/Cargo.toml | 2 +- arrow-ipc/Cargo.toml | 12 +- arrow-json/Cargo.toml | 12 +- arrow-ord/Cargo.toml | 12 +- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow-row/Cargo.toml | 14 +-- arrow-schema/Cargo.toml | 2 +- arrow-select/Cargo.toml | 10 +- arrow-string/Cargo.toml | 12 +- arrow/Cargo.toml | 28 ++--- arrow/README.md | 2 +- dev/release/README.md | 2 +- dev/release/update_change_log.sh | 4 +- parquet/Cargo.toml | 20 ++-- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 28 files changed, 261 insertions(+), 168 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index f62b1ee707cc..2bf0aef992f8 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,97 @@ # Historical Changelog +## [30.0.1](https://github.com/apache/arrow-rs/tree/30.0.1) (2023-01-04) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/30.0.0...30.0.1) + +**Implemented enhancements:** + +- Generic bytes dictionary builder [\#3425](https://github.com/apache/arrow-rs/issues/3425) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Derive Clone for the builders in object-store. 
[\#3419](https://github.com/apache/arrow-rs/issues/3419) +- Mid-level `ArrowFlight` Client [\#3371](https://github.com/apache/arrow-rs/issues/3371) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Improve performance of the CSV parser [\#3338](https://github.com/apache/arrow-rs/issues/3338) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- `nullif` kernel no longer exported [\#3454](https://github.com/apache/arrow-rs/issues/3454) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- PrimitiveArray from ArrayData Unsound For IntervalArray [\#3439](https://github.com/apache/arrow-rs/issues/3439) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- LZ4-compressed PQ files unreadable by Pandas and ClickHouse [\#3433](https://github.com/apache/arrow-rs/issues/3433) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Parquet Record API: Cannot convert date before Unix epoch to json [\#3430](https://github.com/apache/arrow-rs/issues/3430) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- parquet-fromcsv with writer version v2 does not stop [\#3408](https://github.com/apache/arrow-rs/issues/3408) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +## [30.0.0](https://github.com/apache/arrow-rs/tree/30.0.0) (2022-12-29) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/29.0.0...30.0.0) + +**Breaking changes:** + +- Infer Parquet JSON Logical and Converted Type as UTF-8 [\#3376](https://github.com/apache/arrow-rs/pull/3376) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Use custom Any instead of prost\_types [\#3360](https://github.com/apache/arrow-rs/pull/3360) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Use bytes in arrow-flight [\#3359](https://github.com/apache/arrow-rs/pull/3359) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Add derived implementations of Clone and Debug for `ParquetObjectReader` [\#3381](https://github.com/apache/arrow-rs/issues/3381) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Speed up TrackedWrite [\#3366](https://github.com/apache/arrow-rs/issues/3366) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Is it possible for ArrowWriter to write key\_value\_metadata after write all records [\#3356](https://github.com/apache/arrow-rs/issues/3356) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add UnionArray test to arrow-pyarrow integration test [\#3346](https://github.com/apache/arrow-rs/issues/3346) +- Document / Deprecate arrow\_flight::utils::flight\_data\_from\_arrow\_batch [\#3312](https://github.com/apache/arrow-rs/issues/3312) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- \[FlightSQL\] Support HTTPs [\#3309](https://github.com/apache/arrow-rs/issues/3309) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support UnionArray in ffi [\#3304](https://github.com/apache/arrow-rs/issues/3304) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support for Azure Data Lake Storage Gen2 \(aka: ADLS Gen2\) in Object Store library [\#3283](https://github.com/apache/arrow-rs/issues/3283) +- Support casting from String to Decimal 
[\#3280](https://github.com/apache/arrow-rs/issues/3280) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Allow ArrowCSV writer to control the display of NULL values [\#3268](https://github.com/apache/arrow-rs/issues/3268) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- FlightSQL example is broken [\#3386](https://github.com/apache/arrow-rs/issues/3386) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- CSV Reader Bounds Incorrectly Handles Header [\#3364](https://github.com/apache/arrow-rs/issues/3364) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Incorrect output string from `try_to_type` [\#3350](https://github.com/apache/arrow-rs/issues/3350) +- Decimal arithmetic computation fails to run because decimal type equality [\#3344](https://github.com/apache/arrow-rs/issues/3344) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Pretty print not implemented for Map [\#3322](https://github.com/apache/arrow-rs/issues/3322) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- ILIKE Kernels Inconsistent Case Folding [\#3311](https://github.com/apache/arrow-rs/issues/3311) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- minor: Improve arrow-flight docs [\#3372](https://github.com/apache/arrow-rs/pull/3372) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) + +**Merged pull requests:** + +- Version 30.0.0 release notes and changelog [\#3406](https://github.com/apache/arrow-rs/pull/3406) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Ends ParquetRecordBatchStream when polling on StreamState::Error [\#3404](https://github.com/apache/arrow-rs/pull/3404) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- fix clippy issues [\#3398](https://github.com/apache/arrow-rs/pull/3398) ([Jimexist](https://github.com/Jimexist)) +- Upgrade multiversion to 0.7.1 [\#3396](https://github.com/apache/arrow-rs/pull/3396) ([viirya](https://github.com/viirya)) +- Make FlightSQL Support HTTPs [\#3388](https://github.com/apache/arrow-rs/pull/3388) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) +- Fix broken FlightSQL example [\#3387](https://github.com/apache/arrow-rs/pull/3387) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) +- Update prost-build [\#3385](https://github.com/apache/arrow-rs/pull/3385) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-arith \(\#2594\) [\#3384](https://github.com/apache/arrow-rs/pull/3384) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add derive for Clone and Debug for `ParquetObjectReader` [\#3382](https://github.com/apache/arrow-rs/pull/3382) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kszlim](https://github.com/kszlim)) +- Initial Mid-level `FlightClient` [\#3378](https://github.com/apache/arrow-rs/pull/3378) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] 
([alamb](https://github.com/alamb)) +- Document all features on docs.rs [\#3377](https://github.com/apache/arrow-rs/pull/3377) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-row \(\#2594\) [\#3375](https://github.com/apache/arrow-rs/pull/3375) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove unnecessary flush calls on TrackedWrite [\#3374](https://github.com/apache/arrow-rs/pull/3374) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Update proc-macro2 requirement from =1.0.47 to =1.0.49 [\#3369](https://github.com/apache/arrow-rs/pull/3369) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add CSV build\_buffered \(\#3338\) [\#3368](https://github.com/apache/arrow-rs/pull/3368) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: add append\_key\_value\_metadata [\#3367](https://github.com/apache/arrow-rs/pull/3367) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jiacai2050](https://github.com/jiacai2050)) +- Add csv-core based reader \(\#3338\) [\#3365](https://github.com/apache/arrow-rs/pull/3365) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Put BufWriter into TrackedWrite [\#3361](https://github.com/apache/arrow-rs/pull/3361) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Add CSV reader benchmark \(\#3338\) [\#3357](https://github.com/apache/arrow-rs/pull/3357) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use ArrayData::ptr\_eq in DictionaryTracker [\#3354](https://github.com/apache/arrow-rs/pull/3354) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Deprecate flight\_data\_from\_arrow\_batch [\#3353](https://github.com/apache/arrow-rs/pull/3353) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([Dandandan](https://github.com/Dandandan)) +- Fix incorrect output string from try\_to\_type [\#3351](https://github.com/apache/arrow-rs/pull/3351) ([viirya](https://github.com/viirya)) +- Fix unary\_dyn for decimal scalar arithmetic computation [\#3345](https://github.com/apache/arrow-rs/pull/3345) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add UnionArray test to arrow-pyarrow integration test [\#3343](https://github.com/apache/arrow-rs/pull/3343) ([viirya](https://github.com/viirya)) +- feat: configure null value in arrow csv writer [\#3342](https://github.com/apache/arrow-rs/pull/3342) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Optimize bulk writing of all blocks of bloom filter [\#3340](https://github.com/apache/arrow-rs/pull/3340) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Add MapArray to pretty print [\#3339](https://github.com/apache/arrow-rs/pull/3339) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Update prost-build 0.11.4 
[\#3334](https://github.com/apache/arrow-rs/pull/3334) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Faster Parquet Bloom Writer [\#3333](https://github.com/apache/arrow-rs/pull/3333) ([tustvold](https://github.com/tustvold)) +- Add bloom filter benchmark for parquet writer [\#3323](https://github.com/apache/arrow-rs/pull/3323) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- Add ASCII fast path for ILIKE scalar \(90% faster\) [\#3306](https://github.com/apache/arrow-rs/pull/3306) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support UnionArray in ffi [\#3305](https://github.com/apache/arrow-rs/pull/3305) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support casting from String to Decimal [\#3281](https://github.com/apache/arrow-rs/pull/3281) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- add more integration test for parquet bloom filter round trip tests [\#3210](https://github.com/apache/arrow-rs/pull/3210) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) ## [29.0.0](https://github.com/apache/arrow-rs/tree/29.0.0) (2022-12-09) [Full Changelog](https://github.com/apache/arrow-rs/compare/28.0.0...29.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d39458d967a..6eb7ebab8339 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,78 +19,80 @@ # Changelog -## [30.0.0](https://github.com/apache/arrow-rs/tree/30.0.0) (2022-12-29) +## [31.0.0](https://github.com/apache/arrow-rs/tree/31.0.0) (2023-01-12) -[Full Changelog](https://github.com/apache/arrow-rs/compare/29.0.0...30.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/30.0.1...31.0.0) **Breaking changes:** -- Infer Parquet JSON Logical and Converted Type as UTF-8 [\#3376](https://github.com/apache/arrow-rs/pull/3376) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Use custom Any instead of prost\_types [\#3360](https://github.com/apache/arrow-rs/pull/3360) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Use bytes in arrow-flight [\#3359](https://github.com/apache/arrow-rs/pull/3359) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- support RFC3339 style timestamps in `arrow-json` [\#3449](https://github.com/apache/arrow-rs/pull/3449) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JayjeetAtGithub](https://github.com/JayjeetAtGithub)) +- Improve arrow flight batch splitting and naming [\#3444](https://github.com/apache/arrow-rs/pull/3444) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Parquet record API: timestamp as signed integer [\#3437](https://github.com/apache/arrow-rs/pull/3437) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([ByteBaker](https://github.com/ByteBaker)) +- Support decimal int32/64 for writer [\#3431](https://github.com/apache/arrow-rs/pull/3431) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liukun4515](https://github.com/liukun4515)) **Implemented enhancements:** -- Add derived implementations of Clone and Debug for `ParquetObjectReader` 
[\#3381](https://github.com/apache/arrow-rs/issues/3381) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Speed up TrackedWrite [\#3366](https://github.com/apache/arrow-rs/issues/3366) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Is it possible for ArrowWriter to write key\_value\_metadata after write all records [\#3356](https://github.com/apache/arrow-rs/issues/3356) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Add UnionArray test to arrow-pyarrow integration test [\#3346](https://github.com/apache/arrow-rs/issues/3346) -- Document / Deprecate arrow\_flight::utils::flight\_data\_from\_arrow\_batch [\#3312](https://github.com/apache/arrow-rs/issues/3312) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- \[FlightSQL\] Support HTTPs [\#3309](https://github.com/apache/arrow-rs/issues/3309) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Support UnionArray in ffi [\#3304](https://github.com/apache/arrow-rs/issues/3304) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add support for Azure Data Lake Storage Gen2 \(aka: ADLS Gen2\) in Object Store library [\#3283](https://github.com/apache/arrow-rs/issues/3283) -- Support casting from String to Decimal [\#3280](https://github.com/apache/arrow-rs/issues/3280) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Allow ArrowCSV writer to control the display of NULL values [\#3268](https://github.com/apache/arrow-rs/issues/3268) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support casting Date32 to timestamp [\#3504](https://github.com/apache/arrow-rs/issues/3504) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support casting strings like `'2001-01-01'` to timestamp [\#3492](https://github.com/apache/arrow-rs/issues/3492) +- Allow providing service account key directly when building GCP object store client [\#3488](https://github.com/apache/arrow-rs/issues/3488) +- CLI to "rewrite" parquet files [\#3476](https://github.com/apache/arrow-rs/issues/3476) +- Add more dictionary value type support to `build_compare` [\#3465](https://github.com/apache/arrow-rs/issues/3465) +- Allow `concat_batches` to take non owned RecordBatch [\#3456](https://github.com/apache/arrow-rs/issues/3456) +- Release Arrow `30.0.1` \(maintenance release for `30.0.0`\) [\#3455](https://github.com/apache/arrow-rs/issues/3455) +- Add string comparisons \(starts\_with, ends\_with, and contains\) to kernel [\#3442](https://github.com/apache/arrow-rs/issues/3442) +- make\_builder Loses Timezone and Decimal Scale Information [\#3435](https://github.com/apache/arrow-rs/issues/3435) +- Use RFC3339 style timestamps in arrow-json [\#3416](https://github.com/apache/arrow-rs/issues/3416) +- ArrayData`get_slice_memory_size` or similar [\#3407](https://github.com/apache/arrow-rs/issues/3407) **Fixed bugs:** -- FlightSQL example is broken [\#3386](https://github.com/apache/arrow-rs/issues/3386) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- CSV Reader Bounds Incorrectly Handles Header [\#3364](https://github.com/apache/arrow-rs/issues/3364) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Incorrect output string from `try_to_type` [\#3350](https://github.com/apache/arrow-rs/issues/3350) -- Decimal arithmetic computation fails to run because decimal type equality [\#3344](https://github.com/apache/arrow-rs/issues/3344) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Pretty print not implemented for Map [\#3322](https://github.com/apache/arrow-rs/issues/3322) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- ILIKE Kernels Inconsistent Case Folding [\#3311](https://github.com/apache/arrow-rs/issues/3311) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Sliced batch w/ bool column doesn't roundtrip through IPC [\#3496](https://github.com/apache/arrow-rs/issues/3496) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- take kernel on List array introduces nulls instead of empty lists [\#3471](https://github.com/apache/arrow-rs/issues/3471) +- Infinite Loop If Skipping More CSV Lines than Present [\#3469](https://github.com/apache/arrow-rs/issues/3469) -**Documentation updates:** +**Closed issues:** -- minor: Improve arrow-flight docs [\#3372](https://github.com/apache/arrow-rs/pull/3372) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- object\_store: temporary aws credentials not refreshed? [\#3446](https://github.com/apache/arrow-rs/issues/3446) **Merged pull requests:** -- Version 30.0.0 release notes and changelog [\#3406](https://github.com/apache/arrow-rs/pull/3406) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Ends ParquetRecordBatchStream when polling on StreamState::Error [\#3404](https://github.com/apache/arrow-rs/pull/3404) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) -- fix clippy issues [\#3398](https://github.com/apache/arrow-rs/pull/3398) ([Jimexist](https://github.com/Jimexist)) -- Upgrade multiversion to 0.7.1 [\#3396](https://github.com/apache/arrow-rs/pull/3396) ([viirya](https://github.com/viirya)) -- Make FlightSQL Support HTTPs [\#3388](https://github.com/apache/arrow-rs/pull/3388) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) -- Fix broken FlightSQL example [\#3387](https://github.com/apache/arrow-rs/pull/3387) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([viirya](https://github.com/viirya)) -- Update prost-build [\#3385](https://github.com/apache/arrow-rs/pull/3385) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Split out arrow-arith \(\#2594\) [\#3384](https://github.com/apache/arrow-rs/pull/3384) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add derive for Clone and Debug for `ParquetObjectReader` [\#3382](https://github.com/apache/arrow-rs/pull/3382) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kszlim](https://github.com/kszlim)) -- Initial Mid-level `FlightClient` [\#3378](https://github.com/apache/arrow-rs/pull/3378) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Document all features on docs.rs [\#3377](https://github.com/apache/arrow-rs/pull/3377) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Split out arrow-row \(\#2594\) 
[\#3375](https://github.com/apache/arrow-rs/pull/3375) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Remove unnecessary flush calls on TrackedWrite [\#3374](https://github.com/apache/arrow-rs/pull/3374) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) -- Update proc-macro2 requirement from =1.0.47 to =1.0.49 [\#3369](https://github.com/apache/arrow-rs/pull/3369) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add CSV build\_buffered \(\#3338\) [\#3368](https://github.com/apache/arrow-rs/pull/3368) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat: add append\_key\_value\_metadata [\#3367](https://github.com/apache/arrow-rs/pull/3367) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jiacai2050](https://github.com/jiacai2050)) -- Add csv-core based reader \(\#3338\) [\#3365](https://github.com/apache/arrow-rs/pull/3365) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Put BufWriter into TrackedWrite [\#3361](https://github.com/apache/arrow-rs/pull/3361) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) -- Add CSV reader benchmark \(\#3338\) [\#3357](https://github.com/apache/arrow-rs/pull/3357) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use ArrayData::ptr\_eq in DictionaryTracker [\#3354](https://github.com/apache/arrow-rs/pull/3354) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Deprecate flight\_data\_from\_arrow\_batch [\#3353](https://github.com/apache/arrow-rs/pull/3353) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([Dandandan](https://github.com/Dandandan)) -- Fix incorrect output string from try\_to\_type [\#3351](https://github.com/apache/arrow-rs/pull/3351) ([viirya](https://github.com/viirya)) -- Fix unary\_dyn for decimal scalar arithmetic computation [\#3345](https://github.com/apache/arrow-rs/pull/3345) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add UnionArray test to arrow-pyarrow integration test [\#3343](https://github.com/apache/arrow-rs/pull/3343) ([viirya](https://github.com/viirya)) -- feat: configure null value in arrow csv writer [\#3342](https://github.com/apache/arrow-rs/pull/3342) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Optimize bulk writing of all blocks of bloom filter [\#3340](https://github.com/apache/arrow-rs/pull/3340) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) -- Add MapArray to pretty print [\#3339](https://github.com/apache/arrow-rs/pull/3339) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Update prost-build 0.11.4 [\#3334](https://github.com/apache/arrow-rs/pull/3334) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Faster Parquet Bloom Writer [\#3333](https://github.com/apache/arrow-rs/pull/3333) ([tustvold](https://github.com/tustvold)) -- Add bloom filter benchmark for parquet writer 
[\#3323](https://github.com/apache/arrow-rs/pull/3323) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) -- Add ASCII fast path for ILIKE scalar \(90% faster\) [\#3306](https://github.com/apache/arrow-rs/pull/3306) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support UnionArray in ffi [\#3305](https://github.com/apache/arrow-rs/pull/3305) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Support casting from String to Decimal [\#3281](https://github.com/apache/arrow-rs/pull/3281) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- add more integration test for parquet bloom filter round trip tests [\#3210](https://github.com/apache/arrow-rs/pull/3210) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jimexist](https://github.com/Jimexist)) +- Additional nullif re-export [\#3515](https://github.com/apache/arrow-rs/pull/3515) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Enable cast Date32 to Timestamp [\#3508](https://github.com/apache/arrow-rs/pull/3508) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Update prost-build requirement from =0.11.5 to =0.11.6 [\#3507](https://github.com/apache/arrow-rs/pull/3507) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- minor fix for the comments [\#3505](https://github.com/apache/arrow-rs/pull/3505) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Fix DataTypeLayout for LargeList [\#3503](https://github.com/apache/arrow-rs/pull/3503) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add string comparisons \(starts\_with, ends\_with, and contains\) to kernel [\#3502](https://github.com/apache/arrow-rs/pull/3502) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([snmvaughan](https://github.com/snmvaughan)) +- Add a function to get memory size of array slice [\#3501](https://github.com/apache/arrow-rs/pull/3501) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Fix IPCWriter for Sliced BooleanArray [\#3498](https://github.com/apache/arrow-rs/pull/3498) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Fix: Added support to cast string without time [\#3494](https://github.com/apache/arrow-rs/pull/3494) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gaelwjl](https://github.com/gaelwjl)) +- Fix negative interval prettyprint [\#3491](https://github.com/apache/arrow-rs/pull/3491) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Fixes a broken link in the arrow lib.rs rustdoc [\#3487](https://github.com/apache/arrow-rs/pull/3487) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([AdamGS](https://github.com/AdamGS)) +- Refactoring build\_compare for decimal and using downcast\_primitive [\#3484](https://github.com/apache/arrow-rs/pull/3484) ([viirya](https://github.com/viirya)) +- Add tests for record batch size splitting logic in FlightClient [\#3481](https://github.com/apache/arrow-rs/pull/3481) 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- change `concat_batches` parameter to non owned reference [\#3480](https://github.com/apache/arrow-rs/pull/3480) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- feat: add `parquet-rewrite` CLI [\#3477](https://github.com/apache/arrow-rs/pull/3477) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([crepererum](https://github.com/crepererum)) +- Preserve empty list array elements in take kernel [\#3473](https://github.com/apache/arrow-rs/pull/3473) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonmmease](https://github.com/jonmmease)) +- Add a test for stream writer for writing sliced array [\#3472](https://github.com/apache/arrow-rs/pull/3472) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix CSV infinite loop and improve error messages [\#3470](https://github.com/apache/arrow-rs/pull/3470) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add more dictionary value type support to `build_compare` [\#3466](https://github.com/apache/arrow-rs/pull/3466) ([viirya](https://github.com/viirya)) +- Add tests for `FlightClient::{list_flights, list_actions, do_action, get_schema}` [\#3463](https://github.com/apache/arrow-rs/pull/3463) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Minor: add ticket links to failing ipc integration tests [\#3461](https://github.com/apache/arrow-rs/pull/3461) ([alamb](https://github.com/alamb)) +- feat: `column_name` based index access for `RecordBatch` and `StructArray` [\#3458](https://github.com/apache/arrow-rs/pull/3458) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Support Decimal256 in FFI [\#3453](https://github.com/apache/arrow-rs/pull/3453) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Remove multiversion dependency [\#3452](https://github.com/apache/arrow-rs/pull/3452) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Re-export nullif kernel [\#3451](https://github.com/apache/arrow-rs/pull/3451) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Meaningful error message for map builder with null keys [\#3450](https://github.com/apache/arrow-rs/pull/3450) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Parquet writer v2: clear buffer after page flush [\#3447](https://github.com/apache/arrow-rs/pull/3447) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([askoa](https://github.com/askoa)) +- Verify ArrayData::data\_type compatible in PrimitiveArray::from [\#3440](https://github.com/apache/arrow-rs/pull/3440) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Preserve DataType metadata in make\_builder [\#3438](https://github.com/apache/arrow-rs/pull/3438) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Consolidate arrow ipc tests and increase coverage [\#3427](https://github.com/apache/arrow-rs/pull/3427) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Generic bytes 
dictionary builder [\#3426](https://github.com/apache/arrow-rs/pull/3426) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Minor: Improve docs for arrow-ipc, remove clippy ignore [\#3421](https://github.com/apache/arrow-rs/pull/3421) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- refactor: convert `*like_dyn`, `*like_utf8_scalar_dyn` and `*like_dict` functions to macros [\#3411](https://github.com/apache/arrow-rs/pull/3411) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Add parquet-index binary [\#3405](https://github.com/apache/arrow-rs/pull/3405) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Complete mid-level `FlightClient` [\#3402](https://github.com/apache/arrow-rs/pull/3402) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Implement `RecordBatch` \<--\> `FlightData` encode/decode + tests [\#3391](https://github.com/apache/arrow-rs/pull/3391) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Provide `into_builder` for bytearray [\#3326](https://github.com/apache/arrow-rs/pull/3326) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index 0da34ea15bb0..67f82c05821a 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-arith" -version = "30.0.0" +version = "31.0.0" description = "Arrow arithmetic kernels" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "30.0.0", path = "../arrow-array" } -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } -arrow-data = { version = "30.0.0", path = "../arrow-data" } -arrow-schema = { version = "30.0.0", path = "../arrow-schema" } +arrow-array = { version = "31.0.0", path = "../arrow-array" } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow-data = { version = "31.0.0", path = "../arrow-data" } +arrow-schema = { version = "31.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 06a84cad4e7a..155dcc412c25 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-array" -version = "30.0.0" +version = "31.0.0" description = "Array abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,9 +45,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "30.0.0", path = "../arrow-schema" } -arrow-data = { version = "30.0.0", path = "../arrow-data" } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "31.0.0", path = 
"../arrow-schema" } +arrow-data = { version = "31.0.0", path = "../arrow-data" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index 2df41b537f5f..bef5a7a8a875 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "30.0.0" +version = "31.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index c94a5b77a045..cab1edcccdf2 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-cast" -version = "30.0.0" +version = "31.0.0" description = "Cast kernel and utilities for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "30.0.0", path = "../arrow-array" } -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } -arrow-data = { version = "30.0.0", path = "../arrow-data" } -arrow-schema = { version = "30.0.0", path = "../arrow-schema" } -arrow-select = { version = "30.0.0", path = "../arrow-select" } +arrow-array = { version = "31.0.0", path = "../arrow-array" } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow-data = { version = "31.0.0", path = "../arrow-data" } +arrow-schema = { version = "31.0.0", path = "../arrow-schema" } +arrow-select = { version = "31.0.0", path = "../arrow-select" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 0b6e6035c0b5..674e159074d6 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-csv" -version = "30.0.0" +version = "31.0.0" description = "Support for parsing CSV format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "30.0.0", path = "../arrow-array" } -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "30.0.0", path = "../arrow-cast" } -arrow-data = { version = "30.0.0", path = "../arrow-data" } -arrow-schema = { version = "30.0.0", path = "../arrow-schema" } +arrow-array = { version = "31.0.0", path = "../arrow-array" } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "31.0.0", path = "../arrow-cast" } +arrow-data = { version = "31.0.0", path = "../arrow-data" } +arrow-schema = { version = "31.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } csv-core = { version = "0.1"} diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index 9df8bd0b1027..fc5839522e82 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,7 +17,7 @@ [package] name = 
"arrow-data" -version = "30.0.0" +version = "31.0.0" description = "Array data abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,8 +45,8 @@ force_validate = [] [dependencies] -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "30.0.0", path = "../arrow-schema" } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "31.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index d88880468756..1571c29b0a8d 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "30.0.0" +version = "31.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,12 +27,12 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow-array = { version = "30.0.0", path = "../arrow-array" } -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow-array = { version = "31.0.0", path = "../arrow-array" } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } # Cast is needed to work around https://github.com/apache/arrow-rs/issues/3389 -arrow-cast = { version = "30.0.0", path = "../arrow-cast" } -arrow-ipc = { version = "30.0.0", path = "../arrow-ipc" } -arrow-schema = { version = "30.0.0", path = "../arrow-schema" } +arrow-cast = { version = "31.0.0", path = "../arrow-cast" } +arrow-ipc = { version = "31.0.0", path = "../arrow-ipc" } +arrow-schema = { version = "31.0.0", path = "../arrow-schema" } base64 = { version = "0.21", default-features = false, features = ["std"] } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } @@ -50,7 +50,7 @@ flight-sql-experimental = [] tls = ["tonic/tls"] [dev-dependencies] -arrow = { version = "30.0.0", path = "../arrow", features = ["prettyprint"] } +arrow = { version = "31.0.0", path = "../arrow", features = ["prettyprint"] } tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } tower = "0.4.13" diff --git a/arrow-flight/README.md b/arrow-flight/README.md index df3a0839532d..5159d5499fa2 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "30.0.0" +arrow-flight = "31.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. 
diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index df28ffce5d4a..6177c7b37b82 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-integration-test" -version = "30.0.0" +version = "31.0.0" description = "Support for the Apache Arrow JSON test data format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,8 +38,8 @@ path = "src/lib.rs" bench = false [dependencies] -arrow = { version = "30.0.0", path = "../arrow", default-features = false } -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } +arrow = { version = "31.0.0", path = "../arrow", default-features = false } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 045931867726..3f07da7d3fa8 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests (NOT PUBLISHED TO crates.io)" -version = "30.0.0" +version = "31.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index 31ae90929a7e..eb39c8ed45ab 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ipc" -version = "30.0.0" +version = "31.0.0" description = "Support for the Arrow IPC format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "30.0.0", path = "../arrow-array" } -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "30.0.0", path = "../arrow-cast" } -arrow-data = { version = "30.0.0", path = "../arrow-data" } -arrow-schema = { version = "30.0.0", path = "../arrow-schema" } +arrow-array = { version = "31.0.0", path = "../arrow-array" } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "31.0.0", path = "../arrow-cast" } +arrow-data = { version = "31.0.0", path = "../arrow-data" } +arrow-schema = { version = "31.0.0", path = "../arrow-schema" } flatbuffers = { version = "22.9.2", default-features = false, features = ["thiserror"] } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.12.0", default-features = false, optional = true } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 88323105d0fe..9b9095b27cc1 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-json" -version = "30.0.0" +version = "31.0.0" description = "Support for parsing JSON format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "30.0.0", path = "../arrow-array" } -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "30.0.0", path = 
"../arrow-cast" } -arrow-data = { version = "30.0.0", path = "../arrow-data" } -arrow-schema = { version = "30.0.0", path = "../arrow-schema" } +arrow-array = { version = "31.0.0", path = "../arrow-array" } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "31.0.0", path = "../arrow-cast" } +arrow-data = { version = "31.0.0", path = "../arrow-data" } +arrow-schema = { version = "31.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index 3a2096ea5651..a8f9fcdf82ae 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ord" -version = "30.0.0" +version = "31.0.0" description = "Ordering kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "30.0.0", path = "../arrow-array" } -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } -arrow-data = { version = "30.0.0", path = "../arrow-data" } -arrow-schema = { version = "30.0.0", path = "../arrow-schema" } -arrow-select = { version = "30.0.0", path = "../arrow-select" } +arrow-array = { version = "31.0.0", path = "../arrow-array" } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow-data = { version = "31.0.0", path = "../arrow-data" } +arrow-schema = { version = "31.0.0", path = "../arrow-schema" } +arrow-select = { version = "31.0.0", path = "../arrow-select" } num = { version = "0.4", default-features = false, features = ["std"] } [dev-dependencies] diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 9a1daad36849..e1fa90836f61 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "30.0.0" +version = "31.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "30.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "31.0.0", features = ["pyarrow"] } pyo3 = { version = "0.17", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 99b1eb150720..436f6d04b427 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-row" -version = "30.0.0" +version = "31.0.0" description = "Arrow row format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,17 +44,17 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "30.0.0", path = "../arrow-array" } -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } -arrow-data = { version = "30.0.0", path = "../arrow-data" } -arrow-schema = { version = "30.0.0", path = "../arrow-schema" } +arrow-array = { version = "31.0.0", path = 
"../arrow-array" } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow-data = { version = "31.0.0", path = "../arrow-data" } +arrow-schema = { version = "31.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } hashbrown = { version = "0.13", default-features = false } [dev-dependencies] -arrow-cast = { version = "30.0.0", path = "../arrow-cast" } -arrow-ord = { version = "30.0.0", path = "../arrow-ord" } +arrow-cast = { version = "31.0.0", path = "../arrow-cast" } +arrow-ord = { version = "31.0.0", path = "../arrow-ord" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } [features] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index d6fddd1e0916..8ccf565bf5e5 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-schema" -version = "30.0.0" +version = "31.0.0" description = "Defines the logical types for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml index a609f72a69c0..a1ba58900b65 100644 --- a/arrow-select/Cargo.toml +++ b/arrow-select/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-select" -version = "30.0.0" +version = "31.0.0" description = "Selection kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } -arrow-data = { version = "30.0.0", path = "../arrow-data" } -arrow-schema = { version = "30.0.0", path = "../arrow-schema" } -arrow-array = { version = "30.0.0", path = "../arrow-array" } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow-data = { version = "31.0.0", path = "../arrow-data" } +arrow-schema = { version = "31.0.0", path = "../arrow-schema" } +arrow-array = { version = "31.0.0", path = "../arrow-array" } num = { version = "0.4", default-features = false, features = ["std"] } [features] diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index c6cc2f2a32dc..f62ec919d5fc 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-string" -version = "30.0.0" +version = "31.0.0" description = "String kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } -arrow-data = { version = "30.0.0", path = "../arrow-data" } -arrow-schema = { version = "30.0.0", path = "../arrow-schema" } -arrow-array = { version = "30.0.0", path = "../arrow-array" } -arrow-select = { version = "30.0.0", path = "../arrow-select" } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow-data = { version = "31.0.0", path = "../arrow-data" } +arrow-schema = { version = "31.0.0", path = "../arrow-schema" } +arrow-array = { version = "31.0.0", path = "../arrow-array" } +arrow-select = { version = "31.0.0", path = "../arrow-select" } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index d83637cbcea1..8719cba0effe 100644 --- a/arrow/Cargo.toml +++ 
b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "30.0.0" +version = "31.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,19 +45,19 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-arith = { version = "30.0.0", path = "../arrow-arith" } -arrow-array = { version = "30.0.0", path = "../arrow-array" } -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "30.0.0", path = "../arrow-cast" } -arrow-csv = { version = "30.0.0", path = "../arrow-csv", optional = true } -arrow-data = { version = "30.0.0", path = "../arrow-data" } -arrow-ipc = { version = "30.0.0", path = "../arrow-ipc", optional = true } -arrow-json = { version = "30.0.0", path = "../arrow-json", optional = true } -arrow-ord = { version = "30.0.0", path = "../arrow-ord" } -arrow-row = { version = "30.0.0", path = "../arrow-row" } -arrow-schema = { version = "30.0.0", path = "../arrow-schema" } -arrow-select = { version = "30.0.0", path = "../arrow-select" } -arrow-string = { version = "30.0.0", path = "../arrow-string" } +arrow-arith = { version = "31.0.0", path = "../arrow-arith" } +arrow-array = { version = "31.0.0", path = "../arrow-array" } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "31.0.0", path = "../arrow-cast" } +arrow-csv = { version = "31.0.0", path = "../arrow-csv", optional = true } +arrow-data = { version = "31.0.0", path = "../arrow-data" } +arrow-ipc = { version = "31.0.0", path = "../arrow-ipc", optional = true } +arrow-json = { version = "31.0.0", path = "../arrow-json", optional = true } +arrow-ord = { version = "31.0.0", path = "../arrow-ord" } +arrow-row = { version = "31.0.0", path = "../arrow-row" } +arrow-schema = { version = "31.0.0", path = "../arrow-schema" } +arrow-select = { version = "31.0.0", path = "../arrow-select" } +arrow-string = { version = "31.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } comfy-table = { version = "6.0", optional = true, default-features = false } diff --git a/arrow/README.md b/arrow/README.md index 441e65ac3fc3..d0c7785821e6 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `30.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `31.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. 
## Feature Flags diff --git a/dev/release/README.md b/dev/release/README.md index fd7fa8c82adf..1fcc0862f6bc 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. -sed -i '' -e 's/14.0.0/30.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/31.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index ef87d20f4c30..236809ed5f85 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="29.0.0" -FUTURE_RELEASE="30.0.0" +SINCE_TAG="30.0.0" +FUTURE_RELEASE="31.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index ade8d95210f5..b395a5bada4b 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "30.0.0" +version = "31.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -35,14 +35,14 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "30.0.0", path = "../arrow-array", default-features = false, optional = true } -arrow-buffer = { version = "30.0.0", path = "../arrow-buffer", default-features = false, optional = true } -arrow-cast = { version = "30.0.0", path = "../arrow-cast", default-features = false, optional = true } -arrow-csv = { version = "30.0.0", path = "../arrow-csv", default-features = false, optional = true } -arrow-data = { version = "30.0.0", path = "../arrow-data", default-features = false, optional = true } -arrow-schema = { version = "30.0.0", path = "../arrow-schema", default-features = false, optional = true } -arrow-select = { version = "30.0.0", path = "../arrow-select", default-features = false, optional = true } -arrow-ipc = { version = "30.0.0", path = "../arrow-ipc", default-features = false, optional = true } +arrow-array = { version = "31.0.0", path = "../arrow-array", default-features = false, optional = true } +arrow-buffer = { version = "31.0.0", path = "../arrow-buffer", default-features = false, optional = true } +arrow-cast = { version = "31.0.0", path = "../arrow-cast", default-features = false, optional = true } +arrow-csv = { version = "31.0.0", path = "../arrow-csv", default-features = false, optional = true } +arrow-data = { version = "31.0.0", path = "../arrow-data", default-features = false, optional = true } +arrow-schema = { version = "31.0.0", path = "../arrow-schema", default-features = false, optional = true } +arrow-select = { version = "31.0.0", path = "../arrow-select", default-features = false, optional = true } +arrow-ipc = { version = "31.0.0", path = "../arrow-ipc", default-features = false, optional = true } object_store = { version = "0.5", path = "../object_store", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } @@ -76,7 +76,7 @@ flate2 = { version = "1.0", default-features = false, features = 
["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.12", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "30.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "31.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index a22503bcf295..8234d02d4c49 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "30.0.0" +version = "31.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", features = ["extra-traits"] } -parquet = { path = "../parquet", version = "30.0.0", default-features = false } +parquet = { path = "../parquet", version = "31.0.0", default-features = false } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index b10672b84a57..72e2568e4b19 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "30.0.0" -parquet_derive = "30.0.0" +parquet = "31.0.0" +parquet_derive = "31.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index 59cfef593899..47f5a54b3bab 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "30.0.0" +version = "31.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "30.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "30.0.0", default-features = false } +parquet = { path = "../parquet", version = "31.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "31.0.0", default-features = false } chrono = { version="0.4.23", default-features = false, features = [ "clock" ] } From d49cd21f9c5ac27961041f7a2a9dbf4cea9708de Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 12 Jan 2023 23:50:05 -0800 Subject: [PATCH 0507/1411] Make consistent behavior on zeros equality on floating point types (#3510) * Treat positive and negative float zeros as equal * Update doc * Add test * Make build_compare consistent with comparison kernels --- arrow-ord/src/comparison.rs | 67 +++++++++++++++++++++++++++++++++++++ arrow-ord/src/ord.rs | 36 ++++---------------- 2 files changed, 74 insertions(+), 29 deletions(-) diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 80c8b6b1c393..4754aeb1f75a 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -628,6 +628,8 @@ macro_rules! 
dyn_compare_utf8_scalar { /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn eq_dyn_scalar(left: &dyn Array, right: T) -> Result where @@ -647,6 +649,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn lt_dyn_scalar(left: &dyn Array, right: T) -> Result where @@ -666,6 +670,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn lt_eq_dyn_scalar(left: &dyn Array, right: T) -> Result where @@ -685,6 +691,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn gt_dyn_scalar(left: &dyn Array, right: T) -> Result where @@ -704,6 +712,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn gt_eq_dyn_scalar(left: &dyn Array, right: T) -> Result where @@ -723,6 +733,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. 
pub fn neq_dyn_scalar(left: &dyn Array, right: T) -> Result where @@ -2098,6 +2110,8 @@ where /// /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. /// /// # Example @@ -2141,6 +2155,8 @@ pub fn eq_dyn(left: &dyn Array, right: &dyn Array) -> Result Result Result Result( left: &PrimitiveArray, @@ -2386,6 +2412,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn eq_scalar( left: &PrimitiveArray, @@ -2418,6 +2446,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn neq( left: &PrimitiveArray, @@ -2438,6 +2468,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn neq_scalar( left: &PrimitiveArray, @@ -2459,6 +2491,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn lt( left: &PrimitiveArray, @@ -2480,6 +2514,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. 
pub fn lt_scalar( left: &PrimitiveArray, @@ -2501,6 +2537,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn lt_eq( left: &PrimitiveArray, @@ -2522,6 +2560,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn lt_eq_scalar( left: &PrimitiveArray, @@ -2543,6 +2583,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn gt( left: &PrimitiveArray, @@ -2564,6 +2606,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn gt_scalar( left: &PrimitiveArray, @@ -2585,6 +2629,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. pub fn gt_eq( left: &PrimitiveArray, @@ -2606,6 +2652,8 @@ where /// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros are different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. 
pub fn gt_eq_scalar( left: &PrimitiveArray, @@ -5828,6 +5876,25 @@ mod tests { assert_eq!(e, r); } + #[test] + #[cfg(not(feature = "simd"))] + fn test_floating_zeros() { + let a = Float32Array::from(vec![0.0_f32, -0.0]); + let b = Float32Array::from(vec![-0.0_f32, 0.0]); + + let result = eq_dyn(&a, &b).unwrap(); + let excepted = BooleanArray::from(vec![false, false]); + assert_eq!(excepted, result); + + let result = eq_dyn_scalar(&a, 0.0).unwrap(); + let excepted = BooleanArray::from(vec![true, false]); + assert_eq!(excepted, result); + + let result = eq_dyn_scalar(&a, -0.0).unwrap(); + let excepted = BooleanArray::from(vec![false, true]); + assert_eq!(excepted, result); + } + #[derive(Debug)] struct ToType {} diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index b7737c6de61f..00b6668adaf9 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -21,32 +21,21 @@ use arrow_array::types::*; use arrow_array::*; use arrow_buffer::ArrowNativeType; use arrow_schema::{ArrowError, DataType}; -use num::Float; use std::cmp::Ordering; /// Compare the values at two arbitrary indices in two arrays. pub type DynComparator = Box Ordering + Send + Sync>; -/// compares two floats, placing NaNs at last -fn cmp_nans_last(a: &T, b: &T) -> Ordering { - match (a.is_nan(), b.is_nan()) { - (true, true) => Ordering::Equal, - (true, false) => Ordering::Greater, - (false, true) => Ordering::Less, - _ => a.partial_cmp(b).unwrap(), - } -} - fn compare_primitives( left: &dyn Array, right: &dyn Array, ) -> DynComparator where - T::Native: Ord, + T::Native: ArrowNativeTypeOp, { let left: PrimitiveArray = PrimitiveArray::from(left.data().clone()); let right: PrimitiveArray = PrimitiveArray::from(right.data().clone()); - Box::new(move |i, j| left.value(i).cmp(&right.value(j))) + Box::new(move |i, j| left.value(i).compare(right.value(j))) } fn compare_boolean(left: &dyn Array, right: &dyn Array) -> DynComparator { @@ -56,18 +45,6 @@ fn compare_boolean(left: &dyn Array, right: &dyn Array) -> DynComparator { Box::new(move |i, j| left.value(i).cmp(&right.value(j))) } -fn compare_float( - left: &dyn Array, - right: &dyn Array, -) -> DynComparator -where - T::Native: Float, -{ - let left: PrimitiveArray = PrimitiveArray::from(left.data().clone()); - let right: PrimitiveArray = PrimitiveArray::from(right.data().clone()); - Box::new(move |i, j| cmp_nans_last(&left.value(i), &right.value(j))) -} - fn compare_string(left: &dyn Array, right: &dyn Array) -> DynComparator where T: OffsetSizeTrait, @@ -197,8 +174,8 @@ pub fn build_compare( (Int16, Int16) => compare_primitives::(left, right), (Int32, Int32) => compare_primitives::(left, right), (Int64, Int64) => compare_primitives::(left, right), - (Float32, Float32) => compare_float::(left, right), - (Float64, Float64) => compare_float::(left, right), + (Float32, Float32) => compare_primitives::(left, right), + (Float64, Float64) => compare_primitives::(left, right), (Decimal128(_, _), Decimal128(_, _)) => { compare_primitives::(left, right) } @@ -372,6 +349,7 @@ pub mod tests { let cmp = build_compare(&array, &array).unwrap(); assert_eq!(Ordering::Less, (cmp)(0, 1)); + assert_eq!(Ordering::Equal, (cmp)(1, 1)); } #[test] @@ -380,8 +358,8 @@ pub mod tests { let cmp = build_compare(&array, &array).unwrap(); - assert_eq!(Ordering::Equal, (cmp)(0, 1)); - assert_eq!(Ordering::Equal, (cmp)(1, 0)); + assert_eq!(Ordering::Less, (cmp)(0, 1)); + assert_eq!(Ordering::Greater, (cmp)(1, 0)); } #[test] From 2191e723de3796638acbee6bdcbc246ea43d9a48 Mon Sep 17 00:00:00 2001 From: Kun 
Liu Date: Fri, 13 Jan 2023 15:57:30 +0800 Subject: [PATCH 0508/1411] use the decimal_type to replace the decimal logical (#3522) --- parquet/src/arrow/schema/primitive.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index bd56583a8f77..e95db2b033e5 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -235,14 +235,8 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32, scale: i32) -> Result Ok(DataType::Binary), (None, ConvertedType::ENUM) => Ok(DataType::Binary), (None, ConvertedType::UTF8) => Ok(DataType::Utf8), - (Some(LogicalType::Decimal { precision, scale }), _) => Ok(DataType::Decimal128( - precision.try_into().unwrap(), - scale.try_into().unwrap(), - )), - (None, ConvertedType::DECIMAL) => Ok(DataType::Decimal128( - precision.try_into().unwrap(), - scale.try_into().unwrap(), - )), + (Some(LogicalType::Decimal { scale: s, precision: p }), _) => decimal_type(s, p), + (None, ConvertedType::DECIMAL) => decimal_type(scale, precision), (logical, converted) => Err(arrow_err!( "Unable to convert parquet BYTE_ARRAY logical type {:?} or converted type {}", logical, From 0650a3a7726e992cf9d253165b79fbb00a3d9222 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 13 Jan 2023 09:28:06 +0100 Subject: [PATCH 0509/1411] Fix reading null booleans from CSV (#3523) * Fix reading null booleans from CSV * Clippy * Review feedback --- arrow-csv/src/reader/mod.rs | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index bc6b016ec9cf..0c7bfa897fd4 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -866,6 +866,9 @@ fn build_boolean_array( .enumerate() .map(|(row_index, row)| { let s = row.get(col_idx); + if s.is_empty() { + return Ok(None); + } let parsed = parse_bool(s); match parsed { Some(e) => Ok(Some(e)), @@ -1122,6 +1125,7 @@ mod tests { use std::io::{Cursor, Write}; use tempfile::NamedTempFile; + use arrow_array::cast::as_boolean_array; use chrono::prelude::*; #[test] @@ -2067,4 +2071,32 @@ mod tests { assert_eq!(b.num_rows(), expected, "{}", idx); } } + + #[test] + fn test_null_boolean() { + let csv = "true,false\nFalse,True\n,True\nFalse,"; + let b = ReaderBuilder::new() + .build_buffered(Cursor::new(csv.as_bytes())) + .unwrap() + .next() + .unwrap() + .unwrap(); + + assert_eq!(b.num_rows(), 4); + assert_eq!(b.num_columns(), 2); + + let c = as_boolean_array(b.column(0)); + assert_eq!(c.null_count(), 1); + assert!(c.value(0)); + assert!(!c.value(1)); + assert!(c.is_null(2)); + assert!(!c.value(3)); + + let c = as_boolean_array(b.column(1)); + assert_eq!(c.null_count(), 1); + assert!(!c.value(0)); + assert!(c.value(1)); + assert!(c.value(2)); + assert!(c.is_null(3)); + } } From 25a11baf535d72356966cf931baa3d3153b177dc Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 13 Jan 2023 10:44:31 +0100 Subject: [PATCH 0510/1411] Final changelog tweaks for 31.0.0 (#3524) --- CHANGELOG.md | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6eb7ebab8339..109453c306a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ # Changelog -## [31.0.0](https://github.com/apache/arrow-rs/tree/31.0.0) (2023-01-12) +## 
[31.0.0](https://github.com/apache/arrow-rs/tree/31.0.0) (2023-01-13) [Full Changelog](https://github.com/apache/arrow-rs/compare/30.0.1...31.0.0) @@ -33,30 +33,31 @@ **Implemented enhancements:** - Support casting Date32 to timestamp [\#3504](https://github.com/apache/arrow-rs/issues/3504) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support casting strings like `'2001-01-01'` to timestamp [\#3492](https://github.com/apache/arrow-rs/issues/3492) -- Allow providing service account key directly when building GCP object store client [\#3488](https://github.com/apache/arrow-rs/issues/3488) -- CLI to "rewrite" parquet files [\#3476](https://github.com/apache/arrow-rs/issues/3476) +- Support casting strings like `'2001-01-01'` to timestamp [\#3492](https://github.com/apache/arrow-rs/issues/3492) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- CLI to "rewrite" parquet files [\#3476](https://github.com/apache/arrow-rs/issues/3476) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - Add more dictionary value type support to `build_compare` [\#3465](https://github.com/apache/arrow-rs/issues/3465) -- Allow `concat_batches` to take non owned RecordBatch [\#3456](https://github.com/apache/arrow-rs/issues/3456) +- Allow `concat_batches` to take non owned RecordBatch [\#3456](https://github.com/apache/arrow-rs/issues/3456) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Release Arrow `30.0.1` \(maintenance release for `30.0.0`\) [\#3455](https://github.com/apache/arrow-rs/issues/3455) -- Add string comparisons \(starts\_with, ends\_with, and contains\) to kernel [\#3442](https://github.com/apache/arrow-rs/issues/3442) -- make\_builder Loses Timezone and Decimal Scale Information [\#3435](https://github.com/apache/arrow-rs/issues/3435) -- Use RFC3339 style timestamps in arrow-json [\#3416](https://github.com/apache/arrow-rs/issues/3416) -- ArrayData`get_slice_memory_size` or similar [\#3407](https://github.com/apache/arrow-rs/issues/3407) +- Add string comparisons \(starts\_with, ends\_with, and contains\) to kernel [\#3442](https://github.com/apache/arrow-rs/issues/3442) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- make\_builder Loses Timezone and Decimal Scale Information [\#3435](https://github.com/apache/arrow-rs/issues/3435) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use RFC3339 style timestamps in arrow-json [\#3416](https://github.com/apache/arrow-rs/issues/3416) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- ArrayData`get_slice_memory_size` or similar [\#3407](https://github.com/apache/arrow-rs/issues/3407) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Fixed bugs:** -- Sliced batch w/ bool column doesn't roundtrip through IPC [\#3496](https://github.com/apache/arrow-rs/issues/3496) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- take kernel on List array introduces nulls instead of empty lists [\#3471](https://github.com/apache/arrow-rs/issues/3471) -- Infinite Loop If Skipping More CSV Lines than Present [\#3469](https://github.com/apache/arrow-rs/issues/3469) - -**Closed issues:** - -- object\_store: temporary aws credentials not refreshed? 
[\#3446](https://github.com/apache/arrow-rs/issues/3446) +- Unable to read CSV with null boolean value [\#3521](https://github.com/apache/arrow-rs/issues/3521) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make consistent behavior on zeros equality on floating point types [\#3509](https://github.com/apache/arrow-rs/issues/3509) +- Sliced batch w/ bool column doesn't roundtrip through IPC [\#3496](https://github.com/apache/arrow-rs/issues/3496) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- take kernel on List array introduces nulls instead of empty lists [\#3471](https://github.com/apache/arrow-rs/issues/3471) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Infinite Loop If Skipping More CSV Lines than Present [\#3469](https://github.com/apache/arrow-rs/issues/3469) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** +- Fix reading null booleans from CSV [\#3523](https://github.com/apache/arrow-rs/pull/3523) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- minor fix: use the unified decimal type builder [\#3522](https://github.com/apache/arrow-rs/pull/3522) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liukun4515](https://github.com/liukun4515)) +- Update version to `31.0.0` and add changelog [\#3518](https://github.com/apache/arrow-rs/pull/3518) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) - Additional nullif re-export [\#3515](https://github.com/apache/arrow-rs/pull/3515) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Make consistent behavior on zeros equality on floating point types [\#3510](https://github.com/apache/arrow-rs/pull/3510) ([viirya](https://github.com/viirya)) - Enable cast Date32 to Timestamp [\#3508](https://github.com/apache/arrow-rs/pull/3508) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) - Update prost-build requirement from =0.11.5 to =0.11.6 [\#3507](https://github.com/apache/arrow-rs/pull/3507) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) - minor fix for the comments [\#3505](https://github.com/apache/arrow-rs/pull/3505) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) From 84e0cc187eeebd9a3b17e2b1a299ec7dbe9e0163 Mon Sep 17 00:00:00 2001 From: comphead Date: Fri, 13 Jan 2023 12:14:27 -0800 Subject: [PATCH 0511/1411] No panic on timestamp buffer overflow (#3519) * No panic on timestamp buffer overflow * Added more description --- arrow-cast/src/parse.rs | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index e885ec5b67a8..8cf6b4ea7e01 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -84,13 +84,13 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { // timezone offset, using ' ' as a separator // Example: 2020-09-08 13:42:29.190855-05:00 if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") { - return Ok(ts.timestamp_nanos()); + return to_timestamp_nanos(ts.naive_utc()); } 
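// A minimal illustrative sketch of the guard this patch applies at each of the
// call sites in this hunk (the helper name here is hypothetical, not part of
// the patch): before converting a parsed NaiveDateTime to nanoseconds, check
// that multiplying the epoch seconds by 1_000_000_000 cannot overflow an i64,
// since `timestamp_nanos()` can panic for dates outside roughly
// 1677-09-21..2262-04-11.
fn nanos_in_range(dt: &chrono::NaiveDateTime) -> bool {
    dt.timestamp().checked_mul(1_000_000_000).is_some()
}
// The actual `to_timestamp_nanos` helper added further down in this diff
// performs the same check and returns an `ArrowError::ParseError` on failure
// instead of a bool.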
// with an explicit Z, using ' ' as a separator // Example: 2020-09-08 13:42:29Z if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") { - return Ok(ts.timestamp_nanos()); + return to_timestamp_nanos(ts.naive_utc()); } // Support timestamps without an explicit timezone offset, again @@ -99,7 +99,7 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { // without a timezone specifier as a local time, using T as a separator // Example: 2020-09-08T13:42:29.190855 if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f") { - return Ok(ts.timestamp_nanos()); + return to_timestamp_nanos(ts); } // without a timezone specifier as a local time, using T as a @@ -112,7 +112,7 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { // without a timezone specifier as a local time, using ' ' as a separator // Example: 2020-09-08 13:42:29.190855 if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f") { - return Ok(ts.timestamp_nanos()); + return to_timestamp_nanos(ts); } // without a timezone specifier as a local time, using ' ' as a @@ -141,6 +141,18 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { ))) } +/// Defensive check to prevent chrono-rs panics when nanosecond conversion happens on non-supported dates +#[inline] +fn to_timestamp_nanos(dt: NaiveDateTime) -> Result { + if dt.timestamp().checked_mul(1_000_000_000).is_none() { + return Err(ArrowError::ParseError( + ERR_NANOSECONDS_NOT_SUPPORTED.to_string(), + )); + } + + Ok(dt.timestamp_nanos()) +} + /// Accepts a string in ISO8601 standard format and some /// variants and converts it to nanoseconds since midnight. /// @@ -373,6 +385,9 @@ impl Parser for Time32SecondType { /// Number of days between 0001-01-01 and 1970-01-01 const EPOCH_DAYS_FROM_CE: i32 = 719_163; +/// Error message if nanosecond conversion request beyond supported interval +const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; + impl Parser for Date32Type { fn parse(string: &str) -> Option { let date = string.parse::().ok()?; @@ -845,4 +860,11 @@ mod tests { Some(7_801) ); } + + #[test] + fn string_to_timestamp_old() { + parse_timestamp("1677-06-14T07:29:01.256") + .map_err(|e| assert!(e.to_string().ends_with(ERR_NANOSECONDS_NOT_SUPPORTED))) + .unwrap_err(); + } } From 5a7ec4624d35af0a860d84274308ba4f5303cada Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 13 Jan 2023 22:34:44 +0100 Subject: [PATCH 0512/1411] Clap fixes (#3528) --- parquet/Cargo.toml | 2 +- parquet/src/bin/parquet-fromcsv-help.txt | 2 +- parquet/src/bin/parquet-fromcsv.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index b395a5bada4b..43bd52beeb18 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -56,7 +56,7 @@ chrono = { version = "0.4.23", default-features = false, features = ["alloc"] } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } base64 = { version = "0.21", default-features = false, features = ["std", ], optional = true } -clap = { version = "4", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } +clap = { version = "4.1", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } serde = { version = "1.0", default-features = 
false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } seq-macro = { version = "0.3", default-features = false } diff --git a/parquet/src/bin/parquet-fromcsv-help.txt b/parquet/src/bin/parquet-fromcsv-help.txt index ec7eb0cc13f1..44d75f5a036d 100644 --- a/parquet/src/bin/parquet-fromcsv-help.txt +++ b/parquet/src/bin/parquet-fromcsv-help.txt @@ -67,4 +67,4 @@ Options: display usage help -V, --version - Print version information + Print version diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index 53391a6addc8..23913f0eafb3 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -528,7 +528,7 @@ mod tests { Ok(_) => panic!("unexpected success"), Err(e) => assert_eq!( format!("{}", e), - "error: Invalid value 'zip' for '--parquet-compression ': Unknown compression ZIP : possible values UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD \n\nFor more information try --help\n"), + "error: invalid value 'zip' for '--parquet-compression ': Unknown compression ZIP : possible values UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD \n\nFor more information try --help\n"), } } From 95cf030137cd7b925e89c102fb060b971e4a3836 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 15 Jan 2023 22:50:43 +0000 Subject: [PATCH 0513/1411] Write backwards compatible row group statistics (#3526) (#3527) * Write backwards compatible row group statistics (#3526) * More docs * More docs --- parquet/src/basic.rs | 7 ++++ parquet/src/column/writer/mod.rs | 27 ++++++++++++- parquet/src/file/statistics.rs | 68 ++++++++++++++++++++++++++++---- parquet/src/file/writer.rs | 56 ++++++++++++++++++++++++++ parquet/src/schema/types.rs | 12 +++++- 5 files changed, 159 insertions(+), 11 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 96cdd537dbeb..bdc203b742fe 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -330,6 +330,13 @@ pub enum SortOrder { UNDEFINED, } +impl SortOrder { + /// Returns true if this is [`Self::SIGNED`] + pub fn is_signed(&self) -> bool { + matches!(self, Self::SIGNED) + } +} + /// Column order that specifies what method was used to aggregate min/max values for /// statistics. 
/// diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index fb244920236a..f2417900d99e 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -33,7 +33,7 @@ use crate::encodings::levels::LevelEncoder; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ColumnIndexBuilder, OffsetIndexBuilder}; use crate::file::properties::EnabledStatistics; -use crate::file::statistics::Statistics; +use crate::file::statistics::{Statistics, ValueStatistics}; use crate::file::{ metadata::ColumnChunkMetaData, properties::{WriterProperties, WriterPropertiesPtr, WriterVersion}, @@ -817,13 +817,20 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .set_dictionary_page_offset(dict_page_offset); if self.statistics_enabled != EnabledStatistics::None { - let statistics = Statistics::new( + let statistics = ValueStatistics::::new( self.column_metrics.min_column_value.clone(), self.column_metrics.max_column_value.clone(), self.column_metrics.column_distinct_count, self.column_metrics.num_column_nulls, false, ); + + // Some common readers only support the deprecated statistics + // format so we also write them out if possible + // See https://github.com/apache/arrow-rs/issues/799 + let statistics = statistics + .with_backwards_compatible_min_max(self.descr.sort_order().is_signed()) + .into(); builder = builder.set_statistics(statistics); } @@ -1893,6 +1900,9 @@ mod tests { fn test_bool_statistics() { let stats = statistics_roundtrip::(&[true, false, false, true]); assert!(stats.has_min_max_set()); + // Booleans have an unsigned sort order and so are not compatible + // with the deprecated `min` and `max` statistics + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Boolean(stats) = stats { assert_eq!(stats.min(), &false); assert_eq!(stats.max(), &true); @@ -1905,6 +1915,7 @@ mod tests { fn test_int32_statistics() { let stats = statistics_roundtrip::(&[-1, 3, -2, 2]); assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int32(stats) = stats { assert_eq!(stats.min(), &-2); assert_eq!(stats.max(), &3); @@ -1917,6 +1928,7 @@ mod tests { fn test_int64_statistics() { let stats = statistics_roundtrip::(&[-1, 3, -2, 2]); assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Int64(stats) = stats { assert_eq!(stats.min(), &-2); assert_eq!(stats.max(), &3); @@ -1938,6 +1950,7 @@ mod tests { let stats = statistics_roundtrip::(&input); assert!(stats.has_min_max_set()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::Int96(stats) = stats { assert_eq!(stats.min(), &Int96::from(vec![0, 20, 30])); assert_eq!(stats.max(), &Int96::from(vec![3, 20, 10])); @@ -1950,6 +1963,7 @@ mod tests { fn test_float_statistics() { let stats = statistics_roundtrip::(&[-1.0, 3.0, -2.0, 2.0]); assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min(), &-2.0); assert_eq!(stats.max(), &3.0); @@ -1962,6 +1976,7 @@ mod tests { fn test_double_statistics() { let stats = statistics_roundtrip::(&[-1.0, 3.0, -2.0, 2.0]); assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min(), &-2.0); assert_eq!(stats.max(), &3.0); @@ -1978,6 +1993,7 @@ mod tests { .collect::>(); let stats = statistics_roundtrip::(&input); + 
assert!(!stats.is_min_max_backwards_compatible()); assert!(stats.has_min_max_set()); if let Statistics::ByteArray(stats) = stats { assert_eq!(stats.min(), &ByteArray::from("aaw")); @@ -1999,6 +2015,7 @@ mod tests { let stats = statistics_roundtrip::(&input); assert!(stats.has_min_max_set()); + assert!(!stats.is_min_max_backwards_compatible()); if let Statistics::FixedLenByteArray(stats) = stats { let expected_min: FixedLenByteArray = ByteArray::from("aaw ").into(); assert_eq!(stats.min(), &expected_min); @@ -2013,6 +2030,7 @@ mod tests { fn test_float_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f32::NAN, 2.0]); assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min(), &1.0); assert_eq!(stats.max(), &2.0); @@ -2025,6 +2043,7 @@ mod tests { fn test_float_statistics_nan_start() { let stats = statistics_roundtrip::(&[f32::NAN, 1.0, 2.0]); assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Float(stats) = stats { assert_eq!(stats.min(), &1.0); assert_eq!(stats.max(), &2.0); @@ -2037,6 +2056,7 @@ mod tests { fn test_float_statistics_nan_only() { let stats = statistics_roundtrip::(&[f32::NAN, f32::NAN]); assert!(!stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); assert!(matches!(stats, Statistics::Float(_))); } @@ -2044,6 +2064,7 @@ mod tests { fn test_double_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f64::NAN, 2.0]); assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min(), &1.0); assert_eq!(stats.max(), &2.0); @@ -2056,6 +2077,7 @@ mod tests { fn test_double_statistics_nan_start() { let stats = statistics_roundtrip::(&[f64::NAN, 1.0, 2.0]); assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); if let Statistics::Double(stats) = stats { assert_eq!(stats.min(), &1.0); assert_eq!(stats.max(), &2.0); @@ -2069,6 +2091,7 @@ mod tests { let stats = statistics_roundtrip::(&[f64::NAN, f64::NAN]); assert!(!stats.has_min_max_set()); assert!(matches!(stats, Statistics::Double(_))); + assert!(stats.is_min_max_backwards_compatible()); } #[test] diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 35b5179d36bb..8eb04ffbc65c 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -252,10 +252,13 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { (None, None) }; - if stats.is_min_max_deprecated() { - thrift_stats.min = min; - thrift_stats.max = max; - } else { + if stats.is_min_max_backwards_compatible() { + // Copy to deprecated min, max values for compatibility with older readers + thrift_stats.min = min.clone(); + thrift_stats.max = max.clone(); + } + + if !stats.is_min_max_deprecated() { thrift_stats.min_value = min; thrift_stats.max_value = max; } @@ -329,6 +332,20 @@ impl Statistics { statistics_enum_func![self, is_min_max_deprecated] } + /// Old versions of parquet stored statistics in `min` and `max` fields, ordered + /// using signed comparison. This resulted in an undefined ordering for unsigned + /// quantities, such as booleans and unsigned integers. + /// + /// These fields were therefore deprecated in favour of `min_value` and `max_value`, + /// which have a type-defined sort order. + /// + /// However, not all readers have been updated. 
For backwards compatibility, this method + /// returns `true` if the statistics within this have a signed sort order, that is + /// compatible with being stored in the deprecated `min` and `max` fields + pub fn is_min_max_backwards_compatible(&self) -> bool { + statistics_enum_func![self, is_min_max_backwards_compatible] + } + /// Returns optional value of number of distinct values occurring. /// When it is `None`, the value should be ignored. pub fn distinct_count(&self) -> Option { @@ -405,7 +422,14 @@ pub struct ValueStatistics { // Distinct count could be omitted in some cases distinct_count: Option, null_count: u64, + + /// If `true` populate the deprecated `min` and `max` fields instead of + /// `min_value` and `max_value` is_min_max_deprecated: bool, + + /// If `true` the statistics are compatible with the deprecated `min` and + /// `max` fields. See [`ValueStatistics::is_min_max_backwards_compatible`] + is_min_max_backwards_compatible: bool, } impl ValueStatistics { @@ -423,6 +447,19 @@ impl ValueStatistics { distinct_count, null_count, is_min_max_deprecated, + is_min_max_backwards_compatible: is_min_max_deprecated, + } + } + + /// Set whether to write the deprecated `min` and `max` fields + /// for compatibility with older parquet writers + /// + /// This should only be enabled if the field is signed, + /// see [`Self::is_min_max_backwards_compatible`] + pub fn with_backwards_compatible_min_max(self, backwards_compatible: bool) -> Self { + Self { + is_min_max_backwards_compatible: backwards_compatible, + ..self } } @@ -478,6 +515,20 @@ impl ValueStatistics { fn is_min_max_deprecated(&self) -> bool { self.is_min_max_deprecated } + + /// Old versions of parquet stored statistics in `min` and `max` fields, ordered + /// using signed comparison. This resulted in an undefined ordering for unsigned + /// quantities, such as booleans and unsigned integers. + /// + /// These fields were therefore deprecated in favour of `min_value` and `max_value`, + /// which have a type-defined sort order. + /// + /// However, not all readers have been updated. 
For backwards compatibility, this method + /// returns `true` if the statistics within this have a signed sort order, that is + /// compatible with being stored in the deprecated `min` and `max` fields + pub fn is_min_max_backwards_compatible(&self) -> bool { + self.is_min_max_backwards_compatible + } } impl fmt::Display for ValueStatistics { @@ -509,12 +560,13 @@ impl fmt::Debug for ValueStatistics { write!( f, "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {}, \ - min_max_deprecated: {}}}", + min_max_deprecated: {}, min_max_backwards_compatible: {}}}", self.min, self.max, self.distinct_count, self.null_count, - self.is_min_max_deprecated + self.is_min_max_deprecated, + self.is_min_max_backwards_compatible ) } } @@ -569,14 +621,14 @@ mod tests { assert_eq!( format!("{:?}", stats), "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 12, \ - min_max_deprecated: true})" + min_max_deprecated: true, min_max_backwards_compatible: true})" ); let stats = Statistics::int32(None, None, None, 7, false); assert_eq!( format!("{:?}", stats), "Int32({min: None, max: None, distinct_count: None, null_count: 7, \ - min_max_deprecated: false})" + min_max_deprecated: false, min_max_backwards_compatible: false})" ) } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 66b5d8e23a75..c8373c3616b1 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -726,6 +726,7 @@ mod tests { }; use crate::format::SortingColumn; use crate::record::{Row, RowAccessor}; + use crate::schema::parser::parse_message_type; use crate::schema::types::{ColumnDescriptor, ColumnPath}; use crate::util::memory::ByteBufferPtr; @@ -1428,4 +1429,59 @@ mod tests { test_kv_metadata(Some(vec![kv1]), Some(vec![])); test_kv_metadata(None, Some(vec![])); } + + #[test] + fn test_backwards_compatible_statistics() { + let message_type = " + message test_schema { + REQUIRED INT32 decimal1 (DECIMAL(8,2)); + REQUIRED INT32 i32 (INTEGER(32,true)); + REQUIRED INT32 u32 (INTEGER(32,false)); + } + "; + + let schema = Arc::new(parse_message_type(message_type).unwrap()); + let props = Arc::new(WriterProperties::builder().build()); + let mut writer = SerializedFileWriter::new(vec![], schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + for _ in 0..3 { + let mut writer = row_group_writer.next_column().unwrap().unwrap(); + writer + .typed::() + .write_batch(&[1, 2, 3], None, None) + .unwrap(); + writer.close().unwrap(); + } + let metadata = row_group_writer.close().unwrap(); + writer.close().unwrap(); + + let thrift = metadata.to_thrift(); + let encoded_stats: Vec<_> = thrift + .columns + .into_iter() + .map(|x| x.meta_data.unwrap().statistics.unwrap()) + .collect(); + + // decimal + let s = &encoded_stats[0]; + assert_eq!(s.min.as_deref(), Some(1_i32.to_le_bytes().as_ref())); + assert_eq!(s.max.as_deref(), Some(3_i32.to_le_bytes().as_ref())); + assert_eq!(s.min_value.as_deref(), Some(1_i32.to_le_bytes().as_ref())); + assert_eq!(s.max_value.as_deref(), Some(3_i32.to_le_bytes().as_ref())); + + // i32 + let s = &encoded_stats[1]; + assert_eq!(s.min.as_deref(), Some(1_i32.to_le_bytes().as_ref())); + assert_eq!(s.max.as_deref(), Some(3_i32.to_le_bytes().as_ref())); + assert_eq!(s.min_value.as_deref(), Some(1_i32.to_le_bytes().as_ref())); + assert_eq!(s.max_value.as_deref(), Some(3_i32.to_le_bytes().as_ref())); + + // u32 + let s = &encoded_stats[2]; + assert_eq!(s.min.as_deref(), None); + assert_eq!(s.max.as_deref(), None); + 
assert_eq!(s.min_value.as_deref(), Some(1_i32.to_le_bytes().as_ref())); + assert_eq!(s.max_value.as_deref(), Some(3_i32.to_le_bytes().as_ref())); + } } diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 4501e7e31c1d..1b966b41426c 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -22,7 +22,8 @@ use std::{collections::HashMap, convert::From, fmt, sync::Arc}; use crate::format::SchemaElement; use crate::basic::{ - ConvertedType, LogicalType, Repetition, TimeUnit, Type as PhysicalType, + ColumnOrder, ConvertedType, LogicalType, Repetition, SortOrder, TimeUnit, + Type as PhysicalType, }; use crate::errors::{ParquetError, Result}; @@ -846,6 +847,15 @@ impl ColumnDescriptor { _ => panic!("Expected primitive type!"), } } + + /// Returns the sort order for this column + pub fn sort_order(&self) -> SortOrder { + ColumnOrder::get_sort_order( + self.logical_type(), + self.converted_type(), + self.physical_type(), + ) + } } /// A schema descriptor. This encapsulates the top-level schemas for all the columns, From 07fd434946e262ec70b303b5065fd75fc3bde922 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 16 Jan 2023 09:33:47 -0800 Subject: [PATCH 0514/1411] Support casting from binary to dictionary of binary (#3482) --- .../generic_bytes_dictionary_builder.rs | 4 +-- arrow-cast/src/cast.rs | 26 +++++++++++++++++++ arrow/tests/array_cast.rs | 2 ++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 34b736d65861..4a920f3ee43e 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -268,7 +268,7 @@ where let keys = self.keys_builder.finish(); let data_type = - DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(DataType::Utf8)); + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); let builder = keys .into_data() @@ -285,7 +285,7 @@ where let keys = self.keys_builder.finish_cloned(); let data_type = - DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(DataType::Utf8)); + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); let builder = keys .into_data() diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 8b8244a7c9ac..aa6697a7170d 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -3303,6 +3303,8 @@ fn cast_to_dictionary( ), Utf8 => pack_string_to_dictionary::(array, cast_options), LargeUtf8 => pack_string_to_dictionary::(array, cast_options), + Binary => pack_binary_to_dictionary::(array, cast_options), + LargeBinary => pack_binary_to_dictionary::(array, cast_options), _ => Err(ArrowError::CastError(format!( "Unsupported output type for dictionary packing: {:?}", dict_value_type @@ -3366,6 +3368,30 @@ where Ok(Arc::new(b.finish())) } +// Packs the data as a BinaryDictionaryArray, if possible, with the +// key types of K +fn pack_binary_to_dictionary( + array: &ArrayRef, + cast_options: &CastOptions, +) -> Result +where + K: ArrowDictionaryKeyType, +{ + let cast_values = cast_with_options(array, &DataType::Binary, cast_options)?; + let values = cast_values.as_any().downcast_ref::().unwrap(); + let mut b = BinaryDictionaryBuilder::::with_capacity(values.len(), 1024, 1024); + + // copy each element one at a time + for i in 0..values.len() { + if values.is_null(i) { + b.append_null(); + } else { + b.append(values.value(i))?; + } + } + 
Ok(Arc::new(b.finish())) +} + /// Helper function that takes a primitive array and casts to a (generic) list array. fn cast_primitive_to_list( array: &ArrayRef, diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index be37a7636b63..91d2da9985b5 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -411,7 +411,9 @@ fn get_all_types() -> Vec { ), Dictionary(Box::new(DataType::Int8), Box::new(DataType::Int32)), Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), + Dictionary(Box::new(DataType::Int16), Box::new(DataType::Binary)), Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)), + Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Binary)), Decimal128(38, 0), Dictionary(Box::new(DataType::Int8), Box::new(Decimal128(38, 0))), Dictionary(Box::new(DataType::Int16), Box::new(Decimal128(38, 0))), From 767a9e7df69bcb447929e772daf32c506a2ef0e1 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 17 Jan 2023 00:01:48 -0800 Subject: [PATCH 0515/1411] Update aws-config and aws-types requirements from 0.52 to 0.53 (#3539) --- object_store/Cargo.toml | 7 ++++--- object_store/src/aws/credential.rs | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 4be6d63fcdea..8c9ede087b33 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -53,8 +53,9 @@ ring = { version = "0.16", default-features = false, features = ["std"], optiona rustls-pemfile = { version = "1.0", default-features = false, optional = true } # AWS Profile support -aws-types = { version = "0.52", optional = true } -aws-config = { version = "0.52", optional = true } +aws-types = { version = "0.53", optional = true } +aws-credential-types = { version = "0.53", optional = true } +aws-config = { version = "0.53", optional = true } [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] @@ -64,7 +65,7 @@ aws = ["cloud"] http = ["cloud"] # Experimental support for AWS_PROFILE -aws_profile = ["aws", "aws-config", "aws-types"] +aws_profile = ["aws", "aws-config", "aws-types", "aws-credential-types"] [dev-dependencies] # In alphabetical order dotenv = "0.15.0" diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index 199899d6f000..3a6976d11b0f 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -518,7 +518,7 @@ mod profile { use super::*; use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::provider_config::ProviderConfig; - use aws_types::credentials::ProvideCredentials; + use aws_credential_types::provider::ProvideCredentials; use aws_types::region::Region; use std::time::SystemTime; From 9ae6ba79b21561ad256bda94bed1a29b385566f9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 17 Jan 2023 17:22:20 +0000 Subject: [PATCH 0516/1411] Update proc-macro2 requirement from =1.0.49 to =1.0.50 (#3545) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.49...1.0.50) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 1571c29b0a8d..bad94457bcc0 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -58,7 +58,7 @@ tower = "0.4.13" [build-dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.49", default-features = false } +proc-macro2 = { version = "=1.0.50", default-features = false } prost-build = { version = "=0.11.6", default-features = false } tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } From ec181469d5b16a589ca2833c7b8a93f19f9cdcaa Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 17 Jan 2023 17:27:50 +0000 Subject: [PATCH 0517/1411] Use GHA concurrency groups (#3495) (#3538) * Use GHA concurrency groups (#3495) * Trigger CI --- .github/workflows/arrow.yml | 4 +++ .github/workflows/arrow_flight.yml | 3 ++ .github/workflows/cancel.yml | 54 ---------------------------- .github/workflows/coverage.yml | 4 +++ .github/workflows/dev.yml | 4 +++ .github/workflows/dev_pr.yml | 4 +++ .github/workflows/docs.yml | 4 +++ .github/workflows/integration.yml | 4 +++ .github/workflows/miri.yaml | 4 +++ .github/workflows/object_store.yml | 4 +++ .github/workflows/parquet.yml | 4 +++ .github/workflows/parquet_derive.yml | 3 ++ .github/workflows/rust.yml | 4 +++ 13 files changed, 46 insertions(+), 54 deletions(-) delete mode 100644 .github/workflows/cancel.yml diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 4ac64005323f..35e70c8f070c 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -18,6 +18,10 @@ # tests for arrow crate name: arrow +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + on: # always trigger push: diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index d7e8033fe930..02c149aaae0b 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -19,6 +19,9 @@ # tests for arrow_flight crate name: arrow_flight +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true # trigger for all PRs that touch certain files and changes to master on: diff --git a/.github/workflows/cancel.yml b/.github/workflows/cancel.yml deleted file mode 100644 index a98c8ee5d225..000000000000 --- a/.github/workflows/cancel.yml +++ /dev/null @@ -1,54 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Attempt to cancel stale workflow runs to save github actions runner time -name: cancel - -on: - workflow_run: - # The name of another workflow (whichever one) that always runs on PRs - workflows: ['Dev'] - types: ['requested'] - -jobs: - cancel-stale-workflow-runs: - name: "Cancel stale workflow runs" - runs-on: ubuntu-latest - steps: - # Unfortunately, we need to define a separate cancellation step for - # each workflow where we want to cancel stale runs. - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Dev runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: dev.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Integration runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: integration.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Rust runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: rust.yml - skipEventTypes: '["push", "schedule"]' diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index e688428e187c..3fa254142dbe 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -17,6 +17,10 @@ name: coverage +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + # Trigger only on pushes to master, not pull requests on: push: diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 214a11d5ec80..0eb2d024f352 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -17,6 +17,10 @@ name: dev +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + # trigger for all PRs and changes to master on: push: diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 5f84affbc52d..c1492580cd39 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -17,6 +17,10 @@ name: dev_pr +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + # Trigger whenever a PR is changed (title as well as new / changed commits) on: pull_request_target: diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index e780226b6e27..bf1bf7aad880 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -17,6 +17,10 @@ name: docs +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + # trigger for all PRs and changes to master on: push: diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 0975c11d52f8..9b2e7797d5ff 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -17,6 +17,10 @@ name: integration +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ 
github.workflow }} + cancel-in-progress: true + # trigger for all PRs that touch certain files and changes to master on: push: diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index f9cc7df79283..0c1f8069cd40 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -17,6 +17,10 @@ name: miri +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + # trigger for all PRs that touch certain files and changes to master on: push: diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 4de7b31331b3..f182d21eef13 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -19,6 +19,10 @@ # tests for `object_store` crate name: object_store +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + # trigger for all PRs that touch certain files and changes to master on: push: diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 65afa47b1b32..ee5813f567bb 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -19,6 +19,10 @@ # tests for parquet crate name: "parquet" +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + # trigger for all PRs that touch certain files and changes to master on: push: diff --git a/.github/workflows/parquet_derive.yml b/.github/workflows/parquet_derive.yml index e5620769bb3c..72b90ecfd81a 100644 --- a/.github/workflows/parquet_derive.yml +++ b/.github/workflows/parquet_derive.yml @@ -19,6 +19,9 @@ # tests for parquet_derive crate name: parquet_derive +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true # trigger for all PRs that touch certain files and changes to master on: diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f4c98c5abad7..e09e898fe160 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -18,6 +18,10 @@ # workspace wide tests name: rust +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + # trigger for all PRs and changes to master on: push: From c906fbf4f3a4126b88c986ed5b5a478a5614d287 Mon Sep 17 00:00:00 2001 From: Sid Date: Tue, 17 Jan 2023 23:33:53 +0530 Subject: [PATCH 0518/1411] set sum of uncompressed column size as row group size for parquet files (#3531) * add uncompressed column size as row group size * track number of bytes written in parquet writer * add compression tests to parquet writer * Remove added files Co-authored-by: sid Co-authored-by: Raphael Taylor-Davies --- parquet/src/file/writer.rs | 84 +++++++++++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 14 deletions(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index c8373c3616b1..65f254185334 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -382,6 +382,7 @@ pub struct SerializedRowGroupWriter<'a, W: Write> { buf: &'a mut TrackedWrite, total_rows_written: Option, total_bytes_written: u64, + total_uncompressed_bytes: i64, column_index: usize, row_group_metadata: Option, column_chunks: Vec, @@ -418,6 +419,7 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { column_indexes: Vec::with_capacity(num_columns), 
offset_indexes: Vec::with_capacity(num_columns), total_bytes_written: 0, + total_uncompressed_bytes: 0, } } @@ -443,6 +445,7 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { let page_writer = Box::new(SerializedPageWriter::new(self.buf)); let total_bytes_written = &mut self.total_bytes_written; + let total_uncompressed_bytes = &mut self.total_uncompressed_bytes; let total_rows_written = &mut self.total_rows_written; let column_chunks = &mut self.column_chunks; let column_indexes = &mut self.column_indexes; @@ -452,6 +455,7 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { let on_close = |r: ColumnCloseResult| { // Update row group writer metrics *total_bytes_written += r.bytes_written; + *total_uncompressed_bytes += r.metadata.uncompressed_size(); column_chunks.push(r.metadata); bloom_filters.push(r.bloom_filter); column_indexes.push(r.column_index); @@ -501,7 +505,7 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { let column_chunks = std::mem::take(&mut self.column_chunks); let row_group_metadata = RowGroupMetaData::builder(self.descr.clone()) .set_column_metadata(column_chunks) - .set_total_byte_size(self.total_bytes_written as i64) + .set_total_byte_size(self.total_uncompressed_bytes) .set_num_rows(self.total_rows_written.unwrap_or(0) as i64) .set_sorting_columns(self.props.sorting_columns().cloned()) .build()?; @@ -1238,12 +1242,18 @@ mod tests { fn test_roundtrip_i32( file: W, data: Vec>, + compression: Compression, ) -> crate::format::FileMetaData where W: Write, R: ChunkReader + From + 'static, { - test_roundtrip::(file, data, |r| r.get_int(0).unwrap()) + test_roundtrip::( + file, + data, + |r| r.get_int(0).unwrap(), + compression, + ) } /// Tests roundtrip of data of type `D` written using `W` and read using `R` @@ -1252,6 +1262,7 @@ mod tests { mut file: W, data: Vec>, value: F, + compression: Compression, ) -> crate::format::FileMetaData where W: Write, @@ -1270,7 +1281,11 @@ mod tests { .build() .unwrap(), ); - let props = Arc::new(WriterProperties::builder().build()); + let props = Arc::new( + WriterProperties::builder() + .set_compression(compression) + .build(), + ); let mut file_writer = SerializedFileWriter::new(&mut file, schema, props).unwrap(); let mut rows: i64 = 0; @@ -1302,6 +1317,14 @@ mod tests { let row_group_reader = reader.get_row_group(i).unwrap(); let iter = row_group_reader.get_row_iter(None).unwrap(); let res: Vec<_> = iter.map(&value).collect(); + let row_group_size = row_group_reader.metadata().total_byte_size(); + let uncompressed_size: i64 = row_group_reader + .metadata() + .columns() + .iter() + .map(|v| v.uncompressed_size()) + .sum(); + assert_eq!(row_group_size, uncompressed_size); assert_eq!(res, *item); } file_metadata @@ -1313,31 +1336,52 @@ mod tests { file: File, data: Vec>, ) -> crate::format::FileMetaData { - test_roundtrip_i32::(file, data) + test_roundtrip_i32::(file, data, Compression::UNCOMPRESSED) } #[test] fn test_bytes_writer_empty_row_groups() { - test_bytes_roundtrip(vec![]); + test_bytes_roundtrip(vec![], Compression::UNCOMPRESSED); } #[test] fn test_bytes_writer_single_row_group() { - test_bytes_roundtrip(vec![vec![1, 2, 3, 4, 5]]); + test_bytes_roundtrip(vec![vec![1, 2, 3, 4, 5]], Compression::UNCOMPRESSED); } #[test] fn test_bytes_writer_multiple_row_groups() { - test_bytes_roundtrip(vec![ - vec![1, 2, 3, 4, 5], - vec![1, 2, 3], - vec![1], - vec![1, 2, 3, 4, 5, 6], - ]); + test_bytes_roundtrip( + vec![ + vec![1, 2, 3, 4, 5], + vec![1, 2, 3], + vec![1], + vec![1, 2, 3, 4, 5, 6], + ], + Compression::UNCOMPRESSED, 
+ ); + } + + #[test] + fn test_bytes_writer_single_row_group_compressed() { + test_bytes_roundtrip(vec![vec![1, 2, 3, 4, 5]], Compression::SNAPPY); + } + + #[test] + fn test_bytes_writer_multiple_row_groups_compressed() { + test_bytes_roundtrip( + vec![ + vec![1, 2, 3, 4, 5], + vec![1, 2, 3], + vec![1], + vec![1, 2, 3, 4, 5, 6], + ], + Compression::SNAPPY, + ); } - fn test_bytes_roundtrip(data: Vec>) { - test_roundtrip_i32::, Bytes>(Vec::with_capacity(1024), data); + fn test_bytes_roundtrip(data: Vec>, compression: Compression) { + test_roundtrip_i32::, Bytes>(Vec::with_capacity(1024), data, compression); } #[test] @@ -1347,6 +1391,18 @@ mod tests { Vec::with_capacity(1024), vec![my_bool_values], |r| r.get_bool(0).unwrap(), + Compression::UNCOMPRESSED, + ); + } + + #[test] + fn test_boolean_compressed_roundtrip() { + let my_bool_values: Vec<_> = (0..2049).map(|idx| idx % 2 == 0).collect(); + test_roundtrip::, Bytes, BoolType, _>( + Vec::with_capacity(1024), + vec![my_bool_values], + |r| r.get_bool(0).unwrap(), + Compression::SNAPPY, ); } From 14545a42ec09782ec0371c05c01d112e0ca37604 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 18 Jan 2023 00:41:40 +0100 Subject: [PATCH 0519/1411] Minor: Add documentation about memory use for ArrayData (#3529) * Minor: Add documentation about memory use for ArrayData * Apply suggestions from code review Co-authored-by: Liang-Chi Hsieh * Apply suggestions from code review Co-authored-by: Liang-Chi Hsieh Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-data/src/data.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-data/src/data.rs * Update arrow-data/src/bitmap.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Liang-Chi Hsieh Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-data/src/bitmap.rs | 11 ++++- arrow-data/src/data.rs | 100 +++++++++++++++++++++++++++++++++------ 2 files changed, 94 insertions(+), 17 deletions(-) diff --git a/arrow-data/src/bitmap.rs b/arrow-data/src/bitmap.rs index 0002ef022122..a356b9ff7d38 100644 --- a/arrow-data/src/bitmap.rs +++ b/arrow-data/src/bitmap.rs @@ -68,12 +68,19 @@ impl Bitmap { self.bits } - /// Returns the total number of bytes of memory occupied by the buffers owned by this [Bitmap]. + /// Returns the total number of bytes of memory occupied by the + /// buffers owned by this [Bitmap]. + /// + /// If multiple [`Bitmap`]s refer to the same underlying + /// [`Buffer`] they will both report the same size. pub fn get_buffer_memory_size(&self) -> usize { self.bits.capacity() } - /// Returns the total number of bytes of memory occupied physically by this [Bitmap]. + /// Returns the total number of bytes of memory occupied + /// physically by this [Bitmap] and its [`Buffer`]s. + /// + /// Equivalent to: `size_of_val(self)` + [`Self::get_buffer_memory_size`] pub fn get_array_memory_size(&self) -> usize { self.bits.capacity() + mem::size_of_val(self) } diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 14dbe9387db3..258ee082da1b 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates +//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates //! 
common attributes and operations for Arrow array. use crate::{bit_iterator::BitSliceIterator, bitmap::Bitmap}; @@ -245,6 +245,46 @@ pub(crate) fn into_buffers( /// An generic representation of Arrow array data which encapsulates common attributes and /// operations for Arrow array. Specific operations for different arrays types (e.g., /// primitive, list, struct) are implemented in `Array`. +/// +/// # Memory Layout +/// +/// `ArrayData` has references to one or more underlying data buffers +/// and optional child ArrayDatas, depending on type as illustrated +/// below. Bitmaps are not shown for simplicity but they are stored +/// similarly to the buffers. +/// +/// ```text +/// offset +/// points to +/// ┌───────────────────┐ start of ┌───────┐ Different +/// │ │ data │ │ ArrayData may +/// │ArrayData { │ │.... │ also refers to +/// │ data_type: ... │ ─ ─ ─ ─▶│1234 │ ┌ ─ the same +/// │ offset: ... ─ ─ ─│─ ┘ │4372 │ underlying +/// │ len: ... ─ ─ ─│─ ┐ │4888 │ │ buffer with different offset/len +/// │ buffers: [ │ │5882 │◀─ +/// │ ... │ │ │4323 │ +/// │ ] │ ─ ─ ─ ─▶│4859 │ +/// │ child_data: [ │ │.... │ +/// │ ... │ │ │ +/// │ ] │ └───────┘ +/// │} │ +/// │ │ Shared Buffer uses +/// │ │ │ bytes::Bytes to hold +/// └───────────────────┘ actual data values +/// ┌ ─ ─ ┘ +/// +/// ▼ +/// ┌───────────────────┐ +/// │ArrayData { │ +/// │ ... │ +/// │} │ +/// │ │ +/// └───────────────────┘ +/// +/// Child ArrayData may also have its own buffers and children +/// ``` + #[derive(Debug, Clone)] pub struct ArrayData { /// The data type for this array data @@ -375,24 +415,25 @@ impl ArrayData { Ok(new_self) } - /// Returns a builder to construct a `ArrayData` instance. + /// Returns a builder to construct a [`ArrayData`] instance of the same [`DataType`] #[inline] pub const fn builder(data_type: DataType) -> ArrayDataBuilder { ArrayDataBuilder::new(data_type) } - /// Returns a reference to the data type of this array data + /// Returns a reference to the [`DataType`] of this [`ArrayData`] #[inline] pub const fn data_type(&self) -> &DataType { &self.data_type } - /// Returns a slice of buffers for this array data + /// Returns a slice of the [`Buffer`]s that hold the data. pub fn buffers(&self) -> &[Buffer] { &self.buffers[..] } - /// Returns a slice of children data arrays + /// Returns a slice of children [`ArrayData`]. This will be non + /// empty for type such as lists and structs. pub fn child_data(&self) -> &[ArrayData] { &self.child_data[..] } @@ -405,13 +446,13 @@ impl ArrayData { false } - /// Returns a reference to the null bitmap of this array data + /// Returns a reference to the null bitmap of this [`ArrayData`] #[inline] pub const fn null_bitmap(&self) -> Option<&Bitmap> { self.null_bitmap.as_ref() } - /// Returns a reference to the null buffer of this array data. + /// Returns a reference to the null buffer of this [`ArrayData`]. pub fn null_buffer(&self) -> Option<&Buffer> { self.null_bitmap().as_ref().map(|b| b.buffer_ref()) } @@ -424,19 +465,19 @@ impl ArrayData { true } - /// Returns the length (i.e., number of elements) of this array + /// Returns the length (i.e., number of elements) of this [`ArrayData`]. 
#[inline] pub const fn len(&self) -> usize { self.len } - // Returns whether array data is empty + /// Returns whether this [`ArrayData`] is empty #[inline] pub const fn is_empty(&self) -> bool { self.len == 0 } - /// Returns the offset of this array + /// Returns the offset of this [`ArrayData`] #[inline] pub const fn offset(&self) -> usize { self.offset @@ -448,7 +489,17 @@ impl ArrayData { self.null_count } - /// Returns the total number of bytes of memory occupied by the buffers owned by this [ArrayData]. + /// Returns the total number of bytes of memory occupied by the + /// buffers owned by this [`ArrayData`] and all of its + /// children. (See also diagram on [`ArrayData`]). + /// + /// Note that this [`ArrayData`] may only refer to a subset of the + /// data in the underlying [`Buffer`]s (due to `offset` and + /// `length`), but the size returned includes the entire size of + /// the buffers. + /// + /// If multiple [`ArrayData`]s refer to the same underlying + /// [`Buffer`]s they will both report the same size. pub fn get_buffer_memory_size(&self) -> usize { let mut size = 0; for buffer in &self.buffers { @@ -463,7 +514,18 @@ impl ArrayData { size } - /// Returns the total number of the bytes of memory occupied by the buffers by this slice of [ArrayData] + /// Returns the total number of the bytes of memory occupied by + /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]). + /// + /// This is approximately the number of bytes if a new + /// [`ArrayData`] was formed by creating new [`Buffer`]s with + /// exactly the data needed. + /// + /// For example, a [`DataType::Int64`] with `100` elements, + /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If + /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its + /// first `20` elements, then [`Self::get_slice_memory_size`] on the + /// sliced [`ArrayData`] would return `20 * 8 = 160`. pub fn get_slice_memory_size(&self) -> Result { let mut result: usize = 0; let layout = layout(&self.data_type); @@ -519,7 +581,14 @@ impl ArrayData { Ok(result) } - /// Returns the total number of bytes of memory occupied physically by this [ArrayData]. + /// Returns the total number of bytes of memory occupied + /// physically by this [`ArrayData`] and all its [`Buffer`]s and + /// children. (See also diagram on [`ArrayData`]). + /// + /// Equivalent to: + /// `size_of_val(self)` + + /// [`Self::get_buffer_memory_size`] + + /// `size_of_val(child)` for all children pub fn get_array_memory_size(&self) -> usize { let mut size = mem::size_of_val(self); @@ -541,8 +610,9 @@ impl ArrayData { size } - /// Creates a zero-copy slice of itself. This creates a new [ArrayData] - /// with a different offset, len and a shifted null bitmap. + /// Creates a zero-copy slice of itself. 
This creates a new + /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a + /// different offset and len /// /// # Panics /// From 96831de828bcca5e6240c4d5dd5ddb1a1ea778e9 Mon Sep 17 00:00:00 2001 From: sachin agarwal Date: Wed, 18 Jan 2023 18:18:03 +0530 Subject: [PATCH 0520/1411] parquet:: avoid reading extra 8 bytes (#3550) * parquet:: avoid reading extra 8 bytes Effect in increasing performance * Update parquet/src/arrow/async_reader/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- parquet/src/arrow/async_reader/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index e93c85580ca0..64b334fd43c7 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -200,7 +200,7 @@ impl AsyncFileReader for T { .await?; let mut buf = Vec::with_capacity(metadata_len); - self.read_to_end(&mut buf).await?; + self.take(metadata_len as _).read_to_end(&mut buf).await?; Ok(Arc::new(decode_metadata(&buf)?)) } From 56dfad0b2a03bc14f398a2998a68da2bc02fb7d2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 18 Jan 2023 12:48:47 +0000 Subject: [PATCH 0521/1411] Improve concat kernel capacity estimation (#3546) * Improve concat kernel capacity estimation * Review feedback * Format --- arrow-select/src/concat.rs | 137 +++++++++++++++++++++++++------------ 1 file changed, 93 insertions(+), 44 deletions(-) diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 7e28f1695509..cff8fd25b7f1 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -30,24 +30,28 @@ //! assert_eq!(arr.len(), 3); //! ``` +use arrow_array::types::*; use arrow_array::*; +use arrow_buffer::ArrowNativeType; use arrow_data::transform::{Capacities, MutableArrayData}; -use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, SchemaRef}; -fn compute_str_values_length(arrays: &[&ArrayData]) -> usize { - arrays - .iter() - .map(|&data| { - // get the length of the value buffer - let buf_len = data.buffers()[1].len(); - // find the offset of the buffer - // this returns a slice of offsets, starting from the offset of the array - // so we can take the first value - let offset = data.buffer::(0)[0]; - buf_len - offset.to_usize().unwrap() - }) - .sum() +fn binary_capacity(arrays: &[&dyn Array]) -> Capacities { + let mut item_capacity = 0; + let mut bytes_capacity = 0; + for array in arrays { + let a = array + .as_any() + .downcast_ref::>() + .unwrap(); + + // Guaranteed to always have at least one element + let offsets = a.value_offsets(); + bytes_capacity += offsets[offsets.len() - 1].as_usize() - offsets[0].as_usize(); + item_capacity += a.len() + } + + Capacities::Binary(item_capacity, Some(bytes_capacity)) } /// Concatenate multiple [Array] of the same type into a single [ArrayRef]. @@ -61,43 +65,27 @@ pub fn concat(arrays: &[&dyn Array]) -> Result { return Ok(array.slice(0, array.len())); } - if arrays - .iter() - .any(|array| array.data_type() != arrays[0].data_type()) - { + let d = arrays[0].data_type(); + if arrays.iter().skip(1).any(|array| array.data_type() != d) { return Err(ArrowError::InvalidArgumentError( "It is not possible to concatenate arrays of different data types." 
.to_string(), )); } - let lengths = arrays.iter().map(|array| array.len()).collect::>(); - let capacity = lengths.iter().sum(); - - let arrays = arrays.iter().map(|a| a.data()).collect::>(); - - let mut mutable = match arrays[0].data_type() { - DataType::Utf8 => { - let str_values_size = compute_str_values_length::(&arrays); - MutableArrayData::with_capacities( - arrays, - false, - Capacities::Binary(capacity, Some(str_values_size)), - ) - } - DataType::LargeUtf8 => { - let str_values_size = compute_str_values_length::(&arrays); - MutableArrayData::with_capacities( - arrays, - false, - Capacities::Binary(capacity, Some(str_values_size)), - ) - } - _ => MutableArrayData::new(arrays, false, capacity), + let capacity = match d { + DataType::Utf8 => binary_capacity::(arrays), + DataType::LargeUtf8 => binary_capacity::(arrays), + DataType::Binary => binary_capacity::(arrays), + DataType::LargeBinary => binary_capacity::(arrays), + _ => Capacities::Array(arrays.iter().map(|a| a.len()).sum()), }; - for (i, len) in lengths.iter().enumerate() { - mutable.extend(i, 0, *len) + let array_data = arrays.iter().map(|a| a.data()).collect::>(); + let mut mutable = MutableArrayData::with_capacities(array_data, false, capacity); + + for (i, a) in arrays.iter().enumerate() { + mutable.extend(i, 0, a.len()) } Ok(make_array(mutable.freeze())) @@ -139,7 +127,6 @@ pub fn concat_batches<'a>( #[cfg(test)] mod tests { use super::*; - use arrow_array::types::*; use arrow_schema::{Field, Schema}; use std::sync::Arc; @@ -665,4 +652,66 @@ mod tests { "Invalid argument error: batches[1] schema is different with argument schema.", ); } + + #[test] + fn concat_capacity() { + let a = Int32Array::from_iter_values(0..100); + let b = Int32Array::from_iter_values(10..20); + let a = concat(&[&a, &b]).unwrap(); + let data = a.data(); + assert_eq!(data.buffers()[0].len(), 440); + assert_eq!(data.buffers()[0].capacity(), 448); // Nearest multiple of 64 + + let a = concat(&[&a.slice(10, 20), &b]).unwrap(); + let data = a.data(); + assert_eq!(data.buffers()[0].len(), 120); + assert_eq!(data.buffers()[0].capacity(), 128); // Nearest multiple of 64 + + let a = StringArray::from_iter_values(std::iter::repeat("foo").take(100)); + let b = StringArray::from(vec!["bingo", "bongo", "lorem", ""]); + + let a = concat(&[&a, &b]).unwrap(); + let data = a.data(); + // (100 + 4 + 1) * size_of() + assert_eq!(data.buffers()[0].len(), 420); + assert_eq!(data.buffers()[0].capacity(), 448); // Nearest multiple of 64 + + // len("foo") * 100 + len("bingo") + len("bongo") + len("lorem") + assert_eq!(data.buffers()[1].len(), 315); + assert_eq!(data.buffers()[1].capacity(), 320); // Nearest multiple of 64 + + let a = concat(&[&a.slice(10, 40), &b]).unwrap(); + let data = a.data(); + // (40 + 4 + 5) * size_of() + assert_eq!(data.buffers()[0].len(), 180); + assert_eq!(data.buffers()[0].capacity(), 192); // Nearest multiple of 64 + + // len("foo") * 40 + len("bingo") + len("bongo") + len("lorem") + assert_eq!(data.buffers()[1].len(), 135); + assert_eq!(data.buffers()[1].capacity(), 192); // Nearest multiple of 64 + + let a = LargeBinaryArray::from_iter_values(std::iter::repeat(b"foo").take(100)); + let b = + LargeBinaryArray::from_iter_values(std::iter::repeat(b"cupcakes").take(10)); + + let a = concat(&[&a, &b]).unwrap(); + let data = a.data(); + // (100 + 10 + 1) * size_of() + assert_eq!(data.buffers()[0].len(), 888); + assert_eq!(data.buffers()[0].capacity(), 896); // Nearest multiple of 64 + + // len("foo") * 100 + len("cupcakes") * 10 + 
assert_eq!(data.buffers()[1].len(), 380); + assert_eq!(data.buffers()[1].capacity(), 384); // Nearest multiple of 64 + + let a = concat(&[&a.slice(10, 40), &b]).unwrap(); + let data = a.data(); + // (40 + 10 + 1) * size_of() + assert_eq!(data.buffers()[0].len(), 408); + assert_eq!(data.buffers()[0].capacity(), 448); // Nearest multiple of 64 + + // len("foo") * 40 + len("cupcakes") * 10 + assert_eq!(data.buffers()[1].len(), 200); + assert_eq!(data.buffers()[1].capacity(), 256); // Nearest multiple of 64 + } } From 40837a87c6a7ae177298fe3fcc0e83aaf678640e Mon Sep 17 00:00:00 2001 From: Frank <35358771+Frankonly@users.noreply.github.com> Date: Wed, 18 Jan 2023 20:49:21 +0800 Subject: [PATCH 0522/1411] Update pyarrow method call to avoid warning (#3544) * Update pyarrow method call to avoid warning * resolve problem --- arrow/src/pyarrow.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 5ddc3105a4ad..4355d2e47c26 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -196,7 +196,8 @@ impl PyArrowConvert for RecordBatch { let module = py.import("pyarrow")?; let class = module.getattr("RecordBatch")?; - let record = class.call_method1("from_arrays", (py_arrays, py_schema))?; + let record = class + .call_method1("from_arrays", (py_arrays, None::, py_schema))?; Ok(PyObject::from(record)) } From 3ae1c728b266c1ba801409eb7f4b901285783e94 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 18 Jan 2023 17:38:07 +0000 Subject: [PATCH 0523/1411] Expose Inner FlightServiceClient on FlightSqlServiceClient (#3551) (#3556) * Remove unnecessary Mutex from FlightSqlServiceClient (#3551) * Add inner and inner_mut * Add into_inner --- arrow-flight/src/sql/client.rs | 53 ++++++++++++++++------------------ 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index ecc121d985a0..5c5f84b3d15a 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -19,7 +19,6 @@ use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::Bytes; use std::collections::HashMap; -use std::sync::Arc; use std::time::Duration; use crate::flight_service_client::FlightServiceClient; @@ -45,7 +44,6 @@ use arrow_ipc::{root_as_message, MessageHeader}; use arrow_schema::{ArrowError, Schema, SchemaRef}; use futures::{stream, TryStreamExt}; use prost::Message; -use tokio::sync::{Mutex, MutexGuard}; #[cfg(feature = "tls")] use tonic::transport::{Certificate, ClientTlsConfig, Identity}; use tonic::transport::{Channel, Endpoint}; @@ -56,7 +54,7 @@ use tonic::Streaming; #[derive(Debug, Clone)] pub struct FlightSqlServiceClient { token: Option, - flight_client: Arc>>, + flight_client: FlightServiceClient, } /// A FlightSql protocol client that can run queries against FlightSql servers @@ -124,16 +122,23 @@ impl FlightSqlServiceClient { let flight_client = FlightServiceClient::new(channel); FlightSqlServiceClient { token: None, - flight_client: Arc::new(Mutex::new(flight_client)), + flight_client, } } - fn mut_client( - &mut self, - ) -> Result>, ArrowError> { + /// Return a reference to the underlying [`FlightServiceClient`] + pub fn inner(&self) -> &FlightServiceClient { + &self.flight_client + } + + /// Return a mutable reference to the underlying [`FlightServiceClient`] + pub fn inner_mut(&mut self) -> &mut FlightServiceClient { + &mut self.flight_client + } + + /// Consume this client and return the 
underlying [`FlightServiceClient`] + pub fn into_inner(self) -> FlightServiceClient { self.flight_client - .try_lock() - .map_err(|_| ArrowError::IoError("Unable to lock client".to_string())) } async fn get_flight_info_for_command( @@ -142,7 +147,7 @@ impl FlightSqlServiceClient { ) -> Result { let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); let fi = self - .mut_client()? + .flight_client .get_flight_info(descriptor) .await .map_err(status_to_arrow_error)? @@ -174,7 +179,7 @@ impl FlightSqlServiceClient { .map_err(|_| ArrowError::ParseError("Cannot parse header".to_string()))?; req.metadata_mut().insert("authorization", val); let resp = self - .mut_client()? + .flight_client .handshake(req) .await .map_err(|e| ArrowError::IoError(format!("Can't handshake {}", e)))?; @@ -208,7 +213,7 @@ impl FlightSqlServiceClient { let cmd = CommandStatementUpdate { query }; let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); let mut result = self - .mut_client()? + .flight_client .do_put(stream::iter(vec![FlightData { flight_descriptor: Some(descriptor), ..Default::default() @@ -247,7 +252,7 @@ impl FlightSqlServiceClient { ticket: Ticket, ) -> Result, ArrowError> { Ok(self - .mut_client()? + .flight_client .do_get(ticket) .await .map_err(status_to_arrow_error)? @@ -332,7 +337,7 @@ impl FlightSqlServiceClient { req.metadata_mut().insert("authorization", val); } let mut result = self - .mut_client()? + .flight_client .do_action(req) .await .map_err(status_to_arrow_error)? @@ -369,7 +374,7 @@ impl FlightSqlServiceClient { /// A PreparedStatement #[derive(Debug, Clone)] pub struct PreparedStatement { - flight_client: Arc>>, + flight_client: FlightServiceClient, parameter_binding: Option, handle: Bytes, dataset_schema: Schema, @@ -378,13 +383,13 @@ pub struct PreparedStatement { impl PreparedStatement { pub(crate) fn new( - client: Arc>>, + flight_client: FlightServiceClient, handle: impl Into, dataset_schema: Schema, parameter_schema: Schema, ) -> Self { PreparedStatement { - flight_client: client, + flight_client, parameter_binding: None, handle: handle.into(), dataset_schema, @@ -399,7 +404,7 @@ impl PreparedStatement { }; let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); let result = self - .mut_client()? + .flight_client .get_flight_info(descriptor) .await .map_err(status_to_arrow_error)? @@ -414,7 +419,7 @@ impl PreparedStatement { }; let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); let mut result = self - .mut_client()? + .flight_client .do_put(stream::iter(vec![FlightData { flight_descriptor: Some(descriptor), ..Default::default() @@ -463,20 +468,12 @@ impl PreparedStatement { body: cmd.as_any().encode_to_vec().into(), }; let _ = self - .mut_client()? 
+ .flight_client .do_action(action) .await .map_err(status_to_arrow_error)?; Ok(()) } - - fn mut_client( - &mut self, - ) -> Result>, ArrowError> { - self.flight_client - .try_lock() - .map_err(|_| ArrowError::IoError("Unable to lock client".to_string())) - } } fn decode_error_to_arrow_error(err: prost::DecodeError) -> ArrowError { From de62808a9d65e052ff3e89550bf780d952c8ceae Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 18 Jan 2023 12:50:03 -0800 Subject: [PATCH 0524/1411] Upgrade pyo3 to 0.18.0 (#3557) --- arrow-pyarrow-integration-testing/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index e1fa90836f61..7a2dc563a1ac 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -33,7 +33,7 @@ crate-type = ["cdylib"] [dependencies] arrow = { path = "../arrow", version = "31.0.0", features = ["pyarrow"] } -pyo3 = { version = "0.17", features = ["extension-module"] } +pyo3 = { version = "0.18", features = ["extension-module"] } [package.metadata.maturin] requires-dist = ["pyarrow>=1"] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 8719cba0effe..ee926ee52868 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -61,7 +61,7 @@ arrow-string = { version = "31.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } comfy-table = { version = "6.0", optional = true, default-features = false } -pyo3 = { version = "0.17", default-features = false, optional = true } +pyo3 = { version = "0.18", default-features = false, optional = true } bitflags = { version = "1.2.1", default-features = false, optional = true } [package.metadata.docs.rs] From d9802353f195979f7c6541143c7e849f5ac2d661 Mon Sep 17 00:00:00 2001 From: Frank <35358771+Frankonly@users.noreply.github.com> Date: Thu, 19 Jan 2023 17:15:40 +0800 Subject: [PATCH 0525/1411] Update pyarrow method call with kwargs (#3560) --- arrow/src/pyarrow.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 4355d2e47c26..09933304ecf9 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -24,7 +24,7 @@ use std::sync::Arc; use pyo3::ffi::Py_uintptr_t; use pyo3::import_exception; use pyo3::prelude::*; -use pyo3::types::{PyList, PyTuple}; +use pyo3::types::{PyDict, PyList, PyTuple}; use crate::array::{make_array, Array, ArrayData}; use crate::datatypes::{DataType, Field, Schema}; @@ -196,8 +196,10 @@ impl PyArrowConvert for RecordBatch { let module = py.import("pyarrow")?; let class = module.getattr("RecordBatch")?; - let record = class - .call_method1("from_arrays", (py_arrays, None::, py_schema))?; + let args = (py_arrays,); + let kwargs = PyDict::new(py); + kwargs.set_item("schema", py_schema)?; + let record = class.call_method("from_arrays", args, Some(kwargs))?; Ok(PyObject::from(record)) } From 046cb96c820afe70a196c34b527ce9ad7d0fbbb5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 19 Jan 2023 21:53:38 +0000 Subject: [PATCH 0526/1411] Improve GenericBytesBuilder offset overflow panic message (#139) (#3564) --- arrow-array/src/builder/generic_bytes_builder.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_builder.rs 
b/arrow-array/src/builder/generic_bytes_builder.rs index 195628f4712f..8be3ac7f4f15 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -81,13 +81,18 @@ impl GenericByteBuilder { } } + #[inline] + fn next_offset(&self) -> T::Offset { + T::Offset::from_usize(self.value_builder.len()) + .expect("byte array offset overflow") + } + /// Appends a value into the builder. #[inline] pub fn append_value(&mut self, value: impl AsRef) { self.value_builder.append_slice(value.as_ref().as_ref()); self.null_buffer_builder.append(true); - self.offsets_builder - .append(T::Offset::from_usize(self.value_builder.len()).unwrap()); + self.offsets_builder.append(self.next_offset()); } /// Append an `Option` value into the builder. @@ -103,8 +108,7 @@ impl GenericByteBuilder { #[inline] pub fn append_null(&mut self) { self.null_buffer_builder.append(false); - self.offsets_builder - .append(T::Offset::from_usize(self.value_builder.len()).unwrap()); + self.offsets_builder.append(self.next_offset()); } /// Builds the [`GenericByteArray`] and reset this builder. @@ -116,8 +120,7 @@ impl GenericByteBuilder { .add_buffer(self.value_builder.finish()) .null_bit_buffer(self.null_buffer_builder.finish()); - self.offsets_builder - .append(T::Offset::from_usize(0).unwrap()); + self.offsets_builder.append(self.next_offset()); let array_data = unsafe { array_builder.build_unchecked() }; GenericByteArray::from(array_data) } From 4fa0ee8a1ed679d91c4adf82608fb3d5de68598d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 19 Jan 2023 21:53:54 +0000 Subject: [PATCH 0527/1411] Return reference from ListArray::values (#3561) --- arrow-array/src/array/list_array.rs | 4 +-- arrow-cast/src/cast.rs | 15 ++++------ arrow-ipc/src/writer.rs | 4 +-- arrow-json/src/reader.rs | 44 +++++++++-------------------- arrow-row/src/lib.rs | 2 +- arrow/src/util/data_gen.rs | 2 +- 6 files changed, 26 insertions(+), 45 deletions(-) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 3f581a88699e..6c49fc7fc70f 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -90,8 +90,8 @@ impl GenericListArray { }; /// Returns a reference to the values of this list. - pub fn values(&self) -> ArrayRef { - self.values.clone() + pub fn values(&self) -> &ArrayRef { + &self.values } /// Returns a clone of the value type of this list. 
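A minimal usage sketch of the borrowed `values()` return type introduced above, assuming the `arrow_array` crate at this version of the workspace; the list contents and variable names are illustrative only and are not taken from the patch:

    use std::sync::Arc;
    use arrow_array::cast::as_primitive_array;
    use arrow_array::types::Int32Type;
    use arrow_array::{Array, ArrayRef, ListArray};

    fn main() {
        // Build a small list array purely for illustration.
        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
            Some(vec![Some(1), Some(2)]),
            Some(vec![Some(3)]),
        ]);

        // `values()` now returns `&ArrayRef`, so the child array is borrowed
        // rather than cloning the Arc on every call.
        let values: &ArrayRef = list.values();
        let ints = as_primitive_array::<Int32Type>(values);
        assert_eq!(ints.values(), &[1, 2, 3]);

        // An owned handle is still one explicit Arc clone away when needed.
        let owned: ArrayRef = Arc::clone(values);
        assert_eq!(owned.len(), 3);
    }

This mirrors the call sites updated later in this patch, where the borrowed reference is passed straight to helpers such as `as_primitive_array` instead of first cloning the returned `ArrayRef`.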
diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index aa6697a7170d..6b385ccd43fc 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -4709,8 +4709,7 @@ mod tests { assert_eq!(1, arr.value_length(2)); assert_eq!(1, arr.value_length(3)); assert_eq!(1, arr.value_length(4)); - let values = arr.values(); - let c = values.as_any().downcast_ref::().unwrap(); + let c = as_primitive_array::(arr.values()); assert_eq!(5, c.value(0)); assert_eq!(6, c.value(1)); assert_eq!(7, c.value(2)); @@ -4736,8 +4735,8 @@ mod tests { assert_eq!(1, arr.value_length(2)); assert_eq!(1, arr.value_length(3)); assert_eq!(1, arr.value_length(4)); - let values = arr.values(); - let c = values.as_any().downcast_ref::().unwrap(); + + let c = as_primitive_array::(arr.values()); assert_eq!(1, c.null_count()); assert_eq!(5, c.value(0)); assert!(!c.is_valid(1)); @@ -4764,8 +4763,7 @@ mod tests { assert_eq!(1, arr.value_length(1)); assert_eq!(1, arr.value_length(2)); assert_eq!(1, arr.value_length(3)); - let values = arr.values(); - let c = values.as_any().downcast_ref::().unwrap(); + let c = as_primitive_array::(arr.values()); assert_eq!(1, c.null_count()); assert_eq!(7.0, c.value(0)); assert_eq!(8.0, c.value(1)); @@ -4914,9 +4912,8 @@ mod tests { assert_eq!(2, array.value_length(2)); // expect 4 nulls: negative numbers and overflow - let values = array.values(); - assert_eq!(4, values.null_count()); - let u16arr = values.as_any().downcast_ref::().unwrap(); + let u16arr = as_primitive_array::(array.values()); + assert_eq!(4, u16arr.null_count()); // expect 4 nulls: negative numbers and overflow let expected: UInt16Array = diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index d7cc83aabddb..ec3cba64aa73 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -223,7 +223,7 @@ impl IpcDataGenerator { let list = as_list_array(column); self.encode_dictionaries( field, - &list.values(), + list.values(), encoded_dictionaries, dictionary_tracker, write_options, @@ -233,7 +233,7 @@ impl IpcDataGenerator { let list = as_large_list_array(column); self.encode_dictionaries( field, - &list.values(), + list.values(), encoded_dictionaries, dictionary_tracker, write_options, diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index 0d3148c5a055..64a1b53199bc 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -1758,6 +1758,10 @@ impl Iterator for Reader { #[cfg(test)] mod tests { use super::*; + use arrow_array::cast::{ + as_boolean_array, as_dictionary_array, as_primitive_array, as_string_array, + as_struct_array, + }; use arrow_buffer::ToByteSlice; use arrow_schema::DataType::{Dictionary, List}; use flate2::read::GzDecoder; @@ -2056,8 +2060,7 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - let bb = bb.values(); - let bb = bb.as_any().downcast_ref::().unwrap(); + let bb = as_primitive_array::(bb.values()); assert_eq!(9, bb.len()); assert_eq!(2.0, bb.value(0)); assert_eq!(-6.1, bb.value(5)); @@ -2068,8 +2071,7 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - let cc = cc.values(); - let cc = cc.as_any().downcast_ref::().unwrap(); + let cc = as_boolean_array(cc.values()); assert_eq!(6, cc.len()); assert!(!cc.value(0)); assert!(!cc.value(4)); @@ -2183,8 +2185,7 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - let bb = bb.values(); - let bb = bb.as_any().downcast_ref::().unwrap(); + let bb = as_primitive_array::(bb.values()); assert_eq!(10, bb.len()); assert_eq!(4.0, bb.value(9)); @@ -2198,8 +2199,7 @@ mod tests { cc.data().buffers()[0], 
Buffer::from_slice_ref([0i32, 2, 2, 4, 5]) ); - let cc = cc.values(); - let cc = cc.as_any().downcast_ref::().unwrap(); + let cc = as_boolean_array(cc.values()); let cc_expected = BooleanArray::from(vec![ Some(false), Some(true), @@ -2219,8 +2219,8 @@ mod tests { dd.data().buffers()[0], Buffer::from_slice_ref([0i32, 1, 1, 2, 6]) ); - let dd = dd.values(); - let dd = dd.as_any().downcast_ref::().unwrap(); + + let dd = as_string_array(dd.values()); // values are 6 because a `d: null` is treated as a null slot // and a list's null slot can be omitted from the child (i.e. same offset) assert_eq!(6, dd.len()); @@ -2366,16 +2366,8 @@ mod tests { // compare list null buffers assert_eq!(read.data().null_buffer(), expected.data().null_buffer()); // build struct from list - let struct_values = read.values(); - let struct_array: &StructArray = struct_values - .as_any() - .downcast_ref::() - .unwrap(); - let expected_struct_values = expected.values(); - let expected_struct_array = expected_struct_values - .as_any() - .downcast_ref::() - .unwrap(); + let struct_array = as_struct_array(read.values()); + let expected_struct_array = as_struct_array(expected.values()); assert_eq!(7, struct_array.len()); assert_eq!(1, struct_array.null_count()); @@ -2694,11 +2686,7 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - let evs_list = evs_list.values(); - let evs_list = evs_list - .as_any() - .downcast_ref::>() - .unwrap(); + let evs_list = as_dictionary_array::(evs_list.values()); assert_eq!(6, evs_list.len()); assert!(evs_list.is_valid(1)); assert_eq!(DataType::Utf8, evs_list.value_type()); @@ -2755,11 +2743,7 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - let evs_list = evs_list.values(); - let evs_list = evs_list - .as_any() - .downcast_ref::>() - .unwrap(); + let evs_list = as_dictionary_array::(evs_list.values()); assert_eq!(8, evs_list.len()); assert!(evs_list.is_valid(1)); assert_eq!(DataType::Utf8, evs_list.value_type()); diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index cf23e6e5c3b0..eb9dc29848f0 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -512,7 +512,7 @@ impl Codec { DataType::LargeList(_) => as_large_list_array(array).values(), _ => unreachable!(), }; - let rows = converter.convert_columns(&[values])?; + let rows = converter.convert_columns(&[values.clone()])?; Ok(Encoder::List(rows)) } } diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 01f4ef5c7829..8db4b154e90c 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -342,7 +342,7 @@ mod tests { let col_c_values = col_c.values(); assert!(col_c_values.len() > size); // col_c_values should be a list - let col_c_list = col_c_values.as_any().downcast_ref::().unwrap(); + let col_c_list = as_list_array(col_c_values); // Its values should be FixedSizeBinary(6) let fsb = col_c_list.values(); assert_eq!(fsb.data_type(), &DataType::FixedSizeBinary(6)); From 9bb6aaf38c17d63b8b6201f03cd5adb330b64fa7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 19 Jan 2023 23:08:48 +0100 Subject: [PATCH 0528/1411] Implement `std::error::Error::source` for `ArrowError` and `FlightError` (#3567) * impl Error::source for ArrowError * Add source() for FlightError * clippy * Update arrow-flight/src/error.rs Co-authored-by: Liang-Chi Hsieh Co-authored-by: Liang-Chi Hsieh --- arrow-flight/src/error.rs | 65 ++++++++++++++++++++++++++++++++++++--- arrow-schema/src/error.rs | 40 +++++++++++++++++++++++- 2 files changed, 100 insertions(+), 5 deletions(-) diff --git 
a/arrow-flight/src/error.rs b/arrow-flight/src/error.rs index 11e0ae5c9fae..7a43e537afc5 100644 --- a/arrow-flight/src/error.rs +++ b/arrow-flight/src/error.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +use std::error::Error; + use arrow_schema::ArrowError; /// Errors for the Apache Arrow Flight crate @@ -30,8 +32,8 @@ pub enum FlightError { ProtocolError(String), /// An error occured during decoding DecodeError(String), - /// Some other (opaque) error - ExternalError(Box), + /// External error that can provide source of error by calling `Error::source`. + ExternalError(Box), } impl FlightError { @@ -40,7 +42,7 @@ impl FlightError { } /// Wraps an external error in an `ArrowError`. - pub fn from_external_error(error: Box) -> Self { + pub fn from_external_error(error: Box) -> Self { Self::ExternalError(error) } } @@ -52,7 +54,15 @@ impl std::fmt::Display for FlightError { } } -impl std::error::Error for FlightError {} +impl Error for FlightError { + fn source(&self) -> Option<&(dyn Error + 'static)> { + if let Self::ExternalError(e) = self { + Some(e.as_ref()) + } else { + None + } + } +} impl From for FlightError { fn from(status: tonic::Status) -> Self { @@ -82,3 +92,50 @@ impl From for tonic::Status { } pub type Result = std::result::Result; + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn error_source() { + let e1 = FlightError::DecodeError("foo".into()); + assert!(e1.source().is_none()); + + // one level of wrapping + let e2 = FlightError::ExternalError(Box::new(e1)); + let source = e2.source().unwrap().downcast_ref::().unwrap(); + assert!(matches!(source, FlightError::DecodeError(_))); + + let e3 = FlightError::ExternalError(Box::new(e2)); + let source = e3 + .source() + .unwrap() + .downcast_ref::() + .unwrap() + .source() + .unwrap() + .downcast_ref::() + .unwrap(); + + assert!(matches!(source, FlightError::DecodeError(_))); + } + + #[test] + fn error_through_arrow() { + // flight error that wraps an arrow error that wraps a flight error + let e1 = FlightError::DecodeError("foo".into()); + let e2 = ArrowError::ExternalError(Box::new(e1)); + let e3 = FlightError::ExternalError(Box::new(e2)); + + // ensure we can find the lowest level error by following source() + let mut root_error: &dyn Error = &e3; + while let Some(source) = root_error.source() { + // walk the next level + root_error = source; + } + + let source = root_error.downcast_ref::().unwrap(); + assert!(matches!(source, FlightError::DecodeError(_))); + } +} diff --git a/arrow-schema/src/error.rs b/arrow-schema/src/error.rs index 0d7a35a9dee2..ea60572b3d4d 100644 --- a/arrow-schema/src/error.rs +++ b/arrow-schema/src/error.rs @@ -100,4 +100,42 @@ impl Display for ArrowError { } } -impl Error for ArrowError {} +impl Error for ArrowError { + fn source(&self) -> Option<&(dyn Error + 'static)> { + if let Self::ExternalError(e) = self { + Some(e.as_ref()) + } else { + None + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn error_source() { + let e1 = ArrowError::DivideByZero; + assert!(e1.source().is_none()); + + // one level of wrapping + let e2 = ArrowError::ExternalError(Box::new(e1)); + let source = e2.source().unwrap().downcast_ref::().unwrap(); + assert!(matches!(source, ArrowError::DivideByZero)); + + // two levels of wrapping + let e3 = ArrowError::ExternalError(Box::new(e2)); + let source = e3 + .source() + .unwrap() + .downcast_ref::() + .unwrap() + .source() + .unwrap() + .downcast_ref::() + .unwrap(); + + assert!(matches!(source, 
ArrowError::DivideByZero)); + } +} From a61da1e655e76e8676f1cdb021b13551e720b0de Mon Sep 17 00:00:00 2001 From: bmmeijers Date: Fri, 20 Jan 2023 13:27:52 +0100 Subject: [PATCH 0529/1411] Show row_counts also for (FixedLen)ByteArray (#3573) --- parquet/src/bin/parquet-index.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs index 6622783e6cf4..b2f8b4d63ecf 100644 --- a/parquet/src/bin/parquet-index.rs +++ b/parquet/src/bin/parquet-index.rs @@ -103,9 +103,11 @@ impl Args { Index::INT96(v) => print_index(&v.indexes, offset_index, &row_counts)?, Index::FLOAT(v) => print_index(&v.indexes, offset_index, &row_counts)?, Index::DOUBLE(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::BYTE_ARRAY(_) => println!("BYTE_ARRAY not supported"), - Index::FIXED_LEN_BYTE_ARRAY(_) => { - println!("FIXED_LEN_BYTE_ARRAY not supported") + Index::BYTE_ARRAY(v) => { + print_index(&v.indexes, offset_index, &row_counts)? + } + Index::FIXED_LEN_BYTE_ARRAY(v) => { + print_index(&v.indexes, offset_index, &row_counts)? } } } @@ -130,7 +132,7 @@ fn compute_row_counts(offset_index: &[PageLocation], rows: i64) -> Vec { } /// Prints index information for a single column chunk -fn print_index( +fn print_index( column_index: &[PageIndex], offset_index: &[PageLocation], row_counts: &[i64], @@ -154,12 +156,12 @@ fn print_index( idx, o.offset, o.compressed_page_size, row_count ); match &c.min { - Some(m) => print!(", min {:>10}", m), + Some(m) => print!(", min {:>10?}", m), None => print!(", min {:>10}", "NONE"), } match &c.max { - Some(m) => print!(", max {:>10}", m), + Some(m) => print!(", max {:>10?}", m), None => print!(", max {:>10}", "NONE"), } println!() From a1cedb4fdfb561eda4e836a6c8fcb898d7a37029 Mon Sep 17 00:00:00 2001 From: sachin agarwal Date: Fri, 20 Jan 2023 20:06:25 +0530 Subject: [PATCH 0530/1411] Correct error return (#3576) --- parquet/src/arrow/async_reader/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 64b334fd43c7..780ba6f3b4c1 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -179,7 +179,7 @@ impl AsyncFileReader for T { let mut buffer = Vec::with_capacity(to_read); let read = self.take(to_read as u64).read_to_end(&mut buffer).await?; if read != to_read { - eof_err!("expected to read {} bytes, got {}", to_read, read); + return Err(eof_err!("expected to read {} bytes, got {}", to_read, read)); } Ok(buffer.into()) From 19e3e8c8314f87d8c2acf3a7b69538fdec6f793c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 20 Jan 2023 17:09:44 +0000 Subject: [PATCH 0531/1411] Implement Extend for ArrayBuilder (#1841) (#3563) * Implement Extend for ArrayBuilder (#1841) * Add dictionaries * Add tests --- arrow-array/src/array/dictionary_array.rs | 13 +---- arrow-array/src/builder/boolean_builder.rs | 22 +++++++++ .../src/builder/generic_bytes_builder.rs | 23 +++++++++ .../generic_bytes_dictionary_builder.rs | 47 +++++++++++++++++- .../src/builder/generic_list_builder.rs | 47 ++++++++++++++++++ arrow-array/src/builder/primitive_builder.rs | 22 +++++++++ .../builder/primitive_dictionary_builder.rs | 48 ++++++++++++++++++- 7 files changed, 207 insertions(+), 15 deletions(-) diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 
6cff5bfdc9f6..fb2868c2778f 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -481,18 +481,7 @@ impl<'a, T: ArrowDictionaryKeyType> FromIterator> for Dictionary let it = iter.into_iter(); let (lower, _) = it.size_hint(); let mut builder = StringDictionaryBuilder::with_capacity(lower, 256, 1024); - it.for_each(|i| { - if let Some(i) = i { - // Note: impl ... for Result> fails with - // error[E0117]: only traits defined in the current crate can be implemented for arbitrary types - builder - .append(i) - .expect("Unable to append a value to a dictionary array."); - } else { - builder.append_null(); - } - }); - + builder.extend(it); builder.finish() } } diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index 96f436253c5a..06709e5f375d 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -211,6 +211,15 @@ impl ArrayBuilder for BooleanBuilder { } } +impl Extend> for BooleanBuilder { + #[inline] + fn extend>>(&mut self, iter: T) { + for v in iter { + self.append_option(v) + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -304,4 +313,17 @@ mod tests { assert_eq!(0, array.null_count()); assert!(array.data().null_buffer().is_none()); } + + #[test] + fn test_extend() { + let mut builder = BooleanBuilder::new(); + builder.extend([false, false, true, false, false].into_iter().map(Some)); + builder.extend([true, true, false].into_iter().map(Some)); + let array = builder.finish(); + let values = array.iter().map(|x| x.unwrap()).collect::>(); + assert_eq!( + &values, + &[false, false, true, false, false, true, true, false] + ) + } } diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index 8be3ac7f4f15..73600d9e0a38 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -88,6 +88,10 @@ impl GenericByteBuilder { } /// Appends a value into the builder. 
+ /// + /// # Panics + /// + /// Panics if the resulting length of [`Self::values_slice`] would exceed `T::Offset::MAX` #[inline] pub fn append_value(&mut self, value: impl AsRef) { self.value_builder.append_slice(value.as_ref().as_ref()); @@ -219,6 +223,15 @@ impl ArrayBuilder for GenericByteBuilder { } } +impl> Extend> for GenericByteBuilder { + #[inline] + fn extend>>(&mut self, iter: I) { + for v in iter { + self.append_option(v) + } + } +} + /// Array builder for [`GenericStringArray`][crate::GenericStringArray] pub type GenericStringBuilder = GenericByteBuilder>; @@ -420,4 +433,14 @@ mod tests { fn test_large_string_array_builder_finish_cloned() { _test_generic_string_array_builder_finish_cloned::() } + + #[test] + fn test_extend() { + let mut builder = GenericStringBuilder::::new(); + builder.extend(["a", "b", "c", "", "a", "b", "c"].into_iter().map(Some)); + builder.extend(["d", "cupcakes", "hello"].into_iter().map(Some)); + let array = builder.finish(); + assert_eq!(array.value_offsets(), &[0, 1, 2, 3, 3, 4, 5, 6, 7, 15, 20]); + assert_eq!(array.value_data(), b"abcabcdcupcakeshello"); + } } diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 4a920f3ee43e..449100da1e0e 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -214,7 +214,7 @@ where K: ArrowDictionaryKeyType, T: ByteArrayType, { - /// Append a primitive value to the array. Return an existing index + /// Append a value to the array. Return an existing index /// if already present in the values array or a new index if the /// value is appended to the values array. /// @@ -255,12 +255,34 @@ where Ok(key) } + /// Infallibly append a value to this builder + /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` + pub fn append_value(&mut self, value: impl AsRef) { + self.append(value).expect("dictionary key overflow"); + } + /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { self.keys_builder.append_null() } + /// Append an `Option` value into the builder + /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` + #[inline] + pub fn append_option(&mut self, value: Option>) { + match value { + None => self.append_null(), + Some(v) => self.append_value(v), + }; + } + /// Builds the `DictionaryArray` and reset this builder. 
pub fn finish(&mut self) -> DictionaryArray { self.dedup.clear(); @@ -297,6 +319,17 @@ where } } +impl> Extend> + for GenericByteDictionaryBuilder +{ + #[inline] + fn extend>>(&mut self, iter: I) { + for v in iter { + self.append_option(v) + } + } +} + fn get_bytes<'a, K: ArrowNativeType, T: ByteArrayType>( values: &'a GenericByteBuilder, key: &K, @@ -405,7 +438,7 @@ mod tests { use crate::array::Array; use crate::array::Int8Array; - use crate::types::{Int16Type, Int8Type}; + use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type}; use crate::{BinaryArray, StringArray}; fn test_bytes_dictionary_builder(values: Vec<&T::Native>) @@ -622,4 +655,14 @@ mod tests { vec![b"abc", b"def"], ); } + + #[test] + fn test_extend() { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some)); + builder.extend(["c", "d", "a"].into_iter().map(Some)); + let dict = builder.finish(); + assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]); + assert_eq!(dict.values().len(), 4); + } } diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 8f3f881c4b32..6228475542bd 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -111,6 +111,10 @@ where } /// Finish the current variable-length list array slot + /// + /// # Panics + /// + /// Panics if the length of [`Self::values`] exceeds `OffsetSize::MAX` #[inline] pub fn append(&mut self, is_valid: bool) { self.offsets_builder @@ -178,10 +182,32 @@ where } } +impl Extend> for GenericListBuilder +where + O: OffsetSizeTrait, + B: ArrayBuilder + Extend, + V: IntoIterator, +{ + #[inline] + fn extend>>(&mut self, iter: T) { + for v in iter { + match v { + Some(elements) => { + self.values_builder.extend(elements); + self.append(true); + } + None => self.append(false), + } + } + } +} + #[cfg(test)] mod tests { use super::*; use crate::builder::{Int32Builder, ListBuilder}; + use crate::cast::as_primitive_array; + use crate::types::Int32Type; use crate::{Array, Int32Array}; use arrow_buffer::Buffer; use arrow_schema::DataType; @@ -364,4 +390,25 @@ mod tests { list_array.values().data().child_data()[0].buffers()[0].clone() ); } + + #[test] + fn test_extend() { + let mut builder = ListBuilder::new(Int32Builder::new()); + builder.extend([ + Some(vec![Some(1), Some(2), Some(7), None]), + Some(vec![]), + Some(vec![Some(4), Some(5)]), + None, + ]); + + let array = builder.finish(); + assert_eq!(array.value_offsets(), [0, 4, 4, 6, 6]); + assert_eq!(array.null_count(), 1); + assert!(array.is_null(3)); + let a_values = array.values(); + let elements = as_primitive_array::(a_values.as_ref()); + assert_eq!(elements.values(), &[1, 2, 7, 0, 4, 5]); + assert_eq!(elements.null_count(), 1); + assert!(elements.is_null(3)); + } } diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index a969e121808b..2d88ea50f257 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -238,6 +238,10 @@ impl PrimitiveBuilder { } /// Appends values from a slice of type `T` and a validity boolean slice + /// + /// # Panics + /// + /// Panics if `values` and `is_valid` have different lengths #[inline] pub fn append_values(&mut self, values: &[T::Native], is_valid: &[bool]) { assert_eq!( @@ -328,6 +332,15 @@ impl PrimitiveBuilder { } } +impl Extend> for PrimitiveBuilder

{ + #[inline] + fn extend>>(&mut self, iter: T) { + for v in iter { + self.append_option(v) + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -578,4 +591,13 @@ mod tests { fn test_invalid_with_data_type() { Int32Builder::new().with_data_type(DataType::Int64); } + + #[test] + fn test_extend() { + let mut builder = PrimitiveBuilder::::new(); + builder.extend([1, 2, 3, 5, 2, 4, 4].into_iter().map(Some)); + builder.extend([2, 4, 6, 2].into_iter().map(Some)); + let array = builder.finish(); + assert_eq!(array.values(), &[1, 2, 3, 5, 2, 4, 4, 2, 4, 6, 2]); + } } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 4640902d870f..f44f0e30602e 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -193,12 +193,34 @@ where Ok(key) } + /// Infallibly append a value to this builder + /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` + pub fn append_value(&mut self, value: V::Native) { + self.append(value).expect("dictionary key overflow"); + } + /// Appends a null slot into the builder #[inline] pub fn append_null(&mut self) { self.keys_builder.append_null() } + /// Append an `Option` value into the builder + /// + /// # Panics + /// + /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` + #[inline] + pub fn append_option(&mut self, value: Option) { + match value { + None => self.append_null(), + Some(v) => self.append_value(v), + }; + } + /// Builds the `DictionaryArray` and reset this builder. pub fn finish(&mut self) -> DictionaryArray { self.map.clear(); @@ -235,6 +257,17 @@ where } } +impl Extend> + for PrimitiveDictionaryBuilder +{ + #[inline] + fn extend>>(&mut self, iter: T) { + for v in iter { + self.append_option(v) + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -242,7 +275,7 @@ mod tests { use crate::array::Array; use crate::array::UInt32Array; use crate::array::UInt8Array; - use crate::types::{UInt32Type, UInt8Type}; + use crate::types::{Int32Type, UInt32Type, UInt8Type}; #[test] fn test_primitive_dictionary_builder() { @@ -270,6 +303,19 @@ mod tests { assert_eq!(avs, &[12345678, 22345678]); } + #[test] + fn test_extend() { + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some)); + builder.extend([4, 5, 1, 3, 1].into_iter().map(Some)); + let dict = builder.finish(); + assert_eq!( + dict.keys().values(), + &[0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 0, 2, 0] + ); + assert_eq!(dict.values().len(), 5); + } + #[test] #[should_panic(expected = "DictionaryKeyOverflowError")] fn test_primitive_dictionary_overflow() { From 0ec5f72e6d21556d5677b74dd5d45d93c5af0b38 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 20 Jan 2023 17:37:04 +0000 Subject: [PATCH 0532/1411] Fix final page row count in parquet-index binary (#3554) --- parquet/src/bin/parquet-index.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs index b2f8b4d63ecf..485b31bed3de 100644 --- a/parquet/src/bin/parquet-index.rs +++ b/parquet/src/bin/parquet-index.rs @@ -127,7 +127,7 @@ fn compute_row_counts(offset_index: &[PageLocation], rows: i64) -> Vec { out.push(o.first_row_index - last); last = o.first_row_index; } - out.push(rows); + out.push(rows - 
last); out } From acaba0af6a65484ab8ba8c7784befd85c1ae6838 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 20 Jan 2023 11:53:43 -0800 Subject: [PATCH 0533/1411] Add pack_byte_to_dictionary (#3572) --- arrow-cast/src/cast.rs | 53 ++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 6b385ccd43fc..c54761840167 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -3301,10 +3301,16 @@ fn cast_to_dictionary( dict_value_type, cast_options, ), - Utf8 => pack_string_to_dictionary::(array, cast_options), - LargeUtf8 => pack_string_to_dictionary::(array, cast_options), - Binary => pack_binary_to_dictionary::(array, cast_options), - LargeBinary => pack_binary_to_dictionary::(array, cast_options), + Utf8 => pack_byte_to_dictionary::>(array, cast_options), + LargeUtf8 => { + pack_byte_to_dictionary::>(array, cast_options) + } + Binary => { + pack_byte_to_dictionary::>(array, cast_options) + } + LargeBinary => { + pack_byte_to_dictionary::>(array, cast_options) + } _ => Err(ArrowError::CastError(format!( "Unsupported output type for dictionary packing: {:?}", dict_value_type @@ -3344,42 +3350,23 @@ where Ok(Arc::new(b.finish())) } -// Packs the data as a StringDictionaryArray, if possible, with the -// key types of K -fn pack_string_to_dictionary( - array: &ArrayRef, - cast_options: &CastOptions, -) -> Result -where - K: ArrowDictionaryKeyType, -{ - let cast_values = cast_with_options(array, &DataType::Utf8, cast_options)?; - let values = cast_values.as_any().downcast_ref::().unwrap(); - let mut b = StringDictionaryBuilder::::with_capacity(values.len(), 1024, 1024); - - // copy each element one at a time - for i in 0..values.len() { - if values.is_null(i) { - b.append_null(); - } else { - b.append(values.value(i))?; - } - } - Ok(Arc::new(b.finish())) -} - -// Packs the data as a BinaryDictionaryArray, if possible, with the +// Packs the data as a GenericByteDictionaryBuilder, if possible, with the // key types of K -fn pack_binary_to_dictionary( +fn pack_byte_to_dictionary( array: &ArrayRef, cast_options: &CastOptions, ) -> Result where K: ArrowDictionaryKeyType, + T: ByteArrayType, { - let cast_values = cast_with_options(array, &DataType::Binary, cast_options)?; - let values = cast_values.as_any().downcast_ref::().unwrap(); - let mut b = BinaryDictionaryBuilder::::with_capacity(values.len(), 1024, 1024); + let cast_values = cast_with_options(array, &T::DATA_TYPE, cast_options)?; + let values = cast_values + .as_any() + .downcast_ref::>() + .unwrap(); + let mut b = + GenericByteDictionaryBuilder::::with_capacity(values.len(), 1024, 1024); // copy each element one at a time for i in 0..values.len() { From 24e5daef3248c38a6fb354c8427c9ba653e2b3e9 Mon Sep 17 00:00:00 2001 From: comphead Date: Sat, 21 Jan 2023 17:02:56 -0800 Subject: [PATCH 0534/1411] Remove unwrap on datetime cast for CSV writer (#3570) * avoid unwrap on casting * avoid unwrap on cast * fmt * fixes --- arrow-csv/src/writer.rs | 143 +++++++++++++++++++++++++++++----------- 1 file changed, 104 insertions(+), 39 deletions(-) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index c5eed7f1e3e8..3ab28c2df816 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -88,6 +88,26 @@ where lexical_to_string(c.value(i)) } +fn invalid_cast_error(dt: &str, col_index: usize, row_index: usize) -> ArrowError { + ArrowError::CastError(format!( + "Cannot cast to {} at col index: {} row index: {}", + 
dt, col_index, row_index + )) +} + +macro_rules! write_temporal_value { + ($array:expr, $tpe: ident, $format: expr, $col_index: expr, $row_index: expr, $cast_func: ident, $tpe_name: expr) => {{ + $array + .as_any() + .downcast_ref::<$tpe>() + .ok_or_else(|| invalid_cast_error($tpe_name, $col_index, $row_index))? + .$cast_func($row_index) + .ok_or_else(|| invalid_cast_error($tpe_name, $col_index, $row_index))? + .format($format) + .to_string() + }}; +} + /// A CSV writer #[derive(Debug)] pub struct Writer { @@ -171,55 +191,70 @@ impl Writer { c.value(row_index).to_owned() } DataType::Date32 => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value_as_date(row_index) - .unwrap() - .format(&self.date_format) - .to_string() + write_temporal_value!( + col, + Date32Array, + &self.date_format, + col_index, + row_index, + value_as_date, + "Date32" + ) } DataType::Date64 => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value_as_datetime(row_index) - .unwrap() - .format(&self.datetime_format) - .to_string() + write_temporal_value!( + col, + Date64Array, + &self.datetime_format, + col_index, + row_index, + value_as_datetime, + "Date64" + ) } DataType::Time32(TimeUnit::Second) => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value_as_time(row_index) - .unwrap() - .format(&self.time_format) - .to_string() + write_temporal_value!( + col, + Time32SecondArray, + &self.time_format, + col_index, + row_index, + value_as_time, + "Time32" + ) } DataType::Time32(TimeUnit::Millisecond) => { - let c = col - .as_any() - .downcast_ref::() - .unwrap(); - c.value_as_time(row_index) - .unwrap() - .format(&self.time_format) - .to_string() + write_temporal_value!( + col, + Time32MillisecondArray, + &self.time_format, + col_index, + row_index, + value_as_time, + "Time32" + ) } DataType::Time64(TimeUnit::Microsecond) => { - let c = col - .as_any() - .downcast_ref::() - .unwrap(); - c.value_as_time(row_index) - .unwrap() - .format(&self.time_format) - .to_string() + write_temporal_value!( + col, + Time64MicrosecondArray, + &self.time_format, + col_index, + row_index, + value_as_time, + "Time64" + ) } DataType::Time64(TimeUnit::Nanosecond) => { - let c = col - .as_any() - .downcast_ref::() - .unwrap(); - c.value_as_time(row_index) - .unwrap() - .format(&self.time_format) - .to_string() + write_temporal_value!( + col, + Time64NanosecondArray, + &self.time_format, + col_index, + row_index, + value_as_time, + "Time64" + ) } DataType::Timestamp(time_unit, time_zone) => { self.handle_timestamp(time_unit, time_zone.as_ref(), row_index, col)? 
@@ -672,4 +707,34 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo let expected = nanoseconds.into_iter().map(Some).collect::>(); assert_eq!(actual, expected); } + + #[test] + fn test_write_csv_invalid_cast() { + let schema = Schema::new(vec![ + Field::new("c0", DataType::UInt32, false), + Field::new("c1", DataType::Date64, false), + ]); + + let c0 = UInt32Array::from(vec![Some(123), Some(234)]); + let c1 = Date64Array::from(vec![Some(1926632005177), Some(1926632005177685347)]); + let batch = + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c0), Arc::new(c1)]) + .unwrap(); + + let mut file = tempfile::tempfile().unwrap(); + let mut writer = Writer::new(&mut file); + let batches = vec![&batch, &batch]; + for batch in batches { + writer + .write(batch) + .map_err(|e| { + dbg!(e.to_string()); + assert!(e.to_string().ends_with( + invalid_cast_error("Date64", 1, 1).to_string().as_str() + )) + }) + .unwrap_err(); + } + drop(writer); + } } From 892a80385bdc1ee4c033e3b848fed882982e352d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 22 Jan 2023 09:13:38 +0000 Subject: [PATCH 0535/1411] Add external variant to ParquetError (#3285) (#3574) --- parquet/src/arrow/arrow_writer/levels.rs | 4 +- parquet/src/column/mod.rs | 26 +- parquet/src/encodings/decoding.rs | 7 +- parquet/src/encodings/encoding/mod.rs | 11 +- parquet/src/errors.rs | 39 ++- parquet/src/file/footer.rs | 24 +- parquet/src/record/api.rs | 24 +- parquet/src/record/reader.rs | 17 +- parquet/src/schema/parser.rs | 409 +++++++++-------------- 9 files changed, 236 insertions(+), 325 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 182f68c498ff..15197c02e586 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -1085,7 +1085,7 @@ mod tests { .unwrap(); let struct_null_level = - calculate_array_levels(batch.column(0), batch.schema().field(0)); + calculate_array_levels(batch.column(0), batch.schema().field(0)).unwrap(); // create second batch // define schema @@ -1108,7 +1108,7 @@ mod tests { .unwrap(); let struct_non_null_level = - calculate_array_levels(batch.column(0), batch.schema().field(0)); + calculate_array_levels(batch.column(0), batch.schema().field(0)).unwrap(); // The 2 levels should not be the same if struct_non_null_level == struct_null_level { diff --git a/parquet/src/column/mod.rs b/parquet/src/column/mod.rs index 93a4f00d2eef..cb0c035dd6e2 100644 --- a/parquet/src/column/mod.rs +++ b/parquet/src/column/mod.rs @@ -36,18 +36,18 @@ //! repetition levels and read them to verify write/read correctness. //! //! ```rust,no_run -//! use std::{fs, path::Path, sync::Arc}; -//! -//! use parquet::{ -//! column::{reader::ColumnReader, writer::ColumnWriter}, -//! data_type::Int32Type, -//! file::{ -//! properties::WriterProperties, -//! reader::{FileReader, SerializedFileReader}, -//! writer::SerializedFileWriter, -//! }, -//! schema::parser::parse_message_type, -//! }; +//! # use std::{fs, path::Path, sync::Arc}; +//! # +//! # use parquet::{ +//! # column::{reader::ColumnReader, writer::ColumnWriter}, +//! # data_type::Int32Type, +//! # file::{ +//! # properties::WriterProperties, +//! # reader::{FileReader, SerializedFileReader}, +//! # writer::SerializedFileWriter, +//! # }, +//! # schema::parser::parse_message_type, +//! # }; //! //! let path = Path::new("/path/to/column_sample.parquet"); //! @@ -111,7 +111,7 @@ //! } //! 
} //! -//! assert_eq!(res, Ok((3, 5))); +//! assert_eq!(res.unwrap(), (3, 5)); //! assert_eq!(values, vec![1, 2, 3, 0, 0, 0, 0, 0]); //! assert_eq!(def_levels, vec![3, 3, 3, 2, 2, 0, 0, 0]); //! assert_eq!(rep_levels, vec![0, 1, 0, 1, 1, 0, 0, 0]); diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 7e3058ba7b3f..8058335875c9 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -1946,11 +1946,12 @@ mod tests { let decoder = get_decoder::(descr, encoding); match err { Some(parquet_error) => { - assert!(decoder.is_err()); - assert_eq!(decoder.err().unwrap(), parquet_error); + assert_eq!( + decoder.err().unwrap().to_string(), + parquet_error.to_string() + ); } None => { - assert!(decoder.is_ok()); assert_eq!(decoder.unwrap().encoding(), encoding); } } diff --git a/parquet/src/encodings/encoding/mod.rs b/parquet/src/encodings/encoding/mod.rs index 78f4a8b97b33..b7e30c4ecf08 100644 --- a/parquet/src/encodings/encoding/mod.rs +++ b/parquet/src/encodings/encoding/mod.rs @@ -1081,13 +1081,12 @@ mod tests { let encoder = get_encoder::(encoding); match err { Some(parquet_error) => { - assert!(encoder.is_err()); - assert_eq!(encoder.err().unwrap(), parquet_error); - } - None => { - assert!(encoder.is_ok()); - assert_eq!(encoder.unwrap().encoding(), encoding); + assert_eq!( + encoder.err().unwrap().to_string(), + parquet_error.to_string() + ) } + None => assert_eq!(encoder.unwrap().encoding(), encoding), } } diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index cbbd2405353f..703ff51f44c2 100644 --- a/parquet/src/errors.rs +++ b/parquet/src/errors.rs @@ -17,12 +17,13 @@ //! Common Parquet errors and macros. +use std::error::Error; use std::{cell, io, result, str}; #[cfg(feature = "arrow")] use arrow_schema::ArrowError; -#[derive(Debug, PartialEq, Clone, Eq)] +#[derive(Debug)] pub enum ParquetError { /// General Parquet error. /// Returned when code violates normal workflow of working with Parquet files. @@ -39,66 +40,72 @@ pub enum ParquetError { /// Returned when reading into arrow or writing from arrow. 
ArrowError(String), IndexOutOfBound(usize, usize), + /// An external error variant + External(Box), } impl std::fmt::Display for ParquetError { fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { - match *self { - ParquetError::General(ref message) => { + match &self { + ParquetError::General(message) => { write!(fmt, "Parquet error: {}", message) } - ParquetError::NYI(ref message) => write!(fmt, "NYI: {}", message), - ParquetError::EOF(ref message) => write!(fmt, "EOF: {}", message), + ParquetError::NYI(message) => write!(fmt, "NYI: {}", message), + ParquetError::EOF(message) => write!(fmt, "EOF: {}", message), #[cfg(feature = "arrow")] - ParquetError::ArrowError(ref message) => write!(fmt, "Arrow: {}", message), - ParquetError::IndexOutOfBound(ref index, ref bound) => { + ParquetError::ArrowError(message) => write!(fmt, "Arrow: {}", message), + ParquetError::IndexOutOfBound(index, ref bound) => { write!(fmt, "Index {} out of bound: {}", index, bound) } + ParquetError::External(e) => write!(fmt, "External: {}", e), } } } -impl std::error::Error for ParquetError { - fn cause(&self) -> Option<&dyn ::std::error::Error> { - None +impl Error for ParquetError { + fn source(&self) -> Option<&(dyn Error + 'static)> { + match self { + ParquetError::External(e) => Some(e.as_ref()), + _ => None, + } } } impl From for ParquetError { fn from(e: io::Error) -> ParquetError { - ParquetError::General(format!("underlying IO error: {}", e)) + ParquetError::External(Box::new(e)) } } #[cfg(any(feature = "snap", test))] impl From for ParquetError { fn from(e: snap::Error) -> ParquetError { - ParquetError::General(format!("underlying snap error: {}", e)) + ParquetError::External(Box::new(e)) } } impl From for ParquetError { fn from(e: thrift::Error) -> ParquetError { - ParquetError::General(format!("underlying Thrift error: {}", e)) + ParquetError::External(Box::new(e)) } } impl From for ParquetError { fn from(e: cell::BorrowMutError) -> ParquetError { - ParquetError::General(format!("underlying borrow error: {}", e)) + ParquetError::External(Box::new(e)) } } impl From for ParquetError { fn from(e: str::Utf8Error) -> ParquetError { - ParquetError::General(format!("underlying utf8 error: {}", e)) + ParquetError::External(Box::new(e)) } } #[cfg(feature = "arrow")] impl From for ParquetError { fn from(e: ArrowError) -> ParquetError { - ParquetError::ArrowError(format!("underlying Arrow error: {}", e)) + ParquetError::External(Box::new(e)) } } diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 27c07b78d7cf..760caa9774e9 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -156,10 +156,9 @@ mod tests { fn test_parse_metadata_size_smaller_than_footer() { let test_file = tempfile::tempfile().unwrap(); let reader_result = parse_metadata(&test_file); - assert!(reader_result.is_err()); assert_eq!( - reader_result.err().unwrap(), - general_err!("Invalid Parquet file. Size is smaller than footer") + reader_result.unwrap_err().to_string(), + "Parquet error: Invalid Parquet file. Size is smaller than footer" ); } @@ -167,10 +166,9 @@ mod tests { fn test_parse_metadata_corrupt_footer() { let data = Bytes::from(vec![1, 2, 3, 4, 5, 6, 7, 8]); let reader_result = parse_metadata(&data); - assert!(reader_result.is_err()); assert_eq!( - reader_result.err().unwrap(), - general_err!("Invalid Parquet file. Corrupt footer") + reader_result.unwrap_err().to_string(), + "Parquet error: Invalid Parquet file. 
Corrupt footer" ); } @@ -178,12 +176,9 @@ mod tests { fn test_parse_metadata_invalid_length() { let test_file = Bytes::from(vec![0, 0, 0, 255, b'P', b'A', b'R', b'1']); let reader_result = parse_metadata(&test_file); - assert!(reader_result.is_err()); assert_eq!( - reader_result.err().unwrap(), - general_err!( - "Invalid Parquet file. Metadata length is less than zero (-16777216)" - ) + reader_result.unwrap_err().to_string(), + "Parquet error: Invalid Parquet file. Metadata length is less than zero (-16777216)" ); } @@ -191,12 +186,9 @@ mod tests { fn test_parse_metadata_invalid_start() { let test_file = Bytes::from(vec![255, 0, 0, 0, b'P', b'A', b'R', b'1']); let reader_result = parse_metadata(&test_file); - assert!(reader_result.is_err()); assert_eq!( - reader_result.err().unwrap(), - general_err!( - "Invalid Parquet file. Reported metadata length of 255 + 8 byte footer, but file is only 8 bytes" - ) + reader_result.unwrap_err().to_string(), + "Parquet error: Invalid Parquet file. Reported metadata length of 255 + 8 byte footer, but file is only 8 bytes" ); } diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index 0880e717981a..8c942cb44ef0 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -1536,16 +1536,16 @@ mod tests { ]); assert_eq!( - ParquetError::General("Cannot access Group as Float".to_string()), - row.get_float(0).unwrap_err() + row.get_float(0).unwrap_err().to_string(), + "Parquet error: Cannot access Group as Float" ); assert_eq!( - ParquetError::General("Cannot access ListInternal as Float".to_string()), - row.get_float(1).unwrap_err() + row.get_float(1).unwrap_err().to_string(), + "Parquet error: Cannot access ListInternal as Float" ); assert_eq!( - ParquetError::General("Cannot access MapInternal as Float".to_string()), - row.get_float(2).unwrap_err() + row.get_float(2).unwrap_err().to_string(), + "Parquet error: Cannot access MapInternal as Float", ); } @@ -1680,8 +1680,8 @@ mod tests { ("Y".to_string(), Field::Int(2)), ]))]); assert_eq!( - general_err!("Cannot access Group as Float".to_string()), - list.get_float(0).unwrap_err() + list.get_float(0).unwrap_err().to_string(), + "Parquet error: Cannot access Group as Float" ); let list = make_list(vec![Field::ListInternal(make_list(vec![ @@ -1691,8 +1691,8 @@ mod tests { Field::Int(12), ]))]); assert_eq!( - general_err!("Cannot access ListInternal as Float".to_string()), - list.get_float(0).unwrap_err() + list.get_float(0).unwrap_err().to_string(), + "Parquet error: Cannot access ListInternal as Float" ); let list = make_list(vec![Field::MapInternal(make_map(vec![ @@ -1701,8 +1701,8 @@ mod tests { (Field::Int(3), Field::Float(2.3)), ]))]); assert_eq!( - general_err!("Cannot access MapInternal as Float".to_string()), - list.get_float(0).unwrap_err() + list.get_float(0).unwrap_err().to_string(), + "Parquet error: Cannot access MapInternal as Float", ); } diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index 0b7e04587354..a84693536995 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -824,7 +824,7 @@ impl Iterator for ReaderIter { mod tests { use super::*; - use crate::errors::{ParquetError, Result}; + use crate::errors::Result; use crate::file::reader::{FileReader, SerializedFileReader}; use crate::record::api::{Field, Row, RowAccessor, RowFormatter}; use crate::schema::parser::parse_message_type; @@ -1452,10 +1452,9 @@ mod tests { "; let schema = parse_message_type(schema).unwrap(); let res = 
test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)); - assert!(res.is_err()); assert_eq!( - res.unwrap_err(), - general_err!("Root schema does not contain projection") + res.unwrap_err().to_string(), + "Parquet error: Root schema does not contain projection" ); } @@ -1469,10 +1468,9 @@ mod tests { "; let schema = parse_message_type(schema).unwrap(); let res = test_row_group_rows("nested_maps.snappy.parquet", Some(schema)); - assert!(res.is_err()); assert_eq!( - res.unwrap_err(), - general_err!("Root schema does not contain projection") + res.unwrap_err().to_string(), + "Parquet error: Root schema does not contain projection" ); } @@ -1542,10 +1540,9 @@ mod tests { let reader = SerializedFileReader::try_from(path.as_path()).unwrap(); let res = RowIter::from_file_into(Box::new(reader)).project(proj); - assert!(res.is_err()); assert_eq!( - res.err().unwrap(), - general_err!("Root schema does not contain projection") + res.err().unwrap().to_string(), + "Parquet error: Root schema does not contain projection" ); } diff --git a/parquet/src/schema/parser.rs b/parquet/src/schema/parser.rs index 140e3e08500b..c09f13603d29 100644 --- a/parquet/src/schema/parser.rs +++ b/parquet/src/schema/parser.rs @@ -628,30 +628,26 @@ mod tests { assert!(assert_token(None, "b").is_err()); } - #[test] - fn test_parse_message_type_invalid() { - let mut iter = Tokenizer::from_str("test"); - let result = Parser { + fn parse(schema: &str) -> Result { + let mut iter = Tokenizer::from_str(schema); + Parser { tokenizer: &mut iter, } - .parse_message_type(); - assert!(result.is_err()); + .parse_message_type() + } + + #[test] + fn test_parse_message_type_invalid() { assert_eq!( - result.unwrap_err().to_string(), + parse("test").unwrap_err().to_string(), "Parquet error: Message type does not start with 'message'" ); } #[test] fn test_parse_message_type_no_name() { - let mut iter = Tokenizer::from_str("message"); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); - assert!(result.is_err()); assert_eq!( - result.unwrap_err().to_string(), + parse("message").unwrap_err().to_string(), "Parquet error: Expected name, found None" ); } @@ -659,46 +655,34 @@ mod tests { #[test] fn test_parse_message_type_fixed_byte_array() { let schema = " - message schema { - REQUIRED FIXED_LEN_BYTE_ARRAY col; - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); - assert!(result.is_err()); + message schema { + REQUIRED FIXED_LEN_BYTE_ARRAY col; + } + "; + assert_eq!( + parse(schema).unwrap_err().to_string(), + "Parquet error: Expected '(', found token 'col'" + ); let schema = " - message schema { - REQUIRED FIXED_LEN_BYTE_ARRAY(16) col; - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); - assert!(result.is_ok()); + message schema { + REQUIRED FIXED_LEN_BYTE_ARRAY(16) col; + } + "; + parse(schema).unwrap(); } #[test] fn test_parse_message_type_integer() { // Invalid integer syntax let schema = " - message root { - optional int64 f1 (INTEGER()); - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); + message root { + optional int64 f1 (INTEGER()); + } + "; assert_eq!( - result, - Err(general_err!("Failed to parse bit_width for INTEGER type")) + parse(schema).unwrap_err().to_string(), + "Parquet error: Failed to parse bit_width for INTEGER type" ); // Invalid integer syntax, needs 
both bit-width and UTC sign @@ -707,123 +691,87 @@ mod tests { optional int64 f1 (INTEGER(32,)); } "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); assert_eq!( - result, - Err(general_err!("Incorrect bit width 32 for INT64")) + parse(schema).unwrap_err().to_string(), + "Parquet error: Incorrect bit width 32 for INT64" ); // Invalid integer because of non-numeric bit width let schema = " - message root { - optional int32 f1 (INTEGER(eight,true)); - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); + message root { + optional int32 f1 (INTEGER(eight,true)); + } + "; assert_eq!( - result, - Err(general_err!("Failed to parse bit_width for INTEGER type")) + parse(schema).unwrap_err().to_string(), + "Parquet error: Failed to parse bit_width for INTEGER type" ); // Valid types let schema = " - message root { - optional int32 f1 (INTEGER(8,false)); - optional int32 f2 (INTEGER(8,true)); - optional int32 f3 (INTEGER(16,false)); - optional int32 f4 (INTEGER(16,true)); - optional int32 f5 (INTEGER(32,false)); - optional int32 f6 (INTEGER(32,true)); - optional int64 f7 (INTEGER(64,false)); - optional int64 f7 (INTEGER(64,true)); - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); - assert!(result.is_ok()); + message root { + optional int32 f1 (INTEGER(8,false)); + optional int32 f2 (INTEGER(8,true)); + optional int32 f3 (INTEGER(16,false)); + optional int32 f4 (INTEGER(16,true)); + optional int32 f5 (INTEGER(32,false)); + optional int32 f6 (INTEGER(32,true)); + optional int64 f7 (INTEGER(64,false)); + optional int64 f7 (INTEGER(64,true)); + } + "; + parse(schema).unwrap(); } #[test] fn test_parse_message_type_temporal() { // Invalid timestamp syntax let schema = " - message root { - optional int64 f1 (TIMESTAMP(); - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); + message root { + optional int64 f1 (TIMESTAMP(); + } + "; assert_eq!( - result, - Err(general_err!("Failed to parse timeunit for TIMESTAMP type")) + parse(schema).unwrap_err().to_string(), + "Parquet error: Failed to parse timeunit for TIMESTAMP type" ); // Invalid timestamp syntax, needs both unit and UTC adjustment let schema = " - message root { - optional int64 f1 (TIMESTAMP(MILLIS,)); - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); + message root { + optional int64 f1 (TIMESTAMP(MILLIS,)); + } + "; assert_eq!( - result, - Err(general_err!( - "Failed to parse timezone info for TIMESTAMP type" - )) + parse(schema).unwrap_err().to_string(), + "Parquet error: Failed to parse timezone info for TIMESTAMP type" ); // Invalid timestamp because of unknown unit let schema = " - message root { - optional int64 f1 (TIMESTAMP(YOCTOS,)); - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); + message root { + optional int64 f1 (TIMESTAMP(YOCTOS,)); + } + "; + assert_eq!( - result, - Err(general_err!("Failed to parse timeunit for TIMESTAMP type")) + parse(schema).unwrap_err().to_string(), + "Parquet error: Failed to parse timeunit for TIMESTAMP type" ); // Valid types let schema = " - message root { - optional int32 f1 (DATE); - optional int32 f2 (TIME(MILLIS,true)); - optional 
int64 f3 (TIME(MICROS,false)); - optional int64 f4 (TIME(NANOS,true)); - optional int64 f5 (TIMESTAMP(MILLIS,true)); - optional int64 f6 (TIMESTAMP(MICROS,true)); - optional int64 f7 (TIMESTAMP(NANOS,false)); - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); - assert!(result.is_ok()); + message root { + optional int32 f1 (DATE); + optional int32 f2 (TIME(MILLIS,true)); + optional int64 f3 (TIME(MICROS,false)); + optional int64 f4 (TIME(NANOS,true)); + optional int64 f5 (TIMESTAMP(MILLIS,true)); + optional int64 f6 (TIMESTAMP(MICROS,true)); + optional int64 f7 (TIMESTAMP(NANOS,false)); + } + "; + parse(schema).unwrap(); } #[test] @@ -833,86 +781,68 @@ mod tests { // Invalid decimal syntax let schema = " - message root { - optional int32 f1 (DECIMAL(); - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); - assert!(result.is_err()); + message root { + optional int32 f1 (DECIMAL(); + } + "; + assert_eq!( + parse(schema).unwrap_err().to_string(), + "Parquet error: Failed to parse precision for DECIMAL type" + ); // Invalid decimal, need precision and scale let schema = " - message root { - optional int32 f1 (DECIMAL()); - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); - assert!(result.is_err()); + message root { + optional int32 f1 (DECIMAL()); + } + "; + assert_eq!( + parse(schema).unwrap_err().to_string(), + "Parquet error: Failed to parse precision for DECIMAL type" + ); // Invalid decimal because of `,` - has precision, needs scale let schema = " - message root { - optional int32 f1 (DECIMAL(8,)); - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); - assert!(result.is_err()); + message root { + optional int32 f1 (DECIMAL(8,)); + } + "; + assert_eq!( + parse(schema).unwrap_err().to_string(), + "Parquet error: Failed to parse scale for DECIMAL type" + ); // Invalid decimal because, we always require either precision or scale to be // specified as part of converted type let schema = " - message root { - optional int32 f3 (DECIMAL); - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); - assert!(result.is_err()); + message root { + optional int32 f3 (DECIMAL); + } + "; + assert_eq!( + parse(schema).unwrap_err().to_string(), + "Parquet error: Expected ')', found token ';'" + ); // Valid decimal (precision, scale) let schema = " - message root { - optional int32 f1 (DECIMAL(8, 3)); - optional int32 f2 (DECIMAL(8)); - } - "; - let mut iter = Tokenizer::from_str(schema); - let result = Parser { - tokenizer: &mut iter, - } - .parse_message_type(); - assert!(result.is_ok()); + message root { + optional int32 f1 (DECIMAL(8, 3)); + optional int32 f2 (DECIMAL(8)); + } + "; + parse(schema).unwrap(); } #[test] fn test_parse_message_type_compare_1() { let schema = " - message root { - optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3)); - optional fixed_len_byte_array (16) f2 (DECIMAL (38, 18)); - } - "; - let mut iter = Tokenizer::from_str(schema); - let message = Parser { - tokenizer: &mut iter, - } - .parse_message_type() - .unwrap(); + message root { + optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3)); + optional fixed_len_byte_array (16) f2 (DECIMAL (38, 18)); + } + "; + let message = 
parse(schema).unwrap(); let expected = Type::group_type_builder("root") .with_fields(&mut vec![ @@ -958,27 +888,22 @@ mod tests { #[test] fn test_parse_message_type_compare_2() { let schema = " - message root { - required group a0 { - optional group a1 (LIST) { - repeated binary a2 (UTF8); - } + message root { + required group a0 { + optional group a1 (LIST) { + repeated binary a2 (UTF8); + } - optional group b1 (LIST) { - repeated group b2 { - optional int32 b3; - optional double b4; - } - } - } - } - "; - let mut iter = Tokenizer::from_str(schema); - let message = Parser { - tokenizer: &mut iter, - } - .parse_message_type() - .unwrap(); + optional group b1 (LIST) { + repeated group b2 { + optional int32 b3; + optional double b4; + } + } + } + } + "; + let message = parse(schema).unwrap(); let expected = Type::group_type_builder("root") .with_fields(&mut vec![Arc::new( @@ -1048,21 +973,16 @@ mod tests { #[test] fn test_parse_message_type_compare_3() { let schema = " - message root { - required int32 _1 (INT_8); - required int32 _2 (INT_16); - required float _3; - required double _4; - optional int32 _5 (DATE); - optional binary _6 (UTF8); - } - "; - let mut iter = Tokenizer::from_str(schema); - let message = Parser { - tokenizer: &mut iter, - } - .parse_message_type() - .unwrap(); + message root { + required int32 _1 (INT_8); + required int32 _2 (INT_16); + required float _3; + required double _4; + optional int32 _5 (DATE); + optional binary _6 (UTF8); + } + "; + let message = parse(schema).unwrap(); let mut fields = vec![ Arc::new( @@ -1116,25 +1036,20 @@ mod tests { #[test] fn test_parse_message_type_compare_4() { let schema = " - message root { - required int32 _1 (INTEGER(8,true)); - required int32 _2 (INTEGER(16,false)); - required float _3; - required double _4; - optional int32 _5 (DATE); - optional int32 _6 (TIME(MILLIS,false)); - optional int64 _7 (TIME(MICROS,true)); - optional int64 _8 (TIMESTAMP(MILLIS,true)); - optional int64 _9 (TIMESTAMP(NANOS,false)); - optional binary _10 (STRING); - } - "; - let mut iter = Tokenizer::from_str(schema); - let message = Parser { - tokenizer: &mut iter, - } - .parse_message_type() - .unwrap(); + message root { + required int32 _1 (INTEGER(8,true)); + required int32 _2 (INTEGER(16,false)); + required float _3; + required double _4; + optional int32 _5 (DATE); + optional int32 _6 (TIME(MILLIS,false)); + optional int64 _7 (TIME(MICROS,true)); + optional int64 _8 (TIMESTAMP(MILLIS,true)); + optional int64 _9 (TIMESTAMP(NANOS,false)); + optional binary _10 (STRING); + } + "; + let message = parse(schema).unwrap(); let mut fields = vec![ Arc::new( From de381ec544462597d60847f58a9e80dd11e1039c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 23 Jan 2023 16:47:45 +0000 Subject: [PATCH 0536/1411] Use native types in PageIndex (#3575) (#3578) * Use native types in PageIndex (#3575) * Cleanup endianess * Format * Cleanup test * Add FixedLenByteArray test --- parquet/src/bin/parquet-index.rs | 6 +- parquet/src/data_type.rs | 59 +---------- parquet/src/encodings/rle.rs | 7 +- parquet/src/file/page_index/index.rs | 104 +------------------- parquet/src/file/page_index/index_reader.rs | 12 +-- parquet/src/file/serialized_reader.rs | 80 +++++++++++---- parquet/src/file/statistics.rs | 6 +- parquet/src/util/bit_util.rs | 102 ++++++++++++------- 8 files changed, 147 insertions(+), 229 deletions(-) diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs index 
485b31bed3de..a924ef373c02 100644 --- a/parquet/src/bin/parquet-index.rs +++ b/parquet/src/bin/parquet-index.rs @@ -132,7 +132,7 @@ fn compute_row_counts(offset_index: &[PageLocation], rows: i64) -> Vec { } /// Prints index information for a single column chunk -fn print_index( +fn print_index( column_index: &[PageIndex], offset_index: &[PageLocation], row_counts: &[i64], @@ -156,12 +156,12 @@ fn print_index( idx, o.offset, o.compressed_page_size, row_count ); match &c.min { - Some(m) => print!(", min {:>10?}", m), + Some(m) => print!(", min {:>10}", m), None => print!(", min {:>10}", "NONE"), } match &c.max { - Some(m) => print!(", max {:>10?}", m), + Some(m) => print!(", max {:>10}", m), None => print!(", max {:>10}", "NONE"), } println!() diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 3e423a41562a..5aff88e53402 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -27,10 +27,7 @@ use crate::basic::Type; use crate::column::reader::{ColumnReader, ColumnReaderImpl}; use crate::column::writer::{ColumnWriter, ColumnWriterImpl}; use crate::errors::{ParquetError, Result}; -use crate::util::{ - bit_util::{from_le_slice, from_ne_slice, FromBytes}, - memory::ByteBufferPtr, -}; +use crate::util::{bit_util::FromBytes, memory::ByteBufferPtr}; /// Rust representation for logical type INT96, value is backed by an array of `u32`. /// The type only takes 12 bytes, without extra padding. @@ -1226,60 +1223,6 @@ impl AsRef<[u8]> for FixedLenByteArray { } } -impl FromBytes for Int96 { - type Buffer = [u8; 12]; - fn from_le_bytes(bs: Self::Buffer) -> Self { - let mut i = Int96::new(); - i.set_data( - from_le_slice(&bs[0..4]), - from_le_slice(&bs[4..8]), - from_le_slice(&bs[8..12]), - ); - i - } - fn from_be_bytes(_bs: Self::Buffer) -> Self { - unimplemented!() - } - fn from_ne_bytes(bs: Self::Buffer) -> Self { - let mut i = Int96::new(); - i.set_data( - from_ne_slice(&bs[0..4]), - from_ne_slice(&bs[4..8]), - from_ne_slice(&bs[8..12]), - ); - i - } -} - -// FIXME Needed to satisfy the constraint of many decoding functions but ByteArray does not -// appear to actual be converted directly from bytes -impl FromBytes for ByteArray { - type Buffer = Vec; - fn from_le_bytes(bs: Self::Buffer) -> Self { - ByteArray::from(bs) - } - fn from_be_bytes(_bs: Self::Buffer) -> Self { - unreachable!() - } - fn from_ne_bytes(bs: Self::Buffer) -> Self { - ByteArray::from(bs) - } -} - -impl FromBytes for FixedLenByteArray { - type Buffer = Vec; - - fn from_le_bytes(bs: Self::Buffer) -> Self { - Self(ByteArray::from(bs)) - } - fn from_be_bytes(_bs: Self::Buffer) -> Self { - unreachable!() - } - fn from_ne_bytes(bs: Self::Buffer) -> Self { - Self(ByteArray::from(bs)) - } -} - /// Macro to reduce repetition in making type assertions on the physical type against `T` macro_rules! 
ensure_phys_ty { ($($ty:pat_param)|+ , $err: literal) => { diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 77b76d0e7e53..63ab15c73ead 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -18,8 +18,9 @@ use std::{cmp, mem::size_of}; use crate::errors::{ParquetError, Result}; +use crate::util::bit_util::from_le_slice; use crate::util::{ - bit_util::{self, from_ne_slice, BitReader, BitWriter, FromBytes}, + bit_util::{self, BitReader, BitWriter, FromBytes}, memory::ByteBufferPtr, }; @@ -349,7 +350,7 @@ impl RleDecoder { } let value = if self.rle_left > 0 { - let rle_value = from_ne_slice( + let rle_value = from_le_slice( &self .current_value .as_mut() @@ -381,7 +382,7 @@ impl RleDecoder { let num_values = cmp::min(buffer.len() - values_read, self.rle_left as usize); for i in 0..num_values { - let repeated_value = from_ne_slice( + let repeated_value = from_le_slice( &self.current_value.as_mut().unwrap().to_ne_bytes(), ); buffer[values_read + i] = repeated_value; diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 7adf2c08a9fd..83d55caa4ba9 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -17,7 +17,7 @@ use crate::basic::Type; use crate::data_type::private::ParquetValueType; -use crate::data_type::Int96; +use crate::data_type::{ByteArray, Int96}; use crate::errors::ParquetError; use crate::format::{BoundaryOrder, ColumnIndex}; use crate::util::bit_util::from_le_slice; @@ -53,14 +53,14 @@ pub enum Index { /// will only return pageLocations without min_max index, /// `NONE` represents this lack of index information NONE, - BOOLEAN(BooleanIndex), + BOOLEAN(NativeIndex), INT32(NativeIndex), INT64(NativeIndex), INT96(NativeIndex), FLOAT(NativeIndex), DOUBLE(NativeIndex), - BYTE_ARRAY(ByteArrayIndex), - FIXED_LEN_BYTE_ARRAY(ByteArrayIndex), + BYTE_ARRAY(NativeIndex), + FIXED_LEN_BYTE_ARRAY(NativeIndex), } impl Index { @@ -143,99 +143,3 @@ impl NativeIndex { }) } } - -/// An index of a column of bytes type -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct ByteArrayIndex { - /// The physical type - pub physical_type: Type, - /// The indexes, one item per page - pub indexes: Vec>>, - pub boundary_order: BoundaryOrder, -} - -impl ByteArrayIndex { - pub(crate) fn try_new( - index: ColumnIndex, - physical_type: Type, - ) -> Result { - let len = index.min_values.len(); - - let null_counts = index - .null_counts - .map(|x| x.into_iter().map(Some).collect::>()) - .unwrap_or_else(|| vec![None; len]); - - let indexes = index - .min_values - .into_iter() - .zip(index.max_values.into_iter()) - .zip(index.null_pages.into_iter()) - .zip(null_counts.into_iter()) - .map(|(((min, max), is_null), null_count)| { - let (min, max) = if is_null { - (None, None) - } else { - (Some(min), Some(max)) - }; - Ok(PageIndex { - min, - max, - null_count, - }) - }) - .collect::, ParquetError>>()?; - - Ok(Self { - physical_type, - indexes, - boundary_order: index.boundary_order, - }) - } -} - -/// An index of a column of boolean physical type -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct BooleanIndex { - /// The indexes, one item per page - pub indexes: Vec>, - pub boundary_order: BoundaryOrder, -} - -impl BooleanIndex { - pub(crate) fn try_new(index: ColumnIndex) -> Result { - let len = index.min_values.len(); - - let null_counts = index - .null_counts - .map(|x| x.into_iter().map(Some).collect::>()) - .unwrap_or_else(|| vec![None; len]); - - let indexes = index - 
.min_values - .into_iter() - .zip(index.max_values.into_iter()) - .zip(index.null_pages.into_iter()) - .zip(null_counts.into_iter()) - .map(|(((min, max), is_null), null_count)| { - let (min, max) = if is_null { - (None, None) - } else { - let min = min[0] != 0; - let max = max[0] == 1; - (Some(min), Some(max)) - }; - Ok(PageIndex { - min, - max, - null_count, - }) - }) - .collect::, ParquetError>>()?; - - Ok(Self { - indexes, - boundary_order: index.boundary_order, - }) - } -} diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index af23c0bd9f01..36b1c9d6c275 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -19,7 +19,7 @@ use crate::basic::Type; use crate::data_type::Int96; use crate::errors::ParquetError; use crate::file::metadata::ColumnChunkMetaData; -use crate::file::page_index::index::{BooleanIndex, ByteArrayIndex, Index, NativeIndex}; +use crate::file::page_index::index::{Index, NativeIndex}; use crate::file::reader::ChunkReader; use crate::format::{ColumnIndex, OffsetIndex, PageLocation}; use std::io::{Cursor, Read}; @@ -154,17 +154,17 @@ pub(crate) fn deserialize_column_index( let index = ColumnIndex::read_from_in_protocol(&mut prot)?; let index = match column_type { - Type::BOOLEAN => Index::BOOLEAN(BooleanIndex::try_new(index)?), + Type::BOOLEAN => { + Index::BOOLEAN(NativeIndex::::try_new(index, column_type)?) + } Type::INT32 => Index::INT32(NativeIndex::::try_new(index, column_type)?), Type::INT64 => Index::INT64(NativeIndex::::try_new(index, column_type)?), Type::INT96 => Index::INT96(NativeIndex::::try_new(index, column_type)?), Type::FLOAT => Index::FLOAT(NativeIndex::::try_new(index, column_type)?), Type::DOUBLE => Index::DOUBLE(NativeIndex::::try_new(index, column_type)?), - Type::BYTE_ARRAY => { - Index::BYTE_ARRAY(ByteArrayIndex::try_new(index, column_type)?) - } + Type::BYTE_ARRAY => Index::BYTE_ARRAY(NativeIndex::try_new(index, column_type)?), Type::FIXED_LEN_BYTE_ARRAY => { - Index::FIXED_LEN_BYTE_ARRAY(ByteArrayIndex::try_new(index, column_type)?) + Index::FIXED_LEN_BYTE_ARRAY(NativeIndex::try_new(index, column_type)?) 
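// A sketch of the simplification this buys downstream, assuming the public
// parquet::file::page_index::index API: every populated variant now wraps the
// same NativeIndex<T> shape, so generic handling no longer needs special cases
// for BYTE_ARRAY or BOOLEAN. The helper name is illustrative.
use parquet::file::page_index::index::Index;

fn page_count(index: &Index) -> usize {
    match index {
        Index::NONE => 0,
        Index::BOOLEAN(i) => i.indexes.len(),
        Index::INT32(i) => i.indexes.len(),
        Index::INT64(i) => i.indexes.len(),
        Index::INT96(i) => i.indexes.len(),
        Index::FLOAT(i) => i.indexes.len(),
        Index::DOUBLE(i) => i.indexes.len(),
        Index::BYTE_ARRAY(i) => i.indexes.len(),
        Index::FIXED_LEN_BYTE_ARRAY(i) => i.indexes.len(),
    }
}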
} }; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 84768aa23c88..8ee37352bdd7 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -831,7 +831,10 @@ mod tests { use crate::basic::{self, ColumnOrder}; use crate::data_type::private::ParquetValueType; - use crate::file::page_index::index::{ByteArrayIndex, Index, NativeIndex}; + use crate::data_type::{AsBytes, FixedLenByteArrayType}; + use crate::file::page_index::index::{Index, NativeIndex}; + use crate::file::properties::WriterProperties; + use crate::file::writer::SerializedFileWriter; use crate::record::RowAccessor; use crate::schema::parser::parse_message_type; use crate::util::bit_util::from_le_slice; @@ -1363,8 +1366,8 @@ mod tests { let page0 = &index_in_pages[0]; let min = page0.min.as_ref().unwrap(); let max = page0.max.as_ref().unwrap(); - assert_eq!("Hello", std::str::from_utf8(min.as_slice()).unwrap()); - assert_eq!("today", std::str::from_utf8(max.as_slice()).unwrap()); + assert_eq!(b"Hello", min.as_bytes()); + assert_eq!(b"today", max.as_bytes()); let offset_indexes = metadata.offset_indexes().unwrap(); // only one row group @@ -1502,7 +1505,7 @@ mod tests { //col9->date_string_col: BINARY UNCOMPRESSED DO:0 FPO:332847 SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 01/01/09, max: 12/31/10, num_nulls: 0] assert!(!&page_indexes[0][8].is_sorted()); if let Index::BYTE_ARRAY(index) = &page_indexes[0][8] { - check_bytes_page_index( + check_native_page_index( index, 974, get_row_group_min_max_bytes(row_group_metadata, 8), @@ -1515,7 +1518,7 @@ mod tests { //col10->string_col: BINARY UNCOMPRESSED DO:0 FPO:444795 SZ:45298/45298/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] assert!(&page_indexes[0][9].is_sorted()); if let Index::BYTE_ARRAY(index) = &page_indexes[0][9] { - check_bytes_page_index( + check_native_page_index( index, 352, get_row_group_min_max_bytes(row_group_metadata, 9), @@ -1575,20 +1578,6 @@ mod tests { }); } - fn check_bytes_page_index( - row_group_index: &ByteArrayIndex, - page_size: usize, - min_max: (&[u8], &[u8]), - boundary_order: BoundaryOrder, - ) { - assert_eq!(row_group_index.indexes.len(), page_size); - assert_eq!(row_group_index.boundary_order, boundary_order); - row_group_index.indexes.iter().all(|x| { - x.min.as_ref().unwrap().as_slice() >= min_max.0 - && x.max.as_ref().unwrap().as_slice() <= min_max.1 - }); - } - fn get_row_group_min_max_bytes( r: &RowGroupMetaData, col_num: usize, @@ -1742,4 +1731,57 @@ mod tests { assert_eq!(vec.len(), 352); } + + #[test] + fn test_fixed_length_index() { + let message_type = " + message test_schema { + OPTIONAL FIXED_LEN_BYTE_ARRAY (11) value (DECIMAL(25,2)); + } + "; + + let schema = parse_message_type(message_type).unwrap(); + let mut out = Vec::with_capacity(1024); + let mut writer = SerializedFileWriter::new( + &mut out, + Arc::new(schema), + Arc::new(WriterProperties::builder().build()), + ) + .unwrap(); + + let mut r = writer.next_row_group().unwrap(); + let mut c = r.next_column().unwrap().unwrap(); + c.typed::() + .write_batch( + &[vec![0; 11].into(), vec![5; 11].into(), vec![3; 11].into()], + Some(&[1, 1, 0, 1]), + None, + ) + .unwrap(); + c.close().unwrap(); + r.close().unwrap(); + writer.close().unwrap(); + + let b = Bytes::from(out); + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = SerializedFileReader::new_with_options(b, options).unwrap(); + let index = 
reader.metadata().page_indexes().unwrap(); + + // 1 row group + assert_eq!(index.len(), 1); + let c = &index[0]; + // 1 column + assert_eq!(c.len(), 1); + + match &c[0] { + Index::FIXED_LEN_BYTE_ARRAY(v) => { + assert_eq!(v.indexes.len(), 1); + let page_idx = &v.indexes[0]; + assert_eq!(page_idx.null_count.unwrap(), 1); + assert_eq!(page_idx.min.as_ref().unwrap().as_ref(), &[0; 11]); + assert_eq!(page_idx.max.as_ref().unwrap().as_ref(), &[5; 11]); + } + _ => unreachable!(), + } + } } diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 8eb04ffbc65c..76885fdbf7a5 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -44,7 +44,7 @@ use crate::format::Statistics as TStatistics; use crate::basic::Type; use crate::data_type::private::ParquetValueType; use crate::data_type::*; -use crate::util::bit_util::from_ne_slice; +use crate::util::bit_util::from_le_slice; pub(crate) mod private { use super::*; @@ -181,11 +181,11 @@ pub fn from_thrift( // min/max statistics for INT96 columns. let min = min.map(|data| { assert_eq!(data.len(), 12); - from_ne_slice::(&data) + from_le_slice::(&data) }); let max = max.map(|data| { assert_eq!(data.len(), 12); - from_ne_slice::(&data) + from_le_slice::(&data) }); Statistics::int96(min, max, distinct_count, null_count, old_format) } diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index cfbd521e9a7e..c229ea3da26e 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -17,37 +17,34 @@ use std::{cmp, mem::size_of}; -use crate::data_type::AsBytes; +use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray, Int96}; +use crate::errors::{ParquetError, Result}; use crate::util::bit_pack::{unpack16, unpack32, unpack64, unpack8}; use crate::util::memory::ByteBufferPtr; #[inline] -pub fn from_ne_slice(bs: &[u8]) -> T { - let mut b = T::Buffer::default(); - { - let b = b.as_mut(); - let bs = &bs[..b.len()]; - b.copy_from_slice(bs); - } - T::from_ne_bytes(b) +pub fn from_le_slice(bs: &[u8]) -> T { + // TODO: propagate the error (#3577) + T::try_from_le_slice(bs).unwrap() } #[inline] -pub fn from_le_slice(bs: &[u8]) -> T { - let mut b = T::Buffer::default(); - { - let b = b.as_mut(); - let bs = &bs[..b.len()]; - b.copy_from_slice(bs); +fn array_from_slice(bs: &[u8]) -> Result<[u8; N]> { + // Need to slice as may be called with zero-padded values + match bs.get(..N) { + Some(b) => Ok(b.try_into().unwrap()), + None => Err(general_err!( + "error converting value, expected {} bytes got {}", + N, + bs.len() + )), } - T::from_le_bytes(b) } pub trait FromBytes: Sized { type Buffer: AsMut<[u8]> + Default; + fn try_from_le_slice(b: &[u8]) -> Result; fn from_le_bytes(bs: Self::Buffer) -> Self; - fn from_be_bytes(bs: Self::Buffer) -> Self; - fn from_ne_bytes(bs: Self::Buffer) -> Self; } macro_rules! from_le_bytes { @@ -55,38 +52,69 @@ macro_rules! from_le_bytes { $( impl FromBytes for $ty { type Buffer = [u8; size_of::()]; + fn try_from_le_slice(b: &[u8]) -> Result { + Ok(Self::from_le_bytes(array_from_slice(b)?)) + } fn from_le_bytes(bs: Self::Buffer) -> Self { <$ty>::from_le_bytes(bs) } - fn from_be_bytes(bs: Self::Buffer) -> Self { - <$ty>::from_be_bytes(bs) - } - fn from_ne_bytes(bs: Self::Buffer) -> Self { - <$ty>::from_ne_bytes(bs) - } } )* }; } +from_le_bytes! 
{ u8, u16, u32, u64, i8, i16, i32, i64, f32, f64 } + impl FromBytes for bool { type Buffer = [u8; 1]; + + fn try_from_le_slice(b: &[u8]) -> Result { + Ok(Self::from_le_bytes(array_from_slice(b)?)) + } + fn from_le_bytes(bs: Self::Buffer) -> Self { + bs[0] != 0 + } +} + +impl FromBytes for Int96 { + type Buffer = [u8; 12]; + + fn try_from_le_slice(b: &[u8]) -> Result { + Ok(Self::from_le_bytes(array_from_slice(b)?)) + } + fn from_le_bytes(bs: Self::Buffer) -> Self { - Self::from_ne_bytes(bs) + let mut i = Int96::new(); + i.set_data( + from_le_slice(&bs[0..4]), + from_le_slice(&bs[4..8]), + from_le_slice(&bs[8..12]), + ); + i } - fn from_be_bytes(bs: Self::Buffer) -> Self { - Self::from_ne_bytes(bs) +} + +impl FromBytes for ByteArray { + type Buffer = Vec; + + fn try_from_le_slice(b: &[u8]) -> Result { + Ok(b.to_vec().into()) } - fn from_ne_bytes(bs: Self::Buffer) -> Self { - match bs[0] { - 0 => false, - 1 => true, - _ => panic!("Invalid byte when reading bool"), - } + fn from_le_bytes(bs: Self::Buffer) -> Self { + bs.into() } } -from_le_bytes! { u8, u16, u32, u64, i8, i16, i32, i64, f32, f64 } +impl FromBytes for FixedLenByteArray { + type Buffer = Vec; + + fn try_from_le_slice(b: &[u8]) -> Result { + Ok(b.to_vec().into()) + } + fn from_le_bytes(bs: Self::Buffer) -> Self { + bs.into() + } +} /// Reads `size` of bytes from `src`, and reinterprets them as type `ty`, in /// little-endian order. @@ -98,7 +126,7 @@ where assert!(size <= src.len()); let mut buffer = ::Buffer::default(); buffer.as_mut()[..size].copy_from_slice(&src[..size]); - ::from_ne_bytes(buffer) + ::from_le_bytes(buffer) } /// Returns the ceil of value/divisor. @@ -395,7 +423,7 @@ impl BitReader { } // TODO: better to avoid copying here - Some(from_ne_slice(v.as_bytes())) + Some(from_le_slice(v.as_bytes())) } /// Read multiple values from their packed representation where each element is represented @@ -973,7 +1001,7 @@ mod tests { // Generic values used to check against actual values read from `get_batch`. 
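// A std-only sketch of the zero-padding concern that array_from_slice guards
// against: min/max buffers may be longer than the target width, so only the
// little-endian prefix is decoded. The helper name is illustrative.
fn u32_from_le_prefix(bytes: &[u8]) -> Option<u32> {
    let prefix: [u8; 4] = bytes.get(..4)?.try_into().ok()?;
    Some(u32::from_le_bytes(prefix))
}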
let expected_values: Vec = - values.iter().map(|v| from_ne_slice(v.as_bytes())).collect(); + values.iter().map(|v| from_le_slice(v.as_bytes())).collect(); (0..total).for_each(|i| writer.put_value(values[i], num_bits)); From f1b1689dc54cfe40fd88992ea202ecefa8eda281 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 23 Jan 2023 20:30:17 +0000 Subject: [PATCH 0537/1411] Clear bits in BooleanBufferBuilder (#3587) (#3588) --- .../src/builder/boolean_buffer_builder.rs | 73 ++++++++++++++++++- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/builder/boolean_buffer_builder.rs b/arrow-array/src/builder/boolean_buffer_builder.rs index 7d86f74f6aae..a0fdea948356 100644 --- a/arrow-array/src/builder/boolean_buffer_builder.rs +++ b/arrow-array/src/builder/boolean_buffer_builder.rs @@ -38,7 +38,12 @@ impl BooleanBufferBuilder { /// Creates a new `BooleanBufferBuilder` from [`MutableBuffer`] of `len` pub fn new_from_buffer(buffer: MutableBuffer, len: usize) -> Self { assert!(len <= buffer.len() * 8); - Self { buffer, len } + let mut s = Self { + len: buffer.len() * 8, + buffer, + }; + s.truncate(len); + s } /// Returns the length of the buffer @@ -86,6 +91,26 @@ impl BooleanBufferBuilder { self.len = new_len; } + /// Truncates the builder to the given length + /// + /// If `len` is greater than the buffer's current length, this has no effect + #[inline] + pub fn truncate(&mut self, len: usize) { + if len > self.len { + return; + } + + let new_len_bytes = bit_util::ceil(len, 8); + self.buffer.truncate(new_len_bytes); + self.len = len; + + let remainder = self.len % 8; + if remainder != 0 { + let mask = (1_u8 << remainder).wrapping_sub(1); + *self.buffer.as_mut().last_mut().unwrap() &= mask; + } + } + /// Reserve space to at least `additional` new bits. /// Capacity will be `>= self.len() + additional`. /// New bytes are uninitialized and reading them is undefined behavior. @@ -103,9 +128,10 @@ impl BooleanBufferBuilder { /// growing it (potentially reallocating it) and writing `false` in the newly available bits. 
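// A usage sketch, assuming the BooleanBufferBuilder API in this module; the
// function name is illustrative. truncate clears the dropped bits in the last
// byte, so a later resize/advance grows back with false values.
fn truncate_then_resize_example() {
    let mut builder = BooleanBufferBuilder::new(8);
    builder.append_n(5, true); // len == 5, low five bits set
    builder.truncate(3); // len == 3, bits 3..5 cleared in the last byte
    builder.resize(6); // grows back to len 6, new bits are false
    assert_eq!(builder.len(), 6);
    assert_eq!(builder.as_slice(), &[0b00000111]);
}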
#[inline] pub fn resize(&mut self, len: usize) { - let len_bytes = bit_util::ceil(len, 8); - self.buffer.resize(len_bytes, 0); - self.len = len; + match len.checked_sub(self.len) { + Some(delta) => self.advance(delta), + None => self.truncate(len), + } } /// Appends a boolean `v` into the buffer @@ -383,6 +409,45 @@ mod tests { assert_eq!(builder.as_slice(), &[0b11101111, 0b00000001]); } + #[test] + fn test_truncate() { + let b = MutableBuffer::from_iter([true, true, true, true]); + let mut builder = BooleanBufferBuilder::new_from_buffer(b, 2); + builder.advance(2); + let finished = builder.finish(); + assert_eq!(finished.as_slice(), &[0b00000011]); + + let mut builder = BooleanBufferBuilder::new(10); + builder.append_n(5, true); + builder.resize(3); + builder.advance(2); + let finished = builder.finish(); + assert_eq!(finished.as_slice(), &[0b00000111]); + + let mut builder = BooleanBufferBuilder::new(10); + builder.append_n(16, true); + assert_eq!(builder.as_slice(), &[0xFF, 0xFF]); + builder.truncate(20); + assert_eq!(builder.as_slice(), &[0xFF, 0xFF]); + builder.truncate(14); + assert_eq!(builder.as_slice(), &[0xFF, 0b00111111]); + builder.append(false); + builder.append(true); + assert_eq!(builder.as_slice(), &[0xFF, 0b10111111]); + builder.append_packed_range(0..3, &[0xFF]); + assert_eq!(builder.as_slice(), &[0xFF, 0b10111111, 0b00000111]); + builder.truncate(17); + assert_eq!(builder.as_slice(), &[0xFF, 0b10111111, 0b00000001]); + builder.append_packed_range(0..2, &[2]); + assert_eq!(builder.as_slice(), &[0xFF, 0b10111111, 0b0000101]); + builder.truncate(8); + assert_eq!(builder.as_slice(), &[0xFF]); + builder.resize(14); + assert_eq!(builder.as_slice(), &[0xFF, 0x00]); + builder.truncate(0); + assert_eq!(builder.as_slice(), &[]); + } + #[test] fn test_boolean_builder_increases_buffer_len() { // 00000010 01001000 From b826657ac6031b3a0ce2148f7f148ad0ed324b80 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 23 Jan 2023 12:30:46 -0800 Subject: [PATCH 0538/1411] Iterate all dictionary key types in cast test (#3585) --- arrow/tests/array_cast.rs | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 91d2da9985b5..ff6fbad099cb 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -351,7 +351,7 @@ fn get_all_types() -> Vec { use DataType::*; let tz_name = String::from("America/New_York"); - vec![ + let mut types = vec![ Null, Boolean, Int8, @@ -409,19 +409,28 @@ fn get_all_types() -> Vec { vec![0, 1], UnionMode::Dense, ), - Dictionary(Box::new(DataType::Int8), Box::new(DataType::Int32)), - Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), - Dictionary(Box::new(DataType::Int16), Box::new(DataType::Binary)), - Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)), - Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Binary)), Decimal128(38, 0), - Dictionary(Box::new(DataType::Int8), Box::new(Decimal128(38, 0))), - Dictionary(Box::new(DataType::Int16), Box::new(Decimal128(38, 0))), - Dictionary(Box::new(DataType::UInt32), Box::new(Decimal128(38, 0))), - Dictionary(Box::new(DataType::Int8), Box::new(Decimal256(76, 0))), - Dictionary(Box::new(DataType::Int16), Box::new(Decimal256(76, 0))), - Dictionary(Box::new(DataType::UInt32), Box::new(Decimal256(76, 0))), - ] + ]; + + let dictionary_key_types = + vec![Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64]; + + let mut dictionary_types = dictionary_key_types + 
.into_iter() + .flat_map(|key_type| { + vec![ + Dictionary(Box::new(key_type.clone()), Box::new(Int32)), + Dictionary(Box::new(key_type.clone()), Box::new(Utf8)), + Dictionary(Box::new(key_type.clone()), Box::new(Binary)), + Dictionary(Box::new(key_type.clone()), Box::new(Decimal128(38, 0))), + Dictionary(Box::new(key_type), Box::new(Decimal256(76, 0))), + ] + }) + .into_iter() + .collect::>(); + + types.append(&mut dictionary_types); + types } #[test] From b35e179722f54d94866672e8086265679fc7d8f3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 24 Jan 2023 08:45:35 +0000 Subject: [PATCH 0539/1411] Fix nullif null count (#3579) (#3590) * Fix nullif null count (#3579) * Clippy --- arrow-select/src/nullif.rs | 87 +++++++++++++++++++++++++++++--------- 1 file changed, 67 insertions(+), 20 deletions(-) diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 23a586f63652..34876e948b9d 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -37,7 +37,7 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result Result ( buffer_bin_and( &right_data.buffers()[0], @@ -68,27 +68,26 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { - bitwise_bin_op_helper(left, left_offset, &right, right_offset, len, |l, r| { - let t = l & !r; - valid_count += t.count_ones() as usize; - t - }) + let mut valid_count = 0; + let b = + bitwise_bin_op_helper(left, l_offset, &right, r_offset, len, |l, r| { + let t = l & !r; + valid_count += t.count_ones() as usize; + t + }); + (b, len - valid_count) } None => { - let buffer = bitwise_unary_op_helper(&right, right_offset, len, |b| { + let mut null_count = 0; + let buffer = bitwise_unary_op_helper(&right, r_offset, len, |b| { let t = !b; - valid_count += t.count_ones() as usize; + null_count += t.count_zeros() as usize; t }); - // We need to compensate for the additional bits read from the end - let remainder_len = len % 64; - if remainder_len != 0 { - valid_count -= 64 - remainder_len - } - buffer + (buffer, null_count) } }; @@ -96,15 +95,14 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result combined, _ => { - let mut builder = BooleanBufferBuilder::new(len + left_offset); + let mut builder = BooleanBufferBuilder::new(len + l_offset); // Pad with 0s up to offset - builder.resize(left_offset); + builder.resize(l_offset); builder.append_packed_range(0..len, &combined); builder.finish() } }; - let null_count = len - valid_count; let data = left_data .clone() .into_builder() @@ -125,6 +123,7 @@ mod tests { use arrow_array::{Int32Array, StringArray, StructArray}; use arrow_data::ArrayData; use arrow_schema::{DataType, Field}; + use rand::{thread_rng, Rng}; #[test] fn test_nullif_int_array() { @@ -464,4 +463,52 @@ mod tests { let res = nullif(&a, &mask).unwrap(); assert_eq!(res.as_ref(), &a); } + + fn test_nullif(values: &Int32Array, filter: &BooleanArray) { + let expected: Int32Array = values + .iter() + .zip(filter.iter()) + .map(|(a, b)| match b { + Some(true) => None, + Some(false) | None => a, + }) + .collect(); + + let r = nullif(values, filter).unwrap(); + r.data().validate().unwrap(); + + assert_eq!(expected.data(), r.data()); + } + + #[test] + fn nullif_fuzz() { + let mut rng = thread_rng(); + + let arrays = [ + Int32Array::from(vec![0; 128]), + (0..128).map(|_| rng.gen_bool(0.5).then_some(0)).collect(), + ]; + + for a in arrays { + let a_slices = [(0, 128), (64, 64), (0, 64), (32, 32), (0, 0), (32, 0)]; + + for (a_offset, a_length) 
in a_slices { + let a = a.slice(a_offset, a_length); + let a = as_primitive_array::(a.as_ref()); + + for i in 1..65 { + let b_start_offset = rng.gen_range(0..i); + let b_end_offset = rng.gen_range(0..i); + + let b: BooleanArray = (0..a_length + b_start_offset + b_end_offset) + .map(|_| rng.gen_bool(0.5).then(|| rng.gen_bool(0.5))) + .collect(); + let b = b.slice(b_start_offset, a_length); + let b = as_boolean_array(b.as_ref()); + + test_nullif(a, b); + } + } + } + } } From 025ffd0b226fcaca24ff1e9c64c3189c0926f587 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 24 Jan 2023 15:33:18 +0000 Subject: [PATCH 0540/1411] Faster BooleanBufferBuilder::append_n for true values (#3596) --- .../src/builder/boolean_buffer_builder.rs | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/builder/boolean_buffer_builder.rs b/arrow-array/src/builder/boolean_buffer_builder.rs index a0fdea948356..ac2a96feade0 100644 --- a/arrow-array/src/builder/boolean_buffer_builder.rs +++ b/arrow-array/src/builder/boolean_buffer_builder.rs @@ -146,12 +146,27 @@ impl BooleanBufferBuilder { /// Appends n `additional` bits of value `v` into the buffer #[inline] pub fn append_n(&mut self, additional: usize, v: bool) { - self.advance(additional); - if additional > 0 && v { - let offset = self.len() - additional; - (0..additional).for_each(|i| unsafe { - bit_util::set_bit_raw(self.buffer.as_mut_ptr(), offset + i) - }) + match v { + true => { + let new_len = self.len + additional; + let new_len_bytes = bit_util::ceil(new_len, 8); + let cur_remainder = self.len % 8; + let new_remainder = new_len % 8; + + if cur_remainder != 0 { + // Pad last byte with 1s + *self.buffer.as_slice_mut().last_mut().unwrap() |= + !((1 << cur_remainder) - 1) + } + self.buffer.resize(new_len_bytes, 0xFF); + if new_remainder != 0 { + // Clear remaining bits + *self.buffer.as_slice_mut().last_mut().unwrap() &= + (1 << new_remainder) - 1 + } + self.len = new_len; + } + false => self.advance(additional), } } From f20cba5d05bc828fa16a9b795581d7bd6be82e69 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 24 Jan 2023 17:56:16 +0000 Subject: [PATCH 0541/1411] Add conversion from StringArray to BinaryArray (#3592) --- arrow-array/src/array/binary_array.rs | 32 ++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 3a30d748ee3a..cb863c563584 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -15,8 +15,10 @@ // specific language governing permissions and limitations // under the License. 
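// A usage sketch, assuming the arrow_array public re-exports; the function
// name is illustrative. The conversion reuses the offsets and value buffer of
// the string array, so no bytes are copied and only the data type changes.
use arrow_array::{BinaryArray, StringArray};

fn string_to_binary_example() {
    let s = StringArray::from(vec!["hello", "arrow"]);
    let b = BinaryArray::from(s);
    assert_eq!(b.value(0), b"hello");
}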
-use crate::types::GenericBinaryType; -use crate::{Array, GenericByteArray, GenericListArray, OffsetSizeTrait}; +use crate::types::{ByteArrayType, GenericBinaryType}; +use crate::{ + Array, GenericByteArray, GenericListArray, GenericStringArray, OffsetSizeTrait, +}; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; @@ -157,6 +159,21 @@ impl From> for GenericBinaryArray { } } +impl From> + for GenericBinaryArray +{ + fn from(value: GenericStringArray) -> Self { + let builder = value + .into_data() + .into_builder() + .data_type(GenericBinaryType::::DATA_TYPE); + + // Safety: + // A StringArray is a valid BinaryArray + Self::from(unsafe { builder.build_unchecked() }) + } +} + impl FromIterator> for GenericBinaryArray where @@ -283,7 +300,7 @@ pub type LargeBinaryArray = GenericBinaryArray; #[cfg(test)] mod tests { use super::*; - use crate::ListArray; + use crate::{ListArray, StringArray}; use arrow_schema::Field; #[test] @@ -697,4 +714,13 @@ mod tests { assert_eq!(string.len(), 0); assert_eq!(string.value_offsets(), &[0]); } + + #[test] + fn test_to_from_string() { + let s = StringArray::from_iter_values(["a", "b", "c", "d"]); + let b = BinaryArray::from(s.clone()); + let sa = StringArray::from(b); // Performs UTF-8 validation again + + assert_eq!(s, sa); + } } From d938cd9bf621c9bf3c3bd9e33c4c0a5eb4060534 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 24 Jan 2023 17:56:37 +0000 Subject: [PATCH 0542/1411] Update to flatbuffers 23.1.21 (#3597) --- arrow-ipc/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index eb39c8ed45ab..d2274e8956e9 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -43,7 +43,7 @@ arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } arrow-cast = { version = "31.0.0", path = "../arrow-cast" } arrow-data = { version = "31.0.0", path = "../arrow-data" } arrow-schema = { version = "31.0.0", path = "../arrow-schema" } -flatbuffers = { version = "22.9.2", default-features = false, features = ["thiserror"] } +flatbuffers = { version = "23.1.21", default-features = false } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.12.0", default-features = false, optional = true } From bf21ad91b0603dc935158b40c4f940c24be32f82 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 25 Jan 2023 00:12:17 -0800 Subject: [PATCH 0543/1411] Enable casting between Utf8/LargeUtf8 and Binary/LargeBinary (#3542) * Enable casting between Utf8/LargeUtf8 and Binary/LargeBinary * For review * Add native bound restrict * Use From for Utf8 -> Binary and LargeUtf8 -> LargeBinary. * Restrict the input and output native types to be the same. 
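A minimal usage sketch of the new cast paths, assuming the kernel re-exported as arrow::compute::cast (the function name below is illustrative): Utf8 can now be cast to Binary and LargeBinary, and Binary/LargeBinary can be converted into each other by rewriting only the offsets.

use std::sync::Arc;
use arrow::array::{ArrayRef, StringArray};
use arrow::compute::cast;
use arrow::datatypes::DataType;

fn utf8_binary_casts() {
    let strings: ArrayRef = Arc::new(StringArray::from(vec!["a", "bc"]));
    // Utf8 -> Binary reuses the underlying offsets and value buffer
    let binary = cast(&strings, &DataType::Binary).unwrap();
    assert_eq!(binary.data_type(), &DataType::Binary);
    // Utf8 -> LargeBinary additionally widens the i32 offsets to i64
    let large = cast(&strings, &DataType::LargeBinary).unwrap();
    assert_eq!(large.data_type(), &DataType::LargeBinary);
}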
--- arrow-cast/src/cast.rs | 151 +++++++++++++++++++++----------------- arrow/tests/array_cast.rs | 2 + 2 files changed, 84 insertions(+), 69 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index c54761840167..c60e660378aa 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -151,13 +151,16 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (_, Decimal256(_, _)) => false, (Struct(_), _) => false, (_, Struct(_)) => false, - (_, Boolean) => DataType::is_numeric(from_type) || from_type == &Utf8, - (Boolean, _) => DataType::is_numeric(to_type) || to_type == &Utf8, + (_, Boolean) => DataType::is_numeric(from_type) || from_type == &Utf8 || from_type == &LargeUtf8, + (Boolean, _) => DataType::is_numeric(to_type) || to_type == &Utf8 || to_type == &LargeUtf8, (Utf8, LargeUtf8) => true, (LargeUtf8, Utf8) => true, + (Binary, LargeBinary) => true, + (LargeBinary, Binary) => true, (Utf8, Binary + | LargeBinary | Date32 | Date64 | Time32(TimeUnit::Second) @@ -168,7 +171,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { ) => true, (Utf8, _) => DataType::is_numeric(to_type) && to_type != &Float16, (LargeUtf8, - LargeBinary + Binary + | LargeBinary | Date32 | Date64 | Time32(TimeUnit::Second) @@ -1075,7 +1079,8 @@ pub fn cast_with_options( Float16 => cast_numeric_to_bool::(array), Float32 => cast_numeric_to_bool::(array), Float64 => cast_numeric_to_bool::(array), - Utf8 => cast_utf8_to_boolean(array, cast_options), + Utf8 => cast_utf8_to_boolean::(array, cast_options), + LargeUtf8 => cast_utf8_to_boolean::(array, cast_options), _ => Err(ArrowError::CastError(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, @@ -1102,13 +1107,22 @@ pub fn cast_with_options( .collect::(), )) } + LargeUtf8 => { + let array = array.as_any().downcast_ref::().unwrap(); + Ok(Arc::new( + array + .iter() + .map(|value| value.map(|value| if value { "1" } else { "0" })) + .collect::(), + )) + } _ => Err(ArrowError::CastError(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, ))), }, (Utf8, _) => match to_type { - LargeUtf8 => cast_str_container::(&**array), + LargeUtf8 => cast_byte_container::(&**array), UInt8 => cast_string_to_numeric::(array, cast_options), UInt16 => cast_string_to_numeric::(array, cast_options), UInt32 => cast_string_to_numeric::(array, cast_options), @@ -1121,7 +1135,11 @@ pub fn cast_with_options( Float64 => cast_string_to_numeric::(array, cast_options), Date32 => cast_string_to_date32::(&**array, cast_options), Date64 => cast_string_to_date64::(&**array, cast_options), - Binary => cast_string_to_binary(array), + Binary => Ok(Arc::new(BinaryArray::from(as_string_array(array).clone()))), + LargeBinary => { + let binary = BinaryArray::from(as_string_array(array).clone()); + cast_byte_container::(&binary) + } Time32(TimeUnit::Second) => { cast_string_to_time32second::(&**array, cast_options) } @@ -1143,7 +1161,7 @@ pub fn cast_with_options( ))), }, (_, Utf8) => match from_type { - LargeUtf8 => cast_str_container::(&**array), + LargeUtf8 => cast_byte_container::(&**array), UInt8 => cast_numeric_to_string::(array), UInt16 => cast_numeric_to_string::(array), UInt32 => cast_numeric_to_string::(array), @@ -1270,7 +1288,14 @@ pub fn cast_with_options( Float64 => cast_string_to_numeric::(array, cast_options), Date32 => cast_string_to_date32::(&**array, cast_options), Date64 => cast_string_to_date64::(&**array, cast_options), - LargeBinary => cast_string_to_binary(array), + Binary => { 
+ let large_binary = + LargeBinaryArray::from(as_largestring_array(array).clone()); + cast_byte_container::(&large_binary) + } + LargeBinary => Ok(Arc::new(LargeBinaryArray::from( + as_largestring_array(array).clone(), + ))), Time32(TimeUnit::Second) => { cast_string_to_time32second::(&**array, cast_options) } @@ -1291,7 +1316,22 @@ pub fn cast_with_options( from_type, to_type, ))), }, - + (Binary, _) => match to_type { + LargeBinary => { + cast_byte_container::(&**array) + } + _ => Err(ArrowError::CastError(format!( + "Casting from {:?} to {:?} not supported", + from_type, to_type, + ))), + }, + (LargeBinary, _) => match to_type { + Binary => cast_byte_container::(&**array), + _ => Err(ArrowError::CastError(format!( + "Casting from {:?} to {:?} not supported", + from_type, to_type, + ))), + }, // start numeric casts (UInt8, UInt16) => { cast_numeric_arrays::(array, cast_options) @@ -2007,41 +2047,6 @@ pub fn cast_with_options( } } -/// Cast to string array to binary array -fn cast_string_to_binary(array: &ArrayRef) -> Result { - let from_type = array.data_type(); - match *from_type { - DataType::Utf8 => { - let data = unsafe { - array - .data() - .clone() - .into_builder() - .data_type(DataType::Binary) - .build_unchecked() - }; - - Ok(Arc::new(BinaryArray::from(data)) as ArrayRef) - } - DataType::LargeUtf8 => { - let data = unsafe { - array - .data() - .clone() - .into_builder() - .data_type(DataType::LargeBinary) - .build_unchecked() - }; - - Ok(Arc::new(LargeBinaryArray::from(data)) as ArrayRef) - } - _ => Err(ArrowError::InvalidArgumentError(format!( - "{:?} cannot be converted to binary array", - from_type - ))), - } -} - /// Get the time unit as a multiple of a second const fn time_unit_multiple(unit: &TimeUnit) -> i64 { match unit { @@ -2843,11 +2848,17 @@ fn cast_string_to_timestamp_ns( } /// Casts Utf8 to Boolean -fn cast_utf8_to_boolean( +fn cast_utf8_to_boolean( from: &ArrayRef, cast_options: &CastOptions, -) -> Result { - let array = as_string_array(from); +) -> Result +where + OffsetSize: OffsetSizeTrait, +{ + let array = from + .as_any() + .downcast_ref::>() + .unwrap(); let output_array = array .iter() @@ -2861,7 +2872,7 @@ fn cast_utf8_to_boolean( invalid_value => match cast_options.safe { true => Ok(None), false => Err(ArrowError::CastError(format!( - "Cannot cast string '{}' to value of Boolean type", + "Cannot cast value '{}' to value of Boolean type", invalid_value, ))), }, @@ -3447,39 +3458,43 @@ fn cast_list_inner( Ok(Arc::new(list) as ArrayRef) } -/// Helper function to cast from `Utf8` to `LargeUtf8` and vice versa. If the `LargeUtf8` is too large for -/// a `Utf8` array it will return an Error. -fn cast_str_container( +/// Helper function to cast from one `ByteArrayType` to another and vice versa. +/// If the target one (e.g., `LargeUtf8`) is too large for the source array it will return an Error. 
+fn cast_byte_container( array: &dyn Array, ) -> Result where - OffsetSizeFrom: OffsetSizeTrait + ToPrimitive, - OffsetSizeTo: OffsetSizeTrait + NumCast + ArrowNativeType, + FROM: ByteArrayType, + TO: ByteArrayType, + FROM::Offset: OffsetSizeTrait + ToPrimitive, + TO::Offset: OffsetSizeTrait + NumCast, { let data = array.data(); - assert_eq!( - data.data_type(), - &GenericStringArray::::DATA_TYPE - ); + assert_eq!(data.data_type(), &FROM::DATA_TYPE); let str_values_buf = data.buffers()[1].clone(); - let offsets = data.buffers()[0].typed_data::(); + let offsets = data.buffers()[0].typed_data::(); - let mut offset_builder = BufferBuilder::::new(offsets.len()); + let mut offset_builder = BufferBuilder::::new(offsets.len()); offsets .iter() .try_for_each::<_, Result<_, ArrowError>>(|offset| { - let offset = OffsetSizeTo::from(*offset).ok_or_else(|| { - ArrowError::ComputeError( - "large-utf8 array too large to cast to utf8-array".into(), - ) - })?; + let offset = <::Offset as NumCast>::from(*offset) + .ok_or_else(|| { + ArrowError::ComputeError(format!( + "{}{} array too large to cast to {}{} array", + FROM::Offset::PREFIX, + FROM::PREFIX, + TO::Offset::PREFIX, + TO::PREFIX + )) + })?; offset_builder.append(offset); Ok(()) })?; let offset_buffer = offset_builder.finish(); - let dtype = GenericStringArray::::DATA_TYPE; + let dtype = TO::DATA_TYPE; let builder = ArrayData::builder(dtype) .offset(array.offset()) @@ -3490,9 +3505,7 @@ where let array_data = unsafe { builder.build_unchecked() }; - Ok(Arc::new(GenericStringArray::::from( - array_data, - ))) + Ok(Arc::new(GenericByteArray::::from(array_data))) } /// Cast the container type of List/Largelist array but not the inner types. @@ -4813,7 +4826,7 @@ mod tests { Ok(_) => panic!("expected error"), Err(e) => { assert!(e.to_string().contains( - "Cast error: Cannot cast string 'invalid' to value of Boolean type" + "Cast error: Cannot cast value 'invalid' to value of Boolean type" )) } } diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index ff6fbad099cb..ae73b1b4200b 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -421,7 +421,9 @@ fn get_all_types() -> Vec { vec![ Dictionary(Box::new(key_type.clone()), Box::new(Int32)), Dictionary(Box::new(key_type.clone()), Box::new(Utf8)), + Dictionary(Box::new(key_type.clone()), Box::new(LargeUtf8)), Dictionary(Box::new(key_type.clone()), Box::new(Binary)), + Dictionary(Box::new(key_type.clone()), Box::new(LargeBinary)), Dictionary(Box::new(key_type.clone()), Box::new(Decimal128(38, 0))), Dictionary(Box::new(key_type), Box::new(Decimal256(76, 0))), ] From 98d35d3e4351e12fcd6d882a8cb8670c90c770f8 Mon Sep 17 00:00:00 2001 From: Daniel Poelzleithner Date: Wed, 25 Jan 2023 10:46:19 +0100 Subject: [PATCH 0544/1411] Add ClientOption.allow_insecure (#3600) * Add ClientOption.allow_insecure Add option to allow insecure https connections. In local isolated test environments, it is normal to use self signed, local certificates for automated integration testing. 
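A sketch of the intended test-only usage, assuming the existing ClientOptions builder methods (the function name is illustrative); the resulting options would then be passed to the relevant store builder:

use object_store::ClientOptions;

fn integration_test_client_options() -> ClientOptions {
    // Plain HTTP and self-signed certificates are acceptable only against a
    // local integration-test endpoint, never in production.
    ClientOptions::new()
        .with_allow_http(true)
        .with_allow_invalid_certificates(true)
}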
* clarify with_allow_invalid_certificates Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/client/mod.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index f07377e98995..d019e8119ac2 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -52,6 +52,7 @@ pub struct ClientOptions { default_headers: Option, proxy_url: Option, allow_http: bool, + allow_insecure: bool, timeout: Option, connect_timeout: Option, pool_idle_timeout: Option, @@ -106,6 +107,21 @@ impl ClientOptions { self.allow_http = allow_http; self } + /// Allows connections to invalid SSL certificates + /// * false (default): Only valid HTTPS certificates are allowed + /// * true: All HTTPS certificates are allowed + /// + /// # Warning + /// + /// You should think very carefully before using this method. If + /// invalid certificates are trusted, *any* certificate for *any* site + /// will be trusted for use. This includes expired certificates. This + /// introduces significant vulnerabilities, and should only be used + /// as a last resort or for testing + pub fn with_allow_invalid_certificates(mut self, allow_insecure: bool) -> Self { + self.allow_insecure = allow_insecure; + self + } /// Only use http1 connections pub fn with_http1_only(mut self) -> Self { @@ -259,6 +275,10 @@ impl ClientOptions { builder = builder.http2_prior_knowledge() } + if self.allow_insecure { + builder = builder.danger_accept_invalid_certs(self.allow_insecure) + } + builder .https_only(!self.allow_http) .build() From f0be9da82cbd76da3042b426daf6c424c9560d93 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Wed, 25 Jan 2023 09:04:25 -0500 Subject: [PATCH 0545/1411] feat: Add `RunEndEncodedArray` (#3553) * Add `RunEndEncodedArray` * fix doctest and clippy issues * fix doc issues * fix doc issue * add validation for run_ends array and corresponding tests * PR comments * seal ArrowRunEndIndexType per PR suggestion * Fix PR suggestions * few more PR coments * run array name change * fix doc issues * doc change * lint fix * make append methods infallible * fix array.len and other minor changes * formatting fix * add validation of array len * fmt fix * PR comment and some documentation changes * pr suggestion * empty commit Co-authored-by: ask --- arrow-array/src/array/mod.rs | 18 + arrow-array/src/array/run_array.rs | 507 +++++++++++++++++ .../src/builder/generic_byte_run_builder.rs | 519 ++++++++++++++++++ arrow-array/src/builder/mod.rs | 4 + .../src/builder/primitive_run_builder.rs | 294 ++++++++++ arrow-array/src/types.rs | 25 + arrow-data/src/data.rs | 98 +++- arrow-data/src/equal/mod.rs | 1 + arrow-data/src/transform/mod.rs | 16 + arrow-integration-test/src/datatype.rs | 1 + arrow-ipc/src/convert.rs | 1 + arrow-schema/src/datatype.rs | 23 + arrow-schema/src/error.rs | 4 + arrow-schema/src/field.rs | 1 + parquet/src/arrow/arrow_writer/mod.rs | 2 +- parquet/src/arrow/schema/mod.rs | 1 + 16 files changed, 1511 insertions(+), 4 deletions(-) create mode 100644 arrow-array/src/array/run_array.rs create mode 100644 arrow-array/src/builder/generic_byte_run_builder.rs create mode 100644 arrow-array/src/builder/primitive_run_builder.rs diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 1e17e35d0f6d..69f6ba4d8de1 100644 --- 
a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -64,6 +64,9 @@ pub use struct_array::*; mod union_array; pub use union_array::*; +mod run_array; +pub use run_array::*; + /// Trait for dealing with different types of array at runtime when the type of the /// array is not known in advance. pub trait Array: std::fmt::Debug + Send + Sync { @@ -579,6 +582,20 @@ pub fn make_array(data: ArrayData) -> ArrayRef { } dt => panic!("Unexpected dictionary key type {:?}", dt), }, + DataType::RunEndEncoded(ref run_ends_type, _) => { + match run_ends_type.data_type() { + DataType::Int16 => { + Arc::new(RunArray::::from(data)) as ArrayRef + } + DataType::Int32 => { + Arc::new(RunArray::::from(data)) as ArrayRef + } + DataType::Int64 => { + Arc::new(RunArray::::from(data)) as ArrayRef + } + dt => panic!("Unexpected data type for run_ends array {:?}", dt), + } + } DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef, DataType::Decimal128(_, _) => Arc::new(Decimal128Array::from(data)) as ArrayRef, DataType::Decimal256(_, _) => Arc::new(Decimal256Array::from(data)) as ArrayRef, @@ -737,6 +754,7 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { new_null_sized_decimal(data_type, length, std::mem::size_of::()) } DataType::Decimal256(_, _) => new_null_sized_decimal(data_type, length, 32), + DataType::RunEndEncoded(_, _) => todo!(), } } diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs new file mode 100644 index 000000000000..0e39cd288340 --- /dev/null +++ b/arrow-array/src/array/run_array.rs @@ -0,0 +1,507 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; + +use arrow_buffer::ArrowNativeType; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType, Field}; + +use crate::{ + builder::StringRunBuilder, + make_array, + types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}, + Array, ArrayRef, PrimitiveArray, +}; + +/// +/// A run-end encoding (REE) is a variation of [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding). +/// +/// This encoding is good for representing data containing same values repeated consecutively. +/// +/// [`RunArray`] contains `run_ends` array and `values` array of same length. +/// The `run_ends` array stores the indexes at which the run ends. The `values` array +/// stores the value of each run. 
Below example illustrates how a logical array is represented in +/// [`RunArray`] +/// +/// +/// ```text +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┐ +/// ┌─────────────────┐ ┌─────────┐ ┌─────────────────┐ +/// │ │ A │ │ 2 │ │ │ A │ +/// ├─────────────────┤ ├─────────┤ ├─────────────────┤ +/// │ │ D │ │ 3 │ │ │ A │ run length of 'A' = runs_ends[0] - 0 = 2 +/// ├─────────────────┤ ├─────────┤ ├─────────────────┤ +/// │ │ B │ │ 6 │ │ │ D │ run length of 'D' = run_ends[1] - run_ends[0] = 1 +/// └─────────────────┘ └─────────┘ ├─────────────────┤ +/// │ values run_ends │ │ B │ +/// ├─────────────────┤ +/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┘ │ B │ +/// ├─────────────────┤ +/// RunArray │ B │ run length of 'B' = run_ends[2] - run_ends[1] = 3 +/// length = 3 └─────────────────┘ +/// +/// Logical array +/// Contents +/// ``` + +pub struct RunArray { + data: ArrayData, + run_ends: PrimitiveArray, + values: ArrayRef, +} + +impl RunArray { + // calculates the logical length of the array encoded + // by the given run_ends array. + fn logical_len(run_ends: &PrimitiveArray) -> usize { + let len = run_ends.len(); + if len == 0 { + return 0; + } + run_ends.value(len - 1).as_usize() + } + + /// Attempts to create RunArray using given run_ends (index where a run ends) + /// and the values (value of the run). Returns an error if the given data is not compatible + /// with RunEndEncoded specification. + pub fn try_new( + run_ends: &PrimitiveArray, + values: &dyn Array, + ) -> Result { + let run_ends_type = run_ends.data_type().clone(); + let values_type = values.data_type().clone(); + let ree_array_type = DataType::RunEndEncoded( + Box::new(Field::new("run_ends", run_ends_type, false)), + Box::new(Field::new("values", values_type, true)), + ); + let len = RunArray::logical_len(run_ends); + let builder = ArrayDataBuilder::new(ree_array_type) + .len(len) + .add_child_data(run_ends.data().clone()) + .add_child_data(values.data().clone()); + + // `build_unchecked` is used to avoid recursive validation of child arrays. + let array_data = unsafe { builder.build_unchecked() }; + + // Safety: `validate_data` checks below + // 1. The given array data has exactly two child arrays. + // 2. The first child array (run_ends) has valid data type. + // 3. run_ends array does not have null values + // 4. run_ends array has non-zero and strictly increasing values. + // 5. The length of run_ends array and values array are the same. + array_data.validate_data()?; + + Ok(array_data.into()) + } + + /// Returns a reference to run_ends array + /// + /// Note: any slicing of this array is not applied to the returned array + /// and must be handled separately + pub fn run_ends(&self) -> &PrimitiveArray { + &self.run_ends + } + + /// Returns a reference to values array + pub fn values(&self) -> &ArrayRef { + &self.values + } +} + +impl From for RunArray { + // The method assumes the caller already validated the data using `ArrayData::validate_data()` + fn from(data: ArrayData) -> Self { + match data.data_type() { + DataType::RunEndEncoded(_, _) => {} + _ => { + panic!("Invalid data type for RunArray. 
The data type should be DataType::RunEndEncoded"); + } + } + + let run_ends = PrimitiveArray::::from(data.child_data()[0].clone()); + let values = make_array(data.child_data()[1].clone()); + Self { + data, + run_ends, + values, + } + } +} + +impl From> for ArrayData { + fn from(array: RunArray) -> Self { + array.data + } +} + +impl Array for RunArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn data(&self) -> &ArrayData { + &self.data + } + + fn into_data(self) -> ArrayData { + self.into() + } +} + +impl std::fmt::Debug for RunArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + writeln!( + f, + "RunArray {{run_ends: {:?}, values: {:?}}}", + self.run_ends, self.values + ) + } +} + +/// Constructs a `RunArray` from an iterator of optional strings. +/// +/// # Example: +/// ``` +/// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type}; +/// +/// let test = vec!["a", "a", "b", "c", "c"]; +/// let array: RunArray = test +/// .iter() +/// .map(|&x| if x == "b" { None } else { Some(x) }) +/// .collect(); +/// assert_eq!( +/// "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 5,\n], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", +/// format!("{:?}", array) +/// ); +/// ``` +impl<'a, T: RunEndIndexType> FromIterator> for RunArray { + fn from_iter>>(iter: I) -> Self { + let it = iter.into_iter(); + let (lower, _) = it.size_hint(); + let mut builder = StringRunBuilder::with_capacity(lower, 256); + it.for_each(|i| { + builder.append_option(i); + }); + + builder.finish() + } +} + +/// Constructs a `RunArray` from an iterator of strings. +/// +/// # Example: +/// +/// ``` +/// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type}; +/// +/// let test = vec!["a", "a", "b", "c"]; +/// let array: RunArray = test.into_iter().collect(); +/// assert_eq!( +/// "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", +/// format!("{:?}", array) +/// ); +/// ``` +impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunArray { + fn from_iter>(iter: I) -> Self { + let it = iter.into_iter(); + let (lower, _) = it.size_hint(); + let mut builder = StringRunBuilder::with_capacity(lower, 256); + it.for_each(|i| { + builder.append_value(i); + }); + + builder.finish() + } +} + +/// +/// A [`RunArray`] array where run ends are stored using `i16` data type. +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, Int16RunArray, Int16Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.run_ends(), &Int16Array::from(vec![2, 3, 5])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type Int16RunArray = RunArray; + +/// +/// A [`RunArray`] array where run ends are stored using `i32` data type. +/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, Int32RunArray, Int32Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: Int32RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.run_ends(), &Int32Array::from(vec![2, 3, 5])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type Int32RunArray = RunArray; + +/// +/// A [`RunArray`] array where run ends are stored using `i64` data type. 
+/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, Int64RunArray, Int64Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: Int64RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); +/// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.run_ends(), &Int64Array::from(vec![2, 3, 5])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type Int64RunArray = RunArray; + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::*; + use crate::builder::PrimitiveRunBuilder; + use crate::types::{Int16Type, Int32Type, Int8Type, UInt32Type}; + use crate::{Array, Int16Array, Int32Array, StringArray}; + + #[test] + fn test_run_array() { + // Construct a value array + let value_data = PrimitiveArray::::from_iter_values([ + 10_i8, 11, 12, 13, 14, 15, 16, 17, + ]); + + // Construct a run_ends array: + let run_ends_data = PrimitiveArray::::from_iter_values([ + 4_i16, 6, 7, 9, 13, 18, 20, 22, + ]); + + // Construct a run ends encoded array from the above two + let ree_array = + RunArray::::try_new(&run_ends_data, &value_data).unwrap(); + + assert_eq!(ree_array.len(), 22); + assert_eq!(ree_array.null_count(), 0); + + let values = ree_array.values(); + assert_eq!(&value_data.into_data(), values.data()); + assert_eq!(&DataType::Int8, values.data_type()); + + let run_ends = ree_array.run_ends(); + assert_eq!(&run_ends_data.into_data(), run_ends.data()); + assert_eq!(&DataType::Int16, run_ends.data_type()); + } + + #[test] + fn test_run_array_fmt_debug() { + let mut builder = PrimitiveRunBuilder::::with_capacity(3); + builder.append_value(12345678); + builder.append_null(); + builder.append_value(22345678); + let array = builder.finish(); + assert_eq!( + "RunArray {run_ends: PrimitiveArray\n[\n 1,\n 2,\n 3,\n], values: PrimitiveArray\n[\n 12345678,\n null,\n 22345678,\n]}\n", + format!("{:?}", array) + ); + + let mut builder = PrimitiveRunBuilder::::with_capacity(20); + for _ in 0..20 { + builder.append_value(1); + } + let array = builder.finish(); + + assert_eq!(array.len(), 20); + assert_eq!(array.null_count(), 0); + + assert_eq!( + "RunArray {run_ends: PrimitiveArray\n[\n 20,\n], values: PrimitiveArray\n[\n 1,\n]}\n", + format!("{:?}", array) + ); + } + + #[test] + fn test_run_array_from_iter() { + let test = vec!["a", "a", "b", "c"]; + let array: RunArray = test + .iter() + .map(|&x| if x == "b" { None } else { Some(x) }) + .collect(); + assert_eq!( + "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", + format!("{:?}", array) + ); + + assert_eq!(array.len(), 4); + assert_eq!(array.null_count(), 0); + + let array: RunArray = test.into_iter().collect(); + assert_eq!( + "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", + format!("{:?}", array) + ); + } + + #[test] + fn test_run_array_run_ends_as_primitive_array() { + let test = vec!["a", "b", "c", "a"]; + let array: RunArray = test.into_iter().collect(); + + assert_eq!(array.len(), 4); + assert_eq!(array.null_count(), 0); + + let run_ends = array.run_ends(); + assert_eq!(&DataType::Int16, run_ends.data_type()); + assert_eq!(0, run_ends.null_count()); + assert_eq!(&[1, 2, 3, 4], run_ends.values()); + } + + #[test] + fn test_run_array_as_primitive_array_with_null() { + let test = vec![Some("a"), None, Some("b"), None, None, Some("a")]; + let array: RunArray = test.into_iter().collect(); + + assert_eq!(array.len(), 6); + 
assert_eq!(array.null_count(), 0); + + let run_ends = array.run_ends(); + assert_eq!(&DataType::Int32, run_ends.data_type()); + assert_eq!(0, run_ends.null_count()); + assert_eq!(5, run_ends.len()); + assert_eq!(&[1, 2, 3, 5, 6], run_ends.values()); + + let values_data = array.values(); + assert_eq!(2, values_data.null_count()); + assert_eq!(5, values_data.len()); + } + + #[test] + fn test_run_array_all_nulls() { + let test = vec![None, None, None]; + let array: RunArray = test.into_iter().collect(); + + assert_eq!(array.len(), 3); + assert_eq!(array.null_count(), 0); + + let run_ends = array.run_ends(); + assert_eq!(1, run_ends.len()); + assert_eq!(&[3], run_ends.values()); + + let values_data = array.values(); + assert_eq!(1, values_data.null_count()); + } + + #[test] + fn test_run_array_try_new() { + let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")] + .into_iter() + .collect(); + let run_ends: Int32Array = + [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); + + let array = RunArray::::try_new(&run_ends, &values).unwrap(); + assert_eq!(array.run_ends().data_type(), &DataType::Int32); + assert_eq!(array.values().data_type(), &DataType::Utf8); + + assert_eq!(array.null_count(), 0); + assert_eq!(array.len(), 4); + assert_eq!(array.run_ends.null_count(), 0); + assert_eq!(array.values().null_count(), 1); + + assert_eq!( + "RunArray {run_ends: PrimitiveArray\n[\n 1,\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"foo\",\n \"bar\",\n null,\n \"baz\",\n]}\n", + format!("{:?}", array) + ); + } + + #[test] + fn test_run_array_int16_type_definition() { + let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); + let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); + assert_eq!(array.run_ends(), &Int16Array::from(vec![2, 3, 5])); + assert_eq!(array.values(), &values); + } + + #[test] + fn test_run_array_empty_string() { + let array: Int16RunArray = vec!["a", "a", "", "", "c"].into_iter().collect(); + let values: Arc = Arc::new(StringArray::from(vec!["a", "", "c"])); + assert_eq!(array.run_ends(), &Int16Array::from(vec![2, 4, 5])); + assert_eq!(array.values(), &values); + } + + #[test] + fn test_run_array_length_mismatch() { + let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")] + .into_iter() + .collect(); + let run_ends: Int32Array = [Some(1), Some(2), Some(3)].into_iter().collect(); + + let actual = RunArray::::try_new(&run_ends, &values); + let expected = ArrowError::InvalidArgumentError("The run_ends array length should be the same as values array length. Run_ends array length is 3, values array length is 4".to_string()); + assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); + } + + #[test] + fn test_run_array_run_ends_with_null() { + let values: StringArray = [Some("foo"), Some("bar"), Some("baz")] + .into_iter() + .collect(); + let run_ends: Int32Array = [Some(1), None, Some(3)].into_iter().collect(); + + let actual = RunArray::::try_new(&run_ends, &values); + let expected = ArrowError::InvalidArgumentError("Found null values in run_ends array. 
The run_ends array should not have null values.".to_string()); + assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); + } + + #[test] + fn test_run_array_run_ends_with_zeroes() { + let values: StringArray = [Some("foo"), Some("bar"), Some("baz")] + .into_iter() + .collect(); + let run_ends: Int32Array = [Some(0), Some(1), Some(3)].into_iter().collect(); + + let actual = RunArray::::try_new(&run_ends, &values); + let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly positive. Found value 0 at index 0 that does not match the criteria.".to_string()); + assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); + } + + #[test] + fn test_run_array_run_ends_non_increasing() { + let values: StringArray = [Some("foo"), Some("bar"), Some("baz")] + .into_iter() + .collect(); + let run_ends: Int32Array = [Some(1), Some(4), Some(4)].into_iter().collect(); + + let actual = RunArray::::try_new(&run_ends, &values); + let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly increasing. Found value 4 at index 2 with previous value 4 that does not match the criteria.".to_string()); + assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); + } + + #[test] + #[should_panic( + expected = "PrimitiveArray expected ArrayData with type Int64 got Int32" + )] + fn test_run_array_run_ends_data_type_mismatch() { + let a = RunArray::::from_iter(["32"]); + let _ = RunArray::::from(a.into_data()); + } +} diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs new file mode 100644 index 000000000000..c1ecbcb5ddec --- /dev/null +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -0,0 +1,519 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::types::bytes::ByteArrayNativeType; +use std::{any::Any, sync::Arc}; + +use crate::{ + types::{ + BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, + Utf8Type, + }, + ArrayRef, ArrowPrimitiveType, RunArray, +}; + +use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; + +use arrow_buffer::ArrowNativeType; + +/// Array builder for [`RunArray`] for String and Binary types. 
+/// +/// # Example: +/// +/// ``` +/// +/// # use arrow_array::builder::GenericByteRunBuilder; +/// # use arrow_array::{GenericByteArray, BinaryArray}; +/// # use arrow_array::types::{BinaryType, Int16Type}; +/// # use arrow_array::{Array, Int16Array}; +/// # use arrow_array::cast::as_generic_binary_array; +/// +/// let mut builder = +/// GenericByteRunBuilder::::new(); +/// builder.append_value(b"abc"); +/// builder.append_value(b"abc"); +/// builder.append_null(); +/// builder.append_value(b"def"); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.run_ends(), +/// &Int16Array::from(vec![Some(2), Some(3), Some(4)]) +/// ); +/// +/// let av = array.values(); +/// +/// assert!(!av.is_null(0)); +/// assert!(av.is_null(1)); +/// assert!(!av.is_null(2)); +/// +/// // Values are polymorphic and so require a downcast. +/// let ava: &BinaryArray = as_generic_binary_array(av.as_ref()); +/// +/// assert_eq!(ava.value(0), b"abc"); +/// assert_eq!(ava.value(2), b"def"); +/// ``` +#[derive(Debug)] +pub struct GenericByteRunBuilder +where + R: ArrowPrimitiveType, + V: ByteArrayType, +{ + run_ends_builder: PrimitiveBuilder, + values_builder: GenericByteBuilder, + current_value: Vec, + has_current_value: bool, + current_run_end_index: usize, + prev_run_end_index: usize, +} + +impl Default for GenericByteRunBuilder +where + R: ArrowPrimitiveType, + V: ByteArrayType, +{ + fn default() -> Self { + Self::new() + } +} + +impl GenericByteRunBuilder +where + R: ArrowPrimitiveType, + V: ByteArrayType, +{ + /// Creates a new `GenericByteRunBuilder` + pub fn new() -> Self { + Self { + run_ends_builder: PrimitiveBuilder::new(), + values_builder: GenericByteBuilder::::new(), + current_value: Vec::new(), + has_current_value: false, + current_run_end_index: 0, + prev_run_end_index: 0, + } + } + + /// Creates a new `GenericByteRunBuilder` with the provided capacity + /// + /// `capacity`: the expected number of run-end encoded values. + /// `data_capacity`: the expected number of bytes of run end encoded values + pub fn with_capacity(capacity: usize, data_capacity: usize) -> Self { + Self { + run_ends_builder: PrimitiveBuilder::with_capacity(capacity), + values_builder: GenericByteBuilder::::with_capacity( + capacity, + data_capacity, + ), + current_value: Vec::new(), + has_current_value: false, + current_run_end_index: 0, + prev_run_end_index: 0, + } + } +} + +impl ArrayBuilder for GenericByteRunBuilder +where + R: RunEndIndexType, + V: ByteArrayType, +{ + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the length of logical array encoded by + /// the eventual runs array. + fn len(&self) -> usize { + self.current_run_end_index + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.current_run_end_index == 0 + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl GenericByteRunBuilder +where + R: RunEndIndexType, + V: ByteArrayType, +{ + /// Appends optional value to the logical array encoded by the RunArray. 
+ pub fn append_option(&mut self, input_value: Option>) { + match input_value { + Some(value) => self.append_value(value), + None => self.append_null(), + } + } + + /// Appends value to the logical array encoded by the RunArray. + pub fn append_value(&mut self, input_value: impl AsRef) { + let value: &[u8] = input_value.as_ref().as_ref(); + if !self.has_current_value { + self.append_run_end(); + self.current_value.extend_from_slice(value); + self.has_current_value = true; + } else if self.current_value.as_slice() != value { + self.append_run_end(); + self.current_value.clear(); + self.current_value.extend_from_slice(value); + } + self.current_run_end_index += 1; + } + + /// Appends null to the logical array encoded by the RunArray. + pub fn append_null(&mut self) { + if self.has_current_value { + self.append_run_end(); + self.current_value.clear(); + self.has_current_value = false; + } + self.current_run_end_index += 1; + } + + /// Creates the RunArray and resets the builder. + /// Panics if RunArray cannot be built. + pub fn finish(&mut self) -> RunArray { + // write the last run end to the array. + self.append_run_end(); + + // reset the run end index to zero. + self.current_value.clear(); + self.has_current_value = false; + self.current_run_end_index = 0; + self.prev_run_end_index = 0; + + // build the run encoded array by adding run_ends and values array as its children. + let run_ends_array = self.run_ends_builder.finish(); + let values_array = self.values_builder.finish(); + RunArray::::try_new(&run_ends_array, &values_array).unwrap() + } + + /// Creates the RunArray and without resetting the builder. + /// Panics if RunArray cannot be built. + pub fn finish_cloned(&self) -> RunArray { + let mut run_ends_array = self.run_ends_builder.finish_cloned(); + let mut values_array = self.values_builder.finish_cloned(); + + // Add current run if one exists + if self.prev_run_end_index != self.current_run_end_index { + let mut run_end_builder = run_ends_array.into_builder().unwrap(); + let mut values_builder = values_array.into_builder().unwrap(); + self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder); + run_ends_array = run_end_builder.finish(); + values_array = values_builder.finish(); + } + + RunArray::::try_new(&run_ends_array, &values_array).unwrap() + } + + // Appends the current run to the array. + fn append_run_end(&mut self) { + // empty array or the function called without appending any value. + if self.prev_run_end_index == self.current_run_end_index { + return; + } + let run_end_index = self.run_end_index_as_native(); + self.run_ends_builder.append_value(run_end_index); + if self.has_current_value { + let slice = self.current_value.as_slice(); + let native = unsafe { + // Safety: + // As self.current_value is created from V::Native. The value V::Native can be + // built back from the bytes without validations + V::Native::from_bytes_unchecked(slice) + }; + self.values_builder.append_value(native); + } else { + self.values_builder.append_null(); + } + self.prev_run_end_index = self.current_run_end_index; + } + + // Similar to `append_run_end` but on custom builders. + // Used in `finish_cloned` which is not suppose to mutate `self`. 
+ fn append_run_end_with_builders( + &self, + run_ends_builder: &mut PrimitiveBuilder, + values_builder: &mut GenericByteBuilder, + ) { + let run_end_index = self.run_end_index_as_native(); + run_ends_builder.append_value(run_end_index); + if self.has_current_value { + let slice = self.current_value.as_slice(); + let native = unsafe { + // Safety: + // As self.current_value is created from V::Native. The value V::Native can be + // built back from the bytes without validations + V::Native::from_bytes_unchecked(slice) + }; + values_builder.append_value(native); + } else { + values_builder.append_null(); + } + } + + fn run_end_index_as_native(&self) -> R::Native { + R::Native::from_usize(self.current_run_end_index) + .unwrap_or_else(|| panic!( + "Cannot convert the value {} from `usize` to native form of arrow datatype {}", + self.current_run_end_index, + R::DATA_TYPE + )) + } +} + +/// Array builder for [`RunArray`] that encodes strings ([`Utf8Type`]). +/// +/// ``` +/// // Create a run-end encoded array with run-end indexes data type as `i16`. +/// // The encoded values are Strings. +/// +/// # use arrow_array::builder::StringRunBuilder; +/// # use arrow_array::{Int16Array, StringArray}; +/// # use arrow_array::types::Int16Type; +/// # use arrow_array::cast::as_string_array; +/// +/// let mut builder = StringRunBuilder::::new(); +/// +/// // The builder builds the dictionary value by value +/// builder.append_value("abc"); +/// builder.append_null(); +/// builder.append_value("def"); +/// builder.append_value("def"); +/// builder.append_value("abc"); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.run_ends(), +/// &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(5)]) +/// ); +/// +/// // Values are polymorphic and so require a downcast. +/// let av = array.values(); +/// let ava: &StringArray = as_string_array(av.as_ref()); +/// +/// assert_eq!(ava.value(0), "abc"); +/// assert!(av.is_null(1)); +/// assert_eq!(ava.value(2), "def"); +/// assert_eq!(ava.value(3), "abc"); +/// +/// ``` +pub type StringRunBuilder = GenericByteRunBuilder; + +/// Array builder for [`RunArray`] that encodes large strings ([`LargeUtf8Type`]). See [`StringRunBuilder`] for an example. +pub type LargeStringRunBuilder = GenericByteRunBuilder; + +/// Array builder for [`RunArray`] that encodes binary values([`BinaryType`]). +/// +/// ``` +/// // Create a run-end encoded array with run-end indexes data type as `i16`. +/// // The encoded data is binary values. +/// +/// # use arrow_array::builder::BinaryRunBuilder; +/// # use arrow_array::{BinaryArray, Int16Array}; +/// # use arrow_array::types::Int16Type; +/// # use arrow_array::cast::as_generic_binary_array; +/// +/// let mut builder = BinaryRunBuilder::::new(); +/// +/// // The builder builds the dictionary value by value +/// builder.append_value(b"abc"); +/// builder.append_null(); +/// builder.append_value(b"def"); +/// builder.append_value(b"def"); +/// builder.append_value(b"abc"); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.run_ends(), +/// &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(5)]) +/// ); +/// +/// // Values are polymorphic and so require a downcast. 
+/// let av = array.values(); +/// let ava: &BinaryArray = as_generic_binary_array::(av.as_ref()); +/// +/// assert_eq!(ava.value(0), b"abc"); +/// assert!(av.is_null(1)); +/// assert_eq!(ava.value(2), b"def"); +/// assert_eq!(ava.value(3), b"abc"); +/// +/// ``` +pub type BinaryRunBuilder = GenericByteRunBuilder; + +/// Array builder for [`RunArray`] that encodes large binary values([`LargeBinaryType`]). +/// See documentation of [`BinaryRunBuilder`] for an example. +pub type LargeBinaryRunBuilder = GenericByteRunBuilder; + +#[cfg(test)] +mod tests { + use super::*; + + use crate::array::Array; + use crate::types::Int16Type; + use crate::GenericByteArray; + use crate::Int16Array; + use crate::Int16RunArray; + + fn test_bytes_run_buider(values: Vec<&T::Native>) + where + T: ByteArrayType, + ::Native: PartialEq, + ::Native: AsRef<::Native>, + { + let mut builder = GenericByteRunBuilder::::new(); + builder.append_value(values[0]); + builder.append_value(values[0]); + builder.append_value(values[0]); + builder.append_null(); + builder.append_null(); + builder.append_value(values[1]); + builder.append_value(values[1]); + builder.append_value(values[2]); + builder.append_value(values[2]); + builder.append_value(values[2]); + builder.append_value(values[2]); + let array = builder.finish(); + + assert_eq!(array.len(), 11); + assert_eq!(array.null_count(), 0); + + assert_eq!( + array.run_ends(), + &Int16Array::from(vec![Some(3), Some(5), Some(7), Some(11)]) + ); + + // Values are polymorphic and so require a downcast. + let av = array.values(); + let ava: &GenericByteArray = + av.as_any().downcast_ref::>().unwrap(); + + assert_eq!(*ava.value(0), *values[0]); + assert!(ava.is_null(1)); + assert_eq!(*ava.value(2), *values[1]); + assert_eq!(*ava.value(3), *values[2]); + } + + #[test] + fn test_string_run_buider() { + test_bytes_run_buider::(vec!["abc", "def", "ghi"]); + } + + #[test] + fn test_string_run_buider_with_empty_strings() { + test_bytes_run_buider::(vec!["abc", "", "ghi"]); + } + + #[test] + fn test_binary_run_buider() { + test_bytes_run_buider::(vec![b"abc", b"def", b"ghi"]); + } + + fn test_bytes_run_buider_finish_cloned(values: Vec<&T::Native>) + where + T: ByteArrayType, + ::Native: PartialEq, + ::Native: AsRef<::Native>, + { + let mut builder = GenericByteRunBuilder::::new(); + + builder.append_value(values[0]); + builder.append_null(); + builder.append_value(values[1]); + builder.append_value(values[1]); + builder.append_value(values[0]); + let mut array: Int16RunArray = builder.finish_cloned(); + + assert_eq!(array.len(), 5); + assert_eq!(array.null_count(), 0); + + assert_eq!( + array.run_ends(), + &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(5)]) + ); + + // Values are polymorphic and so require a downcast. + let av = array.values(); + let ava: &GenericByteArray = + av.as_any().downcast_ref::>().unwrap(); + + assert_eq!(ava.value(0), values[0]); + assert!(ava.is_null(1)); + assert_eq!(ava.value(2), values[1]); + assert_eq!(ava.value(3), values[0]); + + // Append last value before `finish_cloned` (`value[0]`) again and ensure it has only + // one entry in final output. + builder.append_value(values[0]); + builder.append_value(values[0]); + builder.append_value(values[1]); + array = builder.finish(); + + assert_eq!(array.len(), 8); + assert_eq!(array.null_count(), 0); + + assert_eq!( + array.run_ends(), + &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(7), Some(8),]) + ); + + // Values are polymorphic and so require a downcast. 
+ let av2 = array.values(); + let ava2: &GenericByteArray = + av2.as_any().downcast_ref::>().unwrap(); + + assert_eq!(ava2.value(0), values[0]); + assert!(ava2.is_null(1)); + assert_eq!(ava2.value(2), values[1]); + // The value appended before and after `finish_cloned` has only one entry. + assert_eq!(ava2.value(3), values[0]); + assert_eq!(ava2.value(4), values[1]); + } + + #[test] + fn test_string_run_buider_finish_cloned() { + test_bytes_run_buider_finish_cloned::(vec!["abc", "def", "ghi"]); + } + + #[test] + fn test_binary_run_buider_finish_cloned() { + test_bytes_run_buider_finish_cloned::(vec![b"abc", b"def", b"ghi"]); + } +} diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 820ecd23bc5e..fc2454635d99 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -39,10 +39,14 @@ mod primitive_builder; pub use primitive_builder::*; mod primitive_dictionary_builder; pub use primitive_dictionary_builder::*; +mod primitive_run_builder; +pub use primitive_run_builder::*; mod struct_builder; pub use struct_builder::*; mod generic_bytes_dictionary_builder; pub use generic_bytes_dictionary_builder::*; +mod generic_byte_run_builder; +pub use generic_byte_run_builder::*; mod union_builder; pub use union_builder::*; diff --git a/arrow-array/src/builder/primitive_run_builder.rs b/arrow-array/src/builder/primitive_run_builder.rs new file mode 100644 index 000000000000..82c46abfa053 --- /dev/null +++ b/arrow-array/src/builder/primitive_run_builder.rs @@ -0,0 +1,294 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{any::Any, sync::Arc}; + +use crate::{types::RunEndIndexType, ArrayRef, ArrowPrimitiveType, RunArray}; + +use super::{ArrayBuilder, PrimitiveBuilder}; + +use arrow_buffer::ArrowNativeType; + +/// Array builder for [`RunArray`] that encodes primitive values. +/// +/// # Example: +/// +/// ``` +/// +/// # use arrow_array::builder::PrimitiveRunBuilder; +/// # use arrow_array::cast::as_primitive_array; +/// # use arrow_array::types::{UInt32Type, Int16Type}; +/// # use arrow_array::{Array, UInt32Array, Int16Array}; +/// +/// let mut builder = +/// PrimitiveRunBuilder::::new(); +/// builder.append_value(1234); +/// builder.append_value(1234); +/// builder.append_value(1234); +/// builder.append_null(); +/// builder.append_value(5678); +/// builder.append_value(5678); +/// let array = builder.finish(); +/// +/// assert_eq!( +/// array.run_ends(), +/// &Int16Array::from(vec![Some(3), Some(4), Some(6)]) +/// ); +/// +/// let av = array.values(); +/// +/// assert!(!av.is_null(0)); +/// assert!(av.is_null(1)); +/// assert!(!av.is_null(2)); +/// +/// // Values are polymorphic and so require a downcast. 
+/// let ava: &UInt32Array = as_primitive_array::(av.as_ref()); +/// +/// assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); +/// ``` +#[derive(Debug)] +pub struct PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + run_ends_builder: PrimitiveBuilder, + values_builder: PrimitiveBuilder, + current_value: Option, + current_run_end_index: usize, + prev_run_end_index: usize, +} + +impl Default for PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + fn default() -> Self { + Self::new() + } +} + +impl PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + /// Creates a new `PrimitiveRunBuilder` + pub fn new() -> Self { + Self { + run_ends_builder: PrimitiveBuilder::new(), + values_builder: PrimitiveBuilder::new(), + current_value: None, + current_run_end_index: 0, + prev_run_end_index: 0, + } + } + + /// Creates a new `PrimitiveRunBuilder` with the provided capacity + /// + /// `capacity`: the expected number of run-end encoded values. + pub fn with_capacity(capacity: usize) -> Self { + Self { + run_ends_builder: PrimitiveBuilder::with_capacity(capacity), + values_builder: PrimitiveBuilder::with_capacity(capacity), + current_value: None, + current_run_end_index: 0, + prev_run_end_index: 0, + } + } +} + +impl ArrayBuilder for PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the length of logical array encoded by + /// the eventual runs array. + fn len(&self) -> usize { + self.current_run_end_index + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.current_run_end_index == 0 + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + /// Appends optional value to the logical array encoded by the RunArray. + pub fn append_option(&mut self, value: Option) { + if self.current_run_end_index == 0 { + self.current_run_end_index = 1; + self.current_value = value; + return; + } + if self.current_value != value { + self.append_run_end(); + self.current_value = value; + } + + self.current_run_end_index += 1; + } + + /// Appends value to the logical array encoded by the run-ends array. + pub fn append_value(&mut self, value: V::Native) { + self.append_option(Some(value)) + } + + /// Appends null to the logical array encoded by the run-ends array. + pub fn append_null(&mut self) { + self.append_option(None) + } + + /// Creates the RunArray and resets the builder. + /// Panics if RunArray cannot be built. + pub fn finish(&mut self) -> RunArray { + // write the last run end to the array. + self.append_run_end(); + + // reset the run index to zero. + self.current_value = None; + self.current_run_end_index = 0; + + // build the run encoded array by adding run_ends and values array as its children. 
+ let run_ends_array = self.run_ends_builder.finish(); + let values_array = self.values_builder.finish(); + RunArray::::try_new(&run_ends_array, &values_array).unwrap() + } + + /// Creates the RunArray and without resetting the builder. + /// Panics if RunArray cannot be built. + pub fn finish_cloned(&self) -> RunArray { + let mut run_ends_array = self.run_ends_builder.finish_cloned(); + let mut values_array = self.values_builder.finish_cloned(); + + // Add current run if one exists + if self.prev_run_end_index != self.current_run_end_index { + let mut run_end_builder = run_ends_array.into_builder().unwrap(); + let mut values_builder = values_array.into_builder().unwrap(); + self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder); + run_ends_array = run_end_builder.finish(); + values_array = values_builder.finish(); + } + + RunArray::try_new(&run_ends_array, &values_array).unwrap() + } + + // Appends the current run to the array. + fn append_run_end(&mut self) { + // empty array or the function called without appending any value. + if self.prev_run_end_index == self.current_run_end_index { + return; + } + let run_end_index = self.run_end_index_as_native(); + self.run_ends_builder.append_value(run_end_index); + self.values_builder.append_option(self.current_value); + self.prev_run_end_index = self.current_run_end_index; + } + + // Similar to `append_run_end` but on custom builders. + // Used in `finish_cloned` which is not suppose to mutate `self`. + fn append_run_end_with_builders( + &self, + run_ends_builder: &mut PrimitiveBuilder, + values_builder: &mut PrimitiveBuilder, + ) { + let run_end_index = self.run_end_index_as_native(); + run_ends_builder.append_value(run_end_index); + values_builder.append_option(self.current_value); + } + + fn run_end_index_as_native(&self) -> R::Native { + R::Native::from_usize(self.current_run_end_index) + .unwrap_or_else(|| panic!( + "Cannot convert `current_run_end_index` {} from `usize` to native form of arrow datatype {}", + self.current_run_end_index, + R::DATA_TYPE + )) + } +} + +#[cfg(test)] +mod tests { + use crate::builder::PrimitiveRunBuilder; + use crate::cast::as_primitive_array; + use crate::types::{Int16Type, UInt32Type}; + use crate::{Array, Int16Array, UInt32Array}; + + #[test] + fn test_primitive_ree_array_builder() { + let mut builder = PrimitiveRunBuilder::::new(); + builder.append_value(1234); + builder.append_value(1234); + builder.append_value(1234); + builder.append_null(); + builder.append_value(5678); + builder.append_value(5678); + + let array = builder.finish(); + + assert_eq!(array.null_count(), 0); + assert_eq!(array.len(), 6); + + assert_eq!( + array.run_ends(), + &Int16Array::from(vec![Some(3), Some(4), Some(6)]) + ); + + let av = array.values(); + + assert!(!av.is_null(0)); + assert!(av.is_null(1)); + assert!(!av.is_null(2)); + + // Values are polymorphic and so require a downcast. 
+ let ava: &UInt32Array = as_primitive_array::(av.as_ref()); + + assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); + } +} diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 7c41a469e30e..fc02c0e5a3dc 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -240,6 +240,31 @@ impl ArrowDictionaryKeyType for UInt32Type {} impl ArrowDictionaryKeyType for UInt64Type {} +mod run { + use super::*; + + pub trait RunEndTypeSealed {} + + impl RunEndTypeSealed for Int16Type {} + + impl RunEndTypeSealed for Int32Type {} + + impl RunEndTypeSealed for Int64Type {} +} + +/// A subtype of primitive type that is used as run-ends index +/// in `RunArray`. +/// See +/// +/// Note: The implementation of this trait is sealed to avoid accidental misuse. +pub trait RunEndIndexType: ArrowPrimitiveType + run::RunEndTypeSealed {} + +impl RunEndIndexType for Int16Type {} + +impl RunEndIndexType for Int32Type {} + +impl RunEndIndexType for Int64Type {} + /// A subtype of primitive type that represents temporal values. pub trait ArrowTemporalType: ArrowPrimitiveType {} diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 258ee082da1b..07bbc664234a 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -198,9 +198,9 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff ], _ => unreachable!(), }, - DataType::FixedSizeList(_, _) | DataType::Struct(_) => { - [empty_buffer, MutableBuffer::new(0)] - } + DataType::FixedSizeList(_, _) + | DataType::Struct(_) + | DataType::RunEndEncoded(_, _) => [empty_buffer, MutableBuffer::new(0)], DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => [ MutableBuffer::new(capacity * mem::size_of::()), empty_buffer, @@ -724,6 +724,12 @@ impl ArrayData { DataType::Dictionary(_, data_type) => { vec![Self::new_empty(data_type)] } + DataType::RunEndEncoded(run_ends, values) => { + vec![ + Self::new_empty(run_ends.data_type()), + Self::new_empty(values.data_type()), + ] + } }; // Data was constructed correctly above @@ -853,6 +859,19 @@ impl ArrayData { ))); } } + DataType::RunEndEncoded(run_ends_type, _) => { + if run_ends_type.is_nullable() { + return Err(ArrowError::InvalidArgumentError( + "The nullable should be set to false for the field defining run_ends array.".to_string() + )); + } + if !DataType::is_run_ends_type(run_ends_type.data_type()) { + return Err(ArrowError::InvalidArgumentError(format!( + "RunArray run_ends types must be Int16, Int32 or Int64, but was {}", + run_ends_type.data_type() + ))); + } + } _ => {} }; @@ -998,6 +1017,25 @@ impl ArrayData { } Ok(()) } + DataType::RunEndEncoded(run_ends_field, values_field) => { + self.validate_num_child_data(2)?; + let run_ends_data = + self.get_valid_child_data(0, run_ends_field.data_type())?; + let values_data = + self.get_valid_child_data(1, values_field.data_type())?; + if run_ends_data.len != values_data.len { + return Err(ArrowError::InvalidArgumentError(format!( + "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}", + run_ends_data.len, values_data.len + ))); + } + if run_ends_data.null_count() > 0 { + return Err(ArrowError::InvalidArgumentError( + "Found null values in run_ends array. 
The run_ends array should not have null values.".to_string(), + )); + } + Ok(()) + } DataType::Union(fields, _, mode) => { self.validate_num_child_data(fields.len())?; @@ -1286,6 +1324,15 @@ impl ArrayData { _ => unreachable!(), } } + DataType::RunEndEncoded(run_ends, _values) => { + let run_ends_data = self.child_data()[0].clone(); + match run_ends.data_type() { + DataType::Int16 => run_ends_data.check_run_ends::(self.len()), + DataType::Int32 => run_ends_data.check_run_ends::(self.len()), + DataType::Int64 => run_ends_data.check_run_ends::(self.len()), + _ => unreachable!(), + } + } _ => { // No extra validation check required for other types Ok(()) @@ -1446,6 +1493,50 @@ impl ArrayData { }) } + /// Validates that each value in run_ends array is positive and strictly increasing. + fn check_run_ends(&self, array_len: usize) -> Result<(), ArrowError> + where + T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, + { + let values = self.typed_buffer::(0, self.len())?; + let mut prev_value: i64 = 0_i64; + values.iter().enumerate().try_for_each(|(ix, &inp_value)| { + let value: i64 = inp_value.try_into().map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "Value at position {} out of bounds: {} (can not convert to i64)", + ix, inp_value + )) + })?; + if value <= 0_i64 { + return Err(ArrowError::InvalidArgumentError(format!( + "The values in run_ends array should be strictly positive. Found value {} at index {} that does not match the criteria.", + value, + ix + ))); + } + if ix > 0 && value <= prev_value { + return Err(ArrowError::InvalidArgumentError(format!( + "The values in run_ends array should be strictly increasing. Found value {} at index {} with previous value {} that does not match the criteria.", + value, + ix, + prev_value + ))); + } + + prev_value = value; + Ok(()) + })?; + + if prev_value.as_usize() != array_len { + return Err(ArrowError::InvalidArgumentError(format!( + "The length of array does not match the last value in the run_ends array. The last value of run_ends array is {} and length of array is {}.", + prev_value, + array_len + ))); + } + Ok(()) + } + /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons /// to determine buffer equality. 
This is cheaper than `PartialEq::eq` but may /// return false when the arrays are logically equal @@ -1542,6 +1633,7 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data DataType::LargeList(_) => DataTypeLayout::new_fixed_width(size_of::()), DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data, + DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data, DataType::Union(_, _, mode) => { let type_ids = BufferSpec::FixedWidth { byte_width: size_of::(), diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index 85c595cfed1c..aff61e3d37e5 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -137,6 +137,7 @@ fn equal_values( }, DataType::Float16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Map(_, _) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::RunEndEncoded(_, _) => todo!(), } } diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index 6a8c89d25a22..2a24b1cc2662 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -230,6 +230,7 @@ fn build_extend(array: &ArrayData) -> Extend { UnionMode::Sparse => union::build_extend_sparse(array), UnionMode::Dense => union::build_extend_dense(array), }, + DataType::RunEndEncoded(_, _) => todo!(), } } @@ -281,6 +282,7 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { UnionMode::Sparse => union::extend_nulls_sparse, UnionMode::Dense => union::extend_nulls_dense, }, + DataType::RunEndEncoded(_, _) => todo!(), }) } @@ -473,6 +475,20 @@ impl<'a> MutableArrayData<'a> { }) .collect::>(), }, + DataType::RunEndEncoded(_, _) => { + let run_ends_child = arrays + .iter() + .map(|array| &array.child_data()[0]) + .collect::>(); + let value_child = arrays + .iter() + .map(|array| &array.child_data()[1]) + .collect::>(); + vec![ + MutableArrayData::new(run_ends_child, false, array_capacity), + MutableArrayData::new(value_child, use_nulls, array_capacity), + ] + } DataType::FixedSizeList(_, _) => { let childs = arrays .iter() diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index dd0b95b0a836..c2e326b4f2f3 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -357,6 +357,7 @@ pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value { DataType::Map(_, keys_sorted) => { json!({"name": "map", "keysSorted": keys_sorted}) } + DataType::RunEndEncoded(_, _) => todo!(), } } diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index a60a19b866cb..305bb943cbbf 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -711,6 +711,7 @@ pub(crate) fn get_fb_field_type<'a>( children: Some(fbb.create_vector(&children[..])), } } + RunEndEncoded(_, _) => todo!(), Map(map_field, keys_sorted) => { let child = build_field(fbb, map_field); let mut field_type = crate::MapBuilder::new(fbb); diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index da1c20ddbd38..1e5c1321c952 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -242,6 +242,18 @@ pub enum DataType { /// child fields may be respectively "entries", "key", and "value", but this is /// not enforced. Map(Box, bool), + /// A run-end encoding (REE) is a variation of run-length encoding (RLE). 
These + /// encodings are well-suited for representing data containing sequences of the + /// same value, called runs. Each run is represented as a value and an integer giving + /// the index in the array where the run ends. + /// + /// A run-end encoded array has no buffers by itself, but has two child arrays. The + /// first child array, called the run ends array, holds either 16, 32, or 64-bit + /// signed integers. The actual values of each run are held in the second child array. + /// + /// These child arrays are prescribed the standard names of "run_ends" and "values" + /// respectively. + RunEndEncoded(Box, Box), } /// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds. @@ -346,6 +358,13 @@ impl DataType { ) } + /// Returns true if this type is valid for run-ends array in RunArray + #[inline] + pub fn is_run_ends_type(&self) -> bool { + use DataType::*; + matches!(self, Int16 | Int32 | Int64) + } + /// Returns true if this type is nested (List, FixedSizeList, LargeList, Struct, Union, /// or Map), or a dictionary of a nested type pub fn is_nested(&self) -> bool { @@ -438,6 +457,10 @@ impl DataType { + (std::mem::size_of::() * fields.capacity()) } DataType::Dictionary(dt1, dt2) => dt1.size() + dt2.size(), + DataType::RunEndEncoded(run_ends, values) => { + run_ends.size() - std::mem::size_of_val(run_ends) + values.size() + - std::mem::size_of_val(values) + } } } } diff --git a/arrow-schema/src/error.rs b/arrow-schema/src/error.rs index ea60572b3d4d..6213af8bcf10 100644 --- a/arrow-schema/src/error.rs +++ b/arrow-schema/src/error.rs @@ -41,6 +41,7 @@ pub enum ArrowError { /// Error during import or export to/from the C Data Interface CDataInterface(String), DictionaryKeyOverflowError, + RunEndIndexOverflowError, } impl ArrowError { @@ -96,6 +97,9 @@ impl Display for ArrowError { ArrowError::DictionaryKeyOverflowError => { write!(f, "Dictionary key bigger than the key type") } + ArrowError::RunEndIndexOverflowError => { + write!(f, "Run end encoded array index overflow error") + } } } } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index a3275dcb3355..dc3ab3d6237f 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -410,6 +410,7 @@ impl Field { | DataType::List(_) | DataType::Map(_, _) | DataType::Dictionary(_, _) + | DataType::RunEndEncoded(_, _) | DataType::FixedSizeList(_, _) | DataType::FixedSizeBinary(_) | DataType::Utf8 diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 311981593718..c459d40d73b9 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -360,7 +360,7 @@ fn write_leaves( ArrowDataType::Float16 => Err(ParquetError::ArrowError( "Float16 arrays not supported".to_string(), )), - ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_, _, _) => { + ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_, _, _) | ArrowDataType::RunEndEncoded(_, _) => { Err(ParquetError::NYI( format!( "Attempting to write an Arrow type {:?} to parquet that is not yet implemented", diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 2ca4b7ef8a79..d81d6a69bbb9 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -507,6 +507,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { let dict_field = Field::new(name, *value.clone(), field.is_nullable()); arrow_to_parquet_type(&dict_field) } + DataType::RunEndEncoded(_, _) => Err(arrow_err!("Converting 
RunEndEncodedType to parquet not supported",)) } } From 73ce0760d849722e71a8a1a44b6b3230ae68f8b3 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Wed, 25 Jan 2023 15:50:58 +0100 Subject: [PATCH 0546/1411] [object_store] support azure managed and workload identities (#3581) * feat: add azure managed identity credential * test: azure managed identity credential * feat: add azure federated token credential * test: add workload identity test * refactor: PR feedback * Update object_store/src/azure/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * refactor: id priorities * refactor: use managed identity as default credential * chore: remove usused parameter Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/azure/client.rs | 8 +- object_store/src/azure/credential.rs | 337 +++++++++++++++++++++++++-- object_store/src/azure/mod.rs | 147 ++++++++++-- 3 files changed, 459 insertions(+), 33 deletions(-) diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 426b3b164695..e42950b90102 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -169,9 +169,11 @@ impl AzureClient { CredentialProvider::AccessKey(key) => { Ok(AzureCredential::AccessKey(key.to_owned())) } - CredentialProvider::ClientSecret(cred) => { - let token = cred - .fetch_token(&self.client, &self.config.retry_config) + CredentialProvider::TokenCredential(cache, cred) => { + let token = cache + .get_or_insert_with(|| { + cred.fetch_token(&self.client, &self.config.retry_config) + }) .await .context(AuthorizationSnafu)?; Ok(AzureCredential::AuthorizationToken( diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index 96ff8ce153a5..280d8430011c 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -31,6 +31,7 @@ use reqwest::{ }, Client, Method, RequestBuilder, }; +use serde::Deserialize; use snafu::{ResultExt, Snafu}; use std::borrow::Cow; use std::str; @@ -44,8 +45,11 @@ pub(crate) static DELETE_SNAPSHOTS: HeaderName = HeaderName::from_static("x-ms-delete-snapshots"); pub(crate) static COPY_SOURCE: HeaderName = HeaderName::from_static("x-ms-copy-source"); static CONTENT_MD5: HeaderName = HeaderName::from_static("content-md5"); -pub(crate) static RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; +pub(crate) const RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; const CONTENT_TYPE_JSON: &str = "application/json"; +const MSI_SECRET_ENV_KEY: &str = "IDENTITY_HEADER"; +const MSI_API_VERSION: &str = "2019-08-01"; +const AZURE_STORAGE_SCOPE: &str = "https://storage.azure.com/.default"; #[derive(Debug, Snafu)] pub enum Error { @@ -54,6 +58,9 @@ pub enum Error { #[snafu(display("Error getting token response body: {}", source))] TokenResponseBody { source: reqwest::Error }, + + #[snafu(display("Error reading federated token file "))] + FederatedTokenFile, } pub type Result = std::result::Result; @@ -63,7 +70,7 @@ pub type Result = std::result::Result; pub enum CredentialProvider { AccessKey(String), SASToken(Vec<(String, String)>), - ClientSecret(ClientSecretOAuthProvider), + TokenCredential(TokenCache, Box), } pub(crate) enum AzureCredential { @@ -273,7 +280,16 @@ fn lexy_sort<'a>( values } -#[derive(serde::Deserialize, Debug)] +#[async_trait::async_trait] +pub trait TokenCredential: std::fmt::Debug + Send + Sync + 'static { + async fn fetch_token( + &self, + client: &Client, + retry: 
&RetryConfig, + ) -> Result>; +} + +#[derive(Deserialize, Debug)] struct TokenResponse { access_token: String, expires_in: u64, @@ -282,11 +298,9 @@ struct TokenResponse { /// Encapsulates the logic to perform an OAuth token challenge #[derive(Debug)] pub struct ClientSecretOAuthProvider { - scope: String, token_url: String, client_id: String, client_secret: String, - cache: TokenCache, } impl ClientSecretOAuthProvider { @@ -294,45 +308,220 @@ impl ClientSecretOAuthProvider { pub fn new( client_id: String, client_secret: String, - tenant_id: String, + tenant_id: impl AsRef, authority_host: Option, ) -> Self { let authority_host = authority_host .unwrap_or_else(|| authority_hosts::AZURE_PUBLIC_CLOUD.to_owned()); Self { - scope: "https://storage.azure.com/.default".to_owned(), - token_url: format!("{}/{}/oauth2/v2.0/token", authority_host, tenant_id), + token_url: format!( + "{}/{}/oauth2/v2.0/token", + authority_host, + tenant_id.as_ref() + ), client_id, client_secret, - cache: TokenCache::default(), } } +} +#[async_trait::async_trait] +impl TokenCredential for ClientSecretOAuthProvider { /// Fetch a token - pub async fn fetch_token( + async fn fetch_token( &self, client: &Client, retry: &RetryConfig, - ) -> Result { - self.cache - .get_or_insert_with(|| self.fetch_token_inner(client, retry)) + ) -> Result> { + let response: TokenResponse = client + .request(Method::POST, &self.token_url) + .header(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)) + .form(&[ + ("client_id", self.client_id.as_str()), + ("client_secret", self.client_secret.as_str()), + ("scope", AZURE_STORAGE_SCOPE), + ("grant_type", "client_credentials"), + ]) + .send_retry(retry) + .await + .context(TokenRequestSnafu)? + .json() .await + .context(TokenResponseBodySnafu)?; + + let token = TemporaryToken { + token: response.access_token, + expiry: Instant::now() + Duration::from_secs(response.expires_in), + }; + + Ok(token) } +} + +fn expires_in_string<'de, D>(deserializer: D) -> std::result::Result +where + D: serde::de::Deserializer<'de>, +{ + let v = String::deserialize(deserializer)?; + v.parse::().map_err(serde::de::Error::custom) +} + +// NOTE: expires_on is a String version of unix epoch time, not an integer. +// +#[derive(Debug, Clone, Deserialize)] +struct MsiTokenResponse { + pub access_token: String, + #[serde(deserialize_with = "expires_in_string")] + pub expires_in: u64, +} + +/// Attempts authentication using a managed identity that has been assigned to the deployment environment. 
+/// +/// This authentication type works in Azure VMs, App Service and Azure Functions applications, as well as the Azure Cloud Shell +/// +#[derive(Debug)] +pub struct ImdsManagedIdentityOAuthProvider { + msi_endpoint: String, + client_id: Option, + object_id: Option, + msi_res_id: Option, + client: Client, +} + +impl ImdsManagedIdentityOAuthProvider { + /// Create a new [`ImdsManagedIdentityOAuthProvider`] for an azure backed store + pub fn new( + client_id: Option, + object_id: Option, + msi_res_id: Option, + msi_endpoint: Option, + client: Client, + ) -> Self { + let msi_endpoint = msi_endpoint.unwrap_or_else(|| { + "http://169.254.169.254/metadata/identity/oauth2/token".to_owned() + }); - /// Fetch a fresh token - async fn fetch_token_inner( + Self { + msi_endpoint, + client_id, + object_id, + msi_res_id, + client, + } + } +} + +#[async_trait::async_trait] +impl TokenCredential for ImdsManagedIdentityOAuthProvider { + /// Fetch a token + async fn fetch_token( + &self, + _client: &Client, + retry: &RetryConfig, + ) -> Result> { + let mut query_items = vec![ + ("api-version", MSI_API_VERSION), + ("resource", AZURE_STORAGE_SCOPE), + ]; + + let mut identity = None; + if let Some(client_id) = &self.client_id { + identity = Some(("client_id", client_id)); + } + if let Some(object_id) = &self.object_id { + identity = Some(("object_id", object_id)); + } + if let Some(msi_res_id) = &self.msi_res_id { + identity = Some(("msi_res_id", msi_res_id)); + } + if let Some((key, value)) = identity { + query_items.push((key, value)); + } + + let mut builder = self + .client + .request(Method::GET, &self.msi_endpoint) + .header("metadata", "true") + .query(&query_items); + + if let Ok(val) = std::env::var(MSI_SECRET_ENV_KEY) { + builder = builder.header("x-identity-header", val); + }; + + let response: MsiTokenResponse = builder + .send_retry(retry) + .await + .context(TokenRequestSnafu)? 
+ .json() + .await + .context(TokenResponseBodySnafu)?; + + let token = TemporaryToken { + token: response.access_token, + expiry: Instant::now() + Duration::from_secs(response.expires_in), + }; + + Ok(token) + } +} + +/// Credential for using workload identity dfederation +/// +/// +#[derive(Debug)] +pub struct WorkloadIdentityOAuthProvider { + token_url: String, + client_id: String, + federated_token_file: String, +} + +impl WorkloadIdentityOAuthProvider { + /// Create a new [`WorkloadIdentityOAuthProvider`] for an azure backed store + pub fn new( + client_id: impl Into, + federated_token_file: impl Into, + tenant_id: impl AsRef, + authority_host: Option, + ) -> Self { + let authority_host = authority_host + .unwrap_or_else(|| authority_hosts::AZURE_PUBLIC_CLOUD.to_owned()); + + Self { + token_url: format!( + "{}/{}/oauth2/v2.0/token", + authority_host, + tenant_id.as_ref() + ), + client_id: client_id.into(), + federated_token_file: federated_token_file.into(), + } + } +} + +#[async_trait::async_trait] +impl TokenCredential for WorkloadIdentityOAuthProvider { + /// Fetch a token + async fn fetch_token( &self, client: &Client, retry: &RetryConfig, ) -> Result> { + let token_str = std::fs::read_to_string(&self.federated_token_file) + .map_err(|_| Error::FederatedTokenFile)?; + + // https://learn.microsoft.com/en-us/azure/active-directory/develop/v2-oauth2-client-creds-grant-flow#third-case-access-token-request-with-a-federated-credential let response: TokenResponse = client .request(Method::POST, &self.token_url) .header(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)) .form(&[ ("client_id", self.client_id.as_str()), - ("client_secret", self.client_secret.as_str()), - ("scope", self.scope.as_str()), + ( + "client_assertion_type", + "urn:ietf:params:oauth:client-assertion-type:jwt-bearer", + ), + ("client_assertion", token_str.as_str()), + ("scope", AZURE_STORAGE_SCOPE), ("grant_type", "client_credentials"), ]) .send_retry(retry) @@ -350,3 +539,117 @@ impl ClientSecretOAuthProvider { Ok(token) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::client::mock_server::MockServer; + use futures::executor::block_on; + use hyper::body::to_bytes; + use hyper::{Body, Response}; + use reqwest::{Client, Method}; + use tempfile::NamedTempFile; + + #[tokio::test] + async fn test_managed_identity() { + let server = MockServer::new(); + + std::env::set_var(MSI_SECRET_ENV_KEY, "env-secret"); + + let endpoint = server.url(); + let client = Client::new(); + let retry_config = RetryConfig::default(); + + // Test IMDS + server.push_fn(|req| { + assert_eq!(req.uri().path(), "/metadata/identity/oauth2/token"); + assert!(req.uri().query().unwrap().contains("client_id=client_id")); + assert_eq!(req.method(), &Method::GET); + let t = req + .headers() + .get("x-identity-header") + .unwrap() + .to_str() + .unwrap(); + assert_eq!(t, "env-secret"); + let t = req.headers().get("metadata").unwrap().to_str().unwrap(); + assert_eq!(t, "true"); + Response::new(Body::from( + r#" + { + "access_token": "TOKEN", + "refresh_token": "", + "expires_in": "3599", + "expires_on": "1506484173", + "not_before": "1506480273", + "resource": "https://management.azure.com/", + "token_type": "Bearer" + } + "#, + )) + }); + + let credential = ImdsManagedIdentityOAuthProvider::new( + Some("client_id".into()), + None, + None, + Some(format!("{}/metadata/identity/oauth2/token", endpoint)), + client.clone(), + ); + + let token = credential + .fetch_token(&client, &retry_config) + .await + .unwrap(); + + assert_eq!(&token.token, 
"TOKEN"); + } + + #[tokio::test] + async fn test_workload_identity() { + let server = MockServer::new(); + let tokenfile = NamedTempFile::new().unwrap(); + let tenant = "tenant"; + std::fs::write(tokenfile.path(), "federated-token").unwrap(); + + let endpoint = server.url(); + let client = Client::new(); + let retry_config = RetryConfig::default(); + + // Test IMDS + server.push_fn(move |req| { + assert_eq!(req.uri().path(), format!("/{}/oauth2/v2.0/token", tenant)); + assert_eq!(req.method(), &Method::POST); + let body = block_on(to_bytes(req.into_body())).unwrap(); + let body = String::from_utf8(body.to_vec()).unwrap(); + assert!(body.contains("federated-token")); + Response::new(Body::from( + r#" + { + "access_token": "TOKEN", + "refresh_token": "", + "expires_in": 3599, + "expires_on": "1506484173", + "not_before": "1506480273", + "resource": "https://management.azure.com/", + "token_type": "Bearer" + } + "#, + )) + }); + + let credential = WorkloadIdentityOAuthProvider::new( + "client_id", + tokenfile.path().to_str().unwrap(), + tenant, + Some(endpoint.to_string()), + ); + + let token = credential + .fetch_token(&client, &retry_config) + .await + .unwrap(); + + assert_eq!(&token.token, "TOKEN"); + } +} diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 3bce8e5984b8..1eea27801a3b 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -27,6 +27,7 @@ //! a way to drop old blocks. Instead unused blocks are automatically cleaned up //! after 7 days. use self::client::{BlockId, BlockList}; +use crate::client::token::TokenCache; use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::Path, @@ -65,6 +66,8 @@ const EMULATOR_ACCOUNT: &str = "devstoreaccount1"; const EMULATOR_ACCOUNT_KEY: &str = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="; +const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; + /// A specialized `Error` for Azure object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -393,6 +396,10 @@ pub struct MicrosoftAzureBuilder { authority_host: Option, url: Option, use_emulator: bool, + msi_endpoint: Option, + object_id: Option, + msi_resource_id: Option, + federated_token_file: Option, retry_config: RetryConfig, client_options: ClientOptions, } @@ -496,6 +503,36 @@ pub enum AzureConfigKey { /// - `object_store_use_emulator` /// - `use_emulator` UseEmulator, + + /// Endpoint to request a imds managed identity token + /// + /// Supported keys: + /// - `azure_msi_endpoint` + /// - `azure_identity_endpoint` + /// - `identity_endpoint` + /// - `msi_endpoint` + MsiEndpoint, + + /// Object id for use with managed identity authentication + /// + /// Supported keys: + /// - `azure_object_id` + /// - `object_id` + ObjectId, + + /// Msi resource id for use with managed identity authentication + /// + /// Supported keys: + /// - `azure_msi_resource_id` + /// - `msi_resource_id` + MsiResourceId, + + /// File containing token for Azure AD workload identity federation + /// + /// Supported keys: + /// - `azure_federated_token_file` + /// - `federated_token_file` + FederatedTokenFile, } impl AsRef for AzureConfigKey { @@ -509,6 +546,10 @@ impl AsRef for AzureConfigKey { Self::SasKey => "azure_storage_sas_key", Self::Token => "azure_storage_token", Self::UseEmulator => "azure_storage_use_emulator", + Self::MsiEndpoint => "azure_msi_endpoint", + Self::ObjectId => "azure_object_id", + Self::MsiResourceId => "azure_msi_resource_id", + 
Self::FederatedTokenFile => "azure_federated_token_file", } } } @@ -543,6 +584,15 @@ impl FromStr for AzureConfigKey { | "sas_token" => Ok(Self::SasKey), "azure_storage_token" | "bearer_token" | "token" => Ok(Self::Token), "azure_storage_use_emulator" | "use_emulator" => Ok(Self::UseEmulator), + "azure_msi_endpoint" + | "azure_identity_endpoint" + | "identity_endpoint" + | "msi_endpoint" => Ok(Self::MsiEndpoint), + "azure_object_id" | "object_id" => Ok(Self::ObjectId), + "azure_msi_resource_id" | "msi_resource_id" => Ok(Self::MsiResourceId), + "azure_federated_token_file" | "federated_token_file" => { + Ok(Self::FederatedTokenFile) + } _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), } } @@ -600,6 +650,10 @@ impl MicrosoftAzureBuilder { builder.client_options.with_allow_http(str_is_truthy(&text)); } + if let Ok(text) = std::env::var(MSI_ENDPOINT_ENV_KEY) { + builder = builder.with_msi_endpoint(text); + } + builder } @@ -644,6 +698,12 @@ impl MicrosoftAzureBuilder { AzureConfigKey::AuthorityId => self.tenant_id = Some(value.into()), AzureConfigKey::SasKey => self.sas_key = Some(value.into()), AzureConfigKey::Token => self.bearer_token = Some(value.into()), + AzureConfigKey::MsiEndpoint => self.msi_endpoint = Some(value.into()), + AzureConfigKey::ObjectId => self.object_id = Some(value.into()), + AzureConfigKey::MsiResourceId => self.msi_resource_id = Some(value.into()), + AzureConfigKey::FederatedTokenFile => { + self.federated_token_file = Some(value.into()) + } AzureConfigKey::UseEmulator => { self.use_emulator = str_is_truthy(&value.into()) } @@ -743,6 +803,24 @@ impl MicrosoftAzureBuilder { self } + /// Sets the client id for use in client secret or k8s federated credential flow + pub fn with_client_id(mut self, client_id: impl Into) -> Self { + self.client_id = Some(client_id.into()); + self + } + + /// Sets the client secret for use in client secret flow + pub fn with_client_secret(mut self, client_secret: impl Into) -> Self { + self.client_secret = Some(client_secret.into()); + self + } + + /// Sets the tenant id for use in client secret or k8s federated credential flow + pub fn with_tenant_id(mut self, tenant_id: impl Into) -> Self { + self.tenant_id = Some(tenant_id.into()); + self + } + /// Set query pairs appended to the url for shared access signature authorization pub fn with_sas_authorization( mut self, @@ -769,8 +847,8 @@ impl MicrosoftAzureBuilder { /// Sets an alternative authority host for OAuth based authorization /// common hosts for azure clouds are defined in [authority_hosts]. /// Defaults to - pub fn with_authority_host(mut self, authority_host: String) -> Self { - self.authority_host = Some(authority_host); + pub fn with_authority_host(mut self, authority_host: impl Into) -> Self { + self.authority_host = Some(authority_host.into()); self } @@ -792,6 +870,23 @@ impl MicrosoftAzureBuilder { self } + /// Sets the endpoint for acquiring managed identity token + pub fn with_msi_endpoint(mut self, msi_endpoint: impl Into) -> Self { + self.msi_endpoint = Some(msi_endpoint.into()); + self + } + + /// Sets a file path for acquiring azure federated identity token in k8s + /// + /// requires `client_id` and `tenant_id` to be set + pub fn with_federated_token_file( + mut self, + federated_token_file: impl Into, + ) -> Self { + self.federated_token_file = Some(federated_token_file.into()); + self + } + /// Configure a connection to container with given name on Microsoft Azure /// Blob store. 
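    // A rough usage sketch with hypothetical placeholder values, assuming the existing
    // `with_account` / `with_container_name` builder methods: once `client_id`,
    // `tenant_id` and `federated_token_file` are all set, `build` selects the workload
    // identity flow ahead of client secret, SAS and the managed identity fallback.
    let azure = MicrosoftAzureBuilder::new()
        .with_account("myaccount")
        .with_container_name("my-container")
        .with_client_id("my-client-id")
        .with_tenant_id("my-tenant-id")
        .with_federated_token_file("/var/run/secrets/azure/tokens/azure-identity-token")
        .build()?;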
pub fn build(mut self) -> Result { @@ -821,28 +916,54 @@ impl MicrosoftAzureBuilder { let url = Url::parse(&account_url) .context(UnableToParseUrlSnafu { url: account_url })?; let credential = if let Some(bearer_token) = self.bearer_token { - Ok(credential::CredentialProvider::AccessKey(bearer_token)) + credential::CredentialProvider::AccessKey(bearer_token) } else if let Some(access_key) = self.access_key { - Ok(credential::CredentialProvider::AccessKey(access_key)) + credential::CredentialProvider::AccessKey(access_key) + } else if let (Some(client_id), Some(tenant_id), Some(federated_token_file)) = + (&self.client_id, &self.tenant_id, self.federated_token_file) + { + let client_credential = credential::WorkloadIdentityOAuthProvider::new( + client_id, + federated_token_file, + tenant_id, + self.authority_host, + ); + credential::CredentialProvider::TokenCredential( + TokenCache::default(), + Box::new(client_credential), + ) } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = - (self.client_id, self.client_secret, self.tenant_id) + (&self.client_id, self.client_secret, &self.tenant_id) { let client_credential = credential::ClientSecretOAuthProvider::new( - client_id, + client_id.clone(), client_secret, tenant_id, self.authority_host, ); - Ok(credential::CredentialProvider::ClientSecret( - client_credential, - )) + credential::CredentialProvider::TokenCredential( + TokenCache::default(), + Box::new(client_credential), + ) } else if let Some(query_pairs) = self.sas_query_pairs { - Ok(credential::CredentialProvider::SASToken(query_pairs)) + credential::CredentialProvider::SASToken(query_pairs) } else if let Some(sas) = self.sas_key { - Ok(credential::CredentialProvider::SASToken(split_sas(&sas)?)) + credential::CredentialProvider::SASToken(split_sas(&sas)?) 
} else { - Err(Error::MissingCredentials {}) - }?; + let client = + self.client_options.clone().with_allow_http(true).client()?; + let msi_credential = credential::ImdsManagedIdentityOAuthProvider::new( + self.client_id, + self.object_id, + self.msi_resource_id, + self.msi_endpoint, + client, + ); + credential::CredentialProvider::TokenCredential( + TokenCache::default(), + Box::new(msi_credential), + ) + }; (false, url, credential, account_name) }; From 42b2d55e407217a95fab70e8b6c9834f2a39fa1e Mon Sep 17 00:00:00 2001 From: Marius S <39998+winding-lines@users.noreply.github.com> Date: Wed, 25 Jan 2023 07:24:30 -0800 Subject: [PATCH 0547/1411] Additional GCP authentication (#3541) * Implement authentication with instance and application credentials * Fix link in documentation * Address feedback * Instantiate InstanceCredentialsProvider client just once --- object_store/src/gcp/credential.rs | 255 ++++++++++++++++++++++++++++- object_store/src/gcp/mod.rs | 166 ++++++++++--------- 2 files changed, 344 insertions(+), 77 deletions(-) diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index cc157dd41985..56468568b35f 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -17,16 +17,30 @@ use crate::client::retry::RetryExt; use crate::client::token::TemporaryToken; +use crate::ClientOptions; use crate::RetryConfig; +use async_trait::async_trait; use base64::prelude::BASE64_URL_SAFE_NO_PAD; use base64::Engine; +use futures::TryFutureExt; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use snafu::{ResultExt, Snafu}; +use std::env; +use std::fs::File; +use std::io::BufReader; +use std::path::Path; use std::time::{Duration, Instant}; +use tracing::info; #[derive(Debug, Snafu)] pub enum Error { + #[snafu(display("Unable to open service account file: {}", source))] + OpenCredentials { source: std::io::Error }, + + #[snafu(display("Unable to decode service account file: {}", source))] + DecodeCredentials { source: serde_json::Error }, + #[snafu(display("No RSA key found in pem file"))] MissingKey, @@ -47,6 +61,12 @@ pub enum Error { #[snafu(display("Error getting token response body: {}", source))] TokenResponseBody { source: reqwest::Error }, + + #[snafu(display("A configuration file was passed in but was not used."))] + UnusedConfigurationFile, + + #[snafu(display("Error creating client: {}", source))] + Client { source: crate::Error }, } pub type Result = std::result::Result; @@ -104,6 +124,15 @@ struct TokenResponse { expires_in: u64, } +#[async_trait] +pub trait TokenProvider: std::fmt::Debug + Send + Sync { + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result>; +} + /// Encapsulates the logic to perform an OAuth token challenge #[derive(Debug)] pub struct OAuthProvider { @@ -138,9 +167,12 @@ impl OAuthProvider { random: ring::rand::SystemRandom::new(), }) } +} +#[async_trait] +impl TokenProvider for OAuthProvider { /// Fetch a fresh token - pub async fn fetch_token( + async fn fetch_token( &self, client: &Client, retry: &RetryConfig, @@ -195,6 +227,69 @@ impl OAuthProvider { } } +fn read_credentials_file( + service_account_path: impl AsRef, +) -> Result +where + T: serde::de::DeserializeOwned, +{ + let file = File::open(service_account_path).context(OpenCredentialsSnafu)?; + let reader = BufReader::new(file); + serde_json::from_reader(reader).context(DecodeCredentialsSnafu) +} + +/// A deserialized `service-account-********.json`-file. 
+#[derive(serde::Deserialize, Debug)] +pub struct ServiceAccountCredentials { + /// The private key in RSA format. + pub private_key: String, + + /// The email address associated with the service account. + pub client_email: String, + + /// Base URL for GCS + #[serde(default = "default_gcs_base_url")] + pub gcs_base_url: String, + + /// Disable oauth and use empty tokens. + #[serde(default = "default_disable_oauth")] + pub disable_oauth: bool, +} + +pub fn default_gcs_base_url() -> String { + "https://storage.googleapis.com".to_owned() +} + +pub fn default_disable_oauth() -> bool { + false +} + +impl ServiceAccountCredentials { + /// Create a new [`ServiceAccountCredentials`] from a file. + pub fn from_file>(path: P) -> Result { + read_credentials_file(path) + } + + /// Create a new [`ServiceAccountCredentials`] from a string. + pub fn from_key(key: &str) -> Result { + serde_json::from_str(key).context(DecodeCredentialsSnafu) + } + + /// Create an [`OAuthProvider`] from this credentials struct. + pub fn token_provider( + self, + scope: &str, + audience: &str, + ) -> Result> { + Ok(Box::new(OAuthProvider::new( + self.client_email, + self.private_key, + scope.to_string(), + audience.to_string(), + )?) as Box) + } +} + /// Returns the number of seconds since unix epoch fn seconds_since_epoch() -> u64 { std::time::SystemTime::now() @@ -205,7 +300,7 @@ fn seconds_since_epoch() -> u64 { fn decode_first_rsa_key(private_key_pem: String) -> Result { use rustls_pemfile::Item; - use std::io::{BufReader, Cursor}; + use std::io::Cursor; let mut cursor = Cursor::new(private_key_pem); let mut reader = BufReader::new(&mut cursor); @@ -222,3 +317,159 @@ fn b64_encode_obj(obj: &T) -> Result { let string = serde_json::to_string(obj).context(EncodeSnafu)?; Ok(BASE64_URL_SAFE_NO_PAD.encode(string)) } + +/// A provider that uses the Google Cloud Platform metadata server to fetch a token. +/// +/// +#[derive(Debug, Default)] +pub struct InstanceCredentialProvider { + audience: String, + client: Client, +} + +impl InstanceCredentialProvider { + /// Create a new [`InstanceCredentialProvider`], we need to control the client in order to enable http access so save the options. + pub fn new>( + audience: T, + client_options: ClientOptions, + ) -> Result { + client_options + .with_allow_http(true) + .client() + .map(|client| Self { + audience: audience.into(), + client, + }) + .context(ClientSnafu) + } +} + +/// Make a request to the metadata server to fetch a token, using a a given hostname. +async fn make_metadata_request( + client: &Client, + hostname: &str, + retry: &RetryConfig, + audience: &str, +) -> Result { + let url = format!( + "http://{}/computeMetadata/v1/instance/service-accounts/default/token", + hostname + ); + let response: TokenResponse = client + .request(Method::GET, url) + .header("Metadata-Flavor", "Google") + .query(&[("audience", audience)]) + .send_retry(retry) + .await + .context(TokenRequestSnafu)? + .json() + .await + .context(TokenResponseBodySnafu)?; + Ok(response) +} + +#[async_trait] +impl TokenProvider for InstanceCredentialProvider { + /// Fetch a token from the metadata server. + /// Since the connection is local we need to enable http access and don't actually use the client object passed in. 
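    // A minimal sketch of building a token provider from in-memory service account
    // material using the helpers above. The JSON is a hypothetical, trimmed-down
    // document: a real service-account file carries more fields (which serde ignores),
    // `gcs_base_url` / `disable_oauth` fall back to their defaults, and a real RSA
    // private key is needed before `token_provider` can succeed.
    let credentials = ServiceAccountCredentials::from_key(
        r#"{
            "private_key": "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n",
            "client_email": "my-sa@my-project.iam.gserviceaccount.com"
        }"#,
    )?;
    let provider = credentials.token_provider(
        "https://www.googleapis.com/auth/devstorage.full_control",
        "https://www.googleapis.com/oauth2/v4/token",
    )?;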
+ async fn fetch_token( + &self, + _client: &Client, + retry: &RetryConfig, + ) -> Result> { + const METADATA_IP: &str = "169.254.169.254"; + const METADATA_HOST: &str = "metadata"; + + info!("fetching token from metadata server"); + let response = + make_metadata_request(&self.client, METADATA_HOST, retry, &self.audience) + .or_else(|_| { + make_metadata_request( + &self.client, + METADATA_IP, + retry, + &self.audience, + ) + }) + .await?; + let token = TemporaryToken { + token: response.access_token, + expiry: Instant::now() + Duration::from_secs(response.expires_in), + }; + Ok(token) + } +} + +/// A deserialized `application_default_credentials.json`-file. +/// +#[derive(serde::Deserialize, Debug)] +pub struct ApplicationDefaultCredentials { + client_id: String, + client_secret: String, + refresh_token: String, + #[serde(rename = "type")] + type_: String, +} + +impl ApplicationDefaultCredentials { + const DEFAULT_TOKEN_GCP_URI: &'static str = + "https://accounts.google.com/o/oauth2/token"; + const CREDENTIALS_PATH: &'static str = + ".config/gcloud/application_default_credentials.json"; + const EXPECTED_TYPE: &str = "authorized_user"; + + // Create a new application default credential in the following situations: + // 1. a file is passed in and the type matches. + // 2. without argument if the well-known configuration file is present. + pub fn new(path: Option<&str>) -> Result, Error> { + if let Some(path) = path { + if let Ok(credentials) = read_credentials_file::(path) { + if credentials.type_ == Self::EXPECTED_TYPE { + return Ok(Some(credentials)); + } + } + // Return an error if the path has not been used. + return Err(Error::UnusedConfigurationFile); + } + if let Some(home) = env::var_os("HOME") { + let path = Path::new(&home).join(Self::CREDENTIALS_PATH); + + // It's expected for this file to not exist unless it has been explicitly configured by the user. + if path.try_exists().unwrap_or(false) { + return read_credentials_file::(path).map(Some); + } + } + Ok(None) + } +} + +#[async_trait] +impl TokenProvider for ApplicationDefaultCredentials { + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result, Error> { + let body = [ + ("grant_type", "refresh_token"), + ("client_id", &self.client_id), + ("client_secret", &self.client_secret), + ("refresh_token", &self.refresh_token), + ]; + + let response = client + .request(Method::POST, Self::DEFAULT_TOKEN_GCP_URI) + .form(&body) + .send_retry(retry) + .await + .context(TokenRequestSnafu)? + .json::() + .await + .context(TokenResponseBodySnafu)?; + let token = TemporaryToken { + token: response.access_token, + expiry: Instant::now() + Duration::from_secs(response.expires_in), + }; + Ok(token) + } +} diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 28972c4a6636..871413b43801 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -30,8 +30,7 @@ //! consider implementing automatic clean up of unused parts that are older than one //! week. 
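// Sketch of the lookup order implemented by `ApplicationDefaultCredentials::new`
// above: an explicit path must parse as an "authorized_user" credential (anything
// else is reported as an unused configuration file), otherwise the well-known
// gcloud file under $HOME is probed, and Ok(None) means no application default
// credentials were found. The path below is a hypothetical placeholder.
let from_path = ApplicationDefaultCredentials::new(Some("/tmp/adc.json"))?;
let from_home = ApplicationDefaultCredentials::new(None)?;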
use std::collections::BTreeSet; -use std::fs::File; -use std::io::{self, BufReader}; +use std::io; use std::ops::Range; use std::str::FromStr; use std::sync::Arc; @@ -59,18 +58,15 @@ use crate::{ RetryConfig, }; -use credential::OAuthProvider; +use self::credential::{ + default_gcs_base_url, ApplicationDefaultCredentials, InstanceCredentialProvider, + ServiceAccountCredentials, TokenProvider, +}; mod credential; #[derive(Debug, Snafu)] enum Error { - #[snafu(display("Unable to open service account file: {}", source))] - OpenCredentials { source: std::io::Error }, - - #[snafu(display("Unable to decode service account file: {}", source))] - DecodeCredentials { source: serde_json::Error }, - #[snafu(display("Got invalid XML response for {} {}: {}", method, url, source))] InvalidXMLResponse { source: quick_xml::de::DeError, @@ -121,8 +117,8 @@ enum Error { #[snafu(display("Missing bucket name"))] MissingBucketName {}, - #[snafu(display("Missing service account path or key"))] - MissingServiceAccountPathOrKey, + #[snafu(display("Could not find either metadata credentials or configuration properties to initialize GCS credentials."))] + MissingCredentials, #[snafu(display( "One of service account path or service account key may be provided." @@ -185,32 +181,6 @@ impl From for super::Error { } } -/// A deserialized `service-account-********.json`-file. -#[derive(serde::Deserialize, Debug)] -struct ServiceAccountCredentials { - /// The private key in RSA format. - pub private_key: String, - - /// The email address associated with the service account. - pub client_email: String, - - /// Base URL for GCS - #[serde(default = "default_gcs_base_url")] - pub gcs_base_url: String, - - /// Disable oauth and use empty tokens. - #[serde(default = "default_disable_oauth")] - pub disable_oauth: bool, -} - -fn default_gcs_base_url() -> String { - "https://storage.googleapis.com".to_owned() -} - -fn default_disable_oauth() -> bool { - false -} - #[derive(serde::Deserialize, Debug)] #[serde(rename_all = "camelCase")] struct ListResponse { @@ -267,7 +237,7 @@ struct GoogleCloudStorageClient { client: Client, base_url: String, - oauth_provider: Option, + token_provider: Option>>, token_cache: TokenCache, bucket_name: String, @@ -282,11 +252,11 @@ struct GoogleCloudStorageClient { impl GoogleCloudStorageClient { async fn get_token(&self) -> Result { - if let Some(oauth_provider) = &self.oauth_provider { + if let Some(token_provider) = &self.token_provider { Ok(self .token_cache .get_or_insert_with(|| { - oauth_provider.fetch_token(&self.client, &self.retry_config) + token_provider.fetch_token(&self.client, &self.retry_config) }) .await .context(CredentialSnafu)?) @@ -779,14 +749,6 @@ impl ObjectStore for GoogleCloudStorage { } } -fn reader_credentials_file( - service_account_path: impl AsRef, -) -> Result { - let file = File::open(service_account_path).context(OpenCredentialsSnafu)?; - let reader = BufReader::new(file); - Ok(serde_json::from_reader(reader).context(DecodeCredentialsSnafu)?) -} - /// Configure a connection to Google Cloud Storage using the specified /// credentials. /// @@ -806,6 +768,7 @@ pub struct GoogleCloudStorageBuilder { url: Option, service_account_path: Option, service_account_key: Option, + application_credentials_path: Option, retry_config: RetryConfig, client_options: ClientOptions, } @@ -862,6 +825,11 @@ pub enum GoogleConfigKey { /// - `bucket` /// - `bucket_name` Bucket, + + /// Application credentials path + /// + /// See [`GoogleCloudStorageBuilder::with_application_credentials`]. 
+ ApplicationCredentials, } impl AsRef for GoogleConfigKey { @@ -870,6 +838,7 @@ impl AsRef for GoogleConfigKey { Self::ServiceAccount => "google_service_account", Self::ServiceAccountKey => "google_service_account_key", Self::Bucket => "google_bucket", + Self::ApplicationCredentials => "google_application_credentials", } } } @@ -889,6 +858,7 @@ impl FromStr for GoogleConfigKey { "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => { Ok(Self::Bucket) } + "google_application_credentials" => Ok(Self::ApplicationCredentials), _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), } } @@ -900,6 +870,7 @@ impl Default for GoogleCloudStorageBuilder { bucket_name: None, service_account_path: None, service_account_key: None, + application_credentials_path: None, retry_config: Default::default(), client_options: ClientOptions::new().with_allow_http(true), url: None, @@ -988,6 +959,9 @@ impl GoogleCloudStorageBuilder { self.service_account_key = Some(value.into()) } GoogleConfigKey::Bucket => self.bucket_name = Some(value.into()), + GoogleConfigKey::ApplicationCredentials => { + self.application_credentials_path = Some(value.into()) + } }; Ok(self) } @@ -1069,6 +1043,17 @@ impl GoogleCloudStorageBuilder { self } + /// Set the path to the application credentials file. + /// + /// + pub fn with_application_credentials( + mut self, + application_credentials_path: impl Into, + ) -> Self { + self.application_credentials_path = Some(application_credentials_path.into()); + self + } + /// Set the retry configuration pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { self.retry_config = retry_config; @@ -1098,44 +1083,75 @@ impl GoogleCloudStorageBuilder { let client = self.client_options.client()?; - let credentials = match (self.service_account_path, self.service_account_key) { - (Some(path), None) => reader_credentials_file(path)?, - (None, Some(key)) => { - serde_json::from_str(&key).context(DecodeCredentialsSnafu)? - } - (None, None) => return Err(Error::MissingServiceAccountPathOrKey.into()), - (Some(_), Some(_)) => { - return Err(Error::ServiceAccountPathAndKeyProvided.into()) - } - }; + // First try to initialize from the service account information. + let service_account_credentials = + match (self.service_account_path, self.service_account_key) { + (Some(path), None) => Some( + ServiceAccountCredentials::from_file(path) + .context(CredentialSnafu)?, + ), + (None, Some(key)) => Some( + ServiceAccountCredentials::from_key(&key).context(CredentialSnafu)?, + ), + (None, None) => None, + (Some(_), Some(_)) => { + return Err(Error::ServiceAccountPathAndKeyProvided.into()) + } + }; + + // Then try to initialize from the application credentials file, or the environment. 
+ let application_default_credentials = ApplicationDefaultCredentials::new( + self.application_credentials_path.as_deref(), + ) + .context(CredentialSnafu)?; + + let disable_oauth = service_account_credentials + .as_ref() + .map(|c| c.disable_oauth) + .unwrap_or(false); + + let gcs_base_url = service_account_credentials + .as_ref() + .map(|c| c.gcs_base_url.clone()) + .unwrap_or_else(default_gcs_base_url); // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes let scope = "https://www.googleapis.com/auth/devstorage.full_control"; - let audience = "https://www.googleapis.com/oauth2/v4/token".to_string(); - - let oauth_provider = (!credentials.disable_oauth) - .then(|| { - OAuthProvider::new( - credentials.client_email, - credentials.private_key, - scope.to_string(), - audience, + let audience = "https://www.googleapis.com/oauth2/v4/token"; + + let token_provider = if disable_oauth { + None + } else { + let best_provider = if let Some(credentials) = service_account_credentials { + Some( + credentials + .token_provider(scope, audience) + .context(CredentialSnafu)?, ) - }) - .transpose() - .context(CredentialSnafu)?; + } else if let Some(credentials) = application_default_credentials { + Some(Box::new(credentials) as Box) + } else { + Some(Box::new( + InstanceCredentialProvider::new( + audience, + self.client_options.clone(), + ) + .context(CredentialSnafu)?, + ) as Box) + }; + + // A provider is required at this point, bail out if we don't have one. + Some(best_provider.ok_or(Error::MissingCredentials)?) + }; let encoded_bucket_name = percent_encode(bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); - // The cloud storage crate currently only supports authentication via - // environment variables. Set the environment variable explicitly so - // that we can optionally accept command line arguments instead. 
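        // To summarize the selection above: an explicit service account (key or path)
        // takes precedence, then application default credentials, and finally the
        // instance metadata server; `disable_oauth` skips token acquisition entirely.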
Ok(GoogleCloudStorage { client: Arc::new(GoogleCloudStorageClient { client, - base_url: credentials.gcs_base_url, - oauth_provider, + base_url: gcs_base_url, + token_provider: token_provider.map(Arc::new), token_cache: Default::default(), bucket_name, bucket_name_encoded: encoded_bucket_name, From 3f72ebb3768e566da9e8a47f7b8540b237fcb881 Mon Sep 17 00:00:00 2001 From: Davis Silverman Date: Wed, 25 Jan 2023 14:40:42 -0500 Subject: [PATCH 0548/1411] Allow StringArray construction with Vec> (#3602) --- arrow-arith/src/aggregate.rs | 3 ++- arrow-array/src/array/string_array.rs | 19 +++++++++++++++++-- .../generic_bytes_dictionary_builder.rs | 3 ++- arrow-ord/src/comparison.rs | 3 ++- arrow/tests/array_transform.rs | 3 ++- 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index a1cf8d84954c..b578dbd4a94c 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -1072,7 +1072,8 @@ mod tests { #[test] fn test_string_min_max_all_nulls() { - let a = StringArray::from(vec![None, None]); + let v: Vec> = vec![None, None]; + let a = StringArray::from(v); assert_eq!(None, min_string(&a)); assert_eq!(None, max_string(&a)); } diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 4a4152adc678..926bcc7bf3c1 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -249,6 +249,14 @@ impl From> for GenericStringArray From>> + for GenericStringArray +{ + fn from(v: Vec>) -> Self { + v.into_iter().collect() + } +} + impl From> for GenericStringArray { fn from(v: Vec) -> Self { Self::from_iter_values(v) @@ -439,6 +447,13 @@ mod tests { assert_eq!(array1.value(0), "hello"); assert_eq!(array1.value(1), "hello2"); + + // Also works with String types. 
+ let data2: Vec = vec!["goodbye".into(), "goodbye2".into()]; + let array2 = StringArray::from_iter_values(data2.iter()); + + assert_eq!(array2.value(0), "goodbye"); + assert_eq!(array2.value(1), "goodbye2"); } #[test] @@ -467,7 +482,7 @@ mod tests { #[test] fn test_string_array_all_null() { - let data = vec![None]; + let data: Vec> = vec![None]; let array = StringArray::from(data); array .data() @@ -477,7 +492,7 @@ mod tests { #[test] fn test_large_string_array_all_null() { - let data = vec![None]; + let data: Vec> = vec![None]; let array = LargeStringArray::from(data); array .data() diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 449100da1e0e..5af41a51948b 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -641,8 +641,9 @@ mod tests { #[test] fn test_string_dictionary_builder_with_reserved_null_value() { + let v: Vec> = vec![None]; test_bytes_dictionary_builder_with_reserved_null_value::>( - StringArray::from(vec![None]), + StringArray::from(v), vec!["abc", "def"], ); } diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 4754aeb1f75a..b8b510a2eb84 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -3712,7 +3712,8 @@ mod tests { // value_offsets = [0, 3, 6, 6] let list_array = builder.finish(); - let nulls = StringArray::from(vec![None, None, None, None]); + let v: Vec> = vec![None, None, None, None]; + let nulls = StringArray::from(v); let nulls_result = contains_utf8(&nulls, &list_array).unwrap(); assert_eq!( nulls_result diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 3c08a592dd2c..34ef6cbae428 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -433,7 +433,8 @@ fn test_struct_nulls() { let data = mutable.freeze(); let array = StructArray::from(data); - let expected_string = Arc::new(StringArray::from(vec![None, None])) as ArrayRef; + let v: Vec> = vec![None, None]; + let expected_string = Arc::new(StringArray::from(v)) as ArrayRef; let expected_int = Arc::new(Int32Array::from(vec![Some(2), None])) as ArrayRef; let expected = From 1afefbbf102b73ad1308da9fbf9e0bc4850ddde7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 25 Jan 2023 23:30:02 +0000 Subject: [PATCH 0549/1411] Faster ListArray to StringArray conversion (#3593) --- arrow-array/src/array/string_array.rs | 65 +++++++-------------------- 1 file changed, 16 insertions(+), 49 deletions(-) diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 926bcc7bf3c1..14db338825dd 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -45,50 +45,6 @@ impl GenericStringArray { self.value(i).chars().count() } - /// Convert a list array to a string array. - /// - /// Note: this performs potentially expensive UTF-8 validation, consider using - /// [`StringBuilder`][crate::builder::StringBuilder] to avoid this - /// - /// # Panics - /// - /// This method panics if the array contains non-UTF-8 data - fn from_list(v: GenericListArray) -> Self { - assert_eq!( - v.data_ref().child_data().len(), - 1, - "StringArray can only be created from list array of u8 values \ - (i.e. List>)." 
- ); - let child_data = &v.data_ref().child_data()[0]; - - assert_eq!( - child_data.child_data().len(), - 0, - "StringArray can only be created from list array of u8 values \ - (i.e. List>)." - ); - assert_eq!( - child_data.data_type(), - &DataType::UInt8, - "StringArray can only be created from List arrays, mismatched data types." - ); - assert_eq!( - child_data.null_count(), - 0, - "The child array cannot contain null values." - ); - - let builder = ArrayData::builder(Self::DATA_TYPE) - .len(v.len()) - .offset(v.offset()) - .add_buffer(v.data().buffers()[0].clone()) - .add_buffer(child_data.buffers()[0].slice(child_data.offset())) - .null_bit_buffer(v.data().null_buffer().cloned()); - - Self::from(builder.build().unwrap()) - } - /// Creates a [`GenericStringArray`] based on an iterator of values without nulls pub fn from_iter_values(iter: I) -> Self where @@ -208,7 +164,7 @@ impl From> for GenericStringArray { fn from(v: GenericListArray) -> Self { - GenericStringArray::::from_list(v) + GenericBinaryArray::::from(v).into() } } @@ -290,7 +246,8 @@ pub type LargeStringArray = GenericStringArray; #[cfg(test)] mod tests { use super::*; - use crate::builder::{ListBuilder, StringBuilder}; + use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder}; + use crate::types::UInt8Type; use arrow_buffer::Buffer; use arrow_schema::Field; @@ -678,7 +635,7 @@ mod tests { #[test] #[should_panic( - expected = "StringArray can only be created from List arrays, mismatched data types." + expected = "BinaryArray can only be created from List arrays, mismatched data types." )] fn test_string_array_from_list_array_wrong_type() { _test_generic_string_array_from_list_array_wrong_type::(); @@ -686,10 +643,20 @@ mod tests { #[test] #[should_panic( - expected = "StringArray can only be created from List arrays, mismatched data types." + expected = "BinaryArray can only be created from List arrays, mismatched data types." 
)] fn test_large_string_array_from_list_array_wrong_type() { - _test_generic_string_array_from_list_array_wrong_type::(); + _test_generic_string_array_from_list_array_wrong_type::(); + } + + #[test] + #[should_panic(expected = "Invalid UTF-8 sequence: Utf8Error")] + fn test_list_array_utf8_validation() { + let mut builder = ListBuilder::new(PrimitiveBuilder::::new()); + builder.values().append_value(0xFF); + builder.append(true); + let list = builder.finish(); + let _ = StringArray::from(list); } #[test] From 902a17d7d3817ef9030adeb535fd5951b9f72590 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 26 Jan 2023 11:12:42 +0100 Subject: [PATCH 0550/1411] Support sending schemas for empty streams (#3594) * Support sending schemas for empty streams * comments * clippy * Restore got_schema, return references * Review comments * revert unecessary change * Update arrow-flight/src/decode.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-flight/src/decode.rs | 32 ++++++++------ arrow-flight/src/encode.rs | 66 +++++++++++++++++++++-------- arrow-flight/tests/encode_decode.rs | 27 ++++++++++++ 3 files changed, 94 insertions(+), 31 deletions(-) diff --git a/arrow-flight/src/decode.rs b/arrow-flight/src/decode.rs index cab52a434897..fe132e3e8448 100644 --- a/arrow-flight/src/decode.rs +++ b/arrow-flight/src/decode.rs @@ -17,7 +17,7 @@ use crate::{utils::flight_data_to_arrow_batch, FlightData}; use arrow_array::{ArrayRef, RecordBatch}; -use arrow_schema::Schema; +use arrow_schema::{Schema, SchemaRef}; use bytes::Bytes; use futures::{ready, stream::BoxStream, Stream, StreamExt}; use std::{ @@ -82,16 +82,12 @@ use crate::error::{FlightError, Result}; #[derive(Debug)] pub struct FlightRecordBatchStream { inner: FlightDataDecoder, - got_schema: bool, } impl FlightRecordBatchStream { /// Create a new [`FlightRecordBatchStream`] from a decoded stream pub fn new(inner: FlightDataDecoder) -> Self { - Self { - inner, - got_schema: false, - } + Self { inner } } /// Create a new [`FlightRecordBatchStream`] from a stream of [`FlightData`] @@ -101,13 +97,18 @@ impl FlightRecordBatchStream { { Self { inner: FlightDataDecoder::new(inner), - got_schema: false, } } /// Has a message defining the schema been received yet? 
+ #[deprecated = "use schema().is_some() instead"] pub fn got_schema(&self) -> bool { - self.got_schema + self.schema().is_some() + } + + /// Return schema for the stream, if it has been received + pub fn schema(&self) -> Option<&SchemaRef> { + self.inner.schema() } /// Consume self and return the wrapped [`FlightDataDecoder`] @@ -125,6 +126,7 @@ impl futures::Stream for FlightRecordBatchStream { cx: &mut std::task::Context<'_>, ) -> Poll>> { loop { + let had_schema = self.schema().is_some(); let res = ready!(self.inner.poll_next_unpin(cx)); match res { // Inner exhausted @@ -136,13 +138,12 @@ impl futures::Stream for FlightRecordBatchStream { } // translate data Some(Ok(data)) => match data.payload { - DecodedPayload::Schema(_) if self.got_schema => { + DecodedPayload::Schema(_) if had_schema => { return Poll::Ready(Some(Err(FlightError::protocol( "Unexpectedly saw multiple Schema messages in FlightData stream", )))); } DecodedPayload::Schema(_) => { - self.got_schema = true; // Need next message, poll inner again } DecodedPayload::RecordBatch(batch) => { @@ -219,6 +220,11 @@ impl FlightDataDecoder { } } + /// Returns the current schema for this stream + pub fn schema(&self) -> Option<&SchemaRef> { + self.state.as_ref().map(|state| &state.schema) + } + /// Extracts flight data from the next message, updating decoding /// state as necessary. fn extract_message(&mut self, data: FlightData) -> Result> { @@ -343,7 +349,7 @@ impl futures::Stream for FlightDataDecoder { /// streaming flight response. #[derive(Debug)] struct FlightStreamState { - schema: Arc, + schema: SchemaRef, dictionaries_by_field: HashMap, } @@ -362,7 +368,7 @@ impl DecodedFlightData { } } - pub fn new_schema(inner: FlightData, schema: Arc) -> Self { + pub fn new_schema(inner: FlightData, schema: SchemaRef) -> Self { Self { inner, payload: DecodedPayload::Schema(schema), @@ -389,7 +395,7 @@ pub enum DecodedPayload { None, /// A decoded Schema message - Schema(Arc), + Schema(SchemaRef), /// A decoded Record batch. RecordBatch(RecordBatch), diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index c130a2d7e8cc..2f06ee58f070 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -70,6 +70,8 @@ pub struct FlightDataEncoderBuilder { options: IpcWriteOptions, /// Metadata to add to the schema message app_metadata: Bytes, + /// Optional schema, if known before data. + schema: Option, } /// Default target size for encoded [`FlightData`]. @@ -84,6 +86,7 @@ impl Default for FlightDataEncoderBuilder { max_flight_data_size: GRPC_TARGET_MAX_FLIGHT_SIZE_BYTES, options: IpcWriteOptions::default(), app_metadata: Bytes::new(), + schema: None, } } } @@ -122,6 +125,15 @@ impl FlightDataEncoderBuilder { self } + /// Specify a schema for the RecordBatches being sent. If a schema + /// is not specified, an encoded Schema message will be sent when + /// the first [`RecordBatch`], if any, is encoded. Some clients + /// expect a Schema message even if there is no data sent. + pub fn with_schema(mut self, schema: SchemaRef) -> Self { + self.schema = Some(schema); + self + } + /// Return a [`Stream`](futures::Stream) of [`FlightData`], /// consuming self. 
More details on [`FlightDataEncoder`] pub fn build(self, input: S) -> FlightDataEncoder @@ -132,9 +144,16 @@ impl FlightDataEncoderBuilder { max_flight_data_size, options, app_metadata, + schema, } = self; - FlightDataEncoder::new(input.boxed(), max_flight_data_size, options, app_metadata) + FlightDataEncoder::new( + input.boxed(), + schema, + max_flight_data_size, + options, + app_metadata, + ) } } @@ -162,11 +181,12 @@ pub struct FlightDataEncoder { impl FlightDataEncoder { fn new( inner: BoxStream<'static, Result>, + schema: Option, max_flight_data_size: usize, options: IpcWriteOptions, app_metadata: Bytes, ) -> Self { - Self { + let mut encoder = Self { inner, schema: None, max_flight_data_size, @@ -174,7 +194,13 @@ impl FlightDataEncoder { app_metadata: Some(app_metadata), queue: VecDeque::new(), done: false, + }; + + // If schema is known up front, enqueue it immediately + if let Some(schema) = schema { + encoder.encode_schema(&schema); } + encoder } /// Place the `FlightData` in the queue to send @@ -189,26 +215,30 @@ impl FlightDataEncoder { } } + /// Encodes schema as a [`FlightData`] in self.queue. + /// Updates `self.schema` and returns the new schema + fn encode_schema(&mut self, schema: &SchemaRef) -> SchemaRef { + // The first message is the schema message, and all + // batches have the same schema + let schema = Arc::new(prepare_schema_for_flight(schema)); + let mut schema_flight_data = self.encoder.encode_schema(&schema); + + // attach any metadata requested + if let Some(app_metadata) = self.app_metadata.take() { + schema_flight_data.app_metadata = app_metadata; + } + self.queue_message(schema_flight_data); + // remember schema + self.schema = Some(schema.clone()); + schema + } + /// Encodes batch into one or more `FlightData` messages in self.queue fn encode_batch(&mut self, batch: RecordBatch) -> Result<()> { let schema = match &self.schema { Some(schema) => schema.clone(), - None => { - let batch_schema = batch.schema(); - // The first message is the schema message, and all - // batches have the same schema - let schema = Arc::new(prepare_schema_for_flight(&batch_schema)); - let mut schema_flight_data = self.encoder.encode_schema(&schema); - - // attach any metadata requested - if let Some(app_metadata) = self.app_metadata.take() { - schema_flight_data.app_metadata = app_metadata; - } - self.queue_message(schema_flight_data); - // remember schema - self.schema = Some(schema.clone()); - schema - } + // encode the schema if this is the first time we have seen it + None => self.encode_schema(&batch.schema()), }; // encode the batch diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index 0aa98768774e..1990e5b0cbb1 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -96,6 +96,33 @@ async fn test_dictionary_many() { .await; } +#[tokio::test] +async fn test_zero_batches_no_schema() { + let stream = FlightDataEncoderBuilder::default().build(futures::stream::iter(vec![])); + + let mut decoder = FlightRecordBatchStream::new_from_flight_data(stream); + assert!(decoder.schema().is_none()); + // No batches come out + assert!(decoder.next().await.is_none()); + // schema has not been received + assert!(decoder.schema().is_none()); +} + +#[tokio::test] +async fn test_zero_batches_schema_specified() { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)])); + let stream = FlightDataEncoderBuilder::default() + .with_schema(schema.clone()) + .build(futures::stream::iter(vec![])); + + 
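    // Because the schema was supplied to the builder up front, the encoder emits an
    // encoded Schema message even though the input stream contains no batches.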
let mut decoder = FlightRecordBatchStream::new_from_flight_data(stream); + assert!(decoder.schema().is_none()); + // No batches come out + assert!(decoder.next().await.is_none()); + // But schema has been received correctly + assert_eq!(decoder.schema(), Some(&schema)); +} + #[tokio::test] async fn test_app_metadata() { let input_batch_stream = futures::stream::iter(vec![Ok(make_primative_batch(78))]); From 0f1a92a5f31916570d70b78562913cf877e8929c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 26 Jan 2023 15:03:30 +0000 Subject: [PATCH 0551/1411] Add Raw JSON Reader (~2.5x faster) (#3479) * Add Raw JSON Reader * Custom tape decoder * RAT * Cleanup * More columns in benchmark * CI fixes * Tweaks * Add List support * Add support for nested nulls * Remove unnecessary dependency * Add RawDecoder * Clippy * Fix List * Fix buffering * More tests * Add Send bounds * Fix variance * Review feedback * Add deprecation notices * Build RawDecoder with builder * Improve field estimate * Format * Handle unicode split over strings * Improve detection of invalid UTF-8 sequences --- arrow-json/Cargo.toml | 1 + arrow-json/src/lib.rs | 6 +- arrow-json/src/raw/boolean_array.rs | 43 ++ arrow-json/src/raw/list_array.rs | 116 ++++ arrow-json/src/raw/mod.rs | 570 ++++++++++++++++++ arrow-json/src/raw/primitive_array.rs | 88 +++ arrow-json/src/raw/string_array.rs | 67 +++ arrow-json/src/raw/struct_array.rs | 129 +++++ arrow-json/src/raw/tape.rs | 801 ++++++++++++++++++++++++++ arrow-json/src/reader.rs | 19 + arrow/Cargo.toml | 2 +- arrow/benches/json_reader.rs | 101 +++- 12 files changed, 1916 insertions(+), 27 deletions(-) create mode 100644 arrow-json/src/raw/boolean_array.rs create mode 100644 arrow-json/src/raw/list_array.rs create mode 100644 arrow-json/src/raw/mod.rs create mode 100644 arrow-json/src/raw/primitive_array.rs create mode 100644 arrow-json/src/raw/string_array.rs create mode 100644 arrow-json/src/raw/struct_array.rs create mode 100644 arrow-json/src/raw/tape.rs diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 9b9095b27cc1..c6aa9b486cdc 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -48,6 +48,7 @@ indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } +lexical-core = { version = "0.8", default-features = false } [dev-dependencies] tempfile = "3.3" diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs index 0f1c0064f5a2..7e582c3359a6 100644 --- a/arrow-json/src/lib.rs +++ b/arrow-json/src/lib.rs @@ -25,8 +25,10 @@ pub mod reader; pub mod writer; -pub use self::reader::Reader; -pub use self::reader::ReaderBuilder; +mod raw; + +pub use self::raw::{RawDecoder, RawReader, RawReaderBuilder}; +pub use self::reader::{Reader, ReaderBuilder}; pub use self::writer::{ArrayWriter, LineDelimitedWriter, Writer}; use half::f16; use serde_json::{Number, Value}; diff --git a/arrow-json/src/raw/boolean_array.rs b/arrow-json/src/raw/boolean_array.rs new file mode 100644 index 000000000000..12917785e5b0 --- /dev/null +++ b/arrow-json/src/raw/boolean_array.rs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::builder::BooleanBuilder; +use arrow_array::Array; +use arrow_data::ArrayData; +use arrow_schema::ArrowError; + +use crate::raw::tape::{Tape, TapeElement}; +use crate::raw::{tape_error, ArrayDecoder}; + +#[derive(Default)] +pub struct BooleanArrayDecoder {} + +impl ArrayDecoder for BooleanArrayDecoder { + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { + let mut builder = BooleanBuilder::with_capacity(pos.len()); + for p in pos { + match tape.get(*p) { + TapeElement::Null => builder.append_null(), + TapeElement::True => builder.append_value(true), + TapeElement::False => builder.append_value(false), + d => return Err(tape_error(d, "boolean")), + } + } + + Ok(builder.finish().into_data()) + } +} diff --git a/arrow-json/src/raw/list_array.rs b/arrow-json/src/raw/list_array.rs new file mode 100644 index 000000000000..9d96885f9943 --- /dev/null +++ b/arrow-json/src/raw/list_array.rs @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::raw::tape::{Tape, TapeElement}; +use crate::raw::{make_decoder, tape_error, ArrayDecoder}; +use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; +use arrow_array::OffsetSizeTrait; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType}; +use std::marker::PhantomData; + +pub struct ListArrayDecoder { + data_type: DataType, + decoder: Box, + phantom: PhantomData, + is_nullable: bool, +} + +impl ListArrayDecoder { + pub fn new(data_type: DataType, is_nullable: bool) -> Result { + let field = match &data_type { + DataType::List(f) if !O::IS_LARGE => f, + DataType::LargeList(f) if O::IS_LARGE => f, + _ => unreachable!(), + }; + let decoder = make_decoder(field.data_type().clone(), field.is_nullable())?; + + Ok(Self { + data_type, + decoder, + phantom: Default::default(), + is_nullable, + }) + } +} + +impl ArrayDecoder for ListArrayDecoder { + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { + let mut child_pos = Vec::with_capacity(pos.len()); + let mut offsets = BufferBuilder::::new(pos.len() + 1); + offsets.append(O::from_usize(0).unwrap()); + + let mut null_count = 0; + let mut nulls = self + .is_nullable + .then(|| BooleanBufferBuilder::new(pos.len())); + + for p in pos { + let end_idx = match (tape.get(*p), nulls.as_mut()) { + (TapeElement::StartList(end_idx), None) => end_idx, + (TapeElement::StartList(end_idx), Some(nulls)) => { + nulls.append(true); + end_idx + } + (TapeElement::Null, Some(nulls)) => { + nulls.append(false); + null_count += 1; + *p + 1 + } + (d, _) => return Err(tape_error(d, "[")), + }; + + let mut cur_idx = *p + 1; + while cur_idx < end_idx { + child_pos.push(cur_idx); + + // Advance to next field + cur_idx = match tape.get(cur_idx) { + TapeElement::String(_) + | TapeElement::Number(_) + | TapeElement::True + | TapeElement::False + | TapeElement::Null => cur_idx + 1, + TapeElement::StartList(end_idx) => end_idx + 1, + TapeElement::StartObject(end_idx) => end_idx + 1, + d => return Err(tape_error(d, "list value")), + } + } + + let offset = O::from_usize(child_pos.len()).ok_or_else(|| { + ArrowError::JsonError(format!( + "offset overflow decoding {}", + self.data_type + )) + })?; + offsets.append(offset) + } + + let child_data = self.decoder.decode(tape, &child_pos).unwrap(); + + let data = ArrayDataBuilder::new(self.data_type.clone()) + .len(pos.len()) + .null_bit_buffer(nulls.as_mut().map(|x| x.finish())) + .null_count(null_count) + .add_buffer(offsets.finish()) + .child_data(vec![child_data]); + + // Safety + // Validated lengths above + Ok(unsafe { data.build_unchecked() }) + } +} diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs new file mode 100644 index 000000000000..9ffa7d2133a0 --- /dev/null +++ b/arrow-json/src/raw/mod.rs @@ -0,0 +1,570 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! A faster JSON reader that will eventually replace [`Reader`] +//! +//! [`Reader`]: crate::reader::Reader + +use crate::raw::boolean_array::BooleanArrayDecoder; +use crate::raw::list_array::ListArrayDecoder; +use crate::raw::primitive_array::PrimitiveArrayDecoder; +use crate::raw::string_array::StringArrayDecoder; +use crate::raw::struct_array::StructArrayDecoder; +use crate::raw::tape::{Tape, TapeDecoder, TapeElement}; +use arrow_array::types::*; +use arrow_array::{downcast_integer, make_array, RecordBatch, RecordBatchReader}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, SchemaRef}; +use std::io::BufRead; + +mod boolean_array; +mod list_array; +mod primitive_array; +mod string_array; +mod struct_array; +mod tape; + +/// A builder for [`RawReader`] and [`RawDecoder`] +pub struct RawReaderBuilder { + batch_size: usize, + + schema: SchemaRef, +} + +impl RawReaderBuilder { + /// Create a new [`RawReaderBuilder`] with the provided [`SchemaRef`] + /// + /// This could be obtained using [`infer_json_schema`] if not known + /// + /// Any columns not present in `schema` will be ignored + /// + /// [`infer_json_schema`]: crate::reader::infer_json_schema + pub fn new(schema: SchemaRef) -> Self { + Self { + batch_size: 1024, + schema, + } + } + + /// Sets the batch size in rows to read + pub fn with_batch_size(self, batch_size: usize) -> Self { + Self { batch_size, ..self } + } + + /// Create a [`RawReader`] with the provided [`BufRead`] + pub fn build(self, reader: R) -> Result, ArrowError> { + Ok(RawReader { + reader, + decoder: self.build_decoder()?, + }) + } + + /// Create a [`RawDecoder`] + pub fn build_decoder(self) -> Result { + let decoder = make_decoder(DataType::Struct(self.schema.fields.clone()), false)?; + let num_fields = self.schema.all_fields().len(); + + Ok(RawDecoder { + decoder, + tape_decoder: TapeDecoder::new(self.batch_size, num_fields), + batch_size: self.batch_size, + schema: self.schema, + }) + } +} + +/// Reads JSON data with a known schema directly into arrow [`RecordBatch`] +/// +/// This is significantly faster than [`Reader`] and eventually intended +/// to replace it ([#3610](https://github.com/apache/arrow-rs/issues/3610)) +/// +/// Lines consisting solely of ASCII whitespace are ignored +/// +/// [`Reader`]: crate::reader::Reader +pub struct RawReader { + reader: R, + decoder: RawDecoder, +} + +impl std::fmt::Debug for RawReader { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RawReader") + .field("decoder", &self.decoder) + .finish() + } +} + +impl RawReader { + /// Reads the next [`RecordBatch`] returning `Ok(None)` if EOF + fn read(&mut self) -> Result, ArrowError> { + loop { + let buf = self.reader.fill_buf()?; + if buf.is_empty() { + break; + } + let read = buf.len(); + + let decoded = self.decoder.decode(buf)?; + self.reader.consume(decoded); + if decoded != read { + break; + } + } + self.decoder.flush() + } +} + +impl Iterator for RawReader { + type Item = Result; + + fn next(&mut self) -> Option { + self.read().transpose() + } +} + +impl RecordBatchReader for RawReader { + fn schema(&self) -> SchemaRef { + self.decoder.schema.clone() + } +} + +/// A low-level interface for reading JSON data from a byte stream +/// +/// See [`RawReader`] for a higher-level interface for interface with [`BufRead`] +/// +/// The push-based interface facilitates integration with sources that yield 
arbitrarily +/// delimited bytes ranges, such as [`BufRead`], or a chunked byte stream received from +/// object storage +/// +/// ``` +/// # use std::io::BufRead; +/// # use arrow_array::RecordBatch; +/// # use arrow_json::{RawDecoder, RawReaderBuilder}; +/// # use arrow_schema::{ArrowError, SchemaRef}; +/// # +/// fn read_from_json( +/// mut reader: R, +/// schema: SchemaRef, +/// ) -> Result>, ArrowError> { +/// let mut decoder = RawReaderBuilder::new(schema).build_decoder()?; +/// let mut next = move || { +/// loop { +/// // RawDecoder is agnostic that buf doesn't contain whole records +/// let buf = reader.fill_buf()?; +/// if buf.is_empty() { +/// break; // Input exhausted +/// } +/// let read = buf.len(); +/// let decoded = decoder.decode(buf)?; +/// +/// // Consume the number of bytes read +/// reader.consume(decoded); +/// if decoded != read { +/// break; // Read batch size +/// } +/// } +/// decoder.flush() +/// }; +/// Ok(std::iter::from_fn(move || next().transpose())) +/// } +/// ``` +pub struct RawDecoder { + tape_decoder: TapeDecoder, + decoder: Box, + batch_size: usize, + schema: SchemaRef, +} + +impl std::fmt::Debug for RawDecoder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RawDecoder") + .field("schema", &self.schema) + .field("batch_size", &self.batch_size) + .finish() + } +} + +impl RawDecoder { + /// Read JSON objects from `buf`, returning the number of bytes read + /// + /// This method returns once `batch_size` objects have been parsed since the + /// last call to [`Self::flush`], or `buf` is exhausted. Any remaining bytes + /// should be included in the next call to [`Self::decode`] + /// + /// There is no requirement that `buf` contains a whole number of records, facilitating + /// integration with arbitrary byte streams, such as that yielded by [`BufRead`] + pub fn decode(&mut self, buf: &[u8]) -> Result { + self.tape_decoder.decode(buf) + } + + /// Flushes the currently buffered data to a [`RecordBatch`] + /// + /// Returns `Ok(None)` if no buffered data + /// + /// Note: if called part way through decoding a record, this will return an error + pub fn flush(&mut self) -> Result, ArrowError> { + let tape = self.tape_decoder.finish()?; + + if tape.num_rows() == 0 { + return Ok(None); + } + + // First offset is null sentinel + let mut next_object = 1; + let pos: Vec<_> = (0..tape.num_rows()) + .map(|_| { + let end = match tape.get(next_object) { + TapeElement::StartObject(end) => end, + _ => unreachable!("corrupt tape"), + }; + std::mem::replace(&mut next_object, end + 1) + }) + .collect(); + + let decoded = self.decoder.decode(&tape, &pos)?; + self.tape_decoder.clear(); + + // Sanity check + assert!(matches!(decoded.data_type(), DataType::Struct(_))); + assert_eq!(decoded.null_count(), 0); + assert_eq!(decoded.len(), pos.len()); + + // Clear out buffer + let columns = decoded + .child_data() + .iter() + .map(|x| make_array(x.clone())) + .collect(); + + let batch = RecordBatch::try_new(self.schema.clone(), columns)?; + Ok(Some(batch)) + } +} + +trait ArrayDecoder: Send { + /// Decode elements from `tape` starting at the indexes contained in `pos` + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result; +} + +macro_rules! primitive_decoder { + ($t:ty, $data_type:expr) => { + Ok(Box::new(PrimitiveArrayDecoder::<$t>::new($data_type))) + }; +} + +fn make_decoder( + data_type: DataType, + is_nullable: bool, +) -> Result, ArrowError> { + downcast_integer! 
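+    // `downcast_integer!` maps each integer DataType to its ArrowPrimitiveType and
+    // expands `primitive_decoder!` for it; the arms below cover the remaining types.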
{ + data_type => (primitive_decoder, data_type), + DataType::Float32 => primitive_decoder!(Float32Type, data_type), + DataType::Float64 => primitive_decoder!(Float64Type, data_type), + DataType::Boolean => Ok(Box::::default()), + DataType::Utf8 => Ok(Box::>::default()), + DataType::LargeUtf8 => Ok(Box::>::default()), + DataType::List(_) => Ok(Box::new(ListArrayDecoder::::new(data_type, is_nullable)?)), + DataType::LargeList(_) => Ok(Box::new(ListArrayDecoder::::new(data_type, is_nullable)?)), + DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(data_type, is_nullable)?)), + DataType::Binary | DataType::LargeBinary | DataType::FixedSizeBinary(_) => { + Err(ArrowError::JsonError(format!("{} is not supported by JSON", data_type))) + } + d => Err(ArrowError::NotYetImplemented(format!("Support for {} in JSON reader", d))) + } +} + +fn tape_error(d: TapeElement, expected: &str) -> ArrowError { + ArrowError::JsonError(format!("expected {expected} got {d}")) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::reader::infer_json_schema; + use crate::ReaderBuilder; + use arrow_array::cast::{ + as_boolean_array, as_largestring_array, as_list_array, as_primitive_array, + as_string_array, as_struct_array, + }; + use arrow_array::types::Int32Type; + use arrow_array::Array; + use arrow_schema::{DataType, Field, Schema}; + use std::fs::File; + use std::io::{BufReader, Cursor, Seek}; + use std::sync::Arc; + + fn do_read(buf: &str, batch_size: usize, schema: SchemaRef) -> Vec { + let mut unbuffered = vec![]; + + // Test with different batch sizes to test for boundary conditions + for batch_size in [1, 3, 100, batch_size] { + unbuffered = RawReaderBuilder::new(schema.clone()) + .with_batch_size(batch_size) + .build(Cursor::new(buf.as_bytes())) + .unwrap() + .collect::, _>>() + .unwrap(); + + for b in unbuffered.iter().take(unbuffered.len() - 1) { + assert_eq!(b.num_rows(), batch_size) + } + + // Test with different buffer sizes to test for boundary conditions + for b in [1, 3, 5] { + let buffered = RawReaderBuilder::new(schema.clone()) + .with_batch_size(batch_size) + .build(BufReader::with_capacity(b, Cursor::new(buf.as_bytes()))) + .unwrap() + .collect::, _>>() + .unwrap(); + assert_eq!(unbuffered, buffered); + } + } + + unbuffered + } + + #[test] + fn test_basic() { + let buf = r#" + {"a": 1, "b": 2, "c": true} + {"a": 2E0, "b": 4, "c": false} + + {"b": 6, "a": 2.0} + {"b": "5", "a": 2} + {"b": 4e0} + {"b": 7, "a": null} + "#; + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Int32, true), + Field::new("c", DataType::Boolean, true), + ])); + + let batches = do_read(buf, 1024, schema); + assert_eq!(batches.len(), 1); + + let col1 = as_primitive_array::(batches[0].column(0)); + assert_eq!(col1.null_count(), 2); + assert_eq!(col1.values(), &[1, 2, 2, 2, 0, 0]); + assert!(col1.is_null(4)); + assert!(col1.is_null(5)); + + let col2 = as_primitive_array::(batches[0].column(1)); + assert_eq!(col2.null_count(), 0); + assert_eq!(col2.values(), &[2, 4, 6, 5, 4, 7]); + + let col3 = as_boolean_array(batches[0].column(2)); + assert_eq!(col3.null_count(), 4); + assert!(col3.value(0)); + assert!(!col3.is_null(0)); + assert!(!col3.value(1)); + assert!(!col3.is_null(1)); + } + + #[test] + fn test_string() { + let buf = r#" + {"a": "1", "b": "2"} + {"a": "hello", "b": "shoo"} + {"b": "\t😁foo", "a": "\nfoobar\ud83d\ude00\u0061\u0073\u0066\u0067\u00FF"} + + {"b": null} + {"b": "", "a": null} + + "#; + let schema = Arc::new(Schema::new(vec![ + 
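+            // One Utf8 ("a") and one LargeUtf8 ("b") column, exercising both i32 and i64 offsets.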
Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::LargeUtf8, true), + ])); + + let batches = do_read(buf, 1024, schema); + assert_eq!(batches.len(), 1); + + let col1 = as_string_array(batches[0].column(0)); + assert_eq!(col1.null_count(), 2); + assert_eq!(col1.value(0), "1"); + assert_eq!(col1.value(1), "hello"); + assert_eq!(col1.value(2), "\nfoobar😀asfgÿ"); + assert!(col1.is_null(3)); + assert!(col1.is_null(4)); + + let col2 = as_largestring_array(batches[0].column(1)); + assert_eq!(col2.null_count(), 1); + assert_eq!(col2.value(0), "2"); + assert_eq!(col2.value(1), "shoo"); + assert_eq!(col2.value(2), "\t😁foo"); + assert!(col2.is_null(3)); + assert_eq!(col2.value(4), ""); + } + + #[test] + fn test_complex() { + let buf = r#" + {"list": [], "nested": {"a": 1, "b": 2}, "nested_list": {"list2": [{"c": 3}, {"c": 4}]}} + {"list": [5, 6], "nested": {"a": 7}, "nested_list": {"list2": []}} + {"list": null, "nested": {"a": null}} + "#; + + let schema = Arc::new(Schema::new(vec![ + Field::new( + "list", + DataType::List(Box::new(Field::new("element", DataType::Int32, false))), + true, + ), + Field::new( + "nested", + DataType::Struct(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ]), + true, + ), + Field::new( + "nested_list", + DataType::Struct(vec![Field::new( + "list2", + DataType::List(Box::new(Field::new( + "element", + DataType::Struct(vec![Field::new("c", DataType::Int32, false)]), + false, + ))), + true, + )]), + true, + ), + ])); + + let batches = do_read(buf, 1024, schema); + assert_eq!(batches.len(), 1); + + let list = as_list_array(batches[0].column(0).as_ref()); + assert_eq!(list.value_offsets(), &[0, 0, 2, 2]); + assert_eq!(list.null_count(), 1); + assert!(list.is_null(4)); + let list_values = as_primitive_array::(list.values().as_ref()); + assert_eq!(list_values.values(), &[5, 6]); + + let nested = as_struct_array(batches[0].column(1).as_ref()); + let a = as_primitive_array::(nested.column(0).as_ref()); + assert_eq!(list.null_count(), 1); + assert_eq!(a.values(), &[1, 7, 0]); + assert!(list.is_null(2)); + + let b = as_primitive_array::(nested.column(1).as_ref()); + assert_eq!(b.null_count(), 2); + assert_eq!(b.len(), 3); + assert_eq!(b.value(0), 2); + assert!(b.is_null(1)); + assert!(b.is_null(2)); + + let nested_list = as_struct_array(batches[0].column(2).as_ref()); + let list2 = as_list_array(nested_list.column(0).as_ref()); + assert_eq!(list2.null_count(), 1); + assert_eq!(list2.value_offsets(), &[0, 2, 2, 2]); + assert!(list2.is_null(3)); + + let list2_values = as_struct_array(list2.values().as_ref()); + + let c = as_primitive_array::(list2_values.column(0)); + assert_eq!(c.values(), &[3, 4]); + } + + #[test] + fn test_projection() { + let buf = r#" + {"list": [], "nested": {"a": 1, "b": 2}, "nested_list": {"list2": [{"c": 3, "d": 5}, {"c": 4}]}} + {"list": [5, 6], "nested": {"a": 7}, "nested_list": {"list2": []}} + "#; + + let schema = Arc::new(Schema::new(vec![ + Field::new( + "nested", + DataType::Struct(vec![Field::new("a", DataType::Int32, false)]), + true, + ), + Field::new( + "nested_list", + DataType::Struct(vec![Field::new( + "list2", + DataType::List(Box::new(Field::new( + "element", + DataType::Struct(vec![Field::new("d", DataType::Int32, false)]), + false, + ))), + true, + )]), + true, + ), + ])); + + let batches = do_read(buf, 1024, schema); + assert_eq!(batches.len(), 1); + + let nested = as_struct_array(batches[0].column(0).as_ref()); + assert_eq!(nested.num_columns(), 1); + let a = 
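+        // Only field "a" (Int32) of the nested struct survives the projection.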
as_primitive_array::(nested.column(0).as_ref()); + assert_eq!(a.null_count(), 0); + assert_eq!(a.values(), &[1, 7]); + + let nested_list = as_struct_array(batches[0].column(1).as_ref()); + assert_eq!(nested_list.num_columns(), 1); + assert_eq!(nested_list.null_count(), 0); + + let list2 = as_list_array(nested_list.column(0).as_ref()); + assert_eq!(list2.value_offsets(), &[0, 2, 2]); + assert_eq!(list2.null_count(), 0); + + let child = as_struct_array(list2.values().as_ref()); + assert_eq!(child.num_columns(), 1); + assert_eq!(child.len(), 2); + assert_eq!(child.null_count(), 0); + + let c = as_primitive_array::(child.column(0).as_ref()); + assert_eq!(c.values(), &[5, 0]); + assert_eq!(c.null_count(), 1); + assert!(c.is_null(1)); + } + + #[test] + fn integration_test() { + let files = [ + "test/data/basic.json", + "test/data/basic_nulls.json", + "test/data/list_string_dict_nested_nulls.json", + ]; + + for file in files { + let mut f = BufReader::new(File::open(file).unwrap()); + let schema = Arc::new(infer_json_schema(&mut f, None).unwrap()); + + f.rewind().unwrap(); + let a = ReaderBuilder::new() + .with_schema(schema.clone()) + .build(&mut f) + .unwrap(); + let a_result = a.into_iter().collect::, _>>().unwrap(); + + f.rewind().unwrap(); + let b = RawReaderBuilder::new(schema).build(f).unwrap(); + let b_result = b.into_iter().collect::, _>>().unwrap(); + + assert_eq!(a_result, b_result); + } + } +} diff --git a/arrow-json/src/raw/primitive_array.rs b/arrow-json/src/raw/primitive_array.rs new file mode 100644 index 000000000000..72ce30203d01 --- /dev/null +++ b/arrow-json/src/raw/primitive_array.rs @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use num::NumCast; +use std::marker::PhantomData; + +use arrow_array::builder::PrimitiveBuilder; +use arrow_array::{Array, ArrowPrimitiveType}; +use arrow_cast::parse::Parser; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; + +use crate::raw::tape::{Tape, TapeElement}; +use crate::raw::{tape_error, ArrayDecoder}; + +pub struct PrimitiveArrayDecoder { + data_type: DataType, + // Invariant and Send + phantom: PhantomData P>, +} + +impl PrimitiveArrayDecoder

{ + pub fn new(data_type: DataType) -> Self { + Self { + data_type, + phantom: Default::default(), + } + } +} + +impl

<P> ArrayDecoder for PrimitiveArrayDecoder<P>

+where + P: ArrowPrimitiveType + Parser, + P::Native: NumCast, +{ + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { + let mut builder = PrimitiveBuilder::
<P>
::with_capacity(pos.len()) + .with_data_type(self.data_type.clone()); + + for p in pos { + match tape.get(*p) { + TapeElement::Null => builder.append_null(), + TapeElement::String(idx) => { + let s = tape.get_string(idx); + let value = P::parse(s).ok_or_else(|| { + ArrowError::JsonError(format!( + "failed to parse \"{s}\" as {}", + self.data_type + )) + })?; + + builder.append_value(value) + } + TapeElement::Number(idx) => { + let s = tape.get_string(idx); + let value = lexical_core::parse::(s.as_bytes()) + .ok() + .and_then(NumCast::from) + .ok_or_else(|| { + ArrowError::JsonError(format!( + "failed to parse {s} as {}", + self.data_type + )) + })?; + + builder.append_value(value) + } + d => return Err(tape_error(d, "primitive")), + } + } + + Ok(builder.finish().into_data()) + } +} diff --git a/arrow-json/src/raw/string_array.rs b/arrow-json/src/raw/string_array.rs new file mode 100644 index 000000000000..31a7a99bec03 --- /dev/null +++ b/arrow-json/src/raw/string_array.rs @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::builder::GenericStringBuilder; +use arrow_array::{Array, GenericStringArray, OffsetSizeTrait}; +use arrow_data::ArrayData; +use arrow_schema::ArrowError; +use std::marker::PhantomData; + +use crate::raw::tape::{Tape, TapeElement}; +use crate::raw::{tape_error, ArrayDecoder}; + +#[derive(Default)] +pub struct StringArrayDecoder { + phantom: PhantomData, +} + +impl ArrayDecoder for StringArrayDecoder { + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { + let mut data_capacity = 0; + for p in pos { + match tape.get(*p) { + TapeElement::String(idx) => { + data_capacity += tape.get_string(idx).len(); + } + TapeElement::Null => {} + d => return Err(tape_error(d, "string")), + } + } + + if O::from_usize(data_capacity).is_none() { + return Err(ArrowError::JsonError(format!( + "offset overflow decoding {}", + GenericStringArray::::DATA_TYPE + ))); + } + + let mut builder = + GenericStringBuilder::::with_capacity(pos.len(), data_capacity); + + for p in pos { + match tape.get(*p) { + TapeElement::String(idx) => { + builder.append_value(tape.get_string(idx)); + } + TapeElement::Null => builder.append_null(), + _ => unreachable!(), + } + } + + Ok(builder.finish().into_data()) + } +} diff --git a/arrow-json/src/raw/struct_array.rs b/arrow-json/src/raw/struct_array.rs new file mode 100644 index 000000000000..3b7895f37c7f --- /dev/null +++ b/arrow-json/src/raw/struct_array.rs @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::raw::tape::{Tape, TapeElement}; +use crate::raw::{make_decoder, tape_error, ArrayDecoder}; +use arrow_array::builder::BooleanBufferBuilder; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType, Field}; + +pub struct StructArrayDecoder { + data_type: DataType, + decoders: Vec>, + is_nullable: bool, +} + +impl StructArrayDecoder { + pub fn new(data_type: DataType, is_nullable: bool) -> Result { + let decoders = struct_fields(&data_type) + .iter() + .map(|f| make_decoder(f.data_type().clone(), f.is_nullable())) + .collect::, ArrowError>>()?; + + Ok(Self { + data_type, + decoders, + is_nullable, + }) + } +} + +impl ArrayDecoder for StructArrayDecoder { + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { + let fields = struct_fields(&self.data_type); + let mut child_pos: Vec<_> = + (0..fields.len()).map(|_| vec![0; pos.len()]).collect(); + + let mut null_count = 0; + let mut nulls = self + .is_nullable + .then(|| BooleanBufferBuilder::new(pos.len())); + + for (row, p) in pos.iter().enumerate() { + let end_idx = match (tape.get(*p), nulls.as_mut()) { + (TapeElement::StartObject(end_idx), None) => end_idx, + (TapeElement::StartObject(end_idx), Some(nulls)) => { + nulls.append(true); + end_idx + } + (TapeElement::Null, Some(nulls)) => { + nulls.append(false); + null_count += 1; + continue; + } + (d, _) => return Err(tape_error(d, "{")), + }; + + let mut cur_idx = *p + 1; + while cur_idx < end_idx { + // Read field name + let field_name = match tape.get(cur_idx) { + TapeElement::String(s) => tape.get_string(s), + d => return Err(tape_error(d, "field name")), + }; + + // Update child pos if match found + if let Some(field_idx) = + fields.iter().position(|x| x.name() == field_name) + { + child_pos[field_idx][row] = cur_idx + 1; + } + + // Advance to next field + cur_idx = match tape.get(cur_idx + 1) { + TapeElement::String(_) + | TapeElement::Number(_) + | TapeElement::True + | TapeElement::False + | TapeElement::Null => cur_idx + 2, + TapeElement::StartList(end_idx) => end_idx + 1, + TapeElement::StartObject(end_idx) => end_idx + 1, + d => return Err(tape_error(d, "field value")), + } + } + } + + let child_data = self + .decoders + .iter_mut() + .zip(child_pos) + .map(|(d, pos)| d.decode(tape, &pos)) + .collect::, ArrowError>>()?; + + // Sanity check + child_data + .iter() + .for_each(|x| assert_eq!(x.len(), pos.len())); + + let data = ArrayDataBuilder::new(self.data_type.clone()) + .len(pos.len()) + .null_count(null_count) + .null_bit_buffer(nulls.as_mut().map(|x| x.finish())) + .child_data(child_data); + + // Safety + // Validated lengths above + Ok(unsafe { data.build_unchecked() }) + } +} + +fn struct_fields(data_type: &DataType) -> &[Field] { + match &data_type { + DataType::Struct(f) => f, + _ => unreachable!(), + } +} diff --git a/arrow-json/src/raw/tape.rs b/arrow-json/src/raw/tape.rs new file mode 100644 index 000000000000..6ca4e2d3f521 --- 
/dev/null +++ b/arrow-json/src/raw/tape.rs @@ -0,0 +1,801 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_schema::ArrowError; +use std::fmt::{Display, Formatter}; + +/// We decode JSON to a flattened tape representation, +/// allowing for efficient traversal of the JSON data +/// +/// This approach is inspired by [simdjson] +/// +/// Uses `u32` for offsets to ensure `TapeElement` is 64-bits. A future +/// iteration may increase this to a custom `u56` type. +/// +/// [simdjson]: https://github.com/simdjson/simdjson/blob/master/doc/tape.md +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum TapeElement { + /// The start of an object, i.e. `{` + /// + /// Contains the offset of the corresponding [`Self::EndObject`] + StartObject(u32), + /// The end of an object, i.e. `}` + /// + /// Contains the offset of the corresponding [`Self::StartObject`] + EndObject(u32), + /// The start of a list , i.e. `[` + /// + /// Contains the offset of the corresponding [`Self::EndList`] + StartList(u32), + /// The end of a list , i.e. 
`]` + /// + /// Contains the offset of the corresponding [`Self::StartList`] + EndList(u32), + /// A string value + /// + /// Contains the offset into the [`Tape`] string data + String(u32), + /// A numeric value + /// + /// Contains the offset into the [`Tape`] string data + Number(u32), + /// A true literal + True, + /// A false literal + False, + /// A null literal + Null, +} + +impl Display for TapeElement { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + TapeElement::StartObject(_) => write!(f, "{{"), + TapeElement::EndObject(_) => write!(f, "}}"), + TapeElement::StartList(_) => write!(f, "["), + TapeElement::EndList(_) => write!(f, "]"), + TapeElement::String(_) => write!(f, "string"), + TapeElement::Number(_) => write!(f, "number"), + TapeElement::True => write!(f, "true"), + TapeElement::False => write!(f, "false"), + TapeElement::Null => write!(f, "null"), + } + } +} + +/// A decoded JSON tape +/// +/// String and numeric data is stored alongside an array of [`TapeElement`] +/// +/// The first element is always [`TapeElement::Null`] +/// +/// This approach to decoding JSON is inspired by [simdjson] +/// +/// [simdjson]: https://github.com/simdjson/simdjson/blob/master/doc/tape.md +#[derive(Debug)] +pub struct Tape<'a> { + elements: &'a [TapeElement], + strings: &'a str, + string_offsets: &'a [usize], + num_rows: usize, +} + +impl<'a> Tape<'a> { + /// Returns the string for the given string index + #[inline] + pub fn get_string(&self, idx: u32) -> &'a str { + let end_offset = self.string_offsets[idx as usize + 1]; + let start_offset = self.string_offsets[idx as usize]; + // SAFETY: + // Verified offsets + unsafe { self.strings.get_unchecked(start_offset..end_offset) } + } + + /// Returns the tape element at `idx` + pub fn get(&self, idx: u32) -> TapeElement { + self.elements[idx as usize] + } + + /// Returns the number of rows + pub fn num_rows(&self) -> usize { + self.num_rows + } +} + +/// States based on +#[derive(Debug, Copy, Clone)] +enum DecoderState { + /// Decoding an object + /// + /// Contains index of start [`TapeElement::StartObject`] + Object(u32), + /// Decoding a list + /// + /// Contains index of start [`TapeElement::StartList`] + List(u32), + String, + Value, + Number, + Colon, + Escape, + /// A unicode escape sequence, + /// + /// Consists of a `(low surrogate, high surrogate, decoded length)` + Unicode(u16, u16, u8), + /// A boolean or null literal + /// + /// Consists of `(literal, decoded length)` + Literal(Literal, u8), +} + +impl DecoderState { + fn as_str(&self) -> &'static str { + match self { + DecoderState::Object(_) => "object", + DecoderState::List(_) => "list", + DecoderState::String => "string", + DecoderState::Value => "value", + DecoderState::Number => "number", + DecoderState::Colon => "colon", + DecoderState::Escape => "escape", + DecoderState::Unicode(_, _, _) => "unicode literal", + DecoderState::Literal(d, _) => d.as_str(), + } + } +} + +#[derive(Debug, Copy, Clone)] +enum Literal { + Null, + True, + False, +} + +impl Literal { + fn element(&self) -> TapeElement { + match self { + Literal::Null => TapeElement::Null, + Literal::True => TapeElement::True, + Literal::False => TapeElement::False, + } + } + + fn as_str(&self) -> &'static str { + match self { + Literal::Null => "null", + Literal::True => "true", + Literal::False => "false", + } + } + + fn bytes(&self) -> &'static [u8] { + self.as_str().as_bytes() + } +} + +/// Evaluates to the next element in the iterator or breaks the current loop +macro_rules! 
next { + ($next:ident) => { + match $next.next() { + Some(b) => b, + None => break, + } + }; +} + +/// Implements a state machine for decoding JSON to a tape +pub struct TapeDecoder { + elements: Vec, + + num_rows: usize, + + /// Number of rows to read per batch + batch_size: usize, + + /// A buffer of parsed string data + /// + /// Note: if part way through a record, i.e. `stack` is not empty, + /// this may contain truncated UTF-8 data + bytes: Vec, + + /// Offsets into `data` + offsets: Vec, + + /// A stack of [`DecoderState`] + stack: Vec, +} + +impl TapeDecoder { + /// Create a new [`TapeDecoder`] with the provided batch size + /// and an estimated number of fields in each row + pub fn new(batch_size: usize, num_fields: usize) -> Self { + let tokens_per_row = 2 + num_fields * 2; + let mut offsets = Vec::with_capacity(batch_size * (num_fields * 2) + 1); + offsets.push(0); + + let mut elements = Vec::with_capacity(batch_size * tokens_per_row); + elements.push(TapeElement::Null); + + Self { + offsets, + elements, + batch_size, + num_rows: 0, + bytes: Vec::with_capacity(num_fields * 2 * 8), + stack: Vec::with_capacity(10), + } + } + + pub fn decode(&mut self, buf: &[u8]) -> Result { + if self.num_rows >= self.batch_size { + return Ok(0); + } + + let mut iter = BufIter::new(buf); + + while !iter.is_empty() { + match self.stack.last_mut() { + // Start of row + None => { + // Skip over leading whitespace + iter.skip_whitespace(); + match next!(iter) { + b'{' => { + let idx = self.elements.len() as u32; + self.stack.push(DecoderState::Object(idx)); + self.elements.push(TapeElement::StartObject(u32::MAX)); + } + b => return Err(err(b, "trimming leading whitespace")), + } + } + // Decoding an object + Some(DecoderState::Object(start_idx)) => { + iter.advance_until(|b| !json_whitespace(b) && b != b','); + match next!(iter) { + b'"' => { + self.stack.push(DecoderState::Value); + self.stack.push(DecoderState::Colon); + self.stack.push(DecoderState::String); + } + b'}' => { + let start_idx = *start_idx; + let end_idx = self.elements.len() as u32; + self.elements[start_idx as usize] = + TapeElement::StartObject(end_idx); + self.elements.push(TapeElement::EndObject(start_idx)); + self.stack.pop(); + self.num_rows += self.stack.is_empty() as usize; + if self.num_rows >= self.batch_size { + break; + } + } + b => return Err(err(b, "parsing object")), + } + } + // Decoding a list + Some(DecoderState::List(start_idx)) => { + iter.advance_until(|b| !json_whitespace(b) && b != b','); + match iter.peek() { + Some(b']') => { + iter.next(); + let start_idx = *start_idx; + let end_idx = self.elements.len() as u32; + self.elements[start_idx as usize] = + TapeElement::StartList(end_idx); + self.elements.push(TapeElement::EndList(start_idx)); + self.stack.pop(); + } + Some(_) => self.stack.push(DecoderState::Value), + None => break, + } + } + // Decoding a string + Some(DecoderState::String) => { + let s = iter.advance_until(|b| matches!(b, b'\\' | b'"')); + self.bytes.extend_from_slice(s); + + match next!(iter) { + b'\\' => self.stack.push(DecoderState::Escape), + b'"' => { + let idx = self.offsets.len() - 1; + self.elements.push(TapeElement::String(idx as _)); + self.offsets.push(self.bytes.len()); + self.stack.pop(); + } + b => unreachable!("{}", b), + } + } + Some(state @ DecoderState::Value) => { + iter.skip_whitespace(); + *state = match next!(iter) { + b'"' => DecoderState::String, + b @ b'-' | b @ b'0'..=b'9' => { + self.bytes.push(b); + DecoderState::Number + } + b'n' => 
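+                        // 'n' begins the "null" literal; the Literal state checks the remaining bytes.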
DecoderState::Literal(Literal::Null, 1), + b'f' => DecoderState::Literal(Literal::False, 1), + b't' => DecoderState::Literal(Literal::True, 1), + b'[' => { + let idx = self.elements.len() as u32; + self.elements.push(TapeElement::StartList(u32::MAX)); + DecoderState::List(idx) + } + b'{' => { + let idx = self.elements.len() as u32; + self.elements.push(TapeElement::StartObject(u32::MAX)); + DecoderState::Object(idx) + } + b => return Err(err(b, "parsing value")), + }; + } + Some(DecoderState::Number) => { + let s = iter.advance_until(|b| { + !matches!(b, b'0'..=b'9' | b'-' | b'+' | b'.' | b'e' | b'E') + }); + self.bytes.extend_from_slice(s); + + if !iter.is_empty() { + self.stack.pop(); + let idx = self.offsets.len() - 1; + self.elements.push(TapeElement::Number(idx as _)); + self.offsets.push(self.bytes.len()); + } + } + Some(DecoderState::Colon) => { + iter.skip_whitespace(); + match next!(iter) { + b':' => self.stack.pop(), + b => return Err(err(b, "parsing colon")), + }; + } + Some(DecoderState::Literal(literal, idx)) => { + let bytes = literal.bytes(); + let expected = bytes.iter().skip(*idx as usize).copied(); + for (expected, b) in expected.zip(&mut iter) { + match b == expected { + true => *idx += 1, + false => return Err(err(b, "parsing literal")), + } + } + if *idx == bytes.len() as u8 { + let element = literal.element(); + self.stack.pop(); + self.elements.push(element); + } + } + Some(DecoderState::Escape) => { + let v = match next!(iter) { + b'u' => { + self.stack.pop(); + self.stack.push(DecoderState::Unicode(0, 0, 0)); + continue; + } + b'"' => b'"', + b'\\' => b'\\', + b'/' => b'/', + b'b' => 8, // BS + b'f' => 12, // FF + b'n' => b'\n', + b'r' => b'\r', + b't' => b'\t', + b => return Err(err(b, "parsing escape sequence")), + }; + + self.stack.pop(); + self.bytes.push(v); + } + // Parse a unicode escape sequence + Some(DecoderState::Unicode(high, low, idx)) => loop { + match *idx { + 0..=3 => *high = *high << 4 | parse_hex(next!(iter))? as u16, + 4 => { + if let Some(c) = char::from_u32(*high as u32) { + write_char(c, &mut self.bytes); + self.stack.pop(); + break; + } + + match next!(iter) { + b'\\' => {} + b => return Err(err(b, "parsing surrogate pair escape")), + } + } + 5 => match next!(iter) { + b'u' => {} + b => return Err(err(b, "parsing surrogate pair unicode")), + }, + 6..=9 => *low = *low << 4 | parse_hex(next!(iter))? 
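+                        // Hex digits 6..=9 accumulate the second \uXXXX escape (the low surrogate).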
as u16, + _ => { + let c = char_from_surrogate_pair(*low, *high)?; + write_char(c, &mut self.bytes); + self.stack.pop(); + break; + } + } + *idx += 1; + }, + } + } + + Ok(buf.len() - iter.len()) + } + + /// Finishes the current [`Tape`] + pub fn finish(&self) -> Result, ArrowError> { + if let Some(b) = self.stack.last() { + return Err(ArrowError::JsonError(format!( + "Truncated record whilst reading {}", + b.as_str() + ))); + } + + if self.offsets.len() >= u32::MAX as usize { + return Err(ArrowError::JsonError(format!("Encountered more than {} bytes of string data, consider using a smaller batch size", u32::MAX))); + } + + if self.offsets.len() >= u32::MAX as usize { + return Err(ArrowError::JsonError(format!("Encountered more than {} JSON elements, consider using a smaller batch size", u32::MAX))); + } + + // Sanity check + assert_eq!( + self.offsets.last().copied().unwrap_or_default(), + self.bytes.len() + ); + + let strings = std::str::from_utf8(&self.bytes).map_err(|_| { + ArrowError::JsonError("Encountered non-UTF-8 data".to_string()) + })?; + + for offset in self.offsets.iter().copied() { + if !strings.is_char_boundary(offset) { + return Err(ArrowError::JsonError( + "Encountered truncated UTF-8 sequence".to_string(), + )); + } + } + + Ok(Tape { + strings, + elements: &self.elements, + string_offsets: &self.offsets, + num_rows: self.num_rows, + }) + } + + /// Clears this [`TapeDecoder`] in preparation to read the next batch + pub fn clear(&mut self) { + assert!(self.stack.is_empty()); + + self.num_rows = 0; + self.bytes.clear(); + self.elements.clear(); + self.elements.push(TapeElement::Null); + self.offsets.clear(); + self.offsets.push(0); + } +} + +/// A wrapper around a slice iterator that provides some helper functionality +struct BufIter<'a>(std::slice::Iter<'a, u8>); + +impl<'a> BufIter<'a> { + fn new(buf: &'a [u8]) -> Self { + Self(buf.iter()) + } + + fn as_slice(&self) -> &'a [u8] { + self.0.as_slice() + } + + fn is_empty(&self) -> bool { + self.0.len() == 0 + } + + fn peek(&self) -> Option { + self.0.as_slice().first().copied() + } + + fn advance(&mut self, skip: usize) { + for _ in 0..skip { + self.0.next(); + } + } + + fn advance_until bool>(&mut self, f: F) -> &[u8] { + let s = self.as_slice(); + match s.iter().copied().position(f) { + Some(x) => { + self.advance(x); + &s[..x] + } + None => { + self.advance(s.len()); + s + } + } + } + + fn skip_whitespace(&mut self) { + self.advance_until(|b| !json_whitespace(b)); + } +} + +impl<'a> Iterator for BufIter<'a> { + type Item = u8; + + fn next(&mut self) -> Option { + self.0.next().copied() + } + + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } +} + +impl<'a> ExactSizeIterator for BufIter<'a> {} + +/// Returns an error for a given byte `b` and context `ctx` +fn err(b: u8, ctx: &str) -> ArrowError { + ArrowError::JsonError(format!( + "Encountered unexpected '{}' whilst {ctx}", + b as char + )) +} + +/// Creates a character from an UTF-16 surrogate pair +fn char_from_surrogate_pair(low: u16, high: u16) -> Result { + let n = (((high - 0xD800) as u32) << 10 | (low - 0xDC00) as u32) + 0x1_0000; + char::from_u32(n).ok_or_else(|| { + ArrowError::JsonError(format!("Invalid UTF-16 surrogate pair {}", n)) + }) +} + +/// Writes `c` as UTF-8 to `out` +fn write_char(c: char, out: &mut Vec) { + let mut t = [0; 4]; + out.extend_from_slice(c.encode_utf8(&mut t).as_bytes()); +} + +/// Evaluates to true if `b` is a valid JSON whitespace character +#[inline] +fn json_whitespace(b: u8) -> bool { + matches!(b, b' ' | b'\n' | 
b'\r' | b'\t') +} + +/// Parse a hex character to `u8` +fn parse_hex(b: u8) -> Result { + let digit = char::from(b) + .to_digit(16) + .ok_or_else(|| err(b, "unicode escape"))?; + Ok(digit as u8) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_sizes() { + assert_eq!(std::mem::size_of::(), 8); + assert_eq!(std::mem::size_of::(), 8); + } + + #[test] + fn test_basic() { + let a = r#" + {"hello": "world", "foo": 2, "bar": 45} + + {"foo": "bar"} + + {"fiz": null} + + {"a": true, "b": false, "c": null} + + {"a": "", "": "a"} + + {"a": "b", "object": {"nested": "hello", "foo": 23}, "b": {}, "c": {"foo": null }} + + {"a": ["", "foo", ["bar", "c"]], "b": {"1": []}, "c": {"2": [1, 2, 3]} } + "#; + let mut decoder = TapeDecoder::new(16, 2); + decoder.decode(a.as_bytes()).unwrap(); + + let finished = decoder.finish().unwrap(); + assert_eq!( + finished.elements, + &[ + TapeElement::Null, + TapeElement::StartObject(8), // {"hello": "world", "foo": 2, "bar": 45} + TapeElement::String(0), // "hello" + TapeElement::String(1), // "world" + TapeElement::String(2), // "foo" + TapeElement::Number(3), // 2 + TapeElement::String(4), // "bar" + TapeElement::Number(5), // 45 + TapeElement::EndObject(1), + TapeElement::StartObject(12), // {"foo": "bar"} + TapeElement::String(6), // "foo" + TapeElement::String(7), // "bar" + TapeElement::EndObject(9), + TapeElement::StartObject(16), // {"fiz": null} + TapeElement::String(8), // "fiz + TapeElement::Null, // null + TapeElement::EndObject(13), + TapeElement::StartObject(24), // {"a": true, "b": false, "c": null} + TapeElement::String(9), // "a" + TapeElement::True, // true + TapeElement::String(10), // "b" + TapeElement::False, // false + TapeElement::String(11), // "c" + TapeElement::Null, // null + TapeElement::EndObject(17), + TapeElement::StartObject(30), // {"a": "", "": "a"} + TapeElement::String(12), // "a" + TapeElement::String(13), // "" + TapeElement::String(14), // "" + TapeElement::String(15), // "a" + TapeElement::EndObject(25), + TapeElement::StartObject(49), // {"a": "b", "object": {"nested": "hello", "foo": 23}, "b": {}, "c": {"foo": null }} + TapeElement::String(16), // "a" + TapeElement::String(17), // "b" + TapeElement::String(18), // "object" + TapeElement::StartObject(40), // {"nested": "hello", "foo": 23} + TapeElement::String(19), // "nested" + TapeElement::String(20), // "hello" + TapeElement::String(21), // "foo" + TapeElement::Number(22), // 23 + TapeElement::EndObject(35), + TapeElement::String(23), // "b" + TapeElement::StartObject(43), // {} + TapeElement::EndObject(42), + TapeElement::String(24), // "c" + TapeElement::StartObject(48), // {"foo": null } + TapeElement::String(25), // "foo" + TapeElement::Null, // null + TapeElement::EndObject(45), + TapeElement::EndObject(31), + TapeElement::StartObject(75), // {"a": ["", "foo", ["bar", "c"]], "b": {"1": []}, "c": {"2": [1, 2, 3]} } + TapeElement::String(26), // "a" + TapeElement::StartList(59), // ["", "foo", ["bar", "c"]] + TapeElement::String(27), // "" + TapeElement::String(28), // "foo" + TapeElement::StartList(58), // ["bar", "c"] + TapeElement::String(29), // "bar" + TapeElement::String(30), // "c" + TapeElement::EndList(55), + TapeElement::EndList(52), + TapeElement::String(31), // "b" + TapeElement::StartObject(65), // {"1": []} + TapeElement::String(32), // "1" + TapeElement::StartList(64), // [] + TapeElement::EndList(63), + TapeElement::EndObject(61), + TapeElement::String(33), // "c" + TapeElement::StartObject(74), // {"2": [1, 2, 3]} + 
TapeElement::String(34), // "2" + TapeElement::StartList(73), // [1, 2, 3] + TapeElement::Number(35), // 1 + TapeElement::Number(36), // 2 + TapeElement::Number(37), // 3 + TapeElement::EndList(69), + TapeElement::EndObject(67), + TapeElement::EndObject(50) + ] + ); + + assert_eq!( + finished.strings, + "helloworldfoo2bar45foobarfizabcaaabobjectnestedhellofoo23bcfooafoobarcb1c2123" + ); + assert_eq!( + &finished.string_offsets, + &[ + 0, 5, 10, 13, 14, 17, 19, 22, 25, 28, 29, 30, 31, 32, 32, 32, 33, 34, 35, + 41, 47, 52, 55, 57, 58, 59, 62, 63, 63, 66, 69, 70, 71, 72, 73, 74, 75, + 76, 77 + ] + ) + } + + #[test] + fn test_invalid() { + // Test invalid + let mut decoder = TapeDecoder::new(16, 2); + let err = decoder.decode(b"hello").unwrap_err().to_string(); + assert_eq!( + err, + "Json error: Encountered unexpected 'h' whilst trimming leading whitespace" + ); + + let mut decoder = TapeDecoder::new(16, 2); + let err = decoder.decode(b"{\"hello\": }").unwrap_err().to_string(); + assert_eq!( + err, + "Json error: Encountered unexpected '}' whilst parsing value" + ); + + let mut decoder = TapeDecoder::new(16, 2); + let err = decoder + .decode(b"{\"hello\": [ false, tru ]}") + .unwrap_err() + .to_string(); + assert_eq!( + err, + "Json error: Encountered unexpected ' ' whilst parsing literal" + ); + + let mut decoder = TapeDecoder::new(16, 2); + let err = decoder + .decode(b"{\"hello\": \"\\ud8\"}") + .unwrap_err() + .to_string(); + assert_eq!( + err, + "Json error: Encountered unexpected '\"' whilst unicode escape" + ); + + // Missing surrogate pair + let mut decoder = TapeDecoder::new(16, 2); + let err = decoder + .decode(b"{\"hello\": \"\\ud83d\"}") + .unwrap_err() + .to_string(); + assert_eq!( + err, + "Json error: Encountered unexpected '\"' whilst parsing surrogate pair escape" + ); + + // Test truncation + let mut decoder = TapeDecoder::new(16, 2); + decoder.decode(b"{\"he").unwrap(); + let err = decoder.finish().unwrap_err().to_string(); + assert_eq!(err, "Json error: Truncated record whilst reading string"); + + let mut decoder = TapeDecoder::new(16, 2); + decoder.decode(b"{\"hello\" : ").unwrap(); + let err = decoder.finish().unwrap_err().to_string(); + assert_eq!(err, "Json error: Truncated record whilst reading value"); + + let mut decoder = TapeDecoder::new(16, 2); + decoder.decode(b"{\"hello\" : [").unwrap(); + let err = decoder.finish().unwrap_err().to_string(); + assert_eq!(err, "Json error: Truncated record whilst reading list"); + + let mut decoder = TapeDecoder::new(16, 2); + decoder.decode(b"{\"hello\" : tru").unwrap(); + let err = decoder.finish().unwrap_err().to_string(); + assert_eq!(err, "Json error: Truncated record whilst reading true"); + + let mut decoder = TapeDecoder::new(16, 2); + decoder.decode(b"{\"hello\" : nu").unwrap(); + let err = decoder.finish().unwrap_err().to_string(); + assert_eq!(err, "Json error: Truncated record whilst reading null"); + + // Test invalid UTF-8 + let mut decoder = TapeDecoder::new(16, 2); + decoder.decode(b"{\"hello\" : \"world\xFF\"}").unwrap(); + let err = decoder.finish().unwrap_err().to_string(); + assert_eq!(err, "Json error: Encountered non-UTF-8 data"); + + let mut decoder = TapeDecoder::new(16, 2); + decoder.decode(b"{\"\xe2\" : \"\x96\xa1\"}").unwrap(); + let err = decoder.finish().unwrap_err().to_string(); + assert_eq!(err, "Json error: Encountered truncated UTF-8 sequence"); + } +} diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index 64a1b53199bc..c2647ebfc18c 100644 --- a/arrow-json/src/reader.rs +++ 
b/arrow-json/src/reader.rs @@ -563,6 +563,9 @@ where /// converts them to [`RecordBatch`]es. To decode JSON formatted files, /// see [`Reader`]. /// +/// Note: Consider instead using [`RawDecoder`] which is faster and will +/// eventually replace this implementation as part of [#3610] +/// /// # Examples /// ``` /// use arrow_json::reader::{Decoder, DecoderOptions, ValueIter, infer_json_schema}; @@ -584,6 +587,9 @@ where /// assert_eq!(4, batch.num_rows()); /// assert_eq!(4, batch.num_columns()); /// ``` +/// +/// [`RawDecoder`]: crate::raw::RawDecoder +/// [#3610]: https://github.com/apache/arrow-rs/issues/3610 #[derive(Debug)] pub struct Decoder { /// Explicit schema for the JSON file @@ -1607,6 +1613,12 @@ fn flatten_json_string_values(values: &[Value]) -> Vec> { .collect::>>() } /// JSON file reader +/// +/// Note: Consider instead using [`RawReader`] which is faster and will +/// eventually replace this implementation as part of [#3610] +/// +/// [`RawReader`]: crate::raw::RawReader +/// [#3610]: https://github.com/apache/arrow-rs/issues/3610 #[derive(Debug)] pub struct Reader { reader: BufReader, @@ -1652,6 +1664,13 @@ impl Reader { } /// JSON file reader builder +/// +/// Note: Consider instead using [`RawReaderBuilder`] which is faster and will +/// eventually replace this implementation as part of [#3610] +/// +/// [`RawReaderBuilder`]: crate::raw::RawReaderBuilder +/// [#3610]: https://github.com/apache/arrow-rs/issues/3610 +/// #[derive(Debug, Default)] pub struct ReaderBuilder { /// Optional schema for the JSON file diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index ee926ee52868..bb67bfc400e1 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -207,7 +207,7 @@ required-features = ["test_utils", "csv"] [[bench]] name = "json_reader" harness = false -required-features = ["json"] +required-features = ["test_utils", "json"] [[bench]] name = "equal" diff --git a/arrow/benches/json_reader.rs b/arrow/benches/json_reader.rs index 7bc3f4179fef..b5d8a53679ef 100644 --- a/arrow/benches/json_reader.rs +++ b/arrow/benches/json_reader.rs @@ -18,18 +18,50 @@ use criterion::*; use arrow::datatypes::*; -use arrow_json::ReaderBuilder; +use arrow::util::bench_util::{ + create_primitive_array, create_string_array, create_string_array_with_len, +}; +use arrow_array::RecordBatch; +use arrow_json::RawReaderBuilder; +use arrow_json::{LineDelimitedWriter, ReaderBuilder}; use std::io::Cursor; use std::sync::Arc; -fn json_primitive_to_record_batch() { +fn do_bench(c: &mut Criterion, name: &str, json: &str, schema: SchemaRef) { + c.bench_function(&format!("{name} (basic)"), |b| { + b.iter(|| { + let cursor = Cursor::new(black_box(json)); + let builder = ReaderBuilder::new() + .with_schema(schema.clone()) + .with_batch_size(64); + + let mut reader = builder.build(cursor).unwrap(); + while let Some(next) = reader.next().transpose() { + next.unwrap(); + } + }) + }); + + c.bench_function(&format!("{name} (raw)"), |b| { + b.iter(|| { + let cursor = Cursor::new(black_box(json)); + let builder = RawReaderBuilder::new(schema.clone()).with_batch_size(64); + let reader = builder.build(cursor).unwrap(); + for next in reader { + next.unwrap(); + } + }) + }); +} + +fn small_bench_primitive(c: &mut Criterion) { let schema = Arc::new(Schema::new(vec![ Field::new("c1", DataType::Utf8, true), Field::new("c2", DataType::Float64, true), Field::new("c3", DataType::UInt32, true), Field::new("c4", DataType::Boolean, true), ])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); + let 
json_content = r#" {"c1": "eleven", "c2": 6.2222222225, "c3": 5.0, "c4": false} {"c1": "twelve", "c2": -55555555555555.2, "c3": 3} @@ -42,15 +74,45 @@ fn json_primitive_to_record_batch() { {"c2": -35, "c3": 100.0, "c4": true} {"c1": "fifteen", "c2": null, "c4": true} "#; - let cursor = Cursor::new(json_content); - let mut reader = builder.build(cursor).unwrap(); - #[allow(clippy::unit_arg)] - criterion::black_box({ - reader.next().unwrap(); - }); + + do_bench(c, "small_bench_primitive", json_content, schema) } -fn json_list_primitive_to_record_batch() { +fn large_bench_primitive(c: &mut Criterion) { + let schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Utf8, true), + Field::new("c2", DataType::Int32, true), + Field::new("c3", DataType::UInt32, true), + Field::new("c4", DataType::Utf8, true), + Field::new("c5", DataType::Utf8, true), + Field::new("c6", DataType::Float32, true), + ])); + + let c1 = Arc::new(create_string_array::(4096, 0.)); + let c2 = Arc::new(create_primitive_array::(4096, 0.)); + let c3 = Arc::new(create_primitive_array::(4096, 0.)); + let c4 = Arc::new(create_string_array_with_len::(4096, 0.2, 10)); + let c5 = Arc::new(create_string_array_with_len::(4096, 0.2, 20)); + let c6 = Arc::new(create_primitive_array::(4096, 0.2)); + + let batch = RecordBatch::try_from_iter([ + ("c1", c1 as _), + ("c2", c2 as _), + ("c3", c3 as _), + ("c4", c4 as _), + ("c5", c5 as _), + ("c6", c6 as _), + ]) + .unwrap(); + + let mut out = Vec::with_capacity(1024); + LineDelimitedWriter::new(&mut out).write(batch).unwrap(); + + let json = std::str::from_utf8(&out).unwrap(); + do_bench(c, "large_bench_primitive", json, schema) +} + +fn small_bench_list(c: &mut Criterion) { let schema = Arc::new(Schema::new(vec![ Field::new( "c1", @@ -73,8 +135,7 @@ fn json_list_primitive_to_record_batch() { true, ), ])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let json_content = r#" + let json = r#" {"c1": ["eleven"], "c2": [6.2222222225, -3.2, null], "c3": [5.0, 6], "c4": [false, true]} {"c1": ["twelve"], "c2": [-55555555555555.2, 12500000.0], "c3": [3, 4, 5]} {"c1": null, "c2": [3], "c3": [125, 127, 129], "c4": [null, false, true]} @@ -88,21 +149,13 @@ fn json_list_primitive_to_record_batch() { {"c1": ["fifteen"], "c2": [null, 2.1, 1.5, -3], "c4": [true, false, null]} {"c1": ["fifteen"], "c2": [], "c4": [true, false, null]} "#; - let cursor = Cursor::new(json_content); - let mut reader = builder.build(cursor).unwrap(); - #[allow(clippy::unit_arg)] - criterion::black_box({ - reader.next().unwrap(); - }); + do_bench(c, "small_bench_list", json, schema) } fn criterion_benchmark(c: &mut Criterion) { - c.bench_function("json_primitive_to_record_batch", |b| { - b.iter(json_primitive_to_record_batch) - }); - c.bench_function("json_list_primitive_to_record_batch", |b| { - b.iter(json_list_primitive_to_record_batch) - }); + small_bench_primitive(c); + large_bench_primitive(c); + small_bench_list(c); } criterion_group!(benches, criterion_benchmark); From 9728c676b50b19c06643a23daba4aa4a1dc48055 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 26 Jan 2023 20:42:40 -0800 Subject: [PATCH 0552/1411] Fix clippy (#3612) --- arrow-arith/src/arithmetic.rs | 4 +- arrow-arith/src/temporal.rs | 6 +- arrow-array/src/array/binary_array.rs | 2 +- arrow-array/src/array/boolean_array.rs | 12 +-- arrow-array/src/array/dictionary_array.rs | 10 +- .../src/array/fixed_size_binary_array.rs | 2 +- .../src/array/fixed_size_list_array.rs | 3 +- arrow-array/src/array/list_array.rs | 
2 +- arrow-array/src/array/mod.rs | 6 +- arrow-array/src/array/null_array.rs | 2 +- arrow-array/src/array/primitive_array.rs | 42 ++++---- arrow-array/src/array/run_array.rs | 10 +- arrow-array/src/array/string_array.rs | 6 +- arrow-array/src/array/struct_array.rs | 4 +- arrow-array/src/array/union_array.rs | 8 +- arrow-array/src/builder/boolean_builder.rs | 2 +- .../src/builder/fixed_size_binary_builder.rs | 3 +- arrow-array/src/builder/primitive_builder.rs | 2 +- arrow-array/src/builder/struct_builder.rs | 2 +- arrow-array/src/record_batch.rs | 5 +- arrow-array/src/timezone.rs | 8 +- arrow-array/src/types.rs | 4 +- arrow-buffer/src/alloc/mod.rs | 2 +- arrow-buffer/src/bigint.rs | 42 ++------ arrow-buffer/src/buffer/scalar.rs | 3 +- arrow-cast/src/cast.rs | 95 +++++++------------ arrow-cast/src/display.rs | 5 +- arrow-cast/src/parse.rs | 13 +-- arrow-csv/src/lib.rs | 8 +- arrow-csv/src/reader/mod.rs | 48 ++++------ arrow-csv/src/reader/records.rs | 5 +- arrow-csv/src/writer.rs | 6 +- arrow-data/src/data.rs | 37 +++----- arrow-data/src/decimal.rs | 18 ++-- arrow-data/src/transform/mod.rs | 2 +- arrow-flight/examples/flight_sql_server.rs | 5 +- arrow-flight/src/error.rs | 2 +- arrow-flight/src/lib.rs | 23 +++-- arrow-flight/src/sql/client.rs | 16 ++-- arrow-flight/src/sql/mod.rs | 2 +- arrow-flight/src/sql/server.rs | 4 +- arrow-flight/src/utils.rs | 2 +- arrow-flight/tests/client.rs | 19 +--- arrow-integration-test/src/datatype.rs | 10 +- arrow-integration-test/src/field.rs | 11 +-- arrow-integration-test/src/lib.rs | 41 +++----- .../src/bin/arrow-json-integration-test.rs | 6 +- .../auth_basic_proto.rs | 8 +- .../integration_test.rs | 4 +- .../src/flight_client_scenarios/middleware.rs | 8 +- .../src/flight_server_scenarios.rs | 2 +- .../integration_test.rs | 31 +++--- arrow-integration-testing/src/lib.rs | 3 +- arrow-integration-testing/tests/ipc_reader.rs | 15 +-- arrow-integration-testing/tests/ipc_writer.rs | 12 +-- arrow-ipc/src/compression.rs | 3 +- arrow-ipc/src/convert.rs | 17 ++-- arrow-ipc/src/reader.rs | 28 +++--- arrow-ipc/src/writer.rs | 8 +- arrow-json/src/raw/mod.rs | 4 +- arrow-json/src/raw/tape.rs | 2 +- arrow-json/src/reader.rs | 56 ++++------- arrow-json/src/writer.rs | 3 +- arrow-ord/src/comparison.rs | 2 +- arrow-ord/src/ord.rs | 12 +-- arrow-ord/src/sort.rs | 13 +-- arrow-row/src/lib.rs | 3 +- arrow-schema/src/datatype.rs | 2 +- arrow-schema/src/error.rs | 22 ++--- arrow-schema/src/field.rs | 2 +- arrow-schema/src/schema.rs | 10 +- arrow-select/src/concat.rs | 6 +- arrow-select/src/take.rs | 6 +- arrow-string/src/concat_elements.rs | 3 +- arrow-string/src/length.rs | 6 +- arrow-string/src/like.rs | 32 +++---- arrow-string/src/regexp.rs | 17 ++-- arrow-string/src/substring.rs | 3 +- arrow/benches/arithmetic_kernels.rs | 28 +++--- arrow/benches/csv_reader.rs | 2 +- arrow/benches/interleave_kernels.rs | 7 +- arrow/benches/lexsort.rs | 13 +-- arrow/benches/row_format.rs | 6 +- arrow/benches/string_dictionary_builder.rs | 5 +- arrow/examples/builders.rs | 12 +-- arrow/examples/collect.rs | 6 +- arrow/examples/tensor_builder.rs | 6 +- arrow/src/datatypes/ffi.rs | 27 +++--- arrow/src/ffi.rs | 30 ++---- arrow/src/ffi_stream.rs | 3 +- arrow/src/util/data_gen.rs | 12 +-- arrow/src/util/pretty.rs | 24 ++--- arrow/src/util/test_util.rs | 4 +- object_store/src/aws/credential.rs | 14 +-- object_store/src/aws/mod.rs | 17 ++-- object_store/src/azure/client.rs | 8 +- object_store/src/azure/credential.rs | 4 +- object_store/src/azure/mod.rs | 2 +- object_store/src/client/backoff.rs | 2 
+- object_store/src/client/retry.rs | 4 +- object_store/src/gcp/credential.rs | 3 +- object_store/src/gcp/mod.rs | 16 ++-- object_store/src/lib.rs | 16 ++-- object_store/src/local.rs | 9 +- object_store/src/memory.rs | 5 +- object_store/src/multipart.rs | 2 +- object_store/src/path/mod.rs | 44 +++------ parquet/benches/arrow_reader.rs | 4 +- parquet/examples/async_read_parquet.rs | 2 +- parquet/examples/read_parquet.rs | 2 +- parquet/src/arrow/arrow_reader/mod.rs | 29 +++--- parquet/src/arrow/arrow_writer/mod.rs | 17 ++-- parquet/src/arrow/async_reader/metadata.rs | 3 +- parquet/src/arrow/async_reader/mod.rs | 22 ++--- parquet/src/arrow/async_reader/store.rs | 8 +- parquet/src/arrow/buffer/bit_util.rs | 2 +- parquet/src/arrow/decoder/delta_byte_array.rs | 3 +- parquet/src/arrow/schema/mod.rs | 2 +- parquet/src/basic.rs | 18 ++-- parquet/src/bin/parquet-fromcsv.rs | 40 ++++---- parquet/src/bin/parquet-index.rs | 9 +- parquet/src/bin/parquet-read.rs | 2 +- parquet/src/bin/parquet-rowcount.rs | 2 +- parquet/src/bin/parquet-schema.rs | 2 +- parquet/src/bin/parquet-show-bloom-filter.rs | 2 +- parquet/src/bloom_filter/mod.rs | 12 +-- parquet/src/column/page.rs | 3 +- parquet/src/column/reader/decoder.rs | 4 +- parquet/src/column/writer/mod.rs | 24 ++--- parquet/src/encodings/levels.rs | 6 +- parquet/src/errors.rs | 14 +-- parquet/src/file/footer.rs | 2 +- parquet/src/file/metadata.rs | 2 +- parquet/src/file/properties.rs | 3 +- parquet/src/file/serialized_reader.rs | 2 +- parquet/src/file/statistics.rs | 37 ++++---- parquet/src/file/writer.rs | 2 +- parquet/src/record/api.rs | 44 ++++----- parquet/src/record/reader.rs | 25 ++--- parquet/src/record/triplet.rs | 3 +- parquet/src/schema/printer.rs | 46 ++++----- parquet/src/schema/types.rs | 49 +++++----- parquet/src/schema/visitor.rs | 2 +- parquet/src/util/bit_pack.rs | 8 +- parquet/src/util/bit_util.rs | 3 +- parquet/src/util/test_common/rand_gen.rs | 2 +- parquet/tests/arrow_writer_layout.rs | 14 +-- 147 files changed, 707 insertions(+), 1013 deletions(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 8a4657d7e668..48f0412bf8c7 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -2063,7 +2063,7 @@ mod tests { let e = add(&a, &b).expect_err("should have failed due to different lengths"); assert_eq!( "ComputeError(\"Cannot perform binary operation on arrays of different length\")", - format!("{:?}", e) + format!("{e:?}") ); } @@ -2238,7 +2238,7 @@ mod tests { let e = divide_scalar_dyn::(&a, 0_i32) .expect_err("should have failed due to divide by zero"); - assert_eq!("DivideByZero", format!("{:?}", e)); + assert_eq!("DivideByZero", format!("{e:?}")); } #[test] diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index 5dcda8758dc9..ac76358ef2dd 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -447,7 +447,7 @@ where let values = time_fraction_dyn(array.values(), name, op)?; Ok(Arc::new(array.with_values(&values))) } - dt => return_compute_error_with!(format!("{} does not support", name), dt), + dt => return_compute_error_with!(format!("{name} does not support"), dt), ) } _ => { @@ -456,7 +456,7 @@ where time_fraction_internal(array, name, op) .map(|a| Arc::new(a) as ArrayRef) } - dt => return_compute_error_with!(format!("{} does not support", name), dt), + dt => return_compute_error_with!(format!("{name} does not support"), dt), ) } } @@ -486,7 +486,7 @@ where }) } _ => return_compute_error_with!( - format!("{} does not support", name), + 
format!("{name} does not support"), array.data_type() ), } diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index cb863c563584..50757dcbe1b6 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -607,7 +607,7 @@ mod tests { .scan(0usize, |pos, i| { if *pos < 10 { *pos += 1; - Some(Some(format!("value {}", i))) + Some(Some(format!("value {i}"))) } else { // actually returns up to 10 values None diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 920fdabc2c71..4c83dcf411d4 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -407,7 +407,7 @@ mod tests { let arr = BooleanArray::from(vec![true, false, false]); assert_eq!( "BooleanArray\n[\n true,\n false,\n false,\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } @@ -420,7 +420,7 @@ mod tests { let arr = builder.finish(); assert_eq!( "BooleanArray\n[\n true,\n null,\n false,\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } @@ -435,7 +435,7 @@ mod tests { for i in 0..4 { assert!(!arr.is_null(i)); assert!(arr.is_valid(i)); - assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {}", i) + assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {i}") } } @@ -454,7 +454,7 @@ mod tests { } else { assert!(!arr.is_null(i)); assert!(arr.is_valid(i)); - assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {}", i) + assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {i}") } } } @@ -470,7 +470,7 @@ mod tests { for i in 0..3 { assert!(!arr.is_null(i)); assert!(arr.is_valid(i)); - assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {}", i) + assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {i}") } } @@ -510,7 +510,7 @@ mod tests { assert_eq!(2, arr.offset()); assert_eq!(0, arr.null_count()); for i in 0..3 { - assert_eq!(i != 0, arr.value(i), "failed at {}", i); + assert_eq!(i != 0, arr.value(i), "failed at {i}"); } } diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index fb2868c2778f..eb2f1b606bb1 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -725,7 +725,7 @@ mod tests { let array = builder.finish(); assert_eq!( "DictionaryArray {keys: PrimitiveArray\n[\n 0,\n null,\n 1,\n] values: PrimitiveArray\n[\n 12345678,\n 22345678,\n]}\n", - format!("{:?}", array) + format!("{array:?}") ); let mut builder = @@ -736,7 +736,7 @@ mod tests { let array = builder.finish(); assert_eq!( "DictionaryArray {keys: PrimitiveArray\n[\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n] values: PrimitiveArray\n[\n 1,\n]}\n", - format!("{:?}", array) + format!("{array:?}") ); } @@ -749,13 +749,13 @@ mod tests { .collect(); assert_eq!( "DictionaryArray {keys: PrimitiveArray\n[\n 0,\n 0,\n null,\n 1,\n] values: StringArray\n[\n \"a\",\n \"c\",\n]}\n", - format!("{:?}", array) + format!("{array:?}") ); let array: DictionaryArray = test.into_iter().collect(); assert_eq!( "DictionaryArray {keys: PrimitiveArray\n[\n 0,\n 0,\n 1,\n 2,\n] values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", - format!("{:?}", array) + format!("{array:?}") ); } @@ -900,7 +900,7 @@ mod tests { assert_eq!( "DictionaryArray {keys: PrimitiveArray\n[\n 0,\n 2,\n null,\n 1,\n] values: StringArray\n[\n \"foo\",\n \"bar\",\n \"baz\",\n]}\n", - format!("{:?}", array) + format!("{array:?}") ); } diff --git a/arrow-array/src/array/fixed_size_binary_array.rs 
b/arrow-array/src/array/fixed_size_binary_array.rs index 0d63fdded136..9debea08d321 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -629,7 +629,7 @@ mod tests { let arr = FixedSizeBinaryArray::from(array_data); assert_eq!( "FixedSizeBinaryArray<5>\n[\n [104, 101, 108, 108, 111],\n [116, 104, 101, 114, 101],\n [97, 114, 114, 111, 119],\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index e9ceb556c642..67a20d142eb5 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -173,8 +173,7 @@ impl From for FixedSizeListArray { assert_eq!( values.len() % *len as usize, 0, - "FixedSizeListArray child array length should be a multiple of {}", - len + "FixedSizeListArray child array length should be a multiple of {len}" ); } diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 6c49fc7fc70f..b378549ebf20 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -289,7 +289,7 @@ impl std::fmt::Debug for GenericListArray std::fmt::Result { let prefix = OffsetSize::PREFIX; - write!(f, "{}ListArray\n[\n", prefix)?; + write!(f, "{prefix}ListArray\n[\n")?; print_long_array(self, f, |array, index, f| { std::fmt::Debug::fmt(&array.value(index), f) })?; diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 69f6ba4d8de1..e953781e5c98 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -580,7 +580,7 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::UInt64 => { Arc::new(DictionaryArray::::from(data)) as ArrayRef } - dt => panic!("Unexpected dictionary key type {:?}", dt), + dt => panic!("Unexpected dictionary key type {dt:?}"), }, DataType::RunEndEncoded(ref run_ends_type, _) => { match run_ends_type.data_type() { @@ -593,13 +593,13 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::Int64 => { Arc::new(RunArray::::from(data)) as ArrayRef } - dt => panic!("Unexpected data type for run_ends array {:?}", dt), + dt => panic!("Unexpected data type for run_ends array {dt:?}"), } } DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef, DataType::Decimal128(_, _) => Arc::new(Decimal128Array::from(data)) as ArrayRef, DataType::Decimal256(_, _) => Arc::new(Decimal256Array::from(data)) as ArrayRef, - dt => panic!("Unexpected data type {:?}", dt), + dt => panic!("Unexpected data type {dt:?}"), } } diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index a5ba953c2201..6b68aace706f 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -144,6 +144,6 @@ mod tests { #[test] fn test_debug_null_array() { let array = NullArray::new(1024 * 1024); - assert_eq!(format!("{:?}", array), "NullArray(1048576)"); + assert_eq!(format!("{array:?}"), "NullArray(1048576)"); } } diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 01eda724ba47..a757eb7dd4c1 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -765,19 +765,19 @@ where impl std::fmt::Debug for PrimitiveArray { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let data_type = self.data_type(); - write!(f, "PrimitiveArray<{:?}>\n[\n", data_type)?; + write!(f, 
"PrimitiveArray<{data_type:?}>\n[\n")?; print_long_array(self, f, |array, index, f| match data_type { DataType::Date32 | DataType::Date64 => { let v = self.value(index).to_isize().unwrap() as i64; match as_date::(v) { - Some(date) => write!(f, "{:?}", date), + Some(date) => write!(f, "{date:?}"), None => write!(f, "null"), } } DataType::Time32(_) | DataType::Time64(_) => { let v = self.value(index).to_isize().unwrap() as i64; match as_time::(v) { - Some(time) => write!(f, "{:?}", time), + Some(time) => write!(f, "{time:?}"), None => write!(f, "null"), } } @@ -796,8 +796,7 @@ impl std::fmt::Debug for PrimitiveArray { Err(_) => match as_datetime::(v) { Some(datetime) => write!( f, - "{:?} (Unknown Time Zone '{}')", - datetime, tz_string + "{datetime:?} (Unknown Time Zone '{tz_string}')" ), None => write!(f, "null"), }, @@ -805,7 +804,7 @@ impl std::fmt::Debug for PrimitiveArray { } // for Timestamp without TimeZone None => match as_datetime::(v) { - Some(datetime) => write!(f, "{:?}", datetime), + Some(datetime) => write!(f, "{datetime:?}"), None => write!(f, "null"), }, } @@ -1136,8 +1135,7 @@ impl PrimitiveArray { } if scale > 0 && scale as u8 > precision { return Err(ArrowError::InvalidArgumentError(format!( - "scale {} is greater than precision {}", - scale, precision + "scale {scale} is greater than precision {precision}" ))); } @@ -1546,7 +1544,7 @@ mod tests { let arr = Int32Array::from(vec![0, 1, 2, 3, 4]); assert_eq!( "PrimitiveArray\n[\n 0,\n 1,\n 2,\n 3,\n 4,\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } @@ -1558,13 +1556,13 @@ mod tests { "PrimitiveArray\n[\n{}\n]", values .iter() - .map(|v| { format!(" {},", v) }) + .map(|v| { format!(" {v},") }) .collect::>() .join("\n") ); let array = Int16Array::from(values); - assert_eq!(array_expected, format!("{:?}", array)); + assert_eq!(array_expected, format!("{array:?}")); }) } @@ -1577,7 +1575,7 @@ mod tests { let arr = builder.finish(); assert_eq!( "PrimitiveArray\n[\n 0,\n 1,\n null,\n 3,\n 4,\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } @@ -1591,7 +1589,7 @@ mod tests { ]); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T00:00:00,\n 2018-12-31T00:00:00,\n 1921-01-02T00:00:00,\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } @@ -1606,7 +1604,7 @@ mod tests { .with_timezone_utc(); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T00:00:00+00:00,\n 2018-12-31T00:00:00+00:00,\n 1921-01-02T00:00:00+00:00,\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } @@ -1637,11 +1635,11 @@ mod tests { ]) .with_timezone("Asia/Taipei".to_string()); - println!("{:?}", arr); + println!("{arr:?}"); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T00:00:00 (Unknown Time Zone 'Asia/Taipei'),\n 2018-12-31T00:00:00 (Unknown Time Zone 'Asia/Taipei'),\n 1921-01-02T00:00:00 (Unknown Time Zone 'Asia/Taipei'),\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } @@ -1656,7 +1654,7 @@ mod tests { .with_timezone("+08:00".to_string()); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T08:00:00+08:00,\n 2018-12-31T08:00:00+08:00,\n 1921-01-02T08:00:00+08:00,\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } @@ -1671,7 +1669,7 @@ mod tests { .with_timezone("xxx".to_string()); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T00:00:00 (Unknown Time Zone 'xxx'),\n 2018-12-31T00:00:00 (Unknown Time Zone 'xxx'),\n 1921-01-02T00:00:00 (Unknown Time Zone 'xxx'),\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } @@ -1697,7 +1695,7 @@ mod tests { let arr: PrimitiveArray = vec![12356, 13548, -365].into(); assert_eq!( "PrimitiveArray\n[\n 2003-10-31,\n 2007-02-04,\n 
1969-01-01,\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } @@ -1706,7 +1704,7 @@ mod tests { let arr: PrimitiveArray = vec![7201, 60054].into(); assert_eq!( "PrimitiveArray\n[\n 02:00:01,\n 16:40:54,\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } @@ -1716,7 +1714,7 @@ mod tests { let arr: PrimitiveArray = vec![-7201, -60054].into(); assert_eq!( "PrimitiveArray\n[\n null,\n null,\n]", - format!("{:?}", arr) + format!("{arr:?}") ) } @@ -1727,7 +1725,7 @@ mod tests { vec![9065525203050843594].into(); assert_eq!( "PrimitiveArray\n[\n null,\n]", - format!("{:?}", arr) + format!("{arr:?}") ) } diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 0e39cd288340..48c4896b695c 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -320,7 +320,7 @@ mod tests { let array = builder.finish(); assert_eq!( "RunArray {run_ends: PrimitiveArray\n[\n 1,\n 2,\n 3,\n], values: PrimitiveArray\n[\n 12345678,\n null,\n 22345678,\n]}\n", - format!("{:?}", array) + format!("{array:?}") ); let mut builder = PrimitiveRunBuilder::::with_capacity(20); @@ -334,7 +334,7 @@ mod tests { assert_eq!( "RunArray {run_ends: PrimitiveArray\n[\n 20,\n], values: PrimitiveArray\n[\n 1,\n]}\n", - format!("{:?}", array) + format!("{array:?}") ); } @@ -347,7 +347,7 @@ mod tests { .collect(); assert_eq!( "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", - format!("{:?}", array) + format!("{array:?}") ); assert_eq!(array.len(), 4); @@ -356,7 +356,7 @@ mod tests { let array: RunArray = test.into_iter().collect(); assert_eq!( "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", - format!("{:?}", array) + format!("{array:?}") ); } @@ -428,7 +428,7 @@ mod tests { assert_eq!( "RunArray {run_ends: PrimitiveArray\n[\n 1,\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"foo\",\n \"bar\",\n null,\n \"baz\",\n]}\n", - format!("{:?}", array) + format!("{array:?}") ); } diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 14db338825dd..cb401540d292 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -363,7 +363,7 @@ mod tests { let arr: StringArray = vec!["hello", "arrow"].into(); assert_eq!( "StringArray\n[\n \"hello\",\n \"arrow\",\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } @@ -372,7 +372,7 @@ mod tests { let arr: LargeStringArray = vec!["hello", "arrow"].into(); assert_eq!( "LargeStringArray\n[\n \"hello\",\n \"arrow\",\n]", - format!("{:?}", arr) + format!("{arr:?}") ); } @@ -420,7 +420,7 @@ mod tests { .scan(0usize, |pos, i| { if *pos < 10 { *pos += 1; - Some(Some(format!("value {}", i))) + Some(Some(format!("value {i}"))) } else { // actually returns up to 10 values None diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index dc949c8e4269..9149895f6ec9 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -143,8 +143,8 @@ impl TryFrom> for StructArray { if let Some(len) = len { if len != child_datum_len { return Err(ArrowError::InvalidArgumentError( - format!("Array of field \"{}\" has length {}, but previous elements have length {}. - All arrays in every entry in a struct array must have the same length.", field_name, child_datum_len, len) + format!("Array of field \"{field_name}\" has length {child_datum_len}, but previous elements have length {len}. 
+ All arrays in every entry in a struct array must have the same length.") )); } } else { diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 092f538bf459..5870952d7f75 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -192,8 +192,7 @@ impl UnionArray { if !invalid_type_ids.is_empty() { return Err(ArrowError::InvalidArgumentError(format!( "Type Ids must be positive and cannot be greater than the number of \ - child arrays, found:\n{:?}", - invalid_type_ids + child arrays, found:\n{invalid_type_ids:?}" ))); } @@ -208,8 +207,7 @@ impl UnionArray { if !invalid_offsets.is_empty() { return Err(ArrowError::InvalidArgumentError(format!( "Offsets must be positive and within the length of the Array, \ - found:\n{:?}", - invalid_offsets + found:\n{invalid_offsets:?}" ))); } } @@ -345,7 +343,7 @@ impl std::fmt::Debug for UnionArray { } else { "UnionArray(Sparse)\n[" }; - writeln!(f, "{}", header)?; + writeln!(f, "{header}")?; writeln!(f, "-- type id buffer:")?; writeln!(f, "{:?}", self.data().buffers()[0])?; diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index 06709e5f375d..eeb39b802948 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -247,7 +247,7 @@ mod tests { for i in 0..10 { assert!(!arr.is_null(i)); assert!(arr.is_valid(i)); - assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {}", i) + assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {i}") } } diff --git a/arrow-array/src/builder/fixed_size_binary_builder.rs b/arrow-array/src/builder/fixed_size_binary_builder.rs index 4c8225adf153..695b553f0eee 100644 --- a/arrow-array/src/builder/fixed_size_binary_builder.rs +++ b/arrow-array/src/builder/fixed_size_binary_builder.rs @@ -58,8 +58,7 @@ impl FixedSizeBinaryBuilder { pub fn with_capacity(capacity: usize, byte_width: i32) -> Self { assert!( byte_width >= 0, - "value length ({}) of the array must >= 0", - byte_width + "value length ({byte_width}) of the array must >= 0" ); Self { values_builder: UInt8BufferBuilder::new(capacity * byte_width as usize), diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 2d88ea50f257..71671fe7db53 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -455,7 +455,7 @@ mod tests { for i in 0..10 { assert!(!arr.is_null(i)); assert!(arr.is_valid(i)); - assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {}", i) + assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {i}") } } diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index ecf9ca4ffea7..72aa53e189dd 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -174,7 +174,7 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { Box::new(StructBuilder::from_fields(fields.clone(), capacity)) } - t => panic!("Data type {:?} is not currently supported", t), + t => panic!("Data type {t:?} is not currently supported"), } } diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 72b567f75a80..035efb4f0f2c 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -192,10 +192,7 @@ impl RecordBatch { if let Some((i, (col_type, field_type))) = not_match { return 
Err(ArrowError::InvalidArgumentError(format!( - "column types must match schema types, expected {:?} but found {:?} at column index {}", - field_type, - col_type, - i))); + "column types must match schema types, expected {field_type:?} but found {col_type:?} at column index {i}"))); } Ok(RecordBatch { diff --git a/arrow-array/src/timezone.rs b/arrow-array/src/timezone.rs index fd8c099c2091..3af76c3dafb7 100644 --- a/arrow-array/src/timezone.rs +++ b/arrow-array/src/timezone.rs @@ -39,8 +39,7 @@ fn parse_fixed_offset(tz: &str) -> Result { } Err(ArrowError::ParseError(format!( - "Invalid timezone \"{}\": Expected format [+-]XX:XX, [+-]XX, or [+-]XXXX", - tz + "Invalid timezone \"{tz}\": Expected format [+-]XX:XX, [+-]XX, or [+-]XXXX" ))) } @@ -88,7 +87,7 @@ mod private { Ok(Self(TzInner::Offset(parse_fixed_offset(tz)?))) } else { Ok(Self(TzInner::Timezone(tz.parse().map_err(|e| { - ArrowError::ParseError(format!("Invalid timezone \"{}\": {}", tz, e)) + ArrowError::ParseError(format!("Invalid timezone \"{tz}\": {e}")) })?))) } } @@ -266,8 +265,7 @@ mod private { Ok(Self(parse_fixed_offset(tz)?)) } else { Err(ArrowError::ParseError(format!( - "Invalid timezone \"{}\": only offset based timezones supported without chrono-tz feature", - tz + "Invalid timezone \"{tz}\": only offset based timezones supported without chrono-tz feature" ))) } } diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index fc02c0e5a3dc..641d4c2fc157 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -688,11 +688,11 @@ fn format_decimal_str(value_str: &str, precision: usize, scale: i8) -> String { value_str.to_string() } else if scale < 0 { let padding = value_str.len() + scale.unsigned_abs() as usize; - format!("{:0 scale as usize { // Decimal separator is in the middle of the string let (whole, decimal) = value_str.split_at(value_str.len() - scale as usize); - format!("{}.{}", whole, decimal) + format!("{whole}.{decimal}") } else { // String has to be padded format!("{}0.{:0>width$}", sign, rest, width = scale as usize) diff --git a/arrow-buffer/src/alloc/mod.rs b/arrow-buffer/src/alloc/mod.rs index a7ce80600462..1493d839f5ab 100644 --- a/arrow-buffer/src/alloc/mod.rs +++ b/arrow-buffer/src/alloc/mod.rs @@ -144,7 +144,7 @@ impl Debug for Deallocation { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { match self { Deallocation::Arrow(capacity) => { - write!(f, "Deallocation::Arrow {{ capacity: {} }}", capacity) + write!(f, "Deallocation::Arrow {{ capacity: {capacity} }}") } Deallocation::Custom(_) => { write!(f, "Deallocation::Custom {{ capacity: unknown }}") diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index c3a05ba061db..0d404df169e1 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -29,7 +29,7 @@ pub struct i256 { impl std::fmt::Debug for i256 { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self) + write!(f, "{self}") } } @@ -580,7 +580,7 @@ mod tests { for v in vals { let (t, overflow) = i256::from_bigint_with_overflow(v.clone()); assert!(!overflow); - assert_eq!(t.to_i128(), v.to_i128(), "{} vs {}", v, t); + assert_eq!(t.to_i128(), v.to_i128(), "{v} vs {t}"); } } @@ -590,7 +590,7 @@ mod tests { let br = BigInt::from_signed_bytes_le(&ir.to_le_bytes()); // Comparison - assert_eq!(il.cmp(&ir), bl.cmp(&br), "{} cmp {}", bl, br); + assert_eq!(il.cmp(&ir), bl.cmp(&br), "{bl} cmp {br}"); // Conversions assert_eq!(i256::from_le_bytes(il.to_le_bytes()), il); @@ -599,8 +599,8 @@ mod tests { 
assert_eq!(i256::from_be_bytes(ir.to_be_bytes()), ir); // To i128 - assert_eq!(il.to_i128(), bl.to_i128(), "{}", bl); - assert_eq!(ir.to_i128(), br.to_i128(), "{}", br); + assert_eq!(il.to_i128(), bl.to_i128(), "{bl}"); + assert_eq!(ir.to_i128(), br.to_i128(), "{br}"); // Absolute value let (abs, overflow) = i256::from_bigint_with_overflow(bl.abs()); @@ -655,24 +655,12 @@ mod tests { match overflow { true => assert!( checked.is_none(), - "{} * {} = {} vs {} * {} = {}", - il, - ir, - actual, - bl, - br, - expected + "{il} * {ir} = {actual} vs {bl} * {br} = {expected}" ), false => assert_eq!( checked.unwrap(), actual, - "{} * {} = {} vs {} * {} = {}", - il, - ir, - actual, - bl, - br, - expected + "{il} * {ir} = {actual} vs {bl} * {br} = {expected}" ), } @@ -687,24 +675,12 @@ mod tests { match overflow { true => assert!( checked.is_none(), - "{} ^ {} = {} vs {} * {} = {}", - il, - exp, - actual, - bl, - exp, - expected + "{il} ^ {exp} = {actual} vs {bl} * {exp} = {expected}" ), false => assert_eq!( checked.unwrap(), actual, - "{} ^ {} = {} vs {} * {} = {}", - il, - exp, - actual, - bl, - exp, - expected + "{il} ^ {exp} = {actual} vs {bl} * {exp} = {expected}" ), } } diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index a9f2df3d9ff3..124f3f6f5894 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -58,8 +58,7 @@ impl ScalarBuffer { let (prefix, offsets, suffix) = unsafe { bytes.align_to::() }; assert!( prefix.is_empty() && suffix.is_empty(), - "buffer is not aligned to {} byte boundary", - size + "buffer is not aligned to {size} byte boundary" ); let ptr = offsets.as_ptr(); diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index c60e660378aa..aec665aa3013 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -654,8 +654,7 @@ pub fn cast_with_options( UInt32 => dictionary_cast::(array, to_type, cast_options), UInt64 => dictionary_cast::(array, to_type, cast_options), _ => Err(ArrowError::CastError(format!( - "Casting from dictionary type {:?} to {:?} not supported", - from_type, to_type, + "Casting from dictionary type {from_type:?} to {to_type:?} not supported", ))), }, (_, Dictionary(index_type, value_type)) => match **index_type { @@ -668,8 +667,7 @@ pub fn cast_with_options( UInt32 => cast_to_dictionary::(array, value_type, cast_options), UInt64 => cast_to_dictionary::(array, value_type, cast_options), _ => Err(ArrowError::CastError(format!( - "Casting from type {:?} to dictionary type {:?} not supported", - from_type, to_type, + "Casting from type {from_type:?} to dictionary type {to_type:?} not supported", ))), }, (List(_), List(ref to)) => { @@ -808,8 +806,7 @@ pub fn cast_with_options( } Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type + "Casting from {from_type:?} to {to_type:?} not supported" ))), } } @@ -876,8 +873,7 @@ pub fn cast_with_options( } Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type + "Casting from {from_type:?} to {to_type:?} not supported" ))), } } @@ -966,8 +962,7 @@ pub fn cast_with_options( ), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type + "Casting from {from_type:?} to {to_type:?} not supported" ))), } } @@ -1056,8 +1051,7 @@ pub fn 
cast_with_options( ), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type + "Casting from {from_type:?} to {to_type:?} not supported" ))), } } @@ -1082,8 +1076,7 @@ pub fn cast_with_options( Utf8 => cast_utf8_to_boolean::(array, cast_options), LargeUtf8 => cast_utf8_to_boolean::(array, cast_options), _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, + "Casting from {from_type:?} to {to_type:?} not supported", ))), }, (Boolean, _) => match to_type { @@ -1117,8 +1110,7 @@ pub fn cast_with_options( )) } _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, + "Casting from {from_type:?} to {to_type:?} not supported", ))), }, (Utf8, _) => match to_type { @@ -1156,8 +1148,7 @@ pub fn cast_with_options( cast_string_to_timestamp_ns::(&**array, cast_options) } _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, + "Casting from {from_type:?} to {to_type:?} not supported", ))), }, (_, Utf8) => match from_type { @@ -1214,8 +1205,7 @@ pub fn cast_with_options( )) } _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, + "Casting from {from_type:?} to {to_type:?} not supported", ))), }, (_, LargeUtf8) => match from_type { @@ -1271,8 +1261,7 @@ pub fn cast_with_options( )) } _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, + "Casting from {from_type:?} to {to_type:?} not supported", ))), }, (LargeUtf8, _) => match to_type { @@ -1312,8 +1301,7 @@ pub fn cast_with_options( cast_string_to_timestamp_ns::(&**array, cast_options) } _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, + "Casting from {from_type:?} to {to_type:?} not supported", ))), }, (Binary, _) => match to_type { @@ -1321,15 +1309,13 @@ pub fn cast_with_options( cast_byte_container::(&**array) } _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, + "Casting from {from_type:?} to {to_type:?} not supported", ))), }, (LargeBinary, _) => match to_type { Binary => cast_byte_container::(&**array), _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, + "Casting from {from_type:?} to {to_type:?} not supported", ))), }, // start numeric casts @@ -2041,8 +2027,7 @@ pub fn cast_with_options( cast_reinterpret_arrays::(array) } (_, _) => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, + "Casting from {from_type:?} to {to_type:?} not supported", ))), } } @@ -2872,8 +2857,7 @@ where invalid_value => match cast_options.safe { true => Ok(None), false => Err(ArrowError::CastError(format!( - "Cannot cast value '{}' to value of Boolean type", - invalid_value, + "Cannot cast value '{invalid_value}' to value of Boolean type", ))), }, }, @@ -2897,8 +2881,7 @@ where let parts: Vec<&str> = value_str.split('.').collect(); if parts.len() > 2 { return Err(ArrowError::InvalidArgumentError(format!( - "Invalid decimal format: {:?}", - value_str + "Invalid decimal format: {value_str:?}" ))); } @@ -2909,8 +2892,7 @@ where let number_decimals = if decimals.len() > scale { let decimal_number = i256::from_string(decimals).ok_or_else(|| { ArrowError::InvalidArgumentError(format!( - "Cannot 
parse decimal format: {}", - value_str + "Cannot parse decimal format: {value_str}" )) })?; @@ -2934,8 +2916,7 @@ where i256::from_string(integers) .ok_or_else(|| { ArrowError::InvalidArgumentError(format!( - "Cannot parse decimal format: {}", - value_str + "Cannot parse decimal format: {value_str}" )) }) .map(|v| { @@ -2949,8 +2930,8 @@ where } else { let padding = if scale > decimals.len() { scale } else { 0 }; - let decimals = format!("{:0( UInt64 => Arc::new(DictionaryArray::::from(data)), _ => { return Err(ArrowError::CastError(format!( - "Unsupported type {:?} for dictionary index", - to_index_type + "Unsupported type {to_index_type:?} for dictionary index" ))); } }; @@ -3323,8 +3302,7 @@ fn cast_to_dictionary( pack_byte_to_dictionary::>(array, cast_options) } _ => Err(ArrowError::CastError(format!( - "Unsupported output type for dictionary packing: {:?}", - dict_value_type + "Unsupported output type for dictionary packing: {dict_value_type:?}" ))), } } @@ -4797,8 +4775,7 @@ mod tests { e.to_string().contains( "Cast error: Cannot cast string 'seven' to value of Int32 type", ), - "Error: {}", - e + "Error: {e}" ) } } @@ -6689,13 +6666,11 @@ mod tests { let cast_type = Dictionary(Box::new(Int8), Box::new(Utf8)); let res = cast(&array, &cast_type); assert!(res.is_err()); - let actual_error = format!("{:?}", res); + let actual_error = format!("{res:?}"); let expected_error = "Could not convert 72 dictionary indexes from Int32 to Int8"; assert!( actual_error.contains(expected_error), - "did not find expected error '{}' in actual error '{}'", - actual_error, - expected_error + "did not find expected error '{actual_error}' in actual error '{expected_error}'" ); } @@ -6711,7 +6686,7 @@ mod tests { // dictionary indexed by int32, but not a dictionary indexed // with int8) for i in 0..200 { - let val = format!("val{}", i); + let val = format!("val{i}"); builder.append(&val).unwrap(); } let array: ArrayRef = Arc::new(builder.finish()); @@ -6719,13 +6694,11 @@ mod tests { let cast_type = Dictionary(Box::new(Int8), Box::new(Utf8)); let res = cast(&array, &cast_type); assert!(res.is_err()); - let actual_error = format!("{:?}", res); + let actual_error = format!("{res:?}"); let expected_error = "Could not convert 72 dictionary indexes from Int32 to Int8"; assert!( actual_error.contains(expected_error), - "did not find expected error '{}' in actual error '{}'", - actual_error, - expected_error + "did not find expected error '{actual_error}' in actual error '{expected_error}'" ); } @@ -7265,9 +7238,7 @@ mod tests { let expected_error = "Cast error: Cannot cast to Decimal128(38, 30)"; assert!( err.contains(expected_error), - "did not find expected error '{}' in actual error '{}'", - expected_error, - err + "did not find expected error '{expected_error}' in actual error '{err}'" ); } @@ -7292,9 +7263,7 @@ mod tests { let expected_error = "Cast error: Cannot cast to Decimal256(76, 50)"; assert!( err.contains(expected_error), - "did not find expected error '{}' in actual error '{}'", - expected_error, - err + "did not find expected error '{expected_error}' in actual error '{err}'" ); } diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index e603260b072c..16fbfb0bbce5 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -543,8 +543,7 @@ fn union_to_string( let type_id = list.type_id(row); let field_idx = type_ids.iter().position(|t| t == &type_id).ok_or_else(|| { ArrowError::InvalidArgumentError(format!( - "Repl error: could not get field name for type id: {} in union 
array.", - type_id, + "Repl error: could not get field name for type id: {type_id} in union array.", )) })?; let name = fields.get(field_idx).unwrap().name(); @@ -557,7 +556,7 @@ fn union_to_string( }, )?; - Ok(format!("{{{}={}}}", name, value)) + Ok(format!("{{{name}={value}}}")) } /// Converts the value of the dictionary array at `row` to a String fn dict_array_value_to_string( diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 8cf6b4ea7e01..459b94f37dc8 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -136,8 +136,7 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { // match. Ths any of the specific error messages is likely to be // be more confusing than helpful Err(ArrowError::CastError(format!( - "Error parsing '{}' as timestamp", - s + "Error parsing '{s}' as timestamp" ))) } @@ -241,7 +240,7 @@ pub fn string_to_time_nanoseconds(s: &str) -> Result { nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64 }) // Return generic error if failed to parse as unknown which format user intended for the string - .ok_or_else(|| ArrowError::CastError(format!("Error parsing '{}' as time", s))) + .ok_or_else(|| ArrowError::CastError(format!("Error parsing '{s}' as time"))) } /// Specialized parsing implementations @@ -550,7 +549,7 @@ mod tests { fn parse_timestamp(s: &str) -> Result { let result = string_to_timestamp_nanos(s); if let Err(e) = &result { - eprintln!("Error parsing timestamp '{}': {:?}", s, e); + eprintln!("Error parsing timestamp '{s}': {e:?}"); } result } @@ -558,13 +557,11 @@ mod tests { fn expect_timestamp_parse_error(s: &str, expected_err: &str) { match string_to_timestamp_nanos(s) { Ok(v) => panic!( - "Expected error '{}' while parsing '{}', but parsed {} instead", - expected_err, s, v + "Expected error '{expected_err}' while parsing '{s}', but parsed {v} instead" ), Err(e) => { assert!(e.to_string().contains(expected_err), - "Can not find expected error '{}' while parsing '{}'. Actual error '{}'", - expected_err, s, e); + "Can not find expected error '{expected_err}' while parsing '{s}'. Actual error '{e}'"); } } } diff --git a/arrow-csv/src/lib.rs b/arrow-csv/src/lib.rs index a45cf082d714..e6dc69935199 100644 --- a/arrow-csv/src/lib.rs +++ b/arrow-csv/src/lib.rs @@ -31,15 +31,13 @@ fn map_csv_error(error: csv::Error) -> ArrowError { match error.kind() { csv::ErrorKind::Io(error) => ArrowError::CsvError(error.to_string()), csv::ErrorKind::Utf8 { pos: _, err } => ArrowError::CsvError(format!( - "Encountered UTF-8 error while reading CSV file: {}", - err + "Encountered UTF-8 error while reading CSV file: {err}" )), csv::ErrorKind::UnequalLengths { expected_len, len, .. } => ArrowError::CsvError(format!( - "Encountered unequal lengths between records on CSV file. Expected {} \ - records, found {} records", - len, expected_len + "Encountered unequal lengths between records on CSV file. 
Expected {len} \ + records, found {expected_len} records" )), _ => ArrowError::CsvError("Error reading CSV file".to_string()), } diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 0c7bfa897fd4..82b033f8086a 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -148,7 +148,7 @@ fn infer_file_schema_with_csv_options( mut reader: R, roptions: ReaderOptions, ) -> Result<(Schema, usize), ArrowError> { - let saved_offset = reader.seek(SeekFrom::Current(0))?; + let saved_offset = reader.stream_position()?; let (schema, records_count) = infer_reader_schema_with_csv_options(&mut reader, roptions)?; @@ -626,14 +626,12 @@ fn parse( .collect::>(), ) as ArrayRef), _ => Err(ArrowError::ParseError(format!( - "Unsupported dictionary key type {:?}", - key_type + "Unsupported dictionary key type {key_type:?}" ))), } } other => Err(ArrowError::ParseError(format!( - "Unsupported data type {:?}", - other + "Unsupported data type {other:?}" ))), } }) @@ -765,14 +763,12 @@ fn parse_decimal_with_parameter( match validate_decimal_precision(result, precision) { Ok(_) => Ok(result), Err(e) => Err(ArrowError::ParseError(format!( - "parse decimal overflow: {}", - e + "parse decimal overflow: {e}" ))), } } else { Err(ArrowError::ParseError(format!( - "can't parse the string value {} to decimal", - s + "can't parse the string value {s} to decimal" ))) } } @@ -816,8 +812,7 @@ fn parse_decimal(s: &str) -> Result { } } else { Err(ArrowError::ParseError(format!( - "can't parse the string value {} to decimal", - s + "can't parse the string value {s} to decimal" ))) } } @@ -1542,7 +1537,7 @@ mod tests { Some(e) => match e { Err(e) => assert_eq!( "ParseError(\"Error while parsing value 4.x4 for column 1 at line 4\")", - format!("{:?}", e) + format!("{e:?}") ), Ok(_) => panic!("should have failed"), }, @@ -1690,10 +1685,7 @@ mod tests { for s in can_not_parse_tests { let result = parse_decimal_with_parameter(s, 20, 3); assert_eq!( - format!( - "Parser error: can't parse the string value {} to decimal", - s - ), + format!("Parser error: can't parse the string value {s} to decimal"), result.unwrap_err().to_string() ); } @@ -1705,9 +1697,7 @@ mod tests { assert!( actual.contains(expected), - "actual: '{}', expected: '{}'", - actual, - expected + "actual: '{actual}', expected: '{expected}'" ); } } @@ -1960,10 +1950,10 @@ mod tests { let mut csv_text = Vec::new(); let mut csv_writer = std::io::Cursor::new(&mut csv_text); for index in 0..10 { - let text1 = format!("id{:}", index); - let text2 = format!("value{:}", index); + let text1 = format!("id{index:}"); + let text2 = format!("value{index:}"); csv_writer - .write_fmt(format_args!("~{}~,~{}~\r\n", text1, text2)) + .write_fmt(format_args!("~{text1}~,~{text2}~\r\n")) .unwrap(); } let mut csv_reader = std::io::Cursor::new(&csv_text); @@ -1993,10 +1983,10 @@ mod tests { let mut csv_text = Vec::new(); let mut csv_writer = std::io::Cursor::new(&mut csv_text); for index in 0..10 { - let text1 = format!("id{:}", index); - let text2 = format!("value\\\"{:}", index); + let text1 = format!("id{index:}"); + let text2 = format!("value\\\"{index:}"); csv_writer - .write_fmt(format_args!("\"{}\",\"{}\"\r\n", text1, text2)) + .write_fmt(format_args!("\"{text1}\",\"{text2}\"\r\n")) .unwrap(); } let mut csv_reader = std::io::Cursor::new(&csv_text); @@ -2026,10 +2016,10 @@ mod tests { let mut csv_text = Vec::new(); let mut csv_writer = std::io::Cursor::new(&mut csv_text); for index in 0..10 { - let text1 = format!("id{:}", index); - let text2 = 
format!("value{:}", index); + let text1 = format!("id{index:}"); + let text2 = format!("value{index:}"); csv_writer - .write_fmt(format_args!("\"{}\",\"{}\"\n", text1, text2)) + .write_fmt(format_args!("\"{text1}\",\"{text2}\"\n")) .unwrap(); } let mut csv_reader = std::io::Cursor::new(&csv_text); @@ -2068,7 +2058,7 @@ mod tests { .next() .unwrap() .unwrap(); - assert_eq!(b.num_rows(), expected, "{}", idx); + assert_eq!(b.num_rows(), expected, "{idx}"); } } diff --git a/arrow-csv/src/reader/records.rs b/arrow-csv/src/reader/records.rs index 501da408815c..76adb719ec17 100644 --- a/arrow-csv/src/reader/records.rs +++ b/arrow-csv/src/reader/records.rs @@ -152,8 +152,7 @@ impl RecordReader { let read = self.fill_buf(to_skip.min(1024))?; if read == 0 { return Err(ArrowError::CsvError(format!( - "Failed to skip {} rows only found {}", - to_skip, skipped + "Failed to skip {to_skip} rows only found {skipped}" ))); } @@ -175,7 +174,7 @@ impl RecordReader { // Need to truncate data to the actual amount of data read let data = std::str::from_utf8(&self.data[..last_offset]).map_err(|e| { - ArrowError::CsvError(format!("Encountered invalid UTF-8 data: {}", e)) + ArrowError::CsvError(format!("Encountered invalid UTF-8 data: {e}")) })?; Ok(StringRecords { diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 3ab28c2df816..bc11eef2fcf1 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -90,8 +90,7 @@ where fn invalid_cast_error(dt: &str, col_index: usize, row_index: usize) -> ArrowError { ArrowError::CastError(format!( - "Cannot cast to {} at col index: {} row index: {}", - dt, col_index, row_index + "Cannot cast to {dt} at col index: {col_index} row index: {row_index}" )) } @@ -264,8 +263,7 @@ impl Writer { // List and Struct arrays not supported by the writer, any // other type needs to be implemented return Err(ArrowError::CsvError(format!( - "CSV Writer does not support {:?} data type", - t + "CSV Writer does not support {t:?} data type" ))); } }; diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 07bbc664234a..709262e83464 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -854,8 +854,7 @@ impl ArrayData { // At the moment, constructing a DictionaryArray will also check this if !DataType::is_dictionary_key_type(key_type) { return Err(ArrowError::InvalidArgumentError(format!( - "Dictionary key type must be integer, but was {}", - key_type + "Dictionary key type must be integer, but was {key_type}" ))); } } @@ -1366,15 +1365,13 @@ impl ArrayData { // check if the offset can be converted to usize let r = x.to_usize().ok_or_else(|| { ArrowError::InvalidArgumentError(format!( - "Offset invariant failure: Could not convert offset {} to usize at position {}", - x, i))} + "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))} ); // check if the offset exceeds the limit match r { Ok(n) if n <= offset_limit => Ok((i, n)), Ok(_) => Err(ArrowError::InvalidArgumentError(format!( - "Offset invariant failure: offset at position {} out of bounds: {} > {}", - i, x, offset_limit)) + "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}")) ), Err(e) => Err(e), } @@ -1417,8 +1414,7 @@ impl ArrayData { || !values_str.is_char_boundary(range.end) { return Err(ArrowError::InvalidArgumentError(format!( - "incomplete utf-8 byte sequence from index {}", - string_index + "incomplete utf-8 byte sequence from index {string_index}" ))); } Ok(()) @@ -1431,8 +1427,7 @@ impl ArrayData { |string_index, 
range| { std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| { ArrowError::InvalidArgumentError(format!( - "Invalid UTF8 sequence at string index {} ({:?}): {}", - string_index, range, e + "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}" )) })?; Ok(()) @@ -1478,15 +1473,13 @@ impl ArrayData { } let dict_index: i64 = dict_index.try_into().map_err(|_| { ArrowError::InvalidArgumentError(format!( - "Value at position {} out of bounds: {} (can not convert to i64)", - i, dict_index + "Value at position {i} out of bounds: {dict_index} (can not convert to i64)" )) })?; if dict_index < 0 || dict_index > max_value { return Err(ArrowError::InvalidArgumentError(format!( - "Value at position {} out of bounds: {} (should be in [0, {}])", - i, dict_index, max_value + "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])" ))); } Ok(()) @@ -1503,23 +1496,17 @@ impl ArrayData { values.iter().enumerate().try_for_each(|(ix, &inp_value)| { let value: i64 = inp_value.try_into().map_err(|_| { ArrowError::InvalidArgumentError(format!( - "Value at position {} out of bounds: {} (can not convert to i64)", - ix, inp_value + "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)" )) })?; if value <= 0_i64 { return Err(ArrowError::InvalidArgumentError(format!( - "The values in run_ends array should be strictly positive. Found value {} at index {} that does not match the criteria.", - value, - ix + "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria." ))); } if ix > 0 && value <= prev_value { return Err(ArrowError::InvalidArgumentError(format!( - "The values in run_ends array should be strictly increasing. Found value {} at index {} with previous value {} that does not match the criteria.", - value, - ix, - prev_value + "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria." ))); } @@ -1529,9 +1516,7 @@ impl ArrayData { if prev_value.as_usize() != array_len { return Err(ArrowError::InvalidArgumentError(format!( - "The length of array does not match the last value in the run_ends array. The last value of run_ends array is {} and length of array is {}.", - prev_value, - array_len + "The length of array does not match the last value in the run_ends array. The last value of run_ends array is {prev_value} and length of array is {array_len}." ))); } Ok(()) diff --git a/arrow-data/src/decimal.rs b/arrow-data/src/decimal.rs index 9367d4ec2546..f74ab880d478 100644 --- a/arrow-data/src/decimal.rs +++ b/arrow-data/src/decimal.rs @@ -735,8 +735,7 @@ pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ pub fn validate_decimal_precision(value: i128, precision: u8) -> Result<(), ArrowError> { if precision > DECIMAL128_MAX_PRECISION { return Err(ArrowError::InvalidArgumentError(format!( - "Max precision of a Decimal128 is {}, but got {}", - DECIMAL128_MAX_PRECISION, precision, + "Max precision of a Decimal128 is {DECIMAL128_MAX_PRECISION}, but got {precision}", ))); } @@ -745,13 +744,11 @@ pub fn validate_decimal_precision(value: i128, precision: u8) -> Result<(), Arro if value > max { Err(ArrowError::InvalidArgumentError(format!( - "{} is too large to store in a Decimal128 of precision {}. Max is {}", - value, precision, max + "{value} is too large to store in a Decimal128 of precision {precision}. 
Max is {max}" ))) } else if value < min { Err(ArrowError::InvalidArgumentError(format!( - "{} is too small to store in a Decimal128 of precision {}. Min is {}", - value, precision, min + "{value} is too small to store in a Decimal128 of precision {precision}. Min is {min}" ))) } else { Ok(()) @@ -767,8 +764,7 @@ pub fn validate_decimal256_precision( ) -> Result<(), ArrowError> { if precision > DECIMAL256_MAX_PRECISION { return Err(ArrowError::InvalidArgumentError(format!( - "Max precision of a Decimal256 is {}, but got {}", - DECIMAL256_MAX_PRECISION, precision, + "Max precision of a Decimal256 is {DECIMAL256_MAX_PRECISION}, but got {precision}", ))); } let max = MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[usize::from(precision) - 1]; @@ -776,13 +772,11 @@ pub fn validate_decimal256_precision( if value > max { Err(ArrowError::InvalidArgumentError(format!( - "{:?} is too large to store in a Decimal256 of precision {}. Max is {:?}", - value, precision, max + "{value:?} is too large to store in a Decimal256 of precision {precision}. Max is {max:?}" ))) } else if value < min { Err(ArrowError::InvalidArgumentError(format!( - "{:?} is too small to store in a Decimal256 of precision {}. Min is {:?}", - value, precision, min + "{value:?} is too small to store in a Decimal256 of precision {precision}. Min is {min:?}" ))) } else { Ok(()) diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index 2a24b1cc2662..fef6d4be4985 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -381,7 +381,7 @@ impl<'a> MutableArrayData<'a> { array_capacity = *capacity; new_buffers(data_type, *capacity) } - _ => panic!("Capacities: {:?} not yet supported", capacities), + _ => panic!("Capacities: {capacities:?} not yet supported"), }; let child_data = match &data_type { diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 5aff347e48d1..28aef4e921a7 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -93,8 +93,7 @@ impl FlightSqlService for FlightSqlServiceImpl { .map_err(|e| status!("authorization not parsable", e))?; if !authorization.starts_with(basic) { Err(Status::invalid_argument(format!( - "Auth type not implemented: {}", - authorization + "Auth type not implemented: {authorization}" )))?; } let base64 = &authorization[basic.len()..]; @@ -473,7 +472,7 @@ async fn main() -> Result<(), Box> { let svc = FlightServiceServer::new(FlightSqlServiceImpl {}); - println!("Listening on {:?}", addr); + println!("Listening on {addr:?}"); let cert = std::fs::read_to_string("arrow-flight/examples/data/server.pem")?; let key = std::fs::read_to_string("arrow-flight/examples/data/server.key")?; diff --git a/arrow-flight/src/error.rs b/arrow-flight/src/error.rs index 7a43e537afc5..5524dd1a4654 100644 --- a/arrow-flight/src/error.rs +++ b/arrow-flight/src/error.rs @@ -50,7 +50,7 @@ impl FlightError { impl std::fmt::Display for FlightError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { // TODO better format / error - write!(f, "{:?}", self) + write!(f, "{self:?}") } } diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index a44b4b06e4c5..7aebd92e2ba2 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -174,7 +174,7 @@ impl fmt::Display for FlightData { write!(f, "FlightData {{")?; write!(f, " descriptor: ")?; match &self.flight_descriptor { - Some(d) => write!(f, "{}", d)?, + Some(d) => write!(f, "{d}")?, None => 
write!(f, "None")?, }; write!(f, ", header: ")?; @@ -200,7 +200,7 @@ impl fmt::Display for FlightDescriptor { write!(f, "path: [")?; let mut sep = ""; for element in &self.path { - write!(f, "{}{}", sep, element)?; + write!(f, "{sep}{element}")?; sep = ", "; } write!(f, "]")?; @@ -218,13 +218,13 @@ impl fmt::Display for FlightEndpoint { write!(f, "FlightEndpoint {{")?; write!(f, " ticket: ")?; match &self.ticket { - Some(value) => write!(f, "{}", value), + Some(value) => write!(f, "{value}"), None => write!(f, " none"), }?; write!(f, ", location: [")?; let mut sep = ""; for location in &self.location { - write!(f, "{}{}", sep, location)?; + write!(f, "{sep}{location}")?; sep = ", "; } write!(f, "]")?; @@ -237,16 +237,16 @@ impl fmt::Display for FlightInfo { let ipc_message = IpcMessage(self.schema.clone()); let schema: Schema = ipc_message.try_into().map_err(|_err| fmt::Error)?; write!(f, "FlightInfo {{")?; - write!(f, " schema: {}", schema)?; + write!(f, " schema: {schema}")?; write!(f, ", descriptor:")?; match &self.flight_descriptor { - Some(d) => write!(f, " {}", d), + Some(d) => write!(f, " {d}"), None => write!(f, " None"), }?; write!(f, ", endpoint: [")?; let mut sep = ""; for endpoint in &self.endpoint { - write!(f, "{}{}", sep, endpoint)?; + write!(f, "{sep}{endpoint}")?; sep = ", "; } write!(f, "], total_records: {}", self.total_records)?; @@ -339,8 +339,7 @@ impl TryFrom<&FlightData> for Schema { fn try_from(data: &FlightData) -> ArrowResult { convert::try_schema_from_flatbuffer_bytes(&data.data_header[..]).map_err(|err| { ArrowError::ParseError(format!( - "Unable to convert flight data to Arrow schema: {}", - err + "Unable to convert flight data to Arrow schema: {err}" )) }) } @@ -489,7 +488,7 @@ mod tests { fn it_accepts_equal_output() { let input = TestVector(vec![91; 10], 10); - let actual = format!("{}", input); + let actual = format!("{input}"); let expected = format!("{:?}", vec![91; 10]); assert_eq!(actual, expected); } @@ -498,7 +497,7 @@ mod tests { fn it_accepts_short_output() { let input = TestVector(vec![91; 6], 10); - let actual = format!("{}", input); + let actual = format!("{input}"); let expected = format!("{:?}", vec![91; 6]); assert_eq!(actual, expected); } @@ -507,7 +506,7 @@ mod tests { fn it_accepts_long_output() { let input = TestVector(vec![91; 10], 9); - let actual = format!("{}", input); + let actual = format!("{input}"); let expected = format!("{:?}", vec![91; 9]); assert_eq!(actual, expected); } diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index 5c5f84b3d15a..31ba1e274f88 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -90,7 +90,7 @@ impl FlightSqlServiceClient { host: &str, port: u16, ) -> Result { - let addr = format!("https://{}:{}", host, port); + let addr = format!("https://{host}:{port}"); let endpoint = Endpoint::new(addr) .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? 
@@ -112,7 +112,7 @@ impl FlightSqlServiceClient { .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))?; let channel = endpoint.connect().await.map_err(|e| { - ArrowError::IoError(format!("Cannot connect to endpoint: {}", e)) + ArrowError::IoError(format!("Cannot connect to endpoint: {e}")) })?; Ok(Self::new(channel)) } @@ -173,8 +173,8 @@ impl FlightSqlServiceClient { payload: Default::default(), }; let mut req = tonic::Request::new(stream::iter(vec![cmd])); - let val = BASE64_STANDARD.encode(format!("{}:{}", username, password)); - let val = format!("Basic {}", val) + let val = BASE64_STANDARD.encode(format!("{username}:{password}")); + let val = format!("Basic {val}") .parse() .map_err(|_| ArrowError::ParseError("Cannot parse header".to_string()))?; req.metadata_mut().insert("authorization", val); @@ -182,7 +182,7 @@ impl FlightSqlServiceClient { .flight_client .handshake(req) .await - .map_err(|e| ArrowError::IoError(format!("Can't handshake {}", e)))?; + .map_err(|e| ArrowError::IoError(format!("Can't handshake {e}")))?; if let Some(auth) = resp.metadata().get("authorization") { let auth = auth.to_str().map_err(|_| { ArrowError::ParseError("Can't read auth header".to_string()) @@ -331,7 +331,7 @@ impl FlightSqlServiceClient { }; let mut req = tonic::Request::new(action); if let Some(token) = &self.token { - let val = format!("Bearer {}", token).parse().map_err(|_| { + let val = format!("Bearer {token}").parse().map_err(|_| { ArrowError::IoError("Statement already closed.".to_string()) })?; req.metadata_mut().insert("authorization", val); @@ -481,7 +481,7 @@ fn decode_error_to_arrow_error(err: prost::DecodeError) -> ArrowError { } fn status_to_arrow_error(status: tonic::Status) -> ArrowError { - ArrowError::IoError(format!("{:?}", status)) + ArrowError::IoError(format!("{status:?}")) } // A polymorphic structure to natively represent different types of data contained in `FlightData` @@ -496,7 +496,7 @@ pub fn arrow_data_from_flight_data( arrow_schema_ref: &SchemaRef, ) -> Result { let ipc_message = root_as_message(&flight_data.data_header[..]).map_err(|err| { - ArrowError::ParseError(format!("Unable to get root as message: {:?}", err)) + ArrowError::ParseError(format!("Unable to get root as message: {err:?}")) })?; match ipc_message.header_type() { diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index 88dc6cde9800..9ea74c3f35bb 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -154,7 +154,7 @@ impl Any { return Ok(None); } let m = Message::decode(&*self.value).map_err(|err| { - ArrowError::ParseError(format!("Unable to decode Any value: {}", err)) + ArrowError::ParseError(format!("Unable to decode Any value: {err}")) })?; Ok(Some(m)) } diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index e764e0c51ac7..d48181189a56 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -586,9 +586,9 @@ where } fn decode_error_to_status(err: prost::DecodeError) -> Status { - Status::invalid_argument(format!("{:?}", err)) + Status::invalid_argument(format!("{err:?}")) } fn arrow_error_to_status(err: arrow_schema::ArrowError) -> Status { - Status::internal(format!("{:?}", err)) + Status::internal(format!("{err:?}")) } diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 266f8eb29241..ccf1e73866e1 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -85,7 +85,7 @@ pub fn flight_data_to_arrow_batch( ) -> Result { // check that the 
data_header is a record batch message let message = arrow_ipc::root_as_message(&data.data_header[..]).map_err(|err| { - ArrowError::ParseError(format!("Unable to get root as message: {:?}", err)) + ArrowError::ParseError(format!("Unable to get root as message: {err:?}")) })?; message diff --git a/arrow-flight/tests/client.rs b/arrow-flight/tests/client.rs index 032dad04923d..ab1cfa1fb053 100644 --- a/arrow-flight/tests/client.rs +++ b/arrow-flight/tests/client.rs @@ -92,10 +92,7 @@ fn ensure_metadata(client: &FlightClient, test_server: &TestFlightServer) { assert_eq!( metadata.get(k).as_ref(), Some(&v), - "Missing / Mismatched metadata {:?} sent {:?} got {:?}", - k, - client_metadata, - metadata + "Missing / Mismatched metadata {k:?} sent {client_metadata:?} got {metadata:?}" ); } } @@ -797,29 +794,23 @@ fn expect_status(error: FlightError, expected: Status) { let status = if let FlightError::Tonic(status) = error { status } else { - panic!("Expected FlightError::Tonic, got: {:?}", error); + panic!("Expected FlightError::Tonic, got: {error:?}"); }; assert_eq!( status.code(), expected.code(), - "Got {:?} want {:?}", - status, - expected + "Got {status:?} want {expected:?}" ); assert_eq!( status.message(), expected.message(), - "Got {:?} want {:?}", - status, - expected + "Got {status:?} want {expected:?}" ); assert_eq!( status.details(), expected.details(), - "Got {:?} want {:?}", - status, - expected + "Got {status:?} want {expected:?}" ); } diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index c2e326b4f2f3..ece64e16eb08 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -152,7 +152,7 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { }, Some(s) if s == "int" => match map.get("isSigned") { Some(&Value::Bool(true)) => match map.get("bitWidth") { - Some(&Value::Number(ref n)) => match n.as_u64() { + Some(Value::Number(n)) => match n.as_u64() { Some(8) => Ok(DataType::Int8), Some(16) => Ok(DataType::Int16), Some(32) => Ok(DataType::Int32), @@ -166,7 +166,7 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { )), }, Some(&Value::Bool(false)) => match map.get("bitWidth") { - Some(&Value::Number(ref n)) => match n.as_u64() { + Some(Value::Number(n)) => match n.as_u64() { Some(8) => Ok(DataType::UInt8), Some(16) => Ok(DataType::UInt16), Some(32) => Ok(DataType::UInt32), @@ -226,8 +226,7 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { UnionMode::Dense } else { return Err(ArrowError::ParseError(format!( - "Unknown union mode {:?} for union", - mode + "Unknown union mode {mode:?} for union" ))); }; if let Some(type_ids) = map.get("typeIds") { @@ -256,8 +255,7 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { } } Some(other) => Err(ArrowError::ParseError(format!( - "invalid or unsupported type name: {} in {:?}", - other, json + "invalid or unsupported type name: {other} in {json:?}" ))), None => Err(ArrowError::ParseError("type name missing".to_string())), }, diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index dd0519157f9c..abed0bd1d908 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -26,7 +26,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { match *json { Value::Object(ref map) => { let name = match map.get("name") { - Some(&Value::String(ref name)) => name.to_string(), + Some(Value::String(name)) => name.to_string(), _ => { return 
Err(ArrowError::ParseError( "Field missing 'name' attribute".to_string(), @@ -52,7 +52,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz let metadata = match map.get("metadata") { - Some(&Value::Array(ref values)) => { + Some(Value::Array(values)) => { let mut res: HashMap = HashMap::default(); for value in values { match value.as_object() { @@ -91,15 +91,14 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { } // We also support map format, because Schema's metadata supports this. // See https://github.com/apache/arrow/pull/5907 - Some(&Value::Object(ref values)) => { + Some(Value::Object(values)) => { let mut res: HashMap = HashMap::default(); for (k, v) in values { if let Some(str_value) = v.as_str() { res.insert(k.clone(), str_value.to_string().clone()); } else { return Err(ArrowError::ParseError(format!( - "Field 'metadata' contains non-string value for key {}", - k + "Field 'metadata' contains non-string value for key {k}" ))); } } @@ -180,7 +179,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { } t => { return Err(ArrowError::ParseError( - format!("Map children should be a struct with 2 fields, found {:?}", t) + format!("Map children should be a struct with 2 fields, found {t:?}") )) } } diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index a0510edd94b6..87a7edc8740b 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -172,8 +172,8 @@ impl ArrowJson { match batch { Some(Ok(batch)) => { if json_batch != batch { - println!("json: {:?}", json_batch); - println!("batch: {:?}", batch); + println!("json: {json_batch:?}"); + println!("batch: {batch:?}"); return Ok(false); } } @@ -255,8 +255,7 @@ impl ArrowJsonField { } Err(e) => { eprintln!( - "Encountered error while converting JSON field to Arrow field: {:?}", - e + "Encountered error while converting JSON field to Arrow field: {e:?}" ); false } @@ -323,10 +322,7 @@ pub fn array_from_json( { match is_valid { 1 => b.append_value(value.as_i64().ok_or_else(|| { - ArrowError::JsonError(format!( - "Unable to get {:?} as int64", - value - )) + ArrowError::JsonError(format!("Unable to get {value:?} as int64")) })? 
as i8), _ => b.append_null(), }; @@ -411,18 +407,16 @@ pub fn array_from_json( i64::from_le_bytes(bytes) } _ => panic!( - "Unable to parse {:?} as interval daytime", - value + "Unable to parse {value:?} as interval daytime" ), } } _ => panic!( - "Unable to parse {:?} as interval daytime", - value + "Unable to parse {value:?} as interval daytime" ), } } - _ => panic!("Unable to parse {:?} as number", value), + _ => panic!("Unable to parse {value:?} as number"), }), _ => b.append_null(), }; @@ -502,7 +496,7 @@ pub fn array_from_json( value.as_u64().expect("Unable to read number as u64"), ) } else { - panic!("Unable to parse value {:?} as u64", value) + panic!("Unable to parse value {value:?} as u64") } } _ => b.append_null(), @@ -542,11 +536,11 @@ pub fn array_from_json( months_days_ns } (_, _, _) => { - panic!("Unable to parse {:?} as MonthDayNano", v) + panic!("Unable to parse {v:?} as MonthDayNano") } } } - _ => panic!("Unable to parse {:?} as MonthDayNano", value), + _ => panic!("Unable to parse {value:?} as MonthDayNano"), }), _ => b.append_null(), }; @@ -760,16 +754,14 @@ pub fn array_from_json( DataType::Dictionary(key_type, value_type) => { let dict_id = field.dict_id().ok_or_else(|| { ArrowError::JsonError(format!( - "Unable to find dict_id for field {:?}", - field + "Unable to find dict_id for field {field:?}" )) })?; // find dictionary let dictionary = dictionaries .ok_or_else(|| { ArrowError::JsonError(format!( - "Unable to find any dictionaries for field {:?}", - field + "Unable to find any dictionaries for field {field:?}" )) })? .get(&dict_id); @@ -783,8 +775,7 @@ pub fn array_from_json( dictionaries, ), None => Err(ArrowError::JsonError(format!( - "Unable to find dictionary for field {:?}", - field + "Unable to find dictionary for field {field:?}" ))), } } @@ -892,8 +883,7 @@ pub fn array_from_json( Ok(Arc::new(array)) } t => Err(ArrowError::JsonError(format!( - "data type {:?} not supported", - t + "data type {t:?} not supported" ))), } } @@ -963,8 +953,7 @@ pub fn dictionary_array_from_json( Ok(array) } _ => Err(ArrowError::JsonError(format!( - "Dictionary key type {:?} not supported", - dict_key + "Dictionary key type {dict_key:?} not supported" ))), } } diff --git a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs index 5eb443b08a85..0702a8a68cae 100644 --- a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs +++ b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -62,7 +62,7 @@ fn main() -> Result<()> { fn json_to_arrow(json_name: &str, arrow_name: &str, verbose: bool) -> Result<()> { if verbose { - eprintln!("Converting {} to {}", json_name, arrow_name); + eprintln!("Converting {json_name} to {arrow_name}"); } let json_file = read_json_file(json_name)?; @@ -81,7 +81,7 @@ fn json_to_arrow(json_name: &str, arrow_name: &str, verbose: bool) -> Result<()> fn arrow_to_json(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()> { if verbose { - eprintln!("Converting {} to {}", arrow_name, json_name); + eprintln!("Converting {arrow_name} to {json_name}"); } let arrow_file = File::open(arrow_name)?; @@ -155,7 +155,7 @@ fn canonicalize_schema(schema: &Schema) -> Schema { fn validate(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()> { if verbose { - eprintln!("Validating {} and {}", arrow_name, json_name); + eprintln!("Validating {arrow_name} and {json_name}"); } // open JSON file diff --git 
a/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs b/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs index 53c6c441271b..9f66abf50106 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs @@ -30,7 +30,7 @@ type Result = std::result::Result; type Client = FlightServiceClient; pub async fn run_scenario(host: &str, port: u16) -> Result { - let url = format!("http://{}:{}", host, port); + let url = format!("http://{host}:{port}"); let mut client = FlightServiceClient::connect(url).await?; let action = arrow_flight::Action::default(); @@ -41,15 +41,13 @@ pub async fn run_scenario(host: &str, port: u16) -> Result { Err(e) => { if e.code() != tonic::Code::Unauthenticated { return Err(Box::new(Status::internal(format!( - "Expected UNAUTHENTICATED but got {:?}", - e + "Expected UNAUTHENTICATED but got {e:?}" )))); } } Ok(other) => { return Err(Box::new(Status::internal(format!( - "Expected UNAUTHENTICATED but got {:?}", - other + "Expected UNAUTHENTICATED but got {other:?}" )))); } } diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index 1f1b312f9619..3c537c5f61d8 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -42,7 +42,7 @@ type Result = std::result::Result; type Client = FlightServiceClient; pub async fn run_scenario(host: &str, port: u16, path: &str) -> Result { - let url = format!("http://{}:{}", host, port); + let url = format!("http://{host}:{port}"); let client = FlightServiceClient::connect(url).await?; @@ -235,7 +235,7 @@ async fn consume_flight_location( let expected_data = expected_batch.column(i).data(); let actual_data = actual_batch.column(i).data(); - assert_eq!(expected_data, actual_data, "Data for field {}", field_name); + assert_eq!(expected_data, actual_data, "Data for field {field_name}"); } } diff --git a/arrow-integration-testing/src/flight_client_scenarios/middleware.rs b/arrow-integration-testing/src/flight_client_scenarios/middleware.rs index 72ef37d3f548..773919ff72af 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/middleware.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/middleware.rs @@ -26,7 +26,7 @@ type Error = Box; type Result = std::result::Result; pub async fn run_scenario(host: &str, port: u16) -> Result { - let url = format!("http://{}:{}", host, port); + let url = format!("http://{host}:{port}"); let conn = tonic::transport::Endpoint::new(url)?.connect().await?; let mut client = FlightServiceClient::with_interceptor(conn, middleware_interceptor); @@ -48,8 +48,7 @@ pub async fn run_scenario(host: &str, port: u16) -> Result { if value != "expected value" { let msg = format!( "On failing call: Expected to receive header 'x-middleware: expected value', \ - but instead got: '{}'", - value + but instead got: '{value}'" ); return Err(Box::new(Status::internal(msg))); } @@ -67,8 +66,7 @@ pub async fn run_scenario(host: &str, port: u16) -> Result { if value != "expected value" { let msg = format!( "On success call: Expected to receive header 'x-middleware: expected value', \ - but instead got: '{}'", - value + but instead got: '{value}'" ); return Err(Box::new(Status::internal(msg))); } diff --git 
a/arrow-integration-testing/src/flight_server_scenarios.rs b/arrow-integration-testing/src/flight_server_scenarios.rs index 6976c1267524..9034776c68d4 100644 --- a/arrow-integration-testing/src/flight_server_scenarios.rs +++ b/arrow-integration-testing/src/flight_server_scenarios.rs @@ -28,7 +28,7 @@ type Error = Box; type Result = std::result::Result; pub async fn listen_on(port: u16) -> Result { - let addr: SocketAddr = format!("0.0.0.0:{}", port).parse()?; + let addr: SocketAddr = format!("0.0.0.0:{port}").parse()?; let listener = TcpListener::bind(addr).await?; let addr = listener.local_addr()?; diff --git a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs index 7ad4c676ffab..51d08d94313c 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs @@ -48,7 +48,7 @@ pub async fn scenario_setup(port: u16) -> Result { let addr = super::listen_on(port).await?; let service = FlightServiceImpl { - server_location: format!("grpc+tcp://{}", addr), + server_location: format!("grpc+tcp://{addr}"), ..Default::default() }; let svc = FlightServiceServer::new(service); @@ -103,13 +103,13 @@ impl FlightService for FlightServiceImpl { let ticket = request.into_inner(); let key = String::from_utf8(ticket.ticket.to_vec()) - .map_err(|e| Status::invalid_argument(format!("Invalid ticket: {:?}", e)))?; + .map_err(|e| Status::invalid_argument(format!("Invalid ticket: {e:?}")))?; let uploaded_chunks = self.uploaded_chunks.lock().await; - let flight = uploaded_chunks.get(&key).ok_or_else(|| { - Status::not_found(format!("Could not find flight. {}", key)) - })?; + let flight = uploaded_chunks + .get(&key) + .ok_or_else(|| Status::not_found(format!("Could not find flight. 
{key}")))?; let options = arrow::ipc::writer::IpcWriteOptions::default(); @@ -204,7 +204,7 @@ impl FlightService for FlightServiceImpl { Ok(Response::new(info)) } - other => Err(Status::unimplemented(format!("Request type: {}", other))), + other => Err(Status::unimplemented(format!("Request type: {other}"))), } } @@ -231,7 +231,7 @@ impl FlightService for FlightServiceImpl { let key = descriptor.path[0].clone(); let schema = Schema::try_from(&flight_data) - .map_err(|e| Status::invalid_argument(format!("Invalid schema: {:?}", e)))?; + .map_err(|e| Status::invalid_argument(format!("Invalid schema: {e:?}")))?; let schema_ref = Arc::new(schema.clone()); let (response_tx, response_rx) = mpsc::channel(10); @@ -287,7 +287,7 @@ async fn send_app_metadata( app_metadata: app_metadata.to_vec().into(), })) .await - .map_err(|e| Status::internal(format!("Could not send PutResult: {:?}", e))) + .map_err(|e| Status::internal(format!("Could not send PutResult: {e:?}"))) } async fn record_batch_from_message( @@ -309,9 +309,8 @@ async fn record_batch_from_message( &message.version(), ); - arrow_batch_result.map_err(|e| { - Status::internal(format!("Could not convert to RecordBatch: {:?}", e)) - }) + arrow_batch_result + .map_err(|e| Status::internal(format!("Could not convert to RecordBatch: {e:?}"))) } async fn dictionary_from_message( @@ -331,9 +330,8 @@ async fn dictionary_from_message( dictionaries_by_id, &message.version(), ); - dictionary_batch_result.map_err(|e| { - Status::internal(format!("Could not convert to Dictionary: {:?}", e)) - }) + dictionary_batch_result + .map_err(|e| Status::internal(format!("Could not convert to Dictionary: {e:?}"))) } async fn save_uploaded_chunks( @@ -351,7 +349,7 @@ async fn save_uploaded_chunks( while let Some(Ok(data)) = input_stream.next().await { let message = arrow::ipc::root_as_message(&data.data_header[..]) - .map_err(|e| Status::internal(format!("Could not parse message: {:?}", e)))?; + .map_err(|e| Status::internal(format!("Could not parse message: {e:?}")))?; match message.header_type() { ipc::MessageHeader::Schema => { @@ -384,8 +382,7 @@ async fn save_uploaded_chunks( t => { return Err(Status::internal(format!( "Reading types other than record batches not yet supported, \ - unable to read {:?}", - t + unable to read {t:?}" ))); } } diff --git a/arrow-integration-testing/src/lib.rs b/arrow-integration-testing/src/lib.rs index b0c8b85afe2e..fe0cc68a4205 100644 --- a/arrow-integration-testing/src/lib.rs +++ b/arrow-integration-testing/src/lib.rs @@ -90,8 +90,7 @@ pub fn read_gzip_json(version: &str, path: &str) -> ArrowJson { let testdata = arrow_test_data(); let file = File::open(format!( - "{}/arrow-ipc-stream/integration/{}/{}.json.gz", - testdata, version, path + "{testdata}/arrow-ipc-stream/integration/{version}/{path}.json.gz" )) .unwrap(); let mut gz = GzDecoder::new(&file); diff --git a/arrow-integration-testing/tests/ipc_reader.rs b/arrow-integration-testing/tests/ipc_reader.rs index 6d91eeccb19e..d6e81cd9883b 100644 --- a/arrow-integration-testing/tests/ipc_reader.rs +++ b/arrow-integration-testing/tests/ipc_reader.rs @@ -89,8 +89,7 @@ fn read_1_0_0_bigendian() { ]; paths.iter().for_each(|path| { let file = File::open(format!( - "{}/arrow-ipc-stream/integration/1.0.0-bigendian/{}.arrow_file", - testdata, path + "{testdata}/arrow-ipc-stream/integration/1.0.0-bigendian/{path}.arrow_file" )) .unwrap(); @@ -161,10 +160,8 @@ fn read_2_0_0_compression() { /// Verification json file /// `arrow-ipc-stream/integration//.json.gz fn 
verify_arrow_file(testdata: &str, version: &str, path: &str) { - let filename = format!( - "{}/arrow-ipc-stream/integration/{}/{}.arrow_file", - testdata, version, path - ); + let filename = + format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.arrow_file"); println!("Verifying {filename}"); // Compare contents to the expected output format in JSON @@ -200,10 +197,8 @@ fn verify_arrow_file(testdata: &str, version: &str, path: &str) { /// Verification json file /// `arrow-ipc-stream/integration//.json.gz fn verify_arrow_stream(testdata: &str, version: &str, path: &str) { - let filename = format!( - "{}/arrow-ipc-stream/integration/{}/{}.stream", - testdata, version, path - ); + let filename = + format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.stream"); println!("Verifying {filename}"); // Compare contents to the expected output format in JSON diff --git a/arrow-integration-testing/tests/ipc_writer.rs b/arrow-integration-testing/tests/ipc_writer.rs index a521737fa5ea..40f356b1d442 100644 --- a/arrow-integration-testing/tests/ipc_writer.rs +++ b/arrow-integration-testing/tests/ipc_writer.rs @@ -143,10 +143,8 @@ fn roundtrip_arrow_file_with_options( path: &str, options: IpcWriteOptions, ) { - let filename = format!( - "{}/arrow-ipc-stream/integration/{}/{}.arrow_file", - testdata, version, path - ); + let filename = + format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.arrow_file"); println!("Verifying {filename}"); let mut tempfile = tempfile::tempfile().unwrap(); @@ -222,10 +220,8 @@ fn roundtrip_arrow_stream_with_options( path: &str, options: IpcWriteOptions, ) { - let filename = format!( - "{}/arrow-ipc-stream/integration/{}/{}.stream", - testdata, version, path - ); + let filename = + format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.stream"); println!("Verifying {filename}"); let mut tempfile = tempfile::tempfile().unwrap(); diff --git a/arrow-ipc/src/compression.rs b/arrow-ipc/src/compression.rs index f64d14441cb1..e6e203bc0034 100644 --- a/arrow-ipc/src/compression.rs +++ b/arrow-ipc/src/compression.rs @@ -37,8 +37,7 @@ impl TryFrom for CompressionCodec { CompressionType::ZSTD => Ok(CompressionCodec::Zstd), CompressionType::LZ4_FRAME => Ok(CompressionCodec::Lz4Frame), other_type => Err(ArrowError::NotYetImplemented(format!( - "compression type {:?} not supported ", - other_type + "compression type {other_type:?} not supported " ))), } } diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 305bb943cbbf..c5681b0c8f1b 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -185,8 +185,7 @@ pub fn try_schema_from_ipc_buffer(buffer: &[u8]) -> Result { let msg = size_prefixed_root_as_message(&buffer[begin_offset..]).map_err(|err| { ArrowError::ParseError(format!( - "Unable to convert flight info to a message: {}", - err + "Unable to convert flight info to a message: {err}" )) })?; let ipc_schema = msg.header_as_schema().ok_or_else(|| { @@ -259,7 +258,7 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat crate::Precision::HALF => DataType::Float16, crate::Precision::SINGLE => DataType::Float32, crate::Precision::DOUBLE => DataType::Float64, - z => panic!("FloatingPoint type with precision of {:?} not supported", z), + z => panic!("FloatingPoint type with precision of {z:?} not supported"), } } crate::Type::Date => { @@ -267,7 +266,7 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat match date.unit() { crate::DateUnit::DAY => 
DataType::Date32, crate::DateUnit::MILLISECOND => DataType::Date64, - z => panic!("Date type with unit of {:?} not supported", z), + z => panic!("Date type with unit of {z:?} not supported"), } } crate::Type::Time => { @@ -305,7 +304,7 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat crate::TimeUnit::NANOSECOND => { DataType::Timestamp(TimeUnit::Nanosecond, timezone) } - z => panic!("Timestamp type with unit of {:?} not supported", z), + z => panic!("Timestamp type with unit of {z:?} not supported"), } } crate::Type::Interval => { @@ -320,7 +319,7 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat crate::IntervalUnit::MONTH_DAY_NANO => { DataType::Interval(IntervalUnit::MonthDayNano) } - z => panic!("Interval type with unit of {:?} unsupported", z), + z => panic!("Interval type with unit of {z:?} unsupported"), } } crate::Type::Duration => { @@ -330,7 +329,7 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat crate::TimeUnit::MILLISECOND => DataType::Duration(TimeUnit::Millisecond), crate::TimeUnit::MICROSECOND => DataType::Duration(TimeUnit::Microsecond), crate::TimeUnit::NANOSECOND => DataType::Duration(TimeUnit::Nanosecond), - z => panic!("Duration type with unit of {:?} unsupported", z), + z => panic!("Duration type with unit of {z:?} unsupported"), } } crate::Type::List => { @@ -387,7 +386,7 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat fsb.scale().try_into().unwrap(), ) } else { - panic!("Unexpected decimal bit width {}", bit_width) + panic!("Unexpected decimal bit width {bit_width}") } } crate::Type::Union => { @@ -396,7 +395,7 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat let union_mode = match union.mode() { crate::UnionMode::Dense => UnionMode::Dense, crate::UnionMode::Sparse => UnionMode::Sparse, - mode => panic!("Unexpected union mode: {:?}", mode), + mode => panic!("Unexpected union mode: {mode:?}"), }; let mut fields = vec![]; diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 231f72910174..17f521e423a4 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -203,13 +203,12 @@ fn create_array( ]; let dict_id = field.dict_id().ok_or_else(|| { - ArrowError::IoError(format!("Field {} does not have dict id", field)) + ArrowError::IoError(format!("Field {field} does not have dict id")) })?; let value_array = dictionaries_by_id.get(&dict_id).ok_or_else(|| { ArrowError::IoError(format!( - "Cannot find a dictionary batch with dict id: {}", - dict_id + "Cannot find a dictionary batch with dict id: {dict_id}" )) })?; node_index += 1; @@ -283,8 +282,7 @@ fn create_array( if length != null_count { return Err(ArrowError::IoError(format!( - "Field {} of NullArray has unequal null_count {} and len {}", - field, null_count, length + "Field {field} of NullArray has unequal null_count {null_count} and len {length}" ))); } @@ -797,7 +795,7 @@ impl FileReader { reader.read_exact(&mut footer_data)?; let footer = crate::root_as_footer(&footer_data[..]).map_err(|err| { - ArrowError::IoError(format!("Unable to get root as footer: {:?}", err)) + ArrowError::IoError(format!("Unable to get root as footer: {err:?}")) })?; let blocks = footer.recordBatches().ok_or_else(|| { @@ -828,10 +826,7 @@ impl FileReader { reader.read_exact(&mut block_data)?; let message = crate::root_as_message(&block_data[..]).map_err(|err| { - ArrowError::IoError(format!( - "Unable to get root as message: {:?}", - err - )) + 
ArrowError::IoError(format!("Unable to get root as message: {err:?}")) })?; match message.header_type() { @@ -856,8 +851,7 @@ impl FileReader { } t => { return Err(ArrowError::IoError(format!( - "Expecting DictionaryBatch in dictionary blocks, found {:?}.", - t + "Expecting DictionaryBatch in dictionary blocks, found {t:?}." ))); } } @@ -925,7 +919,7 @@ impl FileReader { let mut block_data = vec![0; meta_len as usize]; self.reader.read_exact(&mut block_data)?; let message = crate::root_as_message(&block_data[..]).map_err(|err| { - ArrowError::IoError(format!("Unable to get root as footer: {:?}", err)) + ArrowError::IoError(format!("Unable to get root as footer: {err:?}")) })?; // some old test data's footer metadata is not set, so we account for that @@ -968,7 +962,7 @@ impl FileReader { Ok(None) } t => Err(ArrowError::IoError(format!( - "Reading types other than record batches not yet supported, unable to read {:?}", t + "Reading types other than record batches not yet supported, unable to read {t:?}" ))), } } @@ -1054,7 +1048,7 @@ impl StreamReader { reader.read_exact(&mut meta_buffer)?; let message = crate::root_as_message(meta_buffer.as_slice()).map_err(|err| { - ArrowError::IoError(format!("Unable to get root as message: {:?}", err)) + ArrowError::IoError(format!("Unable to get root as message: {err:?}")) })?; // message header is a Schema, so read it let ipc_schema: crate::Schema = message.header_as_schema().ok_or_else(|| { @@ -1133,7 +1127,7 @@ impl StreamReader { let vecs = &meta_buffer.to_vec(); let message = crate::root_as_message(vecs).map_err(|err| { - ArrowError::IoError(format!("Unable to get root as message: {:?}", err)) + ArrowError::IoError(format!("Unable to get root as message: {err:?}")) })?; match message.header_type() { @@ -1173,7 +1167,7 @@ impl StreamReader { Ok(None) } t => Err(ArrowError::IoError( - format!("Reading types other than record batches not yet supported, unable to read {:?} ", t) + format!("Reading types other than record batches not yet supported, unable to read {t:?} ") )), } } diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index ec3cba64aa73..ea6eb360e579 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -118,8 +118,7 @@ impl IpcWriteOptions { } } z => Err(ArrowError::InvalidArgumentError(format!( - "Unsupported crate::MetadataVersion {:?}", - z + "Unsupported crate::MetadataVersion {z:?}" ))), } } @@ -962,7 +961,7 @@ fn write_continuation( writer.write_all(&CONTINUATION_MARKER)?; writer.write_all(&total_len.to_le_bytes()[..])?; } - z => panic!("Unsupported crate::MetadataVersion {:?}", z), + z => panic!("Unsupported crate::MetadataVersion {z:?}"), }; writer.flush()?; @@ -1296,8 +1295,7 @@ fn write_buffer( .try_into() .map_err(|e| { ArrowError::InvalidArgumentError(format!( - "Could not convert compressed size to i64: {}", - e + "Could not convert compressed size to i64: {e}" )) })?; diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index 9ffa7d2133a0..267c8bebc83d 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -281,9 +281,9 @@ fn make_decoder( DataType::LargeList(_) => Ok(Box::new(ListArrayDecoder::::new(data_type, is_nullable)?)), DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(data_type, is_nullable)?)), DataType::Binary | DataType::LargeBinary | DataType::FixedSizeBinary(_) => { - Err(ArrowError::JsonError(format!("{} is not supported by JSON", data_type))) + Err(ArrowError::JsonError(format!("{data_type} is not supported by JSON"))) } - d => 
Err(ArrowError::NotYetImplemented(format!("Support for {} in JSON reader", d))) + d => Err(ArrowError::NotYetImplemented(format!("Support for {d} in JSON reader"))) } } diff --git a/arrow-json/src/raw/tape.rs b/arrow-json/src/raw/tape.rs index 6ca4e2d3f521..b0c814c766b8 100644 --- a/arrow-json/src/raw/tape.rs +++ b/arrow-json/src/raw/tape.rs @@ -562,7 +562,7 @@ fn err(b: u8, ctx: &str) -> ArrowError { fn char_from_surrogate_pair(low: u16, high: u16) -> Result { let n = (((high - 0xD800) as u32) << 10 | (low - 0xDC00) as u32) + 0x1_0000; char::from_u32(n).ok_or_else(|| { - ArrowError::JsonError(format!("Invalid UTF-16 surrogate pair {}", n)) + ArrowError::JsonError(format!("Invalid UTF-16 surrogate pair {n}")) }) } diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index c2647ebfc18c..1d4cfc740fdf 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -46,7 +46,7 @@ //! let batch = json.next().unwrap().unwrap(); //! ``` -use std::io::{BufRead, BufReader, Read, Seek, SeekFrom}; +use std::io::{BufRead, BufReader, Read, Seek}; use std::sync::Arc; use indexmap::map::IndexMap as HashMap; @@ -104,8 +104,7 @@ impl InferredType { // incompatible types (s, o) => { return Err(ArrowError::JsonError(format!( - "Incompatible type found during schema inference: {:?} v.s. {:?}", - s, o, + "Incompatible type found during schema inference: {s:?} v.s. {o:?}", ))); } } @@ -228,8 +227,7 @@ impl<'a, R: Read> Iterator for ValueIter<'a, R> { } Err(e) => { return Some(Err(ArrowError::JsonError(format!( - "Failed to read JSON record: {}", - e + "Failed to read JSON record: {e}" )))); } _ => { @@ -241,7 +239,7 @@ impl<'a, R: Read> Iterator for ValueIter<'a, R> { self.record_count += 1; return Some(serde_json::from_str(trimmed_s).map_err(|e| { - ArrowError::JsonError(format!("Not valid JSON: {}", e)) + ArrowError::JsonError(format!("Not valid JSON: {e}")) })); } } @@ -275,7 +273,7 @@ pub fn infer_json_schema_from_seekable( ) -> Result { let schema = infer_json_schema(reader, max_read_records); // return the reader seek back to the start - reader.seek(SeekFrom::Start(0))?; + reader.rewind()?; schema } @@ -336,8 +334,7 @@ fn set_object_scalar_field_type( Ok(()) } t => Err(ArrowError::JsonError(format!( - "Expected scalar or scalar array JSON type, found: {:?}", - t, + "Expected scalar or scalar array JSON type, found: {t:?}", ))), } } @@ -363,8 +360,7 @@ fn infer_scalar_array_type(array: &[Value]) -> Result } Value::Array(_) | Value::Object(_) => { return Err(ArrowError::JsonError(format!( - "Expected scalar value for scalar array, got: {:?}", - v + "Expected scalar value for scalar array, got: {v:?}" ))); } } @@ -383,8 +379,7 @@ fn infer_nested_array_type(array: &[Value]) -> Result } x => { return Err(ArrowError::JsonError(format!( - "Got non array element in nested array: {:?}", - x + "Got non array element in nested array: {x:?}" ))); } } @@ -403,8 +398,7 @@ fn infer_struct_array_type(array: &[Value]) -> Result } _ => { return Err(ArrowError::JsonError(format!( - "Expected struct value for struct array, got: {:?}", - v + "Expected struct value for struct array, got: {v:?}" ))); } } @@ -474,8 +468,7 @@ fn collect_field_types_from_object( } t => { return Err(ArrowError::JsonError(format!( - "Expected array json type, found: {:?}", - t, + "Expected array json type, found: {t:?}", ))); } } @@ -509,8 +502,7 @@ fn collect_field_types_from_object( } t => { return Err(ArrowError::JsonError(format!( - "Expected object json type, found: {:?}", - t, + "Expected object json type, found: {t:?}", 
))); } } @@ -547,8 +539,7 @@ where } value => { return Err(ArrowError::JsonError(format!( - "Expected JSON record to be an object, found {:?}", - value + "Expected JSON record to be an object, found {value:?}" ))); } }; @@ -698,8 +689,7 @@ impl Decoder { Value::Object(_) => rows.push(v), _ => { return Err(ArrowError::JsonError(format!( - "Row needs to be of type object, got: {:?}", - v + "Row needs to be of type object, got: {v:?}" ))); } } @@ -803,8 +793,7 @@ impl Decoder { self.list_array_string_array_builder::(&dtype, col_name, rows) } ref e => Err(ArrowError::JsonError(format!( - "Data type is currently not supported for dictionaries in list : {:?}", - e + "Data type is currently not supported for dictionaries in list : {e:?}" ))), } } @@ -832,8 +821,7 @@ impl Decoder { } e => { return Err(ArrowError::JsonError(format!( - "Nested list data builder type is not supported: {:?}", - e + "Nested list data builder type is not supported: {e:?}" ))) } }; @@ -905,8 +893,7 @@ impl Decoder { } e => { return Err(ArrowError::JsonError(format!( - "Nested list data builder type is not supported: {:?}", - e + "Nested list data builder type is not supported: {e:?}" ))) } } @@ -1174,8 +1161,7 @@ impl Decoder { } datatype => { return Err(ArrowError::JsonError(format!( - "Nested list of {:?} not supported", - datatype + "Nested list of {datatype:?} not supported" ))); } }; @@ -1288,8 +1274,7 @@ impl Decoder { field.name(), ), t => Err(ArrowError::JsonError(format!( - "TimeUnit {:?} not supported with Time64", - t + "TimeUnit {t:?} not supported with Time64" ))), }, DataType::Time32(unit) => match unit { @@ -1304,8 +1289,7 @@ impl Decoder { field.name(), ), t => Err(ArrowError::JsonError(format!( - "TimeUnit {:?} not supported with Time32", - t + "TimeUnit {t:?} not supported with Time32" ))), }, DataType::Utf8 => Ok(Arc::new( @@ -2168,7 +2152,7 @@ mod tests { let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap(); let mut reader = BufReader::new(GzDecoder::new(&file)); let schema = infer_json_schema(&mut reader, None).unwrap(); - file.seek(SeekFrom::Start(0)).unwrap(); + file.rewind().unwrap(); let reader = BufReader::new(GzDecoder::new(&file)); let options = DecoderOptions::new().with_batch_size(64); diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 9045bd3a77ee..9d241aed3d28 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -198,8 +198,7 @@ pub fn array_to_json_array(array: &ArrayRef) -> Result, ArrowError> { Ok(jsonmaps.into_iter().map(Value::Object).collect()) } t => Err(ArrowError::JsonError(format!( - "data type {:?} not supported", - t + "data type {t:?} not supported" ))), } } diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index b8b510a2eb84..89fbccead6f9 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -452,7 +452,7 @@ fn try_to_type_result( ty: &str, ) -> Result { value.ok_or_else(|| { - ArrowError::ComputeError(format!("Could not convert {} with {}", right, ty,)) + ArrowError::ComputeError(format!("Could not convert {right} with {ty}",)) }) } diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index 00b6668adaf9..dc352c5b7274 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -123,8 +123,7 @@ where Int64 => compare_dict_primitive::(left, right), t => { return Err(ArrowError::InvalidArgumentError(format!( - "Dictionaries do not support keys of type {:?}", - t + "Dictionaries do not support keys of type {t:?}" ))); } }) @@ -255,15 +254,13 @@ pub fn build_compare( Int64 
=> compare_dict_string::(left, right), lhs => { return Err(ArrowError::InvalidArgumentError(format!( - "Dictionaries do not support keys of type {:?}", - lhs + "Dictionaries do not support keys of type {lhs:?}" ))); } }, t => { return Err(ArrowError::InvalidArgumentError(format!( - "Dictionaries of value data type {:?} are not supported", - t + "Dictionaries of value data type {t:?} are not supported" ))); } } @@ -278,8 +275,7 @@ pub fn build_compare( } (lhs, _) => { return Err(ArrowError::InvalidArgumentError(format!( - "The data type type {:?} has no natural order", - lhs + "The data type type {lhs:?} has no natural order" ))); } }) diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index d13a7a03de94..f36e91d648c4 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -282,8 +282,7 @@ pub fn sort_to_indices( } t => { return Err(ArrowError::ComputeError(format!( - "Sort not supported for list type {:?}", - t + "Sort not supported for list type {t:?}" ))); } }, @@ -310,8 +309,7 @@ pub fn sort_to_indices( } t => { return Err(ArrowError::ComputeError(format!( - "Sort not supported for list type {:?}", - t + "Sort not supported for list type {t:?}" ))); } }, @@ -347,11 +345,11 @@ pub fn sort_to_indices( sort_string_dictionary::<_>(values, &value_indices_map, v, n, &options, limit) }, t => return Err(ArrowError::ComputeError(format!( - "Unsupported dictionary value type {}", t + "Unsupported dictionary value type {t}" ))), }, t => return Err(ArrowError::ComputeError(format!( - "Unsupported datatype {}", t + "Unsupported datatype {t}" ))), ) } @@ -361,8 +359,7 @@ pub fn sort_to_indices( DataType::LargeBinary => sort_binary::(values, v, n, &options, limit), t => { return Err(ArrowError::ComputeError(format!( - "Sort not supported for data type {:?}", - t + "Sort not supported for data type {t:?}" ))); } }) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index eb9dc29848f0..1d54a008f36b 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -578,8 +578,7 @@ impl RowConverter { pub fn new(fields: Vec) -> Result { if !Self::supports_fields(&fields) { return Err(ArrowError::NotYetImplemented(format!( - "Row format support not yet implemented for: {:?}", - fields + "Row format support not yet implemented for: {fields:?}" ))); } diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 1e5c1321c952..78ad0258d512 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -299,7 +299,7 @@ pub enum UnionMode { impl fmt::Display for DataType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } diff --git a/arrow-schema/src/error.rs b/arrow-schema/src/error.rs index 6213af8bcf10..cd236c0871a6 100644 --- a/arrow-schema/src/error.rs +++ b/arrow-schema/src/error.rs @@ -76,23 +76,23 @@ impl Display for ArrowError { write!(f, "Not yet implemented: {}", &source) } ArrowError::ExternalError(source) => write!(f, "External error: {}", &source), - ArrowError::CastError(desc) => write!(f, "Cast error: {}", desc), - ArrowError::MemoryError(desc) => write!(f, "Memory error: {}", desc), - ArrowError::ParseError(desc) => write!(f, "Parser error: {}", desc), - ArrowError::SchemaError(desc) => write!(f, "Schema error: {}", desc), - ArrowError::ComputeError(desc) => write!(f, "Compute error: {}", desc), + ArrowError::CastError(desc) => write!(f, "Cast error: {desc}"), + ArrowError::MemoryError(desc) => write!(f, "Memory error: {desc}"), + ArrowError::ParseError(desc) => write!(f, "Parser 
error: {desc}"), + ArrowError::SchemaError(desc) => write!(f, "Schema error: {desc}"), + ArrowError::ComputeError(desc) => write!(f, "Compute error: {desc}"), ArrowError::DivideByZero => write!(f, "Divide by zero error"), - ArrowError::CsvError(desc) => write!(f, "Csv error: {}", desc), - ArrowError::JsonError(desc) => write!(f, "Json error: {}", desc), - ArrowError::IoError(desc) => write!(f, "Io error: {}", desc), + ArrowError::CsvError(desc) => write!(f, "Csv error: {desc}"), + ArrowError::JsonError(desc) => write!(f, "Json error: {desc}"), + ArrowError::IoError(desc) => write!(f, "Io error: {desc}"), ArrowError::InvalidArgumentError(desc) => { - write!(f, "Invalid argument error: {}", desc) + write!(f, "Invalid argument error: {desc}") } ArrowError::ParquetError(desc) => { - write!(f, "Parquet argument error: {}", desc) + write!(f, "Parquet argument error: {desc}") } ArrowError::CDataInterface(desc) => { - write!(f, "C Data interface error: {}", desc) + write!(f, "C Data interface error: {desc}") } ArrowError::DictionaryKeyOverflowError => { write!(f, "Dictionary key bigger than the key type") diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index dc3ab3d6237f..8dcb8cea9e7c 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -476,7 +476,7 @@ impl Field { // TODO: improve display with crate https://crates.io/crates/derive_more ? impl std::fmt::Display for Field { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index e45cedfb6769..b7971027f13e 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -151,8 +151,7 @@ impl Schema { if old_val != &value { return Err(ArrowError::SchemaError(format!( "Fail to merge schema due to conflicting metadata. \ - Key '{}' has different values '{}' and '{}'", - key, old_val, value + Key '{key}' has different values '{old_val}' and '{value}'" ))); } } @@ -212,8 +211,7 @@ impl Schema { let valid_fields: Vec = self.fields.iter().map(|f| f.name().clone()).collect(); ArrowError::SchemaError(format!( - "Unable to get field named \"{}\". Valid fields: {:?}", - name, valid_fields + "Unable to get field named \"{name}\". Valid fields: {valid_fields:?}" )) }) } @@ -764,9 +762,7 @@ mod tests { let expected = "Fail to merge schema due to conflicting metadata. Key 'foo' has different values 'bar' and 'baz'"; assert!( res.to_string().contains(expected), - "Could not find expected string '{}' in '{}'", - expected, - res + "Could not find expected string '{expected}' in '{res}'" ); } } diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index cff8fd25b7f1..be6b0a063275 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -106,8 +106,7 @@ pub fn concat_batches<'a>( .find(|&(_, batch)| batch.schema() != *schema) { return Err(ArrowError::InvalidArgumentError(format!( - "batches[{}] schema is different with argument schema.", - i + "batches[{i}] schema is different with argument schema." 
))); } let field_num = schema.fields().len(); @@ -555,8 +554,7 @@ mod tests { assert_eq!( combined.values(), &(Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef), - "Actual: {:#?}", - combined + "Actual: {combined:#?}" ); assert_eq!( diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 9fffa0b5f6de..d8989fa48293 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -100,7 +100,7 @@ where })?; if ix >= len { return Err(ArrowError::ComputeError( - format!("Array index out of bounds, cannot get item at index {} from {} entries", ix, len)) + format!("Array index out of bounds, cannot get item at index {ix} from {len} entries")) ); } Ok(()) @@ -112,7 +112,7 @@ where })?; if ix >= len { return Err(ArrowError::ComputeError( - format!("Array index out of bounds, cannot get item at index {} from {} entries", ix, len)) + format!("Array index out of bounds, cannot get item at index {ix} from {len} entries")) ); } Ok(()) @@ -340,7 +340,7 @@ where if indices_data.is_null(index) { T::default() } else { - panic!("Out-of-bounds index {}", index) + panic!("Out-of-bounds index {index}") } } }) diff --git a/arrow-string/src/concat_elements.rs b/arrow-string/src/concat_elements.rs index e9219fb2dc09..78fe3a47d1b9 100644 --- a/arrow-string/src/concat_elements.rs +++ b/arrow-string/src/concat_elements.rs @@ -101,8 +101,7 @@ pub fn concat_elements_utf8_many( let size = arrays[0].len(); if !arrays.iter().all(|array| array.len() == size) { return Err(ArrowError::ComputeError(format!( - "Arrays must have the same length of {}", - size, + "Arrays must have the same length of {size}", ))); } diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index f7faa0a61435..9651bef2771f 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -176,8 +176,7 @@ pub fn length(array: &dyn Array) -> Result { DataType::Binary => Ok(length_binary::(array)), DataType::LargeBinary => Ok(length_binary::(array)), other => Err(ArrowError::ComputeError(format!( - "length not supported for {:?}", - other + "length not supported for {other:?}" ))), } } @@ -210,8 +209,7 @@ pub fn bit_length(array: &dyn Array) -> Result { DataType::Binary => Ok(bit_length_binary::(array)), DataType::LargeBinary => Ok(bit_length_binary::(array)), other => Err(ArrowError::ComputeError(format!( - "bit_length not supported for {:?}", - other + "bit_length not supported for {other:?}" ))), } } diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index c9cdb7bab18d..10a58b3c00f6 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -266,10 +266,9 @@ fn like<'a, S: ArrayAccessor>( right: S, ) -> Result { regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + Regex::new(&format!("^{re_pattern}$")).map_err(|e| { ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e + "Unable to build regex from LIKE pattern: {e}" )) }) }) @@ -313,10 +312,9 @@ fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor>( })) } else { let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + let re = Regex::new(&format!("^{re_pattern}$")).map_err(|e| { ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e + "Unable to build regex from LIKE pattern: {e}" )) })?; @@ -397,10 +395,9 @@ fn nlike<'a, S: ArrayAccessor>( right: S, ) -> Result { regex_like(left, right, true, |re_pattern| { - 
Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { + Regex::new(&format!("^{re_pattern}$")).map_err(|e| { ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e + "Unable to build regex from LIKE pattern: {e}" )) }) }) @@ -445,10 +442,9 @@ fn ilike<'a, S: ArrayAccessor>( right: S, ) -> Result { regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + Regex::new(&format!("(?i)^{re_pattern}$")).map_err(|e| { ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e + "Unable to build regex from ILIKE pattern: {e}" )) }) }) @@ -491,11 +487,8 @@ fn ilike_scalar_op bool>( } let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e - )) + let re = Regex::new(&format!("(?i)^{re_pattern}$")).map_err(|e| { + ArrowError::ComputeError(format!("Unable to build regex from ILIKE pattern: {e}")) })?; Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item)))) @@ -537,10 +530,9 @@ fn nilike<'a, S: ArrayAccessor>( right: S, ) -> Result { regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { + Regex::new(&format!("(?i)^{re_pattern}$")).map_err(|e| { ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {}", - e + "Unable to build regex from ILIKE pattern: {e}" )) }) }) diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index ddb47969cf29..4072d8ba07e5 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -55,7 +55,7 @@ pub fn regexp_is_match_utf8( Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( |(pattern, flags)| { pattern.map(|pattern| match flags { - Some(flag) => format!("(?{}){}", flag, pattern), + Some(flag) => format!("(?{flag}){pattern}"), None => pattern.to_string(), }) }, @@ -84,8 +84,7 @@ pub fn regexp_is_match_utf8( None => { let re = Regex::new(pattern.as_str()).map_err(|e| { ArrowError::ComputeError(format!( - "Regular expression did not compile: {:?}", - e + "Regular expression did not compile: {e:?}" )) })?; patterns.insert(pattern, re.clone()); @@ -127,17 +126,14 @@ pub fn regexp_is_match_utf8_scalar( let mut result = BooleanBufferBuilder::new(array.len()); let pattern = match flag { - Some(flag) => format!("(?{}){}", flag, regex), + Some(flag) => format!("(?{flag}){regex}"), None => regex.to_string(), }; if pattern.is_empty() { result.append_n(array.len(), true); } else { let re = Regex::new(pattern.as_str()).map_err(|e| { - ArrowError::ComputeError(format!( - "Regular expression did not compile: {:?}", - e - )) + ArrowError::ComputeError(format!("Regular expression did not compile: {e:?}")) })?; for i in 0..array.len() { let value = array.value(i); @@ -175,7 +171,7 @@ pub fn regexp_match( Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( |(pattern, flags)| { pattern.map(|pattern| match flags { - Some(value) => format!("(?{}){}", value, pattern), + Some(value) => format!("(?{value}){pattern}"), None => pattern.to_string(), }) }, @@ -204,8 +200,7 @@ pub fn regexp_match( None => { let re = Regex::new(pattern.as_str()).map_err(|e| { ArrowError::ComputeError(format!( - "Regular expression did not compile: {:?}", - e + "Regular expression did not compile: {e:?}" )) })?; patterns.insert(pattern, re.clone()); diff --git a/arrow-string/src/substring.rs 
b/arrow-string/src/substring.rs index ece367553414..7d04304771a6 100644 --- a/arrow-string/src/substring.rs +++ b/arrow-string/src/substring.rs @@ -379,8 +379,7 @@ fn utf8_substring( Ok(offset) } else { Err(ArrowError::ComputeError(format!( - "The offset {} is at an invalid utf-8 boundary.", - offset_usize + "The offset {offset_usize} is at an invalid utf-8 boundary." ))) } } diff --git a/arrow/benches/arithmetic_kernels.rs b/arrow/benches/arithmetic_kernels.rs index 2aa2e7191a68..4ed197783b07 100644 --- a/arrow/benches/arithmetic_kernels.rs +++ b/arrow/benches/arithmetic_kernels.rs @@ -33,46 +33,46 @@ fn add_benchmark(c: &mut Criterion) { let arr_b = create_primitive_array::(BATCH_SIZE, null_density); let scalar = seedable_rng().gen(); - c.bench_function(&format!("add({})", null_density), |b| { + c.bench_function(&format!("add({null_density})"), |b| { b.iter(|| criterion::black_box(add(&arr_a, &arr_b).unwrap())) }); - c.bench_function(&format!("add_checked({})", null_density), |b| { + c.bench_function(&format!("add_checked({null_density})"), |b| { b.iter(|| criterion::black_box(add_checked(&arr_a, &arr_b).unwrap())) }); - c.bench_function(&format!("add_scalar({})", null_density), |b| { + c.bench_function(&format!("add_scalar({null_density})"), |b| { b.iter(|| criterion::black_box(add_scalar(&arr_a, scalar).unwrap())) }); - c.bench_function(&format!("subtract({})", null_density), |b| { + c.bench_function(&format!("subtract({null_density})"), |b| { b.iter(|| criterion::black_box(subtract(&arr_a, &arr_b).unwrap())) }); - c.bench_function(&format!("subtract_checked({})", null_density), |b| { + c.bench_function(&format!("subtract_checked({null_density})"), |b| { b.iter(|| criterion::black_box(subtract_checked(&arr_a, &arr_b).unwrap())) }); - c.bench_function(&format!("subtract_scalar({})", null_density), |b| { + c.bench_function(&format!("subtract_scalar({null_density})"), |b| { b.iter(|| criterion::black_box(subtract_scalar(&arr_a, scalar).unwrap())) }); - c.bench_function(&format!("multiply({})", null_density), |b| { + c.bench_function(&format!("multiply({null_density})"), |b| { b.iter(|| criterion::black_box(multiply(&arr_a, &arr_b).unwrap())) }); - c.bench_function(&format!("multiply_checked({})", null_density), |b| { + c.bench_function(&format!("multiply_checked({null_density})"), |b| { b.iter(|| criterion::black_box(multiply_checked(&arr_a, &arr_b).unwrap())) }); - c.bench_function(&format!("multiply_scalar({})", null_density), |b| { + c.bench_function(&format!("multiply_scalar({null_density})"), |b| { b.iter(|| criterion::black_box(multiply_scalar(&arr_a, scalar).unwrap())) }); - c.bench_function(&format!("divide({})", null_density), |b| { + c.bench_function(&format!("divide({null_density})"), |b| { b.iter(|| criterion::black_box(divide(&arr_a, &arr_b).unwrap())) }); - c.bench_function(&format!("divide_checked({})", null_density), |b| { + c.bench_function(&format!("divide_checked({null_density})"), |b| { b.iter(|| criterion::black_box(divide_checked(&arr_a, &arr_b).unwrap())) }); - c.bench_function(&format!("divide_scalar({})", null_density), |b| { + c.bench_function(&format!("divide_scalar({null_density})"), |b| { b.iter(|| criterion::black_box(divide_scalar(&arr_a, scalar).unwrap())) }); - c.bench_function(&format!("modulo({})", null_density), |b| { + c.bench_function(&format!("modulo({null_density})"), |b| { b.iter(|| criterion::black_box(modulus(&arr_a, &arr_b).unwrap())) }); - c.bench_function(&format!("modulo_scalar({})", null_density), |b| { + 
c.bench_function(&format!("modulo_scalar({null_density})"), |b| { b.iter(|| criterion::black_box(modulus_scalar(&arr_a, scalar).unwrap())) }); } diff --git a/arrow/benches/csv_reader.rs b/arrow/benches/csv_reader.rs index 02c8ca2d2993..66a956315b29 100644 --- a/arrow/benches/csv_reader.rs +++ b/arrow/benches/csv_reader.rs @@ -37,7 +37,7 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { drop(csv); for batch_size in [128, 1024, 4096] { - c.bench_function(&format!("{} - {}", name, batch_size), |b| { + c.bench_function(&format!("{name} - {batch_size}"), |b| { b.iter(|| { let cursor = Cursor::new(buf.as_slice()); let reader = csv::ReaderBuilder::new() diff --git a/arrow/benches/interleave_kernels.rs b/arrow/benches/interleave_kernels.rs index 0c3eec60c0ce..2bb430e40b0f 100644 --- a/arrow/benches/interleave_kernels.rs +++ b/arrow/benches/interleave_kernels.rs @@ -53,10 +53,9 @@ fn do_bench( }) .collect(); - c.bench_function( - &format!("interleave {} {} {:?}", prefix, len, slices), - |b| b.iter(|| criterion::black_box(interleave(&values, &indices).unwrap())), - ); + c.bench_function(&format!("interleave {prefix} {len} {slices:?}"), |b| { + b.iter(|| criterion::black_box(interleave(&values, &indices).unwrap())) + }); } fn add_benchmark(c: &mut Criterion) { diff --git a/arrow/benches/lexsort.rs b/arrow/benches/lexsort.rs index 5c161ec8df0f..30dab9a74667 100644 --- a/arrow/benches/lexsort.rs +++ b/arrow/benches/lexsort.rs @@ -89,16 +89,11 @@ fn do_bench(c: &mut Criterion, columns: &[Column], len: usize) { }) .collect(); - c.bench_function( - &format!("lexsort_to_indices({:?}): {}", columns, len), - |b| { - b.iter(|| { - criterion::black_box(lexsort_to_indices(&sort_columns, None).unwrap()) - }) - }, - ); + c.bench_function(&format!("lexsort_to_indices({columns:?}): {len}"), |b| { + b.iter(|| criterion::black_box(lexsort_to_indices(&sort_columns, None).unwrap())) + }); - c.bench_function(&format!("lexsort_rows({:?}): {}", columns, len), |b| { + c.bench_function(&format!("lexsort_rows({columns:?}): {len}"), |b| { b.iter(|| { criterion::black_box({ let fields = arrays diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs index ac9f3106f7e7..961cf07de721 100644 --- a/arrow/benches/row_format.rs +++ b/arrow/benches/row_format.rs @@ -36,7 +36,7 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { .map(|x| SortField::new(x.data_type().clone())) .collect(); - c.bench_function(&format!("convert_columns {}", name), |b| { + c.bench_function(&format!("convert_columns {name}"), |b| { b.iter(|| { let mut converter = RowConverter::new(fields.clone()).unwrap(); black_box(converter.convert_columns(&cols).unwrap()) @@ -46,11 +46,11 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { let mut converter = RowConverter::new(fields).unwrap(); let rows = converter.convert_columns(&cols).unwrap(); // using a pre-prepared row converter should be faster than the first time - c.bench_function(&format!("convert_columns_prepared {}", name), |b| { + c.bench_function(&format!("convert_columns_prepared {name}"), |b| { b.iter(|| black_box(converter.convert_columns(&cols).unwrap())); }); - c.bench_function(&format!("convert_rows {}", name), |b| { + c.bench_function(&format!("convert_rows {name}"), |b| { b.iter(|| black_box(converter.convert_rows(&rows).unwrap())); }); } diff --git a/arrow/benches/string_dictionary_builder.rs b/arrow/benches/string_dictionary_builder.rs index 411df3d69b52..424400674cd8 100644 --- a/arrow/benches/string_dictionary_builder.rs +++ 
b/arrow/benches/string_dictionary_builder.rs @@ -37,10 +37,7 @@ fn criterion_benchmark(c: &mut Criterion) { let mut do_bench = |dict_size: usize, total_size: usize, key_len: usize| { group.bench_function( - format!( - "(dict_size:{}, len:{}, key_len: {})", - dict_size, total_size, key_len - ), + format!("(dict_size:{dict_size}, len:{total_size}, key_len: {key_len})"), |b| { let strings = build_strings(dict_size, total_size, key_len); b.iter(|| { diff --git a/arrow/examples/builders.rs b/arrow/examples/builders.rs index bacd550bdfde..312de11b303d 100644 --- a/arrow/examples/builders.rs +++ b/arrow/examples/builders.rs @@ -52,17 +52,17 @@ fn main() { // Build the `PrimitiveArray` let primitive_array = primitive_array_builder.finish(); // Long arrays will have an ellipsis printed in the middle - println!("{:?}", primitive_array); + println!("{primitive_array:?}"); // Arrays can also be built from `Vec>`. `None` // represents a null value in the array. let date_array: PrimitiveArray = vec![Some(1550902545147), None, Some(1550902545147)].into(); - println!("{:?}", date_array); + println!("{date_array:?}"); let time_array: PrimitiveArray = (0..100).collect::>().into(); - println!("{:?}", time_array); + println!("{time_array:?}"); // We can build arrays directly from the underlying buffers. @@ -83,7 +83,7 @@ fn main() { .build() .unwrap(); let binary_array = StringArray::from(array_data); - println!("{:?}", binary_array); + println!("{binary_array:?}"); // ListArrays are similar to ByteArrays: they are arrays of other // arrays, where each child array is a slice of the underlying @@ -109,7 +109,7 @@ fn main() { .unwrap(); let list_array = ListArray::from(list_data); - println!("{:?}", list_array); + println!("{list_array:?}"); // StructArrays are arrays of tuples, where each tuple element is // from a child array. (In other words, they're like zipping @@ -128,5 +128,5 @@ fn main() { Arc::new(Int32Array::from(vec![42, 28, 19, 31])), ), ]); - println!("{:?}", struct_array); + println!("{struct_array:?}"); } diff --git a/arrow/examples/collect.rs b/arrow/examples/collect.rs index d523a8036a2f..5581186dbe7a 100644 --- a/arrow/examples/collect.rs +++ b/arrow/examples/collect.rs @@ -29,18 +29,18 @@ fn main() { // Create an Int8Array with 4 values let array: Int8Array = vec![1, 2, 3, 4].into_iter().collect(); - println!("{:?}", array); + println!("{array:?}"); // Arrays can also be built from `Vec>`. `None` // represents a null value in the array. 
let array: Int8Array = vec![Some(1_i8), Some(2), None, Some(3)] .into_iter() .collect(); - println!("{:?}", array); + println!("{array:?}"); assert!(array.is_null(2)); let array: Float32Array = [Some(1.0_f32), Some(2.3), None].into_iter().collect(); - println!("{:?}", array); + println!("{array:?}"); assert_eq!(array.value(0), 1.0_f32); assert_eq!(array.value(1), 2.3_f32); assert!(array.is_null(2)); diff --git a/arrow/examples/tensor_builder.rs b/arrow/examples/tensor_builder.rs index 1ef53920e046..ca31679e250d 100644 --- a/arrow/examples/tensor_builder.rs +++ b/arrow/examples/tensor_builder.rs @@ -39,7 +39,7 @@ fn main() -> Result<()> { // storage data let tensor = Int32Tensor::try_new(buf, Some(vec![2, 8]), None, None)?; println!("Int32 Tensor"); - println!("{:?}", tensor); + println!("{tensor:?}"); // Creating a tensor using float type buffer builder let mut builder = Float32BufferBuilder::new(4); @@ -54,14 +54,14 @@ fn main() -> Result<()> { // storage data let tensor = Float32Tensor::try_new(buf, Some(vec![2, 2]), None, None)?; println!("\nFloat32 Tensor"); - println!("{:?}", tensor); + println!("{tensor:?}"); // In order to build a tensor from an array the function to_byte_slice add the // required padding to the elements in the array. let buf = Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7, 9, 10].to_byte_slice()); let tensor = Int32Tensor::try_new(buf, Some(vec![2, 5]), None, None)?; println!("\nInt32 Tensor"); - println!("{:?}", tensor); + println!("{tensor:?}"); Ok(()) } diff --git a/arrow/src/datatypes/ffi.rs b/arrow/src/datatypes/ffi.rs index 37fa85fcf5dd..58cad3d08a4e 100644 --- a/arrow/src/datatypes/ffi.rs +++ b/arrow/src/datatypes/ffi.rs @@ -133,8 +133,7 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { } _ => { return Err(ArrowError::CDataInterface(format!( - "The decimal pattern \"d:{:?}\" is not supported in the Rust implementation", - extra + "The decimal pattern \"d:{extra:?}\" is not supported in the Rust implementation" ))) } } @@ -203,8 +202,7 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { } _ => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" is still not supported in Rust implementation", - other + "The datatype \"{other:?}\" is still not supported in Rust implementation" ))) } } @@ -304,13 +302,11 @@ fn get_format_string(dtype: &DataType) -> Result { DataType::LargeBinary => Ok("Z".to_string()), DataType::Utf8 => Ok("u".to_string()), DataType::LargeUtf8 => Ok("U".to_string()), - DataType::FixedSizeBinary(num_bytes) => Ok(format!("w:{}", num_bytes)), - DataType::FixedSizeList(_, num_elems) => Ok(format!("+w:{}", num_elems)), - DataType::Decimal128(precision, scale) => { - Ok(format!("d:{},{}", precision, scale)) - } + DataType::FixedSizeBinary(num_bytes) => Ok(format!("w:{num_bytes}")), + DataType::FixedSizeList(_, num_elems) => Ok(format!("+w:{num_elems}")), + DataType::Decimal128(precision, scale) => Ok(format!("d:{precision},{scale}")), DataType::Decimal256(precision, scale) => { - Ok(format!("d:{},{},256", precision, scale)) + Ok(format!("d:{precision},{scale},256")) } DataType::Date32 => Ok("tdD".to_string()), DataType::Date64 => Ok("tdm".to_string()), @@ -322,10 +318,10 @@ fn get_format_string(dtype: &DataType) -> Result { DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("tsm:".to_string()), DataType::Timestamp(TimeUnit::Microsecond, None) => Ok("tsu:".to_string()), DataType::Timestamp(TimeUnit::Nanosecond, None) => Ok("tsn:".to_string()), - DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(format!("tss:{}", tz)), - 
DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(format!("tsm:{}", tz)), - DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(format!("tsu:{}", tz)), - DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(format!("tsn:{}", tz)), + DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(format!("tss:{tz}")), + DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(format!("tsm:{tz}")), + DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(format!("tsu:{tz}")), + DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(format!("tsn:{tz}")), DataType::Duration(TimeUnit::Second) => Ok("tDs".to_string()), DataType::Duration(TimeUnit::Millisecond) => Ok("tDm".to_string()), DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".to_string()), @@ -343,8 +339,7 @@ fn get_format_string(dtype: &DataType) -> Result { } } other => Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" is still not supported in Rust implementation", - other + "The datatype \"{other:?}\" is still not supported in Rust implementation" ))), } } diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 4111b858d050..9fcca3c5d9ea 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -345,8 +345,7 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { (DataType::Timestamp(..), _) | (DataType::Duration(..), _) => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" expects 2 buffers, but requested {}. Please verify that the C data interface is correctly implemented.", - data_type, i + "The datatype \"{data_type:?}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." ))) } (DataType::FixedSizeBinary(num_bytes), 1) => size_of::() * (*num_bytes as usize) * 8, @@ -356,8 +355,7 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { }, (DataType::FixedSizeBinary(_), _) | (DataType::FixedSizeList(_, _), _) => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" expects 2 buffers, but requested {}. Please verify that the C data interface is correctly implemented.", - data_type, i + "The datatype \"{data_type:?}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." ))) }, // Variable-size list and map have one i32 buffer. @@ -367,14 +365,12 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { (DataType::Utf8, 2) | (DataType::Binary, 2) => size_of::() * 8, (DataType::List(_), _) | (DataType::Map(_, _), _) => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" expects 2 buffers, but requested {}. Please verify that the C data interface is correctly implemented.", - data_type, i + "The datatype \"{data_type:?}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." ))) } (DataType::Utf8, _) | (DataType::Binary, _) => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" expects 3 buffers, but requested {}. Please verify that the C data interface is correctly implemented.", - data_type, i + "The datatype \"{data_type:?}\" expects 3 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." ))) } // Variable-sized binaries: have two buffers. 
@@ -383,8 +379,7 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) | (DataType::LargeList(_), 2)=> size_of::() * 8, (DataType::LargeUtf8, _) | (DataType::LargeBinary, _) | (DataType::LargeList(_), _)=> { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" expects 3 buffers, but requested {}. Please verify that the C data interface is correctly implemented.", - data_type, i + "The datatype \"{data_type:?}\" expects 3 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." ))) } // type ids. UnionArray doesn't have null bitmap so buffer index begins with 0. @@ -393,28 +388,24 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { (DataType::Union(_, _, UnionMode::Dense), 1) => size_of::() * 8, (DataType::Union(_, _, UnionMode::Sparse), _) => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" expects 1 buffer, but requested {}. Please verify that the C data interface is correctly implemented.", - data_type, i + "The datatype \"{data_type:?}\" expects 1 buffer, but requested {i}. Please verify that the C data interface is correctly implemented." ))) } (DataType::Union(_, _, UnionMode::Dense), _) => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" expects 2 buffer, but requested {}. Please verify that the C data interface is correctly implemented.", - data_type, i + "The datatype \"{data_type:?}\" expects 2 buffer, but requested {i}. Please verify that the C data interface is correctly implemented." ))) } (_, 0) => { // We don't call this `bit_width` to compute buffer length for null buffer. If any types that don't have null buffer like // UnionArray, they should be handled above. return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" doesn't expect buffer at index 0. Please verify that the C data interface is correctly implemented.", - data_type + "The datatype \"{data_type:?}\" doesn't expect buffer at index 0. Please verify that the C data interface is correctly implemented." ))) } _ => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" is still not supported in Rust implementation", - data_type + "The datatype \"{data_type:?}\" is still not supported in Rust implementation" ))) } }) @@ -708,8 +699,7 @@ pub trait ArrowArrayRef { Ok(MutableBuffer::new(0).into()) } None => Err(ArrowError::CDataInterface(format!( - "The external buffer at position {} is null.", - index + "The external buffer at position {index} is null." ))), } }) diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index 3a85f2ef6421..4313eaaaf34f 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -287,8 +287,7 @@ fn get_stream_schema(stream_ptr: *mut FFI_ArrowArrayStream) -> Result Ok(Arc::new(schema)) } else { Err(ArrowError::CDataInterface(format!( - "Cannot get schema from input stream. Error code: {:?}", - ret_code + "Cannot get schema from input stream. 
Error code: {ret_code:?}" ))) } } diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 8db4b154e90c..5fc8e4d43c52 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -101,8 +101,7 @@ pub fn create_random_array( >(size, null_density)), _ => { return Err(ArrowError::InvalidArgumentError(format!( - "Unsupported unit {:?} for Time32", - unit + "Unsupported unit {unit:?} for Time32" ))) } }, @@ -115,8 +114,7 @@ pub fn create_random_array( >(size, null_density)), _ => { return Err(ArrowError::InvalidArgumentError(format!( - "Unsupported unit {:?} for Time64", - unit + "Unsupported unit {unit:?} for Time64" ))) } }, @@ -153,8 +151,7 @@ pub fn create_random_array( } other => { return Err(ArrowError::NotYetImplemented(format!( - "Generating random arrays not yet implemented for {:?}", - other + "Generating random arrays not yet implemented for {other:?}" ))) } }) @@ -186,8 +183,7 @@ fn create_random_list_array( } _ => { return Err(ArrowError::InvalidArgumentError(format!( - "Cannot create list array for field {:?}", - field + "Cannot create list array for field {field:?}" ))) } }; diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs index 53ae0fddef6f..9027a1cdc448 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow/src/util/pretty.rs @@ -167,7 +167,7 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n{}", table); + assert_eq!(expected, actual, "Actual result:\n{table}"); Ok(()) } @@ -193,7 +193,7 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n{}", table); + assert_eq!(expected, actual, "Actual result:\n{table}"); Ok(()) } @@ -231,7 +231,7 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n{:#?}", table); + assert_eq!(expected, actual, "Actual result:\n{table:#?}"); } #[test] @@ -264,7 +264,7 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n{}", table); + assert_eq!(expected, actual, "Actual result:\n{table}"); Ok(()) } @@ -304,7 +304,7 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n{}", table); + assert_eq!(expected, actual, "Actual result:\n{table}"); Ok(()) } @@ -337,7 +337,7 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n{}", table); + assert_eq!(expected, actual, "Actual result:\n{table}"); Ok(()) } @@ -669,7 +669,7 @@ mod tests { ]; let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n{}", table); + assert_eq!(expected, actual, "Actual result:\n{table}"); Ok(()) } @@ -702,7 +702,7 @@ mod tests { ]; let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n{}", table); + assert_eq!(expected, actual, "Actual result:\n{table}"); Ok(()) } @@ -761,7 +761,7 @@ mod tests { ]; let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n{}", table); + assert_eq!(expected, actual, "Actual result:\n{table}"); Ok(()) } @@ -983,7 +983,7 @@ mod tests { ]; let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n{}", table); + assert_eq!(expected, actual, "Actual result:\n{table}"); Ok(()) } @@ -1024,7 +1024,7 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual 
result:\n{}", table); + assert_eq!(expected, actual, "Actual result:\n{table}"); Ok(()) } @@ -1079,7 +1079,7 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n{}", table); + assert_eq!(expected, actual, "Actual result:\n{table}"); Ok(()) } diff --git a/arrow/src/util/test_util.rs b/arrow/src/util/test_util.rs index 83107aa79239..fd051dea1a8d 100644 --- a/arrow/src/util/test_util.rs +++ b/arrow/src/util/test_util.rs @@ -78,7 +78,7 @@ pub fn get_temp_file(file_name: &str, content: &[u8]) -> fs::File { pub fn arrow_test_data() -> String { match get_data_dir("ARROW_TEST_DATA", "../testing/data") { Ok(pb) => pb.display().to_string(), - Err(err) => panic!("failed to get arrow data dir: {}", err), + Err(err) => panic!("failed to get arrow data dir: {err}"), } } @@ -100,7 +100,7 @@ pub fn arrow_test_data() -> String { pub fn parquet_test_data() -> String { match get_data_dir("PARQUET_TEST_DATA", "../parquet-testing/data") { Ok(pb) => pb.display().to_string(), - Err(err) => panic!("failed to get parquet data dir: {}", err), + Err(err) => panic!("failed to get parquet data dir: {err}"), } } diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index 3a6976d11b0f..cba55845ec46 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -207,7 +207,7 @@ fn hex_encode(bytes: &[u8]) -> String { let mut out = String::with_capacity(bytes.len() * 2); for byte in bytes { // String writing is infallible - let _ = write!(out, "{:02x}", byte); + let _ = write!(out, "{byte:02x}"); } out } @@ -397,7 +397,7 @@ async fn instance_creds( const CREDENTIALS_PATH: &str = "latest/meta-data/iam/security-credentials"; const AWS_EC2_METADATA_TOKEN_HEADER: &str = "X-aws-ec2-metadata-token"; - let token_url = format!("{}/latest/api/token", endpoint); + let token_url = format!("{endpoint}/latest/api/token"); let token_result = client .request(Method::PUT, token_url) @@ -416,7 +416,7 @@ async fn instance_creds( Err(e) => return Err(e.into()), }; - let role_url = format!("{}/{}/", endpoint, CREDENTIALS_PATH); + let role_url = format!("{endpoint}/{CREDENTIALS_PATH}/"); let mut role_request = client.request(Method::GET, role_url); if let Some(token) = &token { @@ -425,7 +425,7 @@ async fn instance_creds( let role = role_request.send_retry(retry_config).await?.text().await?; - let creds_url = format!("{}/{}/{}", endpoint, CREDENTIALS_PATH, role); + let creds_url = format!("{endpoint}/{CREDENTIALS_PATH}/{role}"); let mut creds_request = client.request(Method::GET, creds_url); if let Some(token) = &token { creds_request = creds_request.header(AWS_EC2_METADATA_TOKEN_HEADER, token); @@ -483,7 +483,7 @@ async fn web_identity( endpoint: &str, ) -> Result>, StdError> { let token = std::fs::read_to_string(token_path) - .map_err(|e| format!("Failed to read token file '{}': {}", token_path, e))?; + .map_err(|e| format!("Failed to read token file '{token_path}': {e}"))?; let bytes = client .request(Method::POST, endpoint) @@ -501,7 +501,7 @@ async fn web_identity( .await?; let resp: AssumeRoleResponse = quick_xml::de::from_reader(bytes.reader()) - .map_err(|e| format!("Invalid AssumeRoleWithWebIdentity response: {}", e))?; + .map_err(|e| format!("Invalid AssumeRoleWithWebIdentity response: {e}"))?; let creds = resp.assume_role_with_web_identity_result.credentials; let now = Utc::now(); @@ -677,7 +677,7 @@ mod tests { // Verify only allows IMDSv2 let resp = client - .request(Method::GET, 
format!("{}/latest/meta-data/ami-id", endpoint)) + .request(Method::GET, format!("{endpoint}/latest/meta-data/ami-id")) .send() .await .unwrap(); diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 20174692fb5e..a1c9eae84052 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -614,7 +614,7 @@ impl AmazonS3Builder { std::env::var("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") { builder.metadata_endpoint = - Some(format!("{}{}", METADATA_ENDPOINT, metadata_relative_uri)); + Some(format!("{METADATA_ENDPOINT}{metadata_relative_uri}")); } if let Ok(text) = std::env::var("AWS_ALLOW_HTTP") { @@ -896,7 +896,7 @@ impl AmazonS3Builder { let session_name = std::env::var("AWS_ROLE_SESSION_NAME") .unwrap_or_else(|_| "WebIdentitySession".to_string()); - let endpoint = format!("https://sts.{}.amazonaws.com", region); + let endpoint = format!("https://sts.{region}.amazonaws.com"); // Disallow non-HTTPs requests let client = self @@ -948,15 +948,15 @@ impl AmazonS3Builder { // `virtual_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then // `endpoint` should have bucket name included. if self.virtual_hosted_style_request { - endpoint = self.endpoint.unwrap_or_else(|| { - format!("https://{}.s3.{}.amazonaws.com", bucket, region) - }); + endpoint = self + .endpoint + .unwrap_or_else(|| format!("https://{bucket}.s3.{region}.amazonaws.com")); bucket_endpoint = endpoint.clone(); } else { endpoint = self .endpoint - .unwrap_or_else(|| format!("https://s3.{}.amazonaws.com", region)); - bucket_endpoint = format!("{}/{}", endpoint, bucket); + .unwrap_or_else(|| format!("https://s3.{region}.amazonaws.com")); + bucket_endpoint = format!("{endpoint}/{bucket}"); } let config = S3Config { @@ -1137,8 +1137,7 @@ mod tests { assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); - let metadata_uri = - format!("{}{}", METADATA_ENDPOINT, container_creds_relative_uri); + let metadata_uri = format!("{METADATA_ENDPOINT}{container_creds_relative_uri}"); assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); } diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index e42950b90102..39da7177fee5 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -179,12 +179,12 @@ impl AzureClient { Ok(AzureCredential::AuthorizationToken( // we do the conversion to a HeaderValue here, since it is fallible // and we wna to use it in an infallible function - HeaderValue::from_str(&format!("Bearer {}", token)).map_err( - |err| crate::Error::Generic { + HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { + crate::Error::Generic { store: "MicrosoftAzure", source: Box::new(err), - }, - )?, + } + })?, )) } CredentialProvider::SASToken(sas) => { diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index 280d8430011c..67023d2f0434 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -593,7 +593,7 @@ mod tests { Some("client_id".into()), None, None, - Some(format!("{}/metadata/identity/oauth2/token", endpoint)), + Some(format!("{endpoint}/metadata/identity/oauth2/token")), client.clone(), ); @@ -618,7 +618,7 @@ mod tests { // Test IMDS server.push_fn(move |req| { - assert_eq!(req.uri().path(), format!("/{}/oauth2/v2.0/token", tenant)); + assert_eq!(req.uri().path(), format!("/{tenant}/oauth2/v2.0/token")); assert_eq!(req.method(), &Method::POST); let body = 
block_on(to_bytes(req.into_body())).unwrap(); let body = String::from_utf8(body.to_vec()).unwrap(); diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 1eea27801a3b..52969063495d 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -327,7 +327,7 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { buf: Vec, part_idx: usize, ) -> Result { - let content_id = format!("{:20}", part_idx); + let content_id = format!("{part_idx:20}"); let block_id: BlockId = content_id.clone().into(); self.client diff --git a/object_store/src/client/backoff.rs b/object_store/src/client/backoff.rs index 5a6126cc45c6..a4ca9765e79e 100644 --- a/object_store/src/client/backoff.rs +++ b/object_store/src/client/backoff.rs @@ -123,7 +123,7 @@ mod tests { }; let assert_fuzzy_eq = - |a: f64, b: f64| assert!((b - a).abs() < 0.0001, "{} != {}", a, b); + |a: f64, b: f64| assert!((b - a).abs() < 0.0001, "{a} != {b}"); // Create a static rng that takes the minimum of the range let rng = Box::new(StepRng::new(0, 0)); diff --git a/object_store/src/client/retry.rs b/object_store/src/client/retry.rs index cee86b3442ca..e6dd2eb8174b 100644 --- a/object_store/src/client/retry.rs +++ b/object_store/src/client/retry.rs @@ -41,7 +41,7 @@ impl std::fmt::Display for Error { self.message, self.retries )?; if let Some(source) = &self.source { - write!(f, ": {}", source)?; + write!(f, ": {source}")?; } Ok(()) } @@ -171,7 +171,7 @@ impl RetryExt for reqwest::RequestBuilder { true => match r.text().await { Ok(message) if !message.is_empty() => message, Ok(_) => "No Body".to_string(), - Err(e) => format!("error getting response body: {}", e) + Err(e) => format!("error getting response body: {e}") } false => status.to_string(), }; diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index 56468568b35f..c12b37cdd1c0 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -352,8 +352,7 @@ async fn make_metadata_request( audience: &str, ) -> Result { let url = format!( - "http://{}/computeMetadata/v1/instance/service-accounts/default/token", - hostname + "http://{hostname}/computeMetadata/v1/instance/service-accounts/default/token" ); let response: TokenResponse = client .request(Method::GET, url) diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 871413b43801..97f44446f82a 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -1271,8 +1271,7 @@ mod test { assert!( matches!(err, ObjectStoreError::NotFound { .. }), - "unexpected error type: {}", - err + "unexpected error type: {err}" ); } @@ -1291,8 +1290,7 @@ mod test { assert!( matches!(err, ObjectStoreError::NotFound { .. }), - "unexpected error type: {}", - err + "unexpected error type: {err}" ); } @@ -1305,8 +1303,7 @@ mod test { let err = integration.delete(&location).await.unwrap_err(); assert!( matches!(err, ObjectStoreError::NotFound { .. }), - "unexpected error type: {}", - err + "unexpected error type: {err}" ); } @@ -1322,8 +1319,7 @@ mod test { let err = integration.delete(&location).await.unwrap_err(); assert!( matches!(err, ObjectStoreError::NotFound { .. 
}), - "unexpected error type: {}", - err + "unexpected error type: {err}" ); } @@ -1352,7 +1348,7 @@ mod test { #[tokio::test] async fn gcs_test_proxy_url() { let mut tfile = NamedTempFile::new().unwrap(); - write!(tfile, "{}", FAKE_KEY).unwrap(); + write!(tfile, "{FAKE_KEY}").unwrap(); let service_account_path = tfile.path(); let gcs = GoogleCloudStorageBuilder::new() .with_service_account_path(service_account_path.to_str().unwrap()) @@ -1400,7 +1396,7 @@ mod test { #[test] fn gcs_test_service_account_key_and_path() { let mut tfile = NamedTempFile::new().unwrap(); - write!(tfile, "{}", FAKE_KEY).unwrap(); + write!(tfile, "{FAKE_KEY}").unwrap(); let _ = GoogleCloudStorageBuilder::new() .with_service_account_key(FAKE_KEY) .with_service_account_path(tfile.path().to_str().unwrap()) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 4ec58c387e49..8c202886b008 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -441,11 +441,9 @@ impl GetResult { } })?; - file.seek(SeekFrom::Start(0)).map_err(|source| { - local::Error::Seek { - source, - path: path.clone(), - } + file.rewind().map_err(|source| local::Error::Seek { + source, + path: path.clone(), })?; let mut buffer = Vec::with_capacity(len as usize); @@ -611,8 +609,7 @@ mod tests { let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!( content_list.is_empty(), - "Expected list to be empty; found: {:?}", - content_list + "Expected list to be empty; found: {content_list:?}" ); let location = Path::from("test_dir/test_file.json"); @@ -815,7 +812,7 @@ mod tests { storage.delete(&path).await.unwrap(); let files = flatten_list_stream(storage, None).await.unwrap(); - assert!(files.is_empty(), "{:?}", files); + assert!(files.is_empty(), "{files:?}"); } fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { @@ -900,8 +897,7 @@ mod tests { let content_list = flatten_list_stream(storage, None).await.unwrap(); assert!( content_list.is_empty(), - "Expected list to be empty; found: {:?}", - content_list + "Expected list to be empty; found: {content_list:?}" ); let location1 = Path::from("foo/x.json"); diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 2ef87adbb093..9a518ba4735a 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -555,7 +555,7 @@ impl ObjectStore for LocalFileSystem { fn get_upload_stage_path(dest: &std::path::Path, multipart_id: &MultipartId) -> PathBuf { let mut staging_path = dest.as_os_str().to_owned(); - staging_path.push(format!("#{}", multipart_id)); + staging_path.push(format!("#{multipart_id}")); staging_path.into() } @@ -607,7 +607,7 @@ impl AsyncWrite for LocalUpload { |condition: &str| -> std::task::Poll> { Poll::Ready(Err(io::Error::new( io::ErrorKind::InvalidInput, - format!("Tried to write to file {}.", condition), + format!("Tried to write to file {condition}."), ))) }; @@ -1040,12 +1040,11 @@ mod tests { let source_variant = source.downcast_ref::(); assert!( matches!(source_variant, Some(std::io::Error { .. 
}),), - "got: {:?}", - source_variant + "got: {source_variant:?}" ); assert!(path.ends_with(NON_EXISTENT_NAME), "{}", path); } else { - panic!("unexpected error type: {:?}", err); + panic!("unexpected error type: {err:?}"); } } diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index e4be5b2afddf..372164c2b41f 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -365,12 +365,11 @@ mod tests { let source_variant = source.downcast_ref::(); assert!( matches!(source_variant, Some(Error::NoDataInMemory { .. }),), - "got: {:?}", - source_variant + "got: {source_variant:?}" ); assert_eq!(path, NON_EXISTENT_NAME); } else { - panic!("unexpected error type: {:?}", err); + panic!("unexpected error type: {err:?}"); } } } diff --git a/object_store/src/multipart.rs b/object_store/src/multipart.rs index 65427d1f2d70..0606fb51eb1c 100644 --- a/object_store/src/multipart.rs +++ b/object_store/src/multipart.rs @@ -222,7 +222,7 @@ where part.ok_or_else(|| { io::Error::new( io::ErrorKind::Other, - format!("Missing information for upload part {}", idx), + format!("Missing information for upload part {idx}"), ) }) }) diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index 020e5f58e096..4b0862e44b73 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -454,63 +454,49 @@ mod tests { // self starts with self assert!( haystack.prefix_matches(&haystack), - "{:?} should have started with {:?}", - haystack, - haystack + "{haystack:?} should have started with {haystack:?}" ); // a longer prefix doesn't match let needle = needle.child("longer now"); assert!( !haystack.prefix_matches(&needle), - "{:?} shouldn't have started with {:?}", - haystack, - needle + "{haystack:?} shouldn't have started with {needle:?}" ); // one dir prefix matches let needle = Path::from_iter(["foo/bar"]); assert!( haystack.prefix_matches(&needle), - "{:?} should have started with {:?}", - haystack, - needle + "{haystack:?} should have started with {needle:?}" ); // two dir prefix matches let needle = needle.child("baz%2Ftest"); assert!( haystack.prefix_matches(&needle), - "{:?} should have started with {:?}", - haystack, - needle + "{haystack:?} should have started with {needle:?}" ); // partial dir prefix doesn't match let needle = Path::from_iter(["f"]); assert!( !haystack.prefix_matches(&needle), - "{:?} should not have started with {:?}", - haystack, - needle + "{haystack:?} should not have started with {needle:?}" ); // one dir and one partial dir doesn't match let needle = Path::from_iter(["foo/bar", "baz"]); assert!( !haystack.prefix_matches(&needle), - "{:?} should not have started with {:?}", - haystack, - needle + "{haystack:?} should not have started with {needle:?}" ); // empty prefix matches let needle = Path::from(""); assert!( haystack.prefix_matches(&needle), - "{:?} should have started with {:?}", - haystack, - needle + "{haystack:?} should have started with {needle:?}" ); } @@ -524,9 +510,7 @@ mod tests { assert!( !haystack.prefix_matches(&needle), - "{:?} should not have started with {:?}", - haystack, - needle + "{haystack:?} should not have started with {needle:?}" ); // All directories match but file name is not a prefix @@ -534,9 +518,7 @@ mod tests { assert!( !haystack.prefix_matches(&needle), - "{:?} should not have started with {:?}", - haystack, - needle + "{haystack:?} should not have started with {needle:?}" ); // Not all directories match; file name is a prefix of the next directory; this @@ -545,9 +527,7 @@ mod tests { 
assert!( !haystack.prefix_matches(&needle), - "{:?} should not have started with {:?}", - haystack, - needle + "{haystack:?} should not have started with {needle:?}" ); // Not all directories match; file name is NOT a prefix of the next directory; @@ -556,9 +536,7 @@ mod tests { assert!( !haystack.prefix_matches(&needle), - "{:?} should not have started with {:?}", - haystack, - needle + "{haystack:?} should not have started with {needle:?}" ); } diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index d8a7f07fba25..f6f65bea8f2c 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -281,7 +281,7 @@ fn build_plain_encoded_string_page_iterator( }; if def_level == max_def_level { let string_value = - format!("Test value {}, row group: {}, page: {}", k, i, j); + format!("Test value {k}, row group: {i}, page: {j}"); values .push(parquet::data_type::ByteArray::from(string_value.as_str())); } @@ -312,7 +312,7 @@ fn build_dictionary_encoded_string_page_iterator( // generate 1% unique values const NUM_UNIQUE_VALUES: usize = VALUES_PER_PAGE / 100; let unique_values = (0..NUM_UNIQUE_VALUES) - .map(|x| format!("Dictionary value {}", x)) + .map(|x| format!("Dictionary value {x}")) .collect::>(); let mut rng = seedable_rng(); let mut pages: Vec> = Vec::new(); diff --git a/parquet/examples/async_read_parquet.rs b/parquet/examples/async_read_parquet.rs index 9b4b6d4ffac6..f600cd0d11e3 100644 --- a/parquet/examples/async_read_parquet.rs +++ b/parquet/examples/async_read_parquet.rs @@ -27,7 +27,7 @@ use tokio::fs::File; async fn main() -> Result<()> { // Create parquet file that will be read. let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/alltypes_plain.parquet", testdata); + let path = format!("{testdata}/alltypes_plain.parquet"); let file = File::open(path).await.unwrap(); // Create a async parquet reader builder with batch_size. diff --git a/parquet/examples/read_parquet.rs b/parquet/examples/read_parquet.rs index 3d6d70aeed20..f374fcd2e1f7 100644 --- a/parquet/examples/read_parquet.rs +++ b/parquet/examples/read_parquet.rs @@ -23,7 +23,7 @@ use std::fs::File; fn main() -> Result<()> { // Create parquet file that will be read. let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/alltypes_plain.parquet", testdata); + let path = format!("{testdata}/alltypes_plain.parquet"); let file = File::open(path).unwrap(); // Create a sync parquet reader with batch_size. 
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 312f0140769c..87165ef8e575 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1161,7 +1161,7 @@ mod tests { ("int64", 10), ]; for (prefix, target_precision) in file_variants { - let path = format!("{}/{}_decimal.parquet", testdata, prefix); + let path = format!("{testdata}/{prefix}_decimal.parquet"); let file = File::open(path).unwrap(); let mut record_reader = ParquetRecordBatchReader::try_new(file, 32).unwrap(); @@ -1726,7 +1726,7 @@ mod tests { // a column that has the same name as one of the struct fields // (see: ARROW-11452) let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/nested_structs.rust.parquet", testdata); + let path = format!("{testdata}/nested_structs.rust.parquet"); let file = File::open(&path).unwrap(); let record_batch_reader = ParquetRecordBatchReader::try_new(file, 60).unwrap(); @@ -1776,7 +1776,7 @@ mod tests { #[test] fn test_read_maps() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/nested_maps.snappy.parquet", testdata); + let path = format!("{testdata}/nested_maps.snappy.parquet"); let file = File::open(path).unwrap(); let record_batch_reader = ParquetRecordBatchReader::try_new(file, 60).unwrap(); @@ -1968,7 +1968,7 @@ mod tests { #[test] fn test_read_null_list() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/null_list.parquet", testdata); + let path = format!("{testdata}/null_list.parquet"); let file = File::open(path).unwrap(); let mut record_batch_reader = ParquetRecordBatchReader::try_new(file, 60).unwrap(); @@ -1993,7 +1993,7 @@ mod tests { #[test] fn test_null_schema_inference() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/null_list.parquet", testdata); + let path = format!("{testdata}/null_list.parquet"); let file = File::open(path).unwrap(); let arrow_field = Field::new( @@ -2084,7 +2084,7 @@ mod tests { #[test] fn test_empty_projection() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/alltypes_plain.parquet", testdata); + let path = format!("{testdata}/alltypes_plain.parquet"); let file = File::open(path).unwrap(); let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); @@ -2256,7 +2256,7 @@ mod tests { #[test] fn test_scan_row_with_selection() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/alltypes_tiny_pages_plain.parquet", testdata); + let path = format!("{testdata}/alltypes_tiny_pages_plain.parquet"); let test_file = File::open(&path).unwrap(); let mut serial_reader = @@ -2273,10 +2273,7 @@ mod tests { assert_eq!( skip_reader.collect::, _>>().unwrap(), expected, - "batch_size: {}, selection_len: {}, skip_first: {}", - batch_size, - selection_len, - skip_first + "batch_size: {batch_size}, selection_len: {selection_len}, skip_first: {skip_first}" ); } }; @@ -2315,7 +2312,7 @@ mod tests { fn test_batch_size_overallocate() { let testdata = arrow::util::test_util::parquet_test_data(); // `alltypes_plain.parquet` only have 8 rows - let path = format!("{}/alltypes_plain.parquet", testdata); + let path = format!("{testdata}/alltypes_plain.parquet"); let test_file = File::open(path).unwrap(); let builder = ParquetRecordBatchReaderBuilder::try_new(test_file).unwrap(); @@ -2394,7 +2391,7 @@ mod tests { #[test] fn test_read_lz4_raw() { let testdata = 
arrow::util::test_util::parquet_test_data(); - let path = format!("{}/lz4_raw_compressed.parquet", testdata); + let path = format!("{testdata}/lz4_raw_compressed.parquet"); let file = File::open(path).unwrap(); let batches = ParquetRecordBatchReader::try_new(file, 1024) @@ -2438,7 +2435,7 @@ mod tests { "non_hadoop_lz4_compressed.parquet", ] { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/{}", testdata, file); + let path = format!("{testdata}/{file}"); let file = File::open(path).unwrap(); let expected_rows = 4; @@ -2470,7 +2467,7 @@ mod tests { #[test] fn test_read_lz4_hadoop_large() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/hadoop_lz4_compressed_larger.parquet", testdata); + let path = format!("{testdata}/hadoop_lz4_compressed_larger.parquet"); let file = File::open(path).unwrap(); let expected_rows = 10000; @@ -2496,7 +2493,7 @@ mod tests { #[cfg(feature = "snap")] fn test_read_nested_lists() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/nested_lists.snappy.parquet", testdata); + let path = format!("{testdata}/nested_lists.snappy.parquet"); let file = File::open(path).unwrap(); let f = file.try_clone().unwrap(); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index c459d40d73b9..87b4ebc2b080 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -363,8 +363,7 @@ fn write_leaves( ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_, _, _) | ArrowDataType::RunEndEncoded(_, _) => { Err(ParquetError::NYI( format!( - "Attempting to write an Arrow type {:?} to parquet that is not yet implemented", - data_type + "Attempting to write an Arrow type {data_type:?} to parquet that is not yet implemented" ) )) } @@ -499,8 +498,7 @@ fn write_leaf( _ => { return Err(ParquetError::NYI( format!( - "Attempting to write an Arrow interval type {:?} to parquet that is not yet implemented", - interval_unit + "Attempting to write an Arrow interval type {interval_unit:?} to parquet that is not yet implemented" ) )); } @@ -536,8 +534,8 @@ fn write_leaf( Ok(written as i64) } -fn write_primitive<'a, T: DataType>( - writer: &mut ColumnWriterImpl<'a, T>, +fn write_primitive( + writer: &mut ColumnWriterImpl<'_, T>, values: &[T::T], levels: LevelInfo, ) -> Result { @@ -1197,8 +1195,7 @@ mod tests { assert_eq!( offset_index.len(), 10, - "Expected 9 pages but got {:#?}", - offset_index + "Expected 9 pages but got {offset_index:#?}" ); } @@ -1422,10 +1419,10 @@ mod tests { { bloom_filters.push(sbbf.clone()); } else { - panic!("No bloom filter for column named {} found", file_column); + panic!("No bloom filter for column named {file_column} found"); } } else { - panic!("No column named {} found", file_column); + panic!("No column named {file_column} found"); } } diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 9c96d06502c8..7470814faa17 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -49,8 +49,7 @@ where { if file_size < 8 { return Err(ParquetError::EOF(format!( - "file size of {} is less than footer", - file_size + "file size of {file_size} is less than footer" ))); } diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 780ba6f3b4c1..0397df206bff 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ 
-692,8 +692,7 @@ impl<'a> RowGroupCollection for InMemoryRowGroup<'a> { fn column_chunks(&self, i: usize) -> Result> { match &self.column_chunks[i] { None => Err(ParquetError::General(format!( - "Invalid column index {}, column was not fetched", - i + "Invalid column index {i}, column was not fetched" ))), Some(data) => { let page_locations = self @@ -757,8 +756,7 @@ impl ChunkReader for ColumnChunkData { .map(|idx| data[idx].1.slice(0..length)) .map_err(|_| { ParquetError::General(format!( - "Invalid offset in sparse column chunk data: {}", - start + "Invalid offset in sparse column chunk data: {start}" )) }), ColumnChunkData::Dense { offset, data } => { @@ -831,7 +829,7 @@ mod tests { #[tokio::test] async fn test_async_reader() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/alltypes_plain.parquet", testdata); + let path = format!("{testdata}/alltypes_plain.parquet"); let data = Bytes::from(std::fs::read(path).unwrap()); let metadata = parse_metadata(&data).unwrap(); @@ -886,7 +884,7 @@ mod tests { #[tokio::test] async fn test_async_reader_with_index() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/alltypes_tiny_pages_plain.parquet", testdata); + let path = format!("{testdata}/alltypes_tiny_pages_plain.parquet"); let data = Bytes::from(std::fs::read(path).unwrap()); let metadata = parse_metadata(&data).unwrap(); @@ -948,7 +946,7 @@ mod tests { #[tokio::test] async fn test_async_reader_skip_pages() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/alltypes_tiny_pages_plain.parquet", testdata); + let path = format!("{testdata}/alltypes_tiny_pages_plain.parquet"); let data = Bytes::from(std::fs::read(path).unwrap()); let metadata = parse_metadata(&data).unwrap(); @@ -1005,7 +1003,7 @@ mod tests { #[tokio::test] async fn test_fuzz_async_reader_selection() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/alltypes_tiny_pages_plain.parquet", testdata); + let path = format!("{testdata}/alltypes_tiny_pages_plain.parquet"); let data = Bytes::from(std::fs::read(path).unwrap()); let metadata = parse_metadata(&data).unwrap(); @@ -1072,7 +1070,7 @@ mod tests { async fn test_async_reader_zero_row_selector() { //See https://github.com/apache/arrow-rs/issues/2669 let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/alltypes_tiny_pages_plain.parquet", testdata); + let path = format!("{testdata}/alltypes_tiny_pages_plain.parquet"); let data = Bytes::from(std::fs::read(path).unwrap()); let metadata = parse_metadata(&data).unwrap(); @@ -1209,7 +1207,7 @@ mod tests { #[tokio::test] async fn test_row_filter_with_index() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/alltypes_tiny_pages_plain.parquet", testdata); + let path = format!("{testdata}/alltypes_tiny_pages_plain.parquet"); let data = Bytes::from(std::fs::read(path).unwrap()); let metadata = parse_metadata(&data).unwrap(); @@ -1259,7 +1257,7 @@ mod tests { #[tokio::test] async fn test_in_memory_row_group_sparse() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{}/alltypes_tiny_pages.parquet", testdata); + let path = format!("{testdata}/alltypes_tiny_pages.parquet"); let data = Bytes::from(std::fs::read(path).unwrap()); let metadata = parse_metadata(&data).unwrap(); @@ -1345,7 +1343,7 @@ mod tests { async fn test_batch_size_overallocate() { let testdata = 
arrow::util::test_util::parquet_test_data(); // `alltypes_plain.parquet` only have 8 rows - let path = format!("{}/alltypes_plain.parquet", testdata); + let path = format!("{testdata}/alltypes_plain.parquet"); let data = Bytes::from(std::fs::read(path).unwrap()); let metadata = parse_metadata(&data).unwrap(); diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index e5de8eae6238..eb64b11b9440 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -62,7 +62,7 @@ impl AsyncFileReader for ParquetObjectReader { self.store .get_range(&self.meta.location, range) .map_err(|e| { - ParquetError::General(format!("AsyncChunkReader::get_bytes error: {}", e)) + ParquetError::General(format!("AsyncChunkReader::get_bytes error: {e}")) }) .boxed() } @@ -80,8 +80,7 @@ impl AsyncFileReader for ParquetObjectReader { .await .map_err(|e| { ParquetError::General(format!( - "ParquetObjectReader::get_byte_ranges error: {}", - e + "ParquetObjectReader::get_byte_ranges error: {e}" )) }) } @@ -96,8 +95,7 @@ impl AsyncFileReader for ParquetObjectReader { .get_range(&self.meta.location, range) .map_err(|e| { ParquetError::General(format!( - "ParquetObjectReader::get_metadata error: {}", - e + "ParquetObjectReader::get_metadata error: {e}" )) }) }, diff --git a/parquet/src/arrow/buffer/bit_util.rs b/parquet/src/arrow/buffer/bit_util.rs index 34a0a4b83e8d..2781190331c5 100644 --- a/parquet/src/arrow/buffer/bit_util.rs +++ b/parquet/src/arrow/buffer/bit_util.rs @@ -53,7 +53,7 @@ pub fn iter_set_bits_rev(bytes: &[u8]) -> impl Iterator + '_ { /// Performs big endian sign extension pub fn sign_extend_be(b: &[u8]) -> [u8; N] { - assert!(b.len() <= N, "Array too large, expected less than {}", N); + assert!(b.len() <= N, "Array too large, expected less than {N}"); let is_negative = (b[0] & 128u8) == 128u8; let mut result = if is_negative { [255u8; N] } else { [0u8; N] }; for (d, s) in result.iter_mut().skip(N - b.len()).zip(b) { diff --git a/parquet/src/arrow/decoder/delta_byte_array.rs b/parquet/src/arrow/decoder/delta_byte_array.rs index af73f4f25eb9..dd4a8fa87d27 100644 --- a/parquet/src/arrow/decoder/delta_byte_array.rs +++ b/parquet/src/arrow/decoder/delta_byte_array.rs @@ -49,8 +49,7 @@ impl DeltaByteArrayDecoder { if num_prefix != num_suffix { return Err(general_err!(format!( - "inconsistent DELTA_BYTE_ARRAY lengths, prefixes: {}, suffixes: {}", - num_prefix, num_suffix + "inconsistent DELTA_BYTE_ARRAY lengths, prefixes: {num_prefix}, suffixes: {num_suffix}" ))); } diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index d81d6a69bbb9..a000a4656bf9 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -852,7 +852,7 @@ mod tests { assert_eq!(arrow_fields.len(), converted_fields.len()); for i in 0..arrow_fields.len() { - assert_eq!(arrow_fields[i], converted_fields[i], "{}", i); + assert_eq!(arrow_fields[i], converted_fields[i], "{i}"); } } diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index bdc203b742fe..e971c8632643 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -456,49 +456,49 @@ impl ColumnOrder { impl fmt::Display for Type { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } impl fmt::Display for ConvertedType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } impl fmt::Display for Repetition { fn fmt(&self, f: &mut 
fmt::Formatter) -> fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } impl fmt::Display for Encoding { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } impl fmt::Display for Compression { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } impl fmt::Display for PageType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } impl fmt::Display for SortOrder { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } impl fmt::Display for ColumnOrder { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } @@ -735,7 +735,7 @@ impl From> for ConvertedType { (16, false) => ConvertedType::UINT_16, (32, false) => ConvertedType::UINT_32, (64, false) => ConvertedType::UINT_64, - t => panic!("Integer type {:?} is not supported", t), + t => panic!("Integer type {t:?} is not supported"), }, LogicalType::Unknown => ConvertedType::NONE, LogicalType::Json => ConvertedType::JSON, diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index 23913f0eafb3..b1de492f5792 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -133,13 +133,13 @@ impl ParquetFromCsvError { impl Display for ParquetFromCsvError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - ParquetFromCsvError::CommandLineParseError(e) => write!(f, "{}", e), - ParquetFromCsvError::IoError(e) => write!(f, "{}", e), - ParquetFromCsvError::ArrowError(e) => write!(f, "{}", e), - ParquetFromCsvError::ParquetError(e) => write!(f, "{}", e), + ParquetFromCsvError::CommandLineParseError(e) => write!(f, "{e}"), + ParquetFromCsvError::IoError(e) => write!(f, "{e}"), + ParquetFromCsvError::ArrowError(e) => write!(f, "{e}"), + ParquetFromCsvError::ParquetError(e) => write!(f, "{e}"), ParquetFromCsvError::WithContext(c, e) => { - writeln!(f, "{}", e)?; - write!(f, "context: {}", c) + writeln!(f, "{e}")?; + write!(f, "context: {c}") } } } @@ -219,7 +219,7 @@ fn compression_from_str(cmp: &str) -> Result { "LZ4" => Ok(Compression::LZ4), "ZSTD" => Ok(Compression::ZSTD), v => Err( - format!("Unknown compression {0} : possible values UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD \n\nFor more information try --help",v) + format!("Unknown compression {v} : possible values UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD \n\nFor more information try --help") ) } } @@ -228,10 +228,7 @@ fn writer_version_from_str(cmp: &str) -> Result { match cmp.to_uppercase().as_str() { "1" => Ok(WriterVersion::PARQUET_1_0), "2" => Ok(WriterVersion::PARQUET_2_0), - v => Err(format!( - "Unknown writer version {0} : possible values 1, 2", - v - )), + v => Err(format!("Unknown writer version {v} : possible values 1, 2")), } } @@ -397,7 +394,7 @@ fn main() -> Result<(), ParquetFromCsvError> { #[cfg(test)] mod tests { use std::{ - io::{Seek, SeekFrom, Write}, + io::{Seek, Write}, path::{Path, PathBuf}, }; @@ -424,8 +421,7 @@ mod tests { actual = actual[pos..].to_string(); assert_eq!( expected, actual, - "help text not match. please update to \n---\n{}\n---\n", - actual + "help text not match. 
please update to \n---\n{actual}\n---\n" ) } @@ -527,18 +523,16 @@ mod tests { match parse_args(vec!["--parquet-compression", "zip"]) { Ok(_) => panic!("unexpected success"), Err(e) => assert_eq!( - format!("{}", e), + format!("{e}"), "error: invalid value 'zip' for '--parquet-compression ': Unknown compression ZIP : possible values UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD \n\nFor more information try --help\n"), } } fn assert_debug_text(debug_text: &str, name: &str, value: &str) { - let pattern = format!(" {}: {}", name, value); + let pattern = format!(" {name}: {value}"); assert!( debug_text.contains(&pattern), - "\"{}\" not contains \"{}\"", - debug_text, - pattern + "\"{debug_text}\" not contains \"{pattern}\"" ) } @@ -571,7 +565,7 @@ mod tests { ])); let reader_builder = configure_reader_builder(&args, arrow_schema); - let builder_debug = format!("{:?}", reader_builder); + let builder_debug = format!("{reader_builder:?}"); assert_debug_text(&builder_debug, "has_header", "false"); assert_debug_text(&builder_debug, "delimiter", "Some(44)"); assert_debug_text(&builder_debug, "quote", "Some(34)"); @@ -605,7 +599,7 @@ mod tests { Field::new("field5", DataType::Utf8, false), ])); let reader_builder = configure_reader_builder(&args, arrow_schema); - let builder_debug = format!("{:?}", reader_builder); + let builder_debug = format!("{reader_builder:?}"); assert_debug_text(&builder_debug, "has_header", "true"); assert_debug_text(&builder_debug, "delimiter", "Some(9)"); assert_debug_text(&builder_debug, "quote", "None"); @@ -627,10 +621,10 @@ mod tests { { let csv = input_file.as_file_mut(); for index in 1..2000 { - write!(csv, "{},\"name_{}\"\r\n", index, index).unwrap(); + write!(csv, "{index},\"name_{index}\"\r\n").unwrap(); } csv.flush().unwrap(); - csv.seek(SeekFrom::Start(0)).unwrap(); + csv.rewind().unwrap(); } let output_parquet = NamedTempFile::new().unwrap(); diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs index a924ef373c02..d8a72dd796eb 100644 --- a/parquet/src/bin/parquet-index.rs +++ b/parquet/src/bin/parquet-index.rs @@ -86,11 +86,10 @@ impl Args { .zip(reader.metadata().row_groups()) .enumerate() { - println!("Row Group: {}", row_group_idx); + println!("Row Group: {row_group_idx}"); let offset_index = offset_indices.get(column_idx).ok_or_else(|| { ParquetError::General(format!( - "No offset index for row group {} column chunk {}", - row_group_idx, column_idx + "No offset index for row group {row_group_idx} column chunk {column_idx}" )) })?; @@ -156,12 +155,12 @@ fn print_index( idx, o.offset, o.compressed_page_size, row_count ); match &c.min { - Some(m) => print!(", min {:>10}", m), + Some(m) => print!(", min {m:>10}"), None => print!(", min {:>10}", "NONE"), } match &c.max { - Some(m) => print!(", max {:>10}", m), + Some(m) => print!(", max {m:>10}"), None => print!(", max {:>10}", "NONE"), } println!() diff --git a/parquet/src/bin/parquet-read.rs b/parquet/src/bin/parquet-read.rs index 117f9ee0b17a..c1e08387a550 100644 --- a/parquet/src/bin/parquet-read.rs +++ b/parquet/src/bin/parquet-read.rs @@ -102,6 +102,6 @@ fn print_row(row: &Row, json: bool) { if json { println!("{}", row.to_json_value()) } else { - println!("{}", row); + println!("{row}"); } } diff --git a/parquet/src/bin/parquet-rowcount.rs b/parquet/src/bin/parquet-rowcount.rs index 5069d4b2543b..45eb1c9a476f 100644 --- a/parquet/src/bin/parquet-rowcount.rs +++ b/parquet/src/bin/parquet-rowcount.rs @@ -67,6 +67,6 @@ fn main() { total_num_rows += group_metadata.num_rows(); } 
- eprintln!("File {}: rowcount={}", filename, total_num_rows); + eprintln!("File {filename}: rowcount={total_num_rows}"); } } diff --git a/parquet/src/bin/parquet-schema.rs b/parquet/src/bin/parquet-schema.rs index ff7798a91cd3..ae79fe4296c3 100644 --- a/parquet/src/bin/parquet-schema.rs +++ b/parquet/src/bin/parquet-schema.rs @@ -60,7 +60,7 @@ fn main() { let verbose = args.verbose; match SerializedFileReader::new(file) { - Err(e) => panic!("Error when parsing Parquet file: {}", e), + Err(e) => panic!("Error when parsing Parquet file: {e}"), Ok(parquet_reader) => { let metadata = parquet_reader.metadata(); println!("Metadata for file: {}", &filename); diff --git a/parquet/src/bin/parquet-show-bloom-filter.rs b/parquet/src/bin/parquet-show-bloom-filter.rs index ca8f558a6e00..77e29c6fb282 100644 --- a/parquet/src/bin/parquet-show-bloom-filter.rs +++ b/parquet/src/bin/parquet-show-bloom-filter.rs @@ -80,7 +80,7 @@ fn main() { .expect("Unable to open file as Parquet"); let metadata = file_reader.metadata(); for (ri, row_group) in metadata.row_groups().iter().enumerate() { - println!("Row group #{}", ri); + println!("Row group #{ri}"); println!("{}", "=".repeat(80)); if let Some((column_index, _)) = row_group .columns() diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index e255a8dc12da..4d2040b7f258 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -156,7 +156,7 @@ fn read_bloom_filter_header_and_length( let mut buf_reader = buffer.reader(); let mut prot = TCompactInputProtocol::new(&mut buf_reader); let header = BloomFilterHeader::read_from_in_protocol(&mut prot).map_err(|e| { - ParquetError::General(format!("Could not read bloom filter header: {}", e)) + ParquetError::General(format!("Could not read bloom filter header: {e}")) })?; Ok(( header, @@ -190,8 +190,7 @@ impl Sbbf { pub(crate) fn new_with_ndv_fpp(ndv: u64, fpp: f64) -> Result { if !(0.0..1.0).contains(&fpp) { return Err(ParquetError::General(format!( - "False positive probability must be between 0.0 and 1.0, got {}", - fpp + "False positive probability must be between 0.0 and 1.0, got {fpp}" ))); } let num_bits = num_of_bits_from_ndv_fpp(ndv, fpp); @@ -227,7 +226,7 @@ impl Sbbf { let mut protocol = TCompactOutputProtocol::new(&mut writer); let header = self.header(); header.write_to_out_protocol(&mut protocol).map_err(|e| { - ParquetError::General(format!("Could not write bloom filter header: {}", e)) + ParquetError::General(format!("Could not write bloom filter header: {e}")) })?; protocol.flush()?; self.write_bitset(&mut writer)?; @@ -241,8 +240,7 @@ impl Sbbf { .write_all(block.to_le_bytes().as_slice()) .map_err(|e| { ParquetError::General(format!( - "Could not write bloom filter bit set: {}", - e + "Could not write bloom filter bit set: {e}" )) })?; } @@ -389,7 +387,7 @@ mod tests { ]; let sbbf = Sbbf::new(bitset); for a in 0..10i64 { - let value = format!("a{}", a); + let value = format!("a{a}"); assert!(sbbf.check(&value.as_str())); } } diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index ddb6d243ebd3..bd3568d13cee 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -222,8 +222,7 @@ impl TryFrom<&PageHeader> for PageMetadata { is_dict: false, }), other => Err(ParquetError::General(format!( - "page type {:?} cannot be converted to PageMetadata", - other + "page type {other:?} cannot be converted to PageMetadata" ))), } } diff --git a/parquet/src/column/reader/decoder.rs b/parquet/src/column/reader/decoder.rs index 
da7fa78fe485..f57b3e16d5d0 100644 --- a/parquet/src/column/reader/decoder.rs +++ b/parquet/src/column/reader/decoder.rs @@ -245,7 +245,7 @@ impl ColumnValueDecoder for ColumnValueDecoderImpl { let current_decoder = self .decoders .get_mut(&encoding) - .unwrap_or_else(|| panic!("decoder for encoding {} should be set", encoding)); + .unwrap_or_else(|| panic!("decoder for encoding {encoding} should be set")); current_decoder.get(&mut out[range]) } @@ -258,7 +258,7 @@ impl ColumnValueDecoder for ColumnValueDecoderImpl { let current_decoder = self .decoders .get_mut(&encoding) - .unwrap_or_else(|| panic!("decoder for encoding {} should be set", encoding)); + .unwrap_or_else(|| panic!("decoder for encoding {encoding} should be set")); current_decoder.skip(num_values) } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index f2417900d99e..51e2614993e1 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1137,7 +1137,7 @@ mod tests { assert!(res.is_err()); if let Err(err) = res { assert_eq!( - format!("{}", err), + format!("{err}"), "Parquet error: Inconsistent length of definition and repetition levels: 3 != 2" ); } @@ -1152,7 +1152,7 @@ mod tests { assert!(res.is_err()); if let Err(err) = res { assert_eq!( - format!("{}", err), + format!("{err}"), "Parquet error: Definition levels are required, because max definition level = 1" ); } @@ -1167,7 +1167,7 @@ mod tests { assert!(res.is_err()); if let Err(err) = res { assert_eq!( - format!("{}", err), + format!("{err}"), "Parquet error: Repetition levels are required, because max repetition level = 1" ); } @@ -1182,7 +1182,7 @@ mod tests { assert!(res.is_err()); if let Err(err) = res { assert_eq!( - format!("{}", err), + format!("{err}"), "Parquet error: Expected to write 4 values, but have only 2" ); } @@ -1907,7 +1907,7 @@ mod tests { assert_eq!(stats.min(), &false); assert_eq!(stats.max(), &true); } else { - panic!("expecting Statistics::Boolean, got {:?}", stats); + panic!("expecting Statistics::Boolean, got {stats:?}"); } } @@ -1920,7 +1920,7 @@ mod tests { assert_eq!(stats.min(), &-2); assert_eq!(stats.max(), &3); } else { - panic!("expecting Statistics::Int32, got {:?}", stats); + panic!("expecting Statistics::Int32, got {stats:?}"); } } @@ -1933,7 +1933,7 @@ mod tests { assert_eq!(stats.min(), &-2); assert_eq!(stats.max(), &3); } else { - panic!("expecting Statistics::Int64, got {:?}", stats); + panic!("expecting Statistics::Int64, got {stats:?}"); } } @@ -1955,7 +1955,7 @@ mod tests { assert_eq!(stats.min(), &Int96::from(vec![0, 20, 30])); assert_eq!(stats.max(), &Int96::from(vec![3, 20, 10])); } else { - panic!("expecting Statistics::Int96, got {:?}", stats); + panic!("expecting Statistics::Int96, got {stats:?}"); } } @@ -1968,7 +1968,7 @@ mod tests { assert_eq!(stats.min(), &-2.0); assert_eq!(stats.max(), &3.0); } else { - panic!("expecting Statistics::Float, got {:?}", stats); + panic!("expecting Statistics::Float, got {stats:?}"); } } @@ -1981,7 +1981,7 @@ mod tests { assert_eq!(stats.min(), &-2.0); assert_eq!(stats.max(), &3.0); } else { - panic!("expecting Statistics::Double, got {:?}", stats); + panic!("expecting Statistics::Double, got {stats:?}"); } } @@ -1999,7 +1999,7 @@ mod tests { assert_eq!(stats.min(), &ByteArray::from("aaw")); assert_eq!(stats.max(), &ByteArray::from("zz")); } else { - panic!("expecting Statistics::ByteArray, got {:?}", stats); + panic!("expecting Statistics::ByteArray, got {stats:?}"); } } @@ -2022,7 +2022,7 @@ mod tests { let expected_max: 
FixedLenByteArray = ByteArray::from("zz ").into(); assert_eq!(stats.max(), &expected_max); } else { - panic!("expecting Statistics::FixedLenByteArray, got {:?}", stats); + panic!("expecting Statistics::FixedLenByteArray, got {stats:?}"); } } diff --git a/parquet/src/encodings/levels.rs b/parquet/src/encodings/levels.rs index cf1da20b6842..0727935c345a 100644 --- a/parquet/src/encodings/levels.rs +++ b/parquet/src/encodings/levels.rs @@ -40,7 +40,7 @@ pub fn max_buffer_size( match encoding { Encoding::RLE => RleEncoder::max_buffer_size(bit_width, num_buffered_values), Encoding::BIT_PACKED => ceil(num_buffered_values * bit_width as usize, 8), - _ => panic!("Unsupported encoding type {}", encoding), + _ => panic!("Unsupported encoding type {encoding}"), } } @@ -76,7 +76,7 @@ impl LevelEncoder { // `max_buffer_size()` method. LevelEncoder::BitPacked(bit_width, BitWriter::new_from_buf(buffer)) } - _ => panic!("Unsupported encoding type {}", encoding), + _ => panic!("Unsupported encoding type {encoding}"), } } @@ -160,7 +160,7 @@ impl LevelDecoder { Encoding::BIT_PACKED => { LevelDecoder::BitPacked(None, bit_width, BitReader::from(Vec::new())) } - _ => panic!("Unsupported encoding type {}", encoding), + _ => panic!("Unsupported encoding type {encoding}"), } } diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index 703ff51f44c2..62f7656f14b5 100644 --- a/parquet/src/errors.rs +++ b/parquet/src/errors.rs @@ -48,16 +48,16 @@ impl std::fmt::Display for ParquetError { fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { match &self { ParquetError::General(message) => { - write!(fmt, "Parquet error: {}", message) + write!(fmt, "Parquet error: {message}") } - ParquetError::NYI(message) => write!(fmt, "NYI: {}", message), - ParquetError::EOF(message) => write!(fmt, "EOF: {}", message), + ParquetError::NYI(message) => write!(fmt, "NYI: {message}"), + ParquetError::EOF(message) => write!(fmt, "EOF: {message}"), #[cfg(feature = "arrow")] - ParquetError::ArrowError(message) => write!(fmt, "Arrow: {}", message), + ParquetError::ArrowError(message) => write!(fmt, "Arrow: {message}"), ParquetError::IndexOutOfBound(index, ref bound) => { - write!(fmt, "Index {} out of bound: {}", index, bound) + write!(fmt, "Index {index} out of bound: {bound}") } - ParquetError::External(e) => write!(fmt, "External: {}", e), + ParquetError::External(e) => write!(fmt, "External: {e}"), } } } @@ -157,6 +157,6 @@ macro_rules! 
arrow_err { #[cfg(feature = "arrow")] impl From for ArrowError { fn from(p: ParquetError) -> Self { - Self::ParquetError(format!("{}", p)) + Self::ParquetError(format!("{p}")) } } diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 760caa9774e9..a14b3ce4d6c5 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -72,7 +72,7 @@ pub fn decode_metadata(metadata_read: &[u8]) -> Result { // TODO: row group filtering let mut prot = TCompactInputProtocol::new(metadata_read); let t_file_metadata: TFileMetaData = TFileMetaData::read_from_in_protocol(&mut prot) - .map_err(|e| ParquetError::General(format!("Could not parse metadata: {}", e)))?; + .map_err(|e| ParquetError::General(format!("Could not parse metadata: {e}")))?; let schema = types::from_thrift(&t_file_metadata.schema)?; let schema_descr = Arc::new(SchemaDescriptor::new(schema)); let mut row_groups = Vec::new(); diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 51a5264e3cf1..0696b2901267 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -996,7 +996,7 @@ mod tests { assert!(row_group_meta.is_err()); if let Err(e) = row_group_meta { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: Column length mismatch: 2 != 0" ); } diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 7d20b736ea0c..cbd31f9a1f32 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -690,8 +690,7 @@ impl ColumnProperties { fn set_bloom_filter_fpp(&mut self, value: f64) { assert!( value > 0. && value < 1.0, - "fpp must be between 0 and 1 exclusive, got {}", - value + "fpp must be between 0 and 1 exclusive, got {value}" ); self.bloom_filter_properties diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 8ee37352bdd7..95108ad58af7 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -921,7 +921,7 @@ mod tests { r.into_iter().project(proj).unwrap() }) - .map(|r| format!("{}", r)) + .map(|r| format!("{r}")) .collect::>() .join(","); diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 76885fdbf7a5..939ce037f968 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -126,8 +126,7 @@ pub fn from_thrift( let null_count = stats.null_count.unwrap_or(0); assert!( null_count >= 0, - "Statistics null count is negative ({})", - null_count + "Statistics null count is negative ({null_count})" ); // Generic null count. 
@@ -399,14 +398,14 @@ impl Statistics { impl fmt::Display for Statistics { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Statistics::Boolean(typed) => write!(f, "{}", typed), - Statistics::Int32(typed) => write!(f, "{}", typed), - Statistics::Int64(typed) => write!(f, "{}", typed), - Statistics::Int96(typed) => write!(f, "{}", typed), - Statistics::Float(typed) => write!(f, "{}", typed), - Statistics::Double(typed) => write!(f, "{}", typed), - Statistics::ByteArray(typed) => write!(f, "{}", typed), - Statistics::FixedLenByteArray(typed) => write!(f, "{}", typed), + Statistics::Boolean(typed) => write!(f, "{typed}"), + Statistics::Int32(typed) => write!(f, "{typed}"), + Statistics::Int64(typed) => write!(f, "{typed}"), + Statistics::Int96(typed) => write!(f, "{typed}"), + Statistics::Float(typed) => write!(f, "{typed}"), + Statistics::Double(typed) => write!(f, "{typed}"), + Statistics::ByteArray(typed) => write!(f, "{typed}"), + Statistics::FixedLenByteArray(typed) => write!(f, "{typed}"), } } } @@ -536,17 +535,17 @@ impl fmt::Display for ValueStatistics { write!(f, "{{")?; write!(f, "min: ")?; match self.min { - Some(ref value) => write!(f, "{}", value)?, + Some(ref value) => write!(f, "{value}")?, None => write!(f, "N/A")?, } write!(f, ", max: ")?; match self.max { - Some(ref value) => write!(f, "{}", value)?, + Some(ref value) => write!(f, "{value}")?, None => write!(f, "N/A")?, } write!(f, ", distinct_count: ")?; match self.distinct_count { - Some(value) => write!(f, "{}", value)?, + Some(value) => write!(f, "{value}")?, None => write!(f, "N/A")?, } write!(f, ", null_count: {}", self.null_count)?; @@ -619,14 +618,14 @@ mod tests { fn test_statistics_debug() { let stats = Statistics::int32(Some(1), Some(12), None, 12, true); assert_eq!( - format!("{:?}", stats), + format!("{stats:?}"), "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 12, \ min_max_deprecated: true, min_max_backwards_compatible: true})" ); let stats = Statistics::int32(None, None, None, 7, false); assert_eq!( - format!("{:?}", stats), + format!("{stats:?}"), "Int32({min: None, max: None, distinct_count: None, null_count: 7, \ min_max_deprecated: false, min_max_backwards_compatible: false})" ) @@ -636,13 +635,13 @@ mod tests { fn test_statistics_display() { let stats = Statistics::int32(Some(1), Some(12), None, 12, true); assert_eq!( - format!("{}", stats), + format!("{stats}"), "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true}" ); let stats = Statistics::int64(None, None, None, 7, false); assert_eq!( - format!("{}", stats), + format!("{stats}"), "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \ false}" ); @@ -655,7 +654,7 @@ mod tests { true, ); assert_eq!( - format!("{}", stats), + format!("{stats}"), "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \ min_max_deprecated: true}" ); @@ -668,7 +667,7 @@ mod tests { false, ); assert_eq!( - format!("{}", stats), + format!("{stats}"), "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false}" ); } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 65f254185334..4983ed55f8f6 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -754,7 +754,7 @@ mod tests { assert!(res.is_err()); if let Err(err) = res { assert_eq!( - format!("{}", err), + format!("{err}"), "Parquet error: Column length mismatch: 1 != 0" ); } diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index 
8c942cb44ef0..f3511c03df83 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -245,7 +245,7 @@ pub fn make_row(fields: Vec<(String, Field)>) -> Row { impl fmt::Display for Row { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{{")?; - for (i, &(ref key, ref value)) in self.fields.iter().enumerate() { + for (i, (key, value)) in self.fields.iter().enumerate() { key.fmt(f)?; write!(f, ": ")?; value.fmt(f)?; @@ -724,37 +724,37 @@ impl fmt::Display for Field { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match *self { Field::Null => write!(f, "null"), - Field::Bool(value) => write!(f, "{}", value), - Field::Byte(value) => write!(f, "{}", value), - Field::Short(value) => write!(f, "{}", value), - Field::Int(value) => write!(f, "{}", value), - Field::Long(value) => write!(f, "{}", value), - Field::UByte(value) => write!(f, "{}", value), - Field::UShort(value) => write!(f, "{}", value), - Field::UInt(value) => write!(f, "{}", value), - Field::ULong(value) => write!(f, "{}", value), + Field::Bool(value) => write!(f, "{value}"), + Field::Byte(value) => write!(f, "{value}"), + Field::Short(value) => write!(f, "{value}"), + Field::Int(value) => write!(f, "{value}"), + Field::Long(value) => write!(f, "{value}"), + Field::UByte(value) => write!(f, "{value}"), + Field::UShort(value) => write!(f, "{value}"), + Field::UInt(value) => write!(f, "{value}"), + Field::ULong(value) => write!(f, "{value}"), Field::Float(value) => { if !(1e-15..=1e19).contains(&value) { - write!(f, "{:E}", value) + write!(f, "{value:E}") } else if value.trunc() == value { - write!(f, "{}.0", value) + write!(f, "{value}.0") } else { - write!(f, "{}", value) + write!(f, "{value}") } } Field::Double(value) => { if !(1e-15..=1e19).contains(&value) { - write!(f, "{:E}", value) + write!(f, "{value:E}") } else if value.trunc() == value { - write!(f, "{}.0", value) + write!(f, "{value}.0") } else { - write!(f, "{}", value) + write!(f, "{value}") } } Field::Decimal(ref value) => { write!(f, "{}", convert_decimal_to_string(value)) } - Field::Str(ref value) => write!(f, "\"{}\"", value), + Field::Str(ref value) => write!(f, "\"{value}\""), Field::Bytes(ref value) => write!(f, "{:?}", value.data()), Field::Date(value) => write!(f, "{}", convert_date_to_string(value)), Field::TimestampMillis(value) => { @@ -763,7 +763,7 @@ impl fmt::Display for Field { Field::TimestampMicros(value) => { write!(f, "{}", convert_timestamp_micros_to_string(value)) } - Field::Group(ref fields) => write!(f, "{}", fields), + Field::Group(ref fields) => write!(f, "{fields}"), Field::ListInternal(ref list) => { let elems = &list.elements; write!(f, "[")?; @@ -778,7 +778,7 @@ impl fmt::Display for Field { Field::MapInternal(ref map) => { let entries = &map.entries; write!(f, "{{")?; - for (i, &(ref key, ref value)) in entries.iter().enumerate() { + for (i, (key, value)) in entries.iter().enumerate() { key.fmt(f)?; write!(f, " -> ")?; value.fmt(f)?; @@ -1248,7 +1248,7 @@ mod tests { ("a".to_string(), Field::Str("abc".to_string())), ]; let row = Field::Group(make_row(fields)); - assert_eq!(format!("{}", row), "{x: null, Y: 2, z: 3.1, a: \"abc\"}"); + assert_eq!(format!("{row}"), "{x: null, Y: 2, z: 3.1, a: \"abc\"}"); let row = Field::ListInternal(make_list(vec![ Field::Int(2), @@ -1256,14 +1256,14 @@ mod tests { Field::Null, Field::Int(12), ])); - assert_eq!(format!("{}", row), "[2, 1, null, 12]"); + assert_eq!(format!("{row}"), "[2, 1, null, 12]"); let row = Field::MapInternal(make_map(vec![ (Field::Int(1), 
Field::Float(1.2)), (Field::Int(2), Field::Float(4.5)), (Field::Int(3), Field::Float(2.3)), ])); - assert_eq!(format!("{}", row), "{1 -> 1.2, 2 -> 4.5, 3 -> 2.3}"); + assert_eq!(format!("{row}"), "{1 -> 1.2, 2 -> 4.5, 3 -> 2.3}"); } #[test] diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index a84693536995..eb16c13f6ffe 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -150,16 +150,14 @@ impl TreeBuilder { assert_eq!( field.get_fields().len(), 1, - "Invalid list type {:?}", - field + "Invalid list type {field:?}" ); let repeated_field = field.get_fields()[0].clone(); assert_eq!( repeated_field.get_basic_info().repetition(), Repetition::REPEATED, - "Invalid list type {:?}", - field + "Invalid list type {field:?}" ); if Reader::is_element_type(&repeated_field) { @@ -208,27 +206,23 @@ impl TreeBuilder { assert_eq!( field.get_fields().len(), 1, - "Invalid map type: {:?}", - field + "Invalid map type: {field:?}" ); assert!( !field.get_fields()[0].is_primitive(), - "Invalid map type: {:?}", - field + "Invalid map type: {field:?}" ); let key_value_type = field.get_fields()[0].clone(); assert_eq!( key_value_type.get_basic_info().repetition(), Repetition::REPEATED, - "Invalid map type: {:?}", - field + "Invalid map type: {field:?}" ); assert_eq!( key_value_type.get_fields().len(), 2, - "Invalid map type: {:?}", - field + "Invalid map type: {field:?}" ); path.push(String::from(key_value_type.name())); @@ -236,8 +230,7 @@ impl TreeBuilder { let key_type = &key_value_type.get_fields()[0]; assert!( key_type.is_primitive(), - "Map key type is expected to be a primitive type, but found {:?}", - key_type + "Map key type is expected to be a primitive type, but found {key_type:?}" ); let key_reader = self.reader_tree( key_type.clone(), @@ -411,7 +404,7 @@ impl Reader { } make_row(fields) } - _ => panic!("Cannot call read() on {}", self), + _ => panic!("Cannot call read() on {self}"), } } @@ -611,7 +604,7 @@ impl fmt::Display for Reader { Reader::RepeatedReader(..) => "RepeatedReader", Reader::KeyValueReader(..) 
=> "KeyValueReader", }; - write!(f, "{}", s) + write!(f, "{s}") } } diff --git a/parquet/src/record/triplet.rs b/parquet/src/record/triplet.rs index b4b4ea2f4a55..b7318b3d3ac6 100644 --- a/parquet/src/record/triplet.rs +++ b/parquet/src/record/triplet.rs @@ -200,8 +200,7 @@ impl TypedTripletIter { fn new(descr: ColumnDescPtr, batch_size: usize, column_reader: ColumnReader) -> Self { assert!( batch_size > 0, - "Expected positive batch size, found: {}", - batch_size + "Expected positive batch size, found: {batch_size}" ); let max_def_level = descr.max_def_level(); diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index 5cfd30dd977c..d90dc423caf7 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -62,7 +62,7 @@ pub fn print_parquet_metadata(out: &mut dyn io::Write, metadata: &ParquetMetaDat writeln!(out, "row groups:"); writeln!(out); for (i, rg) in metadata.row_groups().iter().enumerate() { - writeln!(out, "row group {}:", i); + writeln!(out, "row group {i}:"); print_dashes(out, 80); print_row_group_metadata(out, rg); } @@ -75,7 +75,7 @@ pub fn print_file_metadata(out: &mut dyn io::Write, file_metadata: &FileMetaData writeln!(out, "version: {}", file_metadata.version()); writeln!(out, "num of rows: {}", file_metadata.num_rows()); if let Some(created_by) = file_metadata.created_by().as_ref() { - writeln!(out, "created by: {}", created_by); + writeln!(out, "created by: {created_by}"); } if let Some(metadata) = file_metadata.key_value_metadata() { writeln!(out, "metadata:"); @@ -102,7 +102,7 @@ pub fn print_schema(out: &mut dyn io::Write, tp: &Type) { let mut printer = Printer::new(&mut s); printer.print(tp); } - writeln!(out, "{}", s); + writeln!(out, "{s}"); } #[allow(unused_must_use)] @@ -114,7 +114,7 @@ fn print_row_group_metadata(out: &mut dyn io::Write, rg_metadata: &RowGroupMetaD writeln!(out, "columns: "); for (i, cc) in rg_metadata.columns().iter().enumerate() { writeln!(out); - writeln!(out, "column {}:", i); + writeln!(out, "column {i}:"); print_dashes(out, 80); print_column_chunk_metadata(out, cc); } @@ -130,11 +130,11 @@ fn print_column_chunk_metadata( let encoding_strs: Vec<_> = cc_metadata .encodings() .iter() - .map(|e| format!("{}", e)) + .map(|e| format!("{e}")) .collect(); writeln!(out, "encodings: {}", encoding_strs.join(" ")); let file_path_str = cc_metadata.file_path().unwrap_or("N/A"); - writeln!(out, "file path: {}", file_path_str); + writeln!(out, "file path: {file_path_str}"); writeln!(out, "file offset: {}", cc_metadata.file_offset()); writeln!(out, "num of values: {}", cc_metadata.num_values()); writeln!( @@ -152,42 +152,42 @@ fn print_column_chunk_metadata( None => "N/A".to_owned(), Some(ipo) => ipo.to_string(), }; - writeln!(out, "index page offset: {}", index_page_offset_str); + writeln!(out, "index page offset: {index_page_offset_str}"); let dict_page_offset_str = match cc_metadata.dictionary_page_offset() { None => "N/A".to_owned(), Some(dpo) => dpo.to_string(), }; - writeln!(out, "dictionary page offset: {}", dict_page_offset_str); + writeln!(out, "dictionary page offset: {dict_page_offset_str}"); let statistics_str = match cc_metadata.statistics() { None => "N/A".to_owned(), Some(stats) => stats.to_string(), }; - writeln!(out, "statistics: {}", statistics_str); + writeln!(out, "statistics: {statistics_str}"); let bloom_filter_offset_str = match cc_metadata.bloom_filter_offset() { None => "N/A".to_owned(), Some(bfo) => bfo.to_string(), }; - writeln!(out, "bloom filter offset: {}", bloom_filter_offset_str); 
+ writeln!(out, "bloom filter offset: {bloom_filter_offset_str}"); let offset_index_offset_str = match cc_metadata.offset_index_offset() { None => "N/A".to_owned(), Some(oio) => oio.to_string(), }; - writeln!(out, "offset index offset: {}", offset_index_offset_str); + writeln!(out, "offset index offset: {offset_index_offset_str}"); let offset_index_length_str = match cc_metadata.offset_index_length() { None => "N/A".to_owned(), Some(oil) => oil.to_string(), }; - writeln!(out, "offset index length: {}", offset_index_length_str); + writeln!(out, "offset index length: {offset_index_length_str}"); let column_index_offset_str = match cc_metadata.column_index_offset() { None => "N/A".to_owned(), Some(cio) => cio.to_string(), }; - writeln!(out, "column index offset: {}", column_index_offset_str); + writeln!(out, "column index offset: {column_index_offset_str}"); let column_index_length_str = match cc_metadata.column_index_length() { None => "N/A".to_owned(), Some(cil) => cil.to_string(), }; - writeln!(out, "column index length: {}", column_index_length_str); + writeln!(out, "column index length: {column_index_length_str}"); writeln!(out); } @@ -242,10 +242,10 @@ fn print_logical_and_converted( bit_width, is_signed, } => { - format!("INTEGER({},{})", bit_width, is_signed) + format!("INTEGER({bit_width},{is_signed})") } LogicalType::Decimal { scale, precision } => { - format!("DECIMAL({},{})", precision, scale) + format!("DECIMAL({precision},{scale})") } LogicalType::Timestamp { is_adjusted_to_u_t_c, @@ -283,15 +283,15 @@ fn print_logical_and_converted( // DECIMAL(9) - DECIMAL let precision_scale = match (precision, scale) { (p, s) if p > 0 && s > 0 => { - format!("({},{})", p, s) + format!("({p},{s})") } - (p, 0) if p > 0 => format!("({})", p), + (p, 0) if p > 0 => format!("({p})"), _ => String::new(), }; - format!("{}{}", decimal, precision_scale) + format!("{decimal}{precision_scale}") } other_converted_type => { - format!("{}", other_converted_type) + format!("{other_converted_type}") } } } @@ -313,9 +313,9 @@ impl<'a> Printer<'a> { let phys_type_str = match physical_type { PhysicalType::FIXED_LEN_BYTE_ARRAY => { // We need to include length for fixed byte array - format!("{} ({})", physical_type, type_length) + format!("{physical_type} ({type_length})") } - _ => format!("{}", physical_type), + _ => format!("{physical_type}"), }; // Also print logical type if it is available // If there is a logical type, do not print converted type @@ -358,7 +358,7 @@ impl<'a> Printer<'a> { 0, ); if !logical_str.is_empty() { - write!(self.output, "({}) ", logical_str); + write!(self.output, "({logical_str}) "); } writeln!(self.output, "{{"); } else { diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 1b966b41426c..151f2b69f31e 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -949,9 +949,10 @@ impl SchemaDescriptor { self.leaves.len() ); - *self.leaf_to_base.get(leaf).unwrap_or_else(|| { - panic!("Expected a value for index {} but found None", leaf) - }) + *self + .leaf_to_base + .get(leaf) + .unwrap_or_else(|| panic!("Expected a value for index {leaf} but found None")) } fn column_root_of(&self, i: usize) -> &TypePtr { @@ -1279,7 +1280,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: Cannot annotate Integer { bit_width: 8, is_signed: true } from INT64 for field 'foo'" ); } @@ -1292,7 +1293,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - 
format!("{}", e), + format!("{e}"), "Parquet error: BSON cannot annotate field 'foo' because it is not a BYTE_ARRAY field" ); } @@ -1306,7 +1307,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY" ); } @@ -1323,7 +1324,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: DECIMAL logical type scale 32 must match self.scale -1 for field 'foo'" ); } @@ -1337,7 +1338,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: Invalid DECIMAL precision: -1" ); } @@ -1351,7 +1352,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: Invalid DECIMAL precision: 0" ); } @@ -1364,7 +1365,7 @@ mod tests { .build(); assert!(result.is_err()); if let Err(e) = result { - assert_eq!(format!("{}", e), "Parquet error: Invalid DECIMAL scale: -1"); + assert_eq!(format!("{e}"), "Parquet error: Invalid DECIMAL scale: -1"); } result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) @@ -1376,7 +1377,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: Invalid DECIMAL: scale (2) cannot be greater than precision (1)" ); } @@ -1399,7 +1400,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: Cannot represent INT32 as DECIMAL with precision 18" ); } @@ -1413,7 +1414,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: Cannot represent INT64 as DECIMAL with precision 32" ); } @@ -1428,7 +1429,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length 5 and precision 12. 
The max precision can only be 11" ); } @@ -1440,7 +1441,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: UINT_8 cannot annotate field 'foo' because it is not a INT32 field" ); } @@ -1452,7 +1453,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: TIME_MICROS cannot annotate field 'foo' because it is not a INT64 field" ); } @@ -1464,7 +1465,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field" ); } @@ -1477,7 +1478,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field" ); } @@ -1489,7 +1490,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: ENUM cannot annotate field 'foo' because it is not a BYTE_ARRAY field" ); } @@ -1501,7 +1502,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: MAP cannot be applied to primitive field 'foo'" ); } @@ -1514,7 +1515,7 @@ mod tests { assert!(result.is_err()); if let Err(e) = result { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: Invalid FIXED_LEN_BYTE_ARRAY length: -1 for field 'foo'" ); } @@ -1660,8 +1661,8 @@ mod tests { for i in 0..nleaves { let col = descr.column(i); - assert_eq!(col.max_def_level(), ex_max_def_levels[i], "{}", i); - assert_eq!(col.max_rep_level(), ex_max_rep_levels[i], "{}", i); + assert_eq!(col.max_def_level(), ex_max_def_levels[i], "{i}"); + assert_eq!(col.max_rep_level(), ex_max_rep_levels[i], "{i}"); } assert_eq!(descr.column(0).path().string(), "a"); @@ -1989,7 +1990,7 @@ mod tests { assert!(thrift_schema.is_err()); if let Err(e) = thrift_schema { assert_eq!( - format!("{}", e), + format!("{e}"), "Parquet error: Root schema must be Group type" ); } diff --git a/parquet/src/schema/visitor.rs b/parquet/src/schema/visitor.rs index 9d28fa5e8dcd..f83782c638f1 100644 --- a/parquet/src/schema/visitor.rs +++ b/parquet/src/schema/visitor.rs @@ -49,7 +49,7 @@ pub trait TypeVisitor { fn visit_list(&mut self, list_type: TypePtr, context: C) -> Result { match list_type.as_ref() { Type::PrimitiveType { .. 
} => { - panic!("{:?} is a list type and must be a group type", list_type) + panic!("{list_type:?} is a list type and must be a group type") } Type::GroupType { basic_info: _, diff --git a/parquet/src/util/bit_pack.rs b/parquet/src/util/bit_pack.rs index 8cea20de2539..94ab9578b991 100644 --- a/parquet/src/util/bit_pack.rs +++ b/parquet/src/util/bit_pack.rs @@ -106,7 +106,7 @@ mod tests { let mut output = [0; 8]; unpack8(&input, &mut output, i); for (idx, out) in output.iter().enumerate() { - assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out); + assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}"); } } @@ -114,7 +114,7 @@ mod tests { let mut output = [0; 16]; unpack16(&input, &mut output, i); for (idx, out) in output.iter().enumerate() { - assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out); + assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}"); } } @@ -122,7 +122,7 @@ mod tests { let mut output = [0; 32]; unpack32(&input, &mut output, i); for (idx, out) in output.iter().enumerate() { - assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out); + assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}"); } } @@ -130,7 +130,7 @@ mod tests { let mut output = [0; 64]; unpack64(&input, &mut output, i); for (idx, out) in output.iter().enumerate() { - assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out); + assert_eq!(out.trailing_ones() as usize, i, "out[{idx}] = {out}"); } } } diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index c229ea3da26e..597190a46eff 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -638,8 +638,7 @@ impl BitReader { shift += 7; assert!( shift <= MAX_VLQ_BYTE_LEN * 7, - "Num of bytes exceed MAX_VLQ_BYTE_LEN ({})", - MAX_VLQ_BYTE_LEN + "Num of bytes exceed MAX_VLQ_BYTE_LEN ({MAX_VLQ_BYTE_LEN})" ); if byte & 0x80 == 0 { return Some(v); diff --git a/parquet/src/util/test_common/rand_gen.rs b/parquet/src/util/test_common/rand_gen.rs index 4e54aa7999cf..c36b9060ca58 100644 --- a/parquet/src/util/test_common/rand_gen.rs +++ b/parquet/src/util/test_common/rand_gen.rs @@ -194,7 +194,7 @@ pub fn make_pages( Encoding::PLAIN => { pb.add_values::(encoding, &values[value_range]); } - enc => panic!("Unexpected encoding {}", enc), + enc => panic!("Unexpected encoding {enc}"), } let data_page = pb.consume(); diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index bf24950e99c2..0c66fcd1081d 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -98,8 +98,7 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { assert_eq!( page.compressed_page_size as usize, page_layout.compressed_size + page_layout.page_header_size, - "index page {} size mismatch", - idx + "index page {idx} size mismatch" ); let next_first_row_index = column_index .get(idx + 1) @@ -109,8 +108,7 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { let num_rows = next_first_row_index - page.first_row_index; assert_eq!( num_rows as usize, page_layout.rows, - "index page {} row count", - idx + "index page {idx} row count" ); } } @@ -146,8 +144,7 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { pages.len(), column_layout.pages.len() + column_layout.dictionary_page.is_some() as usize, - "page {} count mismatch", - idx + "page {idx} count mismatch" ); let page_layouts = column_layout @@ -160,8 +157,7 @@ fn 
assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { assert_eq!( page.buffer().len(), page_layout.compressed_size, - "page {} size mismatch", - idx + "page {idx} size mismatch" ); assert_eq!(page.page_type(), page_layout.page_type); } @@ -345,7 +341,7 @@ fn test_primitive() { #[test] fn test_string() { let array = Arc::new(StringArray::from_iter_values( - (0..2000).map(|x| format!("{:04}", x)), + (0..2000).map(|x| format!("{x:04}")), )) as _; let batch = RecordBatch::try_from_iter([("col", array)]).unwrap(); let props = WriterProperties::builder() From d9c2681a8a477aa19feb492e536f9e1a034d2c8d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 27 Jan 2023 14:37:07 +0000 Subject: [PATCH 0553/1411] Add Push-Based CSV Decoder (#3604) * Add Push-Based CSV Decoder * Clippy * More tests * Clippy --- arrow-csv/src/reader/mod.rs | 370 ++++++++++++++++++++++---------- arrow-csv/src/reader/records.rs | 303 ++++++++++++++------------ 2 files changed, 427 insertions(+), 246 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 82b033f8086a..cff1337dd78f 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -57,7 +57,7 @@ use arrow_cast::parse::Parser; use arrow_schema::*; use crate::map_csv_error; -use crate::reader::records::{RecordReader, StringRecords}; +use crate::reader::records::{RecordDecoder, StringRecords}; use arrow_data::decimal::validate_decimal_precision; use csv::StringRecord; use std::ops::Neg; @@ -330,24 +330,11 @@ pub type Reader = BufReader>; /// CSV file reader pub struct BufReader { - /// Explicit schema for the CSV file - schema: SchemaRef, - /// Optional projection for which columns to load (zero-based column indices) - projection: Option>, /// File reader - reader: RecordReader, - /// Rows to skip - to_skip: usize, - /// Current line number - line_number: usize, - /// End line number - end: usize, - /// Number of records per batch - batch_size: usize, - /// datetime format used to parse datetime values, (format understood by chrono) - /// - /// For format refer to [chrono docs](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html) - datetime_format: Option, + reader: R, + + /// The decoder + decoder: Decoder, } impl fmt::Debug for BufReader @@ -356,10 +343,7 @@ where { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Reader") - .field("schema", &self.schema) - .field("projection", &self.projection) - .field("line_number", &self.line_number) - .field("datetime_format", &self.datetime_format) + .field("decoder", &self.decoder) .finish() } } @@ -383,7 +367,8 @@ impl Reader { ) -> Self { let mut builder = ReaderBuilder::new() .has_header(has_header) - .with_batch_size(batch_size); + .with_batch_size(batch_size) + .with_schema(schema); if let Some(delimiter) = delimiter { builder = builder.with_delimiter(delimiter); @@ -397,21 +382,25 @@ impl Reader { if let Some(format) = datetime_format { builder = builder.with_datetime_format(format) } - builder.build_with_schema(StdBufReader::new(reader), schema) + + Self { + decoder: builder.build_decoder(), + reader: StdBufReader::new(reader), + } } /// Returns the schema of the reader, useful for getting the schema without reading /// record batches pub fn schema(&self) -> SchemaRef { - match &self.projection { + match &self.decoder.projection { Some(projection) => { - let fields = self.schema.fields(); + let fields = self.decoder.schema.fields(); let projected_fields: 
Vec = projection.iter().map(|i| fields[*i].clone()).collect(); Arc::new(Schema::new(projected_fields)) } - None => self.schema.clone(), + None => self.decoder.schema.clone(), } } @@ -444,38 +433,146 @@ impl Reader { } } +impl BufReader { + fn read(&mut self) -> Result, ArrowError> { + loop { + let buf = self.reader.fill_buf()?; + let decoded = self.decoder.decode(buf)?; + if decoded == 0 { + break; + } + self.reader.consume(decoded); + } + + self.decoder.flush() + } +} + impl Iterator for BufReader { type Item = Result; fn next(&mut self) -> Option { + self.read().transpose() + } +} + +/// A push-based interface for decoding CSV data from an arbitrary byte stream +/// +/// See [`Reader`] for a higher-level interface for interface with [`Read`] +/// +/// The push-based interface facilitates integration with sources that yield arbitrarily +/// delimited bytes ranges, such as [`BufRead`], or a chunked byte stream received from +/// object storage +/// +/// ``` +/// # use std::io::BufRead; +/// # use arrow_array::RecordBatch; +/// # use arrow_csv::ReaderBuilder; +/// # use arrow_schema::{ArrowError, SchemaRef}; +/// # +/// fn read_from_csv( +/// mut reader: R, +/// schema: SchemaRef, +/// batch_size: usize, +/// ) -> Result>, ArrowError> { +/// let mut decoder = ReaderBuilder::new() +/// .with_schema(schema) +/// .with_batch_size(batch_size) +/// .build_decoder(); +/// +/// let mut next = move || { +/// loop { +/// let buf = reader.fill_buf()?; +/// let decoded = decoder.decode(buf)?; +/// if decoded == 0 { +/// break; +/// } +/// +/// // Consume the number of bytes read +/// reader.consume(decoded); +/// } +/// decoder.flush() +/// }; +/// Ok(std::iter::from_fn(move || next().transpose())) +/// } +/// ``` +#[derive(Debug)] +pub struct Decoder { + /// Explicit schema for the CSV file + schema: SchemaRef, + + /// Optional projection for which columns to load (zero-based column indices) + projection: Option>, + + /// Number of records per batch + batch_size: usize, + + /// Rows to skip + to_skip: usize, + + /// Current line number + line_number: usize, + + /// End line number + end: usize, + + /// A decoder for [`StringRecords`] + record_decoder: RecordDecoder, + + /// datetime format used to parse datetime values, (format understood by chrono) + /// + /// For format refer to [chrono docs](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html) + datetime_format: Option, +} + +impl Decoder { + /// Decode records from `buf` returning the number of bytes read + /// + /// This method returns once `batch_size` objects have been parsed since the + /// last call to [`Self::flush`], or `buf` is exhausted. 
Any remaining bytes + /// should be included in the next call to [`Self::decode`] + /// + /// There is no requirement that `buf` contains a whole number of records, facilitating + /// integration with arbitrary byte streams, such as that yielded by [`BufRead`] or + /// network sources such as object storage + pub fn decode(&mut self, buf: &[u8]) -> Result { if self.to_skip != 0 { - if let Err(e) = self.reader.skip(std::mem::take(&mut self.to_skip)) { - return Some(Err(e)); - } + // Skip in units of `to_read` to avoid over-allocating buffers + let to_skip = self.to_skip.min(self.batch_size); + let (skipped, bytes) = self.record_decoder.decode(buf, to_skip)?; + self.to_skip -= skipped; + self.record_decoder.clear(); + return Ok(bytes); } - let remaining = self.end - self.line_number; - let to_read = self.batch_size.min(remaining); + let to_read = + self.batch_size.min(self.end - self.line_number) - self.record_decoder.len(); + let (_, bytes) = self.record_decoder.decode(buf, to_read)?; + Ok(bytes) + } - let batch = match self.reader.read(to_read) { - Ok(b) if b.is_empty() => return None, - Ok(b) => b, - Err(e) => return Some(Err(e)), - }; + /// Flushes the currently buffered data to a [`RecordBatch`] + /// + /// This should only be called after [`Self::decode`] has returned `Ok(0)`, + /// otherwise may return an error if part way through decoding a record + /// + /// Returns `Ok(None)` if no buffered data + pub fn flush(&mut self) -> Result, ArrowError> { + if self.record_decoder.is_empty() { + return Ok(None); + } - // parse the batches into a RecordBatch - let result = parse( - &batch, + let rows = self.record_decoder.flush()?; + let batch = parse( + &rows, self.schema.fields(), Some(self.schema.metadata.clone()), self.projection.as_ref(), self.line_number, self.datetime_format.as_deref(), - ); - - self.line_number += batch.len(); - - Some(result) + )?; + self.line_number += rows.len(); + Ok(Some(batch)) } } @@ -1055,29 +1152,35 @@ impl ReaderBuilder { mut reader: R, ) -> Result, ArrowError> { // check if schema should be inferred - let delimiter = self.delimiter.unwrap_or(b','); - let schema = match self.schema.take() { - Some(schema) => schema, - None => { - let roptions = ReaderOptions { - delimiter: Some(delimiter), - max_read_records: self.max_records, - has_header: self.has_header, - escape: self.escape, - quote: self.quote, - terminator: self.terminator, - datetime_re: self.datetime_re.take(), - }; - let (inferred_schema, _) = - infer_file_schema_with_csv_options(&mut reader, roptions)?; - - Arc::new(inferred_schema) - } - }; - Ok(self.build_with_schema(reader, schema)) + if self.schema.is_none() { + let delimiter = self.delimiter.unwrap_or(b','); + let roptions = ReaderOptions { + delimiter: Some(delimiter), + max_read_records: self.max_records, + has_header: self.has_header, + escape: self.escape, + quote: self.quote, + terminator: self.terminator, + datetime_re: self.datetime_re.take(), + }; + let (inferred_schema, _) = + infer_file_schema_with_csv_options(&mut reader, roptions)?; + self.schema = Some(Arc::new(inferred_schema)) + } + + Ok(BufReader { + reader, + decoder: self.build_decoder(), + }) } - fn build_with_schema(self, reader: R, schema: SchemaRef) -> BufReader { + /// Builds a decoder that can be used to decode CSV from an arbitrary byte stream + /// + /// # Panics + /// + /// This method panics if no schema provided + pub fn build_decoder(self) -> Decoder { + let schema = self.schema.expect("schema should be provided"); let mut reader_builder = 
csv_core::ReaderBuilder::new(); reader_builder.escape(self.escape); @@ -1091,7 +1194,7 @@ impl ReaderBuilder { reader_builder.terminator(csv_core::Terminator::Any(t)); } let delimiter = reader_builder.build(); - let reader = RecordReader::new(reader, delimiter, schema.fields().len()); + let record_decoder = RecordDecoder::new(delimiter, schema.fields().len()); let header = self.has_header as usize; @@ -1100,15 +1203,15 @@ impl ReaderBuilder { None => (header, usize::MAX), }; - BufReader { + Decoder { schema, - projection: self.projection, - reader, to_skip: start, + record_decoder, line_number: start, end, - batch_size: self.batch_size, + projection: self.projection, datetime_format: self.datetime_format, + batch_size: self.batch_size, } } } @@ -1125,49 +1228,46 @@ mod tests { #[test] fn test_csv() { - let _: Vec<()> = vec![None, Some("%Y-%m-%dT%H:%M:%S%.f%:z".to_string())] - .into_iter() - .map(|format| { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - let mut csv = Reader::new( - file, - Arc::new(schema.clone()), - false, - None, - 1024, - None, - None, - format, - ); - assert_eq!(Arc::new(schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(57.653484, lat.value(0)); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); - }) - .collect(); + for format in [None, Some("%Y-%m-%dT%H:%M:%S%.f%:z".to_string())] { + let schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ]); + + let file = File::open("test/data/uk_cities.csv").unwrap(); + let mut csv = Reader::new( + file, + Arc::new(schema.clone()), + false, + None, + 1024, + None, + None, + format, + ); + assert_eq!(Arc::new(schema), csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(57.653484, lat.value(0)); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); + } } #[test] @@ -2089,4 +2189,46 @@ mod tests { assert!(c.value(2)); assert!(c.is_null(3)); } + + #[test] + fn test_buffered() { + let tests = [ + ("test/data/uk_cities.csv", false, 37), + ("test/data/various_types.csv", true, 7), + ("test/data/decimal_test.csv", false, 10), + ]; + + for (path, has_header, expected_rows) in tests { + for batch_size in [1, 4] { + for capacity in [1, 3, 7, 100] { + let reader = ReaderBuilder::new() + .with_batch_size(batch_size) + .has_header(has_header) + .build(File::open(path).unwrap()) + .unwrap(); + + let expected = reader.collect::, _>>().unwrap(); + + assert_eq!( + expected.iter().map(|x| x.num_rows()).sum::(), + expected_rows + ); + + let buffered = std::io::BufReader::with_capacity( + capacity, + File::open(path).unwrap(), + ); + + let reader = 
ReaderBuilder::new() + .with_batch_size(batch_size) + .has_header(has_header) + .build_buffered(buffered) + .unwrap(); + + let actual = reader.collect::, _>>().unwrap(); + assert_eq!(expected, actual) + } + } + } + } } diff --git a/arrow-csv/src/reader/records.rs b/arrow-csv/src/reader/records.rs index 76adb719ec17..c4da36ca4bfe 100644 --- a/arrow-csv/src/reader/records.rs +++ b/arrow-csv/src/reader/records.rs @@ -17,7 +17,6 @@ use arrow_schema::ArrowError; use csv_core::{ReadRecordResult, Reader}; -use std::io::BufRead; /// The estimated length of a field in bytes const AVERAGE_FIELD_SIZE: usize = 8; @@ -25,112 +24,165 @@ const AVERAGE_FIELD_SIZE: usize = 8; /// The minimum amount of data in a single read const MIN_CAPACITY: usize = 1024; -pub struct RecordReader { - reader: R, +/// [`RecordDecoder`] provides a push-based interface to decoder [`StringRecords`] +#[derive(Debug)] +pub struct RecordDecoder { delimiter: Reader, + /// The expected number of fields per row num_columns: usize, + /// The current line number line_number: usize, + + /// Offsets delimiting field start positions offsets: Vec, + + /// The current offset into `self.offsets` + /// + /// We track this independently of Vec to avoid re-zeroing memory + offsets_len: usize, + + /// The number of fields read for the current record + current_field: usize, + + /// The number of rows buffered + num_rows: usize, + + /// Decoded field data data: Vec, + + /// Offsets into data + /// + /// We track this independently of Vec to avoid re-zeroing memory + data_len: usize, } -impl RecordReader { - pub fn new(reader: R, delimiter: Reader, num_columns: usize) -> Self { +impl RecordDecoder { + pub fn new(delimiter: Reader, num_columns: usize) -> Self { Self { - reader, delimiter, num_columns, line_number: 1, offsets: vec![], + offsets_len: 1, // The first offset is always 0 + current_field: 0, + data_len: 0, data: vec![], + num_rows: 0, } } - /// Clears and then fills the buffers on this [`RecordReader`] - /// returning the number of records read - fn fill_buf(&mut self, to_read: usize) -> Result { - // Reserve sufficient capacity in offsets - self.offsets.resize(to_read * self.num_columns + 1, 0); - - let mut read = 0; + /// Decodes records from `input` returning the number of records and bytes read + /// + /// Note: this expects to be called with an empty `input` to signal EOF + pub fn decode( + &mut self, + input: &[u8], + to_read: usize, + ) -> Result<(usize, usize), ArrowError> { if to_read == 0 { - return Ok(0); + return Ok((0, 0)); } - // The current offset into `self.data` - let mut output_offset = 0; + // Reserve sufficient capacity in offsets + self.offsets + .resize(self.offsets_len + to_read * self.num_columns, 0); + // The current offset into `input` let mut input_offset = 0; - // The current offset into `self.offsets` - let mut field_offset = 1; - // The number of fields read for the current row - let mut field_count = 0; - - 'outer: loop { - let input = self.reader.fill_buf()?; - - 'input: loop { - // Reserve necessary space in output data based on best estimate - let remaining_rows = to_read - read; - let capacity = remaining_rows * self.num_columns * AVERAGE_FIELD_SIZE; - let estimated_data = capacity.max(MIN_CAPACITY); - self.data.resize(output_offset + estimated_data, 0); - - loop { - let (result, bytes_read, bytes_written, end_positions) = - self.delimiter.read_record( - &input[input_offset..], - &mut self.data[output_offset..], - &mut self.offsets[field_offset..], - ); - - field_count += end_positions; - field_offset 
+= end_positions; - input_offset += bytes_read; - output_offset += bytes_written; - - match result { - ReadRecordResult::End => break 'outer, // Reached end of file - ReadRecordResult::InputEmpty => break 'input, // Input exhausted, need to read more - ReadRecordResult::OutputFull => break, // Need to allocate more capacity - ReadRecordResult::OutputEndsFull => { - let line_number = self.line_number + read; - return Err(ArrowError::CsvError(format!("incorrect number of fields for line {}, expected {} got more than {}", line_number, self.num_columns, field_count))); + + // The number of rows decoded in this pass + let mut read = 0; + + loop { + // Reserve necessary space in output data based on best estimate + let remaining_rows = to_read - read; + let capacity = remaining_rows * self.num_columns * AVERAGE_FIELD_SIZE; + let estimated_data = capacity.max(MIN_CAPACITY); + self.data.resize(self.data_len + estimated_data, 0); + + // Try to read a record + loop { + let (result, bytes_read, bytes_written, end_positions) = + self.delimiter.read_record( + &input[input_offset..], + &mut self.data[self.data_len..], + &mut self.offsets[self.offsets_len..], + ); + + self.current_field += end_positions; + self.offsets_len += end_positions; + input_offset += bytes_read; + self.data_len += bytes_written; + + match result { + ReadRecordResult::End | ReadRecordResult::InputEmpty => { + // Reached end of input + return Ok((read, input_offset)); + } + // Need to allocate more capacity + ReadRecordResult::OutputFull => break, + ReadRecordResult::OutputEndsFull => { + return Err(ArrowError::CsvError(format!("incorrect number of fields for line {}, expected {} got more than {}", self.line_number, self.num_columns, self.current_field))); + } + ReadRecordResult::Record => { + if self.current_field != self.num_columns { + return Err(ArrowError::CsvError(format!("incorrect number of fields for line {}, expected {} got {}", self.line_number, self.num_columns, self.current_field))); } - ReadRecordResult::Record => { - if field_count != self.num_columns { - let line_number = self.line_number + read; - return Err(ArrowError::CsvError(format!("incorrect number of fields for line {}, expected {} got {}", line_number, self.num_columns, field_count))); - } - read += 1; - field_count = 0; - - if read == to_read { - break 'outer; // Read sufficient rows - } - - if input.len() == input_offset { - // Input exhausted, need to read more - // Without this read_record will interpret the empty input - // byte array as indicating the end of the file - break 'input; - } + read += 1; + self.current_field = 0; + self.line_number += 1; + self.num_rows += 1; + + if read == to_read { + // Read sufficient rows + return Ok((read, input_offset)); + } + + if input.len() == input_offset { + // Input exhausted, need to read more + // Without this read_record will interpret the empty input + // byte array as indicating the end of the file + return Ok((read, input_offset)); } } } } - self.reader.consume(input_offset); - input_offset = 0; } - self.reader.consume(input_offset); + } + + /// Returns the current number of buffered records + pub fn len(&self) -> usize { + self.num_rows + } + + /// Returns true if the decoder is empty + pub fn is_empty(&self) -> bool { + self.num_rows == 0 + } + + /// Clears the current contents of the decoder + pub fn clear(&mut self) { + // This does not reset current_field to allow clearing part way through a record + self.offsets_len = 1; + self.data_len = 0; + self.num_rows = 0; + } + + /// Flushes the current 
contents of the reader + pub fn flush(&mut self) -> Result, ArrowError> { + if self.current_field != 0 { + return Err(ArrowError::CsvError( + "Cannot flush part way through record".to_string(), + )); + } // csv_core::Reader writes end offsets relative to the start of the row // Therefore scan through and offset these based on the cumulative row offsets let mut row_offset = 0; - self.offsets[1..] - .chunks_mut(self.num_columns) + self.offsets[1..self.offsets_len] + .chunks_exact_mut(self.num_columns) .for_each(|row| { let offset = row_offset; row.iter_mut().for_each(|x| { @@ -139,48 +191,23 @@ impl RecordReader { }); }); - self.line_number += read; - - Ok(read) - } - - /// Skips forward `to_skip` rows, returning an error if insufficient lines in source - pub fn skip(&mut self, to_skip: usize) -> Result<(), ArrowError> { - // TODO: This could be done by scanning for unquoted newline delimiters - let mut skipped = 0; - while to_skip > skipped { - let read = self.fill_buf(to_skip.min(1024))?; - if read == 0 { - return Err(ArrowError::CsvError(format!( - "Failed to skip {to_skip} rows only found {skipped}" - ))); - } - - skipped += read; - } - Ok(()) - } - - /// Reads up to `to_read` rows from the reader - pub fn read(&mut self, to_read: usize) -> Result, ArrowError> { - let num_rows = self.fill_buf(to_read)?; - - // Need to slice fields to the actual number of rows read - // - // We intentionally avoid using `Vec::truncate` to avoid having - // to re-initialize the data again - let num_fields = num_rows * self.num_columns; - let last_offset = self.offsets[num_fields]; - - // Need to truncate data to the actual amount of data read - let data = std::str::from_utf8(&self.data[..last_offset]).map_err(|e| { + // Need to truncate data t1o the actual amount of data read + let data = std::str::from_utf8(&self.data[..self.data_len]).map_err(|e| { ArrowError::CsvError(format!("Encountered invalid UTF-8 data: {e}")) })?; + let offsets = &self.offsets[..self.offsets_len]; + let num_rows = self.num_rows; + + // Reset state + self.offsets_len = 1; + self.data_len = 0; + self.num_rows = 0; + Ok(StringRecords { num_rows, num_columns: self.num_columns, - offsets: &self.offsets[..num_fields + 1], + offsets, data, }) } @@ -208,10 +235,6 @@ impl<'a> StringRecords<'a> { self.num_rows } - pub fn is_empty(&self) -> bool { - self.num_rows == 0 - } - pub fn iter(&self) -> impl Iterator> + '_ { (0..self.num_rows).map(|x| self.get(x)) } @@ -237,9 +260,9 @@ impl<'a> StringRecord<'a> { #[cfg(test)] mod tests { - use crate::reader::records::RecordReader; + use crate::reader::records::RecordDecoder; use csv_core::Reader; - use std::io::Cursor; + use std::io::{BufRead, BufReader, Cursor}; #[test] fn test_basic() { @@ -259,30 +282,43 @@ mod tests { ] .into_iter(); - let cursor = Cursor::new(csv.as_bytes()); - let mut reader = RecordReader::new(cursor, Reader::new(), 3); + let mut reader = BufReader::with_capacity(3, Cursor::new(csv.as_bytes())); + let mut decoder = RecordDecoder::new(Reader::new(), 3); loop { - let b = reader.read(3).unwrap(); - if b.is_empty() { + let to_read = 3; + let mut read = 0; + loop { + let buf = reader.fill_buf().unwrap(); + let (records, bytes) = decoder.decode(buf, to_read - read).unwrap(); + + reader.consume(bytes); + read += records; + + if read == to_read || bytes == 0 { + break; + } + } + if read == 0 { break; } + let b = decoder.flush().unwrap(); b.iter().zip(&mut expected).for_each(|(record, expected)| { let actual = (0..3) .map(|field_idx| record.get(field_idx)) .collect::>(); 
assert_eq!(actual, expected) - }) + }); } + assert!(expected.next().is_none()); } #[test] fn test_invalid_fields() { let csv = "a,b\nb,c\na\n"; - let cursor = Cursor::new(csv.as_bytes()); - let mut reader = RecordReader::new(cursor, Reader::new(), 2); - let err = reader.read(4).unwrap_err().to_string(); + let mut decoder = RecordDecoder::new(Reader::new(), 2); + let err = decoder.decode(csv.as_bytes(), 4).unwrap_err().to_string(); let expected = "Csv error: incorrect number of fields for line 3, expected 2 got 1"; @@ -290,19 +326,22 @@ mod tests { assert_eq!(err, expected); // Test with initial skip - let cursor = Cursor::new(csv.as_bytes()); - let mut reader = RecordReader::new(cursor, Reader::new(), 2); - reader.skip(1).unwrap(); - let err = reader.read(4).unwrap_err().to_string(); + let mut decoder = RecordDecoder::new(Reader::new(), 2); + let (skipped, bytes) = decoder.decode(csv.as_bytes(), 1).unwrap(); + assert_eq!(skipped, 1); + decoder.clear(); + + let remaining = &csv.as_bytes()[bytes..]; + let err = decoder.decode(remaining, 3).unwrap_err().to_string(); assert_eq!(err, expected); } #[test] fn test_skip_insufficient_rows() { let csv = "a\nv\n"; - let cursor = Cursor::new(csv.as_bytes()); - let mut reader = RecordReader::new(cursor, Reader::new(), 1); - let err = reader.skip(3).unwrap_err().to_string(); - assert_eq!(err, "Csv error: Failed to skip 3 rows only found 2"); + let mut decoder = RecordDecoder::new(Reader::new(), 1); + let (read, bytes) = decoder.decode(csv.as_bytes(), 3).unwrap(); + assert_eq!(read, 2); + assert_eq!(bytes, csv.len()); } } From 92376a76570f9a22f3c37637e24f6cdc8dae8f85 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner <14581281+iajoiner@users.noreply.github.com> Date: Fri, 27 Jan 2023 10:57:41 -0500 Subject: [PATCH 0554/1411] Update to 32.0.0 and update changelog (#3611) * Update version * update change log * fix doc in the update_change_log script --- CHANGELOG-old.md | 76 ++++++++++++ CHANGELOG.md | 124 +++++++++---------- arrow-arith/Cargo.toml | 10 +- arrow-array/Cargo.toml | 8 +- arrow-buffer/Cargo.toml | 2 +- arrow-cast/Cargo.toml | 12 +- arrow-csv/Cargo.toml | 12 +- arrow-data/Cargo.toml | 6 +- arrow-flight/Cargo.toml | 14 +-- arrow-flight/README.md | 2 +- arrow-integration-test/Cargo.toml | 6 +- arrow-integration-testing/Cargo.toml | 2 +- arrow-ipc/Cargo.toml | 12 +- arrow-json/Cargo.toml | 12 +- arrow-ord/Cargo.toml | 12 +- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow-row/Cargo.toml | 14 +-- arrow-schema/Cargo.toml | 2 +- arrow-select/Cargo.toml | 10 +- arrow-string/Cargo.toml | 12 +- arrow/Cargo.toml | 28 ++--- arrow/README.md | 2 +- dev/release/README.md | 2 +- dev/release/update_change_log.sh | 6 +- parquet/Cargo.toml | 20 +-- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 28 files changed, 250 insertions(+), 174 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 2bf0aef992f8..65a95579e9f8 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,82 @@ # Historical Changelog +## [31.0.0](https://github.com/apache/arrow-rs/tree/31.0.0) (2023-01-13) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/30.0.1...31.0.0) + +**Breaking changes:** + +- support RFC3339 style timestamps in `arrow-json` [\#3449](https://github.com/apache/arrow-rs/pull/3449) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JayjeetAtGithub](https://github.com/JayjeetAtGithub)) +- Improve arrow flight batch splitting and naming 
[\#3444](https://github.com/apache/arrow-rs/pull/3444) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Parquet record API: timestamp as signed integer [\#3437](https://github.com/apache/arrow-rs/pull/3437) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([ByteBaker](https://github.com/ByteBaker)) +- Support decimal int32/64 for writer [\#3431](https://github.com/apache/arrow-rs/pull/3431) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liukun4515](https://github.com/liukun4515)) + +**Implemented enhancements:** + +- Support casting Date32 to timestamp [\#3504](https://github.com/apache/arrow-rs/issues/3504) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support casting strings like `'2001-01-01'` to timestamp [\#3492](https://github.com/apache/arrow-rs/issues/3492) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- CLI to "rewrite" parquet files [\#3476](https://github.com/apache/arrow-rs/issues/3476) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add more dictionary value type support to `build_compare` [\#3465](https://github.com/apache/arrow-rs/issues/3465) +- Allow `concat_batches` to take non owned RecordBatch [\#3456](https://github.com/apache/arrow-rs/issues/3456) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Release Arrow `30.0.1` \(maintenance release for `30.0.0`\) [\#3455](https://github.com/apache/arrow-rs/issues/3455) +- Add string comparisons \(starts\_with, ends\_with, and contains\) to kernel [\#3442](https://github.com/apache/arrow-rs/issues/3442) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- make\_builder Loses Timezone and Decimal Scale Information [\#3435](https://github.com/apache/arrow-rs/issues/3435) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use RFC3339 style timestamps in arrow-json [\#3416](https://github.com/apache/arrow-rs/issues/3416) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- ArrayData`get_slice_memory_size` or similar [\#3407](https://github.com/apache/arrow-rs/issues/3407) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Fixed bugs:** + +- Unable to read CSV with null boolean value [\#3521](https://github.com/apache/arrow-rs/issues/3521) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make consistent behavior on zeros equality on floating point types [\#3509](https://github.com/apache/arrow-rs/issues/3509) +- Sliced batch w/ bool column doesn't roundtrip through IPC [\#3496](https://github.com/apache/arrow-rs/issues/3496) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- take kernel on List array introduces nulls instead of empty lists [\#3471](https://github.com/apache/arrow-rs/issues/3471) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Infinite Loop If Skipping More CSV Lines than Present [\#3469](https://github.com/apache/arrow-rs/issues/3469) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Fix reading null booleans from CSV [\#3523](https://github.com/apache/arrow-rs/pull/3523) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- minor fix: use the unified decimal type builder [\#3522](https://github.com/apache/arrow-rs/pull/3522) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liukun4515](https://github.com/liukun4515)) +- Update version to `31.0.0` and add changelog [\#3518](https://github.com/apache/arrow-rs/pull/3518) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) +- Additional nullif re-export [\#3515](https://github.com/apache/arrow-rs/pull/3515) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Make consistent behavior on zeros equality on floating point types [\#3510](https://github.com/apache/arrow-rs/pull/3510) ([viirya](https://github.com/viirya)) +- Enable cast Date32 to Timestamp [\#3508](https://github.com/apache/arrow-rs/pull/3508) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Update prost-build requirement from =0.11.5 to =0.11.6 [\#3507](https://github.com/apache/arrow-rs/pull/3507) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- minor fix for the comments [\#3505](https://github.com/apache/arrow-rs/pull/3505) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Fix DataTypeLayout for LargeList [\#3503](https://github.com/apache/arrow-rs/pull/3503) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add string comparisons \(starts\_with, ends\_with, and contains\) to kernel [\#3502](https://github.com/apache/arrow-rs/pull/3502) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([snmvaughan](https://github.com/snmvaughan)) +- Add a function to get memory size of array slice [\#3501](https://github.com/apache/arrow-rs/pull/3501) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Fix IPCWriter for Sliced BooleanArray [\#3498](https://github.com/apache/arrow-rs/pull/3498) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Fix: Added support to cast string without time [\#3494](https://github.com/apache/arrow-rs/pull/3494) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gaelwjl](https://github.com/gaelwjl)) +- Fix negative interval prettyprint [\#3491](https://github.com/apache/arrow-rs/pull/3491) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Fixes a broken link in the arrow lib.rs rustdoc [\#3487](https://github.com/apache/arrow-rs/pull/3487) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([AdamGS](https://github.com/AdamGS)) +- Refactoring build\_compare for decimal and using downcast\_primitive [\#3484](https://github.com/apache/arrow-rs/pull/3484) ([viirya](https://github.com/viirya)) +- Add tests for record batch size splitting logic in FlightClient [\#3481](https://github.com/apache/arrow-rs/pull/3481) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- change `concat_batches` parameter to non owned reference [\#3480](https://github.com/apache/arrow-rs/pull/3480) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- feat: add `parquet-rewrite` CLI [\#3477](https://github.com/apache/arrow-rs/pull/3477) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([crepererum](https://github.com/crepererum)) +- Preserve empty list array elements in take kernel [\#3473](https://github.com/apache/arrow-rs/pull/3473) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonmmease](https://github.com/jonmmease)) +- Add a test for stream writer for writing sliced array [\#3472](https://github.com/apache/arrow-rs/pull/3472) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix CSV infinite loop and improve error messages [\#3470](https://github.com/apache/arrow-rs/pull/3470) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add more dictionary value type support to `build_compare` [\#3466](https://github.com/apache/arrow-rs/pull/3466) ([viirya](https://github.com/viirya)) +- Add tests for `FlightClient::{list_flights, list_actions, do_action, get_schema}` [\#3463](https://github.com/apache/arrow-rs/pull/3463) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Minor: add ticket links to failing ipc integration tests [\#3461](https://github.com/apache/arrow-rs/pull/3461) ([alamb](https://github.com/alamb)) +- feat: `column_name` based index access for `RecordBatch` and `StructArray` [\#3458](https://github.com/apache/arrow-rs/pull/3458) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Support Decimal256 in FFI [\#3453](https://github.com/apache/arrow-rs/pull/3453) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Remove multiversion dependency [\#3452](https://github.com/apache/arrow-rs/pull/3452) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Re-export nullif kernel [\#3451](https://github.com/apache/arrow-rs/pull/3451) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Meaningful error message for map builder with null keys [\#3450](https://github.com/apache/arrow-rs/pull/3450) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) +- Parquet writer v2: clear buffer after page flush [\#3447](https://github.com/apache/arrow-rs/pull/3447) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([askoa](https://github.com/askoa)) +- Verify ArrayData::data\_type compatible in PrimitiveArray::from [\#3440](https://github.com/apache/arrow-rs/pull/3440) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Preserve DataType metadata in make\_builder [\#3438](https://github.com/apache/arrow-rs/pull/3438) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Consolidate arrow ipc tests and increase coverage [\#3427](https://github.com/apache/arrow-rs/pull/3427) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Generic bytes dictionary builder [\#3426](https://github.com/apache/arrow-rs/pull/3426) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Minor: Improve docs for arrow-ipc, remove clippy ignore [\#3421](https://github.com/apache/arrow-rs/pull/3421) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- refactor: convert `*like_dyn`, 
`*like_utf8_scalar_dyn` and `*like_dict` functions to macros [\#3411](https://github.com/apache/arrow-rs/pull/3411) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Add parquet-index binary [\#3405](https://github.com/apache/arrow-rs/pull/3405) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Complete mid-level `FlightClient` [\#3402](https://github.com/apache/arrow-rs/pull/3402) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Implement `RecordBatch` \<--\> `FlightData` encode/decode + tests [\#3391](https://github.com/apache/arrow-rs/pull/3391) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Provide `into_builder` for bytearray [\#3326](https://github.com/apache/arrow-rs/pull/3326) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) + ## [30.0.1](https://github.com/apache/arrow-rs/tree/30.0.1) (2023-01-04) [Full Changelog](https://github.com/apache/arrow-rs/compare/30.0.0...30.0.1) diff --git a/CHANGELOG.md b/CHANGELOG.md index 109453c306a5..19b1fe68dbee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,81 +19,81 @@ # Changelog -## [31.0.0](https://github.com/apache/arrow-rs/tree/31.0.0) (2023-01-13) +## [32.0.0](https://github.com/apache/arrow-rs/tree/32.0.0) (2023-01-26) -[Full Changelog](https://github.com/apache/arrow-rs/compare/30.0.1...31.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/31.0.0...32.0.0) **Breaking changes:** -- support RFC3339 style timestamps in `arrow-json` [\#3449](https://github.com/apache/arrow-rs/pull/3449) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JayjeetAtGithub](https://github.com/JayjeetAtGithub)) -- Improve arrow flight batch splitting and naming [\#3444](https://github.com/apache/arrow-rs/pull/3444) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Parquet record API: timestamp as signed integer [\#3437](https://github.com/apache/arrow-rs/pull/3437) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([ByteBaker](https://github.com/ByteBaker)) -- Support decimal int32/64 for writer [\#3431](https://github.com/apache/arrow-rs/pull/3431) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liukun4515](https://github.com/liukun4515)) +- Allow `StringArray` construction with `Vec>` [\#3602](https://github.com/apache/arrow-rs/pull/3602) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sinistersnare](https://github.com/sinistersnare)) +- Use native types in PageIndex \(\#3575\) [\#3578](https://github.com/apache/arrow-rs/pull/3578) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add external variant to ParquetError \(\#3285\) [\#3574](https://github.com/apache/arrow-rs/pull/3574) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Return reference from ListArray::values [\#3561](https://github.com/apache/arrow-rs/pull/3561) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: Add `RunEndEncodedArray` [\#3553](https://github.com/apache/arrow-rs/pull/3553) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) **Implemented enhancements:** -- Support casting Date32 to timestamp [\#3504](https://github.com/apache/arrow-rs/issues/3504) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support casting strings like `'2001-01-01'` to timestamp [\#3492](https://github.com/apache/arrow-rs/issues/3492) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- CLI to "rewrite" parquet files [\#3476](https://github.com/apache/arrow-rs/issues/3476) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Add more dictionary value type support to `build_compare` [\#3465](https://github.com/apache/arrow-rs/issues/3465) -- Allow `concat_batches` to take non owned RecordBatch [\#3456](https://github.com/apache/arrow-rs/issues/3456) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Release Arrow `30.0.1` \(maintenance release for `30.0.0`\) [\#3455](https://github.com/apache/arrow-rs/issues/3455) -- Add string comparisons \(starts\_with, ends\_with, and contains\) to kernel [\#3442](https://github.com/apache/arrow-rs/issues/3442) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- make\_builder Loses Timezone and Decimal Scale Information [\#3435](https://github.com/apache/arrow-rs/issues/3435) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use RFC3339 style timestamps in arrow-json [\#3416](https://github.com/apache/arrow-rs/issues/3416) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- ArrayData`get_slice_memory_size` or similar [\#3407](https://github.com/apache/arrow-rs/issues/3407) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- There should be a `From>>` impl for `GenericStringArray` [\#3599](https://github.com/apache/arrow-rs/issues/3599) +- FlightDataEncoder Optionally send Schema even when no record batches [\#3591](https://github.com/apache/arrow-rs/issues/3591) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- \[object\_store\] support more identity based auth flows for azure [\#3580](https://github.com/apache/arrow-rs/issues/3580) +- Use Native Types in PageIndex [\#3575](https://github.com/apache/arrow-rs/issues/3575) +- Packing array into dictionary of generic byte array [\#3571](https://github.com/apache/arrow-rs/issues/3571) +- Implement `Error::Source` for ArrowError and FlightError [\#3566](https://github.com/apache/arrow-rs/issues/3566) +- \[FlightSQL\] Allow access to underlying FlightClient [\#3551](https://github.com/apache/arrow-rs/issues/3551) +- Arrow CSV writer should not fail when cannot cast the value [\#3547](https://github.com/apache/arrow-rs/issues/3547) +- Implement workload identity and application default credentials for GCP object store. 
[\#3533](https://github.com/apache/arrow-rs/issues/3533) +- Write Deprecated Min Max Statistics When ColumnOrder Signed [\#3526](https://github.com/apache/arrow-rs/issues/3526) +- Support footer kv metadata for IPC file [\#3432](https://github.com/apache/arrow-rs/issues/3432) +- Add `External` variant to ParquetError [\#3285](https://github.com/apache/arrow-rs/issues/3285) **Fixed bugs:** -- Unable to read CSV with null boolean value [\#3521](https://github.com/apache/arrow-rs/issues/3521) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Make consistent behavior on zeros equality on floating point types [\#3509](https://github.com/apache/arrow-rs/issues/3509) -- Sliced batch w/ bool column doesn't roundtrip through IPC [\#3496](https://github.com/apache/arrow-rs/issues/3496) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- take kernel on List array introduces nulls instead of empty lists [\#3471](https://github.com/apache/arrow-rs/issues/3471) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Infinite Loop If Skipping More CSV Lines than Present [\#3469](https://github.com/apache/arrow-rs/issues/3469) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Nullif of NULL Predicate is not NULL [\#3589](https://github.com/apache/arrow-rs/issues/3589) +- BooleanBufferBuilder Fails to Clear Set Bits On Truncate [\#3587](https://github.com/apache/arrow-rs/issues/3587) +- `nullif` incorrectly calculates `null_count`, sometimes panics with substraction overflow error [\#3579](https://github.com/apache/arrow-rs/issues/3579) +- Meet warning when use pyarrow [\#3543](https://github.com/apache/arrow-rs/issues/3543) +- Incorrect row group total\_byte\_size written to parquet file [\#3530](https://github.com/apache/arrow-rs/issues/3530) +- Overflow when casting timestamps prior to the epoch [\#3512](https://github.com/apache/arrow-rs/issues/3512) + +**Closed issues:** + +- Panic on Key Overflow in Dictionary Builders [\#3562](https://github.com/apache/arrow-rs/issues/3562) +- Bumping version gives compilation error \(arrow-array\) [\#3525](https://github.com/apache/arrow-rs/issues/3525) +- Release Arrow `31.0.0` \(next release after `30.0.0`\) [\#3415](https://github.com/apache/arrow-rs/issues/3415) **Merged pull requests:** -- Fix reading null booleans from CSV [\#3523](https://github.com/apache/arrow-rs/pull/3523) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- minor fix: use the unified decimal type builder [\#3522](https://github.com/apache/arrow-rs/pull/3522) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liukun4515](https://github.com/liukun4515)) -- Update version to `31.0.0` and add changelog [\#3518](https://github.com/apache/arrow-rs/pull/3518) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) -- Additional nullif re-export [\#3515](https://github.com/apache/arrow-rs/pull/3515) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Make consistent behavior on zeros equality on floating point types [\#3510](https://github.com/apache/arrow-rs/pull/3510) ([viirya](https://github.com/viirya)) -- Enable cast Date32 to Timestamp 
[\#3508](https://github.com/apache/arrow-rs/pull/3508) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) -- Update prost-build requirement from =0.11.5 to =0.11.6 [\#3507](https://github.com/apache/arrow-rs/pull/3507) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- minor fix for the comments [\#3505](https://github.com/apache/arrow-rs/pull/3505) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) -- Fix DataTypeLayout for LargeList [\#3503](https://github.com/apache/arrow-rs/pull/3503) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add string comparisons \(starts\_with, ends\_with, and contains\) to kernel [\#3502](https://github.com/apache/arrow-rs/pull/3502) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([snmvaughan](https://github.com/snmvaughan)) -- Add a function to get memory size of array slice [\#3501](https://github.com/apache/arrow-rs/pull/3501) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Fix IPCWriter for Sliced BooleanArray [\#3498](https://github.com/apache/arrow-rs/pull/3498) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) -- Fix: Added support to cast string without time [\#3494](https://github.com/apache/arrow-rs/pull/3494) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gaelwjl](https://github.com/gaelwjl)) -- Fix negative interval prettyprint [\#3491](https://github.com/apache/arrow-rs/pull/3491) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) -- Fixes a broken link in the arrow lib.rs rustdoc [\#3487](https://github.com/apache/arrow-rs/pull/3487) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([AdamGS](https://github.com/AdamGS)) -- Refactoring build\_compare for decimal and using downcast\_primitive [\#3484](https://github.com/apache/arrow-rs/pull/3484) ([viirya](https://github.com/viirya)) -- Add tests for record batch size splitting logic in FlightClient [\#3481](https://github.com/apache/arrow-rs/pull/3481) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- change `concat_batches` parameter to non owned reference [\#3480](https://github.com/apache/arrow-rs/pull/3480) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- feat: add `parquet-rewrite` CLI [\#3477](https://github.com/apache/arrow-rs/pull/3477) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([crepererum](https://github.com/crepererum)) -- Preserve empty list array elements in take kernel [\#3473](https://github.com/apache/arrow-rs/pull/3473) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonmmease](https://github.com/jonmmease)) -- Add a test for stream writer for writing sliced array [\#3472](https://github.com/apache/arrow-rs/pull/3472) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fix CSV infinite loop and improve error messages [\#3470](https://github.com/apache/arrow-rs/pull/3470) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add more dictionary value type support to `build_compare` 
[\#3466](https://github.com/apache/arrow-rs/pull/3466) ([viirya](https://github.com/viirya)) -- Add tests for `FlightClient::{list_flights, list_actions, do_action, get_schema}` [\#3463](https://github.com/apache/arrow-rs/pull/3463) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Minor: add ticket links to failing ipc integration tests [\#3461](https://github.com/apache/arrow-rs/pull/3461) ([alamb](https://github.com/alamb)) -- feat: `column_name` based index access for `RecordBatch` and `StructArray` [\#3458](https://github.com/apache/arrow-rs/pull/3458) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Support Decimal256 in FFI [\#3453](https://github.com/apache/arrow-rs/pull/3453) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Remove multiversion dependency [\#3452](https://github.com/apache/arrow-rs/pull/3452) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Re-export nullif kernel [\#3451](https://github.com/apache/arrow-rs/pull/3451) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Meaningful error message for map builder with null keys [\#3450](https://github.com/apache/arrow-rs/pull/3450) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Jefffrey](https://github.com/Jefffrey)) -- Parquet writer v2: clear buffer after page flush [\#3447](https://github.com/apache/arrow-rs/pull/3447) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([askoa](https://github.com/askoa)) -- Verify ArrayData::data\_type compatible in PrimitiveArray::from [\#3440](https://github.com/apache/arrow-rs/pull/3440) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Preserve DataType metadata in make\_builder [\#3438](https://github.com/apache/arrow-rs/pull/3438) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Consolidate arrow ipc tests and increase coverage [\#3427](https://github.com/apache/arrow-rs/pull/3427) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Generic bytes dictionary builder [\#3426](https://github.com/apache/arrow-rs/pull/3426) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Minor: Improve docs for arrow-ipc, remove clippy ignore [\#3421](https://github.com/apache/arrow-rs/pull/3421) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- refactor: convert `*like_dyn`, `*like_utf8_scalar_dyn` and `*like_dict` functions to macros [\#3411](https://github.com/apache/arrow-rs/pull/3411) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Add parquet-index binary [\#3405](https://github.com/apache/arrow-rs/pull/3405) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Complete mid-level `FlightClient` [\#3402](https://github.com/apache/arrow-rs/pull/3402) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Implement `RecordBatch` \<--\> `FlightData` encode/decode + tests [\#3391](https://github.com/apache/arrow-rs/pull/3391) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Provide `into_builder` for bytearray [\#3326](https://github.com/apache/arrow-rs/pull/3326) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update to flatbuffers 23.1.21 [\#3597](https://github.com/apache/arrow-rs/pull/3597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster BooleanBufferBuilder::append\_n for true values [\#3596](https://github.com/apache/arrow-rs/pull/3596) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support sending schemas for empty streams [\#3594](https://github.com/apache/arrow-rs/pull/3594) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Faster ListArray to StringArray conversion [\#3593](https://github.com/apache/arrow-rs/pull/3593) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add conversion from StringArray to BinaryArray [\#3592](https://github.com/apache/arrow-rs/pull/3592) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix nullif null count \(\#3579\) [\#3590](https://github.com/apache/arrow-rs/pull/3590) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Clear bits in BooleanBufferBuilder \(\#3587\) [\#3588](https://github.com/apache/arrow-rs/pull/3588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Iterate all dictionary key types in cast test [\#3585](https://github.com/apache/arrow-rs/pull/3585) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Propagate EOF Error from AsyncRead [\#3576](https://github.com/apache/arrow-rs/pull/3576) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Sach1nAgarwal](https://github.com/Sach1nAgarwal)) +- Show row\_counts also for \(FixedLen\)ByteArray [\#3573](https://github.com/apache/arrow-rs/pull/3573) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([bmmeijers](https://github.com/bmmeijers)) +- Packing array into dictionary of generic byte array [\#3572](https://github.com/apache/arrow-rs/pull/3572) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Remove unwrap on datetime cast for CSV writer [\#3570](https://github.com/apache/arrow-rs/pull/3570) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Implement `std::error::Error::source` for `ArrowError` and `FlightError` [\#3567](https://github.com/apache/arrow-rs/pull/3567) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Improve GenericBytesBuilder offset overflow panic message \(\#139\) [\#3564](https://github.com/apache/arrow-rs/pull/3564) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement Extend for ArrayBuilder \(\#1841\) [\#3563](https://github.com/apache/arrow-rs/pull/3563) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update pyarrow method call with kwargs 
[\#3560](https://github.com/apache/arrow-rs/pull/3560) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Frankonly](https://github.com/Frankonly)) +- Update pyo3 requirement from 0.17 to 0.18 [\#3557](https://github.com/apache/arrow-rs/pull/3557) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Expose Inner FlightServiceClient on FlightSqlServiceClient \(\#3551\) [\#3556](https://github.com/apache/arrow-rs/pull/3556) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Fix final page row count in parquet-index binary [\#3554](https://github.com/apache/arrow-rs/pull/3554) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Parquet Avoid Reading 8 Byte Footer Twice from AsyncRead [\#3550](https://github.com/apache/arrow-rs/pull/3550) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Sach1nAgarwal](https://github.com/Sach1nAgarwal)) +- Improve concat kernel capacity estimation [\#3546](https://github.com/apache/arrow-rs/pull/3546) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.49 to =1.0.50 [\#3545](https://github.com/apache/arrow-rs/pull/3545) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update pyarrow method call to avoid warning [\#3544](https://github.com/apache/arrow-rs/pull/3544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Frankonly](https://github.com/Frankonly)) +- Enable casting between Utf8/LargeUtf8 and Binary/LargeBinary [\#3542](https://github.com/apache/arrow-rs/pull/3542) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Use GHA concurrency groups \(\#3495\) [\#3538](https://github.com/apache/arrow-rs/pull/3538) ([tustvold](https://github.com/tustvold)) +- set sum of uncompressed column size as row group size for parquet files [\#3531](https://github.com/apache/arrow-rs/pull/3531) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([sidred](https://github.com/sidred)) +- Minor: Add documentation about memory use for ArrayData [\#3529](https://github.com/apache/arrow-rs/pull/3529) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Upgrade to clap 4.1 + fix test [\#3528](https://github.com/apache/arrow-rs/pull/3528) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Write backwards compatible row group statistics \(\#3526\) [\#3527](https://github.com/apache/arrow-rs/pull/3527) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- No panic on timestamp buffer overflow [\#3519](https://github.com/apache/arrow-rs/pull/3519) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Support casting from binary to dictionary of binary [\#3482](https://github.com/apache/arrow-rs/pull/3482) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index 67f82c05821a..774bb11bb090 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-arith" -version = "31.0.0" +version = "32.0.0" 
description = "Arrow arithmetic kernels" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "31.0.0", path = "../arrow-array" } -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } -arrow-data = { version = "31.0.0", path = "../arrow-data" } -arrow-schema = { version = "31.0.0", path = "../arrow-schema" } +arrow-array = { version = "32.0.0", path = "../arrow-array" } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow-data = { version = "32.0.0", path = "../arrow-data" } +arrow-schema = { version = "32.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 155dcc412c25..c109db36973d 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-array" -version = "31.0.0" +version = "32.0.0" description = "Array abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,9 +45,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "31.0.0", path = "../arrow-schema" } -arrow-data = { version = "31.0.0", path = "../arrow-data" } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "32.0.0", path = "../arrow-schema" } +arrow-data = { version = "32.0.0", path = "../arrow-data" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index bef5a7a8a875..d46e2f11a1d3 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "31.0.0" +version = "32.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index cab1edcccdf2..2ce83e856806 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-cast" -version = "31.0.0" +version = "32.0.0" description = "Cast kernel and utilities for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "31.0.0", path = "../arrow-array" } -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } -arrow-data = { version = "31.0.0", path = "../arrow-data" } -arrow-schema = { version = "31.0.0", path = "../arrow-schema" } -arrow-select = { version = "31.0.0", path = "../arrow-select" } +arrow-array = { version = "32.0.0", path = "../arrow-array" } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow-data = { version = "32.0.0", path = "../arrow-data" } +arrow-schema = { version = "32.0.0", path = "../arrow-schema" } +arrow-select = { version = 
"32.0.0", path = "../arrow-select" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 674e159074d6..517ffa33f9f0 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-csv" -version = "31.0.0" +version = "32.0.0" description = "Support for parsing CSV format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "31.0.0", path = "../arrow-array" } -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "31.0.0", path = "../arrow-cast" } -arrow-data = { version = "31.0.0", path = "../arrow-data" } -arrow-schema = { version = "31.0.0", path = "../arrow-schema" } +arrow-array = { version = "32.0.0", path = "../arrow-array" } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "32.0.0", path = "../arrow-cast" } +arrow-data = { version = "32.0.0", path = "../arrow-data" } +arrow-schema = { version = "32.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } csv-core = { version = "0.1"} diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index fc5839522e82..42e1f43bf30d 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-data" -version = "31.0.0" +version = "32.0.0" description = "Array data abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,8 +45,8 @@ force_validate = [] [dependencies] -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "31.0.0", path = "../arrow-schema" } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "32.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index bad94457bcc0..e5a900a0dd25 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "31.0.0" +version = "32.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,12 +27,12 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow-array = { version = "31.0.0", path = "../arrow-array" } -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow-array = { version = "32.0.0", path = "../arrow-array" } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } # Cast is needed to work around https://github.com/apache/arrow-rs/issues/3389 -arrow-cast = { version = "31.0.0", path = "../arrow-cast" } -arrow-ipc = { version = "31.0.0", path = "../arrow-ipc" } -arrow-schema = { version = "31.0.0", path = "../arrow-schema" } +arrow-cast = { version = "32.0.0", path = "../arrow-cast" } +arrow-ipc = { version = "32.0.0", path = "../arrow-ipc" } +arrow-schema = 
{ version = "32.0.0", path = "../arrow-schema" } base64 = { version = "0.21", default-features = false, features = ["std"] } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } @@ -50,7 +50,7 @@ flight-sql-experimental = [] tls = ["tonic/tls"] [dev-dependencies] -arrow = { version = "31.0.0", path = "../arrow", features = ["prettyprint"] } +arrow = { version = "32.0.0", path = "../arrow", features = ["prettyprint"] } tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } tower = "0.4.13" diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 5159d5499fa2..cb543c956d72 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "31.0.0" +arrow-flight = "32.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index 6177c7b37b82..35b088b1636f 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-integration-test" -version = "31.0.0" +version = "32.0.0" description = "Support for the Apache Arrow JSON test data format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,8 +38,8 @@ path = "src/lib.rs" bench = false [dependencies] -arrow = { version = "31.0.0", path = "../arrow", default-features = false } -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } +arrow = { version = "32.0.0", path = "../arrow", default-features = false } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 3f07da7d3fa8..46b2bb3691eb 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests (NOT PUBLISHED TO crates.io)" -version = "31.0.0" +version = "32.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index d2274e8956e9..79b34a7b4601 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ipc" -version = "31.0.0" +version = "32.0.0" description = "Support for the Arrow IPC format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "31.0.0", path = "../arrow-array" } -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "31.0.0", path = "../arrow-cast" } -arrow-data = { version = "31.0.0", path = "../arrow-data" } -arrow-schema = { version = "31.0.0", path = "../arrow-schema" } +arrow-array = { version = "32.0.0", 
path = "../arrow-array" } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "32.0.0", path = "../arrow-cast" } +arrow-data = { version = "32.0.0", path = "../arrow-data" } +arrow-schema = { version = "32.0.0", path = "../arrow-schema" } flatbuffers = { version = "23.1.21", default-features = false } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.12.0", default-features = false, optional = true } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index c6aa9b486cdc..2a3a7ec1731f 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-json" -version = "31.0.0" +version = "32.0.0" description = "Support for parsing JSON format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "31.0.0", path = "../arrow-array" } -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "31.0.0", path = "../arrow-cast" } -arrow-data = { version = "31.0.0", path = "../arrow-data" } -arrow-schema = { version = "31.0.0", path = "../arrow-schema" } +arrow-array = { version = "32.0.0", path = "../arrow-array" } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "32.0.0", path = "../arrow-cast" } +arrow-data = { version = "32.0.0", path = "../arrow-data" } +arrow-schema = { version = "32.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index a8f9fcdf82ae..b029c8b91303 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ord" -version = "31.0.0" +version = "32.0.0" description = "Ordering kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "31.0.0", path = "../arrow-array" } -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } -arrow-data = { version = "31.0.0", path = "../arrow-data" } -arrow-schema = { version = "31.0.0", path = "../arrow-schema" } -arrow-select = { version = "31.0.0", path = "../arrow-select" } +arrow-array = { version = "32.0.0", path = "../arrow-array" } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow-data = { version = "32.0.0", path = "../arrow-data" } +arrow-schema = { version = "32.0.0", path = "../arrow-schema" } +arrow-select = { version = "32.0.0", path = "../arrow-select" } num = { version = "0.4", default-features = false, features = ["std"] } [dev-dependencies] diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 7a2dc563a1ac..02a96cf68fd4 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "31.0.0" +version = "32.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type 
= ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "31.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "32.0.0", features = ["pyarrow"] } pyo3 = { version = "0.18", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 436f6d04b427..f82e499cc302 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-row" -version = "31.0.0" +version = "32.0.0" description = "Arrow row format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,17 +44,17 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "31.0.0", path = "../arrow-array" } -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } -arrow-data = { version = "31.0.0", path = "../arrow-data" } -arrow-schema = { version = "31.0.0", path = "../arrow-schema" } +arrow-array = { version = "32.0.0", path = "../arrow-array" } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow-data = { version = "32.0.0", path = "../arrow-data" } +arrow-schema = { version = "32.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } hashbrown = { version = "0.13", default-features = false } [dev-dependencies] -arrow-cast = { version = "31.0.0", path = "../arrow-cast" } -arrow-ord = { version = "31.0.0", path = "../arrow-ord" } +arrow-cast = { version = "32.0.0", path = "../arrow-cast" } +arrow-ord = { version = "32.0.0", path = "../arrow-ord" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } [features] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 8ccf565bf5e5..c36305b0b283 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-schema" -version = "31.0.0" +version = "32.0.0" description = "Defines the logical types for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml index a1ba58900b65..8a8af0dbb825 100644 --- a/arrow-select/Cargo.toml +++ b/arrow-select/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-select" -version = "31.0.0" +version = "32.0.0" description = "Selection kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } -arrow-data = { version = "31.0.0", path = "../arrow-data" } -arrow-schema = { version = "31.0.0", path = "../arrow-schema" } -arrow-array = { version = "31.0.0", path = "../arrow-array" } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow-data = { version = "32.0.0", path = "../arrow-data" } +arrow-schema = { version = "32.0.0", path = "../arrow-schema" } +arrow-array = { version = "32.0.0", path = "../arrow-array" } num = { version = "0.4", default-features = false, features = ["std"] } [features] diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index f62ec919d5fc..47740275c0fd 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-string" -version = "31.0.0" +version = "32.0.0" description = "String 
kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } -arrow-data = { version = "31.0.0", path = "../arrow-data" } -arrow-schema = { version = "31.0.0", path = "../arrow-schema" } -arrow-array = { version = "31.0.0", path = "../arrow-array" } -arrow-select = { version = "31.0.0", path = "../arrow-select" } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow-data = { version = "32.0.0", path = "../arrow-data" } +arrow-schema = { version = "32.0.0", path = "../arrow-schema" } +arrow-array = { version = "32.0.0", path = "../arrow-array" } +arrow-select = { version = "32.0.0", path = "../arrow-select" } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index bb67bfc400e1..6de513df3653 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "31.0.0" +version = "32.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,19 +45,19 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-arith = { version = "31.0.0", path = "../arrow-arith" } -arrow-array = { version = "31.0.0", path = "../arrow-array" } -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "31.0.0", path = "../arrow-cast" } -arrow-csv = { version = "31.0.0", path = "../arrow-csv", optional = true } -arrow-data = { version = "31.0.0", path = "../arrow-data" } -arrow-ipc = { version = "31.0.0", path = "../arrow-ipc", optional = true } -arrow-json = { version = "31.0.0", path = "../arrow-json", optional = true } -arrow-ord = { version = "31.0.0", path = "../arrow-ord" } -arrow-row = { version = "31.0.0", path = "../arrow-row" } -arrow-schema = { version = "31.0.0", path = "../arrow-schema" } -arrow-select = { version = "31.0.0", path = "../arrow-select" } -arrow-string = { version = "31.0.0", path = "../arrow-string" } +arrow-arith = { version = "32.0.0", path = "../arrow-arith" } +arrow-array = { version = "32.0.0", path = "../arrow-array" } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "32.0.0", path = "../arrow-cast" } +arrow-csv = { version = "32.0.0", path = "../arrow-csv", optional = true } +arrow-data = { version = "32.0.0", path = "../arrow-data" } +arrow-ipc = { version = "32.0.0", path = "../arrow-ipc", optional = true } +arrow-json = { version = "32.0.0", path = "../arrow-json", optional = true } +arrow-ord = { version = "32.0.0", path = "../arrow-ord" } +arrow-row = { version = "32.0.0", path = "../arrow-row" } +arrow-schema = { version = "32.0.0", path = "../arrow-schema" } +arrow-select = { version = "32.0.0", path = "../arrow-select" } +arrow-string = { version = "32.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } comfy-table = { version = "6.0", optional = true, default-features = false } diff --git a/arrow/README.md b/arrow/README.md index d0c7785821e6..68598078cfd8 
100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `31.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `32.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. ## Feature Flags diff --git a/dev/release/README.md b/dev/release/README.md index 1fcc0862f6bc..f86513822762 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. -sed -i '' -e 's/14.0.0/31.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/32.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 236809ed5f85..2b8396347355 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -25,12 +25,12 @@ # arrow-rs/.github_changelog_generator # # Usage: -# CHANGELOG_GITHUB_TOKEN= ./update_change_log.sh +# ARROW_GITHUB_API_TOKEN= ./update_change_log.sh set -e -SINCE_TAG="30.0.0" -FUTURE_RELEASE="31.0.0" +SINCE_TAG="31.0.0" +FUTURE_RELEASE="32.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 43bd52beeb18..a112ec354e8d 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "31.0.0" +version = "32.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -35,14 +35,14 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "31.0.0", path = "../arrow-array", default-features = false, optional = true } -arrow-buffer = { version = "31.0.0", path = "../arrow-buffer", default-features = false, optional = true } -arrow-cast = { version = "31.0.0", path = "../arrow-cast", default-features = false, optional = true } -arrow-csv = { version = "31.0.0", path = "../arrow-csv", default-features = false, optional = true } -arrow-data = { version = "31.0.0", path = "../arrow-data", default-features = false, optional = true } -arrow-schema = { version = "31.0.0", path = "../arrow-schema", default-features = false, optional = true } -arrow-select = { version = "31.0.0", path = "../arrow-select", default-features = false, optional = true } -arrow-ipc = { 
version = "31.0.0", path = "../arrow-ipc", default-features = false, optional = true } +arrow-array = { version = "32.0.0", path = "../arrow-array", default-features = false, optional = true } +arrow-buffer = { version = "32.0.0", path = "../arrow-buffer", default-features = false, optional = true } +arrow-cast = { version = "32.0.0", path = "../arrow-cast", default-features = false, optional = true } +arrow-csv = { version = "32.0.0", path = "../arrow-csv", default-features = false, optional = true } +arrow-data = { version = "32.0.0", path = "../arrow-data", default-features = false, optional = true } +arrow-schema = { version = "32.0.0", path = "../arrow-schema", default-features = false, optional = true } +arrow-select = { version = "32.0.0", path = "../arrow-select", default-features = false, optional = true } +arrow-ipc = { version = "32.0.0", path = "../arrow-ipc", default-features = false, optional = true } object_store = { version = "0.5", path = "../object_store", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } @@ -76,7 +76,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.12", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "31.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "32.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index 8234d02d4c49..3fdcd66f248c 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "31.0.0" +version = "32.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", features = ["extra-traits"] } -parquet = { path = "../parquet", version = "31.0.0", default-features = false } +parquet = { path = "../parquet", version = "32.0.0", default-features = false } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index 72e2568e4b19..14d3c066c7e9 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "31.0.0" -parquet_derive = "31.0.0" +parquet = "32.0.0" +parquet_derive = "32.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index 47f5a54b3bab..e3306e7c4659 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "31.0.0" +version = "32.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "31.0.0", default-features = false 
} -parquet_derive = { path = "../parquet_derive", version = "31.0.0", default-features = false } +parquet = { path = "../parquet", version = "32.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "32.0.0", default-features = false } chrono = { version="0.4.23", default-features = false, features = [ "clock" ] } From 53a650a5daa5e7a181852b8a0cde9fe8fe66b9a7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 27 Jan 2023 16:27:20 +0000 Subject: [PATCH 0555/1411] Final tweaks to 32.0.0 changelog (#3618) --- CHANGELOG.md | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 19b1fe68dbee..656c86eaf524 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ # Changelog -## [32.0.0](https://github.com/apache/arrow-rs/tree/32.0.0) (2023-01-26) +## [32.0.0](https://github.com/apache/arrow-rs/tree/32.0.0) (2023-01-27) [Full Changelog](https://github.com/apache/arrow-rs/compare/31.0.0...32.0.0) @@ -33,36 +33,35 @@ **Implemented enhancements:** -- There should be a `From>>` impl for `GenericStringArray` [\#3599](https://github.com/apache/arrow-rs/issues/3599) +- There should be a `From>>` impl for `GenericStringArray` [\#3599](https://github.com/apache/arrow-rs/issues/3599) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - FlightDataEncoder Optionally send Schema even when no record batches [\#3591](https://github.com/apache/arrow-rs/issues/3591) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- \[object\_store\] support more identity based auth flows for azure [\#3580](https://github.com/apache/arrow-rs/issues/3580) -- Use Native Types in PageIndex [\#3575](https://github.com/apache/arrow-rs/issues/3575) -- Packing array into dictionary of generic byte array [\#3571](https://github.com/apache/arrow-rs/issues/3571) -- Implement `Error::Source` for ArrowError and FlightError [\#3566](https://github.com/apache/arrow-rs/issues/3566) -- \[FlightSQL\] Allow access to underlying FlightClient [\#3551](https://github.com/apache/arrow-rs/issues/3551) -- Arrow CSV writer should not fail when cannot cast the value [\#3547](https://github.com/apache/arrow-rs/issues/3547) -- Implement workload identity and application default credentials for GCP object store. 
[\#3533](https://github.com/apache/arrow-rs/issues/3533) -- Write Deprecated Min Max Statistics When ColumnOrder Signed [\#3526](https://github.com/apache/arrow-rs/issues/3526) +- Use Native Types in PageIndex [\#3575](https://github.com/apache/arrow-rs/issues/3575) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Packing array into dictionary of generic byte array [\#3571](https://github.com/apache/arrow-rs/issues/3571) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement `Error::Source` for ArrowError and FlightError [\#3566](https://github.com/apache/arrow-rs/issues/3566) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- \[FlightSQL\] Allow access to underlying FlightClient [\#3551](https://github.com/apache/arrow-rs/issues/3551) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Arrow CSV writer should not fail when cannot cast the value [\#3547](https://github.com/apache/arrow-rs/issues/3547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Write Deprecated Min Max Statistics When ColumnOrder Signed [\#3526](https://github.com/apache/arrow-rs/issues/3526) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Improve Performance of JSON Reader [\#3441](https://github.com/apache/arrow-rs/issues/3441) - Support footer kv metadata for IPC file [\#3432](https://github.com/apache/arrow-rs/issues/3432) -- Add `External` variant to ParquetError [\#3285](https://github.com/apache/arrow-rs/issues/3285) +- Add `External` variant to ParquetError [\#3285](https://github.com/apache/arrow-rs/issues/3285) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Fixed bugs:** - Nullif of NULL Predicate is not NULL [\#3589](https://github.com/apache/arrow-rs/issues/3589) -- BooleanBufferBuilder Fails to Clear Set Bits On Truncate [\#3587](https://github.com/apache/arrow-rs/issues/3587) -- `nullif` incorrectly calculates `null_count`, sometimes panics with substraction overflow error [\#3579](https://github.com/apache/arrow-rs/issues/3579) -- Meet warning when use pyarrow [\#3543](https://github.com/apache/arrow-rs/issues/3543) -- Incorrect row group total\_byte\_size written to parquet file [\#3530](https://github.com/apache/arrow-rs/issues/3530) -- Overflow when casting timestamps prior to the epoch [\#3512](https://github.com/apache/arrow-rs/issues/3512) +- BooleanBufferBuilder Fails to Clear Set Bits On Truncate [\#3587](https://github.com/apache/arrow-rs/issues/3587) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `nullif` incorrectly calculates `null_count`, sometimes panics with substraction overflow error [\#3579](https://github.com/apache/arrow-rs/issues/3579) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Meet warning when use pyarrow [\#3543](https://github.com/apache/arrow-rs/issues/3543) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Incorrect row group total\_byte\_size written to parquet file [\#3530](https://github.com/apache/arrow-rs/issues/3530) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Overflow when casting timestamps prior to the epoch [\#3512](https://github.com/apache/arrow-rs/issues/3512) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Closed issues:** -- Panic on Key Overflow in Dictionary Builders [\#3562](https://github.com/apache/arrow-rs/issues/3562) +- Panic on Key Overflow in Dictionary Builders 
[\#3562](https://github.com/apache/arrow-rs/issues/3562) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Bumping version gives compilation error \(arrow-array\) [\#3525](https://github.com/apache/arrow-rs/issues/3525) -- Release Arrow `31.0.0` \(next release after `30.0.0`\) [\#3415](https://github.com/apache/arrow-rs/issues/3415) **Merged pull requests:** +- Add Push-Based CSV Decoder [\#3604](https://github.com/apache/arrow-rs/pull/3604) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Update to flatbuffers 23.1.21 [\#3597](https://github.com/apache/arrow-rs/pull/3597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Faster BooleanBufferBuilder::append\_n for true values [\#3596](https://github.com/apache/arrow-rs/pull/3596) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Support sending schemas for empty streams [\#3594](https://github.com/apache/arrow-rs/pull/3594) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) @@ -94,6 +93,7 @@ - Write backwards compatible row group statistics \(\#3526\) [\#3527](https://github.com/apache/arrow-rs/pull/3527) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) - No panic on timestamp buffer overflow [\#3519](https://github.com/apache/arrow-rs/pull/3519) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) - Support casting from binary to dictionary of binary [\#3482](https://github.com/apache/arrow-rs/pull/3482) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add Raw JSON Reader \(~2.5x faster\) [\#3479](https://github.com/apache/arrow-rs/pull/3479) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) From eeecbe548a4c2fe06964ab3813365af4f8f39c6a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 27 Jan 2023 18:06:58 +0000 Subject: [PATCH 0556/1411] Update AWS SDK (#3617) --- object_store/Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 8c9ede087b33..c685685b1346 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -53,9 +53,9 @@ ring = { version = "0.16", default-features = false, features = ["std"], optiona rustls-pemfile = { version = "1.0", default-features = false, optional = true } # AWS Profile support -aws-types = { version = "0.53", optional = true } -aws-credential-types = { version = "0.53", optional = true } -aws-config = { version = "0.53", optional = true } +aws-types = { version = "0.54", optional = true } +aws-credential-types = { version = "0.54", optional = true } +aws-config = { version = "0.54", optional = true } [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] From 8cc8327696e5f1bd5e647ab7e9fc874abf938b6d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 28 Jan 2023 00:13:29 -0800 Subject: [PATCH 0557/1411] Casting generic binary to generic string (#3607) * Casting generic binary to generic string * For CastOptions.safe as false case, applying optimized casting * Remove offset 
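For orientation, the user-facing behaviour this cast change targets can be sketched roughly as follows. This is an illustrative example only (the input values are made up), assuming the `arrow` crate with default features; the default `CastOptions` are "safe", so invalid UTF-8 becomes a null entry, while `cast_with_options` with `safe = false` would surface an error instead.

```rust
// Illustrative sketch, not part of this patch: Binary -> Utf8 via the cast kernel.
use arrow::array::{Array, BinaryArray, StringArray};
use arrow::compute::cast;
use arrow::datatypes::DataType;

fn main() {
    // Binary values that happen to be valid UTF-8, plus a null slot.
    let binary = BinaryArray::from_opt_vec(vec![
        Some(b"hello".as_ref()),
        None,
        Some(b"arrow".as_ref()),
    ]);

    // With the default (safe) options, invalid UTF-8 would become null
    // rather than returning an error.
    let strings = cast(&binary, &DataType::Utf8).unwrap();
    let strings = strings.as_any().downcast_ref::<StringArray>().unwrap();

    assert_eq!(strings.value(0), "hello");
    assert!(strings.is_null(1));
    assert_eq!(strings.value(2), "arrow");
}
```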
--- arrow-cast/src/cast.rs | 129 +++++++++++++++++++++++++---------------- 1 file changed, 78 insertions(+), 51 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index aec665aa3013..9f20dceb980a 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -156,8 +156,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Utf8, LargeUtf8) => true, (LargeUtf8, Utf8) => true, - (Binary, LargeBinary) => true, - (LargeBinary, Binary) => true, + (Binary, LargeBinary | Utf8 | LargeUtf8) => true, + (LargeBinary, Binary | Utf8 | LargeUtf8) => true, (Utf8, Binary | LargeBinary @@ -185,7 +185,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Timestamp(_, _), Utf8) | (Timestamp(_, _), LargeUtf8) => true, (Date32, Utf8) | (Date32, LargeUtf8) => true, (Date64, Utf8) | (Date64, LargeUtf8) => true, - (_, Utf8 | LargeUtf8) => (DataType::is_numeric(from_type) && from_type != &Float16) || from_type == &Binary, + (_, Utf8 | LargeUtf8) => DataType::is_numeric(from_type) && from_type != &Float16, // start numeric casts ( @@ -1180,30 +1180,8 @@ pub fn cast_with_options( } Date32 => cast_date32_to_string::(array), Date64 => cast_date64_to_string::(array), - Binary => { - let array = array.as_any().downcast_ref::().unwrap(); - Ok(Arc::new( - array - .iter() - .map(|maybe_value| match maybe_value { - Some(value) => { - let result = std::str::from_utf8(value); - if cast_options.safe { - Ok(result.ok()) - } else { - Some(result.map_err(|_| { - ArrowError::CastError( - "Cannot cast binary to string".to_string(), - ) - })) - .transpose() - } - } - None => Ok(None), - }) - .collect::>()?, - )) - } + Binary => cast_binary_to_generic_string::(array, cast_options), + LargeBinary => cast_binary_to_generic_string::(array, cast_options), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -1236,30 +1214,8 @@ pub fn cast_with_options( } Date32 => cast_date32_to_string::(array), Date64 => cast_date64_to_string::(array), - Binary => { - let array = array.as_any().downcast_ref::().unwrap(); - Ok(Arc::new( - array - .iter() - .map(|maybe_value| match maybe_value { - Some(value) => { - let result = std::str::from_utf8(value); - if cast_options.safe { - Ok(result.ok()) - } else { - Some(result.map_err(|_| { - ArrowError::CastError( - "Cannot cast binary to string".to_string(), - ) - })) - .transpose() - } - } - None => Ok(None), - }) - .collect::>()?, - )) - } + Binary => cast_binary_to_generic_string::(array, cast_options), + LargeBinary => cast_binary_to_generic_string::(array, cast_options), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -3436,6 +3392,77 @@ fn cast_list_inner( Ok(Arc::new(list) as ArrayRef) } +/// Helper function to cast from `GenericBinaryArray` to `GenericStringArray`. This function performs +/// UTF8 validation during casting. For invalid UTF8 value, it could be Null or returning `Err` depending +/// `CastOptions`. 
+fn cast_binary_to_generic_string( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result +where + I: OffsetSizeTrait + ToPrimitive, + O: OffsetSizeTrait + NumCast, +{ + let array = array + .as_any() + .downcast_ref::>>() + .unwrap(); + + if !cast_options.safe { + let offsets = array.value_offsets(); + let values = array.value_data(); + + // We only need to validate that all values are valid UTF-8 + let validated = std::str::from_utf8(values) + .map_err(|_| ArrowError::CastError("Invalid UTF-8 sequence".to_string()))?; + + let mut offset_builder = BufferBuilder::::new(offsets.len()); + offsets + .iter() + .try_for_each::<_, Result<_, ArrowError>>(|offset| { + if !validated.is_char_boundary(offset.as_usize()) { + return Err(ArrowError::CastError( + "Invalid UTF-8 sequence".to_string(), + )); + } + + let offset = ::from(*offset).ok_or_else(|| { + ArrowError::ComputeError(format!( + "{}Binary array too large to cast to {}String array", + I::PREFIX, + O::PREFIX + )) + })?; + offset_builder.append(offset); + Ok(()) + })?; + + let offset_buffer = offset_builder.finish(); + + let builder = ArrayData::builder(GenericStringArray::::DATA_TYPE) + .len(array.len()) + .add_buffer(offset_buffer) + .add_buffer(array.data().buffers()[1].clone()) + .null_count(array.null_count()) + .null_bit_buffer(array.data().null_buffer().cloned()); + + // SAFETY: + // Validated UTF-8 above + Ok(Arc::new(GenericStringArray::::from(unsafe { + builder.build_unchecked() + }))) + } else { + Ok(Arc::new( + array + .iter() + .map(|maybe_value| { + maybe_value.and_then(|value| std::str::from_utf8(value).ok()) + }) + .collect::>>(), + )) + } +} + /// Helper function to cast from one `ByteArrayType` to another and vice versa. /// If the target one (e.g., `LargeUtf8`) is too large for the source array it will return an Error. 
fn cast_byte_container( From f9a78e0e812f250a696a9f8aa8b97db8d38953d0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 29 Jan 2023 00:05:29 +0100 Subject: [PATCH 0558/1411] Minor: Add test for dictionary encoding of batches (#3608) * Minor: Add test for dictionary encoding of batches * Update arrow-flight/tests/encode_decode.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-flight/tests/encode_decode.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index 1990e5b0cbb1..25e74cb3b6bc 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -123,6 +123,34 @@ async fn test_zero_batches_schema_specified() { assert_eq!(decoder.schema(), Some(&schema)); } +#[tokio::test] +async fn test_zero_batches_dictonary_schema_specified() { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, false), + Field::new( + "b", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + ])); + + // Expect dictionary to be hydrated in output (#3389) + let expected_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, false), + Field::new("b", DataType::Utf8, false), + ])); + let stream = FlightDataEncoderBuilder::default() + .with_schema(schema.clone()) + .build(futures::stream::iter(vec![])); + + let mut decoder = FlightRecordBatchStream::new_from_flight_data(stream); + assert!(decoder.schema().is_none()); + // No batches come out + assert!(decoder.next().await.is_none()); + // But schema has been received correctly + assert_eq!(decoder.schema(), Some(&expected_schema)); +} + #[tokio::test] async fn test_app_metadata() { let input_batch_stream = futures::stream::iter(vec![Ok(make_primative_batch(78))]); From e6418cbbcffaf855775559c4a073c01f0b38afdd Mon Sep 17 00:00:00 2001 From: Kevin Schiroo Date: Sun, 29 Jan 2023 11:03:43 -0600 Subject: [PATCH 0559/1411] Fix typo in comment (#3627) Switching Janiary to January. --- parquet/src/record/api.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index f3511c03df83..49fdc3fc71d4 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -525,7 +525,7 @@ pub enum Field { Date(i32), /// Milliseconds from the Unix epoch, 1 January 1970. TimestampMillis(i64), - /// Microseconds from the Unix epoch, 1 Janiary 1970. + /// Microseconds from the Unix epoch, 1 January 1970. 
TimestampMicros(i64), // ---------------------------------------------------------------------- From 02c695fed9e62dc9a3feb47372eed01e983e4c44 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 29 Jan 2023 18:38:54 +0000 Subject: [PATCH 0560/1411] Remove BitSliceIterator specialization from try_for_each_valid_idx (#3621) --- arrow-data/src/bit_iterator.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arrow-data/src/bit_iterator.rs b/arrow-data/src/bit_iterator.rs index 45a42c3910f7..ea95f1f38b01 100644 --- a/arrow-data/src/bit_iterator.rs +++ b/arrow-data/src/bit_iterator.rs @@ -186,14 +186,7 @@ pub fn try_for_each_valid_idx Result<(), E>>( if valid_count == len { (0..len).try_for_each(f) } else if null_count != len { - let selectivity = valid_count as f64 / len as f64; - if selectivity > 0.8 { - BitSliceIterator::new(nulls.unwrap(), offset, len) - .flat_map(|(start, end)| start..end) - .try_for_each(f) - } else { - BitIndexIterator::new(nulls.unwrap(), offset, len).try_for_each(f) - } + BitIndexIterator::new(nulls.unwrap(), offset, len).try_for_each(f) } else { Ok(()) } From 266e2cf28242c5cd814bf6c47be45485674043e1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 29 Jan 2023 18:40:26 +0000 Subject: [PATCH 0561/1411] Reduce PrimitiveArray::try_unary codegen (#3619) --- arrow-array/src/array/primitive_array.rs | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index a757eb7dd4c1..dfe076306178 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -494,17 +494,6 @@ impl PrimitiveArray { let len = self.len(); let null_count = self.null_count(); - if null_count == 0 { - let values = self.values().iter().map(|v| op(*v)); - // JUSTIFICATION - // Benefit - // ~60% speedup - // Soundness - // `values` is an iterator with a known size because arrays are sized. - let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; - return Ok(unsafe { build_primitive_array(len, buffer, 0, None) }); - } - let null_buffer = data.null_buffer().map(|b| b.bit_slice(data.offset(), len)); let mut buffer = BufferBuilder::::new(len); buffer.append_n_zeroed(len); From 3c8d8db68fdbf94b82d6b178e10c8580c0c75d04 Mon Sep 17 00:00:00 2001 From: "chunshao.rcs" Date: Mon, 30 Jan 2023 14:24:27 +0800 Subject: [PATCH 0562/1411] chore: delete wrong comment and refactor set_metadata in `Field` (#3630) --- arrow-schema/src/field.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 8dcb8cea9e7c..b687b629aa75 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -142,13 +142,9 @@ impl Field { } /// Sets the `Field`'s optional custom metadata. - /// The metadata is set as `None` for empty map. 
#[inline] pub fn set_metadata(&mut self, metadata: HashMap) { - self.metadata = HashMap::default(); - if !metadata.is_empty() { - self.metadata = metadata; - } + self.metadata = metadata; } /// Sets the metadata of this `Field` to be `metadata` and returns self From 3057fa5f63516672b74a75741f1b97851fe1eafd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=AD=E5=B7=8D?= Date: Mon, 30 Jan 2023 23:16:56 +0800 Subject: [PATCH 0563/1411] Improve error messge with detailed schema (#3637) Signed-off-by: Veeupup <931418134@qq.com> --- arrow-select/src/concat.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index be6b0a063275..e463c12a8856 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -106,7 +106,12 @@ pub fn concat_batches<'a>( .find(|&(_, batch)| batch.schema() != *schema) { return Err(ArrowError::InvalidArgumentError(format!( - "batches[{i}] schema is different with argument schema." + "batches[{i}] schema is different with argument schema. + batches[{i}] schema: {:?}, + argument schema: {:?} + ", + batches[i].schema(), + *schema ))); } let field_num = schema.fields().len(); @@ -647,7 +652,7 @@ mod tests { let error = concat_batches(&schema1, [&batch1, &batch2]).unwrap_err(); assert_eq!( error.to_string(), - "Invalid argument error: batches[1] schema is different with argument schema.", + "Invalid argument error: batches[1] schema is different with argument schema.\n batches[1] schema: Schema { fields: [Field { name: \"c\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"d\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} },\n argument schema: Schema { fields: [Field { name: \"a\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"b\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }\n " ); } From a76ea1c8e2a40902300185ce8122a19f85ff550b Mon Sep 17 00:00:00 2001 From: Dan Harris <1327726+thinkharderdev@users.noreply.github.com> Date: Mon, 30 Jan 2023 19:03:50 +0200 Subject: [PATCH 0564/1411] Add limit to ArrowReaderBuilder to push limit down to parquet reader (#3633) * Add limit to ArrowReaderBuilder to push limit down to parquet reader * Update parquet/src/arrow/arrow_reader/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * pr comments * Apply limit to entire file instead of each row group --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- parquet/src/arrow/arrow_reader/mod.rs | 58 +++++++- parquet/src/arrow/arrow_reader/selection.rs | 82 ++++++++++- parquet/src/arrow/async_reader/mod.rs | 147 +++++++++++++++++++- 3 files changed, 282 insertions(+), 5 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 87165ef8e575..c4b645da7ce5 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -69,6 +69,8 @@ pub struct ArrowReaderBuilder { pub(crate) filter: Option, pub(crate) selection: Option, + + pub(crate) limit: Option, } impl ArrowReaderBuilder { @@ -98,6 +100,7 @@ impl ArrowReaderBuilder { projection: ProjectionMask::all(), filter: None, selection: None, + limit: None, }) } @@ -167,6 +170,17 @@ impl ArrowReaderBuilder { ..self } } + + /// Provide a limit to the number of rows to be read + /// + 
/// The limit will be applied after any [`Self::with_row_selection`] and [`Self::with_row_filter`] + /// allowing it to limit the final set of rows decoded after any pushed down predicates + pub fn with_limit(self, limit: usize) -> Self { + Self { + limit: Some(limit), + ..self + } + } } /// Arrow reader api. @@ -453,6 +467,19 @@ impl ArrowReaderBuilder> { selection = Some(RowSelection::from(vec![])); } + // If a limit is defined, apply it to the final `RowSelection` + if let Some(limit) = self.limit { + selection = Some( + selection + .map(|selection| selection.limit(limit)) + .unwrap_or_else(|| { + RowSelection::from(vec![RowSelector::select( + limit.min(reader.num_rows()), + )]) + }), + ); + } + Ok(ParquetRecordBatchReader::new( batch_size, array_reader, @@ -1215,6 +1242,8 @@ mod tests { row_selections: Option<(RowSelection, usize)>, /// row filter row_filter: Option>, + /// limit + limit: Option, } /// Manually implement this to avoid printing entire contents of row_selections and row_filter @@ -1233,6 +1262,7 @@ mod tests { .field("encoding", &self.encoding) .field("row_selections", &self.row_selections.is_some()) .field("row_filter", &self.row_filter.is_some()) + .field("limit", &self.limit) .finish() } } @@ -1252,6 +1282,7 @@ mod tests { encoding: Encoding::PLAIN, row_selections: None, row_filter: None, + limit: None, } } } @@ -1323,6 +1354,13 @@ mod tests { } } + fn with_limit(self, limit: usize) -> Self { + Self { + limit: Some(limit), + ..self + } + } + fn writer_props(&self) -> WriterProperties { let builder = WriterProperties::builder() .set_data_pagesize_limit(self.max_data_page_size) @@ -1381,6 +1419,14 @@ mod tests { TestOptions::new(2, 256, 127).with_null_percent(0), // Test optional with nulls TestOptions::new(2, 256, 93).with_null_percent(25), + // Test with limit of 0 + TestOptions::new(4, 100, 25).with_limit(0), + // Test with limit of 50 + TestOptions::new(4, 100, 25).with_limit(50), + // Test with limit equal to number of rows + TestOptions::new(4, 100, 25).with_limit(10), + // Test with limit larger than number of rows + TestOptions::new(4, 100, 25).with_limit(101), // Test with no page-level statistics TestOptions::new(2, 256, 91) .with_null_percent(25) @@ -1423,6 +1469,11 @@ mod tests { TestOptions::new(2, 256, 93) .with_null_percent(25) .with_row_selections(), + // Test optional with nulls + TestOptions::new(2, 256, 93) + .with_null_percent(25) + .with_row_selections() + .with_limit(10), // Test filter // Test with row filter @@ -1592,7 +1643,7 @@ mod tests { } }; - let expected_data = match opts.row_filter { + let mut expected_data = match opts.row_filter { Some(filter) => { let expected_data = expected_data .into_iter() @@ -1622,6 +1673,11 @@ mod tests { None => expected_data, }; + if let Some(limit) = opts.limit { + builder = builder.with_limit(limit); + expected_data = expected_data.into_iter().take(limit).collect(); + } + let mut record_reader = builder .with_batch_size(opts.record_batch_size) .build() diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 03c7e01e0840..d2af4516dd08 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -19,6 +19,7 @@ use arrow_array::{Array, BooleanArray}; use arrow_select::filter::SlicesIterator; use std::cmp::Ordering; use std::collections::VecDeque; +use std::mem; use std::ops::Range; /// [`RowSelection`] is a collection of [`RowSelector`] used to skip rows when @@ -111,7 +112,7 @@ impl RowSelection { } /// Creates 
a [`RowSelection`] from an iterator of consecutive ranges to keep - fn from_consecutive_ranges>>( + pub(crate) fn from_consecutive_ranges>>( ranges: I, total_rows: usize, ) -> Self { @@ -371,6 +372,32 @@ impl RowSelection { self } + /// Limit this [`RowSelection`] to only select `limit` rows + pub(crate) fn limit(mut self, mut limit: usize) -> Self { + let mut new_selectors = Vec::with_capacity(self.selectors.len()); + for mut selection in mem::take(&mut self.selectors) { + if limit == 0 { + break; + } + + if !selection.skip { + if selection.row_count >= limit { + selection.row_count = limit; + new_selectors.push(selection); + break; + } else { + limit -= selection.row_count; + new_selectors.push(selection); + } + } else { + new_selectors.push(selection); + } + } + + self.selectors = new_selectors; + self + } + /// Returns an iterator over the [`RowSelector`]s for this /// [`RowSelection`]. pub fn iter(&self) -> impl Iterator { @@ -841,6 +868,59 @@ mod tests { assert_eq!(selectors, round_tripped); } + #[test] + fn test_limit() { + // Limit to existing limit should no-op + let selection = + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(90)]); + let limited = selection.limit(10); + assert_eq!(RowSelection::from(vec![RowSelector::select(10)]), limited); + + let selection = RowSelection::from(vec![ + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + ]); + + let limited = selection.clone().limit(5); + let expected = vec![RowSelector::select(5)]; + assert_eq!(limited.selectors, expected); + + let limited = selection.clone().limit(15); + let expected = vec![ + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(5), + ]; + assert_eq!(limited.selectors, expected); + + let limited = selection.clone().limit(0); + let expected = vec![]; + assert_eq!(limited.selectors, expected); + + let limited = selection.clone().limit(30); + let expected = vec![ + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + ]; + assert_eq!(limited.selectors, expected); + + let limited = selection.limit(100); + let expected = vec![ + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + ]; + assert_eq!(limited.selectors, expected); + } + #[test] fn test_scan_ranges() { let index = vec![ diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 0397df206bff..71f95e07a756 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -99,7 +99,7 @@ use arrow_schema::SchemaRef; use crate::arrow::array_reader::{build_array_reader, RowGroupCollection}; use crate::arrow::arrow_reader::{ evaluate_predicate, selects_any, ArrowReaderBuilder, ArrowReaderOptions, - ParquetRecordBatchReader, RowFilter, RowSelection, + ParquetRecordBatchReader, RowFilter, RowSelection, RowSelector, }; use crate::arrow::schema::ParquetField; use crate::arrow::ProjectionMask; @@ -352,6 +352,7 @@ impl ArrowReaderBuilder> { Ok(ParquetRecordBatchStream { metadata: self.metadata, batch_size, + limit: self.limit, row_groups, projection: self.projection, selection: self.selection, @@ -389,6 +390,7 @@ where mut selection: Option, projection: ProjectionMask, batch_size: usize, + limit: Option, ) -> ReadResult { // TODO: calling build_array multiple times is wasteful @@ -430,6 +432,17 @@ where return Ok((self, None)); } 
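A minimal usage sketch of the `with_limit` builder option described above, shown from the synchronous reader side; the file name and limit are assumptions, and the async `ParquetRecordBatchStreamBuilder` exposes the same option.

```rust
// Illustrative sketch, not part of this patch: decoding at most 10 rows.
use std::fs::File;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;

fn read_at_most_ten_rows() -> Result<usize, Box<dyn std::error::Error>> {
    let file = File::open("data.parquet")?; // hypothetical input file
    let reader = ParquetRecordBatchReaderBuilder::try_new(file)?
        .with_batch_size(1024)
        // Applied after any row selection / row filter, as documented above.
        .with_limit(10)
        .build()?;

    let mut rows = 0;
    for batch in reader {
        rows += batch?.num_rows();
    }
    assert!(rows <= 10);
    Ok(rows)
}
```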
+ // If a limit is defined, apply it to the final `RowSelection` + if let Some(limit) = limit { + selection = Some( + selection + .map(|selection| selection.limit(limit)) + .unwrap_or_else(|| { + RowSelection::from(vec![RowSelector::select(limit)]) + }), + ); + } + row_group .fetch(&mut self.input, &projection, selection.as_ref()) .await?; @@ -479,6 +492,8 @@ pub struct ParquetRecordBatchStream { batch_size: usize, + limit: Option, + selection: Option, /// This is an option so it can be moved into a future @@ -519,7 +534,12 @@ where loop { match &mut self.state { StreamState::Decoding(batch_reader) => match batch_reader.next() { - Some(Ok(batch)) => return Poll::Ready(Some(Ok(batch))), + Some(Ok(batch)) => { + if let Some(limit) = self.limit.as_mut() { + *limit -= batch.num_rows(); + } + return Poll::Ready(Some(Ok(batch))); + } Some(Err(e)) => { self.state = StreamState::Error; return Poll::Ready(Some(Err(ParquetError::ArrowError( @@ -548,6 +568,7 @@ where selection, self.projection.clone(), self.batch_size, + self.limit, ) .boxed(); @@ -803,6 +824,7 @@ mod tests { use crate::arrow::ArrowWriter; use crate::file::footer::parse_metadata; use crate::file::page_index::index_reader; + use crate::file::properties::WriterProperties; use arrow::error::Result as ArrowResult; use arrow_array::{Array, ArrayRef, Int32Array, StringArray}; use futures::TryStreamExt; @@ -943,6 +965,70 @@ mod tests { assert_eq!(async_batches, sync_batches); } + #[tokio::test] + async fn test_async_reader_with_limit() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/alltypes_tiny_pages_plain.parquet"); + let data = Bytes::from(std::fs::read(path).unwrap()); + + let metadata = parse_metadata(&data).unwrap(); + let metadata = Arc::new(metadata); + + assert_eq!(metadata.num_row_groups(), 1); + + let async_reader = TestReader { + data: data.clone(), + metadata: metadata.clone(), + requests: Default::default(), + }; + + let options = ArrowReaderOptions::new().with_page_index(true); + let builder = + ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap(); + + // The builder should have page and offset indexes loaded now + let metadata_with_index = builder.metadata(); + + // Check offset indexes are present for all columns + for rg in metadata_with_index.row_groups() { + let page_locations = + rg.page_offset_index().expect("expected page offset index"); + assert_eq!(page_locations.len(), rg.columns().len()) + } + + // Check page indexes are present for all columns + let page_indexes = metadata_with_index + .page_indexes() + .expect("expected page indexes"); + for (idx, rg) in metadata_with_index.row_groups().iter().enumerate() { + assert_eq!(page_indexes[idx].len(), rg.columns().len()) + } + + let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![1, 2]); + let stream = builder + .with_projection(mask.clone()) + .with_batch_size(1024) + .with_limit(1) + .build() + .unwrap(); + + let async_batches: Vec<_> = stream.try_collect().await.unwrap(); + + let sync_batches = ParquetRecordBatchReaderBuilder::try_new(data) + .unwrap() + .with_projection(mask) + .with_batch_size(1024) + .with_limit(1) + .build() + .unwrap() + .collect::>>() + .unwrap(); + + assert_eq!(async_batches, sync_batches); + } + #[tokio::test] async fn test_async_reader_skip_pages() { let testdata = arrow::util::test_util::parquet_test_data(); @@ -1204,6 +1290,61 @@ mod tests { assert_eq!(requests.lock().unwrap().len(), 3); } + #[tokio::test] + async fn 
test_limit_multiple_row_groups() { + let a = StringArray::from_iter_values(["a", "b", "b", "b", "c", "c"]); + let b = StringArray::from_iter_values(["1", "2", "3", "4", "5", "6"]); + let c = Int32Array::from_iter(0..6); + let data = RecordBatch::try_from_iter([ + ("a", Arc::new(a) as ArrayRef), + ("b", Arc::new(b) as ArrayRef), + ("c", Arc::new(c) as ArrayRef), + ]) + .unwrap(); + + let mut buf = Vec::with_capacity(1024); + let props = WriterProperties::builder() + .set_max_row_group_size(3) + .build(); + let mut writer = + ArrowWriter::try_new(&mut buf, data.schema(), Some(props)).unwrap(); + writer.write(&data).unwrap(); + writer.close().unwrap(); + + let data: Bytes = buf.into(); + let metadata = parse_metadata(&data).unwrap(); + + assert_eq!(metadata.num_row_groups(), 2); + + let test = TestReader { + data, + metadata: Arc::new(metadata), + requests: Default::default(), + }; + + let stream = ParquetRecordBatchStreamBuilder::new(test) + .await + .unwrap() + .with_batch_size(1024) + .with_limit(4) + .build() + .unwrap(); + + let batches: Vec<_> = stream.try_collect().await.unwrap(); + // Expect one batch for each row group + assert_eq!(batches.len(), 2); + + let batch = &batches[0]; + // First batch should contain all rows + assert_eq!(batch.num_rows(), 3); + assert_eq!(batch.num_columns(), 3); + + let batch = &batches[1]; + // Second batch should trigger the limit and only have one row + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 3); + } + #[tokio::test] async fn test_row_filter_with_index() { let testdata = arrow::util::test_util::parquet_test_data(); @@ -1330,7 +1471,7 @@ mod tests { let selection = RowSelection::from(selectors); let (_factory, _reader) = reader_factory - .read_row_group(0, Some(selection), projection.clone(), 48) + .read_row_group(0, Some(selection), projection.clone(), 48, None) .await .expect("reading row group"); From 9c955332e97fcc833116d79cfa8c90d4a5678333 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 30 Jan 2023 18:45:00 +0000 Subject: [PATCH 0565/1411] Update labeller for new crates (#3639) --- .github/workflows/dev_pr/labeler.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 35f2a873c6a4..e5b86e8bcdf0 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -16,13 +16,19 @@ # under the License. 
arrow: + - arrow-arith/**/* - arrow-array/**/* - arrow-buffer/**/* - arrow-cast/**/* - arrow-csv/**/* - arrow-data/**/* + - arrow-flight/**/* + - arrow-integration-test/**/* + - arrow-integration-testing/**/* - arrow-ipc/**/* - arrow-json/**/* + - arrow-ord/**/* + - arrow-row/**/* - arrow-schema/**/* - arrow-select/**/* - arrow-string/**/* From f78a9be8b3a7479418cacc5ea6755b1e60c4b03f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 31 Jan 2023 15:04:12 +0100 Subject: [PATCH 0566/1411] Minor: Update doc strings about Page Index / Column Index (#3625) * Minor: Update doc strings about what Page Index / Column Index * tweaks * typos * Apply suggestions from code review * Update parquet/src/file/metadata.rs Co-authored-by: Liang-Chi Hsieh --------- Co-authored-by: Liang-Chi Hsieh --- parquet/src/file/metadata.rs | 29 +++++++++++++++++++-- parquet/src/file/page_encoding_stats.rs | 2 ++ parquet/src/file/page_index/index.rs | 24 ++++++++++++++--- parquet/src/file/page_index/index_reader.rs | 28 +++++++++++++++++--- parquet/src/file/page_index/mod.rs | 4 +++ parquet/src/file/properties.rs | 2 +- parquet/src/file/reader.rs | 5 ++-- parquet/src/file/serialized_reader.rs | 7 +++-- 8 files changed, 86 insertions(+), 15 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 0696b2901267..a83f02dfdf86 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -50,7 +50,25 @@ use crate::schema::types::{ Type as SchemaType, }; +/// [`Index`] for each row group of each column. +/// +/// `column_index[row_group_number][column_number]` holds the +/// [`Index`] corresponding to column `column_number` of row group +/// `row_group_number`. +/// +/// For example `column_index[2][3]` holds the [`Index`] for the forth +/// column in the third row group of the parquet file. pub type ParquetColumnIndex = Vec>; + +/// [`PageLocation`] for each datapage of each row group of each column. +/// +/// `offset_index[row_group_number][column_number][page_number]` holds +/// the [`PageLocation`] corresponding to page `page_number` of column +/// `column_number`of row group `row_group_number`. +/// +/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for +/// the fifth page of the forth column in the third row group of the +/// parquet file. pub type ParquetOffsetIndex = Vec>>; /// Global Parquet metadata. @@ -65,8 +83,8 @@ pub struct ParquetMetaData { } impl ParquetMetaData { - /// Creates Parquet metadata from file metadata and a list of row group metadata `Arc`s - /// for each available row group. + /// Creates Parquet metadata from file metadata and a list of row + /// group metadata pub fn new(file_metadata: FileMetaData, row_groups: Vec) -> Self { ParquetMetaData { file_metadata, @@ -76,6 +94,8 @@ impl ParquetMetaData { } } + /// Creates Parquet metadata from file metadata, a list of row + /// group metadata, and the column index structures. pub fn new_with_page_index( file_metadata: FileMetaData, row_groups: Vec, @@ -232,6 +252,7 @@ pub struct RowGroupMetaData { sorting_columns: Option>, total_byte_size: i64, schema_descr: SchemaDescPtr, + /// `page_offset_index[column_number][page_number]` page_offset_index: Option>>, } @@ -277,6 +298,8 @@ impl RowGroupMetaData { } /// Returns reference of page offset index of all column in this row group. 
+ /// + /// The returned vector contains `page_offset[column_number][page_number]` pub fn page_offset_index(&self) -> Option<&Vec>> { self.page_offset_index.as_ref() } @@ -292,6 +315,8 @@ impl RowGroupMetaData { } /// Sets page offset index for this row group. + /// + /// The vector represents `page_offset[column_number][page_number]` pub fn set_page_offset(&mut self, page_offset: Vec>) { self.page_offset_index = Some(page_offset); } diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs index eb26804784a9..95a73118042f 100644 --- a/parquet/src/file/page_encoding_stats.rs +++ b/parquet/src/file/page_encoding_stats.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Per-page encoding information. + use crate::basic::{Encoding, PageType}; use crate::errors::Result; use crate::format::{ diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 83d55caa4ba9..8f9cb66298b5 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! [`Index`] structures holding decoded [`ColumnIndex`] information + use crate::basic::Type; use crate::data_type::private::ParquetValueType; use crate::data_type::{ByteArray, Int96}; @@ -23,7 +25,14 @@ use crate::format::{BoundaryOrder, ColumnIndex}; use crate::util::bit_util::from_le_slice; use std::fmt::Debug; -/// The statistics in one page +/// PageIndex Statistics for one data page, as described in [Column Index]. +/// +/// One significant difference from the row group level +/// [`Statistics`](crate::format::Statistics) is that page level +/// statistics may not store actual column values as min and max +/// (e.g. they may store truncated strings to save space) +/// +/// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct PageIndex { /// The minimum value, It is None when all values are null @@ -48,6 +57,10 @@ impl PageIndex { #[derive(Debug, Clone, PartialEq)] #[allow(non_camel_case_types)] +/// Typed statistics for a data page in a column chunk. This structure +/// is obtained from decoding the [ColumnIndex] in the parquet file +/// and can be used to skip decoding pages while reading the file +/// data. pub enum Index { /// Sometimes reading page index from parquet file /// will only return pageLocations without min_max index, @@ -90,14 +103,17 @@ impl Index { } } -/// An index of a column of [`Type`] physical representation +/// Stores the [`PageIndex`] for each page of a column with [`Type`] #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct NativeIndex { - /// The physical type + /// The physical type of this column pub physical_type: Type, /// The indexes, one item per page pub indexes: Vec>, - /// the order + /// If the min/max elements are ordered, and if so in which + /// direction. See [source] for details. 
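A hedged sketch of navigating these nested index structures after requesting the page index; the file name is an assumption, and it assumes the `page_indexes` / `offset_indexes` accessors on `ParquetMetaData` (the former appears in the async reader test earlier in this series).

```rust
// Illustrative sketch, not part of this patch: reading and indexing the page index.
use std::fs::File;
use parquet::file::reader::FileReader;
use parquet::file::serialized_reader::{ReadOptionsBuilder, SerializedFileReader};

fn print_first_page_stats() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("data.parquet")?; // hypothetical input file
    let options = ReadOptionsBuilder::new().with_page_index().build();
    let reader = SerializedFileReader::new_with_options(file, options)?;
    let metadata = reader.metadata();

    if let Some(column_index) = metadata.page_indexes() {
        // column_index[row_group_number][column_number], as described above
        println!("first row group, first column: {:?}", column_index[0][0]);
    }
    if let Some(offset_index) = metadata.offset_indexes() {
        // offset_index[row_group_number][column_number][page_number]
        println!("first page location: {:?}", offset_index[0][0][0]);
    }
    Ok(())
}
```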
+ /// + /// [source]: https://github.com/apache/parquet-format/blob/bfc549b93e6927cb1fc425466e4084f76edc6d22/src/main/thrift/parquet.thrift#L959-L964 pub boundary_order: BoundaryOrder, } diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 36b1c9d6c275..3ae37cf87f8b 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Support for reading [`Index`] and [`PageLocation`] from parquet metadata. + use crate::basic::Type; use crate::data_type::Int96; use crate::errors::ParquetError; @@ -25,8 +27,17 @@ use crate::format::{ColumnIndex, OffsetIndex, PageLocation}; use std::io::{Cursor, Read}; use thrift::protocol::{TCompactInputProtocol, TSerializable}; -/// Read on row group's all columns indexes and change into [`Index`] -/// If not the format not available return an empty vector. +/// Reads per-column [`Index`] for all columns of a row group by +/// decoding [`ColumnIndex`] . +/// +/// Returns a vector of `index[column_number]`. +/// +/// Returns an empty vector if this row group does not contain a +/// [`ColumnIndex`]. +/// +/// See [Column Index Documentation] for more details. +/// +/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub fn read_columns_indexes( reader: &R, chunks: &[ColumnChunkMetaData], @@ -60,8 +71,17 @@ pub fn read_columns_indexes( .collect() } -/// Read on row group's all indexes and change into [`Index`] -/// If not the format not available return an empty vector. +/// Reads per-page [`PageLocation`] for all columns of a row group by +/// decoding the [`OffsetIndex`]. +/// +/// Returns a vector of `location[column_number][page_number]` +/// +/// Return an empty vector if this row group does not contain an +/// [`OffsetIndex]`. +/// +/// See [Column Index Documentation] for more details. +/// +/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub fn read_pages_locations( reader: &R, chunks: &[ColumnChunkMetaData], diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs index dcc1120fc4e3..9372645d76ee 100644 --- a/parquet/src/file/page_index/mod.rs +++ b/parquet/src/file/page_index/mod.rs @@ -15,5 +15,9 @@ // specific language governing permissions and limitations // under the License. +//! Page Index of "[Column Index] Layout to Support Page Skipping" +//! +//! [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md + pub mod index; pub mod index_reader; diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index cbd31f9a1f32..2ce0050c938e 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Writer properties. +//! [`WriterProperties`] //! //! # Usage //! diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs index bb82f229927d..545f227090d9 100644 --- a/parquet/src/file/reader.rs +++ b/parquet/src/file/reader.rs @@ -15,8 +15,9 @@ // specific language governing permissions and limitations // under the License. -//! Contains file reader API and provides methods to access file metadata, row group -//! readers to read individual column chunks, or access record iterator. +//! File reader API and methods to access file metadata, row group +//! 
readers to read individual column chunks, or access record +//! iterator. use bytes::Bytes; use std::{boxed::Box, io::Read, sync::Arc}; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 95108ad58af7..e5ed26e9e812 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -189,13 +189,16 @@ impl ReadOptionsBuilder { self } - /// Enable page index in the reading option, + /// Enable reading the page index structures described in + /// "[Column Index] Layout to Support Page Skipping" + /// + /// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub fn with_page_index(mut self) -> Self { self.enable_page_index = true; self } - /// Set the `ReaderProperties` configuration. + /// Set the [`ReaderProperties`] configuration. pub fn with_reader_properties(mut self, properties: ReaderProperties) -> Self { self.props = Some(properties); self From e80d87fbf7fb36fe415ae6927c5550439a85d937 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 31 Jan 2023 17:07:47 +0000 Subject: [PATCH 0567/1411] Reduce Dictionary Builder Codegen (#3616) * Reduce dictionary builder codegen * Clippy * Format --- .../generic_bytes_dictionary_builder.rs | 48 ++++++++----------- .../builder/primitive_dictionary_builder.rs | 11 +++-- arrow-buffer/src/native.rs | 28 +++++++++-- 3 files changed, 51 insertions(+), 36 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 5af41a51948b..dd9a70b1d431 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -40,10 +40,10 @@ where state: ahash::RandomState, /// Used to provide a lookup from string value to key type /// - /// Note: K's hash implementation is not used, instead the raw entry + /// Note: usize's hash implementation is not used, instead the raw entry /// API is used to store keys w.r.t the hash of the strings themselves /// - dedup: HashMap, + dedup: HashMap, keys_builder: PrimitiveBuilder, values_builder: GenericByteBuilder, @@ -133,23 +133,22 @@ where let mut values_builder = GenericByteBuilder::::with_capacity(dict_len, values_len); + K::Native::from_usize(dictionary_values.len()) + .ok_or(ArrowError::DictionaryKeyOverflowError)?; + for (idx, maybe_value) in dictionary_values.iter().enumerate() { match maybe_value { Some(value) => { let value_bytes: &[u8] = value.as_ref(); let hash = state.hash_one(value_bytes); - let key = K::Native::from_usize(idx) - .ok_or(ArrowError::DictionaryKeyOverflowError)?; - - let entry = - dedup.raw_entry_mut().from_hash(hash, |key: &K::Native| { - value_bytes == get_bytes(&values_builder, key) - }); + let entry = dedup.raw_entry_mut().from_hash(hash, |idx: &usize| { + value_bytes == get_bytes(&values_builder, *idx) + }); if let RawEntryMut::Vacant(v) = entry { - v.insert_with_hasher(hash, key, (), |key| { - state.hash_one(get_bytes(&values_builder, key)) + v.insert_with_hasher(hash, idx, (), |idx| { + state.hash_one(get_bytes(&values_builder, *idx)) }); } @@ -233,21 +232,20 @@ where let entry = self .dedup .raw_entry_mut() - .from_hash(hash, |key| value_bytes == get_bytes(storage, key)); + .from_hash(hash, |idx| value_bytes == get_bytes(storage, *idx)); let key = match entry { - RawEntryMut::Occupied(entry) => *entry.into_key(), + RawEntryMut::Occupied(entry) => 
K::Native::usize_as(*entry.into_key()), RawEntryMut::Vacant(entry) => { - let index = storage.len(); + let idx = storage.len(); storage.append_value(value); - let key = K::Native::from_usize(index) - .ok_or(ArrowError::DictionaryKeyOverflowError)?; - - *entry - .insert_with_hasher(hash, key, (), |key| { - state.hash_one(get_bytes(storage, key)) - }) - .0 + + entry.insert_with_hasher(hash, idx, (), |idx| { + state.hash_one(get_bytes(storage, *idx)) + }); + + K::Native::from_usize(idx) + .ok_or(ArrowError::DictionaryKeyOverflowError)? } }; self.keys_builder.append_value(key); @@ -330,14 +328,10 @@ impl> Extend( - values: &'a GenericByteBuilder, - key: &K, -) -> &'a [u8] { +fn get_bytes(values: &GenericByteBuilder, idx: usize) -> &[u8] { let offsets = values.offsets_slice(); let values = values.values_slice(); - let idx = key.as_usize(); let end_offset = offsets[idx + 1].as_usize(); let start_offset = offsets[idx].as_usize(); diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index f44f0e30602e..00187cddef18 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -86,7 +86,7 @@ where { keys_builder: PrimitiveBuilder, values_builder: PrimitiveBuilder, - map: HashMap, K::Native>, + map: HashMap, usize>, } impl Default for PrimitiveDictionaryBuilder @@ -180,13 +180,13 @@ where let key = match self.map.entry(Value(value)) { Entry::Vacant(vacant) => { // Append new value. - let key = K::Native::from_usize(self.values_builder.len()) - .ok_or(ArrowError::DictionaryKeyOverflowError)?; + let key = self.values_builder.len(); self.values_builder.append_value(value); vacant.insert(key); - key + K::Native::from_usize(key) + .ok_or(ArrowError::DictionaryKeyOverflowError)? } - Entry::Occupied(o) => *o.get(), + Entry::Occupied(o) => K::Native::usize_as(*o.get()), }; self.keys_builder.append_value(key); @@ -198,6 +198,7 @@ where /// # Panics /// /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX` + #[inline] pub fn append_value(&mut self, value: V::Native) { self.append(value).expect("dictionary key overflow"); } diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs index 6ac11a16f4d3..4ea06974bb0b 100644 --- a/arrow-buffer/src/native.rs +++ b/arrow-buffer/src/native.rs @@ -58,6 +58,11 @@ pub trait ArrowNativeType: /// [`as`]: https://doc.rust-lang.org/reference/expressions/operator-expr.html#numeric-cast fn as_usize(self) -> usize; + /// Convert from usize according to the [`as`] operator + /// + /// [`as`]: https://doc.rust-lang.org/reference/expressions/operator-expr.html#numeric-cast + fn usize_as(i: usize) -> Self; + /// Convert native type to usize. /// /// Returns `None` if [`Self`] is not an integer or conversion would result @@ -119,6 +124,12 @@ macro_rules! native_integer { self as _ } + #[inline] + fn usize_as(i: usize) -> Self { + i as _ + } + + $( #[inline] fn $from(v: $t) -> Option { @@ -140,7 +151,7 @@ native_integer!(u32); native_integer!(u64); macro_rules! native_float { - ($t:ty, $s:ident, $as_usize: expr) => { + ($t:ty, $s:ident, $as_usize: expr, $i:ident, $usize_as: expr) => { impl private::Sealed for $t {} impl ArrowNativeType for $t { #[inline] @@ -162,13 +173,18 @@ macro_rules! 
native_float { fn as_usize($s) -> usize { $as_usize } + + #[inline] + fn usize_as($i: usize) -> Self { + $usize_as + } } }; } -native_float!(f16, self, self.to_f32() as _); -native_float!(f32, self, self as _); -native_float!(f64, self, self as _); +native_float!(f16, self, self.to_f32() as _, i, f16::from_f32(i as _)); +native_float!(f32, self, self as _, i, i as _); +native_float!(f64, self, self as _, i, i as _); impl private::Sealed for i256 {} impl ArrowNativeType for i256 { @@ -180,6 +196,10 @@ impl ArrowNativeType for i256 { self.to_parts().0 as usize } + fn usize_as(i: usize) -> Self { + Self::from_parts(i as u128, 0) + } + fn to_usize(self) -> Option { let (low, high) = self.to_parts(); if high != 0 { From dd168114a92fc2dc61c74415bbe8ea0a2f5a99ce Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 31 Jan 2023 09:32:35 -0800 Subject: [PATCH 0568/1411] Specified version of helper function to cast binary to string (#3624) * Specified version of helper function to cast binary to string * Simplify it --- arrow-cast/src/cast.rs | 65 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 9f20dceb980a..c0082b347da7 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -1180,7 +1180,7 @@ pub fn cast_with_options( } Date32 => cast_date32_to_string::(array), Date64 => cast_date64_to_string::(array), - Binary => cast_binary_to_generic_string::(array, cast_options), + Binary => cast_binary_to_string::(array, cast_options), LargeBinary => cast_binary_to_generic_string::(array, cast_options), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", @@ -1215,7 +1215,7 @@ pub fn cast_with_options( Date32 => cast_date32_to_string::(array), Date64 => cast_date64_to_string::(array), Binary => cast_binary_to_generic_string::(array, cast_options), - LargeBinary => cast_binary_to_generic_string::(array, cast_options), + LargeBinary => cast_binary_to_string::(array, cast_options), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -3392,6 +3392,66 @@ fn cast_list_inner( Ok(Arc::new(list) as ArrayRef) } +/// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same +/// offset size so re-encoding offset is unnecessary. 
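// Editor's sketch (assumed usage, not taken from the patch): the helper defined just
// below is reached through the public cast API when casting Binary -> Utf8 with the
// same offset width, per the dispatch arms updated earlier in this diff. The function
// name and input values here are illustrative only.
use std::sync::Arc;
use arrow_array::{Array, ArrayRef, BinaryArray, StringArray};
use arrow_cast::cast::{cast_with_options, CastOptions};
use arrow_schema::DataType;

fn binary_to_utf8_sketch() {
    let values: Vec<&[u8]> = vec![b"hello", b"arrow"];
    let binary: ArrayRef = Arc::new(BinaryArray::from_vec(values));
    // safe = false: invalid UTF-8 produces an error instead of a null entry
    let opts = CastOptions { safe: false };
    let cast = cast_with_options(&binary, &DataType::Utf8, &opts).unwrap();
    let strings = cast.as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(strings.value(0), "hello");
    assert_eq!(strings.value(1), "arrow");
}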
+fn cast_binary_to_string( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result +where + O: OffsetSizeTrait + ToPrimitive, +{ + let array = array + .as_any() + .downcast_ref::>>() + .unwrap(); + + if !cast_options.safe { + let offsets = array.value_offsets(); + let values = array.value_data(); + + // We only need to validate that all values are valid UTF-8 + let validated = std::str::from_utf8(values) + .map_err(|_| ArrowError::CastError("Invalid UTF-8 sequence".to_string()))?; + // Checks if the offsets are valid but does not re-encode + for offset in offsets.iter() { + if !validated.is_char_boundary(offset.as_usize()) { + return Err(ArrowError::CastError("Invalid UTF-8 sequence".to_string())); + } + } + + let builder = array + .into_data() + .into_builder() + .data_type(GenericStringArray::::DATA_TYPE); + // SAFETY: + // Validated UTF-8 above + Ok(Arc::new(GenericStringArray::::from(unsafe { + builder.build_unchecked() + }))) + } else { + let mut null_builder = BooleanBufferBuilder::new(array.len()); + array.iter().for_each(|maybe_value| { + null_builder.append( + maybe_value + .and_then(|value| std::str::from_utf8(value).ok()) + .is_some(), + ); + }); + + let builder = array + .into_data() + .into_builder() + .null_bit_buffer(Some(null_builder.finish())) + .data_type(GenericStringArray::::DATA_TYPE); + // SAFETY: + // Validated UTF-8 above + Ok(Arc::new(GenericStringArray::::from(unsafe { + builder.build_unchecked() + }))) + } +} + /// Helper function to cast from `GenericBinaryArray` to `GenericStringArray`. This function performs /// UTF8 validation during casting. For invalid UTF8 value, it could be Null or returning `Err` depending /// `CastOptions`. @@ -3417,6 +3477,7 @@ where .map_err(|_| ArrowError::CastError("Invalid UTF-8 sequence".to_string()))?; let mut offset_builder = BufferBuilder::::new(offsets.len()); + // Checks if the offset is a valid char boundary and re-encode the offset offsets .iter() .try_for_each::<_, Result<_, ArrowError>>(|offset| { From 2b9bbce44abbd93048c674f49c4eb0db72a0a1c8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 1 Feb 2023 20:06:43 +0000 Subject: [PATCH 0569/1411] Prepare object store 0.5.4 (#3636) (#3640) * Prepare object store 0.5.4 (#3636) * Update CHANGELOG-old --- object_store/CHANGELOG-old.md | 35 ++++++++++++++++ object_store/CHANGELOG.md | 40 ++++++++----------- object_store/Cargo.toml | 2 +- object_store/dev/release/update_change_log.sh | 4 +- 4 files changed, 54 insertions(+), 27 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index 2813cfc9df1a..78237a02dd87 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -21,6 +21,41 @@ # Changelog +## [object_store_0.5.3](https://github.com/apache/arrow-rs/tree/object_store_0.5.3) (2023-01-04) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.2...object_store_0.5.3) + +**Implemented enhancements:** + +- Derive Clone for the builders in object-store. 
[\#3419](https://github.com/apache/arrow-rs/issues/3419) +- Add a constant prefix object store wrapper [\#3328](https://github.com/apache/arrow-rs/issues/3328) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add support for content-type while uploading files through ObjectStore API [\#3300](https://github.com/apache/arrow-rs/issues/3300) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add HttpStore [\#3294](https://github.com/apache/arrow-rs/issues/3294) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add support for Azure Data Lake Storage Gen2 \(aka: ADLS Gen2\) in Object Store library [\#3283](https://github.com/apache/arrow-rs/issues/3283) +- object\_store: Add Put and Multipart Upload Doc Examples [\#2863](https://github.com/apache/arrow-rs/issues/2863) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Closed issues:** + +- Only flush buffered multi-part data on poll\_shutdown not on poll\_flush [\#3390](https://github.com/apache/arrow-rs/issues/3390) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- object\_store: builder configuration api [\#3436](https://github.com/apache/arrow-rs/pull/3436) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Derive Clone for ObjectStore builders and Make URL Parsing Stricter \(\#3419\) [\#3424](https://github.com/apache/arrow-rs/pull/3424) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add Put and Multipart Put doc examples [\#3420](https://github.com/apache/arrow-rs/pull/3420) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([GeauxEric](https://github.com/GeauxEric)) +- object\_store: update localstack instructions [\#3403](https://github.com/apache/arrow-rs/pull/3403) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- object\_store: Flush buffered multipart only during poll\_shutdown [\#3397](https://github.com/apache/arrow-rs/pull/3397) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([askoa](https://github.com/askoa)) +- Update quick-xml to 0.27 [\#3395](https://github.com/apache/arrow-rs/pull/3395) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add HttpStore \(\#3294\) [\#3380](https://github.com/apache/arrow-rs/pull/3380) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- add support for content-type in `ClientOptions` [\#3358](https://github.com/apache/arrow-rs/pull/3358) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- Update AWS SDK [\#3349](https://github.com/apache/arrow-rs/pull/3349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Upstream newline\_delimited\_stream and ChunkedStore from DataFusion [\#3341](https://github.com/apache/arrow-rs/pull/3341) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat\(object\_store\): add PrefixObjectStore [\#3329](https://github.com/apache/arrow-rs/pull/3329) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([roeap](https://github.com/roeap)) +- feat\(object\_store\): parse well-known storage urls [\#3327](https://github.com/apache/arrow-rs/pull/3327) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Disable getrandom object\_store [\#3278](https://github.com/apache/arrow-rs/pull/3278) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Reload token from AWS\_WEB\_IDENTITY\_TOKEN\_FILE [\#3274](https://github.com/apache/arrow-rs/pull/3274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Minor: skip aws integration test if TEST\_INTEGRATION is not set [\#3262](https://github.com/apache/arrow-rs/pull/3262) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) + ## [object_store_0.5.2](https://github.com/apache/arrow-rs/tree/object_store_0.5.2) (2022-12-02) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.1...object_store_0.5.2) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index 41b029ccab78..c1734ec5ba9f 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,40 +19,32 @@ # Changelog -## [object_store_0.5.3](https://github.com/apache/arrow-rs/tree/object_store_0.5.3) (2023-01-04) +## [object_store_0.5.4](https://github.com/apache/arrow-rs/tree/object_store_0.5.4) (2023-01-30) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.2...object_store_0.5.3) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.3...object_store_0.5.4) **Implemented enhancements:** -- Derive Clone for the builders in object-store. [\#3419](https://github.com/apache/arrow-rs/issues/3419) -- Add a constant prefix object store wrapper [\#3328](https://github.com/apache/arrow-rs/issues/3328) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add support for content-type while uploading files through ObjectStore API [\#3300](https://github.com/apache/arrow-rs/issues/3300) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add HttpStore [\#3294](https://github.com/apache/arrow-rs/issues/3294) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add support for Azure Data Lake Storage Gen2 \(aka: ADLS Gen2\) in Object Store library [\#3283](https://github.com/apache/arrow-rs/issues/3283) -- object\_store: Add Put and Multipart Upload Doc Examples [\#2863](https://github.com/apache/arrow-rs/issues/2863) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] support more identity based auth flows for azure [\#3580](https://github.com/apache/arrow-rs/issues/3580) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Implement workload identity and application default credentials for GCP object store. 
[\#3533](https://github.com/apache/arrow-rs/issues/3533) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support GCP Workload Identity [\#3490](https://github.com/apache/arrow-rs/issues/3490) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Allow providing service account key directly when building GCP object store client [\#3488](https://github.com/apache/arrow-rs/issues/3488) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Closed issues:** -- Only flush buffered multi-part data on poll\_shutdown not on poll\_flush [\#3390](https://github.com/apache/arrow-rs/issues/3390) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: temporary aws credentials not refreshed? [\#3446](https://github.com/apache/arrow-rs/issues/3446) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- object\_store: builder configuration api [\#3436](https://github.com/apache/arrow-rs/pull/3436) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- Derive Clone for ObjectStore builders and Make URL Parsing Stricter \(\#3419\) [\#3424](https://github.com/apache/arrow-rs/pull/3424) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add Put and Multipart Put doc examples [\#3420](https://github.com/apache/arrow-rs/pull/3420) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([GeauxEric](https://github.com/GeauxEric)) -- object\_store: update localstack instructions [\#3403](https://github.com/apache/arrow-rs/pull/3403) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) -- object\_store: Flush buffered multipart only during poll\_shutdown [\#3397](https://github.com/apache/arrow-rs/pull/3397) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([askoa](https://github.com/askoa)) -- Update quick-xml to 0.27 [\#3395](https://github.com/apache/arrow-rs/pull/3395) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add HttpStore \(\#3294\) [\#3380](https://github.com/apache/arrow-rs/pull/3380) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- add support for content-type in `ClientOptions` [\#3358](https://github.com/apache/arrow-rs/pull/3358) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) -- Update AWS SDK [\#3349](https://github.com/apache/arrow-rs/pull/3349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Upstream newline\_delimited\_stream and ChunkedStore from DataFusion [\#3341](https://github.com/apache/arrow-rs/pull/3341) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- feat\(object\_store\): add PrefixObjectStore [\#3329](https://github.com/apache/arrow-rs/pull/3329) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- feat\(object\_store\): parse well-known storage urls [\#3327](https://github.com/apache/arrow-rs/pull/3327) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([roeap](https://github.com/roeap)) -- Disable getrandom object\_store [\#3278](https://github.com/apache/arrow-rs/pull/3278) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Reload token from AWS\_WEB\_IDENTITY\_TOKEN\_FILE [\#3274](https://github.com/apache/arrow-rs/pull/3274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Minor: skip aws integration test if TEST\_INTEGRATION is not set [\#3262](https://github.com/apache/arrow-rs/pull/3262) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) +- Final tweaks to 32.0.0 changelog [\#3618](https://github.com/apache/arrow-rs/pull/3618) ([tustvold](https://github.com/tustvold)) +- Update AWS SDK [\#3617](https://github.com/apache/arrow-rs/pull/3617) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ClientOption.allow\_insecure [\#3600](https://github.com/apache/arrow-rs/pull/3600) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([poelzi](https://github.com/poelzi)) +- \[object\_store\] support azure managed and workload identities [\#3581](https://github.com/apache/arrow-rs/pull/3581) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Additional GCP authentication [\#3541](https://github.com/apache/arrow-rs/pull/3541) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([winding-lines](https://github.com/winding-lines)) +- Update aws-config and aws-types requirements from 0.52 to 0.53 [\#3539](https://github.com/apache/arrow-rs/pull/3539) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) +- Use GHA concurrency groups \(\#3495\) [\#3538](https://github.com/apache/arrow-rs/pull/3538) ([tustvold](https://github.com/tustvold)) +- Remove azurite test exception [\#3497](https://github.com/apache/arrow-rs/pull/3497) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat: Allow providing a service account key directly for GCS [\#3489](https://github.com/apache/arrow-rs/pull/3489) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([scsmithr](https://github.com/scsmithr)) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index c685685b1346..686a661675fb 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.3" +version = "0.5.4" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index 2f6c809a79bf..5cf5582a9e9b 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.2" -FUTURE_RELEASE="object_store_0.5.3" +SINCE_TAG="object_store_0.5.3" +FUTURE_RELEASE="object_store_0.5.4" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 75a56beb827e86791325906886eaaa4c747a9ea2 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Thu, 2 Feb 2023 05:19:03 -0800 Subject: [PATCH 0570/1411] Use array_value_to_string in arrow-csv (#3514) * Use array_value_to_string in 
arrow-csv * Fix test * Add datetime_array_value_to_string to allow passing datetime format * Rollback the tests * Add option to use RFC3339 in CSV writeR * Update tests * Fix linting errors * fix tests * Change with_rfc3339 factory method and use Option<&str> instead of &Option * Keep old tests intact * Add tests to check rfc3339 * Add back test_conversion_consistency * Fix clippy errors * Minor linting issue * Separate array_value_to_string and datetime_array_value_to_string * Add back invalid cast test * Fix linting and clippy errors * Fix arrow-cast test * Restructuring * Fix formatting errors * Change make_duration_string to use invalid_cast_error * Fix clippy errors --- arrow-cast/src/display.rs | 471 +++++++++++++++++++++++++++----------- arrow-csv/src/writer.rs | 304 ++++++++++++------------ arrow-json/src/writer.rs | 141 ++---------- arrow/tests/csv.rs | 45 ++++ 4 files changed, 555 insertions(+), 406 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 16fbfb0bbce5..7214321127cf 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -28,6 +28,13 @@ use arrow_array::*; use arrow_buffer::ArrowNativeType; use arrow_schema::*; use chrono::prelude::SecondsFormat; +use chrono::{DateTime, Utc}; + +fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError { + ArrowError::CastError(format!( + "Cannot cast to {dt} at col index: {col_idx} row index: {row_idx}" + )) +} macro_rules! make_string { ($array_type:ty, $column: ident, $row: ident) => {{ @@ -133,57 +140,176 @@ macro_rules! make_string_interval_month_day_nano { } macro_rules! make_string_date { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + ($array_type:ty, $dt:expr, $column: ident, $col_idx:ident, $row_idx: ident) => {{ + Ok($column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .value_as_date($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .to_string()) + }}; +} - Ok(array - .value_as_date($row) - .map(|d| d.to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) +macro_rules! make_string_date_with_format { + ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ + Ok($column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .value_as_datetime($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .format($format) + .to_string()) + }}; +} + +macro_rules! handle_string_date { + ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ + match $format { + Some(format) => { + make_string_date_with_format!( + $array_type, + $dt, + format, + $column, + $col_idx, + $row_idx + ) + } + None => make_string_date!($array_type, $dt, $column, $col_idx, $row_idx), + } }}; } macro_rules! make_string_time { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + ($array_type:ty, $dt:expr, $column: ident, $col_idx:ident, $row_idx: ident) => {{ + Ok($column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .value_as_time($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? 
+ .to_string()) + }}; +} - Ok(array - .value_as_time($row) - .map(|d| d.to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) +macro_rules! make_string_time_with_format { + ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ + Ok($column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .value_as_time($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .format($format) + .to_string()) }}; } +macro_rules! handle_string_time { + ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident, $row_idx: ident) => { + match $format { + Some(format) => { + make_string_time_with_format!( + $array_type, + $dt, + format, + $column, + $col_idx, + $row_idx + ) + } + None => make_string_time!($array_type, $dt, $column, $col_idx, $row_idx), + } + }; +} + macro_rules! make_string_datetime { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + ($array_type:ty, $dt:expr, $tz_string: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ + let array = $column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; + + let s = match $tz_string { + Some(tz_string) => match tz_string.parse::() { + Ok(tz) => array + .value_as_datetime_with_tz($row_idx, tz) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .to_rfc3339_opts(SecondsFormat::AutoSi, true) + .to_string(), + Err(_) => { + let datetime = array + .value_as_datetime($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; + format!("{:?} (Unknown Time Zone '{}')", datetime, tz_string) + } + }, + None => { + let datetime = array + .value_as_datetime($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; + format!("{:?}", datetime) + } + }; - Ok(array - .value_as_datetime($row) - .map(|d| format!("{:?}", d)) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) + Ok(s) }}; } -macro_rules! make_string_datetime_with_tz { - ($array_type:ty, $tz_string: ident, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - - let s = match $tz_string.parse::() { - Ok(tz) => array - .value_as_datetime_with_tz($row, tz) - .map(|d| format!("{}", d.to_rfc3339_opts(SecondsFormat::AutoSi, true))) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), - Err(_) => array - .value_as_datetime($row) - .map(|d| format!("{:?} (Unknown Time Zone '{}')", d, $tz_string)) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), +macro_rules! make_string_datetime_with_format { + ($array_type:ty, $dt:expr, $format: ident, $tz_string: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ + let array = $column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; + let datetime = array + .value_as_datetime($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; + + let s = match $tz_string { + Some(tz_string) => match tz_string.parse::() { + Ok(tz) => { + let utc_time = DateTime::::from_utc(datetime, Utc); + let local_time = utc_time.with_timezone(&tz); + local_time.format($format).to_string() + } + Err(_) => { + format!("{:?} (Unknown Time Zone '{}')", datetime, tz_string) + } + }, + None => datetime.format($format).to_string(), }; Ok(s) }}; } +macro_rules! 
handle_string_datetime { + ($array_type:ty, $dt:expr, $format: ident, $tz_string: ident, $column: ident, $col_idx:ident, $row_idx: ident) => { + match $format { + Some(format) => make_string_datetime_with_format!( + $array_type, + $dt, + format, + $tz_string, + $column, + $col_idx, + $row_idx + ), + None => make_string_datetime!( + $array_type, + $dt, + $tz_string, + $column, + $col_idx, + $row_idx + ), + } + }; +} + // It's not possible to do array.value($row).to_string() for &[u8], let's format it as hex macro_rules! make_string_hex { ($array_type:ty, $column: ident, $row: ident) => {{ @@ -248,13 +374,14 @@ macro_rules! make_string_from_fixed_size_list { } macro_rules! make_string_from_duration { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - - Ok(array - .value_as_duration($row) - .map(|d| d.to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) + ($array_type:ty, $dt:expr, $column:ident, $col_idx:ident, $row_idx: ident) => {{ + Ok($column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .value_as_duration($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .to_string()) }}; } @@ -323,126 +450,172 @@ fn append_map_field_string( /// /// Note this function is quite inefficient and is unlikely to be /// suitable for converting large arrays or record batches. -pub fn array_value_to_string( +fn array_value_to_string_internal( column: &ArrayRef, - row: usize, + col_idx: usize, + row_idx: usize, + format: Option<&str>, ) -> Result { - if column.is_null(row) { + if column.is_null(row_idx) { return Ok("".to_string()); } match column.data_type() { - DataType::Utf8 => make_string!(array::StringArray, column, row), - DataType::LargeUtf8 => make_string!(array::LargeStringArray, column, row), - DataType::Binary => make_string_hex!(array::BinaryArray, column, row), - DataType::LargeBinary => make_string_hex!(array::LargeBinaryArray, column, row), + DataType::Utf8 => make_string!(array::StringArray, column, row_idx), + DataType::LargeUtf8 => make_string!(array::LargeStringArray, column, row_idx), + DataType::Binary => make_string_hex!(array::BinaryArray, column, row_idx), + DataType::LargeBinary => { + make_string_hex!(array::LargeBinaryArray, column, row_idx) + } DataType::FixedSizeBinary(_) => { - make_string_hex!(array::FixedSizeBinaryArray, column, row) + make_string_hex!(array::FixedSizeBinaryArray, column, row_idx) } - DataType::Boolean => make_string!(array::BooleanArray, column, row), - DataType::Int8 => make_string!(array::Int8Array, column, row), - DataType::Int16 => make_string!(array::Int16Array, column, row), - DataType::Int32 => make_string!(array::Int32Array, column, row), - DataType::Int64 => make_string!(array::Int64Array, column, row), - DataType::UInt8 => make_string!(array::UInt8Array, column, row), - DataType::UInt16 => make_string!(array::UInt16Array, column, row), - DataType::UInt32 => make_string!(array::UInt32Array, column, row), - DataType::UInt64 => make_string!(array::UInt64Array, column, row), - DataType::Float16 => make_string!(array::Float16Array, column, row), - DataType::Float32 => make_string!(array::Float32Array, column, row), - DataType::Float64 => make_string!(array::Float64Array, column, row), - DataType::Decimal128(..) 
=> make_string_from_decimal(column, row), + DataType::Boolean => make_string!(array::BooleanArray, column, row_idx), + DataType::Int8 => make_string!(array::Int8Array, column, row_idx), + DataType::Int16 => make_string!(array::Int16Array, column, row_idx), + DataType::Int32 => make_string!(array::Int32Array, column, row_idx), + DataType::Int64 => make_string!(array::Int64Array, column, row_idx), + DataType::UInt8 => make_string!(array::UInt8Array, column, row_idx), + DataType::UInt16 => make_string!(array::UInt16Array, column, row_idx), + DataType::UInt32 => make_string!(array::UInt32Array, column, row_idx), + DataType::UInt64 => make_string!(array::UInt64Array, column, row_idx), + DataType::Float16 => make_string!(array::Float16Array, column, row_idx), + DataType::Float32 => make_string!(array::Float32Array, column, row_idx), + DataType::Float64 => make_string!(array::Float64Array, column, row_idx), + DataType::Decimal128(..) => make_string_from_decimal(column, row_idx), DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second => { - match tz_string_opt { - Some(tz_string) => make_string_datetime_with_tz!( - array::TimestampSecondArray, - tz_string, - column, - row - ), - None => make_string_datetime!(array::TimestampSecondArray, column, row), - } + handle_string_datetime!( + array::TimestampSecondArray, + "Timestamp", + format, + tz_string_opt, + column, + col_idx, + row_idx + ) } DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Millisecond => { - match tz_string_opt { - Some(tz_string) => make_string_datetime_with_tz!( - array::TimestampMillisecondArray, - tz_string, - column, - row - ), - None => { - make_string_datetime!(array::TimestampMillisecondArray, column, row) - } - } + handle_string_datetime!( + array::TimestampMillisecondArray, + "Timestamp", + format, + tz_string_opt, + column, + col_idx, + row_idx + ) } DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Microsecond => { - match tz_string_opt { - Some(tz_string) => make_string_datetime_with_tz!( - array::TimestampMicrosecondArray, - tz_string, - column, - row - ), - None => { - make_string_datetime!(array::TimestampMicrosecondArray, column, row) - } - } + handle_string_datetime!( + array::TimestampMicrosecondArray, + "Timestamp", + format, + tz_string_opt, + column, + col_idx, + row_idx + ) } DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Nanosecond => { - match tz_string_opt { - Some(tz_string) => make_string_datetime_with_tz!( - array::TimestampNanosecondArray, - tz_string, - column, - row - ), - None => { - make_string_datetime!(array::TimestampNanosecondArray, column, row) - } - } + handle_string_datetime!( + array::TimestampNanosecondArray, + "Timestamp", + format, + tz_string_opt, + column, + col_idx, + row_idx + ) + } + DataType::Date32 => { + handle_string_date!( + array::Date32Array, + "Date32", + format, + column, + col_idx, + row_idx + ) + } + DataType::Date64 => { + handle_string_date!( + array::Date64Array, + "Date64", + format, + column, + col_idx, + row_idx + ) } - DataType::Date32 => make_string_date!(array::Date32Array, column, row), - DataType::Date64 => make_string_date!(array::Date64Array, column, row), DataType::Time32(unit) if *unit == TimeUnit::Second => { - make_string_time!(array::Time32SecondArray, column, row) + handle_string_time!( + array::Time32SecondArray, + "Time32", + format, + column, + col_idx, + row_idx + ) } DataType::Time32(unit) if *unit == TimeUnit::Millisecond => { - make_string_time!(array::Time32MillisecondArray, column, row) + 
handle_string_time!( + array::Time32MillisecondArray, + "Time32", + format, + column, + col_idx, + row_idx + ) } DataType::Time64(unit) if *unit == TimeUnit::Microsecond => { - make_string_time!(array::Time64MicrosecondArray, column, row) + handle_string_time!( + array::Time64MicrosecondArray, + "Time64", + format, + column, + col_idx, + row_idx + ) } DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => { - make_string_time!(array::Time64NanosecondArray, column, row) + handle_string_time!( + array::Time64NanosecondArray, + "Time64", + format, + column, + col_idx, + row_idx + ) } DataType::Interval(unit) => match unit { IntervalUnit::DayTime => { - make_string_interval_day_time!(column, row) + make_string_interval_day_time!(column, row_idx) } IntervalUnit::YearMonth => { - make_string_interval_year_month!(column, row) + make_string_interval_year_month!(column, row_idx) } IntervalUnit::MonthDayNano => { - make_string_interval_month_day_nano!(column, row) + make_string_interval_month_day_nano!(column, row_idx) } }, - DataType::List(_) => make_string_from_list!(column, row), - DataType::LargeList(_) => make_string_from_large_list!(column, row), + DataType::List(_) => make_string_from_list!(column, row_idx), + DataType::LargeList(_) => make_string_from_large_list!(column, row_idx), DataType::Dictionary(index_type, _value_type) => match **index_type { - DataType::Int8 => dict_array_value_to_string::(column, row), - DataType::Int16 => dict_array_value_to_string::(column, row), - DataType::Int32 => dict_array_value_to_string::(column, row), - DataType::Int64 => dict_array_value_to_string::(column, row), - DataType::UInt8 => dict_array_value_to_string::(column, row), - DataType::UInt16 => dict_array_value_to_string::(column, row), - DataType::UInt32 => dict_array_value_to_string::(column, row), - DataType::UInt64 => dict_array_value_to_string::(column, row), + DataType::Int8 => dict_array_value_to_string::(column, row_idx), + DataType::Int16 => dict_array_value_to_string::(column, row_idx), + DataType::Int32 => dict_array_value_to_string::(column, row_idx), + DataType::Int64 => dict_array_value_to_string::(column, row_idx), + DataType::UInt8 => dict_array_value_to_string::(column, row_idx), + DataType::UInt16 => dict_array_value_to_string::(column, row_idx), + DataType::UInt32 => dict_array_value_to_string::(column, row_idx), + DataType::UInt64 => dict_array_value_to_string::(column, row_idx), _ => Err(ArrowError::InvalidArgumentError(format!( "Pretty printing not supported for {:?} due to index type", column.data_type() ))), }, - DataType::FixedSizeList(_, _) => make_string_from_fixed_size_list!(column, row), + DataType::FixedSizeList(_, _) => { + make_string_from_fixed_size_list!(column, row_idx) + } DataType::Struct(_) => { let st = column .as_any() @@ -458,11 +631,11 @@ pub fn array_value_to_string( s.push('{'); let mut kv_iter = st.columns().iter().zip(st.column_names()); if let Some((col, name)) = kv_iter.next() { - append_struct_field_string(&mut s, name, col, row)?; + append_struct_field_string(&mut s, name, col, row_idx)?; } for (col, name) in kv_iter { s.push_str(", "); - append_struct_field_string(&mut s, name, col, row)?; + append_struct_field_string(&mut s, name, col, row_idx)?; } s.push('}'); @@ -475,7 +648,7 @@ pub fn array_value_to_string( "Repl error: could not convert column to map array.".to_string(), ) })?; - let map_entry = map_array.value(row); + let map_entry = map_array.value(row_idx); let st = map_entry .as_any() .downcast_ref::() @@ -501,20 +674,44 @@ pub fn 
array_value_to_string( Ok(s) } DataType::Union(field_vec, type_ids, mode) => { - union_to_string(column, row, field_vec, type_ids, mode) + union_to_string(column, row_idx, field_vec, type_ids, mode) } DataType::Duration(unit) => match *unit { TimeUnit::Second => { - make_string_from_duration!(array::DurationSecondArray, column, row) + make_string_from_duration!( + array::DurationSecondArray, + "Duration", + column, + col_idx, + row_idx + ) } TimeUnit::Millisecond => { - make_string_from_duration!(array::DurationMillisecondArray, column, row) + make_string_from_duration!( + array::DurationMillisecondArray, + "Duration", + column, + col_idx, + row_idx + ) } TimeUnit::Microsecond => { - make_string_from_duration!(array::DurationMicrosecondArray, column, row) + make_string_from_duration!( + array::DurationMicrosecondArray, + "Duration", + column, + col_idx, + row_idx + ) } TimeUnit::Nanosecond => { - make_string_from_duration!(array::DurationNanosecondArray, column, row) + make_string_from_duration!( + array::DurationNanosecondArray, + "Duration", + column, + col_idx, + row_idx + ) } }, _ => Err(ArrowError::InvalidArgumentError(format!( @@ -524,6 +721,22 @@ pub fn array_value_to_string( } } +pub fn temporal_array_value_to_string( + column: &ArrayRef, + col_idx: usize, + row_idx: usize, + format: Option<&str>, +) -> Result { + array_value_to_string_internal(column, col_idx, row_idx, format) +} + +pub fn array_value_to_string( + column: &ArrayRef, + row_idx: usize, +) -> Result { + array_value_to_string_internal(column, 0, row_idx, None) +} + /// Converts the value of the union array at `row` to a String fn union_to_string( column: &ArrayRef, diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index bc11eef2fcf1..94620be6629f 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -63,12 +63,12 @@ //! } //! ``` -use arrow_array::timezone::Tz; use arrow_array::types::*; use arrow_array::*; -use arrow_cast::display::{lexical_to_string, make_string_from_decimal}; +use arrow_cast::display::{ + array_value_to_string, lexical_to_string, temporal_array_value_to_string, +}; use arrow_schema::*; -use chrono::{DateTime, Utc}; use std::io::Write; use crate::map_csv_error; @@ -88,25 +88,6 @@ where lexical_to_string(c.value(i)) } -fn invalid_cast_error(dt: &str, col_index: usize, row_index: usize) -> ArrowError { - ArrowError::CastError(format!( - "Cannot cast to {dt} at col index: {col_index} row index: {row_index}" - )) -} - -macro_rules! write_temporal_value { - ($array:expr, $tpe: ident, $format: expr, $col_index: expr, $row_index: expr, $cast_func: ident, $tpe_name: expr) => {{ - $array - .as_any() - .downcast_ref::<$tpe>() - .ok_or_else(|| invalid_cast_error($tpe_name, $col_index, $row_index))? - .$cast_func($row_index) - .ok_or_else(|| invalid_cast_error($tpe_name, $col_index, $row_index))? - .format($format) - .to_string() - }}; -} - /// A CSV writer #[derive(Debug)] pub struct Writer { @@ -115,17 +96,17 @@ pub struct Writer { /// Whether file should be written with headers. 
Defaults to `true` has_headers: bool, /// The date format for date arrays - date_format: String, + date_format: Option, /// The datetime format for datetime arrays - datetime_format: String, + datetime_format: Option, /// The timestamp format for timestamp arrays #[allow(dead_code)] - timestamp_format: String, + timestamp_format: Option, /// The timestamp format for timestamp (with timezone) arrays #[allow(dead_code)] - timestamp_tz_format: String, + timestamp_tz_format: Option, /// The time format for time arrays - time_format: String, + time_format: Option, /// Is the beginning-of-writer beginning: bool, /// The value to represent null entries @@ -141,11 +122,11 @@ impl Writer { Writer { writer, has_headers: true, - date_format: DEFAULT_DATE_FORMAT.to_string(), - datetime_format: DEFAULT_TIMESTAMP_FORMAT.to_string(), - time_format: DEFAULT_TIME_FORMAT.to_string(), - timestamp_format: DEFAULT_TIMESTAMP_FORMAT.to_string(), - timestamp_tz_format: DEFAULT_TIMESTAMP_TZ_FORMAT.to_string(), + date_format: Some(DEFAULT_DATE_FORMAT.to_string()), + datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), + time_format: Some(DEFAULT_TIME_FORMAT.to_string()), + timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), + timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), beginning: true, null_value: DEFAULT_NULL_VALUE.to_string(), } @@ -177,88 +158,74 @@ impl Writer { DataType::UInt16 => write_primitive_value::(col, row_index), DataType::UInt32 => write_primitive_value::(col, row_index), DataType::UInt64 => write_primitive_value::(col, row_index), - DataType::Boolean => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value(row_index).to_string() - } - DataType::Utf8 => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value(row_index).to_owned() - } - DataType::LargeUtf8 => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value(row_index).to_owned() - } - DataType::Date32 => { - write_temporal_value!( - col, - Date32Array, - &self.date_format, - col_index, - row_index, - value_as_date, - "Date32" - ) - } - DataType::Date64 => { - write_temporal_value!( + DataType::Boolean => array_value_to_string(col, row_index)?.to_string(), + DataType::Utf8 => array_value_to_string(col, row_index)?.to_string(), + DataType::LargeUtf8 => array_value_to_string(col, row_index)?.to_string(), + DataType::Date32 => temporal_array_value_to_string( + col, + col_index, + row_index, + self.date_format.as_deref(), + )? + .to_string(), + DataType::Date64 => temporal_array_value_to_string( + col, + col_index, + row_index, + self.datetime_format.as_deref(), + )? + .to_string(), + DataType::Time32(TimeUnit::Second) => temporal_array_value_to_string( + col, + col_index, + row_index, + self.time_format.as_deref(), + )? + .to_string(), + DataType::Time32(TimeUnit::Millisecond) => { + temporal_array_value_to_string( col, - Date64Array, - &self.datetime_format, col_index, row_index, - value_as_datetime, - "Date64" - ) + self.time_format.as_deref(), + )? + .to_string() } - DataType::Time32(TimeUnit::Second) => { - write_temporal_value!( + DataType::Time64(TimeUnit::Microsecond) => { + temporal_array_value_to_string( col, - Time32SecondArray, - &self.time_format, col_index, row_index, - value_as_time, - "Time32" - ) + self.time_format.as_deref(), + )? + .to_string() } - DataType::Time32(TimeUnit::Millisecond) => { - write_temporal_value!( + DataType::Time64(TimeUnit::Nanosecond) => temporal_array_value_to_string( + col, + col_index, + row_index, + self.time_format.as_deref(), + )? 
+ .to_string(), + DataType::Timestamp(_, time_zone) => match time_zone { + Some(_tz) => temporal_array_value_to_string( col, - Time32MillisecondArray, - &self.time_format, col_index, row_index, - value_as_time, - "Time32" - ) - } - DataType::Time64(TimeUnit::Microsecond) => { - write_temporal_value!( + self.timestamp_tz_format.as_deref(), + )? + .to_string(), + None => temporal_array_value_to_string( col, - Time64MicrosecondArray, - &self.time_format, col_index, row_index, - value_as_time, - "Time64" - ) + self.timestamp_format.as_deref(), + )? + .to_string(), + }, + DataType::Decimal128(..) => { + array_value_to_string(col, row_index)?.to_string() } - DataType::Time64(TimeUnit::Nanosecond) => { - write_temporal_value!( - col, - Time64NanosecondArray, - &self.time_format, - col_index, - row_index, - value_as_time, - "Time64" - ) - } - DataType::Timestamp(time_unit, time_zone) => { - self.handle_timestamp(time_unit, time_zone.as_ref(), row_index, col)? - } - DataType::Decimal128(..) => make_string_from_decimal(col, row_index)?, t => { // List and Struct arrays not supported by the writer, any // other type needs to be implemented @@ -272,52 +239,6 @@ impl Writer { Ok(()) } - fn handle_timestamp( - &self, - time_unit: &TimeUnit, - time_zone: Option<&String>, - row_index: usize, - col: &ArrayRef, - ) -> Result { - use TimeUnit::*; - let datetime = match time_unit { - Second => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - Millisecond => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - Microsecond => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - Nanosecond => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - }; - - let tz: Option = time_zone.map(|x| x.parse()).transpose()?; - match tz { - Some(tz) => { - let utc_time = DateTime::::from_utc(datetime, Utc); - let local_time = utc_time.with_timezone(&tz); - Ok(local_time.format(&self.timestamp_tz_format).to_string()) - } - None => Ok(datetime.format(&self.timestamp_format).to_string()), - } - } - /// Write a vector of record batches to a writable object pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { let num_columns = batch.num_columns(); @@ -463,6 +384,19 @@ impl WriterBuilder { self } + /// Use RFC3339 format for date/time/timestamps by clearing all + /// date/time specific formats. 
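// Editor's sketch (assumed usage, not taken from the patch): a call site for the
// builder method added just below, writing a RecordBatch as CSV with temporal
// columns rendered as RFC 3339. The function name and generic sink are illustrative.
use arrow_array::RecordBatch;
use arrow_schema::ArrowError;
use std::io::Write;

fn write_rfc3339_csv<W: Write>(sink: W, batch: &RecordBatch) -> Result<(), ArrowError> {
    let mut writer = arrow_csv::WriterBuilder::new().with_rfc3339(true).build(sink);
    writer.write(batch)
}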
+ pub fn with_rfc3339(mut self, use_rfc3339: bool) -> Self { + if use_rfc3339 { + self.date_format = None; + self.datetime_format = None; + self.time_format = None; + self.timestamp_format = None; + self.timestamp_tz_format = None; + } + self + } + /// Create a new `Writer` pub fn build(self, writer: W) -> Writer { let delimiter = self.delimiter.unwrap_or(b','); @@ -471,21 +405,11 @@ impl WriterBuilder { Writer { writer, has_headers: self.has_headers, - date_format: self - .date_format - .unwrap_or_else(|| DEFAULT_DATE_FORMAT.to_string()), - datetime_format: self - .datetime_format - .unwrap_or_else(|| DEFAULT_TIMESTAMP_FORMAT.to_string()), - time_format: self - .time_format - .unwrap_or_else(|| DEFAULT_TIME_FORMAT.to_string()), - timestamp_format: self - .timestamp_format - .unwrap_or_else(|| DEFAULT_TIMESTAMP_FORMAT.to_string()), - timestamp_tz_format: self - .timestamp_tz_format - .unwrap_or_else(|| DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), + date_format: self.date_format, + datetime_format: self.datetime_format, + time_format: self.time_format, + timestamp_format: self.timestamp_format, + timestamp_tz_format: self.timestamp_tz_format, beginning: true, null_value: self .null_value @@ -502,6 +426,12 @@ mod tests { use std::io::{Cursor, Read, Seek}; use std::sync::Arc; + fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError { + ArrowError::CastError(format!( + "Cannot cast to {dt} at col index: {col_idx} row index: {row_idx}" + )) + } + #[test] fn test_write_csv() { let schema = Schema::new(vec![ @@ -722,6 +652,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo let mut file = tempfile::tempfile().unwrap(); let mut writer = Writer::new(&mut file); let batches = vec![&batch, &batch]; + for batch in batches { writer .write(batch) @@ -735,4 +666,57 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo } drop(writer); } + + #[test] + fn test_write_csv_using_rfc3339() { + let schema = Schema::new(vec![ + Field::new( + "c1", + DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".to_string())), + true, + ), + Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None), true), + Field::new("c3", DataType::Date32, false), + Field::new("c4", DataType::Time32(TimeUnit::Second), false), + ]); + + let c1 = TimestampMillisecondArray::from(vec![ + Some(1555584887378), + Some(1635577147000), + ]) + .with_timezone("+00:00".to_string()); + let c2 = TimestampMillisecondArray::from(vec![ + Some(1555584887378), + Some(1635577147000), + ]); + let c3 = Date32Array::from(vec![3, 2]); + let c4 = Time32SecondArray::from(vec![1234, 24680]); + + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(c1), Arc::new(c2), Arc::new(c3), Arc::new(c4)], + ) + .unwrap(); + + let mut file = tempfile::tempfile().unwrap(); + + let builder = WriterBuilder::new().with_rfc3339(true); + let mut writer = builder.build(&mut file); + let batches = vec![&batch]; + for batch in batches { + writer.write(batch).unwrap(); + } + drop(writer); + + file.rewind().unwrap(); + let mut buffer: Vec = vec![]; + file.read_to_end(&mut buffer).unwrap(); + + assert_eq!( + "c1,c2,c3,c4 +2019-04-18T10:54:47.378Z,2019-04-18T10:54:47.378,1970-01-04,00:20:34 +2021-10-30T06:59:07Z,2021-10-30T06:59:07,1970-01-03,06:51:20\n", + String::from_utf8(buffer).unwrap() + ); + } } diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 9d241aed3d28..fa7db4b862e9 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -105,7 
+105,7 @@ use arrow_array::types::*; use arrow_array::*; use arrow_schema::*; -use arrow_cast::display::array_value_to_string; +use arrow_cast::display::temporal_array_value_to_string; fn primitive_array_to_json(array: &ArrayRef) -> Result, ArrowError> where @@ -137,6 +137,7 @@ fn struct_array_to_jsonmap_array( row_count, struct_col, inner_col_names[j], + j, )? } Ok(inner_objs) @@ -217,7 +218,7 @@ macro_rules! set_column_by_array_type { } macro_rules! set_temporal_column_by_array_type { - ($array_type:ident, $col_name:ident, $rows:ident, $array:ident, $row_count:ident, $cast_fn:ident) => { + ($col_name:ident, $col_idx:ident, $rows:ident, $array:ident, $row_count:ident) => { $rows .iter_mut() .enumerate() @@ -226,7 +227,10 @@ macro_rules! set_temporal_column_by_array_type { if !$array.is_null(i) { row.insert( $col_name.to_string(), - array_value_to_string($array, i).unwrap().to_string().into(), + temporal_array_value_to_string($array, $col_idx, i, None) + .unwrap() + .to_string() + .into(), ); } }); @@ -260,6 +264,7 @@ fn set_column_for_json_rows( row_count: usize, array: &ArrayRef, col_name: &str, + col_idx: usize, ) -> Result<(), ArrowError> { match array.data_type() { DataType::Int8 => { @@ -311,144 +316,46 @@ fn set_column_for_json_rows( ); } DataType::Date32 => { - set_temporal_column_by_array_type!( - Date32Array, - col_name, - rows, - array, - row_count, - value_as_date - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Date64 => { - set_temporal_column_by_array_type!( - Date64Array, - col_name, - rows, - array, - row_count, - value_as_date - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Timestamp(TimeUnit::Second, _) => { - set_temporal_column_by_array_type!( - TimestampSecondArray, - col_name, - rows, - array, - row_count, - value_as_datetime - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Timestamp(TimeUnit::Millisecond, _) => { - set_temporal_column_by_array_type!( - TimestampMillisecondArray, - col_name, - rows, - array, - row_count, - value_as_datetime - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Timestamp(TimeUnit::Microsecond, _) => { - set_temporal_column_by_array_type!( - TimestampMicrosecondArray, - col_name, - rows, - array, - row_count, - value_as_datetime - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Timestamp(TimeUnit::Nanosecond, _) => { - set_temporal_column_by_array_type!( - TimestampNanosecondArray, - col_name, - rows, - array, - row_count, - value_as_datetime - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Time32(TimeUnit::Second) => { - set_temporal_column_by_array_type!( - Time32SecondArray, - col_name, - rows, - array, - row_count, - value_as_time - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Time32(TimeUnit::Millisecond) => { - set_temporal_column_by_array_type!( - Time32MillisecondArray, - col_name, - rows, - array, - row_count, - value_as_time - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Time64(TimeUnit::Microsecond) => { - set_temporal_column_by_array_type!( - Time64MicrosecondArray, - col_name, - rows, - array, - row_count, - value_as_time - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } 
DataType::Time64(TimeUnit::Nanosecond) => { - set_temporal_column_by_array_type!( - Time64NanosecondArray, - col_name, - rows, - array, - row_count, - value_as_time - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Duration(TimeUnit::Second) => { - set_temporal_column_by_array_type!( - DurationSecondArray, - col_name, - rows, - array, - row_count, - value_as_duration - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Duration(TimeUnit::Millisecond) => { - set_temporal_column_by_array_type!( - DurationMillisecondArray, - col_name, - rows, - array, - row_count, - value_as_duration - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Duration(TimeUnit::Microsecond) => { - set_temporal_column_by_array_type!( - DurationMicrosecondArray, - col_name, - rows, - array, - row_count, - value_as_duration - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Duration(TimeUnit::Nanosecond) => { - set_temporal_column_by_array_type!( - DurationNanosecondArray, - col_name, - rows, - array, - row_count, - value_as_duration - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Struct(_) => { let inner_objs = @@ -492,7 +399,7 @@ fn set_column_for_json_rows( let slice = array.slice(0, row_count); let hydrated = arrow_cast::cast::cast(&slice, value_type) .expect("cannot cast dictionary to underlying values"); - set_column_for_json_rows(rows, row_count, &hydrated, col_name)?; + set_column_for_json_rows(rows, row_count, &hydrated, col_name, col_idx)?; } DataType::Map(_, _) => { let maparr = as_map_array(array); @@ -558,7 +465,7 @@ pub fn record_batches_to_json_rows( let row_count = batch.num_rows(); for (j, col) in batch.columns().iter().enumerate() { let col_name = schema.field(j).name(); - set_column_for_json_rows(&mut rows[base..], row_count, col, col_name)? + set_column_for_json_rows(&mut rows[base..], row_count, col, col_name, j)? } base += row_count; } diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs index 83a279ce4794..5a7c7e962a11 100644 --- a/arrow/tests/csv.rs +++ b/arrow/tests/csv.rs @@ -62,3 +62,48 @@ fn test_export_csv_timestamps() { let right = String::from_utf8(sw).unwrap(); assert_eq!(left, right); } + +#[test] +fn test_export_csv_timestamps_using_rfc3339() { + let schema = Schema::new(vec![ + Field::new( + "c1", + DataType::Timestamp( + TimeUnit::Millisecond, + Some("Australia/Sydney".to_string()), + ), + true, + ), + Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None), true), + ]); + + let c1 = TimestampMillisecondArray::from( + // 1555584887 converts to 2019-04-18, 20:54:47 in time zone Australia/Sydney (AEST). + // The offset (difference to UTC) is +10:00. + // 1635577147 converts to 2021-10-30 17:59:07 in time zone Australia/Sydney (AEDT) + // The offset (difference to UTC) is +11:00. Note that daylight savings is in effect on 2021-10-30. 
+ // + vec![Some(1555584887378), Some(1635577147000)], + ) + .with_timezone("Australia/Sydney".to_string()); + let c2 = + TimestampMillisecondArray::from(vec![Some(1555584887378), Some(1635577147000)]); + let batch = + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); + + let mut sw = Vec::new(); + let mut writer = arrow_csv::WriterBuilder::new() + .with_rfc3339(true) + .build(&mut sw); + let batches = vec![&batch]; + for batch in batches { + writer.write(batch).unwrap(); + } + drop(writer); + + let left = "c1,c2 +2019-04-18T20:54:47.378+10:00,2019-04-18T10:54:47.378 +2021-10-30T17:59:07+11:00,2021-10-30T06:59:07\n"; + let right = String::from_utf8(sw).unwrap(); + assert_eq!(left, right); +} From d00d4c9a4b20bc28446ea4d5eac6b900f36adfb4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Feb 2023 18:33:08 +0000 Subject: [PATCH 0571/1411] Return references from FixedSizeListArray and MapArray (#3652) * Return references from FixedSizeListArray and MapArray * Clippy --- .../src/array/fixed_size_list_array.rs | 13 ++--- arrow-array/src/array/map_array.rs | 56 ++++++++++++------- arrow-array/src/builder/map_builder.rs | 4 +- arrow-ipc/src/writer.rs | 6 +- arrow-json/src/writer.rs | 4 +- parquet/src/arrow/arrow_writer/mod.rs | 4 +- 6 files changed, 49 insertions(+), 38 deletions(-) diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 67a20d142eb5..c361d2d4462b 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -69,8 +69,8 @@ pub struct FixedSizeListArray { impl FixedSizeListArray { /// Returns a reference to the values of this list. - pub fn values(&self) -> ArrayRef { - self.values.clone() + pub fn values(&self) -> &ArrayRef { + &self.values } /// Returns a clone of the value type of this list. @@ -261,8 +261,7 @@ mod tests { .unwrap(); let list_array = FixedSizeListArray::from(list_data); - let values = list_array.values(); - assert_eq!(&value_data, values.data()); + assert_eq!(&value_data, list_array.values().data()); assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(3, list_array.len()); assert_eq!(0, list_array.null_count()); @@ -291,8 +290,7 @@ mod tests { .unwrap(); let list_array = FixedSizeListArray::from(list_data); - let values = list_array.values(); - assert_eq!(&value_data, values.data()); + assert_eq!(&value_data, list_array.values().data()); assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(3, list_array.len()); assert_eq!(0, list_array.null_count()); @@ -368,8 +366,7 @@ mod tests { .unwrap(); let list_array = FixedSizeListArray::from(list_data); - let values = list_array.values(); - assert_eq!(&value_data, values.data()); + assert_eq!(&value_data, list_array.values().data()); assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(5, list_array.len()); assert_eq!(2, list_array.null_count()); diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index f2b9a87a21b9..b0eb4a3c98ab 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -27,50 +27,60 @@ use std::sync::Arc; /// Keys should always be non-null, but values can be null. /// /// [MapArray] is physically a [crate::array::ListArray] that has a -/// [crate::array::StructArray] with 2 child fields. +/// [StructArray] with 2 child fields. 
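For illustration, not part of the patch: after this change `FixedSizeListArray::values`, `MapArray::keys` and `MapArray::values` hand out `&ArrayRef` borrows rather than freshly allocated `ArrayRef`s, so read-only callers pass the reference straight through and only call sites that actually need ownership add an explicit clone, as the IPC and Parquet writer hunks later in this patch do. A minimal caller sketch, assuming the pre-existing `as_map_array` helper from `arrow_array::cast`:

use arrow_array::{cast::as_map_array, Array, ArrayRef};

// Borrow the map children for inspection; clone only where an owned ArrayRef is required.
fn map_children(array: &dyn Array) -> (ArrayRef, ArrayRef) {
    let map = as_map_array(array);
    assert_eq!(map.keys().len(), map.values().len());
    (map.keys().clone(), map.values().clone())
}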
#[derive(Clone)] pub struct MapArray { data: ArrayData, + /// The [`StructArray`] that is the direct child of this array + entries: ArrayRef, + /// The first child of `entries`, the "keys" of this MapArray + keys: ArrayRef, + /// The second child of `entries`, the "values" of this MapArray values: ArrayRef, + /// The start and end offsets of each entry value_offsets: RawPtrBox, } impl MapArray { /// Returns a reference to the keys of this map. - pub fn keys(&self) -> ArrayRef { - make_array(self.values.data().child_data()[0].clone()) + pub fn keys(&self) -> &ArrayRef { + &self.keys } /// Returns a reference to the values of this map. - pub fn values(&self) -> ArrayRef { - make_array(self.values.data().child_data()[1].clone()) + pub fn values(&self) -> &ArrayRef { + &self.values } /// Returns the data type of the map's keys. - pub fn key_type(&self) -> DataType { - self.values.data().child_data()[0].data_type().clone() + pub fn key_type(&self) -> &DataType { + self.keys.data_type() } /// Returns the data type of the map's values. - pub fn value_type(&self) -> DataType { - self.values.data().child_data()[1].data_type().clone() + pub fn value_type(&self) -> &DataType { + self.values.data_type() } /// Returns ith value of this map array. + /// + /// This is a [`StructArray`] containing two fields /// # Safety /// Caller must ensure that the index is within the array bounds pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef { let end = *self.value_offsets().get_unchecked(i + 1); let start = *self.value_offsets().get_unchecked(i); - self.values + self.entries .slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap()) } /// Returns ith value of this map array. + /// + /// This is a [`StructArray`] containing two fields pub fn value(&self, i: usize) -> ArrayRef { let end = self.value_offsets()[i + 1] as usize; let start = self.value_offsets()[i] as usize; - self.values.slice(start, end - start) + self.entries.slice(start, end - start) } /// Returns the offset values in the offsets buffer @@ -146,7 +156,9 @@ impl MapArray { ))); } - let values = make_array(entries); + let keys = make_array(entries.child_data()[0].clone()); + let values = make_array(entries.child_data()[1].clone()); + let entries = make_array(entries); let value_offsets = data.buffers()[0].as_ptr(); // SAFETY: @@ -159,8 +171,11 @@ impl MapArray { ))); } } + Ok(Self { data, + entries, + keys, values, value_offsets, }) @@ -241,6 +256,8 @@ impl std::fmt::Debug for MapArray { #[cfg(test)] mod tests { + use crate::cast::as_primitive_array; + use crate::types::UInt32Type; use crate::{Int32Array, UInt32Array}; use std::sync::Arc; @@ -335,9 +352,8 @@ mod tests { .unwrap(); let map_array = MapArray::from(map_data); - let values = map_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::UInt32, map_array.value_type()); + assert_eq!(&value_data, map_array.values().data()); + assert_eq!(&DataType::UInt32, map_array.value_type()); assert_eq!(3, map_array.len()); assert_eq!(0, map_array.null_count()); assert_eq!(6, map_array.value_offsets()[2]); @@ -376,9 +392,8 @@ mod tests { .unwrap(); let map_array = MapArray::from(map_data); - let values = map_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::UInt32, map_array.value_type()); + assert_eq!(&value_data, map_array.values().data()); + assert_eq!(&DataType::UInt32, map_array.value_type()); assert_eq!(2, map_array.len()); assert_eq!(0, map_array.null_count()); assert_eq!(6, map_array.value_offsets()[1]); @@ -508,12 +523,11 
@@ mod tests { ) .unwrap(); - let values = map_array.values(); assert_eq!( &values_data, - values.as_any().downcast_ref::().unwrap() + as_primitive_array::(map_array.values()) ); - assert_eq!(DataType::UInt32, map_array.value_type()); + assert_eq!(&DataType::UInt32, map_array.value_type()); assert_eq!(3, map_array.len()); assert_eq!(0, map_array.null_count()); assert_eq!(6, map_array.value_offsets()[2]); diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 3c03a486c226..cb6cd907c77a 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -49,8 +49,8 @@ use std::sync::Arc; /// /// let array = builder.finish(); /// assert_eq!(array.value_offsets(), &[0, 1, 3, 3, 3]); -/// assert_eq!(*array.values(), Int32Array::from(vec![1, 2, 4])); -/// assert_eq!(*array.keys(), StringArray::from(vec!["joe", "blogs", "foo"])); +/// assert_eq!(array.values().as_ref(), &Int32Array::from(vec![1, 2, 4])); +/// assert_eq!(array.keys().as_ref(), &StringArray::from(vec!["joe", "blogs", "foo"])); /// /// ``` #[derive(Debug)] diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index ea6eb360e579..1879dde08b89 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -245,7 +245,7 @@ impl IpcDataGenerator { .expect("Unable to downcast to fixed size list array"); self.encode_dictionaries( field, - &list.values(), + list.values(), encoded_dictionaries, dictionary_tracker, write_options, @@ -264,7 +264,7 @@ impl IpcDataGenerator { // keys self.encode_dictionaries( keys, - &map_array.keys(), + map_array.keys(), encoded_dictionaries, dictionary_tracker, write_options, @@ -273,7 +273,7 @@ impl IpcDataGenerator { // values self.encode_dictionaries( values, - &map_array.values(), + map_array.values(), encoded_dictionaries, dictionary_tracker, write_options, diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index fa7db4b862e9..d2425a3d58a9 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -415,8 +415,8 @@ fn set_column_for_json_rows( ))); } - let keys = as_string_array(&keys); - let values = array_to_json_array(&values)?; + let keys = as_string_array(keys); + let values = array_to_json_array(values)?; let mut kv = keys.iter().zip(values.into_iter()); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 87b4ebc2b080..9235706d5c38 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -333,8 +333,8 @@ fn write_leaves( .as_any() .downcast_ref::() .expect("Unable to get map array"); - keys.push(map_array.keys()); - values.push(map_array.values()); + keys.push(map_array.keys().clone()); + values.push(map_array.values().clone()); } write_leaves(row_group_writer, &keys, levels)?; From 79bda7d361579cda88fec2eb9b8793ad7f653442 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 3 Feb 2023 11:38:09 +0000 Subject: [PATCH 0572/1411] Handle non-contiguous type_ids in UnionArray (#3653) (#3654) --- arrow-array/src/array/union_array.rs | 111 +++++++++++++++++++++++---- arrow-ipc/src/writer.rs | 9 +-- 2 files changed, 98 insertions(+), 22 deletions(-) diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 5870952d7f75..f215fb0def9a 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -107,7 +107,7 @@ use std::any::Any; #[derive(Clone)] pub struct UnionArray { data: 
ArrayData, - boxed_fields: Vec, + boxed_fields: Vec>, } impl UnionArray { @@ -229,9 +229,8 @@ impl UnionArray { /// Panics if the `type_id` provided is less than zero or greater than the number of types /// in the `Union`. pub fn child(&self, type_id: i8) -> &ArrayRef { - assert!(0 <= type_id); - assert!((type_id as usize) < self.boxed_fields.len()); - &self.boxed_fields[type_id as usize] + let boxed = &self.boxed_fields[type_id as usize]; + boxed.as_ref().expect("invalid type id") } /// Returns the `type_id` for the array slot at `index`. @@ -264,8 +263,8 @@ impl UnionArray { pub fn value(&self, i: usize) -> ArrayRef { let type_id = self.type_id(i); let value_offset = self.value_offset(i) as usize; - let child_data = self.boxed_fields[type_id as usize].clone(); - child_data.slice(value_offset, 1) + let child = self.child(type_id); + child.slice(value_offset, 1) } /// Returns the names of the types in the union. @@ -290,9 +289,14 @@ impl UnionArray { impl From for UnionArray { fn from(data: ArrayData) -> Self { - let mut boxed_fields = vec![]; - for cd in data.child_data() { - boxed_fields.push(make_array(cd.clone())); + let field_ids = match data.data_type() { + DataType::Union(_, ids, _) => ids, + d => panic!("UnionArray expected ArrayData with type Union got {d}"), + }; + let max_id = field_ids.iter().copied().max().unwrap_or_default() as usize; + let mut boxed_fields = vec![None; max_id + 1]; + for (cd, field_id) in data.child_data().iter().zip(field_ids) { + boxed_fields[*field_id as usize] = Some(make_array(cd.clone())); } Self { data, boxed_fields } } @@ -348,21 +352,27 @@ impl std::fmt::Debug for UnionArray { writeln!(f, "-- type id buffer:")?; writeln!(f, "{:?}", self.data().buffers()[0])?; - if self.is_dense() { + let (fields, ids, mode) = match self.data_type() { + DataType::Union(f, ids, mode) => (f, ids, mode), + _ => unreachable!(), + }; + + if mode == &UnionMode::Dense { writeln!(f, "-- offsets buffer:")?; writeln!(f, "{:?}", self.data().buffers()[1])?; } - for (child_index, name) in self.type_names().iter().enumerate() { - let column = &self.boxed_fields[child_index]; + assert_eq!(fields.len(), ids.len()); + for (field, type_id) in fields.iter().zip(ids) { + let child = self.child(*type_id); writeln!( f, "-- child {}: \"{}\" ({:?})", - child_index, - *name, - column.data_type() + type_id, + field.name(), + field.data_type() )?; - std::fmt::Debug::fmt(column, f)?; + std::fmt::Debug::fmt(child, f)?; writeln!(f)?; } writeln!(f, "]") @@ -374,6 +384,7 @@ mod tests { use super::*; use crate::builder::UnionBuilder; + use crate::cast::{as_primitive_array, as_string_array}; use crate::types::{Float32Type, Float64Type, Int32Type, Int64Type}; use crate::RecordBatch; use crate::{Float64Array, Int32Array, Int64Array, StringArray}; @@ -1017,4 +1028,72 @@ mod tests { let record_batch_slice = record_batch.slice(1, 3); test_slice_union(record_batch_slice); } + + #[test] + fn test_custom_type_ids() { + let data_type = DataType::Union( + vec![ + Field::new("strings", DataType::Utf8, false), + Field::new("integers", DataType::Int32, false), + Field::new("floats", DataType::Float64, false), + ], + vec![8, 4, 9], + UnionMode::Dense, + ); + + let string_array = StringArray::from(vec!["foo", "bar", "baz"]); + let int_array = Int32Array::from(vec![5, 6, 4]); + let float_array = Float64Array::from(vec![10.0]); + + let type_ids = Buffer::from_iter([4_i8, 8, 4, 8, 9, 4, 8]); + let value_offsets = Buffer::from_iter([0_i32, 0, 1, 1, 0, 2, 2]); + + let data = ArrayData::builder(data_type) + .len(7) + 
.buffers(vec![type_ids, value_offsets]) + .child_data(vec![ + string_array.into_data(), + int_array.into_data(), + float_array.into_data(), + ]) + .build() + .unwrap(); + + let array = UnionArray::from(data); + + let v = array.value(0); + assert_eq!(v.data_type(), &DataType::Int32); + assert_eq!(v.len(), 1); + assert_eq!(as_primitive_array::(v.as_ref()).value(0), 5); + + let v = array.value(1); + assert_eq!(v.data_type(), &DataType::Utf8); + assert_eq!(v.len(), 1); + assert_eq!(as_string_array(v.as_ref()).value(0), "foo"); + + let v = array.value(2); + assert_eq!(v.data_type(), &DataType::Int32); + assert_eq!(v.len(), 1); + assert_eq!(as_primitive_array::(v.as_ref()).value(0), 6); + + let v = array.value(3); + assert_eq!(v.data_type(), &DataType::Utf8); + assert_eq!(v.len(), 1); + assert_eq!(as_string_array(v.as_ref()).value(0), "bar"); + + let v = array.value(4); + assert_eq!(v.data_type(), &DataType::Float64); + assert_eq!(v.len(), 1); + assert_eq!(as_primitive_array::(v.as_ref()).value(0), 10.0); + + let v = array.value(5); + assert_eq!(v.data_type(), &DataType::Int32); + assert_eq!(v.len(), 1); + assert_eq!(as_primitive_array::(v.as_ref()).value(0), 4); + + let v = array.value(6); + assert_eq!(v.data_type(), &DataType::Utf8); + assert_eq!(v.len(), 1); + assert_eq!(as_string_array(v.as_ref()).value(0), "baz"); + } } diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 1879dde08b89..8835cb49ffce 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -279,13 +279,10 @@ impl IpcDataGenerator { write_options, )?; } - DataType::Union(fields, _, _) => { + DataType::Union(fields, type_ids, _) => { let union = as_union_array(column); - for (field, column) in fields - .iter() - .enumerate() - .map(|(n, f)| (f, union.child(n as i8))) - { + for (field, type_id) in fields.iter().zip(type_ids) { + let column = union.child(*type_id); self.encode_dictionaries( field, column, From 25e10ddf0b8b65f0d73142717e7c22ab150d6870 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Sat, 4 Feb 2023 04:24:56 -0500 Subject: [PATCH 0573/1411] Add ArrayAccessor, Iterator, Extend and benchmarks for RunArray (#3603) * Add ArrayAccessor, Iterator, Extend and benchmarks for RunArray * fix clippy issues * minor fix * incorporate pr suggestions * formatting fix * fix clippy issues --------- Co-authored-by: ask --- arrow-array/src/array/run_array.rs | 226 ++++++++++++++- .../src/builder/generic_byte_run_builder.rs | 53 +++- .../src/builder/primitive_run_builder.rs | 31 ++ arrow-array/src/lib.rs | 1 + arrow-array/src/run_iterator.rs | 273 ++++++++++++++++++ arrow/Cargo.toml | 4 + arrow/benches/string_run_builder.rs | 80 +++++ 7 files changed, 656 insertions(+), 12 deletions(-) create mode 100644 arrow-array/src/run_iterator.rs create mode 100644 arrow/benches/string_run_builder.rs diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 48c4896b695c..8cc1f676b6dc 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -24,8 +24,9 @@ use arrow_schema::{ArrowError, DataType, Field}; use crate::{ builder::StringRunBuilder, make_array, + run_iterator::RunArrayIter, types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}, - Array, ArrayRef, PrimitiveArray, + Array, ArrayAccessor, ArrayRef, PrimitiveArray, }; /// @@ -121,6 +122,27 @@ impl RunArray { pub fn values(&self) -> &ArrayRef { &self.values } + + /// Downcast this [`RunArray`] to a [`TypedRunArray`] + /// + /// ``` + /// use 
arrow_array::{Array, ArrayAccessor, RunArray, StringArray, types::Int32Type}; + /// + /// let orig = [Some("a"), Some("b"), None]; + /// let run_array = RunArray::::from_iter(orig); + /// let typed = run_array.downcast::().unwrap(); + /// assert_eq!(typed.value(0), "a"); + /// assert_eq!(typed.value(1), "b"); + /// assert!(typed.values().is_null(2)); + /// ``` + /// + pub fn downcast(&self) -> Option> { + let values = self.values.as_any().downcast_ref()?; + Some(TypedRunArray { + run_array: self, + values, + }) + } } impl From for RunArray { @@ -274,15 +296,195 @@ pub type Int32RunArray = RunArray; /// ``` pub type Int64RunArray = RunArray; +/// A strongly-typed wrapper around a [`RunArray`] that implements [`ArrayAccessor`] +/// and [`IntoIterator`] allowing fast access to its elements +/// +/// ``` +/// use arrow_array::{RunArray, StringArray, types::Int32Type}; +/// +/// let orig = ["a", "b", "a", "b"]; +/// let ree_array = RunArray::::from_iter(orig); +/// +/// // `TypedRunArray` allows you to access the values directly +/// let typed = ree_array.downcast::().unwrap(); +/// +/// for (maybe_val, orig) in typed.into_iter().zip(orig) { +/// assert_eq!(maybe_val.unwrap(), orig) +/// } +/// ``` +pub struct TypedRunArray<'a, R: RunEndIndexType, V> { + /// The run array + run_array: &'a RunArray, + + /// The values of the run_array + values: &'a V, +} + +// Manually implement `Clone` to avoid `V: Clone` type constraint +impl<'a, R: RunEndIndexType, V> Clone for TypedRunArray<'a, R, V> { + fn clone(&self) -> Self { + Self { + run_array: self.run_array, + values: self.values, + } + } +} + +impl<'a, R: RunEndIndexType, V> Copy for TypedRunArray<'a, R, V> {} + +impl<'a, R: RunEndIndexType, V> std::fmt::Debug for TypedRunArray<'a, R, V> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + writeln!(f, "TypedRunArray({:?})", self.run_array) + } +} + +impl<'a, R: RunEndIndexType, V> TypedRunArray<'a, R, V> { + /// Returns the run_ends of this [`TypedRunArray`] + pub fn run_ends(&self) -> &'a PrimitiveArray { + self.run_array.run_ends() + } + + /// Returns the values of this [`TypedRunArray`] + pub fn values(&self) -> &'a V { + self.values + } + + /// Returns index to the physcial array for the given index to the logical array. + /// Performs a binary search on the run_ends array for the input index. + #[inline] + pub fn get_physical_index(&self, logical_index: usize) -> Option { + if logical_index >= self.run_array.len() { + return None; + } + let mut st: usize = 0; + let mut en: usize = self.run_ends().len(); + while st + 1 < en { + let mid: usize = (st + en) / 2; + if logical_index + < unsafe { + // Safety: + // The value of mid will always be between 1 and len - 1, + // where len is length of run ends array. + // This is based on the fact that `st` starts with 0 and + // `en` starts with len. The condition `st + 1 < en` ensures + // `st` and `en` differs atleast by two. So the value of `mid` + // will never be either `st` or `en` + self.run_ends().value_unchecked(mid - 1).as_usize() + } + { + en = mid + } else { + st = mid + } + } + Some(st) + } +} + +impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { + fn as_any(&self) -> &dyn Any { + self.run_array + } + + fn data(&self) -> &ArrayData { + &self.run_array.data + } + + fn into_data(self) -> ArrayData { + self.run_array.into_data() + } +} + +// Array accessor converts the index of logical array to the index of the physical array +// using binary search. 
The time complexity is O(log N) where N is number of runs. +impl<'a, R, V> ArrayAccessor for TypedRunArray<'a, R, V> +where + R: RunEndIndexType, + V: Sync + Send, + &'a V: ArrayAccessor, + <&'a V as ArrayAccessor>::Item: Default, +{ + type Item = <&'a V as ArrayAccessor>::Item; + + fn value(&self, logical_index: usize) -> Self::Item { + assert!( + logical_index < self.len(), + "Trying to access an element at index {} from a TypedRunArray of length {}", + logical_index, + self.len() + ); + unsafe { self.value_unchecked(logical_index) } + } + + unsafe fn value_unchecked(&self, logical_index: usize) -> Self::Item { + let physical_index = self.get_physical_index(logical_index).unwrap(); + self.values().value_unchecked(physical_index) + } +} + +impl<'a, R, V> IntoIterator for TypedRunArray<'a, R, V> +where + R: RunEndIndexType, + V: Sync + Send, + &'a V: ArrayAccessor, + <&'a V as ArrayAccessor>::Item: Default, +{ + type Item = Option<<&'a V as ArrayAccessor>::Item>; + type IntoIter = RunArrayIter<'a, R, V>; + + fn into_iter(self) -> Self::IntoIter { + RunArrayIter::new(self) + } +} + #[cfg(test)] mod tests { use std::sync::Arc; + use rand::seq::SliceRandom; + use rand::thread_rng; + use rand::Rng; + use super::*; use crate::builder::PrimitiveRunBuilder; use crate::types::{Int16Type, Int32Type, Int8Type, UInt32Type}; use crate::{Array, Int16Array, Int32Array, StringArray}; + fn build_input_array(approx_size: usize) -> Vec> { + // The input array is created by shuffling and repeating + // the seed values random number of times. + let mut seed: Vec> = vec![ + None, + None, + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + ]; + let mut ix = 0; + let mut result: Vec> = Vec::with_capacity(approx_size); + let mut rng = thread_rng(); + while result.len() < approx_size { + // shuffle the seed array if all the values are iterated. + if ix == 0 { + seed.shuffle(&mut rng); + } + // repeat the items between 1 and 7 times. 
+ let num = rand::thread_rng().gen_range(1..8); + for _ in 0..num { + result.push(seed[ix]); + } + ix += 1; + if ix == 8 { + ix = 0 + } + } + println!("Size of input array: {}", result.len()); + result + } + #[test] fn test_run_array() { // Construct a value array @@ -504,4 +706,26 @@ mod tests { let a = RunArray::::from_iter(["32"]); let _ = RunArray::::from(a.into_data()); } + + #[test] + fn test_ree_array_accessor() { + let input_array = build_input_array(256); + + // Encode the input_array to ree_array + let mut builder = + PrimitiveRunBuilder::::with_capacity(input_array.len()); + builder.extend(input_array.iter().copied()); + let run_array = builder.finish(); + let typed = run_array.downcast::>().unwrap(); + + for (i, inp_val) in input_array.iter().enumerate() { + if let Some(val) = inp_val { + let actual = typed.value(i); + assert_eq!(*val, actual) + } else { + let physical_ix = typed.get_physical_index(i).unwrap(); + assert!(typed.values().is_null(physical_ix)); + }; + } + } } diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index c1ecbcb5ddec..c6dbb82ff6eb 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -44,15 +44,14 @@ use arrow_buffer::ArrowNativeType; /// /// let mut builder = /// GenericByteRunBuilder::::new(); -/// builder.append_value(b"abc"); -/// builder.append_value(b"abc"); -/// builder.append_null(); +/// builder.extend([Some(b"abc"), Some(b"abc"), None, Some(b"def")].into_iter()); /// builder.append_value(b"def"); +/// builder.append_null(); /// let array = builder.finish(); /// /// assert_eq!( /// array.run_ends(), -/// &Int16Array::from(vec![Some(2), Some(3), Some(4)]) +/// &Int16Array::from(vec![Some(2), Some(3), Some(5), Some(6)]) /// ); /// /// let av = array.values(); @@ -60,6 +59,7 @@ use arrow_buffer::ArrowNativeType; /// assert!(!av.is_null(0)); /// assert!(av.is_null(1)); /// assert!(!av.is_null(2)); +/// assert!(av.is_null(3)); /// /// // Values are polymorphic and so require a downcast. /// let ava: &BinaryArray = as_generic_binary_array(av.as_ref()); @@ -299,6 +299,19 @@ where } } +impl Extend> for GenericByteRunBuilder +where + R: RunEndIndexType, + V: ByteArrayType, + S: AsRef, +{ + fn extend>>(&mut self, iter: T) { + for elem in iter { + self.append_option(elem); + } + } +} + /// Array builder for [`RunArray`] that encodes strings ([`Utf8Type`]). 
/// /// ``` @@ -315,9 +328,7 @@ where /// // The builder builds the dictionary value by value /// builder.append_value("abc"); /// builder.append_null(); -/// builder.append_value("def"); -/// builder.append_value("def"); -/// builder.append_value("abc"); +/// builder.extend([Some("def"), Some("def"), Some("abc")]); /// let array = builder.finish(); /// /// assert_eq!( @@ -356,9 +367,7 @@ pub type LargeStringRunBuilder = GenericByteRunBuilder; /// // The builder builds the dictionary value by value /// builder.append_value(b"abc"); /// builder.append_null(); -/// builder.append_value(b"def"); -/// builder.append_value(b"def"); -/// builder.append_value(b"abc"); +/// builder.extend([Some(b"def"), Some(b"def"), Some(b"abc")]); /// let array = builder.finish(); /// /// assert_eq!( @@ -387,7 +396,9 @@ mod tests { use super::*; use crate::array::Array; - use crate::types::Int16Type; + use crate::cast::as_primitive_array; + use crate::cast::as_string_array; + use crate::types::{Int16Type, Int32Type}; use crate::GenericByteArray; use crate::Int16Array; use crate::Int16RunArray; @@ -516,4 +527,24 @@ mod tests { fn test_binary_run_buider_finish_cloned() { test_bytes_run_buider_finish_cloned::(vec![b"abc", b"def", b"ghi"]); } + + #[test] + fn test_extend() { + let mut builder = StringRunBuilder::::new(); + builder.extend(["a", "a", "a", "", "", "b", "b"].into_iter().map(Some)); + builder.extend(["b", "cupcakes", "cupcakes"].into_iter().map(Some)); + let array = builder.finish(); + + assert_eq!(array.len(), 10); + assert_eq!( + as_primitive_array::(array.run_ends()).values(), + &[3, 5, 8, 10] + ); + + let str_array = as_string_array(array.values().as_ref()); + assert_eq!(str_array.value(0), "a"); + assert_eq!(str_array.value(1), ""); + assert_eq!(str_array.value(2), "b"); + assert_eq!(str_array.value(3), "cupcakes"); + } } diff --git a/arrow-array/src/builder/primitive_run_builder.rs b/arrow-array/src/builder/primitive_run_builder.rs index 82c46abfa053..41066228390d 100644 --- a/arrow-array/src/builder/primitive_run_builder.rs +++ b/arrow-array/src/builder/primitive_run_builder.rs @@ -253,6 +253,18 @@ where } } +impl Extend> for PrimitiveRunBuilder +where + R: RunEndIndexType, + V: ArrowPrimitiveType, +{ + fn extend>>(&mut self, iter: T) { + for elem in iter { + self.append_option(elem); + } + } +} + #[cfg(test)] mod tests { use crate::builder::PrimitiveRunBuilder; @@ -291,4 +303,23 @@ mod tests { assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); } + + #[test] + fn test_extend() { + let mut builder = PrimitiveRunBuilder::::new(); + builder.extend([1, 2, 2, 5, 5, 4, 4].into_iter().map(Some)); + builder.extend([4, 4, 6, 2].into_iter().map(Some)); + let array = builder.finish(); + + assert_eq!(array.len(), 11); + assert_eq!(array.null_count(), 0); + assert_eq!( + as_primitive_array::(array.run_ends()).values(), + &[1, 3, 5, 9, 10, 11] + ); + assert_eq!( + as_primitive_array::(array.values().as_ref()).values(), + &[1, 2, 5, 4, 6, 2] + ); + } } diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index d6a9ab30b85b..d8dc6efe25be 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -178,6 +178,7 @@ pub mod cast; mod delta; pub mod iterator; mod raw_pointer; +pub mod run_iterator; pub mod temporal_conversions; pub mod timezone; mod trusted_len; diff --git a/arrow-array/src/run_iterator.rs b/arrow-array/src/run_iterator.rs new file mode 100644 index 000000000000..6a7b785fe1c6 --- /dev/null +++ b/arrow-array/src/run_iterator.rs @@ -0,0 +1,273 @@ +// Licensed to 
the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Idiomatic iterator for [`RunArray`](crate::Array) + +use arrow_buffer::ArrowNativeType; + +use crate::{array::ArrayAccessor, types::RunEndIndexType, Array, TypedRunArray}; + +/// The [`RunArrayIter`] provides an idiomatic way to iterate over the run array. +/// It returns Some(T) if there is a value or None if the value is null. +/// +/// The iterator comes with a cost as it has to iterate over three arrays to determine +/// the value to be returned. The run_ends array is used to determine the index of the value. +/// The nulls array is used to determine if the value is null and the values array is used to +/// get the value. +/// +/// Unlike other iterators in this crate, [`RunArrayIter`] does not use [`ArrayAccessor`] +/// because the run array accessor does binary search to access each value which is too slow. +/// The run array iterator can determine the next value in constant time. +/// +#[derive(Debug)] +pub struct RunArrayIter<'a, R, V> +where + R: RunEndIndexType, + V: Sync + Send, + &'a V: ArrayAccessor, + <&'a V as ArrayAccessor>::Item: Default, +{ + array: TypedRunArray<'a, R, V>, + current_logical: usize, + current_physical: usize, + current_end_logical: usize, + current_end_physical: usize, +} + +impl<'a, R, V> RunArrayIter<'a, R, V> +where + R: RunEndIndexType, + V: Sync + Send, + &'a V: ArrayAccessor, + <&'a V as ArrayAccessor>::Item: Default, +{ + /// create a new iterator + pub fn new(array: TypedRunArray<'a, R, V>) -> Self { + let logical_len = array.len(); + let physical_len: usize = array.values().len(); + RunArrayIter { + array, + current_logical: 0, + current_physical: 0, + current_end_logical: logical_len, + current_end_physical: physical_len, + } + } +} + +impl<'a, R, V> Iterator for RunArrayIter<'a, R, V> +where + R: RunEndIndexType, + V: Sync + Send, + &'a V: ArrayAccessor, + <&'a V as ArrayAccessor>::Item: Default, +{ + type Item = Option<<&'a V as ArrayAccessor>::Item>; + + #[inline] + fn next(&mut self) -> Option { + if self.current_logical == self.current_end_logical { + return None; + } + // If current logical index is greater than current run end index then increment + // the physical index. + if self.current_logical + >= self + .array + .run_ends() + .value(self.current_physical) + .as_usize() + { + // As the run_ends is expected to be strictly increasing, there + // should be at least one logical entry in one physical entry. Because of this + // reason the next value can be accessed by incrementing physical index once. 
+ self.current_physical += 1; + } + if self.array.values().is_null(self.current_physical) { + self.current_logical += 1; + Some(None) + } else { + self.current_logical += 1; + // Safety: + // The self.current_physical is kept within bounds of self.current_logical. + // The self.current_logical will not go out of bounds because of the check + // `self.current_logical = self.current_end_logical` above. + unsafe { + Some(Some( + self.array.values().value_unchecked(self.current_physical), + )) + } + } + } + + fn size_hint(&self) -> (usize, Option) { + ( + self.current_end_logical - self.current_logical, + Some(self.current_end_logical - self.current_logical), + ) + } +} + +impl<'a, R, V> DoubleEndedIterator for RunArrayIter<'a, R, V> +where + R: RunEndIndexType, + V: Sync + Send, + &'a V: ArrayAccessor, + <&'a V as ArrayAccessor>::Item: Default, +{ + fn next_back(&mut self) -> Option { + if self.current_end_logical == self.current_logical { + return None; + } + + self.current_end_logical -= 1; + + if self.current_end_physical > 0 + && self.current_end_logical + < self + .array + .run_ends() + .value(self.current_end_physical - 1) + .as_usize() + { + // As the run_ends is expected to be strictly increasing, there + // should be at least one logical entry in one physical entry. Because of this + // reason the next value can be accessed by decrementing physical index once. + self.current_end_physical -= 1; + } + + Some(if self.array.values().is_null(self.current_end_physical) { + None + } else { + // Safety: + // The check `self.current_end_physical > 0` ensures the value will not underflow. + // Also self.current_end_physical starts with array.len() and + // decrements based on the bounds of self.current_end_logical. + unsafe { + Some( + self.array + .values() + .value_unchecked(self.current_end_physical), + ) + } + }) + } +} + +/// all arrays have known size. 
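For illustration, not part of the patch: together with the `ExactSizeIterator` impl that follows, the iterator defined above lets a run array be consumed like any other array iterator. A small sketch that reuses the `downcast` API from this changeset and assumes the same `FromIterator<&str>` construction used in the tests below:

use arrow_array::{types::Int32Type, RunArray, StringArray};

fn iterate_runs() {
    let run_array: RunArray<Int32Type> = vec!["a", "a", "b"].into_iter().collect();
    let typed = run_array.downcast::<StringArray>().unwrap();
    assert_eq!(typed.into_iter().len(), 3); // ExactSizeIterator
    assert_eq!(typed.into_iter().next_back(), Some(Some("b"))); // DoubleEndedIterator
    assert_eq!(
        typed.into_iter().collect::<Vec<_>>(),
        vec![Some("a"), Some("a"), Some("b")]
    );
}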
+impl<'a, R, V> ExactSizeIterator for RunArrayIter<'a, R, V> +where + R: RunEndIndexType, + V: Sync + Send, + &'a V: ArrayAccessor, + <&'a V as ArrayAccessor>::Item: Default, +{ +} + +#[cfg(test)] +mod tests { + use crate::{ + array::{Int32Array, StringArray}, + builder::PrimitiveRunBuilder, + types::Int32Type, + Int64RunArray, + }; + + #[test] + fn test_primitive_array_iter_round_trip() { + let mut input_vec = vec![ + Some(32), + Some(32), + None, + Some(64), + Some(64), + Some(64), + Some(72), + ]; + let mut builder = PrimitiveRunBuilder::::new(); + builder.extend(input_vec.clone().into_iter()); + let ree_array = builder.finish(); + let ree_array = ree_array.downcast::().unwrap(); + + let output_vec: Vec> = ree_array.into_iter().collect(); + assert_eq!(input_vec, output_vec); + + let rev_output_vec: Vec> = ree_array.into_iter().rev().collect(); + input_vec.reverse(); + assert_eq!(input_vec, rev_output_vec); + } + + #[test] + fn test_double_ended() { + let input_vec = vec![ + Some(32), + Some(32), + None, + Some(64), + Some(64), + Some(64), + Some(72), + ]; + let mut builder = PrimitiveRunBuilder::::new(); + builder.extend(input_vec.into_iter()); + let ree_array = builder.finish(); + let ree_array = ree_array.downcast::().unwrap(); + + let mut iter = ree_array.into_iter(); + assert_eq!(Some(Some(32)), iter.next()); + assert_eq!(Some(Some(72)), iter.next_back()); + assert_eq!(Some(Some(32)), iter.next()); + assert_eq!(Some(Some(64)), iter.next_back()); + assert_eq!(Some(None), iter.next()); + assert_eq!(Some(Some(64)), iter.next_back()); + assert_eq!(Some(Some(64)), iter.next()); + assert_eq!(None, iter.next_back()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_string_array_iter_round_trip() { + let input_vec = vec!["ab", "ab", "ba", "cc", "cc"]; + let input_ree_array: Int64RunArray = input_vec.into_iter().collect(); + let string_ree_array = input_ree_array.downcast::().unwrap(); + + // to and from iter, with a +1 + let result: Vec> = string_ree_array + .into_iter() + .map(|e| { + e.map(|e| { + let mut a = e.to_string(); + a.push('b'); + a + }) + }) + .collect(); + + let result_asref: Vec> = + result.iter().map(|f| f.as_deref()).collect(); + + let expected_vec = vec![ + Some("abb"), + Some("abb"), + Some("bab"), + Some("ccb"), + Some("ccb"), + ]; + + assert_eq!(expected_vec, result_asref); + } +} diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 6de513df3653..57e3907a2fe9 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -237,6 +237,10 @@ required-features = ["test_utils"] name = "string_dictionary_builder" harness = false +[[bench]] +name = "string_run_builder" +harness = false + [[bench]] name = "substring_kernels" harness = false diff --git a/arrow/benches/string_run_builder.rs b/arrow/benches/string_run_builder.rs new file mode 100644 index 000000000000..2f0401bbef48 --- /dev/null +++ b/arrow/benches/string_run_builder.rs @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::StringRunBuilder; +use arrow::datatypes::Int32Type; +use criterion::{criterion_group, criterion_main, Criterion}; +use rand::{thread_rng, Rng}; + +fn build_strings( + physical_array_len: usize, + logical_array_len: usize, + string_len: usize, +) -> Vec { + let mut rng = thread_rng(); + let run_len = logical_array_len / physical_array_len; + let mut values: Vec = (0..physical_array_len) + .map(|_| (0..string_len).map(|_| rng.gen::()).collect()) + .flat_map(|s| std::iter::repeat(s).take(run_len)) + .collect(); + while values.len() < logical_array_len { + let last_val = values[values.len() - 1].clone(); + values.push(last_val); + } + values +} + +fn criterion_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("string_run_builder"); + + let mut do_bench = |physical_array_len: usize, + logical_array_len: usize, + string_len: usize| { + group.bench_function( + format!( + "(run_array_len:{logical_array_len}, physical_array_len:{physical_array_len}, string_len: {string_len})", + ), + |b| { + let strings = + build_strings(physical_array_len, logical_array_len, string_len); + b.iter(|| { + let mut builder = StringRunBuilder::::with_capacity( + physical_array_len, + (string_len + 1) * physical_array_len, + ); + + for val in &strings { + builder.append_value(val); + } + + builder.finish(); + }) + }, + ); + }; + + do_bench(20, 1000, 5); + do_bench(100, 1000, 5); + do_bench(100, 1000, 10); + do_bench(100, 10000, 10); + do_bench(100, 10000, 100); + + group.finish(); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From 7deb35839d55afb77370a41e3395529ddf78bf59 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 4 Feb 2023 14:12:17 +0000 Subject: [PATCH 0574/1411] Include line and field number in CSV UTF-8 error (#3656) (#3657) * Include line and field number in CSV UTF-8 error (#3656) * Additional test case --- arrow-csv/src/reader/mod.rs | 38 +++++++++++++++++++++++++++++++++ arrow-csv/src/reader/records.rs | 16 +++++++++++++- 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index cff1337dd78f..925f504495d5 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -2231,4 +2231,42 @@ mod tests { } } } + + fn err_test(csv: &[u8], expected: &str) { + let schema = Arc::new(Schema::new(vec![ + Field::new("text1", DataType::Utf8, false), + Field::new("text2", DataType::Utf8, false), + ])); + let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv)); + let b = ReaderBuilder::new() + .with_schema(schema) + .with_batch_size(2) + .build_buffered(buffer) + .unwrap(); + let err = b.collect::, _>>().unwrap_err().to_string(); + assert_eq!(err, expected) + } + + #[test] + fn test_invalid_utf8() { + err_test( + b"sdf,dsfg\ndfd,hgh\xFFue\n,sds\nFalhghse,", + "Csv error: Encountered invalid UTF-8 data for line 2 and field 2", + ); + + err_test( + b"sdf,dsfg\ndksdk,jf\nd\xFFfd,hghue\n,sds\nFalhghse,", + "Csv error: Encountered invalid UTF-8 data for line 3 and field 1", + ); + + 
err_test( + b"sdf,dsfg\ndksdk,jf\ndsdsfd,hghue\n,sds\nFalhghse,\xFF", + "Csv error: Encountered invalid UTF-8 data for line 5 and field 2", + ); + + err_test( + b"\xFFsdf,dsfg\ndksdk,jf\ndsdsfd,hghue\n,sds\nFalhghse,\xFF", + "Csv error: Encountered invalid UTF-8 data for line 1 and field 1", + ); + } } diff --git a/arrow-csv/src/reader/records.rs b/arrow-csv/src/reader/records.rs index c4da36ca4bfe..a59d02e0e2d8 100644 --- a/arrow-csv/src/reader/records.rs +++ b/arrow-csv/src/reader/records.rs @@ -193,7 +193,21 @@ impl RecordDecoder { // Need to truncate data t1o the actual amount of data read let data = std::str::from_utf8(&self.data[..self.data_len]).map_err(|e| { - ArrowError::CsvError(format!("Encountered invalid UTF-8 data: {e}")) + let valid_up_to = e.valid_up_to(); + + // We can't use binary search because of empty fields + let idx = self.offsets[..self.offsets_len] + .iter() + .rposition(|x| *x <= valid_up_to) + .unwrap(); + + let field = idx % self.num_columns + 1; + let line_offset = self.line_number - self.num_rows; + let line = line_offset + idx / self.num_columns; + + ArrowError::CsvError(format!( + "Encountered invalid UTF-8 data for line {line} and field {field}" + )) })?; let offsets = &self.offsets[..self.offsets_len]; From b7f4cbf876fb3adcbb68fa6aff7f045a9c17a6fb Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 5 Feb 2023 00:09:03 -0800 Subject: [PATCH 0575/1411] Add modulus_dyn and modulus_scalar_dyn (#3649) --- arrow-arith/src/arithmetic.rs | 106 +++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 48f0412bf8c7..0db32d575761 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -1333,6 +1333,45 @@ where }); } +/// Perform `left % right` operation on two arrays. If either left or right value is null +/// then the result is also null. If any right hand value is zero then the result of this +/// operation will be `Err(ArrowError::DivideByZero)`. +pub fn modulus_dyn(left: &dyn Array, right: &dyn Array) -> Result { + match left.data_type() { + DataType::Dictionary(_, _) => { + typed_dict_math_op!( + left, + right, + |a, b| { + if b.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(a.mod_wrapping(b)) + } + }, + math_divide_checked_op_dict + ) + } + _ => { + downcast_primitive_array!( + (left, right) => { + math_checked_divide_op(left, right, |a, b| { + if b.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(a.mod_wrapping(b)) + } + }).map(|a| Arc::new(a) as ArrayRef) + } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) + } + } +} + /// Perform `left / right` operation on two arrays. If either left or right value is null /// then the result is also null. If any right hand value is zero then the result of this /// operation will be `Err(ArrowError::DivideByZero)`. @@ -1551,6 +1590,23 @@ where Ok(unary(array, |a| a.mod_wrapping(modulo))) } +/// Modulus every value in an array by a scalar. If any value in the array is null then the +/// result is also null. If the scalar is zero then the result of this operation will be +/// `Err(ArrowError::DivideByZero)`. 
+pub fn modulus_scalar_dyn( + array: &dyn Array, + modulo: T::Native, +) -> Result +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp, +{ + if modulo.is_zero() { + return Err(ArrowError::DivideByZero); + } + unary_dyn::<_, T>(array, |value| value.mod_wrapping(modulo)) +} + /// Divide every value in an array by a scalar. If any value in the array is null then the /// result is also null. If the scalar is zero then the result of this operation will be /// `Err(ArrowError::DivideByZero)`. @@ -2170,6 +2226,14 @@ mod tests { assert_eq!(0, c.value(2)); assert_eq!(1, c.value(3)); assert_eq!(0, c.value(4)); + + let c = modulus_dyn(&a, &b).unwrap(); + let c = as_primitive_array::(&c); + assert_eq!(0, c.value(0)); + assert_eq!(3, c.value(1)); + assert_eq!(0, c.value(2)); + assert_eq!(1, c.value(3)); + assert_eq!(0, c.value(4)); } #[test] @@ -2182,6 +2246,16 @@ mod tests { modulus(&a, &b).unwrap(); } + #[test] + #[should_panic( + expected = "called `Result::unwrap()` on an `Err` value: DivideByZero" + )] + fn test_int_array_modulus_dyn_divide_by_zero() { + let a = Int32Array::from(vec![1]); + let b = Int32Array::from(vec![0]); + modulus_dyn(&a, &b).unwrap(); + } + #[test] fn test_int_array_modulus_overflow_wrapping() { let a = Int32Array::from(vec![i32::MIN]); @@ -2258,6 +2332,11 @@ mod tests { let c = modulus_scalar(&a, b).unwrap(); let expected = Int32Array::from(vec![0, 2, 0, 2, 1]); assert_eq!(c, expected); + + let c = modulus_scalar_dyn::(&a, b).unwrap(); + let c = as_primitive_array::(&c); + let expected = Int32Array::from(vec![0, 2, 0, 2, 1]); + assert_eq!(c, &expected); } #[test] @@ -2268,6 +2347,11 @@ mod tests { let actual = modulus_scalar(a, 3).unwrap(); let expected = Int32Array::from(vec![None, Some(0), Some(2), None]); assert_eq!(actual, expected); + + let actual = modulus_scalar_dyn::(a, 3).unwrap(); + let actual = as_primitive_array::(&actual); + let expected = Int32Array::from(vec![None, Some(0), Some(2), None]); + assert_eq!(actual, &expected); } #[test] @@ -2283,7 +2367,11 @@ mod tests { fn test_int_array_modulus_scalar_overflow_wrapping() { let a = Int32Array::from(vec![i32::MIN]); let result = modulus_scalar(&a, -1).unwrap(); - assert_eq!(0, result.value(0)) + assert_eq!(0, result.value(0)); + + let result = modulus_scalar_dyn::(&a, -1).unwrap(); + let result = as_primitive_array::(&result); + assert_eq!(0, result.value(0)); } #[test] @@ -2566,6 +2654,14 @@ mod tests { modulus(&a, &b).unwrap(); } + #[test] + #[should_panic(expected = "DivideByZero")] + fn test_i32_array_modulus_dyn_by_zero() { + let a = Int32Array::from(vec![15]); + let b = Int32Array::from(vec![0]); + modulus_dyn(&a, &b).unwrap(); + } + #[test] #[should_panic(expected = "DivideByZero")] fn test_f32_array_modulus_by_zero() { @@ -2574,6 +2670,14 @@ mod tests { modulus(&a, &b).unwrap(); } + #[test] + #[should_panic(expected = "DivideByZero")] + fn test_f32_array_modulus_dyn_by_zero() { + let a = Float32Array::from(vec![1.5]); + let b = Float32Array::from(vec![0.0]); + modulus_dyn(&a, &b).unwrap(); + } + #[test] fn test_f64_array_divide() { let a = Float64Array::from(vec![15.0, 15.0, 8.0]); From 4835659e5f144d3a565fb58138ee80c8cbc42106 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 5 Feb 2023 10:35:30 +0000 Subject: [PATCH 0576/1411] Patch git permissions (#3660) (#3661) --- .github/actions/setup-builder/action.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/actions/setup-builder/action.yaml 
b/.github/actions/setup-builder/action.yaml index 865ff66b9d09..aa1d1d9c14da 100644 --- a/.github/actions/setup-builder/action.yaml +++ b/.github/actions/setup-builder/action.yaml @@ -65,3 +65,7 @@ runs: - name: Enable backtraces shell: bash run: echo "RUST_BACKTRACE=1" >> $GITHUB_ENV + - name: Fixup git permissions + # https://github.com/actions/checkout/issues/766 + shell: bash + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" From 9131c30a40cfb14d3e9454eac0f9b80e000b152b Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Mon, 6 Feb 2023 06:03:02 -0500 Subject: [PATCH 0577/1411] feat: take kernel for RunArray (#3622) * Rebase to master branch * Add take_run kernel and include benchmarks for take_run and primitive run accessor. * fix ci issues * fix ci issues * fix clippy issues * fix clippy * Alternative approach to find physical indices for given logical indices * Remove unused code, refactor benchmarks * minor fixes * some refactor * add some comments * doc fixes * change benchmkar parameters * add test for run_iterator * add comments * Fix some PR suggestions * incorporte pr suggestion --------- Co-authored-by: ask Co-authored-by: Raphael Taylor-Davies --- arrow-array/src/array/run_array.rs | 195 +++++++++++++++++++----- arrow-array/src/cast.rs | 126 +++++++++++++++ arrow-array/src/run_iterator.rs | 77 +++++++++- arrow-select/src/take.rs | 98 +++++++++++- arrow/Cargo.toml | 17 +++ arrow/benches/primitive_run_accessor.rs | 57 +++++++ arrow/benches/primitive_run_take.rs | 79 ++++++++++ arrow/benches/string_run_builder.rs | 22 +-- arrow/benches/string_run_iterator.rs | 84 ++++++++++ arrow/benches/take_kernels.rs | 7 + arrow/src/util/bench_util.rs | 68 +++++++++ 11 files changed, 767 insertions(+), 63 deletions(-) create mode 100644 arrow/benches/primitive_run_accessor.rs create mode 100644 arrow/benches/primitive_run_take.rs create mode 100644 arrow/benches/string_run_iterator.rs diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 8cc1f676b6dc..2e378c90fd49 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -143,6 +143,96 @@ impl RunArray { values, }) } + + /// Returns index to the physical array for the given index to the logical array. + /// Performs a binary search on the run_ends array for the input index. + #[inline] + pub fn get_physical_index(&self, logical_index: usize) -> Option { + if logical_index >= self.len() { + return None; + } + let mut st: usize = 0; + let mut en: usize = self.run_ends().len(); + while st + 1 < en { + let mid: usize = (st + en) / 2; + if logical_index + < unsafe { + // Safety: + // The value of mid will always be between 1 and len - 1, + // where len is length of run ends array. + // This is based on the fact that `st` starts with 0 and + // `en` starts with len. The condition `st + 1 < en` ensures + // `st` and `en` differs atleast by two. So the value of `mid` + // will never be either `st` or `en` + self.run_ends().value_unchecked(mid - 1).as_usize() + } + { + en = mid + } else { + st = mid + } + } + Some(st) + } + + /// Returns the physical indices of the input logical indices. Returns error if any of the logical + /// index cannot be converted to physical index. The logical indices are sorted and iterated along + /// with run_ends array to find matching physical index. 
The approach used here was chosen over + /// finding physical index for each logical index using binary search using the function + /// `get_physical_index`. Running benchmarks on both approaches showed that the approach used here + /// scaled well for larger inputs. + /// See for more details. + #[inline] + pub fn get_physical_indices( + &self, + logical_indices: &[I], + ) -> Result, ArrowError> + where + I: ArrowNativeType, + { + let indices_len = logical_indices.len(); + + // `ordered_indices` store index into `logical_indices` and can be used + // to iterate `logical_indices` in sorted order. + let mut ordered_indices: Vec = (0..indices_len).collect(); + + // Instead of sorting `logical_idices` directly, sort the `ordered_indices` + // whose values are index of `logical_indices` + ordered_indices.sort_unstable_by(|lhs, rhs| { + logical_indices[*lhs] + .partial_cmp(&logical_indices[*rhs]) + .unwrap() + }); + + let mut physical_indices = vec![0; indices_len]; + + let mut ordered_index = 0_usize; + for (physical_index, run_end) in self.run_ends.values().iter().enumerate() { + // Get the run end index of current physical index + let run_end_value = run_end.as_usize(); + + // All the `logical_indices` that are less than current run end index + // belongs to current physical index. + while ordered_index < indices_len + && logical_indices[ordered_indices[ordered_index]].as_usize() + < run_end_value + { + physical_indices[ordered_indices[ordered_index]] = physical_index; + ordered_index += 1; + } + } + + // If there are input values >= run_ends.last_value then we'll not be able to convert + // all logical indices to physical indices. + if ordered_index < logical_indices.len() { + let logical_index = + logical_indices[ordered_indices[ordered_index]].as_usize(); + return Err(ArrowError::InvalidArgumentError(format!( + "Cannot convert all logical indices to physical indices. The logical index cannot be converted is {logical_index}.", + ))); + } + Ok(physical_indices) + } } impl From for RunArray { @@ -348,37 +438,6 @@ impl<'a, R: RunEndIndexType, V> TypedRunArray<'a, R, V> { pub fn values(&self) -> &'a V { self.values } - - /// Returns index to the physcial array for the given index to the logical array. - /// Performs a binary search on the run_ends array for the input index. - #[inline] - pub fn get_physical_index(&self, logical_index: usize) -> Option { - if logical_index >= self.run_array.len() { - return None; - } - let mut st: usize = 0; - let mut en: usize = self.run_ends().len(); - while st + 1 < en { - let mid: usize = (st + en) / 2; - if logical_index - < unsafe { - // Safety: - // The value of mid will always be between 1 and len - 1, - // where len is length of run ends array. - // This is based on the fact that `st` starts with 0 and - // `en` starts with len. The condition `st + 1 < en` ensures - // `st` and `en` differs atleast by two. 
So the value of `mid` - // will never be either `st` or `en` - self.run_ends().value_unchecked(mid - 1).as_usize() - } - { - en = mid - } else { - st = mid - } - } - Some(st) - } } impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { @@ -417,7 +476,7 @@ where } unsafe fn value_unchecked(&self, logical_index: usize) -> Self::Item { - let physical_index = self.get_physical_index(logical_index).unwrap(); + let physical_index = self.run_array.get_physical_index(logical_index).unwrap(); self.values().value_unchecked(physical_index) } } @@ -447,13 +506,15 @@ mod tests { use super::*; use crate::builder::PrimitiveRunBuilder; + use crate::cast::as_primitive_array; use crate::types::{Int16Type, Int32Type, Int8Type, UInt32Type}; use crate::{Array, Int16Array, Int32Array, StringArray}; - fn build_input_array(approx_size: usize) -> Vec> { + fn build_input_array(size: usize) -> Vec> { // The input array is created by shuffling and repeating // the seed values random number of times. let mut seed: Vec> = vec![ + None, None, None, Some(1), @@ -462,26 +523,32 @@ mod tests { Some(4), Some(5), Some(6), + Some(7), + Some(8), + Some(9), ]; + let mut result: Vec> = Vec::with_capacity(size); let mut ix = 0; - let mut result: Vec> = Vec::with_capacity(approx_size); let mut rng = thread_rng(); - while result.len() < approx_size { + // run length can go up to 8. Cap the max run length for smaller arrays to size / 2. + let max_run_length = 8_usize.min(1_usize.max(size / 2)); + while result.len() < size { // shuffle the seed array if all the values are iterated. if ix == 0 { seed.shuffle(&mut rng); } - // repeat the items between 1 and 7 times. - let num = rand::thread_rng().gen_range(1..8); + // repeat the items between 1 and 8 times. Cap the length for smaller sized arrays + let num = + max_run_length.min(rand::thread_rng().gen_range(1..=max_run_length)); for _ in 0..num { result.push(seed[ix]); } ix += 1; - if ix == 8 { + if ix == seed.len() { ix = 0 } } - println!("Size of input array: {}", result.len()); + result.resize(size, None); result } @@ -718,14 +785,62 @@ mod tests { let run_array = builder.finish(); let typed = run_array.downcast::>().unwrap(); + // Access every index and check if the value in the input array matches returned value. for (i, inp_val) in input_array.iter().enumerate() { if let Some(val) = inp_val { let actual = typed.value(i); assert_eq!(*val, actual) } else { - let physical_ix = typed.get_physical_index(i).unwrap(); + let physical_ix = run_array.get_physical_index(i).unwrap(); assert!(typed.values().is_null(physical_ix)); }; } } + + #[test] + fn test_get_physical_indices() { + // Test for logical lengths starting from 10 to 250 increasing by 10 + for logical_len in (0..250).step_by(10) { + let input_array = build_input_array(logical_len); + + // create run array using input_array + let mut builder = PrimitiveRunBuilder::::new(); + builder.extend(input_array.clone().into_iter()); + + let run_array = builder.finish(); + let physical_values_array = + as_primitive_array::(run_array.values()); + + // create an array consisting of all the indices repeated twice and shuffled. 
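For illustration, not part of the patch: a run array built from ["a", "a", "b", "c", "c", "c", "c", "c"] has run ends [2, 3, 8], so logical indices 0 and 1 resolve to physical index 0, logical index 2 to physical 1, and logical indices 3 through 7 to physical 2. A small sketch of the two lookups added in this patch, assuming the `FromIterator<&str>` construction shown elsewhere in the file:

use arrow_array::{types::Int32Type, RunArray};

fn physical_lookup() {
    let run_array: RunArray<Int32Type> =
        vec!["a", "a", "b", "c", "c", "c", "c", "c"].into_iter().collect();
    // Binary search for one logical index.
    assert_eq!(run_array.get_physical_index(2), Some(1));
    assert_eq!(run_array.get_physical_index(8), None); // past the logical length
    // Sort-and-sweep lookup for a batch of logical indices, returned in input order.
    assert_eq!(
        run_array.get_physical_indices(&[0_u32, 7, 2]).unwrap(),
        vec![0, 2, 1]
    );
}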
+ let mut logical_indices: Vec = (0_u32..(logical_len as u32)).collect(); + // add same indices once more + logical_indices.append(&mut logical_indices.clone()); + let mut rng = thread_rng(); + logical_indices.shuffle(&mut rng); + + let physical_indices = + run_array.get_physical_indices(&logical_indices).unwrap(); + + assert_eq!(logical_indices.len(), physical_indices.len()); + + // check value in logical index in the input_array matches physical index in typed_run_array + logical_indices + .iter() + .map(|f| f.as_usize()) + .zip(physical_indices.iter()) + .for_each(|(logical_ix, physical_ix)| { + let expected = input_array[logical_ix]; + match expected { + Some(val) => { + assert!(physical_values_array.is_valid(*physical_ix)); + let actual = physical_values_array.value(*physical_ix); + assert_eq!(val, actual); + } + None => { + assert!(physical_values_array.is_null(*physical_ix)) + } + }; + }); + } + } } diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 02d5432c168f..4bae4932c5f1 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -95,6 +95,53 @@ macro_rules! downcast_integer { }; } +/// Given one or more expressions evaluating to an integer [`DataType`] invokes the provided macro +/// `m` with the corresponding integer [`RunEndIndexType`], followed by any additional arguments +/// +/// ``` +/// # use arrow_array::{downcast_primitive, ArrowPrimitiveType, downcast_run_end_index}; +/// # use arrow_schema::{DataType, Field}; +/// +/// macro_rules! run_end_size_helper { +/// ($t:ty, $o:ty) => { +/// std::mem::size_of::<<$t as ArrowPrimitiveType>::Native>() as $o +/// }; +/// } +/// +/// fn run_end_index_size(t: &DataType) -> u8 { +/// match t { +/// DataType::RunEndEncoded(k, _) => downcast_run_end_index! { +/// k.data_type() => (run_end_size_helper, u8), +/// _ => unreachable!(), +/// }, +/// _ => u8::MAX, +/// } +/// } +/// +/// assert_eq!(run_end_index_size(&DataType::RunEndEncoded(Box::new(Field::new("a", DataType::Int32, false)), Box::new(Field::new("b", DataType::Utf8, true)))), 4); +/// assert_eq!(run_end_index_size(&DataType::RunEndEncoded(Box::new(Field::new("a", DataType::Int64, false)), Box::new(Field::new("b", DataType::Utf8, true)))), 8); +/// assert_eq!(run_end_index_size(&DataType::RunEndEncoded(Box::new(Field::new("a", DataType::Int16, false)), Box::new(Field::new("b", DataType::Utf8, true)))), 2); +/// ``` +/// +/// [`DataType`]: arrow_schema::DataType +#[macro_export] +macro_rules! downcast_run_end_index { + ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($($p:pat),+ => $fallback:expr $(,)*)*) => { + match ($($data_type),+) { + $crate::repeat_pat!(arrow_schema::DataType::Int16, $($data_type),+) => { + $m!($crate::types::Int16Type $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Int32, $($data_type),+) => { + $m!($crate::types::Int32Type $(, $args)*) + } + $crate::repeat_pat!(arrow_schema::DataType::Int64, $($data_type),+) => { + $m!($crate::types::Int64Type $(, $args)*) + } + $(($($p),+) => $fallback,)* + } + }; +} + /// Given one or more expressions evaluating to primitive [`DataType`] invokes the provided macro /// `m` with the corresponding [`ArrowPrimitiveType`], followed by any additional arguments /// @@ -449,6 +496,85 @@ where .expect("Unable to downcast to dictionary array") } +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`RunArray`], panic'ing on failure. 
+/// +/// # Example +/// +/// ``` +/// # use arrow_array::{ArrayRef, RunArray}; +/// # use arrow_array::cast::as_run_array; +/// # use arrow_array::types::Int32Type; +/// +/// let arr: RunArray = vec![Some("foo")].into_iter().collect(); +/// let arr: ArrayRef = std::sync::Arc::new(arr); +/// let run_array: &RunArray = as_run_array::(&arr); +/// ``` +pub fn as_run_array(arr: &dyn Array) -> &RunArray +where + T: RunEndIndexType, +{ + arr.as_any() + .downcast_ref::>() + .expect("Unable to downcast to run array") +} + +#[macro_export] +#[doc(hidden)] +macro_rules! downcast_run_array_helper { + ($t:ty, $($values:ident),+, $e:block) => {{ + $(let $values = $crate::cast::as_run_array::<$t>($values);)+ + $e + }}; +} + +/// Downcast an [`Array`] to a [`RunArray`] based on its [`DataType`], accepts +/// a number of subsequent patterns to match the data type +/// +/// ``` +/// # use arrow_array::{Array, StringArray, downcast_run_array, cast::as_string_array}; +/// # use arrow_schema::DataType; +/// +/// fn print_strings(array: &dyn Array) { +/// downcast_run_array!( +/// array => match array.values().data_type() { +/// DataType::Utf8 => { +/// for v in array.downcast::().unwrap() { +/// println!("{:?}", v); +/// } +/// } +/// t => println!("Unsupported run array value type {}", t), +/// }, +/// DataType::Utf8 => { +/// for v in as_string_array(array) { +/// println!("{:?}", v); +/// } +/// } +/// t => println!("Unsupported datatype {}", t) +/// ) +/// } +/// ``` +/// +/// [`DataType`]: arrow_schema::DataType +#[macro_export] +macro_rules! downcast_run_array { + ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { + downcast_run_array!($values => {$e} $($p => $fallback)*) + }; + + ($values:ident => $e:block $($p:pat => $fallback:expr $(,)*)*) => { + match $values.data_type() { + arrow_schema::DataType::RunEndEncoded(k, _) => { + $crate::downcast_run_end_index! { + k.data_type() => ($crate::downcast_run_array_helper, $values, $e), + k => unreachable!("unsupported run end index type: {}", k) + } + } + $($p => $fallback,)* + } + } +} + /// Force downcast of an [`Array`], such as an [`ArrayRef`] to /// [`GenericListArray`], panic'ing on failure. pub fn as_generic_list_array( diff --git a/arrow-array/src/run_iterator.rs b/arrow-array/src/run_iterator.rs index 6a7b785fe1c6..8bad85a9f1e1 100644 --- a/arrow-array/src/run_iterator.rs +++ b/arrow-array/src/run_iterator.rs @@ -149,7 +149,6 @@ where // reason the next value can be accessed by decrementing physical index once. self.current_end_physical -= 1; } - Some(if self.array.values().is_null(self.current_end_physical) { None } else { @@ -180,6 +179,8 @@ where #[cfg(test)] mod tests { + use rand::{seq::SliceRandom, thread_rng, Rng}; + use crate::{ array::{Int32Array, StringArray}, builder::PrimitiveRunBuilder, @@ -187,6 +188,48 @@ mod tests { Int64RunArray, }; + fn build_input_array(size: usize) -> Vec> { + // The input array is created by shuffling and repeating + // the seed values random number of times. + let mut seed: Vec> = vec![ + None, + None, + None, + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + ]; + let mut result: Vec> = Vec::with_capacity(size); + let mut ix = 0; + let mut rng = thread_rng(); + // run length can go up to 8. Cap the max run length for smaller arrays to size / 2. + let max_run_length = 8_usize.min(1_usize.max(size / 2)); + while result.len() < size { + // shuffle the seed array if all the values are iterated. 
+ if ix == 0 { + seed.shuffle(&mut rng); + } + // repeat the items between 1 and 8 times. Cap the length for smaller sized arrays + let num = + max_run_length.min(rand::thread_rng().gen_range(1..=max_run_length)); + for _ in 0..num { + result.push(seed[ix]); + } + ix += 1; + if ix == seed.len() { + ix = 0 + } + } + result.resize(size, None); + result + } + #[test] fn test_primitive_array_iter_round_trip() { let mut input_vec = vec![ @@ -239,6 +282,38 @@ mod tests { assert_eq!(None, iter.next()); } + #[test] + fn test_run_iterator_comprehensive() { + // Test forward and backward iterator for different array lengths. + let logical_lengths = vec![1_usize, 2, 3, 4, 15, 16, 17, 63, 64, 65]; + + for logical_len in logical_lengths { + let input_array = build_input_array(logical_len); + + let mut run_array_builder = + PrimitiveRunBuilder::::new(); + run_array_builder.extend(input_array.iter().copied()); + let run_array = run_array_builder.finish(); + let typed_array = run_array.downcast::().unwrap(); + + // test forward iterator + let mut input_iter = input_array.iter().copied(); + let mut run_array_iter = typed_array.into_iter(); + for _ in 0..logical_len { + assert_eq!(input_iter.next(), run_array_iter.next()); + } + assert_eq!(None, run_array_iter.next()); + + // test reverse iterator + let mut input_iter = input_array.iter().rev().copied(); + let mut run_array_iter = typed_array.into_iter().rev(); + for _ in 0..logical_len { + assert_eq!(input_iter.next(), run_array_iter.next()); + } + assert_eq!(None, run_array_iter.next()); + } + } + #[test] fn test_string_array_iter_round_trip() { let input_vec = vec!["ab", "ab", "ba", "cc", "cc"]; diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index d8989fa48293..f8668b56e1d6 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -19,13 +19,15 @@ use std::sync::Arc; -use arrow_array::types::*; use arrow_array::*; +use arrow_array::{builder::PrimitiveRunBuilder, types::*}; use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; -use arrow_array::cast::{as_generic_binary_array, as_largestring_array, as_string_array}; +use arrow_array::cast::{ + as_generic_binary_array, as_largestring_array, as_primitive_array, as_string_array, +}; use num::{ToPrimitive, Zero}; /// Take elements by index from [Array], creating a new [Array] from those indexes. @@ -201,6 +203,10 @@ where values => Ok(Arc::new(take_dict(values, indices)?)), t => unimplemented!("Take not supported for dictionary type {:?}", t) } + DataType::RunEndEncoded(_, _) => downcast_run_array! { + values => Ok(Arc::new(take_run(values, indices)?)), + t => unimplemented!("Take not supported for run type {:?}", t) + } DataType::Binary => { Ok(Arc::new(take_bytes(as_generic_binary_array::(values), indices)?)) } @@ -810,6 +816,72 @@ where Ok(DictionaryArray::::from(data)) } +macro_rules! primitive_run_take { + ($t:ty, $o:ty, $indices:ident, $value:ident) => { + take_primitive_run_values::<$o, $t>( + $indices, + as_primitive_array::<$t>($value.values()), + ) + }; +} + +/// `take` implementation for run arrays +/// +/// Finds physical indices for the given logical indices and builds output run array +/// by taking values in the input run array at the physical indices. +/// for e.g. 
an input `RunArray{ run_ends = [2,4,6,8], values=[1,2,1,2] }` and `indices=[2,7]`
+/// would be converted to `physical_indices=[1,3]` which will be used to build
+/// output `RunArray{ run_ends=[2], values=[2] }`
+
+fn take_run(
+    run_array: &RunArray,
+    logical_indices: &PrimitiveArray,
+) -> Result, ArrowError>
+where
+    T: RunEndIndexType,
+    T::Native: num::Num,
+    I: ArrowPrimitiveType,
+    I::Native: ToPrimitive,
+{
+    match run_array.data_type() {
+        DataType::RunEndEncoded(_, fl) => {
+            let physical_indices =
+                run_array.get_physical_indices(logical_indices.values())?;
+
+            downcast_primitive! {
+                fl.data_type() => (primitive_run_take, T, physical_indices, run_array),
+                dt => Err(ArrowError::NotYetImplemented(format!("take_run is not implemented for {dt:?}")))
+            }
+        }
+        dt => Err(ArrowError::InvalidArgumentError(format!(
+            "Expected DataType::RunEndEncoded found {dt:?}"
+        ))),
+    }
+}
+
+// Builds a `RunArray` by taking values from the given array at the given indices.
+fn take_primitive_run_values(
+    physical_indices: Vec,
+    values: &PrimitiveArray,
+) -> Result, ArrowError>
+where
+    R: RunEndIndexType,
+    V: ArrowPrimitiveType,
+{
+    let mut builder = PrimitiveRunBuilder::::new();
+    let values_len = values.len();
+    for ix in physical_indices {
+        if ix >= values_len {
+            return Err(ArrowError::InvalidArgumentError(format!("The requested index {ix} is out of bounds for values array with length {values_len}")));
+        } else if values.is_null(ix) {
+            builder.append_null()
+        } else {
+            builder.append_value(values.value(ix))
+        }
+    }
+    Ok(builder.finish())
+}
+
 /// Takes/filters a list array's inner data using the offsets of the list array.
 ///
 /// Where a list array has indices `[0,2,5,10]`, taking indices of `[2,0]` returns
@@ -2086,6 +2158,28 @@ mod tests {
         assert_eq!(null_buf.as_slice(), &[0b11111111]);
     }

+    #[test]
+    fn test_take_runs() {
+        let logical_array: Vec = vec![1_i32, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2];
+
+        let mut builder = PrimitiveRunBuilder::::new();
+        builder.extend(logical_array.into_iter().map(Some));
+        let run_array = builder.finish();
+
+        let take_indices: PrimitiveArray =
+            vec![2, 7, 10].into_iter().collect();
+
+        let take_out = take_run(&run_array, &take_indices).unwrap();
+
+        assert_eq!(take_out.len(), 3);
+
+        assert_eq!(take_out.run_ends().len(), 1);
+        assert_eq!(take_out.run_ends().value(0), 3);
+
+        let take_out_values = as_primitive_array::(take_out.values());
+        assert_eq!(take_out_values.value(0), 2);
+    }
+
     #[test]
     fn test_take_value_index_from_fixed_list() {
         let list = FixedSizeListArray::from_iter_primitive::(
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index 57e3907a2fe9..f86ec09a9ac3 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -102,6 +102,8 @@ half = { version = "2.1", default-features = false, features = ["num-traits"] }
 rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
 tempfile = { version = "3", default-features = false }
+
+
 [build-dependencies]

 [[example]]
@@ -240,6 +242,21 @@ harness = false
 [[bench]]
 name = "string_run_builder"
 harness = false
+required-features = ["test_utils"]
+
+[[bench]]
+name = "string_run_iterator"
+harness = false
+
+[[bench]]
+name = "primitive_run_accessor"
+harness = false
+required-features = ["test_utils"]
+
+[[bench]]
+name = "primitive_run_take"
+harness = false
+required-features = ["test_utils"]

 [[bench]]
 name = "substring_kernels"
 harness = false
diff --git a/arrow/benches/primitive_run_accessor.rs b/arrow/benches/primitive_run_accessor.rs
new file mode 100644
index 000000000000..868c314f9716
--- /dev/null +++ b/arrow/benches/primitive_run_accessor.rs @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::datatypes::Int32Type; +use arrow::{array::PrimitiveArray, util::bench_util::create_primitive_run_array}; +use arrow_array::ArrayAccessor; +use criterion::{criterion_group, criterion_main, Criterion}; + +fn criterion_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("primitive_run_accessor"); + + let mut do_bench = |physical_array_len: usize, logical_array_len: usize| { + group.bench_function( + format!( + "(run_array_len:{logical_array_len}, physical_array_len:{physical_array_len})"), + |b| { + let run_array = create_primitive_run_array::( + logical_array_len, + physical_array_len, + ); + let typed = run_array + .downcast::>() + .unwrap(); + b.iter(|| { + for i in 0..logical_array_len { + let _ = unsafe { typed.value_unchecked(i) }; + } + }) + }, + ); + }; + + do_bench(128, 512); + do_bench(256, 1024); + do_bench(512, 2048); + do_bench(1024, 4096); + do_bench(2048, 8192); + + group.finish(); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow/benches/primitive_run_take.rs b/arrow/benches/primitive_run_take.rs new file mode 100644 index 000000000000..8c9a3fd04b7a --- /dev/null +++ b/arrow/benches/primitive_run_take.rs @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::array::UInt32Builder; +use arrow::compute::take; +use arrow::datatypes::{Int32Type, Int64Type}; +use arrow::util::bench_util::*; +use arrow::util::test_util::seedable_rng; +use arrow_array::UInt32Array; +use criterion::{criterion_group, criterion_main, Criterion}; +use rand::Rng; + +fn create_random_index(size: usize, null_density: f32) -> UInt32Array { + let mut rng = seedable_rng(); + let mut builder = UInt32Builder::with_capacity(size); + for _ in 0..size { + if rng.gen::() < null_density { + builder.append_null(); + } else { + let value = rng.gen_range::(0u32..size as u32); + builder.append_value(value); + } + } + builder.finish() +} + +fn criterion_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("primitive_run_take"); + + let mut do_bench = |physical_array_len: usize, + logical_array_len: usize, + take_len: usize| { + let run_array = create_primitive_run_array::( + logical_array_len, + physical_array_len, + ); + let indices = create_random_index(take_len, 0.0); + group.bench_function( + format!( + "(run_array_len:{logical_array_len}, physical_array_len:{physical_array_len}, take_len:{take_len})"), + |b| { + b.iter(|| { + criterion::black_box(take(&run_array, &indices, None).unwrap()); + }) + }, + ); + }; + + do_bench(64, 512, 512); + do_bench(128, 512, 512); + + do_bench(256, 1024, 512); + do_bench(256, 1024, 1024); + + do_bench(512, 2048, 512); + do_bench(512, 2048, 1024); + + do_bench(1024, 4096, 512); + do_bench(1024, 4096, 1024); + + group.finish(); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow/benches/string_run_builder.rs b/arrow/benches/string_run_builder.rs index 2f0401bbef48..dda0f35b801f 100644 --- a/arrow/benches/string_run_builder.rs +++ b/arrow/benches/string_run_builder.rs @@ -17,26 +17,8 @@ use arrow::array::StringRunBuilder; use arrow::datatypes::Int32Type; +use arrow::util::bench_util::create_string_array_for_runs; use criterion::{criterion_group, criterion_main, Criterion}; -use rand::{thread_rng, Rng}; - -fn build_strings( - physical_array_len: usize, - logical_array_len: usize, - string_len: usize, -) -> Vec { - let mut rng = thread_rng(); - let run_len = logical_array_len / physical_array_len; - let mut values: Vec = (0..physical_array_len) - .map(|_| (0..string_len).map(|_| rng.gen::()).collect()) - .flat_map(|s| std::iter::repeat(s).take(run_len)) - .collect(); - while values.len() < logical_array_len { - let last_val = values[values.len() - 1].clone(); - values.push(last_val); - } - values -} fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("string_run_builder"); @@ -50,7 +32,7 @@ fn criterion_benchmark(c: &mut Criterion) { ), |b| { let strings = - build_strings(physical_array_len, logical_array_len, string_len); + create_string_array_for_runs(physical_array_len, logical_array_len, string_len); b.iter(|| { let mut builder = StringRunBuilder::::with_capacity( physical_array_len, diff --git a/arrow/benches/string_run_iterator.rs b/arrow/benches/string_run_iterator.rs new file mode 100644 index 000000000000..cfa44e66e30a --- /dev/null +++ b/arrow/benches/string_run_iterator.rs @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{Int32RunArray, StringArray, StringRunBuilder}; +use arrow::datatypes::Int32Type; +use criterion::{criterion_group, criterion_main, Criterion}; +use rand::{thread_rng, Rng}; + +fn build_strings_runs( + physical_array_len: usize, + logical_array_len: usize, + string_len: usize, +) -> Int32RunArray { + let mut rng = thread_rng(); + let run_len = logical_array_len / physical_array_len; + let mut values: Vec = (0..physical_array_len) + .map(|_| (0..string_len).map(|_| rng.gen::()).collect()) + .flat_map(|s| std::iter::repeat(s).take(run_len)) + .collect(); + while values.len() < logical_array_len { + let last_val = values[values.len() - 1].clone(); + values.push(last_val); + } + let mut builder = StringRunBuilder::::with_capacity( + physical_array_len, + (string_len + 1) * physical_array_len, + ); + builder.extend(values.into_iter().map(Some)); + + builder.finish() +} + +fn criterion_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("string_run_iterator"); + + let mut do_bench = |physical_array_len: usize, + logical_array_len: usize, + string_len: usize| { + group.bench_function( + format!( + "(run_array_len:{logical_array_len}, physical_array_len:{physical_array_len}, string_len: {string_len})"), + |b| { + let run_array = + build_strings_runs(physical_array_len, logical_array_len, string_len); + let typed = run_array.downcast::().unwrap(); + b.iter(|| { + let iter = typed.into_iter(); + for _ in iter {} + }) + }, + ); + }; + + do_bench(256, 1024, 5); + do_bench(256, 1024, 25); + do_bench(256, 1024, 100); + + do_bench(512, 2048, 5); + do_bench(512, 2048, 25); + do_bench(512, 2048, 100); + + do_bench(1024, 4096, 5); + do_bench(1024, 4096, 25); + do_bench(1024, 4096, 100); + + group.finish(); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow/benches/take_kernels.rs b/arrow/benches/take_kernels.rs index c4677cc72616..731426031193 100644 --- a/arrow/benches/take_kernels.rs +++ b/arrow/benches/take_kernels.rs @@ -139,6 +139,13 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function("take str null values null indices 1024", |b| { b.iter(|| bench_take(&values, &indices)) }); + + let values = create_primitive_run_array::(1024, 512); + let indices = create_random_index(1024, 0.0); + c.bench_function( + "take primitive run logical len: 1024, physical len: 512, indices: 1024", + |b| b.iter(|| bench_take(&values, &indices)), + ); } criterion_group!(benches, add_benchmark); diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 6420b6346feb..33552dbe3b1b 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -22,6 +22,7 @@ use crate::datatypes::*; use crate::util::test_util::seedable_rng; use arrow_buffer::Buffer; use rand::distributions::uniform::SampleUniform; +use rand::thread_rng; use rand::Rng; use rand::SeedableRng; use rand::{ @@ -145,6 +146,73 @@ pub fn create_string_dict_array( 
data.iter().map(|x| x.as_deref()).collect() } +/// Create primitive run array for given logical and physical array lengths +pub fn create_primitive_run_array( + logical_array_len: usize, + physical_array_len: usize, +) -> RunArray { + assert!(logical_array_len >= physical_array_len); + // typical length of each run + let run_len = logical_array_len / physical_array_len; + + // Some runs should have extra length + let mut run_len_extra = logical_array_len % physical_array_len; + + let mut values: Vec = (0..physical_array_len) + .flat_map(|s| { + let mut take_len = run_len; + if run_len_extra > 0 { + take_len += 1; + run_len_extra -= 1; + } + std::iter::repeat(V::Native::from_usize(s).unwrap()).take(take_len) + }) + .collect(); + while values.len() < logical_array_len { + let last_val = values[values.len() - 1]; + values.push(last_val); + } + let mut builder = PrimitiveRunBuilder::::with_capacity(physical_array_len); + builder.extend(values.into_iter().map(Some)); + + builder.finish() +} + +/// Create string array to be used by run array builder. The string array +/// will result in run array with physial length of `physical_array_len` +/// and logical length of `logical_array_len` +pub fn create_string_array_for_runs( + physical_array_len: usize, + logical_array_len: usize, + string_len: usize, +) -> Vec { + assert!(logical_array_len >= physical_array_len); + let mut rng = thread_rng(); + + // typical length of each run + let run_len = logical_array_len / physical_array_len; + + // Some runs should have extra length + let mut run_len_extra = logical_array_len % physical_array_len; + + let mut values: Vec = (0..physical_array_len) + .map(|_| (0..string_len).map(|_| rng.gen::()).collect()) + .flat_map(|s| { + let mut take_len = run_len; + if run_len_extra > 0 { + take_len += 1; + run_len_extra -= 1; + } + std::iter::repeat(s).take(take_len) + }) + .collect(); + while values.len() < logical_array_len { + let last_val = values[values.len() - 1].clone(); + values.push(last_val); + } + values +} + /// Creates an random (but fixed-seeded) binary array of a given size and null density pub fn create_binary_array( size: usize, From fb1179201293cccb57263e1175e58f54133116ac Mon Sep 17 00:00:00 2001 From: Rich Date: Mon, 6 Feb 2023 06:14:27 -0500 Subject: [PATCH 0578/1411] object_store: add Path::from_url_path (#3663) * object_store: add Path::from_url_path * reuse existing implementation * Final tweaks * Fix wasm32 build --------- Co-authored-by: Raphael Taylor-Davies --- object_store/src/path/mod.rs | 39 +++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index 4b0862e44b73..a15f7ca0f0ab 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -18,7 +18,6 @@ //! 
Path abstraction for Object Storage use itertools::Itertools; -#[cfg(not(target_arch = "wasm32"))] use percent_encoding::percent_decode; use snafu::{ensure, ResultExt, Snafu}; use std::fmt::Formatter; @@ -166,7 +165,7 @@ impl Path { /// Convert a filesystem path to a [`Path`] relative to the filesystem root /// /// This will return an error if the path contains illegal character sequences - /// as defined by [`Path::parse`] or does not exist + /// as defined on the docstring for [`Path`] or does not exist /// /// Note: this will canonicalize the provided path, resolving any symlinks pub fn from_filesystem_path( @@ -182,8 +181,8 @@ impl Path { #[cfg(not(target_arch = "wasm32"))] /// Convert an absolute filesystem path to a [`Path`] relative to the filesystem root /// - /// This will return an error if the path contains illegal character sequences - /// as defined by [`Path::parse`], or `base` is not an absolute path + /// This will return an error if the path contains illegal character sequences, + /// as defined on the docstring for [`Path`], or `base` is not an absolute path pub fn from_absolute_path(path: impl AsRef) -> Result { Self::from_absolute_path_with_base(path, None) } @@ -191,9 +190,9 @@ impl Path { #[cfg(not(target_arch = "wasm32"))] /// Convert a filesystem path to a [`Path`] relative to the provided base /// - /// This will return an error if the path contains illegal character sequences - /// as defined by [`Path::parse`], or `base` does not refer to a parent path of `path`, - /// or `base` is not an absolute path + /// This will return an error if the path contains illegal character sequences, + /// as defined on the docstring for [`Path`], or `base` does not refer to a parent + /// path of `path`, or `base` is not an absolute path pub(crate) fn from_absolute_path_with_base( path: impl AsRef, base: Option<&Url>, @@ -210,6 +209,15 @@ impl Path { }; // Reverse any percent encoding performed by conversion to URL + Self::from_url_path(path) + } + + /// Parse a url encoded string as a [`Path`], returning a [`Error`] if invalid + /// + /// This will return an error if the path contains illegal character sequences + /// as defined on the docstring for [`Path`] + pub fn from_url_path(path: impl AsRef) -> Result { + let path = path.as_ref(); let decoded = percent_decode(path.as_bytes()) .decode_utf8() .context(NonUnicodeSnafu { path })?; @@ -551,6 +559,23 @@ mod tests { assert_eq!(b.raw, c.raw); } + #[test] + fn from_url_path() { + let a = Path::from_url_path("foo%20bar").unwrap(); + let b = Path::from_url_path("foo/%2E%2E/bar").unwrap_err(); + let c = Path::from_url_path("foo%2F%252E%252E%2Fbar").unwrap(); + let d = Path::from_url_path("foo/%252E%252E/bar").unwrap(); + let e = Path::from_url_path("%48%45%4C%4C%4F").unwrap(); + let f = Path::from_url_path("foo/%FF/as").unwrap_err(); + + assert_eq!(a.raw, "foo bar"); + assert!(matches!(b, Error::BadSegment { .. })); + assert_eq!(c.raw, "foo/%2E%2E/bar"); + assert_eq!(d.raw, "foo/%2E%2E/bar"); + assert_eq!(e.raw, "HELLO"); + assert!(matches!(f, Error::NonUnicode { .. 
})); + } + #[test] fn filename_from_path() { let a = Path::from("foo/bar"); From 04500e7dd77a49d925657732e42924e46a82489f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 6 Feb 2023 11:54:11 +0000 Subject: [PATCH 0579/1411] Implement std::fmt::Write for StringBuilder (#3638) (#3659) * Implement std::fmt::Write for StringBuilder (#3638) * Add docs --- .../src/builder/generic_bytes_builder.rs | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index 73600d9e0a38..406e79c3169c 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -22,6 +22,7 @@ use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait}; use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::ArrayDataBuilder; use std::any::Any; +use std::fmt::Write; use std::sync::Arc; /// Array builder for [`GenericByteArray`] @@ -232,9 +233,43 @@ impl> Extend> for GenericByteBui } } -/// Array builder for [`GenericStringArray`][crate::GenericStringArray] +/// Array builder for [`GenericStringArray`][crate::GenericStringArray] +/// +/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with +/// [`GenericByteBuilder::append_null`] as normal. +/// +/// Additionally implements [`std::fmt::Write`] with any written data included in the next +/// appended value. This allows use with [`std::fmt::Display`] without intermediate allocations +/// +/// ``` +/// # use std::fmt::Write; +/// # use arrow_array::builder::GenericStringBuilder; +/// let mut builder = GenericStringBuilder::::new(); +/// +/// // Write data +/// write!(builder, "foo").unwrap(); +/// write!(builder, "bar").unwrap(); +/// +/// // Finish value +/// builder.append_value("baz"); +/// +/// // Write second value +/// write!(builder, "v2").unwrap(); +/// builder.append_value(""); +/// +/// let array = builder.finish(); +/// assert_eq!(array.value(0), "foobarbaz"); +/// assert_eq!(array.value(1), "v2"); +/// ``` pub type GenericStringBuilder = GenericByteBuilder>; +impl Write for GenericStringBuilder { + fn write_str(&mut self, s: &str) -> std::fmt::Result { + self.value_builder.append_slice(s.as_bytes()); + Ok(()) + } +} + /// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray] pub type GenericBinaryBuilder = GenericByteBuilder>; @@ -443,4 +478,19 @@ mod tests { assert_eq!(array.value_offsets(), &[0, 1, 2, 3, 3, 4, 5, 6, 7, 15, 20]); assert_eq!(array.value_data(), b"abcabcdcupcakeshello"); } + + #[test] + fn test_write() { + let mut builder = GenericStringBuilder::::new(); + write!(builder, "foo").unwrap(); + builder.append_value(""); + writeln!(builder, "bar").unwrap(); + builder.append_value(""); + write!(builder, "fiz").unwrap(); + write!(builder, "buz").unwrap(); + builder.append_value(""); + let a = builder.finish(); + let r: Vec<_> = a.iter().map(|x| x.unwrap()).collect(); + assert_eq!(r, &["foo", "bar\n", "fizbuz"]) + } } From 26ff44b94102f256deb6cd0254d32461221c85de Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 Feb 2023 14:07:27 +0000 Subject: [PATCH 0580/1411] Update proc-macro2 requirement from =1.0.50 to =1.0.51 (#3669) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. 
- [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.50...1.0.51) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index e5a900a0dd25..1fe382935a1d 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -58,7 +58,7 @@ tower = "0.4.13" [build-dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.50", default-features = false } +proc-macro2 = { version = "=1.0.51", default-features = false } prost-build = { version = "=0.11.6", default-features = false } tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } From c7cf7927621a54ed56a4006f4606e4313e0923f0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 6 Feb 2023 18:46:07 +0000 Subject: [PATCH 0581/1411] Add timezone accessor for Timestamp*Array (#3666) --- arrow-array/src/array/primitive_array.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index dfe076306178..6902f13646a2 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1019,6 +1019,14 @@ impl PrimitiveArray { Self::from(data).with_timezone_opt(timezone) } + /// Returns the timezone of this array if any + pub fn timezone(&self) -> Option<&str> { + match self.data_type() { + DataType::Timestamp(_, tz) => tz.as_deref(), + _ => unreachable!(), + } + } + /// Construct a timestamp array with new timezone pub fn with_timezone(&self, timezone: impl Into) -> Self { self.with_timezone_opt(Some(timezone.into())) @@ -2214,4 +2222,13 @@ mod tests { let array = IntervalDayTimeArray::from(vec![1, 2, 3]); let _ = IntervalMonthDayNanoArray::from(array.into_data()); } + + #[test] + fn test_timezone() { + let array = TimestampNanosecondArray::from_iter_values([1, 2]); + assert_eq!(array.timezone(), None); + + let array = array.with_timezone("+02:00"); + assert_eq!(array.timezone(), Some("+02:00")); + } } From b41a8d2d1360ab725d934672511c85000d7fa156 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 6 Feb 2023 18:46:21 +0000 Subject: [PATCH 0582/1411] Faster timezone cast (#3665) --- arrow-cast/src/cast.rs | 51 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index c0082b347da7..16a39d773ca0 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -36,6 +36,7 @@ //! 
``` use chrono::{DateTime, NaiveDateTime, NaiveTime, Timelike}; +use std::cmp::Ordering; use std::sync::Arc; use crate::display::{array_value_to_string, lexical_to_string}; @@ -1687,12 +1688,16 @@ pub fn cast_with_options( let to_size = time_unit_multiple(to_unit); // we either divide or multiply, depending on size of each unit // units are never the same when the types are the same - let converted = if from_size >= to_size { - let divisor = from_size / to_size; - time_array.unary::<_, Int64Type>(|o| o / divisor) - } else { - let mul = to_size / from_size; - time_array.unary::<_, Int64Type>(|o| o * mul) + let converted = match from_size.cmp(&to_size) { + Ordering::Greater => { + let divisor = from_size / to_size; + time_array.unary::<_, Int64Type>(|o| o / divisor) + } + Ordering::Equal => time_array.clone(), + Ordering::Less => { + let mul = to_size / from_size; + time_array.unary::<_, Int64Type>(|o| o * mul) + } }; Ok(make_timestamp_array( &converted, @@ -7844,4 +7849,38 @@ mod tests { assert_eq!(1640995200000000000, c.value(1)); assert!(c.is_null(2)); } + + #[test] + fn test_timezone_cast() { + let a = StringArray::from(vec![ + "2000-01-01T12:00:00", // date + time valid + "2020-12-15T12:34:56", // date + time valid + ]); + let array = Arc::new(a) as ArrayRef; + let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap(); + let v = as_primitive_array::(b.as_ref()); + + assert_eq!(v.value(0), 946728000000000000); + assert_eq!(v.value(1), 1608035696000000000); + + let b = cast( + &b, + &DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".to_string())), + ) + .unwrap(); + let v = as_primitive_array::(b.as_ref()); + + assert_eq!(v.value(0), 946728000000000000); + assert_eq!(v.value(1), 1608035696000000000); + + let b = cast( + &b, + &DataType::Timestamp(TimeUnit::Millisecond, Some("+02:00".to_string())), + ) + .unwrap(); + let v = as_primitive_array::(b.as_ref()); + + assert_eq!(v.value(0), 946728000000); + assert_eq!(v.value(1), 1608035696000); + } } From a142e5d6b25a8adf4492a6bee9e01dc0e6b8efa9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 6 Feb 2023 18:46:39 +0000 Subject: [PATCH 0583/1411] Fix Date64Array docs (#3670) --- arrow-array/src/types.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 641d4c2fc157..f9ca050dc0e7 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -151,7 +151,7 @@ make_type!( Date64Type, i64, DataType::Date64, - "A 64-bit date type representing the elapsed time since UNIX epoch in days(32 bits)." + "A 64-bit date type representing the elapsed time since UNIX epoch in milliseconds(64 bits)." 
); make_type!( Time32SecondType, From b79f27b512d46715e9881e34fe4bb525b88fef9d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 6 Feb 2023 22:46:23 +0000 Subject: [PATCH 0584/1411] Use dyn Array in cast kernels (#3667) * Use dyn Array in cast kernel * Fix test --- arrow-cast/src/cast.rs | 254 +++++++++++++++----------------------- arrow-cast/src/display.rs | 32 ++--- arrow/src/lib.rs | 2 +- 3 files changed, 116 insertions(+), 172 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 16a39d773ca0..69e42a5485e6 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -320,7 +320,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { /// * To or from `StructArray` /// * List to primitive /// * Interval and duration -pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { +pub fn cast(array: &dyn Array, to_type: &DataType) -> Result { cast_with_options(array, to_type, &DEFAULT_CAST_OPTIONS) } @@ -444,7 +444,7 @@ fn cast_reinterpret_arrays< } fn cast_decimal_to_integer( - array: &ArrayRef, + array: &dyn Array, base: D::Native, scale: i8, cast_options: &CastOptions, @@ -506,7 +506,7 @@ where // cast the decimal array to floating-point array fn cast_decimal_to_float( - array: &ArrayRef, + array: &dyn Array, op: F, ) -> Result where @@ -601,7 +601,7 @@ fn as_time_res_with_timezone( /// * To or from `StructArray` /// * List to primitive pub fn cast_with_options( - array: &ArrayRef, + array: &dyn Array, to_type: &DataType, cast_options: &CastOptions, ) -> Result { @@ -610,7 +610,7 @@ pub fn cast_with_options( // clone array if types are the same if from_type == to_type { - return Ok(array.clone()); + return Ok(make_array(array.data().clone())); } match (from_type, to_type) { ( @@ -683,7 +683,7 @@ pub fn cast_with_options( "cannot cast list to large-list with different child data".into(), )) } else { - cast_list_container::(&**array, cast_options) + cast_list_container::(array, cast_options) } } (LargeList(list_from), List(list_to)) => { @@ -692,7 +692,7 @@ pub fn cast_with_options( "cannot cast large-list to list with different child data".into(), )) } else { - cast_list_container::(&**array, cast_options) + cast_list_container::(array, cast_options) } } (List(_) | LargeList(_), _) => match to_type { @@ -1115,7 +1115,7 @@ pub fn cast_with_options( ))), }, (Utf8, _) => match to_type { - LargeUtf8 => cast_byte_container::(&**array), + LargeUtf8 => cast_byte_container::(array), UInt8 => cast_string_to_numeric::(array, cast_options), UInt16 => cast_string_to_numeric::(array, cast_options), UInt32 => cast_string_to_numeric::(array, cast_options), @@ -1126,34 +1126,34 @@ pub fn cast_with_options( Int64 => cast_string_to_numeric::(array, cast_options), Float32 => cast_string_to_numeric::(array, cast_options), Float64 => cast_string_to_numeric::(array, cast_options), - Date32 => cast_string_to_date32::(&**array, cast_options), - Date64 => cast_string_to_date64::(&**array, cast_options), + Date32 => cast_string_to_date32::(array, cast_options), + Date64 => cast_string_to_date64::(array, cast_options), Binary => Ok(Arc::new(BinaryArray::from(as_string_array(array).clone()))), LargeBinary => { let binary = BinaryArray::from(as_string_array(array).clone()); cast_byte_container::(&binary) } Time32(TimeUnit::Second) => { - cast_string_to_time32second::(&**array, cast_options) + cast_string_to_time32second::(array, cast_options) } Time32(TimeUnit::Millisecond) => { - 
cast_string_to_time32millisecond::(&**array, cast_options) + cast_string_to_time32millisecond::(array, cast_options) } Time64(TimeUnit::Microsecond) => { - cast_string_to_time64microsecond::(&**array, cast_options) + cast_string_to_time64microsecond::(array, cast_options) } Time64(TimeUnit::Nanosecond) => { - cast_string_to_time64nanosecond::(&**array, cast_options) + cast_string_to_time64nanosecond::(array, cast_options) } Timestamp(TimeUnit::Nanosecond, None) => { - cast_string_to_timestamp_ns::(&**array, cast_options) + cast_string_to_timestamp_ns::(array, cast_options) } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, (_, Utf8) => match from_type { - LargeUtf8 => cast_byte_container::(&**array), + LargeUtf8 => cast_byte_container::(array), UInt8 => cast_numeric_to_string::(array), UInt16 => cast_numeric_to_string::(array), UInt32 => cast_numeric_to_string::(array), @@ -1232,8 +1232,8 @@ pub fn cast_with_options( Int64 => cast_string_to_numeric::(array, cast_options), Float32 => cast_string_to_numeric::(array, cast_options), Float64 => cast_string_to_numeric::(array, cast_options), - Date32 => cast_string_to_date32::(&**array, cast_options), - Date64 => cast_string_to_date64::(&**array, cast_options), + Date32 => cast_string_to_date32::(array, cast_options), + Date64 => cast_string_to_date64::(array, cast_options), Binary => { let large_binary = LargeBinaryArray::from(as_largestring_array(array).clone()); @@ -1243,19 +1243,19 @@ pub fn cast_with_options( as_largestring_array(array).clone(), ))), Time32(TimeUnit::Second) => { - cast_string_to_time32second::(&**array, cast_options) + cast_string_to_time32second::(array, cast_options) } Time32(TimeUnit::Millisecond) => { - cast_string_to_time32millisecond::(&**array, cast_options) + cast_string_to_time32millisecond::(array, cast_options) } Time64(TimeUnit::Microsecond) => { - cast_string_to_time64microsecond::(&**array, cast_options) + cast_string_to_time64microsecond::(array, cast_options) } Time64(TimeUnit::Nanosecond) => { - cast_string_to_time64nanosecond::(&**array, cast_options) + cast_string_to_time64nanosecond::(array, cast_options) } Timestamp(TimeUnit::Nanosecond, None) => { - cast_string_to_timestamp_ns::(&**array, cast_options) + cast_string_to_timestamp_ns::(array, cast_options) } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", @@ -1263,14 +1263,14 @@ pub fn cast_with_options( }, (Binary, _) => match to_type { LargeBinary => { - cast_byte_container::(&**array) + cast_byte_container::(array) } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, (LargeBinary, _) => match to_type { - Binary => cast_byte_container::(&**array), + Binary => cast_byte_container::(array), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -2111,7 +2111,7 @@ where /// Convert Array into a PrimitiveArray of type, and apply numeric cast fn cast_numeric_arrays( - from: &ArrayRef, + from: &dyn Array, cast_options: &CastOptions, ) -> Result where @@ -2235,7 +2235,7 @@ where /// Cast timestamp types to Utf8/LargeUtf8 fn cast_timestamp_to_string( - array: &ArrayRef, + array: &dyn Array, tz: Option<&String>, ) -> Result where @@ -2271,7 +2271,7 @@ where /// Cast date32 types to Utf8/LargeUtf8 fn cast_date32_to_string( - array: &ArrayRef, + array: &dyn Array, ) -> Result { let array = array.as_any().downcast_ref::().unwrap(); @@ -2290,7 +2290,7 
@@ fn cast_date32_to_string( /// Cast date64 types to Utf8/LargeUtf8 fn cast_date64_to_string( - array: &ArrayRef, + array: &dyn Array, ) -> Result { let array = array.as_any().downcast_ref::().unwrap(); @@ -2309,7 +2309,7 @@ fn cast_date64_to_string( /// Cast numeric types to Utf8 fn cast_numeric_to_string( - array: &ArrayRef, + array: &dyn Array, ) -> Result where FROM: ArrowPrimitiveType, @@ -2339,7 +2339,7 @@ where /// Cast numeric types to Utf8 fn cast_string_to_numeric( - from: &ArrayRef, + from: &dyn Array, cast_options: &CastOptions, ) -> Result where @@ -2795,7 +2795,7 @@ fn cast_string_to_timestamp_ns( /// Casts Utf8 to Boolean fn cast_utf8_to_boolean( - from: &ArrayRef, + from: &dyn Array, cast_options: &CastOptions, ) -> Result where @@ -2963,7 +2963,7 @@ where /// Cast Utf8 to decimal fn cast_string_to_decimal( - from: &ArrayRef, + from: &dyn Array, precision: u8, scale: i8, cast_options: &CastOptions, @@ -2998,7 +2998,7 @@ where /// Cast numeric types to Boolean /// /// Any zero value returns `false` while non-zero returns `true` -fn cast_numeric_to_bool(from: &ArrayRef) -> Result +fn cast_numeric_to_bool(from: &dyn Array) -> Result where FROM: ArrowPrimitiveType, { @@ -3033,7 +3033,7 @@ where /// /// `false` returns 0 while `true` returns 1 fn cast_bool_to_numeric( - from: &ArrayRef, + from: &dyn Array, cast_options: &CastOptions, ) -> Result where @@ -3076,7 +3076,7 @@ where /// /// K is the key type fn dictionary_cast( - array: &ArrayRef, + array: &dyn Array, to_type: &DataType, cast_options: &CastOptions, ) -> Result { @@ -3153,7 +3153,7 @@ fn dictionary_cast( // Unpack a dictionary where the keys are of type into a flattened array of type to_type fn unpack_dictionary( - array: &ArrayRef, + array: &dyn Array, to_type: &DataType, cast_options: &CastOptions, ) -> Result @@ -3195,7 +3195,7 @@ where /// /// K is the key type fn cast_to_dictionary( - array: &ArrayRef, + array: &dyn Array, dict_value_type: &DataType, cast_options: &CastOptions, ) -> Result { @@ -3271,7 +3271,7 @@ fn cast_to_dictionary( // Packs the data from the primitive array of type to a // DictionaryArray with keys of type K and values of value_type V fn pack_numeric_to_dictionary( - array: &ArrayRef, + array: &dyn Array, dict_value_type: &DataType, cast_options: &CastOptions, ) -> Result @@ -3303,7 +3303,7 @@ where // Packs the data as a GenericByteDictionaryBuilder, if possible, with the // key types of K fn pack_byte_to_dictionary( - array: &ArrayRef, + array: &dyn Array, cast_options: &CastOptions, ) -> Result where @@ -3331,7 +3331,7 @@ where /// Helper function that takes a primitive array and casts to a (generic) list array. fn cast_primitive_to_list( - array: &ArrayRef, + array: &dyn Array, to: &Field, to_type: &DataType, cast_options: &CastOptions, @@ -3371,7 +3371,7 @@ fn cast_primitive_to_list( /// Helper function that takes an Generic list container and casts the inner datatype. 
fn cast_list_inner( - array: &Arc, + array: &dyn Array, to: &Field, to_type: &DataType, cast_options: &CastOptions, @@ -3772,8 +3772,7 @@ mod tests { Some(-3123456), None, ]; - let input_decimal_array = create_decimal_array(array, 20, 4).unwrap(); - let array = Arc::new(input_decimal_array) as ArrayRef; + let array = create_decimal_array(array, 20, 4).unwrap(); // decimal128 to decimal128 let input_type = DataType::Decimal128(20, 4); let output_type = DataType::Decimal128(20, 3); @@ -3816,8 +3815,7 @@ mod tests { Some(i256::from_i128(-3123456)), None, ]; - let input_decimal_array = create_decimal256_array(array, 20, 4).unwrap(); - let array = Arc::new(input_decimal_array) as ArrayRef; + let array = create_decimal256_array(array, 20, 4).unwrap(); // decimal256 to decimal256 let input_type = DataType::Decimal256(20, 4); @@ -3859,8 +3857,7 @@ mod tests { let output_type = DataType::Decimal128(20, 4); assert!(can_cast_types(&input_type, &output_type)); let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; - let input_decimal_array = create_decimal_array(array, 20, 3).unwrap(); - let array = Arc::new(input_decimal_array) as ArrayRef; + let array = create_decimal_array(array, 20, 3).unwrap(); generate_cast_test_case!( &array, Decimal128Array, @@ -3874,8 +3871,7 @@ mod tests { ); // negative test let array = vec![Some(123456), None]; - let input_decimal_array = create_decimal_array(array, 10, 0).unwrap(); - let array = Arc::new(input_decimal_array) as ArrayRef; + let array = create_decimal_array(array, 10, 0).unwrap(); let result = cast(&array, &DataType::Decimal128(2, 2)); assert!(result.is_ok()); let array = result.unwrap(); @@ -3892,8 +3888,7 @@ mod tests { assert!(can_cast_types(&input_type, &output_type)); let array = vec![Some(i128::MAX)]; - let input_decimal_array = create_decimal_array(array, 38, 3).unwrap(); - let array = Arc::new(input_decimal_array) as ArrayRef; + let array = create_decimal_array(array, 38, 3).unwrap(); let result = cast_with_options(&array, &output_type, &CastOptions { safe: false }); assert_eq!("Cast error: Cannot cast to Decimal128(38, 38). Overflowing on 170141183460469231731687303715884105727", @@ -3907,8 +3902,7 @@ mod tests { assert!(can_cast_types(&input_type, &output_type)); let array = vec![Some(i128::MAX)]; - let input_decimal_array = create_decimal_array(array, 38, 3).unwrap(); - let array = Arc::new(input_decimal_array) as ArrayRef; + let array = create_decimal_array(array, 38, 3).unwrap(); let result = cast_with_options(&array, &output_type, &CastOptions { safe: false }); assert_eq!("Cast error: Cannot cast to Decimal256(76, 76). 
Overflowing on 170141183460469231731687303715884105727", @@ -3921,8 +3915,7 @@ mod tests { let output_type = DataType::Decimal256(20, 4); assert!(can_cast_types(&input_type, &output_type)); let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; - let input_decimal_array = create_decimal_array(array, 20, 3).unwrap(); - let array = Arc::new(input_decimal_array) as ArrayRef; + let array = create_decimal_array(array, 20, 3).unwrap(); generate_cast_test_case!( &array, Decimal256Array, @@ -3942,8 +3935,7 @@ mod tests { let output_type = DataType::Decimal128(38, 7); assert!(can_cast_types(&input_type, &output_type)); let array = vec![Some(i256::from_i128(i128::MAX))]; - let input_decimal_array = create_decimal256_array(array, 76, 5).unwrap(); - let array = Arc::new(input_decimal_array) as ArrayRef; + let array = create_decimal256_array(array, 76, 5).unwrap(); let result = cast_with_options(&array, &output_type, &CastOptions { safe: false }); assert_eq!("Cast error: Cannot cast to Decimal128(38, 7). Overflowing on 170141183460469231731687303715884105727", @@ -3956,8 +3948,7 @@ mod tests { let output_type = DataType::Decimal256(76, 55); assert!(can_cast_types(&input_type, &output_type)); let array = vec![Some(i256::from_i128(i128::MAX))]; - let input_decimal_array = create_decimal256_array(array, 76, 5).unwrap(); - let array = Arc::new(input_decimal_array) as ArrayRef; + let array = create_decimal256_array(array, 76, 5).unwrap(); let result = cast_with_options(&array, &output_type, &CastOptions { safe: false }); assert_eq!("Cast error: Cannot cast to Decimal256(76, 55). Overflowing on 170141183460469231731687303715884105727", @@ -3975,8 +3966,7 @@ mod tests { Some(i256::from_i128(3123456)), None, ]; - let input_decimal_array = create_decimal256_array(array, 20, 3).unwrap(); - let array = Arc::new(input_decimal_array) as ArrayRef; + let array = create_decimal256_array(array, 20, 3).unwrap(); generate_cast_test_case!( &array, Decimal128Array, @@ -4001,8 +3991,7 @@ mod tests { Some(i256::from_i128(3123456)), None, ]; - let input_decimal_array = create_decimal256_array(array, 20, 3).unwrap(); - let array = Arc::new(input_decimal_array) as ArrayRef; + let array = create_decimal256_array(array, 20, 3).unwrap(); generate_cast_test_case!( &array, Decimal256Array, @@ -4020,8 +4009,7 @@ mod tests { fn test_cast_decimal_to_numeric() { let value_array: Vec> = vec![Some(125), Some(225), Some(325), None, Some(525)]; - let decimal_array = create_decimal_array(value_array, 38, 2).unwrap(); - let array = Arc::new(decimal_array) as ArrayRef; + let array = create_decimal_array(value_array, 38, 2).unwrap(); // u8 generate_cast_test_case!( &array, @@ -4107,8 +4095,7 @@ mod tests { // overflow test: out of range of max u8 let value_array: Vec> = vec![Some(51300)]; - let decimal_array = create_decimal_array(value_array, 38, 2).unwrap(); - let array = Arc::new(decimal_array) as ArrayRef; + let array = create_decimal_array(value_array, 38, 2).unwrap(); let casted_array = cast_with_options(&array, &DataType::UInt8, &CastOptions { safe: false }); assert_eq!( @@ -4123,8 +4110,7 @@ mod tests { // overflow test: out of range of max i8 let value_array: Vec> = vec![Some(24400)]; - let decimal_array = create_decimal_array(value_array, 38, 2).unwrap(); - let array = Arc::new(decimal_array) as ArrayRef; + let array = create_decimal_array(value_array, 38, 2).unwrap(); let casted_array = cast_with_options(&array, &DataType::Int8, &CastOptions { safe: false }); assert_eq!( @@ -4149,8 +4135,7 @@ mod tests { 
Some(112345678), Some(112345679), ]; - let decimal_array = create_decimal_array(value_array, 38, 2).unwrap(); - let array = Arc::new(decimal_array) as ArrayRef; + let array = create_decimal_array(value_array, 38, 2).unwrap(); generate_cast_test_case!( &array, Float32Array, @@ -4177,8 +4162,7 @@ mod tests { Some(112345678901234568), Some(112345678901234560), ]; - let decimal_array = create_decimal_array(value_array, 38, 2).unwrap(); - let array = Arc::new(decimal_array) as ArrayRef; + let array = create_decimal_array(value_array, 38, 2).unwrap(); generate_cast_test_case!( &array, Float64Array, @@ -4204,8 +4188,7 @@ mod tests { None, Some(i256::from_i128(525)), ]; - let decimal_array = create_decimal256_array(value_array, 38, 2).unwrap(); - let array = Arc::new(decimal_array) as ArrayRef; + let array = create_decimal256_array(value_array, 38, 2).unwrap(); // u8 generate_cast_test_case!( &array, @@ -4291,8 +4274,7 @@ mod tests { // overflow test: out of range of max i8 let value_array: Vec> = vec![Some(i256::from_i128(24400))]; - let decimal_array = create_decimal256_array(value_array, 38, 2).unwrap(); - let array = Arc::new(decimal_array) as ArrayRef; + let array = create_decimal256_array(value_array, 38, 2).unwrap(); let casted_array = cast_with_options(&array, &DataType::Int8, &CastOptions { safe: false }); assert_eq!( @@ -4317,8 +4299,7 @@ mod tests { Some(i256::from_i128(112345678)), Some(i256::from_i128(112345679)), ]; - let decimal_array = create_decimal256_array(value_array, 76, 2).unwrap(); - let array = Arc::new(decimal_array) as ArrayRef; + let array = create_decimal256_array(value_array, 76, 2).unwrap(); generate_cast_test_case!( &array, Float32Array, @@ -4345,8 +4326,7 @@ mod tests { Some(i256::from_i128(112345678901234568)), Some(i256::from_i128(112345678901234560)), ]; - let decimal_array = create_decimal256_array(value_array, 76, 2).unwrap(); - let array = Arc::new(decimal_array) as ArrayRef; + let array = create_decimal256_array(value_array, 76, 2).unwrap(); generate_cast_test_case!( &array, Float64Array, @@ -4462,7 +4442,6 @@ mod tests { // test u8 to decimal type with overflow the result type // the 100 will be converted to 1000_i128, but it is out of range for max value in the precision 3. let array = UInt8Array::from(vec![1, 2, 3, 4, 100]); - let array = Arc::new(array) as ArrayRef; let casted_array = cast(&array, &DataType::Decimal128(3, 1)); assert!(casted_array.is_ok()); let array = casted_array.unwrap(); @@ -4473,7 +4452,6 @@ mod tests { // test i8 to decimal type with overflow the result type // the 100 will be converted to 1000_i128, but it is out of range for max value in the precision 3. 
let array = Int8Array::from(vec![1, 2, 3, 4, 100]); - let array = Arc::new(array) as ArrayRef; let casted_array = cast(&array, &DataType::Decimal128(3, 1)); assert!(casted_array.is_ok()); let array = casted_array.unwrap(); @@ -4516,7 +4494,6 @@ mod tests { Some(1.123_456_489_012_345_6), // round down Some(1.123_456_789_012_345_6), // round up ]); - let array = Arc::new(array) as ArrayRef; generate_cast_test_case!( &array, Decimal128Array, @@ -4650,7 +4627,6 @@ mod tests { Some(1.123_456_4), // round down Some(1.123_456_7), // round up ]); - let array = Arc::new(array) as ArrayRef; generate_cast_test_case!( &array, Decimal256Array, @@ -4676,7 +4652,6 @@ mod tests { Some(1.123_456_489_012_345_6), // round down Some(1.123_456_789_012_345_6), // round up ]); - let array = Arc::new(array) as ArrayRef; generate_cast_test_case!( &array, Decimal256Array, @@ -4696,8 +4671,7 @@ mod tests { #[test] fn test_cast_i32_to_f64() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let array = Arc::new(a) as ArrayRef; + let array = Int32Array::from(vec![5, 6, 7, 8, 9]); let b = cast(&array, &DataType::Float64).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(5.0, c.value(0)); @@ -4709,8 +4683,7 @@ mod tests { #[test] fn test_cast_i32_to_u8() { - let a = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); - let array = Arc::new(a) as ArrayRef; + let array = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); let b = cast(&array, &DataType::UInt8).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert!(!c.is_valid(0)); @@ -4724,8 +4697,7 @@ mod tests { #[test] #[should_panic(expected = "Can't cast value -5 to type UInt8")] fn test_cast_int32_to_u8_with_error() { - let a = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); - let array = Arc::new(a) as ArrayRef; + let array = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); // overflow with the error let cast_option = CastOptions { safe: false }; let result = cast_with_options(&array, &DataType::UInt8, &cast_option); @@ -4735,8 +4707,7 @@ mod tests { #[test] fn test_cast_i32_to_u8_sliced() { - let a = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); - let array = Arc::new(a) as ArrayRef; + let array = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); assert_eq!(0, array.offset()); let array = array.slice(2, 3); assert_eq!(2, array.offset()); @@ -4752,8 +4723,7 @@ mod tests { #[test] fn test_cast_i32_to_i32() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let array = Arc::new(a) as ArrayRef; + let array = Int32Array::from(vec![5, 6, 7, 8, 9]); let b = cast(&array, &DataType::Int32).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(5, c.value(0)); @@ -4765,8 +4735,7 @@ mod tests { #[test] fn test_cast_i32_to_list_i32() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let array = Arc::new(a) as ArrayRef; + let array = Int32Array::from(vec![5, 6, 7, 8, 9]); let b = cast( &array, &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), @@ -4790,8 +4759,7 @@ mod tests { #[test] fn test_cast_i32_to_list_i32_nullable() { - let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8), Some(9)]); - let array = Arc::new(a) as ArrayRef; + let array = Int32Array::from(vec![Some(5), None, Some(7), Some(8), Some(9)]); let b = cast( &array, &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), @@ -4818,8 +4786,8 @@ mod tests { #[test] fn test_cast_i32_to_list_f64_nullable_sliced() { - let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8), None, Some(10)]); - let array = Arc::new(a) as ArrayRef; 
+ let array = + Int32Array::from(vec![Some(5), None, Some(7), Some(8), None, Some(10)]); let array = array.slice(2, 4); let b = cast( &array, @@ -4844,8 +4812,7 @@ mod tests { #[test] fn test_cast_utf8_to_i32() { - let a = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]); - let array = Arc::new(a) as ArrayRef; + let array = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]); let b = cast(&array, &DataType::Int32).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(5, c.value(0)); @@ -4857,8 +4824,7 @@ mod tests { #[test] fn test_cast_with_options_utf8_to_i32() { - let a = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]); - let array = Arc::new(a) as ArrayRef; + let array = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]); let result = cast_with_options(&array, &DataType::Int32, &CastOptions { safe: false }); match result { @@ -4876,9 +4842,7 @@ mod tests { #[test] fn test_cast_utf8_to_bool() { - let strings = Arc::new(StringArray::from(vec![ - "true", "false", "invalid", " Y ", "", - ])) as ArrayRef; + let strings = StringArray::from(vec!["true", "false", "invalid", " Y ", ""]); let casted = cast(&strings, &DataType::Boolean).unwrap(); let expected = BooleanArray::from(vec![Some(true), Some(false), None, Some(true), None]); @@ -4887,9 +4851,7 @@ mod tests { #[test] fn test_cast_with_options_utf8_to_bool() { - let strings = Arc::new(StringArray::from(vec![ - "true", "false", "invalid", " Y ", "", - ])) as ArrayRef; + let strings = StringArray::from(vec!["true", "false", "invalid", " Y ", ""]); let casted = cast_with_options(&strings, &DataType::Boolean, &CastOptions { safe: false }); match casted { @@ -4904,8 +4866,7 @@ mod tests { #[test] fn test_cast_bool_to_i32() { - let a = BooleanArray::from(vec![Some(true), Some(false), None]); - let array = Arc::new(a) as ArrayRef; + let array = BooleanArray::from(vec![Some(true), Some(false), None]); let b = cast(&array, &DataType::Int32).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(1, c.value(0)); @@ -4915,8 +4876,7 @@ mod tests { #[test] fn test_cast_bool_to_f64() { - let a = BooleanArray::from(vec![Some(true), Some(false), None]); - let array = Arc::new(a) as ArrayRef; + let array = BooleanArray::from(vec![Some(true), Some(false), None]); let b = cast(&array, &DataType::Float64).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(1.0, c.value(0)); @@ -4929,8 +4889,7 @@ mod tests { expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported" )] fn test_cast_int32_to_timestamp() { - let a = Int32Array::from(vec![Some(2), Some(10), None]); - let array = Arc::new(a) as ArrayRef; + let array = Int32Array::from(vec![Some(2), Some(10), None]); cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); } @@ -4952,7 +4911,7 @@ mod tests { .add_child_data(value_data) .build() .unwrap(); - let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; + let list_array = ListArray::from(list_data); let cast_array = cast( &list_array, @@ -5279,8 +5238,7 @@ mod tests { #[test] fn test_cast_date32_to_int32() { - let a = Date32Array::from(vec![10000, 17890]); - let array = Arc::new(a) as ArrayRef; + let array = Date32Array::from(vec![10000, 17890]); let b = cast(&array, &DataType::Int32).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(10000, c.value(0)); @@ -5289,8 +5247,7 @@ mod tests { #[test] fn test_cast_int32_to_date32() { - let a = Int32Array::from(vec![10000, 17890]); - let array = Arc::new(a) as ArrayRef; + let array = 
Int32Array::from(vec![10000, 17890]); let b = cast(&array, &DataType::Date32).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(10000, c.value(0)); @@ -5299,13 +5256,12 @@ mod tests { #[test] fn test_cast_timestamp_to_date32() { - let a = TimestampMillisecondArray::from(vec![ + let array = TimestampMillisecondArray::from(vec![ Some(864000000005), Some(1545696000001), None, ]) .with_timezone("UTC".to_string()); - let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Date32).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(10000, c.value(0)); @@ -5315,12 +5271,11 @@ mod tests { #[test] fn test_cast_timestamp_to_date64() { - let a = TimestampMillisecondArray::from(vec![ + let array = TimestampMillisecondArray::from(vec![ Some(864000000005), Some(1545696000001), None, ]); - let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Date64).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(864000000005, c.value(0)); @@ -5331,9 +5286,8 @@ mod tests { #[test] fn test_cast_timestamp_to_time64() { // test timestamp secs - let a = TimestampSecondArray::from(vec![Some(86405), Some(1), None]) + let array = TimestampSecondArray::from(vec![Some(86405), Some(1), None]) .with_timezone("+01:00".to_string()); - let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(3605000000, c.value(0)); @@ -5484,8 +5438,8 @@ mod tests { #[test] fn test_cast_date64_to_timestamp() { - let a = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); - let array = Arc::new(a) as ArrayRef; + let array = + Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(864000000, c.value(0)); @@ -5495,8 +5449,8 @@ mod tests { #[test] fn test_cast_date64_to_timestamp_ms() { - let a = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); - let array = Arc::new(a) as ArrayRef; + let array = + Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Millisecond, None)).unwrap(); let c = b .as_any() @@ -5509,8 +5463,8 @@ mod tests { #[test] fn test_cast_date64_to_timestamp_us() { - let a = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); - let array = Arc::new(a) as ArrayRef; + let array = + Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); let c = b .as_any() @@ -5523,8 +5477,8 @@ mod tests { #[test] fn test_cast_date64_to_timestamp_ns() { - let a = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); - let array = Arc::new(a) as ArrayRef; + let array = + Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap(); let c = b .as_any() @@ -5537,13 +5491,12 @@ mod tests { #[test] fn test_cast_timestamp_to_i64() { - let a = TimestampMillisecondArray::from(vec![ + let array = TimestampMillisecondArray::from(vec![ Some(864000000005), Some(1545696000001), None, ]) .with_timezone("UTC".to_string()); - let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Int64).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(&DataType::Int64, c.data_type()); @@ 
-5554,8 +5507,7 @@ mod tests { #[test] fn test_cast_date32_to_string() { - let a = Date32Array::from(vec![10000, 17890]); - let array = Arc::new(a) as ArrayRef; + let array = Date32Array::from(vec![10000, 17890]); let b = cast(&array, &DataType::Utf8).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(&DataType::Utf8, c.data_type()); @@ -5565,8 +5517,7 @@ mod tests { #[test] fn test_cast_date64_to_string() { - let a = Date64Array::from(vec![10000 * 86400000, 17890 * 86400000]); - let array = Arc::new(a) as ArrayRef; + let array = Date64Array::from(vec![10000 * 86400000, 17890 * 86400000]); let b = cast(&array, &DataType::Utf8).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(&DataType::Utf8, c.data_type()); @@ -5576,12 +5527,11 @@ mod tests { #[test] fn test_cast_between_timestamps() { - let a = TimestampMillisecondArray::from(vec![ + let array = TimestampMillisecondArray::from(vec![ Some(864000003005), Some(1545696002001), None, ]); - let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(864000003, c.value(0)); @@ -5629,7 +5579,7 @@ mod tests { #[test] fn test_cast_to_strings() { - let a = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef; + let a = Int32Array::from(vec![1, 2, 3]); let out = cast(&a, &DataType::Utf8).unwrap(); let out = out .as_any() @@ -5654,7 +5604,7 @@ mod tests { vec![Some("foo"), Some("bar"), Some("ham")], vec![Some("foo"), None, Some("bar")], ] { - let a = Arc::new(LargeStringArray::from(data.clone())) as ArrayRef; + let a = LargeStringArray::from(data.clone()); let to = cast(&a, &DataType::Utf8).unwrap(); let expect = a .as_any() @@ -5670,7 +5620,7 @@ mod tests { .collect::>(); assert_eq!(expect, out); - let a = Arc::new(StringArray::from(data)) as ArrayRef; + let a = StringArray::from(data); let to = cast(&a, &DataType::LargeUtf8).unwrap(); let expect = a .as_any() @@ -6601,7 +6551,7 @@ mod tests { #[test] fn test_cast_from_int8() { let i8_values: Vec = vec![i8::MIN, 0, i8::MAX]; - let i8_array: ArrayRef = Arc::new(Int8Array::from(i8_values)); + let i8_array = Int8Array::from(i8_values); let f64_expected = vec!["-128.0", "0.0", "127.0"]; assert_eq!( @@ -6665,7 +6615,7 @@ mod tests { } /// Convert `array` into a vector of strings by casting to data type dt - fn get_cast_values(array: &ArrayRef, dt: &DataType) -> Vec + fn get_cast_values(array: &dyn Array, dt: &DataType) -> Vec where T: ArrowPrimitiveType, { @@ -6782,7 +6732,7 @@ mod tests { let val = format!("val{i}"); builder.append(&val).unwrap(); } - let array: ArrayRef = Arc::new(builder.finish()); + let array = builder.finish(); let cast_type = Dictionary(Box::new(Int8), Box::new(Utf8)); let res = cast(&array, &cast_type); diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 7214321127cf..bd482989809e 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -20,7 +20,6 @@ //! record batch pretty printing. use std::fmt::Write; -use std::sync::Arc; use arrow_array::timezone::Tz; use arrow_array::types::*; @@ -387,7 +386,7 @@ macro_rules! 
make_string_from_duration { #[inline(always)] pub fn make_string_from_decimal( - column: &Arc, + column: &dyn Array, row: usize, ) -> Result { let array = column.as_any().downcast_ref::().unwrap(); @@ -398,7 +397,7 @@ pub fn make_string_from_decimal( fn append_struct_field_string( target: &mut String, name: &str, - field_col: &Arc, + field_col: &dyn Array, row: usize, ) -> Result<(), ArrowError> { target.push('"'); @@ -425,7 +424,7 @@ fn append_struct_field_string( fn append_map_field_string( target: &mut String, - field_col: &Arc, + field_col: &dyn Array, row: usize, ) -> Result<(), ArrowError> { if field_col.is_null(row) { @@ -451,7 +450,7 @@ fn append_map_field_string( /// Note this function is quite inefficient and is unlikely to be /// suitable for converting large arrays or record batches. fn array_value_to_string_internal( - column: &ArrayRef, + column: &dyn Array, col_idx: usize, row_idx: usize, format: Option<&str>, @@ -722,7 +721,7 @@ fn array_value_to_string_internal( } pub fn temporal_array_value_to_string( - column: &ArrayRef, + column: &dyn Array, col_idx: usize, row_idx: usize, format: Option<&str>, @@ -731,7 +730,7 @@ pub fn temporal_array_value_to_string( } pub fn array_value_to_string( - column: &ArrayRef, + column: &dyn Array, row_idx: usize, ) -> Result { array_value_to_string_internal(column, 0, row_idx, None) @@ -739,7 +738,7 @@ pub fn array_value_to_string( /// Converts the value of the union array at `row` to a String fn union_to_string( - column: &ArrayRef, + column: &dyn Array, row: usize, fields: &[Field], type_ids: &[i8], @@ -773,7 +772,7 @@ fn union_to_string( } /// Converts the value of the dictionary array at `row` to a String fn dict_array_value_to_string( - colum: &ArrayRef, + colum: &dyn Array, row: usize, ) -> Result { let dict_array = colum.as_any().downcast_ref::>().unwrap(); @@ -824,35 +823,30 @@ mod tests { &entry_offsets, ) .unwrap(); - let param = Arc::new(map_array) as ArrayRef; assert_eq!( "{\"d\": 30, \"e\": 40, \"f\": 50}", - array_value_to_string(¶m, 1).unwrap() + array_value_to_string(&map_array, 1).unwrap() ); } #[test] fn test_array_value_to_string_duration() { - let ns_array = - Arc::new(DurationNanosecondArray::from(vec![Some(1), None])) as ArrayRef; + let ns_array = DurationNanosecondArray::from(vec![Some(1), None]); assert_eq!( array_value_to_string(&ns_array, 0).unwrap(), "PT0.000000001S" ); assert_eq!(array_value_to_string(&ns_array, 1).unwrap(), ""); - let us_array = - Arc::new(DurationMicrosecondArray::from(vec![Some(1), None])) as ArrayRef; + let us_array = DurationMicrosecondArray::from(vec![Some(1), None]); assert_eq!(array_value_to_string(&us_array, 0).unwrap(), "PT0.000001S"); assert_eq!(array_value_to_string(&us_array, 1).unwrap(), ""); - let ms_array = - Arc::new(DurationMillisecondArray::from(vec![Some(1), None])) as ArrayRef; + let ms_array = DurationMillisecondArray::from(vec![Some(1), None]); assert_eq!(array_value_to_string(&ms_array, 0).unwrap(), "PT0.001S"); assert_eq!(array_value_to_string(&ms_array, 1).unwrap(), ""); - let s_array = - Arc::new(DurationSecondArray::from(vec![Some(1), None])) as ArrayRef; + let s_array = DurationSecondArray::from(vec![Some(1), None]); assert_eq!(array_value_to_string(&s_array, 0).unwrap(), "PT1S"); assert_eq!(array_value_to_string(&s_array, 1).unwrap(), ""); } diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 64e5d6a2cd3d..f7ce24a97d2a 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -186,7 +186,7 @@ //! where //! I: IntoIterator, //! { -//! 
let array = Arc::new(StringArray::from_iter(iter.into_iter().map(Some))) as _; +//! let array = StringArray::from_iter(iter.into_iter().map(Some)); //! arrow::compute::cast(&array, to_data_type) //! } //! From 9b48f3478320f7df666c0f075b578a059203ff16 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 8 Feb 2023 06:22:50 -0800 Subject: [PATCH 0585/1411] Fix FFI which fails to account for offsets (#3675) --- arrow/src/ffi.rs | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 9fcca3c5d9ea..dc234c8590ad 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -718,6 +718,10 @@ pub trait ArrowArrayRef { dt => dt, }; + // `ffi::ArrowArray` records array offset, we need to add it back to the + // buffer length to get the actual buffer length. + let length = self.array().length as usize + self.array().offset as usize; + // Inner type is not important for buffer length. Ok(match (&data_type, i) { (DataType::Utf8, 1) @@ -730,7 +734,7 @@ pub trait ArrowArrayRef { // the len of the offset buffer (buffer 1) equals length + 1 let bits = bit_width(data_type, i)?; debug_assert_eq!(bits % 8, 0); - (self.array().length as usize + 1) * (bits / 8) + (length + 1) * (bits / 8) } (DataType::Utf8, 2) | (DataType::Binary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) @@ -759,7 +763,7 @@ pub trait ArrowArrayRef { // buffer len of primitive types _ => { let bits = bit_width(data_type, i)?; - bit_util::ceil(self.array().length as usize * bits, 8) + bit_util::ceil(length * bits, 8) } }) } @@ -769,7 +773,10 @@ pub trait ArrowArrayRef { /// The C Data interface's null buffer is part of the array of buffers. fn null_bit_buffer(&self) -> Option { // similar to `self.buffer_len(0)`, but without `Result`. - let buffer_len = bit_util::ceil(self.array().length as usize, 8); + // `ffi::ArrowArray` records array offset, we need to add it back to the + // buffer length to get the actual buffer length. 
+ let length = self.array().length as usize + self.array().offset as usize; + let buffer_len = bit_util::ceil(length, 8); unsafe { create_buffer(self.owner().clone(), self.array(), 0, buffer_len) } } @@ -982,6 +989,33 @@ mod tests { Ok(()) } + #[test] + fn test_round_trip_with_offset() -> Result<()> { + // create an array natively + let array = Int32Array::from(vec![Some(1), Some(2), None, Some(3), None]); + + let array = array.slice(1, 2); + + // export it + let array = ArrowArray::try_from(array.into_data())?; + + // (simulate consumer) import it + let data = ArrayData::try_from(array)?; + let array = make_array(data); + + // perform some operation + let array = array.as_any().downcast_ref::().unwrap(); + assert_eq!(array, &Int32Array::from(vec![Some(2), None])); + + let array = kernels::arithmetic::add(array, array).unwrap(); + + // verify + assert_eq!(array, Int32Array::from(vec![Some(4), None])); + + // (drop/release) + Ok(()) + } + #[test] #[cfg(not(feature = "force_validate"))] fn test_decimal_round_trip() -> Result<()> { From a3b344de39dd5652f1216b0497e15ca263b7d648 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 8 Feb 2023 16:58:01 +0000 Subject: [PATCH 0586/1411] Lazy array display (#3638) (#3647) * Lazy array display (#3638) * Update CSV writer * Borrow * Time formatting * Update pretty * Add FixedSizeBinaryArray * Further tweaks * Clippy * More clippy * More tweaks * More clippy * Clippy * Use lexical_core * Update doctest * Review feedback * Bump CI * Review feedback --- arrow-cast/src/display.rs | 1378 +++++++++++----------- arrow-csv/src/writer.rs | 213 +--- arrow-json/src/writer.rs | 88 +- arrow-schema/src/datatype.rs | 2 +- arrow/src/util/pretty.rs | 152 +-- arrow/tests/csv.rs | 2 +- parquet/src/arrow/arrow_writer/levels.rs | 4 +- parquet/src/arrow/arrow_writer/mod.rs | 22 +- parquet/src/arrow/async_reader/mod.rs | 8 +- 9 files changed, 873 insertions(+), 996 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index bd482989809e..6e06a0e39dc0 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -19,56 +19,553 @@ //! purposes. See the `pretty` crate for additional functions for //! record batch pretty printing. 
-use std::fmt::Write; +use std::fmt::{Display, Formatter, Write}; +use std::ops::Range; +use arrow_array::cast::*; +use arrow_array::temporal_conversions::*; use arrow_array::timezone::Tz; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::ArrowNativeType; use arrow_schema::*; -use chrono::prelude::SecondsFormat; -use chrono::{DateTime, Utc}; +use chrono::{NaiveDate, NaiveDateTime, SecondsFormat, TimeZone, Utc}; +use lexical_core::FormattedSize; -fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError { - ArrowError::CastError(format!( - "Cannot cast to {dt} at col index: {col_idx} row index: {row_idx}" - )) +type TimeFormat<'a> = Option<&'a str>; + +/// Options for formatting arrays +/// +/// By default nulls are formatted as `""` and temporal types formatted +/// according to RFC3339 +/// +#[derive(Debug, Clone)] +pub struct FormatOptions<'a> { + /// If set to `true` any formatting errors will be written to the output + /// instead of being converted into a [`std::fmt::Error`] + safe: bool, + /// Format string for nulls + null: &'a str, + /// Date format for date arrays + date_format: TimeFormat<'a>, + /// Format for DateTime arrays + datetime_format: TimeFormat<'a>, + /// Timestamp format for timestamp arrays + timestamp_format: TimeFormat<'a>, + /// Timestamp format for timestamp with timezone arrays + timestamp_tz_format: TimeFormat<'a>, + /// Time format for time arrays + time_format: TimeFormat<'a>, +} + +impl<'a> Default for FormatOptions<'a> { + fn default() -> Self { + Self { + safe: true, + null: "", + date_format: None, + datetime_format: None, + timestamp_format: None, + timestamp_tz_format: None, + time_format: None, + } + } +} + +impl<'a> FormatOptions<'a> { + /// If set to `true` any formatting errors will be written to the output + /// instead of being converted into a [`std::fmt::Error`] + pub fn with_display_error(mut self, safe: bool) -> Self { + self.safe = safe; + self + } + + /// Overrides the string used to represent a null + /// + /// Defaults to `""` + pub fn with_null(self, null: &'a str) -> Self { + Self { null, ..self } + } + + /// Overrides the format used for [`DataType::Date32`] columns + pub fn with_date_format(self, date_format: Option<&'a str>) -> Self { + Self { + date_format, + ..self + } + } + + /// Overrides the format used for [`DataType::Date64`] columns + pub fn with_datetime_format(self, datetime_format: Option<&'a str>) -> Self { + Self { + datetime_format, + ..self + } + } + + /// Overrides the format used for [`DataType::Timestamp`] columns without a timezone + pub fn with_timestamp_format(self, timestamp_format: Option<&'a str>) -> Self { + Self { + timestamp_format, + ..self + } + } + + /// Overrides the format used for [`DataType::Timestamp`] columns with a timezone + pub fn with_timestamp_tz_format(self, timestamp_tz_format: Option<&'a str>) -> Self { + Self { + timestamp_tz_format, + ..self + } + } + + /// Overrides the format used for [`DataType::Time32`] and [`DataType::Time64`] columns + pub fn with_time_format(self, time_format: Option<&'a str>) -> Self { + Self { + time_format, + ..self + } + } +} + +/// Implements [`Display`] for a specific array value +pub struct ValueFormatter<'a> { + idx: usize, + formatter: &'a ArrayFormatter<'a>, +} + +impl<'a> ValueFormatter<'a> { + /// Writes this value to the provided [`Write`] + /// + /// Note: this ignores [`FormatOptions::with_display_error`] and + /// will return an error on formatting issue + pub fn write(&self, s: &mut dyn Write) -> Result<(), 
ArrowError> { + match self.formatter.format.write(self.idx, s) { + Ok(_) => Ok(()), + Err(FormatError::Arrow(e)) => Err(e), + Err(FormatError::Format(_)) => { + Err(ArrowError::CastError("Format error".to_string())) + } + } + } + + /// Fallibly converts this to a string + pub fn try_to_string(&self) -> Result { + let mut s = String::new(); + self.write(&mut s)?; + Ok(s) + } +} + +impl<'a> Display for ValueFormatter<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self.formatter.format.write(self.idx, f) { + Ok(()) => Ok(()), + Err(FormatError::Arrow(e)) if self.formatter.safe => { + write!(f, "ERROR: {e}") + } + Err(_) => Err(std::fmt::Error), + } + } +} + +/// A string formatter for an [`Array`] +/// +/// This can be used with [`std::write`] to write type-erased `dyn Array` +/// +/// ``` +/// # use std::fmt::{Display, Formatter, Write}; +/// # use arrow_array::{Array, ArrayRef, Int32Array}; +/// # use arrow_cast::display::{ArrayFormatter, FormatOptions}; +/// # use arrow_schema::ArrowError; +/// struct MyContainer { +/// values: ArrayRef, +/// } +/// +/// impl Display for MyContainer { +/// fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { +/// let options = FormatOptions::default(); +/// let formatter = ArrayFormatter::try_new(self.values.as_ref(), &options) +/// .map_err(|_| std::fmt::Error)?; +/// +/// let mut iter = 0..self.values.len(); +/// if let Some(idx) = iter.next() { +/// write!(f, "{}", formatter.value(idx))?; +/// } +/// for idx in iter { +/// write!(f, ", {}", formatter.value(idx))?; +/// } +/// Ok(()) +/// } +/// } +/// ``` +/// +/// [`ValueFormatter::write`] can also be used to get a semantic error, instead of the +/// opaque [`std::fmt::Error`] +/// +/// ``` +/// # use std::fmt::Write; +/// # use arrow_array::Array; +/// # use arrow_cast::display::{ArrayFormatter, FormatOptions}; +/// # use arrow_schema::ArrowError; +/// fn format_array( +/// f: &mut dyn Write, +/// array: &dyn Array, +/// options: &FormatOptions, +/// ) -> Result<(), ArrowError> { +/// let formatter = ArrayFormatter::try_new(array, options)?; +/// for i in 0..array.len() { +/// formatter.value(i).write(f)? +/// } +/// Ok(()) +/// } +/// ``` +/// +pub struct ArrayFormatter<'a> { + format: Box, + safe: bool, +} + +impl<'a> ArrayFormatter<'a> { + /// Returns an [`ArrayFormatter`] that can be used to format `array` + /// + /// This returns an error if an array of the given data type cannot be formatted + pub fn try_new( + array: &'a dyn Array, + options: &FormatOptions<'a>, + ) -> Result { + Ok(Self { + format: make_formatter(array, options)?, + safe: options.safe, + }) + } + + /// Returns a [`ValueFormatter`] that implements [`Display`] for + /// the value of the array at `idx` + pub fn value(&self, idx: usize) -> ValueFormatter<'_> { + ValueFormatter { + formatter: self, + idx, + } + } +} + +fn make_formatter<'a>( + array: &'a dyn Array, + options: &FormatOptions<'a>, +) -> Result, ArrowError> { + downcast_primitive_array! 
{ + array => array_format(array, options), + DataType::Null => array_format(as_null_array(array), options), + DataType::Boolean => array_format(as_boolean_array(array), options), + DataType::Utf8 => array_format(as_string_array(array), options), + DataType::LargeUtf8 => array_format(as_largestring_array(array), options), + DataType::Binary => array_format(as_generic_binary_array::(array), options), + DataType::LargeBinary => array_format(as_generic_binary_array::(array), options), + DataType::FixedSizeBinary(_) => { + let a = array.as_any().downcast_ref::().unwrap(); + array_format(a, options) + } + DataType::Dictionary(_, _) => downcast_dictionary_array! { + array => array_format(array, options), + _ => unreachable!() + } + DataType::List(_) => array_format(as_generic_list_array::(array), options), + DataType::LargeList(_) => array_format(as_generic_list_array::(array), options), + DataType::FixedSizeList(_, _) => { + let a = array.as_any().downcast_ref::().unwrap(); + array_format(a, options) + } + DataType::Struct(_) => array_format(as_struct_array(array), options), + DataType::Map(_, _) => array_format(as_map_array(array), options), + DataType::Union(_, _, _) => array_format(as_union_array(array), options), + d => Err(ArrowError::NotYetImplemented(format!("formatting {d} is not yet supported"))), + } +} + +/// Either an [`ArrowError`] or [`std::fmt::Error`] +enum FormatError { + Format(std::fmt::Error), + Arrow(ArrowError), +} + +type FormatResult = Result<(), FormatError>; + +impl From for FormatError { + fn from(value: std::fmt::Error) -> Self { + Self::Format(value) + } +} + +impl From for FormatError { + fn from(value: ArrowError) -> Self { + Self::Arrow(value) + } +} + +/// [`Display`] but accepting an index +trait DisplayIndex { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult; +} + +/// [`DisplayIndex`] with additional state +trait DisplayIndexState<'a> { + type State; + + fn prepare(&self, options: &FormatOptions<'a>) -> Result; + + fn write(&self, state: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult; +} + +impl<'a, T: DisplayIndex> DisplayIndexState<'a> for T { + type State = (); + + fn prepare(&self, _options: &FormatOptions<'a>) -> Result { + Ok(()) + } + + fn write(&self, _: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { + DisplayIndex::write(self, idx, f) + } +} + +struct ArrayFormat<'a, F: DisplayIndexState<'a>> { + state: F::State, + array: F, + null: &'a str, +} + +fn array_format<'a, F>( + array: F, + options: &FormatOptions<'a>, +) -> Result, ArrowError> +where + F: DisplayIndexState<'a> + Array + 'a, +{ + let state = array.prepare(options)?; + Ok(Box::new(ArrayFormat { + state, + array, + null: options.null, + })) +} + +impl<'a, F: DisplayIndexState<'a> + Array> DisplayIndex for ArrayFormat<'a, F> { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + if self.array.is_null(idx) { + if !self.null.is_empty() { + f.write_str(self.null)? + } + return Ok(()); + } + DisplayIndexState::write(&self.array, &self.state, idx, f) + } +} + +impl<'a> DisplayIndex for &'a BooleanArray { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + write!(f, "{}", self.value(idx))?; + Ok(()) + } +} + +impl<'a> DisplayIndex for &'a NullArray { + fn write(&self, _idx: usize, _f: &mut dyn Write) -> FormatResult { + Ok(()) + } +} + +macro_rules! 
primitive_display { + ($($t:ty),+) => { + $(impl<'a> DisplayIndex for &'a PrimitiveArray<$t> + { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + let value = self.value(idx); + let mut buffer = [0u8; <$t as ArrowPrimitiveType>::Native::FORMATTED_SIZE]; + // SAFETY: + // buffer is T::FORMATTED_SIZE + let b = unsafe { lexical_core::write_unchecked(value, &mut buffer) }; + // Lexical core produces valid UTF-8 + let s = unsafe { std::str::from_utf8_unchecked(b) }; + f.write_str(s)?; + Ok(()) + } + })+ + }; } -macro_rules! make_string { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); +primitive_display!(Int8Type, Int16Type, Int32Type, Int64Type); +primitive_display!(UInt8Type, UInt16Type, UInt32Type, UInt64Type); +primitive_display!(Float32Type, Float64Type); - Ok(array.value($row).to_string()) - }}; +impl<'a> DisplayIndex for &'a PrimitiveArray { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + write!(f, "{}", self.value(idx))?; + Ok(()) + } +} + +macro_rules! decimal_display { + ($($t:ty),+) => { + $(impl<'a> DisplayIndexState<'a> for &'a PrimitiveArray<$t> { + type State = (u8, i8); + + fn prepare(&self, _options: &FormatOptions<'a>) -> Result { + Ok((self.precision(), self.scale())) + } + + fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { + write!(f, "{}", <$t>::format_decimal(self.values()[idx], s.0, s.1))?; + Ok(()) + } + })+ + }; +} + +decimal_display!(Decimal128Type, Decimal256Type); + +fn write_timestamp( + f: &mut dyn Write, + naive: NaiveDateTime, + timezone: Option, + format: Option<&str>, +) -> FormatResult { + match timezone { + Some(tz) => { + let date = Utc.from_utc_datetime(&naive).with_timezone(&tz); + match format { + Some(s) => write!(f, "{}", date.format(s))?, + None => { + write!(f, "{}", date.to_rfc3339_opts(SecondsFormat::AutoSi, true))? + } + } + } + None => match format { + Some(s) => write!(f, "{}", naive.format(s))?, + None => write!(f, "{naive:?}")?, + }, + } + Ok(()) } -macro_rules! make_string_interval_year_month { - ($column: ident, $row: ident) => {{ - let array = $column - .as_any() - .downcast_ref::() - .unwrap(); +macro_rules! timestamp_display { + ($($t:ty),+) => { + $(impl<'a> DisplayIndexState<'a> for &'a PrimitiveArray<$t> { + type State = (Option, TimeFormat<'a>); - let interval = array.value($row) as f64; + fn prepare(&self, options: &FormatOptions<'a>) -> Result { + match self.data_type() { + DataType::Timestamp(_, Some(tz)) => Ok((Some(tz.parse()?), options.timestamp_tz_format)), + DataType::Timestamp(_, None) => Ok((None, options.timestamp_format)), + _ => unreachable!(), + } + } + + fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { + let value = self.value(idx); + let naive = as_datetime::<$t>(value).ok_or_else(|| { + ArrowError::CastError(format!( + "Failed to convert {} to datetime for {}", + value, + self.data_type() + )) + })?; + + write_timestamp(f, naive, s.0, s.1.clone()) + } + })+ + }; +} + +timestamp_display!( + TimestampSecondType, + TimestampMillisecondType, + TimestampMicrosecondType, + TimestampNanosecondType +); + +macro_rules! 
temporal_display { + ($convert:ident, $format:ident, $t:ty) => { + impl<'a> DisplayIndexState<'a> for &'a PrimitiveArray<$t> { + type State = TimeFormat<'a>; + + fn prepare( + &self, + options: &FormatOptions<'a>, + ) -> Result { + Ok(options.$format) + } + + fn write( + &self, + fmt: &Self::State, + idx: usize, + f: &mut dyn Write, + ) -> FormatResult { + let value = self.value(idx); + let naive = $convert(value as _).ok_or_else(|| { + ArrowError::CastError(format!( + "Failed to convert {} to temporal for {}", + value, + self.data_type() + )) + })?; + + match fmt { + Some(s) => write!(f, "{}", naive.format(s))?, + None => write!(f, "{naive:?}")?, + } + Ok(()) + } + } + }; +} + +#[inline] +fn date32_to_date(value: i32) -> Option { + Some(date32_to_datetime(value)?.date()) +} + +temporal_display!(date32_to_date, date_format, Date32Type); +temporal_display!(date64_to_datetime, datetime_format, Date64Type); +temporal_display!(time32s_to_time, time_format, Time32SecondType); +temporal_display!(time32ms_to_time, time_format, Time32MillisecondType); +temporal_display!(time64us_to_time, time_format, Time64MicrosecondType); +temporal_display!(time64ns_to_time, time_format, Time64NanosecondType); + +macro_rules! duration_display { + ($convert:ident, $t:ty) => { + impl<'a> DisplayIndex for &'a PrimitiveArray<$t> { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + write!(f, "{}", $convert(self.value(idx)))?; + Ok(()) + } + } + }; +} + +duration_display!(duration_s_to_duration, DurationSecondType); +duration_display!(duration_ms_to_duration, DurationMillisecondType); +duration_display!(duration_us_to_duration, DurationMicrosecondType); +duration_display!(duration_ns_to_duration, DurationNanosecondType); + +impl<'a> DisplayIndex for &'a PrimitiveArray { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + let interval = self.value(idx) as f64; let years = (interval / 12_f64).floor(); let month = interval - (years * 12_f64); - Ok(format!( - "{} years {} mons 0 days 0 hours 0 mins 0.00 secs", - years, month, - )) - }}; + write!( + f, + "{years} years {month} mons 0 days 0 hours 0 mins 0.00 secs", + )?; + Ok(()) + } } -macro_rules! make_string_interval_day_time { - ($column: ident, $row: ident) => {{ - let array = $column - .as_any() - .downcast_ref::() - .unwrap(); - - let value: u64 = array.value($row) as u64; +impl<'a> DisplayIndex for &'a PrimitiveArray { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + let value: u64 = self.value(idx) as u64; let days_parts: i32 = ((value & 0xFFFFFFFF00000000) >> 32) as i32; let milliseconds_part: i32 = (value & 0xFFFFFFFF) as i32; @@ -88,7 +585,8 @@ macro_rules! make_string_interval_day_time { "" }; - Ok(format!( + write!( + f, "0 years 0 mons {} days {} hours {} mins {}{}.{:03} secs", days_parts, hours, @@ -96,18 +594,14 @@ macro_rules! make_string_interval_day_time { secs_sign, secs.abs(), milliseconds.abs(), - )) - }}; + )?; + Ok(()) + } } -macro_rules! make_string_interval_month_day_nano { - ($column: ident, $row: ident) => {{ - let array = $column - .as_any() - .downcast_ref::() - .unwrap(); - - let value: u128 = array.value($row) as u128; +impl<'a> DisplayIndex for &'a PrimitiveArray { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + let value: u128 = self.value(idx) as u128; let months_part: i32 = ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; @@ -125,7 +619,8 @@ macro_rules! 
make_string_interval_month_day_nano { let secs_sign = if secs < 0 || nanoseconds < 0 { "-" } else { "" }; - Ok(format!( + write!( + f, "0 years {} mons {} days {} hours {} mins {}{}.{:09} secs", months_part, days_part, @@ -134,657 +629,220 @@ macro_rules! make_string_interval_month_day_nano { secs_sign, secs.abs(), nanoseconds.abs(), - )) - }}; -} - -macro_rules! make_string_date { - ($array_type:ty, $dt:expr, $column: ident, $col_idx:ident, $row_idx: ident) => {{ - Ok($column - .as_any() - .downcast_ref::<$array_type>() - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? - .value_as_date($row_idx) - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? - .to_string()) - }}; -} - -macro_rules! make_string_date_with_format { - ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ - Ok($column - .as_any() - .downcast_ref::<$array_type>() - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? - .value_as_datetime($row_idx) - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? - .format($format) - .to_string()) - }}; -} - -macro_rules! handle_string_date { - ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ - match $format { - Some(format) => { - make_string_date_with_format!( - $array_type, - $dt, - format, - $column, - $col_idx, - $row_idx - ) - } - None => make_string_date!($array_type, $dt, $column, $col_idx, $row_idx), - } - }}; -} - -macro_rules! make_string_time { - ($array_type:ty, $dt:expr, $column: ident, $col_idx:ident, $row_idx: ident) => {{ - Ok($column - .as_any() - .downcast_ref::<$array_type>() - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? - .value_as_time($row_idx) - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? - .to_string()) - }}; -} - -macro_rules! make_string_time_with_format { - ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ - Ok($column - .as_any() - .downcast_ref::<$array_type>() - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? - .value_as_time($row_idx) - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? - .format($format) - .to_string()) - }}; -} - -macro_rules! handle_string_time { - ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident, $row_idx: ident) => { - match $format { - Some(format) => { - make_string_time_with_format!( - $array_type, - $dt, - format, - $column, - $col_idx, - $row_idx - ) - } - None => make_string_time!($array_type, $dt, $column, $col_idx, $row_idx), - } - }; + )?; + Ok(()) + } } -macro_rules! make_string_datetime { - ($array_type:ty, $dt:expr, $tz_string: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ - let array = $column - .as_any() - .downcast_ref::<$array_type>() - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; - - let s = match $tz_string { - Some(tz_string) => match tz_string.parse::() { - Ok(tz) => array - .value_as_datetime_with_tz($row_idx, tz) - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? 
- .to_rfc3339_opts(SecondsFormat::AutoSi, true) - .to_string(), - Err(_) => { - let datetime = array - .value_as_datetime($row_idx) - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; - format!("{:?} (Unknown Time Zone '{}')", datetime, tz_string) - } - }, - None => { - let datetime = array - .value_as_datetime($row_idx) - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; - format!("{:?}", datetime) - } - }; - - Ok(s) - }}; -} - -macro_rules! make_string_datetime_with_format { - ($array_type:ty, $dt:expr, $format: ident, $tz_string: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ - let array = $column - .as_any() - .downcast_ref::<$array_type>() - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; - let datetime = array - .value_as_datetime($row_idx) - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; - - let s = match $tz_string { - Some(tz_string) => match tz_string.parse::() { - Ok(tz) => { - let utc_time = DateTime::::from_utc(datetime, Utc); - let local_time = utc_time.with_timezone(&tz); - local_time.format($format).to_string() - } - Err(_) => { - format!("{:?} (Unknown Time Zone '{}')", datetime, tz_string) - } - }, - None => datetime.format($format).to_string(), - }; +impl<'a, O: OffsetSizeTrait> DisplayIndex for &'a GenericStringArray { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + write!(f, "{}", self.value(idx))?; + Ok(()) + } +} - Ok(s) - }}; -} - -macro_rules! handle_string_datetime { - ($array_type:ty, $dt:expr, $format: ident, $tz_string: ident, $column: ident, $col_idx:ident, $row_idx: ident) => { - match $format { - Some(format) => make_string_datetime_with_format!( - $array_type, - $dt, - format, - $tz_string, - $column, - $col_idx, - $row_idx - ), - None => make_string_datetime!( - $array_type, - $dt, - $tz_string, - $column, - $col_idx, - $row_idx - ), +impl<'a, O: OffsetSizeTrait> DisplayIndex for &'a GenericBinaryArray { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + let v = self.value(idx); + for byte in v { + write!(f, "{byte:02x}")?; } - }; + Ok(()) + } } -// It's not possible to do array.value($row).to_string() for &[u8], let's format it as hex -macro_rules! make_string_hex { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); +impl<'a> DisplayIndex for &'a FixedSizeBinaryArray { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + let v = self.value(idx); + for byte in v { + write!(f, "{byte:02x}")?; + } + Ok(()) + } +} - let mut tmp = "".to_string(); +impl<'a, K: ArrowDictionaryKeyType> DisplayIndexState<'a> for &'a DictionaryArray { + type State = Box; - for character in array.value($row) { - let _ = write!(tmp, "{:02x}", character); - } + fn prepare(&self, options: &FormatOptions<'a>) -> Result { + make_formatter(self.values().as_ref(), options) + } - Ok(tmp) - }}; -} - -macro_rules! make_string_from_list { - ($column: ident, $row: ident) => {{ - let list = $column - .as_any() - .downcast_ref::() - .ok_or(ArrowError::InvalidArgumentError(format!( - "Repl error: could not convert list column to list array." - )))? - .value($row); - let string_values = (0..list.len()) - .map(|i| array_value_to_string(&list.clone(), i)) - .collect::, _>>()?; - Ok(format!("[{}]", string_values.join(", "))) - }}; -} - -macro_rules! 
make_string_from_large_list { - ($column: ident, $row: ident) => {{ - let list = $column - .as_any() - .downcast_ref::() - .ok_or(ArrowError::InvalidArgumentError(format!( - "Repl error: could not convert large list column to list array." - )))? - .value($row); - let string_values = (0..list.len()) - .map(|i| array_value_to_string(&list, i)) - .collect::, _>>()?; - Ok(format!("[{}]", string_values.join(", "))) - }}; -} - -macro_rules! make_string_from_fixed_size_list { - ($column: ident, $row: ident) => {{ - let list = $column - .as_any() - .downcast_ref::() - .ok_or(ArrowError::InvalidArgumentError(format!( - "Repl error: could not convert list column to list array." - )))? - .value($row); - let string_values = (0..list.len()) - .map(|i| array_value_to_string(&list.clone(), i)) - .collect::, _>>()?; - Ok(format!("[{}]", string_values.join(", "))) - }}; -} - -macro_rules! make_string_from_duration { - ($array_type:ty, $dt:expr, $column:ident, $col_idx:ident, $row_idx: ident) => {{ - Ok($column - .as_any() - .downcast_ref::<$array_type>() - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? - .value_as_duration($row_idx) - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? - .to_string()) - }}; -} - -#[inline(always)] -pub fn make_string_from_decimal( - column: &dyn Array, - row: usize, -) -> Result { - let array = column.as_any().downcast_ref::().unwrap(); + fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { + let value_idx = self.keys().values()[idx].as_usize(); + s.as_ref().write(value_idx, f) + } +} - Ok(array.value_as_string(row)) +fn write_list( + f: &mut dyn Write, + mut range: Range, + values: &dyn DisplayIndex, +) -> FormatResult { + f.write_char('[')?; + if let Some(idx) = range.next() { + values.write(idx, f)?; + } + for idx in range { + write!(f, ", ")?; + values.write(idx, f)?; + } + f.write_char(']')?; + Ok(()) } -fn append_struct_field_string( - target: &mut String, - name: &str, - field_col: &dyn Array, - row: usize, -) -> Result<(), ArrowError> { - target.push('"'); - target.push_str(name); - target.push_str("\": "); - - if field_col.is_null(row) { - target.push_str("null"); - } else { - match field_col.data_type() { - DataType::Utf8 | DataType::LargeUtf8 => { - target.push('"'); - target.push_str(array_value_to_string(field_col, row)?.as_str()); - target.push('"'); - } - _ => { - target.push_str(array_value_to_string(field_col, row)?.as_str()); - } - } +impl<'a, O: OffsetSizeTrait> DisplayIndexState<'a> for &'a GenericListArray { + type State = Box; + + fn prepare(&self, options: &FormatOptions<'a>) -> Result { + make_formatter(self.values().as_ref(), options) } - Ok(()) + fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { + let offsets = self.value_offsets(); + let end = offsets[idx + 1].as_usize(); + let start = offsets[idx].as_usize(); + write_list(f, start..end, s.as_ref()) + } } -fn append_map_field_string( - target: &mut String, - field_col: &dyn Array, - row: usize, -) -> Result<(), ArrowError> { - if field_col.is_null(row) { - target.push_str("null"); - } else { - match field_col.data_type() { - DataType::Utf8 | DataType::LargeUtf8 => { - target.push('"'); - target.push_str(array_value_to_string(field_col, row)?.as_str()); - target.push('"'); - } - _ => { - target.push_str(array_value_to_string(field_col, row)?.as_str()); - } - } +impl<'a> DisplayIndexState<'a> for &'a FixedSizeListArray { + type State = (usize, Box); + + fn prepare(&self, options: &FormatOptions<'a>) -> Result { + 
let values = make_formatter(self.values().as_ref(), options)?; + let length = self.value_length(); + Ok((length as usize, values)) } - Ok(()) + fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { + let start = idx * s.0; + let end = start + s.0; + write_list(f, start..end, s.1.as_ref()) + } } -/// Get the value at the given row in an array as a String. -/// -/// Note this function is quite inefficient and is unlikely to be -/// suitable for converting large arrays or record batches. -fn array_value_to_string_internal( - column: &dyn Array, - col_idx: usize, - row_idx: usize, - format: Option<&str>, -) -> Result { - if column.is_null(row_idx) { - return Ok("".to_string()); - } - match column.data_type() { - DataType::Utf8 => make_string!(array::StringArray, column, row_idx), - DataType::LargeUtf8 => make_string!(array::LargeStringArray, column, row_idx), - DataType::Binary => make_string_hex!(array::BinaryArray, column, row_idx), - DataType::LargeBinary => { - make_string_hex!(array::LargeBinaryArray, column, row_idx) - } - DataType::FixedSizeBinary(_) => { - make_string_hex!(array::FixedSizeBinaryArray, column, row_idx) - } - DataType::Boolean => make_string!(array::BooleanArray, column, row_idx), - DataType::Int8 => make_string!(array::Int8Array, column, row_idx), - DataType::Int16 => make_string!(array::Int16Array, column, row_idx), - DataType::Int32 => make_string!(array::Int32Array, column, row_idx), - DataType::Int64 => make_string!(array::Int64Array, column, row_idx), - DataType::UInt8 => make_string!(array::UInt8Array, column, row_idx), - DataType::UInt16 => make_string!(array::UInt16Array, column, row_idx), - DataType::UInt32 => make_string!(array::UInt32Array, column, row_idx), - DataType::UInt64 => make_string!(array::UInt64Array, column, row_idx), - DataType::Float16 => make_string!(array::Float16Array, column, row_idx), - DataType::Float32 => make_string!(array::Float32Array, column, row_idx), - DataType::Float64 => make_string!(array::Float64Array, column, row_idx), - DataType::Decimal128(..) 
=> make_string_from_decimal(column, row_idx), - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second => { - handle_string_datetime!( - array::TimestampSecondArray, - "Timestamp", - format, - tz_string_opt, - column, - col_idx, - row_idx - ) - } - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Millisecond => { - handle_string_datetime!( - array::TimestampMillisecondArray, - "Timestamp", - format, - tz_string_opt, - column, - col_idx, - row_idx - ) - } - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Microsecond => { - handle_string_datetime!( - array::TimestampMicrosecondArray, - "Timestamp", - format, - tz_string_opt, - column, - col_idx, - row_idx - ) - } - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Nanosecond => { - handle_string_datetime!( - array::TimestampNanosecondArray, - "Timestamp", - format, - tz_string_opt, - column, - col_idx, - row_idx - ) - } - DataType::Date32 => { - handle_string_date!( - array::Date32Array, - "Date32", - format, - column, - col_idx, - row_idx - ) - } - DataType::Date64 => { - handle_string_date!( - array::Date64Array, - "Date64", - format, - column, - col_idx, - row_idx - ) - } - DataType::Time32(unit) if *unit == TimeUnit::Second => { - handle_string_time!( - array::Time32SecondArray, - "Time32", - format, - column, - col_idx, - row_idx - ) - } - DataType::Time32(unit) if *unit == TimeUnit::Millisecond => { - handle_string_time!( - array::Time32MillisecondArray, - "Time32", - format, - column, - col_idx, - row_idx - ) - } - DataType::Time64(unit) if *unit == TimeUnit::Microsecond => { - handle_string_time!( - array::Time64MicrosecondArray, - "Time64", - format, - column, - col_idx, - row_idx - ) - } - DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => { - handle_string_time!( - array::Time64NanosecondArray, - "Time64", - format, - column, - col_idx, - row_idx - ) - } - DataType::Interval(unit) => match unit { - IntervalUnit::DayTime => { - make_string_interval_day_time!(column, row_idx) - } - IntervalUnit::YearMonth => { - make_string_interval_year_month!(column, row_idx) - } - IntervalUnit::MonthDayNano => { - make_string_interval_month_day_nano!(column, row_idx) - } - }, - DataType::List(_) => make_string_from_list!(column, row_idx), - DataType::LargeList(_) => make_string_from_large_list!(column, row_idx), - DataType::Dictionary(index_type, _value_type) => match **index_type { - DataType::Int8 => dict_array_value_to_string::(column, row_idx), - DataType::Int16 => dict_array_value_to_string::(column, row_idx), - DataType::Int32 => dict_array_value_to_string::(column, row_idx), - DataType::Int64 => dict_array_value_to_string::(column, row_idx), - DataType::UInt8 => dict_array_value_to_string::(column, row_idx), - DataType::UInt16 => dict_array_value_to_string::(column, row_idx), - DataType::UInt32 => dict_array_value_to_string::(column, row_idx), - DataType::UInt64 => dict_array_value_to_string::(column, row_idx), - _ => Err(ArrowError::InvalidArgumentError(format!( - "Pretty printing not supported for {:?} due to index type", - column.data_type() - ))), - }, - DataType::FixedSizeList(_, _) => { - make_string_from_fixed_size_list!(column, row_idx) - } - DataType::Struct(_) => { - let st = column - .as_any() - .downcast_ref::() - .ok_or_else(|| { - ArrowError::InvalidArgumentError( - "Repl error: could not convert struct column to struct array." 
- .to_string(), - ) - })?; +/// Pairs a boxed [`DisplayIndex`] with its field name +type FieldDisplay<'a> = (&'a str, Box); - let mut s = String::new(); - s.push('{'); - let mut kv_iter = st.columns().iter().zip(st.column_names()); - if let Some((col, name)) = kv_iter.next() { - append_struct_field_string(&mut s, name, col, row_idx)?; - } - for (col, name) in kv_iter { - s.push_str(", "); - append_struct_field_string(&mut s, name, col, row_idx)?; - } - s.push('}'); +impl<'a> DisplayIndexState<'a> for &'a StructArray { + type State = Vec>; - Ok(s) - } - DataType::Map(_, _) => { - let map_array = - column.as_any().downcast_ref::().ok_or_else(|| { - ArrowError::InvalidArgumentError( - "Repl error: could not convert column to map array.".to_string(), - ) - })?; - let map_entry = map_array.value(row_idx); - let st = map_entry - .as_any() - .downcast_ref::() - .ok_or_else(|| { - ArrowError::InvalidArgumentError( - "Repl error: could not convert map entry to struct array." - .to_string(), - ) - })?; - let mut s = String::new(); - s.push('{'); - let entries_count = st.column(0).len(); - for i in 0..entries_count { - if i > 0 { - s.push_str(", "); - } - append_map_field_string(&mut s, st.column(0), i)?; - s.push_str(": "); - append_map_field_string(&mut s, st.column(1), i)?; - } - s.push('}'); + fn prepare(&self, options: &FormatOptions<'a>) -> Result { + let fields = match (*self).data_type() { + DataType::Struct(f) => f, + _ => unreachable!(), + }; - Ok(s) + self.columns() + .iter() + .zip(fields) + .map(|(a, f)| { + let format = make_formatter(a.as_ref(), options)?; + Ok((f.name().as_str(), format)) + }) + .collect() + } + + fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { + let mut iter = s.iter(); + f.write_char('{')?; + if let Some((name, display)) = iter.next() { + write!(f, "{name}: ")?; + display.as_ref().write(idx, f)?; } - DataType::Union(field_vec, type_ids, mode) => { - union_to_string(column, row_idx, field_vec, type_ids, mode) + for (name, display) in iter { + write!(f, ", {name}: ")?; + display.as_ref().write(idx, f)?; } - DataType::Duration(unit) => match *unit { - TimeUnit::Second => { - make_string_from_duration!( - array::DurationSecondArray, - "Duration", - column, - col_idx, - row_idx - ) - } - TimeUnit::Millisecond => { - make_string_from_duration!( - array::DurationMillisecondArray, - "Duration", - column, - col_idx, - row_idx - ) - } - TimeUnit::Microsecond => { - make_string_from_duration!( - array::DurationMicrosecondArray, - "Duration", - column, - col_idx, - row_idx - ) - } - TimeUnit::Nanosecond => { - make_string_from_duration!( - array::DurationNanosecondArray, - "Duration", - column, - col_idx, - row_idx - ) - } - }, - _ => Err(ArrowError::InvalidArgumentError(format!( - "Pretty printing not implemented for {:?} type", - column.data_type() - ))), + f.write_char('}')?; + Ok(()) } } -pub fn temporal_array_value_to_string( - column: &dyn Array, - col_idx: usize, - row_idx: usize, - format: Option<&str>, -) -> Result { - array_value_to_string_internal(column, col_idx, row_idx, format) -} +impl<'a> DisplayIndexState<'a> for &'a MapArray { + type State = (Box, Box); -pub fn array_value_to_string( - column: &dyn Array, - row_idx: usize, -) -> Result { - array_value_to_string_internal(column, 0, row_idx, None) -} + fn prepare(&self, options: &FormatOptions<'a>) -> Result { + let keys = make_formatter(self.keys().as_ref(), options)?; + let values = make_formatter(self.values().as_ref(), options)?; + Ok((keys, values)) + } -/// Converts the 
value of the union array at `row` to a String -fn union_to_string( - column: &dyn Array, - row: usize, - fields: &[Field], - type_ids: &[i8], - mode: &UnionMode, -) -> Result { - let list = column - .as_any() - .downcast_ref::() - .ok_or_else(|| { - ArrowError::InvalidArgumentError( - "Repl error: could not convert union column to union array.".to_string(), - ) - })?; - let type_id = list.type_id(row); - let field_idx = type_ids.iter().position(|t| t == &type_id).ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "Repl error: could not get field name for type id: {type_id} in union array.", - )) - })?; - let name = fields.get(field_idx).unwrap().name(); - - let value = array_value_to_string( - list.child(type_id), - match mode { - UnionMode::Dense => list.value_offset(row) as usize, - UnionMode::Sparse => row, - }, - )?; + fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { + let offsets = self.value_offsets(); + let end = offsets[idx + 1].as_usize(); + let start = offsets[idx].as_usize(); + let mut iter = start..end; + + f.write_char('{')?; + if let Some(idx) = iter.next() { + s.0.write(idx, f)?; + write!(f, ": ")?; + s.1.write(idx, f)?; + } + + for idx in iter { + write!(f, ", ")?; + s.0.write(idx, f)?; + write!(f, ": ")?; + s.1.write(idx, f)?; + } - Ok(format!("{{{name}={value}}}")) + f.write_char('}')?; + Ok(()) + } } -/// Converts the value of the dictionary array at `row` to a String -fn dict_array_value_to_string( - colum: &dyn Array, - row: usize, -) -> Result { - let dict_array = colum.as_any().downcast_ref::>().unwrap(); - let keys_array = dict_array.keys(); +impl<'a> DisplayIndexState<'a> for &'a UnionArray { + type State = ( + Vec)>>, + UnionMode, + ); - if keys_array.is_null(row) { - return Ok(String::from("")); + fn prepare(&self, options: &FormatOptions<'a>) -> Result { + let (fields, type_ids, mode) = match (*self).data_type() { + DataType::Union(fields, type_ids, mode) => (fields, type_ids, mode), + _ => unreachable!(), + }; + + let max_id = type_ids.iter().copied().max().unwrap_or_default() as usize; + let mut out: Vec> = (0..max_id + 1).map(|_| None).collect(); + for (i, field) in type_ids.iter().zip(fields) { + let formatter = make_formatter(self.child(*i).as_ref(), options)?; + out[*i as usize] = Some((field.name().as_str(), formatter)) + } + Ok((out, *mode)) } - let dict_index = keys_array.value(row).as_usize(); - array_value_to_string(dict_array.values(), dict_index) + fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { + let id = self.type_id(idx); + let idx = match s.1 { + UnionMode::Dense => self.value_offset(idx) as usize, + UnionMode::Sparse => idx, + }; + let (name, field) = s.0[id as usize].as_ref().unwrap(); + + write!(f, "{{{name}=")?; + field.write(idx, f)?; + f.write_char('}')?; + Ok(()) + } +} + +/// Get the value at the given row in an array as a String. +/// +/// Note this function is quite inefficient and is unlikely to be +/// suitable for converting large arrays or record batches. 
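// A minimal usage sketch of the formatter API this section builds on, assuming
// the crate layout used throughout the patch (arrow_array / arrow_cast /
// arrow_schema); the values are illustrative and not part of the diff.
use arrow_array::{Array, Int32Array};
use arrow_cast::display::{ArrayFormatter, FormatOptions};
use arrow_schema::ArrowError;

fn render_column(array: &dyn Array) -> Result<Vec<String>, ArrowError> {
    // Render formatting errors inline rather than failing the whole column.
    let options = FormatOptions::default().with_display_error(true);
    let formatter = ArrayFormatter::try_new(array, &options)?;
    // `value(i)` is a cheap Display adapter; `to_string` materialises it.
    Ok((0..array.len()).map(|i| formatter.value(i).to_string()).collect())
}

fn main() -> Result<(), ArrowError> {
    let array = Int32Array::from(vec![Some(1), None, Some(3)]);
    // Nulls render as the configured null string, which is empty by default.
    assert_eq!(render_column(&array)?, vec!["1", "", "3"]);
    Ok(())
}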
+/// +/// Please see [`ArrayFormatter`] for a more performant interface +pub fn array_value_to_string( + column: &dyn Array, + row: usize, +) -> Result { + let options = FormatOptions::default().with_display_error(true); + let formatter = ArrayFormatter::try_new(column, &options)?; + Ok(formatter.value(row).to_string()) } /// Converts numeric type to a `String` @@ -824,7 +882,7 @@ mod tests { ) .unwrap(); assert_eq!( - "{\"d\": 30, \"e\": 40, \"f\": 50}", + "{d: 30, e: 40, f: 50}", array_value_to_string(&map_array, 1).unwrap() ); } diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 94620be6629f..e0734a15fd47 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -63,12 +63,10 @@ //! } //! ``` -use arrow_array::types::*; use arrow_array::*; -use arrow_cast::display::{ - array_value_to_string, lexical_to_string, temporal_array_value_to_string, -}; +use arrow_cast::display::*; use arrow_schema::*; +use csv::ByteRecord; use std::io::Write; use crate::map_csv_error; @@ -79,15 +77,6 @@ const DEFAULT_TIMESTAMP_FORMAT: &str = "%FT%H:%M:%S.%9f"; const DEFAULT_TIMESTAMP_TZ_FORMAT: &str = "%FT%H:%M:%S.%9f%:z"; const DEFAULT_NULL_VALUE: &str = ""; -fn write_primitive_value(array: &ArrayRef, i: usize) -> String -where - T: ArrowPrimitiveType, - T::Native: lexical_core::ToLexical, -{ - let c = array.as_any().downcast_ref::>().unwrap(); - lexical_to_string(c.value(i)) -} - /// A CSV writer #[derive(Debug)] pub struct Writer { @@ -100,10 +89,8 @@ pub struct Writer { /// The datetime format for datetime arrays datetime_format: Option, /// The timestamp format for timestamp arrays - #[allow(dead_code)] timestamp_format: Option, /// The timestamp format for timestamp (with timezone) arrays - #[allow(dead_code)] timestamp_tz_format: Option, /// The time format for time arrays time_format: Option, @@ -132,113 +119,6 @@ impl Writer { } } - /// Convert a record to a string vector - fn convert( - &self, - batch: &[ArrayRef], - row_index: usize, - buffer: &mut [String], - ) -> Result<(), ArrowError> { - // TODO: it'd be more efficient if we could create `record: Vec<&[u8]> - for (col_index, item) in buffer.iter_mut().enumerate() { - let col = &batch[col_index]; - if col.is_null(row_index) { - // write the configured null value - *item = self.null_value.clone(); - continue; - } - let string = match col.data_type() { - DataType::Float64 => write_primitive_value::(col, row_index), - DataType::Float32 => write_primitive_value::(col, row_index), - DataType::Int8 => write_primitive_value::(col, row_index), - DataType::Int16 => write_primitive_value::(col, row_index), - DataType::Int32 => write_primitive_value::(col, row_index), - DataType::Int64 => write_primitive_value::(col, row_index), - DataType::UInt8 => write_primitive_value::(col, row_index), - DataType::UInt16 => write_primitive_value::(col, row_index), - DataType::UInt32 => write_primitive_value::(col, row_index), - DataType::UInt64 => write_primitive_value::(col, row_index), - DataType::Boolean => array_value_to_string(col, row_index)?.to_string(), - DataType::Utf8 => array_value_to_string(col, row_index)?.to_string(), - DataType::LargeUtf8 => array_value_to_string(col, row_index)?.to_string(), - DataType::Date32 => temporal_array_value_to_string( - col, - col_index, - row_index, - self.date_format.as_deref(), - )? - .to_string(), - DataType::Date64 => temporal_array_value_to_string( - col, - col_index, - row_index, - self.datetime_format.as_deref(), - )? 
- .to_string(), - DataType::Time32(TimeUnit::Second) => temporal_array_value_to_string( - col, - col_index, - row_index, - self.time_format.as_deref(), - )? - .to_string(), - DataType::Time32(TimeUnit::Millisecond) => { - temporal_array_value_to_string( - col, - col_index, - row_index, - self.time_format.as_deref(), - )? - .to_string() - } - DataType::Time64(TimeUnit::Microsecond) => { - temporal_array_value_to_string( - col, - col_index, - row_index, - self.time_format.as_deref(), - )? - .to_string() - } - DataType::Time64(TimeUnit::Nanosecond) => temporal_array_value_to_string( - col, - col_index, - row_index, - self.time_format.as_deref(), - )? - .to_string(), - DataType::Timestamp(_, time_zone) => match time_zone { - Some(_tz) => temporal_array_value_to_string( - col, - col_index, - row_index, - self.timestamp_tz_format.as_deref(), - )? - .to_string(), - None => temporal_array_value_to_string( - col, - col_index, - row_index, - self.timestamp_format.as_deref(), - )? - .to_string(), - }, - DataType::Decimal128(..) => { - array_value_to_string(col, row_index)?.to_string() - } - t => { - // List and Struct arrays not supported by the writer, any - // other type needs to be implemented - return Err(ArrowError::CsvError(format!( - "CSV Writer does not support {t:?} data type" - ))); - } - }; - *item = string; - } - Ok(()) - } - /// Write a vector of record batches to a writable object pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { let num_columns = batch.num_columns(); @@ -257,23 +137,49 @@ impl Writer { self.beginning = false; } - let columns: Vec<_> = batch + let options = FormatOptions::default() + .with_null(&self.null_value) + .with_date_format(self.date_format.as_deref()) + .with_datetime_format(self.datetime_format.as_deref()) + .with_timestamp_format(self.timestamp_format.as_deref()) + .with_timestamp_tz_format(self.timestamp_tz_format.as_deref()) + .with_time_format(self.time_format.as_deref()); + + let converters = batch .columns() .iter() - .map(|array| match array.data_type() { - DataType::Dictionary(_, value_type) => { - arrow_cast::cast(array, value_type) - .expect("cannot cast dictionary to underlying values") - } - _ => array.clone(), + .map(|a| match a.data_type() { + d if d.is_nested() => Err(ArrowError::CsvError(format!( + "Nested type {} is not supported in CSV", + a.data_type() + ))), + DataType::Binary | DataType::LargeBinary => Err(ArrowError::CsvError( + "Binary data cannot be written to CSV".to_string(), + )), + _ => ArrayFormatter::try_new(a.as_ref(), &options), }) - .collect(); - - let mut buffer = vec!["".to_string(); batch.num_columns()]; + .collect::, ArrowError>>()?; + + let mut buffer = String::with_capacity(1024); + let mut byte_record = ByteRecord::with_capacity(1024, converters.len()); + + for row_idx in 0..batch.num_rows() { + byte_record.clear(); + for (col_idx, converter) in converters.iter().enumerate() { + buffer.clear(); + converter.value(row_idx).write(&mut buffer).map_err(|e| { + ArrowError::CsvError(format!( + "Error formatting row {} and column {}: {e}", + row_idx + 1, + col_idx + 1 + )) + })?; + byte_record.push_field(buffer.as_bytes()); + } - for row_index in 0..batch.num_rows() { - self.convert(columns.as_slice(), row_index, &mut buffer)?; - self.writer.write_record(&buffer).map_err(map_csv_error)?; + self.writer + .write_byte_record(&byte_record) + .map_err(map_csv_error)?; } self.writer.flush()?; @@ -384,16 +290,13 @@ impl WriterBuilder { self } - /// Use RFC3339 format for date/time/timestamps by clearing all - /// 
date/time specific formats. - pub fn with_rfc3339(mut self, use_rfc3339: bool) -> Self { - if use_rfc3339 { - self.date_format = None; - self.datetime_format = None; - self.time_format = None; - self.timestamp_format = None; - self.timestamp_tz_format = None; - } + /// Use RFC3339 format for date/time/timestamps + pub fn with_rfc3339(mut self) -> Self { + self.date_format = None; + self.datetime_format = None; + self.time_format = None; + self.timestamp_format = None; + self.timestamp_tz_format = None; self } @@ -423,15 +326,10 @@ mod tests { use super::*; use crate::Reader; + use arrow_array::types::*; use std::io::{Cursor, Read, Seek}; use std::sync::Arc; - fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError { - ArrowError::CastError(format!( - "Cannot cast to {dt} at col index: {col_idx} row index: {row_idx}" - )) - } - #[test] fn test_write_csv() { let schema = Schema::new(vec![ @@ -654,15 +552,8 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo let batches = vec![&batch, &batch]; for batch in batches { - writer - .write(batch) - .map_err(|e| { - dbg!(e.to_string()); - assert!(e.to_string().ends_with( - invalid_cast_error("Date64", 1, 1).to_string().as_str() - )) - }) - .unwrap_err(); + let err = writer.write(batch).unwrap_err().to_string(); + assert_eq!(err, "Csv error: Error formatting row 2 and column 2: Cast error: Failed to convert 1926632005177685347 to temporal for Date64") } drop(writer); } @@ -700,7 +591,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo let mut file = tempfile::tempfile().unwrap(); - let builder = WriterBuilder::new().with_rfc3339(true); + let builder = WriterBuilder::new().with_rfc3339(); let mut writer = builder.build(&mut file); let batches = vec![&batch]; for batch in batches { diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index d2425a3d58a9..028b7d889157 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -105,7 +105,7 @@ use arrow_array::types::*; use arrow_array::*; use arrow_schema::*; -use arrow_cast::display::temporal_array_value_to_string; +use arrow_cast::display::{ArrayFormatter, FormatOptions}; fn primitive_array_to_json(array: &ArrayRef) -> Result, ArrowError> where @@ -137,7 +137,6 @@ fn struct_array_to_jsonmap_array( row_count, struct_col, inner_col_names[j], - j, )? } Ok(inner_objs) @@ -217,26 +216,6 @@ macro_rules! set_column_by_array_type { }; } -macro_rules! 
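// Minimal sketch of the updated builder call: `with_rfc3339` no longer takes a
// boolean and always clears the explicit date/time format strings. The batch
// construction is assumed to happen elsewhere.
use arrow_array::RecordBatch;
use arrow_csv::WriterBuilder;

fn write_rfc3339_csv(batch: &RecordBatch) -> Vec<u8> {
    let mut out = Vec::new();
    let mut writer = WriterBuilder::new().with_rfc3339().build(&mut out);
    writer.write(batch).unwrap();
    drop(writer); // release the borrow of `out` before returning it
    out
}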
set_temporal_column_by_array_type { - ($col_name:ident, $col_idx:ident, $rows:ident, $array:ident, $row_count:ident) => { - $rows - .iter_mut() - .enumerate() - .take($row_count) - .for_each(|(i, row)| { - if !$array.is_null(i) { - row.insert( - $col_name.to_string(), - temporal_array_value_to_string($array, $col_idx, i, None) - .unwrap() - .to_string() - .into(), - ); - } - }); - }; -} - fn set_column_by_primitive_type( rows: &mut [JsonMap], row_count: usize, @@ -264,7 +243,6 @@ fn set_column_for_json_rows( row_count: usize, array: &ArrayRef, col_name: &str, - col_idx: usize, ) -> Result<(), ArrowError> { match array.data_type() { DataType::Int8 => { @@ -315,47 +293,23 @@ fn set_column_for_json_rows( row_count ); } - DataType::Date32 => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); - } - DataType::Date64 => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); - } - DataType::Timestamp(TimeUnit::Second, _) => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); - } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); - } - DataType::Time32(TimeUnit::Second) => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); - } - DataType::Time32(TimeUnit::Millisecond) => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); - } - DataType::Time64(TimeUnit::Microsecond) => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); - } - DataType::Time64(TimeUnit::Nanosecond) => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); - } - DataType::Duration(TimeUnit::Second) => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); - } - DataType::Duration(TimeUnit::Millisecond) => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); - } - DataType::Duration(TimeUnit::Microsecond) => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); - } - DataType::Duration(TimeUnit::Nanosecond) => { - set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); + DataType::Date32 + | DataType::Date64 + | DataType::Timestamp(_, _) + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Duration(_) => { + let options = FormatOptions::default(); + let formatter = ArrayFormatter::try_new(array.as_ref(), &options)?; + let data = array.data(); + rows.iter_mut().enumerate().for_each(|(idx, row)| { + if data.is_valid(idx) { + row.insert( + col_name.to_string(), + formatter.value(idx).to_string().into(), + ); + } + }); } DataType::Struct(_) => { let inner_objs = @@ -399,7 +353,7 @@ fn set_column_for_json_rows( let slice = array.slice(0, row_count); let hydrated = arrow_cast::cast::cast(&slice, value_type) .expect("cannot cast dictionary to underlying values"); - set_column_for_json_rows(rows, row_count, &hydrated, col_name, col_idx)?; + set_column_for_json_rows(rows, row_count, &hydrated, col_name)?; } DataType::Map(_, _) => { let maparr = as_map_array(array); @@ -465,7 +419,7 @@ pub fn record_batches_to_json_rows( let row_count = batch.num_rows(); for (j, col) 
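// Condensed illustration of the new temporal path in the JSON writer: one
// ArrayFormatter per column, reused for every row, with nulls skipped via the
// validity bitmap. serde_json is assumed, as in the surrounding writer; the
// column values are illustrative.
use arrow_array::{Array, Date64Array};
use arrow_cast::display::{ArrayFormatter, FormatOptions};
use serde_json::Value;

fn date64_to_json(array: &Date64Array) -> Vec<Option<Value>> {
    let formatter = ArrayFormatter::try_new(array, &FormatOptions::default()).unwrap();
    (0..array.len())
        .map(|i| {
            array
                .is_valid(i)
                .then(|| Value::String(formatter.value(i).to_string()))
        })
        .collect()
}

fn main() {
    // 1542129070011 ms since the epoch formats as "2018-11-13T17:11:10.011".
    let array = Date64Array::from(vec![Some(1542129070011), None]);
    let json = date64_to_json(&array);
    assert_eq!(json[0], Some(Value::String("2018-11-13T17:11:10.011".into())));
    assert_eq!(json[1], None);
}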
in batch.columns().iter().enumerate() { let col_name = schema.field(j).name(); - set_column_for_json_rows(&mut rows[base..], row_count, col, col_name, j)? + set_column_for_json_rows(&mut rows[base..], row_count, col, col_name)? } base += row_count; } @@ -937,7 +891,7 @@ mod tests { assert_json_eq( &buf, - r#"{"date32":"2018-11-13","date64":"2018-11-13","name":"a"} + r#"{"date32":"2018-11-13","date64":"2018-11-13T17:11:10.011","name":"a"} {"name":"b"} "#, ); diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 78ad0258d512..9476535fa970 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -290,7 +290,7 @@ pub enum IntervalUnit { } // Sparse or Dense union layouts -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum UnionMode { Sparse, diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs index 9027a1cdc448..4defa71a779c 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow/src/util/pretty.rs @@ -19,6 +19,7 @@ //! available unless `feature = "prettyprint"` is enabled. use crate::{array::ArrayRef, record_batch::RecordBatch}; +use arrow_cast::display::{ArrayFormatter, FormatOptions}; use comfy_table::{Cell, Table}; use std::fmt::Display; @@ -68,12 +69,19 @@ fn create_table(results: &[RecordBatch]) -> Result
<Table>
{ } table.set_header(header); + let options = FormatOptions::default().with_display_error(true); + for batch in results { + let formatters = batch + .columns() + .iter() + .map(|c| ArrayFormatter::try_new(c.as_ref(), &options)) + .collect::>>()?; + for row in 0..batch.num_rows() { let mut cells = Vec::new(); - for col in 0..batch.num_columns() { - let column = batch.column(col); - cells.push(Cell::new(array_value_to_string(column, row)?)); + for formatter in &formatters { + cells.push(Cell::new(formatter.value(row))); } table.add_row(cells); } @@ -123,6 +131,8 @@ mod tests { use std::fmt::Write; use std::sync::Arc; + use arrow_array::builder::PrimitiveBuilder; + use arrow_array::types::{ArrowTimestampType, TimestampSecondType}; use half::f16; #[test] @@ -366,42 +376,33 @@ mod tests { let expected = $EXPECTED_RESULT; let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n\n{:#?}\n\n", actual); + assert_eq!(expected, actual, "Actual result:\n\n{actual:#?}\n\n"); }; } - /// Generate an array with type $ARRAYTYPE with a numeric value of - /// $VALUE, and compare $EXPECTED_RESULT to the output of - /// formatting that array with `pretty_format_batches` - macro_rules! check_datetime_with_timezone { - ($ARRAYTYPE:ident, $VALUE:expr, $TZ_STRING:expr, $EXPECTED_RESULT:expr) => { - let mut builder = $ARRAYTYPE::builder(10); - builder.append_value($VALUE); - builder.append_null(); - let array = builder.finish(); - let array = array.with_timezone($TZ_STRING); - - let schema = Arc::new(Schema::new(vec![Field::new( - "f", - array.data_type().clone(), - true, - )])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap(); - - let table = pretty_format_batches(&[batch]) - .expect("formatting batches") - .to_string(); - - let expected = $EXPECTED_RESULT; - let actual: Vec<&str> = table.lines().collect(); + fn timestamp_batch( + timezone: &str, + value: T::Native, + ) -> RecordBatch { + let mut builder = PrimitiveBuilder::::with_capacity(10); + builder.append_value(value); + builder.append_null(); + let array = builder.finish(); + let array = array.with_timezone(timezone); - assert_eq!(expected, actual, "Actual result:\n\n{:#?}\n\n", actual); - }; + let schema = Arc::new(Schema::new(vec![Field::new( + "f", + array.data_type().clone(), + true, + )])); + RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap() } #[test] #[cfg(features = "chrono-tz")] fn test_pretty_format_timestamp_second_with_utc_timezone() { + let batch = timestamp_batch::("UTC", 11111111); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ "+---------------------------+", "| f |", @@ -410,17 +411,15 @@ mod tests { "| |", "+---------------------------+", ]; - check_datetime_with_timezone!( - TimestampSecondArray, - 11111111, - "UTC".to_string(), - expected - ); + let actual: Vec<&str> = table.lines().collect(); + assert_eq!(expected, actual, "Actual result:\n\n{actual:#?}\n\n"); } #[test] #[cfg(features = "chrono-tz")] fn test_pretty_format_timestamp_second_with_non_utc_timezone() { + let batch = timestamp_batch::("Asia/Taipei", 11111111); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ "+---------------------------+", "| f |", @@ -429,16 +428,15 @@ mod tests { "| |", "+---------------------------+", ]; - check_datetime_with_timezone!( - TimestampSecondArray, - 11111111, - "Asia/Taipei".to_string(), - expected - ); + let actual: Vec<&str> = table.lines().collect(); + assert_eq!(expected, actual, 
"Actual result:\n\n{actual:#?}\n\n"); } #[test] fn test_pretty_format_timestamp_second_with_fixed_offset_timezone() { + let batch = timestamp_batch::("+08:00", 11111111); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); + let expected = vec![ "+---------------------------+", "| f |", @@ -447,48 +445,24 @@ mod tests { "| |", "+---------------------------+", ]; - check_datetime_with_timezone!( - TimestampSecondArray, - 11111111, - "+08:00".to_string(), - expected - ); + let actual: Vec<&str> = table.lines().collect(); + assert_eq!(expected, actual, "Actual result:\n\n{actual:#?}\n\n"); } #[test] + #[cfg(not(feature = "chrono-tz"))] fn test_pretty_format_timestamp_second_with_incorrect_fixed_offset_timezone() { - let expected = vec![ - "+-------------------------------------------------+", - "| f |", - "+-------------------------------------------------+", - "| 1970-05-09T14:25:11 (Unknown Time Zone '08:00') |", - "| |", - "+-------------------------------------------------+", - ]; - check_datetime_with_timezone!( - TimestampSecondArray, - 11111111, - "08:00".to_string(), - expected - ); + let batch = timestamp_batch::("08:00", 11111111); + let err = pretty_format_batches(&[batch]).err().unwrap().to_string(); + assert_eq!(err, "Parser error: Invalid timezone \"08:00\": only offset based timezones supported without chrono-tz feature"); } #[test] + #[cfg(not(feature = "chrono-tz"))] fn test_pretty_format_timestamp_second_with_unknown_timezone() { - let expected = vec![ - "+---------------------------------------------------+", - "| f |", - "+---------------------------------------------------+", - "| 1970-05-09T14:25:11 (Unknown Time Zone 'Unknown') |", - "| |", - "+---------------------------------------------------+", - ]; - check_datetime_with_timezone!( - TimestampSecondArray, - 11111111, - "Unknown".to_string(), - expected - ); + let batch = timestamp_batch::("unknown", 11111111); + let err = pretty_format_batches(&[batch]).err().unwrap().to_string(); + assert_eq!(err, "Parser error: Invalid timezone \"unknown\": only offset based timezones supported without chrono-tz feature"); } #[test] @@ -559,12 +533,12 @@ mod tests { #[test] fn test_pretty_format_date_64() { let expected = vec![ - "+------------+", - "| f |", - "+------------+", - "| 2005-03-18 |", - "| |", - "+------------+", + "+---------------------+", + "| f |", + "+---------------------+", + "| 2005-03-18T01:58:20 |", + "| |", + "+---------------------+", ]; check_datetime!(Date64Array, 1111111100000, expected); } @@ -751,13 +725,13 @@ mod tests { let table = pretty_format_batches(&[batch])?.to_string(); let expected = vec![ - r#"+-------------------------------------+----+"#, - r#"| c1 | c2 |"#, - r#"+-------------------------------------+----+"#, - r#"| {"c11": 1, "c12": {"c121": "e"}} | a |"#, - r#"| {"c11": null, "c12": {"c121": "f"}} | b |"#, - r#"| {"c11": 5, "c12": {"c121": "g"}} | c |"#, - r#"+-------------------------------------+----+"#, + "+--------------------------+----+", + "| c1 | c2 |", + "+--------------------------+----+", + "| {c11: 1, c12: {c121: e}} | a |", + "| {c11: , c12: {c121: f}} | b |", + "| {c11: 5, c12: {c121: g}} | c |", + "+--------------------------+----+", ]; let actual: Vec<&str> = table.lines().collect(); diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs index 5a7c7e962a11..dbb399948302 100644 --- a/arrow/tests/csv.rs +++ b/arrow/tests/csv.rs @@ -93,7 +93,7 @@ fn test_export_csv_timestamps_using_rfc3339() { let mut sw = Vec::new(); let mut writer = 
arrow_csv::WriterBuilder::new() - .with_rfc3339(true) + .with_rfc3339() .build(&mut sw); let batches = vec![&batch]; for batch in batches { diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 15197c02e586..f427ce3e19e4 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -1360,8 +1360,8 @@ mod tests { r#""#.to_string(), r#""#.to_string(), r#"[]"#.to_string(), - r#"[{"list": [3, ], "integers": null}]"#.to_string(), - r#"[, {"list": null, "integers": 5}]"#.to_string(), + r#"[{list: [3, ], integers: }]"#.to_string(), + r#"[, {list: , integers: 5}]"#.to_string(), r#"[]"#.to_string(), ]; diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 9235706d5c38..6260c2ed4d0d 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2314,17 +2314,17 @@ mod tests { // Verify data is as expected let expected = r#" - +-------------------------------------------------------------------------------------------------------------------------------------+ - | struct_b | - +-------------------------------------------------------------------------------------------------------------------------------------+ - | {"list": [{"leaf_a": 1, "leaf_b": 1}]} | - | {"list": null} | - | {"list": [{"leaf_a": 2, "leaf_b": null}, {"leaf_a": 3, "leaf_b": 2}]} | - | {"list": null} | - | {"list": [{"leaf_a": 4, "leaf_b": null}, {"leaf_a": 5, "leaf_b": null}]} | - | {"list": [{"leaf_a": 6, "leaf_b": null}, {"leaf_a": 7, "leaf_b": null}, {"leaf_a": 8, "leaf_b": null}, {"leaf_a": 9, "leaf_b": 1}]} | - | {"list": [{"leaf_a": 10, "leaf_b": null}]} | - +-------------------------------------------------------------------------------------------------------------------------------------+ + +-------------------------------------------------------------------------------------------------------+ + | struct_b | + +-------------------------------------------------------------------------------------------------------+ + | {list: [{leaf_a: 1, leaf_b: 1}]} | + | {list: } | + | {list: [{leaf_a: 2, leaf_b: }, {leaf_a: 3, leaf_b: 2}]} | + | {list: } | + | {list: [{leaf_a: 4, leaf_b: }, {leaf_a: 5, leaf_b: }]} | + | {list: [{leaf_a: 6, leaf_b: }, {leaf_a: 7, leaf_b: }, {leaf_a: 8, leaf_b: }, {leaf_a: 9, leaf_b: 1}]} | + | {list: [{leaf_a: 10, leaf_b: }]} | + +-------------------------------------------------------------------------------------------------------+ "#.trim().split('\n').map(|x| x.trim()).collect::>().join("\n"); let actual = pretty_format_batches(batches).unwrap().to_string(); diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 71f95e07a756..3e0d865c0610 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -61,13 +61,13 @@ //! "+----------+-------------+-----------+", //! "| bool_col | tinyint_col | float_col |", //! "+----------+-------------+-----------+", -//! "| true | 0 | 0 |", +//! "| true | 0 | 0.0 |", //! "| false | 1 | 1.1 |", -//! "| true | 0 | 0 |", +//! "| true | 0 | 0.0 |", //! "| false | 1 | 1.1 |", -//! "| true | 0 | 0 |", +//! "| true | 0 | 0.0 |", //! "| false | 1 | 1.1 |", -//! "| true | 0 | 0 |", +//! "| true | 0 | 0.0 |", //! "| false | 1 | 1.1 |", //! "+----------+-------------+-----------+", //! 
], From 0b8c003d03deb3590fcce560effbbc1534be826a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 9 Feb 2023 13:23:45 +0000 Subject: [PATCH 0587/1411] Use ArrayFormatter in Cast Kernel (#3668) * Use ArrayFormatter in cast kernel * Fixes * Further fixes * Update arrow-cast/src/cast.rs Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- arrow-cast/src/cast.rs | 377 +++++--------------------------------- arrow/tests/array_cast.rs | 12 +- 2 files changed, 53 insertions(+), 336 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 69e42a5485e6..3137e685b212 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -35,15 +35,14 @@ //! assert_eq!(7.0, c.value(2)); //! ``` -use chrono::{DateTime, NaiveDateTime, NaiveTime, Timelike}; +use chrono::{NaiveTime, Timelike}; use std::cmp::Ordering; use std::sync::Arc; -use crate::display::{array_value_to_string, lexical_to_string}; +use crate::display::{array_value_to_string, ArrayFormatter, FormatOptions}; use crate::parse::string_to_timestamp_nanos; use arrow_array::{ - builder::*, cast::*, iterator::ArrayIter, temporal_conversions::*, timezone::Tz, - types::*, *, + builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *, }; use arrow_buffer::{i256, ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::ArrayData; @@ -155,13 +154,12 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (_, Boolean) => DataType::is_numeric(from_type) || from_type == &Utf8 || from_type == &LargeUtf8, (Boolean, _) => DataType::is_numeric(to_type) || to_type == &Utf8 || to_type == &LargeUtf8, - (Utf8, LargeUtf8) => true, - (LargeUtf8, Utf8) => true, (Binary, LargeBinary | Utf8 | LargeUtf8) => true, (LargeBinary, Binary | Utf8 | LargeUtf8) => true, (Utf8, Binary | LargeBinary + | LargeUtf8 | Date32 | Date64 | Time32(TimeUnit::Second) @@ -170,10 +168,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time64(TimeUnit::Nanosecond) | Timestamp(TimeUnit::Nanosecond, None) ) => true, - (Utf8, _) => DataType::is_numeric(to_type) && to_type != &Float16, + (Utf8, _) => to_type.is_numeric() && to_type != &Float16, (LargeUtf8, Binary | LargeBinary + | Utf8 | Date32 | Date64 | Time32(TimeUnit::Second) @@ -182,11 +181,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time64(TimeUnit::Nanosecond) | Timestamp(TimeUnit::Nanosecond, None) ) => true, - (LargeUtf8, _) => DataType::is_numeric(to_type) && to_type != &Float16, - (Timestamp(_, _), Utf8) | (Timestamp(_, _), LargeUtf8) => true, - (Date32, Utf8) | (Date32, LargeUtf8) => true, - (Date64, Utf8) | (Date64, LargeUtf8) => true, - (_, Utf8 | LargeUtf8) => DataType::is_numeric(from_type) && from_type != &Float16, + (LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, + (_, Utf8 | LargeUtf8) => from_type.is_primitive(), // start numeric casts ( @@ -1115,7 +1111,6 @@ pub fn cast_with_options( ))), }, (Utf8, _) => match to_type { - LargeUtf8 => cast_byte_container::(array), UInt8 => cast_string_to_numeric::(array, cast_options), UInt16 => cast_string_to_numeric::(array, cast_options), UInt32 => cast_string_to_numeric::(array, cast_options), @@ -1131,8 +1126,9 @@ pub fn cast_with_options( Binary => Ok(Arc::new(BinaryArray::from(as_string_array(array).clone()))), LargeBinary => { let binary = BinaryArray::from(as_string_array(array).clone()); - cast_byte_container::(&binary) + cast_byte_container::(&binary) } + LargeUtf8 
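// Sketch of the new generic primitive-to-string path in the cast kernel: any
// primitive type, including the temporal types, now reaches Utf8 / LargeUtf8
// through a single ArrayFormatter-backed helper. Values are illustrative.
use arrow_array::{Array, Date32Array};
use arrow_array::cast::as_string_array;
use arrow_cast::cast::cast;
use arrow_schema::DataType;

fn main() {
    // Date32 stores days since the UNIX epoch; day 0 is 1970-01-01.
    let a = Date32Array::from(vec![Some(0), None]);
    let b = cast(&a, &DataType::Utf8).unwrap();
    let s = as_string_array(b.as_ref());
    assert_eq!(s.value(0), "1970-01-01");
    assert!(s.is_null(1));
}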
=> cast_byte_container::(array), Time32(TimeUnit::Second) => { cast_string_to_time32second::(array, cast_options) } @@ -1152,75 +1148,6 @@ pub fn cast_with_options( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, - (_, Utf8) => match from_type { - LargeUtf8 => cast_byte_container::(array), - UInt8 => cast_numeric_to_string::(array), - UInt16 => cast_numeric_to_string::(array), - UInt32 => cast_numeric_to_string::(array), - UInt64 => cast_numeric_to_string::(array), - Int8 => cast_numeric_to_string::(array), - Int16 => cast_numeric_to_string::(array), - Int32 => cast_numeric_to_string::(array), - Int64 => cast_numeric_to_string::(array), - Float32 => cast_numeric_to_string::(array), - Float64 => cast_numeric_to_string::(array), - Timestamp(TimeUnit::Nanosecond, tz) => cast_timestamp_to_string::< - TimestampNanosecondType, - i32, - >(array, tz.as_ref()), - Timestamp(TimeUnit::Microsecond, tz) => cast_timestamp_to_string::< - TimestampMicrosecondType, - i32, - >(array, tz.as_ref()), - Timestamp(TimeUnit::Millisecond, tz) => cast_timestamp_to_string::< - TimestampMillisecondType, - i32, - >(array, tz.as_ref()), - Timestamp(TimeUnit::Second, tz) => { - cast_timestamp_to_string::(array, tz.as_ref()) - } - Date32 => cast_date32_to_string::(array), - Date64 => cast_date64_to_string::(array), - Binary => cast_binary_to_string::(array, cast_options), - LargeBinary => cast_binary_to_generic_string::(array, cast_options), - _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", - ))), - }, - (_, LargeUtf8) => match from_type { - UInt8 => cast_numeric_to_string::(array), - UInt16 => cast_numeric_to_string::(array), - UInt32 => cast_numeric_to_string::(array), - UInt64 => cast_numeric_to_string::(array), - Int8 => cast_numeric_to_string::(array), - Int16 => cast_numeric_to_string::(array), - Int32 => cast_numeric_to_string::(array), - Int64 => cast_numeric_to_string::(array), - Float32 => cast_numeric_to_string::(array), - Float64 => cast_numeric_to_string::(array), - Timestamp(TimeUnit::Nanosecond, tz) => cast_timestamp_to_string::< - TimestampNanosecondType, - i64, - >(array, tz.as_ref()), - Timestamp(TimeUnit::Microsecond, tz) => cast_timestamp_to_string::< - TimestampMicrosecondType, - i64, - >(array, tz.as_ref()), - Timestamp(TimeUnit::Millisecond, tz) => cast_timestamp_to_string::< - TimestampMillisecondType, - i64, - >(array, tz.as_ref()), - Timestamp(TimeUnit::Second, tz) => { - cast_timestamp_to_string::(array, tz.as_ref()) - } - Date32 => cast_date32_to_string::(array), - Date64 => cast_date64_to_string::(array), - Binary => cast_binary_to_generic_string::(array, cast_options), - LargeBinary => cast_binary_to_string::(array, cast_options), - _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", - ))), - }, (LargeUtf8, _) => match to_type { UInt8 => cast_string_to_numeric::(array, cast_options), UInt16 => cast_string_to_numeric::(array, cast_options), @@ -1234,10 +1161,11 @@ pub fn cast_with_options( Float64 => cast_string_to_numeric::(array, cast_options), Date32 => cast_string_to_date32::(array, cast_options), Date64 => cast_string_to_date64::(array, cast_options), + Utf8 => cast_byte_container::(array), Binary => { let large_binary = LargeBinaryArray::from(as_largestring_array(array).clone()); - cast_byte_container::(&large_binary) + cast_byte_container::(&large_binary) } LargeBinary => Ok(Arc::new(LargeBinaryArray::from( as_largestring_array(array).clone(), @@ -1262,19 +1190,31 @@ 
pub fn cast_with_options( ))), }, (Binary, _) => match to_type { + Utf8 => cast_binary_to_string::(array, cast_options), + LargeUtf8 => { + let array = cast_binary_to_string::(array, cast_options)?; + cast_byte_container::(array.as_ref()) + } LargeBinary => { - cast_byte_container::(array) + cast_byte_container::(array) } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, (LargeBinary, _) => match to_type { - Binary => cast_byte_container::(array), + Utf8 => { + let array = cast_binary_to_string::(array, cast_options)?; + cast_byte_container::(array.as_ref()) + } + LargeUtf8 => cast_binary_to_string::(array, cast_options), + Binary => cast_byte_container::(array), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, + (from_type, LargeUtf8) if from_type.is_primitive() => value_to_string::(array), + (from_type, Utf8) if from_type.is_primitive() => value_to_string::(array), // start numeric casts (UInt8, UInt16) => { cast_numeric_arrays::(array, cast_options) @@ -2171,172 +2111,26 @@ where from.unary_opt::<_, R>(num::cast::cast::) } -fn as_time_with_string_op< - A: ArrayAccessor, - OffsetSize, - T: ArrowTemporalType, - F, ->( - iter: ArrayIter, - mut builder: GenericStringBuilder, - op: F, -) -> ArrayRef -where - OffsetSize: OffsetSizeTrait, - F: Fn(NaiveDateTime) -> String, - i64: From, -{ - iter.into_iter().for_each(|value| { - if let Some(value) = value { - match as_datetime::(>::from(value)) { - Some(dt) => builder.append_value(op(dt)), - None => builder.append_null(), +fn value_to_string( + array: &dyn Array, +) -> Result { + let mut builder = GenericStringBuilder::::new(); + let options = FormatOptions::default(); + let formatter = ArrayFormatter::try_new(array, &options)?; + let data = array.data(); + for i in 0..data.len() { + match data.is_null(i) { + true => builder.append_null(), + false => { + formatter.value(i).write(&mut builder)?; + // tell the builder the row is finished + builder.append_value(""); } - } else { - builder.append_null(); - } - }); - - Arc::new(builder.finish()) -} - -fn extract_component_from_datetime_array< - A: ArrayAccessor, - OffsetSize, - T: ArrowTemporalType, - F, ->( - iter: ArrayIter, - mut builder: GenericStringBuilder, - tz: &str, - op: F, -) -> Result -where - OffsetSize: OffsetSizeTrait, - F: Fn(DateTime) -> String, - i64: From, -{ - let tz: Tz = tz.parse()?; - for value in iter { - match value { - Some(value) => match as_datetime_with_timezone::(value.into(), tz) { - Some(time) => builder.append_value(op(time)), - _ => { - return Err(ArrowError::ComputeError( - "Unable to read value as datetime".to_string(), - )); - } - }, - None => builder.append_null(), } } Ok(Arc::new(builder.finish())) } -/// Cast timestamp types to Utf8/LargeUtf8 -fn cast_timestamp_to_string( - array: &dyn Array, - tz: Option<&String>, -) -> Result -where - T: ArrowTemporalType + ArrowPrimitiveType, - i64: From<::Native>, - OffsetSize: OffsetSizeTrait, -{ - let array = array.as_any().downcast_ref::>().unwrap(); - - let builder = GenericStringBuilder::::new(); - - if let Some(tz) = tz { - // The macro calls `as_datetime` on timestamp values of the array. - // After applying timezone offset on the datatime, calling `to_string` to get - // the strings. - let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<_, OffsetSize, T, _>( - iter, - builder, - tz, - |t| t.to_string(), - ) - } else { - // No timezone available. 
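// Illustration of the reorganised binary arms above: Binary casts to Utf8
// directly (validating UTF-8), and reaches LargeUtf8 by first producing Utf8
// and then widening offsets with the byte-container helper. The input values
// are illustrative.
use arrow_array::{Array, BinaryArray};
use arrow_array::cast::{as_largestring_array, as_string_array};
use arrow_cast::cast::cast;
use arrow_schema::DataType;

fn main() {
    let a = BinaryArray::from(vec![Some("hello".as_bytes()), None]);

    let utf8 = cast(&a, &DataType::Utf8).unwrap();
    assert_eq!(as_string_array(utf8.as_ref()).value(0), "hello");

    let large = cast(&a, &DataType::LargeUtf8).unwrap();
    assert_eq!(as_largestring_array(large.as_ref()).value(0), "hello");
    assert!(large.is_null(1));
}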
Calling `to_string` on the datatime value simply. - let iter = ArrayIter::new(array); - Ok(as_time_with_string_op::<_, OffsetSize, T, _>( - iter, - builder, - |t| t.to_string(), - )) - } -} - -/// Cast date32 types to Utf8/LargeUtf8 -fn cast_date32_to_string( - array: &dyn Array, -) -> Result { - let array = array.as_any().downcast_ref::().unwrap(); - - Ok(Arc::new( - (0..array.len()) - .map(|ix| { - if array.is_null(ix) { - None - } else { - array.value_as_date(ix).map(|v| v.to_string()) - } - }) - .collect::>(), - )) -} - -/// Cast date64 types to Utf8/LargeUtf8 -fn cast_date64_to_string( - array: &dyn Array, -) -> Result { - let array = array.as_any().downcast_ref::().unwrap(); - - Ok(Arc::new( - (0..array.len()) - .map(|ix| { - if array.is_null(ix) { - None - } else { - array.value_as_datetime(ix).map(|v| v.to_string()) - } - }) - .collect::>(), - )) -} - -/// Cast numeric types to Utf8 -fn cast_numeric_to_string( - array: &dyn Array, -) -> Result -where - FROM: ArrowPrimitiveType, - FROM::Native: lexical_core::ToLexical, - OffsetSize: OffsetSizeTrait, -{ - Ok(Arc::new(numeric_to_string_cast::( - array - .as_any() - .downcast_ref::>() - .unwrap(), - ))) -} - -fn numeric_to_string_cast( - from: &PrimitiveArray, -) -> GenericStringArray -where - T: ArrowPrimitiveType + ArrowPrimitiveType, - T::Native: lexical_core::ToLexical, - OffsetSize: OffsetSizeTrait, -{ - from.iter() - .map(|maybe_value| maybe_value.map(lexical_to_string)) - .collect() -} - /// Cast numeric types to Utf8 fn cast_string_to_numeric( from: &dyn Array, @@ -3399,13 +3193,10 @@ fn cast_list_inner( /// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same /// offset size so re-encoding offset is unnecessary. -fn cast_binary_to_string( +fn cast_binary_to_string( array: &dyn Array, cast_options: &CastOptions, -) -> Result -where - O: OffsetSizeTrait + ToPrimitive, -{ +) -> Result { let array = array .as_any() .downcast_ref::>>() @@ -3457,86 +3248,12 @@ where } } -/// Helper function to cast from `GenericBinaryArray` to `GenericStringArray`. This function performs -/// UTF8 validation during casting. For invalid UTF8 value, it could be Null or returning `Err` depending -/// `CastOptions`. 
-fn cast_binary_to_generic_string( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result -where - I: OffsetSizeTrait + ToPrimitive, - O: OffsetSizeTrait + NumCast, -{ - let array = array - .as_any() - .downcast_ref::>>() - .unwrap(); - - if !cast_options.safe { - let offsets = array.value_offsets(); - let values = array.value_data(); - - // We only need to validate that all values are valid UTF-8 - let validated = std::str::from_utf8(values) - .map_err(|_| ArrowError::CastError("Invalid UTF-8 sequence".to_string()))?; - - let mut offset_builder = BufferBuilder::::new(offsets.len()); - // Checks if the offset is a valid char boundary and re-encode the offset - offsets - .iter() - .try_for_each::<_, Result<_, ArrowError>>(|offset| { - if !validated.is_char_boundary(offset.as_usize()) { - return Err(ArrowError::CastError( - "Invalid UTF-8 sequence".to_string(), - )); - } - - let offset = ::from(*offset).ok_or_else(|| { - ArrowError::ComputeError(format!( - "{}Binary array too large to cast to {}String array", - I::PREFIX, - O::PREFIX - )) - })?; - offset_builder.append(offset); - Ok(()) - })?; - - let offset_buffer = offset_builder.finish(); - - let builder = ArrayData::builder(GenericStringArray::::DATA_TYPE) - .len(array.len()) - .add_buffer(offset_buffer) - .add_buffer(array.data().buffers()[1].clone()) - .null_count(array.null_count()) - .null_bit_buffer(array.data().null_buffer().cloned()); - - // SAFETY: - // Validated UTF-8 above - Ok(Arc::new(GenericStringArray::::from(unsafe { - builder.build_unchecked() - }))) - } else { - Ok(Arc::new( - array - .iter() - .map(|maybe_value| { - maybe_value.and_then(|value| std::str::from_utf8(value).ok()) - }) - .collect::>>(), - )) - } -} - /// Helper function to cast from one `ByteArrayType` to another and vice versa. /// If the target one (e.g., `LargeUtf8`) is too large for the source array it will return an Error. 
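// Quick sketch of the round-trip the helper documented above covers: a
// Utf8 <-> LargeUtf8 cast only re-encodes offsets, the value bytes are reused.
use arrow_array::StringArray;
use arrow_array::cast::as_string_array;
use arrow_cast::cast::cast;
use arrow_schema::DataType;

fn main() {
    let a = StringArray::from(vec!["a", "bc"]);
    let large = cast(&a, &DataType::LargeUtf8).unwrap();
    let back = cast(large.as_ref(), &DataType::Utf8).unwrap();
    assert_eq!(as_string_array(back.as_ref()).value(1), "bc");
}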
-fn cast_byte_container( - array: &dyn Array, -) -> Result +fn cast_byte_container(array: &dyn Array) -> Result where - FROM: ByteArrayType, - TO: ByteArrayType, + FROM: ByteArrayType, + TO: ByteArrayType, FROM::Offset: OffsetSizeTrait + ToPrimitive, TO::Offset: OffsetSizeTrait + NumCast, { @@ -5521,8 +5238,8 @@ mod tests { let b = cast(&array, &DataType::Utf8).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(&DataType::Utf8, c.data_type()); - assert_eq!("1997-05-19 00:00:00", c.value(0)); - assert_eq!("2018-12-25 00:00:00", c.value(1)); + assert_eq!("1997-05-19T00:00:00", c.value(0)); + assert_eq!("2018-12-25T00:00:00", c.value(1)); } #[test] diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index ae73b1b4200b..7eeb00a8290a 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -56,8 +56,8 @@ fn test_cast_timestamp_to_string() { let b = cast(&array, &DataType::Utf8).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(&DataType::Utf8, c.data_type()); - assert_eq!("1997-05-19 00:00:00.005 +00:00", c.value(0)); - assert_eq!("2018-12-25 00:00:00.001 +00:00", c.value(1)); + assert_eq!("1997-05-19T00:00:00.005Z", c.value(0)); + assert_eq!("2018-12-25T00:00:00.001Z", c.value(1)); assert!(c.is_null(2)); } @@ -442,9 +442,9 @@ fn test_timestamp_cast_utf8() { let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); let expected = StringArray::from(vec![ - Some("1970-01-01 10:30:00"), + Some("1970-01-01T10:30:00"), None, - Some("1970-01-01 23:58:59"), + Some("1970-01-01T23:58:59"), ]); assert_eq!( @@ -458,9 +458,9 @@ fn test_timestamp_cast_utf8() { let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); let expected = StringArray::from(vec![ - Some("1970-01-01 20:30:00 +10:00"), + Some("1970-01-01T20:30:00+10:00"), None, - Some("1970-01-02 09:58:59 +10:00"), + Some("1970-01-02T09:58:59+10:00"), ]); assert_eq!( From 6ec7226cdcafa9326d7f11a3e421e1e1038ca677 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 9 Feb 2023 14:24:32 +0100 Subject: [PATCH 0588/1411] Minor: Add some examples to Date*Array and Time*Array (#3678) * Minor: Add some examples to Date*Array and Time*Array * Apply suggestions from code review Co-authored-by: Liang-Chi Hsieh * Update for NaiveDate --------- Co-authored-by: Liang-Chi Hsieh --- arrow-array/src/array/primitive_array.rs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 6902f13646a2..aeece612ded2 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -168,22 +168,42 @@ pub type TimestampNanosecondArray = PrimitiveArray; // TODO: give examples for the below types -/// A primitive array where each element is of 32-bit date type. +/// A primitive array where each element is of 32-bit value +/// representing the elapsed time since UNIX epoch in days." +/// +/// This type is similar to the [`chrono::NaiveDate`] type and can hold +/// values such as `2018-11-13` pub type Date32Array = PrimitiveArray; -/// A primitive array where each element is of 64-bit date type. +/// A primitive array where each element is a 64-bit value +/// representing the elapsed time since the UNIX epoch in milliseconds. 
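// A few concrete values for the array types documented above; the literals and
// their natural renderings are illustrative.
use arrow_array::{Date32Array, Date64Array, Time32SecondArray};

fn main() {
    // Days since 1970-01-01: 17848 corresponds to 2018-11-13.
    let dates = Date32Array::from(vec![17848]);
    // Milliseconds since the epoch: 1542129070011 is 2018-11-13T17:11:10.011.
    let datetimes = Date64Array::from(vec![1542129070011]);
    // Seconds since midnight: 120 is 00:02:00.
    let times = Time32SecondArray::from(vec![120]);

    assert_eq!(dates.value(0), 17848);
    assert_eq!(datetimes.value(0), 1542129070011);
    assert_eq!(times.value(0), 120);
}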
+/// +/// This type is similar to the [`chrono::NaiveDateTime`] type and can hold +/// values such as `2018-11-13T17:11:10.011` pub type Date64Array = PrimitiveArray; /// An array where each element is of 32-bit type representing time elapsed in seconds /// since midnight. +/// +/// This type is similar to the [`chrono::NaiveTime`] type and can +/// hold values such as `00:02:00` pub type Time32SecondArray = PrimitiveArray; /// An array where each element is of 32-bit type representing time elapsed in milliseconds /// since midnight. +/// +/// This type is similar to the [`chrono::NaiveTime`] type and can +/// hold values such as `00:02:00.123` pub type Time32MillisecondArray = PrimitiveArray; /// An array where each element is of 64-bit type representing time elapsed in microseconds /// since midnight. +/// +/// This type is similar to the [`chrono::NaiveTime`] type and can +/// hold values such as `00:02:00.123456` pub type Time64MicrosecondArray = PrimitiveArray; /// An array where each element is of 64-bit type representing time elapsed in nanoseconds /// since midnight. +/// +/// This type is similar to the [`chrono::NaiveTime`] type and can +/// hold values such as `00:02:00.123456789` pub type Time64NanosecondArray = PrimitiveArray; /// An array where each element is a “calendar” interval in months. From 7cd29d7353369589c18377de4300c44f91a54462 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 9 Feb 2023 13:24:52 +0000 Subject: [PATCH 0589/1411] Add ArrayData::new_null and DataType::primitive_width (#3676) * Add ArrayData::new_null and DataType::primitive_width * Add FixedSizeBinary test * Update arrow-data/src/data.rs Co-authored-by: askoa <112126368+askoa@users.noreply.github.com> * Only generate nulls for first UnionArray child --------- Co-authored-by: askoa <112126368+askoa@users.noreply.github.com> --- arrow-array/src/array/mod.rs | 279 ++++++++------------------- arrow-data/src/data.rs | 354 +++++++++++++++-------------------- arrow-schema/src/datatype.rs | 33 ++++ arrow/src/ffi.rs | 45 ++--- 4 files changed, 277 insertions(+), 434 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index e953781e5c98..b293d797e46e 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -20,7 +20,6 @@ mod binary_array; use crate::types::*; -use arrow_buffer::{Buffer, MutableBuffer, ToByteSlice}; use arrow_data::ArrayData; use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use std::any::Any; @@ -634,207 +633,7 @@ pub fn new_empty_array(data_type: &DataType) -> ArrayRef { /// assert_eq!(&array, &null_array); /// ``` pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { - // context: https://github.com/apache/arrow/pull/9469#discussion_r574761687 - match data_type { - DataType::Null => Arc::new(NullArray::new(length)), - DataType::Boolean => { - let null_buf: Buffer = MutableBuffer::new_null(length).into(); - make_array(unsafe { - ArrayData::new_unchecked( - data_type.clone(), - length, - Some(length), - Some(null_buf.clone()), - 0, - vec![null_buf], - vec![], - ) - }) - } - DataType::Int8 => new_null_sized_array::(data_type, length), - DataType::UInt8 => new_null_sized_array::(data_type, length), - DataType::Int16 => new_null_sized_array::(data_type, length), - DataType::UInt16 => new_null_sized_array::(data_type, length), - DataType::Float16 => new_null_sized_array::(data_type, length), - DataType::Int32 => new_null_sized_array::(data_type, length), - 
DataType::UInt32 => new_null_sized_array::(data_type, length), - DataType::Float32 => new_null_sized_array::(data_type, length), - DataType::Date32 => new_null_sized_array::(data_type, length), - // expanding this into Date23{unit}Type results in needless branching - DataType::Time32(_) => new_null_sized_array::(data_type, length), - DataType::Int64 => new_null_sized_array::(data_type, length), - DataType::UInt64 => new_null_sized_array::(data_type, length), - DataType::Float64 => new_null_sized_array::(data_type, length), - DataType::Date64 => new_null_sized_array::(data_type, length), - // expanding this into Timestamp{unit}Type results in needless branching - DataType::Timestamp(_, _) => new_null_sized_array::(data_type, length), - DataType::Time64(_) => new_null_sized_array::(data_type, length), - DataType::Duration(_) => new_null_sized_array::(data_type, length), - DataType::Interval(unit) => match unit { - IntervalUnit::YearMonth => { - new_null_sized_array::(data_type, length) - } - IntervalUnit::DayTime => { - new_null_sized_array::(data_type, length) - } - IntervalUnit::MonthDayNano => { - new_null_sized_array::(data_type, length) - } - }, - DataType::FixedSizeBinary(value_len) => make_array(unsafe { - ArrayData::new_unchecked( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![Buffer::from(vec![0u8; *value_len as usize * length])], - vec![], - ) - }), - DataType::Binary | DataType::Utf8 => { - new_null_binary_array::(data_type, length) - } - DataType::LargeBinary | DataType::LargeUtf8 => { - new_null_binary_array::(data_type, length) - } - DataType::List(field) => { - new_null_list_array::(data_type, field.data_type(), length) - } - DataType::LargeList(field) => { - new_null_list_array::(data_type, field.data_type(), length) - } - DataType::FixedSizeList(field, value_len) => make_array(unsafe { - ArrayData::new_unchecked( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![], - vec![ - new_null_array(field.data_type(), *value_len as usize * length) - .data() - .clone(), - ], - ) - }), - DataType::Struct(fields) => { - let fields: Vec<_> = fields - .iter() - .map(|field| (field.clone(), new_null_array(field.data_type(), length))) - .collect(); - - let null_buffer = MutableBuffer::new_null(length); - Arc::new(StructArray::from((fields, null_buffer.into()))) - } - DataType::Map(field, _keys_sorted) => { - new_null_list_array::(data_type, field.data_type(), length) - } - DataType::Union(_, _, _) => { - unimplemented!("Creating null Union array not yet supported") - } - DataType::Dictionary(key, value) => { - let keys = new_null_array(key, length); - let keys = keys.data(); - - make_array(unsafe { - ArrayData::new_unchecked( - data_type.clone(), - length, - Some(length), - keys.null_buffer().cloned(), - 0, - keys.buffers().into(), - vec![new_empty_array(value.as_ref()).into_data()], - ) - }) - } - DataType::Decimal128(_, _) => { - new_null_sized_decimal(data_type, length, std::mem::size_of::()) - } - DataType::Decimal256(_, _) => new_null_sized_decimal(data_type, length, 32), - DataType::RunEndEncoded(_, _) => todo!(), - } -} - -#[inline] -fn new_null_list_array( - data_type: &DataType, - child_data_type: &DataType, - length: usize, -) -> ArrayRef { - make_array(unsafe { - ArrayData::new_unchecked( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![Buffer::from( - vec![OffsetSize::zero(); length + 1].to_byte_slice(), - 
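// Sketch of the consolidated construction path: `new_null_array` delegates to
// `ArrayData::new_null`, and fixed-width types size their buffers through
// `DataType::primitive_width` (assumed here to return the byte width as an
// Option, per its use in this patch).
use arrow_array::{new_null_array, Array};
use arrow_schema::DataType;

fn main() {
    assert_eq!(DataType::Int32.primitive_width(), Some(4));
    assert_eq!(DataType::Utf8.primitive_width(), None);

    let a = new_null_array(&DataType::Int32, 3);
    assert_eq!(a.len(), 3);
    assert_eq!(a.null_count(), 3);
}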
)], - vec![ArrayData::new_empty(child_data_type)], - ) - }) -} - -#[inline] -fn new_null_binary_array( - data_type: &DataType, - length: usize, -) -> ArrayRef { - make_array(unsafe { - ArrayData::new_unchecked( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![ - Buffer::from(vec![OffsetSize::zero(); length + 1].to_byte_slice()), - MutableBuffer::new(0).into(), - ], - vec![], - ) - }) -} - -#[inline] -fn new_null_sized_array( - data_type: &DataType, - length: usize, -) -> ArrayRef { - make_array(unsafe { - ArrayData::new_unchecked( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![Buffer::from(vec![0u8; length * T::get_byte_width()])], - vec![], - ) - }) -} - -#[inline] -fn new_null_sized_decimal( - data_type: &DataType, - length: usize, - byte_width: usize, -) -> ArrayRef { - make_array(unsafe { - ArrayData::new_unchecked( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![Buffer::from(vec![0u8; length * byte_width])], - vec![], - ) - }) + make_array(ArrayData::new_null(data_type, length)) } // Helper function for printing potentially long arrays. @@ -881,8 +680,10 @@ where #[cfg(test)] mod tests { use super::*; - use crate::cast::downcast_array; - use arrow_schema::Field; + use crate::cast::{as_union_array, downcast_array}; + use crate::downcast_run_array; + use arrow_buffer::{Buffer, MutableBuffer}; + use arrow_schema::{Field, UnionMode}; #[test] fn test_empty_primitive() { @@ -1012,6 +813,76 @@ mod tests { ); } + #[test] + fn test_null_union() { + for mode in [UnionMode::Sparse, UnionMode::Dense] { + let data_type = DataType::Union( + vec![ + Field::new("foo", DataType::Int32, true), + Field::new("bar", DataType::Int64, true), + ], + vec![2, 1], + mode, + ); + let array = new_null_array(&data_type, 4); + + let array = as_union_array(array.as_ref()); + assert_eq!(array.len(), 4); + assert_eq!(array.null_count(), 0); + + for i in 0..4 { + let a = array.value(i); + assert_eq!(a.len(), 1); + assert_eq!(a.null_count(), 1); + assert!(a.is_null(0)) + } + } + } + + #[test] + #[allow(unused_parens)] + fn test_null_runs() { + for r in [DataType::Int16, DataType::Int32, DataType::Int64] { + let data_type = DataType::RunEndEncoded( + Box::new(Field::new("run_ends", r, false)), + Box::new(Field::new("values", DataType::Utf8, true)), + ); + + let array = new_null_array(&data_type, 4); + let array = array.as_ref(); + + downcast_run_array! 
{ + array => { + assert_eq!(array.len(), 4); + assert_eq!(array.null_count(), 0); + assert_eq!(array.values().len(), 1); + assert_eq!(array.values().null_count(), 1); + assert_eq!(array.run_ends().values(), &[4]); + + let idx = array.get_physical_indices(&[0, 1, 2, 3]).unwrap(); + assert_eq!(idx, &[0,0,0,0]); + } + d => unreachable!("{d}") + } + } + } + + #[test] + fn test_null_fixed_size_binary() { + for size in [1, 2, 7] { + let array = new_null_array(&DataType::FixedSizeBinary(size), 6); + let array = array + .as_ref() + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(array.len(), 6); + assert_eq!(array.null_count(), 6); + array.iter().for_each(|x| assert!(x.is_none())); + } + } + #[test] fn test_memory_size_null() { let null_arr = NullArray::new(32); diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 709262e83464..8b727ec953a9 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -21,8 +21,7 @@ use crate::{bit_iterator::BitSliceIterator, bitmap::Bitmap}; use arrow_buffer::bit_chunk_iterator::BitChunks; use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; -use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode}; -use half::f16; +use arrow_schema::{ArrowError, DataType, UnionMode}; use std::convert::TryInto; use std::mem; use std::ops::Range; @@ -69,71 +68,25 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff let buffer = MutableBuffer::new(bytes); [buffer, empty_buffer] } - DataType::UInt8 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::UInt16 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::UInt32 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::UInt64 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int8 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int16 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int32 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int64 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Float16 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Float32 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Float64 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Date32 | DataType::Time32(_) => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Date64 + DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float16 + | DataType::Float32 + | DataType::Float64 + | DataType::Date32 + | DataType::Time32(_) + | DataType::Date64 | DataType::Time64(_) | DataType::Duration(_) - | DataType::Timestamp(_, _) => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Interval(IntervalUnit::YearMonth) => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Interval(IntervalUnit::DayTime) => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Interval(IntervalUnit::MonthDayNano) => [ - MutableBuffer::new(capacity * mem::size_of::()), + | DataType::Timestamp(_, _) + | DataType::Interval(_) => [ + 
MutableBuffer::new(capacity * data_type.primitive_width().unwrap()), empty_buffer, ], DataType::Utf8 | DataType::Binary => { @@ -163,41 +116,10 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff DataType::FixedSizeBinary(size) => { [MutableBuffer::new(capacity * *size as usize), empty_buffer] } - DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { - DataType::UInt8 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::UInt16 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::UInt32 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::UInt64 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int8 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int16 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int32 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int64 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - _ => unreachable!(), - }, + DataType::Dictionary(k, _) => [ + MutableBuffer::new(capacity * k.primitive_width().unwrap()), + empty_buffer, + ], DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => [empty_buffer, MutableBuffer::new(0)], @@ -667,83 +589,125 @@ impl ArrayData { &values.1[self.offset..] } - /// Returns a new empty [ArrayData] valid for `data_type`. - pub fn new_empty(data_type: &DataType) -> Self { - let buffers = new_buffers(data_type, 0); - let [buffer1, buffer2] = buffers; - let buffers = into_buffers(data_type, buffer1, buffer2); - - let child_data = match data_type { - DataType::Null - | DataType::Boolean - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 - | DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Duration(_) - | DataType::Timestamp(_, _) - | DataType::Utf8 - | DataType::Binary - | DataType::LargeUtf8 - | DataType::LargeBinary - | DataType::Interval(_) - | DataType::FixedSizeBinary(_) - | DataType::Decimal128(_, _) - | DataType::Decimal256(_, _) => vec![], - DataType::List(field) => { - vec![Self::new_empty(field.data_type())] - } - DataType::FixedSizeList(field, _) => { - vec![Self::new_empty(field.data_type())] - } - DataType::LargeList(field) => { - vec![Self::new_empty(field.data_type())] - } - DataType::Struct(fields) => fields - .iter() - .map(|field| Self::new_empty(field.data_type())) - .collect(), - DataType::Map(field, _) => { - vec![Self::new_empty(field.data_type())] - } - DataType::Union(fields, _, _) => fields - .iter() - .map(|field| Self::new_empty(field.data_type())) - .collect(), - DataType::Dictionary(_, data_type) => { - vec![Self::new_empty(data_type)] - } - DataType::RunEndEncoded(run_ends, values) => { - vec![ - Self::new_empty(run_ends.data_type()), - Self::new_empty(values.data_type()), - ] - } + /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values + pub fn new_null(data_type: &DataType, len: usize) -> Self { + let bit_len = bit_util::ceil(len, 8); + let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len)); + + let (buffers, child_data, has_nulls) = match data_type.primitive_width() 
{ + Some(width) => (vec![zeroed(width * len)], vec![], true), + None => match data_type { + DataType::Null => (vec![], vec![], false), + DataType::Boolean => (vec![zeroed(bit_len)], vec![], true), + DataType::Binary | DataType::Utf8 => { + (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true) + } + DataType::LargeBinary | DataType::LargeUtf8 => { + (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true) + } + DataType::FixedSizeBinary(i) => { + (vec![zeroed(*i as usize * len)], vec![], true) + } + DataType::List(f) | DataType::Map(f, _) => ( + vec![zeroed((len + 1) * 4)], + vec![ArrayData::new_empty(f.data_type())], + true, + ), + DataType::LargeList(f) => ( + vec![zeroed((len + 1) * 8)], + vec![ArrayData::new_empty(f.data_type())], + true, + ), + DataType::FixedSizeList(f, list_len) => ( + vec![], + vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)], + true, + ), + DataType::Struct(fields) => ( + vec![], + fields + .iter() + .map(|f| Self::new_null(f.data_type(), len)) + .collect(), + true, + ), + DataType::Dictionary(k, v) => ( + vec![zeroed(k.primitive_width().unwrap() * len)], + vec![ArrayData::new_empty(v.as_ref())], + true, + ), + DataType::Union(f, i, mode) => { + let ids = Buffer::from_iter(std::iter::repeat(i[0]).take(len)); + let buffers = match mode { + UnionMode::Sparse => vec![ids], + UnionMode::Dense => { + let end_offset = i32::from_usize(len).unwrap(); + vec![ids, Buffer::from_iter(0_i32..end_offset)] + } + }; + + let children = f + .iter() + .enumerate() + .map(|(idx, f)| match idx { + 0 => Self::new_null(f.data_type(), len), + _ => Self::new_empty(f.data_type()), + }) + .collect(); + + (buffers, children, false) + } + DataType::RunEndEncoded(r, v) => { + let runs = match r.data_type() { + DataType::Int16 => { + let i = i16::from_usize(len).expect("run overflow"); + Buffer::from_slice_ref([i]) + } + DataType::Int32 => { + let i = i32::from_usize(len).expect("run overflow"); + Buffer::from_slice_ref([i]) + } + DataType::Int64 => { + let i = i64::from_usize(len).expect("run overflow"); + Buffer::from_slice_ref([i]) + } + dt => unreachable!("Invalid run ends data type {dt}"), + }; + + let builder = ArrayData::builder(r.data_type().clone()) + .len(1) + .buffers(vec![runs]); + + // SAFETY: + // Valid by construction + let runs = unsafe { builder.build_unchecked() }; + ( + vec![], + vec![runs, ArrayData::new_null(v.data_type(), 1)], + false, + ) + } + d => unreachable!("{d}"), + }, }; - // Data was constructed correctly above - unsafe { - Self::new_unchecked( - data_type.clone(), - 0, - Some(0), - None, - 0, - buffers, - child_data, - ) + let mut builder = ArrayDataBuilder::new(data_type.clone()) + .len(len) + .buffers(buffers) + .child_data(child_data); + + if has_nulls { + builder = builder.null_count(len).null_bit_buffer(Some(zeroed(len))) } + + // SAFETY: + // Data valid by construction + unsafe { builder.build_unchecked() } + } + + /// Returns a new empty [ArrayData] valid for `data_type`. + pub fn new_empty(data_type: &DataType) -> Self { + Self::new_null(data_type, 0) } /// "cheap" validation of an `ArrayData`. 
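A minimal sketch of the consolidated constructors, assuming the `arrow_data` and `arrow_schema` crates as used elsewhere in this diff; `new_empty` is now simply the zero-length case of `new_null`:

use arrow_data::ArrayData;
use arrow_schema::DataType;

// A zero-length null array and an empty array are now the same thing
assert_eq!(
    ArrayData::new_empty(&DataType::Utf8),
    ArrayData::new_null(&DataType::Utf8, 0)
);

// For a non-zero length every slot is marked null
let nulls = ArrayData::new_null(&DataType::Utf8, 5);
assert_eq!(nulls.len(), 5);
assert_eq!(nulls.null_count(), 5);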
Ensures buffers are @@ -1578,30 +1542,24 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { buffers: vec![BufferSpec::BitMap], can_contain_null_mask: true, }, - DataType::Int8 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Int16 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Int32 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Int64 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::UInt8 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::UInt16 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::UInt32 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::UInt64 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Float16 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Float32 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Float64 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Date32 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Date64 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Time32(_) => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Time64(_) => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Interval(IntervalUnit::YearMonth) => { - DataTypeLayout::new_fixed_width(size_of::()) - } - DataType::Interval(IntervalUnit::DayTime) => { - DataTypeLayout::new_fixed_width(size_of::()) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - DataTypeLayout::new_fixed_width(size_of::()) + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Float16 + | DataType::Float32 + | DataType::Float64 + | DataType::Timestamp(_, _) + | DataType::Date32 + | DataType::Date64 + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Interval(_) => { + DataTypeLayout::new_fixed_width(data_type.primitive_width().unwrap()) } DataType::Duration(_) => DataTypeLayout::new_fixed_width(size_of::()), DataType::Binary => DataTypeLayout::new_binary(size_of::()), diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 9476535fa970..56eb6e8cef16 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -410,6 +410,39 @@ impl DataType { } } + /// Returns the bit width of this type if it is a primitive type + /// + /// Returns `None` if not a primitive type + #[inline] + pub fn primitive_width(&self) -> Option { + match self { + DataType::Null => None, + DataType::Boolean => None, + DataType::Int8 | DataType::UInt8 => Some(1), + DataType::Int16 | DataType::UInt16 | DataType::Float16 => Some(2), + DataType::Int32 | DataType::UInt32 | DataType::Float32 => Some(4), + DataType::Int64 | DataType::UInt64 | DataType::Float64 => Some(8), + DataType::Timestamp(_, _) => Some(8), + DataType::Date32 | DataType::Time32(_) => Some(4), + DataType::Date64 | DataType::Time64(_) => Some(8), + DataType::Duration(_) => Some(8), + DataType::Interval(IntervalUnit::YearMonth) => Some(4), + DataType::Interval(IntervalUnit::DayTime) => Some(8), + DataType::Interval(IntervalUnit::MonthDayNano) => Some(16), + DataType::Decimal128(_, _) => Some(16), + DataType::Decimal256(_, _) => Some(32), + DataType::Utf8 | DataType::LargeUtf8 => None, + DataType::Binary | DataType::LargeBinary => None, + DataType::FixedSizeBinary(_) => None, + DataType::List(_) | DataType::LargeList(_) | DataType::Map(_, _) => None, + 
DataType::FixedSizeList(_, _) => None, + DataType::Struct(_) => None, + DataType::Union(_, _, _) => None, + DataType::Dictionary(_, _) => None, + DataType::RunEndEncoded(_, _) => None, + } + } + /// Return size of this instance in bytes. /// /// Includes the size of `Self`. diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index dc234c8590ad..78dd1ef453cb 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -120,7 +120,6 @@ use std::{ sync::Arc, }; -use arrow_buffer::i256; use arrow_schema::UnionMode; use bitflags::bitflags; @@ -311,39 +310,21 @@ impl Drop for FFI_ArrowSchema { // This is set by the Arrow specification #[allow(clippy::manual_bits)] fn bit_width(data_type: &DataType, i: usize) -> Result { + if let Some(primitive) = data_type.primitive_width() { + return match i { + 0 => Err(ArrowError::CDataInterface(format!( + "The datatype \"{data_type:?}\" doesn't expect buffer at index 0. Please verify that the C data interface is correctly implemented." + ))), + 1 => Ok(primitive * 8), + i => Err(ArrowError::CDataInterface(format!( + "The datatype \"{data_type:?}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." + ))), + }; + } + Ok(match (data_type, i) { - // primitive types first buffer's size is given by the native types (DataType::Boolean, 1) => 1, - (DataType::UInt8, 1) => size_of::() * 8, - (DataType::UInt16, 1) => size_of::() * 8, - (DataType::UInt32, 1) => size_of::() * 8, - (DataType::UInt64, 1) => size_of::() * 8, - (DataType::Int8, 1) => size_of::() * 8, - (DataType::Int16, 1) => size_of::() * 8, - (DataType::Int32, 1) | (DataType::Date32, 1) | (DataType::Time32(_), 1) => size_of::() * 8, - (DataType::Int64, 1) | (DataType::Date64, 1) | (DataType::Time64(_), 1) => size_of::() * 8, - (DataType::Float32, 1) => size_of::() * 8, - (DataType::Float64, 1) => size_of::() * 8, - (DataType::Decimal128(..), 1) => size_of::() * 8, - (DataType::Decimal256(..), 1) => size_of::() * 8, - (DataType::Timestamp(..), 1) => size_of::() * 8, - (DataType::Duration(..), 1) => size_of::() * 8, - // primitive types have a single buffer - (DataType::Boolean, _) | - (DataType::UInt8, _) | - (DataType::UInt16, _) | - (DataType::UInt32, _) | - (DataType::UInt64, _) | - (DataType::Int8, _) | - (DataType::Int16, _) | - (DataType::Int32, _) | (DataType::Date32, _) | (DataType::Time32(_), _) | - (DataType::Int64, _) | (DataType::Date64, _) | (DataType::Time64(_), _) | - (DataType::Float32, _) | - (DataType::Float64, _) | - (DataType::Decimal128(..), _) | - (DataType::Decimal256(..), _) | - (DataType::Timestamp(..), _) | - (DataType::Duration(..), _) => { + (DataType::Boolean, _) => { return Err(ArrowError::CDataInterface(format!( "The datatype \"{data_type:?}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." 
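A minimal sketch of the new `DataType::primitive_width` helper that `new_buffers`, `layout` and `bit_width` now share; the byte widths below follow the match arms added above:

use arrow_schema::{DataType, IntervalUnit};

assert_eq!(DataType::Int64.primitive_width(), Some(8));
assert_eq!(DataType::Interval(IntervalUnit::MonthDayNano).primitive_width(), Some(16));
assert_eq!(DataType::Decimal256(76, 10).primitive_width(), Some(32));
// Boolean, variable-width and nested types report None
assert_eq!(DataType::Boolean.primitive_width(), None);
assert_eq!(DataType::Utf8.primitive_width(), None);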
))) From 98ce68f3229a12e4e057802c4a2cd74170a66c73 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Thu, 9 Feb 2023 08:26:10 -0500 Subject: [PATCH 0590/1411] fix take_run benchmark paraeter (#3679) Co-authored-by: ask --- arrow/benches/primitive_run_take.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arrow/benches/primitive_run_take.rs b/arrow/benches/primitive_run_take.rs index 8c9a3fd04b7a..82ff35949e79 100644 --- a/arrow/benches/primitive_run_take.rs +++ b/arrow/benches/primitive_run_take.rs @@ -24,14 +24,14 @@ use arrow_array::UInt32Array; use criterion::{criterion_group, criterion_main, Criterion}; use rand::Rng; -fn create_random_index(size: usize, null_density: f32) -> UInt32Array { +fn create_random_index(size: usize, null_density: f32, max_value: usize) -> UInt32Array { let mut rng = seedable_rng(); let mut builder = UInt32Builder::with_capacity(size); for _ in 0..size { if rng.gen::() < null_density { builder.append_null(); } else { - let value = rng.gen_range::(0u32..size as u32); + let value = rng.gen_range::(0u32..max_value as u32); builder.append_value(value); } } @@ -48,7 +48,7 @@ fn criterion_benchmark(c: &mut Criterion) { logical_array_len, physical_array_len, ); - let indices = create_random_index(take_len, 0.0); + let indices = create_random_index(take_len, 0.0, logical_array_len); group.bench_function( format!( "(run_array_len:{logical_array_len}, physical_array_len:{physical_array_len}, take_len:{take_len})"), From bb4fc59009e7c5861a6b1967a53e9daa2554d5c6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 9 Feb 2023 20:45:32 +0000 Subject: [PATCH 0591/1411] Cleanup FFI interface (#3684) (#3683) (#3685) * Cleanup FFI interface (#3684) (#3683) * Add import example * Tweak doc example * Use ManuallyDrop to model external memory --- arrow/src/array/ffi.rs | 20 ++--- arrow/src/array/mod.rs | 1 + arrow/src/ffi.rs | 184 +++++++++++++++++++++++++---------------- arrow/src/pyarrow.rs | 16 ++-- 4 files changed, 126 insertions(+), 95 deletions(-) diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index fb7771ac620e..c7bc8e9f8a74 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -47,6 +47,8 @@ impl TryFrom for ffi::ArrowArray { /// # Safety /// Assumes that these pointers represent valid C Data Interfaces, both in memory /// representation and lifetime via the `release` mechanism. +#[deprecated(note = "Use ArrowArray::new")] +#[allow(deprecated)] pub unsafe fn make_array_from_raw( array: *const ffi::FFI_ArrowArray, schema: *const ffi::FFI_ArrowSchema, @@ -91,30 +93,22 @@ mod tests { StructArray, UInt32Array, UInt64Array, }, datatypes::{DataType, Field}, - ffi::ArrowArray, + ffi::{ArrowArray, FFI_ArrowArray, FFI_ArrowSchema}, }; use std::convert::TryFrom; use std::sync::Arc; fn test_round_trip(expected: &ArrayData) -> Result<()> { - // create a `ArrowArray` from the data. - let d1 = ArrowArray::try_from(expected.clone())?; - - // here we export the array as 2 pointers. We would have no control over ownership if it was not for - // the release mechanism. 
- let (array, schema) = ArrowArray::into_raw(d1); + // here we export the array + let array = FFI_ArrowArray::new(expected); + let schema = FFI_ArrowSchema::try_from(expected.data_type())?; // simulate an external consumer by being the consumer - let d1 = unsafe { ArrowArray::try_from_raw(array, schema) }?; + let d1 = ArrowArray::new(array, schema); let result = &ArrayData::try_from(d1)?; assert_eq!(result, expected); - - unsafe { - Arc::from_raw(array); - Arc::from_raw(schema); - } Ok(()) } diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 1a10725df678..09348996eafa 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -34,6 +34,7 @@ pub use arrow_data::{ pub use arrow_data::transform::{Capacities, MutableArrayData}; #[cfg(feature = "ffi")] +#[allow(deprecated)] pub use self::ffi::{export_array_into_raw, make_array_from_raw}; // --------------------- Array's values comparison --------------------- diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 78dd1ef453cb..0f0f94c7a6b8 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -24,68 +24,63 @@ //! The second interface maps native Rust types to the Rust-specific implementation of Arrow such as `format` to `Datatype`, //! `Buffer`, etc. This is handled by `ArrowArray`. //! +//! +//! Export to FFI +//! //! ```rust //! # use std::sync::Arc; -//! # use arrow::array::{Int32Array, Array, ArrayData, export_array_into_raw, make_array, make_array_from_raw}; -//! # use arrow::error::{Result, ArrowError}; +//! # use arrow::array::{Int32Array, Array, ArrayData, make_array}; +//! # use arrow::error::Result; //! # use arrow::compute::kernels::arithmetic; //! # use arrow::ffi::{ArrowArray, FFI_ArrowArray, FFI_ArrowSchema}; -//! # use std::convert::TryFrom; //! # fn main() -> Result<()> { //! // create an array natively //! let array = Int32Array::from(vec![Some(1), None, Some(3)]); +//! let data = array.into_data(); //! -//! // export it -//! -//! let ffi_array = ArrowArray::try_new(array.data().clone())?; -//! let (array_ptr, schema_ptr) = ArrowArray::into_raw(ffi_array); -//! -//! // consumed and used by something else... +//! // Export it +//! let out_array = FFI_ArrowArray::new(&data); +//! let out_schema = FFI_ArrowSchema::try_from(data.data_type())?; //! //! // import it -//! let array = unsafe { make_array_from_raw(array_ptr, schema_ptr)? }; +//! let array = ArrowArray::new(out_array, out_schema); +//! let array = Int32Array::from(ArrayData::try_from(array)?); //! //! // perform some operation -//! let array = array.as_any().downcast_ref::().ok_or( -//! ArrowError::ParseError("Expects an int32".to_string()), -//! )?; //! let array = arithmetic::add(&array, &array)?; //! //! // verify //! assert_eq!(array, Int32Array::from(vec![Some(2), None, Some(6)])); +//! # +//! # Ok(()) +//! # } +//! ``` //! -//! // Simulate if raw pointers are provided by consumer -//! let array = make_array(Int32Array::from(vec![Some(1), None, Some(3)]).into_data()); -//! -//! let out_array = Box::new(FFI_ArrowArray::empty()); -//! let out_schema = Box::new(FFI_ArrowSchema::empty()); -//! let out_array_ptr = Box::into_raw(out_array); -//! let out_schema_ptr = Box::into_raw(out_schema); -//! -//! // export array into raw pointers from consumer -//! unsafe { export_array_into_raw(array, out_array_ptr, out_schema_ptr)?; }; -//! -//! // import it -//! let array = unsafe { make_array_from_raw(out_array_ptr, out_schema_ptr)? }; +//! Import from FFI //! -//! // perform some operation -//! let array = array.as_any().downcast_ref::().ok_or( -//! 
ArrowError::ParseError("Expects an int32".to_string()), -//! )?; -//! let array = arithmetic::add(&array, &array)?; -//! -//! // verify -//! assert_eq!(array, Int32Array::from(vec![Some(2), None, Some(6)])); +//! ``` +//! # use std::ptr::addr_of_mut; +//! # use arrow::ffi::{ArrowArray, FFI_ArrowArray, FFI_ArrowSchema}; +//! # use arrow_array::{ArrayRef, make_array}; +//! # use arrow_schema::ArrowError; +//! # +//! /// A foreign data container that can export to C Data interface +//! struct ForeignArray {}; //! -//! // (drop/release) -//! unsafe { -//! Box::from_raw(out_array_ptr); -//! Box::from_raw(out_schema_ptr); -//! Arc::from_raw(array_ptr); -//! Arc::from_raw(schema_ptr); +//! impl ForeignArray { +//! /// Export from foreign array representation to C Data interface +//! /// e.g. +//! fn export_to_c(&self, array: *mut FFI_ArrowArray, schema: *mut FFI_ArrowSchema) { +//! // ... +//! } //! } //! -//! Ok(()) +//! /// Import an [`ArrayRef`] from a [`ForeignArray`] +//! fn import_array(foreign: &ForeignArray) -> Result { +//! let mut schema = FFI_ArrowSchema::empty(); +//! let mut array = FFI_ArrowArray::empty(); +//! foreign.export_to_c(addr_of_mut!(array), addr_of_mut!(schema)); +//! Ok(make_array(ArrowArray::new(array, schema).try_into()?)) //! } //! ``` @@ -139,7 +134,15 @@ bitflags! { /// ABI-compatible struct for `ArrowSchema` from C Data Interface /// See -/// This was created by bindgen +/// +/// ``` +/// # use arrow::ffi::FFI_ArrowSchema; +/// # use arrow_data::ArrayData; +/// fn array_schema(data: &ArrayData) -> FFI_ArrowSchema { +/// FFI_ArrowSchema::try_from(data.data_type()).unwrap() +/// } +/// ``` +/// #[repr(C)] #[derive(Debug)] pub struct FFI_ArrowSchema { @@ -394,7 +397,14 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { /// ABI-compatible struct for ArrowArray from C Data Interface /// See -/// This was created by bindgen +/// +/// ``` +/// # use arrow::ffi::FFI_ArrowArray; +/// # use arrow_array::Array; +/// fn export_array(array: &dyn Array) -> FFI_ArrowArray { +/// FFI_ArrowArray::new(array.data()) +/// } +/// ``` #[repr(C)] #[derive(Debug)] pub struct FFI_ArrowArray { @@ -859,6 +869,14 @@ impl<'a> ArrowArrayRef for ArrowArrayChild<'a> { } impl ArrowArray { + /// Creates a new [`ArrowArray`] from the provided array and schema + pub fn new(array: FFI_ArrowArray, schema: FFI_ArrowSchema) -> Self { + Self { + array: Arc::new(array), + schema: Arc::new(schema), + } + } + /// creates a new `ArrowArray`. This is used to export to the C Data Interface. /// /// # Memory Leaks @@ -878,6 +896,7 @@ impl ArrowArray { /// on managing the allocation of the structs by themselves. 
/// # Error /// Errors if any of the pointers is null + #[deprecated(note = "Use ArrowArray::new")] pub unsafe fn try_from_raw( array: *const FFI_ArrowArray, schema: *const FFI_ArrowSchema, @@ -911,6 +930,7 @@ impl ArrowArray { } /// exports [ArrowArray] to the C Data Interface + #[deprecated(note = "Use FFI_ArrowArray and FFI_ArrowSchema directly")] pub fn into_raw(this: ArrowArray) -> (*const FFI_ArrowArray, *const FFI_ArrowSchema) { (Arc::into_raw(this.array), Arc::into_raw(this.schema)) } @@ -946,28 +966,49 @@ mod tests { use arrow_array::types::{Float64Type, Int32Type}; use arrow_array::{Float64Array, UnionArray}; use std::convert::TryFrom; + use std::mem::ManuallyDrop; + use std::ptr::addr_of_mut; #[test] - fn test_round_trip() -> Result<()> { + fn test_round_trip() { // create an array natively let array = Int32Array::from(vec![1, 2, 3]); // export it - let array = ArrowArray::try_from(array.into_data())?; + let array = ArrowArray::try_from(array.into_data()).unwrap(); // (simulate consumer) import it - let data = ArrayData::try_from(array)?; - let array = make_array(data); - - // perform some operation - let array = array.as_any().downcast_ref::().unwrap(); - let array = kernels::arithmetic::add(array, array).unwrap(); + let array = Int32Array::from(ArrayData::try_from(array).unwrap()); + let array = kernels::arithmetic::add(&array, &array).unwrap(); // verify assert_eq!(array, Int32Array::from(vec![2, 4, 6])); + } - // (drop/release) - Ok(()) + #[test] + fn test_import() { + // Model receiving const pointers from an external system + + // Create an array natively + let data = Int32Array::from(vec![1, 2, 3]).into_data(); + let schema = FFI_ArrowSchema::try_from(data.data_type()).unwrap(); + let array = FFI_ArrowArray::new(&data); + + // Use ManuallyDrop to avoid Box:Drop recursing + let schema = Box::new(ManuallyDrop::new(schema)); + let array = Box::new(ManuallyDrop::new(array)); + + let schema_ptr = &**schema as *const _; + let array_ptr = &**array as *const _; + + // We can read them back to memory + // SAFETY: + // Pointers are aligned and valid + let array = + unsafe { ArrowArray::new(ptr::read(array_ptr), ptr::read(schema_ptr)) }; + + let array = Int32Array::from(ArrayData::try_from(array).unwrap()); + assert_eq!(array, Int32Array::from(vec![1, 2, 3])); } #[test] @@ -1424,31 +1465,28 @@ mod tests { let array = make_array(Int32Array::from(vec![1, 2, 3]).into_data()); // Assume two raw pointers provided by the consumer - let out_array = Box::new(FFI_ArrowArray::empty()); - let out_schema = Box::new(FFI_ArrowSchema::empty()); - let out_array_ptr = Box::into_raw(out_array); - let out_schema_ptr = Box::into_raw(out_schema); - - unsafe { - export_array_into_raw(array, out_array_ptr, out_schema_ptr)?; + let mut out_array = FFI_ArrowArray::empty(); + let mut out_schema = FFI_ArrowSchema::empty(); + + { + let out_array_ptr = addr_of_mut!(out_array); + let out_schema_ptr = addr_of_mut!(out_schema); + unsafe { + export_array_into_raw(array, out_array_ptr, out_schema_ptr)?; + } } // (simulate consumer) import it - unsafe { - let array = ArrowArray::try_from_raw(out_array_ptr, out_schema_ptr).unwrap(); - let data = ArrayData::try_from(array)?; - let array = make_array(data); - - // perform some operation - let array = array.as_any().downcast_ref::().unwrap(); - let array = kernels::arithmetic::add(array, array).unwrap(); + let array = ArrowArray::new(out_array, out_schema); + let data = ArrayData::try_from(array)?; + let array = make_array(data); - // verify - assert_eq!(array, 
Int32Array::from(vec![2, 4, 6])); + // perform some operation + let array = array.as_any().downcast_ref::().unwrap(); + let array = kernels::arithmetic::add(array, array).unwrap(); - drop(Box::from_raw(out_array_ptr)); - drop(Box::from_raw(out_schema_ptr)); - } + // verify + assert_eq!(array, Int32Array::from(vec![2, 4, 6])); Ok(()) } diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 09933304ecf9..110fd9cfaa82 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -19,6 +19,7 @@ //! arrays from and to Python. use std::convert::{From, TryFrom}; +use std::ptr::addr_of_mut; use std::sync::Arc; use pyo3::ffi::Py_uintptr_t; @@ -30,7 +31,7 @@ use crate::array::{make_array, Array, ArrayData}; use crate::datatypes::{DataType, Field, Schema}; use crate::error::ArrowError; use crate::ffi; -use crate::ffi::FFI_ArrowSchema; +use crate::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; use crate::ffi_stream::{ export_reader_into_raw, ArrowArrayStreamReader, FFI_ArrowArrayStream, }; @@ -111,8 +112,8 @@ impl PyArrowConvert for Schema { impl PyArrowConvert for ArrayData { fn from_pyarrow(value: &PyAny) -> PyResult { // prepare a pointer to receive the Array struct - let (array_pointer, schema_pointer) = - ffi::ArrowArray::into_raw(unsafe { ffi::ArrowArray::empty() }); + let mut array = FFI_ArrowArray::empty(); + let mut schema = FFI_ArrowSchema::empty(); // make the conversion through PyArrow's private API // this changes the pointer's memory and is thus unsafe. @@ -120,15 +121,12 @@ impl PyArrowConvert for ArrayData { value.call_method1( "_export_to_c", ( - array_pointer as Py_uintptr_t, - schema_pointer as Py_uintptr_t, + addr_of_mut!(array) as Py_uintptr_t, + addr_of_mut!(schema) as Py_uintptr_t, ), )?; - let ffi_array = unsafe { - ffi::ArrowArray::try_from_raw(array_pointer, schema_pointer) - .map_err(to_py_err)? 
- }; + let ffi_array = ffi::ArrowArray::new(array, schema); let data = ArrayData::try_from(ffi_array).map_err(to_py_err)?; Ok(data) From 07e20639b7023fcc61c73f80a5bddf8715c2a06f Mon Sep 17 00:00:00 2001 From: comphead Date: Fri, 10 Feb 2023 03:59:42 -0800 Subject: [PATCH 0592/1411] Support UTF8 cast to Timestamp with timezone (#3673) * support cast UTf8 to Timestamp * fmt * fix docs and tests --- arrow-cast/src/cast.rs | 46 +++++++++++++++++++++++++++++++++++---- arrow-cast/src/parse.rs | 21 ++++++++++++++---- arrow/tests/array_cast.rs | 4 ++-- 3 files changed, 61 insertions(+), 10 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 3137e685b212..1631f2e0040f 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -166,7 +166,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time32(TimeUnit::Millisecond) | Time64(TimeUnit::Microsecond) | Time64(TimeUnit::Nanosecond) - | Timestamp(TimeUnit::Nanosecond, None) + | Timestamp(TimeUnit::Nanosecond, _) ) => true, (Utf8, _) => to_type.is_numeric() && to_type != &Float16, (LargeUtf8, @@ -179,7 +179,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time32(TimeUnit::Millisecond) | Time64(TimeUnit::Microsecond) | Time64(TimeUnit::Nanosecond) - | Timestamp(TimeUnit::Nanosecond, None) + | Timestamp(TimeUnit::Nanosecond, _) ) => true, (LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), @@ -1141,7 +1141,7 @@ pub fn cast_with_options( Time64(TimeUnit::Nanosecond) => { cast_string_to_time64nanosecond::(array, cast_options) } - Timestamp(TimeUnit::Nanosecond, None) => { + Timestamp(TimeUnit::Nanosecond, _) => { cast_string_to_timestamp_ns::(array, cast_options) } _ => Err(ArrowError::CastError(format!( @@ -1182,7 +1182,7 @@ pub fn cast_with_options( Time64(TimeUnit::Nanosecond) => { cast_string_to_time64nanosecond::(array, cast_options) } - Timestamp(TimeUnit::Nanosecond, None) => { + Timestamp(TimeUnit::Nanosecond, _) => { cast_string_to_timestamp_ns::(array, cast_options) } _ => Err(ArrowError::CastError(format!( @@ -7550,4 +7550,42 @@ mod tests { assert_eq!(v.value(0), 946728000000); assert_eq!(v.value(1), 1608035696000); } + + #[test] + fn test_cast_utf8_to_timestamp() { + fn test_tz(tz: String) { + let valid = StringArray::from(vec![ + "2023-01-01 04:05:06.789000-08:00", + "2023-01-01 04:05:06.789000-07:00", + "2023-01-01 04:05:06.789 -0800", + "2023-01-01 04:05:06.789 -08:00", + "2023-01-01 040506 +0730", + "2023-01-01 040506 +07:30", + "2023-01-01 04:05:06.789", + "2023-01-01 04:05:06", + "2023-01-01", + ]); + + let array = Arc::new(valid) as ArrayRef; + let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz))) + .unwrap(); + + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1672574706789000000, c.value(0)); + assert_eq!(1672571106789000000, c.value(1)); + assert_eq!(1672574706789000000, c.value(2)); + assert_eq!(1672574706789000000, c.value(3)); + assert_eq!(1672518906000000000, c.value(4)); + assert_eq!(1672518906000000000, c.value(5)); + assert_eq!(1672545906789000000, c.value(6)); + assert_eq!(1672545906000000000, c.value(7)); + assert_eq!(1672531200000000000, c.value(8)); + } + + test_tz("+00:00".to_owned()); + test_tz("+02:00".to_owned()); + } } diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 459b94f37dc8..f23e65b22845 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -68,6 +68,14 @@ use chrono::prelude::*; 
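The cast tests above and the parser hunk that follows both lean on `string_to_timestamp_nanos` accepting a space-separated timestamp with an explicit offset; a minimal sketch, assuming the public `arrow_cast::parse` module:

use arrow_cast::parse::string_to_timestamp_nanos;

// RFC3339 ('T' separator) and the space-separated form parse to the same instant
let a = string_to_timestamp_nanos("2020-09-08T13:42:29.190855-05:00").unwrap();
let b = string_to_timestamp_nanos("2020-09-08 13:42:29.190855-05:00").unwrap();
assert_eq!(a, b);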
/// the system timezone is set to Americas/New_York (UTC-5) the /// timestamp will be interpreted as though it were /// `1997-01-31T09:26:56.123-05:00` +/// +/// Some formats that supported by PostgresSql +/// still not supported by chrono, like +/// "2023-01-01 040506 America/Los_Angeles", +/// "2023-01-01 04:05:06.789 +07:30:00", +/// "2023-01-01 040506 +07:30:00", +/// "2023-01-01 04:05:06.789 PST", +/// "2023-01-01 04:05:06.789 -08", #[inline] pub fn string_to_timestamp_nanos(s: &str) -> Result { // Fast path: RFC3339 timestamp (with a T) @@ -81,10 +89,15 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { // separating the date and time with a space ' ' rather than 'T' to be // (more) compatible with Apache Spark SQL - // timezone offset, using ' ' as a separator - // Example: 2020-09-08 13:42:29.190855-05:00 - if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") { - return to_timestamp_nanos(ts.naive_utc()); + let supported_formats = vec![ + "%Y-%m-%d %H:%M:%S%.f%:z", // Example: 2020-09-08 13:42:29.190855-05:00 + "%Y-%m-%d %H%M%S%.3f%:z", // Example: "2023-01-01 040506 +07:30" + ]; + + for f in supported_formats.iter() { + if let Ok(ts) = DateTime::parse_from_str(s, f) { + return to_timestamp_nanos(ts.naive_utc()); + } } // with an explicit Z, using ' ' as a separator diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 7eeb00a8290a..30ded4d70be5 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -97,7 +97,7 @@ fn test_can_cast_types() { /// Create instances of arrays with varying types for cast tests fn get_arrays_of_all_types() -> Vec { - let tz_name = String::from("America/New_York"); + let tz_name = String::from("+08:00"); let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"]; vec![ Arc::new(BinaryArray::from(binary_data.clone())), @@ -349,7 +349,7 @@ fn create_decimal_array( // Get a selection of datatypes to try and cast to fn get_all_types() -> Vec { use DataType::*; - let tz_name = String::from("America/New_York"); + let tz_name = String::from("+08:00"); let mut types = vec![ Null, From 5b1821e0564f586f6f98e5c392a0a208890055df Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Fri, 10 Feb 2023 09:39:47 -0500 Subject: [PATCH 0593/1411] feat + fix: IPC support for run encoded array. 
(#3662) * Schema.fbs changes, flatbuffer generated code, flatbuffer gen script changes Add ipc reader, writer and equals Add/Update tests * Add support for non zero offset in run array * clippy fixes * format fix * doc fix * incorporate pr comments * fix formatting * more pr comments * pr suggestions --------- Co-authored-by: ask Co-authored-by: devx --- arrow-array/src/array/run_array.rs | 170 +++++++++++++++++---- arrow-data/src/data.rs | 13 +- arrow-data/src/equal/mod.rs | 5 +- arrow-data/src/equal/run.rs | 84 +++++++++++ arrow-ipc/regen.sh | 32 ++-- arrow-ipc/src/convert.rs | 25 +++- arrow-ipc/src/gen/Schema.rs | 120 ++++++++++++++- arrow-ipc/src/gen/SparseTensor.rs | 25 ++++ arrow-ipc/src/gen/Tensor.rs | 25 ++++ arrow-ipc/src/reader.rs | 126 ++++++++++++++-- arrow-ipc/src/writer.rs | 229 ++++++++++++++++++++++++++--- arrow-select/src/take.rs | 1 - format/Schema.fbs | 14 +- 13 files changed, 779 insertions(+), 90 deletions(-) create mode 100644 arrow-data/src/equal/run.rs diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 2e378c90fd49..33738d649f76 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -67,9 +67,9 @@ pub struct RunArray { } impl RunArray { - // calculates the logical length of the array encoded - // by the given run_ends array. - fn logical_len(run_ends: &PrimitiveArray) -> usize { + /// Calculates the logical length of the array encoded + /// by the given run_ends array. + pub fn logical_len(run_ends: &PrimitiveArray) -> usize { let len = run_ends.len(); if len == 0 { return 0; @@ -145,14 +145,15 @@ impl RunArray { } /// Returns index to the physical array for the given index to the logical array. + /// The function does not adjust the input logical index based on `ArrayData::offset`. /// Performs a binary search on the run_ends array for the input index. #[inline] - pub fn get_physical_index(&self, logical_index: usize) -> Option { - if logical_index >= self.len() { + pub fn get_zero_offset_physical_index(&self, logical_index: usize) -> Option { + if logical_index >= Self::logical_len(&self.run_ends) { return None; } let mut st: usize = 0; - let mut en: usize = self.run_ends().len(); + let mut en: usize = self.run_ends.len(); while st + 1 < en { let mid: usize = (st + en) / 2; if logical_index @@ -164,7 +165,7 @@ impl RunArray { // `en` starts with len. The condition `st + 1 < en` ensures // `st` and `en` differs atleast by two. So the value of `mid` // will never be either `st` or `en` - self.run_ends().value_unchecked(mid - 1).as_usize() + self.run_ends.value_unchecked(mid - 1).as_usize() } { en = mid @@ -175,6 +176,17 @@ impl RunArray { Some(st) } + /// Returns index to the physical array for the given index to the logical array. + /// This function adjusts the input logical index based on `ArrayData::offset` + /// Performs a binary search on the run_ends array for the input index. + #[inline] + pub fn get_physical_index(&self, logical_index: usize) -> Option { + if logical_index >= self.len() { + return None; + } + self.get_zero_offset_physical_index(logical_index + self.offset()) + } + /// Returns the physical indices of the input logical indices. Returns error if any of the logical /// index cannot be converted to physical index. The logical indices are sorted and iterated along /// with run_ends array to find matching physical index. 
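A minimal sketch of the logical-to-physical mapping described here, assuming the `Int32RunArray` alias and the `FromIterator` support used by the IPC tests later in this patch:

use arrow_array::Int32RunArray;

// "a a b b b" is stored as run_ends [2, 5] and values ["a", "b"]
let run: Int32RunArray = vec!["a", "a", "b", "b", "b"].into_iter().collect();
assert_eq!(run.get_physical_index(1), Some(0)); // falls in the "a" run
assert_eq!(run.get_physical_index(4), Some(1)); // falls in the "b" run
assert_eq!(run.get_physical_index(5), None);    // past the logical length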
The approach used here was chosen over @@ -192,6 +204,10 @@ impl RunArray { { let indices_len = logical_indices.len(); + if indices_len == 0 { + return Ok(vec![]); + } + // `ordered_indices` store index into `logical_indices` and can be used // to iterate `logical_indices` in sorted order. let mut ordered_indices: Vec = (0..indices_len).collect(); @@ -204,12 +220,30 @@ impl RunArray { .unwrap() }); + // Return early if all the logical indices cannot be converted to physical indices. + let largest_logical_index = + logical_indices[*ordered_indices.last().unwrap()].as_usize(); + if largest_logical_index >= self.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Cannot convert all logical indices to physical indices. The logical index cannot be converted is {largest_logical_index}.", + ))); + } + + // Skip some physical indices based on offset. + let skip_value = if self.offset() > 0 { + self.get_zero_offset_physical_index(self.offset()).unwrap() + } else { + 0 + }; + let mut physical_indices = vec![0; indices_len]; let mut ordered_index = 0_usize; - for (physical_index, run_end) in self.run_ends.values().iter().enumerate() { - // Get the run end index of current physical index - let run_end_value = run_end.as_usize(); + for (physical_index, run_end) in + self.run_ends.values().iter().enumerate().skip(skip_value) + { + // Get the run end index (relative to offset) of current physical index + let run_end_value = run_end.as_usize() - self.offset(); // All the `logical_indices` that are less than current run end index // belongs to current physical index. @@ -552,6 +586,34 @@ mod tests { result } + // Asserts that `logical_array[logical_indices[*]] == physical_array[physical_indices[*]]` + fn compare_logical_and_physical_indices( + logical_indices: &[u32], + logical_array: &[Option], + physical_indices: &[usize], + physical_array: &PrimitiveArray, + ) { + assert_eq!(logical_indices.len(), physical_indices.len()); + + // check value in logical index in the logical_array matches physical index in physical_array + logical_indices + .iter() + .map(|f| f.as_usize()) + .zip(physical_indices.iter()) + .for_each(|(logical_ix, physical_ix)| { + let expected = logical_array[logical_ix]; + match expected { + Some(val) => { + assert!(physical_array.is_valid(*physical_ix)); + let actual = physical_array.value(*physical_ix); + assert_eq!(val, actual); + } + None => { + assert!(physical_array.is_null(*physical_ix)) + } + }; + }); + } #[test] fn test_run_array() { // Construct a value array @@ -824,23 +886,77 @@ mod tests { assert_eq!(logical_indices.len(), physical_indices.len()); // check value in logical index in the input_array matches physical index in typed_run_array - logical_indices - .iter() - .map(|f| f.as_usize()) - .zip(physical_indices.iter()) - .for_each(|(logical_ix, physical_ix)| { - let expected = input_array[logical_ix]; - match expected { - Some(val) => { - assert!(physical_values_array.is_valid(*physical_ix)); - let actual = physical_values_array.value(*physical_ix); - assert_eq!(val, actual); - } - None => { - assert!(physical_values_array.is_null(*physical_ix)) - } - }; - }); + compare_logical_and_physical_indices( + &logical_indices, + &input_array, + &physical_indices, + physical_values_array, + ); + } + } + + #[test] + fn test_get_physical_indices_sliced() { + let total_len = 80; + let input_array = build_input_array(total_len); + + // Encode the input_array to run array + let mut builder = + PrimitiveRunBuilder::::with_capacity(input_array.len()); + 
builder.extend(input_array.iter().copied()); + let run_array = builder.finish(); + let physical_values_array = as_primitive_array::(run_array.values()); + + // test for all slice lengths. + for slice_len in 1..=total_len { + // create an array consisting of all the indices repeated twice and shuffled. + let mut logical_indices: Vec = (0_u32..(slice_len as u32)).collect(); + // add same indices once more + logical_indices.append(&mut logical_indices.clone()); + let mut rng = thread_rng(); + logical_indices.shuffle(&mut rng); + + // test for offset = 0 and slice length = slice_len + // slice the input array using which the run array was built. + let sliced_input_array = &input_array[0..slice_len]; + + // slice the run array + let sliced_run_array: RunArray = + run_array.slice(0, slice_len).into_data().into(); + + // Get physical indices. + let physical_indices = sliced_run_array + .get_physical_indices(&logical_indices) + .unwrap(); + + compare_logical_and_physical_indices( + &logical_indices, + sliced_input_array, + &physical_indices, + physical_values_array, + ); + + // test for offset = total_len - slice_len and slice length = slice_len + // slice the input array using which the run array was built. + let sliced_input_array = &input_array[total_len - slice_len..total_len]; + + // slice the run array + let sliced_run_array: RunArray = run_array + .slice(total_len - slice_len, slice_len) + .into_data() + .into(); + + // Get physical indices + let physical_indices = sliced_run_array + .get_physical_indices(&logical_indices) + .unwrap(); + + compare_logical_and_physical_indices( + &logical_indices, + sliced_input_array, + &physical_indices, + physical_values_array, + ); } } } diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 8b727ec953a9..8742f8db9490 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -1290,9 +1290,9 @@ impl ArrayData { DataType::RunEndEncoded(run_ends, _values) => { let run_ends_data = self.child_data()[0].clone(); match run_ends.data_type() { - DataType::Int16 => run_ends_data.check_run_ends::(self.len()), - DataType::Int32 => run_ends_data.check_run_ends::(self.len()), - DataType::Int64 => run_ends_data.check_run_ends::(self.len()), + DataType::Int16 => run_ends_data.check_run_ends::(), + DataType::Int32 => run_ends_data.check_run_ends::(), + DataType::Int64 => run_ends_data.check_run_ends::(), _ => unreachable!(), } } @@ -1451,7 +1451,7 @@ impl ArrayData { } /// Validates that each value in run_ends array is positive and strictly increasing. - fn check_run_ends(&self, array_len: usize) -> Result<(), ArrowError> + fn check_run_ends(&self) -> Result<(), ArrowError> where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { @@ -1478,9 +1478,10 @@ impl ArrayData { Ok(()) })?; - if prev_value.as_usize() != array_len { + if prev_value.as_usize() < (self.offset + self.len) { return Err(ArrowError::InvalidArgumentError(format!( - "The length of array does not match the last value in the run_ends array. The last value of run_ends array is {prev_value} and length of array is {array_len}." + "The offset + length of array should be less or equal to last value in the run_ends array. 
The last value of run_ends array is {prev_value} and offset + length of array is {}.", + self.offset + self.len ))); } Ok(()) diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index aff61e3d37e5..871a312ca47f 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -31,6 +31,7 @@ mod fixed_list; mod list; mod null; mod primitive; +mod run; mod structure; mod union; mod utils; @@ -50,6 +51,8 @@ use structure::struct_equal; use union::union_equal; use variable_size::variable_sized_equal; +use self::run::run_equal; + /// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively /// for `len` slots. #[inline] @@ -137,7 +140,7 @@ fn equal_values( }, DataType::Float16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Map(_, _) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::RunEndEncoded(_, _) => todo!(), + DataType::RunEndEncoded(_, _) => run_equal(lhs, rhs, lhs_start, rhs_start, len), } } diff --git a/arrow-data/src/equal/run.rs b/arrow-data/src/equal/run.rs new file mode 100644 index 000000000000..ede172c999fd --- /dev/null +++ b/arrow-data/src/equal/run.rs @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::data::ArrayData; + +use super::equal_range; + +/// The current implementation of comparison of run array support physical comparison. +/// Comparing run encoded array based on logical indices (`lhs_start`, `rhs_start`) will +/// be time consuming as converting from logical index to physical index cannot be done +/// in constant time. The current comparison compares the underlying physical arrays. +pub(super) fn run_equal( + lhs: &ArrayData, + rhs: &ArrayData, + lhs_start: usize, + rhs_start: usize, + len: usize, +) -> bool { + if lhs_start != 0 + || rhs_start != 0 + || (lhs.len() != len && rhs.len() != len) + || lhs.offset() > 0 + || rhs.offset() > 0 + { + unimplemented!("Logical comparison for run array not supported.") + } + + if lhs.len() != rhs.len() { + return false; + } + + let lhs_run_ends_array = lhs.child_data().get(0).unwrap(); + let lhs_values_array = lhs.child_data().get(1).unwrap(); + + let rhs_run_ends_array = rhs.child_data().get(0).unwrap(); + let rhs_values_array = rhs.child_data().get(1).unwrap(); + + if lhs_run_ends_array.len() != rhs_run_ends_array.len() { + return false; + } + + if lhs_values_array.len() != rhs_values_array.len() { + return false; + } + + // check run ends array are equal. The length of the physical array + // is used to validate the child arrays. 
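A minimal sketch of what physical comparison means for callers, assuming `RunArray::try_new(&run_ends, &values)` from the `arrow_array` crate: two arrays with identical logical values but different run lengths do not compare equal.

use arrow_array::types::Int32Type;
use arrow_array::{Array, Int32Array, RunArray, StringArray};

// Both arrays are logically ["a", "a", "a"], but encoded with different runs
let a = RunArray::<Int32Type>::try_new(
    &Int32Array::from(vec![3]),
    &StringArray::from(vec!["a"]),
)
.unwrap();
let b = RunArray::<Int32Type>::try_new(
    &Int32Array::from(vec![1, 3]),
    &StringArray::from(vec!["a", "a"]),
)
.unwrap();
assert_ne!(a.into_data(), b.into_data()); // physical encodings differ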
+ let run_ends_equal = equal_range( + lhs_run_ends_array, + rhs_run_ends_array, + lhs_start, + rhs_start, + lhs_run_ends_array.len(), + ); + + // if run ends array are not the same return early without validating + // values array. + if !run_ends_equal { + return false; + } + + // check values array are equal + equal_range( + lhs_values_array, + rhs_values_array, + lhs_start, + rhs_start, + rhs_values_array.len(), + ) +} diff --git a/arrow-ipc/regen.sh b/arrow-ipc/regen.sh index 9d384b6b63b6..8d8862ccc7f4 100755 --- a/arrow-ipc/regen.sh +++ b/arrow-ipc/regen.sh @@ -18,15 +18,13 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -# Change to the toplevel Rust directory -pushd $DIR/../../ +# Change to the toplevel `arrow-rs` directory +pushd $DIR/../ echo "Build flatc from source ..." FB_URL="https://github.com/google/flatbuffers" -# https://github.com/google/flatbuffers/pull/6393 -FB_COMMIT="408cf5802415e1dea65fef7489a6c2f3740fb381" -FB_DIR="rust/arrow/.flatbuffers" +FB_DIR="arrow/.flatbuffers" FLATC="$FB_DIR/bazel-bin/flatc" if [ -z $(which bazel) ]; then @@ -44,28 +42,21 @@ else git -C $FB_DIR pull fi -echo "hard reset to $FB_COMMIT" -git -C $FB_DIR reset --hard $FB_COMMIT - pushd $FB_DIR echo "run: bazel build :flatc ..." bazel build :flatc popd -FB_PATCH="rust/arrow/format-0ed34c83.patch" -echo "Patch flatbuffer files with ${FB_PATCH} for cargo doc" -echo "NOTE: the patch MAY need update in case of changes in format/*.fbs" -git apply --check ${FB_PATCH} && git apply ${FB_PATCH} # Execute the code generation: -$FLATC --filename-suffix "" --rust -o rust/arrow/src/ipc/gen/ format/*.fbs +$FLATC --filename-suffix "" --rust -o arrow-ipc/src/gen/ format/*.fbs # Reset changes to format/ git checkout -- format # Now the files are wrongly named so we have to change that. popd -pushd $DIR/src/ipc/gen +pushd $DIR/src/gen PREFIX=$(cat <<'HEREDOC' // Licensed to the Apache Software Foundation (ASF) under one @@ -94,9 +85,9 @@ use flatbuffers::EndianScalar; HEREDOC ) -SCHEMA_IMPORT="\nuse crate::ipc::gen::Schema::*;" -SPARSE_TENSOR_IMPORT="\nuse crate::ipc::gen::SparseTensor::*;" -TENSOR_IMPORT="\nuse crate::ipc::gen::Tensor::*;" +SCHEMA_IMPORT="\nuse crate::gen::Schema::*;" +SPARSE_TENSOR_IMPORT="\nuse crate::gen::SparseTensor::*;" +TENSOR_IMPORT="\nuse crate::gen::Tensor::*;" # For flatbuffer(1.12.0+), remove: use crate::${name}::\*; names=("File" "Message" "Schema" "SparseTensor" "Tensor") @@ -119,8 +110,9 @@ for f in `ls *.rs`; do sed -i '' '/} \/\/ pub mod arrow/d' $f sed -i '' '/} \/\/ pub mod apache/d' $f sed -i '' '/} \/\/ pub mod org/d' $f - sed -i '' '/use std::mem;/d' $f - sed -i '' '/use std::cmp::Ordering;/d' $f + sed -i '' '/use core::mem;/d' $f + sed -i '' '/use core::cmp::Ordering;/d' $f + sed -i '' '/use self::flatbuffers::{EndianScalar, Follow};/d' $f # required by flatc 1.12.0+ sed -i '' "/\#\!\[allow(unused_imports, dead_code)\]/d" $f @@ -150,7 +142,7 @@ done # Return back to base directory popd -cargo +stable fmt -- src/ipc/gen/* +cargo +stable fmt -- src/gen/* echo "DONE!" 
echo "Please run 'cargo doc' and 'cargo test' with nightly and stable, " diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index c5681b0c8f1b..aede8a448a06 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -364,6 +364,18 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat DataType::Struct(fields) } + crate::Type::RunEndEncoded => { + let children = field.children().unwrap(); + if children.len() != 2 { + panic!( + "RunEndEncoded type should have exactly two children. Found {}", + children.len() + ) + } + let run_ends_field = children.get(0).into(); + let values_field = children.get(1).into(); + DataType::RunEndEncoded(Box::new(run_ends_field), Box::new(values_field)) + } crate::Type::Map => { let map = field.type_as_map().unwrap(); let children = field.children().unwrap(); @@ -710,7 +722,18 @@ pub(crate) fn get_fb_field_type<'a>( children: Some(fbb.create_vector(&children[..])), } } - RunEndEncoded(_, _) => todo!(), + RunEndEncoded(run_ends, values) => { + let run_ends_field = build_field(fbb, run_ends); + let values_field = build_field(fbb, values); + let children = vec![run_ends_field, values_field]; + FBFieldType { + type_type: crate::Type::RunEndEncoded, + type_: crate::RunEndEncodedBuilder::new(fbb) + .finish() + .as_union_value(), + children: Some(fbb.create_vector(&children[..])), + } + } Map(map_field, keys_sorted) => { let child = build_field(fbb, map_field); let mut field_type = crate::MapBuilder::new(fbb); diff --git a/arrow-ipc/src/gen/Schema.rs b/arrow-ipc/src/gen/Schema.rs index 6479bece7213..cf3ea0bd4abd 100644 --- a/arrow-ipc/src/gen/Schema.rs +++ b/arrow-ipc/src/gen/Schema.rs @@ -735,13 +735,13 @@ pub const ENUM_MIN_TYPE: u8 = 0; since = "2.0.0", note = "Use associated constants instead. This will no longer be generated in 2021." )] -pub const ENUM_MAX_TYPE: u8 = 21; +pub const ENUM_MAX_TYPE: u8 = 22; #[deprecated( since = "2.0.0", note = "Use associated constants instead. This will no longer be generated in 2021." )] #[allow(non_camel_case_types)] -pub const ENUM_VALUES_TYPE: [Type; 22] = [ +pub const ENUM_VALUES_TYPE: [Type; 23] = [ Type::NONE, Type::Null, Type::Int, @@ -764,6 +764,7 @@ pub const ENUM_VALUES_TYPE: [Type; 22] = [ Type::LargeBinary, Type::LargeUtf8, Type::LargeList, + Type::RunEndEncoded, ]; /// ---------------------------------------------------------------------- @@ -796,9 +797,10 @@ impl Type { pub const LargeBinary: Self = Self(19); pub const LargeUtf8: Self = Self(20); pub const LargeList: Self = Self(21); + pub const RunEndEncoded: Self = Self(22); pub const ENUM_MIN: u8 = 0; - pub const ENUM_MAX: u8 = 21; + pub const ENUM_MAX: u8 = 22; pub const ENUM_VALUES: &'static [Self] = &[ Self::NONE, Self::Null, @@ -822,6 +824,7 @@ impl Type { Self::LargeBinary, Self::LargeUtf8, Self::LargeList, + Self::RunEndEncoded, ]; /// Returns the variant's name or "" if unknown. pub fn variant_name(self) -> Option<&'static str> { @@ -848,6 +851,7 @@ impl Type { Self::LargeBinary => Some("LargeBinary"), Self::LargeUtf8 => Some("LargeUtf8"), Self::LargeList => Some("LargeList"), + Self::RunEndEncoded => Some("RunEndEncoded"), _ => None, } } @@ -2646,6 +2650,90 @@ impl core::fmt::Debug for Bool<'_> { ds.finish() } } +pub enum RunEndEncodedOffset {} +#[derive(Copy, Clone, PartialEq)] + +/// Contains two child arrays, run_ends and values. 
+/// The run_ends child array must be a 16/32/64-bit integer array +/// which encodes the indices at which the run with the value in +/// each corresponding index in the values child array ends. +/// Like list/struct types, the value array can be of any type. +pub struct RunEndEncoded<'a> { + pub _tab: flatbuffers::Table<'a>, +} + +impl<'a> flatbuffers::Follow<'a> for RunEndEncoded<'a> { + type Inner = RunEndEncoded<'a>; + #[inline] + unsafe fn follow(buf: &'a [u8], loc: usize) -> Self::Inner { + Self { + _tab: flatbuffers::Table::new(buf, loc), + } + } +} + +impl<'a> RunEndEncoded<'a> { + #[inline] + pub unsafe fn init_from_table(table: flatbuffers::Table<'a>) -> Self { + RunEndEncoded { _tab: table } + } + #[allow(unused_mut)] + pub fn create<'bldr: 'args, 'args: 'mut_bldr, 'mut_bldr>( + _fbb: &'mut_bldr mut flatbuffers::FlatBufferBuilder<'bldr>, + _args: &'args RunEndEncodedArgs, + ) -> flatbuffers::WIPOffset> { + let mut builder = RunEndEncodedBuilder::new(_fbb); + builder.finish() + } +} + +impl flatbuffers::Verifiable for RunEndEncoded<'_> { + #[inline] + fn run_verifier( + v: &mut flatbuffers::Verifier, + pos: usize, + ) -> Result<(), flatbuffers::InvalidFlatbuffer> { + use flatbuffers::Verifiable; + v.visit_table(pos)?.finish(); + Ok(()) + } +} +pub struct RunEndEncodedArgs {} +impl<'a> Default for RunEndEncodedArgs { + #[inline] + fn default() -> Self { + RunEndEncodedArgs {} + } +} + +pub struct RunEndEncodedBuilder<'a: 'b, 'b> { + fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, + start_: flatbuffers::WIPOffset, +} +impl<'a: 'b, 'b> RunEndEncodedBuilder<'a, 'b> { + #[inline] + pub fn new( + _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, + ) -> RunEndEncodedBuilder<'a, 'b> { + let start = _fbb.start_table(); + RunEndEncodedBuilder { + fbb_: _fbb, + start_: start, + } + } + #[inline] + pub fn finish(self) -> flatbuffers::WIPOffset> { + let o = self.fbb_.end_table(self.start_); + flatbuffers::WIPOffset::new(o.value()) + } +} + +impl core::fmt::Debug for RunEndEncoded<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let mut ds = f.debug_struct("RunEndEncoded"); + ds.finish() + } +} pub enum DecimalOffset {} #[derive(Copy, Clone, PartialEq)] @@ -4316,6 +4404,21 @@ impl<'a> Field<'a> { None } } + + #[inline] + #[allow(non_snake_case)] + pub fn type_as_run_end_encoded(&self) -> Option> { + if self.type_type() == Type::RunEndEncoded { + self.type_().map(|t| { + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + unsafe { RunEndEncoded::init_from_table(t) } + }) + } else { + None + } + } } impl flatbuffers::Verifiable for Field<'_> { @@ -4351,6 +4454,7 @@ impl flatbuffers::Verifiable for Field<'_> { Type::LargeBinary => v.verify_union_variant::>("Type::LargeBinary", pos), Type::LargeUtf8 => v.verify_union_variant::>("Type::LargeUtf8", pos), Type::LargeList => v.verify_union_variant::>("Type::LargeList", pos), + Type::RunEndEncoded => v.verify_union_variant::>("Type::RunEndEncoded", pos), _ => Ok(()), } })? 
@@ -4686,6 +4790,16 @@ impl core::fmt::Debug for Field<'_> { ) } } + Type::RunEndEncoded => { + if let Some(x) = self.type_as_run_end_encoded() { + ds.field("type_", &x) + } else { + ds.field( + "type_", + &"InvalidFlatbuffer: Union discriminant does not match value.", + ) + } + } _ => { let x: Option<()> = None; ds.field("type_", &x) diff --git a/arrow-ipc/src/gen/SparseTensor.rs b/arrow-ipc/src/gen/SparseTensor.rs index c5e06c30e03e..83fed4873b62 100644 --- a/arrow-ipc/src/gen/SparseTensor.rs +++ b/arrow-ipc/src/gen/SparseTensor.rs @@ -1524,6 +1524,20 @@ impl<'a> SparseTensor<'a> { } } + #[inline] + #[allow(non_snake_case)] + pub fn type_as_run_end_encoded(&self) -> Option> { + if self.type_type() == Type::RunEndEncoded { + let u = self.type_(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { RunEndEncoded::init_from_table(u) }) + } else { + None + } + } + #[inline] #[allow(non_snake_case)] pub fn sparseIndex_as_sparse_tensor_index_coo( @@ -1604,6 +1618,7 @@ impl flatbuffers::Verifiable for SparseTensor<'_> { Type::LargeBinary => v.verify_union_variant::>("Type::LargeBinary", pos), Type::LargeUtf8 => v.verify_union_variant::>("Type::LargeUtf8", pos), Type::LargeList => v.verify_union_variant::>("Type::LargeList", pos), + Type::RunEndEncoded => v.verify_union_variant::>("Type::RunEndEncoded", pos), _ => Ok(()), } })? @@ -1943,6 +1958,16 @@ impl core::fmt::Debug for SparseTensor<'_> { ) } } + Type::RunEndEncoded => { + if let Some(x) = self.type_as_run_end_encoded() { + ds.field("type_", &x) + } else { + ds.field( + "type_", + &"InvalidFlatbuffer: Union discriminant does not match value.", + ) + } + } _ => { let x: Option<()> = None; ds.field("type_", &x) diff --git a/arrow-ipc/src/gen/Tensor.rs b/arrow-ipc/src/gen/Tensor.rs index 954ebd29012b..43133fec036d 100644 --- a/arrow-ipc/src/gen/Tensor.rs +++ b/arrow-ipc/src/gen/Tensor.rs @@ -565,6 +565,20 @@ impl<'a> Tensor<'a> { None } } + + #[inline] + #[allow(non_snake_case)] + pub fn type_as_run_end_encoded(&self) -> Option> { + if self.type_type() == Type::RunEndEncoded { + let u = self.type_(); + // Safety: + // Created from a valid Table for this object + // Which contains a valid union in this slot + Some(unsafe { RunEndEncoded::init_from_table(u) }) + } else { + None + } + } } impl flatbuffers::Verifiable for Tensor<'_> { @@ -598,6 +612,7 @@ impl flatbuffers::Verifiable for Tensor<'_> { Type::LargeBinary => v.verify_union_variant::>("Type::LargeBinary", pos), Type::LargeUtf8 => v.verify_union_variant::>("Type::LargeUtf8", pos), Type::LargeList => v.verify_union_variant::>("Type::LargeList", pos), + Type::RunEndEncoded => v.verify_union_variant::>("Type::RunEndEncoded", pos), _ => Ok(()), } })? 
@@ -907,6 +922,16 @@ impl core::fmt::Debug for Tensor<'_> { ) } } + Type::RunEndEncoded => { + if let Some(x) = self.type_as_run_end_encoded() { + ds.field("type_", &x) + } else { + ds.field( + "type_", + &"InvalidFlatbuffer: Union discriminant does not match value.", + ) + } + } _ => { let x: Option<()> = None; ds.field("type_", &x) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 17f521e423a4..6842474fb4e2 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -194,6 +194,50 @@ fn create_array( }; Arc::new(struct_array) } + RunEndEncoded(run_ends_field, values_field) => { + let run_node = nodes.get(node_index); + node_index += 1; + + let run_ends_triple = create_array( + nodes, + run_ends_field, + data, + buffers, + dictionaries_by_id, + node_index, + buffer_index, + compression_codec, + metadata, + )?; + node_index = run_ends_triple.1; + buffer_index = run_ends_triple.2; + + let values_triple = create_array( + nodes, + values_field, + data, + buffers, + dictionaries_by_id, + node_index, + buffer_index, + compression_codec, + metadata, + )?; + node_index = values_triple.1; + buffer_index = values_triple.2; + + let run_array_length = run_node.length() as usize; + let run_array_null_count = run_node.null_count() as usize; + let data = ArrayData::builder(data_type.clone()) + .len(run_array_length) + .null_count(run_array_null_count) + .offset(0) + .add_child_data(run_ends_triple.0.into_data()) + .add_child_data(values_triple.0.into_data()) + .build()?; + + make_array(data) + } // Create dictionary array from RecordBatch Dictionary(_, _) => { let index_node = nodes.get(node_index); @@ -361,6 +405,17 @@ fn skip_field( buffer_index = tuple.1; } } + RunEndEncoded(run_ends_field, values_field) => { + node_index += 1; + + let tuple = skip_field(run_ends_field.data_type(), node_index, buffer_index)?; + node_index = tuple.0; + buffer_index = tuple.1; + + let tuple = skip_field(values_field.data_type(), node_index, buffer_index)?; + node_index = tuple.0; + buffer_index = tuple.1; + } Dictionary(_, _) => { node_index += 1; buffer_index += 2; @@ -1189,9 +1244,11 @@ impl RecordBatchReader for StreamReader { #[cfg(test)] mod tests { + use crate::writer::unslice_run_array; + use super::*; - use arrow_array::builder::UnionBuilder; + use arrow_array::builder::{PrimitiveRunBuilder, UnionBuilder}; use arrow_array::types::*; use arrow_buffer::ArrowNativeType; use arrow_data::ArrayDataBuilder; @@ -1227,6 +1284,11 @@ mod tests { ]; let struct_data_type = DataType::Struct(struct_fields); + let run_encoded_data_type = DataType::RunEndEncoded( + Box::new(Field::new("run_ends", DataType::Int16, false)), + Box::new(Field::new("values", DataType::Int32, true)), + ); + // define schema Schema::new(vec![ Field::new("f0", DataType::UInt32, false), @@ -1239,9 +1301,10 @@ mod tests { Field::new("f7", DataType::FixedSizeBinary(3), true), Field::new("f8", fixed_size_list_data_type, false), Field::new("f9", struct_data_type, false), - Field::new("f10", DataType::Boolean, false), - Field::new("f11", dict_data_type, false), - Field::new("f12", DataType::Utf8, false), + Field::new("f10", run_encoded_data_type, false), + Field::new("f11", DataType::Boolean, false), + Field::new("f12", dict_data_type, false), + Field::new("f13", DataType::Utf8, false), ]) } @@ -1296,14 +1359,19 @@ mod tests { .unwrap(); let array9: ArrayRef = Arc::new(StructArray::from(array9)); - let array10 = BooleanArray::from(vec![false, false, true]); + let array10_input = vec![Some(1_i32), None, None]; + let mut 
array10_builder = PrimitiveRunBuilder::<Int16Type, Int32Type>::new(); + array10_builder.extend(array10_input.into_iter()); + let array10 = array10_builder.finish(); + + let array11 = BooleanArray::from(vec![false, false, true]); - let array11_values = StringArray::from(vec!["x", "yy", "zzz"]); - let array11_keys = Int8Array::from_iter_values([1, 1, 2]); - let array11 = - DictionaryArray::<Int8Type>::try_new(&array11_keys, &array11_values).unwrap(); + let array12_values = StringArray::from(vec!["x", "yy", "zzz"]); + let array12_keys = Int8Array::from_iter_values([1, 1, 2]); + let array12 = + DictionaryArray::<Int8Type>::try_new(&array12_keys, &array12_values).unwrap(); - let array12 = StringArray::from(vec!["a", "bb", "ccc"]); + let array13 = StringArray::from(vec!["a", "bb", "ccc"]); // create record batch RecordBatch::try_new( @@ -1322,6 +1390,7 @@ mod tests { Arc::new(array10), Arc::new(array11), Arc::new(array12), + Arc::new(array13), ], ) .unwrap() @@ -1510,6 +1579,43 @@ mod tests { check_union_with_builder(UnionBuilder::new_sparse()); } + #[test] + fn test_roundtrip_stream_run_array_sliced() { + let run_array_1: Int32RunArray = vec!["a", "a", "a", "b", "b", "c", "c", "c"] + .into_iter() + .collect(); + let run_array_1_sliced = run_array_1.slice(2, 5); + + let run_array_2_input = vec![Some(1_i32), None, None, Some(2), Some(2)]; + let mut run_array_2_builder = PrimitiveRunBuilder::<Int16Type, Int32Type>::new(); + run_array_2_builder.extend(run_array_2_input.into_iter()); + let run_array_2 = run_array_2_builder.finish(); + + let schema = Arc::new(Schema::new(vec![ + Field::new( + "run_array_1_sliced", + run_array_1_sliced.data_type().clone(), + false, + ), + Field::new("run_array_2", run_array_2.data_type().clone(), false), + ])); + let input_batch = RecordBatch::try_new( + schema, + vec![Arc::new(run_array_1_sliced.clone()), Arc::new(run_array_2)], + ) + .unwrap(); + let output_batch = roundtrip_ipc_stream(&input_batch); + + // As partial comparison is not yet supported for run arrays, the sliced run array + // has to be unsliced before comparing with the output. The second run array + // can be compared directly. + assert_eq!(input_batch.column(1), output_batch.column(1)); + + let run_array_1_unsliced = + unslice_run_array(run_array_1_sliced.into_data()).unwrap(); + assert_eq!(run_array_1_unsliced, output_batch.column(0).into_data()); + } + #[test] fn test_roundtrip_stream_nested_dict() { let xs = vec!["AA", "BB", "AA", "CC", "BB"]; diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 8835cb49ffce..f019340154ac 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -24,14 +24,15 @@ use std::cmp::min; use std::collections::HashMap; use std::io::{BufWriter, Write}; +use arrow_array::types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}; use flatbuffers::FlatBufferBuilder; use arrow_array::builder::BufferBuilder; use arrow_array::cast::*; use arrow_array::*; use arrow_buffer::bit_util; -use arrow_buffer::{Buffer, MutableBuffer}; -use arrow_data::{layout, ArrayData, BufferSpec}; +use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; +use arrow_data::{layout, ArrayData, ArrayDataBuilder, BufferSpec}; use arrow_schema::*; use crate::compression::CompressionCodec; @@ -218,6 +219,24 @@ impl IpcDataGenerator { )?; } } + DataType::RunEndEncoded(_, values) => { + if column.data().child_data().len() != 2 { + return Err(ArrowError::InvalidArgumentError(format!( + "The run encoded array should have exactly two child arrays. 
Found {}", + column.data().child_data().len() + ))); + } + // The run_ends array is not expected to be dictionary encoded. Hence encode dictionaries + // only for the values array. + let values_array = make_array(column.data().child_data()[1].clone()); + self.encode_dictionaries( + values, + &values_array, + encoded_dictionaries, + dictionary_tracker, + write_options, + )?; + } DataType::List(field) => { let list = as_list_array(column); self.encode_dictionaries( field, list.values(), encoded_dictionaries, dictionary_tracker, write_options, )?; @@ -533,6 +552,94 @@ impl IpcDataGenerator { } } +pub(crate) fn unslice_run_array(arr: ArrayData) -> Result<ArrayData, ArrowError> { + match arr.data_type() { + DataType::RunEndEncoded(k, _) => match k.data_type() { + DataType::Int16 => Ok(into_zero_offset_run_array( + RunArray::<Int16Type>::from(arr), + )? + .into_data()), + DataType::Int32 => Ok(into_zero_offset_run_array( + RunArray::<Int32Type>::from(arr), + )? + .into_data()), + DataType::Int64 => Ok(into_zero_offset_run_array( + RunArray::<Int64Type>::from(arr), + )? + .into_data()), + d => unreachable!("Unexpected data type {d}"), + }, + d => Err(ArrowError::InvalidArgumentError(format!( + "The given array is not a run array. Data type of given array: {d}" + ))), + } +} + +// Returns a `RunArray` with zero offset and length matching the last value +// in run_ends array. +fn into_zero_offset_run_array<R: RunEndIndexType>( + run_array: RunArray<R>, +) -> Result<RunArray<R>, ArrowError> { + if run_array.offset() == 0 + && run_array.len() == RunArray::<R>::logical_len(run_array.run_ends()) + { + return Ok(run_array); + } + // The physical index of the original run_ends array from which the `ArrayData` is sliced. + let start_physical_index = run_array + .get_zero_offset_physical_index(run_array.offset()) + .unwrap(); + + // The logical length of the original run_ends array until which the `ArrayData` is sliced. + let end_logical_index = run_array.offset() + run_array.len() - 1; + // The physical index of the original run_ends array until which the `ArrayData` is sliced. + let end_physical_index = run_array + .get_zero_offset_physical_index(end_logical_index) + .unwrap(); + + let physical_length = end_physical_index - start_physical_index + 1; + + // build new run_ends array by subtracting offset from run ends. + let mut builder = BufferBuilder::<R::Native>::new(physical_length); + for ix in start_physical_index..end_physical_index { + let run_end_value = unsafe { + // Safety: + // start_physical_index and end_physical_index are within + // run_ends array bounds. + run_array.run_ends().value_unchecked(ix).as_usize() + }; + let run_end_value = run_end_value - run_array.offset(); + builder.append(R::Native::from_usize(run_end_value).unwrap()); + } + builder.append(R::Native::from_usize(run_array.len()).unwrap()); + let new_run_ends = unsafe { + // Safety: + // The function builds a valid run_ends array and hence need not be validated. + ArrayDataBuilder::new(run_array.run_ends().data_type().clone()) + .len(physical_length) + .null_count(0) + .add_buffer(builder.finish()) + .build_unchecked() + }; + + // build new values by slicing physical indices. + let new_values = run_array + .values() + .slice(start_physical_index, physical_length) + .into_data(); + + let builder = ArrayDataBuilder::new(run_array.data_type().clone()) + .len(run_array.len()) + .add_child_data(new_run_ends) + .add_child_data(new_values); + let array_data = unsafe { + // Safety: + // This function builds a valid run array and hence can skip validation. + builder.build_unchecked() + }; + Ok(array_data.into()) +} + /// Keeps track of dictionaries that have been written, to avoid emitting the same dictionary /// multiple times.
Can optionally error if an update to an existing dictionary is attempted, which /// isn't allowed in the `FileWriter`. @@ -968,11 +1075,15 @@ fn write_continuation( /// In V4, null types have no validity bitmap /// In V5 and later, null and union types have no validity bitmap +/// Run end encoded type has no validity bitmap. fn has_validity_bitmap(data_type: &DataType, write_options: &IpcWriteOptions) -> bool { if write_options.metadata_version < crate::MetadataVersion::V5 { !matches!(data_type, DataType::Null) } else { - !matches!(data_type, DataType::Null | DataType::Union(_, _, _)) + !matches!( + data_type, + DataType::Null | DataType::Union(_, _, _) | DataType::RunEndEncoded(_, _) + ) } } @@ -1242,24 +1353,45 @@ fn write_array_data( } } - if !matches!(array_data.data_type(), DataType::Dictionary(_, _)) { - // recursively write out nested structures - for data_ref in array_data.child_data() { - // write the nested data (e.g list data) - offset = write_array_data( - data_ref, - buffers, - arrow_data, - nodes, - offset, - data_ref.len(), - data_ref.null_count(), - compression_codec, - write_options, - )?; + match array_data.data_type() { + DataType::Dictionary(_, _) => {} + DataType::RunEndEncoded(_, _) => { + // unslice the run encoded array. + let arr = unslice_run_array(array_data.clone())?; + // recursively write out nested structures + for data_ref in arr.child_data() { + // write the nested data (e.g list data) + offset = write_array_data( + data_ref, + buffers, + arrow_data, + nodes, + offset, + data_ref.len(), + data_ref.null_count(), + compression_codec, + write_options, + )?; + } + } + _ => { + // recursively write out nested structures + for data_ref in array_data.child_data() { + // write the nested data (e.g list data) + offset = write_array_data( + data_ref, + buffers, + arrow_data, + nodes, + offset, + data_ref.len(), + data_ref.null_count(), + compression_codec, + write_options, + )?; + } } } - Ok(offset) } @@ -1322,6 +1454,7 @@ mod tests { use crate::MetadataVersion; use crate::reader::*; + use arrow_array::builder::PrimitiveRunBuilder; use arrow_array::builder::UnionBuilder; use arrow_array::types::*; use arrow_schema::DataType; @@ -1992,4 +2125,62 @@ mod tests { let batch2 = reader.next().unwrap().unwrap(); assert_eq!(batch, batch2); } + + #[test] + fn test_run_array_unslice() { + let total_len = 80; + let vals: Vec> = + vec![Some(1), None, Some(2), Some(3), Some(4), None, Some(5)]; + let repeats: Vec = vec![3, 4, 1, 2]; + let mut input_array: Vec> = Vec::with_capacity(total_len); + for ix in 0_usize..32 { + let repeat: usize = repeats[ix % repeats.len()]; + let val: Option = vals[ix % vals.len()]; + input_array.resize(input_array.len() + repeat, val); + } + + // Encode the input_array to run array + let mut builder = + PrimitiveRunBuilder::::with_capacity(input_array.len()); + builder.extend(input_array.iter().copied()); + let run_array = builder.finish(); + + // test for all slice lengths. + for slice_len in 1..=total_len { + // test for offset = 0, slice length = slice_len + let sliced_run_array: RunArray = + run_array.slice(0, slice_len).into_data().into(); + + // Create unsliced run array. 
+ let unsliced_run_array = + into_zero_offset_run_array(sliced_run_array).unwrap(); + let typed = unsliced_run_array + .downcast::>() + .unwrap(); + let expected: Vec> = + input_array.iter().take(slice_len).copied().collect(); + let actual: Vec> = typed.into_iter().collect(); + assert_eq!(expected, actual); + + // test for offset = total_len - slice_len, length = slice_len + let sliced_run_array: RunArray = run_array + .slice(total_len - slice_len, slice_len) + .into_data() + .into(); + + // Create unsliced run array. + let unsliced_run_array = + into_zero_offset_run_array(sliced_run_array).unwrap(); + let typed = unsliced_run_array + .downcast::>() + .unwrap(); + let expected: Vec> = input_array + .iter() + .skip(total_len - slice_len) + .copied() + .collect(); + let actual: Vec> = typed.into_iter().collect(); + assert_eq!(expected, actual); + } + } } diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index f8668b56e1d6..f8383bbe3d2f 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -832,7 +832,6 @@ macro_rules! primitive_run_take { /// for e.g. an input `RunArray{ run_ends = [2,4,6,8], values=[1,2,1,2] }` and `indices=[2,7]` /// would be converted to `physical_indices=[1,3]` which will be used to build /// output `RunArray{ run_ends=[2], values=[2] }` - fn take_run( run_array: &RunArray, logical_indices: &PrimitiveArray, diff --git a/format/Schema.fbs b/format/Schema.fbs index 7ee827b5de8d..6337f72ec9de 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -19,8 +19,9 @@ /// Format Version History. /// Version 1.0 - Forward and backwards compatibility guaranteed. -/// Version 1.1 - Add Decimal256 (No format release). -/// Version 1.2 (Pending)- Add Interval MONTH_DAY_NANO +/// Version 1.1 - Add Decimal256. +/// Version 1.2 - Add Interval MONTH_DAY_NANO +/// Version 1.3 - Add Run-End Encoded. namespace org.apache.arrow.flatbuf; @@ -178,6 +179,14 @@ table FixedSizeBinary { table Bool { } +/// Contains two child arrays, run_ends and values. +/// The run_ends child array must be a 16/32/64-bit integer array +/// which encodes the indices at which the run with the value in +/// each corresponding index in the values child array ends. +/// Like list/struct types, the value array can be of any type. +table RunEndEncoded { +} + /// Exact decimal value represented as an integer value in two's /// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers /// are used. 
The representation uses the endianness indicated @@ -417,6 +426,7 @@ union Type { LargeBinary, LargeUtf8, LargeList, + RunEndEncoded, } /// ---------------------------------------------------------------------- From 3e08a754ff08ebe51086b4f65beefb99769b4068 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 10 Feb 2023 15:03:02 +0000 Subject: [PATCH 0594/1411] Add CSV Decoder::capacity (#3674) (#3677) * Add CSV Decoder::capacity (#3674) * Add test * Remove unnecessary extern * Add docs --- arrow-csv/src/reader/mod.rs | 83 ++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 925f504495d5..610f05155b52 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -438,10 +438,15 @@ impl BufReader { loop { let buf = self.reader.fill_buf()?; let decoded = self.decoder.decode(buf)?; - if decoded == 0 { + self.reader.consume(decoded); + // Yield if decoded no bytes or the decoder is full + // + // The capacity check avoids looping around and potentially + // blocking reading data in fill_buf that isn't needed + // to flush the next batch + if decoded == 0 || self.decoder.capacity() == 0 { break; } - self.reader.consume(decoded); } self.decoder.flush() @@ -574,6 +579,11 @@ impl Decoder { self.line_number += rows.len(); Ok(Some(batch)) } + + /// Returns the number of records that can be read before requiring a call to [`Self::flush`] + pub fn capacity(&self) -> usize { + self.batch_size - self.record_decoder.len() + } } /// Parses a slice of [`StringRecords`] into a [RecordBatch] @@ -2269,4 +2279,73 @@ mod tests { "Csv error: Encountered invalid UTF-8 data for line 1 and field 1", ); } + + struct InstrumentedRead { + r: R, + fill_count: usize, + fill_sizes: Vec, + } + + impl InstrumentedRead { + fn new(r: R) -> Self { + Self { + r, + fill_count: 0, + fill_sizes: vec![], + } + } + } + + impl Seek for InstrumentedRead { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + self.r.seek(pos) + } + } + + impl Read for InstrumentedRead { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.r.read(buf) + } + } + + impl BufRead for InstrumentedRead { + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + self.fill_count += 1; + let buf = self.r.fill_buf()?; + self.fill_sizes.push(buf.len()); + Ok(buf) + } + + fn consume(&mut self, amt: usize) { + self.r.consume(amt) + } + } + + #[test] + fn test_io() { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + ])); + let csv = "foo,bar\nbaz,foo\na,b\nc,d"; + let mut read = InstrumentedRead::new(Cursor::new(csv.as_bytes())); + let reader = ReaderBuilder::new() + .with_schema(schema) + .with_batch_size(3) + .build_buffered(&mut read) + .unwrap(); + + let batches = reader.collect::, _>>().unwrap(); + assert_eq!(batches.len(), 2); + assert_eq!(batches[0].num_rows(), 3); + assert_eq!(batches[1].num_rows(), 1); + + // Expect 4 calls to fill_buf + // 1. Read first 3 rows + // 2. Read final row + // 3. Delimit and flush final row + // 4. 
Iterator finished + assert_eq!(&read.fill_sizes, &[23, 3, 0, 0]); + assert_eq!(read.fill_count, 4); + } } From 560ebaa39cfa3fc4444eef56bd1b640a46f1f260 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner <14581281+iajoiner@users.noreply.github.com> Date: Fri, 10 Feb 2023 10:13:28 -0500 Subject: [PATCH 0595/1411] Update to 33.0.0 and update changelog (#3686) * Update changelog * Update Version --- CHANGELOG-old.md | 75 +++++++++++++ CHANGELOG.md | 108 +++++++++---------- arrow-arith/Cargo.toml | 10 +- arrow-array/Cargo.toml | 8 +- arrow-buffer/Cargo.toml | 2 +- arrow-cast/Cargo.toml | 12 +-- arrow-csv/Cargo.toml | 12 +-- arrow-data/Cargo.toml | 6 +- arrow-flight/Cargo.toml | 14 +-- arrow-flight/README.md | 2 +- arrow-integration-test/Cargo.toml | 6 +- arrow-integration-testing/Cargo.toml | 2 +- arrow-ipc/Cargo.toml | 12 +-- arrow-json/Cargo.toml | 12 +-- arrow-ord/Cargo.toml | 12 +-- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow-row/Cargo.toml | 14 +-- arrow-schema/Cargo.toml | 2 +- arrow-select/Cargo.toml | 10 +- arrow-string/Cargo.toml | 12 +-- arrow/Cargo.toml | 28 ++--- arrow/README.md | 2 +- dev/release/README.md | 2 +- dev/release/update_change_log.sh | 4 +- parquet/Cargo.toml | 20 ++-- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 28 files changed, 235 insertions(+), 170 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 65a95579e9f8..9ac8cb530456 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,81 @@ # Historical Changelog +## [32.0.0](https://github.com/apache/arrow-rs/tree/32.0.0) (2023-01-27) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/31.0.0...32.0.0) + +**Breaking changes:** + +- Allow `StringArray` construction with `Vec>` [\#3602](https://github.com/apache/arrow-rs/pull/3602) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sinistersnare](https://github.com/sinistersnare)) +- Use native types in PageIndex \(\#3575\) [\#3578](https://github.com/apache/arrow-rs/pull/3578) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add external variant to ParquetError \(\#3285\) [\#3574](https://github.com/apache/arrow-rs/pull/3574) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Return reference from ListArray::values [\#3561](https://github.com/apache/arrow-rs/pull/3561) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: Add `RunEndEncodedArray` [\#3553](https://github.com/apache/arrow-rs/pull/3553) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) + +**Implemented enhancements:** + +- There should be a `From>>` impl for `GenericStringArray` [\#3599](https://github.com/apache/arrow-rs/issues/3599) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- FlightDataEncoder Optionally send Schema even when no record batches [\#3591](https://github.com/apache/arrow-rs/issues/3591) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Use Native Types in PageIndex [\#3575](https://github.com/apache/arrow-rs/issues/3575) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Packing array into dictionary of generic byte array [\#3571](https://github.com/apache/arrow-rs/issues/3571) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- 
Implement `Error::Source` for ArrowError and FlightError [\#3566](https://github.com/apache/arrow-rs/issues/3566) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- \[FlightSQL\] Allow access to underlying FlightClient [\#3551](https://github.com/apache/arrow-rs/issues/3551) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Arrow CSV writer should not fail when cannot cast the value [\#3547](https://github.com/apache/arrow-rs/issues/3547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Write Deprecated Min Max Statistics When ColumnOrder Signed [\#3526](https://github.com/apache/arrow-rs/issues/3526) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Improve Performance of JSON Reader [\#3441](https://github.com/apache/arrow-rs/issues/3441) +- Support footer kv metadata for IPC file [\#3432](https://github.com/apache/arrow-rs/issues/3432) +- Add `External` variant to ParquetError [\#3285](https://github.com/apache/arrow-rs/issues/3285) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Fixed bugs:** + +- Nullif of NULL Predicate is not NULL [\#3589](https://github.com/apache/arrow-rs/issues/3589) +- BooleanBufferBuilder Fails to Clear Set Bits On Truncate [\#3587](https://github.com/apache/arrow-rs/issues/3587) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `nullif` incorrectly calculates `null_count`, sometimes panics with substraction overflow error [\#3579](https://github.com/apache/arrow-rs/issues/3579) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Meet warning when use pyarrow [\#3543](https://github.com/apache/arrow-rs/issues/3543) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Incorrect row group total\_byte\_size written to parquet file [\#3530](https://github.com/apache/arrow-rs/issues/3530) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Overflow when casting timestamps prior to the epoch [\#3512](https://github.com/apache/arrow-rs/issues/3512) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Panic on Key Overflow in Dictionary Builders [\#3562](https://github.com/apache/arrow-rs/issues/3562) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Bumping version gives compilation error \(arrow-array\) [\#3525](https://github.com/apache/arrow-rs/issues/3525) + +**Merged pull requests:** + +- Add Push-Based CSV Decoder [\#3604](https://github.com/apache/arrow-rs/pull/3604) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update to flatbuffers 23.1.21 [\#3597](https://github.com/apache/arrow-rs/pull/3597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster BooleanBufferBuilder::append\_n for true values [\#3596](https://github.com/apache/arrow-rs/pull/3596) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support sending schemas for empty streams [\#3594](https://github.com/apache/arrow-rs/pull/3594) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Faster ListArray to StringArray conversion [\#3593](https://github.com/apache/arrow-rs/pull/3593) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) +- Add conversion from StringArray to BinaryArray [\#3592](https://github.com/apache/arrow-rs/pull/3592) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix nullif null count \(\#3579\) [\#3590](https://github.com/apache/arrow-rs/pull/3590) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Clear bits in BooleanBufferBuilder \(\#3587\) [\#3588](https://github.com/apache/arrow-rs/pull/3588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Iterate all dictionary key types in cast test [\#3585](https://github.com/apache/arrow-rs/pull/3585) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Propagate EOF Error from AsyncRead [\#3576](https://github.com/apache/arrow-rs/pull/3576) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Sach1nAgarwal](https://github.com/Sach1nAgarwal)) +- Show row\_counts also for \(FixedLen\)ByteArray [\#3573](https://github.com/apache/arrow-rs/pull/3573) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([bmmeijers](https://github.com/bmmeijers)) +- Packing array into dictionary of generic byte array [\#3572](https://github.com/apache/arrow-rs/pull/3572) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Remove unwrap on datetime cast for CSV writer [\#3570](https://github.com/apache/arrow-rs/pull/3570) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Implement `std::error::Error::source` for `ArrowError` and `FlightError` [\#3567](https://github.com/apache/arrow-rs/pull/3567) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Improve GenericBytesBuilder offset overflow panic message \(\#139\) [\#3564](https://github.com/apache/arrow-rs/pull/3564) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement Extend for ArrayBuilder \(\#1841\) [\#3563](https://github.com/apache/arrow-rs/pull/3563) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update pyarrow method call with kwargs [\#3560](https://github.com/apache/arrow-rs/pull/3560) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Frankonly](https://github.com/Frankonly)) +- Update pyo3 requirement from 0.17 to 0.18 [\#3557](https://github.com/apache/arrow-rs/pull/3557) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Expose Inner FlightServiceClient on FlightSqlServiceClient \(\#3551\) [\#3556](https://github.com/apache/arrow-rs/pull/3556) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Fix final page row count in parquet-index binary [\#3554](https://github.com/apache/arrow-rs/pull/3554) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Parquet Avoid Reading 8 Byte Footer Twice from AsyncRead [\#3550](https://github.com/apache/arrow-rs/pull/3550) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Sach1nAgarwal](https://github.com/Sach1nAgarwal)) +- Improve concat kernel capacity estimation 
[\#3546](https://github.com/apache/arrow-rs/pull/3546) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.49 to =1.0.50 [\#3545](https://github.com/apache/arrow-rs/pull/3545) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update pyarrow method call to avoid warning [\#3544](https://github.com/apache/arrow-rs/pull/3544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Frankonly](https://github.com/Frankonly)) +- Enable casting between Utf8/LargeUtf8 and Binary/LargeBinary [\#3542](https://github.com/apache/arrow-rs/pull/3542) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Use GHA concurrency groups \(\#3495\) [\#3538](https://github.com/apache/arrow-rs/pull/3538) ([tustvold](https://github.com/tustvold)) +- set sum of uncompressed column size as row group size for parquet files [\#3531](https://github.com/apache/arrow-rs/pull/3531) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([sidred](https://github.com/sidred)) +- Minor: Add documentation about memory use for ArrayData [\#3529](https://github.com/apache/arrow-rs/pull/3529) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Upgrade to clap 4.1 + fix test [\#3528](https://github.com/apache/arrow-rs/pull/3528) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Write backwards compatible row group statistics \(\#3526\) [\#3527](https://github.com/apache/arrow-rs/pull/3527) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- No panic on timestamp buffer overflow [\#3519](https://github.com/apache/arrow-rs/pull/3519) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Support casting from binary to dictionary of binary [\#3482](https://github.com/apache/arrow-rs/pull/3482) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add Raw JSON Reader \(~2.5x faster\) [\#3479](https://github.com/apache/arrow-rs/pull/3479) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) ## [31.0.0](https://github.com/apache/arrow-rs/tree/31.0.0) (2023-01-13) [Full Changelog](https://github.com/apache/arrow-rs/compare/30.0.1...31.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 656c86eaf524..4676edd3e0df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,81 +19,71 @@ # Changelog -## [32.0.0](https://github.com/apache/arrow-rs/tree/32.0.0) (2023-01-27) +## [33.0.0](https://github.com/apache/arrow-rs/tree/33.0.0) (2023-02-09) -[Full Changelog](https://github.com/apache/arrow-rs/compare/31.0.0...32.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/32.0.0...33.0.0) **Breaking changes:** -- Allow `StringArray` construction with `Vec>` [\#3602](https://github.com/apache/arrow-rs/pull/3602) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([sinistersnare](https://github.com/sinistersnare)) -- Use native types in PageIndex \(\#3575\) [\#3578](https://github.com/apache/arrow-rs/pull/3578) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add external variant to ParquetError \(\#3285\) 
[\#3574](https://github.com/apache/arrow-rs/pull/3574) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Return reference from ListArray::values [\#3561](https://github.com/apache/arrow-rs/pull/3561) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat: Add `RunEndEncodedArray` [\#3553](https://github.com/apache/arrow-rs/pull/3553) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Use ArrayFormatter in Cast Kernel [\#3668](https://github.com/apache/arrow-rs/pull/3668) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use dyn Array in cast kernels [\#3667](https://github.com/apache/arrow-rs/pull/3667) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Return references from FixedSizeListArray and MapArray [\#3652](https://github.com/apache/arrow-rs/pull/3652) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Lazy array display \(\#3638\) [\#3647](https://github.com/apache/arrow-rs/pull/3647) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use array\_value\_to\_string in arrow-csv [\#3514](https://github.com/apache/arrow-rs/pull/3514) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JayjeetAtGithub](https://github.com/JayjeetAtGithub)) **Implemented enhancements:** -- There should be a `From>>` impl for `GenericStringArray` [\#3599](https://github.com/apache/arrow-rs/issues/3599) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- FlightDataEncoder Optionally send Schema even when no record batches [\#3591](https://github.com/apache/arrow-rs/issues/3591) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Use Native Types in PageIndex [\#3575](https://github.com/apache/arrow-rs/issues/3575) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Packing array into dictionary of generic byte array [\#3571](https://github.com/apache/arrow-rs/issues/3571) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement `Error::Source` for ArrowError and FlightError [\#3566](https://github.com/apache/arrow-rs/issues/3566) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- \[FlightSQL\] Allow access to underlying FlightClient [\#3551](https://github.com/apache/arrow-rs/issues/3551) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Arrow CSV writer should not fail when cannot cast the value [\#3547](https://github.com/apache/arrow-rs/issues/3547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Write Deprecated Min Max Statistics When ColumnOrder Signed [\#3526](https://github.com/apache/arrow-rs/issues/3526) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Improve Performance of JSON Reader [\#3441](https://github.com/apache/arrow-rs/issues/3441) -- Support footer kv metadata for IPC file [\#3432](https://github.com/apache/arrow-rs/issues/3432) -- Add `External` variant to ParquetError 
[\#3285](https://github.com/apache/arrow-rs/issues/3285) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- object\_store: support encoded path as input [\#3651](https://github.com/apache/arrow-rs/issues/3651) +- Add modulus\_dyn and modulus\_scalar\_dyn [\#3648](https://github.com/apache/arrow-rs/issues/3648) +- A trait for append\_value and append\_null on ArrayBuilders [\#3644](https://github.com/apache/arrow-rs/issues/3644) +- Improve error messge "batches\[0\] schema is different with argument schema" [\#3628](https://github.com/apache/arrow-rs/issues/3628) +- Specified version of helper function to cast binary to string [\#3623](https://github.com/apache/arrow-rs/issues/3623) +- Casting generic binary to generic string [\#3606](https://github.com/apache/arrow-rs/issues/3606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use `array_value_to_string` in `arrow-csv` [\#3483](https://github.com/apache/arrow-rs/issues/3483) **Fixed bugs:** -- Nullif of NULL Predicate is not NULL [\#3589](https://github.com/apache/arrow-rs/issues/3589) -- BooleanBufferBuilder Fails to Clear Set Bits On Truncate [\#3587](https://github.com/apache/arrow-rs/issues/3587) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `nullif` incorrectly calculates `null_count`, sometimes panics with substraction overflow error [\#3579](https://github.com/apache/arrow-rs/issues/3579) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Meet warning when use pyarrow [\#3543](https://github.com/apache/arrow-rs/issues/3543) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Incorrect row group total\_byte\_size written to parquet file [\#3530](https://github.com/apache/arrow-rs/issues/3530) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Overflow when casting timestamps prior to the epoch [\#3512](https://github.com/apache/arrow-rs/issues/3512) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- ArrowArray::try\_from\_raw Misleading Signature [\#3684](https://github.com/apache/arrow-rs/issues/3684) +- PyArrowConvert Leaks Memory [\#3683](https://github.com/apache/arrow-rs/issues/3683) +- FFI Fails to Account For Offsets [\#3671](https://github.com/apache/arrow-rs/issues/3671) +- Regression in CSV reader error handling [\#3656](https://github.com/apache/arrow-rs/issues/3656) +- UnionArray Child and Value Fail to Account for non-contiguous Type IDs [\#3653](https://github.com/apache/arrow-rs/issues/3653) +- Panic when accessing RecordBatch from pyarrow [\#3646](https://github.com/apache/arrow-rs/issues/3646) +- Multiplication for decimals is incorrect [\#3645](https://github.com/apache/arrow-rs/issues/3645) +- Inconsistent output between pretty print and CSV writer for Arrow [\#3513](https://github.com/apache/arrow-rs/issues/3513) **Closed issues:** -- Panic on Key Overflow in Dictionary Builders [\#3562](https://github.com/apache/arrow-rs/issues/3562) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Bumping version gives compilation error \(arrow-array\) [\#3525](https://github.com/apache/arrow-rs/issues/3525) +- Release `32.0.0` of `arrow`/`arrow-flight`/`parquet`/`parquet-derive` \(next release after `31.0.0`\) [\#3584](https://github.com/apache/arrow-rs/issues/3584) **Merged pull requests:** -- Add Push-Based CSV Decoder [\#3604](https://github.com/apache/arrow-rs/pull/3604) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) -- Update to flatbuffers 23.1.21 [\#3597](https://github.com/apache/arrow-rs/pull/3597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Faster BooleanBufferBuilder::append\_n for true values [\#3596](https://github.com/apache/arrow-rs/pull/3596) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support sending schemas for empty streams [\#3594](https://github.com/apache/arrow-rs/pull/3594) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Faster ListArray to StringArray conversion [\#3593](https://github.com/apache/arrow-rs/pull/3593) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add conversion from StringArray to BinaryArray [\#3592](https://github.com/apache/arrow-rs/pull/3592) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix nullif null count \(\#3579\) [\#3590](https://github.com/apache/arrow-rs/pull/3590) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Clear bits in BooleanBufferBuilder \(\#3587\) [\#3588](https://github.com/apache/arrow-rs/pull/3588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Iterate all dictionary key types in cast test [\#3585](https://github.com/apache/arrow-rs/pull/3585) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Propagate EOF Error from AsyncRead [\#3576](https://github.com/apache/arrow-rs/pull/3576) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Sach1nAgarwal](https://github.com/Sach1nAgarwal)) -- Show row\_counts also for \(FixedLen\)ByteArray [\#3573](https://github.com/apache/arrow-rs/pull/3573) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([bmmeijers](https://github.com/bmmeijers)) -- Packing array into dictionary of generic byte array [\#3572](https://github.com/apache/arrow-rs/pull/3572) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Remove unwrap on datetime cast for CSV writer [\#3570](https://github.com/apache/arrow-rs/pull/3570) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) -- Implement `std::error::Error::source` for `ArrowError` and `FlightError` [\#3567](https://github.com/apache/arrow-rs/pull/3567) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Improve GenericBytesBuilder offset overflow panic message \(\#139\) [\#3564](https://github.com/apache/arrow-rs/pull/3564) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Implement Extend for ArrayBuilder \(\#1841\) [\#3563](https://github.com/apache/arrow-rs/pull/3563) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update pyarrow method call with kwargs [\#3560](https://github.com/apache/arrow-rs/pull/3560) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Frankonly](https://github.com/Frankonly)) -- Update pyo3 requirement from 0.17 to 0.18 [\#3557](https://github.com/apache/arrow-rs/pull/3557) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Expose Inner FlightServiceClient on FlightSqlServiceClient \(\#3551\) [\#3556](https://github.com/apache/arrow-rs/pull/3556) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Fix final page row count in parquet-index binary [\#3554](https://github.com/apache/arrow-rs/pull/3554) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Parquet Avoid Reading 8 Byte Footer Twice from AsyncRead [\#3550](https://github.com/apache/arrow-rs/pull/3550) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Sach1nAgarwal](https://github.com/Sach1nAgarwal)) -- Improve concat kernel capacity estimation [\#3546](https://github.com/apache/arrow-rs/pull/3546) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update proc-macro2 requirement from =1.0.49 to =1.0.50 [\#3545](https://github.com/apache/arrow-rs/pull/3545) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Update pyarrow method call to avoid warning [\#3544](https://github.com/apache/arrow-rs/pull/3544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Frankonly](https://github.com/Frankonly)) -- Enable casting between Utf8/LargeUtf8 and Binary/LargeBinary [\#3542](https://github.com/apache/arrow-rs/pull/3542) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Use GHA concurrency groups \(\#3495\) [\#3538](https://github.com/apache/arrow-rs/pull/3538) ([tustvold](https://github.com/tustvold)) -- set sum of uncompressed column size as row group size for parquet files [\#3531](https://github.com/apache/arrow-rs/pull/3531) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([sidred](https://github.com/sidred)) -- Minor: Add documentation about memory use for ArrayData [\#3529](https://github.com/apache/arrow-rs/pull/3529) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Upgrade to clap 4.1 + fix test [\#3528](https://github.com/apache/arrow-rs/pull/3528) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Write backwards compatible row group statistics \(\#3526\) [\#3527](https://github.com/apache/arrow-rs/pull/3527) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- No panic on timestamp buffer overflow [\#3519](https://github.com/apache/arrow-rs/pull/3519) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) -- Support casting from binary to dictionary of binary [\#3482](https://github.com/apache/arrow-rs/pull/3482) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add Raw JSON Reader \(~2.5x faster\) [\#3479](https://github.com/apache/arrow-rs/pull/3479) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup FFI interface \(\#3684\) \(\#3683\) [\#3685](https://github.com/apache/arrow-rs/pull/3685) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- fix: take\_run benchmark parameter [\#3679](https://github.com/apache/arrow-rs/pull/3679) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Minor: Add some examples to Date\*Array and Time\*Array [\#3678](https://github.com/apache/arrow-rs/pull/3678) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add ArrayData::new\_null and DataType::primitive\_width [\#3676](https://github.com/apache/arrow-rs/pull/3676) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix FFI which fails to account for offsets [\#3675](https://github.com/apache/arrow-rs/pull/3675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix Date64Array docs [\#3670](https://github.com/apache/arrow-rs/pull/3670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.50 to =1.0.51 [\#3669](https://github.com/apache/arrow-rs/pull/3669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add timezone accessor for Timestamp\*Array [\#3666](https://github.com/apache/arrow-rs/pull/3666) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster timezone cast [\#3665](https://github.com/apache/arrow-rs/pull/3665) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement std::fmt::Write for StringBuilder \(\#3638\) [\#3659](https://github.com/apache/arrow-rs/pull/3659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Include line and field number in CSV UTF-8 error \(\#3656\) [\#3657](https://github.com/apache/arrow-rs/pull/3657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Handle non-contiguous type\_ids in UnionArray \(\#3653\) [\#3654](https://github.com/apache/arrow-rs/pull/3654) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add modulus\_dyn and modulus\_scalar\_dyn [\#3649](https://github.com/apache/arrow-rs/pull/3649) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Improve error messge with detailed schema [\#3637](https://github.com/apache/arrow-rs/pull/3637) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Veeupup](https://github.com/Veeupup)) +- Add limit to ArrowReaderBuilder to push limit down to parquet reader [\#3633](https://github.com/apache/arrow-rs/pull/3633) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- chore: delete wrong comment and refactor set\_metadata in `Field` [\#3630](https://github.com/apache/arrow-rs/pull/3630) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([chunshao90](https://github.com/chunshao90)) +- Fix typo in comment [\#3627](https://github.com/apache/arrow-rs/pull/3627) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kjschiroo](https://github.com/kjschiroo)) +- Minor: Update doc strings about Page Index / Column Index [\#3625](https://github.com/apache/arrow-rs/pull/3625) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Specified version of helper function to cast binary 
to string [\#3624](https://github.com/apache/arrow-rs/pull/3624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- feat: take kernel for RunArray [\#3622](https://github.com/apache/arrow-rs/pull/3622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Remove BitSliceIterator specialization from try\_for\_each\_valid\_idx [\#3621](https://github.com/apache/arrow-rs/pull/3621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Reduce PrimitiveArray::try\_unary codegen [\#3619](https://github.com/apache/arrow-rs/pull/3619) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Reduce Dictionary Builder Codegen [\#3616](https://github.com/apache/arrow-rs/pull/3616) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: Add test for dictionary encoding of batches [\#3608](https://github.com/apache/arrow-rs/pull/3608) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Casting generic binary to generic string [\#3607](https://github.com/apache/arrow-rs/pull/3607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add ArrayAccessor, Iterator, Extend and benchmarks for RunArray [\#3603](https://github.com/apache/arrow-rs/pull/3603) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index 774bb11bb090..977590308e42 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-arith" -version = "32.0.0" +version = "33.0.0" description = "Arrow arithmetic kernels" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "32.0.0", path = "../arrow-array" } -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } -arrow-data = { version = "32.0.0", path = "../arrow-data" } -arrow-schema = { version = "32.0.0", path = "../arrow-schema" } +arrow-array = { version = "33.0.0", path = "../arrow-array" } +arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } +arrow-data = { version = "33.0.0", path = "../arrow-data" } +arrow-schema = { version = "33.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index c109db36973d..bc47672e2594 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-array" -version = "32.0.0" +version = "33.0.0" description = "Array abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,9 +45,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "32.0.0", path = "../arrow-schema" } -arrow-data = { version = "32.0.0", path = "../arrow-data" } +arrow-buffer = { 
version = "33.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "33.0.0", path = "../arrow-schema" } +arrow-data = { version = "33.0.0", path = "../arrow-data" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index d46e2f11a1d3..e84b11a2b596 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "32.0.0" +version = "33.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 2ce83e856806..bb2d725b34f4 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-cast" -version = "32.0.0" +version = "33.0.0" description = "Cast kernel and utilities for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "32.0.0", path = "../arrow-array" } -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } -arrow-data = { version = "32.0.0", path = "../arrow-data" } -arrow-schema = { version = "32.0.0", path = "../arrow-schema" } -arrow-select = { version = "32.0.0", path = "../arrow-select" } +arrow-array = { version = "33.0.0", path = "../arrow-array" } +arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } +arrow-data = { version = "33.0.0", path = "../arrow-data" } +arrow-schema = { version = "33.0.0", path = "../arrow-schema" } +arrow-select = { version = "33.0.0", path = "../arrow-select" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 517ffa33f9f0..9d1582b91c2f 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-csv" -version = "32.0.0" +version = "33.0.0" description = "Support for parsing CSV format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "32.0.0", path = "../arrow-array" } -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "32.0.0", path = "../arrow-cast" } -arrow-data = { version = "32.0.0", path = "../arrow-data" } -arrow-schema = { version = "32.0.0", path = "../arrow-schema" } +arrow-array = { version = "33.0.0", path = "../arrow-array" } +arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "33.0.0", path = "../arrow-cast" } +arrow-data = { version = "33.0.0", path = "../arrow-data" } +arrow-schema = { version = "33.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } csv-core = { version = "0.1"} diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index 42e1f43bf30d..ca50d8a12aee 100644 --- 
a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-data" -version = "32.0.0" +version = "33.0.0" description = "Array data abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,8 +45,8 @@ force_validate = [] [dependencies] -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "32.0.0", path = "../arrow-schema" } +arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "33.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 1fe382935a1d..603d4a636623 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "32.0.0" +version = "33.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,12 +27,12 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow-array = { version = "32.0.0", path = "../arrow-array" } -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow-array = { version = "33.0.0", path = "../arrow-array" } +arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } # Cast is needed to work around https://github.com/apache/arrow-rs/issues/3389 -arrow-cast = { version = "32.0.0", path = "../arrow-cast" } -arrow-ipc = { version = "32.0.0", path = "../arrow-ipc" } -arrow-schema = { version = "32.0.0", path = "../arrow-schema" } +arrow-cast = { version = "33.0.0", path = "../arrow-cast" } +arrow-ipc = { version = "33.0.0", path = "../arrow-ipc" } +arrow-schema = { version = "33.0.0", path = "../arrow-schema" } base64 = { version = "0.21", default-features = false, features = ["std"] } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } @@ -50,7 +50,7 @@ flight-sql-experimental = [] tls = ["tonic/tls"] [dev-dependencies] -arrow = { version = "32.0.0", path = "../arrow", features = ["prettyprint"] } +arrow = { version = "33.0.0", path = "../arrow", features = ["prettyprint"] } tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } tower = "0.4.13" diff --git a/arrow-flight/README.md b/arrow-flight/README.md index cb543c956d72..7992d93292ce 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "32.0.0" +arrow-flight = "33.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. 
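As a minimal usage sketch against the bumped `33.0.0` release (not part of this patch): the endpoint address, the empty `Criteria`, and the availability of `tokio` (with the `macros`/`rt` features) as a direct dependency are assumptions.

```rust
use arrow_flight::flight_service_client::FlightServiceClient;
use arrow_flight::Criteria;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Connect to a hypothetical Flight endpoint; the address is a placeholder.
    let mut client = FlightServiceClient::connect("http://localhost:50051").await?;

    // An empty expression asks the server to list every available flight.
    let criteria = Criteria { expression: vec![] };
    let mut flights = client.list_flights(criteria).await?.into_inner();

    while let Some(info) = flights.message().await? {
        println!("available flight: {:?}", info.flight_descriptor);
    }
    Ok(())
}
```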
diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index 35b088b1636f..f9ca4297e6e7 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-integration-test" -version = "32.0.0" +version = "33.0.0" description = "Support for the Apache Arrow JSON test data format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,8 +38,8 @@ path = "src/lib.rs" bench = false [dependencies] -arrow = { version = "32.0.0", path = "../arrow", default-features = false } -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } +arrow = { version = "33.0.0", path = "../arrow", default-features = false } +arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 46b2bb3691eb..e22a15f52ddc 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests (NOT PUBLISHED TO crates.io)" -version = "32.0.0" +version = "33.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index 79b34a7b4601..6661f35c0635 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ipc" -version = "32.0.0" +version = "33.0.0" description = "Support for the Arrow IPC format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "32.0.0", path = "../arrow-array" } -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "32.0.0", path = "../arrow-cast" } -arrow-data = { version = "32.0.0", path = "../arrow-data" } -arrow-schema = { version = "32.0.0", path = "../arrow-schema" } +arrow-array = { version = "33.0.0", path = "../arrow-array" } +arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "33.0.0", path = "../arrow-cast" } +arrow-data = { version = "33.0.0", path = "../arrow-data" } +arrow-schema = { version = "33.0.0", path = "../arrow-schema" } flatbuffers = { version = "23.1.21", default-features = false } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.12.0", default-features = false, optional = true } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 2a3a7ec1731f..ab77c1843ec0 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-json" -version = "32.0.0" +version = "33.0.0" description = "Support for parsing JSON format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "32.0.0", path = "../arrow-array" } -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "32.0.0", path = "../arrow-cast" } 
-arrow-data = { version = "32.0.0", path = "../arrow-data" } -arrow-schema = { version = "32.0.0", path = "../arrow-schema" } +arrow-array = { version = "33.0.0", path = "../arrow-array" } +arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "33.0.0", path = "../arrow-cast" } +arrow-data = { version = "33.0.0", path = "../arrow-data" } +arrow-schema = { version = "33.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index b029c8b91303..682d68dac857 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ord" -version = "32.0.0" +version = "33.0.0" description = "Ordering kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "32.0.0", path = "../arrow-array" } -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } -arrow-data = { version = "32.0.0", path = "../arrow-data" } -arrow-schema = { version = "32.0.0", path = "../arrow-schema" } -arrow-select = { version = "32.0.0", path = "../arrow-select" } +arrow-array = { version = "33.0.0", path = "../arrow-array" } +arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } +arrow-data = { version = "33.0.0", path = "../arrow-data" } +arrow-schema = { version = "33.0.0", path = "../arrow-schema" } +arrow-select = { version = "33.0.0", path = "../arrow-select" } num = { version = "0.4", default-features = false, features = ["std"] } [dev-dependencies] diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 02a96cf68fd4..75990dc90279 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "32.0.0" +version = "33.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "32.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "33.0.0", features = ["pyarrow"] } pyo3 = { version = "0.18", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index f82e499cc302..94210a27a14b 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-row" -version = "32.0.0" +version = "33.0.0" description = "Arrow row format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,17 +44,17 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "32.0.0", path = "../arrow-array" } -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } -arrow-data = { version = "32.0.0", path = "../arrow-data" } -arrow-schema = { version = "32.0.0", path = "../arrow-schema" } +arrow-array = { version = "33.0.0", path = "../arrow-array" } 
+arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } +arrow-data = { version = "33.0.0", path = "../arrow-data" } +arrow-schema = { version = "33.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } hashbrown = { version = "0.13", default-features = false } [dev-dependencies] -arrow-cast = { version = "32.0.0", path = "../arrow-cast" } -arrow-ord = { version = "32.0.0", path = "../arrow-ord" } +arrow-cast = { version = "33.0.0", path = "../arrow-cast" } +arrow-ord = { version = "33.0.0", path = "../arrow-ord" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } [features] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index c36305b0b283..1a25c1022195 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-schema" -version = "32.0.0" +version = "33.0.0" description = "Defines the logical types for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml index 8a8af0dbb825..789a23359a16 100644 --- a/arrow-select/Cargo.toml +++ b/arrow-select/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-select" -version = "32.0.0" +version = "33.0.0" description = "Selection kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } -arrow-data = { version = "32.0.0", path = "../arrow-data" } -arrow-schema = { version = "32.0.0", path = "../arrow-schema" } -arrow-array = { version = "32.0.0", path = "../arrow-array" } +arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } +arrow-data = { version = "33.0.0", path = "../arrow-data" } +arrow-schema = { version = "33.0.0", path = "../arrow-schema" } +arrow-array = { version = "33.0.0", path = "../arrow-array" } num = { version = "0.4", default-features = false, features = ["std"] } [features] diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index 47740275c0fd..796024e873ef 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-string" -version = "32.0.0" +version = "33.0.0" description = "String kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } -arrow-data = { version = "32.0.0", path = "../arrow-data" } -arrow-schema = { version = "32.0.0", path = "../arrow-schema" } -arrow-array = { version = "32.0.0", path = "../arrow-array" } -arrow-select = { version = "32.0.0", path = "../arrow-select" } +arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } +arrow-data = { version = "33.0.0", path = "../arrow-data" } +arrow-schema = { version = "33.0.0", path = "../arrow-schema" } +arrow-array = { version = "33.0.0", path = "../arrow-array" } +arrow-select = { version = "33.0.0", path = "../arrow-select" } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index f86ec09a9ac3..814ca14c8058 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml 
@@ -17,7 +17,7 @@ [package] name = "arrow" -version = "32.0.0" +version = "33.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,19 +45,19 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-arith = { version = "32.0.0", path = "../arrow-arith" } -arrow-array = { version = "32.0.0", path = "../arrow-array" } -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "32.0.0", path = "../arrow-cast" } -arrow-csv = { version = "32.0.0", path = "../arrow-csv", optional = true } -arrow-data = { version = "32.0.0", path = "../arrow-data" } -arrow-ipc = { version = "32.0.0", path = "../arrow-ipc", optional = true } -arrow-json = { version = "32.0.0", path = "../arrow-json", optional = true } -arrow-ord = { version = "32.0.0", path = "../arrow-ord" } -arrow-row = { version = "32.0.0", path = "../arrow-row" } -arrow-schema = { version = "32.0.0", path = "../arrow-schema" } -arrow-select = { version = "32.0.0", path = "../arrow-select" } -arrow-string = { version = "32.0.0", path = "../arrow-string" } +arrow-arith = { version = "33.0.0", path = "../arrow-arith" } +arrow-array = { version = "33.0.0", path = "../arrow-array" } +arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "33.0.0", path = "../arrow-cast" } +arrow-csv = { version = "33.0.0", path = "../arrow-csv", optional = true } +arrow-data = { version = "33.0.0", path = "../arrow-data" } +arrow-ipc = { version = "33.0.0", path = "../arrow-ipc", optional = true } +arrow-json = { version = "33.0.0", path = "../arrow-json", optional = true } +arrow-ord = { version = "33.0.0", path = "../arrow-ord" } +arrow-row = { version = "33.0.0", path = "../arrow-row" } +arrow-schema = { version = "33.0.0", path = "../arrow-schema" } +arrow-select = { version = "33.0.0", path = "../arrow-select" } +arrow-string = { version = "33.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } comfy-table = { version = "6.0", optional = true, default-features = false } diff --git a/arrow/README.md b/arrow/README.md index 68598078cfd8..0714285011fa 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `32.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `33.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. 
## Feature Flags diff --git a/dev/release/README.md b/dev/release/README.md index f86513822762..b8018bfaf7b4 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. -sed -i '' -e 's/14.0.0/32.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/33.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 2b8396347355..7b773fd05c61 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="31.0.0" -FUTURE_RELEASE="32.0.0" +SINCE_TAG="32.0.0" +FUTURE_RELEASE="33.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index a112ec354e8d..d59f481f362f 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "32.0.0" +version = "33.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -35,14 +35,14 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "32.0.0", path = "../arrow-array", default-features = false, optional = true } -arrow-buffer = { version = "32.0.0", path = "../arrow-buffer", default-features = false, optional = true } -arrow-cast = { version = "32.0.0", path = "../arrow-cast", default-features = false, optional = true } -arrow-csv = { version = "32.0.0", path = "../arrow-csv", default-features = false, optional = true } -arrow-data = { version = "32.0.0", path = "../arrow-data", default-features = false, optional = true } -arrow-schema = { version = "32.0.0", path = "../arrow-schema", default-features = false, optional = true } -arrow-select = { version = "32.0.0", path = "../arrow-select", default-features = false, optional = true } -arrow-ipc = { version = "32.0.0", path = "../arrow-ipc", default-features = false, optional = true } +arrow-array = { version = "33.0.0", path = "../arrow-array", default-features = false, optional = true } +arrow-buffer = { version = "33.0.0", path = "../arrow-buffer", default-features = false, optional = true } +arrow-cast = { version = "33.0.0", path = "../arrow-cast", default-features = false, optional = true } +arrow-csv = { version = "33.0.0", path = "../arrow-csv", default-features = false, optional = true } +arrow-data = { version = "33.0.0", path = "../arrow-data", default-features = false, optional = true } +arrow-schema = { version = "33.0.0", path = "../arrow-schema", default-features = false, optional = true } +arrow-select = { version = "33.0.0", path = "../arrow-select", default-features = false, optional = true } +arrow-ipc = { version = "33.0.0", path = "../arrow-ipc", default-features = false, optional = true } object_store = { version = "0.5", path = "../object_store", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } @@ -76,7 +76,7 @@ flate2 = { version = "1.0", default-features = false, features = 
["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.12", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "32.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "33.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index 3fdcd66f248c..f648aafbf2fb 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "32.0.0" +version = "33.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", features = ["extra-traits"] } -parquet = { path = "../parquet", version = "32.0.0", default-features = false } +parquet = { path = "../parquet", version = "33.0.0", default-features = false } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index 14d3c066c7e9..c8ee7ea81101 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "32.0.0" -parquet_derive = "32.0.0" +parquet = "33.0.0" +parquet_derive = "33.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index e3306e7c4659..df8fa3aef65a 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "32.0.0" +version = "33.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "32.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "32.0.0", default-features = false } +parquet = { path = "../parquet", version = "33.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "33.0.0", default-features = false } chrono = { version="0.4.23", default-features = false, features = [ "clock" ] } From 3761ac53cab55c269b06d9a13825dd81b03e0c11 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 10 Feb 2023 15:13:54 +0000 Subject: [PATCH 0596/1411] Move FFI to sub-crates (#3687) * Move FFI to sub-crates * Use ptr::write instead of drop_in_place * Add inline * Avoid unnecessary clone * Format * Clippy * Remove pub(crate) --- arrow-data/Cargo.toml | 5 + arrow-data/src/ffi.rs | 285 +++++++++++++++ arrow-data/src/lib.rs | 3 + arrow-schema/Cargo.toml | 7 +- arrow-schema/src/ffi.rs | 703 +++++++++++++++++++++++++++++++++++++ arrow-schema/src/lib.rs | 3 + arrow/Cargo.toml | 3 +- arrow/src/datatypes/ffi.rs | 502 -------------------------- arrow/src/ffi.rs | 515 ++------------------------- arrow/src/ffi_stream.rs | 29 +- 10 files changed, 1058 insertions(+), 997 deletions(-) 
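For orientation, a sketch of how a downstream crate might use the relocated modules once the new `ffi` features are enabled (the dependency setup and the `Int32` example type are assumptions; the calls mirror the doc examples added in this commit, and this snippet is not itself part of the changeset):

```rust
// Assumes arrow-data and arrow-schema are dependencies built with the new
// `ffi` feature, e.g. arrow-data = { version = "33.0.0", features = ["ffi"] }.
use arrow_data::ffi::FFI_ArrowArray;
use arrow_data::ArrayData;
use arrow_schema::ffi::FFI_ArrowSchema;
use arrow_schema::DataType;

/// Export an `ArrayData` and its logical type over the C Data Interface.
fn export(data: &ArrayData) -> (FFI_ArrowArray, FFI_ArrowSchema) {
    // The array side keeps the buffers, children and dictionary alive via `private_data`.
    let array = FFI_ArrowArray::new(data);
    // The schema side is derived from the logical `DataType` via its format string.
    let schema = FFI_ArrowSchema::try_from(data.data_type()).expect("supported data type");
    (array, schema)
}

fn main() {
    // A hypothetical all-null Int32 array, just to have something to export.
    let data = ArrayData::new_null(&DataType::Int32, 8);
    let (_ffi_array, _ffi_schema) = export(&data);
}
```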
create mode 100644 arrow-data/src/ffi.rs create mode 100644 arrow-schema/src/ffi.rs diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index ca50d8a12aee..a1938af4b194 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -42,6 +42,11 @@ bench = false # this is not enabled by default as it is too computationally expensive # but is run as part of our CI checks force_validate = [] +# Enable ffi support +ffi = ["arrow-schema/ffi"] + +[package.metadata.docs.rs] +features = ["ffi"] [dependencies] diff --git a/arrow-data/src/ffi.rs b/arrow-data/src/ffi.rs new file mode 100644 index 000000000000..e506653bb59b --- /dev/null +++ b/arrow-data/src/ffi.rs @@ -0,0 +1,285 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains declarations to bind to the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html). + +use crate::{layout, ArrayData}; +use arrow_buffer::Buffer; +use arrow_schema::DataType; +use std::ffi::c_void; + +/// ABI-compatible struct for ArrowArray from C Data Interface +/// See +/// +/// ``` +/// # use arrow_data::ArrayData; +/// # use arrow_data::ffi::FFI_ArrowArray; +/// fn export_array(array: &ArrayData) -> FFI_ArrowArray { +/// FFI_ArrowArray::new(array) +/// } +/// ``` +#[repr(C)] +#[derive(Debug)] +pub struct FFI_ArrowArray { + length: i64, + null_count: i64, + offset: i64, + n_buffers: i64, + n_children: i64, + buffers: *mut *const c_void, + children: *mut *mut FFI_ArrowArray, + dictionary: *mut FFI_ArrowArray, + release: Option, + // When exported, this MUST contain everything that is owned by this array. + // for example, any buffer pointed to in `buffers` must be here, as well + // as the `buffers` pointer itself. + // In other words, everything in [FFI_ArrowArray] must be owned by + // `private_data` and can assume that they do not outlive `private_data`. 
+ private_data: *mut c_void, +} + +impl Drop for FFI_ArrowArray { + fn drop(&mut self) { + match self.release { + None => (), + Some(release) => unsafe { release(self) }, + }; + } +} + +unsafe impl Send for FFI_ArrowArray {} +unsafe impl Sync for FFI_ArrowArray {} + +// callback used to drop [FFI_ArrowArray] when it is exported +unsafe extern "C" fn release_array(array: *mut FFI_ArrowArray) { + if array.is_null() { + return; + } + let array = &mut *array; + + // take ownership of `private_data`, therefore dropping it` + let private = Box::from_raw(array.private_data as *mut ArrayPrivateData); + for child in private.children.iter() { + let _ = Box::from_raw(*child); + } + if !private.dictionary.is_null() { + let _ = Box::from_raw(private.dictionary); + } + + array.release = None; +} + +struct ArrayPrivateData { + #[allow(dead_code)] + buffers: Vec>, + buffers_ptr: Box<[*const c_void]>, + children: Box<[*mut FFI_ArrowArray]>, + dictionary: *mut FFI_ArrowArray, +} + +impl FFI_ArrowArray { + /// creates a new `FFI_ArrowArray` from existing data. + /// # Memory Leaks + /// This method releases `buffers`. Consumers of this struct *must* call `release` before + /// releasing this struct, or contents in `buffers` leak. + pub fn new(data: &ArrayData) -> Self { + let data_layout = layout(data.data_type()); + + let buffers = if data_layout.can_contain_null_mask { + // * insert the null buffer at the start + // * make all others `Option`. + std::iter::once(data.null_buffer().cloned()) + .chain(data.buffers().iter().map(|b| Some(b.clone()))) + .collect::>() + } else { + data.buffers().iter().map(|b| Some(b.clone())).collect() + }; + + // `n_buffers` is the number of buffers by the spec. + let n_buffers = { + data_layout.buffers.len() + { + // If the layout has a null buffer by Arrow spec. + // Note that even the array doesn't have a null buffer because it has + // no null value, we still need to count 1 here to follow the spec. + usize::from(data_layout.can_contain_null_mask) + } + } as i64; + + let buffers_ptr = buffers + .iter() + .flat_map(|maybe_buffer| match maybe_buffer { + // note that `raw_data` takes into account the buffer's offset + Some(b) => Some(b.as_ptr() as *const c_void), + // This is for null buffer. We only put a null pointer for + // null buffer if by spec it can contain null mask. + None if data_layout.can_contain_null_mask => Some(std::ptr::null()), + None => None, + }) + .collect::>(); + + let empty = vec![]; + let (child_data, dictionary) = match data.data_type() { + DataType::Dictionary(_, _) => ( + empty.as_slice(), + Box::into_raw(Box::new(FFI_ArrowArray::new(&data.child_data()[0]))), + ), + _ => (data.child_data(), std::ptr::null_mut()), + }; + + let children = child_data + .iter() + .map(|child| Box::into_raw(Box::new(FFI_ArrowArray::new(child)))) + .collect::>(); + let n_children = children.len() as i64; + + // create the private data owning everything. + // any other data must be added here, e.g. via a struct, to track lifetime. 
+ let mut private_data = Box::new(ArrayPrivateData { + buffers, + buffers_ptr, + children, + dictionary, + }); + + Self { + length: data.len() as i64, + null_count: data.null_count() as i64, + offset: data.offset() as i64, + n_buffers, + n_children, + buffers: private_data.buffers_ptr.as_mut_ptr(), + children: private_data.children.as_mut_ptr(), + dictionary, + release: Some(release_array), + private_data: Box::into_raw(private_data) as *mut c_void, + } + } + + /// create an empty `FFI_ArrowArray`, which can be used to import data into + pub fn empty() -> Self { + Self { + length: 0, + null_count: 0, + offset: 0, + n_buffers: 0, + n_children: 0, + buffers: std::ptr::null_mut(), + children: std::ptr::null_mut(), + dictionary: std::ptr::null_mut(), + release: None, + private_data: std::ptr::null_mut(), + } + } + + /// the length of the array + #[inline] + pub fn len(&self) -> usize { + self.length as usize + } + + /// whether the array is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.length == 0 + } + + /// Whether the array has been released + #[inline] + pub fn is_released(&self) -> bool { + self.release.is_none() + } + + /// the offset of the array + #[inline] + pub fn offset(&self) -> usize { + self.offset as usize + } + + /// the null count of the array + #[inline] + pub fn null_count(&self) -> usize { + self.null_count as usize + } + + /// Returns the buffer at the provided index + /// + /// # Panic + /// Panics if index exceeds the number of buffers or the buffer is not correctly aligned + #[inline] + pub fn buffer(&self, index: usize) -> *const u8 { + assert!(!self.buffers.is_null()); + assert!(index < self.num_buffers()); + // SAFETY: + // If buffers is not null must be valid for reads up to num_buffers + unsafe { std::ptr::read_unaligned((self.buffers as *mut *const u8).add(index)) } + } + + /// Returns the number of buffers + #[inline] + pub fn num_buffers(&self) -> usize { + self.n_buffers as _ + } + + /// Returns the child at the provided index + #[inline] + pub fn child(&self, index: usize) -> &FFI_ArrowArray { + assert!(!self.children.is_null()); + assert!(index < self.num_children()); + // Safety: + // If children is not null must be valid for reads up to num_children + unsafe { + let child = std::ptr::read_unaligned(self.children.add(index)); + child.as_ref().unwrap() + } + } + + /// Returns the number of children + #[inline] + pub fn num_children(&self) -> usize { + self.n_children as _ + } + + /// Returns the dictionary if any + #[inline] + pub fn dictionary(&self) -> Option<&Self> { + // Safety: + // If dictionary is not null should be valid for reads of `Self` + unsafe { self.dictionary.as_ref() } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // More tests located in top-level arrow crate + + #[test] + fn null_array_n_buffers() { + let data = ArrayData::new_null(&DataType::Null, 10); + + let ffi_array = FFI_ArrowArray::new(&data); + assert_eq!(0, ffi_array.n_buffers); + + let private_data = + unsafe { Box::from_raw(ffi_array.private_data as *mut ArrayPrivateData) }; + + assert_eq!(0, private_data.buffers_ptr.len()); + + Box::into_raw(private_data); + } +} diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs index 58571e181176..b37a8c5da72f 100644 --- a/arrow-data/src/lib.rs +++ b/arrow-data/src/lib.rs @@ -28,3 +28,6 @@ pub mod transform; pub mod bit_iterator; pub mod bit_mask; pub mod decimal; + +#[cfg(feature = "ffi")] +pub mod ffi; diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 1a25c1022195..e4e7d0082eb8 100644 --- 
a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -39,9 +39,14 @@ bench = false [dependencies] serde = { version = "1.0", default-features = false, features = ["derive", "std"], optional = true } +bitflags = { version = "1.2.1", default-features = false, optional = true } [features] -default = [] +# Enable ffi support +ffi = ["bitflags"] + +[package.metadata.docs.rs] +features = ["ffi"] [dev-dependencies] serde_json = "1.0" diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs new file mode 100644 index 000000000000..8e58e3158c8b --- /dev/null +++ b/arrow-schema/src/ffi.rs @@ -0,0 +1,703 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains declarations to bind to the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html). +//! +//! ``` +//! # use arrow_schema::{DataType, Field, Schema}; +//! # use arrow_schema::ffi::FFI_ArrowSchema; +//! +//! // Create from data type +//! let ffi_data_type = FFI_ArrowSchema::try_from(&DataType::LargeUtf8).unwrap(); +//! let back = DataType::try_from(&ffi_data_type).unwrap(); +//! assert_eq!(back, DataType::LargeUtf8); +//! +//! // Create from schema +//! let schema = Schema::new(vec![Field::new("foo", DataType::Int64, false)]); +//! let ffi_schema = FFI_ArrowSchema::try_from(&schema).unwrap(); +//! let back = Schema::try_from(&ffi_schema).unwrap(); +//! +//! assert_eq!(schema, back); +//! ``` + +use crate::{ArrowError, DataType, Field, Schema, TimeUnit, UnionMode}; +use bitflags::bitflags; +use std::ffi::{c_char, c_void, CStr, CString}; + +bitflags! { + pub struct Flags: i64 { + const DICTIONARY_ORDERED = 0b00000001; + const NULLABLE = 0b00000010; + const MAP_KEYS_SORTED = 0b00000100; + } +} + +/// ABI-compatible struct for `ArrowSchema` from C Data Interface +/// See +/// +/// ``` +/// # use arrow_schema::DataType; +/// # use arrow_schema::ffi::FFI_ArrowSchema; +/// fn array_schema(data_type: &DataType) -> FFI_ArrowSchema { +/// FFI_ArrowSchema::try_from(data_type).unwrap() +/// } +/// ``` +/// +#[repr(C)] +#[derive(Debug)] +pub struct FFI_ArrowSchema { + format: *const c_char, + name: *const c_char, + metadata: *const c_char, + flags: i64, + n_children: i64, + children: *mut *mut FFI_ArrowSchema, + dictionary: *mut FFI_ArrowSchema, + release: Option, + private_data: *mut c_void, +} + +struct SchemaPrivateData { + children: Box<[*mut FFI_ArrowSchema]>, + dictionary: *mut FFI_ArrowSchema, +} + +// callback used to drop [FFI_ArrowSchema] when it is exported. +unsafe extern "C" fn release_schema(schema: *mut FFI_ArrowSchema) { + if schema.is_null() { + return; + } + let schema = &mut *schema; + + // take ownership back to release it. 
+ drop(CString::from_raw(schema.format as *mut c_char)); + if !schema.name.is_null() { + drop(CString::from_raw(schema.name as *mut c_char)); + } + if !schema.private_data.is_null() { + let private_data = Box::from_raw(schema.private_data as *mut SchemaPrivateData); + for child in private_data.children.iter() { + drop(Box::from_raw(*child)) + } + if !private_data.dictionary.is_null() { + drop(Box::from_raw(private_data.dictionary)); + } + + drop(private_data); + } + + schema.release = None; +} + +impl FFI_ArrowSchema { + /// create a new [`FFI_ArrowSchema`]. This fails if the fields' + /// [`DataType`] is not supported. + pub fn try_new( + format: &str, + children: Vec, + dictionary: Option, + ) -> Result { + let mut this = Self::empty(); + + let children_ptr = children + .into_iter() + .map(Box::new) + .map(Box::into_raw) + .collect::>(); + + this.format = CString::new(format).unwrap().into_raw(); + this.release = Some(release_schema); + this.n_children = children_ptr.len() as i64; + + let dictionary_ptr = dictionary + .map(|d| Box::into_raw(Box::new(d))) + .unwrap_or(std::ptr::null_mut()); + + let mut private_data = Box::new(SchemaPrivateData { + children: children_ptr, + dictionary: dictionary_ptr, + }); + + // intentionally set from private_data (see https://github.com/apache/arrow-rs/issues/580) + this.children = private_data.children.as_mut_ptr(); + + this.dictionary = dictionary_ptr; + + this.private_data = Box::into_raw(private_data) as *mut c_void; + + Ok(this) + } + + pub fn with_name(mut self, name: &str) -> Result { + self.name = CString::new(name).unwrap().into_raw(); + Ok(self) + } + + pub fn with_flags(mut self, flags: Flags) -> Result { + self.flags = flags.bits(); + Ok(self) + } + + pub fn empty() -> Self { + Self { + format: std::ptr::null_mut(), + name: std::ptr::null_mut(), + metadata: std::ptr::null_mut(), + flags: 0, + n_children: 0, + children: std::ptr::null_mut(), + dictionary: std::ptr::null_mut(), + release: None, + private_data: std::ptr::null_mut(), + } + } + + /// returns the format of this schema. + pub fn format(&self) -> &str { + assert!(!self.format.is_null()); + // safe because the lifetime of `self.format` equals `self` + unsafe { CStr::from_ptr(self.format) } + .to_str() + .expect("The external API has a non-utf8 as format") + } + + /// returns the name of this schema. 
+ pub fn name(&self) -> &str { + assert!(!self.name.is_null()); + // safe because the lifetime of `self.name` equals `self` + unsafe { CStr::from_ptr(self.name) } + .to_str() + .expect("The external API has a non-utf8 as name") + } + + pub fn flags(&self) -> Option { + Flags::from_bits(self.flags) + } + + pub fn child(&self, index: usize) -> &Self { + assert!(index < self.n_children as usize); + unsafe { self.children.add(index).as_ref().unwrap().as_ref().unwrap() } + } + + pub fn children(&self) -> impl Iterator { + (0..self.n_children as usize).map(move |i| self.child(i)) + } + + pub fn nullable(&self) -> bool { + (self.flags / 2) & 1 == 1 + } + + pub fn dictionary(&self) -> Option<&Self> { + unsafe { self.dictionary.as_ref() } + } + + pub fn map_keys_sorted(&self) -> bool { + self.flags & 0b00000100 != 0 + } + + pub fn dictionary_ordered(&self) -> bool { + self.flags & 0b00000001 != 0 + } +} + +impl Drop for FFI_ArrowSchema { + fn drop(&mut self) { + match self.release { + None => (), + Some(release) => unsafe { release(self) }, + }; + } +} + +impl TryFrom<&FFI_ArrowSchema> for DataType { + type Error = ArrowError; + + /// See [CDataInterface docs](https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings) + fn try_from(c_schema: &FFI_ArrowSchema) -> Result { + let mut dtype = match c_schema.format() { + "n" => DataType::Null, + "b" => DataType::Boolean, + "c" => DataType::Int8, + "C" => DataType::UInt8, + "s" => DataType::Int16, + "S" => DataType::UInt16, + "i" => DataType::Int32, + "I" => DataType::UInt32, + "l" => DataType::Int64, + "L" => DataType::UInt64, + "e" => DataType::Float16, + "f" => DataType::Float32, + "g" => DataType::Float64, + "z" => DataType::Binary, + "Z" => DataType::LargeBinary, + "u" => DataType::Utf8, + "U" => DataType::LargeUtf8, + "tdD" => DataType::Date32, + "tdm" => DataType::Date64, + "tts" => DataType::Time32(TimeUnit::Second), + "ttm" => DataType::Time32(TimeUnit::Millisecond), + "ttu" => DataType::Time64(TimeUnit::Microsecond), + "ttn" => DataType::Time64(TimeUnit::Nanosecond), + "tDs" => DataType::Duration(TimeUnit::Second), + "tDm" => DataType::Duration(TimeUnit::Millisecond), + "tDu" => DataType::Duration(TimeUnit::Microsecond), + "tDn" => DataType::Duration(TimeUnit::Nanosecond), + "+l" => { + let c_child = c_schema.child(0); + DataType::List(Box::new(Field::try_from(c_child)?)) + } + "+L" => { + let c_child = c_schema.child(0); + DataType::LargeList(Box::new(Field::try_from(c_child)?)) + } + "+s" => { + let fields = c_schema.children().map(Field::try_from); + DataType::Struct(fields.collect::, ArrowError>>()?) 
+ } + "+m" => { + let c_child = c_schema.child(0); + let map_keys_sorted = c_schema.map_keys_sorted(); + DataType::Map(Box::new(Field::try_from(c_child)?), map_keys_sorted) + } + // Parametrized types, requiring string parse + other => { + match other.splitn(2, ':').collect::>().as_slice() { + // FixedSizeBinary type in format "w:num_bytes" + ["w", num_bytes] => { + let parsed_num_bytes = num_bytes.parse::().map_err(|_| { + ArrowError::CDataInterface( + "FixedSizeBinary requires an integer parameter representing number of bytes per element".to_string()) + })?; + DataType::FixedSizeBinary(parsed_num_bytes) + }, + // FixedSizeList type in format "+w:num_elems" + ["+w", num_elems] => { + let c_child = c_schema.child(0); + let parsed_num_elems = num_elems.parse::().map_err(|_| { + ArrowError::CDataInterface( + "The FixedSizeList type requires an integer parameter representing number of elements per list".to_string()) + })?; + DataType::FixedSizeList(Box::new(Field::try_from(c_child)?), parsed_num_elems) + }, + // Decimal types in format "d:precision,scale" or "d:precision,scale,bitWidth" + ["d", extra] => { + match extra.splitn(3, ',').collect::>().as_slice() { + [precision, scale] => { + let parsed_precision = precision.parse::().map_err(|_| { + ArrowError::CDataInterface( + "The decimal type requires an integer precision".to_string(), + ) + })?; + let parsed_scale = scale.parse::().map_err(|_| { + ArrowError::CDataInterface( + "The decimal type requires an integer scale".to_string(), + ) + })?; + DataType::Decimal128(parsed_precision, parsed_scale) + }, + [precision, scale, bits] => { + if *bits != "128" && *bits != "256" { + return Err(ArrowError::CDataInterface("Only 128/256 bit wide decimal is supported in the Rust implementation".to_string())); + } + let parsed_precision = precision.parse::().map_err(|_| { + ArrowError::CDataInterface( + "The decimal type requires an integer precision".to_string(), + ) + })?; + let parsed_scale = scale.parse::().map_err(|_| { + ArrowError::CDataInterface( + "The decimal type requires an integer scale".to_string(), + ) + })?; + if *bits == "128" { + DataType::Decimal128(parsed_precision, parsed_scale) + } else { + DataType::Decimal256(parsed_precision, parsed_scale) + } + } + _ => { + return Err(ArrowError::CDataInterface(format!( + "The decimal pattern \"d:{extra:?}\" is not supported in the Rust implementation" + ))) + } + } + } + // DenseUnion + ["+ud", extra] => { + let type_ids = extra.split(',').map(|t| t.parse::().map_err(|_| { + ArrowError::CDataInterface( + "The Union type requires an integer type id".to_string(), + ) + })).collect::, ArrowError>>()?; + let mut fields = Vec::with_capacity(type_ids.len()); + for idx in 0..c_schema.n_children { + let c_child = c_schema.child(idx as usize); + let field = Field::try_from(c_child)?; + fields.push(field); + } + + if fields.len() != type_ids.len() { + return Err(ArrowError::CDataInterface( + "The Union type requires same number of fields and type ids".to_string(), + )); + } + + DataType::Union(fields, type_ids, UnionMode::Dense) + } + // SparseUnion + ["+us", extra] => { + let type_ids = extra.split(',').map(|t| t.parse::().map_err(|_| { + ArrowError::CDataInterface( + "The Union type requires an integer type id".to_string(), + ) + })).collect::, ArrowError>>()?; + let mut fields = Vec::with_capacity(type_ids.len()); + for idx in 0..c_schema.n_children { + let c_child = c_schema.child(idx as usize); + let field = Field::try_from(c_child)?; + fields.push(field); + } + + if fields.len() != type_ids.len() 
{ + return Err(ArrowError::CDataInterface( + "The Union type requires same number of fields and type ids".to_string(), + )); + } + + DataType::Union(fields, type_ids, UnionMode::Sparse) + } + + // Timestamps in format "tts:" and "tts:America/New_York" for no timezones and timezones resp. + ["tss", ""] => DataType::Timestamp(TimeUnit::Second, None), + ["tsm", ""] => DataType::Timestamp(TimeUnit::Millisecond, None), + ["tsu", ""] => DataType::Timestamp(TimeUnit::Microsecond, None), + ["tsn", ""] => DataType::Timestamp(TimeUnit::Nanosecond, None), + ["tss", tz] => { + DataType::Timestamp(TimeUnit::Second, Some(tz.to_string())) + } + ["tsm", tz] => { + DataType::Timestamp(TimeUnit::Millisecond, Some(tz.to_string())) + } + ["tsu", tz] => { + DataType::Timestamp(TimeUnit::Microsecond, Some(tz.to_string())) + } + ["tsn", tz] => { + DataType::Timestamp(TimeUnit::Nanosecond, Some(tz.to_string())) + } + _ => { + return Err(ArrowError::CDataInterface(format!( + "The datatype \"{other:?}\" is still not supported in Rust implementation" + ))) + } + } + } + }; + + if let Some(dict_schema) = c_schema.dictionary() { + let value_type = Self::try_from(dict_schema)?; + dtype = DataType::Dictionary(Box::new(dtype), Box::new(value_type)); + } + + Ok(dtype) + } +} + +impl TryFrom<&FFI_ArrowSchema> for Field { + type Error = ArrowError; + + fn try_from(c_schema: &FFI_ArrowSchema) -> Result { + let dtype = DataType::try_from(c_schema)?; + let field = Field::new(c_schema.name(), dtype, c_schema.nullable()); + Ok(field) + } +} + +impl TryFrom<&FFI_ArrowSchema> for Schema { + type Error = ArrowError; + + fn try_from(c_schema: &FFI_ArrowSchema) -> Result { + // interpret it as a struct type then extract its fields + let dtype = DataType::try_from(c_schema)?; + if let DataType::Struct(fields) = dtype { + Ok(Schema::new(fields)) + } else { + Err(ArrowError::CDataInterface( + "Unable to interpret C data struct as a Schema".to_string(), + )) + } + } +} + +impl TryFrom<&DataType> for FFI_ArrowSchema { + type Error = ArrowError; + + /// See [CDataInterface docs](https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings) + fn try_from(dtype: &DataType) -> Result { + let format = get_format_string(dtype)?; + // allocate and hold the children + let children = match dtype { + DataType::List(child) + | DataType::LargeList(child) + | DataType::FixedSizeList(child, _) + | DataType::Map(child, _) => { + vec![FFI_ArrowSchema::try_from(child.as_ref())?] + } + DataType::Union(fields, _, _) => fields + .iter() + .map(FFI_ArrowSchema::try_from) + .collect::, ArrowError>>()?, + DataType::Struct(fields) => fields + .iter() + .map(FFI_ArrowSchema::try_from) + .collect::, ArrowError>>()?, + _ => vec![], + }; + let dictionary = if let DataType::Dictionary(_, value_data_type) = dtype { + Some(Self::try_from(value_data_type.as_ref())?) 
+ } else { + None + }; + + let flags = match dtype { + DataType::Map(_, true) => Flags::MAP_KEYS_SORTED, + _ => Flags::empty(), + }; + + FFI_ArrowSchema::try_new(&format, children, dictionary)?.with_flags(flags) + } +} + +fn get_format_string(dtype: &DataType) -> Result { + match dtype { + DataType::Null => Ok("n".to_string()), + DataType::Boolean => Ok("b".to_string()), + DataType::Int8 => Ok("c".to_string()), + DataType::UInt8 => Ok("C".to_string()), + DataType::Int16 => Ok("s".to_string()), + DataType::UInt16 => Ok("S".to_string()), + DataType::Int32 => Ok("i".to_string()), + DataType::UInt32 => Ok("I".to_string()), + DataType::Int64 => Ok("l".to_string()), + DataType::UInt64 => Ok("L".to_string()), + DataType::Float16 => Ok("e".to_string()), + DataType::Float32 => Ok("f".to_string()), + DataType::Float64 => Ok("g".to_string()), + DataType::Binary => Ok("z".to_string()), + DataType::LargeBinary => Ok("Z".to_string()), + DataType::Utf8 => Ok("u".to_string()), + DataType::LargeUtf8 => Ok("U".to_string()), + DataType::FixedSizeBinary(num_bytes) => Ok(format!("w:{num_bytes}")), + DataType::FixedSizeList(_, num_elems) => Ok(format!("+w:{num_elems}")), + DataType::Decimal128(precision, scale) => Ok(format!("d:{precision},{scale}")), + DataType::Decimal256(precision, scale) => { + Ok(format!("d:{precision},{scale},256")) + } + DataType::Date32 => Ok("tdD".to_string()), + DataType::Date64 => Ok("tdm".to_string()), + DataType::Time32(TimeUnit::Second) => Ok("tts".to_string()), + DataType::Time32(TimeUnit::Millisecond) => Ok("ttm".to_string()), + DataType::Time64(TimeUnit::Microsecond) => Ok("ttu".to_string()), + DataType::Time64(TimeUnit::Nanosecond) => Ok("ttn".to_string()), + DataType::Timestamp(TimeUnit::Second, None) => Ok("tss:".to_string()), + DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("tsm:".to_string()), + DataType::Timestamp(TimeUnit::Microsecond, None) => Ok("tsu:".to_string()), + DataType::Timestamp(TimeUnit::Nanosecond, None) => Ok("tsn:".to_string()), + DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(format!("tss:{tz}")), + DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(format!("tsm:{tz}")), + DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(format!("tsu:{tz}")), + DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(format!("tsn:{tz}")), + DataType::Duration(TimeUnit::Second) => Ok("tDs".to_string()), + DataType::Duration(TimeUnit::Millisecond) => Ok("tDm".to_string()), + DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".to_string()), + DataType::Duration(TimeUnit::Nanosecond) => Ok("tDn".to_string()), + DataType::List(_) => Ok("+l".to_string()), + DataType::LargeList(_) => Ok("+L".to_string()), + DataType::Struct(_) => Ok("+s".to_string()), + DataType::Map(_, _) => Ok("+m".to_string()), + DataType::Dictionary(key_data_type, _) => get_format_string(key_data_type), + DataType::Union(_, type_ids, mode) => { + let formats = type_ids.iter().map(|t| t.to_string()).collect::>(); + match mode { + UnionMode::Dense => Ok(format!("{}:{}", "+ud", formats.join(","))), + UnionMode::Sparse => Ok(format!("{}:{}", "+us", formats.join(","))), + } + } + other => Err(ArrowError::CDataInterface(format!( + "The datatype \"{other:?}\" is still not supported in Rust implementation" + ))), + } +} + +impl TryFrom<&Field> for FFI_ArrowSchema { + type Error = ArrowError; + + fn try_from(field: &Field) -> Result { + let mut flags = if field.is_nullable() { + Flags::NULLABLE + } else { + Flags::empty() + }; + + if let Some(true) = field.dict_is_ordered() { + 
flags |= Flags::DICTIONARY_ORDERED; + } + + FFI_ArrowSchema::try_from(field.data_type())? + .with_name(field.name())? + .with_flags(flags) + } +} + +impl TryFrom<&Schema> for FFI_ArrowSchema { + type Error = ArrowError; + + fn try_from(schema: &Schema) -> Result { + let dtype = DataType::Struct(schema.fields().clone()); + let c_schema = FFI_ArrowSchema::try_from(&dtype)?; + Ok(c_schema) + } +} + +impl TryFrom for FFI_ArrowSchema { + type Error = ArrowError; + + fn try_from(dtype: DataType) -> Result { + FFI_ArrowSchema::try_from(&dtype) + } +} + +impl TryFrom for FFI_ArrowSchema { + type Error = ArrowError; + + fn try_from(field: Field) -> Result { + FFI_ArrowSchema::try_from(&field) + } +} + +impl TryFrom for FFI_ArrowSchema { + type Error = ArrowError; + + fn try_from(schema: Schema) -> Result { + FFI_ArrowSchema::try_from(&schema) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn round_trip_type(dtype: DataType) { + let c_schema = FFI_ArrowSchema::try_from(&dtype).unwrap(); + let restored = DataType::try_from(&c_schema).unwrap(); + assert_eq!(restored, dtype); + } + + fn round_trip_field(field: Field) { + let c_schema = FFI_ArrowSchema::try_from(&field).unwrap(); + let restored = Field::try_from(&c_schema).unwrap(); + assert_eq!(restored, field); + } + + fn round_trip_schema(schema: Schema) { + let c_schema = FFI_ArrowSchema::try_from(&schema).unwrap(); + let restored = Schema::try_from(&c_schema).unwrap(); + assert_eq!(restored, schema); + } + + #[test] + fn test_type() { + round_trip_type(DataType::Int64); + round_trip_type(DataType::UInt64); + round_trip_type(DataType::Float64); + round_trip_type(DataType::Date64); + round_trip_type(DataType::Time64(TimeUnit::Nanosecond)); + round_trip_type(DataType::FixedSizeBinary(12)); + round_trip_type(DataType::FixedSizeList( + Box::new(Field::new("a", DataType::Int64, false)), + 5, + )); + round_trip_type(DataType::Utf8); + round_trip_type(DataType::List(Box::new(Field::new( + "a", + DataType::Int16, + false, + )))); + round_trip_type(DataType::Struct(vec![Field::new( + "a", + DataType::Utf8, + true, + )])); + } + + #[test] + fn test_field() { + let dtype = DataType::Struct(vec![Field::new("a", DataType::Utf8, true)]); + round_trip_field(Field::new("test", dtype, true)); + } + + #[test] + fn test_schema() { + let schema = Schema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("address", DataType::Utf8, false), + Field::new("priority", DataType::UInt8, false), + ]); + round_trip_schema(schema); + + // test that we can interpret struct types as schema + let dtype = DataType::Struct(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Int16, false), + ]); + let c_schema = FFI_ArrowSchema::try_from(&dtype).unwrap(); + let schema = Schema::try_from(&c_schema).unwrap(); + assert_eq!(schema.fields().len(), 2); + + // test that we assert the input type + let c_schema = FFI_ArrowSchema::try_from(&DataType::Float64).unwrap(); + let result = Schema::try_from(&c_schema); + assert!(result.is_err()); + } + + #[test] + fn test_map_keys_sorted() { + let keys = Field::new("keys", DataType::Int32, false); + let values = Field::new("values", DataType::UInt32, false); + let entry_struct = DataType::Struct(vec![keys, values]); + + // Construct a map array from the above two + let map_data_type = + DataType::Map(Box::new(Field::new("entries", entry_struct, true)), true); + + let arrow_schema = FFI_ArrowSchema::try_from(map_data_type).unwrap(); + assert!(arrow_schema.map_keys_sorted()); + } + + #[test] + fn 
test_dictionary_ordered() { + let schema = Schema::new(vec![Field::new_dict( + "dict", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + 0, + true, + )]); + + let arrow_schema = FFI_ArrowSchema::try_from(schema).unwrap(); + assert!(arrow_schema.child(0).dictionary_ordered()); + } +} diff --git a/arrow-schema/src/lib.rs b/arrow-schema/src/lib.rs index c2b1aba3b926..6bc2329dbd36 100644 --- a/arrow-schema/src/lib.rs +++ b/arrow-schema/src/lib.rs @@ -26,6 +26,9 @@ pub use field::*; mod schema; pub use schema::*; +#[cfg(feature = "ffi")] +pub mod ffi; + /// Options that define the sort order of a given column #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub struct SortOptions { diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 814ca14c8058..ef89e5a81232 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -62,7 +62,6 @@ arrow-string = { version = "33.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } comfy-table = { version = "6.0", optional = true, default-features = false } pyo3 = { version = "0.18", default-features = false, optional = true } -bitflags = { version = "1.2.1", default-features = false, optional = true } [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "dyn_cmp_dict", "dyn_arith_dict", "ffi", "pyarrow"] @@ -86,7 +85,7 @@ pyarrow = ["pyo3", "ffi"] # but is run as part of our CI checks force_validate = ["arrow-data/force_validate"] # Enable ffi support -ffi = ["bitflags"] +ffi = ["arrow-schema/ffi", "arrow-data/ffi"] # Enable dyn-comparison of dictionary arrays with other arrays # Note: this does not impact comparison against scalars dyn_cmp_dict = ["arrow-string/dyn_cmp_dict", "arrow-ord/dyn_cmp_dict"] diff --git a/arrow/src/datatypes/ffi.rs b/arrow/src/datatypes/ffi.rs index 58cad3d08a4e..b248758bc120 100644 --- a/arrow/src/datatypes/ffi.rs +++ b/arrow/src/datatypes/ffi.rs @@ -14,505 +14,3 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
- -use arrow_schema::UnionMode; -use std::convert::TryFrom; - -use crate::datatypes::DataType::Map; -use crate::{ - datatypes::{DataType, Field, Schema, TimeUnit}, - error::{ArrowError, Result}, - ffi::{FFI_ArrowSchema, Flags}, -}; - -impl TryFrom<&FFI_ArrowSchema> for DataType { - type Error = ArrowError; - - /// See [CDataInterface docs](https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings) - fn try_from(c_schema: &FFI_ArrowSchema) -> Result { - let mut dtype = match c_schema.format() { - "n" => DataType::Null, - "b" => DataType::Boolean, - "c" => DataType::Int8, - "C" => DataType::UInt8, - "s" => DataType::Int16, - "S" => DataType::UInt16, - "i" => DataType::Int32, - "I" => DataType::UInt32, - "l" => DataType::Int64, - "L" => DataType::UInt64, - "e" => DataType::Float16, - "f" => DataType::Float32, - "g" => DataType::Float64, - "z" => DataType::Binary, - "Z" => DataType::LargeBinary, - "u" => DataType::Utf8, - "U" => DataType::LargeUtf8, - "tdD" => DataType::Date32, - "tdm" => DataType::Date64, - "tts" => DataType::Time32(TimeUnit::Second), - "ttm" => DataType::Time32(TimeUnit::Millisecond), - "ttu" => DataType::Time64(TimeUnit::Microsecond), - "ttn" => DataType::Time64(TimeUnit::Nanosecond), - "tDs" => DataType::Duration(TimeUnit::Second), - "tDm" => DataType::Duration(TimeUnit::Millisecond), - "tDu" => DataType::Duration(TimeUnit::Microsecond), - "tDn" => DataType::Duration(TimeUnit::Nanosecond), - "+l" => { - let c_child = c_schema.child(0); - DataType::List(Box::new(Field::try_from(c_child)?)) - } - "+L" => { - let c_child = c_schema.child(0); - DataType::LargeList(Box::new(Field::try_from(c_child)?)) - } - "+s" => { - let fields = c_schema.children().map(Field::try_from); - DataType::Struct(fields.collect::>>()?) 
- } - "+m" => { - let c_child = c_schema.child(0); - let map_keys_sorted = c_schema.map_keys_sorted(); - DataType::Map(Box::new(Field::try_from(c_child)?), map_keys_sorted) - } - // Parametrized types, requiring string parse - other => { - match other.splitn(2, ':').collect::>().as_slice() { - // FixedSizeBinary type in format "w:num_bytes" - ["w", num_bytes] => { - let parsed_num_bytes = num_bytes.parse::().map_err(|_| { - ArrowError::CDataInterface( - "FixedSizeBinary requires an integer parameter representing number of bytes per element".to_string()) - })?; - DataType::FixedSizeBinary(parsed_num_bytes) - }, - // FixedSizeList type in format "+w:num_elems" - ["+w", num_elems] => { - let c_child = c_schema.child(0); - let parsed_num_elems = num_elems.parse::().map_err(|_| { - ArrowError::CDataInterface( - "The FixedSizeList type requires an integer parameter representing number of elements per list".to_string()) - })?; - DataType::FixedSizeList(Box::new(Field::try_from(c_child)?), parsed_num_elems) - }, - // Decimal types in format "d:precision,scale" or "d:precision,scale,bitWidth" - ["d", extra] => { - match extra.splitn(3, ',').collect::>().as_slice() { - [precision, scale] => { - let parsed_precision = precision.parse::().map_err(|_| { - ArrowError::CDataInterface( - "The decimal type requires an integer precision".to_string(), - ) - })?; - let parsed_scale = scale.parse::().map_err(|_| { - ArrowError::CDataInterface( - "The decimal type requires an integer scale".to_string(), - ) - })?; - DataType::Decimal128(parsed_precision, parsed_scale) - }, - [precision, scale, bits] => { - if *bits != "128" && *bits != "256" { - return Err(ArrowError::CDataInterface("Only 128/256 bit wide decimal is supported in the Rust implementation".to_string())); - } - let parsed_precision = precision.parse::().map_err(|_| { - ArrowError::CDataInterface( - "The decimal type requires an integer precision".to_string(), - ) - })?; - let parsed_scale = scale.parse::().map_err(|_| { - ArrowError::CDataInterface( - "The decimal type requires an integer scale".to_string(), - ) - })?; - if *bits == "128" { - DataType::Decimal128(parsed_precision, parsed_scale) - } else { - DataType::Decimal256(parsed_precision, parsed_scale) - } - } - _ => { - return Err(ArrowError::CDataInterface(format!( - "The decimal pattern \"d:{extra:?}\" is not supported in the Rust implementation" - ))) - } - } - } - // DenseUnion - ["+ud", extra] => { - let type_ids = extra.split(',').map(|t| t.parse::().map_err(|_| { - ArrowError::CDataInterface( - "The Union type requires an integer type id".to_string(), - ) - })).collect::>>()?; - let mut fields = Vec::with_capacity(type_ids.len()); - for idx in 0..c_schema.n_children { - let c_child = c_schema.child(idx as usize); - let field = Field::try_from(c_child)?; - fields.push(field); - } - - if fields.len() != type_ids.len() { - return Err(ArrowError::CDataInterface( - "The Union type requires same number of fields and type ids".to_string(), - )); - } - - DataType::Union(fields, type_ids, UnionMode::Dense) - } - // SparseUnion - ["+us", extra] => { - let type_ids = extra.split(',').map(|t| t.parse::().map_err(|_| { - ArrowError::CDataInterface( - "The Union type requires an integer type id".to_string(), - ) - })).collect::>>()?; - let mut fields = Vec::with_capacity(type_ids.len()); - for idx in 0..c_schema.n_children { - let c_child = c_schema.child(idx as usize); - let field = Field::try_from(c_child)?; - fields.push(field); - } - - if fields.len() != type_ids.len() { - return 
Err(ArrowError::CDataInterface( - "The Union type requires same number of fields and type ids".to_string(), - )); - } - - DataType::Union(fields, type_ids, UnionMode::Sparse) - } - - // Timestamps in format "tts:" and "tts:America/New_York" for no timezones and timezones resp. - ["tss", ""] => DataType::Timestamp(TimeUnit::Second, None), - ["tsm", ""] => DataType::Timestamp(TimeUnit::Millisecond, None), - ["tsu", ""] => DataType::Timestamp(TimeUnit::Microsecond, None), - ["tsn", ""] => DataType::Timestamp(TimeUnit::Nanosecond, None), - ["tss", tz] => { - DataType::Timestamp(TimeUnit::Second, Some(tz.to_string())) - } - ["tsm", tz] => { - DataType::Timestamp(TimeUnit::Millisecond, Some(tz.to_string())) - } - ["tsu", tz] => { - DataType::Timestamp(TimeUnit::Microsecond, Some(tz.to_string())) - } - ["tsn", tz] => { - DataType::Timestamp(TimeUnit::Nanosecond, Some(tz.to_string())) - } - _ => { - return Err(ArrowError::CDataInterface(format!( - "The datatype \"{other:?}\" is still not supported in Rust implementation" - ))) - } - } - } - }; - - if let Some(dict_schema) = c_schema.dictionary() { - let value_type = Self::try_from(dict_schema)?; - dtype = DataType::Dictionary(Box::new(dtype), Box::new(value_type)); - } - - Ok(dtype) - } -} - -impl TryFrom<&FFI_ArrowSchema> for Field { - type Error = ArrowError; - - fn try_from(c_schema: &FFI_ArrowSchema) -> Result { - let dtype = DataType::try_from(c_schema)?; - let field = Field::new(c_schema.name(), dtype, c_schema.nullable()); - Ok(field) - } -} - -impl TryFrom<&FFI_ArrowSchema> for Schema { - type Error = ArrowError; - - fn try_from(c_schema: &FFI_ArrowSchema) -> Result { - // interpret it as a struct type then extract its fields - let dtype = DataType::try_from(c_schema)?; - if let DataType::Struct(fields) = dtype { - Ok(Schema::new(fields)) - } else { - Err(ArrowError::CDataInterface( - "Unable to interpret C data struct as a Schema".to_string(), - )) - } - } -} - -impl TryFrom<&DataType> for FFI_ArrowSchema { - type Error = ArrowError; - - /// See [CDataInterface docs](https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings) - fn try_from(dtype: &DataType) -> Result { - let format = get_format_string(dtype)?; - // allocate and hold the children - let children = match dtype { - DataType::List(child) - | DataType::LargeList(child) - | DataType::FixedSizeList(child, _) - | DataType::Map(child, _) => { - vec![FFI_ArrowSchema::try_from(child.as_ref())?] - } - DataType::Union(fields, _, _) => fields - .iter() - .map(FFI_ArrowSchema::try_from) - .collect::>>()?, - DataType::Struct(fields) => fields - .iter() - .map(FFI_ArrowSchema::try_from) - .collect::>>()?, - _ => vec![], - }; - let dictionary = if let DataType::Dictionary(_, value_data_type) = dtype { - Some(Self::try_from(value_data_type.as_ref())?) 
- } else { - None - }; - - let flags = match dtype { - Map(_, true) => Flags::MAP_KEYS_SORTED, - _ => Flags::empty(), - }; - - FFI_ArrowSchema::try_new(&format, children, dictionary)?.with_flags(flags) - } -} - -fn get_format_string(dtype: &DataType) -> Result { - match dtype { - DataType::Null => Ok("n".to_string()), - DataType::Boolean => Ok("b".to_string()), - DataType::Int8 => Ok("c".to_string()), - DataType::UInt8 => Ok("C".to_string()), - DataType::Int16 => Ok("s".to_string()), - DataType::UInt16 => Ok("S".to_string()), - DataType::Int32 => Ok("i".to_string()), - DataType::UInt32 => Ok("I".to_string()), - DataType::Int64 => Ok("l".to_string()), - DataType::UInt64 => Ok("L".to_string()), - DataType::Float16 => Ok("e".to_string()), - DataType::Float32 => Ok("f".to_string()), - DataType::Float64 => Ok("g".to_string()), - DataType::Binary => Ok("z".to_string()), - DataType::LargeBinary => Ok("Z".to_string()), - DataType::Utf8 => Ok("u".to_string()), - DataType::LargeUtf8 => Ok("U".to_string()), - DataType::FixedSizeBinary(num_bytes) => Ok(format!("w:{num_bytes}")), - DataType::FixedSizeList(_, num_elems) => Ok(format!("+w:{num_elems}")), - DataType::Decimal128(precision, scale) => Ok(format!("d:{precision},{scale}")), - DataType::Decimal256(precision, scale) => { - Ok(format!("d:{precision},{scale},256")) - } - DataType::Date32 => Ok("tdD".to_string()), - DataType::Date64 => Ok("tdm".to_string()), - DataType::Time32(TimeUnit::Second) => Ok("tts".to_string()), - DataType::Time32(TimeUnit::Millisecond) => Ok("ttm".to_string()), - DataType::Time64(TimeUnit::Microsecond) => Ok("ttu".to_string()), - DataType::Time64(TimeUnit::Nanosecond) => Ok("ttn".to_string()), - DataType::Timestamp(TimeUnit::Second, None) => Ok("tss:".to_string()), - DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("tsm:".to_string()), - DataType::Timestamp(TimeUnit::Microsecond, None) => Ok("tsu:".to_string()), - DataType::Timestamp(TimeUnit::Nanosecond, None) => Ok("tsn:".to_string()), - DataType::Timestamp(TimeUnit::Second, Some(tz)) => Ok(format!("tss:{tz}")), - DataType::Timestamp(TimeUnit::Millisecond, Some(tz)) => Ok(format!("tsm:{tz}")), - DataType::Timestamp(TimeUnit::Microsecond, Some(tz)) => Ok(format!("tsu:{tz}")), - DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) => Ok(format!("tsn:{tz}")), - DataType::Duration(TimeUnit::Second) => Ok("tDs".to_string()), - DataType::Duration(TimeUnit::Millisecond) => Ok("tDm".to_string()), - DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".to_string()), - DataType::Duration(TimeUnit::Nanosecond) => Ok("tDn".to_string()), - DataType::List(_) => Ok("+l".to_string()), - DataType::LargeList(_) => Ok("+L".to_string()), - DataType::Struct(_) => Ok("+s".to_string()), - DataType::Map(_, _) => Ok("+m".to_string()), - DataType::Dictionary(key_data_type, _) => get_format_string(key_data_type), - DataType::Union(_, type_ids, mode) => { - let formats = type_ids.iter().map(|t| t.to_string()).collect::>(); - match mode { - UnionMode::Dense => Ok(format!("{}:{}", "+ud", formats.join(","))), - UnionMode::Sparse => Ok(format!("{}:{}", "+us", formats.join(","))), - } - } - other => Err(ArrowError::CDataInterface(format!( - "The datatype \"{other:?}\" is still not supported in Rust implementation" - ))), - } -} - -impl TryFrom<&Field> for FFI_ArrowSchema { - type Error = ArrowError; - - fn try_from(field: &Field) -> Result { - let mut flags = if field.is_nullable() { - Flags::NULLABLE - } else { - Flags::empty() - }; - - if let Some(true) = field.dict_is_ordered() { - flags |= 
Flags::DICTIONARY_ORDERED; - } - - FFI_ArrowSchema::try_from(field.data_type())? - .with_name(field.name())? - .with_flags(flags) - } -} - -impl TryFrom<&Schema> for FFI_ArrowSchema { - type Error = ArrowError; - - fn try_from(schema: &Schema) -> Result { - let dtype = DataType::Struct(schema.fields().clone()); - let c_schema = FFI_ArrowSchema::try_from(&dtype)?; - Ok(c_schema) - } -} - -impl TryFrom for FFI_ArrowSchema { - type Error = ArrowError; - - fn try_from(dtype: DataType) -> Result { - FFI_ArrowSchema::try_from(&dtype) - } -} - -impl TryFrom for FFI_ArrowSchema { - type Error = ArrowError; - - fn try_from(field: Field) -> Result { - FFI_ArrowSchema::try_from(&field) - } -} - -impl TryFrom for FFI_ArrowSchema { - type Error = ArrowError; - - fn try_from(schema: Schema) -> Result { - FFI_ArrowSchema::try_from(&schema) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::datatypes::{DataType, Field, TimeUnit}; - use crate::error::Result; - use std::convert::TryFrom; - - fn round_trip_type(dtype: DataType) -> Result<()> { - let c_schema = FFI_ArrowSchema::try_from(&dtype)?; - let restored = DataType::try_from(&c_schema)?; - assert_eq!(restored, dtype); - Ok(()) - } - - fn round_trip_field(field: Field) -> Result<()> { - let c_schema = FFI_ArrowSchema::try_from(&field)?; - let restored = Field::try_from(&c_schema)?; - assert_eq!(restored, field); - Ok(()) - } - - fn round_trip_schema(schema: Schema) -> Result<()> { - let c_schema = FFI_ArrowSchema::try_from(&schema)?; - let restored = Schema::try_from(&c_schema)?; - assert_eq!(restored, schema); - Ok(()) - } - - #[test] - fn test_type() -> Result<()> { - round_trip_type(DataType::Int64)?; - round_trip_type(DataType::UInt64)?; - round_trip_type(DataType::Float64)?; - round_trip_type(DataType::Date64)?; - round_trip_type(DataType::Time64(TimeUnit::Nanosecond))?; - round_trip_type(DataType::FixedSizeBinary(12))?; - round_trip_type(DataType::FixedSizeList( - Box::new(Field::new("a", DataType::Int64, false)), - 5, - ))?; - round_trip_type(DataType::Utf8)?; - round_trip_type(DataType::List(Box::new(Field::new( - "a", - DataType::Int16, - false, - ))))?; - round_trip_type(DataType::Struct(vec![Field::new( - "a", - DataType::Utf8, - true, - )]))?; - Ok(()) - } - - #[test] - fn test_field() -> Result<()> { - let dtype = DataType::Struct(vec![Field::new("a", DataType::Utf8, true)]); - round_trip_field(Field::new("test", dtype, true))?; - Ok(()) - } - - #[test] - fn test_schema() -> Result<()> { - let schema = Schema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("address", DataType::Utf8, false), - Field::new("priority", DataType::UInt8, false), - ]); - round_trip_schema(schema)?; - - // test that we can interpret struct types as schema - let dtype = DataType::Struct(vec![ - Field::new("a", DataType::Utf8, true), - Field::new("b", DataType::Int16, false), - ]); - let c_schema = FFI_ArrowSchema::try_from(&dtype)?; - let schema = Schema::try_from(&c_schema)?; - assert_eq!(schema.fields().len(), 2); - - // test that we assert the input type - let c_schema = FFI_ArrowSchema::try_from(&DataType::Float64)?; - let result = Schema::try_from(&c_schema); - assert!(result.is_err()); - Ok(()) - } - - #[test] - fn test_map_keys_sorted() -> Result<()> { - let keys = Field::new("keys", DataType::Int32, false); - let values = Field::new("values", DataType::UInt32, false); - let entry_struct = DataType::Struct(vec![keys, values]); - - // Construct a map array from the above two - let map_data_type = - 
DataType::Map(Box::new(Field::new("entries", entry_struct, true)), true); - - let arrow_schema = FFI_ArrowSchema::try_from(map_data_type)?; - assert!(arrow_schema.map_keys_sorted()); - - Ok(()) - } - - #[test] - fn test_dictionary_ordered() -> Result<()> { - let schema = Schema::new(vec![Field::new_dict( - "dict", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, - 0, - true, - )]); - - let arrow_schema = FFI_ArrowSchema::try_from(schema)?; - assert!(arrow_schema.child(0).dictionary_ordered()); - - Ok(()) - } -} diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 0f0f94c7a6b8..4d62b9e7cf61 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -104,19 +104,9 @@ To import an array, unsafely create an `ArrowArray` from two pointers using [Arr To export an array, create an `ArrowArray` using [ArrowArray::try_new]. */ -use std::{ - convert::TryFrom, - ffi::CStr, - ffi::CString, - iter, - mem::size_of, - os::raw::{c_char, c_void}, - ptr::{self, NonNull}, - sync::Arc, -}; +use std::{mem::size_of, ptr::NonNull, sync::Arc}; use arrow_schema::UnionMode; -use bitflags::bitflags; use crate::array::{layout, ArrayData}; use crate::buffer::{Buffer, MutableBuffer}; @@ -124,194 +114,11 @@ use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; use crate::util::bit_util; -bitflags! { - pub struct Flags: i64 { - const DICTIONARY_ORDERED = 0b00000001; - const NULLABLE = 0b00000010; - const MAP_KEYS_SORTED = 0b00000100; - } -} - -/// ABI-compatible struct for `ArrowSchema` from C Data Interface -/// See -/// -/// ``` -/// # use arrow::ffi::FFI_ArrowSchema; -/// # use arrow_data::ArrayData; -/// fn array_schema(data: &ArrayData) -> FFI_ArrowSchema { -/// FFI_ArrowSchema::try_from(data.data_type()).unwrap() -/// } -/// ``` -/// -#[repr(C)] -#[derive(Debug)] -pub struct FFI_ArrowSchema { - pub(crate) format: *const c_char, - pub(crate) name: *const c_char, - pub(crate) metadata: *const c_char, - pub(crate) flags: i64, - pub(crate) n_children: i64, - pub(crate) children: *mut *mut FFI_ArrowSchema, - pub(crate) dictionary: *mut FFI_ArrowSchema, - pub(crate) release: Option, - pub(crate) private_data: *mut c_void, -} - -struct SchemaPrivateData { - children: Box<[*mut FFI_ArrowSchema]>, - dictionary: *mut FFI_ArrowSchema, -} - -// callback used to drop [FFI_ArrowSchema] when it is exported. -unsafe extern "C" fn release_schema(schema: *mut FFI_ArrowSchema) { - if schema.is_null() { - return; - } - let schema = &mut *schema; - - // take ownership back to release it. - drop(CString::from_raw(schema.format as *mut c_char)); - if !schema.name.is_null() { - drop(CString::from_raw(schema.name as *mut c_char)); - } - if !schema.private_data.is_null() { - let private_data = Box::from_raw(schema.private_data as *mut SchemaPrivateData); - for child in private_data.children.iter() { - drop(Box::from_raw(*child)) - } - if !private_data.dictionary.is_null() { - drop(Box::from_raw(private_data.dictionary)); - } - - drop(private_data); - } - - schema.release = None; -} - -impl FFI_ArrowSchema { - /// create a new [`FFI_ArrowSchema`]. This fails if the fields' - /// [`DataType`] is not supported. 
- pub fn try_new( - format: &str, - children: Vec, - dictionary: Option, - ) -> Result { - let mut this = Self::empty(); - - let children_ptr = children - .into_iter() - .map(Box::new) - .map(Box::into_raw) - .collect::>(); - - this.format = CString::new(format).unwrap().into_raw(); - this.release = Some(release_schema); - this.n_children = children_ptr.len() as i64; - - let dictionary_ptr = dictionary - .map(|d| Box::into_raw(Box::new(d))) - .unwrap_or(std::ptr::null_mut()); - - let mut private_data = Box::new(SchemaPrivateData { - children: children_ptr, - dictionary: dictionary_ptr, - }); - - // intentionally set from private_data (see https://github.com/apache/arrow-rs/issues/580) - this.children = private_data.children.as_mut_ptr(); - - this.dictionary = dictionary_ptr; - - this.private_data = Box::into_raw(private_data) as *mut c_void; - - Ok(this) - } - - pub fn with_name(mut self, name: &str) -> Result { - self.name = CString::new(name).unwrap().into_raw(); - Ok(self) - } - - pub fn with_flags(mut self, flags: Flags) -> Result { - self.flags = flags.bits(); - Ok(self) - } - - pub fn empty() -> Self { - Self { - format: std::ptr::null_mut(), - name: std::ptr::null_mut(), - metadata: std::ptr::null_mut(), - flags: 0, - n_children: 0, - children: ptr::null_mut(), - dictionary: std::ptr::null_mut(), - release: None, - private_data: std::ptr::null_mut(), - } - } - - /// returns the format of this schema. - pub fn format(&self) -> &str { - assert!(!self.format.is_null()); - // safe because the lifetime of `self.format` equals `self` - unsafe { CStr::from_ptr(self.format) } - .to_str() - .expect("The external API has a non-utf8 as format") - } - - /// returns the name of this schema. - pub fn name(&self) -> &str { - assert!(!self.name.is_null()); - // safe because the lifetime of `self.name` equals `self` - unsafe { CStr::from_ptr(self.name) } - .to_str() - .expect("The external API has a non-utf8 as name") - } - - pub fn flags(&self) -> Option { - Flags::from_bits(self.flags) - } - - pub fn child(&self, index: usize) -> &Self { - assert!(index < self.n_children as usize); - unsafe { self.children.add(index).as_ref().unwrap().as_ref().unwrap() } - } - - pub fn children(&self) -> impl Iterator { - (0..self.n_children as usize).map(move |i| self.child(i)) - } - - pub fn nullable(&self) -> bool { - (self.flags / 2) & 1 == 1 - } - - pub fn dictionary(&self) -> Option<&Self> { - unsafe { self.dictionary.as_ref() } - } - - pub fn map_keys_sorted(&self) -> bool { - self.flags & 0b00000100 != 0 - } - - pub fn dictionary_ordered(&self) -> bool { - self.flags & 0b00000001 != 0 - } -} - -impl Drop for FFI_ArrowSchema { - fn drop(&mut self) { - match self.release { - None => (), - Some(release) => unsafe { release(self) }, - }; - } -} +pub use arrow_data::ffi::FFI_ArrowArray; +pub use arrow_schema::ffi::{FFI_ArrowSchema, Flags}; // returns the number of bits that buffer `i` (in the C data interface) is expected to have. // This is set by the Arrow specification -#[allow(clippy::manual_bits)] fn bit_width(data_type: &DataType, i: usize) -> Result { if let Some(primitive) = data_type.primitive_width() { return match i { @@ -332,7 +139,7 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { "The datatype \"{data_type:?}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." 
))) } - (DataType::FixedSizeBinary(num_bytes), 1) => size_of::() * (*num_bytes as usize) * 8, + (DataType::FixedSizeBinary(num_bytes), 1) => *num_bytes as usize * u8::BITS as usize, (DataType::FixedSizeList(f, num_elems), 1) => { let child_bit_width = bit_width(f.data_type(), 1)?; child_bit_width * (*num_elems as usize) @@ -345,8 +152,8 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { // Variable-size list and map have one i32 buffer. // Variable-sized binaries: have two buffers. // "small": first buffer is i32, second is in bytes - (DataType::Utf8, 1) | (DataType::Binary, 1) | (DataType::List(_), 1) | (DataType::Map(_, _), 1) => size_of::() * 8, - (DataType::Utf8, 2) | (DataType::Binary, 2) => size_of::() * 8, + (DataType::Utf8, 1) | (DataType::Binary, 1) | (DataType::List(_), 1) | (DataType::Map(_, _), 1) => i32::BITS as _, + (DataType::Utf8, 2) | (DataType::Binary, 2) => u8::BITS as _, (DataType::List(_), _) | (DataType::Map(_, _), _) => { return Err(ArrowError::CDataInterface(format!( "The datatype \"{data_type:?}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." @@ -359,17 +166,17 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { } // Variable-sized binaries: have two buffers. // LargeUtf8: first buffer is i64, second is in bytes - (DataType::LargeUtf8, 1) | (DataType::LargeBinary, 1) | (DataType::LargeList(_), 1) => size_of::() * 8, - (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) | (DataType::LargeList(_), 2)=> size_of::() * 8, + (DataType::LargeUtf8, 1) | (DataType::LargeBinary, 1) | (DataType::LargeList(_), 1) => i64::BITS as _, + (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) | (DataType::LargeList(_), 2)=> u8::BITS as _, (DataType::LargeUtf8, _) | (DataType::LargeBinary, _) | (DataType::LargeList(_), _)=> { return Err(ArrowError::CDataInterface(format!( "The datatype \"{data_type:?}\" expects 3 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." ))) } // type ids. UnionArray doesn't have null bitmap so buffer index begins with 0. - (DataType::Union(_, _, _), 0) => size_of::() * 8, + (DataType::Union(_, _, _), 0) => i8::BITS as _, // Only DenseUnion has 2nd buffer - (DataType::Union(_, _, UnionMode::Dense), 1) => size_of::() * 8, + (DataType::Union(_, _, UnionMode::Dense), 1) => i32::BITS as _, (DataType::Union(_, _, UnionMode::Sparse), _) => { return Err(ArrowError::CDataInterface(format!( "The datatype \"{data_type:?}\" expects 1 buffer, but requested {i}. Please verify that the C data interface is correctly implemented." @@ -395,190 +202,6 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { }) } -/// ABI-compatible struct for ArrowArray from C Data Interface -/// See -/// -/// ``` -/// # use arrow::ffi::FFI_ArrowArray; -/// # use arrow_array::Array; -/// fn export_array(array: &dyn Array) -> FFI_ArrowArray { -/// FFI_ArrowArray::new(array.data()) -/// } -/// ``` -#[repr(C)] -#[derive(Debug)] -pub struct FFI_ArrowArray { - pub(crate) length: i64, - pub(crate) null_count: i64, - pub(crate) offset: i64, - pub(crate) n_buffers: i64, - pub(crate) n_children: i64, - pub(crate) buffers: *mut *const c_void, - pub(crate) children: *mut *mut FFI_ArrowArray, - pub(crate) dictionary: *mut FFI_ArrowArray, - pub(crate) release: Option, - // When exported, this MUST contain everything that is owned by this array. - // for example, any buffer pointed to in `buffers` must be here, as well - // as the `buffers` pointer itself. 
- // In other words, everything in [FFI_ArrowArray] must be owned by - // `private_data` and can assume that they do not outlive `private_data`. - pub(crate) private_data: *mut c_void, -} - -impl Drop for FFI_ArrowArray { - fn drop(&mut self) { - match self.release { - None => (), - Some(release) => unsafe { release(self) }, - }; - } -} - -unsafe impl Send for FFI_ArrowArray {} -unsafe impl Sync for FFI_ArrowArray {} - -// callback used to drop [FFI_ArrowArray] when it is exported -unsafe extern "C" fn release_array(array: *mut FFI_ArrowArray) { - if array.is_null() { - return; - } - let array = &mut *array; - - // take ownership of `private_data`, therefore dropping it` - let private = Box::from_raw(array.private_data as *mut ArrayPrivateData); - for child in private.children.iter() { - let _ = Box::from_raw(*child); - } - if !private.dictionary.is_null() { - let _ = Box::from_raw(private.dictionary); - } - - array.release = None; -} - -struct ArrayPrivateData { - #[allow(dead_code)] - buffers: Vec>, - buffers_ptr: Box<[*const c_void]>, - children: Box<[*mut FFI_ArrowArray]>, - dictionary: *mut FFI_ArrowArray, -} - -impl FFI_ArrowArray { - /// creates a new `FFI_ArrowArray` from existing data. - /// # Memory Leaks - /// This method releases `buffers`. Consumers of this struct *must* call `release` before - /// releasing this struct, or contents in `buffers` leak. - pub fn new(data: &ArrayData) -> Self { - let data_layout = layout(data.data_type()); - - let buffers = if data_layout.can_contain_null_mask { - // * insert the null buffer at the start - // * make all others `Option`. - iter::once(data.null_buffer().cloned()) - .chain(data.buffers().iter().map(|b| Some(b.clone()))) - .collect::>() - } else { - data.buffers().iter().map(|b| Some(b.clone())).collect() - }; - - // `n_buffers` is the number of buffers by the spec. - let n_buffers = { - data_layout.buffers.len() + { - // If the layout has a null buffer by Arrow spec. - // Note that even the array doesn't have a null buffer because it has - // no null value, we still need to count 1 here to follow the spec. - usize::from(data_layout.can_contain_null_mask) - } - } as i64; - - let buffers_ptr = buffers - .iter() - .flat_map(|maybe_buffer| match maybe_buffer { - // note that `raw_data` takes into account the buffer's offset - Some(b) => Some(b.as_ptr() as *const c_void), - // This is for null buffer. We only put a null pointer for - // null buffer if by spec it can contain null mask. - None if data_layout.can_contain_null_mask => Some(std::ptr::null()), - None => None, - }) - .collect::>(); - - let empty = vec![]; - let (child_data, dictionary) = match data.data_type() { - DataType::Dictionary(_, _) => ( - empty.as_slice(), - Box::into_raw(Box::new(FFI_ArrowArray::new(&data.child_data()[0]))), - ), - _ => (data.child_data(), std::ptr::null_mut()), - }; - - let children = child_data - .iter() - .map(|child| Box::into_raw(Box::new(FFI_ArrowArray::new(child)))) - .collect::>(); - let n_children = children.len() as i64; - - // create the private data owning everything. - // any other data must be added here, e.g. via a struct, to track lifetime. 
- let mut private_data = Box::new(ArrayPrivateData { - buffers, - buffers_ptr, - children, - dictionary, - }); - - Self { - length: data.len() as i64, - null_count: data.null_count() as i64, - offset: data.offset() as i64, - n_buffers, - n_children, - buffers: private_data.buffers_ptr.as_mut_ptr(), - children: private_data.children.as_mut_ptr(), - dictionary, - release: Some(release_array), - private_data: Box::into_raw(private_data) as *mut c_void, - } - } - - /// create an empty `FFI_ArrowArray`, which can be used to import data into - pub fn empty() -> Self { - Self { - length: 0, - null_count: 0, - offset: 0, - n_buffers: 0, - n_children: 0, - buffers: std::ptr::null_mut(), - children: std::ptr::null_mut(), - dictionary: std::ptr::null_mut(), - release: None, - private_data: std::ptr::null_mut(), - } - } - - /// the length of the array - pub fn len(&self) -> usize { - self.length as usize - } - - /// whether the array is empty - pub fn is_empty(&self) -> bool { - self.length == 0 - } - - /// the offset of the array - pub fn offset(&self) -> usize { - self.offset as usize - } - - /// the null count of the array - pub fn null_count(&self) -> usize { - self.null_count as usize - } -} - /// returns a new buffer corresponding to the index `i` of the FFI array. It may not exist (null pointer). /// `bits` is the number of bits that the native type of this buffer has. /// The size of the buffer will be `ceil(self.length * bits, 8)`. @@ -592,38 +215,13 @@ unsafe fn create_buffer( index: usize, len: usize, ) -> Option { - if array.buffers.is_null() || array.n_buffers == 0 { + if array.num_buffers() == 0 { return None; } - let buffers = array.buffers as *mut *const u8; - - assert!(index < array.n_buffers as usize); - let ptr = *buffers.add(index); - - NonNull::new(ptr as *mut u8) + NonNull::new(array.buffer(index) as _) .map(|ptr| Buffer::from_custom_allocation(ptr, len, owner)) } -fn create_child( - owner: Arc, - array: &FFI_ArrowArray, - schema: &FFI_ArrowSchema, - index: usize, -) -> ArrowArrayChild<'static> { - assert!(index < array.n_children as usize); - assert!(!array.children.is_null()); - assert!(!array.children.is_null()); - unsafe { - let arr_ptr = *array.children.add(index); - let schema_ptr = *schema.children.add(index); - assert!(!arr_ptr.is_null()); - assert!(!schema_ptr.is_null()); - let arr_ptr = &*arr_ptr; - let schema_ptr = &*schema_ptr; - ArrowArrayChild::from_raw(arr_ptr, schema_ptr, owner) - } -} - pub trait ArrowArrayRef { fn to_data(&self) -> Result { let data_type = self.data_type()?; @@ -640,7 +238,7 @@ pub trait ArrowArrayRef { None }; - let mut child_data: Vec = (0..self.array().n_children as usize) + let mut child_data: Vec = (0..self.array().num_children()) .map(|i| { let child = self.child(i); child.to_data() @@ -673,11 +271,9 @@ pub trait ArrowArrayRef { /// in the spec of the type) fn buffers(&self, can_contain_null_mask: bool) -> Result> { // + 1: skip null buffer - let buffer_begin = can_contain_null_mask as i64; - (buffer_begin..self.array().n_buffers) + let buffer_begin = can_contain_null_mask as usize; + (buffer_begin..self.array().num_buffers()) .map(|index| { - let index = index as usize; - let len = self.buffer_len(index)?; match unsafe { @@ -711,7 +307,7 @@ pub trait ArrowArrayRef { // `ffi::ArrowArray` records array offset, we need to add it back to the // buffer length to get the actual buffer length. 
- let length = self.array().length as usize + self.array().offset as usize; + let length = self.array().len() + self.array().offset(); // Inner type is not important for buffer length. Ok(match (&data_type, i) { @@ -733,9 +329,7 @@ pub trait ArrowArrayRef { // first buffer is the null buffer => add(1) // we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets. #[allow(clippy::cast_ptr_alignment)] - let offset_buffer = unsafe { - *(self.array().buffers as *mut *const u8).add(1) as *const i32 - }; + let offset_buffer = self.array().buffer(1) as *const i32; // get last offset (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize } @@ -745,9 +339,7 @@ pub trait ArrowArrayRef { // first buffer is the null buffer => add(1) // we assume that pointer is aligned for `i64`, as Large uses `i64` offsets. #[allow(clippy::cast_ptr_alignment)] - let offset_buffer = unsafe { - *(self.array().buffers as *mut *const u8).add(1) as *const i64 - }; + let offset_buffer = self.array().buffer(1) as *const i64; // get last offset (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize } @@ -766,14 +358,18 @@ pub trait ArrowArrayRef { // similar to `self.buffer_len(0)`, but without `Result`. // `ffi::ArrowArray` records array offset, we need to add it back to the // buffer length to get the actual buffer length. - let length = self.array().length as usize + self.array().offset as usize; + let length = self.array().len() + self.array().offset(); let buffer_len = bit_util::ceil(length, 8); unsafe { create_buffer(self.owner().clone(), self.array(), 0, buffer_len) } } fn child(&self, index: usize) -> ArrowArrayChild { - create_child(self.owner().clone(), self.array(), self.schema(), index) + ArrowArrayChild { + array: self.array().child(index), + schema: self.schema().child(index), + owner: self.owner(), + } } fn owner(&self) -> &Arc; @@ -781,18 +377,14 @@ pub trait ArrowArrayRef { fn schema(&self) -> &FFI_ArrowSchema; fn data_type(&self) -> Result; fn dictionary(&self) -> Option { - unsafe { - assert!(!(self.array().dictionary.is_null() ^ self.schema().dictionary.is_null()), - "Dictionary should both be set or not set in FFI_ArrowArray and FFI_ArrowSchema"); - if !self.array().dictionary.is_null() { - Some(ArrowArrayChild::from_raw( - &*self.array().dictionary, - &*self.schema().dictionary, - self.owner().clone(), - )) - } else { - None - } + match (self.array().dictionary(), self.schema().dictionary()) { + (Some(array), Some(schema)) => Some(ArrowArrayChild { + array, + schema, + owner: self.owner(), + }), + (None, None) => None, + _ => panic!("Dictionary should both be set or not set in FFI_ArrowArray and FFI_ArrowSchema") } } } @@ -827,7 +419,7 @@ pub struct ArrowArray { pub struct ArrowArrayChild<'a> { array: &'a FFI_ArrowArray, schema: &'a FFI_ArrowSchema, - owner: Arc, + owner: &'a Arc, } impl ArrowArrayRef for ArrowArray { @@ -864,7 +456,7 @@ impl<'a> ArrowArrayRef for ArrowArrayChild<'a> { } fn owner(&self) -> &Arc { - &self.owner + self.owner } } @@ -936,20 +528,6 @@ impl ArrowArray { } } -impl<'a> ArrowArrayChild<'a> { - fn from_raw( - array: &'a FFI_ArrowArray, - schema: &'a FFI_ArrowSchema, - owner: Arc, - ) -> Self { - Self { - array, - schema, - owner, - } - } -} - #[cfg(test)] mod tests { use super::*; @@ -957,7 +535,7 @@ mod tests { export_array_into_raw, make_array, Array, ArrayData, BooleanArray, Decimal128Array, DictionaryArray, DurationSecondArray, FixedSizeBinaryArray, FixedSizeListArray, GenericBinaryArray, GenericListArray, GenericStringArray, - Int32Array, 
MapArray, NullArray, OffsetSizeTrait, Time32MillisecondArray, + Int32Array, MapArray, OffsetSizeTrait, Time32MillisecondArray, TimestampMillisecondArray, UInt32Array, }; use crate::compute::kernels; @@ -1004,8 +582,9 @@ mod tests { // We can read them back to memory // SAFETY: // Pointers are aligned and valid - let array = - unsafe { ArrowArray::new(ptr::read(array_ptr), ptr::read(schema_ptr)) }; + let array = unsafe { + ArrowArray::new(std::ptr::read(array_ptr), std::ptr::read(schema_ptr)) + }; let array = Int32Array::from(ArrayData::try_from(array).unwrap()); assert_eq!(array, Int32Array::from(vec![1, 2, 3])); @@ -1526,24 +1105,6 @@ mod tests { Ok(()) } - #[test] - fn null_array_n_buffers() -> Result<()> { - let array = NullArray::new(10); - let data = array.data(); - - let ffi_array = FFI_ArrowArray::new(data); - assert_eq!(0, ffi_array.n_buffers); - - let private_data = - unsafe { Box::from_raw(ffi_array.private_data as *mut ArrayPrivateData) }; - - assert_eq!(0, private_data.buffers_ptr.len()); - - Box::into_raw(private_data); - - Ok(()) - } - #[test] fn test_map_array() -> Result<()> { let keys = vec!["a", "b", "c", "d", "e", "f", "g", "h"]; diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index 4313eaaaf34f..b1046d142f32 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -60,6 +60,7 @@ //! } //! ``` +use std::ptr::addr_of; use std::{ convert::TryFrom, ffi::CString, @@ -203,11 +204,11 @@ impl ExportedArrayStream { let schema = FFI_ArrowSchema::try_from(reader.schema().as_ref()); match schema { - Ok(mut schema) => unsafe { - std::ptr::copy(&schema as *const FFI_ArrowSchema, out, 1); - schema.release = None; + Ok(schema) => { + unsafe { std::ptr::copy(addr_of!(schema), out, 1) }; + std::mem::forget(schema); 0 - }, + } Err(ref err) => { private_data.last_error = err.to_string(); get_error_code(err) @@ -222,21 +223,17 @@ impl ExportedArrayStream { let ret_code = match reader.next() { None => { // Marks ArrowArray released to indicate reaching the end of stream. 
- unsafe { - (*out).release = None; - } + unsafe { std::ptr::write(out, FFI_ArrowArray::empty()) } 0 } Some(next_batch) => { if let Ok(batch) = next_batch { let struct_array = StructArray::from(batch); - let mut array = FFI_ArrowArray::new(struct_array.data()); + let array = FFI_ArrowArray::new(struct_array.data()); - unsafe { - std::ptr::copy(&array as *const FFI_ArrowArray, out, 1); - array.release = None; - 0 - } + unsafe { std::ptr::copy(addr_of!(array), out, 1) }; + std::mem::forget(array); + 0 } else { let err = &next_batch.unwrap_err(); private_data.last_error = err.to_string(); @@ -362,7 +359,9 @@ impl Iterator for ArrowArrayStreamReader { let ffi_array = unsafe { Arc::from_raw(array_ptr) }; // The end of stream has been reached - ffi_array.release?; + if ffi_array.is_released() { + return None; + } let schema_ref = self.schema(); let schema = FFI_ArrowSchema::try_from(schema_ref.as_ref()).ok()?; @@ -482,7 +481,7 @@ mod tests { // The end of stream has been reached let ffi_array = unsafe { Arc::from_raw(array_ptr) }; - if ffi_array.release.is_none() { + if ffi_array.is_released() { break; } From e572a458b777b52c6f7a2876f2c15f42a5df5303 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 10 Feb 2023 19:49:04 +0000 Subject: [PATCH 0597/1411] Fix Unsound Binary Casting in Unreleased Arrow (#3691) (#3692) * Fix binary casting (#3691) * Clippy * More clippy * Update test --- arrow-array/src/array/string_array.rs | 51 +++++++++++------- arrow-cast/src/cast.rs | 77 ++++++++++++--------------- arrow-row/src/lib.rs | 2 +- 3 files changed, 69 insertions(+), 61 deletions(-) diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index cb401540d292..2ff1118bc798 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -21,7 +21,7 @@ use crate::{ }; use arrow_buffer::{bit_util, MutableBuffer}; use arrow_data::ArrayData; -use arrow_schema::DataType; +use arrow_schema::{ArrowError, DataType}; /// Generic struct for \[Large\]StringArray /// @@ -99,6 +99,34 @@ impl GenericStringArray { ) -> impl Iterator> + 'a { indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index))) } + + /// Fallibly creates a [`GenericStringArray`] from a [`GenericBinaryArray`] returning + /// an error if [`GenericBinaryArray`] contains invalid UTF-8 data + pub fn try_from_binary( + v: GenericBinaryArray, + ) -> Result { + let offsets = v.value_offsets(); + let values = v.value_data(); + + // We only need to validate that all values are valid UTF-8 + let validated = std::str::from_utf8(values).map_err(|e| { + ArrowError::CastError(format!("Encountered non UTF-8 data: {e}")) + })?; + + for offset in offsets.iter() { + let o = offset.as_usize(); + if !validated.is_char_boundary(o) { + return Err(ArrowError::CastError(format!( + "Split UTF-8 codepoint at offset {o}" + ))); + } + } + + let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE); + // SAFETY: + // Validated UTF-8 above + Ok(Self::from(unsafe { builder.build_unchecked() })) + } } impl<'a, Ptr, OffsetSize: OffsetSizeTrait> FromIterator<&'a Option> @@ -172,22 +200,7 @@ impl From> for GenericStringArray { fn from(v: GenericBinaryArray) -> Self { - let offsets = v.value_offsets(); - let values = v.value_data(); - - // We only need to validate that all values are valid UTF-8 - let validated = std::str::from_utf8(values).expect("Invalid UTF-8 sequence"); - for offset in offsets.iter() { - assert!( - 
validated.is_char_boundary(offset.as_usize()), - "Invalid UTF-8 sequence" - ) - } - - let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE); - // SAFETY: - // Validated UTF-8 above - Self::from(unsafe { builder.build_unchecked() }) + Self::try_from_binary(v).unwrap() } } @@ -650,7 +663,9 @@ mod tests { } #[test] - #[should_panic(expected = "Invalid UTF-8 sequence: Utf8Error")] + #[should_panic( + expected = "Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 0" + )] fn test_list_array_utf8_validation() { let mut builder = ListBuilder::new(PrimitiveBuilder::::new()); builder.values().append_value(0xFF); diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 1631f2e0040f..49461b14c339 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -3202,49 +3202,25 @@ fn cast_binary_to_string( .downcast_ref::>>() .unwrap(); - if !cast_options.safe { - let offsets = array.value_offsets(); - let values = array.value_data(); - - // We only need to validate that all values are valid UTF-8 - let validated = std::str::from_utf8(values) - .map_err(|_| ArrowError::CastError("Invalid UTF-8 sequence".to_string()))?; - // Checks if the offsets are valid but does not re-encode - for offset in offsets.iter() { - if !validated.is_char_boundary(offset.as_usize()) { - return Err(ArrowError::CastError("Invalid UTF-8 sequence".to_string())); + match GenericStringArray::::try_from_binary(array.clone()) { + Ok(a) => Ok(Arc::new(a)), + Err(e) => match cast_options.safe { + true => { + // Fallback to slow method to convert invalid sequences to nulls + let mut builder = GenericStringBuilder::::with_capacity( + array.len(), + array.value_data().len(), + ); + + let iter = array + .iter() + .map(|v| v.and_then(|v| std::str::from_utf8(v).ok())); + + builder.extend(iter); + Ok(Arc::new(builder.finish())) } - } - - let builder = array - .into_data() - .into_builder() - .data_type(GenericStringArray::::DATA_TYPE); - // SAFETY: - // Validated UTF-8 above - Ok(Arc::new(GenericStringArray::::from(unsafe { - builder.build_unchecked() - }))) - } else { - let mut null_builder = BooleanBufferBuilder::new(array.len()); - array.iter().for_each(|maybe_value| { - null_builder.append( - maybe_value - .and_then(|value| std::str::from_utf8(value).ok()) - .is_some(), - ); - }); - - let builder = array - .into_data() - .into_builder() - .null_bit_buffer(Some(null_builder.finish())) - .data_type(GenericStringArray::::DATA_TYPE); - // SAFETY: - // Validated UTF-8 above - Ok(Arc::new(GenericStringArray::::from(unsafe { - builder.build_unchecked() - }))) + false => Err(e), + }, } } @@ -7588,4 +7564,21 @@ mod tests { test_tz("+00:00".to_owned()); test_tz("+02:00".to_owned()); } + + #[test] + fn test_cast_invalid_utf8() { + let v1: &[u8] = b"\xFF invalid"; + let v2: &[u8] = b"\x00 Foo"; + let s = BinaryArray::from(vec![v1, v2]); + let options = CastOptions { safe: true }; + let array = cast_with_options(&s, &DataType::Utf8, &options).unwrap(); + let a = as_string_array(array.as_ref()); + a.data().validate_full().unwrap(); + + assert_eq!(a.null_count(), 1); + assert_eq!(a.len(), 2); + assert!(a.is_null(0)); + assert_eq!(a.value(0), ""); + assert_eq!(a.value(1), "\x00 Foo"); + } } diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 1d54a008f36b..2e489c974750 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1734,7 +1734,7 @@ mod tests { } #[test] - #[should_panic(expected = "Invalid UTF-8 sequence")] + #[should_panic(expected = "Encountered non UTF-8 data")] fn 
test_invalid_utf8() { let mut converter = RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap(); From 526100928d62e1c16ac41bbef9b966ac59b3324a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 10 Feb 2023 19:49:57 +0000 Subject: [PATCH 0598/1411] Final release tweaks for 33.0.0 (#3688) --- CHANGELOG.md | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4676edd3e0df..66cc9104b0ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ # Changelog -## [33.0.0](https://github.com/apache/arrow-rs/tree/33.0.0) (2023-02-09) +## [33.0.0](https://github.com/apache/arrow-rs/tree/33.0.0) (2023-02-10) [Full Changelog](https://github.com/apache/arrow-rs/compare/32.0.0...33.0.0) @@ -33,40 +33,47 @@ **Implemented enhancements:** -- object\_store: support encoded path as input [\#3651](https://github.com/apache/arrow-rs/issues/3651) -- Add modulus\_dyn and modulus\_scalar\_dyn [\#3648](https://github.com/apache/arrow-rs/issues/3648) +- Support UTF8 cast to Timestamp with timezone [\#3664](https://github.com/apache/arrow-rs/issues/3664) +- Add modulus\_dyn and modulus\_scalar\_dyn [\#3648](https://github.com/apache/arrow-rs/issues/3648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - A trait for append\_value and append\_null on ArrayBuilders [\#3644](https://github.com/apache/arrow-rs/issues/3644) -- Improve error messge "batches\[0\] schema is different with argument schema" [\#3628](https://github.com/apache/arrow-rs/issues/3628) -- Specified version of helper function to cast binary to string [\#3623](https://github.com/apache/arrow-rs/issues/3623) +- Improve error messge "batches\[0\] schema is different with argument schema" [\#3628](https://github.com/apache/arrow-rs/issues/3628) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Specified version of helper function to cast binary to string [\#3623](https://github.com/apache/arrow-rs/issues/3623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Casting generic binary to generic string [\#3606](https://github.com/apache/arrow-rs/issues/3606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use `array_value_to_string` in `arrow-csv` [\#3483](https://github.com/apache/arrow-rs/issues/3483) +- Use `array_value_to_string` in `arrow-csv` [\#3483](https://github.com/apache/arrow-rs/issues/3483) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- ArrowArray::try\_from\_raw Misleading Signature [\#3684](https://github.com/apache/arrow-rs/issues/3684) -- PyArrowConvert Leaks Memory [\#3683](https://github.com/apache/arrow-rs/issues/3683) -- FFI Fails to Account For Offsets [\#3671](https://github.com/apache/arrow-rs/issues/3671) -- Regression in CSV reader error handling [\#3656](https://github.com/apache/arrow-rs/issues/3656) -- UnionArray Child and Value Fail to Account for non-contiguous Type IDs [\#3653](https://github.com/apache/arrow-rs/issues/3653) -- Panic when accessing RecordBatch from pyarrow [\#3646](https://github.com/apache/arrow-rs/issues/3646) +- ArrowArray::try\_from\_raw Misleading Signature [\#3684](https://github.com/apache/arrow-rs/issues/3684) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- PyArrowConvert Leaks Memory [\#3683](https://github.com/apache/arrow-rs/issues/3683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Arrow-csv reader cannot produce RecordBatch even if 
the bytes are necessary [\#3674](https://github.com/apache/arrow-rs/issues/3674) +- FFI Fails to Account For Offsets [\#3671](https://github.com/apache/arrow-rs/issues/3671) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Regression in CSV reader error handling [\#3656](https://github.com/apache/arrow-rs/issues/3656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- UnionArray Child and Value Fail to Account for non-contiguous Type IDs [\#3653](https://github.com/apache/arrow-rs/issues/3653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Panic when accessing RecordBatch from pyarrow [\#3646](https://github.com/apache/arrow-rs/issues/3646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Multiplication for decimals is incorrect [\#3645](https://github.com/apache/arrow-rs/issues/3645) -- Inconsistent output between pretty print and CSV writer for Arrow [\#3513](https://github.com/apache/arrow-rs/issues/3513) +- Inconsistent output between pretty print and CSV writer for Arrow [\#3513](https://github.com/apache/arrow-rs/issues/3513) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Closed issues:** -- Release `32.0.0` of `arrow`/`arrow-flight`/`parquet`/`parquet-derive` \(next release after `31.0.0`\) [\#3584](https://github.com/apache/arrow-rs/issues/3584) +- Release 33.0.0 of arrow/arrow-flight/parquet/parquet-derive \(next release after 32.0.0\) [\#3682](https://github.com/apache/arrow-rs/issues/3682) +- Release `32.0.0` of `arrow`/`arrow-flight`/`parquet`/`parquet-derive` \(next release after `31.0.0`\) [\#3584](https://github.com/apache/arrow-rs/issues/3584) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Merged pull requests:** +- Move FFI to sub-crates [\#3687](https://github.com/apache/arrow-rs/pull/3687) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update to 33.0.0 and update changelog [\#3686](https://github.com/apache/arrow-rs/pull/3686) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) - Cleanup FFI interface \(\#3684\) \(\#3683\) [\#3685](https://github.com/apache/arrow-rs/pull/3685) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - fix: take\_run benchmark parameter [\#3679](https://github.com/apache/arrow-rs/pull/3679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) - Minor: Add some examples to Date\*Array and Time\*Array [\#3678](https://github.com/apache/arrow-rs/pull/3678) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add CSV Decoder::capacity \(\#3674\) [\#3677](https://github.com/apache/arrow-rs/pull/3677) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Add ArrayData::new\_null and DataType::primitive\_width [\#3676](https://github.com/apache/arrow-rs/pull/3676) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Fix FFI which fails to account for offsets [\#3675](https://github.com/apache/arrow-rs/pull/3675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([viirya](https://github.com/viirya)) +- Support UTF8 cast to Timestamp with timezone [\#3673](https://github.com/apache/arrow-rs/pull/3673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) - Fix Date64Array docs [\#3670](https://github.com/apache/arrow-rs/pull/3670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Update proc-macro2 requirement from =1.0.50 to =1.0.51 [\#3669](https://github.com/apache/arrow-rs/pull/3669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) - Add timezone accessor for Timestamp\*Array [\#3666](https://github.com/apache/arrow-rs/pull/3666) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Faster timezone cast [\#3665](https://github.com/apache/arrow-rs/pull/3665) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat + fix: IPC support for run encoded array. [\#3662](https://github.com/apache/arrow-rs/pull/3662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) - Implement std::fmt::Write for StringBuilder \(\#3638\) [\#3659](https://github.com/apache/arrow-rs/pull/3659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Include line and field number in CSV UTF-8 error \(\#3656\) [\#3657](https://github.com/apache/arrow-rs/pull/3657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Handle non-contiguous type\_ids in UnionArray \(\#3653\) [\#3654](https://github.com/apache/arrow-rs/pull/3654) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) From 3cf64df5afc3e5ba7003b1a473740639a507f625 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Sat, 11 Feb 2023 17:47:05 -0500 Subject: [PATCH 0599/1411] Remove sorting to yield sorted_rank (#3693) Co-authored-by: ask --- arrow-ord/src/sort.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index f36e91d648c4..207f499ef275 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -486,8 +486,10 @@ where fn sorted_rank(sorted_value_indices: &UInt32Array) -> Vec { assert_eq!(sorted_value_indices.null_count(), 0); let sorted_indices = sorted_value_indices.values(); - let mut out: Vec<_> = (0..sorted_indices.len() as u32).collect(); - out.sort_unstable_by_key(|x| sorted_indices[*x as usize]); + let mut out: Vec<_> = vec![0_u32; sorted_indices.len()]; + for (ix, val) in sorted_indices.iter().enumerate() { + out[*val as usize] = ix as u32; + } out } From d82298f855a1bc8c99fc635292cbf55675807c46 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 13 Feb 2023 13:05:52 +0000 Subject: [PATCH 0600/1411] Fix pyarrow integration test (#3707) --- arrow-pyarrow-integration-testing/Cargo.toml | 3 --- arrow-pyarrow-integration-testing/pyproject.toml | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 75990dc90279..3ab256e541b3 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ 
b/arrow-pyarrow-integration-testing/Cargo.toml @@ -34,6 +34,3 @@ crate-type = ["cdylib"] [dependencies] arrow = { path = "../arrow", version = "33.0.0", features = ["pyarrow"] } pyo3 = { version = "0.18", features = ["extension-module"] } - -[package.metadata.maturin] -requires-dist = ["pyarrow>=1"] diff --git a/arrow-pyarrow-integration-testing/pyproject.toml b/arrow-pyarrow-integration-testing/pyproject.toml index 27480690e06c..d75f8de1ac4c 100644 --- a/arrow-pyarrow-integration-testing/pyproject.toml +++ b/arrow-pyarrow-integration-testing/pyproject.toml @@ -18,3 +18,5 @@ [build-system] requires = ["maturin"] build-backend = "maturin" + +dependencies = ["pyarrow>=1"] From e37e379f158c644fd3bed63dfc9acc23b49aaf4d Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Mon, 13 Feb 2023 15:40:16 +0100 Subject: [PATCH 0601/1411] object_store: azure cli authorization (#3698) * fix: pass bearer token credential as auth header * feat: add azure cli credential * fix: clippy * Update object_store/src/azure/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * chore: PR feedback * docs: add azure cli link --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/azure/client.rs | 14 ++- object_store/src/azure/credential.rs | 126 ++++++++++++++++++++++++++- object_store/src/azure/mod.rs | 27 +++++- 3 files changed, 164 insertions(+), 3 deletions(-) diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 39da7177fee5..76bb45124a66 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -169,6 +169,18 @@ impl AzureClient { CredentialProvider::AccessKey(key) => { Ok(AzureCredential::AccessKey(key.to_owned())) } + CredentialProvider::BearerToken(token) => { + Ok(AzureCredential::AuthorizationToken( + // we do the conversion to a HeaderValue here, since it is fallible + // and we want to use it in an infallible function + HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { + crate::Error::Generic { + store: "MicrosoftAzure", + source: Box::new(err), + } + })?, + )) + } CredentialProvider::TokenCredential(cache, cred) => { let token = cache .get_or_insert_with(|| { @@ -178,7 +190,7 @@ impl AzureClient { .context(AuthorizationSnafu)?; Ok(AzureCredential::AuthorizationToken( // we do the conversion to a HeaderValue here, since it is fallible - // and we wna to use it in an infallible function + // and we want to use it in an infallible function HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { crate::Error::Generic { store: "MicrosoftAzure", diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index 67023d2f0434..9460c2deff0e 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -21,7 +21,7 @@ use crate::util::hmac_sha256; use crate::RetryConfig; use base64::prelude::BASE64_STANDARD; use base64::Engine; -use chrono::Utc; +use chrono::{DateTime, Utc}; use reqwest::header::ACCEPT; use reqwest::{ header::{ @@ -34,6 +34,7 @@ use reqwest::{ use serde::Deserialize; use snafu::{ResultExt, Snafu}; use std::borrow::Cow; +use std::process::Command; use std::str; use std::time::{Duration, Instant}; use url::Url; @@ -61,6 +62,12 @@ pub enum Error { #[snafu(display("Error reading federated token file "))] FederatedTokenFile, + + #[snafu(display("'az account get-access-token' command failed: {message}"))] + AzureCli { 
message: String }, + + #[snafu(display("Failed to parse azure cli response: {source}"))] + AzureCliResponse { source: serde_json::Error }, } pub type Result = std::result::Result; @@ -69,6 +76,7 @@ pub type Result = std::result::Result; #[derive(Debug)] pub enum CredentialProvider { AccessKey(String), + BearerToken(String), SASToken(Vec<(String, String)>), TokenCredential(TokenCache, Box), } @@ -540,6 +548,122 @@ impl TokenCredential for WorkloadIdentityOAuthProvider { } } +mod az_cli_date_format { + use chrono::{DateTime, TimeZone}; + use serde::{self, Deserialize, Deserializer}; + + pub fn deserialize<'de, D>( + deserializer: D, + ) -> Result, D::Error> + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + // expiresOn from azure cli uses the local timezone + let date = chrono::NaiveDateTime::parse_from_str(&s, "%Y-%m-%d %H:%M:%S.%6f") + .map_err(serde::de::Error::custom)?; + chrono::Local + .from_local_datetime(&date) + .single() + .ok_or(serde::de::Error::custom( + "azure cli returned ambiguous expiry date", + )) + } +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +struct AzureCliTokenResponse { + pub access_token: String, + #[serde(with = "az_cli_date_format")] + pub expires_on: DateTime, + pub token_type: String, +} + +#[derive(Default, Debug)] +pub struct AzureCliCredential { + _private: (), +} + +impl AzureCliCredential { + pub fn new() -> Self { + Self::default() + } +} + +#[async_trait::async_trait] +impl TokenCredential for AzureCliCredential { + /// Fetch a token + async fn fetch_token( + &self, + _client: &Client, + _retry: &RetryConfig, + ) -> Result> { + // on window az is a cmd and it should be called like this + // see https://doc.rust-lang.org/nightly/std/process/struct.Command.html + let program = if cfg!(target_os = "windows") { + "cmd" + } else { + "az" + }; + let mut args = Vec::new(); + if cfg!(target_os = "windows") { + args.push("/C"); + args.push("az"); + } + args.push("account"); + args.push("get-access-token"); + args.push("--output"); + args.push("json"); + args.push("--scope"); + args.push(AZURE_STORAGE_SCOPE); + + match Command::new(program).args(args).output() { + Ok(az_output) if az_output.status.success() => { + let output = + str::from_utf8(&az_output.stdout).map_err(|_| Error::AzureCli { + message: "az response is not a valid utf-8 string".to_string(), + })?; + + let token_response = + serde_json::from_str::(output) + .context(AzureCliResponseSnafu)?; + if !token_response.token_type.eq_ignore_ascii_case("bearer") { + return Err(Error::AzureCli { + message: format!( + "got unexpected token type from azure cli: {0}", + token_response.token_type + ), + }); + } + let duration = token_response.expires_on.naive_local() + - chrono::Local::now().naive_local(); + Ok(TemporaryToken { + token: token_response.access_token, + expiry: Instant::now() + + duration.to_std().map_err(|_| Error::AzureCli { + message: "az returned invalid lifetime".to_string(), + })?, + }) + } + Ok(az_output) => { + let message = String::from_utf8_lossy(&az_output.stderr); + Err(Error::AzureCli { + message: message.into(), + }) + } + Err(e) => match e.kind() { + std::io::ErrorKind::NotFound => Err(Error::AzureCli { + message: "Azure Cli not installed".into(), + }), + error_kind => Err(Error::AzureCli { + message: format!("io error: {error_kind:?}"), + }), + }, + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 52969063495d..e5f1465ad682 100644 --- 
a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -400,6 +400,7 @@ pub struct MicrosoftAzureBuilder { object_id: Option, msi_resource_id: Option, federated_token_file: Option, + use_azure_cli: bool, retry_config: RetryConfig, client_options: ClientOptions, } @@ -533,6 +534,13 @@ pub enum AzureConfigKey { /// - `azure_federated_token_file` /// - `federated_token_file` FederatedTokenFile, + + /// Use azure cli for acquiring access token + /// + /// Supported keys: + /// - `azure_use_azure_cli` + /// - `use_azure_cli` + UseAzureCli, } impl AsRef for AzureConfigKey { @@ -550,6 +558,7 @@ impl AsRef for AzureConfigKey { Self::ObjectId => "azure_object_id", Self::MsiResourceId => "azure_msi_resource_id", Self::FederatedTokenFile => "azure_federated_token_file", + Self::UseAzureCli => "azure_use_azure_cli", } } } @@ -593,6 +602,7 @@ impl FromStr for AzureConfigKey { "azure_federated_token_file" | "federated_token_file" => { Ok(Self::FederatedTokenFile) } + "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), } } @@ -704,6 +714,9 @@ impl MicrosoftAzureBuilder { AzureConfigKey::FederatedTokenFile => { self.federated_token_file = Some(value.into()) } + AzureConfigKey::UseAzureCli => { + self.use_azure_cli = str_is_truthy(&value.into()) + } AzureConfigKey::UseEmulator => { self.use_emulator = str_is_truthy(&value.into()) } @@ -887,6 +900,13 @@ impl MicrosoftAzureBuilder { self } + /// Set if the Azure Cli should be used for acquiring access token + /// + pub fn with_use_azure_cli(mut self, use_azure_cli: bool) -> Self { + self.use_azure_cli = use_azure_cli; + self + } + /// Configure a connection to container with given name on Microsoft Azure /// Blob store. pub fn build(mut self) -> Result { @@ -916,7 +936,7 @@ impl MicrosoftAzureBuilder { let url = Url::parse(&account_url) .context(UnableToParseUrlSnafu { url: account_url })?; let credential = if let Some(bearer_token) = self.bearer_token { - credential::CredentialProvider::AccessKey(bearer_token) + credential::CredentialProvider::BearerToken(bearer_token) } else if let Some(access_key) = self.access_key { credential::CredentialProvider::AccessKey(access_key) } else if let (Some(client_id), Some(tenant_id), Some(federated_token_file)) = @@ -949,6 +969,11 @@ impl MicrosoftAzureBuilder { credential::CredentialProvider::SASToken(query_pairs) } else if let Some(sas) = self.sas_key { credential::CredentialProvider::SASToken(split_sas(&sas)?) 
+ } else if self.use_azure_cli { + credential::CredentialProvider::TokenCredential( + TokenCache::default(), + Box::new(credential::AzureCliCredential::new()), + ) } else { let client = self.client_options.clone().with_allow_http(true).client()?; From d011e6adc6c7ff0ae72784e89cf736112016c9c5 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Mon, 13 Feb 2023 10:50:51 -0500 Subject: [PATCH 0602/1411] perf: `take_run` improvements (#3705) * take_run improvements * doc fix * test case update per pr comment --------- Co-authored-by: ask --- arrow-select/src/take.rs | 124 +++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 58 deletions(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index f8383bbe3d2f..22991c4f2876 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -19,15 +19,14 @@ use std::sync::Arc; +use arrow_array::builder::BufferBuilder; +use arrow_array::types::*; use arrow_array::*; -use arrow_array::{builder::PrimitiveRunBuilder, types::*}; use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; -use arrow_array::cast::{ - as_generic_binary_array, as_largestring_array, as_primitive_array, as_string_array, -}; +use arrow_array::cast::{as_generic_binary_array, as_largestring_array, as_string_array}; use num::{ToPrimitive, Zero}; /// Take elements by index from [Array], creating a new [Array] from those indexes. @@ -816,22 +815,14 @@ where Ok(DictionaryArray::::from(data)) } -macro_rules! primitive_run_take { - ($t:ty, $o:ty, $indices:ident, $value:ident) => { - take_primitive_run_values::<$o, $t>( - $indices, - as_primitive_array::<$t>($value.values()), - ) - }; -} - /// `take` implementation for run arrays /// /// Finds physical indices for the given logical indices and builds output run array -/// by taking values in the input run array at the physical indices. -/// for e.g. an input `RunArray{ run_ends = [2,4,6,8], values=[1,2,1,2] }` and `indices=[2,7]` -/// would be converted to `physical_indices=[1,3]` which will be used to build -/// output `RunArray{ run_ends=[2], values=[2] }` +/// by taking values in the input run_array.values at the physical indices. +/// The output run array will be run encoded on the physical indices and not on output values. +/// For e.g. an input `RunArray{ run_ends = [2,4,6,8], values=[1,2,1,2] }` and `logical_indices=[2,3,6,7]` +/// would be converted to `physical_indices=[1,1,3,3]` which will be used to build +/// output `RunArray{ run_ends=[2,4], values=[2,2] }`. fn take_run( run_array: &RunArray, logical_indices: &PrimitiveArray, @@ -842,43 +833,60 @@ where I: ArrowPrimitiveType, I::Native: ToPrimitive, { - match run_array.data_type() { - DataType::RunEndEncoded(_, fl) => { - let physical_indices = - run_array.get_physical_indices(logical_indices.values())?; - - downcast_primitive! { - fl.data_type() => (primitive_run_take, T, physical_indices, run_array), - dt => Err(ArrowError::NotYetImplemented(format!("take_run is not implemented for {dt:?}"))) - } + // get physical indices for the input logical indices + let physical_indices = run_array.get_physical_indices(logical_indices.values())?; + + // Run encode the physical indices into new_run_ends_builder + // Keep track of the physical indices to take in take_value_indices + // `unwrap` is used in this function because the unwrapped values are bounded by the corresponding `::Native`. 
+ let mut new_run_ends_builder = BufferBuilder::::new(1); + let mut take_value_indices = BufferBuilder::::new(1); + let mut new_physical_len = 1; + for ix in 1..physical_indices.len() { + if physical_indices[ix] != physical_indices[ix - 1] { + take_value_indices + .append(I::Native::from_usize(physical_indices[ix - 1]).unwrap()); + new_run_ends_builder.append(T::Native::from_usize(ix).unwrap()); + new_physical_len += 1; } - dt => Err(ArrowError::InvalidArgumentError(format!( - "Expected DataType::RunEndEncoded found {dt:?}" - ))), } -} + take_value_indices.append( + I::Native::from_usize(physical_indices[physical_indices.len() - 1]).unwrap(), + ); + new_run_ends_builder.append(T::Native::from_usize(physical_indices.len()).unwrap()); + let new_run_ends = unsafe { + // Safety: + // The function builds a valid run_ends array and hence need not be validated. + ArrayDataBuilder::new(T::DATA_TYPE) + .len(new_physical_len) + .null_count(0) + .add_buffer(new_run_ends_builder.finish()) + .build_unchecked() + }; -// Builds a `RunArray` by taking values from given array for the given indices. -fn take_primitive_run_values( - physical_indices: Vec, - values: &PrimitiveArray, -) -> Result, ArrowError> -where - R: RunEndIndexType, - V: ArrowPrimitiveType, -{ - let mut builder = PrimitiveRunBuilder::::new(); - let values_len = values.len(); - for ix in physical_indices { - if ix >= values_len { - return Err(ArrowError::InvalidArgumentError("The requested index {ix} is out of bounds for values array with length {values_len}".to_string())); - } else if values.is_null(ix) { - builder.append_null() - } else { - builder.append_value(values.value(ix)) - } - } - Ok(builder.finish()) + let take_value_indices: PrimitiveArray = unsafe { + // Safety: + // The function builds a valid take_value_indices array and hence need not be validated. + ArrayDataBuilder::new(I::DATA_TYPE) + .len(new_physical_len) + .null_count(0) + .add_buffer(take_value_indices.finish()) + .build_unchecked() + .into() + }; + + let new_values = take(run_array.values(), &take_value_indices, None)?; + + let builder = ArrayDataBuilder::new(run_array.data_type().clone()) + .len(physical_indices.len()) + .add_child_data(new_run_ends) + .add_child_data(new_values.into_data()); + let array_data = unsafe { + // Safety: + // This function builds a valid run array and hence can skip validation. + builder.build_unchecked() + }; + Ok(array_data.into()) } /// Takes/filters a list array's inner data using the offsets of the list array. 
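To make the strategy of `take_run` above concrete, here is a self-contained sketch — plain Rust over slices, not the arrow-select implementation, with `take_run_sketch` as an illustrative name — that reproduces the worked example from the doc comment: logical indices are resolved to physical run indices, and the output is run-length encoded over those physical indices rather than over the output values:

```rust
fn take_run_sketch(
    run_ends: &[u32],
    values: &[i32],
    logical_indices: &[u32],
) -> (Vec<u32>, Vec<i32>) {
    // 1. Resolve each logical index to the physical index of the run containing it:
    //    the first run whose end is strictly greater than the logical index.
    let physical: Vec<usize> = logical_indices
        .iter()
        .map(|&i| run_ends.partition_point(|&end| end <= i))
        .collect();

    // 2. Run-length encode the physical indices; each run contributes one output value.
    let mut out_run_ends = Vec::new();
    let mut out_values = Vec::new();
    for (pos, &p) in physical.iter().enumerate() {
        if pos + 1 == physical.len() || physical[pos + 1] != p {
            out_run_ends.push((pos + 1) as u32);
            out_values.push(values[p]);
        }
    }
    (out_run_ends, out_values)
}

fn main() {
    // The example from the doc comment: run_ends=[2,4,6,8], values=[1,2,1,2],
    // logical_indices=[2,3,6,7] -> physical_indices=[1,1,3,3]
    // -> output run_ends=[2,4], values=[2,2].
    let (run_ends, values) =
        take_run_sketch(&[2, 4, 6, 8], &[1, 2, 1, 2], &[2, 3, 6, 7]);
    assert_eq!(run_ends, vec![2_u32, 4]);
    assert_eq!(values, vec![2_i32, 2]);
}
```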
@@ -983,7 +991,7 @@ where #[cfg(test)] mod tests { use super::*; - use arrow_array::builder::*; + use arrow_array::{builder::*, cast::as_primitive_array}; use arrow_schema::TimeUnit; fn test_take_decimal_arrays( @@ -2159,24 +2167,24 @@ mod tests { #[test] fn test_take_runs() { - let logical_array: Vec = vec![1_i32, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2]; + let logical_array: Vec = vec![1_i32, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2]; let mut builder = PrimitiveRunBuilder::::new(); builder.extend(logical_array.into_iter().map(Some)); let run_array = builder.finish(); let take_indices: PrimitiveArray = - vec![2, 7, 10].into_iter().collect(); + vec![7, 2, 3, 7, 11, 4, 6].into_iter().collect(); let take_out = take_run(&run_array, &take_indices).unwrap(); - assert_eq!(take_out.len(), 3); + assert_eq!(take_out.len(), 7); - assert_eq!(take_out.run_ends().len(), 1); - assert_eq!(take_out.run_ends().value(0), 3); + assert_eq!(take_out.run_ends().len(), 5); + assert_eq!(take_out.run_ends().values(), &[1_i32, 3, 4, 5, 7]); let take_out_values = as_primitive_array::(take_out.values()); - assert_eq!(take_out_values.value(0), 2); + assert_eq!(take_out_values.values(), &[2, 2, 2, 2, 1]); } #[test] From 5ffc0a87dd5abb4f7db1172bac6ed93da95827f7 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Mon, 13 Feb 2023 10:51:31 -0500 Subject: [PATCH 0603/1411] fix: Handle sliced array in run array iterator (#3681) * Handle sliced array in run array iterator * incorporate PR comments --------- Co-authored-by: ask --- arrow-array/src/array/run_array.rs | 5 ++ arrow-array/src/run_iterator.rs | 113 +++++++++++++++++++++-------- 2 files changed, 88 insertions(+), 30 deletions(-) diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 33738d649f76..709933e1b103 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -472,6 +472,11 @@ impl<'a, R: RunEndIndexType, V> TypedRunArray<'a, R, V> { pub fn values(&self) -> &'a V { self.values } + + /// Returns the run array of this [`TypedRunArray`] + pub fn run_array(&self) -> &'a RunArray { + self.run_array + } } impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { diff --git a/arrow-array/src/run_iterator.rs b/arrow-array/src/run_iterator.rs index 8bad85a9f1e1..a79969c3cb91 100644 --- a/arrow-array/src/run_iterator.rs +++ b/arrow-array/src/run_iterator.rs @@ -42,10 +42,10 @@ where <&'a V as ArrayAccessor>::Item: Default, { array: TypedRunArray<'a, R, V>, - current_logical: usize, - current_physical: usize, - current_end_logical: usize, - current_end_physical: usize, + current_front_logical: usize, + current_front_physical: usize, + current_back_logical: usize, + current_back_physical: usize, } impl<'a, R, V> RunArrayIter<'a, R, V> @@ -57,14 +57,19 @@ where { /// create a new iterator pub fn new(array: TypedRunArray<'a, R, V>) -> Self { - let logical_len = array.len(); - let physical_len: usize = array.values().len(); + let current_front_physical: usize = + array.run_array().get_physical_index(0).unwrap(); + let current_back_physical: usize = array + .run_array() + .get_physical_index(array.len() - 1) + .unwrap() + + 1; RunArrayIter { array, - current_logical: 0, - current_physical: 0, - current_end_logical: logical_len, - current_end_physical: physical_len, + current_front_logical: array.offset(), + current_front_physical, + current_back_logical: array.offset() + array.len(), + current_back_physical, } } } @@ -80,35 +85,37 @@ where #[inline] fn next(&mut self) 
-> Option { - if self.current_logical == self.current_end_logical { + if self.current_front_logical == self.current_back_logical { return None; } // If current logical index is greater than current run end index then increment // the physical index. - if self.current_logical + if self.current_front_logical >= self .array .run_ends() - .value(self.current_physical) + .value(self.current_front_physical) .as_usize() { // As the run_ends is expected to be strictly increasing, there // should be at least one logical entry in one physical entry. Because of this // reason the next value can be accessed by incrementing physical index once. - self.current_physical += 1; + self.current_front_physical += 1; } - if self.array.values().is_null(self.current_physical) { - self.current_logical += 1; + if self.array.values().is_null(self.current_front_physical) { + self.current_front_logical += 1; Some(None) } else { - self.current_logical += 1; + self.current_front_logical += 1; // Safety: // The self.current_physical is kept within bounds of self.current_logical. // The self.current_logical will not go out of bounds because of the check // `self.current_logical = self.current_end_logical` above. unsafe { Some(Some( - self.array.values().value_unchecked(self.current_physical), + self.array + .values() + .value_unchecked(self.current_front_physical), )) } } @@ -116,8 +123,8 @@ where fn size_hint(&self) -> (usize, Option) { ( - self.current_end_logical - self.current_logical, - Some(self.current_end_logical - self.current_logical), + self.current_back_logical - self.current_front_logical, + Some(self.current_back_logical - self.current_front_logical), ) } } @@ -130,26 +137,26 @@ where <&'a V as ArrayAccessor>::Item: Default, { fn next_back(&mut self) -> Option { - if self.current_end_logical == self.current_logical { + if self.current_back_logical == self.current_front_logical { return None; } - self.current_end_logical -= 1; + self.current_back_logical -= 1; - if self.current_end_physical > 0 - && self.current_end_logical + if self.current_back_physical > 0 + && self.current_back_logical < self .array .run_ends() - .value(self.current_end_physical - 1) + .value(self.current_back_physical - 1) .as_usize() { // As the run_ends is expected to be strictly increasing, there // should be at least one logical entry in one physical entry. Because of this // reason the next value can be accessed by decrementing physical index once. - self.current_end_physical -= 1; + self.current_back_physical -= 1; } - Some(if self.array.values().is_null(self.current_end_physical) { + Some(if self.array.values().is_null(self.current_back_physical) { None } else { // Safety: @@ -160,7 +167,7 @@ where Some( self.array .values() - .value_unchecked(self.current_end_physical), + .value_unchecked(self.current_back_physical), ) } }) @@ -184,8 +191,8 @@ mod tests { use crate::{ array::{Int32Array, StringArray}, builder::PrimitiveRunBuilder, - types::Int32Type, - Int64RunArray, + types::{Int16Type, Int32Type}, + Array, Int64RunArray, PrimitiveArray, RunArray, }; fn build_input_array(size: usize) -> Vec> { @@ -345,4 +352,50 @@ mod tests { assert_eq!(expected_vec, result_asref); } + + #[test] + fn test_sliced_run_array_iterator() { + let total_len = 80; + let input_array = build_input_array(total_len); + + // Encode the input_array to run array + let mut builder = + PrimitiveRunBuilder::::with_capacity(input_array.len()); + builder.extend(input_array.iter().copied()); + let run_array = builder.finish(); + + // test for all slice lengths. 
+ for slice_len in 1..=total_len { + // test for offset = 0, slice length = slice_len + let sliced_run_array: RunArray = + run_array.slice(0, slice_len).into_data().into(); + let sliced_typed_run_array = sliced_run_array + .downcast::>() + .unwrap(); + + // Iterate on sliced typed run array + let actual: Vec> = sliced_typed_run_array.into_iter().collect(); + let expected: Vec> = + input_array.iter().take(slice_len).copied().collect(); + assert_eq!(expected, actual); + + // test for offset = total_len - slice_len, length = slice_len + let sliced_run_array: RunArray = run_array + .slice(total_len - slice_len, slice_len) + .into_data() + .into(); + let sliced_typed_run_array = sliced_run_array + .downcast::>() + .unwrap(); + + // Iterate on sliced typed run array + let actual: Vec> = sliced_typed_run_array.into_iter().collect(); + let expected: Vec> = input_array + .iter() + .skip(total_len - slice_len) + .copied() + .collect(); + assert_eq!(expected, actual); + } + } } From 38a79ae4e4bff70b3d74f7582f9c4f4dbff62b69 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 13 Feb 2023 22:29:51 +0000 Subject: [PATCH 0604/1411] Filter exact list prefix matches for MemoryStore and HttpStore (#3712) (#3713) * Filter exact list prefix matches for MemoryStore and HttpStore (#3712) * Update object_store/src/lib.rs Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- object_store/src/http/mod.rs | 14 ++++++++++++-- object_store/src/lib.rs | 20 ++++++++++++++++++++ object_store/src/memory.rs | 21 +++++++++++++++++---- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index f05e70024b8c..c91faa2358ac 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -37,6 +37,7 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; +use itertools::Itertools; use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; @@ -163,6 +164,7 @@ impl ObjectStore for HttpStore { &self, prefix: Option<&Path>, ) -> Result>> { + let prefix_len = prefix.map(|p| p.as_ref().len()).unwrap_or_default(); let status = self.client.list(prefix, "infinity").await?; Ok(futures::stream::iter( status @@ -172,7 +174,9 @@ impl ObjectStore for HttpStore { .map(|response| { response.check_ok()?; response.object_meta(self.client.base_url()) - }), + }) + // Filter out exact prefix matches + .filter_ok(move |r| r.location.as_ref().len() > prefix_len), ) .boxed()) } @@ -186,7 +190,13 @@ impl ObjectStore for HttpStore { for response in status.response { response.check_ok()?; match response.is_dir() { - false => objects.push(response.object_meta(self.client.base_url())?), + false => { + let meta = response.object_meta(self.client.base_url())?; + // Filter out exact prefix matches + if meta.location.as_ref().len() > prefix_len { + objects.push(meta); + } + } true => { let path = response.path(self.client.base_url())?; // Exclude the current object diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 8c202886b008..6a3275bb06e6 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -911,9 +911,29 @@ mod tests { let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); assert_eq!(content_list, &[location1.clone()]); + let result = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + assert_eq!(result.objects.len(), 1); + 
assert_eq!(result.objects[0].location, location1); + assert_eq!(result.common_prefixes, &[]); + + // Listing an existing path (file) should return an empty list: + // https://github.com/apache/arrow-rs/issues/3712 + let content_list = flatten_list_stream(storage, Some(&location1)) + .await + .unwrap(); + assert_eq!(content_list, &[]); + + let list = storage.list_with_delimiter(Some(&location1)).await.unwrap(); + assert_eq!(list.objects, &[]); + assert_eq!(list.common_prefixes, &[]); + let prefix = Path::from("foo/x"); let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); assert_eq!(content_list, &[]); + + let list = storage.list_with_delimiter(Some(&prefix)).await.unwrap(); + assert_eq!(list.objects, &[]); + assert_eq!(list.common_prefixes, &[]); } pub(crate) async fn list_with_delimiter(storage: &DynObjectStore) { diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 372164c2b41f..40eee55a13cc 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -163,13 +163,21 @@ impl ObjectStore for InMemory { &self, prefix: Option<&Path>, ) -> Result>> { + let root = Path::default(); + let prefix = prefix.unwrap_or(&root); let last_modified = Utc::now(); let storage = self.storage.read(); let values: Vec<_> = storage - .iter() - .filter(move |(key, _)| prefix.map(|p| key.prefix_matches(p)).unwrap_or(true)) - .map(move |(key, value)| { + .range((prefix)..) + .take_while(|(key, _)| key.as_ref().starts_with(prefix.as_ref())) + .filter(|(key, _)| { + // Don't return for exact prefix match + key.prefix_match(prefix) + .map(|mut x| x.next().is_some()) + .unwrap_or(false) + }) + .map(|(key, value)| { Ok(ObjectMeta { location: key.clone(), last_modified, @@ -195,14 +203,19 @@ impl ObjectStore for InMemory { // response. Otherwise, we just collect the common prefixes. let mut objects = vec![]; for (k, v) in self.storage.read().range((prefix)..) 
{ + if !k.as_ref().starts_with(prefix.as_ref()) { + break; + } + let mut parts = match k.prefix_match(prefix) { Some(parts) => parts, - None => break, + None => continue, }; // Pop first element let common_prefix = match parts.next() { Some(p) => p, + // Should only return children of the prefix None => continue, }; From ef00365eeffa0af5cbbb2e44ac219e2c0c384fa2 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Tue, 14 Feb 2023 13:03:15 +0100 Subject: [PATCH 0605/1411] Filter exact list prefix matches for azure gen2 accounts (#3714) * fix: consistent list responses for gen1 and gen2 accounts * Update object_store/src/azure/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/azure/client.rs | 58 +++++++++++++++++--------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 76bb45124a66..c5a5652ab4d1 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -388,7 +388,7 @@ impl AzureClient { .context(InvalidListResponseSnafu)?; let token = response.next_marker.take(); - Ok((response.try_into()?, token)) + Ok((to_list_result(response, prefix)?, token)) } /// Perform a list operation automatically handling pagination @@ -419,33 +419,37 @@ struct ListResultInternal { pub blobs: Blobs, } -impl TryFrom for ListResult { - type Error = crate::Error; - - fn try_from(value: ListResultInternal) -> Result { - let common_prefixes = value - .blobs - .blob_prefix - .into_iter() - .map(|x| Ok(Path::parse(x.name)?)) - .collect::>()?; - - let objects = value - .blobs - .blobs - .into_iter() - .map(ObjectMeta::try_from) - // Note: workaround for gen2 accounts with hierarchical namespaces. These accounts also - // return path segments as "directories". When we cant directories, its always via - // the BlobPrefix mechanics. - .filter_map_ok(|obj| if obj.size > 0 { Some(obj) } else { None }) - .collect::>()?; - - Ok(Self { - common_prefixes, - objects, +fn to_list_result(value: ListResultInternal, prefix: Option<&str>) -> Result { + let prefix = prefix.map(Path::from).unwrap_or_else(Path::default); + let common_prefixes = value + .blobs + .blob_prefix + .into_iter() + .map(|x| Ok(Path::parse(x.name)?)) + .collect::>()?; + + let objects = value + .blobs + .blobs + .into_iter() + .map(ObjectMeta::try_from) + // Note: workaround for gen2 accounts with hierarchical namespaces. These accounts also + // return path segments as "directories" and include blobs in list requests with prefix, + // if the prefix mateches the blob. When we want directories, its always via + // the BlobPrefix mechanics, and during lists we state that prefixes are evaluated on path segement basis. + .filter_map_ok(|obj| { + if obj.size > 0 && obj.location.as_ref().len() > prefix.as_ref().len() { + Some(obj) + } else { + None + } }) - } + .collect::>()?; + + Ok(ListResult { + common_prefixes, + objects, + }) } /// Collection of blobs and potentially shared prefixes returned from list requests. 
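The listing fixes above (#3712/#3713 for `MemoryStore` and `HttpStore`, and this gen2 follow-up for Azure) converge on one rule: a list under prefix `p` returns only keys strictly inside `p`, so listing the exact path of an existing object yields an empty result. A rough sketch of that predicate over plain `/`-delimited strings — an approximation for illustration; the real code evaluates prefixes on `Path` segments rather than raw strings:

```rust
fn list_filter<'a>(keys: &[&'a str], prefix: &str) -> Vec<&'a str> {
    keys.iter()
        .copied()
        .filter(|key| match key.strip_prefix(prefix) {
            // An exact prefix match (empty remainder) is filtered out; otherwise
            // the remainder must start a new path segment.
            Some(rest) => {
                !rest.is_empty() && (prefix.is_empty() || rest.starts_with('/'))
            }
            None => false,
        })
        .collect()
}

fn main() {
    let keys = ["foo/bar", "foo/bar/baz", "foo/barbell"];
    // Listing under the existing object "foo/bar" returns only true children:
    // the object itself and the sibling "foo/barbell" are excluded.
    assert_eq!(list_filter(&keys, "foo/bar"), vec!["foo/bar/baz"]);
}
```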
From 7083f12f9eb68fd80118a5b280eb989aee568775 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 14 Feb 2023 15:21:04 +0000 Subject: [PATCH 0606/1411] Add raw MapArrayReader (#3703) * Add raw MapArrayReader * Add formatted --- arrow-json/src/raw/list_array.rs | 15 +-- arrow-json/src/raw/map_array.rs | 143 +++++++++++++++++++++++++++++ arrow-json/src/raw/mod.rs | 49 +++++++++- arrow-json/src/raw/struct_array.rs | 13 +-- arrow-json/src/raw/tape.rs | 17 ++++ 5 files changed, 214 insertions(+), 23 deletions(-) create mode 100644 arrow-json/src/raw/map_array.rs diff --git a/arrow-json/src/raw/list_array.rs b/arrow-json/src/raw/list_array.rs index 9d96885f9943..7d37fc51d390 100644 --- a/arrow-json/src/raw/list_array.rs +++ b/arrow-json/src/raw/list_array.rs @@ -79,16 +79,9 @@ impl ArrayDecoder for ListArrayDecoder { child_pos.push(cur_idx); // Advance to next field - cur_idx = match tape.get(cur_idx) { - TapeElement::String(_) - | TapeElement::Number(_) - | TapeElement::True - | TapeElement::False - | TapeElement::Null => cur_idx + 1, - TapeElement::StartList(end_idx) => end_idx + 1, - TapeElement::StartObject(end_idx) => end_idx + 1, - d => return Err(tape_error(d, "list value")), - } + cur_idx = tape + .next(cur_idx) + .map_err(|d| tape_error(d, "list value"))?; } let offset = O::from_usize(child_pos.len()).ok_or_else(|| { @@ -100,7 +93,7 @@ impl ArrayDecoder for ListArrayDecoder { offsets.append(offset) } - let child_data = self.decoder.decode(tape, &child_pos).unwrap(); + let child_data = self.decoder.decode(tape, &child_pos)?; let data = ArrayDataBuilder::new(self.data_type.clone()) .len(pos.len()) diff --git a/arrow-json/src/raw/map_array.rs b/arrow-json/src/raw/map_array.rs new file mode 100644 index 000000000000..670210f66214 --- /dev/null +++ b/arrow-json/src/raw/map_array.rs @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::raw::tape::{Tape, TapeElement}; +use crate::raw::{make_decoder, tape_error, ArrayDecoder}; +use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; +use arrow_buffer::ArrowNativeType; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType}; + +pub struct MapArrayDecoder { + data_type: DataType, + keys: Box, + values: Box, + is_nullable: bool, +} + +impl MapArrayDecoder { + pub fn new(data_type: DataType, is_nullable: bool) -> Result { + let fields = match &data_type { + DataType::Map(_, true) => { + return Err(ArrowError::NotYetImplemented( + "Decoding MapArray with sorted fields".to_string(), + )) + } + DataType::Map(f, _) => match f.data_type() { + DataType::Struct(fields) if fields.len() == 2 => fields, + d => { + return Err(ArrowError::InvalidArgumentError(format!( + "MapArray must contain struct with two fields, got {d}" + ))) + } + }, + _ => unreachable!(), + }; + + let keys = make_decoder(fields[0].data_type().clone(), fields[0].is_nullable())?; + let values = + make_decoder(fields[1].data_type().clone(), fields[1].is_nullable())?; + + Ok(Self { + data_type, + keys, + values, + is_nullable, + }) + } +} + +impl ArrayDecoder for MapArrayDecoder { + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { + let s = match &self.data_type { + DataType::Map(f, _) => match f.data_type() { + s @ DataType::Struct(_) => s, + _ => unreachable!(), + }, + _ => unreachable!(), + }; + + let mut offsets = BufferBuilder::::new(pos.len() + 1); + offsets.append(0); + + let mut key_pos = Vec::with_capacity(pos.len()); + let mut value_pos = Vec::with_capacity(pos.len()); + + let mut null_count = 0; + let mut nulls = self + .is_nullable + .then(|| BooleanBufferBuilder::new(pos.len())); + + for p in pos.iter().copied() { + let end_idx = match (tape.get(p), nulls.as_mut()) { + (TapeElement::StartObject(end_idx), None) => end_idx, + (TapeElement::StartObject(end_idx), Some(nulls)) => { + nulls.append(true); + end_idx + } + (TapeElement::Null, Some(nulls)) => { + nulls.append(false); + null_count += 1; + p + 1 + } + (d, _) => return Err(tape_error(d, "{")), + }; + + let mut cur_idx = p + 1; + while cur_idx < end_idx { + let key = cur_idx; + let value = tape.next(key).map_err(|d| tape_error(d, "map key"))?; + cur_idx = tape.next(value).map_err(|d| tape_error(d, "map value"))?; + + key_pos.push(key); + value_pos.push(value); + } + + let offset = i32::from_usize(key_pos.len()).ok_or_else(|| { + ArrowError::JsonError(format!( + "offset overflow decoding {}", + self.data_type + )) + })?; + offsets.append(offset) + } + + assert_eq!(key_pos.len(), value_pos.len()); + + let key_data = self.keys.decode(tape, &key_pos)?; + let value_data = self.values.decode(tape, &value_pos)?; + + let struct_data = ArrayDataBuilder::new(s.clone()) + .len(key_pos.len()) + .child_data(vec![key_data, value_data]); + + // Safety: + // Valid by construction + let struct_data = unsafe { struct_data.build_unchecked() }; + + let builder = ArrayDataBuilder::new(self.data_type.clone()) + .len(pos.len()) + .buffers(vec![offsets.finish()]) + .null_count(null_count) + .null_bit_buffer(nulls.as_mut().map(|x| x.finish())) + .child_data(vec![struct_data]); + + // Safety: + // Valid by construction + Ok(unsafe { builder.build_unchecked() }) + } +} diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index 267c8bebc83d..a45ff8ea85eb 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -21,6 +21,7 @@ use crate::raw::boolean_array::BooleanArrayDecoder; use 
crate::raw::list_array::ListArrayDecoder; +use crate::raw::map_array::MapArrayDecoder; use crate::raw::primitive_array::PrimitiveArrayDecoder; use crate::raw::string_array::StringArrayDecoder; use crate::raw::struct_array::StructArrayDecoder; @@ -33,6 +34,7 @@ use std::io::BufRead; mod boolean_array; mod list_array; +mod map_array; mod primitive_array; mod string_array; mod struct_array; @@ -283,6 +285,7 @@ fn make_decoder( DataType::Binary | DataType::LargeBinary | DataType::FixedSizeBinary(_) => { Err(ArrowError::JsonError(format!("{data_type} is not supported by JSON"))) } + DataType::Map(_, _) => Ok(Box::new(MapArrayDecoder::new(data_type, is_nullable)?)), d => Err(ArrowError::NotYetImplemented(format!("Support for {d} in JSON reader"))) } } @@ -297,11 +300,12 @@ mod tests { use crate::reader::infer_json_schema; use crate::ReaderBuilder; use arrow_array::cast::{ - as_boolean_array, as_largestring_array, as_list_array, as_primitive_array, - as_string_array, as_struct_array, + as_boolean_array, as_largestring_array, as_list_array, as_map_array, + as_primitive_array, as_string_array, as_struct_array, }; use arrow_array::types::Int32Type; use arrow_array::Array; + use arrow_cast::display::{ArrayFormatter, FormatOptions}; use arrow_schema::{DataType, Field, Schema}; use std::fs::File; use std::io::{BufReader, Cursor, Seek}; @@ -541,6 +545,47 @@ mod tests { assert!(c.is_null(1)); } + #[test] + fn test_map() { + let buf = r#" + {"map": {"a": ["foo", null]}} + {"map": {"a": [null], "b": []}} + {"map": {"c": null, "a": ["baz"]}} + "#; + let list = DataType::List(Box::new(Field::new("element", DataType::Utf8, true))); + let entries = DataType::Struct(vec![ + Field::new("key", DataType::Utf8, false), + Field::new("value", list, true), + ]); + + let map = DataType::Map(Box::new(Field::new("entries", entries, true)), false); + let schema = Arc::new(Schema::new(vec![Field::new("map", map, true)])); + + let batches = do_read(buf, 1024, schema); + assert_eq!(batches.len(), 1); + + let map = as_map_array(batches[0].column(0).as_ref()); + let map_keys = as_string_array(map.keys().as_ref()); + let map_values = as_list_array(map.values().as_ref()); + assert_eq!(map.value_offsets(), &[0, 1, 3, 5]); + + let k: Vec<_> = map_keys.iter().map(|x| x.unwrap()).collect(); + assert_eq!(&k, &["a", "a", "b", "c", "a"]); + + let list_values = as_string_array(map_values.values().as_ref()); + let lv: Vec<_> = list_values.iter().collect(); + assert_eq!(&lv, &[Some("foo"), None, None, Some("baz")]); + assert_eq!(map_values.value_offsets(), &[0, 2, 3, 3, 3, 4]); + assert_eq!(map_values.null_count(), 1); + assert!(map_values.is_null(3)); + + let options = FormatOptions::default().with_null("null"); + let formatter = ArrayFormatter::try_new(map, &options).unwrap(); + assert_eq!(formatter.value(0).to_string(), "{a: [foo, null]}"); + assert_eq!(formatter.value(1).to_string(), "{a: [null], b: []}"); + assert_eq!(formatter.value(2).to_string(), "{c: null, a: [baz]}"); + } + #[test] fn integration_test() { let files = [ diff --git a/arrow-json/src/raw/struct_array.rs b/arrow-json/src/raw/struct_array.rs index 3b7895f37c7f..418d8abcc48f 100644 --- a/arrow-json/src/raw/struct_array.rs +++ b/arrow-json/src/raw/struct_array.rs @@ -84,16 +84,9 @@ impl ArrayDecoder for StructArrayDecoder { } // Advance to next field - cur_idx = match tape.get(cur_idx + 1) { - TapeElement::String(_) - | TapeElement::Number(_) - | TapeElement::True - | TapeElement::False - | TapeElement::Null => cur_idx + 2, - TapeElement::StartList(end_idx) => end_idx + 
1, - TapeElement::StartObject(end_idx) => end_idx + 1, - d => return Err(tape_error(d, "field value")), - } + cur_idx = tape + .next(cur_idx + 1) + .map_err(|d| tape_error(d, "field value"))?; } } diff --git a/arrow-json/src/raw/tape.rs b/arrow-json/src/raw/tape.rs index b0c814c766b8..3f4a317c8700 100644 --- a/arrow-json/src/raw/tape.rs +++ b/arrow-json/src/raw/tape.rs @@ -110,6 +110,23 @@ impl<'a> Tape<'a> { self.elements[idx as usize] } + /// Returns the index of the next field at the same level as `cur_idx` + /// + /// Return an error containing the [`TapeElement`] at `cur_idx` if it + /// is not the start of a field + pub fn next(&self, cur_idx: u32) -> Result { + match self.get(cur_idx) { + TapeElement::String(_) + | TapeElement::Number(_) + | TapeElement::True + | TapeElement::False + | TapeElement::Null => Ok(cur_idx + 1), + TapeElement::StartList(end_idx) => Ok(end_idx + 1), + TapeElement::StartObject(end_idx) => Ok(end_idx + 1), + d => Err(d), + } + } + /// Returns the number of rows pub fn num_rows(&self) -> usize { self.num_rows From 3fcfda9d4260c11d4bfd1f5564b17e5262b710a3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 14 Feb 2023 15:50:11 +0000 Subject: [PATCH 0607/1411] Add pretty format with options (#3717) * Add format with options * Clippy * Fix tests --- arrow/src/util/pretty.rs | 94 ++++++++++++++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 17 deletions(-) diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs index 4defa71a779c..21d035826851 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow/src/util/pretty.rs @@ -19,41 +19,58 @@ //! available unless `feature = "prettyprint"` is enabled. use crate::{array::ArrayRef, record_batch::RecordBatch}; +use arrow_array::Array; use arrow_cast::display::{ArrayFormatter, FormatOptions}; use comfy_table::{Cell, Table}; use std::fmt::Display; use crate::error::Result; -use super::display::array_value_to_string; - -///! Create a visual representation of record batches +/// Create a visual representation of record batches pub fn pretty_format_batches(results: &[RecordBatch]) -> Result { - create_table(results) + let options = FormatOptions::default().with_display_error(true); + pretty_format_batches_with_options(results, &options) } -///! Create a visual representation of columns +/// Create a visual representation of record batches +pub fn pretty_format_batches_with_options( + results: &[RecordBatch], + options: &FormatOptions, +) -> Result { + create_table(results, options) +} + +/// Create a visual representation of columns pub fn pretty_format_columns( col_name: &str, results: &[ArrayRef], ) -> Result { - create_column(col_name, results) + let options = FormatOptions::default().with_display_error(true); + pretty_format_columns_with_options(col_name, results, &options) } -///! Prints a visual representation of record batches to stdout +pub fn pretty_format_columns_with_options( + col_name: &str, + results: &[ArrayRef], + options: &FormatOptions, +) -> Result { + create_column(col_name, results, options) +} + +/// Prints a visual representation of record batches to stdout pub fn print_batches(results: &[RecordBatch]) -> Result<()> { - println!("{}", create_table(results)?); + println!("{}", pretty_format_batches(results)?); Ok(()) } -///! 
Prints a visual representation of a list of column to stdout pub fn print_columns(col_name: &str, results: &[ArrayRef]) -> Result<()> { - println!("{}", create_column(col_name, results)?); + println!("{}", pretty_format_columns(col_name, results)?); Ok(()) } -///! Convert a series of record batches into a table -fn create_table(results: &[RecordBatch]) -> Result<Table> { +/// Convert a series of record batches into a table +fn create_table(results: &[RecordBatch], options: &FormatOptions) -> Result<Table> { let mut table = Table::new(); table.load_preset("||--+-++| ++++++");
@@ -69,13 +86,11 @@ fn create_table(results: &[RecordBatch]) -> Result<Table> { } table.set_header(header); - let options = FormatOptions::default().with_display_error(true); - for batch in results { let formatters = batch .columns() .iter() - .map(|c| ArrayFormatter::try_new(c.as_ref(), &options)) + .map(|c| ArrayFormatter::try_new(c.as_ref(), options)) + .collect::<Result<Vec<_>>>()?; for row in 0..batch.num_rows() {
@@ -90,7 +105,11 @@ fn create_table(results: &[RecordBatch]) -> Result<Table> { Ok(table) } -fn create_column(field: &str, columns: &[ArrayRef]) -> Result<Table> { +fn create_column( + field: &str, + columns: &[ArrayRef], + options: &FormatOptions, +) -> Result<Table> { let mut table = Table::new(); table.load_preset("||--+-++| ++++++");
@@ -102,8 +121,9 @@ fn create_column(field: &str, columns: &[ArrayRef]) -> Result<Table>
{ table.set_header(header); for col in columns { + let formatter = ArrayFormatter::try_new(col.as_ref(), options)?; for row in 0..col.len() { - let cells = vec![Cell::new(array_value_to_string(col, row)?)]; + let cells = vec![Cell::new(formatter.value(row))]; table.add_row(cells); } } @@ -133,6 +153,7 @@ mod tests { use arrow_array::builder::PrimitiveBuilder; use arrow_array::types::{ArrowTimestampType, TimestampSecondType}; + use arrow_cast::display::array_value_to_string; use half::f16; #[test] @@ -1057,4 +1078,43 @@ mod tests { Ok(()) } + + #[test] + fn test_format_options() { + let options = FormatOptions::default().with_null("null"); + let array = Int32Array::from(vec![Some(1), Some(2), None, Some(3), Some(4)]); + let batch = + RecordBatch::try_from_iter([("my_column_name", Arc::new(array) as _)]) + .unwrap(); + + let column = pretty_format_columns_with_options( + "my_column_name", + &[batch.column(0).clone()], + &options, + ) + .unwrap() + .to_string(); + + let batch = pretty_format_batches_with_options(&[batch], &options) + .unwrap() + .to_string(); + + let expected = vec![ + "+----------------+", + "| my_column_name |", + "+----------------+", + "| 1 |", + "| 2 |", + "| null |", + "| 3 |", + "| 4 |", + "+----------------+", + ]; + + let actual: Vec<&str> = column.lines().collect(); + assert_eq!(expected, actual, "Actual result:\n{column}"); + + let actual: Vec<&str> = batch.lines().collect(); + assert_eq!(expected, actual, "Actual result:\n{batch}"); + } } From 86836a8d49059603b833395655c6f6c4a67be7e7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 14 Feb 2023 18:37:04 +0000 Subject: [PATCH 0608/1411] Remove unreachable decimal take (#3716) --- arrow-select/src/take.rs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 22991c4f2876..6436dc0d56e4 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -127,20 +127,6 @@ where let values = values.as_any().downcast_ref::().unwrap(); Ok(Arc::new(take_boolean(values, indices)?)) } - DataType::Decimal128(p, s) => { - let decimal_values = values.as_any().downcast_ref::().unwrap(); - let array = take_primitive(decimal_values, indices)? - .with_precision_and_scale(*p, *s) - .unwrap(); - Ok(Arc::new(array)) - } - DataType::Decimal256(p, s) => { - let decimal_values = values.as_any().downcast_ref::().unwrap(); - let array = take_primitive(decimal_values, indices)? - .with_precision_and_scale(*p, *s) - .unwrap(); - Ok(Arc::new(array)) - } DataType::Utf8 => { Ok(Arc::new(take_bytes(as_string_array(values), indices)?)) } From d74051061a8be0d06b8112901d19d748c7ac0d9f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 14 Feb 2023 22:29:43 +0000 Subject: [PATCH 0609/1411] Add From for ByteArray (#3720) * Add From for ByteArray * Format --- parquet/src/data_type.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 5aff88e53402..40d54c78ed1d 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -17,6 +17,7 @@ //! Data types that connect Parquet physical types with their Rust-specific //! representations. 
+use bytes::Bytes; use std::cmp::Ordering; use std::fmt; use std::mem; @@ -214,6 +215,12 @@ impl From for ByteArray { } } +impl From for ByteArray { + fn from(value: Bytes) -> Self { + ByteBufferPtr::from(value).into() + } +} + impl PartialEq for ByteArray { fn eq(&self, other: &ByteArray) -> bool { match (&self.data, &other.data) { From 22c138156715bf62c8c683fb94e947f7a3200149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E5=B0=8F=E5=88=9A?= <35674070+LiShiZhensPi@users.noreply.github.com> Date: Wed, 15 Feb 2023 06:36:05 +0800 Subject: [PATCH 0610/1411] Feat: arrow csv decimal256 (#3711) * add test for arrow-csv Decimal256 * pass the test There is still room for improvement in the code * add test_write_csv_decimal for csv_writer * support i128 and i256 in one generic function * the test parse_decimal need Neg * Update arrow-array/src/array/primitive_array.rs This will allow simplifying trait bounds in a number of other places Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * return an error instead of panicking on overflow * Decimal256(76, 6) * adding test for Decimal256Type --------- Co-authored-by: suxiaogang Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-array/src/array/primitive_array.rs | 4 +- arrow-csv/src/reader/mod.rs | 141 ++++++++++++++++------- arrow-csv/src/writer.rs | 55 +++++++++ 3 files changed, 155 insertions(+), 45 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index aeece612ded2..b64534e9835f 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -23,8 +23,8 @@ use crate::temporal_conversions::{ }; use crate::timezone::Tz; use crate::trusted_len::trusted_len_unzip; -use crate::types::*; use crate::{print_long_array, Array, ArrayAccessor}; +use crate::{types::*, ArrowNativeTypeOp}; use arrow_buffer::{i256, ArrowNativeType, Buffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::ArrayData; @@ -233,7 +233,7 @@ pub type Decimal256Array = PrimitiveArray; /// static-typed nature of rust types ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. pub trait ArrowPrimitiveType: 'static { /// Corresponding Rust native type for the primitive type. - type Native: ArrowNativeType; + type Native: ArrowNativeTypeOp; /// the corresponding Arrow data type of this primitive type. const DATA_TYPE: DataType; diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 610f05155b52..c5fe20e9d915 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -42,6 +42,13 @@ mod records; +use arrow_array::builder::PrimitiveBuilder; +use arrow_array::types::*; +use arrow_array::ArrowNativeTypeOp; +use arrow_array::*; +use arrow_buffer::ArrowNativeType; +use arrow_cast::parse::Parser; +use arrow_schema::*; use lazy_static::lazy_static; use regex::{Regex, RegexSet}; use std::collections::HashSet; @@ -50,17 +57,9 @@ use std::fs::File; use std::io::{BufRead, BufReader as StdBufReader, Read, Seek, SeekFrom}; use std::sync::Arc; -use arrow_array::builder::Decimal128Builder; -use arrow_array::types::*; -use arrow_array::*; -use arrow_cast::parse::Parser; -use arrow_schema::*; - use crate::map_csv_error; use crate::reader::records::{RecordDecoder, StringRecords}; -use arrow_data::decimal::validate_decimal_precision; use csv::StringRecord; -use std::ops::Neg; lazy_static! 
{ static ref REGEX_SET: RegexSet = RegexSet::new([ @@ -608,7 +607,22 @@ fn parse( match field.data_type() { DataType::Boolean => build_boolean_array(line_number, rows, i), DataType::Decimal128(precision, scale) => { - build_decimal_array(line_number, rows, i, *precision, *scale) + build_decimal_array::( + line_number, + rows, + i, + *precision, + *scale, + ) + } + DataType::Decimal256(precision, scale) => { + build_decimal_array::( + line_number, + rows, + i, + *precision, + *scale, + ) } DataType::Int8 => { build_primitive_array::(line_number, rows, i, None) @@ -781,22 +795,22 @@ fn parse_bool(string: &str) -> Option { } // parse the column string to an Arrow Array -fn build_decimal_array( +fn build_decimal_array( _line_number: usize, rows: &StringRecords<'_>, col_idx: usize, precision: u8, scale: i8, ) -> Result { - let mut decimal_builder = Decimal128Builder::with_capacity(rows.len()); + let mut decimal_builder = PrimitiveBuilder::::with_capacity(rows.len()); for row in rows.iter() { let s = row.get(col_idx); if s.is_empty() { // append null decimal_builder.append_null(); } else { - let decimal_value: Result = - parse_decimal_with_parameter(s, precision, scale); + let decimal_value: Result = + parse_decimal_with_parameter::(s, precision, scale); match decimal_value { Ok(v) => { decimal_builder.append_value(v); @@ -814,17 +828,17 @@ fn build_decimal_array( )) } -// Parse the string format decimal value to i128 format and checking the precision and scale. -// The result i128 value can't be out of bounds. -fn parse_decimal_with_parameter( +// Parse the string format decimal value to i128/i256 format and checking the precision and scale. +// The result value can't be out of bounds. +fn parse_decimal_with_parameter( s: &str, precision: u8, scale: i8, -) -> Result { +) -> Result { if PARSE_DECIMAL_RE.is_match(s) { let mut offset = s.len(); let len = s.len(); - let mut base = 1; + let mut base = T::Native::usize_as(1); let scale_usize = usize::from(scale as u8); // handle the value after the '.' and meet the scale @@ -832,7 +846,7 @@ fn parse_decimal_with_parameter( match delimiter_position { None => { // there is no '.' - base = 10_i128.pow(scale as u32); + base = T::Native::usize_as(10).pow_checked(scale as u32)?; } Some(mid) => { // there is the '.' @@ -841,7 +855,8 @@ fn parse_decimal_with_parameter( offset -= len - mid - 1 - scale_usize; } else { // If the string value is "123.12" and the scale is 4, we should append '00' to the tail. - base = 10_i128.pow((scale_usize + 1 + mid - len) as u32); + base = T::Native::usize_as(10) + .pow_checked((scale_usize + 1 + mid - len) as u32)?; } } }; @@ -849,25 +864,29 @@ fn parse_decimal_with_parameter( // each byte is digit、'-' or '.' let bytes = s.as_bytes(); let mut negative = false; - let mut result: i128 = 0; + let mut result = T::Native::usize_as(0); - bytes[0..offset].iter().rev().for_each(|&byte| match byte { - b'-' => { - negative = true; - } - b'0'..=b'9' => { - result += i128::from(byte - b'0') * base; - base *= 10; + for byte in bytes[0..offset].iter().rev() { + match byte { + b'-' => { + negative = true; + } + b'0'..=b'9' => { + let add = + T::Native::usize_as((byte - b'0') as usize).mul_checked(base)?; + result = result.add_checked(add)?; + base = base.mul_checked(T::Native::usize_as(10))?; + } + // because of the PARSE_DECIMAL_RE, bytes just contains digit、'-' and '.'. + _ => {} } - // because of the PARSE_DECIMAL_RE, bytes just contains digit、'-' and '.'. 
- _ => {} - }); + } if negative { - result = result.neg(); + result = result.neg_checked()?; } - match validate_decimal_precision(result, precision) { + match T::validate_decimal_precision(result, precision) { Ok(_) => Ok(result), Err(e) => Err(ArrowError::ParseError(format!( "parse decimal overflow: {e}" @@ -884,6 +903,8 @@ fn parse_decimal_with_parameter( // Like "125.12" to 12512_i128. #[cfg(test)] fn parse_decimal(s: &str) -> Result { + use std::ops::Neg; + if PARSE_DECIMAL_RE.is_match(s) { let mut offset = s.len(); // each byte is digit、'-' or '.' @@ -1230,6 +1251,7 @@ impl ReaderBuilder { mod tests { use super::*; + use arrow_buffer::i256; use std::io::{Cursor, Write}; use tempfile::NamedTempFile; @@ -1318,7 +1340,7 @@ mod tests { let schema = Schema::new(vec![ Field::new("city", DataType::Utf8, false), Field::new("lat", DataType::Decimal128(38, 6), false), - Field::new("lng", DataType::Decimal128(38, 6), false), + Field::new("lng", DataType::Decimal256(76, 6), false), ]); let file = File::open("test/data/decimal_test.csv").unwrap(); @@ -1343,6 +1365,23 @@ mod tests { assert_eq!("123.000000", lat.value_as_string(7)); assert_eq!("123.000000", lat.value_as_string(8)); assert_eq!("-50.760000", lat.value_as_string(9)); + + let lng = batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("-3.335724", lng.value_as_string(0)); + assert_eq!("-2.179404", lng.value_as_string(1)); + assert_eq!("-1.778197", lng.value_as_string(2)); + assert_eq!("-3.179090", lng.value_as_string(3)); + assert_eq!("-3.179090", lng.value_as_string(4)); + assert_eq!("0.290472", lng.value_as_string(5)); + assert_eq!("0.290472", lng.value_as_string(6)); + assert_eq!("0.290472", lng.value_as_string(7)); + assert_eq!("0.290472", lng.value_as_string(8)); + assert_eq!("0.290472", lng.value_as_string(9)); } #[test] @@ -1788,26 +1827,42 @@ mod tests { ("-123.", -123000i128), ]; for (s, i) in tests { - let result = parse_decimal_with_parameter(s, 20, 3); - assert_eq!(i, result.unwrap()) + let result_128 = parse_decimal_with_parameter::(s, 20, 3); + assert_eq!(i, result_128.unwrap()); + let result_256 = parse_decimal_with_parameter::(s, 20, 3); + assert_eq!(i256::from_i128(i), result_256.unwrap()); } let can_not_parse_tests = ["123,123", ".", "123.123.123"]; for s in can_not_parse_tests { - let result = parse_decimal_with_parameter(s, 20, 3); + let result_128 = parse_decimal_with_parameter::(s, 20, 3); + assert_eq!( + format!("Parser error: can't parse the string value {s} to decimal"), + result_128.unwrap_err().to_string() + ); + let result_256 = parse_decimal_with_parameter::(s, 20, 3); assert_eq!( format!("Parser error: can't parse the string value {s} to decimal"), - result.unwrap_err().to_string() + result_256.unwrap_err().to_string() ); } let overflow_parse_tests = ["12345678", "12345678.9", "99999999.99"]; for s in overflow_parse_tests { - let result = parse_decimal_with_parameter(s, 10, 3); - let expected = "Parser error: parse decimal overflow"; - let actual = result.unwrap_err().to_string(); + let result_128 = parse_decimal_with_parameter::(s, 10, 3); + let expected_128 = "Parser error: parse decimal overflow"; + let actual_128 = result_128.unwrap_err().to_string(); + + assert!( + actual_128.contains(expected_128), + "actual: '{actual_128}', expected: '{expected_128}'" + ); + + let result_256 = parse_decimal_with_parameter::(s, 10, 3); + let expected_256 = "Parser error: parse decimal overflow"; + let actual_256 = result_256.unwrap_err().to_string(); assert!( - actual.contains(expected), - 
"actual: '{actual}', expected: '{expected}'" + actual_256.contains(expected_256), + "actual: '{actual_256}', expected: '{expected_256}'" ); } } diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index e0734a15fd47..d9331053f3d8 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -326,7 +326,9 @@ mod tests { use super::*; use crate::Reader; + use arrow_array::builder::{Decimal128Builder, Decimal256Builder}; use arrow_array::types::*; + use arrow_buffer::i256; use std::io::{Cursor, Read, Seek}; use std::sync::Arc; @@ -406,6 +408,59 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo assert_eq!(expected.to_string(), String::from_utf8(buffer).unwrap()); } + #[test] + fn test_write_csv_decimal() { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Decimal128(38, 6), true), + Field::new("c2", DataType::Decimal256(76, 6), true), + ]); + + let mut c1_builder = + Decimal128Builder::new().with_data_type(DataType::Decimal128(38, 6)); + c1_builder.extend(vec![Some(-3335724), Some(2179404), None, Some(290472)]); + let c1 = c1_builder.finish(); + + let mut c2_builder = + Decimal256Builder::new().with_data_type(DataType::Decimal256(76, 6)); + c2_builder.extend(vec![ + Some(i256::from_i128(-3335724)), + Some(i256::from_i128(2179404)), + None, + Some(i256::from_i128(290472)), + ]); + let c2 = c2_builder.finish(); + + let batch = + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]) + .unwrap(); + + let mut file = tempfile::tempfile().unwrap(); + + let mut writer = Writer::new(&mut file); + let batches = vec![&batch, &batch]; + for batch in batches { + writer.write(batch).unwrap(); + } + drop(writer); + + // check that file was written successfully + file.rewind().unwrap(); + let mut buffer: Vec = vec![]; + file.read_to_end(&mut buffer).unwrap(); + + let expected = r#"c1,c2 +-3.335724,-3.335724 +2.179404,2.179404 +, +0.290472,0.290472 +-3.335724,-3.335724 +2.179404,2.179404 +, +0.290472,0.290472 +"#; + assert_eq!(expected.to_string(), String::from_utf8(buffer).unwrap()); + } + #[test] fn test_write_csv_custom_options() { let schema = Schema::new(vec![ From 55c598d09d6268f97181e17c080d5f6a99545a4e Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Thu, 16 Feb 2023 14:03:27 +0100 Subject: [PATCH 0611/1411] feat: impl `Ord`/`PartialOrd` for `SortOptions` (#3723) --- arrow-schema/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-schema/src/lib.rs b/arrow-schema/src/lib.rs index 6bc2329dbd36..3c2af577c2a6 100644 --- a/arrow-schema/src/lib.rs +++ b/arrow-schema/src/lib.rs @@ -30,7 +30,7 @@ pub use schema::*; pub mod ffi; /// Options that define the sort order of a given column -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)] pub struct SortOptions { /// Whether to sort in descending order pub descending: bool, From 221533f75a0db3e4ae2bdb0b541dc37dbca2b4d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Berkay=20=C5=9Eahin?= <124376117+berkaysynnada@users.noreply.github.com> Date: Thu, 16 Feb 2023 21:54:51 +0300 Subject: [PATCH 0612/1411] Not operator overload for SortOptions (#3727) * Not operator overload for SortOptions * clippy fail fixed * Comments are removed --------- Co-authored-by: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> --- arrow-schema/src/lib.rs | 48 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/arrow-schema/src/lib.rs b/arrow-schema/src/lib.rs index 
3c2af577c2a6..e977203e9c71 100644 --- a/arrow-schema/src/lib.rs +++ b/arrow-schema/src/lib.rs @@ -25,6 +25,7 @@ mod field; pub use field::*; mod schema; pub use schema::*; +use std::ops; #[cfg(feature = "ffi")] pub mod ffi; @@ -47,3 +48,50 @@ impl Default for SortOptions { } } } + +/// `!` operator is overloaded for `SortOptions` to invert boolean +/// fields of the struct. +impl ops::Not for SortOptions { + type Output = SortOptions; + + fn not(self) -> SortOptions { + SortOptions { + descending: !self.descending, + nulls_first: !self.nulls_first, + } + } +} + +#[test] +fn test_overloaded_not_sort_options() { + let sort_options_array = [ + SortOptions { + descending: false, + nulls_first: false, + }, + SortOptions { + descending: false, + nulls_first: true, + }, + SortOptions { + descending: true, + nulls_first: false, + }, + SortOptions { + descending: true, + nulls_first: true, + }, + ]; + + assert!((!sort_options_array[0]).descending); + assert!((!sort_options_array[0]).nulls_first); + + assert!((!sort_options_array[1]).descending); + assert!(!(!sort_options_array[1]).nulls_first); + + assert!(!(!sort_options_array[2]).descending); + assert!((!sort_options_array[2]).nulls_first); + + assert!(!(!sort_options_array[3]).descending); + assert!(!(!sort_options_array[3]).nulls_first); +} From 59016e53e5cfa1d368009ed640d1f3dce326e7bb Mon Sep 17 00:00:00 2001 From: Rafael Guerreiro <115128333+rguerreiromsft@users.noreply.github.com> Date: Thu, 16 Feb 2023 12:43:07 -0800 Subject: [PATCH 0613/1411] Using Borrow to avoid consuming the value if you need to keep the json and get the schema. This avoids unnecessary cloning the entire json. (#3728) --- arrow-json/src/reader.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index 1d4cfc740fdf..54e687a8b47b 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -46,6 +46,7 @@ //! let batch = json.next().unwrap().unwrap(); //! ``` +use std::borrow::Borrow; use std::io::{BufRead, BufReader, Read, Seek}; use std::sync::Arc; @@ -526,16 +527,17 @@ fn collect_field_types_from_object( /// The reason we diverge here is because we don't have utilities to deal with JSON data once it's /// interpreted as Strings. We should match Spark's behavior once we added more JSON parsing /// kernels in the future. -pub fn infer_json_schema_from_iterator(value_iter: I) -> Result +pub fn infer_json_schema_from_iterator(value_iter: I) -> Result where - I: Iterator>, + I: Iterator>, + V: Borrow, { let mut field_types: HashMap = HashMap::new(); for record in value_iter { - match record? 
{ + match record?.borrow() { Value::Object(map) => { - collect_field_types_from_object(&mut field_types, &map)?; + collect_field_types_from_object(&mut field_types, map)?; } value => { return Err(ArrowError::JsonError(format!( From ea48b9571f88bfbced60f9790ae2a7102502870e Mon Sep 17 00:00:00 2001 From: Runji Wang Date: Fri, 17 Feb 2023 14:09:14 +0800 Subject: [PATCH 0614/1411] fix: encoding batch with no columns (#3724) * fix encoding batch with no column Signed-off-by: Runji Wang * Update arrow-flight/src/encode.rs Co-authored-by: Liang-Chi Hsieh * Update arrow-flight/src/encode.rs Co-authored-by: Liang-Chi Hsieh --------- Signed-off-by: Runji Wang Co-authored-by: Liang-Chi Hsieh --- arrow-flight/src/encode.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 2f06ee58f070..2e93acb0931c 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -18,7 +18,7 @@ use std::{collections::VecDeque, fmt::Debug, pin::Pin, sync::Arc, task::Poll}; use crate::{error::Result, FlightData, SchemaAsIpc}; -use arrow_array::{ArrayRef, RecordBatch}; +use arrow_array::{ArrayRef, RecordBatch, RecordBatchOptions}; use arrow_ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions}; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use bytes::Bytes; @@ -422,8 +422,11 @@ fn prepare_batch_for_flight( .iter() .map(hydrate_dictionary) .collect::>>()?; + let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows())); - Ok(RecordBatch::try_new(schema, columns)?) + Ok(RecordBatch::try_new_with_options( + schema, columns, &options, + )?) } /// Hydrates a dictionary to its underlying type @@ -499,6 +502,18 @@ mod tests { ); } + #[test] + fn test_encode_no_column_batch() { + let batch = RecordBatch::try_new_with_options( + Arc::new(Schema::empty()), + vec![], + &RecordBatchOptions::new().with_row_count(Some(10)), + ) + .expect("cannot create record batch"); + + prepare_batch_for_flight(&batch, batch.schema()).expect("failed to optimize"); + } + pub fn make_flight_data( batch: &RecordBatch, options: &IpcWriteOptions, From 9a6c516f6e5c5411489a65af2e53dba041a26025 Mon Sep 17 00:00:00 2001 From: Yang Jiang Date: Fri, 17 Feb 2023 17:34:39 +0800 Subject: [PATCH 0615/1411] [minor] fix doc test fail (#3732) --- arrow-array/src/array/fixed_size_binary_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 9debea08d321..936fb3025cd4 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -42,7 +42,7 @@ use std::any::Any; /// ``` /// use arrow_array::{Array, FixedSizeBinaryArray}; /// let input_arg = vec![ None, Some(vec![7, 8]), Some(vec![9, 10]), None, Some(vec![13, 14]) ]; -/// let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); +/// let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(), 2).unwrap(); /// assert_eq!(5, arr.len()) /// /// ``` From f4d4f76e75c6f3f3127f025d05a526fc5334459a Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 20 Feb 2023 03:43:23 -0800 Subject: [PATCH 0616/1411] feat: implement generic record batch reader (#3733) --- arrow-array/src/lib.rs | 4 +- arrow-array/src/record_batch.rs | 74 +++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/lib.rs 
b/arrow-array/src/lib.rs index d8dc6efe25be..2cee2650eb7e 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -165,7 +165,9 @@ pub mod array; pub use array::*; mod record_batch; -pub use record_batch::{RecordBatch, RecordBatchOptions, RecordBatchReader}; +pub use record_batch::{ + RecordBatch, RecordBatchIterator, RecordBatchOptions, RecordBatchReader, +}; mod arithmetic; pub use arithmetic::ArrowNativeTypeOp; diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 035efb4f0f2c..3b517872aac4 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -24,6 +24,8 @@ use std::ops::Index; use std::sync::Arc; /// Trait for types that can read `RecordBatch`'s. +/// +/// To create from an iterator, see [RecordBatchIterator]. pub trait RecordBatchReader: Iterator> { /// Returns the schema of this `RecordBatchReader`. /// @@ -491,6 +493,78 @@ impl Index<&str> for RecordBatch { } } +/// Generic implementation of [RecordBatchReader] that wraps an iterator. +/// +/// # Example +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, Int32Array, RecordBatch, StringArray, RecordBatchIterator, RecordBatchReader}; +/// # +/// let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2])); +/// let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b"])); +/// +/// let record_batch = RecordBatch::try_from_iter(vec![ +/// ("a", a), +/// ("b", b), +/// ]).unwrap(); +/// +/// let batches: Vec = vec![record_batch.clone(), record_batch.clone()]; +/// +/// let mut reader = RecordBatchIterator::new(batches.into_iter().map(Ok), record_batch.schema()); +/// +/// assert_eq!(reader.schema(), record_batch.schema()); +/// assert_eq!(reader.next().unwrap().unwrap(), record_batch); +/// # assert_eq!(reader.next().unwrap().unwrap(), record_batch); +/// # assert!(reader.next().is_none()); +/// ``` +pub struct RecordBatchIterator +where + I: IntoIterator>, +{ + inner: I::IntoIter, + inner_schema: SchemaRef, +} + +impl RecordBatchIterator +where + I: IntoIterator>, +{ + /// Create a new [RecordBatchIterator]. + /// + /// If `iter` is an infallible iterator, use `.map(Ok)`. 
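// ---- editorial sketch (not part of this patch) ------------------------------
// Wrapping an infallible iterator with `.map(Ok)`, as the doc comment above
// suggests; `batches: Vec<RecordBatch>` and `schema: SchemaRef` are assumed to
// exist, with the same imports as the doc example for the type.
let mut reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
assert_eq!(reader.schema(), schema);
while let Some(batch) = reader.next() {
    let batch = batch.expect("infallible by construction");
    assert_eq!(batch.schema(), schema);
}
// -----------------------------------------------------------------------------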
+ pub fn new(iter: I, schema: SchemaRef) -> Self { + Self { + inner: iter.into_iter(), + inner_schema: schema, + } + } +} + +impl Iterator for RecordBatchIterator +where + I: IntoIterator>, +{ + type Item = I::Item; + + fn next(&mut self) -> Option { + self.inner.next() + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +impl RecordBatchReader for RecordBatchIterator +where + I: IntoIterator>, +{ + fn schema(&self) -> SchemaRef { + self.inner_schema.clone() + } +} + #[cfg(test)] mod tests { use super::*; From 72ad8a728e43d15409f93331da7fc793220252ad Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 20 Feb 2023 19:40:49 +0000 Subject: [PATCH 0617/1411] Make dictionary kernels optional for comparison benchmark (#3738) --- arrow/Cargo.toml | 2 +- arrow/benches/comparison_kernels.rs | 55 ++++++++++++++++------------- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index ef89e5a81232..2032d5048977 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -160,7 +160,7 @@ required-features = ["test_utils"] [[bench]] name = "comparison_kernels" harness = false -required-features = ["test_utils", "dyn_cmp_dict"] +required-features = ["test_utils"] [[bench]] name = "filter_kernels" diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index 7b3b935bcf3a..73db3ffed368 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -26,6 +26,8 @@ use arrow::datatypes::{ArrowNativeTypeOp, ArrowNumericType, IntervalMonthDayNano use arrow::util::bench_util::*; use arrow::{array::*, datatypes::Float32Type, datatypes::Int32Type}; +const SIZE: usize = 65536; + fn bench_eq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) where T: ArrowNumericType, @@ -102,29 +104,37 @@ fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) { .unwrap(); } -fn bench_dict_eq(arr_a: &DictionaryArray, arr_b: &DictionaryArray) -where - T: ArrowNumericType, -{ - cmp_dict_utf8::( - criterion::black_box(arr_a), - criterion::black_box(arr_b), - |a, b| a == b, - ) - .unwrap(); +#[cfg(not(feature = "dyn_cmp_dict"))] +fn dyn_cmp_dict_benchmarks(_c: &mut Criterion) {} + +#[cfg(feature = "dyn_cmp_dict")] +fn dyn_cmp_dict_benchmarks(c: &mut Criterion) { + let strings = create_string_array::(20, 0.); + let dict_arr_a = create_dict_from_values::(SIZE, 0., &strings); + let dict_arr_b = create_dict_from_values::(SIZE, 0., &strings); + + c.bench_function("eq dictionary[10] string[4])", |b| { + b.iter(|| { + cmp_dict_utf8::<_, i32, _>( + criterion::black_box(&dict_arr_a), + criterion::black_box(&dict_arr_b), + |a, b| a == b, + ) + .unwrap() + }) + }); } fn add_benchmark(c: &mut Criterion) { - let size = 65536; - let arr_a = create_primitive_array_with_seed::(size, 0.0, 42); - let arr_b = create_primitive_array_with_seed::(size, 0.0, 43); + let arr_a = create_primitive_array_with_seed::(SIZE, 0.0, 42); + let arr_b = create_primitive_array_with_seed::(SIZE, 0.0, 43); let arr_month_day_nano_a = - create_primitive_array_with_seed::(size, 0.0, 43); + create_primitive_array_with_seed::(SIZE, 0.0, 43); let arr_month_day_nano_b = - create_primitive_array_with_seed::(size, 0.0, 43); + create_primitive_array_with_seed::(SIZE, 0.0, 43); - let arr_string = create_string_array::(size, 0.0); + let arr_string = create_string_array::(SIZE, 0.0); c.bench_function("eq Float32", |b| b.iter(|| bench_eq(&arr_a, &arr_b))); c.bench_function("eq scalar 
Float32", |b| { @@ -168,8 +178,8 @@ fn add_benchmark(c: &mut Criterion) { }) }); - let arr_a = create_primitive_array_with_seed::(size, 0.0, 42); - let arr_b = create_primitive_array_with_seed::(size, 0.0, 43); + let arr_a = create_primitive_array_with_seed::(SIZE, 0.0, 42); + let arr_b = create_primitive_array_with_seed::(SIZE, 0.0, 43); c.bench_function("eq Int32", |b| b.iter(|| bench_eq(&arr_a, &arr_b))); c.bench_function("eq scalar Int32", |b| { @@ -315,12 +325,7 @@ fn add_benchmark(c: &mut Criterion) { }); let strings = create_string_array::(20, 0.); - let dict_arr_a = create_dict_from_values::(size, 0., &strings); - let dict_arr_b = create_dict_from_values::(size, 0., &strings); - - c.bench_function("eq dictionary[10] string[4])", |b| { - b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b)) - }); + let dict_arr_a = create_dict_from_values::(SIZE, 0., &strings); c.bench_function("eq_dyn_utf8_scalar dictionary[10] string[4])", |b| { b.iter(|| eq_dyn_utf8_scalar(&dict_arr_a, "test")) @@ -338,6 +343,8 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function("ilike_utf8_scalar_dyn dictionary[10] string[4])", |b| { b.iter(|| ilike_utf8_scalar_dyn(&dict_arr_a, "test")) }); + + dyn_cmp_dict_benchmarks(c); } criterion_group!(benches, add_benchmark); From 25da74c35861fe0c5922c0e0792eacbe1bd48e25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E5=B0=8F=E5=88=9A?= <35674070+suxiaogang223@users.noreply.github.com> Date: Tue, 21 Feb 2023 16:13:11 +0800 Subject: [PATCH 0618/1411] replace for loop by try_for_each (#3734) * replace for loop by try_for_each * fix * fix --- arrow-csv/src/reader/mod.rs | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index c5fe20e9d915..29bdeb4e2895 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -866,21 +866,25 @@ fn parse_decimal_with_parameter( let mut negative = false; let mut result = T::Native::usize_as(0); - for byte in bytes[0..offset].iter().rev() { - match byte { - b'-' => { - negative = true; - } - b'0'..=b'9' => { - let add = - T::Native::usize_as((byte - b'0') as usize).mul_checked(base)?; - result = result.add_checked(add)?; - base = base.mul_checked(T::Native::usize_as(10))?; + bytes[0..offset] + .iter() + .rev() + .try_for_each::<_, Result<(), ArrowError>>(|&byte| { + match byte { + b'-' => { + negative = true; + } + b'0'..=b'9' => { + let add = T::Native::usize_as((byte - b'0') as usize) + .mul_checked(base)?; + result = result.add_checked(add)?; + base = base.mul_checked(T::Native::usize_as(10))?; + } + // because of the PARSE_DECIMAL_RE, bytes just contains digit、'-' and '.'. + _ => (), } - // because of the PARSE_DECIMAL_RE, bytes just contains digit、'-' and '.'. 
- _ => {} - } - } + Ok(()) + })?; if negative { result = result.neg_checked()?; From e33dbe26989f290010bcac8fe933379014884d68 Mon Sep 17 00:00:00 2001 From: Rafael Guerreiro <115128333+rguerreiromsft@users.noreply.github.com> Date: Tue, 21 Feb 2023 06:00:36 -0800 Subject: [PATCH 0619/1411] Support String Coercion in Raw JSON Reader (#3736) * Add ability to coerce primitive values into string behind an option * Moving coerce_primitive flag into the Decoders because it makes more sense than the Tape --- arrow-json/src/raw/list_array.rs | 12 ++- arrow-json/src/raw/map_array.rs | 19 ++++- arrow-json/src/raw/mod.rs | 128 ++++++++++++++++++++++++++--- arrow-json/src/raw/string_array.rs | 34 +++++++- arrow-json/src/raw/struct_array.rs | 10 ++- 5 files changed, 181 insertions(+), 22 deletions(-) diff --git a/arrow-json/src/raw/list_array.rs b/arrow-json/src/raw/list_array.rs index 7d37fc51d390..91ca4b7275bf 100644 --- a/arrow-json/src/raw/list_array.rs +++ b/arrow-json/src/raw/list_array.rs @@ -31,13 +31,21 @@ pub struct ListArrayDecoder { } impl ListArrayDecoder { - pub fn new(data_type: DataType, is_nullable: bool) -> Result { + pub fn new( + data_type: DataType, + coerce_primitive: bool, + is_nullable: bool, + ) -> Result { let field = match &data_type { DataType::List(f) if !O::IS_LARGE => f, DataType::LargeList(f) if O::IS_LARGE => f, _ => unreachable!(), }; - let decoder = make_decoder(field.data_type().clone(), field.is_nullable())?; + let decoder = make_decoder( + field.data_type().clone(), + coerce_primitive, + field.is_nullable(), + )?; Ok(Self { data_type, diff --git a/arrow-json/src/raw/map_array.rs b/arrow-json/src/raw/map_array.rs index 670210f66214..ac48d8bce1e7 100644 --- a/arrow-json/src/raw/map_array.rs +++ b/arrow-json/src/raw/map_array.rs @@ -30,7 +30,11 @@ pub struct MapArrayDecoder { } impl MapArrayDecoder { - pub fn new(data_type: DataType, is_nullable: bool) -> Result { + pub fn new( + data_type: DataType, + coerce_primitive: bool, + is_nullable: bool, + ) -> Result { let fields = match &data_type { DataType::Map(_, true) => { return Err(ArrowError::NotYetImplemented( @@ -48,9 +52,16 @@ impl MapArrayDecoder { _ => unreachable!(), }; - let keys = make_decoder(fields[0].data_type().clone(), fields[0].is_nullable())?; - let values = - make_decoder(fields[1].data_type().clone(), fields[1].is_nullable())?; + let keys = make_decoder( + fields[0].data_type().clone(), + coerce_primitive, + fields[0].is_nullable(), + )?; + let values = make_decoder( + fields[1].data_type().clone(), + coerce_primitive, + fields[1].is_nullable(), + )?; Ok(Self { data_type, diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index a45ff8ea85eb..e597753a9469 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -43,6 +43,7 @@ mod tape; /// A builder for [`RawReader`] and [`RawDecoder`] pub struct RawReaderBuilder { batch_size: usize, + coerce_primitive: bool, schema: SchemaRef, } @@ -58,6 +59,7 @@ impl RawReaderBuilder { pub fn new(schema: SchemaRef) -> Self { Self { batch_size: 1024, + coerce_primitive: false, schema, } } @@ -67,6 +69,14 @@ impl RawReaderBuilder { Self { batch_size, ..self } } + /// Sets if the decoder should coerce primitive values (bool and number) into string when the Schema's column is Utf8 or LargeUtf8. 
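// ---- editorial sketch (not part of this patch) ------------------------------
// Rough usage of the flag documented above, mirroring the tests added further
// below; assumes the usual arrow_schema/std imports and a Utf8 column "a".
let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)]));
let json = "{\"a\": 1}\n{\"a\": true}\n{\"a\": \"text\"}\n";
let reader = RawReaderBuilder::new(schema)
    .coerce_primitive(true) // numbers and booleans are read back as strings
    .build(Cursor::new(json.as_bytes()))
    .unwrap();
let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
// column "a" now holds the strings "1", "true" and "text"
// -----------------------------------------------------------------------------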
+ pub fn coerce_primitive(self, coerce_primitive: bool) -> Self { + Self { + coerce_primitive, + ..self + } + } + /// Create a [`RawReader`] with the provided [`BufRead`] pub fn build(self, reader: R) -> Result, ArrowError> { Ok(RawReader { @@ -77,7 +87,11 @@ impl RawReaderBuilder { /// Create a [`RawDecoder`] pub fn build_decoder(self) -> Result { - let decoder = make_decoder(DataType::Struct(self.schema.fields.clone()), false)?; + let decoder = make_decoder( + DataType::Struct(self.schema.fields.clone()), + self.coerce_primitive, + false, + )?; let num_fields = self.schema.all_fields().len(); Ok(RawDecoder { @@ -270,6 +284,7 @@ macro_rules! primitive_decoder { fn make_decoder( data_type: DataType, + coerce_primitive: bool, is_nullable: bool, ) -> Result, ArrowError> { downcast_integer! { @@ -277,15 +292,15 @@ fn make_decoder( DataType::Float32 => primitive_decoder!(Float32Type, data_type), DataType::Float64 => primitive_decoder!(Float64Type, data_type), DataType::Boolean => Ok(Box::::default()), - DataType::Utf8 => Ok(Box::>::default()), - DataType::LargeUtf8 => Ok(Box::>::default()), - DataType::List(_) => Ok(Box::new(ListArrayDecoder::::new(data_type, is_nullable)?)), - DataType::LargeList(_) => Ok(Box::new(ListArrayDecoder::::new(data_type, is_nullable)?)), - DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(data_type, is_nullable)?)), + DataType::Utf8 => Ok(Box::new(StringArrayDecoder::::new(coerce_primitive))), + DataType::LargeUtf8 => Ok(Box::new(StringArrayDecoder::::new(coerce_primitive))), + DataType::List(_) => Ok(Box::new(ListArrayDecoder::::new(data_type, coerce_primitive, is_nullable)?)), + DataType::LargeList(_) => Ok(Box::new(ListArrayDecoder::::new(data_type, coerce_primitive, is_nullable)?)), + DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(data_type, coerce_primitive, is_nullable)?)), DataType::Binary | DataType::LargeBinary | DataType::FixedSizeBinary(_) => { Err(ArrowError::JsonError(format!("{data_type} is not supported by JSON"))) } - DataType::Map(_, _) => Ok(Box::new(MapArrayDecoder::new(data_type, is_nullable)?)), + DataType::Map(_, _) => Ok(Box::new(MapArrayDecoder::new(data_type, coerce_primitive, is_nullable)?)), d => Err(ArrowError::NotYetImplemented(format!("Support for {d} in JSON reader"))) } } @@ -311,13 +326,19 @@ mod tests { use std::io::{BufReader, Cursor, Seek}; use std::sync::Arc; - fn do_read(buf: &str, batch_size: usize, schema: SchemaRef) -> Vec { + fn do_read( + buf: &str, + batch_size: usize, + coerce_primitive: bool, + schema: SchemaRef, + ) -> Vec { let mut unbuffered = vec![]; // Test with different batch sizes to test for boundary conditions for batch_size in [1, 3, 100, batch_size] { unbuffered = RawReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) + .coerce_primitive(coerce_primitive) .build(Cursor::new(buf.as_bytes())) .unwrap() .collect::, _>>() @@ -331,6 +352,7 @@ mod tests { for b in [1, 3, 5] { let buffered = RawReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) + .coerce_primitive(coerce_primitive) .build(BufReader::with_capacity(b, Cursor::new(buf.as_bytes()))) .unwrap() .collect::, _>>() @@ -360,7 +382,7 @@ mod tests { Field::new("c", DataType::Boolean, true), ])); - let batches = do_read(buf, 1024, schema); + let batches = do_read(buf, 1024, false, schema); assert_eq!(batches.len(), 1); let col1 = as_primitive_array::(batches[0].column(0)); @@ -397,7 +419,7 @@ mod tests { Field::new("b", DataType::LargeUtf8, true), ])); - let batches = do_read(buf, 1024, schema); + let batches = 
do_read(buf, 1024, false, schema); assert_eq!(batches.len(), 1); let col1 = as_string_array(batches[0].column(0)); @@ -454,7 +476,7 @@ mod tests { ), ])); - let batches = do_read(buf, 1024, schema); + let batches = do_read(buf, 1024, false, schema); assert_eq!(batches.len(), 1); let list = as_list_array(batches[0].column(0).as_ref()); @@ -517,7 +539,7 @@ mod tests { ), ])); - let batches = do_read(buf, 1024, schema); + let batches = do_read(buf, 1024, false, schema); assert_eq!(batches.len(), 1); let nested = as_struct_array(batches[0].column(0).as_ref()); @@ -561,7 +583,7 @@ mod tests { let map = DataType::Map(Box::new(Field::new("entries", entries, true)), false); let schema = Arc::new(Schema::new(vec![Field::new("map", map, true)])); - let batches = do_read(buf, 1024, schema); + let batches = do_read(buf, 1024, false, schema); assert_eq!(batches.len(), 1); let map = as_map_array(batches[0].column(0).as_ref()); @@ -612,4 +634,84 @@ mod tests { assert_eq!(a_result, b_result); } } + + #[test] + fn test_not_coercing_primitive_into_string_without_flag() { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)])); + + let buf = r#"{"a": 1}"#; + let result = RawReaderBuilder::new(schema.clone()) + .with_batch_size(1024) + .build(Cursor::new(buf.as_bytes())) + .unwrap() + .read(); + + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Json error: expected string got number".to_string() + ); + + let buf = r#"{"a": true}"#; + let result = RawReaderBuilder::new(schema) + .with_batch_size(1024) + .build(Cursor::new(buf.as_bytes())) + .unwrap() + .read(); + + assert!(result.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Json error: expected string got true".to_string() + ); + } + + #[test] + fn test_coercing_primitive_into_string() { + let buf = r#" + {"a": 1, "b": 2, "c": true} + {"a": 2E0, "b": 4, "c": false} + + {"b": 6, "a": 2.0} + {"b": "5", "a": 2} + {"b": 4e0} + {"b": 7, "a": null} + "#; + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Utf8, true), + ])); + + let batches = do_read(buf, 1024, true, schema); + assert_eq!(batches.len(), 1); + + let col1 = as_string_array(batches[0].column(0)); + assert_eq!(col1.null_count(), 2); + assert_eq!(col1.value(0), "1"); + assert_eq!(col1.value(1), "2E0"); + assert_eq!(col1.value(2), "2.0"); + assert_eq!(col1.value(3), "2"); + assert!(col1.is_null(4)); + assert!(col1.is_null(5)); + + let col2 = as_string_array(batches[0].column(1)); + assert_eq!(col2.null_count(), 0); + assert_eq!(col2.value(0), "2"); + assert_eq!(col2.value(1), "4"); + assert_eq!(col2.value(2), "6"); + assert_eq!(col2.value(3), "5"); + assert_eq!(col2.value(4), "4e0"); + assert_eq!(col2.value(5), "7"); + + let col3 = as_string_array(batches[0].column(2)); + assert_eq!(col3.null_count(), 4); + assert_eq!(col3.value(0), "true"); + assert_eq!(col3.value(1), "false"); + assert!(col3.is_null(2)); + assert!(col3.is_null(3)); + assert!(col3.is_null(4)); + assert!(col3.is_null(5)); + } } diff --git a/arrow-json/src/raw/string_array.rs b/arrow-json/src/raw/string_array.rs index 31a7a99bec03..104e4e83f101 100644 --- a/arrow-json/src/raw/string_array.rs +++ b/arrow-json/src/raw/string_array.rs @@ -24,13 +24,27 @@ use std::marker::PhantomData; use crate::raw::tape::{Tape, TapeElement}; use crate::raw::{tape_error, ArrayDecoder}; -#[derive(Default)] +const TRUE: &str = "true"; +const FALSE: &str = "false"; + pub struct 
StringArrayDecoder { + coerce_primitive: bool, phantom: PhantomData, } +impl StringArrayDecoder { + pub fn new(coerce_primitive: bool) -> Self { + Self { + coerce_primitive, + phantom: Default::default(), + } + } +} + impl ArrayDecoder for StringArrayDecoder { fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { + let coerce_primitive = self.coerce_primitive; + let mut data_capacity = 0; for p in pos { match tape.get(*p) { @@ -38,6 +52,15 @@ impl ArrayDecoder for StringArrayDecoder { data_capacity += tape.get_string(idx).len(); } TapeElement::Null => {} + TapeElement::True if coerce_primitive => { + data_capacity += TRUE.len(); + } + TapeElement::False if coerce_primitive => { + data_capacity += FALSE.len(); + } + TapeElement::Number(idx) if coerce_primitive => { + data_capacity += tape.get_string(idx).len(); + } d => return Err(tape_error(d, "string")), } } @@ -58,6 +81,15 @@ impl ArrayDecoder for StringArrayDecoder { builder.append_value(tape.get_string(idx)); } TapeElement::Null => builder.append_null(), + TapeElement::True if coerce_primitive => { + builder.append_value(TRUE); + } + TapeElement::False if coerce_primitive => { + builder.append_value(FALSE); + } + TapeElement::Number(idx) if coerce_primitive => { + builder.append_value(tape.get_string(idx)); + } _ => unreachable!(), } } diff --git a/arrow-json/src/raw/struct_array.rs b/arrow-json/src/raw/struct_array.rs index 418d8abcc48f..64ceff22429b 100644 --- a/arrow-json/src/raw/struct_array.rs +++ b/arrow-json/src/raw/struct_array.rs @@ -28,10 +28,16 @@ pub struct StructArrayDecoder { } impl StructArrayDecoder { - pub fn new(data_type: DataType, is_nullable: bool) -> Result { + pub fn new( + data_type: DataType, + coerce_primitive: bool, + is_nullable: bool, + ) -> Result { let decoders = struct_fields(&data_type) .iter() - .map(|f| make_decoder(f.data_type().clone(), f.is_nullable())) + .map(|f| { + make_decoder(f.data_type().clone(), coerce_primitive, f.is_nullable()) + }) .collect::, ArrowError>>()?; Ok(Self { From 18388b209ae68d16302eeab4c2b09f2d5a65aad7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 21 Feb 2023 14:44:06 +0000 Subject: [PATCH 0620/1411] Cleanup arithmetic kernel type constraints (#3739) --- arrow-arith/src/arithmetic.rs | 266 +++++++++------------------------- 1 file changed, 67 insertions(+), 199 deletions(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 0db32d575761..40e7d6780377 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -46,8 +46,6 @@ where LT: ArrowNumericType, RT: ArrowNumericType, F: Fn(LT::Native, RT::Native) -> LT::Native, - LT::Native: ArrowNativeTypeOp, - RT::Native: ArrowNativeTypeOp, { binary(left, right, op) } @@ -64,8 +62,6 @@ where LT: ArrowNumericType, RT: ArrowNumericType, F: Fn(LT::Native, RT::Native) -> Result, - LT::Native: ArrowNativeTypeOp, - RT::Native: ArrowNativeTypeOp, { try_binary(left, right, op) } @@ -88,7 +84,7 @@ where RT: ArrowNumericType, F: Fn(LT::Native, RT::Native) -> Result, { - try_binary(left, right, op) + math_checked_op(left, right, op) } /// Helper function for operations where a valid `0` on the right array should @@ -157,10 +153,7 @@ fn simd_checked_modulus( valid_mask: Option, left: T::Simd, right: T::Simd, -) -> Result -where - T::Native: ArrowNativeTypeOp, -{ +) -> Result { let zero = T::init(T::Native::ZERO); let one = T::init(T::Native::ONE); @@ -194,10 +187,7 @@ fn simd_checked_divide( valid_mask: Option, left: 
T::Simd, right: T::Simd, -) -> Result -where - T::Native: ArrowNativeTypeOp, -{ +) -> Result { let zero = T::init(T::Native::ZERO); let one = T::init(T::Native::ONE); @@ -237,7 +227,6 @@ fn simd_checked_divide_op_remainder( ) -> Result<(), ArrowError> where T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, F: Fn(T::Native, T::Native) -> T::Native, { let result_remainder = result_chunks.into_remainder(); @@ -282,7 +271,6 @@ fn simd_checked_divide_op( ) -> Result, ArrowError> where T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, SI: Fn(Option, T::Simd, T::Simd) -> Result, SC: Fn(T::Native, T::Native) -> T::Native, { @@ -540,7 +528,6 @@ where K: ArrowNumericType, T: ArrowNumericType, F: Fn(T::Native, T::Native) -> T::Native, - T::Native: ArrowNativeTypeOp, { if left.len() != right.len() { return Err(ArrowError::ComputeError(format!( @@ -596,7 +583,6 @@ where K: ArrowNumericType, T: ArrowNumericType, F: Fn(T::Native, T::Native) -> Result, - T::Native: ArrowNativeTypeOp, { // left and right's value types are supposed to be same as guaranteed by the caller macro now. if left.value_type() != T::DATA_TYPE { @@ -709,14 +695,10 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `add_checked` instead. -pub fn add( +pub fn add( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { math_op(left, right, |a, b| a.add_wrapping(b)) } @@ -725,15 +707,11 @@ where /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `add` instead. -pub fn add_checked( +pub fn add_checked( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ - try_binary(left, right, |a, b| a.add_checked(b)) +) -> Result, ArrowError> { + math_checked_op(left, right, |a, b| a.add_checked(b)) } /// Perform `left + right` operation on two arrays. If either left or right value is null @@ -893,14 +871,10 @@ pub fn add_dyn_checked( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `add_scalar_checked` instead. -pub fn add_scalar( +pub fn add_scalar( array: &PrimitiveArray, scalar: T::Native, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { Ok(unary(array, |value| value.add_wrapping(scalar))) } @@ -909,14 +883,10 @@ where /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `add_scalar` instead. -pub fn add_scalar_checked( +pub fn add_scalar_checked( array: &PrimitiveArray, scalar: T::Native, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { try_unary(array, |value| value.add_checked(scalar)) } @@ -928,14 +898,10 @@ where /// For an overflow-checking variant, use `add_scalar_checked_dyn` instead. /// /// This returns an `Err` when the input array is not supported for adding operation. 
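// ---- editorial sketch (not part of this patch) ------------------------------
// The `_dyn` kernels accept `&dyn Array` and dispatch on the type parameter;
// a rough example for the signature below, assuming the usual imports.
let array = Int32Array::from(vec![Some(1), None, Some(3)]);
let result = add_scalar_dyn::<Int32Type>(&array, 10).unwrap();
// `result` is an ArrayRef holding [11, null, 13]
// -----------------------------------------------------------------------------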
-pub fn add_scalar_dyn( +pub fn add_scalar_dyn( array: &dyn Array, scalar: T::Native, -) -> Result -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result { unary_dyn::<_, T>(array, |value| value.add_wrapping(scalar)) } @@ -948,14 +914,10 @@ where /// /// As this kernel has the branching costs and also prevents LLVM from vectorising it correctly, /// it is usually much slower than non-checking variant. -pub fn add_scalar_checked_dyn( +pub fn add_scalar_checked_dyn( array: &dyn Array, scalar: T::Native, -) -> Result -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result { try_unary_dyn::<_, T>(array, |value| value.add_checked(scalar)) .map(|a| Arc::new(a) as ArrayRef) } @@ -965,14 +927,10 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `subtract_checked` instead. -pub fn subtract( +pub fn subtract( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { math_op(left, right, |a, b| a.sub_wrapping(b)) } @@ -981,15 +939,11 @@ where /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `subtract` instead. -pub fn subtract_checked( +pub fn subtract_checked( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ - try_binary(left, right, |a, b| a.sub_checked(b)) +) -> Result, ArrowError> { + math_checked_op(left, right, |a, b| a.sub_checked(b)) } /// Perform `left - right` operation on two arrays. If either left or right value is null @@ -1053,14 +1007,10 @@ pub fn subtract_dyn_checked( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `subtract_scalar_checked` instead. -pub fn subtract_scalar( +pub fn subtract_scalar( array: &PrimitiveArray, scalar: T::Native, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { Ok(unary(array, |value| value.sub_wrapping(scalar))) } @@ -1069,14 +1019,10 @@ where /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `subtract_scalar` instead. -pub fn subtract_scalar_checked( +pub fn subtract_scalar_checked( array: &PrimitiveArray, scalar: T::Native, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { try_unary(array, |value| value.sub_checked(scalar)) } @@ -1086,14 +1032,10 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `subtract_scalar_checked_dyn` instead. -pub fn subtract_scalar_dyn( +pub fn subtract_scalar_dyn( array: &dyn Array, scalar: T::Native, -) -> Result -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result { unary_dyn::<_, T>(array, |value| value.sub_wrapping(scalar)) } @@ -1103,14 +1045,10 @@ where /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `subtract_scalar_dyn` instead. 
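// ---- editorial sketch (not part of this patch) ------------------------------
// Contrast between the wrapping and checked variants described above, using
// the non-dyn kernels for brevity (assumed imports as usual).
let array = Int32Array::from(vec![i32::MIN]);
let wrapped = subtract_scalar(&array, 1).unwrap();
assert_eq!(wrapped.value(0), i32::MAX); // wraps around
assert!(subtract_scalar_checked(&array, 1).is_err()); // overflow reported
// -----------------------------------------------------------------------------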
-pub fn subtract_scalar_checked_dyn( +pub fn subtract_scalar_checked_dyn( array: &dyn Array, scalar: T::Native, -) -> Result -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result { try_unary_dyn::<_, T>(array, |value| value.sub_checked(scalar)) .map(|a| Arc::new(a) as ArrayRef) } @@ -1119,11 +1057,9 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `negate_checked` instead. -pub fn negate(array: &PrimitiveArray) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +pub fn negate( + array: &PrimitiveArray, +) -> Result, ArrowError> { Ok(unary(array, |x| x.neg_wrapping())) } @@ -1131,13 +1067,9 @@ where /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `negate` instead. -pub fn negate_checked( +pub fn negate_checked( array: &PrimitiveArray, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { try_unary(array, |value| value.neg_checked()) } @@ -1158,14 +1090,10 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `multiply_check` instead. -pub fn multiply( +pub fn multiply( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { math_op(left, right, |a, b| a.mul_wrapping(b)) } @@ -1174,15 +1102,11 @@ where /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `multiply` instead. -pub fn multiply_checked( +pub fn multiply_checked( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ - try_binary(left, right, |a, b| a.mul_checked(b)) +) -> Result, ArrowError> { + math_checked_op(left, right, |a, b| a.mul_checked(b)) } /// Perform `left * right` operation on two arrays. If either left or right value is null @@ -1246,14 +1170,10 @@ pub fn multiply_dyn_checked( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `multiply_scalar_checked` instead. -pub fn multiply_scalar( +pub fn multiply_scalar( array: &PrimitiveArray, scalar: T::Native, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { Ok(unary(array, |value| value.mul_wrapping(scalar))) } @@ -1262,14 +1182,10 @@ where /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `multiply_scalar` instead. -pub fn multiply_scalar_checked( +pub fn multiply_scalar_checked( array: &PrimitiveArray, scalar: T::Native, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { try_unary(array, |value| value.mul_checked(scalar)) } @@ -1279,14 +1195,10 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `multiply_scalar_checked_dyn` instead. 
-pub fn multiply_scalar_dyn( +pub fn multiply_scalar_dyn( array: &dyn Array, scalar: T::Native, -) -> Result -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result { unary_dyn::<_, T>(array, |value| value.mul_wrapping(scalar)) } @@ -1296,14 +1208,10 @@ where /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `multiply_scalar_dyn` instead. -pub fn multiply_scalar_checked_dyn( +pub fn multiply_scalar_checked_dyn( array: &dyn Array, scalar: T::Native, -) -> Result -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result { try_unary_dyn::<_, T>(array, |value| value.mul_checked(scalar)) .map(|a| Arc::new(a) as ArrayRef) } @@ -1311,14 +1219,10 @@ where /// Perform `left % right` operation on two arrays. If either left or right value is null /// then the result is also null. If any right hand value is zero then the result of this /// operation will be `Err(ArrowError::DivideByZero)`. -pub fn modulus( +pub fn modulus( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { #[cfg(feature = "simd")] return simd_checked_divide_op(&left, &right, simd_checked_modulus::, |a, b| { a.mod_wrapping(b) @@ -1378,14 +1282,10 @@ pub fn modulus_dyn(left: &dyn Array, right: &dyn Array) -> Result( +pub fn divide_checked( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { #[cfg(feature = "simd")] return simd_checked_divide_op(&left, &right, simd_checked_divide::, |a, b| { a.div_wrapping(b) @@ -1408,14 +1308,10 @@ where /// /// For integer types overflow will wrap around. /// -pub fn divide_opt( +pub fn divide_opt( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { binary_opt(left, right, |a, b| { if b.is_zero() { None @@ -1559,14 +1455,10 @@ pub fn divide_dyn_opt( /// If either left or right value is null then the result is also null. /// /// For an overflow-checking variant, use `divide_checked` instead. -pub fn divide( +pub fn divide( left: &PrimitiveArray, right: &PrimitiveArray, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { // TODO: This is incorrect as div_wrapping has side-effects for integer types // and so may panic on null values (#2647) math_op(left, right, |a, b| a.div_wrapping(b)) @@ -1575,14 +1467,10 @@ where /// Modulus every value in an array by a scalar. If any value in the array is null then the /// result is also null. If the scalar is zero then the result of this operation will be /// `Err(ArrowError::DivideByZero)`. -pub fn modulus_scalar( +pub fn modulus_scalar( array: &PrimitiveArray, modulo: T::Native, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { if modulo.is_zero() { return Err(ArrowError::DivideByZero); } @@ -1593,14 +1481,10 @@ where /// Modulus every value in an array by a scalar. If any value in the array is null then the /// result is also null. If the scalar is zero then the result of this operation will be /// `Err(ArrowError::DivideByZero)`. 
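// ---- editorial sketch (not part of this patch) ------------------------------
// Zero divisors are rejected up front by the scalar kernels, as documented
// above (assumed imports as usual).
let array = Int32Array::from(vec![10, 7, 3]);
assert!(modulus_scalar(&array, 0).is_err()); // ArrowError::DivideByZero
let rem = modulus_scalar(&array, 3).unwrap();
assert_eq!(rem.value(2), 0); // [1, 1, 0]
// -----------------------------------------------------------------------------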
-pub fn modulus_scalar_dyn( +pub fn modulus_scalar_dyn( array: &dyn Array, modulo: T::Native, -) -> Result -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result { if modulo.is_zero() { return Err(ArrowError::DivideByZero); } @@ -1610,14 +1494,10 @@ where /// Divide every value in an array by a scalar. If any value in the array is null then the /// result is also null. If the scalar is zero then the result of this operation will be /// `Err(ArrowError::DivideByZero)`. -pub fn divide_scalar( +pub fn divide_scalar( array: &PrimitiveArray, divisor: T::Native, -) -> Result, ArrowError> -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result, ArrowError> { if divisor.is_zero() { return Err(ArrowError::DivideByZero); } @@ -1631,14 +1511,10 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `divide_scalar_checked_dyn` instead. -pub fn divide_scalar_dyn( +pub fn divide_scalar_dyn( array: &dyn Array, divisor: T::Native, -) -> Result -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result { if divisor.is_zero() { return Err(ArrowError::DivideByZero); } @@ -1652,14 +1528,10 @@ where /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `divide_scalar_dyn` instead. -pub fn divide_scalar_checked_dyn( +pub fn divide_scalar_checked_dyn( array: &dyn Array, divisor: T::Native, -) -> Result -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result { if divisor.is_zero() { return Err(ArrowError::DivideByZero); } @@ -1678,14 +1550,10 @@ where /// Unlike `divide_scalar_dyn` or `divide_scalar_checked_dyn`, division by zero will get a /// null value instead returning an `Err`, this also doesn't check overflowing, overflowing /// will just wrap the result around. -pub fn divide_scalar_opt_dyn( +pub fn divide_scalar_opt_dyn( array: &dyn Array, divisor: T::Native, -) -> Result -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ +) -> Result { if divisor.is_zero() { match array.data_type() { DataType::Dictionary(_, value_type) => { From 61ea9f226dbc90ed989722b5b534cfeaf33e01be Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 21 Feb 2023 13:28:59 -0800 Subject: [PATCH 0621/1411] Add datetime/interval/duration into dyn scalar comparison (#3730) * Add datatime/interval/duration into comparison * Add some tests --- arrow-ord/src/comparison.rs | 219 ++++++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 89fbccead6f9..a4f1fdb88091 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -523,6 +523,91 @@ macro_rules! 
dyn_compare_scalar { let left = as_primitive_array::($LEFT); $OP::(left, right) } + DataType::Date32 => { + let right = try_to_type!($RIGHT, to_i32)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Date64 => { + let right = try_to_type!($RIGHT, to_i64)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + let right = try_to_type!($RIGHT, to_i64)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + let right = try_to_type!($RIGHT, to_i64)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + let right = try_to_type!($RIGHT, to_i64)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Timestamp(TimeUnit::Second, _) => { + let right = try_to_type!($RIGHT, to_i64)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Time32(TimeUnit::Second) => { + let right = try_to_type!($RIGHT, to_i32)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Time32(TimeUnit::Millisecond) => { + let right = try_to_type!($RIGHT, to_i32)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Time64(TimeUnit::Microsecond) => { + let right = try_to_type!($RIGHT, to_i64)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Time64(TimeUnit::Nanosecond) => { + let right = try_to_type!($RIGHT, to_i64)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Interval(IntervalUnit::YearMonth) => { + let right = try_to_type!($RIGHT, to_i32)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Interval(IntervalUnit::DayTime) => { + let right = try_to_type!($RIGHT, to_i64)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let right = try_to_type!($RIGHT, to_i128)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Duration(TimeUnit::Second) => { + let right = try_to_type!($RIGHT, to_i64)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Duration(TimeUnit::Millisecond) => { + let right = try_to_type!($RIGHT, to_i64)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Duration(TimeUnit::Microsecond) => { + let right = try_to_type!($RIGHT, to_i64)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } + DataType::Duration(TimeUnit::Nanosecond) => { + let right = try_to_type!($RIGHT, to_i64)?; + let left = as_primitive_array::($LEFT); + $OP::(left, right) + } _ => Err(ArrowError::ComputeError(format!( "Unsupported data type {:?} for comparison {} with {:?}", $LEFT.data_type(), @@ -1707,6 +1792,22 @@ macro_rules! 
typed_compares { DataType::Interval(IntervalUnit::MonthDayNano), DataType::Interval(IntervalUnit::MonthDayNano), ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), + ( + DataType::Duration(TimeUnit::Second), + DataType::Duration(TimeUnit::Second), + ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), + ( + DataType::Duration(TimeUnit::Millisecond), + DataType::Duration(TimeUnit::Millisecond), + ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), + ( + DataType::Duration(TimeUnit::Microsecond), + DataType::Duration(TimeUnit::Microsecond), + ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), + ( + DataType::Duration(TimeUnit::Nanosecond), + DataType::Duration(TimeUnit::Nanosecond), + ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), (t1, t2) if t1 == t2 => Err(ArrowError::NotYetImplemented(format!( "Comparing arrays of type {} is not yet implemented", t1 @@ -4006,6 +4107,124 @@ mod tests { ); } + fn test_primitive_dyn_scalar(array: PrimitiveArray) { + let a_eq = eq_dyn_scalar(&array, 8).unwrap(); + assert_eq!( + a_eq, + BooleanArray::from(vec![Some(false), None, Some(true), None, Some(false)]) + ); + + let a_eq = gt_eq_dyn_scalar(&array, 8).unwrap(); + assert_eq!( + a_eq, + BooleanArray::from(vec![Some(false), None, Some(true), None, Some(true)]) + ); + + let a_eq = gt_dyn_scalar(&array, 8).unwrap(); + assert_eq!( + a_eq, + BooleanArray::from(vec![Some(false), None, Some(false), None, Some(true)]) + ); + + let a_eq = lt_eq_dyn_scalar(&array, 8).unwrap(); + assert_eq!( + a_eq, + BooleanArray::from(vec![Some(true), None, Some(true), None, Some(false)]) + ); + + let a_eq = lt_dyn_scalar(&array, 8).unwrap(); + assert_eq!( + a_eq, + BooleanArray::from(vec![Some(true), None, Some(false), None, Some(false)]) + ); + } + + #[test] + fn test_timestamp_dyn_scalar() { + let array = + TimestampSecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + + let array = + TimestampMicrosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + + let array = + TimestampMicrosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + + let array = + TimestampNanosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + } + + #[test] + fn test_date32_dyn_scalar() { + let array = Date32Array::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + } + + #[test] + fn test_date64_dyn_scalar() { + let array = Date64Array::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + } + + #[test] + fn test_time32_dyn_scalar() { + let array = Time32SecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + + let array = + Time32MillisecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + } + + #[test] + fn test_time64_dyn_scalar() { + let array = + Time64MicrosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + + let array = + Time64NanosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + } + + #[test] + fn test_interval_dyn_scalar() { + let array = + IntervalDayTimeArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + + let array = + IntervalMonthDayNanoArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + + let array = + 
IntervalYearMonthArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + } + + #[test] + fn test_duration_dyn_scalar() { + let array = + DurationSecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + + let array = + DurationMicrosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + + let array = + DurationMillisecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + + let array = + DurationNanosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + test_primitive_dyn_scalar(array); + } + #[test] fn test_lt_eq_dyn_scalar_with_dict() { let mut builder = From 6bf0aab6bbc0a6a4cc89affd7552af515edc1c38 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 22 Feb 2023 21:48:09 +0000 Subject: [PATCH 0622/1411] Deprecate old JSON reader (#3610) (#3718) --- arrow-json/src/lib.rs | 1 + arrow-json/src/raw/mod.rs | 1 + arrow-json/src/reader.rs | 9 +++++++++ arrow-json/src/writer.rs | 3 +++ arrow/benches/json_reader.rs | 5 +++-- parquet/src/arrow/arrow_writer/levels.rs | 6 ++---- parquet/src/arrow/arrow_writer/mod.rs | 4 +--- 7 files changed, 20 insertions(+), 9 deletions(-) diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs index 7e582c3359a6..5998bc3a4433 100644 --- a/arrow-json/src/lib.rs +++ b/arrow-json/src/lib.rs @@ -28,6 +28,7 @@ pub mod writer; mod raw; pub use self::raw::{RawDecoder, RawReader, RawReaderBuilder}; +#[allow(deprecated)] pub use self::reader::{Reader, ReaderBuilder}; pub use self::writer::{ArrayWriter, LineDelimitedWriter, Writer}; use half::f16; diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index e597753a9469..595a54c10a9e 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -310,6 +310,7 @@ fn tape_error(d: TapeElement, expected: &str) -> ArrowError { } #[cfg(test)] +#[allow(deprecated)] mod tests { use super::*; use crate::reader::infer_json_schema; diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index 54e687a8b47b..7df63bf8d662 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -584,6 +584,7 @@ where /// [`RawDecoder`]: crate::raw::RawDecoder /// [#3610]: https://github.com/apache/arrow-rs/issues/3610 #[derive(Debug)] +#[deprecated(note = "Use RawDecoder instead")] pub struct Decoder { /// Explicit schema for the JSON file schema: SchemaRef, @@ -640,6 +641,7 @@ impl DecoderOptions { } } +#[allow(deprecated)] impl Decoder { /// Create a new JSON decoder from some value that implements an /// iterator over [`serde_json::Value`]s (aka implements the @@ -1606,12 +1608,15 @@ fn flatten_json_string_values(values: &[Value]) -> Vec> { /// [`RawReader`]: crate::raw::RawReader /// [#3610]: https://github.com/apache/arrow-rs/issues/3610 #[derive(Debug)] +#[deprecated(note = "Use RawReader instead")] +#[allow(deprecated)] pub struct Reader { reader: BufReader, /// JSON value decoder decoder: Decoder, } +#[allow(deprecated)] impl Reader { /// Create a new JSON Reader from any value that implements the `Read` trait. 
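// ---- editorial sketch (not part of this patch) ------------------------------
// This commit deprecates the old reader in favour of the raw reader; a rough
// like-for-like replacement, assuming `schema: SchemaRef` and `file: File`
// exist and that an explicit schema is available (the raw builder does not
// infer one):
let reader = RawReaderBuilder::new(schema).build(BufReader::new(file)).unwrap();
for batch in reader {
    let batch = batch.unwrap();
    // process the RecordBatch exactly as with the deprecated Reader
}
// -----------------------------------------------------------------------------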
/// @@ -1658,6 +1663,7 @@ impl Reader { /// [#3610]: https://github.com/apache/arrow-rs/issues/3610 /// #[derive(Debug, Default)] +#[deprecated(note = "Use RawReaderBuilder instead")] pub struct ReaderBuilder { /// Optional schema for the JSON file /// @@ -1672,6 +1678,7 @@ pub struct ReaderBuilder { options: DecoderOptions, } +#[allow(deprecated)] impl ReaderBuilder { /// Create a new builder for configuring JSON parsing options. /// @@ -1752,6 +1759,7 @@ impl ReaderBuilder { } } +#[allow(deprecated)] impl Iterator for Reader { type Item = Result; @@ -1761,6 +1769,7 @@ impl Iterator for Reader { } #[cfg(test)] +#[allow(deprecated)] mod tests { use super::*; use arrow_array::cast::{ diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 028b7d889157..27ae3876441d 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -1218,6 +1218,7 @@ mod tests { ); } + #[allow(deprecated)] fn test_write_for_file(test_file: &str) { let builder = ReaderBuilder::new() .infer_schema(None) @@ -1295,6 +1296,7 @@ mod tests { } #[test] + #[allow(deprecated)] fn json_list_roundtrip() { let json_content = r#" {"list": [{"ints": 1}]} @@ -1406,6 +1408,7 @@ mod tests { } #[test] + #[allow(deprecated)] fn test_write_single_batch() { let test_file = "test/data/basic.json"; let builder = ReaderBuilder::new() diff --git a/arrow/benches/json_reader.rs b/arrow/benches/json_reader.rs index b5d8a53679ef..5651813a6403 100644 --- a/arrow/benches/json_reader.rs +++ b/arrow/benches/json_reader.rs @@ -22,16 +22,17 @@ use arrow::util::bench_util::{ create_primitive_array, create_string_array, create_string_array_with_len, }; use arrow_array::RecordBatch; +use arrow_json::LineDelimitedWriter; use arrow_json::RawReaderBuilder; -use arrow_json::{LineDelimitedWriter, ReaderBuilder}; use std::io::Cursor; use std::sync::Arc; +#[allow(deprecated)] fn do_bench(c: &mut Criterion, name: &str, json: &str, schema: SchemaRef) { c.bench_function(&format!("{name} (basic)"), |b| { b.iter(|| { let cursor = Cursor::new(black_box(json)); - let builder = ReaderBuilder::new() + let builder = arrow_json::ReaderBuilder::new() .with_schema(schema.clone()) .with_batch_size(64); diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index f427ce3e19e4..11ed35263e6a 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -1121,7 +1121,7 @@ mod tests { // Note: we are using the JSON Arrow reader for brevity let json_content = r#" {"stocks":{"long": "$AAA", "short": "$BBB"}} - {"stocks":{"long": null, "long": "$CCC", "short": null}} + {"stocks":{"long": "$CCC", "short": null}} {"stocks":{"hedged": "$YYY", "long": null, "short": "$D"}} "#; let entries_struct_type = DataType::Struct(vec![ @@ -1138,9 +1138,7 @@ mod tests { false, ); let schema = Arc::new(Schema::new(vec![stocks_field])); - let builder = arrow::json::ReaderBuilder::new() - .with_schema(schema) - .with_batch_size(64); + let builder = arrow::json::RawReaderBuilder::new(schema).with_batch_size(64); let mut reader = builder.build(std::io::Cursor::new(json_content)).unwrap(); let batch = reader.next().unwrap().unwrap(); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 6260c2ed4d0d..e6693a6cff4a 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1026,9 +1026,7 @@ mod tests { true, ); let schema = Arc::new(Schema::new(vec![stocks_field])); - let builder = arrow::json::ReaderBuilder::new() 
- .with_schema(schema) - .with_batch_size(64); + let builder = arrow::json::RawReaderBuilder::new(schema).with_batch_size(64); let mut reader = builder.build(std::io::Cursor::new(json_content)).unwrap(); let batch = reader.next().unwrap().unwrap(); From e753dea6634524e821eba60f6c7d91293a317120 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 23 Feb 2023 09:23:57 +0000 Subject: [PATCH 0623/1411] Infer 2020-03-19 00:00:00 as timestamp not Date64 in CSV (#3744) (#3746) * Infer 2020-03-19 00:00:00 as timestamp not Date64 in CSV (#3744) * Update inference logic --- arrow-csv/src/reader/mod.rs | 208 +++++++++++++++++++++++++----------- 1 file changed, 143 insertions(+), 65 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 29bdeb4e2895..e78f2d0ba718 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -51,7 +51,6 @@ use arrow_cast::parse::Parser; use arrow_schema::*; use lazy_static::lazy_static; use regex::{Regex, RegexSet}; -use std::collections::HashSet; use std::fmt; use std::fs::File; use std::io::{BufRead, BufReader as StdBufReader, Read, Seek, SeekFrom}; @@ -62,44 +61,68 @@ use crate::reader::records::{RecordDecoder, StringRecords}; use csv::StringRecord; lazy_static! { + /// Order should match [`InferredDataType`] static ref REGEX_SET: RegexSet = RegexSet::new([ r"(?i)^(true)$|^(false)$(?-i)", //BOOLEAN - r"^-?((\d*\.\d+|\d+\.\d*)([eE]-?\d+)?|\d+([eE]-?\d+))$", //DECIMAL r"^-?(\d+)$", //INTEGER + r"^-?((\d*\.\d+|\d+\.\d*)([eE]-?\d+)?|\d+([eE]-?\d+))$", //DECIMAL r"^\d{4}-\d\d-\d\d$", //DATE32 - r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d$", //DATE64 + r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d$", //Timestamp(Second) + r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d.\d{1,3}$", //Timestamp(Millisecond) + r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d.\d{1,6}$", //Timestamp(Microsecond) + r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d.\d{1,9}$", //Timestamp(Nanosecond) ]).unwrap(); - //The order should match with REGEX_SET - static ref MATCH_DATA_TYPE: Vec = vec![ - DataType::Boolean, - DataType::Float64, - DataType::Int64, - DataType::Date32, - DataType::Date64, - ]; static ref PARSE_DECIMAL_RE: Regex = Regex::new(r"^-?(\d+\.?\d*|\d*\.?\d+)$").unwrap(); - static ref DATETIME_RE: Regex = - Regex::new(r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,9}$").unwrap(); } -/// Infer the data type of a record -fn infer_field_schema(string: &str, datetime_re: Option) -> DataType { - // when quoting is enabled in the reader, these quotes aren't escaped, we default to - // Utf8 for them - if string.starts_with('"') { - return DataType::Utf8; - } - let matches = REGEX_SET.matches(string).into_iter().next(); - // match regex in a particular order - match matches { - Some(ix) => MATCH_DATA_TYPE[ix].clone(), - None => { - let datetime_re = datetime_re.unwrap_or_else(|| DATETIME_RE.clone()); - if datetime_re.is_match(string) { - DataType::Timestamp(TimeUnit::Nanosecond, None) - } else { - DataType::Utf8 +#[derive(Default, Copy, Clone)] +struct InferredDataType { + /// Packed booleans indicating type + /// + /// 0 - Boolean + /// 1 - Integer + /// 2 - Float64 + /// 3 - Date32 + /// 4 - Timestamp(Second) + /// 5 - Timestamp(Millisecond) + /// 6 - Timestamp(Microsecond) + /// 7 - Timestamp(Nanosecond) + /// 8 - Utf8 + packed: u16, +} + +impl InferredDataType { + /// Returns the inferred data type + fn get(&self) -> DataType { + match self.packed { + 1 => DataType::Boolean, + 2 => DataType::Int64, + 4 | 6 => DataType::Float64, // Promote 
Int64 to Float64 + b if b != 0 && (b & !0b11111000) == 0 => match b.leading_zeros() { + // Promote to highest precision temporal type + 8 => DataType::Timestamp(TimeUnit::Nanosecond, None), + 9 => DataType::Timestamp(TimeUnit::Microsecond, None), + 10 => DataType::Timestamp(TimeUnit::Millisecond, None), + 11 => DataType::Timestamp(TimeUnit::Second, None), + 12 => DataType::Date32, + _ => unreachable!(), + }, + _ => DataType::Utf8, + } + } + + /// Updates the [`InferredDataType`] with the given string + fn update(&mut self, string: &str, datetime_re: Option<&Regex>) { + self.packed |= if string.starts_with('"') { + 1 << 8 // Utf8 + } else if let Some(m) = REGEX_SET.matches(string).into_iter().next() { + 1 << m + } else { + match datetime_re { + // Timestamp(Nanosecond) + Some(d) if d.is_match(string) => 1 << 7, + _ => 1 << 8, // Utf8 } } } @@ -230,10 +253,9 @@ fn infer_reader_schema_with_csv_options( let header_length = headers.len(); // keep track of inferred field types - let mut column_types: Vec> = vec![HashSet::new(); header_length]; + let mut column_types: Vec = vec![Default::default(); header_length]; let mut records_count = 0; - let mut fields = vec![]; let mut record = StringRecord::new(); let max_records = roptions.max_read_records.unwrap_or(usize::MAX); @@ -248,40 +270,18 @@ fn infer_reader_schema_with_csv_options( for (i, column_type) in column_types.iter_mut().enumerate().take(header_length) { if let Some(string) = record.get(i) { if !string.is_empty() { - column_type - .insert(infer_field_schema(string, roptions.datetime_re.clone())); + column_type.update(string, roptions.datetime_re.as_ref()) } } } } // build schema from inference results - for i in 0..header_length { - let possibilities = &column_types[i]; - let field_name = &headers[i]; - - // determine data type based on possible types - // if there are incompatible types, use DataType::Utf8 - match possibilities.len() { - 1 => { - for dtype in possibilities.iter() { - fields.push(Field::new(field_name, dtype.clone(), true)); - } - } - 2 => { - if possibilities.contains(&DataType::Int64) - && possibilities.contains(&DataType::Float64) - { - // we have an integer and double, fall down to double - fields.push(Field::new(field_name, DataType::Float64, true)); - } else { - // default to Utf8 for conflicting datatypes (e.g bool and int) - fields.push(Field::new(field_name, DataType::Utf8, true)); - } - } - _ => fields.push(Field::new(field_name, DataType::Utf8, true)), - } - } + let fields = column_types + .iter() + .zip(&headers) + .map(|(inferred, field_name)| Field::new(field_name, inferred.get(), true)) + .collect(); Ok((Schema::new(fields), records_count)) } @@ -681,6 +681,19 @@ fn parse( >( line_number, rows, i, None ), + DataType::Timestamp(TimeUnit::Second, _) => build_primitive_array::< + TimestampSecondType, + >( + line_number, rows, i, None + ), + DataType::Timestamp(TimeUnit::Millisecond, _) => { + build_primitive_array::( + line_number, + rows, + i, + None, + ) + } DataType::Timestamp(TimeUnit::Microsecond, _) => { build_primitive_array::( line_number, @@ -1637,7 +1650,10 @@ mod tests { assert_eq!(&DataType::Float64, schema.field(2).data_type()); assert_eq!(&DataType::Boolean, schema.field(3).data_type()); assert_eq!(&DataType::Date32, schema.field(4).data_type()); - assert_eq!(&DataType::Date64, schema.field(5).data_type()); + assert_eq!( + &DataType::Timestamp(TimeUnit::Second, None), + schema.field(5).data_type() + ); let names: Vec<&str> = schema.fields().iter().map(|x| x.name().as_str()).collect(); @@ -1698,6 
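The `InferredDataType` bitmask above replaces the per-column `HashSet<DataType>`: each observed value ORs a single bit into `packed`, and `get()` resolves the merged mask, promoting compatible temporal types to the highest precision seen via `leading_zeros`. A self-contained sketch of that merge rule using the same bit layout; the classification helper is a stand-in for the `REGEX_SET` lookup:

```
fn main() {
    // Bit layout mirrors `InferredDataType::packed`:
    // 0 Boolean, 1 Integer, 2 Float64, 3 Date32,
    // 4..=7 Timestamp(Second..Nanosecond), 8 Utf8
    fn bit_for(value: &str) -> u16 {
        match value {
            "2020-03-19" => 1 << 3,          // Date32
            "2020-03-19 00:00:00" => 1 << 4, // Timestamp(Second)
            _ => 1 << 5,                     // Timestamp(Millisecond)
        }
    }

    // Merge a column by OR-ing one bit per non-empty value
    let column = ["2020-03-19", "2020-03-19 00:00:00", "2020-03-19 00:00:00.000"];
    let packed = column.iter().fold(0u16, |acc, v| acc | bit_for(v));
    assert_eq!(packed, 0b0011_1000);

    // Only temporal bits are set, so the highest-precision one wins;
    // leading_zeros == 10 maps to Timestamp(Millisecond) in `get()`
    assert_eq!(packed.leading_zeros(), 10);
}
```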
+1714,13 @@ mod tests { } } + /// Infer the data type of a record + fn infer_field_schema(string: &str, datetime_re: Option) -> DataType { + let mut v = InferredDataType::default(); + v.update(string, datetime_re.as_ref()); + v.get() + } + #[test] fn test_infer_field_schema() { assert_eq!(infer_field_schema("A", None), DataType::Utf8); @@ -1712,22 +1735,22 @@ mod tests { assert_eq!(infer_field_schema("2020-11-08", None), DataType::Date32); assert_eq!( infer_field_schema("2020-11-08T14:20:01", None), - DataType::Date64 + DataType::Timestamp(TimeUnit::Second, None) ); assert_eq!( infer_field_schema("2020-11-08 14:20:01", None), - DataType::Date64 + DataType::Timestamp(TimeUnit::Second, None) ); let reg = Regex::new(r"^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d$").ok(); assert_eq!( infer_field_schema("2020-11-08 14:20:01", reg), - DataType::Date64 + DataType::Timestamp(TimeUnit::Second, None) ); assert_eq!(infer_field_schema("-5.13", None), DataType::Float64); assert_eq!(infer_field_schema("0.1300", None), DataType::Float64); assert_eq!( infer_field_schema("2021-12-19 13:12:30.921", None), - DataType::Timestamp(TimeUnit::Nanosecond, None) + DataType::Timestamp(TimeUnit::Millisecond, None) ); assert_eq!( infer_field_schema("2021-12-19T13:12:30.123456789", None), @@ -2407,4 +2430,59 @@ mod tests { assert_eq!(&read.fill_sizes, &[23, 3, 0, 0]); assert_eq!(read.fill_count, 4); } + + #[test] + fn test_inference() { + let cases: &[(&[&str], DataType)] = &[ + (&[], DataType::Utf8), + (&["false", "12"], DataType::Utf8), + (&["12", "cupcakes"], DataType::Utf8), + (&["12", "12.4"], DataType::Float64), + (&["14050", "24332"], DataType::Int64), + (&["14050.0", "true"], DataType::Utf8), + (&["14050", "2020-03-19 00:00:00"], DataType::Utf8), + (&["14050", "2340.0", "2020-03-19 00:00:00"], DataType::Utf8), + ( + &["2020-03-19 02:00:00", "2020-03-19 00:00:00"], + DataType::Timestamp(TimeUnit::Second, None), + ), + (&["2020-03-19", "2020-03-20"], DataType::Date32), + ( + &["2020-03-19", "2020-03-19 02:00:00", "2020-03-19 00:00:00"], + DataType::Timestamp(TimeUnit::Second, None), + ), + ( + &[ + "2020-03-19", + "2020-03-19 02:00:00", + "2020-03-19 00:00:00.000", + ], + DataType::Timestamp(TimeUnit::Millisecond, None), + ), + ( + &[ + "2020-03-19", + "2020-03-19 02:00:00", + "2020-03-19 00:00:00.000000", + ], + DataType::Timestamp(TimeUnit::Microsecond, None), + ), + ( + &[ + "2020-03-19", + "2020-03-19 02:00:00.000000000", + "2020-03-19 00:00:00.000000", + ], + DataType::Timestamp(TimeUnit::Nanosecond, None), + ), + ]; + + for (values, expected) in cases { + let mut t = InferredDataType::default(); + for v in *values { + t.update(v, None) + } + assert_eq!(&t.get(), expected, "{:?}", values) + } + } } From ebe6f539844ba781553c87bdaa2dd25190047c49 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Thu, 23 Feb 2023 04:52:12 -0500 Subject: [PATCH 0624/1411] feat: Sort kernel for `RunArray` (#3695) * Handle sliced array in run array iterator * sort_to_indices for RunArray * better loop * sort for run array * improve docs * some minor tweaks * doc fix * format fix * fix sort run to return all logical indices * pr comment * rename test function, pull sort run logic into a separate function --------- Co-authored-by: ask --- arrow-array/src/array/run_array.rs | 28 ++- arrow-array/src/run_iterator.rs | 9 +- arrow-ord/src/sort.rs | 331 +++++++++++++++++++++++++++++ arrow/benches/sort_kernel.rs | 21 +- 4 files changed, 374 insertions(+), 15 deletions(-) diff --git a/arrow-array/src/array/run_array.rs 
b/arrow-array/src/array/run_array.rs index 709933e1b103..9dba3ddab6ae 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -112,17 +112,37 @@ impl RunArray { /// Returns a reference to run_ends array /// - /// Note: any slicing of this array is not applied to the returned array + /// Note: any slicing of this [`RunArray`] array is not applied to the returned array /// and must be handled separately pub fn run_ends(&self) -> &PrimitiveArray { &self.run_ends } /// Returns a reference to values array + /// + /// Note: any slicing of this [`RunArray`] array is not applied to the returned array + /// and must be handled separately pub fn values(&self) -> &ArrayRef { &self.values } + /// Returns the physical index at which the array slice starts. + pub fn get_start_physical_index(&self) -> usize { + if self.offset() == 0 { + return 0; + } + self.get_zero_offset_physical_index(self.offset()).unwrap() + } + + /// Returns the physical index at which the array slice ends. + pub fn get_end_physical_index(&self) -> usize { + if self.offset() + self.len() == Self::logical_len(&self.run_ends) { + return self.run_ends.len() - 1; + } + self.get_zero_offset_physical_index(self.offset() + self.len() - 1) + .unwrap() + } + /// Downcast this [`RunArray`] to a [`TypedRunArray`] /// /// ``` @@ -230,11 +250,7 @@ impl RunArray { } // Skip some physical indices based on offset. - let skip_value = if self.offset() > 0 { - self.get_zero_offset_physical_index(self.offset()).unwrap() - } else { - 0 - }; + let skip_value = self.get_start_physical_index(); let mut physical_indices = vec![0; indices_len]; diff --git a/arrow-array/src/run_iterator.rs b/arrow-array/src/run_iterator.rs index a79969c3cb91..fbf173b1dbe0 100644 --- a/arrow-array/src/run_iterator.rs +++ b/arrow-array/src/run_iterator.rs @@ -57,13 +57,8 @@ where { /// create a new iterator pub fn new(array: TypedRunArray<'a, R, V>) -> Self { - let current_front_physical: usize = - array.run_array().get_physical_index(0).unwrap(); - let current_back_physical: usize = array - .run_array() - .get_physical_index(array.len() - 1) - .unwrap() - + 1; + let current_front_physical = array.run_array().get_start_physical_index(); + let current_back_physical = array.run_array().get_end_physical_index() + 1; RunArrayIter { array, current_front_logical: array.offset(), diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 207f499ef275..c4baa2283885 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -18,14 +18,17 @@ //! 
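The two helpers added above resolve where a sliced `RunArray`'s logical window starts and ends within the physical `run_ends`/`values` children; the iterator now uses them, and the sort kernel below does as well. A small sketch of what they report for a slice, assuming the builder and downcast calls used by the tests in this commit (example values are illustrative):

```
use arrow_array::builder::PrimitiveRunBuilder;
use arrow_array::types::Int32Type;
use arrow_array::{Array, RunArray};

fn main() {
    // Logical values [1, 1, 2, 2, 2, 3] -> run_ends [2, 5, 6], values [1, 2, 3]
    let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::with_capacity(6);
    builder.extend([Some(1), Some(1), Some(2), Some(2), Some(2), Some(3)]);
    let run_array = builder.finish();

    // Slice covering logical rows 3..6, i.e. [2, 2, 3]
    let sliced = run_array.slice(3, 3);
    let sliced = sliced.as_any().downcast_ref::<RunArray<Int32Type>>().unwrap();

    // The slice starts inside the second physical run and ends at the last one
    assert_eq!(sliced.get_start_physical_index(), 1);
    assert_eq!(sliced.get_end_physical_index(), 2);
}
```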
Defines sort kernel for `ArrayRef` use crate::ord::{build_compare, DynComparator}; +use arrow_array::builder::BufferBuilder; use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::{ArrowNativeType, MutableBuffer}; use arrow_data::ArrayData; +use arrow_data::ArrayDataBuilder; use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; use arrow_select::take::take; use std::cmp::Ordering; +use std::sync::Arc; pub use arrow_schema::SortOptions; @@ -55,6 +58,9 @@ pub fn sort( values: &ArrayRef, options: Option, ) -> Result { + if let DataType::RunEndEncoded(_, _) = values.data_type() { + return sort_run(values, options, None); + } let indices = sort_to_indices(values, options, None)?; take(values.as_ref(), &indices, None) } @@ -94,6 +100,9 @@ pub fn sort_limit( options: Option, limit: Option, ) -> Result { + if let DataType::RunEndEncoded(_, _) = values.data_type() { + return sort_run(values, options, limit); + } let indices = sort_to_indices(values, options, limit)?; take(values.as_ref(), &indices, None) } @@ -357,6 +366,16 @@ pub fn sort_to_indices( sort_binary::(values, v, n, &options, limit) } DataType::LargeBinary => sort_binary::(values, v, n, &options, limit), + DataType::RunEndEncoded(run_ends_field, _) => match run_ends_field.data_type() { + DataType::Int16 => sort_run_to_indices::(values, &options, limit), + DataType::Int32 => sort_run_to_indices::(values, &options, limit), + DataType::Int64 => sort_run_to_indices::(values, &options, limit), + dt => { + return Err(ArrowError::ComputeError(format!( + "Inavlid run end data type: {dt}" + ))) + } + }, t => { return Err(ArrowError::ComputeError(format!( "Sort not supported for data type {t:?}" @@ -599,6 +618,194 @@ fn insert_valid_values(result_slice: &mut [u32], offset: usize, valids: &[(u3 append_valids(&mut result_slice[offset..offset + valids.len()]); } +// Sort run array and return sorted run array. +// The output RunArray will be encoded at the same level as input run array. +// For e.g. an input RunArray { run_ends = [2,4,6,8], values = [1,2,1,2] } +// will result in output RunArray { run_ends = [2,4,6,8], values = [1,1,2,2] } +// and not RunArray { run_ends = [4,8], values = [1,2] } +fn sort_run( + values: &ArrayRef, + options: Option, + limit: Option, +) -> Result { + match values.data_type() { + DataType::RunEndEncoded(run_ends_field, _) => match run_ends_field.data_type() { + DataType::Int16 => sort_run_downcasted::(values, options, limit), + DataType::Int32 => sort_run_downcasted::(values, options, limit), + DataType::Int64 => sort_run_downcasted::(values, options, limit), + dt => unreachable!("Not valid run ends data type {dt}"), + }, + dt => Err(ArrowError::InvalidArgumentError(format!( + "Input is not a run encoded array. Input data type {dt}" + ))), + } +} + +fn sort_run_downcasted( + values: &ArrayRef, + options: Option, + limit: Option, +) -> Result { + let run_array = values.as_any().downcast_ref::>().unwrap(); + + // Determine the length of output run array. 
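As the comments above note, sorting a run-encoded array keeps the run boundaries of the input rather than re-encoding adjacent equal values. A short usage sketch of the new path, assuming `sort` is reachable as `arrow_ord::sort::sort` and using the run builder from the tests below (example values are illustrative):

```
use std::sync::Arc;

use arrow_array::builder::PrimitiveRunBuilder;
use arrow_array::types::{Int16Type, Int64Type};
use arrow_array::{Array, ArrayRef, RunArray};
use arrow_ord::sort::sort;

fn main() {
    // Logical values [3, 3, null, 1, 1]
    let mut builder = PrimitiveRunBuilder::<Int16Type, Int64Type>::with_capacity(5);
    builder.extend([Some(3), Some(3), None, Some(1), Some(1)]);
    let run_array: ArrayRef = Arc::new(builder.finish());

    // `sort` now dispatches RunEndEncoded input to the run-aware path instead of
    // returning the "Sort not supported" error from `sort_to_indices`
    let sorted = sort(&run_array, None).unwrap();
    let sorted = sorted.as_any().downcast_ref::<RunArray<Int16Type>>().unwrap();

    // Default options sort nulls first: logical order [null, 1, 1, 3, 3]
    assert_eq!(sorted.len(), 5);
}
```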
+ let output_len = if let Some(limit) = limit { + limit.min(run_array.len()) + } else { + run_array.len() + }; + + let run_ends = run_array.run_ends(); + + let mut new_run_ends_builder = BufferBuilder::::new(run_ends.len()); + let mut new_run_end: usize = 0; + let mut new_physical_len: usize = 0; + + let consume_runs = |run_length, _| { + new_run_end += run_length; + new_physical_len += 1; + new_run_ends_builder.append(R::Native::from_usize(new_run_end).unwrap()); + }; + + let (values_indices, run_values) = + sort_run_inner(run_array, options, output_len, consume_runs); + + let new_run_ends = unsafe { + // Safety: + // The function builds a valid run_ends array and hence need not be validated. + ArrayDataBuilder::new(run_array.run_ends().data_type().clone()) + .len(new_physical_len) + .null_count(0) + .add_buffer(new_run_ends_builder.finish()) + .build_unchecked() + }; + + // slice the sorted value indices based on limit. + let new_values_indices: PrimitiveArray = values_indices + .slice(0, new_run_ends.len()) + .into_data() + .into(); + + let new_values = take(&run_values, &new_values_indices, None)?; + + // Build sorted run array + let builder = ArrayDataBuilder::new(run_array.data_type().clone()) + .len(new_run_end) + .add_child_data(new_run_ends) + .add_child_data(new_values.into_data()); + let array_data: RunArray = unsafe { + // Safety: + // This function builds a valid run array and hence can skip validation. + builder.build_unchecked().into() + }; + Ok(Arc::new(array_data)) +} + +// Sort to indices for run encoded array. +// This function will be slow for run array as it decodes the physical indices to +// logical indices and to get the run array back, the logical indices has to be +// encoded back to run array. +fn sort_run_to_indices( + values: &ArrayRef, + options: &SortOptions, + limit: Option, +) -> UInt32Array { + let run_array = values.as_any().downcast_ref::>().unwrap(); + let output_len = if let Some(limit) = limit { + limit.min(run_array.len()) + } else { + run_array.len() + }; + let mut result: Vec = Vec::with_capacity(output_len); + + //Add all logical indices belonging to a physical index to the output + let consume_runs = |run_length, logical_start| { + result.extend(logical_start as u32..(logical_start + run_length) as u32); + }; + sort_run_inner(run_array, Some(*options), output_len, consume_runs); + + UInt32Array::from(result) +} + +fn sort_run_inner( + run_array: &RunArray, + options: Option, + output_len: usize, + mut consume_runs: F, +) -> (PrimitiveArray, ArrayRef) +where + F: FnMut(usize, usize), +{ + // slice the run_array.values based on offset and length. + let start_physical_index = run_array.get_start_physical_index(); + let end_physical_index = run_array.get_end_physical_index(); + let physical_len = end_physical_index - start_physical_index + 1; + let run_values = run_array.values().slice(start_physical_index, physical_len); + + // All the values have to be sorted irrespective of input limit. + let values_indices = sort_to_indices(&run_values, options, None).unwrap(); + + let mut remaining_len = output_len; + + let run_ends = run_array.run_ends(); + + assert_eq!( + 0, + values_indices.null_count(), + "The output of sort_to_indices should not have null values. Its values is {}", + values_indices.null_count() + ); + + // Calculate `run length` of sorted value indices. + // Find the `logical index` at which the run starts. + // Call the consumer using the run length and starting logical index. 
+ for physical_index in values_indices.values() { + // As the values were sliced with offset = start_physical_index, it has to be added back + // before accesing `RunArray::run_ends` + let physical_index = *physical_index as usize + start_physical_index; + + // calculate the run length and logical index of sorted values + let (run_length, logical_index_start) = unsafe { + // Safety: + // The index will be within bounds as its in bounds of start_physical_index + // and len, both of which are within bounds of run_array + if physical_index == start_physical_index { + ( + run_ends.value_unchecked(physical_index).as_usize() + - run_array.offset(), + 0, + ) + } else if physical_index == end_physical_index { + let prev_run_end = + run_ends.value_unchecked(physical_index - 1).as_usize(); + ( + run_array.offset() + run_array.len() - prev_run_end, + prev_run_end - run_array.offset(), + ) + } else { + let prev_run_end = + run_ends.value_unchecked(physical_index - 1).as_usize(); + ( + run_ends.value_unchecked(physical_index).as_usize() - prev_run_end, + prev_run_end - run_array.offset(), + ) + } + }; + let new_run_length = run_length.min(remaining_len); + consume_runs(new_run_length, logical_index_start); + remaining_len -= new_run_length; + + if remaining_len == 0 { + break; + } + } + + if remaining_len > 0 { + panic!("Remaining length should be zero its values is {remaining_len}") + } + (values_indices, run_values) +} + /// Sort strings fn sort_string( values: &ArrayRef, @@ -1057,6 +1264,7 @@ fn sort_valids_array( #[cfg(test)] mod tests { use super::*; + use arrow_array::builder::PrimitiveRunBuilder; use arrow_buffer::i256; use rand::rngs::StdRng; use rand::{Rng, RngCore, SeedableRng}; @@ -2882,6 +3090,129 @@ mod tests { ); } + #[test] + fn test_sort_run_to_run() { + test_sort_run_inner(|array, sort_options, limit| { + sort_run(array, sort_options, limit) + }); + } + + #[test] + fn test_sort_run_to_indices() { + test_sort_run_inner(|array, sort_options, limit| { + let indices = sort_to_indices(array, sort_options, limit).unwrap(); + take(array, &indices, None) + }); + } + + fn test_sort_run_inner(sort_fn: F) + where + F: Fn( + &ArrayRef, + Option, + Option, + ) -> Result, + { + // Create an input array for testing + let total_len = 80; + let vals: Vec> = + vec![Some(1), None, Some(2), Some(3), Some(4), None, Some(5)]; + let repeats: Vec = vec![1, 3, 2, 4]; + let mut input_array: Vec> = Vec::with_capacity(total_len); + for ix in 0_usize..32 { + let repeat: usize = repeats[ix % repeats.len()]; + let val: Option = vals[ix % vals.len()]; + input_array.resize(input_array.len() + repeat, val); + } + + // create run array using input_array + // Encode the input_array to run array + let mut builder = + PrimitiveRunBuilder::::with_capacity(input_array.len()); + builder.extend(input_array.iter().copied()); + let run_array = builder.finish(); + + // slice lengths that are tested + let slice_lens = [ + 1, 2, 3, 4, 5, 6, 7, 37, 38, 39, 40, 41, 42, 43, 74, 75, 76, 77, 78, 79, 80, + ]; + for slice_len in slice_lens { + test_sort_run_inner2( + input_array.as_slice(), + &run_array, + 0, + slice_len, + None, + &sort_fn, + ); + test_sort_run_inner2( + input_array.as_slice(), + &run_array, + total_len - slice_len, + slice_len, + None, + &sort_fn, + ); + // Test with non zero limit + if slice_len > 1 { + test_sort_run_inner2( + input_array.as_slice(), + &run_array, + 0, + slice_len, + Some(slice_len / 2), + &sort_fn, + ); + test_sort_run_inner2( + input_array.as_slice(), + &run_array, + total_len - slice_len, + 
slice_len, + Some(slice_len / 2), + &sort_fn, + ); + } + } + } + + fn test_sort_run_inner2( + input_array: &[Option], + run_array: &RunArray, + offset: usize, + length: usize, + limit: Option, + sort_fn: &F, + ) where + F: Fn( + &ArrayRef, + Option, + Option, + ) -> Result, + { + // Run the sort and build actual result + let sliced_array = run_array.slice(offset, length); + let sorted_sliced_array = sort_fn(&sliced_array, None, limit).unwrap(); + let sorted_run_array = sorted_sliced_array + .as_any() + .downcast_ref::>() + .unwrap(); + let typed_run_array = sorted_run_array + .downcast::>() + .unwrap(); + let actual: Vec> = typed_run_array.into_iter().collect(); + + // build expected result. + let mut sliced_input = input_array[offset..(offset + length)].to_owned(); + sliced_input.sort(); + let expected = if let Some(limit) = limit { + sliced_input.iter().take(limit).copied().collect() + } else { + sliced_input + }; + + assert_eq!(expected, actual) + } + #[test] fn test_sort_string_dicts() { test_sort_string_dict_arrays::( diff --git a/arrow/benches/sort_kernel.rs b/arrow/benches/sort_kernel.rs index c4c6819df097..43a9a84d9a74 100644 --- a/arrow/benches/sort_kernel.rs +++ b/arrow/benches/sort_kernel.rs @@ -24,8 +24,8 @@ use std::sync::Arc; extern crate arrow; use arrow::compute::kernels::sort::{lexsort, SortColumn}; -use arrow::compute::sort_to_indices; -use arrow::datatypes::Int32Type; +use arrow::compute::{sort_limit, sort_to_indices}; +use arrow::datatypes::{Int16Type, Int32Type}; use arrow::util::bench_util::*; use arrow::{array::*, datatypes::Float32Type}; @@ -61,6 +61,10 @@ fn bench_sort_to_indices(array: &ArrayRef, limit: Option) { criterion::black_box(sort_to_indices(array, None, limit).unwrap()); } +fn bench_sort_run(array: &ArrayRef, limit: Option) { + criterion::black_box(sort_limit(array, None, limit).unwrap()); +} + fn add_benchmark(c: &mut Criterion) { let arr_a = create_f32_array(2u64.pow(10) as usize, false); let arr_b = create_f32_array(2u64.pow(10) as usize, false); @@ -107,6 +111,19 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_sort_to_indices(&dict_arr, None)) }); + let run_encoded_array = Arc::new(create_primitive_run_array::( + 2u64.pow(12) as usize, + 2u64.pow(10) as usize, + )) as ArrayRef; + + c.bench_function("sort primitive run to indices 2^12", |b| { + b.iter(|| bench_sort_to_indices(&run_encoded_array, None)) + }); + + c.bench_function("sort primitive run to run 2^12", |b| { + b.iter(|| bench_sort_run(&run_encoded_array, None)) + }); + // with limit { let arr_a = create_f32_array(2u64.pow(12) as usize, false); From 47e4b6166d67c50c87d99cd18efd770d5c331918 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 23 Feb 2023 09:52:34 +0000 Subject: [PATCH 0625/1411] Use Typed Buffers in Arrays (#1811) (#1176) (#3743) * Remove RawPtrBox (#1811) (#1176) * Clippy * Extract get_offsets function --- arrow-array/src/array/boolean_array.rs | 19 ++--- arrow-array/src/array/byte_array.rs | 42 +++++------ .../src/array/fixed_size_binary_array.rs | 10 +-- .../src/array/fixed_size_list_array.rs | 5 +- arrow-array/src/array/list_array.rs | 46 +++--------- arrow-array/src/array/map_array.rs | 27 ++----- arrow-array/src/array/mod.rs | 27 ++++++- arrow-array/src/array/primitive_array.rs | 42 ++++------- arrow-array/src/lib.rs | 1 - arrow-array/src/raw_pointer.rs | 75 ------------------- arrow-array/src/record_batch.rs | 2 +- arrow-buffer/src/buffer/immutable.rs | 70 ++++++++++++----- arrow-buffer/src/buffer/mod.rs | 2 
+ arrow-buffer/src/buffer/mutable.rs | 4 +- arrow-buffer/src/buffer/offset.rs | 58 ++++++++++++++ arrow-buffer/src/buffer/scalar.rs | 71 +++++++++++------- arrow-buffer/src/bytes.rs | 2 +- 17 files changed, 242 insertions(+), 261 deletions(-) delete mode 100644 arrow-array/src/raw_pointer.rs create mode 100644 arrow-buffer/src/buffer/offset.rs diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 4c83dcf411d4..428a721ddb6c 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. +use crate::array::print_long_array; use crate::builder::BooleanBuilder; use crate::iterator::BooleanIter; -use crate::raw_pointer::RawPtrBox; -use crate::{print_long_array, Array, ArrayAccessor}; +use crate::{Array, ArrayAccessor}; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayData; @@ -67,9 +67,7 @@ use std::any::Any; #[derive(Clone)] pub struct BooleanArray { data: ArrayData, - /// Pointer to the value array. The lifetime of this must be <= to the value buffer - /// stored in `data`, so it's safe to store. - raw_values: RawPtrBox, + raw_values: Buffer, } impl std::fmt::Debug for BooleanArray { @@ -102,7 +100,7 @@ impl BooleanArray { /// /// Note this doesn't take the offset of this array into account. pub fn values(&self) -> &Buffer { - &self.data.buffers()[0] + &self.raw_values } /// Returns the number of non null, true values within this array @@ -328,13 +326,8 @@ impl From for BooleanArray { 1, "BooleanArray data should contain a single buffer only (values buffer)" ); - let ptr = data.buffers()[0].as_ptr(); - Self { - data, - // SAFETY: - // ArrayData must be valid, and validated data type above - raw_values: unsafe { RawPtrBox::new(ptr) }, - } + let raw_values = data.buffers()[0].clone(); + Self { data, raw_values } } } diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 2cb04efb8e89..f6946228c85c 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -15,14 +15,14 @@ // specific language governing permissions and limitations // under the License. 
-use crate::array::{empty_offsets, print_long_array}; +use crate::array::{get_offsets, print_long_array}; use crate::builder::GenericByteBuilder; use crate::iterator::ArrayIter; -use crate::raw_pointer::RawPtrBox; use crate::types::bytes::ByteArrayNativeType; use crate::types::ByteArrayType; use crate::{Array, ArrayAccessor, OffsetSizeTrait}; -use arrow_buffer::ArrowNativeType; +use arrow_buffer::buffer::OffsetBuffer; +use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; @@ -39,16 +39,16 @@ use std::any::Any; /// [`LargeBinaryArray`]: crate::LargeBinaryArray pub struct GenericByteArray { data: ArrayData, - value_offsets: RawPtrBox, - value_data: RawPtrBox, + value_offsets: OffsetBuffer, + value_data: Buffer, } impl Clone for GenericByteArray { fn clone(&self) -> Self { Self { data: self.data.clone(), - value_offsets: self.value_offsets, - value_data: self.value_data, + value_offsets: self.value_offsets.clone(), + value_data: self.value_data.clone(), } } } @@ -68,7 +68,7 @@ impl GenericByteArray { /// Returns the raw value data pub fn value_data(&self) -> &[u8] { - self.data.buffers()[1].as_slice() + self.value_data.as_slice() } /// Returns true if all data within this array is ASCII @@ -82,15 +82,7 @@ impl GenericByteArray { /// Returns the offset values in the offsets buffer #[inline] pub fn value_offsets(&self) -> &[T::Offset] { - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the ArrayData instance. - unsafe { - std::slice::from_raw_parts( - self.value_offsets.as_ptr().add(self.data.offset()), - self.len() + 1, - ) - } + &self.value_offsets } /// Returns the element at index `i` @@ -161,6 +153,8 @@ impl GenericByteArray { .slice_with_length(self.data.offset() * element_len, value_len * element_len); drop(self.data); + drop(self.value_data); + drop(self.value_offsets); let try_mutable_null_buffer = match null_bit_buffer { None => Ok(None), @@ -280,18 +274,16 @@ impl From for GenericByteArray { T::Offset::PREFIX, T::PREFIX, ); - // Handle case of empty offsets - let offsets = match data.is_empty() && data.buffers()[0].is_empty() { - true => empty_offsets::().as_ptr() as *const _, - false => data.buffers()[0].as_ptr(), - }; - let values = data.buffers()[1].as_ptr(); + // SAFETY: + // ArrayData is valid, and verified type above + let value_offsets = unsafe { get_offsets(&data) }; + let value_data = data.buffers()[1].clone(); Self { data, // SAFETY: // ArrayData must be valid, and validated data type above - value_offsets: unsafe { RawPtrBox::new(offsets) }, - value_data: unsafe { RawPtrBox::new(values) }, + value_offsets, + value_data, } } } diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 936fb3025cd4..89ace430d8af 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. 
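With the change above, `GenericByteArray` stores a validated `OffsetBuffer` plus a plain values `Buffer` instead of raw pointers, so `value_offsets` is simply a typed slice view. A small sketch of what the accessors expose for a `StringArray` (values chosen for illustration):

```
use arrow_array::StringArray;

fn main() {
    let array = StringArray::from(vec![Some("foo"), None, Some("barbaz")]);

    // One more offset than elements; the null slot spans a zero-length range
    assert_eq!(array.value_offsets(), &[0, 3, 3, 9]);

    // The raw bytes behind all values, and a single decoded value
    assert_eq!(array.value_data(), b"foobarbaz");
    assert_eq!(array.value(2), "barbaz");
}
```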
+use crate::array::print_long_array; use crate::iterator::FixedSizeBinaryIter; -use crate::raw_pointer::RawPtrBox; -use crate::{print_long_array, Array, ArrayAccessor, FixedSizeListArray}; +use crate::{Array, ArrayAccessor, FixedSizeListArray}; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; @@ -50,7 +50,7 @@ use std::any::Any; #[derive(Clone)] pub struct FixedSizeBinaryArray { data: ArrayData, - value_data: RawPtrBox, + value_data: Buffer, length: i32, } @@ -357,14 +357,14 @@ impl From for FixedSizeBinaryArray { 1, "FixedSizeBinaryArray data should contain 1 buffer only (values)" ); - let value_data = data.buffers()[0].as_ptr(); + let value_data = data.buffers()[0].clone(); let length = match data.data_type() { DataType::FixedSizeBinary(len) => *len, _ => panic!("Expected data type to be FixedSizeBinary"), }; Self { data, - value_data: unsafe { RawPtrBox::new(value_data) }, + value_data, length, } } diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index c361d2d4462b..6e228ba3c770 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -15,10 +15,9 @@ // specific language governing permissions and limitations // under the License. +use crate::array::print_long_array; use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder}; -use crate::{ - make_array, print_long_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, -}; +use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType}; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index b378549ebf20..6b63269d1615 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -15,12 +15,12 @@ // specific language governing permissions and limitations // under the License. -use crate::array::make_array; +use crate::array::{get_offsets, make_array, print_long_array}; use crate::builder::{GenericListBuilder, PrimitiveBuilder}; use crate::{ - iterator::GenericListArrayIter, print_long_array, raw_pointer::RawPtrBox, Array, - ArrayAccessor, ArrayRef, ArrowPrimitiveType, + iterator::GenericListArrayIter, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, }; +use arrow_buffer::buffer::OffsetBuffer; use arrow_buffer::ArrowNativeType; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; @@ -45,35 +45,24 @@ impl OffsetSizeTrait for i64 { const PREFIX: &'static str = "Large"; } -/// Returns a slice of `OffsetSize` consisting of a single zero value -#[inline] -pub(crate) fn empty_offsets() -> &'static [OffsetSize] { - static OFFSET: &[i64] = &[0]; - // SAFETY: - // OffsetSize is ArrowNativeType and is therefore trivially transmutable - let (prefix, val, suffix) = unsafe { OFFSET.align_to::() }; - assert!(prefix.is_empty() && suffix.is_empty()); - val -} - /// Generic struct for a variable-size list array. 
/// /// Columnar format in Apache Arrow: /// /// /// For non generic lists, you may wish to consider using [`ListArray`] or [`LargeListArray`]` -pub struct GenericListArray { +pub struct GenericListArray { data: ArrayData, values: ArrayRef, - value_offsets: RawPtrBox, + value_offsets: OffsetBuffer, } -impl Clone for GenericListArray { +impl Clone for GenericListArray { fn clone(&self) -> Self { Self { data: self.data.clone(), values: self.values.clone(), - value_offsets: self.value_offsets, + value_offsets: self.value_offsets.clone(), } } } @@ -118,15 +107,7 @@ impl GenericListArray { /// Returns the offset values in the offsets buffer #[inline] pub fn value_offsets(&self) -> &[OffsetSize] { - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the ArrayData instance. - unsafe { - std::slice::from_raw_parts( - self.value_offsets.as_ptr().add(self.data.offset()), - self.len() + 1, - ) - } + &self.value_offsets } /// Returns the length for value at index `i`. @@ -242,15 +223,10 @@ impl GenericListArray { } let values = make_array(values); - // Handle case of empty offsets - let offsets = match data.is_empty() && data.buffers()[0].is_empty() { - true => empty_offsets::().as_ptr() as *const _, - false => data.buffers()[0].as_ptr(), - }; - // SAFETY: - // Verified list type in call to `Self::get_type` - let value_offsets = unsafe { RawPtrBox::new(offsets) }; + // ArrayData is valid, and verified type above + let value_offsets = unsafe { get_offsets(&data) }; + Ok(Self { data, values, diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index b0eb4a3c98ab..8c9b02921781 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -15,8 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::raw_pointer::RawPtrBox; -use crate::{make_array, print_long_array, Array, ArrayRef, StringArray, StructArray}; +use crate::array::{get_offsets, print_long_array}; +use crate::{make_array, Array, ArrayRef, StringArray, StructArray}; +use arrow_buffer::buffer::OffsetBuffer; use arrow_buffer::{ArrowNativeType, Buffer, ToByteSlice}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; @@ -38,7 +39,7 @@ pub struct MapArray { /// The second child of `entries`, the "values" of this MapArray values: ArrayRef, /// The start and end offsets of each entry - value_offsets: RawPtrBox, + value_offsets: OffsetBuffer, } impl MapArray { @@ -86,15 +87,7 @@ impl MapArray { /// Returns the offset values in the offsets buffer #[inline] pub fn value_offsets(&self) -> &[i32] { - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the ArrayData instance. - unsafe { - std::slice::from_raw_parts( - self.value_offsets.as_ptr().add(self.data.offset()), - self.len() + 1, - ) - } + &self.value_offsets } /// Returns the length for value at index `i`. 
@@ -159,18 +152,10 @@ impl MapArray { let keys = make_array(entries.child_data()[0].clone()); let values = make_array(entries.child_data()[1].clone()); let entries = make_array(entries); - let value_offsets = data.buffers()[0].as_ptr(); // SAFETY: // ArrayData is valid, and verified type above - let value_offsets = unsafe { RawPtrBox::::new(value_offsets) }; - unsafe { - if (*value_offsets.as_ptr().offset(0)) != 0 { - return Err(ArrowError::InvalidArgumentError(String::from( - "offsets do not start at zero", - ))); - } - } + let value_offsets = unsafe { get_offsets(&data) }; Ok(Self { data, diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index b293d797e46e..27973a40faa9 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -20,6 +20,8 @@ mod binary_array; use crate::types::*; +use arrow_buffer::buffer::{OffsetBuffer, ScalarBuffer}; +use arrow_buffer::ArrowNativeType; use arrow_data::ArrayData; use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use std::any::Any; @@ -636,8 +638,29 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { make_array(ArrayData::new_null(data_type, length)) } -// Helper function for printing potentially long arrays. -pub(crate) fn print_long_array( +/// Helper function that gets offset from an [`ArrayData`] +/// +/// # Safety +/// +/// - ArrayData must contain a valid [`OffsetBuffer`] as its first buffer +unsafe fn get_offsets(data: &ArrayData) -> OffsetBuffer { + match data.is_empty() && data.buffers()[0].is_empty() { + true => OffsetBuffer::new_empty(), + false => { + let buffer = ScalarBuffer::new( + data.buffers()[0].clone(), + data.offset(), + data.len() + 1, + ); + // Safety: + // ArrayData is valid + unsafe { OffsetBuffer::new_unchecked(buffer) } + } + } +} + +/// Helper function for printing potentially long arrays. +fn print_long_array( array: &A, f: &mut std::fmt::Formatter, print_item: F, diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index b64534e9835f..53217a06f497 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -15,16 +15,17 @@ // specific language governing permissions and limitations // under the License. +use crate::array::print_long_array; use crate::builder::{BooleanBufferBuilder, BufferBuilder, PrimitiveBuilder}; use crate::iterator::PrimitiveIter; -use crate::raw_pointer::RawPtrBox; use crate::temporal_conversions::{ as_date, as_datetime, as_datetime_with_timezone, as_duration, as_time, }; use crate::timezone::Tz; use crate::trusted_len::trusted_len_unzip; -use crate::{print_long_array, Array, ArrayAccessor}; use crate::{types::*, ArrowNativeTypeOp}; +use crate::{Array, ArrayAccessor}; +use arrow_buffer::buffer::ScalarBuffer; use arrow_buffer::{i256, ArrowNativeType, Buffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::ArrayData; @@ -266,22 +267,16 @@ pub trait ArrowPrimitiveType: 'static { /// ``` pub struct PrimitiveArray { /// Underlying ArrayData - /// # Safety - /// must have exactly one buffer, aligned to type T data: ArrayData, - /// Pointer to the value array. The lifetime of this must be <= to the value buffer - /// stored in `data`, so it's safe to store. 
- /// # Safety - /// raw_values must have a value equivalent to `data.buffers()[0].raw_data()` - /// raw_values must have alignment for type T::NativeType - raw_values: RawPtrBox, + /// Values data + raw_values: ScalarBuffer, } impl Clone for PrimitiveArray { fn clone(&self) -> Self { Self { data: self.data.clone(), - raw_values: self.raw_values, + raw_values: self.raw_values.clone(), } } } @@ -301,15 +296,7 @@ impl PrimitiveArray { /// Returns a slice of the values of this array #[inline] pub fn values(&self) -> &[T::Native] { - // Soundness - // raw_values alignment & location is ensured by fn from(ArrayDataRef) - // buffer bounds/offset is ensured by the ArrayData instance. - unsafe { - std::slice::from_raw_parts( - self.raw_values.as_ptr().add(self.data.offset()), - self.len(), - ) - } + &self.raw_values } /// Returns a new primitive array builder @@ -339,8 +326,7 @@ impl PrimitiveArray { /// caller must ensure that the passed in offset is less than the array len() #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> T::Native { - let offset = i + self.offset(); - *self.raw_values.as_ptr().add(offset) + *self.raw_values.get_unchecked(i) } /// Returns the primitive value at index `i`. @@ -632,6 +618,7 @@ impl PrimitiveArray { .slice_with_length(self.data.offset() * element_len, len * element_len); drop(self.data); + drop(self.raw_values); let try_mutable_null_buffer = match null_bit_buffer { None => Ok(None), @@ -724,6 +711,7 @@ impl<'a, T: ArrowPrimitiveType> ArrayAccessor for &'a PrimitiveArray { PrimitiveArray::value(self, index) } + #[inline] unsafe fn value_unchecked(&self, index: usize) -> Self::Item { PrimitiveArray::value_unchecked(self, index) } @@ -1085,13 +1073,9 @@ impl From for PrimitiveArray { "PrimitiveArray data should contain a single buffer only (values buffer)" ); - let ptr = data.buffers()[0].as_ptr(); - Self { - data, - // SAFETY: - // ArrayData must be valid, and validated data type above - raw_values: unsafe { RawPtrBox::new(ptr) }, - } + let raw_values = + ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); + Self { data, raw_values } } } diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 2cee2650eb7e..400b6e262faa 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -179,7 +179,6 @@ pub mod builder; pub mod cast; mod delta; pub mod iterator; -mod raw_pointer; pub mod run_iterator; pub mod temporal_conversions; pub mod timezone; diff --git a/arrow-array/src/raw_pointer.rs b/arrow-array/src/raw_pointer.rs deleted file mode 100644 index 0fea8c186d4c..000000000000 --- a/arrow-array/src/raw_pointer.rs +++ /dev/null @@ -1,75 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use std::ptr::NonNull; - -/// This struct is highly `unsafe` and offers the possibility to -/// self-reference a [arrow_buffer::Buffer] from -/// [arrow_data::ArrayData], as a pointer to the beginning of its -/// contents. -pub(super) struct RawPtrBox { - ptr: NonNull, -} - -impl Clone for RawPtrBox { - fn clone(&self) -> Self { - Self { ptr: self.ptr } - } -} - -impl Copy for RawPtrBox {} - -impl RawPtrBox { - /// # Safety - /// The user must guarantee that: - /// * the contents where `ptr` points to are never `moved`. This is guaranteed when they are Pinned. - /// * the lifetime of this struct does not outlive the lifetime of `ptr`. - /// Failure to fulfill any the above conditions results in undefined behavior. - /// # Panic - /// This function panics if: - /// * `ptr` is null - /// * `ptr` is not aligned to a slice of type `T`. This is guaranteed if it was built from a slice of type `T`. - pub(super) unsafe fn new(ptr: *const u8) -> Self { - let ptr = NonNull::new(ptr as *mut u8).expect("Pointer cannot be null"); - assert_eq!( - ptr.as_ptr().align_offset(std::mem::align_of::()), - 0, - "memory is not aligned" - ); - Self { ptr: ptr.cast() } - } - - pub(super) fn as_ptr(&self) -> *const T { - self.ptr.as_ptr() - } -} - -unsafe impl Send for RawPtrBox {} -unsafe impl Sync for RawPtrBox {} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - #[should_panic(expected = "memory is not aligned")] - #[cfg_attr(miri, ignore)] // sometimes does not panic as expected - fn test_primitive_array_alignment() { - let bytes = vec![0u8, 1u8]; - unsafe { RawPtrBox::::new(bytes.as_ptr().offset(1)) }; - } -} diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 3b517872aac4..04a559f21603 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -603,7 +603,7 @@ mod tests { let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) .unwrap(); - assert_eq!(record_batch.get_array_memory_size(), 592); + assert_eq!(record_batch.get_array_memory_size(), 640); } fn check_batch(record_batch: RecordBatch, num_rows: usize) { diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 4048787c6a1f..cbfba1e0540c 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. +use std::convert::AsRef; use std::fmt::Debug; use std::iter::FromIterator; use std::ptr::NonNull; use std::sync::Arc; -use std::{convert::AsRef, usize}; use crate::alloc::{Allocation, Deallocation}; use crate::util::bit_chunk_iterator::{BitChunks, UnalignedBitChunk}; @@ -30,26 +30,41 @@ use super::MutableBuffer; /// Buffer represents a contiguous memory region that can be shared with other buffers and across /// thread boundaries. -#[derive(Clone, PartialEq, Debug)] +#[derive(Clone, Debug)] pub struct Buffer { /// the internal byte buffer. data: Arc, - /// The offset into the buffer. - offset: usize, + /// Pointer into `data` valid + /// + /// We store a pointer instead of an offset to avoid pointer arithmetic + /// which causes LLVM to fail to vectorise code correctly + ptr: *const u8, /// Byte length of the buffer. 
length: usize, } +impl PartialEq for Buffer { + fn eq(&self, other: &Self) -> bool { + self.as_slice().eq(other.as_slice()) + } +} + +impl Eq for Buffer {} + +unsafe impl Send for Buffer where Bytes: Send {} +unsafe impl Sync for Buffer where Bytes: Sync {} + impl Buffer { /// Auxiliary method to create a new Buffer #[inline] pub fn from_bytes(bytes: Bytes) -> Self { let length = bytes.len(); + let ptr = bytes.as_ptr(); Buffer { data: Arc::new(bytes), - offset: 0, + ptr, length, } } @@ -108,9 +123,10 @@ impl Buffer { deallocation: Deallocation, ) -> Self { let bytes = Bytes::new(ptr, len, deallocation); + let ptr = bytes.as_ptr(); Buffer { + ptr, data: Arc::new(bytes), - offset: 0, length: len, } } @@ -136,7 +152,7 @@ impl Buffer { /// Returns the byte slice stored in this buffer pub fn as_slice(&self) -> &[u8] { - &self.data[self.offset..(self.offset + self.length)] + unsafe { std::slice::from_raw_parts(self.ptr, self.length) } } /// Returns a new [Buffer] that is a slice of this buffer starting at `offset`. @@ -145,13 +161,18 @@ impl Buffer { /// Panics iff `offset` is larger than `len`. pub fn slice(&self, offset: usize) -> Self { assert!( - offset <= self.len(), + offset <= self.length, "the offset of the new Buffer cannot exceed the existing length" ); + // Safety: + // This cannot overflow as + // `self.offset + self.length < self.data.len()` + // `offset < self.length` + let ptr = unsafe { self.ptr.add(offset) }; Self { data: self.data.clone(), - offset: self.offset + offset, length: self.length - offset, + ptr, } } @@ -162,12 +183,15 @@ impl Buffer { /// Panics iff `(offset + length)` is larger than the existing length. pub fn slice_with_length(&self, offset: usize, length: usize) -> Self { assert!( - offset + length <= self.len(), + offset.saturating_add(length) <= self.length, "the offset of the new Buffer cannot exceed the existing length" ); + // Safety: + // offset + length <= self.length + let ptr = unsafe { self.ptr.add(offset) }; Self { data: self.data.clone(), - offset: self.offset + offset, + ptr, length, } } @@ -178,7 +202,7 @@ impl Buffer { /// stored anywhere, to avoid dangling pointers. #[inline] pub fn as_ptr(&self) -> *const u8 { - unsafe { self.data.ptr().as_ptr().add(self.offset) } + self.ptr } /// View buffer as a slice of a specific type. @@ -231,18 +255,17 @@ impl Buffer { /// Returns `MutableBuffer` for mutating the buffer if this buffer is not shared. /// Returns `Err` if this is shared or its allocation is from an external source. pub fn into_mutable(self) -> Result { - let offset_ptr = self.as_ptr(); - let offset = self.offset; + let ptr = self.ptr; let length = self.length; Arc::try_unwrap(self.data) .and_then(|bytes| { // The pointer of underlying buffer should not be offset. 
- assert_eq!(offset_ptr, bytes.ptr().as_ptr()); + assert_eq!(ptr, bytes.ptr().as_ptr()); MutableBuffer::from_bytes(bytes).map_err(Arc::new) }) .map_err(|bytes| Buffer { data: bytes, - offset, + ptr, length, }) } @@ -262,7 +285,7 @@ impl> From for Buffer { } /// Creating a `Buffer` instance by storing the boolean values into the buffer -impl std::iter::FromIterator for Buffer { +impl FromIterator for Buffer { fn from_iter(iter: I) -> Self where I: IntoIterator, @@ -321,10 +344,10 @@ impl Buffer { pub unsafe fn try_from_trusted_len_iter< E, T: ArrowNativeType, - I: Iterator>, + I: Iterator>, >( iterator: I, - ) -> std::result::Result { + ) -> Result { Ok(MutableBuffer::try_from_trusted_len_iter(iterator)?.into()) } } @@ -600,4 +623,13 @@ mod tests { let slice = buffer.typed_data::(); assert_eq!(slice, &[2, 3, 4, 5]); } + + #[test] + #[should_panic( + expected = "the offset of the new Buffer cannot exceed the existing length" + )] + fn slice_overflow() { + let buffer = Buffer::from(MutableBuffer::from_len_zeroed(12)); + buffer.slice_with_length(2, usize::MAX); + } } diff --git a/arrow-buffer/src/buffer/mod.rs b/arrow-buffer/src/buffer/mod.rs index b9201f774fe0..7c12e1804f9f 100644 --- a/arrow-buffer/src/buffer/mod.rs +++ b/arrow-buffer/src/buffer/mod.rs @@ -18,6 +18,8 @@ //! This module contains two main structs: [Buffer] and [MutableBuffer]. A buffer represents //! a contiguous memory region that can be shared via `offsets`. +mod offset; +pub use offset::*; mod immutable; pub use immutable::*; mod mutable; diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index b70a74e84249..2e6e2f1d7b08 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -581,10 +581,10 @@ impl MutableBuffer { pub unsafe fn try_from_trusted_len_iter< E, T: ArrowNativeType, - I: Iterator>, + I: Iterator>, >( iterator: I, - ) -> std::result::Result { + ) -> Result { let item_size = std::mem::size_of::(); let (_, upper) = iterator.size_hint(); let upper = upper.expect("try_from_trusted_len_iter requires an upper limit"); diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs new file mode 100644 index 000000000000..a80c3c7ecb69 --- /dev/null +++ b/arrow-buffer/src/buffer/offset.rs @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
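Under the pointer-based `Buffer` above, slicing just advances `ptr` and shrinks `length`, and equality is defined over the visible bytes. The user-facing behaviour is unchanged; a small sketch (values for illustration):

```
use arrow_buffer::Buffer;

fn main() {
    let buffer = Buffer::from_iter([0_i32, 1, 2, 3, 4, 5]);

    // Slicing by 8 bytes skips the first two i32 values
    let sliced = buffer.slice(8);
    assert_eq!(sliced.len(), 16);
    assert_eq!(sliced.typed_data::<i32>(), &[2, 3, 4, 5]);

    // Buffers over different allocations compare equal if their bytes are equal
    assert_eq!(sliced, Buffer::from_iter([2_i32, 3, 4, 5]));
}
```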
+ +use crate::buffer::ScalarBuffer; +use crate::{ArrowNativeType, MutableBuffer}; +use std::ops::Deref; + +/// A non-empty buffer of monotonically increasing, positive integers +#[derive(Debug, Clone)] +pub struct OffsetBuffer(ScalarBuffer); + +impl OffsetBuffer { + /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`] + /// + /// # Safety + /// + /// `buffer` must be a non-empty buffer containing monotonically increasing + /// values greater than zero + pub unsafe fn new_unchecked(buffer: ScalarBuffer) -> Self { + Self(buffer) + } + + /// Create a new [`OffsetBuffer`] containing a single 0 value + pub fn new_empty() -> Self { + let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::()); + Self(buffer.into_buffer().into()) + } +} + +impl Deref for OffsetBuffer { + type Target = [T]; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl AsRef<[T]> for OffsetBuffer { + #[inline] + fn as_ref(&self) -> &[T] { + self + } +} diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 124f3f6f5894..e688e52fea5c 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -17,6 +17,7 @@ use crate::buffer::Buffer; use crate::native::ArrowNativeType; +use std::marker::PhantomData; use std::ops::Deref; /// Provides a safe API for interpreting a [`Buffer`] as a slice of [`ArrowNativeType`] @@ -25,14 +26,11 @@ use std::ops::Deref; /// /// All [`ArrowNativeType`] are valid for all possible backing byte representations, and as /// a result they are "trivially safely transmutable". -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ScalarBuffer { - #[allow(unused)] + /// Underlying data buffer buffer: Buffer, - // Borrows from `buffer` and is valid for the lifetime of `buffer` - ptr: *const T, - // The length of this slice - len: usize, + phantom: PhantomData, } impl ScalarBuffer { @@ -48,39 +46,50 @@ impl ScalarBuffer { /// * `bytes` is not large enough for the requested slice pub fn new(buffer: Buffer, offset: usize, len: usize) -> Self { let size = std::mem::size_of::(); - let offset_len = offset.checked_add(len).expect("length overflow"); - let start_bytes = offset.checked_mul(size).expect("start bytes overflow"); - let end_bytes = offset_len.checked_mul(size).expect("end bytes overflow"); - - let bytes = &buffer.as_slice()[start_bytes..end_bytes]; - - // SAFETY: all byte sequences correspond to a valid instance of T - let (prefix, offsets, suffix) = unsafe { bytes.align_to::() }; - assert!( - prefix.is_empty() && suffix.is_empty(), - "buffer is not aligned to {size} byte boundary" - ); - - let ptr = offsets.as_ptr(); - Self { buffer, ptr, len } + let byte_offset = offset.checked_mul(size).expect("offset overflow"); + let byte_len = len.checked_mul(size).expect("length overflow"); + buffer.slice_with_length(byte_offset, byte_len).into() } } impl Deref for ScalarBuffer { type Target = [T]; + #[inline] fn deref(&self) -> &Self::Target { - // SAFETY: Bounds checked in constructor and ptr is valid for the lifetime of self - unsafe { std::slice::from_raw_parts(self.ptr, self.len) } + // SAFETY: Verified alignment in From + unsafe { + std::slice::from_raw_parts( + self.buffer.as_ptr() as *const T, + self.buffer.len() / std::mem::size_of::(), + ) + } } } impl AsRef<[T]> for ScalarBuffer { + #[inline] fn as_ref(&self) -> &[T] { self } } +impl From for ScalarBuffer { + fn from(buffer: Buffer) -> Self { + let align = std::mem::align_of::(); + assert_eq!( + buffer.as_ptr().align_offset(align), + 0, + "memory 
is not aligned" + ); + + Self { + buffer, + phantom: Default::default(), + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -103,7 +112,7 @@ mod tests { } #[test] - #[should_panic(expected = "buffer is not aligned to 4 byte boundary")] + #[should_panic(expected = "memory is not aligned")] fn test_unaligned() { let expected = [0_i32, 1, 2]; let buffer = Buffer::from_iter(expected.iter().cloned()); @@ -112,35 +121,39 @@ mod tests { } #[test] - #[should_panic(expected = "range end index 16 out of range for slice of length 12")] + #[should_panic( + expected = "the offset of the new Buffer cannot exceed the existing length" + )] fn test_length_out_of_bounds() { let buffer = Buffer::from_iter([0_i32, 1, 2]); ScalarBuffer::::new(buffer, 1, 3); } #[test] - #[should_panic(expected = "range end index 16 out of range for slice of length 12")] + #[should_panic( + expected = "the offset of the new Buffer cannot exceed the existing length" + )] fn test_offset_out_of_bounds() { let buffer = Buffer::from_iter([0_i32, 1, 2]); ScalarBuffer::::new(buffer, 4, 0); } #[test] - #[should_panic(expected = "length overflow")] + #[should_panic(expected = "offset overflow")] fn test_length_overflow() { let buffer = Buffer::from_iter([0_i32, 1, 2]); ScalarBuffer::::new(buffer, usize::MAX, 1); } #[test] - #[should_panic(expected = "start bytes overflow")] + #[should_panic(expected = "offset overflow")] fn test_start_overflow() { let buffer = Buffer::from_iter([0_i32, 1, 2]); ScalarBuffer::::new(buffer, usize::MAX / 4 + 1, 0); } #[test] - #[should_panic(expected = "end bytes overflow")] + #[should_panic(expected = "length overflow")] fn test_end_overflow() { let buffer = Buffer::from_iter([0_i32, 1, 2]); ScalarBuffer::::new(buffer, 0, usize::MAX / 4 + 1); diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs index fea04ad0d50b..3320dfc261c7 100644 --- a/arrow-buffer/src/bytes.rs +++ b/arrow-buffer/src/bytes.rs @@ -61,7 +61,7 @@ impl Bytes { /// bytes. If the `ptr` and `capacity` come from a `Buffer`, then this is guaranteed. #[inline] pub(crate) unsafe fn new( - ptr: std::ptr::NonNull, + ptr: NonNull, len: usize, deallocation: Deallocation, ) -> Bytes { From 0373a9d77f446918d44b1ee216ed33de3905b688 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 23 Feb 2023 10:57:10 +0100 Subject: [PATCH 0626/1411] Implement fallible streams for `FlightClient::do_put` (#3464) * Implement fallible streams for do_put * Another approach to error wrapping * implement basic client error test * Add last error test * comments * fix docs * Simplify --------- Co-authored-by: Raphael Taylor-Davies --- arrow-flight/src/client.rs | 58 ++++++++++++++++----- arrow-flight/tests/client.rs | 99 +++++++++++++++++++++++++++++++++--- 2 files changed, 136 insertions(+), 21 deletions(-) diff --git a/arrow-flight/src/client.rs b/arrow-flight/src/client.rs index bdd51dda4f9f..fe1292fcff6e 100644 --- a/arrow-flight/src/client.rs +++ b/arrow-flight/src/client.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
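As context for the `ScalarBuffer` rework above: the struct now stores only the backing `Buffer` plus a `PhantomData` marker, validates alignment when converting from a `Buffer`, and exposes its contents through `Deref<Target = [T]>`. A minimal usage sketch, relying only on the `Buffer::from_iter`, `ScalarBuffer::new` and `Deref` APIs visible in the hunks above; the element values and function name are illustrative:

```rust
use arrow_buffer::buffer::{Buffer, ScalarBuffer};

// Illustrative only: `new` takes an element offset and length and panics on
// misaligned or out-of-bounds input, as exercised by the tests above.
fn typed_view() {
    let buffer = Buffer::from_iter([0_i32, 1, 2, 3]);

    // Reinterpret the raw bytes as two i32 values starting at element 1
    let typed: ScalarBuffer<i32> = ScalarBuffer::new(buffer, 1, 2);

    // Deref yields a plain &[i32] over the selected elements
    assert_eq!(&typed[..], &[1, 2][..]);
}
```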
+use std::task::Poll; + use crate::{ decode::FlightRecordBatchStream, flight_service_client::FlightServiceClient, Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, @@ -24,8 +26,9 @@ use arrow_schema::Schema; use bytes::Bytes; use futures::{ future::ready, + ready, stream::{self, BoxStream}, - Stream, StreamExt, TryStreamExt, + FutureExt, Stream, StreamExt, TryStreamExt, }; use tonic::{metadata::MetadataMap, transport::Channel}; @@ -262,6 +265,15 @@ impl FlightClient { /// [`Stream`](futures::Stream) of [`FlightData`] and returning a /// stream of [`PutResult`]. /// + /// # Note + /// + /// The input stream is [`Result`] so that this can be connected + /// to a streaming data source, such as [`FlightDataEncoder`](crate::encode::FlightDataEncoder), + /// without having to buffer. If the input stream returns an error + /// that error will not be sent to the server, instead it will be + /// placed into the result stream and the server connection + /// terminated. + /// /// # Example: /// ```no_run /// # async fn run() { @@ -279,9 +291,7 @@ impl FlightClient { /// /// // encode the batch as a stream of `FlightData` /// let flight_data_stream = FlightDataEncoderBuilder::new() - /// .build(futures::stream::iter(vec![Ok(batch)])) - /// // data encoder return Results, but do_put requires FlightData - /// .map(|batch|batch.unwrap()); + /// .build(futures::stream::iter(vec![Ok(batch)])); /// /// // send the stream and get the results as `PutResult` /// let response: Vec= client @@ -293,20 +303,40 @@ impl FlightClient { /// .expect("error calling do_put"); /// # } /// ``` - pub async fn do_put + Send + 'static>( + pub async fn do_put> + Send + 'static>( &mut self, request: S, ) -> Result>> { - let request = self.make_request(request); - - let response = self - .inner - .do_put(request) - .await? 
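To illustrate the `# Note` in the documentation above: `do_put` now accepts a stream of `Result<FlightData>`, and an error produced on the client side is not sent to the server; it is surfaced through the returned result stream instead. A sketch of that behaviour under stated assumptions (the function name, placeholder data and the `Status::internal` message are made up; the import paths reflect the crate's public API at this version):

```rust
use arrow_flight::{error::FlightError, FlightClient, FlightData};
use futures::{stream, StreamExt, TryStreamExt};
use tonic::Status;

// `client` is an already connected FlightClient and `encoded` holds FlightData
// produced elsewhere (for example by a FlightDataEncoder); both are placeholders.
async fn do_put_with_client_error(
    client: &mut FlightClient,
    encoded: Vec<FlightData>,
) -> Result<(), FlightError> {
    // Valid data followed by a client-side failure, mimicking an encoder that errors
    let input = stream::iter(encoded).map(Ok).chain(stream::iter(vec![Err(
        FlightError::from(Status::internal("encoder failed")),
    )]));

    // The failure is not forwarded to the server; it shows up in the result stream
    let results: Result<Vec<_>, FlightError> =
        client.do_put(input).await?.try_collect().await;
    assert!(results.is_err());
    Ok(())
}
```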
- .into_inner() - .map_err(FlightError::Tonic); + let (sender, mut receiver) = futures::channel::oneshot::channel(); + + // Intercepts client errors and sends them to the oneshot channel above + let mut request = Box::pin(request); // Pin to heap + let mut sender = Some(sender); // Wrap into Option so can be taken + let request_stream = futures::stream::poll_fn(move |cx| { + Poll::Ready(match ready!(request.poll_next_unpin(cx)) { + Some(Ok(data)) => Some(data), + Some(Err(e)) => { + let _ = sender.take().unwrap().send(e); + None + } + None => None, + }) + }); + + let request = self.make_request(request_stream); + let mut response_stream = self.inner.do_put(request).await?.into_inner(); + + // Forwards errors from the error oneshot with priority over responses from server + let error_stream = futures::stream::poll_fn(move |cx| { + if let Poll::Ready(Ok(err)) = receiver.poll_unpin(cx) { + return Poll::Ready(Some(Err(err))); + } + let next = ready!(response_stream.poll_next_unpin(cx)); + Poll::Ready(next.map(|x| x.map_err(FlightError::Tonic))) + }); - Ok(response.boxed()) + // combine the response from the server and any error from the client + Ok(error_stream.boxed()) } /// Make a `DoExchange` call to the server with the provided diff --git a/arrow-flight/tests/client.rs b/arrow-flight/tests/client.rs index ab1cfa1fb053..ed928a52c99a 100644 --- a/arrow-flight/tests/client.rs +++ b/arrow-flight/tests/client.rs @@ -248,8 +248,10 @@ async fn test_do_put() { test_server .set_do_put_response(expected_response.clone().into_iter().map(Ok).collect()); + let input_stream = futures::stream::iter(input_flight_data.clone()).map(Ok); + let response_stream = client - .do_put(futures::stream::iter(input_flight_data.clone())) + .do_put(input_stream) .await .expect("error making request"); @@ -266,15 +268,15 @@ async fn test_do_put() { } #[tokio::test] -async fn test_do_put_error() { +async fn test_do_put_error_server() { do_test(|test_server, mut client| async move { client.add_header("foo-header", "bar-header-value").unwrap(); let input_flight_data = test_flight_data().await; - let response = client - .do_put(futures::stream::iter(input_flight_data.clone())) - .await; + let input_stream = futures::stream::iter(input_flight_data.clone()).map(Ok); + + let response = client.do_put(input_stream).await; let response = match response { Ok(_) => panic!("unexpected success"), Err(e) => e, @@ -290,7 +292,7 @@ async fn test_do_put_error() { } #[tokio::test] -async fn test_do_put_error_stream() { +async fn test_do_put_error_stream_server() { do_test(|test_server, mut client| async move { client.add_header("foo-header", "bar-header-value").unwrap(); @@ -307,8 +309,10 @@ async fn test_do_put_error_stream() { test_server.set_do_put_response(response); + let input_stream = futures::stream::iter(input_flight_data.clone()).map(Ok); + let response_stream = client - .do_put(futures::stream::iter(input_flight_data.clone())) + .do_put(input_stream) .await .expect("error making request"); @@ -326,6 +330,87 @@ async fn test_do_put_error_stream() { .await; } +#[tokio::test] +async fn test_do_put_error_client() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let e = Status::invalid_argument("bad arg: client"); + + // input stream to client sends good FlightData followed by an error + let input_flight_data = test_flight_data().await; + let input_stream = futures::stream::iter(input_flight_data.clone()) + .map(Ok) + 
.chain(futures::stream::iter(vec![Err(FlightError::from( + e.clone(), + ))])); + + // server responds with one good message + let response = vec![Ok(PutResult { + app_metadata: Bytes::from("foo-metadata"), + })]; + test_server.set_do_put_response(response); + + let response_stream = client + .do_put(input_stream) + .await + .expect("error making request"); + + let response: Result, _> = response_stream.try_collect().await; + let response = match response { + Ok(_) => panic!("unexpected success"), + Err(e) => e, + }; + + // expect to the error made from the client + expect_status(response, e); + // server still got the request messages until the client sent the error + assert_eq!(test_server.take_do_put_request(), Some(input_flight_data)); + ensure_metadata(&client, &test_server); + }) + .await; +} + +#[tokio::test] +async fn test_do_put_error_client_and_server() { + do_test(|test_server, mut client| async move { + client.add_header("foo-header", "bar-header-value").unwrap(); + + let e_client = Status::invalid_argument("bad arg: client"); + let e_server = Status::invalid_argument("bad arg: server"); + + // input stream to client sends good FlightData followed by an error + let input_flight_data = test_flight_data().await; + let input_stream = futures::stream::iter(input_flight_data.clone()) + .map(Ok) + .chain(futures::stream::iter(vec![Err(FlightError::from( + e_client.clone(), + ))])); + + // server responds with an error (e.g. because it got truncated data) + let response = vec![Err(e_server)]; + test_server.set_do_put_response(response); + + let response_stream = client + .do_put(input_stream) + .await + .expect("error making request"); + + let response: Result, _> = response_stream.try_collect().await; + let response = match response { + Ok(_) => panic!("unexpected success"), + Err(e) => e, + }; + + // expect to the error made from the client (not the server) + expect_status(response, e_client); + // server still got the request messages until the client sent the error + assert_eq!(test_server.take_do_put_request(), Some(input_flight_data)); + ensure_metadata(&client, &test_server); + }) + .await; +} + #[tokio::test] async fn test_do_exchange() { do_test(|test_server, mut client| async move { From 9699e1df7c7e0b83c8ec8be6678ee17d77a17f47 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 23 Feb 2023 14:04:30 +0000 Subject: [PATCH 0627/1411] Update prost-build requirement from =0.11.6 to =0.11.7 (#3753) Updates the requirements on [prost-build](https://github.com/tokio-rs/prost) to permit the latest version. - [Release notes](https://github.com/tokio-rs/prost/releases) - [Commits](https://github.com/tokio-rs/prost/commits) --- updated-dependencies: - dependency-name: prost-build dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 603d4a636623..1ed98c919d8e 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -59,7 +59,7 @@ tower = "0.4.13" # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.51", default-features = false } -prost-build = { version = "=0.11.6", default-features = false } +prost-build = { version = "=0.11.7", default-features = false } tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } [[example]] From 57f79c03a8dee9d8bf8601bf555aa271746913fe Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Thu, 23 Feb 2023 17:02:52 +0100 Subject: [PATCH 0628/1411] Enable casting of string to timestamp with microsecond resolution (#3752) * Enable casting of string to timestamp with microsecond resolution * Enable string conversion to timestamp with second and millisecond resolution --- arrow-cast/src/cast.rs | 128 ++++++++++++++++++++++++++++++++--------- 1 file changed, 101 insertions(+), 27 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 49461b14c339..d49775c98211 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -166,6 +166,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time32(TimeUnit::Millisecond) | Time64(TimeUnit::Microsecond) | Time64(TimeUnit::Nanosecond) + | Timestamp(TimeUnit::Second, _) + | Timestamp(TimeUnit::Millisecond, _) + | Timestamp(TimeUnit::Microsecond, _) | Timestamp(TimeUnit::Nanosecond, _) ) => true, (Utf8, _) => to_type.is_numeric() && to_type != &Float16, @@ -179,6 +182,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time32(TimeUnit::Millisecond) | Time64(TimeUnit::Microsecond) | Time64(TimeUnit::Nanosecond) + | Timestamp(TimeUnit::Second, _) + | Timestamp(TimeUnit::Millisecond, _) + | Timestamp(TimeUnit::Microsecond, _) | Timestamp(TimeUnit::Nanosecond, _) ) => true, (LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, @@ -1141,8 +1147,17 @@ pub fn cast_with_options( Time64(TimeUnit::Nanosecond) => { cast_string_to_time64nanosecond::(array, cast_options) } + Timestamp(TimeUnit::Second, _) => { + cast_string_to_timestamp::(array, cast_options) + } + Timestamp(TimeUnit::Millisecond, _) => { + cast_string_to_timestamp::(array, cast_options) + } + Timestamp(TimeUnit::Microsecond, _) => { + cast_string_to_timestamp::(array, cast_options) + } Timestamp(TimeUnit::Nanosecond, _) => { - cast_string_to_timestamp_ns::(array, cast_options) + cast_string_to_timestamp::(array, cast_options) } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", @@ -1182,8 +1197,17 @@ pub fn cast_with_options( Time64(TimeUnit::Nanosecond) => { cast_string_to_time64nanosecond::(array, cast_options) } + Timestamp(TimeUnit::Second, _) => { + cast_string_to_timestamp::(array, cast_options) + } + Timestamp(TimeUnit::Millisecond, _) => { + cast_string_to_timestamp::(array, cast_options) + } + Timestamp(TimeUnit::Microsecond, _) => { + cast_string_to_timestamp::(array, cast_options) + } Timestamp(TimeUnit::Nanosecond, _) => { - cast_string_to_timestamp_ns::(array, cast_options) + cast_string_to_timestamp::(array, 
cast_options) } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", @@ -2552,8 +2576,11 @@ fn cast_string_to_time64nanosecond( Ok(Arc::new(array) as ArrayRef) } -/// Casts generic string arrays to TimeStampNanosecondArray -fn cast_string_to_timestamp_ns( +/// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.) +fn cast_string_to_timestamp< + Offset: OffsetSizeTrait, + TimestampType: ArrowTimestampType, +>( array: &dyn Array, cast_options: &CastOptions, ) -> Result { @@ -2562,26 +2589,36 @@ fn cast_string_to_timestamp_ns( .downcast_ref::>() .unwrap(); + let scale_factor = match TimestampType::get_time_unit() { + TimeUnit::Second => 1_000_000_000, + TimeUnit::Millisecond => 1_000_000, + TimeUnit::Microsecond => 1_000, + TimeUnit::Nanosecond => 1, + }; + let array = if cast_options.safe { - let iter = string_array - .iter() - .map(|v| v.and_then(|v| string_to_timestamp_nanos(v).ok())); + let iter = string_array.iter().map(|v| { + v.and_then(|v| string_to_timestamp_nanos(v).ok().map(|t| t / scale_factor)) + }); // Benefit: // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { TimestampNanosecondArray::from_trusted_len_iter(iter) } + unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } } else { let vec = string_array .iter() - .map(|v| v.map(string_to_timestamp_nanos).transpose()) + .map(|v| { + v.map(|v| string_to_timestamp_nanos(v).map(|t| t / scale_factor)) + .transpose() + }) .collect::>, _>>()?; // Benefit: // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { TimestampNanosecondArray::from_trusted_len_iter(vec.iter()) } + unsafe { PrimitiveArray::::from_trusted_len_iter(vec.iter()) } }; Ok(Arc::new(array) as ArrayRef) @@ -4704,32 +4741,69 @@ mod tests { #[test] fn test_cast_string_to_timestamp() { let a1 = Arc::new(StringArray::from(vec![ - Some("2020-09-08T12:00:00+00:00"), + Some("2020-09-08T12:00:00.123456789+00:00"), Some("Not a valid date"), None, ])) as ArrayRef; let a2 = Arc::new(LargeStringArray::from(vec![ - Some("2020-09-08T12:00:00+00:00"), + Some("2020-09-08T12:00:00.123456789+00:00"), Some("Not a valid date"), None, ])) as ArrayRef; for array in &[a1, a2] { - let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None); - let b = cast(array, &to_type).unwrap(); - let c = b - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1599566400000000000, c.value(0)); - assert!(c.is_null(1)); - assert!(c.is_null(2)); + for time_unit in &[ + TimeUnit::Second, + TimeUnit::Millisecond, + TimeUnit::Microsecond, + TimeUnit::Nanosecond, + ] { + let to_type = DataType::Timestamp(time_unit.clone(), None); + let b = cast(array, &to_type).unwrap(); + + match time_unit { + TimeUnit::Second => { + let c = + b.as_any().downcast_ref::().unwrap(); + assert_eq!(1599566400, c.value(0)); + assert!(c.is_null(1)); + assert!(c.is_null(2)); + } + TimeUnit::Millisecond => { + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1599566400123, c.value(0)); + assert!(c.is_null(1)); + assert!(c.is_null(2)); + } + TimeUnit::Microsecond => { + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1599566400123456, c.value(0)); + assert!(c.is_null(1)); + assert!(c.is_null(2)); + } + TimeUnit::Nanosecond => { + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1599566400123456789, c.value(0)); + assert!(c.is_null(1)); + 
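For reference, the casts added in this commit can also be exercised directly through the public cast kernel. A sketch mirroring the millisecond case asserted in this test (the function name and imports are illustrative; the input string and expected value are taken from the test itself):

```rust
use std::sync::Arc;

use arrow_array::{Array, ArrayRef, StringArray, TimestampMillisecondArray};
use arrow_cast::cast::cast;
use arrow_schema::{DataType, TimeUnit};

fn cast_utf8_to_millisecond_timestamp() {
    let strings: ArrayRef = Arc::new(StringArray::from(vec![
        Some("2020-09-08T12:00:00.123456789+00:00"),
        None,
    ]));

    // Nanosecond precision in the input is divided down to the target unit
    let to_type = DataType::Timestamp(TimeUnit::Millisecond, None);
    let result = cast(&strings, &to_type).unwrap();
    let result = result
        .as_any()
        .downcast_ref::<TimestampMillisecondArray>()
        .unwrap();

    assert_eq!(result.value(0), 1_599_566_400_123);
    assert!(result.is_null(1));
}
```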
assert!(c.is_null(2)); + } + } - let options = CastOptions { safe: false }; - let err = cast_with_options(array, &to_type, &options).unwrap_err(); - assert_eq!( - err.to_string(), - "Cast error: Error parsing 'Not a valid date' as timestamp" - ); + let options = CastOptions { safe: false }; + let err = cast_with_options(array, &to_type, &options).unwrap_err(); + assert_eq!( + err.to_string(), + "Cast error: Error parsing 'Not a valid date' as timestamp" + ); + } } } From 350867436ab3477bafd7008355286378ab37045f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 23 Feb 2023 16:40:00 +0000 Subject: [PATCH 0629/1411] Update MIRI for split crates (#2594) (#3754) --- .github/workflows/miri.sh | 5 ++++- arrow-array/src/array/boolean_array.rs | 1 + arrow-array/src/array/run_array.rs | 2 ++ arrow-array/src/run_iterator.rs | 1 + arrow-buffer/src/bigint.rs | 2 +- 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/miri.sh b/.github/workflows/miri.sh index 56da5c5c5d3e..3323bd0996bf 100755 --- a/.github/workflows/miri.sh +++ b/.github/workflows/miri.sh @@ -14,4 +14,7 @@ cargo miri setup cargo clean echo "Starting Arrow MIRI run..." -cargo miri test -p arrow -- --skip csv --skip ipc --skip json +cargo miri test -p arrow-buffer +cargo miri test -p arrow-data --features ffi +cargo miri test -p arrow-schema --features ffi +cargo miri test -p arrow-array diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 428a721ddb6c..8d1296c662fc 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -542,6 +542,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] // Takes too long fn test_true_false_count() { let mut rng = thread_rng(); diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 9dba3ddab6ae..126aefde94f3 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -881,6 +881,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] // Takes too long fn test_get_physical_indices() { // Test for logical lengths starting from 10 to 250 increasing by 10 for logical_len in (0..250).step_by(10) { @@ -917,6 +918,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] // Takes too long fn test_get_physical_indices_sliced() { let total_len = 80; let input_array = build_input_array(total_len); diff --git a/arrow-array/src/run_iterator.rs b/arrow-array/src/run_iterator.rs index fbf173b1dbe0..44cb59ac7fc4 100644 --- a/arrow-array/src/run_iterator.rs +++ b/arrow-array/src/run_iterator.rs @@ -349,6 +349,7 @@ mod tests { } #[test] + #[cfg_attr(miri, ignore)] // Takes too long fn test_sliced_run_array_iterator() { let total_len = 80; let input_array = build_input_array(total_len); diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index 0d404df169e1..421a7bdd02d0 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -549,7 +549,7 @@ impl ToPrimitive for i256 { } } -#[cfg(test)] +#[cfg(all(test, not(miri)))] // llvm.x86.subborrow.64 not supported by MIRI mod tests { use super::*; use num::{BigInt, FromPrimitive, Signed, ToPrimitive}; From be2acec50e695f905b524d3235a0dd1ce1efe1f1 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner <14581281+iajoiner@users.noreply.github.com> Date: Fri, 24 Feb 2023 05:11:59 -0500 Subject: [PATCH 0630/1411] Update to 34.0.0 and update changelog (#3757) * Update version * update update_change_log script * Update 
changelog --- CHANGELOG-old.md | 74 +++++++++++++++ CHANGELOG.md | 97 +++++++++----------- arrow-arith/Cargo.toml | 10 +- arrow-array/Cargo.toml | 8 +- arrow-buffer/Cargo.toml | 2 +- arrow-cast/Cargo.toml | 12 +-- arrow-csv/Cargo.toml | 12 +-- arrow-data/Cargo.toml | 6 +- arrow-flight/Cargo.toml | 14 +-- arrow-flight/README.md | 2 +- arrow-integration-test/Cargo.toml | 6 +- arrow-integration-testing/Cargo.toml | 2 +- arrow-ipc/Cargo.toml | 12 +-- arrow-json/Cargo.toml | 12 +-- arrow-ord/Cargo.toml | 12 +-- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow-row/Cargo.toml | 14 +-- arrow-schema/Cargo.toml | 2 +- arrow-select/Cargo.toml | 10 +- arrow-string/Cargo.toml | 12 +-- arrow/Cargo.toml | 28 +++--- arrow/README.md | 2 +- dev/release/README.md | 2 +- dev/release/update_change_log.sh | 4 +- parquet/Cargo.toml | 20 ++-- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 28 files changed, 226 insertions(+), 167 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 9ac8cb530456..9b9df494efb2 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,79 @@ # Historical Changelog +## [33.0.0](https://github.com/apache/arrow-rs/tree/33.0.0) (2023-02-10) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/32.0.0...33.0.0) + +**Breaking changes:** + +- Use ArrayFormatter in Cast Kernel [\#3668](https://github.com/apache/arrow-rs/pull/3668) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use dyn Array in cast kernels [\#3667](https://github.com/apache/arrow-rs/pull/3667) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Return references from FixedSizeListArray and MapArray [\#3652](https://github.com/apache/arrow-rs/pull/3652) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Lazy array display \(\#3638\) [\#3647](https://github.com/apache/arrow-rs/pull/3647) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use array\_value\_to\_string in arrow-csv [\#3514](https://github.com/apache/arrow-rs/pull/3514) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JayjeetAtGithub](https://github.com/JayjeetAtGithub)) + +**Implemented enhancements:** + +- Support UTF8 cast to Timestamp with timezone [\#3664](https://github.com/apache/arrow-rs/issues/3664) +- Add modulus\_dyn and modulus\_scalar\_dyn [\#3648](https://github.com/apache/arrow-rs/issues/3648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- A trait for append\_value and append\_null on ArrayBuilders [\#3644](https://github.com/apache/arrow-rs/issues/3644) +- Improve error messge "batches\[0\] schema is different with argument schema" [\#3628](https://github.com/apache/arrow-rs/issues/3628) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Specified version of helper function to cast binary to string [\#3623](https://github.com/apache/arrow-rs/issues/3623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Casting generic binary to generic string [\#3606](https://github.com/apache/arrow-rs/issues/3606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use `array_value_to_string` in `arrow-csv` [\#3483](https://github.com/apache/arrow-rs/issues/3483) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- ArrowArray::try\_from\_raw Misleading Signature [\#3684](https://github.com/apache/arrow-rs/issues/3684) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- PyArrowConvert Leaks Memory [\#3683](https://github.com/apache/arrow-rs/issues/3683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Arrow-csv reader cannot produce RecordBatch even if the bytes are necessary [\#3674](https://github.com/apache/arrow-rs/issues/3674) +- FFI Fails to Account For Offsets [\#3671](https://github.com/apache/arrow-rs/issues/3671) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Regression in CSV reader error handling [\#3656](https://github.com/apache/arrow-rs/issues/3656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- UnionArray Child and Value Fail to Account for non-contiguous Type IDs [\#3653](https://github.com/apache/arrow-rs/issues/3653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Panic when accessing RecordBatch from pyarrow [\#3646](https://github.com/apache/arrow-rs/issues/3646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Multiplication for decimals is incorrect [\#3645](https://github.com/apache/arrow-rs/issues/3645) +- Inconsistent output between pretty print and CSV writer for Arrow [\#3513](https://github.com/apache/arrow-rs/issues/3513) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Release 33.0.0 of arrow/arrow-flight/parquet/parquet-derive \(next release after 32.0.0\) [\#3682](https://github.com/apache/arrow-rs/issues/3682) +- Release `32.0.0` of `arrow`/`arrow-flight`/`parquet`/`parquet-derive` \(next release after `31.0.0`\) [\#3584](https://github.com/apache/arrow-rs/issues/3584) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Merged pull requests:** + +- Move FFI to sub-crates [\#3687](https://github.com/apache/arrow-rs/pull/3687) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update to 33.0.0 and update changelog [\#3686](https://github.com/apache/arrow-rs/pull/3686) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) +- Cleanup FFI interface \(\#3684\) \(\#3683\) [\#3685](https://github.com/apache/arrow-rs/pull/3685) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- fix: take\_run benchmark parameter [\#3679](https://github.com/apache/arrow-rs/pull/3679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Minor: Add some examples to Date\*Array and Time\*Array [\#3678](https://github.com/apache/arrow-rs/pull/3678) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add CSV Decoder::capacity \(\#3674\) [\#3677](https://github.com/apache/arrow-rs/pull/3677) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add ArrayData::new\_null and DataType::primitive\_width [\#3676](https://github.com/apache/arrow-rs/pull/3676) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) +- Fix FFI which fails to account for offsets [\#3675](https://github.com/apache/arrow-rs/pull/3675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support UTF8 cast to Timestamp with timezone [\#3673](https://github.com/apache/arrow-rs/pull/3673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Fix Date64Array docs [\#3670](https://github.com/apache/arrow-rs/pull/3670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.50 to =1.0.51 [\#3669](https://github.com/apache/arrow-rs/pull/3669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add timezone accessor for Timestamp\*Array [\#3666](https://github.com/apache/arrow-rs/pull/3666) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster timezone cast [\#3665](https://github.com/apache/arrow-rs/pull/3665) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat + fix: IPC support for run encoded array. [\#3662](https://github.com/apache/arrow-rs/pull/3662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Implement std::fmt::Write for StringBuilder \(\#3638\) [\#3659](https://github.com/apache/arrow-rs/pull/3659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Include line and field number in CSV UTF-8 error \(\#3656\) [\#3657](https://github.com/apache/arrow-rs/pull/3657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Handle non-contiguous type\_ids in UnionArray \(\#3653\) [\#3654](https://github.com/apache/arrow-rs/pull/3654) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add modulus\_dyn and modulus\_scalar\_dyn [\#3649](https://github.com/apache/arrow-rs/pull/3649) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Improve error messge with detailed schema [\#3637](https://github.com/apache/arrow-rs/pull/3637) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Veeupup](https://github.com/Veeupup)) +- Add limit to ArrowReaderBuilder to push limit down to parquet reader [\#3633](https://github.com/apache/arrow-rs/pull/3633) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- chore: delete wrong comment and refactor set\_metadata in `Field` [\#3630](https://github.com/apache/arrow-rs/pull/3630) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([chunshao90](https://github.com/chunshao90)) +- Fix typo in comment [\#3627](https://github.com/apache/arrow-rs/pull/3627) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kjschiroo](https://github.com/kjschiroo)) +- Minor: Update doc strings about Page Index / Column Index [\#3625](https://github.com/apache/arrow-rs/pull/3625) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Specified version of helper function to cast binary to string [\#3624](https://github.com/apache/arrow-rs/pull/3624) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- feat: take kernel for RunArray [\#3622](https://github.com/apache/arrow-rs/pull/3622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Remove BitSliceIterator specialization from try\_for\_each\_valid\_idx [\#3621](https://github.com/apache/arrow-rs/pull/3621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Reduce PrimitiveArray::try\_unary codegen [\#3619](https://github.com/apache/arrow-rs/pull/3619) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Reduce Dictionary Builder Codegen [\#3616](https://github.com/apache/arrow-rs/pull/3616) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: Add test for dictionary encoding of batches [\#3608](https://github.com/apache/arrow-rs/pull/3608) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Casting generic binary to generic string [\#3607](https://github.com/apache/arrow-rs/pull/3607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add ArrayAccessor, Iterator, Extend and benchmarks for RunArray [\#3603](https://github.com/apache/arrow-rs/pull/3603) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) + ## [32.0.0](https://github.com/apache/arrow-rs/tree/32.0.0) (2023-01-27) [Full Changelog](https://github.com/apache/arrow-rs/compare/31.0.0...32.0.0) @@ -94,6 +167,7 @@ - No panic on timestamp buffer overflow [\#3519](https://github.com/apache/arrow-rs/pull/3519) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) - Support casting from binary to dictionary of binary [\#3482](https://github.com/apache/arrow-rs/pull/3482) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) - Add Raw JSON Reader \(~2.5x faster\) [\#3479](https://github.com/apache/arrow-rs/pull/3479) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + ## [31.0.0](https://github.com/apache/arrow-rs/tree/31.0.0) (2023-01-13) [Full Changelog](https://github.com/apache/arrow-rs/compare/30.0.1...31.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 66cc9104b0ee..0a25d8d8ff7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,78 +19,63 @@ # Changelog -## [33.0.0](https://github.com/apache/arrow-rs/tree/33.0.0) (2023-02-10) +## [34.0.0](https://github.com/apache/arrow-rs/tree/34.0.0) (2023-02-23) -[Full Changelog](https://github.com/apache/arrow-rs/compare/32.0.0...33.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/33.0.0...34.0.0) **Breaking changes:** -- Use ArrayFormatter in Cast Kernel [\#3668](https://github.com/apache/arrow-rs/pull/3668) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use dyn Array in cast kernels [\#3667](https://github.com/apache/arrow-rs/pull/3667) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Return references from FixedSizeListArray and MapArray [\#3652](https://github.com/apache/arrow-rs/pull/3652) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) -- Lazy array display \(\#3638\) [\#3647](https://github.com/apache/arrow-rs/pull/3647) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use array\_value\_to\_string in arrow-csv [\#3514](https://github.com/apache/arrow-rs/pull/3514) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JayjeetAtGithub](https://github.com/JayjeetAtGithub)) +- Infer 2020-03-19 00:00:00 as timestamp not Date64 in CSV \(\#3744\) [\#3746](https://github.com/apache/arrow-rs/pull/3746) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement fallible streams for `FlightClient::do_put` [\#3464](https://github.com/apache/arrow-rs/pull/3464) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) **Implemented enhancements:** -- Support UTF8 cast to Timestamp with timezone [\#3664](https://github.com/apache/arrow-rs/issues/3664) -- Add modulus\_dyn and modulus\_scalar\_dyn [\#3648](https://github.com/apache/arrow-rs/issues/3648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- A trait for append\_value and append\_null on ArrayBuilders [\#3644](https://github.com/apache/arrow-rs/issues/3644) -- Improve error messge "batches\[0\] schema is different with argument schema" [\#3628](https://github.com/apache/arrow-rs/issues/3628) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Specified version of helper function to cast binary to string [\#3623](https://github.com/apache/arrow-rs/issues/3623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Casting generic binary to generic string [\#3606](https://github.com/apache/arrow-rs/issues/3606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use `array_value_to_string` in `arrow-csv` [\#3483](https://github.com/apache/arrow-rs/issues/3483) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add datatime/interval/duration into comparison kernels [\#3729](https://github.com/apache/arrow-rs/issues/3729) +- ! \(not\) operator overload for SortOptions [\#3726](https://github.com/apache/arrow-rs/issues/3726) +- parquet: convert Bytes to ByteArray directly [\#3719](https://github.com/apache/arrow-rs/issues/3719) +- Implement simple RecordBatchReader [\#3704](https://github.com/apache/arrow-rs/issues/3704) +- Is possible to implement GenericListArray::from\_iter ? [\#3702](https://github.com/apache/arrow-rs/issues/3702) +- `take_run` improvements [\#3701](https://github.com/apache/arrow-rs/issues/3701) +- object\_store: support azure cli credential [\#3697](https://github.com/apache/arrow-rs/issues/3697) +- Support `as_mut_any` in Array trait [\#3655](https://github.com/apache/arrow-rs/issues/3655) +- `Array` --\> `Display` formatter that supports more options and is configurable [\#3638](https://github.com/apache/arrow-rs/issues/3638) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow-csv: support decimal256 [\#3474](https://github.com/apache/arrow-rs/issues/3474) +- Skip the wrong JSON line. 
[\#3392](https://github.com/apache/arrow-rs/issues/3392) **Fixed bugs:** -- ArrowArray::try\_from\_raw Misleading Signature [\#3684](https://github.com/apache/arrow-rs/issues/3684) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- PyArrowConvert Leaks Memory [\#3683](https://github.com/apache/arrow-rs/issues/3683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Arrow-csv reader cannot produce RecordBatch even if the bytes are necessary [\#3674](https://github.com/apache/arrow-rs/issues/3674) -- FFI Fails to Account For Offsets [\#3671](https://github.com/apache/arrow-rs/issues/3671) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Regression in CSV reader error handling [\#3656](https://github.com/apache/arrow-rs/issues/3656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- UnionArray Child and Value Fail to Account for non-contiguous Type IDs [\#3653](https://github.com/apache/arrow-rs/issues/3653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Panic when accessing RecordBatch from pyarrow [\#3646](https://github.com/apache/arrow-rs/issues/3646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Multiplication for decimals is incorrect [\#3645](https://github.com/apache/arrow-rs/issues/3645) -- Inconsistent output between pretty print and CSV writer for Arrow [\#3513](https://github.com/apache/arrow-rs/issues/3513) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- CSV reader infers Date64 type for fields like "2020-03-19 00:00:00" that it can't parse to Date64 [\#3744](https://github.com/apache/arrow-rs/issues/3744) +- object\_store: bearer token is azure is used like access key [\#3696](https://github.com/apache/arrow-rs/issues/3696) **Closed issues:** -- Release 33.0.0 of arrow/arrow-flight/parquet/parquet-derive \(next release after 32.0.0\) [\#3682](https://github.com/apache/arrow-rs/issues/3682) -- Release `32.0.0` of `arrow`/`arrow-flight`/`parquet`/`parquet-derive` \(next release after `31.0.0`\) [\#3584](https://github.com/apache/arrow-rs/issues/3584) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Should we write a "arrow-rs" update blog post? 
[\#3565](https://github.com/apache/arrow-rs/issues/3565) **Merged pull requests:** -- Move FFI to sub-crates [\#3687](https://github.com/apache/arrow-rs/pull/3687) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update to 33.0.0 and update changelog [\#3686](https://github.com/apache/arrow-rs/pull/3686) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) -- Cleanup FFI interface \(\#3684\) \(\#3683\) [\#3685](https://github.com/apache/arrow-rs/pull/3685) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- fix: take\_run benchmark parameter [\#3679](https://github.com/apache/arrow-rs/pull/3679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Minor: Add some examples to Date\*Array and Time\*Array [\#3678](https://github.com/apache/arrow-rs/pull/3678) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Add CSV Decoder::capacity \(\#3674\) [\#3677](https://github.com/apache/arrow-rs/pull/3677) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add ArrayData::new\_null and DataType::primitive\_width [\#3676](https://github.com/apache/arrow-rs/pull/3676) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix FFI which fails to account for offsets [\#3675](https://github.com/apache/arrow-rs/pull/3675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Support UTF8 cast to Timestamp with timezone [\#3673](https://github.com/apache/arrow-rs/pull/3673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) -- Fix Date64Array docs [\#3670](https://github.com/apache/arrow-rs/pull/3670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update proc-macro2 requirement from =1.0.50 to =1.0.51 [\#3669](https://github.com/apache/arrow-rs/pull/3669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add timezone accessor for Timestamp\*Array [\#3666](https://github.com/apache/arrow-rs/pull/3666) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Faster timezone cast [\#3665](https://github.com/apache/arrow-rs/pull/3665) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat + fix: IPC support for run encoded array. 
[\#3662](https://github.com/apache/arrow-rs/pull/3662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Implement std::fmt::Write for StringBuilder \(\#3638\) [\#3659](https://github.com/apache/arrow-rs/pull/3659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Include line and field number in CSV UTF-8 error \(\#3656\) [\#3657](https://github.com/apache/arrow-rs/pull/3657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Handle non-contiguous type\_ids in UnionArray \(\#3653\) [\#3654](https://github.com/apache/arrow-rs/pull/3654) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add modulus\_dyn and modulus\_scalar\_dyn [\#3649](https://github.com/apache/arrow-rs/pull/3649) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Improve error messge with detailed schema [\#3637](https://github.com/apache/arrow-rs/pull/3637) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Veeupup](https://github.com/Veeupup)) -- Add limit to ArrowReaderBuilder to push limit down to parquet reader [\#3633](https://github.com/apache/arrow-rs/pull/3633) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) -- chore: delete wrong comment and refactor set\_metadata in `Field` [\#3630](https://github.com/apache/arrow-rs/pull/3630) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([chunshao90](https://github.com/chunshao90)) -- Fix typo in comment [\#3627](https://github.com/apache/arrow-rs/pull/3627) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kjschiroo](https://github.com/kjschiroo)) -- Minor: Update doc strings about Page Index / Column Index [\#3625](https://github.com/apache/arrow-rs/pull/3625) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Specified version of helper function to cast binary to string [\#3624](https://github.com/apache/arrow-rs/pull/3624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- feat: take kernel for RunArray [\#3622](https://github.com/apache/arrow-rs/pull/3622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Remove BitSliceIterator specialization from try\_for\_each\_valid\_idx [\#3621](https://github.com/apache/arrow-rs/pull/3621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Reduce PrimitiveArray::try\_unary codegen [\#3619](https://github.com/apache/arrow-rs/pull/3619) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Reduce Dictionary Builder Codegen [\#3616](https://github.com/apache/arrow-rs/pull/3616) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Minor: Add test for dictionary encoding of batches [\#3608](https://github.com/apache/arrow-rs/pull/3608) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Casting generic binary to generic string [\#3607](https://github.com/apache/arrow-rs/pull/3607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add ArrayAccessor, Iterator, Extend and 
benchmarks for RunArray [\#3603](https://github.com/apache/arrow-rs/pull/3603) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Update prost-build requirement from =0.11.6 to =0.11.7 [\#3753](https://github.com/apache/arrow-rs/pull/3753) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Use Typed Buffers in Arrays \(\#1811\) \(\#1176\) [\#3743](https://github.com/apache/arrow-rs/pull/3743) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup arithmetic kernel type constraints [\#3739](https://github.com/apache/arrow-rs/pull/3739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Make dictionary kernels optional for comparison benchmark [\#3738](https://github.com/apache/arrow-rs/pull/3738) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support String Coercion in Raw JSON Reader [\#3736](https://github.com/apache/arrow-rs/pull/3736) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rguerreiromsft](https://github.com/rguerreiromsft)) +- replace for loop by try\_for\_each [\#3734](https://github.com/apache/arrow-rs/pull/3734) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([suxiaogang223](https://github.com/suxiaogang223)) +- feat: implement generic record batch reader [\#3733](https://github.com/apache/arrow-rs/pull/3733) ([wjones127](https://github.com/wjones127)) +- \[minor\] fix doc test fail [\#3732](https://github.com/apache/arrow-rs/pull/3732) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Add datetime/interval/duration into dyn scalar comparison [\#3730](https://github.com/apache/arrow-rs/pull/3730) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Using Borrow\ on infer\_json\_schema\_from\_iterator [\#3728](https://github.com/apache/arrow-rs/pull/3728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rguerreiromsft](https://github.com/rguerreiromsft)) +- Not operator overload for SortOptions [\#3727](https://github.com/apache/arrow-rs/pull/3727) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([berkaysynnada](https://github.com/berkaysynnada)) +- fix: encoding batch with no columns [\#3724](https://github.com/apache/arrow-rs/pull/3724) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([wangrunji0408](https://github.com/wangrunji0408)) +- feat: impl `Ord`/`PartialOrd` for `SortOptions` [\#3723](https://github.com/apache/arrow-rs/pull/3723) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Add From\ for ByteArray [\#3720](https://github.com/apache/arrow-rs/pull/3720) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Deprecate old JSON reader \(\#3610\) [\#3718](https://github.com/apache/arrow-rs/pull/3718) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add pretty format with options [\#3717](https://github.com/apache/arrow-rs/pull/3717) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove unreachable decimal take [\#3716](https://github.com/apache/arrow-rs/pull/3716) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Feat: arrow csv decimal256 [\#3711](https://github.com/apache/arrow-rs/pull/3711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([suxiaogang223](https://github.com/suxiaogang223)) +- perf: `take_run` improvements [\#3705](https://github.com/apache/arrow-rs/pull/3705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Add raw MapArrayReader [\#3703](https://github.com/apache/arrow-rs/pull/3703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: Sort kernel for `RunArray` [\#3695](https://github.com/apache/arrow-rs/pull/3695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- perf: Remove sorting to yield sorted\_rank [\#3693](https://github.com/apache/arrow-rs/pull/3693) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- fix: Handle sliced array in run array iterator [\#3681](https://github.com/apache/arrow-rs/pull/3681) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index 977590308e42..6b3d82c9c906 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-arith" -version = "33.0.0" +version = "34.0.0" description = "Arrow arithmetic kernels" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "33.0.0", path = "../arrow-array" } -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } -arrow-data = { version = "33.0.0", path = "../arrow-data" } -arrow-schema = { version = "33.0.0", path = "../arrow-schema" } +arrow-array = { version = "34.0.0", path = "../arrow-array" } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } +arrow-data = { version = "34.0.0", path = "../arrow-data" } +arrow-schema = { version = "34.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index bc47672e2594..5f839426edba 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-array" -version = "33.0.0" +version = "34.0.0" description = "Array abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,9 +45,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "33.0.0", path = "../arrow-schema" } -arrow-data = { version = "33.0.0", path = "../arrow-data" } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "34.0.0", path = "../arrow-schema" } +arrow-data = { version = "34.0.0", path = 
"../arrow-data" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index e84b11a2b596..63e5aaa4476d 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "33.0.0" +version = "34.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index bb2d725b34f4..688e0001f973 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-cast" -version = "33.0.0" +version = "34.0.0" description = "Cast kernel and utilities for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "33.0.0", path = "../arrow-array" } -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } -arrow-data = { version = "33.0.0", path = "../arrow-data" } -arrow-schema = { version = "33.0.0", path = "../arrow-schema" } -arrow-select = { version = "33.0.0", path = "../arrow-select" } +arrow-array = { version = "34.0.0", path = "../arrow-array" } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } +arrow-data = { version = "34.0.0", path = "../arrow-data" } +arrow-schema = { version = "34.0.0", path = "../arrow-schema" } +arrow-select = { version = "34.0.0", path = "../arrow-select" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 9d1582b91c2f..62ca69bcaf9b 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-csv" -version = "33.0.0" +version = "34.0.0" description = "Support for parsing CSV format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "33.0.0", path = "../arrow-array" } -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "33.0.0", path = "../arrow-cast" } -arrow-data = { version = "33.0.0", path = "../arrow-data" } -arrow-schema = { version = "33.0.0", path = "../arrow-schema" } +arrow-array = { version = "34.0.0", path = "../arrow-array" } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "34.0.0", path = "../arrow-cast" } +arrow-data = { version = "34.0.0", path = "../arrow-data" } +arrow-schema = { version = "34.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } csv-core = { version = "0.1"} diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index a1938af4b194..33de17339131 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-data" -version = "33.0.0" +version = "34.0.0" description = "Array 
data abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -50,8 +50,8 @@ features = ["ffi"] [dependencies] -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "33.0.0", path = "../arrow-schema" } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "34.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 1ed98c919d8e..0c820ed73ac9 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "33.0.0" +version = "34.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,12 +27,12 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow-array = { version = "33.0.0", path = "../arrow-array" } -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } +arrow-array = { version = "34.0.0", path = "../arrow-array" } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } # Cast is needed to work around https://github.com/apache/arrow-rs/issues/3389 -arrow-cast = { version = "33.0.0", path = "../arrow-cast" } -arrow-ipc = { version = "33.0.0", path = "../arrow-ipc" } -arrow-schema = { version = "33.0.0", path = "../arrow-schema" } +arrow-cast = { version = "34.0.0", path = "../arrow-cast" } +arrow-ipc = { version = "34.0.0", path = "../arrow-ipc" } +arrow-schema = { version = "34.0.0", path = "../arrow-schema" } base64 = { version = "0.21", default-features = false, features = ["std"] } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } @@ -50,7 +50,7 @@ flight-sql-experimental = [] tls = ["tonic/tls"] [dev-dependencies] -arrow = { version = "33.0.0", path = "../arrow", features = ["prettyprint"] } +arrow = { version = "34.0.0", path = "../arrow", features = ["prettyprint"] } tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } tower = "0.4.13" diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 7992d93292ce..1f8026887485 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "33.0.0" +arrow-flight = "34.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. 
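To illustrate the README snippet above, a minimal client sketch against this crate's generated gRPC bindings might look as follows. This is illustrative only and not taken from the crate's examples; it assumes a Flight server already listening on `localhost:50051` and `tokio` (with the `rt` and `macros` features) as the async runtime:

```rust
use arrow_flight::flight_service_client::FlightServiceClient;
use arrow_flight::Criteria;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Connect to a hypothetical Flight endpoint and list the flights it exposes
    let mut client = FlightServiceClient::connect("http://localhost:50051").await?;
    let mut flights = client.list_flights(Criteria::default()).await?.into_inner();
    while let Some(info) = flights.message().await? {
        println!("{info:?}");
    }
    Ok(())
}
```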
diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index f9ca4297e6e7..2d92e6292ded 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-integration-test" -version = "33.0.0" +version = "34.0.0" description = "Support for the Apache Arrow JSON test data format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,8 +38,8 @@ path = "src/lib.rs" bench = false [dependencies] -arrow = { version = "33.0.0", path = "../arrow", default-features = false } -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } +arrow = { version = "34.0.0", path = "../arrow", default-features = false } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index e22a15f52ddc..67d5b7d2745a 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests (NOT PUBLISHED TO crates.io)" -version = "33.0.0" +version = "34.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index 6661f35c0635..040d1c113a5c 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ipc" -version = "33.0.0" +version = "34.0.0" description = "Support for the Arrow IPC format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "33.0.0", path = "../arrow-array" } -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "33.0.0", path = "../arrow-cast" } -arrow-data = { version = "33.0.0", path = "../arrow-data" } -arrow-schema = { version = "33.0.0", path = "../arrow-schema" } +arrow-array = { version = "34.0.0", path = "../arrow-array" } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "34.0.0", path = "../arrow-cast" } +arrow-data = { version = "34.0.0", path = "../arrow-data" } +arrow-schema = { version = "34.0.0", path = "../arrow-schema" } flatbuffers = { version = "23.1.21", default-features = false } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.12.0", default-features = false, optional = true } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index ab77c1843ec0..3869bfd90b19 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-json" -version = "33.0.0" +version = "34.0.0" description = "Support for parsing JSON format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "33.0.0", path = "../arrow-array" } -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "33.0.0", path = "../arrow-cast" } 
-arrow-data = { version = "33.0.0", path = "../arrow-data" } -arrow-schema = { version = "33.0.0", path = "../arrow-schema" } +arrow-array = { version = "34.0.0", path = "../arrow-array" } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "34.0.0", path = "../arrow-cast" } +arrow-data = { version = "34.0.0", path = "../arrow-data" } +arrow-schema = { version = "34.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index 682d68dac857..7e7ec7d4fedd 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ord" -version = "33.0.0" +version = "34.0.0" description = "Ordering kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "33.0.0", path = "../arrow-array" } -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } -arrow-data = { version = "33.0.0", path = "../arrow-data" } -arrow-schema = { version = "33.0.0", path = "../arrow-schema" } -arrow-select = { version = "33.0.0", path = "../arrow-select" } +arrow-array = { version = "34.0.0", path = "../arrow-array" } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } +arrow-data = { version = "34.0.0", path = "../arrow-data" } +arrow-schema = { version = "34.0.0", path = "../arrow-schema" } +arrow-select = { version = "34.0.0", path = "../arrow-select" } num = { version = "0.4", default-features = false, features = ["std"] } [dev-dependencies] diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 3ab256e541b3..cbf2e9cf29d9 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "33.0.0" +version = "34.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,5 +32,5 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "33.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "34.0.0", features = ["pyarrow"] } pyo3 = { version = "0.18", features = ["extension-module"] } diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 94210a27a14b..3ddc195c39a0 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-row" -version = "33.0.0" +version = "34.0.0" description = "Arrow row format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,17 +44,17 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "33.0.0", path = "../arrow-array" } -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } -arrow-data = { version = "33.0.0", path = "../arrow-data" } -arrow-schema = { version = "33.0.0", path = "../arrow-schema" } +arrow-array = { version = "34.0.0", path = "../arrow-array" } +arrow-buffer = { version = 
"34.0.0", path = "../arrow-buffer" } +arrow-data = { version = "34.0.0", path = "../arrow-data" } +arrow-schema = { version = "34.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } hashbrown = { version = "0.13", default-features = false } [dev-dependencies] -arrow-cast = { version = "33.0.0", path = "../arrow-cast" } -arrow-ord = { version = "33.0.0", path = "../arrow-ord" } +arrow-cast = { version = "34.0.0", path = "../arrow-cast" } +arrow-ord = { version = "34.0.0", path = "../arrow-ord" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } [features] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index e4e7d0082eb8..acf6c43b8342 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-schema" -version = "33.0.0" +version = "34.0.0" description = "Defines the logical types for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml index 789a23359a16..540d37cb5aa8 100644 --- a/arrow-select/Cargo.toml +++ b/arrow-select/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-select" -version = "33.0.0" +version = "34.0.0" description = "Selection kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } -arrow-data = { version = "33.0.0", path = "../arrow-data" } -arrow-schema = { version = "33.0.0", path = "../arrow-schema" } -arrow-array = { version = "33.0.0", path = "../arrow-array" } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } +arrow-data = { version = "34.0.0", path = "../arrow-data" } +arrow-schema = { version = "34.0.0", path = "../arrow-schema" } +arrow-array = { version = "34.0.0", path = "../arrow-array" } num = { version = "0.4", default-features = false, features = ["std"] } [features] diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index 796024e873ef..2e8067051644 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-string" -version = "33.0.0" +version = "34.0.0" description = "String kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } -arrow-data = { version = "33.0.0", path = "../arrow-data" } -arrow-schema = { version = "33.0.0", path = "../arrow-schema" } -arrow-array = { version = "33.0.0", path = "../arrow-array" } -arrow-select = { version = "33.0.0", path = "../arrow-select" } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } +arrow-data = { version = "34.0.0", path = "../arrow-data" } +arrow-schema = { version = "34.0.0", path = "../arrow-schema" } +arrow-array = { version = "34.0.0", path = "../arrow-array" } +arrow-select = { version = "34.0.0", path = "../arrow-select" } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 2032d5048977..08fc5513d64f 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] 
name = "arrow" -version = "33.0.0" +version = "34.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,19 +45,19 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-arith = { version = "33.0.0", path = "../arrow-arith" } -arrow-array = { version = "33.0.0", path = "../arrow-array" } -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "33.0.0", path = "../arrow-cast" } -arrow-csv = { version = "33.0.0", path = "../arrow-csv", optional = true } -arrow-data = { version = "33.0.0", path = "../arrow-data" } -arrow-ipc = { version = "33.0.0", path = "../arrow-ipc", optional = true } -arrow-json = { version = "33.0.0", path = "../arrow-json", optional = true } -arrow-ord = { version = "33.0.0", path = "../arrow-ord" } -arrow-row = { version = "33.0.0", path = "../arrow-row" } -arrow-schema = { version = "33.0.0", path = "../arrow-schema" } -arrow-select = { version = "33.0.0", path = "../arrow-select" } -arrow-string = { version = "33.0.0", path = "../arrow-string" } +arrow-arith = { version = "34.0.0", path = "../arrow-arith" } +arrow-array = { version = "34.0.0", path = "../arrow-array" } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "34.0.0", path = "../arrow-cast" } +arrow-csv = { version = "34.0.0", path = "../arrow-csv", optional = true } +arrow-data = { version = "34.0.0", path = "../arrow-data" } +arrow-ipc = { version = "34.0.0", path = "../arrow-ipc", optional = true } +arrow-json = { version = "34.0.0", path = "../arrow-json", optional = true } +arrow-ord = { version = "34.0.0", path = "../arrow-ord" } +arrow-row = { version = "34.0.0", path = "../arrow-row" } +arrow-schema = { version = "34.0.0", path = "../arrow-schema" } +arrow-select = { version = "34.0.0", path = "../arrow-select" } +arrow-string = { version = "34.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } comfy-table = { version = "6.0", optional = true, default-features = false } diff --git a/arrow/README.md b/arrow/README.md index 0714285011fa..6d0772e2d956 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `33.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `34.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. 
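As an illustrative reading of that policy, a downstream `Cargo.toml` could require the crate as shown below; Cargo treats a bare `"34"` as `^34`, so compatible `34.x` releases are picked up while a breaking `35.0.0` is not:

```toml
[dependencies]
# "34" is interpreted as "^34", i.e. >= 34.0.0 and < 35.0.0
arrow = "34"
```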
## Feature Flags diff --git a/dev/release/README.md b/dev/release/README.md index b8018bfaf7b4..70921dd024da 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. -sed -i '' -e 's/14.0.0/33.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/34.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 7b773fd05c61..920498905ccd 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="32.0.0" -FUTURE_RELEASE="33.0.0" +SINCE_TAG="33.0.0" +FUTURE_RELEASE="34.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index d59f481f362f..87f552fbd36a 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "33.0.0" +version = "34.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -35,14 +35,14 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "33.0.0", path = "../arrow-array", default-features = false, optional = true } -arrow-buffer = { version = "33.0.0", path = "../arrow-buffer", default-features = false, optional = true } -arrow-cast = { version = "33.0.0", path = "../arrow-cast", default-features = false, optional = true } -arrow-csv = { version = "33.0.0", path = "../arrow-csv", default-features = false, optional = true } -arrow-data = { version = "33.0.0", path = "../arrow-data", default-features = false, optional = true } -arrow-schema = { version = "33.0.0", path = "../arrow-schema", default-features = false, optional = true } -arrow-select = { version = "33.0.0", path = "../arrow-select", default-features = false, optional = true } -arrow-ipc = { version = "33.0.0", path = "../arrow-ipc", default-features = false, optional = true } +arrow-array = { version = "34.0.0", path = "../arrow-array", default-features = false, optional = true } +arrow-buffer = { version = "34.0.0", path = "../arrow-buffer", default-features = false, optional = true } +arrow-cast = { version = "34.0.0", path = "../arrow-cast", default-features = false, optional = true } +arrow-csv = { version = "34.0.0", path = "../arrow-csv", default-features = false, optional = true } +arrow-data = { version = "34.0.0", path = "../arrow-data", default-features = false, optional = true } +arrow-schema = { version = "34.0.0", path = "../arrow-schema", default-features = false, optional = true } +arrow-select = { version = "34.0.0", path = "../arrow-select", default-features = false, optional = true } +arrow-ipc = { version = "34.0.0", path = "../arrow-ipc", default-features = false, optional = true } object_store = { version = "0.5", path = "../object_store", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } @@ -76,7 +76,7 @@ flate2 = { version = "1.0", default-features = false, features = 
["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.12", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "33.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "34.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index f648aafbf2fb..cb16846b0fb1 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "33.0.0" +version = "34.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", features = ["extra-traits"] } -parquet = { path = "../parquet", version = "33.0.0", default-features = false } +parquet = { path = "../parquet", version = "34.0.0", default-features = false } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index c8ee7ea81101..f3f66c45bc98 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "33.0.0" -parquet_derive = "33.0.0" +parquet = "34.0.0" +parquet_derive = "34.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index df8fa3aef65a..33f7675a30ef 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "33.0.0" +version = "34.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "33.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "33.0.0", default-features = false } +parquet = { path = "../parquet", version = "34.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "34.0.0", default-features = false } chrono = { version="0.4.23", default-features = false, features = [ "clock" ] } From bbc1469077e13ba2e5a61f130917ad7eccfcb569 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 24 Feb 2023 10:20:24 +0000 Subject: [PATCH 0631/1411] Final tweaks for arrow 34 (#3758) --- CHANGELOG.md | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a25d8d8ff7e..10a969dca15d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ # Changelog -## [34.0.0](https://github.com/apache/arrow-rs/tree/34.0.0) (2023-02-23) +## [34.0.0](https://github.com/apache/arrow-rs/tree/34.0.0) (2023-02-24) [Full Changelog](https://github.com/apache/arrow-rs/compare/33.0.0...34.0.0) @@ -30,30 +30,27 @@ **Implemented enhancements:** -- Add datatime/interval/duration into comparison kernels 
[\#3729](https://github.com/apache/arrow-rs/issues/3729) -- ! \(not\) operator overload for SortOptions [\#3726](https://github.com/apache/arrow-rs/issues/3726) -- parquet: convert Bytes to ByteArray directly [\#3719](https://github.com/apache/arrow-rs/issues/3719) +- Support casting string to timestamp with microsecond resolution [\#3751](https://github.com/apache/arrow-rs/issues/3751) +- Add datatime/interval/duration into comparison kernels [\#3729](https://github.com/apache/arrow-rs/issues/3729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- ! \(not\) operator overload for SortOptions [\#3726](https://github.com/apache/arrow-rs/issues/3726) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet: convert Bytes to ByteArray directly [\#3719](https://github.com/apache/arrow-rs/issues/3719) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - Implement simple RecordBatchReader [\#3704](https://github.com/apache/arrow-rs/issues/3704) - Is possible to implement GenericListArray::from\_iter ? [\#3702](https://github.com/apache/arrow-rs/issues/3702) -- `take_run` improvements [\#3701](https://github.com/apache/arrow-rs/issues/3701) -- object\_store: support azure cli credential [\#3697](https://github.com/apache/arrow-rs/issues/3697) +- `take_run` improvements [\#3701](https://github.com/apache/arrow-rs/issues/3701) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Support `as_mut_any` in Array trait [\#3655](https://github.com/apache/arrow-rs/issues/3655) -- `Array` --\> `Display` formatter that supports more options and is configurable [\#3638](https://github.com/apache/arrow-rs/issues/3638) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- arrow-csv: support decimal256 [\#3474](https://github.com/apache/arrow-rs/issues/3474) -- Skip the wrong JSON line. [\#3392](https://github.com/apache/arrow-rs/issues/3392) +- `Array` --\> `Display` formatter that supports more options and is configurable [\#3638](https://github.com/apache/arrow-rs/issues/3638) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow-csv: support decimal256 [\#3474](https://github.com/apache/arrow-rs/issues/3474) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- CSV reader infers Date64 type for fields like "2020-03-19 00:00:00" that it can't parse to Date64 [\#3744](https://github.com/apache/arrow-rs/issues/3744) -- object\_store: bearer token is azure is used like access key [\#3696](https://github.com/apache/arrow-rs/issues/3696) - -**Closed issues:** - -- Should we write a "arrow-rs" update blog post? 
[\#3565](https://github.com/apache/arrow-rs/issues/3565) +- CSV reader infers Date64 type for fields like "2020-03-19 00:00:00" that it can't parse to Date64 [\#3744](https://github.com/apache/arrow-rs/issues/3744) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** +- Update to 34.0.0 and update changelog [\#3757](https://github.com/apache/arrow-rs/pull/3757) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) +- Update MIRI for split crates \(\#2594\) [\#3754](https://github.com/apache/arrow-rs/pull/3754) ([tustvold](https://github.com/tustvold)) - Update prost-build requirement from =0.11.6 to =0.11.7 [\#3753](https://github.com/apache/arrow-rs/pull/3753) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Enable casting of string to timestamp with microsecond resolution [\#3752](https://github.com/apache/arrow-rs/pull/3752) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) - Use Typed Buffers in Arrays \(\#1811\) \(\#1176\) [\#3743](https://github.com/apache/arrow-rs/pull/3743) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Cleanup arithmetic kernel type constraints [\#3739](https://github.com/apache/arrow-rs/pull/3739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Make dictionary kernels optional for comparison benchmark [\#3738](https://github.com/apache/arrow-rs/pull/3738) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) From d016403001e418098b48051f1abbff09acbf4c8d Mon Sep 17 00:00:00 2001 From: Spencer Bartholomew <38776747+spencerbart@users.noreply.github.com> Date: Sat, 25 Feb 2023 02:01:49 -0700 Subject: [PATCH 0632/1411] update object_store deps to patch potential security vulnerabilities (#3761) --- object_store/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 686a661675fb..8ab0c15cb950 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -37,7 +37,7 @@ itertools = "0.10.1" parking_lot = { version = "0.12" } percent-encoding = "2.1" snafu = "0.7" -tokio = { version = "1.18", features = ["sync", "macros", "rt", "time", "io-util"] } +tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } tracing = { version = "0.1" } url = "2.2" walkdir = "2" @@ -72,4 +72,4 @@ dotenv = "0.15.0" tempfile = "3.1.0" futures-test = "0.3" rand = "0.8" -hyper = { version = "0.14", features = ["server"] } +hyper = { version = "0.14.24", features = ["server"] } From 96791ea47b032ce2ebcb07087d3b3007ea0df536 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Sat, 25 Feb 2023 22:58:25 +0100 Subject: [PATCH 0633/1411] feat: add into_inner for csv writer (#3759) --- arrow-csv/src/writer.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index d9331053f3d8..4ec0e1bec517 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -185,6 +185,12 @@ impl Writer { Ok(()) } + + /// Unwraps this `Writer`, returning the underlying writer. 
+ pub fn into_inner(self) -> W { + // Safe to call `unwrap` since `write` always flushes the writer. + self.writer.into_inner().unwrap() + } } /// A CSV writer builder From dae7a71cc2980d6778ae3226aa3adb240a41da3c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 27 Feb 2023 14:03:18 +0000 Subject: [PATCH 0634/1411] ArrayData Enumeration for Primitive, Binary and UTF8 (#3749) * Add BooleanBuffer * Add NullBuffer * Add PrimitiveArrayData * Add BytesArrayData * Move module * Make private for now * Move NullBuffer to arrow-buffer * Format * More docs * Seal traits * Doc * Review feedback --- arrow-buffer/src/buffer/boolean.rs | 84 +++++++ arrow-buffer/src/buffer/mod.rs | 7 +- arrow-buffer/src/buffer/null.rs | 90 ++++++++ arrow-data/src/data/bytes.rs | 288 ++++++++++++++++++++++++ arrow-data/src/{data.rs => data/mod.rs} | 7 + arrow-data/src/data/primitive.rs | 185 +++++++++++++++ arrow-data/src/data/types.rs | 153 +++++++++++++ 7 files changed, 812 insertions(+), 2 deletions(-) create mode 100644 arrow-buffer/src/buffer/boolean.rs create mode 100644 arrow-buffer/src/buffer/null.rs create mode 100644 arrow-data/src/data/bytes.rs rename arrow-data/src/{data.rs => data/mod.rs} (99%) create mode 100644 arrow-data/src/data/primitive.rs create mode 100644 arrow-data/src/data/types.rs diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs new file mode 100644 index 000000000000..82755a2b0a27 --- /dev/null +++ b/arrow-buffer/src/buffer/boolean.rs @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::{bit_util, Buffer}; + +/// A slice-able [`Buffer`] containing bit-packed booleans +#[derive(Debug, Clone)] +pub struct BooleanBuffer { + buffer: Buffer, + offset: usize, + len: usize, +} + +impl BooleanBuffer { + /// Create a new [`BooleanBuffer`] from a [`Buffer`], an `offset` and `length` in bits + /// + /// # Panics + /// + /// This method will panic if `buffer` is not large enough + pub fn new(buffer: Buffer, offset: usize, len: usize) -> Self { + let total_len = offset.saturating_add(len); + let bit_len = buffer.len().saturating_mul(8); + assert!(total_len <= bit_len); + Self { + buffer, + offset, + len, + } + } + + /// Returns the number of set bits in this buffer + pub fn count_set_bits(&self) -> usize { + self.buffer.count_set_bits_offset(self.offset, self.len) + } + + /// Returns `true` if the bit at index `i` is set + /// + /// # Panics + /// + /// Panics if `i >= self.len()` + #[inline] + pub fn is_set(&self, i: usize) -> bool { + assert!(i < self.len); + unsafe { bit_util::get_bit_raw(self.buffer.as_ptr(), i + self.offset) } + } + + /// Returns the offset of this [`BooleanBuffer`] in bits + #[inline] + pub fn offset(&self) -> usize { + self.offset + } + + /// Returns the length of this [`BooleanBuffer`] in bits + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Returns true if this [`BooleanBuffer`] is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the packed values of this [`BooleanBuffer`] not including any offset + #[inline] + pub fn values(&self) -> &[u8] { + &self.buffer + } +} diff --git a/arrow-buffer/src/buffer/mod.rs b/arrow-buffer/src/buffer/mod.rs index 7c12e1804f9f..f7e41260d80e 100644 --- a/arrow-buffer/src/buffer/mod.rs +++ b/arrow-buffer/src/buffer/mod.rs @@ -25,7 +25,10 @@ pub use immutable::*; mod mutable; pub use mutable::*; mod ops; +pub use ops::*; mod scalar; pub use scalar::*; - -pub use ops::*; +mod boolean; +pub use boolean::*; +mod null; +pub use null::*; diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs new file mode 100644 index 000000000000..2d52c9096dce --- /dev/null +++ b/arrow-buffer/src/buffer/null.rs @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
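A brief usage sketch for the `BooleanBuffer` introduced above (illustrative, not part of the patch; it assumes the existing `From<Vec<u8>>` conversion on `Buffer`):

```rust
use arrow_buffer::buffer::BooleanBuffer;
use arrow_buffer::Buffer;

// Eight bit-packed booleans: 0b0101_0011 sets bits 0, 1, 4 and 6
let bits = Buffer::from(vec![0b0101_0011u8]);
let bools = BooleanBuffer::new(bits, 0, 8);
assert_eq!(bools.len(), 8);
assert_eq!(bools.count_set_bits(), 4);
assert!(bools.is_set(0));
assert!(!bools.is_set(2));
```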
+ +use crate::buffer::BooleanBuffer; + +#[derive(Debug, Clone)] +pub struct NullBuffer { + buffer: BooleanBuffer, + null_count: usize, +} + +impl NullBuffer { + /// Create a new [`NullBuffer`] computing the null count + pub fn new(buffer: BooleanBuffer) -> Self { + let null_count = buffer.len() - buffer.count_set_bits(); + Self { buffer, null_count } + } + + /// Create a new [`NullBuffer`] with the provided `buffer` and `null_count` + /// + /// # Safety + /// + /// `buffer` must contain `null_count` `0` bits + pub unsafe fn new_unchecked(buffer: BooleanBuffer, null_count: usize) -> Self { + Self { buffer, null_count } + } + + /// Returns the length of this [`NullBuffer`] + #[inline] + pub fn len(&self) -> usize { + self.buffer.len() + } + + /// Returns true if this [`NullBuffer`] is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.buffer.is_empty() + } + + /// Returns the null count for this [`NullBuffer`] + #[inline] + pub fn null_count(&self) -> usize { + self.null_count + } + + /// Returns `true` if the value at `idx` is not null + #[inline] + pub fn is_valid(&self, idx: usize) -> bool { + self.buffer.is_set(idx) + } + + /// Returns `true` if the value at `idx` is null + #[inline] + pub fn is_null(&self, idx: usize) -> bool { + !self.is_valid(idx) + } + + /// Returns the inner buffer + #[inline] + pub fn inner(&self) -> &BooleanBuffer { + &self.buffer + } +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_size() { + // This tests that the niche optimisation eliminates the overhead of an option + assert_eq!( + std::mem::size_of::(), + std::mem::size_of::>() + ); + } +} diff --git a/arrow-data/src/data/bytes.rs b/arrow-data/src/data/bytes.rs new file mode 100644 index 000000000000..86839c67124d --- /dev/null +++ b/arrow-data/src/data/bytes.rs @@ -0,0 +1,288 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
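Likewise, an illustrative sketch of the `NullBuffer` introduced above, which computes its null count from the wrapped `BooleanBuffer` (again not part of the patch):

```rust
use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
use arrow_buffer::Buffer;

// Validity bitmap for four values where only index 2 is null: bits 0, 1 and 3 are set
let validity = BooleanBuffer::new(Buffer::from(vec![0b0000_1011u8]), 0, 4);
let nulls = NullBuffer::new(validity);
assert_eq!(nulls.len(), 4);
assert_eq!(nulls.null_count(), 1);
assert!(nulls.is_valid(0));
assert!(nulls.is_null(2));
```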
+ +use crate::data::types::{BytesType, OffsetType}; +use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; +use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_schema::DataType; +use std::marker::PhantomData; + +mod private { + use super::*; + + pub trait BytesSealed { + /// Create from bytes without performing any validation + /// + /// # Safety + /// + /// If `str`, `b` must be a valid UTF-8 sequence + unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self; + + /// Downcast [`ArrayDataBytes`] to `[ArrayDataBytesOffset`] + fn downcast_ref(data: &ArrayDataBytes) -> Option<&ArrayDataBytesOffset> + where + Self: Bytes; + + /// Downcast [`ArrayDataBytes`] to `[ArrayDataBytesOffset`] + fn downcast(data: ArrayDataBytes) -> Option> + where + Self: Bytes; + + /// Cast [`ArrayDataBytesOffset`] to [`ArrayDataBytes`] + fn upcast(v: ArrayDataBytesOffset) -> ArrayDataBytes + where + Self: Bytes; + } + + pub trait BytesOffsetSealed { + /// Downcast [`ArrayDataBytesOffset`] to `[BytesArrayData`] + fn downcast_ref( + data: &ArrayDataBytesOffset, + ) -> Option<&BytesArrayData> + where + Self: BytesOffset; + + /// Downcast [`ArrayDataBytesOffset`] to `[BytesArrayData`] + fn downcast( + data: ArrayDataBytesOffset, + ) -> Option> + where + Self: BytesOffset; + + /// Cast [`BytesArrayData`] to [`ArrayDataBytesOffset`] + fn upcast( + v: BytesArrayData, + ) -> ArrayDataBytesOffset + where + Self: BytesOffset; + } +} + +/// Types backed by a variable length slice of bytes +pub trait Bytes: private::BytesSealed { + const TYPE: BytesType; +} + +impl Bytes for [u8] { + const TYPE: BytesType = BytesType::Binary; +} + +impl private::BytesSealed for [u8] { + unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { + b + } + + fn downcast_ref(data: &ArrayDataBytes) -> Option<&ArrayDataBytesOffset> { + match data { + ArrayDataBytes::Binary(v) => Some(v), + ArrayDataBytes::Utf8(_) => None, + } + } + + fn downcast(data: ArrayDataBytes) -> Option> { + match data { + ArrayDataBytes::Binary(v) => Some(v), + ArrayDataBytes::Utf8(_) => None, + } + } + + fn upcast(v: ArrayDataBytesOffset) -> ArrayDataBytes { + ArrayDataBytes::Binary(v) + } +} + +impl Bytes for str { + const TYPE: BytesType = BytesType::Utf8; +} + +impl private::BytesSealed for str { + unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { + std::str::from_utf8_unchecked(b) + } + + fn downcast_ref(data: &ArrayDataBytes) -> Option<&ArrayDataBytesOffset> { + match data { + ArrayDataBytes::Binary(_) => None, + ArrayDataBytes::Utf8(v) => Some(v), + } + } + + fn downcast(data: ArrayDataBytes) -> Option> { + match data { + ArrayDataBytes::Binary(_) => None, + ArrayDataBytes::Utf8(v) => Some(v), + } + } + + fn upcast(v: ArrayDataBytesOffset) -> ArrayDataBytes { + ArrayDataBytes::Utf8(v) + } +} + +/// Types of offset used by variable length byte arrays +pub trait BytesOffset: private::BytesOffsetSealed + ArrowNativeType { + const TYPE: OffsetType; +} + +impl BytesOffset for i32 { + const TYPE: OffsetType = OffsetType::Int32; +} + +impl private::BytesOffsetSealed for i32 { + fn downcast_ref( + data: &ArrayDataBytesOffset, + ) -> Option<&BytesArrayData> { + match data { + ArrayDataBytesOffset::Small(v) => Some(v), + ArrayDataBytesOffset::Large(_) => None, + } + } + + fn downcast( + data: ArrayDataBytesOffset, + ) -> Option> { + match data { + ArrayDataBytesOffset::Small(v) => Some(v), + ArrayDataBytesOffset::Large(_) => None, + } + } + + fn upcast(v: BytesArrayData) -> ArrayDataBytesOffset { + ArrayDataBytesOffset::Small(v) + } +} + +impl BytesOffset for i64 { + const TYPE: 
OffsetType = OffsetType::Int64; +} + +impl private::BytesOffsetSealed for i64 { + fn downcast_ref( + data: &ArrayDataBytesOffset, + ) -> Option<&BytesArrayData> { + match data { + ArrayDataBytesOffset::Small(_) => None, + ArrayDataBytesOffset::Large(v) => Some(v), + } + } + + fn downcast( + data: ArrayDataBytesOffset, + ) -> Option> { + match data { + ArrayDataBytesOffset::Small(_) => None, + ArrayDataBytesOffset::Large(v) => Some(v), + } + } + + fn upcast(v: BytesArrayData) -> ArrayDataBytesOffset { + ArrayDataBytesOffset::Large(v) + } +} + +/// An enumeration of the types of [`ArrayDataBytesOffset`] +pub enum ArrayDataBytes { + Binary(ArrayDataBytesOffset<[u8]>), + Utf8(ArrayDataBytesOffset), +} + +impl ArrayDataBytes { + /// Downcast this [`ArrayDataBytes`] to the corresponding [`BytesArrayData`] + pub fn downcast_ref( + &self, + ) -> Option<&BytesArrayData> { + O::downcast_ref(B::downcast_ref(self)?) + } + + /// Downcast this [`ArrayDataBytes`] to the corresponding [`BytesArrayData`] + pub fn downcast( + self, + ) -> Option> { + O::downcast(B::downcast(self)?) + } +} + +/// An enumeration of the types of [`BytesArrayData`] +pub enum ArrayDataBytesOffset { + Small(BytesArrayData), + Large(BytesArrayData), +} + +impl From> for ArrayDataBytes { + fn from(value: BytesArrayData) -> Self { + B::upcast(O::upcast(value)) + } +} + +/// ArrayData for arrays of [`Bytes`] +pub struct BytesArrayData { + data_type: DataType, + nulls: Option, + offsets: ScalarBuffer, + values: Buffer, + phantom: PhantomData, +} + +impl BytesArrayData { + /// Creates a new [`BytesArrayData`] + /// + /// # Safety + /// + /// - Each consecutive window of `offsets` must identify a valid slice of `values` + /// - `nulls.len() == offsets.len() + 1` + /// - `data_type` must be valid for this layout + pub unsafe fn new_unchecked( + data_type: DataType, + offsets: ScalarBuffer, + values: Buffer, + nulls: Option, + ) -> Self { + Self { + data_type, + nulls, + offsets, + values, + phantom: Default::default(), + } + } + + /// Returns the raw byte data + #[inline] + pub fn values(&self) -> &B { + // Safety: + // Bytes must be valid + unsafe { B::from_bytes_unchecked(self.values.as_slice()) } + } + + /// Returns the offsets + #[inline] + pub fn value_offsets(&self) -> &[O] { + &self.offsets + } + + /// Returns the null buffer if any + #[inline] + pub fn null_buffer(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} diff --git a/arrow-data/src/data.rs b/arrow-data/src/data/mod.rs similarity index 99% rename from arrow-data/src/data.rs rename to arrow-data/src/data/mod.rs index 8742f8db9490..eb1fe2bcffa2 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data/mod.rs @@ -29,6 +29,13 @@ use std::sync::Arc; use crate::equal; +#[allow(unused)] // Private until ready (#1176) +mod bytes; +#[allow(unused)] // Private until ready (#1176) +mod primitive; +#[allow(unused)] // Private until ready (#1176) +mod types; + #[inline] pub(crate) fn contains_nulls( null_bit_buffer: Option<&Buffer>, diff --git a/arrow-data/src/data/primitive.rs b/arrow-data/src/data/primitive.rs new file mode 100644 index 000000000000..d34ef42dbbb7 --- /dev/null +++ b/arrow-data/src/data/primitive.rs @@ -0,0 +1,185 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::data::types::{PhysicalType, PrimitiveType}; +use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; +use arrow_buffer::{i256, ArrowNativeType}; +use arrow_schema::DataType; +use half::f16; + +mod private { + use super::*; + + pub trait PrimitiveSealed { + /// Downcast [`ArrayDataPrimitive`] to `[PrimitiveArrayData`] + fn downcast_ref(data: &ArrayDataPrimitive) -> Option<&PrimitiveArrayData> + where + Self: Primitive; + + /// Downcast [`ArrayDataPrimitive`] to `[PrimitiveArrayData`] + fn downcast(data: ArrayDataPrimitive) -> Option> + where + Self: Primitive; + + /// Cast [`ArrayDataPrimitive`] to [`ArrayDataPrimitive`] + fn upcast(v: PrimitiveArrayData) -> ArrayDataPrimitive + where + Self: Primitive; + } +} + +pub trait Primitive: private::PrimitiveSealed + ArrowNativeType { + const VARIANT: PrimitiveType; +} + +macro_rules! primitive { + ($t:ty,$v:ident) => { + impl Primitive for $t { + const VARIANT: PrimitiveType = PrimitiveType::$v; + } + impl private::PrimitiveSealed for $t { + fn downcast_ref( + data: &ArrayDataPrimitive, + ) -> Option<&PrimitiveArrayData> { + match data { + ArrayDataPrimitive::$v(v) => Some(v), + _ => None, + } + } + + fn downcast(data: ArrayDataPrimitive) -> Option> { + match data { + ArrayDataPrimitive::$v(v) => Some(v), + _ => None, + } + } + + fn upcast(v: PrimitiveArrayData) -> ArrayDataPrimitive { + ArrayDataPrimitive::$v(v) + } + } + }; +} + +primitive!(i8, Int8); +primitive!(i16, Int16); +primitive!(i32, Int32); +primitive!(i64, Int64); +primitive!(i128, Int128); +primitive!(i256, Int256); +primitive!(u8, UInt8); +primitive!(u16, UInt16); +primitive!(u32, UInt32); +primitive!(u64, UInt64); +primitive!(f16, Float16); +primitive!(f32, Float32); +primitive!(f64, Float64); + +/// An enumeration of the types of [`PrimitiveArrayData`] +pub enum ArrayDataPrimitive { + Int8(PrimitiveArrayData), + Int16(PrimitiveArrayData), + Int32(PrimitiveArrayData), + Int64(PrimitiveArrayData), + Int128(PrimitiveArrayData), + Int256(PrimitiveArrayData), + UInt8(PrimitiveArrayData), + UInt16(PrimitiveArrayData), + UInt32(PrimitiveArrayData), + UInt64(PrimitiveArrayData), + Float16(PrimitiveArrayData), + Float32(PrimitiveArrayData), + Float64(PrimitiveArrayData), +} + +impl ArrayDataPrimitive { + /// Downcast this [`ArrayDataPrimitive`] to the corresponding [`PrimitiveArrayData`] + pub fn downcast_ref(&self) -> Option<&PrimitiveArrayData
<P>
> { + P::downcast_ref(self) + } + + /// Downcast this [`ArrayDataPrimitive`] to the corresponding [`PrimitiveArrayData`] + pub fn downcast(self) -> Option> { + P::downcast(self) + } +} + +/// ArrayData for arrays of [`Primitive`] +#[derive(Debug, Clone)] +pub struct PrimitiveArrayData { + data_type: DataType, + nulls: Option, + values: ScalarBuffer, +} + +impl From> for ArrayDataPrimitive { + fn from(value: PrimitiveArrayData
<P>
) -> Self { + P::upcast(value) + } +} + +impl PrimitiveArrayData { + /// Create a new [`PrimitiveArrayData`] + /// + /// # Panics + /// + /// Panics if + /// - `nulls` and `values` are different lengths + /// - `data_type` is not compatible with `T` + pub fn new( + data_type: DataType, + values: ScalarBuffer, + nulls: Option, + ) -> Self { + let physical = PhysicalType::from(&data_type); + assert!( + matches!(physical, PhysicalType::Primitive(p) if p == T::VARIANT), + "Illegal physical type for PrimitiveArrayData of datatype {:?}, expected {:?} got {:?}", + data_type, + T::VARIANT, + physical + ); + + if let Some(n) = nulls.as_ref() { + assert_eq!(values.len(), n.len()) + } + + Self { + data_type, + values, + nulls, + } + } + + /// Returns the null buffer if any + #[inline] + pub fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the primitive values + #[inline] + pub fn values(&self) -> &[T] { + &self.values + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} diff --git a/arrow-data/src/data/types.rs b/arrow-data/src/data/types.rs new file mode 100644 index 000000000000..09e169f6aa61 --- /dev/null +++ b/arrow-data/src/data/types.rs @@ -0,0 +1,153 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
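The `DataType` to `PhysicalType` mapping in the remainder of this file lends itself to table-style unit tests; a hypothetical test module (not part of the patch) exercising a few of the arms that follow could read:

```rust
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_schema::DataType;

    #[test]
    fn physical_type_mapping() {
        // Primitive types map to their native representation
        assert_eq!(
            PhysicalType::from(&DataType::Int32),
            PhysicalType::Primitive(PrimitiveType::Int32)
        );
        // Date64 is stored as 64-bit integers
        assert_eq!(
            PhysicalType::from(&DataType::Date64),
            PhysicalType::Primitive(PrimitiveType::Int64)
        );
        // Utf8 uses 32-bit offsets into a UTF-8 byte buffer
        assert_eq!(
            PhysicalType::from(&DataType::Utf8),
            PhysicalType::Bytes(OffsetType::Int32, BytesType::Utf8)
        );
    }
}
```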
+ +use arrow_schema::{DataType, IntervalUnit}; + +/// An enumeration of the primitive types implementing [`ArrowNativeType`](arrow_buffer::ArrowNativeType) +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub enum PrimitiveType { + Int8, + Int16, + Int32, + Int64, + Int128, + Int256, + UInt8, + UInt16, + UInt32, + UInt64, + Float16, + Float32, + Float64, +} + +/// An enumeration of the types of offsets for variable length encodings +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub enum OffsetType { + Int32, + Int64, +} + +/// An enumeration of the types of variable length byte arrays +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub enum BytesType { + Binary, + Utf8, +} + +/// An enumeration of the types of dictionary key +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub enum DictionaryKeyType { + Int8, + Int16, + Int32, + Int64, + UInt8, + UInt16, + UInt32, + UInt64, +} + +/// An enumeration of the types of run key +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub enum RunEndType { + Int16, + Int32, + Int64, +} + +/// Describes the physical representation of a given [`DataType`] +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub enum PhysicalType { + Null, + Boolean, + Primitive(PrimitiveType), + FixedSizeBinary, + Bytes(OffsetType, BytesType), + FixedSizeList, + List(OffsetType), + Map, + Struct, + Union, + Dictionary(DictionaryKeyType), + Run(RunEndType), +} + +impl From<&DataType> for PhysicalType { + fn from(value: &DataType) -> Self { + match value { + DataType::Null => Self::Null, + DataType::Boolean => Self::Boolean, + DataType::Int8 => Self::Primitive(PrimitiveType::Int8), + DataType::Int16 => Self::Primitive(PrimitiveType::Int16), + DataType::Int32 => Self::Primitive(PrimitiveType::Int32), + DataType::Int64 => Self::Primitive(PrimitiveType::Int64), + DataType::UInt8 => Self::Primitive(PrimitiveType::UInt8), + DataType::UInt16 => Self::Primitive(PrimitiveType::UInt16), + DataType::UInt32 => Self::Primitive(PrimitiveType::UInt32), + DataType::UInt64 => Self::Primitive(PrimitiveType::UInt64), + DataType::Float16 => Self::Primitive(PrimitiveType::Float16), + DataType::Float32 => Self::Primitive(PrimitiveType::Float32), + DataType::Float64 => Self::Primitive(PrimitiveType::Float64), + DataType::Timestamp(_, _) => Self::Primitive(PrimitiveType::Int64), + DataType::Date32 => Self::Primitive(PrimitiveType::Int32), + DataType::Date64 => Self::Primitive(PrimitiveType::Int64), + DataType::Time32(_) => Self::Primitive(PrimitiveType::Int32), + DataType::Time64(_) => Self::Primitive(PrimitiveType::Int64), + DataType::Duration(_) => Self::Primitive(PrimitiveType::Int64), + DataType::Decimal128(_, _) => Self::Primitive(PrimitiveType::Int128), + DataType::Decimal256(_, _) => Self::Primitive(PrimitiveType::Int256), + DataType::Interval(IntervalUnit::YearMonth) => { + Self::Primitive(PrimitiveType::Int32) + } + DataType::Interval(IntervalUnit::DayTime) => { + Self::Primitive(PrimitiveType::Int64) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + Self::Primitive(PrimitiveType::Int128) + } + DataType::FixedSizeBinary(_) => Self::FixedSizeBinary, + DataType::Binary => Self::Bytes(OffsetType::Int32, BytesType::Binary), + DataType::LargeBinary => Self::Bytes(OffsetType::Int64, BytesType::Binary), + DataType::Utf8 => Self::Bytes(OffsetType::Int32, BytesType::Utf8), + DataType::LargeUtf8 => Self::Bytes(OffsetType::Int64, BytesType::Utf8), + DataType::List(_) 
=> Self::List(OffsetType::Int32), + DataType::FixedSizeList(_, _) => Self::FixedSizeList, + DataType::LargeList(_) => Self::List(OffsetType::Int64), + DataType::Struct(_) => Self::Struct, + DataType::Union(_, _, _) => Self::Union, + DataType::Dictionary(k, _) => match k.as_ref() { + DataType::Int8 => Self::Dictionary(DictionaryKeyType::Int8), + DataType::Int16 => Self::Dictionary(DictionaryKeyType::Int16), + DataType::Int32 => Self::Dictionary(DictionaryKeyType::Int32), + DataType::Int64 => Self::Dictionary(DictionaryKeyType::Int64), + DataType::UInt8 => Self::Dictionary(DictionaryKeyType::UInt8), + DataType::UInt16 => Self::Dictionary(DictionaryKeyType::UInt16), + DataType::UInt32 => Self::Dictionary(DictionaryKeyType::UInt32), + DataType::UInt64 => Self::Dictionary(DictionaryKeyType::UInt64), + d => panic!("illegal dictionary key data type {d}"), + }, + DataType::Map(_, _) => Self::Map, + DataType::RunEndEncoded(f, _) => match f.data_type() { + DataType::Int16 => Self::Run(RunEndType::Int16), + DataType::Int32 => Self::Run(RunEndType::Int32), + DataType::Int64 => Self::Run(RunEndType::Int64), + d => panic!("illegal run end data type {d}"), + }, + } + } +} From 034c43fdb47389ab120c2ab84a6e61d44d0d2788 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Feb 2023 14:37:39 +0000 Subject: [PATCH 0635/1411] Update prost-build requirement from =0.11.7 to =0.11.8 (#3767) Updates the requirements on [prost-build](https://github.com/tokio-rs/prost) to permit the latest version. - [Release notes](https://github.com/tokio-rs/prost/releases) - [Commits](https://github.com/tokio-rs/prost/commits/v0.11.8) --- updated-dependencies: - dependency-name: prost-build dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 0c820ed73ac9..61959143e924 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -59,7 +59,7 @@ tower = "0.4.13" # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.51", default-features = false } -prost-build = { version = "=0.11.7", default-features = false } +prost-build = { version = "=0.11.8", default-features = false } tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } [[example]] From f82b704c5c38090a6ecd052b5c25bdfee7010130 Mon Sep 17 00:00:00 2001 From: Willem D'Haeseleer Date: Mon, 27 Feb 2023 09:21:25 -0800 Subject: [PATCH 0636/1411] object-store: fix handling of AWS profile credentials without expiry (#3766) * fix aws profile * fix unused import * support None as expiry * fix clippy * fix fmt * revert fmt whitespace fix --- object_store/src/aws/credential.rs | 18 ++++++------------ object_store/src/azure/credential.rs | 16 +++++++++------- object_store/src/client/token.rs | 22 ++++++++++++++-------- object_store/src/gcp/credential.rs | 6 +++--- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index cba55845ec46..e2332d0fa4df 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -438,7 +438,7 @@ async fn instance_creds( let ttl = (creds.expiration - now).to_std().unwrap_or_default(); Ok(TemporaryToken { token: Arc::new(creds.into()), - expiry: Instant::now() + ttl, + expiry: Some(Instant::now() + ttl), }) } @@ -509,7 +509,7 @@ async fn web_identity( Ok(TemporaryToken { token: Arc::new(creds.into()), - expiry: Instant::now() + ttl, + expiry: Some(Instant::now() + ttl), }) } @@ -553,17 +553,11 @@ mod profile { store: "S3", source: Box::new(source), })?; - let t_now = SystemTime::now(); - let expiry = match c.expiry().and_then(|e| e.duration_since(t_now).ok()) { - Some(ttl) => Instant::now() + ttl, - None => { - return Err(crate::Error::Generic { - store: "S3", - source: "Invalid expiry".into(), - }) - } - }; + let expiry = c + .expiry() + .and_then(|e| e.duration_since(t_now).ok()) + .map(|ttl| Instant::now() + ttl); Ok(TemporaryToken { token: Arc::new(AwsCredential { diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index 9460c2deff0e..9e072229ffa9 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -360,7 +360,7 @@ impl TokenCredential for ClientSecretOAuthProvider { let token = TemporaryToken { token: response.access_token, - expiry: Instant::now() + Duration::from_secs(response.expires_in), + expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) @@ -467,7 +467,7 @@ impl TokenCredential for ImdsManagedIdentityOAuthProvider { let token = TemporaryToken { token: response.access_token, - expiry: Instant::now() + Duration::from_secs(response.expires_in), + expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) @@ -541,7 +541,7 @@ impl TokenCredential for WorkloadIdentityOAuthProvider { let token = TemporaryToken { token: response.access_token, - expiry: Instant::now() + 
Duration::from_secs(response.expires_in), + expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) @@ -640,10 +640,12 @@ impl TokenCredential for AzureCliCredential { - chrono::Local::now().naive_local(); Ok(TemporaryToken { token: token_response.access_token, - expiry: Instant::now() - + duration.to_std().map_err(|_| Error::AzureCli { - message: "az returned invalid lifetime".to_string(), - })?, + expiry: Some( + Instant::now() + + duration.to_std().map_err(|_| Error::AzureCli { + message: "az returned invalid lifetime".to_string(), + })?, + ), }) } Ok(az_output) => { diff --git a/object_store/src/client/token.rs b/object_store/src/client/token.rs index 2ff28616e608..7e48d351d9a3 100644 --- a/object_store/src/client/token.rs +++ b/object_store/src/client/token.rs @@ -25,7 +25,8 @@ pub struct TemporaryToken { /// The temporary credential pub token: T, /// The instant at which this credential is no longer valid - pub expiry: Instant, + /// None means the credential does not expire + pub expiry: Option, } /// Provides [`TokenCache::get_or_insert_with`] which can be used to cache a @@ -53,13 +54,18 @@ impl TokenCache { let mut locked = self.cache.lock().await; if let Some(cached) = locked.as_ref() { - let delta = cached - .expiry - .checked_duration_since(now) - .unwrap_or_default(); - - if delta.as_secs() > 300 { - return Ok(cached.token.clone()); + match cached.expiry { + Some(ttl) + if ttl + .checked_duration_since(now) + .unwrap_or_default() + .as_secs() + > 300 => + { + return Ok(cached.token.clone()); + } + None => return Ok(cached.token.clone()), + _ => (), } } diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index c12b37cdd1c0..853e4ce83842 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -220,7 +220,7 @@ impl TokenProvider for OAuthProvider { let token = TemporaryToken { token: response.access_token, - expiry: Instant::now() + Duration::from_secs(response.expires_in), + expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) @@ -393,7 +393,7 @@ impl TokenProvider for InstanceCredentialProvider { .await?; let token = TemporaryToken { token: response.access_token, - expiry: Instant::now() + Duration::from_secs(response.expires_in), + expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) } @@ -467,7 +467,7 @@ impl TokenProvider for ApplicationDefaultCredentials { .context(TokenResponseBodySnafu)?; let token = TemporaryToken { token: response.access_token, - expiry: Instant::now() + Duration::from_secs(response.expires_in), + expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) } From 5cc0f9b634393008ea6136a228470b6612b2dee1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 27 Feb 2023 20:18:56 +0000 Subject: [PATCH 0637/1411] Prepare object_store 0.5.5 (#3768) (#3770) --- object_store/CHANGELOG-old.md | 27 +++++++++++++++- object_store/CHANGELOG.md | 31 +++++++++---------- object_store/Cargo.toml | 2 +- object_store/dev/release/update_change_log.sh | 4 +-- 4 files changed, 43 insertions(+), 21 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index 78237a02dd87..58fb8a3b9deb 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -19,7 +19,32 @@ # Historical Changelog -# Changelog +## [object_store_0.5.4](https://github.com/apache/arrow-rs/tree/object_store_0.5.4) 
(2023-01-30) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.3...object_store_0.5.4) + +**Implemented enhancements:** + +- \[object\_store\] support more identity based auth flows for azure [\#3580](https://github.com/apache/arrow-rs/issues/3580) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Implement workload identity and application default credentials for GCP object store. [\#3533](https://github.com/apache/arrow-rs/issues/3533) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support GCP Workload Identity [\#3490](https://github.com/apache/arrow-rs/issues/3490) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Allow providing service account key directly when building GCP object store client [\#3488](https://github.com/apache/arrow-rs/issues/3488) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Closed issues:** + +- object\_store: temporary aws credentials not refreshed? [\#3446](https://github.com/apache/arrow-rs/issues/3446) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Final tweaks to 32.0.0 changelog [\#3618](https://github.com/apache/arrow-rs/pull/3618) ([tustvold](https://github.com/tustvold)) +- Update AWS SDK [\#3617](https://github.com/apache/arrow-rs/pull/3617) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ClientOption.allow\_insecure [\#3600](https://github.com/apache/arrow-rs/pull/3600) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([poelzi](https://github.com/poelzi)) +- \[object\_store\] support azure managed and workload identities [\#3581](https://github.com/apache/arrow-rs/pull/3581) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Additional GCP authentication [\#3541](https://github.com/apache/arrow-rs/pull/3541) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([winding-lines](https://github.com/winding-lines)) +- Update aws-config and aws-types requirements from 0.52 to 0.53 [\#3539](https://github.com/apache/arrow-rs/pull/3539) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) +- Use GHA concurrency groups \(\#3495\) [\#3538](https://github.com/apache/arrow-rs/pull/3538) ([tustvold](https://github.com/tustvold)) +- Remove azurite test exception [\#3497](https://github.com/apache/arrow-rs/pull/3497) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat: Allow providing a service account key directly for GCS [\#3489](https://github.com/apache/arrow-rs/pull/3489) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([scsmithr](https://github.com/scsmithr)) ## [object_store_0.5.3](https://github.com/apache/arrow-rs/tree/object_store_0.5.3) (2023-01-04) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index c1734ec5ba9f..b8f2fe8fc3f4 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,32 +19,29 @@ # Changelog -## [object_store_0.5.4](https://github.com/apache/arrow-rs/tree/object_store_0.5.4) (2023-01-30) +## [object_store_0.5.5](https://github.com/apache/arrow-rs/tree/object_store_0.5.5) (2023-02-27) -[Full 
Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.3...object_store_0.5.4) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.4...object_store_0.5.5) **Implemented enhancements:** -- \[object\_store\] support more identity based auth flows for azure [\#3580](https://github.com/apache/arrow-rs/issues/3580) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Implement workload identity and application default credentials for GCP object store. [\#3533](https://github.com/apache/arrow-rs/issues/3533) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support GCP Workload Identity [\#3490](https://github.com/apache/arrow-rs/issues/3490) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Allow providing service account key directly when building GCP object store client [\#3488](https://github.com/apache/arrow-rs/issues/3488) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: support azure cli credential [\#3697](https://github.com/apache/arrow-rs/issues/3697) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: support encoded path as input [\#3651](https://github.com/apache/arrow-rs/issues/3651) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -**Closed issues:** +**Fixed bugs:** -- object\_store: temporary aws credentials not refreshed? [\#3446](https://github.com/apache/arrow-rs/issues/3446) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store: aws\_profile fails to load static credentials [\#3765](https://github.com/apache/arrow-rs/issues/3765) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Inconsistent Behaviour Listing File [\#3712](https://github.com/apache/arrow-rs/issues/3712) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: bearer token is azure is used like access key [\#3696](https://github.com/apache/arrow-rs/issues/3696) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Final tweaks to 32.0.0 changelog [\#3618](https://github.com/apache/arrow-rs/pull/3618) ([tustvold](https://github.com/tustvold)) -- Update AWS SDK [\#3617](https://github.com/apache/arrow-rs/pull/3617) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add ClientOption.allow\_insecure [\#3600](https://github.com/apache/arrow-rs/pull/3600) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([poelzi](https://github.com/poelzi)) -- \[object\_store\] support azure managed and workload identities [\#3581](https://github.com/apache/arrow-rs/pull/3581) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- Additional GCP authentication [\#3541](https://github.com/apache/arrow-rs/pull/3541) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([winding-lines](https://github.com/winding-lines)) -- Update aws-config and aws-types requirements from 0.52 to 0.53 [\#3539](https://github.com/apache/arrow-rs/pull/3539) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) -- Use GHA concurrency groups \(\#3495\) [\#3538](https://github.com/apache/arrow-rs/pull/3538) ([tustvold](https://github.com/tustvold)) -- Remove azurite test 
exception [\#3497](https://github.com/apache/arrow-rs/pull/3497) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- feat: Allow providing a service account key directly for GCS [\#3489](https://github.com/apache/arrow-rs/pull/3489) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([scsmithr](https://github.com/scsmithr)) +- object-store: fix handling of AWS profile credentials without expiry [\#3766](https://github.com/apache/arrow-rs/pull/3766) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([helmus](https://github.com/helmus)) +- update object\_store deps to patch potential security vulnerabilities [\#3761](https://github.com/apache/arrow-rs/pull/3761) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([spencerbart](https://github.com/spencerbart)) +- Filter exact list prefix matches for azure gen2 accounts [\#3714](https://github.com/apache/arrow-rs/pull/3714) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Filter exact list prefix matches for MemoryStore and HttpStore \(\#3712\) [\#3713](https://github.com/apache/arrow-rs/pull/3713) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: azure cli authorization [\#3698](https://github.com/apache/arrow-rs/pull/3698) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- object\_store: add Path::from\_url\_path [\#3663](https://github.com/apache/arrow-rs/pull/3663) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jychen7](https://github.com/jychen7)) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 8ab0c15cb950..c0c090cd0f00 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.4" +version = "0.5.5" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index 5cf5582a9e9b..de80d0f3eaf3 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.3" -FUTURE_RELEASE="object_store_0.5.4" +SINCE_TAG="object_store_0.5.4" +FUTURE_RELEASE="object_store_0.5.5" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From e7eb304dac442a943c434f8ea248de909f82aa88 Mon Sep 17 00:00:00 2001 From: Satyam Singh Date: Tue, 28 Feb 2023 04:30:55 +0530 Subject: [PATCH 0638/1411] Add support for unsigned payloads in aws (#3741) * Add support for unsigned payloads in aws * Add unsigned payload to AmazonS3ConfigKey * Link to aws doc * Add env test * Add test * Add integration test * Take boolean argument * Fix doc * Clippy fixes * Merge into s3 test --- object_store/src/aws/client.rs | 50 ++++++++++++++++++++++---- object_store/src/aws/credential.rs | 57 +++++++++++++++++++++++++++--- object_store/src/aws/mod.rs | 38 +++++++++++++++++++- 3 files changed, 133 insertions(+), 12 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index b40bcbacf99e..0b0f883b7e51 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -204,6 +204,7 @@ pub struct S3Config { pub credentials: 
Box, pub retry_config: RetryConfig, pub client_options: ClientOptions, + pub sign_payload: bool, } impl S3Config { @@ -256,7 +257,12 @@ impl S3Client { } let response = builder - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(GetRequestSnafu { @@ -287,7 +293,12 @@ impl S3Client { let response = builder .query(query) - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(PutRequestSnafu { @@ -309,7 +320,12 @@ impl S3Client { self.client .request(Method::DELETE, url) .query(query) - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(DeleteRequestSnafu { @@ -328,7 +344,12 @@ impl S3Client { self.client .request(Method::PUT, url) .header("x-amz-copy-source", source) - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(CopyRequestSnafu { @@ -369,7 +390,12 @@ impl S3Client { .client .request(Method::GET, &url) .query(&query) - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(ListRequestSnafu)? @@ -407,7 +433,12 @@ impl S3Client { let response = self .client .request(Method::POST, url) - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(CreateMultipartRequestSnafu)? 
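// The `sign_payload` flag added above is threaded through every S3 request;
// the same patch exposes it to users through a builder option further down in
// aws/mod.rs. A minimal usage sketch, not part of the generated diff, assuming
// a typical builder setup (the bucket name and region are placeholders and
// error handling is elided):

use object_store::aws::AmazonS3Builder;

fn unsigned_payload_example() -> object_store::Result<()> {
    let s3 = AmazonS3Builder::new()
        .with_bucket_name("example-bucket")
        .with_region("us-east-1")
        // Skip the SHA-256 body hash; the canonical request carries the
        // UNSIGNED-PAYLOAD literal instead.
        .with_unsigned_payload(true)
        .build()?;
    let _ = s3;
    Ok(())
}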
@@ -446,7 +477,12 @@ impl S3Client { .request(Method::POST, url) .query(&[("uploadId", upload_id)]) .body(body) - .with_aws_sigv4(credential.as_ref(), &self.config.region, "s3") + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + ) .send_retry(&self.config.retry_config) .await .context(CompleteMultipartRequestSnafu)?; diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index e2332d0fa4df..05f2c535bfdc 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -39,6 +39,7 @@ type StdError = Box; /// SHA256 hash of empty string static EMPTY_SHA256_HASH: &str = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"; +static UNSIGNED_PAYLOAD_LITERAL: &str = "UNSIGNED-PAYLOAD"; #[derive(Debug)] pub struct AwsCredential { @@ -72,6 +73,7 @@ struct RequestSigner<'a> { credential: &'a AwsCredential, service: &'a str, region: &'a str, + sign_payload: bool, } const DATE_HEADER: &str = "x-amz-date"; @@ -98,9 +100,13 @@ impl<'a> RequestSigner<'a> { let date_val = HeaderValue::from_str(&date_str).unwrap(); request.headers_mut().insert(DATE_HEADER, date_val); - let digest = match request.body() { - None => EMPTY_SHA256_HASH.to_string(), - Some(body) => hex_digest(body.as_bytes().unwrap()), + let digest = if self.sign_payload { + match request.body() { + None => EMPTY_SHA256_HASH.to_string(), + Some(body) => hex_digest(body.as_bytes().unwrap()), + } + } else { + UNSIGNED_PAYLOAD_LITERAL.to_string() }; let header_digest = HeaderValue::from_str(&digest).unwrap(); @@ -158,6 +164,7 @@ pub trait CredentialExt { credential: &AwsCredential, region: &str, service: &str, + sign_payload: bool, ) -> Self; } @@ -167,6 +174,7 @@ impl CredentialExt for RequestBuilder { credential: &AwsCredential, region: &str, service: &str, + sign_payload: bool, ) -> Self { // Hack around lack of access to underlying request // https://github.com/seanmonstar/reqwest/issues/1212 @@ -182,6 +190,7 @@ impl CredentialExt for RequestBuilder { credential, service, region, + sign_payload, }; signer.sign(&mut request); @@ -585,7 +594,7 @@ mod tests { // Test generated using https://docs.aws.amazon.com/general/latest/gr/sigv4-signed-request-examples.html #[test] - fn test_sign() { + fn test_sign_with_signed_payload() { let client = Client::new(); // Test credentials from https://docs.aws.amazon.com/AmazonS3/latest/userguide/RESTAuthentication.html @@ -615,12 +624,51 @@ mod tests { credential: &credential, service: "ec2", region: "us-east-1", + sign_payload: true, }; signer.sign(&mut request); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=a3c787a7ed37f7fdfbfd2d7056a3d7c9d85e6d52a2bfbec73793c0be6e7862d4") } + #[test] + fn test_sign_with_unsigned_payload() { + let client = Client::new(); + + // Test credentials from https://docs.aws.amazon.com/AmazonS3/latest/userguide/RESTAuthentication.html + let credential = AwsCredential { + key_id: "AKIAIOSFODNN7EXAMPLE".to_string(), + secret_key: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY".to_string(), + token: None, + }; + + // method = 'GET' + // service = 'ec2' + // host = 'ec2.amazonaws.com' + // region = 'us-east-1' + // endpoint = 'https://ec2.amazonaws.com' + // request_parameters = '' + let date = DateTime::parse_from_rfc3339("2022-08-06T18:01:34Z") + .unwrap() + .with_timezone(&Utc); + + let mut request = client + 
.request(Method::GET, "https://ec2.amazon.com/") + .build() + .unwrap(); + + let signer = RequestSigner { + date, + credential: &credential, + service: "ec2", + region: "us-east-1", + sign_payload: false, + }; + + signer.sign(&mut request); + assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=653c3d8ea261fd826207df58bc2bb69fbb5003e9eb3c0ef06e4a51f2a81d8699") + } + #[test] fn test_sign_port() { let client = Client::new(); @@ -651,6 +699,7 @@ mod tests { credential: &credential, service: "s3", region: "us-east-1", + sign_payload: true, }; signer.sign(&mut request); diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index a1c9eae84052..c724886cf0e6 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -385,6 +385,7 @@ pub struct AmazonS3Builder { retry_config: RetryConfig, imdsv1_fallback: bool, virtual_hosted_style_request: bool, + unsigned_payload: bool, metadata_endpoint: Option, profile: Option, client_options: ClientOptions, @@ -504,6 +505,15 @@ pub enum AmazonS3ConfigKey { /// - `virtual_hosted_style_request` VirtualHostedStyleRequest, + /// Avoid computing payload checksum when calculating signature. + /// + /// See [`AmazonS3Builder::with_unsigned_payload`] for details. + /// + /// Supported keys: + /// - `aws_unsigned_payload` + /// - `unsigned_payload` + UnsignedPayload, + /// Set the instance metadata endpoint /// /// See [`AmazonS3Builder::with_metadata_endpoint`] for details. @@ -535,6 +545,7 @@ impl AsRef for AmazonS3ConfigKey { Self::DefaultRegion => "aws_default_region", Self::MetadataEndpoint => "aws_metadata_endpoint", Self::Profile => "aws_profile", + Self::UnsignedPayload => "aws_unsigned_payload", } } } @@ -563,6 +574,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_profile" | "profile" => Ok(Self::Profile), "aws_imdsv1_fallback" | "imdsv1_fallback" => Ok(Self::ImdsV1Fallback), "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), + "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), } } @@ -679,6 +691,9 @@ impl AmazonS3Builder { self.metadata_endpoint = Some(value.into()) } AmazonS3ConfigKey::Profile => self.profile = Some(value.into()), + AmazonS3ConfigKey::UnsignedPayload => { + self.unsigned_payload = str_is_truthy(&value.into()) + } }; Ok(self) } @@ -822,6 +837,15 @@ impl AmazonS3Builder { self } + /// Sets if unsigned payload option has to be used. + /// See [unsigned payload option](https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-header-based-auth.html) + /// * false (default): Signed payload option is used, where the checksum for the request body is computed and included when constructing a canonical request. + /// * true: Unsigned payload option is used. `UNSIGNED-PAYLOAD` literal is included when constructing a canonical request, + pub fn with_unsigned_payload(mut self, unsigned_payload: bool) -> Self { + self.unsigned_payload = unsigned_payload; + self + } + /// Set the [instance metadata endpoint](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html), /// used primarily within AWS EC2. 
/// @@ -967,6 +991,7 @@ impl AmazonS3Builder { credentials, retry_config: self.retry_config, client_options: self.client_options, + sign_payload: !self.unsigned_payload, }; let client = Arc::new(S3Client::new(config)?); @@ -1125,6 +1150,7 @@ mod tests { "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI", &container_creds_relative_uri, ); + env::set_var("AWS_UNSIGNED_PAYLOAD", "true"); let builder = AmazonS3Builder::from_env(); assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); @@ -1136,9 +1162,9 @@ mod tests { assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); - let metadata_uri = format!("{METADATA_ENDPOINT}{container_creds_relative_uri}"); assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); + assert!(builder.unsigned_payload); } #[test] @@ -1154,6 +1180,7 @@ mod tests { ("aws_default_region", aws_default_region.clone()), ("aws_endpoint", aws_endpoint.clone()), ("aws_session_token", aws_session_token.clone()), + ("aws_unsigned_payload", "true".to_string()), ]); let builder = AmazonS3Builder::new() @@ -1166,6 +1193,7 @@ mod tests { assert_eq!(builder.region.unwrap(), aws_default_region); assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); + assert!(builder.unsigned_payload); } #[test] @@ -1181,6 +1209,7 @@ mod tests { (AmazonS3ConfigKey::DefaultRegion, aws_default_region.clone()), (AmazonS3ConfigKey::Endpoint, aws_endpoint.clone()), (AmazonS3ConfigKey::Token, aws_session_token.clone()), + (AmazonS3ConfigKey::UnsignedPayload, "true".to_string()), ]); let builder = AmazonS3Builder::new() @@ -1193,6 +1222,7 @@ mod tests { assert_eq!(builder.region.unwrap(), aws_default_region); assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); + assert!(builder.unsigned_payload); } #[test] @@ -1220,6 +1250,12 @@ mod tests { list_with_delimiter(&integration).await; rename_and_copy(&integration).await; stream_get(&integration).await; + + // run integration test with unsigned payload enabled + let config = maybe_skip_integration!().with_unsigned_payload(true); + let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); + let integration = config.build().unwrap(); + put_get_delete_list_opts(&integration, is_local).await; } #[tokio::test] From 5edc954a939c129e1386c2ac19add45e6fcdc9cb Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 1 Mar 2023 00:07:25 -0800 Subject: [PATCH 0639/1411] Creates PrimitiveDictionaryBuilder from provided keys and values builders (#3777) * Creates PrimitiveDictionaryBuilder from provided keys and values builders * Panics the function if provided builder is not empty --- .../builder/primitive_dictionary_builder.rs | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 00187cddef18..742c09d8cc26 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -113,6 +113,26 @@ where } } + /// Creates a new `PrimitiveDictionaryBuilder` from the provided keys and values builders. + /// + /// # Panics + /// + /// This method panics if `keys_builder` or `values_builder` is not empty. 
+ pub fn new_from_builders( + keys_builder: PrimitiveBuilder, + values_builder: PrimitiveBuilder, + ) -> Self { + assert!( + keys_builder.is_empty() && values_builder.is_empty(), + "keys and values builders must be empty" + ); + Self { + keys_builder, + values_builder, + map: HashMap::new(), + } + } + /// Creates a new `PrimitiveDictionaryBuilder` with the provided capacities /// /// `keys_capacity`: the number of keys, i.e. length of array to build @@ -276,7 +296,8 @@ mod tests { use crate::array::Array; use crate::array::UInt32Array; use crate::array::UInt8Array; - use crate::types::{Int32Type, UInt32Type, UInt8Type}; + use crate::builder::Decimal128Builder; + use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type}; #[test] fn test_primitive_dictionary_builder() { @@ -329,4 +350,18 @@ mod tests { // Special error if the key overflows (256th entry) builder.append(1257).unwrap(); } + + #[test] + fn test_primitive_dictionary_with_builders() { + let keys_builder = PrimitiveBuilder::::new(); + let values_builder = + Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); + let mut builder = + PrimitiveDictionaryBuilder::::new_from_builders( + keys_builder, + values_builder, + ); + let dict_array = builder.finish(); + assert_eq!(dict_array.value_type(), DataType::Decimal128(1, 2)); + } } From d440c244bde4d1e99afcc72ac5b2c049a62f9225 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 1 Mar 2023 10:06:59 +0000 Subject: [PATCH 0640/1411] Zero-copy Vec conversion (#3516) (#1176) (#3756) * Zero-copy Vec conversion (#3516) (#1176) * Fix doc * More tests * Review feedback * More tests --- arrow-array/src/array/list_array.rs | 2 + arrow-buffer/src/alloc/mod.rs | 18 ++- arrow-buffer/src/buffer/immutable.rs | 192 ++++++++++++++++++++++++++- arrow-buffer/src/buffer/mutable.rs | 30 +++-- arrow-buffer/src/buffer/scalar.rs | 9 ++ arrow-buffer/src/bytes.rs | 13 +- 6 files changed, 237 insertions(+), 27 deletions(-) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 6b63269d1615..178139f810e7 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -829,6 +829,7 @@ mod tests { #[test] #[should_panic(expected = "memory is not aligned")] + #[allow(deprecated)] fn test_primitive_array_alignment() { let ptr = arrow_buffer::alloc::allocate_aligned(8); let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; @@ -845,6 +846,7 @@ mod tests { // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] + #[allow(deprecated)] fn test_list_array_alignment() { let ptr = arrow_buffer::alloc::allocate_aligned(8); let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; diff --git a/arrow-buffer/src/alloc/mod.rs b/arrow-buffer/src/alloc/mod.rs index 1493d839f5ab..7600a28d8754 100644 --- a/arrow-buffer/src/alloc/mod.rs +++ b/arrow-buffer/src/alloc/mod.rs @@ -45,6 +45,7 @@ fn dangling_ptr() -> NonNull { /// Allocates a cache-aligned memory region of `size` bytes with uninitialized values. /// This is more performant than using [allocate_aligned_zeroed] when all bytes will have /// an unknown or non-zero value and is semantically similar to `malloc`. 
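// This commit moves buffer allocation onto plain `Vec`s: the aligned allocator
// below is deprecated, and `Buffer` gains the zero-copy `from_vec`/`into_vec`
// conversions shown later in this patch (immutable.rs). A minimal sketch of the
// intended round trip, assuming the buffer is unshared and unsliced:

use arrow_buffer::Buffer;

fn vec_round_trip() {
    // Takes ownership of the Vec without copying its contents.
    let buffer = Buffer::from_vec(vec![1_u32, 2, 3]);
    // Recovers the Vec, again without copying, because the layout matches
    // and there are no other references to the allocation.
    let values: Vec<u32> = buffer.into_vec::<u32>().unwrap();
    assert_eq!(values, vec![1, 2, 3]);
}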
+#[deprecated(note = "Use Vec")] pub fn allocate_aligned(size: usize) -> NonNull { unsafe { if size == 0 { @@ -60,6 +61,7 @@ pub fn allocate_aligned(size: usize) -> NonNull { /// Allocates a cache-aligned memory region of `size` bytes with `0` on all of them. /// This is more performant than using [allocate_aligned] and setting all bytes to zero /// and is semantically similar to `calloc`. +#[deprecated(note = "Use Vec")] pub fn allocate_aligned_zeroed(size: usize) -> NonNull { unsafe { if size == 0 { @@ -80,6 +82,7 @@ pub fn allocate_aligned_zeroed(size: usize) -> NonNull { /// * ptr must denote a block of memory currently allocated via this allocator, /// /// * size must be the same size that was used to allocate that block of memory, +#[deprecated(note = "Use Vec")] pub unsafe fn free_aligned(ptr: NonNull, size: usize) { if size != 0 { std::alloc::dealloc( @@ -100,6 +103,8 @@ pub unsafe fn free_aligned(ptr: NonNull, size: usize) { /// /// * new_size, when rounded up to the nearest multiple of [ALIGNMENT], must not overflow (i.e., /// the rounded value must be less than usize::MAX). +#[deprecated(note = "Use Vec")] +#[allow(deprecated)] pub unsafe fn reallocate( ptr: NonNull, old_size: usize, @@ -132,19 +137,18 @@ impl Allocation for T {} /// Mode of deallocating memory regions pub(crate) enum Deallocation { - /// An allocation of the given capacity that needs to be deallocated using arrows's cache aligned allocator. - /// See [allocate_aligned] and [free_aligned]. - Arrow(usize), - /// An allocation from an external source like the FFI interface or a Rust Vec. - /// Deallocation will happen + /// An allocation using [`std::alloc`] + Standard(Layout), + /// An allocation from an external source like the FFI interface + /// Deallocation will happen on `Allocation::drop` Custom(Arc), } impl Debug for Deallocation { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { match self { - Deallocation::Arrow(capacity) => { - write!(f, "Deallocation::Arrow {{ capacity: {capacity} }}") + Deallocation::Standard(layout) => { + write!(f, "Deallocation::Standard {layout:?}") } Deallocation::Custom(_) => { write!(f, "Deallocation::Custom {{ capacity: unknown }}") diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index cbfba1e0540c..5f42035c9e30 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -15,13 +15,13 @@ // specific language governing permissions and limitations // under the License. -use std::convert::AsRef; +use std::alloc::Layout; use std::fmt::Debug; use std::iter::FromIterator; use std::ptr::NonNull; use std::sync::Arc; -use crate::alloc::{Allocation, Deallocation}; +use crate::alloc::{Allocation, Deallocation, ALIGNMENT}; use crate::util::bit_chunk_iterator::{BitChunks, UnalignedBitChunk}; use crate::{bytes::Bytes, native::ArrowNativeType}; @@ -42,6 +42,8 @@ pub struct Buffer { ptr: *const u8, /// Byte length of the buffer. 
+ /// + /// Must be less than or equal to `data.len()` length: usize, } @@ -69,6 +71,22 @@ impl Buffer { } } + /// Create a [`Buffer`] from the provided `Vec` without copying + #[inline] + pub fn from_vec(vec: Vec) -> Self { + // Safety + // Vec::as_ptr guaranteed to not be null and ArrowNativeType are trivially transmutable + let ptr = unsafe { NonNull::new_unchecked(vec.as_ptr() as _) }; + let len = vec.len() * std::mem::size_of::(); + // Safety + // Vec guaranteed to have a valid layout matching that of `Layout::array` + // This is based on `RawVec::current_memory` + let layout = unsafe { Layout::array::(vec.capacity()).unwrap_unchecked() }; + std::mem::forget(vec); + let b = unsafe { Bytes::new(ptr, len, Deallocation::Standard(layout)) }; + Self::from_bytes(b) + } + /// Initializes a [Buffer] from a slice of items. pub fn from_slice_ref>(items: T) -> Self { let slice = items.as_ref(); @@ -78,7 +96,7 @@ impl Buffer { buffer.into() } - /// Creates a buffer from an existing memory region (must already be byte-aligned), this + /// Creates a buffer from an existing aligned memory region (must already be byte-aligned), this /// `Buffer` will free this piece of memory when dropped. /// /// # Arguments @@ -91,9 +109,11 @@ impl Buffer { /// /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` /// bytes. If the `ptr` and `capacity` come from a `Buffer`, then this is guaranteed. + #[deprecated(note = "Use From>")] pub unsafe fn from_raw_parts(ptr: NonNull, len: usize, capacity: usize) -> Self { assert!(len <= capacity); - Buffer::build_with_arguments(ptr, len, Deallocation::Arrow(capacity)) + let layout = Layout::from_size_align(capacity, ALIGNMENT).unwrap(); + Buffer::build_with_arguments(ptr, len, Deallocation::Standard(layout)) } /// Creates a buffer from an existing memory region. Ownership of the memory is tracked via reference counting @@ -253,7 +273,8 @@ impl Buffer { } /// Returns `MutableBuffer` for mutating the buffer if this buffer is not shared. - /// Returns `Err` if this is shared or its allocation is from an external source. 
+ /// Returns `Err` if this is shared or its allocation is from an external source or + /// it is not allocated with alignment [`ALIGNMENT`] pub fn into_mutable(self) -> Result { let ptr = self.ptr; let length = self.length; @@ -269,6 +290,45 @@ impl Buffer { length, }) } + + /// Returns `Vec` for mutating the buffer + /// + /// Returns `Err(self)` if this buffer does not have the same [`Layout`] as + /// the destination Vec or contains a non-zero offset + pub fn into_vec(self) -> Result, Self> { + let layout = match self.data.deallocation() { + Deallocation::Standard(l) => l, + _ => return Err(self), // Custom allocation + }; + + if self.ptr != self.data.as_ptr() { + return Err(self); // Data is offset + } + + let v_capacity = layout.size() / std::mem::size_of::(); + match Layout::array::(v_capacity) { + Ok(expected) if layout == &expected => {} + _ => return Err(self), // Incorrect layout + } + + let length = self.length; + let ptr = self.ptr; + let v_len = self.length / std::mem::size_of::(); + + Arc::try_unwrap(self.data) + .map(|bytes| unsafe { + let ptr = bytes.ptr().as_ptr() as _; + std::mem::forget(bytes); + // Safety + // Verified that bytes layout matches that of Vec + Vec::from_raw_parts(ptr, v_len, v_capacity) + }) + .map_err(|bytes| Buffer { + data: bytes, + ptr, + length, + }) + } } /// Creating a `Buffer` instance by copying the memory from a `AsRef<[u8]>` into a newly @@ -378,6 +438,7 @@ impl FromIterator for Buffer { #[cfg(test)] mod tests { + use crate::i256; use std::panic::{RefUnwindSafe, UnwindSafe}; use std::thread; @@ -632,4 +693,125 @@ mod tests { let buffer = Buffer::from(MutableBuffer::from_len_zeroed(12)); buffer.slice_with_length(2, usize::MAX); } + + #[test] + fn test_vec_interop() { + // Test empty vec + let a: Vec = Vec::new(); + let b = Buffer::from_vec(a); + b.into_vec::().unwrap(); + + // Test vec with capacity + let a: Vec = Vec::with_capacity(20); + let b = Buffer::from_vec(a); + let back = b.into_vec::().unwrap(); + assert_eq!(back.len(), 0); + assert_eq!(back.capacity(), 20); + + // Test vec with values + let mut a: Vec = Vec::with_capacity(3); + a.extend_from_slice(&[1, 2, 3]); + let b = Buffer::from_vec(a); + let back = b.into_vec::().unwrap(); + assert_eq!(back.len(), 3); + assert_eq!(back.capacity(), 3); + + // Test vec with values and spare capacity + let mut a: Vec = Vec::with_capacity(20); + a.extend_from_slice(&[1, 4, 7, 8, 9, 3, 6]); + let b = Buffer::from_vec(a); + let back = b.into_vec::().unwrap(); + assert_eq!(back.len(), 7); + assert_eq!(back.capacity(), 20); + + // Test incorrect alignment + let a: Vec = Vec::new(); + let b = Buffer::from_vec(a); + let b = b.into_vec::().unwrap_err(); + b.into_vec::().unwrap_err(); + + // Test convert between types with same alignment + // This is an implementation quirk, but isn't harmful + // as ArrowNativeType are trivially transmutable + let a: Vec = vec![1, 2, 3, 4]; + let b = Buffer::from_vec(a); + let back = b.into_vec::().unwrap(); + assert_eq!(back.len(), 4); + assert_eq!(back.capacity(), 4); + + // i256 has the same layout as i128 so this is valid + let mut b: Vec = Vec::with_capacity(4); + b.extend_from_slice(&[1, 2, 3, 4]); + let b = Buffer::from_vec(b); + let back = b.into_vec::().unwrap(); + assert_eq!(back.len(), 2); + assert_eq!(back.capacity(), 2); + + // Invalid layout + let b: Vec = vec![1, 2, 3]; + let b = Buffer::from_vec(b); + b.into_vec::().unwrap_err(); + + // Invalid layout + let mut b: Vec = Vec::with_capacity(5); + b.extend_from_slice(&[1, 2, 3, 4]); + let b = 
Buffer::from_vec(b); + b.into_vec::().unwrap_err(); + + // Truncates length + // This is an implementation quirk, but isn't harmful + let mut b: Vec = Vec::with_capacity(4); + b.extend_from_slice(&[1, 2, 3]); + let b = Buffer::from_vec(b); + let back = b.into_vec::().unwrap(); + assert_eq!(back.len(), 1); + assert_eq!(back.capacity(), 2); + + // Cannot use aligned allocation + let b = Buffer::from(MutableBuffer::new(10)); + let b = b.into_vec::().unwrap_err(); + b.into_vec::().unwrap_err(); + + // Test slicing + let mut a: Vec = Vec::with_capacity(20); + a.extend_from_slice(&[1, 4, 7, 8, 9, 3, 6]); + let b = Buffer::from_vec(a); + let slice = b.slice_with_length(0, 64); + + // Shared reference fails + let slice = slice.into_vec::().unwrap_err(); + drop(b); + + // Succeeds as no outstanding shared reference + let back = slice.into_vec::().unwrap(); + assert_eq!(&back, &[1, 4, 7, 8]); + assert_eq!(back.capacity(), 20); + + // Slicing by non-multiple length truncates + let mut a: Vec = Vec::with_capacity(8); + a.extend_from_slice(&[1, 4, 7, 3]); + + let b = Buffer::from_vec(a); + let slice = b.slice_with_length(0, 34); + drop(b); + + let back = slice.into_vec::().unwrap(); + assert_eq!(&back, &[1, 4]); + assert_eq!(back.capacity(), 8); + + // Offset prevents conversion + let a: Vec = vec![1, 3, 4, 6]; + let b = Buffer::from_vec(a).slice(2); + b.into_vec::().unwrap_err(); + + let b = MutableBuffer::new(16).into_buffer(); + let b = b.into_vec::().unwrap_err(); // Invalid layout + let b = b.into_vec::().unwrap_err(); // Invalid layout + b.into_mutable().unwrap(); + + let b = Buffer::from_vec(vec![1_u32, 3, 5]); + let b = b.into_mutable().unwrap_err(); // Invalid layout + let b = b.into_vec::().unwrap(); + assert_eq!(b, &[1, 3, 5]); + } } diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 2e6e2f1d7b08..250ac9f31595 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -16,23 +16,28 @@ // under the License. use super::Buffer; -use crate::alloc::Deallocation; +use crate::alloc::{Deallocation, ALIGNMENT}; use crate::{ alloc, bytes::Bytes, native::{ArrowNativeType, ToByteSlice}, util::bit_util, }; +use std::alloc::Layout; use std::mem; use std::ptr::NonNull; /// A [`MutableBuffer`] is Arrow's interface to build a [`Buffer`] out of items or slices of items. +/// /// [`Buffer`]s created from [`MutableBuffer`] (via `into`) are guaranteed to have its pointer aligned /// along cache lines and in multiple of 64 bytes. +/// /// Use [MutableBuffer::push] to insert an item, [MutableBuffer::extend_from_slice] /// to insert many items, and `into` to convert it to [`Buffer`]. /// -/// For a safe, strongly typed API consider using `arrow::array::BufferBuilder` +/// For a safe, strongly typed API consider using `Vec` +/// +/// Note: this may be deprecated in a future release ([#1176](https://github.com/apache/arrow-rs/issues/1176)) /// /// # Example /// @@ -62,6 +67,7 @@ impl MutableBuffer { /// Allocate a new [MutableBuffer] with initial capacity to be at least `capacity`. 
#[inline] + #[allow(deprecated)] pub fn with_capacity(capacity: usize) -> Self { let capacity = bit_util::round_upto_multiple_of_64(capacity); let ptr = alloc::allocate_aligned(capacity); @@ -83,6 +89,7 @@ impl MutableBuffer { /// let data = buffer.as_slice_mut(); /// assert_eq!(data[126], 0u8); /// ``` + #[allow(deprecated)] pub fn from_len_zeroed(len: usize) -> Self { let new_capacity = bit_util::round_upto_multiple_of_64(len); let ptr = alloc::allocate_aligned_zeroed(new_capacity); @@ -95,12 +102,14 @@ impl MutableBuffer { /// Allocates a new [MutableBuffer] from given `Bytes`. pub(crate) fn from_bytes(bytes: Bytes) -> Result { - if !matches!(bytes.deallocation(), Deallocation::Arrow(_)) { - return Err(bytes); - } + let capacity = match bytes.deallocation() { + Deallocation::Standard(layout) if layout.align() == ALIGNMENT => { + layout.size() + } + _ => return Err(bytes), + }; let len = bytes.len(); - let capacity = bytes.capacity(); let ptr = bytes.ptr(); mem::forget(bytes); @@ -224,6 +233,7 @@ impl MutableBuffer { /// buffer.shrink_to_fit(); /// assert!(buffer.capacity() >= 64 && buffer.capacity() < 128); /// ``` + #[allow(deprecated)] pub fn shrink_to_fit(&mut self) { let new_capacity = bit_util::round_upto_multiple_of_64(self.len); if new_capacity < self.capacity { @@ -300,9 +310,9 @@ impl MutableBuffer { #[inline] pub(super) fn into_buffer(self) -> Buffer { - let bytes = unsafe { - Bytes::new(self.data, self.len, Deallocation::Arrow(self.capacity)) - }; + let layout = Layout::from_size_align(self.capacity, ALIGNMENT).unwrap(); + let bytes = + unsafe { Bytes::new(self.data, self.len, Deallocation::Standard(layout)) }; std::mem::forget(self); Buffer::from_bytes(bytes) } @@ -448,6 +458,7 @@ impl MutableBuffer { /// # Safety /// `ptr` must be allocated for `old_capacity`. #[cold] +#[allow(deprecated)] unsafe fn reallocate( ptr: NonNull, old_capacity: usize, @@ -630,6 +641,7 @@ impl std::ops::DerefMut for MutableBuffer { } impl Drop for MutableBuffer { + #[allow(deprecated)] fn drop(&mut self) { unsafe { alloc::free_aligned(self.data, self.capacity) }; } diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index e688e52fea5c..01a64633f532 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -90,6 +90,15 @@ impl From for ScalarBuffer { } } +impl From> for ScalarBuffer { + fn from(value: Vec) -> Self { + Self { + buffer: Buffer::from_vec(value), + phantom: Default::default(), + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs index 3320dfc261c7..2820fda781e6 100644 --- a/arrow-buffer/src/bytes.rs +++ b/arrow-buffer/src/bytes.rs @@ -23,10 +23,10 @@ use core::slice; use std::ptr::NonNull; use std::{fmt::Debug, fmt::Formatter}; -use crate::alloc; use crate::alloc::Deallocation; /// A continuous, fixed-size, immutable memory region that knows how to de-allocate itself. +/// /// This structs' API is inspired by the `bytes::Bytes`, but it is not limited to using rust's /// global allocator nor u8 alignment. 
/// @@ -53,7 +53,7 @@ impl Bytes { /// /// * `ptr` - Pointer to raw parts /// * `len` - Length of raw parts in **bytes** - /// * `capacity` - Total allocated memory for the pointer `ptr`, in **bytes** + /// * `deallocation` - Type of allocation /// /// # Safety /// @@ -93,7 +93,7 @@ impl Bytes { pub fn capacity(&self) -> usize { match self.deallocation { - Deallocation::Arrow(capacity) => capacity, + Deallocation::Standard(layout) => layout.size(), // we cannot determine this in general, // and thus we state that this is externally-owned memory Deallocation::Custom(_) => 0, @@ -115,9 +115,10 @@ impl Drop for Bytes { #[inline] fn drop(&mut self) { match &self.deallocation { - Deallocation::Arrow(capacity) => { - unsafe { alloc::free_aligned(self.ptr, *capacity) }; - } + Deallocation::Standard(layout) => match layout.size() { + 0 => {} // Nothing to do + _ => unsafe { std::alloc::dealloc(self.ptr.as_ptr(), *layout) }, + }, // The automatic drop implementation will free the memory once the reference count reaches zero Deallocation::Custom(_allocation) => (), } From 7852e763fea66b33a2766b6d6421cafcf6a58c29 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 1 Mar 2023 11:54:52 +0000 Subject: [PATCH 0641/1411] ArrayData Enumeration for Remaining Layouts (#3769) * Add StructArrayData * Add ListArrayData * Add DictionaryArrayData * Format * Add FixedSizeBinaryArrayData * Add UnionArrayData * Docs * Add FixedSizeListArrayData * Derive Debug and Clone * Add RunArrayData * Review feedback --- arrow-data/src/data/bytes.rs | 80 +++++++++- arrow-data/src/data/dictionary.rs | 174 +++++++++++++++++++++ arrow-data/src/data/list.rs | 241 ++++++++++++++++++++++++++++++ arrow-data/src/data/mod.rs | 10 ++ arrow-data/src/data/primitive.rs | 22 +-- arrow-data/src/data/run.rs | 149 ++++++++++++++++++ arrow-data/src/data/struct.rs | 81 ++++++++++ arrow-data/src/data/types.rs | 3 +- arrow-data/src/data/union.rs | 77 ++++++++++ 9 files changed, 819 insertions(+), 18 deletions(-) create mode 100644 arrow-data/src/data/dictionary.rs create mode 100644 arrow-data/src/data/list.rs create mode 100644 arrow-data/src/data/run.rs create mode 100644 arrow-data/src/data/struct.rs create mode 100644 arrow-data/src/data/union.rs diff --git a/arrow-data/src/data/bytes.rs b/arrow-data/src/data/bytes.rs index 86839c67124d..521c1959aaa1 100644 --- a/arrow-data/src/data/bytes.rs +++ b/arrow-data/src/data/bytes.rs @@ -73,7 +73,7 @@ mod private { } /// Types backed by a variable length slice of bytes -pub trait Bytes: private::BytesSealed { +pub trait Bytes: private::BytesSealed + std::fmt::Debug { const TYPE: BytesType; } @@ -195,6 +195,7 @@ impl private::BytesOffsetSealed for i64 { } /// An enumeration of the types of [`ArrayDataBytesOffset`] +#[derive(Debug, Clone)] pub enum ArrayDataBytes { Binary(ArrayDataBytesOffset<[u8]>), Utf8(ArrayDataBytesOffset), @@ -217,18 +218,29 @@ impl ArrayDataBytes { } /// An enumeration of the types of [`BytesArrayData`] +#[derive(Debug)] pub enum ArrayDataBytesOffset { Small(BytesArrayData), Large(BytesArrayData), } +impl Clone for ArrayDataBytesOffset { + fn clone(&self) -> Self { + match self { + Self::Small(v) => Self::Small(v.clone()), + Self::Large(v) => Self::Large(v.clone()), + } + } +} + impl From> for ArrayDataBytes { fn from(value: BytesArrayData) -> Self { B::upcast(O::upcast(value)) } } -/// ArrayData for arrays of [`Bytes`] +/// ArrayData for [variable-sized 
arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) of [`Bytes`] +#[derive(Debug)] pub struct BytesArrayData { data_type: DataType, nulls: Option, @@ -237,13 +249,25 @@ pub struct BytesArrayData { phantom: PhantomData, } -impl BytesArrayData { +impl Clone for BytesArrayData { + fn clone(&self) -> Self { + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.clone(), + offsets: self.offsets.clone(), + values: self.values.clone(), + phantom: Default::default(), + } + } +} + +impl BytesArrayData { /// Creates a new [`BytesArrayData`] /// /// # Safety /// /// - Each consecutive window of `offsets` must identify a valid slice of `values` - /// - `nulls.len() == offsets.len() + 1` + /// - `nulls.len() == offsets.len() - 1` /// - `data_type` must be valid for this layout pub unsafe fn new_unchecked( data_type: DataType, @@ -270,7 +294,7 @@ impl BytesArrayData { /// Returns the offsets #[inline] - pub fn value_offsets(&self) -> &[O] { + pub fn offsets(&self) -> &[O] { &self.offsets } @@ -286,3 +310,49 @@ impl BytesArrayData { &self.data_type } } + +/// ArrayData for [fixed-size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) of bytes +#[derive(Debug, Clone)] +pub struct FixedSizeBinaryArrayData { + data_type: DataType, + nulls: Option, + values: Buffer, +} + +impl FixedSizeBinaryArrayData { + /// Creates a new [`FixedSizeBinaryArrayData`] + /// + /// # Safety + /// + /// - `data_type` must be valid for this layout + /// - `nulls.len() == values.len() / element_size` + pub unsafe fn new_unchecked( + data_type: DataType, + values: Buffer, + nulls: Option, + ) -> Self { + Self { + data_type, + nulls, + values, + } + } + + /// Returns the raw byte data + #[inline] + pub fn values(&self) -> &[u8] { + &self.values + } + + /// Returns the null buffer if any + #[inline] + pub fn null_buffer(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} diff --git a/arrow-data/src/data/dictionary.rs b/arrow-data/src/data/dictionary.rs new file mode 100644 index 000000000000..2ec4ee005287 --- /dev/null +++ b/arrow-data/src/data/dictionary.rs @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
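// Like the bytes module earlier in this patch, this new file pairs a sealed
// `DictionaryKey` trait (implemented for the eight integer key types) with an
// `ArrayDataDictionary` enum that can be downcast to a strongly typed
// `DictionaryArrayData`. A sketch of the expected call pattern, using a
// hypothetical helper and assuming an already-built `ArrayDataDictionary`
// whose keys are `i32`:

fn dictionary_key_count(dict: &ArrayDataDictionary) -> Option<usize> {
    // Returns None if the keys are not i32; otherwise exposes the typed view.
    let typed = dict.downcast_ref::<i32>()?;
    // The child holds the dictionary values referenced by the keys.
    let _values: &ArrayData = typed.child();
    Some(typed.keys().len())
}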
+ +use crate::data::types::DictionaryKeyType; +use crate::ArrayData; +use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; +use arrow_buffer::ArrowNativeType; +use arrow_schema::DataType; + +mod private { + use super::*; + + pub trait DictionaryKeySealed { + /// Downcast [`ArrayDataDictionary`] to `[DictionaryArrayData`] + fn downcast_ref(data: &ArrayDataDictionary) -> Option<&DictionaryArrayData> + where + Self: DictionaryKey; + + /// Downcast [`ArrayDataDictionary`] to `[DictionaryArrayData`] + fn downcast(data: ArrayDataDictionary) -> Option> + where + Self: DictionaryKey; + + /// Cast [`DictionaryArrayData`] to [`ArrayDataDictionary`] + fn upcast(v: DictionaryArrayData) -> ArrayDataDictionary + where + Self: DictionaryKey; + } +} + +/// Types of dictionary key used by dictionary arrays +pub trait DictionaryKey: private::DictionaryKeySealed + ArrowNativeType { + const TYPE: DictionaryKeyType; +} + +macro_rules! dictionary { + ($t:ty,$v:ident) => { + impl DictionaryKey for $t { + const TYPE: DictionaryKeyType = DictionaryKeyType::$v; + } + impl private::DictionaryKeySealed for $t { + fn downcast_ref( + data: &ArrayDataDictionary, + ) -> Option<&DictionaryArrayData> { + match data { + ArrayDataDictionary::$v(v) => Some(v), + _ => None, + } + } + + fn downcast(data: ArrayDataDictionary) -> Option> { + match data { + ArrayDataDictionary::$v(v) => Some(v), + _ => None, + } + } + + fn upcast(v: DictionaryArrayData) -> ArrayDataDictionary { + ArrayDataDictionary::$v(v) + } + } + }; +} + +dictionary!(i8, Int8); +dictionary!(i16, Int16); +dictionary!(i32, Int32); +dictionary!(i64, Int64); +dictionary!(u8, UInt8); +dictionary!(u16, UInt16); +dictionary!(u32, UInt32); +dictionary!(u64, UInt64); + +/// An enumeration of the types of [`DictionaryArrayData`] +#[derive(Debug, Clone)] +pub enum ArrayDataDictionary { + Int8(DictionaryArrayData), + Int16(DictionaryArrayData), + Int32(DictionaryArrayData), + Int64(DictionaryArrayData), + UInt8(DictionaryArrayData), + UInt16(DictionaryArrayData), + UInt32(DictionaryArrayData), + UInt64(DictionaryArrayData), +} + +impl ArrayDataDictionary { + /// Downcast this [`ArrayDataDictionary`] to the corresponding [`DictionaryArrayData`] + pub fn downcast_ref(&self) -> Option<&DictionaryArrayData> { + K::downcast_ref(self) + } + + /// Downcast this [`ArrayDataDictionary`] to the corresponding [`DictionaryArrayData`] + pub fn downcast(self) -> Option> { + K::downcast(self) + } +} + +impl From> for ArrayDataDictionary { + fn from(value: DictionaryArrayData) -> Self { + K::upcast(value) + } +} + +/// ArrayData for [dictionary arrays](https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout) +#[derive(Debug, Clone)] +pub struct DictionaryArrayData { + data_type: DataType, + nulls: Option, + keys: ScalarBuffer, + child: Box, +} + +impl DictionaryArrayData { + /// Create a new [`DictionaryArrayData`] + /// + /// # Safety + /// + /// - `data_type` must be valid for this layout + /// - child must have a type matching `data_type` + /// - all values in `keys` must be `0 < v < child.len()` or be a null according to `nulls` + /// - `nulls` must have the same length as `child` + pub unsafe fn new_unchecked( + data_type: DataType, + keys: ScalarBuffer, + nulls: Option, + child: ArrayData, + ) -> Self { + Self { + data_type, + nulls, + keys, + child: Box::new(child), + } + } + + /// Returns the null buffer if any + #[inline] + pub fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the keys + #[inline] + pub fn keys(&self) -> &[K] 
{ + &self.keys + } + + /// Returns the child data + #[inline] + pub fn child(&self) -> &ArrayData { + self.child.as_ref() + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} diff --git a/arrow-data/src/data/list.rs b/arrow-data/src/data/list.rs new file mode 100644 index 000000000000..59909289e933 --- /dev/null +++ b/arrow-data/src/data/list.rs @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::data::types::OffsetType; +use crate::ArrayData; +use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; +use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_schema::DataType; + +mod private { + use super::*; + + pub trait ListOffsetSealed { + /// Downcast [`ArrayDataList`] to `[ListArrayData`] + fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData> + where + Self: ListOffset; + + /// Downcast [`ArrayDataList`] to `[ListArrayData`] + fn downcast(data: ArrayDataList) -> Option> + where + Self: ListOffset; + + /// Cast [`ListArrayData`] to [`ArrayDataList`] + fn upcast(v: ListArrayData) -> ArrayDataList + where + Self: ListOffset; + } +} + +/// Types of offset used by variable length list arrays +pub trait ListOffset: private::ListOffsetSealed + ArrowNativeType { + const TYPE: OffsetType; +} + +impl ListOffset for i32 { + const TYPE: OffsetType = OffsetType::Int32; +} + +impl private::ListOffsetSealed for i32 { + fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData> + where + Self: ListOffset, + { + match data { + ArrayDataList::Small(v) => Some(v), + ArrayDataList::Large(_) => None, + } + } + + fn downcast(data: ArrayDataList) -> Option> + where + Self: ListOffset, + { + match data { + ArrayDataList::Small(v) => Some(v), + ArrayDataList::Large(_) => None, + } + } + + fn upcast(v: ListArrayData) -> ArrayDataList + where + Self: ListOffset, + { + ArrayDataList::Small(v) + } +} + +impl ListOffset for i64 { + const TYPE: OffsetType = OffsetType::Int64; +} + +impl private::ListOffsetSealed for i64 { + fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData> + where + Self: ListOffset, + { + match data { + ArrayDataList::Small(_) => None, + ArrayDataList::Large(v) => Some(v), + } + } + + fn downcast(data: ArrayDataList) -> Option> + where + Self: ListOffset, + { + match data { + ArrayDataList::Small(_) => None, + ArrayDataList::Large(v) => Some(v), + } + } + + fn upcast(v: ListArrayData) -> ArrayDataList + where + Self: ListOffset, + { + ArrayDataList::Large(v) + } +} + +/// An enumeration of the types of [`ListArrayData`] +#[derive(Debug, Clone)] +pub enum ArrayDataList { + Small(ListArrayData), + Large(ListArrayData), +} + +impl ArrayDataList { + /// Downcast this [`ArrayDataList`] to the corresponding [`ListArrayData`] + pub 
fn downcast_ref(&self) -> Option<&ListArrayData> { + O::downcast_ref(self) + } + + /// Downcast this [`ArrayDataList`] to the corresponding [`ListArrayData`] + pub fn downcast(self) -> Option> { + O::downcast(self) + } +} + +impl From> for ArrayDataList { + fn from(value: ListArrayData) -> Self { + O::upcast(value) + } +} + +/// ArrayData for [variable-size list arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout) +#[derive(Debug, Clone)] +pub struct ListArrayData { + data_type: DataType, + nulls: Option, + offsets: ScalarBuffer, + child: Box, +} + +impl ListArrayData { + /// Create a new [`ListArrayData`] + /// + /// # Safety + /// + /// - Each consecutive window of `offsets` must identify a valid slice of `child` + /// - `nulls.len() == offsets.len() - 1` + /// - `data_type` must be valid for this layout + pub unsafe fn new_unchecked( + data_type: DataType, + offsets: ScalarBuffer, + nulls: Option, + child: ArrayData, + ) -> Self { + Self { + data_type, + nulls, + offsets, + child: Box::new(child), + } + } + + /// Returns the null buffer if any + #[inline] + pub fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the offsets + #[inline] + pub fn offsets(&self) -> &[O] { + &self.offsets + } + + /// Returns the child data + #[inline] + pub fn child(&self) -> &ArrayData { + self.child.as_ref() + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} + +/// ArrayData for [fixed-size list arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-list-layout) +#[derive(Debug, Clone)] +pub struct FixedSizeListArrayData { + data_type: DataType, + nulls: Option, + child: Box, +} + +impl FixedSizeListArrayData { + /// Create a new [`FixedSizeListArrayData`] + /// + /// # Safety + /// + /// - `data_type` must be valid for this layout + /// - `nulls.len() == values.len() / element_size` + pub unsafe fn new_unchecked( + data_type: DataType, + nulls: Option, + child: ArrayData, + ) -> Self { + Self { + data_type, + nulls, + child: Box::new(child), + } + } + + /// Returns the null buffer if any + #[inline] + pub fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the child data + #[inline] + pub fn child(&self) -> &ArrayData { + self.child.as_ref() + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index eb1fe2bcffa2..2f9e142b1d96 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -32,9 +32,19 @@ use crate::equal; #[allow(unused)] // Private until ready (#1176) mod bytes; #[allow(unused)] // Private until ready (#1176) +mod dictionary; +#[allow(unused)] // Private until ready (#1176) +mod list; +#[allow(unused)] // Private until ready (#1176) mod primitive; #[allow(unused)] // Private until ready (#1176) +mod run; +#[allow(unused)] // Private until ready (#1176) +mod r#struct; +#[allow(unused)] // Private until ready (#1176) mod types; +#[allow(unused)] // Private until ready (#1176) +mod union; #[inline] pub(crate) fn contains_nulls( diff --git a/arrow-data/src/data/primitive.rs b/arrow-data/src/data/primitive.rs index d34ef42dbbb7..058b3e822056 100644 --- a/arrow-data/src/data/primitive.rs +++ b/arrow-data/src/data/primitive.rs @@ -43,13 +43,13 @@ mod private { } pub trait Primitive: private::PrimitiveSealed + ArrowNativeType { - const VARIANT: PrimitiveType; + 
const TYPE: PrimitiveType; } macro_rules! primitive { ($t:ty,$v:ident) => { impl Primitive for $t { - const VARIANT: PrimitiveType = PrimitiveType::$v; + const TYPE: PrimitiveType = PrimitiveType::$v; } impl private::PrimitiveSealed for $t { fn downcast_ref( @@ -118,7 +118,13 @@ impl ArrayDataPrimitive { } } -/// ArrayData for arrays of [`Primitive`] +impl From> for ArrayDataPrimitive { + fn from(value: PrimitiveArrayData
<P>
) -> Self { + P::upcast(value) + } +} + +/// ArrayData for [fixed size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) of [`Primitive`] #[derive(Debug, Clone)] pub struct PrimitiveArrayData { data_type: DataType, @@ -126,12 +132,6 @@ pub struct PrimitiveArrayData { values: ScalarBuffer, } -impl From> for ArrayDataPrimitive { - fn from(value: PrimitiveArrayData
<P>
) -> Self { - P::upcast(value) - } -} - impl PrimitiveArrayData { /// Create a new [`PrimitiveArrayData`] /// @@ -147,10 +147,10 @@ impl PrimitiveArrayData { ) -> Self { let physical = PhysicalType::from(&data_type); assert!( - matches!(physical, PhysicalType::Primitive(p) if p == T::VARIANT), + matches!(physical, PhysicalType::Primitive(p) if p == T::TYPE), "Illegal physical type for PrimitiveArrayData of datatype {:?}, expected {:?} got {:?}", data_type, - T::VARIANT, + T::TYPE, physical ); diff --git a/arrow-data/src/data/run.rs b/arrow-data/src/data/run.rs new file mode 100644 index 000000000000..cd993de1bf25 --- /dev/null +++ b/arrow-data/src/data/run.rs @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::data::types::RunEndType; +use crate::ArrayData; +use arrow_buffer::buffer::ScalarBuffer; +use arrow_buffer::ArrowNativeType; +use arrow_schema::DataType; +use std::marker::PhantomData; + +mod private { + use super::*; + + pub trait RunEndSealed { + /// Downcast [`ArrayDataRun`] to `[RunArrayData`] + fn downcast_ref(data: &ArrayDataRun) -> Option<&RunArrayData> + where + Self: RunEnd; + + /// Downcast [`ArrayDataRun`] to `[RunArrayData`] + fn downcast(data: ArrayDataRun) -> Option> + where + Self: RunEnd; + + /// Cast [`RunArrayData`] to [`ArrayDataRun`] + fn upcast(v: RunArrayData) -> ArrayDataRun + where + Self: RunEnd; + } +} + +pub trait RunEnd: private::RunEndSealed + ArrowNativeType { + const TYPE: RunEndType; +} + +macro_rules! 
run_end { + ($t:ty,$v:ident) => { + impl RunEnd for $t { + const TYPE: RunEndType = RunEndType::$v; + } + impl private::RunEndSealed for $t { + fn downcast_ref(data: &ArrayDataRun) -> Option<&RunArrayData> { + match data { + ArrayDataRun::$v(v) => Some(v), + _ => None, + } + } + + fn downcast(data: ArrayDataRun) -> Option> { + match data { + ArrayDataRun::$v(v) => Some(v), + _ => None, + } + } + + fn upcast(v: RunArrayData) -> ArrayDataRun { + ArrayDataRun::$v(v) + } + } + }; +} + +run_end!(i16, Int16); +run_end!(i32, Int32); +run_end!(i64, Int64); + +/// An enumeration of the types of [`RunArrayData`] +pub enum ArrayDataRun { + Int16(RunArrayData), + Int32(RunArrayData), + Int64(RunArrayData), +} + +impl ArrayDataRun { + /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`] + pub fn downcast_ref(&self) -> Option<&RunArrayData> { + E::downcast_ref(self) + } + + /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`] + pub fn downcast(self) -> Option> { + E::downcast(self) + } +} + +impl From> for ArrayDataRun { + fn from(value: RunArrayData) -> Self { + E::upcast(value) + } +} + +/// ArrayData for [run-end encoded arrays](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout) +pub struct RunArrayData { + data_type: DataType, + run_ends: ScalarBuffer, + child: Box, +} + +impl RunArrayData { + /// Create a new [`RunArrayData`] + /// + /// # Safety + /// + /// - `data_type` must be valid for this layout + /// - `run_ends` must contain monotonically increasing, positive values `<= child.len()` + pub unsafe fn new_unchecked( + data_type: DataType, + run_ends: ScalarBuffer, + child: ArrayData, + ) -> Self { + Self { + data_type, + run_ends, + child: Box::new(child), + } + } + + /// Returns the run ends + #[inline] + pub fn run_ends(&self) -> &[E] { + &self.run_ends + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } + + /// Returns the child data + #[inline] + pub fn child(&self) -> &ArrayData { + self.child.as_ref() + } +} diff --git a/arrow-data/src/data/struct.rs b/arrow-data/src/data/struct.rs new file mode 100644 index 000000000000..d9999261902e --- /dev/null +++ b/arrow-data/src/data/struct.rs @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::ArrayData; +use arrow_buffer::buffer::NullBuffer; +use arrow_schema::DataType; + +/// ArrayData for [struct arrays](https://arrow.apache.org/docs/format/Columnar.html#struct-layout) +#[derive(Debug, Clone)] +pub struct StructArrayData { + data_type: DataType, + len: usize, + nulls: Option, + children: Vec, +} + +impl StructArrayData { + /// Create a new [`StructArrayData`] + /// + /// # Safety + /// + /// - data_type must be a StructArray with fields matching `child_data` + /// - all child data and nulls must have length matching `len` + pub unsafe fn new_unchecked( + data_type: DataType, + len: usize, + nulls: Option, + children: Vec, + ) -> Self { + Self { + data_type, + len, + nulls, + children, + } + } + + /// Returns the length of this [`StructArrayData`] + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Returns `true` if this [`StructArrayData`] has zero length + #[inline] + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the null buffer if any + #[inline] + pub fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the primitive values + #[inline] + pub fn children(&self) -> &[ArrayData] { + &self.children + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} diff --git a/arrow-data/src/data/types.rs b/arrow-data/src/data/types.rs index 09e169f6aa61..3414e481ca66 100644 --- a/arrow-data/src/data/types.rs +++ b/arrow-data/src/data/types.rs @@ -80,7 +80,6 @@ pub enum PhysicalType { Bytes(OffsetType, BytesType), FixedSizeList, List(OffsetType), - Map, Struct, Union, Dictionary(DictionaryKeyType), @@ -141,7 +140,7 @@ impl From<&DataType> for PhysicalType { DataType::UInt64 => Self::Dictionary(DictionaryKeyType::UInt64), d => panic!("illegal dictionary key data type {d}"), }, - DataType::Map(_, _) => Self::Map, + DataType::Map(_, _) => Self::List(OffsetType::Int32), DataType::RunEndEncoded(f, _) => match f.data_type() { DataType::Int16 => Self::Run(RunEndType::Int16), DataType::Int32 => Self::Run(RunEndType::Int32), diff --git a/arrow-data/src/data/union.rs b/arrow-data/src/data/union.rs new file mode 100644 index 000000000000..7861bd154e71 --- /dev/null +++ b/arrow-data/src/data/union.rs @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
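// Illustrative sketch, not part of this patch: the `StructArrayData` accessors
// above are sufficient to check the length invariant that `new_unchecked`
// documents (every child matches `len`). The helper name `children_match_len`
// is hypothetical.
fn children_match_len(s: &StructArrayData) -> bool {
    s.children().iter().all(|child| child.len() == s.len())
}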
+ +use crate::ArrayData; +use arrow_buffer::buffer::ScalarBuffer; +use arrow_schema::DataType; + +/// ArrayData for [union arrays](https://arrow.apache.org/docs/format/Columnar.html#union-layout) +#[derive(Debug, Clone)] +pub struct UnionArrayData { + data_type: DataType, + type_ids: ScalarBuffer, + offsets: Option>, + children: Vec, +} + +impl UnionArrayData { + /// Creates a new [`UnionArrayData`] + /// + /// # Safety + /// + /// - `data_type` must be valid for this layout + /// - `type_ids` must only contain values corresponding to a field in `data_type` + /// - `children` must match the field definitions in `data_type` + /// - For each value id in type_ids, the corresponding offset, must be in bounds for the child + pub unsafe fn new_unchecked( + data_type: DataType, + type_ids: ScalarBuffer, + offsets: Option>, + children: Vec, + ) -> Self { + Self { + data_type, + type_ids, + offsets, + children, + } + } + + /// Returns the type ids for this array + #[inline] + pub fn type_ids(&self) -> &[i8] { + &self.type_ids + } + + /// Returns the offsets for this array if this is a dense union + #[inline] + pub fn offsets(&self) -> Option<&[i32]> { + self.offsets.as_deref() + } + + /// Returns the children of this array + #[inline] + pub fn children(&self) -> &[ArrayData] { + &self.children + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} From eff058fc7a156d1b22569bd60a747d98960d97e7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Mar 2023 12:57:30 +0000 Subject: [PATCH 0642/1411] Use NullBuffer in ArrayData (#3775) (#3778) * Use NullBuffer in ArrayData (#3775) * Clippy * Format * Doc * Tweaks * Review feedback --- arrow-arith/src/aggregate.rs | 28 +- arrow-arith/src/arithmetic.rs | 11 +- arrow-arith/src/arity.rs | 14 +- arrow-arith/src/boolean.rs | 125 +++++---- arrow-array/src/array/binary_array.rs | 2 +- arrow-array/src/array/boolean_array.rs | 14 +- arrow-array/src/array/byte_array.rs | 5 +- arrow-array/src/array/dictionary_array.rs | 33 +-- .../src/array/fixed_size_binary_array.rs | 2 +- arrow-array/src/array/null_array.rs | 2 +- arrow-array/src/array/primitive_array.rs | 18 +- arrow-array/src/array/struct_array.rs | 17 +- arrow-array/src/builder/boolean_builder.rs | 4 +- .../src/builder/generic_bytes_builder.rs | 4 +- arrow-array/src/builder/struct_builder.rs | 6 +- arrow-array/src/lib.rs | 6 +- arrow-array/src/record_batch.rs | 2 +- arrow-buffer/src/buffer/boolean.rs | 60 ++++- arrow-buffer/src/buffer/null.rs | 38 ++- arrow-cast/src/cast.rs | 62 ++--- arrow-data/src/bit_mask.rs | 5 +- arrow-data/src/bitmap.rs | 189 ------------- arrow-data/src/data/mod.rs | 253 ++++++++++-------- arrow-data/src/equal/boolean.rs | 18 +- arrow-data/src/equal/dictionary.rs | 12 +- arrow-data/src/equal/fixed_binary.rs | 21 +- arrow-data/src/equal/fixed_list.rs | 11 +- arrow-data/src/equal/list.rs | 13 +- arrow-data/src/equal/primitive.rs | 21 +- arrow-data/src/equal/structure.rs | 11 +- arrow-data/src/equal/utils.rs | 19 +- arrow-data/src/equal/variable_size.rs | 16 +- arrow-data/src/ffi.rs | 29 +- arrow-data/src/lib.rs | 2 - arrow-data/src/transform/mod.rs | 15 +- arrow-ipc/src/reader.rs | 2 - arrow-ipc/src/writer.rs | 5 +- arrow-json/src/raw/list_array.rs | 9 +- arrow-json/src/raw/map_array.rs | 10 +- arrow-json/src/raw/mod.rs | 10 +- arrow-json/src/raw/struct_array.rs | 10 +- arrow-json/src/reader.rs | 6 +- arrow-ord/src/comparison.rs | 15 +- arrow-ord/src/sort.rs | 1 - 
arrow-row/src/list.rs | 3 +- arrow-select/src/filter.rs | 8 +- arrow-select/src/nullif.rs | 22 +- arrow-select/src/take.rs | 26 +- arrow-string/src/length.rs | 5 +- arrow-string/src/regexp.rs | 2 +- arrow-string/src/substring.rs | 20 +- arrow/src/lib.rs | 4 - arrow/tests/array_validation.rs | 12 +- parquet/src/arrow/arrow_writer/levels.rs | 16 +- .../arrow/record_reader/definition_levels.rs | 7 +- parquet/src/arrow/record_reader/mod.rs | 15 +- 56 files changed, 577 insertions(+), 719 deletions(-) delete mode 100644 arrow-data/src/bitmap.rs diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index b578dbd4a94c..7777bb0ede43 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -117,8 +117,8 @@ where .map(|i| unsafe { array.value_unchecked(i) }) .reduce(|acc, item| if cmp(&acc, &item) { item } else { acc }) } else { - let null_buffer = array.data_ref().null_buffer().unwrap(); - let iter = BitIndexIterator::new(null_buffer, array.offset(), array.len()); + let nulls = array.data().nulls().unwrap(); + let iter = BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len()); unsafe { let idx = iter.reduce(|acc_idx, idx| { let acc = array.value_unchecked(acc_idx); @@ -288,7 +288,7 @@ where let data: &[T::Native] = array.values(); - match array.data().null_buffer() { + match array.data().nulls() { None => { let sum = data.iter().fold(T::default_value(), |accumulator, value| { accumulator.add_wrapping(*value) @@ -296,12 +296,12 @@ where Some(sum) } - Some(buffer) => { + Some(nulls) => { let mut sum = T::default_value(); let data_chunks = data.chunks_exact(64); let remainder = data_chunks.remainder(); - let bit_chunks = buffer.bit_chunks(array.offset(), array.len()); + let bit_chunks = nulls.inner().bit_chunks(); data_chunks .zip(bit_chunks.iter()) .for_each(|(chunk, mask)| { @@ -347,7 +347,7 @@ where let data: &[T::Native] = array.values(); - match array.data().null_buffer() { + match array.data().nulls() { None => { let sum = data .iter() @@ -357,14 +357,14 @@ where Ok(Some(sum)) } - Some(buffer) => { + Some(nulls) => { let mut sum = T::default_value(); try_for_each_valid_idx( - array.len(), - array.offset(), - null_count, - Some(buffer.as_slice()), + nulls.len(), + nulls.offset(), + nulls.null_count(), + Some(nulls.validity()), |idx| { unsafe { sum = sum.add_checked(array.value_unchecked(idx))? 
}; Ok::<_, ArrowError>(()) @@ -665,7 +665,7 @@ mod simd { let mut chunk_acc = A::init_accumulator_chunk(); let mut rem_acc = A::init_accumulator_scalar(); - match array.data().null_buffer() { + match array.data().nulls() { None => { let data_chunks = data.chunks_exact(64); let remainder = data_chunks.remainder(); @@ -681,12 +681,12 @@ mod simd { A::accumulate_scalar(&mut rem_acc, *value); }); } - Some(buffer) => { + Some(nulls) => { // process data in chunks of 64 elements since we also get 64 bits of validity information at a time let data_chunks = data.chunks_exact(64); let remainder = data_chunks.remainder(); - let bit_chunks = buffer.bit_chunks(array.offset(), array.len()); + let bit_chunks = nulls.inner().bit_chunks(); let remainder_bits = bit_chunks.remainder_bits(); data_chunks.zip(bit_chunks).for_each(|(chunk, mut mask)| { diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 40e7d6780377..0fb559f0651f 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -1572,6 +1572,7 @@ mod tests { use arrow_array::builder::{ BooleanBufferBuilder, BufferBuilder, PrimitiveDictionaryBuilder, }; + use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_buffer::i256; use arrow_data::ArrayDataBuilder; use chrono::NaiveDate; @@ -3057,15 +3058,19 @@ mod tests { // `count_set_bits_offset` takes len in bits as parameter. assert_eq!(null_buffer.count_set_bits_offset(0, 13), 0); + let nulls = BooleanBuffer::new(null_buffer, 0, 13); + assert_eq!(nulls.count_set_bits(), 0); + let nulls = NullBuffer::new(nulls); + assert_eq!(nulls.null_count(), 13); + let mut data_buffer_builder = BufferBuilder::::new(13); data_buffer_builder.append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); let data_buffer = data_buffer_builder.finish(); let arg1: Int32Array = ArrayDataBuilder::new(DataType::Int32) .len(13) - .null_count(13) + .nulls(Some(nulls)) .buffers(vec![data_buffer]) - .null_bit_buffer(Some(null_buffer)) .build() .unwrap() .into(); @@ -3078,9 +3083,7 @@ mod tests { let arg2: Int32Array = ArrayDataBuilder::new(DataType::Int32) .len(13) - .null_count(0) .buffers(vec![data_buffer]) - .null_bit_buffer(None) .build() .unwrap() .into(); diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 3e7a81862927..ea078765df1a 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -20,6 +20,7 @@ use arrow_array::builder::BufferBuilder; use arrow_array::iterator::ArrayIter; use arrow_array::*; +use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_buffer::{Buffer, MutableBuffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::bit_mask::combine_option_bitmap; @@ -276,10 +277,7 @@ where let len = a.len(); let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len); - let null_count = null_buffer - .as_ref() - .map(|x| len - x.count_set_bits_offset(0, len)) - .unwrap_or_default(); + let nulls = null_buffer.map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, len))); let mut builder = a.into_builder()?; @@ -289,13 +287,7 @@ where .zip(b.values()) .for_each(|(l, r)| *l = op(*l, *r)); - let array_builder = builder - .finish() - .data() - .clone() - .into_builder() - .null_bit_buffer(null_buffer) - .null_count(null_count); + let array_builder = builder.finish().into_data().into_builder().nulls(nulls); let array_data = unsafe { array_builder.build_unchecked() }; Ok(Ok(PrimitiveArray::::from(array_data))) diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index 
4c1a02ad7498..5bd39a673426 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -39,16 +39,13 @@ use arrow_schema::{ArrowError, DataType}; /// of one side if other side is a false value. pub(crate) fn build_null_buffer_for_and_kleene( left_data: &ArrayData, - left_offset: usize, right_data: &ArrayData, - right_offset: usize, - len_in_bits: usize, ) -> Option { let left_buffer = &left_data.buffers()[0]; let right_buffer = &right_data.buffers()[0]; - let left_null_buffer = left_data.null_buffer(); - let right_null_buffer = right_data.null_buffer(); + let left_null_buffer = left_data.nulls(); + let right_null_buffer = right_data.nulls(); match (left_null_buffer, right_null_buffer) { (None, None) => None, @@ -58,22 +55,22 @@ pub(crate) fn build_null_buffer_for_and_kleene( // 1. left null bit is set, or // 2. right data bit is false (because null AND false = false). Some(bitwise_bin_op_helper( - left_null_buffer, - left_offset, + left_null_buffer.buffer(), + left_null_buffer.offset(), right_buffer, - right_offset, - len_in_bits, + right_data.offset(), + left_data.len(), |a, b| a | !b, )) } (None, Some(right_null_buffer)) => { // Same as above Some(bitwise_bin_op_helper( - right_null_buffer, - right_offset, + right_null_buffer.buffer(), + right_null_buffer.offset(), left_buffer, - left_offset, - len_in_bits, + left_data.offset(), + left_data.len(), |a, b| a | !b, )) } @@ -85,13 +82,18 @@ pub(crate) fn build_null_buffer_for_and_kleene( // (a | (c & !d)) & (c | (a & !b)) Some(bitwise_quaternary_op_helper( [ - left_null_buffer, + left_null_buffer.buffer(), left_buffer, - right_null_buffer, + right_null_buffer.buffer(), right_buffer, ], - [left_offset, left_offset, right_offset, right_offset], - len_in_bits, + [ + left_null_buffer.offset(), + left_data.offset(), + right_null_buffer.offset(), + right_data.offset(), + ], + left_data.len(), |a, b, c, d| (a | (c & !d)) & (c | (a & !b)), )) } @@ -101,13 +103,10 @@ pub(crate) fn build_null_buffer_for_and_kleene( /// For AND/OR kernels, the result of null buffer is simply a bitwise `and` operation. pub(crate) fn build_null_buffer_for_and_or( left_data: &ArrayData, - _left_offset: usize, right_data: &ArrayData, - _right_offset: usize, - len_in_bits: usize, ) -> Option { // `arrays` are not empty, so safely do `unwrap` directly. - combine_option_bitmap(&[left_data, right_data], len_in_bits) + combine_option_bitmap(&[left_data, right_data], left_data.len()) } /// Updates null buffer based on data buffer and null buffer of the operand at other side @@ -116,45 +115,39 @@ pub(crate) fn build_null_buffer_for_and_or( /// buffer of one side if other side is a true value. pub(crate) fn build_null_buffer_for_or_kleene( left_data: &ArrayData, - left_offset: usize, right_data: &ArrayData, - right_offset: usize, - len_in_bits: usize, ) -> Option { let left_buffer = &left_data.buffers()[0]; let right_buffer = &right_data.buffers()[0]; - let left_null_buffer = left_data.null_buffer(); - let right_null_buffer = right_data.null_buffer(); - - match (left_null_buffer, right_null_buffer) { + match (left_data.nulls(), right_data.nulls()) { (None, None) => None, - (Some(left_null_buffer), None) => { + (Some(left_nulls), None) => { // The right side has no null values. // The final null bit is set only if: // 1. left null bit is set, or // 2. right data bit is true (because null OR true = true). 
Some(bitwise_bin_op_helper( - left_null_buffer, - left_offset, + left_nulls.buffer(), + left_nulls.offset(), right_buffer, - right_offset, - len_in_bits, + right_data.offset(), + right_data.len(), |a, b| a | b, )) } - (None, Some(right_null_buffer)) => { + (None, Some(right_nulls)) => { // Same as above Some(bitwise_bin_op_helper( - right_null_buffer, - right_offset, + right_nulls.buffer(), + right_nulls.offset(), left_buffer, - left_offset, - len_in_bits, + left_data.offset(), + left_data.len(), |a, b| a | b, )) } - (Some(left_null_buffer), Some(right_null_buffer)) => { + (Some(left_nulls), Some(right_nulls)) => { // Follow the same logic above. Both sides have null values. // Assume a is left null bits, b is left data bits, c is right null bits, // d is right data bits. @@ -162,13 +155,18 @@ pub(crate) fn build_null_buffer_for_or_kleene( // (a | (c & d)) & (c | (a & b)) Some(bitwise_quaternary_op_helper( [ - left_null_buffer, + left_nulls.buffer(), left_buffer, - right_null_buffer, + right_nulls.buffer(), right_buffer, ], - [left_offset, left_offset, right_offset, right_offset], - len_in_bits, + [ + left_nulls.offset(), + left_data.offset(), + right_nulls.offset(), + right_data.offset(), + ], + left_data.len(), |a, b, c, d| (a | (c & d)) & (c | (a & b)), )) } @@ -184,7 +182,7 @@ pub(crate) fn binary_boolean_kernel( ) -> Result where F: Fn(&Buffer, usize, &Buffer, usize, usize) -> Buffer, - U: Fn(&ArrayData, usize, &ArrayData, usize, usize) -> Option, + U: Fn(&ArrayData, &ArrayData) -> Option, { if left.len() != right.len() { return Err(ArrowError::ComputeError( @@ -202,7 +200,7 @@ where let left_offset = left.offset(); let right_offset = right.offset(); - let null_bit_buffer = null_op(left_data, left_offset, right_data, right_offset, len); + let null_bit_buffer = null_op(left_data, right_data); let values = op(left_buffer, left_offset, right_buffer, right_offset, len); @@ -353,10 +351,7 @@ pub fn not(left: &BooleanArray) -> Result { let len = left.len(); let data = left.data_ref(); - let null_bit_buffer = data - .null_bitmap() - .as_ref() - .map(|b| b.buffer().bit_slice(left_offset, len)); + let null_bit_buffer = data.nulls().map(|b| b.inner().sliced()); let values = buffer_unary_not(&data.buffers()[0], left_offset, len); @@ -388,12 +383,12 @@ pub fn not(left: &BooleanArray) -> Result { pub fn is_null(input: &dyn Array) -> Result { let len = input.len(); - let output = match input.data_ref().null_buffer() { + let output = match input.data_ref().nulls() { None => { let len_bytes = ceil(len, 8); MutableBuffer::from_len_zeroed(len_bytes).into() } - Some(buffer) => buffer_unary_not(buffer, input.offset(), len), + Some(nulls) => buffer_unary_not(nulls.buffer(), nulls.offset(), nulls.len()), }; let data = unsafe { @@ -425,14 +420,14 @@ pub fn is_null(input: &dyn Array) -> Result { pub fn is_not_null(input: &dyn Array) -> Result { let len = input.len(); - let output = match input.data_ref().null_buffer() { + let output = match input.data_ref().nulls() { None => { let len_bytes = ceil(len, 8); MutableBuffer::new(len_bytes) .with_bitset(len_bytes, true) .into() } - Some(buffer) => buffer.bit_slice(input.offset(), len), + Some(nulls) => nulls.inner().sliced(), }; let data = unsafe { @@ -615,7 +610,7 @@ mod tests { let a = BooleanArray::from(vec![false, false, false, true, true, true]); // ensure null bitmap of a is absent - assert!(a.data_ref().null_bitmap().is_none()); + assert!(a.data().nulls().is_none()); let b = BooleanArray::from(vec![ Some(true), @@ -627,7 +622,7 @@ mod tests { ]); // ensure 
null bitmap of b is present - assert!(b.data_ref().null_bitmap().is_some()); + assert!(b.data().nulls().is_some()); let c = or_kleene(&a, &b).unwrap(); @@ -655,12 +650,12 @@ mod tests { ]); // ensure null bitmap of b is absent - assert!(a.data_ref().null_bitmap().is_some()); + assert!(a.data().nulls().is_some()); let b = BooleanArray::from(vec![false, false, false, true, true, true]); // ensure null bitmap of a is present - assert!(b.data_ref().null_bitmap().is_none()); + assert!(b.data().nulls().is_none()); let c = or_kleene(&a, &b).unwrap(); @@ -857,7 +852,7 @@ mod tests { let expected = BooleanArray::from(vec![false, false, false, false]); assert_eq!(expected, res); - assert_eq!(None, res.data_ref().null_bitmap()); + assert!(res.data().nulls().is_none()); } #[test] @@ -870,7 +865,7 @@ mod tests { let expected = BooleanArray::from(vec![false, false, false, false]); assert_eq!(expected, res); - assert_eq!(None, res.data_ref().null_bitmap()); + assert!(res.data().nulls().is_none()); } #[test] @@ -882,7 +877,7 @@ mod tests { let expected = BooleanArray::from(vec![true, true, true, true]); assert_eq!(expected, res); - assert_eq!(None, res.data_ref().null_bitmap()); + assert!(res.data().nulls().is_none()); } #[test] @@ -895,7 +890,7 @@ mod tests { let expected = BooleanArray::from(vec![true, true, true, true]); assert_eq!(expected, res); - assert_eq!(None, res.data_ref().null_bitmap()); + assert!(res.data().nulls().is_none()); } #[test] @@ -907,7 +902,7 @@ mod tests { let expected = BooleanArray::from(vec![false, true, false, true]); assert_eq!(expected, res); - assert_eq!(None, res.data_ref().null_bitmap()); + assert!(res.data().nulls().is_none()); } #[test] @@ -938,7 +933,7 @@ mod tests { let expected = BooleanArray::from(vec![false, true, false, true]); assert_eq!(expected, res); - assert_eq!(None, res.data_ref().null_bitmap()); + assert!(res.data().nulls().is_none()); } #[test] @@ -950,7 +945,7 @@ mod tests { let expected = BooleanArray::from(vec![true, false, true, false]); assert_eq!(expected, res); - assert_eq!(None, res.data_ref().null_bitmap()); + assert!(res.data().nulls().is_none()); } #[test] @@ -981,6 +976,6 @@ mod tests { let expected = BooleanArray::from(vec![true, false, true, false]); assert_eq!(expected, res); - assert_eq!(None, res.data_ref().null_bitmap()); + assert!(res.data().nulls().is_none()); } } diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 50757dcbe1b6..1a3270a70d80 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -77,7 +77,7 @@ impl GenericBinaryArray { .offset(v.offset()) .add_buffer(v.data_ref().buffers()[0].clone()) .add_buffer(child_data.buffers()[0].slice(child_data.offset())) - .null_bit_buffer(v.data_ref().null_buffer().cloned()); + .nulls(v.data().nulls().cloned()); let data = unsafe { builder.build_unchecked() }; Self::from(data) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 8d1296c662fc..e924824e75ea 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -105,9 +105,9 @@ impl BooleanArray { /// Returns the number of non null, true values within this array pub fn true_count(&self) -> usize { - match self.data.null_buffer() { + match self.data.nulls() { Some(nulls) => { - let null_chunks = nulls.bit_chunks(self.offset(), self.len()); + let null_chunks = nulls.inner().bit_chunks(); let value_chunks = self.values().bit_chunks(self.offset(), self.len()); null_chunks 
.iter() @@ -187,11 +187,7 @@ impl BooleanArray { where F: FnMut(T::Item) -> bool, { - let null_bit_buffer = left - .data() - .null_buffer() - .map(|b| b.bit_slice(left.offset(), left.len())); - + let null_bit_buffer = left.data().nulls().map(|x| x.inner().sliced()); let buffer = MutableBuffer::collect_bool(left.len(), |i| unsafe { // SAFETY: i in range 0..len op(left.value_unchecked(i)) @@ -459,7 +455,7 @@ mod tests { assert_eq!(4, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); - assert!(arr.data().null_buffer().is_none()); + assert!(arr.data().nulls().is_none()); for i in 0..3 { assert!(!arr.is_null(i)); assert!(arr.is_valid(i)); @@ -474,7 +470,7 @@ mod tests { assert_eq!(4, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(2, arr.null_count()); - assert!(arr.data().null_buffer().is_some()); + assert!(arr.data().nulls().is_some()); assert!(arr.is_valid(0)); assert!(arr.is_null(1)); diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index f6946228c85c..442e795cec52 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -137,10 +137,7 @@ impl GenericByteArray { /// offset and data buffers are not shared by others. pub fn into_builder(self) -> Result, Self> { let len = self.len(); - let null_bit_buffer = self - .data - .null_buffer() - .map(|b| b.bit_slice(self.data.offset(), len)); + let null_bit_buffer = self.data.nulls().map(|b| b.inner().sliced()); let element_len = std::mem::size_of::(); let offset_buffer = self.data.buffers()[0] diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index eb2f1b606bb1..60426e5b3c4d 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -249,23 +249,15 @@ impl DictionaryArray { // Note: This use the ArrayDataBuilder::build_unchecked and afterwards // call the new function which only validates that the keys are in bounds. 
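// Rebuilding from `keys.data().clone().into_builder()` below keeps the key
// buffer, offset and null information intact; only the data type is changed and
// the dictionary values child is added before `build_unchecked`.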
- let mut data = ArrayData::builder(dict_data_type) - .len(keys.len()) - .add_buffer(keys.data().buffers()[0].clone()) + let data = keys.data().clone(); + let builder = data + .into_builder() + .data_type(dict_data_type) .add_child_data(values.data().clone()); - match keys.data().null_buffer() { - Some(buffer) if keys.data().null_count() > 0 => { - data = data - .null_bit_buffer(Some(buffer.clone())) - .null_count(keys.data().null_count()); - } - _ => data = data.null_count(0), - } - // Safety: `validate` ensures key type is correct, and // `validate_values` ensures all offsets are within range - let array = unsafe { data.build_unchecked() }; + let array = unsafe { builder.build_unchecked() }; array.validate()?; array.validate_values()?; @@ -430,16 +422,13 @@ impl From for DictionaryArray { // create a zero-copy of the keys' data // SAFETY: // ArrayData is valid and verified type above + let keys = PrimitiveArray::::from(unsafe { - ArrayData::new_unchecked( - T::DATA_TYPE, - data.len(), - Some(data.null_count()), - data.null_buffer().cloned(), - data.offset(), - data.buffers().to_vec(), - vec![], - ) + data.clone() + .into_builder() + .data_type(T::DATA_TYPE) + .child_data(vec![]) + .build_unchecked() }); let values = make_array(data.child_data()[0].clone()); Self { diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 89ace430d8af..e927c8d8ae58 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -408,7 +408,7 @@ impl From for FixedSizeBinaryArray { .len(v.len()) .offset(v.offset()) .add_buffer(child_data.buffers()[0].slice(child_data.offset())) - .null_bit_buffer(v.data_ref().null_buffer().cloned()); + .nulls(v.data_ref().nulls().cloned()); let data = unsafe { builder.build_unchecked() }; Self::from(data) diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index 6b68aace706f..8eb8e64b0eda 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -99,7 +99,7 @@ impl From for NullArray { "NullArray data should contain 0 buffers" ); assert!( - data.null_buffer().is_none(), + data.nulls().is_none(), "NullArray data should not contain a null buffer, as no buffers are required" ); Self { data } diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 53217a06f497..0e28060b25f8 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -443,7 +443,7 @@ impl PrimitiveArray { let len = self.len(); let null_count = self.null_count(); - let null_buffer = data.null_buffer().map(|b| b.bit_slice(data.offset(), len)); + let null_buffer = data.nulls().map(|b| b.inner().sliced()); let values = self.values().iter().map(|v| op(*v)); // JUSTIFICATION // Benefit @@ -500,7 +500,7 @@ impl PrimitiveArray { let len = self.len(); let null_count = self.null_count(); - let null_buffer = data.null_buffer().map(|b| b.bit_slice(data.offset(), len)); + let null_buffer = data.nulls().map(|b| b.inner().sliced()); let mut buffer = BufferBuilder::::new(len); buffer.append_n_zeroed(len); let slice = buffer.as_slice_mut(); @@ -567,9 +567,10 @@ impl PrimitiveArray { { let data = self.data(); let len = data.len(); - let offset = data.offset(); - let null_count = data.null_count(); - let nulls = data.null_buffer().map(|x| x.as_slice()); + let (nulls, null_count, offset) = match data.nulls() { + Some(n) => (Some(n.validity()), 
n.null_count(), n.offset()), + None => (None, 0, 0), + }; let mut null_builder = BooleanBufferBuilder::new(len); match nulls { @@ -608,10 +609,7 @@ impl PrimitiveArray { /// data buffer is not shared by others. pub fn into_builder(self) -> Result, Self> { let len = self.len(); - let null_bit_buffer = self - .data - .null_buffer() - .map(|b| b.bit_slice(self.data.offset(), len)); + let null_bit_buffer = self.data.nulls().map(|b| b.inner().sliced()); let element_len = std::mem::size_of::(); let buffer = self.data.buffers()[0] @@ -1791,7 +1789,7 @@ mod tests { let primitive_array = PrimitiveArray::::from_iter(iter); assert_eq!(primitive_array.len(), 10); assert_eq!(primitive_array.null_count(), 0); - assert_eq!(primitive_array.data().null_buffer(), None); + assert!(primitive_array.data().nulls().is_none()); assert_eq!(primitive_array.values(), &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) } diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 9149895f6ec9..35d4444e0117 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -154,22 +154,20 @@ impl TryFrom> for StructArray { fields.push(Field::new( field_name, array.data_type().clone(), - child_datum.null_buffer().is_some(), + child_datum.nulls().is_some(), )); - if let Some(child_null_buffer) = child_datum.null_buffer() { - let child_datum_offset = child_datum.offset(); - + if let Some(child_nulls) = child_datum.nulls() { null = Some(if let Some(null_buffer) = &null { buffer_bin_or( null_buffer, 0, - child_null_buffer, - child_datum_offset, + child_nulls.buffer(), + child_nulls.offset(), child_datum_len, ) } else { - child_null_buffer.bit_slice(child_datum_offset, child_datum_len) + child_nulls.inner().sliced() }); } else if null.is_some() { // when one of the fields has no nulls, then there is no null in the array @@ -321,7 +319,6 @@ mod tests { BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, StringArray, }; use arrow_buffer::ToByteSlice; - use arrow_data::Bitmap; use std::sync::Arc; #[test] @@ -410,8 +407,8 @@ mod tests { assert_eq!(1, struct_data.null_count()); assert_eq!( // 00001011 - Some(&Bitmap::from(Buffer::from(&[11_u8]))), - struct_data.null_bitmap() + &[11_u8], + struct_data.nulls().unwrap().validity() ); let expected_string_data = ArrayData::builder(DataType::Utf8) diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index eeb39b802948..0862b35b07e0 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -289,7 +289,7 @@ mod tests { let array = builder.finish(); assert_eq!(0, array.null_count()); - assert!(array.data().null_buffer().is_none()); + assert!(array.data().nulls().is_none()); } #[test] @@ -311,7 +311,7 @@ mod tests { assert_eq!(4, array.false_count()); assert_eq!(0, array.null_count()); - assert!(array.data().null_buffer().is_none()); + assert!(array.data().nulls().is_none()); } #[test] diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index 406e79c3169c..c723b3349930 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -425,7 +425,7 @@ mod tests { builder.append_value("parquet"); let arr = builder.finish(); // array should not have null buffer because there is not `null` value. 
- assert_eq!(None, arr.data().null_buffer()); + assert!(arr.data().nulls().is_none()); assert_eq!(GenericStringArray::::from(vec!["arrow", "parquet"]), arr,) } @@ -454,7 +454,7 @@ mod tests { builder.append_value("parquet"); arr = builder.finish(); - assert!(arr.data().null_buffer().is_some()); + assert!(arr.data().nulls().is_some()); assert_eq!(&[O::zero()], builder.offsets_slice()); assert_eq!(5, arr.len()); } diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 72aa53e189dd..51b4c7cfcdc6 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -284,7 +284,6 @@ impl StructBuilder { mod tests { use super::*; use arrow_buffer::Buffer; - use arrow_data::Bitmap; use crate::array::Array; @@ -329,10 +328,7 @@ mod tests { let struct_data = arr.data(); assert_eq!(4, struct_data.len()); assert_eq!(1, struct_data.null_count()); - assert_eq!( - Some(&Bitmap::from(Buffer::from(&[11_u8]))), - struct_data.null_bitmap() - ); + assert_eq!(&[11_u8], struct_data.nulls().unwrap().validity()); let expected_string_data = ArrayData::builder(DataType::Utf8) .len(4) diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 400b6e262faa..bfdc35c6ce5d 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -141,18 +141,18 @@ //! //! For example, the type [`Int16Array`] represents an array of 16-bit integers and consists of: //! -//! * An optional [`Bitmap`] identifying any null values +//! * An optional [`NullBuffer`] identifying any null values //! * A contiguous [`Buffer`] of 16-bit integers //! //! Similarly, the type [`StringArray`] represents an array of UTF-8 strings and consists of: //! -//! * An optional [`Bitmap`] identifying any null values +//! * An optional [`NullBuffer`] identifying any null values //! * An offsets [`Buffer`] of 32-bit integers identifying valid UTF-8 sequences within the values buffer //! * A values [`Buffer`] of UTF-8 encoded string data //! //! [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html //! [`&dyn Array`]: Array -//! [`Bitmap`]: arrow_data::Bitmap +//! [`NullBuffer`]: arrow_buffer::buffer::NullBuffer //! [`Buffer`]: arrow_buffer::Buffer //! [`compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html //! [`json`]: https://docs.rs/arrow/latest/arrow/json/index.html diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 04a559f21603..20e4e19bad39 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -603,7 +603,7 @@ mod tests { let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) .unwrap(); - assert_eq!(record_batch.get_array_memory_size(), 640); + assert_eq!(record_batch.get_array_memory_size(), 672); } fn check_batch(record_batch: RecordBatch, num_rows: usize) { diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 82755a2b0a27..0239111cbafe 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -15,16 +15,33 @@ // specific language governing permissions and limitations // under the License. 
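// Illustrative sketch, not part of this patch: the array composition described
// in the arrow-array docs above (an optional NullBuffer plus a contiguous value
// Buffer), observed through the public API. Values behind null slots are
// unspecified, so only validity is asserted.
fn null_buffer_composition() {
    use arrow_array::{Array, Int16Array};

    let a = Int16Array::from(vec![Some(1i16), None, Some(3)]);
    assert_eq!(a.len(), 3);         // three logical slots
    assert_eq!(a.null_count(), 1);  // tracked by the optional NullBuffer
    assert!(a.is_null(1));          // slot 1 carries no value
}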
+use crate::bit_chunk_iterator::BitChunks; use crate::{bit_util, Buffer}; /// A slice-able [`Buffer`] containing bit-packed booleans -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Eq)] pub struct BooleanBuffer { buffer: Buffer, offset: usize, len: usize, } +impl PartialEq for BooleanBuffer { + fn eq(&self, other: &Self) -> bool { + if self.len != other.len { + return false; + } + + let lhs = self.bit_chunks(); + let rhs = other.bit_chunks(); + + if lhs.iter().zip(rhs.iter()).any(|(a, b)| a != b) { + return false; + } + lhs.remainder_bits() == rhs.remainder_bits() + } +} + impl BooleanBuffer { /// Create a new [`BooleanBuffer`] from a [`Buffer`], an `offset` and `length` in bits /// @@ -47,6 +64,12 @@ impl BooleanBuffer { self.buffer.count_set_bits_offset(self.offset, self.len) } + /// Returns a `BitChunks` instance which can be used to iterate over + /// this buffer's bits in `u64` chunks + pub fn bit_chunks(&self) -> BitChunks { + BitChunks::new(self.values(), self.offset, self.len) + } + /// Returns `true` if the bit at index `i` is set /// /// # Panics @@ -81,4 +104,39 @@ impl BooleanBuffer { pub fn values(&self) -> &[u8] { &self.buffer } + + /// Slices this [`BooleanBuffer`] by the provided `offset` and `length` + pub fn slice(&self, offset: usize, len: usize) -> Self { + assert!( + offset.saturating_add(len) <= self.len, + "the length + offset of the sliced BooleanBuffer cannot exceed the existing length" + ); + Self { + buffer: self.buffer.clone(), + offset: self.offset + offset, + len, + } + } + + /// Returns a [`Buffer`] containing the sliced contents of this [`BooleanBuffer`] + /// + /// Equivalent to `self.buffer.bit_slice(self.offset, self.len)` + pub fn sliced(&self) -> Buffer { + self.buffer.bit_slice(self.offset, self.len) + } + + /// Returns true if this [`BooleanBuffer`] is equal to `other`, using pointer comparisons + /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may + /// return false when the arrays are logically equal + pub fn ptr_eq(&self, other: &Self) -> bool { + self.buffer.as_ptr() == other.buffer.as_ptr() + && self.offset == other.offset + && self.len == other.len + } + + /// Returns the inner [`Buffer`] + #[inline] + pub fn inner(&self) -> &Buffer { + &self.buffer + } } diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index 2d52c9096dce..a4854f1adfed 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -16,8 +16,9 @@ // under the License. 
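// Illustrative sketch, not part of this patch: the two slicing methods added to
// `BooleanBuffer` above. `slice` is zero-copy and only adjusts offset/len on the
// shared allocation, while `sliced` copies the selected bits into a fresh
// `Buffer` starting at bit 0. Assumes arrow_buffer's `BooleanBuffer`/`Buffer`
// and the existing `len` accessor; the helper name `split_validity` is hypothetical.
fn split_validity(validity: &BooleanBuffer, mid: usize) -> (BooleanBuffer, Buffer) {
    let head = validity.slice(0, mid);                              // shares the allocation
    let tail = validity.slice(mid, validity.len() - mid).sliced();  // compacts into a new Buffer
    (head, tail)
}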
use crate::buffer::BooleanBuffer; +use crate::{Buffer, MutableBuffer}; -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Eq, PartialEq)] pub struct NullBuffer { buffer: BooleanBuffer, null_count: usize, @@ -30,6 +31,16 @@ impl NullBuffer { Self { buffer, null_count } } + /// Create a new [`NullBuffer`] of length `len` where all values are null + pub fn new_null(len: usize) -> Self { + let buffer = MutableBuffer::new_null(len).into_buffer(); + let buffer = BooleanBuffer::new(buffer, 0, len); + Self { + buffer, + null_count: len, + } + } + /// Create a new [`NullBuffer`] with the provided `buffer` and `null_count` /// /// # Safety @@ -45,6 +56,12 @@ impl NullBuffer { self.buffer.len() } + /// Returns the offset of this [`NullBuffer`] in bits + #[inline] + pub fn offset(&self) -> usize { + self.buffer.offset() + } + /// Returns true if this [`NullBuffer`] is empty #[inline] pub fn is_empty(&self) -> bool { @@ -69,11 +86,28 @@ impl NullBuffer { !self.is_valid(idx) } - /// Returns the inner buffer + /// Returns the packed validity of this [`NullBuffer`] not including any offset + #[inline] + pub fn validity(&self) -> &[u8] { + self.buffer.values() + } + + /// Slices this [`NullBuffer`] by the provided `offset` and `length` + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self::new(self.buffer.slice(offset, len)) + } + + /// Returns the inner [`BooleanBuffer`] #[inline] pub fn inner(&self) -> &BooleanBuffer { &self.buffer } + + /// Returns the underlying [`Buffer`] + #[inline] + pub fn buffer(&self) -> &Buffer { + self.buffer.inner() + } } #[cfg(test)] diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index d49775c98211..8e3bde990fcd 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -2942,22 +2942,15 @@ fn dictionary_cast( ))); } - // keys are data, child_data is values (dictionary) - let data = unsafe { - ArrayData::new_unchecked( - to_type.clone(), - cast_keys.len(), - Some(cast_keys.null_count()), - cast_keys - .data() - .null_bitmap() - .cloned() - .map(|bitmap| bitmap.into_buffer()), - cast_keys.data().offset(), - cast_keys.data().buffers().to_vec(), - vec![cast_values.into_data()], - ) - }; + let data = cast_keys.into_data(); + let builder = data + .into_builder() + .data_type(to_type.clone()) + .child_data(vec![cast_values.into_data()]); + + // Safety + // Cast keys are still valid + let data = unsafe { builder.build_unchecked() }; // create the appropriate array type let new_array: ArrayRef = match **to_index_type { @@ -3184,11 +3177,7 @@ fn cast_primitive_to_list( to_type.clone(), array.len(), Some(cast_array.null_count()), - cast_array - .data() - .null_bitmap() - .cloned() - .map(|bitmap| bitmap.into_buffer()), + cast_array.data().nulls().map(|b| b.inner().sliced()), 0, vec![offsets.into()], vec![cast_array.into_data()], @@ -3207,23 +3196,18 @@ fn cast_list_inner( to_type: &DataType, cast_options: &CastOptions, ) -> Result { - let data = array.data_ref(); + let data = array.data().clone(); let underlying_array = make_array(data.child_data()[0].clone()); - let cast_array = cast_with_options(&underlying_array, to.data_type(), cast_options)?; - let array_data = unsafe { - ArrayData::new_unchecked( - to_type.clone(), - array.len(), - Some(data.null_count()), - data.null_bitmap() - .cloned() - .map(|bitmap| bitmap.into_buffer()), - array.offset(), - // reuse offset buffer - data.buffers().to_vec(), - vec![cast_array.into_data()], - ) - }; + let cast_array = + cast_with_options(underlying_array.as_ref(), to.data_type(), cast_options)?; + let builder = 
data + .into_builder() + .data_type(to_type.clone()) + .child_data(vec![cast_array.into_data()]); + + // Safety + // Data was valid before + let array_data = unsafe { builder.build_unchecked() }; let list = GenericListArray::::from(array_data); Ok(Arc::new(list) as ArrayRef) } @@ -3302,7 +3286,7 @@ where .len(array.len()) .add_buffer(offset_buffer) .add_buffer(str_values_buf) - .null_bit_buffer(data.null_buffer().cloned()); + .nulls(data.nulls().cloned()); let array_data = unsafe { builder.build_unchecked() }; @@ -3377,7 +3361,7 @@ where .len(array.len()) .add_buffer(offset_buffer) .add_child_data(value_data) - .null_bit_buffer(data.null_buffer().cloned()); + .nulls(data.nulls().cloned()); let array_data = unsafe { builder.build_unchecked() }; Ok(make_array(array_data)) diff --git a/arrow-data/src/bit_mask.rs b/arrow-data/src/bit_mask.rs index ed8e65257788..94ea57259ac8 100644 --- a/arrow-data/src/bit_mask.rs +++ b/arrow-data/src/bit_mask.rs @@ -74,7 +74,10 @@ pub fn combine_option_bitmap( ) -> Option { let (buffer, offset) = arrays .iter() - .map(|array| (array.null_buffer().cloned(), array.offset())) + .map(|array| match array.nulls() { + Some(n) => (Some(n.buffer().clone()), n.offset()), + None => (None, 0), + }) .reduce(|acc, buffer_and_offset| match (acc, buffer_and_offset) { ((None, _), (None, _)) => (None, 0), ((Some(buffer), offset), (None, _)) | ((None, _), (Some(buffer), offset)) => { diff --git a/arrow-data/src/bitmap.rs b/arrow-data/src/bitmap.rs deleted file mode 100644 index a356b9ff7d38..000000000000 --- a/arrow-data/src/bitmap.rs +++ /dev/null @@ -1,189 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines [Bitmap] for tracking validity bitmaps - -use arrow_buffer::bit_util; -use arrow_schema::ArrowError; -use std::mem; - -use arrow_buffer::buffer::{buffer_bin_and, buffer_bin_or, Buffer}; -use std::ops::{BitAnd, BitOr}; - -#[derive(Debug, Clone)] -/// Defines a bitmap, which is used to track which values in an Arrow -/// array are null. -/// -/// This is called a "validity bitmap" in the Arrow documentation. 
-pub struct Bitmap { - pub(crate) bits: Buffer, -} - -impl Bitmap { - pub fn new(num_bits: usize) -> Self { - let num_bytes = bit_util::ceil(num_bits, 8); - let len = bit_util::round_upto_multiple_of_64(num_bytes); - Bitmap { - bits: Buffer::from(&vec![0xFF; len]), - } - } - - /// Return the length of this Bitmap in bits (not bytes) - pub fn bit_len(&self) -> usize { - self.bits.len() * 8 - } - - pub fn is_empty(&self) -> bool { - self.bits.is_empty() - } - - pub fn is_set(&self, i: usize) -> bool { - assert!(i < (self.bits.len() << 3)); - unsafe { bit_util::get_bit_raw(self.bits.as_ptr(), i) } - } - - pub fn buffer(&self) -> &Buffer { - &self.bits - } - - pub fn buffer_ref(&self) -> &Buffer { - &self.bits - } - - pub fn into_buffer(self) -> Buffer { - self.bits - } - - /// Returns the total number of bytes of memory occupied by the - /// buffers owned by this [Bitmap]. - /// - /// If multiple [`Bitmap`]s refer to the same underlying - /// [`Buffer`] they will both report the same size. - pub fn get_buffer_memory_size(&self) -> usize { - self.bits.capacity() - } - - /// Returns the total number of bytes of memory occupied - /// physically by this [Bitmap] and its [`Buffer`]s. - /// - /// Equivalent to: `size_of_val(self)` + [`Self::get_buffer_memory_size`] - pub fn get_array_memory_size(&self) -> usize { - self.bits.capacity() + mem::size_of_val(self) - } -} - -impl<'a, 'b> BitAnd<&'b Bitmap> for &'a Bitmap { - type Output = Result; - - fn bitand(self, rhs: &'b Bitmap) -> Result { - if self.bits.len() != rhs.bits.len() { - return Err(ArrowError::ComputeError( - "Buffers must be the same size to apply Bitwise AND.".to_string(), - )); - } - Ok(Bitmap::from(buffer_bin_and( - &self.bits, - 0, - &rhs.bits, - 0, - self.bit_len(), - ))) - } -} - -impl<'a, 'b> BitOr<&'b Bitmap> for &'a Bitmap { - type Output = Result; - - fn bitor(self, rhs: &'b Bitmap) -> Result { - if self.bits.len() != rhs.bits.len() { - return Err(ArrowError::ComputeError( - "Buffers must be the same size to apply Bitwise OR.".to_string(), - )); - } - Ok(Bitmap::from(buffer_bin_or( - &self.bits, - 0, - &rhs.bits, - 0, - self.bit_len(), - ))) - } -} - -impl From for Bitmap { - fn from(buf: Buffer) -> Self { - Self { bits: buf } - } -} - -impl PartialEq for Bitmap { - fn eq(&self, other: &Self) -> bool { - // buffer equality considers capacity, but here we want to only compare - // actual data contents - let self_len = self.bits.len(); - let other_len = other.bits.len(); - if self_len != other_len { - return false; - } - self.bits.as_slice()[..self_len] == other.bits.as_slice()[..self_len] - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_bitmap_length() { - assert_eq!(512, Bitmap::new(63 * 8).bit_len()); - assert_eq!(512, Bitmap::new(64 * 8).bit_len()); - assert_eq!(1024, Bitmap::new(65 * 8).bit_len()); - } - - #[test] - fn test_bitwise_and() { - let bitmap1 = Bitmap::from(Buffer::from([0b01101010])); - let bitmap2 = Bitmap::from(Buffer::from([0b01001110])); - assert_eq!( - Bitmap::from(Buffer::from([0b01001010])), - (&bitmap1 & &bitmap2).unwrap() - ); - } - - #[test] - fn test_bitwise_or() { - let bitmap1 = Bitmap::from(Buffer::from([0b01101010])); - let bitmap2 = Bitmap::from(Buffer::from([0b01001110])); - assert_eq!( - Bitmap::from(Buffer::from([0b01101110])), - (&bitmap1 | &bitmap2).unwrap() - ); - } - - #[test] - fn test_bitmap_is_set() { - let bitmap = Bitmap::from(Buffer::from([0b01001010])); - assert!(!bitmap.is_set(0)); - assert!(bitmap.is_set(1)); - assert!(!bitmap.is_set(2)); - 
assert!(bitmap.is_set(3)); - assert!(!bitmap.is_set(4)); - assert!(!bitmap.is_set(5)); - assert!(bitmap.is_set(6)); - assert!(!bitmap.is_set(7)); - } -} diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index 2f9e142b1d96..d76cb9eb19e6 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -18,8 +18,9 @@ //! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates //! common attributes and operations for Arrow array. -use crate::{bit_iterator::BitSliceIterator, bitmap::Bitmap}; +use crate::bit_iterator::BitSliceIterator; use arrow_buffer::bit_chunk_iterator::BitChunks; +use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; use arrow_schema::{ArrowError, DataType, UnionMode}; use std::convert::TryInto; @@ -48,27 +49,32 @@ mod union; #[inline] pub(crate) fn contains_nulls( - null_bit_buffer: Option<&Buffer>, + null_bit_buffer: Option<&NullBuffer>, offset: usize, len: usize, ) -> bool { match null_bit_buffer { - Some(buffer) => match BitSliceIterator::new(buffer, offset, len).next() { - Some((start, end)) => start != 0 || end != len, - None => len != 0, // No non-null values - }, + Some(buffer) => { + match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len) + .next() + { + Some((start, end)) => start != 0 || end != len, + None => len != 0, // No non-null values + } + } None => false, // No null buffer } } #[inline] pub(crate) fn count_nulls( - null_bit_buffer: Option<&Buffer>, + null_bit_buffer: Option<&NullBuffer>, offset: usize, len: usize, ) -> usize { if let Some(buf) = null_bit_buffer { - len - buf.count_set_bits_offset(offset, len) + let buffer = buf.buffer(); + len - buffer.count_set_bits_offset(offset + buf.offset(), len) } else { 0 } @@ -232,9 +238,6 @@ pub struct ArrayData { /// The number of elements in this array data len: usize, - /// The number of null elements in this array data - null_count: usize, - /// The offset into this array data, in number of items offset: usize, @@ -249,7 +252,7 @@ pub struct ArrayData { /// The null bitmap. A `None` value for this indicates all values are non-null in /// this array. 
- null_bitmap: Option, + nulls: Option, } pub type ArrayDataRef = Arc; @@ -281,19 +284,21 @@ impl ArrayData { buffers: Vec, child_data: Vec, ) -> Self { - let null_count = match null_count { - None => count_nulls(null_bit_buffer.as_ref(), offset, len), - Some(null_count) => null_count, - }; - let null_bitmap = null_bit_buffer.filter(|_| null_count > 0).map(Bitmap::from); + let nulls = null_bit_buffer + .map(|b| BooleanBuffer::new(b, offset, len)) + .map(|b| match null_count { + None => NullBuffer::new(b), + Some(null_count) => NullBuffer::new_unchecked(b, null_count), + }) + .filter(|b| b.null_count() > 0); + let new_self = Self { data_type, len, - null_count, offset, buffers, child_data, - null_bitmap, + nulls, }; // Provide a force_validate mode @@ -378,30 +383,26 @@ impl ArrayData { } /// Returns whether the element at index `i` is null + #[inline] pub fn is_null(&self, i: usize) -> bool { - if let Some(ref b) = self.null_bitmap { - return !b.is_set(self.offset + i); + match &self.nulls { + Some(v) => v.is_null(i), + None => false, } - false } - /// Returns a reference to the null bitmap of this [`ArrayData`] + /// Returns a reference to the null buffer of this [`ArrayData`] if any + /// + /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`] #[inline] - pub const fn null_bitmap(&self) -> Option<&Bitmap> { - self.null_bitmap.as_ref() - } - - /// Returns a reference to the null buffer of this [`ArrayData`]. - pub fn null_buffer(&self) -> Option<&Buffer> { - self.null_bitmap().as_ref().map(|b| b.buffer_ref()) + pub fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() } /// Returns whether the element at index `i` is not null + #[inline] pub fn is_valid(&self, i: usize) -> bool { - if let Some(ref b) = self.null_bitmap { - return b.is_set(self.offset + i); - } - true + !self.is_null(i) } /// Returns the length (i.e., number of elements) of this [`ArrayData`]. 
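Because the returned `NullBuffer` is already positioned for the array (per the note above, `ArrayData::offset` does not apply to it), callers index it with the logical row directly. A minimal sketch, assuming only the `nulls()`, `is_null()` and `len()` methods from this patch; `first_null` is a hypothetical helper.

use arrow_data::ArrayData;

// Hypothetical helper: logical index of the first null slot, if any.
// No `data.offset()` adjustment is needed: the NullBuffer returned by
// `nulls()` already accounts for the array's offset.
fn first_null(data: &ArrayData) -> Option<usize> {
    let nulls = data.nulls()?;
    (0..data.len()).find(|&i| nulls.is_null(i))
}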
@@ -424,8 +425,11 @@ impl ArrayData { /// Returns the total number of nulls in this array #[inline] - pub const fn null_count(&self) -> usize { - self.null_count + pub fn null_count(&self) -> usize { + self.nulls + .as_ref() + .map(|x| x.null_count()) + .unwrap_or_default() } /// Returns the total number of bytes of memory occupied by the @@ -444,8 +448,8 @@ impl ArrayData { for buffer in &self.buffers { size += buffer.capacity(); } - if let Some(bitmap) = &self.null_bitmap { - size += bitmap.get_buffer_memory_size() + if let Some(bitmap) = &self.nulls { + size += bitmap.buffer().capacity() } for child in &self.child_data { size += child.get_buffer_memory_size(); @@ -510,7 +514,7 @@ impl ArrayData { } } - if self.null_bitmap().is_some() { + if self.nulls().is_some() { result += bit_util::ceil(self.len, 8); } @@ -536,11 +540,8 @@ impl ArrayData { size += mem::size_of::(); size += buffer.capacity(); } - if let Some(bitmap) = &self.null_bitmap { - // this includes the size of the bitmap struct itself, since it is stored directly in - // this struct we already counted those bytes in the size_of_val(self) above - size += bitmap.get_array_memory_size(); - size -= mem::size_of::(); + if let Some(nulls) = &self.nulls { + size += nulls.buffer().capacity(); } for child in &self.child_data { size += child.get_array_memory_size(); @@ -565,7 +566,6 @@ impl ArrayData { let new_data = ArrayData { data_type: self.data_type().clone(), len: length, - null_count: count_nulls(self.null_buffer(), new_offset, length), offset: new_offset, buffers: self.buffers.clone(), // Slice child data, to propagate offsets down to them @@ -574,7 +574,7 @@ impl ArrayData { .iter() .map(|data| data.slice(offset, length)) .collect(), - null_bitmap: self.null_bitmap().cloned(), + nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)), }; new_data @@ -583,9 +583,7 @@ impl ArrayData { new_data.len = length; new_data.offset = offset + self.offset; - - new_data.null_count = - count_nulls(new_data.null_buffer(), new_data.offset, new_data.len); + new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length)); new_data } @@ -714,7 +712,7 @@ impl ArrayData { .child_data(child_data); if has_nulls { - builder = builder.null_count(len).null_bit_buffer(Some(zeroed(len))) + builder = builder.nulls(Some(NullBuffer::new_null(len))) } // SAFETY: @@ -744,7 +742,7 @@ impl ArrayData { // Check that the data layout conforms to the spec let layout = layout(&self.data_type); - if !layout.can_contain_null_mask && self.null_bitmap.is_some() { + if !layout.can_contain_null_mask && self.nulls.is_some() { return Err(ArrowError::InvalidArgumentError(format!( "Arrays of type {:?} cannot contain a null bitmask", self.data_type, @@ -796,29 +794,31 @@ impl ArrayData { } } - if self.null_count > self.len { - return Err(ArrowError::InvalidArgumentError(format!( - "null_count {} for an array exceeds length of {} elements", - self.null_count, self.len - ))); - } - // check null bit buffer size - if let Some(null_bit_map) = self.null_bitmap.as_ref() { - let null_bit_buffer = null_bit_map.buffer_ref(); + if let Some(nulls) = self.nulls() { + if nulls.null_count() > self.len { + return Err(ArrowError::InvalidArgumentError(format!( + "null_count {} for an array exceeds length of {} elements", + nulls.null_count(), + self.len + ))); + } + + let actual_len = nulls.validity().len(); let needed_len = bit_util::ceil(len_plus_offset, 8); - if null_bit_buffer.len() < needed_len { + if actual_len < needed_len { return Err(ArrowError::InvalidArgumentError(format!( 
- "null_bit_buffer size too small. got {} needed {}", - null_bit_buffer.len(), - needed_len + "null_bit_buffer size too small. got {actual_len} needed {needed_len}", + ))); + } + + if nulls.len() != self.len { + return Err(ArrowError::InvalidArgumentError(format!( + "null buffer incorrect size. got {} expected {}", + nulls.len(), + self.len ))); } - } else if self.null_count > 0 { - return Err(ArrowError::InvalidArgumentError(format!( - "Array of type {} has {} nulls but no null bitmap", - self.data_type, self.null_count - ))); } self.validate_child_data()?; @@ -1155,14 +1155,14 @@ impl ArrayData { /// Validates the the null count is correct and that any /// nullability requirements of its children are correct pub fn validate_nulls(&self) -> Result<(), ArrowError> { - let nulls = self.null_buffer(); - - let actual_null_count = count_nulls(nulls, self.offset, self.len); - if actual_null_count != self.null_count { - return Err(ArrowError::InvalidArgumentError(format!( - "null_count value ({}) doesn't match actual number of nulls in array ({})", - self.null_count, actual_null_count - ))); + if let Some(nulls) = &self.nulls { + let actual = nulls.len() - nulls.inner().count_set_bits(); + if actual != nulls.null_count() { + return Err(ArrowError::InvalidArgumentError(format!( + "null_count value ({}) doesn't match actual number of nulls in array ({})", + nulls.null_count(), actual + ))); + } } // In general non-nullable children should not contain nulls, however, for certain @@ -1178,7 +1178,7 @@ impl ArrayData { DataType::FixedSizeList(field, len) => { let child = &self.child_data[0]; if !field.is_nullable() { - match nulls { + match &self.nulls { Some(nulls) => { let element_len = *len as usize; let mut buffer = @@ -1187,7 +1187,7 @@ impl ArrayData { // Expand each bit within `null_mask` into `element_len` // bits, constructing the implicit mask of the child elements for i in 0..self.len { - if !bit_util::get_bit(nulls.as_ref(), self.offset + i) { + if nulls.is_null(i) { continue; } for j in 0..element_len { @@ -1207,7 +1207,14 @@ impl ArrayData { DataType::Struct(fields) => { for (field, child) in fields.iter().zip(&self.child_data) { if !field.is_nullable() { - self.validate_non_nullable(nulls, self.offset, child)? 
+ match &self.nulls { + Some(n) => self.validate_non_nullable( + Some(n.buffer()), + n.offset(), + child, + )?, + None => self.validate_non_nullable(None, 0, child)?, + } } } } @@ -1226,7 +1233,7 @@ impl ArrayData { ) -> Result<(), ArrowError> { let mask = match mask { Some(mask) => mask.as_ref(), - None => return match data.null_count { + None => return match data.null_count() { 0 => Ok(()), _ => Err(ArrowError::InvalidArgumentError(format!( "non-nullable child of type {} contains nulls not present in parent {}", @@ -1236,10 +1243,10 @@ impl ArrayData { }, }; - match data.null_buffer() { + match data.nulls() { Some(nulls) => { let mask = BitChunks::new(mask, offset, data.len); - let nulls = BitChunks::new(nulls.as_ref(), data.offset, data.len); + let nulls = BitChunks::new(nulls.validity(), nulls.offset(), data.len); mask .iter() .zip(nulls.iter()) @@ -1510,7 +1517,6 @@ impl ArrayData { pub fn ptr_eq(&self, other: &Self) -> bool { if self.offset != other.offset || self.len != other.len - || self.null_count != other.null_count || self.data_type != other.data_type || self.buffers.len() != other.buffers.len() || self.child_data.len() != other.child_data.len() @@ -1518,8 +1524,8 @@ impl ArrayData { return false; } - match (&self.null_bitmap, &other.null_bitmap) { - (Some(a), Some(b)) if a.bits.as_ptr() != b.bits.as_ptr() => return false, + match (&self.nulls, &other.nulls) { + (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false, (Some(_), None) | (None, Some(_)) => return false, _ => {} }; @@ -1714,6 +1720,7 @@ pub struct ArrayDataBuilder { len: usize, null_count: Option, null_bit_buffer: Option, + nulls: Option, offset: usize, buffers: Vec, child_data: Vec, @@ -1727,6 +1734,7 @@ impl ArrayDataBuilder { len: 0, null_count: None, null_bit_buffer: None, + nulls: None, offset: 0, buffers: vec![], child_data: vec![], @@ -1744,12 +1752,20 @@ impl ArrayDataBuilder { self } + pub fn nulls(mut self, nulls: Option) -> Self { + self.nulls = nulls; + self.null_count = None; + self.null_bit_buffer = None; + self + } + pub fn null_count(mut self, null_count: usize) -> Self { self.null_count = Some(null_count); self } pub fn null_bit_buffer(mut self, buf: Option) -> Self { + self.nulls = None; self.null_bit_buffer = buf; self } @@ -1786,43 +1802,53 @@ impl ArrayDataBuilder { /// /// The same caveats as [`ArrayData::new_unchecked`] /// apply. 
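A minimal sketch of constructing `ArrayData` through the new `ArrayDataBuilder::nulls` entry point instead of the `null_bit_buffer`/`null_count` pair; the values and validity bytes are illustrative, and only builder methods visible in this patch are used.

use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
use arrow_buffer::Buffer;
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};

// Hypothetical example: four Int32 values with the second slot null.
fn int32_with_one_null() -> Result<ArrayData, ArrowError> {
    let values = Buffer::from_slice_ref([1i32, 2, 3, 4]);
    // Validity bits are LSB-first: 0b1101 marks index 1 as null.
    let validity = BooleanBuffer::new(Buffer::from([0b0000_1101u8]), 0, 4);
    let data = ArrayData::builder(DataType::Int32)
        .len(4)
        .add_buffer(values)
        // Pass a NullBuffer directly; the null count is derived from it.
        .nulls(Some(NullBuffer::new(validity)))
        .build()?;
    assert_eq!(data.null_count(), 1);
    Ok(data)
}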
+ #[allow(clippy::let_and_return)] pub unsafe fn build_unchecked(self) -> ArrayData { - ArrayData::new_unchecked( - self.data_type, - self.len, - self.null_count, - self.null_bit_buffer, - self.offset, - self.buffers, - self.child_data, - ) + let nulls = self.nulls.or_else(|| { + let buffer = self.null_bit_buffer?; + let buffer = BooleanBuffer::new(buffer, self.offset, self.len); + Some(match self.null_count { + Some(n) => NullBuffer::new_unchecked(buffer, n), + None => NullBuffer::new(buffer), + }) + }); + + let data = ArrayData { + data_type: self.data_type, + len: self.len, + offset: self.offset, + buffers: self.buffers, + child_data: self.child_data, + nulls, + }; + + // Provide a force_validate mode + #[cfg(feature = "force_validate")] + data.validate_data().unwrap(); + data } /// Creates an array data, validating all inputs + #[allow(clippy::let_and_return)] pub fn build(self) -> Result { - ArrayData::try_new( - self.data_type, - self.len, - self.null_bit_buffer, - self.offset, - self.buffers, - self.child_data, - ) + let data = unsafe { self.build_unchecked() }; + #[cfg(not(feature = "force_validate"))] + data.validate_data()?; + Ok(data) } } impl From for ArrayDataBuilder { fn from(d: ArrayData) -> Self { - // TODO: Store Bitmap on ArrayData (#1799) - let null_bit_buffer = d.null_buffer().cloned(); Self { - null_bit_buffer, data_type: d.data_type, len: d.len, - null_count: Some(d.null_count), offset: d.offset, buffers: d.buffers, child_data: d.child_data, + nulls: d.nulls, + null_bit_buffer: None, + null_count: None, } } } @@ -1936,8 +1962,8 @@ mod tests { .null_bit_buffer(Some(Buffer::from(bit_v))) .build() .unwrap(); - assert!(arr_data.null_buffer().is_some()); - assert_eq!(&bit_v, arr_data.null_buffer().unwrap().as_slice()); + assert!(arr_data.nulls().is_some()); + assert_eq!(&bit_v, arr_data.nulls().unwrap().validity()); } #[test] @@ -2055,11 +2081,12 @@ mod tests { #[test] fn test_count_nulls() { - let null_buffer = Some(Buffer::from(vec![0b00010110, 0b10011111])); - let count = count_nulls(null_buffer.as_ref(), 0, 16); + let buffer = Buffer::from(vec![0b00010110, 0b10011111]); + let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16)); + let count = count_nulls(Some(&buffer), 0, 16); assert_eq!(count, 7); - let count = count_nulls(null_buffer.as_ref(), 4, 8); + let count = count_nulls(Some(&buffer), 4, 8); assert_eq!(count, 3); } @@ -2067,7 +2094,7 @@ mod tests { fn test_contains_nulls() { let buffer: Buffer = MutableBuffer::from_iter([false, false, false, true, true, false]).into(); - + let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6)); assert!(contains_nulls(Some(&buffer), 0, 6)); assert!(contains_nulls(Some(&buffer), 0, 3)); assert!(!contains_nulls(Some(&buffer), 3, 2)); diff --git a/arrow-data/src/equal/boolean.rs b/arrow-data/src/equal/boolean.rs index 52e822f03f30..a20ca5ac0bd7 100644 --- a/arrow-data/src/equal/boolean.rs +++ b/arrow-data/src/equal/boolean.rs @@ -33,7 +33,7 @@ pub(super) fn boolean_equal( let lhs_values = lhs.buffers()[0].as_slice(); let rhs_values = rhs.buffers()[0].as_slice(); - let contains_nulls = contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len); + let contains_nulls = contains_nulls(lhs.nulls(), lhs_start, len); if !contains_nulls { // Optimize performance for starting offset at u8 boundary. 
@@ -76,15 +76,13 @@ pub(super) fn boolean_equal( ) } else { // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); + let lhs_nulls = lhs.nulls().unwrap(); - let lhs_start = lhs.offset() + lhs_start; - let rhs_start = rhs.offset() + rhs_start; - - BitIndexIterator::new(lhs_null_bytes, lhs_start, len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - get_bit(lhs_values, lhs_pos) == get_bit(rhs_values, rhs_pos) - }) + BitIndexIterator::new(lhs_nulls.validity(), lhs_start + lhs_nulls.offset(), len) + .all(|i| { + let lhs_pos = lhs_start + lhs.offset() + i; + let rhs_pos = rhs_start + rhs.offset() + i; + get_bit(lhs_values, lhs_pos) == get_bit(rhs_values, rhs_pos) + }) } } diff --git a/arrow-data/src/equal/dictionary.rs b/arrow-data/src/equal/dictionary.rs index 5638c5c91c5c..1d9c4b8d964f 100644 --- a/arrow-data/src/equal/dictionary.rs +++ b/arrow-data/src/equal/dictionary.rs @@ -16,7 +16,7 @@ // under the License. use crate::data::{contains_nulls, ArrayData}; -use arrow_buffer::{bit_util::get_bit, ArrowNativeType}; +use arrow_buffer::ArrowNativeType; use super::equal_range; @@ -35,7 +35,7 @@ pub(super) fn dictionary_equal( // Only checking one null mask here because by the time the control flow reaches // this point, the equality of the two masks would have already been verified. - if !contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len) { + if !contains_nulls(lhs.nulls(), lhs_start, len) { (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; @@ -50,14 +50,14 @@ pub(super) fn dictionary_equal( }) } else { // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); + let lhs_nulls = lhs.nulls().unwrap(); + let rhs_nulls = rhs.nulls().unwrap(); (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + let lhs_is_null = lhs_nulls.is_null(lhs_pos); + let rhs_is_null = rhs_nulls.is_null(rhs_pos); lhs_is_null || (lhs_is_null == rhs_is_null) diff --git a/arrow-data/src/equal/fixed_binary.rs b/arrow-data/src/equal/fixed_binary.rs index 17e470b5c47c..9e0e77ff7eca 100644 --- a/arrow-data/src/equal/fixed_binary.rs +++ b/arrow-data/src/equal/fixed_binary.rs @@ -19,7 +19,6 @@ use crate::bit_iterator::BitSliceIterator; use crate::contains_nulls; use crate::data::ArrayData; use crate::equal::primitive::NULL_SLICES_SELECTIVITY_THRESHOLD; -use arrow_buffer::bit_util::get_bit; use arrow_schema::DataType; use super::utils::equal_len; @@ -41,7 +40,7 @@ pub(super) fn fixed_binary_equal( // Only checking one null mask here because by the time the control flow reaches // this point, the equality of the two masks would have already been verified. 
- if !contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len) { + if !contains_nulls(lhs.nulls(), lhs_start, len) { equal_len( lhs_values, rhs_values, @@ -54,15 +53,15 @@ pub(super) fn fixed_binary_equal( if selectivity_frac >= NULL_SLICES_SELECTIVITY_THRESHOLD { // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); + let lhs_nulls = lhs.nulls().unwrap(); + let rhs_nulls = rhs.nulls().unwrap(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + let lhs_is_null = lhs_nulls.is_null(lhs_pos); + let rhs_is_null = rhs_nulls.is_null(rhs_pos); lhs_is_null || (lhs_is_null == rhs_is_null) @@ -75,14 +74,16 @@ pub(super) fn fixed_binary_equal( ) }) } else { + let lhs_nulls = lhs.nulls().unwrap(); let lhs_slices_iter = BitSliceIterator::new( - lhs.null_buffer().as_ref().unwrap(), - lhs_start + lhs.offset(), + lhs_nulls.validity(), + lhs_start + lhs_nulls.offset(), len, ); + let rhs_nulls = lhs.nulls().unwrap(); let rhs_slices_iter = BitSliceIterator::new( - rhs.null_buffer().as_ref().unwrap(), - rhs_start + rhs.offset(), + rhs_nulls.validity(), + rhs_start + rhs_nulls.offset(), len, ); diff --git a/arrow-data/src/equal/fixed_list.rs b/arrow-data/src/equal/fixed_list.rs index 204a8658e747..4b79e5c33fab 100644 --- a/arrow-data/src/equal/fixed_list.rs +++ b/arrow-data/src/equal/fixed_list.rs @@ -16,7 +16,6 @@ // under the License. use crate::data::{contains_nulls, ArrayData}; -use arrow_buffer::bit_util::get_bit; use arrow_schema::DataType; use super::equal_range; @@ -38,7 +37,7 @@ pub(super) fn fixed_list_equal( // Only checking one null mask here because by the time the control flow reaches // this point, the equality of the two masks would have already been verified. - if !contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len) { + if !contains_nulls(lhs.nulls(), lhs_start, len) { equal_range( lhs_values, rhs_values, @@ -48,15 +47,15 @@ pub(super) fn fixed_list_equal( ) } else { // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); + let lhs_nulls = lhs.nulls().unwrap(); + let rhs_nulls = rhs.nulls().unwrap(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + let lhs_is_null = lhs_nulls.is_null(lhs_pos); + let rhs_is_null = rhs_nulls.is_null(rhs_pos); lhs_is_null || (lhs_is_null == rhs_is_null) diff --git a/arrow-data/src/equal/list.rs b/arrow-data/src/equal/list.rs index 25273f8bad63..cc4ba3cacf9f 100644 --- a/arrow-data/src/equal/list.rs +++ b/arrow-data/src/equal/list.rs @@ -16,7 +16,6 @@ // under the License. 
use crate::data::{count_nulls, ArrayData}; -use arrow_buffer::bit_util::get_bit; use arrow_buffer::ArrowNativeType; use num::Integer; @@ -90,8 +89,8 @@ pub(super) fn list_equal( let lhs_values = &lhs.child_data()[0]; let rhs_values = &rhs.child_data()[0]; - let lhs_null_count = count_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len); - let rhs_null_count = count_nulls(rhs.null_buffer(), rhs_start + rhs.offset(), len); + let lhs_null_count = count_nulls(lhs.nulls(), lhs_start, len); + let rhs_null_count = count_nulls(rhs.nulls(), rhs_start, len); if lhs_null_count != rhs_null_count { return false; @@ -112,8 +111,8 @@ pub(super) fn list_equal( ) } else { // get a ref of the parent null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs.null_buffer().unwrap().as_slice(); - let rhs_null_bytes = rhs.null_buffer().unwrap().as_slice(); + let lhs_nulls = lhs.nulls().unwrap(); + let rhs_nulls = rhs.nulls().unwrap(); // with nulls, we need to compare item by item whenever it is not null // TODO: Could potentially compare runs of not NULL values @@ -121,8 +120,8 @@ pub(super) fn list_equal( let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + let lhs_is_null = lhs_nulls.is_null(lhs_pos); + let rhs_is_null = rhs_nulls.is_null(rhs_pos); if lhs_is_null != rhs_is_null { return false; diff --git a/arrow-data/src/equal/primitive.rs b/arrow-data/src/equal/primitive.rs index f52541e2861c..7b3cbc9eb949 100644 --- a/arrow-data/src/equal/primitive.rs +++ b/arrow-data/src/equal/primitive.rs @@ -17,7 +17,6 @@ use crate::bit_iterator::BitSliceIterator; use crate::contains_nulls; -use arrow_buffer::bit_util::get_bit; use std::mem::size_of; use crate::data::ArrayData; @@ -39,7 +38,7 @@ pub(super) fn primitive_equal( // Only checking one null mask here because by the time the control flow reaches // this point, the equality of the two masks would have already been verified. 
- if !contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len) { + if !contains_nulls(lhs.nulls(), lhs_start, len) { // without nulls, we just need to compare slices equal_len( lhs_values, @@ -53,14 +52,14 @@ pub(super) fn primitive_equal( if selectivity_frac >= NULL_SLICES_SELECTIVITY_THRESHOLD { // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); + let lhs_nulls = lhs.nulls().unwrap(); + let rhs_nulls = rhs.nulls().unwrap(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + let lhs_is_null = lhs_nulls.is_null(lhs_pos); + let rhs_is_null = rhs_nulls.is_null(rhs_pos); lhs_is_null || (lhs_is_null == rhs_is_null) @@ -73,14 +72,16 @@ pub(super) fn primitive_equal( ) }) } else { + let lhs_nulls = lhs.nulls().unwrap(); let lhs_slices_iter = BitSliceIterator::new( - lhs.null_buffer().as_ref().unwrap(), - lhs_start + lhs.offset(), + lhs_nulls.validity(), + lhs_start + lhs_nulls.offset(), len, ); + let rhs_nulls = rhs.nulls().unwrap(); let rhs_slices_iter = BitSliceIterator::new( - rhs.null_buffer().as_ref().unwrap(), - rhs_start + rhs.offset(), + rhs_nulls.validity(), + rhs_start + rhs_nulls.offset(), len, ); diff --git a/arrow-data/src/equal/structure.rs b/arrow-data/src/equal/structure.rs index 25ab340cd3f8..e4751c26f489 100644 --- a/arrow-data/src/equal/structure.rs +++ b/arrow-data/src/equal/structure.rs @@ -16,7 +16,6 @@ // under the License. use crate::data::{contains_nulls, ArrayData}; -use arrow_buffer::bit_util::get_bit; use super::equal_range; @@ -46,19 +45,19 @@ pub(super) fn struct_equal( ) -> bool { // Only checking one null mask here because by the time the control flow reaches // this point, the equality of the two masks would have already been verified. 
- if !contains_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len) { + if !contains_nulls(lhs.nulls(), lhs_start, len) { equal_child_values(lhs, rhs, lhs_start, rhs_start, len) } else { // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs.null_buffer().as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs.null_buffer().as_ref().unwrap().as_slice(); + let lhs_nulls = lhs.nulls().unwrap(); + let rhs_nulls = rhs.nulls().unwrap(); // with nulls, we need to compare item by item whenever it is not null (0..len).all(|i| { let lhs_pos = lhs_start + i; let rhs_pos = rhs_start + i; // if both struct and child had no null buffers, - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); + let lhs_is_null = lhs_nulls.is_null(lhs_pos); + let rhs_is_null = rhs_nulls.is_null(rhs_pos); if lhs_is_null != rhs_is_null { return false; diff --git a/arrow-data/src/equal/utils.rs b/arrow-data/src/equal/utils.rs index b3f7fc0b06ef..d1f0f392a195 100644 --- a/arrow-data/src/equal/utils.rs +++ b/arrow-data/src/equal/utils.rs @@ -49,15 +49,16 @@ pub(super) fn equal_nulls( rhs_start: usize, len: usize, ) -> bool { - let lhs_offset = lhs_start + lhs.offset(); - let rhs_offset = rhs_start + rhs.offset(); - - match (lhs.null_buffer(), rhs.null_buffer()) { - (Some(lhs), Some(rhs)) => { - equal_bits(lhs.as_slice(), rhs.as_slice(), lhs_offset, rhs_offset, len) - } - (Some(lhs), None) => !contains_nulls(Some(lhs), lhs_offset, len), - (None, Some(rhs)) => !contains_nulls(Some(rhs), rhs_offset, len), + match (lhs.nulls(), rhs.nulls()) { + (Some(lhs), Some(rhs)) => equal_bits( + lhs.validity(), + rhs.validity(), + lhs.offset() + lhs_start, + rhs.offset() + rhs_start, + len, + ), + (Some(lhs), None) => !contains_nulls(Some(lhs), lhs_start, len), + (None, Some(rhs)) => !contains_nulls(Some(rhs), rhs_start, len), (None, None) => true, } } diff --git a/arrow-data/src/equal/variable_size.rs b/arrow-data/src/equal/variable_size.rs index f661c614d301..ae880437450b 100644 --- a/arrow-data/src/equal/variable_size.rs +++ b/arrow-data/src/equal/variable_size.rs @@ -16,7 +16,6 @@ // under the License. use crate::data::{count_nulls, ArrayData}; -use arrow_buffer::bit_util::get_bit; use arrow_buffer::ArrowNativeType; use num::Integer; @@ -60,8 +59,8 @@ pub(super) fn variable_sized_equal( let lhs_values = lhs.buffers()[1].as_slice(); let rhs_values = rhs.buffers()[1].as_slice(); - let lhs_null_count = count_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len); - let rhs_null_count = count_nulls(rhs.null_buffer(), rhs_start + rhs.offset(), len); + let lhs_null_count = count_nulls(lhs.nulls(), lhs_start, len); + let rhs_null_count = count_nulls(rhs.nulls(), rhs_start, len); if lhs_null_count == 0 && rhs_null_count == 0 @@ -83,15 +82,8 @@ pub(super) fn variable_sized_equal( let rhs_pos = rhs_start + i; // the null bits can still be `None`, indicating that the value is valid. 
- let lhs_is_null = !lhs - .null_buffer() - .map(|v| get_bit(v.as_slice(), lhs.offset() + lhs_pos)) - .unwrap_or(true); - - let rhs_is_null = !rhs - .null_buffer() - .map(|v| get_bit(v.as_slice(), rhs.offset() + rhs_pos)) - .unwrap_or(true); + let lhs_is_null = lhs.nulls().map(|v| v.is_null(lhs_pos)).unwrap_or_default(); + let rhs_is_null = rhs.nulls().map(|v| v.is_null(rhs_pos)).unwrap_or_default(); lhs_is_null || (lhs_is_null == rhs_is_null) diff --git a/arrow-data/src/ffi.rs b/arrow-data/src/ffi.rs index e506653bb59b..b7d690fb9124 100644 --- a/arrow-data/src/ffi.rs +++ b/arrow-data/src/ffi.rs @@ -17,8 +17,10 @@ //! Contains declarations to bind to the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html). +use crate::bit_mask::set_bits; use crate::{layout, ArrayData}; -use arrow_buffer::Buffer; +use arrow_buffer::buffer::NullBuffer; +use arrow_buffer::{Buffer, MutableBuffer}; use arrow_schema::DataType; use std::ffi::c_void; @@ -83,6 +85,29 @@ unsafe extern "C" fn release_array(array: *mut FFI_ArrowArray) { array.release = None; } +/// Aligns the provided `nulls` to the provided `data_offset` +/// +/// This is a temporary measure until offset is removed from ArrayData (#1799) +fn align_nulls(data_offset: usize, nulls: Option<&NullBuffer>) -> Option { + let nulls = nulls?; + if data_offset == nulls.offset() { + // Underlying buffer is already aligned + return Some(nulls.buffer().clone()); + } + if data_offset == 0 { + return Some(nulls.inner().sliced()); + } + let mut builder = MutableBuffer::new_null(data_offset + nulls.len()); + set_bits( + builder.as_slice_mut(), + nulls.validity(), + data_offset, + nulls.offset(), + nulls.len(), + ); + Some(builder.into()) +} + struct ArrayPrivateData { #[allow(dead_code)] buffers: Vec>, @@ -102,7 +127,7 @@ impl FFI_ArrowArray { let buffers = if data_layout.can_contain_null_mask { // * insert the null buffer at the start // * make all others `Option`. - std::iter::once(data.null_buffer().cloned()) + std::iter::once(align_nulls(data.offset(), data.nulls())) .chain(data.buffers().iter().map(|b| Some(b.clone()))) .collect::>() } else { diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs index b37a8c5da72f..2b105f5bb040 100644 --- a/arrow-data/src/lib.rs +++ b/arrow-data/src/lib.rs @@ -17,8 +17,6 @@ //! 
Array data abstractions for [Apache Arrow](https://docs.rs/arrow) -mod bitmap; -pub use bitmap::Bitmap; mod data; pub use data::*; diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index fef6d4be4985..2719b96b6914 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -20,6 +20,7 @@ use super::{ ArrayData, ArrayDataBuilder, }; use crate::bit_mask::set_bits; +use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_buffer::{bit_util, i256, ArrowNativeType, MutableBuffer}; use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode}; use half::f16; @@ -76,26 +77,30 @@ impl<'a> _MutableArrayData<'a> { } }; + let nulls = (self.null_count > 0).then(|| { + let bools = BooleanBuffer::new(self.null_buffer.into(), 0, self.len); + unsafe { NullBuffer::new_unchecked(bools, self.null_count) } + }); + ArrayDataBuilder::new(self.data_type) .offset(0) .len(self.len) - .null_count(self.null_count) + .nulls(nulls) .buffers(buffers) .child_data(child_data) - .null_bit_buffer((self.null_count > 0).then(|| self.null_buffer.into())) } } fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits { - if let Some(bitmap) = array.null_bitmap() { - let bytes = bitmap.buffer().as_slice(); + if let Some(nulls) = array.nulls() { + let bytes = nulls.validity(); Box::new(move |mutable, start, len| { utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); mutable.null_count += set_bits( mutable.null_buffer.as_slice_mut(), bytes, mutable.len, - array.offset() + start, + nulls.offset() + start, len, ); }) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 6842474fb4e2..bb367f9447d5 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -227,10 +227,8 @@ fn create_array( buffer_index = values_triple.2; let run_array_length = run_node.length() as usize; - let run_array_null_count = run_node.null_count() as usize; let data = ArrayData::builder(data_type.clone()) .len(run_array_length) - .null_count(run_array_null_count) .offset(0) .add_child_data(run_ends_triple.0.into_data()) .add_child_data(values_triple.0.into_data()) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index f019340154ac..75c48bebcf63 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -617,7 +617,6 @@ fn into_zero_offset_run_array( // The function builds a valid run_ends array and hence need not be validated. 
ArrayDataBuilder::new(run_array.run_ends().data_type().clone()) .len(physical_length) - .null_count(0) .add_buffer(builder.finish()) .build_unchecked() }; @@ -1220,7 +1219,7 @@ fn write_array_data( } if has_validity_bitmap(array_data.data_type(), write_options) { // write null buffer if exists - let null_buffer = match array_data.null_buffer() { + let null_buffer = match array_data.nulls() { None => { // create a buffer and fill it with valid bits let num_bytes = bit_util::ceil(num_rows, 8); @@ -1228,7 +1227,7 @@ fn write_array_data( let buffer = buffer.with_bitset(num_bytes, true); buffer.into() } - Some(buffer) => buffer.bit_slice(array_data.offset(), array_data.len()), + Some(buffer) => buffer.inner().sliced(), }; offset = write_buffer( diff --git a/arrow-json/src/raw/list_array.rs b/arrow-json/src/raw/list_array.rs index 91ca4b7275bf..a57f4273369b 100644 --- a/arrow-json/src/raw/list_array.rs +++ b/arrow-json/src/raw/list_array.rs @@ -19,6 +19,7 @@ use crate::raw::tape::{Tape, TapeElement}; use crate::raw::{make_decoder, tape_error, ArrayDecoder}; use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; use arrow_array::OffsetSizeTrait; +use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use std::marker::PhantomData; @@ -62,7 +63,6 @@ impl ArrayDecoder for ListArrayDecoder { let mut offsets = BufferBuilder::::new(pos.len() + 1); offsets.append(O::from_usize(0).unwrap()); - let mut null_count = 0; let mut nulls = self .is_nullable .then(|| BooleanBufferBuilder::new(pos.len())); @@ -76,7 +76,6 @@ impl ArrayDecoder for ListArrayDecoder { } (TapeElement::Null, Some(nulls)) => { nulls.append(false); - null_count += 1; *p + 1 } (d, _) => return Err(tape_error(d, "[")), @@ -102,11 +101,13 @@ impl ArrayDecoder for ListArrayDecoder { } let child_data = self.decoder.decode(tape, &child_pos)?; + let nulls = nulls + .as_mut() + .map(|x| NullBuffer::new(BooleanBuffer::new(x.finish(), 0, pos.len()))); let data = ArrayDataBuilder::new(self.data_type.clone()) .len(pos.len()) - .null_bit_buffer(nulls.as_mut().map(|x| x.finish())) - .null_count(null_count) + .nulls(nulls) .add_buffer(offsets.finish()) .child_data(vec![child_data]); diff --git a/arrow-json/src/raw/map_array.rs b/arrow-json/src/raw/map_array.rs index ac48d8bce1e7..dee142bef6db 100644 --- a/arrow-json/src/raw/map_array.rs +++ b/arrow-json/src/raw/map_array.rs @@ -18,6 +18,7 @@ use crate::raw::tape::{Tape, TapeElement}; use crate::raw::{make_decoder, tape_error, ArrayDecoder}; use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; +use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_buffer::ArrowNativeType; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; @@ -88,7 +89,6 @@ impl ArrayDecoder for MapArrayDecoder { let mut key_pos = Vec::with_capacity(pos.len()); let mut value_pos = Vec::with_capacity(pos.len()); - let mut null_count = 0; let mut nulls = self .is_nullable .then(|| BooleanBufferBuilder::new(pos.len())); @@ -102,7 +102,6 @@ impl ArrayDecoder for MapArrayDecoder { } (TapeElement::Null, Some(nulls)) => { nulls.append(false); - null_count += 1; p + 1 } (d, _) => return Err(tape_error(d, "{")), @@ -140,11 +139,14 @@ impl ArrayDecoder for MapArrayDecoder { // Valid by construction let struct_data = unsafe { struct_data.build_unchecked() }; + let nulls = nulls + .as_mut() + .map(|x| NullBuffer::new(BooleanBuffer::new(x.finish(), 0, pos.len()))); + let builder = 
ArrayDataBuilder::new(self.data_type.clone()) .len(pos.len()) .buffers(vec![offsets.finish()]) - .null_count(null_count) - .null_bit_buffer(nulls.as_mut().map(|x| x.finish())) + .nulls(nulls) .child_data(vec![struct_data]); // Safety: diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index 595a54c10a9e..a0dbcbd53eaa 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -481,9 +481,10 @@ mod tests { assert_eq!(batches.len(), 1); let list = as_list_array(batches[0].column(0).as_ref()); + assert_eq!(list.len(), 3); assert_eq!(list.value_offsets(), &[0, 0, 2, 2]); assert_eq!(list.null_count(), 1); - assert!(list.is_null(4)); + assert!(list.is_null(2)); let list_values = as_primitive_array::(list.values().as_ref()); assert_eq!(list_values.values(), &[5, 6]); @@ -501,10 +502,15 @@ mod tests { assert!(b.is_null(2)); let nested_list = as_struct_array(batches[0].column(2).as_ref()); + assert_eq!(nested_list.len(), 3); + assert_eq!(nested_list.null_count(), 1); + assert!(nested_list.is_null(2)); + let list2 = as_list_array(nested_list.column(0).as_ref()); + assert_eq!(list2.len(), 3); assert_eq!(list2.null_count(), 1); assert_eq!(list2.value_offsets(), &[0, 2, 2, 2]); - assert!(list2.is_null(3)); + assert!(list2.is_null(2)); let list2_values = as_struct_array(list2.values().as_ref()); diff --git a/arrow-json/src/raw/struct_array.rs b/arrow-json/src/raw/struct_array.rs index 64ceff22429b..1d0019993426 100644 --- a/arrow-json/src/raw/struct_array.rs +++ b/arrow-json/src/raw/struct_array.rs @@ -18,6 +18,7 @@ use crate::raw::tape::{Tape, TapeElement}; use crate::raw::{make_decoder, tape_error, ArrayDecoder}; use arrow_array::builder::BooleanBufferBuilder; +use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; @@ -54,7 +55,6 @@ impl ArrayDecoder for StructArrayDecoder { let mut child_pos: Vec<_> = (0..fields.len()).map(|_| vec![0; pos.len()]).collect(); - let mut null_count = 0; let mut nulls = self .is_nullable .then(|| BooleanBufferBuilder::new(pos.len())); @@ -68,7 +68,6 @@ impl ArrayDecoder for StructArrayDecoder { } (TapeElement::Null, Some(nulls)) => { nulls.append(false); - null_count += 1; continue; } (d, _) => return Err(tape_error(d, "{")), @@ -108,10 +107,13 @@ impl ArrayDecoder for StructArrayDecoder { .iter() .for_each(|x| assert_eq!(x.len(), pos.len())); + let nulls = nulls + .as_mut() + .map(|x| NullBuffer::new(BooleanBuffer::new(x.finish(), 0, pos.len()))); + let data = ArrayDataBuilder::new(self.data_type.clone()) .len(pos.len()) - .null_count(null_count) - .null_bit_buffer(nulls.as_mut().map(|x| x.finish())) + .nulls(nulls) .child_data(child_data); // Safety diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index 7df63bf8d662..0ef438e36950 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -2378,7 +2378,7 @@ mod tests { Buffer::from_slice_ref([0i32, 2, 3, 6, 6, 6, 7]) ); // compare list null buffers - assert_eq!(read.data().null_buffer(), expected.data().null_buffer()); + assert_eq!(read.data().nulls(), expected.data().nulls()); // build struct from list let struct_array = as_struct_array(read.values()); let expected_struct_array = as_struct_array(expected.values()); @@ -2389,8 +2389,8 @@ mod tests { assert_eq!(1, expected_struct_array.null_count()); // test struct's nulls assert_eq!( - struct_array.data().null_buffer(), - expected_struct_array.data().null_buffer() + struct_array.data().nulls(), + 
expected_struct_array.data().nulls() ); // test struct's fields let read_b = struct_array.column(0); diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index a4f1fdb88091..2702514edc83 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -104,10 +104,7 @@ pub fn eq_utf8( fn utf8_empty( left: &GenericStringArray, ) -> Result { - let null_bit_buffer = left - .data() - .null_buffer() - .map(|b| b.bit_slice(left.offset(), left.len())); + let null_bit_buffer = left.data().nulls().map(|b| b.inner().sliced()); let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(left.value_offsets().windows(2).map( @@ -213,10 +210,7 @@ pub fn eq_bool_scalar( DataType::Boolean, len, None, - left.data_ref() - .null_bitmap() - .as_ref() - .map(|b| b.buffer().bit_slice(left_offset, len)), + left.data().nulls().map(|b| b.inner().sliced()), 0, vec![values], vec![], @@ -1439,10 +1433,7 @@ where result_remainder.copy_from_slice(remainder_mask_as_bytes); } - let null_bit_buffer = left - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(left.offset(), left.len())); + let null_bit_buffer = left.data().nulls().map(|b| b.inner().sliced()); // null count is the same as in the input since the right side of the scalar comparison cannot be null let null_count = left.null_count(); diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index c4baa2283885..230eb9390f2f 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -675,7 +675,6 @@ fn sort_run_downcasted( // The function builds a valid run_ends array and hence need not be validated. ArrayDataBuilder::new(run_array.run_ends().data_type().clone()) .len(new_physical_len) - .null_count(0) .add_buffer(new_run_ends_builder.finish()) .build_unchecked() }; diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index dcd247be1a7b..833baac7b655 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -168,8 +168,7 @@ pub unsafe fn decode( let builder = ArrayDataBuilder::new(field.data_type.clone()) .len(rows.len()) - .null_count(canonical.null_count()) - .null_bit_buffer(canonical.data().null_buffer().cloned()) + .nulls(canonical.data().nulls().cloned()) .add_buffer(offsets.finish()) .add_child_data(child_data); diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index fde4b41b04cf..d8ea9fceb856 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -153,11 +153,12 @@ pub fn build_filter(filter: &BooleanArray) -> Result { /// Remove null values by do a bitmask AND operation with null bits and the boolean bits. 
pub fn prep_null_mask_filter(filter: &BooleanArray) -> BooleanArray { let array_data = filter.data_ref(); - let null_bitmap = array_data.null_buffer().unwrap(); + let nulls = array_data.nulls().unwrap(); let mask = filter.values(); let offset = filter.offset(); - let new_mask = buffer_bin_and(mask, offset, null_bitmap, offset, filter.len()); + let new_mask = + buffer_bin_and(mask, offset, nulls.buffer(), nulls.offset(), filter.len()); let array_data = ArrayData::builder(DataType::Boolean) .len(filter.len()) @@ -410,7 +411,8 @@ fn filter_null_mask( return None; } - let nulls = filter_bits(data.null_buffer()?, data.offset(), predicate); + let nulls = data.nulls()?; + let nulls = filter_bits(nulls.buffer(), nulls.offset(), predicate); // The filtered `nulls` has a length of `predicate.count` bits and // therefore the null count is this minus the number of valid bits let null_count = predicate.count - nulls.count_set_bits_offset(0, predicate.count); diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 34876e948b9d..4b052ce004cc 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -53,13 +53,13 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result ( + let (right, r_offset) = match right_data.nulls() { + Some(nulls) => ( buffer_bin_and( &right_data.buffers()[0], right_data.offset(), - buffer, - right_data.offset(), + nulls.buffer(), + nulls.offset(), len, ), 0, @@ -69,15 +69,21 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { let mut valid_count = 0; - let b = - bitwise_bin_op_helper(left, l_offset, &right, r_offset, len, |l, r| { + let b = bitwise_bin_op_helper( + left.buffer(), + left.offset(), + &right, + r_offset, + len, + |l, r| { let t = l & !r; valid_count += t.count_ones() as usize; t - }); + }, + ); (b, len - valid_count) } None => { diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 6436dc0d56e4..771a7eeb5c5a 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -340,12 +340,7 @@ where // Soundness: `slice.map` is `TrustedLen`. let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; - Ok(( - buffer, - indices_data - .null_buffer() - .map(|b| b.bit_slice(indices_data.offset(), indices.len())), - )) + Ok((buffer, indices_data.nulls().map(|b| b.inner().sliced()))) } // take implementation when both values and indices contain nulls @@ -530,14 +525,11 @@ where IndexType::Native: ToPrimitive, { let val_buf = take_bits(values.values(), values.offset(), indices)?; - let null_buf = match values.data().null_buffer() { - Some(buf) if values.null_count() > 0 => { - Some(take_bits(buf, values.offset(), indices)?) + let null_buf = match values.data().nulls() { + Some(nulls) if nulls.null_count() > 0 => { + Some(take_bits(nulls.buffer(), nulls.offset(), indices)?) 
} - _ => indices - .data() - .null_buffer() - .map(|b| b.bit_slice(indices.offset(), indices.len())), + _ => indices.data().nulls().map(|b| b.inner().sliced()), }; let data = unsafe { @@ -626,7 +618,7 @@ where } *offset = length_so_far; } - nulls = indices.data_ref().null_buffer().cloned(); + nulls = indices.data().nulls().map(|b| b.inner().sliced()); } else { let num_bytes = bit_util::ceil(data_len, 8); @@ -791,7 +783,7 @@ where values.data_type().clone(), new_keys.len(), Some(new_keys_data.null_count()), - new_keys_data.null_buffer().cloned(), + new_keys_data.nulls().map(|b| b.inner().sliced()), 0, new_keys_data.buffers().to_vec(), values.data().child_data().to_vec(), @@ -1639,7 +1631,7 @@ mod tests { let expected_list_data = ArrayData::builder(list_data_type) .len(5) // null buffer remains the same as only the indices have nulls - .null_bit_buffer(index.data().null_buffer().cloned()) + .nulls(index.data().nulls().cloned()) .add_buffer(expected_offsets) .add_child_data(expected_data) .build() @@ -1713,7 +1705,7 @@ mod tests { let expected_list_data = ArrayData::builder(list_data_type) .len(5) // null buffer remains the same as only the indices have nulls - .null_bit_buffer(index.data().null_buffer().cloned()) + .nulls(index.data().nulls().cloned()) .add_buffer(expected_offsets) .add_child_data(expected_data) .build() diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index 9651bef2771f..cd588fe01c6b 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -37,10 +37,7 @@ macro_rules! unary_offsets { // `values` come from a slice iterator with a known size. let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) }; - let null_bit_buffer = $array - .data_ref() - .null_buffer() - .map(|b| b.bit_slice($array.offset(), $array.len())); + let null_bit_buffer = $array.data().nulls().map(|b| b.inner().sliced()); let data = unsafe { ArrayData::new_unchecked( diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index 4072d8ba07e5..bf6e60cfeaaa 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -122,7 +122,7 @@ pub fn regexp_is_match_utf8_scalar( regex: &str, flag: Option<&str>, ) -> Result { - let null_bit_buffer = array.data().null_buffer().cloned(); + let null_bit_buffer = array.data().nulls().map(|x| x.inner().sliced()); let mut result = BooleanBufferBuilder::new(array.len()); let pattern = match flag { diff --git a/arrow-string/src/substring.rs b/arrow-string/src/substring.rs index 7d04304771a6..a59a54d7e6e4 100644 --- a/arrow-string/src/substring.rs +++ b/arrow-string/src/substring.rs @@ -210,10 +210,7 @@ pub fn substring_by_char( GenericStringArray::::DATA_TYPE, array.len(), None, - array - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(array.offset(), array.len())), + array.data().nulls().map(|b| b.inner().sliced()), 0, vec![new_offsets.finish(), vals.finish()], vec![], @@ -297,10 +294,7 @@ fn binary_substring( GenericBinaryArray::::DATA_TYPE, array.len(), None, - array - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(array.offset(), array.len())), + array.data().nulls().map(|b| b.inner().sliced()), 0, vec![Buffer::from_slice_ref(&new_offsets), new_values.into()], vec![], @@ -345,10 +339,7 @@ fn fixed_size_binary_substring( DataType::FixedSizeBinary(new_len), num_of_elements, None, - array - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(array.offset(), num_of_elements)), + array.data().nulls().map(|b| b.inner().sliced()), 0, vec![new_values.into()], vec![], @@ -427,10 +418,7 @@ fn 
utf8_substring( GenericStringArray::::DATA_TYPE, array.len(), None, - array - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(array.offset(), array.len())), + array.data().nulls().map(|b| b.inner().sliced()), 0, vec![Buffer::from_slice_ref(&new_offsets), new_values.into()], vec![], diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index f7ce24a97d2a..3d1bced298c9 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -306,10 +306,6 @@ pub use arrow_array::{downcast_dictionary_array, downcast_primitive_array}; pub use arrow_buffer::{alloc, buffer}; -pub mod bitmap { - pub use arrow_data::Bitmap; -} - pub mod array; pub mod compute; #[cfg(feature = "csv")] diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 3cdec46b59a0..7e45ee7afcda 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -943,17 +943,7 @@ fn test_try_new_sliced_struct() { let struct_array_slice = struct_array.slice(1, 3); let struct_array_data = struct_array_slice.data(); - let cloned_data = ArrayData::try_new( - struct_array_slice.data_type().clone(), - struct_array_slice.len(), - struct_array_data.null_buffer().cloned(), - struct_array_slice.offset(), - struct_array_data.buffers().to_vec(), - struct_array_data.child_data().to_vec(), - ) - .unwrap(); - let cloned = make_array(cloned_data); - + let cloned = make_array(struct_array_data.clone()); assert_eq!(&struct_array_slice, &cloned); } diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 11ed35263e6a..de4cba4adb33 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -270,12 +270,12 @@ impl LevelInfoBuilder { }) }; - match list_data.null_bitmap() { + match list_data.nulls() { Some(nulls) => { - let null_offset = list_data.offset() + range.start; + let null_offset = range.start; // TODO: Faster bitmask iteration (#1757) for (idx, w) in offsets.windows(2).enumerate() { - let is_valid = nulls.is_set(idx + null_offset); + let is_valid = nulls.is_valid(idx + null_offset); let start_idx = w[0].as_usize(); let end_idx = w[1].as_usize(); if !is_valid { @@ -329,15 +329,14 @@ impl LevelInfoBuilder { } }; - match array.data().null_bitmap() { + match array.data().nulls() { Some(validity) => { - let null_offset = array.data().offset(); let mut last_non_null_idx = None; let mut last_null_idx = None; // TODO: Faster bitmask iteration (#1757) for i in range.clone() { - match validity.is_set(i + null_offset) { + match validity.is_valid(i) { true => { if let Some(last_idx) = last_null_idx.take() { write_null(children, last_idx..i) @@ -379,12 +378,11 @@ impl LevelInfoBuilder { def_levels.reserve(len); info.non_null_indices.reserve(len); - match array.data().null_bitmap() { + match array.data().nulls() { Some(nulls) => { - let nulls_offset = array.data().offset(); // TODO: Faster bitmask iteration (#1757) for i in range { - match nulls.is_set(i + nulls_offset) { + match nulls.is_valid(i) { true => { def_levels.push(info.max_def_level); info.non_null_indices.push(i) diff --git a/parquet/src/arrow/record_reader/definition_levels.rs b/parquet/src/arrow/record_reader/definition_levels.rs index 84b7ab94cebb..7c27a365fc28 100644 --- a/parquet/src/arrow/record_reader/definition_levels.rs +++ b/parquet/src/arrow/record_reader/definition_levels.rs @@ -20,7 +20,6 @@ use std::ops::Range; use arrow_array::builder::BooleanBufferBuilder; use arrow_buffer::bit_chunk_iterator::UnalignedBitChunk; use arrow_buffer::Buffer; -use 
arrow_data::Bitmap; use crate::arrow::buffer::bit_util::count_set_bits; use crate::arrow::record_reader::buffer::BufferQueue; @@ -105,7 +104,7 @@ impl DefinitionLevelBuffer { } /// Split `len` levels out of `self` - pub fn split_bitmask(&mut self, len: usize) -> Bitmap { + pub fn split_bitmask(&mut self, len: usize) -> Buffer { let old_builder = match &mut self.inner { BufferInner::Full { nulls, .. } => nulls, BufferInner::Mask { nulls } => nulls, @@ -124,7 +123,7 @@ impl DefinitionLevelBuffer { // Swap into self self.len = new_builder.len(); - Bitmap::from(std::mem::replace(old_builder, new_builder).finish()) + std::mem::replace(old_builder, new_builder).finish() } pub fn nulls(&self) -> &BooleanBufferBuilder { @@ -516,7 +515,7 @@ mod tests { let bitmap = buffer.split_bitmask(19); // Should have split off 19 records leaving, 81 behind - assert_eq!(bitmap.bit_len(), 3 * 8); // Note: bitmask only tracks bytes not bits + assert_eq!(bitmap.len(), 3); // Note: bitmask only tracks bytes not bits assert_eq!(buffer.nulls().len(), 81); } } diff --git a/parquet/src/arrow/record_reader/mod.rs b/parquet/src/arrow/record_reader/mod.rs index ef17b8d0e6f4..e47bdee1c38a 100644 --- a/parquet/src/arrow/record_reader/mod.rs +++ b/parquet/src/arrow/record_reader/mod.rs @@ -18,7 +18,6 @@ use std::cmp::{max, min}; use arrow_buffer::Buffer; -use arrow_data::Bitmap; use crate::arrow::record_reader::{ buffer::{BufferQueue, ScalarBuffer, ValuesBuffer}, @@ -271,7 +270,7 @@ where /// Returns currently stored null bitmap data. /// The side effect is similar to `consume_def_levels`. pub fn consume_bitmap_buffer(&mut self) -> Option { - self.consume_bitmap().map(|b| b.into_buffer()) + self.consume_bitmap() } /// Reset state of record reader. @@ -284,7 +283,7 @@ where } /// Returns bitmap data. 
- pub fn consume_bitmap(&mut self) -> Option { + pub fn consume_bitmap(&mut self) -> Option { self.def_levels .as_mut() .map(|levels| levels.split_bitmask(self.num_values)) @@ -409,7 +408,6 @@ fn packed_null_mask(descr: &ColumnDescPtr) -> bool { mod tests { use std::sync::Arc; - use arrow::bitmap::Bitmap; use arrow::buffer::Buffer; use arrow_array::builder::{Int16BufferBuilder, Int32BufferBuilder}; @@ -584,8 +582,7 @@ mod tests { // Verify bitmap let expected_valid = &[false, true, false, true, true, false, true]; let expected_buffer = Buffer::from_iter(expected_valid.iter().cloned()); - let expected_bitmap = Bitmap::from(expected_buffer); - assert_eq!(Some(expected_bitmap), record_reader.consume_bitmap()); + assert_eq!(Some(expected_buffer), record_reader.consume_bitmap()); // Verify result record data let actual = record_reader.consume_record_data(); @@ -695,8 +692,7 @@ mod tests { // Verify bitmap let expected_valid = &[true, false, false, true, true, true, true, true, true]; let expected_buffer = Buffer::from_iter(expected_valid.iter().cloned()); - let expected_bitmap = Bitmap::from(expected_buffer); - assert_eq!(Some(expected_bitmap), record_reader.consume_bitmap()); + assert_eq!(Some(expected_buffer), record_reader.consume_bitmap()); // Verify result record data let actual = record_reader.consume_record_data(); @@ -966,8 +962,7 @@ mod tests { // Verify bitmap let expected_valid = &[false, true, true]; let expected_buffer = Buffer::from_iter(expected_valid.iter().cloned()); - let expected_bitmap = Bitmap::from(expected_buffer); - assert_eq!(Some(expected_bitmap), record_reader.consume_bitmap()); + assert_eq!(Some(expected_buffer), record_reader.consume_bitmap()); // Verify result record data let actual = record_reader.consume_record_data(); From 231ae9b31769b62da368b9f1eb355a840540cb06 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Mar 2023 16:07:26 +0000 Subject: [PATCH 0643/1411] Return Buffers from ArrayData::buffers instead of slice (#1799) (#3783) * Return Buffers from ArrayData::buffers instead of slice (#1799) * Clippy --- arrow-arith/src/boolean.rs | 2 +- arrow-array/src/array/primitive_array.rs | 4 +- arrow-array/src/array/union_array.rs | 30 ++++---- arrow-csv/src/reader/mod.rs | 2 +- arrow-data/src/data/buffers.rs | 97 ++++++++++++++++++++++++ arrow-data/src/data/mod.rs | 10 ++- arrow-json/src/reader.rs | 6 +- arrow-select/src/filter.rs | 6 +- arrow-select/src/nullif.rs | 2 +- arrow/src/ffi.rs | 4 +- 10 files changed, 132 insertions(+), 31 deletions(-) create mode 100644 arrow-data/src/data/buffers.rs diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index 5bd39a673426..61942dc90b81 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -353,7 +353,7 @@ pub fn not(left: &BooleanArray) -> Result { let data = left.data_ref(); let null_bit_buffer = data.nulls().map(|b| b.inner().sliced()); - let values = buffer_unary_not(&data.buffers()[0], left_offset, len); + let values = buffer_unary_not(data.buffers()[0], left_offset, len); let data = unsafe { ArrayData::new_unchecked( diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 0e28060b25f8..408f0c4ae96a 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1234,7 +1234,7 @@ mod tests { fn test_primitive_array_from_vec() { let buf = Buffer::from_slice_ref([0, 1, 2, 3, 4]); let arr = Int32Array::from(vec![0, 1, 2, 3, 4]); - 
assert_eq!(buf, arr.data.buffers()[0]); + assert_eq!(buf, *arr.data.buffers()[0]); assert_eq!(5, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); @@ -1740,7 +1740,7 @@ mod tests { .build() .unwrap(); let arr = Int32Array::from(data); - assert_eq!(buf2, arr.data.buffers()[0]); + assert_eq!(buf2, *arr.data.buffers()[0]); assert_eq!(5, arr.len()); assert_eq!(0, arr.null_count()); for i in 0..3 { diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index f215fb0def9a..867eb8d59fde 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -409,7 +409,7 @@ mod tests { // Check type ids assert_eq!( - union.data().buffers()[0], + *union.data().buffers()[0], Buffer::from_slice_ref(&expected_type_ids) ); for (i, id) in expected_type_ids.iter().enumerate() { @@ -418,7 +418,7 @@ mod tests { // Check offsets assert_eq!( - union.data().buffers()[1], + *union.data().buffers()[1], Buffer::from_slice_ref(&expected_value_offsets) ); for (i, id) in expected_value_offsets.iter().enumerate() { @@ -427,15 +427,15 @@ mod tests { // Check data assert_eq!( - union.data().child_data()[0].buffers()[0], + *union.data().child_data()[0].buffers()[0], Buffer::from_slice_ref([1_i32, 4, 6]) ); assert_eq!( - union.data().child_data()[1].buffers()[0], + *union.data().child_data()[1].buffers()[0], Buffer::from_slice_ref([2_i32, 7]) ); assert_eq!( - union.data().child_data()[2].buffers()[0], + *union.data().child_data()[2].buffers()[0], Buffer::from_slice_ref([3_i32, 5]), ); @@ -467,7 +467,7 @@ mod tests { // Check type ids assert_eq!( - union.data().buffers()[0], + *union.data().buffers()[0], Buffer::from_slice_ref(&expected_type_ids) ); for (i, id) in expected_type_ids.iter().enumerate() { @@ -476,7 +476,7 @@ mod tests { // Check offsets assert_eq!( - union.data().buffers()[1], + *union.data().buffers()[1], Buffer::from_slice_ref(&expected_value_offsets) ); for (i, id) in expected_value_offsets.iter().enumerate() { @@ -660,7 +660,7 @@ mod tests { .unwrap(); // Check type ids - assert_eq!(Buffer::from_slice_ref(type_ids), array.data().buffers()[0]); + assert_eq!(Buffer::from_slice_ref(type_ids), *array.data().buffers()[0]); for (i, id) in type_ids.iter().enumerate() { assert_eq!(id, &array.type_id(i)); } @@ -668,7 +668,7 @@ mod tests { // Check offsets assert_eq!( Buffer::from_slice_ref(value_offsets), - array.data().buffers()[1] + *array.data().buffers()[1] ); for (i, id) in value_offsets.iter().enumerate() { assert_eq!(id, &array.value_offset(i)); @@ -736,7 +736,7 @@ mod tests { // Check type ids assert_eq!( Buffer::from_slice_ref(&expected_type_ids), - union.data().buffers()[0] + *union.data().buffers()[0] ); for (i, id) in expected_type_ids.iter().enumerate() { assert_eq!(id, &union.type_id(i)); @@ -747,16 +747,16 @@ mod tests { // Check data assert_eq!( - union.data().child_data()[0].buffers()[0], + *union.data().child_data()[0].buffers()[0], Buffer::from_slice_ref([1_i32, 0, 0, 4, 0, 6, 0]), ); assert_eq!( Buffer::from_slice_ref([0_i32, 2_i32, 0, 0, 0, 0, 7]), - union.data().child_data()[1].buffers()[0] + *union.data().child_data()[1].buffers()[0] ); assert_eq!( Buffer::from_slice_ref([0_i32, 0, 3_i32, 0, 5, 0, 0]), - union.data().child_data()[2].buffers()[0] + *union.data().child_data()[2].buffers()[0] ); assert_eq!(expected_array_values.len(), union.len()); @@ -785,7 +785,7 @@ mod tests { // Check type ids assert_eq!( Buffer::from_slice_ref(&expected_type_ids), - union.data().buffers()[0] + *union.data().buffers()[0] ); 
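// Illustrative sketch (not part of the patch): `ArrayData::buffers` now returns the
// lightweight `Buffers<'_>` view instead of `&[Buffer]`; indexing yields `&Buffer`
// (hence the added derefs in the asserts above) and the view still iterates.
use arrow_array::{Array, Int32Array};
use arrow_buffer::Buffer;

fn main() {
    let array = Int32Array::from(vec![1, 2, 3]);
    let buffers = array.data().buffers(); // `Buffers<'_>`
    assert_eq!(buffers.len(), 1); // a primitive array has a single values buffer
    assert_eq!(*buffers[0], Buffer::from_slice_ref([1i32, 2, 3]));
    let owned: Vec<Buffer> = buffers.iter().cloned().collect();
    assert_eq!(owned.len(), 1);
}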
for (i, id) in expected_type_ids.iter().enumerate() { assert_eq!(id, &union.type_id(i)); @@ -847,7 +847,7 @@ mod tests { // Check type ids assert_eq!( Buffer::from_slice_ref(&expected_type_ids), - union.data().buffers()[0] + *union.data().buffers()[0] ); for (i, id) in expected_type_ids.iter().enumerate() { assert_eq!(id, &union.type_id(i)); diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index e78f2d0ba718..84d55c4ae24b 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -2482,7 +2482,7 @@ mod tests { for v in *values { t.update(v, None) } - assert_eq!(&t.get(), expected, "{:?}", values) + assert_eq!(&t.get(), expected, "{values:?}") } } } diff --git a/arrow-data/src/data/buffers.rs b/arrow-data/src/data/buffers.rs new file mode 100644 index 000000000000..3b57bfe0e23c --- /dev/null +++ b/arrow-data/src/data/buffers.rs @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_buffer::Buffer; +use std::iter::Chain; +use std::ops::Index; + +/// A collection of [`Buffer`] +#[derive(Debug, Default, Copy, Clone, Eq, PartialEq)] +pub struct Buffers<'a>([Option<&'a Buffer>; 2]); + +impl<'a> Buffers<'a> { + /// Temporary will be removed once ArrayData does not store `Vec` directly (#3769) + #[inline] + pub(crate) fn from_slice(a: &'a [Buffer]) -> Self { + match a.len() { + 0 => Self([None, None]), + 1 => Self([Some(&a[0]), None]), + _ => Self([Some(&a[0]), Some(&a[1])]), + } + } + + /// Returns the number of [`Buffer`] in this collection + #[inline] + pub fn len(&self) -> usize { + self.0[0].is_some() as usize + self.0[1].is_some() as usize + } + + /// Returns `true` if this collection is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.0[0].is_none() && self.0[1].is_none() + } + + #[inline] + pub fn iter(&self) -> IntoIter<'a> { + self.into_iter() + } + + /// Converts this [`Buffers`] to a `Vec` + #[inline] + pub fn to_vec(&self) -> Vec { + self.iter().cloned().collect() + } +} + +impl<'a> Index for Buffers<'a> { + type Output = &'a Buffer; + + #[inline] + fn index(&self, index: usize) -> &Self::Output { + self.0[index].as_ref().unwrap() + } +} + +impl<'a> IntoIterator for Buffers<'a> { + type Item = &'a Buffer; + type IntoIter = IntoIter<'a>; + + #[inline] + fn into_iter(self) -> Self::IntoIter { + IntoIter(self.0[0].into_iter().chain(self.0[1].into_iter())) + } +} + +type OptionIter<'a> = std::option::IntoIter<&'a Buffer>; + +/// [`Iterator`] for [`Buffers`] +pub struct IntoIter<'a>(Chain, OptionIter<'a>>); + +impl<'a> Iterator for IntoIter<'a> { + type Item = &'a Buffer; + + #[inline] + fn next(&mut self) -> Option { + self.0.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } +} diff --git a/arrow-data/src/data/mod.rs 
b/arrow-data/src/data/mod.rs index d76cb9eb19e6..051deef07305 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -30,6 +30,9 @@ use std::sync::Arc; use crate::equal; +mod buffers; +pub use buffers::*; + #[allow(unused)] // Private until ready (#1176) mod bytes; #[allow(unused)] // Private until ready (#1176) @@ -371,9 +374,10 @@ impl ArrayData { &self.data_type } - /// Returns a slice of the [`Buffer`]s that hold the data. - pub fn buffers(&self) -> &[Buffer] { - &self.buffers[..] + /// Returns the [`Buffers`] storing data for this [`ArrayData`] + pub fn buffers(&self) -> Buffers<'_> { + // In future ArrayData won't store data contiguously as `Vec` (#1799) + Buffers::from_slice(&self.buffers) } /// Returns a slice of children [`ArrayData`]. This will be non diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index 0ef438e36950..3ac39c110fc9 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -2210,7 +2210,7 @@ mod tests { .unwrap(); // test that the list offsets are correct assert_eq!( - cc.data().buffers()[0], + *cc.data().buffers()[0], Buffer::from_slice_ref([0i32, 2, 2, 4, 5]) ); let cc = as_boolean_array(cc.values()); @@ -2230,7 +2230,7 @@ mod tests { .unwrap(); // test that the list offsets are correct assert_eq!( - dd.data().buffers()[0], + *dd.data().buffers()[0], Buffer::from_slice_ref([0i32, 1, 1, 2, 6]) ); @@ -2374,7 +2374,7 @@ mod tests { let read: &ListArray = read.as_any().downcast_ref::().unwrap(); let expected = expected.as_any().downcast_ref::().unwrap(); assert_eq!( - read.data().buffers()[0], + *read.data().buffers()[0], Buffer::from_slice_ref([0i32, 2, 3, 6, 6, 6, 7]) ); // compare list null buffers diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index d8ea9fceb856..6ba08746d8fc 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -82,7 +82,7 @@ impl<'a> IndexIterator<'a> { fn new(filter: &'a BooleanArray, remaining: usize) -> Self { assert_eq!(filter.null_count(), 0); let data = filter.data(); - let iter = BitIndexIterator::new(&data.buffers()[0], data.offset(), data.len()); + let iter = BitIndexIterator::new(data.buffers()[0], data.offset(), data.len()); Self { remaining, iter } } } @@ -470,7 +470,7 @@ fn filter_boolean(values: &BooleanArray, predicate: &FilterPredicate) -> Boolean assert_eq!(data.buffers().len(), 1); assert_eq!(data.child_data().len(), 0); - let values = filter_bits(&data.buffers()[0], data.offset(), predicate); + let values = filter_bits(data.buffers()[0], data.offset(), predicate); let mut builder = ArrayDataBuilder::new(DataType::Boolean) .len(predicate.count) @@ -572,7 +572,7 @@ where Self { src_offsets: array.value_offsets(), - src_values: &array.data().buffers()[1], + src_values: array.data().buffers()[1], dst_offsets, dst_values, cur_offset, diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 4b052ce004cc..ea0c8e3d526c 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -56,7 +56,7 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result ( buffer_bin_and( - &right_data.buffers()[0], + right_data.buffers()[0], right_data.offset(), nulls.buffer(), nulls.offset(), diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 4d62b9e7cf61..c767a69e6bdf 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -1158,7 +1158,7 @@ mod tests { // Check type ids assert_eq!( Buffer::from_slice_ref(&expected_type_ids), - array.data().buffers()[0] + *array.data().buffers()[0] ); for (i, id) in 
expected_type_ids.iter().enumerate() { assert_eq!(id, &array.type_id(i)); @@ -1222,7 +1222,7 @@ mod tests { // Check type ids assert_eq!( Buffer::from_slice_ref(&expected_type_ids), - array.data().buffers()[0] + *array.data().buffers()[0] ); for (i, id) in expected_type_ids.iter().enumerate() { assert_eq!(id, &array.type_id(i)); From 4f6729673b9c97e3daa8983ef52c51033b72a741 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Mar 2023 16:57:52 +0000 Subject: [PATCH 0644/1411] Make LocalFileSystem::put atomic (#3780) (#3781) * Make LocalFileSystem::put atomic (#3780) * Clippy * Add list test --- object_store/src/lib.rs | 6 +++ object_store/src/local.rs | 106 ++++++++++++++++++-------------------- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 6a3275bb06e6..671b22d0f9a5 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -840,6 +840,12 @@ mod tests { crate::Error::NotFound { .. } )); + let files = flatten_list_stream(storage, None).await.unwrap(); + assert_eq!(&files, &[]); + + let result = storage.list_with_delimiter(None).await.unwrap(); + assert_eq!(&result.objects, &[]); + writer.shutdown().await.unwrap(); let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap(); assert_eq!(bytes_expected, bytes_written); diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 9a518ba4735a..f1733f54bab1 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -27,8 +27,8 @@ use futures::future::BoxFuture; use futures::FutureExt; use futures::{stream::BoxStream, StreamExt}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; -use std::fs::{metadata, symlink_metadata, File}; -use std::io::{Read, Seek, SeekFrom, Write}; +use std::fs::{metadata, symlink_metadata, File, OpenOptions}; +use std::io::{ErrorKind, Read, Seek, SeekFrom, Write}; use std::ops::Range; use std::pin::Pin; use std::sync::Arc; @@ -65,6 +65,11 @@ pub(crate) enum Error { source: io::Error, }, + #[snafu(display("Unable to rename file: {}", source))] + UnableToRenameFile { + source: io::Error, + }, + #[snafu(display("Unable to create dir {}: {}", path.display(), source))] UnableToCreateDir { source: io::Error, @@ -266,11 +271,14 @@ impl ObjectStore for LocalFileSystem { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { - let mut file = open_writable_file(&path)?; + let (mut file, suffix) = new_staged_upload(&path)?; + let staging_path = staged_upload_path(&path, &suffix); file.write_all(&bytes) .context(UnableToCopyDataToFileSnafu)?; + std::fs::rename(staging_path, path).context(UnableToRenameFileSnafu)?; + Ok(()) }) .await @@ -282,28 +290,10 @@ impl ObjectStore for LocalFileSystem { ) -> Result<(MultipartId, Box)> { let dest = self.config.path_to_filesystem(location)?; - // Generate an id in case of concurrent writes - let mut multipart_id = 1; - - // Will write to a temporary path - let staging_path = loop { - let staging_path = get_upload_stage_path(&dest, &multipart_id.to_string()); - - match std::fs::metadata(&staging_path) { - Err(err) if err.kind() == io::ErrorKind::NotFound => break staging_path, - Err(err) => { - return Err(Error::UnableToCopyDataToFile { source: err }.into()) - } - Ok(_) => multipart_id += 1, - } - }; - let multipart_id = multipart_id.to_string(); - - let file = open_writable_file(&staging_path)?; - + let (file, suffix) = new_staged_upload(&dest)?; Ok(( - multipart_id.clone(), - 
Box::new(LocalUpload::new(dest, multipart_id, Arc::new(file))), + suffix.clone(), + Box::new(LocalUpload::new(dest, suffix, Arc::new(file))), )) } @@ -313,7 +303,7 @@ impl ObjectStore for LocalFileSystem { multipart_id: &MultipartId, ) -> Result<()> { let dest = self.config.path_to_filesystem(location)?; - let staging_path: PathBuf = get_upload_stage_path(&dest, multipart_id); + let staging_path: PathBuf = staged_upload_path(&dest, multipart_id); maybe_spawn_blocking(move || { std::fs::remove_file(&staging_path) @@ -553,9 +543,40 @@ impl ObjectStore for LocalFileSystem { } } -fn get_upload_stage_path(dest: &std::path::Path, multipart_id: &MultipartId) -> PathBuf { +/// Generates a unique file path `{base}#{suffix}`, returning the opened `File` and `suffix` +/// +/// Creates any directories if necessary +fn new_staged_upload(base: &std::path::Path) -> Result<(File, String)> { + let mut multipart_id = 1; + loop { + let suffix = multipart_id.to_string(); + let path = staged_upload_path(base, &suffix); + let mut options = OpenOptions::new(); + match options.read(true).write(true).create_new(true).open(&path) { + Ok(f) => return Ok((f, suffix)), + Err(e) if e.kind() == ErrorKind::AlreadyExists => { + multipart_id += 1; + } + Err(err) if err.kind() == ErrorKind::NotFound => { + let parent = path + .parent() + .context(UnableToCreateFileSnafu { path: &path, err })?; + + std::fs::create_dir_all(parent) + .context(UnableToCreateDirSnafu { path: parent })?; + + continue; + } + Err(source) => return Err(Error::UnableToOpenFile { source, path }.into()), + } + } +} + +/// Returns the unique upload for the given path and suffix +fn staged_upload_path(dest: &std::path::Path, suffix: &str) -> PathBuf { let mut staging_path = dest.as_os_str().to_owned(); - staging_path.push(format!("#{multipart_id}")); + staging_path.push("#"); + staging_path.push(suffix); staging_path.into() } @@ -700,7 +721,7 @@ impl AsyncWrite for LocalUpload { Poll::Ready(res) => { res?; let staging_path = - get_upload_stage_path(&self.dest, &self.multipart_id); + staged_upload_path(&self.dest, &self.multipart_id); let dest = self.dest.clone(); self.inner_state = LocalUploadState::Committing(Box::pin( runtime @@ -741,7 +762,7 @@ impl AsyncWrite for LocalUpload { } } } else { - let staging_path = get_upload_stage_path(&self.dest, &self.multipart_id); + let staging_path = staged_upload_path(&self.dest, &self.multipart_id); match &mut self.inner_state { LocalUploadState::Idle(file) => { let file = Arc::clone(file); @@ -802,33 +823,6 @@ fn open_file(path: &PathBuf) -> Result { Ok(file) } -fn open_writable_file(path: &PathBuf) -> Result { - match File::create(path) { - Ok(f) => Ok(f), - Err(err) if err.kind() == std::io::ErrorKind::NotFound => { - let parent = path - .parent() - .context(UnableToCreateFileSnafu { path: &path, err })?; - std::fs::create_dir_all(parent) - .context(UnableToCreateDirSnafu { path: parent })?; - - match File::create(path) { - Ok(f) => Ok(f), - Err(err) => Err(Error::UnableToCreateFile { - path: path.to_path_buf(), - err, - } - .into()), - } - } - Err(err) => Err(Error::UnableToCreateFile { - path: path.to_path_buf(), - err, - } - .into()), - } -} - fn convert_entry(entry: DirEntry, location: Path) -> Result { let metadata = entry .metadata() From 661bbad8f817613c9bd5cab8616dcfaa37858865 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Mar 2023 17:07:50 +0000 Subject: [PATCH 0645/1411] Add ObjectStore::append (#3791) --- object_store/src/lib.rs | 37 
+++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 671b22d0f9a5..3af538254183 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -278,7 +278,11 @@ pub type MultipartId = String; /// Universal API to multiple object store services. #[async_trait] pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { - /// Save the provided bytes to the specified location. + /// Save the provided bytes to the specified location + /// + /// The operation is guaranteed to be atomic, it will either successfully + /// write the entirety of `bytes` to `location`, or fail. No clients + /// should be able to observe a partially written object async fn put(&self, location: &Path, bytes: Bytes) -> Result<()>; /// Get a multi-part upload that allows writing data in chunks @@ -286,7 +290,9 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Most cloud-based uploads will buffer and upload parts in parallel. /// /// To complete the upload, [AsyncWrite::poll_shutdown] must be called - /// to completion. + /// to completion. This operation is guaranteed to be atomic, it will either + /// make all the written data available at `location`, or fail. No clients + /// should be able to observe a partially written object /// /// For some object stores (S3, GCS, and local in particular), if the /// writer fails or panics, you must call [ObjectStore::abort_multipart] @@ -306,6 +312,33 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { multipart_id: &MultipartId, ) -> Result<()>; + /// Returns an [`AsyncWrite`] that can be used to append to the object at `location` + /// + /// A new object will be created if it doesn't already exist, otherwise it will be + /// opened, with subsequent writes appended to the end. + /// + /// This operation cannot be supported by all stores, most use-cases should prefer + /// [`ObjectStore::put`] and [`ObjectStore::put_multipart`] for better portability + /// and stronger guarantees + /// + /// This API is not guaranteed to be atomic, in particular + /// + /// * On error, `location` may contain partial data + /// * Concurrent calls to [`ObjectStore::list`] may return partially written objects + /// * Concurrent calls to [`ObjectStore::get`] may return partially written data + /// * Concurrent calls to [`ObjectStore::put`] may result in data loss / corruption + /// * Concurrent calls to [`ObjectStore::append`] may result in data loss / corruption + /// + /// Additionally some stores, such as Azure, may only support appending to objects created + /// with [`ObjectStore::append`], and not with [`ObjectStore::put`], [`ObjectStore::copy`], or + /// [`ObjectStore::put_multipart`] + async fn append( + &self, + _location: &Path, + ) -> Result> { + Err(Error::NotImplemented) + } + /// Return the bytes that are stored at the specified location. 
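// Illustrative sketch (not part of the patch): the atomicity documented above for
// `put` is obtained on a local filesystem with the stage-then-rename pattern adopted
// by `LocalFileSystem` in the previous commit; `atomic_put` below is a hypothetical
// std-only helper, not an API of this crate.
use std::fs::{self, OpenOptions};
use std::io::Write;
use std::path::Path;

fn atomic_put(dest: &Path, bytes: &[u8]) -> std::io::Result<()> {
    // Stage into a sibling file; `create_new` fails if a concurrent writer won the race.
    let staging = dest.with_extension("stage");
    let mut file = OpenOptions::new().write(true).create_new(true).open(&staging)?;
    file.write_all(bytes)?;
    // Renaming within one filesystem is atomic: readers observe either the previous
    // object or the fully written one, never a partially written file.
    fs::rename(&staging, dest)
}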
async fn get(&self, location: &Path) -> Result; From f8abb047519e2be6044882ef4469ffcd8b6e7c56 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Fri, 3 Mar 2023 12:00:23 +0100 Subject: [PATCH 0646/1411] feat: Implement concat_elements_dyn kernel (#3763) --- arrow-string/src/concat_elements.rs | 150 +++++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) diff --git a/arrow-string/src/concat_elements.rs b/arrow-string/src/concat_elements.rs index 78fe3a47d1b9..4aa5a127c920 100644 --- a/arrow-string/src/concat_elements.rs +++ b/arrow-string/src/concat_elements.rs @@ -15,11 +15,13 @@ // specific language governing permissions and limitations // under the License. +use std::sync::Arc; + use arrow_array::builder::BufferBuilder; use arrow_array::*; use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayDataBuilder; -use arrow_schema::ArrowError; +use arrow_schema::{ArrowError, DataType}; /// Returns the elementwise concatenation of a [`StringArray`]. /// @@ -156,6 +158,93 @@ pub fn concat_elements_utf8_many( Ok(unsafe { builder.build_unchecked() }.into()) } +pub fn concat_element_binary( + left: &GenericBinaryArray, + right: &GenericBinaryArray, +) -> Result, ArrowError> { + if left.len() != right.len() { + return Err(ArrowError::ComputeError(format!( + "Arrays must have the same length: {} != {}", + left.len(), + right.len() + ))); + } + + let output_bitmap = combine_option_bitmap(&[left.data(), right.data()], left.len()); + + let left_offsets = left.value_offsets(); + let right_offsets = right.value_offsets(); + + let left_values = left.value_data(); + let right_values = right.value_data(); + + let mut output_values = BufferBuilder::::new( + left_values.len() + right_values.len() + - left_offsets[0].as_usize() + - right_offsets[0].as_usize(), + ); + + let mut output_offsets = BufferBuilder::::new(left_offsets.len()); + output_offsets.append(Offset::zero()); + for (left_idx, right_idx) in left_offsets.windows(2).zip(right_offsets.windows(2)) { + output_values + .append_slice(&left_values[left_idx[0].as_usize()..left_idx[1].as_usize()]); + output_values.append_slice( + &right_values[right_idx[0].as_usize()..right_idx[1].as_usize()], + ); + output_offsets.append(Offset::from_usize(output_values.len()).unwrap()); + } + + let builder = ArrayDataBuilder::new(GenericBinaryArray::::DATA_TYPE) + .len(left.len()) + .add_buffer(output_offsets.finish()) + .add_buffer(output_values.finish()) + .null_bit_buffer(output_bitmap); + + // SAFETY - offsets valid by construction + Ok(unsafe { builder.build_unchecked() }.into()) +} + +pub fn concat_elements_dyn( + left: &dyn Array, + right: &dyn Array, +) -> Result { + if left.data_type() != right.data_type() { + return Err(ArrowError::ComputeError(format!( + "Cannot concat arrays of different types: {} != {}", + left.data_type(), + right.data_type() + ))); + } + match (left.data_type(), right.data_type()) { + (DataType::Utf8, DataType::Utf8) => { + let left = left.as_any().downcast_ref::().unwrap(); + let right = right.as_any().downcast_ref::().unwrap(); + Ok(Arc::new(concat_elements_utf8(left, right).unwrap())) + } + (DataType::LargeUtf8, DataType::LargeUtf8) => { + let left = left.as_any().downcast_ref::().unwrap(); + let right = right.as_any().downcast_ref::().unwrap(); + Ok(Arc::new(concat_elements_utf8(left, right).unwrap())) + } + (DataType::Binary, DataType::Binary) => { + let left = left.as_any().downcast_ref::().unwrap(); + let right = right.as_any().downcast_ref::().unwrap(); + Ok(Arc::new(concat_element_binary(left, 
right).unwrap())) + } + (DataType::LargeBinary, DataType::LargeBinary) => { + let left = left.as_any().downcast_ref::().unwrap(); + let right = right.as_any().downcast_ref::().unwrap(); + Ok(Arc::new(concat_element_binary(left, right).unwrap())) + } + // unimplemented + _ => Err(ArrowError::NotYetImplemented(format!( + "concat not supported for {}", + left.data_type() + ))), + } +} + #[cfg(test)] mod tests { use super::*; @@ -301,4 +390,63 @@ mod tests { assert_eq!(output, expected); } + + #[test] + fn test_concat_dyn_same_type() { + // test for StringArray + let left = StringArray::from(vec![Some("foo"), Some("bar"), None]); + let right = StringArray::from(vec![None, Some("yyy"), Some("zzz")]); + + let output: StringArray = concat_elements_dyn(&left, &right) + .unwrap() + .into_data() + .into(); + let expected = StringArray::from(vec![None, Some("baryyy"), None]); + assert_eq!(output, expected); + + // test for LargeStringArray + let left = LargeStringArray::from(vec![Some("foo"), Some("bar"), None]); + let right = LargeStringArray::from(vec![None, Some("yyy"), Some("zzz")]); + + let output: LargeStringArray = concat_elements_dyn(&left, &right) + .unwrap() + .into_data() + .into(); + let expected = LargeStringArray::from(vec![None, Some("baryyy"), None]); + assert_eq!(output, expected); + + // test for BinaryArray + let left = BinaryArray::from_opt_vec(vec![Some(b"foo"), Some(b"bar"), None]); + let right = BinaryArray::from_opt_vec(vec![None, Some(b"yyy"), Some(b"zzz")]); + let output: BinaryArray = concat_elements_dyn(&left, &right) + .unwrap() + .into_data() + .into(); + let expected = BinaryArray::from_opt_vec(vec![None, Some(b"baryyy"), None]); + assert_eq!(output, expected); + + // test for LargeBinaryArray + let left = LargeBinaryArray::from_opt_vec(vec![Some(b"foo"), Some(b"bar"), None]); + let right = + LargeBinaryArray::from_opt_vec(vec![None, Some(b"yyy"), Some(b"zzz")]); + let output: LargeBinaryArray = concat_elements_dyn(&left, &right) + .unwrap() + .into_data() + .into(); + let expected = LargeBinaryArray::from_opt_vec(vec![None, Some(b"baryyy"), None]); + assert_eq!(output, expected); + } + + #[test] + fn test_concat_dyn_different_type() { + let left = StringArray::from(vec![Some("foo"), Some("bar"), None]); + let right = LargeStringArray::from(vec![None, Some("1"), Some("2")]); + + let output = concat_elements_dyn(&left, &right); + assert_eq!( + output.unwrap_err().to_string(), + "Compute error: Cannot concat arrays of different types: Utf8 != LargeUtf8" + .to_string() + ); + } } From b9fcd7fa2154f848823521a11aeeba4e687025b8 Mon Sep 17 00:00:00 2001 From: Max Burke Date: Fri, 3 Mar 2023 03:32:57 -0800 Subject: [PATCH 0647/1411] Preallocate buffers for FixedSizeBinary array creation (#3793) * Preallocate buffers for FixedSizeBinary array creation * fix build * code review fixup --- .../src/array/fixed_size_binary_array.rs | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index e927c8d8ae58..87f1b955723d 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -141,8 +141,11 @@ impl FixedSizeBinaryArray { let mut len = 0; let mut size = None; let mut byte = 0; - let mut null_buf = MutableBuffer::from_len_zeroed(0); - let mut buffer = MutableBuffer::from_len_zeroed(0); + + let iter_size_hint = iter.size_hint().0; + let mut null_buf = 
MutableBuffer::new(bit_util::ceil(iter_size_hint, 8)); + let mut buffer = MutableBuffer::new(0); + let mut prepend = 0; iter.try_for_each(|item| -> Result<(), ArrowError> { // extend null bitmask by one byte per each 8 items @@ -163,7 +166,12 @@ impl FixedSizeBinaryArray { ))); } } else { - size = Some(slice.len()); + let len = slice.len(); + size = Some(len); + // Now that we know how large each element is we can reserve + // sufficient capacity in the underlying mutable buffer for + // the data. + buffer.reserve(iter_size_hint * len); buffer.extend_zeros(slice.len() * prepend); } bit_util::set_bit(null_buf.as_slice_mut(), len); @@ -234,8 +242,10 @@ impl FixedSizeBinaryArray { { let mut len = 0; let mut byte = 0; - let mut null_buf = MutableBuffer::from_len_zeroed(0); - let mut buffer = MutableBuffer::from_len_zeroed(0); + + let iter_size_hint = iter.size_hint().0; + let mut null_buf = MutableBuffer::new(bit_util::ceil(iter_size_hint, 8)); + let mut buffer = MutableBuffer::new(iter_size_hint * (size as usize)); iter.try_for_each(|item| -> Result<(), ArrowError> { // extend null bitmask by one byte per each 8 items @@ -304,7 +314,9 @@ impl FixedSizeBinaryArray { { let mut len = 0; let mut size = None; - let mut buffer = MutableBuffer::from_len_zeroed(0); + let iter_size_hint = iter.size_hint().0; + let mut buffer = MutableBuffer::new(0); + iter.try_for_each(|item| -> Result<(), ArrowError> { let slice = item.as_ref(); if let Some(size) = size { @@ -316,8 +328,11 @@ impl FixedSizeBinaryArray { ))); } } else { - size = Some(slice.len()); + let len = slice.len(); + size = Some(len); + buffer.reserve(iter_size_hint * len); } + buffer.extend_from_slice(slice); len += 1; From 40e2874e1d83dd8dc64981b7f4a19f894befe615 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Fri, 3 Mar 2023 12:38:34 +0100 Subject: [PATCH 0648/1411] refactor: assorted `FlightSqlServiceClient` improvements (#3788) * refactor: assorted `FlightSqlServiceClient` improvements - **TLS config:** Do NOT alter existing method signatures if the TLS feature is enabled. Features should be purely additive in Rust. Instead use a new method to pass TLS configs. The config is now passed as `ClientTlsConfig` to allow more flexibility, e.g. just to use TLS w/o any client certs. - **token handlng:** Allow the token to be passed in from an external source. The [auth spec] is super flexibility ("application-defined") and we cannot derive a way to determine the token in all cases. The current handshake-based mechanism is OK though. Also make sure the token is used in all relevant methods. - **headers:** Allow users to pass in additional headers. This is helpful for certain applications. [auth spec]: https://arrow.apache.org/docs/format/Flight.html#authentication * refactor: simplify flight SQL client construction Just accept a channel and let the caller set it up to their liking. Simplify example as well so that we no longer do totally different things under different features (since features shall be additive). Instead use a single example. 
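A rough sketch of the resulting usage (the address, header name, and credentials below are placeholders): the caller builds the channel, optionally with TLS, and hands it to the client:

    use arrow_flight::sql::client::FlightSqlServiceClient;
    use tonic::transport::{Channel, ClientTlsConfig};

    async fn connect() -> Result<(), Box<dyn std::error::Error>> {
        // Assumes the `tls` feature is enabled; plain `connect` works without it.
        let channel = Channel::from_static("https://localhost:50051")
            .tls_config(ClientTlsConfig::new().domain_name("localhost"))?
            .connect()
            .await?;
        let mut client = FlightSqlServiceClient::new(channel);
        client.set_header("x-tenant", "demo"); // extra headers travel with every request
        let token = client.handshake("admin", "password").await?; // or client.set_token(..)
        println!("authenticated, token: {token:?}");
        Ok(())
    }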
--- .github/workflows/arrow_flight.yml | 3 - arrow-flight/Cargo.toml | 2 +- arrow-flight/examples/flight_sql_server.rs | 85 +++++++------ arrow-flight/src/sql/client.rs | 134 +++++++++------------ 4 files changed, 99 insertions(+), 125 deletions(-) diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index 02c149aaae0b..7facf17197fc 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -60,9 +60,6 @@ jobs: run: | cargo test -p arrow-flight --all-features - name: Test --examples - run: | - cargo test -p arrow-flight --features=flight-sql-experimental --examples - - name: Test --examples with TLS run: | cargo test -p arrow-flight --features=flight-sql-experimental,tls --examples - name: Verify workspace clean diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 61959143e924..fd77a814ab88 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -64,4 +64,4 @@ tonic-build = { version = "=0.8.4", default-features = false, features = ["trans [[example]] name = "flight_sql_server" -required-features = ["flight-sql-experimental"] +required-features = ["flight-sql-experimental", "tls"] diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 28aef4e921a7..425ceab42779 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -31,7 +31,6 @@ use prost::Message; use std::pin::Pin; use std::sync::Arc; use tonic::transport::Server; -#[cfg(feature = "tls")] use tonic::transport::{Certificate, Identity, ServerTlsConfig}; use tonic::{Request, Response, Status, Streaming}; @@ -451,7 +450,6 @@ impl FlightSqlService for FlightSqlServiceImpl { /// This example shows how to run a FlightSql server #[tokio::main] -#[cfg(not(feature = "tls"))] async fn main() -> Result<(), Box> { let addr = "0.0.0.0:50051".parse()?; @@ -459,34 +457,24 @@ async fn main() -> Result<(), Box> { println!("Listening on {:?}", addr); - Server::builder().add_service(svc).serve(addr).await?; + if std::env::var("USE_TLS").ok().is_some() { + let cert = std::fs::read_to_string("arrow-flight/examples/data/server.pem")?; + let key = std::fs::read_to_string("arrow-flight/examples/data/server.key")?; + let client_ca = + std::fs::read_to_string("arrow-flight/examples/data/client_ca.pem")?; - Ok(()) -} - -/// This example shows how to run a HTTPs FlightSql server -#[tokio::main] -#[cfg(feature = "tls")] -async fn main() -> Result<(), Box> { - let addr = "0.0.0.0:50051".parse()?; - - let svc = FlightServiceServer::new(FlightSqlServiceImpl {}); - - println!("Listening on {addr:?}"); - - let cert = std::fs::read_to_string("arrow-flight/examples/data/server.pem")?; - let key = std::fs::read_to_string("arrow-flight/examples/data/server.key")?; - let client_ca = std::fs::read_to_string("arrow-flight/examples/data/client_ca.pem")?; - - let tls_config = ServerTlsConfig::new() - .identity(Identity::from_pem(&cert, &key)) - .client_ca_root(Certificate::from_pem(&client_ca)); + let tls_config = ServerTlsConfig::new() + .identity(Identity::from_pem(&cert, &key)) + .client_ca_root(Certificate::from_pem(&client_ca)); - Server::builder() - .tls_config(tls_config)? - .add_service(svc) - .serve(addr) - .await?; + Server::builder() + .tls_config(tls_config)? 
+ .add_service(svc) + .serve(addr) + .await?; + } else { + Server::builder().add_service(svc).serve(addr).await?; + } Ok(()) } @@ -523,8 +511,6 @@ mod tests { use tokio_stream::wrappers::UnixListenerStream; use tonic::body::BoxBody; use tonic::codegen::{http, Body, Service}; - - #[cfg(feature = "tls")] use tonic::transport::ClientTlsConfig; use arrow::util::pretty::pretty_format_batches; @@ -533,10 +519,9 @@ mod tests { use tonic::transport::{Certificate, Channel, Endpoint}; use tower::{service_fn, ServiceExt}; - #[cfg(not(feature = "tls"))] async fn client_with_uds(path: String) -> FlightSqlServiceClient { let connector = service_fn(move |_| UnixStream::connect(path.clone())); - let channel = Endpoint::try_from("https://example.com") + let channel = Endpoint::try_from("http://example.com") .unwrap() .connect_with_connector(connector) .await @@ -544,7 +529,6 @@ mod tests { FlightSqlServiceClient::new(channel) } - #[cfg(feature = "tls")] async fn create_https_server() -> Result<(), tonic::transport::Error> { let cert = std::fs::read_to_string("examples/data/server.pem").unwrap(); let key = std::fs::read_to_string("examples/data/server.key").unwrap(); @@ -567,7 +551,6 @@ mod tests { } #[tokio::test] - #[cfg(feature = "tls")] async fn test_select_https() { tokio::spawn(async { create_https_server().await.unwrap(); @@ -580,15 +563,16 @@ mod tests { let key = std::fs::read_to_string("examples/data/client1.key").unwrap(); let server_ca = std::fs::read_to_string("examples/data/ca.pem").unwrap(); - let mut client = FlightSqlServiceClient::new_with_endpoint( - Identity::from_pem(cert, key), - Certificate::from_pem(&server_ca), - "localhost", - "127.0.0.1", - 50051, - ) - .await - .unwrap(); + let tls_config = ClientTlsConfig::new() + .domain_name("localhost") + .ca_certificate(Certificate::from_pem(&server_ca)) + .identity(Identity::from_pem(cert, key)); + let endpoint = endpoint(String::from("https://127.0.0.1:50051")) + .unwrap() + .tls_config(tls_config) + .unwrap(); + let channel = endpoint.connect().await.unwrap(); + let mut client = FlightSqlServiceClient::new(channel); let token = client.handshake("admin", "password").await.unwrap(); println!("Auth succeeded with token: {:?}", token); let mut stmt = client.prepare("select 1;".to_string()).await.unwrap(); @@ -615,7 +599,6 @@ mod tests { } #[tokio::test] - #[cfg(not(feature = "tls"))] async fn test_select_1() { let file = NamedTempFile::new().unwrap(); let path = file.into_temp_path().to_str().unwrap().to_string(); @@ -657,4 +640,18 @@ mod tests { _ = request_future => println!("Client finished!"), } } + + fn endpoint(addr: String) -> Result { + let endpoint = Endpoint::new(addr) + .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? 
+ .connect_timeout(Duration::from_secs(20)) + .timeout(Duration::from_secs(20)) + .tcp_nodelay(true) // Disable Nagle's Algorithm since we don't want packets to wait + .tcp_keepalive(Option::Some(Duration::from_secs(3600))) + .http2_keep_alive_interval(Duration::from_secs(300)) + .keep_alive_timeout(Duration::from_secs(20)) + .keep_alive_while_idle(true); + + Ok(endpoint) + } } diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index 31ba1e274f88..a61f06d32922 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -19,7 +19,8 @@ use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::Bytes; use std::collections::HashMap; -use std::time::Duration; +use std::str::FromStr; +use tonic::metadata::AsciiMetadataKey; use crate::flight_service_client::FlightServiceClient; use crate::sql::server::{CLOSE_PREPARED_STATEMENT, CREATE_PREPARED_STATEMENT}; @@ -44,16 +45,15 @@ use arrow_ipc::{root_as_message, MessageHeader}; use arrow_schema::{ArrowError, Schema, SchemaRef}; use futures::{stream, TryStreamExt}; use prost::Message; -#[cfg(feature = "tls")] -use tonic::transport::{Certificate, ClientTlsConfig, Identity}; -use tonic::transport::{Channel, Endpoint}; -use tonic::Streaming; +use tonic::transport::Channel; +use tonic::{IntoRequest, Streaming}; /// A FlightSQLServiceClient is an endpoint for retrieving or storing Arrow data /// by FlightSQL protocol. #[derive(Debug, Clone)] pub struct FlightSqlServiceClient { token: Option, + headers: HashMap, flight_client: FlightServiceClient, } @@ -61,68 +61,13 @@ pub struct FlightSqlServiceClient { /// This client is in the "experimental" stage. It is not guaranteed to follow the spec in all instances. /// Github issues are welcomed. impl FlightSqlServiceClient { - /// Creates a new FlightSql Client that connects via TCP to a server - #[cfg(not(feature = "tls"))] - pub async fn new_with_endpoint(host: &str, port: u16) -> Result { - let addr = format!("http://{}:{}", host, port); - let endpoint = Endpoint::new(addr) - .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? - .connect_timeout(Duration::from_secs(20)) - .timeout(Duration::from_secs(20)) - .tcp_nodelay(true) // Disable Nagle's Algorithm since we don't want packets to wait - .tcp_keepalive(Option::Some(Duration::from_secs(3600))) - .http2_keep_alive_interval(Duration::from_secs(300)) - .keep_alive_timeout(Duration::from_secs(20)) - .keep_alive_while_idle(true); - - let channel = endpoint.connect().await.map_err(|e| { - ArrowError::IoError(format!("Cannot connect to endpoint: {}", e)) - })?; - Ok(Self::new(channel)) - } - - /// Creates a new HTTPs FlightSql Client that connects via TCP to a server - #[cfg(feature = "tls")] - pub async fn new_with_endpoint( - client_ident: Identity, - server_ca: Certificate, - domain: &str, - host: &str, - port: u16, - ) -> Result { - let addr = format!("https://{host}:{port}"); - - let endpoint = Endpoint::new(addr) - .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? 
- .connect_timeout(Duration::from_secs(20)) - .timeout(Duration::from_secs(20)) - .tcp_nodelay(true) // Disable Nagle's Algorithm since we don't want packets to wait - .tcp_keepalive(Option::Some(Duration::from_secs(3600))) - .http2_keep_alive_interval(Duration::from_secs(300)) - .keep_alive_timeout(Duration::from_secs(20)) - .keep_alive_while_idle(true); - - let tls_config = ClientTlsConfig::new() - .domain_name(domain) - .ca_certificate(server_ca) - .identity(client_ident); - - let endpoint = endpoint - .tls_config(tls_config) - .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))?; - - let channel = endpoint.connect().await.map_err(|e| { - ArrowError::IoError(format!("Cannot connect to endpoint: {e}")) - })?; - Ok(Self::new(channel)) - } - /// Creates a new FlightSql client that connects to a server over an arbitrary tonic `Channel` pub fn new(channel: Channel) -> Self { let flight_client = FlightServiceClient::new(channel); FlightSqlServiceClient { token: None, flight_client, + headers: HashMap::default(), } } @@ -141,14 +86,27 @@ impl FlightSqlServiceClient { self.flight_client } + /// Set auth token to the given value. + pub fn set_token(&mut self, token: String) { + self.token = Some(token); + } + + /// Set header value. + pub fn set_header(&mut self, key: impl Into, value: impl Into) { + let key: String = key.into(); + let value: String = value.into(); + self.headers.insert(key, value); + } + async fn get_flight_info_for_command( &mut self, cmd: M, ) -> Result { let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); + let req = self.set_request_headers(descriptor.into_request())?; let fi = self .flight_client - .get_flight_info(descriptor) + .get_flight_info(req) .await .map_err(status_to_arrow_error)? .into_inner(); @@ -178,6 +136,7 @@ impl FlightSqlServiceClient { .parse() .map_err(|_| ArrowError::ParseError("Cannot parse header".to_string()))?; req.metadata_mut().insert("authorization", val); + let req = self.set_request_headers(req)?; let resp = self .flight_client .handshake(req) @@ -199,25 +158,29 @@ impl FlightSqlServiceClient { ArrowError::ParseError("Can't collect responses".to_string()) })?; let resp = match responses.as_slice() { - [resp] => resp, - [] => Err(ArrowError::ParseError("No handshake response".to_string()))?, + [resp] => resp.payload.clone(), + [] => Bytes::new(), _ => Err(ArrowError::ParseError( "Multiple handshake responses".to_string(), ))?, }; - Ok(resp.payload.clone()) + Ok(resp) } /// Execute a update query on the server, and return the number of records affected pub async fn execute_update(&mut self, query: String) -> Result { let cmd = CommandStatementUpdate { query }; let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); - let mut result = self - .flight_client - .do_put(stream::iter(vec![FlightData { + let req = self.set_request_headers( + stream::iter(vec![FlightData { flight_descriptor: Some(descriptor), ..Default::default() - }])) + }]) + .into_request(), + )?; + let mut result = self + .flight_client + .do_put(req) .await .map_err(status_to_arrow_error)? .into_inner(); @@ -251,9 +214,10 @@ impl FlightSqlServiceClient { &mut self, ticket: Ticket, ) -> Result, ArrowError> { + let req = self.set_request_headers(ticket.into_request())?; Ok(self .flight_client - .do_get(ticket) + .do_get(req) .await .map_err(status_to_arrow_error)? 
.into_inner()) @@ -329,13 +293,7 @@ impl FlightSqlServiceClient { r#type: CREATE_PREPARED_STATEMENT.to_string(), body: cmd.as_any().encode_to_vec().into(), }; - let mut req = tonic::Request::new(action); - if let Some(token) = &self.token { - let val = format!("Bearer {token}").parse().map_err(|_| { - ArrowError::IoError("Statement already closed.".to_string()) - })?; - req.metadata_mut().insert("authorization", val); - } + let req = self.set_request_headers(action.into_request())?; let mut result = self .flight_client .do_action(req) @@ -369,6 +327,28 @@ impl FlightSqlServiceClient { pub async fn close(&mut self) -> Result<(), ArrowError> { Ok(()) } + + fn set_request_headers( + &self, + mut req: tonic::Request, + ) -> Result, ArrowError> { + for (k, v) in &self.headers { + let k = AsciiMetadataKey::from_str(k.as_str()).map_err(|e| { + ArrowError::IoError(format!("Cannot convert header key \"{k}\": {e}")) + })?; + let v = v.parse().map_err(|e| { + ArrowError::IoError(format!("Cannot convert header value \"{v}\": {e}")) + })?; + req.metadata_mut().insert(k, v); + } + if let Some(token) = &self.token { + let val = format!("Bearer {token}").parse().map_err(|e| { + ArrowError::IoError(format!("Cannot convert token to header value: {e}")) + })?; + req.metadata_mut().insert("authorization", val); + } + Ok(req) + } } /// A PreparedStatement From e52574c8b02410128f35d65aa92d4876b1404d6c Mon Sep 17 00:00:00 2001 From: Xinyu Zeng Date: Fri, 3 Mar 2023 19:39:26 +0800 Subject: [PATCH 0649/1411] minor: make Parquet CLI input args consistent (#3786) * make input args consistent * positional arg for filename * make all mandatory args positional * pyspark integration * fix --- parquet/pytest/test_parquet_integration.py | 6 ++---- parquet/src/bin/parquet-read.rs | 2 +- parquet/src/bin/parquet-rowcount.rs | 2 -- parquet/src/bin/parquet-schema.rs | 2 +- parquet/src/bin/parquet-show-bloom-filter.rs | 12 +++--------- 5 files changed, 7 insertions(+), 17 deletions(-) diff --git a/parquet/pytest/test_parquet_integration.py b/parquet/pytest/test_parquet_integration.py index 268caa8fab06..e0846d4e779f 100755 --- a/parquet/pytest/test_parquet_integration.py +++ b/parquet/pytest/test_parquet_integration.py @@ -68,15 +68,13 @@ def get_show_filter_cli_output(output_dir, data, col_name="id"): (parquet_file,) = sorted(pathlib.Path(output_dir).glob("*.parquet")) args = [ "parquet-show-bloom-filter", - "--file-name", parquet_file, - "--column", col_name, ] for v in data: - args.extend(["--values", v[0]]) + args.extend([v[0]]) for v in data: - args.extend(["--values", v[1]]) + args.extend([v[1]]) return subprocess.check_output(args) diff --git a/parquet/src/bin/parquet-read.rs b/parquet/src/bin/parquet-read.rs index c1e08387a550..a8a835ab870d 100644 --- a/parquet/src/bin/parquet-read.rs +++ b/parquet/src/bin/parquet-read.rs @@ -45,7 +45,7 @@ use std::{fs::File, path::Path}; #[derive(Debug, Parser)] #[clap(author, version, about("Binary file to read data from a Parquet file"), long_about = None)] struct Args { - #[clap(short, long, help("Path to a parquet file, or - for stdin"))] + #[clap(help("Path to a parquet file, or - for stdin"))] file_name: String, #[clap( short, diff --git a/parquet/src/bin/parquet-rowcount.rs b/parquet/src/bin/parquet-rowcount.rs index 45eb1c9a476f..55c76c5f73e4 100644 --- a/parquet/src/bin/parquet-rowcount.rs +++ b/parquet/src/bin/parquet-rowcount.rs @@ -44,8 +44,6 @@ use std::{fs::File, path::Path}; #[clap(author, version, about("Binary file to return the number of rows found from Parquet 
file(s)"), long_about = None)] struct Args { #[clap( - short, - long, number_of_values(1), help("List of Parquet files to read from separated by space") )] diff --git a/parquet/src/bin/parquet-schema.rs b/parquet/src/bin/parquet-schema.rs index ae79fe4296c3..bfcb77d67b2e 100644 --- a/parquet/src/bin/parquet-schema.rs +++ b/parquet/src/bin/parquet-schema.rs @@ -46,7 +46,7 @@ use std::{fs::File, path::Path}; #[derive(Debug, Parser)] #[clap(author, version, about("Binary file to print the schema and metadata of a Parquet file"), long_about = None)] struct Args { - #[clap(short, long)] + #[clap(help("Path to the parquet file"))] file_path: String, #[clap(short, long, help("Enable printing full file metadata"))] verbose: bool, diff --git a/parquet/src/bin/parquet-show-bloom-filter.rs b/parquet/src/bin/parquet-show-bloom-filter.rs index 77e29c6fb282..80db51978433 100644 --- a/parquet/src/bin/parquet-show-bloom-filter.rs +++ b/parquet/src/bin/parquet-show-bloom-filter.rs @@ -25,7 +25,7 @@ //! ``` //! After this `parquet-show-bloom-filter` should be available: //! ``` -//! parquet-show-bloom-filter --file-name XYZ.parquet --column id --values a +//! parquet-show-bloom-filter XYZ.parquet id a //! ``` //! //! The binary can also be built from the source code and run as follows: @@ -44,17 +44,11 @@ use std::{fs::File, path::Path}; #[derive(Debug, Parser)] #[clap(author, version, about("Binary file to read bloom filter data from a Parquet file"), long_about = None)] struct Args { - #[clap(short, long, help("Path to the parquet file"))] + #[clap(help("Path to the parquet file"))] file_name: String, - #[clap( - short, - long, - help("Check the bloom filter indexes for the given column") - )] + #[clap(help("Check the bloom filter indexes for the given column"))] column: String, #[clap( - short, - long, help("Check if the given values match bloom filter, the values will be evaluated as strings"), required = true )] From 72474a674270685d6ea2d631760da4cd19dfeeea Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 4 Mar 2023 00:03:23 -0800 Subject: [PATCH 0650/1411] Add `into_primitive_dict_builder` to `DictionaryArray` (#3715) * Add into_primitive_dict_builder * For review --- arrow-array/src/array/dictionary_array.rs | 97 ++++++++++++++++++- .../builder/primitive_dictionary_builder.rs | 38 +++++++- 2 files changed, 132 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 60426e5b3c4d..22e99a44c326 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::builder::StringDictionaryBuilder; +use crate::builder::{PrimitiveDictionaryBuilder, StringDictionaryBuilder}; +use crate::cast::as_primitive_array; use crate::iterator::ArrayIter; use crate::types::*; use crate::{ @@ -394,6 +395,44 @@ impl DictionaryArray { // Offsets were valid before and verified length is greater than or equal Self::from(unsafe { builder.build_unchecked() }) } + + /// Returns `PrimitiveDictionaryBuilder` of this dictionary array for mutating + /// its keys and values if the underlying data buffer is not shared by others. 
+ pub fn into_primitive_dict_builder( + self, + ) -> Result, Self> + where + V: ArrowPrimitiveType, + { + if !self.value_type().is_primitive() { + return Err(self); + } + + let key_array = as_primitive_array::(self.keys()).clone(); + let value_array = as_primitive_array::(self.values()).clone(); + + drop(self.data); + drop(self.keys); + drop(self.values); + + let key_builder = key_array.into_builder(); + let value_builder = value_array.into_builder(); + + match (key_builder, value_builder) { + (Ok(key_builder), Ok(value_builder)) => Ok(unsafe { + PrimitiveDictionaryBuilder::new_from_builders(key_builder, value_builder) + }), + (Err(key_array), Ok(mut value_builder)) => { + Err(Self::try_new(&key_array, &value_builder.finish()).unwrap()) + } + (Ok(mut key_builder), Err(value_array)) => { + Err(Self::try_new(&key_builder.finish(), &value_array).unwrap()) + } + (Err(key_array), Err(value_array)) => { + Err(Self::try_new(&key_array, &value_array).unwrap()) + } + } + } } /// Constructs a `DictionaryArray` from an array data reference. @@ -644,11 +683,13 @@ where mod tests { use super::*; use crate::builder::PrimitiveDictionaryBuilder; + use crate::cast::as_dictionary_array; use crate::types::{ Float32Type, Int16Type, Int32Type, Int8Type, UInt32Type, UInt8Type, }; use crate::{Float32Array, Int16Array, Int32Array, Int8Array}; use arrow_buffer::{Buffer, ToByteSlice}; + use std::sync::Arc; #[test] fn test_dictionary_array() { @@ -930,4 +971,58 @@ mod tests { let a = DictionaryArray::::from_iter(["32"]); let _ = DictionaryArray::::from(a.into_data()); } + + #[test] + fn test_into_primitive_dict_builder() { + let values = Int32Array::from_iter_values([10_i32, 12, 15]); + let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]); + + let dict_array = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let boxed: ArrayRef = Arc::new(dict_array); + let col: DictionaryArray = as_dictionary_array(&boxed).clone(); + + drop(boxed); + drop(keys); + drop(values); + + let mut builder = col.into_primitive_dict_builder::().unwrap(); + + let slice = builder.values_slice_mut(); + assert_eq!(slice, &[10, 12, 15]); + + slice[0] = 4; + slice[1] = 2; + slice[2] = 1; + + let values = Int32Array::from_iter_values([4_i32, 2, 1]); + let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]); + + let expected = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let new_array = builder.finish(); + assert_eq!(expected, new_array); + } + + #[test] + fn test_into_primitive_dict_builder_cloned_array() { + let values = Int32Array::from_iter_values([10_i32, 12, 15]); + let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]); + + let dict_array = DictionaryArray::::try_new(&keys, &values).unwrap(); + + let boxed: ArrayRef = Arc::new(dict_array); + + let col: DictionaryArray = + DictionaryArray::::from(boxed.data().clone()); + let err = col.into_primitive_dict_builder::(); + + let returned = err.unwrap_err(); + + let values = Int32Array::from_iter_values([10_i32, 12, 15]); + let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]); + + let expected = DictionaryArray::::try_new(&keys, &values).unwrap(); + assert_eq!(expected, returned); + } } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 742c09d8cc26..9f410994114f 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -118,7 +118,7 @@ where /// # Panics /// /// This method panics if `keys_builder` or 
`values_builder` is not empty. - pub fn new_from_builders( + pub fn new_from_empty_builders( keys_builder: PrimitiveBuilder, values_builder: PrimitiveBuilder, ) -> Self { @@ -133,6 +133,30 @@ where } } + /// Creates a new `PrimitiveDictionaryBuilder` from existing `PrimitiveBuilder`s of keys and values. + /// + /// # Safety + /// + /// caller must ensure that the passed in builders are valid for DictionaryArray. + pub unsafe fn new_from_builders( + keys_builder: PrimitiveBuilder, + values_builder: PrimitiveBuilder, + ) -> Self { + let keys = keys_builder.values_slice(); + let values = values_builder.values_slice(); + let mut map = HashMap::with_capacity(values.len()); + + keys.iter().zip(values.iter()).for_each(|(key, value)| { + map.insert(Value(*value), K::Native::to_usize(*key).unwrap()); + }); + + Self { + keys_builder, + values_builder, + map, + } + } + /// Creates a new `PrimitiveDictionaryBuilder` with the provided capacities /// /// `keys_capacity`: the number of keys, i.e. length of array to build @@ -276,6 +300,16 @@ where DictionaryArray::from(unsafe { builder.build_unchecked() }) } + + /// Returns the current dictionary values buffer as a slice + pub fn values_slice(&self) -> &[V::Native] { + self.values_builder.values_slice() + } + + /// Returns the current dictionary values buffer as a mutable slice + pub fn values_slice_mut(&mut self) -> &mut [V::Native] { + self.values_builder.values_slice_mut() + } } impl Extend> @@ -357,7 +391,7 @@ mod tests { let values_builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); let mut builder = - PrimitiveDictionaryBuilder::::new_from_builders( + PrimitiveDictionaryBuilder::::new_from_empty_builders( keys_builder, values_builder, ); From 7fdd0d8b1afe051c07cfdfb12c3d52a6d93e92b6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 4 Mar 2023 10:13:00 +0000 Subject: [PATCH 0651/1411] Add concat_elements_bytes (#3798) --- arrow-string/src/concat_elements.rs | 109 ++++++++++------------------ 1 file changed, 40 insertions(+), 69 deletions(-) diff --git a/arrow-string/src/concat_elements.rs b/arrow-string/src/concat_elements.rs index 4aa5a127c920..1f85b4deb549 100644 --- a/arrow-string/src/concat_elements.rs +++ b/arrow-string/src/concat_elements.rs @@ -18,29 +18,18 @@ use std::sync::Arc; use arrow_array::builder::BufferBuilder; +use arrow_array::types::ByteArrayType; use arrow_array::*; +use arrow_buffer::ArrowNativeType; use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayDataBuilder; use arrow_schema::{ArrowError, DataType}; -/// Returns the elementwise concatenation of a [`StringArray`]. -/// -/// An index of the resulting [`StringArray`] is null if any of -/// `StringArray` are null at that location. -/// -/// ```text -/// e.g: -/// -/// ["Hello"] + ["World"] = ["HelloWorld"] -/// -/// ["a", "b"] + [None, "c"] = [None, "bc"] -/// ``` -/// -/// An error will be returned if `left` and `right` have different lengths -pub fn concat_elements_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result, ArrowError> { +/// Returns the elementwise concatenation of a [`GenericByteArray`]. 
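// Illustrative sketch (not part of the patch, module path assumed from this crate):
// both the utf8 and binary entry points now delegate to this generic kernel, and
// element-wise nulls propagate to the output.
use arrow_array::BinaryArray;
use arrow_string::concat_elements::concat_element_binary;

fn main() {
    let a = BinaryArray::from_opt_vec(vec![Some(&b"ab"[..]), None]);
    let b = BinaryArray::from_opt_vec(vec![Some(&b"cd"[..]), Some(&b"ef"[..])]);
    let out = concat_element_binary(&a, &b).unwrap();
    assert_eq!(out, BinaryArray::from_opt_vec(vec![Some(&b"abcd"[..]), None]));
}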
+pub fn concat_elements_bytes( + left: &GenericByteArray, + right: &GenericByteArray, +) -> Result, ArrowError> { if left.len() != right.len() { return Err(ArrowError::ComputeError(format!( "Arrays must have the same length: {} != {}", @@ -63,18 +52,18 @@ pub fn concat_elements_utf8( - right_offsets[0].as_usize(), ); - let mut output_offsets = BufferBuilder::::new(left_offsets.len()); - output_offsets.append(Offset::zero()); + let mut output_offsets = BufferBuilder::::new(left_offsets.len()); + output_offsets.append(T::Offset::usize_as(0)); for (left_idx, right_idx) in left_offsets.windows(2).zip(right_offsets.windows(2)) { output_values .append_slice(&left_values[left_idx[0].as_usize()..left_idx[1].as_usize()]); output_values.append_slice( &right_values[right_idx[0].as_usize()..right_idx[1].as_usize()], ); - output_offsets.append(Offset::from_usize(output_values.len()).unwrap()); + output_offsets.append(T::Offset::from_usize(output_values.len()).unwrap()); } - let builder = ArrayDataBuilder::new(GenericStringArray::::DATA_TYPE) + let builder = ArrayDataBuilder::new(T::DATA_TYPE) .len(left.len()) .add_buffer(output_offsets.finish()) .add_buffer(output_values.finish()) @@ -84,6 +73,35 @@ pub fn concat_elements_utf8( Ok(unsafe { builder.build_unchecked() }.into()) } +/// Returns the elementwise concatenation of a [`GenericStringArray`]. +/// +/// An index of the resulting [`GenericStringArray`] is null if any of +/// `StringArray` are null at that location. +/// +/// ```text +/// e.g: +/// +/// ["Hello"] + ["World"] = ["HelloWorld"] +/// +/// ["a", "b"] + [None, "c"] = [None, "bc"] +/// ``` +/// +/// An error will be returned if `left` and `right` have different lengths +pub fn concat_elements_utf8( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result, ArrowError> { + concat_elements_bytes(left, right) +} + +/// Returns the elementwise concatenation of a [`GenericBinaryArray`]. +pub fn concat_element_binary( + left: &GenericBinaryArray, + right: &GenericBinaryArray, +) -> Result, ArrowError> { + concat_elements_bytes(left, right) +} + /// Returns the elementwise concatenation of [`StringArray`]. 
/// ```text /// e.g: @@ -158,53 +176,6 @@ pub fn concat_elements_utf8_many( Ok(unsafe { builder.build_unchecked() }.into()) } -pub fn concat_element_binary( - left: &GenericBinaryArray, - right: &GenericBinaryArray, -) -> Result, ArrowError> { - if left.len() != right.len() { - return Err(ArrowError::ComputeError(format!( - "Arrays must have the same length: {} != {}", - left.len(), - right.len() - ))); - } - - let output_bitmap = combine_option_bitmap(&[left.data(), right.data()], left.len()); - - let left_offsets = left.value_offsets(); - let right_offsets = right.value_offsets(); - - let left_values = left.value_data(); - let right_values = right.value_data(); - - let mut output_values = BufferBuilder::::new( - left_values.len() + right_values.len() - - left_offsets[0].as_usize() - - right_offsets[0].as_usize(), - ); - - let mut output_offsets = BufferBuilder::::new(left_offsets.len()); - output_offsets.append(Offset::zero()); - for (left_idx, right_idx) in left_offsets.windows(2).zip(right_offsets.windows(2)) { - output_values - .append_slice(&left_values[left_idx[0].as_usize()..left_idx[1].as_usize()]); - output_values.append_slice( - &right_values[right_idx[0].as_usize()..right_idx[1].as_usize()], - ); - output_offsets.append(Offset::from_usize(output_values.len()).unwrap()); - } - - let builder = ArrayDataBuilder::new(GenericBinaryArray::::DATA_TYPE) - .len(left.len()) - .add_buffer(output_offsets.finish()) - .add_buffer(output_values.finish()) - .null_bit_buffer(output_bitmap); - - // SAFETY - offsets valid by construction - Ok(unsafe { builder.build_unchecked() }.into()) -} - pub fn concat_elements_dyn( left: &dyn Array, right: &dyn Array, From 6cd09171f2f880b0e411e14e1ae1075eafab6ab5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 4 Mar 2023 11:13:50 +0000 Subject: [PATCH 0652/1411] Timezone aware timestamp parsing (#3794) (#3795) * Timezone aware timestamp parsing (#3794) * Add further test * Update arrow-cast/src/parse.rs Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- arrow-cast/src/parse.rs | 133 ++++++++++++++++++++++++++-------------- 1 file changed, 88 insertions(+), 45 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index f23e65b22845..a48dd2bac7d2 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -20,11 +20,7 @@ use arrow_array::ArrowPrimitiveType; use arrow_schema::ArrowError; use chrono::prelude::*; -/// Accepts a string in RFC3339 / ISO8601 standard format and some -/// variants and converts it to a nanosecond precision timestamp. -/// -/// Implements the `to_timestamp` function to convert a string to a -/// timestamp, following the model of spark SQL’s to_`timestamp`. +/// Accepts a string and parses it relative to the provided `timezone` /// /// In addition to RFC3339 / ISO8601 standard timestamps, it also /// accepts strings that use a space ` ` to separate the date and time @@ -38,36 +34,6 @@ use chrono::prelude::*; /// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset /// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds /// * `1997-01-31` # close to RCF3339, only date no time -// -/// Internally, this function uses the `chrono` library for the -/// datetime parsing -/// -/// We hope to extend this function in the future with a second -/// parameter to specifying the format string. 
-/// -/// ## Timestamp Precision -/// -/// Function uses the maximum precision timestamps supported by -/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This -/// means the range of dates that timestamps can represent is ~1677 AD -/// to 2262 AM -/// -/// -/// ## Timezone / Offset Handling -/// -/// Numerical values of timestamps are stored compared to offset UTC. -/// -/// This function interprets strings without an explicit time zone as -/// timestamps with offsets of the local time on the machine -/// -/// For example, `1997-01-31 09:26:56.123Z` is interpreted as UTC, as -/// it has an explicit timezone specifier (“Z” for Zulu/UTC) -/// -/// `1997-01-31T09:26:56.123` is interpreted as a local timestamp in -/// the timezone of the machine. For example, if -/// the system timezone is set to Americas/New_York (UTC-5) the -/// timestamp will be interpreted as though it were -/// `1997-01-31T09:26:56.123-05:00` /// /// Some formats that supported by PostgresSql /// still not supported by chrono, like @@ -76,12 +42,14 @@ use chrono::prelude::*; /// "2023-01-01 040506 +07:30:00", /// "2023-01-01 04:05:06.789 PST", /// "2023-01-01 04:05:06.789 -08", -#[inline] -pub fn string_to_timestamp_nanos(s: &str) -> Result { +pub fn string_to_datetime( + timezone: &T, + s: &str, +) -> Result, ArrowError> { // Fast path: RFC3339 timestamp (with a T) // Example: 2020-09-08T13:42:29.190855Z if let Ok(ts) = DateTime::parse_from_rfc3339(s) { - return Ok(ts.timestamp_nanos()); + return Ok(ts.with_timezone(timezone)); } // Implement quasi-RFC3339 support by trying to parse the @@ -96,14 +64,14 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { for f in supported_formats.iter() { if let Ok(ts) = DateTime::parse_from_str(s, f) { - return to_timestamp_nanos(ts.naive_utc()); + return Ok(ts.with_timezone(timezone)); } } // with an explicit Z, using ' ' as a separator // Example: 2020-09-08 13:42:29Z if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") { - return to_timestamp_nanos(ts.naive_utc()); + return Ok(ts.with_timezone(timezone)); } // Support timestamps without an explicit timezone offset, again @@ -112,34 +80,44 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { // without a timezone specifier as a local time, using T as a separator // Example: 2020-09-08T13:42:29.190855 if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f") { - return to_timestamp_nanos(ts); + if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { + return Ok(DateTime::from_local(ts, offset)); + } } // without a timezone specifier as a local time, using T as a // separator, no fractional seconds // Example: 2020-09-08T13:42:29 if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") { - return Ok(ts.timestamp_nanos()); + if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { + return Ok(DateTime::from_local(ts, offset)); + } } // without a timezone specifier as a local time, using ' ' as a separator // Example: 2020-09-08 13:42:29.190855 if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f") { - return to_timestamp_nanos(ts); + if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { + return Ok(DateTime::from_local(ts, offset)); + } } // without a timezone specifier as a local time, using ' ' as a // separator, no fractional seconds // Example: 2020-09-08 13:42:29 if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") { - return Ok(ts.timestamp_nanos()); + if let Some(offset) = 
timezone.offset_from_local_datetime(&ts).single() { + return Ok(DateTime::from_local(ts, offset)); + } } // without a timezone specifier as a local time, only date // Example: 2020-09-08 if let Ok(dt) = NaiveDate::parse_from_str(s, "%Y-%m-%d") { if let Some(ts) = dt.and_hms_opt(0, 0, 0) { - return Ok(ts.timestamp_nanos()); + if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { + return Ok(DateTime::from_local(ts, offset)); + } } } @@ -153,6 +131,42 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { ))) } +/// Accepts a string in RFC3339 / ISO8601 standard format and some +/// variants and converts it to a nanosecond precision timestamp. +/// +/// See [`string_to_datetime`] for the full set of supported formats +/// +/// Implements the `to_timestamp` function to convert a string to a +/// timestamp, following the model of spark SQL’s to_`timestamp`. +/// +/// Internally, this function uses the `chrono` library for the +/// datetime parsing +/// +/// We hope to extend this function in the future with a second +/// parameter to specifying the format string. +/// +/// ## Timestamp Precision +/// +/// Function uses the maximum precision timestamps supported by +/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This +/// means the range of dates that timestamps can represent is ~1677 AD +/// to 2262 AM +/// +/// ## Timezone / Offset Handling +/// +/// Numerical values of timestamps are stored compared to offset UTC. +/// +/// This function interprets string without an explicit time zone as timestamps +/// relative to UTC, see [`string_to_datetime`] for alternative semantics +/// +/// For example, both `1997-01-31 09:26:56.123Z`, `1997-01-31T09:26:56.123`, +/// and `1997-01-31T14:26:56.123-05:00` will be parsed as the same value +/// +#[inline] +pub fn string_to_timestamp_nanos(s: &str) -> Result { + to_timestamp_nanos(string_to_datetime(&Utc, s)?.naive_utc()) +} + /// Defensive check to prevent chrono-rs panics when nanosecond conversion happens on non-supported dates #[inline] fn to_timestamp_nanos(dt: NaiveDateTime) -> Result { @@ -448,6 +462,7 @@ impl Parser for Date64Type { #[cfg(test)] mod tests { use super::*; + use arrow_array::timezone::Tz; #[test] fn string_to_timestamp_timezone() { @@ -614,6 +629,34 @@ mod tests { naive_datetime.timestamp_nanos(), parse_timestamp("2020-09-08 13:42:29").unwrap() ); + + let tz: Tz = "+02:00".parse().unwrap(); + let date = string_to_datetime(&tz, "2020-09-08 13:42:29").unwrap(); + let utc = date.naive_utc().to_string(); + assert_eq!(utc, "2020-09-08 11:42:29"); + let local = date.naive_local().to_string(); + assert_eq!(local, "2020-09-08 13:42:29"); + + let date = string_to_datetime(&tz, "2020-09-08 13:42:29Z").unwrap(); + let utc = date.naive_utc().to_string(); + assert_eq!(utc, "2020-09-08 13:42:29"); + let local = date.naive_local().to_string(); + assert_eq!(local, "2020-09-08 15:42:29"); + + let dt = + NaiveDateTime::parse_from_str("2020-09-08T13:42:29Z", "%Y-%m-%dT%H:%M:%SZ") + .unwrap(); + let local: Tz = "+08:00".parse().unwrap(); + + // Parsed as offset from UTC + let date = string_to_datetime(&local, "2020-09-08T13:42:29Z").unwrap(); + assert_eq!(dt, date.naive_utc()); + assert_ne!(dt, date.naive_local()); + + // Parsed as offset from local + let date = string_to_datetime(&local, "2020-09-08 13:42:29").unwrap(); + assert_eq!(dt, date.naive_local()); + assert_ne!(dt, date.naive_utc()); } #[test] From 79518cf67a6dd5fc391e271fd92c0c21ee7e8a74 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Sat, 4 Mar 2023 
18:37:06 +0100 Subject: [PATCH 0653/1411] Make InMemory object store track last modified time for each entry (#3796) * refactor: allow InMemoryUpload to store timestamp * use new last modified timestamp --- object_store/src/memory.rs | 67 +++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 40eee55a13cc..1433701e8512 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -20,7 +20,7 @@ use crate::MultipartId; use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; use async_trait::async_trait; use bytes::Bytes; -use chrono::Utc; +use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt}; use parking_lot::RwLock; use snafu::{ensure, OptionExt, Snafu}; @@ -33,6 +33,9 @@ use std::sync::Arc; use std::task::Poll; use tokio::io::AsyncWrite; +type Entry = (Bytes, DateTime); +type StorageType = Arc>>; + /// A specialized `Error` for in-memory object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -73,7 +76,7 @@ impl From for super::Error { /// storage provider. #[derive(Debug, Default)] pub struct InMemory { - storage: Arc>>, + storage: StorageType, } impl std::fmt::Display for InMemory { @@ -85,7 +88,9 @@ impl std::fmt::Display for InMemory { #[async_trait] impl ObjectStore for InMemory { async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.storage.write().insert(location.clone(), bytes); + self.storage + .write() + .insert(location.clone(), (bytes, Utc::now())); Ok(()) } @@ -113,19 +118,19 @@ impl ObjectStore for InMemory { } async fn get(&self, location: &Path) -> Result { - let data = self.get_bytes(location).await?; + let data = self.entry(location).await?; Ok(GetResult::Stream( - futures::stream::once(async move { Ok(data) }).boxed(), + futures::stream::once(async move { Ok(data.0) }).boxed(), )) } async fn get_range(&self, location: &Path, range: Range) -> Result { - let data = self.get_bytes(location).await?; - ensure!(range.end <= data.len(), OutOfRangeSnafu); + let data = self.entry(location).await?; + ensure!(range.end <= data.0.len(), OutOfRangeSnafu); ensure!(range.start <= range.end, BadRangeSnafu); - Ok(data.slice(range)) + Ok(data.0.slice(range)) } async fn get_ranges( @@ -133,24 +138,23 @@ impl ObjectStore for InMemory { location: &Path, ranges: &[Range], ) -> Result> { - let data = self.get_bytes(location).await?; + let data = self.entry(location).await?; ranges .iter() .map(|range| { - ensure!(range.end <= data.len(), OutOfRangeSnafu); + ensure!(range.end <= data.0.len(), OutOfRangeSnafu); ensure!(range.start <= range.end, BadRangeSnafu); - Ok(data.slice(range.clone())) + Ok(data.0.slice(range.clone())) }) .collect() } async fn head(&self, location: &Path) -> Result { - let last_modified = Utc::now(); - let bytes = self.get_bytes(location).await?; + let entry = self.entry(location).await?; Ok(ObjectMeta { location: location.clone(), - last_modified, - size: bytes.len(), + last_modified: entry.1, + size: entry.0.len(), }) } @@ -165,7 +169,6 @@ impl ObjectStore for InMemory { ) -> Result>> { let root = Path::default(); let prefix = prefix.unwrap_or(&root); - let last_modified = Utc::now(); let storage = self.storage.read(); let values: Vec<_> = storage @@ -180,8 +183,8 @@ impl ObjectStore for InMemory { .map(|(key, value)| { Ok(ObjectMeta { location: key.clone(), - last_modified, - size: value.len(), + last_modified: value.1, + size: value.0.len(), }) }) .collect(); @@ 
-197,7 +200,6 @@ impl ObjectStore for InMemory { let prefix = prefix.unwrap_or(&root); let mut common_prefixes = BTreeSet::new(); - let last_modified = Utc::now(); // Only objects in this base level should be returned in the // response. Otherwise, we just collect the common prefixes. @@ -224,8 +226,8 @@ impl ObjectStore for InMemory { } else { let object = ObjectMeta { location: k.clone(), - last_modified, - size: v.len(), + last_modified: v.1, + size: v.0.len(), }; objects.push(object); } @@ -238,13 +240,15 @@ impl ObjectStore for InMemory { } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - let data = self.get_bytes(from).await?; - self.storage.write().insert(to.clone(), data); + let data = self.entry(from).await?; + self.storage + .write() + .insert(to.clone(), (data.0, Utc::now())); Ok(()) } async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - let data = self.get_bytes(from).await?; + let data = self.entry(from).await?; let mut storage = self.storage.write(); if storage.contains_key(to) { return Err(Error::AlreadyExists { @@ -252,7 +256,7 @@ impl ObjectStore for InMemory { } .into()); } - storage.insert(to.clone(), data); + storage.insert(to.clone(), (data.0, Utc::now())); Ok(()) } } @@ -273,22 +277,23 @@ impl InMemory { } } - async fn get_bytes(&self, location: &Path) -> Result { + async fn entry(&self, location: &Path) -> Result<(Bytes, DateTime)> { let storage = self.storage.read(); - let bytes = storage + let value = storage .get(location) .cloned() .context(NoDataInMemorySnafu { path: location.to_string(), })?; - Ok(bytes) + + Ok(value) } } struct InMemoryUpload { location: Path, data: Vec, - storage: Arc>>, + storage: StorageType, } impl AsyncWrite for InMemoryUpload { @@ -313,7 +318,9 @@ impl AsyncWrite for InMemoryUpload { _cx: &mut std::task::Context<'_>, ) -> std::task::Poll> { let data = Bytes::from(std::mem::take(&mut self.data)); - self.storage.write().insert(self.location.clone(), data); + self.storage + .write() + .insert(self.location.clone(), (data, Utc::now())); Poll::Ready(Ok(())) } } From 7eb588d7a9cee6516d53be6228130eda24810d37 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 5 Mar 2023 13:10:12 +0000 Subject: [PATCH 0654/1411] Cleanup ApplicationDefaultCredentials (#3799) * Cleanup ApplicationDefaultCredentials * Fix doc --- object_store/src/gcp/credential.rs | 84 +++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index 853e4ce83842..a8dce7132755 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -62,8 +62,8 @@ pub enum Error { #[snafu(display("Error getting token response body: {}", source))] TokenResponseBody { source: reqwest::Error }, - #[snafu(display("A configuration file was passed in but was not used."))] - UnusedConfigurationFile, + #[snafu(display("Unsupported ApplicationCredentials type: {}", type_))] + UnsupportedCredentialsType { type_: String }, #[snafu(display("Error creating client: {}", source))] Client { source: crate::Error }, @@ -399,36 +399,60 @@ impl TokenProvider for InstanceCredentialProvider { } } +/// ApplicationDefaultCredentials +/// +#[derive(Debug)] +pub enum ApplicationDefaultCredentials { + /// + AuthorizedUser { + client_id: String, + client_secret: String, + refresh_token: String, + }, +} + +impl ApplicationDefaultCredentials { + pub fn new(path: Option<&str>) -> Result, Error> { + 
let file = match ApplicationDefaultCredentialsFile::read(path)? { + Some(f) => f, + None => return Ok(None), + }; + + Ok(Some(match file.type_.as_str() { + "authorized_user" => Self::AuthorizedUser { + client_id: file.client_id, + client_secret: file.client_secret, + refresh_token: file.refresh_token, + }, + type_ => return UnsupportedCredentialsTypeSnafu { type_ }.fail(), + })) + } +} + /// A deserialized `application_default_credentials.json`-file. /// -#[derive(serde::Deserialize, Debug)] -pub struct ApplicationDefaultCredentials { +#[derive(serde::Deserialize)] +struct ApplicationDefaultCredentialsFile { + #[serde(default)] client_id: String, + #[serde(default)] client_secret: String, + #[serde(default)] refresh_token: String, #[serde(rename = "type")] type_: String, } -impl ApplicationDefaultCredentials { - const DEFAULT_TOKEN_GCP_URI: &'static str = - "https://accounts.google.com/o/oauth2/token"; +impl ApplicationDefaultCredentialsFile { const CREDENTIALS_PATH: &'static str = ".config/gcloud/application_default_credentials.json"; - const EXPECTED_TYPE: &str = "authorized_user"; // Create a new application default credential in the following situations: // 1. a file is passed in and the type matches. // 2. without argument if the well-known configuration file is present. - pub fn new(path: Option<&str>) -> Result, Error> { + fn read(path: Option<&str>) -> Result, Error> { if let Some(path) = path { - if let Ok(credentials) = read_credentials_file::(path) { - if credentials.type_ == Self::EXPECTED_TYPE { - return Ok(Some(credentials)); - } - } - // Return an error if the path has not been used. - return Err(Error::UnusedConfigurationFile); + return read_credentials_file::(path).map(Some); } if let Some(home) = env::var_os("HOME") { let path = Path::new(&home).join(Self::CREDENTIALS_PATH); @@ -442,6 +466,8 @@ impl ApplicationDefaultCredentials { } } +const DEFAULT_TOKEN_GCP_URI: &str = "https://accounts.google.com/o/oauth2/token"; + #[async_trait] impl TokenProvider for ApplicationDefaultCredentials { async fn fetch_token( @@ -449,16 +475,24 @@ impl TokenProvider for ApplicationDefaultCredentials { client: &Client, retry: &RetryConfig, ) -> Result, Error> { - let body = [ - ("grant_type", "refresh_token"), - ("client_id", &self.client_id), - ("client_secret", &self.client_secret), - ("refresh_token", &self.refresh_token), - ]; + let builder = client.request(Method::POST, DEFAULT_TOKEN_GCP_URI); + let builder = match self { + Self::AuthorizedUser { + client_id, + client_secret, + refresh_token, + } => { + let body = [ + ("grant_type", "refresh_token"), + ("client_id", client_id), + ("client_secret", client_secret), + ("refresh_token", refresh_token), + ]; + builder.form(&body) + } + }; - let response = client - .request(Method::POST, Self::DEFAULT_TOKEN_GCP_URI) - .form(&body) + let response = builder .send_retry(retry) .await .context(TokenRequestSnafu)? 
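
A minimal usage sketch for the credentials refactor above, assuming the `ApplicationDefaultCredentials` enum and its `new` constructor from the preceding diff are in scope; the helper `describe_adc` is hypothetical and not part of the patch. It illustrates the three outcomes the refactor distinguishes: an `authorized_user` file, no file found at all, and a file with an unsupported `type`.

// Sketch only: exercises the API introduced by the patch above.
fn describe_adc(path: Option<&str>) {
    match ApplicationDefaultCredentials::new(path) {
        // A passed-in or well-known credentials file of type "authorized_user"
        // yields the refresh-token variant used by the token provider.
        Ok(Some(ApplicationDefaultCredentials::AuthorizedUser { client_id, .. })) => {
            println!("authorized_user credentials for client {client_id}")
        }
        // No path was given and the well-known file does not exist.
        Ok(None) => println!("no application default credentials found"),
        // A file was found but its `type` field is not supported.
        Err(e) => eprintln!("unable to use application default credentials: {e}"),
    }
}
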
From 2f54ae9cf7736d491cfc890bdc28384c8aaefd6b Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Mon, 6 Mar 2023 10:01:40 +0100 Subject: [PATCH 0655/1411] fix: change uft8 to timestamp with timezone (#3806) --- arrow-cast/src/cast.rs | 66 ++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 19 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 8e3bde990fcd..af192fdd5e1e 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -609,7 +609,6 @@ pub fn cast_with_options( ) -> Result { use DataType::*; let from_type = array.data_type(); - // clone array if types are the same if from_type == to_type { return Ok(make_array(array.data().clone())); @@ -1147,17 +1146,17 @@ pub fn cast_with_options( Time64(TimeUnit::Nanosecond) => { cast_string_to_time64nanosecond::(array, cast_options) } - Timestamp(TimeUnit::Second, _) => { - cast_string_to_timestamp::(array, cast_options) + Timestamp(TimeUnit::Second, to_tz) => { + cast_string_to_timestamp::(array, to_tz,cast_options) } - Timestamp(TimeUnit::Millisecond, _) => { - cast_string_to_timestamp::(array, cast_options) + Timestamp(TimeUnit::Millisecond, to_tz) => { + cast_string_to_timestamp::(array, to_tz, cast_options) } - Timestamp(TimeUnit::Microsecond, _) => { - cast_string_to_timestamp::(array, cast_options) + Timestamp(TimeUnit::Microsecond, to_tz) => { + cast_string_to_timestamp::(array, to_tz,cast_options) } - Timestamp(TimeUnit::Nanosecond, _) => { - cast_string_to_timestamp::(array, cast_options) + Timestamp(TimeUnit::Nanosecond, to_tz) => { + cast_string_to_timestamp::(array, to_tz,cast_options) } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", @@ -1197,17 +1196,17 @@ pub fn cast_with_options( Time64(TimeUnit::Nanosecond) => { cast_string_to_time64nanosecond::(array, cast_options) } - Timestamp(TimeUnit::Second, _) => { - cast_string_to_timestamp::(array, cast_options) + Timestamp(TimeUnit::Second, to_tz) => { + cast_string_to_timestamp::(array, to_tz,cast_options) } - Timestamp(TimeUnit::Millisecond, _) => { - cast_string_to_timestamp::(array, cast_options) + Timestamp(TimeUnit::Millisecond, to_tz) => { + cast_string_to_timestamp::(array, to_tz,cast_options) } - Timestamp(TimeUnit::Microsecond, _) => { - cast_string_to_timestamp::(array, cast_options) + Timestamp(TimeUnit::Microsecond, to_tz) => { + cast_string_to_timestamp::(array, to_tz,cast_options) } - Timestamp(TimeUnit::Nanosecond, _) => { - cast_string_to_timestamp::(array, cast_options) + Timestamp(TimeUnit::Nanosecond, to_tz) => { + cast_string_to_timestamp::(array, to_tz,cast_options) } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", @@ -2582,6 +2581,7 @@ fn cast_string_to_timestamp< TimestampType: ArrowTimestampType, >( array: &dyn Array, + to_tz: &Option, cast_options: &CastOptions, ) -> Result { let string_array = array @@ -2604,7 +2604,11 @@ fn cast_string_to_timestamp< // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } + + unsafe { + PrimitiveArray::::from_trusted_len_iter(iter) + .with_timezone_opt(to_tz.clone()) + } } else { let vec = string_array .iter() @@ -2618,7 +2622,10 @@ fn cast_string_to_timestamp< // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. 
- unsafe { PrimitiveArray::::from_trusted_len_iter(vec.iter()) } + unsafe { + PrimitiveArray::::from_trusted_len_iter(vec.iter()) + .with_timezone_opt(to_tz.clone()) + } }; Ok(Arc::new(array) as ArrayRef) @@ -7639,4 +7646,25 @@ mod tests { assert_eq!(a.value(0), ""); assert_eq!(a.value(1), "\x00 Foo"); } + + #[test] + fn test_cast_utf8_to_timestamptz() { + let valid = StringArray::from(vec!["2023-01-01"]); + + let array = Arc::new(valid) as ArrayRef; + let b = cast( + &array, + &DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".to_owned())), + ) + .unwrap(); + + let expect = DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".to_owned())); + + assert_eq!(b.data_type(), &expect); + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1672531200000000000, c.value(0)); + } } From 7f460aff0a6438f2ff90087fb9ecd6aa0e1e891b Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Mon, 6 Mar 2023 18:40:50 +0100 Subject: [PATCH 0656/1411] feat: add simple flight SQL CLI client (#3789) --- arrow-flight/Cargo.toml | 13 ++ arrow-flight/src/bin/flight_sql_client.rs | 199 ++++++++++++++++++++++ 2 files changed, 212 insertions(+) create mode 100644 arrow-flight/src/bin/flight_sql_client.rs diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index fd77a814ab88..f1cd7d4fb23b 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -41,6 +41,12 @@ prost-derive = { version = "0.11", default-features = false } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } futures = { version = "0.3", default-features = false, features = ["alloc"] } +# CLI-related dependencies +arrow = { version = "34.0.0", path = "../arrow", optional = true } +clap = { version = "4.1", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } +tracing-log = { version = "0.1", optional = true } +tracing-subscriber = { version = "0.3.1", default-features = false, features = ["ansi", "fmt"], optional = true } + [package.metadata.docs.rs] all-features = true @@ -49,6 +55,9 @@ default = [] flight-sql-experimental = [] tls = ["tonic/tls"] +# Enable CLI tools +cli = ["arrow/prettyprint", "clap", "tracing-log", "tracing-subscriber", "tonic/tls-webpki-roots"] + [dev-dependencies] arrow = { version = "34.0.0", path = "../arrow", features = ["prettyprint"] } tempfile = "3.3" @@ -65,3 +74,7 @@ tonic-build = { version = "=0.8.4", default-features = false, features = ["trans [[example]] name = "flight_sql_server" required-features = ["flight-sql-experimental", "tls"] + +[[bin]] +name = "flight_sql_client" +required-features = ["cli", "flight-sql-experimental", "tls"] diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs new file mode 100644 index 000000000000..9f211eaf63bc --- /dev/null +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -0,0 +1,199 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{sync::Arc, time::Duration}; + +use arrow::error::Result; +use arrow::util::pretty::pretty_format_batches; +use arrow_array::RecordBatch; +use arrow_flight::{ + sql::client::FlightSqlServiceClient, utils::flight_data_to_batches, FlightData, +}; +use arrow_schema::{ArrowError, Schema}; +use clap::Parser; +use futures::TryStreamExt; +use tonic::transport::{ClientTlsConfig, Endpoint}; +use tracing_log::log::info; + +/// A ':' separated key value pair +#[derive(Debug, Clone)] +struct KeyValue { + pub key: K, + pub value: V, +} + +impl std::str::FromStr for KeyValue +where + K: std::str::FromStr, + V: std::str::FromStr, + K::Err: std::fmt::Display, + V::Err: std::fmt::Display, +{ + type Err = String; + + fn from_str(s: &str) -> std::result::Result { + let parts = s.splitn(2, ':').collect::>(); + match parts.as_slice() { + [key, value] => { + let key = K::from_str(key).map_err(|e| e.to_string())?; + let value = V::from_str(value).map_err(|e| e.to_string())?; + Ok(Self { key, value }) + } + _ => Err(format!( + "Invalid key value pair - expected 'KEY:VALUE' got '{s}'" + )), + } + } +} + +#[derive(Debug, Parser)] +struct ClientArgs { + /// Additional headers. + /// + /// Values should be key value pairs separated by ':' + #[clap(long, value_delimiter = ',')] + headers: Vec>, + + /// Username + #[clap(long)] + username: Option, + + /// Password + #[clap(long)] + password: Option, + + /// Auth token. + #[clap(long)] + token: Option, + + /// Use TLS. + #[clap(long)] + tls: bool, + + /// Server host. + #[clap(long)] + host: String, + + /// Server port. + #[clap(long)] + port: Option, +} + +#[derive(Debug, Parser)] +struct Args { + /// Client args. + #[clap(flatten)] + client_args: ClientArgs, + + /// SQL query. 
+ query: String, +} + +#[tokio::main] +async fn main() { + let args = Args::parse(); + setup_logging(); + let mut client = setup_client(args.client_args).await.expect("setup client"); + + let info = client.execute(args.query).await.expect("prepare statement"); + info!("got flight info"); + + let schema = Arc::new(Schema::try_from(info.clone()).expect("valid schema")); + let mut batches = Vec::with_capacity(info.endpoint.len() + 1); + batches.push(RecordBatch::new_empty(schema)); + info!("decoded schema"); + + for endpoint in info.endpoint { + let Some(ticket) = &endpoint.ticket else { + panic!("did not get ticket"); + }; + let flight_data = client.do_get(ticket.clone()).await.expect("do get"); + let flight_data: Vec = flight_data + .try_collect() + .await + .expect("collect data stream"); + let mut endpoint_batches = flight_data_to_batches(&flight_data) + .expect("convert flight data to record batches"); + batches.append(&mut endpoint_batches); + } + info!("received data"); + + let res = pretty_format_batches(batches.as_slice()).expect("format results"); + println!("{res}"); +} + +fn setup_logging() { + tracing_log::LogTracer::init().expect("tracing log init"); + tracing_subscriber::fmt::init(); +} + +async fn setup_client(args: ClientArgs) -> Result { + let port = args.port.unwrap_or(if args.tls { 443 } else { 80 }); + + let mut endpoint = Endpoint::new(format!("https://{}:{}", args.host, port)) + .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? + .connect_timeout(Duration::from_secs(20)) + .timeout(Duration::from_secs(20)) + .tcp_nodelay(true) // Disable Nagle's Algorithm since we don't want packets to wait + .tcp_keepalive(Option::Some(Duration::from_secs(3600))) + .http2_keep_alive_interval(Duration::from_secs(300)) + .keep_alive_timeout(Duration::from_secs(20)) + .keep_alive_while_idle(true); + + if args.tls { + let tls_config = ClientTlsConfig::new(); + endpoint = endpoint + .tls_config(tls_config) + .map_err(|_| ArrowError::IoError("Cannot create TLS endpoint".to_string()))?; + } + + let channel = endpoint + .connect() + .await + .map_err(|e| ArrowError::IoError(format!("Cannot connect to endpoint: {e}")))?; + + let mut client = FlightSqlServiceClient::new(channel); + info!("connected"); + + for kv in args.headers { + client.set_header(kv.key, kv.value); + } + + if let Some(token) = args.token { + client.set_token(token); + info!("token set"); + } + + match (args.username, args.password) { + (None, None) => {} + (Some(username), Some(password)) => { + client + .handshake(&username, &password) + .await + .expect("handshake"); + info!("performed handshake"); + } + (Some(_), None) => { + panic!("when username is set, you also need to set a password") + } + (None, Some(_)) => { + panic!("when password is set, you also need to set a username") + } + } + + Ok(client) +} From 3df7c00a358cff34da8bacd819e791892755d3a9 Mon Sep 17 00:00:00 2001 From: bold Date: Mon, 6 Mar 2023 23:45:51 +0100 Subject: [PATCH 0657/1411] Support reading decimal arrays from json (#3805) * support decimals in json reader * fix clippy * move decimal parsing to cast and improve impl * iterate over bytes instead of chars Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * make decimal string check private Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * replace macro by generics * Test cleanup * apply scale to integers --------- Co-authored-by: b.specht Co-authored-by: Raphael Taylor-Davies 
<1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies --- arrow-cast/src/parse.rs | 173 +++++++++++++++++++++++++- arrow-csv/src/reader/mod.rs | 212 +------------------------------- arrow-json/src/reader.rs | 150 +++++++++++++++++++++- arrow-json/test/data/basic.json | 24 ++-- 4 files changed, 331 insertions(+), 228 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index a48dd2bac7d2..f498bf142bd7 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -16,7 +16,8 @@ // under the License. use arrow_array::types::*; -use arrow_array::ArrowPrimitiveType; +use arrow_array::{ArrowNativeTypeOp, ArrowPrimitiveType}; +use arrow_buffer::ArrowNativeType; use arrow_schema::ArrowError; use chrono::prelude::*; @@ -459,10 +460,114 @@ impl Parser for Date64Type { } } +/// Parse the string format decimal value to i128/i256 format and checking the precision and scale. +/// The result value can't be out of bounds. +pub fn parse_decimal( + s: &str, + precision: u8, + scale: i8, +) -> Result { + if !is_valid_decimal(s) { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + let mut offset = s.len(); + let len = s.len(); + let mut base = T::Native::usize_as(1); + let scale_usize = usize::from(scale as u8); + + // handle the value after the '.' and meet the scale + let delimiter_position = s.find('.'); + match delimiter_position { + None => { + // there is no '.' + base = T::Native::usize_as(10).pow_checked(scale as u32)?; + } + Some(mid) => { + // there is the '.' + if len - mid >= scale_usize + 1 { + // If the string value is "123.12345" and the scale is 2, we should just remain '.12' and drop the '345' value. + offset -= len - mid - 1 - scale_usize; + } else { + // If the string value is "123.12" and the scale is 4, we should append '00' to the tail. + base = T::Native::usize_as(10) + .pow_checked((scale_usize + 1 + mid - len) as u32)?; + } + } + }; + + // each byte is digit、'-' or '.' + let bytes = s.as_bytes(); + let mut negative = false; + let mut result = T::Native::usize_as(0); + + bytes[0..offset] + .iter() + .rev() + .try_for_each::<_, Result<(), ArrowError>>(|&byte| { + match byte { + b'-' => { + negative = true; + } + b'0'..=b'9' => { + let add = + T::Native::usize_as((byte - b'0') as usize).mul_checked(base)?; + result = result.add_checked(add)?; + base = base.mul_checked(T::Native::usize_as(10))?; + } + // because we have checked the string value + _ => (), + } + Ok(()) + })?; + + if negative { + result = result.neg_checked()?; + } + + match T::validate_decimal_precision(result, precision) { + Ok(_) => Ok(result), + Err(e) => Err(ArrowError::ParseError(format!( + "parse decimal overflow: {e}" + ))), + } +} + +fn is_valid_decimal(s: &str) -> bool { + let mut seen_dot = false; + let mut seen_digit = false; + let mut seen_sign = false; + + for c in s.as_bytes() { + match c { + b'-' | b'+' => { + if seen_digit || seen_dot || seen_sign { + return false; + } + seen_sign = true; + } + b'.' 
=> { + if seen_dot { + return false; + } + seen_dot = true; + } + b'0'..=b'9' => { + seen_digit = true; + } + _ => return false, + } + } + + seen_digit +} + #[cfg(test)] mod tests { use super::*; use arrow_array::timezone::Tz; + use arrow_buffer::i256; #[test] fn string_to_timestamp_timezone() { @@ -920,4 +1025,70 @@ mod tests { .map_err(|e| assert!(e.to_string().ends_with(ERR_NANOSECONDS_NOT_SUPPORTED))) .unwrap_err(); } + + #[test] + fn test_parse_decimal_with_parameter() { + let tests = [ + ("123.123", 123123i128), + ("123.1234", 123123i128), + ("123.1", 123100i128), + ("123", 123000i128), + ("-123.123", -123123i128), + ("-123.1234", -123123i128), + ("-123.1", -123100i128), + ("-123", -123000i128), + ("0.0000123", 0i128), + ("12.", 12000i128), + ("-12.", -12000i128), + ("00.1", 100i128), + ("-00.1", -100i128), + ("12345678912345678.1234", 12345678912345678123i128), + ("-12345678912345678.1234", -12345678912345678123i128), + ("99999999999999999.999", 99999999999999999999i128), + ("-99999999999999999.999", -99999999999999999999i128), + (".123", 123i128), + ("-.123", -123i128), + ("123.", 123000i128), + ("-123.", -123000i128), + ]; + for (s, i) in tests { + let result_128 = parse_decimal::(s, 20, 3); + assert_eq!(i, result_128.unwrap()); + let result_256 = parse_decimal::(s, 20, 3); + assert_eq!(i256::from_i128(i), result_256.unwrap()); + } + let can_not_parse_tests = ["123,123", ".", "123.123.123"]; + for s in can_not_parse_tests { + let result_128 = parse_decimal::(s, 20, 3); + assert_eq!( + format!("Parser error: can't parse the string value {s} to decimal"), + result_128.unwrap_err().to_string() + ); + let result_256 = parse_decimal::(s, 20, 3); + assert_eq!( + format!("Parser error: can't parse the string value {s} to decimal"), + result_256.unwrap_err().to_string() + ); + } + let overflow_parse_tests = ["12345678", "12345678.9", "99999999.99"]; + for s in overflow_parse_tests { + let result_128 = parse_decimal::(s, 10, 3); + let expected_128 = "Parser error: parse decimal overflow"; + let actual_128 = result_128.unwrap_err().to_string(); + + assert!( + actual_128.contains(expected_128), + "actual: '{actual_128}', expected: '{expected_128}'" + ); + + let result_256 = parse_decimal::(s, 10, 3); + let expected_256 = "Parser error: parse decimal overflow"; + let actual_256 = result_256.unwrap_err().to_string(); + + assert!( + actual_256.contains(expected_256), + "actual: '{actual_256}', expected: '{expected_256}'" + ); + } + } } diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 84d55c4ae24b..8b1cd2f79930 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -44,10 +44,8 @@ mod records; use arrow_array::builder::PrimitiveBuilder; use arrow_array::types::*; -use arrow_array::ArrowNativeTypeOp; use arrow_array::*; -use arrow_buffer::ArrowNativeType; -use arrow_cast::parse::Parser; +use arrow_cast::parse::{parse_decimal, Parser}; use arrow_schema::*; use lazy_static::lazy_static; use regex::{Regex, RegexSet}; @@ -72,8 +70,6 @@ lazy_static! 
{ r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d.\d{1,6}$", //Timestamp(Microsecond) r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d.\d{1,9}$", //Timestamp(Nanosecond) ]).unwrap(); - static ref PARSE_DECIMAL_RE: Regex = - Regex::new(r"^-?(\d+\.?\d*|\d*\.?\d+)$").unwrap(); } #[derive(Default, Copy, Clone)] @@ -823,7 +819,7 @@ fn build_decimal_array( decimal_builder.append_null(); } else { let decimal_value: Result = - parse_decimal_with_parameter::(s, precision, scale); + parse_decimal::(s, precision, scale); match decimal_value { Ok(v) => { decimal_builder.append_value(v); @@ -841,127 +837,6 @@ fn build_decimal_array( )) } -// Parse the string format decimal value to i128/i256 format and checking the precision and scale. -// The result value can't be out of bounds. -fn parse_decimal_with_parameter( - s: &str, - precision: u8, - scale: i8, -) -> Result { - if PARSE_DECIMAL_RE.is_match(s) { - let mut offset = s.len(); - let len = s.len(); - let mut base = T::Native::usize_as(1); - let scale_usize = usize::from(scale as u8); - - // handle the value after the '.' and meet the scale - let delimiter_position = s.find('.'); - match delimiter_position { - None => { - // there is no '.' - base = T::Native::usize_as(10).pow_checked(scale as u32)?; - } - Some(mid) => { - // there is the '.' - if len - mid >= scale_usize + 1 { - // If the string value is "123.12345" and the scale is 2, we should just remain '.12' and drop the '345' value. - offset -= len - mid - 1 - scale_usize; - } else { - // If the string value is "123.12" and the scale is 4, we should append '00' to the tail. - base = T::Native::usize_as(10) - .pow_checked((scale_usize + 1 + mid - len) as u32)?; - } - } - }; - - // each byte is digit、'-' or '.' - let bytes = s.as_bytes(); - let mut negative = false; - let mut result = T::Native::usize_as(0); - - bytes[0..offset] - .iter() - .rev() - .try_for_each::<_, Result<(), ArrowError>>(|&byte| { - match byte { - b'-' => { - negative = true; - } - b'0'..=b'9' => { - let add = T::Native::usize_as((byte - b'0') as usize) - .mul_checked(base)?; - result = result.add_checked(add)?; - base = base.mul_checked(T::Native::usize_as(10))?; - } - // because of the PARSE_DECIMAL_RE, bytes just contains digit、'-' and '.'. - _ => (), - } - Ok(()) - })?; - - if negative { - result = result.neg_checked()?; - } - - match T::validate_decimal_precision(result, precision) { - Ok(_) => Ok(result), - Err(e) => Err(ArrowError::ParseError(format!( - "parse decimal overflow: {e}" - ))), - } - } else { - Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))) - } -} - -// Parse the string format decimal value to i128 format without checking the precision and scale. -// Like "125.12" to 12512_i128. -#[cfg(test)] -fn parse_decimal(s: &str) -> Result { - use std::ops::Neg; - - if PARSE_DECIMAL_RE.is_match(s) { - let mut offset = s.len(); - // each byte is digit、'-' or '.' - let bytes = s.as_bytes(); - let mut negative = false; - let mut result: i128 = 0; - let mut base = 1; - while offset > 0 { - match bytes[offset - 1] { - b'-' => { - negative = true; - } - b'.' 
=> { - // do nothing - } - b'0'..=b'9' => { - result += i128::from(bytes[offset - 1] - b'0') * base; - base *= 10; - } - _ => { - return Err(ArrowError::ParseError(format!( - "can't match byte {}", - bytes[offset - 1] - ))); - } - } - offset -= 1; - } - if negative { - Ok(result.neg()) - } else { - Ok(result) - } - } else { - Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))) - } -} - // parses a specific column (col_idx) into an Arrow Array. fn build_primitive_array( line_number: usize, @@ -1268,7 +1143,6 @@ impl ReaderBuilder { mod tests { use super::*; - use arrow_buffer::i256; use std::io::{Cursor, Write}; use tempfile::NamedTempFile; @@ -1812,88 +1686,6 @@ mod tests { ); } - #[test] - fn test_parse_decimal() { - let tests = [ - ("123.00", 12300i128), - ("123.123", 123123i128), - ("0.0123", 123i128), - ("0.12300", 12300i128), - ("-5.123", -5123i128), - ("-45.432432", -45432432i128), - ]; - for (s, i) in tests { - let result = parse_decimal(s); - assert_eq!(i, result.unwrap()); - } - } - - #[test] - fn test_parse_decimal_with_parameter() { - let tests = [ - ("123.123", 123123i128), - ("123.1234", 123123i128), - ("123.1", 123100i128), - ("123", 123000i128), - ("-123.123", -123123i128), - ("-123.1234", -123123i128), - ("-123.1", -123100i128), - ("-123", -123000i128), - ("0.0000123", 0i128), - ("12.", 12000i128), - ("-12.", -12000i128), - ("00.1", 100i128), - ("-00.1", -100i128), - ("12345678912345678.1234", 12345678912345678123i128), - ("-12345678912345678.1234", -12345678912345678123i128), - ("99999999999999999.999", 99999999999999999999i128), - ("-99999999999999999.999", -99999999999999999999i128), - (".123", 123i128), - ("-.123", -123i128), - ("123.", 123000i128), - ("-123.", -123000i128), - ]; - for (s, i) in tests { - let result_128 = parse_decimal_with_parameter::(s, 20, 3); - assert_eq!(i, result_128.unwrap()); - let result_256 = parse_decimal_with_parameter::(s, 20, 3); - assert_eq!(i256::from_i128(i), result_256.unwrap()); - } - let can_not_parse_tests = ["123,123", ".", "123.123.123"]; - for s in can_not_parse_tests { - let result_128 = parse_decimal_with_parameter::(s, 20, 3); - assert_eq!( - format!("Parser error: can't parse the string value {s} to decimal"), - result_128.unwrap_err().to_string() - ); - let result_256 = parse_decimal_with_parameter::(s, 20, 3); - assert_eq!( - format!("Parser error: can't parse the string value {s} to decimal"), - result_256.unwrap_err().to_string() - ); - } - let overflow_parse_tests = ["12345678", "12345678.9", "99999999.99"]; - for s in overflow_parse_tests { - let result_128 = parse_decimal_with_parameter::(s, 10, 3); - let expected_128 = "Parser error: parse decimal overflow"; - let actual_128 = result_128.unwrap_err().to_string(); - - assert!( - actual_128.contains(expected_128), - "actual: '{actual_128}', expected: '{expected_128}'" - ); - - let result_256 = parse_decimal_with_parameter::(s, 10, 3); - let expected_256 = "Parser error: parse decimal overflow"; - let actual_256 = result_256.unwrap_err().to_string(); - - assert!( - actual_256.contains(expected_256), - "actual: '{actual_256}', expected: '{expected_256}'" - ); - } - } - #[test] fn test_parse_timestamp_microseconds() { assert_eq!( diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index 3ac39c110fc9..f4610eb345ea 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -58,8 +58,8 @@ use serde_json::{map::Map as JsonMap, Value}; use arrow_array::builder::*; use arrow_array::types::*; use arrow_array::*; -use 
arrow_buffer::{bit_util, Buffer, MutableBuffer}; -use arrow_cast::parse::Parser; +use arrow_buffer::{bit_util, i256, Buffer, MutableBuffer}; +use arrow_cast::parse::{parse_decimal, Parser}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::*; @@ -1019,6 +1019,76 @@ impl Decoder { )) } + fn build_decimal128_array( + &self, + rows: &[Value], + col_name: &str, + precision: u8, + scale: i8, + ) -> Result { + Ok(Arc::new( + rows.iter() + .map(|row| { + row.get(col_name).and_then(|value| { + if value.is_i64() { + let mul = 10i128.pow(scale as _); + value + .as_i64() + .and_then(num::cast::cast) + .map(|v: i128| v * mul) + } else if value.is_u64() { + let mul = 10i128.pow(scale as _); + value + .as_u64() + .and_then(num::cast::cast) + .map(|v: i128| v * mul) + } else if value.is_string() { + value.as_str().and_then(|s| { + parse_decimal::(s, precision, scale).ok() + }) + } else { + let mul = 10_f64.powi(scale as i32); + value.as_f64().map(|f| (f * mul).round() as i128) + } + }) + }) + .collect::() + .with_precision_and_scale(precision, scale)?, + )) + } + + fn build_decimal256_array( + &self, + rows: &[Value], + col_name: &str, + precision: u8, + scale: i8, + ) -> Result { + let mul = 10_f64.powi(scale as i32); + Ok(Arc::new( + rows.iter() + .map(|row| { + row.get(col_name).and_then(|value| { + if value.is_i64() { + let mul = i256::from_i128(10).pow_wrapping(scale as _); + value.as_i64().map(|i| i256::from_i128(i as _) * mul) + } else if value.is_u64() { + let mul = i256::from_i128(10).pow_wrapping(scale as _); + value.as_u64().map(|i| i256::from_i128(i as _) * mul) + } else if value.is_string() { + value.as_str().and_then(|s| { + parse_decimal::(s, precision, scale).ok() + }) + } else { + value.as_f64().and_then(|f| i256::from_f64(f * mul.round())) + } + }) + }) + .collect::() + .with_precision_and_scale(precision, scale)?, + )) + } + /// Build a nested GenericListArray from a list of unnested `Value`s fn build_nested_list_array( &self, @@ -1379,6 +1449,10 @@ impl Decoder { field.data_type(), map_field, ), + DataType::Decimal128(precision, scale) => self + .build_decimal128_array(rows, field.name(), *precision, *scale), + DataType::Decimal256(precision, scale) => self + .build_decimal256_array(rows, field.name(), *precision, *scale), _ => Err(ArrowError::JsonError(format!( "{:?} type is not supported", field.data_type() @@ -1776,7 +1850,7 @@ mod tests { as_boolean_array, as_dictionary_array, as_primitive_array, as_string_array, as_struct_array, }; - use arrow_buffer::ToByteSlice; + use arrow_buffer::{ArrowNativeType, ToByteSlice}; use arrow_schema::DataType::{Dictionary, List}; use flate2::read::GzDecoder; use std::fs::File; @@ -1790,7 +1864,7 @@ mod tests { .unwrap(); let batch = reader.next().unwrap().unwrap(); - assert_eq!(5, batch.num_columns()); + assert_eq!(6, batch.num_columns()); assert_eq!(12, batch.num_rows()); let schema = reader.schema(); @@ -3328,7 +3402,7 @@ mod tests { let mut sum_a = 0; for batch in reader { let batch = batch.unwrap(); - assert_eq!(5, batch.num_columns()); + assert_eq!(6, batch.num_columns()); sum_num_rows += batch.num_rows(); num_batches += 1; let batch_schema = batch.schema(); @@ -3352,4 +3426,70 @@ mod tests { let cloned = options.clone(); assert_eq!(options, cloned); } + + pub fn decimal_json_tests(data_type: DataType) { + let schema = Schema::new(vec![ + Field::new("a", data_type.clone(), true), + Field::new("b", data_type.clone(), true), + Field::new("f", data_type.clone(), true), + ]); + + let builder = ReaderBuilder::new() + 
.with_schema(Arc::new(schema)) + .with_batch_size(64); + let mut reader: Reader = builder + .build::(File::open("test/data/basic.json").unwrap()) + .unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(3, batch.num_columns()); + assert_eq!(12, batch.num_rows()); + + let schema = reader.schema(); + let batch_schema = batch.schema(); + assert_eq!(schema, batch_schema); + + let a = schema.column_with_name("a").unwrap(); + let b = schema.column_with_name("b").unwrap(); + let f = schema.column_with_name("f").unwrap(); + assert_eq!(&data_type, a.1.data_type()); + assert_eq!(&data_type, b.1.data_type()); + assert_eq!(&data_type, f.1.data_type()); + + let aa = batch + .column(a.0) + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(T::Native::usize_as(100), aa.value(0)); + assert_eq!(T::Native::usize_as(100), aa.value(3)); + assert_eq!(T::Native::usize_as(500), aa.value(7)); + + let bb = batch + .column(b.0) + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(T::Native::usize_as(200), bb.value(0)); + assert_eq!(T::Native::usize_as(350).neg_wrapping(), bb.value(1)); + assert_eq!(T::Native::usize_as(60), bb.value(8)); + + let ff = batch + .column(f.0) + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(T::Native::usize_as(102), ff.value(0)); + assert_eq!(T::Native::usize_as(30).neg_wrapping(), ff.value(1)); + assert_eq!(T::Native::usize_as(137722), ff.value(2)); + + assert_eq!(T::Native::usize_as(133700), ff.value(3)); + assert_eq!(T::Native::usize_as(9999999999), ff.value(7)); + } + + #[test] + fn test_decimal_from_json() { + decimal_json_tests::(DataType::Decimal128(10, 2)); + decimal_json_tests::(DataType::Decimal256(10, 2)); + } } diff --git a/arrow-json/test/data/basic.json b/arrow-json/test/data/basic.json index 556c39c46be9..8de246e1ac28 100644 --- a/arrow-json/test/data/basic.json +++ b/arrow-json/test/data/basic.json @@ -1,12 +1,12 @@ -{"a":1, "b":2.0, "c":false, "d":"4", "e":"1970-1-2"} -{"a":-10, "b":-3.5, "c":true, "d":"4", "e": "1969-12-31"} -{"a":2, "b":0.6, "c":false, "d":"text", "e": "1970-01-02 11:11:11"} -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":7, "b":-3.5, "c":true, "d":"4"} -{"a":1, "b":0.6, "c":false, "d":"text"} -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":5, "b":-3.5, "c":true, "d":"4"} -{"a":1, "b":0.6, "c":false, "d":"text"} -{"a":1, "b":2.0, "c":false, "d":"4"} -{"a":1, "b":-3.5, "c":true, "d":"4"} -{"a":100000000000000, "b":0.6, "c":false, "d":"text"} +{"a":1, "b":2.0, "c":false, "d":"4", "e":"1970-1-2", "f": "1.02"} +{"a":-10, "b":-3.5, "c":true, "d":"4", "e": "1969-12-31", "f": "-0.3"} +{"a":2, "b":0.6, "c":false, "d":"text", "e": "1970-01-02 11:11:11", "f": "1377.223"} +{"a":1, "b":2.0, "c":false, "d":"4", "f": "1337.009"} +{"a":7, "b":-3.5, "c":true, "d":"4", "f": "1"} +{"a":1, "b":0.6, "c":false, "d":"text", "f": "1338"} +{"a":1, "b":2.0, "c":false, "d":"4", "f": "12345829100000"} +{"a":5, "b":-3.5, "c":true, "d":"4", "f": "99999999.99"} +{"a":1, "b":0.6, "c":false, "d":"text", "f": "1"} +{"a":1, "b":2.0, "c":false, "d":"4", "f": "1"} +{"a":1, "b":-3.5, "c":true, "d":"4", "f": "1"} +{"a":100000000000000, "b":0.6, "c":false, "d":"text", "f": "1"} From 14544fb959fd8bf8a733e137302a567ca9381b95 Mon Sep 17 00:00:00 2001 From: Jie Han <11144133+doki23@users.noreply.github.com> Date: Tue, 7 Mar 2023 18:37:21 +0800 Subject: [PATCH 0658/1411] Support for casting `Utf8` and `LargeUtf8` --> `Interval` (#3762) * cast string to interval * cast string to interval * unit tests * fix * update * code clean * update unit tests and 
align_interval_parts * fix ut * make clippy happy * Update arrow-cast/src/parse.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * change return types of calculate_from_part and fix bug of align_interval_parts * make clippy happy * remote useless overflow check * remove the "convert to higher units" logic --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-cast/src/cast.rs | 293 ++++++++++++++++++++++++++++++++- arrow-cast/src/parse.rs | 349 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 641 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index af192fdd5e1e..ae901665473d 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -40,7 +40,10 @@ use std::cmp::Ordering; use std::sync::Arc; use crate::display::{array_value_to_string, ArrayFormatter, FormatOptions}; -use crate::parse::string_to_timestamp_nanos; +use crate::parse::{ + parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month, + string_to_timestamp_nanos, +}; use arrow_array::{ builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *, }; @@ -170,6 +173,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Timestamp(TimeUnit::Millisecond, _) | Timestamp(TimeUnit::Microsecond, _) | Timestamp(TimeUnit::Nanosecond, _) + | Interval(_) ) => true, (Utf8, _) => to_type.is_numeric() && to_type != &Float16, (LargeUtf8, @@ -186,6 +190,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Timestamp(TimeUnit::Millisecond, _) | Timestamp(TimeUnit::Microsecond, _) | Timestamp(TimeUnit::Nanosecond, _) + | Interval(_) ) => true, (LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), @@ -1158,6 +1163,15 @@ pub fn cast_with_options( Timestamp(TimeUnit::Nanosecond, to_tz) => { cast_string_to_timestamp::(array, to_tz,cast_options) } + Interval(IntervalUnit::YearMonth) => { + cast_string_to_year_month_interval::(array, cast_options) + } + Interval(IntervalUnit::DayTime) => { + cast_string_to_day_time_interval::(array, cast_options) + } + Interval(IntervalUnit::MonthDayNano) => { + cast_string_to_month_day_nano_interval::(array, cast_options) + } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -1208,6 +1222,15 @@ pub fn cast_with_options( Timestamp(TimeUnit::Nanosecond, to_tz) => { cast_string_to_timestamp::(array, to_tz,cast_options) } + Interval(IntervalUnit::YearMonth) => { + cast_string_to_year_month_interval::(array, cast_options) + } + Interval(IntervalUnit::DayTime) => { + cast_string_to_day_time_interval::(array, cast_options) + } + Interval(IntervalUnit::MonthDayNano) => { + cast_string_to_month_day_nano_interval::(array, cast_options) + } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -2631,6 +2654,105 @@ fn cast_string_to_timestamp< Ok(Arc::new(array) as ArrayRef) } +fn cast_string_to_year_month_interval( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + let interval_array = if cast_options.safe { + let iter = string_array + .iter() + .map(|v| v.and_then(|v| parse_interval_year_month(v).ok())); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. 
+ unsafe { IntervalYearMonthArray::from_trusted_len_iter(iter) } + } else { + let vec = string_array + .iter() + .map(|v| v.map(parse_interval_year_month).transpose()) + .collect::, ArrowError>>()?; + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { IntervalYearMonthArray::from_trusted_len_iter(vec) } + }; + Ok(Arc::new(interval_array) as ArrayRef) +} + +fn cast_string_to_day_time_interval( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + let interval_array = if cast_options.safe { + let iter = string_array + .iter() + .map(|v| v.and_then(|v| parse_interval_day_time(v).ok())); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { IntervalDayTimeArray::from_trusted_len_iter(iter) } + } else { + let vec = string_array + .iter() + .map(|v| v.map(parse_interval_day_time).transpose()) + .collect::, ArrowError>>()?; + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { IntervalDayTimeArray::from_trusted_len_iter(vec) } + }; + Ok(Arc::new(interval_array) as ArrayRef) +} + +fn cast_string_to_month_day_nano_interval( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + let interval_array = if cast_options.safe { + let iter = string_array + .iter() + .map(|v| v.and_then(|v| parse_interval_month_day_nano(v).ok())); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { IntervalMonthDayNanoArray::from_trusted_len_iter(iter) } + } else { + let vec = string_array + .iter() + .map(|v| v.map(parse_interval_month_day_nano).transpose()) + .collect::, ArrowError>>()?; + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { IntervalMonthDayNanoArray::from_trusted_len_iter(vec) } + }; + Ok(Arc::new(interval_array) as ArrayRef) +} + /// Casts Utf8 to Boolean fn cast_utf8_to_boolean( from: &dyn Array, @@ -4966,6 +5088,175 @@ mod tests { } } + macro_rules! 
test_safe_string_to_interval { + ($data_vec:expr, $interval_unit:expr, $array_ty:ty, $expect_vec:expr) => { + let source_string_array = + Arc::new(StringArray::from($data_vec.clone())) as ArrayRef; + + let options = CastOptions { safe: true }; + + let target_interval_array = cast_with_options( + &source_string_array.clone(), + &DataType::Interval($interval_unit), + &options, + ) + .unwrap() + .as_any() + .downcast_ref::<$array_ty>() + .unwrap() + .clone() as $array_ty; + + let target_string_array = + cast_with_options(&target_interval_array, &DataType::Utf8, &options) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + + let expect_string_array = StringArray::from($expect_vec); + + assert_eq!(target_string_array, expect_string_array); + + let target_large_string_array = + cast_with_options(&target_interval_array, &DataType::LargeUtf8, &options) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + + let expect_large_string_array = LargeStringArray::from($expect_vec); + + assert_eq!(target_large_string_array, expect_large_string_array); + }; + } + + #[test] + fn test_cast_string_to_interval_year_month() { + test_safe_string_to_interval!( + vec![ + Some("1 year 1 month"), + Some("1.5 years 13 month"), + Some("30 days"), + Some("31 days"), + Some("2 months 31 days"), + Some("2 months 31 days 1 second"), + Some("foobar"), + ], + IntervalUnit::YearMonth, + IntervalYearMonthArray, + vec![ + Some("1 years 1 mons 0 days 0 hours 0 mins 0.00 secs"), + Some("2 years 7 mons 0 days 0 hours 0 mins 0.00 secs"), + None, + None, + None, + None, + None, + ] + ); + } + + #[test] + fn test_cast_string_to_interval_day_time() { + test_safe_string_to_interval!( + vec![ + Some("1 year 1 month"), + Some("1.5 years 13 month"), + Some("30 days"), + Some("1 day 2 second 3.5 milliseconds"), + Some("foobar"), + ], + IntervalUnit::DayTime, + IntervalDayTimeArray, + vec![ + Some("0 years 0 mons 390 days 0 hours 0 mins 0.000 secs"), + Some("0 years 0 mons 930 days 0 hours 0 mins 0.000 secs"), + Some("0 years 0 mons 30 days 0 hours 0 mins 0.000 secs"), + None, + None, + ] + ); + } + + #[test] + fn test_cast_string_to_interval_month_day_nano() { + test_safe_string_to_interval!( + vec![ + Some("1 year 1 month 1 day"), + None, + Some("1.5 years 13 month 35 days 1.4 milliseconds"), + Some("3 days"), + Some("8 seconds"), + None, + Some("1 day 29800 milliseconds"), + Some("3 months 1 second"), + Some("6 minutes 120 second"), + Some("2 years 39 months 9 days 19 hours 1 minute 83 seconds 399222 milliseconds"), + Some("foobar"), + ], + IntervalUnit::MonthDayNano, + IntervalMonthDayNanoArray, + vec![ + Some("0 years 13 mons 1 days 0 hours 0 mins 0.000000000 secs"), + None, + Some("0 years 31 mons 35 days 0 hours 0 mins 0.001400000 secs"), + Some("0 years 0 mons 3 days 0 hours 0 mins 0.000000000 secs"), + Some("0 years 0 mons 0 days 0 hours 0 mins 8.000000000 secs"), + None, + Some("0 years 0 mons 1 days 0 hours 0 mins 29.800000000 secs"), + Some("0 years 3 mons 0 days 0 hours 0 mins 1.000000000 secs"), + Some("0 years 0 mons 0 days 0 hours 8 mins 0.000000000 secs"), + Some("0 years 63 mons 9 days 19 hours 9 mins 2.222000000 secs"), + None, + ] + ); + } + + macro_rules! 
test_unsafe_string_to_interval_err { + ($data_vec:expr, $interval_unit:expr, $error_msg:expr) => { + let string_array = Arc::new(StringArray::from($data_vec.clone())) as ArrayRef; + let options = CastOptions { safe: false }; + let arrow_err = cast_with_options( + &string_array.clone(), + &DataType::Interval($interval_unit), + &options, + ) + .unwrap_err(); + assert_eq!($error_msg, arrow_err.to_string()); + }; + } + + #[test] + fn test_cast_string_to_interval_err() { + test_unsafe_string_to_interval_err!( + vec![Some("foobar")], + IntervalUnit::YearMonth, + r#"Not yet implemented: Unsupported Interval Expression with value "foobar""# + ); + test_unsafe_string_to_interval_err!( + vec![Some("foobar")], + IntervalUnit::DayTime, + r#"Not yet implemented: Unsupported Interval Expression with value "foobar""# + ); + test_unsafe_string_to_interval_err!( + vec![Some("foobar")], + IntervalUnit::MonthDayNano, + r#"Not yet implemented: Unsupported Interval Expression with value "foobar""# + ); + test_unsafe_string_to_interval_err!( + vec![Some("2 months 31 days 1 second")], + IntervalUnit::YearMonth, + r#"Cast error: Cannot cast 2 months 31 days 1 second to IntervalYearMonth. Only year and month fields are allowed."# + ); + test_unsafe_string_to_interval_err!( + vec![Some("1 day 1.5 milliseconds")], + IntervalUnit::DayTime, + r#"Cast error: Cannot cast 1 day 1.5 milliseconds to IntervalDayTime because the nanos part isn't multiple of milliseconds"# + ); + } + #[test] fn test_cast_string_to_binary() { let string_1 = "Hi"; diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index f498bf142bd7..7f6ca742d345 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -20,6 +20,7 @@ use arrow_array::{ArrowNativeTypeOp, ArrowPrimitiveType}; use arrow_buffer::ArrowNativeType; use arrow_schema::ArrowError; use chrono::prelude::*; +use std::str::FromStr; /// Accepts a string and parses it relative to the provided `timezone` /// @@ -563,6 +564,233 @@ fn is_valid_decimal(s: &str) -> bool { seen_digit } +pub fn parse_interval_year_month( + value: &str, +) -> Result<::Native, ArrowError> { + let (result_months, result_days, result_nanos) = parse_interval("years", value)?; + if result_days != 0 || result_nanos != 0 { + return Err(ArrowError::CastError(format!( + "Cannot cast {value} to IntervalYearMonth. Only year and month fields are allowed." 
+ ))); + } + Ok(IntervalYearMonthType::make_value(0, result_months)) +} + +pub fn parse_interval_day_time( + value: &str, +) -> Result<::Native, ArrowError> { + let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?; + if result_nanos % 1_000_000 != 0 { + return Err(ArrowError::CastError(format!( + "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds" + ))); + } + result_days += result_months * 30; + Ok(IntervalDayTimeType::make_value( + result_days, + (result_nanos / 1_000_000) as i32, + )) +} + +pub fn parse_interval_month_day_nano( + value: &str, +) -> Result<::Native, ArrowError> { + let (result_months, result_days, result_nanos) = parse_interval("months", value)?; + Ok(IntervalMonthDayNanoType::make_value( + result_months, + result_days, + result_nanos, + )) +} + +const SECONDS_PER_HOUR: f64 = 3_600_f64; +const NANOS_PER_MILLIS: f64 = 1_000_000_f64; +const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS; +#[cfg(test)] +const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND; +#[cfg(test)] +const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE; +#[cfg(test)] +const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR; + +#[derive(Clone, Copy)] +#[repr(u16)] +enum IntervalType { + Century = 0b_00_0000_0001, + Decade = 0b_00_0000_0010, + Year = 0b_00_0000_0100, + Month = 0b_00_0000_1000, + Week = 0b_00_0001_0000, + Day = 0b_00_0010_0000, + Hour = 0b_00_0100_0000, + Minute = 0b_00_1000_0000, + Second = 0b_01_0000_0000, + Millisecond = 0b_10_0000_0000, +} + +impl FromStr for IntervalType { + type Err = ArrowError; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "century" | "centuries" => Ok(Self::Century), + "decade" | "decades" => Ok(Self::Decade), + "year" | "years" => Ok(Self::Year), + "month" | "months" => Ok(Self::Month), + "week" | "weeks" => Ok(Self::Week), + "day" | "days" => Ok(Self::Day), + "hour" | "hours" => Ok(Self::Hour), + "minute" | "minutes" => Ok(Self::Minute), + "second" | "seconds" => Ok(Self::Second), + "millisecond" | "milliseconds" => Ok(Self::Millisecond), + _ => Err(ArrowError::NotYetImplemented(format!( + "Unknown interval type: {s}" + ))), + } + } +} + +pub type MonthDayNano = (i32, i32, i64); + +/// parse string value to a triple of aligned months, days, nanos. +/// leading field is the default unit. e.g. `INTERVAL 1` represents `INTERVAL 1 SECOND` when leading_filed = 'second' +fn parse_interval(leading_field: &str, value: &str) -> Result { + let mut used_interval_types = 0; + + let mut calculate_from_part = |interval_period_str: &str, + interval_type: &str| + -> Result<(i32, i32, i64), ArrowError> { + // TODO: Use fixed-point arithmetic to avoid truncation and rounding errors (#3809) + let interval_period = match f64::from_str(interval_period_str) { + Ok(n) => n, + Err(_) => { + return Err(ArrowError::NotYetImplemented(format!( + "Unsupported Interval Expression with value {value:?}" + ))); + } + }; + + if interval_period > (i64::MAX as f64) { + return Err(ArrowError::ParseError(format!( + "Interval field value out of range: {value:?}" + ))); + } + + let it = IntervalType::from_str(interval_type).map_err(|_| { + ArrowError::ParseError(format!( + "Invalid input syntax for type interval: {value:?}" + )) + })?; + + // Disallow duplicate interval types + if used_interval_types & (it as u16) != 0 { + return Err(ArrowError::ParseError(format!( + "Invalid input syntax for type interval: {value:?}. 
Repeated type '{interval_type}'" + ))); + } else { + used_interval_types |= it as u16; + } + + match it { + IntervalType::Century => { + align_interval_parts(interval_period * 1200_f64, 0.0, 0.0) + } + IntervalType::Decade => { + align_interval_parts(interval_period * 120_f64, 0.0, 0.0) + } + IntervalType::Year => { + align_interval_parts(interval_period * 12_f64, 0.0, 0.0) + } + IntervalType::Month => align_interval_parts(interval_period, 0.0, 0.0), + IntervalType::Week => align_interval_parts(0.0, interval_period * 7_f64, 0.0), + IntervalType::Day => align_interval_parts(0.0, interval_period, 0.0), + IntervalType::Hour => Ok(( + 0, + 0, + (interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND) as i64, + )), + IntervalType::Minute => { + Ok((0, 0, (interval_period * 60_f64 * NANOS_PER_SECOND) as i64)) + } + IntervalType::Second => { + Ok((0, 0, (interval_period * NANOS_PER_SECOND) as i64)) + } + IntervalType::Millisecond => { + Ok((0, 0, (interval_period * 1_000_000f64) as i64)) + } + } + }; + + let mut result_month: i32 = 0; + let mut result_days: i32 = 0; + let mut result_nanos: i64 = 0; + + let mut parts = value.split_whitespace(); + + while let Some(interval_period_str) = parts.next() { + let unit = parts.next().unwrap_or(leading_field); + + let (diff_month, diff_days, diff_nanos) = + calculate_from_part(interval_period_str, unit)?; + + result_month = + result_month + .checked_add(diff_month) + .ok_or(ArrowError::ParseError(format!( + "Interval field value out of range: {value:?}" + )))?; + + result_days = + result_days + .checked_add(diff_days) + .ok_or(ArrowError::ParseError(format!( + "Interval field value out of range: {value:?}" + )))?; + + result_nanos = + result_nanos + .checked_add(diff_nanos) + .ok_or(ArrowError::ParseError(format!( + "Interval field value out of range: {value:?}" + )))?; + } + + Ok((result_month, result_days, result_nanos)) +} + +/// The fractional units must be spilled to smaller units. +/// [reference Postgresql doc](https://www.postgresql.org/docs/15/datatype-datetime.html#DATATYPE-INTERVAL-INPUT:~:text=Field%20values%20can,fractional%20on%20output.) 
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days +/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours +fn align_interval_parts( + month_part: f64, + mut day_part: f64, + mut nanos_part: f64, +) -> Result<(i32, i32, i64), ArrowError> { + // Convert fractional month to days, It's not supported by Arrow types, but anyway + day_part += (month_part - (month_part as i64) as f64) * 30_f64; + + // Convert fractional days to hours + nanos_part += (day_part - ((day_part as i64) as f64)) + * 24_f64 + * SECONDS_PER_HOUR + * NANOS_PER_SECOND; + + if month_part > i32::MAX as f64 + || month_part < i32::MIN as f64 + || day_part > i32::MAX as f64 + || day_part < i32::MIN as f64 + || nanos_part > i64::MAX as f64 + || nanos_part < i64::MIN as f64 + { + return Err(ArrowError::ParseError(format!( + "Parsed interval field value out of range: {month_part} months {day_part} days {nanos_part} nanos" + ))); + } + + Ok((month_part as i32, day_part as i32, nanos_part as i64)) +} + #[cfg(test)] mod tests { use super::*; @@ -1019,6 +1247,127 @@ mod tests { ); } + #[test] + fn test_parse_interval() { + assert_eq!( + (1i32, 0i32, 0i64), + parse_interval("months", "1 month").unwrap(), + ); + + assert_eq!( + (2i32, 0i32, 0i64), + parse_interval("months", "2 month").unwrap(), + ); + + assert_eq!( + (-1i32, -18i32, (-0.2 * NANOS_PER_DAY) as i64), + parse_interval("months", "-1.5 months -3.2 days").unwrap(), + ); + + assert_eq!( + (2i32, 10i32, (9.0 * NANOS_PER_HOUR) as i64), + parse_interval("months", "2.1 months 7.25 days 3 hours").unwrap(), + ); + + assert_eq!( + parse_interval("months", "1 centurys 1 month") + .unwrap_err() + .to_string(), + r#"Parser error: Invalid input syntax for type interval: "1 centurys 1 month""# + ); + + assert_eq!( + (37i32, 0i32, 0i64), + parse_interval("months", "3 year 1 month").unwrap(), + ); + + assert_eq!( + (35i32, 0i32, 0i64), + parse_interval("months", "3 year -1 month").unwrap(), + ); + + assert_eq!( + (-37i32, 0i32, 0i64), + parse_interval("months", "-3 year -1 month").unwrap(), + ); + + assert_eq!( + (-35i32, 0i32, 0i64), + parse_interval("months", "-3 year 1 month").unwrap(), + ); + + assert_eq!( + (0i32, 5i32, 0i64), + parse_interval("months", "5 days").unwrap(), + ); + + assert_eq!( + (0i32, 7i32, (3f64 * NANOS_PER_HOUR) as i64), + parse_interval("months", "7 days 3 hours").unwrap(), + ); + + assert_eq!( + (0i32, 7i32, (5f64 * NANOS_PER_MINUTE) as i64), + parse_interval("months", "7 days 5 minutes").unwrap(), + ); + + assert_eq!( + (0i32, 7i32, (-5f64 * NANOS_PER_MINUTE) as i64), + parse_interval("months", "7 days -5 minutes").unwrap(), + ); + + assert_eq!( + (0i32, -7i32, (5f64 * NANOS_PER_HOUR) as i64), + parse_interval("months", "-7 days 5 hours").unwrap(), + ); + + assert_eq!( + ( + 0i32, + -7i32, + (-5f64 * NANOS_PER_HOUR + - 5f64 * NANOS_PER_MINUTE + - 5f64 * NANOS_PER_SECOND) as i64 + ), + parse_interval("months", "-7 days -5 hours -5 minutes -5 seconds").unwrap(), + ); + + assert_eq!( + (12i32, 0i32, (25f64 * NANOS_PER_MILLIS) as i64), + parse_interval("months", "1 year 25 millisecond").unwrap(), + ); + + assert_eq!( + (12i32, 1i32, (0.000000001 * NANOS_PER_SECOND) as i64), + parse_interval("months", "1 year 1 day 0.000000001 seconds").unwrap(), + ); + + assert_eq!( + (12i32, 1i32, (0.1 * NANOS_PER_MILLIS) as i64), + parse_interval("months", "1 year 1 day 0.1 milliseconds").unwrap(), + ); + + assert_eq!( + (1i32, 0i32, (-NANOS_PER_SECOND) as i64), + parse_interval("months", "1 month -1 second").unwrap(), + ); + + assert_eq!( + 
(-13i32, -8i32, (- NANOS_PER_HOUR - NANOS_PER_MINUTE - NANOS_PER_SECOND - 1.11 * NANOS_PER_MILLIS) as i64), + parse_interval("months", "-1 year -1 month -1 week -1 day -1 hour -1 minute -1 second -1.11 millisecond").unwrap(), + ); + } + + #[test] + fn test_duplicate_interval_type() { + let err = parse_interval("months", "1 month 1 second 1 second") + .expect_err("parsing interval should have failed"); + assert_eq!( + r#"ParseError("Invalid input syntax for type interval: \"1 month 1 second 1 second\". Repeated type 'second'")"#, + format!("{err:?}") + ); + } + #[test] fn string_to_timestamp_old() { parse_timestamp("1677-06-14T07:29:01.256") From 379bd23816b3e2d32d14316d433de4037361e2bc Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Tue, 7 Mar 2023 11:40:15 +0100 Subject: [PATCH 0659/1411] fix: regexp_match skips first match (#3807) --- arrow-string/src/regexp.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index bf6e60cfeaaa..4b1e2dcde228 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -182,6 +182,7 @@ pub fn regexp_match( .map(|pattern| pattern.map(|pattern| pattern.to_string())), ), }; + array .iter() .zip(complete_pattern) @@ -209,9 +210,14 @@ pub fn regexp_match( }; match re.captures(value) { Some(caps) => { - for m in caps.iter().skip(1).flatten() { + let mut iter = caps.iter(); + if caps.len() > 1 { + iter.next(); + } + for m in iter.flatten() { list_builder.values().append_value(m.as_str()); } + list_builder.append(true); } None => list_builder.append(false), @@ -282,6 +288,20 @@ mod tests { assert_eq!(&expected, result); } + #[test] + fn test_single_group_not_skip_match() { + let array = StringArray::from(vec![Some("foo"), Some("bar")]); + let pattern = GenericStringArray::::from(vec![r"foo"]); + let actual = regexp_match(&array, &pattern, None).unwrap(); + let result = actual.as_any().downcast_ref::().unwrap(); + let elem_builder: GenericStringBuilder = GenericStringBuilder::new(); + let mut expected_builder = ListBuilder::new(elem_builder); + expected_builder.values().append_value("foo"); + expected_builder.append(true); + let expected = expected_builder.finish(); + assert_eq!(&expected, result); + } + macro_rules! 
test_flag_utf8 { ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { #[test] From 6678b23dde333aa930e9d5ff845492872b0faa76 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 7 Mar 2023 13:43:25 +0100 Subject: [PATCH 0660/1411] Restrict DictionaryArray to ArrowDictionaryKeyType (#3813) * Restrict DictionaryArray to ArrowDictionaryKeyType * Fixes --- arrow-arith/src/arithmetic.rs | 8 ++-- arrow-arith/src/arity.rs | 5 +- arrow-array/src/array/dictionary_array.rs | 46 ++++++++----------- arrow-array/src/array/mod.rs | 2 +- .../builder/primitive_dictionary_builder.rs | 7 +-- arrow-ord/src/comparison.rs | 18 ++++---- arrow-select/src/filter.rs | 4 +- arrow-select/src/take.rs | 2 +- arrow-string/src/like.rs | 2 +- 9 files changed, 44 insertions(+), 50 deletions(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 0fb559f0651f..c2415a67bb66 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -525,7 +525,7 @@ fn math_op_dict( op: F, ) -> Result, ArrowError> where - K: ArrowNumericType, + K: ArrowDictionaryKeyType + ArrowNumericType, T: ArrowNumericType, F: Fn(T::Native, T::Native) -> T::Native, { @@ -580,7 +580,7 @@ fn math_checked_op_dict( op: F, ) -> Result, ArrowError> where - K: ArrowNumericType, + K: ArrowDictionaryKeyType + ArrowNumericType, T: ArrowNumericType, F: Fn(T::Native, T::Native) -> Result, { @@ -613,7 +613,7 @@ fn math_divide_checked_op_dict( op: F, ) -> Result, ArrowError> where - K: ArrowNumericType, + K: ArrowDictionaryKeyType + ArrowNumericType, T: ArrowNumericType, F: Fn(T::Native, T::Native) -> Result, { @@ -666,7 +666,7 @@ fn math_divide_safe_op_dict( op: F, ) -> Result where - K: ArrowNumericType, + K: ArrowDictionaryKeyType + ArrowNumericType, T: ArrowNumericType, F: Fn(T::Native, T::Native) -> Option, { diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index ea078765df1a..5d4973714a06 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -19,6 +19,7 @@ use arrow_array::builder::BufferBuilder; use arrow_array::iterator::ArrayIter; +use arrow_array::types::ArrowDictionaryKeyType; use arrow_array::*; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_buffer::{Buffer, MutableBuffer}; @@ -96,7 +97,7 @@ where /// A helper function that applies an infallible unary function to a dictionary array with primitive value type. fn unary_dict(array: &DictionaryArray, op: F) -> Result where - K: ArrowNumericType, + K: ArrowDictionaryKeyType + ArrowNumericType, T: ArrowPrimitiveType, F: Fn(T::Native) -> T::Native, { @@ -111,7 +112,7 @@ fn try_unary_dict( op: F, ) -> Result where - K: ArrowNumericType, + K: ArrowDictionaryKeyType + ArrowNumericType, T: ArrowPrimitiveType, F: Fn(T::Native) -> Result, { diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 22e99a44c326..74b5cec19475 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -205,7 +205,7 @@ pub type UInt64DictionaryArray = DictionaryArray; /// .collect(); /// assert_eq!(&array, &expected); /// ``` -pub struct DictionaryArray { +pub struct DictionaryArray { /// Data of this dictionary. Note that this is _not_ compatible with the C Data interface, /// as, in the current implementation, `values` below are the first child of this struct. 
data: ArrayData, @@ -223,7 +223,7 @@ pub struct DictionaryArray { is_ordered: bool, } -impl Clone for DictionaryArray { +impl Clone for DictionaryArray { fn clone(&self) -> Self { Self { data: self.data.clone(), @@ -234,7 +234,7 @@ impl Clone for DictionaryArray { } } -impl DictionaryArray { +impl DictionaryArray { /// Attempt to create a new DictionaryArray with a specified keys /// (indexes into the dictionary) and values (dictionary) /// array. Returns an error if there are any keys that are outside @@ -436,7 +436,7 @@ impl DictionaryArray { } /// Constructs a `DictionaryArray` from an array data reference. -impl From for DictionaryArray { +impl From for DictionaryArray { fn from(data: ArrayData) -> Self { assert_eq!( data.buffers().len(), @@ -482,7 +482,7 @@ impl From for DictionaryArray { } } -impl From> for ArrayData { +impl From> for ArrayData { fn from(array: DictionaryArray) -> Self { array.data } @@ -543,7 +543,7 @@ impl<'a, T: ArrowDictionaryKeyType> FromIterator<&'a str> for DictionaryArray } } -impl Array for DictionaryArray { +impl Array for DictionaryArray { fn as_any(&self) -> &dyn Any { self } @@ -557,7 +557,7 @@ impl Array for DictionaryArray { } } -impl std::fmt::Debug for DictionaryArray { +impl std::fmt::Debug for DictionaryArray { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { writeln!( f, @@ -583,7 +583,7 @@ impl std::fmt::Debug for DictionaryArray { /// assert_eq!(maybe_val.unwrap(), orig) /// } /// ``` -pub struct TypedDictionaryArray<'a, K: ArrowPrimitiveType, V> { +pub struct TypedDictionaryArray<'a, K: ArrowDictionaryKeyType, V> { /// The dictionary array dictionary: &'a DictionaryArray, /// The values of the dictionary @@ -591,7 +591,7 @@ pub struct TypedDictionaryArray<'a, K: ArrowPrimitiveType, V> { } // Manually implement `Clone` to avoid `V: Clone` type constraint -impl<'a, K: ArrowPrimitiveType, V> Clone for TypedDictionaryArray<'a, K, V> { +impl<'a, K: ArrowDictionaryKeyType, V> Clone for TypedDictionaryArray<'a, K, V> { fn clone(&self) -> Self { Self { dictionary: self.dictionary, @@ -600,15 +600,17 @@ impl<'a, K: ArrowPrimitiveType, V> Clone for TypedDictionaryArray<'a, K, V> { } } -impl<'a, K: ArrowPrimitiveType, V> Copy for TypedDictionaryArray<'a, K, V> {} +impl<'a, K: ArrowDictionaryKeyType, V> Copy for TypedDictionaryArray<'a, K, V> {} -impl<'a, K: ArrowPrimitiveType, V> std::fmt::Debug for TypedDictionaryArray<'a, K, V> { +impl<'a, K: ArrowDictionaryKeyType, V> std::fmt::Debug + for TypedDictionaryArray<'a, K, V> +{ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { writeln!(f, "TypedDictionaryArray({:?})", self.dictionary) } } -impl<'a, K: ArrowPrimitiveType, V> TypedDictionaryArray<'a, K, V> { +impl<'a, K: ArrowDictionaryKeyType, V> TypedDictionaryArray<'a, K, V> { /// Returns the keys of this [`TypedDictionaryArray`] pub fn keys(&self) -> &'a PrimitiveArray { self.dictionary.keys() @@ -620,7 +622,7 @@ impl<'a, K: ArrowPrimitiveType, V> TypedDictionaryArray<'a, K, V> { } } -impl<'a, K: ArrowPrimitiveType, V: Sync> Array for TypedDictionaryArray<'a, K, V> { +impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a, K, V> { fn as_any(&self) -> &dyn Any { self.dictionary } @@ -636,7 +638,7 @@ impl<'a, K: ArrowPrimitiveType, V: Sync> Array for TypedDictionaryArray<'a, K, V impl<'a, K, V> IntoIterator for TypedDictionaryArray<'a, K, V> where - K: ArrowPrimitiveType, + K: ArrowDictionaryKeyType, Self: ArrayAccessor, { type Item = Option<::Item>; @@ -649,7 +651,7 @@ where impl<'a, K, V> 
ArrayAccessor for TypedDictionaryArray<'a, K, V> where - K: ArrowPrimitiveType, + K: ArrowDictionaryKeyType, V: Sync + Send, &'a V: ArrayAccessor, <&'a V as ArrayAccessor>::Item: Default, @@ -684,10 +686,8 @@ mod tests { use super::*; use crate::builder::PrimitiveDictionaryBuilder; use crate::cast::as_dictionary_array; - use crate::types::{ - Float32Type, Int16Type, Int32Type, Int8Type, UInt32Type, UInt8Type, - }; - use crate::{Float32Array, Int16Array, Int32Array, Int8Array}; + use crate::types::{Int16Type, Int32Type, Int8Type, UInt32Type, UInt8Type}; + use crate::{Int16Array, Int32Array, Int8Array}; use arrow_buffer::{Buffer, ToByteSlice}; use std::sync::Arc; @@ -955,14 +955,6 @@ mod tests { DictionaryArray::::try_new(&keys, &values).unwrap(); } - #[test] - #[should_panic(expected = "Dictionary key type must be integer, but was Float32")] - fn test_try_wrong_dictionary_key_type() { - let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); - let keys: Float32Array = [Some(0_f32), None, Some(3_f32)].into_iter().collect(); - DictionaryArray::::try_new(&keys, &values).unwrap(); - } - #[test] #[should_panic( expected = "DictionaryArray's data type must match, expected Int64 got Int32" diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 27973a40faa9..f3c35e51faa5 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -424,7 +424,7 @@ impl PartialEq for PrimitiveArray { } } -impl PartialEq for DictionaryArray { +impl PartialEq for DictionaryArray { fn eq(&self, other: &Self) -> bool { self.data().eq(other.data()) } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 9f410994114f..a996128d5e9d 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -16,6 +16,7 @@ // under the License. use crate::builder::{ArrayBuilder, PrimitiveBuilder}; +use crate::types::ArrowDictionaryKeyType; use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray}; use arrow_buffer::{ArrowNativeType, ToByteSlice}; use arrow_schema::{ArrowError, DataType}; @@ -172,7 +173,7 @@ where impl ArrayBuilder for PrimitiveDictionaryBuilder where - K: ArrowPrimitiveType, + K: ArrowDictionaryKeyType, V: ArrowPrimitiveType, { /// Returns the builder as an non-mutable `Any` reference. @@ -213,7 +214,7 @@ where impl PrimitiveDictionaryBuilder where - K: ArrowPrimitiveType, + K: ArrowDictionaryKeyType, V: ArrowPrimitiveType, { /// Append a primitive value to the array. 
Return an existing index @@ -312,7 +313,7 @@ where } } -impl Extend> +impl Extend> for PrimitiveDictionaryBuilder { #[inline] diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 2702514edc83..b235df036077 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -1253,7 +1253,7 @@ fn unpack_dict_comparison( dict_comparison: BooleanArray, ) -> Result where - K: ArrowPrimitiveType, + K: ArrowDictionaryKeyType, K::Native: num::ToPrimitive, { // TODO: Use take_boolean (#2967) @@ -2035,7 +2035,7 @@ fn cmp_dict_primitive( op: F, ) -> Result where - K: ArrowPrimitiveType, + K: ArrowDictionaryKeyType, T: ArrowPrimitiveType + Sync + Send, F: Fn(T::Native, T::Native) -> bool, { @@ -2055,7 +2055,7 @@ fn cmp_dict_string_array( op: F, ) -> Result where - K: ArrowPrimitiveType, + K: ArrowDictionaryKeyType, F: Fn(&str, &str) -> bool, { compare_op( @@ -2078,7 +2078,7 @@ fn cmp_dict_boolean_array( op: F, ) -> Result where - K: ArrowPrimitiveType, + K: ArrowDictionaryKeyType, F: Fn(bool, bool) -> bool, { compare_op( @@ -2097,7 +2097,7 @@ fn cmp_dict_binary_array( op: F, ) -> Result where - K: ArrowPrimitiveType, + K: ArrowDictionaryKeyType, F: Fn(&[u8], &[u8]) -> bool, { compare_op( @@ -2121,7 +2121,7 @@ pub fn cmp_dict( op: F, ) -> Result where - K: ArrowPrimitiveType, + K: ArrowDictionaryKeyType, T: ArrowPrimitiveType + Sync + Send, F: Fn(T::Native, T::Native) -> bool, { @@ -2141,7 +2141,7 @@ pub fn cmp_dict_bool( op: F, ) -> Result where - K: ArrowPrimitiveType, + K: ArrowDictionaryKeyType, F: Fn(bool, bool) -> bool, { compare_op( @@ -2160,7 +2160,7 @@ pub fn cmp_dict_utf8( op: F, ) -> Result where - K: ArrowPrimitiveType, + K: ArrowDictionaryKeyType, F: Fn(&str, &str) -> bool, { compare_op( @@ -2182,7 +2182,7 @@ pub fn cmp_dict_binary( op: F, ) -> Result where - K: ArrowPrimitiveType, + K: ArrowDictionaryKeyType, F: Fn(&[u8], &[u8]) -> bool, { compare_op( diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 6ba08746d8fc..1818c4fb50c4 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use arrow_array::builder::BooleanBufferBuilder; use arrow_array::cast::{as_generic_binary_array, as_largestring_array, as_string_array}; -use arrow_array::types::ByteArrayType; +use arrow_array::types::{ArrowDictionaryKeyType, ByteArrayType}; use arrow_array::*; use arrow_buffer::bit_util; use arrow_buffer::{buffer::buffer_bin_and, Buffer, MutableBuffer}; @@ -671,7 +671,7 @@ fn filter_dict( predicate: &FilterPredicate, ) -> DictionaryArray where - T: ArrowPrimitiveType, + T: ArrowDictionaryKeyType, T::Native: num::Num, { let builder = filter_primitive::(array.keys(), predicate) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 771a7eeb5c5a..58b5c91f1bf6 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -770,7 +770,7 @@ fn take_dict( indices: &PrimitiveArray, ) -> Result, ArrowError> where - T: ArrowPrimitiveType, + T: ArrowDictionaryKeyType, T::Native: num::Num, I: ArrowPrimitiveType, I::Native: ToPrimitive, diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 10a58b3c00f6..9ae635e0c520 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -201,7 +201,7 @@ macro_rules! dict_function { /// /// See the documentation on [`like_utf8`] for more details. 
#[cfg(feature = "dyn_cmp_dict")] -fn $fn_name( +fn $fn_name( left: &DictionaryArray, right: &DictionaryArray, ) -> Result { From 5d757290dc39c6cd4b08309e081fe2b0dcffcf62 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 7 Mar 2023 17:58:57 +0100 Subject: [PATCH 0661/1411] Add regexp_match docs (#3812) * Add regexp_match docs * Update arrow-string/src/regexp.rs Co-authored-by: Liang-Chi Hsieh --------- Co-authored-by: Liang-Chi Hsieh --- arrow-string/src/regexp.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index 4b1e2dcde228..f3ba90d8a741 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -157,6 +157,26 @@ pub fn regexp_is_match_utf8_scalar( } /// Extract all groups matched by a regular expression for a given String array. +/// +/// Modelled after the Postgres [regexp_match]. +/// +/// Returns a ListArray of [`GenericStringArray`] with each element containing the leftmost-first +/// match of the corresponding index in `regex_array` to string in `array` +/// +/// If there is no match, the list element is NULL. +/// +/// If a match is found, and the pattern contains no capturing parenthesized subexpressions, +/// then the list element is a single-element [`GenericStringArray`] containing the substring +/// matching the whole pattern. +/// +/// If a match is found, and the pattern contains capturing parenthesized subexpressions, then the +/// list element is a [`GenericStringArray`] whose n'th element is the substring matching +/// the n'th capturing parenthesized subexpression of the pattern. +/// +/// The flags parameter is an optional text string containing zero or more single-letter flags +/// that change the function's behavior. +/// +/// [regexp_match]: https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-POSIX-REGEXP pub fn regexp_match( array: &GenericStringArray, regex_array: &GenericStringArray, From 81ed3348999f6145dc4b49a1c8b7186bf69b35dc Mon Sep 17 00:00:00 2001 From: Stuart Carnie Date: Thu, 9 Mar 2023 02:30:56 +1100 Subject: [PATCH 0662/1411] fix: Ensure Flight schema includes parent metadata (#3811) * fix: Ensure prepared schema includes parent metadata Closes #3779 * Add a test and run fmt --------- Co-authored-by: Andrew Lamb --- arrow-flight/src/encode.rs | 14 +++++++++++++- arrow-flight/tests/encode_decode.rs | 14 +++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 2e93acb0931c..557663922121 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -323,7 +323,7 @@ fn prepare_schema_for_flight(schema: &Schema) -> Schema { }) .collect(); - Schema::new(fields) + Schema::new(fields).with_metadata(schema.metadata().clone()) } /// Split [`RecordBatch`] so it hopefully fits into a gRPC response. 
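The hunk above is the substance of this fix: `prepare_schema_for_flight` rebuilds the schema from its (dictionary-hydrated) fields, and before this change that rebuild silently dropped the schema-level metadata. Below is a minimal sketch of the behaviour being addressed, using only the public `arrow_schema` API; it is illustrative only and not part of the patch, and the field name and metadata key simply mirror the `test_schema_metadata_encoded` test added in the next hunk.

use std::collections::HashMap;
use arrow_schema::{DataType, Field, Schema};

fn main() {
    // A schema carrying top-level metadata, as a Flight producer might publish.
    let metadata = HashMap::from([("some_key".to_owned(), "some_value".to_owned())]);
    let schema = Schema::new(vec![Field::new("data", DataType::Int32, false)])
        .with_metadata(metadata);

    // Rebuilding a schema from its fields alone yields empty metadata ...
    let stripped = Schema::new(schema.fields().clone());
    assert!(stripped.metadata().is_empty());

    // ... so the parent metadata has to be copied across explicitly, which is
    // what the added `.with_metadata(schema.metadata().clone())` call does.
    let preserved = Schema::new(schema.fields().clone())
        .with_metadata(schema.metadata().clone());
    assert_eq!(
        preserved.metadata().get("some_key").map(String::as_str),
        Some("some_value")
    );
}
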
@@ -461,6 +461,7 @@ mod tests { use arrow_array::{ DictionaryArray, Int16Array, Int32Array, Int64Array, StringArray, UInt64Array, }; + use std::collections::HashMap; use super::*; @@ -502,6 +503,17 @@ mod tests { ); } + #[test] + fn test_schema_metadata_encoded() { + let schema = + Schema::new(vec![Field::new("data", DataType::Int32, false)]).with_metadata( + HashMap::from([("some_key".to_owned(), "some_value".to_owned())]), + ); + + let got = prepare_schema_for_flight(&schema); + assert!(got.metadata().contains_key("some_key")); + } + #[test] fn test_encode_no_column_batch() { let batch = RecordBatch::try_new_with_options( diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index 25e74cb3b6bc..8c73a516b2b0 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -17,7 +17,7 @@ //! Tests for round trip encoding / decoding -use std::sync::Arc; +use std::{collections::HashMap, sync::Arc}; use arrow::{compute::concat_batches, datatypes::Int32Type}; use arrow_array::{ArrayRef, DictionaryArray, Float64Array, RecordBatch, UInt8Array}; @@ -62,6 +62,18 @@ async fn test_primative_one() { roundtrip(vec![make_primative_batch(5)]).await; } +#[tokio::test] +async fn test_schema_metadata() { + let batch = make_primative_batch(5); + let metadata = HashMap::from([("some_key".to_owned(), "some_value".to_owned())]); + + // create a batch that has schema level metadata + let schema = Arc::new(batch.schema().as_ref().clone().with_metadata(metadata)); + let batch = RecordBatch::try_new(schema, batch.columns().to_vec()).unwrap(); + + roundtrip(vec![batch]).await; +} + #[tokio::test] async fn test_primative_many() { roundtrip(vec![ From 36f2db3a35e07dfbfdb6b32e457d40ef8ccfb601 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 8 Mar 2023 20:17:31 +0100 Subject: [PATCH 0663/1411] Add RunEndBuffer (#1799) (#3817) * Add RunEndBuffer (#1799) * Fix test * Revert rename * Format * Clippy * Remove unnecessary check * Fix * Tweak docs * Add docs --- arrow-array/src/array/mod.rs | 1 + arrow-array/src/array/run_array.rs | 138 +++++------- .../src/builder/generic_byte_run_builder.rs | 37 +--- .../src/builder/primitive_run_builder.rs | 17 +- arrow-array/src/run_iterator.rs | 19 +- arrow-buffer/src/buffer/mod.rs | 2 + arrow-buffer/src/buffer/run.rs | 200 ++++++++++++++++++ arrow-ipc/src/writer.rs | 32 +-- arrow-ord/src/sort.rs | 14 +- arrow-select/src/take.rs | 3 +- 10 files changed, 286 insertions(+), 177 deletions(-) create mode 100644 arrow-buffer/src/buffer/run.rs diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index f3c35e51faa5..dfdaac85bf85 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -880,6 +880,7 @@ mod tests { assert_eq!(array.null_count(), 0); assert_eq!(array.values().len(), 1); assert_eq!(array.values().null_count(), 1); + assert_eq!(array.run_ends().len(), 4); assert_eq!(array.run_ends().values(), &[4]); let idx = array.get_physical_indices(&[0, 1, 2, 3]).unwrap(); diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 126aefde94f3..e50903f30f9b 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -17,6 +17,7 @@ use std::any::Any; +use arrow_buffer::buffer::RunEndBuffer; use arrow_buffer::ArrowNativeType; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; @@ -62,7 +63,7 @@ use crate::{ pub struct RunArray { 
data: ArrayData, - run_ends: PrimitiveArray, + run_ends: RunEndBuffer, values: ArrayRef, } @@ -110,11 +111,8 @@ impl RunArray { Ok(array_data.into()) } - /// Returns a reference to run_ends array - /// - /// Note: any slicing of this [`RunArray`] array is not applied to the returned array - /// and must be handled separately - pub fn run_ends(&self) -> &PrimitiveArray { + /// Returns a reference to [`RunEndBuffer`] + pub fn run_ends(&self) -> &RunEndBuffer { &self.run_ends } @@ -128,19 +126,12 @@ impl RunArray { /// Returns the physical index at which the array slice starts. pub fn get_start_physical_index(&self) -> usize { - if self.offset() == 0 { - return 0; - } - self.get_zero_offset_physical_index(self.offset()).unwrap() + self.run_ends.get_start_physical_index() } /// Returns the physical index at which the array slice ends. pub fn get_end_physical_index(&self) -> usize { - if self.offset() + self.len() == Self::logical_len(&self.run_ends) { - return self.run_ends.len() - 1; - } - self.get_zero_offset_physical_index(self.offset() + self.len() - 1) - .unwrap() + self.run_ends.get_end_physical_index() } /// Downcast this [`RunArray`] to a [`TypedRunArray`] @@ -164,47 +155,13 @@ impl RunArray { }) } - /// Returns index to the physical array for the given index to the logical array. - /// The function does not adjust the input logical index based on `ArrayData::offset`. - /// Performs a binary search on the run_ends array for the input index. - #[inline] - pub fn get_zero_offset_physical_index(&self, logical_index: usize) -> Option { - if logical_index >= Self::logical_len(&self.run_ends) { - return None; - } - let mut st: usize = 0; - let mut en: usize = self.run_ends.len(); - while st + 1 < en { - let mid: usize = (st + en) / 2; - if logical_index - < unsafe { - // Safety: - // The value of mid will always be between 1 and len - 1, - // where len is length of run ends array. - // This is based on the fact that `st` starts with 0 and - // `en` starts with len. The condition `st + 1 < en` ensures - // `st` and `en` differs atleast by two. So the value of `mid` - // will never be either `st` or `en` - self.run_ends.value_unchecked(mid - 1).as_usize() - } - { - en = mid - } else { - st = mid - } - } - Some(st) - } - /// Returns index to the physical array for the given index to the logical array. /// This function adjusts the input logical index based on `ArrayData::offset` /// Performs a binary search on the run_ends array for the input index. - #[inline] - pub fn get_physical_index(&self, logical_index: usize) -> Option { - if logical_index >= self.len() { - return None; - } - self.get_zero_offset_physical_index(logical_index + self.offset()) + /// + /// The result is arbitrary if `logical_index >= self.len()` + pub fn get_physical_index(&self, logical_index: usize) -> usize { + self.run_ends.get_physical_index(logical_index) } /// Returns the physical indices of the input logical indices. Returns error if any of the logical @@ -222,6 +179,9 @@ impl RunArray { where I: ArrowNativeType, { + let len = self.run_ends().len(); + let offset = self.run_ends().offset(); + let indices_len = logical_indices.len(); if indices_len == 0 { @@ -243,7 +203,7 @@ impl RunArray { // Return early if all the logical indices cannot be converted to physical indices. 
let largest_logical_index = logical_indices[*ordered_indices.last().unwrap()].as_usize(); - if largest_logical_index >= self.len() { + if largest_logical_index >= len { return Err(ArrowError::InvalidArgumentError(format!( "Cannot convert all logical indices to physical indices. The logical index cannot be converted is {largest_logical_index}.", ))); @@ -259,7 +219,7 @@ impl RunArray { self.run_ends.values().iter().enumerate().skip(skip_value) { // Get the run end index (relative to offset) of current physical index - let run_end_value = run_end.as_usize() - self.offset(); + let run_end_value = run_end.as_usize() - offset; // All the `logical_indices` that are less than current run end index // belongs to current physical index. @@ -295,7 +255,15 @@ impl From for RunArray { } } - let run_ends = PrimitiveArray::::from(data.child_data()[0].clone()); + // Safety + // ArrayData is valid + let child = &data.child_data()[0]; + assert_eq!(child.data_type(), &R::DATA_TYPE, "Incorrect run ends type"); + let run_ends = unsafe { + let scalar = child.buffers()[0].clone().into(); + RunEndBuffer::new_unchecked(scalar, data.offset(), data.len()) + }; + let values = make_array(data.child_data()[1].clone()); Self { data, @@ -330,7 +298,8 @@ impl std::fmt::Debug for RunArray { writeln!( f, "RunArray {{run_ends: {:?}, values: {:?}}}", - self.run_ends, self.values + self.run_ends.values(), + self.values ) } } @@ -347,7 +316,7 @@ impl std::fmt::Debug for RunArray { /// .map(|&x| if x == "b" { None } else { Some(x) }) /// .collect(); /// assert_eq!( -/// "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 5,\n], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", +/// "RunArray {run_ends: [2, 3, 5], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", /// format!("{:?}", array) /// ); /// ``` @@ -374,7 +343,7 @@ impl<'a, T: RunEndIndexType> FromIterator> for RunArray { /// let test = vec!["a", "a", "b", "c"]; /// let array: RunArray = test.into_iter().collect(); /// assert_eq!( -/// "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", +/// "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", /// format!("{:?}", array) /// ); /// ``` @@ -401,7 +370,7 @@ impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunArray { /// /// let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); /// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); -/// assert_eq!(array.run_ends(), &Int16Array::from(vec![2, 3, 5])); +/// assert_eq!(array.run_ends().values(), &[2, 3, 5]); /// assert_eq!(array.values(), &values); /// ``` pub type Int16RunArray = RunArray; @@ -416,7 +385,7 @@ pub type Int16RunArray = RunArray; /// /// let array: Int32RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); /// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); -/// assert_eq!(array.run_ends(), &Int32Array::from(vec![2, 3, 5])); +/// assert_eq!(array.run_ends().values(), &[2, 3, 5]); /// assert_eq!(array.values(), &values); /// ``` pub type Int32RunArray = RunArray; @@ -431,7 +400,7 @@ pub type Int32RunArray = RunArray; /// /// let array: Int64RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); /// let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); -/// assert_eq!(array.run_ends(), &Int64Array::from(vec![2, 3, 5])); +/// assert_eq!(array.run_ends().values(), &[2, 3, 5]); /// assert_eq!(array.values(), &values); /// ``` pub type Int64RunArray = 
RunArray; @@ -480,7 +449,7 @@ impl<'a, R: RunEndIndexType, V> std::fmt::Debug for TypedRunArray<'a, R, V> { impl<'a, R: RunEndIndexType, V> TypedRunArray<'a, R, V> { /// Returns the run_ends of this [`TypedRunArray`] - pub fn run_ends(&self) -> &'a PrimitiveArray { + pub fn run_ends(&self) -> &'a RunEndBuffer { self.run_array.run_ends() } @@ -531,7 +500,7 @@ where } unsafe fn value_unchecked(&self, logical_index: usize) -> Self::Item { - let physical_index = self.run_array.get_physical_index(logical_index).unwrap(); + let physical_index = self.run_array.get_physical_index(logical_index); self.values().value_unchecked(physical_index) } } @@ -563,7 +532,7 @@ mod tests { use crate::builder::PrimitiveRunBuilder; use crate::cast::as_primitive_array; use crate::types::{Int16Type, Int32Type, Int8Type, UInt32Type}; - use crate::{Array, Int16Array, Int32Array, StringArray}; + use crate::{Array, Int32Array, StringArray}; fn build_input_array(size: usize) -> Vec> { // The input array is created by shuffling and repeating @@ -643,9 +612,10 @@ mod tests { ]); // Construct a run_ends array: - let run_ends_data = PrimitiveArray::::from_iter_values([ - 4_i16, 6, 7, 9, 13, 18, 20, 22, - ]); + let run_ends_values = [4_i16, 6, 7, 9, 13, 18, 20, 22]; + let run_ends_data = PrimitiveArray::::from_iter_values( + run_ends_values.iter().copied(), + ); // Construct a run ends encoded array from the above two let ree_array = @@ -659,8 +629,7 @@ mod tests { assert_eq!(&DataType::Int8, values.data_type()); let run_ends = ree_array.run_ends(); - assert_eq!(&run_ends_data.into_data(), run_ends.data()); - assert_eq!(&DataType::Int16, run_ends.data_type()); + assert_eq!(run_ends.values(), &run_ends_values); } #[test] @@ -671,7 +640,7 @@ mod tests { builder.append_value(22345678); let array = builder.finish(); assert_eq!( - "RunArray {run_ends: PrimitiveArray\n[\n 1,\n 2,\n 3,\n], values: PrimitiveArray\n[\n 12345678,\n null,\n 22345678,\n]}\n", + "RunArray {run_ends: [1, 2, 3], values: PrimitiveArray\n[\n 12345678,\n null,\n 22345678,\n]}\n", format!("{array:?}") ); @@ -685,7 +654,7 @@ mod tests { assert_eq!(array.null_count(), 0); assert_eq!( - "RunArray {run_ends: PrimitiveArray\n[\n 20,\n], values: PrimitiveArray\n[\n 1,\n]}\n", + "RunArray {run_ends: [20], values: PrimitiveArray\n[\n 1,\n]}\n", format!("{array:?}") ); } @@ -698,7 +667,7 @@ mod tests { .map(|&x| if x == "b" { None } else { Some(x) }) .collect(); assert_eq!( - "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", + "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n \"a\",\n null,\n \"c\",\n]}\n", format!("{array:?}") ); @@ -707,7 +676,7 @@ mod tests { let array: RunArray = test.into_iter().collect(); assert_eq!( - "RunArray {run_ends: PrimitiveArray\n[\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", + "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", format!("{array:?}") ); } @@ -721,8 +690,6 @@ mod tests { assert_eq!(array.null_count(), 0); let run_ends = array.run_ends(); - assert_eq!(&DataType::Int16, run_ends.data_type()); - assert_eq!(0, run_ends.null_count()); assert_eq!(&[1, 2, 3, 4], run_ends.values()); } @@ -735,9 +702,6 @@ mod tests { assert_eq!(array.null_count(), 0); let run_ends = array.run_ends(); - assert_eq!(&DataType::Int32, run_ends.data_type()); - assert_eq!(0, run_ends.null_count()); - assert_eq!(5, run_ends.len()); assert_eq!(&[1, 2, 3, 5, 6], run_ends.values()); let values_data = array.values(); @@ 
-754,7 +718,7 @@ mod tests { assert_eq!(array.null_count(), 0); let run_ends = array.run_ends(); - assert_eq!(1, run_ends.len()); + assert_eq!(3, run_ends.len()); assert_eq!(&[3], run_ends.values()); let values_data = array.values(); @@ -770,16 +734,14 @@ mod tests { [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); let array = RunArray::::try_new(&run_ends, &values).unwrap(); - assert_eq!(array.run_ends().data_type(), &DataType::Int32); assert_eq!(array.values().data_type(), &DataType::Utf8); assert_eq!(array.null_count(), 0); assert_eq!(array.len(), 4); - assert_eq!(array.run_ends.null_count(), 0); assert_eq!(array.values().null_count(), 1); assert_eq!( - "RunArray {run_ends: PrimitiveArray\n[\n 1,\n 2,\n 3,\n 4,\n], values: StringArray\n[\n \"foo\",\n \"bar\",\n null,\n \"baz\",\n]}\n", + "RunArray {run_ends: [1, 2, 3, 4], values: StringArray\n[\n \"foo\",\n \"bar\",\n null,\n \"baz\",\n]}\n", format!("{array:?}") ); } @@ -788,7 +750,7 @@ mod tests { fn test_run_array_int16_type_definition() { let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect(); let values: Arc = Arc::new(StringArray::from(vec!["a", "b", "c"])); - assert_eq!(array.run_ends(), &Int16Array::from(vec![2, 3, 5])); + assert_eq!(array.run_ends().values(), &[2, 3, 5]); assert_eq!(array.values(), &values); } @@ -796,7 +758,7 @@ mod tests { fn test_run_array_empty_string() { let array: Int16RunArray = vec!["a", "a", "", "", "c"].into_iter().collect(); let values: Arc = Arc::new(StringArray::from(vec!["a", "", "c"])); - assert_eq!(array.run_ends(), &Int16Array::from(vec![2, 4, 5])); + assert_eq!(array.run_ends().values(), &[2, 4, 5]); assert_eq!(array.values(), &values); } @@ -849,9 +811,7 @@ mod tests { } #[test] - #[should_panic( - expected = "PrimitiveArray expected ArrayData with type Int64 got Int32" - )] + #[should_panic(expected = "Incorrect run ends type")] fn test_run_array_run_ends_data_type_mismatch() { let a = RunArray::::from_iter(["32"]); let _ = RunArray::::from(a.into_data()); @@ -874,7 +834,7 @@ mod tests { let actual = typed.value(i); assert_eq!(*val, actual) } else { - let physical_ix = run_array.get_physical_index(i).unwrap(); + let physical_ix = run_array.get_physical_index(i); assert!(typed.values().is_null(physical_ix)); }; } diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index c6dbb82ff6eb..5c15b1544ed3 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -49,10 +49,7 @@ use arrow_buffer::ArrowNativeType; /// builder.append_null(); /// let array = builder.finish(); /// -/// assert_eq!( -/// array.run_ends(), -/// &Int16Array::from(vec![Some(2), Some(3), Some(5), Some(6)]) -/// ); +/// assert_eq!(array.run_ends().values(), &[2, 3, 5, 6]); /// /// let av = array.values(); /// @@ -331,10 +328,7 @@ where /// builder.extend([Some("def"), Some("def"), Some("abc")]); /// let array = builder.finish(); /// -/// assert_eq!( -/// array.run_ends(), -/// &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(5)]) -/// ); +/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]); /// /// // Values are polymorphic and so require a downcast. 
/// let av = array.values(); @@ -370,10 +364,7 @@ pub type LargeStringRunBuilder = GenericByteRunBuilder; /// builder.extend([Some(b"def"), Some(b"def"), Some(b"abc")]); /// let array = builder.finish(); /// -/// assert_eq!( -/// array.run_ends(), -/// &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(5)]) -/// ); +/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]); /// /// // Values are polymorphic and so require a downcast. /// let av = array.values(); @@ -396,11 +387,9 @@ mod tests { use super::*; use crate::array::Array; - use crate::cast::as_primitive_array; use crate::cast::as_string_array; use crate::types::{Int16Type, Int32Type}; use crate::GenericByteArray; - use crate::Int16Array; use crate::Int16RunArray; fn test_bytes_run_buider(values: Vec<&T::Native>) @@ -426,10 +415,7 @@ mod tests { assert_eq!(array.len(), 11); assert_eq!(array.null_count(), 0); - assert_eq!( - array.run_ends(), - &Int16Array::from(vec![Some(3), Some(5), Some(7), Some(11)]) - ); + assert_eq!(array.run_ends().values(), &[3, 5, 7, 11]); // Values are polymorphic and so require a downcast. let av = array.values(); @@ -475,10 +461,7 @@ mod tests { assert_eq!(array.len(), 5); assert_eq!(array.null_count(), 0); - assert_eq!( - array.run_ends(), - &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(5)]) - ); + assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]); // Values are polymorphic and so require a downcast. let av = array.values(); @@ -500,10 +483,7 @@ mod tests { assert_eq!(array.len(), 8); assert_eq!(array.null_count(), 0); - assert_eq!( - array.run_ends(), - &Int16Array::from(vec![Some(1), Some(2), Some(4), Some(7), Some(8),]) - ); + assert_eq!(array.run_ends().values(), &[1, 2, 4, 7, 8]); // Values are polymorphic and so require a downcast. let av2 = array.values(); @@ -536,10 +516,7 @@ mod tests { let array = builder.finish(); assert_eq!(array.len(), 10); - assert_eq!( - as_primitive_array::(array.run_ends()).values(), - &[3, 5, 8, 10] - ); + assert_eq!(array.run_ends().values(), &[3, 5, 8, 10]); let str_array = as_string_array(array.values().as_ref()); assert_eq!(str_array.value(0), "a"); diff --git a/arrow-array/src/builder/primitive_run_builder.rs b/arrow-array/src/builder/primitive_run_builder.rs index 41066228390d..e7c822ee6b19 100644 --- a/arrow-array/src/builder/primitive_run_builder.rs +++ b/arrow-array/src/builder/primitive_run_builder.rs @@ -44,10 +44,7 @@ use arrow_buffer::ArrowNativeType; /// builder.append_value(5678); /// let array = builder.finish(); /// -/// assert_eq!( -/// array.run_ends(), -/// &Int16Array::from(vec![Some(3), Some(4), Some(6)]) -/// ); +/// assert_eq!(array.run_ends().values(), &[3, 4, 6]); /// /// let av = array.values(); /// @@ -270,7 +267,7 @@ mod tests { use crate::builder::PrimitiveRunBuilder; use crate::cast::as_primitive_array; use crate::types::{Int16Type, UInt32Type}; - use crate::{Array, Int16Array, UInt32Array}; + use crate::{Array, UInt32Array}; #[test] fn test_primitive_ree_array_builder() { @@ -287,10 +284,7 @@ mod tests { assert_eq!(array.null_count(), 0); assert_eq!(array.len(), 6); - assert_eq!( - array.run_ends(), - &Int16Array::from(vec![Some(3), Some(4), Some(6)]) - ); + assert_eq!(array.run_ends().values(), &[3, 4, 6]); let av = array.values(); @@ -313,10 +307,7 @@ mod tests { assert_eq!(array.len(), 11); assert_eq!(array.null_count(), 0); - assert_eq!( - as_primitive_array::(array.run_ends()).values(), - &[1, 3, 5, 9, 10, 11] - ); + assert_eq!(array.run_ends().values(), &[1, 3, 5, 9, 10, 11]); assert_eq!( 
as_primitive_array::(array.values().as_ref()).values(), &[1, 2, 5, 4, 6, 2] diff --git a/arrow-array/src/run_iterator.rs b/arrow-array/src/run_iterator.rs index 44cb59ac7fc4..60022113c3dd 100644 --- a/arrow-array/src/run_iterator.rs +++ b/arrow-array/src/run_iterator.rs @@ -17,9 +17,8 @@ //! Idiomatic iterator for [`RunArray`](crate::Array) -use arrow_buffer::ArrowNativeType; - use crate::{array::ArrayAccessor, types::RunEndIndexType, Array, TypedRunArray}; +use arrow_buffer::ArrowNativeType; /// The [`RunArrayIter`] provides an idiomatic way to iterate over the run array. /// It returns Some(T) if there is a value or None if the value is null. @@ -83,14 +82,11 @@ where if self.current_front_logical == self.current_back_logical { return None; } + // If current logical index is greater than current run end index then increment // the physical index. - if self.current_front_logical - >= self - .array - .run_ends() - .value(self.current_front_physical) - .as_usize() + let run_ends = self.array.run_ends().values(); + if self.current_front_logical >= run_ends[self.current_front_physical].as_usize() { // As the run_ends is expected to be strictly increasing, there // should be at least one logical entry in one physical entry. Because of this @@ -138,13 +134,10 @@ where self.current_back_logical -= 1; + let run_ends = self.array.run_ends().values(); if self.current_back_physical > 0 && self.current_back_logical - < self - .array - .run_ends() - .value(self.current_back_physical - 1) - .as_usize() + < run_ends[self.current_back_physical - 1].as_usize() { // As the run_ends is expected to be strictly increasing, there // should be at least one logical entry in one physical entry. Because of this diff --git a/arrow-buffer/src/buffer/mod.rs b/arrow-buffer/src/buffer/mod.rs index f7e41260d80e..ed53d3361daa 100644 --- a/arrow-buffer/src/buffer/mod.rs +++ b/arrow-buffer/src/buffer/mod.rs @@ -32,3 +32,5 @@ mod boolean; pub use boolean::*; mod null; pub use null::*; +mod run; +pub use run::*; diff --git a/arrow-buffer/src/buffer/run.rs b/arrow-buffer/src/buffer/run.rs new file mode 100644 index 000000000000..a7c39638758c --- /dev/null +++ b/arrow-buffer/src/buffer/run.rs @@ -0,0 +1,200 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::buffer::ScalarBuffer; +use crate::ArrowNativeType; + +/// A slice-able buffer of monotonically increasing, positive integers used to store run-ends +/// +/// # Logical vs Physical +/// +/// A [`RunEndBuffer`] is used to encode runs of the same value, the index of each run is +/// called the physical index. The logical index is then the corresponding index in the logical +/// run-encoded array, i.e. a single run of length `3`, would have the logical indices `0..3`. 
+/// +/// Each value in [`RunEndBuffer::values`] is the cumulative length of all runs in the +/// logical array, up to that physical index. +/// +/// Consider a [`RunEndBuffer`] containing `[3, 4, 6]`. The maximum physical index is `2`, +/// as there are `3` values, and the maximum logical index is `6`, as the maximum run end +/// is `6`. The physical indices are therefore `[0, 0, 0, 1, 1, 2, 2]` +/// +/// ```text +/// ┌─────────┐ ┌─────────┐ ┌─────────┐ +/// │ 3 │ │ 0 │ ─┬──────▶ │ 0 │ +/// ├─────────┤ ├─────────┤ │ ├─────────┤ +/// │ 4 │ │ 1 │ ─┤ ┌────▶ │ 1 │ +/// ├─────────┤ ├─────────┤ │ │ ├─────────┤ +/// │ 6 │ │ 2 │ ─┘ │ ┌──▶ │ 2 │ +/// └─────────┘ ├─────────┤ │ │ └─────────┘ +/// run ends │ 3 │ ───┤ │ physical indices +/// ├─────────┤ │ │ +/// │ 4 │ ───┘ │ +/// ├─────────┤ │ +/// │ 5 │ ─────┤ +/// ├─────────┤ │ +/// │ 6 │ ─────┘ +/// └─────────┘ +/// logical indices +/// ``` +/// +/// # Slicing +/// +/// In order to provide zero-copy slicing, this container stores a separate offset and length +/// +/// For example, a [`RunEndBuffer`] containing values `[3, 6, 8]` with offset and length `4` would +/// describe the physical indices `1, 1, 2, 2` +/// +/// For example, a [`RunEndBuffer`] containing values `[6, 8, 9]` with offset `2` and length `5` +/// would describe the physical indices `0, 0, 0, 0, 1` +/// +/// [Run-End encoded layout]: https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout +#[derive(Debug, Clone)] +pub struct RunEndBuffer { + run_ends: ScalarBuffer, + len: usize, + offset: usize, +} + +impl RunEndBuffer +where + E: ArrowNativeType, +{ + /// Create a new [`RunEndBuffer`] from a [`ScalarBuffer`], an `offset` and `len` + /// + /// # Panics + /// + /// - `buffer` does not contain strictly increasing values greater than zero + /// - the last value of `buffer` is less than `offset + len` + pub fn new(run_ends: ScalarBuffer, offset: usize, len: usize) -> Self { + assert!( + run_ends.windows(2).all(|w| w[0] < w[1]), + "run-ends not strictly increasing" + ); + + if len != 0 { + assert!(!run_ends.is_empty(), "non-empty slice but empty run-ends"); + let end = E::from_usize(offset.saturating_add(len)).unwrap(); + assert!( + *run_ends.first().unwrap() >= E::usize_as(0), + "run-ends not greater than 0" + ); + assert!( + *run_ends.last().unwrap() >= end, + "slice beyond bounds of run-ends" + ); + } + + Self { + run_ends, + offset, + len, + } + } + + /// Create a new [`RunEndBuffer`] from an [`ScalarBuffer`], an `offset` and `len` + /// + /// # Safety + /// + /// - `buffer` must contain strictly increasing values greater than zero + /// - The last value of `buffer` must be greater than or equal to `offset + len` + pub unsafe fn new_unchecked( + run_ends: ScalarBuffer, + offset: usize, + len: usize, + ) -> Self { + Self { + run_ends, + offset, + len, + } + } + + /// Returns the logical offset into the run-ends stored by this buffer + #[inline] + pub fn offset(&self) -> usize { + self.offset + } + + /// Returns the logical length of the run-ends stored by this buffer + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Returns true if this buffer is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the values of this [`RunEndBuffer`] not including any offset + #[inline] + pub fn values(&self) -> &[E] { + &self.run_ends + } + + /// Returns the maximum run-end encoded in the underlying buffer + #[inline] + pub fn max_value(&self) -> usize { + self.values().last().copied().unwrap_or_default().as_usize() + } + + /// 
Performs a binary search to find the physical index for the given logical index + /// + /// The result is arbitrary if `logical_index >= self.len()` + pub fn get_physical_index(&self, logical_index: usize) -> usize { + let logical_index = E::usize_as(self.offset + logical_index); + let cmp = |p: &E| p.partial_cmp(&logical_index).unwrap(); + + match self.run_ends.binary_search_by(cmp) { + Ok(idx) => idx + 1, + Err(idx) => idx, + } + } + + /// Returns the physical index at which the logical array starts + pub fn get_start_physical_index(&self) -> usize { + if self.offset == 0 { + return 0; + } + // Fallback to binary search + self.get_physical_index(0) + } + + /// Returns the physical index at which the logical array ends + pub fn get_end_physical_index(&self) -> usize { + if self.max_value() == self.offset + self.len { + return self.values().len() - 1; + } + // Fallback to binary search + self.get_physical_index(self.len - 1) + } + + /// Slices this [`RunEndBuffer`] by the provided `offset` and `length` + pub fn slice(&self, offset: usize, len: usize) -> Self { + assert!( + offset.saturating_add(len) <= self.len, + "the length + offset of the sliced RunEndBuffer cannot exceed the existing length" + ); + Self { + run_ends: self.run_ends.clone(), + offset: self.offset + offset, + len, + } + } +} diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 75c48bebcf63..b57692749878 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -580,42 +580,30 @@ pub(crate) fn unslice_run_array(arr: ArrayData) -> Result fn into_zero_offset_run_array( run_array: RunArray, ) -> Result, ArrowError> { - if run_array.offset() == 0 - && run_array.len() == RunArray::::logical_len(run_array.run_ends()) - { + let run_ends = run_array.run_ends(); + if run_ends.offset() == 0 && run_ends.max_value() == run_ends.len() { return Ok(run_array); } + // The physical index of original run_ends array from which the `ArrayData`is sliced. - let start_physical_index = run_array - .get_zero_offset_physical_index(run_array.offset()) - .unwrap(); + let start_physical_index = run_ends.get_start_physical_index(); - // The logical length of original run_ends array until which the `ArrayData` is sliced. - let end_logical_index = run_array.offset() + run_array.len() - 1; // The physical index of original run_ends array until which the `ArrayData`is sliced. - let end_physical_index = run_array - .get_zero_offset_physical_index(end_logical_index) - .unwrap(); + let end_physical_index = run_ends.get_end_physical_index(); let physical_length = end_physical_index - start_physical_index + 1; - // build new run_ends array by subtrating offset from run ends. + // build new run_ends array by subtracting offset from run ends. + let offset = R::Native::usize_as(run_ends.offset()); let mut builder = BufferBuilder::::new(physical_length); - for ix in start_physical_index..end_physical_index { - let run_end_value = unsafe { - // Safety: - // start_physical_index and end_physical_index are within - // run_ends array bounds. - run_array.run_ends().value_unchecked(ix).as_usize() - }; - let run_end_value = run_end_value - run_array.offset(); - builder.append(R::Native::from_usize(run_end_value).unwrap()); + for run_end_value in &run_ends.values()[start_physical_index..end_physical_index] { + builder.append(run_end_value.sub_wrapping(offset)); } builder.append(R::Native::from_usize(run_array.len()).unwrap()); let new_run_ends = unsafe { // Safety: // The function builds a valid run_ends array and hence need not be validated. 
- ArrayDataBuilder::new(run_array.run_ends().data_type().clone()) + ArrayDataBuilder::new(R::DATA_TYPE) .len(physical_length) .add_buffer(builder.finish()) .build_unchecked() diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 230eb9390f2f..0f248ee637b0 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -673,7 +673,7 @@ fn sort_run_downcasted( let new_run_ends = unsafe { // Safety: // The function builds a valid run_ends array and hence need not be validated. - ArrayDataBuilder::new(run_array.run_ends().data_type().clone()) + ArrayDataBuilder::new(R::DATA_TYPE) .len(new_physical_len) .add_buffer(new_run_ends_builder.finish()) .build_unchecked() @@ -746,7 +746,7 @@ where let mut remaining_len = output_len; - let run_ends = run_array.run_ends(); + let run_ends = run_array.run_ends().values(); assert_eq!( 0, @@ -770,22 +770,20 @@ where // and len, both of which are within bounds of run_array if physical_index == start_physical_index { ( - run_ends.value_unchecked(physical_index).as_usize() + run_ends.get_unchecked(physical_index).as_usize() - run_array.offset(), 0, ) } else if physical_index == end_physical_index { - let prev_run_end = - run_ends.value_unchecked(physical_index - 1).as_usize(); + let prev_run_end = run_ends.get_unchecked(physical_index - 1).as_usize(); ( run_array.offset() + run_array.len() - prev_run_end, prev_run_end - run_array.offset(), ) } else { - let prev_run_end = - run_ends.value_unchecked(physical_index - 1).as_usize(); + let prev_run_end = run_ends.get_unchecked(physical_index - 1).as_usize(); ( - run_ends.value_unchecked(physical_index).as_usize() - prev_run_end, + run_ends.get_unchecked(physical_index).as_usize() - prev_run_end, prev_run_end - run_array.offset(), ) } diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 58b5c91f1bf6..68b22f6feabc 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -2157,8 +2157,7 @@ mod tests { let take_out = take_run(&run_array, &take_indices).unwrap(); assert_eq!(take_out.len(), 7); - - assert_eq!(take_out.run_ends().len(), 5); + assert_eq!(take_out.run_ends().len(), 7); assert_eq!(take_out.run_ends().values(), &[1_i32, 3, 4, 5, 7]); let take_out_values = as_primitive_array::(take_out.values()); From 1883bb691a33c39ce13e355e7f0a82414fc74010 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 9 Mar 2023 00:33:21 -0800 Subject: [PATCH 0664/1411] Add unary_dict_mut (#3804) * Add unary_dict_mut * Remove unused function --- arrow-arith/src/arity.rs | 21 ++++++++++++- arrow-array/src/array/dictionary_array.rs | 38 +++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 5d4973714a06..74edd654bbcd 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -61,7 +61,7 @@ where pub fn unary_mut( array: PrimitiveArray, op: F, -) -> std::result::Result, PrimitiveArray> +) -> Result, PrimitiveArray> where I: ArrowPrimitiveType, F: Fn(I::Native) -> I::Native, @@ -647,4 +647,23 @@ mod tests { .unwrap() .expect_err("should got error"); } + + #[test] + fn test_unary_dict_mut() { + let values = Int32Array::from(vec![Some(10), Some(20), None]); + let keys = Int8Array::from_iter_values([0, 0, 1, 2]); + let dictionary = DictionaryArray::::try_new(&keys, &values).unwrap(); + + drop(keys); + drop(values); + + let updated = dictionary.unary_mut::<_, Int32Type>(|x| x + 1).unwrap(); + let typed = updated.downcast_dict::().unwrap(); + assert_eq!(typed.value(0), 11); + 
assert_eq!(typed.value(1), 11); + assert_eq!(typed.value(2), 21); + + let values = updated.values(); + assert!(values.is_null(2)); + } } diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 74b5cec19475..f9a40c6f3400 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -433,6 +433,44 @@ impl DictionaryArray { } } } + + /// Applies an unary and infallible function to a mutable dictionary array. + /// Mutable dictionary array means that the buffers are not shared with other arrays. + /// As a result, this mutates the buffers directly without allocating new buffers. + /// + /// # Implementation + /// + /// This will apply the function for all dictionary values, including those on null slots. + /// This implies that the operation must be infallible for any value of the corresponding type + /// or this function may panic. + /// # Example + /// ``` + /// use arrow_array::{Array, ArrayAccessor, DictionaryArray, StringArray, types::{Int8Type, Int32Type}}; + /// use arrow_array::{Int8Array, Int32Array}; + /// let values = Int32Array::from(vec![Some(10), Some(20), None]); + /// let keys = Int8Array::from_iter_values([0, 0, 1, 2]); + /// let dictionary = DictionaryArray::::try_new(&keys, &values).unwrap(); + /// drop(keys); + /// drop(values); + /// let c = dictionary.unary_mut::<_, Int32Type>(|x| x + 1).unwrap(); + /// let typed = c.downcast_dict::().unwrap(); + /// assert_eq!(typed.value(0), 11); + /// assert_eq!(typed.value(1), 11); + /// assert_eq!(typed.value(2), 21); + /// ``` + pub fn unary_mut(self, op: F) -> Result, DictionaryArray> + where + V: ArrowPrimitiveType, + F: Fn(V::Native) -> V::Native, + { + let mut builder: PrimitiveDictionaryBuilder = + self.into_primitive_dict_builder()?; + builder + .values_slice_mut() + .iter_mut() + .for_each(|v| *v = op(*v)); + Ok(builder.finish()) + } } /// Constructs a `DictionaryArray` from an array data reference. From 053973a06a7194a1d10f5b206375957a2fcaa049 Mon Sep 17 00:00:00 2001 From: bold Date: Thu, 9 Mar 2023 10:28:17 +0100 Subject: [PATCH 0665/1411] Support decoding decimals in raw decoder (#3820) --- arrow-json/src/raw/decimal_array.rs | 76 +++++++++++++++++++++++++++++ arrow-json/src/raw/mod.rs | 61 +++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 arrow-json/src/raw/decimal_array.rs diff --git a/arrow-json/src/raw/decimal_array.rs b/arrow-json/src/raw/decimal_array.rs new file mode 100644 index 000000000000..0518b4cef7c4 --- /dev/null +++ b/arrow-json/src/raw/decimal_array.rs @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
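The decoder added below hands both string and number tokens to `arrow_cast::parse::parse_decimal` and only afterwards stamps the precision and scale onto the finished array. A rough standalone sketch of what that helper produces for one value, assuming it behaves as the decoder's usage implies:

```rust
use arrow_array::types::Decimal128Type;
use arrow_cast::parse::parse_decimal;

fn main() {
    // Assumption: "123.45" at precision 10 / scale 2 parses to the unscaled integer 12345
    let unscaled: i128 = parse_decimal::<Decimal128Type>("123.45", 10, 2).unwrap();
    assert_eq!(unscaled, 12345);
}
```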
+ +use std::marker::PhantomData; + +use arrow_array::builder::PrimitiveBuilder; +use arrow_array::types::DecimalType; +use arrow_array::Array; +use arrow_cast::parse::parse_decimal; +use arrow_data::ArrayData; +use arrow_schema::ArrowError; + +use crate::raw::tape::{Tape, TapeElement}; +use crate::raw::{tape_error, ArrayDecoder}; + +pub struct DecimalArrayDecoder { + precision: u8, + scale: i8, + // Invariant and Send + phantom: PhantomData D>, +} + +impl DecimalArrayDecoder { + pub fn new(precision: u8, scale: i8) -> Self { + Self { + precision, + scale, + phantom: PhantomData, + } + } +} + +impl ArrayDecoder for DecimalArrayDecoder +where + D: DecimalType, +{ + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { + let mut builder = PrimitiveBuilder::::with_capacity(pos.len()); + + for p in pos { + match tape.get(*p) { + TapeElement::Null => builder.append_null(), + TapeElement::String(idx) => { + let s = tape.get_string(idx); + let value = parse_decimal::(s, self.precision, self.scale)?; + builder.append_value(value) + } + TapeElement::Number(idx) => { + let s = tape.get_string(idx); + let value = parse_decimal::(s, self.precision, self.scale)?; + builder.append_value(value) + } + d => return Err(tape_error(d, "decimal")), + } + } + + Ok(builder + .finish() + .with_precision_and_scale(self.precision, self.scale)? + .into_data()) + } +} diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index a0dbcbd53eaa..5b699b1d51fb 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -20,6 +20,7 @@ //! [`Reader`]: crate::reader::Reader use crate::raw::boolean_array::BooleanArrayDecoder; +use crate::raw::decimal_array::DecimalArrayDecoder; use crate::raw::list_array::ListArrayDecoder; use crate::raw::map_array::MapArrayDecoder; use crate::raw::primitive_array::PrimitiveArrayDecoder; @@ -33,6 +34,7 @@ use arrow_schema::{ArrowError, DataType, SchemaRef}; use std::io::BufRead; mod boolean_array; +mod decimal_array; mod list_array; mod map_array; mod primitive_array; @@ -291,6 +293,8 @@ fn make_decoder( data_type => (primitive_decoder, data_type), DataType::Float32 => primitive_decoder!(Float32Type, data_type), DataType::Float64 => primitive_decoder!(Float64Type, data_type), + DataType::Decimal128(p, s) => Ok(Box::new(DecimalArrayDecoder::::new(p, s))), + DataType::Decimal256(p, s) => Ok(Box::new(DecimalArrayDecoder::::new(p, s))), DataType::Boolean => Ok(Box::::default()), DataType::Utf8 => Ok(Box::new(StringArrayDecoder::::new(coerce_primitive))), DataType::LargeUtf8 => Ok(Box::new(StringArrayDecoder::::new(coerce_primitive))), @@ -321,6 +325,7 @@ mod tests { }; use arrow_array::types::Int32Type; use arrow_array::Array; + use arrow_buffer::ArrowNativeType; use arrow_cast::display::{ArrayFormatter, FormatOptions}; use arrow_schema::{DataType, Field, Schema}; use std::fs::File; @@ -721,4 +726,60 @@ mod tests { assert!(col3.is_null(4)); assert!(col3.is_null(5)); } + + fn test_decimal(data_type: DataType) { + let buf = r#" + {"a": 1, "b": 2, "c": 38.30} + {"a": 2, "b": 4, "c": 123.456} + + {"b": 1337, "a": "2.0452"} + {"b": "5", "a": "11034.2"} + {"b": 40} + {"b": 1234, "a": null} + "#; + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", data_type.clone(), true), + Field::new("b", data_type.clone(), true), + Field::new("c", data_type, true), + ])); + + let batches = do_read(buf, 1024, true, schema); + assert_eq!(batches.len(), 1); + + let col1 = as_primitive_array::(batches[0].column(0)); + assert_eq!(col1.null_count(), 2); + assert!(col1.is_null(4)); 
+ assert!(col1.is_null(5)); + assert_eq!( + col1.values(), + &[100, 200, 204, 1103420, 0, 0].map(T::Native::usize_as) + ); + + let col2 = as_primitive_array::(batches[0].column(1)); + assert_eq!(col2.null_count(), 0); + assert_eq!( + col2.values(), + &[200, 400, 133700, 500, 4000, 123400].map(T::Native::usize_as) + ); + + let col3 = as_primitive_array::(batches[0].column(2)); + assert_eq!(col3.null_count(), 4); + assert!(!col3.is_null(0)); + assert!(!col3.is_null(1)); + assert!(col3.is_null(2)); + assert!(col3.is_null(3)); + assert!(col3.is_null(4)); + assert!(col3.is_null(5)); + assert_eq!( + col3.values(), + &[3830, 12345, 0, 0, 0, 0].map(T::Native::usize_as) + ); + } + + #[test] + fn test_decimals() { + test_decimal::(DataType::Decimal128(10, 2)); + test_decimal::(DataType::Decimal256(10, 2)); + } } From fb1dcc6d792bc390b0eeea867ca76f3d2c1d8334 Mon Sep 17 00:00:00 2001 From: comphead Date: Thu, 9 Mar 2023 04:59:39 -0800 Subject: [PATCH 0666/1411] Fix CSV error text (#3822) --- arrow-csv/src/writer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 4ec0e1bec517..b64e306b3a14 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -169,7 +169,7 @@ impl Writer { buffer.clear(); converter.value(row_idx).write(&mut buffer).map_err(|e| { ArrowError::CsvError(format!( - "Error formatting row {} and column {}: {e}", + "Error processing row {}, col {}: {e}", row_idx + 1, col_idx + 1 )) @@ -614,7 +614,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo for batch in batches { let err = writer.write(batch).unwrap_err().to_string(); - assert_eq!(err, "Csv error: Error formatting row 2 and column 2: Cast error: Failed to convert 1926632005177685347 to temporal for Date64") + assert_eq!(err, "Csv error: Error processing row 2, col 2: Cast error: Failed to convert 1926632005177685347 to temporal for Date64") } drop(writer); } From fb35d264225c9aa7aa5eeefbca72e914590fac25 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Thu, 9 Mar 2023 16:16:48 +0100 Subject: [PATCH 0667/1411] test: add test for FlightSQL CLI client (#3816) * test: add test for FlightSQL CLI client Closes #3814. 
* refactor: improve test code Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- arrow-flight/Cargo.toml | 6 + arrow-flight/src/bin/flight_sql_client.rs | 4 +- arrow-flight/tests/flight_sql_client_cli.rs | 545 ++++++++++++++++++++ 3 files changed, 554 insertions(+), 1 deletion(-) create mode 100644 arrow-flight/tests/flight_sql_client_cli.rs diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index f1cd7d4fb23b..db9f0a023bf4 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -60,6 +60,7 @@ cli = ["arrow/prettyprint", "clap", "tracing-log", "tracing-subscriber", "tonic/ [dev-dependencies] arrow = { version = "34.0.0", path = "../arrow", features = ["prettyprint"] } +assert_cmd = "2.0.8" tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } tower = "0.4.13" @@ -78,3 +79,8 @@ required-features = ["flight-sql-experimental", "tls"] [[bin]] name = "flight_sql_client" required-features = ["cli", "flight-sql-experimental", "tls"] + +[[test]] +name = "flight_sql_client_cli" +path = "tests/flight_sql_client_cli.rs" +required-features = ["cli", "flight-sql-experimental", "tls"] diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs index 9f211eaf63bc..d05efc227e2d 100644 --- a/arrow-flight/src/bin/flight_sql_client.rs +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -144,7 +144,9 @@ fn setup_logging() { async fn setup_client(args: ClientArgs) -> Result { let port = args.port.unwrap_or(if args.tls { 443 } else { 80 }); - let mut endpoint = Endpoint::new(format!("https://{}:{}", args.host, port)) + let protocol = if args.tls { "https" } else { "http" }; + + let mut endpoint = Endpoint::new(format!("{}://{}:{}", protocol, args.host, port)) .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? .connect_timeout(Duration::from_secs(20)) .timeout(Duration::from_secs(20)) diff --git a/arrow-flight/tests/flight_sql_client_cli.rs b/arrow-flight/tests/flight_sql_client_cli.rs new file mode 100644 index 000000000000..2c54bd263fdb --- /dev/null +++ b/arrow-flight/tests/flight_sql_client_cli.rs @@ -0,0 +1,545 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::{net::SocketAddr, pin::Pin, sync::Arc, time::Duration}; + +use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray}; +use arrow_flight::{ + flight_service_server::{FlightService, FlightServiceServer}, + sql::{ + server::FlightSqlService, ActionClosePreparedStatementRequest, + ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, Any, + CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, + CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, + CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, + CommandPreparedStatementQuery, CommandPreparedStatementUpdate, + CommandStatementQuery, CommandStatementUpdate, ProstMessageExt, SqlInfo, + TicketStatementQuery, + }, + utils::batches_to_flight_data, + Action, FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, + HandshakeResponse, IpcMessage, SchemaAsIpc, Ticket, +}; +use arrow_ipc::writer::IpcWriteOptions; +use arrow_schema::{ArrowError, DataType, Field, Schema}; +use assert_cmd::Command; +use futures::Stream; +use prost::Message; +use tokio::{net::TcpListener, task::JoinHandle}; +use tonic::{Request, Response, Status, Streaming}; + +const QUERY: &str = "SELECT * FROM table;"; + +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] +async fn test_simple() { + let test_server = FlightSqlServiceImpl {}; + let fixture = TestFixture::new(&test_server).await; + let addr = fixture.addr; + + let stdout = tokio::task::spawn_blocking(move || { + Command::cargo_bin("flight_sql_client") + .unwrap() + .env_clear() + .env("RUST_BACKTRACE", "1") + .env("RUST_LOG", "warn") + .arg("--host") + .arg(addr.ip().to_string()) + .arg("--port") + .arg(addr.port().to_string()) + .arg(QUERY) + .assert() + .success() + .get_output() + .stdout + .clone() + }) + .await + .unwrap(); + + fixture.shutdown_and_wait().await; + + assert_eq!( + std::str::from_utf8(&stdout).unwrap().trim(), + "+--------------+-----------+\ + \n| field_string | field_int |\ + \n+--------------+-----------+\ + \n| Hello | 42 |\ + \n| lovely | |\ + \n| FlightSQL! 
| 1337 |\ + \n+--------------+-----------+", + ); +} + +/// All tests must complete within this many seconds or else the test server is shutdown +const DEFAULT_TIMEOUT_SECONDS: u64 = 30; + +#[derive(Clone)] +pub struct FlightSqlServiceImpl {} + +impl FlightSqlServiceImpl { + /// Return an [`FlightServiceServer`] that can be used with a + /// [`Server`](tonic::transport::Server) + pub fn service(&self) -> FlightServiceServer { + // wrap up tonic goop + FlightServiceServer::new(self.clone()) + } + + fn fake_result() -> Result { + let schema = Schema::new(vec![ + Field::new("field_string", DataType::Utf8, false), + Field::new("field_int", DataType::Int64, true), + ]); + + let string_array = StringArray::from(vec!["Hello", "lovely", "FlightSQL!"]); + let int_array = Int64Array::from(vec![Some(42), None, Some(1337)]); + + let cols = vec![ + Arc::new(string_array) as ArrayRef, + Arc::new(int_array) as ArrayRef, + ]; + RecordBatch::try_new(Arc::new(schema), cols) + } +} + +#[tonic::async_trait] +impl FlightSqlService for FlightSqlServiceImpl { + type FlightService = FlightSqlServiceImpl; + + async fn do_handshake( + &self, + _request: Request>, + ) -> Result< + Response> + Send>>>, + Status, + > { + Err(Status::unimplemented("do_handshake not implemented")) + } + + async fn do_get_fallback( + &self, + _request: Request, + message: Any, + ) -> Result::DoGetStream>, Status> { + let part = message.unpack::().unwrap().unwrap().handle; + let batch = Self::fake_result().unwrap(); + let batch = match part.as_str() { + "part_1" => batch.slice(0, 2), + "part_2" => batch.slice(2, 1), + ticket => panic!("Invalid ticket: {ticket:?}"), + }; + let schema = (*batch.schema()).clone(); + let batches = vec![batch]; + let flight_data = batches_to_flight_data(schema, batches) + .unwrap() + .into_iter() + .map(Ok); + + let stream: Pin> + Send>> = + Box::pin(futures::stream::iter(flight_data)); + let resp = Response::new(stream); + Ok(resp) + } + + async fn get_flight_info_statement( + &self, + query: CommandStatementQuery, + _request: Request, + ) -> Result, Status> { + assert_eq!(query.query, QUERY); + + let batch = Self::fake_result().unwrap(); + + let IpcMessage(schema_bytes) = + SchemaAsIpc::new(batch.schema().as_ref(), &IpcWriteOptions::default()) + .try_into() + .unwrap(); + + let info = FlightInfo { + schema: schema_bytes, + flight_descriptor: None, + endpoint: vec![ + FlightEndpoint { + ticket: Some(Ticket { + ticket: FetchResults { + handle: String::from("part_1"), + } + .as_any() + .encode_to_vec() + .into(), + }), + location: vec![], + }, + FlightEndpoint { + ticket: Some(Ticket { + ticket: FetchResults { + handle: String::from("part_2"), + } + .as_any() + .encode_to_vec() + .into(), + }), + location: vec![], + }, + ], + total_records: batch.num_rows() as i64, + total_bytes: batch.get_array_memory_size() as i64, + }; + let resp = Response::new(info); + Ok(resp) + } + + async fn get_flight_info_prepared_statement( + &self, + _cmd: CommandPreparedStatementQuery, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_prepared_statement not implemented", + )) + } + + async fn get_flight_info_catalogs( + &self, + _query: CommandGetCatalogs, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_catalogs not implemented", + )) + } + + async fn get_flight_info_schemas( + &self, + _query: CommandGetDbSchemas, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_schemas not implemented", + )) + } + 
+ async fn get_flight_info_tables( + &self, + _query: CommandGetTables, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_tables not implemented", + )) + } + + async fn get_flight_info_table_types( + &self, + _query: CommandGetTableTypes, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_table_types not implemented", + )) + } + + async fn get_flight_info_sql_info( + &self, + _query: CommandGetSqlInfo, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_sql_info not implemented", + )) + } + + async fn get_flight_info_primary_keys( + &self, + _query: CommandGetPrimaryKeys, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_primary_keys not implemented", + )) + } + + async fn get_flight_info_exported_keys( + &self, + _query: CommandGetExportedKeys, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_exported_keys not implemented", + )) + } + + async fn get_flight_info_imported_keys( + &self, + _query: CommandGetImportedKeys, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_imported_keys not implemented", + )) + } + + async fn get_flight_info_cross_reference( + &self, + _query: CommandGetCrossReference, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_imported_keys not implemented", + )) + } + + // do_get + async fn do_get_statement( + &self, + _ticket: TicketStatementQuery, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented("do_get_statement not implemented")) + } + + async fn do_get_prepared_statement( + &self, + _query: CommandPreparedStatementQuery, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_prepared_statement not implemented", + )) + } + + async fn do_get_catalogs( + &self, + _query: CommandGetCatalogs, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented("do_get_catalogs not implemented")) + } + + async fn do_get_schemas( + &self, + _query: CommandGetDbSchemas, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented("do_get_schemas not implemented")) + } + + async fn do_get_tables( + &self, + _query: CommandGetTables, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented("do_get_tables not implemented")) + } + + async fn do_get_table_types( + &self, + _query: CommandGetTableTypes, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented("do_get_table_types not implemented")) + } + + async fn do_get_sql_info( + &self, + _query: CommandGetSqlInfo, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented("do_get_sql_info not implemented")) + } + + async fn do_get_primary_keys( + &self, + _query: CommandGetPrimaryKeys, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented("do_get_primary_keys not implemented")) + } + + async fn do_get_exported_keys( + &self, + _query: CommandGetExportedKeys, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_exported_keys not implemented", + )) + } + + async fn do_get_imported_keys( + &self, + _query: CommandGetImportedKeys, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + 
"do_get_imported_keys not implemented", + )) + } + + async fn do_get_cross_reference( + &self, + _query: CommandGetCrossReference, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_cross_reference not implemented", + )) + } + + // do_put + async fn do_put_statement_update( + &self, + _ticket: CommandStatementUpdate, + _request: Request>, + ) -> Result { + Err(Status::unimplemented( + "do_put_statement_update not implemented", + )) + } + + async fn do_put_prepared_statement_query( + &self, + _query: CommandPreparedStatementQuery, + _request: Request>, + ) -> Result::DoPutStream>, Status> { + Err(Status::unimplemented( + "do_put_prepared_statement_query not implemented", + )) + } + + async fn do_put_prepared_statement_update( + &self, + _query: CommandPreparedStatementUpdate, + _request: Request>, + ) -> Result { + Err(Status::unimplemented( + "do_put_prepared_statement_update not implemented", + )) + } + + async fn do_action_create_prepared_statement( + &self, + _query: ActionCreatePreparedStatementRequest, + _request: Request, + ) -> Result { + Err(Status::unimplemented( + "do_action_create_prepared_statement not implemented", + )) + } + + async fn do_action_close_prepared_statement( + &self, + _query: ActionClosePreparedStatementRequest, + _request: Request, + ) { + } + + async fn register_sql_info(&self, _id: i32, _result: &SqlInfo) {} +} + +/// Creates and manages a running TestServer with a background task +struct TestFixture { + /// channel to send shutdown command + shutdown: Option>, + + /// Address the server is listening on + addr: SocketAddr, + + // handle for the server task + handle: Option>>, +} + +impl TestFixture { + /// create a new test fixture from the server + pub async fn new(test_server: &FlightSqlServiceImpl) -> Self { + // let OS choose a a free port + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + + println!("Listening on {addr}"); + + // prepare the shutdown channel + let (tx, rx) = tokio::sync::oneshot::channel(); + + let server_timeout = Duration::from_secs(DEFAULT_TIMEOUT_SECONDS); + + let shutdown_future = async move { + rx.await.ok(); + }; + + let serve_future = tonic::transport::Server::builder() + .timeout(server_timeout) + .add_service(test_server.service()) + .serve_with_incoming_shutdown( + tokio_stream::wrappers::TcpListenerStream::new(listener), + shutdown_future, + ); + + // Run the server in its own background task + let handle = tokio::task::spawn(serve_future); + + Self { + shutdown: Some(tx), + addr, + handle: Some(handle), + } + } + + /// Stops the test server and waits for the server to shutdown + pub async fn shutdown_and_wait(mut self) { + if let Some(shutdown) = self.shutdown.take() { + shutdown.send(()).expect("server quit early"); + } + if let Some(handle) = self.handle.take() { + println!("Waiting on server to finish"); + handle + .await + .expect("task join error (panic?)") + .expect("Server Error found at shutdown"); + } + } +} + +impl Drop for TestFixture { + fn drop(&mut self) { + if let Some(shutdown) = self.shutdown.take() { + shutdown.send(()).ok(); + } + if self.handle.is_some() { + // tests should properly clean up TestFixture + println!("TestFixture::Drop called prior to `shutdown_and_wait`"); + } + } +} + +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct FetchResults { + #[prost(string, tag = "1")] + pub handle: ::prost::alloc::string::String, +} + +impl ProstMessageExt for FetchResults { + fn type_url() -> 
&'static str { + "type.googleapis.com/arrow.flight.protocol.sql.FetchResults" + } + + fn as_any(&self) -> Any { + Any { + type_url: FetchResults::type_url().to_string(), + value: ::prost::Message::encode_to_vec(self).into(), + } + } +} From de9f82693f94721916e37f4896e9394411d8076f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 9 Mar 2023 16:40:17 +0100 Subject: [PATCH 0668/1411] Update to latest Clippy (#3832) * Renamed clippy lint * More clippy * Even more clippy * Clippy --- arrow-arith/src/arithmetic.rs | 14 ++++++-------- arrow-schema/src/schema.rs | 2 +- arrow/benches/boolean_append_packed.rs | 1 - arrow/src/array/ffi.rs | 6 +++--- arrow/src/ffi.rs | 2 +- arrow/tests/array_cast.rs | 1 - parquet/src/arrow/async_reader/mod.rs | 2 -- 7 files changed, 11 insertions(+), 17 deletions(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index c2415a67bb66..00375d32a677 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -2571,9 +2571,9 @@ mod tests { #[test] fn test_primitive_array_negate() { - let a: Int64Array = (0..100).into_iter().map(Some).collect(); + let a: Int64Array = (0..100).map(Some).collect(); let actual = negate(&a).unwrap(); - let expected: Int64Array = (0..100).into_iter().map(|i| Some(-i)).collect(); + let expected: Int64Array = (0..100).map(|i| Some(-i)).collect(); assert_eq!(expected, actual); } @@ -2590,20 +2590,18 @@ mod tests { #[test] fn test_arithmetic_kernel_should_not_rely_on_padding() { - let a: UInt8Array = (0..128_u8).into_iter().map(Some).collect(); + let a: UInt8Array = (0..128_u8).map(Some).collect(); let a = a.slice(63, 65); let a = a.as_any().downcast_ref::().unwrap(); - let b: UInt8Array = (0..128_u8).into_iter().map(Some).collect(); + let b: UInt8Array = (0..128_u8).map(Some).collect(); let b = b.slice(63, 65); let b = b.as_any().downcast_ref::().unwrap(); let actual = add(a, b).unwrap(); let actual: Vec> = actual.iter().collect(); - let expected: Vec> = (63..63_u8 + 65_u8) - .into_iter() - .map(|i| Some(i + i)) - .collect(); + let expected: Vec> = + (63..63_u8 + 65_u8).map(|i| Some(i + i)).collect(); assert_eq!(expected, actual); } diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index b7971027f13e..10a72ba0cdf6 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -264,7 +264,7 @@ impl fmt::Display for Schema { } // need to implement `Hash` manually because `HashMap` implement Eq but no `Hash` -#[allow(clippy::derive_hash_xor_eq)] +#[allow(clippy::derived_hash_with_manual_eq)] impl Hash for Schema { fn hash(&self, state: &mut H) { self.fields.hash(state); diff --git a/arrow/benches/boolean_append_packed.rs b/arrow/benches/boolean_append_packed.rs index 62bcbcc352fd..40873422dbd5 100644 --- a/arrow/benches/boolean_append_packed.rs +++ b/arrow/benches/boolean_append_packed.rs @@ -30,7 +30,6 @@ fn boolean_append_packed(c: &mut Criterion) { let mut rng = thread_rng(); let source = rand_bytes(1024); let ranges: Vec<_> = (0..100) - .into_iter() .map(|_| { let start: usize = rng.gen_range(0..1024 * 8); let end: usize = rng.gen_range(start..1024 * 8); diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index c7bc8e9f8a74..0751fe2c0f2d 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -215,7 +215,7 @@ mod tests { #[test] fn test_fixed_size_list() -> Result<()> { - let v: Vec = (0..9).into_iter().collect(); + let v: Vec = (0..9).collect(); let value_data = 
ArrayData::builder(DataType::Int64) .len(9) .add_buffer(Buffer::from_slice_ref(v)) @@ -240,7 +240,7 @@ mod tests { bit_util::set_bit(&mut validity_bits, 2); bit_util::set_bit(&mut validity_bits, 6); - let v: Vec = (0..16).into_iter().collect(); + let v: Vec = (0..16).collect(); let value_data = ArrayData::builder(DataType::Int16) .len(16) .add_buffer(Buffer::from_slice_ref(v)) @@ -260,7 +260,7 @@ mod tests { #[test] fn test_fixed_size_list_nested() -> Result<()> { - let v: Vec = (0..16).into_iter().collect(); + let v: Vec = (0..16).collect(); let value_data = ArrayData::builder(DataType::Int32) .len(16) .add_buffer(Buffer::from_slice_ref(v)) diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index c767a69e6bdf..81c32594861c 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -957,7 +957,7 @@ mod tests { let mut validity_bits: [u8; 1] = [0; 1]; bit_util::set_bit(&mut validity_bits, 2); - let v: Vec = (0..9).into_iter().collect(); + let v: Vec = (0..9).collect(); let value_data = ArrayData::builder(DataType::Int32) .len(9) .add_buffer(Buffer::from_slice_ref(&v)) diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 30ded4d70be5..dfcaa990dbb5 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -428,7 +428,6 @@ fn get_all_types() -> Vec { Dictionary(Box::new(key_type), Box::new(Decimal256(76, 0))), ] }) - .into_iter() .collect::>(); types.append(&mut dictionary_types); diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 3e0d865c0610..213f61818c15 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -622,7 +622,6 @@ impl<'a> InMemoryRowGroup<'a> { .iter() .zip(self.metadata.columns()) .enumerate() - .into_iter() .filter_map(|(idx, (chunk, chunk_meta))| { (chunk.is_none() && projection.leaf_included(idx)).then(|| { // If the first page does not start at the beginning of the column, @@ -671,7 +670,6 @@ impl<'a> InMemoryRowGroup<'a> { .column_chunks .iter() .enumerate() - .into_iter() .filter_map(|(idx, chunk)| { (chunk.is_none() && projection.leaf_included(idx)).then(|| { let column = self.metadata.column(idx); From cdb042ed02a6c9afd0cf3f6f86072b09c599f59a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 9 Mar 2023 16:42:54 +0100 Subject: [PATCH 0669/1411] Faster timestamp parsing (~70-90% faster) (#3801) * Faster timestamp parsing * Faster timezone parsing * More tests * Review feedback * Review feedback * Fix test * Format --- arrow-array/src/timezone.rs | 59 ++--- arrow-cast/Cargo.toml | 5 + arrow-cast/benches/parse_timestamp.rs | 44 ++++ arrow-cast/src/cast.rs | 10 +- arrow-cast/src/parse.rs | 299 +++++++++++++++++--------- arrow/Cargo.toml | 4 + arrow/tests/timezone.rs | 81 +++++++ 7 files changed, 371 insertions(+), 131 deletions(-) create mode 100644 arrow-cast/benches/parse_timestamp.rs create mode 100644 arrow/tests/timezone.rs diff --git a/arrow-array/src/timezone.rs b/arrow-array/src/timezone.rs index 3af76c3dafb7..f56189c46512 100644 --- a/arrow-array/src/timezone.rs +++ b/arrow-array/src/timezone.rs @@ -18,29 +18,34 @@ //! 
Timezone for timestamp arrays use arrow_schema::ArrowError; -use chrono::format::{parse, Parsed, StrftimeItems}; use chrono::FixedOffset; pub use private::{Tz, TzOffset}; -/// Parses a fixed offset of the form "+09:00" -fn parse_fixed_offset(tz: &str) -> Result { - let mut parsed = Parsed::new(); - - if let Ok(fixed_offset) = parse(&mut parsed, tz, StrftimeItems::new("%:z")) - .and_then(|_| parsed.to_fixed_offset()) - { - return Ok(fixed_offset); +/// Parses a fixed offset of the form "+09:00", "-09" or "+0930" +fn parse_fixed_offset(tz: &str) -> Option { + let bytes = tz.as_bytes(); + + let mut values = match bytes.len() { + // [+-]XX:XX + 6 if bytes[3] == b':' => [bytes[1], bytes[2], bytes[4], bytes[5]], + // [+-]XXXX + 5 => [bytes[1], bytes[2], bytes[3], bytes[4]], + // [+-]XX + 3 => [bytes[1], bytes[2], b'0', b'0'], + _ => return None, + }; + values.iter_mut().for_each(|x| *x = x.wrapping_sub(b'0')); + if values.iter().any(|x| *x > 9) { + return None; } + let secs = (values[0] * 10 + values[1]) as i32 * 60 * 60 + + (values[2] * 10 + values[3]) as i32 * 60; - if let Ok(fixed_offset) = parse(&mut parsed, tz, StrftimeItems::new("%#z")) - .and_then(|_| parsed.to_fixed_offset()) - { - return Ok(fixed_offset); + match bytes[0] { + b'+' => FixedOffset::east_opt(secs), + b'-' => FixedOffset::west_opt(secs), + _ => None, } - - Err(ArrowError::ParseError(format!( - "Invalid timezone \"{tz}\": Expected format [+-]XX:XX, [+-]XX, or [+-]XXXX" - ))) } #[cfg(feature = "chrono-tz")] @@ -83,12 +88,11 @@ mod private { type Err = ArrowError; fn from_str(tz: &str) -> Result { - if tz.starts_with('+') || tz.starts_with('-') { - Ok(Self(TzInner::Offset(parse_fixed_offset(tz)?))) - } else { - Ok(Self(TzInner::Timezone(tz.parse().map_err(|e| { + match parse_fixed_offset(tz) { + Some(offset) => Ok(Self(TzInner::Offset(offset))), + None => Ok(Self(TzInner::Timezone(tz.parse().map_err(|e| { ArrowError::ParseError(format!("Invalid timezone \"{tz}\": {e}")) - })?))) + })?))), } } } @@ -261,13 +265,12 @@ mod private { type Err = ArrowError; fn from_str(tz: &str) -> Result { - if tz.starts_with('+') || tz.starts_with('-') { - Ok(Self(parse_fixed_offset(tz)?)) - } else { - Err(ArrowError::ParseError(format!( + let offset = parse_fixed_offset(tz).ok_or_else(|| { + ArrowError::ParseError(format!( "Invalid timezone \"{tz}\": only offset based timezones supported without chrono-tz feature" - ))) - } + )) + })?; + Ok(Self(offset)) } } diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 688e0001f973..c383369c4403 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -48,5 +48,10 @@ num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } [dev-dependencies] +criterion = { version = "0.4", default-features = false } [build-dependencies] + +[[bench]] +name = "parse_timestamp" +harness = false diff --git a/arrow-cast/benches/parse_timestamp.rs b/arrow-cast/benches/parse_timestamp.rs new file mode 100644 index 000000000000..d3ab41863e70 --- /dev/null +++ b/arrow-cast/benches/parse_timestamp.rs @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_cast::parse::string_to_timestamp_nanos; +use criterion::*; + +fn criterion_benchmark(c: &mut Criterion) { + let timestamps = [ + "2020-09-08", + "2020-09-08T13:42:29", + "2020-09-08T13:42:29.190", + "2020-09-08T13:42:29.190855", + "2020-09-08T13:42:29.190855999", + "2020-09-08T13:42:29+00:00", + "2020-09-08T13:42:29.190+00:00", + "2020-09-08T13:42:29.190855+00:00", + "2020-09-08T13:42:29.190855999-05:00", + "2020-09-08T13:42:29.190855Z", + ]; + + for timestamp in timestamps { + let t = black_box(timestamp); + c.bench_function(t, |b| { + b.iter(|| string_to_timestamp_nanos(t).unwrap()); + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index ae901665473d..35bf62969851 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -4914,7 +4914,7 @@ mod tests { let err = cast_with_options(array, &to_type, &options).unwrap_err(); assert_eq!( err.to_string(), - "Cast error: Error parsing 'Not a valid date' as timestamp" + "Parser error: Error parsing timestamp from 'Not a valid date': error parsing date" ); } } @@ -7899,8 +7899,12 @@ mod tests { ]); let array = Arc::new(valid) as ArrayRef; - let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz))) - .unwrap(); + let b = cast_with_options( + &array, + &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)), + &CastOptions { safe: false }, + ) + .unwrap(); let c = b .as_any() diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 7f6ca742d345..38fb4fc29934 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
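The rewritten `parse_fixed_offset` in `arrow-array/src/timezone.rs` above accepts the `[+-]XX:XX`, `[+-]XXXX` and `[+-]XX` spellings directly instead of going through chrono's format parser. A minimal sketch of the observable behaviour through `Tz`'s `FromStr` impl (illustrative only; offsets with a seconds component stay unsupported, as the integration test below also checks):

```rust
use arrow_array::timezone::Tz;

fn main() {
    // All three fixed-offset spellings parse without needing the chrono-tz feature
    assert!("+09:00".parse::<Tz>().is_ok());
    assert!("-0530".parse::<Tz>().is_ok());
    assert!("+08".parse::<Tz>().is_ok());
    // An offset with a seconds component is not a fixed-offset form and is rejected
    assert!("+07:30:00".parse::<Tz>().is_err());
}
```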
+use arrow_array::timezone::Tz; use arrow_array::types::*; use arrow_array::{ArrowNativeTypeOp, ArrowPrimitiveType}; use arrow_buffer::ArrowNativeType; @@ -22,6 +23,116 @@ use arrow_schema::ArrowError; use chrono::prelude::*; use std::str::FromStr; +/// Helper for parsing timestamps +struct TimestampParser { + /// The timestamp bytes to parse minus `b'0'` + /// + /// This makes interpretation as an integer inexpensive + digits: [u8; 32], + /// A mask containing a `1` bit where the corresponding byte is a valid ASCII digit + mask: u32, +} + +impl TimestampParser { + fn new(bytes: &[u8]) -> Self { + let mut digits = [0; 32]; + let mut mask = 0; + + // Treating all bytes the same way, helps LLVM vectorise this correctly + for (idx, (o, i)) in digits.iter_mut().zip(bytes).enumerate() { + *o = i.wrapping_sub(b'0'); + mask |= ((*o < 10) as u32) << idx + } + + Self { digits, mask } + } + + /// Returns true if the byte at `idx` in the original string equals `b` + fn test(&self, idx: usize, b: u8) -> bool { + self.digits[idx] == b.wrapping_sub(b'0') + } + + /// Parses a date of the form `1997-01-31` + fn date(&self) -> Option { + if self.mask & 0b1111111111 != 0b1101101111 + || !self.test(4, b'-') + || !self.test(7, b'-') + { + return None; + } + + let year = self.digits[0] as u16 * 1000 + + self.digits[1] as u16 * 100 + + self.digits[2] as u16 * 10 + + self.digits[3] as u16; + + let month = self.digits[5] * 10 + self.digits[6]; + let day = self.digits[8] * 10 + self.digits[9]; + + NaiveDate::from_ymd_opt(year as _, month as _, day as _) + } + + /// Parses a time of any of forms + /// - `09:26:56` + /// - `09:26:56.123` + /// - `09:26:56.123456` + /// - `09:26:56.123456789` + /// - `092656` + /// + /// Returning the end byte offset + fn time(&self) -> Option<(NaiveTime, usize)> { + match (self.mask >> 11) & 0b11111111 { + // 09:26:56 + 0b11011011 if self.test(13, b':') && self.test(16, b':') => { + let hour = self.digits[11] * 10 + self.digits[12]; + let minute = self.digits[14] * 10 + self.digits[15]; + let second = self.digits[17] * 10 + self.digits[18]; + let time = NaiveTime::from_hms_opt(hour as _, minute as _, second as _)?; + + let millis = || { + self.digits[20] as u32 * 100_000_000 + + self.digits[21] as u32 * 10_000_000 + + self.digits[22] as u32 * 1_000_000 + }; + + let micros = || { + self.digits[23] as u32 * 100_000 + + self.digits[24] as u32 * 10_000 + + self.digits[25] as u32 * 1_000 + }; + + let nanos = || { + self.digits[26] as u32 * 100 + + self.digits[27] as u32 * 10 + + self.digits[28] as u32 + }; + + match self.test(19, b'.') { + true => match (self.mask >> 20).trailing_ones() { + 3 => Some((time.with_nanosecond(millis())?, 23)), + 6 => Some((time.with_nanosecond(millis() + micros())?, 26)), + 9 => Some(( + time.with_nanosecond(millis() + micros() + nanos())?, + 29, + )), + _ => None, + }, + false => Some((time, 19)), + } + } + // 092656 + 0b111111 => { + let hour = self.digits[11] * 10 + self.digits[12]; + let minute = self.digits[13] * 10 + self.digits[14]; + let second = self.digits[15] * 10 + self.digits[16]; + let time = NaiveTime::from_hms_opt(hour as _, minute as _, second as _)?; + Some((time, 17)) + } + _ => None, + } + } +} + /// Accepts a string and parses it relative to the provided `timezone` /// /// In addition to RFC3339 / ISO8601 standard timestamps, it also @@ -32,105 +143,83 @@ use std::str::FromStr; /// * `1997-01-31T09:26:56.123Z` # RCF3339 /// * `1997-01-31T09:26:56.123-05:00` # RCF3339 /// * `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a 
space rather than T +/// * `2023-01-01 04:05:06.789 -08` # close to RCF3339, no fractional seconds or time separator /// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified /// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset /// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds +/// * `1997-01-31 092656` # close to RCF3339, no fractional seconds +/// * `1997-01-31 092656+04:00` # close to RCF3339, no fractional seconds or time separator /// * `1997-01-31` # close to RCF3339, only date no time /// -/// Some formats that supported by PostgresSql -/// still not supported by chrono, like -/// "2023-01-01 040506 America/Los_Angeles", -/// "2023-01-01 04:05:06.789 +07:30:00", -/// "2023-01-01 040506 +07:30:00", -/// "2023-01-01 04:05:06.789 PST", -/// "2023-01-01 04:05:06.789 -08", +/// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled +/// +/// * `2023-01-01 040506 America/Los_Angeles` +/// +/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error +/// will be returned +/// +/// Some formats supported by PostgresSql +/// are not supported, like +/// +/// * "2023-01-01 04:05:06.789 +07:30:00", +/// * "2023-01-01 040506 +07:30:00", +/// * "2023-01-01 04:05:06.789 PST", +/// +/// [IANA timezones]: https://www.iana.org/time-zones pub fn string_to_datetime( timezone: &T, s: &str, ) -> Result, ArrowError> { - // Fast path: RFC3339 timestamp (with a T) - // Example: 2020-09-08T13:42:29.190855Z - if let Ok(ts) = DateTime::parse_from_rfc3339(s) { - return Ok(ts.with_timezone(timezone)); - } - - // Implement quasi-RFC3339 support by trying to parse the - // timestamp with various other format specifiers to to support - // separating the date and time with a space ' ' rather than 'T' to be - // (more) compatible with Apache Spark SQL - - let supported_formats = vec![ - "%Y-%m-%d %H:%M:%S%.f%:z", // Example: 2020-09-08 13:42:29.190855-05:00 - "%Y-%m-%d %H%M%S%.3f%:z", // Example: "2023-01-01 040506 +07:30" - ]; - - for f in supported_formats.iter() { - if let Ok(ts) = DateTime::parse_from_str(s, f) { - return Ok(ts.with_timezone(timezone)); - } - } + let err = |ctx: &str| { + ArrowError::ParseError(format!("Error parsing timestamp from '{s}': {ctx}")) + }; - // with an explicit Z, using ' ' as a separator - // Example: 2020-09-08 13:42:29Z - if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") { - return Ok(ts.with_timezone(timezone)); + let bytes = s.as_bytes(); + if bytes.len() < 10 { + return Err(err("timestamp must contain at least 10 characters")); } - // Support timestamps without an explicit timezone offset, again - // to be compatible with what Apache Spark SQL does. 
+ let parser = TimestampParser::new(bytes); + let date = parser.date().ok_or_else(|| err("error parsing date"))?; + if bytes.len() == 10 { + let offset = timezone.offset_from_local_date(&date); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; - // without a timezone specifier as a local time, using T as a separator - // Example: 2020-09-08T13:42:29.190855 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f") { - if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { - return Ok(DateTime::from_local(ts, offset)); - } + let time = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); + return Ok(DateTime::from_local(date.and_time(time), offset)); } - // without a timezone specifier as a local time, using T as a - // separator, no fractional seconds - // Example: 2020-09-08T13:42:29 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") { - if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { - return Ok(DateTime::from_local(ts, offset)); - } + if !parser.test(10, b'T') && !parser.test(10, b' ') { + return Err(err("invalid timestamp separator")); } - // without a timezone specifier as a local time, using ' ' as a separator - // Example: 2020-09-08 13:42:29.190855 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f") { - if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { - return Ok(DateTime::from_local(ts, offset)); - } + let (time, tz_offset) = parser.time().ok_or_else(|| err("error parsing time"))?; + let datetime = date.and_time(time); + if bytes.len() <= tz_offset { + let offset = timezone.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; + return Ok(DateTime::from_local(datetime, offset)); } - // without a timezone specifier as a local time, using ' ' as a - // separator, no fractional seconds - // Example: 2020-09-08 13:42:29 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") { - if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { - return Ok(DateTime::from_local(ts, offset)); - } + if bytes[tz_offset] == b'z' || bytes[tz_offset] == b'Z' { + let offset = timezone.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; + return Ok(DateTime::from_utc(datetime, offset)); } - // without a timezone specifier as a local time, only date - // Example: 2020-09-08 - if let Ok(dt) = NaiveDate::parse_from_str(s, "%Y-%m-%d") { - if let Some(ts) = dt.and_hms_opt(0, 0, 0) { - if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { - return Ok(DateTime::from_local(ts, offset)); - } - } - } - - // Note we don't pass along the error message from the underlying - // chrono parsing because we tried several different format - // strings and we don't know which the user was trying to - // match. 
Ths any of the specific error messages is likely to be - // be more confusing than helpful - Err(ArrowError::CastError(format!( - "Error parsing '{s}' as timestamp" - ))) + // Parse remainder of string as timezone + let parsed_tz: Tz = s[tz_offset..].trim_start().parse()?; + let offset = parsed_tz.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; + Ok(DateTime::::from_local(datetime, offset).with_timezone(timezone)) } /// Accepts a string in RFC3339 / ISO8601 standard format and some @@ -418,19 +507,20 @@ const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented a impl Parser for Date32Type { fn parse(string: &str) -> Option { - let date = string.parse::().ok()?; + let parser = TimestampParser::new(string.as_bytes()); + let date = parser.date()?; Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) } fn parse_formatted(string: &str, format: &str) -> Option { - let date = chrono::NaiveDate::parse_from_str(string, format).ok()?; + let date = NaiveDate::parse_from_str(string, format).ok()?; Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) } } impl Parser for Date64Type { fn parse(string: &str) -> Option { - let date_time = string.parse::().ok()?; + let date_time = string_to_datetime(&Utc, string).ok()?; Some(date_time.timestamp_millis()) } @@ -896,14 +986,35 @@ mod tests { #[test] fn string_to_timestamp_invalid() { // Test parsing invalid formats + let cases = [ + ("", "timestamp must contain at least 10 characters"), + ("SS", "timestamp must contain at least 10 characters"), + ("Wed, 18 Feb 2015 23:16:09 GMT", "error parsing date"), + ("1997-01-31H09:26:56.123Z", "invalid timestamp separator"), + ("1997-01-31 09:26:56.123Z", "error parsing time"), + ("1997:01:31T09:26:56.123Z", "error parsing date"), + ("1997:1:31T09:26:56.123Z", "error parsing date"), + ("1997-01-32T09:26:56.123Z", "error parsing date"), + ("1997-13-32T09:26:56.123Z", "error parsing date"), + ("1997-02-29T09:26:56.123Z", "error parsing date"), + ("2015-02-30T17:35:20-08:00", "error parsing date"), + ("1997-01-10T9:26:56.123Z", "error parsing time"), + ("2015-01-20T25:35:20-08:00", "error parsing time"), + ("1997-01-10T09:61:56.123Z", "error parsing time"), + ("1997-01-10T09:61:90.123Z", "error parsing time"), + ("1997-01-10T12:00:56.12Z", "error parsing time"), + ("1997-01-10T12:00:56.1234Z", "error parsing time"), + ("1997-01-10T12:00:56.12345Z", "error parsing time"), + ("1997-01-10T12:00:6.123Z", "error parsing time"), + ("1997-01-31T092656.123Z", "error parsing time"), + ]; - // It would be nice to make these messages better - expect_timestamp_parse_error("", "Error parsing '' as timestamp"); - expect_timestamp_parse_error("SS", "Error parsing 'SS' as timestamp"); - expect_timestamp_parse_error( - "Wed, 18 Feb 2015 23:16:09 GMT", - "Error parsing 'Wed, 18 Feb 2015 23:16:09 GMT' as timestamp", - ); + for (s, ctx) in cases { + let expected = + format!("Parser error: Error parsing timestamp from '{s}': {ctx}"); + let actual = string_to_datetime(&Utc, s).unwrap_err().to_string(); + assert_eq!(actual, expected) + } } // Parse a timestamp to timestamp int with a useful human readable error message @@ -915,18 +1026,6 @@ mod tests { result } - fn expect_timestamp_parse_error(s: &str, expected_err: &str) { - match string_to_timestamp_nanos(s) { - Ok(v) => panic!( - "Expected error '{expected_err}' while parsing '{s}', but parsed {v} instead" - ), - Err(e) => { - assert!(e.to_string().contains(expected_err), - "Can not find 
expected error '{expected_err}' while parsing '{s}'. Actual error '{e}'"); - } - } - } - #[test] fn string_without_timezone_to_timestamp() { // string without timezone should always output the same regardless the local or session timezone diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 08fc5513d64f..587f42d7e58f 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -296,3 +296,7 @@ required-features = ["pyarrow"] [[test]] name = "array_cast" required-features = ["chrono-tz"] + +[[test]] +name = "timezone" +required-features = ["chrono-tz"] diff --git a/arrow/tests/timezone.rs b/arrow/tests/timezone.rs new file mode 100644 index 000000000000..b71d04d64be0 --- /dev/null +++ b/arrow/tests/timezone.rs @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_cast::parse::string_to_datetime; +use chrono::Utc; + +#[test] +fn test_parse_timezone() { + let cases = [ + ( + "2023-01-01 040506 America/Los_Angeles", + "2023-01-01T12:05:06+00:00", + ), + ( + "2023-01-01 04:05:06.345 America/Los_Angeles", + "2023-01-01T12:05:06.345+00:00", + ), + ( + "2023-01-01 04:05:06.345 America/Los_Angeles", + "2023-01-01T12:05:06.345+00:00", + ), + ( + "2023-01-01 04:05:06.789 -08", + "2023-01-01T12:05:06.789+00:00", + ), + ( + "2023-03-12 040506 America/Los_Angeles", + "2023-03-12T11:05:06+00:00", + ), // Daylight savings + ]; + + for (s, expected) in cases { + let actual = string_to_datetime(&Utc, s).unwrap().to_rfc3339(); + assert_eq!(actual, expected, "{s}") + } +} + +#[test] +fn test_parse_timezone_invalid() { + let cases = [ + ( + "2015-01-20T17:35:20-24:00", + "Parser error: Invalid timezone \"-24:00\": '-24:00' is not a valid timezone", + ), + ( + "2023-01-01 04:05:06.789 +07:30:00", + "Parser error: Invalid timezone \"+07:30:00\": '+07:30:00' is not a valid timezone" + ), + ( + // Sunday, 12 March 2023, 02:00:00 clocks are turned forward 1 hour to + // Sunday, 12 March 2023, 03:00:00 local daylight time instead. + "2023-03-12 02:05:06 America/Los_Angeles", + "Parser error: Error parsing timestamp from '2023-03-12 02:05:06 America/Los_Angeles': error computing timezone offset", + ), + ( + // Sunday, 5 November 2023, 02:00:00 clocks are turned backward 1 hour to + // Sunday, 5 November 2023, 01:00:00 local standard time instead. 
+ "2023-11-05 01:30:06 America/Los_Angeles", + "Parser error: Error parsing timestamp from '2023-11-05 01:30:06 America/Los_Angeles': error computing timezone offset", + ), + ]; + + for (s, expected) in cases { + let actual = string_to_datetime(&Utc, s).unwrap_err().to_string(); + assert_eq!(actual, expected) + } +} From defa599bf26647553c8b5884b813ef1e2368ffae Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 9 Mar 2023 17:17:15 +0100 Subject: [PATCH 0670/1411] Move prettyprint to arrow-cast (#3828) * More prettyprint to arrow-cast * Fix flight_sql_client.rs * Fix flight_sql_server * Format * Test schema --- arrow-cast/Cargo.toml | 8 + arrow-cast/src/lib.rs | 3 + {arrow/src/util => arrow-cast/src}/pretty.rs | 224 ++++++------------- arrow-flight/Cargo.toml | 5 +- arrow-flight/examples/flight_sql_server.rs | 9 +- arrow-flight/src/bin/flight_sql_client.rs | 5 +- arrow-flight/src/encode.rs | 17 +- arrow-flight/tests/encode_decode.rs | 13 +- arrow/Cargo.toml | 9 +- arrow/src/util/mod.rs | 2 +- arrow/tests/array_cast.rs | 57 +++++ 11 files changed, 165 insertions(+), 187 deletions(-) rename {arrow/src/util => arrow-cast/src}/pretty.rs (82%) diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index c383369c4403..79c073b9dd4f 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -37,6 +37,12 @@ name = "arrow_cast" path = "src/lib.rs" bench = false +[package.metadata.docs.rs] +features = ["prettyprint"] + +[features] +prettyprint = ["comfy-table"] + [dependencies] arrow-array = { version = "34.0.0", path = "../arrow-array" } arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } @@ -46,9 +52,11 @@ arrow-select = { version = "34.0.0", path = "../arrow-select" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } +comfy-table = { version = "6.0", optional = true, default-features = false } [dev-dependencies] criterion = { version = "0.4", default-features = false } +half = { version = "2.1", default-features = false } [build-dependencies] diff --git a/arrow-cast/src/lib.rs b/arrow-cast/src/lib.rs index 397e5667e6ea..d2677a0e0a53 100644 --- a/arrow-cast/src/lib.rs +++ b/arrow-cast/src/lib.rs @@ -21,3 +21,6 @@ pub mod cast; pub use cast::*; pub mod display; pub mod parse; + +#[cfg(feature = "prettyprint")] +pub mod pretty; diff --git a/arrow/src/util/pretty.rs b/arrow-cast/src/pretty.rs similarity index 82% rename from arrow/src/util/pretty.rs rename to arrow-cast/src/pretty.rs index 21d035826851..5e7715eec832 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow-cast/src/pretty.rs @@ -15,19 +15,19 @@ // specific language governing permissions and limitations // under the License. -//! Utilities for printing record batches. Note this module is not +//! Utilities for pretty printing record batches. Note this module is not //! available unless `feature = "prettyprint"` is enabled. 
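For orientation (not part of the diff): after this move, downstream users enable the new `prettyprint` feature on `arrow-cast` (e.g. `arrow-cast = { version = "34.0.0", features = ["prettyprint"] }`) and import from `arrow_cast::pretty` rather than `arrow::util::pretty`. A minimal sketch, assuming `arrow-array` and `arrow-schema` are also available:

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use arrow_cast::pretty::pretty_format_batches;
use arrow_schema::ArrowError;
use std::sync::Arc;

fn main() -> Result<(), ArrowError> {
    // Build a single-column batch and render it as an ASCII table.
    let col = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef;
    let batch = RecordBatch::try_from_iter(vec![("a", col)])?;
    println!("{}", pretty_format_batches(&[batch])?);
    Ok(())
}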
-use crate::{array::ArrayRef, record_batch::RecordBatch}; -use arrow_array::Array; -use arrow_cast::display::{ArrayFormatter, FormatOptions}; +use crate::display::{ArrayFormatter, FormatOptions}; +use arrow_array::{Array, ArrayRef, RecordBatch}; +use arrow_schema::ArrowError; use comfy_table::{Cell, Table}; use std::fmt::Display; -use crate::error::Result; - /// Create a visual representation of record batches -pub fn pretty_format_batches(results: &[RecordBatch]) -> Result { +pub fn pretty_format_batches( + results: &[RecordBatch], +) -> Result { let options = FormatOptions::default().with_display_error(true); pretty_format_batches_with_options(results, &options) } @@ -36,7 +36,7 @@ pub fn pretty_format_batches(results: &[RecordBatch]) -> Result { pub fn pretty_format_batches_with_options( results: &[RecordBatch], options: &FormatOptions, -) -> Result { +) -> Result { create_table(results, options) } @@ -44,7 +44,7 @@ pub fn pretty_format_batches_with_options( pub fn pretty_format_columns( col_name: &str, results: &[ArrayRef], -) -> Result { +) -> Result { let options = FormatOptions::default().with_display_error(true); pretty_format_columns_with_options(col_name, results, &options) } @@ -53,24 +53,27 @@ pub fn pretty_format_columns_with_options( col_name: &str, results: &[ArrayRef], options: &FormatOptions, -) -> Result { +) -> Result { create_column(col_name, results, options) } /// Prints a visual representation of record batches to stdout -pub fn print_batches(results: &[RecordBatch]) -> Result<()> { +pub fn print_batches(results: &[RecordBatch]) -> Result<(), ArrowError> { println!("{}", pretty_format_batches(results)?); Ok(()) } /// Prints a visual representation of a list of column to stdout -pub fn print_columns(col_name: &str, results: &[ArrayRef]) -> Result<()> { +pub fn print_columns(col_name: &str, results: &[ArrayRef]) -> Result<(), ArrowError> { println!("{}", pretty_format_columns(col_name, results)?); Ok(()) } /// Convert a series of record batches into a table -fn create_table(results: &[RecordBatch], options: &FormatOptions) -> Result
<Table>
{ +fn create_table( + results: &[RecordBatch], + options: &FormatOptions, +) -> Result { let mut table = Table::new(); table.load_preset("||--+-++| ++++++"); @@ -91,7 +94,7 @@ fn create_table(results: &[RecordBatch], options: &FormatOptions) -> Result>>()?; + .collect::, ArrowError>>()?; for row in 0..batch.num_rows() { let mut cells = Vec::new(); @@ -109,7 +112,7 @@ fn create_column( field: &str, columns: &[ArrayRef], options: &FormatOptions, -) -> Result
{ +) -> Result { let mut table = Table::new(); table.load_preset("||--+-++| ++++++"); @@ -133,31 +136,20 @@ fn create_column( #[cfg(test)] mod tests { - use crate::{ - array::{ - self, new_null_array, Array, Date32Array, Date64Array, - FixedSizeBinaryBuilder, Float16Array, Int32Array, StringArray, - StringDictionaryBuilder, StructArray, Time32MillisecondArray, - Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, - TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, UnionArray, UnionBuilder, - }, - buffer::Buffer, - datatypes::{DataType, Field, Float64Type, Int32Type, Schema, UnionMode}, - }; use super::*; - use crate::array::{Decimal128Array, FixedSizeListBuilder}; + use crate::display::array_value_to_string; + use arrow_array::builder::*; + use arrow_array::types::*; + use arrow_array::*; + use arrow_buffer::Buffer; + use arrow_schema::*; + use half::f16; use std::fmt::Write; use std::sync::Arc; - use arrow_array::builder::PrimitiveBuilder; - use arrow_array::types::{ArrowTimestampType, TimestampSecondType}; - use arrow_cast::display::array_value_to_string; - use half::f16; - #[test] - fn test_pretty_format_batches() -> Result<()> { + fn test_pretty_format_batches() { // define a schema. let schema = Arc::new(Schema::new(vec![ Field::new("a", DataType::Utf8, true), @@ -181,9 +173,10 @@ mod tests { Some(100), ])), ], - )?; + ) + .unwrap(); - let table = pretty_format_batches(&[batch])?.to_string(); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ "+---+-----+", @@ -199,12 +192,10 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); assert_eq!(expected, actual, "Actual result:\n{table}"); - - Ok(()) } #[test] - fn test_pretty_format_columns() -> Result<()> { + fn test_pretty_format_columns() { let columns = vec![ Arc::new(array::StringArray::from(vec![ Some("a"), @@ -215,7 +206,7 @@ mod tests { Arc::new(array::StringArray::from(vec![Some("e"), None, Some("g")])), ]; - let table = pretty_format_columns("a", &columns)?.to_string(); + let table = pretty_format_columns("a", &columns).unwrap().to_string(); let expected = vec![ "+---+", "| a |", "+---+", "| a |", "| b |", "| |", "| d |", "| e |", @@ -225,8 +216,6 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); assert_eq!(expected, actual, "Actual result:\n{table}"); - - Ok(()) } #[test] @@ -266,7 +255,7 @@ mod tests { } #[test] - fn test_pretty_format_dictionary() -> Result<()> { + fn test_pretty_format_dictionary() { // define a schema. let field_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); @@ -274,14 +263,14 @@ mod tests { let mut builder = StringDictionaryBuilder::::new(); - builder.append("one")?; + builder.append_value("one"); builder.append_null(); - builder.append("three")?; + builder.append_value("three"); let array = Arc::new(builder.finish()); - let batch = RecordBatch::try_new(schema, vec![array])?; + let batch = RecordBatch::try_new(schema, vec![array]).unwrap(); - let table = pretty_format_batches(&[batch])?.to_string(); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ "+-------+", @@ -296,12 +285,10 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); assert_eq!(expected, actual, "Actual result:\n{table}"); - - Ok(()) } #[test] - fn test_pretty_format_fixed_size_list() -> Result<()> { + fn test_pretty_format_fixed_size_list() { // define a schema. 
let field_type = DataType::FixedSizeList( Box::new(Field::new("item", DataType::Int32, true)), @@ -321,8 +308,8 @@ mod tests { let array = Arc::new(builder.finish()); - let batch = RecordBatch::try_new(schema, vec![array])?; - let table = pretty_format_batches(&[batch])?.to_string(); + let batch = RecordBatch::try_new(schema, vec![array]).unwrap(); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ "+-----------+", "| d1 |", @@ -336,12 +323,10 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); assert_eq!(expected, actual, "Actual result:\n{table}"); - - Ok(()) } #[test] - fn test_pretty_format_fixed_size_binary() -> Result<()> { + fn test_pretty_format_fixed_size_binary() { // define a schema. let field_type = DataType::FixedSizeBinary(3); let schema = Arc::new(Schema::new(vec![Field::new("d1", field_type, true)])); @@ -354,8 +339,8 @@ mod tests { let array = Arc::new(builder.finish()); - let batch = RecordBatch::try_new(schema, vec![array])?; - let table = pretty_format_batches(&[batch])?.to_string(); + let batch = RecordBatch::try_new(schema, vec![array]).unwrap(); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ "+--------+", "| d1 |", @@ -369,8 +354,6 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); assert_eq!(expected, actual, "Actual result:\n{table}"); - - Ok(()) } /// Generate an array with type $ARRAYTYPE with a numeric value of @@ -419,40 +402,6 @@ mod tests { RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap() } - #[test] - #[cfg(features = "chrono-tz")] - fn test_pretty_format_timestamp_second_with_utc_timezone() { - let batch = timestamp_batch::("UTC", 11111111); - let table = pretty_format_batches(&[batch]).unwrap().to_string(); - let expected = vec![ - "+---------------------------+", - "| f |", - "+---------------------------+", - "| 1970-05-09T14:25:11+00:00 |", - "| |", - "+---------------------------+", - ]; - let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n\n{actual:#?}\n\n"); - } - - #[test] - #[cfg(features = "chrono-tz")] - fn test_pretty_format_timestamp_second_with_non_utc_timezone() { - let batch = timestamp_batch::("Asia/Taipei", 11111111); - let table = pretty_format_batches(&[batch]).unwrap().to_string(); - let expected = vec![ - "+---------------------------+", - "| f |", - "+---------------------------+", - "| 1970-05-09T22:25:11+08:00 |", - "| |", - "+---------------------------+", - ]; - let actual: Vec<&str> = table.lines().collect(); - assert_eq!(expected, actual, "Actual result:\n\n{actual:#?}\n\n"); - } - #[test] fn test_pretty_format_timestamp_second_with_fixed_offset_timezone() { let batch = timestamp_batch::("+08:00", 11111111); @@ -470,22 +419,6 @@ mod tests { assert_eq!(expected, actual, "Actual result:\n\n{actual:#?}\n\n"); } - #[test] - #[cfg(not(feature = "chrono-tz"))] - fn test_pretty_format_timestamp_second_with_incorrect_fixed_offset_timezone() { - let batch = timestamp_batch::("08:00", 11111111); - let err = pretty_format_batches(&[batch]).err().unwrap().to_string(); - assert_eq!(err, "Parser error: Invalid timezone \"08:00\": only offset based timezones supported without chrono-tz feature"); - } - - #[test] - #[cfg(not(feature = "chrono-tz"))] - fn test_pretty_format_timestamp_second_with_unknown_timezone() { - let batch = timestamp_batch::("unknown", 11111111); - let err = pretty_format_batches(&[batch]).err().unwrap().to_string(); - assert_eq!(err, "Parser error: Invalid 
timezone \"unknown\": only offset based timezones supported without chrono-tz feature"); - } - #[test] fn test_pretty_format_timestamp_second() { let expected = vec![ @@ -617,7 +550,7 @@ mod tests { } #[test] - fn test_int_display() -> Result<()> { + fn test_int_display() { let array = Arc::new(Int32Array::from(vec![6, 3])) as ArrayRef; let actual_one = array_value_to_string(&array, 0).unwrap(); let expected_one = "6"; @@ -626,11 +559,10 @@ mod tests { let expected_two = "3"; assert_eq!(actual_one, expected_one); assert_eq!(actual_two, expected_two); - Ok(()) } #[test] - fn test_decimal_display() -> Result<()> { + fn test_decimal_display() { let precision = 10; let scale = 2; @@ -648,9 +580,9 @@ mod tests { true, )])); - let batch = RecordBatch::try_new(schema, vec![dm])?; + let batch = RecordBatch::try_new(schema, vec![dm]).unwrap(); - let table = pretty_format_batches(&[batch])?.to_string(); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ "+-------+", @@ -665,12 +597,10 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); assert_eq!(expected, actual, "Actual result:\n{table}"); - - Ok(()) } #[test] - fn test_decimal_display_zero_scale() -> Result<()> { + fn test_decimal_display_zero_scale() { let precision = 5; let scale = 0; @@ -688,9 +618,9 @@ mod tests { true, )])); - let batch = RecordBatch::try_new(schema, vec![dm])?; + let batch = RecordBatch::try_new(schema, vec![dm]).unwrap(); - let table = pretty_format_batches(&[batch])?.to_string(); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ "+------+", "| f |", "+------+", "| 101 |", "| |", "| 200 |", "| 3040 |", "+------+", @@ -698,12 +628,10 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); assert_eq!(expected, actual, "Actual result:\n{table}"); - - Ok(()) } #[test] - fn test_pretty_format_struct() -> Result<()> { + fn test_pretty_format_struct() { let schema = Schema::new(vec![ Field::new( "c1", @@ -744,7 +672,7 @@ mod tests { RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]) .unwrap(); - let table = pretty_format_batches(&[batch])?.to_string(); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ "+--------------------------+----+", "| c1 | c2 |", @@ -757,12 +685,10 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); assert_eq!(expected, actual, "Actual result:\n{table}"); - - Ok(()) } #[test] - fn test_pretty_format_dense_union() -> Result<()> { + fn test_pretty_format_dense_union() { let mut builder = UnionBuilder::new_dense(); builder.append::("a", 1).unwrap(); builder.append::("b", 3.2234).unwrap(); @@ -785,7 +711,7 @@ mod tests { let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(union)]).unwrap(); - let table = pretty_format_batches(&[batch])?.to_string(); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let actual: Vec<&str> = table.lines().collect(); let expected = vec![ "+------------+", @@ -799,11 +725,10 @@ mod tests { ]; assert_eq!(expected, actual); - Ok(()) } #[test] - fn test_pretty_format_sparse_union() -> Result<()> { + fn test_pretty_format_sparse_union() { let mut builder = UnionBuilder::new_sparse(); builder.append::("a", 1).unwrap(); builder.append::("b", 3.2234).unwrap(); @@ -826,7 +751,7 @@ mod tests { let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(union)]).unwrap(); - let table = pretty_format_batches(&[batch])?.to_string(); + let table = 
pretty_format_batches(&[batch]).unwrap().to_string(); let actual: Vec<&str> = table.lines().collect(); let expected = vec![ "+------------+", @@ -840,11 +765,10 @@ mod tests { ]; assert_eq!(expected, actual); - Ok(()) } #[test] - fn test_pretty_format_nested_union() -> Result<()> { + fn test_pretty_format_nested_union() { //Inner UnionArray let mut builder = UnionBuilder::new_dense(); builder.append::("b", 1).unwrap(); @@ -890,7 +814,7 @@ mod tests { let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(outer)]).unwrap(); - let table = pretty_format_batches(&[batch])?.to_string(); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let actual: Vec<&str> = table.lines().collect(); let expected = vec![ "+-----------------------------+", @@ -904,11 +828,10 @@ mod tests { "+-----------------------------+", ]; assert_eq!(expected, actual); - Ok(()) } #[test] - fn test_writing_formatted_batches() -> Result<()> { + fn test_writing_formatted_batches() { // define a schema. let schema = Arc::new(Schema::new(vec![ Field::new("a", DataType::Utf8, true), @@ -932,10 +855,11 @@ mod tests { Some(100), ])), ], - )?; + ) + .unwrap(); let mut buf = String::new(); - write!(&mut buf, "{}", pretty_format_batches(&[batch])?).unwrap(); + write!(&mut buf, "{}", pretty_format_batches(&[batch]).unwrap()).unwrap(); let s = vec![ "+---+-----+", @@ -949,12 +873,10 @@ mod tests { ]; let expected = s.join("\n"); assert_eq!(expected, buf); - - Ok(()) } #[test] - fn test_float16_display() -> Result<()> { + fn test_float16_display() { let values = vec![ Some(f16::from_f32(f32::NAN)), Some(f16::from_f32(4.0)), @@ -968,9 +890,9 @@ mod tests { true, )])); - let batch = RecordBatch::try_new(schema, vec![array])?; + let batch = RecordBatch::try_new(schema, vec![array]).unwrap(); - let table = pretty_format_batches(&[batch])?.to_string(); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ "+------+", "| f16 |", "+------+", "| NaN |", "| 4 |", "| -inf |", @@ -979,12 +901,10 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); assert_eq!(expected, actual, "Actual result:\n{table}"); - - Ok(()) } #[test] - fn test_pretty_format_interval_day_time() -> Result<()> { + fn test_pretty_format_interval_day_time() { let arr = Arc::new(arrow_array::IntervalDayTimeArray::from(vec![ Some(-600000), Some(4294966295), @@ -1000,9 +920,9 @@ mod tests { true, )])); - let batch = RecordBatch::try_new(schema, vec![arr])?; + let batch = RecordBatch::try_new(schema, vec![arr]).unwrap(); - let table = pretty_format_batches(&[batch])?.to_string(); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ "+----------------------------------------------------+", @@ -1020,12 +940,10 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); assert_eq!(expected, actual, "Actual result:\n{table}"); - - Ok(()) } #[test] - fn test_pretty_format_interval_month_day_nano_array() -> Result<()> { + fn test_pretty_format_interval_month_day_nano_array() { let arr = Arc::new(arrow_array::IntervalMonthDayNanoArray::from(vec![ Some(-600000000000), Some(18446744072709551615), @@ -1048,9 +966,9 @@ mod tests { true, )])); - let batch = RecordBatch::try_new(schema, vec![arr])?; + let batch = RecordBatch::try_new(schema, vec![arr]).unwrap(); - let table = pretty_format_batches(&[batch])?.to_string(); + let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ "+-----------------------------------------------------------+", @@ -1075,8 
+993,6 @@ mod tests { let actual: Vec<&str> = table.lines().collect(); assert_eq!(expected, actual, "Actual result:\n{table}"); - - Ok(()) } #[test] diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index db9f0a023bf4..819818191c83 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -42,7 +42,6 @@ tokio = { version = "1.0", default-features = false, features = ["macros", "rt", futures = { version = "0.3", default-features = false, features = ["alloc"] } # CLI-related dependencies -arrow = { version = "34.0.0", path = "../arrow", optional = true } clap = { version = "4.1", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } tracing-log = { version = "0.1", optional = true } tracing-subscriber = { version = "0.3.1", default-features = false, features = ["ansi", "fmt"], optional = true } @@ -56,10 +55,10 @@ flight-sql-experimental = [] tls = ["tonic/tls"] # Enable CLI tools -cli = ["arrow/prettyprint", "clap", "tracing-log", "tracing-subscriber", "tonic/tls-webpki-roots"] +cli = ["arrow-cast/prettyprint", "clap", "tracing-log", "tracing-subscriber", "tonic/tls-webpki-roots"] [dev-dependencies] -arrow = { version = "34.0.0", path = "../arrow", features = ["prettyprint"] } +arrow-cast = { version = "34.0.0", path = "../arrow-cast", features = ["prettyprint"] } assert_cmd = "2.0.8" tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 425ceab42779..ac38b0232f74 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -499,7 +499,6 @@ impl ProstMessageExt for FetchResults { } #[cfg(test)] -#[allow(unused_imports)] mod tests { use super::*; use futures::TryStreamExt; @@ -509,15 +508,13 @@ mod tests { use tokio::net::{UnixListener, UnixStream}; use tokio::time::sleep; use tokio_stream::wrappers::UnixListenerStream; - use tonic::body::BoxBody; - use tonic::codegen::{http, Body, Service}; use tonic::transport::ClientTlsConfig; - use arrow::util::pretty::pretty_format_batches; + use arrow_cast::pretty::pretty_format_batches; use arrow_flight::sql::client::FlightSqlServiceClient; use arrow_flight::utils::flight_data_to_batches; - use tonic::transport::{Certificate, Channel, Endpoint}; - use tower::{service_fn, ServiceExt}; + use tonic::transport::{Certificate, Endpoint}; + use tower::service_fn; async fn client_with_uds(path: String) -> FlightSqlServiceClient { let connector = service_fn(move |_| UnixStream::connect(path.clone())); diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs index d05efc227e2d..c6a46a387d01 100644 --- a/arrow-flight/src/bin/flight_sql_client.rs +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -17,9 +17,8 @@ use std::{sync::Arc, time::Duration}; -use arrow::error::Result; -use arrow::util::pretty::pretty_format_batches; use arrow_array::RecordBatch; +use arrow_cast::pretty::pretty_format_batches; use arrow_flight::{ sql::client::FlightSqlServiceClient, utils::flight_data_to_batches, FlightData, }; @@ -141,7 +140,7 @@ fn setup_logging() { tracing_subscriber::fmt::init(); } -async fn setup_client(args: ClientArgs) -> Result { +async fn setup_client(args: ClientArgs) -> Result { let port = args.port.unwrap_or(if args.tls { 443 } else { 80 }); let protocol = if args.tls { "https" } else { "http" }; diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs 
index 557663922121..6432965032c1 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -453,14 +453,9 @@ fn hydrate_dictionary(array: &ArrayRef) -> Result { #[cfg(test)] mod tests { - use arrow::{ - array::{UInt32Array, UInt8Array}, - compute::concat_batches, - datatypes::Int32Type, - }; - use arrow_array::{ - DictionaryArray, Int16Array, Int32Array, Int64Array, StringArray, UInt64Array, - }; + use arrow_array::types::*; + use arrow_array::*; + use arrow_cast::pretty::pretty_format_batches; use std::collections::HashMap; use super::*; @@ -469,7 +464,7 @@ mod tests { /// ensure only the batch's used data (not the allocated data) is sent /// fn test_encode_flight_data() { - let options = arrow::ipc::writer::IpcWriteOptions::default(); + let options = IpcWriteOptions::default(); let c1 = UInt32Array::from(vec![1, 2, 3, 4, 5, 6]); let batch = RecordBatch::try_from_iter(vec![("a", Arc::new(c1) as ArrayRef)]) @@ -569,7 +564,9 @@ mod tests { split.iter().map(|batch| batch.num_rows()).sum::(), n_rows ); - assert_eq!(concat_batches(&batch.schema(), &split).unwrap(), batch); + let a = pretty_format_batches(&split).unwrap().to_string(); + let b = pretty_format_batches(&[batch]).unwrap().to_string(); + assert_eq!(a, b); } #[test] diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index 8c73a516b2b0..5a8eb6c376e4 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -19,8 +19,9 @@ use std::{collections::HashMap, sync::Arc}; -use arrow::{compute::concat_batches, datatypes::Int32Type}; +use arrow_array::types::Int32Type; use arrow_array::{ArrayRef, DictionaryArray, Float64Array, RecordBatch, UInt8Array}; +use arrow_cast::pretty::pretty_format_batches; use arrow_flight::{ decode::{DecodedPayload, FlightDataDecoder, FlightRecordBatchStream}, encode::FlightDataEncoderBuilder, @@ -242,9 +243,13 @@ async fn test_max_message_size_fuzz() { let decode_stream = FlightRecordBatchStream::new_from_flight_data(encode_stream); let output: Vec<_> = decode_stream.try_collect().await.expect("encode / decode"); - let input_batch = concat_batches(&input[0].schema(), &input).unwrap(); - let output_batch = concat_batches(&output[0].schema(), &output).unwrap(); - assert_eq!(input_batch, output_batch); + for b in &output { + assert_eq!(b.schema(), input[0].schema()); + } + + let a = pretty_format_batches(&input).unwrap().to_string(); + let b = pretty_format_batches(&output).unwrap().to_string(); + assert_eq!(a, b); } } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 587f42d7e58f..0c387f305a8e 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -60,7 +60,6 @@ arrow-select = { version = "34.0.0", path = "../arrow-select" } arrow-string = { version = "34.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } -comfy-table = { version = "6.0", optional = true, default-features = false } pyo3 = { version = "0.18", default-features = false, optional = true } [package.metadata.docs.rs] @@ -73,7 +72,7 @@ csv = ["arrow-csv"] ipc = ["arrow-ipc"] json = ["arrow-json"] simd = ["arrow-array/simd", "arrow-ord/simd", "arrow-arith/simd"] -prettyprint = ["comfy-table"] +prettyprint = ["arrow-cast/prettyprint"] # The test utils feature enables code used in benchmarks and tests but # not the core arrow code itself. 
Be aware that `rand` must be kept as # an optional dependency for supporting compile to wasm32-unknown-unknown @@ -97,12 +96,10 @@ chrono-tz = ["arrow-array/chrono-tz"] [dev-dependencies] chrono = { version = "0.4.23", default-features = false, features = ["clock"] } criterion = { version = "0.4", default-features = false } -half = { version = "2.1", default-features = false, features = ["num-traits"] } +half = { version = "2.1", default-features = false } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } tempfile = { version = "3", default-features = false } - - [build-dependencies] [[example]] @@ -295,7 +292,7 @@ required-features = ["pyarrow"] [[test]] name = "array_cast" -required-features = ["chrono-tz"] +required-features = ["chrono-tz", "prettyprint"] [[test]] name = "timezone" diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index 4369ebe7dd45..7f7257100cd9 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -25,7 +25,7 @@ pub mod bench_util; #[cfg(feature = "test_utils")] pub mod data_gen; #[cfg(feature = "prettyprint")] -pub mod pretty; +pub use arrow_cast::pretty; pub mod string_writer; #[cfg(any(test, feature = "test_utils"))] pub mod test_util; diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index dfcaa990dbb5..33695e2edeb6 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -37,6 +37,7 @@ use arrow_array::{ UInt64Array, UInt8Array, UnionArray, }; use arrow_buffer::{i256, Buffer}; +use arrow_cast::pretty::pretty_format_columns; use arrow_cast::{can_cast_types, cast}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field, IntervalUnit, TimeUnit, UnionMode}; @@ -467,3 +468,59 @@ fn test_timestamp_cast_utf8() { &expected ); } + +fn format_timezone(tz: &str) -> Result { + let array = Arc::new( + TimestampSecondArray::from(vec![Some(11111111), None]).with_timezone(tz), + ); + Ok(pretty_format_columns("f", &[array])?.to_string()) +} + +#[test] +fn test_pretty_format_timestamp_second_with_utc_timezone() { + let table = format_timezone("UTC").unwrap(); + let expected = vec![ + "+----------------------+", + "| f |", + "+----------------------+", + "| 1970-05-09T14:25:11Z |", + "| |", + "+----------------------+", + ]; + let actual: Vec<&str> = table.lines().collect(); + assert_eq!(expected, actual, "Actual result:\n\n{actual:#?}\n\n"); +} + +#[test] +fn test_pretty_format_timestamp_second_with_non_utc_timezone() { + let table = format_timezone("Asia/Taipei").unwrap(); + + let expected = vec![ + "+---------------------------+", + "| f |", + "+---------------------------+", + "| 1970-05-09T22:25:11+08:00 |", + "| |", + "+---------------------------+", + ]; + let actual: Vec<&str> = table.lines().collect(); + assert_eq!(expected, actual, "Actual result:\n\n{actual:#?}\n\n"); +} + +#[test] +fn test_pretty_format_timestamp_second_with_incorrect_fixed_offset_timezone() { + let err = format_timezone("08:00").unwrap_err().to_string(); + assert_eq!( + err, + "Parser error: Invalid timezone \"08:00\": '08:00' is not a valid timezone" + ); +} + +#[test] +fn test_pretty_format_timestamp_second_with_unknown_timezone() { + let err = format_timezone("unknown").unwrap_err().to_string(); + assert_eq!( + err, + "Parser error: Invalid timezone \"unknown\": 'unknown' is not a valid timezone" + ); +} From 2a3fd96486055883db599524965f0df2686edb5c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 9 Mar 2023 19:10:56 +0100 Subject: 
[PATCH 0671/1411] RunEndBuffer review feedback (#3825) * RunEndBuffer review feedback * Fix handling of zero-length buffers * More tests --- arrow-buffer/src/buffer/run.rs | 42 ++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/arrow-buffer/src/buffer/run.rs b/arrow-buffer/src/buffer/run.rs index a7c39638758c..6da2d689c2e3 100644 --- a/arrow-buffer/src/buffer/run.rs +++ b/arrow-buffer/src/buffer/run.rs @@ -30,8 +30,8 @@ use crate::ArrowNativeType; /// logical array, up to that physical index. /// /// Consider a [`RunEndBuffer`] containing `[3, 4, 6]`. The maximum physical index is `2`, -/// as there are `3` values, and the maximum logical index is `6`, as the maximum run end -/// is `6`. The physical indices are therefore `[0, 0, 0, 1, 1, 2, 2]` +/// as there are `3` values, and the maximum logical index is `5`, as the maximum run end +/// is `6`. The physical indices are therefore `[0, 0, 0, 1, 2, 2]` /// /// ```text /// ┌─────────┐ ┌─────────┐ ┌─────────┐ @@ -41,13 +41,11 @@ use crate::ArrowNativeType; /// ├─────────┤ ├─────────┤ │ │ ├─────────┤ /// │ 6 │ │ 2 │ ─┘ │ ┌──▶ │ 2 │ /// └─────────┘ ├─────────┤ │ │ └─────────┘ -/// run ends │ 3 │ ───┤ │ physical indices -/// ├─────────┤ │ │ -/// │ 4 │ ───┘ │ +/// run ends │ 3 │ ───┘ │ physical indices /// ├─────────┤ │ -/// │ 5 │ ─────┤ +/// │ 4 │ ─────┤ /// ├─────────┤ │ -/// │ 6 │ ─────┘ +/// │ 5 │ ─────┘ /// └─────────┘ /// logical indices /// ``` @@ -90,7 +88,7 @@ where assert!(!run_ends.is_empty(), "non-empty slice but empty run-ends"); let end = E::from_usize(offset.saturating_add(len)).unwrap(); assert!( - *run_ends.first().unwrap() >= E::usize_as(0), + *run_ends.first().unwrap() > E::usize_as(0), "run-ends not greater than 0" ); assert!( @@ -169,7 +167,7 @@ where /// Returns the physical index at which the logical array starts pub fn get_start_physical_index(&self) -> usize { - if self.offset == 0 { + if self.offset == 0 || self.len == 0 { return 0; } // Fallback to binary search @@ -178,6 +176,9 @@ where /// Returns the physical index at which the logical array ends pub fn get_end_physical_index(&self) -> usize { + if self.len == 0 { + return 0; + } if self.max_value() == self.offset + self.len { return self.values().len() - 1; } @@ -198,3 +199,26 @@ where } } } + +#[cfg(test)] +mod tests { + use crate::buffer::RunEndBuffer; + + #[test] + fn test_zero_length_slice() { + let buffer = RunEndBuffer::new(vec![1_i32, 4_i32].into(), 0, 4); + assert_eq!(buffer.get_start_physical_index(), 0); + assert_eq!(buffer.get_end_physical_index(), 1); + assert_eq!(buffer.get_physical_index(3), 1); + + for offset in 0..4 { + let sliced = buffer.slice(offset, 0); + assert_eq!(sliced.get_start_physical_index(), 0); + assert_eq!(sliced.get_end_physical_index(), 0); + } + + let buffer = RunEndBuffer::new(Vec::::new().into(), 0, 0); + assert_eq!(buffer.get_start_physical_index(), 0); + assert_eq!(buffer.get_end_physical_index(), 0); + } +} From 495682aa72ffe92bbd0d6d8d93e0c00b5483ff7d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 9 Mar 2023 19:11:26 +0100 Subject: [PATCH 0672/1411] Add ArrayDataLayout, port validation (#1799) (#3818) * Add ArrayDataLayout (#1799) * Fix ArrayData::buffer * Don't export macros, yet * Fix doc * Review feedback * Further review feedback --- arrow-buffer/src/buffer/boolean.rs | 5 + arrow-buffer/src/buffer/offset.rs | 15 ++ arrow-buffer/src/buffer/run.rs | 10 ++ arrow-buffer/src/buffer/scalar.rs | 15 ++ 
arrow-data/src/data/boolean.rs | 139 +++++++++++++++ arrow-data/src/data/buffers.rs | 11 +- arrow-data/src/data/bytes.rs | 223 +++++++++++++++++++++-- arrow-data/src/data/dictionary.rs | 129 ++++++++++++- arrow-data/src/data/list.rs | 211 ++++++++++++++++++++-- arrow-data/src/data/mod.rs | 278 +++++++++++++++++------------ arrow-data/src/data/null.rs | 104 +++++++++++ arrow-data/src/data/primitive.rs | 139 +++++++++++++-- arrow-data/src/data/run.rs | 162 +++++++++++++++-- arrow-data/src/data/struct.rs | 52 +++++- arrow-data/src/data/types.rs | 14 +- arrow-data/src/data/union.rs | 106 ++++++++++- 16 files changed, 1424 insertions(+), 189 deletions(-) create mode 100644 arrow-data/src/data/boolean.rs create mode 100644 arrow-data/src/data/null.rs diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 0239111cbafe..8a7f279f32ed 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -139,4 +139,9 @@ impl BooleanBuffer { pub fn inner(&self) -> &Buffer { &self.buffer } + + /// Returns the inner [`Buffer`], consuming self + pub fn into_inner(self) -> Buffer { + self.buffer + } } diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index a80c3c7ecb69..808e43cbf453 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -39,6 +39,21 @@ impl OffsetBuffer { let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::()); Self(buffer.into_buffer().into()) } + + /// Returns the inner [`ScalarBuffer`] + pub fn inner(&self) -> &ScalarBuffer { + &self.0 + } + + /// Returns the inner [`ScalarBuffer`], consuming self + pub fn into_inner(self) -> ScalarBuffer { + self.0 + } + + /// Returns a zero-copy slice of this buffer with length `len` and starting at `offset` + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self(self.0.slice(offset, len.saturating_add(1))) + } } impl Deref for OffsetBuffer { diff --git a/arrow-buffer/src/buffer/run.rs b/arrow-buffer/src/buffer/run.rs index 6da2d689c2e3..29c0f3dfd949 100644 --- a/arrow-buffer/src/buffer/run.rs +++ b/arrow-buffer/src/buffer/run.rs @@ -198,6 +198,16 @@ where len, } } + + /// Returns the inner [`ScalarBuffer`] + pub fn inner(&self) -> &ScalarBuffer { + &self.run_ends + } + + /// Returns the inner [`ScalarBuffer`], consuming self + pub fn into_inner(self) -> ScalarBuffer { + self.run_ends + } } #[cfg(test)] diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 01a64633f532..9b3a47785098 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -50,6 +50,21 @@ impl ScalarBuffer { let byte_len = len.checked_mul(size).expect("length overflow"); buffer.slice_with_length(byte_offset, byte_len).into() } + + /// Returns a zero-copy slice of this buffer with length `len` and starting at `offset` + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self::new(self.buffer.clone(), offset, len) + } + + /// Returns the inner [`Buffer`] + pub fn inner(&self) -> &Buffer { + &self.buffer + } + + /// Returns the inner [`Buffer`], consuming self + pub fn into_inner(self) -> Buffer { + self.buffer + } } impl Deref for ScalarBuffer { diff --git a/arrow-data/src/data/boolean.rs b/arrow-data/src/data/boolean.rs new file mode 100644 index 000000000000..258624cc1c66 --- /dev/null +++ b/arrow-data/src/data/boolean.rs @@ -0,0 +1,139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::data::types::PhysicalType; +use crate::data::ArrayDataLayout; +use crate::{ArrayDataBuilder, Buffers}; +use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; +use arrow_schema::DataType; + +#[derive(Debug, Clone)] +pub struct BooleanArrayData { + data_type: DataType, + values: BooleanBuffer, + nulls: Option, +} + +impl BooleanArrayData { + /// Create a new [`BooleanArrayData`] + /// + /// # Panics + /// + /// Panics if + /// - `nulls` and `values` are different lengths + /// - `PhysicalType::from(&data_type) != PhysicalType::Boolean` + pub fn new( + data_type: DataType, + values: BooleanBuffer, + nulls: Option, + ) -> Self { + let physical = PhysicalType::from(&data_type); + assert_eq!( + physical, PhysicalType::Boolean, + "Illegal physical type for BooleanArrayData of datatype {:?}, expected {:?} got {:?}", + data_type, + PhysicalType::Boolean, + physical + ); + + if let Some(n) = nulls.as_ref() { + assert_eq!(values.len(), n.len()) + } + Self { + data_type, + values, + nulls, + } + } + + /// Create a new [`BooleanArrayData`] + /// + /// # Safety + /// + /// - `nulls` and `values` are the same lengths + /// - `PhysicalType::from(&data_type) == PhysicalType::Boolean` + pub unsafe fn new_unchecked( + data_type: DataType, + values: BooleanBuffer, + nulls: Option, + ) -> Self { + Self { + data_type, + values, + nulls, + } + } + + /// Creates a new [`BooleanArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`BooleanArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let values = builder.buffers.into_iter().next().unwrap(); + let values = BooleanBuffer::new(values, builder.offset, builder.len); + Self { + values, + data_type: builder.data_type, + nulls: builder.nulls, + } + } + + /// Returns the null buffer if any + #[inline] + pub fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + /// Returns the boolean values + #[inline] + pub fn values(&self) -> &BooleanBuffer { + &self.values + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } + + /// Returns the underlying parts of this [`BooleanArrayData`] + pub fn into_parts(self) -> (DataType, BooleanBuffer, Option) { + (self.data_type, self.values, self.nulls) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self { + data_type: self.data_type.clone(), + values: self.values.slice(offset, len), + nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.values.len(), + offset: self.values.offset(), + nulls: self.nulls.as_ref(), + buffers: 
Buffers::one(self.values().inner()), + child_data: &[], + } + } +} diff --git a/arrow-data/src/data/buffers.rs b/arrow-data/src/data/buffers.rs index 3b57bfe0e23c..8a498d319aae 100644 --- a/arrow-data/src/data/buffers.rs +++ b/arrow-data/src/data/buffers.rs @@ -25,7 +25,6 @@ pub struct Buffers<'a>([Option<&'a Buffer>; 2]); impl<'a> Buffers<'a> { /// Temporary will be removed once ArrayData does not store `Vec` directly (#3769) - #[inline] pub(crate) fn from_slice(a: &'a [Buffer]) -> Self { match a.len() { 0 => Self([None, None]), @@ -34,6 +33,16 @@ impl<'a> Buffers<'a> { } } + #[inline] + pub(crate) fn one(b: &'a Buffer) -> Self { + Self([Some(b), None]) + } + + #[inline] + pub(crate) fn two(a: &'a Buffer, b: &'a Buffer) -> Self { + Self([Some(a), Some(b)]) + } + /// Returns the number of [`Buffer`] in this collection #[inline] pub fn len(&self) -> usize { diff --git a/arrow-data/src/data/bytes.rs b/arrow-data/src/data/bytes.rs index 521c1959aaa1..9ac267130b7a 100644 --- a/arrow-data/src/data/bytes.rs +++ b/arrow-data/src/data/bytes.rs @@ -16,7 +16,9 @@ // under the License. use crate::data::types::{BytesType, OffsetType}; -use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; +use crate::data::ArrayDataLayout; +use crate::{ArrayDataBuilder, Buffers}; +use arrow_buffer::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_schema::DataType; use std::marker::PhantomData; @@ -194,6 +196,22 @@ impl private::BytesOffsetSealed for i64 { } } +/// Applies op to each variant of [`ArrayDataBytes`] +macro_rules! bytes_op { + ($array:ident, $op:block) => { + match $array { + ArrayDataBytes::Binary($array) => match $array { + ArrayDataBytesOffset::Small($array) => $op + ArrayDataBytesOffset::Large($array) => $op + } + ArrayDataBytes::Utf8($array) => match $array { + ArrayDataBytesOffset::Small($array) => $op + ArrayDataBytesOffset::Large($array) => $op + } + } + }; +} + /// An enumeration of the types of [`ArrayDataBytesOffset`] #[derive(Debug, Clone)] pub enum ArrayDataBytes { @@ -215,6 +233,48 @@ impl ArrayDataBytes { ) -> Option> { O::downcast(B::downcast(self)?) 
} + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let s = self; + bytes_op!(s, { s.slice(offset, len).into() }) + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + let s = self; + bytes_op!(s, { s.layout() }) + } + + /// Creates a new [`ArrayDataBytes`] from raw buffers + /// + /// # Safety + /// + /// See [`BytesArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw( + builder: ArrayDataBuilder, + offset: OffsetType, + bytes: BytesType, + ) -> Self { + match bytes { + BytesType::Binary => Self::Binary(match offset { + OffsetType::Int32 => { + ArrayDataBytesOffset::Small(BytesArrayData::from_raw(builder)) + } + OffsetType::Int64 => { + ArrayDataBytesOffset::Large(BytesArrayData::from_raw(builder)) + } + }), + BytesType::Utf8 => Self::Utf8(match offset { + OffsetType::Int32 => { + ArrayDataBytesOffset::Small(BytesArrayData::from_raw(builder)) + } + OffsetType::Int64 => { + ArrayDataBytesOffset::Large(BytesArrayData::from_raw(builder)) + } + }), + } + } } /// An enumeration of the types of [`BytesArrayData`] @@ -243,9 +303,9 @@ impl From> for ArrayData #[derive(Debug)] pub struct BytesArrayData { data_type: DataType, - nulls: Option, - offsets: ScalarBuffer, + offsets: OffsetBuffer, values: Buffer, + nulls: Option, phantom: PhantomData, } @@ -268,10 +328,10 @@ impl BytesArrayData { /// /// - Each consecutive window of `offsets` must identify a valid slice of `values` /// - `nulls.len() == offsets.len() - 1` - /// - `data_type` must be valid for this layout + /// - `PhysicalType::from(&data_type) == PhysicalType::Bytes(O::TYPE, B::TYPE)` pub unsafe fn new_unchecked( data_type: DataType, - offsets: ScalarBuffer, + offsets: OffsetBuffer, values: Buffer, nulls: Option, ) -> Self { @@ -284,6 +344,46 @@ impl BytesArrayData { } } + /// Creates a new [`BytesArrayData`] from an [`ArrayDataBuilder`] + /// + /// # Safety + /// + /// See [`Self::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let mut iter = builder.buffers.into_iter(); + let offsets = iter.next().unwrap(); + let values = iter.next().unwrap(); + + let offsets = match builder.len { + 0 => OffsetBuffer::new_empty(), + _ => OffsetBuffer::new_unchecked(ScalarBuffer::new( + offsets, + builder.offset, + builder.len + 1, + )), + }; + + Self { + values, + offsets, + data_type: builder.data_type, + nulls: builder.nulls, + phantom: Default::default(), + } + } + + /// Returns the length + #[inline] + pub fn len(&self) -> usize { + self.offsets.len().wrapping_sub(1) + } + + /// Returns true if this array is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.offsets.len() <= 1 + } + /// Returns the raw byte data #[inline] pub fn values(&self) -> &B { @@ -294,13 +394,13 @@ impl BytesArrayData { /// Returns the offsets #[inline] - pub fn offsets(&self) -> &[O] { + pub fn offsets(&self) -> &OffsetBuffer { &self.offsets } /// Returns the null buffer if any #[inline] - pub fn null_buffer(&self) -> Option<&NullBuffer> { + pub fn nulls(&self) -> Option<&NullBuffer> { self.nulls.as_ref() } @@ -309,14 +409,44 @@ impl BytesArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`BytesArrayData`] + pub fn into_parts(self) -> (DataType, OffsetBuffer, Buffer, Option) { + (self.data_type, self.offsets, self.values, self.nulls) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) 
-> Self { + Self { + values: self.values.clone(), + offsets: self.offsets.slice(offset, len), + data_type: self.data_type.clone(), + nulls: self.nulls().as_ref().map(|x| x.slice(offset, len)), + phantom: Default::default(), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.offsets.len().wrapping_sub(1), + offset: 0, + nulls: self.nulls.as_ref(), + buffers: Buffers::two(self.offsets.inner().inner(), &self.values), + child_data: &[], + } + } } /// ArrayData for [fixed-size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) of bytes #[derive(Debug, Clone)] pub struct FixedSizeBinaryArrayData { data_type: DataType, - nulls: Option, + len: usize, + element_size: usize, values: Buffer, + nulls: Option, } impl FixedSizeBinaryArrayData { @@ -324,10 +454,12 @@ impl FixedSizeBinaryArrayData { /// /// # Safety /// - /// - `data_type` must be valid for this layout - /// - `nulls.len() == values.len() / element_size` + /// - `PhysicalType::from(&data_type) == PhysicalType::FixedSizeBinary(element_size)` + /// - `nulls.len() == values.len() / element_size == len` pub unsafe fn new_unchecked( data_type: DataType, + len: usize, + element_size: usize, values: Buffer, nulls: Option, ) -> Self { @@ -335,9 +467,46 @@ impl FixedSizeBinaryArrayData { data_type, nulls, values, + len, + element_size, } } + /// Creates a new [`FixedSizeBinaryArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`FixedSizeBinaryArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, size: usize) -> Self { + let values = builder.buffers[0] + .slice_with_length(builder.offset * size, builder.len * size); + Self { + values, + data_type: builder.data_type, + len: builder.len, + element_size: size, + nulls: builder.nulls, + } + } + + /// Returns the length + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Returns true if this array is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the size of each element + #[inline] + pub fn element_size(&self) -> usize { + self.element_size + } + /// Returns the raw byte data #[inline] pub fn values(&self) -> &[u8] { @@ -346,7 +515,7 @@ impl FixedSizeBinaryArrayData { /// Returns the null buffer if any #[inline] - pub fn null_buffer(&self) -> Option<&NullBuffer> { + pub fn nulls(&self) -> Option<&NullBuffer> { self.nulls.as_ref() } @@ -355,4 +524,36 @@ impl FixedSizeBinaryArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`FixedSizeBinaryArrayData`] + pub fn into_parts(self) -> (DataType, Buffer, Option) { + (self.data_type, self.values, self.nulls) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let offset_element = offset.checked_mul(self.element_size).expect("overflow"); + let len_element = len.checked_mul(self.element_size).expect("overflow"); + let values = self.values.slice_with_length(offset_element, len_element); + + Self { + len, + values, + data_type: self.data_type.clone(), + element_size: self.element_size, + nulls: self.nulls().as_ref().map(|x| x.slice(offset, len)), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.len, + offset: 0, + nulls: 
self.nulls.as_ref(), + buffers: Buffers::one(&self.values), + child_data: &[], + } + } } diff --git a/arrow-data/src/data/dictionary.rs b/arrow-data/src/data/dictionary.rs index 2ec4ee005287..c95ee464b608 100644 --- a/arrow-data/src/data/dictionary.rs +++ b/arrow-data/src/data/dictionary.rs @@ -16,7 +16,8 @@ // under the License. use crate::data::types::DictionaryKeyType; -use crate::ArrayData; +use crate::data::ArrayDataLayout; +use crate::{ArrayData, ArrayDataBuilder, Buffers}; use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; use arrow_buffer::ArrowNativeType; use arrow_schema::DataType; @@ -85,6 +86,22 @@ dictionary!(u16, UInt16); dictionary!(u32, UInt32); dictionary!(u64, UInt64); +/// Applies op to each variant of [`ArrayDataDictionary`] +macro_rules! dictionary_op { + ($array:ident, $op:block) => { + match $array { + ArrayDataDictionary::Int8($array) => $op + ArrayDataDictionary::Int16($array) => $op + ArrayDataDictionary::Int32($array) => $op + ArrayDataDictionary::Int64($array) => $op + ArrayDataDictionary::UInt8($array) => $op + ArrayDataDictionary::UInt16($array) => $op + ArrayDataDictionary::UInt32($array) => $op + ArrayDataDictionary::UInt64($array) => $op + } + }; +} + /// An enumeration of the types of [`DictionaryArrayData`] #[derive(Debug, Clone)] pub enum ArrayDataDictionary { @@ -108,6 +125,46 @@ impl ArrayDataDictionary { pub fn downcast(self) -> Option> { K::downcast(self) } + + /// Returns the values of this dictionary + pub fn values(&self) -> &ArrayData { + let s = self; + dictionary_op!(s, { s.values() }) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let s = self; + dictionary_op!(s, { s.slice(offset, len).into() }) + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + let s = self; + dictionary_op!(s, { s.layout() }) + } + + /// Creates a new [`ArrayDataDictionary`] from raw buffers + /// + /// # Safety + /// + /// See [`DictionaryArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw( + builder: ArrayDataBuilder, + key: DictionaryKeyType, + ) -> Self { + use DictionaryKeyType::*; + match key { + Int8 => Self::Int8(DictionaryArrayData::from_raw(builder)), + Int16 => Self::Int16(DictionaryArrayData::from_raw(builder)), + Int32 => Self::Int32(DictionaryArrayData::from_raw(builder)), + Int64 => Self::Int64(DictionaryArrayData::from_raw(builder)), + UInt8 => Self::UInt8(DictionaryArrayData::from_raw(builder)), + UInt16 => Self::UInt16(DictionaryArrayData::from_raw(builder)), + UInt32 => Self::UInt32(DictionaryArrayData::from_raw(builder)), + UInt64 => Self::UInt64(DictionaryArrayData::from_raw(builder)), + } + } } impl From> for ArrayDataDictionary { @@ -122,7 +179,7 @@ pub struct DictionaryArrayData { data_type: DataType, nulls: Option, keys: ScalarBuffer, - child: Box, + values: Box, } impl DictionaryArrayData { @@ -130,7 +187,7 @@ impl DictionaryArrayData { /// /// # Safety /// - /// - `data_type` must be valid for this layout + /// - `PhysicalType::from(&data_type) == PhysicalType::Dictionary(K::TYPE)` /// - child must have a type matching `data_type` /// - all values in `keys` must be `0 < v < child.len()` or be a null according to `nulls` /// - `nulls` must have the same length as `child` @@ -144,10 +201,39 @@ impl DictionaryArrayData { data_type, nulls, keys, - child: Box::new(child), + values: Box::new(child), + } + } + + /// Creates a new [`DictionaryArrayData`] from raw buffers + /// + /// # Safety + /// + /// See 
[`Self::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let keys = builder.buffers.into_iter().next().unwrap(); + let keys = ScalarBuffer::new(keys, builder.offset, builder.len); + let values = builder.child_data.into_iter().next().unwrap(); + Self { + keys, + data_type: builder.data_type, + nulls: builder.nulls, + values: Box::new(values), } } + /// Returns the length + #[inline] + pub fn len(&self) -> usize { + self.keys.len() + } + + /// Returns true if this array is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.keys.is_empty() + } + /// Returns the null buffer if any #[inline] pub fn nulls(&self) -> Option<&NullBuffer> { @@ -160,10 +246,10 @@ impl DictionaryArrayData { &self.keys } - /// Returns the child data + /// Returns the values data #[inline] - pub fn child(&self) -> &ArrayData { - self.child.as_ref() + pub fn values(&self) -> &ArrayData { + self.values.as_ref() } /// Returns the data type of this array @@ -171,4 +257,33 @@ impl DictionaryArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`DictionaryArrayData`] + pub fn into_parts( + self, + ) -> (DataType, ScalarBuffer, Option, ArrayData) { + (self.data_type, self.keys, self.nulls, *self.values) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self { + keys: self.keys.slice(offset, len), + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), + values: self.values.clone(), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.keys.len(), + offset: 0, + nulls: self.nulls.as_ref(), + buffers: Buffers::one(self.keys.inner()), + child_data: std::slice::from_ref(self.values.as_ref()), + } + } } diff --git a/arrow-data/src/data/list.rs b/arrow-data/src/data/list.rs index 59909289e933..bcc89f8ba2ca 100644 --- a/arrow-data/src/data/list.rs +++ b/arrow-data/src/data/list.rs @@ -16,9 +16,10 @@ // under the License. use crate::data::types::OffsetType; -use crate::ArrayData; -use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; -use arrow_buffer::{ArrowNativeType, Buffer}; +use crate::data::ArrayDataLayout; +use crate::{ArrayData, ArrayDataBuilder, Buffers}; +use arrow_buffer::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow_buffer::ArrowNativeType; use arrow_schema::DataType; mod private { @@ -113,6 +114,16 @@ impl private::ListOffsetSealed for i64 { } } +/// Applies op to each variant of [`ListArrayData`] +macro_rules! 
list_op { + ($array:ident, $op:block) => { + match $array { + ArrayDataList::Small($array) => $op + ArrayDataList::Large($array) => $op + } + }; +} + /// An enumeration of the types of [`ListArrayData`] #[derive(Debug, Clone)] pub enum ArrayDataList { @@ -130,6 +141,36 @@ impl ArrayDataList { pub fn downcast(self) -> Option> { O::downcast(self) } + + /// Returns the values of this [`ArrayDataList`] + pub fn values(&self) -> &ArrayData { + let s = self; + list_op!(s, { s.values() }) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let s = self; + list_op!(s, { s.slice(offset, len).into() }) + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + let s = self; + list_op!(s, { s.layout() }) + } + + /// Creates a new [`ArrayDataList`] from raw buffers + /// + /// # Safety + /// + /// See [`ListArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, offset: OffsetType) -> Self { + match offset { + OffsetType::Int32 => Self::Small(ListArrayData::from_raw(builder)), + OffsetType::Int64 => Self::Large(ListArrayData::from_raw(builder)), + } + } } impl From> for ArrayDataList { @@ -143,8 +184,8 @@ impl From> for ArrayDataList { pub struct ListArrayData { data_type: DataType, nulls: Option, - offsets: ScalarBuffer, - child: Box, + offsets: OffsetBuffer, + values: Box, } impl ListArrayData { @@ -152,23 +193,61 @@ impl ListArrayData { /// /// # Safety /// + /// - `PhysicalType::from(&data_type) == PhysicalType::List(O::TYPE)` /// - Each consecutive window of `offsets` must identify a valid slice of `child` /// - `nulls.len() == offsets.len() - 1` - /// - `data_type` must be valid for this layout pub unsafe fn new_unchecked( data_type: DataType, - offsets: ScalarBuffer, + offsets: OffsetBuffer, nulls: Option, - child: ArrayData, + values: ArrayData, ) -> Self { Self { data_type, nulls, offsets, - child: Box::new(child), + values: Box::new(values), } } + /// Creates a new [`ListArrayData`] from an [`ArrayDataBuilder`] + /// + /// # Safety + /// + /// See [`Self::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let offsets = builder.buffers.into_iter().next().unwrap(); + let values = builder.child_data.into_iter().next().unwrap(); + + let offsets = match builder.len { + 0 => OffsetBuffer::new_empty(), + _ => OffsetBuffer::new_unchecked(ScalarBuffer::new( + offsets, + builder.offset, + builder.len + 1, + )), + }; + + Self { + offsets, + data_type: builder.data_type, + nulls: builder.nulls, + values: Box::new(values), + } + } + + /// Returns the length + #[inline] + pub fn len(&self) -> usize { + self.offsets.len().wrapping_sub(1) + } + + /// Returns true if this array is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.offsets.len() <= 1 + } + /// Returns the null buffer if any #[inline] pub fn nulls(&self) -> Option<&NullBuffer> { @@ -177,14 +256,14 @@ impl ListArrayData { /// Returns the offsets #[inline] - pub fn offsets(&self) -> &[O] { + pub fn offsets(&self) -> &OffsetBuffer { &self.offsets } - /// Returns the child data + /// Returns the values of this [`ListArrayData`] #[inline] - pub fn child(&self) -> &ArrayData { - self.child.as_ref() + pub fn values(&self) -> &ArrayData { + self.values.as_ref() } /// Returns the data type of this array @@ -192,12 +271,43 @@ impl ListArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this 
[`ListArrayData`] + pub fn into_parts( + self, + ) -> (DataType, OffsetBuffer, Option, ArrayData) { + (self.data_type, self.offsets, self.nulls, *self.values) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), + offsets: self.offsets.slice(offset, len), + values: self.values.clone(), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.len(), + offset: 0, + nulls: self.nulls.as_ref(), + buffers: Buffers::one(self.offsets.inner().inner()), + child_data: std::slice::from_ref(self.values.as_ref()), + } + } } /// ArrayData for [fixed-size list arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-list-layout) #[derive(Debug, Clone)] pub struct FixedSizeListArrayData { data_type: DataType, + len: usize, + element_size: usize, nulls: Option, child: Box, } @@ -207,20 +317,59 @@ impl FixedSizeListArrayData { /// /// # Safety /// - /// - `data_type` must be valid for this layout - /// - `nulls.len() == values.len() / element_size` + /// - `PhysicalType::from(&data_type) == PhysicalType::FixedSizeList(element_size)` + /// - `nulls.len() == values.len() / element_size == len` pub unsafe fn new_unchecked( data_type: DataType, + len: usize, + element_size: usize, nulls: Option, child: ArrayData, ) -> Self { Self { data_type, + len, + element_size, nulls, child: Box::new(child), } } + /// Creates a new [`FixedSizeListArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`FixedSizeListArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, size: usize) -> Self { + let child = + builder.child_data[0].slice(builder.offset * size, builder.len * size); + Self { + data_type: builder.data_type, + len: builder.len, + element_size: size, + nulls: builder.nulls, + child: Box::new(child), + } + } + + /// Returns the length + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Returns true if this array is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the size of each element + #[inline] + pub fn element_size(&self) -> usize { + self.element_size + } + /// Returns the null buffer if any #[inline] pub fn nulls(&self) -> Option<&NullBuffer> { @@ -238,4 +387,36 @@ impl FixedSizeListArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`FixedSizeListArrayData`] + pub fn into_parts(self) -> (DataType, Option, ArrayData) { + (self.data_type, self.nulls, *self.child) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let offset_element = offset.checked_mul(self.element_size).expect("overflow"); + let len_element = len.checked_mul(self.element_size).expect("overflow"); + let child = self.child.slice(offset_element, len_element); + + Self { + len, + data_type: self.data_type.clone(), + element_size: self.element_size, + nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), + child: Box::new(child), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.len, + offset: 0, + nulls: self.nulls.as_ref(), + buffers: Buffers::default(), + child_data: 
std::slice::from_ref(self.child.as_ref()), + } + } } diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index 051deef07305..784911dc0a85 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -33,21 +33,25 @@ use crate::equal; mod buffers; pub use buffers::*; -#[allow(unused)] // Private until ready (#1176) +#[allow(unused)] // Private until ready (#1799) +mod boolean; +#[allow(unused)] // Private until ready (#1799) mod bytes; -#[allow(unused)] // Private until ready (#1176) +#[allow(unused)] // Private until ready (#1799) mod dictionary; -#[allow(unused)] // Private until ready (#1176) +#[allow(unused)] // Private until ready (#1799) mod list; -#[allow(unused)] // Private until ready (#1176) +#[allow(unused)] // Private until ready (#1799) +mod null; +#[allow(unused)] // Private until ready (#1799) mod primitive; -#[allow(unused)] // Private until ready (#1176) +#[allow(unused)] // Private until ready (#1799) mod run; -#[allow(unused)] // Private until ready (#1176) +#[allow(unused)] // Private until ready (#1799) mod r#struct; -#[allow(unused)] // Private until ready (#1176) +#[allow(unused)] // Private until ready (#1799) mod types; -#[allow(unused)] // Private until ready (#1176) +#[allow(unused)] // Private until ready (#1799) mod union; #[inline] @@ -277,7 +281,6 @@ impl ArrayData { /// Note: This is a low level API and most users of the arrow /// crate should create arrays using the methods in the `array` /// module. - #[allow(clippy::let_and_return)] pub unsafe fn new_unchecked( data_type: DataType, len: usize, @@ -287,27 +290,17 @@ impl ArrayData { buffers: Vec, child_data: Vec, ) -> Self { - let nulls = null_bit_buffer - .map(|b| BooleanBuffer::new(b, offset, len)) - .map(|b| match null_count { - None => NullBuffer::new(b), - Some(null_count) => NullBuffer::new_unchecked(b, null_count), - }) - .filter(|b| b.null_count() > 0); - - let new_self = Self { + ArrayDataBuilder { data_type, len, + null_count, + null_bit_buffer, + nulls: None, offset, buffers, child_data, - nulls, - }; - - // Provide a force_validate mode - #[cfg(feature = "force_validate")] - new_self.validate_data().unwrap(); - new_self + } + .build_unchecked() } /// Create a new ArrayData, validating that the provided buffers form a valid @@ -358,7 +351,7 @@ impl ArrayData { // We don't need to validate children as we can assume that the // [`ArrayData`] in `child_data` have already been validated through // a call to `ArrayData::try_new` or created using unsafe - new_self.validate_data()?; + ArrayDataLayout::new(&new_self).validate_data()?; Ok(new_self) } @@ -448,14 +441,15 @@ impl ArrayData { /// If multiple [`ArrayData`]s refer to the same underlying /// [`Buffer`]s they will both report the same size. pub fn get_buffer_memory_size(&self) -> usize { + let s = ArrayDataLayout::new(self); let mut size = 0; - for buffer in &self.buffers { + for buffer in s.buffers { size += buffer.capacity(); } - if let Some(bitmap) = &self.nulls { + if let Some(bitmap) = s.nulls { size += bitmap.buffer().capacity() } - for child in &self.child_data { + for child in s.child_data { size += child.get_buffer_memory_size(); } size @@ -474,14 +468,15 @@ impl ArrayData { /// first `20` elements, then [`Self::get_slice_memory_size`] on the /// sliced [`ArrayData`] would return `20 * 8 = 160`. 
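// ---------------------------------------------------------------------
// Illustrative sketch (editor's addition, not part of the original patch):
// how the sliced-size accounting described in the comment above behaves for
// a plain Int32 array. The buffer contents and lengths are hypothetical, and
// the constructor/accessor signatures should be checked against the crate;
// `slice`, `get_buffer_memory_size` and `get_slice_memory_size` are the
// public `ArrayData` methods discussed in this file.
fn slice_memory_size_sketch() -> Result<(), arrow_schema::ArrowError> {
    let values = arrow_buffer::Buffer::from_slice_ref(&[0_i32; 100]);
    let data = arrow_data::ArrayData::try_new(
        arrow_schema::DataType::Int32,
        100,
        None,
        0,
        vec![values],
        vec![],
    )?;
    let sliced = data.slice(0, 20);
    // Both views share the same underlying values buffer, so the buffer
    // memory size reported for the slice is unchanged...
    assert_eq!(sliced.get_buffer_memory_size(), data.get_buffer_memory_size());
    // ...but only 20 * 4 = 80 bytes are actually needed for the sliced values.
    assert_eq!(sliced.get_slice_memory_size()?, 80);
    Ok(())
}
// ---------------------------------------------------------------------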
pub fn get_slice_memory_size(&self) -> Result { + let s = ArrayDataLayout::new(self); let mut result: usize = 0; - let layout = layout(&self.data_type); + let layout = layout(s.data_type); for spec in layout.buffers.iter() { match spec { BufferSpec::FixedWidth { byte_width } => { let buffer_size = - self.len.checked_mul(*byte_width).ok_or_else(|| { + s.len.checked_mul(*byte_width).ok_or_else(|| { ArrowError::ComputeError( "Integer overflow computing buffer size".to_string(), ) @@ -490,26 +485,26 @@ impl ArrayData { } BufferSpec::VariableWidth => { let buffer_len: usize; - match self.data_type { + match s.data_type { DataType::Utf8 | DataType::Binary => { - let offsets = self.typed_offsets::()?; - buffer_len = (offsets[self.len] - offsets[0] ) as usize; + let offsets = s.typed_offsets::()?; + buffer_len = (offsets[s.len] - offsets[0]) as usize; } DataType::LargeUtf8 | DataType::LargeBinary => { - let offsets = self.typed_offsets::()?; - buffer_len = (offsets[self.len] - offsets[0]) as usize; + let offsets = s.typed_offsets::()?; + buffer_len = (offsets[s.len] - offsets[0]) as usize; } _ => { return Err(ArrowError::NotYetImplemented(format!( - "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", - self.data_type + "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", + s.data_type ))) } }; result += buffer_len; } BufferSpec::BitMap => { - let buffer_size = bit_util::ceil(self.len, 8); + let buffer_size = bit_util::ceil(s.len, 8); result += buffer_size; } BufferSpec::AlwaysNull => { @@ -518,11 +513,11 @@ impl ArrayData { } } - if self.nulls().is_some() { - result += bit_util::ceil(self.len, 8); + if s.nulls.is_some() { + result += bit_util::ceil(s.len, 8); } - for child in &self.child_data { + for child in s.child_data { result += child.get_slice_memory_size()?; } Ok(result) @@ -537,17 +532,18 @@ impl ArrayData { /// [`Self::get_buffer_memory_size`] + /// `size_of_val(child)` for all children pub fn get_array_memory_size(&self) -> usize { + let s = ArrayDataLayout::new(self); let mut size = mem::size_of_val(self); // Calculate rest of the fields top down which contain actual data - for buffer in &self.buffers { + for buffer in s.buffers { size += mem::size_of::(); size += buffer.capacity(); } - if let Some(nulls) = &self.nulls { + if let Some(nulls) = s.nulls { size += nulls.buffer().capacity(); } - for child in &self.child_data { + for child in s.child_data { size += child.get_array_memory_size(); } @@ -598,14 +594,8 @@ impl ArrayData { /// This function panics if: /// * the buffer is not byte-aligned with type T, or /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable) - #[inline] pub fn buffer(&self, buffer: usize) -> &[T] { - let values = unsafe { self.buffers[buffer].as_slice().align_to::() }; - if !values.0.is_empty() || !values.2.is_empty() { - panic!("The buffer is not byte-aligned with its interpretation") - }; - assert_ne!(self.data_type, DataType::Boolean); - &values.1[self.offset..] + &self.buffers()[buffer].typed_data()[self.offset..] 
} /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values @@ -740,11 +730,101 @@ impl ArrayData { /// See [ArrayData::validate_data] to validate fully the offset content /// and the validity of utf8 data pub fn validate(&self) -> Result<(), ArrowError> { + ArrayDataLayout::new(self).validate() + } + + /// Validate that the data contained within this [`ArrayData`] is valid + /// + /// 1. Null count is correct + /// 2. All offsets are valid + /// 3. All String data is valid UTF-8 + /// 4. All dictionary offsets are valid + /// + /// Internally this calls: + /// + /// * [`Self::validate`] + /// * [`Self::validate_nulls`] + /// * [`Self::validate_values`] + /// + /// Note: this does not recurse into children, for a recursive variant + /// see [`Self::validate_full`] + pub fn validate_data(&self) -> Result<(), ArrowError> { + ArrayDataLayout::new(self).validate_data() + } + + /// Performs a full recursive validation of this [`ArrayData`] and all its children + /// + /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`] + /// and all its children recursively + pub fn validate_full(&self) -> Result<(), ArrowError> { + ArrayDataLayout::new(self).validate_full() + } + + /// Validates the values stored within this [`ArrayData`] are valid + /// without recursing into child [`ArrayData`] + /// + /// Does not (yet) check + /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) + /// Validates the the null count is correct and that any + /// nullability requirements of its children are correct + pub fn validate_nulls(&self) -> Result<(), ArrowError> { + ArrayDataLayout::new(self).validate_nulls() + } + + /// Validates the values stored within this [`ArrayData`] are valid + /// without recursing into child [`ArrayData`] + /// + /// Does not (yet) check + /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) + pub fn validate_values(&self) -> Result<(), ArrowError> { + ArrayDataLayout::new(self).validate_values() + } + + /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons + /// to determine buffer equality. 
This is cheaper than `PartialEq::eq` but may + /// return false when the arrays are logically equal + pub fn ptr_eq(&self, other: &Self) -> bool { + ArrayDataLayout::new(self).ptr_eq(&ArrayDataLayout::new(other)) + } + + /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`] + pub fn into_builder(self) -> ArrayDataBuilder { + self.into() + } +} + +/// A flat representation of [`ArrayData`] +/// +/// This is temporary measure to bridge the gap between the strongly-typed +/// ArrayData enumeration and the older-style struct representation (#1799) +#[derive(Copy, Clone)] +pub(crate) struct ArrayDataLayout<'a> { + data_type: &'a DataType, + len: usize, + offset: usize, + nulls: Option<&'a NullBuffer>, + buffers: Buffers<'a>, + child_data: &'a [ArrayData], +} + +impl<'a> ArrayDataLayout<'a> { + fn new(data: &'a ArrayData) -> Self { + Self { + data_type: &data.data_type, + len: data.len, + offset: data.offset, + nulls: data.nulls.as_ref(), + buffers: Buffers::from_slice(&data.buffers), + child_data: &data.child_data, + } + } + + fn validate(&self) -> Result<(), ArrowError> { // Need at least this mich space in each buffer let len_plus_offset = self.len + self.offset; // Check that the data layout conforms to the spec - let layout = layout(&self.data_type); + let layout = layout(self.data_type); if !layout.can_contain_null_mask && self.nulls.is_some() { return Err(ArrowError::InvalidArgumentError(format!( @@ -799,7 +879,7 @@ impl ArrayData { } // check null bit buffer size - if let Some(nulls) = self.nulls() { + if let Some(nulls) = self.nulls { if nulls.null_count() > self.len { return Err(ArrowError::InvalidArgumentError(format!( "null_count {} for an array exceeds length of {} elements", @@ -1013,7 +1093,7 @@ impl ArrayData { run_ends_data.len, values_data.len ))); } - if run_ends_data.null_count() > 0 { + if run_ends_data.nulls.is_some() { return Err(ArrowError::InvalidArgumentError( "Found null values in run_ends array. The run_ends array should not have null values.".to_string(), )); @@ -1061,17 +1141,17 @@ impl ArrayData { fn get_single_valid_child_data( &self, expected_type: &DataType, - ) -> Result<&ArrayData, ArrowError> { + ) -> Result, ArrowError> { self.validate_num_child_data(1)?; self.get_valid_child_data(0, expected_type) } /// Returns `Err` if self.child_data does not have exactly `expected_len` elements fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> { - if self.child_data().len() != expected_len { + if self.child_data.len() != expected_len { Err(ArrowError::InvalidArgumentError(format!( "Value data for {} should contain {} child data array(s), had {}", - self.data_type(), + self.data_type, expected_len, self.child_data.len() ))) @@ -1086,7 +1166,7 @@ impl ArrayData { &self, i: usize, expected_type: &DataType, - ) -> Result<&ArrayData, ArrowError> { + ) -> Result { let values_data = self.child_data .get(i) .ok_or_else(|| { @@ -1095,8 +1175,9 @@ impl ArrayData { self.data_type, i+1, self.child_data.len() )) })?; + let values_data = ArrayDataLayout::new(values_data); - if expected_type != &values_data.data_type { + if expected_type != values_data.data_type { return Err(ArrowError::InvalidArgumentError(format!( "Child type mismatch for {}. Expected {} but child data had {}", self.data_type, expected_type, values_data.data_type @@ -1107,22 +1188,7 @@ impl ArrayData { Ok(values_data) } - /// Validate that the data contained within this [`ArrayData`] is valid - /// - /// 1. Null count is correct - /// 2. All offsets are valid - /// 3. 
All String data is valid UTF-8 - /// 4. All dictionary offsets are valid - /// - /// Internally this calls: - /// - /// * [`Self::validate`] - /// * [`Self::validate_nulls`] - /// * [`Self::validate_values`] - /// - /// Note: this does not recurse into children, for a recursive variant - /// see [`Self::validate_full`] - pub fn validate_data(&self) -> Result<(), ArrowError> { + fn validate_data(&self) -> Result<(), ArrowError> { self.validate()?; self.validate_nulls()?; @@ -1130,11 +1196,7 @@ impl ArrayData { Ok(()) } - /// Performs a full recursive validation of this [`ArrayData`] and all its children - /// - /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`] - /// and all its children recursively - pub fn validate_full(&self) -> Result<(), ArrowError> { + fn validate_full(&self) -> Result<(), ArrowError> { self.validate_data()?; // validate all children recursively self.child_data @@ -1151,14 +1213,7 @@ impl ArrayData { Ok(()) } - /// Validates the values stored within this [`ArrayData`] are valid - /// without recursing into child [`ArrayData`] - /// - /// Does not (yet) check - /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) - /// Validates the the null count is correct and that any - /// nullability requirements of its children are correct - pub fn validate_nulls(&self) -> Result<(), ArrowError> { + fn validate_nulls(&self) -> Result<(), ArrowError> { if let Some(nulls) = &self.nulls { let actual = nulls.len() - nulls.inner().count_set_bits(); if actual != nulls.null_count() { @@ -1176,11 +1231,12 @@ impl ArrayData { match &self.data_type { DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => { if !f.is_nullable() { - self.validate_non_nullable(None, 0, &self.child_data[0])? + let child = ArrayDataLayout::new(&self.child_data[0]); + self.validate_non_nullable(None, 0, child)? 
} } DataType::FixedSizeList(field, len) => { - let child = &self.child_data[0]; + let child = ArrayDataLayout::new(&self.child_data[0]); if !field.is_nullable() { match &self.nulls { Some(nulls) => { @@ -1209,7 +1265,8 @@ impl ArrayData { } } DataType::Struct(fields) => { - for (field, child) in fields.iter().zip(&self.child_data) { + for (field, child) in fields.iter().zip(self.child_data) { + let child = ArrayDataLayout::new(child); if !field.is_nullable() { match &self.nulls { Some(n) => self.validate_non_nullable( @@ -1233,24 +1290,24 @@ impl ArrayData { &self, mask: Option<&Buffer>, offset: usize, - data: &ArrayData, + child: ArrayDataLayout<'_>, ) -> Result<(), ArrowError> { let mask = match mask { Some(mask) => mask.as_ref(), - None => return match data.null_count() { + None => return match child.nulls.map(|x| x.null_count()).unwrap_or_default() { 0 => Ok(()), _ => Err(ArrowError::InvalidArgumentError(format!( "non-nullable child of type {} contains nulls not present in parent {}", - data.data_type(), + child.data_type, self.data_type ))), }, }; - match data.nulls() { + match child.nulls { Some(nulls) => { - let mask = BitChunks::new(mask, offset, data.len); - let nulls = BitChunks::new(nulls.validity(), nulls.offset(), data.len); + let mask = BitChunks::new(mask, offset, child.len); + let nulls = BitChunks::new(nulls.validity(), nulls.offset(), child.len); mask .iter() .zip(nulls.iter()) @@ -1261,7 +1318,7 @@ impl ArrayData { if (m & !c) != 0 { return Err(ArrowError::InvalidArgumentError(format!( "non-nullable child of type {} contains nulls not present in parent", - data.data_type() + child.data_type ))) } Ok(()) @@ -1276,7 +1333,7 @@ impl ArrayData { /// /// Does not (yet) check /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) - pub fn validate_values(&self) -> Result<(), ArrowError> { + fn validate_values(&self) -> Result<(), ArrowError> { match &self.data_type { DataType::Utf8 => self.validate_utf8::(), DataType::LargeUtf8 => self.validate_utf8::(), @@ -1286,11 +1343,11 @@ impl ArrayData { } DataType::List(_) | DataType::Map(_, _) => { let child = &self.child_data[0]; - self.validate_offsets_full::(child.len) + self.validate_offsets_full::(child.len()) } DataType::LargeList(_) => { let child = &self.child_data[0]; - self.validate_offsets_full::(child.len) + self.validate_offsets_full::(child.len()) } DataType::Union(_, _, _) => { // Validate Union Array as part of implementing new Union semantics @@ -1301,7 +1358,7 @@ impl ArrayData { Ok(()) } DataType::Dictionary(key_type, _value_type) => { - let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap(); + let dictionary_length: i64 = self.child_data[0].len().try_into().unwrap(); let max_value = dictionary_length - 1; match key_type.as_ref() { DataType::UInt8 => self.check_bounds::(max_value), @@ -1316,7 +1373,7 @@ impl ArrayData { } } DataType::RunEndEncoded(run_ends, _values) => { - let run_ends_data = self.child_data()[0].clone(); + let run_ends_data = ArrayDataLayout::new(&self.child_data[0]); match run_ends.data_type() { DataType::Int16 => run_ends_data.check_run_ends::(), DataType::Int32 => run_ends_data.check_run_ends::(), @@ -1460,7 +1517,7 @@ impl ArrayData { indexes.iter().enumerate().try_for_each(|(i, &dict_index)| { // Do not check the value is null (value can be arbitrary) - if self.is_null(i) { + if self.nulls.map(|x| x.is_null(i)).unwrap_or_default() { return Ok(()); } let dict_index: i64 = dict_index.try_into().map_err(|_| { @@ -1483,7 +1540,7 @@ impl ArrayData { 
where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, { - let values = self.typed_buffer::(0, self.len())?; + let values = self.typed_buffer::(0, self.len)?; let mut prev_value: i64 = 0_i64; values.iter().enumerate().try_for_each(|(ix, &inp_value)| { let value: i64 = inp_value.try_into().map_err(|_| { @@ -1548,11 +1605,6 @@ impl ArrayData { .zip(other.child_data.iter()) .all(|(a, b)| a.ptr_eq(b)) } - - /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`] - pub fn into_builder(self) -> ArrayDataBuilder { - self.into() - } } /// Return the expected [`DataTypeLayout`] Arrays of this data @@ -1823,7 +1875,7 @@ impl ArrayDataBuilder { offset: self.offset, buffers: self.buffers, child_data: self.child_data, - nulls, + nulls: nulls.filter(|b| b.null_count() != 0), }; // Provide a force_validate mode @@ -1837,7 +1889,7 @@ impl ArrayDataBuilder { pub fn build(self) -> Result { let data = unsafe { self.build_unchecked() }; #[cfg(not(feature = "force_validate"))] - data.validate_data()?; + ArrayDataLayout::new(&data).validate_data()?; Ok(data) } } diff --git a/arrow-data/src/data/null.rs b/arrow-data/src/data/null.rs new file mode 100644 index 000000000000..b8a4d7270833 --- /dev/null +++ b/arrow-data/src/data/null.rs @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::data::types::PhysicalType; +use crate::data::ArrayDataLayout; +use crate::{ArrayDataBuilder, Buffers}; +use arrow_schema::DataType; + +/// ArrayData for [null arrays](https://arrow.apache.org/docs/format/Columnar.html#null-layout) +#[derive(Debug, Clone)] +pub struct NullArrayData { + data_type: DataType, + len: usize, +} + +impl NullArrayData { + /// Create a new [`NullArrayData`] + /// + /// # Panic + /// + /// - `PhysicalType::from(&data_type) != PhysicalType::Null` + pub fn new(data_type: DataType, len: usize) -> Self { + assert_eq!( + PhysicalType::from(&data_type), + PhysicalType::Null, + "Illegal physical type for NullArrayData of datatype {data_type:?}", + ); + Self { data_type, len } + } + + /// Create a new [`NullArrayData`] + /// + /// # Safety + /// + /// - `PhysicalType::from(&data_type) == PhysicalType::Null` + pub unsafe fn new_unchecked(data_type: DataType, len: usize) -> Self { + Self { data_type, len } + } + + /// Creates a new [`NullArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`NullArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + Self { + data_type: builder.data_type, + len: builder.len, + } + } + + /// Returns the data type of this array + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } + + /// Returns the length of this array + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Returns the [`DataType`] and length of this [`NullArrayData`] + pub fn into_parts(self) -> (DataType, usize) { + (self.data_type, self.len) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let new_len = offset.saturating_add(len); + assert!(new_len <= self.len); + Self { + data_type: self.data_type.clone(), + len, + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.len, + offset: 0, + nulls: None, + buffers: Buffers::default(), + child_data: &[], + } + } +} diff --git a/arrow-data/src/data/primitive.rs b/arrow-data/src/data/primitive.rs index 058b3e822056..ed8ed8d7aabb 100644 --- a/arrow-data/src/data/primitive.rs +++ b/arrow-data/src/data/primitive.rs @@ -16,6 +16,8 @@ // under the License. use crate::data::types::{PhysicalType, PrimitiveType}; +use crate::data::ArrayDataLayout; +use crate::{ArrayDataBuilder, Buffers}; use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; use arrow_buffer::{i256, ArrowNativeType}; use arrow_schema::DataType; @@ -46,6 +48,27 @@ pub trait Primitive: private::PrimitiveSealed + ArrowNativeType { const TYPE: PrimitiveType; } +/// Applies op to each variant of [`ArrayDataPrimitive`] +macro_rules! primitive_op { + ($array:ident, $op:block) => { + match $array { + ArrayDataPrimitive::Int8($array) => $op + ArrayDataPrimitive::Int16($array) => $op + ArrayDataPrimitive::Int32($array) => $op + ArrayDataPrimitive::Int64($array) => $op + ArrayDataPrimitive::Int128($array) => $op + ArrayDataPrimitive::Int256($array) => $op + ArrayDataPrimitive::UInt8($array) => $op + ArrayDataPrimitive::UInt16($array) => $op + ArrayDataPrimitive::UInt32($array) => $op + ArrayDataPrimitive::UInt64($array) => $op + ArrayDataPrimitive::Float16($array) => $op + ArrayDataPrimitive::Float32($array) => $op + ArrayDataPrimitive::Float64($array) => $op + } + }; +} + macro_rules! 
primitive { ($t:ty,$v:ident) => { impl Primitive for $t { @@ -90,6 +113,7 @@ primitive!(f32, Float32); primitive!(f64, Float64); /// An enumeration of the types of [`PrimitiveArrayData`] +#[derive(Debug, Clone)] pub enum ArrayDataPrimitive { Int8(PrimitiveArrayData), Int16(PrimitiveArrayData), @@ -116,6 +140,45 @@ impl ArrayDataPrimitive { pub fn downcast(self) -> Option> { P::downcast(self) } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let s = self; + primitive_op!(s, { s.slice(offset, len).into() }) + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + let s = self; + primitive_op!(s, { s.layout() }) + } + + /// Creates a new [`ArrayDataPrimitive`] from raw buffers + /// + /// # Safety + /// + /// See [`PrimitiveArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw( + builder: ArrayDataBuilder, + primitive: PrimitiveType, + ) -> Self { + use PrimitiveType::*; + match primitive { + Int8 => Self::Int8(PrimitiveArrayData::from_raw(builder)), + Int16 => Self::Int16(PrimitiveArrayData::from_raw(builder)), + Int32 => Self::Int32(PrimitiveArrayData::from_raw(builder)), + Int64 => Self::Int64(PrimitiveArrayData::from_raw(builder)), + Int128 => Self::Int128(PrimitiveArrayData::from_raw(builder)), + Int256 => Self::Int256(PrimitiveArrayData::from_raw(builder)), + UInt8 => Self::UInt8(PrimitiveArrayData::from_raw(builder)), + UInt16 => Self::UInt16(PrimitiveArrayData::from_raw(builder)), + UInt32 => Self::UInt32(PrimitiveArrayData::from_raw(builder)), + UInt64 => Self::UInt64(PrimitiveArrayData::from_raw(builder)), + Float16 => Self::Float16(PrimitiveArrayData::from_raw(builder)), + Float32 => Self::Float32(PrimitiveArrayData::from_raw(builder)), + Float64 => Self::Float64(PrimitiveArrayData::from_raw(builder)), + } + } } impl From> for ArrayDataPrimitive { @@ -128,8 +191,8 @@ impl From> for ArrayDataPrimitive { #[derive(Debug, Clone)] pub struct PrimitiveArrayData { data_type: DataType, - nulls: Option, values: ScalarBuffer, + nulls: Option, } impl PrimitiveArrayData { @@ -138,20 +201,17 @@ impl PrimitiveArrayData { /// # Panics /// /// Panics if + /// - `PhysicalType::from(&data_type) != PhysicalType::Primitive(T::TYPE)` /// - `nulls` and `values` are different lengths - /// - `data_type` is not compatible with `T` pub fn new( data_type: DataType, values: ScalarBuffer, nulls: Option, ) -> Self { - let physical = PhysicalType::from(&data_type); - assert!( - matches!(physical, PhysicalType::Primitive(p) if p == T::TYPE), - "Illegal physical type for PrimitiveArrayData of datatype {:?}, expected {:?} got {:?}", - data_type, - T::TYPE, - physical + assert_eq!( + PhysicalType::from(&data_type), + PhysicalType::Primitive(T::TYPE), + "Illegal physical type for PrimitiveArrayData of datatype {data_type:?}", ); if let Some(n) = nulls.as_ref() { @@ -165,6 +225,39 @@ impl PrimitiveArrayData { } } + /// Create a new [`PrimitiveArrayData`] + /// + /// # Safety + /// + /// - `PhysicalType::from(&data_type) == PhysicalType::Primitive(T::TYPE)` + /// - `nulls` and `values` must be the same length + pub unsafe fn new_unchecked( + data_type: DataType, + values: ScalarBuffer, + nulls: Option, + ) -> Self { + Self { + data_type, + values, + nulls, + } + } + + /// Creates a new [`PrimitiveArrayData`] from an [`ArrayDataBuilder`] + /// + /// # Safety + /// + /// See [`PrimitiveArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let 
values = builder.buffers.into_iter().next().unwrap(); + let values = ScalarBuffer::new(values, builder.offset, builder.len); + Self { + values, + data_type: builder.data_type, + nulls: builder.nulls, + } + } + /// Returns the null buffer if any #[inline] pub fn nulls(&self) -> Option<&NullBuffer> { @@ -173,7 +266,7 @@ impl PrimitiveArrayData { /// Returns the primitive values #[inline] - pub fn values(&self) -> &[T] { + pub fn values(&self) -> &ScalarBuffer { &self.values } @@ -182,4 +275,30 @@ impl PrimitiveArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`PrimitiveArrayData`] + pub fn into_parts(self) -> (DataType, ScalarBuffer, Option) { + (self.data_type, self.values, self.nulls) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self { + data_type: self.data_type.clone(), + values: self.values.slice(offset, len), + nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.values.len(), + offset: 0, + nulls: self.nulls.as_ref(), + buffers: Buffers::one(self.values.inner()), + child_data: &[], + } + } } diff --git a/arrow-data/src/data/run.rs b/arrow-data/src/data/run.rs index cd993de1bf25..7f80206a70fa 100644 --- a/arrow-data/src/data/run.rs +++ b/arrow-data/src/data/run.rs @@ -15,17 +15,20 @@ // specific language governing permissions and limitations // under the License. +use crate::data::primitive::{Primitive, PrimitiveArrayData}; use crate::data::types::RunEndType; -use crate::ArrayData; -use arrow_buffer::buffer::ScalarBuffer; +use crate::data::ArrayDataLayout; +use crate::{ArrayData, ArrayDataBuilder, Buffers}; +use arrow_buffer::buffer::{RunEndBuffer, ScalarBuffer}; use arrow_buffer::ArrowNativeType; use arrow_schema::DataType; -use std::marker::PhantomData; mod private { use super::*; pub trait RunEndSealed { + const ENDS_TYPE: DataType; + /// Downcast [`ArrayDataRun`] to `[RunArrayData`] fn downcast_ref(data: &ArrayDataRun) -> Option<&RunArrayData> where @@ -43,7 +46,7 @@ mod private { } } -pub trait RunEnd: private::RunEndSealed + ArrowNativeType { +pub trait RunEnd: private::RunEndSealed + ArrowNativeType + Primitive { const TYPE: RunEndType; } @@ -53,6 +56,8 @@ macro_rules! run_end { const TYPE: RunEndType = RunEndType::$v; } impl private::RunEndSealed for $t { + const ENDS_TYPE: DataType = DataType::$v; + fn downcast_ref(data: &ArrayDataRun) -> Option<&RunArrayData> { match data { ArrayDataRun::$v(v) => Some(v), @@ -78,7 +83,19 @@ run_end!(i16, Int16); run_end!(i32, Int32); run_end!(i64, Int64); +/// Applies op to each variant of [`ArrayDataRun`] +macro_rules! 
run_op { + ($array:ident, $op:block) => { + match $array { + ArrayDataRun::Int16($array) => $op + ArrayDataRun::Int32($array) => $op + ArrayDataRun::Int64($array) => $op + } + }; +} + /// An enumeration of the types of [`RunArrayData`] +#[derive(Debug, Clone)] pub enum ArrayDataRun { Int16(RunArrayData), Int32(RunArrayData), @@ -88,26 +105,65 @@ pub enum ArrayDataRun { impl ArrayDataRun { /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`] pub fn downcast_ref(&self) -> Option<&RunArrayData> { - E::downcast_ref(self) + ::downcast_ref(self) } /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`] pub fn downcast(self) -> Option> { - E::downcast(self) + ::downcast(self) + } + + /// Returns the values of this [`ArrayDataRun`] + #[inline] + pub fn values(&self) -> &ArrayData { + let s = self; + run_op!(s, { s.values() }) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let s = self; + run_op!(s, { s.slice(offset, len).into() }) + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + let s = self; + run_op!(s, { s.layout() }) + } + + /// Creates a new [`ArrayDataRun`] from raw buffers + /// + /// # Safety + /// + /// See [`RunArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, run: RunEndType) -> Self { + use RunEndType::*; + match run { + Int16 => Self::Int16(RunArrayData::from_raw(builder)), + Int32 => Self::Int32(RunArrayData::from_raw(builder)), + Int64 => Self::Int64(RunArrayData::from_raw(builder)), + } } } impl From> for ArrayDataRun { fn from(value: RunArrayData) -> Self { - E::upcast(value) + ::upcast(value) } } /// ArrayData for [run-end encoded arrays](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout) +#[derive(Debug, Clone)] pub struct RunArrayData { data_type: DataType, - run_ends: ScalarBuffer, - child: Box, + run_ends: RunEndBuffer, + /// The children of this RunArrayData: + /// 1: the run ends + /// 2: the values + /// + /// We store an array so that a slice can be returned in [`RunArrayData::layout`] + children: Box<[ArrayData; 2]>, } impl RunArrayData { @@ -115,23 +171,68 @@ impl RunArrayData { /// /// # Safety /// - /// - `data_type` must be valid for this layout - /// - `run_ends` must contain monotonically increasing, positive values `<= child.len()` + /// - `PhysicalType::from(&data_type) == PhysicalType::Run(E::TYPE)` + /// - `run_ends` must contain monotonically increasing, positive values `<= len` + /// - `run_ends.get_end_physical_index() < values.len()` pub unsafe fn new_unchecked( data_type: DataType, - run_ends: ScalarBuffer, - child: ArrayData, + run_ends: RunEndBuffer, + values: ArrayData, ) -> Self { + let inner = run_ends.inner(); + let child = ArrayDataBuilder::new(E::ENDS_TYPE) + .len(inner.len()) + .buffers(vec![inner.inner().clone()]) + .build_unchecked(); + Self { data_type, run_ends, - child: Box::new(child), + children: Box::new([child, values]), } } + /// Creates a new [`RunArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`RunArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let mut iter = builder.child_data.into_iter(); + let child1 = iter.next().unwrap(); + let child2 = iter.next().unwrap(); + + let p = ScalarBuffer::new(child1.buffers[0].clone(), child1.offset, child1.len); + let run_ends = RunEndBuffer::new_unchecked(p, builder.offset, builder.len); + + Self { + 
run_ends, + data_type: builder.data_type, + children: Box::new([child1, child2]), + } + } + + /// Returns the length + #[inline] + pub fn len(&self) -> usize { + self.run_ends.len() + } + + /// Returns the offset + #[inline] + pub fn offset(&self) -> usize { + self.run_ends.offset() + } + + /// Returns true if this array is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.run_ends.is_empty() + } + /// Returns the run ends #[inline] - pub fn run_ends(&self) -> &[E] { + pub fn run_ends(&self) -> &RunEndBuffer { &self.run_ends } @@ -143,7 +244,34 @@ impl RunArrayData { /// Returns the child data #[inline] - pub fn child(&self) -> &ArrayData { - self.child.as_ref() + pub fn values(&self) -> &ArrayData { + &self.children[1] + } + + /// Returns the underlying parts of this [`RunArrayData`] + pub fn into_parts(self) -> (DataType, RunEndBuffer, ArrayData) { + let child = self.children.into_iter().nth(1).unwrap(); + (self.data_type, self.run_ends, child) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self { + data_type: self.data_type.clone(), + run_ends: self.run_ends.slice(offset, len), + children: self.children.clone(), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.run_ends.len(), + offset: self.run_ends.offset(), + nulls: None, + buffers: Buffers::default(), + child_data: self.children.as_ref(), + } } } diff --git a/arrow-data/src/data/struct.rs b/arrow-data/src/data/struct.rs index d9999261902e..229c10912a59 100644 --- a/arrow-data/src/data/struct.rs +++ b/arrow-data/src/data/struct.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use crate::ArrayData; +use crate::data::ArrayDataLayout; +use crate::{ArrayData, ArrayDataBuilder, Buffers}; use arrow_buffer::buffer::NullBuffer; use arrow_schema::DataType; @@ -33,7 +34,7 @@ impl StructArrayData { /// /// # Safety /// - /// - data_type must be a StructArray with fields matching `child_data` + /// - `PhysicalType::from(&data_type) == PhysicalType::Struct` /// - all child data and nulls must have length matching `len` pub unsafe fn new_unchecked( data_type: DataType, @@ -49,6 +50,26 @@ impl StructArrayData { } } + /// Creates a new [`StructArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`StructArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { + let children = builder + .child_data + .into_iter() + .map(|x| x.slice(builder.offset, builder.len)) + .collect(); + + Self { + data_type: builder.data_type, + len: builder.len, + nulls: builder.nulls, + children, + } + } + /// Returns the length of this [`StructArrayData`] #[inline] pub fn len(&self) -> usize { @@ -78,4 +99,31 @@ impl StructArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`StructArrayData`] + pub fn into_parts(self) -> (DataType, Option, Vec) { + (self.data_type, self.nulls, self.children) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + Self { + len, + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), + children: self.children.iter().map(|c| c.slice(offset, len)).collect(), + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + ArrayDataLayout { + data_type: &self.data_type, + len: self.len, + offset: 0, + nulls: self.nulls.as_ref(), + buffers: Buffers::default(), + child_data: &self.children, + } + } } diff --git a/arrow-data/src/data/types.rs b/arrow-data/src/data/types.rs index 3414e481ca66..bb65b42124f3 100644 --- a/arrow-data/src/data/types.rs +++ b/arrow-data/src/data/types.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use arrow_schema::{DataType, IntervalUnit}; +use arrow_schema::{DataType, IntervalUnit, UnionMode}; /// An enumeration of the primitive types implementing [`ArrowNativeType`](arrow_buffer::ArrowNativeType) #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] @@ -76,12 +76,12 @@ pub enum PhysicalType { Null, Boolean, Primitive(PrimitiveType), - FixedSizeBinary, + FixedSizeBinary(usize), Bytes(OffsetType, BytesType), - FixedSizeList, + FixedSizeList(usize), List(OffsetType), Struct, - Union, + Union(UnionMode), Dictionary(DictionaryKeyType), Run(RunEndType), } @@ -119,16 +119,16 @@ impl From<&DataType> for PhysicalType { DataType::Interval(IntervalUnit::MonthDayNano) => { Self::Primitive(PrimitiveType::Int128) } - DataType::FixedSizeBinary(_) => Self::FixedSizeBinary, + DataType::FixedSizeBinary(size) => Self::FixedSizeBinary(*size as usize), DataType::Binary => Self::Bytes(OffsetType::Int32, BytesType::Binary), DataType::LargeBinary => Self::Bytes(OffsetType::Int64, BytesType::Binary), DataType::Utf8 => Self::Bytes(OffsetType::Int32, BytesType::Utf8), DataType::LargeUtf8 => Self::Bytes(OffsetType::Int64, BytesType::Utf8), DataType::List(_) => Self::List(OffsetType::Int32), - DataType::FixedSizeList(_, _) => Self::FixedSizeList, + DataType::FixedSizeList(_, size) => Self::FixedSizeList(*size as usize), DataType::LargeList(_) => Self::List(OffsetType::Int64), DataType::Struct(_) => Self::Struct, - DataType::Union(_, _, _) => Self::Union, + DataType::Union(_, _, mode) => Self::Union(*mode), DataType::Dictionary(k, _) => match k.as_ref() { DataType::Int8 => Self::Dictionary(DictionaryKeyType::Int8), DataType::Int16 => Self::Dictionary(DictionaryKeyType::Int16), diff --git a/arrow-data/src/data/union.rs b/arrow-data/src/data/union.rs index 7861bd154e71..7d53a1f18067 100644 --- a/arrow-data/src/data/union.rs +++ b/arrow-data/src/data/union.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use crate::ArrayData; +use crate::data::ArrayDataLayout; +use crate::{ArrayData, ArrayDataBuilder, Buffers}; use arrow_buffer::buffer::ScalarBuffer; -use arrow_schema::DataType; +use arrow_schema::{DataType, UnionMode}; /// ArrayData for [union arrays](https://arrow.apache.org/docs/format/Columnar.html#union-layout) #[derive(Debug, Clone)] @@ -33,7 +34,8 @@ impl UnionArrayData { /// /// # Safety /// - /// - `data_type` must be valid for this layout + /// - `PhysicalType::from(&data_type) == PhysicalType::Union(mode)` + /// - `offsets` is `Some` iff the above `mode == UnionMode::Sparse` /// - `type_ids` must only contain values corresponding to a field in `data_type` /// - `children` must match the field definitions in `data_type` /// - For each value id in type_ids, the corresponding offset, must be in bounds for the child @@ -51,16 +53,62 @@ impl UnionArrayData { } } + /// Creates a new [`UnionArrayData`] from raw buffers + /// + /// # Safety + /// + /// See [`UnionArrayData::new_unchecked`] + pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, mode: UnionMode) -> Self { + match mode { + UnionMode::Sparse => { + let type_ids = builder.buffers.into_iter().next().unwrap(); + let type_ids = ScalarBuffer::new(type_ids, builder.offset, builder.len); + let children = builder + .child_data + .into_iter() + .map(|x| x.slice(builder.offset, builder.len)) + .collect(); + + Self { + type_ids, + children, + data_type: builder.data_type, + offsets: None, + } + } + UnionMode::Dense => { + let mut iter = builder.buffers.into_iter(); + let type_ids = iter.next().unwrap(); + let offsets = iter.next().unwrap(); + let type_ids = ScalarBuffer::new(type_ids, builder.offset, builder.len); + let offsets = ScalarBuffer::new(offsets, builder.offset, builder.len); + + Self { + type_ids, + data_type: builder.data_type, + offsets: Some(offsets), + children: builder.child_data, + } + } + } + } + + /// Returns the length of this array + #[inline] + pub fn len(&self) -> usize { + self.type_ids.len() + } + /// Returns the type ids for this array #[inline] - pub fn type_ids(&self) -> &[i8] { + pub fn type_ids(&self) -> &ScalarBuffer { &self.type_ids } /// Returns the offsets for this array if this is a dense union #[inline] - pub fn offsets(&self) -> Option<&[i32]> { - self.offsets.as_deref() + pub fn offsets(&self) -> Option<&ScalarBuffer> { + self.offsets.as_ref() } /// Returns the children of this array @@ -74,4 +122,50 @@ impl UnionArrayData { pub fn data_type(&self) -> &DataType { &self.data_type } + + /// Returns the underlying parts of this [`UnionArrayData`] + pub fn into_parts( + self, + ) -> ( + DataType, + ScalarBuffer, + Option>, + Vec, + ) { + (self.data_type, self.type_ids, self.offsets, self.children) + } + + /// Returns a zero-copy slice of this array + pub fn slice(&self, offset: usize, len: usize) -> Self { + let (offsets, children) = match &self.offsets { + Some(offsets) => (Some(offsets.slice(offset, len)), self.children.clone()), + None => ( + None, + self.children.iter().map(|c| c.slice(offset, len)).collect(), + ), + }; + Self { + data_type: self.data_type.clone(), + type_ids: self.type_ids.slice(offset, len), + offsets, + children, + } + } + + /// Returns an [`ArrayDataLayout`] representation of this + pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { + let buffers = match &self.offsets { + Some(offsets) => Buffers::two(self.type_ids.inner(), offsets.inner()), + None => Buffers::one(self.type_ids.inner()), + }; + + ArrayDataLayout { + data_type: &self.data_type, + len: 
self.type_ids.len(), + offset: 0, + nulls: None, + buffers, + child_data: &self.children, + } + } } From 61c4f12e84330db243789fc98375512d67628e57 Mon Sep 17 00:00:00 2001 From: bold Date: Fri, 10 Mar 2023 12:30:54 +0100 Subject: [PATCH 0673/1411] Support timestamp/time and date json decoding (#3835) * Support timestamp/time and date json decoding * Don't support timezones for now Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-json/src/raw/mod.rs | 191 +++++++++++++++++++++++++++++++++++++- 1 file changed, 187 insertions(+), 4 deletions(-) diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index 5b699b1d51fb..1ab879d203fb 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -30,7 +30,7 @@ use crate::raw::tape::{Tape, TapeDecoder, TapeElement}; use arrow_array::types::*; use arrow_array::{downcast_integer, make_array, RecordBatch, RecordBatchReader}; use arrow_data::ArrayData; -use arrow_schema::{ArrowError, DataType, SchemaRef}; +use arrow_schema::{ArrowError, DataType, SchemaRef, TimeUnit}; use std::io::BufRead; mod boolean_array; @@ -293,6 +293,16 @@ fn make_decoder( data_type => (primitive_decoder, data_type), DataType::Float32 => primitive_decoder!(Float32Type, data_type), DataType::Float64 => primitive_decoder!(Float64Type, data_type), + DataType::Timestamp(TimeUnit::Second, None) => primitive_decoder!(TimestampSecondType, data_type), + DataType::Timestamp(TimeUnit::Millisecond, None) => primitive_decoder!(TimestampMillisecondType, data_type), + DataType::Timestamp(TimeUnit::Microsecond, None) => primitive_decoder!(TimestampMicrosecondType, data_type), + DataType::Timestamp(TimeUnit::Nanosecond, None) => primitive_decoder!(TimestampNanosecondType, data_type), + DataType::Date32 => primitive_decoder!(Date32Type, data_type), + DataType::Date64 => primitive_decoder!(Date64Type, data_type), + DataType::Time32(TimeUnit::Second) => primitive_decoder!(Time32SecondType, data_type), + DataType::Time32(TimeUnit::Millisecond) => primitive_decoder!(Time32MillisecondType, data_type), + DataType::Time64(TimeUnit::Microsecond) => primitive_decoder!(Time64MicrosecondType, data_type), + DataType::Time64(TimeUnit::Nanosecond) => primitive_decoder!(Time64NanosecondType, data_type), DataType::Decimal128(p, s) => Ok(Box::new(DecimalArrayDecoder::::new(p, s))), DataType::Decimal256(p, s) => Ok(Box::new(DecimalArrayDecoder::::new(p, s))), DataType::Boolean => Ok(Box::::default()), @@ -373,10 +383,10 @@ mod tests { #[test] fn test_basic() { let buf = r#" - {"a": 1, "b": 2, "c": true} - {"a": 2E0, "b": 4, "c": false} + {"a": 1, "b": 2, "c": true, "d": 1} + {"a": 2E0, "b": 4, "c": false, "d": 2, "e": 254} - {"b": 6, "a": 2.0} + {"b": 6, "a": 2.0, "d": 45} {"b": "5", "a": 2} {"b": 4e0} {"b": 7, "a": null} @@ -386,6 +396,8 @@ mod tests { Field::new("a", DataType::Int64, true), Field::new("b", DataType::Int32, true), Field::new("c", DataType::Boolean, true), + Field::new("d", DataType::Date32, true), + Field::new("e", DataType::Date64, true), ])); let batches = do_read(buf, 1024, false, schema); @@ -407,6 +419,18 @@ mod tests { assert!(!col3.is_null(0)); assert!(!col3.value(1)); assert!(!col3.is_null(1)); + + let col4 = as_primitive_array::(batches[0].column(3)); + assert_eq!(col4.null_count(), 3); + assert!(col4.is_null(3)); + assert_eq!(col4.values(), &[1, 2, 45, 0, 0, 0]); + + let col5 = as_primitive_array::(batches[0].column(4)); + 
assert_eq!(col5.null_count(), 5); + assert!(col5.is_null(0)); + assert!(col5.is_null(2)); + assert!(col5.is_null(3)); + assert_eq!(col5.values(), &[0, 254, 0, 0, 0, 0]); } #[test] @@ -782,4 +806,163 @@ mod tests { test_decimal::(DataType::Decimal128(10, 2)); test_decimal::(DataType::Decimal256(10, 2)); } + + fn test_timestamp() { + let buf = r#" + {"a": 1, "b": "2020-09-08T13:42:29.190855+00:00", "c": 38.30} + {"a": 2, "b": "2020-09-08T13:42:29.190855Z", "c": 123.456} + + {"b": 1337, "b": "2020-09-08T13:42:29Z", "c": "1997-01-31T09:26:56.123"} + {"b": 40, "c": "2020-09-08T13:42:29.190855+00:00"} + {"b": 1234, "a": null, "c": "1997-01-31 09:26:56.123Z"} + {"c": "1997-01-31T14:26:56.123-05:00"} + "#; + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", T::DATA_TYPE, true), + Field::new("b", T::DATA_TYPE, true), + Field::new("c", T::DATA_TYPE, true), + ])); + + let batches = do_read(buf, 1024, true, schema); + assert_eq!(batches.len(), 1); + + let unit = match T::DATA_TYPE { + DataType::Timestamp(unit, _) => unit, + _ => unreachable!(), + }; + let unit_in_nanos = match unit { + TimeUnit::Second => 1_000_000_000, + TimeUnit::Millisecond => 1_000_000, + TimeUnit::Microsecond => 1_000, + TimeUnit::Nanosecond => 1, + }; + + let col1 = as_primitive_array::(batches[0].column(0)); + assert_eq!(col1.null_count(), 4); + assert!(col1.is_null(2)); + assert!(col1.is_null(3)); + assert!(col1.is_null(4)); + assert!(col1.is_null(5)); + assert_eq!(col1.values(), &[1, 2, 0, 0, 0, 0].map(T::Native::usize_as)); + + let col2 = as_primitive_array::(batches[0].column(1)); + assert_eq!(col2.null_count(), 1); + assert!(col2.is_null(5)); + assert_eq!( + col2.values(), + &[ + 1599572549190855000 / unit_in_nanos, + 1599572549190855000 / unit_in_nanos, + 1599572549000000000 / unit_in_nanos, + 40, + 1234, + 0 + ] + .map(T::Native::usize_as) + ); + + let col3 = as_primitive_array::(batches[0].column(2)); + assert_eq!(col3.null_count(), 0); + assert_eq!( + col3.values(), + &[ + 38, + 123, + 854702816123000000 / unit_in_nanos, + 1599572549190855000 / unit_in_nanos, + 854702816123000000 / unit_in_nanos, + 854738816123000000 / unit_in_nanos + ] + .map(T::Native::usize_as) + ); + } + + #[test] + fn test_timestamps() { + test_timestamp::(); + test_timestamp::(); + test_timestamp::(); + test_timestamp::(); + } + + fn test_time() { + let buf = r#" + {"a": 1, "b": "09:26:56.123 AM", "c": 38.30} + {"a": 2, "b": "23:59:59", "c": 123.456} + + {"b": 1337, "b": "6:00 pm", "c": "09:26:56.123"} + {"b": 40, "c": "13:42:29.190855"} + {"b": 1234, "a": null, "c": "09:26:56.123"} + {"c": "14:26:56.123"} + "#; + + let unit = match T::DATA_TYPE { + DataType::Time32(unit) | DataType::Time64(unit) => unit, + _ => unreachable!(), + }; + + let unit_in_nanos = match unit { + TimeUnit::Second => 1_000_000_000, + TimeUnit::Millisecond => 1_000_000, + TimeUnit::Microsecond => 1_000, + TimeUnit::Nanosecond => 1, + }; + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", T::DATA_TYPE, true), + Field::new("b", T::DATA_TYPE, true), + Field::new("c", T::DATA_TYPE, true), + ])); + + let batches = do_read(buf, 1024, true, schema); + assert_eq!(batches.len(), 1); + + let col1 = as_primitive_array::(batches[0].column(0)); + assert_eq!(col1.null_count(), 4); + assert!(col1.is_null(2)); + assert!(col1.is_null(3)); + assert!(col1.is_null(4)); + assert!(col1.is_null(5)); + assert_eq!(col1.values(), &[1, 2, 0, 0, 0, 0].map(T::Native::usize_as)); + + let col2 = as_primitive_array::(batches[0].column(1)); + assert_eq!(col2.null_count(), 1); + 
assert!(col2.is_null(5)); + assert_eq!( + col2.values(), + &[ + 34016123000000 / unit_in_nanos, + 86399000000000 / unit_in_nanos, + 64800000000000 / unit_in_nanos, + 40, + 1234, + 0 + ] + .map(T::Native::usize_as) + ); + + let col3 = as_primitive_array::(batches[0].column(2)); + assert_eq!(col3.null_count(), 0); + assert_eq!( + col3.values(), + &[ + 38, + 123, + 34016123000000 / unit_in_nanos, + 49349190855000 / unit_in_nanos, + 34016123000000 / unit_in_nanos, + 52016123000000 / unit_in_nanos + ] + .map(T::Native::usize_as) + ); + } + + #[test] + fn test_times() { + test_time::(); + test_time::(); + test_time::(); + test_time::(); + } } From 69c04db962b915b8a2d3783853da0ce95a94c0ef Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 10 Mar 2023 14:46:31 +0100 Subject: [PATCH 0674/1411] Make dictionary preservation optional in row encoding (#3831) * Make dictionary preservation optional in row encoding * Review feedback --- arrow-row/src/dictionary.rs | 20 +++- arrow-row/src/lib.rs | 208 ++++++++++++++++++++++++++++++------ arrow/benches/row_format.rs | 39 ++++--- 3 files changed, 223 insertions(+), 44 deletions(-) diff --git a/arrow-row/src/dictionary.rs b/arrow-row/src/dictionary.rs index e332e11316fd..bacc116cade7 100644 --- a/arrow-row/src/dictionary.rs +++ b/arrow-row/src/dictionary.rs @@ -17,7 +17,7 @@ use crate::fixed::{FixedLengthEncoding, FromSlice}; use crate::interner::{Interned, OrderPreservingInterner}; -use crate::{null_sentinel, Rows}; +use crate::{null_sentinel, Row, Rows}; use arrow_array::builder::*; use arrow_array::cast::*; use arrow_array::types::*; @@ -56,6 +56,24 @@ pub fn compute_dictionary_mapping( } } +/// Encode dictionary values not preserving the dictionary encoding +pub fn encode_dictionary_values( + out: &mut Rows, + column: &DictionaryArray, + values: &Rows, + null: &Row<'_>, +) { + for (offset, k) in out.offsets.iter_mut().skip(1).zip(column.keys()) { + let row = match k { + Some(k) => values.row(k.as_usize()).data, + None => null.data, + }; + let end_offset = *offset + row.len(); + out.buffer[*offset..end_offset].copy_from_slice(row); + *offset = end_offset; + } +} + /// Dictionary types are encoded as /// /// - single `0_u8` if null diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 2e489c974750..e4b02fbf230d 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -137,6 +137,7 @@ use arrow_schema::*; use crate::dictionary::{ compute_dictionary_mapping, decode_dictionary, encode_dictionary, + encode_dictionary_values, }; use crate::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive}; use crate::interner::OrderPreservingInterner; @@ -426,7 +427,14 @@ enum Codec { /// No additional codec state is necessary Stateless, /// The interner used to encode dictionary values + /// + /// Used when preserving the dictionary encoding Dictionary(OrderPreservingInterner), + /// A row converter for the dictionary values + /// and the encoding of a row containing only nulls + /// + /// Used when not preserving dictionary encoding + DictionaryValues(RowConverter, OwnedRow), /// A row converter for the child fields /// and the encoding of a row containing only nulls Struct(RowConverter, OwnedRow), @@ -437,7 +445,25 @@ enum Codec { impl Codec { fn new(sort_field: &SortField) -> Result { match &sort_field.data_type { - DataType::Dictionary(_, _) => Ok(Self::Dictionary(Default::default())), + DataType::Dictionary(_, values) => match sort_field.preserve_dictionaries { + true => 
Ok(Self::Dictionary(Default::default())), + false => { + let sort_field = SortField::new_with_options( + values.as_ref().clone(), + sort_field.options, + ); + + let mut converter = RowConverter::new(vec![sort_field])?; + let null_array = new_null_array(values.as_ref(), 1); + let nulls = converter.convert_columns(&[null_array])?; + + let owned = OwnedRow { + data: nulls.buffer, + config: nulls.config, + }; + Ok(Self::DictionaryValues(converter, owned)) + } + }, d if !d.is_nested() => Ok(Self::Stateless), DataType::List(f) | DataType::LargeList(f) => { // The encoded contents will be inverted if descending is set to true @@ -501,6 +527,15 @@ impl Codec { Ok(Encoder::Dictionary(mapping)) } + Codec::DictionaryValues(converter, nulls) => { + let values = downcast_dictionary_array! { + array => array.values(), + _ => unreachable!() + }; + + let rows = converter.convert_columns(&[values.clone()])?; + Ok(Encoder::DictionaryValues(rows, nulls.row())) + } Codec::Struct(converter, null) => { let v = as_struct_array(array); let rows = converter.convert_columns(v.columns())?; @@ -522,6 +557,9 @@ impl Codec { match self { Codec::Stateless => 0, Codec::Dictionary(interner) => interner.size(), + Codec::DictionaryValues(converter, nulls) => { + converter.size() + nulls.data.len() + } Codec::Struct(converter, nulls) => converter.size() + nulls.data.len(), Codec::List(converter) => converter.size(), } @@ -534,6 +572,8 @@ enum Encoder<'a> { Stateless, /// The mapping from dictionary keys to normalized keys Dictionary(Vec>), + /// The encoding of the child array and the encoding of a null row + DictionaryValues(Rows, Row<'a>), /// The row encoding of the child arrays and the encoding of a null row /// /// It is necessary to encode to a temporary [`Rows`] to avoid serializing @@ -551,6 +591,8 @@ pub struct SortField { options: SortOptions, /// Data type data_type: DataType, + /// Preserve dictionaries + preserve_dictionaries: bool, } impl SortField { @@ -561,7 +603,30 @@ impl SortField { /// Create a new column with the given data type and [`SortOptions`] pub fn new_with_options(data_type: DataType, options: SortOptions) -> Self { - Self { options, data_type } + Self { + options, + data_type, + preserve_dictionaries: true, + } + } + + /// By default dictionaries are preserved as described on [`RowConverter`] + /// + /// However, this process requires maintaining and incrementally updating + /// an order-preserving mapping of dictionary values. This is relatively expensive + /// computationally but reduces the size of the encoded rows, minimising memory + /// usage and potentially yielding faster comparisons. + /// + /// Some applications may wish to instead trade-off space efficiency, for improved + /// encoding performance, by instead encoding dictionary values directly + /// + /// When `preserve_dictionaries` is true, fields will instead be encoded as their + /// underlying value, reversing any dictionary encoding + pub fn preserve_dictionaries(self, preserve_dictionaries: bool) -> Self { + Self { + preserve_dictionaries, + ..self + } } /// Return size of this instance in bytes. @@ -1045,6 +1110,19 @@ fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig) -> _ => unreachable!(), } } + Encoder::DictionaryValues(values, null) => { + downcast_dictionary_array! 
{ + array => { + for (v, length) in array.keys().iter().zip(lengths.iter_mut()) { + *length += match v { + Some(k) => values.row(k.as_usize()).data.len(), + None => null.data.len(), + } + } + } + _ => unreachable!(), + } + } Encoder::Struct(rows, null) => { let array = as_struct_array(array); lengths.iter_mut().enumerate().for_each(|(idx, length)| { @@ -1143,6 +1221,12 @@ fn encode_column( _ => unreachable!() } } + Encoder::DictionaryValues(values, nulls) => { + downcast_dictionary_array! { + column => encode_dictionary_values(out, column, values, nulls), + _ => unreachable!() + } + } Encoder::Struct(rows, null) => { let array = as_struct_array(column); let null_sentinel = null_sentinel(opts); @@ -1221,6 +1305,10 @@ unsafe fn decode_column( _ => unreachable!() } } + Codec::DictionaryValues(converter, _) => { + let cols = converter.convert_raw(rows, validate_utf8)?; + cols.into_iter().next().unwrap() + } Codec::Struct(converter, _) => { let (null_count, nulls) = fixed::decode_nulls(rows); rows.iter_mut().for_each(|row| *row = &row[1..]); @@ -1557,8 +1645,25 @@ mod tests { assert_eq!(&cols[0], &col); } + /// If `exact` is false performs a logical comparison between a and dictionary-encoded b + fn dictionary_eq(exact: bool, a: &dyn Array, b: &dyn Array) { + match b.data_type() { + DataType::Dictionary(_, v) if !exact => { + assert_eq!(a.data_type(), v.as_ref()); + let b = arrow_cast::cast(b, v).unwrap(); + assert_eq!(a.data(), b.data()) + } + _ => assert_eq!(a.data(), b.data()), + } + } + #[test] fn test_string_dictionary() { + test_string_dictionary_impl(false); + test_string_dictionary_impl(true); + } + + fn test_string_dictionary_impl(preserve: bool) { let a = Arc::new(DictionaryArray::::from_iter([ Some("foo"), Some("hello"), @@ -1570,8 +1675,8 @@ mod tests { Some("hello"), ])) as ArrayRef; - let mut converter = - RowConverter::new(vec![SortField::new(a.data_type().clone())]).unwrap(); + let field = SortField::new(a.data_type().clone()).preserve_dictionaries(preserve); + let mut converter = RowConverter::new(vec![field]).unwrap(); let rows_a = converter.convert_columns(&[Arc::clone(&a)]).unwrap(); assert!(rows_a.row(3) < rows_a.row(5)); @@ -1584,7 +1689,7 @@ mod tests { assert_eq!(rows_a.row(1), rows_a.row(7)); let cols = converter.convert_rows(&rows_a).unwrap(); - assert_eq!(&cols[0], &a); + dictionary_eq(preserve, &cols[0], &a); let b = Arc::new(DictionaryArray::::from_iter([ Some("hello"), @@ -1598,7 +1703,7 @@ mod tests { assert!(rows_b.row(2) < rows_a.row(0)); let cols = converter.convert_rows(&rows_b).unwrap(); - assert_eq!(&cols[0], &b); + dictionary_eq(preserve, &cols[0], &b); let mut converter = RowConverter::new(vec![SortField::new_with_options( a.data_type().clone(), @@ -1606,7 +1711,8 @@ mod tests { descending: true, nulls_first: false, }, - )]) + ) + .preserve_dictionaries(preserve)]) .unwrap(); let rows_c = converter.convert_columns(&[Arc::clone(&a)]).unwrap(); @@ -1616,7 +1722,7 @@ mod tests { assert!(rows_c.row(3) > rows_c.row(0)); let cols = converter.convert_rows(&rows_c).unwrap(); - assert_eq!(&cols[0], &a); + dictionary_eq(preserve, &cols[0], &a); let mut converter = RowConverter::new(vec![SortField::new_with_options( a.data_type().clone(), @@ -1624,7 +1730,8 @@ mod tests { descending: true, nulls_first: true, }, - )]) + ) + .preserve_dictionaries(preserve)]) .unwrap(); let rows_c = converter.convert_columns(&[Arc::clone(&a)]).unwrap(); @@ -1634,7 +1741,7 @@ mod tests { assert!(rows_c.row(3) < rows_c.row(0)); let cols = converter.convert_rows(&rows_c).unwrap(); - 
assert_eq!(&cols[0], &a); + dictionary_eq(preserve, &cols[0], &a); } #[test] @@ -1694,15 +1801,19 @@ mod tests { builder.append(-1).unwrap(); let a = builder.finish(); - - let mut converter = - RowConverter::new(vec![SortField::new(a.data_type().clone())]).unwrap(); - let rows = converter.convert_columns(&[Arc::new(a)]).unwrap(); - assert!(rows.row(0) < rows.row(1)); - assert!(rows.row(2) < rows.row(0)); - assert!(rows.row(3) < rows.row(2)); - assert!(rows.row(6) < rows.row(2)); - assert!(rows.row(3) < rows.row(6)); + let data_type = a.data_type().clone(); + let columns = [Arc::new(a) as ArrayRef]; + + for preserve in [true, false] { + let field = SortField::new(data_type.clone()).preserve_dictionaries(preserve); + let mut converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&columns).unwrap(); + assert!(rows.row(0) < rows.row(1)); + assert!(rows.row(2) < rows.row(0)); + assert!(rows.row(3) < rows.row(2)); + assert!(rows.row(6) < rows.row(2)); + assert!(rows.row(3) < rows.row(6)); + } } #[test] @@ -1722,15 +1833,17 @@ mod tests { .build() .unwrap(); - let mut converter = RowConverter::new(vec![SortField::new(data_type)]).unwrap(); - let rows = converter - .convert_columns(&[Arc::new(DictionaryArray::::from(data))]) - .unwrap(); + let columns = [Arc::new(DictionaryArray::::from(data)) as ArrayRef]; + for preserve in [true, false] { + let field = SortField::new(data_type.clone()).preserve_dictionaries(preserve); + let mut converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&columns).unwrap(); - assert_eq!(rows.row(0), rows.row(1)); - assert_eq!(rows.row(3), rows.row(4)); - assert_eq!(rows.row(4), rows.row(5)); - assert!(rows.row(3) < rows.row(0)); + assert_eq!(rows.row(0), rows.row(1)); + assert_eq!(rows.row(3), rows.row(4)); + assert_eq!(rows.row(4), rows.row(5)); + assert!(rows.row(3) < rows.row(0)); + } } #[test] @@ -1974,6 +2087,35 @@ mod tests { test_nested_list::(); } + #[test] + fn test_dictionary_preserving() { + let mut dict = StringDictionaryBuilder::::new(); + dict.append_value("foo"); + dict.append_value("foo"); + dict.append_value("bar"); + dict.append_value("bar"); + dict.append_value("bar"); + dict.append_value("bar"); + + let array = Arc::new(dict.finish()) as ArrayRef; + let preserve = SortField::new(array.data_type().clone()); + let non_preserve = preserve.clone().preserve_dictionaries(false); + + let mut c1 = RowConverter::new(vec![preserve]).unwrap(); + let r1 = c1.convert_columns(&[array.clone()]).unwrap(); + + let mut c2 = RowConverter::new(vec![non_preserve]).unwrap(); + let r2 = c2.convert_columns(&[array.clone()]).unwrap(); + + for r in r1.iter() { + assert_eq!(r.data.len(), 3); + } + + for r in r2.iter() { + assert_eq!(r.data.len(), 34); + } + } + fn generate_primitive_array(len: usize, valid_percent: f64) -> PrimitiveArray where K: ArrowPrimitiveType, @@ -2129,12 +2271,18 @@ mod tests { }) .collect(); + let preserve: Vec<_> = (0..num_columns).map(|_| rng.gen_bool(0.5)).collect(); + let comparator = LexicographicalComparator::try_new(&sort_columns).unwrap(); let columns = options .into_iter() .zip(&arrays) - .map(|(o, a)| SortField::new_with_options(a.data_type().clone(), o)) + .zip(&preserve) + .map(|((o, a), p)| { + SortField::new_with_options(a.data_type().clone(), o) + .preserve_dictionaries(*p) + }) .collect(); let mut converter = RowConverter::new(columns).unwrap(); @@ -2160,9 +2308,9 @@ mod tests { } let back = converter.convert_rows(&rows).unwrap(); - for (actual, expected) in 
back.iter().zip(&arrays) { + for ((actual, expected), preserve) in back.iter().zip(&arrays).zip(preserve) { actual.data().validate_full().unwrap(); - assert_eq!(actual, expected) + dictionary_eq(preserve, actual, expected) } } } diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs index 961cf07de721..12ce71764f7e 100644 --- a/arrow/benches/row_format.rs +++ b/arrow/benches/row_format.rs @@ -30,10 +30,18 @@ use arrow_array::Array; use criterion::{black_box, Criterion}; use std::sync::Arc; -fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { +fn do_bench( + c: &mut Criterion, + name: &str, + cols: Vec, + preserve_dictionaries: bool, +) { let fields: Vec<_> = cols .iter() - .map(|x| SortField::new(x.data_type().clone())) + .map(|x| { + SortField::new(x.data_type().clone()) + .preserve_dictionaries(preserve_dictionaries) + }) .collect(); c.bench_function(&format!("convert_columns {name}"), |b| { @@ -57,42 +65,46 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { fn row_bench(c: &mut Criterion) { let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; - do_bench(c, "4096 u64(0)", cols); + do_bench(c, "4096 u64(0)", cols, true); let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; - do_bench(c, "4096 i64(0)", cols); + do_bench(c, "4096 i64(0)", cols, true); let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 10)) as ArrayRef]; - do_bench(c, "4096 string(10, 0)", cols); + do_bench(c, "4096 string(10, 0)", cols, true); let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef]; - do_bench(c, "4096 string(30, 0)", cols); + do_bench(c, "4096 string(30, 0)", cols, true); let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 100)) as ArrayRef]; - do_bench(c, "4096 string(100, 0)", cols); + do_bench(c, "4096 string(100, 0)", cols, true); let cols = vec![Arc::new(create_string_array_with_len::(4096, 0.5, 100)) as ArrayRef]; - do_bench(c, "4096 string(100, 0.5)", cols); + do_bench(c, "4096 string(100, 0.5)", cols, true); let cols = vec![Arc::new(create_string_dict_array::(4096, 0., 10)) as ArrayRef]; - do_bench(c, "4096 string_dictionary(10, 0)", cols); + do_bench(c, "4096 string_dictionary(10, 0)", cols, true); let cols = vec![Arc::new(create_string_dict_array::(4096, 0., 30)) as ArrayRef]; - do_bench(c, "4096 string_dictionary(30, 0)", cols); + do_bench(c, "4096 string_dictionary(30, 0)", cols, true); let cols = vec![Arc::new(create_string_dict_array::(4096, 0., 100)) as ArrayRef]; - do_bench(c, "4096 string_dictionary(100, 0)", cols); + do_bench(c, "4096 string_dictionary(100, 0)", cols.clone(), true); + let name = "4096 string_dictionary_non_preserving(100, 0)"; + do_bench(c, name, cols, false); let cols = vec![Arc::new(create_string_dict_array::(4096, 0.5, 100)) as ArrayRef]; - do_bench(c, "4096 string_dictionary(100, 0.5)", cols); + do_bench(c, "4096 string_dictionary(100, 0.5)", cols.clone(), true); + let name = "4096 string_dictionary_non_preserving(100, 0.5)"; + do_bench(c, name, cols, false); let cols = vec![ Arc::new(create_string_array_with_len::(4096, 0.5, 20)) as ArrayRef, @@ -104,6 +116,7 @@ fn row_bench(c: &mut Criterion) { c, "4096 string(20, 0.5), string(30, 0), string(100, 0), i64(0)", cols, + false, ); let cols = vec![ @@ -112,7 +125,7 @@ fn row_bench(c: &mut Criterion) { Arc::new(create_string_dict_array::(4096, 0., 100)) as ArrayRef, Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef, ]; - do_bench(c, "4096 4096 string_dictionary(20, 0.5), 
string_dictionary(30, 0), string_dictionary(100, 0), i64(0)", cols); + do_bench(c, "4096 4096 string_dictionary(20, 0.5), string_dictionary(30, 0), string_dictionary(100, 0), i64(0)", cols, false); } criterion_group!(benches, row_bench); From 5f402752f8695eda88b1b6f7342a391ae58e21cc Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Fri, 10 Mar 2023 14:48:42 +0100 Subject: [PATCH 0675/1411] refactor: timestamp overflow check (#3840) * refactor: timestamp overflow check * update tests * refactor: cast based on cast_option --- arrow-cast/src/cast.rs | 43 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 35bf62969851..0a4b88ec89f6 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -1682,7 +1682,11 @@ pub fn cast_with_options( Ordering::Equal => time_array.clone(), Ordering::Less => { let mul = to_size / from_size; - time_array.unary::<_, Int64Type>(|o| o * mul) + if cast_options.safe { + time_array.unary_opt::<_, Int64Type>(|o| o.checked_mul(mul)) + } else { + time_array.try_unary::<_, Int64Type, _>(|o| o.mul_checked(mul))? + } } }; Ok(make_timestamp_array( @@ -1709,8 +1713,22 @@ pub fn cast_with_options( Ok(Arc::new(b.finish()) as ArrayRef) } (Timestamp(TimeUnit::Second, _), Date64) => Ok(Arc::new( - as_primitive_array::(array) - .unary::<_, Date64Type>(|x| x * MILLISECONDS), + match cast_options.safe { + true => { + // change error to None + as_primitive_array::(array) + .unary_opt::<_, Date64Type>(|x| { + x.checked_mul(MILLISECONDS) + }) + } + false => { + as_primitive_array::(array).try_unary::<_, Date64Type, _>( + |x| { + x.mul_checked(MILLISECONDS) + }, + )? + } + }, )), (Timestamp(TimeUnit::Millisecond, _), Date64) => { cast_reinterpret_arrays::(array) @@ -5330,6 +5348,23 @@ mod tests { assert_eq!(864000000005, c.value(0)); assert_eq!(1545696000001, c.value(1)); assert!(c.is_null(2)); + + let array = + TimestampSecondArray::from(vec![Some(864000000005), Some(1545696000001)]); + let b = cast(&array, &DataType::Date64).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(864000000005000, c.value(0)); + assert_eq!(1545696000001000, c.value(1)); + + // test overflow, safe cast + let array = TimestampSecondArray::from(vec![Some(i64::MAX)]); + let b = cast(&array, &DataType::Date64).unwrap(); + assert!(b.is_null(0)); + // test overflow, unsafe cast + let array = TimestampSecondArray::from(vec![Some(i64::MAX)]); + let options = CastOptions { safe: false }; + let b = cast_with_options(&array, &DataType::Date64, &options); + assert!(b.is_err()); } #[test] @@ -5406,6 +5441,8 @@ mod tests { assert!(b.is_err()); let b = cast(&array, &DataType::Time64(TimeUnit::Nanosecond)); assert!(b.is_err()); + let b = cast(&array, &DataType::Time64(TimeUnit::Millisecond)); + assert!(b.is_err()); } #[test] From 0028acd8c1791f7b766db669d93e9281200a71c9 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner <14581281+iajoiner@users.noreply.github.com> Date: Fri, 10 Mar 2023 10:06:14 -0500 Subject: [PATCH 0676/1411] Prep for 35.0.0 (#3836) * Update version * update change log script --- arrow-arith/Cargo.toml | 10 +++---- arrow-array/Cargo.toml | 8 +++--- arrow-buffer/Cargo.toml | 2 +- arrow-cast/Cargo.toml | 12 ++++----- arrow-csv/Cargo.toml | 12 ++++----- arrow-data/Cargo.toml | 6 ++--- arrow-flight/Cargo.toml | 14 +++++----- arrow-flight/README.md | 2 +- arrow-integration-test/Cargo.toml | 6 ++--- arrow-integration-testing/Cargo.toml | 2 +- arrow-ipc/Cargo.toml | 12 ++++----- 
arrow-json/Cargo.toml | 12 ++++----- arrow-ord/Cargo.toml | 12 ++++----- arrow-pyarrow-integration-testing/Cargo.toml | 4 +-- arrow-row/Cargo.toml | 14 +++++----- arrow-schema/Cargo.toml | 2 +- arrow-select/Cargo.toml | 10 +++---- arrow-string/Cargo.toml | 12 ++++----- arrow/Cargo.toml | 28 ++++++++++---------- arrow/README.md | 2 +- dev/release/README.md | 2 +- dev/release/file_release_pr.sh | 4 +-- dev/release/update_change_log.sh | 4 +-- parquet/Cargo.toml | 20 +++++++------- parquet_derive/Cargo.toml | 4 +-- parquet_derive/README.md | 4 +-- parquet_derive_test/Cargo.toml | 6 ++--- 27 files changed, 113 insertions(+), 113 deletions(-) diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index 6b3d82c9c906..4360332d9c7a 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-arith" -version = "34.0.0" +version = "35.0.0" description = "Arrow arithmetic kernels" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "34.0.0", path = "../arrow-array" } -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } -arrow-data = { version = "34.0.0", path = "../arrow-data" } -arrow-schema = { version = "34.0.0", path = "../arrow-schema" } +arrow-array = { version = "35.0.0", path = "../arrow-array" } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } +arrow-data = { version = "35.0.0", path = "../arrow-data" } +arrow-schema = { version = "35.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 5f839426edba..1675f59838a7 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-array" -version = "34.0.0" +version = "35.0.0" description = "Array abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,9 +45,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "34.0.0", path = "../arrow-schema" } -arrow-data = { version = "34.0.0", path = "../arrow-data" } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "35.0.0", path = "../arrow-schema" } +arrow-data = { version = "35.0.0", path = "../arrow-data" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index 63e5aaa4476d..699a1000132f 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "34.0.0" +version = "35.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 79c073b9dd4f..235dca135e5a 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -17,7 +17,7 @@ 
[package] name = "arrow-cast" -version = "34.0.0" +version = "35.0.0" description = "Cast kernel and utilities for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,11 +44,11 @@ features = ["prettyprint"] prettyprint = ["comfy-table"] [dependencies] -arrow-array = { version = "34.0.0", path = "../arrow-array" } -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } -arrow-data = { version = "34.0.0", path = "../arrow-data" } -arrow-schema = { version = "34.0.0", path = "../arrow-schema" } -arrow-select = { version = "34.0.0", path = "../arrow-select" } +arrow-array = { version = "35.0.0", path = "../arrow-array" } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } +arrow-data = { version = "35.0.0", path = "../arrow-data" } +arrow-schema = { version = "35.0.0", path = "../arrow-schema" } +arrow-select = { version = "35.0.0", path = "../arrow-select" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 62ca69bcaf9b..7ceb1401d1c0 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-csv" -version = "34.0.0" +version = "35.0.0" description = "Support for parsing CSV format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "34.0.0", path = "../arrow-array" } -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "34.0.0", path = "../arrow-cast" } -arrow-data = { version = "34.0.0", path = "../arrow-data" } -arrow-schema = { version = "34.0.0", path = "../arrow-schema" } +arrow-array = { version = "35.0.0", path = "../arrow-array" } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "35.0.0", path = "../arrow-cast" } +arrow-data = { version = "35.0.0", path = "../arrow-data" } +arrow-schema = { version = "35.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } csv-core = { version = "0.1"} diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index 33de17339131..d58413a762bd 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-data" -version = "34.0.0" +version = "35.0.0" description = "Array data abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -50,8 +50,8 @@ features = ["ffi"] [dependencies] -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "34.0.0", path = "../arrow-schema" } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "35.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 819818191c83..827be6d058df 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = 
"arrow-flight" description = "Apache Arrow Flight" -version = "34.0.0" +version = "35.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,12 +27,12 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow-array = { version = "34.0.0", path = "../arrow-array" } -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } +arrow-array = { version = "35.0.0", path = "../arrow-array" } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } # Cast is needed to work around https://github.com/apache/arrow-rs/issues/3389 -arrow-cast = { version = "34.0.0", path = "../arrow-cast" } -arrow-ipc = { version = "34.0.0", path = "../arrow-ipc" } -arrow-schema = { version = "34.0.0", path = "../arrow-schema" } +arrow-cast = { version = "35.0.0", path = "../arrow-cast" } +arrow-ipc = { version = "35.0.0", path = "../arrow-ipc" } +arrow-schema = { version = "35.0.0", path = "../arrow-schema" } base64 = { version = "0.21", default-features = false, features = ["std"] } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } @@ -58,7 +58,7 @@ tls = ["tonic/tls"] cli = ["arrow-cast/prettyprint", "clap", "tracing-log", "tracing-subscriber", "tonic/tls-webpki-roots"] [dev-dependencies] -arrow-cast = { version = "34.0.0", path = "../arrow-cast", features = ["prettyprint"] } +arrow-cast = { version = "35.0.0", path = "../arrow-cast", features = ["prettyprint"] } assert_cmd = "2.0.8" tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 1f8026887485..41312cc0c559 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "34.0.0" +arrow-flight = "35.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. 
diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index 2d92e6292ded..ca14401b6899 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-integration-test" -version = "34.0.0" +version = "35.0.0" description = "Support for the Apache Arrow JSON test data format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,8 +38,8 @@ path = "src/lib.rs" bench = false [dependencies] -arrow = { version = "34.0.0", path = "../arrow", default-features = false } -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } +arrow = { version = "35.0.0", path = "../arrow", default-features = false } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 67d5b7d2745a..48700bbe90d3 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests (NOT PUBLISHED TO crates.io)" -version = "34.0.0" +version = "35.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index 040d1c113a5c..8bd7d31485e4 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ipc" -version = "34.0.0" +version = "35.0.0" description = "Support for the Arrow IPC format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "34.0.0", path = "../arrow-array" } -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "34.0.0", path = "../arrow-cast" } -arrow-data = { version = "34.0.0", path = "../arrow-data" } -arrow-schema = { version = "34.0.0", path = "../arrow-schema" } +arrow-array = { version = "35.0.0", path = "../arrow-array" } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "35.0.0", path = "../arrow-cast" } +arrow-data = { version = "35.0.0", path = "../arrow-data" } +arrow-schema = { version = "35.0.0", path = "../arrow-schema" } flatbuffers = { version = "23.1.21", default-features = false } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.12.0", default-features = false, optional = true } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 3869bfd90b19..92c1a3eb282f 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-json" -version = "34.0.0" +version = "35.0.0" description = "Support for parsing JSON format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "34.0.0", path = "../arrow-array" } -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "34.0.0", path = "../arrow-cast" } 
-arrow-data = { version = "34.0.0", path = "../arrow-data" } -arrow-schema = { version = "34.0.0", path = "../arrow-schema" } +arrow-array = { version = "35.0.0", path = "../arrow-array" } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "35.0.0", path = "../arrow-cast" } +arrow-data = { version = "35.0.0", path = "../arrow-data" } +arrow-schema = { version = "35.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index 7e7ec7d4fedd..bc6feb4f2a29 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ord" -version = "34.0.0" +version = "35.0.0" description = "Ordering kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "34.0.0", path = "../arrow-array" } -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } -arrow-data = { version = "34.0.0", path = "../arrow-data" } -arrow-schema = { version = "34.0.0", path = "../arrow-schema" } -arrow-select = { version = "34.0.0", path = "../arrow-select" } +arrow-array = { version = "35.0.0", path = "../arrow-array" } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } +arrow-data = { version = "35.0.0", path = "../arrow-data" } +arrow-schema = { version = "35.0.0", path = "../arrow-schema" } +arrow-select = { version = "35.0.0", path = "../arrow-select" } num = { version = "0.4", default-features = false, features = ["std"] } [dev-dependencies] diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index cbf2e9cf29d9..ba084c435e64 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "34.0.0" +version = "35.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,5 +32,5 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "34.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "35.0.0", features = ["pyarrow"] } pyo3 = { version = "0.18", features = ["extension-module"] } diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 3ddc195c39a0..e2796fbe134c 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-row" -version = "34.0.0" +version = "35.0.0" description = "Arrow row format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,17 +44,17 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "34.0.0", path = "../arrow-array" } -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } -arrow-data = { version = "34.0.0", path = "../arrow-data" } -arrow-schema = { version = "34.0.0", path = "../arrow-schema" } +arrow-array = { version = "35.0.0", path = "../arrow-array" } +arrow-buffer = { version = 
"35.0.0", path = "../arrow-buffer" } +arrow-data = { version = "35.0.0", path = "../arrow-data" } +arrow-schema = { version = "35.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } hashbrown = { version = "0.13", default-features = false } [dev-dependencies] -arrow-cast = { version = "34.0.0", path = "../arrow-cast" } -arrow-ord = { version = "34.0.0", path = "../arrow-ord" } +arrow-cast = { version = "35.0.0", path = "../arrow-cast" } +arrow-ord = { version = "35.0.0", path = "../arrow-ord" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } [features] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index acf6c43b8342..7b240e1ac69c 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-schema" -version = "34.0.0" +version = "35.0.0" description = "Defines the logical types for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml index 540d37cb5aa8..35c51c2da3ea 100644 --- a/arrow-select/Cargo.toml +++ b/arrow-select/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-select" -version = "34.0.0" +version = "35.0.0" description = "Selection kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } -arrow-data = { version = "34.0.0", path = "../arrow-data" } -arrow-schema = { version = "34.0.0", path = "../arrow-schema" } -arrow-array = { version = "34.0.0", path = "../arrow-array" } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } +arrow-data = { version = "35.0.0", path = "../arrow-data" } +arrow-schema = { version = "35.0.0", path = "../arrow-schema" } +arrow-array = { version = "35.0.0", path = "../arrow-array" } num = { version = "0.4", default-features = false, features = ["std"] } [features] diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index 2e8067051644..923b8e8c00c4 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-string" -version = "34.0.0" +version = "35.0.0" description = "String kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } -arrow-data = { version = "34.0.0", path = "../arrow-data" } -arrow-schema = { version = "34.0.0", path = "../arrow-schema" } -arrow-array = { version = "34.0.0", path = "../arrow-array" } -arrow-select = { version = "34.0.0", path = "../arrow-select" } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } +arrow-data = { version = "35.0.0", path = "../arrow-data" } +arrow-schema = { version = "35.0.0", path = "../arrow-schema" } +arrow-array = { version = "35.0.0", path = "../arrow-array" } +arrow-select = { version = "35.0.0", path = "../arrow-select" } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 0c387f305a8e..8814f233bad1 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] 
name = "arrow" -version = "34.0.0" +version = "35.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,19 +45,19 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-arith = { version = "34.0.0", path = "../arrow-arith" } -arrow-array = { version = "34.0.0", path = "../arrow-array" } -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "34.0.0", path = "../arrow-cast" } -arrow-csv = { version = "34.0.0", path = "../arrow-csv", optional = true } -arrow-data = { version = "34.0.0", path = "../arrow-data" } -arrow-ipc = { version = "34.0.0", path = "../arrow-ipc", optional = true } -arrow-json = { version = "34.0.0", path = "../arrow-json", optional = true } -arrow-ord = { version = "34.0.0", path = "../arrow-ord" } -arrow-row = { version = "34.0.0", path = "../arrow-row" } -arrow-schema = { version = "34.0.0", path = "../arrow-schema" } -arrow-select = { version = "34.0.0", path = "../arrow-select" } -arrow-string = { version = "34.0.0", path = "../arrow-string" } +arrow-arith = { version = "35.0.0", path = "../arrow-arith" } +arrow-array = { version = "35.0.0", path = "../arrow-array" } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "35.0.0", path = "../arrow-cast" } +arrow-csv = { version = "35.0.0", path = "../arrow-csv", optional = true } +arrow-data = { version = "35.0.0", path = "../arrow-data" } +arrow-ipc = { version = "35.0.0", path = "../arrow-ipc", optional = true } +arrow-json = { version = "35.0.0", path = "../arrow-json", optional = true } +arrow-ord = { version = "35.0.0", path = "../arrow-ord" } +arrow-row = { version = "35.0.0", path = "../arrow-row" } +arrow-schema = { version = "35.0.0", path = "../arrow-schema" } +arrow-select = { version = "35.0.0", path = "../arrow-select" } +arrow-string = { version = "35.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } pyo3 = { version = "0.18", default-features = false, optional = true } diff --git a/arrow/README.md b/arrow/README.md index 6d0772e2d956..479213833244 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `34.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `35.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. 
## Feature Flags diff --git a/dev/release/README.md b/dev/release/README.md index 70921dd024da..c7c14b8d58c1 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. -sed -i '' -e 's/14.0.0/34.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/35.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md diff --git a/dev/release/file_release_pr.sh b/dev/release/file_release_pr.sh index 2db3d7986d3f..081b7c436aa5 100644 --- a/dev/release/file_release_pr.sh +++ b/dev/release/file_release_pr.sh @@ -25,8 +25,8 @@ set -e -FUTURE_RELEASE="29.0.0" -ISSUE_NUMBER=3216 +FUTURE_RELEASE="35.0.0" +ISSUE_NUMBER=3830 TITLE="Update version to \`$FUTURE_RELEASE\` and update \`CHANGELOG\`" BODY="# Which issue does this PR close?\n\nCloses #$ISSUE_NUMBER.\n\n# Rationale for this change\nPrepare for biweekly release\n\n# What changes are included in this PR?\n\n# Are there any user-facing changes?\nYes" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 920498905ccd..b01d190a4f38 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="33.0.0" -FUTURE_RELEASE="34.0.0" +SINCE_TAG="34.0.0" +FUTURE_RELEASE="35.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 87f552fbd36a..a822a966f29d 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "34.0.0" +version = "35.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -35,14 +35,14 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "34.0.0", path = "../arrow-array", default-features = false, optional = true } -arrow-buffer = { version = "34.0.0", path = "../arrow-buffer", default-features = false, optional = true } -arrow-cast = { version = "34.0.0", path = "../arrow-cast", default-features = false, optional = true } -arrow-csv = { version = "34.0.0", path = "../arrow-csv", default-features = false, optional = true } -arrow-data = { version = "34.0.0", path = "../arrow-data", default-features = false, optional = true } -arrow-schema = { version = "34.0.0", path = "../arrow-schema", default-features = false, optional = true } -arrow-select = { version = "34.0.0", path = "../arrow-select", default-features = false, optional = true } -arrow-ipc = { version = "34.0.0", path = "../arrow-ipc", default-features = false, optional = true } +arrow-array = { version = "35.0.0", path = "../arrow-array", default-features = false, optional = true } +arrow-buffer = { version = "35.0.0", path = "../arrow-buffer", default-features = false, optional = true } +arrow-cast = { version = "35.0.0", path = "../arrow-cast", default-features = false, optional = true } +arrow-csv = { version = "35.0.0", path = "../arrow-csv", default-features = false, optional = true } +arrow-data = { version = "35.0.0", path = "../arrow-data", default-features = false, 
optional = true } +arrow-schema = { version = "35.0.0", path = "../arrow-schema", default-features = false, optional = true } +arrow-select = { version = "35.0.0", path = "../arrow-select", default-features = false, optional = true } +arrow-ipc = { version = "35.0.0", path = "../arrow-ipc", default-features = false, optional = true } object_store = { version = "0.5", path = "../object_store", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } @@ -76,7 +76,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.12", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "34.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "35.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index cb16846b0fb1..e41ba19086d7 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "34.0.0" +version = "35.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", features = ["extra-traits"] } -parquet = { path = "../parquet", version = "34.0.0", default-features = false } +parquet = { path = "../parquet", version = "35.0.0", default-features = false } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index f3f66c45bc98..2bed2d550e62 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "34.0.0" -parquet_derive = "34.0.0" +parquet = "35.0.0" +parquet_derive = "35.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index 33f7675a30ef..cca778d6f51b 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "34.0.0" +version = "35.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "34.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "34.0.0", default-features = false } +parquet = { path = "../parquet", version = "35.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "35.0.0", default-features = false } chrono = { version="0.4.23", default-features = false, features = [ "clock" ] } From c96274a562625f091ca4c06fca21ac35ef330358 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 10 Mar 2023 19:33:30 +0100 Subject: [PATCH 0677/1411] Update changelog for 35.0.0 
(#3843) --- CHANGELOG-old.md | 55 ++++++++++++++++++++++++++++ CHANGELOG.md | 95 +++++++++++++++++++++++++++--------------------- 2 files changed, 109 insertions(+), 41 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 9b9df494efb2..2d7903e96a7d 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,61 @@ # Historical Changelog +## [34.0.0](https://github.com/apache/arrow-rs/tree/34.0.0) (2023-02-24) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/33.0.0...34.0.0) + +**Breaking changes:** + +- Infer 2020-03-19 00:00:00 as timestamp not Date64 in CSV \(\#3744\) [\#3746](https://github.com/apache/arrow-rs/pull/3746) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement fallible streams for `FlightClient::do_put` [\#3464](https://github.com/apache/arrow-rs/pull/3464) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) + +**Implemented enhancements:** + +- Support casting string to timestamp with microsecond resolution [\#3751](https://github.com/apache/arrow-rs/issues/3751) +- Add datatime/interval/duration into comparison kernels [\#3729](https://github.com/apache/arrow-rs/issues/3729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- ! \(not\) operator overload for SortOptions [\#3726](https://github.com/apache/arrow-rs/issues/3726) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet: convert Bytes to ByteArray directly [\#3719](https://github.com/apache/arrow-rs/issues/3719) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Implement simple RecordBatchReader [\#3704](https://github.com/apache/arrow-rs/issues/3704) +- Is possible to implement GenericListArray::from\_iter ? 
[\#3702](https://github.com/apache/arrow-rs/issues/3702) +- `take_run` improvements [\#3701](https://github.com/apache/arrow-rs/issues/3701) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `as_mut_any` in Array trait [\#3655](https://github.com/apache/arrow-rs/issues/3655) +- `Array` --\> `Display` formatter that supports more options and is configurable [\#3638](https://github.com/apache/arrow-rs/issues/3638) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow-csv: support decimal256 [\#3474](https://github.com/apache/arrow-rs/issues/3474) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- CSV reader infers Date64 type for fields like "2020-03-19 00:00:00" that it can't parse to Date64 [\#3744](https://github.com/apache/arrow-rs/issues/3744) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Update to 34.0.0 and update changelog [\#3757](https://github.com/apache/arrow-rs/pull/3757) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) +- Update MIRI for split crates \(\#2594\) [\#3754](https://github.com/apache/arrow-rs/pull/3754) ([tustvold](https://github.com/tustvold)) +- Update prost-build requirement from =0.11.6 to =0.11.7 [\#3753](https://github.com/apache/arrow-rs/pull/3753) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Enable casting of string to timestamp with microsecond resolution [\#3752](https://github.com/apache/arrow-rs/pull/3752) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) +- Use Typed Buffers in Arrays \(\#1811\) \(\#1176\) [\#3743](https://github.com/apache/arrow-rs/pull/3743) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup arithmetic kernel type constraints [\#3739](https://github.com/apache/arrow-rs/pull/3739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Make dictionary kernels optional for comparison benchmark [\#3738](https://github.com/apache/arrow-rs/pull/3738) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support String Coercion in Raw JSON Reader [\#3736](https://github.com/apache/arrow-rs/pull/3736) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rguerreiromsft](https://github.com/rguerreiromsft)) +- replace for loop by try\_for\_each [\#3734](https://github.com/apache/arrow-rs/pull/3734) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([suxiaogang223](https://github.com/suxiaogang223)) +- feat: implement generic record batch reader [\#3733](https://github.com/apache/arrow-rs/pull/3733) ([wjones127](https://github.com/wjones127)) +- \[minor\] fix doc test fail [\#3732](https://github.com/apache/arrow-rs/pull/3732) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Add datetime/interval/duration into dyn scalar comparison [\#3730](https://github.com/apache/arrow-rs/pull/3730) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([viirya](https://github.com/viirya)) +- Using Borrow\ on infer\_json\_schema\_from\_iterator [\#3728](https://github.com/apache/arrow-rs/pull/3728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rguerreiromsft](https://github.com/rguerreiromsft)) +- Not operator overload for SortOptions [\#3727](https://github.com/apache/arrow-rs/pull/3727) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([berkaysynnada](https://github.com/berkaysynnada)) +- fix: encoding batch with no columns [\#3724](https://github.com/apache/arrow-rs/pull/3724) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([wangrunji0408](https://github.com/wangrunji0408)) +- feat: impl `Ord`/`PartialOrd` for `SortOptions` [\#3723](https://github.com/apache/arrow-rs/pull/3723) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Add From\ for ByteArray [\#3720](https://github.com/apache/arrow-rs/pull/3720) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Deprecate old JSON reader \(\#3610\) [\#3718](https://github.com/apache/arrow-rs/pull/3718) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add pretty format with options [\#3717](https://github.com/apache/arrow-rs/pull/3717) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove unreachable decimal take [\#3716](https://github.com/apache/arrow-rs/pull/3716) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Feat: arrow csv decimal256 [\#3711](https://github.com/apache/arrow-rs/pull/3711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([suxiaogang223](https://github.com/suxiaogang223)) +- perf: `take_run` improvements [\#3705](https://github.com/apache/arrow-rs/pull/3705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- Add raw MapArrayReader [\#3703](https://github.com/apache/arrow-rs/pull/3703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: Sort kernel for `RunArray` [\#3695](https://github.com/apache/arrow-rs/pull/3695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- perf: Remove sorting to yield sorted\_rank [\#3693](https://github.com/apache/arrow-rs/pull/3693) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- fix: Handle sliced array in run array iterator [\#3681](https://github.com/apache/arrow-rs/pull/3681) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) + ## [33.0.0](https://github.com/apache/arrow-rs/tree/33.0.0) (2023-02-10) [Full Changelog](https://github.com/apache/arrow-rs/compare/32.0.0...33.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 10a969dca15d..4a7700ca773f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,60 +19,73 @@ # Changelog -## [34.0.0](https://github.com/apache/arrow-rs/tree/34.0.0) (2023-02-24) +## [35.0.0](https://github.com/apache/arrow-rs/tree/35.0.0) (2023-03-10) -[Full Changelog](https://github.com/apache/arrow-rs/compare/33.0.0...34.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/34.0.0...35.0.0) **Breaking 
changes:** -- Infer 2020-03-19 00:00:00 as timestamp not Date64 in CSV \(\#3744\) [\#3746](https://github.com/apache/arrow-rs/pull/3746) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Implement fallible streams for `FlightClient::do_put` [\#3464](https://github.com/apache/arrow-rs/pull/3464) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Add RunEndBuffer \(\#1799\) [\#3817](https://github.com/apache/arrow-rs/pull/3817) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Restrict DictionaryArray to ArrowDictionaryKeyType [\#3813](https://github.com/apache/arrow-rs/pull/3813) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- refactor: assorted `FlightSqlServiceClient` improvements [\#3788](https://github.com/apache/arrow-rs/pull/3788) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- minor: make Parquet CLI input args consistent [\#3786](https://github.com/apache/arrow-rs/pull/3786) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XinyuZeng](https://github.com/XinyuZeng)) +- Return Buffers from ArrayData::buffers instead of slice \(\#1799\) [\#3783](https://github.com/apache/arrow-rs/pull/3783) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use NullBuffer in ArrayData \(\#3775\) [\#3778](https://github.com/apache/arrow-rs/pull/3778) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Support casting string to timestamp with microsecond resolution [\#3751](https://github.com/apache/arrow-rs/issues/3751) -- Add datatime/interval/duration into comparison kernels [\#3729](https://github.com/apache/arrow-rs/issues/3729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- ! \(not\) operator overload for SortOptions [\#3726](https://github.com/apache/arrow-rs/issues/3726) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- parquet: convert Bytes to ByteArray directly [\#3719](https://github.com/apache/arrow-rs/issues/3719) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Implement simple RecordBatchReader [\#3704](https://github.com/apache/arrow-rs/issues/3704) -- Is possible to implement GenericListArray::from\_iter ? 
[\#3702](https://github.com/apache/arrow-rs/issues/3702) -- `take_run` improvements [\#3701](https://github.com/apache/arrow-rs/issues/3701) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `as_mut_any` in Array trait [\#3655](https://github.com/apache/arrow-rs/issues/3655) -- `Array` --\> `Display` formatter that supports more options and is configurable [\#3638](https://github.com/apache/arrow-rs/issues/3638) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- arrow-csv: support decimal256 [\#3474](https://github.com/apache/arrow-rs/issues/3474) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support timestamp/time and date types in json decoder [\#3834](https://github.com/apache/arrow-rs/issues/3834) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support decoding decimals in new raw json decoder [\#3819](https://github.com/apache/arrow-rs/issues/3819) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Timezone Aware Timestamp Parsing [\#3794](https://github.com/apache/arrow-rs/issues/3794) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Preallocate buffers for FixedSizeBinary array creation [\#3792](https://github.com/apache/arrow-rs/issues/3792) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make Parquet CLI args consistent [\#3785](https://github.com/apache/arrow-rs/issues/3785) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Creates PrimitiveDictionaryBuilder from provided keys and values builders [\#3776](https://github.com/apache/arrow-rs/issues/3776) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use NullBuffer in ArrayData [\#3775](https://github.com/apache/arrow-rs/issues/3775) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support unary\_dict\_mut in arth [\#3710](https://github.com/apache/arrow-rs/issues/3710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support cast \<\> String to interval [\#3643](https://github.com/apache/arrow-rs/issues/3643) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Zero-Copy Conversion from Vec to/from MutableBuffer [\#3516](https://github.com/apache/arrow-rs/issues/3516) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- CSV reader infers Date64 type for fields like "2020-03-19 00:00:00" that it can't parse to Date64 [\#3744](https://github.com/apache/arrow-rs/issues/3744) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Timestamp Unit Casts are Unchecked [\#3833](https://github.com/apache/arrow-rs/issues/3833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- regexp\_match skips first match when returning match [\#3803](https://github.com/apache/arrow-rs/issues/3803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cast to timestamp with time zone returns timestamp [\#3800](https://github.com/apache/arrow-rs/issues/3800) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Schema-level metadata is not encoded in Flight responses [\#3779](https://github.com/apache/arrow-rs/issues/3779) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Closed issues:** + +- FlightSQL CLI client: simple test [\#3814](https://github.com/apache/arrow-rs/issues/3814) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Merged pull requests:** -- Update to 34.0.0 and update changelog [\#3757](https://github.com/apache/arrow-rs/pull/3757) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) -- Update MIRI for split crates \(\#2594\) [\#3754](https://github.com/apache/arrow-rs/pull/3754) ([tustvold](https://github.com/tustvold)) -- Update prost-build requirement from =0.11.6 to =0.11.7 [\#3753](https://github.com/apache/arrow-rs/pull/3753) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Enable casting of string to timestamp with microsecond resolution [\#3752](https://github.com/apache/arrow-rs/pull/3752) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) -- Use Typed Buffers in Arrays \(\#1811\) \(\#1176\) [\#3743](https://github.com/apache/arrow-rs/pull/3743) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Cleanup arithmetic kernel type constraints [\#3739](https://github.com/apache/arrow-rs/pull/3739) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Make dictionary kernels optional for comparison benchmark [\#3738](https://github.com/apache/arrow-rs/pull/3738) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support String Coercion in Raw JSON Reader [\#3736](https://github.com/apache/arrow-rs/pull/3736) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rguerreiromsft](https://github.com/rguerreiromsft)) -- replace for loop by try\_for\_each [\#3734](https://github.com/apache/arrow-rs/pull/3734) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([suxiaogang223](https://github.com/suxiaogang223)) -- feat: implement generic record batch reader [\#3733](https://github.com/apache/arrow-rs/pull/3733) ([wjones127](https://github.com/wjones127)) -- \[minor\] fix doc test fail [\#3732](https://github.com/apache/arrow-rs/pull/3732) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- Add datetime/interval/duration into dyn scalar comparison [\#3730](https://github.com/apache/arrow-rs/pull/3730) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Using Borrow\ on infer\_json\_schema\_from\_iterator [\#3728](https://github.com/apache/arrow-rs/pull/3728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rguerreiromsft](https://github.com/rguerreiromsft)) -- Not operator overload for SortOptions [\#3727](https://github.com/apache/arrow-rs/pull/3727) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([berkaysynnada](https://github.com/berkaysynnada)) -- fix: encoding batch with no columns [\#3724](https://github.com/apache/arrow-rs/pull/3724) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([wangrunji0408](https://github.com/wangrunji0408)) -- feat: impl `Ord`/`PartialOrd` for `SortOptions` 
[\#3723](https://github.com/apache/arrow-rs/pull/3723) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) -- Add From\ for ByteArray [\#3720](https://github.com/apache/arrow-rs/pull/3720) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Deprecate old JSON reader \(\#3610\) [\#3718](https://github.com/apache/arrow-rs/pull/3718) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add pretty format with options [\#3717](https://github.com/apache/arrow-rs/pull/3717) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Remove unreachable decimal take [\#3716](https://github.com/apache/arrow-rs/pull/3716) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Feat: arrow csv decimal256 [\#3711](https://github.com/apache/arrow-rs/pull/3711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([suxiaogang223](https://github.com/suxiaogang223)) -- perf: `take_run` improvements [\#3705](https://github.com/apache/arrow-rs/pull/3705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- Add raw MapArrayReader [\#3703](https://github.com/apache/arrow-rs/pull/3703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat: Sort kernel for `RunArray` [\#3695](https://github.com/apache/arrow-rs/pull/3695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- perf: Remove sorting to yield sorted\_rank [\#3693](https://github.com/apache/arrow-rs/pull/3693) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) -- fix: Handle sliced array in run array iterator [\#3681](https://github.com/apache/arrow-rs/pull/3681) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- refactor: timestamp overflow check [\#3840](https://github.com/apache/arrow-rs/pull/3840) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Prep for 35.0.0 [\#3836](https://github.com/apache/arrow-rs/pull/3836) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) +- Support timestamp/time and date json decoding [\#3835](https://github.com/apache/arrow-rs/pull/3835) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) +- Make dictionary preservation optional in row encoding [\#3831](https://github.com/apache/arrow-rs/pull/3831) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move prettyprint to arrow-cast [\#3828](https://github.com/apache/arrow-rs/pull/3828) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Support decoding decimals in raw decoder [\#3820](https://github.com/apache/arrow-rs/pull/3820) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) +- Add ArrayDataLayout, port validation 
\(\#1799\) [\#3818](https://github.com/apache/arrow-rs/pull/3818) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- test: add test for FlightSQL CLI client [\#3816](https://github.com/apache/arrow-rs/pull/3816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Add regexp\_match docs [\#3812](https://github.com/apache/arrow-rs/pull/3812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- fix: Ensure Flight schema includes parent metadata [\#3811](https://github.com/apache/arrow-rs/pull/3811) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([stuartcarnie](https://github.com/stuartcarnie)) +- fix: regexp\_match skips first match [\#3807](https://github.com/apache/arrow-rs/pull/3807) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- fix: change uft8 to timestamp with timezone [\#3806](https://github.com/apache/arrow-rs/pull/3806) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Support reading decimal arrays from json [\#3805](https://github.com/apache/arrow-rs/pull/3805) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) +- Add unary\_dict\_mut [\#3804](https://github.com/apache/arrow-rs/pull/3804) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Faster timestamp parsing \(~70-90% faster\) [\#3801](https://github.com/apache/arrow-rs/pull/3801) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add concat\_elements\_bytes [\#3798](https://github.com/apache/arrow-rs/pull/3798) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Timezone aware timestamp parsing \(\#3794\) [\#3795](https://github.com/apache/arrow-rs/pull/3795) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Preallocate buffers for FixedSizeBinary array creation [\#3793](https://github.com/apache/arrow-rs/pull/3793) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([maxburke](https://github.com/maxburke)) +- feat: simple flight sql CLI client [\#3789](https://github.com/apache/arrow-rs/pull/3789) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Creates PrimitiveDictionaryBuilder from provided keys and values builders [\#3777](https://github.com/apache/arrow-rs/pull/3777) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- ArrayData Enumeration for Remaining Layouts [\#3769](https://github.com/apache/arrow-rs/pull/3769) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update prost-build requirement from =0.11.7 to =0.11.8 [\#3767](https://github.com/apache/arrow-rs/pull/3767) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Implement concat\_elements\_dyn kernel 
[\#3763](https://github.com/apache/arrow-rs/pull/3763) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Support for casting `Utf8` and `LargeUtf8` --\> `Interval` [\#3762](https://github.com/apache/arrow-rs/pull/3762) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([doki23](https://github.com/doki23)) +- into\_inner\(\) for CSV Writer [\#3759](https://github.com/apache/arrow-rs/pull/3759) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Zero-copy Vec conversion \(\#3516\) \(\#1176\) [\#3756](https://github.com/apache/arrow-rs/pull/3756) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- ArrayData Enumeration for Primitive, Binary and UTF8 [\#3749](https://github.com/apache/arrow-rs/pull/3749) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add `into_primitive_dict_builder` to `DictionaryArray` [\#3715](https://github.com/apache/arrow-rs/pull/3715) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) From 9ce0ebb06550be943febc226f61bf083016d7652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Metehan=20Y=C4=B1ld=C4=B1r=C4=B1m?= <100111937+metesynnada@users.noreply.github.com> Date: Sat, 11 Mar 2023 14:30:57 +0300 Subject: [PATCH 0678/1411] [ObjectStore] Add `append` API impl for `LocalFileSystem` (#3824) * Append Push API * wasm is not enabled. --- arrow-csv/src/writer.rs | 2 +- object_store/Cargo.toml | 7 +- object_store/src/local.rs | 139 +++++++++++++++++++++++++++++++++++++- 3 files changed, 145 insertions(+), 3 deletions(-) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index b64e306b3a14..28a939d88f34 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -194,7 +194,7 @@ impl Writer { } /// A CSV writer builder -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct WriterBuilder { /// Optional column delimiter. 
Defaults to `b','` delimiter: Option, diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index c0c090cd0f00..c6bb7e855785 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -37,7 +37,6 @@ itertools = "0.10.1" parking_lot = { version = "0.12" } percent-encoding = "2.1" snafu = "0.7" -tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } tracing = { version = "0.1" } url = "2.2" walkdir = "2" @@ -57,6 +56,12 @@ aws-types = { version = "0.54", optional = true } aws-credential-types = { version = "0.54", optional = true } aws-config = { version = "0.54", optional = true } +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util", "fs"] } + +[target.'cfg(target_arch = "wasm32")'.dependencies] +tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } + [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] diff --git a/object_store/src/local.rs b/object_store/src/local.rs index f1733f54bab1..ac0b02070d5e 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -269,7 +269,6 @@ impl Config { impl ObjectStore for LocalFileSystem { async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { let path = self.config.path_to_filesystem(location)?; - maybe_spawn_blocking(move || { let (mut file, suffix) = new_staged_upload(&path)?; let staging_path = staged_upload_path(&path, &suffix); @@ -313,6 +312,53 @@ impl ObjectStore for LocalFileSystem { .await } + async fn append( + &self, + location: &Path, + ) -> Result> { + #[cfg(not(target_arch = "wasm32"))] + // Get the path to the file from the configuration. + let path = self.config.path_to_filesystem(location)?; + loop { + // Create new `OpenOptions`. + let mut options = tokio::fs::OpenOptions::new(); + + // Attempt to open the file with the given options. + match options + .truncate(false) + .append(true) + .create(true) + .open(&path) + .await + { + // If the file was successfully opened, return it wrapped in a boxed `AsyncWrite` trait object. + Ok(file) => return Ok(Box::new(file)), + // If the error is that the file was not found, attempt to create the file and any necessary parent directories. + Err(err) if err.kind() == ErrorKind::NotFound => { + // Get the path to the parent directory of the file. + let parent = path + .parent() + // If the parent directory does not exist, return a `UnableToCreateFileSnafu` error. + .context(UnableToCreateFileSnafu { path: &path, err })?; + + // Create the parent directory and any necessary ancestors. + tokio::fs::create_dir_all(parent) + .await + // If creating the directory fails, return a `UnableToCreateDirSnafu` error. + .context(UnableToCreateDirSnafu { path: parent })?; + // Try again to open the file. + continue; + } + // If any other error occurs, return a `UnableToOpenFile` error. 
+ Err(source) => { + return Err(Error::UnableToOpenFile { source, path }.into()) + } + } + } + #[cfg(target_arch = "wasm32")] + Err(super::Error::NotImplemented) + } + async fn get(&self, location: &Path) -> Result { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { @@ -1305,3 +1351,94 @@ mod tests { integration.list_with_delimiter(Some(&path)).await.unwrap(); } } + +#[cfg(not(target_arch = "wasm32"))] +#[cfg(test)] +mod not_wasm_tests { + use crate::local::LocalFileSystem; + use crate::{ObjectStore, Path}; + use bytes::Bytes; + use tempfile::TempDir; + use tokio::io::AsyncWriteExt; + + #[tokio::test] + async fn creates_dir_if_not_present_append() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("nested/file/test_file"); + + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + + let mut writer = integration.append(&location).await.unwrap(); + + writer.write_all(data.as_ref()).await.unwrap(); + + let read_data = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } + + #[tokio::test] + async fn unknown_length_append() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("some_file"); + + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + let mut writer = integration.append(&location).await.unwrap(); + + writer.write_all(data.as_ref()).await.unwrap(); + + let read_data = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } + + #[tokio::test] + async fn multiple_append() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("some_file"); + + let data = vec![ + Bytes::from("arbitrary"), + Bytes::from("data"), + Bytes::from("gnz"), + ]; + + let mut writer = integration.append(&location).await.unwrap(); + for d in &data { + writer.write_all(d).await.unwrap(); + } + + let mut writer = integration.append(&location).await.unwrap(); + for d in &data { + writer.write_all(d).await.unwrap(); + } + + let read_data = integration + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + let expected_data = Bytes::from("arbitrarydatagnzarbitrarydatagnz"); + assert_eq!(&*read_data, expected_data); + } +} From 527d770a36ecf9e98febad751ae1f06db252e01f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Mar 2023 09:26:22 -0700 Subject: [PATCH 0679/1411] Update bitflags requirement from 1.2.1 to 2.0.0 (#3852) Updates the requirements on [bitflags](https://github.com/bitflags/bitflags) to permit the latest version. - [Release notes](https://github.com/bitflags/bitflags/releases) - [Changelog](https://github.com/bitflags/bitflags/blob/main/CHANGELOG.md) - [Commits](https://github.com/bitflags/bitflags/compare/1.2.1...2.0.0) --- updated-dependencies: - dependency-name: bitflags dependency-type: direct:production ... 
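A minimal usage sketch of the `LocalFileSystem::append` API added in the patch above, kept separate from the patch itself; the prefix directory, object path, and payload are illustrative assumptions, not values from the change:

use object_store::local::LocalFileSystem;
use object_store::path::Path;
use object_store::ObjectStore;
use tokio::io::AsyncWriteExt;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Illustrative prefix; any existing writable directory works.
    let store = LocalFileSystem::new_with_prefix("/tmp/object-store-demo")?;
    let location = Path::from("logs/events.txt");

    // Opens the file in append mode, creating it and any missing parent
    // directories on first use (the retry loop in the implementation above
    // handles the NotFound case by calling create_dir_all and trying again).
    let mut writer = store.append(&location).await?;
    writer.write_all(b"first record\n").await?;
    writer.shutdown().await?;

    // A second append picks up where the previous writer left off.
    let mut writer = store.append(&location).await?;
    writer.write_all(b"second record\n").await?;
    writer.shutdown().await?;

    Ok(())
}

This mirrors what the `multiple_append` test above exercises through the `ObjectStore` trait.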
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-schema/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 7b240e1ac69c..62cb9f3c257e 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -39,7 +39,7 @@ bench = false [dependencies] serde = { version = "1.0", default-features = false, features = ["derive", "std"], optional = true } -bitflags = { version = "1.2.1", default-features = false, optional = true } +bitflags = { version = "2.0.0", default-features = false, optional = true } [features] # Enable ffi support From c1567152812ea049a8544830ffcb3d92946217d9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Mar 2023 09:26:38 -0700 Subject: [PATCH 0680/1411] Update proc-macro2 requirement from =1.0.51 to =1.0.52 (#3853) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.51...1.0.52) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 827be6d058df..fa333889bfc7 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -67,7 +67,7 @@ tower = "0.4.13" [build-dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.51", default-features = false } +proc-macro2 = { version = "=1.0.52", default-features = false } prost-build = { version = "=0.11.8", default-features = false } tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } From dfb8c769606efd4fd8731706b287993479b339ca Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 14 Mar 2023 08:43:57 +0000 Subject: [PATCH 0681/1411] Add offset pushdown to parquet (#3848) --- parquet/src/arrow/arrow_reader/mod.rs | 92 ++++++++++++--- parquet/src/arrow/arrow_reader/selection.rs | 124 +++++++++++++++++--- parquet/src/arrow/async_reader/mod.rs | 114 ++++++++++++++---- 3 files changed, 280 insertions(+), 50 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index c4b645da7ce5..6c8d08de251d 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -71,6 +71,8 @@ pub struct ArrowReaderBuilder { pub(crate) selection: Option, pub(crate) limit: Option, + + pub(crate) offset: Option, } impl ArrowReaderBuilder { @@ -101,6 +103,7 @@ impl ArrowReaderBuilder { filter: None, selection: None, limit: None, + offset: None, }) } @@ -181,6 +184,17 @@ impl ArrowReaderBuilder { ..self } } + + /// Provide an offset to skip over the given number of rows + /// + /// The offset will be applied after any [`Self::with_row_selection`] and [`Self::with_row_filter`] + /// allowing it to skip rows after any pushed down predicates + pub fn with_offset(self, offset: usize) -> Self { + Self { + offset: Some(offset), + ..self + } + } } 
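To make the new offset/limit pushdown concrete, a small synchronous sketch using `ParquetRecordBatchReaderBuilder` (a type alias of the `ArrowReaderBuilder` extended above); the file name and row counts are illustrative assumptions only:

use std::fs::File;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;

fn main() {
    // Illustrative path; any Parquet file will do.
    let file = File::open("data.parquet").unwrap();

    // Skip the first 100 selected rows, then read at most 50. The offset is
    // applied after any row filter / row selection pushdown and before the
    // limit, matching the builder documentation above.
    let reader = ParquetRecordBatchReaderBuilder::try_new(file)
        .unwrap()
        .with_offset(100)
        .with_limit(50)
        .build()
        .unwrap();

    let mut rows = 0;
    for batch in reader {
        rows += batch.unwrap().num_rows();
    }
    assert!(rows <= 50);
}

The same two builder methods apply to the async `ParquetRecordBatchStreamBuilder`, as exercised by the stream tests later in this patch.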
/// Arrow reader api. @@ -467,23 +481,10 @@ impl ArrowReaderBuilder> { selection = Some(RowSelection::from(vec![])); } - // If a limit is defined, apply it to the final `RowSelection` - if let Some(limit) = self.limit { - selection = Some( - selection - .map(|selection| selection.limit(limit)) - .unwrap_or_else(|| { - RowSelection::from(vec![RowSelector::select( - limit.min(reader.num_rows()), - )]) - }), - ); - } - Ok(ParquetRecordBatchReader::new( batch_size, array_reader, - selection, + apply_range(selection, reader.num_rows(), self.offset, self.limit), )) } } @@ -620,6 +621,41 @@ pub(crate) fn selects_any(selection: Option<&RowSelection>) -> bool { selection.map(|x| x.selects_any()).unwrap_or(true) } +/// Applies an optional offset and limit to an optional [`RowSelection`] +pub(crate) fn apply_range( + mut selection: Option, + row_count: usize, + offset: Option, + limit: Option, +) -> Option { + // If an offset is defined, apply it to the `selection` + if let Some(offset) = offset { + selection = Some(match row_count.checked_sub(offset) { + None => RowSelection::from(vec![]), + Some(remaining) => selection + .map(|selection| selection.offset(offset)) + .unwrap_or_else(|| { + RowSelection::from(vec![ + RowSelector::skip(offset), + RowSelector::select(remaining), + ]) + }), + }); + } + + // If a limit is defined, apply it to the final `selection` + if let Some(limit) = limit { + selection = Some( + selection + .map(|selection| selection.limit(limit)) + .unwrap_or_else(|| { + RowSelection::from(vec![RowSelector::select(limit.min(row_count))]) + }), + ); + } + selection +} + /// Evaluates an [`ArrowPredicate`] returning the [`RowSelection`] /// /// If this [`ParquetRecordBatchReader`] has a [`RowSelection`], the @@ -1244,6 +1280,8 @@ mod tests { row_filter: Option>, /// limit limit: Option, + /// offset + offset: Option, } /// Manually implement this to avoid printing entire contents of row_selections and row_filter @@ -1263,6 +1301,7 @@ mod tests { .field("row_selections", &self.row_selections.is_some()) .field("row_filter", &self.row_filter.is_some()) .field("limit", &self.limit) + .field("offset", &self.offset) .finish() } } @@ -1283,6 +1322,7 @@ mod tests { row_selections: None, row_filter: None, limit: None, + offset: None, } } } @@ -1361,6 +1401,13 @@ mod tests { } } + fn with_offset(self, offset: usize) -> Self { + Self { + offset: Some(offset), + ..self + } + } + fn writer_props(&self) -> WriterProperties { let builder = WriterProperties::builder() .set_data_pagesize_limit(self.max_data_page_size) @@ -1427,6 +1474,12 @@ mod tests { TestOptions::new(4, 100, 25).with_limit(10), // Test with limit larger than number of rows TestOptions::new(4, 100, 25).with_limit(101), + // Test with limit + offset equal to number of rows + TestOptions::new(4, 100, 25).with_offset(30).with_limit(20), + // Test with limit + offset equal to number of rows + TestOptions::new(4, 100, 25).with_offset(20).with_limit(80), + // Test with limit + offset larger than number of rows + TestOptions::new(4, 100, 25).with_offset(20).with_limit(81), // Test with no page-level statistics TestOptions::new(2, 256, 91) .with_null_percent(25) @@ -1474,6 +1527,12 @@ mod tests { .with_null_percent(25) .with_row_selections() .with_limit(10), + // Test optional with nulls + TestOptions::new(2, 256, 93) + .with_null_percent(25) + .with_row_selections() + .with_offset(20) + .with_limit(10), // Test filter // Test with row filter @@ -1673,6 +1732,11 @@ mod tests { None => expected_data, }; + if let Some(offset) = opts.offset { + 
builder = builder.with_offset(offset); + expected_data = expected_data.into_iter().skip(offset).collect(); + } + if let Some(limit) = opts.limit { builder = builder.with_limit(limit); expected_data = expected_data.into_iter().take(limit).collect(); diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index d2af4516dd08..d3abf968b3b2 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -19,7 +19,6 @@ use arrow_array::{Array, BooleanArray}; use arrow_select::filter::SlicesIterator; use std::cmp::Ordering; use std::collections::VecDeque; -use std::mem; use std::ops::Range; /// [`RowSelection`] is a collection of [`RowSelector`] used to skip rows when @@ -236,13 +235,13 @@ impl RowSelection { let mut total_count = 0; // Find the index where the selector exceeds the row count - let find = self.selectors.iter().enumerate().find(|(_, selector)| { + let find = self.selectors.iter().position(|selector| { total_count += selector.row_count; total_count > row_count }); let split_idx = match find { - Some((idx, _)) => idx, + Some(idx) => idx, None => { let selectors = std::mem::take(&mut self.selectors); return Self { selectors }; @@ -372,29 +371,63 @@ impl RowSelection { self } + /// Applies an offset to this [`RowSelection`], skipping the first `offset` selected rows + pub(crate) fn offset(mut self, offset: usize) -> Self { + if offset == 0 { + return self; + } + + let mut selected_count = 0; + let mut skipped_count = 0; + + // Find the index where the selector exceeds the row count + let find = self + .selectors + .iter() + .position(|selector| match selector.skip { + true => { + skipped_count += selector.row_count; + false + } + false => { + selected_count += selector.row_count; + selected_count > offset + } + }); + + let split_idx = match find { + Some(idx) => idx, + None => { + self.selectors.clear(); + return self; + } + }; + + let mut selectors = Vec::with_capacity(self.selectors.len() - split_idx + 1); + selectors.push(RowSelector::skip(skipped_count + offset)); + selectors.push(RowSelector::select(selected_count - offset)); + selectors.extend_from_slice(&self.selectors[split_idx + 1..]); + + Self { selectors } + } + /// Limit this [`RowSelection`] to only select `limit` rows pub(crate) fn limit(mut self, mut limit: usize) -> Self { - let mut new_selectors = Vec::with_capacity(self.selectors.len()); - for mut selection in mem::take(&mut self.selectors) { - if limit == 0 { - break; - } + if limit == 0 { + self.selectors.clear(); + } + for (idx, selection) in self.selectors.iter_mut().enumerate() { if !selection.skip { if selection.row_count >= limit { selection.row_count = limit; - new_selectors.push(selection); + self.selectors.truncate(idx + 1); break; } else { limit -= selection.row_count; - new_selectors.push(selection); } - } else { - new_selectors.push(selection); } } - - self.selectors = new_selectors; self } @@ -403,6 +436,11 @@ impl RowSelection { pub fn iter(&self) -> impl Iterator { self.selectors.iter() } + + /// Returns the number of selected rows + pub fn row_count(&self) -> usize { + self.iter().filter(|s| !s.skip).map(|s| s.row_count).sum() + } } impl From> for RowSelection { @@ -593,6 +631,64 @@ mod tests { assert!(selection.selectors.is_empty()); } + #[test] + fn test_offset() { + let selection = RowSelection::from(vec![ + RowSelector::select(5), + RowSelector::skip(23), + RowSelector::select(7), + RowSelector::skip(33), + RowSelector::select(6), + ]); + + let selection = 
selection.offset(2); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(2), + RowSelector::select(3), + RowSelector::skip(23), + RowSelector::select(7), + RowSelector::skip(33), + RowSelector::select(6), + ] + ); + + let selection = selection.offset(5); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(30), + RowSelector::select(5), + RowSelector::skip(33), + RowSelector::select(6), + ] + ); + + let selection = selection.offset(3); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(33), + RowSelector::select(2), + RowSelector::skip(33), + RowSelector::select(6), + ] + ); + + let selection = selection.offset(2); + assert_eq!( + selection.selectors, + vec![RowSelector::skip(68), RowSelector::select(6),] + ); + + let selection = selection.offset(3); + assert_eq!( + selection.selectors, + vec![RowSelector::skip(71), RowSelector::select(3),] + ); + } + #[test] fn test_and() { let mut a = RowSelection::from(vec![ diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 213f61818c15..99fe650695a0 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -98,8 +98,8 @@ use arrow_schema::SchemaRef; use crate::arrow::array_reader::{build_array_reader, RowGroupCollection}; use crate::arrow::arrow_reader::{ - evaluate_predicate, selects_any, ArrowReaderBuilder, ArrowReaderOptions, - ParquetRecordBatchReader, RowFilter, RowSelection, RowSelector, + apply_range, evaluate_predicate, selects_any, ArrowReaderBuilder, ArrowReaderOptions, + ParquetRecordBatchReader, RowFilter, RowSelection, }; use crate::arrow::schema::ParquetField; use crate::arrow::ProjectionMask; @@ -347,12 +347,13 @@ impl ArrowReaderBuilder> { filter: self.filter, metadata: self.metadata.clone(), fields: self.fields, + limit: self.limit, + offset: self.offset, }; Ok(ParquetRecordBatchStream { metadata: self.metadata, batch_size, - limit: self.limit, row_groups, projection: self.projection, selection: self.selection, @@ -375,6 +376,10 @@ struct ReaderFactory { input: T, filter: Option, + + limit: Option, + + offset: Option, } impl ReaderFactory @@ -390,7 +395,6 @@ where mut selection: Option, projection: ProjectionMask, batch_size: usize, - limit: Option, ) -> ReadResult { // TODO: calling build_array multiple times is wasteful @@ -428,19 +432,37 @@ where } } - if !selects_any(selection.as_ref()) { + // Compute the number of rows in the selection before applying limit and offset + let rows_before = selection + .as_ref() + .map(|s| s.row_count()) + .unwrap_or(row_group.row_count); + + if rows_before == 0 { + return Ok((self, None)); + } + + selection = apply_range(selection, row_group.row_count, self.offset, self.limit); + + // Compute the number of rows in the selection after applying limit and offset + let rows_after = selection + .as_ref() + .map(|s| s.row_count()) + .unwrap_or(row_group.row_count); + + // Update offset if necessary + if let Some(offset) = &mut self.offset { + // Reduction is either because of offset or limit, as limit is applied + // after offset has been "exhausted" can just use saturating sub here + *offset = offset.saturating_sub(rows_before - rows_after) + } + + if rows_after == 0 { return Ok((self, None)); } - // If a limit is defined, apply it to the final `RowSelection` - if let Some(limit) = limit { - selection = Some( - selection - .map(|selection| selection.limit(limit)) - .unwrap_or_else(|| { - RowSelection::from(vec![RowSelector::select(limit)]) - }), - ); + if let Some(limit) = &mut 
self.limit { + *limit -= rows_after; } row_group @@ -492,8 +514,6 @@ pub struct ParquetRecordBatchStream { batch_size: usize, - limit: Option, - selection: Option, /// This is an option so it can be moved into a future @@ -535,9 +555,6 @@ where match &mut self.state { StreamState::Decoding(batch_reader) => match batch_reader.next() { Some(Ok(batch)) => { - if let Some(limit) = self.limit.as_mut() { - *limit -= batch.num_rows(); - } return Poll::Ready(Some(Ok(batch))); } Some(Err(e)) => { @@ -568,7 +585,6 @@ where selection, self.projection.clone(), self.batch_size, - self.limit, ) .boxed(); @@ -824,11 +840,14 @@ mod tests { use crate::file::page_index::index_reader; use crate::file::properties::WriterProperties; use arrow::error::Result as ArrowResult; + use arrow_array::cast::as_primitive_array; + use arrow_array::types::Int32Type; use arrow_array::{Array, ArrayRef, Int32Array, StringArray}; use futures::TryStreamExt; use rand::{thread_rng, Rng}; use std::sync::Mutex; + #[derive(Clone)] struct TestReader { data: Bytes, metadata: Arc, @@ -1320,7 +1339,7 @@ mod tests { requests: Default::default(), }; - let stream = ParquetRecordBatchStreamBuilder::new(test) + let stream = ParquetRecordBatchStreamBuilder::new(test.clone()) .await .unwrap() .with_batch_size(1024) @@ -1336,11 +1355,60 @@ mod tests { // First batch should contain all rows assert_eq!(batch.num_rows(), 3); assert_eq!(batch.num_columns(), 3); + let col2 = as_primitive_array::(batch.column(2)); + assert_eq!(col2.values(), &[0, 1, 2]); let batch = &batches[1]; // Second batch should trigger the limit and only have one row assert_eq!(batch.num_rows(), 1); assert_eq!(batch.num_columns(), 3); + let col2 = as_primitive_array::(batch.column(2)); + assert_eq!(col2.values(), &[3]); + + let stream = ParquetRecordBatchStreamBuilder::new(test.clone()) + .await + .unwrap() + .with_offset(2) + .with_limit(3) + .build() + .unwrap(); + + let batches: Vec<_> = stream.try_collect().await.unwrap(); + // Expect one batch for each row group + assert_eq!(batches.len(), 2); + + let batch = &batches[0]; + // First batch should contain one row + assert_eq!(batch.num_rows(), 1); + assert_eq!(batch.num_columns(), 3); + let col2 = as_primitive_array::(batch.column(2)); + assert_eq!(col2.values(), &[2]); + + let batch = &batches[1]; + // Second batch should contain two rows + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 3); + let col2 = as_primitive_array::(batch.column(2)); + assert_eq!(col2.values(), &[3, 4]); + + let stream = ParquetRecordBatchStreamBuilder::new(test.clone()) + .await + .unwrap() + .with_offset(4) + .with_limit(20) + .build() + .unwrap(); + + let batches: Vec<_> = stream.try_collect().await.unwrap(); + // Should skip first row group + assert_eq!(batches.len(), 1); + + let batch = &batches[0]; + // First batch should contain two rows + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 3); + let col2 = as_primitive_array::(batch.column(2)); + assert_eq!(col2.values(), &[4, 5]); } #[tokio::test] @@ -1440,6 +1508,8 @@ mod tests { fields, input: async_reader, filter: None, + limit: None, + offset: None, }; let mut skip = true; @@ -1469,7 +1539,7 @@ mod tests { let selection = RowSelection::from(selectors); let (_factory, _reader) = reader_factory - .read_row_group(0, Some(selection), projection.clone(), 48, None) + .read_row_group(0, Some(selection), projection.clone(), 48) .await .expect("reading row group"); From 7b94b08c7d98e1f449955e2f31a94b871dc3e78e Mon Sep 17 00:00:00 2001 From: bold Date: Tue, 14 
Mar 2023 13:10:47 +0100 Subject: [PATCH 0682/1411] Support compression levels (#3847) * Support zstd compression levels * Support gzip compression levels * Fix tests * Support brotli compression level * Fix tests * Add tests for all supported compression levels --- parquet/src/basic.rs | 52 ++++++--- parquet/src/bin/parquet-fromcsv.rs | 21 +++- parquet/src/bin/parquet-layout.rs | 6 +- parquet/src/bin/parquet-rewrite.rs | 6 +- parquet/src/compression.rs | 182 ++++++++++++++++++++++++----- parquet/src/file/properties.rs | 11 +- 6 files changed, 217 insertions(+), 61 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index e971c8632643..9f4f4ee1d1d6 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -20,6 +20,7 @@ use std::{fmt, str}; +use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel}; use crate::format as parquet; use crate::errors::{ParquetError, Result}; @@ -286,11 +287,11 @@ pub enum Encoding { pub enum Compression { UNCOMPRESSED, SNAPPY, - GZIP, + GZIP(GzipLevel), LZO, - BROTLI, + BROTLI(BrotliLevel), LZ4, - ZSTD, + ZSTD(ZstdLevel), LZ4_RAW, } @@ -830,11 +831,11 @@ impl TryFrom for Compression { Ok(match value { parquet::CompressionCodec::UNCOMPRESSED => Compression::UNCOMPRESSED, parquet::CompressionCodec::SNAPPY => Compression::SNAPPY, - parquet::CompressionCodec::GZIP => Compression::GZIP, + parquet::CompressionCodec::GZIP => Compression::GZIP(Default::default()), parquet::CompressionCodec::LZO => Compression::LZO, - parquet::CompressionCodec::BROTLI => Compression::BROTLI, + parquet::CompressionCodec::BROTLI => Compression::BROTLI(Default::default()), parquet::CompressionCodec::LZ4 => Compression::LZ4, - parquet::CompressionCodec::ZSTD => Compression::ZSTD, + parquet::CompressionCodec::ZSTD => Compression::ZSTD(Default::default()), parquet::CompressionCodec::LZ4_RAW => Compression::LZ4_RAW, _ => { return Err(general_err!( @@ -851,11 +852,11 @@ impl From for parquet::CompressionCodec { match value { Compression::UNCOMPRESSED => parquet::CompressionCodec::UNCOMPRESSED, Compression::SNAPPY => parquet::CompressionCodec::SNAPPY, - Compression::GZIP => parquet::CompressionCodec::GZIP, + Compression::GZIP(_) => parquet::CompressionCodec::GZIP, Compression::LZO => parquet::CompressionCodec::LZO, - Compression::BROTLI => parquet::CompressionCodec::BROTLI, + Compression::BROTLI(_) => parquet::CompressionCodec::BROTLI, Compression::LZ4 => parquet::CompressionCodec::LZ4, - Compression::ZSTD => parquet::CompressionCodec::ZSTD, + Compression::ZSTD(_) => parquet::CompressionCodec::ZSTD, Compression::LZ4_RAW => parquet::CompressionCodec::LZ4_RAW, } } @@ -1783,11 +1784,20 @@ mod tests { fn test_display_compression() { assert_eq!(Compression::UNCOMPRESSED.to_string(), "UNCOMPRESSED"); assert_eq!(Compression::SNAPPY.to_string(), "SNAPPY"); - assert_eq!(Compression::GZIP.to_string(), "GZIP"); + assert_eq!( + Compression::GZIP(Default::default()).to_string(), + "GZIP(GzipLevel(6))" + ); assert_eq!(Compression::LZO.to_string(), "LZO"); - assert_eq!(Compression::BROTLI.to_string(), "BROTLI"); + assert_eq!( + Compression::BROTLI(Default::default()).to_string(), + "BROTLI(BrotliLevel(1))" + ); assert_eq!(Compression::LZ4.to_string(), "LZ4"); - assert_eq!(Compression::ZSTD.to_string(), "ZSTD"); + assert_eq!( + Compression::ZSTD(Default::default()).to_string(), + "ZSTD(ZstdLevel(1))" + ); } #[test] @@ -1802,7 +1812,7 @@ mod tests { ); assert_eq!( Compression::try_from(parquet::CompressionCodec::GZIP).unwrap(), - Compression::GZIP + Compression::GZIP(Default::default()) 
); assert_eq!( Compression::try_from(parquet::CompressionCodec::LZO).unwrap(), @@ -1810,7 +1820,7 @@ mod tests { ); assert_eq!( Compression::try_from(parquet::CompressionCodec::BROTLI).unwrap(), - Compression::BROTLI + Compression::BROTLI(Default::default()) ); assert_eq!( Compression::try_from(parquet::CompressionCodec::LZ4).unwrap(), @@ -1818,7 +1828,7 @@ mod tests { ); assert_eq!( Compression::try_from(parquet::CompressionCodec::ZSTD).unwrap(), - Compression::ZSTD + Compression::ZSTD(Default::default()) ); } @@ -1832,14 +1842,20 @@ mod tests { parquet::CompressionCodec::SNAPPY, Compression::SNAPPY.into() ); - assert_eq!(parquet::CompressionCodec::GZIP, Compression::GZIP.into()); + assert_eq!( + parquet::CompressionCodec::GZIP, + Compression::GZIP(Default::default()).into() + ); assert_eq!(parquet::CompressionCodec::LZO, Compression::LZO.into()); assert_eq!( parquet::CompressionCodec::BROTLI, - Compression::BROTLI.into() + Compression::BROTLI(Default::default()).into() ); assert_eq!(parquet::CompressionCodec::LZ4, Compression::LZ4.into()); - assert_eq!(parquet::CompressionCodec::ZSTD, Compression::ZSTD.into()); + assert_eq!( + parquet::CompressionCodec::ZSTD, + Compression::ZSTD(Default::default()).into() + ); } #[test] diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index b1de492f5792..0a9950e9cfcd 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -213,11 +213,11 @@ fn compression_from_str(cmp: &str) -> Result { match cmp.to_uppercase().as_str() { "UNCOMPRESSED" => Ok(Compression::UNCOMPRESSED), "SNAPPY" => Ok(Compression::SNAPPY), - "GZIP" => Ok(Compression::GZIP), + "GZIP" => Ok(Compression::GZIP(Default::default())), "LZO" => Ok(Compression::LZO), - "BROTLI" => Ok(Compression::BROTLI), + "BROTLI" => Ok(Compression::BROTLI(Default::default())), "LZ4" => Ok(Compression::LZ4), - "ZSTD" => Ok(Compression::ZSTD), + "ZSTD" => Ok(Compression::ZSTD(Default::default())), v => Err( format!("Unknown compression {v} : possible values UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD \n\nFor more information try --help") ) @@ -507,15 +507,24 @@ mod tests { let args = parse_args(vec!["--parquet-compression", "snappy"]).unwrap(); assert_eq!(args.parquet_compression, Compression::SNAPPY); let args = parse_args(vec!["--parquet-compression", "gzip"]).unwrap(); - assert_eq!(args.parquet_compression, Compression::GZIP); + assert_eq!( + args.parquet_compression, + Compression::GZIP(Default::default()) + ); let args = parse_args(vec!["--parquet-compression", "lzo"]).unwrap(); assert_eq!(args.parquet_compression, Compression::LZO); let args = parse_args(vec!["--parquet-compression", "lz4"]).unwrap(); assert_eq!(args.parquet_compression, Compression::LZ4); let args = parse_args(vec!["--parquet-compression", "brotli"]).unwrap(); - assert_eq!(args.parquet_compression, Compression::BROTLI); + assert_eq!( + args.parquet_compression, + Compression::BROTLI(Default::default()) + ); let args = parse_args(vec!["--parquet-compression", "zstd"]).unwrap(); - assert_eq!(args.parquet_compression, Compression::ZSTD); + assert_eq!( + args.parquet_compression, + Compression::ZSTD(Default::default()) + ); } #[test] diff --git a/parquet/src/bin/parquet-layout.rs b/parquet/src/bin/parquet-layout.rs index 7a685d2069e8..7278c718c968 100644 --- a/parquet/src/bin/parquet-layout.rs +++ b/parquet/src/bin/parquet-layout.rs @@ -184,11 +184,11 @@ fn compression(compression: Compression) -> Option<&'static str> { match compression { Compression::UNCOMPRESSED 
=> None, Compression::SNAPPY => Some("snappy"), - Compression::GZIP => Some("gzip"), + Compression::GZIP(_) => Some("gzip"), Compression::LZO => Some("lzo"), - Compression::BROTLI => Some("brotli"), + Compression::BROTLI(_) => Some("brotli"), Compression::LZ4 => Some("lz4"), - Compression::ZSTD => Some("zstd"), + Compression::ZSTD(_) => Some("zstd"), Compression::LZ4_RAW => Some("lz4_raw"), } } diff --git a/parquet/src/bin/parquet-rewrite.rs b/parquet/src/bin/parquet-rewrite.rs index cd60225cad84..57e8885c3ed1 100644 --- a/parquet/src/bin/parquet-rewrite.rs +++ b/parquet/src/bin/parquet-rewrite.rs @@ -79,11 +79,11 @@ impl From for Compression { match value { CompressionArgs::None => Self::UNCOMPRESSED, CompressionArgs::Snappy => Self::SNAPPY, - CompressionArgs::Gzip => Self::GZIP, + CompressionArgs::Gzip => Self::GZIP(Default::default()), CompressionArgs::Lzo => Self::LZO, - CompressionArgs::Brotli => Self::BROTLI, + CompressionArgs::Brotli => Self::BROTLI(Default::default()), CompressionArgs::Lz4 => Self::LZ4, - CompressionArgs::Zstd => Self::ZSTD, + CompressionArgs::Zstd => Self::ZSTD(Default::default()), CompressionArgs::Lz4Raw => Self::LZ4_RAW, } } diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index 4ee321609e04..4c4057e7a77c 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -121,6 +121,26 @@ impl CodecOptionsBuilder { } } +/// Defines valid compression levels. +pub(crate) trait CompressionLevel { + const MINIMUM_LEVEL: T; + const MAXIMUM_LEVEL: T; + + /// Tests if the provided compression level is valid. + fn is_valid_level(level: T) -> Result<()> { + let compression_range = Self::MINIMUM_LEVEL..=Self::MAXIMUM_LEVEL; + if compression_range.contains(&level) { + Ok(()) + } else { + Err(ParquetError::General(format!( + "valid compression range {}..={} exceeded.", + compression_range.start(), + compression_range.end() + ))) + } + } +} + /// Given the compression type `codec`, returns a codec used to compress and decompress /// bytes for the compression type. /// This returns `None` if the codec type is `UNCOMPRESSED`. @@ -130,9 +150,9 @@ pub fn create_codec( ) -> Result>> { match codec { #[cfg(any(feature = "brotli", test))] - CodecType::BROTLI => Ok(Some(Box::new(BrotliCodec::new()))), + CodecType::BROTLI(level) => Ok(Some(Box::new(BrotliCodec::new(level)))), #[cfg(any(feature = "flate2", test))] - CodecType::GZIP => Ok(Some(Box::new(GZipCodec::new()))), + CodecType::GZIP(level) => Ok(Some(Box::new(GZipCodec::new(level)))), #[cfg(any(feature = "snap", test))] CodecType::SNAPPY => Ok(Some(Box::new(SnappyCodec::new()))), #[cfg(any(feature = "lz4", test))] @@ -140,7 +160,7 @@ pub fn create_codec( _options.backward_compatible_lz4, )))), #[cfg(any(feature = "zstd", test))] - CodecType::ZSTD => Ok(Some(Box::new(ZSTDCodec::new()))), + CodecType::ZSTD(level) => Ok(Some(Box::new(ZSTDCodec::new(level)))), #[cfg(any(feature = "lz4", test))] CodecType::LZ4_RAW => Ok(Some(Box::new(LZ4RawCodec::new()))), CodecType::UNCOMPRESSED => Ok(None), @@ -214,13 +234,17 @@ mod gzip_codec { use crate::compression::Codec; use crate::errors::Result; + use super::GzipLevel; + /// Codec for GZIP compression algorithm. - pub struct GZipCodec {} + pub struct GZipCodec { + level: GzipLevel, + } impl GZipCodec { /// Creates new GZIP compression codec. 
- pub(crate) fn new() -> Self { - Self {} + pub(crate) fn new(level: GzipLevel) -> Self { + Self { level } } } @@ -236,7 +260,8 @@ mod gzip_codec { } fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { - let mut encoder = write::GzEncoder::new(output_buf, Compression::default()); + let mut encoder = + write::GzEncoder::new(output_buf, Compression::new(self.level.0)); encoder.write_all(input_buf)?; encoder.try_finish().map_err(|e| e.into()) } @@ -245,6 +270,37 @@ mod gzip_codec { #[cfg(any(feature = "flate2", test))] pub use gzip_codec::*; +/// Represents a valid gzip compression level. +#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] +pub struct GzipLevel(u32); + +impl Default for GzipLevel { + fn default() -> Self { + // The default as of miniz_oxide 0.5.1 is 6 for compression level + // (miniz_oxide::deflate::CompressionLevel::DefaultLevel) + Self(6) + } +} + +impl CompressionLevel for GzipLevel { + const MINIMUM_LEVEL: u32 = 0; + const MAXIMUM_LEVEL: u32 = 10; +} + +impl GzipLevel { + /// Attempts to create a gzip compression level. + /// + /// Compression levels must be valid (i.e. be acceptable for [`flate2::Compression`]). + pub fn try_new(level: u32) -> Result { + Self::is_valid_level(level).map(|_| Self(level)) + } + + /// Returns the compression level. + pub fn compression_level(&self) -> u32 { + self.0 + } +} + #[cfg(any(feature = "brotli", test))] mod brotli_codec { @@ -253,17 +309,20 @@ mod brotli_codec { use crate::compression::Codec; use crate::errors::Result; + use super::BrotliLevel; + const BROTLI_DEFAULT_BUFFER_SIZE: usize = 4096; - const BROTLI_DEFAULT_COMPRESSION_QUALITY: u32 = 1; // supported levels 0-9 const BROTLI_DEFAULT_LG_WINDOW_SIZE: u32 = 22; // recommended between 20-22 /// Codec for Brotli compression algorithm. - pub struct BrotliCodec {} + pub struct BrotliCodec { + level: BrotliLevel, + } impl BrotliCodec { /// Creates new Brotli compression codec. - pub(crate) fn new() -> Self { - Self {} + pub(crate) fn new(level: BrotliLevel) -> Self { + Self { level } } } @@ -284,7 +343,7 @@ mod brotli_codec { let mut encoder = brotli::CompressorWriter::new( output_buf, BROTLI_DEFAULT_BUFFER_SIZE, - BROTLI_DEFAULT_COMPRESSION_QUALITY, + self.level.0, BROTLI_DEFAULT_LG_WINDOW_SIZE, ); encoder.write_all(input_buf)?; @@ -295,6 +354,35 @@ mod brotli_codec { #[cfg(any(feature = "brotli", test))] pub use brotli_codec::*; +/// Represents a valid brotli compression level. +#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] +pub struct BrotliLevel(u32); + +impl Default for BrotliLevel { + fn default() -> Self { + Self(1) + } +} + +impl CompressionLevel for BrotliLevel { + const MINIMUM_LEVEL: u32 = 0; + const MAXIMUM_LEVEL: u32 = 11; +} + +impl BrotliLevel { + /// Attempts to create a brotli compression level. + /// + /// Compression levels must be valid. + pub fn try_new(level: u32) -> Result { + Self::is_valid_level(level).map(|_| Self(level)) + } + + /// Returns the compression level. + pub fn compression_level(&self) -> u32 { + self.0 + } +} + #[cfg(any(feature = "lz4", test))] mod lz4_codec { use std::io::{Read, Write}; @@ -357,22 +445,21 @@ pub use lz4_codec::*; mod zstd_codec { use std::io::{self, Write}; - use crate::compression::Codec; + use crate::compression::{Codec, ZstdLevel}; use crate::errors::Result; /// Codec for Zstandard compression algorithm. - pub struct ZSTDCodec {} + pub struct ZSTDCodec { + level: ZstdLevel, + } impl ZSTDCodec { /// Creates new Zstandard compression codec. 
- pub(crate) fn new() -> Self { - Self {} + pub(crate) fn new(level: ZstdLevel) -> Self { + Self { level } } } - /// Compression level (1-21) for ZSTD. Choose 1 here for better compression speed. - const ZSTD_COMPRESSION_LEVEL: i32 = 1; - impl Codec for ZSTDCodec { fn decompress( &mut self, @@ -388,7 +475,7 @@ mod zstd_codec { } fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { - let mut encoder = zstd::Encoder::new(output_buf, ZSTD_COMPRESSION_LEVEL)?; + let mut encoder = zstd::Encoder::new(output_buf, self.level.0)?; encoder.write_all(input_buf)?; match encoder.finish() { Ok(_) => Ok(()), @@ -400,6 +487,37 @@ mod zstd_codec { #[cfg(any(feature = "zstd", test))] pub use zstd_codec::*; +/// Represents a valid zstd compression level. +#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)] +pub struct ZstdLevel(i32); + +impl CompressionLevel for ZstdLevel { + // zstd binds to C, and hence zstd::compression_level_range() is not const as this calls the + // underlying C library. + const MINIMUM_LEVEL: i32 = 1; + const MAXIMUM_LEVEL: i32 = 22; +} + +impl ZstdLevel { + /// Attempts to create a zstd compression level from a given compression level. + /// + /// Compression levels must be valid (i.e. be acceptable for [`zstd::compression_level_range`]). + pub fn try_new(level: i32) -> Result { + Self::is_valid_level(level).map(|_| Self(level)) + } + + /// Returns the compression level. + pub fn compression_level(&self) -> i32 { + self.0 + } +} + +impl Default for ZstdLevel { + fn default() -> Self { + Self(1) + } +} + #[cfg(any(feature = "lz4", test))] mod lz4_raw_codec { use crate::compression::Codec; @@ -647,7 +765,8 @@ mod lz4_hadoop_codec { let compressed_size = compressed_size as u32; let uncompressed_size = input_buf.len() as u32; output_buf[..SIZE_U32].copy_from_slice(&uncompressed_size.to_be_bytes()); - output_buf[SIZE_U32..PREFIX_LEN].copy_from_slice(&compressed_size.to_be_bytes()); + output_buf[SIZE_U32..PREFIX_LEN] + .copy_from_slice(&compressed_size.to_be_bytes()); Ok(()) } @@ -742,14 +861,20 @@ mod tests { #[test] fn test_codec_gzip() { - test_codec_with_size(CodecType::GZIP); - test_codec_without_size(CodecType::GZIP); + for level in GzipLevel::MINIMUM_LEVEL..=GzipLevel::MAXIMUM_LEVEL { + let level = GzipLevel::try_new(level).unwrap(); + test_codec_with_size(CodecType::GZIP(level)); + test_codec_without_size(CodecType::GZIP(level)); + } } #[test] fn test_codec_brotli() { - test_codec_with_size(CodecType::BROTLI); - test_codec_without_size(CodecType::BROTLI); + for level in BrotliLevel::MINIMUM_LEVEL..=BrotliLevel::MAXIMUM_LEVEL { + let level = BrotliLevel::try_new(level).unwrap(); + test_codec_with_size(CodecType::BROTLI(level)); + test_codec_without_size(CodecType::BROTLI(level)); + } } #[test] @@ -759,8 +884,11 @@ mod tests { #[test] fn test_codec_zstd() { - test_codec_with_size(CodecType::ZSTD); - test_codec_without_size(CodecType::ZSTD); + for level in ZstdLevel::MINIMUM_LEVEL..=ZstdLevel::MAXIMUM_LEVEL { + let level = ZstdLevel::try_new(level).unwrap(); + test_codec_with_size(CodecType::ZSTD(level)); + test_codec_without_size(CodecType::ZSTD(level)); + } } #[test] diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 2ce0050c938e..1d6f38dcd3c4 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -937,7 +937,7 @@ mod tests { )])) // global column settings .set_encoding(Encoding::DELTA_BINARY_PACKED) - .set_compression(Compression::GZIP) + .set_compression(Compression::GZIP(Default::default())) 
.set_dictionary_enabled(false) .set_statistics_enabled(EnabledStatistics::None) .set_max_statistics_size(50) @@ -972,7 +972,10 @@ mod tests { props.encoding(&ColumnPath::from("a")), Some(Encoding::DELTA_BINARY_PACKED) ); - assert_eq!(props.compression(&ColumnPath::from("a")), Compression::GZIP); + assert_eq!( + props.compression(&ColumnPath::from("a")), + Compression::GZIP(Default::default()) + ); assert!(!props.dictionary_enabled(&ColumnPath::from("a"))); assert_eq!( props.statistics_enabled(&ColumnPath::from("a")), @@ -1004,7 +1007,7 @@ mod tests { fn test_writer_properties_builder_partial_defaults() { let props = WriterProperties::builder() .set_encoding(Encoding::DELTA_BINARY_PACKED) - .set_compression(Compression::GZIP) + .set_compression(Compression::GZIP(Default::default())) .set_bloom_filter_enabled(true) .set_column_encoding(ColumnPath::from("col"), Encoding::RLE) .build(); @@ -1015,7 +1018,7 @@ mod tests { ); assert_eq!( props.compression(&ColumnPath::from("col")), - Compression::GZIP + Compression::GZIP(Default::default()) ); assert_eq!( props.dictionary_enabled(&ColumnPath::from("col")), From 047d699525ee28d3fa499274f3ff52efca721c47 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 Mar 2023 14:28:40 +0000 Subject: [PATCH 0683/1411] Update quick-xml requirement from 0.27.0 to 0.28.0 (#3857) Updates the requirements on [quick-xml](https://github.com/tafia/quick-xml) to permit the latest version. - [Release notes](https://github.com/tafia/quick-xml/releases) - [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md) - [Commits](https://github.com/tafia/quick-xml/compare/v0.27.0...v0.28.0) --- updated-dependencies: - dependency-name: quick-xml dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- object_store/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index c6bb7e855785..a385886e144e 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -43,7 +43,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.21", default-features = false, features = ["std"], optional = true } -quick-xml = { version = "0.27.0", features = ["serialize"], optional = true } +quick-xml = { version = "0.28.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From 488b7bab9fa2b0b615639f6a7dd73b4acb794e1e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 14 Mar 2023 18:52:22 +0000 Subject: [PATCH 0684/1411] Parse timestamps with arbitrary seconds fraction (#3858) * Parse timestamps with arbitrary seconds fraction * Review feedback --- arrow-cast/src/parse.rs | 98 ++++++++++++++++++++++++++++------------- 1 file changed, 67 insertions(+), 31 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 38fb4fc29934..36bc8777c43d 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -23,6 +23,14 @@ use arrow_schema::ArrowError; use chrono::prelude::*; use std::str::FromStr; +#[inline] +fn parse_nanos(digits: &[u8]) -> u32 { + digits[..N] + .iter() + .fold(0_u32, |acc, v| acc * 10 + *v as u32) + * 10_u32.pow((9 - N) as _) +} + /// Helper for parsing timestamps struct TimestampParser { /// The timestamp bytes to parse minus `b'0'` @@ -89,34 +97,23 @@ impl TimestampParser { let second = self.digits[17] * 10 + self.digits[18]; let time = NaiveTime::from_hms_opt(hour as _, minute as _, second as _)?; - let millis = || { - self.digits[20] as u32 * 100_000_000 - + self.digits[21] as u32 * 10_000_000 - + self.digits[22] as u32 * 1_000_000 - }; - - let micros = || { - self.digits[23] as u32 * 100_000 - + self.digits[24] as u32 * 10_000 - + self.digits[25] as u32 * 1_000 - }; - - let nanos = || { - self.digits[26] as u32 * 100 - + self.digits[27] as u32 * 10 - + self.digits[28] as u32 - }; - match self.test(19, b'.') { - true => match (self.mask >> 20).trailing_ones() { - 3 => Some((time.with_nanosecond(millis())?, 23)), - 6 => Some((time.with_nanosecond(millis() + micros())?, 26)), - 9 => Some(( - time.with_nanosecond(millis() + micros() + nanos())?, - 29, - )), - _ => None, - }, + true => { + let digits = (self.mask >> 20).trailing_ones(); + let nanos = match digits { + 0 => return None, + 1 => parse_nanos::<1>(&self.digits[20..21]), + 2 => parse_nanos::<2>(&self.digits[20..22]), + 3 => parse_nanos::<3>(&self.digits[20..23]), + 4 => parse_nanos::<4>(&self.digits[20..24]), + 5 => parse_nanos::<5>(&self.digits[20..25]), + 6 => parse_nanos::<6>(&self.digits[20..26]), + 7 => parse_nanos::<7>(&self.digits[20..27]), + 8 => parse_nanos::<8>(&self.digits[20..28]), + _ => parse_nanos::<9>(&self.digits[20..29]), + }; + Some((time.with_nanosecond(nanos)?, 20 + digits as usize)) + } false => Some((time, 19)), } } @@ -195,8 +192,16 @@ pub fn string_to_datetime( return Err(err("invalid timestamp separator")); } - let (time, tz_offset) = 
parser.time().ok_or_else(|| err("error parsing time"))?; + let (time, mut tz_offset) = parser.time().ok_or_else(|| err("error parsing time"))?; let datetime = date.and_time(time); + + if tz_offset == 32 { + // Decimal overrun + while bytes[tz_offset].is_ascii_digit() && tz_offset < bytes.len() { + tz_offset += 1; + } + } + if bytes.len() <= tz_offset { let offset = timezone.offset_from_local_datetime(&datetime); let offset = offset @@ -983,6 +988,38 @@ mod tests { ) } + #[test] + fn string_to_timestamp_chrono() { + let cases = [ + "2020-09-08T13:42:29Z", + "1969-01-01T00:00:00.1Z", + "2020-09-08T12:00:12.12345678+00:00", + "2020-09-08T12:00:12+00:00", + "2020-09-08T12:00:12.1+00:00", + "2020-09-08T12:00:12.12+00:00", + "2020-09-08T12:00:12.123+00:00", + "2020-09-08T12:00:12.1234+00:00", + "2020-09-08T12:00:12.12345+00:00", + "2020-09-08T12:00:12.123456+00:00", + "2020-09-08T12:00:12.1234567+00:00", + "2020-09-08T12:00:12.12345678+00:00", + "2020-09-08T12:00:12.123456789+00:00", + "2020-09-08T12:00:12.12345678912z", + "2020-09-08T12:00:12.123456789123Z", + "2020-09-08T12:00:12.123456789123+02:00", + "2020-09-08T12:00:12.12345678912345Z", + "2020-09-08T12:00:12.1234567891234567+02:00", + ]; + + for case in cases { + let chrono = DateTime::parse_from_rfc3339(case).unwrap(); + let chrono_utc = chrono.with_timezone(&Utc); + + let custom = string_to_datetime(&Utc, case).unwrap(); + assert_eq!(chrono_utc, custom) + } + } + #[test] fn string_to_timestamp_invalid() { // Test parsing invalid formats @@ -1002,11 +1039,10 @@ mod tests { ("2015-01-20T25:35:20-08:00", "error parsing time"), ("1997-01-10T09:61:56.123Z", "error parsing time"), ("1997-01-10T09:61:90.123Z", "error parsing time"), - ("1997-01-10T12:00:56.12Z", "error parsing time"), - ("1997-01-10T12:00:56.1234Z", "error parsing time"), - ("1997-01-10T12:00:56.12345Z", "error parsing time"), ("1997-01-10T12:00:6.123Z", "error parsing time"), ("1997-01-31T092656.123Z", "error parsing time"), + ("1997-01-10T12:00:06.", "error parsing time"), + ("1997-01-10T12:00:06. 
", "error parsing time"), ]; for (s, ctx) in cases { From 11a8ed9a49a9311c0539d584e08c1a7515b1cac3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 15 Mar 2023 15:19:17 +0000 Subject: [PATCH 0685/1411] Parse timestamps with leap seconds (#3861) (#3862) * Parse timestamps with leap seconds (#3861) * Handle lower case timestamp separator (#3863) --- arrow-cast/src/parse.rs | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 36bc8777c43d..23c2642e765e 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -89,13 +89,21 @@ impl TimestampParser { /// /// Returning the end byte offset fn time(&self) -> Option<(NaiveTime, usize)> { + // Make a NaiveTime handling leap seconds + let time = |hour, min, sec, nano| match sec { + 60 => { + let nano = 1_000_000_000 + nano; + NaiveTime::from_hms_nano_opt(hour as _, min as _, 59, nano) + } + _ => NaiveTime::from_hms_nano_opt(hour as _, min as _, sec as _, nano), + }; + match (self.mask >> 11) & 0b11111111 { // 09:26:56 0b11011011 if self.test(13, b':') && self.test(16, b':') => { let hour = self.digits[11] * 10 + self.digits[12]; let minute = self.digits[14] * 10 + self.digits[15]; let second = self.digits[17] * 10 + self.digits[18]; - let time = NaiveTime::from_hms_opt(hour as _, minute as _, second as _)?; match self.test(19, b'.') { true => { @@ -112,9 +120,9 @@ impl TimestampParser { 8 => parse_nanos::<8>(&self.digits[20..28]), _ => parse_nanos::<9>(&self.digits[20..29]), }; - Some((time.with_nanosecond(nanos)?, 20 + digits as usize)) + Some((time(hour, minute, second, nanos)?, 20 + digits as usize)) } - false => Some((time, 19)), + false => Some((time(hour, minute, second, 0)?, 19)), } } // 092656 @@ -122,7 +130,7 @@ impl TimestampParser { let hour = self.digits[11] * 10 + self.digits[12]; let minute = self.digits[13] * 10 + self.digits[14]; let second = self.digits[15] * 10 + self.digits[16]; - let time = NaiveTime::from_hms_opt(hour as _, minute as _, second as _)?; + let time = time(hour, minute, second, 0)?; Some((time, 17)) } _ => None, @@ -188,7 +196,7 @@ pub fn string_to_datetime( return Ok(DateTime::from_local(date.and_time(time), offset)); } - if !parser.test(10, b'T') && !parser.test(10, b' ') { + if !parser.test(10, b'T') && !parser.test(10, b't') && !parser.test(10, b' ') { return Err(err("invalid timestamp separator")); } @@ -1009,6 +1017,14 @@ mod tests { "2020-09-08T12:00:12.123456789123+02:00", "2020-09-08T12:00:12.12345678912345Z", "2020-09-08T12:00:12.1234567891234567+02:00", + "2020-09-08T12:00:60Z", + "2020-09-08T12:00:60.123Z", + "2020-09-08T12:00:60.123456+02:00", + "2020-09-08T12:00:60.1234567891234567+02:00", + "2020-09-08T12:00:60.999999999+02:00", + "2020-09-08t12:00:12.12345678+00:00", + "2020-09-08t12:00:12+00:00", + "2020-09-08t12:00:12Z", ]; for case in cases { From 3a8636e55e6eecab021e178328f737eaac473e46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Metehan=20Y=C4=B1ld=C4=B1r=C4=B1m?= <100111937+metesynnada@users.noreply.github.com> Date: Wed, 15 Mar 2023 21:40:29 +0300 Subject: [PATCH 0686/1411] Supporting metadata fetch without open file read mode (#3868) * Initial implementation * Formatting and test timeout. 
* Clippy issue * Fmt issue * Update object_store/Cargo.toml Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update object_store/src/local.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Fmt --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/Cargo.toml | 3 +++ object_store/src/local.rs | 47 +++++++++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index a385886e144e..317087241a30 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -62,6 +62,9 @@ tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-ut [target.'cfg(target_arch = "wasm32")'.dependencies] tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } +[target.'cfg(target_family="unix")'.dev-dependencies] +nix = "0.26.1" + [features] cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] diff --git a/object_store/src/local.rs b/object_store/src/local.rs index ac0b02070d5e..9e710c28c072 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -400,13 +400,20 @@ impl ObjectStore for LocalFileSystem { let location = location.clone(); maybe_spawn_blocking(move || { - let file = open_file(&path)?; - let metadata = - file.metadata().map_err(|e| Error::UnableToAccessMetadata { - source: e.into(), - path: location.to_string(), - })?; - + let metadata = match metadata(&path) { + Err(e) => Err(if e.kind() == ErrorKind::NotFound { + Error::NotFound { + path: path.clone(), + source: e, + } + } else { + Error::UnableToAccessMetadata { + source: e.into(), + path: location.to_string(), + } + }), + Ok(m) => Ok(m), + }?; convert_metadata(metadata, location) }) .await @@ -1442,3 +1449,29 @@ mod not_wasm_tests { assert_eq!(&*read_data, expected_data); } } + +#[cfg(target_family = "unix")] +#[cfg(test)] +mod unix_test { + use crate::local::LocalFileSystem; + use crate::{ObjectStore, Path}; + use nix::sys::stat; + use nix::unistd; + use std::time::Duration; + use tempfile::TempDir; + use tokio::time::timeout; + + #[tokio::test] + async fn test_head_fifo() { + let filename = "some_file"; + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + unistd::mkfifo(&root.path().join(filename), stat::Mode::S_IRWXU).unwrap(); + let location = Path::from(filename); + if (timeout(Duration::from_millis(10), integration.head(&location)).await) + .is_err() + { + panic!("Did not receive value within 10 ms"); + } + } +} From b991aee98fc0766cd7ea741d0facb12240d8b5ee Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 15 Mar 2023 18:44:08 +0000 Subject: [PATCH 0687/1411] Rename PrefixObjectStore to PrefixStore (#3870) --- object_store/src/prefix.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index d61fc22271a2..c3a0ebd1b787 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -27,21 +27,25 @@ use crate::{ Result as ObjectStoreResult, }; +#[doc(hidden)] +#[deprecated(note = "Use PrefixStore")] +pub type PrefixObjectStore = PrefixStore; + /// Store wrapper that applies a constant prefix to all paths handled by the store. 
#[derive(Debug, Clone)] -pub struct PrefixObjectStore { +pub struct PrefixStore { prefix: Path, inner: T, } -impl std::fmt::Display for PrefixObjectStore { +impl std::fmt::Display for PrefixStore { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "PrefixObjectStore({})", self.prefix.as_ref()) } } -impl PrefixObjectStore { - /// Create a new instance of [`PrefixObjectStore`] +impl PrefixStore { + /// Create a new instance of [`PrefixStore`] pub fn new(store: T, prefix: impl Into) -> Self { Self { prefix: prefix.into(), @@ -61,7 +65,7 @@ impl PrefixObjectStore { } #[async_trait::async_trait] -impl ObjectStore for PrefixObjectStore { +impl ObjectStore for PrefixStore { /// Save the provided bytes to the specified location. async fn put(&self, location: &Path, bytes: Bytes) -> ObjectStoreResult<()> { let full_path = self.full_path(location); @@ -221,7 +225,7 @@ mod tests { async fn prefix_test() { let root = TempDir::new().unwrap(); let inner = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - let integration = PrefixObjectStore::new(inner, "prefix"); + let integration = PrefixStore::new(inner, "prefix"); put_get_delete_list(&integration).await; list_uses_directories_correctly(&integration).await; @@ -242,7 +246,7 @@ mod tests { local.put(&location, data).await.unwrap(); - let prefix = PrefixObjectStore::new(local, "prefix"); + let prefix = PrefixStore::new(local, "prefix"); let location_prefix = Path::from("test_file.json"); let content_list = flatten_list_stream(&prefix, None).await.unwrap(); From 1d5e08e99f95b01419ad8b306a66fc7be2bcc949 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 15 Mar 2023 19:12:13 +0000 Subject: [PATCH 0688/1411] Implement append for LimitStore, PrefixObjectStore, ThrottledStore (#3869) --- object_store/src/limit.rs | 9 +++++++ object_store/src/prefix.rs | 48 +++++++++++++++++------------------- object_store/src/throttle.rs | 7 ++++++ 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index 09c88aa2a4bc..b3e55a918b9a 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -95,6 +95,15 @@ impl ObjectStore for LimitStore { self.inner.abort_multipart(location, multipart_id).await } + async fn append( + &self, + location: &Path, + ) -> Result> { + let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); + let write = self.inner.append(location).await?; + Ok(Box::new(PermitWrapper::new(write, permit))) + } + async fn get(&self, location: &Path) -> Result { let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); match self.inner.get(location).await? { diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index c3a0ebd1b787..7e7e7167bd0b 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -66,20 +66,24 @@ impl PrefixStore { #[async_trait::async_trait] impl ObjectStore for PrefixStore { - /// Save the provided bytes to the specified location. async fn put(&self, location: &Path, bytes: Bytes) -> ObjectStoreResult<()> { let full_path = self.full_path(location); self.inner.put(&full_path, bytes).await } - /// Return the bytes that are stored at the specified location. 
+ async fn append( + &self, + location: &Path, + ) -> ObjectStoreResult> { + let full_path = self.full_path(location); + self.inner.append(&full_path).await + } + async fn get(&self, location: &Path) -> ObjectStoreResult { let full_path = self.full_path(location); self.inner.get(&full_path).await } - /// Return the bytes that are stored at the specified location - /// in the given byte range async fn get_range( &self, location: &Path, @@ -89,7 +93,15 @@ impl ObjectStore for PrefixStore { self.inner.get_range(&full_path, range).await } - /// Return the metadata for the specified location + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> ObjectStoreResult> { + let full_path = self.full_path(location); + self.inner.get_ranges(&full_path, ranges).await + } + async fn head(&self, location: &Path) -> ObjectStoreResult { let full_path = self.full_path(location); self.inner.head(&full_path).await.map(|meta| ObjectMeta { @@ -99,16 +111,11 @@ impl ObjectStore for PrefixStore { }) } - /// Delete the object at the specified location. async fn delete(&self, location: &Path) -> ObjectStoreResult<()> { let full_path = self.full_path(location); self.inner.delete(&full_path).await } - /// List all the objects with the given prefix. - /// - /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of - /// `foo/bar_baz/x`. async fn list( &self, prefix: Option<&Path>, @@ -125,12 +132,6 @@ impl ObjectStore for PrefixStore { .boxed()) } - /// List objects with the given prefix and an implementation specific - /// delimiter. Returns common prefixes (directories) in addition to object - /// metadata. - /// - /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of - /// `foo/bar_baz/x`. async fn list_with_delimiter( &self, prefix: Option<&Path>, @@ -160,27 +161,24 @@ impl ObjectStore for PrefixStore { }) } - /// Copy an object from one path to another in the same object store. - /// - /// If there exists an object at the destination, it will be overwritten. async fn copy(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { let full_from = self.full_path(from); let full_to = self.full_path(to); self.inner.copy(&full_from, &full_to).await } - /// Copy an object from one path to another, only if destination is empty. - /// - /// Will return an error if the destination already has an object. async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { let full_from = self.full_path(from); let full_to = self.full_path(to); self.inner.copy_if_not_exists(&full_from, &full_to).await } - /// Move an object from one path to another in the same object store. - /// - /// Will return an error if the destination already has an object. 
+ async fn rename(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + let full_from = self.full_path(from); + let full_to = self.full_path(to); + self.inner.rename(&full_from, &full_to).await + } + async fn rename_if_not_exists( &self, from: &Path, diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index 90f427cc2651..6dff64aab69c 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -166,6 +166,13 @@ impl ObjectStore for ThrottledStore { Err(super::Error::NotImplemented) } + async fn append( + &self, + _location: &Path, + ) -> Result> { + Err(super::Error::NotImplemented) + } + async fn get(&self, location: &Path) -> Result { sleep(self.config().wait_get_per_call).await; From f4ac4e403ff2d7b1b10d5eb274c68add1af91383 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 16 Mar 2023 00:38:11 -0700 Subject: [PATCH 0689/1411] Allow precision loss on multiplying decimal arrays (#3690) * Add multiply_decimal. * Fix scale for multiple value * Fix * Add doc * Update * Rename to mul_fixed_point_checked * For review * More * Update arrow-arith/src/arithmetic.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-arith/src/arithmetic.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-arith/src/arithmetic.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-arith/src/arithmetic.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-arith/src/arithmetic.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-arith/src/arithmetic.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-arith/src/arithmetic.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-arith/src/arithmetic.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-arith/src/arithmetic.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Fix --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-arith/src/arithmetic.rs | 171 ++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 00375d32a677..8e2b7915357a 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -26,8 +26,11 @@ use crate::arity::*; use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; +use arrow_buffer::i256; +use arrow_buffer::ArrowNativeType; use arrow_schema::*; use num::traits::Pow; +use std::cmp::min; use std::sync::Arc; /// Helper function to perform math lambda function on values from two arrays. If either @@ -1165,6 +1168,77 @@ pub fn multiply_dyn_checked( } } +/// Perform `left * right` operation on two decimal arrays. If either left or right value is +/// null then the result is also null. +/// +/// This performs decimal multiplication which allows precision loss if an exact representation +/// is not possible for the result, according to the required scale. In the case, the result +/// will be rounded to the required scale. +/// +/// If the required scale is greater than the product scale, an error is returned. 
+/// +/// It is implemented for compatibility with precision loss `multiply` function provided by +/// other data processing engines. For multiplication with precision loss detection, use +/// `multiply` or `multiply_checked` instead. +pub fn multiply_fixed_point_checked( + left: &PrimitiveArray, + right: &PrimitiveArray, + required_scale: i8, +) -> Result, ArrowError> { + let product_scale = left.scale() + right.scale(); + let precision = min( + left.precision() + right.precision() + 1, + DECIMAL128_MAX_PRECISION, + ); + + if required_scale == product_scale { + return multiply_checked(left, right)? + .with_precision_and_scale(precision, required_scale); + } + + if required_scale > product_scale { + return Err(ArrowError::ComputeError(format!( + "Required scale {} is greater than product scale {}", + required_scale, product_scale + ))); + } + + let divisor = + i256::from_i128(10).pow_wrapping((product_scale - required_scale) as u32); + + try_binary::<_, _, _, Decimal128Type>(left, right, |a, b| { + let a = i256::from_i128(a); + let b = i256::from_i128(b); + + let mut mul = a.wrapping_mul(b); + mul = divide_and_round::(mul, divisor); + mul.to_i128().ok_or_else(|| { + ArrowError::ComputeError(format!("Overflow happened on: {:?} * {:?}", a, b)) + }) + }) + .and_then(|a| a.with_precision_and_scale(precision, required_scale)) +} + +/// Divide a decimal native value by given divisor and round the result. +fn divide_and_round(input: I::Native, div: I::Native) -> I::Native +where + I: DecimalType, + I::Native: ArrowNativeTypeOp, +{ + let d = input.div_wrapping(div); + let r = input.mod_wrapping(div); + + let half = div.div_wrapping(I::Native::from_usize(2).unwrap()); + let half_neg = half.neg_wrapping(); + + // Round result + match input >= I::Native::ZERO { + true if r >= half => d.add_wrapping(I::Native::ONE), + false if r <= half_neg => d.sub_wrapping(I::Native::ONE), + _ => d, + } +} + /// Multiply every value in an array by a scalar. If any value in the array is null then the /// result is also null. /// @@ -3231,4 +3305,101 @@ mod tests { assert_eq!(&expected, &result); } + + #[test] + fn test_decimal_multiply_allow_precision_loss() { + // Overflow happening as i128 cannot hold multiplying result. + // [123456789] + let a = Decimal128Array::from(vec![123456789000000000000000000]) + .with_precision_and_scale(38, 18) + .unwrap(); + + // [10] + let b = Decimal128Array::from(vec![10000000000000000000]) + .with_precision_and_scale(38, 18) + .unwrap(); + + let err = multiply_dyn_checked(&a, &b).unwrap_err(); + assert!(err.to_string().contains( + "Overflow happened on: 123456789000000000000000000 * 10000000000000000000" + )); + + // Allow precision loss. 
+ let result = multiply_fixed_point_checked(&a, &b, 28).unwrap(); + // [1234567890] + let expected = + Decimal128Array::from(vec![12345678900000000000000000000000000000]) + .with_precision_and_scale(38, 28) + .unwrap(); + + assert_eq!(&expected, &result); + assert_eq!( + result.value_as_string(0), + "1234567890.0000000000000000000000000000" + ); + + // Rounding case + // [0.000000000000000001, 123456789.555555555555555555, 1.555555555555555555] + let a = Decimal128Array::from(vec![ + 1, + 123456789555555555555555555, + 1555555555555555555, + ]) + .with_precision_and_scale(38, 18) + .unwrap(); + + // [1.555555555555555555, 11.222222222222222222, 0.000000000000000001] + let b = Decimal128Array::from(vec![1555555555555555555, 11222222222222222222, 1]) + .with_precision_and_scale(38, 18) + .unwrap(); + + let result = multiply_fixed_point_checked(&a, &b, 28).unwrap(); + // [ + // 0.0000000000000000015555555556, + // 1385459527.2345679012071330528765432099, + // 0.0000000000000000015555555556 + // ] + let expected = Decimal128Array::from(vec![ + 15555555556, + 13854595272345679012071330528765432099, + 15555555556, + ]) + .with_precision_and_scale(38, 28) + .unwrap(); + + assert_eq!(&expected, &result); + + // Rounded the value "1385459527.234567901207133052876543209876543210". + assert_eq!( + result.value_as_string(1), + "1385459527.2345679012071330528765432099" + ); + assert_eq!(result.value_as_string(0), "0.0000000000000000015555555556"); + assert_eq!(result.value_as_string(2), "0.0000000000000000015555555556"); + + let a = Decimal128Array::from(vec![1230]) + .with_precision_and_scale(4, 2) + .unwrap(); + + let b = Decimal128Array::from(vec![1000]) + .with_precision_and_scale(4, 2) + .unwrap(); + + // Required scale is same as the product of the input scales. Behavior is same as multiply. + let result = multiply_fixed_point_checked(&a, &b, 4).unwrap(); + assert_eq!(result.precision(), 9); + assert_eq!(result.scale(), 4); + + let expected = multiply_checked(&a, &b) + .unwrap() + .with_precision_and_scale(9, 4) + .unwrap(); + assert_eq!(&expected, &result); + + // Required scale cannot be larger than the product of the input scales. + let result = multiply_fixed_point_checked(&a, &b, 5).unwrap_err(); + assert!(result + .to_string() + .contains("Required scale 5 is greater than product scale 4")); + } } From b466cc71a46eae12bb49f42bd1f3ef06a2d701e5 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Thu, 16 Mar 2023 13:29:00 +0100 Subject: [PATCH 0690/1411] chore: remove LevelDecode (#3872) --- parquet/src/encodings/levels.rs | 398 +------------------------------- 1 file changed, 3 insertions(+), 395 deletions(-) diff --git a/parquet/src/encodings/levels.rs b/parquet/src/encodings/levels.rs index 0727935c345a..62c3b89db36b 100644 --- a/parquet/src/encodings/levels.rs +++ b/parquet/src/encodings/levels.rs @@ -15,17 +15,13 @@ // specific language governing permissions and limitations // under the License. 
-use std::{cmp, mem}; +use std::mem; -use super::rle::{RleDecoder, RleEncoder}; +use super::rle::RleEncoder; use crate::basic::Encoding; use crate::data_type::AsBytes; -use crate::errors::Result; -use crate::util::{ - bit_util::{ceil, num_required_bits, read_num_bytes, BitReader, BitWriter}, - memory::ByteBufferPtr, -}; +use crate::util::bit_util::{ceil, num_required_bits, BitWriter}; /// Computes max buffer size for level encoder/decoder based on encoding, max /// repetition/definition level and number of total buffered values (includes null @@ -133,391 +129,3 @@ impl LevelEncoder { } } } - -/// Decoder for definition/repetition levels. -/// Currently only supports RLE and BIT_PACKED encoding for Data Page v1 and -/// RLE for Data Page v2. -#[allow(unused)] -pub enum LevelDecoder { - Rle(Option, RleDecoder), - RleV2(Option, RleDecoder), - BitPacked(Option, u8, BitReader), -} - -#[allow(unused)] -impl LevelDecoder { - /// Creates new level decoder based on encoding and max definition/repetition level. - /// This method only initializes level decoder, `set_data` method must be called - /// before reading any value. - /// - /// Used to encode levels for Data Page v1. - /// - /// Panics if encoding is not supported - pub fn v1(encoding: Encoding, max_level: i16) -> Self { - let bit_width = num_required_bits(max_level as u64); - match encoding { - Encoding::RLE => LevelDecoder::Rle(None, RleDecoder::new(bit_width)), - Encoding::BIT_PACKED => { - LevelDecoder::BitPacked(None, bit_width, BitReader::from(Vec::new())) - } - _ => panic!("Unsupported encoding type {encoding}"), - } - } - - /// Creates new level decoder based on RLE encoding. - /// Used to decode Data Page v2 repetition and definition levels. - /// - /// To set data for this decoder, use `set_data_range` method. - pub fn v2(max_level: i16) -> Self { - let bit_width = num_required_bits(max_level as u64); - LevelDecoder::RleV2(None, RleDecoder::new(bit_width)) - } - - /// Sets data for this level decoder, and returns total number of bytes set. - /// This is used for Data Page v1 levels. - /// - /// `data` is encoded data as byte buffer, `num_buffered_values` represents total - /// number of values that is expected. - /// - /// Both RLE and BIT_PACKED level decoders set `num_buffered_values` as total number - /// of values that they can return and track num values. - #[inline] - pub fn set_data(&mut self, num_buffered_values: usize, data: ByteBufferPtr) -> usize { - match *self { - LevelDecoder::Rle(ref mut num_values, ref mut decoder) => { - *num_values = Some(num_buffered_values); - let i32_size = mem::size_of::(); - let data_size = read_num_bytes::(i32_size, data.as_ref()) as usize; - decoder.set_data(data.range(i32_size, data_size)); - i32_size + data_size - } - LevelDecoder::BitPacked(ref mut num_values, bit_width, ref mut decoder) => { - *num_values = Some(num_buffered_values); - // Set appropriate number of bytes: if max size is larger than buffer - - // set full buffer - let num_bytes = - ceil((num_buffered_values * bit_width as usize) as i64, 8); - let data_size = cmp::min(num_bytes as usize, data.len()); - decoder.reset(data.range(0, data_size)); - data_size - } - _ => panic!(), - } - } - - /// Sets byte array explicitly when start position `start` and length `len` are known - /// in advance. Only supported by RLE level decoder and used for Data Page v2 levels. - /// Returns number of total bytes set for this decoder (len). 
- #[inline] - pub fn set_data_range( - &mut self, - num_buffered_values: usize, - data: &ByteBufferPtr, - start: usize, - len: usize, - ) -> usize { - match *self { - LevelDecoder::RleV2(ref mut num_values, ref mut decoder) => { - decoder.set_data(data.range(start, len)); - *num_values = Some(num_buffered_values); - len - } - _ => panic!( - "set_data_range() method is only supported by RLE v2 encoding type" - ), - } - } - - /// Returns true if data is set for decoder, false otherwise. - #[inline] - pub fn is_data_set(&self) -> bool { - match self { - LevelDecoder::Rle(ref num_values, _) => num_values.is_some(), - LevelDecoder::RleV2(ref num_values, _) => num_values.is_some(), - LevelDecoder::BitPacked(ref num_values, ..) => num_values.is_some(), - } - } - - /// Decodes values and puts them into `buffer`. - /// Returns number of values that were successfully decoded (less than or equal to - /// buffer length). - #[inline] - pub fn get(&mut self, buffer: &mut [i16]) -> Result { - assert!(self.is_data_set(), "No data set for decoding"); - match *self { - LevelDecoder::Rle(ref mut num_values, ref mut decoder) - | LevelDecoder::RleV2(ref mut num_values, ref mut decoder) => { - // Max length we can read - let len = cmp::min(num_values.unwrap(), buffer.len()); - let values_read = decoder.get_batch::(&mut buffer[0..len])?; - *num_values = num_values.map(|len| len - values_read); - Ok(values_read) - } - LevelDecoder::BitPacked(ref mut num_values, bit_width, ref mut decoder) => { - // When extracting values from bit reader, it might return more values - // than left because of padding to a full byte, we use - // num_values to track precise number of values. - let len = cmp::min(num_values.unwrap(), buffer.len()); - let values_read = - decoder.get_batch::(&mut buffer[..len], bit_width as usize); - *num_values = num_values.map(|len| len - values_read); - Ok(values_read) - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::util::test_common::rand_gen::random_numbers_range; - - fn test_internal_roundtrip(enc: Encoding, levels: &[i16], max_level: i16, v2: bool) { - let mut encoder = if v2 { - LevelEncoder::v2(max_level, levels.len()) - } else { - LevelEncoder::v1(enc, max_level, levels.len()) - }; - encoder.put(levels); - let encoded_levels = encoder.consume(); - - let byte_buf = ByteBufferPtr::new(encoded_levels); - let mut decoder; - if v2 { - decoder = LevelDecoder::v2(max_level); - decoder.set_data_range(levels.len(), &byte_buf, 0, byte_buf.len()); - } else { - decoder = LevelDecoder::v1(enc, max_level); - decoder.set_data(levels.len(), byte_buf); - }; - - let mut buffer = vec![0; levels.len()]; - let num_decoded = decoder.get(&mut buffer).expect("get() should be OK"); - assert_eq!(num_decoded, levels.len()); - assert_eq!(buffer, levels); - } - - // Performs incremental read until all bytes are read - fn test_internal_roundtrip_incremental( - enc: Encoding, - levels: &[i16], - max_level: i16, - v2: bool, - ) { - let mut encoder = if v2 { - LevelEncoder::v2(max_level, levels.len()) - } else { - LevelEncoder::v1(enc, max_level, levels.len()) - }; - encoder.put(levels); - let encoded_levels = encoder.consume(); - - let byte_buf = ByteBufferPtr::new(encoded_levels); - let mut decoder; - if v2 { - decoder = LevelDecoder::v2(max_level); - decoder.set_data_range(levels.len(), &byte_buf, 0, byte_buf.len()); - } else { - decoder = LevelDecoder::v1(enc, max_level); - decoder.set_data(levels.len(), byte_buf); - } - - let mut buffer = vec![0; levels.len() * 2]; - let mut total_decoded = 0; 
- let mut safe_stop = levels.len() * 2; // still terminate in case of issues in the code - while safe_stop > 0 { - safe_stop -= 1; - let num_decoded = decoder - .get(&mut buffer[total_decoded..total_decoded + 1]) - .expect("get() should be OK"); - if num_decoded == 0 { - break; - } - total_decoded += num_decoded; - } - assert!( - safe_stop > 0, - "Failed to read values incrementally, reached safe stop" - ); - assert_eq!(total_decoded, levels.len()); - assert_eq!(&buffer[0..levels.len()], levels); - } - - // Tests encoding/decoding of values when output buffer is larger than number of - // encoded values - fn test_internal_roundtrip_underflow( - enc: Encoding, - levels: &[i16], - max_level: i16, - v2: bool, - ) { - let mut encoder = if v2 { - LevelEncoder::v2(max_level, levels.len()) - } else { - LevelEncoder::v1(enc, max_level, levels.len()) - }; - // Encode only one value - let num_encoded = encoder.put(&levels[0..1]); - let encoded_levels = encoder.consume(); - assert_eq!(num_encoded, 1); - - let byte_buf = ByteBufferPtr::new(encoded_levels); - let mut decoder; - // Set one encoded value as `num_buffered_values` - if v2 { - decoder = LevelDecoder::v2(max_level); - decoder.set_data_range(1, &byte_buf, 0, byte_buf.len()); - } else { - decoder = LevelDecoder::v1(enc, max_level); - decoder.set_data(1, byte_buf); - } - - let mut buffer = vec![0; levels.len()]; - let num_decoded = decoder.get(&mut buffer).expect("get() should be OK"); - assert_eq!(num_decoded, num_encoded); - assert_eq!(buffer[0..num_decoded], levels[0..num_decoded]); - } - - #[test] - fn test_roundtrip_one() { - let levels = vec![0, 1, 1, 1, 1, 0, 0, 0, 0, 1]; - let max_level = 1; - test_internal_roundtrip(Encoding::RLE, &levels, max_level, false); - test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); - test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); - } - - #[test] - fn test_roundtrip() { - let levels = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; - let max_level = 10; - test_internal_roundtrip(Encoding::RLE, &levels, max_level, false); - test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); - test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); - } - - #[test] - fn test_roundtrip_incremental() { - let levels = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; - let max_level = 10; - test_internal_roundtrip_incremental(Encoding::RLE, &levels, max_level, false); - test_internal_roundtrip_incremental( - Encoding::BIT_PACKED, - &levels, - max_level, - false, - ); - test_internal_roundtrip_incremental(Encoding::RLE, &levels, max_level, true); - } - - #[test] - fn test_roundtrip_all_zeros() { - let levels = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; - let max_level = 1; - test_internal_roundtrip(Encoding::RLE, &levels, max_level, false); - test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); - test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); - } - - #[test] - fn test_roundtrip_random() { - // This test is mainly for bit packed level encoder/decoder - let mut levels = Vec::new(); - let max_level = 5; - random_numbers_range::(120, 0, max_level, &mut levels); - test_internal_roundtrip(Encoding::RLE, &levels, max_level, false); - test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); - test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); - } - - #[test] - fn test_rountrip_max() { - let levels = vec![0, i16::MAX, i16::MAX, i16::MAX, 0]; - let max_level = i16::MAX; - test_internal_roundtrip(Encoding::RLE, &levels, max_level, 
false); - test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); - test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); - } - - #[test] - fn test_roundtrip_underflow() { - let levels = vec![1, 1, 2, 3, 2, 1, 1, 2, 3, 1]; - let max_level = 3; - test_internal_roundtrip_underflow(Encoding::RLE, &levels, max_level, false); - test_internal_roundtrip_underflow( - Encoding::BIT_PACKED, - &levels, - max_level, - false, - ); - test_internal_roundtrip_underflow(Encoding::RLE, &levels, max_level, true); - } - - #[test] - fn test_rle_decoder_set_data_range() { - // Buffer containing both repetition and definition levels - let buffer = ByteBufferPtr::new(vec![5, 198, 2, 5, 42, 168, 10, 0, 2, 3, 36, 73]); - - let max_rep_level = 1; - let mut decoder = LevelDecoder::v2(max_rep_level); - assert_eq!(decoder.set_data_range(10, &buffer, 0, 3), 3); - let mut result = vec![0; 10]; - let num_decoded = decoder.get(&mut result).expect("get() should be OK"); - assert_eq!(num_decoded, 10); - assert_eq!(result, vec![0, 1, 1, 0, 0, 0, 1, 1, 0, 1]); - - let max_def_level = 2; - let mut decoder = LevelDecoder::v2(max_def_level); - assert_eq!(decoder.set_data_range(10, &buffer, 3, 5), 5); - let mut result = vec![0; 10]; - let num_decoded = decoder.get(&mut result).expect("get() should be OK"); - assert_eq!(num_decoded, 10); - assert_eq!(result, vec![2, 2, 2, 0, 0, 2, 2, 2, 2, 2]); - } - - #[test] - #[should_panic( - expected = "set_data_range() method is only supported by RLE v2 encoding type" - )] - fn test_bit_packed_decoder_set_data_range() { - // Buffer containing both repetition and definition levels - let buffer = ByteBufferPtr::new(vec![1, 2, 3, 4, 5]); - let max_level = 1; - let mut decoder = LevelDecoder::v1(Encoding::BIT_PACKED, max_level); - decoder.set_data_range(10, &buffer, 0, 3); - } - - #[test] - fn test_bit_packed_decoder_set_data() { - // Test the maximum size that is assigned based on number of values and buffer - // length - let buffer = ByteBufferPtr::new(vec![1, 2, 3, 4, 5]); - let max_level = 1; - let mut decoder = LevelDecoder::v1(Encoding::BIT_PACKED, max_level); - // This should reset to entire buffer - assert_eq!(decoder.set_data(1024, buffer.all()), buffer.len()); - // This should set smallest num bytes - assert_eq!(decoder.set_data(3, buffer.all()), 1); - } - - #[test] - #[should_panic(expected = "No data set for decoding")] - fn test_rle_level_decoder_get_no_set_data() { - // `get()` normally panics because bit_reader is not set for RLE decoding - // we have explicit check now in set_data - let max_rep_level = 2; - let mut decoder = LevelDecoder::v1(Encoding::RLE, max_rep_level); - let mut buffer = vec![0; 16]; - decoder.get(&mut buffer).unwrap(); - } - - #[test] - #[should_panic(expected = "No data set for decoding")] - fn test_bit_packed_level_decoder_get_no_set_data() { - let max_rep_level = 2; - let mut decoder = LevelDecoder::v1(Encoding::BIT_PACKED, max_rep_level); - let mut buffer = vec![0; 16]; - decoder.get(&mut buffer).unwrap(); - } -} From 0df21883cb7a5f414894a833ae301fcd8d8e464c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 16 Mar 2023 19:40:50 +0000 Subject: [PATCH 0691/1411] Faster time parsing (~93% faster) (#3860) * Faster time parsing * Clippy * Clippy * WIP * Tests and fixes * Review feedback --- arrow-cast/Cargo.toml | 4 + arrow-cast/benches/parse_time.rs | 42 +++++ arrow-cast/src/parse.rs | 277 ++++++++++++++++++++++--------- 3 files changed, 243 insertions(+), 80 deletions(-) 
create mode 100644 arrow-cast/benches/parse_time.rs diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 235dca135e5a..859254c3a81d 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -63,3 +63,7 @@ half = { version = "2.1", default-features = false } [[bench]] name = "parse_timestamp" harness = false + +[[bench]] +name = "parse_time" +harness = false diff --git a/arrow-cast/benches/parse_time.rs b/arrow-cast/benches/parse_time.rs new file mode 100644 index 000000000000..d28b9c7c613d --- /dev/null +++ b/arrow-cast/benches/parse_time.rs @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_cast::parse::string_to_time_nanoseconds; +use criterion::*; + +fn criterion_benchmark(c: &mut Criterion) { + let timestamps = [ + "9:50", + "09:50", + "09:50 PM", + "9:50:12 AM", + "09:50:12 PM", + "09:50:12.123456789", + "9:50:12.123456789", + "09:50:12.123456789 PM", + ]; + + for timestamp in timestamps { + let t = black_box(timestamp); + c.bench_function(t, |b| { + b.iter(|| string_to_time_nanoseconds(t).unwrap()); + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 23c2642e765e..30cebb4bf3d0 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -23,11 +23,12 @@ use arrow_schema::ArrowError; use chrono::prelude::*; use std::str::FromStr; +/// Parse nanoseconds from the first `N` values in digits, subtracting the offset `O` #[inline] -fn parse_nanos(digits: &[u8]) -> u32 { +fn parse_nanos(digits: &[u8]) -> u32 { digits[..N] .iter() - .fold(0_u32, |acc, v| acc * 10 + *v as u32) + .fold(0_u32, |acc, v| acc * 10 + v.wrapping_sub(O) as u32) * 10_u32.pow((9 - N) as _) } @@ -110,15 +111,15 @@ impl TimestampParser { let digits = (self.mask >> 20).trailing_ones(); let nanos = match digits { 0 => return None, - 1 => parse_nanos::<1>(&self.digits[20..21]), - 2 => parse_nanos::<2>(&self.digits[20..22]), - 3 => parse_nanos::<3>(&self.digits[20..23]), - 4 => parse_nanos::<4>(&self.digits[20..24]), - 5 => parse_nanos::<5>(&self.digits[20..25]), - 6 => parse_nanos::<6>(&self.digits[20..26]), - 7 => parse_nanos::<7>(&self.digits[20..27]), - 8 => parse_nanos::<8>(&self.digits[20..28]), - _ => parse_nanos::<9>(&self.digits[20..29]), + 1 => parse_nanos::<1, 0>(&self.digits[20..21]), + 2 => parse_nanos::<2, 0>(&self.digits[20..22]), + 3 => parse_nanos::<3, 0>(&self.digits[20..23]), + 4 => parse_nanos::<4, 0>(&self.digits[20..24]), + 5 => parse_nanos::<5, 0>(&self.digits[20..25]), + 6 => parse_nanos::<6, 0>(&self.digits[20..26]), + 7 => parse_nanos::<7, 0>(&self.digits[20..27]), + 8 => parse_nanos::<8, 0>(&self.digits[20..28]), + _ => parse_nanos::<9, 0>(&self.digits[20..29]), }; Some((time(hour, minute, 
second, nanos)?, 20 + digits as usize)) } @@ -299,79 +300,120 @@ fn to_timestamp_nanos(dt: NaiveDateTime) -> Result { /// This function does not support parsing strings with a timezone /// or offset specified, as it considers only time since midnight. pub fn string_to_time_nanoseconds(s: &str) -> Result { - // colon count, presence of decimal, presence of whitespace - fn preprocess_time_string(string: &str) -> (usize, bool, bool) { - string - .as_bytes() - .iter() - .fold((0, false, false), |tup, char| match char { - b':' => (tup.0 + 1, tup.1, tup.2), - b'.' => (tup.0, true, tup.2), - b' ' => (tup.0, tup.1, true), - _ => tup, - }) + let nt = string_to_time(s).ok_or_else(|| { + ArrowError::ParseError(format!("Failed to parse \'{s}\' as time")) + })?; + Ok(nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64) +} + +fn string_to_time(s: &str) -> Option { + let bytes = s.as_bytes(); + if bytes.len() < 4 { + return None; } - // Do a preprocess pass of the string to prune which formats to attempt parsing for - let formats: &[&str] = match preprocess_time_string(s.trim()) { - // 24-hour clock, with hour, minutes, seconds and fractions of a second specified - // Examples: - // * 09:50:12.123456789 - // * 9:50:12.123456789 - (2, true, false) => &["%H:%M:%S%.f", "%k:%M:%S%.f"], - - // 12-hour clock, with hour, minutes, seconds and fractions of a second specified - // Examples: - // * 09:50:12.123456789 PM - // * 09:50:12.123456789 pm - // * 9:50:12.123456789 AM - // * 9:50:12.123456789 am - (2, true, true) => &[ - "%I:%M:%S%.f %P", - "%I:%M:%S%.f %p", - "%l:%M:%S%.f %P", - "%l:%M:%S%.f %p", - ], - - // 24-hour clock, with hour, minutes and seconds specified - // Examples: - // * 09:50:12 - // * 9:50:12 - (2, false, false) => &["%H:%M:%S", "%k:%M:%S"], - - // 12-hour clock, with hour, minutes and seconds specified - // Examples: - // * 09:50:12 PM - // * 09:50:12 pm - // * 9:50:12 AM - // * 9:50:12 am - (2, false, true) => &["%I:%M:%S %P", "%I:%M:%S %p", "%l:%M:%S %P", "%l:%M:%S %p"], - - // 24-hour clock, with hour and minutes specified - // Examples: - // * 09:50 - // * 9:50 - (1, false, false) => &["%H:%M", "%k:%M"], - - // 12-hour clock, with hour and minutes specified - // Examples: - // * 09:50 PM - // * 09:50 pm - // * 9:50 AM - // * 9:50 am - (1, false, true) => &["%I:%M %P", "%I:%M %p", "%l:%M %P", "%l:%M %p"], - - _ => &[], + let (am, bytes) = match bytes.get(bytes.len() - 3..) { + Some(b" AM" | b" am" | b" Am" | b" aM") => { + (Some(true), &bytes[..bytes.len() - 3]) + } + Some(b" PM" | b" pm" | b" pM" | b" Pm") => { + (Some(false), &bytes[..bytes.len() - 3]) + } + _ => (None, bytes), }; - formats - .iter() - .find_map(|f| NaiveTime::parse_from_str(s, f).ok()) - .map(|nt| { - nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64 - }) - // Return generic error if failed to parse as unknown which format user intended for the string - .ok_or_else(|| ArrowError::CastError(format!("Error parsing '{s}' as time"))) + if bytes.len() < 4 { + return None; + } + + let mut digits = [b'0'; 6]; + + // Extract hour + let bytes = match (bytes[1], bytes[2]) { + (b':', _) => { + digits[1] = bytes[0]; + &bytes[2..] + } + (_, b':') => { + digits[0] = bytes[0]; + digits[1] = bytes[1]; + &bytes[3..] 
+ } + _ => return None, + }; + + if bytes.len() < 2 { + return None; // Minutes required + } + + // Extract minutes + digits[2] = bytes[0]; + digits[3] = bytes[1]; + + let nanoseconds = match bytes.get(2) { + Some(b':') => { + if bytes.len() < 5 { + return None; + } + + // Extract seconds + digits[4] = bytes[3]; + digits[5] = bytes[4]; + + // Extract sub-seconds if any + match bytes.get(5) { + Some(b'.') => { + let decimal = &bytes[6..]; + if decimal.iter().any(|x| !x.is_ascii_digit()) { + return None; + } + match decimal.len() { + 0 => return None, + 1 => parse_nanos::<1, b'0'>(decimal), + 2 => parse_nanos::<2, b'0'>(decimal), + 3 => parse_nanos::<3, b'0'>(decimal), + 4 => parse_nanos::<4, b'0'>(decimal), + 5 => parse_nanos::<5, b'0'>(decimal), + 6 => parse_nanos::<6, b'0'>(decimal), + 7 => parse_nanos::<7, b'0'>(decimal), + 8 => parse_nanos::<8, b'0'>(decimal), + _ => parse_nanos::<9, b'0'>(decimal), + } + } + Some(_) => return None, + None => 0, + } + } + Some(_) => return None, + None => 0, + }; + + digits.iter_mut().for_each(|x| *x = x.wrapping_sub(b'0')); + if digits.iter().any(|x| *x > 9) { + return None; + } + + let hour = match (digits[0] * 10 + digits[1], am) { + (12, Some(true)) => 0, // 12:00 AM -> 00:00 + (h @ 1..=11, Some(true)) => h, // 1:00 AM -> 01:00 + (12, Some(false)) => 12, // 12:00 PM -> 12:00 + (h @ 1..=11, Some(false)) => h + 12, // 1:00 PM -> 13:00 + (_, Some(_)) => return None, + (h, None) => h, + }; + + // Handle leap second + let (second, nanoseconds) = match digits[4] * 10 + digits[5] { + 60 => (59, nanoseconds + 1_000_000_000), + s => (s, nanoseconds), + }; + + NaiveTime::from_hms_nano_opt( + hour as _, + (digits[2] * 10 + digits[3]) as _, + second as _, + nanoseconds, + ) } /// Specialized parsing implementations @@ -900,6 +942,13 @@ mod tests { use arrow_array::timezone::Tz; use arrow_buffer::i256; + #[test] + fn test_parse_nanos() { + assert_eq!(parse_nanos::<3, 0>(&[1, 2, 3]), 123_000_000); + assert_eq!(parse_nanos::<5, 0>(&[1, 2, 3, 4, 5]), 123_450_000); + assert_eq!(parse_nanos::<6, b'0'>(b"123456"), 123_456_000); + } + #[test] fn string_to_timestamp_timezone() { // Explicit timezone @@ -1398,6 +1447,74 @@ mod tests { ); } + #[test] + fn test_string_to_time_invalid() { + let cases = [ + "25:00", + "9:00:", + "009:00", + "09:0:00", + "25:00:00", + "13:00 AM", + "13:00 PM", + "12:00. 
AM", + "09:0:00", + "09:01:0", + "09:01:1", + "9:1:0", + "09:01:0", + "1:00.123", + "1:00:00.123f", + " 9:00:00", + ":09:00", + "T9:00:00", + "AM", + ]; + for case in cases { + assert!(string_to_time(case).is_none(), "{case}"); + } + } + + #[test] + fn test_string_to_time_chrono() { + let cases = [ + ("1:00", "%H:%M"), + ("12:00", "%H:%M"), + ("13:00", "%H:%M"), + ("24:00", "%H:%M"), + ("1:00:00", "%H:%M:%S"), + ("12:00:30", "%H:%M:%S"), + ("13:00:59", "%H:%M:%S"), + ("24:00:60", "%H:%M:%S"), + ("09:00:00", "%H:%M:%S%.f"), + ("0:00:30.123456", "%H:%M:%S%.f"), + ("0:00 AM", "%I:%M %P"), + ("1:00 AM", "%I:%M %P"), + ("12:00 AM", "%I:%M %P"), + ("13:00 AM", "%I:%M %P"), + ("0:00 PM", "%I:%M %P"), + ("1:00 PM", "%I:%M %P"), + ("12:00 PM", "%I:%M %P"), + ("13:00 PM", "%I:%M %P"), + ("1:00 pM", "%I:%M %P"), + ("1:00 Pm", "%I:%M %P"), + ("1:00 aM", "%I:%M %P"), + ("1:00 Am", "%I:%M %P"), + ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456789 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456789123 PM", "%I:%M:%S%.f %P"), + ("1:00:30.1234 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456789123456789 PM", "%I:%M:%S%.f %P"), + ("1:00:30.12F456 PM", "%I:%M:%S%.f %P"), + ]; + for (s, format) in cases { + let chrono = NaiveTime::parse_from_str(s, format).ok(); + let custom = string_to_time(s); + assert_eq!(chrono, custom, "{s}"); + } + } + #[test] fn test_parse_interval() { assert_eq!( From eacb135f5e4b67ca9ddab266c028b105ced1a180 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 17 Mar 2023 15:22:12 +0000 Subject: [PATCH 0692/1411] Add timezone support to JSON reader (#3845) * Add timezone support to JSON reader * Fix doc --- arrow-array/src/array/primitive_array.rs | 2 +- arrow-array/src/types.rs | 26 +++---- arrow-cast/src/cast.rs | 2 +- arrow-json/src/raw/mod.rs | 73 +++++++++++++---- arrow-json/src/raw/timestamp_array.rs | 99 ++++++++++++++++++++++++ 5 files changed, 169 insertions(+), 33 deletions(-) create mode 100644 arrow-json/src/raw/timestamp_array.rs diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 408f0c4ae96a..d792f6819ae5 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1049,7 +1049,7 @@ impl PrimitiveArray { self.data .clone() .into_builder() - .data_type(DataType::Timestamp(T::get_time_unit(), timezone)) + .data_type(DataType::Timestamp(T::UNIT, timezone)) .build_unchecked() }; PrimitiveArray::from(array_data) diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index f9ca050dc0e7..48eee4f5c3dc 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -287,30 +287,28 @@ impl ArrowTemporalType for DurationMicrosecondType {} impl ArrowTemporalType for DurationNanosecondType {} /// A timestamp type allows us to create array builders that take a timestamp. -pub trait ArrowTimestampType: ArrowTemporalType { +pub trait ArrowTimestampType: ArrowTemporalType { + /// The [`TimeUnit`] of this timestamp. + const UNIT: TimeUnit; + /// Returns the `TimeUnit` of this timestamp. 
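// A minimal downstream sketch of the trait change introduced above (assumed usage,
// not part of the patch): generic code can read the time unit from the associated
// constant `T::UNIT` instead of calling the now-deprecated `get_time_unit()`.
// The helper name `nanos_per_tick` is hypothetical; the same `match T::UNIT`
// pattern appears in the cast_string_to_timestamp change further down in this commit.
use arrow_array::types::ArrowTimestampType;
use arrow_schema::TimeUnit;

fn nanos_per_tick<T: ArrowTimestampType>() -> i64 {
    match T::UNIT {
        TimeUnit::Second => 1_000_000_000,
        TimeUnit::Millisecond => 1_000_000,
        TimeUnit::Microsecond => 1_000,
        TimeUnit::Nanosecond => 1,
    }
}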
- fn get_time_unit() -> TimeUnit; + #[deprecated(note = "Use Self::UNIT")] + fn get_time_unit() -> TimeUnit { + Self::UNIT + } } impl ArrowTimestampType for TimestampSecondType { - fn get_time_unit() -> TimeUnit { - TimeUnit::Second - } + const UNIT: TimeUnit = TimeUnit::Second; } impl ArrowTimestampType for TimestampMillisecondType { - fn get_time_unit() -> TimeUnit { - TimeUnit::Millisecond - } + const UNIT: TimeUnit = TimeUnit::Millisecond; } impl ArrowTimestampType for TimestampMicrosecondType { - fn get_time_unit() -> TimeUnit { - TimeUnit::Microsecond - } + const UNIT: TimeUnit = TimeUnit::Microsecond; } impl ArrowTimestampType for TimestampNanosecondType { - fn get_time_unit() -> TimeUnit { - TimeUnit::Nanosecond - } + const UNIT: TimeUnit = TimeUnit::Nanosecond; } impl IntervalYearMonthType { diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 0a4b88ec89f6..1bd5027406b9 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -2630,7 +2630,7 @@ fn cast_string_to_timestamp< .downcast_ref::>() .unwrap(); - let scale_factor = match TimestampType::get_time_unit() { + let scale_factor = match TimestampType::UNIT { TimeUnit::Second => 1_000_000_000, TimeUnit::Millisecond => 1_000_000, TimeUnit::Microsecond => 1_000, diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index 1ab879d203fb..57bec9ee49c0 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -27,10 +27,13 @@ use crate::raw::primitive_array::PrimitiveArrayDecoder; use crate::raw::string_array::StringArrayDecoder; use crate::raw::struct_array::StructArrayDecoder; use crate::raw::tape::{Tape, TapeDecoder, TapeElement}; +use crate::raw::timestamp_array::TimestampArrayDecoder; +use arrow_array::timezone::Tz; use arrow_array::types::*; use arrow_array::{downcast_integer, make_array, RecordBatch, RecordBatchReader}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, SchemaRef, TimeUnit}; +use chrono::Utc; use std::io::BufRead; mod boolean_array; @@ -41,6 +44,7 @@ mod primitive_array; mod string_array; mod struct_array; mod tape; +mod timestamp_array; /// A builder for [`RawReader`] and [`RawDecoder`] pub struct RawReaderBuilder { @@ -293,10 +297,34 @@ fn make_decoder( data_type => (primitive_decoder, data_type), DataType::Float32 => primitive_decoder!(Float32Type, data_type), DataType::Float64 => primitive_decoder!(Float64Type, data_type), - DataType::Timestamp(TimeUnit::Second, None) => primitive_decoder!(TimestampSecondType, data_type), - DataType::Timestamp(TimeUnit::Millisecond, None) => primitive_decoder!(TimestampMillisecondType, data_type), - DataType::Timestamp(TimeUnit::Microsecond, None) => primitive_decoder!(TimestampMicrosecondType, data_type), - DataType::Timestamp(TimeUnit::Nanosecond, None) => primitive_decoder!(TimestampNanosecondType, data_type), + DataType::Timestamp(TimeUnit::Second, None) => { + Ok(Box::new(TimestampArrayDecoder::::new(data_type, Utc))) + }, + DataType::Timestamp(TimeUnit::Millisecond, None) => { + Ok(Box::new(TimestampArrayDecoder::::new(data_type, Utc))) + }, + DataType::Timestamp(TimeUnit::Microsecond, None) => { + Ok(Box::new(TimestampArrayDecoder::::new(data_type, Utc))) + }, + DataType::Timestamp(TimeUnit::Nanosecond, None) => { + Ok(Box::new(TimestampArrayDecoder::::new(data_type, Utc))) + }, + DataType::Timestamp(TimeUnit::Second, Some(ref tz)) => { + let tz: Tz = tz.parse()?; + Ok(Box::new(TimestampArrayDecoder::::new(data_type, tz))) + }, + DataType::Timestamp(TimeUnit::Millisecond, Some(ref tz)) => { + let tz: 
Tz = tz.parse()?; + Ok(Box::new(TimestampArrayDecoder::::new(data_type, tz))) + }, + DataType::Timestamp(TimeUnit::Microsecond, Some(ref tz)) => { + let tz: Tz = tz.parse()?; + Ok(Box::new(TimestampArrayDecoder::::new(data_type, tz))) + }, + DataType::Timestamp(TimeUnit::Nanosecond, Some(ref tz)) => { + let tz: Tz = tz.parse()?; + Ok(Box::new(TimestampArrayDecoder::::new(data_type, tz))) + }, DataType::Date32 => primitive_decoder!(Date32Type, data_type), DataType::Date64 => primitive_decoder!(Date64Type, data_type), DataType::Time32(TimeUnit::Second) => primitive_decoder!(Time32SecondType, data_type), @@ -809,29 +837,27 @@ mod tests { fn test_timestamp() { let buf = r#" - {"a": 1, "b": "2020-09-08T13:42:29.190855+00:00", "c": 38.30} - {"a": 2, "b": "2020-09-08T13:42:29.190855Z", "c": 123.456} + {"a": 1, "b": "2020-09-08T13:42:29.190855+00:00", "c": 38.30, "d": "1997-01-31T09:26:56.123"} + {"a": 2, "b": "2020-09-08T13:42:29.190855Z", "c": 123.456, "d": 123.456} - {"b": 1337, "b": "2020-09-08T13:42:29Z", "c": "1997-01-31T09:26:56.123"} - {"b": 40, "c": "2020-09-08T13:42:29.190855+00:00"} - {"b": 1234, "a": null, "c": "1997-01-31 09:26:56.123Z"} - {"c": "1997-01-31T14:26:56.123-05:00"} + {"b": 1337, "b": "2020-09-08T13:42:29Z", "c": "1997-01-31T09:26:56.123", "d": "1997-01-31T09:26:56.123Z"} + {"b": 40, "c": "2020-09-08T13:42:29.190855+00:00", "d": "1997-01-31 09:26:56.123-05:00"} + {"b": 1234, "a": null, "c": "1997-01-31 09:26:56.123Z", "d": "1997-01-31 092656"} + {"c": "1997-01-31T14:26:56.123-05:00", "d": "1997-01-31"} "#; + let with_timezone = DataType::Timestamp(T::UNIT, Some("+08:00".to_string())); let schema = Arc::new(Schema::new(vec![ Field::new("a", T::DATA_TYPE, true), Field::new("b", T::DATA_TYPE, true), Field::new("c", T::DATA_TYPE, true), + Field::new("d", with_timezone, true), ])); let batches = do_read(buf, 1024, true, schema); assert_eq!(batches.len(), 1); - let unit = match T::DATA_TYPE { - DataType::Timestamp(unit, _) => unit, - _ => unreachable!(), - }; - let unit_in_nanos = match unit { + let unit_in_nanos: i64 = match T::UNIT { TimeUnit::Second => 1_000_000_000, TimeUnit::Millisecond => 1_000_000, TimeUnit::Microsecond => 1_000, @@ -859,7 +885,6 @@ mod tests { 1234, 0 ] - .map(T::Native::usize_as) ); let col3 = as_primitive_array::(batches[0].column(2)); @@ -874,7 +899,21 @@ mod tests { 854702816123000000 / unit_in_nanos, 854738816123000000 / unit_in_nanos ] - .map(T::Native::usize_as) + ); + + let col4 = as_primitive_array::(batches[0].column(3)); + + assert_eq!(col4.null_count(), 0); + assert_eq!( + col4.values(), + &[ + 854674016123000000 / unit_in_nanos, + 123, + 854702816123000000 / unit_in_nanos, + 854720816123000000 / unit_in_nanos, + 854674016000000000 / unit_in_nanos, + 854640000000000000 / unit_in_nanos + ] ); } diff --git a/arrow-json/src/raw/timestamp_array.rs b/arrow-json/src/raw/timestamp_array.rs new file mode 100644 index 000000000000..07feaa974ee4 --- /dev/null +++ b/arrow-json/src/raw/timestamp_array.rs @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use chrono::TimeZone; +use num::NumCast; +use std::marker::PhantomData; + +use arrow_array::builder::PrimitiveBuilder; +use arrow_array::types::ArrowTimestampType; +use arrow_array::Array; +use arrow_cast::parse::string_to_datetime; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, TimeUnit}; + +use crate::raw::tape::{Tape, TapeElement}; +use crate::raw::{tape_error, ArrayDecoder}; + +/// A specialized [`ArrayDecoder`] for timestamps +pub struct TimestampArrayDecoder { + data_type: DataType, + timezone: Tz, + // Invariant and Send + phantom: PhantomData P>, +} + +impl TimestampArrayDecoder { + pub fn new(data_type: DataType, timezone: Tz) -> Self { + Self { + data_type, + timezone, + phantom: Default::default(), + } + } +} + +impl ArrayDecoder for TimestampArrayDecoder +where + P: ArrowTimestampType, + Tz: TimeZone + Send, +{ + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { + let mut builder = PrimitiveBuilder::
<P>
::with_capacity(pos.len()) + .with_data_type(self.data_type.clone()); + + for p in pos { + match tape.get(*p) { + TapeElement::Null => builder.append_null(), + TapeElement::String(idx) => { + let s = tape.get_string(idx); + let date = string_to_datetime(&self.timezone, s).map_err(|e| { + ArrowError::JsonError(format!( + "failed to parse \"{s}\" as {}: {}", + self.data_type, e + )) + })?; + + let value = match P::UNIT { + TimeUnit::Second => date.timestamp(), + TimeUnit::Millisecond => date.timestamp_millis(), + TimeUnit::Microsecond => date.timestamp_micros(), + TimeUnit::Nanosecond => date.timestamp_nanos(), + }; + builder.append_value(value) + } + TapeElement::Number(idx) => { + let s = tape.get_string(idx); + let value = lexical_core::parse::(s.as_bytes()) + .ok() + .and_then(NumCast::from) + .ok_or_else(|| { + ArrowError::JsonError(format!( + "failed to parse {s} as {}", + self.data_type + )) + })?; + + builder.append_value(value) + } + d => return Err(tape_error(d, "primitive")), + } + } + + Ok(builder.finish().into_data()) + } +} From b9090ae1e04f5462c4c1c6b9c2b4443e4dc9f31d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 17 Mar 2023 16:24:07 +0000 Subject: [PATCH 0693/1411] Add BitIterator (#3856) * Add BitIterator * Review feedback --- arrow-data/src/bit_iterator.rs | 102 ++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 2 deletions(-) diff --git a/arrow-data/src/bit_iterator.rs b/arrow-data/src/bit_iterator.rs index ea95f1f38b01..39898b1c4295 100644 --- a/arrow-data/src/bit_iterator.rs +++ b/arrow-data/src/bit_iterator.rs @@ -16,8 +16,72 @@ // under the License. use arrow_buffer::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator}; +use arrow_buffer::bit_util::{ceil, get_bit_raw}; use std::result::Result; +/// Iterator over the bits within a packed bitmask +/// +/// To efficiently iterate over just the set bits see [`BitIndexIterator`] and [`BitSliceIterator`] +pub struct BitIterator<'a> { + buffer: &'a [u8], + current_offset: usize, + end_offset: usize, +} + +impl<'a> BitIterator<'a> { + /// Create a new [`BitIterator`] from the provided `buffer`, + /// and `offset` and `len` in bits + /// + /// # Panic + /// + /// Panics if `buffer` is too short for the provided offset and length + pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self { + let end_offset = offset.checked_add(len).unwrap(); + let required_len = ceil(end_offset, 8); + assert!( + buffer.len() >= required_len, + "BitIterator buffer too small, expected {required_len} got {}", + buffer.len() + ); + + Self { + buffer, + current_offset: offset, + end_offset, + } + } +} + +impl<'a> Iterator for BitIterator<'a> { + type Item = bool; + + fn next(&mut self) -> Option { + if self.current_offset == self.end_offset { + return None; + } + // Safety: + // offsets in bounds + let v = unsafe { get_bit_raw(self.buffer.as_ptr(), self.current_offset) }; + self.current_offset += 1; + Some(v) + } +} + +impl<'a> ExactSizeIterator for BitIterator<'a> {} + +impl<'a> DoubleEndedIterator for BitIterator<'a> { + fn next_back(&mut self) -> Option { + if self.current_offset == self.end_offset { + return None; + } + self.end_offset -= 1; + // Safety: + // offsets in bounds + let v = unsafe { get_bit_raw(self.buffer.as_ptr(), self.end_offset) }; + Some(v) + } +} + /// Iterator of contiguous ranges of set bits within a provided packed bitmask /// /// Returns `(usize, usize)` each representing an interval where the corresponding @@ -32,7 +96,7 @@ 
pub struct BitSliceIterator<'a> { } impl<'a> BitSliceIterator<'a> { - /// Create a new [`BitSliceIterator`] from the provide `buffer`, + /// Create a new [`BitSliceIterator`] from the provided `buffer`, /// and `offset` and `len` in bits pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self { let chunk = UnalignedBitChunk::new(buffer, offset, len); @@ -192,4 +256,38 @@ pub fn try_for_each_valid_idx Result<(), E>>( } } -// Note: tests located in filter module +// Note: further tests located in arrow_select::filter module + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bit_iterator() { + let mask = &[0b00010010, 0b00100011, 0b00000101, 0b00010001, 0b10010011]; + let actual: Vec<_> = BitIterator::new(mask, 0, 5).collect(); + assert_eq!(actual, &[false, true, false, false, true]); + + let actual: Vec<_> = BitIterator::new(mask, 4, 5).collect(); + assert_eq!(actual, &[true, false, false, false, true]); + + let actual: Vec<_> = BitIterator::new(mask, 12, 14).collect(); + assert_eq!( + actual, + &[ + false, true, false, false, true, false, true, false, false, false, false, + false, true, false + ] + ); + + assert_eq!(BitIterator::new(mask, 0, 0).count(), 0); + assert_eq!(BitIterator::new(mask, 40, 0).count(), 0); + } + + #[test] + #[should_panic(expected = "BitIterator buffer too small, expected 3 got 2")] + fn test_bit_iterator_bounds() { + let mask = &[223, 23]; + BitIterator::new(mask, 17, 0); + } +} From 7f5f6b8d707011aab667fae43a8fcde4cf403145 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 17 Mar 2023 16:59:04 +0000 Subject: [PATCH 0694/1411] Flatten arrow_buffer (#3883) --- arrow-array/src/array/byte_array.rs | 2 +- arrow-array/src/array/list_array.rs | 2 +- arrow-array/src/array/map_array.rs | 2 +- arrow-array/src/array/mod.rs | 3 +-- arrow-array/src/array/primitive_array.rs | 3 +-- arrow-array/src/array/run_array.rs | 3 +-- arrow-array/src/array/struct_array.rs | 3 +-- arrow-array/src/lib.rs | 2 +- arrow-buffer/src/lib.rs | 2 +- 9 files changed, 9 insertions(+), 13 deletions(-) diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 442e795cec52..078a081957bb 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -21,7 +21,7 @@ use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; use crate::types::ByteArrayType; use crate::{Array, ArrayAccessor, OffsetSizeTrait}; -use arrow_buffer::buffer::OffsetBuffer; +use arrow_buffer::OffsetBuffer; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_data::ArrayData; use arrow_schema::DataType; diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 178139f810e7..830e307f05fa 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -20,8 +20,8 @@ use crate::builder::{GenericListBuilder, PrimitiveBuilder}; use crate::{ iterator::GenericListArrayIter, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, }; -use arrow_buffer::buffer::OffsetBuffer; use arrow_buffer::ArrowNativeType; +use arrow_buffer::OffsetBuffer; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; use num::Integer; diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 8c9b02921781..923a64fc0a8e 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -17,7 +17,7 @@ use crate::array::{get_offsets, print_long_array}; use 
crate::{make_array, Array, ArrayRef, StringArray, StructArray}; -use arrow_buffer::buffer::OffsetBuffer; +use arrow_buffer::OffsetBuffer; use arrow_buffer::{ArrowNativeType, Buffer, ToByteSlice}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index dfdaac85bf85..ba63a19aafd6 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -20,8 +20,7 @@ mod binary_array; use crate::types::*; -use arrow_buffer::buffer::{OffsetBuffer, ScalarBuffer}; -use arrow_buffer::ArrowNativeType; +use arrow_buffer::{ArrowNativeType, OffsetBuffer, ScalarBuffer}; use arrow_data::ArrayData; use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use std::any::Any; diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index d792f6819ae5..0e78083c4795 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -25,8 +25,7 @@ use crate::timezone::Tz; use crate::trusted_len::trusted_len_unzip; use crate::{types::*, ArrowNativeTypeOp}; use crate::{Array, ArrayAccessor}; -use arrow_buffer::buffer::ScalarBuffer; -use arrow_buffer::{i256, ArrowNativeType, Buffer}; +use arrow_buffer::{i256, ArrowNativeType, Buffer, ScalarBuffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index e50903f30f9b..5b7f7fb9e407 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -17,8 +17,7 @@ use std::any::Any; -use arrow_buffer::buffer::RunEndBuffer; -use arrow_buffer::ArrowNativeType; +use arrow_buffer::{ArrowNativeType, RunEndBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 35d4444e0117..3c4ddef407c7 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -16,8 +16,7 @@ // under the License. use crate::{make_array, Array, ArrayRef}; -use arrow_buffer::buffer::buffer_bin_or; -use arrow_buffer::Buffer; +use arrow_buffer::{buffer_bin_or, Buffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; use std::{any::Any, ops::Index}; diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index bfdc35c6ce5d..ada59564bf0e 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -152,7 +152,7 @@ //! //! [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html //! [`&dyn Array`]: Array -//! [`NullBuffer`]: arrow_buffer::buffer::NullBuffer +//! [`NullBuffer`]: arrow_buffer::NullBuffer //! [`Buffer`]: arrow_buffer::Buffer //! [`compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html //! 
[`json`]: https://docs.rs/arrow/latest/arrow/json/index.html diff --git a/arrow-buffer/src/lib.rs b/arrow-buffer/src/lib.rs index 13d44e4d57ff..364e92db229c 100644 --- a/arrow-buffer/src/lib.rs +++ b/arrow-buffer/src/lib.rs @@ -19,7 +19,7 @@ pub mod alloc; pub mod buffer; -pub use buffer::{Buffer, MutableBuffer}; +pub use buffer::*; mod bigint; mod bytes; From b08490eb60e9bdde5193c142005190d3ea1abb63 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 17 Mar 2023 17:39:03 +0000 Subject: [PATCH 0695/1411] Seal ArrowPrimitiveType (#3882) * Seal ArrowPrimitiveType * Fix doc * Review feedback --- arrow-array/src/array/primitive_array.rs | 24 +---------- arrow-array/src/types.rs | 55 ++++++++++++++++-------- 2 files changed, 39 insertions(+), 40 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 0e78083c4795..9b3b11c8215e 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -23,7 +23,7 @@ use crate::temporal_conversions::{ }; use crate::timezone::Tz; use crate::trusted_len::trusted_len_unzip; -use crate::{types::*, ArrowNativeTypeOp}; +use crate::types::*; use crate::{Array, ArrayAccessor}; use arrow_buffer::{i256, ArrowNativeType, Buffer, ScalarBuffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; @@ -229,27 +229,7 @@ pub type Decimal128Array = PrimitiveArray; /// scale less or equal to 76. pub type Decimal256Array = PrimitiveArray; -/// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the -/// static-typed nature of rust types ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. -pub trait ArrowPrimitiveType: 'static { - /// Corresponding Rust native type for the primitive type. - type Native: ArrowNativeTypeOp; - - /// the corresponding Arrow data type of this primitive type. - const DATA_TYPE: DataType; - - /// Returns the byte width of this primitive type. - fn get_byte_width() -> usize { - std::mem::size_of::() - } - - /// Returns a default value of this primitive type. - /// - /// This is useful for aggregate array ops like `sum()`, `mean()`. - fn default_value() -> Self::Native { - Default::default() - } -} +pub use crate::types::ArrowPrimitiveType; /// Array whose elements are of primitive types. /// diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 48eee4f5c3dc..9f1965b77570 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -17,9 +17,8 @@ //! Zero-sized types used to parameterize generic array implementations -use crate::array::ArrowPrimitiveType; use crate::delta::shift_months; -use crate::OffsetSizeTrait; +use crate::{ArrowNativeTypeOp, OffsetSizeTrait}; use arrow_buffer::i256; use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision}; use arrow_schema::{ @@ -39,10 +38,38 @@ use std::ops::{Add, Sub}; pub struct BooleanType {} impl BooleanType { - /// Type represetings is arrow [`DataType`] + /// The corresponding Arrow data type pub const DATA_TYPE: DataType = DataType::Boolean; } +/// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the +/// static-typed nature of rust types ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. +/// +/// [`ArrowNativeType`]: arrow_buffer::ArrowNativeType +pub trait ArrowPrimitiveType: primitive::PrimitiveTypeSealed + 'static { + /// Corresponding Rust native type for the primitive type. 
+ type Native: ArrowNativeTypeOp; + + /// the corresponding Arrow data type of this primitive type. + const DATA_TYPE: DataType; + + /// Returns the byte width of this primitive type. + fn get_byte_width() -> usize { + std::mem::size_of::() + } + + /// Returns a default value of this primitive type. + /// + /// This is useful for aggregate array ops like `sum()`, `mean()`. + fn default_value() -> Self::Native { + Default::default() + } +} + +mod primitive { + pub trait PrimitiveTypeSealed {} +} + macro_rules! make_type { ($name:ident, $native_ty:ty, $data_ty:expr, $doc_string: literal) => { #[derive(Debug)] @@ -53,6 +80,8 @@ macro_rules! make_type { type Native = $native_ty; const DATA_TYPE: DataType = $data_ty; } + + impl primitive::PrimitiveTypeSealed for $name {} }; } @@ -240,24 +269,10 @@ impl ArrowDictionaryKeyType for UInt32Type {} impl ArrowDictionaryKeyType for UInt64Type {} -mod run { - use super::*; - - pub trait RunEndTypeSealed {} - - impl RunEndTypeSealed for Int16Type {} - - impl RunEndTypeSealed for Int32Type {} - - impl RunEndTypeSealed for Int64Type {} -} - /// A subtype of primitive type that is used as run-ends index /// in `RunArray`. /// See -/// -/// Note: The implementation of this trait is sealed to avoid accidental misuse. -pub trait RunEndIndexType: ArrowPrimitiveType + run::RunEndTypeSealed {} +pub trait RunEndIndexType: ArrowPrimitiveType {} impl RunEndIndexType for Int16Type {} @@ -646,6 +661,8 @@ impl ArrowPrimitiveType for Decimal128Type { const DATA_TYPE: DataType = ::DEFAULT_TYPE; } +impl primitive::PrimitiveTypeSealed for Decimal128Type {} + /// The decimal type for a Decimal256Array #[derive(Debug)] pub struct Decimal256Type {} @@ -674,6 +691,8 @@ impl ArrowPrimitiveType for Decimal256Type { const DATA_TYPE: DataType = ::DEFAULT_TYPE; } +impl primitive::PrimitiveTypeSealed for Decimal256Type {} + fn format_decimal_str(value_str: &str, precision: usize, scale: i8) -> String { let (sign, rest) = match value_str.strip_prefix('-') { Some(stripped) => ("-", stripped), From 4e4a7f7dbfa348d97a806240954eec8402d3b259 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 17 Mar 2023 17:57:55 +0000 Subject: [PATCH 0696/1411] Add Array::to_data and Array::nulls (#3880) (#3881) * Add Array::to_data and Array::nulls (#3880) * Review feedback * Format --- arrow-arith/src/aggregate.rs | 8 ++-- arrow-arith/src/arity.rs | 3 +- arrow-arith/src/boolean.rs | 24 +++++----- arrow-array/src/array/binary_array.rs | 2 +- arrow-array/src/array/boolean_array.rs | 17 ++++++- arrow-array/src/array/byte_array.rs | 18 +++++++- arrow-array/src/array/dictionary_array.rs | 27 ++++++++++++ .../src/array/fixed_size_binary_array.rs | 17 ++++++- .../src/array/fixed_size_list_array.rs | 15 +++++++ arrow-array/src/array/list_array.rs | 17 ++++++- arrow-array/src/array/map_array.rs | 16 ++++++- arrow-array/src/array/mod.rs | 44 +++++++++++++++---- arrow-array/src/array/null_array.rs | 17 ++++++- arrow-array/src/array/primitive_array.rs | 18 +++++++- arrow-array/src/array/run_array.rs | 28 +++++++++++- arrow-array/src/array/struct_array.rs | 17 +++++-- arrow-array/src/array/union_array.rs | 14 ++++++ arrow-array/src/builder/boolean_builder.rs | 4 +- .../src/builder/fixed_size_list_builder.rs | 8 ++-- arrow-array/src/cast.rs | 2 +- arrow-array/src/record_batch.rs | 4 +- arrow-integration-test/src/lib.rs | 4 +- arrow-json/src/reader.rs | 23 ++++------ arrow-ord/src/comparison.rs | 7 ++- arrow-row/src/lib.rs | 4 +- arrow-select/src/concat.rs | 2 
+- arrow-select/src/filter.rs | 2 +- arrow-string/src/length.rs | 8 ++-- arrow-string/src/regexp.rs | 2 +- arrow-string/src/substring.rs | 8 ++-- arrow/benches/array_data_validate.rs | 2 +- arrow/examples/dynamic_types.rs | 2 +- arrow/src/ffi.rs | 6 +-- arrow/src/util/bench_util.rs | 2 +- arrow/src/util/data_gen.rs | 4 +- 35 files changed, 301 insertions(+), 95 deletions(-) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 7777bb0ede43..8e760da21909 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -117,7 +117,7 @@ where .map(|i| unsafe { array.value_unchecked(i) }) .reduce(|acc, item| if cmp(&acc, &item) { item } else { acc }) } else { - let nulls = array.data().nulls().unwrap(); + let nulls = array.nulls().unwrap(); let iter = BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len()); unsafe { let idx = iter.reduce(|acc_idx, idx| { @@ -288,7 +288,7 @@ where let data: &[T::Native] = array.values(); - match array.data().nulls() { + match array.nulls() { None => { let sum = data.iter().fold(T::default_value(), |accumulator, value| { accumulator.add_wrapping(*value) @@ -347,7 +347,7 @@ where let data: &[T::Native] = array.values(); - match array.data().nulls() { + match array.nulls() { None => { let sum = data .iter() @@ -665,7 +665,7 @@ mod simd { let mut chunk_acc = A::init_accumulator_chunk(); let mut rem_acc = A::init_accumulator_scalar(); - match array.data().nulls() { + match array.nulls() { None => { let data_chunks = data.chunks_exact(64); let remainder = data_chunks.remainder(); diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 74edd654bbcd..162b56ef1fe3 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -414,8 +414,7 @@ where let array_builder = builder .finish() - .data() - .clone() + .into_data() .into_builder() .null_bit_buffer(null_buffer) .null_count(null_count); diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index 61942dc90b81..3e21c2f1b484 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -610,7 +610,7 @@ mod tests { let a = BooleanArray::from(vec![false, false, false, true, true, true]); // ensure null bitmap of a is absent - assert!(a.data().nulls().is_none()); + assert!(a.nulls().is_none()); let b = BooleanArray::from(vec![ Some(true), @@ -622,7 +622,7 @@ mod tests { ]); // ensure null bitmap of b is present - assert!(b.data().nulls().is_some()); + assert!(b.nulls().is_some()); let c = or_kleene(&a, &b).unwrap(); @@ -650,12 +650,12 @@ mod tests { ]); // ensure null bitmap of b is absent - assert!(a.data().nulls().is_some()); + assert!(a.nulls().is_some()); let b = BooleanArray::from(vec![false, false, false, true, true, true]); // ensure null bitmap of a is present - assert!(b.data().nulls().is_none()); + assert!(b.nulls().is_none()); let c = or_kleene(&a, &b).unwrap(); @@ -852,7 +852,7 @@ mod tests { let expected = BooleanArray::from(vec![false, false, false, false]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -865,7 +865,7 @@ mod tests { let expected = BooleanArray::from(vec![false, false, false, false]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -877,7 +877,7 @@ mod tests { let expected = BooleanArray::from(vec![true, true, true, true]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -890,7 +890,7 @@ 
mod tests { let expected = BooleanArray::from(vec![true, true, true, true]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -902,7 +902,7 @@ mod tests { let expected = BooleanArray::from(vec![false, true, false, true]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -933,7 +933,7 @@ mod tests { let expected = BooleanArray::from(vec![false, true, false, true]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -945,7 +945,7 @@ mod tests { let expected = BooleanArray::from(vec![true, false, true, false]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } #[test] @@ -976,6 +976,6 @@ mod tests { let expected = BooleanArray::from(vec![true, false, true, false]); assert_eq!(expected, res); - assert!(res.data().nulls().is_none()); + assert!(res.nulls().is_none()); } } diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 1a3270a70d80..b965279fb796 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -77,7 +77,7 @@ impl GenericBinaryArray { .offset(v.offset()) .add_buffer(v.data_ref().buffers()[0].clone()) .add_buffer(child_data.buffers()[0].slice(child_data.offset())) - .nulls(v.data().nulls().cloned()); + .nulls(v.nulls().cloned()); let data = unsafe { builder.build_unchecked() }; Self::from(data) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index e924824e75ea..89fdca507b00 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -18,12 +18,14 @@ use crate::array::print_long_array; use crate::builder::BooleanBuilder; use crate::iterator::BooleanIter; -use crate::{Array, ArrayAccessor}; +use crate::{Array, ArrayAccessor, ArrayRef}; +use arrow_buffer::buffer::NullBuffer; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; +use std::sync::Arc; /// Array of bools /// @@ -265,9 +267,22 @@ impl Array for BooleanArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl<'a> ArrayAccessor for &'a BooleanArray { diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 078a081957bb..991e02501505 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -20,12 +20,13 @@ use crate::builder::GenericByteBuilder; use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; use crate::types::ByteArrayType; -use crate::{Array, ArrayAccessor, OffsetSizeTrait}; -use arrow_buffer::OffsetBuffer; +use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait}; use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_buffer::{NullBuffer, OffsetBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; +use std::sync::Arc; /// Generic struct for variable-size byte arrays /// @@ -237,9 +238,22 @@ impl Array for GenericByteArray { &self.data } + fn 
to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray { diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index f9a40c6f3400..ee58a485c71c 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -23,10 +23,12 @@ use crate::{ make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, PrimitiveArray, StringArray, }; +use arrow_buffer::buffer::NullBuffer; use arrow_buffer::ArrowNativeType; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use std::any::Any; +use std::sync::Arc; /// /// A dictionary array where each element is a single value indexed by an integer key. @@ -590,9 +592,22 @@ impl Array for DictionaryArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl std::fmt::Debug for DictionaryArray { @@ -669,9 +684,21 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a, &self.dictionary.data } + fn to_data(&self) -> ArrayData { + self.dictionary.to_data() + } + fn into_data(self) -> ArrayData { self.dictionary.into_data() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + self.dictionary.slice(offset, length) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.dictionary.nulls() + } } impl<'a, K, V> IntoIterator for TypedDictionaryArray<'a, K, V> diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 87f1b955723d..062961a20abb 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -17,11 +17,13 @@ use crate::array::print_long_array; use crate::iterator::FixedSizeBinaryIter; -use crate::{Array, ArrayAccessor, FixedSizeListArray}; +use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray}; +use arrow_buffer::buffer::NullBuffer; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use std::any::Any; +use std::sync::Arc; /// An array where each element is a fixed-size sequence of bytes. 
/// @@ -462,9 +464,22 @@ impl Array for FixedSizeBinaryArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl<'a> ArrayAccessor for &'a FixedSizeBinaryArray { diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 6e228ba3c770..7d65927cdeec 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -18,9 +18,11 @@ use crate::array::print_long_array; use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder}; use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType}; +use arrow_buffer::buffer::NullBuffer; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; +use std::sync::Arc; /// A list array where each element is a fixed-size sequence of values with the same /// type whose maximum length is represented by a i32. @@ -205,9 +207,22 @@ impl Array for FixedSizeListArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl ArrayAccessor for FixedSizeListArray { diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 830e307f05fa..dca256008db2 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -20,12 +20,12 @@ use crate::builder::{GenericListBuilder, PrimitiveBuilder}; use crate::{ iterator::GenericListArrayIter, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, }; -use arrow_buffer::ArrowNativeType; -use arrow_buffer::OffsetBuffer; +use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; use num::Integer; use std::any::Any; +use std::sync::Arc; /// trait declaring an offset size, relevant for i32 vs i64 array types. 
pub trait OffsetSizeTrait: ArrowNativeType + std::ops::AddAssign + Integer { @@ -244,9 +244,22 @@ impl Array for GenericListArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListArray { diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 923a64fc0a8e..6cd627cbd838 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -17,8 +17,7 @@ use crate::array::{get_offsets, print_long_array}; use crate::{make_array, Array, ArrayRef, StringArray, StructArray}; -use arrow_buffer::OffsetBuffer; -use arrow_buffer::{ArrowNativeType, Buffer, ToByteSlice}; +use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ToByteSlice}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; @@ -214,10 +213,23 @@ impl Array for MapArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } + /// Returns the total number of bytes of memory occupied by the buffers owned by this [MapArray]. fn get_buffer_memory_size(&self) -> usize { self.data.get_buffer_memory_size() diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index ba63a19aafd6..1ddcc2881863 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -20,7 +20,7 @@ mod binary_array; use crate::types::*; -use arrow_buffer::{ArrowNativeType, OffsetBuffer, ScalarBuffer}; +use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_data::ArrayData; use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use std::any::Any; @@ -94,13 +94,22 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// ``` fn as_any(&self) -> &dyn Any; - /// Returns a reference to the underlying data of this array. + /// Returns a reference to the underlying data of this array + /// + /// This will be deprecated in a future release [(#3880)](https://github.com/apache/arrow-rs/issues/3880) fn data(&self) -> &ArrayData; - /// Returns the underlying data of this array. + /// Returns the underlying data of this array + fn to_data(&self) -> ArrayData; + + /// Returns the underlying data of this array + /// + /// Unlike [`Array::to_data`] this consumes self, allowing it avoid unnecessary clones fn into_data(self) -> ArrayData; /// Returns a reference-counted pointer to the underlying data of this array. 
+ /// + /// This will be deprecated in a future release [(#3880)](https://github.com/apache/arrow-rs/issues/3880) fn data_ref(&self) -> &ArrayData { self.data() } @@ -134,9 +143,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(array_slice.as_ref(), &Int32Array::from(vec![2, 3, 4])); /// ``` - fn slice(&self, offset: usize, length: usize) -> ArrayRef { - make_array(self.data_ref().slice(offset, length)) - } + fn slice(&self, offset: usize, length: usize) -> ArrayRef; /// Returns the length (i.e., number of elements) of this array. /// @@ -188,6 +195,9 @@ pub trait Array: std::fmt::Debug + Send + Sync { self.data_ref().offset() } + /// Returns the null buffers of this array if any + fn nulls(&self) -> Option<&NullBuffer>; + /// Returns whether the element at `index` is null. /// When using this function on a slice, the index is relative to the slice. /// @@ -202,7 +212,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// assert_eq!(array.is_null(1), true); /// ``` fn is_null(&self, index: usize) -> bool { - self.data_ref().is_null(index) + self.nulls().map(|n| n.is_null(index)).unwrap_or_default() } /// Returns whether the element at `index` is not null. @@ -219,7 +229,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// assert_eq!(array.is_valid(1), false); /// ``` fn is_valid(&self, index: usize) -> bool { - self.data_ref().is_valid(index) + !self.is_null(index) } /// Returns the total number of null values in this array. @@ -235,7 +245,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// assert_eq!(array.null_count(), 2); /// ``` fn null_count(&self) -> usize { - self.data_ref().null_count() + self.nulls().map(|n| n.null_count()).unwrap_or_default() } /// Returns the total number of bytes of memory pointed to by this array. @@ -268,6 +278,10 @@ impl Array for ArrayRef { self.as_ref().data() } + fn to_data(&self) -> ArrayData { + self.as_ref().to_data() + } + fn into_data(self) -> ArrayData { self.data().clone() } @@ -296,6 +310,10 @@ impl Array for ArrayRef { self.as_ref().offset() } + fn nulls(&self) -> Option<&NullBuffer> { + self.as_ref().nulls() + } + fn is_null(&self, index: usize) -> bool { self.as_ref().is_null(index) } @@ -326,6 +344,10 @@ impl<'a, T: Array> Array for &'a T { T::data(self) } + fn to_data(&self) -> ArrayData { + T::to_data(self) + } + fn into_data(self) -> ArrayData { self.data().clone() } @@ -354,6 +376,10 @@ impl<'a, T: Array> Array for &'a T { T::offset(self) } + fn nulls(&self) -> Option<&NullBuffer> { + T::nulls(self) + } + fn is_null(&self, index: usize) -> bool { T::is_null(self, index) } diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index 8eb8e64b0eda..fba6e41e871d 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -17,10 +17,12 @@ //! Contains the `NullArray` type. -use crate::Array; +use crate::{Array, ArrayRef}; +use arrow_buffer::buffer::NullBuffer; use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; +use std::sync::Arc; /// An Array where all elements are nulls /// @@ -63,10 +65,23 @@ impl Array for NullArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + None + } + /// Returns whether the element at `index` is null. 
/// All elements of a `NullArray` are always null. fn is_null(&self, _index: usize) -> bool { diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 9b3b11c8215e..4a484de78828 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -24,14 +24,15 @@ use crate::temporal_conversions::{ use crate::timezone::Tz; use crate::trusted_len::trusted_len_unzip; use crate::types::*; -use crate::{Array, ArrayAccessor}; -use arrow_buffer::{i256, ArrowNativeType, Buffer, ScalarBuffer}; +use crate::{Array, ArrayAccessor, ArrayRef}; +use arrow_buffer::{i256, ArrowNativeType, Buffer, NullBuffer, ScalarBuffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime}; use half::f16; use std::any::Any; +use std::sync::Arc; /// /// # Example: Using `collect` @@ -676,9 +677,22 @@ impl Array for PrimitiveArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() + } } impl<'a, T: ArrowPrimitiveType> ArrayAccessor for &'a PrimitiveArray { diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 5b7f7fb9e407..f62da38fb241 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -16,8 +16,9 @@ // under the License. use std::any::Any; +use std::sync::Arc; -use arrow_buffer::{ArrowNativeType, RunEndBuffer}; +use arrow_buffer::{ArrowNativeType, NullBuffer, RunEndBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; @@ -287,9 +288,22 @@ impl Array for RunArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + None + } } impl std::fmt::Debug for RunArray { @@ -472,9 +486,21 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { &self.run_array.data } + fn to_data(&self) -> ArrayData { + self.run_array.to_data() + } + fn into_data(self) -> ArrayData { self.run_array.into_data() } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + self.run_array.slice(offset, length) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.run_array.nulls() + } } // Array accessor converts the index of logical array to the index of the physical array diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 3c4ddef407c7..4fe59c0c240f 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -16,9 +16,10 @@ // under the License. 
use crate::{make_array, Array, ArrayRef}; -use arrow_buffer::{buffer_bin_or, Buffer}; +use arrow_buffer::{buffer_bin_or, Buffer, NullBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; +use std::sync::Arc; use std::{any::Any, ops::Index}; /// A nested array type where each child (called *field*) is represented by a separate @@ -195,13 +196,21 @@ impl Array for StructArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } - /// Returns the length (i.e., number of elements) of this array - fn len(&self) -> usize { - self.data_ref().len() + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + // TODO: Slice buffers directly (#3880) + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.data.nulls() } } diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 867eb8d59fde..5a4d2af7ca45 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -16,12 +16,14 @@ // under the License. use crate::{make_array, Array, ArrayRef}; +use arrow_buffer::buffer::NullBuffer; use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field, UnionMode}; /// Contains the `UnionArray` type. /// use std::any::Any; +use std::sync::Arc; /// An Array that can represent slots of varying types. /// @@ -317,10 +319,22 @@ impl Array for UnionArray { &self.data } + fn to_data(&self) -> ArrayData { + self.data.clone() + } + fn into_data(self) -> ArrayData { self.into() } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + Arc::new(Self::from(self.data.slice(offset, length))) + } + + fn nulls(&self) -> Option<&NullBuffer> { + None + } + /// Union types always return non null as there is no validity buffer. /// To check validity correctly you must check the underlying vector. 
fn is_null(&self, _index: usize) -> bool { diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index 0862b35b07e0..0002309a3d55 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -289,7 +289,7 @@ mod tests { let array = builder.finish(); assert_eq!(0, array.null_count()); - assert!(array.data().nulls().is_none()); + assert!(array.nulls().is_none()); } #[test] @@ -311,7 +311,7 @@ mod tests { assert_eq!(4, array.false_count()); assert_eq!(0, array.null_count()); - assert!(array.data().nulls().is_none()); + assert!(array.nulls().is_none()); } #[test] diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index bc4ce466ac39..f8cd5d15f852 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -157,7 +157,7 @@ where pub fn finish(&mut self) -> FixedSizeListArray { let len = self.len(); let values_arr = self.values_builder.finish(); - let values_data = values_arr.data(); + let values_data = values_arr.to_data(); assert_eq!( values_data.len(), len * self.list_len as usize, @@ -173,7 +173,7 @@ where self.list_len, )) .len(len) - .add_child_data(values_data.clone()) + .add_child_data(values_data) .null_bit_buffer(null_bit_buffer); let array_data = unsafe { array_data.build_unchecked() }; @@ -185,7 +185,7 @@ where pub fn finish_cloned(&self) -> FixedSizeListArray { let len = self.len(); let values_arr = self.values_builder.finish_cloned(); - let values_data = values_arr.data(); + let values_data = values_arr.to_data(); assert_eq!( values_data.len(), len * self.list_len as usize, @@ -204,7 +204,7 @@ where self.list_len, )) .len(len) - .add_child_data(values_data.clone()) + .add_child_data(values_data) .null_bit_buffer(null_bit_buffer); let array_data = unsafe { array_data.build_unchecked() }; diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 4bae4932c5f1..81d250cafffe 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -706,7 +706,7 @@ pub fn downcast_array(array: &dyn Array) -> T where T: From, { - T::from(array.data().clone()) + T::from(array.to_data()) } #[cfg(test)] diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 20e4e19bad39..9e9f15daea4b 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -611,8 +611,8 @@ mod tests { assert_eq!(2, record_batch.num_columns()); assert_eq!(&DataType::Int32, record_batch.schema().field(0).data_type()); assert_eq!(&DataType::Utf8, record_batch.schema().field(1).data_type()); - assert_eq!(num_rows, record_batch.column(0).data().len()); - assert_eq!(num_rows, record_batch.column(1).data().len()); + assert_eq!(num_rows, record_batch.column(0).len()); + assert_eq!(num_rows, record_batch.column(1).len()); } #[test] diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index 87a7edc8740b..6f9e8a4eb1aa 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -1271,8 +1271,8 @@ mod tests { ]); let struct_data = ArrayData::builder(struct_data_type) .len(3) - .add_child_data(structs_int32s.data().clone()) - .add_child_data(structs_utf8s.data().clone()) + .add_child_data(structs_int32s.into_data()) + .add_child_data(structs_utf8s.into_data()) .null_bit_buffer(Some(Buffer::from([0b00000011]))) .build() .unwrap(); diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs 
index f4610eb345ea..5d86f9a578c2 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -1178,13 +1178,11 @@ impl Decoder { DataType::Utf8 => flatten_json_string_values(rows) .into_iter() .collect::() - .data() - .clone(), + .into_data(), DataType::LargeUtf8 => flatten_json_string_values(rows) .into_iter() .collect::() - .data() - .clone(), + .into_data(), DataType::List(field) => { let child = self .build_nested_list_array::(&flatten_json_values(rows), field)?; @@ -2411,7 +2409,7 @@ mod tests { ]); let c = ArrayDataBuilder::new(c_field.data_type().clone()) .len(7) - .add_child_data(d.data().clone()) + .add_child_data(d.to_data()) .null_bit_buffer(Some(Buffer::from(vec![0b00111011]))) .build() .unwrap(); @@ -2426,7 +2424,7 @@ mod tests { ]); let a = ArrayDataBuilder::new(a_struct_field.data_type().clone()) .len(7) - .add_child_data(b.data().clone()) + .add_child_data(b.to_data()) .add_child_data(c.clone()) .null_bit_buffer(Some(Buffer::from(vec![0b00111111]))) .build() @@ -2452,7 +2450,7 @@ mod tests { Buffer::from_slice_ref([0i32, 2, 3, 6, 6, 6, 7]) ); // compare list null buffers - assert_eq!(read.data().nulls(), expected.data().nulls()); + assert_eq!(read.nulls(), expected.nulls()); // build struct from list let struct_array = as_struct_array(read.values()); let expected_struct_array = as_struct_array(expected.values()); @@ -2462,10 +2460,7 @@ mod tests { assert_eq!(7, expected_struct_array.len()); assert_eq!(1, expected_struct_array.null_count()); // test struct's nulls - assert_eq!( - struct_array.data().nulls(), - expected_struct_array.data().nulls() - ); + assert_eq!(struct_array.nulls(), expected_struct_array.nulls()); // test struct's fields let read_b = struct_array.column(0); assert_eq!(b.data_ref(), read_b.data_ref()); @@ -2512,13 +2507,11 @@ mod tests { let expected_keys = StringArray::from(vec![ "long", "short", "long", "short", "hedged", "long", "short", ]) - .data() - .clone(); + .into_data(); let expected_value_array_data = StringArray::from(vec![ "$AAA", "$BBB", "$CCC", "$D", "$AAA", "$CCC", "$D", "$YYY", "$D", ]) - .data() - .clone(); + .into_data(); // Create the list that holds ["$_", "$_"] let expected_values = ArrayDataBuilder::new(value_list_type) .len(7) diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index b235df036077..e0853da32e80 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -210,7 +210,7 @@ pub fn eq_bool_scalar( DataType::Boolean, len, None, - left.data().nulls().map(|b| b.inner().sliced()), + left.nulls().map(|b| b.inner().sliced()), 0, vec![values], vec![], @@ -1433,7 +1433,7 @@ where result_remainder.copy_from_slice(remainder_mask_as_bytes); } - let null_bit_buffer = left.data().nulls().map(|b| b.inner().sliced()); + let null_bit_buffer = left.nulls().map(|b| b.inner().sliced()); // null count is the same as in the input since the right side of the scalar comparison cannot be null let null_count = left.null_count(); @@ -3519,8 +3519,7 @@ mod tests { None, Some(7), ]) - .data() - .clone(); + .into_data(); let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 6, 9]); let list_data_type = DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index e4b02fbf230d..2c1de68c1926 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1314,7 +1314,7 @@ unsafe fn decode_column( rows.iter_mut().for_each(|row| *row = &row[1..]); let children = converter.convert_raw(rows, validate_utf8)?; - let child_data = 
children.iter().map(|c| c.data().clone()).collect(); + let child_data = children.iter().map(|c| c.to_data()).collect(); let builder = ArrayDataBuilder::new(field.data_type.clone()) .len(rows.len()) .null_count(null_count) @@ -1532,7 +1532,7 @@ mod tests { // Construct dictionary with a timezone let dict = a.finish(); - let values = TimestampNanosecondArray::from(dict.values().data().clone()); + let values = TimestampNanosecondArray::from(dict.values().to_data()); let dict_with_tz = dict.with_values(&values.with_timezone("+02:00".to_string())); let d = DataType::Dictionary( Box::new(DataType::Int32), diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index e463c12a8856..7d42584514f1 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -539,7 +539,7 @@ mod tests { fn test_dictionary_concat_reuse() { let array: DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); - let copy: DictionaryArray = array.data().clone().into(); + let copy: DictionaryArray = array.to_data().into(); // dictionary is "a", "b", "c" assert_eq!( diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 1818c4fb50c4..a75acda79583 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -222,7 +222,7 @@ impl FilterBuilder { /// Create a new [`FilterBuilder`] that can be used to construct a [`FilterPredicate`] pub fn new(filter: &BooleanArray) -> Self { let filter = match filter.null_count() { - 0 => BooleanArray::from(filter.data().clone()), + 0 => filter.clone(), _ => prep_null_mask_filter(filter), }; diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index cd588fe01c6b..acef1da51aad 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -349,7 +349,7 @@ mod tests { let result = result.as_any().downcast_ref::().unwrap(); let expected: Int32Array = expected.into(); - assert_eq!(expected.data(), result.data()); + assert_eq!(&expected, result); }) } @@ -369,7 +369,7 @@ mod tests { .map(|e| e.map(|e| e as i64)) .collect::>() .into(); - assert_eq!(expected.data(), result.data()); + assert_eq!(&expected, result); }) } @@ -528,7 +528,7 @@ mod tests { let result = result.as_any().downcast_ref::().unwrap(); let expected: Int32Array = expected.into(); - assert_eq!(expected.data(), result.data()); + assert_eq!(&expected, result); }) } @@ -548,7 +548,7 @@ mod tests { .map(|e| e.map(|e| e as i64)) .collect::>() .into(); - assert_eq!(expected.data(), result.data()); + assert_eq!(&expected, result); }) } diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index f3ba90d8a741..ec785c01e818 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -122,7 +122,7 @@ pub fn regexp_is_match_utf8_scalar( regex: &str, flag: Option<&str>, ) -> Result { - let null_bit_buffer = array.data().nulls().map(|x| x.inner().sliced()); + let null_bit_buffer = array.nulls().map(|x| x.inner().sliced()); let mut result = BooleanBufferBuilder::new(array.len()); let pattern = match flag { diff --git a/arrow-string/src/substring.rs b/arrow-string/src/substring.rs index a59a54d7e6e4..7ee33f7fc282 100644 --- a/arrow-string/src/substring.rs +++ b/arrow-string/src/substring.rs @@ -210,7 +210,7 @@ pub fn substring_by_char( GenericStringArray::::DATA_TYPE, array.len(), None, - array.data().nulls().map(|b| b.inner().sliced()), + array.nulls().map(|b| b.inner().sliced()), 0, vec![new_offsets.finish(), vals.finish()], vec![], @@ -294,7 +294,7 @@ fn binary_substring( GenericBinaryArray::::DATA_TYPE, array.len(), 
None, - array.data().nulls().map(|b| b.inner().sliced()), + array.nulls().map(|b| b.inner().sliced()), 0, vec![Buffer::from_slice_ref(&new_offsets), new_values.into()], vec![], @@ -339,7 +339,7 @@ fn fixed_size_binary_substring( DataType::FixedSizeBinary(new_len), num_of_elements, None, - array.data().nulls().map(|b| b.inner().sliced()), + array.nulls().map(|b| b.inner().sliced()), 0, vec![new_values.into()], vec![], @@ -418,7 +418,7 @@ fn utf8_substring( GenericStringArray::::DATA_TYPE, array.len(), None, - array.data().nulls().map(|b| b.inner().sliced()), + array.nulls().map(|b| b.inner().sliced()), 0, vec![Buffer::from_slice_ref(&new_offsets), new_values.into()], vec![], diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs index 3b0fdbe63c97..68fc66a635bc 100644 --- a/arrow/benches/array_data_validate.rs +++ b/arrow/benches/array_data_validate.rs @@ -56,7 +56,7 @@ fn validate_benchmark(c: &mut Criterion) { let byte_array = BinaryArray::from_iter_values(std::iter::repeat(b"test").take(20000)); c.bench_function("byte_array_to_string_array 20000", |b| { - b.iter(|| StringArray::from(BinaryArray::from(byte_array.data().clone()))) + b.iter(|| StringArray::from(BinaryArray::from(byte_array.to_data()))) }); } diff --git a/arrow/examples/dynamic_types.rs b/arrow/examples/dynamic_types.rs index eefbf6dcd4ff..d4aec4d38423 100644 --- a/arrow/examples/dynamic_types.rs +++ b/arrow/examples/dynamic_types.rs @@ -100,7 +100,7 @@ fn process(batch: &RecordBatch) { Arc::new(projected_schema), vec![ id.clone(), // NOTE: this is cloning the Arc not the array data - Arc::new(Float64Array::from(nested_c.data().clone())), + Arc::new(Float64Array::from(nested_c.to_data())), ], ) .unwrap(); diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 81c32594861c..333d6425a38e 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -628,7 +628,7 @@ mod tests { .unwrap(); // export it - let array = ArrowArray::try_from(Array::data(&original_array).clone())?; + let array = ArrowArray::try_from(Array::to_data(&original_array))?; // (simulate consumer) import it let data = ArrayData::try_from(array)?; @@ -1122,7 +1122,7 @@ mod tests { .unwrap(); // export it - let array = ArrowArray::try_from(map_array.data().clone())?; + let array = ArrowArray::try_from(map_array.to_data())?; // (simulate consumer) import it let data = ArrayData::try_from(array)?; @@ -1209,7 +1209,7 @@ mod tests { let union = builder.build().unwrap(); // export it - let array = ArrowArray::try_from(union.data().clone())?; + let array = ArrowArray::try_from(union.to_data())?; // (simulate consumer) import it let data = ArrayData::try_from(array)?; diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 33552dbe3b1b..b8199031796e 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -290,7 +290,7 @@ where .len(size) .null_bit_buffer(nulls) .add_buffer(keys) - .add_child_data(values.data().clone()) + .add_child_data(values.to_data()) .build() .unwrap(); diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 5fc8e4d43c52..7ead5fa61522 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -191,7 +191,7 @@ fn create_random_list_array( // Create list's child data let child_array = create_random_array(list_field, child_len, null_density, true_density)?; - let child_data = child_array.data(); + let child_data = child_array.to_data(); // Create list's null buffers, if it is nullable let null_buffer = match field.is_nullable() { true => 
Some(create_random_null_buffer(size, null_density)), @@ -205,7 +205,7 @@ fn create_random_list_array( null_buffer, 0, vec![offsets], - vec![child_data.clone()], + vec![child_data], ) }; Ok(make_array(list_data)) From 30250d3158fffa284b793457c6952c0b36837510 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 17 Mar 2023 18:03:45 +0000 Subject: [PATCH 0697/1411] Implement Bit Operations for i256 (#3884) * Implement BitAnd, BitOr and BitXor for i256 * Implement Shl and Shr for i256 * Clippy --- arrow-buffer/src/bigint.rs | 102 +++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index 421a7bdd02d0..4b446e19b996 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -18,6 +18,7 @@ use num::cast::AsPrimitive; use num::{BigInt, FromPrimitive, Num, ToPrimitive}; use std::cmp::Ordering; +use std::ops::{BitAnd, BitOr, BitXor, Shl, Shr}; /// A signed 256-bit integer #[allow(non_camel_case_types)] @@ -492,6 +493,84 @@ impl std::ops::Neg for i256 { } } +impl BitAnd for i256 { + type Output = i256; + + #[inline] + fn bitand(self, rhs: Self) -> Self::Output { + Self { + low: self.low & rhs.low, + high: self.high & rhs.high, + } + } +} + +impl BitOr for i256 { + type Output = i256; + + #[inline] + fn bitor(self, rhs: Self) -> Self::Output { + Self { + low: self.low | rhs.low, + high: self.high | rhs.high, + } + } +} + +impl BitXor for i256 { + type Output = i256; + + #[inline] + fn bitxor(self, rhs: Self) -> Self::Output { + Self { + low: self.low ^ rhs.low, + high: self.high ^ rhs.high, + } + } +} + +impl Shl for i256 { + type Output = i256; + + #[inline] + fn shl(self, rhs: u8) -> Self::Output { + if rhs == 0 { + self + } else if rhs < 128 { + Self { + high: self.high << rhs | (self.low >> (128 - rhs)) as i128, + low: self.low << rhs, + } + } else { + Self { + high: (self.low << (rhs - 128)) as i128, + low: 0, + } + } + } +} + +impl Shr for i256 { + type Output = i256; + + #[inline] + fn shr(self, rhs: u8) -> Self::Output { + if rhs == 0 { + self + } else if rhs < 128 { + Self { + high: self.high >> rhs, + low: self.low >> rhs | ((self.high as u128) << (128 - rhs)), + } + } else { + Self { + high: self.high >> 127, + low: (self.high >> (rhs - 128)) as u128, + } + } + } +} + macro_rules! 
define_as_primitive { ($native_ty:ty) => { impl AsPrimitive for $native_ty { @@ -684,6 +763,29 @@ mod tests { ), } } + + // Bit operations + let actual = il & ir; + let (expected, _) = i256::from_bigint_with_overflow(bl.clone() & br.clone()); + assert_eq!(actual.to_string(), expected.to_string()); + + let actual = il | ir; + let (expected, _) = i256::from_bigint_with_overflow(bl.clone() | br.clone()); + assert_eq!(actual.to_string(), expected.to_string()); + + let actual = il ^ ir; + let (expected, _) = i256::from_bigint_with_overflow(bl.clone() ^ br); + assert_eq!(actual.to_string(), expected.to_string()); + + for shift in [0_u8, 1, 4, 126, 128, 129, 254, 255] { + let actual = il << shift; + let (expected, _) = i256::from_bigint_with_overflow(bl.clone() << shift); + assert_eq!(actual.to_string(), expected.to_string()); + + let actual = il >> shift; + let (expected, _) = i256::from_bigint_with_overflow(bl.clone() >> shift); + assert_eq!(actual.to_string(), expected.to_string()); + } } #[test] From 7bf7ea5e341c15dbd8653b16413459f5fa4784eb Mon Sep 17 00:00:00 2001 From: waymost Date: Sat, 18 Mar 2023 05:02:31 -0700 Subject: [PATCH 0698/1411] Added support for byte vectors and slices to parquet_derive (#3864) (#3878) --- parquet/src/data_type.rs | 10 ++++ parquet_derive/src/parquet_field.rs | 76 +++++++++++++++++++++++++---- parquet_derive_test/src/lib.rs | 18 +++++++ 3 files changed, 95 insertions(+), 9 deletions(-) diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 40d54c78ed1d..48ee7f89fc5d 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -199,6 +199,16 @@ impl From> for ByteArray { } } +impl<'a> From<&'a [u8]> for ByteArray { + fn from(b: &'a [u8]) -> ByteArray { + let mut v = Vec::new(); + v.extend_from_slice(b); + Self { + data: Some(ByteBufferPtr::new(v)), + } + } +} + impl<'a> From<&'a str> for ByteArray { fn from(s: &'a str) -> ByteArray { let mut v = Vec::new(); diff --git a/parquet_derive/src/parquet_field.rs b/parquet_derive/src/parquet_field.rs index 48b6d3ac41b8..ea6878283a33 100644 --- a/parquet_derive/src/parquet_field.rs +++ b/parquet_derive/src/parquet_field.rs @@ -92,6 +92,10 @@ impl Field { Type::TypePath(_) => self.option_into_vals(), _ => unimplemented!("Unsupported type encountered"), }, + Type::Vec(ref first_type) => match **first_type { + Type::TypePath(_) => self.option_into_vals(), + _ => unimplemented!("Unsupported type encountered"), + }, ref f => unimplemented!("Unsupported: {:#?}", f), }, Type::Reference(_, ref first_type) => match **first_type { @@ -99,11 +103,27 @@ impl Field { Type::Option(ref second_type) => match **second_type { Type::TypePath(_) => self.option_into_vals(), Type::Reference(_, ref second_type) => match **second_type { + Type::TypePath(_) => self.option_into_vals(), + Type::Slice(ref second_type) => match **second_type { + Type::TypePath(_) => self.option_into_vals(), + ref f => unimplemented!("Unsupported: {:#?}", f), + }, + _ => unimplemented!("Unsupported type encountered"), + }, + Type::Vec(ref first_type) => match **first_type { Type::TypePath(_) => self.option_into_vals(), _ => unimplemented!("Unsupported type encountered"), }, ref f => unimplemented!("Unsupported: {:#?}", f), }, + Type::Slice(ref second_type) => match **second_type { + Type::TypePath(_) => self.copied_direct_vals(), + ref f => unimplemented!("Unsupported: {:#?}", f), + }, + ref f => unimplemented!("Unsupported: {:#?}", f), + }, + Type::Vec(ref first_type) => match **first_type { + Type::TypePath(_) => 
self.copied_direct_vals(), ref f => unimplemented!("Unsupported: {:#?}", f), }, f => unimplemented!("Unsupported: {:#?}", f), @@ -116,26 +136,55 @@ impl Field { Type::Option(_) => unimplemented!("Unsupported nesting encountered"), Type::Reference(_, ref second_type) | Type::Vec(ref second_type) - | Type::Array(ref second_type) => match **second_type { + | Type::Array(ref second_type) + | Type::Slice(ref second_type) => match **second_type { Type::TypePath(_) => Some(self.optional_definition_levels()), _ => unimplemented!("Unsupported nesting encountered"), }, }, Type::Reference(_, ref first_type) | Type::Vec(ref first_type) - | Type::Array(ref first_type) => match **first_type { + | Type::Array(ref first_type) + | Type::Slice(ref first_type) => match **first_type { Type::TypePath(_) => None, - Type::Reference(_, ref second_type) - | Type::Vec(ref second_type) + Type::Vec(ref second_type) | Type::Array(ref second_type) - | Type::Option(ref second_type) => match **second_type { - Type::TypePath(_) => Some(self.optional_definition_levels()), + | Type::Slice(ref second_type) => match **second_type { + Type::TypePath(_) => None, Type::Reference(_, ref third_type) => match **third_type { - Type::TypePath(_) => Some(self.optional_definition_levels()), + Type::TypePath(_) => None, _ => unimplemented!("Unsupported definition encountered"), }, _ => unimplemented!("Unsupported definition encountered"), }, + Type::Reference(_, ref second_type) | Type::Option(ref second_type) => { + match **second_type { + Type::TypePath(_) => Some(self.optional_definition_levels()), + Type::Vec(ref third_type) + | Type::Array(ref third_type) + | Type::Slice(ref third_type) => match **third_type { + Type::TypePath(_) => Some(self.optional_definition_levels()), + Type::Reference(_, ref fourth_type) => match **fourth_type { + Type::TypePath(_) => { + Some(self.optional_definition_levels()) + } + _ => unimplemented!("Unsupported definition encountered"), + }, + _ => unimplemented!("Unsupported definition encountered"), + }, + Type::Reference(_, ref third_type) => match **third_type { + Type::TypePath(_) => Some(self.optional_definition_levels()), + Type::Slice(ref fourth_type) => match **fourth_type { + Type::TypePath(_) => { + Some(self.optional_definition_levels()) + } + _ => unimplemented!("Unsupported definition encountered"), + }, + _ => unimplemented!("Unsupported definition encountered"), + }, + _ => unimplemented!("Unsupported definition encountered"), + } + } }, }; @@ -323,6 +372,7 @@ impl Field { enum Type { Array(Box), Option(Box), + Slice(Box), Vec(Box), TypePath(syn::Type), Reference(Option, Box), @@ -374,6 +424,7 @@ impl Type { Type::Option(ref first_type) | Type::Vec(ref first_type) | Type::Array(ref first_type) + | Type::Slice(ref first_type) | Type::Reference(_, ref first_type) => { Type::leaf_type_recursive_helper(first_type, Some(ty)) } @@ -391,6 +442,7 @@ impl Type { Type::Option(ref first_type) | Type::Vec(ref first_type) | Type::Array(ref first_type) + | Type::Slice(ref first_type) | Type::Reference(_, ref first_type) => match **first_type { Type::TypePath(ref type_) => type_, _ => unimplemented!("leaf_type() should only return shallow types"), @@ -443,7 +495,7 @@ impl Type { } } } - Type::Vec(ref first_type) => { + Type::Vec(ref first_type) | Type::Slice(ref first_type) => { if let Type::TypePath(_) = **first_type { if last_part == "u8" { return BasicType::BYTE_ARRAY; @@ -484,7 +536,7 @@ impl Type { } } } - Type::Vec(ref first_type) => { + Type::Vec(ref first_type) | Type::Slice(ref first_type) => { 
if let Type::TypePath(_) = **first_type { if last_part == "u8" { return quote! { None }; @@ -572,6 +624,7 @@ impl Type { syn::Type::Path(ref p) => Type::from_type_path(f, p), syn::Type::Reference(ref tr) => Type::from_type_reference(f, tr), syn::Type::Array(ref ta) => Type::from_type_array(f, ta), + syn::Type::Slice(ref ts) => Type::from_type_slice(f, ts), other => unimplemented!( "Unable to derive {:?} - it is currently an unsupported type\n{:#?}", f.ident.as_ref().unwrap(), @@ -622,6 +675,11 @@ impl Type { let inner_type = Type::from_type(f, ta.elem.as_ref()); Type::Array(Box::new(inner_type)) } + + fn from_type_slice(f: &syn::Field, ts: &syn::TypeSlice) -> Self { + let inner_type = Type::from_type(f, ts.elem.as_ref()); + Type::Slice(Box::new(inner_type)) + } } #[cfg(test)] diff --git a/parquet_derive_test/src/lib.rs b/parquet_derive_test/src/lib.rs index 746644793ff2..2aa174974aba 100644 --- a/parquet_derive_test/src/lib.rs +++ b/parquet_derive_test/src/lib.rs @@ -42,6 +42,11 @@ struct ACompleteRecord<'a> { pub borrowed_maybe_a_string: &'a Option, pub borrowed_maybe_a_str: &'a Option<&'a str>, pub now: chrono::NaiveDateTime, + pub byte_vec: Vec, + pub maybe_byte_vec: Option>, + pub borrowed_byte_vec: &'a [u8], + pub borrowed_maybe_byte_vec: &'a Option>, + pub borrowed_maybe_borrowed_byte_vec: &'a Option<&'a [u8]>, } #[cfg(test)] @@ -84,6 +89,11 @@ mod tests { OPTIONAL BINARY borrowed_maybe_a_string (STRING); OPTIONAL BINARY borrowed_maybe_a_str (STRING); REQUIRED INT64 now (TIMESTAMP_MILLIS); + REQUIRED BINARY byte_vec; + OPTIONAL BINARY maybe_byte_vec; + REQUIRED BINARY borrowed_byte_vec; + OPTIONAL BINARY borrowed_maybe_byte_vec; + OPTIONAL BINARY borrowed_maybe_borrowed_byte_vec; }"; let schema = Arc::new(parse_message_type(schema_str).unwrap()); @@ -92,6 +102,9 @@ mod tests { let a_borrowed_string = "cool news".to_owned(); let maybe_a_string = Some("it's true, I'm a string".to_owned()); let maybe_a_str = Some(&a_str[..]); + let borrowed_byte_vec = vec![0x68, 0x69, 0x70]; + let borrowed_maybe_byte_vec = Some(vec![0x71, 0x72]); + let borrowed_maybe_borrowed_byte_vec = Some(&borrowed_byte_vec[..]); let drs: Vec = vec![ACompleteRecord { a_bool: true, @@ -115,6 +128,11 @@ mod tests { borrowed_maybe_a_string: &maybe_a_string, borrowed_maybe_a_str: &maybe_a_str, now: chrono::Utc::now().naive_local(), + byte_vec: vec![0x65, 0x66, 0x67], + maybe_byte_vec: Some(vec![0x88, 0x89, 0x90]), + borrowed_byte_vec: &borrowed_byte_vec, + borrowed_maybe_byte_vec: &borrowed_maybe_byte_vec, + borrowed_maybe_borrowed_byte_vec: &borrowed_maybe_borrowed_byte_vec, }]; let generated_schema = drs.as_slice().schema().unwrap(); From e41e0ca82ce4967a66a7eda04cb1630487aaeca9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 20 Mar 2023 09:58:33 -0700 Subject: [PATCH 0699/1411] Update syn requirement from 1.0 to 2.0 (#3890) Updates the requirements on [syn](https://github.com/dtolnay/syn) to permit the latest version. - [Release notes](https://github.com/dtolnay/syn/releases) - [Commits](https://github.com/dtolnay/syn/compare/1.0.0...2.0.3) --- updated-dependencies: - dependency-name: syn dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- parquet_derive/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index e41ba19086d7..ddf34c4bf793 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -34,5 +34,5 @@ proc-macro = true [dependencies] proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } -syn = { version = "1.0", features = ["extra-traits"] } +syn = { version = "2.0", features = ["extra-traits"] } parquet = { path = "../parquet", version = "35.0.0", default-features = false } From 66883638dad39ad5ad54b105e87ec73eed8f8123 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Tue, 21 Mar 2023 11:39:18 +0100 Subject: [PATCH 0700/1411] making use of checked multiplication and addition to avoid silent overflow (#3886) --- arrow-cast/src/cast.rs | 22 ++++++++++++++++++++++ arrow-cast/src/parse.rs | 27 ++++++++++++++++----------- 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 1bd5027406b9..72d1bc1cb254 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -5273,6 +5273,28 @@ mod tests { IntervalUnit::DayTime, r#"Cast error: Cannot cast 1 day 1.5 milliseconds to IntervalDayTime because the nanos part isn't multiple of milliseconds"# ); + + // overflow + test_unsafe_string_to_interval_err!( + vec![Some(format!( + "{} century {} year {} month", + i64::MAX - 2, + i64::MAX - 2, + i64::MAX - 2 + ))], + IntervalUnit::DayTime, + r#"Parser error: Parsed interval field value out of range: 11068046444225730000000 months 331764692165666300000000 days 28663672503769583000000000000000000000 nanos"# + ); + test_unsafe_string_to_interval_err!( + vec![Some(format!( + "{} year {} month {} day", + i64::MAX - 2, + i64::MAX - 2, + i64::MAX - 2 + ))], + IntervalUnit::MonthDayNano, + r#"Parser error: Parsed interval field value out of range: 110680464442257310000 months 3043712772162076000000 days 262179884170819100000000000000000000 nanos"# + ); } #[test] diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 30cebb4bf3d0..ced951ca8f65 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -838,13 +838,13 @@ fn parse_interval(leading_field: &str, value: &str) -> Result { - align_interval_parts(interval_period * 1200_f64, 0.0, 0.0) + align_interval_parts(interval_period.mul_checked(1200_f64)?, 0.0, 0.0) } IntervalType::Decade => { - align_interval_parts(interval_period * 120_f64, 0.0, 0.0) + align_interval_parts(interval_period.mul_checked(120_f64)?, 0.0, 0.0) } IntervalType::Year => { - align_interval_parts(interval_period * 12_f64, 0.0, 0.0) + align_interval_parts(interval_period.mul_checked(12_f64)?, 0.0, 0.0) } IntervalType::Month => align_interval_parts(interval_period, 0.0, 0.0), IntervalType::Week => align_interval_parts(0.0, interval_period * 7_f64, 0.0), @@ -852,16 +852,21 @@ fn parse_interval(leading_field: &str, value: &str) -> Result Ok(( 0, 0, - (interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND) as i64, + (interval_period.mul_checked(SECONDS_PER_HOUR * NANOS_PER_SECOND))? + as i64, + )), + IntervalType::Minute => Ok(( + 0, + 0, + (interval_period.mul_checked(60_f64 * NANOS_PER_SECOND))? as i64, + )), + IntervalType::Second => Ok(( + 0, + 0, + (interval_period.mul_checked(NANOS_PER_SECOND))? 
as i64, )), - IntervalType::Minute => { - Ok((0, 0, (interval_period * 60_f64 * NANOS_PER_SECOND) as i64)) - } - IntervalType::Second => { - Ok((0, 0, (interval_period * NANOS_PER_SECOND) as i64)) - } IntervalType::Millisecond => { - Ok((0, 0, (interval_period * 1_000_000f64) as i64)) + Ok((0, 0, (interval_period.mul_checked(1_000_000f64))? as i64)) } } }; From 74c9f30cbd2af5b9dada23f81cd5c42caa612415 Mon Sep 17 00:00:00 2001 From: kinrany Date: Tue, 21 Mar 2023 15:04:12 +0300 Subject: [PATCH 0701/1411] Impl ObjectStore for trait object (#3866) --- object_store/src/lib.rs | 80 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 3af538254183..706cc076672c 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -418,6 +418,86 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { } } +#[async_trait] +impl ObjectStore for Box { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.as_ref().put(location, bytes).await + } + + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + self.as_ref().put_multipart(location).await + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + self.as_ref().abort_multipart(location, multipart_id).await + } + + async fn append( + &self, + location: &Path, + ) -> Result> { + self.as_ref().append(location).await + } + + async fn get(&self, location: &Path) -> Result { + self.as_ref().get(location).await + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + self.as_ref().get_range(location, range).await + } + + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + self.as_ref().get_ranges(location, ranges).await + } + + async fn head(&self, location: &Path) -> Result { + self.as_ref().head(location).await + } + + async fn delete(&self, location: &Path) -> Result<()> { + self.as_ref().delete(location).await + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + self.as_ref().list(prefix).await + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + self.as_ref().list_with_delimiter(prefix).await + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().copy(from, to).await + } + + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().rename(from, to).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().copy_if_not_exists(from, to).await + } + + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().rename_if_not_exists(from, to).await + } +} + /// Result of a list call that includes objects, prefixes (directories) and a /// token for the next set of results. Individual result sets may be limited to /// 1,000 objects based on the underlying object storage's limitations. 
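A rough usage sketch (not part of the patch series) of what the blanket impl above enables: a Box<dyn ObjectStore> can now be handed to code that is generic over ObjectStore instead of only being usable through dynamic dispatch. The helper name write_marker and the choice of the InMemory store are illustrative only; the put/head signatures follow the trait methods shown in the diff above.

use bytes::Bytes;
use object_store::{memory::InMemory, path::Path, ObjectStore};

// Generic over any ObjectStore implementation.
async fn write_marker<T: ObjectStore>(store: &T) -> object_store::Result<()> {
    store.put(&Path::from("marker"), Bytes::from_static(b"ok")).await
}

async fn demo() -> object_store::Result<()> {
    // Before this change, Box<dyn ObjectStore> did not itself implement the
    // trait, so it could not be passed to generic helpers like write_marker.
    let store: Box<dyn ObjectStore> = Box::new(InMemory::new());
    write_marker(&store).await?;

    let meta = store.head(&Path::from("marker")).await?;
    assert_eq!(meta.size, 2);
    Ok(())
}
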
From f1d5a3734d41e9533aa1b8893f1f44a2f6555713 Mon Sep 17 00:00:00 2001 From: bold Date: Tue, 21 Mar 2023 13:04:29 +0100 Subject: [PATCH 0702/1411] Improve decimal parsing performance (#3854) --- arrow-cast/Cargo.toml | 4 + arrow-cast/benches/parse_decimal.rs | 56 ++++++++++ arrow-cast/src/parse.rs | 153 +++++++++++++--------------- 3 files changed, 133 insertions(+), 80 deletions(-) create mode 100644 arrow-cast/benches/parse_decimal.rs diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 859254c3a81d..53c62ffb60d3 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -67,3 +67,7 @@ harness = false [[bench]] name = "parse_time" harness = false + +[[bench]] +name = "parse_decimal" +harness = false diff --git a/arrow-cast/benches/parse_decimal.rs b/arrow-cast/benches/parse_decimal.rs new file mode 100644 index 000000000000..5682859dd25a --- /dev/null +++ b/arrow-cast/benches/parse_decimal.rs @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::types::Decimal256Type; +use arrow_cast::parse::parse_decimal; +use criterion::*; + +fn criterion_benchmark(c: &mut Criterion) { + let decimals = [ + "123.123", + "123.1234", + "123.1", + "123", + "-123.123", + "-123.1234", + "-123.1", + "-123", + "0.0000123", + "12.", + "-12.", + "00.1", + "-00.1", + "12345678912345678.1234", + "-12345678912345678.1234", + "99999999999999999.999", + "-99999999999999999.999", + ".123", + "-.123", + "123.", + "-123.", + ]; + + for decimal in decimals { + let d = black_box(decimal); + c.bench_function(d, |b| { + b.iter(|| parse_decimal::(d, 20, 3).unwrap()); + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index ced951ca8f65..710a6a4979c1 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -613,100 +613,93 @@ pub fn parse_decimal( precision: u8, scale: i8, ) -> Result { - if !is_valid_decimal(s) { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - let mut offset = s.len(); - let len = s.len(); - let mut base = T::Native::usize_as(1); - let scale_usize = usize::from(scale as u8); - - // handle the value after the '.' and meet the scale - let delimiter_position = s.find('.'); - match delimiter_position { - None => { - // there is no '.' - base = T::Native::usize_as(10).pow_checked(scale as u32)?; - } - Some(mid) => { - // there is the '.' - if len - mid >= scale_usize + 1 { - // If the string value is "123.12345" and the scale is 2, we should just remain '.12' and drop the '345' value. - offset -= len - mid - 1 - scale_usize; - } else { - // If the string value is "123.12" and the scale is 4, we should append '00' to the tail. 
- base = T::Native::usize_as(10) - .pow_checked((scale_usize + 1 + mid - len) as u32)?; - } - } - }; - - // each byte is digit、'-' or '.' - let bytes = s.as_bytes(); - let mut negative = false; - let mut result = T::Native::usize_as(0); - - bytes[0..offset] - .iter() - .rev() - .try_for_each::<_, Result<(), ArrowError>>(|&byte| { - match byte { - b'-' => { - negative = true; - } - b'0'..=b'9' => { - let add = - T::Native::usize_as((byte - b'0') as usize).mul_checked(base)?; - result = result.add_checked(add)?; - base = base.mul_checked(T::Native::usize_as(10))?; - } - // because we have checked the string value - _ => (), - } - Ok(()) - })?; - - if negative { - result = result.neg_checked()?; - } - - match T::validate_decimal_precision(result, precision) { - Ok(_) => Ok(result), - Err(e) => Err(ArrowError::ParseError(format!( - "parse decimal overflow: {e}" - ))), - } -} - -fn is_valid_decimal(s: &str) -> bool { let mut seen_dot = false; - let mut seen_digit = false; let mut seen_sign = false; + let mut negative = false; - for c in s.as_bytes() { - match c { - b'-' | b'+' => { - if seen_digit || seen_dot || seen_sign { - return false; + let mut result = T::Native::usize_as(0); + let mut fractionals = 0; + let mut digits = 0; + let base = T::Native::usize_as(10); + let mut bs = s.as_bytes().iter(); + while let Some(b) = bs.next() { + match b { + b'0'..=b'9' => { + if seen_dot { + if fractionals == scale { + // We have processed and validated the whole part of our decimal (including sign and dot). + // All that is left is to validate the fractional part. + if bs.any(|b| !b.is_ascii_digit()) { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + break; + } + fractionals += 1; } - seen_sign = true; + digits += 1; + if digits > precision { + return Err(ArrowError::ParseError( + "parse decimal overflow".to_string(), + )); + } + result = result.mul_checked(base)?; + result = result.add_checked(T::Native::usize_as((b - b'0') as usize))?; } b'.' => { if seen_dot { - return false; + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); } seen_dot = true; } - b'0'..=b'9' => { - seen_digit = true; + b'-' => { + if seen_sign || digits > 0 || seen_dot { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + seen_sign = true; + negative = true; + } + b'+' => { + if seen_sign || digits > 0 || seen_dot { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + seen_sign = true; + } + _ => { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); } - _ => return false, } } + // Fail on "." + if digits == 0 { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } - seen_digit + if fractionals < scale { + let exp = scale - fractionals; + if exp as u8 + digits > precision { + return Err(ArrowError::ParseError("parse decimal overflow".to_string())); + } + let mul = base.pow_checked(exp as _)?; + result = result.mul_checked(mul)?; + } + + Ok(if negative { + result.neg_checked()? 
+ } else { + result + }) } pub fn parse_interval_year_month( From 67fe80733b6c60565eaafd5e9cb9574186a764c9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 21 Mar 2023 12:21:10 +0000 Subject: [PATCH 0703/1411] Flesh out NullBuffer abstraction (#3880) (#3885) * Flesh out NullBuffer abstraction (#3880) * Review feedback --- arrow-arith/src/arity.rs | 72 ++++++------------- arrow-array/src/array/boolean_array.rs | 30 +++----- arrow-array/src/array/primitive_array.rs | 54 +++++++------- .../src/builder/generic_bytes_builder.rs | 4 +- arrow-buffer/src/buffer/boolean.rs | 41 ++++++++++- arrow-buffer/src/buffer/null.rs | 35 +++++++++ .../src/util}/bit_iterator.rs | 5 +- arrow-buffer/src/util/mod.rs | 1 + arrow-cast/src/cast.rs | 2 +- arrow-data/src/lib.rs | 2 +- arrow-ord/src/comparison.rs | 2 +- arrow-row/src/list.rs | 2 +- arrow-select/src/take.rs | 10 +-- arrow-string/src/concat_elements.rs | 6 +- arrow-string/src/length.rs | 2 +- arrow-string/src/like.rs | 21 +++--- arrow-string/src/regexp.rs | 21 +++--- parquet/src/arrow/arrow_writer/levels.rs | 4 +- 18 files changed, 170 insertions(+), 144 deletions(-) rename {arrow-data/src => arrow-buffer/src/util}/bit_iterator.rs (98%) diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 162b56ef1fe3..0a8815cc8059 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -21,11 +21,9 @@ use arrow_array::builder::BufferBuilder; use arrow_array::iterator::ArrayIter; use arrow_array::types::ArrowDictionaryKeyType; use arrow_array::*; -use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; +use arrow_buffer::buffer::NullBuffer; use arrow_buffer::{Buffer, MutableBuffer}; -use arrow_data::bit_iterator::try_for_each_valid_idx; -use arrow_data::bit_mask::combine_option_bitmap; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::ArrowError; use std::sync::Arc; @@ -33,18 +31,15 @@ use std::sync::Arc; unsafe fn build_primitive_array( len: usize, buffer: Buffer, - null_count: usize, - null_buffer: Option, + nulls: Option, ) -> PrimitiveArray { - PrimitiveArray::from(ArrayData::new_unchecked( - O::DATA_TYPE, - len, - Some(null_count), - null_buffer, - 0, - vec![buffer], - vec![], - )) + PrimitiveArray::from( + ArrayDataBuilder::new(O::DATA_TYPE) + .len(len) + .nulls(nulls) + .buffers(vec![buffer]) + .build_unchecked(), + ) } /// See [`PrimitiveArray::unary`] @@ -220,11 +215,7 @@ where return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); } - let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len); - let null_count = null_buffer - .as_ref() - .map(|x| len - x.count_set_bits_offset(0, len)) - .unwrap_or_default(); + let nulls = NullBuffer::union(a.nulls(), b.nulls()); let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r)); // JUSTIFICATION @@ -234,7 +225,7 @@ where // `values` is an iterator with a known size from a PrimitiveArray let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - Ok(unsafe { build_primitive_array(len, buffer, null_count, null_buffer) }) + Ok(unsafe { build_primitive_array(len, buffer, nulls) }) } /// Given two arrays of length `len`, calls `op(a[i], b[i])` for `i` in `0..len`, mutating @@ -275,10 +266,7 @@ where )))); } - let len = a.len(); - - let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len); - let nulls = null_buffer.map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, len))); + let nulls = NullBuffer::union(a.nulls(), b.nulls()); let mut builder 
= a.into_builder()?; @@ -326,18 +314,13 @@ where if a.null_count() == 0 && b.null_count() == 0 { try_binary_no_nulls(len, a, b, op) } else { - let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len); - - let null_count = null_buffer - .as_ref() - .map(|x| len - x.count_set_bits_offset(0, len)) - .unwrap_or_default(); + let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap(); let mut buffer = BufferBuilder::::new(len); buffer.append_n_zeroed(len); let slice = buffer.as_slice_mut(); - try_for_each_valid_idx(len, 0, null_count, null_buffer.as_deref(), |idx| { + nulls.try_for_each_valid_idx(|idx| { unsafe { *slice.get_unchecked_mut(idx) = op(a.value_unchecked(idx), b.value_unchecked(idx))? @@ -345,9 +328,7 @@ where Ok::<_, ArrowError>(()) })?; - Ok(unsafe { - build_primitive_array(len, buffer.finish(), null_count, null_buffer) - }) + Ok(unsafe { build_primitive_array(len, buffer.finish(), Some(nulls)) }) } } @@ -391,17 +372,12 @@ where if a.null_count() == 0 && b.null_count() == 0 { try_binary_no_nulls_mut(len, a, b, op) } else { - let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len); - let null_count = null_buffer - .as_ref() - .map(|x| len - x.count_set_bits_offset(0, len)) - .unwrap_or_default(); - + let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap(); let mut builder = a.into_builder()?; let slice = builder.values_slice_mut(); - match try_for_each_valid_idx(len, 0, null_count, null_buffer.as_deref(), |idx| { + match nulls.try_for_each_valid_idx(|idx| { unsafe { *slice.get_unchecked_mut(idx) = op(*slice.get_unchecked(idx), b.value_unchecked(idx))? @@ -412,14 +388,8 @@ where Err(err) => return Ok(Err(err)), }; - let array_builder = builder - .finish() - .into_data() - .into_builder() - .null_bit_buffer(null_buffer) - .null_count(null_count); - - let array_data = unsafe { array_builder.build_unchecked() }; + let array_builder = builder.finish().into_data().into_builder(); + let array_data = unsafe { array_builder.nulls(Some(nulls)).build_unchecked() }; Ok(Ok(PrimitiveArray::::from(array_data))) } } @@ -442,7 +412,7 @@ where buffer.push_unchecked(op(a.value_unchecked(idx), b.value_unchecked(idx))?); }; } - Ok(unsafe { build_primitive_array(len, buffer.into(), 0, None) }) + Ok(unsafe { build_primitive_array(len, buffer.into(), None) }) } /// This intentional inline(never) attribute helps LLVM optimize the loop. 
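An illustrative sketch (not part of the patch) of the NullBuffer::union semantics the rewritten kernels above rely on: the merged mask is valid only where both inputs are valid, and None means neither side carries any nulls. Int32Array is used here purely as a convenient example type.

use arrow_array::{Array, Int32Array};
use arrow_buffer::NullBuffer;

fn main() {
    let a = Int32Array::from(vec![Some(1), None, Some(3)]);
    let b = Int32Array::from(vec![Some(4), Some(5), None]);

    // Valid only where both inputs are valid: rows 1 and 2 become null.
    let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap();
    assert_eq!(nulls.len(), 3);
    assert_eq!(nulls.null_count(), 2);

    // try_for_each_valid_idx, added in this patch, visits only the surviving rows.
    let mut valid = Vec::new();
    nulls
        .try_for_each_valid_idx(|idx| {
            valid.push(idx);
            Ok::<_, ()>(())
        })
        .unwrap();
    assert_eq!(valid, vec![0]);

    // When neither side has a validity mask there is nothing to merge.
    let c = Int32Array::from(vec![1, 2, 3]);
    assert!(NullBuffer::union(c.nulls(), c.nulls()).is_none());
}
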
diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 89fdca507b00..a7ed870ed5cb 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -19,10 +19,8 @@ use crate::array::print_long_array; use crate::builder::BooleanBuilder; use crate::iterator::BooleanIter; use crate::{Array, ArrayAccessor, ArrayRef}; -use arrow_buffer::buffer::NullBuffer; -use arrow_buffer::{bit_util, Buffer, MutableBuffer}; -use arrow_data::bit_mask::combine_option_bitmap; -use arrow_data::ArrayData; +use arrow_buffer::{bit_util, Buffer, MutableBuffer, NullBuffer}; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; @@ -189,7 +187,7 @@ impl BooleanArray { where F: FnMut(T::Item) -> bool, { - let null_bit_buffer = left.data().nulls().map(|x| x.inner().sliced()); + let null_bit_buffer = left.nulls().map(|x| x.inner().sliced()); let buffer = MutableBuffer::collect_bool(left.len(), |i| unsafe { // SAFETY: i in range 0..len op(left.value_unchecked(i)) @@ -235,24 +233,18 @@ impl BooleanArray { { assert_eq!(left.len(), right.len()); - let null_bit_buffer = - combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len()); - + let nulls = NullBuffer::union(left.nulls(), right.nulls()); let buffer = MutableBuffer::collect_bool(left.len(), |i| unsafe { // SAFETY: i in range 0..len op(left.value_unchecked(i), right.value_unchecked(i)) }); let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![Buffer::from(buffer)], - vec![], - ) + ArrayDataBuilder::new(DataType::Boolean) + .len(left.len()) + .nulls(nulls) + .buffers(vec![buffer.into()]) + .build_unchecked() }; Self::from(data) } @@ -470,7 +462,7 @@ mod tests { assert_eq!(4, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); - assert!(arr.data().nulls().is_none()); + assert!(arr.nulls().is_none()); for i in 0..3 { assert!(!arr.is_null(i)); assert!(arr.is_valid(i)); @@ -485,7 +477,7 @@ mod tests { assert_eq!(4, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(2, arr.null_count()); - assert!(arr.data().nulls().is_some()); + assert!(arr.nulls().is_some()); assert!(arr.is_valid(0)); assert!(arr.is_null(1)); diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 4a484de78828..78859bd5956f 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -25,9 +25,11 @@ use crate::timezone::Tz; use crate::trusted_len::trusted_len_unzip; use crate::types::*; use crate::{Array, ArrayAccessor, ArrayRef}; -use arrow_buffer::{i256, ArrowNativeType, Buffer, NullBuffer, ScalarBuffer}; +use arrow_buffer::{ + i256, ArrowNativeType, BooleanBuffer, Buffer, NullBuffer, ScalarBuffer, +}; use arrow_data::bit_iterator::try_for_each_valid_idx; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime}; use half::f16; @@ -344,7 +346,7 @@ impl PrimitiveArray { pub fn from_value(value: T::Native, count: usize) -> Self { unsafe { let val_buf = Buffer::from_trusted_len_iter((0..count).map(|_| value)); - build_primitive_array(count, val_buf, 0, None) + build_primitive_array(count, val_buf, None) } } @@ -421,9 +423,8 @@ impl PrimitiveArray { { let data = self.data(); let len = self.len(); - let null_count = self.null_count(); - let null_buffer = 
data.nulls().map(|b| b.inner().sliced()); + let nulls = data.nulls().cloned(); let values = self.values().iter().map(|v| op(*v)); // JUSTIFICATION // Benefit @@ -431,7 +432,7 @@ impl PrimitiveArray { // Soundness // `values` is an iterator with a known size because arrays are sized. let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - unsafe { build_primitive_array(len, buffer, null_count, null_buffer) } + unsafe { build_primitive_array(len, buffer, nulls) } } /// Applies an unary and infallible function to a mutable primitive array. @@ -478,21 +479,23 @@ impl PrimitiveArray { { let data = self.data(); let len = self.len(); - let null_count = self.null_count(); - let null_buffer = data.nulls().map(|b| b.inner().sliced()); + let nulls = data.nulls().cloned(); let mut buffer = BufferBuilder::::new(len); buffer.append_n_zeroed(len); let slice = buffer.as_slice_mut(); - try_for_each_valid_idx(len, 0, null_count, null_buffer.as_deref(), |idx| { + let f = |idx| { unsafe { *slice.get_unchecked_mut(idx) = op(self.value_unchecked(idx))? }; Ok::<_, E>(()) - })?; + }; - Ok(unsafe { - build_primitive_array(len, buffer.finish(), null_count, null_buffer) - }) + match &nulls { + Some(nulls) => nulls.try_for_each_valid_idx(f)?, + None => (0..len).try_for_each(f)?, + } + + Ok(unsafe { build_primitive_array(len, buffer.finish(), nulls) }) } /// Applies an unary and fallible function to all valid values in a mutable primitive array. @@ -575,12 +578,12 @@ impl PrimitiveArray { Ok::<_, ()>(()) }); + let nulls = BooleanBuffer::new(null_builder.finish(), 0, len); unsafe { build_primitive_array( len, buffer.finish(), - out_null_count, - Some(null_builder.finish()), + Some(NullBuffer::new_unchecked(nulls, out_null_count)), ) } } @@ -648,18 +651,15 @@ impl PrimitiveArray { unsafe fn build_primitive_array( len: usize, buffer: Buffer, - null_count: usize, - null_buffer: Option, + nulls: Option, ) -> PrimitiveArray { - PrimitiveArray::from(ArrayData::new_unchecked( - O::DATA_TYPE, - len, - Some(null_count), - null_buffer, - 0, - vec![buffer], - vec![], - )) + PrimitiveArray::from( + ArrayDataBuilder::new(O::DATA_TYPE) + .len(len) + .buffers(vec![buffer]) + .nulls(nulls) + .build_unchecked(), + ) } impl From> for ArrayData { @@ -1782,7 +1782,7 @@ mod tests { let primitive_array = PrimitiveArray::::from_iter(iter); assert_eq!(primitive_array.len(), 10); assert_eq!(primitive_array.null_count(), 0); - assert!(primitive_array.data().nulls().is_none()); + assert!(primitive_array.nulls().is_none()); assert_eq!(primitive_array.values(), &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) } diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index c723b3349930..a3598d8bf26d 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -425,7 +425,7 @@ mod tests { builder.append_value("parquet"); let arr = builder.finish(); // array should not have null buffer because there is not `null` value. 
- assert!(arr.data().nulls().is_none()); + assert!(arr.nulls().is_none()); assert_eq!(GenericStringArray::::from(vec!["arrow", "parquet"]), arr,) } @@ -454,7 +454,7 @@ mod tests { builder.append_value("parquet"); arr = builder.finish(); - assert!(arr.data().nulls().is_some()); + assert!(arr.nulls().is_some()); assert_eq!(&[O::zero()], builder.offsets_slice()); assert_eq!(5, arr.len()); } diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 8a7f279f32ed..43b74c6031af 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -16,7 +16,8 @@ // under the License. use crate::bit_chunk_iterator::BitChunks; -use crate::{bit_util, Buffer}; +use crate::{bit_util, buffer_bin_and, buffer_bin_or, Buffer}; +use std::ops::{BitAnd, BitOr}; /// A slice-able [`Buffer`] containing bit-packed booleans #[derive(Debug, Clone, Eq)] @@ -145,3 +146,41 @@ impl BooleanBuffer { self.buffer } } + +impl BitAnd<&BooleanBuffer> for &BooleanBuffer { + type Output = BooleanBuffer; + + fn bitand(self, rhs: &BooleanBuffer) -> Self::Output { + assert_eq!(self.len, rhs.len); + BooleanBuffer { + buffer: buffer_bin_and( + &self.buffer, + self.offset, + &rhs.buffer, + rhs.offset, + self.len, + ), + offset: 0, + len: self.len, + } + } +} + +impl BitOr<&BooleanBuffer> for &BooleanBuffer { + type Output = BooleanBuffer; + + fn bitor(self, rhs: &BooleanBuffer) -> Self::Output { + assert_eq!(self.len, rhs.len); + BooleanBuffer { + buffer: buffer_bin_or( + &self.buffer, + self.offset, + &rhs.buffer, + rhs.offset, + self.len, + ), + offset: 0, + len: self.len, + } + } +} diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index a4854f1adfed..2f8c864ca957 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::bit_iterator::BitIndexIterator; use crate::buffer::BooleanBuffer; use crate::{Buffer, MutableBuffer}; @@ -50,6 +51,22 @@ impl NullBuffer { Self { buffer, null_count } } + /// Computes the union of the nulls in two optional [`NullBuffer`] + /// + /// This is commonly used by binary operations where the result is NULL if either + /// of the input values is NULL. 
Handling the null mask separately in this way + /// can yield significant performance improvements over an iterator approach + pub fn union( + lhs: Option<&NullBuffer>, + rhs: Option<&NullBuffer>, + ) -> Option { + match (lhs, rhs) { + (Some(lhs), Some(rhs)) => Some(Self::new(lhs.inner() & rhs.inner())), + (Some(n), None) | (None, Some(n)) => Some(n.clone()), + (None, None) => None, + } + } + /// Returns the length of this [`NullBuffer`] #[inline] pub fn len(&self) -> usize { @@ -97,12 +114,30 @@ impl NullBuffer { Self::new(self.buffer.slice(offset, len)) } + /// Calls the provided closure for each index in this null mask that is set + #[inline] + pub fn try_for_each_valid_idx Result<(), E>>( + &self, + f: F, + ) -> Result<(), E> { + if self.null_count == self.len() { + return Ok(()); + } + BitIndexIterator::new(self.validity(), self.offset(), self.len()).try_for_each(f) + } + /// Returns the inner [`BooleanBuffer`] #[inline] pub fn inner(&self) -> &BooleanBuffer { &self.buffer } + /// Returns the inner [`BooleanBuffer`] + #[inline] + pub fn into_inner(self) -> BooleanBuffer { + self.buffer + } + /// Returns the underlying [`Buffer`] #[inline] pub fn buffer(&self) -> &Buffer { diff --git a/arrow-data/src/bit_iterator.rs b/arrow-buffer/src/util/bit_iterator.rs similarity index 98% rename from arrow-data/src/bit_iterator.rs rename to arrow-buffer/src/util/bit_iterator.rs index 39898b1c4295..1a8dd9226318 100644 --- a/arrow-data/src/bit_iterator.rs +++ b/arrow-buffer/src/util/bit_iterator.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. -use arrow_buffer::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator}; -use arrow_buffer::bit_util::{ceil, get_bit_raw}; -use std::result::Result; +use crate::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator}; +use crate::bit_util::{ceil, get_bit_raw}; /// Iterator over the bits within a packed bitmask /// diff --git a/arrow-buffer/src/util/mod.rs b/arrow-buffer/src/util/mod.rs index c1cb284dcc1f..0f1825eae9d4 100644 --- a/arrow-buffer/src/util/mod.rs +++ b/arrow-buffer/src/util/mod.rs @@ -16,4 +16,5 @@ // under the License. 
pub mod bit_chunk_iterator; +pub mod bit_iterator; pub mod bit_util; diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 72d1bc1cb254..43048c2aba45 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -3324,7 +3324,7 @@ fn cast_primitive_to_list( to_type.clone(), array.len(), Some(cast_array.null_count()), - cast_array.data().nulls().map(|b| b.inner().sliced()), + cast_array.nulls().map(|b| b.inner().sliced()), 0, vec![offsets.into()], vec![cast_array.into_data()], diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs index 2b105f5bb040..15f6acd2c97d 100644 --- a/arrow-data/src/lib.rs +++ b/arrow-data/src/lib.rs @@ -23,7 +23,7 @@ pub use data::*; mod equal; pub mod transform; -pub mod bit_iterator; +pub use arrow_buffer::bit_iterator; pub mod bit_mask; pub mod decimal; diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index e0853da32e80..76760f8bc4f5 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -104,7 +104,7 @@ pub fn eq_utf8( fn utf8_empty( left: &GenericStringArray, ) -> Result { - let null_bit_buffer = left.data().nulls().map(|b| b.inner().sliced()); + let null_bit_buffer = left.nulls().map(|b| b.inner().sliced()); let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(left.value_offsets().windows(2).map( diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index 833baac7b655..e232e717c9e8 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -168,7 +168,7 @@ pub unsafe fn decode( let builder = ArrayDataBuilder::new(field.data_type.clone()) .len(rows.len()) - .nulls(canonical.data().nulls().cloned()) + .nulls(canonical.nulls().cloned()) .add_buffer(offsets.finish()) .add_child_data(child_data); diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 68b22f6feabc..741b05493ea4 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -525,11 +525,11 @@ where IndexType::Native: ToPrimitive, { let val_buf = take_bits(values.values(), values.offset(), indices)?; - let null_buf = match values.data().nulls() { + let null_buf = match values.nulls() { Some(nulls) if nulls.null_count() > 0 => { Some(take_bits(nulls.buffer(), nulls.offset(), indices)?) 
} - _ => indices.data().nulls().map(|b| b.inner().sliced()), + _ => indices.nulls().map(|b| b.inner().sliced()), }; let data = unsafe { @@ -618,7 +618,7 @@ where } *offset = length_so_far; } - nulls = indices.data().nulls().map(|b| b.inner().sliced()); + nulls = indices.nulls().map(|b| b.inner().sliced()); } else { let num_bytes = bit_util::ceil(data_len, 8); @@ -1631,7 +1631,7 @@ mod tests { let expected_list_data = ArrayData::builder(list_data_type) .len(5) // null buffer remains the same as only the indices have nulls - .nulls(index.data().nulls().cloned()) + .nulls(index.nulls().cloned()) .add_buffer(expected_offsets) .add_child_data(expected_data) .build() @@ -1705,7 +1705,7 @@ mod tests { let expected_list_data = ArrayData::builder(list_data_type) .len(5) // null buffer remains the same as only the indices have nulls - .nulls(index.data().nulls().cloned()) + .nulls(index.nulls().cloned()) .add_buffer(expected_offsets) .add_child_data(expected_data) .build() diff --git a/arrow-string/src/concat_elements.rs b/arrow-string/src/concat_elements.rs index 1f85b4deb549..4da9e2539e7e 100644 --- a/arrow-string/src/concat_elements.rs +++ b/arrow-string/src/concat_elements.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use arrow_array::builder::BufferBuilder; use arrow_array::types::ByteArrayType; use arrow_array::*; -use arrow_buffer::ArrowNativeType; +use arrow_buffer::{ArrowNativeType, NullBuffer}; use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayDataBuilder; use arrow_schema::{ArrowError, DataType}; @@ -38,7 +38,7 @@ pub fn concat_elements_bytes( ))); } - let output_bitmap = combine_option_bitmap(&[left.data(), right.data()], left.len()); + let nulls = NullBuffer::union(left.nulls(), right.nulls()); let left_offsets = left.value_offsets(); let right_offsets = right.value_offsets(); @@ -67,7 +67,7 @@ pub fn concat_elements_bytes( .len(left.len()) .add_buffer(output_offsets.finish()) .add_buffer(output_values.finish()) - .null_bit_buffer(output_bitmap); + .nulls(nulls); // SAFETY - offsets valid by construction Ok(unsafe { builder.build_unchecked() }.into()) diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index acef1da51aad..a48fc13409f1 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -37,7 +37,7 @@ macro_rules! unary_offsets { // `values` come from a slice iterator with a known size. 
let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) }; - let null_bit_buffer = $array.data().nulls().map(|b| b.inner().sliced()); + let null_bit_buffer = $array.nulls().map(|b| b.inner().sliced()); let data = unsafe { ArrayData::new_unchecked( diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 9ae635e0c520..e8ec699969bd 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -18,8 +18,8 @@ use arrow_array::builder::BooleanBufferBuilder; use arrow_array::cast::*; use arrow_array::*; -use arrow_data::bit_mask::combine_option_bitmap; -use arrow_data::ArrayData; +use arrow_buffer::NullBuffer; +use arrow_data::ArrayDataBuilder; use arrow_schema::*; use arrow_select::take::take; use regex::Regex; @@ -581,8 +581,7 @@ where )); } - let null_bit_buffer = - combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len()); + let nulls = NullBuffer::union(left.nulls(), right.nulls()); let mut result = BooleanBufferBuilder::new(left.len()); for i in 0..left.len() { @@ -605,15 +604,11 @@ where } let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ) + ArrayDataBuilder::new(DataType::Boolean) + .len(left.len()) + .nulls(nulls) + .buffers(vec![result.finish()]) + .build_unchecked() }; Ok(BooleanArray::from(data)) } diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index ec785c01e818..7ccd450de8d2 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -20,8 +20,8 @@ use arrow_array::builder::{BooleanBufferBuilder, GenericStringBuilder, ListBuilder}; use arrow_array::*; -use arrow_data::bit_mask::combine_option_bitmap; -use arrow_data::ArrayData; +use arrow_buffer::NullBuffer; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use regex::Regex; use std::collections::HashMap; @@ -45,8 +45,7 @@ pub fn regexp_is_match_utf8( .to_string(), )); } - let null_bit_buffer = - combine_option_bitmap(&[array.data_ref(), regex_array.data_ref()], array.len()); + let nulls = NullBuffer::union(array.nulls(), regex_array.nulls()); let mut patterns: HashMap = HashMap::new(); let mut result = BooleanBufferBuilder::new(array.len()); @@ -100,15 +99,11 @@ pub fn regexp_is_match_utf8( .collect::, ArrowError>>()?; let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - array.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ) + ArrayDataBuilder::new(DataType::Boolean) + .len(array.len()) + .buffers(vec![result.finish()]) + .nulls(nulls) + .build_unchecked() }; Ok(BooleanArray::from(data)) } diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index de4cba4adb33..f21931d00884 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -329,7 +329,7 @@ impl LevelInfoBuilder { } }; - match array.data().nulls() { + match array.nulls() { Some(validity) => { let mut last_non_null_idx = None; let mut last_null_idx = None; @@ -378,7 +378,7 @@ impl LevelInfoBuilder { def_levels.reserve(len); info.non_null_indices.reserve(len); - match array.data().nulls() { + match array.nulls() { Some(nulls) => { // TODO: Faster bitmask iteration (#1757) for i in range { From 5d3307a03ffaa749e9edb574496d43029964cd94 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 21 Mar 2023 14:34:05 +0000 Subject: [PATCH 0704/1411] Remove old object_store 
releases automatically (#3892) --- object_store/dev/release/release-tarball.sh | 3 ++ .../dev/release/remove-old-releases.sh | 45 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100755 object_store/dev/release/remove-old-releases.sh diff --git a/object_store/dev/release/release-tarball.sh b/object_store/dev/release/release-tarball.sh index 75ff886c6b1e..958118639143 100755 --- a/object_store/dev/release/release-tarball.sh +++ b/object_store/dev/release/release-tarball.sh @@ -74,3 +74,6 @@ rm -rf ${tmp_dir} echo "Success!" echo "The release is available here:" echo " https://dist.apache.org/repos/dist/release/arrow/${release_version}" + +echo "Clean up old versions from svn" +"${SOURCE_TOP_DIR}"/dev/release/remove-old-releases.sh diff --git a/object_store/dev/release/remove-old-releases.sh b/object_store/dev/release/remove-old-releases.sh new file mode 100755 index 000000000000..c8bd8b748396 --- /dev/null +++ b/object_store/dev/release/remove-old-releases.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# This script removes all but the most recent versions of arrow-rs +# from svn +# +# The older versions are in SVN history as well as available on the +# archive page https://archive.apache.org/dist/ +# +# See +# https://infra.apache.org/release-download-pages.html + +set -e +set -u + +svn_base="https://dist.apache.org/repos/dist/release/arrow" + +echo "Remove all but the most recent version" +old_releases=$( + svn ls ${svn_base} | \ + grep -E '^arrow-object-store-rs-[0-9\.]+' | \ + sort --version-sort --reverse | \ + tail -n +2 +) +for old_release_version in $old_releases; do + echo "Remove old release ${old_release_version}" + svn delete -m "Removing ${old_release_version}" ${svn_base}/${old_release_version} +done From 90cb00d2d4abef31513bdb633365ea02bce3e57c Mon Sep 17 00:00:00 2001 From: Satyam Singh Date: Tue, 21 Mar 2023 20:05:41 +0530 Subject: [PATCH 0705/1411] Add support for checksum algorithms in AWS (#3873) * Add support for checksum algorithms in aws * Remove other algorithms * Only set when checksum algorithm is sha256 * Fix --- object_store/src/aws/checksum.rs | 51 ++++++++++++++++++++++++++++++ object_store/src/aws/client.rs | 24 ++++++++++++-- object_store/src/aws/credential.rs | 22 ++++++++----- object_store/src/aws/mod.rs | 37 ++++++++++++++++++++++ 4 files changed, 124 insertions(+), 10 deletions(-) create mode 100644 object_store/src/aws/checksum.rs diff --git a/object_store/src/aws/checksum.rs b/object_store/src/aws/checksum.rs new file mode 100644 index 000000000000..ae35f0612456 --- /dev/null +++ b/object_store/src/aws/checksum.rs @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use ring::digest::{self, digest as ring_digest}; + +#[allow(non_camel_case_types)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +/// Enum representing checksum algorithm supported by S3. +pub enum Checksum { + /// SHA-256 algorithm. + SHA256, +} + +impl Checksum { + pub(super) fn digest(&self, bytes: &[u8]) -> Vec { + match self { + Self::SHA256 => ring_digest(&digest::SHA256, bytes).as_ref().to_owned(), + } + } + + pub(super) fn header_name(&self) -> &'static str { + match self { + Self::SHA256 => "x-amz-checksum-sha256", + } + } +} + +impl TryFrom<&String> for Checksum { + type Error = (); + + fn try_from(value: &String) -> Result { + match value.as_str() { + "sha256" => Ok(Self::SHA256), + _ => Err(()), + } + } +} diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 0b0f883b7e51..bd58d09676aa 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt, CredentialProvider}; use crate::aws::STRICT_PATH_ENCODE_SET; use crate::client::pagination::stream_paginated; @@ -26,6 +27,8 @@ use crate::{ BoxStream, ClientOptions, ListResult, MultipartId, ObjectMeta, Path, Result, RetryConfig, StreamExt, }; +use base64::prelude::BASE64_STANDARD; +use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use percent_encoding::{utf8_percent_encode, PercentEncode}; @@ -205,6 +208,7 @@ pub struct S3Config { pub retry_config: RetryConfig, pub client_options: ClientOptions, pub sign_payload: bool, + pub checksum: Option, } impl S3Config { @@ -262,6 +266,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, + None, ) .send_retry(&self.config.retry_config) .await @@ -281,10 +286,19 @@ impl S3Client { ) -> Result { let credential = self.get_credential().await?; let url = self.config.path_url(path); - let mut builder = self.client.request(Method::PUT, url); + let mut payload_sha256 = None; + if let Some(bytes) = bytes { - builder = builder.body(bytes) + if let Some(checksum) = self.config().checksum { + let digest = checksum.digest(&bytes); + builder = builder + .header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); + if checksum == Checksum::SHA256 { + payload_sha256 = Some(digest); + } + } + builder = builder.body(bytes); } if let Some(value) = self.config().client_options.get_content_type(path) { @@ -298,6 +312,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, + payload_sha256, ) .send_retry(&self.config.retry_config) .await @@ -325,6 +340,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, + None, ) .send_retry(&self.config.retry_config) .await @@ -349,6 +365,7 @@ impl S3Client { 
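The put_request change above digests the body once and reuses it twice. A rough sketch of what it derives for a SHA-256 body checksum, using the ring and base64 imports this patch adds; the helper name is illustrative, not part of the patch:

    use base64::prelude::BASE64_STANDARD;
    use base64::Engine;
    use ring::digest;

    // Returns the raw digest (passed on as the pre-calculated payload hash
    // for signing) and the base64 value for the x-amz-checksum-sha256 header.
    fn sha256_checksum(body: &[u8]) -> (Vec<u8>, String) {
        let digest = digest::digest(&digest::SHA256, body).as_ref().to_vec();
        let header_value = BASE64_STANDARD.encode(&digest);
        (digest, header_value)
    }
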
&self.config.region, "s3", self.config.sign_payload, + None, ) .send_retry(&self.config.retry_config) .await @@ -395,6 +412,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, + None, ) .send_retry(&self.config.retry_config) .await @@ -438,6 +456,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, + None, ) .send_retry(&self.config.retry_config) .await @@ -482,6 +501,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, + None, ) .send_retry(&self.config.retry_config) .await diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index 05f2c535bfdc..183e8434650b 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -84,7 +84,7 @@ const AUTH_HEADER: &str = "authorization"; const ALL_HEADERS: &[&str; 4] = &[DATE_HEADER, HASH_HEADER, TOKEN_HEADER, AUTH_HEADER]; impl<'a> RequestSigner<'a> { - fn sign(&self, request: &mut Request) { + fn sign(&self, request: &mut Request, pre_calculated_digest: Option>) { if let Some(ref token) = self.credential.token { let token_val = HeaderValue::from_str(token).unwrap(); request.headers_mut().insert(TOKEN_HEADER, token_val); @@ -101,9 +101,13 @@ impl<'a> RequestSigner<'a> { request.headers_mut().insert(DATE_HEADER, date_val); let digest = if self.sign_payload { - match request.body() { - None => EMPTY_SHA256_HASH.to_string(), - Some(body) => hex_digest(body.as_bytes().unwrap()), + if let Some(digest) = pre_calculated_digest { + hex_encode(&digest) + } else { + match request.body() { + None => EMPTY_SHA256_HASH.to_string(), + Some(body) => hex_digest(body.as_bytes().unwrap()), + } } } else { UNSIGNED_PAYLOAD_LITERAL.to_string() @@ -165,6 +169,7 @@ pub trait CredentialExt { region: &str, service: &str, sign_payload: bool, + payload_sha256: Option>, ) -> Self; } @@ -175,6 +180,7 @@ impl CredentialExt for RequestBuilder { region: &str, service: &str, sign_payload: bool, + payload_sha256: Option>, ) -> Self { // Hack around lack of access to underlying request // https://github.com/seanmonstar/reqwest/issues/1212 @@ -193,7 +199,7 @@ impl CredentialExt for RequestBuilder { sign_payload, }; - signer.sign(&mut request); + signer.sign(&mut request, payload_sha256); for header in ALL_HEADERS { if let Some(val) = request.headers_mut().remove(*header) { @@ -627,7 +633,7 @@ mod tests { sign_payload: true, }; - signer.sign(&mut request); + signer.sign(&mut request, None); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=a3c787a7ed37f7fdfbfd2d7056a3d7c9d85e6d52a2bfbec73793c0be6e7862d4") } @@ -665,7 +671,7 @@ mod tests { sign_payload: false, }; - signer.sign(&mut request); + signer.sign(&mut request, None); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=653c3d8ea261fd826207df58bc2bb69fbb5003e9eb3c0ef06e4a51f2a81d8699") } @@ -702,7 +708,7 @@ mod tests { sign_payload: true, }; - signer.sign(&mut request); + signer.sign(&mut request, None); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=H20ABqCkLZID4rLe/20220809/us-east-1/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=9ebf2f92872066c99ac94e573b4e1b80f4dbb8a32b1e8e23178318746e7d1b4d") } diff --git 
a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index c724886cf0e6..7d10f3728238 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -47,6 +47,7 @@ use tokio::io::AsyncWrite; use tracing::info; use url::Url; +pub use crate::aws::checksum::Checksum; use crate::aws::client::{S3Client, S3Config}; use crate::aws::credential::{ AwsCredential, CredentialProvider, InstanceCredentialProvider, @@ -59,6 +60,7 @@ use crate::{ Result, RetryConfig, StreamExt, }; +mod checksum; mod client; mod credential; @@ -101,6 +103,9 @@ enum Error { source: std::num::ParseIntError, }, + #[snafu(display("Invalid Checksum algorithm"))] + InvalidChecksumAlgorithm, + #[snafu(display("Missing region"))] MissingRegion, @@ -386,6 +391,7 @@ pub struct AmazonS3Builder { imdsv1_fallback: bool, virtual_hosted_style_request: bool, unsigned_payload: bool, + checksum_algorithm: Option, metadata_endpoint: Option, profile: Option, client_options: ClientOptions, @@ -514,6 +520,11 @@ pub enum AmazonS3ConfigKey { /// - `unsigned_payload` UnsignedPayload, + /// Set the checksum algorithm for this client + /// + /// See [`AmazonS3Builder::with_checksum_algorithm`] + Checksum, + /// Set the instance metadata endpoint /// /// See [`AmazonS3Builder::with_metadata_endpoint`] for details. @@ -546,6 +557,7 @@ impl AsRef for AmazonS3ConfigKey { Self::MetadataEndpoint => "aws_metadata_endpoint", Self::Profile => "aws_profile", Self::UnsignedPayload => "aws_unsigned_payload", + Self::Checksum => "aws_checksum_algorithm", } } } @@ -575,6 +587,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_imdsv1_fallback" | "imdsv1_fallback" => Ok(Self::ImdsV1Fallback), "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), + "aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), } } @@ -694,6 +707,11 @@ impl AmazonS3Builder { AmazonS3ConfigKey::UnsignedPayload => { self.unsigned_payload = str_is_truthy(&value.into()) } + AmazonS3ConfigKey::Checksum => { + let algorithm = Checksum::try_from(&value.into()) + .map_err(|_| Error::InvalidChecksumAlgorithm)?; + self.checksum_algorithm = Some(algorithm) + } }; Ok(self) } @@ -846,6 +864,14 @@ impl AmazonS3Builder { self } + /// Sets the [checksum algorithm] which has to be used for object integrity check during upload. + /// + /// [checksum algorithm]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html + pub fn with_checksum_algorithm(mut self, checksum_algorithm: Checksum) -> Self { + self.checksum_algorithm = Some(checksum_algorithm); + self + } + /// Set the [instance metadata endpoint](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html), /// used primarily within AWS EC2. 
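A short usage sketch of the new builder option, with a placeholder bucket name and credentials taken from the environment; the same setting can also be supplied through the aws_checksum_algorithm config key or the AWS_CHECKSUM_ALGORITHM environment variable, as exercised in the tests further down:

    use object_store::aws::{AmazonS3, AmazonS3Builder, Checksum};

    // Request SHA-256 object integrity checks on upload; "example-bucket"
    // is a placeholder value.
    fn example_client() -> object_store::Result<AmazonS3> {
        AmazonS3Builder::from_env()
            .with_bucket_name("example-bucket")
            .with_checksum_algorithm(Checksum::SHA256)
            .build()
    }
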
/// @@ -992,6 +1018,7 @@ impl AmazonS3Builder { retry_config: self.retry_config, client_options: self.client_options, sign_payload: !self.unsigned_payload, + checksum: self.checksum_algorithm, }; let client = Arc::new(S3Client::new(config)?); @@ -1151,6 +1178,7 @@ mod tests { &container_creds_relative_uri, ); env::set_var("AWS_UNSIGNED_PAYLOAD", "true"); + env::set_var("AWS_CHECKSUM_ALGORITHM", "sha256"); let builder = AmazonS3Builder::from_env(); assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); @@ -1164,6 +1192,7 @@ mod tests { assert_eq!(builder.token.unwrap(), aws_session_token); let metadata_uri = format!("{METADATA_ENDPOINT}{container_creds_relative_uri}"); assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); + assert_eq!(builder.checksum_algorithm.unwrap(), Checksum::SHA256); assert!(builder.unsigned_payload); } @@ -1181,6 +1210,7 @@ mod tests { ("aws_endpoint", aws_endpoint.clone()), ("aws_session_token", aws_session_token.clone()), ("aws_unsigned_payload", "true".to_string()), + ("aws_checksum_algorithm", "sha256".to_string()), ]); let builder = AmazonS3Builder::new() @@ -1193,6 +1223,7 @@ mod tests { assert_eq!(builder.region.unwrap(), aws_default_region); assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); + assert_eq!(builder.checksum_algorithm.unwrap(), Checksum::SHA256); assert!(builder.unsigned_payload); } @@ -1256,6 +1287,12 @@ mod tests { let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); let integration = config.build().unwrap(); put_get_delete_list_opts(&integration, is_local).await; + + // run integration test with checksum set to sha256 + let config = maybe_skip_integration!().with_checksum_algorithm(Checksum::SHA256); + let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); + let integration = config.build().unwrap(); + put_get_delete_list_opts(&integration, is_local).await; } #[tokio::test] From 3a6f8bdb5d0c9b3340b963ed7b23f5fd3267454d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 21 Mar 2023 16:38:09 +0000 Subject: [PATCH 0706/1411] Fix pyarrow memory leak (#3683) (#3893) * Fix pyarrow memory leak (#3683) Remove ArrowArray::into_raw and try_from_raw * Update docs * Further deprecation * Clippy --- arrow-data/src/ffi.rs | 3 -- arrow/src/array/ffi.rs | 18 ++---------- arrow/src/array/mod.rs | 2 +- arrow/src/ffi.rs | 63 +++++++----------------------------------- arrow/src/pyarrow.rs | 10 +++---- 5 files changed, 18 insertions(+), 78 deletions(-) diff --git a/arrow-data/src/ffi.rs b/arrow-data/src/ffi.rs index b7d690fb9124..7623ced043cc 100644 --- a/arrow-data/src/ffi.rs +++ b/arrow-data/src/ffi.rs @@ -118,9 +118,6 @@ struct ArrayPrivateData { impl FFI_ArrowArray { /// creates a new `FFI_ArrowArray` from existing data. - /// # Memory Leaks - /// This method releases `buffers`. Consumers of this struct *must* call `release` before - /// releasing this struct, or contents in `buffers` leak. 
pub fn new(data: &ArrayData) -> Self { let data_layout = layout(data.data_type()); diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index 0751fe2c0f2d..5f556dfff587 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -25,7 +25,7 @@ use crate::{ ffi::ArrowArrayRef, }; -use super::{make_array, ArrayData, ArrayRef}; +use super::{ArrayData, ArrayRef}; impl TryFrom for ArrayData { type Error = ArrowError; @@ -43,21 +43,6 @@ impl TryFrom for ffi::ArrowArray { } } -/// Creates a new array from two FFI pointers. Used to import arrays from the C Data Interface -/// # Safety -/// Assumes that these pointers represent valid C Data Interfaces, both in memory -/// representation and lifetime via the `release` mechanism. -#[deprecated(note = "Use ArrowArray::new")] -#[allow(deprecated)] -pub unsafe fn make_array_from_raw( - array: *const ffi::FFI_ArrowArray, - schema: *const ffi::FFI_ArrowSchema, -) -> Result { - let array = ffi::ArrowArray::try_from_raw(array, schema)?; - let data = ArrayData::try_from(array)?; - Ok(make_array(data)) -} - /// Exports an array to raw pointers of the C Data Interface provided by the consumer. /// # Safety /// Assumes that these pointers represent valid C Data Interfaces, both in memory @@ -66,6 +51,7 @@ pub unsafe fn make_array_from_raw( /// This function copies the content of two FFI structs [ffi::FFI_ArrowArray] and /// [ffi::FFI_ArrowSchema] in the array to the location pointed by the raw pointers. /// Usually the raw pointers are provided by the array data consumer. +#[deprecated(note = "Use FFI_ArrowArray::new and FFI_ArrowSchema::try_from")] pub unsafe fn export_array_into_raw( src: ArrayRef, out_array: *mut ffi::FFI_ArrowArray, diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 09348996eafa..ff3a170c698a 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -35,7 +35,7 @@ pub use arrow_data::transform::{Capacities, MutableArrayData}; #[cfg(feature = "ffi")] #[allow(deprecated)] -pub use self::ffi::{export_array_into_raw, make_array_from_raw}; +pub use self::ffi::export_array_into_raw; // --------------------- Array's values comparison --------------------- diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 333d6425a38e..9d0ed0b85fb6 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -394,21 +394,15 @@ pub trait ArrowArrayRef { /// Its main responsibility is to expose functionality that requires /// both [FFI_ArrowArray] and [FFI_ArrowSchema]. /// -/// This struct has two main paths: -/// /// ## Import from the C Data Interface -/// * [ArrowArray::empty] to allocate memory to be filled by an external call -/// * [ArrowArray::try_from_raw] to consume two non-null allocated pointers +/// * [ArrowArray::new] to create an array from [`FFI_ArrowArray`] and [`FFI_ArrowSchema`] +/// /// ## Export to the C Data Interface -/// * [ArrowArray::try_new] to create a new [ArrowArray] from Rust-specific information -/// * [ArrowArray::into_raw] to expose two pointers for [FFI_ArrowArray] and [FFI_ArrowSchema]. +/// * Use [`FFI_ArrowArray`] and [`FFI_ArrowSchema`] directly /// /// # Safety -/// Whoever creates this struct is responsible for releasing their resources. Specifically, -/// consumers *must* call [ArrowArray::into_raw] and take ownership of the individual pointers, -/// calling [FFI_ArrowArray::release] and [FFI_ArrowSchema::release] accordingly. /// -/// Furthermore, this struct assumes that the incoming data agrees with the C data interface. 
+/// This struct assumes that the incoming data agrees with the C data interface. #[derive(Debug)] pub struct ArrowArray { pub(crate) array: Arc, @@ -480,38 +474,6 @@ impl ArrowArray { Ok(ArrowArray { array, schema }) } - /// creates a new [ArrowArray] from two pointers. Used to import from the C Data Interface. - /// # Safety - /// See safety of [ArrowArray] - /// Note that this function will copy the content pointed by the raw pointers. Considering - /// the raw pointers can be from `Arc::into_raw` or other raw pointers, users must be responsible - /// on managing the allocation of the structs by themselves. - /// # Error - /// Errors if any of the pointers is null - #[deprecated(note = "Use ArrowArray::new")] - pub unsafe fn try_from_raw( - array: *const FFI_ArrowArray, - schema: *const FFI_ArrowSchema, - ) -> Result { - if array.is_null() || schema.is_null() { - return Err(ArrowError::MemoryError( - "At least one of the pointers passed to `try_from_raw` is null" - .to_string(), - )); - }; - - let array_mut = array as *mut FFI_ArrowArray; - let schema_mut = schema as *mut FFI_ArrowSchema; - - let array_data = std::ptr::replace(array_mut, FFI_ArrowArray::empty()); - let schema_data = std::ptr::replace(schema_mut, FFI_ArrowSchema::empty()); - - Ok(Self { - array: Arc::new(array_data), - schema: Arc::new(schema_data), - }) - } - /// creates a new empty [ArrowArray]. Used to import from the C Data Interface. /// # Safety /// See safety of [ArrowArray] @@ -520,23 +482,16 @@ impl ArrowArray { let array = Arc::new(FFI_ArrowArray::empty()); ArrowArray { array, schema } } - - /// exports [ArrowArray] to the C Data Interface - #[deprecated(note = "Use FFI_ArrowArray and FFI_ArrowSchema directly")] - pub fn into_raw(this: ArrowArray) -> (*const FFI_ArrowArray, *const FFI_ArrowSchema) { - (Arc::into_raw(this.array), Arc::into_raw(this.schema)) - } } #[cfg(test)] mod tests { use super::*; use crate::array::{ - export_array_into_raw, make_array, Array, ArrayData, BooleanArray, - Decimal128Array, DictionaryArray, DurationSecondArray, FixedSizeBinaryArray, - FixedSizeListArray, GenericBinaryArray, GenericListArray, GenericStringArray, - Int32Array, MapArray, OffsetSizeTrait, Time32MillisecondArray, - TimestampMillisecondArray, UInt32Array, + make_array, Array, ArrayData, BooleanArray, Decimal128Array, DictionaryArray, + DurationSecondArray, FixedSizeBinaryArray, FixedSizeListArray, + GenericBinaryArray, GenericListArray, GenericStringArray, Int32Array, MapArray, + OffsetSizeTrait, Time32MillisecondArray, TimestampMillisecondArray, UInt32Array, }; use crate::compute::kernels; use crate::datatypes::{Field, Int8Type}; @@ -1040,7 +995,9 @@ mod tests { } #[test] + #[allow(deprecated)] fn test_export_array_into_raw() -> Result<()> { + use crate::array::export_array_into_raw; let array = make_array(Int32Array::from(vec![1, 2, 3]).into_data()); // Assume two raw pointers provided by the consumer diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 110fd9cfaa82..8cc08988cbe6 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -19,7 +19,7 @@ //! arrays from and to Python. 
use std::convert::{From, TryFrom}; -use std::ptr::addr_of_mut; +use std::ptr::{addr_of, addr_of_mut}; use std::sync::Arc; use pyo3::ffi::Py_uintptr_t; @@ -133,16 +133,16 @@ impl PyArrowConvert for ArrayData { } fn to_pyarrow(&self, py: Python) -> PyResult { - let array = ffi::ArrowArray::try_from(self.clone()).map_err(to_py_err)?; - let (array_pointer, schema_pointer) = ffi::ArrowArray::into_raw(array); + let array = FFI_ArrowArray::new(self); + let schema = FFI_ArrowSchema::try_from(self.data_type()).map_err(to_py_err)?; let module = py.import("pyarrow")?; let class = module.getattr("Array")?; let array = class.call_method1( "_import_from_c", ( - array_pointer as Py_uintptr_t, - schema_pointer as Py_uintptr_t, + addr_of!(array) as Py_uintptr_t, + addr_of!(schema) as Py_uintptr_t, ), )?; Ok(array.to_object(py)) From ae4db601642c752b63a0331a4545ee71f8b6d7cd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 21 Mar 2023 19:01:03 +0000 Subject: [PATCH 0707/1411] Revert structured ArrayData (#1799) (#3894) --- arrow-data/src/data/boolean.rs | 139 -------- arrow-data/src/data/buffers.rs | 10 - arrow-data/src/data/bytes.rs | 559 ------------------------------ arrow-data/src/data/dictionary.rs | 289 --------------- arrow-data/src/data/list.rs | 422 ---------------------- arrow-data/src/data/mod.rs | 230 ++++-------- arrow-data/src/data/null.rs | 104 ------ arrow-data/src/data/primitive.rs | 304 ---------------- arrow-data/src/data/run.rs | 277 --------------- arrow-data/src/data/struct.rs | 129 ------- arrow-data/src/data/types.rs | 152 -------- arrow-data/src/data/union.rs | 171 --------- 12 files changed, 72 insertions(+), 2714 deletions(-) delete mode 100644 arrow-data/src/data/boolean.rs delete mode 100644 arrow-data/src/data/bytes.rs delete mode 100644 arrow-data/src/data/dictionary.rs delete mode 100644 arrow-data/src/data/list.rs delete mode 100644 arrow-data/src/data/null.rs delete mode 100644 arrow-data/src/data/primitive.rs delete mode 100644 arrow-data/src/data/run.rs delete mode 100644 arrow-data/src/data/struct.rs delete mode 100644 arrow-data/src/data/types.rs delete mode 100644 arrow-data/src/data/union.rs diff --git a/arrow-data/src/data/boolean.rs b/arrow-data/src/data/boolean.rs deleted file mode 100644 index 258624cc1c66..000000000000 --- a/arrow-data/src/data/boolean.rs +++ /dev/null @@ -1,139 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
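The pyarrow change above shows the pattern that replaces the removed into_raw/try_from_raw pair: export by constructing the FFI structs directly from ArrayData, import by handing owned FFI structs to ArrowArray::new. A condensed sketch of both directions, following the calls used in this patch (function names are illustrative):

    use arrow::array::{make_array, ArrayData, ArrayRef};
    use arrow::error::Result;
    use arrow::ffi::{ArrowArray, FFI_ArrowArray, FFI_ArrowSchema};

    // Export: build the C Data Interface structs straight from ArrayData,
    // no raw pointers or Arc::into_raw involved.
    fn export(data: &ArrayData) -> Result<(FFI_ArrowArray, FFI_ArrowSchema)> {
        let array = FFI_ArrowArray::new(data);
        let schema = FFI_ArrowSchema::try_from(data.data_type())?;
        Ok((array, schema))
    }

    // Import: take ownership of the FFI structs instead of dereferencing
    // consumer-provided pointers.
    fn import(array: FFI_ArrowArray, schema: FFI_ArrowSchema) -> Result<ArrayRef> {
        let data = ArrayData::try_from(ArrowArray::new(array, schema))?;
        Ok(make_array(data))
    }
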
- -use crate::data::types::PhysicalType; -use crate::data::ArrayDataLayout; -use crate::{ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; -use arrow_schema::DataType; - -#[derive(Debug, Clone)] -pub struct BooleanArrayData { - data_type: DataType, - values: BooleanBuffer, - nulls: Option, -} - -impl BooleanArrayData { - /// Create a new [`BooleanArrayData`] - /// - /// # Panics - /// - /// Panics if - /// - `nulls` and `values` are different lengths - /// - `PhysicalType::from(&data_type) != PhysicalType::Boolean` - pub fn new( - data_type: DataType, - values: BooleanBuffer, - nulls: Option, - ) -> Self { - let physical = PhysicalType::from(&data_type); - assert_eq!( - physical, PhysicalType::Boolean, - "Illegal physical type for BooleanArrayData of datatype {:?}, expected {:?} got {:?}", - data_type, - PhysicalType::Boolean, - physical - ); - - if let Some(n) = nulls.as_ref() { - assert_eq!(values.len(), n.len()) - } - Self { - data_type, - values, - nulls, - } - } - - /// Create a new [`BooleanArrayData`] - /// - /// # Safety - /// - /// - `nulls` and `values` are the same lengths - /// - `PhysicalType::from(&data_type) == PhysicalType::Boolean` - pub unsafe fn new_unchecked( - data_type: DataType, - values: BooleanBuffer, - nulls: Option, - ) -> Self { - Self { - data_type, - values, - nulls, - } - } - - /// Creates a new [`BooleanArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`BooleanArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - let values = builder.buffers.into_iter().next().unwrap(); - let values = BooleanBuffer::new(values, builder.offset, builder.len); - Self { - values, - data_type: builder.data_type, - nulls: builder.nulls, - } - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the boolean values - #[inline] - pub fn values(&self) -> &BooleanBuffer { - &self.values - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`BooleanArrayData`] - pub fn into_parts(self) -> (DataType, BooleanBuffer, Option) { - (self.data_type, self.values, self.nulls) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - data_type: self.data_type.clone(), - values: self.values.slice(offset, len), - nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.values.len(), - offset: self.values.offset(), - nulls: self.nulls.as_ref(), - buffers: Buffers::one(self.values().inner()), - child_data: &[], - } - } -} diff --git a/arrow-data/src/data/buffers.rs b/arrow-data/src/data/buffers.rs index 8a498d319aae..883e92e36d82 100644 --- a/arrow-data/src/data/buffers.rs +++ b/arrow-data/src/data/buffers.rs @@ -33,16 +33,6 @@ impl<'a> Buffers<'a> { } } - #[inline] - pub(crate) fn one(b: &'a Buffer) -> Self { - Self([Some(b), None]) - } - - #[inline] - pub(crate) fn two(a: &'a Buffer, b: &'a Buffer) -> Self { - Self([Some(a), Some(b)]) - } - /// Returns the number of [`Buffer`] in this collection #[inline] pub fn len(&self) -> usize { diff --git a/arrow-data/src/data/bytes.rs b/arrow-data/src/data/bytes.rs deleted file mode 100644 index 
9ac267130b7a..000000000000 --- a/arrow-data/src/data/bytes.rs +++ /dev/null @@ -1,559 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::data::types::{BytesType, OffsetType}; -use crate::data::ArrayDataLayout; -use crate::{ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; -use arrow_buffer::{ArrowNativeType, Buffer}; -use arrow_schema::DataType; -use std::marker::PhantomData; - -mod private { - use super::*; - - pub trait BytesSealed { - /// Create from bytes without performing any validation - /// - /// # Safety - /// - /// If `str`, `b` must be a valid UTF-8 sequence - unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self; - - /// Downcast [`ArrayDataBytes`] to `[ArrayDataBytesOffset`] - fn downcast_ref(data: &ArrayDataBytes) -> Option<&ArrayDataBytesOffset> - where - Self: Bytes; - - /// Downcast [`ArrayDataBytes`] to `[ArrayDataBytesOffset`] - fn downcast(data: ArrayDataBytes) -> Option> - where - Self: Bytes; - - /// Cast [`ArrayDataBytesOffset`] to [`ArrayDataBytes`] - fn upcast(v: ArrayDataBytesOffset) -> ArrayDataBytes - where - Self: Bytes; - } - - pub trait BytesOffsetSealed { - /// Downcast [`ArrayDataBytesOffset`] to `[BytesArrayData`] - fn downcast_ref( - data: &ArrayDataBytesOffset, - ) -> Option<&BytesArrayData> - where - Self: BytesOffset; - - /// Downcast [`ArrayDataBytesOffset`] to `[BytesArrayData`] - fn downcast( - data: ArrayDataBytesOffset, - ) -> Option> - where - Self: BytesOffset; - - /// Cast [`BytesArrayData`] to [`ArrayDataBytesOffset`] - fn upcast( - v: BytesArrayData, - ) -> ArrayDataBytesOffset - where - Self: BytesOffset; - } -} - -/// Types backed by a variable length slice of bytes -pub trait Bytes: private::BytesSealed + std::fmt::Debug { - const TYPE: BytesType; -} - -impl Bytes for [u8] { - const TYPE: BytesType = BytesType::Binary; -} - -impl private::BytesSealed for [u8] { - unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { - b - } - - fn downcast_ref(data: &ArrayDataBytes) -> Option<&ArrayDataBytesOffset> { - match data { - ArrayDataBytes::Binary(v) => Some(v), - ArrayDataBytes::Utf8(_) => None, - } - } - - fn downcast(data: ArrayDataBytes) -> Option> { - match data { - ArrayDataBytes::Binary(v) => Some(v), - ArrayDataBytes::Utf8(_) => None, - } - } - - fn upcast(v: ArrayDataBytesOffset) -> ArrayDataBytes { - ArrayDataBytes::Binary(v) - } -} - -impl Bytes for str { - const TYPE: BytesType = BytesType::Utf8; -} - -impl private::BytesSealed for str { - unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { - std::str::from_utf8_unchecked(b) - } - - fn downcast_ref(data: &ArrayDataBytes) -> Option<&ArrayDataBytesOffset> { - match data { - ArrayDataBytes::Binary(_) => None, - ArrayDataBytes::Utf8(v) => Some(v), - } - } - - fn downcast(data: 
ArrayDataBytes) -> Option> { - match data { - ArrayDataBytes::Binary(_) => None, - ArrayDataBytes::Utf8(v) => Some(v), - } - } - - fn upcast(v: ArrayDataBytesOffset) -> ArrayDataBytes { - ArrayDataBytes::Utf8(v) - } -} - -/// Types of offset used by variable length byte arrays -pub trait BytesOffset: private::BytesOffsetSealed + ArrowNativeType { - const TYPE: OffsetType; -} - -impl BytesOffset for i32 { - const TYPE: OffsetType = OffsetType::Int32; -} - -impl private::BytesOffsetSealed for i32 { - fn downcast_ref( - data: &ArrayDataBytesOffset, - ) -> Option<&BytesArrayData> { - match data { - ArrayDataBytesOffset::Small(v) => Some(v), - ArrayDataBytesOffset::Large(_) => None, - } - } - - fn downcast( - data: ArrayDataBytesOffset, - ) -> Option> { - match data { - ArrayDataBytesOffset::Small(v) => Some(v), - ArrayDataBytesOffset::Large(_) => None, - } - } - - fn upcast(v: BytesArrayData) -> ArrayDataBytesOffset { - ArrayDataBytesOffset::Small(v) - } -} - -impl BytesOffset for i64 { - const TYPE: OffsetType = OffsetType::Int64; -} - -impl private::BytesOffsetSealed for i64 { - fn downcast_ref( - data: &ArrayDataBytesOffset, - ) -> Option<&BytesArrayData> { - match data { - ArrayDataBytesOffset::Small(_) => None, - ArrayDataBytesOffset::Large(v) => Some(v), - } - } - - fn downcast( - data: ArrayDataBytesOffset, - ) -> Option> { - match data { - ArrayDataBytesOffset::Small(_) => None, - ArrayDataBytesOffset::Large(v) => Some(v), - } - } - - fn upcast(v: BytesArrayData) -> ArrayDataBytesOffset { - ArrayDataBytesOffset::Large(v) - } -} - -/// Applies op to each variant of [`ArrayDataBytes`] -macro_rules! bytes_op { - ($array:ident, $op:block) => { - match $array { - ArrayDataBytes::Binary($array) => match $array { - ArrayDataBytesOffset::Small($array) => $op - ArrayDataBytesOffset::Large($array) => $op - } - ArrayDataBytes::Utf8($array) => match $array { - ArrayDataBytesOffset::Small($array) => $op - ArrayDataBytesOffset::Large($array) => $op - } - } - }; -} - -/// An enumeration of the types of [`ArrayDataBytesOffset`] -#[derive(Debug, Clone)] -pub enum ArrayDataBytes { - Binary(ArrayDataBytesOffset<[u8]>), - Utf8(ArrayDataBytesOffset), -} - -impl ArrayDataBytes { - /// Downcast this [`ArrayDataBytes`] to the corresponding [`BytesArrayData`] - pub fn downcast_ref( - &self, - ) -> Option<&BytesArrayData> { - O::downcast_ref(B::downcast_ref(self)?) - } - - /// Downcast this [`ArrayDataBytes`] to the corresponding [`BytesArrayData`] - pub fn downcast( - self, - ) -> Option> { - O::downcast(B::downcast(self)?) 
- } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let s = self; - bytes_op!(s, { s.slice(offset, len).into() }) - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - let s = self; - bytes_op!(s, { s.layout() }) - } - - /// Creates a new [`ArrayDataBytes`] from raw buffers - /// - /// # Safety - /// - /// See [`BytesArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw( - builder: ArrayDataBuilder, - offset: OffsetType, - bytes: BytesType, - ) -> Self { - match bytes { - BytesType::Binary => Self::Binary(match offset { - OffsetType::Int32 => { - ArrayDataBytesOffset::Small(BytesArrayData::from_raw(builder)) - } - OffsetType::Int64 => { - ArrayDataBytesOffset::Large(BytesArrayData::from_raw(builder)) - } - }), - BytesType::Utf8 => Self::Utf8(match offset { - OffsetType::Int32 => { - ArrayDataBytesOffset::Small(BytesArrayData::from_raw(builder)) - } - OffsetType::Int64 => { - ArrayDataBytesOffset::Large(BytesArrayData::from_raw(builder)) - } - }), - } - } -} - -/// An enumeration of the types of [`BytesArrayData`] -#[derive(Debug)] -pub enum ArrayDataBytesOffset { - Small(BytesArrayData), - Large(BytesArrayData), -} - -impl Clone for ArrayDataBytesOffset { - fn clone(&self) -> Self { - match self { - Self::Small(v) => Self::Small(v.clone()), - Self::Large(v) => Self::Large(v.clone()), - } - } -} - -impl From> for ArrayDataBytes { - fn from(value: BytesArrayData) -> Self { - B::upcast(O::upcast(value)) - } -} - -/// ArrayData for [variable-sized arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) of [`Bytes`] -#[derive(Debug)] -pub struct BytesArrayData { - data_type: DataType, - offsets: OffsetBuffer, - values: Buffer, - nulls: Option, - phantom: PhantomData, -} - -impl Clone for BytesArrayData { - fn clone(&self) -> Self { - Self { - data_type: self.data_type.clone(), - nulls: self.nulls.clone(), - offsets: self.offsets.clone(), - values: self.values.clone(), - phantom: Default::default(), - } - } -} - -impl BytesArrayData { - /// Creates a new [`BytesArrayData`] - /// - /// # Safety - /// - /// - Each consecutive window of `offsets` must identify a valid slice of `values` - /// - `nulls.len() == offsets.len() - 1` - /// - `PhysicalType::from(&data_type) == PhysicalType::Bytes(O::TYPE, B::TYPE)` - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: OffsetBuffer, - values: Buffer, - nulls: Option, - ) -> Self { - Self { - data_type, - nulls, - offsets, - values, - phantom: Default::default(), - } - } - - /// Creates a new [`BytesArrayData`] from an [`ArrayDataBuilder`] - /// - /// # Safety - /// - /// See [`Self::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - let mut iter = builder.buffers.into_iter(); - let offsets = iter.next().unwrap(); - let values = iter.next().unwrap(); - - let offsets = match builder.len { - 0 => OffsetBuffer::new_empty(), - _ => OffsetBuffer::new_unchecked(ScalarBuffer::new( - offsets, - builder.offset, - builder.len + 1, - )), - }; - - Self { - values, - offsets, - data_type: builder.data_type, - nulls: builder.nulls, - phantom: Default::default(), - } - } - - /// Returns the length - #[inline] - pub fn len(&self) -> usize { - self.offsets.len().wrapping_sub(1) - } - - /// Returns true if this array is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.offsets.len() <= 1 - } - - /// Returns the raw byte data - #[inline] - pub fn 
values(&self) -> &B { - // Safety: - // Bytes must be valid - unsafe { B::from_bytes_unchecked(self.values.as_slice()) } - } - - /// Returns the offsets - #[inline] - pub fn offsets(&self) -> &OffsetBuffer { - &self.offsets - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`BytesArrayData`] - pub fn into_parts(self) -> (DataType, OffsetBuffer, Buffer, Option) { - (self.data_type, self.offsets, self.values, self.nulls) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - values: self.values.clone(), - offsets: self.offsets.slice(offset, len), - data_type: self.data_type.clone(), - nulls: self.nulls().as_ref().map(|x| x.slice(offset, len)), - phantom: Default::default(), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.offsets.len().wrapping_sub(1), - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::two(self.offsets.inner().inner(), &self.values), - child_data: &[], - } - } -} - -/// ArrayData for [fixed-size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) of bytes -#[derive(Debug, Clone)] -pub struct FixedSizeBinaryArrayData { - data_type: DataType, - len: usize, - element_size: usize, - values: Buffer, - nulls: Option, -} - -impl FixedSizeBinaryArrayData { - /// Creates a new [`FixedSizeBinaryArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::FixedSizeBinary(element_size)` - /// - `nulls.len() == values.len() / element_size == len` - pub unsafe fn new_unchecked( - data_type: DataType, - len: usize, - element_size: usize, - values: Buffer, - nulls: Option, - ) -> Self { - Self { - data_type, - nulls, - values, - len, - element_size, - } - } - - /// Creates a new [`FixedSizeBinaryArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`FixedSizeBinaryArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, size: usize) -> Self { - let values = builder.buffers[0] - .slice_with_length(builder.offset * size, builder.len * size); - Self { - values, - data_type: builder.data_type, - len: builder.len, - element_size: size, - nulls: builder.nulls, - } - } - - /// Returns the length - #[inline] - pub fn len(&self) -> usize { - self.len - } - - /// Returns true if this array is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Returns the size of each element - #[inline] - pub fn element_size(&self) -> usize { - self.element_size - } - - /// Returns the raw byte data - #[inline] - pub fn values(&self) -> &[u8] { - &self.values - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`FixedSizeBinaryArrayData`] - pub fn into_parts(self) -> (DataType, Buffer, Option) { - (self.data_type, self.values, self.nulls) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let offset_element = 
offset.checked_mul(self.element_size).expect("overflow"); - let len_element = len.checked_mul(self.element_size).expect("overflow"); - let values = self.values.slice_with_length(offset_element, len_element); - - Self { - len, - values, - data_type: self.data_type.clone(), - element_size: self.element_size, - nulls: self.nulls().as_ref().map(|x| x.slice(offset, len)), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.len, - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::one(&self.values), - child_data: &[], - } - } -} diff --git a/arrow-data/src/data/dictionary.rs b/arrow-data/src/data/dictionary.rs deleted file mode 100644 index c95ee464b608..000000000000 --- a/arrow-data/src/data/dictionary.rs +++ /dev/null @@ -1,289 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::data::types::DictionaryKeyType; -use crate::data::ArrayDataLayout; -use crate::{ArrayData, ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; -use arrow_buffer::ArrowNativeType; -use arrow_schema::DataType; - -mod private { - use super::*; - - pub trait DictionaryKeySealed { - /// Downcast [`ArrayDataDictionary`] to `[DictionaryArrayData`] - fn downcast_ref(data: &ArrayDataDictionary) -> Option<&DictionaryArrayData> - where - Self: DictionaryKey; - - /// Downcast [`ArrayDataDictionary`] to `[DictionaryArrayData`] - fn downcast(data: ArrayDataDictionary) -> Option> - where - Self: DictionaryKey; - - /// Cast [`DictionaryArrayData`] to [`ArrayDataDictionary`] - fn upcast(v: DictionaryArrayData) -> ArrayDataDictionary - where - Self: DictionaryKey; - } -} - -/// Types of dictionary key used by dictionary arrays -pub trait DictionaryKey: private::DictionaryKeySealed + ArrowNativeType { - const TYPE: DictionaryKeyType; -} - -macro_rules! dictionary { - ($t:ty,$v:ident) => { - impl DictionaryKey for $t { - const TYPE: DictionaryKeyType = DictionaryKeyType::$v; - } - impl private::DictionaryKeySealed for $t { - fn downcast_ref( - data: &ArrayDataDictionary, - ) -> Option<&DictionaryArrayData> { - match data { - ArrayDataDictionary::$v(v) => Some(v), - _ => None, - } - } - - fn downcast(data: ArrayDataDictionary) -> Option> { - match data { - ArrayDataDictionary::$v(v) => Some(v), - _ => None, - } - } - - fn upcast(v: DictionaryArrayData) -> ArrayDataDictionary { - ArrayDataDictionary::$v(v) - } - } - }; -} - -dictionary!(i8, Int8); -dictionary!(i16, Int16); -dictionary!(i32, Int32); -dictionary!(i64, Int64); -dictionary!(u8, UInt8); -dictionary!(u16, UInt16); -dictionary!(u32, UInt32); -dictionary!(u64, UInt64); - -/// Applies op to each variant of [`ArrayDataDictionary`] -macro_rules! 
dictionary_op { - ($array:ident, $op:block) => { - match $array { - ArrayDataDictionary::Int8($array) => $op - ArrayDataDictionary::Int16($array) => $op - ArrayDataDictionary::Int32($array) => $op - ArrayDataDictionary::Int64($array) => $op - ArrayDataDictionary::UInt8($array) => $op - ArrayDataDictionary::UInt16($array) => $op - ArrayDataDictionary::UInt32($array) => $op - ArrayDataDictionary::UInt64($array) => $op - } - }; -} - -/// An enumeration of the types of [`DictionaryArrayData`] -#[derive(Debug, Clone)] -pub enum ArrayDataDictionary { - Int8(DictionaryArrayData), - Int16(DictionaryArrayData), - Int32(DictionaryArrayData), - Int64(DictionaryArrayData), - UInt8(DictionaryArrayData), - UInt16(DictionaryArrayData), - UInt32(DictionaryArrayData), - UInt64(DictionaryArrayData), -} - -impl ArrayDataDictionary { - /// Downcast this [`ArrayDataDictionary`] to the corresponding [`DictionaryArrayData`] - pub fn downcast_ref(&self) -> Option<&DictionaryArrayData> { - K::downcast_ref(self) - } - - /// Downcast this [`ArrayDataDictionary`] to the corresponding [`DictionaryArrayData`] - pub fn downcast(self) -> Option> { - K::downcast(self) - } - - /// Returns the values of this dictionary - pub fn values(&self) -> &ArrayData { - let s = self; - dictionary_op!(s, { s.values() }) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let s = self; - dictionary_op!(s, { s.slice(offset, len).into() }) - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - let s = self; - dictionary_op!(s, { s.layout() }) - } - - /// Creates a new [`ArrayDataDictionary`] from raw buffers - /// - /// # Safety - /// - /// See [`DictionaryArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw( - builder: ArrayDataBuilder, - key: DictionaryKeyType, - ) -> Self { - use DictionaryKeyType::*; - match key { - Int8 => Self::Int8(DictionaryArrayData::from_raw(builder)), - Int16 => Self::Int16(DictionaryArrayData::from_raw(builder)), - Int32 => Self::Int32(DictionaryArrayData::from_raw(builder)), - Int64 => Self::Int64(DictionaryArrayData::from_raw(builder)), - UInt8 => Self::UInt8(DictionaryArrayData::from_raw(builder)), - UInt16 => Self::UInt16(DictionaryArrayData::from_raw(builder)), - UInt32 => Self::UInt32(DictionaryArrayData::from_raw(builder)), - UInt64 => Self::UInt64(DictionaryArrayData::from_raw(builder)), - } - } -} - -impl From> for ArrayDataDictionary { - fn from(value: DictionaryArrayData) -> Self { - K::upcast(value) - } -} - -/// ArrayData for [dictionary arrays](https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout) -#[derive(Debug, Clone)] -pub struct DictionaryArrayData { - data_type: DataType, - nulls: Option, - keys: ScalarBuffer, - values: Box, -} - -impl DictionaryArrayData { - /// Create a new [`DictionaryArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::Dictionary(K::TYPE)` - /// - child must have a type matching `data_type` - /// - all values in `keys` must be `0 < v < child.len()` or be a null according to `nulls` - /// - `nulls` must have the same length as `child` - pub unsafe fn new_unchecked( - data_type: DataType, - keys: ScalarBuffer, - nulls: Option, - child: ArrayData, - ) -> Self { - Self { - data_type, - nulls, - keys, - values: Box::new(child), - } - } - - /// Creates a new [`DictionaryArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`Self::new_unchecked`] - pub(crate) unsafe 
fn from_raw(builder: ArrayDataBuilder) -> Self { - let keys = builder.buffers.into_iter().next().unwrap(); - let keys = ScalarBuffer::new(keys, builder.offset, builder.len); - let values = builder.child_data.into_iter().next().unwrap(); - Self { - keys, - data_type: builder.data_type, - nulls: builder.nulls, - values: Box::new(values), - } - } - - /// Returns the length - #[inline] - pub fn len(&self) -> usize { - self.keys.len() - } - - /// Returns true if this array is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.keys.is_empty() - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the keys - #[inline] - pub fn keys(&self) -> &[K] { - &self.keys - } - - /// Returns the values data - #[inline] - pub fn values(&self) -> &ArrayData { - self.values.as_ref() - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`DictionaryArrayData`] - pub fn into_parts( - self, - ) -> (DataType, ScalarBuffer, Option, ArrayData) { - (self.data_type, self.keys, self.nulls, *self.values) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - keys: self.keys.slice(offset, len), - data_type: self.data_type.clone(), - nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), - values: self.values.clone(), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.keys.len(), - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::one(self.keys.inner()), - child_data: std::slice::from_ref(self.values.as_ref()), - } - } -} diff --git a/arrow-data/src/data/list.rs b/arrow-data/src/data/list.rs deleted file mode 100644 index bcc89f8ba2ca..000000000000 --- a/arrow-data/src/data/list.rs +++ /dev/null @@ -1,422 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use crate::data::types::OffsetType; -use crate::data::ArrayDataLayout; -use crate::{ArrayData, ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; -use arrow_buffer::ArrowNativeType; -use arrow_schema::DataType; - -mod private { - use super::*; - - pub trait ListOffsetSealed { - /// Downcast [`ArrayDataList`] to `[ListArrayData`] - fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData> - where - Self: ListOffset; - - /// Downcast [`ArrayDataList`] to `[ListArrayData`] - fn downcast(data: ArrayDataList) -> Option> - where - Self: ListOffset; - - /// Cast [`ListArrayData`] to [`ArrayDataList`] - fn upcast(v: ListArrayData) -> ArrayDataList - where - Self: ListOffset; - } -} - -/// Types of offset used by variable length list arrays -pub trait ListOffset: private::ListOffsetSealed + ArrowNativeType { - const TYPE: OffsetType; -} - -impl ListOffset for i32 { - const TYPE: OffsetType = OffsetType::Int32; -} - -impl private::ListOffsetSealed for i32 { - fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData> - where - Self: ListOffset, - { - match data { - ArrayDataList::Small(v) => Some(v), - ArrayDataList::Large(_) => None, - } - } - - fn downcast(data: ArrayDataList) -> Option> - where - Self: ListOffset, - { - match data { - ArrayDataList::Small(v) => Some(v), - ArrayDataList::Large(_) => None, - } - } - - fn upcast(v: ListArrayData) -> ArrayDataList - where - Self: ListOffset, - { - ArrayDataList::Small(v) - } -} - -impl ListOffset for i64 { - const TYPE: OffsetType = OffsetType::Int64; -} - -impl private::ListOffsetSealed for i64 { - fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData> - where - Self: ListOffset, - { - match data { - ArrayDataList::Small(_) => None, - ArrayDataList::Large(v) => Some(v), - } - } - - fn downcast(data: ArrayDataList) -> Option> - where - Self: ListOffset, - { - match data { - ArrayDataList::Small(_) => None, - ArrayDataList::Large(v) => Some(v), - } - } - - fn upcast(v: ListArrayData) -> ArrayDataList - where - Self: ListOffset, - { - ArrayDataList::Large(v) - } -} - -/// Applies op to each variant of [`ListArrayData`] -macro_rules! 
list_op { - ($array:ident, $op:block) => { - match $array { - ArrayDataList::Small($array) => $op - ArrayDataList::Large($array) => $op - } - }; -} - -/// An enumeration of the types of [`ListArrayData`] -#[derive(Debug, Clone)] -pub enum ArrayDataList { - Small(ListArrayData), - Large(ListArrayData), -} - -impl ArrayDataList { - /// Downcast this [`ArrayDataList`] to the corresponding [`ListArrayData`] - pub fn downcast_ref(&self) -> Option<&ListArrayData> { - O::downcast_ref(self) - } - - /// Downcast this [`ArrayDataList`] to the corresponding [`ListArrayData`] - pub fn downcast(self) -> Option> { - O::downcast(self) - } - - /// Returns the values of this [`ArrayDataList`] - pub fn values(&self) -> &ArrayData { - let s = self; - list_op!(s, { s.values() }) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let s = self; - list_op!(s, { s.slice(offset, len).into() }) - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - let s = self; - list_op!(s, { s.layout() }) - } - - /// Creates a new [`ArrayDataList`] from raw buffers - /// - /// # Safety - /// - /// See [`ListArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, offset: OffsetType) -> Self { - match offset { - OffsetType::Int32 => Self::Small(ListArrayData::from_raw(builder)), - OffsetType::Int64 => Self::Large(ListArrayData::from_raw(builder)), - } - } -} - -impl From> for ArrayDataList { - fn from(value: ListArrayData) -> Self { - O::upcast(value) - } -} - -/// ArrayData for [variable-size list arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout) -#[derive(Debug, Clone)] -pub struct ListArrayData { - data_type: DataType, - nulls: Option, - offsets: OffsetBuffer, - values: Box, -} - -impl ListArrayData { - /// Create a new [`ListArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::List(O::TYPE)` - /// - Each consecutive window of `offsets` must identify a valid slice of `child` - /// - `nulls.len() == offsets.len() - 1` - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: OffsetBuffer, - nulls: Option, - values: ArrayData, - ) -> Self { - Self { - data_type, - nulls, - offsets, - values: Box::new(values), - } - } - - /// Creates a new [`ListArrayData`] from an [`ArrayDataBuilder`] - /// - /// # Safety - /// - /// See [`Self::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - let offsets = builder.buffers.into_iter().next().unwrap(); - let values = builder.child_data.into_iter().next().unwrap(); - - let offsets = match builder.len { - 0 => OffsetBuffer::new_empty(), - _ => OffsetBuffer::new_unchecked(ScalarBuffer::new( - offsets, - builder.offset, - builder.len + 1, - )), - }; - - Self { - offsets, - data_type: builder.data_type, - nulls: builder.nulls, - values: Box::new(values), - } - } - - /// Returns the length - #[inline] - pub fn len(&self) -> usize { - self.offsets.len().wrapping_sub(1) - } - - /// Returns true if this array is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.offsets.len() <= 1 - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the offsets - #[inline] - pub fn offsets(&self) -> &OffsetBuffer { - &self.offsets - } - - /// Returns the values of this [`ListArrayData`] - #[inline] - pub fn values(&self) -> &ArrayData { - 
self.values.as_ref() - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`ListArrayData`] - pub fn into_parts( - self, - ) -> (DataType, OffsetBuffer, Option, ArrayData) { - (self.data_type, self.offsets, self.nulls, *self.values) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - data_type: self.data_type.clone(), - nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), - offsets: self.offsets.slice(offset, len), - values: self.values.clone(), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.len(), - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::one(self.offsets.inner().inner()), - child_data: std::slice::from_ref(self.values.as_ref()), - } - } -} - -/// ArrayData for [fixed-size list arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-list-layout) -#[derive(Debug, Clone)] -pub struct FixedSizeListArrayData { - data_type: DataType, - len: usize, - element_size: usize, - nulls: Option, - child: Box, -} - -impl FixedSizeListArrayData { - /// Create a new [`FixedSizeListArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::FixedSizeList(element_size)` - /// - `nulls.len() == values.len() / element_size == len` - pub unsafe fn new_unchecked( - data_type: DataType, - len: usize, - element_size: usize, - nulls: Option, - child: ArrayData, - ) -> Self { - Self { - data_type, - len, - element_size, - nulls, - child: Box::new(child), - } - } - - /// Creates a new [`FixedSizeListArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`FixedSizeListArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, size: usize) -> Self { - let child = - builder.child_data[0].slice(builder.offset * size, builder.len * size); - Self { - data_type: builder.data_type, - len: builder.len, - element_size: size, - nulls: builder.nulls, - child: Box::new(child), - } - } - - /// Returns the length - #[inline] - pub fn len(&self) -> usize { - self.len - } - - /// Returns true if this array is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Returns the size of each element - #[inline] - pub fn element_size(&self) -> usize { - self.element_size - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the child data - #[inline] - pub fn child(&self) -> &ArrayData { - self.child.as_ref() - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`FixedSizeListArrayData`] - pub fn into_parts(self) -> (DataType, Option, ArrayData) { - (self.data_type, self.nulls, *self.child) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let offset_element = offset.checked_mul(self.element_size).expect("overflow"); - let len_element = len.checked_mul(self.element_size).expect("overflow"); - let child = self.child.slice(offset_element, len_element); - - Self { - len, - data_type: self.data_type.clone(), - element_size: self.element_size, - nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), - child: 
Box::new(child), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.len, - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::default(), - child_data: std::slice::from_ref(self.child.as_ref()), - } - } -} diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index 784911dc0a85..cc908d639553 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -33,27 +33,6 @@ use crate::equal; mod buffers; pub use buffers::*; -#[allow(unused)] // Private until ready (#1799) -mod boolean; -#[allow(unused)] // Private until ready (#1799) -mod bytes; -#[allow(unused)] // Private until ready (#1799) -mod dictionary; -#[allow(unused)] // Private until ready (#1799) -mod list; -#[allow(unused)] // Private until ready (#1799) -mod null; -#[allow(unused)] // Private until ready (#1799) -mod primitive; -#[allow(unused)] // Private until ready (#1799) -mod run; -#[allow(unused)] // Private until ready (#1799) -mod r#struct; -#[allow(unused)] // Private until ready (#1799) -mod types; -#[allow(unused)] // Private until ready (#1799) -mod union; - #[inline] pub(crate) fn contains_nulls( null_bit_buffer: Option<&NullBuffer>, @@ -351,7 +330,7 @@ impl ArrayData { // We don't need to validate children as we can assume that the // [`ArrayData`] in `child_data` have already been validated through // a call to `ArrayData::try_new` or created using unsafe - ArrayDataLayout::new(&new_self).validate_data()?; + new_self.validate_data()?; Ok(new_self) } @@ -441,15 +420,14 @@ impl ArrayData { /// If multiple [`ArrayData`]s refer to the same underlying /// [`Buffer`]s they will both report the same size. pub fn get_buffer_memory_size(&self) -> usize { - let s = ArrayDataLayout::new(self); let mut size = 0; - for buffer in s.buffers { + for buffer in &self.buffers { size += buffer.capacity(); } - if let Some(bitmap) = s.nulls { + if let Some(bitmap) = &self.nulls { size += bitmap.buffer().capacity() } - for child in s.child_data { + for child in &self.child_data { size += child.get_buffer_memory_size(); } size @@ -468,15 +446,14 @@ impl ArrayData { /// first `20` elements, then [`Self::get_slice_memory_size`] on the /// sliced [`ArrayData`] would return `20 * 8 = 160`. pub fn get_slice_memory_size(&self) -> Result { - let s = ArrayDataLayout::new(self); let mut result: usize = 0; - let layout = layout(s.data_type); + let layout = layout(&self.data_type); for spec in layout.buffers.iter() { match spec { BufferSpec::FixedWidth { byte_width } => { let buffer_size = - s.len.checked_mul(*byte_width).ok_or_else(|| { + self.len.checked_mul(*byte_width).ok_or_else(|| { ArrowError::ComputeError( "Integer overflow computing buffer size".to_string(), ) @@ -485,26 +462,26 @@ impl ArrayData { } BufferSpec::VariableWidth => { let buffer_len: usize; - match s.data_type { + match self.data_type { DataType::Utf8 | DataType::Binary => { - let offsets = s.typed_offsets::()?; - buffer_len = (offsets[s.len] - offsets[0]) as usize; + let offsets = self.typed_offsets::()?; + buffer_len = (offsets[self.len] - offsets[0] ) as usize; } DataType::LargeUtf8 | DataType::LargeBinary => { - let offsets = s.typed_offsets::()?; - buffer_len = (offsets[s.len] - offsets[0]) as usize; + let offsets = self.typed_offsets::()?; + buffer_len = (offsets[self.len] - offsets[0]) as usize; } _ => { return Err(ArrowError::NotYetImplemented(format!( - "Invalid data type for VariableWidth buffer. 
Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", - s.data_type + "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}", + self.data_type ))) } }; result += buffer_len; } BufferSpec::BitMap => { - let buffer_size = bit_util::ceil(s.len, 8); + let buffer_size = bit_util::ceil(self.len, 8); result += buffer_size; } BufferSpec::AlwaysNull => { @@ -513,11 +490,11 @@ impl ArrayData { } } - if s.nulls.is_some() { - result += bit_util::ceil(s.len, 8); + if self.nulls().is_some() { + result += bit_util::ceil(self.len, 8); } - for child in s.child_data { + for child in &self.child_data { result += child.get_slice_memory_size()?; } Ok(result) @@ -532,18 +509,17 @@ impl ArrayData { /// [`Self::get_buffer_memory_size`] + /// `size_of_val(child)` for all children pub fn get_array_memory_size(&self) -> usize { - let s = ArrayDataLayout::new(self); let mut size = mem::size_of_val(self); // Calculate rest of the fields top down which contain actual data - for buffer in s.buffers { + for buffer in &self.buffers { size += mem::size_of::(); size += buffer.capacity(); } - if let Some(nulls) = s.nulls { + if let Some(nulls) = &self.nulls { size += nulls.buffer().capacity(); } - for child in s.child_data { + for child in &self.child_data { size += child.get_array_memory_size(); } @@ -730,101 +706,11 @@ impl ArrayData { /// See [ArrayData::validate_data] to validate fully the offset content /// and the validity of utf8 data pub fn validate(&self) -> Result<(), ArrowError> { - ArrayDataLayout::new(self).validate() - } - - /// Validate that the data contained within this [`ArrayData`] is valid - /// - /// 1. Null count is correct - /// 2. All offsets are valid - /// 3. All String data is valid UTF-8 - /// 4. All dictionary offsets are valid - /// - /// Internally this calls: - /// - /// * [`Self::validate`] - /// * [`Self::validate_nulls`] - /// * [`Self::validate_values`] - /// - /// Note: this does not recurse into children, for a recursive variant - /// see [`Self::validate_full`] - pub fn validate_data(&self) -> Result<(), ArrowError> { - ArrayDataLayout::new(self).validate_data() - } - - /// Performs a full recursive validation of this [`ArrayData`] and all its children - /// - /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`] - /// and all its children recursively - pub fn validate_full(&self) -> Result<(), ArrowError> { - ArrayDataLayout::new(self).validate_full() - } - - /// Validates the values stored within this [`ArrayData`] are valid - /// without recursing into child [`ArrayData`] - /// - /// Does not (yet) check - /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) - /// Validates the the null count is correct and that any - /// nullability requirements of its children are correct - pub fn validate_nulls(&self) -> Result<(), ArrowError> { - ArrayDataLayout::new(self).validate_nulls() - } - - /// Validates the values stored within this [`ArrayData`] are valid - /// without recursing into child [`ArrayData`] - /// - /// Does not (yet) check - /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) - pub fn validate_values(&self) -> Result<(), ArrowError> { - ArrayDataLayout::new(self).validate_values() - } - - /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons - /// to determine buffer equality. 
This is cheaper than `PartialEq::eq` but may - /// return false when the arrays are logically equal - pub fn ptr_eq(&self, other: &Self) -> bool { - ArrayDataLayout::new(self).ptr_eq(&ArrayDataLayout::new(other)) - } - - /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`] - pub fn into_builder(self) -> ArrayDataBuilder { - self.into() - } -} - -/// A flat representation of [`ArrayData`] -/// -/// This is temporary measure to bridge the gap between the strongly-typed -/// ArrayData enumeration and the older-style struct representation (#1799) -#[derive(Copy, Clone)] -pub(crate) struct ArrayDataLayout<'a> { - data_type: &'a DataType, - len: usize, - offset: usize, - nulls: Option<&'a NullBuffer>, - buffers: Buffers<'a>, - child_data: &'a [ArrayData], -} - -impl<'a> ArrayDataLayout<'a> { - fn new(data: &'a ArrayData) -> Self { - Self { - data_type: &data.data_type, - len: data.len, - offset: data.offset, - nulls: data.nulls.as_ref(), - buffers: Buffers::from_slice(&data.buffers), - child_data: &data.child_data, - } - } - - fn validate(&self) -> Result<(), ArrowError> { // Need at least this mich space in each buffer let len_plus_offset = self.len + self.offset; // Check that the data layout conforms to the spec - let layout = layout(self.data_type); + let layout = layout(&self.data_type); if !layout.can_contain_null_mask && self.nulls.is_some() { return Err(ArrowError::InvalidArgumentError(format!( @@ -879,7 +765,7 @@ impl<'a> ArrayDataLayout<'a> { } // check null bit buffer size - if let Some(nulls) = self.nulls { + if let Some(nulls) = self.nulls() { if nulls.null_count() > self.len { return Err(ArrowError::InvalidArgumentError(format!( "null_count {} for an array exceeds length of {} elements", @@ -1141,7 +1027,7 @@ impl<'a> ArrayDataLayout<'a> { fn get_single_valid_child_data( &self, expected_type: &DataType, - ) -> Result, ArrowError> { + ) -> Result<&ArrayData, ArrowError> { self.validate_num_child_data(1)?; self.get_valid_child_data(0, expected_type) } @@ -1166,7 +1052,7 @@ impl<'a> ArrayDataLayout<'a> { &self, i: usize, expected_type: &DataType, - ) -> Result { + ) -> Result<&ArrayData, ArrowError> { let values_data = self.child_data .get(i) .ok_or_else(|| { @@ -1175,9 +1061,8 @@ impl<'a> ArrayDataLayout<'a> { self.data_type, i+1, self.child_data.len() )) })?; - let values_data = ArrayDataLayout::new(values_data); - if expected_type != values_data.data_type { + if expected_type != &values_data.data_type { return Err(ArrowError::InvalidArgumentError(format!( "Child type mismatch for {}. Expected {} but child data had {}", self.data_type, expected_type, values_data.data_type @@ -1188,7 +1073,22 @@ impl<'a> ArrayDataLayout<'a> { Ok(values_data) } - fn validate_data(&self) -> Result<(), ArrowError> { + /// Validate that the data contained within this [`ArrayData`] is valid + /// + /// 1. Null count is correct + /// 2. All offsets are valid + /// 3. All String data is valid UTF-8 + /// 4. 
All dictionary offsets are valid + /// + /// Internally this calls: + /// + /// * [`Self::validate`] + /// * [`Self::validate_nulls`] + /// * [`Self::validate_values`] + /// + /// Note: this does not recurse into children, for a recursive variant + /// see [`Self::validate_full`] + pub fn validate_data(&self) -> Result<(), ArrowError> { self.validate()?; self.validate_nulls()?; @@ -1196,7 +1096,11 @@ impl<'a> ArrayDataLayout<'a> { Ok(()) } - fn validate_full(&self) -> Result<(), ArrowError> { + /// Performs a full recursive validation of this [`ArrayData`] and all its children + /// + /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`] + /// and all its children recursively + pub fn validate_full(&self) -> Result<(), ArrowError> { self.validate_data()?; // validate all children recursively self.child_data @@ -1213,7 +1117,14 @@ impl<'a> ArrayDataLayout<'a> { Ok(()) } - fn validate_nulls(&self) -> Result<(), ArrowError> { + /// Validates the values stored within this [`ArrayData`] are valid + /// without recursing into child [`ArrayData`] + /// + /// Does not (yet) check + /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) + /// Validates the the null count is correct and that any + /// nullability requirements of its children are correct + pub fn validate_nulls(&self) -> Result<(), ArrowError> { if let Some(nulls) = &self.nulls { let actual = nulls.len() - nulls.inner().count_set_bits(); if actual != nulls.null_count() { @@ -1231,12 +1142,11 @@ impl<'a> ArrayDataLayout<'a> { match &self.data_type { DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => { if !f.is_nullable() { - let child = ArrayDataLayout::new(&self.child_data[0]); - self.validate_non_nullable(None, 0, child)? + self.validate_non_nullable(None, 0, &self.child_data[0])? } } DataType::FixedSizeList(field, len) => { - let child = ArrayDataLayout::new(&self.child_data[0]); + let child = &self.child_data[0]; if !field.is_nullable() { match &self.nulls { Some(nulls) => { @@ -1265,8 +1175,7 @@ impl<'a> ArrayDataLayout<'a> { } } DataType::Struct(fields) => { - for (field, child) in fields.iter().zip(self.child_data) { - let child = ArrayDataLayout::new(child); + for (field, child) in fields.iter().zip(&self.child_data) { if !field.is_nullable() { match &self.nulls { Some(n) => self.validate_non_nullable( @@ -1290,11 +1199,11 @@ impl<'a> ArrayDataLayout<'a> { &self, mask: Option<&Buffer>, offset: usize, - child: ArrayDataLayout<'_>, + child: &ArrayData, ) -> Result<(), ArrowError> { let mask = match mask { Some(mask) => mask.as_ref(), - None => return match child.nulls.map(|x| x.null_count()).unwrap_or_default() { + None => return match child.null_count() { 0 => Ok(()), _ => Err(ArrowError::InvalidArgumentError(format!( "non-nullable child of type {} contains nulls not present in parent {}", @@ -1304,7 +1213,7 @@ impl<'a> ArrayDataLayout<'a> { }, }; - match child.nulls { + match child.nulls() { Some(nulls) => { let mask = BitChunks::new(mask, offset, child.len); let nulls = BitChunks::new(nulls.validity(), nulls.offset(), child.len); @@ -1333,7 +1242,7 @@ impl<'a> ArrayDataLayout<'a> { /// /// Does not (yet) check /// 1. 
Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) - fn validate_values(&self) -> Result<(), ArrowError> { + pub fn validate_values(&self) -> Result<(), ArrowError> { match &self.data_type { DataType::Utf8 => self.validate_utf8::(), DataType::LargeUtf8 => self.validate_utf8::(), @@ -1343,11 +1252,11 @@ impl<'a> ArrayDataLayout<'a> { } DataType::List(_) | DataType::Map(_, _) => { let child = &self.child_data[0]; - self.validate_offsets_full::(child.len()) + self.validate_offsets_full::(child.len) } DataType::LargeList(_) => { let child = &self.child_data[0]; - self.validate_offsets_full::(child.len()) + self.validate_offsets_full::(child.len) } DataType::Union(_, _, _) => { // Validate Union Array as part of implementing new Union semantics @@ -1358,7 +1267,7 @@ impl<'a> ArrayDataLayout<'a> { Ok(()) } DataType::Dictionary(key_type, _value_type) => { - let dictionary_length: i64 = self.child_data[0].len().try_into().unwrap(); + let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap(); let max_value = dictionary_length - 1; match key_type.as_ref() { DataType::UInt8 => self.check_bounds::(max_value), @@ -1373,7 +1282,7 @@ impl<'a> ArrayDataLayout<'a> { } } DataType::RunEndEncoded(run_ends, _values) => { - let run_ends_data = ArrayDataLayout::new(&self.child_data[0]); + let run_ends_data = self.child_data()[0].clone(); match run_ends.data_type() { DataType::Int16 => run_ends_data.check_run_ends::(), DataType::Int32 => run_ends_data.check_run_ends::(), @@ -1517,7 +1426,7 @@ impl<'a> ArrayDataLayout<'a> { indexes.iter().enumerate().try_for_each(|(i, &dict_index)| { // Do not check the value is null (value can be arbitrary) - if self.nulls.map(|x| x.is_null(i)).unwrap_or_default() { + if self.is_null(i) { return Ok(()); } let dict_index: i64 = dict_index.try_into().map_err(|_| { @@ -1605,6 +1514,11 @@ impl<'a> ArrayDataLayout<'a> { .zip(other.child_data.iter()) .all(|(a, b)| a.ptr_eq(b)) } + + /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`] + pub fn into_builder(self) -> ArrayDataBuilder { + self.into() + } } /// Return the expected [`DataTypeLayout`] Arrays of this data @@ -1889,7 +1803,7 @@ impl ArrayDataBuilder { pub fn build(self) -> Result { let data = unsafe { self.build_unchecked() }; #[cfg(not(feature = "force_validate"))] - ArrayDataLayout::new(&data).validate_data()?; + data.validate_data()?; Ok(data) } } diff --git a/arrow-data/src/data/null.rs b/arrow-data/src/data/null.rs deleted file mode 100644 index b8a4d7270833..000000000000 --- a/arrow-data/src/data/null.rs +++ /dev/null @@ -1,104 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
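The mod.rs hunks above fold the former ArrayDataLayout methods back into ArrayData, so validate_data, validate_nulls, validate_values and validate_full become public methods on ArrayData itself, and get_slice_memory_size reports only the bytes covered by the sliced rows. A minimal sketch of the resulting call pattern, assuming a plain Int64 array built through ArrayDataBuilder (buffer contents and lengths are illustrative only, not taken from the patch):

use arrow_buffer::Buffer;
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};

fn validate_and_measure() -> Result<(), ArrowError> {
    // 100 Int64 values; build() already runs validate_data() internally.
    let values = Buffer::from_slice_ref(&[0_i64; 100]);
    let data = ArrayData::builder(DataType::Int64)
        .len(100)
        .add_buffer(values)
        .build()?;

    // The recursive variant also validates child arrays (none here).
    data.validate_full()?;

    // Slicing to the first 20 values: 20 * 8 = 160 bytes of Int64 data.
    let sliced = data.slice(0, 20);
    assert_eq!(sliced.get_slice_memory_size()?, 160);
    Ok(())
}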
- -use crate::data::types::PhysicalType; -use crate::data::ArrayDataLayout; -use crate::{ArrayDataBuilder, Buffers}; -use arrow_schema::DataType; - -/// ArrayData for [null arrays](https://arrow.apache.org/docs/format/Columnar.html#null-layout) -#[derive(Debug, Clone)] -pub struct NullArrayData { - data_type: DataType, - len: usize, -} - -impl NullArrayData { - /// Create a new [`NullArrayData`] - /// - /// # Panic - /// - /// - `PhysicalType::from(&data_type) != PhysicalType::Null` - pub fn new(data_type: DataType, len: usize) -> Self { - assert_eq!( - PhysicalType::from(&data_type), - PhysicalType::Null, - "Illegal physical type for NullArrayData of datatype {data_type:?}", - ); - Self { data_type, len } - } - - /// Create a new [`NullArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::Null` - pub unsafe fn new_unchecked(data_type: DataType, len: usize) -> Self { - Self { data_type, len } - } - - /// Creates a new [`NullArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`NullArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - Self { - data_type: builder.data_type, - len: builder.len, - } - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the length of this array - #[inline] - pub fn len(&self) -> usize { - self.len - } - - /// Returns the [`DataType`] and length of this [`NullArrayData`] - pub fn into_parts(self) -> (DataType, usize) { - (self.data_type, self.len) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let new_len = offset.saturating_add(len); - assert!(new_len <= self.len); - Self { - data_type: self.data_type.clone(), - len, - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.len, - offset: 0, - nulls: None, - buffers: Buffers::default(), - child_data: &[], - } - } -} diff --git a/arrow-data/src/data/primitive.rs b/arrow-data/src/data/primitive.rs deleted file mode 100644 index ed8ed8d7aabb..000000000000 --- a/arrow-data/src/data/primitive.rs +++ /dev/null @@ -1,304 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use crate::data::types::{PhysicalType, PrimitiveType}; -use crate::data::ArrayDataLayout; -use crate::{ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::{NullBuffer, ScalarBuffer}; -use arrow_buffer::{i256, ArrowNativeType}; -use arrow_schema::DataType; -use half::f16; - -mod private { - use super::*; - - pub trait PrimitiveSealed { - /// Downcast [`ArrayDataPrimitive`] to `[PrimitiveArrayData`] - fn downcast_ref(data: &ArrayDataPrimitive) -> Option<&PrimitiveArrayData> - where - Self: Primitive; - - /// Downcast [`ArrayDataPrimitive`] to `[PrimitiveArrayData`] - fn downcast(data: ArrayDataPrimitive) -> Option> - where - Self: Primitive; - - /// Cast [`ArrayDataPrimitive`] to [`ArrayDataPrimitive`] - fn upcast(v: PrimitiveArrayData) -> ArrayDataPrimitive - where - Self: Primitive; - } -} - -pub trait Primitive: private::PrimitiveSealed + ArrowNativeType { - const TYPE: PrimitiveType; -} - -/// Applies op to each variant of [`ArrayDataPrimitive`] -macro_rules! primitive_op { - ($array:ident, $op:block) => { - match $array { - ArrayDataPrimitive::Int8($array) => $op - ArrayDataPrimitive::Int16($array) => $op - ArrayDataPrimitive::Int32($array) => $op - ArrayDataPrimitive::Int64($array) => $op - ArrayDataPrimitive::Int128($array) => $op - ArrayDataPrimitive::Int256($array) => $op - ArrayDataPrimitive::UInt8($array) => $op - ArrayDataPrimitive::UInt16($array) => $op - ArrayDataPrimitive::UInt32($array) => $op - ArrayDataPrimitive::UInt64($array) => $op - ArrayDataPrimitive::Float16($array) => $op - ArrayDataPrimitive::Float32($array) => $op - ArrayDataPrimitive::Float64($array) => $op - } - }; -} - -macro_rules! primitive { - ($t:ty,$v:ident) => { - impl Primitive for $t { - const TYPE: PrimitiveType = PrimitiveType::$v; - } - impl private::PrimitiveSealed for $t { - fn downcast_ref( - data: &ArrayDataPrimitive, - ) -> Option<&PrimitiveArrayData> { - match data { - ArrayDataPrimitive::$v(v) => Some(v), - _ => None, - } - } - - fn downcast(data: ArrayDataPrimitive) -> Option> { - match data { - ArrayDataPrimitive::$v(v) => Some(v), - _ => None, - } - } - - fn upcast(v: PrimitiveArrayData) -> ArrayDataPrimitive { - ArrayDataPrimitive::$v(v) - } - } - }; -} - -primitive!(i8, Int8); -primitive!(i16, Int16); -primitive!(i32, Int32); -primitive!(i64, Int64); -primitive!(i128, Int128); -primitive!(i256, Int256); -primitive!(u8, UInt8); -primitive!(u16, UInt16); -primitive!(u32, UInt32); -primitive!(u64, UInt64); -primitive!(f16, Float16); -primitive!(f32, Float32); -primitive!(f64, Float64); - -/// An enumeration of the types of [`PrimitiveArrayData`] -#[derive(Debug, Clone)] -pub enum ArrayDataPrimitive { - Int8(PrimitiveArrayData), - Int16(PrimitiveArrayData), - Int32(PrimitiveArrayData), - Int64(PrimitiveArrayData), - Int128(PrimitiveArrayData), - Int256(PrimitiveArrayData), - UInt8(PrimitiveArrayData), - UInt16(PrimitiveArrayData), - UInt32(PrimitiveArrayData), - UInt64(PrimitiveArrayData), - Float16(PrimitiveArrayData), - Float32(PrimitiveArrayData), - Float64(PrimitiveArrayData), -} - -impl ArrayDataPrimitive { - /// Downcast this [`ArrayDataPrimitive`] to the corresponding [`PrimitiveArrayData`] - pub fn downcast_ref(&self) -> Option<&PrimitiveArrayData
> { - P::downcast_ref(self) - } - - /// Downcast this [`ArrayDataPrimitive`] to the corresponding [`PrimitiveArrayData`] - pub fn downcast(self) -> Option> { - P::downcast(self) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let s = self; - primitive_op!(s, { s.slice(offset, len).into() }) - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - let s = self; - primitive_op!(s, { s.layout() }) - } - - /// Creates a new [`ArrayDataPrimitive`] from raw buffers - /// - /// # Safety - /// - /// See [`PrimitiveArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw( - builder: ArrayDataBuilder, - primitive: PrimitiveType, - ) -> Self { - use PrimitiveType::*; - match primitive { - Int8 => Self::Int8(PrimitiveArrayData::from_raw(builder)), - Int16 => Self::Int16(PrimitiveArrayData::from_raw(builder)), - Int32 => Self::Int32(PrimitiveArrayData::from_raw(builder)), - Int64 => Self::Int64(PrimitiveArrayData::from_raw(builder)), - Int128 => Self::Int128(PrimitiveArrayData::from_raw(builder)), - Int256 => Self::Int256(PrimitiveArrayData::from_raw(builder)), - UInt8 => Self::UInt8(PrimitiveArrayData::from_raw(builder)), - UInt16 => Self::UInt16(PrimitiveArrayData::from_raw(builder)), - UInt32 => Self::UInt32(PrimitiveArrayData::from_raw(builder)), - UInt64 => Self::UInt64(PrimitiveArrayData::from_raw(builder)), - Float16 => Self::Float16(PrimitiveArrayData::from_raw(builder)), - Float32 => Self::Float32(PrimitiveArrayData::from_raw(builder)), - Float64 => Self::Float64(PrimitiveArrayData::from_raw(builder)), - } - } -} - -impl From> for ArrayDataPrimitive { - fn from(value: PrimitiveArrayData
) -> Self { - P::upcast(value) - } -} - -/// ArrayData for [fixed size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) of [`Primitive`] -#[derive(Debug, Clone)] -pub struct PrimitiveArrayData { - data_type: DataType, - values: ScalarBuffer, - nulls: Option, -} - -impl PrimitiveArrayData { - /// Create a new [`PrimitiveArrayData`] - /// - /// # Panics - /// - /// Panics if - /// - `PhysicalType::from(&data_type) != PhysicalType::Primitive(T::TYPE)` - /// - `nulls` and `values` are different lengths - pub fn new( - data_type: DataType, - values: ScalarBuffer, - nulls: Option, - ) -> Self { - assert_eq!( - PhysicalType::from(&data_type), - PhysicalType::Primitive(T::TYPE), - "Illegal physical type for PrimitiveArrayData of datatype {data_type:?}", - ); - - if let Some(n) = nulls.as_ref() { - assert_eq!(values.len(), n.len()) - } - - Self { - data_type, - values, - nulls, - } - } - - /// Create a new [`PrimitiveArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::Primitive(T::TYPE)` - /// - `nulls` and `values` must be the same length - pub unsafe fn new_unchecked( - data_type: DataType, - values: ScalarBuffer, - nulls: Option, - ) -> Self { - Self { - data_type, - values, - nulls, - } - } - - /// Creates a new [`PrimitiveArrayData`] from an [`ArrayDataBuilder`] - /// - /// # Safety - /// - /// See [`PrimitiveArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - let values = builder.buffers.into_iter().next().unwrap(); - let values = ScalarBuffer::new(values, builder.offset, builder.len); - Self { - values, - data_type: builder.data_type, - nulls: builder.nulls, - } - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the primitive values - #[inline] - pub fn values(&self) -> &ScalarBuffer { - &self.values - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`PrimitiveArrayData`] - pub fn into_parts(self) -> (DataType, ScalarBuffer, Option) { - (self.data_type, self.values, self.nulls) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - data_type: self.data_type.clone(), - values: self.values.slice(offset, len), - nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.values.len(), - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::one(self.values.inner()), - child_data: &[], - } - } -} diff --git a/arrow-data/src/data/run.rs b/arrow-data/src/data/run.rs deleted file mode 100644 index 7f80206a70fa..000000000000 --- a/arrow-data/src/data/run.rs +++ /dev/null @@ -1,277 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::data::primitive::{Primitive, PrimitiveArrayData}; -use crate::data::types::RunEndType; -use crate::data::ArrayDataLayout; -use crate::{ArrayData, ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::{RunEndBuffer, ScalarBuffer}; -use arrow_buffer::ArrowNativeType; -use arrow_schema::DataType; - -mod private { - use super::*; - - pub trait RunEndSealed { - const ENDS_TYPE: DataType; - - /// Downcast [`ArrayDataRun`] to `[RunArrayData`] - fn downcast_ref(data: &ArrayDataRun) -> Option<&RunArrayData> - where - Self: RunEnd; - - /// Downcast [`ArrayDataRun`] to `[RunArrayData`] - fn downcast(data: ArrayDataRun) -> Option> - where - Self: RunEnd; - - /// Cast [`RunArrayData`] to [`ArrayDataRun`] - fn upcast(v: RunArrayData) -> ArrayDataRun - where - Self: RunEnd; - } -} - -pub trait RunEnd: private::RunEndSealed + ArrowNativeType + Primitive { - const TYPE: RunEndType; -} - -macro_rules! run_end { - ($t:ty,$v:ident) => { - impl RunEnd for $t { - const TYPE: RunEndType = RunEndType::$v; - } - impl private::RunEndSealed for $t { - const ENDS_TYPE: DataType = DataType::$v; - - fn downcast_ref(data: &ArrayDataRun) -> Option<&RunArrayData> { - match data { - ArrayDataRun::$v(v) => Some(v), - _ => None, - } - } - - fn downcast(data: ArrayDataRun) -> Option> { - match data { - ArrayDataRun::$v(v) => Some(v), - _ => None, - } - } - - fn upcast(v: RunArrayData) -> ArrayDataRun { - ArrayDataRun::$v(v) - } - } - }; -} - -run_end!(i16, Int16); -run_end!(i32, Int32); -run_end!(i64, Int64); - -/// Applies op to each variant of [`ArrayDataRun`] -macro_rules! 
run_op { - ($array:ident, $op:block) => { - match $array { - ArrayDataRun::Int16($array) => $op - ArrayDataRun::Int32($array) => $op - ArrayDataRun::Int64($array) => $op - } - }; -} - -/// An enumeration of the types of [`RunArrayData`] -#[derive(Debug, Clone)] -pub enum ArrayDataRun { - Int16(RunArrayData), - Int32(RunArrayData), - Int64(RunArrayData), -} - -impl ArrayDataRun { - /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`] - pub fn downcast_ref(&self) -> Option<&RunArrayData> { - ::downcast_ref(self) - } - - /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`] - pub fn downcast(self) -> Option> { - ::downcast(self) - } - - /// Returns the values of this [`ArrayDataRun`] - #[inline] - pub fn values(&self) -> &ArrayData { - let s = self; - run_op!(s, { s.values() }) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let s = self; - run_op!(s, { s.slice(offset, len).into() }) - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - let s = self; - run_op!(s, { s.layout() }) - } - - /// Creates a new [`ArrayDataRun`] from raw buffers - /// - /// # Safety - /// - /// See [`RunArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, run: RunEndType) -> Self { - use RunEndType::*; - match run { - Int16 => Self::Int16(RunArrayData::from_raw(builder)), - Int32 => Self::Int32(RunArrayData::from_raw(builder)), - Int64 => Self::Int64(RunArrayData::from_raw(builder)), - } - } -} - -impl From> for ArrayDataRun { - fn from(value: RunArrayData) -> Self { - ::upcast(value) - } -} - -/// ArrayData for [run-end encoded arrays](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout) -#[derive(Debug, Clone)] -pub struct RunArrayData { - data_type: DataType, - run_ends: RunEndBuffer, - /// The children of this RunArrayData: - /// 1: the run ends - /// 2: the values - /// - /// We store an array so that a slice can be returned in [`RunArrayData::layout`] - children: Box<[ArrayData; 2]>, -} - -impl RunArrayData { - /// Create a new [`RunArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::Run(E::TYPE)` - /// - `run_ends` must contain monotonically increasing, positive values `<= len` - /// - `run_ends.get_end_physical_index() < values.len()` - pub unsafe fn new_unchecked( - data_type: DataType, - run_ends: RunEndBuffer, - values: ArrayData, - ) -> Self { - let inner = run_ends.inner(); - let child = ArrayDataBuilder::new(E::ENDS_TYPE) - .len(inner.len()) - .buffers(vec![inner.inner().clone()]) - .build_unchecked(); - - Self { - data_type, - run_ends, - children: Box::new([child, values]), - } - } - - /// Creates a new [`RunArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`RunArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - let mut iter = builder.child_data.into_iter(); - let child1 = iter.next().unwrap(); - let child2 = iter.next().unwrap(); - - let p = ScalarBuffer::new(child1.buffers[0].clone(), child1.offset, child1.len); - let run_ends = RunEndBuffer::new_unchecked(p, builder.offset, builder.len); - - Self { - run_ends, - data_type: builder.data_type, - children: Box::new([child1, child2]), - } - } - - /// Returns the length - #[inline] - pub fn len(&self) -> usize { - self.run_ends.len() - } - - /// Returns the offset - #[inline] - pub fn offset(&self) -> usize { - 
self.run_ends.offset() - } - - /// Returns true if this array is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.run_ends.is_empty() - } - - /// Returns the run ends - #[inline] - pub fn run_ends(&self) -> &RunEndBuffer { - &self.run_ends - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the child data - #[inline] - pub fn values(&self) -> &ArrayData { - &self.children[1] - } - - /// Returns the underlying parts of this [`RunArrayData`] - pub fn into_parts(self) -> (DataType, RunEndBuffer, ArrayData) { - let child = self.children.into_iter().nth(1).unwrap(); - (self.data_type, self.run_ends, child) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - data_type: self.data_type.clone(), - run_ends: self.run_ends.slice(offset, len), - children: self.children.clone(), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.run_ends.len(), - offset: self.run_ends.offset(), - nulls: None, - buffers: Buffers::default(), - child_data: self.children.as_ref(), - } - } -} diff --git a/arrow-data/src/data/struct.rs b/arrow-data/src/data/struct.rs deleted file mode 100644 index 229c10912a59..000000000000 --- a/arrow-data/src/data/struct.rs +++ /dev/null @@ -1,129 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use crate::data::ArrayDataLayout; -use crate::{ArrayData, ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::NullBuffer; -use arrow_schema::DataType; - -/// ArrayData for [struct arrays](https://arrow.apache.org/docs/format/Columnar.html#struct-layout) -#[derive(Debug, Clone)] -pub struct StructArrayData { - data_type: DataType, - len: usize, - nulls: Option, - children: Vec, -} - -impl StructArrayData { - /// Create a new [`StructArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::Struct` - /// - all child data and nulls must have length matching `len` - pub unsafe fn new_unchecked( - data_type: DataType, - len: usize, - nulls: Option, - children: Vec, - ) -> Self { - Self { - data_type, - len, - nulls, - children, - } - } - - /// Creates a new [`StructArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`StructArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder) -> Self { - let children = builder - .child_data - .into_iter() - .map(|x| x.slice(builder.offset, builder.len)) - .collect(); - - Self { - data_type: builder.data_type, - len: builder.len, - nulls: builder.nulls, - children, - } - } - - /// Returns the length of this [`StructArrayData`] - #[inline] - pub fn len(&self) -> usize { - self.len - } - - /// Returns `true` if this [`StructArrayData`] has zero length - #[inline] - pub fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Returns the null buffer if any - #[inline] - pub fn nulls(&self) -> Option<&NullBuffer> { - self.nulls.as_ref() - } - - /// Returns the primitive values - #[inline] - pub fn children(&self) -> &[ArrayData] { - &self.children - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`StructArrayData`] - pub fn into_parts(self) -> (DataType, Option, Vec) { - (self.data_type, self.nulls, self.children) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - Self { - len, - data_type: self.data_type.clone(), - nulls: self.nulls.as_ref().map(|x| x.slice(offset, len)), - children: self.children.iter().map(|c| c.slice(offset, len)).collect(), - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - ArrayDataLayout { - data_type: &self.data_type, - len: self.len, - offset: 0, - nulls: self.nulls.as_ref(), - buffers: Buffers::default(), - child_data: &self.children, - } - } -} diff --git a/arrow-data/src/data/types.rs b/arrow-data/src/data/types.rs deleted file mode 100644 index bb65b42124f3..000000000000 --- a/arrow-data/src/data/types.rs +++ /dev/null @@ -1,152 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow_schema::{DataType, IntervalUnit, UnionMode}; - -/// An enumeration of the primitive types implementing [`ArrowNativeType`](arrow_buffer::ArrowNativeType) -#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub enum PrimitiveType { - Int8, - Int16, - Int32, - Int64, - Int128, - Int256, - UInt8, - UInt16, - UInt32, - UInt64, - Float16, - Float32, - Float64, -} - -/// An enumeration of the types of offsets for variable length encodings -#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub enum OffsetType { - Int32, - Int64, -} - -/// An enumeration of the types of variable length byte arrays -#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub enum BytesType { - Binary, - Utf8, -} - -/// An enumeration of the types of dictionary key -#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub enum DictionaryKeyType { - Int8, - Int16, - Int32, - Int64, - UInt8, - UInt16, - UInt32, - UInt64, -} - -/// An enumeration of the types of run key -#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub enum RunEndType { - Int16, - Int32, - Int64, -} - -/// Describes the physical representation of a given [`DataType`] -#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub enum PhysicalType { - Null, - Boolean, - Primitive(PrimitiveType), - FixedSizeBinary(usize), - Bytes(OffsetType, BytesType), - FixedSizeList(usize), - List(OffsetType), - Struct, - Union(UnionMode), - Dictionary(DictionaryKeyType), - Run(RunEndType), -} - -impl From<&DataType> for PhysicalType { - fn from(value: &DataType) -> Self { - match value { - DataType::Null => Self::Null, - DataType::Boolean => Self::Boolean, - DataType::Int8 => Self::Primitive(PrimitiveType::Int8), - DataType::Int16 => Self::Primitive(PrimitiveType::Int16), - DataType::Int32 => Self::Primitive(PrimitiveType::Int32), - DataType::Int64 => Self::Primitive(PrimitiveType::Int64), - DataType::UInt8 => Self::Primitive(PrimitiveType::UInt8), - DataType::UInt16 => Self::Primitive(PrimitiveType::UInt16), - DataType::UInt32 => Self::Primitive(PrimitiveType::UInt32), - DataType::UInt64 => Self::Primitive(PrimitiveType::UInt64), - DataType::Float16 => Self::Primitive(PrimitiveType::Float16), - DataType::Float32 => Self::Primitive(PrimitiveType::Float32), - DataType::Float64 => Self::Primitive(PrimitiveType::Float64), - DataType::Timestamp(_, _) => Self::Primitive(PrimitiveType::Int64), - DataType::Date32 => Self::Primitive(PrimitiveType::Int32), - DataType::Date64 => Self::Primitive(PrimitiveType::Int64), - DataType::Time32(_) => Self::Primitive(PrimitiveType::Int32), - DataType::Time64(_) => Self::Primitive(PrimitiveType::Int64), - DataType::Duration(_) => Self::Primitive(PrimitiveType::Int64), - DataType::Decimal128(_, _) => Self::Primitive(PrimitiveType::Int128), - DataType::Decimal256(_, _) => Self::Primitive(PrimitiveType::Int256), - DataType::Interval(IntervalUnit::YearMonth) => { - Self::Primitive(PrimitiveType::Int32) - } - DataType::Interval(IntervalUnit::DayTime) => { - Self::Primitive(PrimitiveType::Int64) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - Self::Primitive(PrimitiveType::Int128) - } - DataType::FixedSizeBinary(size) => Self::FixedSizeBinary(*size as usize), - DataType::Binary => Self::Bytes(OffsetType::Int32, BytesType::Binary), - DataType::LargeBinary => Self::Bytes(OffsetType::Int64, BytesType::Binary), - 
DataType::Utf8 => Self::Bytes(OffsetType::Int32, BytesType::Utf8), - DataType::LargeUtf8 => Self::Bytes(OffsetType::Int64, BytesType::Utf8), - DataType::List(_) => Self::List(OffsetType::Int32), - DataType::FixedSizeList(_, size) => Self::FixedSizeList(*size as usize), - DataType::LargeList(_) => Self::List(OffsetType::Int64), - DataType::Struct(_) => Self::Struct, - DataType::Union(_, _, mode) => Self::Union(*mode), - DataType::Dictionary(k, _) => match k.as_ref() { - DataType::Int8 => Self::Dictionary(DictionaryKeyType::Int8), - DataType::Int16 => Self::Dictionary(DictionaryKeyType::Int16), - DataType::Int32 => Self::Dictionary(DictionaryKeyType::Int32), - DataType::Int64 => Self::Dictionary(DictionaryKeyType::Int64), - DataType::UInt8 => Self::Dictionary(DictionaryKeyType::UInt8), - DataType::UInt16 => Self::Dictionary(DictionaryKeyType::UInt16), - DataType::UInt32 => Self::Dictionary(DictionaryKeyType::UInt32), - DataType::UInt64 => Self::Dictionary(DictionaryKeyType::UInt64), - d => panic!("illegal dictionary key data type {d}"), - }, - DataType::Map(_, _) => Self::List(OffsetType::Int32), - DataType::RunEndEncoded(f, _) => match f.data_type() { - DataType::Int16 => Self::Run(RunEndType::Int16), - DataType::Int32 => Self::Run(RunEndType::Int32), - DataType::Int64 => Self::Run(RunEndType::Int64), - d => panic!("illegal run end data type {d}"), - }, - } - } -} diff --git a/arrow-data/src/data/union.rs b/arrow-data/src/data/union.rs deleted file mode 100644 index 7d53a1f18067..000000000000 --- a/arrow-data/src/data/union.rs +++ /dev/null @@ -1,171 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use crate::data::ArrayDataLayout; -use crate::{ArrayData, ArrayDataBuilder, Buffers}; -use arrow_buffer::buffer::ScalarBuffer; -use arrow_schema::{DataType, UnionMode}; - -/// ArrayData for [union arrays](https://arrow.apache.org/docs/format/Columnar.html#union-layout) -#[derive(Debug, Clone)] -pub struct UnionArrayData { - data_type: DataType, - type_ids: ScalarBuffer, - offsets: Option>, - children: Vec, -} - -impl UnionArrayData { - /// Creates a new [`UnionArrayData`] - /// - /// # Safety - /// - /// - `PhysicalType::from(&data_type) == PhysicalType::Union(mode)` - /// - `offsets` is `Some` iff the above `mode == UnionMode::Sparse` - /// - `type_ids` must only contain values corresponding to a field in `data_type` - /// - `children` must match the field definitions in `data_type` - /// - For each value id in type_ids, the corresponding offset, must be in bounds for the child - pub unsafe fn new_unchecked( - data_type: DataType, - type_ids: ScalarBuffer, - offsets: Option>, - children: Vec, - ) -> Self { - Self { - data_type, - type_ids, - offsets, - children, - } - } - - /// Creates a new [`UnionArrayData`] from raw buffers - /// - /// # Safety - /// - /// See [`UnionArrayData::new_unchecked`] - pub(crate) unsafe fn from_raw(builder: ArrayDataBuilder, mode: UnionMode) -> Self { - match mode { - UnionMode::Sparse => { - let type_ids = builder.buffers.into_iter().next().unwrap(); - let type_ids = ScalarBuffer::new(type_ids, builder.offset, builder.len); - let children = builder - .child_data - .into_iter() - .map(|x| x.slice(builder.offset, builder.len)) - .collect(); - - Self { - type_ids, - children, - data_type: builder.data_type, - offsets: None, - } - } - UnionMode::Dense => { - let mut iter = builder.buffers.into_iter(); - let type_ids = iter.next().unwrap(); - let offsets = iter.next().unwrap(); - let type_ids = ScalarBuffer::new(type_ids, builder.offset, builder.len); - let offsets = ScalarBuffer::new(offsets, builder.offset, builder.len); - - Self { - type_ids, - data_type: builder.data_type, - offsets: Some(offsets), - children: builder.child_data, - } - } - } - } - - /// Returns the length of this array - #[inline] - pub fn len(&self) -> usize { - self.type_ids.len() - } - - /// Returns the type ids for this array - #[inline] - pub fn type_ids(&self) -> &ScalarBuffer { - &self.type_ids - } - - /// Returns the offsets for this array if this is a dense union - #[inline] - pub fn offsets(&self) -> Option<&ScalarBuffer> { - self.offsets.as_ref() - } - - /// Returns the children of this array - #[inline] - pub fn children(&self) -> &[ArrayData] { - &self.children - } - - /// Returns the data type of this array - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns the underlying parts of this [`UnionArrayData`] - pub fn into_parts( - self, - ) -> ( - DataType, - ScalarBuffer, - Option>, - Vec, - ) { - (self.data_type, self.type_ids, self.offsets, self.children) - } - - /// Returns a zero-copy slice of this array - pub fn slice(&self, offset: usize, len: usize) -> Self { - let (offsets, children) = match &self.offsets { - Some(offsets) => (Some(offsets.slice(offset, len)), self.children.clone()), - None => ( - None, - self.children.iter().map(|c| c.slice(offset, len)).collect(), - ), - }; - Self { - data_type: self.data_type.clone(), - type_ids: self.type_ids.slice(offset, len), - offsets, - children, - } - } - - /// Returns an [`ArrayDataLayout`] representation of this - pub(crate) fn layout(&self) -> ArrayDataLayout<'_> { - let buffers = 
match &self.offsets { - Some(offsets) => Buffers::two(self.type_ids.inner(), offsets.inner()), - None => Buffers::one(self.type_ids.inner()), - }; - - ArrayDataLayout { - data_type: &self.data_type, - len: self.type_ids.len(), - offset: 0, - nulls: None, - buffers, - child_data: &self.children, - } - } -} From 2730da13187df85c064e4715032d2d7f4ab92bff Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 21 Mar 2023 22:05:01 +0000 Subject: [PATCH 0708/1411] Use BooleanBuffer in BooleanArray (#3879) (#3895) * Use BooleanBuffer in BooleanArray (#3879) * Review feedback --- arrow-array/src/array/boolean_array.rs | 33 ++++++++--------- arrow-array/src/builder/boolean_builder.rs | 2 +- arrow-array/src/builder/primitive_builder.rs | 2 +- arrow-buffer/src/buffer/boolean.rs | 39 ++++++++++++++++++-- arrow-buffer/src/buffer/null.rs | 2 +- arrow-ord/src/comparison.rs | 17 +++------ arrow-select/src/filter.rs | 17 +++------ arrow-select/src/take.rs | 2 +- 8 files changed, 66 insertions(+), 48 deletions(-) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index a7ed870ed5cb..c5775ad3b959 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -19,7 +19,7 @@ use crate::array::print_long_array; use crate::builder::BooleanBuilder; use crate::iterator::BooleanIter; use crate::{Array, ArrayAccessor, ArrayRef}; -use arrow_buffer::{bit_util, Buffer, MutableBuffer, NullBuffer}; +use arrow_buffer::{bit_util, BooleanBuffer, Buffer, MutableBuffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use std::any::Any; @@ -67,7 +67,7 @@ use std::sync::Arc; #[derive(Clone)] pub struct BooleanArray { data: ArrayData, - raw_values: Buffer, + values: BooleanBuffer, } impl std::fmt::Debug for BooleanArray { @@ -96,11 +96,9 @@ impl BooleanArray { BooleanBuilder::with_capacity(capacity) } - /// Returns a `Buffer` holding all the values of this array. - /// - /// Note this doesn't take the offset of this array into account. - pub fn values(&self) -> &Buffer { - &self.raw_values + /// Returns the underlying [`BooleanBuffer`] holding all the values of this array + pub fn values(&self) -> &BooleanBuffer { + &self.values } /// Returns the number of non null, true values within this array @@ -108,7 +106,7 @@ impl BooleanArray { match self.data.nulls() { Some(nulls) => { let null_chunks = nulls.inner().bit_chunks(); - let value_chunks = self.values().bit_chunks(self.offset(), self.len()); + let value_chunks = self.values().bit_chunks(); null_chunks .iter() .zip(value_chunks.iter()) @@ -119,9 +117,7 @@ impl BooleanArray { .map(|(a, b)| (a & b).count_ones() as usize) .sum() } - None => self - .values() - .count_set_bits_offset(self.offset(), self.len()), + None => self.values().count_set_bits(), } } @@ -135,8 +131,7 @@ impl BooleanArray { /// # Safety /// This doesn't check bounds, the caller must ensure that index < self.len() pub unsafe fn value_unchecked(&self, i: usize) -> bool { - let offset = i + self.offset(); - bit_util::get_bit_raw(self.raw_values.as_ptr(), offset) + self.values.value_unchecked(i) } /// Returns the boolean value at index `i`. 
@@ -329,8 +324,10 @@ impl From for BooleanArray { 1, "BooleanArray data should contain a single buffer only (values buffer)" ); - let raw_values = data.buffers()[0].clone(); - Self { data, raw_values } + let values = + BooleanBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); + + Self { data, values } } } @@ -424,7 +421,7 @@ mod tests { fn test_boolean_array_from_vec() { let buf = Buffer::from([10_u8]); let arr = BooleanArray::from(vec![false, true, false, true]); - assert_eq!(&buf, arr.values()); + assert_eq!(&buf, arr.values().inner()); assert_eq!(4, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); @@ -439,7 +436,7 @@ mod tests { fn test_boolean_array_from_vec_option() { let buf = Buffer::from([10_u8]); let arr = BooleanArray::from(vec![Some(false), Some(true), None, Some(true)]); - assert_eq!(&buf, arr.values()); + assert_eq!(&buf, arr.values().inner()); assert_eq!(4, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(1, arr.null_count()); @@ -501,7 +498,7 @@ mod tests { .build() .unwrap(); let arr = BooleanArray::from(data); - assert_eq!(&buf2, arr.values()); + assert_eq!(&buf2, arr.values().inner()); assert_eq!(5, arr.len()); assert_eq!(2, arr.offset()); assert_eq!(0, arr.null_count()); diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index 0002309a3d55..bc3b62f99234 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -240,7 +240,7 @@ mod tests { } let arr = builder.finish(); - assert_eq!(&buf, arr.values()); + assert_eq!(&buf, arr.values().inner()); assert_eq!(10, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 71671fe7db53..1c2cd908ca26 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -448,7 +448,7 @@ mod tests { } let arr = builder.finish(); - assert_eq!(&buf, arr.values()); + assert_eq!(&buf, arr.values().inner()); assert_eq!(10, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 43b74c6031af..9d5953594d5d 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -16,8 +16,8 @@ // under the License. use crate::bit_chunk_iterator::BitChunks; -use crate::{bit_util, buffer_bin_and, buffer_bin_or, Buffer}; -use std::ops::{BitAnd, BitOr}; +use crate::{bit_util, buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer}; +use std::ops::{BitAnd, BitOr, Not}; /// A slice-able [`Buffer`] containing bit-packed booleans #[derive(Debug, Clone, Eq)] @@ -77,9 +77,9 @@ impl BooleanBuffer { /// /// Panics if `i >= self.len()` #[inline] + #[deprecated(note = "use BooleanBuffer::value")] pub fn is_set(&self, i: usize) -> bool { - assert!(i < self.len); - unsafe { bit_util::get_bit_raw(self.buffer.as_ptr(), i + self.offset) } + self.value(i) } /// Returns the offset of this [`BooleanBuffer`] in bits @@ -100,6 +100,25 @@ impl BooleanBuffer { self.len == 0 } + /// Returns the boolean value at index `i`. + /// + /// # Panics + /// + /// Panics if `i >= self.len()` + pub fn value(&self, idx: usize) -> bool { + assert!(idx < self.len); + unsafe { self.value_unchecked(idx) } + } + + /// Returns the boolean value at index `i`. 
+ /// + /// # Safety + /// This doesn't check bounds, the caller must ensure that index < self.len() + #[inline] + pub unsafe fn value_unchecked(&self, i: usize) -> bool { + unsafe { bit_util::get_bit_raw(self.buffer.as_ptr(), i + self.offset) } + } + /// Returns the packed values of this [`BooleanBuffer`] not including any offset #[inline] pub fn values(&self) -> &[u8] { @@ -147,6 +166,18 @@ impl BooleanBuffer { } } +impl Not for &BooleanBuffer { + type Output = BooleanBuffer; + + fn not(self) -> Self::Output { + BooleanBuffer { + buffer: buffer_unary_not(&self.buffer, self.offset, self.len), + offset: 0, + len: self.len, + } + } +} + impl BitAnd<&BooleanBuffer> for &BooleanBuffer { type Output = BooleanBuffer; diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index 2f8c864ca957..cbadb7f42dbf 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -94,7 +94,7 @@ impl NullBuffer { /// Returns `true` if the value at `idx` is not null #[inline] pub fn is_valid(&self, idx: usize) -> bool { - self.buffer.is_set(idx) + self.buffer.value(idx) } /// Returns `true` if the value at `idx` is null diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 76760f8bc4f5..eb672e769ac3 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -26,7 +26,6 @@ use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::buffer::buffer_unary_not; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayData; @@ -196,23 +195,19 @@ pub fn eq_bool_scalar( left: &BooleanArray, right: bool, ) -> Result { - let len = left.len(); - let left_offset = left.offset(); - - let values = if right { - left.values().bit_slice(left_offset, len) - } else { - buffer_unary_not(left.values(), left.offset(), left.len()) + let values = match right { + true => left.values().clone(), + false => !left.values(), }; let data = unsafe { ArrayData::new_unchecked( DataType::Boolean, - len, + values.len(), None, left.nulls().map(|b| b.inner().sliced()), - 0, - vec![values], + values.offset(), + vec![values.into_inner()], vec![], ) }; diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index a75acda79583..35c11970c0f6 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -24,7 +24,7 @@ use arrow_array::cast::{as_generic_binary_array, as_largestring_array, as_string use arrow_array::types::{ArrowDictionaryKeyType, ByteArrayType}; use arrow_array::*; use arrow_buffer::bit_util; -use arrow_buffer::{buffer::buffer_bin_and, Buffer, MutableBuffer}; +use arrow_buffer::{Buffer, MutableBuffer}; use arrow_data::bit_iterator::{BitIndexIterator, BitSliceIterator}; use arrow_data::transform::MutableArrayData; use arrow_data::{ArrayData, ArrayDataBuilder}; @@ -109,9 +109,7 @@ impl<'a> Iterator for IndexIterator<'a> { /// Counts the number of set bits in `filter` fn filter_count(filter: &BooleanArray) -> usize { - filter - .values() - .count_set_bits_offset(filter.offset(), filter.len()) + filter.values().count_set_bits() } /// Function that can filter arbitrary arrays @@ -154,15 +152,12 @@ pub fn build_filter(filter: &BooleanArray) -> Result { pub fn prep_null_mask_filter(filter: &BooleanArray) -> BooleanArray { let array_data = filter.data_ref(); let nulls = array_data.nulls().unwrap(); - let mask = filter.values(); - let offset = filter.offset(); - - let new_mask = - buffer_bin_and(mask, offset, nulls.buffer(), 
nulls.offset(), filter.len()); + let mask = filter.values() & nulls.inner(); let array_data = ArrayData::builder(DataType::Boolean) - .len(filter.len()) - .add_buffer(new_mask); + .len(mask.len()) + .offset(mask.offset()) + .add_buffer(mask.into_inner()); let array_data = unsafe { array_data.build_unchecked() }; diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 741b05493ea4..421157bdf041 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -524,7 +524,7 @@ where IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, { - let val_buf = take_bits(values.values(), values.offset(), indices)?; + let val_buf = take_bits(values.values().inner(), values.offset(), indices)?; let null_buf = match values.nulls() { Some(nulls) if nulls.null_count() > 0 => { Some(take_bits(nulls.buffer(), nulls.offset(), indices)?) From a498a0342a15a7ddfa10c17752b0cc258bd80afd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 22 Mar 2023 10:01:53 +0000 Subject: [PATCH 0709/1411] Return ScalarBuffer from PrimitiveArray::values (#3879) (#3896) * Return ScalarBuffer from PrimitiveArray::values (#3879) * Fix docs * Review feedback --- arrow-array/src/array/primitive_array.rs | 4 +-- arrow-buffer/src/buffer/scalar.rs | 33 ++++++++++++++++++++++++ arrow/src/lib.rs | 6 ++--- 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 78859bd5956f..241e2a051197 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -275,9 +275,9 @@ impl PrimitiveArray { self.data.is_empty() } - /// Returns a slice of the values of this array + /// Returns the values of this array #[inline] - pub fn values(&self) -> &[T::Native] { + pub fn values(&self) -> &ScalarBuffer { &self.raw_values } diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 9b3a47785098..04c6d9dcc7ac 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -114,6 +114,39 @@ impl From> for ScalarBuffer { } } +impl<'a, T: ArrowNativeType> IntoIterator for &'a ScalarBuffer { + type Item = &'a T; + type IntoIter = std::slice::Iter<'a, T>; + + fn into_iter(self) -> Self::IntoIter { + self.as_ref().iter() + } +} + +impl + ?Sized> PartialEq for ScalarBuffer { + fn eq(&self, other: &S) -> bool { + self.as_ref().eq(other.as_ref()) + } +} + +impl PartialEq> for [T; N] { + fn eq(&self, other: &ScalarBuffer) -> bool { + self.as_ref().eq(other.as_ref()) + } +} + +impl PartialEq> for [T] { + fn eq(&self, other: &ScalarBuffer) -> bool { + self.as_ref().eq(other.as_ref()) + } +} + +impl PartialEq> for Vec { + fn eq(&self, other: &ScalarBuffer) -> bool { + self.as_slice().eq(other.as_ref()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 3d1bced298c9..4b1251ebcd2b 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -69,7 +69,7 @@ //! //! let collected: Vec<_> = array.iter().collect(); //! assert_eq!(collected, vec![Some(1), None, Some(3)]); -//! assert_eq!(array.values(), [1, 0, 3]) +//! assert_eq!(array.values(), &[1, 0, 3]) //! ``` //! //! It is also possible to write generic code. For example, the following is generic over @@ -168,7 +168,7 @@ //! //! let array = parse_strings(["1", "2", "3"], DataType::Int32); //! let integers = array.as_any().downcast_ref::().unwrap(); -//! assert_eq!(integers.values(), [1, 2, 3]) +//! 
assert_eq!(integers.values(), &[1, 2, 3]) //! ``` //! //! # Compute Kernels @@ -192,7 +192,7 @@ //! //! let array = parse_strings(["1", "2", "3"], &DataType::UInt32).unwrap(); //! let integers = array.as_any().downcast_ref::().unwrap(); -//! assert_eq!(integers.values(), [1, 2, 3]) +//! assert_eq!(integers.values(), &[1, 2, 3]) //! ``` //! //! This module also implements many common vertical operations: From 8ac54d33d199b464a27cf58b234f0daf7cd453b5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 22 Mar 2023 13:57:37 +0000 Subject: [PATCH 0710/1411] Fix parsing timestamps of exactly 32 characters (#3902) * Fix parsing timestamps of exactly 32 characters * Format --- arrow-cast/src/parse.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 710a6a4979c1..4acd2b3376be 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -206,7 +206,7 @@ pub fn string_to_datetime( if tz_offset == 32 { // Decimal overrun - while bytes[tz_offset].is_ascii_digit() && tz_offset < bytes.len() { + while tz_offset < bytes.len() && bytes[tz_offset].is_ascii_digit() { tz_offset += 1; } } @@ -1083,6 +1083,22 @@ mod tests { } } + #[test] + fn string_to_timestamp_naive() { + let cases = [ + "2018-11-13T17:11:10.011375885995", + "2030-12-04T17:11:10.123", + "2030-12-04T17:11:10.1234", + "2030-12-04T17:11:10.123456", + ]; + for case in cases { + let chrono = + NaiveDateTime::parse_from_str(case, "%Y-%m-%dT%H:%M:%S%.f").unwrap(); + let custom = string_to_datetime(&Utc, case).unwrap(); + assert_eq!(chrono, custom.naive_utc()) + } + } + #[test] fn string_to_timestamp_invalid() { // Test parsing invalid formats From 1dde86dff800fb9d25f01dd3ea032d1966a1ecc4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 22 Mar 2023 15:40:04 +0000 Subject: [PATCH 0711/1411] Update proc-macro2 requirement from =1.0.52 to =1.0.53 (#3905) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.52...1.0.53) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index fa333889bfc7..5f839ca6838a 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -67,7 +67,7 @@ tower = "0.4.13" [build-dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.52", default-features = false } +proc-macro2 = { version = "=1.0.53", default-features = false } prost-build = { version = "=0.11.8", default-features = false } tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } From dc23fa3badbc9605e2439983439befaca0647fe9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 22 Mar 2023 18:21:51 +0000 Subject: [PATCH 0712/1411] Improve ScalarBuffer debug output (#3907) --- arrow-buffer/src/buffer/scalar.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 04c6d9dcc7ac..4c16a736b10b 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -17,6 +17,7 @@ use crate::buffer::Buffer; use crate::native::ArrowNativeType; +use std::fmt::Formatter; use std::marker::PhantomData; use std::ops::Deref; @@ -26,13 +27,19 @@ use std::ops::Deref; /// /// All [`ArrowNativeType`] are valid for all possible backing byte representations, and as /// a result they are "trivially safely transmutable". -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct ScalarBuffer { /// Underlying data buffer buffer: Buffer, phantom: PhantomData, } +impl std::fmt::Debug for ScalarBuffer { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("ScalarBuffer").field(&self.as_ref()).finish() + } +} + impl ScalarBuffer { /// Create a new [`ScalarBuffer`] from a [`Buffer`], and an `offset` /// and `length` in units of `T` @@ -168,6 +175,12 @@ mod tests { assert!(typed.is_empty()); } + #[test] + fn test_debug() { + let buffer = ScalarBuffer::from(vec![1, 2, 3]); + assert_eq!(format!("{buffer:?}"), "ScalarBuffer([1, 2, 3])"); + } + #[test] #[should_panic(expected = "memory is not aligned")] fn test_unaligned() { From 0e80ce601299977c4373e75c5d0d6c8350c5ddd9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 22 Mar 2023 18:44:32 +0000 Subject: [PATCH 0713/1411] Re-export parquet compression level structs (#3903) --- parquet/src/basic.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 9f4f4ee1d1d6..266c0436bb2c 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -20,7 +20,7 @@ use std::{fmt, str}; -use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel}; +pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel}; use crate::format as parquet; use crate::errors::{ParquetError, Result}; From b1cfe84bc0f5e1ef95e299f5f16e6f8b5f4ee7d5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 22 Mar 2023 18:47:06 +0000 Subject: [PATCH 0714/1411] Array equality for &dyn Array (#3880) (#3899) --- arrow-array/src/array/mod.rs | 4 +- arrow/tests/array_equal.rs | 214 
++++++++++++++--------------------- 2 files changed, 88 insertions(+), 130 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 1ddcc2881863..9afefc07f8d4 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -425,13 +425,13 @@ pub trait ArrayAccessor: Array { unsafe fn value_unchecked(&self, index: usize) -> Self::Item; } -impl PartialEq for dyn Array { +impl PartialEq for dyn Array + '_ { fn eq(&self, other: &Self) -> bool { self.data().eq(other.data()) } } -impl PartialEq for dyn Array { +impl PartialEq for dyn Array + '_ { fn eq(&self, other: &T) -> bool { self.data().eq(other.data()) } diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index d24a24e2ea48..b6f81f6a4c1a 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -23,6 +23,7 @@ use arrow::array::{ }; use arrow::datatypes::{Int16Type, Int32Type}; use arrow_array::builder::{StringBuilder, StructBuilder}; +use arrow_array::{DictionaryArray, FixedSizeListArray}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{DataType, Field}; @@ -31,14 +32,11 @@ use std::sync::Arc; #[test] fn test_null_equal() { let a = NullArray::new(12); - let a = a.data(); let b = NullArray::new(12); - let b = b.data(); - test_equal(a, b, true); + test_equal(&a, &b, true); let b = NullArray::new(10); - let b = b.data(); - test_equal(a, b, false); + test_equal(&a, &b, false); // Test the case where offset != 0 @@ -54,69 +52,53 @@ fn test_null_equal() { #[test] fn test_boolean_equal() { let a = BooleanArray::from(vec![false, false, true]); - let a = a.data(); let b = BooleanArray::from(vec![false, false, true]); - let b = b.data(); - test_equal(a, b, true); + test_equal(&a, &b, true); let b = BooleanArray::from(vec![false, false, false]); - let b = b.data(); - test_equal(a, b, false); + test_equal(&a, &b, false); } #[test] fn test_boolean_equal_nulls() { let a = BooleanArray::from(vec![Some(false), None, None, Some(true)]); - let a = a.data(); let b = BooleanArray::from(vec![Some(false), None, None, Some(true)]); - let b = b.data(); - test_equal(a, b, true); + test_equal(&a, &b, true); let b = BooleanArray::from(vec![None, None, None, Some(true)]); - let b = b.data(); - test_equal(a, b, false); + test_equal(&a, &b, false); let b = BooleanArray::from(vec![Some(true), None, None, Some(true)]); - let b = b.data(); - test_equal(a, b, false); + test_equal(&a, &b, false); } #[test] fn test_boolean_equal_offset() { let a = BooleanArray::from(vec![false, true, false, true, false, false, true]); - let a = a.data(); let b = BooleanArray::from(vec![true, false, false, false, true, false, true, true]); - let b = b.data(); - assert_ne!(a, b); - assert_ne!(b, a); + test_equal(&a, &b, false); let a_slice = a.slice(2, 3); let b_slice = b.slice(3, 3); - assert_eq!(a_slice, b_slice); - assert_eq!(b_slice, a_slice); + test_equal(&a_slice, &b_slice, true); let a_slice = a.slice(3, 4); let b_slice = b.slice(4, 4); - assert_ne!(a_slice, b_slice); - assert_ne!(b_slice, a_slice); + test_equal(&a_slice, &b_slice, false); // Test the optimization cases where null_count == 0 and starts at 0 and len >= size_of(u8) // Elements fill in `u8`'s exactly. 
let mut vector = vec![false, false, true, true, true, true, true, true]; let a = BooleanArray::from(vector.clone()); - let a = a.data(); let b = BooleanArray::from(vector.clone()); - let b = b.data(); - test_equal(a, b, true); + test_equal(&a, &b, true); // Elements fill in `u8`s + suffix bits. vector.push(true); let a = BooleanArray::from(vector.clone()); - let a = a.data(); let b = BooleanArray::from(vector); - let b = b.data(); - test_equal(a, b, true); + test_equal(&a, &b, true); } #[test] @@ -151,10 +133,8 @@ fn test_primitive() { for (lhs, rhs, expected) in cases { let lhs = Int32Array::from(lhs); - let lhs = lhs.data(); let rhs = Int32Array::from(rhs); - let rhs = rhs.data(); - test_equal(lhs, rhs, expected); + test_equal(&lhs, &rhs, expected); } } @@ -207,10 +187,8 @@ fn test_primitive_slice() { for (lhs, slice_lhs, rhs, slice_rhs, expected) in cases { let lhs = Int32Array::from(lhs); - let lhs = lhs.data(); let lhs = lhs.slice(slice_lhs.0, slice_lhs.1); let rhs = Int32Array::from(rhs); - let rhs = rhs.data(); let rhs = rhs.slice(slice_rhs.0, slice_rhs.1); test_equal(&lhs, &rhs, expected); @@ -218,7 +196,7 @@ fn test_primitive_slice() { } #[allow(clippy::eq_op)] -fn test_equal(lhs: &ArrayData, rhs: &ArrayData, expected: bool) { +fn test_equal(lhs: &dyn Array, rhs: &dyn Array, expected: bool) { // equality is symmetric assert_eq!(lhs, lhs); assert_eq!(rhs, rhs); @@ -275,10 +253,8 @@ fn test_generic_string_equal() { for (lhs, rhs, expected) in cases { let lhs: GenericStringArray = lhs.into_iter().collect(); - let lhs = lhs.data(); let rhs: GenericStringArray = rhs.into_iter().collect(); - let rhs = rhs.data(); - test_equal(lhs, rhs, expected); + test_equal(&lhs, &rhs, expected); } } @@ -305,10 +281,8 @@ fn test_generic_binary_equal() { .map(|x| x.as_deref().map(|x| x.as_bytes())) .collect(); let lhs = GenericBinaryArray::::from_opt_vec(lhs); - let lhs = lhs.data(); let rhs = GenericBinaryArray::::from_opt_vec(rhs); - let rhs = rhs.data(); - test_equal(lhs, rhs, expected); + test_equal(&lhs, &rhs, expected); } } @@ -326,32 +300,26 @@ fn test_large_binary_equal() { fn test_fixed_size_binary_array() { let a_input_arg = vec![vec![1, 2], vec![3, 4], vec![5, 6]]; let a = FixedSizeBinaryArray::try_from_iter(a_input_arg.into_iter()).unwrap(); - let a = a.data(); let b_input_arg = vec![vec![1, 2], vec![3, 4], vec![5, 6]]; let b = FixedSizeBinaryArray::try_from_iter(b_input_arg.into_iter()).unwrap(); - let b = b.data(); - test_equal(a, b, true); + test_equal(&a, &b, true); } #[test] fn test_string_offset() { let a = StringArray::from(vec![Some("a"), None, Some("b")]); - let a = a.data(); let a = a.slice(2, 1); let b = StringArray::from(vec![Some("b")]); - let b = b.data(); - test_equal(&a, b, true); + test_equal(&a, &b, true); } #[test] fn test_string_offset_larger() { let a = StringArray::from(vec![Some("a"), None, Some("b"), None, Some("c")]); - let a = a.data(); let b = StringArray::from(vec![None, Some("b"), None, Some("c")]); - let b = b.data(); test_equal(&a.slice(2, 2), &b.slice(0, 2), false); test_equal(&a.slice(2, 2), &b.slice(1, 2), true); @@ -361,17 +329,14 @@ fn test_string_offset_larger() { #[test] fn test_null() { let a = NullArray::new(2); - let a = a.data(); let b = NullArray::new(2); - let b = b.data(); - test_equal(a, b, true); + test_equal(&a, &b, true); let b = NullArray::new(1); - let b = b.data(); - test_equal(a, b, false); + test_equal(&a, &b, false); } -fn create_list_array, T: AsRef<[Option]>>(data: T) -> ArrayData { +fn create_list_array, T: AsRef<[Option]>>(data: T) 
-> ListArray { let mut builder = ListBuilder::new(Int32Builder::with_capacity(10)); for d in data.as_ref() { if let Some(v) = d { @@ -381,7 +346,7 @@ fn create_list_array, T: AsRef<[Option]>>(data: T) -> ArrayDa builder.append(false); } } - builder.finish().into_data() + builder.finish() } #[test] @@ -400,7 +365,7 @@ fn test_empty_offsets_list_equal() { let values = Int32Array::from(empty); let empty_offsets: [u8; 0] = []; - let a = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + let a: ListArray = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( "item", DataType::Int32, true, @@ -410,9 +375,10 @@ fn test_empty_offsets_list_equal() { .add_child_data(values.data().clone()) .null_bit_buffer(Some(Buffer::from(&empty_offsets))) .build() - .unwrap(); + .unwrap() + .into(); - let b = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + let b: ListArray = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( "item", DataType::Int32, true, @@ -422,11 +388,12 @@ fn test_empty_offsets_list_equal() { .add_child_data(values.data().clone()) .null_bit_buffer(Some(Buffer::from(&empty_offsets))) .build() - .unwrap(); + .unwrap() + .into(); test_equal(&a, &b, true); - let c = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + let c: ListArray = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( "item", DataType::Int32, true, @@ -440,7 +407,8 @@ fn test_empty_offsets_list_equal() { ) .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) .build() - .unwrap(); + .unwrap() + .into(); test_equal(&a, &c, true); } @@ -467,7 +435,7 @@ fn test_list_null() { // a list where the nullness of values is determined by the list's bitmap let c_values = Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]); - let c = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + let c: ListArray = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( "item", DataType::Int32, true, @@ -477,7 +445,8 @@ fn test_list_null() { .add_child_data(c_values.into_data()) .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) .build() - .unwrap(); + .unwrap() + .into(); let d_values = Int32Array::from(vec![ Some(1), @@ -489,7 +458,7 @@ fn test_list_null() { None, None, ]); - let d = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + let d: ListArray = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( "item", DataType::Int32, true, @@ -499,7 +468,8 @@ fn test_list_null() { .add_child_data(d_values.into_data()) .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) .build() - .unwrap(); + .unwrap() + .into(); test_equal(&c, &d, true); } @@ -524,7 +494,7 @@ fn test_list_offsets() { fn create_fixed_size_binary_array, T: AsRef<[Option]>>( data: T, -) -> ArrayData { +) -> FixedSizeBinaryArray { let mut builder = FixedSizeBinaryBuilder::with_capacity(data.as_ref().len(), 5); for d in data.as_ref() { @@ -534,7 +504,7 @@ fn create_fixed_size_binary_array, T: AsRef<[Option]>>( builder.append_null(); } } - builder.finish().into_data() + builder.finish() } #[test] @@ -598,12 +568,11 @@ fn test_fixed_size_binary_offsets() { test_equal(&a_slice, &b_slice, false); } -fn create_decimal_array(data: Vec>) -> ArrayData { +fn create_decimal_array(data: Vec>) -> Decimal128Array { data.into_iter() .collect::() .with_precision_and_scale(23, 6) .unwrap() - .into() } #[test] @@ -687,7 +656,7 @@ fn test_decimal_offsets() { /// Create a fixed size list of 2 value lengths fn create_fixed_size_list_array, T: AsRef<[Option]>>( data: T, -) -> ArrayData { +) -> FixedSizeListArray { let mut builder = 
FixedSizeListBuilder::new(Int32Builder::with_capacity(10), 3); for d in data.as_ref() { @@ -701,7 +670,7 @@ fn create_fixed_size_list_array, T: AsRef<[Option]>>( builder.append(false); } } - builder.finish().into_data() + builder.finish() } #[test] @@ -813,12 +782,10 @@ fn test_struct_equal() { let a = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) .unwrap(); - let a = a.data(); let b = StructArray::try_from(vec![("f1", strings), ("f2", ints)]).unwrap(); - let b = b.data(); - test_equal(a, b, true); + test_equal(&a, &b, true); } #[test] @@ -845,8 +812,8 @@ fn test_struct_equal_null() { ])) .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(ints.data_ref().clone()) + .add_child_data(strings.to_data()) + .add_child_data(ints.to_data()) .build() .unwrap(); let a = make_array(a); @@ -857,13 +824,13 @@ fn test_struct_equal_null() { ])) .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(ints_non_null.data_ref().clone()) + .add_child_data(strings.to_data()) + .add_child_data(ints_non_null.to_data()) .build() .unwrap(); let b = make_array(b); - test_equal(a.data_ref(), b.data_ref(), true); + test_equal(&a, &b, true); // test with arrays that are not equal let c_ints_non_null: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 0, 4])); @@ -873,13 +840,13 @@ fn test_struct_equal_null() { ])) .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(c_ints_non_null.data_ref().clone()) + .add_child_data(strings.to_data()) + .add_child_data(c_ints_non_null.to_data()) .build() .unwrap(); let c = make_array(c); - test_equal(a.data_ref(), c.data_ref(), false); + test_equal(&a, &c, false); // test a nested struct let a = ArrayData::builder(DataType::Struct(vec![Field::new( @@ -908,8 +875,8 @@ fn test_struct_equal_null() { ])) .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(ints_non_null.data_ref().clone()) + .add_child_data(strings.to_data()) + .add_child_data(ints_non_null.to_data()) .build() .unwrap(); @@ -925,7 +892,7 @@ fn test_struct_equal_null() { .unwrap(); let b = make_array(b); - test_equal(a.data_ref(), b.data_ref(), true); + test_equal(&a, &b, true); } #[test] @@ -970,7 +937,7 @@ fn test_struct_equal_null_variable_size() { .unwrap(); let b = make_array(b); - test_equal(a.data_ref(), b.data_ref(), true); + test_equal(&a, &b, true); // test with arrays that are not equal let strings3: ArrayRef = Arc::new(StringArray::from(vec![ @@ -987,15 +954,18 @@ fn test_struct_equal_null_variable_size() { )])) .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) .len(5) - .add_child_data(strings3.data_ref().clone()) + .add_child_data(strings3.to_data()) .build() .unwrap(); let c = make_array(c); - test_equal(a.data_ref(), c.data_ref(), false); + test_equal(&a, &c, false); } -fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { +fn create_dictionary_array( + values: &[&str], + keys: &[Option<&str>], +) -> DictionaryArray { let values = StringArray::from(values.to_vec()); let mut builder = StringDictionaryBuilder::::new_with_dictionary(keys.len(), &values) @@ -1007,7 +977,7 @@ fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData builder.append_null() } } - builder.finish().into_data() + builder.finish() } #[test] @@ -1085,36 +1055,24 
@@ fn test_dictionary_equal_null() { #[test] fn test_non_null_empty_strings() { - let s = StringArray::from(vec![Some(""), Some(""), Some("")]); - - let string1 = s.data(); - - let string2 = ArrayData::builder(DataType::Utf8) - .len(string1.len()) - .buffers(string1.buffers().to_vec()) - .build() - .unwrap(); + let s1 = StringArray::from(vec![Some(""), Some(""), Some("")]); + let data = s1.to_data().into_builder().nulls(None).build().unwrap(); + let s2 = StringArray::from(data); - // string2 is identical to string1 except that it has no validity buffer but since there - // are no nulls, string1 and string2 are equal - test_equal(string1, &string2, true); + // s2 is identical to s1 except that it has no validity buffer but since there + // are no nulls, s1 and s2 are equal + test_equal(&s1, &s2, true); } #[test] fn test_null_empty_strings() { - let s = StringArray::from(vec![Some(""), None, Some("")]); + let s1 = StringArray::from(vec![Some(""), None, Some("")]); + let data = s1.to_data().into_builder().nulls(None).build().unwrap(); + let s2 = StringArray::from(data); - let string1 = s.data(); - - let string2 = ArrayData::builder(DataType::Utf8) - .len(string1.len()) - .buffers(string1.buffers().to_vec()) - .build() - .unwrap(); - - // string2 is identical to string1 except that it has no validity buffer since string1 has - // nulls in it, string1 and string2 are not equal - test_equal(string1, &string2, false); + // s2 is identical to s1 except that it has no validity buffer since string1 has + // nulls in it, s1 and s2 are not equal + test_equal(&s1, &s2, false); } #[test] @@ -1159,9 +1117,9 @@ fn test_union_equal_dense() { builder.append::("b", 7).unwrap(); let union4 = builder.build().unwrap(); - test_equal(union1.data(), union2.data(), true); - test_equal(union1.data(), union3.data(), false); - test_equal(union1.data(), union4.data(), false); + test_equal(&union1, &union2, true); + test_equal(&union1, &union3, false); + test_equal(&union1, &union4, false); } #[test] @@ -1206,22 +1164,22 @@ fn test_union_equal_sparse() { builder.append::("b", 7).unwrap(); let union4 = builder.build().unwrap(); - test_equal(union1.data(), union2.data(), true); - test_equal(union1.data(), union3.data(), false); - test_equal(union1.data(), union4.data(), false); + test_equal(&union1, &union2, true); + test_equal(&union1, &union3, false); + test_equal(&union1, &union4, false); } #[test] fn test_boolean_slice() { let array = BooleanArray::from(vec![true; 32]); let slice = array.slice(4, 12); - assert_eq!(slice.data(), slice.data()); + assert_eq!(&slice, &slice); let slice = array.slice(8, 12); - assert_eq!(slice.data(), slice.data()); + assert_eq!(&slice, &slice); let slice = array.slice(8, 24); - assert_eq!(slice.data(), slice.data()); + assert_eq!(&slice, &slice); } #[test] @@ -1230,7 +1188,7 @@ fn test_sliced_nullable_boolean_array() { let b = BooleanArray::from(vec![true; 32]); let slice_a = a.slice(1, 12); let slice_b = b.slice(1, 12); - assert_ne!(slice_a.data(), slice_b.data()); + assert_ne!(&slice_a, &slice_b); } #[test] @@ -1333,5 +1291,5 @@ fn test_struct_equal_slice() { ]); assert_eq!(a, &b); - test_equal(a.data(), b.data(), true); + test_equal(&a, &b, true); } From 2d68ed5686a2a41d1e486b4dc562c9a19db76c07 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 23 Mar 2023 12:24:55 +0000 Subject: [PATCH 0715/1411] Add PrimitiveArray::new (#3879) (#3909) * Add PrimitiveArray::new (#3879) * Review feedback * Format --- arrow-arith/src/arity.rs | 26 
++---- arrow-array/src/array/primitive_array.rs | 101 +++++++++++++---------- arrow-buffer/src/buffer/scalar.rs | 7 ++ 3 files changed, 69 insertions(+), 65 deletions(-) diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 0a8815cc8059..782c8270cf85 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -23,25 +23,10 @@ use arrow_array::types::ArrowDictionaryKeyType; use arrow_array::*; use arrow_buffer::buffer::NullBuffer; use arrow_buffer::{Buffer, MutableBuffer}; -use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_data::ArrayData; use arrow_schema::ArrowError; use std::sync::Arc; -#[inline] -unsafe fn build_primitive_array( - len: usize, - buffer: Buffer, - nulls: Option, -) -> PrimitiveArray { - PrimitiveArray::from( - ArrayDataBuilder::new(O::DATA_TYPE) - .len(len) - .nulls(nulls) - .buffers(vec![buffer]) - .build_unchecked(), - ) -} - /// See [`PrimitiveArray::unary`] pub fn unary(array: &PrimitiveArray, op: F) -> PrimitiveArray where @@ -209,7 +194,6 @@ where "Cannot perform binary operation on arrays of different length".to_string(), )); } - let len = a.len(); if a.is_empty() { return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); @@ -224,8 +208,7 @@ where // Soundness // `values` is an iterator with a known size from a PrimitiveArray let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - - Ok(unsafe { build_primitive_array(len, buffer, nulls) }) + Ok(PrimitiveArray::new(O::DATA_TYPE, buffer.into(), nulls)) } /// Given two arrays of length `len`, calls `op(a[i], b[i])` for `i` in `0..len`, mutating @@ -328,7 +311,8 @@ where Ok::<_, ArrowError>(()) })?; - Ok(unsafe { build_primitive_array(len, buffer.finish(), Some(nulls)) }) + let values = buffer.finish().into(); + Ok(PrimitiveArray::new(O::DATA_TYPE, values, Some(nulls))) } } @@ -412,7 +396,7 @@ where buffer.push_unchecked(op(a.value_unchecked(idx), b.value_unchecked(idx))?); }; } - Ok(unsafe { build_primitive_array(len, buffer.into(), None) }) + Ok(PrimitiveArray::new(O::DATA_TYPE, buffer.into(), None)) } /// This intentional inline(never) attribute helps LLVM optimize the loop. 
diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 241e2a051197..6faecb1f0e55 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -29,7 +29,7 @@ use arrow_buffer::{ i256, ArrowNativeType, BooleanBuffer, Buffer, NullBuffer, ScalarBuffer, }; use arrow_data::bit_iterator::try_for_each_valid_idx; -use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime}; use half::f16; @@ -251,19 +251,58 @@ pub struct PrimitiveArray { /// Underlying ArrayData data: ArrayData, /// Values data - raw_values: ScalarBuffer, + values: ScalarBuffer, } impl Clone for PrimitiveArray { fn clone(&self) -> Self { Self { data: self.data.clone(), - raw_values: self.raw_values.clone(), + values: self.values.clone(), } } } impl PrimitiveArray { + /// Create a new [`PrimitiveArray`] from the provided data_type, values, nulls + /// + /// # Panics + /// + /// Panics if: + /// - `values.len() != nulls.len()` + /// - `!Self::is_compatible(data_type)` + pub fn new( + data_type: DataType, + values: ScalarBuffer, + nulls: Option, + ) -> Self { + Self::assert_compatible(&data_type); + if let Some(n) = nulls.as_ref() { + assert_eq!(values.len(), n.len()); + } + + // TODO: Don't store ArrayData inside arrays (#3880) + let data = unsafe { + ArrayData::builder(data_type) + .len(values.len()) + .nulls(nulls) + .buffers(vec![values.inner().clone()]) + .build_unchecked() + }; + + Self { data, values } + } + + /// Asserts that `data_type` is compatible with `Self` + fn assert_compatible(data_type: &DataType) { + assert!( + Self::is_compatible(data_type), + "PrimitiveArray expected data type {} got {}", + T::DATA_TYPE, + data_type + ); + } + /// Returns the length of this array. #[inline] pub fn len(&self) -> usize { @@ -278,7 +317,7 @@ impl PrimitiveArray { /// Returns the values of this array #[inline] pub fn values(&self) -> &ScalarBuffer { - &self.raw_values + &self.values } /// Returns a new primitive array builder @@ -308,7 +347,7 @@ impl PrimitiveArray { /// caller must ensure that the passed in offset is less than the array len() #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> T::Native { - *self.raw_values.get_unchecked(i) + *self.values.get_unchecked(i) } /// Returns the primitive value at index `i`. @@ -346,7 +385,7 @@ impl PrimitiveArray { pub fn from_value(value: T::Native, count: usize) -> Self { unsafe { let val_buf = Buffer::from_trusted_len_iter((0..count).map(|_| value)); - build_primitive_array(count, val_buf, None) + Self::new(T::DATA_TYPE, val_buf.into(), None) } } @@ -422,7 +461,6 @@ impl PrimitiveArray { F: Fn(T::Native) -> O::Native, { let data = self.data(); - let len = self.len(); let nulls = data.nulls().cloned(); let values = self.values().iter().map(|v| op(*v)); @@ -432,7 +470,7 @@ impl PrimitiveArray { // Soundness // `values` is an iterator with a known size because arrays are sized. let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - unsafe { build_primitive_array(len, buffer, nulls) } + PrimitiveArray::new(O::DATA_TYPE, buffer.into(), nulls) } /// Applies an unary and infallible function to a mutable primitive array. 
@@ -495,7 +533,8 @@ impl PrimitiveArray { None => (0..len).try_for_each(f)?, } - Ok(unsafe { build_primitive_array(len, buffer.finish(), nulls) }) + let values = buffer.finish().into(); + Ok(PrimitiveArray::new(O::DATA_TYPE, values, nulls)) } /// Applies an unary and fallible function to all valid values in a mutable primitive array. @@ -579,13 +618,9 @@ impl PrimitiveArray { }); let nulls = BooleanBuffer::new(null_builder.finish(), 0, len); - unsafe { - build_primitive_array( - len, - buffer.finish(), - Some(NullBuffer::new_unchecked(nulls, out_null_count)), - ) - } + let values = buffer.finish().into(); + let nulls = unsafe { NullBuffer::new_unchecked(nulls, out_null_count) }; + PrimitiveArray::new(O::DATA_TYPE, values, Some(nulls)) } /// Returns `PrimitiveBuilder` of this primitive array for mutating its values if the underlying @@ -599,7 +634,7 @@ impl PrimitiveArray { .slice_with_length(self.data.offset() * element_len, len * element_len); drop(self.data); - drop(self.raw_values); + drop(self.values); let try_mutable_null_buffer = match null_bit_buffer { None => Ok(None), @@ -647,21 +682,6 @@ impl PrimitiveArray { } } -#[inline] -unsafe fn build_primitive_array( - len: usize, - buffer: Buffer, - nulls: Option, -) -> PrimitiveArray { - PrimitiveArray::from( - ArrayDataBuilder::new(O::DATA_TYPE) - .len(len) - .buffers(vec![buffer]) - .nulls(nulls) - .build_unchecked(), - ) -} - impl From> for ArrayData { fn from(array: PrimitiveArray) -> Self { array.data @@ -1052,21 +1072,16 @@ impl PrimitiveArray { /// Constructs a `PrimitiveArray` from an array data reference. impl From for PrimitiveArray { fn from(data: ArrayData) -> Self { - assert!( - Self::is_compatible(data.data_type()), - "PrimitiveArray expected ArrayData with type {} got {}", - T::DATA_TYPE, - data.data_type() - ); + Self::assert_compatible(data.data_type()); assert_eq!( data.buffers().len(), 1, "PrimitiveArray data should contain a single buffer only (values buffer)" ); - let raw_values = + let values = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); - Self { data, raw_values } + Self { data, values } } } @@ -1833,9 +1848,7 @@ mod tests { } #[test] - #[should_panic( - expected = "PrimitiveArray expected ArrayData with type Int64 got Int32" - )] + #[should_panic(expected = "PrimitiveArray expected data type Int64 got Int32")] fn test_from_array_data_validation() { let foo = PrimitiveArray::::from_iter([1, 2, 3]); let _ = PrimitiveArray::::from(foo.into_data()); @@ -2211,7 +2224,7 @@ mod tests { #[test] #[should_panic( - expected = "PrimitiveArray expected ArrayData with type Interval(MonthDayNano) got Interval(DayTime)" + expected = "PrimitiveArray expected data type Interval(MonthDayNano) got Interval(DayTime)" )] fn test_invalid_interval_type() { let array = IntervalDayTimeArray::from(vec![1, 2, 3]); diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 4c16a736b10b..1a4680111bd1 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -17,6 +17,7 @@ use crate::buffer::Buffer; use crate::native::ArrowNativeType; +use crate::MutableBuffer; use std::fmt::Formatter; use std::marker::PhantomData; use std::ops::Deref; @@ -96,6 +97,12 @@ impl AsRef<[T]> for ScalarBuffer { } } +impl From for ScalarBuffer { + fn from(value: MutableBuffer) -> Self { + Buffer::from(value).into() + } +} + impl From for ScalarBuffer { fn from(buffer: Buffer) -> Self { let align = std::mem::align_of::(); From 1a42f4c279ef13c2970933f5de79eb3ed32ae3f9 Mon Sep 17 
00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 23 Mar 2023 12:25:05 +0000 Subject: [PATCH 0716/1411] Add iterators to BooleanBuffer and NullBuffer (#3901) * Add iterators to BooleanBuffer and NullBuffer * Clippy * Review feedback --- arrow-arith/src/aggregate.rs | 4 +--- arrow-buffer/src/buffer/boolean.rs | 25 ++++++++++++++++++++ arrow-buffer/src/buffer/null.rs | 37 ++++++++++++++++++++++++++++-- arrow-select/src/filter.rs | 3 +-- 4 files changed, 62 insertions(+), 7 deletions(-) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 8e760da21909..54f2240db558 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -22,7 +22,6 @@ use arrow_array::iterator::ArrayIter; use arrow_array::*; use arrow_buffer::ArrowNativeType; use arrow_data::bit_iterator::try_for_each_valid_idx; -use arrow_data::bit_iterator::BitIndexIterator; use arrow_schema::ArrowError; use arrow_schema::*; @@ -118,9 +117,8 @@ where .reduce(|acc, item| if cmp(&acc, &item) { item } else { acc }) } else { let nulls = array.nulls().unwrap(); - let iter = BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len()); unsafe { - let idx = iter.reduce(|acc_idx, idx| { + let idx = nulls.valid_indices().reduce(|acc_idx, idx| { let acc = array.value_unchecked(acc_idx); let item = array.value_unchecked(idx); if cmp(&acc, &item) { diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 9d5953594d5d..fea04cc79b5a 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -16,6 +16,7 @@ // under the License. use crate::bit_chunk_iterator::BitChunks; +use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator}; use crate::{bit_util, buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer}; use std::ops::{BitAnd, BitOr, Not}; @@ -164,6 +165,21 @@ impl BooleanBuffer { pub fn into_inner(self) -> Buffer { self.buffer } + + /// Returns an iterator over the bits in this [`BooleanBuffer`] + pub fn iter(&self) -> BitIterator<'_> { + self.into_iter() + } + + /// Returns an iterator over the set bit positions in this [`BooleanBuffer`] + pub fn set_indices(&self) -> BitIndexIterator<'_> { + BitIndexIterator::new(self.values(), self.offset, self.len) + } + + /// Returns a [`BitSliceIterator`] yielding contiguous ranges of set bits + pub fn set_slices(&self) -> BitSliceIterator<'_> { + BitSliceIterator::new(self.values(), self.offset, self.len) + } } impl Not for &BooleanBuffer { @@ -215,3 +231,12 @@ impl BitOr<&BooleanBuffer> for &BooleanBuffer { } } } + +impl<'a> IntoIterator for &'a BooleanBuffer { + type Item = bool; + type IntoIter = BitIterator<'a>; + + fn into_iter(self) -> Self::IntoIter { + BitIterator::new(self.values(), self.offset, self.len) + } +} diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index cbadb7f42dbf..f088e7fa62e9 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::bit_iterator::BitIndexIterator; +use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator}; use crate::buffer::BooleanBuffer; use crate::{Buffer, MutableBuffer}; @@ -114,6 +114,30 @@ impl NullBuffer { Self::new(self.buffer.slice(offset, len)) } + /// Returns an iterator over the bits in this [`NullBuffer`] + /// + /// * `true` indicates that the corresponding value is not NULL + /// * `false` indicates that the corresponding value is NULL + /// + /// Note: [`Self::valid_indices`] will be significantly faster for most use-cases + pub fn iter(&self) -> BitIterator<'_> { + self.buffer.iter() + } + + /// Returns a [`BitIndexIterator`] over the valid indices in this [`NullBuffer`] + /// + /// Valid indices indicate the corresponding value is not NULL + pub fn valid_indices(&self) -> BitIndexIterator<'_> { + self.buffer.set_indices() + } + + /// Returns a [`BitSliceIterator`] yielding contiguous ranges of valid indices + /// + /// Valid indices indicate the corresponding value is not NULL + pub fn valid_slices(&self) -> BitSliceIterator<'_> { + self.buffer.set_slices() + } + /// Calls the provided closure for each index in this null mask that is set #[inline] pub fn try_for_each_valid_idx Result<(), E>>( @@ -123,7 +147,7 @@ impl NullBuffer { if self.null_count == self.len() { return Ok(()); } - BitIndexIterator::new(self.validity(), self.offset(), self.len()).try_for_each(f) + self.valid_indices().try_for_each(f) } /// Returns the inner [`BooleanBuffer`] @@ -145,6 +169,15 @@ impl NullBuffer { } } +impl<'a> IntoIterator for &'a NullBuffer { + type Item = bool; + type IntoIter = BitIterator<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.buffer.iter() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 35c11970c0f6..784bfa02014d 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -81,8 +81,7 @@ struct IndexIterator<'a> { impl<'a> IndexIterator<'a> { fn new(filter: &'a BooleanArray, remaining: usize) -> Self { assert_eq!(filter.null_count(), 0); - let data = filter.data(); - let iter = BitIndexIterator::new(data.buffers()[0], data.offset(), data.len()); + let iter = filter.values().set_indices(); Self { remaining, iter } } } From d38f8e0a0a5af134a585686c109d022eae981574 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 23 Mar 2023 12:39:00 +0000 Subject: [PATCH 0717/1411] Add BooleanArray::new (#3879) (#3898) * Add BooleanArray::new (#3879) * Review feedback --- arrow-array/src/array/boolean_array.rs | 57 ++++++++++++++------------ arrow-buffer/src/buffer/boolean.rs | 10 ++++- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index c5775ad3b959..98de62da0912 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -19,8 +19,8 @@ use crate::array::print_long_array; use crate::builder::BooleanBuilder; use crate::iterator::BooleanIter; use crate::{Array, ArrayAccessor, ArrayRef}; -use arrow_buffer::{bit_util, BooleanBuffer, Buffer, MutableBuffer, NullBuffer}; -use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_buffer::{bit_util, BooleanBuffer, MutableBuffer, NullBuffer}; +use arrow_data::ArrayData; use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; @@ -81,6 +81,28 @@ impl std::fmt::Debug for BooleanArray { } impl BooleanArray { + /// Create a new 
[`BooleanArray`] from the provided values and nulls + /// + /// # Panics + /// + /// Panics if `values.len() != nulls.len()` + pub fn new(values: BooleanBuffer, nulls: Option) -> Self { + if let Some(n) = nulls.as_ref() { + assert_eq!(values.len(), n.len()); + } + + // TODO: Don't store ArrayData inside arrays (#3880) + let data = unsafe { + ArrayData::builder(DataType::Boolean) + .len(values.len()) + .offset(values.offset()) + .nulls(nulls) + .buffers(vec![values.inner().clone()]) + .build_unchecked() + }; + Self { data, values } + } + /// Returns the length of this array. pub fn len(&self) -> usize { self.data.len() @@ -182,24 +204,12 @@ impl BooleanArray { where F: FnMut(T::Item) -> bool, { - let null_bit_buffer = left.nulls().map(|x| x.inner().sliced()); - let buffer = MutableBuffer::collect_bool(left.len(), |i| unsafe { + let nulls = left.nulls().cloned(); + let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe { // SAFETY: i in range 0..len op(left.value_unchecked(i)) }); - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![Buffer::from(buffer)], - vec![], - ) - }; - Self::from(data) + Self::new(values, nulls) } /// Create a [`BooleanArray`] by evaluating the binary operation for @@ -229,19 +239,11 @@ impl BooleanArray { assert_eq!(left.len(), right.len()); let nulls = NullBuffer::union(left.nulls(), right.nulls()); - let buffer = MutableBuffer::collect_bool(left.len(), |i| unsafe { + let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe { // SAFETY: i in range 0..len op(left.value_unchecked(i), right.value_unchecked(i)) }); - - let data = unsafe { - ArrayDataBuilder::new(DataType::Boolean) - .len(left.len()) - .nulls(nulls) - .buffers(vec![buffer.into()]) - .build_unchecked() - }; - Self::from(data) + Self::new(values, nulls) } } @@ -393,6 +395,7 @@ impl>> FromIterator for BooleanArray #[cfg(test)] mod tests { use super::*; + use arrow_buffer::Buffer; use rand::{thread_rng, Rng}; #[test] diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index fea04cc79b5a..53ead45732d3 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -17,7 +17,9 @@ use crate::bit_chunk_iterator::BitChunks; use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator}; -use crate::{bit_util, buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer}; +use crate::{ + bit_util, buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer, +}; use std::ops::{BitAnd, BitOr, Not}; /// A slice-able [`Buffer`] containing bit-packed booleans @@ -61,6 +63,12 @@ impl BooleanBuffer { } } + /// Invokes `f` with indexes `0..len` collecting the boolean results into a new `BooleanBuffer` + pub fn collect_bool bool>(len: usize, f: F) -> Self { + let buffer = MutableBuffer::collect_bool(len, f); + Self::new(buffer.into(), 0, len) + } + /// Returns the number of set bits in this buffer pub fn count_set_bits(&self) -> usize { self.buffer.count_set_bits_offset(self.offset, self.len) From 526c57a0f65ee7aaa838f252f48c8179f7d9ce03 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 23 Mar 2023 13:45:08 +0100 Subject: [PATCH 0718/1411] Minor: add examples for `ListBuilder` and `GenericListBuilder` (#3891) * Minor: add examples for `ListBuilder` and `GenericListBuilder` * use '[' and ']' for list notation --- .../src/builder/generic_list_builder.rs | 10 ++- arrow-array/src/builder/mod.rs | 65 ++++++++++++++++++- 2 files changed, 72 insertions(+), 3 
deletions(-) diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 6228475542bd..de09694fb68f 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -24,7 +24,15 @@ use arrow_schema::Field; use std::any::Any; use std::sync::Arc; -/// Array builder for [`GenericListArray`] +/// Array builder for [`GenericListArray`]s. +/// +/// Use [`ListBuilder`] to build [`ListArray`]s and [`LargeListBuilder`] to build [`LargeListArray`]s. +/// +/// +/// [`ListBuilder`]: crate::builder::ListBuilder +/// [`ListArray`]: crate::array::ListArray +/// [`LargeListBuilder`]: crate::builder::LargeListBuilder +/// [`LargeListArray`]: crate::array::LargeListArray #[derive(Debug)] pub struct GenericListBuilder { offsets_builder: BufferBuilder, diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index fc2454635d99..df26fa35832f 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -132,9 +132,70 @@ pub trait ArrayBuilder: Any + Send { fn into_box_any(self: Box) -> Box; } -/// A list array builder with i32 offsets +/// Builder for [`ListArray`]s (i32 offsets) +/// +/// [`ListArray`]: crate::array::ListArray +/// +/// # Example +/// +/// ``` +/// # use arrow_array::builder::{StringBuilder, ListBuilder}; +/// # use arrow_array::ListArray; +/// // Build a 3 element array of lists: +/// // +/// // column +/// // --------- +/// // [one] +/// // [] +/// // [two, three] +/// +/// let mut builder = ListBuilder::new(StringBuilder::new()); +/// // [one] +/// builder.values().append_value("one"); +/// builder.append(true); +/// // [] +/// builder.append(true); +/// // [two, three] +/// builder.values().append_value("two"); +/// builder.values().append_value("three"); +/// builder.append(true); +/// +/// // Create an array +/// let list_array: ListArray = builder.finish(); +/// ``` pub type ListBuilder = GenericListBuilder; -/// A list array builder with i64 offsets + +/// Builder for [`LargeListArray`]s (i64 offsets) +/// +/// [`LargeListArray`]: crate::array::LargeListArray +/// +/// # Example +/// +/// ``` +/// # use arrow_array::builder::{StringBuilder, LargeListBuilder}; +/// # use arrow_array::LargeListArray; +/// // Build a 3 element array of lists: +/// // +/// // column +/// // --------- +/// // [one], +/// // [], +/// // [two, three] +/// +/// let mut builder = LargeListBuilder::new(StringBuilder::new()); +/// // [one] +/// builder.values().append_value("one"); +/// builder.append(true); +/// // [] +/// builder.append(true); +/// // [two, three] +/// builder.values().append_value("two"); +/// builder.values().append_value("three"); +/// builder.append(true); +/// +/// // Create an array +/// let list_array: LargeListArray = builder.finish(); +/// ``` pub type LargeListBuilder = GenericListBuilder; /// A binary array builder with i32 offsets From 7e461c3337f05bf637862fc16a115e4bafa0c281 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 23 Mar 2023 16:40:11 +0100 Subject: [PATCH 0719/1411] Support microsecond and nanosecond in interval parsing (#3916) --- arrow-cast/src/parse.rs | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 4acd2b3376be..45a255626f6b 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -751,19 +751,22 @@ const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE; #[cfg(test)] const NANOS_PER_DAY: 
f64 = 24_f64 * NANOS_PER_HOUR; +#[rustfmt::skip] #[derive(Clone, Copy)] #[repr(u16)] enum IntervalType { - Century = 0b_00_0000_0001, - Decade = 0b_00_0000_0010, - Year = 0b_00_0000_0100, - Month = 0b_00_0000_1000, - Week = 0b_00_0001_0000, - Day = 0b_00_0010_0000, - Hour = 0b_00_0100_0000, - Minute = 0b_00_1000_0000, - Second = 0b_01_0000_0000, - Millisecond = 0b_10_0000_0000, + Century = 0b_0000_0000_0001, + Decade = 0b_0000_0000_0010, + Year = 0b_0000_0000_0100, + Month = 0b_0000_0000_1000, + Week = 0b_0000_0001_0000, + Day = 0b_0000_0010_0000, + Hour = 0b_0000_0100_0000, + Minute = 0b_0000_1000_0000, + Second = 0b_0001_0000_0000, + Millisecond = 0b_0010_0000_0000, + Microsecond = 0b_0100_0000_0000, + Nanosecond = 0b_1000_0000_0000, } impl FromStr for IntervalType { @@ -781,6 +784,8 @@ impl FromStr for IntervalType { "minute" | "minutes" => Ok(Self::Minute), "second" | "seconds" => Ok(Self::Second), "millisecond" | "milliseconds" => Ok(Self::Millisecond), + "microsecond" | "microseconds" => Ok(Self::Microsecond), + "nanosecond" | "nanoseconds" => Ok(Self::Nanosecond), _ => Err(ArrowError::NotYetImplemented(format!( "Unknown interval type: {s}" ))), @@ -861,6 +866,10 @@ fn parse_interval(leading_field: &str, value: &str) -> Result { Ok((0, 0, (interval_period.mul_checked(1_000_000f64))? as i64)) } + IntervalType::Microsecond => { + Ok((0, 0, (interval_period.mul_checked(1_000f64)?) as i64)) + } + IntervalType::Nanosecond => Ok((0, 0, interval_period as i64)), } }; @@ -1629,6 +1638,16 @@ mod tests { parse_interval("months", "1 year 1 day 0.1 milliseconds").unwrap(), ); + assert_eq!( + (12i32, 1i32, 1000i64), + parse_interval("months", "1 year 1 day 1 microsecond").unwrap(), + ); + + assert_eq!( + (12i32, 1i32, 1i64), + parse_interval("months", "1 year 1 day 1 nanoseconds").unwrap(), + ); + assert_eq!( (1i32, 0i32, (-NANOS_PER_SECOND) as i64), parse_interval("months", "1 month -1 second").unwrap(), From 8b7bfa69150bbd42bf2c05797987efc03077332f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 23 Mar 2023 20:29:50 +0000 Subject: [PATCH 0720/1411] Add AsArray trait for more ergonomic downcasting (#3912) * Add AsArray trait for more ergonomic downcasting * Clippy * Review feedback --- arrow-arith/src/aggregate.rs | 8 +- arrow-arith/src/arithmetic.rs | 57 +++-- arrow-arith/src/arity.rs | 2 +- arrow-array/src/array/dictionary_array.rs | 6 +- arrow-array/src/array/map_array.rs | 4 +- arrow-array/src/array/run_array.rs | 7 +- arrow-array/src/array/union_array.rs | 16 +- .../src/builder/generic_byte_run_builder.rs | 16 +- .../src/builder/generic_list_builder.rs | 5 +- .../src/builder/primitive_run_builder.rs | 10 +- arrow-array/src/cast.rs | 159 ++++++++++++++ arrow-array/src/lib.rs | 8 +- arrow-cast/src/cast.rs | 198 +++++++++--------- arrow-cast/src/display.rs | 8 +- arrow-csv/src/reader/mod.rs | 6 +- arrow-ipc/src/writer.rs | 4 +- arrow-json/src/raw/mod.rs | 83 ++++---- arrow-json/src/reader.rs | 38 ++-- arrow-json/src/writer.rs | 16 +- arrow-ord/src/comparison.rs | 136 ++++-------- arrow-ord/src/sort.rs | 6 +- arrow-row/src/dictionary.rs | 4 +- arrow-row/src/lib.rs | 16 +- arrow-select/src/filter.rs | 10 +- arrow-select/src/interleave.rs | 10 +- arrow-select/src/nullif.rs | 18 +- arrow-select/src/take.rs | 26 +-- arrow-string/src/length.rs | 10 +- arrow-string/src/like.rs | 12 +- arrow/src/lib.rs | 23 +- arrow/src/util/data_gen.rs | 6 +- parquet/src/arrow/arrow_writer/mod.rs | 8 +- parquet/src/arrow/async_reader/mod.rs | 12 +- 33 files 
changed, 515 insertions(+), 433 deletions(-) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 54f2240db558..9e9d9333fdcb 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -1219,7 +1219,7 @@ mod tests { .into_iter() .collect(); let sliced_input = sliced_input.slice(4, 2); - let sliced_input = as_primitive_array::(&sliced_input); + let sliced_input = sliced_input.as_primitive::(); assert_eq!(sliced_input, &input); @@ -1242,7 +1242,7 @@ mod tests { .into_iter() .collect(); let sliced_input = sliced_input.slice(4, 2); - let sliced_input = as_boolean_array(&sliced_input); + let sliced_input = sliced_input.as_boolean(); assert_eq!(sliced_input, &input); @@ -1265,7 +1265,7 @@ mod tests { .into_iter() .collect(); let sliced_input = sliced_input.slice(4, 2); - let sliced_input = as_string_array(&sliced_input); + let sliced_input = sliced_input.as_string::(); assert_eq!(sliced_input, &input); @@ -1288,7 +1288,7 @@ mod tests { .into_iter() .collect(); let sliced_input = sliced_input.slice(4, 2); - let sliced_input = as_generic_binary_array::(&sliced_input); + let sliced_input = sliced_input.as_binary::(); assert_eq!(sliced_input, &input); diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 8e2b7915357a..de4b0ccb8858 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -728,20 +728,20 @@ pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result { - let l = as_primitive_array::(left); + let l = left.as_primitive::(); match right.data_type() { DataType::Interval(IntervalUnit::YearMonth) => { - let r = as_primitive_array::(right); + let r = right.as_primitive::(); let res = math_op(l, r, Date32Type::add_year_months)?; Ok(Arc::new(res)) } DataType::Interval(IntervalUnit::DayTime) => { - let r = as_primitive_array::(right); + let r = right.as_primitive::(); let res = math_op(l, r, Date32Type::add_day_time)?; Ok(Arc::new(res)) } DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = as_primitive_array::(right); + let r = right.as_primitive::(); let res = math_op(l, r, Date32Type::add_month_day_nano)?; Ok(Arc::new(res)) } @@ -752,20 +752,20 @@ pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result { - let l = as_primitive_array::(left); + let l = left.as_primitive::(); match right.data_type() { DataType::Interval(IntervalUnit::YearMonth) => { - let r = as_primitive_array::(right); + let r = right.as_primitive::(); let res = math_op(l, r, Date64Type::add_year_months)?; Ok(Arc::new(res)) } DataType::Interval(IntervalUnit::DayTime) => { - let r = as_primitive_array::(right); + let r = right.as_primitive::(); let res = math_op(l, r, Date64Type::add_day_time)?; Ok(Arc::new(res)) } DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = as_primitive_array::(right); + let r = right.as_primitive::(); let res = math_op(l, r, Date64Type::add_month_day_nano)?; Ok(Arc::new(res)) } @@ -808,20 +808,20 @@ pub fn add_dyn_checked( ) } DataType::Date32 => { - let l = as_primitive_array::(left); + let l = left.as_primitive::(); match right.data_type() { DataType::Interval(IntervalUnit::YearMonth) => { - let r = as_primitive_array::(right); + let r = right.as_primitive::(); let res = math_op(l, r, Date32Type::add_year_months)?; Ok(Arc::new(res)) } DataType::Interval(IntervalUnit::DayTime) => { - let r = as_primitive_array::(right); + let r = right.as_primitive::(); let res = math_op(l, r, Date32Type::add_day_time)?; Ok(Arc::new(res)) } DataType::Interval(IntervalUnit::MonthDayNano) => { - 
let r = as_primitive_array::(right); + let r = right.as_primitive::(); let res = math_op(l, r, Date32Type::add_month_day_nano)?; Ok(Arc::new(res)) } @@ -832,20 +832,20 @@ pub fn add_dyn_checked( } } DataType::Date64 => { - let l = as_primitive_array::(left); + let l = left.as_primitive::(); match right.data_type() { DataType::Interval(IntervalUnit::YearMonth) => { - let r = as_primitive_array::(right); + let r = right.as_primitive::(); let res = math_op(l, r, Date64Type::add_year_months)?; Ok(Arc::new(res)) } DataType::Interval(IntervalUnit::DayTime) => { - let r = as_primitive_array::(right); + let r = right.as_primitive::(); let res = math_op(l, r, Date64Type::add_day_time)?; Ok(Arc::new(res)) } DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = as_primitive_array::(right); + let r = right.as_primitive::(); let res = math_op(l, r, Date64Type::add_month_day_nano)?; Ok(Arc::new(res)) } @@ -2079,8 +2079,7 @@ mod tests { fn test_primitive_array_add_scalar_sliced() { let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); let a = a.slice(1, 4); - let a = as_primitive_array(&a); - let actual = add_scalar(a, 3).unwrap(); + let actual = add_scalar(a.as_primitive(), 3).unwrap(); let expected = Int32Array::from(vec![None, Some(12), Some(11), None]); assert_eq!(actual, expected); } @@ -2110,8 +2109,7 @@ mod tests { fn test_primitive_array_subtract_scalar_sliced() { let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); let a = a.slice(1, 4); - let a = as_primitive_array(&a); - let actual = subtract_scalar(a, 3).unwrap(); + let actual = subtract_scalar(a.as_primitive(), 3).unwrap(); let expected = Int32Array::from(vec![None, Some(6), Some(5), None]); assert_eq!(actual, expected); } @@ -2141,8 +2139,7 @@ mod tests { fn test_primitive_array_multiply_scalar_sliced() { let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); let a = a.slice(1, 4); - let a = as_primitive_array(&a); - let actual = multiply_scalar(a, 3).unwrap(); + let actual = multiply_scalar(a.as_primitive(), 3).unwrap(); let expected = Int32Array::from(vec![None, Some(27), Some(24), None]); assert_eq!(actual, expected); } @@ -2171,7 +2168,7 @@ mod tests { assert_eq!(0, c.value(4)); let c = modulus_dyn(&a, &b).unwrap(); - let c = as_primitive_array::(&c); + let c = c.as_primitive::(); assert_eq!(0, c.value(0)); assert_eq!(3, c.value(1)); assert_eq!(0, c.value(2)); @@ -2262,8 +2259,7 @@ mod tests { fn test_primitive_array_divide_scalar_sliced() { let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); let a = a.slice(1, 4); - let a = as_primitive_array(&a); - let actual = divide_scalar(a, 3).unwrap(); + let actual = divide_scalar(a.as_primitive(), 3).unwrap(); let expected = Int32Array::from(vec![None, Some(3), Some(2), None]); assert_eq!(actual, expected); } @@ -2277,7 +2273,7 @@ mod tests { assert_eq!(c, expected); let c = modulus_scalar_dyn::(&a, b).unwrap(); - let c = as_primitive_array::(&c); + let c = c.as_primitive::(); let expected = Int32Array::from(vec![0, 2, 0, 2, 1]); assert_eq!(c, &expected); } @@ -2286,13 +2282,13 @@ mod tests { fn test_int_array_modulus_scalar_sliced() { let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); let a = a.slice(1, 4); - let a = as_primitive_array(&a); + let a = a.as_primitive(); let actual = modulus_scalar(a, 3).unwrap(); let expected = Int32Array::from(vec![None, Some(0), Some(2), None]); assert_eq!(actual, expected); let actual = modulus_scalar_dyn::(a, 3).unwrap(); - let actual = 
as_primitive_array::(&actual); + let actual = actual.as_primitive::(); let expected = Int32Array::from(vec![None, Some(0), Some(2), None]); assert_eq!(actual, &expected); } @@ -2313,7 +2309,7 @@ mod tests { assert_eq!(0, result.value(0)); let result = modulus_scalar_dyn::(&a, -1).unwrap(); - let result = as_primitive_array::(&result); + let result = result.as_primitive::(); assert_eq!(0, result.value(0)); } @@ -3295,7 +3291,8 @@ mod tests { .unwrap(); let result = add_scalar_dyn::(&a, 1).unwrap(); - let result = as_primitive_array::(&result) + let result = result + .as_primitive::() .clone() .with_precision_and_scale(38, 2) .unwrap(); diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 782c8270cf85..501a240f37d5 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -507,7 +507,7 @@ mod tests { let input = Float64Array::from(vec![Some(5.1f64), None, Some(6.8), None, Some(7.2)]); let input_slice = input.slice(1, 4); - let input_slice: &Float64Array = as_primitive_array(&input_slice); + let input_slice: &Float64Array = input_slice.as_primitive(); let result = unary(input_slice, |n| n.round()); assert_eq!( result, diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index ee58a485c71c..0862230a499e 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -16,7 +16,7 @@ // under the License. use crate::builder::{PrimitiveDictionaryBuilder, StringDictionaryBuilder}; -use crate::cast::as_primitive_array; +use crate::cast::AsArray; use crate::iterator::ArrayIter; use crate::types::*; use crate::{ @@ -410,8 +410,8 @@ impl DictionaryArray { return Err(self); } - let key_array = as_primitive_array::(self.keys()).clone(); - let value_array = as_primitive_array::(self.values()).clone(); + let key_array = self.keys().clone(); + let value_array = self.values().as_primitive::().clone(); drop(self.data); drop(self.keys); diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 6cd627cbd838..c9651f0b2019 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -253,7 +253,7 @@ impl std::fmt::Debug for MapArray { #[cfg(test)] mod tests { - use crate::cast::as_primitive_array; + use crate::cast::AsArray; use crate::types::UInt32Type; use crate::{Int32Array, UInt32Array}; use std::sync::Arc; @@ -522,7 +522,7 @@ mod tests { assert_eq!( &values_data, - as_primitive_array::(map_array.values()) + map_array.values().as_primitive::() ); assert_eq!(&DataType::UInt32, map_array.value_type()); assert_eq!(3, map_array.len()); diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index f62da38fb241..3aefb53b83f6 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -555,7 +555,7 @@ mod tests { use super::*; use crate::builder::PrimitiveRunBuilder; - use crate::cast::as_primitive_array; + use crate::cast::AsArray; use crate::types::{Int16Type, Int32Type, Int8Type, UInt32Type}; use crate::{Array, Int32Array, StringArray}; @@ -877,8 +877,7 @@ mod tests { builder.extend(input_array.clone().into_iter()); let run_array = builder.finish(); - let physical_values_array = - as_primitive_array::(run_array.values()); + let physical_values_array = run_array.values().as_primitive::(); // create an array consisting of all the indices repeated twice and shuffled. 
let mut logical_indices: Vec = (0_u32..(logical_len as u32)).collect(); @@ -913,7 +912,7 @@ mod tests { PrimitiveRunBuilder::::with_capacity(input_array.len()); builder.extend(input_array.iter().copied()); let run_array = builder.finish(); - let physical_values_array = as_primitive_array::(run_array.values()); + let physical_values_array = run_array.values().as_primitive::(); // test for all slice lengths. for slice_len in 1..=total_len { diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 5a4d2af7ca45..fe227226f77d 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -398,7 +398,7 @@ mod tests { use super::*; use crate::builder::UnionBuilder; - use crate::cast::{as_primitive_array, as_string_array}; + use crate::cast::AsArray; use crate::types::{Float32Type, Float64Type, Int32Type, Int64Type}; use crate::RecordBatch; use crate::{Float64Array, Int32Array, Int64Array, StringArray}; @@ -1078,36 +1078,36 @@ mod tests { let v = array.value(0); assert_eq!(v.data_type(), &DataType::Int32); assert_eq!(v.len(), 1); - assert_eq!(as_primitive_array::(v.as_ref()).value(0), 5); + assert_eq!(v.as_primitive::().value(0), 5); let v = array.value(1); assert_eq!(v.data_type(), &DataType::Utf8); assert_eq!(v.len(), 1); - assert_eq!(as_string_array(v.as_ref()).value(0), "foo"); + assert_eq!(v.as_string::().value(0), "foo"); let v = array.value(2); assert_eq!(v.data_type(), &DataType::Int32); assert_eq!(v.len(), 1); - assert_eq!(as_primitive_array::(v.as_ref()).value(0), 6); + assert_eq!(v.as_primitive::().value(0), 6); let v = array.value(3); assert_eq!(v.data_type(), &DataType::Utf8); assert_eq!(v.len(), 1); - assert_eq!(as_string_array(v.as_ref()).value(0), "bar"); + assert_eq!(v.as_string::().value(0), "bar"); let v = array.value(4); assert_eq!(v.data_type(), &DataType::Float64); assert_eq!(v.len(), 1); - assert_eq!(as_primitive_array::(v.as_ref()).value(0), 10.0); + assert_eq!(v.as_primitive::().value(0), 10.0); let v = array.value(5); assert_eq!(v.data_type(), &DataType::Int32); assert_eq!(v.len(), 1); - assert_eq!(as_primitive_array::(v.as_ref()).value(0), 4); + assert_eq!(v.as_primitive::().value(0), 4); let v = array.value(6); assert_eq!(v.data_type(), &DataType::Utf8); assert_eq!(v.len(), 1); - assert_eq!(as_string_array(v.as_ref()).value(0), "baz"); + assert_eq!(v.as_string::().value(0), "baz"); } } diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index 5c15b1544ed3..9c26d7be6904 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -40,7 +40,7 @@ use arrow_buffer::ArrowNativeType; /// # use arrow_array::{GenericByteArray, BinaryArray}; /// # use arrow_array::types::{BinaryType, Int16Type}; /// # use arrow_array::{Array, Int16Array}; -/// # use arrow_array::cast::as_generic_binary_array; +/// # use arrow_array::cast::AsArray; /// /// let mut builder = /// GenericByteRunBuilder::::new(); @@ -59,7 +59,7 @@ use arrow_buffer::ArrowNativeType; /// assert!(av.is_null(3)); /// /// // Values are polymorphic and so require a downcast. 
-/// let ava: &BinaryArray = as_generic_binary_array(av.as_ref()); +/// let ava: &BinaryArray = av.as_binary(); /// /// assert_eq!(ava.value(0), b"abc"); /// assert_eq!(ava.value(2), b"def"); @@ -318,7 +318,7 @@ where /// # use arrow_array::builder::StringRunBuilder; /// # use arrow_array::{Int16Array, StringArray}; /// # use arrow_array::types::Int16Type; -/// # use arrow_array::cast::as_string_array; +/// # use arrow_array::cast::AsArray; /// /// let mut builder = StringRunBuilder::::new(); /// @@ -332,7 +332,7 @@ where /// /// // Values are polymorphic and so require a downcast. /// let av = array.values(); -/// let ava: &StringArray = as_string_array(av.as_ref()); +/// let ava: &StringArray = av.as_string::(); /// /// assert_eq!(ava.value(0), "abc"); /// assert!(av.is_null(1)); @@ -353,8 +353,8 @@ pub type LargeStringRunBuilder = GenericByteRunBuilder; /// /// # use arrow_array::builder::BinaryRunBuilder; /// # use arrow_array::{BinaryArray, Int16Array}; +/// # use arrow_array::cast::AsArray; /// # use arrow_array::types::Int16Type; -/// # use arrow_array::cast::as_generic_binary_array; /// /// let mut builder = BinaryRunBuilder::::new(); /// @@ -368,7 +368,7 @@ pub type LargeStringRunBuilder = GenericByteRunBuilder; /// /// // Values are polymorphic and so require a downcast. /// let av = array.values(); -/// let ava: &BinaryArray = as_generic_binary_array::(av.as_ref()); +/// let ava: &BinaryArray = av.as_binary(); /// /// assert_eq!(ava.value(0), b"abc"); /// assert!(av.is_null(1)); @@ -387,7 +387,7 @@ mod tests { use super::*; use crate::array::Array; - use crate::cast::as_string_array; + use crate::cast::AsArray; use crate::types::{Int16Type, Int32Type}; use crate::GenericByteArray; use crate::Int16RunArray; @@ -518,7 +518,7 @@ mod tests { assert_eq!(array.len(), 10); assert_eq!(array.run_ends().values(), &[3, 5, 8, 10]); - let str_array = as_string_array(array.values().as_ref()); + let str_array = array.values().as_string::(); assert_eq!(str_array.value(0), "a"); assert_eq!(str_array.value(1), ""); assert_eq!(str_array.value(2), "b"); diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index de09694fb68f..f390b3c15da2 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -214,7 +214,7 @@ where mod tests { use super::*; use crate::builder::{Int32Builder, ListBuilder}; - use crate::cast::as_primitive_array; + use crate::cast::AsArray; use crate::types::Int32Type; use crate::{Array, Int32Array}; use arrow_buffer::Buffer; @@ -413,8 +413,7 @@ mod tests { assert_eq!(array.value_offsets(), [0, 4, 4, 6, 6]); assert_eq!(array.null_count(), 1); assert!(array.is_null(3)); - let a_values = array.values(); - let elements = as_primitive_array::(a_values.as_ref()); + let elements = array.values().as_primitive::(); assert_eq!(elements.values(), &[1, 2, 7, 0, 4, 5]); assert_eq!(elements.null_count(), 1); assert!(elements.is_null(3)); diff --git a/arrow-array/src/builder/primitive_run_builder.rs b/arrow-array/src/builder/primitive_run_builder.rs index e7c822ee6b19..30750b6f3421 100644 --- a/arrow-array/src/builder/primitive_run_builder.rs +++ b/arrow-array/src/builder/primitive_run_builder.rs @@ -30,7 +30,7 @@ use arrow_buffer::ArrowNativeType; /// ``` /// /// # use arrow_array::builder::PrimitiveRunBuilder; -/// # use arrow_array::cast::as_primitive_array; +/// # use arrow_array::cast::AsArray; /// # use arrow_array::types::{UInt32Type, Int16Type}; /// # use 
arrow_array::{Array, UInt32Array, Int16Array}; /// @@ -53,7 +53,7 @@ use arrow_buffer::ArrowNativeType; /// assert!(!av.is_null(2)); /// /// // Values are polymorphic and so require a downcast. -/// let ava: &UInt32Array = as_primitive_array::(av.as_ref()); +/// let ava: &UInt32Array = av.as_primitive::(); /// /// assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); /// ``` @@ -265,7 +265,7 @@ where #[cfg(test)] mod tests { use crate::builder::PrimitiveRunBuilder; - use crate::cast::as_primitive_array; + use crate::cast::AsArray; use crate::types::{Int16Type, UInt32Type}; use crate::{Array, UInt32Array}; @@ -293,7 +293,7 @@ mod tests { assert!(!av.is_null(2)); // Values are polymorphic and so require a downcast. - let ava: &UInt32Array = as_primitive_array::(av.as_ref()); + let ava: &UInt32Array = av.as_primitive::(); assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None, Some(5678)])); } @@ -309,7 +309,7 @@ mod tests { assert_eq!(array.null_count(), 0); assert_eq!(array.run_ends().values(), &[1, 3, 5, 9, 10, 11]); assert_eq!( - as_primitive_array::(array.values().as_ref()).values(), + array.values().as_primitive::().values(), &[1, 2, 5, 4, 6, 2] ); } diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 81d250cafffe..a39ff88c6bcd 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -709,6 +709,165 @@ where T::from(array.to_data()) } +mod private { + pub trait Sealed {} +} + +/// An extension trait for `dyn Array` that provides ergonomic downcasting +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, Int32Array}; +/// # use arrow_array::cast::AsArray; +/// # use arrow_array::types::Int32Type; +/// let col = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef; +/// assert_eq!(col.as_primitive::().values(), &[1, 2, 3]); +/// ``` +pub trait AsArray: private::Sealed { + /// Downcast this to a [`BooleanArray`] returning `None` if not possible + fn as_boolean_opt(&self) -> Option<&BooleanArray>; + + /// Downcast this to a [`BooleanArray`] panicking if not possible + fn as_boolean(&self) -> &BooleanArray { + self.as_boolean_opt().expect("boolean array") + } + + /// Downcast this to a [`PrimitiveArray`] returning `None` if not possible + fn as_primitive_opt(&self) -> Option<&PrimitiveArray>; + + /// Downcast this to a [`PrimitiveArray`] panicking if not possible + fn as_primitive(&self) -> &PrimitiveArray { + self.as_primitive_opt().expect("primitive array") + } + + /// Downcast this to a [`GenericByteArray`] returning `None` if not possible + fn as_bytes_opt(&self) -> Option<&GenericByteArray>; + + /// Downcast this to a [`GenericByteArray`] panicking if not possible + fn as_bytes(&self) -> &GenericByteArray { + self.as_bytes_opt().expect("byte array") + } + + /// Downcast this to a [`GenericStringArray`] returning `None` if not possible + fn as_string_opt(&self) -> Option<&GenericStringArray> { + self.as_bytes_opt() + } + + /// Downcast this to a [`GenericStringArray`] panicking if not possible + fn as_string(&self) -> &GenericStringArray { + self.as_bytes_opt().expect("string array") + } + + /// Downcast this to a [`GenericBinaryArray`] returning `None` if not possible + fn as_binary_opt(&self) -> Option<&GenericBinaryArray> { + self.as_bytes_opt() + } + + /// Downcast this to a [`GenericBinaryArray`] panicking if not possible + fn as_binary(&self) -> &GenericBinaryArray { + self.as_bytes_opt().expect("binary array") + } + + /// Downcast this to a [`StructArray`] returning `None` if not possible + fn 
as_struct_opt(&self) -> Option<&StructArray>; + + /// Downcast this to a [`StructArray`] panicking if not possible + fn as_struct(&self) -> &StructArray { + self.as_struct_opt().expect("struct array") + } + + /// Downcast this to a [`GenericListArray`] returning `None` if not possible + fn as_list_opt(&self) -> Option<&GenericListArray>; + + /// Downcast this to a [`GenericListArray`] panicking if not possible + fn as_list(&self) -> &GenericListArray { + self.as_list_opt().expect("list array") + } + + /// Downcast this to a [`MapArray`] returning `None` if not possible + fn as_map_opt(&self) -> Option<&MapArray>; + + /// Downcast this to a [`MapArray`] panicking if not possible + fn as_map(&self) -> &MapArray { + self.as_map_opt().expect("map array") + } + + /// Downcast this to a [`DictionaryArray`] returning `None` if not possible + fn as_dictionary_opt(&self) + -> Option<&DictionaryArray>; + + /// Downcast this to a [`DictionaryArray`] panicking if not possible + fn as_dictionary(&self) -> &DictionaryArray { + self.as_dictionary_opt().expect("dictionary array") + } +} + +impl private::Sealed for dyn Array + '_ {} +impl AsArray for dyn Array + '_ { + fn as_boolean_opt(&self) -> Option<&BooleanArray> { + self.as_any().downcast_ref() + } + + fn as_primitive_opt(&self) -> Option<&PrimitiveArray> { + self.as_any().downcast_ref() + } + + fn as_bytes_opt(&self) -> Option<&GenericByteArray> { + self.as_any().downcast_ref() + } + + fn as_struct_opt(&self) -> Option<&StructArray> { + self.as_any().downcast_ref() + } + + fn as_list_opt(&self) -> Option<&GenericListArray> { + self.as_any().downcast_ref() + } + + fn as_map_opt(&self) -> Option<&MapArray> { + self.as_any().downcast_ref() + } + + fn as_dictionary_opt( + &self, + ) -> Option<&DictionaryArray> { + self.as_any().downcast_ref() + } +} + +impl private::Sealed for ArrayRef {} +impl AsArray for ArrayRef { + fn as_boolean_opt(&self) -> Option<&BooleanArray> { + self.as_ref().as_boolean_opt() + } + + fn as_primitive_opt(&self) -> Option<&PrimitiveArray> { + self.as_ref().as_primitive_opt() + } + + fn as_bytes_opt(&self) -> Option<&GenericByteArray> { + self.as_ref().as_bytes_opt() + } + + fn as_struct_opt(&self) -> Option<&StructArray> { + self.as_ref().as_struct_opt() + } + + fn as_list_opt(&self) -> Option<&GenericListArray> { + self.as_ref().as_list_opt() + } + + fn as_map_opt(&self) -> Option<&MapArray> { + self.as_any().downcast_ref() + } + + fn as_dictionary_opt( + &self, + ) -> Option<&DictionaryArray> { + self.as_ref().as_dictionary_opt() + } +} + #[cfg(test)] mod tests { use arrow_buffer::i256; diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index ada59564bf0e..ff1ddb1f67ce 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -43,17 +43,15 @@ //! } //! ``` //! -//! Additionally, there are convenient functions to do this casting -//! such as [`cast::as_primitive_array`] and [`cast::as_string_array`]: +//! The [`cast::AsArray`] extension trait can make this more ergonomic //! //! ``` //! # use arrow_array::Array; -//! # use arrow_array::cast::as_primitive_array; +//! # use arrow_array::cast::{AsArray, as_primitive_array}; //! # use arrow_array::types::Float32Type; //! //! fn as_f32_slice(array: &dyn Array) -> &[f32] { -//! // use as_primtive_array -//! as_primitive_array::(array).values() +//! array.as_primitive::().values() //! } //! 
``` diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 43048c2aba45..ba909649da3a 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -445,9 +445,7 @@ fn cast_reinterpret_arrays< >( array: &dyn Array, ) -> Result { - Ok(Arc::new( - as_primitive_array::(array).reinterpret_cast::(), - )) + Ok(Arc::new(array.as_primitive::().reinterpret_cast::())) } fn cast_decimal_to_integer( @@ -716,7 +714,7 @@ pub fn cast_with_options( } (Decimal128(_, s1), Decimal128(p2, s2)) => { cast_decimal_to_decimal::( - as_primitive_array(array), + array.as_primitive(), *s1, *p2, *s2, @@ -725,7 +723,7 @@ pub fn cast_with_options( } (Decimal256(_, s1), Decimal256(p2, s2)) => { cast_decimal_to_decimal::( - as_primitive_array(array), + array.as_primitive(), *s1, *p2, *s2, @@ -734,7 +732,7 @@ pub fn cast_with_options( } (Decimal128(_, s1), Decimal256(p2, s2)) => { cast_decimal_to_decimal::( - as_primitive_array(array), + array.as_primitive(), *s1, *p2, *s2, @@ -743,7 +741,7 @@ pub fn cast_with_options( } (Decimal256(_, s1), Decimal128(p2, s2)) => { cast_decimal_to_decimal::( - as_primitive_array(array), + array.as_primitive(), *s1, *p2, *s2, @@ -888,69 +886,69 @@ pub fn cast_with_options( // cast data to decimal match from_type { UInt8 => cast_integer_to_decimal::<_, Decimal128Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, 10_i128, cast_options, ), UInt16 => cast_integer_to_decimal::<_, Decimal128Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, 10_i128, cast_options, ), UInt32 => cast_integer_to_decimal::<_, Decimal128Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, 10_i128, cast_options, ), UInt64 => cast_integer_to_decimal::<_, Decimal128Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, 10_i128, cast_options, ), Int8 => cast_integer_to_decimal::<_, Decimal128Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, 10_i128, cast_options, ), Int16 => cast_integer_to_decimal::<_, Decimal128Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, 10_i128, cast_options, ), Int32 => cast_integer_to_decimal::<_, Decimal128Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, 10_i128, cast_options, ), Int64 => cast_integer_to_decimal::<_, Decimal128Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, 10_i128, cast_options, ), Float32 => cast_floating_point_to_decimal128( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, cast_options, ), Float64 => cast_floating_point_to_decimal128( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, cast_options, @@ -977,69 +975,69 @@ pub fn cast_with_options( // cast data to decimal match from_type { UInt8 => cast_integer_to_decimal::<_, Decimal256Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, i256::from_i128(10_i128), cast_options, ), UInt16 => cast_integer_to_decimal::<_, Decimal256Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, i256::from_i128(10_i128), cast_options, ), UInt32 => cast_integer_to_decimal::<_, Decimal256Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, i256::from_i128(10_i128), cast_options, ), UInt64 => cast_integer_to_decimal::<_, Decimal256Type, _>( - as_primitive_array::(array), + 
array.as_primitive::(), *precision, *scale, i256::from_i128(10_i128), cast_options, ), Int8 => cast_integer_to_decimal::<_, Decimal256Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, i256::from_i128(10_i128), cast_options, ), Int16 => cast_integer_to_decimal::<_, Decimal256Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, i256::from_i128(10_i128), cast_options, ), Int32 => cast_integer_to_decimal::<_, Decimal256Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, i256::from_i128(10_i128), cast_options, ), Int64 => cast_integer_to_decimal::<_, Decimal256Type, _>( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, i256::from_i128(10_i128), cast_options, ), Float32 => cast_floating_point_to_decimal256( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, cast_options, ), Float64 => cast_floating_point_to_decimal256( - as_primitive_array::(array), + array.as_primitive::(), *precision, *scale, cast_options, @@ -1133,9 +1131,9 @@ pub fn cast_with_options( Float64 => cast_string_to_numeric::(array, cast_options), Date32 => cast_string_to_date32::(array, cast_options), Date64 => cast_string_to_date64::(array, cast_options), - Binary => Ok(Arc::new(BinaryArray::from(as_string_array(array).clone()))), + Binary => Ok(Arc::new(BinaryArray::from(array.as_string::().clone()))), LargeBinary => { - let binary = BinaryArray::from(as_string_array(array).clone()); + let binary = BinaryArray::from(array.as_string::().clone()); cast_byte_container::(&binary) } LargeUtf8 => cast_byte_container::(array), @@ -1192,11 +1190,11 @@ pub fn cast_with_options( Utf8 => cast_byte_container::(array), Binary => { let large_binary = - LargeBinaryArray::from(as_largestring_array(array).clone()); + LargeBinaryArray::from(array.as_string::().clone()); cast_byte_container::(&large_binary) } LargeBinary => Ok(Arc::new(LargeBinaryArray::from( - as_largestring_array(array).clone(), + array.as_string::().clone(), ))), Time32(TimeUnit::Second) => { cast_string_to_time32second::(array, cast_options) @@ -1580,71 +1578,71 @@ pub fn cast_with_options( cast_reinterpret_arrays::(array) } (Date32, Date64) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Date64Type>(|x| x as i64 * MILLISECONDS_IN_DAY), )), (Date64, Date32) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Date32Type>(|x| (x / MILLISECONDS_IN_DAY) as i32), )), (Time32(TimeUnit::Second), Time32(TimeUnit::Millisecond)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Time32MillisecondType>(|x| x * MILLISECONDS as i32), )), (Time32(TimeUnit::Second), Time64(TimeUnit::Microsecond)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Time64MicrosecondType>(|x| x as i64 * MICROSECONDS), )), (Time32(TimeUnit::Second), Time64(TimeUnit::Nanosecond)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Time64NanosecondType>(|x| x as i64 * NANOSECONDS), )), (Time32(TimeUnit::Millisecond), Time32(TimeUnit::Second)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Time32SecondType>(|x| x / MILLISECONDS as i32), )), (Time32(TimeUnit::Millisecond), Time64(TimeUnit::Microsecond)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Time64MicrosecondType>(|x| { x as i64 * (MICROSECONDS / MILLISECONDS) }), )), 
(Time32(TimeUnit::Millisecond), Time64(TimeUnit::Nanosecond)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Time64NanosecondType>(|x| { x as i64 * (MICROSECONDS / NANOSECONDS) }), )), (Time64(TimeUnit::Microsecond), Time32(TimeUnit::Second)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Time32SecondType>(|x| (x / MICROSECONDS) as i32), )), (Time64(TimeUnit::Microsecond), Time32(TimeUnit::Millisecond)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Time32MillisecondType>(|x| { (x / (MICROSECONDS / MILLISECONDS)) as i32 }), )), (Time64(TimeUnit::Microsecond), Time64(TimeUnit::Nanosecond)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Time64NanosecondType>(|x| x * (NANOSECONDS / MICROSECONDS)), )), (Time64(TimeUnit::Nanosecond), Time32(TimeUnit::Second)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Time32SecondType>(|x| (x / NANOSECONDS) as i32), )), (Time64(TimeUnit::Nanosecond), Time32(TimeUnit::Millisecond)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Time32MillisecondType>(|x| { (x / (NANOSECONDS / MILLISECONDS)) as i32 }), )), (Time64(TimeUnit::Nanosecond), Time64(TimeUnit::Microsecond)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Time64MicrosecondType>(|x| x / (NANOSECONDS / MICROSECONDS)), )), @@ -1662,14 +1660,14 @@ pub fn cast_with_options( } (Int64, Timestamp(unit, tz)) => Ok(make_timestamp_array( - as_primitive_array(array), + array.as_primitive(), unit.clone(), tz.clone(), )), (Timestamp(from_unit, _), Timestamp(to_unit, to_tz)) => { let array = cast_with_options(array, &Int64, cast_options)?; - let time_array = as_primitive_array::(array.as_ref()); + let time_array = array.as_primitive::(); let from_size = time_unit_multiple(from_unit); let to_size = time_unit_multiple(to_unit); // we either divide or multiply, depending on size of each unit @@ -1697,7 +1695,7 @@ pub fn cast_with_options( } (Timestamp(from_unit, _), Date32) => { let array = cast_with_options(array, &Int64, cast_options)?; - let time_array = as_primitive_array::(array.as_ref()); + let time_array = array.as_primitive::(); let from_size = time_unit_multiple(from_unit) * SECONDS_IN_DAY; let mut b = Date32Builder::with_capacity(array.len()); @@ -1716,13 +1714,13 @@ pub fn cast_with_options( match cast_options.safe { true => { // change error to None - as_primitive_array::(array) + array.as_primitive::() .unary_opt::<_, Date64Type>(|x| { x.checked_mul(MILLISECONDS) }) } false => { - as_primitive_array::(array).try_unary::<_, Date64Type, _>( + array.as_primitive::().try_unary::<_, Date64Type, _>( |x| { x.mul_checked(MILLISECONDS) }, @@ -1734,17 +1732,17 @@ pub fn cast_with_options( cast_reinterpret_arrays::(array) } (Timestamp(TimeUnit::Microsecond, _), Date64) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Date64Type>(|x| x / (MICROSECONDS / MILLISECONDS)), )), (Timestamp(TimeUnit::Nanosecond, _), Date64) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, Date64Type>(|x| x / (NANOSECONDS / MILLISECONDS)), )), (Timestamp(TimeUnit::Second, tz), Time64(TimeUnit::Microsecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { 
Ok(time_to_time64us(as_time_res_with_timezone::< TimestampSecondType, @@ -1755,7 +1753,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Second, tz), Time64(TimeUnit::Nanosecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { Ok(time_to_time64ns(as_time_res_with_timezone::< TimestampSecondType, @@ -1766,7 +1764,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Millisecond, tz), Time64(TimeUnit::Microsecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { Ok(time_to_time64us(as_time_res_with_timezone::< TimestampMillisecondType, @@ -1777,7 +1775,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Millisecond, tz), Time64(TimeUnit::Nanosecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { Ok(time_to_time64ns(as_time_res_with_timezone::< TimestampMillisecondType, @@ -1788,7 +1786,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Microsecond, tz), Time64(TimeUnit::Microsecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { Ok(time_to_time64us(as_time_res_with_timezone::< TimestampMicrosecondType, @@ -1799,7 +1797,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Microsecond, tz), Time64(TimeUnit::Nanosecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { Ok(time_to_time64ns(as_time_res_with_timezone::< TimestampMicrosecondType, @@ -1810,7 +1808,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Nanosecond, tz), Time64(TimeUnit::Microsecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { Ok(time_to_time64us(as_time_res_with_timezone::< TimestampNanosecondType, @@ -1821,7 +1819,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Nanosecond, tz), Time64(TimeUnit::Nanosecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { Ok(time_to_time64ns(as_time_res_with_timezone::< TimestampNanosecondType, @@ -1832,7 +1830,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Second, tz), Time32(TimeUnit::Second)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time32SecondType, ArrowError>(|x| { Ok(time_to_time32s(as_time_res_with_timezone::< TimestampSecondType, @@ -1843,7 +1841,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Second, tz), Time32(TimeUnit::Millisecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { Ok(time_to_time32ms(as_time_res_with_timezone::< TimestampSecondType, @@ -1854,7 +1852,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Millisecond, tz), Time32(TimeUnit::Second)) => { let tz = 
tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time32SecondType, ArrowError>(|x| { Ok(time_to_time32s(as_time_res_with_timezone::< TimestampMillisecondType, @@ -1865,7 +1863,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Millisecond, tz), Time32(TimeUnit::Millisecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { Ok(time_to_time32ms(as_time_res_with_timezone::< TimestampMillisecondType, @@ -1876,7 +1874,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Microsecond, tz), Time32(TimeUnit::Second)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time32SecondType, ArrowError>(|x| { Ok(time_to_time32s(as_time_res_with_timezone::< TimestampMicrosecondType, @@ -1887,7 +1885,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Microsecond, tz), Time32(TimeUnit::Millisecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { Ok(time_to_time32ms(as_time_res_with_timezone::< TimestampMicrosecondType, @@ -1898,7 +1896,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Nanosecond, tz), Time32(TimeUnit::Second)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time32SecondType, ArrowError>(|x| { Ok(time_to_time32s(as_time_res_with_timezone::< TimestampNanosecondType, @@ -1909,7 +1907,7 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Nanosecond, tz), Time32(TimeUnit::Millisecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { Ok(time_to_time32ms(as_time_res_with_timezone::< TimestampNanosecondType, @@ -1919,38 +1917,38 @@ pub fn cast_with_options( } (Date64, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, TimestampSecondType>(|x| x / MILLISECONDS), )), (Date64, Timestamp(TimeUnit::Millisecond, None)) => { cast_reinterpret_arrays::(array) } (Date64, Timestamp(TimeUnit::Microsecond, None)) => Ok(Arc::new( - as_primitive_array::(array).unary::<_, TimestampMicrosecondType>( + array.as_primitive::().unary::<_, TimestampMicrosecondType>( |x| x * (MICROSECONDS / MILLISECONDS), ), )), (Date64, Timestamp(TimeUnit::Nanosecond, None)) => Ok(Arc::new( - as_primitive_array::(array).unary::<_, TimestampNanosecondType>( + array.as_primitive::().unary::<_, TimestampNanosecondType>( |x| x * (NANOSECONDS / MILLISECONDS), ), )), (Date32, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, TimestampSecondType>(|x| (x as i64) * SECONDS_IN_DAY), )), (Date32, Timestamp(TimeUnit::Millisecond, None)) => Ok(Arc::new( - as_primitive_array::(array).unary::<_, TimestampMillisecondType>( + array.as_primitive::().unary::<_, TimestampMillisecondType>( |x| (x as i64) * MILLISECONDS_IN_DAY, ), )), (Date32, Timestamp(TimeUnit::Microsecond, None)) => Ok(Arc::new( - as_primitive_array::(array).unary::<_, TimestampMicrosecondType>( + array.as_primitive::().unary::<_, TimestampMicrosecondType>( |x| (x as i64) * 
MICROSECONDS_IN_DAY, ), )), (Date32, Timestamp(TimeUnit::Nanosecond, None)) => Ok(Arc::new( - as_primitive_array::(array) + array.as_primitive::() .unary::<_, TimestampNanosecondType>(|x| (x as i64) * NANOSECONDS_IN_DAY), )), (Int64, Duration(TimeUnit::Second)) => { @@ -3736,7 +3734,7 @@ mod tests { let result = cast(&array, &DataType::Decimal128(2, 2)); assert!(result.is_ok()); let array = result.unwrap(); - let array: &Decimal128Array = as_primitive_array(&array); + let array: &Decimal128Array = array.as_primitive(); let err = array.validate_decimal_precision(2); assert_eq!("Invalid argument error: 12345600 is too large to store in a Decimal128 of precision 2. Max is 99", err.unwrap_err().to_string()); @@ -4306,7 +4304,7 @@ mod tests { let casted_array = cast(&array, &DataType::Decimal128(3, 1)); assert!(casted_array.is_ok()); let array = casted_array.unwrap(); - let array: &Decimal128Array = as_primitive_array(&array); + let array: &Decimal128Array = array.as_primitive(); let err = array.validate_decimal_precision(3); assert_eq!("Invalid argument error: 1000 is too large to store in a Decimal128 of precision 3. Max is 999", err.unwrap_err().to_string()); @@ -4316,7 +4314,7 @@ mod tests { let casted_array = cast(&array, &DataType::Decimal128(3, 1)); assert!(casted_array.is_ok()); let array = casted_array.unwrap(); - let array: &Decimal128Array = as_primitive_array(&array); + let array: &Decimal128Array = array.as_primitive(); let err = array.validate_decimal_precision(3); assert_eq!("Invalid argument error: 1000 is too large to store in a Decimal128 of precision 3. Max is 999", err.unwrap_err().to_string()); @@ -4475,7 +4473,7 @@ mod tests { let casted_array = cast(&array, &DataType::Decimal256(3, 1)); assert!(casted_array.is_ok()); let array = casted_array.unwrap(); - let array: &Decimal256Array = as_primitive_array(&array); + let array: &Decimal256Array = array.as_primitive(); let err = array.validate_decimal_precision(3); assert_eq!("Invalid argument error: 1000 is too large to store in a Decimal256 of precision 3. 
Max is 999", err.unwrap_err().to_string()); @@ -4603,14 +4601,14 @@ mod tests { ) .unwrap(); assert_eq!(5, b.len()); - let arr = b.as_any().downcast_ref::().unwrap(); + let arr = b.as_list::(); assert_eq!(&[0, 1, 2, 3, 4, 5], arr.value_offsets()); assert_eq!(1, arr.value_length(0)); assert_eq!(1, arr.value_length(1)); assert_eq!(1, arr.value_length(2)); assert_eq!(1, arr.value_length(3)); assert_eq!(1, arr.value_length(4)); - let c = as_primitive_array::(arr.values()); + let c = arr.values().as_primitive::(); assert_eq!(5, c.value(0)); assert_eq!(6, c.value(1)); assert_eq!(7, c.value(2)); @@ -4628,7 +4626,7 @@ mod tests { .unwrap(); assert_eq!(5, b.len()); assert_eq!(1, b.null_count()); - let arr = b.as_any().downcast_ref::().unwrap(); + let arr = b.as_list::(); assert_eq!(&[0, 1, 2, 3, 4, 5], arr.value_offsets()); assert_eq!(1, arr.value_length(0)); assert_eq!(1, arr.value_length(1)); @@ -4636,7 +4634,7 @@ mod tests { assert_eq!(1, arr.value_length(3)); assert_eq!(1, arr.value_length(4)); - let c = as_primitive_array::(arr.values()); + let c = arr.values().as_primitive::(); assert_eq!(1, c.null_count()); assert_eq!(5, c.value(0)); assert!(!c.is_valid(1)); @@ -4657,13 +4655,13 @@ mod tests { .unwrap(); assert_eq!(4, b.len()); assert_eq!(1, b.null_count()); - let arr = b.as_any().downcast_ref::().unwrap(); + let arr = b.as_list::(); assert_eq!(&[0, 1, 2, 3, 4], arr.value_offsets()); assert_eq!(1, arr.value_length(0)); assert_eq!(1, arr.value_length(1)); assert_eq!(1, arr.value_length(2)); assert_eq!(1, arr.value_length(3)); - let c = as_primitive_array::(arr.values()); + let c = arr.values().as_primitive::(); assert_eq!(1, c.null_count()); assert_eq!(7.0, c.value(0)); assert_eq!(8.0, c.value(1)); @@ -4802,7 +4800,7 @@ mod tests { assert_eq!(2, array.value_length(2)); // expect 4 nulls: negative numbers and overflow - let u16arr = as_primitive_array::(array.values()); + let u16arr = array.values().as_primitive::(); assert_eq!(4, u16arr.null_count()); // expect 4 nulls: negative numbers and overflow @@ -6946,7 +6944,7 @@ mod tests { let expected = $ARR_TYPE::from(vec![None; 6]); let cast_type = DataType::$DATATYPE; let cast_array = cast(&array, &cast_type).expect("cast failed"); - let cast_array = as_primitive_array::<$TYPE>(&cast_array); + let cast_array = cast_array.as_primitive::<$TYPE>(); assert_eq!(cast_array.data_type(), &cast_type); assert_eq!(cast_array, &expected); } @@ -7439,7 +7437,7 @@ mod tests { ); let casted_array = cast(&array, &output_type).unwrap(); - let decimal_arr = as_primitive_array::(&casted_array); + let decimal_arr = casted_array.as_primitive::(); assert_eq!("1123450", decimal_arr.value_as_string(0)); assert_eq!("2123460", decimal_arr.value_as_string(1)); @@ -7456,7 +7454,7 @@ mod tests { ])) as ArrayRef; let casted_array = cast(&array, &decimal_type).unwrap(); - let decimal_arr = as_primitive_array::(&casted_array); + let decimal_arr = casted_array.as_primitive::(); assert_eq!("1123450", decimal_arr.value_as_string(0)); assert_eq!("2123450", decimal_arr.value_as_string(1)); @@ -7469,7 +7467,7 @@ mod tests { ])) as ArrayRef; let casted_array = cast(&array, &decimal_type).unwrap(); - let decimal_arr = as_primitive_array::(&casted_array); + let decimal_arr = casted_array.as_primitive::(); assert_eq!("1120", decimal_arr.value_as_string(0)); assert_eq!("2120", decimal_arr.value_as_string(1)); @@ -7492,7 +7490,7 @@ mod tests { ); let casted_array = cast(&array, &output_type).unwrap(); - let decimal_arr = as_primitive_array::(&casted_array); + let decimal_arr = 
casted_array.as_primitive::(); assert_eq!("1200", decimal_arr.value_as_string(0)); @@ -7507,7 +7505,7 @@ mod tests { ); let casted_array = cast(&array, &output_type).unwrap(); - let decimal_arr = as_primitive_array::(&casted_array); + let decimal_arr = casted_array.as_primitive::(); assert_eq!("1300", decimal_arr.value_as_string(0)); } @@ -7632,7 +7630,7 @@ mod tests { assert!(can_cast_types(array.data_type(), &output_type)); let casted_array = cast(&array, &output_type).unwrap(); - let decimal_arr = as_primitive_array::(&casted_array); + let decimal_arr = casted_array.as_primitive::(); assert_eq!("123.45", decimal_arr.value_as_string(0)); assert_eq!("1.23", decimal_arr.value_as_string(1)); @@ -7653,7 +7651,7 @@ mod tests { assert!(can_cast_types(array.data_type(), &output_type)); let casted_array = cast(&array, &output_type).unwrap(); - let decimal_arr = as_primitive_array::(&casted_array); + let decimal_arr = casted_array.as_primitive::(); assert_eq!("123.450", decimal_arr.value_as_string(0)); assert_eq!("1.235", decimal_arr.value_as_string(1)); @@ -7751,7 +7749,7 @@ mod tests { fn test_cast_string_to_decimal128_overflow(overflow_array: ArrayRef) { let output_type = DataType::Decimal128(38, 2); let casted_array = cast(&overflow_array, &output_type).unwrap(); - let decimal_arr = as_primitive_array::(&casted_array); + let decimal_arr = casted_array.as_primitive::(); assert!(decimal_arr.is_null(0)); assert!(decimal_arr.is_null(1)); @@ -7797,7 +7795,7 @@ mod tests { fn test_cast_string_to_decimal256_overflow(overflow_array: ArrayRef) { let output_type = DataType::Decimal256(76, 2); let casted_array = cast(&overflow_array, &output_type).unwrap(); - let decimal_arr = as_primitive_array::(&casted_array); + let decimal_arr = casted_array.as_primitive::(); assert_eq!( "170141183460469231731687303715884105727.00", @@ -7916,7 +7914,7 @@ mod tests { ]); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap(); - let v = as_primitive_array::(b.as_ref()); + let v = b.as_primitive::(); assert_eq!(v.value(0), 946728000000000000); assert_eq!(v.value(1), 1608035696000000000); @@ -7926,7 +7924,7 @@ mod tests { &DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".to_string())), ) .unwrap(); - let v = as_primitive_array::(b.as_ref()); + let v = b.as_primitive::(); assert_eq!(v.value(0), 946728000000000000); assert_eq!(v.value(1), 1608035696000000000); @@ -7936,7 +7934,7 @@ mod tests { &DataType::Timestamp(TimeUnit::Millisecond, Some("+02:00".to_string())), ) .unwrap(); - let v = as_primitive_array::(b.as_ref()); + let v = b.as_primitive::(); assert_eq!(v.value(0), 946728000000); assert_eq!(v.value(1), 1608035696000); @@ -7991,7 +7989,7 @@ mod tests { let s = BinaryArray::from(vec![v1, v2]); let options = CastOptions { safe: true }; let array = cast_with_options(&s, &DataType::Utf8, &options).unwrap(); - let a = as_string_array(array.as_ref()); + let a = array.as_string::(); a.data().validate_full().unwrap(); assert_eq!(a.null_count(), 1); diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 6e06a0e39dc0..c8025f000eab 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -258,10 +258,10 @@ fn make_formatter<'a>( array => array_format(array, options), DataType::Null => array_format(as_null_array(array), options), DataType::Boolean => array_format(as_boolean_array(array), options), - DataType::Utf8 => array_format(as_string_array(array), options), - DataType::LargeUtf8 => 
array_format(as_largestring_array(array), options), - DataType::Binary => array_format(as_generic_binary_array::(array), options), - DataType::LargeBinary => array_format(as_generic_binary_array::(array), options), + DataType::Utf8 => array_format(array.as_string::(), options), + DataType::LargeUtf8 => array_format(array.as_string::(), options), + DataType::Binary => array_format(array.as_binary::(), options), + DataType::LargeBinary => array_format(array.as_binary::(), options), DataType::FixedSizeBinary(_) => { let a = array.as_any().downcast_ref::().unwrap(); array_format(a, options) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 8b1cd2f79930..046bfafc4641 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -1146,7 +1146,7 @@ mod tests { use std::io::{Cursor, Write}; use tempfile::NamedTempFile; - use arrow_array::cast::as_boolean_array; + use arrow_array::cast::AsArray; use chrono::prelude::*; #[test] @@ -2059,14 +2059,14 @@ mod tests { assert_eq!(b.num_rows(), 4); assert_eq!(b.num_columns(), 2); - let c = as_boolean_array(b.column(0)); + let c = b.column(0).as_boolean(); assert_eq!(c.null_count(), 1); assert!(c.value(0)); assert!(!c.value(1)); assert!(c.is_null(2)); assert!(!c.value(3)); - let c = as_boolean_array(b.column(1)); + let c = b.column(1).as_boolean(); assert_eq!(c.null_count(), 1); assert!(!c.value(0)); assert!(c.value(1)); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index b57692749878..2d859f608387 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -2024,7 +2024,7 @@ mod tests { ); let sliced = array.slice(1, 2); - let read_sliced: &UInt32Array = as_primitive_array(&sliced); + let read_sliced: &UInt32Array = sliced.as_primitive(); assert_eq!( vec![Some(2), Some(3)], read_sliced.iter().collect::>() @@ -2044,7 +2044,7 @@ mod tests { let mut reader = StreamReader::try_new(&outbuf[..], None).expect("new reader"); let read_batch = reader.next().unwrap().expect("read batch"); - let read_array: &UInt32Array = as_primitive_array(read_batch.column(0)); + let read_array: &UInt32Array = read_batch.column(0).as_primitive(); assert_eq!( vec![Some(2), Some(3)], read_array.iter().collect::>() diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index 57bec9ee49c0..2e5055bf149e 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -357,10 +357,7 @@ mod tests { use super::*; use crate::reader::infer_json_schema; use crate::ReaderBuilder; - use arrow_array::cast::{ - as_boolean_array, as_largestring_array, as_list_array, as_map_array, - as_primitive_array, as_string_array, as_struct_array, - }; + use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; use arrow_array::Array; use arrow_buffer::ArrowNativeType; @@ -431,29 +428,29 @@ mod tests { let batches = do_read(buf, 1024, false, schema); assert_eq!(batches.len(), 1); - let col1 = as_primitive_array::(batches[0].column(0)); + let col1 = batches[0].column(0).as_primitive::(); assert_eq!(col1.null_count(), 2); assert_eq!(col1.values(), &[1, 2, 2, 2, 0, 0]); assert!(col1.is_null(4)); assert!(col1.is_null(5)); - let col2 = as_primitive_array::(batches[0].column(1)); + let col2 = batches[0].column(1).as_primitive::(); assert_eq!(col2.null_count(), 0); assert_eq!(col2.values(), &[2, 4, 6, 5, 4, 7]); - let col3 = as_boolean_array(batches[0].column(2)); + let col3 = batches[0].column(2).as_boolean(); assert_eq!(col3.null_count(), 4); assert!(col3.value(0)); assert!(!col3.is_null(0)); assert!(!col3.value(1)); 
assert!(!col3.is_null(1)); - let col4 = as_primitive_array::(batches[0].column(3)); + let col4 = batches[0].column(3).as_primitive::(); assert_eq!(col4.null_count(), 3); assert!(col4.is_null(3)); assert_eq!(col4.values(), &[1, 2, 45, 0, 0, 0]); - let col5 = as_primitive_array::(batches[0].column(4)); + let col5 = batches[0].column(4).as_primitive::(); assert_eq!(col5.null_count(), 5); assert!(col5.is_null(0)); assert!(col5.is_null(2)); @@ -480,7 +477,7 @@ mod tests { let batches = do_read(buf, 1024, false, schema); assert_eq!(batches.len(), 1); - let col1 = as_string_array(batches[0].column(0)); + let col1 = batches[0].column(0).as_string::(); assert_eq!(col1.null_count(), 2); assert_eq!(col1.value(0), "1"); assert_eq!(col1.value(1), "hello"); @@ -488,7 +485,7 @@ mod tests { assert!(col1.is_null(3)); assert!(col1.is_null(4)); - let col2 = as_largestring_array(batches[0].column(1)); + let col2 = batches[0].column(1).as_string::(); assert_eq!(col2.null_count(), 1); assert_eq!(col2.value(0), "2"); assert_eq!(col2.value(1), "shoo"); @@ -537,41 +534,41 @@ mod tests { let batches = do_read(buf, 1024, false, schema); assert_eq!(batches.len(), 1); - let list = as_list_array(batches[0].column(0).as_ref()); + let list = batches[0].column(0).as_list::(); assert_eq!(list.len(), 3); assert_eq!(list.value_offsets(), &[0, 0, 2, 2]); assert_eq!(list.null_count(), 1); assert!(list.is_null(2)); - let list_values = as_primitive_array::(list.values().as_ref()); + let list_values = list.values().as_primitive::(); assert_eq!(list_values.values(), &[5, 6]); - let nested = as_struct_array(batches[0].column(1).as_ref()); - let a = as_primitive_array::(nested.column(0).as_ref()); + let nested = batches[0].column(1).as_struct(); + let a = nested.column(0).as_primitive::(); assert_eq!(list.null_count(), 1); assert_eq!(a.values(), &[1, 7, 0]); assert!(list.is_null(2)); - let b = as_primitive_array::(nested.column(1).as_ref()); + let b = nested.column(1).as_primitive::(); assert_eq!(b.null_count(), 2); assert_eq!(b.len(), 3); assert_eq!(b.value(0), 2); assert!(b.is_null(1)); assert!(b.is_null(2)); - let nested_list = as_struct_array(batches[0].column(2).as_ref()); + let nested_list = batches[0].column(2).as_struct(); assert_eq!(nested_list.len(), 3); assert_eq!(nested_list.null_count(), 1); assert!(nested_list.is_null(2)); - let list2 = as_list_array(nested_list.column(0).as_ref()); + let list2 = nested_list.column(0).as_list::(); assert_eq!(list2.len(), 3); assert_eq!(list2.null_count(), 1); assert_eq!(list2.value_offsets(), &[0, 2, 2, 2]); assert!(list2.is_null(2)); - let list2_values = as_struct_array(list2.values().as_ref()); + let list2_values = list2.values().as_struct(); - let c = as_primitive_array::(list2_values.column(0)); + let c = list2_values.column(0).as_primitive::(); assert_eq!(c.values(), &[3, 4]); } @@ -606,26 +603,26 @@ mod tests { let batches = do_read(buf, 1024, false, schema); assert_eq!(batches.len(), 1); - let nested = as_struct_array(batches[0].column(0).as_ref()); + let nested = batches[0].column(0).as_struct(); assert_eq!(nested.num_columns(), 1); - let a = as_primitive_array::(nested.column(0).as_ref()); + let a = nested.column(0).as_primitive::(); assert_eq!(a.null_count(), 0); assert_eq!(a.values(), &[1, 7]); - let nested_list = as_struct_array(batches[0].column(1).as_ref()); + let nested_list = batches[0].column(1).as_struct(); assert_eq!(nested_list.num_columns(), 1); assert_eq!(nested_list.null_count(), 0); - let list2 = as_list_array(nested_list.column(0).as_ref()); + let list2 = 
nested_list.column(0).as_list::(); assert_eq!(list2.value_offsets(), &[0, 2, 2]); assert_eq!(list2.null_count(), 0); - let child = as_struct_array(list2.values().as_ref()); + let child = list2.values().as_struct(); assert_eq!(child.num_columns(), 1); assert_eq!(child.len(), 2); assert_eq!(child.null_count(), 0); - let c = as_primitive_array::(child.column(0).as_ref()); + let c = child.column(0).as_primitive::(); assert_eq!(c.values(), &[5, 0]); assert_eq!(c.null_count(), 1); assert!(c.is_null(1)); @@ -650,15 +647,15 @@ mod tests { let batches = do_read(buf, 1024, false, schema); assert_eq!(batches.len(), 1); - let map = as_map_array(batches[0].column(0).as_ref()); - let map_keys = as_string_array(map.keys().as_ref()); - let map_values = as_list_array(map.values().as_ref()); + let map = batches[0].column(0).as_map(); + let map_keys = map.keys().as_string::(); + let map_values = map.values().as_list::(); assert_eq!(map.value_offsets(), &[0, 1, 3, 5]); let k: Vec<_> = map_keys.iter().map(|x| x.unwrap()).collect(); assert_eq!(&k, &["a", "a", "b", "c", "a"]); - let list_values = as_string_array(map_values.values().as_ref()); + let list_values = map_values.values().as_string::(); let lv: Vec<_> = list_values.iter().collect(); assert_eq!(&lv, &[Some("foo"), None, None, Some("baz")]); assert_eq!(map_values.value_offsets(), &[0, 2, 3, 3, 3, 4]); @@ -751,7 +748,7 @@ mod tests { let batches = do_read(buf, 1024, true, schema); assert_eq!(batches.len(), 1); - let col1 = as_string_array(batches[0].column(0)); + let col1 = batches[0].column(0).as_string::(); assert_eq!(col1.null_count(), 2); assert_eq!(col1.value(0), "1"); assert_eq!(col1.value(1), "2E0"); @@ -760,7 +757,7 @@ mod tests { assert!(col1.is_null(4)); assert!(col1.is_null(5)); - let col2 = as_string_array(batches[0].column(1)); + let col2 = batches[0].column(1).as_string::(); assert_eq!(col2.null_count(), 0); assert_eq!(col2.value(0), "2"); assert_eq!(col2.value(1), "4"); @@ -769,7 +766,7 @@ mod tests { assert_eq!(col2.value(4), "4e0"); assert_eq!(col2.value(5), "7"); - let col3 = as_string_array(batches[0].column(2)); + let col3 = batches[0].column(2).as_string::(); assert_eq!(col3.null_count(), 4); assert_eq!(col3.value(0), "true"); assert_eq!(col3.value(1), "false"); @@ -799,7 +796,7 @@ mod tests { let batches = do_read(buf, 1024, true, schema); assert_eq!(batches.len(), 1); - let col1 = as_primitive_array::(batches[0].column(0)); + let col1 = batches[0].column(0).as_primitive::(); assert_eq!(col1.null_count(), 2); assert!(col1.is_null(4)); assert!(col1.is_null(5)); @@ -808,14 +805,14 @@ mod tests { &[100, 200, 204, 1103420, 0, 0].map(T::Native::usize_as) ); - let col2 = as_primitive_array::(batches[0].column(1)); + let col2 = batches[0].column(1).as_primitive::(); assert_eq!(col2.null_count(), 0); assert_eq!( col2.values(), &[200, 400, 133700, 500, 4000, 123400].map(T::Native::usize_as) ); - let col3 = as_primitive_array::(batches[0].column(2)); + let col3 = batches[0].column(2).as_primitive::(); assert_eq!(col3.null_count(), 4); assert!(!col3.is_null(0)); assert!(!col3.is_null(1)); @@ -864,7 +861,7 @@ mod tests { TimeUnit::Nanosecond => 1, }; - let col1 = as_primitive_array::(batches[0].column(0)); + let col1 = batches[0].column(0).as_primitive::(); assert_eq!(col1.null_count(), 4); assert!(col1.is_null(2)); assert!(col1.is_null(3)); @@ -872,7 +869,7 @@ mod tests { assert!(col1.is_null(5)); assert_eq!(col1.values(), &[1, 2, 0, 0, 0, 0].map(T::Native::usize_as)); - let col2 = as_primitive_array::(batches[0].column(1)); + let col2 = 
batches[0].column(1).as_primitive::(); assert_eq!(col2.null_count(), 1); assert!(col2.is_null(5)); assert_eq!( @@ -887,7 +884,7 @@ mod tests { ] ); - let col3 = as_primitive_array::(batches[0].column(2)); + let col3 = batches[0].column(2).as_primitive::(); assert_eq!(col3.null_count(), 0); assert_eq!( col3.values(), @@ -901,7 +898,7 @@ mod tests { ] ); - let col4 = as_primitive_array::(batches[0].column(3)); + let col4 = batches[0].column(3).as_primitive::(); assert_eq!(col4.null_count(), 0); assert_eq!( @@ -957,7 +954,7 @@ mod tests { let batches = do_read(buf, 1024, true, schema); assert_eq!(batches.len(), 1); - let col1 = as_primitive_array::(batches[0].column(0)); + let col1 = batches[0].column(0).as_primitive::(); assert_eq!(col1.null_count(), 4); assert!(col1.is_null(2)); assert!(col1.is_null(3)); @@ -965,7 +962,7 @@ mod tests { assert!(col1.is_null(5)); assert_eq!(col1.values(), &[1, 2, 0, 0, 0, 0].map(T::Native::usize_as)); - let col2 = as_primitive_array::(batches[0].column(1)); + let col2 = batches[0].column(1).as_primitive::(); assert_eq!(col2.null_count(), 1); assert!(col2.is_null(5)); assert_eq!( @@ -981,7 +978,7 @@ mod tests { .map(T::Native::usize_as) ); - let col3 = as_primitive_array::(batches[0].column(2)); + let col3 = batches[0].column(2).as_primitive::(); assert_eq!(col3.null_count(), 0); assert_eq!( col3.values(), diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index 5d86f9a578c2..8e33613886f1 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -1844,10 +1844,7 @@ impl Iterator for Reader { #[allow(deprecated)] mod tests { use super::*; - use arrow_array::cast::{ - as_boolean_array, as_dictionary_array, as_primitive_array, as_string_array, - as_struct_array, - }; + use arrow_array::cast::AsArray; use arrow_buffer::{ArrowNativeType, ToByteSlice}; use arrow_schema::DataType::{Dictionary, List}; use flate2::read::GzDecoder; @@ -2133,20 +2130,12 @@ mod tests { let d = schema.column_with_name("d").unwrap(); assert_eq!(&DataType::Utf8, d.1.data_type()); - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); + let aa = batch.column(a.0).as_primitive::(); assert_eq!(1, aa.value(0)); assert_eq!(-10, aa.value(1)); assert_eq!(1627668684594000000, aa.value(2)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::() - .unwrap(); - let bb = as_primitive_array::(bb.values()); + let bb = batch.column(b.0).as_list::(); + let bb = bb.values().as_primitive::(); assert_eq!(9, bb.len()); assert_eq!(2.0, bb.value(0)); assert_eq!(-6.1, bb.value(5)); @@ -2157,7 +2146,7 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - let cc = as_boolean_array(cc.values()); + let cc = cc.values().as_boolean(); assert_eq!(6, cc.len()); assert!(!cc.value(0)); assert!(!cc.value(4)); @@ -2271,7 +2260,7 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - let bb = as_primitive_array::(bb.values()); + let bb = bb.values().as_primitive::(); assert_eq!(10, bb.len()); assert_eq!(4.0, bb.value(9)); @@ -2285,7 +2274,7 @@ mod tests { *cc.data().buffers()[0], Buffer::from_slice_ref([0i32, 2, 2, 4, 5]) ); - let cc = as_boolean_array(cc.values()); + let cc = cc.values().as_boolean(); let cc_expected = BooleanArray::from(vec![ Some(false), Some(true), @@ -2306,7 +2295,7 @@ mod tests { Buffer::from_slice_ref([0i32, 1, 1, 2, 6]) ); - let dd = as_string_array(dd.values()); + let dd = dd.values().as_string::(); // values are 6 because a `d: null` is treated as a null slot // and a list's null slot can be omitted from the child (i.e. 
same offset) assert_eq!(6, dd.len()); @@ -2452,8 +2441,8 @@ mod tests { // compare list null buffers assert_eq!(read.nulls(), expected.nulls()); // build struct from list - let struct_array = as_struct_array(read.values()); - let expected_struct_array = as_struct_array(expected.values()); + let struct_array = read.values().as_struct(); + let expected_struct_array = expected.values().as_struct(); assert_eq!(7, struct_array.len()); assert_eq!(1, struct_array.null_count()); @@ -2767,14 +2756,13 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - let evs_list = as_dictionary_array::(evs_list.values()); + let evs_list = evs_list.values().as_dictionary::(); assert_eq!(6, evs_list.len()); assert!(evs_list.is_valid(1)); assert_eq!(DataType::Utf8, evs_list.value_type()); // dict from the events list - let dict_el = evs_list.values(); - let dict_el = dict_el.as_any().downcast_ref::().unwrap(); + let dict_el = evs_list.values().as_string::(); assert_eq!(3, dict_el.len()); assert_eq!("Elect Leader", dict_el.value(0)); assert_eq!("Do Ballot", dict_el.value(1)); @@ -2824,7 +2812,7 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - let evs_list = as_dictionary_array::(evs_list.values()); + let evs_list = evs_list.values().as_dictionary::(); assert_eq!(8, evs_list.len()); assert!(evs_list.is_valid(1)); assert_eq!(DataType::Utf8, evs_list.value_type()); diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 27ae3876441d..bbc04c9dc096 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -112,7 +112,8 @@ where T: ArrowPrimitiveType, T::Native: JsonSerializable, { - Ok(as_primitive_array::(array) + Ok(array + .as_primitive::() .iter() .map(|maybe_value| match maybe_value { Some(v) => v.into_json_value().unwrap_or(Value::Null), @@ -146,7 +147,8 @@ fn struct_array_to_jsonmap_array( pub fn array_to_json_array(array: &ArrayRef) -> Result, ArrowError> { match array.data_type() { DataType::Null => Ok(iter::repeat(Value::Null).take(array.len()).collect()), - DataType::Boolean => Ok(as_boolean_array(array) + DataType::Boolean => Ok(array + .as_boolean() .iter() .map(|maybe_value| match maybe_value { Some(v) => v.into(), @@ -154,14 +156,16 @@ pub fn array_to_json_array(array: &ArrayRef) -> Result, ArrowError> { }) .collect()), - DataType::Utf8 => Ok(as_string_array(array) + DataType::Utf8 => Ok(array + .as_string::() .iter() .map(|maybe_value| match maybe_value { Some(v) => v.into(), None => Value::Null, }) .collect()), - DataType::LargeUtf8 => Ok(as_largestring_array(array) + DataType::LargeUtf8 => Ok(array + .as_string::() .iter() .map(|maybe_value| match maybe_value { Some(v) => v.into(), @@ -225,7 +229,7 @@ fn set_column_by_primitive_type( T: ArrowPrimitiveType, T::Native: JsonSerializable, { - let primitive_arr = as_primitive_array::(array); + let primitive_arr = array.as_primitive::(); rows.iter_mut() .zip(primitive_arr.iter()) @@ -369,7 +373,7 @@ fn set_column_for_json_rows( ))); } - let keys = as_string_array(keys); + let keys = keys.as_string::(); let values = array_to_json_array(values)?; let mut kv = keys.iter().zip(values.into_iter()); diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index eb672e769ac3..0f9414378c4a 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -829,14 +829,8 @@ pub fn eq_dyn_binary_scalar( right: &[u8], ) -> Result { match left.data_type() { - DataType::Binary => { - let left = as_generic_binary_array::(left); - eq_binary_scalar(left, right) - } - DataType::LargeBinary => { - let left = 
as_generic_binary_array::(left); - eq_binary_scalar(left, right) - } + DataType::Binary => eq_binary_scalar(left.as_binary::(), right), + DataType::LargeBinary => eq_binary_scalar(left.as_binary::(), right), _ => Err(ArrowError::ComputeError( "eq_dyn_binary_scalar only supports Binary or LargeBinary arrays".to_string(), )), @@ -850,14 +844,8 @@ pub fn neq_dyn_binary_scalar( right: &[u8], ) -> Result { match left.data_type() { - DataType::Binary => { - let left = as_generic_binary_array::(left); - neq_binary_scalar(left, right) - } - DataType::LargeBinary => { - let left = as_generic_binary_array::(left); - neq_binary_scalar(left, right) - } + DataType::Binary => neq_binary_scalar(left.as_binary::(), right), + DataType::LargeBinary => neq_binary_scalar(left.as_binary::(), right), _ => Err(ArrowError::ComputeError( "neq_dyn_binary_scalar only supports Binary or LargeBinary arrays" .to_string(), @@ -872,14 +860,8 @@ pub fn lt_dyn_binary_scalar( right: &[u8], ) -> Result { match left.data_type() { - DataType::Binary => { - let left = as_generic_binary_array::(left); - lt_binary_scalar(left, right) - } - DataType::LargeBinary => { - let left = as_generic_binary_array::(left); - lt_binary_scalar(left, right) - } + DataType::Binary => lt_binary_scalar(left.as_binary::(), right), + DataType::LargeBinary => lt_binary_scalar(left.as_binary::(), right), _ => Err(ArrowError::ComputeError( "lt_dyn_binary_scalar only supports Binary or LargeBinary arrays".to_string(), )), @@ -893,14 +875,8 @@ pub fn lt_eq_dyn_binary_scalar( right: &[u8], ) -> Result { match left.data_type() { - DataType::Binary => { - let left = as_generic_binary_array::(left); - lt_eq_binary_scalar(left, right) - } - DataType::LargeBinary => { - let left = as_generic_binary_array::(left); - lt_eq_binary_scalar(left, right) - } + DataType::Binary => lt_eq_binary_scalar(left.as_binary::(), right), + DataType::LargeBinary => lt_eq_binary_scalar(left.as_binary::(), right), _ => Err(ArrowError::ComputeError( "lt_eq_dyn_binary_scalar only supports Binary or LargeBinary arrays" .to_string(), @@ -915,14 +891,8 @@ pub fn gt_dyn_binary_scalar( right: &[u8], ) -> Result { match left.data_type() { - DataType::Binary => { - let left = as_generic_binary_array::(left); - gt_binary_scalar(left, right) - } - DataType::LargeBinary => { - let left = as_generic_binary_array::(left); - gt_binary_scalar(left, right) - } + DataType::Binary => gt_binary_scalar(left.as_binary::(), right), + DataType::LargeBinary => gt_binary_scalar(left.as_binary::(), right), _ => Err(ArrowError::ComputeError( "gt_dyn_binary_scalar only supports Binary or LargeBinary arrays".to_string(), )), @@ -936,14 +906,8 @@ pub fn gt_eq_dyn_binary_scalar( right: &[u8], ) -> Result { match left.data_type() { - DataType::Binary => { - let left = as_generic_binary_array::(left); - gt_eq_binary_scalar(left, right) - } - DataType::LargeBinary => { - let left = as_generic_binary_array::(left); - gt_eq_binary_scalar(left, right) - } + DataType::Binary => gt_eq_binary_scalar(left.as_binary::(), right), + DataType::LargeBinary => gt_eq_binary_scalar(left.as_binary::(), right), _ => Err(ArrowError::ComputeError( "gt_eq_dyn_binary_scalar only supports Binary or LargeBinary arrays" .to_string(), @@ -967,12 +931,10 @@ pub fn eq_dyn_utf8_scalar( )), }, DataType::Utf8 => { - let left = as_string_array(left); - eq_utf8_scalar(left, right) + eq_utf8_scalar(left.as_string::(), right) } DataType::LargeUtf8 => { - let left = as_largestring_array(left); - eq_utf8_scalar(left, right) + 
eq_utf8_scalar(left.as_string::(), right) } _ => Err(ArrowError::ComputeError( "eq_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays".to_string(), @@ -997,12 +959,10 @@ pub fn lt_dyn_utf8_scalar( )), }, DataType::Utf8 => { - let left = as_string_array(left); - lt_utf8_scalar(left, right) + lt_utf8_scalar(left.as_string::(), right) } DataType::LargeUtf8 => { - let left = as_largestring_array(left); - lt_utf8_scalar(left, right) + lt_utf8_scalar(left.as_string::(), right) } _ => Err(ArrowError::ComputeError( "lt_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays".to_string(), @@ -1027,12 +987,10 @@ pub fn gt_eq_dyn_utf8_scalar( )), }, DataType::Utf8 => { - let left = as_string_array(left); - gt_eq_utf8_scalar(left, right) + gt_eq_utf8_scalar(left.as_string::(), right) } DataType::LargeUtf8 => { - let left = as_largestring_array(left); - gt_eq_utf8_scalar(left, right) + gt_eq_utf8_scalar(left.as_string::(), right) } _ => Err(ArrowError::ComputeError( "gt_eq_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays".to_string(), @@ -1057,12 +1015,10 @@ pub fn lt_eq_dyn_utf8_scalar( )), }, DataType::Utf8 => { - let left = as_string_array(left); - lt_eq_utf8_scalar(left, right) + lt_eq_utf8_scalar(left.as_string::(), right) } DataType::LargeUtf8 => { - let left = as_largestring_array(left); - lt_eq_utf8_scalar(left, right) + lt_eq_utf8_scalar(left.as_string::(), right) } _ => Err(ArrowError::ComputeError( "lt_eq_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays".to_string(), @@ -1087,12 +1043,10 @@ pub fn gt_dyn_utf8_scalar( )), }, DataType::Utf8 => { - let left = as_string_array(left); - gt_utf8_scalar(left, right) + gt_utf8_scalar(left.as_string::(), right) } DataType::LargeUtf8 => { - let left = as_largestring_array(left); - gt_utf8_scalar(left, right) + gt_utf8_scalar(left.as_string::(), right) } _ => Err(ArrowError::ComputeError( "gt_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays".to_string(), @@ -1117,12 +1071,10 @@ pub fn neq_dyn_utf8_scalar( )), }, DataType::Utf8 => { - let left = as_string_array(left); - neq_utf8_scalar(left, right) + neq_utf8_scalar(left.as_string::(), right) } DataType::LargeUtf8 => { - let left = as_largestring_array(left); - neq_utf8_scalar(left, right) + neq_utf8_scalar(left.as_string::(), right) } _ => Err(ArrowError::ComputeError( "neq_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays".to_string(), @@ -1138,10 +1090,7 @@ pub fn eq_dyn_bool_scalar( right: bool, ) -> Result { let result = match left.data_type() { - DataType::Boolean => { - let left = as_boolean_array(left); - eq_bool_scalar(left, right) - } + DataType::Boolean => eq_bool_scalar(left.as_boolean(), right), _ => Err(ArrowError::ComputeError( "eq_dyn_bool_scalar only supports BooleanArray".to_string(), )), @@ -1156,10 +1105,7 @@ pub fn lt_dyn_bool_scalar( right: bool, ) -> Result { let result = match left.data_type() { - DataType::Boolean => { - let left = as_boolean_array(left); - lt_bool_scalar(left, right) - } + DataType::Boolean => lt_bool_scalar(left.as_boolean(), right), _ => Err(ArrowError::ComputeError( "lt_dyn_bool_scalar only supports BooleanArray".to_string(), )), @@ -1174,10 +1120,7 @@ pub fn gt_dyn_bool_scalar( right: bool, ) -> Result { let result = match left.data_type() { - DataType::Boolean => { - let left = as_boolean_array(left); - gt_bool_scalar(left, right) - } + DataType::Boolean => gt_bool_scalar(left.as_boolean(), right), _ => Err(ArrowError::ComputeError( "gt_dyn_bool_scalar only supports BooleanArray".to_string(), )), @@ -1192,10 +1135,7 @@ pub fn 
lt_eq_dyn_bool_scalar( right: bool, ) -> Result { let result = match left.data_type() { - DataType::Boolean => { - let left = as_boolean_array(left); - lt_eq_bool_scalar(left, right) - } + DataType::Boolean => lt_eq_bool_scalar(left.as_boolean(), right), _ => Err(ArrowError::ComputeError( "lt_eq_dyn_bool_scalar only supports BooleanArray".to_string(), )), @@ -1210,10 +1150,7 @@ pub fn gt_eq_dyn_bool_scalar( right: bool, ) -> Result { let result = match left.data_type() { - DataType::Boolean => { - let left = as_boolean_array(left); - gt_eq_bool_scalar(left, right) - } + DataType::Boolean => gt_eq_bool_scalar(left.as_boolean(), right), _ => Err(ArrowError::ComputeError( "gt_eq_dyn_bool_scalar only supports BooleanArray".to_string(), )), @@ -1228,10 +1165,7 @@ pub fn neq_dyn_bool_scalar( right: bool, ) -> Result { let result = match left.data_type() { - DataType::Boolean => { - let left = as_boolean_array(left); - neq_bool_scalar(left, right) - } + DataType::Boolean => neq_bool_scalar(left.as_boolean(), right), _ => Err(ArrowError::ComputeError( "neq_dyn_bool_scalar only supports BooleanArray".to_string(), )), @@ -1455,8 +1389,8 @@ fn cmp_primitive_array( where F: Fn(T::Native, T::Native) -> bool, { - let left_array = as_primitive_array::(left); - let right_array = as_primitive_array::(right); + let left_array = left.as_primitive::(); + let right_array = right.as_primitive::(); compare_op(left_array, right_array, op) } @@ -2036,7 +1970,7 @@ where { compare_op( left.downcast_dict::>().unwrap(), - as_primitive_array::(right), + right.as_primitive::(), op, ) } @@ -3046,7 +2980,7 @@ mod tests { fn test_primitive_array_eq_scalar_with_slice() { let a = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); let a = a.slice(1, 3); - let a: &Int32Array = as_primitive_array(&a); + let a: &Int32Array = a.as_primitive(); let a_eq = eq_scalar(a, 2).unwrap(); assert_eq!( a_eq, @@ -3848,7 +3782,7 @@ mod tests { vec![Some("hi"), None, Some("hello"), Some("world"), Some("")], ); let a = a.slice(1, 4); - let a = as_string_array(&a); + let a = a.as_string::(); let a_eq = eq_utf8_scalar(a, "hello").unwrap(); assert_eq!( a_eq, diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 0f248ee637b0..ab6460e835f9 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -489,7 +489,7 @@ where { // create tuples that are used for sorting let valids = { - let values = as_primitive_array::(values); + let values = values.as_primitive::(); value_indices .into_iter() .map(|index| (index, values.value(index as usize))) @@ -1043,7 +1043,7 @@ pub struct SortColumn { /// # use std::sync::Arc; /// # use arrow_array::{ArrayRef, StringArray, PrimitiveArray}; /// # use arrow_array::types::Int64Type; -/// # use arrow_array::cast::as_primitive_array; +/// # use arrow_array::cast::AsArray; /// # use arrow_ord::sort::{SortColumn, SortOptions, lexsort}; /// /// let sorted_columns = lexsort(&vec![ @@ -1072,7 +1072,7 @@ pub struct SortColumn { /// }, /// ], None).unwrap(); /// -/// assert_eq!(as_primitive_array::(&sorted_columns[0]).value(1), -64); +/// assert_eq!(sorted_columns[0].as_primitive::().value(1), -64); /// assert!(sorted_columns[0].is_null(0)); /// ``` /// diff --git a/arrow-row/src/dictionary.rs b/arrow-row/src/dictionary.rs index bacc116cade7..273b7439d0d1 100644 --- a/arrow-row/src/dictionary.rs +++ b/arrow-row/src/dictionary.rs @@ -45,11 +45,11 @@ pub fn compute_dictionary_mapping( interner.intern(iter) } DataType::Utf8 => { - let iter = as_string_array(values).iter().map(|x| x.map(|x| x.as_bytes())); + 
let iter = values.as_string::().iter().map(|x| x.map(|x| x.as_bytes())); interner.intern(iter) } DataType::LargeUtf8 => { - let iter = as_largestring_array(values).iter().map(|x| x.map(|x| x.as_bytes())); + let iter = values.as_string::().iter().map(|x| x.map(|x| x.as_bytes())); interner.intern(iter) } _ => unreachable!(), diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 2c1de68c1926..2f0defe5268a 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -52,7 +52,7 @@ //! # use std::sync::Arc; //! # use arrow_row::{RowConverter, SortField}; //! # use arrow_array::{ArrayRef, Int32Array, StringArray}; -//! # use arrow_array::cast::{as_primitive_array, as_string_array}; +//! # use arrow_array::cast::{AsArray, as_string_array}; //! # use arrow_array::types::Int32Type; //! # use arrow_schema::DataType; //! @@ -89,10 +89,10 @@ //! // Convert selection of rows back to arrays //! let selection = [rows.row(0), rows2.row(1), rows.row(2), rows2.row(0)]; //! let converted = converter.convert_rows(selection).unwrap(); -//! let c1 = as_primitive_array::(converted[0].as_ref()); +//! let c1 = converted[0].as_primitive::(); //! assert_eq!(c1.values(), &[-1, 4, 0, 3]); //! -//! let c2 = as_string_array(converted[1].as_ref()); +//! let c2 = converted[1].as_string::(); //! let c2_values: Vec<_> = c2.iter().flatten().collect(); //! assert_eq!(&c2_values, &["a", "f", "c", "e"]); //! ``` @@ -1078,13 +1078,13 @@ fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig) -> .iter() .zip(lengths.iter_mut()) .for_each(|(slice, length)| *length += variable::encoded_len(slice)), - DataType::Utf8 => as_string_array(array) + DataType::Utf8 => array.as_string::() .iter() .zip(lengths.iter_mut()) .for_each(|(slice, length)| { *length += variable::encoded_len(slice.map(|x| x.as_bytes())) }), - DataType::LargeUtf8 => as_largestring_array(array) + DataType::LargeUtf8 => array.as_string::() .iter() .zip(lengths.iter_mut()) .for_each(|(slice, length)| { @@ -1189,7 +1189,7 @@ fn encode_column( downcast_primitive_array! 
{ column => fixed::encode(out, column, opts), DataType::Null => {} - DataType::Boolean => fixed::encode(out, as_boolean_array(column), opts), + DataType::Boolean => fixed::encode(out, column.as_boolean(), opts), DataType::Binary => { variable::encode(out, as_generic_binary_array::(column).iter(), opts) } @@ -1198,12 +1198,12 @@ fn encode_column( } DataType::Utf8 => variable::encode( out, - as_string_array(column).iter().map(|x| x.map(|x| x.as_bytes())), + column.as_string::().iter().map(|x| x.map(|x| x.as_bytes())), opts, ), DataType::LargeUtf8 => variable::encode( out, - as_largestring_array(column) + column.as_string::() .iter() .map(|x| x.map(|x| x.as_bytes())), opts, diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 784bfa02014d..14fd5d9d1d32 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use arrow_array::builder::BooleanBufferBuilder; -use arrow_array::cast::{as_generic_binary_array, as_largestring_array, as_string_array}; +use arrow_array::cast::AsArray; use arrow_array::types::{ArrowDictionaryKeyType, ByteArrayType}; use arrow_array::*; use arrow_buffer::bit_util; @@ -349,16 +349,16 @@ fn filter_array( Ok(Arc::new(filter_boolean(values, predicate))) } DataType::Utf8 => { - Ok(Arc::new(filter_bytes(as_string_array(values), predicate))) + Ok(Arc::new(filter_bytes(values.as_string::(), predicate))) } DataType::LargeUtf8 => { - Ok(Arc::new(filter_bytes(as_largestring_array(values), predicate))) + Ok(Arc::new(filter_bytes(values.as_string::(), predicate))) } DataType::Binary => { - Ok(Arc::new(filter_bytes(as_generic_binary_array::(values), predicate))) + Ok(Arc::new(filter_bytes(values.as_binary::(), predicate))) } DataType::LargeBinary => { - Ok(Arc::new(filter_bytes(as_generic_binary_array::(values), predicate))) + Ok(Arc::new(filter_bytes(values.as_binary::(), predicate))) } DataType::Dictionary(_, _) => downcast_dictionary_array! 
{ values => Ok(Arc::new(filter_dict(values, predicate))), diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs index 95b694aba732..f274a3ebc30f 100644 --- a/arrow-select/src/interleave.rs +++ b/arrow-select/src/interleave.rs @@ -225,7 +225,7 @@ fn interleave_fallback( mod tests { use super::*; use arrow_array::builder::{Int32Builder, ListBuilder}; - use arrow_array::cast::{as_primitive_array, as_string_array}; + use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; use arrow_array::{Int32Array, ListArray, StringArray}; use arrow_schema::DataType; @@ -237,7 +237,7 @@ mod tests { let c = Int32Array::from_iter_values([8, 9, 10]); let values = interleave(&[&a, &b, &c], &[(0, 3), (0, 3), (2, 2), (2, 0), (1, 1)]).unwrap(); - let v = as_primitive_array::(&values); + let v = values.as_primitive::(); assert_eq!(v.values(), &[4, 4, 10, 8, 6]); } @@ -247,9 +247,7 @@ mod tests { let b = Int32Array::from_iter([Some(1), Some(4), None]); let values = interleave(&[&a, &b], &[(0, 1), (1, 2), (1, 2), (0, 3), (0, 2)]).unwrap(); - let v: Vec<_> = as_primitive_array::(&values) - .into_iter() - .collect(); + let v: Vec<_> = values.as_primitive::().into_iter().collect(); assert_eq!(&v, &[Some(2), None, None, Some(4), Some(3)]) } @@ -267,7 +265,7 @@ mod tests { let b = StringArray::from_iter_values(["hello", "world", "foo"]); let values = interleave(&[&a, &b], &[(0, 2), (0, 2), (1, 0), (1, 1), (0, 1)]).unwrap(); - let v = as_string_array(&values); + let v = values.as_string::(); let values: Vec<_> = v.into_iter().collect(); assert_eq!( &values, diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index ea0c8e3d526c..a1b9c0e3e183 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -124,7 +124,7 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result(&res); + let res = res.as_primitive::(); assert_eq!(&expected, res); } @@ -175,7 +175,7 @@ mod tests { Some(8), // None => keep it None, // true => None ]); - let res = as_primitive_array::(&res); + let res = res.as_primitive::(); assert_eq!(&expected, res) } @@ -201,7 +201,7 @@ mod tests { ]); let a = nullif(&s, &select).unwrap(); - let r: Vec<_> = as_string_array(&a).iter().collect(); + let r: Vec<_> = a.as_string::().iter().collect(); assert_eq!( r, vec![None, None, Some("world"), None, Some("b"), None, None] @@ -209,9 +209,9 @@ mod tests { let s = s.slice(2, 3); let select = select.slice(1, 3); - let select = as_boolean_array(select.as_ref()); + let select = select.as_boolean(); let a = nullif(s.as_ref(), select).unwrap(); - let r: Vec<_> = as_string_array(&a).iter().collect(); + let r: Vec<_> = a.as_string::().iter().collect(); assert_eq!(r, vec![None, Some("a"), None]); } @@ -456,7 +456,7 @@ mod tests { let comp = BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); let res = nullif(&a, &comp).unwrap(); - let res = as_primitive_array::(res.as_ref()); + let res = res.as_primitive::(); let expected = Int32Array::from(vec![Some(15), Some(7), None, Some(1), Some(9)]); assert_eq!(res, &expected); @@ -500,7 +500,7 @@ mod tests { for (a_offset, a_length) in a_slices { let a = a.slice(a_offset, a_length); - let a = as_primitive_array::(a.as_ref()); + let a = a.as_primitive::(); for i in 1..65 { let b_start_offset = rng.gen_range(0..i); @@ -510,7 +510,7 @@ mod tests { .map(|_| rng.gen_bool(0.5).then(|| rng.gen_bool(0.5))) .collect(); let b = b.slice(b_start_offset, a_length); - let b = as_boolean_array(b.as_ref()); + let b = b.as_boolean(); test_nullif(a, b); } 
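Editor's note: the recurring change in the hunks above swaps the free downcasting helpers (`as_primitive_array`, `as_string_array`, `as_boolean_array`, ...) for methods on the `AsArray` extension trait. A minimal sketch of the new style follows; it assumes the `arrow-array` crate from this series, and the column values are illustrative only.

```rust
use std::sync::Arc;

use arrow_array::cast::AsArray;
use arrow_array::types::Int32Type;
use arrow_array::{ArrayRef, Int32Array, StringArray};

fn main() {
    // Type-erased columns, as they typically come out of a RecordBatch
    let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)]));
    let strings: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));

    // Old style: as_primitive_array::<Int32Type>(&ints)
    // New style: the AsArray trait hangs the downcast off the array itself
    let ints = ints.as_primitive::<Int32Type>();
    assert_eq!(ints.value(0), 1);
    assert!(ints.is_null(1));

    // The offset parameter selects the variant: `i32` for StringArray,
    // `i64` for LargeStringArray
    let strings = strings.as_string::<i32>();
    assert_eq!(strings.value(2), "c");
}
```

The same pattern extends to `as_boolean()`, `as_binary::<O>()`, `as_list::<O>()`, `as_struct()`, `as_map()` and `as_dictionary::<K>()`, as used throughout this patch.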
diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 421157bdf041..83b58519fdb8 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -20,13 +20,13 @@ use std::sync::Arc; use arrow_array::builder::BufferBuilder; +use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; -use arrow_array::cast::{as_generic_binary_array, as_largestring_array, as_string_array}; use num::{ToPrimitive, Zero}; /// Take elements by index from [Array], creating a new [Array] from those indexes. @@ -128,24 +128,16 @@ where Ok(Arc::new(take_boolean(values, indices)?)) } DataType::Utf8 => { - Ok(Arc::new(take_bytes(as_string_array(values), indices)?)) + Ok(Arc::new(take_bytes(values.as_string::(), indices)?)) } DataType::LargeUtf8 => { - Ok(Arc::new(take_bytes(as_largestring_array(values), indices)?)) + Ok(Arc::new(take_bytes(values.as_string::(), indices)?)) } DataType::List(_) => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(take_list::<_, Int32Type>(values, indices)?)) + Ok(Arc::new(take_list::<_, Int32Type>(values.as_list(), indices)?)) } DataType::LargeList(_) => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(take_list::<_, Int64Type>(values, indices)?)) + Ok(Arc::new(take_list::<_, Int64Type>(values.as_list(), indices)?)) } DataType::FixedSizeList(_, length) => { let values = values @@ -193,10 +185,10 @@ where t => unimplemented!("Take not supported for run type {:?}", t) } DataType::Binary => { - Ok(Arc::new(take_bytes(as_generic_binary_array::(values), indices)?)) + Ok(Arc::new(take_bytes(values.as_binary::(), indices)?)) } DataType::LargeBinary => { - Ok(Arc::new(take_bytes(as_generic_binary_array::(values), indices)?)) + Ok(Arc::new(take_bytes(values.as_binary::(), indices)?)) } DataType::FixedSizeBinary(size) => { let values = values @@ -969,7 +961,7 @@ where #[cfg(test)] mod tests { use super::*; - use arrow_array::{builder::*, cast::as_primitive_array}; + use arrow_array::builder::*; use arrow_schema::TimeUnit; fn test_take_decimal_arrays( @@ -2160,7 +2152,7 @@ mod tests { assert_eq!(take_out.run_ends().len(), 7); assert_eq!(take_out.run_ends().values(), &[1_i32, 3, 4, 5, 7]); - let take_out_values = as_primitive_array::(take_out.values()); + let take_out_values = take_out.values().as_primitive::(); assert_eq!(take_out_values.values(), &[2, 2, 2, 2, 1]); } diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index a48fc13409f1..f0c09a7ec4d8 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -214,7 +214,7 @@ pub fn bit_length(array: &dyn Array) -> Result { #[cfg(test)] mod tests { use super::*; - use arrow_array::cast::as_primitive_array; + use arrow_array::cast::AsArray; fn double_vec(v: Vec) -> Vec { [&v[..], &v[..]].concat() @@ -427,7 +427,7 @@ mod tests { let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]); let b = a.slice(1, 3); let result = length(b.as_ref()).unwrap(); - let result: &Int32Array = as_primitive_array(&result); + let result: &Int32Array = result.as_primitive(); let expected = Int32Array::from(vec![Some(1), Some(5), None]); assert_eq!(&expected, result); @@ -440,7 +440,7 @@ mod tests { let a = BinaryArray::from(value); let b = a.slice(1, 3); let result = length(b.as_ref()).unwrap(); - let result: &Int32Array = 
as_primitive_array(&result); + let result: &Int32Array = result.as_primitive(); let expected = Int32Array::from(vec![Some(1), Some(2), None]); assert_eq!(&expected, result); @@ -582,7 +582,7 @@ mod tests { let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]); let b = a.slice(1, 3); let result = bit_length(b.as_ref()).unwrap(); - let result: &Int32Array = as_primitive_array(&result); + let result: &Int32Array = result.as_primitive(); let expected = Int32Array::from(vec![Some(8), Some(40), None]); assert_eq!(&expected, result); @@ -595,7 +595,7 @@ mod tests { let a = BinaryArray::from(value); let b = a.slice(1, 3); let result = bit_length(b.as_ref()).unwrap(); - let result: &Int32Array = as_primitive_array(&result); + let result: &Int32Array = result.as_primitive(); let expected = Int32Array::from(vec![Some(0), Some(40), None]); assert_eq!(&expected, result); diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index e8ec699969bd..7b6c7d50cac3 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -71,13 +71,13 @@ macro_rules! dyn_function { pub fn $fn_name(left: &dyn Array, right: &dyn Array) -> Result { match (left.data_type(), right.data_type()) { (DataType::Utf8, DataType::Utf8) => { - let left = as_string_array(left); - let right = as_string_array(right); + let left = left.as_string::(); + let right = right.as_string::(); $fn_utf8(left, right) } (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = as_largestring_array(left); - let right = as_largestring_array(right); + let left = left.as_string::(); + let right = right.as_string::(); $fn_utf8(left, right) } #[cfg(feature = "dyn_cmp_dict")] @@ -139,11 +139,11 @@ pub fn $fn_name( ) -> Result { match left.data_type() { DataType::Utf8 => { - let left = as_string_array(left); + let left = left.as_string::(); $fn_scalar(left, right) } DataType::LargeUtf8 => { - let left = as_largestring_array(left); + let left = left.as_string::(); $fn_scalar(left, right) } DataType::Dictionary(_, _) => { diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 4b1251ebcd2b..40b09a976178 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -135,6 +135,25 @@ //! } //! ``` //! +//! To facilitate downcasting, the [`AsArray`](crate::array::AsArray) extension trait can be used +//! +//! ```rust +//! # use arrow::array::{Array, Float32Array, AsArray}; +//! # use arrow::array::StringArray; +//! # use arrow::datatypes::DataType; +//! # +//! fn impl_string(array: &StringArray) {} +//! fn impl_f32(array: &Float32Array) {} +//! +//! fn impl_dyn(array: &dyn Array) { +//! match array.data_type() { +//! DataType::Utf8 => impl_string(array.as_string()), +//! DataType::Float32 => impl_f32(array.as_primitive()), +//! _ => unimplemented!() +//! } +//! } +//! ``` +//! //! It is also common to want to write a function that returns one of a number of possible //! array implementations. [`ArrayRef`] is a type-alias for [`Arc`](array::Array) //! which is frequently used for this purpose @@ -207,7 +226,7 @@ //! //! ``` //! # use arrow::compute::gt_scalar; -//! # use arrow_array::cast::as_primitive_array; +//! # use arrow_array::cast::AsArray; //! # use arrow_array::Int32Array; //! # use arrow_array::types::Int32Type; //! # use arrow_select::filter::filter; @@ -216,7 +235,7 @@ //! let filtered = filter(&array, &predicate).unwrap(); //! //! let expected = Int32Array::from_iter(61..100); -//! assert_eq!(&expected, as_primitive_array::(&filtered)); +//! assert_eq!(&expected, filtered.as_primitive::()); //! ``` //! //! 
As well as some horizontal operations, such as: diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 7ead5fa61522..0956893a870d 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -335,10 +335,8 @@ mod tests { let col_c = struct_array.column_by_name("c").unwrap(); let col_c = col_c.as_any().downcast_ref::().unwrap(); assert_eq!(col_c.len(), size); - let col_c_values = col_c.values(); - assert!(col_c_values.len() > size); - // col_c_values should be a list - let col_c_list = as_list_array(col_c_values); + let col_c_list = col_c.values().as_list::(); + assert!(col_c_list.len() > size); // Its values should be FixedSizeBinary(6) let fsb = col_c_list.values(); assert_eq!(fsb.data_type(), &DataType::FixedSizeBinary(6)); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index e6693a6cff4a..2d867c9596c7 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -21,7 +21,7 @@ use std::collections::VecDeque; use std::io::Write; use std::sync::Arc; -use arrow_array::cast::as_primitive_array; +use arrow_array::cast::AsArray; use arrow_array::types::Decimal128Type; use arrow_array::{types, Array, ArrayRef, RecordBatch}; use arrow_schema::{DataType as ArrowDataType, IntervalUnit, SchemaRef}; @@ -400,7 +400,8 @@ fn write_leaf( } ArrowDataType::Decimal128(_, _) => { // use the int32 to represent the decimal with low precision - let array = as_primitive_array::(column) + let array = column + .as_primitive::() .unary::<_, types::Int32Type>(|v| v as i32); write_primitive(typed, array.values(), levels)? } @@ -444,7 +445,8 @@ fn write_leaf( } ArrowDataType::Decimal128(_, _) => { // use the int64 to represent the decimal with low precision - let array = as_primitive_array::(column) + let array = column + .as_primitive::() .unary::<_, types::Int64Type>(|v| v as i64); write_primitive(typed, array.values(), levels)? 
} diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 99fe650695a0..2d39284c763f 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -840,7 +840,7 @@ mod tests { use crate::file::page_index::index_reader; use crate::file::properties::WriterProperties; use arrow::error::Result as ArrowResult; - use arrow_array::cast::as_primitive_array; + use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; use arrow_array::{Array, ArrayRef, Int32Array, StringArray}; use futures::TryStreamExt; @@ -1355,14 +1355,14 @@ mod tests { // First batch should contain all rows assert_eq!(batch.num_rows(), 3); assert_eq!(batch.num_columns(), 3); - let col2 = as_primitive_array::(batch.column(2)); + let col2 = batch.column(2).as_primitive::(); assert_eq!(col2.values(), &[0, 1, 2]); let batch = &batches[1]; // Second batch should trigger the limit and only have one row assert_eq!(batch.num_rows(), 1); assert_eq!(batch.num_columns(), 3); - let col2 = as_primitive_array::(batch.column(2)); + let col2 = batch.column(2).as_primitive::(); assert_eq!(col2.values(), &[3]); let stream = ParquetRecordBatchStreamBuilder::new(test.clone()) @@ -1381,14 +1381,14 @@ mod tests { // First batch should contain one row assert_eq!(batch.num_rows(), 1); assert_eq!(batch.num_columns(), 3); - let col2 = as_primitive_array::(batch.column(2)); + let col2 = batch.column(2).as_primitive::(); assert_eq!(col2.values(), &[2]); let batch = &batches[1]; // Second batch should contain two rows assert_eq!(batch.num_rows(), 2); assert_eq!(batch.num_columns(), 3); - let col2 = as_primitive_array::(batch.column(2)); + let col2 = batch.column(2).as_primitive::(); assert_eq!(col2.values(), &[3, 4]); let stream = ParquetRecordBatchStreamBuilder::new(test.clone()) @@ -1407,7 +1407,7 @@ mod tests { // First batch should contain two rows assert_eq!(batch.num_rows(), 2); assert_eq!(batch.num_columns(), 3); - let col2 = as_primitive_array::(batch.column(2)); + let col2 = batch.column(2).as_primitive::(); assert_eq!(col2.values(), &[4, 5]); } From bcfe63ec5b0420b030e45091b8fc7c79e091646a Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Thu, 23 Mar 2023 15:53:56 -0500 Subject: [PATCH 0721/1411] fix: Specify content length for gcp copy request (#3921) * fix: Specify content length for gcp copy request * Include comment about native-tls/rust-tls Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/gcp/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 97f44446f82a..fe79a6e07ef2 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -437,6 +437,9 @@ impl GoogleCloudStorageClient { builder .bearer_auth(token) + // Needed if reqwest is compiled with native-tls instead of rustls-tls + // See https://github.com/apache/arrow-rs/pull/3921 + .header(header::CONTENT_LENGTH, 0) .send_retry(&self.retry_config) .await .map_err(|err| { From e4e6c67eedc69e4f402c66dacb5176880d5101bf Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 23 Mar 2023 21:04:48 +0000 Subject: [PATCH 0722/1411] Add OffsetBuffer::new (#3910) --- arrow-buffer/src/buffer/offset.rs | 39 +++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/arrow-buffer/src/buffer/offset.rs 
b/arrow-buffer/src/buffer/offset.rs index 808e43cbf453..ada290f09286 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -24,6 +24,22 @@ use std::ops::Deref; pub struct OffsetBuffer(ScalarBuffer); impl OffsetBuffer { + /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`] + /// + /// # Panics + /// + /// Panics if `buffer` is not a non-empty buffer containing + /// monotonically increasing values greater than zero + pub fn new(buffer: ScalarBuffer) -> Self { + assert!(!buffer.is_empty(), "offsets cannot be empty"); + assert!(buffer[0] > O::usize_as(0), "offsets must be greater than 0"); + assert!( + buffer.windows(2).all(|w| w[0] <= w[1]), + "offsets must be monotonically increasing" + ); + Self(buffer) + } + /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`] /// /// # Safety @@ -71,3 +87,26 @@ impl AsRef<[T]> for OffsetBuffer { self } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + #[should_panic(expected = "offsets cannot be empty")] + fn empty_offsets() { + OffsetBuffer::new(Vec::::new().into()); + } + + #[test] + #[should_panic(expected = "offsets must be greater than 0")] + fn negative_offsets() { + OffsetBuffer::new(vec![-1, 0, 1].into()); + } + + #[test] + #[should_panic(expected = "offsets must be monotonically increasing")] + fn non_monotonic_offsets() { + OffsetBuffer::new(vec![1, 2, 0].into()); + } +} From 6af2bf67582b52efdbe05a4902e68ad938dd14b7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 23 Mar 2023 21:22:33 +0000 Subject: [PATCH 0723/1411] Support timezones in CSV reader (#3841) (#3908) --- arrow-csv/src/reader/mod.rs | 192 +++++++++++++++++++++--------------- 1 file changed, 115 insertions(+), 77 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 046bfafc4641..262c057d4283 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -45,8 +45,9 @@ mod records; use arrow_array::builder::PrimitiveBuilder; use arrow_array::types::*; use arrow_array::*; -use arrow_cast::parse::{parse_decimal, Parser}; +use arrow_cast::parse::{parse_decimal, string_to_datetime, Parser}; use arrow_schema::*; +use chrono::{TimeZone, Utc}; use lazy_static::lazy_static; use regex::{Regex, RegexSet}; use std::fmt; @@ -56,6 +57,7 @@ use std::sync::Arc; use crate::map_csv_error; use crate::reader::records::{RecordDecoder, StringRecords}; +use arrow_array::timezone::Tz; use csv::StringRecord; lazy_static! 
{ @@ -677,33 +679,36 @@ fn parse( >( line_number, rows, i, None ), - DataType::Timestamp(TimeUnit::Second, _) => build_primitive_array::< - TimestampSecondType, - >( - line_number, rows, i, None - ), - DataType::Timestamp(TimeUnit::Millisecond, _) => { - build_primitive_array::( + DataType::Timestamp(TimeUnit::Second, tz) => { + build_timestamp_array::( line_number, rows, i, - None, + tz.as_deref(), ) } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - build_primitive_array::( + DataType::Timestamp(TimeUnit::Millisecond, tz) => { + build_timestamp_array::( line_number, rows, i, - None, + tz.as_deref(), ) } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - build_primitive_array::( + DataType::Timestamp(TimeUnit::Microsecond, tz) => { + build_timestamp_array::( line_number, rows, i, - None, + tz.as_deref(), + ) + } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => { + build_timestamp_array::( + line_number, + rows, + i, + tz.as_deref(), ) } DataType::Utf8 => Ok(Arc::new( @@ -871,6 +876,54 @@ fn build_primitive_array( .map(|e| Arc::new(e) as ArrayRef) } +fn build_timestamp_array( + line_number: usize, + rows: &StringRecords<'_>, + col_idx: usize, + timezone: Option<&str>, +) -> Result { + Ok(Arc::new(match timezone { + Some(timezone) => { + let tz: Tz = timezone.parse()?; + build_timestamp_array_impl::(line_number, rows, col_idx, &tz)? + .with_timezone(timezone) + } + None => build_timestamp_array_impl::(line_number, rows, col_idx, &Utc)?, + })) +} + +fn build_timestamp_array_impl( + line_number: usize, + rows: &StringRecords<'_>, + col_idx: usize, + timezone: &Tz, +) -> Result, ArrowError> { + rows.iter() + .enumerate() + .map(|(row_index, row)| { + let s = row.get(col_idx); + if s.is_empty() { + return Ok(None); + } + + let date = string_to_datetime(timezone, s).map_err(|e| { + ArrowError::ParseError(format!( + "Error parsing column {col_idx} at line {}: {}", + line_number + row_index, + e + )) + })?; + + Ok(Some(match T::UNIT { + TimeUnit::Second => date.timestamp(), + TimeUnit::Millisecond => date.timestamp_millis(), + TimeUnit::Microsecond => date.timestamp_micros(), + TimeUnit::Nanosecond => date.timestamp_nanos(), + })) + }) + .collect() +} + // parses a specific column (col_idx) into an Arrow Array. 
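Editor's note: the hunk above routes timestamp columns through `string_to_datetime` and the `Tz` type, so a `Timestamp` column that carries a timezone is parsed relative to that zone. A small sketch of the resulting behaviour, using the decoder API exercised by the tests in this patch; the column name, timezone string and sample values are illustrative.

```rust
use std::sync::Arc;

use arrow_array::cast::AsArray;
use arrow_array::types::TimestampMillisecondType;
use arrow_csv::ReaderBuilder;
use arrow_schema::{DataType, Field, Schema, TimeUnit};

fn main() {
    // One timestamp column, stored as milliseconds with a fixed-offset timezone
    let schema = Arc::new(Schema::new(vec![Field::new(
        "ts",
        DataType::Timestamp(TimeUnit::Millisecond, Some("+05:00".to_string())),
        true,
    )]));

    let csv = "1970-01-01T00:00:00Z\n1970-01-01T00:00:00+02:00\n";

    let mut decoder = ReaderBuilder::new().with_schema(schema).build_decoder();
    decoder.decode(csv.as_bytes()).unwrap();
    decoder.decode(&[]).unwrap(); // signal end of input

    let batch = decoder.flush().unwrap().unwrap();
    let col = batch.column(0).as_primitive::<TimestampMillisecondType>();

    // Values are stored as offsets from the UTC epoch regardless of the
    // display timezone declared in the schema
    assert_eq!(col.values(), &[0, -7_200_000]);
}
```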
fn build_boolean_array( line_number: usize, @@ -1147,7 +1200,6 @@ mod tests { use tempfile::NamedTempFile; use arrow_array::cast::AsArray; - use chrono::prelude::*; #[test] fn test_csv() { @@ -1686,75 +1738,61 @@ mod tests { ); } - #[test] - fn test_parse_timestamp_microseconds() { - assert_eq!( - parse_item::("1970-01-01T00:00:00Z").unwrap(), - 0 - ); - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2018, 11, 13).unwrap(), - NaiveTime::from_hms_nano_opt(17, 11, 10, 0).unwrap(), - ); - assert_eq!( - parse_item::("2018-11-13T17:11:10").unwrap(), - naive_datetime.timestamp_nanos() / 1000 - ); - assert_eq!( - parse_item::("2018-11-13 17:11:10").unwrap(), - naive_datetime.timestamp_nanos() / 1000 - ); - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2018, 11, 13).unwrap(), - NaiveTime::from_hms_nano_opt(17, 11, 10, 11000000).unwrap(), - ); - assert_eq!( - parse_item::("2018-11-13T17:11:10.011").unwrap(), - naive_datetime.timestamp_nanos() / 1000 - ); - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(1900, 2, 28).unwrap(), - NaiveTime::from_hms_nano_opt(12, 34, 56, 0).unwrap(), - ); - assert_eq!( - parse_item::("1900-02-28T12:34:56").unwrap(), - naive_datetime.timestamp_nanos() / 1000 - ); + fn test_parse_timestamp_impl( + timezone: Option, + expected: &[i64], + ) { + let csv = [ + "1970-01-01T00:00:00", + "1970-01-01T00:00:00Z", + "1970-01-01T00:00:00+02:00", + ] + .join("\n"); + let mut decoder = ReaderBuilder::new() + .with_schema(Arc::new(Schema::new(vec![Field::new( + "field", + DataType::Timestamp(T::UNIT, timezone.clone()), + true, + )]))) + .build_decoder(); + + let decoded = decoder.decode(csv.as_bytes()).unwrap(); + assert_eq!(decoded, csv.len()); + decoder.decode(&[]).unwrap(); + + let batch = decoder.flush().unwrap().unwrap(); + assert_eq!(batch.num_columns(), 1); + assert_eq!(batch.num_rows(), 3); + let col = batch.column(0).as_primitive::(); + assert_eq!(col.values(), expected); + assert_eq!(col.data_type(), &DataType::Timestamp(T::UNIT, timezone)); } #[test] - fn test_parse_timestamp_nanoseconds() { - assert_eq!( - parse_item::("1970-01-01T00:00:00Z").unwrap(), - 0 - ); - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2018, 11, 13).unwrap(), - NaiveTime::from_hms_nano_opt(17, 11, 10, 0).unwrap(), - ); - assert_eq!( - parse_item::("2018-11-13T17:11:10").unwrap(), - naive_datetime.timestamp_nanos() + fn test_parse_timestamp() { + test_parse_timestamp_impl::( + None, + &[0, 0, -7_200_000_000_000], ); - assert_eq!( - parse_item::("2018-11-13 17:11:10").unwrap(), - naive_datetime.timestamp_nanos() + test_parse_timestamp_impl::( + Some("+00:00".to_string()), + &[0, 0, -7_200_000_000_000], ); - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2018, 11, 13).unwrap(), - NaiveTime::from_hms_nano_opt(17, 11, 10, 11000000).unwrap(), + test_parse_timestamp_impl::( + Some("-05:00".to_string()), + &[18_000_000_000_000, 0, -7_200_000_000_000], ); - assert_eq!( - parse_item::("2018-11-13T17:11:10.011").unwrap(), - naive_datetime.timestamp_nanos() + test_parse_timestamp_impl::( + Some("-03".to_string()), + &[10_800_000_000, 0, -7_200_000_000], ); - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(1900, 2, 28).unwrap(), - NaiveTime::from_hms_nano_opt(12, 34, 56, 0).unwrap(), + test_parse_timestamp_impl::( + Some("-03".to_string()), + &[10_800_000, 0, -7_200_000], ); - assert_eq!( - parse_item::("1900-02-28T12:34:56").unwrap(), - naive_datetime.timestamp_nanos() + 
test_parse_timestamp_impl::( + Some("-03".to_string()), + &[10_800, 0, -7_200], ); } From de9a90b1e09a43a6e8d2d3f3375f02a755d0cde1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 23 Mar 2023 22:01:35 +0000 Subject: [PATCH 0724/1411] Cleanup uses of Array::data_ref (#3880) (#3918) * Cleanup uses of Array::data_ref (#3880) * Further cleanup and fixes --- arrow-arith/src/arithmetic.rs | 54 +-- arrow-arith/src/boolean.rs | 351 +++++++----------- arrow-array/src/array/dictionary_array.rs | 2 +- .../src/array/fixed_size_binary_array.rs | 2 +- .../src/array/fixed_size_list_array.rs | 2 +- arrow-array/src/array/list_array.rs | 2 +- arrow-array/src/array/null_array.rs | 2 +- arrow-ord/src/sort.rs | 21 +- arrow-select/src/filter.rs | 22 +- arrow/benches/mutable_array.rs | 3 +- arrow/tests/array_equal.rs | 6 +- .../src/arrow/array_reader/struct_array.rs | 2 +- 12 files changed, 170 insertions(+), 299 deletions(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index de4b0ccb8858..7d60d131bf52 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -103,14 +103,13 @@ fn math_checked_divide_op_on_iters( left: impl Iterator>, right: impl Iterator>, op: F, - len: usize, - null_bit_buffer: Option, + nulls: Option, ) -> Result, ArrowError> where T: ArrowNumericType, F: Fn(T::Native, T::Native) -> Result, { - let buffer = if null_bit_buffer.is_some() { + let buffer = if nulls.is_some() { let values = left.zip(right).map(|(left, right)| { if let (Some(l), Some(r)) = (left, right) { op(l, r) @@ -130,18 +129,7 @@ where unsafe { arrow_buffer::Buffer::try_from_trusted_len_iter(values) } }?; - let data = unsafe { - arrow_data::ArrayData::new_unchecked( - T::DATA_TYPE, - len, - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ) - }; - Ok(PrimitiveArray::::from(data)) + Ok(PrimitiveArray::new(T::DATA_TYPE, buffer.into(), nulls)) } /// Calculates the modulus operation `left % right` on two SIMD inputs. 
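Editor's note: this commit replaces the old `combine_option_bitmap` / `ArrayData::new_unchecked` plumbing with `NullBuffer::union` plus `PrimitiveArray::new` throughout the arithmetic kernels. A minimal illustration of the null-merging half, assuming the `arrow-buffer` API used in this patch:

```rust
use arrow_array::{Array, Int32Array};
use arrow_buffer::NullBuffer;

fn main() {
    let a = Int32Array::from(vec![Some(1), None, Some(3)]);
    let b = Int32Array::from(vec![Some(4), Some(5), None]);

    // The union is null wherever either input is null; this is what the
    // kernels now use instead of combine_option_bitmap
    let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap();
    assert_eq!(nulls.null_count(), 2);
    assert!(nulls.is_valid(0));
    assert!(nulls.is_null(1));
    assert!(nulls.is_null(2));
}
```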
@@ -284,20 +272,16 @@ where } // Create the combined `Bitmap` - let null_bit_buffer = arrow_data::bit_mask::combine_option_bitmap( - &[left.data_ref(), right.data_ref()], - left.len(), - ); + let nulls = arrow_buffer::NullBuffer::union(left.nulls(), right.nulls()); let lanes = T::lanes(); let buffer_size = left.len() * std::mem::size_of::(); let mut result = arrow_buffer::MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); - match &null_bit_buffer { + match &nulls { Some(b) => { - // combine_option_bitmap returns a slice or new buffer starting at 0 - let valid_chunks = b.bit_chunks(0, left.len()); + let valid_chunks = b.inner().bit_chunks(); // process data in chunks of 64 elements since we also get 64 bits of validity information at a time @@ -372,18 +356,7 @@ where } } - let data = unsafe { - arrow_data::ArrayData::new_unchecked( - T::DATA_TYPE, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.into()], - vec![], - ) - }; - Ok(PrimitiveArray::::from(data)) + Ok(PrimitiveArray::new(T::DATA_TYPE, result.into(), nulls)) } /// Applies $OP to $LEFT and $RIGHT which are two dictionaries which have (the same) key type $KT @@ -628,10 +601,7 @@ where ))); } - let null_bit_buffer = arrow_data::bit_mask::combine_option_bitmap( - &[left.data_ref(), right.data_ref()], - left.len(), - ); + let nulls = arrow_buffer::NullBuffer::union(left.nulls(), right.nulls()); // Safety justification: Since the inputs are valid Arrow arrays, all values are // valid indexes into the dictionary (which is verified during construction) @@ -653,13 +623,7 @@ where .take_iter_unchecked(right.keys_iter()) }; - math_checked_divide_op_on_iters( - left_iter, - right_iter, - op, - left.len(), - null_bit_buffer, - ) + math_checked_divide_op_on_iters(left_iter, right_iter, op, nulls) } #[cfg(feature = "dyn_arith_dict")] diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index 3e21c2f1b484..eaef1378258b 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -24,30 +24,55 @@ use arrow_array::*; use arrow_buffer::bit_util::ceil; -use arrow_buffer::buffer::{ - bitwise_bin_op_helper, bitwise_quaternary_op_helper, buffer_bin_and, buffer_bin_or, - buffer_unary_not, -}; -use arrow_buffer::{Buffer, MutableBuffer}; -use arrow_data::bit_mask::combine_option_bitmap; +use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper}; +use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; -/// Updates null buffer based on data buffer and null buffer of the operand at other side -/// in boolean AND kernel with Kleene logic. In short, because for AND kernel, null AND false -/// results false. So we cannot simply AND two null buffers. This function updates null buffer -/// of one side if other side is a false value. 
-pub(crate) fn build_null_buffer_for_and_kleene( - left_data: &ArrayData, - right_data: &ArrayData, -) -> Option { - let left_buffer = &left_data.buffers()[0]; - let right_buffer = &right_data.buffers()[0]; - - let left_null_buffer = left_data.nulls(); - let right_null_buffer = right_data.nulls(); - - match (left_null_buffer, right_null_buffer) { +/// Logical 'and' boolean values with Kleene logic +/// +/// # Behavior +/// +/// This function behaves as follows with nulls: +/// +/// * `true` and `null` = `null` +/// * `null` and `true` = `null` +/// * `false` and `null` = `false` +/// * `null` and `false` = `false` +/// * `null` and `null` = `null` +/// +/// In other words, in this context a null value really means \"unknown\", +/// and an unknown value 'and' false is always false. +/// For a different null behavior, see function \"and\". +/// +/// # Example +/// +/// ```rust +/// # use arrow_array::BooleanArray; +/// # use arrow_arith::boolean::and_kleene; +/// let a = BooleanArray::from(vec![Some(true), Some(false), None]); +/// let b = BooleanArray::from(vec![None, None, None]); +/// let and_ab = and_kleene(&a, &b).unwrap(); +/// assert_eq!(and_ab, BooleanArray::from(vec![None, Some(false), None])); +/// ``` +/// +/// # Fails +/// +/// If the operands have different lengths +pub fn and_kleene( + left: &BooleanArray, + right: &BooleanArray, +) -> Result { + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform bitwise operation on arrays of different length".to_string(), + )); + } + + let left_values = left.values(); + let right_values = right.values(); + + let buffer = match (left.nulls(), right.nulls()) { (None, None) => None, (Some(left_null_buffer), None) => { // The right side has no null values. @@ -57,9 +82,9 @@ pub(crate) fn build_null_buffer_for_and_kleene( Some(bitwise_bin_op_helper( left_null_buffer.buffer(), left_null_buffer.offset(), - right_buffer, - right_data.offset(), - left_data.len(), + right_values.inner(), + right_values.offset(), + left.len(), |a, b| a | !b, )) } @@ -68,9 +93,9 @@ pub(crate) fn build_null_buffer_for_and_kleene( Some(bitwise_bin_op_helper( right_null_buffer.buffer(), right_null_buffer.offset(), - left_buffer, - left_data.offset(), - left_data.len(), + left_values.inner(), + left_values.offset(), + left.len(), |a, b| a | !b, )) } @@ -83,44 +108,69 @@ pub(crate) fn build_null_buffer_for_and_kleene( Some(bitwise_quaternary_op_helper( [ left_null_buffer.buffer(), - left_buffer, + left_values.inner(), right_null_buffer.buffer(), - right_buffer, + right_values.inner(), ], [ left_null_buffer.offset(), - left_data.offset(), + left_values.offset(), right_null_buffer.offset(), - right_data.offset(), + right_values.offset(), ], - left_data.len(), + left.len(), |a, b, c, d| (a | (c & !d)) & (c | (a & !b)), )) } - } + }; + let nulls = buffer.map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, left.len()))); + Ok(BooleanArray::new(left_values & right_values, nulls)) } -/// For AND/OR kernels, the result of null buffer is simply a bitwise `and` operation. -pub(crate) fn build_null_buffer_for_and_or( - left_data: &ArrayData, - right_data: &ArrayData, -) -> Option { - // `arrays` are not empty, so safely do `unwrap` directly. 
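Editor's note: the doc comments being rewritten here spell out the difference between the null-propagating `and` and the Kleene variant. A short sketch of the observable behaviour, using the public functions from `arrow_arith::boolean` as referenced elsewhere in this patch:

```rust
use arrow_arith::boolean::{and, and_kleene};
use arrow_array::BooleanArray;

fn main() {
    let a = BooleanArray::from(vec![Some(false), Some(true), None]);
    let b = BooleanArray::from(vec![None, None, None]);

    // Plain `and` is null whenever either side is null
    let plain = and(&a, &b).unwrap();
    assert_eq!(plain, BooleanArray::from(vec![None, None, None]));

    // Kleene logic knows that `false AND unknown` is false
    let kleene = and_kleene(&a, &b).unwrap();
    assert_eq!(kleene, BooleanArray::from(vec![Some(false), None, None]));
}
```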
- combine_option_bitmap(&[left_data, right_data], left_data.len()) -} +/// Logical 'or' boolean values with Kleene logic +/// +/// # Behavior +/// +/// This function behaves as follows with nulls: +/// +/// * `true` or `null` = `true` +/// * `null` or `true` = `true` +/// * `false` or `null` = `null` +/// * `null` or `false` = `null` +/// * `null` or `null` = `null` +/// +/// In other words, in this context a null value really means \"unknown\", +/// and an unknown value 'or' true is always true. +/// For a different null behavior, see function \"or\". +/// +/// # Example +/// +/// ```rust +/// # use arrow_array::BooleanArray; +/// # use arrow_arith::boolean::or_kleene; +/// let a = BooleanArray::from(vec![Some(true), Some(false), None]); +/// let b = BooleanArray::from(vec![None, None, None]); +/// let or_ab = or_kleene(&a, &b).unwrap(); +/// assert_eq!(or_ab, BooleanArray::from(vec![Some(true), None, None])); +/// ``` +/// +/// # Fails +/// +/// If the operands have different lengths +pub fn or_kleene( + left: &BooleanArray, + right: &BooleanArray, +) -> Result { + if left.len() != right.len() { + return Err(ArrowError::ComputeError( + "Cannot perform bitwise operation on arrays of different length".to_string(), + )); + } + + let left_values = left.values(); + let right_values = right.values(); -/// Updates null buffer based on data buffer and null buffer of the operand at other side -/// in boolean OR kernel with Kleene logic. In short, because for OR kernel, null OR true -/// results true. So we cannot simply AND two null buffers. This function updates null -/// buffer of one side if other side is a true value. -pub(crate) fn build_null_buffer_for_or_kleene( - left_data: &ArrayData, - right_data: &ArrayData, -) -> Option { - let left_buffer = &left_data.buffers()[0]; - let right_buffer = &right_data.buffers()[0]; - - match (left_data.nulls(), right_data.nulls()) { + let buffer = match (left.nulls(), right.nulls()) { (None, None) => None, (Some(left_nulls), None) => { // The right side has no null values. 
@@ -130,9 +180,9 @@ pub(crate) fn build_null_buffer_for_or_kleene( Some(bitwise_bin_op_helper( left_nulls.buffer(), left_nulls.offset(), - right_buffer, - right_data.offset(), - right_data.len(), + right_values.inner(), + right_values.offset(), + left.len(), |a, b| a | b, )) } @@ -141,9 +191,9 @@ pub(crate) fn build_null_buffer_for_or_kleene( Some(bitwise_bin_op_helper( right_nulls.buffer(), right_nulls.offset(), - left_buffer, - left_data.offset(), - left_data.len(), + left_values.inner(), + left_values.offset(), + left.len(), |a, b| a | b, )) } @@ -156,33 +206,34 @@ pub(crate) fn build_null_buffer_for_or_kleene( Some(bitwise_quaternary_op_helper( [ left_nulls.buffer(), - left_buffer, + left_values.inner(), right_nulls.buffer(), - right_buffer, + right_values.inner(), ], [ left_nulls.offset(), - left_data.offset(), + left_values.offset(), right_nulls.offset(), - right_data.offset(), + right_values.offset(), ], - left_data.len(), + left.len(), |a, b, c, d| (a | (c & d)) & (c | (a & b)), )) } - } + }; + + let nulls = buffer.map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, left.len()))); + Ok(BooleanArray::new(left_values | right_values, nulls)) } /// Helper function to implement binary kernels -pub(crate) fn binary_boolean_kernel( +pub(crate) fn binary_boolean_kernel( left: &BooleanArray, right: &BooleanArray, op: F, - null_op: U, ) -> Result where - F: Fn(&Buffer, usize, &Buffer, usize, usize) -> Buffer, - U: Fn(&ArrayData, &ArrayData) -> Option, + F: Fn(&BooleanBuffer, &BooleanBuffer) -> BooleanBuffer, { if left.len() != right.len() { return Err(ArrowError::ComputeError( @@ -190,32 +241,9 @@ where )); } - let len = left.len(); - - let left_data = left.data_ref(); - let right_data = right.data_ref(); - - let left_buffer = &left_data.buffers()[0]; - let right_buffer = &right_data.buffers()[0]; - let left_offset = left.offset(); - let right_offset = right.offset(); - - let null_bit_buffer = null_op(left_data, right_data); - - let values = op(left_buffer, left_offset, right_buffer, right_offset, len); - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - len, - None, - null_bit_buffer, - 0, - vec![values], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + let nulls = NullBuffer::union(left.nulls(), right.nulls()); + let values = op(left.values(), right.values()); + Ok(BooleanArray::new(values, nulls)) } /// Performs `AND` operation on two arrays. If either left or right value is null then the @@ -235,49 +263,7 @@ pub fn and( left: &BooleanArray, right: &BooleanArray, ) -> Result { - binary_boolean_kernel(left, right, buffer_bin_and, build_null_buffer_for_and_or) -} - -/// Logical 'and' boolean values with Kleene logic -/// -/// # Behavior -/// -/// This function behaves as follows with nulls: -/// -/// * `true` and `null` = `null` -/// * `null` and `true` = `null` -/// * `false` and `null` = `false` -/// * `null` and `false` = `false` -/// * `null` and `null` = `null` -/// -/// In other words, in this context a null value really means \"unknown\", -/// and an unknown value 'and' false is always false. -/// For a different null behavior, see function \"and\". 
-/// -/// # Example -/// -/// ```rust -/// # use arrow_array::BooleanArray; -/// # use arrow_arith::boolean::and_kleene; -/// let a = BooleanArray::from(vec![Some(true), Some(false), None]); -/// let b = BooleanArray::from(vec![None, None, None]); -/// let and_ab = and_kleene(&a, &b).unwrap(); -/// assert_eq!(and_ab, BooleanArray::from(vec![None, Some(false), None])); -/// ``` -/// -/// # Fails -/// -/// If the operands have different lengths -pub fn and_kleene( - left: &BooleanArray, - right: &BooleanArray, -) -> Result { - binary_boolean_kernel( - left, - right, - buffer_bin_and, - build_null_buffer_for_and_kleene, - ) + binary_boolean_kernel(left, right, |a, b| a & b) } /// Performs `OR` operation on two arrays. If either left or right value is null then the @@ -294,44 +280,7 @@ pub fn and_kleene( /// assert_eq!(or_ab, BooleanArray::from(vec![Some(true), Some(true), None])); /// ``` pub fn or(left: &BooleanArray, right: &BooleanArray) -> Result { - binary_boolean_kernel(left, right, buffer_bin_or, build_null_buffer_for_and_or) -} - -/// Logical 'or' boolean values with Kleene logic -/// -/// # Behavior -/// -/// This function behaves as follows with nulls: -/// -/// * `true` or `null` = `true` -/// * `null` or `true` = `true` -/// * `false` or `null` = `null` -/// * `null` or `false` = `null` -/// * `null` or `null` = `null` -/// -/// In other words, in this context a null value really means \"unknown\", -/// and an unknown value 'or' true is always true. -/// For a different null behavior, see function \"or\". -/// -/// # Example -/// -/// ```rust -/// # use arrow_array::BooleanArray; -/// # use arrow_arith::boolean::or_kleene; -/// let a = BooleanArray::from(vec![Some(true), Some(false), None]); -/// let b = BooleanArray::from(vec![None, None, None]); -/// let or_ab = or_kleene(&a, &b).unwrap(); -/// assert_eq!(or_ab, BooleanArray::from(vec![Some(true), None, None])); -/// ``` -/// -/// # Fails -/// -/// If the operands have different lengths -pub fn or_kleene( - left: &BooleanArray, - right: &BooleanArray, -) -> Result { - binary_boolean_kernel(left, right, buffer_bin_or, build_null_buffer_for_or_kleene) + binary_boolean_kernel(left, right, |a, b| a | b) } /// Performs unary `NOT` operation on an arrays. If value is null then the result is also @@ -347,26 +296,9 @@ pub fn or_kleene( /// assert_eq!(not_a, BooleanArray::from(vec![Some(true), Some(false), None])); /// ``` pub fn not(left: &BooleanArray) -> Result { - let left_offset = left.offset(); - let len = left.len(); - - let data = left.data_ref(); - let null_bit_buffer = data.nulls().map(|b| b.inner().sliced()); - - let values = buffer_unary_not(data.buffers()[0], left_offset, len); - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - len, - None, - null_bit_buffer, - 0, - vec![values], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + let nulls = left.nulls().cloned(); + let values = !left.values(); + Ok(BooleanArray::new(values, nulls)) } /// Returns a non-null [BooleanArray] with whether each value of the array is null. 
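[editor's note] The rewritten boolean kernels above all follow one pattern: compute the value bits with the bitwise operators on `BooleanBuffer` and attach a `NullBuffer` derived from the operands (`NullBuffer::union` for `and`/`or`, the Kleene-specific combination for `and_kleene`/`or_kleene`). A minimal usage sketch against this revision of `arrow-arith` and `arrow-array` is below; it is illustrative only and not part of the patch.

```rust
use arrow_arith::boolean::{and, not, or_kleene};
use arrow_array::BooleanArray;

fn main() {
    let a = BooleanArray::from(vec![Some(true), Some(false), None]);
    let b = BooleanArray::from(vec![Some(true), None, Some(false)]);

    // `and`/`or` propagate nulls: if either input is null the result is null
    let c = and(&a, &b).unwrap();
    assert_eq!(c, BooleanArray::from(vec![Some(true), None, None]));

    // Kleene logic treats null as "unknown": false OR null and null OR false stay null,
    // while true OR null would be true (see the doc comments added above)
    let d = or_kleene(&a, &b).unwrap();
    assert_eq!(d, BooleanArray::from(vec![Some(true), None, None]));

    // `not` simply inverts the value buffer and clones the null buffer
    let e = not(&a).unwrap();
    assert_eq!(e, BooleanArray::from(vec![Some(false), Some(true), None]));
}
```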
@@ -381,29 +313,12 @@ pub fn not(left: &BooleanArray) -> Result { /// assert_eq!(a_is_null, BooleanArray::from(vec![false, false, true])); /// ``` pub fn is_null(input: &dyn Array) -> Result { - let len = input.len(); - - let output = match input.data_ref().nulls() { - None => { - let len_bytes = ceil(len, 8); - MutableBuffer::from_len_zeroed(len_bytes).into() - } - Some(nulls) => buffer_unary_not(nulls.buffer(), nulls.offset(), nulls.len()), + let values = match input.nulls() { + None => NullBuffer::new_null(input.len()).into_inner(), + Some(nulls) => !nulls.inner(), }; - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - len, - None, - None, - 0, - vec![output], - vec![], - ) - }; - - Ok(BooleanArray::from(data)) + Ok(BooleanArray::new(values, None)) } /// Returns a non-null [BooleanArray] with whether each value of the array is not null. @@ -420,7 +335,7 @@ pub fn is_null(input: &dyn Array) -> Result { pub fn is_not_null(input: &dyn Array) -> Result { let len = input.len(); - let output = match input.data_ref().nulls() { + let output = match input.nulls() { None => { let len_bytes = ceil(len, 8); MutableBuffer::new(len_bytes) diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 0862230a499e..49a184369801 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -294,7 +294,7 @@ impl DictionaryArray { /// Returns a clone of the value type of this list. pub fn value_type(&self) -> DataType { - self.values.data_ref().data_type().clone() + self.values.data_type().clone() } /// The length of the dictionary is the length of the keys array. diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 062961a20abb..af51ff787722 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -425,7 +425,7 @@ impl From for FixedSizeBinaryArray { .len(v.len()) .offset(v.offset()) .add_buffer(child_data.buffers()[0].slice(child_data.offset())) - .nulls(v.data_ref().nulls().cloned()); + .nulls(v.nulls().cloned()); let data = unsafe { builder.build_unchecked() }; Self::from(data) diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 7d65927cdeec..0910e2944f76 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -76,7 +76,7 @@ impl FixedSizeListArray { /// Returns a clone of the value type of this list. pub fn value_type(&self) -> DataType { - self.values.data_ref().data_type().clone() + self.values.data_type().clone() } /// Returns ith value of this list array. diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index dca256008db2..c7e2a817ba33 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -85,7 +85,7 @@ impl GenericListArray { /// Returns a clone of the value type of this list. pub fn value_type(&self) -> DataType { - self.values.data_ref().data_type().clone() + self.values.data_type().clone() } /// Returns ith value of this list array. diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index fba6e41e871d..3d65e9e9ebad 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -97,7 +97,7 @@ impl Array for NullArray { /// Returns the total number of null values in this array. 
/// The null count of a `NullArray` always equals its length. fn null_count(&self) -> usize { - self.data_ref().len() + self.len() } } diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index ab6460e835f9..c6fedb960345 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -22,7 +22,7 @@ use arrow_array::builder::BufferBuilder; use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::{ArrowNativeType, MutableBuffer}; +use arrow_buffer::{ArrowNativeType, MutableBuffer, NullBuffer}; use arrow_data::ArrayData; use arrow_data::ArrayDataBuilder; use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; @@ -1145,9 +1145,9 @@ where } type LexicographicalCompareItem<'a> = ( - &'a ArrayData, // data - DynComparator, // comparator - SortOptions, // sort_option + Option<&'a NullBuffer>, // nulls + DynComparator, // comparator + SortOptions, // sort_option ); /// A lexicographical comparator that wraps given array data (columns) and can lexicographically compare data @@ -1159,8 +1159,13 @@ pub struct LexicographicalComparator<'a> { impl LexicographicalComparator<'_> { /// lexicographically compare values at the wrapped columns with given indices. pub fn compare(&self, a_idx: usize, b_idx: usize) -> Ordering { - for (data, comparator, sort_option) in &self.compare_items { - match (data.is_valid(a_idx), data.is_valid(b_idx)) { + for (nulls, comparator, sort_option) in &self.compare_items { + let (lhs_valid, rhs_valid) = match nulls { + Some(n) => (n.is_valid(a_idx), n.is_valid(b_idx)), + None => (true, true), + }; + + match (lhs_valid, rhs_valid) { (true, true) => { match (comparator)(a_idx, b_idx) { // equal, move on to next column @@ -1205,11 +1210,9 @@ impl LexicographicalComparator<'_> { .iter() .map(|column| { // flatten and convert build comparators - // use ArrayData for is_valid checks later to avoid dynamic call let values = column.values.as_ref(); - let data = values.data_ref(); Ok(( - data, + values.nulls(), build_compare(values, values)?, column.options.unwrap_or_default(), )) diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 14fd5d9d1d32..567aaa58e8bf 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -53,11 +53,7 @@ pub struct SlicesIterator<'a>(BitSliceIterator<'a>); impl<'a> SlicesIterator<'a> { pub fn new(filter: &'a BooleanArray) -> Self { - let values = &filter.data_ref().buffers()[0]; - let len = filter.len(); - let offset = filter.offset(); - - Self(BitSliceIterator::new(values, offset, len)) + Self(filter.values().set_slices()) } } @@ -149,18 +145,9 @@ pub fn build_filter(filter: &BooleanArray) -> Result { /// Remove null values by do a bitmask AND operation with null bits and the boolean bits. pub fn prep_null_mask_filter(filter: &BooleanArray) -> BooleanArray { - let array_data = filter.data_ref(); - let nulls = array_data.nulls().unwrap(); + let nulls = filter.nulls().unwrap(); let mask = filter.values() & nulls.inner(); - - let array_data = ArrayData::builder(DataType::Boolean) - .len(mask.len()) - .offset(mask.offset()) - .add_buffer(mask.into_inner()); - - let array_data = unsafe { array_data.build_unchecked() }; - - BooleanArray::from(array_data) + BooleanArray::new(mask, None) } /// Filters an [Array], returning elements matching the filter (i.e. where the values are true). 
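[editor's note] For reference, the simplified `prep_null_mask_filter` above folds the predicate's null buffer into its value buffer so that null slots select nothing. A rough standalone sketch of that behaviour follows; it assumes this revision of `arrow-select` and is not taken from the crate's tests.

```rust
use arrow_array::BooleanArray;
use arrow_select::filter::prep_null_mask_filter;

fn main() {
    // Null entries in the predicate must not select any rows
    let predicate = BooleanArray::from(vec![Some(true), None, Some(false), None]);

    // After preparation the mask is non-null and nulls have become `false`
    let prepared = prep_null_mask_filter(&predicate);
    assert_eq!(prepared, BooleanArray::from(vec![true, false, false, false]));
}
```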
@@ -365,9 +352,10 @@ fn filter_array( t => unimplemented!("Filter not supported for dictionary type {:?}", t) } _ => { + let data = values.to_data(); // fallback to using MutableArrayData let mut mutable = MutableArrayData::new( - vec![values.data_ref()], + vec![&data], false, predicate.count, ); diff --git a/arrow/benches/mutable_array.rs b/arrow/benches/mutable_array.rs index 3a42ec1be3c3..b04e5cd84926 100644 --- a/arrow/benches/mutable_array.rs +++ b/arrow/benches/mutable_array.rs @@ -39,7 +39,8 @@ fn create_slices(size: usize) -> Vec<(usize, usize)> { } fn bench(v1: &T, slices: &[(usize, usize)]) { - let mut mutable = MutableArrayData::new(vec![v1.data_ref()], false, 5); + let data = v1.to_data(); + let mut mutable = MutableArrayData::new(vec![&data], false, 5); for (start, end) in slices { mutable.extend(0, *start, *end) } diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index b6f81f6a4c1a..af81b17e4aa8 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -856,7 +856,7 @@ fn test_struct_equal_null() { )])) .null_bit_buffer(Some(Buffer::from(vec![0b00011110]))) .len(5) - .add_child_data(a.data_ref().clone()) + .add_child_data(a.to_data()) .build() .unwrap(); let a = make_array(a); @@ -920,7 +920,7 @@ fn test_struct_equal_null_variable_size() { )])) .null_bit_buffer(Some(Buffer::from(vec![0b00001010]))) .len(5) - .add_child_data(strings1.data_ref().clone()) + .add_child_data(strings1.to_data()) .build() .unwrap(); let a = make_array(a); @@ -932,7 +932,7 @@ fn test_struct_equal_null_variable_size() { )])) .null_bit_buffer(Some(Buffer::from(vec![0b00001010]))) .len(5) - .add_child_data(strings2.data_ref().clone()) + .add_child_data(strings2.to_data()) .build() .unwrap(); let b = make_array(b); diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index b470be5ad408..91e839fc1890 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -257,7 +257,7 @@ mod tests { assert_eq!( vec![true, false, false, false, false], (0..5) - .map(|idx| struct_array.data_ref().is_null(idx)) + .map(|idx| struct_array.is_null(idx)) .collect::>() ); assert_eq!( From 33bbaa52f7f3dc421d09cb5e50682d05127d2cb1 Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Fri, 24 Mar 2023 13:23:04 +0300 Subject: [PATCH 0725/1411] feat: add comparison/sort support for Float16 (#3915) * feat: add comparison/sort support for Float16 * fix: tests with the feature "dyn_cmp_dict" --- arrow-array/src/array/primitive_array.rs | 1 + arrow-ord/Cargo.toml | 1 + arrow-ord/src/comparison.rs | 211 +++++++++++++++++++++++ arrow-ord/src/ord.rs | 11 ++ arrow-ord/src/sort.rs | 160 +++++++++++++++++ 5 files changed, 384 insertions(+) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 6faecb1f0e55..bc62677c738b 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -996,6 +996,7 @@ def_numeric_from_vec!(UInt8Type); def_numeric_from_vec!(UInt16Type); def_numeric_from_vec!(UInt32Type); def_numeric_from_vec!(UInt64Type); +def_numeric_from_vec!(Float16Type); def_numeric_from_vec!(Float32Type); def_numeric_from_vec!(Float64Type); def_numeric_from_vec!(Decimal128Type); diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index bc6feb4f2a29..161fce9606d7 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -44,6 +44,7 @@ arrow-data = { version = "35.0.0", path = "../arrow-data" } 
arrow-schema = { version = "35.0.0", path = "../arrow-schema" } arrow-select = { version = "35.0.0", path = "../arrow-select" } num = { version = "0.4", default-features = false, features = ["std"] } +half = { version = "2.1", default-features = false, features = ["num-traits"] } [dev-dependencies] rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 0f9414378c4a..aa2f1416d83d 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -31,6 +31,7 @@ use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; use arrow_select::take::take; +use half::f16; /// Helper function to perform boolean lambda function on values from two array accessors, this /// version does not attempt to use SIMD. @@ -497,6 +498,11 @@ macro_rules! dyn_compare_scalar { let left = as_primitive_array::($LEFT); $OP::(left, right) } + DataType::Float16 => { + let right = try_to_type!($RIGHT, to_f32)?; + let left = as_primitive_array::($LEFT); + $OP::(left, f16::from_f32(right)) + } DataType::Float32 => { let right = try_to_type!($RIGHT, to_f32)?; let left = as_primitive_array::($LEFT); @@ -1524,6 +1530,9 @@ macro_rules! typed_cmp_dict_non_dict { (DataType::UInt64, DataType::UInt64) => { typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), UInt64Type, $OP_BOOL, $OP) } + (DataType::Float16, DataType::Float16) => { + typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Float16Type, $OP_BOOL, $OP_FLOAT) + } (DataType::Float32, DataType::Float32) => { typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Float32Type, $OP_BOOL, $OP_FLOAT) } @@ -1621,6 +1630,9 @@ macro_rules! typed_compares { (DataType::UInt64, DataType::UInt64) => { cmp_primitive_array::($LEFT, $RIGHT, $OP) } + (DataType::Float16, DataType::Float16) => { + cmp_primitive_array::($LEFT, $RIGHT, $OP_FLOAT) + } (DataType::Float32, DataType::Float32) => { cmp_primitive_array::($LEFT, $RIGHT, $OP_FLOAT) } @@ -1772,6 +1784,9 @@ macro_rules! 
typed_dict_cmp { (DataType::UInt64, DataType::UInt64) => { cmp_dict::<$KT, UInt64Type, _>($LEFT, $RIGHT, $OP) } + (DataType::Float16, DataType::Float16) => { + cmp_dict::<$KT, Float16Type, _>($LEFT, $RIGHT, $OP_FLOAT) + } (DataType::Float32, DataType::Float32) => { cmp_dict::<$KT, Float32Type, _>($LEFT, $RIGHT, $OP_FLOAT) } @@ -4988,6 +5003,30 @@ mod tests { #[test] fn test_eq_dyn_neq_dyn_float_nan() { + let array1: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)] + .into_iter() + .map(Some) + .collect(); + let array2: Float16Array = vec![f16::NAN, f16::NAN, f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)] + .into_iter() + .map(Some) + .collect(); + let expected = BooleanArray::from( + vec![Some(true), Some(false), Some(true), Some(true), Some(true)], + ); + assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); + + #[cfg(not(feature = "simd"))] + assert_eq!(eq(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(false), Some(true), Some(false), Some(false), Some(false)], + ); + assert_eq!(neq_dyn(&array1, &array2).unwrap(), expected); + + #[cfg(not(feature = "simd"))] + assert_eq!(neq(&array1, &array2).unwrap(), expected); + let array1: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 10.0] .into_iter() .map(Some) @@ -5040,6 +5079,31 @@ mod tests { #[test] fn test_lt_dyn_lt_eq_dyn_float_nan() { + let array1: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(11.0), f16::NAN] + .into_iter() + .map(Some) + .collect(); + let array2: Float16Array = vec![f16::NAN, f16::NAN, f16::from_f32(8.0), f16::from_f32(9.0), f16::from_f32(10.0), f16::from_f32(1.0)] + .into_iter() + .map(Some) + .collect(); + + let expected = BooleanArray::from( + vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], + ); + assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); + + #[cfg(not(feature = "simd"))] + assert_eq!(lt(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(true), Some(false), Some(false)], + ); + assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); + + #[cfg(not(feature = "simd"))] + assert_eq!(lt_eq(&array1, &array2).unwrap(), expected); + let array1: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 11.0, f32::NAN] .into_iter() .map(Some) @@ -5093,6 +5157,31 @@ mod tests { #[test] fn test_gt_dyn_gt_eq_dyn_float_nan() { + let array1: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(11.0), f16::NAN] + .into_iter() + .map(Some) + .collect(); + let array2: Float16Array = vec![f16::NAN, f16::NAN, f16::from_f32(8.0), f16::from_f32(9.0), f16::from_f32(10.0), f16::from_f32(1.0)] + .into_iter() + .map(Some) + .collect(); + + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], + ); + assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); + + #[cfg(not(feature = "simd"))] + assert_eq!(gt(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(true), Some(false), Some(true), Some(false), Some(true), Some(true)], + ); + assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); + + #[cfg(not(feature = "simd"))] + assert_eq!(gt_eq(&array1, &array2).unwrap(), expected); + let array1: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 11.0, f32::NAN] .into_iter() .map(Some) @@ -5146,6 +5235,30 @@ mod 
tests { #[test] fn test_eq_dyn_scalar_neq_dyn_scalar_float_nan() { + let array: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)] + .into_iter() + .map(Some) + .collect(); + #[cfg(feature = "simd")] + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(false), Some(false)], + ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(true), Some(false), Some(false), Some(false), Some(false)], + ); + assert_eq!(eq_dyn_scalar(&array, f32::NAN).unwrap(), expected); + + #[cfg(feature = "simd")] + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(true), Some(true)], + ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(false), Some(true), Some(true), Some(true), Some(true)], + ); + assert_eq!(neq_dyn_scalar(&array, f32::NAN).unwrap(), expected); + let array: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 10.0] .into_iter() .map(Some) @@ -5197,6 +5310,30 @@ mod tests { #[test] fn test_lt_dyn_scalar_lt_eq_dyn_scalar_float_nan() { + let array: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)] + .into_iter() + .map(Some) + .collect(); + #[cfg(feature = "simd")] + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(false), Some(false)], + ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(false), Some(true), Some(true), Some(true), Some(true)], + ); + assert_eq!(lt_dyn_scalar(&array, f16::NAN).unwrap(), expected); + + #[cfg(feature = "simd")] + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(false), Some(false)], + ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(true), Some(true)], + ); + assert_eq!(lt_eq_dyn_scalar(&array, f16::NAN).unwrap(), expected); + let array: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 10.0] .into_iter() .map(Some) @@ -5248,6 +5385,25 @@ mod tests { #[test] fn test_gt_dyn_scalar_gt_eq_dyn_scalar_float_nan() { + let array: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)] + .into_iter() + .map(Some) + .collect(); + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(false), Some(false)], + ); + assert_eq!(gt_dyn_scalar(&array, f16::NAN).unwrap(), expected); + + #[cfg(feature = "simd")] + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(false), Some(false)], + ); + #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( + vec![Some(true), Some(false), Some(false), Some(false), Some(false)], + ); + assert_eq!(gt_eq_dyn_scalar(&array, f16::NAN).unwrap(), expected); + let array: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 10.0] .into_iter() .map(Some) @@ -5502,6 +5658,25 @@ mod tests { #[test] #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dict_non_dict_float_nan() { + let array1: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)] + .into_iter() + .map(Some) + .collect(); + let values = + Float16Array::from(vec![f16::NAN, f16::from_f32(8.0), f16::from_f32(10.0)]); + let keys = Int8Array::from_iter_values([0_i8, 0, 1, 1, 2]); + let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + + let expected = BooleanArray::from( + vec![Some(true), 
Some(false), Some(true), Some(true), Some(true)], + ); + assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(false), Some(true), Some(false), Some(false), Some(false)], + ); + assert_eq!(neq_dyn(&array1, &array2).unwrap(), expected); + let array1: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 10.0] .into_iter() .map(Some) @@ -5542,6 +5717,24 @@ mod tests { #[test] #[cfg(feature = "dyn_cmp_dict")] fn test_lt_dyn_lt_eq_dyn_dict_non_dict_float_nan() { + let array1: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(11.0), f16::NAN] + .into_iter() + .map(Some) + .collect(); + let values = Float16Array::from(vec![f16::NAN, f16::from_f32(8.0), f16::from_f32(9.0), f16::from_f32(10.0), f16::from_f32(1.0)]); + let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); + let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + + let expected = BooleanArray::from( + vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], + ); + assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(true), Some(false), Some(false)], + ); + assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); + let array1: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 11.0, f32::NAN] .into_iter() .map(Some) @@ -5582,6 +5775,24 @@ mod tests { #[test] #[cfg(feature = "dyn_cmp_dict")] fn test_gt_dyn_gt_eq_dyn_dict_non_dict_float_nan() { + let array1: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(11.0), f16::NAN] + .into_iter() + .map(Some) + .collect(); + let values = Float16Array::from(vec![f16::NAN, f16::from_f32(8.0), f16::from_f32(9.0), f16::from_f32(10.0), f16::from_f32(1.0)]); + let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); + let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + + let expected = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], + ); + assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); + + let expected = BooleanArray::from( + vec![Some(true), Some(false), Some(true), Some(false), Some(true), Some(true)], + ); + assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); + let array1: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 11.0, f32::NAN] .into_iter() .map(Some) diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index dc352c5b7274..66058907f15a 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -173,6 +173,7 @@ pub fn build_compare( (Int16, Int16) => compare_primitives::(left, right), (Int32, Int32) => compare_primitives::(left, right), (Int64, Int64) => compare_primitives::(left, right), + (Float16, Float16) => compare_primitives::(left, right), (Float32, Float32) => compare_primitives::(left, right), (Float64, Float64) => compare_primitives::(left, right), (Decimal128(_, _), Decimal128(_, _)) => { @@ -286,6 +287,7 @@ pub mod tests { use super::*; use arrow_array::{FixedSizeBinaryArray, Float64Array, Int32Array}; use arrow_buffer::i256; + use half::f16; use std::cmp::Ordering; #[test] @@ -329,6 +331,15 @@ pub mod tests { assert_eq!(Ordering::Less, (cmp)(0, 0)); } + #[test] + fn test_f16() { + let array = Float16Array::from(vec![f16::from_f32(1.0), f16::from_f32(2.0)]); + + let cmp = build_compare(&array, &array).unwrap(); + + assert_eq!(Ordering::Less, (cmp)(0, 1)); + } + #[test] fn test_f64() { let array = 
Float64Array::from(vec![1.0, 2.0]); diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index c6fedb960345..9b17651f9258 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -183,6 +183,14 @@ pub fn sort_to_indices( DataType::UInt64 => { sort_primitive::(values, v, n, cmp, &options, limit) } + DataType::Float16 => sort_primitive::( + values, + v, + n, + |x, y| x.total_cmp(&y), + &options, + limit, + ), DataType::Float32 => sort_primitive::( values, v, @@ -283,6 +291,9 @@ pub fn sort_to_indices( DataType::UInt64 => { sort_list::(values, v, n, &options, limit) } + DataType::Float16 => { + sort_list::(values, v, n, &options, limit) + } DataType::Float32 => { sort_list::(values, v, n, &options, limit) } @@ -310,6 +321,9 @@ pub fn sort_to_indices( DataType::UInt64 => { sort_list::(values, v, n, &options, limit) } + DataType::Float16 => { + sort_list::(values, v, n, &options, limit) + } DataType::Float32 => { sort_list::(values, v, n, &options, limit) } @@ -1266,6 +1280,7 @@ mod tests { use super::*; use arrow_array::builder::PrimitiveRunBuilder; use arrow_buffer::i256; + use half::f16; use rand::rngs::StdRng; use rand::{Rng, RngCore, SeedableRng}; use std::convert::TryFrom; @@ -1702,6 +1717,19 @@ mod tests { None, vec![0, 5, 3, 1, 4, 2], ); + test_sort_to_indices_primitive_arrays::( + vec![ + None, + Some(f16::from_f32(-0.05)), + Some(f16::from_f32(2.225)), + Some(f16::from_f32(-1.01)), + Some(f16::from_f32(-0.05)), + None, + ], + None, + None, + vec![0, 5, 3, 1, 4, 2], + ); test_sort_to_indices_primitive_arrays::( vec![ None, @@ -1770,6 +1798,23 @@ mod tests { vec![2, 1, 4, 3, 5, 0], ); + test_sort_to_indices_primitive_arrays::( + vec![ + None, + Some(f16::from_f32(0.005)), + Some(f16::from_f32(20.22)), + Some(f16::from_f32(-10.3)), + Some(f16::from_f32(0.005)), + None, + ], + Some(SortOptions { + descending: true, + nulls_first: false, + }), + None, + vec![2, 1, 4, 3, 5, 0], + ); + test_sort_to_indices_primitive_arrays::( vec![ None, @@ -1838,6 +1883,23 @@ mod tests { vec![5, 0, 2, 1, 4, 3], ); + test_sort_to_indices_primitive_arrays::( + vec![ + None, + Some(f16::from_f32(0.1)), + Some(f16::from_f32(0.2)), + Some(f16::from_f32(-1.3)), + Some(f16::from_f32(0.01)), + None, + ], + Some(SortOptions { + descending: true, + nulls_first: true, + }), + None, + vec![5, 0, 2, 1, 4, 3], + ); + test_sort_to_indices_primitive_arrays::( vec![None, Some(0.1), Some(0.2), Some(-1.3), Some(0.01), None], Some(SortOptions { @@ -2650,6 +2712,30 @@ mod tests { vec![None, None, Some(2)], ); + test_sort_primitive_arrays::( + vec![ + None, + Some(f16::from_f32(0.0)), + Some(f16::from_f32(2.0)), + Some(f16::from_f32(-1.0)), + Some(f16::from_f32(0.0)), + None, + ], + Some(SortOptions { + descending: true, + nulls_first: true, + }), + None, + vec![ + None, + None, + Some(f16::from_f32(2.0)), + Some(f16::from_f32(0.0)), + Some(f16::from_f32(0.0)), + Some(f16::from_f32(-1.0)), + ], + ); + test_sort_primitive_arrays::( vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(0.0), None], Some(SortOptions { @@ -2715,6 +2801,29 @@ mod tests { None, vec![None, None, Some(-1), Some(0), Some(0), Some(2)], ); + test_sort_primitive_arrays::( + vec![ + None, + Some(f16::from_f32(0.0)), + Some(f16::from_f32(2.0)), + Some(f16::from_f32(-1.0)), + Some(f16::from_f32(0.0)), + None, + ], + Some(SortOptions { + descending: false, + nulls_first: true, + }), + None, + vec![ + None, + None, + Some(f16::from_f32(-1.0)), + Some(f16::from_f32(0.0)), + Some(f16::from_f32(0.0)), + Some(f16::from_f32(2.0)), + ], + ); 
test_sort_primitive_arrays::( vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(0.0), None], Some(SortOptions { @@ -3391,6 +3500,57 @@ mod tests { Some(1), ); + test_sort_list_arrays::( + vec![ + Some(vec![Some(f16::from_f32(1.0)), Some(f16::from_f32(0.0))]), + Some(vec![ + Some(f16::from_f32(4.0)), + Some(f16::from_f32(3.0)), + Some(f16::from_f32(2.0)), + Some(f16::from_f32(1.0)), + ]), + Some(vec![ + Some(f16::from_f32(2.0)), + Some(f16::from_f32(3.0)), + Some(f16::from_f32(4.0)), + ]), + Some(vec![ + Some(f16::from_f32(3.0)), + Some(f16::from_f32(3.0)), + Some(f16::from_f32(3.0)), + Some(f16::from_f32(3.0)), + ]), + Some(vec![Some(f16::from_f32(1.0)), Some(f16::from_f32(1.0))]), + ], + Some(SortOptions { + descending: false, + nulls_first: false, + }), + None, + vec![ + Some(vec![Some(f16::from_f32(1.0)), Some(f16::from_f32(0.0))]), + Some(vec![Some(f16::from_f32(1.0)), Some(f16::from_f32(1.0))]), + Some(vec![ + Some(f16::from_f32(2.0)), + Some(f16::from_f32(3.0)), + Some(f16::from_f32(4.0)), + ]), + Some(vec![ + Some(f16::from_f32(3.0)), + Some(f16::from_f32(3.0)), + Some(f16::from_f32(3.0)), + Some(f16::from_f32(3.0)), + ]), + Some(vec![ + Some(f16::from_f32(4.0)), + Some(f16::from_f32(3.0)), + Some(f16::from_f32(2.0)), + Some(f16::from_f32(1.0)), + ]), + ], + None, + ); + test_sort_list_arrays::( vec![ Some(vec![Some(1.0), Some(0.0)]), From f3304965a7bf662111253c98445c64173db3477a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 24 Mar 2023 11:02:21 +0000 Subject: [PATCH 0726/1411] Move protoc generation to binary crate, unpin prost/tonic build (#3876) (#3927) * Move protoc generation to binary crate (#3876) * Review feedback * Format * Fix link --- .github/workflows/arrow_flight.yml | 15 ++++- Cargo.toml | 1 + arrow-flight/CONTRIBUTING.md | 41 ++++++++++++ arrow-flight/Cargo.toml | 7 -- arrow-flight/build.rs | 102 ----------------------------- arrow-flight/gen/Cargo.toml | 37 +++++++++++ arrow-flight/gen/src/main.rs | 86 ++++++++++++++++++++++++ arrow-flight/regen.sh | 21 ++++++ 8 files changed, 199 insertions(+), 111 deletions(-) create mode 100644 arrow-flight/CONTRIBUTING.md delete mode 100644 arrow-flight/build.rs create mode 100644 arrow-flight/gen/Cargo.toml create mode 100644 arrow-flight/gen/src/main.rs create mode 100755 arrow-flight/regen.sh diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index 7facf17197fc..5301a3f8563f 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -41,7 +41,6 @@ on: - .github/** jobs: - # test the crate linux-test: name: Test runs-on: ubuntu-latest @@ -62,7 +61,19 @@ jobs: - name: Test --examples run: | cargo test -p arrow-flight --features=flight-sql-experimental,tls --examples - - name: Verify workspace clean + + vendor: + name: Verify Vendored Code + runs-on: ubuntu-latest + container: + image: amd64/rust + steps: + - uses: actions/checkout@v3 + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + - name: Run gen + run: ./arrow-flight/regen.sh + - name: Verify workspace clean (if this fails, run ./arrow-flight/regen.sh and check in results) run: git diff --exit-code clippy: diff --git a/Cargo.toml b/Cargo.toml index ebecc9eaf078..64ce3166e608 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ members = [ "arrow-csv", "arrow-data", "arrow-flight", + "arrow-flight/gen", "arrow-integration-test", "arrow-integration-testing", "arrow-ipc", diff --git a/arrow-flight/CONTRIBUTING.md 
b/arrow-flight/CONTRIBUTING.md new file mode 100644 index 000000000000..156a0b9caaed --- /dev/null +++ b/arrow-flight/CONTRIBUTING.md @@ -0,0 +1,41 @@ + + +# Flight + +## Generated Code + +The prost/tonic code can be generated by running, which in turn invokes the Rust binary located in [gen](./gen) + +This is necessary after modifying the protobuf definitions or altering the dependencies of [gen](./gen), and requires a +valid installation of [protoc](https://github.com/protocolbuffers/protobuf#protocol-compiler-installation). + +```bash +./regen.sh +``` + +### Why Vendor + +The standard approach to integrating `prost-build` / `tonic-build` is to use a `build.rs` script that automatically generates the code as part of the standard build process. + +Unfortunately this caused a lot of friction for users: + +- Requires all users to have a protoc install in order to compile the crate - [#2616](https://github.com/apache/arrow-rs/issues/2616) +- Some distributions have very old versions of protoc that don't support required functionality - [#1574](https://github.com/apache/arrow-rs/issues/1574) +- Inconsistent support within IDEs for code completion of automatically generated code diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 5f839ca6838a..e8f57345eca0 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -64,13 +64,6 @@ tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } tower = "0.4.13" -[build-dependencies] -# Pin specific version of the tonic-build dependencies to avoid auto-generated -# (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.53", default-features = false } -prost-build = { version = "=0.11.8", default-features = false } -tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } - [[example]] name = "flight_sql_server" required-features = ["flight-sql-experimental", "tls"] diff --git a/arrow-flight/build.rs b/arrow-flight/build.rs deleted file mode 100644 index 3f50fa81279f..000000000000 --- a/arrow-flight/build.rs +++ /dev/null @@ -1,102 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use std::{ - fs::OpenOptions, - io::{Read, Write}, - path::Path, -}; - -fn main() -> Result<(), Box> { - // The current working directory can vary depending on how the project is being - // built or released so we build an absolute path to the proto file - let path = Path::new("../format/Flight.proto"); - if path.exists() { - // avoid rerunning build if the file has not changed - println!("cargo:rerun-if-changed=../format/Flight.proto"); - - let proto_dir = Path::new("../format"); - let proto_path = Path::new("../format/Flight.proto"); - - tonic_build::configure() - // protoc in unbuntu builder needs this option - .protoc_arg("--experimental_allow_proto3_optional") - .out_dir("src") - .compile_with_config(prost_config(), &[proto_path], &[proto_dir])?; - - // read file contents to string - let mut file = OpenOptions::new() - .read(true) - .open("src/arrow.flight.protocol.rs")?; - let mut buffer = String::new(); - file.read_to_string(&mut buffer)?; - // append warning that file was auto-generate - let mut file = OpenOptions::new() - .write(true) - .truncate(true) - .open("src/arrow.flight.protocol.rs")?; - file.write_all("// This file was automatically generated through the build.rs script, and should not be edited.\n\n".as_bytes())?; - file.write_all(buffer.as_bytes())?; - } - - // The current working directory can vary depending on how the project is being - // built or released so we build an absolute path to the proto file - let path = Path::new("../format/FlightSql.proto"); - if path.exists() { - // avoid rerunning build if the file has not changed - println!("cargo:rerun-if-changed=../format/FlightSql.proto"); - - let proto_dir = Path::new("../format"); - let proto_path = Path::new("../format/FlightSql.proto"); - - tonic_build::configure() - // protoc in ubuntu builder needs this option - .protoc_arg("--experimental_allow_proto3_optional") - .out_dir("src/sql") - .compile_with_config(prost_config(), &[proto_path], &[proto_dir])?; - - // read file contents to string - let mut file = OpenOptions::new() - .read(true) - .open("src/sql/arrow.flight.protocol.sql.rs")?; - let mut buffer = String::new(); - file.read_to_string(&mut buffer)?; - // append warning that file was auto-generate - let mut file = OpenOptions::new() - .write(true) - .truncate(true) - .open("src/sql/arrow.flight.protocol.sql.rs")?; - file.write_all("// This file was automatically generated through the build.rs script, and should not be edited.\n\n".as_bytes())?; - file.write_all(buffer.as_bytes())?; - } - - // Prost currently generates an empty file, this was fixed but then reverted - // https://github.com/tokio-rs/prost/pull/639 - let google_protobuf_rs = Path::new("src/sql/google.protobuf.rs"); - if google_protobuf_rs.exists() && google_protobuf_rs.metadata().unwrap().len() == 0 { - std::fs::remove_file(google_protobuf_rs).unwrap(); - } - - // As the proto file is checked in, the build should not fail if the file is not found - Ok(()) -} - -fn prost_config() -> prost_build::Config { - let mut config = prost_build::Config::new(); - config.bytes([".arrow"]); - config -} diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml new file mode 100644 index 000000000000..c3b9cbd8c13a --- /dev/null +++ b/arrow-flight/gen/Cargo.toml @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "gen" +description = "Code generation for arrow-flight" +version = "0.1.0" +edition = "2021" +rust-version = "1.62" +authors = ["Apache Arrow "] +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +license = "Apache-2.0" +publish = false + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# Pin specific version of the tonic-build dependencies to avoid auto-generated +# (and checked in) arrow.flight.protocol.rs from changing +proc-macro2 = { version = "=1.0.53", default-features = false } +prost-build = { version = "=0.11.8", default-features = false } +tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } diff --git a/arrow-flight/gen/src/main.rs b/arrow-flight/gen/src/main.rs new file mode 100644 index 000000000000..a3541c63b173 --- /dev/null +++ b/arrow-flight/gen/src/main.rs @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::{ + fs::OpenOptions, + io::{Read, Write}, + path::Path, +}; + +fn main() -> Result<(), Box> { + let proto_dir = Path::new("../format"); + let proto_path = Path::new("../format/Flight.proto"); + + tonic_build::configure() + // protoc in unbuntu builder needs this option + .protoc_arg("--experimental_allow_proto3_optional") + .out_dir("src") + .compile_with_config(prost_config(), &[proto_path], &[proto_dir])?; + + // read file contents to string + let mut file = OpenOptions::new() + .read(true) + .open("src/arrow.flight.protocol.rs")?; + let mut buffer = String::new(); + file.read_to_string(&mut buffer)?; + // append warning that file was auto-generate + let mut file = OpenOptions::new() + .write(true) + .truncate(true) + .open("src/arrow.flight.protocol.rs")?; + file.write_all("// This file was automatically generated through the build.rs script, and should not be edited.\n\n".as_bytes())?; + file.write_all(buffer.as_bytes())?; + + let proto_dir = Path::new("../format"); + let proto_path = Path::new("../format/FlightSql.proto"); + + tonic_build::configure() + // protoc in ubuntu builder needs this option + .protoc_arg("--experimental_allow_proto3_optional") + .out_dir("src/sql") + .compile_with_config(prost_config(), &[proto_path], &[proto_dir])?; + + // read file contents to string + let mut file = OpenOptions::new() + .read(true) + .open("src/sql/arrow.flight.protocol.sql.rs")?; + let mut buffer = String::new(); + file.read_to_string(&mut buffer)?; + // append warning that file was auto-generate + let mut file = OpenOptions::new() + .write(true) + .truncate(true) + .open("src/sql/arrow.flight.protocol.sql.rs")?; + file.write_all("// This file was automatically generated through the build.rs script, and should not be edited.\n\n".as_bytes())?; + file.write_all(buffer.as_bytes())?; + + // Prost currently generates an empty file, this was fixed but then reverted + // https://github.com/tokio-rs/prost/pull/639 + let google_protobuf_rs = Path::new("src/sql/google.protobuf.rs"); + if google_protobuf_rs.exists() && google_protobuf_rs.metadata().unwrap().len() == 0 { + std::fs::remove_file(google_protobuf_rs).unwrap(); + } + + // As the proto file is checked in, the build should not fail if the file is not found + Ok(()) +} + +fn prost_config() -> prost_build::Config { + let mut config = prost_build::Config::new(); + config.bytes([".arrow"]); + config +} diff --git a/arrow-flight/regen.sh b/arrow-flight/regen.sh new file mode 100755 index 000000000000..d83f9d580e8d --- /dev/null +++ b/arrow-flight/regen.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd $SCRIPT_DIR && cargo run --manifest-path gen/Cargo.toml From 09fd4528dd3fe3539511aa3f528891eb1cabea1e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 24 Mar 2023 12:25:38 +0000 Subject: [PATCH 0727/1411] Enforce struct nullability in JSON raw reader (#3900) (#3904) (#3906) * Enforce struct nullability in JSON raw reader (#3900) (#3904) * Fix tests * Review feedback --- arrow-array/src/array/boolean_array.rs | 11 +-- arrow-buffer/src/buffer/boolean.rs | 10 +- arrow-buffer/src/util/bit_chunk_iterator.rs | 6 ++ arrow-data/src/data/mod.rs | 9 +- arrow-data/src/equal/utils.rs | 13 +-- arrow-json/src/raw/mod.rs | 102 +++++++++++++++++++- arrow-json/src/raw/struct_array.rs | 32 ++++-- 7 files changed, 142 insertions(+), 41 deletions(-) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 98de62da0912..dea5c07da281 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -127,15 +127,10 @@ impl BooleanArray { pub fn true_count(&self) -> usize { match self.data.nulls() { Some(nulls) => { - let null_chunks = nulls.inner().bit_chunks(); - let value_chunks = self.values().bit_chunks(); + let null_chunks = nulls.inner().bit_chunks().iter_padded(); + let value_chunks = self.values().bit_chunks().iter_padded(); null_chunks - .iter() - .zip(value_chunks.iter()) - .chain(std::iter::once(( - null_chunks.remainder_bits(), - value_chunks.remainder_bits(), - ))) + .zip(value_chunks) .map(|(a, b)| (a & b).count_ones() as usize) .sum() } diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 53ead45732d3..c89cfb3324c9 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -36,13 +36,9 @@ impl PartialEq for BooleanBuffer { return false; } - let lhs = self.bit_chunks(); - let rhs = other.bit_chunks(); - - if lhs.iter().zip(rhs.iter()).any(|(a, b)| a != b) { - return false; - } - lhs.remainder_bits() == rhs.remainder_bits() + let lhs = self.bit_chunks().iter_padded(); + let rhs = other.bit_chunks().iter_padded(); + lhs.zip(rhs).all(|(a, b)| a == b) } } diff --git a/arrow-buffer/src/util/bit_chunk_iterator.rs b/arrow-buffer/src/util/bit_chunk_iterator.rs index a739a9694200..3d9632e73229 100644 --- a/arrow-buffer/src/util/bit_chunk_iterator.rs +++ b/arrow-buffer/src/util/bit_chunk_iterator.rs @@ -296,6 +296,12 @@ impl<'a> BitChunks<'a> { index: 0, } } + + /// Returns an iterator over chunks of 64 bits, with the remaining bits zero padded to 64-bits + #[inline] + pub fn iter_padded(&self) -> impl Iterator + 'a { + self.iter().chain(std::iter::once(self.remainder_bits())) + } } impl<'a> IntoIterator for BitChunks<'a> { diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index cc908d639553..7241a5d80ee0 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -1218,12 +1218,9 @@ impl ArrayData { let mask = BitChunks::new(mask, offset, child.len); let nulls = BitChunks::new(nulls.validity(), nulls.offset(), child.len); mask - .iter() - .zip(nulls.iter()) - .chain(std::iter::once(( - mask.remainder_bits(), - nulls.remainder_bits(), - ))).try_for_each(|(m, c)| { + .iter_padded() + .zip(nulls.iter_padded()) + .try_for_each(|(m, c)| { if (m & !c) != 0 { return Err(ArrowError::InvalidArgumentError(format!( "non-nullable child of type {} contains nulls not present in parent", diff --git 
a/arrow-data/src/equal/utils.rs b/arrow-data/src/equal/utils.rs index d1f0f392a195..6b9a7940dc96 100644 --- a/arrow-data/src/equal/utils.rs +++ b/arrow-data/src/equal/utils.rs @@ -29,16 +29,9 @@ pub(super) fn equal_bits( rhs_start: usize, len: usize, ) -> bool { - let lhs = BitChunks::new(lhs_values, lhs_start, len); - let rhs = BitChunks::new(rhs_values, rhs_start, len); - - for (a, b) in lhs.iter().zip(rhs.iter()) { - if a != b { - return false; - } - } - - lhs.remainder_bits() == rhs.remainder_bits() + let lhs = BitChunks::new(lhs_values, lhs_start, len).iter_padded(); + let rhs = BitChunks::new(rhs_values, rhs_start, len).iter_padded(); + lhs.zip(rhs).all(|(a, b)| a == b) } #[inline] diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index 2e5055bf149e..21e6191ac7b2 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -359,7 +359,7 @@ mod tests { use crate::ReaderBuilder; use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; - use arrow_array::Array; + use arrow_array::{Array, StructArray}; use arrow_buffer::ArrowNativeType; use arrow_cast::display::{ArrayFormatter, FormatOptions}; use arrow_schema::{DataType, Field, Schema}; @@ -511,8 +511,8 @@ mod tests { Field::new( "nested", DataType::Struct(vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Int32, false), + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Int32, true), ]), true, ), @@ -591,7 +591,7 @@ mod tests { "list2", DataType::List(Box::new(Field::new( "element", - DataType::Struct(vec![Field::new("d", DataType::Int32, false)]), + DataType::Struct(vec![Field::new("d", DataType::Int32, true)]), false, ))), true, @@ -1001,4 +1001,98 @@ mod tests { test_time::(); test_time::(); } + + #[test] + fn test_delta_checkpoint() { + let json = "{\"protocol\":{\"minReaderVersion\":1,\"minWriterVersion\":2}}"; + let schema = Arc::new(Schema::new(vec![ + Field::new( + "protocol", + DataType::Struct(vec![ + Field::new("minReaderVersion", DataType::Int32, true), + Field::new("minWriterVersion", DataType::Int32, true), + ]), + true, + ), + Field::new( + "add", + DataType::Struct(vec![Field::new( + "partitionValues", + DataType::Map( + Box::new(Field::new( + "key_value", + DataType::Struct(vec![ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Utf8, true), + ]), + false, + )), + false, + ), + false, + )]), + true, + ), + ])); + + let batches = do_read(json, 1024, true, schema); + assert_eq!(batches.len(), 1); + + let s: StructArray = batches.into_iter().next().unwrap().into(); + let opts = FormatOptions::default().with_null("null"); + let formatter = ArrayFormatter::try_new(&s, &opts).unwrap(); + assert_eq!( + formatter.value(0).to_string(), + "{protocol: {minReaderVersion: 1, minWriterVersion: 2}, add: null}" + ); + } + + #[test] + fn struct_nullability() { + let do_test = |child: DataType| { + // Test correctly enforced nullability + let non_null = r#"{"foo": {}}"#; + let schema = Arc::new(Schema::new(vec![Field::new( + "foo", + DataType::Struct(vec![Field::new("bar", child, false)]), + true, + )])); + let mut reader = RawReaderBuilder::new(schema.clone()) + .build(Cursor::new(non_null.as_bytes())) + .unwrap(); + assert!(reader.next().unwrap().is_err()); // Should error as not nullable + + let null = r#"{"foo": {bar: null}}"#; + let mut reader = RawReaderBuilder::new(schema.clone()) + .build(Cursor::new(null.as_bytes())) + .unwrap(); + assert!(reader.next().unwrap().is_err()); // Should error as not nullable + + // Test 
nulls in nullable parent can mask nulls in non-nullable child + let null = r#"{"foo": null}"#; + let mut reader = RawReaderBuilder::new(schema) + .build(Cursor::new(null.as_bytes())) + .unwrap(); + let batch = reader.next().unwrap().unwrap(); + assert_eq!(batch.num_columns(), 1); + let foo = batch.column(0).as_struct(); + assert_eq!(foo.len(), 1); + assert!(foo.is_null(0)); + assert_eq!(foo.num_columns(), 1); + + let bar = foo.column(0); + assert_eq!(bar.len(), 1); + // Non-nullable child can still contain null as masked by parent + assert!(bar.is_null(0)); + }; + + do_test(DataType::Boolean); + do_test(DataType::Int32); + do_test(DataType::Utf8); + do_test(DataType::Decimal128(2, 1)); + do_test(DataType::Timestamp( + TimeUnit::Microsecond, + Some("+00:00".to_string()), + )); + } } diff --git a/arrow-json/src/raw/struct_array.rs b/arrow-json/src/raw/struct_array.rs index 1d0019993426..219f56ae639d 100644 --- a/arrow-json/src/raw/struct_array.rs +++ b/arrow-json/src/raw/struct_array.rs @@ -37,7 +37,11 @@ impl StructArrayDecoder { let decoders = struct_fields(&data_type) .iter() .map(|f| { - make_decoder(f.data_type().clone(), coerce_primitive, f.is_nullable()) + // If this struct nullable, need to permit nullability in child array + // StructArrayDecoder::decode verifies that if the child is not nullable + // it doesn't contain any nulls not masked by its parent + let nullable = f.is_nullable() || is_nullable; + make_decoder(f.data_type().clone(), coerce_primitive, nullable) }) .collect::, ArrowError>>()?; @@ -102,15 +106,31 @@ impl ArrayDecoder for StructArrayDecoder { .map(|(d, pos)| d.decode(tape, &pos)) .collect::, ArrowError>>()?; - // Sanity check - child_data - .iter() - .for_each(|x| assert_eq!(x.len(), pos.len())); - let nulls = nulls .as_mut() .map(|x| NullBuffer::new(BooleanBuffer::new(x.finish(), 0, pos.len()))); + for (c, f) in child_data.iter().zip(fields) { + // Sanity check + assert_eq!(c.len(), pos.len()); + + if !f.is_nullable() && c.null_count() != 0 { + // Need to verify nulls + let valid = match nulls.as_ref() { + Some(nulls) => { + let lhs = nulls.inner().bit_chunks().iter_padded(); + let rhs = c.nulls().unwrap().inner().bit_chunks().iter_padded(); + lhs.zip(rhs).all(|(l, r)| (l & !r) == 0) + } + None => false, + }; + + if !valid { + return Err(ArrowError::JsonError(format!("Encountered unmasked nulls in non-nullable StructArray child: {f}"))); + } + } + } + let data = ArrayDataBuilder::new(self.data_type.clone()) .len(pos.len()) .nulls(nulls) From 156858db6f857336863179a8a6805526e0392af8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 24 Mar 2023 12:38:46 +0000 Subject: [PATCH 0728/1411] Derive RunArray Clone (#3932) --- arrow-array/src/array/run_array.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 3aefb53b83f6..652ec0be6e6f 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -67,6 +67,16 @@ pub struct RunArray { values: ArrayRef, } +impl Clone for RunArray { + fn clone(&self) -> Self { + Self { + data: self.data.clone(), + run_ends: self.run_ends.clone(), + values: self.values.clone(), + } + } +} + impl RunArray { /// Calculates the logical length of the array encoded /// by the given run_ends array. 
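Note: the hand-written `Clone` above is presumably there to avoid the `R: Clone` bound that `#[derive(Clone)]` would otherwise place on the generic parameter. A minimal stand-alone sketch of that pattern, using a hypothetical `Wrapper<T>` type (not part of arrow-rs), is:

```rust
use std::marker::PhantomData;
use std::sync::Arc;

// Hypothetical container: every field is cheap to clone (Arc-backed),
// independent of whether `T` itself implements Clone.
struct Wrapper<T> {
    values: Arc<[i64]>,
    _marker: PhantomData<T>,
}

// `#[derive(Clone)]` would demand `T: Clone` even though no `T` value is
// stored, so Clone is implemented by hand instead.
impl<T> Clone for Wrapper<T> {
    fn clone(&self) -> Self {
        Self {
            values: Arc::clone(&self.values),
            _marker: PhantomData,
        }
    }
}

struct NotClone; // deliberately does not implement Clone

fn main() {
    let w: Wrapper<NotClone> = Wrapper {
        values: Arc::from(vec![1_i64, 2, 3]),
        _marker: PhantomData,
    };
    let copy = w.clone(); // compiles even though NotClone is not Clone
    assert_eq!(copy.values.len(), 3);
}
```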
From 5eeccab922c377a18e54cf39ad49a2e4d54ffaf2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 24 Mar 2023 12:38:58 +0000 Subject: [PATCH 0729/1411] Use dyn Array in sort kernels (#3931) --- arrow-ord/src/sort.rs | 51 ++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 9b17651f9258..6e0becc36c67 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -47,22 +47,21 @@ pub use arrow_schema::SortOptions; /// # Example /// ```rust /// # use std::sync::Arc; -/// # use arrow_array::{Int32Array, ArrayRef}; +/// # use arrow_array::Int32Array; /// # use arrow_ord::sort::sort; -/// let array: ArrayRef = Arc::new(Int32Array::from(vec![5, 4, 3, 2, 1])); +/// let array = Int32Array::from(vec![5, 4, 3, 2, 1]); /// let sorted_array = sort(&array, None).unwrap(); -/// let sorted_array = sorted_array.as_any().downcast_ref::().unwrap(); -/// assert_eq!(sorted_array, &Int32Array::from(vec![1, 2, 3, 4, 5])); +/// assert_eq!(sorted_array.as_ref(), &Int32Array::from(vec![1, 2, 3, 4, 5])); /// ``` pub fn sort( - values: &ArrayRef, + values: &dyn Array, options: Option, ) -> Result { if let DataType::RunEndEncoded(_, _) = values.data_type() { return sort_run(values, options, None); } let indices = sort_to_indices(values, options, None)?; - take(values.as_ref(), &indices, None) + take(values, &indices, None) } /// Sort the `ArrayRef` partially. @@ -77,14 +76,13 @@ pub fn sort( /// # Example /// ```rust /// # use std::sync::Arc; -/// # use arrow_array::{Int32Array, ArrayRef}; +/// # use arrow_array::Int32Array; /// # use arrow_ord::sort::{sort_limit, SortOptions}; -/// let array: ArrayRef = Arc::new(Int32Array::from(vec![5, 4, 3, 2, 1])); +/// let array = Int32Array::from(vec![5, 4, 3, 2, 1]); /// /// // Find the the top 2 items /// let sorted_array = sort_limit(&array, None, Some(2)).unwrap(); -/// let sorted_array = sorted_array.as_any().downcast_ref::().unwrap(); -/// assert_eq!(sorted_array, &Int32Array::from(vec![1, 2])); +/// assert_eq!(sorted_array.as_ref(), &Int32Array::from(vec![1, 2])); /// /// // Find the bottom top 2 items /// let options = Some(SortOptions { @@ -92,11 +90,10 @@ pub fn sort( /// ..Default::default() /// }); /// let sorted_array = sort_limit(&array, options, Some(2)).unwrap(); -/// let sorted_array = sorted_array.as_any().downcast_ref::().unwrap(); -/// assert_eq!(sorted_array, &Int32Array::from(vec![5, 4])); +/// assert_eq!(sorted_array.as_ref(), &Int32Array::from(vec![5, 4])); /// ``` pub fn sort_limit( - values: &ArrayRef, + values: &dyn Array, options: Option, limit: Option, ) -> Result { @@ -104,7 +101,7 @@ pub fn sort_limit( return sort_run(values, options, limit); } let indices = sort_to_indices(values, options, limit)?; - take(values.as_ref(), &indices, None) + take(values, &indices, None) } /// we can only do this if the T is primitive @@ -128,7 +125,7 @@ where } // partition indices into valid and null indices -fn partition_validity(array: &ArrayRef) -> (Vec, Vec) { +fn partition_validity(array: &dyn Array) -> (Vec, Vec) { match array.null_count() { // faster path 0 => ((0..(array.len() as u32)).collect(), vec![]), @@ -143,7 +140,7 @@ fn partition_validity(array: &ArrayRef) -> (Vec, Vec) { /// For floating point arrays any NaN values are considered to be greater than any other non-null value /// limit is an option for partial_sort pub fn sort_to_indices( - values: &ArrayRef, + values: &dyn Array, options: 
Option, limit: Option, ) -> Result { @@ -407,7 +404,7 @@ pub fn sort_to_indices( /// and [tri-color sort](https://en.wikipedia.org/wiki/Dutch_national_flag_problem) /// can be used instead. fn sort_boolean( - values: &ArrayRef, + values: &dyn Array, value_indices: Vec, mut null_indices: Vec, options: &SortOptions, @@ -489,7 +486,7 @@ fn sort_boolean( /// Sort primitive values fn sort_primitive( - values: &ArrayRef, + values: &dyn Array, value_indices: Vec, null_indices: Vec, cmp: F, @@ -638,7 +635,7 @@ fn insert_valid_values(result_slice: &mut [u32], offset: usize, valids: &[(u3 // will result in output RunArray { run_ends = [2,4,6,8], values = [1,1,2,2] } // and not RunArray { run_ends = [4,8], values = [1,2] } fn sort_run( - values: &ArrayRef, + values: &dyn Array, options: Option, limit: Option, ) -> Result { @@ -656,7 +653,7 @@ fn sort_run( } fn sort_run_downcasted( - values: &ArrayRef, + values: &dyn Array, options: Option, limit: Option, ) -> Result { @@ -719,7 +716,7 @@ fn sort_run_downcasted( // logical indices and to get the run array back, the logical indices has to be // encoded back to run array. fn sort_run_to_indices( - values: &ArrayRef, + values: &dyn Array, options: &SortOptions, limit: Option, ) -> UInt32Array { @@ -819,7 +816,7 @@ where /// Sort strings fn sort_string( - values: &ArrayRef, + values: &dyn Array, value_indices: Vec, null_indices: Vec, options: &SortOptions, @@ -905,7 +902,7 @@ where } fn sort_list( - values: &ArrayRef, + values: &dyn Array, value_indices: Vec, null_indices: Vec, options: &SortOptions, @@ -920,7 +917,7 @@ where } fn sort_list_inner( - values: &ArrayRef, + values: &dyn Array, value_indices: Vec, mut null_indices: Vec, options: &SortOptions, @@ -971,7 +968,7 @@ where } fn sort_binary( - values: &ArrayRef, + values: &dyn Array, value_indices: Vec, mut null_indices: Vec, options: &SortOptions, @@ -3217,7 +3214,7 @@ mod tests { fn test_sort_run_inner(sort_fn: F) where F: Fn( - &ArrayRef, + &dyn Array, Option, Option, ) -> Result, @@ -3293,7 +3290,7 @@ mod tests { sort_fn: &F, ) where F: Fn( - &ArrayRef, + &dyn Array, Option, Option, ) -> Result, From 114b905410fd8478246dc30744da5864df87a3ae Mon Sep 17 00:00:00 2001 From: Jay Han <11144133+doki23@users.noreply.github.com> Date: Fri, 24 Mar 2023 23:58:43 +0800 Subject: [PATCH 0730/1411] Fix JSON Temporal Encoding of Multiple Batches (#3924) * fix * fix ut and use raw reader --- arrow-json/src/reader.rs | 4 +- arrow-json/src/writer.rs | 65 +++++++++++++++++++++++++++++---- arrow-json/test/data/basic.json | 8 ++-- 3 files changed, 63 insertions(+), 14 deletions(-) diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index 8e33613886f1..d68d7ca91ff7 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -1859,7 +1859,7 @@ mod tests { .unwrap(); let batch = reader.next().unwrap().unwrap(); - assert_eq!(6, batch.num_columns()); + assert_eq!(7, batch.num_columns()); assert_eq!(12, batch.num_rows()); let schema = reader.schema(); @@ -3383,7 +3383,7 @@ mod tests { let mut sum_a = 0; for batch in reader { let batch = batch.unwrap(); - assert_eq!(6, batch.num_columns()); + assert_eq!(7, batch.num_columns()); sum_num_rows += batch.num_rows(); num_batches += 1; let batch_schema = batch.schema(); diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index bbc04c9dc096..92883e577060 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -306,14 +306,17 @@ fn set_column_for_json_rows( let options = FormatOptions::default(); let formatter = 
ArrayFormatter::try_new(array.as_ref(), &options)?; let data = array.data(); - rows.iter_mut().enumerate().for_each(|(idx, row)| { - if data.is_valid(idx) { - row.insert( - col_name.to_string(), - formatter.value(idx).to_string().into(), - ); - } - }); + rows.iter_mut() + .take(row_count) + .enumerate() + .for_each(|(idx, row)| { + if data.is_valid(idx) { + row.insert( + col_name.to_string(), + formatter.value(idx).to_string().into(), + ); + } + }); } DataType::Struct(_) => { let inner_objs = @@ -608,9 +611,11 @@ where #[cfg(test)] mod tests { use std::fs::{read_to_string, File}; + use std::io::BufReader; use std::sync::Arc; use crate::reader::*; + use crate::RawReaderBuilder; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::ArrayData; use serde_json::json; @@ -1442,4 +1447,48 @@ mod tests { assert_eq!(serde_json::from_str::(r).unwrap(), expected_json,); } } + + #[test] + fn test_write_multi_batches() { + let test_file = "test/data/basic.json"; + + let schema = SchemaRef::new(Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Float64, true), + Field::new("c", DataType::Boolean, true), + Field::new("d", DataType::Utf8, true), + Field::new("e", DataType::Utf8, true), + Field::new("f", DataType::Utf8, true), + Field::new("g", DataType::Timestamp(TimeUnit::Millisecond, None), true), + ])); + + let mut reader = RawReaderBuilder::new(schema.clone()) + .build(BufReader::new(File::open(test_file).unwrap())) + .unwrap(); + let batch = reader.next().unwrap().unwrap(); + + // test batches = an empty batch + 2 same batches, finally result should be eq to 2 same batches + let batches = [RecordBatch::new_empty(schema), batch.clone(), batch]; + + let mut buf = Vec::new(); + { + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write_batches(&batches).unwrap(); + } + + let result = String::from_utf8(buf).unwrap(); + let expected = read_to_string(test_file).unwrap(); + // result is eq to 2 same batches + let expected = format!("{expected}\n{expected}"); + for (r, e) in result.lines().zip(expected.lines()) { + let mut expected_json = serde_json::from_str::(e).unwrap(); + // remove null value from object to make comparision consistent: + if let Value::Object(obj) = expected_json { + expected_json = Value::Object( + obj.into_iter().filter(|(_, v)| *v != Value::Null).collect(), + ); + } + assert_eq!(serde_json::from_str::(r).unwrap(), expected_json,); + } + } } diff --git a/arrow-json/test/data/basic.json b/arrow-json/test/data/basic.json index 8de246e1ac28..598838dfc536 100644 --- a/arrow-json/test/data/basic.json +++ b/arrow-json/test/data/basic.json @@ -1,12 +1,12 @@ -{"a":1, "b":2.0, "c":false, "d":"4", "e":"1970-1-2", "f": "1.02"} -{"a":-10, "b":-3.5, "c":true, "d":"4", "e": "1969-12-31", "f": "-0.3"} +{"a":1, "b":2.0, "c":false, "d":"4", "e":"1970-1-2", "f": "1.02", "g": "2012-04-23T18:25:43.511"} +{"a":-10, "b":-3.5, "c":true, "d":"4", "e": "1969-12-31", "f": "-0.3", "g": "2016-04-23T18:25:43.511"} {"a":2, "b":0.6, "c":false, "d":"text", "e": "1970-01-02 11:11:11", "f": "1377.223"} {"a":1, "b":2.0, "c":false, "d":"4", "f": "1337.009"} {"a":7, "b":-3.5, "c":true, "d":"4", "f": "1"} -{"a":1, "b":0.6, "c":false, "d":"text", "f": "1338"} +{"a":1, "b":0.6, "c":false, "d":"text", "f": "1338", "g": "2018-10-23T18:33:16.481"} {"a":1, "b":2.0, "c":false, "d":"4", "f": "12345829100000"} {"a":5, "b":-3.5, "c":true, "d":"4", "f": "99999999.99"} {"a":1, "b":0.6, "c":false, "d":"text", "f": "1"} {"a":1, "b":2.0, "c":false, "d":"4", "f": "1"} {"a":1, "b":-3.5, 
"c":true, "d":"4", "f": "1"} -{"a":100000000000000, "b":0.6, "c":false, "d":"text", "f": "1"} +{"a":100000000000000, "b":0.6, "c":false, "d":"text", "f": "1"} \ No newline at end of file From a61453ea1ba63d09310e5ea7484067509958c73e Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner <14581281+iajoiner@users.noreply.github.com> Date: Fri, 24 Mar 2023 13:03:28 -0400 Subject: [PATCH 0731/1411] Prep for 36.0.0 (#3913) * Update version * Create changelog --- CHANGELOG-old.md | 68 +++++++++++++ CHANGELOG.md | 102 +++++++++---------- arrow-arith/Cargo.toml | 10 +- arrow-array/Cargo.toml | 8 +- arrow-buffer/Cargo.toml | 2 +- arrow-cast/Cargo.toml | 12 +-- arrow-csv/Cargo.toml | 12 +-- arrow-data/Cargo.toml | 6 +- arrow-flight/Cargo.toml | 14 +-- arrow-flight/README.md | 2 +- arrow-integration-test/Cargo.toml | 6 +- arrow-integration-testing/Cargo.toml | 2 +- arrow-ipc/Cargo.toml | 12 +-- arrow-json/Cargo.toml | 12 +-- arrow-ord/Cargo.toml | 12 +-- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow-row/Cargo.toml | 14 +-- arrow-schema/Cargo.toml | 2 +- arrow-select/Cargo.toml | 10 +- arrow-string/Cargo.toml | 12 +-- arrow/Cargo.toml | 28 ++--- arrow/README.md | 2 +- dev/release/README.md | 2 +- dev/release/update_change_log.sh | 4 +- parquet/Cargo.toml | 20 ++-- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 28 files changed, 227 insertions(+), 165 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 2d7903e96a7d..8ddd7c6b6619 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,74 @@ # Historical Changelog +## [35.0.0](https://github.com/apache/arrow-rs/tree/35.0.0) (2023-03-10) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/34.0.0...35.0.0) + +**Breaking changes:** + +- Add RunEndBuffer \(\#1799\) [\#3817](https://github.com/apache/arrow-rs/pull/3817) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Restrict DictionaryArray to ArrowDictionaryKeyType [\#3813](https://github.com/apache/arrow-rs/pull/3813) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- refactor: assorted `FlightSqlServiceClient` improvements [\#3788](https://github.com/apache/arrow-rs/pull/3788) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- minor: make Parquet CLI input args consistent [\#3786](https://github.com/apache/arrow-rs/pull/3786) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XinyuZeng](https://github.com/XinyuZeng)) +- Return Buffers from ArrayData::buffers instead of slice \(\#1799\) [\#3783](https://github.com/apache/arrow-rs/pull/3783) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use NullBuffer in ArrayData \(\#3775\) [\#3778](https://github.com/apache/arrow-rs/pull/3778) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Support timestamp/time and date types in json decoder [\#3834](https://github.com/apache/arrow-rs/issues/3834) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support decoding decimals in new raw json decoder [\#3819](https://github.com/apache/arrow-rs/issues/3819) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Timezone Aware Timestamp Parsing [\#3794](https://github.com/apache/arrow-rs/issues/3794) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Preallocate buffers for FixedSizeBinary array creation [\#3792](https://github.com/apache/arrow-rs/issues/3792) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make Parquet CLI args consistent [\#3785](https://github.com/apache/arrow-rs/issues/3785) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Creates PrimitiveDictionaryBuilder from provided keys and values builders [\#3776](https://github.com/apache/arrow-rs/issues/3776) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use NullBuffer in ArrayData [\#3775](https://github.com/apache/arrow-rs/issues/3775) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support unary\_dict\_mut in arth [\#3710](https://github.com/apache/arrow-rs/issues/3710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support cast \<\> String to interval [\#3643](https://github.com/apache/arrow-rs/issues/3643) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Zero-Copy Conversion from Vec to/from MutableBuffer [\#3516](https://github.com/apache/arrow-rs/issues/3516) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Timestamp Unit Casts are Unchecked [\#3833](https://github.com/apache/arrow-rs/issues/3833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- regexp\_match skips first match when returning match [\#3803](https://github.com/apache/arrow-rs/issues/3803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cast to timestamp with time zone returns timestamp [\#3800](https://github.com/apache/arrow-rs/issues/3800) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Schema-level metadata is not encoded in Flight responses [\#3779](https://github.com/apache/arrow-rs/issues/3779) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Closed issues:** + +- FlightSQL CLI client: simple test [\#3814](https://github.com/apache/arrow-rs/issues/3814) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Merged pull requests:** + +- refactor: timestamp overflow check [\#3840](https://github.com/apache/arrow-rs/pull/3840) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Prep for 35.0.0 [\#3836](https://github.com/apache/arrow-rs/pull/3836) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) +- Support timestamp/time and date json decoding [\#3835](https://github.com/apache/arrow-rs/pull/3835) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) +- Make dictionary preservation optional in row encoding [\#3831](https://github.com/apache/arrow-rs/pull/3831) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move prettyprint to arrow-cast [\#3828](https://github.com/apache/arrow-rs/pull/3828) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Support decoding decimals in raw decoder [\#3820](https://github.com/apache/arrow-rs/pull/3820) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) +- Add ArrayDataLayout, port validation \(\#1799\) [\#3818](https://github.com/apache/arrow-rs/pull/3818) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- test: add test for FlightSQL CLI client [\#3816](https://github.com/apache/arrow-rs/pull/3816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Add regexp\_match docs [\#3812](https://github.com/apache/arrow-rs/pull/3812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- fix: Ensure Flight schema includes parent metadata [\#3811](https://github.com/apache/arrow-rs/pull/3811) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([stuartcarnie](https://github.com/stuartcarnie)) +- fix: regexp\_match skips first match [\#3807](https://github.com/apache/arrow-rs/pull/3807) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- fix: change uft8 to timestamp with timezone [\#3806](https://github.com/apache/arrow-rs/pull/3806) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Support reading decimal arrays from json [\#3805](https://github.com/apache/arrow-rs/pull/3805) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) +- Add unary\_dict\_mut [\#3804](https://github.com/apache/arrow-rs/pull/3804) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Faster timestamp parsing \(~70-90% faster\) [\#3801](https://github.com/apache/arrow-rs/pull/3801) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add concat\_elements\_bytes [\#3798](https://github.com/apache/arrow-rs/pull/3798) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Timezone aware timestamp parsing \(\#3794\) [\#3795](https://github.com/apache/arrow-rs/pull/3795) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Preallocate buffers for FixedSizeBinary array creation [\#3793](https://github.com/apache/arrow-rs/pull/3793) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([maxburke](https://github.com/maxburke)) +- feat: simple flight sql CLI client [\#3789](https://github.com/apache/arrow-rs/pull/3789) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Creates PrimitiveDictionaryBuilder from provided keys and values builders [\#3777](https://github.com/apache/arrow-rs/pull/3777) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- ArrayData Enumeration for Remaining Layouts [\#3769](https://github.com/apache/arrow-rs/pull/3769) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- 
Update prost-build requirement from =0.11.7 to =0.11.8 [\#3767](https://github.com/apache/arrow-rs/pull/3767) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Implement concat\_elements\_dyn kernel [\#3763](https://github.com/apache/arrow-rs/pull/3763) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Support for casting `Utf8` and `LargeUtf8` --\> `Interval` [\#3762](https://github.com/apache/arrow-rs/pull/3762) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([doki23](https://github.com/doki23)) +- into\_inner\(\) for CSV Writer [\#3759](https://github.com/apache/arrow-rs/pull/3759) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Zero-copy Vec conversion \(\#3516\) \(\#1176\) [\#3756](https://github.com/apache/arrow-rs/pull/3756) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- ArrayData Enumeration for Primitive, Binary and UTF8 [\#3749](https://github.com/apache/arrow-rs/pull/3749) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add `into_primitive_dict_builder` to `DictionaryArray` [\#3715](https://github.com/apache/arrow-rs/pull/3715) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) + ## [34.0.0](https://github.com/apache/arrow-rs/tree/34.0.0) (2023-02-24) [Full Changelog](https://github.com/apache/arrow-rs/compare/33.0.0...34.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a7700ca773f..bd4a6522cfd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,73 +19,67 @@ # Changelog -## [35.0.0](https://github.com/apache/arrow-rs/tree/35.0.0) (2023-03-10) +## [36.0.0](https://github.com/apache/arrow-rs/tree/36.0.0) (2023-03-23) -[Full Changelog](https://github.com/apache/arrow-rs/compare/34.0.0...35.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/35.0.0...36.0.0) **Breaking changes:** -- Add RunEndBuffer \(\#1799\) [\#3817](https://github.com/apache/arrow-rs/pull/3817) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Restrict DictionaryArray to ArrowDictionaryKeyType [\#3813](https://github.com/apache/arrow-rs/pull/3813) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- refactor: assorted `FlightSqlServiceClient` improvements [\#3788](https://github.com/apache/arrow-rs/pull/3788) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) -- minor: make Parquet CLI input args consistent [\#3786](https://github.com/apache/arrow-rs/pull/3786) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XinyuZeng](https://github.com/XinyuZeng)) -- Return Buffers from ArrayData::buffers instead of slice \(\#1799\) [\#3783](https://github.com/apache/arrow-rs/pull/3783) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use NullBuffer in ArrayData \(\#3775\) [\#3778](https://github.com/apache/arrow-rs/pull/3778) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Return 
ScalarBuffer from PrimitiveArray::values \(\#3879\) [\#3896](https://github.com/apache/arrow-rs/pull/3896) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use BooleanBuffer in BooleanArray \(\#3879\) [\#3895](https://github.com/apache/arrow-rs/pull/3895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Seal ArrowPrimitiveType [\#3882](https://github.com/apache/arrow-rs/pull/3882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support compression levels [\#3847](https://github.com/apache/arrow-rs/pull/3847) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([spebern](https://github.com/spebern)) **Implemented enhancements:** -- Support timestamp/time and date types in json decoder [\#3834](https://github.com/apache/arrow-rs/issues/3834) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support decoding decimals in new raw json decoder [\#3819](https://github.com/apache/arrow-rs/issues/3819) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Timezone Aware Timestamp Parsing [\#3794](https://github.com/apache/arrow-rs/issues/3794) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Preallocate buffers for FixedSizeBinary array creation [\#3792](https://github.com/apache/arrow-rs/issues/3792) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Make Parquet CLI args consistent [\#3785](https://github.com/apache/arrow-rs/issues/3785) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Creates PrimitiveDictionaryBuilder from provided keys and values builders [\#3776](https://github.com/apache/arrow-rs/issues/3776) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use NullBuffer in ArrayData [\#3775](https://github.com/apache/arrow-rs/issues/3775) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support unary\_dict\_mut in arth [\#3710](https://github.com/apache/arrow-rs/issues/3710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support cast \<\> String to interval [\#3643](https://github.com/apache/arrow-rs/issues/3643) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support Zero-Copy Conversion from Vec to/from MutableBuffer [\#3516](https://github.com/apache/arrow-rs/issues/3516) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement `ObjectStore` for trait objects [\#3865](https://github.com/apache/arrow-rs/issues/3865) +- Add compression options \(levels\) [\#3844](https://github.com/apache/arrow-rs/issues/3844) +- Use Unsigned Integer for Fixed Size DataType [\#3815](https://github.com/apache/arrow-rs/issues/3815) +- Add ObjectStore::append [\#3790](https://github.com/apache/arrow-rs/issues/3790) +- Common trait for RecordBatch and StructArray [\#3764](https://github.com/apache/arrow-rs/issues/3764) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support for Async JSON Writer [\#3742](https://github.com/apache/arrow-rs/issues/3742) +- Support for Async CSV Writer [\#3740](https://github.com/apache/arrow-rs/issues/3740) +- Allow precision loss on multiplying decimal arrays [\#3689](https://github.com/apache/arrow-rs/issues/3689) **Fixed bugs:** -- Timestamp Unit Casts are Unchecked [\#3833](https://github.com/apache/arrow-rs/issues/3833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- regexp\_match 
skips first match when returning match [\#3803](https://github.com/apache/arrow-rs/issues/3803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Cast to timestamp with time zone returns timestamp [\#3800](https://github.com/apache/arrow-rs/issues/3800) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Schema-level metadata is not encoded in Flight responses [\#3779](https://github.com/apache/arrow-rs/issues/3779) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] - -**Closed issues:** - -- FlightSQL CLI client: simple test [\#3814](https://github.com/apache/arrow-rs/issues/3814) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- parquet\_derive doesn't support Vec\ [\#3864](https://github.com/apache/arrow-rs/issues/3864) +- \[REGRESSION\] Parsing timestamps with lower case time separator [\#3863](https://github.com/apache/arrow-rs/issues/3863) +- \[REGRESSION\] Parsing timestamps with leap seconds [\#3861](https://github.com/apache/arrow-rs/issues/3861) +- \[REGRESSION\] Parsing timestamps with fractional seconds / microseconds / milliseconds / nanoseconds [\#3859](https://github.com/apache/arrow-rs/issues/3859) +- PyArrowConvert Leaks Memory [\#3683](https://github.com/apache/arrow-rs/issues/3683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- refactor: timestamp overflow check [\#3840](https://github.com/apache/arrow-rs/pull/3840) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- Prep for 35.0.0 [\#3836](https://github.com/apache/arrow-rs/pull/3836) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) -- Support timestamp/time and date json decoding [\#3835](https://github.com/apache/arrow-rs/pull/3835) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) -- Make dictionary preservation optional in row encoding [\#3831](https://github.com/apache/arrow-rs/pull/3831) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Move prettyprint to arrow-cast [\#3828](https://github.com/apache/arrow-rs/pull/3828) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Support decoding decimals in raw decoder [\#3820](https://github.com/apache/arrow-rs/pull/3820) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) -- Add ArrayDataLayout, port validation \(\#1799\) [\#3818](https://github.com/apache/arrow-rs/pull/3818) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- test: add test for FlightSQL CLI client [\#3816](https://github.com/apache/arrow-rs/pull/3816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) -- Add regexp\_match docs [\#3812](https://github.com/apache/arrow-rs/pull/3812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- fix: Ensure 
Flight schema includes parent metadata [\#3811](https://github.com/apache/arrow-rs/pull/3811) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([stuartcarnie](https://github.com/stuartcarnie)) -- fix: regexp\_match skips first match [\#3807](https://github.com/apache/arrow-rs/pull/3807) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- fix: change uft8 to timestamp with timezone [\#3806](https://github.com/apache/arrow-rs/pull/3806) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- Support reading decimal arrays from json [\#3805](https://github.com/apache/arrow-rs/pull/3805) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) -- Add unary\_dict\_mut [\#3804](https://github.com/apache/arrow-rs/pull/3804) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Faster timestamp parsing \(~70-90% faster\) [\#3801](https://github.com/apache/arrow-rs/pull/3801) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add concat\_elements\_bytes [\#3798](https://github.com/apache/arrow-rs/pull/3798) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Timezone aware timestamp parsing \(\#3794\) [\#3795](https://github.com/apache/arrow-rs/pull/3795) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Preallocate buffers for FixedSizeBinary array creation [\#3793](https://github.com/apache/arrow-rs/pull/3793) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([maxburke](https://github.com/maxburke)) -- feat: simple flight sql CLI client [\#3789](https://github.com/apache/arrow-rs/pull/3789) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) -- Creates PrimitiveDictionaryBuilder from provided keys and values builders [\#3777](https://github.com/apache/arrow-rs/pull/3777) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- ArrayData Enumeration for Remaining Layouts [\#3769](https://github.com/apache/arrow-rs/pull/3769) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update prost-build requirement from =0.11.7 to =0.11.8 [\#3767](https://github.com/apache/arrow-rs/pull/3767) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Implement concat\_elements\_dyn kernel [\#3763](https://github.com/apache/arrow-rs/pull/3763) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- Support for casting `Utf8` and `LargeUtf8` --\> `Interval` [\#3762](https://github.com/apache/arrow-rs/pull/3762) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([doki23](https://github.com/doki23)) -- into\_inner\(\) for CSV Writer [\#3759](https://github.com/apache/arrow-rs/pull/3759) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- Zero-copy Vec conversion \(\#3516\) \(\#1176\) 
[\#3756](https://github.com/apache/arrow-rs/pull/3756) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- ArrayData Enumeration for Primitive, Binary and UTF8 [\#3749](https://github.com/apache/arrow-rs/pull/3749) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add `into_primitive_dict_builder` to `DictionaryArray` [\#3715](https://github.com/apache/arrow-rs/pull/3715) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add PrimitiveArray::new \(\#3879\) [\#3909](https://github.com/apache/arrow-rs/pull/3909) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve ScalarBuffer debug output [\#3907](https://github.com/apache/arrow-rs/pull/3907) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.52 to =1.0.53 [\#3905](https://github.com/apache/arrow-rs/pull/3905) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Re-export parquet compression level structs [\#3903](https://github.com/apache/arrow-rs/pull/3903) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix parsing timestamps of exactly 32 characters [\#3902](https://github.com/apache/arrow-rs/pull/3902) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add iterators to BooleanBuffer and NullBuffer [\#3901](https://github.com/apache/arrow-rs/pull/3901) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Array equality for &dyn Array \(\#3880\) [\#3899](https://github.com/apache/arrow-rs/pull/3899) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add BooleanArray::new \(\#3879\) [\#3898](https://github.com/apache/arrow-rs/pull/3898) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Revert structured ArrayData \(\#3877\) [\#3894](https://github.com/apache/arrow-rs/pull/3894) ([tustvold](https://github.com/tustvold)) +- Fix pyarrow memory leak \(\#3683\) [\#3893](https://github.com/apache/arrow-rs/pull/3893) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: add examples for `ListBuilder` and `GenericListBuilder` [\#3891](https://github.com/apache/arrow-rs/pull/3891) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Update syn requirement from 1.0 to 2.0 [\#3890](https://github.com/apache/arrow-rs/pull/3890) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Use of `mul_checked` to avoid silent overflow in interval arithmetic [\#3886](https://github.com/apache/arrow-rs/pull/3886) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Flesh out NullBuffer abstraction \(\#3880\) [\#3885](https://github.com/apache/arrow-rs/pull/3885) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement Bit Operations for i256 
[\#3884](https://github.com/apache/arrow-rs/pull/3884) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Flatten arrow\_buffer [\#3883](https://github.com/apache/arrow-rs/pull/3883) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add Array::to\_data and Array::nulls \(\#3880\) [\#3881](https://github.com/apache/arrow-rs/pull/3881) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Added support for byte vectors and slices to parquet\_derive \(\#3864\) [\#3878](https://github.com/apache/arrow-rs/pull/3878) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([waymost](https://github.com/waymost)) +- chore: remove LevelDecoder [\#3872](https://github.com/apache/arrow-rs/pull/3872) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Weijun-H](https://github.com/Weijun-H)) +- Parse timestamps with leap seconds \(\#3861\) [\#3862](https://github.com/apache/arrow-rs/pull/3862) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster time parsing \(~93% faster\) [\#3860](https://github.com/apache/arrow-rs/pull/3860) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Parse timestamps with arbitrary seconds fraction [\#3858](https://github.com/apache/arrow-rs/pull/3858) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add BitIterator [\#3856](https://github.com/apache/arrow-rs/pull/3856) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve decimal parsing performance [\#3854](https://github.com/apache/arrow-rs/pull/3854) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) +- Update proc-macro2 requirement from =1.0.51 to =1.0.52 [\#3853](https://github.com/apache/arrow-rs/pull/3853) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update bitflags requirement from 1.2.1 to 2.0.0 [\#3852](https://github.com/apache/arrow-rs/pull/3852) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add offset pushdown to parquet [\#3848](https://github.com/apache/arrow-rs/pull/3848) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add timezone support to JSON reader [\#3845](https://github.com/apache/arrow-rs/pull/3845) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Allow precision loss on multiplying decimal arrays [\#3690](https://github.com/apache/arrow-rs/pull/3690) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index 4360332d9c7a..f509af76b733 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-arith" -version = "35.0.0" +version = "36.0.0" description = "Arrow arithmetic kernels" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "35.0.0", 
path = "../arrow-array" } -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } -arrow-data = { version = "35.0.0", path = "../arrow-data" } -arrow-schema = { version = "35.0.0", path = "../arrow-schema" } +arrow-array = { version = "36.0.0", path = "../arrow-array" } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow-data = { version = "36.0.0", path = "../arrow-data" } +arrow-schema = { version = "36.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 1675f59838a7..7ea969a03f7d 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-array" -version = "35.0.0" +version = "36.0.0" description = "Array abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,9 +45,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "35.0.0", path = "../arrow-schema" } -arrow-data = { version = "35.0.0", path = "../arrow-data" } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "36.0.0", path = "../arrow-schema" } +arrow-data = { version = "36.0.0", path = "../arrow-data" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index 699a1000132f..be9a08eb8333 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "35.0.0" +version = "36.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 53c62ffb60d3..1eee7108f139 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-cast" -version = "35.0.0" +version = "36.0.0" description = "Cast kernel and utilities for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,11 +44,11 @@ features = ["prettyprint"] prettyprint = ["comfy-table"] [dependencies] -arrow-array = { version = "35.0.0", path = "../arrow-array" } -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } -arrow-data = { version = "35.0.0", path = "../arrow-data" } -arrow-schema = { version = "35.0.0", path = "../arrow-schema" } -arrow-select = { version = "35.0.0", path = "../arrow-select" } +arrow-array = { version = "36.0.0", path = "../arrow-array" } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow-data = { version = "36.0.0", path = "../arrow-data" } +arrow-schema = { version = "36.0.0", path = "../arrow-schema" } +arrow-select = { version = "36.0.0", path = "../arrow-select" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", 
default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 7ceb1401d1c0..9f8015f1eec8 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-csv" -version = "35.0.0" +version = "36.0.0" description = "Support for parsing CSV format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "35.0.0", path = "../arrow-array" } -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "35.0.0", path = "../arrow-cast" } -arrow-data = { version = "35.0.0", path = "../arrow-data" } -arrow-schema = { version = "35.0.0", path = "../arrow-schema" } +arrow-array = { version = "36.0.0", path = "../arrow-array" } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "36.0.0", path = "../arrow-cast" } +arrow-data = { version = "36.0.0", path = "../arrow-data" } +arrow-schema = { version = "36.0.0", path = "../arrow-schema" } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } csv-core = { version = "0.1"} diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index d58413a762bd..c3630d2c9164 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-data" -version = "35.0.0" +version = "36.0.0" description = "Array data abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -50,8 +50,8 @@ features = ["ffi"] [dependencies] -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "35.0.0", path = "../arrow-schema" } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow-schema = { version = "36.0.0", path = "../arrow-schema" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index e8f57345eca0..729304aed92f 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "35.0.0" +version = "36.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,12 +27,12 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow-array = { version = "35.0.0", path = "../arrow-array" } -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } +arrow-array = { version = "36.0.0", path = "../arrow-array" } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } # Cast is needed to work around https://github.com/apache/arrow-rs/issues/3389 -arrow-cast = { version = "35.0.0", path = "../arrow-cast" } -arrow-ipc = { version = "35.0.0", path = "../arrow-ipc" } -arrow-schema = { version = "35.0.0", path = "../arrow-schema" } +arrow-cast = { version = "36.0.0", path = "../arrow-cast" } +arrow-ipc = { version = "36.0.0", path = "../arrow-ipc" } +arrow-schema = { version = "36.0.0", path = "../arrow-schema" } base64 = { version = "0.21", default-features = false, features = ["std"] } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } 
bytes = { version = "1", default-features = false } @@ -58,7 +58,7 @@ tls = ["tonic/tls"] cli = ["arrow-cast/prettyprint", "clap", "tracing-log", "tracing-subscriber", "tonic/tls-webpki-roots"] [dev-dependencies] -arrow-cast = { version = "35.0.0", path = "../arrow-cast", features = ["prettyprint"] } +arrow-cast = { version = "36.0.0", path = "../arrow-cast", features = ["prettyprint"] } assert_cmd = "2.0.8" tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 41312cc0c559..f8f9e95d8377 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "35.0.0" +arrow-flight = "36.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index ca14401b6899..61ffae23fbe7 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-integration-test" -version = "35.0.0" +version = "36.0.0" description = "Support for the Apache Arrow JSON test data format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,8 +38,8 @@ path = "src/lib.rs" bench = false [dependencies] -arrow = { version = "35.0.0", path = "../arrow", default-features = false } -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } +arrow = { version = "36.0.0", path = "../arrow", default-features = false } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 48700bbe90d3..81691c4b370f 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests (NOT PUBLISHED TO crates.io)" -version = "35.0.0" +version = "36.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index 8bd7d31485e4..0c358170bc16 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ipc" -version = "35.0.0" +version = "36.0.0" description = "Support for the Arrow IPC format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "35.0.0", path = "../arrow-array" } -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "35.0.0", path = "../arrow-cast" } -arrow-data = { version = "35.0.0", path = "../arrow-data" } -arrow-schema = { version = "35.0.0", path = "../arrow-schema" } +arrow-array = { version = "36.0.0", path = "../arrow-array" } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow-cast = { version = 
"36.0.0", path = "../arrow-cast" } +arrow-data = { version = "36.0.0", path = "../arrow-data" } +arrow-schema = { version = "36.0.0", path = "../arrow-schema" } flatbuffers = { version = "23.1.21", default-features = false } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.12.0", default-features = false, optional = true } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 92c1a3eb282f..91bac277b1f6 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-json" -version = "35.0.0" +version = "36.0.0" description = "Support for parsing JSON format into the Arrow format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "35.0.0", path = "../arrow-array" } -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "35.0.0", path = "../arrow-cast" } -arrow-data = { version = "35.0.0", path = "../arrow-data" } -arrow-schema = { version = "35.0.0", path = "../arrow-schema" } +arrow-array = { version = "36.0.0", path = "../arrow-array" } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "36.0.0", path = "../arrow-cast" } +arrow-data = { version = "36.0.0", path = "../arrow-data" } +arrow-schema = { version = "36.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index 161fce9606d7..aac6a8cc0786 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-ord" -version = "35.0.0" +version = "36.0.0" description = "Ordering kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "35.0.0", path = "../arrow-array" } -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } -arrow-data = { version = "35.0.0", path = "../arrow-data" } -arrow-schema = { version = "35.0.0", path = "../arrow-schema" } -arrow-select = { version = "35.0.0", path = "../arrow-select" } +arrow-array = { version = "36.0.0", path = "../arrow-array" } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow-data = { version = "36.0.0", path = "../arrow-data" } +arrow-schema = { version = "36.0.0", path = "../arrow-schema" } +arrow-select = { version = "36.0.0", path = "../arrow-select" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index ba084c435e64..8aaf20b498fe 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "35.0.0" +version = "36.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,5 +32,5 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = 
"../arrow", version = "35.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "36.0.0", features = ["pyarrow"] } pyo3 = { version = "0.18", features = ["extension-module"] } diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index e2796fbe134c..96d494077026 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-row" -version = "35.0.0" +version = "36.0.0" description = "Arrow row format" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -44,17 +44,17 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "35.0.0", path = "../arrow-array" } -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } -arrow-data = { version = "35.0.0", path = "../arrow-data" } -arrow-schema = { version = "35.0.0", path = "../arrow-schema" } +arrow-array = { version = "36.0.0", path = "../arrow-array" } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow-data = { version = "36.0.0", path = "../arrow-data" } +arrow-schema = { version = "36.0.0", path = "../arrow-schema" } half = { version = "2.1", default-features = false } hashbrown = { version = "0.13", default-features = false } [dev-dependencies] -arrow-cast = { version = "35.0.0", path = "../arrow-cast" } -arrow-ord = { version = "35.0.0", path = "../arrow-ord" } +arrow-cast = { version = "36.0.0", path = "../arrow-cast" } +arrow-ord = { version = "36.0.0", path = "../arrow-ord" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } [features] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 62cb9f3c257e..89e82a0ff164 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-schema" -version = "35.0.0" +version = "36.0.0" description = "Defines the logical types for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml index 35c51c2da3ea..c0aa9444c1f1 100644 --- a/arrow-select/Cargo.toml +++ b/arrow-select/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-select" -version = "35.0.0" +version = "36.0.0" description = "Selection kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -38,10 +38,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } -arrow-data = { version = "35.0.0", path = "../arrow-data" } -arrow-schema = { version = "35.0.0", path = "../arrow-schema" } -arrow-array = { version = "35.0.0", path = "../arrow-array" } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow-data = { version = "36.0.0", path = "../arrow-data" } +arrow-schema = { version = "36.0.0", path = "../arrow-schema" } +arrow-array = { version = "36.0.0", path = "../arrow-array" } num = { version = "0.4", default-features = false, features = ["std"] } [features] diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index 923b8e8c00c4..90746e9395e3 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-string" -version = "35.0.0" +version = "36.0.0" description = "String kernels for arrow arrays" homepage = "https://github.com/apache/arrow-rs" 
repository = "https://github.com/apache/arrow-rs" @@ -38,11 +38,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } -arrow-data = { version = "35.0.0", path = "../arrow-data" } -arrow-schema = { version = "35.0.0", path = "../arrow-schema" } -arrow-array = { version = "35.0.0", path = "../arrow-array" } -arrow-select = { version = "35.0.0", path = "../arrow-select" } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow-data = { version = "36.0.0", path = "../arrow-data" } +arrow-schema = { version = "36.0.0", path = "../arrow-schema" } +arrow-array = { version = "36.0.0", path = "../arrow-array" } +arrow-select = { version = "36.0.0", path = "../arrow-select" } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 8814f233bad1..0e8ea3cac124 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "35.0.0" +version = "36.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" @@ -45,19 +45,19 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-arith = { version = "35.0.0", path = "../arrow-arith" } -arrow-array = { version = "35.0.0", path = "../arrow-array" } -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "35.0.0", path = "../arrow-cast" } -arrow-csv = { version = "35.0.0", path = "../arrow-csv", optional = true } -arrow-data = { version = "35.0.0", path = "../arrow-data" } -arrow-ipc = { version = "35.0.0", path = "../arrow-ipc", optional = true } -arrow-json = { version = "35.0.0", path = "../arrow-json", optional = true } -arrow-ord = { version = "35.0.0", path = "../arrow-ord" } -arrow-row = { version = "35.0.0", path = "../arrow-row" } -arrow-schema = { version = "35.0.0", path = "../arrow-schema" } -arrow-select = { version = "35.0.0", path = "../arrow-select" } -arrow-string = { version = "35.0.0", path = "../arrow-string" } +arrow-arith = { version = "36.0.0", path = "../arrow-arith" } +arrow-array = { version = "36.0.0", path = "../arrow-array" } +arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "36.0.0", path = "../arrow-cast" } +arrow-csv = { version = "36.0.0", path = "../arrow-csv", optional = true } +arrow-data = { version = "36.0.0", path = "../arrow-data" } +arrow-ipc = { version = "36.0.0", path = "../arrow-ipc", optional = true } +arrow-json = { version = "36.0.0", path = "../arrow-json", optional = true } +arrow-ord = { version = "36.0.0", path = "../arrow-ord" } +arrow-row = { version = "36.0.0", path = "../arrow-row" } +arrow-schema = { version = "36.0.0", path = "../arrow-schema" } +arrow-select = { version = "36.0.0", path = "../arrow-select" } +arrow-string = { version = "36.0.0", path = "../arrow-string" } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } pyo3 = { version = "0.18", default-features = false, optional = true } diff --git a/arrow/README.md b/arrow/README.md index 479213833244..d7a5877b49fa 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is 
tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `35.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `36.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. ## Feature Flags diff --git a/dev/release/README.md b/dev/release/README.md index c7c14b8d58c1..11bcbe866e32 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. -sed -i '' -e 's/14.0.0/35.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/36.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index b01d190a4f38..77f9c5f9780b 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="34.0.0" -FUTURE_RELEASE="35.0.0" +SINCE_TAG="35.0.0" +FUTURE_RELEASE="36.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index a822a966f29d..46a6aa441271 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "35.0.0" +version = "36.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -35,14 +35,14 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "35.0.0", path = "../arrow-array", default-features = false, optional = true } -arrow-buffer = { version = "35.0.0", path = "../arrow-buffer", default-features = false, optional = true } -arrow-cast = { version = "35.0.0", path = "../arrow-cast", default-features = false, optional = true } -arrow-csv = { version = "35.0.0", path = "../arrow-csv", default-features = false, optional = true } -arrow-data = { version = "35.0.0", path = "../arrow-data", default-features = false, optional = true } -arrow-schema = { version = "35.0.0", path = "../arrow-schema", default-features = false, optional = true } -arrow-select = { version = "35.0.0", path = "../arrow-select", default-features = false, optional = true } -arrow-ipc = { version = "35.0.0", path = "../arrow-ipc", default-features = false, optional = true } +arrow-array = { version = "36.0.0", path = "../arrow-array", default-features = false, optional = true } +arrow-buffer = { version = "36.0.0", path 
= "../arrow-buffer", default-features = false, optional = true } +arrow-cast = { version = "36.0.0", path = "../arrow-cast", default-features = false, optional = true } +arrow-csv = { version = "36.0.0", path = "../arrow-csv", default-features = false, optional = true } +arrow-data = { version = "36.0.0", path = "../arrow-data", default-features = false, optional = true } +arrow-schema = { version = "36.0.0", path = "../arrow-schema", default-features = false, optional = true } +arrow-select = { version = "36.0.0", path = "../arrow-select", default-features = false, optional = true } +arrow-ipc = { version = "36.0.0", path = "../arrow-ipc", default-features = false, optional = true } object_store = { version = "0.5", path = "../object_store", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } @@ -76,7 +76,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.12", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "35.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "36.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index ddf34c4bf793..9ecb40cc4729 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "35.0.0" +version = "36.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "2.0", features = ["extra-traits"] } -parquet = { path = "../parquet", version = "35.0.0", default-features = false } +parquet = { path = "../parquet", version = "36.0.0", default-features = false } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index 2bed2d550e62..70be54015c56 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "35.0.0" -parquet_derive = "35.0.0" +parquet = "36.0.0" +parquet_derive = "36.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index cca778d6f51b..10694851c938 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "35.0.0" +version = "36.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "35.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "35.0.0", default-features = false } +parquet = { path = "../parquet", version = "36.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = 
"36.0.0", default-features = false } chrono = { version="0.4.23", default-features = false, features = [ "clock" ] } From 71ecc39f36c8f38a5fc93bc3878a607c831b2f12 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 24 Mar 2023 19:17:26 +0000 Subject: [PATCH 0732/1411] Prepare arrow 36 (#3935) --- CHANGELOG.md | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd4a6522cfd4..2b1c59b3089e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,12 +19,14 @@ # Changelog -## [36.0.0](https://github.com/apache/arrow-rs/tree/36.0.0) (2023-03-23) +## [36.0.0](https://github.com/apache/arrow-rs/tree/36.0.0) (2023-03-24) [Full Changelog](https://github.com/apache/arrow-rs/compare/35.0.0...36.0.0) **Breaking changes:** +- Use dyn Array in sort kernels [\#3931](https://github.com/apache/arrow-rs/pull/3931) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Enforce struct nullability in JSON raw reader \(\#3900\) \(\#3904\) [\#3906](https://github.com/apache/arrow-rs/pull/3906) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Return ScalarBuffer from PrimitiveArray::values \(\#3879\) [\#3896](https://github.com/apache/arrow-rs/pull/3896) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Use BooleanBuffer in BooleanArray \(\#3879\) [\#3895](https://github.com/apache/arrow-rs/pull/3895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Seal ArrowPrimitiveType [\#3882](https://github.com/apache/arrow-rs/pull/3882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) @@ -32,26 +34,37 @@ **Implemented enhancements:** -- Implement `ObjectStore` for trait objects [\#3865](https://github.com/apache/arrow-rs/issues/3865) -- Add compression options \(levels\) [\#3844](https://github.com/apache/arrow-rs/issues/3844) +- Improve speed of parsing string to Times [\#3919](https://github.com/apache/arrow-rs/issues/3919) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- feat: add comparison/sort support for Float16 [\#3914](https://github.com/apache/arrow-rs/issues/3914) +- Pinned version in arrow-flight's build-dependencies are causing conflicts [\#3876](https://github.com/apache/arrow-rs/issues/3876) +- Add compression options \(levels\) [\#3844](https://github.com/apache/arrow-rs/issues/3844) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Use Unsigned Integer for Fixed Size DataType [\#3815](https://github.com/apache/arrow-rs/issues/3815) -- Add ObjectStore::append [\#3790](https://github.com/apache/arrow-rs/issues/3790) - Common trait for RecordBatch and StructArray [\#3764](https://github.com/apache/arrow-rs/issues/3764) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support for Async JSON Writer [\#3742](https://github.com/apache/arrow-rs/issues/3742) -- Support for Async CSV Writer [\#3740](https://github.com/apache/arrow-rs/issues/3740) -- Allow precision loss on multiplying decimal arrays [\#3689](https://github.com/apache/arrow-rs/issues/3689) +- Allow precision loss on multiplying decimal arrays [\#3689](https://github.com/apache/arrow-rs/issues/3689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
**Fixed bugs:** -- parquet\_derive doesn't support Vec\ [\#3864](https://github.com/apache/arrow-rs/issues/3864) -- \[REGRESSION\] Parsing timestamps with lower case time separator [\#3863](https://github.com/apache/arrow-rs/issues/3863) -- \[REGRESSION\] Parsing timestamps with leap seconds [\#3861](https://github.com/apache/arrow-rs/issues/3861) -- \[REGRESSION\] Parsing timestamps with fractional seconds / microseconds / milliseconds / nanoseconds [\#3859](https://github.com/apache/arrow-rs/issues/3859) +- Raw JSON Reader Allows Non-Nullable Struct Children to Contain Nulls [\#3904](https://github.com/apache/arrow-rs/issues/3904) +- Nullable field with nested not nullable map in json [\#3900](https://github.com/apache/arrow-rs/issues/3900) +- parquet\_derive doesn't support Vec\ [\#3864](https://github.com/apache/arrow-rs/issues/3864) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- \[REGRESSION\] Parsing timestamps with lower case time separator [\#3863](https://github.com/apache/arrow-rs/issues/3863) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[REGRESSION\] Parsing timestamps with leap seconds [\#3861](https://github.com/apache/arrow-rs/issues/3861) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[REGRESSION\] Parsing timestamps with fractional seconds / microseconds / milliseconds / nanoseconds [\#3859](https://github.com/apache/arrow-rs/issues/3859) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- CSV Reader Doesn't set Timezone [\#3841](https://github.com/apache/arrow-rs/issues/3841) - PyArrowConvert Leaks Memory [\#3683](https://github.com/apache/arrow-rs/issues/3683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** +- Derive RunArray Clone [\#3932](https://github.com/apache/arrow-rs/pull/3932) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move protoc generation to binary crate, unpin prost/tonic build \(\#3876\) [\#3927](https://github.com/apache/arrow-rs/pull/3927) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Fix JSON Temporal Encoding of Multiple Batches [\#3924](https://github.com/apache/arrow-rs/pull/3924) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([doki23](https://github.com/doki23)) +- Cleanup uses of Array::data\_ref \(\#3880\) [\#3918](https://github.com/apache/arrow-rs/pull/3918) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support microsecond and nanosecond in interval parsing [\#3916](https://github.com/apache/arrow-rs/pull/3916) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- feat: add comparison/sort support for Float16 [\#3915](https://github.com/apache/arrow-rs/pull/3915) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Add AsArray trait for more ergonomic downcasting [\#3912](https://github.com/apache/arrow-rs/pull/3912) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add OffsetBuffer::new [\#3910](https://github.com/apache/arrow-rs/pull/3910) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) - Add PrimitiveArray::new \(\#3879\) [\#3909](https://github.com/apache/arrow-rs/pull/3909) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support timezones in CSV reader \(\#3841\) [\#3908](https://github.com/apache/arrow-rs/pull/3908) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Improve ScalarBuffer debug output [\#3907](https://github.com/apache/arrow-rs/pull/3907) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Update proc-macro2 requirement from =1.0.52 to =1.0.53 [\#3905](https://github.com/apache/arrow-rs/pull/3905) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) - Re-export parquet compression level structs [\#3903](https://github.com/apache/arrow-rs/pull/3903) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) From 1500c9b5613bd427ca7abf6b5f2b8f23ffc705d7 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Sat, 25 Mar 2023 03:55:17 -0700 Subject: [PATCH 0733/1411] feat: add take for MapArray (#3925) * feat: add take for MapArray * refactor: use into_builder --- arrow-array/src/array/map_array.rs | 16 +++++++++++++++- arrow-select/src/take.rs | 30 ++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index c9651f0b2019..fbe32d4b2092 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -16,7 +16,7 @@ // under the License. 
use crate::array::{get_offsets, print_long_array}; -use crate::{make_array, Array, ArrayRef, StringArray, StructArray}; +use crate::{make_array, Array, ArrayRef, ListArray, StringArray, StructArray}; use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ToByteSlice}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; @@ -251,6 +251,20 @@ impl std::fmt::Debug for MapArray { } } +impl From for ListArray { + fn from(value: MapArray) -> Self { + let field = match value.data_type() { + DataType::Map(field, _) => field, + _ => unreachable!("This should be a map type."), + }; + let data_type = DataType::List(field.clone()); + let builder = value.into_data().into_builder().data_type(data_type); + let array_data = unsafe { builder.build_unchecked() }; + + ListArray::from(array_data) + } +} + #[cfg(test)] mod tests { use crate::cast::AsArray; diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 83b58519fdb8..2e076d93843e 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -150,6 +150,12 @@ where *length as u32, )?)) } + DataType::Map(_, _) => { + let list_arr = ListArray::from(values.as_map().clone()); + let list_data = take_list::<_, Int32Type>(&list_arr, indices)?; + let builder = list_data.into_data().into_builder().data_type(values.data_type().clone()); + Ok(Arc::new(MapArray::from(unsafe { builder.build_unchecked() }))) + } DataType::Struct(fields) => { let struct_: &StructArray = values.as_any().downcast_ref::().unwrap(); @@ -1919,6 +1925,30 @@ mod tests { take(&list_array, &index, None).unwrap(); } + #[test] + fn test_take_map() { + let values = Int32Array::from(vec![1, 2, 3, 4]); + let array = MapArray::new_from_strings( + vec!["a", "b", "c", "a"].into_iter(), + &values, + &[0, 3, 4], + ) + .unwrap(); + + let index = UInt32Array::from(vec![0]); + + let result = take(&array, &index, None).unwrap(); + let expected: ArrayRef = Arc::new( + MapArray::new_from_strings( + vec!["a", "b", "c"].into_iter(), + &values.slice(0, 3), + &[0, 3], + ) + .unwrap(), + ); + assert_eq!(&expected, &result); + } + #[test] fn test_take_struct() { let array = create_test_struct(vec![ From bc8ba3c136d5afebae18884040e5f91b89a410d5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 25 Mar 2023 12:16:17 +0000 Subject: [PATCH 0734/1411] Add Zero-Copy Conversion between Vec and MutableBuffer (#3920) * Add Zero-Copy Conversion between Vec and MutableBuffer * Update test * Fix docs * Clippy --- arrow-array/src/array/list_array.rs | 8 +- arrow-buffer/src/alloc/mod.rs | 104 +---------------- arrow-buffer/src/buffer/immutable.rs | 17 +-- arrow-buffer/src/buffer/mutable.rs | 166 +++++++++++++++------------ arrow-buffer/src/bytes.rs | 4 +- 5 files changed, 102 insertions(+), 197 deletions(-) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index c7e2a817ba33..895f150079e5 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -842,10 +842,8 @@ mod tests { #[test] #[should_panic(expected = "memory is not aligned")] - #[allow(deprecated)] fn test_primitive_array_alignment() { - let ptr = arrow_buffer::alloc::allocate_aligned(8); - let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; + let buf = Buffer::from_slice_ref([0_u64]); let buf2 = buf.slice(1); let array_data = ArrayData::builder(DataType::Int32) .add_buffer(buf2) @@ -859,10 +857,8 @@ mod tests { // Different error messages, so skip for now // 
https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] - #[allow(deprecated)] fn test_list_array_alignment() { - let ptr = arrow_buffer::alloc::allocate_aligned(8); - let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; + let buf = Buffer::from_slice_ref([0_u64]); let buf2 = buf.slice(1); let values: [i32; 8] = [0; 8]; diff --git a/arrow-buffer/src/alloc/mod.rs b/arrow-buffer/src/alloc/mod.rs index 7600a28d8754..d1236eeaa9a6 100644 --- a/arrow-buffer/src/alloc/mod.rs +++ b/arrow-buffer/src/alloc/mod.rs @@ -18,117 +18,15 @@ //! Defines memory-related functions, such as allocate/deallocate/reallocate memory //! regions, cache and allocation alignments. -use std::alloc::{handle_alloc_error, Layout}; +use std::alloc::Layout; use std::fmt::{Debug, Formatter}; use std::panic::RefUnwindSafe; -use std::ptr::NonNull; use std::sync::Arc; mod alignment; pub use alignment::ALIGNMENT; -/// Returns an aligned non null pointer similar to [`NonNull::dangling`] -/// -/// Note that the pointer value may potentially represent a valid pointer, which means -/// this must not be used as a "not yet initialized" sentinel value. -/// -/// Types that lazily allocate must track initialization by some other means. -#[inline] -fn dangling_ptr() -> NonNull { - // SAFETY: ALIGNMENT is a non-zero usize which is then casted - // to a *mut T. Therefore, `ptr` is not null and the conditions for - // calling new_unchecked() are respected. - unsafe { NonNull::new_unchecked(ALIGNMENT as *mut u8) } -} - -/// Allocates a cache-aligned memory region of `size` bytes with uninitialized values. -/// This is more performant than using [allocate_aligned_zeroed] when all bytes will have -/// an unknown or non-zero value and is semantically similar to `malloc`. -#[deprecated(note = "Use Vec")] -pub fn allocate_aligned(size: usize) -> NonNull { - unsafe { - if size == 0 { - dangling_ptr() - } else { - let layout = Layout::from_size_align_unchecked(size, ALIGNMENT); - let raw_ptr = std::alloc::alloc(layout); - NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout)) - } - } -} - -/// Allocates a cache-aligned memory region of `size` bytes with `0` on all of them. -/// This is more performant than using [allocate_aligned] and setting all bytes to zero -/// and is semantically similar to `calloc`. -#[deprecated(note = "Use Vec")] -pub fn allocate_aligned_zeroed(size: usize) -> NonNull { - unsafe { - if size == 0 { - dangling_ptr() - } else { - let layout = Layout::from_size_align_unchecked(size, ALIGNMENT); - let raw_ptr = std::alloc::alloc_zeroed(layout); - NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout)) - } - } -} - -/// # Safety -/// -/// This function is unsafe because undefined behavior can result if the caller does not ensure all -/// of the following: -/// -/// * ptr must denote a block of memory currently allocated via this allocator, -/// -/// * size must be the same size that was used to allocate that block of memory, -#[deprecated(note = "Use Vec")] -pub unsafe fn free_aligned(ptr: NonNull, size: usize) { - if size != 0 { - std::alloc::dealloc( - ptr.as_ptr() as *mut u8, - Layout::from_size_align_unchecked(size, ALIGNMENT), - ); - } -} - -/// # Safety -/// -/// This function is unsafe because undefined behavior can result if the caller does not ensure all -/// of the following: -/// -/// * ptr must be currently allocated via this allocator, -/// -/// * new_size must be greater than zero. 
-/// -/// * new_size, when rounded up to the nearest multiple of [ALIGNMENT], must not overflow (i.e., -/// the rounded value must be less than usize::MAX). -#[deprecated(note = "Use Vec")] -#[allow(deprecated)] -pub unsafe fn reallocate( - ptr: NonNull, - old_size: usize, - new_size: usize, -) -> NonNull { - if old_size == 0 { - return allocate_aligned(new_size); - } - - if new_size == 0 { - free_aligned(ptr, old_size); - return dangling_ptr(); - } - - let raw_ptr = std::alloc::realloc( - ptr.as_ptr() as *mut u8, - Layout::from_size_align_unchecked(old_size, ALIGNMENT), - new_size, - ); - NonNull::new(raw_ptr).unwrap_or_else(|| { - handle_alloc_error(Layout::from_size_align_unchecked(new_size, ALIGNMENT)) - }) -} - /// The owner of an allocation. /// The trait implementation is responsible for dropping the allocations once no more references exist. pub trait Allocation: RefUnwindSafe + Send + Sync {} diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 5f42035c9e30..15d9ff7838c6 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -71,20 +71,10 @@ impl Buffer { } } - /// Create a [`Buffer`] from the provided `Vec` without copying + /// Create a [`Buffer`] from the provided [`Vec`] without copying #[inline] pub fn from_vec(vec: Vec) -> Self { - // Safety - // Vec::as_ptr guaranteed to not be null and ArrowNativeType are trivially transmutable - let ptr = unsafe { NonNull::new_unchecked(vec.as_ptr() as _) }; - let len = vec.len() * std::mem::size_of::(); - // Safety - // Vec guaranteed to have a valid layout matching that of `Layout::array` - // This is based on `RawVec::current_memory` - let layout = unsafe { Layout::array::(vec.capacity()).unwrap_unchecked() }; - std::mem::forget(vec); - let b = unsafe { Bytes::new(ptr, len, Deallocation::Standard(layout)) }; - Self::from_bytes(b) + MutableBuffer::from_vec(vec).into() } /// Initializes a [Buffer] from a slice of items. @@ -810,7 +800,8 @@ mod tests { b.into_mutable().unwrap(); let b = Buffer::from_vec(vec![1_u32, 3, 5]); - let b = b.into_mutable().unwrap_err(); // Invalid layout + let b = b.into_mutable().unwrap(); + let b = Buffer::from(b); let b = b.into_vec::().unwrap(); assert_eq!(b, &[1, 3, 5]); } diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 250ac9f31595..9a905a3223b6 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -15,17 +15,18 @@ // specific language governing permissions and limitations // under the License. -use super::Buffer; +use std::alloc::{handle_alloc_error, Layout}; +use std::mem; +use std::ptr::NonNull; + use crate::alloc::{Deallocation, ALIGNMENT}; use crate::{ - alloc, bytes::Bytes, native::{ArrowNativeType, ToByteSlice}, util::bit_util, }; -use std::alloc::Layout; -use std::mem; -use std::ptr::NonNull; + +use super::Buffer; /// A [`MutableBuffer`] is Arrow's interface to build a [`Buffer`] out of items or slices of items. /// @@ -55,7 +56,7 @@ pub struct MutableBuffer { data: NonNull, // invariant: len <= capacity len: usize, - capacity: usize, + layout: Layout, } impl MutableBuffer { @@ -67,14 +68,21 @@ impl MutableBuffer { /// Allocate a new [MutableBuffer] with initial capacity to be at least `capacity`. 
#[inline] - #[allow(deprecated)] pub fn with_capacity(capacity: usize) -> Self { let capacity = bit_util::round_upto_multiple_of_64(capacity); - let ptr = alloc::allocate_aligned(capacity); + let layout = Layout::from_size_align(capacity, ALIGNMENT).unwrap(); + let data = match layout.size() { + 0 => dangling_ptr(), + _ => { + // Safety: Verified size != 0 + let raw_ptr = unsafe { std::alloc::alloc(layout) }; + NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout)) + } + }; Self { - data: ptr, + data, len: 0, - capacity, + layout, } } @@ -89,35 +97,46 @@ impl MutableBuffer { /// let data = buffer.as_slice_mut(); /// assert_eq!(data[126], 0u8); /// ``` - #[allow(deprecated)] pub fn from_len_zeroed(len: usize) -> Self { - let new_capacity = bit_util::round_upto_multiple_of_64(len); - let ptr = alloc::allocate_aligned_zeroed(new_capacity); - Self { - data: ptr, - len, - capacity: new_capacity, - } + let layout = Layout::from_size_align(len, ALIGNMENT).unwrap(); + let data = match layout.size() { + 0 => dangling_ptr(), + _ => { + // Safety: Verified size != 0 + let raw_ptr = unsafe { std::alloc::alloc_zeroed(layout) }; + NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout)) + } + }; + Self { data, len, layout } + } + + /// Create a [`MutableBuffer`] from the provided [`Vec`] without copying + #[inline] + pub fn from_vec(vec: Vec) -> Self { + // Safety + // Vec::as_ptr guaranteed to not be null and ArrowNativeType are trivially transmutable + let data = unsafe { NonNull::new_unchecked(vec.as_ptr() as _) }; + let len = vec.len() * mem::size_of::(); + // Safety + // Vec guaranteed to have a valid layout matching that of `Layout::array` + // This is based on `RawVec::current_memory` + let layout = unsafe { Layout::array::(vec.capacity()).unwrap_unchecked() }; + mem::forget(vec); + Self { data, len, layout } } /// Allocates a new [MutableBuffer] from given `Bytes`. pub(crate) fn from_bytes(bytes: Bytes) -> Result { - let capacity = match bytes.deallocation() { - Deallocation::Standard(layout) if layout.align() == ALIGNMENT => { - layout.size() - } + let layout = match bytes.deallocation() { + Deallocation::Standard(layout) => *layout, _ => return Err(bytes), }; let len = bytes.len(); - let ptr = bytes.ptr(); + let data = bytes.ptr(); mem::forget(bytes); - Ok(Self { - data: ptr, - len, - capacity, - }) + Ok(Self { data, len, layout }) } /// creates a new [MutableBuffer] with capacity and length capable of holding `len` bits. @@ -134,7 +153,7 @@ impl MutableBuffer { /// the buffer directly (e.g., modifying the buffer by holding a mutable reference /// from `data_mut()`). pub fn with_bitset(mut self, end: usize, val: bool) -> Self { - assert!(end <= self.capacity); + assert!(end <= self.layout.size()); let v = if val { 255 } else { 0 }; unsafe { std::ptr::write_bytes(self.data.as_ptr(), v, end); @@ -149,7 +168,7 @@ impl MutableBuffer { /// `len` of the buffer and so can be used to initialize the memory region from /// `len` to `capacity`. pub fn set_null_bits(&mut self, start: usize, count: usize) { - assert!(start + count <= self.capacity); + assert!(start + count <= self.layout.size()); unsafe { std::ptr::write_bytes(self.data.as_ptr().add(start), 0, count); } @@ -171,17 +190,33 @@ impl MutableBuffer { #[inline(always)] pub fn reserve(&mut self, additional: usize) { let required_cap = self.len + additional; - if required_cap > self.capacity { - // JUSTIFICATION - // Benefit - // necessity - // Soundness - // `self.data` is valid for `self.capacity`. 
- let (ptr, new_capacity) = - unsafe { reallocate(self.data, self.capacity, required_cap) }; - self.data = ptr; - self.capacity = new_capacity; + if required_cap > self.layout.size() { + let new_capacity = bit_util::round_upto_multiple_of_64(required_cap); + let new_capacity = std::cmp::max(new_capacity, self.layout.size() * 2); + self.reallocate(new_capacity) + } + } + + #[cold] + fn reallocate(&mut self, capacity: usize) { + let new_layout = Layout::from_size_align(capacity, self.layout.align()).unwrap(); + if new_layout.size() == 0 { + if self.layout.size() != 0 { + // Safety: data was allocated with layout + unsafe { std::alloc::dealloc(self.as_mut_ptr(), self.layout) }; + self.layout = new_layout + } + return; } + + let data = match self.layout.size() { + // Safety: new_layout is not empty + 0 => unsafe { std::alloc::alloc(new_layout) }, + // Safety: verified new layout is valid and not empty + _ => unsafe { std::alloc::realloc(self.as_mut_ptr(), self.layout, capacity) }, + }; + self.data = NonNull::new(data).unwrap_or_else(|| handle_alloc_error(new_layout)); + self.layout = new_layout; } /// Truncates this buffer to `len` bytes @@ -233,20 +268,10 @@ impl MutableBuffer { /// buffer.shrink_to_fit(); /// assert!(buffer.capacity() >= 64 && buffer.capacity() < 128); /// ``` - #[allow(deprecated)] pub fn shrink_to_fit(&mut self) { let new_capacity = bit_util::round_upto_multiple_of_64(self.len); - if new_capacity < self.capacity { - // JUSTIFICATION - // Benefit - // necessity - // Soundness - // `self.data` is valid for `self.capacity`. - let ptr = - unsafe { alloc::reallocate(self.data, self.capacity, new_capacity) }; - - self.data = ptr; - self.capacity = new_capacity; + if new_capacity < self.layout.size() { + self.reallocate(new_capacity) } } @@ -267,7 +292,7 @@ impl MutableBuffer { /// The invariant `buffer.len() <= buffer.capacity()` is always upheld. #[inline] pub const fn capacity(&self) -> usize { - self.capacity + self.layout.size() } /// Clear all existing data from this buffer. @@ -310,9 +335,9 @@ impl MutableBuffer { #[inline] pub(super) fn into_buffer(self) -> Buffer { - let layout = Layout::from_size_align(self.capacity, ALIGNMENT).unwrap(); - let bytes = - unsafe { Bytes::new(self.data, self.len, Deallocation::Standard(layout)) }; + let bytes = unsafe { + Bytes::new(self.data, self.len, Deallocation::Standard(self.layout)) + }; std::mem::forget(self); Buffer::from_bytes(bytes) } @@ -455,19 +480,12 @@ impl MutableBuffer { } } -/// # Safety -/// `ptr` must be allocated for `old_capacity`. -#[cold] -#[allow(deprecated)] -unsafe fn reallocate( - ptr: NonNull, - old_capacity: usize, - new_capacity: usize, -) -> (NonNull, usize) { - let new_capacity = bit_util::round_upto_multiple_of_64(new_capacity); - let new_capacity = std::cmp::max(new_capacity, old_capacity * 2); - let ptr = alloc::reallocate(ptr, old_capacity, new_capacity); - (ptr, new_capacity) +#[inline] +fn dangling_ptr() -> NonNull { + // SAFETY: ALIGNMENT is a non-zero usize which is then casted + // to a *mut T. Therefore, `ptr` is not null and the conditions for + // calling new_unchecked() are respected. 
+ unsafe { NonNull::new_unchecked(ALIGNMENT as *mut u8) } } impl Extend for MutableBuffer { @@ -492,7 +510,7 @@ impl MutableBuffer { // this is necessary because of https://github.com/rust-lang/rust/issues/32155 let mut len = SetLenOnDrop::new(&mut self.len); let mut dst = unsafe { self.data.as_ptr().add(len.local_len) }; - let capacity = self.capacity; + let capacity = self.layout.size(); while len.local_len + item_size <= capacity { if let Some(item) = iterator.next() { @@ -641,9 +659,11 @@ impl std::ops::DerefMut for MutableBuffer { } impl Drop for MutableBuffer { - #[allow(deprecated)] fn drop(&mut self) { - unsafe { alloc::free_aligned(self.data, self.capacity) }; + if self.layout.size() != 0 { + // Safety: data was allocated with standard allocator with given layout + unsafe { std::alloc::dealloc(self.data.as_ptr() as _, self.layout) }; + } } } @@ -652,7 +672,7 @@ impl PartialEq for MutableBuffer { if self.len != other.len { return false; } - if self.capacity != other.capacity { + if self.layout != other.layout { return false; } self.as_slice() == other.as_slice() diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs index 2820fda781e6..b3105ed5a3b4 100644 --- a/arrow-buffer/src/bytes.rs +++ b/arrow-buffer/src/bytes.rs @@ -30,8 +30,8 @@ use crate::alloc::Deallocation; /// This structs' API is inspired by the `bytes::Bytes`, but it is not limited to using rust's /// global allocator nor u8 alignment. /// -/// In the most common case, this buffer is allocated using [`allocate_aligned`](crate::alloc::allocate_aligned) -/// and deallocated accordingly [`free_aligned`](crate::alloc::free_aligned). +/// In the most common case, this buffer is allocated using [`alloc`](std::alloc::alloc) +/// with an alignment of [`ALIGNMENT`](crate::alloc::ALIGNMENT) /// /// When the region is allocated by a different allocator, [Deallocation::Custom], this calls the /// custom deallocator to deallocate the region when it is no longer needed. From ce74578c99d09a2ac3f6f590db6748e879c65550 Mon Sep 17 00:00:00 2001 From: Yang Xiufeng Date: Sat, 25 Mar 2023 21:57:25 +0800 Subject: [PATCH 0735/1411] Fix: FlightSqlClient panic when execute_update. (#3938) * Fix FlightService::do_put() for FlightSqlService. server missing '.as_any()' when encode DoPutUpdateResult, which result in empty Bytes. * Add test_execute_update for example FlightSqlServiceImpl. 
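A minimal sketch of the encoding mismatch described in the commit message above. It is not part of the patch: the struct below is a hand-written stand-in for the prost-generated `DoPutUpdateResult`, the `type_url` value is assumed for illustration, and only the `prost` and `prost-types` crates are used; the real Flight SQL server/client plumbing is omitted.

```rust
use prost::Message;

// Hypothetical stand-in for the prost-generated DoPutUpdateResult message.
#[derive(Clone, PartialEq, ::prost::Message)]
struct DoPutUpdateResult {
    #[prost(int64, tag = "1")]
    record_count: i64,
}

fn main() {
    let result = DoPutUpdateResult { record_count: 1 };

    // What the buggy server put into PutResult::app_metadata: the bare message bytes.
    let bare = result.encode_to_vec();

    // What the client expects: the message wrapped in a protobuf `Any`,
    // roughly what `as_any().encode_to_vec()` produces (type_url assumed here).
    let wrapped = prost_types::Any {
        type_url: "type.googleapis.com/arrow.flight.protocol.sql.DoPutUpdateResult".into(),
        value: bare.clone(),
    }
    .encode_to_vec();

    // Decoding the bare bytes as an `Any` does not give back a usable wrapper,
    // which is why the client ended up with no DoPutUpdateResult to unpack.
    let as_any = prost_types::Any::decode(bare.as_slice());
    println!("bare bytes decoded as Any: {as_any:?}");

    // The wrapped form round-trips as intended.
    let as_any = prost_types::Any::decode(wrapped.as_slice()).unwrap();
    println!("wrapped bytes decoded as Any: type_url = {}", as_any.type_url);
}
```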
--- arrow-flight/examples/flight_sql_server.rs | 40 ++++++++++++++++++++-- arrow-flight/src/sql/server.rs | 4 +-- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index ac38b0232f74..bc9d24656913 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -70,6 +70,10 @@ impl FlightSqlServiceImpl { let cols = vec![Arc::new(builder.finish()) as ArrayRef]; RecordBatch::try_new(Arc::new(schema), cols) } + + fn fake_update_result() -> i64 { + 1 + } } #[tonic::async_trait] @@ -391,9 +395,7 @@ impl FlightSqlService for FlightSqlServiceImpl { _ticket: CommandStatementUpdate, _request: Request>, ) -> Result { - Err(Status::unimplemented( - "do_put_statement_update not implemented", - )) + Ok(FlightSqlServiceImpl::fake_update_result()) } async fn do_put_prepared_statement_query( @@ -638,6 +640,38 @@ mod tests { } } + #[tokio::test] + async fn test_execute_update() { + let file = NamedTempFile::new().unwrap(); + let path = file.into_temp_path().to_str().unwrap().to_string(); + let _ = fs::remove_file(path.clone()); + + let uds = UnixListener::bind(path.clone()).unwrap(); + let stream = UnixListenerStream::new(uds); + + // We would just listen on TCP, but it seems impossible to know when tonic is ready to serve + let service = FlightSqlServiceImpl {}; + let serve_future = Server::builder() + .add_service(FlightServiceServer::new(service)) + .serve_with_incoming(stream); + + let request_future = async { + let mut client = client_with_uds(path).await; + let token = client.handshake("admin", "password").await.unwrap(); + println!("Auth succeeded with token: {:?}", token); + let res = client + .execute_update("creat table test(a int);".to_string()) + .await + .unwrap(); + assert_eq!(res, FlightSqlServiceImpl::fake_update_result()); + }; + + tokio::select! { + _ = serve_future => panic!("server returned first"), + _ = request_future => println!("Client finished!"), + } + } + fn endpoint(addr: String) -> Result { let endpoint = Endpoint::new(addr) .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? 
diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index d48181189a56..848bfb3852f5 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -473,7 +473,7 @@ where let record_count = self.do_put_statement_update(token, request).await?; let result = DoPutUpdateResult { record_count }; let output = futures::stream::iter(vec![Ok(PutResult { - app_metadata: result.encode_to_vec().into(), + app_metadata: result.as_any().encode_to_vec().into(), })]); return Ok(Response::new(Box::pin(output))); } @@ -494,7 +494,7 @@ where .await?; let result = DoPutUpdateResult { record_count }; let output = futures::stream::iter(vec![Ok(PutResult { - app_metadata: result.encode_to_vec().into(), + app_metadata: result.as_any().encode_to_vec().into(), })]); return Ok(Response::new(Box::pin(output))); } From 888c1cab7c76f0b28cecdd704c14e42256736def Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 25 Mar 2023 19:06:28 +0000 Subject: [PATCH 0736/1411] Avoid memory copies in take_list (#3940) --- arrow-select/src/take.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 2e076d93843e..f59e64015ceb 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -668,7 +668,7 @@ where IndexType::Native: ToPrimitive, OffsetType: ArrowPrimitiveType, OffsetType::Native: ToPrimitive + OffsetSizeTrait, - PrimitiveArray: From>>, + PrimitiveArray: From>, { // TODO: Some optimizations can be done here such as if it is // taking the whole list or a contiguous sublist @@ -676,7 +676,7 @@ where take_value_indices_from_list::(values, indices)?; let taken = take_impl::(values.values().as_ref(), &list_indices, None)?; - let value_offsets = Buffer::from_slice_ref(offsets); + let value_offsets = Buffer::from_vec(offsets); // create a new list with taken data and computed null information let list_data = ArrayDataBuilder::new(values.data_type().clone()) .len(indices.len()) @@ -887,7 +887,7 @@ where IndexType::Native: ToPrimitive, OffsetType: ArrowPrimitiveType, OffsetType::Native: OffsetSizeTrait + std::ops::Add + num::Zero + num::One, - PrimitiveArray: From>>, + PrimitiveArray: From>, { // TODO: benchmark this function, there might be a faster unsafe alternative let offsets: &[OffsetType::Native] = list.value_offsets(); @@ -918,7 +918,7 @@ where // if start == end, this slot is empty while curr < end { - values.push(Some(curr)); + values.push(curr); curr += num::One::one(); } if !list.is_valid(ix) { From 86b384fa81d3fd747e4facd20cdcd47f1581e3b9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 26 Mar 2023 11:27:31 +0100 Subject: [PATCH 0737/1411] Fix checked i256 arithmetic (#3942) (#3941) (#3943) * Fix checked i256 arithmetic (#3942) (#3941) * Tweak * Add is_positive * Make const --- arrow-buffer/src/bigint.rs | 75 +++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 22 deletions(-) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index 4b446e19b996..4625d24c109a 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -243,9 +243,9 @@ impl i256 { /// Performs checked addition #[inline] pub fn checked_add(self, other: Self) -> Option { - let (low, carry) = self.low.overflowing_add(other.low); - let high = self.high.checked_add(other.high)?.checked_add(carry as _)?; - Some(Self { low, high }) + let r = 
self.wrapping_add(other); + ((other.is_negative() && r < self) || (!other.is_negative() && r >= self)) + .then_some(r) } /// Performs wrapping subtraction @@ -259,9 +259,9 @@ impl i256 { /// Performs checked subtraction #[inline] pub fn checked_sub(self, other: Self) -> Option { - let (low, carry) = self.low.overflowing_sub(other.low); - let high = self.high.checked_sub(other.high)?.checked_sub(carry as _)?; - Some(Self { low, high }) + let r = self.wrapping_sub(other); + ((other.is_negative() && r > self) || (!other.is_negative() && r <= self)) + .then_some(r) } /// Performs wrapping multiplication @@ -282,10 +282,14 @@ impl i256 { /// Performs checked multiplication #[inline] pub fn checked_mul(self, other: Self) -> Option { + if self == i256::ZERO || other == i256::ZERO { + return Some(i256::ZERO); + } + // Shift sign bit down to construct mask of all set bits if negative let l_sa = self.high >> 127; let r_sa = other.high >> 127; - let out_sa = l_sa ^ r_sa; + let out_sa = (l_sa ^ r_sa) as u128; // Compute absolute values let l_abs = self.wrapping_abs(); @@ -303,13 +307,15 @@ impl i256 { let hl = (l_abs.high as u128).checked_mul(r_abs.low)?; let lh = l_abs.low.checked_mul(r_abs.high as u128)?; - let high: i128 = high.checked_add(hl)?.checked_add(lh)?.try_into().ok()?; + let high = high.checked_add(hl)?.checked_add(lh)?; // Reverse absolute value, if necessary - let (low, c) = (low ^ out_sa as u128).overflowing_sub(out_sa as u128); - let high = (high ^ out_sa).wrapping_sub(out_sa).wrapping_sub(c as i128); + let (low, c) = (low ^ out_sa).overflowing_sub(out_sa); + let high = (high ^ out_sa).wrapping_sub(out_sa).wrapping_sub(c as u128) as i128; - Some(Self { low, high }) + // Check for overflow in final conversion + (high.is_negative() == (self.is_negative() ^ other.is_negative())) + .then_some(Self { low, high }) } /// Performs wrapping division @@ -398,6 +404,12 @@ impl i256 { // needless overflow. 
acc.wrapping_mul(base) } + + /// Returns `true` if this [`i256`] is negative + #[inline] + pub const fn is_negative(self) -> bool { + self.high.is_negative() + } } /// Temporary workaround due to lack of stable const array slicing @@ -709,7 +721,7 @@ mod tests { let checked = il.checked_add(ir); match overflow { true => assert!(checked.is_none()), - false => assert_eq!(checked.unwrap(), actual), + false => assert_eq!(checked, Some(actual)), } // Subtraction @@ -721,7 +733,7 @@ mod tests { let checked = il.checked_sub(ir); match overflow { true => assert!(checked.is_none()), - false => assert_eq!(checked.unwrap(), actual), + false => assert_eq!(checked, Some(actual), "{bl} - {br} = {expected}"), } // Multiplication @@ -737,14 +749,14 @@ mod tests { "{il} * {ir} = {actual} vs {bl} * {br} = {expected}" ), false => assert_eq!( - checked.unwrap(), - actual, + checked, + Some(actual), "{il} * {ir} = {actual} vs {bl} * {br} = {expected}" ), } // Exponentiation - for exp in vec![0, 1, 3, 8, 100].into_iter() { + for exp in vec![0, 1, 2, 3, 8, 100].into_iter() { let actual = il.wrapping_pow(exp); let (expected, overflow) = i256::from_bigint_with_overflow(bl.clone().pow(exp)); @@ -757,9 +769,9 @@ mod tests { "{il} ^ {exp} = {actual} vs {bl} * {exp} = {expected}" ), false => assert_eq!( - checked.unwrap(), - actual, - "{il} ^ {exp} = {actual} vs {bl} * {exp} = {expected}" + checked, + Some(actual), + "{il} ^ {exp} = {actual} vs {bl} ^ {exp} = {expected}" ), } } @@ -791,14 +803,33 @@ mod tests { #[test] fn test_i256() { let candidates = [ - i256::from_parts(0, 0), - i256::from_parts(0, 1), - i256::from_parts(0, -1), + i256::ZERO, + i256::ONE, + i256::MINUS_ONE, + i256::from_i128(2), + i256::from_i128(-2), i256::from_parts(u128::MAX, 1), i256::from_parts(u128::MAX, -1), i256::from_parts(0, 1), i256::from_parts(0, -1), + i256::from_parts(1, -1), + i256::from_parts(1, 1), + i256::from_parts(0, i128::MAX), + i256::from_parts(0, i128::MIN), + i256::from_parts(1, i128::MAX), + i256::from_parts(1, i128::MIN), + i256::from_parts(u128::MAX, i128::MIN), i256::from_parts(100, 32), + i256::MIN, + i256::MAX, + i256::MIN >> 1, + i256::MAX >> 1, + i256::ONE << 127, + i256::ONE << 128, + i256::ONE << 129, + i256::MINUS_ONE << 127, + i256::MINUS_ONE << 128, + i256::MINUS_ONE << 129, ]; for il in candidates { From 62919cfe32eefea93fc459bfabac94b268c05da4 Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Sun, 26 Mar 2023 06:53:04 -0700 Subject: [PATCH 0738/1411] Remove incorrect validation logic on S3 bucket names (#3947) S3 bucket names can have dots in them, see [this documentation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html) > Bucket names can consist only of lowercase letters, numbers, dots (.), and hyphens (-). 
This was originally reported in delta-io/delta-rs#1239 by @gray-sat --- object_store/src/aws/mod.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 7d10f3728238..752fb2e7df9d 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -414,7 +414,7 @@ pub struct AmazonS3Builder { /// let typed_options = vec![ /// (AmazonS3ConfigKey::DefaultRegion, "my-default-region"), /// ]; -/// let azure = AmazonS3Builder::new() +/// let aws = AmazonS3Builder::new() /// .try_with_options(options) /// .unwrap() /// .try_with_options(typed_options) @@ -738,13 +738,9 @@ impl AmazonS3Builder { fn parse_url(&mut self, url: &str) -> Result<()> { let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; - let validate = |s: &str| match s.contains('.') { - true => Err(UrlNotRecognisedSnafu { url }.build()), - false => Ok(s.to_string()), - }; match parsed.scheme() { - "s3" | "s3a" => self.bucket_name = Some(validate(host)?), + "s3" | "s3a" => self.bucket_name = Some(host.to_string()), "https" => match host.splitn(4, '.').collect_tuple() { Some(("s3", bucket, "amazonaws", "com")) => { self.bucket_name = Some(bucket.to_string()); @@ -1389,6 +1385,15 @@ mod tests { builder.parse_url("s3://bucket/path").unwrap(); assert_eq!(builder.bucket_name, Some("bucket".to_string())); + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("s3://buckets.can.have.dots/path") + .unwrap(); + assert_eq!( + builder.bucket_name, + Some("buckets.can.have.dots".to_string()) + ); + let mut builder = AmazonS3Builder::new(); builder .parse_url("https://s3.bucket.amazonaws.com") @@ -1405,7 +1410,6 @@ mod tests { let err_cases = [ "mailto://bucket/path", - "s3://bucket.mydomain/path", "https://s3.bucket.mydomain.com", "https://s3.bucket.foo.amazonaws.com", "https://bucket.mydomain.region.amazonaws.com", From e2c3f22b07255a0906820b470a991f59f2d4d8d8 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 26 Mar 2023 14:13:30 -0700 Subject: [PATCH 0739/1411] Add multiply_fixed_point (#3945) * Add multiply_fixed_point * Add test * Rename to as_i128 --- arrow-arith/src/arithmetic.rs | 113 ++++++++++++++++++++++++++++++++++ arrow-buffer/src/bigint.rs | 24 ++++++++ 2 files changed, 137 insertions(+) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 7d60d131bf52..f44776ad2b90 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -1183,6 +1183,58 @@ pub fn multiply_fixed_point_checked( .and_then(|a| a.with_precision_and_scale(precision, required_scale)) } +/// Perform `left * right` operation on two decimal arrays. If either left or right value is +/// null then the result is also null. +/// +/// This performs decimal multiplication which allows precision loss if an exact representation +/// is not possible for the result, according to the required scale. In the case, the result +/// will be rounded to the required scale. +/// +/// If the required scale is greater than the product scale, an error is returned. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// For an overflow-checking variant, use `multiply_fixed_point_checked` instead. +/// +/// It is implemented for compatibility with precision loss `multiply` function provided by +/// other data processing engines. 
For multiplication with precision loss detection, use +/// `multiply` or `multiply_checked` instead. +pub fn multiply_fixed_point( + left: &PrimitiveArray, + right: &PrimitiveArray, + required_scale: i8, +) -> Result, ArrowError> { + let product_scale = left.scale() + right.scale(); + let precision = min( + left.precision() + right.precision() + 1, + DECIMAL128_MAX_PRECISION, + ); + + if required_scale == product_scale { + return multiply(left, right)? + .with_precision_and_scale(precision, required_scale); + } + + if required_scale > product_scale { + return Err(ArrowError::ComputeError(format!( + "Required scale {} is greater than product scale {}", + required_scale, product_scale + ))); + } + + let divisor = + i256::from_i128(10).pow_wrapping((product_scale - required_scale) as u32); + + binary::<_, _, _, Decimal128Type>(left, right, |a, b| { + let a = i256::from_i128(a); + let b = i256::from_i128(b); + + let mut mul = a.wrapping_mul(b); + mul = divide_and_round::(mul, divisor); + mul.as_i128() + }) + .and_then(|a| a.with_precision_and_scale(precision, required_scale)) +} + /// Divide a decimal native value by given divisor and round the result. fn divide_and_round(input: I::Native, div: I::Native) -> I::Native where @@ -3363,4 +3415,65 @@ mod tests { .to_string() .contains("Required scale 5 is greater than product scale 4")); } + + #[test] + fn test_decimal_multiply_allow_precision_loss_overflow() { + // [99999999999123456789] + let a = Decimal128Array::from(vec![99999999999123456789000000000000000000]) + .with_precision_and_scale(38, 18) + .unwrap(); + + // [9999999999910] + let b = Decimal128Array::from(vec![9999999999910000000000000000000]) + .with_precision_and_scale(38, 18) + .unwrap(); + + let err = multiply_fixed_point_checked(&a, &b, 28).unwrap_err(); + assert!(err.to_string().contains( + "Overflow happened on: 99999999999123456789000000000000000000 * 9999999999910000000000000000000" + )); + + let result = multiply_fixed_point(&a, &b, 28).unwrap(); + let expected = + Decimal128Array::from(vec![62946009661555981610246871926660136960]) + .with_precision_and_scale(38, 28) + .unwrap(); + + assert_eq!(&expected, &result); + } + + #[test] + fn test_decimal_multiply_fixed_point() { + // [123456789] + let a = Decimal128Array::from(vec![123456789000000000000000000]) + .with_precision_and_scale(38, 18) + .unwrap(); + + // [10] + let b = Decimal128Array::from(vec![10000000000000000000]) + .with_precision_and_scale(38, 18) + .unwrap(); + + // `multiply` overflows on this case. + let result = multiply(&a, &b).unwrap(); + let expected = + Decimal128Array::from(vec![-16672482290199102048610367863168958464]) + .with_precision_and_scale(38, 10) + .unwrap(); + assert_eq!(&expected, &result); + + // Avoid overflow by reducing the scale. 
+ let result = multiply_fixed_point(&a, &b, 28).unwrap(); + // [1234567890] + let expected = + Decimal128Array::from(vec![12345678900000000000000000000000000000]) + .with_precision_and_scale(38, 28) + .unwrap(); + + assert_eq!(&expected, &result); + assert_eq!( + result.value_as_string(0), + "1234567890.0000000000000000000000000000" + ); + } } diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index 4625d24c109a..5abfb7c85230 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -152,6 +152,11 @@ impl i256 { (high_negative == low_negative && high_valid).then_some(self.low as i128) } + /// Wraps this `i256` into an `i128` + pub fn as_i128(self) -> i128 { + self.low as i128 + } + /// Return the memory representation of this integer as a byte array in little-endian byte order. #[inline] pub const fn to_le_bytes(self) -> [u8; 32] { @@ -891,4 +896,23 @@ mod tests { assert!(a.to_i64().is_none()); assert!(a.to_u64().is_none()); } + + #[test] + fn test_i256_as_i128() { + let a = i256::from_i128(i128::MAX).wrapping_add(i256::from_i128(1)); + let i128 = a.as_i128(); + assert_eq!(i128, i128::MIN); + + let a = i256::from_i128(i128::MAX).wrapping_add(i256::from_i128(2)); + let i128 = a.as_i128(); + assert_eq!(i128, i128::MIN + 1); + + let a = i256::from_i128(i128::MIN).wrapping_sub(i256::from_i128(1)); + let i128 = a.as_i128(); + assert_eq!(i128, i128::MAX); + + let a = i256::from_i128(i128::MIN).wrapping_sub(i256::from_i128(2)); + let i128 = a.as_i128(); + assert_eq!(i128, i128::MAX - 1); + } } From 9bd2bae586ed5b0edfd699f89a0855d79f61b611 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 26 Mar 2023 22:22:13 +0100 Subject: [PATCH 0740/1411] Zero-copy conversion from Vec to PrimitiveArray (#3917) * Zero-copy conversion from Vec to PrimitiveArray * Further tweaks --- arrow-array/src/array/binary_array.rs | 4 ++-- arrow-array/src/array/primitive_array.rs | 2 +- arrow-array/src/record_batch.rs | 2 +- arrow-string/src/substring.rs | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index b965279fb796..5a07f9a0ab5c 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -209,8 +209,8 @@ where let data_len = offsets.len() - 1; let array_data = ArrayData::builder(Self::DATA_TYPE) .len(data_len) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) + .add_buffer(Buffer::from_vec(offsets)) + .add_buffer(Buffer::from_vec(values)) .null_bit_buffer(Some(null_buf.into())); let array_data = unsafe { array_data.build_unchecked() }; Self::from(array_data) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index bc62677c738b..c6ae275855a1 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -971,7 +971,7 @@ macro_rules! 
def_numeric_from_vec { fn from(data: Vec<<$ty as ArrowPrimitiveType>::Native>) -> Self { let array_data = ArrayData::builder($ty::DATA_TYPE) .len(data.len()) - .add_buffer(Buffer::from_slice_ref(&data)); + .add_buffer(Buffer::from_vec(data)); let array_data = unsafe { array_data.build_unchecked() }; PrimitiveArray::from(array_data) } diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 9e9f15daea4b..02ced1a0ba92 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -603,7 +603,7 @@ mod tests { let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) .unwrap(); - assert_eq!(record_batch.get_array_memory_size(), 672); + assert_eq!(record_batch.get_array_memory_size(), 628); } fn check_batch(record_batch: RecordBatch, num_rows: usize) { diff --git a/arrow-string/src/substring.rs b/arrow-string/src/substring.rs index 7ee33f7fc282..997b26361587 100644 --- a/arrow-string/src/substring.rs +++ b/arrow-string/src/substring.rs @@ -296,7 +296,7 @@ fn binary_substring( None, array.nulls().map(|b| b.inner().sliced()), 0, - vec![Buffer::from_slice_ref(&new_offsets), new_values.into()], + vec![Buffer::from_vec(new_offsets), new_values.into()], vec![], ) }; @@ -420,7 +420,7 @@ fn utf8_substring( None, array.nulls().map(|b| b.inner().sliced()), 0, - vec![Buffer::from_slice_ref(&new_offsets), new_values.into()], + vec![Buffer::from_vec(new_offsets), new_values.into()], vec![], ) }; From b05522f54480ad95f323514dabc419a73acc5092 Mon Sep 17 00:00:00 2001 From: WEI Xikai Date: Tue, 28 Mar 2023 19:02:48 +0800 Subject: [PATCH 0741/1411] feat: support async writer (#1269) (#3957) * feat: support async writer * fix: clippy warnings and test failure * fix: broken docs * feat: flush the inner async writer when threshold is exceeded --- parquet/src/arrow/async_writer/mod.rs | 369 ++++++++++++++++++++++++++ parquet/src/arrow/mod.rs | 4 + 2 files changed, 373 insertions(+) create mode 100644 parquet/src/arrow/async_writer/mod.rs diff --git a/parquet/src/arrow/async_writer/mod.rs b/parquet/src/arrow/async_writer/mod.rs new file mode 100644 index 000000000000..dc000f248c9b --- /dev/null +++ b/parquet/src/arrow/async_writer/mod.rs @@ -0,0 +1,369 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains async writer which writes arrow data into parquet data. +//! +//! Provides `async` API for writing [`RecordBatch`]es as parquet files. The API is +//! similar to the [`sync` API](crate::arrow::arrow_writer::ArrowWriter), so please +//! read the documentation there before using this API. +//! +//! Here is an example for using [`AsyncArrowWriter`]: +//! ``` +//! # #[tokio::main(flavor="current_thread")] +//! # async fn main() { +//! # +//! use std::sync::Arc; +//! 
use arrow_array::{ArrayRef, Int64Array, RecordBatch, RecordBatchReader}; +//! use bytes::Bytes; +//! use parquet::arrow::{AsyncArrowWriter, arrow_reader::ParquetRecordBatchReaderBuilder}; +//! +//! let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef; +//! let to_write = RecordBatch::try_from_iter([("col", col)]).unwrap(); +//! +//! let mut buffer = Vec::new(); +//! let mut writer = +//! AsyncArrowWriter::try_new(&mut buffer, to_write.schema(), 0, None).unwrap(); +//! writer.write(&to_write).await.unwrap(); +//! writer.close().await.unwrap(); +//! +//! let buffer = Bytes::from(buffer); +//! let mut reader = ParquetRecordBatchReaderBuilder::try_new(buffer.clone()) +//! .unwrap() +//! .build() +//! .unwrap(); +//! let read = reader.next().unwrap().unwrap(); +//! +//! assert_eq!(to_write, read); +//! # } +//! ``` + +use std::{ + io::Write, + sync::{Arc, Mutex}, +}; + +use crate::{ + arrow::ArrowWriter, + errors::{ParquetError, Result}, + file::properties::WriterProperties, + format::{FileMetaData, KeyValue}, +}; +use arrow_array::RecordBatch; +use arrow_schema::SchemaRef; +use tokio::io::{AsyncWrite, AsyncWriteExt}; + +/// Async arrow writer. +/// +/// It is implemented based on the sync writer [`ArrowWriter`] with an inner buffer. +/// The buffered data will be flushed to the writer provided by caller when the +/// buffer's threshold is exceeded. +pub struct AsyncArrowWriter { + /// Underlying sync writer + sync_writer: ArrowWriter, + + /// Async writer provided by caller + async_writer: W, + + /// The inner buffer shared by the `sync_writer` and the `async_writer` + shared_buffer: SharedBuffer, + + /// The threshold triggering buffer flush + buffer_flush_threshold: usize, +} + +impl AsyncArrowWriter { + /// Try to create a new Async Arrow Writer. + /// + /// `buffer_flush_threshold` will be used to trigger flush of the inner buffer. + pub fn try_new( + writer: W, + arrow_schema: SchemaRef, + buffer_flush_threshold: usize, + props: Option, + ) -> Result { + let shared_buffer = SharedBuffer::default(); + let sync_writer = + ArrowWriter::try_new(shared_buffer.clone(), arrow_schema, props)?; + + Ok(Self { + sync_writer, + async_writer: writer, + shared_buffer, + buffer_flush_threshold, + }) + } + + /// Enqueues the provided `RecordBatch` to be written + /// + /// After every sync write by the inner [ArrowWriter], the inner buffer will be + /// checked and flush if threshold is reached. + pub async fn write(&mut self, batch: &RecordBatch) -> Result<()> { + self.sync_writer.write(batch)?; + Self::try_flush( + &self.shared_buffer, + &mut self.async_writer, + self.buffer_flush_threshold, + ) + .await + } + + /// Append [`KeyValue`] metadata in addition to those in [`WriterProperties`] + /// + /// This method allows to append metadata after [`RecordBatch`]es are written. + pub fn append_key_value_metadata(&mut self, kv_metadata: KeyValue) { + self.sync_writer.append_key_value_metadata(kv_metadata); + } + + /// Close and finalize the writer. + /// + /// All the data in the inner buffer will be force flushed. + pub async fn close(mut self) -> Result { + let metadata = self.sync_writer.close()?; + + // Force to flush the remaining data. + Self::try_flush(&self.shared_buffer, &mut self.async_writer, 0).await?; + + Ok(metadata) + } + + /// Flush the data in the [`SharedBuffer`] into the `async_writer` if its size + /// exceeds the threshold. 
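    // How the threshold plays out, roughly: try_flush returns early while the shared
    // buffer is empty or still smaller than `threshold`, so a threshold of 0 pushes
    // whatever the sync writer has emitted out after every write, while usize::MAX
    // keeps everything buffered until close(), which force-flushes by calling
    // try_flush with a threshold of 0.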
+ async fn try_flush( + shared_buffer: &SharedBuffer, + async_writer: &mut W, + threshold: usize, + ) -> Result<()> { + let mut buffer = { + let mut buffer = shared_buffer.buffer.lock().unwrap(); + + if buffer.is_empty() || buffer.len() < threshold { + // no need to flush + return Ok(()); + } + std::mem::take(&mut *buffer) + }; + + async_writer + .write(&buffer) + .await + .map_err(|e| ParquetError::External(Box::new(e)))?; + async_writer + .flush() + .await + .map_err(|e| ParquetError::External(Box::new(e)))?; + + // reuse the buffer. + buffer.clear(); + *shared_buffer.buffer.lock().unwrap() = buffer; + + Ok(()) + } +} + +/// A buffer with interior mutability shared by the [`ArrowWriter`] and +/// [`AsyncArrowWriter`]. +#[derive(Clone, Default)] +struct SharedBuffer { + /// The inner buffer for reading and writing + /// + /// The lock is used to obtain internal mutability, so no worry about the + /// lock contention. + buffer: Arc>>, +} + +impl Write for SharedBuffer { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + let mut buffer = self.buffer.lock().unwrap(); + Write::write(&mut *buffer, buf) + } + + fn flush(&mut self) -> std::io::Result<()> { + let mut buffer = self.buffer.lock().unwrap(); + Write::flush(&mut *buffer) + } +} + +#[cfg(test)] +mod tests { + use arrow_array::{ArrayRef, Int64Array, RecordBatchReader}; + use bytes::Bytes; + use tokio::pin; + + use crate::arrow::arrow_reader::{ + ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder, + }; + + use super::*; + + fn get_test_reader() -> ParquetRecordBatchReader { + let testdata = arrow::util::test_util::parquet_test_data(); + // This test file is large enough to generate multiple row groups. + let path = format!("{}/alltypes_tiny_pages_plain.parquet", testdata); + let original_data = Bytes::from(std::fs::read(path).unwrap()); + ParquetRecordBatchReaderBuilder::try_new(original_data) + .unwrap() + .build() + .unwrap() + } + + #[tokio::test] + async fn test_async_writer() { + let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef; + let to_write = RecordBatch::try_from_iter([("col", col)]).unwrap(); + + let mut buffer = Vec::new(); + let mut writer = + AsyncArrowWriter::try_new(&mut buffer, to_write.schema(), 0, None).unwrap(); + writer.write(&to_write).await.unwrap(); + writer.close().await.unwrap(); + + let buffer = Bytes::from(buffer); + let mut reader = ParquetRecordBatchReaderBuilder::try_new(buffer) + .unwrap() + .build() + .unwrap(); + let read = reader.next().unwrap().unwrap(); + + assert_eq!(to_write, read); + } + + // Read the data from the test file and write it by the async writer and sync writer. + // And then compares the results of the two writers. 
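    // Byte-for-byte equality is a reasonable expectation here: both writers drive the
    // same underlying ArrowWriter with identical WriterProperties, and the async
    // wrapper only changes how the already-encoded bytes reach the output, not the
    // encoding itself.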
+ #[tokio::test] + async fn test_async_writer_with_sync_writer() { + let reader = get_test_reader(); + + let write_props = WriterProperties::builder() + .set_max_row_group_size(64) + .build(); + + let mut async_buffer = Vec::new(); + let mut async_writer = AsyncArrowWriter::try_new( + &mut async_buffer, + reader.schema(), + 1024, + Some(write_props.clone()), + ) + .unwrap(); + + let mut sync_buffer = Vec::new(); + let mut sync_writer = + ArrowWriter::try_new(&mut sync_buffer, reader.schema(), Some(write_props)) + .unwrap(); + for record_batch in reader { + let record_batch = record_batch.unwrap(); + async_writer.write(&record_batch).await.unwrap(); + sync_writer.write(&record_batch).unwrap(); + } + sync_writer.close().unwrap(); + async_writer.close().await.unwrap(); + + assert_eq!(sync_buffer, async_buffer); + } + + struct TestAsyncSink { + sink: Vec, + min_accept_bytes: usize, + expect_total_bytes: usize, + } + + impl AsyncWrite for TestAsyncSink { + fn poll_write( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> std::task::Poll> { + let written_bytes = self.sink.len(); + if written_bytes + buf.len() < self.expect_total_bytes { + assert!(buf.len() >= self.min_accept_bytes); + } else { + assert_eq!(written_bytes + buf.len(), self.expect_total_bytes); + } + + let sink = &mut self.get_mut().sink; + pin!(sink); + sink.poll_write(cx, buf) + } + + fn poll_flush( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + let sink = &mut self.get_mut().sink; + pin!(sink); + sink.poll_flush(cx) + } + + fn poll_shutdown( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + let sink = &mut self.get_mut().sink; + pin!(sink); + sink.poll_shutdown(cx) + } + } + + #[tokio::test] + async fn test_async_writer_with_buffer_flush_threshold() { + let write_props = WriterProperties::builder() + .set_max_row_group_size(2048) + .build(); + let expect_encode_size = { + let reader = get_test_reader(); + let mut buffer = Vec::new(); + let mut async_writer = AsyncArrowWriter::try_new( + &mut buffer, + reader.schema(), + 0, + Some(write_props.clone()), + ) + .unwrap(); + for record_batch in reader { + let record_batch = record_batch.unwrap(); + async_writer.write(&record_batch).await.unwrap(); + } + async_writer.close().await.unwrap(); + buffer.len() + }; + + let test_buffer_flush_thresholds = + vec![0, 1024, 40 * 1024, 50 * 1024, 100 * 1024, usize::MAX]; + + for buffer_flush_threshold in test_buffer_flush_thresholds { + let reader = get_test_reader(); + let mut test_async_sink = TestAsyncSink { + sink: Vec::new(), + min_accept_bytes: buffer_flush_threshold, + expect_total_bytes: expect_encode_size, + }; + let mut async_writer = AsyncArrowWriter::try_new( + &mut test_async_sink, + reader.schema(), + buffer_flush_threshold, + Some(write_props.clone()), + ) + .unwrap(); + + for record_batch in reader { + let record_batch = record_batch.unwrap(); + async_writer.write(&record_batch).await.unwrap(); + } + async_writer.close().await.unwrap(); + } + } +} diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 97d0c25e2b4f..73a4f2350047 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -112,6 +112,8 @@ mod decoder; #[cfg(feature = "async")] pub mod async_reader; +#[cfg(feature = "async")] +pub mod async_writer; mod record_reader; experimental!(mod schema); @@ -121,6 +123,8 @@ pub use self::arrow_reader::{ArrowReader, ParquetFileArrowReader}; pub use 
self::arrow_writer::ArrowWriter; #[cfg(feature = "async")] pub use self::async_reader::ParquetRecordBatchStreamBuilder; +#[cfg(feature = "async")] +pub use self::async_writer::AsyncArrowWriter; use crate::schema::types::SchemaDescriptor; pub use self::schema::{ From a667af8bfa57d367e7a1ec0a990d2d85bc8cc4d2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 28 Mar 2023 13:05:08 +0100 Subject: [PATCH 0742/1411] Add Strongly Typed Array Slice (#3929) (#3930) * Add PrimitiveArray slice (#3880) * Add ByteArray::slice (#3929) * Add strongly typed Array::slice (#3929) --- arrow-arith/src/aggregate.rs | 21 ++++++++----------- arrow-arith/src/arithmetic.rs | 13 ++++++------ arrow-arith/src/arity.rs | 6 ++---- arrow-arith/src/boolean.rs | 8 +++---- arrow-array/src/array/byte_array.rs | 9 ++++++-- arrow-array/src/array/dictionary_array.rs | 11 +++++++--- .../src/array/fixed_size_binary_array.rs | 9 ++++++-- .../src/array/fixed_size_list_array.rs | 9 ++++++-- arrow-array/src/array/list_array.rs | 9 ++++++-- arrow-array/src/array/map_array.rs | 9 ++++++-- arrow-array/src/array/mod.rs | 2 +- arrow-array/src/array/null_array.rs | 9 ++++++-- arrow-array/src/array/primitive_array.rs | 9 ++++++-- arrow-array/src/array/run_array.rs | 11 +++++++--- arrow-array/src/array/struct_array.rs | 9 ++++++-- arrow-array/src/array/union_array.rs | 8 ++++++- arrow-ipc/src/writer.rs | 8 ++----- arrow-ord/src/comparison.rs | 10 ++++----- arrow-select/src/concat.rs | 15 +++++-------- arrow-select/src/filter.rs | 12 +++++------ arrow-select/src/nullif.rs | 5 ++--- arrow-select/src/take.rs | 8 +------ arrow-string/src/length.rs | 8 +++---- arrow/tests/array_validation.rs | 4 +--- parquet/src/arrow/array_reader/list_array.rs | 4 ++-- 25 files changed, 127 insertions(+), 99 deletions(-) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 9e9d9333fdcb..a9944db13ee1 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -1219,13 +1219,12 @@ mod tests { .into_iter() .collect(); let sliced_input = sliced_input.slice(4, 2); - let sliced_input = sliced_input.as_primitive::(); - assert_eq!(sliced_input, &input); + assert_eq!(&sliced_input, &input); - let actual = min(sliced_input); + let actual = min(&sliced_input); assert_eq!(actual, expected); - let actual = max(sliced_input); + let actual = max(&sliced_input); assert_eq!(actual, expected); } @@ -1265,13 +1264,12 @@ mod tests { .into_iter() .collect(); let sliced_input = sliced_input.slice(4, 2); - let sliced_input = sliced_input.as_string::(); - assert_eq!(sliced_input, &input); + assert_eq!(&sliced_input, &input); - let actual = min_string(sliced_input); + let actual = min_string(&sliced_input); assert_eq!(actual, expected); - let actual = max_string(sliced_input); + let actual = max_string(&sliced_input); assert_eq!(actual, expected); } @@ -1288,13 +1286,12 @@ mod tests { .into_iter() .collect(); let sliced_input = sliced_input.slice(4, 2); - let sliced_input = sliced_input.as_binary::(); - assert_eq!(sliced_input, &input); + assert_eq!(&sliced_input, &input); - let actual = min_binary(sliced_input); + let actual = min_binary(&sliced_input); assert_eq!(actual, expected); - let actual = max_binary(sliced_input); + let actual = max_binary(&sliced_input); assert_eq!(actual, expected); } diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index f44776ad2b90..501878afd1ff 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ 
-2095,7 +2095,7 @@ mod tests { fn test_primitive_array_add_scalar_sliced() { let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); let a = a.slice(1, 4); - let actual = add_scalar(a.as_primitive(), 3).unwrap(); + let actual = add_scalar(&a, 3).unwrap(); let expected = Int32Array::from(vec![None, Some(12), Some(11), None]); assert_eq!(actual, expected); } @@ -2125,7 +2125,7 @@ mod tests { fn test_primitive_array_subtract_scalar_sliced() { let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); let a = a.slice(1, 4); - let actual = subtract_scalar(a.as_primitive(), 3).unwrap(); + let actual = subtract_scalar(&a, 3).unwrap(); let expected = Int32Array::from(vec![None, Some(6), Some(5), None]); assert_eq!(actual, expected); } @@ -2155,7 +2155,7 @@ mod tests { fn test_primitive_array_multiply_scalar_sliced() { let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); let a = a.slice(1, 4); - let actual = multiply_scalar(a.as_primitive(), 3).unwrap(); + let actual = multiply_scalar(&a, 3).unwrap(); let expected = Int32Array::from(vec![None, Some(27), Some(24), None]); assert_eq!(actual, expected); } @@ -2275,7 +2275,7 @@ mod tests { fn test_primitive_array_divide_scalar_sliced() { let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); let a = a.slice(1, 4); - let actual = divide_scalar(a.as_primitive(), 3).unwrap(); + let actual = divide_scalar(&a, 3).unwrap(); let expected = Int32Array::from(vec![None, Some(3), Some(2), None]); assert_eq!(actual, expected); } @@ -2298,12 +2298,11 @@ mod tests { fn test_int_array_modulus_scalar_sliced() { let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); let a = a.slice(1, 4); - let a = a.as_primitive(); - let actual = modulus_scalar(a, 3).unwrap(); + let actual = modulus_scalar(&a, 3).unwrap(); let expected = Int32Array::from(vec![None, Some(0), Some(2), None]); assert_eq!(actual, expected); - let actual = modulus_scalar_dyn::(a, 3).unwrap(); + let actual = modulus_scalar_dyn::(&a, 3).unwrap(); let actual = actual.as_primitive::(); let expected = Int32Array::from(vec![None, Some(0), Some(2), None]); assert_eq!(actual, &expected); diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 501a240f37d5..d69bbde8d056 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -499,7 +499,6 @@ where mod tests { use super::*; use arrow_array::builder::*; - use arrow_array::cast::*; use arrow_array::types::*; #[test] @@ -507,14 +506,13 @@ mod tests { let input = Float64Array::from(vec![Some(5.1f64), None, Some(6.8), None, Some(7.2)]); let input_slice = input.slice(1, 4); - let input_slice: &Float64Array = input_slice.as_primitive(); - let result = unary(input_slice, |n| n.round()); + let result = unary(&input_slice, |n| n.round()); assert_eq!( result, Float64Array::from(vec![None, Some(7.0), None, Some(7.0)]) ); - let result = unary_dyn::<_, Float64Type>(input_slice, |n| n + 1.0).unwrap(); + let result = unary_dyn::<_, Float64Type>(&input_slice, |n| n + 1.0).unwrap(); assert_eq!( result.as_any().downcast_ref::().unwrap(), diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index eaef1378258b..258d683ad71a 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -775,7 +775,7 @@ mod tests { let a = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3, 2, 1]); let a = a.slice(8, 4); - let res = is_null(a.as_ref()).unwrap(); + let res = is_null(&a).unwrap(); let expected = BooleanArray::from(vec![false, false, false, 
false]); @@ -800,7 +800,7 @@ mod tests { let a = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3, 2, 1]); let a = a.slice(8, 4); - let res = is_not_null(a.as_ref()).unwrap(); + let res = is_not_null(&a).unwrap(); let expected = BooleanArray::from(vec![true, true, true, true]); @@ -843,7 +843,7 @@ mod tests { ]); let a = a.slice(8, 4); - let res = is_null(a.as_ref()).unwrap(); + let res = is_null(&a).unwrap(); let expected = BooleanArray::from(vec![false, true, false, true]); @@ -886,7 +886,7 @@ mod tests { ]); let a = a.slice(8, 4); - let res = is_not_null(a.as_ref()).unwrap(); + let res = is_not_null(&a).unwrap(); let expected = BooleanArray::from(vec![true, false, true, false]); diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 991e02501505..34e7d79ab3e0 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -134,6 +134,12 @@ impl GenericByteArray { ArrayIter::new(self) } + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + // TODO: Slice buffers directly (#3880) + self.data.slice(offset, length).into() + } + /// Returns `GenericByteBuilder` of this byte array for mutating its values if the underlying /// offset and data buffers are not shared by others. pub fn into_builder(self) -> Result, Self> { @@ -247,8 +253,7 @@ impl Array for GenericByteArray { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - // TODO: Slice buffers directly (#3880) - Arc::new(Self::from(self.data.slice(offset, length))) + Arc::new(self.slice(offset, length)) } fn nulls(&self) -> Option<&NullBuffer> { diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 49a184369801..343fed76846a 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -323,6 +323,12 @@ impl DictionaryArray { self.keys.is_valid(i).then(|| self.keys.value(i).as_usize()) } + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + // TODO: Slice buffers directly (#3880) + self.data.slice(offset, length).into() + } + /// Downcast this dictionary to a [`TypedDictionaryArray`] /// /// ``` @@ -601,8 +607,7 @@ impl Array for DictionaryArray { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - // TODO: Slice buffers directly (#3880) - Arc::new(Self::from(self.data.slice(offset, length))) + Arc::new(self.slice(offset, length)) } fn nulls(&self) -> Option<&NullBuffer> { @@ -693,7 +698,7 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a, } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - self.dictionary.slice(offset, length) + Arc::new(self.dictionary.slice(offset, length)) } fn nulls(&self) -> Option<&NullBuffer> { diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index af51ff787722..bb76fd63d649 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -110,6 +110,12 @@ impl FixedSizeBinaryArray { self.data.buffers()[0].clone() } + /// Returns a zero-copy slice of this array with the indicated offset and length. 
+ pub fn slice(&self, offset: usize, length: usize) -> Self { + // TODO: Slice buffers directly (#3880) + self.data.slice(offset, length).into() + } + /// Create an array from an iterable argument of sparse byte slices. /// Sparsity means that items returned by the iterator are optional, i.e input argument can /// contain `None` items. @@ -473,8 +479,7 @@ impl Array for FixedSizeBinaryArray { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - // TODO: Slice buffers directly (#3880) - Arc::new(Self::from(self.data.slice(offset, length))) + Arc::new(self.slice(offset, length)) } fn nulls(&self) -> Option<&NullBuffer> { diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 0910e2944f76..1a421fe53c25 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -106,6 +106,12 @@ impl FixedSizeListArray { i as i32 * self.length } + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + // TODO: Slice buffers directly (#3880) + self.data.slice(offset, length).into() + } + /// Creates a [`FixedSizeListArray`] from an iterator of primitive values /// # Example /// ``` @@ -216,8 +222,7 @@ impl Array for FixedSizeListArray { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - // TODO: Slice buffers directly (#3880) - Arc::new(Self::from(self.data.slice(offset, length))) + Arc::new(self.slice(offset, length)) } fn nulls(&self) -> Option<&NullBuffer> { diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 895f150079e5..af5ce59fe4d8 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -132,6 +132,12 @@ impl GenericListArray { } } + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + // TODO: Slice buffers directly (#3880) + self.data.slice(offset, length).into() + } + /// Creates a [`GenericListArray`] from an iterator of primitive values /// # Example /// ``` @@ -253,8 +259,7 @@ impl Array for GenericListArray { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - // TODO: Slice buffers directly (#3880) - Arc::new(Self::from(self.data.slice(offset, length))) + Arc::new(self.slice(offset, length)) } fn nulls(&self) -> Option<&NullBuffer> { diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index fbe32d4b2092..439aaf7064de 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -95,6 +95,12 @@ impl MapArray { let offsets = self.value_offsets(); offsets[i + 1] - offsets[i] } + + /// Returns a zero-copy slice of this array with the indicated offset and length. 
+ pub fn slice(&self, offset: usize, length: usize) -> Self { + // TODO: Slice buffers directly (#3880) + self.data.slice(offset, length).into() + } } impl From for MapArray { @@ -222,8 +228,7 @@ impl Array for MapArray { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - // TODO: Slice buffers directly (#3880) - Arc::new(Self::from(self.data.slice(offset, length))) + Arc::new(self.slice(offset, length)) } fn nulls(&self) -> Option<&NullBuffer> { diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 9afefc07f8d4..9b3855eabafa 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -141,7 +141,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// // Make slice over the values [2, 3, 4] /// let array_slice = array.slice(1, 3); /// - /// assert_eq!(array_slice.as_ref(), &Int32Array::from(vec![2, 3, 4])); + /// assert_eq!(&array_slice, &Int32Array::from(vec![2, 3, 4])); /// ``` fn slice(&self, offset: usize, length: usize) -> ArrayRef; diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index 3d65e9e9ebad..b5d9247a6d7f 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -54,6 +54,12 @@ impl NullArray { let array_data = unsafe { array_data.build_unchecked() }; NullArray::from(array_data) } + + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + // TODO: Slice buffers directly (#3880) + self.data.slice(offset, length).into() + } } impl Array for NullArray { @@ -74,8 +80,7 @@ impl Array for NullArray { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - // TODO: Slice buffers directly (#3880) - Arc::new(Self::from(self.data.slice(offset, length))) + Arc::new(self.slice(offset, length)) } fn nulls(&self) -> Option<&NullBuffer> { diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index c6ae275855a1..b463e016c852 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -408,6 +408,12 @@ impl PrimitiveArray { indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index))) } + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + // TODO: Slice buffers directly (#3880) + self.data.slice(offset, length).into() + } + /// Reinterprets this array's contents as a different data type without copying /// /// This can be used to efficiently convert between primitive arrays with the @@ -706,8 +712,7 @@ impl Array for PrimitiveArray { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - // TODO: Slice buffers directly (#3880) - Arc::new(Self::from(self.data.slice(offset, length))) + Arc::new(self.slice(offset, length)) } fn nulls(&self) -> Option<&NullBuffer> { diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 652ec0be6e6f..3cd5848f1f49 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -253,6 +253,12 @@ impl RunArray { } Ok(physical_indices) } + + /// Returns a zero-copy slice of this array with the indicated offset and length. 
+ pub fn slice(&self, offset: usize, length: usize) -> Self { + // TODO: Slice buffers directly (#3880) + self.data.slice(offset, length).into() + } } impl From for RunArray { @@ -307,8 +313,7 @@ impl Array for RunArray { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - // TODO: Slice buffers directly (#3880) - Arc::new(Self::from(self.data.slice(offset, length))) + Arc::new(self.slice(offset, length)) } fn nulls(&self) -> Option<&NullBuffer> { @@ -505,7 +510,7 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - self.run_array.slice(offset, length) + Arc::new(self.run_array.slice(offset, length)) } fn nulls(&self) -> Option<&NullBuffer> { diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 4fe59c0c240f..4c9613afbf88 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -100,6 +100,12 @@ impl StructArray { .position(|c| c == &column_name) .map(|pos| self.column(pos)) } + + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + // TODO: Slice buffers directly (#3880) + self.data.slice(offset, length).into() + } } impl From for StructArray { @@ -205,8 +211,7 @@ impl Array for StructArray { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - // TODO: Slice buffers directly (#3880) - Arc::new(Self::from(self.data.slice(offset, length))) + Arc::new(self.slice(offset, length)) } fn nulls(&self) -> Option<&NullBuffer> { diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index fe227226f77d..6c372d8d05b9 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -287,6 +287,12 @@ impl UnionArray { _ => unreachable!("Union array's data type is not a union!"), } } + + /// Returns a zero-copy slice of this array with the indicated offset and length. 
+ pub fn slice(&self, offset: usize, length: usize) -> Self { + // TODO: Slice buffers directly (#3880) + self.data.slice(offset, length).into() + } } impl From for UnionArray { @@ -328,7 +334,7 @@ impl Array for UnionArray { } fn slice(&self, offset: usize, length: usize) -> ArrayRef { - Arc::new(Self::from(self.data.slice(offset, length))) + Arc::new(self.slice(offset, length)) } fn nulls(&self) -> Option<&NullBuffer> { diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 2d859f608387..07d4b0fe9f93 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -2024,15 +2024,11 @@ mod tests { ); let sliced = array.slice(1, 2); - let read_sliced: &UInt32Array = sliced.as_primitive(); - assert_eq!( - vec![Some(2), Some(3)], - read_sliced.iter().collect::>() - ); + assert_eq!(vec![Some(2), Some(3)], sliced.iter().collect::>()); let batch = RecordBatch::try_new( Arc::new(Schema::new(vec![Field::new("a", DataType::UInt32, true)])), - vec![sliced], + vec![Arc::new(sliced)], ) .expect("new batch"); diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index aa2f1416d83d..d984bee0fdb9 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -2858,7 +2858,7 @@ mod tests { // slice and test if the dynamic array works let a = a.slice(0, a.len()); let b = b.slice(0, b.len()); - let c = $DYN_KERNEL(a.as_ref(), b.as_ref()).unwrap(); + let c = $DYN_KERNEL(&a, &b).unwrap(); assert_eq!(BooleanArray::from($EXPECTED), c); // test with a larger version of the same data to ensure we cover the chunked part of the comparison @@ -2995,8 +2995,7 @@ mod tests { fn test_primitive_array_eq_scalar_with_slice() { let a = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); let a = a.slice(1, 3); - let a: &Int32Array = a.as_primitive(); - let a_eq = eq_scalar(a, 2).unwrap(); + let a_eq = eq_scalar(&a, 2).unwrap(); assert_eq!( a_eq, BooleanArray::from(vec![None, Some(true), Some(false)]) @@ -3797,14 +3796,13 @@ mod tests { vec![Some("hi"), None, Some("hello"), Some("world"), Some("")], ); let a = a.slice(1, 4); - let a = a.as_string::(); - let a_eq = eq_utf8_scalar(a, "hello").unwrap(); + let a_eq = eq_utf8_scalar(&a, "hello").unwrap(); assert_eq!( a_eq, BooleanArray::from(vec![None, Some(true), Some(false), Some(false)]) ); - let a_eq2 = eq_utf8_scalar(a, "").unwrap(); + let a_eq2 = eq_utf8_scalar(&a, "").unwrap(); assert_eq!( a_eq2, diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 7d42584514f1..e34cc9edb884 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -243,7 +243,7 @@ mod tests { None, ]) .slice(1, 3); - let arr = concat(&[input_1.as_ref(), input_2.as_ref()]).unwrap(); + let arr = concat(&[&input_1, &input_2]).unwrap(); let expected_output = Arc::new(PrimitiveArray::::from(vec![ Some(-1), @@ -399,11 +399,8 @@ mod tests { ])); let input_struct_2 = StructArray::from(vec![(field, input_primitive_2)]); - let arr = concat(&[ - input_struct_1.slice(1, 3).as_ref(), - input_struct_2.slice(1, 2).as_ref(), - ]) - .unwrap(); + let arr = + concat(&[&input_struct_1.slice(1, 3), &input_struct_2.slice(1, 2)]).unwrap(); let expected_primitive_output = Arc::new(PrimitiveArray::::from(vec![ Some(-1), @@ -426,8 +423,7 @@ mod tests { let input_1 = StringArray::from(vec!["hello", "A", "B", "C"]); let input_2 = StringArray::from(vec!["world", "D", "E", "Z"]); - let arr = concat(&[input_1.slice(1, 3).as_ref(), input_2.slice(1, 2).as_ref()]) - .unwrap(); + let arr = concat(&[&input_1.slice(1, 3), &input_2.slice(1, 
2)]).unwrap(); let expected_output = StringArray::from(vec!["A", "B", "C", "D", "E"]); @@ -440,8 +436,7 @@ mod tests { let input_1 = StringArray::from(vec![Some("hello"), None, Some("A"), Some("C")]); let input_2 = StringArray::from(vec![None, Some("world"), Some("D"), None]); - let arr = concat(&[input_1.slice(1, 3).as_ref(), input_2.slice(1, 2).as_ref()]) - .unwrap(); + let arr = concat(&[&input_1.slice(1, 3), &input_2.slice(1, 2)]).unwrap(); let expected_output = StringArray::from(vec![None, Some("A"), Some("C"), Some("world"), Some("D")]); diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 567aaa58e8bf..f71a3cbc2ab0 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -764,13 +764,12 @@ mod tests { #[test] fn test_filter_array_slice() { - let a_slice = Int32Array::from(vec![5, 6, 7, 8, 9]).slice(1, 4); - let a = a_slice.as_ref(); + let a = Int32Array::from(vec![5, 6, 7, 8, 9]).slice(1, 4); let b = BooleanArray::from(vec![true, false, false, true]); // filtering with sliced filter array is not currently supported // let b_slice = BooleanArray::from(vec![true, false, false, true, false]).slice(1, 4); // let b = b_slice.as_any().downcast_ref().unwrap(); - let c = filter(a, &b).unwrap(); + let c = filter(&a, &b).unwrap(); let d = c.as_ref().as_any().downcast_ref::().unwrap(); assert_eq!(2, d.len()); assert_eq!(6, d.value(0)); @@ -868,14 +867,13 @@ mod tests { #[test] fn test_filter_array_slice_with_null() { - let a_slice = + let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8), Some(9)]).slice(1, 4); - let a = a_slice.as_ref(); let b = BooleanArray::from(vec![true, false, false, true]); // filtering with sliced filter array is not currently supported // let b_slice = BooleanArray::from(vec![true, false, false, true, false]).slice(1, 4); // let b = b_slice.as_any().downcast_ref().unwrap(); - let c = filter(a, &b).unwrap(); + let c = filter(&a, &b).unwrap(); let d = c.as_ref().as_any().downcast_ref::().unwrap(); assert_eq!(2, d.len()); assert!(d.is_null(0)); @@ -996,7 +994,7 @@ mod tests { let mask1 = BooleanArray::from(vec![Some(true), Some(true), None]); let out = filter(&a, &mask1).unwrap(); - assert_eq!(&out, &a.slice(0, 2)); + assert_eq!(out.as_ref(), &a.slice(0, 2)); } #[test] diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index a1b9c0e3e183..0fbbb3868691 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -210,7 +210,7 @@ mod tests { let s = s.slice(2, 3); let select = select.slice(1, 3); let select = select.as_boolean(); - let a = nullif(s.as_ref(), select).unwrap(); + let a = nullif(&s, select).unwrap(); let r: Vec<_> = a.as_string::().iter().collect(); assert_eq!(r, vec![None, Some("a"), None]); } @@ -500,7 +500,6 @@ mod tests { for (a_offset, a_length) in a_slices { let a = a.slice(a_offset, a_length); - let a = a.as_primitive::(); for i in 1..65 { let b_start_offset = rng.gen_range(0..i); @@ -512,7 +511,7 @@ mod tests { let b = b.slice(b_start_offset, a_length); let b = b.as_boolean(); - test_nullif(a, b); + test_nullif(&a, b); } } } diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index f59e64015ceb..7a497da55492 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -1567,14 +1567,8 @@ mod tests { StringArray::from(vec![Some("hello"), None, Some("world"), None, Some("hi")]); let indices = Int32Array::from(vec![Some(0), Some(1), None, Some(0), Some(2)]); let indices_slice = indices.slice(1, 4); - let indices_slice = indices_slice - .as_ref() - 
.as_any() - .downcast_ref::() - .unwrap(); - let expected = StringArray::from(vec![None, None, Some("hello"), Some("world")]); - let result = take(&strings, indices_slice, None).unwrap(); + let result = take(&strings, &indices_slice, None).unwrap(); assert_eq!(result.as_ref(), &expected); } diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index f0c09a7ec4d8..c206fffb9166 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -426,7 +426,7 @@ mod tests { fn length_offsets_string() { let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]); let b = a.slice(1, 3); - let result = length(b.as_ref()).unwrap(); + let result = length(&b).unwrap(); let result: &Int32Array = result.as_primitive(); let expected = Int32Array::from(vec![Some(1), Some(5), None]); @@ -439,7 +439,7 @@ mod tests { vec![Some(b"hello"), Some(b" "), Some(&[0xff, 0xf8]), None]; let a = BinaryArray::from(value); let b = a.slice(1, 3); - let result = length(b.as_ref()).unwrap(); + let result = length(&b).unwrap(); let result: &Int32Array = result.as_primitive(); let expected = Int32Array::from(vec![Some(1), Some(2), None]); @@ -581,7 +581,7 @@ mod tests { fn bit_length_offsets_string() { let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]); let b = a.slice(1, 3); - let result = bit_length(b.as_ref()).unwrap(); + let result = bit_length(&b).unwrap(); let result: &Int32Array = result.as_primitive(); let expected = Int32Array::from(vec![Some(8), Some(40), None]); @@ -594,7 +594,7 @@ mod tests { vec![Some(b"hello"), Some(&[]), Some(b"world"), None]; let a = BinaryArray::from(value); let b = a.slice(1, 3); - let result = bit_length(b.as_ref()).unwrap(); + let result = bit_length(&b).unwrap(); let result: &Int32Array = result.as_primitive(); let expected = Int32Array::from(vec![Some(0), Some(40), None]); diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 7e45ee7afcda..6c4249e38c58 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -941,9 +941,7 @@ fn test_try_new_sliced_struct() { let struct_array = builder.finish(); let struct_array_slice = struct_array.slice(1, 3); - let struct_array_data = struct_array_slice.data(); - - let cloned = make_array(struct_array_data.clone()); + let cloned = struct_array_slice.clone(); assert_eq!(&struct_array_slice, &cloned); } diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index 965142f3840b..dbbac657ebd1 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -401,10 +401,10 @@ mod tests { let expected_2 = expected.slice(2, 2); let actual = l1.next_batch(2).unwrap(); - assert_eq!(expected_1.as_ref(), actual.as_ref()); + assert_eq!(actual.as_ref(), &expected_1); let actual = l1.next_batch(1024).unwrap(); - assert_eq!(expected_2.as_ref(), actual.as_ref()); + assert_eq!(actual.as_ref(), &expected_2); } fn test_required_list() { From 151ce6f9fc9661648fb52f474cf8dd09511c6ee5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 28 Mar 2023 13:58:54 +0100 Subject: [PATCH 0743/1411] Deprecate Array::data_ref (#3880) (#3923) --- arrow-array/src/array/binary_array.rs | 7 +- .../src/array/fixed_size_binary_array.rs | 8 +- arrow-array/src/array/mod.rs | 27 +++--- arrow-cast/src/cast.rs | 24 ++---- arrow-ord/src/comparison.rs | 83 +++---------------- arrow-select/src/take.rs | 26 +++--- 
arrow-select/src/window.rs | 2 +- arrow/tests/array_validation.rs | 29 +------ 8 files changed, 62 insertions(+), 144 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 5a07f9a0ab5c..530f3835ce10 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -47,13 +47,14 @@ impl GenericBinaryArray { } fn from_list(v: GenericListArray) -> Self { + let v = v.into_data(); assert_eq!( - v.data_ref().child_data().len(), + v.child_data().len(), 1, "BinaryArray can only be created from list array of u8 values \ (i.e. List>)." ); - let child_data = &v.data_ref().child_data()[0]; + let child_data = &v.child_data()[0]; assert_eq!( child_data.child_data().len(), @@ -75,7 +76,7 @@ impl GenericBinaryArray { let builder = ArrayData::builder(Self::DATA_TYPE) .len(v.len()) .offset(v.offset()) - .add_buffer(v.data_ref().buffers()[0].clone()) + .add_buffer(v.buffers()[0].clone()) .add_buffer(child_data.buffers()[0].slice(child_data.offset())) .nulls(v.nulls().cloned()); diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index bb76fd63d649..75f6bf91442d 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -402,13 +402,15 @@ impl From for ArrayData { /// Creates a `FixedSizeBinaryArray` from `FixedSizeList` array impl From for FixedSizeBinaryArray { fn from(v: FixedSizeListArray) -> Self { + let value_len = v.value_length(); + let v = v.into_data(); assert_eq!( - v.data_ref().child_data().len(), + v.child_data().len(), 1, "FixedSizeBinaryArray can only be created from list array of u8 values \ (i.e. FixedSizeList>)." ); - let child_data = &v.data_ref().child_data()[0]; + let child_data = &v.child_data()[0]; assert_eq!( child_data.child_data().len(), @@ -427,7 +429,7 @@ impl From for FixedSizeBinaryArray { "The child array cannot contain null values." ); - let builder = ArrayData::builder(DataType::FixedSizeBinary(v.value_length())) + let builder = ArrayData::builder(DataType::FixedSizeBinary(value_len)) .len(v.len()) .offset(v.offset()) .add_buffer(child_data.buffers()[0].slice(child_data.offset())) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 9b3855eabafa..1e4019b4f61d 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -110,6 +110,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Returns a reference-counted pointer to the underlying data of this array. 
/// /// This will be deprecated in a future release [(#3880)](https://github.com/apache/arrow-rs/issues/3880) + #[deprecated(note = "Use Array::to_data or Array::into_data")] fn data_ref(&self) -> &ArrayData { self.data() } @@ -126,6 +127,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(*array.data_type(), DataType::Int32); /// ``` + #[allow(deprecated)] // (#3880) fn data_type(&self) -> &DataType { self.data_ref().data_type() } @@ -156,6 +158,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(array.len(), 5); /// ``` + #[allow(deprecated)] // (#3880) fn len(&self) -> usize { self.data_ref().len() } @@ -171,6 +174,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(array.is_empty(), false); /// ``` + #[allow(deprecated)] // (#3880) fn is_empty(&self) -> bool { self.data_ref().is_empty() } @@ -191,6 +195,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// assert_eq!(array.offset(), 0); /// assert_eq!(array_slice.offset(), 1); /// ``` + #[allow(deprecated)] // (#3880) fn offset(&self) -> usize { self.data_ref().offset() } @@ -250,6 +255,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the total number of bytes of memory pointed to by this array. /// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map. + #[allow(deprecated)] // (#3880) fn get_buffer_memory_size(&self) -> usize { self.data_ref().get_buffer_memory_size() } @@ -257,6 +263,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the total number of bytes of memory occupied physically by this array. /// This value will always be greater than returned by `get_buffer_memory_size()` and /// includes the overhead of the data structures that contain the pointers to the various buffers. 
+ #[allow(deprecated)] // (#3880) fn get_array_memory_size(&self) -> usize { // both data.get_array_memory_size and size_of_val(self) include ArrayData fields, // to only count additional fields of this array substract size_of(ArrayData) @@ -286,6 +293,7 @@ impl Array for ArrayRef { self.data().clone() } + #[allow(deprecated)] fn data_ref(&self) -> &ArrayData { self.as_ref().data_ref() } @@ -352,6 +360,7 @@ impl<'a, T: Array> Array for &'a T { self.data().clone() } + #[allow(deprecated)] fn data_ref(&self) -> &ArrayData { T::data_ref(self) } @@ -997,19 +1006,17 @@ mod tests { (0..256).map(|i| (i % values.len()) as i16), ); - let dict_data = ArrayData::builder(DataType::Dictionary( + let dict_data_type = DataType::Dictionary( Box::new(keys.data_type().clone()), Box::new(values.data_type().clone()), - )) - .len(keys.len()) - .buffers(keys.data_ref().buffers().to_vec()) - .child_data(vec![ArrayData::builder(DataType::Int64) - .len(values.len()) - .buffers(values.data_ref().buffers().to_vec()) + ); + let dict_data = keys + .into_data() + .into_builder() + .data_type(dict_data_type) + .child_data(vec![values.into_data()]) .build() - .unwrap()]) - .build() - .unwrap(); + .unwrap(); let empty_data = ArrayData::new_empty(&DataType::Dictionary( Box::new(DataType::Int16), diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index ba909649da3a..806ff8771573 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -3448,9 +3448,9 @@ where OffsetSizeFrom: OffsetSizeTrait + ToPrimitive, OffsetSizeTo: OffsetSizeTrait + NumCast, { - let data = array.data_ref(); + let list = array.as_list::(); // the value data stored by the list - let value_data = data.child_data()[0].clone(); + let values = list.values(); let out_dtype = match array.data_type() { DataType::List(value_type) => { @@ -3473,7 +3473,7 @@ where std::mem::size_of::(), std::mem::size_of::() ); - if value_data.len() > i32::MAX as usize { + if values.len() > i32::MAX as usize { return Err(ArrowError::ComputeError( "LargeList too large to cast to List".into(), )); @@ -3484,14 +3484,7 @@ where _ => unreachable!(), }; - // Safety: - // The first buffer is the offsets and they are aligned to OffSetSizeFrom: (i64 or i32) - // Justification: - // The safe variant data.buffer:: take the offset into account and we - // cannot create a list array with offsets starting at non zero. 
- let offsets = unsafe { data.buffers()[0].as_slice().align_to::() }.1; - - let iter = offsets.iter().map(|idx| { + let iter = list.value_offsets().iter().map(|idx| { let idx: OffsetSizeTo = NumCast::from(*idx).unwrap(); idx }); @@ -3502,14 +3495,13 @@ where // wrap up let builder = ArrayData::builder(out_dtype) - .offset(array.offset()) - .len(array.len()) + .len(list.len()) .add_buffer(offset_buffer) - .add_child_data(value_data) - .nulls(data.nulls().cloned()); + .add_child_data(values.to_data()) + .nulls(list.nulls().cloned()); let array_data = unsafe { builder.build_unchecked() }; - Ok(make_array(array_data)) + Ok(Arc::new(GenericListArray::::from(array_data))) } #[cfg(test)] diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index d984bee0fdb9..683fd068af40 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -26,8 +26,7 @@ use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::{bit_util, Buffer, MutableBuffer}; -use arrow_data::bit_mask::combine_option_bitmap; +use arrow_buffer::{bit_util, BooleanBuffer, Buffer, MutableBuffer, NullBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; use arrow_select::take::take; @@ -1220,8 +1219,7 @@ where )); } - let null_bit_buffer = - combine_option_bitmap(&[left.data_ref(), right.data_ref()], len); + let nulls = NullBuffer::union(left.nulls(), right.nulls()); // we process the data in chunks so that each iteration results in one u64 of comparison result bits const CHUNK_SIZE: usize = 64; @@ -1282,18 +1280,8 @@ where result_remainder.copy_from_slice(remainder_mask_as_bytes); } - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - len, - None, - null_bit_buffer, - 0, - vec![result.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + let values = BooleanBuffer::new(result.into(), 0, len); + Ok(BooleanArray::new(values, nulls)) } /// Helper function to perform boolean lambda function on values from an array and a scalar value using @@ -2724,19 +2712,13 @@ where let num_bytes = bit_util::ceil(left_len, 8); - let not_both_null_bit_buffer = - match combine_option_bitmap(&[left.data_ref(), right.data_ref()], left_len) { - Some(buff) => buff, - None => new_all_set_buffer(num_bytes), - }; - let not_both_null_bitmap = not_both_null_bit_buffer.as_slice(); - + let nulls = NullBuffer::union(left.nulls(), right.nulls()); let mut bool_buf = MutableBuffer::from_len_zeroed(num_bytes); let bool_slice = bool_buf.as_slice_mut(); // if both array slots are valid, check if list contains primitive for i in 0..left_len { - if bit_util::get_bit(not_both_null_bitmap, i) { + if nulls.as_ref().map(|n| n.is_valid(i)).unwrap_or(true) { let list = right.value(i); let list = list.as_any().downcast_ref::>().unwrap(); @@ -2749,18 +2731,8 @@ where } } - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - None, - 0, - vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + let values = BooleanBuffer::new(bool_buf.into(), 0, left_len); + Ok(BooleanArray::new(values, None)) } /// Checks if a [`GenericListArray`] contains a value in the [`GenericStringArray`] @@ -2781,24 +2753,15 @@ where let num_bytes = bit_util::ceil(left_len, 8); - let not_both_null_bit_buffer = - match combine_option_bitmap(&[left.data_ref(), right.data_ref()], left_len) { - Some(buff) => buff, - None => new_all_set_buffer(num_bytes), - }; - let not_both_null_bitmap = 
not_both_null_bit_buffer.as_slice(); - + let nulls = NullBuffer::union(left.nulls(), right.nulls()); let mut bool_buf = MutableBuffer::from_len_zeroed(num_bytes); let bool_slice = &mut bool_buf; for i in 0..left_len { // contains(null, null) = false - if bit_util::get_bit(not_both_null_bitmap, i) { + if nulls.as_ref().map(|n| n.is_valid(i)).unwrap_or(true) { let list = right.value(i); - let list = list - .as_any() - .downcast_ref::>() - .unwrap(); + let list = list.as_string::(); for j in 0..list.len() { if list.is_valid(j) && (left.value(i) == list.value(j)) { @@ -2808,28 +2771,8 @@ where } } } - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - None, - 0, - vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) -} - -// create a buffer and fill it with valid bits -#[inline] -fn new_all_set_buffer(len: usize) -> Buffer { - let buffer = MutableBuffer::new(len); - let buffer = buffer.with_bitset(len, true); - - buffer.into() + let values = BooleanBuffer::new(bool_buf.into(), 0, left_len); + Ok(BooleanArray::new(values, None)) } // disable wrapping inside literal vectors used for test data and assertions diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 7a497da55492..316f78d62f43 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -741,13 +741,13 @@ where IndexType: ArrowPrimitiveType, IndexType::Native: ToPrimitive, { - let data_ref = values.data_ref(); + let nulls = values.nulls(); let array_iter = indices .values() .iter() .map(|idx| { let idx = maybe_usize::(*idx)?; - if data_ref.is_valid(idx) { + if nulls.map(|n| n.is_valid(idx)).unwrap_or(true) { Ok(Some(values.value(idx))) } else { Ok(None) @@ -774,20 +774,14 @@ where I::Native: ToPrimitive, { let new_keys = take_primitive::(values.keys(), indices)?; - let new_keys_data = new_keys.data_ref(); - - let data = unsafe { - ArrayData::new_unchecked( - values.data_type().clone(), - new_keys.len(), - Some(new_keys_data.null_count()), - new_keys_data.nulls().map(|b| b.inner().sliced()), - 0, - new_keys_data.buffers().to_vec(), - values.data().child_data().to_vec(), - ) - }; - + let builder = new_keys + .into_data() + .into_builder() + .data_type(values.data_type().clone()) + .child_data(vec![values.values().to_data()]); + + // Safety: Indices were valid before + let data = unsafe { builder.build_unchecked() }; Ok(DictionaryArray::::from(data)) } diff --git a/arrow-select/src/window.rs b/arrow-select/src/window.rs index 70ac86857db2..2ad51561c69b 100644 --- a/arrow-select/src/window.rs +++ b/arrow-select/src/window.rs @@ -55,7 +55,7 @@ use num::abs; pub fn shift(array: &dyn Array, offset: i64) -> Result { let value_len = array.len() as i64; if offset == 0 { - Ok(make_array(array.data_ref().clone())) + Ok(make_array(array.to_data())) } else if offset == i64::MIN || abs(offset) >= value_len { Ok(new_null_array(array.data_type(), array.len())) } else { diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 6c4249e38c58..e8485e961f45 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -16,9 +16,8 @@ // under the License. 
use arrow::array::{ - make_array, Array, BooleanBuilder, Decimal128Builder, FixedSizeListBuilder, - Int32Array, Int32Builder, Int64Array, StringArray, StructBuilder, UInt64Array, - UInt8Builder, + make_array, Array, BooleanBuilder, Decimal128Builder, Int32Array, Int32Builder, + Int64Array, StringArray, StructBuilder, UInt64Array, }; use arrow_array::Decimal128Array; use arrow_buffer::{ArrowNativeType, Buffer}; @@ -994,28 +993,8 @@ fn test_string_data_from_foreign() { #[test] fn test_decimal_full_validation() { - let values_builder = UInt8Builder::with_capacity(10); - let byte_width = 16; - let mut fixed_size_builder = FixedSizeListBuilder::new(values_builder, byte_width); - let value_as_bytes = 123456_i128.to_le_bytes(); - fixed_size_builder - .values() - .append_slice(value_as_bytes.as_slice()); - fixed_size_builder.append(true); - let fixed_size_array = fixed_size_builder.finish(); - - // Build ArrayData for Decimal - let builder = ArrayData::builder(DataType::Decimal128(5, 3)) - .len(fixed_size_array.len()) - .add_buffer(fixed_size_array.data_ref().child_data()[0].buffers()[0].clone()); - let array_data = unsafe { builder.build_unchecked() }; - array_data.validate_full().unwrap(); - - let array = Decimal128Array::from(array_data); - let error = array - .validate_decimal_precision(array.precision()) - .unwrap_err(); - + let array = Decimal128Array::from(vec![123456_i128]); + let error = array.validate_decimal_precision(5).unwrap_err(); assert_eq!( "Invalid argument error: 123456 is too large to store in a Decimal128 of precision 5. Max is 99999", error.to_string() From 72794a4397343db9ca36c945e2d7c9dacdf65796 Mon Sep 17 00:00:00 2001 From: Huxley Hu Date: Tue, 28 Mar 2023 21:23:40 +0800 Subject: [PATCH 0744/1411] Fix reading ipc files with unordered projections (#3966) --- arrow-ipc/src/reader.rs | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index bb367f9447d5..bd7e33185a40 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -650,15 +650,15 @@ pub fn read_record_batch( // keep track of buffer and node index, the functions that create arrays mutate these let mut buffer_index = 0; let mut node_index = 0; - let mut arrays = vec![]; let options = RecordBatchOptions::new().with_row_count(Some(batch.length() as usize)); if let Some(projection) = projection { + let mut arrays = vec![]; // project fields for (idx, field) in schema.fields().iter().enumerate() { // Create array for projected field - if projection.contains(&idx) { + if let Some(proj_idx) = projection.iter().position(|p| p == &idx) { let triple = create_array( field_nodes, field, @@ -672,7 +672,7 @@ pub fn read_record_batch( )?; node_index = triple.1; buffer_index = triple.2; - arrays.push(triple.0); + arrays.push((proj_idx, triple.0)); } else { // Skip field. // This must be called to advance `node_index` and `buffer_index`. 
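// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch) of what the unordered-projection
// fix enables at the API level: `FileReader` projections no longer have to be
// in ascending order, and the returned batch follows the requested column
// order. The buffer `buf` is assumed to hold a valid Arrow IPC file with at
// least three columns.
// ---------------------------------------------------------------------------
use std::io::Cursor;
use arrow_array::RecordBatch;
use arrow_ipc::reader::FileReader;

fn read_reversed_projection(buf: Vec<u8>) -> RecordBatch {
    // Request columns 2 and 0, in that order.
    let mut reader = FileReader::try_new(Cursor::new(buf), Some(vec![2, 0])).unwrap();
    reader.next().unwrap().unwrap()
}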
@@ -681,13 +681,14 @@ pub fn read_record_batch( buffer_index = tuple.1; } } - + arrays.sort_by_key(|t| t.0); RecordBatch::try_new_with_options( Arc::new(schema.project(projection)?), - arrays, + arrays.into_iter().map(|t| t.1).collect(), &options, ) } else { + let mut arrays = vec![]; // keep track of index as lists require more than one node for field in schema.fields() { let triple = create_array( @@ -1423,6 +1424,17 @@ mod tests { // check the projected column equals the expected column assert_eq!(projected_column.as_ref(), expected_column.as_ref()); } + + { + // read record batch with reversed projection + let reader = FileReader::try_new( + std::io::Cursor::new(buf.clone()), + Some(vec![3, 2, 1]), + ); + let read_batch = reader.unwrap().next().unwrap().unwrap(); + let expected_batch = batch.project(&[3, 2, 1]).unwrap(); + assert_eq!(read_batch, expected_batch); + } } #[test] From 552b6ada001f0ffc7b6a1da5932fc6661a163f75 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 28 Mar 2023 15:48:06 +0100 Subject: [PATCH 0745/1411] Update proc-macro2 requirement from =1.0.53 to =1.0.54 (#3968) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.53...1.0.54) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index c3b9cbd8c13a..8d04a821be56 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.53", default-features = false } +proc-macro2 = { version = "=1.0.54", default-features = false } prost-build = { version = "=0.11.8", default-features = false } tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } From eb36d37ceb2a41e972b07f571f2ae6c4e963f951 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 28 Mar 2023 16:06:03 +0100 Subject: [PATCH 0746/1411] Add ListBuilder::append_value (#3949) (#3954) * Add ListBuilder::append_value (#3949) * Review feedback --- .../src/builder/generic_list_builder.rs | 92 ++++++++++++++++++- arrow-array/src/builder/mod.rs | 60 +----------- 2 files changed, 94 insertions(+), 58 deletions(-) diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index f390b3c15da2..333a1bddb42b 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -125,11 +125,99 @@ where /// Panics if the length of [`Self::values`] exceeds `OffsetSize::MAX` #[inline] pub fn append(&mut self, is_valid: bool) { - self.offsets_builder - .append(OffsetSize::from_usize(self.values_builder.len()).unwrap()); + self.offsets_builder.append(self.next_offset()); self.null_buffer_builder.append(is_valid); } + /// Returns the next offset + /// + /// # Panics + /// + /// Panics if the length of 
[`Self::values`] exceeds `OffsetSize::MAX` + #[inline] + fn next_offset(&self) -> OffsetSize { + OffsetSize::from_usize(self.values_builder.len()).unwrap() + } + + /// Append a value to this [`GenericListBuilder`] + /// + /// ``` + /// # use arrow_array::builder::{Int32Builder, ListBuilder}; + /// # use arrow_array::cast::AsArray; + /// # use arrow_array::{Array, Int32Array}; + /// # use arrow_array::types::Int32Type; + /// let mut builder = ListBuilder::new(Int32Builder::new()); + /// + /// builder.append_value([Some(1), Some(2), Some(3)]); + /// builder.append_value([]); + /// builder.append_value([None]); + /// + /// let array = builder.finish(); + /// assert_eq!(array.len(), 3); + /// + /// assert_eq!(array.value_offsets(), &[0, 3, 3, 4]); + /// let values = array.values().as_primitive::(); + /// assert_eq!(values, &Int32Array::from(vec![Some(1), Some(2), Some(3), None])); + /// ``` + /// + /// This is an alternative API to appending directly to [`Self::values`] and + /// delimiting the result with [`Self::append`] + /// + /// ``` + /// # use arrow_array::builder::{Int32Builder, ListBuilder}; + /// # use arrow_array::cast::AsArray; + /// # use arrow_array::{Array, Int32Array}; + /// # use arrow_array::types::Int32Type; + /// let mut builder = ListBuilder::new(Int32Builder::new()); + /// + /// builder.values().append_value(1); + /// builder.values().append_value(2); + /// builder.values().append_value(3); + /// builder.append(true); + /// builder.append(true); + /// builder.values().append_null(); + /// builder.append(true); + /// + /// let array = builder.finish(); + /// assert_eq!(array.len(), 3); + /// + /// assert_eq!(array.value_offsets(), &[0, 3, 3, 4]); + /// let values = array.values().as_primitive::(); + /// assert_eq!(values, &Int32Array::from(vec![Some(1), Some(2), Some(3), None])); + /// ``` + #[inline] + pub fn append_value(&mut self, i: I) + where + T: Extend>, + I: IntoIterator>, + { + self.extend(std::iter::once(Some(i))) + } + + /// Append a null to this [`GenericListBuilder`] + /// + /// See [`Self::append_value`] for an example use. + #[inline] + pub fn append_null(&mut self) { + self.offsets_builder.append(self.next_offset()); + self.null_buffer_builder.append_null(); + } + + /// Appends an optional value into this [`GenericListBuilder`] + /// + /// If `Some` calls [`Self::append_value`] otherwise calls [`Self::append_null`] + #[inline] + pub fn append_option(&mut self, i: Option) + where + T: Extend>, + I: IntoIterator>, + { + match i { + Some(i) => self.append_value(i), + None => self.append_null(), + } + } + /// Builds the [`GenericListArray`] and reset this builder. 
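// ---------------------------------------------------------------------------
// Small usage sketch (not part of this patch) combining the three list-level
// append methods introduced above; it mirrors the `append_value` doctests but
// also exercises `append_null` and `append_option`.
// ---------------------------------------------------------------------------
use arrow_array::builder::{Int32Builder, ListBuilder};
use arrow_array::Array;

fn list_builder_sketch() {
    let mut builder = ListBuilder::new(Int32Builder::new());
    builder.append_value([Some(1), Some(2)]); // [1, 2]
    builder.append_null();                    // null
    builder.append_option(Some([None]));      // [null]

    let array = builder.finish();
    assert_eq!(array.len(), 3);
    assert!(array.is_null(1));
    assert_eq!(array.value_offsets(), &[0, 2, 2, 3]);
}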
pub fn finish(&mut self) -> GenericListArray { let len = self.len(); diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index df26fa35832f..928b14165e0a 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -134,68 +134,16 @@ pub trait ArrayBuilder: Any + Send { /// Builder for [`ListArray`]s (i32 offsets) /// -/// [`ListArray`]: crate::array::ListArray -/// -/// # Example -/// -/// ``` -/// # use arrow_array::builder::{StringBuilder, ListBuilder}; -/// # use arrow_array::ListArray; -/// // Build a 3 element array of lists: -/// // -/// // column -/// // --------- -/// // [one] -/// // [] -/// // [two, three] -/// -/// let mut builder = ListBuilder::new(StringBuilder::new()); -/// // [one] -/// builder.values().append_value("one"); -/// builder.append(true); -/// // [] -/// builder.append(true); -/// // [two, three] -/// builder.values().append_value("two"); -/// builder.values().append_value("three"); -/// builder.append(true); +/// See [`GenericListBuilder`] for usage examples /// -/// // Create an array -/// let list_array: ListArray = builder.finish(); -/// ``` +/// [`ListArray`]: crate::array::ListArray pub type ListBuilder = GenericListBuilder; /// Builder for [`LargeListArray`]s (i64 offsets) /// -/// [`LargeListArray`]: crate::array::LargeListArray -/// -/// # Example -/// -/// ``` -/// # use arrow_array::builder::{StringBuilder, LargeListBuilder}; -/// # use arrow_array::LargeListArray; -/// // Build a 3 element array of lists: -/// // -/// // column -/// // --------- -/// // [one], -/// // [], -/// // [two, three] -/// -/// let mut builder = LargeListBuilder::new(StringBuilder::new()); -/// // [one] -/// builder.values().append_value("one"); -/// builder.append(true); -/// // [] -/// builder.append(true); -/// // [two, three] -/// builder.values().append_value("two"); -/// builder.values().append_value("three"); -/// builder.append(true); +/// See [`GenericListBuilder`] for usage examples /// -/// // Create an array -/// let list_array: LargeListArray = builder.finish(); -/// ``` +/// [`LargeListArray`]: crate::array::LargeListArray pub type LargeListBuilder = GenericListBuilder; /// A binary array builder with i32 offsets From 562061218c15ef5083d84bf33412d083759c3bfc Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 28 Mar 2023 16:45:00 +0100 Subject: [PATCH 0747/1411] Improve array builder documentation (#3949) (#3951) * Improve array builder documentation (#3949) * Review feedback --- .../src/builder/generic_list_builder.rs | 6 + arrow-array/src/builder/mod.rs | 131 +++++++++++++++++- 2 files changed, 136 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 333a1bddb42b..719070356a6f 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -40,6 +40,12 @@ pub struct GenericListBuilder { values_builder: T, } +impl Default for GenericListBuilder { + fn default() -> Self { + Self::new(T::default()) + } +} + impl GenericListBuilder { /// Creates a new [`GenericListBuilder`] from a given values array builder pub fn new(values_builder: T) -> Self { diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 928b14165e0a..41a4d92b0219 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -15,7 +15,136 @@ // specific language governing permissions and limitations // 
under the License. -//! Defines builders for the various array types +//! Defines builders that can be used to safely build arrays +//! +//! # Basic Usage +//! +//! Builders can be used to build simple, non-nested arrays +//! +//! ``` +//! # use arrow_array::builder::Int32Builder; +//! # use arrow_array::PrimitiveArray; +//! let mut a = Int32Builder::new(); +//! a.append_value(1); +//! a.append_null(); +//! a.append_value(2); +//! let a = a.finish(); +//! +//! assert_eq!(a, PrimitiveArray::from(vec![Some(1), None, Some(2)])); +//! ``` +//! +//! ``` +//! # use arrow_array::builder::StringBuilder; +//! # use arrow_array::{Array, StringArray}; +//! let mut a = StringBuilder::new(); +//! a.append_value("foo"); +//! a.append_value("bar"); +//! a.append_null(); +//! let a = a.finish(); +//! +//! assert_eq!(a, StringArray::from_iter([Some("foo"), Some("bar"), None])); +//! ``` +//! +//! # Nested Usage +//! +//! Builders can also be used to build more complex nested arrays, such as lists +//! +//! ``` +//! # use arrow_array::builder::{Int32Builder, ListBuilder}; +//! # use arrow_array::ListArray; +//! # use arrow_array::types::Int32Type; +//! let mut a = ListBuilder::new(Int32Builder::new()); +//! // [1, 2] +//! a.values().append_value(1); +//! a.values().append_value(2); +//! a.append(true); +//! // null +//! a.append(false); +//! // [] +//! a.append(true); +//! // [3, null] +//! a.values().append_value(3); +//! a.values().append_null(); +//! a.append(true); +//! +//! // [[1, 2], null, [], [3, null]] +//! let a = a.finish(); +//! +//! assert_eq!(a, ListArray::from_iter_primitive::([ +//! Some(vec![Some(1), Some(2)]), +//! None, +//! Some(vec![]), +//! Some(vec![Some(3), None])] +//! )) +//! ``` +//! +//! # Custom Builders +//! +//! It is common to have a collection of statically defined Rust types that +//! you want to convert to Arrow arrays. An example of doing so is below +//! +//! ``` +//! # use std::any::Any; +//! # use arrow_array::builder::{ArrayBuilder, Int32Builder, ListBuilder, StringBuilder}; +//! # use arrow_array::{ArrayRef, RecordBatch, StructArray}; +//! # use arrow_schema::{DataType, Field}; +//! # use std::sync::Arc; +//! /// A custom row representation +//! struct MyRow { +//! i32: i32, +//! optional_i32: Option, +//! string: Option, +//! i32_list: Option>>, +//! } +//! +//! /// Converts `Vec` into `StructArray` +//! #[derive(Debug, Default)] +//! struct MyRowBuilder { +//! i32: Int32Builder, +//! string: StringBuilder, +//! i32_list: ListBuilder, +//! } +//! +//! impl MyRowBuilder { +//! fn append(&mut self, row: &MyRow) { +//! self.i32.append_value(row.i32); +//! self.string.append_option(row.string.as_ref()); +//! self.i32_list.append_option(row.i32_list.as_ref().map(|x| x.iter().copied())); +//! } +//! +//! /// Note: returns StructArray to allow nesting within another array if desired +//! fn finish(&mut self) -> StructArray { +//! let i32 = Arc::new(self.i32.finish()) as ArrayRef; +//! let i32_field = Field::new("i32", DataType::Int32, false); +//! +//! let string = Arc::new(self.string.finish()) as ArrayRef; +//! let string_field = Field::new("i32", DataType::Utf8, false); +//! +//! let i32_list = Arc::new(self.i32_list.finish()) as ArrayRef; +//! let value_field = Box::new(Field::new("item", DataType::Int32, true)); +//! let i32_list_field = Field::new("i32_list", DataType::List(value_field), true); +//! +//! StructArray::from(vec![ +//! (i32_field, i32), +//! (string_field, string), +//! (i32_list_field, i32_list), +//! ]) +//! } +//! } +//! +//! 
impl<'a> Extend<&'a MyRow> for MyRowBuilder { +//! fn extend>(&mut self, iter: T) { +//! iter.into_iter().for_each(|row| self.append(row)); +//! } +//! } +//! +//! /// Converts a slice of [`MyRow`] to a [`RecordBatch`] +//! fn rows_to_batch(rows: &[MyRow]) -> RecordBatch { +//! let mut builder = MyRowBuilder::default(); +//! builder.extend(rows); +//! RecordBatch::from(&builder.finish()) +//! } +//! ``` mod boolean_buffer_builder; pub use boolean_buffer_builder::*; From 6c13dd70d952046d2c33779a41430416e7cbc4e1 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 28 Mar 2023 15:20:15 -0700 Subject: [PATCH 0748/1411] PrimitiveDictionaryBuilder.finish should use actual value type (#3972) --- .../src/builder/primitive_dictionary_builder.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index a996128d5e9d..41880d3a478c 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -273,8 +273,10 @@ where let values = self.values_builder.finish(); let keys = self.keys_builder.finish(); - let data_type = - DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE)); + let data_type = DataType::Dictionary( + Box::new(K::DATA_TYPE), + Box::new(values.data_type().clone()), + ); let builder = keys .into_data() @@ -398,5 +400,12 @@ mod tests { ); let dict_array = builder.finish(); assert_eq!(dict_array.value_type(), DataType::Decimal128(1, 2)); + assert_eq!( + dict_array.data_type(), + &DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Decimal128(1, 2)), + ) + ); } } From e919e992a1154dc44dac43c8623be68b05fe6ca1 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 28 Mar 2023 22:43:43 -0700 Subject: [PATCH 0749/1411] feat: enable metadata import/export through C data interface (#3944) * feat: enable metadata export through C data interface * chore: clippy warnings * Update arrow-schema/src/ffi.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * make parsing more defensive. * use IntoIterator * handle integer overflow --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- .../tests/test_sql.py | 6 + arrow-schema/src/ffi.rs | 169 +++++++++++++++++- arrow/src/ffi.rs | 27 ++- 3 files changed, 195 insertions(+), 7 deletions(-) diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 98564408d937..f631f67cbfea 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -138,6 +138,12 @@ def test_field_roundtrip(pyarrow_type): field = rust.round_trip_field(pyarrow_field) assert field == pyarrow_field +def test_field_metadata_roundtrip(): + metadata = {"hello": "World! 
😊", "x": "2"} + pyarrow_field = pa.field("test", pa.int32(), metadata=metadata) + field = rust.round_trip_field(pyarrow_field) + assert field == pyarrow_field + assert field.metadata == pyarrow_field.metadata def test_schema_roundtrip(): pyarrow_fields = zip(string.ascii_lowercase, _supported_pyarrow_types) diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 8e58e3158c8b..058febbdd35c 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -36,7 +36,10 @@ use crate::{ArrowError, DataType, Field, Schema, TimeUnit, UnionMode}; use bitflags::bitflags; -use std::ffi::{c_char, c_void, CStr, CString}; +use std::{ + collections::HashMap, + ffi::{c_char, c_void, CStr, CString}, +}; bitflags! { pub struct Flags: i64 { @@ -74,6 +77,7 @@ pub struct FFI_ArrowSchema { struct SchemaPrivateData { children: Box<[*mut FFI_ArrowSchema]>, dictionary: *mut FFI_ArrowSchema, + metadata: Option>, } // callback used to drop [FFI_ArrowSchema] when it is exported. @@ -130,6 +134,7 @@ impl FFI_ArrowSchema { let mut private_data = Box::new(SchemaPrivateData { children: children_ptr, dictionary: dictionary_ptr, + metadata: None, }); // intentionally set from private_data (see https://github.com/apache/arrow-rs/issues/580) @@ -152,6 +157,63 @@ impl FFI_ArrowSchema { Ok(self) } + pub fn with_metadata(mut self, metadata: I) -> Result + where + I: IntoIterator, + S: AsRef, + { + let metadata: Vec<(S, S)> = metadata.into_iter().collect(); + // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata + let new_metadata = if !metadata.is_empty() { + let mut metadata_serialized: Vec = Vec::new(); + let num_entries: i32 = metadata.len().try_into().map_err(|_| { + ArrowError::CDataInterface(format!( + "metadata can only have {} entries, but {} were provided", + i32::MAX, + metadata.len() + )) + })?; + metadata_serialized.extend(num_entries.to_ne_bytes()); + + for (key, value) in metadata.into_iter() { + let key_len: i32 = key.as_ref().len().try_into().map_err(|_| { + ArrowError::CDataInterface(format!( + "metadata key can only have {} bytes, but {} were provided", + i32::MAX, + key.as_ref().len() + )) + })?; + let value_len: i32 = value.as_ref().len().try_into().map_err(|_| { + ArrowError::CDataInterface(format!( + "metadata value can only have {} bytes, but {} were provided", + i32::MAX, + value.as_ref().len() + )) + })?; + + metadata_serialized.extend(key_len.to_ne_bytes()); + metadata_serialized.extend_from_slice(key.as_ref().as_bytes()); + metadata_serialized.extend(value_len.to_ne_bytes()); + metadata_serialized.extend_from_slice(value.as_ref().as_bytes()); + } + + self.metadata = metadata_serialized.as_ptr() as *const c_char; + Some(metadata_serialized) + } else { + self.metadata = std::ptr::null_mut(); + None + }; + + unsafe { + let mut private_data = + Box::from_raw(self.private_data as *mut SchemaPrivateData); + private_data.metadata = new_metadata; + self.private_data = Box::into_raw(private_data) as *mut c_void; + } + + Ok(self) + } + pub fn empty() -> Self { Self { format: std::ptr::null_mut(), @@ -212,6 +274,71 @@ impl FFI_ArrowSchema { pub fn dictionary_ordered(&self) -> bool { self.flags & 0b00000001 != 0 } + + pub fn metadata(&self) -> Result, ArrowError> { + if self.metadata.is_null() { + Ok(HashMap::new()) + } else { + let mut pos = 0; + let buffer: *const u8 = self.metadata as *const u8; + + fn next_four_bytes(buffer: *const u8, pos: &mut isize) -> [u8; 4] { + let out = unsafe { + [ + *buffer.offset(*pos), + *buffer.offset(*pos + 1), + *buffer.offset(*pos 
+ 2), + *buffer.offset(*pos + 3), + ] + }; + *pos += 4; + out + } + + fn next_n_bytes(buffer: *const u8, pos: &mut isize, n: i32) -> &[u8] { + let out = unsafe { + std::slice::from_raw_parts(buffer.offset(*pos), n.try_into().unwrap()) + }; + *pos += isize::try_from(n).unwrap(); + out + } + + let num_entries = i32::from_ne_bytes(next_four_bytes(buffer, &mut pos)); + if num_entries < 0 { + return Err(ArrowError::CDataInterface( + "Negative number of metadata entries".to_string(), + )); + } + + let mut metadata = HashMap::with_capacity( + num_entries.try_into().expect("Too many metadata entries"), + ); + + for _ in 0..num_entries { + let key_length = i32::from_ne_bytes(next_four_bytes(buffer, &mut pos)); + if key_length < 0 { + return Err(ArrowError::CDataInterface( + "Negative key length in metadata".to_string(), + )); + } + let key = String::from_utf8( + next_n_bytes(buffer, &mut pos, key_length).to_vec(), + )?; + let value_length = i32::from_ne_bytes(next_four_bytes(buffer, &mut pos)); + if value_length < 0 { + return Err(ArrowError::CDataInterface( + "Negative value length in metadata".to_string(), + )); + } + let value = String::from_utf8( + next_n_bytes(buffer, &mut pos, value_length).to_vec(), + )?; + metadata.insert(key, value); + } + + Ok(metadata) + } + } } impl Drop for FFI_ArrowSchema { @@ -421,7 +548,8 @@ impl TryFrom<&FFI_ArrowSchema> for Field { fn try_from(c_schema: &FFI_ArrowSchema) -> Result { let dtype = DataType::try_from(c_schema)?; - let field = Field::new(c_schema.name(), dtype, c_schema.nullable()); + let mut field = Field::new(c_schema.name(), dtype, c_schema.nullable()); + field.set_metadata(c_schema.metadata()?); Ok(field) } } @@ -433,7 +561,7 @@ impl TryFrom<&FFI_ArrowSchema> for Schema { // interpret it as a struct type then extract its fields let dtype = DataType::try_from(c_schema)?; if let DataType::Struct(fields) = dtype { - Ok(Schema::new(fields)) + Ok(Schema::new(fields).with_metadata(c_schema.metadata()?)) } else { Err(ArrowError::CDataInterface( "Unable to interpret C data struct as a Schema".to_string(), @@ -558,7 +686,8 @@ impl TryFrom<&Field> for FFI_ArrowSchema { FFI_ArrowSchema::try_from(field.data_type())? .with_name(field.name())? - .with_flags(flags) + .with_flags(flags)? 
+ .with_metadata(field.metadata()) } } @@ -567,7 +696,8 @@ impl TryFrom<&Schema> for FFI_ArrowSchema { fn try_from(schema: &Schema) -> Result { let dtype = DataType::Struct(schema.fields().clone()); - let c_schema = FFI_ArrowSchema::try_from(&dtype)?; + let c_schema = + FFI_ArrowSchema::try_from(&dtype)?.with_metadata(&schema.metadata)?; Ok(c_schema) } } @@ -655,7 +785,9 @@ mod tests { Field::new("name", DataType::Utf8, false), Field::new("address", DataType::Utf8, false), Field::new("priority", DataType::UInt8, false), - ]); + ]) + .with_metadata([("hello".to_string(), "world".to_string())].into()); + round_trip_schema(schema); // test that we can interpret struct types as schema @@ -700,4 +832,29 @@ mod tests { let arrow_schema = FFI_ArrowSchema::try_from(schema).unwrap(); assert!(arrow_schema.child(0).dictionary_ordered()); } + + #[test] + fn test_set_field_metadata() { + let metadata_cases: Vec> = vec![ + [].into(), + [("key".to_string(), "value".to_string())].into(), + [ + ("key".to_string(), "".to_string()), + ("ascii123".to_string(), "你好".to_string()), + ("".to_string(), "value".to_string()), + ] + .into(), + ]; + + let mut schema = FFI_ArrowSchema::try_new("b", vec![], None) + .unwrap() + .with_name("test") + .unwrap(); + + for metadata in metadata_cases { + schema = schema.with_metadata(&metadata).unwrap(); + let field = Field::try_from(&schema).unwrap(); + assert_eq!(field.metadata(), &metadata); + } + } } diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 9d0ed0b85fb6..9179b1279ff6 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -497,7 +497,8 @@ mod tests { use crate::datatypes::{Field, Int8Type}; use arrow_array::builder::UnionBuilder; use arrow_array::types::{Float64Type, Int32Type}; - use arrow_array::{Float64Array, UnionArray}; + use arrow_array::{Float64Array, StructArray, UnionArray}; + use std::collections::HashMap; use std::convert::TryFrom; use std::mem::ManuallyDrop; use std::ptr::addr_of_mut; @@ -1092,6 +1093,30 @@ mod tests { Ok(()) } + #[test] + fn test_struct_array() -> Result<()> { + let metadata: HashMap = + [("Hello".to_string(), "World! 
😊".to_string())].into(); + let struct_array = StructArray::from(vec![( + Field::new("a", DataType::Int32, false).with_metadata(metadata), + Arc::new(Int32Array::from(vec![2, 4, 6])) as Arc, + )]); + + // export it + let array = ArrowArray::try_from(struct_array.data().clone())?; + + // (simulate consumer) import it + let data = ArrayData::try_from(array)?; + let array = make_array(data); + + // perform some operation + let array = array.as_any().downcast_ref::().unwrap(); + assert_eq!(array.data_type(), struct_array.data_type()); + assert_eq!(array, &struct_array); + + Ok(()) + } + #[test] fn test_union_sparse_array() -> Result<()> { let mut builder = UnionBuilder::new_sparse(); From 869e6bc13bdbd503dddfa620a0ae62f3c0145a97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sinan=20Gen=C3=A7o=C4=9Flu?= Date: Wed, 29 Mar 2023 15:44:34 +0300 Subject: [PATCH 0750/1411] add Date32/Date64 support to subtract_dyn (#3974) * add Date32/Date64 support to subtract_dyn * This commit includes: 1- add subtract date utilities to types 2- add test for subtract_dyn function * fix formatting --- arrow-arith/src/arithmetic.rs | 190 ++++++++++++++++++++++++++++++++++ arrow-array/src/types.rs | 102 ++++++++++++++++++ 2 files changed, 292 insertions(+) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 501878afd1ff..6c0fd497efbe 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -923,6 +923,54 @@ pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { typed_dict_math_op!(left, right, |a, b| a.sub_wrapping(b), math_op_dict) } + DataType::Date32 => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = right.as_primitive::(); + let res = math_op(l, r, Date32Type::subtract_year_months)?; + Ok(Arc::new(res)) + } + DataType::Interval(IntervalUnit::DayTime) => { + let r = right.as_primitive::(); + let res = math_op(l, r, Date32Type::subtract_day_time)?; + Ok(Arc::new(res)) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = right.as_primitive::(); + let res = math_op(l, r, Date32Type::subtract_month_day_nano)?; + Ok(Arc::new(res)) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } + DataType::Date64 => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = right.as_primitive::(); + let res = math_op(l, r, Date64Type::subtract_year_months)?; + Ok(Arc::new(res)) + } + DataType::Interval(IntervalUnit::DayTime) => { + let r = right.as_primitive::(); + let res = math_op(l, r, Date64Type::subtract_day_time)?; + Ok(Arc::new(res)) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = right.as_primitive::(); + let res = math_op(l, r, Date64Type::subtract_month_day_nano)?; + Ok(Arc::new(res)) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } _ => { downcast_primitive_array!( (left, right) => { @@ -955,6 +1003,54 @@ pub fn subtract_dyn_checked( math_checked_op_dict ) } + DataType::Date32 => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = right.as_primitive::(); + let res = math_op(l, r, Date32Type::subtract_year_months)?; + Ok(Arc::new(res)) + } + 
DataType::Interval(IntervalUnit::DayTime) => { + let r = right.as_primitive::(); + let res = math_op(l, r, Date32Type::subtract_day_time)?; + Ok(Arc::new(res)) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = right.as_primitive::(); + let res = math_op(l, r, Date32Type::subtract_month_day_nano)?; + Ok(Arc::new(res)) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } + DataType::Date64 => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = right.as_primitive::(); + let res = math_op(l, r, Date64Type::subtract_year_months)?; + Ok(Arc::new(res)) + } + DataType::Interval(IntervalUnit::DayTime) => { + let r = right.as_primitive::(); + let res = math_op(l, r, Date64Type::subtract_day_time)?; + Ok(Arc::new(res)) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = right.as_primitive::(); + let res = math_op(l, r, Date64Type::subtract_month_day_nano)?; + Ok(Arc::new(res)) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } _ => { downcast_primitive_array!( (left, right) => { @@ -1864,6 +1960,100 @@ mod tests { assert_eq!(1, c.value(4)); } + #[test] + fn test_date32_month_subtract() { + let a = Date32Array::from(vec![Date32Type::from_naive_date( + NaiveDate::from_ymd_opt(2000, 7, 1).unwrap(), + )]); + let b = + IntervalYearMonthArray::from(vec![IntervalYearMonthType::make_value(6, 3)]); + let c = subtract_dyn(&a, &b).unwrap(); + let c = c.as_any().downcast_ref::().unwrap(); + assert_eq!( + c.value(0), + Date32Type::from_naive_date(NaiveDate::from_ymd_opt(1994, 4, 1).unwrap()) + ); + } + + #[test] + fn test_date32_day_time_subtract() { + let a = Date32Array::from(vec![Date32Type::from_naive_date( + NaiveDate::from_ymd_opt(2023, 3, 29).unwrap(), + )]); + let b = + IntervalDayTimeArray::from(vec![IntervalDayTimeType::make_value(1, 86500)]); + let c = subtract_dyn(&a, &b).unwrap(); + let c = c.as_any().downcast_ref::().unwrap(); + assert_eq!( + c.value(0), + Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2023, 3, 27).unwrap()) + ); + } + + #[test] + fn test_date32_month_day_nano_subtract() { + let a = Date32Array::from(vec![Date32Type::from_naive_date( + NaiveDate::from_ymd_opt(2023, 3, 15).unwrap(), + )]); + let b = + IntervalMonthDayNanoArray::from(vec![IntervalMonthDayNanoType::make_value( + 1, 2, 0, + )]); + let c = subtract_dyn(&a, &b).unwrap(); + let c = c.as_any().downcast_ref::().unwrap(); + assert_eq!( + c.value(0), + Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2023, 2, 13).unwrap()) + ); + } + + #[test] + fn test_date64_month_subtract() { + let a = Date64Array::from(vec![Date64Type::from_naive_date( + NaiveDate::from_ymd_opt(2000, 7, 1).unwrap(), + )]); + let b = + IntervalYearMonthArray::from(vec![IntervalYearMonthType::make_value(6, 3)]); + let c = subtract_dyn(&a, &b).unwrap(); + let c = c.as_any().downcast_ref::().unwrap(); + assert_eq!( + c.value(0), + Date64Type::from_naive_date(NaiveDate::from_ymd_opt(1994, 4, 1).unwrap()) + ); + } + + #[test] + fn test_date64_day_time_subtract() { + let a = Date64Array::from(vec![Date64Type::from_naive_date( + NaiveDate::from_ymd_opt(2023, 3, 29).unwrap(), + )]); + let b = + IntervalDayTimeArray::from(vec![IntervalDayTimeType::make_value(1, 86500)]); + let c = 
subtract_dyn(&a, &b).unwrap(); + let c = c.as_any().downcast_ref::().unwrap(); + assert_eq!( + c.value(0), + Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2023, 3, 27).unwrap()) + ); + } + + #[test] + fn test_date64_month_day_nano_subtract() { + let a = Date64Array::from(vec![Date64Type::from_naive_date( + NaiveDate::from_ymd_opt(2023, 3, 15).unwrap(), + )]); + let b = + IntervalMonthDayNanoArray::from(vec![IntervalMonthDayNanoType::make_value( + 1, 2, 0, + )]); + let c = subtract_dyn(&a, &b).unwrap(); + let c = c.as_any().downcast_ref::().unwrap(); + assert_eq!( + c.value(0), + Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2023, 2, 13).unwrap()) + ); + } + #[test] #[cfg(feature = "dyn_arith_dict")] fn test_primitive_array_subtract_dyn_dict() { diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 9f1965b77570..60a632a060d7 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -511,6 +511,57 @@ impl Date32Type { let res = res.add(Duration::nanoseconds(nanos)); Date32Type::from_naive_date(res) } + + /// Subtract the given IntervalYearMonthType to an arrow Date32Type + /// + /// # Arguments + /// + /// * `date` - The date on which to perform the operation + /// * `delta` - The interval to substract + pub fn subtract_year_months( + date: ::Native, + delta: ::Native, + ) -> ::Native { + let prior = Date32Type::to_naive_date(date); + let months = IntervalYearMonthType::to_months(-delta); + let posterior = shift_months(prior, months); + Date32Type::from_naive_date(posterior) + } + + /// Subtract the given IntervalDayTimeType to an arrow Date32Type + /// + /// # Arguments + /// + /// * `date` - The date on which to perform the operation + /// * `delta` - The interval to subtract + pub fn subtract_day_time( + date: ::Native, + delta: ::Native, + ) -> ::Native { + let (days, ms) = IntervalDayTimeType::to_parts(-delta); + let res = Date32Type::to_naive_date(date); + let res = res.add(Duration::days(days as i64)); + let res = res.add(Duration::milliseconds(ms as i64)); + Date32Type::from_naive_date(res) + } + + /// Subtract the given IntervalMonthDayNanoType to an arrow Date32Type + /// + /// # Arguments + /// + /// * `date` - The date on which to perform the operation + /// * `delta` - The interval to subtract + pub fn subtract_month_day_nano( + date: ::Native, + delta: ::Native, + ) -> ::Native { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); + let res = Date32Type::to_naive_date(date); + let res = shift_months(res, -months); + let res = res.add(Duration::days(-days as i64)); + let res = res.add(Duration::nanoseconds(-nanos)); + Date32Type::from_naive_date(res) + } } impl Date64Type { @@ -584,6 +635,57 @@ impl Date64Type { let res = res.add(Duration::nanoseconds(nanos)); Date64Type::from_naive_date(res) } + + /// Subtract the given IntervalYearMonthType to an arrow Date64Type + /// + /// # Arguments + /// + /// * `date` - The date on which to perform the operation + /// * `delta` - The interval to substract + pub fn subtract_year_months( + date: ::Native, + delta: ::Native, + ) -> ::Native { + let prior = Date64Type::to_naive_date(date); + let months = IntervalYearMonthType::to_months(-delta); + let posterior = shift_months(prior, months); + Date64Type::from_naive_date(posterior) + } + + /// Subtract the given IntervalDayTimeType to an arrow Date64Type + /// + /// # Arguments + /// + /// * `date` - The date on which to perform the operation + /// * `delta` - The interval to subtract + pub fn subtract_day_time( + date: 
::Native, + delta: ::Native, + ) -> ::Native { + let (days, ms) = IntervalDayTimeType::to_parts(-delta); + let res = Date64Type::to_naive_date(date); + let res = res.add(Duration::days(days as i64)); + let res = res.add(Duration::milliseconds(ms as i64)); + Date64Type::from_naive_date(res) + } + + /// Subtract the given IntervalMonthDayNanoType to an arrow Date64Type + /// + /// # Arguments + /// + /// * `date` - The date on which to perform the operation + /// * `delta` - The interval to subtract + pub fn subtract_month_day_nano( + date: ::Native, + delta: ::Native, + ) -> ::Native { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); + let res = Date64Type::to_naive_date(date); + let res = shift_months(res, -months); + let res = res.add(Duration::days(-days as i64)); + let res = res.add(Duration::nanoseconds(-nanos)); + Date64Type::from_naive_date(res) + } } /// Crate private types for Decimal Arrays From c876bbd74c4f3b5803f4c92576270a1ea4a24c5a Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Wed, 29 Mar 2023 15:00:09 +0200 Subject: [PATCH 0751/1411] feat: cast between `Binary`/`LargeBinary` and `FixedSizeBinary` (#3961) * feat: cast between Binary/LargeBinary and FixedSizeBinary * update tests * check LargeBinary overflow * refactor code --- arrow-cast/src/cast.rs | 151 ++++++++++++++++++++++++++++++++++---- arrow/tests/array_cast.rs | 2 +- 2 files changed, 137 insertions(+), 16 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 806ff8771573..492c5db057a9 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -157,8 +157,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (_, Boolean) => DataType::is_numeric(from_type) || from_type == &Utf8 || from_type == &LargeUtf8, (Boolean, _) => DataType::is_numeric(to_type) || to_type == &Utf8 || to_type == &LargeUtf8, - (Binary, LargeBinary | Utf8 | LargeUtf8) => true, - (LargeBinary, Binary | Utf8 | LargeUtf8) => true, + (Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true, + (LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true, + (FixedSizeBinary(_), Binary | LargeBinary) => true, (Utf8, Binary | LargeBinary @@ -1242,6 +1243,9 @@ pub fn cast_with_options( LargeBinary => { cast_byte_container::(array) } + FixedSizeBinary(size) => { + cast_binary_to_fixed_size_binary::(array,*size, cast_options) + } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -1253,6 +1257,17 @@ pub fn cast_with_options( } LargeUtf8 => cast_binary_to_string::(array, cast_options), Binary => cast_byte_container::(array), + FixedSizeBinary(size) => { + cast_binary_to_fixed_size_binary::(array, *size, cast_options) + } + _ => Err(ArrowError::CastError(format!( + "Casting from {from_type:?} to {to_type:?} not supported", + ))), + }, + (FixedSizeBinary(size), _) => match to_type { + Binary => cast_fixed_size_binary_to_binary::(array, *size), + LargeBinary => + cast_fixed_size_binary_to_binary::(array, *size), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -3390,6 +3405,69 @@ fn cast_binary_to_string( } } +/// Helper function to cast from one `BinaryArray` or 'LargeBinaryArray' to 'FixedSizeBinaryArray'. 
+fn cast_binary_to_fixed_size_binary( + array: &dyn Array, + byte_width: i32, + cast_options: &CastOptions, +) -> Result { + let array = array.as_binary::(); + let mut builder = FixedSizeBinaryBuilder::with_capacity(array.len(), byte_width); + + for i in 0..array.len() { + if array.is_null(i) { + builder.append_null(); + } else { + match builder.append_value(array.value(i)) { + Ok(_) => {} + Err(e) => match cast_options.safe { + true => builder.append_null(), + false => return Err(e), + }, + } + } + } + + Ok(Arc::new(builder.finish())) +} + +/// Helper function to cast from 'FixedSizeBinaryArray' to one `BinaryArray` or 'LargeBinaryArray'. +/// If the target one is too large for the source array it will return an Error. +fn cast_fixed_size_binary_to_binary( + array: &dyn Array, + byte_width: i32, +) -> Result { + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + + let offsets: i128 = byte_width as i128 * array.len() as i128; + + let is_binary = matches!(GenericBinaryType::::DATA_TYPE, DataType::Binary); + if is_binary && offsets > i32::MAX as i128 { + return Err(ArrowError::ComputeError( + "FixedSizeBinary array too large to cast to Binary array".to_string(), + )); + } else if !is_binary && offsets > i64::MAX as i128 { + return Err(ArrowError::ComputeError( + "FixedSizeBinary array too large to cast to LargeBinary array".to_string(), + )); + } + + let mut builder = GenericBinaryBuilder::::with_capacity(array.len(), array.len()); + + for i in 0..array.len() { + if array.is_null(i) { + builder.append_null(); + } else { + builder.append_value(array.value(i)); + } + } + + Ok(Arc::new(builder.finish())) +} + /// Helper function to cast from one `ByteArrayType` to another and vice versa. /// If the target one (e.g., `LargeUtf8`) is too large for the source array it will return an Error. 
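// ---------------------------------------------------------------------------
// Usage sketch (not part of this patch) of the new Binary -> FixedSizeBinary
// cast path through the public `cast` kernel, mirroring the tests added
// further below. With the default (safe) options, values that do not match
// the target byte width become null instead of returning an error.
// ---------------------------------------------------------------------------
use arrow_array::{Array, BinaryArray, FixedSizeBinaryArray};
use arrow_cast::cast;
use arrow_schema::DataType;

fn binary_to_fixed_size_binary_sketch() {
    let input = BinaryArray::from(vec![
        Some("hello".as_bytes()),
        Some("hi".as_bytes()),
        None,
    ]);

    let casted = cast(&input, &DataType::FixedSizeBinary(5)).unwrap();
    let casted = casted
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .unwrap();

    assert_eq!(casted.value(0), "hello".as_bytes());
    assert!(casted.is_null(1)); // "hi" is not 5 bytes wide -> null under safe casting
    assert!(casted.is_null(2));
}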
fn cast_byte_container(array: &dyn Array) -> Result @@ -5288,31 +5366,74 @@ mod tests { } #[test] - fn test_cast_string_to_binary() { - let string_1 = "Hi"; - let string_2 = "Hello"; - - let bytes_1 = string_1.as_bytes(); - let bytes_2 = string_2.as_bytes(); + fn test_cast_binary_to_fixed_size_binary() { + let bytes_1 = "Hiiii".as_bytes(); + let bytes_2 = "Hello".as_bytes(); - let string_data = vec![Some(string_1), Some(string_2), None]; - let a1 = Arc::new(StringArray::from(string_data.clone())) as ArrayRef; - let a2 = Arc::new(LargeStringArray::from(string_data)) as ArrayRef; + let binary_data = vec![Some(bytes_1), Some(bytes_2), None]; + let a1 = Arc::new(BinaryArray::from(binary_data.clone())) as ArrayRef; + let a2 = Arc::new(LargeBinaryArray::from(binary_data)) as ArrayRef; - let mut array_ref = cast(&a1, &DataType::Binary).unwrap(); - let down_cast = array_ref.as_any().downcast_ref::().unwrap(); + let array_ref = cast(&a1, &DataType::FixedSizeBinary(5)).unwrap(); + let down_cast = array_ref + .as_any() + .downcast_ref::() + .unwrap(); assert_eq!(bytes_1, down_cast.value(0)); assert_eq!(bytes_2, down_cast.value(1)); assert!(down_cast.is_null(2)); - array_ref = cast(&a2, &DataType::LargeBinary).unwrap(); + let array_ref = cast(&a2, &DataType::FixedSizeBinary(5)).unwrap(); let down_cast = array_ref .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); assert_eq!(bytes_1, down_cast.value(0)); assert_eq!(bytes_2, down_cast.value(1)); assert!(down_cast.is_null(2)); + + // test error cases when the length of binary are not same + let bytes_1 = "Hi".as_bytes(); + let bytes_2 = "Hello".as_bytes(); + + let binary_data = vec![Some(bytes_1), Some(bytes_2), None]; + let a1 = Arc::new(BinaryArray::from(binary_data.clone())) as ArrayRef; + let a2 = Arc::new(LargeBinaryArray::from(binary_data)) as ArrayRef; + + let array_ref = cast_with_options( + &a1, + &DataType::FixedSizeBinary(5), + &CastOptions { safe: false }, + ); + assert!(array_ref.is_err()); + + let array_ref = cast_with_options( + &a2, + &DataType::FixedSizeBinary(5), + &CastOptions { safe: false }, + ); + assert!(array_ref.is_err()); + } + + #[test] + fn test_fixed_size_binary_to_binary() { + let bytes_1 = "Hiiii".as_bytes(); + let bytes_2 = "Hello".as_bytes(); + + let binary_data = vec![Some(bytes_1), Some(bytes_2), None]; + let a1 = Arc::new(FixedSizeBinaryArray::from(binary_data.clone())) as ArrayRef; + + let array_ref = cast(&a1, &DataType::Binary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(bytes_1, down_cast.value(0)); + assert_eq!(bytes_2, down_cast.value(1)); + assert!(down_cast.is_null(2)); + + let array_ref = cast(&a1, &DataType::LargeBinary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(bytes_1, down_cast.value(0)); + assert_eq!(bytes_2, down_cast.value(1)); + assert!(down_cast.is_null(2)); } #[test] diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 33695e2edeb6..4c1f5019597c 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -388,7 +388,7 @@ fn get_all_types() -> Vec { Interval(IntervalUnit::DayTime), Interval(IntervalUnit::MonthDayNano), Binary, - FixedSizeBinary(10), + FixedSizeBinary(3), LargeBinary, Utf8, LargeUtf8, From 8d2f5dcf14b8d6236c6c8d8086da6d2751c9a0de Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 09:17:08 +0100 Subject: [PATCH 0752/1411] Add typed buffers to UnionArray (#3880) (#3933) * Add typed buffers to UnionArray (#3880) * Clippy * Update 
arrow-array/src/array/union_array.rs Co-authored-by: Liang-Chi Hsieh --------- Co-authored-by: Liang-Chi Hsieh --- arrow-array/src/array/union_array.rs | 172 ++++++++++++----------- arrow-array/src/builder/union_builder.rs | 24 ++-- arrow-cast/src/display.rs | 2 +- arrow/src/ffi.rs | 33 ++--- 4 files changed, 115 insertions(+), 116 deletions(-) diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 6c372d8d05b9..00ad94111a4d 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -17,7 +17,7 @@ use crate::{make_array, Array, ArrayRef}; use arrow_buffer::buffer::NullBuffer; -use arrow_buffer::Buffer; +use arrow_buffer::{Buffer, ScalarBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field, UnionMode}; /// Contains the `UnionArray` type. @@ -109,6 +109,8 @@ use std::sync::Arc; #[derive(Clone)] pub struct UnionArray { data: ArrayData, + type_ids: ScalarBuffer, + offsets: Option>, boxed_fields: Vec>, } @@ -241,8 +243,17 @@ impl UnionArray { /// /// Panics if `index` is greater than the length of the array. pub fn type_id(&self, index: usize) -> i8 { - assert!(index < self.len()); - self.data().buffers()[0].as_slice()[self.offset() + index] as i8 + self.type_ids[index] + } + + /// Returns the `type_ids` buffer for this array + pub fn type_ids(&self) -> &ScalarBuffer { + &self.type_ids + } + + /// Returns the `offsets` buffer if this is a dense array + pub fn offsets(&self) -> Option<&ScalarBuffer> { + self.offsets.as_ref() } /// Returns the offset into the underlying values array for the array slot at `index`. @@ -250,12 +261,11 @@ impl UnionArray { /// # Panics /// /// Panics if `index` is greater than the length of the array. - pub fn value_offset(&self, index: usize) -> i32 { + pub fn value_offset(&self, index: usize) -> usize { assert!(index < self.len()); - if self.is_dense() { - self.data().buffers()[1].typed_data::()[self.offset() + index] - } else { - (self.offset() + index) as i32 + match &self.offsets { + Some(offsets) => offsets[index] as usize, + None => self.offset() + index, } } @@ -264,7 +274,7 @@ impl UnionArray { /// Panics if index `i` is out of bounds pub fn value(&self, i: usize) -> ArrayRef { let type_id = self.type_id(i); - let value_offset = self.value_offset(i) as usize; + let value_offset = self.value_offset(i); let child = self.child(type_id); child.slice(value_offset, 1) } @@ -297,16 +307,36 @@ impl UnionArray { impl From for UnionArray { fn from(data: ArrayData) -> Self { - let field_ids = match data.data_type() { - DataType::Union(_, ids, _) => ids, + let (field_ids, mode) = match data.data_type() { + DataType::Union(_, ids, mode) => (ids, *mode), d => panic!("UnionArray expected ArrayData with type Union got {d}"), }; + let (type_ids, offsets) = match mode { + UnionMode::Sparse => ( + ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()), + None, + ), + UnionMode::Dense => ( + ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()), + Some(ScalarBuffer::new( + data.buffers()[1].clone(), + data.offset(), + data.len(), + )), + ), + }; + let max_id = field_ids.iter().copied().max().unwrap_or_default() as usize; let mut boxed_fields = vec![None; max_id + 1]; for (cd, field_id) in data.child_data().iter().zip(field_ids) { boxed_fields[*field_id as usize] = Some(make_array(cd.clone())); } - Self { data, boxed_fields } + Self { + data, + type_ids, + offsets, + boxed_fields, + } } } @@ -370,16 +400,16 @@ impl std::fmt::Debug for 
UnionArray { writeln!(f, "{header}")?; writeln!(f, "-- type id buffer:")?; - writeln!(f, "{:?}", self.data().buffers()[0])?; + writeln!(f, "{:?}", self.type_ids)?; - let (fields, ids, mode) = match self.data_type() { - DataType::Union(f, ids, mode) => (f, ids, mode), + let (fields, ids) = match self.data_type() { + DataType::Union(f, ids, _) => (f, ids), _ => unreachable!(), }; - if mode == &UnionMode::Dense { + if let Some(offsets) = &self.offsets { writeln!(f, "-- offsets buffer:")?; - writeln!(f, "{:?}", self.data().buffers()[1])?; + writeln!(f, "{:?}", offsets)?; } assert_eq!(fields.len(), ids.len()); @@ -424,39 +454,33 @@ mod tests { let union = builder.build().unwrap(); let expected_type_ids = vec![0_i8, 1, 2, 0, 2, 0, 1]; - let expected_value_offsets = vec![0_i32, 0, 0, 1, 1, 2, 1]; + let expected_offsets = vec![0_i32, 0, 0, 1, 1, 2, 1]; let expected_array_values = [1_i32, 2, 3, 4, 5, 6, 7]; // Check type ids - assert_eq!( - *union.data().buffers()[0], - Buffer::from_slice_ref(&expected_type_ids) - ); + assert_eq!(*union.type_ids(), expected_type_ids); for (i, id) in expected_type_ids.iter().enumerate() { assert_eq!(id, &union.type_id(i)); } // Check offsets - assert_eq!( - *union.data().buffers()[1], - Buffer::from_slice_ref(&expected_value_offsets) - ); - for (i, id) in expected_value_offsets.iter().enumerate() { - assert_eq!(&union.value_offset(i), id); + assert_eq!(*union.offsets().unwrap(), expected_offsets); + for (i, id) in expected_offsets.iter().enumerate() { + assert_eq!(union.value_offset(i), *id as usize); } // Check data assert_eq!( - *union.data().child_data()[0].buffers()[0], - Buffer::from_slice_ref([1_i32, 4, 6]) + *union.child(0).as_primitive::().values(), + [1_i32, 4, 6] ); assert_eq!( - *union.data().child_data()[1].buffers()[0], - Buffer::from_slice_ref([2_i32, 7]) + *union.child(1).as_primitive::().values(), + [2_i32, 7] ); assert_eq!( - *union.data().child_data()[2].buffers()[0], - Buffer::from_slice_ref([3_i32, 5]), + *union.child(2).as_primitive::().values(), + [3_i32, 5] ); assert_eq!(expected_array_values.len(), union.len()); @@ -476,7 +500,7 @@ mod tests { let mut builder = UnionBuilder::new_dense(); let expected_type_ids = vec![0_i8; 1024]; - let expected_value_offsets: Vec<_> = (0..1024).collect(); + let expected_offsets: Vec<_> = (0..1024).collect(); let expected_array_values: Vec<_> = (1..=1024).collect(); expected_array_values @@ -486,27 +510,21 @@ mod tests { let union = builder.build().unwrap(); // Check type ids - assert_eq!( - *union.data().buffers()[0], - Buffer::from_slice_ref(&expected_type_ids) - ); + assert_eq!(*union.type_ids(), expected_type_ids); for (i, id) in expected_type_ids.iter().enumerate() { assert_eq!(id, &union.type_id(i)); } // Check offsets - assert_eq!( - *union.data().buffers()[1], - Buffer::from_slice_ref(&expected_value_offsets) - ); - for (i, id) in expected_value_offsets.iter().enumerate() { - assert_eq!(&union.value_offset(i), id); + assert_eq!(*union.offsets().unwrap(), expected_offsets); + for (i, id) in expected_offsets.iter().enumerate() { + assert_eq!(union.value_offset(i), *id as usize); } for (i, expected_value) in expected_array_values.iter().enumerate() { assert!(!union.is_null(i)); let slot = union.value(i); - let slot = slot.as_any().downcast_ref::().unwrap(); + let slot = slot.as_primitive::(); assert_eq!(slot.len(), 1); let value = slot.value(0); assert_eq!(expected_value, &value); @@ -655,10 +673,10 @@ mod tests { let float_array = Float64Array::from(vec![10.0]); let type_ids = [1_i8, 0, 0, 2, 0, 1]; - let 
value_offsets = [0_i32, 0, 1, 0, 2, 1]; + let offsets = [0_i32, 0, 1, 0, 2, 1]; let type_id_buffer = Buffer::from_slice_ref(type_ids); - let value_offsets_buffer = Buffer::from_slice_ref(value_offsets); + let value_offsets_buffer = Buffer::from_slice_ref(offsets); let children: Vec<(Field, Arc)> = vec![ ( @@ -680,18 +698,15 @@ mod tests { .unwrap(); // Check type ids - assert_eq!(Buffer::from_slice_ref(type_ids), *array.data().buffers()[0]); + assert_eq!(*array.type_ids(), type_ids); for (i, id) in type_ids.iter().enumerate() { assert_eq!(id, &array.type_id(i)); } // Check offsets - assert_eq!( - Buffer::from_slice_ref(value_offsets), - *array.data().buffers()[1] - ); - for (i, id) in value_offsets.iter().enumerate() { - assert_eq!(id, &array.value_offset(i)); + assert_eq!(*array.offsets().unwrap(), offsets); + for (i, id) in offsets.iter().enumerate() { + assert_eq!(*id as usize, array.value_offset(i)); } // Check values @@ -754,29 +769,26 @@ mod tests { let expected_array_values = [1_i32, 2, 3, 4, 5, 6, 7]; // Check type ids - assert_eq!( - Buffer::from_slice_ref(&expected_type_ids), - *union.data().buffers()[0] - ); + assert_eq!(*union.type_ids(), expected_type_ids); for (i, id) in expected_type_ids.iter().enumerate() { assert_eq!(id, &union.type_id(i)); } // Check offsets, sparse union should only have a single buffer - assert_eq!(union.data().buffers().len(), 1); + assert!(union.offsets().is_none()); // Check data assert_eq!( - *union.data().child_data()[0].buffers()[0], - Buffer::from_slice_ref([1_i32, 0, 0, 4, 0, 6, 0]), + *union.child(0).as_primitive::().values(), + [1_i32, 0, 0, 4, 0, 6, 0], ); assert_eq!( - Buffer::from_slice_ref([0_i32, 2_i32, 0, 0, 0, 0, 7]), - *union.data().child_data()[1].buffers()[0] + *union.child(1).as_primitive::().values(), + [0_i32, 2_i32, 0, 0, 0, 0, 7] ); assert_eq!( - Buffer::from_slice_ref([0_i32, 0, 3_i32, 0, 5, 0, 0]), - *union.data().child_data()[2].buffers()[0] + *union.child(2).as_primitive::().values(), + [0_i32, 0, 3_i32, 0, 5, 0, 0] ); assert_eq!(expected_array_values.len(), union.len()); @@ -803,16 +815,13 @@ mod tests { let expected_type_ids = vec![0_i8, 1, 0, 1, 0]; // Check type ids - assert_eq!( - Buffer::from_slice_ref(&expected_type_ids), - *union.data().buffers()[0] - ); + assert_eq!(*union.type_ids(), expected_type_ids); for (i, id) in expected_type_ids.iter().enumerate() { assert_eq!(id, &union.type_id(i)); } // Check offsets, sparse union should only have a single buffer, i.e. no offsets - assert_eq!(union.data().buffers().len(), 1); + assert!(union.offsets().is_none()); for i in 0..union.len() { let slot = union.value(i); @@ -865,16 +874,13 @@ mod tests { let expected_type_ids = vec![0_i8, 0, 1, 0]; // Check type ids - assert_eq!( - Buffer::from_slice_ref(&expected_type_ids), - *union.data().buffers()[0] - ); + assert_eq!(*union.type_ids(), expected_type_ids); for (i, id) in expected_type_ids.iter().enumerate() { assert_eq!(id, &union.type_id(i)); } // Check offsets, sparse union should only have a single buffer, i.e. 
no offsets - assert_eq!(union.data().buffers().len(), 1); + assert!(union.offsets().is_none()); for i in 0..union.len() { let slot = union.value(i); @@ -925,7 +931,7 @@ mod tests { match i { 0 => assert!(slot.is_null(0)), 1 => { - let slot = slot.as_any().downcast_ref::().unwrap(); + let slot = slot.as_primitive::(); assert!(!slot.is_null(0)); assert_eq!(slot.len(), 1); let value = slot.value(0); @@ -933,7 +939,7 @@ mod tests { } 2 => assert!(slot.is_null(0)), 3 => { - let slot = slot.as_any().downcast_ref::().unwrap(); + let slot = slot.as_primitive::(); assert!(!slot.is_null(0)); assert_eq!(slot.len(), 1); let value = slot.value(0); @@ -1018,18 +1024,18 @@ mod tests { assert_eq!(union_slice.type_id(2), 1); let slot = union_slice.value(0); - let array = slot.as_any().downcast_ref::().unwrap(); + let array = slot.as_primitive::(); assert_eq!(array.len(), 1); assert!(array.is_null(0)); let slot = union_slice.value(1); - let array = slot.as_any().downcast_ref::().unwrap(); + let array = slot.as_primitive::(); assert_eq!(array.len(), 1); assert!(array.is_valid(0)); assert_eq!(array.value(0), 3.0); let slot = union_slice.value(2); - let array = slot.as_any().downcast_ref::().unwrap(); + let array = slot.as_primitive::(); assert_eq!(array.len(), 1); assert!(array.is_null(0)); } @@ -1065,8 +1071,8 @@ mod tests { let int_array = Int32Array::from(vec![5, 6, 4]); let float_array = Float64Array::from(vec![10.0]); - let type_ids = Buffer::from_iter([4_i8, 8, 4, 8, 9, 4, 8]); - let value_offsets = Buffer::from_iter([0_i32, 0, 1, 1, 0, 2, 2]); + let type_ids = Buffer::from_vec(vec![4_i8, 8, 4, 8, 9, 4, 8]); + let value_offsets = Buffer::from_vec(vec![0_i32, 0, 1, 1, 0, 2, 2]); let data = ArrayData::builder(data_type) .len(7) diff --git a/arrow-array/src/builder/union_builder.rs b/arrow-array/src/builder/union_builder.rs index 28fb7e5d999a..8ca303da8cb4 100644 --- a/arrow-array/src/builder/union_builder.rs +++ b/arrow-array/src/builder/union_builder.rs @@ -113,13 +113,13 @@ impl FieldData { /// builder.append::("a", 4).unwrap(); /// let union = builder.build().unwrap(); /// -/// assert_eq!(union.type_id(0), 0_i8); -/// assert_eq!(union.type_id(1), 1_i8); -/// assert_eq!(union.type_id(2), 0_i8); +/// assert_eq!(union.type_id(0), 0); +/// assert_eq!(union.type_id(1), 1); +/// assert_eq!(union.type_id(2), 0); /// -/// assert_eq!(union.value_offset(0), 0_i32); -/// assert_eq!(union.value_offset(1), 0_i32); -/// assert_eq!(union.value_offset(2), 1_i32); +/// assert_eq!(union.value_offset(0), 0); +/// assert_eq!(union.value_offset(1), 0); +/// assert_eq!(union.value_offset(2), 1); /// ``` /// /// Example: **Sparse Memory Layout** @@ -133,13 +133,13 @@ impl FieldData { /// builder.append::("a", 4).unwrap(); /// let union = builder.build().unwrap(); /// -/// assert_eq!(union.type_id(0), 0_i8); -/// assert_eq!(union.type_id(1), 1_i8); -/// assert_eq!(union.type_id(2), 0_i8); +/// assert_eq!(union.type_id(0), 0); +/// assert_eq!(union.type_id(1), 1); +/// assert_eq!(union.type_id(2), 0); /// -/// assert_eq!(union.value_offset(0), 0_i32); -/// assert_eq!(union.value_offset(1), 1_i32); -/// assert_eq!(union.value_offset(2), 2_i32); +/// assert_eq!(union.value_offset(0), 0); +/// assert_eq!(union.value_offset(1), 1); +/// assert_eq!(union.value_offset(2), 2); /// ``` #[derive(Debug)] pub struct UnionBuilder { diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index c8025f000eab..d1090369767f 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -818,7 +818,7 @@ impl<'a> 
DisplayIndexState<'a> for &'a UnionArray { fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { let id = self.type_id(idx); let idx = match s.1 { - UnionMode::Dense => self.value_offset(idx) as usize, + UnionMode::Dense => self.value_offset(idx), UnionMode::Sparse => idx, }; let (name, field) = s.0[id as usize].as_ref().unwrap(); diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 9179b1279ff6..2d6bbf1a091d 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -496,8 +496,9 @@ mod tests { use crate::compute::kernels; use crate::datatypes::{Field, Int8Type}; use arrow_array::builder::UnionBuilder; + use arrow_array::cast::AsArray; use arrow_array::types::{Float64Type, Int32Type}; - use arrow_array::{Float64Array, StructArray, UnionArray}; + use arrow_array::{StructArray, UnionArray}; use std::collections::HashMap; use std::convert::TryFrom; use std::mem::ManuallyDrop; @@ -1138,22 +1139,19 @@ mod tests { let expected_type_ids = vec![0_i8, 0, 1, 0]; // Check type ids - assert_eq!( - Buffer::from_slice_ref(&expected_type_ids), - *array.data().buffers()[0] - ); + assert_eq!(*array.type_ids(), expected_type_ids); for (i, id) in expected_type_ids.iter().enumerate() { assert_eq!(id, &array.type_id(i)); } // Check offsets, sparse union should only have a single buffer, i.e. no offsets - assert_eq!(array.data().buffers().len(), 1); + assert!(array.offsets().is_none()); for i in 0..array.len() { let slot = array.value(i); match i { 0 => { - let slot = slot.as_any().downcast_ref::().unwrap(); + let slot = slot.as_primitive::(); assert!(!slot.is_null(0)); assert_eq!(slot.len(), 1); let value = slot.value(0); @@ -1161,14 +1159,14 @@ mod tests { } 1 => assert!(slot.is_null(0)), 2 => { - let slot = slot.as_any().downcast_ref::().unwrap(); + let slot = slot.as_primitive::(); assert!(!slot.is_null(0)); assert_eq!(slot.len(), 1); let value = slot.value(0); assert_eq!(value, 3_f64); } 3 => { - let slot = slot.as_any().downcast_ref::().unwrap(); + let slot = slot.as_primitive::(); assert!(!slot.is_null(0)); assert_eq!(slot.len(), 1); let value = slot.value(0); @@ -1195,28 +1193,23 @@ mod tests { // (simulate consumer) import it let data = ArrayData::try_from(array)?; - let array = make_array(data); - - let array = array.as_any().downcast_ref::().unwrap(); + let array = UnionArray::from(data); let expected_type_ids = vec![0_i8, 0, 1, 0]; // Check type ids - assert_eq!( - Buffer::from_slice_ref(&expected_type_ids), - *array.data().buffers()[0] - ); + assert_eq!(*array.type_ids(), expected_type_ids); for (i, id) in expected_type_ids.iter().enumerate() { assert_eq!(id, &array.type_id(i)); } - assert_eq!(array.data().buffers().len(), 2); + assert!(array.offsets().is_some()); for i in 0..array.len() { let slot = array.value(i); match i { 0 => { - let slot = slot.as_any().downcast_ref::().unwrap(); + let slot = slot.as_primitive::(); assert!(!slot.is_null(0)); assert_eq!(slot.len(), 1); let value = slot.value(0); @@ -1224,14 +1217,14 @@ mod tests { } 1 => assert!(slot.is_null(0)), 2 => { - let slot = slot.as_any().downcast_ref::().unwrap(); + let slot = slot.as_primitive::(); assert!(!slot.is_null(0)); assert_eq!(slot.len(), 1); let value = slot.value(0); assert_eq!(value, 3_f64); } 3 => { - let slot = slot.as_any().downcast_ref::().unwrap(); + let slot = slot.as_primitive::(); assert!(!slot.is_null(0)); assert_eq!(slot.len(), 1); let value = slot.value(0); From be491b41b7698cf10a788663ad2906de5eb778ef Mon Sep 17 00:00:00 2001 From: byteink Date: Thu, 30 Mar 2023 16:17:41 +0800 
Subject: [PATCH 0753/1411] Fix documentation of string_to_timestamp_nanos (#3977) --- arrow-cast/src/parse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 45a255626f6b..e2c7f9bcc2ca 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -265,7 +265,7 @@ pub fn string_to_datetime( /// relative to UTC, see [`string_to_datetime`] for alternative semantics /// /// For example, both `1997-01-31 09:26:56.123Z`, `1997-01-31T09:26:56.123`, -/// and `1997-01-31T14:26:56.123-05:00` will be parsed as the same value +/// and `1997-01-31T14:26:56.123+05:00` will be parsed as the same value /// #[inline] pub fn string_to_timestamp_nanos(s: &str) -> Result { From 9a4374fb92d3b552f6efa06ee0da2a4d7a2b6c02 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 11:53:15 +0100 Subject: [PATCH 0754/1411] Store Timezone as Arc (#3976) * Store Timezone as Arc * Fix serde * Fix chrono-tz * Format * Add construction example --- arrow-array/src/array/primitive_array.rs | 4 +- arrow-array/src/builder/primitive_builder.rs | 3 +- arrow-array/src/builder/struct_builder.rs | 2 +- arrow-cast/src/cast.rs | 18 +++---- arrow-csv/src/reader/mod.rs | 12 ++--- arrow-csv/src/writer.rs | 2 +- arrow-integration-test/src/datatype.rs | 2 +- arrow-integration-test/src/lib.rs | 8 +-- arrow-integration-test/src/schema.rs | 4 +- arrow-ipc/src/convert.rs | 8 +-- arrow-json/src/raw/mod.rs | 4 +- arrow-row/src/lib.rs | 4 +- arrow-schema/Cargo.toml | 2 +- arrow-schema/src/datatype.rs | 12 ++++- arrow-schema/src/ffi.rs | 9 ++-- arrow-select/src/take.rs | 2 +- arrow/tests/array_cast.rs | 2 +- arrow/tests/csv.rs | 12 ++--- parquet/src/arrow/schema/mod.rs | 56 +++++++------------- parquet/src/arrow/schema/primitive.rs | 2 +- 20 files changed, 76 insertions(+), 92 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index b463e016c852..f857e26c7f89 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1063,12 +1063,12 @@ impl PrimitiveArray { } /// Construct a timestamp array with an optional timezone - pub fn with_timezone_opt(&self, timezone: Option) -> Self { + pub fn with_timezone_opt>>(&self, timezone: Option) -> Self { let array_data = unsafe { self.data .clone() .into_builder() - .data_type(DataType::Timestamp(T::UNIT, timezone)) + .data_type(DataType::Timestamp(T::UNIT, timezone.map(Into::into))) .build_unchecked() }; PrimitiveArray::from(array_data) diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 1c2cd908ca26..6688d07b7055 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -575,8 +575,7 @@ mod tests { assert_eq!(array.precision(), 1); assert_eq!(array.scale(), 2); - let data_type = - DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".to_string())); + let data_type = DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())); let mut builder = TimestampNanosecondBuilder::new().with_data_type(data_type.clone()); builder.append_value(1); diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 51b4c7cfcdc6..5f362036a8cd 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -491,7 +491,7 @@ mod tests { Field::new("f1", DataType::Decimal128(1, 2), 
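// Aside: the string_to_timestamp_nanos doc fix above can be sanity-checked
// directly; all three spellings denote the same instant, and naive strings
// are interpreted relative to UTC by this function. A minimal sketch only;
// the function name below is illustrative, not part of the patch.
use arrow_cast::parse::string_to_timestamp_nanos;

fn same_instant_sketch() {
    let a = string_to_timestamp_nanos("1997-01-31 09:26:56.123Z").unwrap();
    let b = string_to_timestamp_nanos("1997-01-31T09:26:56.123").unwrap();
    let c = string_to_timestamp_nanos("1997-01-31T14:26:56.123+05:00").unwrap();
    assert_eq!(a, b);
    assert_eq!(a, c);
}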
false), Field::new( "f2", - DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".to_string())), + DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())), false, ), ]; diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 492c5db057a9..51cc69a7908a 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -541,7 +541,7 @@ macro_rules! cast_list_to_string { fn make_timestamp_array( array: &PrimitiveArray, unit: TimeUnit, - tz: Option, + tz: Option>, ) -> ArrayRef { match unit { TimeUnit::Second => Arc::new( @@ -2635,7 +2635,7 @@ fn cast_string_to_timestamp< TimestampType: ArrowTimestampType, >( array: &dyn Array, - to_tz: &Option, + to_tz: &Option>, cast_options: &CastOptions, ) -> Result { let string_array = array @@ -8034,7 +8034,7 @@ mod tests { let b = cast( &b, - &DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".to_string())), + &DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())), ) .unwrap(); let v = b.as_primitive::(); @@ -8044,7 +8044,7 @@ mod tests { let b = cast( &b, - &DataType::Timestamp(TimeUnit::Millisecond, Some("+02:00".to_string())), + &DataType::Timestamp(TimeUnit::Millisecond, Some("+02:00".into())), ) .unwrap(); let v = b.as_primitive::(); @@ -8055,7 +8055,7 @@ mod tests { #[test] fn test_cast_utf8_to_timestamp() { - fn test_tz(tz: String) { + fn test_tz(tz: Arc) { let valid = StringArray::from(vec![ "2023-01-01 04:05:06.789000-08:00", "2023-01-01 04:05:06.789000-07:00", @@ -8091,8 +8091,8 @@ mod tests { assert_eq!(1672531200000000000, c.value(8)); } - test_tz("+00:00".to_owned()); - test_tz("+02:00".to_owned()); + test_tz("+00:00".into()); + test_tz("+02:00".into()); } #[test] @@ -8119,11 +8119,11 @@ mod tests { let array = Arc::new(valid) as ArrayRef; let b = cast( &array, - &DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".to_owned())), + &DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())), ) .unwrap(); - let expect = DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".to_owned())); + let expect = DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())); assert_eq!(b.data_type(), &expect); let c = b diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 262c057d4283..46e97b1f848f 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -1739,7 +1739,7 @@ mod tests { } fn test_parse_timestamp_impl( - timezone: Option, + timezone: Option>, expected: &[i64], ) { let csv = [ @@ -1775,23 +1775,23 @@ mod tests { &[0, 0, -7_200_000_000_000], ); test_parse_timestamp_impl::( - Some("+00:00".to_string()), + Some("+00:00".into()), &[0, 0, -7_200_000_000_000], ); test_parse_timestamp_impl::( - Some("-05:00".to_string()), + Some("-05:00".into()), &[18_000_000_000_000, 0, -7_200_000_000_000], ); test_parse_timestamp_impl::( - Some("-03".to_string()), + Some("-03".into()), &[10_800_000_000, 0, -7_200_000_000], ); test_parse_timestamp_impl::( - Some("-03".to_string()), + Some("-03".into()), &[10_800_000, 0, -7_200_000], ); test_parse_timestamp_impl::( - Some("-03".to_string()), + Some("-03".into()), &[10_800, 0, -7_200], ); } diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 28a939d88f34..946803decf90 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -624,7 +624,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo let schema = Schema::new(vec![ Field::new( "c1", - DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".to_string())), + DataType::Timestamp(TimeUnit::Millisecond, 
Some("+00:00".into())), true, ), Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None), true), diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index ece64e16eb08..d0f4ca66fda9 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -89,7 +89,7 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { }; let tz = match map.get("timezone") { None => Ok(None), - Some(serde_json::Value::String(tz)) => Ok(Some(tz.clone())), + Some(Value::String(tz)) => Ok(Some(tz.as_str().into())), _ => Err(ArrowError::ParseError( "timezone must be a string".to_string(), )), diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index 6f9e8a4eb1aa..0b890ea33657 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -1104,10 +1104,10 @@ mod tests { #[test] fn test_arrow_data_equality() { - let secs_tz = Some("Europe/Budapest".to_string()); - let millis_tz = Some("America/New_York".to_string()); - let micros_tz = Some("UTC".to_string()); - let nanos_tz = Some("Africa/Johannesburg".to_string()); + let secs_tz = Some("Europe/Budapest".into()); + let millis_tz = Some("America/New_York".into()); + let micros_tz = Some("UTC".into()); + let nanos_tz = Some("Africa/Johannesburg".into()); let schema = Schema::new(vec![ Field::new("bools-with-metadata-map", DataType::Boolean, true).with_metadata( diff --git a/arrow-integration-test/src/schema.rs b/arrow-integration-test/src/schema.rs index 8147589390a3..bb17b1adb1ac 100644 --- a/arrow-integration-test/src/schema.rs +++ b/arrow-integration-test/src/schema.rs @@ -131,14 +131,14 @@ mod tests { Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false), Field::new( "c16", - DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".to_string())), + DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".into())), false, ), Field::new( "c17", DataType::Timestamp( TimeUnit::Microsecond, - Some("Africa/Johannesburg".to_string()), + Some("Africa/Johannesburg".into()), ), false, ), diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index aede8a448a06..8f8593cfd8f1 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -290,7 +290,7 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat } crate::Type::Timestamp => { let timestamp = field.type_as_timestamp().unwrap(); - let timezone: Option = timestamp.timezone().map(|tz| tz.to_string()); + let timezone: Option<_> = timestamp.timezone().map(|tz| tz.into()); match timestamp.unit() { crate::TimeUnit::SECOND => { DataType::Timestamp(TimeUnit::Second, timezone) @@ -636,8 +636,8 @@ pub(crate) fn get_fb_field_type<'a>( } } Timestamp(unit, tz) => { - let tz = tz.clone().unwrap_or_default(); - let tz_str = fbb.create_string(tz.as_str()); + let tz = tz.as_deref().unwrap_or_default(); + let tz_str = fbb.create_string(tz); let mut builder = crate::TimestampBuilder::new(fbb); let time_unit = match unit { TimeUnit::Second => crate::TimeUnit::SECOND, @@ -882,7 +882,7 @@ mod tests { "timestamp[us]", DataType::Timestamp( TimeUnit::Microsecond, - Some("Africa/Johannesburg".to_string()), + Some("Africa/Johannesburg".into()), ), false, ), diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index 21e6191ac7b2..b63763159a99 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -843,7 +843,7 @@ mod tests { {"c": "1997-01-31T14:26:56.123-05:00", "d": "1997-01-31"} "#; - let with_timezone 
= DataType::Timestamp(T::UNIT, Some("+08:00".to_string())); + let with_timezone = DataType::Timestamp(T::UNIT, Some("+08:00".into())); let schema = Arc::new(Schema::new(vec![ Field::new("a", T::DATA_TYPE, true), Field::new("b", T::DATA_TYPE, true), @@ -1092,7 +1092,7 @@ mod tests { do_test(DataType::Decimal128(2, 1)); do_test(DataType::Timestamp( TimeUnit::Microsecond, - Some("+00:00".to_string()), + Some("+00:00".into()), )); } } diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 2f0defe5268a..56b3ec2b36b0 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1533,12 +1533,12 @@ mod tests { // Construct dictionary with a timezone let dict = a.finish(); let values = TimestampNanosecondArray::from(dict.values().to_data()); - let dict_with_tz = dict.with_values(&values.with_timezone("+02:00".to_string())); + let dict_with_tz = dict.with_values(&values.with_timezone("+02:00")); let d = DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Timestamp( TimeUnit::Nanosecond, - Some("+02:00".to_string()), + Some("+02:00".into()), )), ); diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 89e82a0ff164..2ef08072a00d 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -38,7 +38,7 @@ path = "src/lib.rs" bench = false [dependencies] -serde = { version = "1.0", default-features = false, features = ["derive", "std"], optional = true } +serde = { version = "1.0", default-features = false, features = ["derive", "std", "rc"], optional = true } bitflags = { version = "2.0.0", default-features = false, optional = true } [features] diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 56eb6e8cef16..bcfea5a91023 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -16,6 +16,7 @@ // under the License. use std::fmt; +use std::sync::Arc; use crate::field::Field; @@ -131,7 +132,14 @@ pub enum DataType { /// empty to "Europe/Paris" would require converting the timestamp values /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is /// nevertheless correct). - Timestamp(TimeUnit, Option), + /// + /// ``` + /// # use arrow_schema::{DataType, TimeUnit}; + /// DataType::Timestamp(TimeUnit::Second, None); + /// DataType::Timestamp(TimeUnit::Second, Some("literal".into())); + /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into())); + /// ``` + Timestamp(TimeUnit, Option>), /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in days (32 bits). 
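// Aside: a minimal sketch of the Arc<str> timezone representation used above;
// string literals and owned Strings both convert via Into, so construction
// stays ergonomic while cloning the DataType avoids reallocating the string.
// The function name is illustrative only.
use std::sync::Arc;
use arrow_array::{Array, TimestampMillisecondArray};
use arrow_schema::{DataType, TimeUnit};

fn timezone_as_arc_str_sketch() {
    let tz: Arc<str> = Arc::from("+02:00");
    let dt = DataType::Timestamp(TimeUnit::Millisecond, Some(tz));
    let array = TimestampMillisecondArray::from(vec![1_i64, 2, 3]).with_timezone("+02:00");
    assert_eq!(array.data_type(), &dt);
}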
Date32, @@ -476,7 +484,7 @@ impl DataType { | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => 0, DataType::Timestamp(_, s) => { - s.as_ref().map(|s| s.capacity()).unwrap_or_default() + s.as_ref().map(|s| s.len()).unwrap_or_default() } DataType::List(field) | DataType::FixedSizeList(field, _) diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 058febbdd35c..e830f39052eb 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -36,6 +36,7 @@ use crate::{ArrowError, DataType, Field, Schema, TimeUnit, UnionMode}; use bitflags::bitflags; +use std::sync::Arc; use std::{ collections::HashMap, ffi::{c_char, c_void, CStr, CString}, @@ -514,16 +515,16 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { ["tsu", ""] => DataType::Timestamp(TimeUnit::Microsecond, None), ["tsn", ""] => DataType::Timestamp(TimeUnit::Nanosecond, None), ["tss", tz] => { - DataType::Timestamp(TimeUnit::Second, Some(tz.to_string())) + DataType::Timestamp(TimeUnit::Second, Some(Arc::from(*tz))) } ["tsm", tz] => { - DataType::Timestamp(TimeUnit::Millisecond, Some(tz.to_string())) + DataType::Timestamp(TimeUnit::Millisecond, Some(Arc::from(*tz))) } ["tsu", tz] => { - DataType::Timestamp(TimeUnit::Microsecond, Some(tz.to_string())) + DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from(*tz))) } ["tsn", tz] => { - DataType::Timestamp(TimeUnit::Nanosecond, Some(tz.to_string())) + DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from(*tz))) } _ => { return Err(ArrowError::CDataInterface(format!( diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 316f78d62f43..76909587db76 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -1364,7 +1364,7 @@ mod tests { let result = take_impl(&input, &index, None).unwrap(); match result.data_type() { DataType::Timestamp(TimeUnit::Nanosecond, tz) => { - assert_eq!(tz.clone(), Some("UTC".to_owned())) + assert_eq!(tz.clone(), Some("UTC".into())) } _ => panic!(), } diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 4c1f5019597c..7ee65a3575cd 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -350,7 +350,7 @@ fn create_decimal_array( // Get a selection of datatypes to try and cast to fn get_all_types() -> Vec { use DataType::*; - let tz_name = String::from("+08:00"); + let tz_name: Arc = Arc::from("+08:00"); let mut types = vec![ Null, diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs index dbb399948302..3ee319101757 100644 --- a/arrow/tests/csv.rs +++ b/arrow/tests/csv.rs @@ -25,10 +25,7 @@ fn test_export_csv_timestamps() { let schema = Schema::new(vec![ Field::new( "c1", - DataType::Timestamp( - TimeUnit::Millisecond, - Some("Australia/Sydney".to_string()), - ), + DataType::Timestamp(TimeUnit::Millisecond, Some("Australia/Sydney".into())), true, ), Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None), true), @@ -68,10 +65,7 @@ fn test_export_csv_timestamps_using_rfc3339() { let schema = Schema::new(vec![ Field::new( "c1", - DataType::Timestamp( - TimeUnit::Millisecond, - Some("Australia/Sydney".to_string()), - ), + DataType::Timestamp(TimeUnit::Millisecond, Some("Australia/Sydney".into())), true, ), Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None), true), @@ -85,7 +79,7 @@ fn test_export_csv_timestamps_using_rfc3339() { // vec![Some(1555584887378), Some(1635577147000)], ) - .with_timezone("Australia/Sydney".to_string()); + .with_timezone("Australia/Sydney"); let c2 = TimestampMillisecondArray::from(vec![Some(1555584887378), 
Some(1635577147000)]); let batch = diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index a000a4656bf9..a63d859aaf7b 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -327,7 +327,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { Type::primitive_type_builder(name, PhysicalType::INT64) .with_logical_type(Some(LogicalType::Timestamp { // If timezone set, values are normalized to UTC timezone - is_adjusted_to_u_t_c: matches!(tz, Some(z) if !z.as_str().is_empty()), + is_adjusted_to_u_t_c: matches!(tz, Some(z) if !z.as_ref().is_empty()), unit: match time_unit { TimeUnit::Second => unreachable!(), TimeUnit::Millisecond => { @@ -507,7 +507,9 @@ fn arrow_to_parquet_type(field: &Field) -> Result { let dict_field = Field::new(name, *value.clone(), field.is_nullable()); arrow_to_parquet_type(&dict_field) } - DataType::RunEndEncoded(_, _) => Err(arrow_err!("Converting RunEndEncodedType to parquet not supported",)) + DataType::RunEndEncoded(_, _) => Err(arrow_err!( + "Converting RunEndEncodedType to parquet not supported", + )), } } @@ -641,7 +643,7 @@ mod tests { ProjectionMask::all(), None, ) - .unwrap(); + .unwrap(); assert_eq!(&arrow_fields, converted_arrow_schema.fields()); } @@ -1317,7 +1319,7 @@ mod tests { ), Field::new( "ts_nano", - DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".to_string())), + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), false, ), Field::new( @@ -1343,20 +1345,9 @@ mod tests { ))), false, ), - Field::new( - "decimal_int32", - DataType::Decimal128(8, 2), - false, - ), - Field::new( - "decimal_int64", - DataType::Decimal128(16, 2), - false, - ), - Field::new( - "decimal_fix_length", - DataType::Decimal128(30, 2), - false, ), + Field::new("decimal_int32", DataType::Decimal128(8, 2), false), + Field::new("decimal_int64", DataType::Decimal128(16, 2), false), + Field::new("decimal_fix_length", DataType::Decimal128(30, 2), false), ]; assert_eq!(arrow_fields, converted_arrow_fields); @@ -1447,27 +1438,27 @@ mod tests { ), Field::new( "ts_seconds", - DataType::Timestamp(TimeUnit::Second, Some("UTC".to_string())), + DataType::Timestamp(TimeUnit::Second, Some("UTC".into())), false, ), Field::new( "ts_micro_utc", - DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".to_string())), + DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())), false, ), Field::new( "ts_millis_zero_offset", - DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".to_string())), + DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())), false, ), Field::new( "ts_millis_zero_negative_offset", - DataType::Timestamp(TimeUnit::Millisecond, Some("-00:00".to_string())), + DataType::Timestamp(TimeUnit::Millisecond, Some("-00:00".into())), false, ), Field::new( "ts_micro_non_utc", - DataType::Timestamp(TimeUnit::Microsecond, Some("+01:00".to_string())), + DataType::Timestamp(TimeUnit::Microsecond, Some("+01:00".into())), false, ), Field::new( @@ -1492,18 +1483,9 @@ mod tests { DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), false, ), - Field::new( - "decimal_int32", - DataType::Decimal128(8, 2), - false), - Field::new("decimal_int64", - DataType::Decimal128(16, 2), - false), - Field::new( - "decimal_fix_length", - DataType::Decimal128(30, 2), - false, - ), + Field::new("decimal_int32", DataType::Decimal128(8, 2), false), + Field::new("decimal_int64", DataType::Decimal128(16, 2), false), + Field::new("decimal_fix_length", DataType::Decimal128(30, 2), false), ]; let arrow_schema 
= Schema::new(arrow_fields); let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema).unwrap(); @@ -1594,14 +1576,14 @@ mod tests { Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false), Field::new( "c16", - DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".to_string())), + DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".into())), false, ), Field::new( "c17", DataType::Timestamp( TimeUnit::Microsecond, - Some("Africa/Johannesburg".to_string()), + Some("Africa/Johannesburg".into()), ), false, ), diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index e95db2b033e5..6565f7eaeefb 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -198,7 +198,7 @@ fn from_int64(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result TimeUnit::Nanosecond, }, if is_adjusted_to_u_t_c { - Some("UTC".to_string()) + Some("UTC".into()) } else { None }, From abf5367d4828b71e152bc159ce7c70c86181eebc Mon Sep 17 00:00:00 2001 From: Yang Xiufeng Date: Thu, 30 Mar 2023 18:55:47 +0800 Subject: [PATCH 0755/1411] Fix(flight_sql): PreparedStatement has no token for auth. (#3948) * Fix(flight_sql): PreparedStatement need FlightSqlServiceClient to set headers. In particular, the token is required for auth in each request. * refactor: make FlightSqlServiceClient generic. * test: example FlightSqlServiceImpl check token for each request . * remove FlightSqlServiceClient::get_flight_info. * keep consistent of do_get/do_action/do_put. * code reuse in tests of example FlightSqlServiceImpl. * add cases for auth failure. --- arrow-flight/examples/flight_sql_server.rs | 180 ++++++++++++++------- arrow-flight/src/bin/flight_sql_client.rs | 6 +- arrow-flight/src/sql/client.rs | 67 +++++--- 3 files changed, 168 insertions(+), 85 deletions(-) diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index bc9d24656913..08744b65f7ac 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -59,10 +59,35 @@ macro_rules! status { }; } +const FAKE_TOKEN: &str = "uuid_token"; +const FAKE_HANDLE: &str = "uuid_handle"; +const FAKE_UPDATE_RESULT: i64 = 1; + #[derive(Clone)] pub struct FlightSqlServiceImpl {} impl FlightSqlServiceImpl { + fn check_token(&self, req: &Request) -> Result<(), Status> { + let metadata = req.metadata(); + let auth = metadata.get("authorization").ok_or_else(|| { + Status::internal(format!("No authorization header! 
metadata = {metadata:?}")) + })?; + let str = auth + .to_str() + .map_err(|e| Status::internal(format!("Error parsing header: {e}")))?; + let authorization = str.to_string(); + let bearer = "Bearer "; + if !authorization.starts_with(bearer) { + Err(Status::internal("Invalid auth header!"))?; + } + let token = authorization[bearer.len()..].to_string(); + if token == FAKE_TOKEN { + Ok(()) + } else { + Err(Status::unauthenticated("invalid token ")) + } + } + fn fake_result() -> Result { let schema = Schema::new(vec![Field::new("salutation", DataType::Utf8, false)]); let mut builder = StringBuilder::new(); @@ -70,10 +95,6 @@ impl FlightSqlServiceImpl { let cols = vec![Arc::new(builder.finish()) as ArrayRef]; RecordBatch::try_new(Arc::new(schema), cols) } - - fn fake_update_result() -> i64 { - 1 - } } #[tonic::async_trait] @@ -118,7 +139,7 @@ impl FlightSqlService for FlightSqlServiceImpl { let result = HandshakeResponse { protocol_version: 0, - payload: "random_uuid_token".into(), + payload: FAKE_TOKEN.into(), }; let result = Ok(result); let output = futures::stream::iter(vec![result]); @@ -127,9 +148,10 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_get_fallback( &self, - _request: Request, + request: Request, _message: Any, ) -> Result::DoGetStream>, Status> { + self.check_token(&request)?; let batch = Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; let schema = (*batch.schema()).clone(); @@ -158,8 +180,9 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn get_flight_info_prepared_statement( &self, cmd: CommandPreparedStatementQuery, - _request: Request, + request: Request, ) -> Result, Status> { + self.check_token(&request)?; let handle = std::str::from_utf8(&cmd.prepared_statement_handle) .map_err(|e| status!("Unable to parse handle", e))?; let batch = @@ -395,7 +418,7 @@ impl FlightSqlService for FlightSqlServiceImpl { _ticket: CommandStatementUpdate, _request: Request>, ) -> Result { - Ok(FlightSqlServiceImpl::fake_update_result()) + Ok(FAKE_UPDATE_RESULT) } async fn do_put_prepared_statement_query( @@ -421,9 +444,9 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_action_create_prepared_statement( &self, _query: ActionCreatePreparedStatementRequest, - _request: Request, + request: Request, ) -> Result { - let handle = "some_uuid"; + self.check_token(&request)?; let schema = Self::fake_result() .map_err(|e| status!("Error getting result schema", e))? 
.schema(); @@ -432,7 +455,7 @@ impl FlightSqlService for FlightSqlServiceImpl { .map_err(|e| status!("Unable to serialize schema", e))?; let IpcMessage(schema_bytes) = message; let res = ActionCreatePreparedStatementResult { - prepared_statement_handle: handle.into(), + prepared_statement_handle: FAKE_HANDLE.into(), dataset_schema: schema_bytes, parameter_schema: Default::default(), // TODO: parameters }; @@ -505,12 +528,13 @@ mod tests { use super::*; use futures::TryStreamExt; use std::fs; + use std::future::Future; use std::time::Duration; use tempfile::NamedTempFile; use tokio::net::{UnixListener, UnixStream}; use tokio::time::sleep; use tokio_stream::wrappers::UnixListenerStream; - use tonic::transport::ClientTlsConfig; + use tonic::transport::{Channel, ClientTlsConfig}; use arrow_cast::pretty::pretty_format_batches; use arrow_flight::sql::client::FlightSqlServiceClient; @@ -518,7 +542,7 @@ mod tests { use tonic::transport::{Certificate, Endpoint}; use tower::service_fn; - async fn client_with_uds(path: String) -> FlightSqlServiceClient { + async fn client_with_uds(path: String) -> FlightSqlServiceClient { let connector = service_fn(move |_| UnixStream::connect(path.clone())); let channel = Endpoint::try_from("http://example.com") .unwrap() @@ -549,6 +573,20 @@ mod tests { .await } + fn endpoint(addr: String) -> Result { + let endpoint = Endpoint::new(addr) + .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? + .connect_timeout(Duration::from_secs(20)) + .timeout(Duration::from_secs(20)) + .tcp_nodelay(true) // Disable Nagle's Algorithm since we don't want packets to wait + .tcp_keepalive(Option::Some(Duration::from_secs(3600))) + .http2_keep_alive_interval(Duration::from_secs(300)) + .keep_alive_timeout(Duration::from_secs(20)) + .keep_alive_while_idle(true); + + Ok(endpoint) + } + #[tokio::test] async fn test_select_https() { tokio::spawn(async { @@ -573,6 +611,7 @@ mod tests { let channel = endpoint.connect().await.unwrap(); let mut client = FlightSqlServiceClient::new(channel); let token = client.handshake("admin", "password").await.unwrap(); + client.set_token(String::from_utf8(token.to_vec()).unwrap()); println!("Auth succeeded with token: {:?}", token); let mut stmt = client.prepare("select 1;".to_string()).await.unwrap(); let flight_info = stmt.execute().await.unwrap(); @@ -597,8 +636,16 @@ mod tests { } } - #[tokio::test] - async fn test_select_1() { + async fn auth_client(client: &mut FlightSqlServiceClient) { + let token = client.handshake("admin", "password").await.unwrap(); + client.set_token(String::from_utf8(token.to_vec()).unwrap()); + } + + async fn test_client(f: F) + where + F: FnOnce(FlightSqlServiceClient) -> C, + C: Future, + { let file = NamedTempFile::new().unwrap(); let path = file.into_temp_path().to_str().unwrap().to_string(); let _ = fs::remove_file(path.clone()); @@ -613,9 +660,20 @@ mod tests { .serve_with_incoming(stream); let request_future = async { - let mut client = client_with_uds(path).await; - let token = client.handshake("admin", "password").await.unwrap(); - println!("Auth succeeded with token: {:?}", token); + let client = client_with_uds(path).await; + f(client).await + }; + + tokio::select! 
{ + _ = serve_future => panic!("server returned first"), + _ = request_future => println!("Client finished!"), + } + } + + #[tokio::test] + async fn test_select_1() { + test_client(|mut client| async move { + auth_client(&mut client).await; let mut stmt = client.prepare("select 1;".to_string()).await.unwrap(); let flight_info = stmt.execute().await.unwrap(); let ticket = flight_info.endpoint[0].ticket.as_ref().unwrap().clone(); @@ -632,57 +690,61 @@ mod tests { .trim() .to_string(); assert_eq!(res.to_string(), expected); - }; - - tokio::select! { - _ = serve_future => panic!("server returned first"), - _ = request_future => println!("Client finished!"), - } + }) + .await } #[tokio::test] async fn test_execute_update() { - let file = NamedTempFile::new().unwrap(); - let path = file.into_temp_path().to_str().unwrap().to_string(); - let _ = fs::remove_file(path.clone()); - - let uds = UnixListener::bind(path.clone()).unwrap(); - let stream = UnixListenerStream::new(uds); - - // We would just listen on TCP, but it seems impossible to know when tonic is ready to serve - let service = FlightSqlServiceImpl {}; - let serve_future = Server::builder() - .add_service(FlightServiceServer::new(service)) - .serve_with_incoming(stream); - - let request_future = async { - let mut client = client_with_uds(path).await; - let token = client.handshake("admin", "password").await.unwrap(); - println!("Auth succeeded with token: {:?}", token); + test_client(|mut client| async move { + auth_client(&mut client).await; let res = client .execute_update("creat table test(a int);".to_string()) .await .unwrap(); - assert_eq!(res, FlightSqlServiceImpl::fake_update_result()); - }; - - tokio::select! { - _ = serve_future => panic!("server returned first"), - _ = request_future => println!("Client finished!"), - } + assert_eq!(res, FAKE_UPDATE_RESULT); + }) + .await } - fn endpoint(addr: String) -> Result { - let endpoint = Endpoint::new(addr) - .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? 
- .connect_timeout(Duration::from_secs(20)) - .timeout(Duration::from_secs(20)) - .tcp_nodelay(true) // Disable Nagle's Algorithm since we don't want packets to wait - .tcp_keepalive(Option::Some(Duration::from_secs(3600))) - .http2_keep_alive_interval(Duration::from_secs(300)) - .keep_alive_timeout(Duration::from_secs(20)) - .keep_alive_while_idle(true); + #[tokio::test] + async fn test_auth() { + test_client(|mut client| async move { + // no handshake + assert!(client + .prepare("select 1;".to_string()) + .await + .unwrap_err() + .to_string() + .contains("No authorization header")); - Ok(endpoint) + // Invalid credentials + assert!(client + .handshake("admin", "password2") + .await + .unwrap_err() + .to_string() + .contains("Invalid credentials")); + + // forget to set_token + client.handshake("admin", "password").await.unwrap(); + assert!(client + .prepare("select 1;".to_string()) + .await + .unwrap_err() + .to_string() + .contains("No authorization header")); + + // Invalid Tokens + client.handshake("admin", "password").await.unwrap(); + client.set_token("wrong token".to_string()); + assert!(client + .prepare("select 1;".to_string()) + .await + .unwrap_err() + .to_string() + .contains("invalid token")); + }) + .await } } diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs index c6a46a387d01..1891a331be96 100644 --- a/arrow-flight/src/bin/flight_sql_client.rs +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -25,7 +25,7 @@ use arrow_flight::{ use arrow_schema::{ArrowError, Schema}; use clap::Parser; use futures::TryStreamExt; -use tonic::transport::{ClientTlsConfig, Endpoint}; +use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; use tracing_log::log::info; /// A ':' separated key value pair @@ -140,7 +140,9 @@ fn setup_logging() { tracing_subscriber::fmt::init(); } -async fn setup_client(args: ClientArgs) -> Result { +async fn setup_client( + args: ClientArgs, +) -> Result, ArrowError> { let port = args.port.unwrap_or(if args.tls { 443 } else { 80 }); let protocol = if args.tls { "https" } else { "http" }; diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index a61f06d32922..a8868fba1867 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -35,7 +35,7 @@ use crate::sql::{ }; use crate::{ Action, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, - HandshakeResponse, IpcMessage, Ticket, + HandshakeResponse, IpcMessage, PutResult, Ticket, }; use arrow_array::RecordBatch; use arrow_buffer::Buffer; @@ -51,16 +51,16 @@ use tonic::{IntoRequest, Streaming}; /// A FlightSQLServiceClient is an endpoint for retrieving or storing Arrow data /// by FlightSQL protocol. #[derive(Debug, Clone)] -pub struct FlightSqlServiceClient { +pub struct FlightSqlServiceClient { token: Option, headers: HashMap, - flight_client: FlightServiceClient, + flight_client: FlightServiceClient, } /// A FlightSql protocol client that can run queries against FlightSql servers /// This client is in the "experimental" stage. It is not guaranteed to follow the spec in all instances. /// Github issues are welcomed. -impl FlightSqlServiceClient { +impl FlightSqlServiceClient { /// Creates a new FlightSql client that connects to a server over an arbitrary tonic `Channel` pub fn new(channel: Channel) -> Self { let flight_client = FlightServiceClient::new(channel); @@ -212,7 +212,7 @@ impl FlightSqlServiceClient { /// Given a flight ticket, request to be sent the stream. 
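// Aside: a minimal sketch of the client-side flow these changes enable; the
// token returned by handshake() must be stored via set_token() so that every
// later call (prepare, execute, do_get, ...) carries the authorization header.
// Assumes an already connected tonic `channel`; the function name is
// illustrative and error handling is reduced to `?`.
use arrow_flight::sql::client::FlightSqlServiceClient;
use arrow_schema::ArrowError;
use tonic::transport::Channel;

async fn authenticated_select(channel: Channel) -> Result<(), ArrowError> {
    let mut client = FlightSqlServiceClient::new(channel);
    let token = client.handshake("admin", "password").await?;
    client.set_token(String::from_utf8(token.to_vec()).unwrap());
    let mut stmt = client.prepare("select 1;".to_string()).await?;
    let info = stmt.execute().await?;
    let ticket = info.endpoint[0].ticket.as_ref().unwrap().clone();
    let _stream = client.do_get(ticket).await?;
    Ok(())
}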
Returns record batch stream reader pub async fn do_get( &mut self, - ticket: Ticket, + ticket: impl IntoRequest, ) -> Result, ArrowError> { let req = self.set_request_headers(ticket.into_request())?; Ok(self @@ -223,6 +223,34 @@ impl FlightSqlServiceClient { .into_inner()) } + /// Push a stream to the flight service associated with a particular flight stream. + pub async fn do_put( + &mut self, + request: impl tonic::IntoStreamingRequest, + ) -> Result, ArrowError> { + let req = self.set_request_headers(request.into_streaming_request())?; + Ok(self + .flight_client + .do_put(req) + .await + .map_err(status_to_arrow_error)? + .into_inner()) + } + + /// DoAction allows a flight client to do a specific action against a flight service + pub async fn do_action( + &mut self, + request: impl IntoRequest, + ) -> Result, ArrowError> { + let req = self.set_request_headers(request.into_request())?; + Ok(self + .flight_client + .do_action(req) + .await + .map_err(status_to_arrow_error)? + .into_inner()) + } + /// Request a list of tables. pub async fn get_tables( &mut self, @@ -316,7 +344,7 @@ impl FlightSqlServiceClient { _ => Schema::try_from(IpcMessage(prepared_result.parameter_schema))?, }; Ok(PreparedStatement::new( - self.flight_client.clone(), + self.clone(), prepared_result.prepared_statement_handle, dataset_schema, parameter_schema, @@ -354,7 +382,7 @@ impl FlightSqlServiceClient { /// A PreparedStatement #[derive(Debug, Clone)] pub struct PreparedStatement { - flight_client: FlightServiceClient, + flight_sql_client: FlightSqlServiceClient, parameter_binding: Option, handle: Bytes, dataset_schema: Schema, @@ -363,13 +391,13 @@ pub struct PreparedStatement { impl PreparedStatement { pub(crate) fn new( - flight_client: FlightServiceClient, + flight_client: FlightSqlServiceClient, handle: impl Into, dataset_schema: Schema, parameter_schema: Schema, ) -> Self { PreparedStatement { - flight_client, + flight_sql_client: flight_client, parameter_binding: None, handle: handle.into(), dataset_schema, @@ -382,13 +410,10 @@ impl PreparedStatement { let cmd = CommandPreparedStatementQuery { prepared_statement_handle: self.handle.clone(), }; - let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); let result = self - .flight_client - .get_flight_info(descriptor) - .await - .map_err(status_to_arrow_error)? - .into_inner(); + .flight_sql_client + .get_flight_info_for_command(cmd) + .await?; Ok(result) } @@ -399,14 +424,12 @@ impl PreparedStatement { }; let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); let mut result = self - .flight_client + .flight_sql_client .do_put(stream::iter(vec![FlightData { flight_descriptor: Some(descriptor), ..Default::default() }])) - .await - .map_err(status_to_arrow_error)? 
- .into_inner(); + .await?; let result = result .message() .await @@ -447,11 +470,7 @@ impl PreparedStatement { r#type: CLOSE_PREPARED_STATEMENT.to_string(), body: cmd.as_any().encode_to_vec().into(), }; - let _ = self - .flight_client - .do_action(action) - .await - .map_err(status_to_arrow_error)?; + let _ = self.flight_sql_client.do_action(action).await?; Ok(()) } } From a9ac325b4b3a3d49e24eb5032b4a1b38dc887f84 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 11:57:39 +0100 Subject: [PATCH 0756/1411] Add Fields abstraction (#3955) (#3965) * Add Fields abstraction (#3955) * Fix ffi * Fix array_cast * Fix doc * Add Into for StructBuilder * Further SchemaBuilder utilities --- arrow-array/src/array/map_array.rs | 5 +- arrow-array/src/array/mod.rs | 8 +- arrow-array/src/array/struct_array.rs | 121 ++++++----- arrow-array/src/builder/struct_builder.rs | 101 +++++---- arrow-array/src/record_batch.rs | 51 ++--- arrow-cast/src/cast.rs | 6 +- arrow-cast/src/pretty.rs | 12 +- arrow-csv/src/reader/mod.rs | 12 +- arrow-data/src/data/mod.rs | 3 +- arrow-flight/src/encode.rs | 6 +- arrow-flight/tests/encode_decode.rs | 6 +- arrow-integration-test/src/datatype.rs | 4 +- arrow-integration-test/src/field.rs | 33 ++- arrow-integration-test/src/lib.rs | 14 +- arrow-integration-test/src/schema.rs | 33 +-- .../src/bin/arrow-json-integration-test.rs | 20 +- arrow-ipc/src/convert.rs | 36 ++-- arrow-ipc/src/reader.rs | 14 +- arrow-ipc/src/writer.rs | 2 +- arrow-json/src/raw/mod.rs | 42 ++-- arrow-json/src/raw/struct_array.rs | 4 +- arrow-json/src/reader.rs | 59 +++--- arrow-json/src/writer.rs | 26 ++- arrow-schema/src/datatype.rs | 42 ++-- arrow-schema/src/ffi.rs | 25 ++- arrow-schema/src/field.rs | 49 +++-- arrow-schema/src/fields.rs | 150 ++++++++++++++ arrow-schema/src/lib.rs | 2 + arrow-schema/src/schema.rs | 194 +++++++++++++----- arrow-select/src/nullif.rs | 6 +- arrow-select/src/take.rs | 18 +- arrow/examples/dynamic_types.rs | 5 +- arrow/src/compute/kernels/limit.rs | 2 +- arrow/src/datatypes/mod.rs | 3 +- arrow/src/util/data_gen.rs | 9 +- arrow/tests/array_cast.rs | 8 +- arrow/tests/array_equal.rs | 58 +++--- arrow/tests/array_transform.rs | 6 +- arrow/tests/array_validation.rs | 4 +- arrow/tests/schema.rs | 11 +- parquet/benches/arrow_writer.rs | 20 +- parquet/src/arrow/array_reader/builder.rs | 36 ++-- parquet/src/arrow/array_reader/empty_array.rs | 4 +- parquet/src/arrow/array_reader/list_array.rs | 9 +- parquet/src/arrow/array_reader/map_array.rs | 5 +- .../src/arrow/array_reader/struct_array.rs | 5 +- parquet/src/arrow/arrow_reader/mod.rs | 60 +++--- parquet/src/arrow/arrow_writer/levels.rs | 23 ++- parquet/src/arrow/arrow_writer/mod.rs | 59 +++--- parquet/src/arrow/schema/complex.rs | 17 +- parquet/src/arrow/schema/mod.rs | 108 +++++----- 51 files changed, 935 insertions(+), 621 deletions(-) create mode 100644 arrow-schema/src/fields.rs diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 439aaf7064de..112789fd51e8 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -275,6 +275,7 @@ mod tests { use crate::cast::AsArray; use crate::types::UInt32Type; use crate::{Int32Array, UInt32Array}; + use arrow_schema::Fields; use std::sync::Arc; use super::*; @@ -515,10 +516,10 @@ mod tests { fn test_from_array_data_validation() { // A DictionaryArray has similar buffer layout to a MapArray // but the meaning of the values differs - let struct_t = 
DataType::Struct(vec![ + let struct_t = DataType::Struct(Fields::from(vec![ Field::new("keys", DataType::Int32, true), Field::new("values", DataType::UInt32, true), - ]); + ])); let dict_t = DataType::Dictionary(Box::new(DataType::Int32), Box::new(struct_t)); let _ = MapArray::from(ArrayData::new_empty(&dict_t)); } diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 1e4019b4f61d..8d20c6cb2ad4 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -740,7 +740,7 @@ mod tests { use crate::cast::{as_union_array, downcast_array}; use crate::downcast_run_array; use arrow_buffer::{Buffer, MutableBuffer}; - use arrow_schema::{Field, UnionMode}; + use arrow_schema::{Field, Fields, UnionMode}; #[test] fn test_empty_primitive() { @@ -794,7 +794,7 @@ mod tests { // It is possible to create a null struct containing a non-nullable child // see https://github.com/apache/arrow-rs/pull/3244 for details let struct_type = - DataType::Struct(vec![Field::new("data", DataType::Int64, false)]); + DataType::Struct(vec![Field::new("data", DataType::Int64, false)].into()); let array = new_null_array(&struct_type, 9); let a = array.as_any().downcast_ref::().unwrap(); @@ -837,10 +837,10 @@ mod tests { let data_type = DataType::Map( Box::new(Field::new( "entry", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", DataType::Int32, true), - ]), + ])), false, )), false, diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 4c9613afbf88..0604f71d3294 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. 
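// Aside: a small sketch of the construction paths touched below; a StructArray
// can be assembled from (Field, ArrayRef) pairs, and with this change a
// RecordBatch converts into a StructArray directly. The function name is
// illustrative only.
use std::sync::Arc;
use arrow_array::{ArrayRef, Int32Array, RecordBatch, StructArray};
use arrow_schema::{DataType, Field};

fn struct_array_sketch() {
    let ints: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let strct = StructArray::from(vec![(
        Field::new("a", DataType::Int32, false),
        ints.clone(),
    )]);
    assert_eq!(strct.len(), 3);

    let batch = RecordBatch::try_from_iter([("a", ints)]).unwrap();
    let from_batch = StructArray::from(batch);
    assert_eq!(from_batch.num_columns(), 1);
}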
-use crate::{make_array, Array, ArrayRef}; +use crate::{make_array, Array, ArrayRef, RecordBatch}; use arrow_buffer::{buffer_bin_or, Buffer, NullBuffer}; use arrow_data::ArrayData; -use arrow_schema::{ArrowError, DataType, Field}; +use arrow_schema::{ArrowError, DataType, Field, SchemaBuilder}; use std::sync::Arc; use std::{any::Any, ops::Index}; @@ -157,11 +157,11 @@ impl TryFrom> for StructArray { len = Some(child_datum_len) } child_data.push(child_datum.clone()); - fields.push(Field::new( + fields.push(Arc::new(Field::new( field_name, array.data_type().clone(), child_datum.nulls().is_some(), - )); + ))); if let Some(child_nulls) = child_datum.nulls() { null = Some(if let Some(null_buffer) = &null { @@ -182,7 +182,7 @@ impl TryFrom> for StructArray { } let len = len.unwrap(); - let builder = ArrayData::builder(DataType::Struct(fields)) + let builder = ArrayData::builder(DataType::Struct(fields.into())) .len(len) .null_bit_buffer(null) .child_data(child_data); @@ -221,29 +221,32 @@ impl Array for StructArray { impl From> for StructArray { fn from(v: Vec<(Field, ArrayRef)>) -> Self { - let (field_types, field_values): (Vec<_>, Vec<_>) = v.into_iter().unzip(); - - let length = field_values.get(0).map(|a| a.len()).unwrap_or(0); - field_types.iter().zip(field_values.iter()).for_each( - |(field_type, field_value)| { - // Check the length of the child arrays - assert_eq!( - length, - field_value.len(), - "all child arrays of a StructArray must have the same length" - ); - // Check data types of child arrays - assert_eq!( - field_type.data_type(), - field_value.data().data_type(), - "the field data types must match the array data in a StructArray" - ); - }, - ); - + let iter = v.into_iter(); + let capacity = iter.size_hint().0; + + let mut len = None; + let mut schema = SchemaBuilder::with_capacity(capacity); + let mut child_data = Vec::with_capacity(capacity); + for (field, array) in iter { + // Check the length of the child arrays + assert_eq!( + *len.get_or_insert(array.len()), + array.len(), + "all child arrays of a StructArray must have the same length" + ); + // Check data types of child arrays + assert_eq!( + field.data_type(), + array.data_type(), + "the field data types must match the array data in a StructArray" + ); + schema.push(field); + child_data.push(array.to_data()); + } + let field_types = schema.finish().fields; let array_data = ArrayData::builder(DataType::Struct(field_types)) - .child_data(field_values.into_iter().map(|a| a.into_data()).collect()) - .len(length); + .child_data(child_data) + .len(len.unwrap_or_default()); let array_data = unsafe { array_data.build_unchecked() }; // We must validate nullability @@ -274,30 +277,31 @@ impl std::fmt::Debug for StructArray { impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray { fn from(pair: (Vec<(Field, ArrayRef)>, Buffer)) -> Self { - let (field_types, field_values): (Vec<_>, Vec<_>) = pair.0.into_iter().unzip(); - - let length = field_values.get(0).map(|a| a.len()).unwrap_or(0); - field_types.iter().zip(field_values.iter()).for_each( - |(field_type, field_value)| { - // Check the length of the child arrays - assert_eq!( - length, - field_value.len(), - "all child arrays of a StructArray must have the same length" - ); - // Check data types of child arrays - assert_eq!( - field_type.data_type(), - field_value.data().data_type(), - "the field data types must match the array data in a StructArray" - ); - }, - ); - + let capacity = pair.0.len(); + let mut len = None; + let mut schema = 
SchemaBuilder::with_capacity(capacity); + let mut child_data = Vec::with_capacity(capacity); + for (field, array) in pair.0 { + // Check the length of the child arrays + assert_eq!( + *len.get_or_insert(array.len()), + array.len(), + "all child arrays of a StructArray must have the same length" + ); + // Check data types of child arrays + assert_eq!( + field.data_type(), + array.data_type(), + "the field data types must match the array data in a StructArray" + ); + schema.push(field); + child_data.push(array.to_data()); + } + let field_types = schema.finish().fields; let array_data = ArrayData::builder(DataType::Struct(field_types)) .null_bit_buffer(Some(pair.1)) - .child_data(field_values.into_iter().map(|a| a.into_data()).collect()) - .len(length); + .child_data(child_data) + .len(len.unwrap_or_default()); let array_data = unsafe { array_data.build_unchecked() }; // We must validate nullability @@ -307,6 +311,21 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray { } } +impl From for StructArray { + fn from(value: RecordBatch) -> Self { + // TODO: Don't store ArrayData inside arrays (#3880) + let builder = ArrayData::builder(DataType::Struct(value.schema().fields.clone())) + .child_data(value.columns().iter().map(|x| x.to_data()).collect()) + .len(value.num_rows()); + + // Safety: RecordBatch must be valid + Self { + data: unsafe { builder.build_unchecked() }, + boxed_fields: value.columns().to_vec(), + } + } +} + impl Index<&str> for StructArray { type Output = ArrayRef; @@ -345,7 +364,7 @@ mod tests { Field::new("a", DataType::Boolean, false), Field::new("b", DataType::Int64, false), ]; - let struct_array_data = ArrayData::builder(DataType::Struct(fields)) + let struct_array_data = ArrayData::builder(DataType::Struct(fields.into())) .len(4) .add_child_data(boolean_data.clone()) .add_child_data(int_data.clone()) @@ -514,7 +533,7 @@ mod tests { Field::new("a", DataType::Boolean, true), Field::new("b", DataType::Int32, true), ]; - let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) + let struct_array_data = ArrayData::builder(DataType::Struct(field_types.into())) .len(5) .add_child_data(boolean_data.clone()) .add_child_data(int_data.clone()) diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 5f362036a8cd..7371df3b021c 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -20,7 +20,7 @@ use crate::builder::*; use crate::{Array, ArrayRef, StructArray}; use arrow_buffer::Buffer; use arrow_data::ArrayData; -use arrow_schema::{DataType, Field, IntervalUnit, TimeUnit}; +use arrow_schema::{DataType, Fields, IntervalUnit, TimeUnit}; use std::any::Any; use std::sync::Arc; @@ -29,7 +29,7 @@ use std::sync::Arc; /// Note that callers should make sure that methods of all the child field builders are /// properly called to maintain the consistency of the data structure. 
pub struct StructBuilder { - fields: Vec, + fields: Fields, field_builders: Vec>, null_buffer_builder: NullBufferBuilder, } @@ -180,16 +180,20 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box, field_builders: Vec>) -> Self { + pub fn new( + fields: impl Into, + field_builders: Vec>, + ) -> Self { Self { - fields, field_builders, + fields: fields.into(), null_buffer_builder: NullBufferBuilder::new(0), } } - /// Creates a new `StructBuilder` from vector of [`Field`] with `capacity` - pub fn from_fields(fields: Vec, capacity: usize) -> Self { + /// Creates a new `StructBuilder` from [`Fields`] and `capacity` + pub fn from_fields(fields: impl Into, capacity: usize) -> Self { + let fields = fields.into(); let mut builders = Vec::with_capacity(fields.len()); for field in &fields { builders.push(make_builder(field.data_type(), capacity)); @@ -284,6 +288,7 @@ impl StructBuilder { mod tests { use super::*; use arrow_buffer::Buffer; + use arrow_schema::Field; use crate::array::Array; @@ -292,12 +297,14 @@ mod tests { let string_builder = StringBuilder::new(); let int_builder = Int32Builder::new(); - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Utf8, false)); - field_builders.push(Box::new(string_builder) as Box); - fields.push(Field::new("f2", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); + let fields = vec![ + Field::new("f1", DataType::Utf8, false), + Field::new("f2", DataType::Int32, false), + ]; + let field_builders = vec![ + Box::new(string_builder) as Box, + Box::new(int_builder) as Box, + ]; let mut builder = StructBuilder::new(fields, field_builders); assert_eq!(2, builder.num_fields()); @@ -354,12 +361,14 @@ mod tests { let int_builder = Int32Builder::new(); let bool_builder = BooleanBuilder::new(); - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); - fields.push(Field::new("f2", DataType::Boolean, false)); - field_builders.push(Box::new(bool_builder) as Box); + let fields = vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Boolean, false), + ]; + let field_builders = vec![ + Box::new(int_builder) as Box, + Box::new(bool_builder) as Box, + ]; let mut builder = StructBuilder::new(fields, field_builders); builder @@ -412,12 +421,14 @@ mod tests { let int_builder = Int32Builder::new(); let bool_builder = BooleanBuilder::new(); - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); - fields.push(Field::new("f2", DataType::Boolean, false)); - field_builders.push(Box::new(bool_builder) as Box); + let fields = vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Boolean, false), + ]; + let field_builders = vec![ + Box::new(int_builder) as Box, + Box::new(bool_builder) as Box, + ]; let mut builder = StructBuilder::new(fields, field_builders); builder @@ -475,7 +486,7 @@ mod tests { Field::new("g1", DataType::Int32, false), Field::new("g2", DataType::Boolean, false), ]; - let struct_type = DataType::Struct(sub_fields); + let struct_type = DataType::Struct(sub_fields.into()); fields.push(Field::new("f3", struct_type, false)); let mut builder = StructBuilder::from_fields(fields, 5); @@ -487,14 +498,14 @@ mod tests { #[test] fn test_datatype_properties() { - let fields = vec![ + let 
fields = Fields::from(vec![ Field::new("f1", DataType::Decimal128(1, 2), false), Field::new( "f2", DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())), false, ), - ]; + ]); let mut builder = StructBuilder::from_fields(fields.clone(), 1); builder .field_builder::(0) @@ -517,10 +528,12 @@ mod tests { expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) is not currently supported" )] fn test_struct_array_builder_from_schema_unsupported_type() { - let mut fields = vec![Field::new("f1", DataType::Int16, false)]; let list_type = DataType::List(Box::new(Field::new("item", DataType::Int64, true))); - fields.push(Field::new("f2", list_type, false)); + let fields = vec![ + Field::new("f1", DataType::Int16, false), + Field::new("f2", list_type, false), + ]; let _ = StructBuilder::from_fields(fields, 5); } @@ -529,10 +542,8 @@ mod tests { fn test_struct_array_builder_field_builder_type_mismatch() { let int_builder = Int32Builder::with_capacity(10); - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); + let fields = vec![Field::new("f1", DataType::Int32, false)]; + let field_builders = vec![Box::new(int_builder) as Box]; let mut builder = StructBuilder::new(fields, field_builders); assert!(builder.field_builder::(0).is_none()); @@ -548,12 +559,14 @@ mod tests { int_builder.append_value(2); bool_builder.append_value(true); - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); - fields.push(Field::new("f2", DataType::Boolean, false)); - field_builders.push(Box::new(bool_builder) as Box); + let fields = vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Boolean, false), + ]; + let field_builders = vec![ + Box::new(int_builder) as Box, + Box::new(bool_builder) as Box, + ]; let mut builder = StructBuilder::new(fields, field_builders); builder.append(true); @@ -568,11 +581,11 @@ mod tests { fn test_struct_array_builder_unequal_field_field_builders() { let int_builder = Int32Builder::with_capacity(10); - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); - fields.push(Field::new("f2", DataType::Boolean, false)); + let fields = vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Boolean, false), + ]; + let field_builders = vec![Box::new(int_builder) as Box]; let mut builder = StructBuilder::new(fields, field_builders); builder.finish(); diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 02ced1a0ba92..2754d04bfcaa 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -19,7 +19,7 @@ //! [schema](arrow_schema::Schema). 
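// Aside: a brief sketch of the Fields / SchemaBuilder abstraction these
// changes build on; fields are collected into a cheaply cloneable Fields and
// a Schema can be assembled incrementally. The function name is illustrative
// only.
use arrow_schema::{DataType, Field, Fields, Schema, SchemaBuilder};

fn schema_builder_sketch() {
    let fields = Fields::from(vec![
        Field::new("a", DataType::Int32, false),
        Field::new("b", DataType::Utf8, true),
    ]);
    let nested = DataType::Struct(fields.clone());

    let mut builder = SchemaBuilder::with_capacity(2);
    builder.push(Field::new("s", nested, true));
    builder.push(Field::new("c", DataType::Boolean, false));
    let schema: Schema = builder.finish();
    assert_eq!(schema.fields().len(), 2);
}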
use crate::{new_empty_array, Array, ArrayRef, StructArray}; -use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef}; +use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaBuilder, SchemaRef}; use std::ops::Index; use std::sync::Arc; @@ -387,19 +387,18 @@ impl RecordBatch { I: IntoIterator, F: AsRef, { - // TODO: implement `TryFrom` trait, once - // https://github.com/rust-lang/rust/issues/50133 is no longer an - // issue - let (fields, columns) = value - .into_iter() - .map(|(field_name, array, nullable)| { - let field_name = field_name.as_ref(); - let field = Field::new(field_name, array.data_type().clone(), nullable); - (field, array) - }) - .unzip(); + let iter = value.into_iter(); + let capacity = iter.size_hint().0; + let mut schema = SchemaBuilder::with_capacity(capacity); + let mut columns = Vec::with_capacity(capacity); + + for (field_name, array, nullable) in iter { + let field_name = field_name.as_ref(); + schema.push(Field::new(field_name, array.data_type().clone(), nullable)); + columns.push(array); + } - let schema = Arc::new(Schema::new(fields)); + let schema = Arc::new(schema.finish()); RecordBatch::try_new(schema, columns) } @@ -467,19 +466,6 @@ impl From<&StructArray> for RecordBatch { } } -impl From for StructArray { - fn from(batch: RecordBatch) -> Self { - batch - .schema - .fields - .iter() - .zip(batch.columns.iter()) - .map(|t| (t.0.clone(), t.1.clone())) - .collect::>() - .into() - } -} - impl Index<&str> for RecordBatch { type Output = ArrayRef; @@ -573,6 +559,7 @@ mod tests { }; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::ArrayDataBuilder; + use arrow_schema::Fields; #[test] fn create_record_batch() { @@ -653,7 +640,7 @@ mod tests { #[test] #[should_panic(expected = "assertion failed: (offset + length) <= self.num_rows()")] fn create_record_batch_slice_empty_batch() { - let schema = Schema::new(vec![]); + let schema = Schema::empty(); let record_batch = RecordBatch::new_empty(Arc::new(schema)); @@ -729,7 +716,7 @@ mod tests { false, ), ]; - let struct_type = DataType::Struct(struct_fields); + let struct_type = DataType::Struct(struct_fields.into()); let schema = Arc::new(Schema::new(vec![Field::new("a", struct_type, true)])); let a1: ArrayRef = Arc::new(Int32Array::from(vec![1, 2])); @@ -745,10 +732,10 @@ mod tests { .build() .unwrap(); let a2: ArrayRef = Arc::new(ListArray::from(a2)); - let a = ArrayDataBuilder::new(DataType::Struct(vec![ + let a = ArrayDataBuilder::new(DataType::Struct(Fields::from(vec![ Field::new("aa1", DataType::Int32, false), Field::new("a2", a2.data_type().clone(), false), - ])) + ]))) .add_child_data(a1.into_data()) .add_child_data(a2.into_data()) .len(2) @@ -801,7 +788,7 @@ mod tests { assert_eq!(4, batch.num_rows()); assert_eq!( struct_array.data_type(), - &DataType::Struct(batch.schema().fields().to_vec()) + &DataType::Struct(batch.schema().fields().clone()) ); assert_eq!(batch.column(0).as_ref(), boolean.as_ref()); assert_eq!(batch.column(1).as_ref(), int.as_ref()); @@ -1024,7 +1011,7 @@ mod tests { #[test] fn test_no_column_record_batch() { - let schema = Arc::new(Schema::new(vec![])); + let schema = Arc::new(Schema::empty()); let err = RecordBatch::try_new(schema.clone(), vec![]).unwrap_err(); assert!(err diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 51cc69a7908a..d14c8d2fa4ba 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -7106,10 +7106,10 @@ mod tests { let data_type = DataType::Map( Box::new(Field::new( "entry", - DataType::Struct(vec![ + 
DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", DataType::Int32, true), - ]), + ])), false, )), false, @@ -7138,7 +7138,7 @@ mod tests { // Cast null from and to struct let data_type = - DataType::Struct(vec![Field::new("data", DataType::Int64, false)]); + DataType::Struct(vec![Field::new("data", DataType::Int64, false)].into()); cast_from_null_to_other(&data_type); } diff --git a/arrow-cast/src/pretty.rs b/arrow-cast/src/pretty.rs index 5e7715eec832..ffa5af82d154 100644 --- a/arrow-cast/src/pretty.rs +++ b/arrow-cast/src/pretty.rs @@ -635,14 +635,16 @@ mod tests { let schema = Schema::new(vec![ Field::new( "c1", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("c11", DataType::Int32, true), Field::new( "c12", - DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)]), + DataType::Struct( + vec![Field::new("c121", DataType::Utf8, false)].into(), + ), false, ), - ]), + ])), false, ), Field::new("c2", DataType::Utf8, false), @@ -656,7 +658,9 @@ mod tests { ( Field::new( "c12", - DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)]), + DataType::Struct( + vec![Field::new("c121", DataType::Utf8, false)].into(), + ), false, ), Arc::new(StructArray::from(vec![( diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 46e97b1f848f..894c113aefc9 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -275,7 +275,7 @@ fn infer_reader_schema_with_csv_options( } // build schema from inference results - let fields = column_types + let fields: Fields = column_types .iter() .zip(&headers) .map(|(inferred, field_name)| Field::new(field_name, inferred.get(), true)) @@ -392,10 +392,8 @@ impl Reader { match &self.decoder.projection { Some(projection) => { let fields = self.decoder.schema.fields(); - let projected_fields: Vec = - projection.iter().map(|i| fields[*i].clone()).collect(); - - Arc::new(Schema::new(projected_fields)) + let projected = projection.iter().map(|i| fields[*i].clone()); + Arc::new(Schema::new(projected.collect::())) } None => self.decoder.schema.clone(), } @@ -586,7 +584,7 @@ impl Decoder { /// Parses a slice of [`StringRecords`] into a [RecordBatch] fn parse( rows: &StringRecords<'_>, - fields: &[Field], + fields: &Fields, metadata: Option>, projection: Option<&Vec>, line_number: usize, @@ -772,7 +770,7 @@ fn parse( }) .collect(); - let projected_fields: Vec = + let projected_fields: Fields = projection.iter().map(|i| fields[*i].clone()).collect(); let projected_schema = Arc::new(match metadata { diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index 7241a5d80ee0..c47c836637a4 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -1874,7 +1874,8 @@ mod tests { ) .unwrap(); - let data_type = DataType::Struct(vec![Field::new("x", DataType::Int32, true)]); + let field = Arc::new(Field::new("x", DataType::Int32, true)); + let data_type = DataType::Struct(vec![field].into()); let arr_data = ArrayData::builder(data_type) .len(5) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 6432965032c1..f8915a96320d 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -20,7 +20,7 @@ use std::{collections::VecDeque, fmt::Debug, pin::Pin, sync::Arc, task::Poll}; use crate::{error::Result, FlightData, SchemaAsIpc}; use arrow_array::{ArrayRef, RecordBatch, RecordBatchOptions}; use arrow_ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions}; -use 
arrow_schema::{DataType, Field, Schema, SchemaRef}; +use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef}; use bytes::Bytes; use futures::{ready, stream::BoxStream, Stream, StreamExt}; @@ -309,7 +309,7 @@ impl Stream for FlightDataEncoder { /// /// See hydrate_dictionary for more information fn prepare_schema_for_flight(schema: &Schema) -> Schema { - let fields = schema + let fields: Fields = schema .fields() .iter() .map(|field| match field.data_type() { @@ -319,7 +319,7 @@ fn prepare_schema_for_flight(schema: &Schema) -> Schema { field.is_nullable(), ) .with_metadata(field.metadata().clone()), - _ => field.clone(), + _ => field.as_ref().clone(), }) .collect(); diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index 5a8eb6c376e4..2841d4bf5edb 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -27,7 +27,7 @@ use arrow_flight::{ encode::FlightDataEncoderBuilder, error::FlightError, }; -use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef}; use bytes::Bytes; use futures::{StreamExt, TryStreamExt}; @@ -484,7 +484,7 @@ async fn roundtrip_with_encoder( /// Workaround for https://github.com/apache/arrow-rs/issues/1206 fn prepare_schema_for_flight(schema: &Schema) -> Schema { - let fields = schema + let fields: Fields = schema .fields() .iter() .map(|field| match field.data_type() { @@ -494,7 +494,7 @@ fn prepare_schema_for_flight(schema: &Schema) -> Schema { field.is_nullable(), ) .with_metadata(field.metadata().clone()), - _ => field.clone(), + _ => field.as_ref().clone(), }) .collect(); diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index d0f4ca66fda9..a08368d582a4 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit, UnionMode}; +use arrow::datatypes::{DataType, Field, Fields, IntervalUnit, TimeUnit, UnionMode}; use arrow::error::{ArrowError, Result}; /// Parse a data type from a JSON representation. @@ -206,7 +206,7 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { } Some(s) if s == "struct" => { // return an empty `struct` type as its children aren't defined in the map - Ok(DataType::Struct(vec![])) + Ok(DataType::Struct(Fields::empty())) } Some(s) if s == "map" => { if let Some(Value::Bool(keys_sorted)) = map.get("keysSorted") { diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index abed0bd1d908..a60cd91c5b37 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -150,13 +150,10 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { )); } }, - DataType::Struct(mut fields) => match map.get("children") { - Some(Value::Array(values)) => { - let struct_fields: Result> = - values.iter().map(field_from_json).collect(); - fields.append(&mut struct_fields?); - DataType::Struct(fields) - } + DataType::Struct(_) => match map.get("children") { + Some(Value::Array(values)) => DataType::Struct( + values.iter().map(field_from_json).collect::>()?, + ), Some(_) => { return Err(ArrowError::ParseError( "Field 'children' must be an array".to_string(), @@ -265,7 +262,9 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { /// Generate a JSON representation of the `Field`. 
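Both `prepare_schema_for_flight` copies above now collect the mapped columns into `Fields` and clone through `as_ref()`, since iterating a schema's fields now yields `&FieldRef` rather than `&Field`. A simplified sketch of that pattern; the function name below is illustrative, not the crate's API:

    use arrow_schema::{DataType, Field, Fields, Schema};

    /// Rebuild a schema with dictionary columns replaced by their value type.
    fn hydrate_dictionaries(schema: &Schema) -> Schema {
        let fields: Fields = schema
            .fields()
            .iter()
            .map(|field| match field.data_type() {
                DataType::Dictionary(_, value_type) => Field::new(
                    field.name(),
                    value_type.as_ref().clone(),
                    field.is_nullable(),
                )
                .with_metadata(field.metadata().clone()),
                // `field` is a `&FieldRef`; `as_ref().clone()` yields an owned `Field`.
                _ => field.as_ref().clone(),
            })
            .collect();
        Schema::new(fields).with_metadata(schema.metadata().clone())
    }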
pub fn field_to_json(field: &Field) -> serde_json::Value { let children: Vec = match field.data_type() { - DataType::Struct(fields) => fields.iter().map(field_to_json).collect(), + DataType::Struct(fields) => { + fields.iter().map(|x| field_to_json(x.as_ref())).collect() + } DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) @@ -297,17 +296,17 @@ pub fn field_to_json(field: &Field) -> serde_json::Value { #[cfg(test)] mod tests { use super::*; - use arrow::datatypes::UnionMode; + use arrow::datatypes::{Fields, UnionMode}; use serde_json::Value; #[test] fn struct_field_to_json() { let f = Field::new( "address", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("street", DataType::Utf8, false), Field::new("zip", DataType::UInt16, false), - ]), + ])), false, ); let value: Value = serde_json::from_str( @@ -350,10 +349,10 @@ mod tests { DataType::Map( Box::new(Field::new( "my_entries", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("my_keys", DataType::Utf8, false), Field::new("my_values", DataType::UInt16, true), - ]), + ])), false, )), true, @@ -455,10 +454,10 @@ mod tests { let expected = Field::new( "address", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("street", DataType::Utf8, false), Field::new("zip", DataType::UInt16, false), - ]), + ])), false, ); @@ -514,10 +513,10 @@ mod tests { DataType::Map( Box::new(Field::new( "my_entries", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("my_keys", DataType::Utf8, false), Field::new("my_values", DataType::UInt16, true), - ]), + ])), false, )), true, diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index 0b890ea33657..06f16ca1dc9b 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -81,6 +81,12 @@ pub struct ArrowJsonField { pub metadata: Option, } +impl From<&FieldRef> for ArrowJsonField { + fn from(value: &FieldRef) -> Self { + Self::from(value.as_ref()) + } +} + impl From<&Field> for ArrowJsonField { fn from(field: &Field) -> Self { let metadata_value = match field.metadata().is_empty() { @@ -1183,10 +1189,10 @@ mod tests { ), Field::new( "structs", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("int32s", DataType::Int32, true), Field::new("utf8s", DataType::Utf8, true), - ]), + ])), true, ), ]); @@ -1265,10 +1271,10 @@ mod tests { let structs_int32s = Int32Array::from(vec![None, Some(-2), None]); let structs_utf8s = StringArray::from(vec![None, None, Some("aaaaaa")]); - let struct_data_type = DataType::Struct(vec![ + let struct_data_type = DataType::Struct(Fields::from(vec![ Field::new("int32s", DataType::Int32, true), Field::new("utf8s", DataType::Utf8, true), - ]); + ])); let struct_data = ArrayData::builder(struct_data_type) .len(3) .add_child_data(structs_int32s.into_data()) diff --git a/arrow-integration-test/src/schema.rs b/arrow-integration-test/src/schema.rs index bb17b1adb1ac..d640e298c6ad 100644 --- a/arrow-integration-test/src/schema.rs +++ b/arrow-integration-test/src/schema.rs @@ -16,14 +16,14 @@ // under the License. use crate::{field_from_json, field_to_json}; -use arrow::datatypes::Schema; +use arrow::datatypes::{Fields, Schema}; use arrow::error::{ArrowError, Result}; use std::collections::HashMap; /// Generate a JSON representation of the `Schema`. 
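The `x.as_ref()` adaptations above follow from `Fields` iteration yielding `&FieldRef` (an `Arc<Field>`). A small, self-contained illustration of the call-site idiom; the `describe` helper is illustrative only:

    use arrow_schema::{DataType, Field, FieldRef, Fields};

    fn describe(field: &Field) -> String {
        format!("{}: {:?}", field.name(), field.data_type())
    }

    fn main() {
        let fields = Fields::from(vec![
            Field::new("a", DataType::Utf8, false),
            Field::new("b", DataType::UInt16, false),
        ]);

        // Iterating `Fields` yields `&FieldRef`, so existing functions that take
        // `&Field` are reached through `as_ref()`.
        let described: Vec<String> =
            fields.iter().map(|f| describe(f.as_ref())).collect();
        assert_eq!(described.len(), 2);

        // Cloning a `FieldRef` only bumps the reference count.
        let first: FieldRef = fields[0].clone();
        assert_eq!(first.name(), "a");
    }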
pub fn schema_to_json(schema: &Schema) -> serde_json::Value { serde_json::json!({ - "fields": schema.fields().iter().map(field_to_json).collect::>(), + "fields": schema.fields().iter().map(|f| field_to_json(f.as_ref())).collect::>(), "metadata": serde_json::to_value(schema.metadata()).unwrap() }) } @@ -33,12 +33,15 @@ pub fn schema_from_json(json: &serde_json::Value) -> Result { use serde_json::Value; match *json { Value::Object(ref schema) => { - let fields = if let Some(Value::Array(fields)) = schema.get("fields") { - fields.iter().map(field_from_json).collect::>()? - } else { - return Err(ArrowError::ParseError( - "Schema fields should be an array".to_string(), - )); + let fields: Fields = match schema.get("fields") { + Some(Value::Array(fields)) => { + fields.iter().map(field_from_json).collect::>()? + } + _ => { + return Err(ArrowError::ParseError( + "Schema fields should be an array".to_string(), + )) + } }; let metadata = if let Some(value) = schema.get("metadata") { @@ -100,7 +103,7 @@ struct MetadataKeyValue { #[cfg(test)] mod tests { use super::*; - use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; + use arrow::datatypes::{DataType, Field, Fields, IntervalUnit, TimeUnit}; use serde_json::Value; #[test] @@ -169,7 +172,7 @@ mod tests { "inner_list", DataType::List(Box::new(Field::new( "struct", - DataType::Struct(vec![]), + DataType::Struct(Fields::empty()), true, ))), false, @@ -178,10 +181,10 @@ mod tests { ), Field::new( "c25", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("a", DataType::Utf8, false), Field::new("b", DataType::UInt16, false), - ]), + ])), false, ), Field::new("c26", DataType::Interval(IntervalUnit::YearMonth), true), @@ -209,7 +212,7 @@ mod tests { "inner_large_list", DataType::LargeList(Box::new(Field::new( "struct", - DataType::Struct(vec![]), + DataType::Struct(Fields::empty()), false, ))), true, @@ -221,10 +224,10 @@ mod tests { DataType::Map( Box::new(Field::new( "my_entries", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("my_keys", DataType::Utf8, false), Field::new("my_values", DataType::UInt16, true), - ]), + ])), false, )), true, diff --git a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs index 0702a8a68cae..1d65be41c41c 100644 --- a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs +++ b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
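`schema_from_json` above leans on `Fields: FromIterator<Field>`, which lets an iterator of `Result<Field, _>` be collected directly into `Result<Fields, _>`. A sketch of the same idiom with a hypothetical helper that is not part of the crate:

    use arrow_schema::{ArrowError, DataType, Field, Fields};

    /// Turn a list of column names into `Fields`, failing on the first empty name.
    fn fields_from_names(names: &[&str]) -> Result<Fields, ArrowError> {
        names
            .iter()
            .map(|name| {
                if name.is_empty() {
                    Err(ArrowError::ParseError("empty field name".to_string()))
                } else {
                    Ok(Field::new(*name, DataType::Utf8, true))
                }
            })
            .collect()
    }

    fn main() {
        let fields = fields_from_names(&["a", "b"]).unwrap();
        assert_eq!(fields.len(), 2);
        assert!(fields_from_names(&["a", ""]).is_err());
    }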
-use arrow::datatypes::Schema; use arrow::datatypes::{DataType, Field}; +use arrow::datatypes::{Fields, Schema}; use arrow::error::{ArrowError, Result}; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::FileWriter; @@ -24,6 +24,7 @@ use arrow_integration_test::*; use arrow_integration_testing::read_json_file; use clap::Parser; use std::fs::File; +use std::sync::Arc; #[derive(clap::ValueEnum, Debug, Clone)] #[clap(rename_all = "SCREAMING_SNAKE_CASE")] @@ -120,27 +121,28 @@ fn canonicalize_schema(schema: &Schema) -> Schema { DataType::Map(child_field, sorted) => match child_field.data_type() { DataType::Struct(fields) if fields.len() == 2 => { let first_field = fields.get(0).unwrap(); - let key_field = Field::new( + let key_field = Arc::new(Field::new( "key", first_field.data_type().clone(), first_field.is_nullable(), - ); + )); let second_field = fields.get(1).unwrap(); - let value_field = Field::new( + let value_field = Arc::new(Field::new( "value", second_field.data_type().clone(), second_field.is_nullable(), - ); + )); - let struct_type = DataType::Struct(vec![key_field, value_field]); + let fields = Fields::from([key_field, value_field]); + let struct_type = DataType::Struct(fields); let child_field = Field::new("entries", struct_type, child_field.is_nullable()); - Field::new( + Arc::new(Field::new( field.name().as_str(), DataType::Map(Box::new(child_field), *sorted), field.is_nullable(), - ) + )) } _ => panic!( "The child field of Map type should be Struct type with 2 fields." @@ -148,7 +150,7 @@ fn canonicalize_schema(schema: &Schema) -> Schema { }, _ => field.clone(), }) - .collect::>(); + .collect::(); Schema::new(fields).with_metadata(schema.metadata().clone()) } diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 8f8593cfd8f1..7e44f37d46d2 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -355,13 +355,10 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat DataType::FixedSizeList(Box::new(children.get(0).into()), fsl.listSize()) } crate::Type::Struct_ => { - let mut fields = vec![]; - if let Some(children) = field.children() { - for i in 0..children.len() { - fields.push(children.get(i).into()); - } + let fields = match field.children() { + Some(children) => children.iter().map(Field::from).collect(), + None => Fields::empty(), }; - DataType::Struct(fields) } crate::Type::RunEndEncoded => { @@ -915,54 +912,51 @@ mod tests { ), Field::new( "list[struct]", - DataType::List(Box::new(Field::new( + List(Box::new(Field::new( "struct", - DataType::Struct(vec![ + Struct(Fields::from(vec![ Field::new("float32", DataType::UInt8, false), Field::new("int32", DataType::Int32, true), Field::new("bool", DataType::Boolean, true), - ]), + ])), true, ))), false, ), Field::new( "struct>", - DataType::Struct(vec![Field::new( + Struct(Fields::from(vec![Field::new( "dictionary", - DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Utf8), - ), + Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), false, - )]), + )])), false, ), Field::new( "struct]>]>", - DataType::Struct(vec![ + Struct(Fields::from(vec![ Field::new("int64", DataType::Int64, true), Field::new( "list[struct]>]", DataType::List(Box::new(Field::new( "struct", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("date32", DataType::Date32, true), Field::new( "list[struct<>]", DataType::List(Box::new(Field::new( "struct", - DataType::Struct(vec![]), + DataType::Struct(Fields::empty()), false, ))), false, ), 
- ]), + ])), false, ))), false, ), - ]), + ])), false, ), Field::new( @@ -1004,7 +998,7 @@ mod tests { ), false, ), - Field::new("struct<>", DataType::Struct(vec![]), true), + Field::new("struct<>", DataType::Struct(Fields::empty()), true), Field::new( "union<>", DataType::Union(vec![], vec![], UnionMode::Dense), diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index bd7e33185a40..4597ed82d27f 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -183,7 +183,7 @@ fn create_array( )?; node_index = triple.1; buffer_index = triple.2; - struct_arrays.push((struct_field.clone(), triple.0)); + struct_arrays.push((struct_field.as_ref().clone(), triple.0)); } let null_count = struct_node.null_count() as usize; let struct_array = if null_count > 0 { @@ -737,10 +737,8 @@ pub fn read_dictionary( let dictionary_values: ArrayRef = match first_field.data_type() { DataType::Dictionary(_, ref value_type) => { // Make a fake schema for the dictionary batch. - let schema = Schema { - fields: vec![Field::new("", value_type.as_ref().clone(), true)], - metadata: HashMap::new(), - }; + let value = value_type.as_ref().clone(); + let schema = Schema::new(vec![Field::new("", value, true)]); // Read a single column let record_batch = read_record_batch( buf, @@ -1273,14 +1271,14 @@ mod tests { ]; let union_data_type = DataType::Union(union_fileds, vec![0, 1], UnionMode::Dense); - let struct_fields = vec![ + let struct_fields = Fields::from(vec![ Field::new("id", DataType::Int32, false), Field::new( "list", DataType::List(Box::new(Field::new("item", DataType::Int8, true))), false, ), - ]; + ]); let struct_data_type = DataType::Struct(struct_fields); let run_encoded_data_type = DataType::RunEndEncoded( @@ -1829,7 +1827,7 @@ mod tests { #[test] fn test_no_columns_batch() { - let schema = Arc::new(Schema::new(vec![])); + let schema = Arc::new(Schema::empty()); let options = RecordBatchOptions::new() .with_match_field_names(true) .with_row_count(Some(10)); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 07d4b0fe9f93..ceb9b6ffa90f 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -937,7 +937,7 @@ impl StreamWriter { /// 255, 255, 255, 255, 0, 0, 0, 0 /// ]; /// - /// let schema = Schema::new(vec![]); + /// let schema = Schema::empty(); /// let buffer: Vec = Vec::new(); /// let options = IpcWriteOptions::try_new(8, false, MetadataVersion::V5)?; /// let stream_writer = StreamWriter::try_new_with_options(buffer, &schema, options)?; diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index b63763159a99..a567b93c9d0f 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -362,7 +362,7 @@ mod tests { use arrow_array::{Array, StructArray}; use arrow_buffer::ArrowNativeType; use arrow_cast::display::{ArrayFormatter, FormatOptions}; - use arrow_schema::{DataType, Field, Schema}; + use arrow_schema::{DataType, Field, Fields, Schema}; use std::fs::File; use std::io::{BufReader, Cursor, Seek}; use std::sync::Arc; @@ -510,23 +510,25 @@ mod tests { ), Field::new( "nested", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("a", DataType::Int32, true), Field::new("b", DataType::Int32, true), - ]), + ])), true, ), Field::new( "nested_list", - DataType::Struct(vec![Field::new( + DataType::Struct(Fields::from(vec![Field::new( "list2", DataType::List(Box::new(Field::new( "element", - DataType::Struct(vec![Field::new("c", DataType::Int32, false)]), + DataType::Struct( + vec![Field::new("c", DataType::Int32, 
false)].into(), + ), false, ))), true, - )]), + )])), true, ), ])); @@ -582,20 +584,22 @@ mod tests { let schema = Arc::new(Schema::new(vec![ Field::new( "nested", - DataType::Struct(vec![Field::new("a", DataType::Int32, false)]), + DataType::Struct(vec![Field::new("a", DataType::Int32, false)].into()), true, ), Field::new( "nested_list", - DataType::Struct(vec![Field::new( + DataType::Struct(Fields::from(vec![Field::new( "list2", DataType::List(Box::new(Field::new( "element", - DataType::Struct(vec![Field::new("d", DataType::Int32, true)]), + DataType::Struct( + vec![Field::new("d", DataType::Int32, true)].into(), + ), false, ))), true, - )]), + )])), true, ), ])); @@ -636,10 +640,10 @@ mod tests { {"map": {"c": null, "a": ["baz"]}} "#; let list = DataType::List(Box::new(Field::new("element", DataType::Utf8, true))); - let entries = DataType::Struct(vec![ + let entries = DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", list, true), - ]); + ])); let map = DataType::Map(Box::new(Field::new("entries", entries, true)), false); let schema = Arc::new(Schema::new(vec![Field::new("map", map, true)])); @@ -1008,29 +1012,29 @@ mod tests { let schema = Arc::new(Schema::new(vec![ Field::new( "protocol", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("minReaderVersion", DataType::Int32, true), Field::new("minWriterVersion", DataType::Int32, true), - ]), + ])), true, ), Field::new( "add", - DataType::Struct(vec![Field::new( + DataType::Struct(Fields::from(vec![Field::new( "partitionValues", DataType::Map( Box::new(Field::new( "key_value", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", DataType::Utf8, true), - ]), + ])), false, )), false, ), false, - )]), + )])), true, ), ])); @@ -1054,7 +1058,7 @@ mod tests { let non_null = r#"{"foo": {}}"#; let schema = Arc::new(Schema::new(vec![Field::new( "foo", - DataType::Struct(vec![Field::new("bar", child, false)]), + DataType::Struct(vec![Field::new("bar", child, false)].into()), true, )])); let mut reader = RawReaderBuilder::new(schema.clone()) diff --git a/arrow-json/src/raw/struct_array.rs b/arrow-json/src/raw/struct_array.rs index 219f56ae639d..a73bb148621a 100644 --- a/arrow-json/src/raw/struct_array.rs +++ b/arrow-json/src/raw/struct_array.rs @@ -20,7 +20,7 @@ use crate::raw::{make_decoder, tape_error, ArrayDecoder}; use arrow_array::builder::BooleanBufferBuilder; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{ArrowError, DataType, Field}; +use arrow_schema::{ArrowError, DataType, Fields}; pub struct StructArrayDecoder { data_type: DataType, @@ -142,7 +142,7 @@ impl ArrayDecoder for StructArrayDecoder { } } -fn struct_fields(data_type: &DataType) -> &[Field] { +fn struct_fields(data_type: &DataType) -> &Fields { match &data_type { DataType::Struct(f) => f, _ => unreachable!(), diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index d68d7ca91ff7..c95f7c0be812 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -159,9 +159,7 @@ fn generate_datatype(t: &InferredType) -> Result { }) } -fn generate_fields( - spec: &HashMap, -) -> Result, ArrowError> { +fn generate_fields(spec: &HashMap) -> Result { spec.iter() .map(|(k, types)| Ok(Field::new(k, generate_datatype(types)?, true))) .collect() @@ -656,7 +654,7 @@ impl Decoder { match &self.options.projection { Some(projection) => { let fields = 
self.schema.fields(); - let projected_fields: Vec = fields + let projected_fields: Fields = fields .iter() .filter_map(|field| { if projection.contains(field.name()) { @@ -708,17 +706,13 @@ impl Decoder { let arrays = self.build_struct_array(rows, self.schema.fields(), &self.options.projection); - let projected_fields = if let Some(projection) = self.options.projection.as_ref() - { - projection + let projected_fields: Fields = match self.options.projection.as_ref() { + Some(projection) => projection .iter() - .filter_map(|name| self.schema.column_with_name(name)) - .map(|(_, field)| field.clone()) - .collect() - } else { - self.schema.fields().to_vec() + .filter_map(|name| Some(self.schema.fields.find(name)?.1.clone())) + .collect(), + None => self.schema.fields.clone(), }; - let projected_schema = Arc::new(Schema::new(projected_fields)); arrays.and_then(|arr| { @@ -1219,8 +1213,7 @@ impl Decoder { } }) .collect(); - let arrays = - self.build_struct_array(rows.as_slice(), fields.as_slice(), &None)?; + let arrays = self.build_struct_array(rows.as_slice(), fields, &None)?; let data_type = DataType::Struct(fields.clone()); let buf = null_buffer.into(); unsafe { @@ -1258,7 +1251,7 @@ impl Decoder { fn build_struct_array( &self, rows: &[Value], - struct_fields: &[Field], + struct_fields: &Fields, projection: &Option>, ) -> Result, ArrowError> { let arrays: Result, ArrowError> = struct_fields @@ -1529,7 +1522,7 @@ impl Decoder { let struct_children = self.build_struct_array( struct_rows.as_slice(), - &[key_field.clone(), value_field.clone()], + &Fields::from([key_field.clone(), value_field.clone()]), &None, )?; @@ -2167,7 +2160,7 @@ mod tests { fn test_invalid_json_read_record() { let schema = Arc::new(Schema::new(vec![Field::new( "a", - DataType::Struct(vec![Field::new("a", DataType::Utf8, true)]), + DataType::Struct(vec![Field::new("a", DataType::Utf8, true)].into()), true, )])); let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); @@ -2311,15 +2304,15 @@ mod tests { fn test_nested_struct_json_arrays() { let c_field = Field::new( "c", - DataType::Struct(vec![Field::new("d", DataType::Utf8, true)]), + DataType::Struct(vec![Field::new("d", DataType::Utf8, true)].into()), true, ); let a_field = Field::new( "a", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("b", DataType::Boolean, true), c_field.clone(), - ]), + ])), true, ); let schema = Arc::new(Schema::new(vec![a_field.clone()])); @@ -2361,15 +2354,15 @@ mod tests { fn test_nested_list_json_arrays() { let c_field = Field::new( "c", - DataType::Struct(vec![Field::new("d", DataType::Utf8, true)]), + DataType::Struct(vec![Field::new("d", DataType::Utf8, true)].into()), true, ); let a_struct_field = Field::new( "a", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("b", DataType::Boolean, true), c_field.clone(), - ]), + ])), true, ); let a_field = @@ -2467,10 +2460,10 @@ mod tests { let account_field = Field::new("account", DataType::UInt16, false); let value_list_type = DataType::List(Box::new(Field::new("item", DataType::Utf8, false))); - let entries_struct_type = DataType::Struct(vec![ + let entries_struct_type = DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", value_list_type.clone(), true), - ]); + ])); let stocks_field = Field::new( "stocks", DataType::Map( @@ -2970,14 +2963,16 @@ mod tests { let schema = Schema::new(vec![ Field::new( "c1", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("a", 
DataType::Boolean, true), Field::new( "b", - DataType::Struct(vec![Field::new("c", DataType::Utf8, true)]), + DataType::Struct( + vec![Field::new("c", DataType::Utf8, true)].into(), + ), true, ), - ]), + ])), true, ), Field::new("c2", DataType::Int64, true), @@ -3004,11 +2999,11 @@ mod tests { "c1", DataType::List(Box::new(Field::new( "item", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("a", DataType::Utf8, true), Field::new("b", DataType::Int64, true), Field::new("c", DataType::Boolean, true), - ]), + ])), true, ))), true, @@ -3305,7 +3300,7 @@ mod tests { "c1", DataType::List(Box::new(Field::new( "item", - DataType::Struct(vec![Field::new("a", DataType::Int64, true)]), + DataType::Struct(vec![Field::new("a", DataType::Int64, true)].into()), true, ))), true, diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 92883e577060..534aea91af4e 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -997,14 +997,16 @@ mod tests { let schema = Schema::new(vec![ Field::new( "c1", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("c11", DataType::Int32, true), Field::new( "c12", - DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)]), + DataType::Struct( + vec![Field::new("c121", DataType::Utf8, false)].into(), + ), false, ), - ]), + ])), false, ), Field::new("c2", DataType::Utf8, false), @@ -1018,7 +1020,9 @@ mod tests { ( Field::new( "c12", - DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)]), + DataType::Struct( + vec![Field::new("c121", DataType::Utf8, false)].into(), + ), false, ), Arc::new(StructArray::from(vec![( @@ -1158,14 +1162,16 @@ mod tests { "c1", DataType::List(Box::new(Field::new( "s", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("c11", DataType::Int32, true), Field::new( "c12", - DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)]), + DataType::Struct( + vec![Field::new("c121", DataType::Utf8, false)].into(), + ), false, ), - ]), + ])), false, ))), true, @@ -1181,7 +1187,9 @@ mod tests { ( Field::new( "c12", - DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)]), + DataType::Struct( + vec![Field::new("c121", DataType::Utf8, false)].into(), + ), false, ), Arc::new(StructArray::from(vec![( @@ -1316,7 +1324,7 @@ mod tests { {"list": [null]} "#; let ints_struct = - DataType::Struct(vec![Field::new("ints", DataType::Int32, true)]); + DataType::Struct(vec![Field::new("ints", DataType::Int32, true)].into()); let list_type = DataType::List(Box::new(Field::new("item", ints_struct, true))); let list_field = Field::new("list", list_type, true); let schema = Arc::new(Schema::new(vec![list_field])); diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index bcfea5a91023..58747fb26a0e 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -19,6 +19,7 @@ use std::fmt; use std::sync::Arc; use crate::field::Field; +use crate::Fields; /// The set of datatypes that are supported by this implementation of Apache Arrow. /// @@ -190,7 +191,7 @@ pub enum DataType { /// A single LargeList array can store up to [`i64::MAX`] elements in total LargeList(Box), /// A nested datatype that contains a number of sub-fields. - Struct(Vec), + Struct(Fields), /// A nested datatype that can represent slots of differing types. Components: /// /// 1. 
[`Field`] for each possible child type the Union can hold @@ -490,7 +491,8 @@ impl DataType { | DataType::FixedSizeList(field, _) | DataType::LargeList(field) | DataType::Map(field, _) => field.size(), - DataType::Struct(fields) | DataType::Union(fields, _, _) => { + DataType::Struct(fields) => fields.size(), + DataType::Union(fields, _, _) => { fields .iter() .map(|field| field.size() - std::mem::size_of_val(field)) @@ -542,18 +544,18 @@ mod tests { let last_name = Field::new("last_name", DataType::Utf8, false) .with_metadata(HashMap::default()); - let person = DataType::Struct(vec![ + let person = DataType::Struct(Fields::from(vec![ first_name, last_name, Field::new( "address", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("street", DataType::Utf8, false), Field::new("zip", DataType::UInt16, false), - ]), + ])), false, ), - ]); + ])); let serialized = serde_json::to_string(&person).unwrap(); @@ -600,24 +602,26 @@ mod tests { assert!(!list_e.equals_datatype(&list_g)); assert!(!list_f.equals_datatype(&list_g)); - let list_h = DataType::Struct(vec![Field::new("f1", list_e, true)]); - let list_i = DataType::Struct(vec![Field::new("f1", list_f.clone(), true)]); - let list_j = DataType::Struct(vec![Field::new("f1", list_f.clone(), false)]); - let list_k = DataType::Struct(vec![ + let list_h = DataType::Struct(Fields::from(vec![Field::new("f1", list_e, true)])); + let list_i = + DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), true)])); + let list_j = + DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), false)])); + let list_k = DataType::Struct(Fields::from(vec![ Field::new("f1", list_f.clone(), false), Field::new("f2", list_g.clone(), false), Field::new("f3", DataType::Utf8, true), - ]); - let list_l = DataType::Struct(vec![ + ])); + let list_l = DataType::Struct(Fields::from(vec![ Field::new("ff1", list_f.clone(), false), Field::new("ff2", list_g.clone(), false), Field::new("ff3", DataType::LargeUtf8, true), - ]); - let list_m = DataType::Struct(vec![ + ])); + let list_m = DataType::Struct(Fields::from(vec![ Field::new("ff1", list_f, false), Field::new("ff2", list_g, false), Field::new("ff3", DataType::Utf8, true), - ]); + ])); assert!(list_h.equals_datatype(&list_i)); assert!(!list_h.equals_datatype(&list_j)); assert!(!list_k.equals_datatype(&list_l)); @@ -626,18 +630,18 @@ mod tests { #[test] fn create_struct_type() { - let _person = DataType::Struct(vec![ + let _person = DataType::Struct(Fields::from(vec![ Field::new("first_name", DataType::Utf8, false), Field::new("last_name", DataType::Utf8, false), Field::new( "address", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("street", DataType::Utf8, false), Field::new("zip", DataType::UInt16, false), - ]), + ])), false, ), - ]); + ])); } #[test] diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index e830f39052eb..0cfc1800f53f 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -34,7 +34,7 @@ //! assert_eq!(schema, back); //! ``` -use crate::{ArrowError, DataType, Field, Schema, TimeUnit, UnionMode}; +use crate::{ArrowError, DataType, Field, FieldRef, Schema, TimeUnit, UnionMode}; use bitflags::bitflags; use std::sync::Arc; use std::{ @@ -394,7 +394,7 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { } "+s" => { let fields = c_schema.children().map(Field::try_from); - DataType::Struct(fields.collect::, ArrowError>>()?) + DataType::Struct(fields.collect::>()?) 
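With `DataType::Struct` now holding `Fields`, the hunks above use a few equivalent construction forms. A brief sketch of those conversions, with illustrative field names:

    use std::sync::Arc;
    use arrow_schema::{DataType, Field, Fields};

    fn main() {
        // Explicit `Fields::from(Vec<Field>)`.
        let a = DataType::Struct(Fields::from(vec![
            Field::new("x", DataType::Int32, true),
            Field::new("y", DataType::Float64, true),
        ]));

        // `.into()` on a `Vec<Field>`, as used at many of the smaller call sites.
        let b = DataType::Struct(
            vec![
                Field::new("x", DataType::Int32, true),
                Field::new("y", DataType::Float64, true),
            ]
            .into(),
        );

        // An empty struct type, and fields collected from `FieldRef`s.
        let empty = DataType::Struct(Fields::empty());
        let from_refs: Fields =
            std::iter::once(Arc::new(Field::new("x", DataType::Int32, true))).collect();

        assert_eq!(a, b);
        assert_ne!(a, empty);
        assert_eq!(from_refs.len(), 1);
    }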
} "+m" => { let c_child = c_schema.child(0); @@ -671,6 +671,14 @@ fn get_format_string(dtype: &DataType) -> Result { } } +impl TryFrom<&FieldRef> for FFI_ArrowSchema { + type Error = ArrowError; + + fn try_from(value: &FieldRef) -> Result { + value.as_ref().try_into() + } +} + impl TryFrom<&Field> for FFI_ArrowSchema { type Error = ArrowError; @@ -730,6 +738,7 @@ impl TryFrom for FFI_ArrowSchema { #[cfg(test)] mod tests { use super::*; + use crate::Fields; fn round_trip_type(dtype: DataType) { let c_schema = FFI_ArrowSchema::try_from(&dtype).unwrap(); @@ -767,16 +776,16 @@ mod tests { DataType::Int16, false, )))); - round_trip_type(DataType::Struct(vec![Field::new( + round_trip_type(DataType::Struct(Fields::from(vec![Field::new( "a", DataType::Utf8, true, - )])); + )]))); } #[test] fn test_field() { - let dtype = DataType::Struct(vec![Field::new("a", DataType::Utf8, true)]); + let dtype = DataType::Struct(vec![Field::new("a", DataType::Utf8, true)].into()); round_trip_field(Field::new("test", dtype, true)); } @@ -792,10 +801,10 @@ mod tests { round_trip_schema(schema); // test that we can interpret struct types as schema - let dtype = DataType::Struct(vec![ + let dtype = DataType::Struct(Fields::from(vec![ Field::new("a", DataType::Utf8, true), Field::new("b", DataType::Int16, false), - ]); + ])); let c_schema = FFI_ArrowSchema::try_from(&dtype).unwrap(); let schema = Schema::try_from(&c_schema).unwrap(); assert_eq!(schema.fields().len(), 2); @@ -810,7 +819,7 @@ mod tests { fn test_map_keys_sorted() { let keys = Field::new("keys", DataType::Int32, false); let values = Field::new("values", DataType::UInt32, false); - let entry_struct = DataType::Struct(vec![keys, values]); + let entry_struct = DataType::Struct(vec![keys, values].into()); // Construct a map array from the above two let map_data_type = diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index b687b629aa75..8ef9fd2b81e5 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -21,6 +21,10 @@ use std::collections::HashMap; use std::hash::{Hash, Hasher}; use crate::datatype::DataType; +use crate::schema::SchemaBuilder; + +/// A reference counted [`Field`] +pub type FieldRef = std::sync::Arc; /// Describes a single column in a [`Schema`](super::Schema). 
/// @@ -230,7 +234,8 @@ impl Field { fn _fields(dt: &DataType) -> Vec<&Field> { match dt { - DataType::Struct(fields) | DataType::Union(fields, _, _) => { + DataType::Struct(fields) => fields.iter().flat_map(|f| f.fields()).collect(), + DataType::Union(fields, _, _) => { fields.iter().flat_map(|f| f.fields()).collect() } DataType::List(field) @@ -326,15 +331,9 @@ impl Field { match &mut self.data_type { DataType::Struct(nested_fields) => match &from.data_type { DataType::Struct(from_nested_fields) => { - for from_field in from_nested_fields { - match nested_fields - .iter_mut() - .find(|self_field| self_field.name == from_field.name) - { - Some(self_field) => self_field.try_merge(from_field)?, - None => nested_fields.push(from_field.clone()), - } - } + let mut builder = SchemaBuilder::new(); + nested_fields.iter().chain(from_nested_fields).try_for_each(|f| builder.try_merge(f))?; + *nested_fields = builder.finish().fields; } _ => { return Err(ArrowError::SchemaError( @@ -479,6 +478,7 @@ impl std::fmt::Display for Field { #[cfg(test)] mod test { use super::*; + use crate::Fields; use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; @@ -525,29 +525,29 @@ mod test { let field = Field::new( "struct]>", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ dict1.clone(), Field::new( "list[struct]>]", DataType::List(Box::new(Field::new( "struct]>", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ dict1.clone(), Field::new( "list[struct]", DataType::List(Box::new(Field::new( "struct", - DataType::Struct(vec![dict2.clone()]), + DataType::Struct(vec![dict2.clone()].into()), false, ))), false, ), - ]), + ])), false, ))), false, ), - ]), + ])), false, ); @@ -632,14 +632,18 @@ mod test { fn test_contains_transitivity() { let child_field = Field::new("child1", DataType::Float16, false); - let mut field1 = Field::new("field1", DataType::Struct(vec![child_field]), false); + let mut field1 = Field::new( + "field1", + DataType::Struct(Fields::from(vec![child_field])), + false, + ); field1.set_metadata(HashMap::from([(String::from("k1"), String::from("v1"))])); - let mut field2 = Field::new("field1", DataType::Struct(vec![]), true); + let mut field2 = Field::new("field1", DataType::Struct(Fields::default()), true); field2.set_metadata(HashMap::from([(String::from("k2"), String::from("v2"))])); field2.try_merge(&field1).unwrap(); - let mut field3 = Field::new("field1", DataType::Struct(vec![]), false); + let mut field3 = Field::new("field1", DataType::Struct(Fields::default()), false); field3.set_metadata(HashMap::from([(String::from("k3"), String::from("v3"))])); field3.try_merge(&field2).unwrap(); @@ -665,11 +669,14 @@ mod test { let child_field1 = Field::new("child1", DataType::Float16, false); let child_field2 = Field::new("child2", DataType::Float16, false); - let field1 = - Field::new("field1", DataType::Struct(vec![child_field1.clone()]), true); + let field1 = Field::new( + "field1", + DataType::Struct(vec![child_field1.clone()].into()), + true, + ); let field2 = Field::new( "field1", - DataType::Struct(vec![child_field1, child_field2]), + DataType::Struct(vec![child_field1, child_field2].into()), true, ); diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs new file mode 100644 index 000000000000..26822613666a --- /dev/null +++ b/arrow-schema/src/fields.rs @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{Field, FieldRef};
+use std::ops::Deref;
+use std::sync::Arc;
+
+/// A cheaply cloneable, owned slice of [`FieldRef`]
+///
+/// Similar to `Arc<Vec<FieldRef>>` or `Arc<[FieldRef]>`
+///
+/// Can be constructed in a number of ways
+///
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow_schema::{DataType, Field, Fields};
+/// // Can be constructed from Vec<Field>
+/// Fields::from(vec![Field::new("a", DataType::Boolean, false)]);
+/// // Can be constructed from Vec<FieldRef>
+/// Fields::from(vec![Arc::new(Field::new("a", DataType::Boolean, false))]);
+/// // Can be constructed from an iterator of Field
+/// std::iter::once(Field::new("a", DataType::Boolean, false)).collect::<Fields>();
+/// // Can be constructed from an iterator of FieldRef
+/// std::iter::once(Arc::new(Field::new("a", DataType::Boolean, false))).collect::<Fields>();
+/// ```
+///
+#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
+pub struct Fields(Arc<[FieldRef]>);
+
+impl std::fmt::Debug for Fields {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.as_ref().fmt(f)
+    }
+}
+
+impl Fields {
+    /// Returns a new empty [`Fields`]
+    pub fn empty() -> Self {
+        Self(Arc::new([]))
+    }
+
+    /// Return size of this instance in bytes.
+ pub fn size(&self) -> usize { + self.iter().map(|field| field.size()).sum() + } + + /// Searches for a field by name, returning it along with its index if found + pub fn find(&self, name: &str) -> Option<(usize, &FieldRef)> { + self.0.iter().enumerate().find(|(_, b)| b.name() == name) + } +} + +impl Default for Fields { + fn default() -> Self { + Self::empty() + } +} + +impl FromIterator for Fields { + fn from_iter>(iter: T) -> Self { + iter.into_iter().map(Arc::new).collect() + } +} + +impl FromIterator for Fields { + fn from_iter>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} + +impl From> for Fields { + fn from(value: Vec) -> Self { + value.into_iter().collect() + } +} + +impl From> for Fields { + fn from(value: Vec) -> Self { + Self(value.into()) + } +} + +impl From<&[FieldRef]> for Fields { + fn from(value: &[FieldRef]) -> Self { + Self(value.into()) + } +} + +impl From<[FieldRef; N]> for Fields { + fn from(value: [FieldRef; N]) -> Self { + Self(Arc::new(value)) + } +} + +impl Deref for Fields { + type Target = [FieldRef]; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +impl<'a> IntoIterator for &'a Fields { + type Item = &'a FieldRef; + type IntoIter = std::slice::Iter<'a, FieldRef>; + + fn into_iter(self) -> Self::IntoIter { + self.0.iter() + } +} + +// Manually implement to avoid needing serde rc feature +#[cfg(feature = "serde")] +impl serde::Serialize for Fields { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + use serde::ser::SerializeSeq; + let mut seq = serializer.serialize_seq(Some(self.len()))?; + for e in self.iter() { + seq.serialize_element(e.as_ref())?; + } + seq.end() + } +} + +#[cfg(feature = "serde")] +impl<'de> serde::Deserialize<'de> for Fields { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + Ok(Vec::::deserialize(deserializer)?.into()) + } +} diff --git a/arrow-schema/src/lib.rs b/arrow-schema/src/lib.rs index e977203e9c71..0e9edc7b4b26 100644 --- a/arrow-schema/src/lib.rs +++ b/arrow-schema/src/lib.rs @@ -23,6 +23,8 @@ mod error; pub use error::*; mod field; pub use field::*; +mod fields; +pub use fields::*; mod schema; pub use schema::*; use std::ops; diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 10a72ba0cdf6..6089c1ae5b94 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -18,12 +18,101 @@ use std::collections::HashMap; use std::fmt; use std::hash::Hash; +use std::sync::Arc; use crate::error::ArrowError; use crate::field::Field; +use crate::{FieldRef, Fields}; + +/// A builder to facilitate building a [`Schema`] from iteratively from [`FieldRef`] +#[derive(Debug, Default)] +pub struct SchemaBuilder { + fields: Vec, +} + +impl SchemaBuilder { + /// Creates a new empty [`SchemaBuilder`] + pub fn new() -> Self { + Self::default() + } + + /// Creates a new empty [`SchemaBuilder`] with space for `capacity` fields + pub fn with_capacity(capacity: usize) -> Self { + Self { + fields: Vec::with_capacity(capacity), + } + } + + /// Appends a [`FieldRef`] to this [`SchemaBuilder`] without checking for collision + pub fn push(&mut self, field: impl Into) { + self.fields.push(field.into()) + } + + /// Appends a [`FieldRef`] to this [`SchemaBuilder`] checking for collision + /// + /// If an existing field exists with the same name, calls [`Field::try_merge`] + pub fn try_merge(&mut self, field: &FieldRef) -> Result<(), ArrowError> { + // This could potentially be sped up with a HashMap or similar + let existing 
= self.fields.iter_mut().find(|f| f.name() == field.name()); + match existing { + Some(e) if Arc::ptr_eq(e, field) => {} // Nothing to do + Some(e) => match Arc::get_mut(e) { + Some(e) => e.try_merge(field.as_ref())?, + None => { + let mut t = e.as_ref().clone(); + t.try_merge(field)?; + *e = Arc::new(t) + } + }, + None => self.fields.push(field.clone()), + } + Ok(()) + } + + /// Consume this [`SchemaBuilder`] yielding the final [`Schema`] + pub fn finish(self) -> Schema { + Schema::new(self.fields) + } +} + +impl From<&Fields> for SchemaBuilder { + fn from(value: &Fields) -> Self { + Self { + fields: value.to_vec(), + } + } +} + +impl From for SchemaBuilder { + fn from(value: Fields) -> Self { + Self { + fields: value.to_vec(), + } + } +} + +impl Extend for SchemaBuilder { + fn extend>(&mut self, iter: T) { + let iter = iter.into_iter(); + self.fields.reserve(iter.size_hint().0); + for f in iter { + self.push(f) + } + } +} + +impl Extend for SchemaBuilder { + fn extend>(&mut self, iter: T) { + let iter = iter.into_iter(); + self.fields.reserve(iter.size_hint().0); + for f in iter { + self.push(f) + } + } +} /// A reference-counted reference to a [`Schema`]. -pub type SchemaRef = std::sync::Arc; +pub type SchemaRef = Arc; /// Describes the meta-data of an ordered sequence of relative types. /// @@ -32,7 +121,7 @@ pub type SchemaRef = std::sync::Arc; #[derive(Debug, Clone, PartialEq, Eq)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Schema { - pub fields: Vec, + pub fields: Fields, /// A map of key-value pairs containing additional meta data. pub metadata: HashMap, } @@ -41,7 +130,7 @@ impl Schema { /// Creates an empty `Schema` pub fn empty() -> Self { Self { - fields: vec![], + fields: Default::default(), metadata: HashMap::new(), } } @@ -57,7 +146,7 @@ impl Schema { /// /// let schema = Schema::new(vec![field_a, field_b]); /// ``` - pub fn new(fields: Vec) -> Self { + pub fn new(fields: impl Into) -> Self { Self::new_with_metadata(fields, HashMap::new()) } @@ -79,11 +168,14 @@ impl Schema { /// let schema = Schema::new_with_metadata(vec![field_a, field_b], metadata); /// ``` #[inline] - pub const fn new_with_metadata( - fields: Vec, + pub fn new_with_metadata( + fields: impl Into, metadata: HashMap, ) -> Self { - Self { fields, metadata } + Self { + fields: fields.into(), + metadata, + } } /// Sets the metadata of this `Schema` to be `metadata` and returns self @@ -141,39 +233,34 @@ impl Schema { pub fn try_merge( schemas: impl IntoIterator, ) -> Result { - schemas - .into_iter() - .try_fold(Self::empty(), |mut merged, schema| { - let Schema { metadata, fields } = schema; - for (key, value) in metadata.into_iter() { - // merge metadata - if let Some(old_val) = merged.metadata.get(&key) { - if old_val != &value { - return Err(ArrowError::SchemaError(format!( - "Fail to merge schema due to conflicting metadata. \ + let mut out_meta = HashMap::new(); + let mut out_fields = SchemaBuilder::new(); + for schema in schemas { + let Schema { metadata, fields } = schema; + + // merge metadata + for (key, value) in metadata.into_iter() { + if let Some(old_val) = out_meta.get(&key) { + if old_val != &value { + return Err(ArrowError::SchemaError(format!( + "Fail to merge schema due to conflicting metadata. 
\ Key '{key}' has different values '{old_val}' and '{value}'" - ))); - } + ))); } - merged.metadata.insert(key, value); } - // merge fields - for field in fields.into_iter() { - let merged_field = - merged.fields.iter_mut().find(|f| f.name() == field.name()); - match merged_field { - Some(merged_field) => merged_field.try_merge(&field)?, - // found a new field, add to field list - None => merged.fields.push(field), - } - } - Ok(merged) - }) + out_meta.insert(key, value); + } + + // merge fields + fields.iter().try_for_each(|x| out_fields.try_merge(x))? + } + + Ok(out_fields.finish().with_metadata(out_meta)) } /// Returns an immutable reference of the vector of `Field` instances. #[inline] - pub const fn fields(&self) -> &Vec { + pub const fn fields(&self) -> &Fields { &self.fields } @@ -205,15 +292,13 @@ impl Schema { /// Find the index of the column with the given name. pub fn index_of(&self, name: &str) -> Result { - (0..self.fields.len()) - .find(|idx| self.fields[*idx].name() == name) - .ok_or_else(|| { - let valid_fields: Vec = - self.fields.iter().map(|f| f.name().clone()).collect(); - ArrowError::SchemaError(format!( - "Unable to get field named \"{name}\". Valid fields: {valid_fields:?}" - )) - }) + let (idx, _) = self.fields().find(name).ok_or_else(|| { + let valid_fields: Vec<_> = self.fields.iter().map(|f| f.name()).collect(); + ArrowError::SchemaError(format!( + "Unable to get field named \"{name}\". Valid fields: {valid_fields:?}" + )) + })?; + Ok(idx) } /// Returns an immutable reference to the Map of custom metadata key-value pairs. @@ -225,10 +310,8 @@ impl Schema { /// Look up a column by name and return a immutable reference to the column along with /// its index. pub fn column_with_name(&self, name: &str) -> Option<(usize, &Field)> { - self.fields - .iter() - .enumerate() - .find(|&(_, c)| c.name() == name) + let (idx, field) = self.fields.find(name)?; + Some((idx, field.as_ref())) } /// Check to see if `self` is a superset of `other` schema. 
Here are the comparison rules: @@ -281,10 +364,11 @@ impl Hash for Schema { #[cfg(test)] mod tests { - use super::*; use crate::datatype::DataType; use crate::{TimeUnit, UnionMode}; + use super::*; + #[test] #[cfg(feature = "serde")] fn test_ser_de_metadata() { @@ -525,10 +609,10 @@ mod tests { Field::new("last_name", DataType::Utf8, false), Field::new( "address", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("street", DataType::Utf8, false), Field::new("zip", DataType::UInt16, false), - ]), + ])), false, ), Field::new_dict( @@ -634,7 +718,9 @@ mod tests { Field::new("last_name", DataType::Utf8, false), Field::new( "address", - DataType::Struct(vec![Field::new("zip", DataType::UInt16, false)]), + DataType::Struct( + vec![Field::new("zip", DataType::UInt16, false)].into(), + ), false, ), ]), @@ -644,12 +730,12 @@ mod tests { Field::new("last_name", DataType::Utf8, true), Field::new( "address", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ // add new nested field Field::new("street", DataType::Utf8, false), // nullable merge on nested field Field::new("zip", DataType::UInt16, true), - ]), + ])), false, ), // new field @@ -671,10 +757,10 @@ mod tests { Field::new("last_name", DataType::Utf8, true), Field::new( "address", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("zip", DataType::UInt16, true), Field::new("street", DataType::Utf8, false), - ]), + ])), false, ), Field::new("number", DataType::Utf8, true), diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 0fbbb3868691..0895b99c7f59 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -128,7 +128,7 @@ mod tests { use arrow_array::types::Int32Type; use arrow_array::{Int32Array, StringArray, StructArray}; use arrow_data::ArrayData; - use arrow_schema::{DataType, Field}; + use arrow_schema::{DataType, Field, Fields}; use rand::{thread_rng, Rng}; #[test] @@ -376,10 +376,10 @@ mod tests { /// also need the top level is_valid bits to be correct. fn create_foo_struct(values: Vec) -> StructArray { let mut struct_array = StructBuilder::new( - vec![ + Fields::from(vec![ Field::new("a", DataType::Int32, true), Field::new("b", DataType::Boolean, true), - ], + ]), vec![ Box::new(Int32Builder::with_capacity(values.len())), Box::new(BooleanBuilder::with_capacity(values.len())), diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 76909587db76..cf28c9682ae5 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -157,23 +157,21 @@ where Ok(Arc::new(MapArray::from(unsafe { builder.build_unchecked() }))) } DataType::Struct(fields) => { - let struct_: &StructArray = - values.as_any().downcast_ref::().unwrap(); - let arrays: Result, _> = struct_ + let array: &StructArray = values.as_struct(); + let arrays = array .columns() .iter() .map(|a| take_impl(a.as_ref(), indices, Some(options.clone()))) - .collect(); - let arrays = arrays?; + .collect::, _>>()?; let fields: Vec<(Field, ArrayRef)> = - fields.clone().into_iter().zip(arrays).collect(); + fields.iter().map(|f| f.as_ref().clone()).zip(arrays).collect(); // Create the null bit buffer. 
let is_valid: Buffer = indices .iter() .map(|index| { if let Some(index) = index { - struct_.is_valid(index.to_usize().unwrap()) + array.is_valid(index.to_usize().unwrap()) } else { false } @@ -962,7 +960,7 @@ where mod tests { use super::*; use arrow_array::builder::*; - use arrow_schema::TimeUnit; + use arrow_schema::{Fields, TimeUnit}; fn test_take_decimal_arrays( data: Vec>, @@ -1060,10 +1058,10 @@ mod tests { values: Vec, Option)>>, ) -> StructArray { let mut struct_builder = StructBuilder::new( - vec![ + Fields::from(vec![ Field::new("a", DataType::Boolean, true), Field::new("b", DataType::Int32, true), - ], + ]), vec![ Box::new(BooleanBuilder::with_capacity(values.len())), Box::new(Int32Builder::with_capacity(values.len())), diff --git a/arrow/examples/dynamic_types.rs b/arrow/examples/dynamic_types.rs index d4aec4d38423..cb26a0d33f1e 100644 --- a/arrow/examples/dynamic_types.rs +++ b/arrow/examples/dynamic_types.rs @@ -27,6 +27,7 @@ use arrow::record_batch::*; #[cfg(feature = "prettyprint")] use arrow::util::pretty::print_batches; +use arrow_schema::Fields; fn main() -> Result<()> { // define schema @@ -34,11 +35,11 @@ fn main() -> Result<()> { Field::new("id", DataType::Int32, false), Field::new( "nested", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("a", DataType::Utf8, false), Field::new("b", DataType::Float64, false), Field::new("c", DataType::Float64, false), - ]), + ])), false, ), ]); diff --git a/arrow/src/compute/kernels/limit.rs b/arrow/src/compute/kernels/limit.rs index 0d92e98cf718..357e9b13ae82 100644 --- a/arrow/src/compute/kernels/limit.rs +++ b/arrow/src/compute/kernels/limit.rs @@ -161,7 +161,7 @@ mod tests { Field::new("a", DataType::Boolean, true), Field::new("b", DataType::Int32, true), ]; - let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) + let struct_array_data = ArrayData::builder(DataType::Struct(field_types.into())) .len(5) .add_child_data(boolean_data.clone()) .add_child_data(int_data.clone()) diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index c25240096812..d1977d42bba0 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -29,7 +29,8 @@ pub use arrow_array::{ pub use arrow_buffer::{i256, ArrowNativeType, ToByteSlice}; pub use arrow_data::decimal::*; pub use arrow_schema::{ - DataType, Field, IntervalUnit, Schema, SchemaRef, TimeUnit, UnionMode, + DataType, Field, FieldRef, Fields, IntervalUnit, Schema, SchemaBuilder, SchemaRef, + TimeUnit, UnionMode, }; #[cfg(feature = "ffi")] diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 0956893a870d..1983ea72d2fb 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -249,6 +249,7 @@ fn create_random_null_buffer(size: usize, null_density: f32) -> Buffer { #[cfg(test)] mod tests { use super::*; + use arrow_schema::Fields; #[test] fn test_create_batch() { @@ -298,7 +299,7 @@ mod tests { #[test] fn test_create_struct_array() { let size = 32; - let struct_fields = vec![ + let struct_fields = Fields::from(vec![ Field::new("b", DataType::Boolean, true), Field::new( "c", @@ -315,14 +316,14 @@ mod tests { ), Field::new( "d", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("d_x", DataType::Int32, true), Field::new("d_y", DataType::Float32, false), Field::new("d_z", DataType::Binary, true), - ]), + ])), true, ), - ]; + ]); let field = Field::new("struct", DataType::Struct(struct_fields), true); let array = create_random_array(&field, size, 0.2, 
0.5).unwrap(); diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 7ee65a3575cd..b113ec04ccab 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -40,7 +40,9 @@ use arrow_buffer::{i256, Buffer}; use arrow_cast::pretty::pretty_format_columns; use arrow_cast::{can_cast_types, cast}; use arrow_data::ArrayData; -use arrow_schema::{ArrowError, DataType, Field, IntervalUnit, TimeUnit, UnionMode}; +use arrow_schema::{ + ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit, UnionMode, +}; use half::f16; use std::sync::Arc; @@ -398,10 +400,10 @@ fn get_all_types() -> Vec { FixedSizeList(Box::new(Field::new("item", DataType::Utf8, false)), 10), LargeList(Box::new(Field::new("item", DataType::Int8, true))), LargeList(Box::new(Field::new("item", DataType::Utf8, false))), - Struct(vec![ + Struct(Fields::from(vec![ Field::new("f1", DataType::Int32, true), Field::new("f2", DataType::Utf8, true), - ]), + ])), Union( vec![ Field::new("f1", DataType::Int32, false), diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index af81b17e4aa8..dbbeb934d37c 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -26,7 +26,7 @@ use arrow_array::builder::{StringBuilder, StructBuilder}; use arrow_array::{DictionaryArray, FixedSizeListArray}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{DataType, Field}; +use arrow_schema::{DataType, Field, Fields}; use std::sync::Arc; #[test] @@ -806,10 +806,10 @@ fn test_struct_equal_null() { ])); let ints_non_null: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 0])); - let a = ArrayData::builder(DataType::Struct(vec![ + let a = ArrayData::builder(DataType::Struct(Fields::from(vec![ Field::new("f1", DataType::Utf8, true), Field::new("f2", DataType::Int32, true), - ])) + ]))) .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) .len(5) .add_child_data(strings.to_data()) @@ -818,10 +818,10 @@ fn test_struct_equal_null() { .unwrap(); let a = make_array(a); - let b = ArrayData::builder(DataType::Struct(vec![ + let b = ArrayData::builder(DataType::Struct(Fields::from(vec![ Field::new("f1", DataType::Utf8, true), Field::new("f2", DataType::Int32, true), - ])) + ]))) .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) .len(5) .add_child_data(strings.to_data()) @@ -834,10 +834,10 @@ fn test_struct_equal_null() { // test with arrays that are not equal let c_ints_non_null: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 0, 4])); - let c = ArrayData::builder(DataType::Struct(vec![ + let c = ArrayData::builder(DataType::Struct(Fields::from(vec![ Field::new("f1", DataType::Utf8, true), Field::new("f2", DataType::Int32, true), - ])) + ]))) .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) .len(5) .add_child_data(strings.to_data()) @@ -849,11 +849,9 @@ fn test_struct_equal_null() { test_equal(&a, &c, false); // test a nested struct - let a = ArrayData::builder(DataType::Struct(vec![Field::new( - "f3", - a.data_type().clone(), - true, - )])) + let a = ArrayData::builder(DataType::Struct( + vec![Field::new("f3", a.data_type().clone(), true)].into(), + )) .null_bit_buffer(Some(Buffer::from(vec![0b00011110]))) .len(5) .add_child_data(a.to_data()) @@ -869,10 +867,10 @@ fn test_struct_equal_null() { Some("mark"), Some("doe"), ])); - let b = ArrayData::builder(DataType::Struct(vec![ + let b = ArrayData::builder(DataType::Struct(Fields::from(vec![ Field::new("f1", DataType::Utf8, true), Field::new("f2", DataType::Int32, true), - ])) + ]))) 
.null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) .len(5) .add_child_data(strings.to_data()) @@ -880,11 +878,9 @@ fn test_struct_equal_null() { .build() .unwrap(); - let b = ArrayData::builder(DataType::Struct(vec![Field::new( - "f3", - b.data_type().clone(), - true, - )])) + let b = ArrayData::builder(DataType::Struct( + vec![Field::new("f3", b.data_type().clone(), true)].into(), + )) .null_bit_buffer(Some(Buffer::from(vec![0b00011110]))) .len(5) .add_child_data(b) @@ -913,11 +909,9 @@ fn test_struct_equal_null_variable_size() { Some("doe"), ])); - let a = ArrayData::builder(DataType::Struct(vec![Field::new( - "f1", - DataType::Utf8, - true, - )])) + let a = ArrayData::builder(DataType::Struct( + vec![Field::new("f1", DataType::Utf8, true)].into(), + )) .null_bit_buffer(Some(Buffer::from(vec![0b00001010]))) .len(5) .add_child_data(strings1.to_data()) @@ -925,11 +919,9 @@ fn test_struct_equal_null_variable_size() { .unwrap(); let a = make_array(a); - let b = ArrayData::builder(DataType::Struct(vec![Field::new( - "f1", - DataType::Utf8, - true, - )])) + let b = ArrayData::builder(DataType::Struct( + vec![Field::new("f1", DataType::Utf8, true)].into(), + )) .null_bit_buffer(Some(Buffer::from(vec![0b00001010]))) .len(5) .add_child_data(strings2.to_data()) @@ -947,11 +939,9 @@ fn test_struct_equal_null_variable_size() { Some("doe"), Some("joe"), ])); - let c = ArrayData::builder(DataType::Struct(vec![Field::new( - "f1", - DataType::Utf8, - true, - )])) + let c = ArrayData::builder(DataType::Struct( + vec![Field::new("f1", DataType::Utf8, true)].into(), + )) .null_bit_buffer(Some(Buffer::from(vec![0b00001011]))) .len(5) .add_child_data(strings3.to_data()) diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 34ef6cbae428..57816306ba4e 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -25,7 +25,7 @@ use arrow::datatypes::Int16Type; use arrow_buffer::Buffer; use arrow_data::transform::MutableArrayData; use arrow_data::ArrayData; -use arrow_schema::{DataType, Field}; +use arrow_schema::{DataType, Field, Fields}; use std::sync::Arc; fn create_decimal_array( @@ -778,10 +778,10 @@ fn test_map_nulls_append() { DataType::Map( Box::new(Field::new( "entries", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("keys", DataType::Int64, false), Field::new("values", DataType::Int64, true), - ]), + ])), false, )), false, diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index e8485e961f45..73e013ff1c15 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -386,7 +386,7 @@ fn test_validate_struct_child_type() { // validate the the type of struct fields matches child fields ArrayData::try_new( - DataType::Struct(vec![Field::new("field1", DataType::Int64, true)]), + DataType::Struct(vec![Field::new("field1", DataType::Int64, true)].into()), 3, None, 0, @@ -407,7 +407,7 @@ fn test_validate_struct_child_length() { .collect::(); ArrayData::try_new( - DataType::Struct(vec![Field::new("field1", DataType::Int32, true)]), + DataType::Struct(vec![Field::new("field1", DataType::Int32, true)].into()), 6, None, 0, diff --git a/arrow/tests/schema.rs b/arrow/tests/schema.rs index ff544b68937b..f252d77ca65d 100644 --- a/arrow/tests/schema.rs +++ b/arrow/tests/schema.rs @@ -29,18 +29,19 @@ fn schema_destructure() { let field = Field::new("c1", DataType::Utf8, false); let schema = Schema::new(vec![field]).with_metadata(meta); - // Destructuring a Schema allows rewriting fields 
and metadata + // Destructuring a Schema allows rewriting metadata // without copying // // Model this usecase below: let Schema { - mut fields, - metadata, + fields, + mut metadata, } = schema; - fields.push(Field::new("c2", DataType::Utf8, false)); + + metadata.insert("foo".to_string(), "bar".to_string()); let new_schema = Schema::new(fields).with_metadata(metadata); - assert_eq!(new_schema.fields().len(), 2); + assert_eq!(new_schema.metadata.get("foo").unwrap(), "bar"); } diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index a590ceb5911c..818fe0b3e49b 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -233,25 +233,25 @@ fn _create_nested_bench_batch( let fields = vec![ Field::new( "_1", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("_1", DataType::Int8, true), Field::new( "_2", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("_1", DataType::Int8, true), Field::new( "_1", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("_1", DataType::Int8, true), Field::new("_2", DataType::Utf8, true), - ]), + ])), true, ), Field::new("_2", DataType::UInt8, true), - ]), + ])), true, ), - ]), + ])), true, ), Field::new( @@ -260,14 +260,14 @@ fn _create_nested_bench_batch( "item", DataType::List(Box::new(Field::new( "item", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new( "_1", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("_1", DataType::Int8, true), Field::new("_2", DataType::Int16, true), Field::new("_3", DataType::Int32, true), - ]), + ])), true, ), Field::new( @@ -279,7 +279,7 @@ fn _create_nested_bench_batch( ))), true, ), - ]), + ])), true, ))), true, diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs index 246bccfece4e..60cc84f9f8d4 100644 --- a/parquet/src/arrow/array_reader/builder.rs +++ b/parquet/src/arrow/array_reader/builder.rs @@ -17,7 +17,7 @@ use std::sync::Arc; -use arrow_schema::DataType; +use arrow_schema::{DataType, Fields, SchemaBuilder}; use crate::arrow::array_reader::empty_array::make_empty_array_reader; use crate::arrow::array_reader::fixed_len_byte_array::make_fixed_len_byte_array_reader; @@ -82,6 +82,7 @@ fn build_map_reader( match (key_reader, value_reader) { (Some(key_reader), Some(value_reader)) => { + // Need to retrieve underlying data type to handle projection let key_type = key_reader.get_data_type().clone(); let value_type = value_reader.get_data_type().clone(); @@ -89,11 +90,12 @@ fn build_map_reader( DataType::Map(map_field, is_sorted) => match map_field.data_type() { DataType::Struct(fields) => { assert_eq!(fields.len(), 2); - let struct_field = - map_field.clone().with_data_type(DataType::Struct(vec![ - fields[0].clone().with_data_type(key_type), - fields[1].clone().with_data_type(value_type), - ])); + let struct_field = map_field.clone().with_data_type( + DataType::Struct(Fields::from(vec![ + fields[0].as_ref().clone().with_data_type(key_type), + fields[1].as_ref().clone().with_data_type(value_type), + ])), + ); DataType::Map(Box::new(struct_field), *is_sorted) } _ => unreachable!(), @@ -111,11 +113,9 @@ fn build_map_reader( )))) } (None, None) => Ok(None), - _ => { - Err(general_err!( - "partial projection of MapArray is not supported" - )) - } + _ => Err(general_err!( + "partial projection of MapArray is not supported" + )), } } @@ -131,6 +131,7 @@ fn build_list_reader( let reader = match build_reader(&children[0], 
mask, row_groups)? { Some(item_reader) => { + // Need to retrieve underlying data type to handle projection let item_type = item_reader.get_data_type().clone(); let data_type = match &field.arrow_type { DataType::List(f) => { @@ -270,12 +271,13 @@ fn build_struct_reader( assert_eq!(arrow_fields.len(), children.len()); let mut readers = Vec::with_capacity(children.len()); - let mut projected_fields = Vec::with_capacity(children.len()); + let mut builder = SchemaBuilder::with_capacity(children.len()); for (arrow, parquet) in arrow_fields.iter().zip(children) { if let Some(reader) = build_reader(parquet, mask, row_groups)? { + // Need to retrieve underlying data type to handle projection let child_type = reader.get_data_type().clone(); - projected_fields.push(arrow.clone().with_data_type(child_type)); + builder.push(arrow.as_ref().clone().with_data_type(child_type)); readers.push(reader); } } @@ -285,7 +287,7 @@ fn build_struct_reader( } Ok(Some(Box::new(StructArrayReader::new( - DataType::Struct(projected_fields), + DataType::Struct(builder.finish().fields), readers, field.def_level, field.rep_level, @@ -321,11 +323,11 @@ mod tests { build_array_reader(fields.as_ref(), &mask, &file_reader).unwrap(); // Create arrow types - let arrow_type = DataType::Struct(vec![Field::new( + let arrow_type = DataType::Struct(Fields::from(vec![Field::new( "b_struct", - DataType::Struct(vec![Field::new("b_c_int", DataType::Int32, true)]), + DataType::Struct(vec![Field::new("b_c_int", DataType::Int32, true)].into()), true, - )]); + )])); assert_eq!(array_reader.get_data_type(), &arrow_type); } diff --git a/parquet/src/arrow/array_reader/empty_array.rs b/parquet/src/arrow/array_reader/empty_array.rs index 2a3711fa0309..51673f2f8cf2 100644 --- a/parquet/src/arrow/array_reader/empty_array.rs +++ b/parquet/src/arrow/array_reader/empty_array.rs @@ -17,7 +17,7 @@ use crate::arrow::array_reader::ArrayReader; use crate::errors::Result; -use arrow_schema::DataType as ArrowType; +use arrow_schema::{DataType as ArrowType, Fields}; use arrow_array::{ArrayRef, StructArray}; use arrow_data::ArrayDataBuilder; use std::any::Any; @@ -40,7 +40,7 @@ struct EmptyArrayReader { impl EmptyArrayReader { pub fn new(row_count: usize) -> Self { Self { - data_type: ArrowType::Struct(vec![]), + data_type: ArrowType::Struct(Fields::empty()), remaining_rows: row_count, need_consume_records: 0, } diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index dbbac657ebd1..6218a5466da2 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -261,6 +261,7 @@ mod tests { use arrow::datatypes::{Field, Int32Type as ArrowInt32, Int32Type}; use arrow_array::{Array, PrimitiveArray}; use arrow_data::ArrayDataBuilder; + use arrow_schema::Fields; use std::sync::Arc; fn list_type( @@ -581,15 +582,17 @@ mod tests { assert_eq!(batch.data_type(), array_reader.get_data_type()); assert_eq!( batch.data_type(), - &ArrowType::Struct(vec![Field::new( + &ArrowType::Struct(Fields::from(vec![Field::new( "table_info", ArrowType::List(Box::new(Field::new( "table_info", - ArrowType::Struct(vec![Field::new("name", ArrowType::Binary, false)]), + ArrowType::Struct( + vec![Field::new("name", ArrowType::Binary, false)].into() + ), false ))), false - )]) + )])) ); assert_eq!(batch.len(), 0); } diff --git a/parquet/src/arrow/array_reader/map_array.rs b/parquet/src/arrow/array_reader/map_array.rs index cd1a76e86388..621292ee7900 100644 --- 
a/parquet/src/arrow/array_reader/map_array.rs +++ b/parquet/src/arrow/array_reader/map_array.rs @@ -129,6 +129,7 @@ mod tests { use arrow_array::builder::{MapBuilder, PrimitiveBuilder, StringBuilder}; use arrow_array::cast::*; use arrow_array::RecordBatch; + use arrow_schema::Fields; use bytes::Bytes; #[test] @@ -150,10 +151,10 @@ mod tests { ArrowType::Map( Box::new(Field::new( "entries", - ArrowType::Struct(vec![ + ArrowType::Struct(Fields::from(vec![ Field::new("keys", ArrowType::Utf8, false), Field::new("values", ArrowType::Int32, true), - ]), + ])), false, )), false, // Map field not sorted diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index 91e839fc1890..22724ae3f081 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -218,6 +218,7 @@ mod tests { use arrow::buffer::Buffer; use arrow::datatypes::Field; use arrow_array::{Array, Int32Array, ListArray}; + use arrow_schema::Fields; #[test] fn test_struct_array_reader() { @@ -237,10 +238,10 @@ mod tests { Some(vec![0, 1, 1, 1, 1]), ); - let struct_type = ArrowType::Struct(vec![ + let struct_type = ArrowType::Struct(Fields::from(vec![ Field::new("f1", array_1.data_type().clone(), true), Field::new("f2", array_2.data_type().clone(), true), - ]); + ])); let mut struct_array_reader = StructArrayReader::new( struct_type, diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 6c8d08de251d..8464b959215d 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -704,7 +704,7 @@ mod tests { use arrow_array::{RecordBatch, RecordBatchReader}; use arrow_buffer::Buffer; use arrow_data::ArrayDataBuilder; - use arrow_schema::{DataType as ArrowDataType, Field, Schema}; + use arrow_schema::{DataType as ArrowDataType, Field, Fields, Schema}; use crate::arrow::arrow_reader::{ ArrowPredicateFn, ArrowReaderOptions, ParquetRecordBatchReader, @@ -1104,16 +1104,17 @@ mod tests { fn test_decimal_nullable_struct() { let decimals = Decimal128Array::from_iter_values([1, 2, 3, 4, 5, 6, 7, 8]); - let data = ArrayDataBuilder::new(ArrowDataType::Struct(vec![Field::new( - "decimals", - decimals.data_type().clone(), - false, - )])) - .len(8) - .null_bit_buffer(Some(Buffer::from(&[0b11101111]))) - .child_data(vec![decimals.into_data()]) - .build() - .unwrap(); + let data = + ArrayDataBuilder::new(ArrowDataType::Struct(Fields::from(vec![Field::new( + "decimals", + decimals.data_type().clone(), + false, + )]))) + .len(8) + .null_bit_buffer(Some(Buffer::from(&[0b11101111]))) + .child_data(vec![decimals.into_data()]) + .build() + .unwrap(); let written = RecordBatch::try_from_iter([( "struct", @@ -1140,16 +1141,17 @@ mod tests { #[test] fn test_int32_nullable_struct() { let int32 = Int32Array::from_iter_values([1, 2, 3, 4, 5, 6, 7, 8]); - let data = ArrayDataBuilder::new(ArrowDataType::Struct(vec![Field::new( - "int32", - int32.data_type().clone(), - false, - )])) - .len(8) - .null_bit_buffer(Some(Buffer::from(&[0b11101111]))) - .child_data(vec![int32.into_data()]) - .build() - .unwrap(); + let data = + ArrayDataBuilder::new(ArrowDataType::Struct(Fields::from(vec![Field::new( + "int32", + int32.data_type().clone(), + false, + )]))) + .len(8) + .null_bit_buffer(Some(Buffer::from(&[0b11101111]))) + .child_data(vec![int32.into_data()]) + .build() + .unwrap(); let written = RecordBatch::try_from_iter([( "struct", @@ -1867,19 +1869,19 @@ mod tests { let expected_schema = 
Schema::new(vec![ Field::new( "roll_num", - ArrowDataType::Struct(vec![Field::new( + ArrowDataType::Struct(Fields::from(vec![Field::new( "count", ArrowDataType::UInt64, false, - )]), + )])), false, ), Field::new( "PC_CUR", - ArrowDataType::Struct(vec![ + ArrowDataType::Struct(Fields::from(vec![ Field::new("mean", ArrowDataType::Int64, false), Field::new("sum", ArrowDataType::Int64, false), - ]), + ])), false, ), ]); @@ -1947,11 +1949,13 @@ mod tests { let reader = builder.with_projection(mask).build().unwrap(); - let expected_schema = Schema::new(vec![Field::new( + let expected_schema = Schema::new(Fields::from(vec![Field::new( "group", - ArrowDataType::Struct(vec![Field::new("leaf", ArrowDataType::Int32, false)]), + ArrowDataType::Struct( + vec![Field::new("leaf", ArrowDataType::Int32, false)].into(), + ), true, - )]); + )])); let batch = reader.into_iter().next().unwrap().unwrap(); assert_eq!(batch.schema().as_ref(), &expected_schema); diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index f21931d00884..9a6a97df4467 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -486,7 +486,7 @@ mod tests { use arrow_buffer::{Buffer, ToByteSlice}; use arrow_cast::display::array_value_to_string; use arrow_data::ArrayDataBuilder; - use arrow_schema::Schema; + use arrow_schema::{Fields, Schema}; #[test] fn test_calculate_array_levels_twitter_example() { @@ -947,7 +947,7 @@ mod tests { ); let struct_field_e = Field::new( "e", - DataType::Struct(vec![struct_field_f.clone(), struct_field_g.clone()]), + DataType::Struct(vec![struct_field_f.clone(), struct_field_g.clone()].into()), true, ); let schema = Schema::new(vec![ @@ -955,7 +955,9 @@ mod tests { Field::new("b", DataType::Int32, true), Field::new( "c", - DataType::Struct(vec![struct_field_d.clone(), struct_field_e.clone()]), + DataType::Struct( + vec![struct_field_d.clone(), struct_field_e.clone()].into(), + ), true, // https://github.com/apache/arrow-rs/issues/245 ), ]); @@ -1067,7 +1069,7 @@ mod tests { let offset_field = Field::new("offset", DataType::Int32, true); let schema = Schema::new(vec![Field::new( "some_nested_object", - DataType::Struct(vec![offset_field.clone()]), + DataType::Struct(vec![offset_field.clone()].into()), false, )]); @@ -1090,7 +1092,7 @@ mod tests { let offset_field = Field::new("offset", DataType::Int32, true); let schema = Schema::new(vec![Field::new( "some_nested_object", - DataType::Struct(vec![offset_field.clone()]), + DataType::Struct(vec![offset_field.clone()].into()), true, )]); @@ -1122,10 +1124,10 @@ mod tests { {"stocks":{"long": "$CCC", "short": null}} {"stocks":{"hedged": "$YYY", "long": null, "short": "$D"}} "#; - let entries_struct_type = DataType::Struct(vec![ + let entries_struct_type = DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", DataType::Utf8, true), - ]); + ])); let stocks_field = Field::new( "stocks", DataType::Map( @@ -1182,13 +1184,12 @@ mod tests { fn test_list_of_struct() { // define schema let int_field = Field::new("a", DataType::Int32, true); - let item_field = - Field::new("item", DataType::Struct(vec![int_field.clone()]), true); + let fields = Fields::from([Arc::new(int_field)]); + let item_field = Field::new("item", DataType::Struct(fields.clone()), true); let list_field = Field::new("list", DataType::List(Box::new(item_field)), true); let int_builder = Int32Builder::with_capacity(10); - let struct_builder = - 
StructBuilder::new(vec![int_field], vec![Box::new(int_builder)]); + let struct_builder = StructBuilder::new(fields, vec![Box::new(int_builder)]); let mut list_builder = ListBuilder::new(struct_builder); // [{a: 1}], [], null, [null, null], [{a: null}], [{a: 2}] diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 2d867c9596c7..94c19cb2e9e0 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -639,6 +639,7 @@ mod tests { use arrow::util::pretty::pretty_format_batches; use arrow::{array::*, buffer::Buffer}; use arrow_array::RecordBatch; + use arrow_schema::Fields; use crate::basic::Encoding; use crate::file::metadata::ParquetMetaData; @@ -899,22 +900,24 @@ mod tests { ); let struct_field_e = Field::new( "e", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ struct_field_f.clone(), struct_field_g.clone(), struct_field_h.clone(), - ]), + ])), false, ); - let schema = Schema::new(vec![ + let schema = Schema::new(Fields::from(vec![ Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, true), Field::new( "c", - DataType::Struct(vec![struct_field_d.clone(), struct_field_e.clone()]), + DataType::Struct( + vec![struct_field_d.clone(), struct_field_e.clone()].into(), + ), false, ), - ]); + ])); // create some data let a = Int32Array::from(vec![1, 2, 3, 4, 5]); @@ -980,11 +983,11 @@ mod tests { let topic_field = Field::new("topic", DataType::Utf8, true); let schema = Schema::new(vec![Field::new( "some_nested_object", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ offset_field.clone(), partition_field.clone(), topic_field.clone(), - ]), + ])), false, )]); @@ -1015,10 +1018,10 @@ mod tests { {"stocks":{"long": null, "long": "$CCC", "short": null}} {"stocks":{"hedged": "$YYY", "long": null, "short": "$D"}} "#; - let entries_struct_type = DataType::Struct(vec![ + let entries_struct_type = DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", DataType::Utf8, true), - ]); + ])); let stocks_field = Field::new( "stocks", DataType::Map( @@ -1039,8 +1042,9 @@ mod tests { fn arrow_writer_2_level_struct() { // tests writing > let field_c = Field::new("c", DataType::Int32, true); - let field_b = Field::new("b", DataType::Struct(vec![field_c]), true); - let field_a = Field::new("a", DataType::Struct(vec![field_b.clone()]), true); + let field_b = Field::new("b", DataType::Struct(vec![field_c].into()), true); + let type_a = DataType::Struct(vec![field_b.clone()].into()); + let field_a = Field::new("a", type_a, true); let schema = Schema::new(vec![field_a.clone()]); // create data @@ -1073,19 +1077,21 @@ mod tests { fn arrow_writer_2_level_struct_non_null() { // tests writing > let field_c = Field::new("c", DataType::Int32, false); - let field_b = Field::new("b", DataType::Struct(vec![field_c]), false); - let field_a = Field::new("a", DataType::Struct(vec![field_b.clone()]), false); - let schema = Schema::new(vec![field_a.clone()]); + let type_b = DataType::Struct(vec![field_c].into()); + let field_b = Field::new("b", type_b.clone(), false); + let type_a = DataType::Struct(vec![field_b].into()); + let field_a = Field::new("a", type_a.clone(), false); + let schema = Schema::new(vec![field_a]); // create data let c = Int32Array::from(vec![1, 2, 3, 4, 5, 6]); - let b_data = ArrayDataBuilder::new(field_b.data_type().clone()) + let b_data = ArrayDataBuilder::new(type_b) .len(6) .add_child_data(c.into_data()) .build() .unwrap(); let b = 
StructArray::from(b_data); - let a_data = ArrayDataBuilder::new(field_a.data_type().clone()) + let a_data = ArrayDataBuilder::new(type_a) .len(6) .add_child_data(b.into_data()) .build() @@ -1105,13 +1111,15 @@ mod tests { fn arrow_writer_2_level_struct_mixed_null() { // tests writing > let field_c = Field::new("c", DataType::Int32, false); - let field_b = Field::new("b", DataType::Struct(vec![field_c]), true); - let field_a = Field::new("a", DataType::Struct(vec![field_b.clone()]), false); - let schema = Schema::new(vec![field_a.clone()]); + let type_b = DataType::Struct(vec![field_c].into()); + let field_b = Field::new("b", type_b.clone(), true); + let type_a = DataType::Struct(vec![field_b].into()); + let field_a = Field::new("a", type_a.clone(), false); + let schema = Schema::new(vec![field_a]); // create data let c = Int32Array::from(vec![1, 2, 3, 4, 5, 6]); - let b_data = ArrayDataBuilder::new(field_b.data_type().clone()) + let b_data = ArrayDataBuilder::new(type_b) .len(6) .null_bit_buffer(Some(Buffer::from(vec![0b00100111]))) .add_child_data(c.into_data()) @@ -1119,7 +1127,7 @@ mod tests { .unwrap(); let b = StructArray::from(b_data); // a intentionally has no null buffer, to test that this is handled correctly - let a_data = ArrayDataBuilder::new(field_a.data_type().clone()) + let a_data = ArrayDataBuilder::new(type_a) .len(6) .add_child_data(b.into_data()) .build() @@ -2244,13 +2252,16 @@ mod tests { let field_b = Field::new("leaf_b", DataType::Int32, true); let struct_a = Field::new( "struct_a", - DataType::Struct(vec![field_a.clone(), field_b.clone()]), + DataType::Struct(vec![field_a.clone(), field_b.clone()].into()), true, ); let list_a = Field::new("list", DataType::List(Box::new(struct_a)), true); - let struct_b = - Field::new("struct_b", DataType::Struct(vec![list_a.clone()]), false); + let struct_b = Field::new( + "struct_b", + DataType::Struct(vec![list_a.clone()].into()), + false, + ); let schema = Arc::new(Schema::new(vec![struct_b])); diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 70cee9ef9ab4..ad6ded1b842f 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -16,6 +16,7 @@ // under the License. 
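For reference, the construction pattern these hunks migrate to looks roughly like the sketch below. It is assembled only from `arrow_schema` items that appear in the surrounding diffs (`Fields::from`, `DataType::Struct`, `SchemaBuilder::with_capacity`, `push`, `finish`); the helper name is hypothetical and exact signatures may differ between releases, so treat it as illustrative rather than normative.

    use arrow_schema::{DataType, Field, Fields, Schema, SchemaBuilder};

    fn example_schema() -> Schema {
        // Struct children are now held in `Fields` (a shared, immutable field list)
        // rather than a plain `Vec<Field>`.
        let address = DataType::Struct(Fields::from(vec![
            Field::new("street", DataType::Utf8, false),
            Field::new("zip", DataType::UInt16, false),
        ]));

        // `SchemaBuilder` accumulates fields incrementally, mirroring how the
        // parquet struct reader in this patch assembles its projected schema.
        let mut builder = SchemaBuilder::with_capacity(2);
        builder.push(Field::new("id", DataType::Int32, false));
        builder.push(Field::new("address", address, true));
        builder.finish()
    }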
use std::collections::HashMap; +use std::sync::Arc; use crate::arrow::schema::primitive::convert_primitive; use crate::arrow::ProjectionMask; @@ -23,7 +24,7 @@ use crate::basic::{ConvertedType, Repetition}; use crate::errors::ParquetError; use crate::errors::Result; use crate::schema::types::{SchemaDescriptor, Type, TypePtr}; -use arrow_schema::{DataType, Field, Schema}; +use arrow_schema::{DataType, Field, Fields, Schema, SchemaBuilder}; fn get_repetition(t: &Type) -> Repetition { let info = t.get_basic_info(); @@ -193,7 +194,7 @@ impl Visitor { None => None, }; - let mut child_fields = Vec::with_capacity(parquet_fields.len()); + let mut child_fields = SchemaBuilder::with_capacity(parquet_fields.len()); let mut children = Vec::with_capacity(parquet_fields.len()); // Perform a DFS of children @@ -213,7 +214,7 @@ impl Visitor { None => None, }; - let arrow_field = arrow_fields.map(|x| &x[idx]); + let arrow_field = arrow_fields.map(|x| &*x[idx]); let child_ctx = VisitorContext { rep_level, def_level, @@ -236,7 +237,7 @@ impl Visitor { rep_level, def_level, nullable, - arrow_type: DataType::Struct(child_fields), + arrow_type: DataType::Struct(child_fields.finish().fields), field_type: ParquetFieldType::Group { children }, }; @@ -302,7 +303,7 @@ impl Visitor { )); } - (Some(field), Some(&fields[0]), Some(&fields[1]), *sorted) + (Some(field), Some(&*fields[0]), Some(&*fields[1]), *sorted) } d => { return Err(arrow_err!( @@ -343,8 +344,8 @@ impl Visitor { // Need both columns to be projected match (maybe_key, maybe_value) { (Some(key), Some(value)) => { - let key_field = convert_field(map_key, &key, arrow_key); - let value_field = convert_field(map_value, &value, arrow_value); + let key_field = Arc::new(convert_field(map_key, &key, arrow_key)); + let value_field = Arc::new(convert_field(map_value, &value, arrow_value)); let field_metadata = match arrow_map { Some(field) => field.metadata().clone(), _ => HashMap::default(), @@ -352,7 +353,7 @@ impl Visitor { let map_field = Field::new( map_key_value.name(), - DataType::Struct(vec![key_field, value_field]), + DataType::Struct(Fields::from([key_field, value_field])), false, // The inner map field is always non-nullable (#1697) ) .with_metadata(field_metadata); diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index a63d859aaf7b..09109d290e22 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -29,7 +29,7 @@ use std::collections::HashMap; use std::sync::Arc; use arrow_ipc::writer; -use arrow_schema::{DataType, Field, Schema, TimeUnit}; +use arrow_schema::{DataType, Field, Fields, Schema, TimeUnit}; use crate::basic::{ ConvertedType, LogicalType, Repetition, TimeUnit as ParquetTimeUnit, @@ -96,7 +96,7 @@ pub(crate) fn parquet_to_array_schema_and_fields( )), _ => unreachable!(), }, - None => Ok((Schema::new_with_metadata(vec![], metadata), None)), + None => Ok((Schema::new_with_metadata(Fields::empty(), metadata), None)), } } @@ -551,7 +551,7 @@ mod tests { let converted_arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); - let arrow_fields = vec![ + let arrow_fields = Fields::from(vec![ Field::new("boolean", DataType::Boolean, false), Field::new("int8", DataType::Int8, false), Field::new("int16", DataType::Int16, false), @@ -564,7 +564,7 @@ mod tests { Field::new("string", DataType::Utf8, true), Field::new("string_2", DataType::Utf8, true), Field::new("json", DataType::Utf8, true), - ]; + ]); assert_eq!(&arrow_fields, converted_arrow_schema.fields()); } @@ -586,12 
+586,12 @@ mod tests { let converted_arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); - let arrow_fields = vec![ + let arrow_fields = Fields::from(vec![ Field::new("decimal1", DataType::Decimal128(4, 2), false), Field::new("decimal2", DataType::Decimal128(12, 2), false), Field::new("decimal3", DataType::Decimal128(30, 2), false), Field::new("decimal4", DataType::Decimal128(33, 2), false), - ]; + ]); assert_eq!(&arrow_fields, converted_arrow_schema.fields()); } @@ -610,10 +610,10 @@ mod tests { let converted_arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); - let arrow_fields = vec![ + let arrow_fields = Fields::from(vec![ Field::new("binary", DataType::Binary, false), Field::new("fixed_binary", DataType::FixedSizeBinary(20), false), - ]; + ]); assert_eq!(&arrow_fields, converted_arrow_schema.fields()); } @@ -632,10 +632,10 @@ mod tests { let converted_arrow_schema = parquet_to_arrow_schema(&parquet_schema, None).unwrap(); - let arrow_fields = vec![ + let arrow_fields = Fields::from(vec![ Field::new("boolean", DataType::Boolean, false), Field::new("int8", DataType::Int8, false), - ]; + ]); assert_eq!(&arrow_fields, converted_arrow_schema.fields()); let converted_arrow_schema = parquet_to_arrow_schema_by_columns( @@ -786,10 +786,10 @@ mod tests { // }; // } { - let arrow_struct = DataType::Struct(vec![ + let arrow_struct = DataType::Struct(Fields::from(vec![ Field::new("str", DataType::Utf8, false), Field::new("num", DataType::Int32, false), - ]); + ])); arrow_fields.push(Field::new( "my_list", DataType::List(Box::new(Field::new("element", arrow_struct, false))), @@ -805,8 +805,8 @@ mod tests { // } // Special case: group is named array { - let arrow_struct = - DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); + let fields = vec![Field::new("str", DataType::Utf8, false)].into(); + let arrow_struct = DataType::Struct(fields); arrow_fields.push(Field::new( "my_list", DataType::List(Box::new(Field::new("array", arrow_struct, false))), @@ -822,8 +822,8 @@ mod tests { // } // Special case: group named ends in _tuple { - let arrow_struct = - DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); + let fields = vec![Field::new("str", DataType::Utf8, false)].into(); + let arrow_struct = DataType::Struct(fields); arrow_fields.push(Field::new( "my_list", DataType::List(Box::new(Field::new( @@ -854,7 +854,7 @@ mod tests { assert_eq!(arrow_fields.len(), converted_fields.len()); for i in 0..arrow_fields.len() { - assert_eq!(arrow_fields[i], converted_fields[i], "{i}"); + assert_eq!(&arrow_fields[i], converted_fields[i].as_ref(), "{i}"); } } @@ -933,7 +933,7 @@ mod tests { assert_eq!(arrow_fields.len(), converted_fields.len()); for i in 0..arrow_fields.len() { - assert_eq!(arrow_fields[i], converted_fields[i]); + assert_eq!(&arrow_fields[i], converted_fields[i].as_ref()); } } @@ -978,10 +978,10 @@ mod tests { DataType::Map( Box::new(Field::new( "key_value", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", DataType::Int32, true), - ]), + ])), false, )), false, @@ -1003,10 +1003,10 @@ mod tests { DataType::Map( Box::new(Field::new( "map", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("str", DataType::Utf8, false), Field::new("num", DataType::Int32, false), - ]), + ])), false, // (#1697) )), false, @@ -1028,10 +1028,10 @@ mod tests { DataType::Map( Box::new(Field::new( "map", - DataType::Struct(vec![ + 
DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", DataType::Int32, true), - ]), + ])), false, // (#1697) )), false, @@ -1049,7 +1049,7 @@ mod tests { assert_eq!(arrow_fields.len(), converted_fields.len()); for i in 0..arrow_fields.len() { - assert_eq!(arrow_fields[i], converted_fields[i]); + assert_eq!(&arrow_fields[i], converted_fields[i].as_ref()); } } @@ -1057,10 +1057,10 @@ mod tests { fn test_nested_schema() { let mut arrow_fields = Vec::new(); { - let group1_fields = vec![ + let group1_fields = Fields::from(vec![ Field::new("leaf1", DataType::Boolean, false), Field::new("leaf2", DataType::Int32, false), - ]; + ]); let group1_struct = Field::new("group1", DataType::Struct(group1_fields), false); arrow_fields.push(group1_struct); @@ -1087,7 +1087,7 @@ mod tests { assert_eq!(arrow_fields.len(), converted_fields.len()); for i in 0..arrow_fields.len() { - assert_eq!(arrow_fields[i], converted_fields[i]); + assert_eq!(&arrow_fields[i], converted_fields[i].as_ref()); } } @@ -1095,11 +1095,11 @@ mod tests { fn test_nested_schema_partial() { let mut arrow_fields = Vec::new(); { - let group1_fields = vec![Field::new("leaf1", DataType::Int64, false)]; + let group1_fields = vec![Field::new("leaf1", DataType::Int64, false)].into(); let group1 = Field::new("group1", DataType::Struct(group1_fields), false); arrow_fields.push(group1); - let group2_fields = vec![Field::new("leaf4", DataType::Int64, false)]; + let group2_fields = vec![Field::new("leaf4", DataType::Int64, false)].into(); let group2 = Field::new("group2", DataType::Struct(group2_fields), false); arrow_fields.push(group2); @@ -1138,7 +1138,7 @@ mod tests { assert_eq!(arrow_fields.len(), converted_fields.len()); for i in 0..arrow_fields.len() { - assert_eq!(arrow_fields[i], converted_fields[i]); + assert_eq!(&arrow_fields[i], converted_fields[i].as_ref()); } } @@ -1146,11 +1146,11 @@ mod tests { fn test_nested_schema_partial_ordering() { let mut arrow_fields = Vec::new(); { - let group1_fields = vec![Field::new("leaf1", DataType::Int64, false)]; + let group1_fields = vec![Field::new("leaf1", DataType::Int64, false)].into(); let group1 = Field::new("group1", DataType::Struct(group1_fields), false); arrow_fields.push(group1); - let group2_fields = vec![Field::new("leaf4", DataType::Int64, false)]; + let group2_fields = vec![Field::new("leaf4", DataType::Int64, false)].into(); let group2 = Field::new("group2", DataType::Struct(group2_fields), false); arrow_fields.push(group2); @@ -1189,7 +1189,7 @@ mod tests { assert_eq!(arrow_fields.len(), converted_fields.len()); for i in 0..arrow_fields.len() { - assert_eq!(arrow_fields[i], converted_fields[i]); + assert_eq!(&arrow_fields[i], converted_fields[i].as_ref()); } } @@ -1203,7 +1203,9 @@ mod tests { "innerGroup", DataType::List(Box::new(Field::new( "innerGroup", - DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]), + DataType::Struct( + vec![Field::new("leaf3", DataType::Int32, true)].into(), + ), false, ))), false, @@ -1213,10 +1215,10 @@ mod tests { "outerGroup", DataType::List(Box::new(Field::new( "outerGroup", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("leaf2", DataType::Int32, true), inner_group_list, - ]), + ])), false, ))), false, @@ -1244,7 +1246,7 @@ mod tests { assert_eq!(arrow_fields.len(), converted_fields.len()); for i in 0..arrow_fields.len() { - assert_eq!(arrow_fields[i], converted_fields[i]); + assert_eq!(&arrow_fields[i], converted_fields[i].as_ref()); } } @@ -1463,7 +1465,7 @@ mod 
tests { ), Field::new( "struct", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("bools", DataType::Boolean, false), Field::new("uint32", DataType::UInt32, false), Field::new( @@ -1475,7 +1477,7 @@ mod tests { ))), false, ), - ]), + ])), false, ), Field::new( @@ -1519,7 +1521,11 @@ mod tests { #[test] #[should_panic(expected = "Parquet does not support writing empty structs")] fn test_empty_struct_field() { - let arrow_fields = vec![Field::new("struct", DataType::Struct(vec![]), false)]; + let arrow_fields = vec![Field::new( + "struct", + DataType::Struct(Fields::empty()), + false, + )]; let arrow_schema = Schema::new(arrow_fields); let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema); @@ -1616,10 +1622,10 @@ mod tests { // ), Field::new( "c24", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("a", DataType::Utf8, false), Field::new("b", DataType::UInt16, false), - ]), + ])), false, ), Field::new("c25", DataType::Interval(IntervalUnit::YearMonth), true), @@ -1659,7 +1665,7 @@ mod tests { DataType::Map( Box::new(Field::new( "key_value", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), Field::new( "value", @@ -1670,7 +1676,7 @@ mod tests { ))), true, ), - ]), + ])), false, // #1697 )), false, // fails to roundtrip keys_sorted @@ -1682,7 +1688,7 @@ mod tests { DataType::Map( Box::new(Field::new( "my_entries", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("my_key", DataType::Utf8, false), Field::new( "my_value", @@ -1693,7 +1699,7 @@ mod tests { ))), true, ), - ]), + ])), false, // #1697 )), false, // fails to roundtrip keys_sorted @@ -1705,7 +1711,7 @@ mod tests { DataType::Map( Box::new(Field::new( "my_entries", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("my_key", DataType::Utf8, false), Field::new( "my_value", @@ -1716,7 +1722,7 @@ mod tests { ))), true, ), - ]), + ])), false, )), false, // fails to roundtrip keys_sorted @@ -1777,10 +1783,10 @@ mod tests { "items", DataType::LargeList(Box::new(Field::new( "items", - DataType::Struct(vec![ + DataType::Struct(Fields::from(vec![ Field::new("a", DataType::Int16, true), Field::new("b", DataType::Float64, false), - ]), + ])), true, ))), true, From e5a1676950ab5c04b0a74953ec5418da67cedb45 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 14:58:21 +0100 Subject: [PATCH 0757/1411] Add UnionFields (#3955) (#3981) * Add UnionFields (#3955) * Fix array_cast * Review feedback * Clippy --- arrow-array/src/array/mod.rs | 16 ++-- arrow-array/src/array/union_array.rs | 53 ++++++----- arrow-array/src/record_batch.rs | 2 +- arrow-cast/src/display.rs | 14 +-- arrow-cast/src/pretty.rs | 42 +++++---- arrow-data/src/data/mod.rs | 25 +++--- arrow-data/src/equal/mod.rs | 2 +- arrow-data/src/equal/union.rs | 26 +++--- arrow-data/src/equal/utils.rs | 2 +- arrow-data/src/transform/mod.rs | 6 +- arrow-integration-test/src/datatype.rs | 22 ++--- arrow-integration-test/src/field.rs | 29 +++--- arrow-integration-test/src/lib.rs | 9 +- arrow-ipc/src/convert.rs | 99 +++++++++++---------- arrow-ipc/src/reader.rs | 31 ++++--- arrow-ipc/src/writer.rs | 20 +++-- arrow-schema/src/datatype.rs | 26 +++--- arrow-schema/src/ffi.rs | 19 ++-- arrow-schema/src/field.rs | 37 ++------ arrow-schema/src/fields.rs | 117 ++++++++++++++++++++++++- arrow-schema/src/schema.rs | 44 ++++++---- arrow/src/datatypes/mod.rs | 2 +- arrow/src/ffi.rs | 8 +- 
arrow/tests/array_cast.rs | 14 +-- arrow/tests/array_validation.rs | 50 ++++++----- parquet/src/arrow/arrow_writer/mod.rs | 2 +- parquet/src/arrow/schema/mod.rs | 2 +- 27 files changed, 430 insertions(+), 289 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 8d20c6cb2ad4..9a5172d0deec 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -586,7 +586,7 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::LargeList(_) => Arc::new(LargeListArray::from(data)) as ArrayRef, DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef, DataType::Map(_, _) => Arc::new(MapArray::from(data)) as ArrayRef, - DataType::Union(_, _, _) => Arc::new(UnionArray::from(data)) as ArrayRef, + DataType::Union(_, _) => Arc::new(UnionArray::from(data)) as ArrayRef, DataType::FixedSizeList(_, _) => { Arc::new(FixedSizeListArray::from(data)) as ArrayRef } @@ -740,7 +740,7 @@ mod tests { use crate::cast::{as_union_array, downcast_array}; use crate::downcast_run_array; use arrow_buffer::{Buffer, MutableBuffer}; - use arrow_schema::{Field, Fields, UnionMode}; + use arrow_schema::{Field, Fields, UnionFields, UnionMode}; #[test] fn test_empty_primitive() { @@ -874,11 +874,13 @@ mod tests { fn test_null_union() { for mode in [UnionMode::Sparse, UnionMode::Dense] { let data_type = DataType::Union( - vec![ - Field::new("foo", DataType::Int32, true), - Field::new("bar", DataType::Int64, true), - ], - vec![2, 1], + UnionFields::new( + vec![2, 1], + vec![ + Field::new("foo", DataType::Int32, true), + Field::new("bar", DataType::Int64, true), + ], + ), mode, ); let array = new_null_array(&data_type, 4); diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 00ad94111a4d..335b6b14f8a3 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -19,7 +19,7 @@ use crate::{make_array, Array, ArrayRef}; use arrow_buffer::buffer::NullBuffer; use arrow_buffer::{Buffer, ScalarBuffer}; use arrow_data::ArrayData; -use arrow_schema::{ArrowError, DataType, Field, UnionMode}; +use arrow_schema::{ArrowError, DataType, Field, UnionFields, UnionMode}; /// Contains the `UnionArray` type. /// use std::any::Any; @@ -145,8 +145,7 @@ impl UnionArray { value_offsets: Option, child_arrays: Vec<(Field, ArrayRef)>, ) -> Self { - let (field_types, field_values): (Vec<_>, Vec<_>) = - child_arrays.into_iter().unzip(); + let (fields, field_values): (Vec<_>, Vec<_>) = child_arrays.into_iter().unzip(); let len = type_ids.len(); let mode = if value_offsets.is_some() { @@ -156,8 +155,7 @@ impl UnionArray { }; let builder = ArrayData::builder(DataType::Union( - field_types, - Vec::from(field_type_ids), + UnionFields::new(field_type_ids.iter().copied(), fields), mode, )) .add_buffer(type_ids) @@ -282,9 +280,9 @@ impl UnionArray { /// Returns the names of the types in the union. pub fn type_names(&self) -> Vec<&str> { match self.data.data_type() { - DataType::Union(fields, _, _) => fields + DataType::Union(fields, _) => fields .iter() - .map(|f| f.name().as_str()) + .map(|(_, f)| f.name().as_str()) .collect::>(), _ => unreachable!("Union array's data type is not a union!"), } @@ -293,7 +291,7 @@ impl UnionArray { /// Returns whether the `UnionArray` is dense (or sparse if `false`). 
fn is_dense(&self) -> bool { match self.data.data_type() { - DataType::Union(_, _, mode) => mode == &UnionMode::Dense, + DataType::Union(_, mode) => mode == &UnionMode::Dense, _ => unreachable!("Union array's data type is not a union!"), } } @@ -307,8 +305,8 @@ impl UnionArray { impl From for UnionArray { fn from(data: ArrayData) -> Self { - let (field_ids, mode) = match data.data_type() { - DataType::Union(_, ids, mode) => (ids, *mode), + let (fields, mode) = match data.data_type() { + DataType::Union(fields, mode) => (fields, *mode), d => panic!("UnionArray expected ArrayData with type Union got {d}"), }; let (type_ids, offsets) = match mode { @@ -326,10 +324,10 @@ impl From for UnionArray { ), }; - let max_id = field_ids.iter().copied().max().unwrap_or_default() as usize; + let max_id = fields.iter().map(|(i, _)| i).max().unwrap_or_default() as usize; let mut boxed_fields = vec![None; max_id + 1]; - for (cd, field_id) in data.child_data().iter().zip(field_ids) { - boxed_fields[*field_id as usize] = Some(make_array(cd.clone())); + for (cd, (field_id, _)) in data.child_data().iter().zip(fields.iter()) { + boxed_fields[field_id as usize] = Some(make_array(cd.clone())); } Self { data, @@ -402,19 +400,18 @@ impl std::fmt::Debug for UnionArray { writeln!(f, "-- type id buffer:")?; writeln!(f, "{:?}", self.type_ids)?; - let (fields, ids) = match self.data_type() { - DataType::Union(f, ids, _) => (f, ids), - _ => unreachable!(), - }; - if let Some(offsets) = &self.offsets { writeln!(f, "-- offsets buffer:")?; writeln!(f, "{:?}", offsets)?; } - assert_eq!(fields.len(), ids.len()); - for (field, type_id) in fields.iter().zip(ids) { - let child = self.child(*type_id); + let fields = match self.data_type() { + DataType::Union(fields, _) => fields, + _ => unreachable!(), + }; + + for (type_id, field) in fields.iter() { + let child = self.child(type_id); writeln!( f, "-- child {}: \"{}\" ({:?})", @@ -1058,12 +1055,14 @@ mod tests { #[test] fn test_custom_type_ids() { let data_type = DataType::Union( - vec![ - Field::new("strings", DataType::Utf8, false), - Field::new("integers", DataType::Int32, false), - Field::new("floats", DataType::Float64, false), - ], - vec![8, 4, 9], + UnionFields::new( + vec![8, 4, 9], + vec![ + Field::new("strings", DataType::Utf8, false), + Field::new("integers", DataType::Int32, false), + Field::new("floats", DataType::Float64, false), + ], + ), UnionMode::Dense, ); diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 2754d04bfcaa..17b1f04e80af 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -590,7 +590,7 @@ mod tests { let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) .unwrap(); - assert_eq!(record_batch.get_array_memory_size(), 628); + assert_eq!(record_batch.get_array_memory_size(), 564); } fn check_batch(record_batch: RecordBatch, num_rows: usize) { diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index d1090369767f..0bca9ce657b8 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -278,7 +278,7 @@ fn make_formatter<'a>( } DataType::Struct(_) => array_format(as_struct_array(array), options), DataType::Map(_, _) => array_format(as_map_array(array), options), - DataType::Union(_, _, _) => array_format(as_union_array(array), options), + DataType::Union(_, _) => array_format(as_union_array(array), options), d => Err(ArrowError::NotYetImplemented(format!("formatting {d} is not yet supported"))), } } @@ -801,16 +801,16 @@ 
impl<'a> DisplayIndexState<'a> for &'a UnionArray { ); fn prepare(&self, options: &FormatOptions<'a>) -> Result { - let (fields, type_ids, mode) = match (*self).data_type() { - DataType::Union(fields, type_ids, mode) => (fields, type_ids, mode), + let (fields, mode) = match (*self).data_type() { + DataType::Union(fields, mode) => (fields, mode), _ => unreachable!(), }; - let max_id = type_ids.iter().copied().max().unwrap_or_default() as usize; + let max_id = fields.iter().map(|(id, _)| id).max().unwrap_or_default() as usize; let mut out: Vec> = (0..max_id + 1).map(|_| None).collect(); - for (i, field) in type_ids.iter().zip(fields) { - let formatter = make_formatter(self.child(*i).as_ref(), options)?; - out[*i as usize] = Some((field.name().as_str(), formatter)) + for (i, field) in fields.iter() { + let formatter = make_formatter(self.child(i).as_ref(), options)?; + out[i as usize] = Some((field.name().as_str(), formatter)) } Ok((out, *mode)) } diff --git a/arrow-cast/src/pretty.rs b/arrow-cast/src/pretty.rs index ffa5af82d154..818e9d3c0770 100644 --- a/arrow-cast/src/pretty.rs +++ b/arrow-cast/src/pretty.rs @@ -703,11 +703,13 @@ mod tests { let schema = Schema::new(vec![Field::new( "Teamsters", DataType::Union( - vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Float64, false), - ], - vec![0, 1], + UnionFields::new( + vec![0, 1], + vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Float64, false), + ], + ), UnionMode::Dense, ), false, @@ -743,11 +745,13 @@ mod tests { let schema = Schema::new(vec![Field::new( "Teamsters", DataType::Union( - vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Float64, false), - ], - vec![0, 1], + UnionFields::new( + vec![0, 1], + vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Float64, false), + ], + ), UnionMode::Sparse, ), false, @@ -785,11 +789,13 @@ mod tests { let inner_field = Field::new( "European Union", DataType::Union( - vec![ - Field::new("b", DataType::Int32, false), - Field::new("c", DataType::Float64, false), - ], - vec![0, 1], + UnionFields::new( + vec![0, 1], + vec![ + Field::new("b", DataType::Int32, false), + Field::new("c", DataType::Float64, false), + ], + ), UnionMode::Dense, ), false, @@ -809,8 +815,10 @@ mod tests { let schema = Schema::new(vec![Field::new( "Teamsters", DataType::Union( - vec![Field::new("a", DataType::Int32, true), inner_field], - vec![0, 1], + UnionFields::new( + vec![0, 1], + vec![Field::new("a", DataType::Int32, true), inner_field], + ), UnionMode::Sparse, ), false, diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index c47c836637a4..581d4a10cc1c 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -136,7 +136,7 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff MutableBuffer::new(capacity * mem::size_of::()), empty_buffer, ], - DataType::Union(_, _, mode) => { + DataType::Union(_, mode) => { let type_ids = MutableBuffer::new(capacity * mem::size_of::()); match mode { UnionMode::Sparse => [type_ids, empty_buffer], @@ -162,7 +162,7 @@ pub(crate) fn into_buffers( | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => vec![buffer1.into(), buffer2.into()], - DataType::Union(_, _, mode) => { + DataType::Union(_, mode) => { match mode { // Based on Union's DataTypeLayout UnionMode::Sparse => vec![buffer1.into()], @@ -621,8 +621,9 @@ impl ArrayData { vec![ArrayData::new_empty(v.as_ref())], true, ), - DataType::Union(f, i, 
mode) => { - let ids = Buffer::from_iter(std::iter::repeat(i[0]).take(len)); + DataType::Union(f, mode) => { + let (id, _) = f.iter().next().unwrap(); + let ids = Buffer::from_iter(std::iter::repeat(id).take(len)); let buffers = match mode { UnionMode::Sparse => vec![ids], UnionMode::Dense => { @@ -634,7 +635,7 @@ impl ArrayData { let children = f .iter() .enumerate() - .map(|(idx, f)| match idx { + .map(|(idx, (_, f))| match idx { 0 => Self::new_null(f.data_type(), len), _ => Self::new_empty(f.data_type()), }) @@ -986,10 +987,10 @@ impl ArrayData { } Ok(()) } - DataType::Union(fields, _, mode) => { + DataType::Union(fields, mode) => { self.validate_num_child_data(fields.len())?; - for (i, field) in fields.iter().enumerate() { + for (i, (_, field)) in fields.iter().enumerate() { let field_data = self.get_valid_child_data(i, field.data_type())?; if mode == &UnionMode::Sparse @@ -1255,7 +1256,7 @@ impl ArrayData { let child = &self.child_data[0]; self.validate_offsets_full::(child.len) } - DataType::Union(_, _, _) => { + DataType::Union(_, _) => { // Validate Union Array as part of implementing new Union semantics // See comments in `ArrayData::validate()` // https://github.com/apache/arrow-rs/issues/85 @@ -1568,7 +1569,7 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::LargeList(_) => DataTypeLayout::new_fixed_width(size_of::()), DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data, DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data, - DataType::Union(_, _, mode) => { + DataType::Union(_, mode) => { let type_ids = BufferSpec::FixedWidth { byte_width: size_of::(), }; @@ -1823,7 +1824,7 @@ impl From for ArrayDataBuilder { #[cfg(test)] mod tests { use super::*; - use arrow_schema::Field; + use arrow_schema::{Field, UnionFields}; // See arrow/tests/array_data_validation.rs for test of array validation @@ -2072,8 +2073,8 @@ mod tests { #[test] fn test_into_buffers() { let data_types = vec![ - DataType::Union(vec![], vec![], UnionMode::Dense), - DataType::Union(vec![], vec![], UnionMode::Sparse), + DataType::Union(UnionFields::empty(), UnionMode::Dense), + DataType::Union(UnionFields::empty(), UnionMode::Sparse), ]; for data_type in data_types { diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index 871a312ca47f..fbc868d3f5c4 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -112,7 +112,7 @@ fn equal_values( fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len) } DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len), - DataType::Union(_, _, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len), + DataType::Union(_, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Dictionary(data_type, _) => match data_type.as_ref() { DataType::Int8 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Int16 => { diff --git a/arrow-data/src/equal/union.rs b/arrow-data/src/equal/union.rs index fdf770096867..4f04bc287aa8 100644 --- a/arrow-data/src/equal/union.rs +++ b/arrow-data/src/equal/union.rs @@ -16,7 +16,7 @@ // under the License. 
use crate::data::ArrayData; -use arrow_schema::{DataType, UnionMode}; +use arrow_schema::{DataType, UnionFields, UnionMode}; use super::equal_range; @@ -28,8 +28,8 @@ fn equal_dense( rhs_type_ids: &[i8], lhs_offsets: &[i32], rhs_offsets: &[i32], - lhs_field_type_ids: &[i8], - rhs_field_type_ids: &[i8], + lhs_fields: &UnionFields, + rhs_fields: &UnionFields, ) -> bool { let offsets = lhs_offsets.iter().zip(rhs_offsets.iter()); @@ -38,13 +38,13 @@ fn equal_dense( .zip(rhs_type_ids.iter()) .zip(offsets) .all(|((l_type_id, r_type_id), (l_offset, r_offset))| { - let lhs_child_index = lhs_field_type_ids + let lhs_child_index = lhs_fields .iter() - .position(|r| r == l_type_id) + .position(|(r, _)| r == *l_type_id) .unwrap(); - let rhs_child_index = rhs_field_type_ids + let rhs_child_index = rhs_fields .iter() - .position(|r| r == r_type_id) + .position(|(r, _)| r == *r_type_id) .unwrap(); let lhs_values = &lhs.child_data()[lhs_child_index]; let rhs_values = &rhs.child_data()[rhs_child_index]; @@ -89,8 +89,8 @@ pub(super) fn union_equal( match (lhs.data_type(), rhs.data_type()) { ( - DataType::Union(_, lhs_type_ids, UnionMode::Dense), - DataType::Union(_, rhs_type_ids, UnionMode::Dense), + DataType::Union(lhs_fields, UnionMode::Dense), + DataType::Union(rhs_fields, UnionMode::Dense), ) => { let lhs_offsets = lhs.buffer::(1); let rhs_offsets = rhs.buffer::(1); @@ -106,13 +106,13 @@ pub(super) fn union_equal( rhs_type_id_range, lhs_offsets_range, rhs_offsets_range, - lhs_type_ids, - rhs_type_ids, + lhs_fields, + rhs_fields, ) } ( - DataType::Union(_, _, UnionMode::Sparse), - DataType::Union(_, _, UnionMode::Sparse), + DataType::Union(_, UnionMode::Sparse), + DataType::Union(_, UnionMode::Sparse), ) => { lhs_type_id_range == rhs_type_id_range && equal_sparse(lhs, rhs, lhs_start, rhs_start, len) diff --git a/arrow-data/src/equal/utils.rs b/arrow-data/src/equal/utils.rs index 6b9a7940dc96..fa6211542550 100644 --- a/arrow-data/src/equal/utils.rs +++ b/arrow-data/src/equal/utils.rs @@ -59,7 +59,7 @@ pub(super) fn equal_nulls( #[inline] pub(super) fn base_equal(lhs: &ArrayData, rhs: &ArrayData) -> bool { let equal_type = match (lhs.data_type(), rhs.data_type()) { - (DataType::Union(l_fields, _, l_mode), DataType::Union(r_fields, _, r_mode)) => { + (DataType::Union(l_fields, l_mode), DataType::Union(r_fields, r_mode)) => { l_fields == r_fields && l_mode == r_mode } (DataType::Map(l_field, l_sorted), DataType::Map(r_field, r_sorted)) => { diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index 2719b96b6914..ccdbaec3b5ea 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -231,7 +231,7 @@ fn build_extend(array: &ArrayData) -> Extend { DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array), DataType::Float16 => primitive::build_extend::(array), DataType::FixedSizeList(_, _) => fixed_size_list::build_extend(array), - DataType::Union(_, _, mode) => match mode { + DataType::Union(_, mode) => match mode { UnionMode::Sparse => union::build_extend_sparse(array), UnionMode::Dense => union::build_extend_dense(array), }, @@ -283,7 +283,7 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { DataType::FixedSizeBinary(_) => fixed_binary::extend_nulls, DataType::Float16 => primitive::extend_nulls::, DataType::FixedSizeList(_, _) => fixed_size_list::extend_nulls, - DataType::Union(_, _, mode) => match mode { + DataType::Union(_, mode) => match mode { UnionMode::Sparse => union::extend_nulls_sparse, UnionMode::Dense => 
union::extend_nulls_dense, }, @@ -501,7 +501,7 @@ impl<'a> MutableArrayData<'a> { .collect::>(); vec![MutableArrayData::new(childs, use_nulls, array_capacity)] } - DataType::Union(fields, _, _) => (0..fields.len()) + DataType::Union(fields, _) => (0..fields.len()) .map(|i| { let child_arrays = arrays .iter() diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index a08368d582a4..5a5dd67fc7a1 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -17,6 +17,7 @@ use arrow::datatypes::{DataType, Field, Fields, IntervalUnit, TimeUnit, UnionMode}; use arrow::error::{ArrowError, Result}; +use std::sync::Arc; /// Parse a data type from a JSON representation. pub fn data_type_from_json(json: &serde_json::Value) -> Result { @@ -229,20 +230,15 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { "Unknown union mode {mode:?} for union" ))); }; - if let Some(type_ids) = map.get("typeIds") { - let type_ids = type_ids - .as_array() - .unwrap() + if let Some(values) = map.get("typeIds") { + let field = Arc::new(default_field); + let values = values.as_array().unwrap(); + let fields = values .iter() - .map(|t| t.as_i64().unwrap() as i8) - .collect::>(); + .map(|t| (t.as_i64().unwrap() as i8, field.clone())) + .collect(); - let default_fields = type_ids - .iter() - .map(|_| default_field.clone()) - .collect::>(); - - Ok(DataType::Union(default_fields, type_ids, union_mode)) + Ok(DataType::Union(fields, union_mode)) } else { Err(ArrowError::ParseError( "Expecting a typeIds for union ".to_string(), @@ -290,7 +286,7 @@ pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value { json!({"name": "fixedsizebinary", "byteWidth": byte_width}) } DataType::Struct(_) => json!({"name": "struct"}), - DataType::Union(_, _, _) => json!({"name": "union"}), + DataType::Union(_, _) => json!({"name": "union"}), DataType::List(_) => json!({ "name": "list"}), DataType::LargeList(_) => json!({ "name": "largelist"}), DataType::FixedSizeList(_, length) => { diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index a60cd91c5b37..c714fe4671d6 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -19,6 +19,7 @@ use crate::{data_type_from_json, data_type_to_json}; use arrow::datatypes::{DataType, Field}; use arrow::error::{ArrowError, Result}; use std::collections::HashMap; +use std::sync::Arc; /// Parse a `Field` definition from a JSON representation. 
pub fn field_from_json(json: &serde_json::Value) -> Result { @@ -194,11 +195,17 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { } } } - DataType::Union(_, type_ids, mode) => match map.get("children") { + DataType::Union(fields, mode) => match map.get("children") { Some(Value::Array(values)) => { - let union_fields: Vec = - values.iter().map(field_from_json).collect::>()?; - DataType::Union(union_fields, type_ids, mode) + let fields = fields + .iter() + .zip(values) + .map(|((id, _), value)| { + Ok((id, Arc::new(field_from_json(value)?))) + }) + .collect::>()?; + + DataType::Union(fields, mode) } Some(_) => { return Err(ArrowError::ParseError( @@ -296,7 +303,7 @@ pub fn field_to_json(field: &Field) -> serde_json::Value { #[cfg(test)] mod tests { use super::*; - use arrow::datatypes::{Fields, UnionMode}; + use arrow::datatypes::{Fields, UnionFields, UnionMode}; use serde_json::Value; #[test] @@ -569,11 +576,13 @@ mod tests { let expected = Field::new( "my_union", DataType::Union( - vec![ - Field::new("f1", DataType::Int32, true), - Field::new("f2", DataType::Utf8, true), - ], - vec![5, 7], + UnionFields::new( + vec![5, 7], + vec![ + Field::new("f1", DataType::Int32, true), + Field::new("f2", DataType::Utf8, true), + ], + ), UnionMode::Sparse, ), false, diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index 06f16ca1dc9b..61bcbea5a707 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -858,7 +858,7 @@ pub fn array_from_json( let array = MapArray::from(array_data); Ok(Arc::new(array)) } - DataType::Union(fields, field_type_ids, _) => { + DataType::Union(fields, _) => { let type_ids = if let Some(type_id) = json_col.type_id { type_id } else { @@ -874,13 +874,14 @@ pub fn array_from_json( }); let mut children: Vec<(Field, Arc)> = vec![]; - for (field, col) in fields.iter().zip(json_col.children.unwrap()) { + for ((_, field), col) in fields.iter().zip(json_col.children.unwrap()) { let array = array_from_json(field, col, dictionaries)?; - children.push((field.clone(), array)); + children.push((field.as_ref().clone(), array)); } + let field_type_ids = fields.iter().map(|(id, _)| id).collect::>(); let array = UnionArray::try_new( - field_type_ids, + &field_type_ids, Buffer::from(&type_ids.to_byte_slice()), offset, children, diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 7e44f37d46d2..8ca0d514f462 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -410,16 +410,16 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat let mut fields = vec![]; if let Some(children) = field.children() { for i in 0..children.len() { - fields.push(children.get(i).into()); + fields.push(Field::from(children.get(i))); } }; - let type_ids: Vec = match union.typeIds() { - None => (0_i8..fields.len() as i8).collect(), - Some(ids) => ids.iter().map(|i| i as i8).collect(), + let fields = match union.typeIds() { + None => UnionFields::new(0_i8..fields.len() as i8, fields), + Some(ids) => UnionFields::new(ids.iter().map(|i| i as i8), fields), }; - DataType::Union(fields, type_ids, union_mode) + DataType::Union(fields, union_mode) } t => unimplemented!("Type {:?} not supported", t), } @@ -769,9 +769,9 @@ pub(crate) fn get_fb_field_type<'a>( children: Some(fbb.create_vector(&empty_fields[..])), } } - Union(fields, type_ids, mode) => { + Union(fields, mode) => { let mut children = vec![]; - for field in fields { + for (_, field) in fields.iter() { 
children.push(build_field(fbb, field)); } @@ -781,7 +781,7 @@ pub(crate) fn get_fb_field_type<'a>( }; let fbb_type_ids = fbb - .create_vector(&type_ids.iter().map(|t| *t as i32).collect::>()); + .create_vector(&fields.iter().map(|(t, _)| t as i32).collect::>()); let mut builder = crate::UnionBuilder::new(fbb); builder.add_mode(union_mode); builder.add_typeIds(fbb_type_ids); @@ -962,38 +962,47 @@ mod tests { Field::new( "union]>]>", DataType::Union( - vec![ - Field::new("int64", DataType::Int64, true), - Field::new( - "list[union]>]", - DataType::List(Box::new(Field::new( - "union]>", - DataType::Union( - vec![ - Field::new("date32", DataType::Date32, true), - Field::new( - "list[union<>]", - DataType::List(Box::new(Field::new( - "union", - DataType::Union( - vec![], - vec![], - UnionMode::Sparse, + UnionFields::new( + vec![0, 1], + vec![ + Field::new("int64", DataType::Int64, true), + Field::new( + "list[union]>]", + DataType::List(Box::new(Field::new( + "union]>", + DataType::Union( + UnionFields::new( + vec![0, 1], + vec![ + Field::new( + "date32", + DataType::Date32, + true, + ), + Field::new( + "list[union<>]", + DataType::List(Box::new( + Field::new( + "union", + DataType::Union( + UnionFields::empty(), + UnionMode::Sparse, + ), + false, + ), + )), + false, ), - false, - ))), - false, + ], ), - ], - vec![0, 1], - UnionMode::Dense, - ), + UnionMode::Dense, + ), + false, + ))), false, - ))), - false, - ), - ], - vec![0, 1], + ), + ], + ), UnionMode::Sparse, ), false, @@ -1001,22 +1010,24 @@ mod tests { Field::new("struct<>", DataType::Struct(Fields::empty()), true), Field::new( "union<>", - DataType::Union(vec![], vec![], UnionMode::Dense), + DataType::Union(UnionFields::empty(), UnionMode::Dense), true, ), Field::new( "union<>", - DataType::Union(vec![], vec![], UnionMode::Sparse), + DataType::Union(UnionFields::empty(), UnionMode::Sparse), true, ), Field::new( "union", DataType::Union( - vec![ - Field::new("int32", DataType::Int32, true), - Field::new("utf8", DataType::Utf8, true), - ], - vec![2, 3], // non-default type ids + UnionFields::new( + vec![2, 3], // non-default type ids + vec![ + Field::new("int32", DataType::Int32, true), + Field::new("utf8", DataType::Utf8, true), + ], + ), UnionMode::Dense, ), true, diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 4597ed82d27f..4f2e51336e34 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -263,7 +263,7 @@ fn create_array( value_array.clone(), )? 
} - Union(fields, field_type_ids, mode) => { + Union(fields, mode) => { let union_node = nodes.get(node_index); node_index += 1; @@ -292,9 +292,10 @@ fn create_array( UnionMode::Sparse => None, }; - let mut children = vec![]; + let mut children = Vec::with_capacity(fields.len()); + let mut ids = Vec::with_capacity(fields.len()); - for field in fields { + for (id, field) in fields.iter() { let triple = create_array( nodes, field, @@ -310,11 +311,11 @@ fn create_array( node_index = triple.1; buffer_index = triple.2; - children.push((field.clone(), triple.0)); + children.push((field.as_ref().clone(), triple.0)); + ids.push(id); } - let array = - UnionArray::try_new(field_type_ids, type_ids, value_offsets, children)?; + let array = UnionArray::try_new(&ids, type_ids, value_offsets, children)?; Arc::new(array) } Null => { @@ -418,7 +419,7 @@ fn skip_field( node_index += 1; buffer_index += 2; } - Union(fields, _field_type_ids, mode) => { + Union(fields, mode) => { node_index += 1; buffer_index += 1; @@ -429,7 +430,7 @@ fn skip_field( UnionMode::Sparse => {} }; - for field in fields { + for (_, field) in fields.iter() { let tuple = skip_field(field.data_type(), node_index, buffer_index)?; node_index = tuple.0; @@ -1265,11 +1266,15 @@ mod tests { let dict_data_type = DataType::Dictionary(Box::new(key_type), Box::new(value_type)); - let union_fileds = vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Float64, false), - ]; - let union_data_type = DataType::Union(union_fileds, vec![0, 1], UnionMode::Dense); + let union_fields = UnionFields::new( + vec![0, 1], + vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Float64, false), + ], + ); + + let union_data_type = DataType::Union(union_fields, UnionMode::Dense); let struct_fields = Fields::from(vec![ Field::new("id", DataType::Int32, false), diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index ceb9b6ffa90f..0e999dc72756 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -298,10 +298,10 @@ impl IpcDataGenerator { write_options, )?; } - DataType::Union(fields, type_ids, _) => { + DataType::Union(fields, _) => { let union = as_union_array(column); - for (field, type_id) in fields.iter().zip(type_ids) { - let column = union.child(*type_id); + for (type_id, field) in fields.iter() { + let column = union.child(type_id); self.encode_dictionaries( field, column, @@ -1069,7 +1069,7 @@ fn has_validity_bitmap(data_type: &DataType, write_options: &IpcWriteOptions) -> } else { !matches!( data_type, - DataType::Null | DataType::Union(_, _, _) | DataType::RunEndEncoded(_, _) + DataType::Null | DataType::Union(_, _) | DataType::RunEndEncoded(_, _) ) } } @@ -1781,11 +1781,13 @@ mod tests { let schema = Schema::new(vec![Field::new( "union", DataType::Union( - vec![ - Field::new("a", DataType::Int32, false), - Field::new("c", DataType::Float64, false), - ], - vec![0, 1], + UnionFields::new( + vec![0, 1], + vec![ + Field::new("a", DataType::Int32, false), + Field::new("c", DataType::Float64, false), + ], + ), UnionMode::Sparse, ), true, diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 58747fb26a0e..57a5c68386fc 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -19,7 +19,7 @@ use std::fmt; use std::sync::Arc; use crate::field::Field; -use crate::Fields; +use crate::{Fields, UnionFields}; /// The set of datatypes that are supported by this implementation of Apache Arrow. 
/// @@ -194,10 +194,9 @@ pub enum DataType { Struct(Fields), /// A nested datatype that can represent slots of differing types. Components: /// - /// 1. [`Field`] for each possible child type the Union can hold - /// 2. The corresponding `type_id` used to identify which Field - /// 3. The type of union (Sparse or Dense) - Union(Vec, Vec, UnionMode), + /// 1. [`UnionFields`] + /// 2. The type of union (Sparse or Dense) + Union(UnionFields, UnionMode), /// A dictionary encoded array (`key_type`, `value_type`), where /// each array element is an index of `key_type` into an /// associated dictionary of `value_type`. @@ -384,7 +383,7 @@ impl DataType { | FixedSizeList(_, _) | LargeList(_) | Struct(_) - | Union(_, _, _) + | Union(_, _) | Map(_, _) => true, _ => false, } @@ -446,7 +445,7 @@ impl DataType { DataType::List(_) | DataType::LargeList(_) | DataType::Map(_, _) => None, DataType::FixedSizeList(_, _) => None, DataType::Struct(_) => None, - DataType::Union(_, _, _) => None, + DataType::Union(_, _) => None, DataType::Dictionary(_, _) => None, DataType::RunEndEncoded(_, _) => None, } @@ -492,13 +491,7 @@ impl DataType { | DataType::LargeList(field) | DataType::Map(field, _) => field.size(), DataType::Struct(fields) => fields.size(), - DataType::Union(fields, _, _) => { - fields - .iter() - .map(|field| field.size() - std::mem::size_of_val(field)) - .sum::() - + (std::mem::size_of::() * fields.capacity()) - } + DataType::Union(fields, _) => fields.size(), DataType::Dictionary(dt1, dt2) => dt1.size() + dt2.size(), DataType::RunEndEncoded(run_ends, values) => { run_ends.size() - std::mem::size_of_val(run_ends) + values.size() @@ -670,4 +663,9 @@ mod tests { Box::new(list) ))); } + + #[test] + fn size_should_not_regress() { + assert_eq!(std::mem::size_of::(), 24); + } } diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 0cfc1800f53f..72afc5b0bbcb 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -34,7 +34,9 @@ //! assert_eq!(schema, back); //! ``` -use crate::{ArrowError, DataType, Field, FieldRef, Schema, TimeUnit, UnionMode}; +use crate::{ + ArrowError, DataType, Field, FieldRef, Schema, TimeUnit, UnionFields, UnionMode, +}; use bitflags::bitflags; use std::sync::Arc; use std::{ @@ -484,7 +486,7 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { )); } - DataType::Union(fields, type_ids, UnionMode::Dense) + DataType::Union(UnionFields::new(type_ids, fields), UnionMode::Dense) } // SparseUnion ["+us", extra] => { @@ -506,7 +508,7 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { )); } - DataType::Union(fields, type_ids, UnionMode::Sparse) + DataType::Union(UnionFields::new(type_ids, fields), UnionMode::Sparse) } // Timestamps in format "tts:" and "tts:America/New_York" for no timezones and timezones resp. @@ -585,9 +587,9 @@ impl TryFrom<&DataType> for FFI_ArrowSchema { | DataType::Map(child, _) => { vec![FFI_ArrowSchema::try_from(child.as_ref())?] 
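The hunks above replace the three-argument DataType::Union(Vec<Field>, Vec<i8>, UnionMode) variant with the two-argument form backed by UnionFields. A minimal illustrative sketch of constructing and reading back the new representation, using only APIs that appear in this patch; the field names and type ids below are made up for the example:

use arrow_schema::{DataType, Field, UnionFields, UnionMode};

fn main() {
    // Map type id 0 to an Int32 child and type id 1 to a Utf8 child.
    let fields = UnionFields::new(
        vec![0, 1],
        vec![
            Field::new("a", DataType::Int32, false),
            Field::new("b", DataType::Utf8, false),
        ],
    );
    let dt = DataType::Union(fields, UnionMode::Dense);

    // Type ids and child fields now travel together and are read back as pairs.
    if let DataType::Union(fields, mode) = &dt {
        assert_eq!(*mode, UnionMode::Dense);
        for (type_id, field) in fields.iter() {
            println!("type id {type_id} maps to child field {:?}", field.name());
        }
    }
}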
} - DataType::Union(fields, _, _) => fields + DataType::Union(fields, _) => fields .iter() - .map(FFI_ArrowSchema::try_from) + .map(|(_, f)| f.as_ref().try_into()) .collect::, ArrowError>>()?, DataType::Struct(fields) => fields .iter() @@ -658,8 +660,11 @@ fn get_format_string(dtype: &DataType) -> Result { DataType::Struct(_) => Ok("+s".to_string()), DataType::Map(_, _) => Ok("+m".to_string()), DataType::Dictionary(key_data_type, _) => get_format_string(key_data_type), - DataType::Union(_, type_ids, mode) => { - let formats = type_ids.iter().map(|t| t.to_string()).collect::>(); + DataType::Union(fields, mode) => { + let formats = fields + .iter() + .map(|(t, _)| t.to_string()) + .collect::>(); match mode { UnionMode::Dense => Ok(format!("{}:{}", "+ud", formats.join(","))), UnionMode::Sparse => Ok(format!("{}:{}", "+us", formats.join(","))), diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 8ef9fd2b81e5..d68392f51f03 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -235,8 +235,8 @@ impl Field { fn _fields(dt: &DataType) -> Vec<&Field> { match dt { DataType::Struct(fields) => fields.iter().flat_map(|f| f.fields()).collect(), - DataType::Union(fields, _, _) => { - fields.iter().flat_map(|f| f.fields()).collect() + DataType::Union(fields, _) => { + fields.iter().flat_map(|(_, f)| f.fields()).collect() } DataType::List(field) | DataType::LargeList(field) @@ -341,36 +341,9 @@ impl Field { self.name, from.data_type) ))} }, - DataType::Union(nested_fields, type_ids, _) => match &from.data_type { - DataType::Union(from_nested_fields, from_type_ids, _) => { - for (idx, from_field) in from_nested_fields.iter().enumerate() { - let mut is_new_field = true; - let field_type_id = from_type_ids.get(idx).unwrap(); - - for (self_idx, self_field) in nested_fields.iter_mut().enumerate() - { - if from_field == self_field { - let self_type_id = type_ids.get(self_idx).unwrap(); - - // If the nested fields in two unions are the same, they must have same - // type id. - if self_type_id != field_type_id { - return Err(ArrowError::SchemaError( - format!("Fail to merge schema field '{}' because the self_type_id = {} does not equal field_type_id = {}", - self.name, self_type_id, field_type_id) - )); - } - - is_new_field = false; - break; - } - } - - if is_new_field { - nested_fields.push(from_field.clone()); - type_ids.push(*field_type_id); - } - } + DataType::Union(nested_fields, _) => match &from.data_type { + DataType::Union(from_nested_fields, _) => { + nested_fields.try_merge(from_nested_fields)? } _ => { return Err(ArrowError::SchemaError( diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 26822613666a..1de5e5efdeeb 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -15,13 +15,13 @@ // specific language governing permissions and limitations // under the License. -use crate::{Field, FieldRef}; +use crate::{ArrowError, Field, FieldRef}; use std::ops::Deref; use std::sync::Arc; /// A cheaply cloneable, owned slice of [`FieldRef`] /// -/// Similar to `Arc>` or `Arc<[FieldPtr]>` +/// Similar to `Arc>` or `Arc<[FieldRef]>` /// /// Can be constructed in a number of ways /// @@ -55,7 +55,9 @@ impl Fields { /// Return size of this instance in bytes. 
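A small sketch of what the get_format_string change above produces at the C data interface boundary. The expected strings follow from the "+ud"/"+us" arms shown in these hunks; the FFI_ArrowSchema::try_from conversion and its format accessor are assumed from the surrounding arrow-schema ffi module (behind its "ffi" feature) rather than spelled out here:

use arrow_schema::ffi::FFI_ArrowSchema;
use arrow_schema::{ArrowError, DataType, Field, UnionFields, UnionMode};

fn main() -> Result<(), ArrowError> {
    // Deliberately non-contiguous type ids, as in the "non-default type ids" test above.
    let fields = UnionFields::new(
        vec![2, 3],
        vec![
            Field::new("int32", DataType::Int32, true),
            Field::new("utf8", DataType::Utf8, true),
        ],
    );
    let dense = DataType::Union(fields, UnionMode::Dense);

    // Per the Dense arm above, the type ids are written straight into the format string.
    // The format() accessor is assumed to expose that string on FFI_ArrowSchema.
    let c_schema = FFI_ArrowSchema::try_from(&dense)?;
    assert_eq!(c_schema.format(), "+ud:2,3");

    // The "+ud" parsing arm shown above should recover the same logical type.
    assert_eq!(DataType::try_from(&c_schema)?, dense);
    Ok(())
}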
pub fn size(&self) -> usize { - self.iter().map(|field| field.size()).sum() + self.iter() + .map(|field| field.size() + std::mem::size_of::()) + .sum() } /// Searches for a field by name, returning it along with its index if found @@ -148,3 +150,112 @@ impl<'de> serde::Deserialize<'de> for Fields { Ok(Vec::::deserialize(deserializer)?.into()) } } + +/// A cheaply cloneable, owned collection of [`FieldRef`] and their corresponding type ids +#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +#[cfg_attr(feature = "serde", serde(transparent))] +pub struct UnionFields(Arc<[(i8, FieldRef)]>); + +impl std::fmt::Debug for UnionFields { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.as_ref().fmt(f) + } +} + +impl UnionFields { + /// Create a new [`UnionFields`] with no fields + pub fn empty() -> Self { + Self(Arc::from([])) + } + + /// Create a new [`UnionFields`] from a [`Fields`] and array of type_ids + /// + /// See + /// + /// ``` + /// use arrow_schema::{DataType, Field, UnionFields}; + /// // Create a new UnionFields with type id mapping + /// // 1 -> DataType::UInt8 + /// // 3 -> DataType::Utf8 + /// UnionFields::new( + /// vec![1, 3], + /// vec![ + /// Field::new("field1", DataType::UInt8, false), + /// Field::new("field3", DataType::Utf8, false), + /// ], + /// ); + /// ``` + pub fn new(type_ids: T, fields: F) -> Self + where + F: IntoIterator, + F::Item: Into, + T: IntoIterator, + { + let fields = fields.into_iter().map(Into::into); + type_ids.into_iter().zip(fields).collect() + } + + /// Return size of this instance in bytes. + pub fn size(&self) -> usize { + self.iter() + .map(|(_, field)| field.size() + std::mem::size_of::<(i8, FieldRef)>()) + .sum() + } + + /// Returns the number of fields in this [`UnionFields`] + pub fn len(&self) -> usize { + self.0.len() + } + + /// Returns `true` if this is empty + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Returns an iterator over the fields and type ids in this [`UnionFields`] + /// + /// Note: the iteration order is not guaranteed + pub fn iter(&self) -> impl Iterator + '_ { + self.0.iter().map(|(id, f)| (*id, f)) + } + + /// Merge this field into self if it is compatible. + /// + /// See [`Field::try_merge`] + pub(crate) fn try_merge(&mut self, other: &Self) -> Result<(), ArrowError> { + // TODO: This currently may produce duplicate type IDs (#3982) + let mut output: Vec<_> = self.iter().map(|(id, f)| (id, f.clone())).collect(); + for (field_type_id, from_field) in other.iter() { + let mut is_new_field = true; + for (self_type_id, self_field) in output.iter_mut() { + if from_field == self_field { + // If the nested fields in two unions are the same, they must have same + // type id. 
+ if *self_type_id != field_type_id { + return Err(ArrowError::SchemaError( + format!("Fail to merge schema field '{}' because the self_type_id = {} does not equal field_type_id = {}", + self_field.name(), self_type_id, field_type_id) + )); + } + + is_new_field = false; + break; + } + } + + if is_new_field { + output.push((field_type_id, from_field.clone())) + } + } + *self = output.into_iter().collect(); + Ok(()) + } +} + +impl FromIterator<(i8, FieldRef)> for UnionFields { + fn from_iter>(iter: T) -> Self { + // TODO: Should this validate type IDs are unique (#3982) + Self(iter.into_iter().collect()) + } +} diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 6089c1ae5b94..501c5c7fdd39 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -365,7 +365,7 @@ impl Hash for Schema { #[cfg(test)] mod tests { use crate::datatype::DataType; - use crate::{TimeUnit, UnionMode}; + use crate::{TimeUnit, UnionFields, UnionMode}; use super::*; @@ -778,11 +778,13 @@ mod tests { Schema::new(vec![Field::new( "c1", DataType::Union( - vec![ - Field::new("c11", DataType::Utf8, true), - Field::new("c12", DataType::Utf8, true), - ], - vec![0, 1], + UnionFields::new( + vec![0, 1], + vec![ + Field::new("c11", DataType::Utf8, true), + Field::new("c12", DataType::Utf8, true), + ] + ), UnionMode::Dense ), false @@ -790,11 +792,17 @@ mod tests { Schema::new(vec![Field::new( "c1", DataType::Union( - vec![ - Field::new("c12", DataType::Utf8, true), - Field::new("c13", DataType::Time64(TimeUnit::Second), true), - ], - vec![1, 2], + UnionFields::new( + vec![1, 2], + vec![ + Field::new("c12", DataType::Utf8, true), + Field::new( + "c13", + DataType::Time64(TimeUnit::Second), + true + ), + ] + ), UnionMode::Dense ), false @@ -804,12 +812,14 @@ mod tests { Schema::new(vec![Field::new( "c1", DataType::Union( - vec![ - Field::new("c11", DataType::Utf8, true), - Field::new("c12", DataType::Utf8, true), - Field::new("c13", DataType::Time64(TimeUnit::Second), true), - ], - vec![0, 1, 2], + UnionFields::new( + vec![0, 1, 2], + vec![ + Field::new("c11", DataType::Utf8, true), + Field::new("c12", DataType::Utf8, true), + Field::new("c13", DataType::Time64(TimeUnit::Second), true), + ] + ), UnionMode::Dense ), false diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index d1977d42bba0..74dad6b4a8c8 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -30,7 +30,7 @@ pub use arrow_buffer::{i256, ArrowNativeType, ToByteSlice}; pub use arrow_data::decimal::*; pub use arrow_schema::{ DataType, Field, FieldRef, Fields, IntervalUnit, Schema, SchemaBuilder, SchemaRef, - TimeUnit, UnionMode, + TimeUnit, UnionFields, UnionMode, }; #[cfg(feature = "ffi")] diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 2d6bbf1a091d..fe2e186a72f9 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -174,15 +174,15 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { ))) } // type ids. UnionArray doesn't have null bitmap so buffer index begins with 0. - (DataType::Union(_, _, _), 0) => i8::BITS as _, + (DataType::Union(_, _), 0) => i8::BITS as _, // Only DenseUnion has 2nd buffer - (DataType::Union(_, _, UnionMode::Dense), 1) => i32::BITS as _, - (DataType::Union(_, _, UnionMode::Sparse), _) => { + (DataType::Union(_, UnionMode::Dense), 1) => i32::BITS as _, + (DataType::Union(_, UnionMode::Sparse), _) => { return Err(ArrowError::CDataInterface(format!( "The datatype \"{data_type:?}\" expects 1 buffer, but requested {i}. 
Please verify that the C data interface is correctly implemented." ))) } - (DataType::Union(_, _, UnionMode::Dense), _) => { + (DataType::Union(_, UnionMode::Dense), _) => { return Err(ArrowError::CDataInterface(format!( "The datatype \"{data_type:?}\" expects 2 buffer, but requested {i}. Please verify that the C data interface is correctly implemented." ))) diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index b113ec04ccab..27fb1dcd232b 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -41,7 +41,7 @@ use arrow_cast::pretty::pretty_format_columns; use arrow_cast::{can_cast_types, cast}; use arrow_data::ArrayData; use arrow_schema::{ - ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit, UnionMode, + ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit, UnionFields, UnionMode, }; use half::f16; use std::sync::Arc; @@ -405,11 +405,13 @@ fn get_all_types() -> Vec { Field::new("f2", DataType::Utf8, true), ])), Union( - vec![ - Field::new("f1", DataType::Int32, false), - Field::new("f2", DataType::Utf8, true), - ], - vec![0, 1], + UnionFields::new( + vec![0, 1], + vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Utf8, true), + ], + ), UnionMode::Dense, ), Decimal128(38, 0), diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 73e013ff1c15..ef0d40d64e2d 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -22,7 +22,7 @@ use arrow::array::{ use arrow_array::Decimal128Array; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_data::ArrayData; -use arrow_schema::{DataType, Field, UnionMode}; +use arrow_schema::{DataType, Field, UnionFields, UnionMode}; use std::ptr::NonNull; use std::sync::Arc; @@ -768,11 +768,13 @@ fn test_validate_union_different_types() { ArrayData::try_new( DataType::Union( - vec![ - Field::new("field1", DataType::Int32, true), - Field::new("field2", DataType::Int64, true), // data is int32 - ], - vec![0, 1], + UnionFields::new( + vec![0, 1], + vec![ + Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Int64, true), // data is int32 + ], + ), UnionMode::Sparse, ), 2, @@ -799,11 +801,13 @@ fn test_validate_union_sparse_different_child_len() { ArrayData::try_new( DataType::Union( - vec![ - Field::new("field1", DataType::Int32, true), - Field::new("field2", DataType::Int64, true), - ], - vec![0, 1], + UnionFields::new( + vec![0, 1], + vec![ + Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Int64, true), + ], + ), UnionMode::Sparse, ), 2, @@ -826,11 +830,13 @@ fn test_validate_union_dense_without_offsets() { ArrayData::try_new( DataType::Union( - vec![ - Field::new("field1", DataType::Int32, true), - Field::new("field2", DataType::Int64, true), - ], - vec![0, 1], + UnionFields::new( + vec![0, 1], + vec![ + Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Int64, true), + ], + ), UnionMode::Dense, ), 2, @@ -854,11 +860,13 @@ fn test_validate_union_dense_with_bad_len() { ArrayData::try_new( DataType::Union( - vec![ - Field::new("field1", DataType::Int32, true), - Field::new("field2", DataType::Int64, true), - ], - vec![0, 1], + UnionFields::new( + vec![0, 1], + vec![ + Field::new("field1", DataType::Int32, true), + Field::new("field2", DataType::Int64, true), + ], + ), UnionMode::Dense, ), 2, diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 94c19cb2e9e0..f594f2f79947 100644 --- 
a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -360,7 +360,7 @@ fn write_leaves( ArrowDataType::Float16 => Err(ParquetError::ArrowError( "Float16 arrays not supported".to_string(), )), - ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_, _, _) | ArrowDataType::RunEndEncoded(_, _) => { + ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_, _) | ArrowDataType::RunEndEncoded(_, _) => { Err(ParquetError::NYI( format!( "Attempting to write an Arrow type {data_type:?} to parquet that is not yet implemented" diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 09109d290e22..b541a754ba41 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -501,7 +501,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { )) } } - DataType::Union(_, _, _) => unimplemented!("See ARROW-8817."), + DataType::Union(_, _) => unimplemented!("See ARROW-8817."), DataType::Dictionary(_, ref value) => { // Dictionary encoding not handled at the schema level let dict_field = Field::new(name, *value.clone(), field.is_nullable()); From 0f13a664cb8faa2f5c4d255b912f33d78867b255 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 17:09:53 +0100 Subject: [PATCH 0758/1411] Use workspace dependencies (#3936) * Use workspace dependencies * Fix rustfmt * Fix pyarrow integration test --- Cargo.toml | 33 ++++++++++++++++ arrow-arith/Cargo.toml | 30 +++++++-------- arrow-array/Cargo.toml | 28 ++++++-------- arrow-buffer/Cargo.toml | 22 +++++------ arrow-cast/Cargo.toml | 32 +++++++--------- arrow-csv/Cargo.toml | 34 ++++++++--------- arrow-data/Cargo.toml | 26 ++++++------- arrow-flight/Cargo.toml | 26 ++++++------- arrow-flight/gen/Cargo.toml | 12 +++--- arrow-integration-test/Cargo.toml | 26 ++++++------- arrow-integration-testing/Cargo.toml | 14 +++---- arrow-ipc/Cargo.toml | 32 +++++++--------- arrow-json/Cargo.toml | 32 +++++++--------- arrow-ord/Cargo.toml | 32 +++++++--------- arrow-pyarrow-integration-testing/Cargo.toml | 5 ++- arrow-row/Cargo.toml | 34 ++++++++--------- arrow-schema/Cargo.toml | 22 +++++------ arrow-select/Cargo.toml | 30 +++++++-------- arrow-string/Cargo.toml | 32 +++++++--------- arrow/Cargo.toml | 40 ++++++++++---------- object_store/Cargo.toml | 4 +- parquet/Cargo.toml | 32 ++++++++-------- parquet_derive/Cargo.toml | 16 ++++---- parquet_derive_test/Cargo.toml | 18 ++++----- 24 files changed, 295 insertions(+), 317 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 64ce3166e608..a287f66b082c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ members = [ "parquet_derive", "parquet_derive_test", ] + # Enable the version 2 feature resolver, which avoids unifying features for targets that are not being built # # Critically this prevents dev-dependencies from enabling features even when not building a target that @@ -54,3 +55,35 @@ resolver = "2" # how it is compiled within the workspace, causing the whole workspace to be compiled from scratch # this way, this is a stand-alone package that compiles independently of the others. 
exclude = ["arrow-pyarrow-integration-testing"] + +[workspace.package] +version = "36.0.0" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[workspace.dependencies] +arrow = { version = "36.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "36.0.0", path = "./arrow-arith" } +arrow-array = { version = "36.0.0", path = "./arrow-array" } +arrow-buffer = { version = "36.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "36.0.0", path = "./arrow-cast" } +arrow-csv = { version = "36.0.0", path = "./arrow-csv" } +arrow-data = { version = "36.0.0", path = "./arrow-data" } +arrow-ipc = { version = "36.0.0", path = "./arrow-ipc" } +arrow-json = { version = "36.0.0", path = "./arrow-json" } +arrow-ord = { version = "36.0.0", path = "./arrow-ord" } +arrow-row = { version = "36.0.0", path = "./arrow-row" } +arrow-schema = { version = "36.0.0", path = "./arrow-schema" } +arrow-select = { version = "36.0.0", path = "./arrow-select" } +arrow-string = { version = "36.0.0", path = "./arrow-string" } +parquet = { version = "36.0.0", path = "./parquet", default-features = false } diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index f509af76b733..4460d116b466 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-arith" -version = "36.0.0" +version = { workspace = true } description = "Arrow arithmetic kernels" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow_arith" @@ -38,10 +34,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "36.0.0", path = "../arrow-array" } -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } -arrow-data = { version = "36.0.0", path = "../arrow-data" } -arrow-schema = { version = "36.0.0", path = "../arrow-schema" } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } chrono = { version = "0.4.23", default-features = false } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 7ea969a03f7d..1b417bb0e858 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-array" -version = "36.0.0" +version = { workspace = true } description = "Array abstractions for Apache Arrow" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors 
= { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow_array" @@ -45,9 +41,9 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "36.0.0", path = "../arrow-schema" } -arrow-data = { version = "36.0.0", path = "../arrow-data" } +arrow-buffer = { workspace = true } +arrow-schema = { workspace = true } +arrow-data = { workspace = true } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index be9a08eb8333..3d2fd71c973c 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-buffer" -version = "36.0.0" +version = { workspace = true } description = "Buffer abstractions for Apache Arrow" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow_buffer" diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 1eee7108f139..e42e75b838ce 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-cast" -version = "36.0.0" +version = { workspace = true } description = "Cast kernel and utilities for Apache Arrow" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow_cast" @@ -44,11 +40,11 @@ features = ["prettyprint"] prettyprint = ["comfy-table"] [dependencies] -arrow-array = { version = "36.0.0", path = "../arrow-array" } -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } -arrow-data = { version = "36.0.0", path = "../arrow-data" } -arrow-schema = { version = "36.0.0", path = "../arrow-schema" } -arrow-select = { version = "36.0.0", path = "../arrow-select" } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } +arrow-select = { workspace = true } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", 
"parse-integers", "parse-floats"] } diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 9f8015f1eec8..d4526ba32cf2 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-csv" -version = "36.0.0" +version = { workspace = true } description = "Support for parsing CSV format into the Arrow format" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow_csv" @@ -38,14 +34,14 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "36.0.0", path = "../arrow-array" } -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "36.0.0", path = "../arrow-cast" } -arrow-data = { version = "36.0.0", path = "../arrow-data" } -arrow-schema = { version = "36.0.0", path = "../arrow-schema" } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-cast = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } csv = { version = "1.1", default-features = false } -csv-core = { version = "0.1"} +csv-core = { version = "0.1" } lazy_static = { version = "1.4", default-features = false } lexical-core = { version = "^0.8", default-features = false } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } diff --git a/arrow-data/Cargo.toml b/arrow-data/Cargo.toml index c3630d2c9164..c83f867523d5 100644 --- a/arrow-data/Cargo.toml +++ b/arrow-data/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-data" -version = "36.0.0" +version = { workspace = true } description = "Array data abstractions for Apache Arrow" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow_data" @@ -50,8 +46,8 @@ features = ["ffi"] [dependencies] -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } -arrow-schema = { version = "36.0.0", path = "../arrow-schema" } +arrow-buffer = { workspace = true } +arrow-schema = { workspace = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 729304aed92f..2f0994e18070 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,21 +18,21 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "36.0.0" -edition = "2021" -rust-version = "1.62" -authors = ["Apache Arrow "] -homepage = 
"https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -license = "Apache-2.0" +version = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +authors = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +license = { workspace = true } [dependencies] -arrow-array = { version = "36.0.0", path = "../arrow-array" } -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } # Cast is needed to work around https://github.com/apache/arrow-rs/issues/3389 -arrow-cast = { version = "36.0.0", path = "../arrow-cast" } -arrow-ipc = { version = "36.0.0", path = "../arrow-ipc" } -arrow-schema = { version = "36.0.0", path = "../arrow-schema" } +arrow-cast = { workspace = true } +arrow-ipc = { workspace = true } +arrow-schema = { workspace = true } base64 = { version = "0.21", default-features = false, features = ["std"] } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } @@ -58,7 +58,7 @@ tls = ["tonic/tls"] cli = ["arrow-cast/prettyprint", "clap", "tracing-log", "tracing-subscriber", "tonic/tls-webpki-roots"] [dev-dependencies] -arrow-cast = { version = "36.0.0", path = "../arrow-cast", features = ["prettyprint"] } +arrow-cast = { workspace = true, features = ["prettyprint"] } assert_cmd = "2.0.8" tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 8d04a821be56..02613a85a18d 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -19,12 +19,12 @@ name = "gen" description = "Code generation for arrow-flight" version = "0.1.0" -edition = "2021" -rust-version = "1.62" -authors = ["Apache Arrow "] -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -license = "Apache-2.0" +edition = { workspace = true } +rust-version = { workspace = true } +authors = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +license = { workspace = true } publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index 61ffae23fbe7..6ede476eb569 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-integration-test" -version = "36.0.0" +version = { workspace = true } description = "Support for the Apache Arrow JSON test data format" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow_integration_test" @@ -38,8 +34,8 @@ path = "src/lib.rs" bench = false [dependencies] -arrow = { version = "36.0.0", path = "../arrow", default-features = false } -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } +arrow = { workspace = 
true } +arrow-buffer = { workspace = true, path = "../arrow-buffer" } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 81691c4b370f..3a65ec41c0f3 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -18,14 +18,14 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests (NOT PUBLISHED TO crates.io)" -version = "36.0.0" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -edition = "2021" +version = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +edition = { workspace = true } publish = false -rust-version = "1.62" +rust-version = { workspace = true } [features] logging = ["tracing-subscriber"] diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index 0c358170bc16..a03f53d6641c 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-ipc" -version = "36.0.0" +version = { workspace = true } description = "Support for the Arrow IPC format" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow_ipc" @@ -38,11 +34,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "36.0.0", path = "../arrow-array" } -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "36.0.0", path = "../arrow-cast" } -arrow-data = { version = "36.0.0", path = "../arrow-data" } -arrow-schema = { version = "36.0.0", path = "../arrow-schema" } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-cast = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } flatbuffers = { version = "23.1.21", default-features = false } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.12.0", default-features = false, optional = true } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 91bac277b1f6..34bd447da183 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-json" -version = "36.0.0" +version = { workspace = true } description = "Support for parsing JSON format into the Arrow format" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { 
workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow_json" @@ -38,11 +34,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "36.0.0", path = "../arrow-array" } -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "36.0.0", path = "../arrow-cast" } -arrow-data = { version = "36.0.0", path = "../arrow-data" } -arrow-schema = { version = "36.0.0", path = "../arrow-schema" } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-cast = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } half = { version = "2.1", default-features = false } indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index aac6a8cc0786..fb061b9b5499 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-ord" -version = "36.0.0" +version = { workspace = true } description = "Ordering kernels for arrow arrays" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow_ord" @@ -38,11 +34,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { version = "36.0.0", path = "../arrow-array" } -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } -arrow-data = { version = "36.0.0", path = "../arrow-data" } -arrow-schema = { version = "36.0.0", path = "../arrow-schema" } -arrow-select = { version = "36.0.0", path = "../arrow-select" } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } +arrow-select = { workspace = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 8aaf20b498fe..5809e935ec16 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "36.0.0" +version = "0.1.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -26,11 +26,12 @@ license = "Apache-2.0" keywords = [ "arrow" ] edition = "2021" rust-version = "1.62" +publish = false [lib] name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "36.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", features = ["pyarrow"] } pyo3 = { version = "0.18", features = ["extension-module"] } diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 96d494077026..8f5de1177288 100644 --- 
a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-row" -version = "36.0.0" +version = { workspace = true } description = "Arrow row format" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow_row" @@ -44,17 +40,17 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "36.0.0", path = "../arrow-array" } -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } -arrow-data = { version = "36.0.0", path = "../arrow-data" } -arrow-schema = { version = "36.0.0", path = "../arrow-schema" } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } half = { version = "2.1", default-features = false } hashbrown = { version = "0.13", default-features = false } [dev-dependencies] -arrow-cast = { version = "36.0.0", path = "../arrow-cast" } -arrow-ord = { version = "36.0.0", path = "../arrow-ord" } +arrow-cast = { workspace = true } +arrow-ord = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } [features] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 2ef08072a00d..628d4a683cac 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-schema" -version = "36.0.0" +version = { workspace = true } description = "Defines the logical types for arrow arrays" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow_schema" diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml index c0aa9444c1f1..ff8a212c7b52 100644 --- a/arrow-select/Cargo.toml +++ b/arrow-select/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-select" -version = "36.0.0" +version = { workspace = true } description = "Selection kernels for arrow arrays" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = 
"arrow_select" @@ -38,10 +34,10 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } -arrow-data = { version = "36.0.0", path = "../arrow-data" } -arrow-schema = { version = "36.0.0", path = "../arrow-schema" } -arrow-array = { version = "36.0.0", path = "../arrow-array" } +arrow-buffer = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } +arrow-array = { workspace = true } num = { version = "0.4", default-features = false, features = ["std"] } [features] diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index 90746e9395e3..f24b17a5c89b 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -17,20 +17,16 @@ [package] name = "arrow-string" -version = "36.0.0" +version = { workspace = true } description = "String kernels for arrow arrays" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = ["arrow"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2021" -rust-version = "1.62" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow_string" @@ -38,11 +34,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } -arrow-data = { version = "36.0.0", path = "../arrow-data" } -arrow-schema = { version = "36.0.0", path = "../arrow-schema" } -arrow-array = { version = "36.0.0", path = "../arrow-array" } -arrow-select = { version = "36.0.0", path = "../arrow-select" } +arrow-buffer = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } +arrow-array = { workspace = true } +arrow-select = { workspace = true } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 0e8ea3cac124..2c9bf64eccf1 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,12 +17,12 @@ [package] name = "arrow" -version = "36.0.0" +version = { workspace = true } description = "Rust implementation of Apache Arrow" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] -license = "Apache-2.0" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } keywords = ["arrow"] include = [ "benches/*.rs", @@ -30,8 +30,8 @@ include = [ "tests/*.rs", "Cargo.toml", ] -edition = "2021" -rust-version = "1.62" +edition = { workspace = true } +rust-version = { workspace = true } [lib] name = "arrow" @@ -45,19 +45,19 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-arith = { version = "36.0.0", path = "../arrow-arith" } -arrow-array = { version = "36.0.0", path = "../arrow-array" } -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer" } -arrow-cast = { version = "36.0.0", path = "../arrow-cast" } -arrow-csv = { version = "36.0.0", path = "../arrow-csv", optional = true } 
-arrow-data = { version = "36.0.0", path = "../arrow-data" } -arrow-ipc = { version = "36.0.0", path = "../arrow-ipc", optional = true } -arrow-json = { version = "36.0.0", path = "../arrow-json", optional = true } -arrow-ord = { version = "36.0.0", path = "../arrow-ord" } -arrow-row = { version = "36.0.0", path = "../arrow-row" } -arrow-schema = { version = "36.0.0", path = "../arrow-schema" } -arrow-select = { version = "36.0.0", path = "../arrow-select" } -arrow-string = { version = "36.0.0", path = "../arrow-string" } +arrow-arith = { workspace = true } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-cast = { workspace = true } +arrow-csv = { workspace = true, optional = true } +arrow-data = { workspace = true } +arrow-ipc = { workspace = true, optional = true } +arrow-json = { workspace = true, optional = true } +arrow-ord = { workspace = true } +arrow-row = { workspace = true } +arrow-schema = { workspace = true } +arrow-select = { workspace = true } +arrow-string = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } pyo3 = { version = "0.18", default-features = false, optional = true } diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 317087241a30..d9b075fcc5cd 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -18,12 +18,12 @@ [package] name = "object_store" version = "0.5.5" -edition = "2021" +edition = { workspace = true } license = "MIT/Apache-2.0" readme = "README.md" description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." keywords = ["object", "storage", "cloud"] -repository = "https://github.com/apache/arrow-rs" +repository = { workspace = true } [package.metadata.docs.rs] all-features = true diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 46a6aa441271..be61a7cf1435 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,16 +17,16 @@ [package] name = "parquet" -version = "36.0.0" -license = "Apache-2.0" +version = { workspace = true } +license = { workspace = true } description = "Apache Parquet implementation in Rust" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } keywords = ["arrow", "parquet", "hadoop"] readme = "README.md" -edition = "2021" -rust-version = "1.62" +edition = { workspace = true } +rust-version = { workspace = true } [target.'cfg(target_arch = "wasm32")'.dependencies] ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } @@ -35,14 +35,14 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] -arrow-array = { version = "36.0.0", path = "../arrow-array", default-features = false, optional = true } -arrow-buffer = { version = "36.0.0", path = "../arrow-buffer", default-features = false, optional = true } -arrow-cast = { version = "36.0.0", path = "../arrow-cast", default-features = false, optional = true } -arrow-csv = { version = "36.0.0", path = "../arrow-csv", default-features = false, optional = true } -arrow-data = { version = "36.0.0", path = "../arrow-data", default-features = false, optional = true } -arrow-schema = { version = "36.0.0", path = "../arrow-schema", 
default-features = false, optional = true } -arrow-select = { version = "36.0.0", path = "../arrow-select", default-features = false, optional = true } -arrow-ipc = { version = "36.0.0", path = "../arrow-ipc", default-features = false, optional = true } +arrow-array = { workspace = true, optional = true } +arrow-buffer = { workspace = true, optional = true } +arrow-cast = { workspace = true, optional = true } +arrow-csv = { workspace = true, optional = true } +arrow-data = { workspace = true, optional = true } +arrow-schema = { workspace = true, optional = true } +arrow-select = { workspace = true, optional = true } +arrow-ipc = { workspace = true, optional = true } object_store = { version = "0.5", path = "../object_store", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } @@ -76,7 +76,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.12", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "36.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { workspace = true, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index 9ecb40cc4729..68d19e54c4f4 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,16 +17,16 @@ [package] name = "parquet_derive" -version = "36.0.0" -license = "Apache-2.0" +version = { workspace = true } +license = { workspace = true } description = "Derive macros for the Rust implementation of Apache Parquet" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } keywords = ["parquet"] readme = "README.md" -edition = "2021" -rust-version = "1.62" +edition = { workspace = true } +rust-version = { workspace = true } [lib] proc-macro = true @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "2.0", features = ["extra-traits"] } -parquet = { path = "../parquet", version = "36.0.0", default-features = false } +parquet = { workspace = true } diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index 10694851c938..be24db85a109 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,18 +17,18 @@ [package] name = "parquet_derive_test" -version = "36.0.0" -license = "Apache-2.0" +version = { workspace = true } +license = { workspace = true } description = "Integration test package for parquet-derive" -homepage = "https://github.com/apache/arrow-rs" -repository = "https://github.com/apache/arrow-rs" -authors = ["Apache Arrow "] +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } keywords = [ "parquet" ] -edition = "2021" +edition = { workspace = true } publish = false -rust-version = "1.62" +rust-version = { workspace = true } [dependencies] -parquet = { path = "../parquet", version = "36.0.0", 
default-features = false } -parquet_derive = { path = "../parquet_derive", version = "36.0.0", default-features = false } +parquet = { workspace = true } +parquet_derive = { path = "../parquet_derive", default-features = false } chrono = { version="0.4.23", default-features = false, features = [ "clock" ] } From 9eb3490c2699c388941930617f5414094bd360fe Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 17:18:35 +0100 Subject: [PATCH 0759/1411] Async writer tweaks (#3967) * Async writer tweaks * Use capacity --- parquet/src/arrow/async_writer/mod.rs | 73 +++++++++++++-------------- 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/parquet/src/arrow/async_writer/mod.rs b/parquet/src/arrow/async_writer/mod.rs index dc000f248c9b..abfb1c54ed44 100644 --- a/parquet/src/arrow/async_writer/mod.rs +++ b/parquet/src/arrow/async_writer/mod.rs @@ -51,10 +51,7 @@ //! # } //! ``` -use std::{ - io::Write, - sync::{Arc, Mutex}, -}; +use std::{io::Write, sync::Arc}; use crate::{ arrow::ArrowWriter, @@ -80,22 +77,24 @@ pub struct AsyncArrowWriter { /// The inner buffer shared by the `sync_writer` and the `async_writer` shared_buffer: SharedBuffer, - - /// The threshold triggering buffer flush - buffer_flush_threshold: usize, } impl AsyncArrowWriter { /// Try to create a new Async Arrow Writer. /// - /// `buffer_flush_threshold` will be used to trigger flush of the inner buffer. + /// `buffer_size` determines the initial size of the intermediate buffer. + /// + /// The intermediate buffer will automatically be resized if necessary + /// + /// [`Self::write`] will flush this intermediate buffer if it is at least + /// half full pub fn try_new( writer: W, arrow_schema: SchemaRef, - buffer_flush_threshold: usize, + buffer_size: usize, props: Option, ) -> Result { - let shared_buffer = SharedBuffer::default(); + let shared_buffer = SharedBuffer::new(buffer_size); let sync_writer = ArrowWriter::try_new(shared_buffer.clone(), arrow_schema, props)?; @@ -103,22 +102,16 @@ impl AsyncArrowWriter { sync_writer, async_writer: writer, shared_buffer, - buffer_flush_threshold, }) } /// Enqueues the provided `RecordBatch` to be written /// /// After every sync write by the inner [ArrowWriter], the inner buffer will be - /// checked and flush if threshold is reached. + /// checked and flush if at least half full pub async fn write(&mut self, batch: &RecordBatch) -> Result<()> { self.sync_writer.write(batch)?; - Self::try_flush( - &self.shared_buffer, - &mut self.async_writer, - self.buffer_flush_threshold, - ) - .await + Self::try_flush(&mut self.shared_buffer, &mut self.async_writer, false).await } /// Append [`KeyValue`] metadata in addition to those in [`WriterProperties`] @@ -135,7 +128,7 @@ impl AsyncArrowWriter { let metadata = self.sync_writer.close()?; // Force to flush the remaining data. - Self::try_flush(&self.shared_buffer, &mut self.async_writer, 0).await?; + Self::try_flush(&mut self.shared_buffer, &mut self.async_writer, true).await?; Ok(metadata) } @@ -143,24 +136,21 @@ impl AsyncArrowWriter { /// Flush the data in the [`SharedBuffer`] into the `async_writer` if its size /// exceeds the threshold. 
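For orientation, a minimal usage sketch of AsyncArrowWriter after this change (not part of the patch): it assumes the parquet `async` feature, a tokio runtime, and tokio's AsyncWrite impl for Vec<u8>; the column contents and the 64 KiB buffer size are illustrative.

use std::sync::Arc;
use arrow_array::{ArrayRef, Int64Array, RecordBatch};
use parquet::arrow::AsyncArrowWriter;

async fn write_example() -> parquet::errors::Result<Vec<u8>> {
    let col = Arc::new(Int64Array::from_iter_values(0..1_000)) as ArrayRef;
    let batch = RecordBatch::try_from_iter([("v", col)]).unwrap();

    // Any AsyncWrite + Unpin + Send sink works; a Vec<u8> keeps the sketch self-contained.
    let mut sink = Vec::new();
    // 64 KiB initial capacity: `write` flushes to the sink once the shared buffer
    // is at least half full, and `close` force-flushes whatever remains.
    let mut writer =
        AsyncArrowWriter::try_new(&mut sink, batch.schema(), 64 * 1024, None)?;
    writer.write(&batch).await?;
    writer.close().await?;
    Ok(sink)
}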
async fn try_flush( - shared_buffer: &SharedBuffer, + shared_buffer: &mut SharedBuffer, async_writer: &mut W, - threshold: usize, + force: bool, ) -> Result<()> { - let mut buffer = { - let mut buffer = shared_buffer.buffer.lock().unwrap(); - - if buffer.is_empty() || buffer.len() < threshold { - // no need to flush - return Ok(()); - } - std::mem::take(&mut *buffer) - }; + let mut buffer = shared_buffer.buffer.try_lock().unwrap(); + if !force && buffer.len() < buffer.capacity() / 2 { + // no need to flush + return Ok(()); + } async_writer - .write(&buffer) + .write(buffer.as_slice()) .await .map_err(|e| ParquetError::External(Box::new(e)))?; + async_writer .flush() .await @@ -168,7 +158,6 @@ impl AsyncArrowWriter { // reuse the buffer. buffer.clear(); - *shared_buffer.buffer.lock().unwrap() = buffer; Ok(()) } @@ -176,23 +165,31 @@ impl AsyncArrowWriter { /// A buffer with interior mutability shared by the [`ArrowWriter`] and /// [`AsyncArrowWriter`]. -#[derive(Clone, Default)] +#[derive(Clone)] struct SharedBuffer { /// The inner buffer for reading and writing /// /// The lock is used to obtain internal mutability, so no worry about the /// lock contention. - buffer: Arc>>, + buffer: Arc>>, +} + +impl SharedBuffer { + pub fn new(capacity: usize) -> Self { + Self { + buffer: Arc::new(futures::lock::Mutex::new(Vec::with_capacity(capacity))), + } + } } impl Write for SharedBuffer { fn write(&mut self, buf: &[u8]) -> std::io::Result { - let mut buffer = self.buffer.lock().unwrap(); + let mut buffer = self.buffer.try_lock().unwrap(); Write::write(&mut *buffer, buf) } fn flush(&mut self) -> std::io::Result<()> { - let mut buffer = self.buffer.lock().unwrap(); + let mut buffer = self.buffer.try_lock().unwrap(); Write::flush(&mut *buffer) } } @@ -342,7 +339,7 @@ mod tests { }; let test_buffer_flush_thresholds = - vec![0, 1024, 40 * 1024, 50 * 1024, 100 * 1024, usize::MAX]; + vec![0, 1024, 40 * 1024, 50 * 1024, 100 * 1024]; for buffer_flush_threshold in test_buffer_flush_thresholds { let reader = get_test_reader(); @@ -354,7 +351,7 @@ mod tests { let mut async_writer = AsyncArrowWriter::try_new( &mut test_async_sink, reader.schema(), - buffer_flush_threshold, + buffer_flush_threshold * 2, Some(write_props.clone()), ) .unwrap(); From 8262b926f890059ad03dc3f23d71a37ced198cea Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 17:18:55 +0100 Subject: [PATCH 0760/1411] Use FieldRef in DataType (#3955) (#3983) --- arrow-array/src/array/binary_array.rs | 9 +-- .../src/array/fixed_size_binary_array.rs | 6 +- .../src/array/fixed_size_list_array.rs | 11 +-- arrow-array/src/array/list_array.rs | 26 +++---- arrow-array/src/array/map_array.rs | 8 +-- arrow-array/src/array/mod.rs | 10 +-- arrow-array/src/array/run_array.rs | 4 +- arrow-array/src/array/string_array.rs | 7 +- .../src/builder/fixed_size_list_builder.rs | 4 +- .../src/builder/generic_list_builder.rs | 4 +- arrow-array/src/builder/map_builder.rs | 2 +- arrow-array/src/builder/mod.rs | 2 +- arrow-array/src/builder/struct_builder.rs | 2 +- arrow-array/src/cast.rs | 7 +- arrow-array/src/record_batch.rs | 4 +- arrow-cast/src/cast.rs | 34 ++++----- arrow-cast/src/pretty.rs | 2 +- arrow-data/src/transform/mod.rs | 3 +- arrow-integration-test/src/datatype.rs | 13 ++-- arrow-integration-test/src/field.rs | 12 ++-- arrow-integration-test/src/lib.rs | 6 +- arrow-integration-test/src/schema.rs | 15 ++-- .../src/bin/arrow-json-integration-test.rs | 2 +- arrow-ipc/src/convert.rs | 23 +++--- 
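The change in the files below is mechanical: nested DataType variants (List, LargeList, FixedSizeList, Map, RunEndEncoded, and so on) now hold Arc<Field> (the FieldRef alias named in the title) rather than Box<Field>, so a single field definition can be shared without copying it. A small construction sketch under that assumption (the field name and element types are illustrative, not taken from the patch):

use std::sync::Arc;
use arrow_schema::{DataType, Field};

fn nested_types() -> (DataType, DataType, DataType) {
    // One Arc<Field> can back several nested types; cloning the Arc does not
    // copy the underlying Field.
    let item = Arc::new(Field::new("item", DataType::Int32, true));
    let list = DataType::List(item.clone());
    let large_list = DataType::LargeList(item.clone());
    let fixed = DataType::FixedSizeList(item, 3);
    (list, large_list, fixed)
}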
arrow-ipc/src/reader.rs | 18 ++--- arrow-json/src/raw/mod.rs | 12 ++-- arrow-json/src/reader.rs | 70 +++++++++--------- arrow-json/src/writer.rs | 12 ++-- arrow-ord/src/comparison.rs | 3 +- arrow-schema/src/datatype.rs | 30 ++++---- arrow-schema/src/ffi.rs | 14 ++-- arrow-schema/src/field.rs | 5 +- arrow-select/src/filter.rs | 8 +-- arrow-select/src/take.rs | 8 +-- arrow/benches/json_reader.rs | 8 +-- arrow/examples/builders.rs | 2 +- arrow/src/array/ffi.rs | 8 +-- arrow/src/compute/kernels/limit.rs | 2 +- arrow/src/ffi.rs | 4 +- arrow/src/util/data_gen.rs | 6 +- arrow/tests/array_cast.rs | 18 ++--- arrow/tests/array_equal.rs | 10 +-- arrow/tests/array_transform.rs | 10 +-- arrow/tests/array_validation.rs | 10 +-- parquet/benches/arrow_writer.rs | 18 ++--- parquet/src/arrow/array_reader/builder.rs | 12 ++-- .../array_reader/fixed_len_byte_array.rs | 2 +- parquet/src/arrow/array_reader/list_array.rs | 4 +- parquet/src/arrow/array_reader/map_array.rs | 2 +- parquet/src/arrow/arrow_reader/mod.rs | 6 +- parquet/src/arrow/arrow_writer/levels.rs | 28 ++++---- parquet/src/arrow/arrow_writer/mod.rs | 24 +++---- parquet/src/arrow/schema/complex.rs | 6 +- parquet/src/arrow/schema/mod.rs | 72 +++++++++---------- 54 files changed, 328 insertions(+), 320 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 530f3835ce10..ccce3cda9989 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -303,6 +303,7 @@ mod tests { use super::*; use crate::{ListArray, StringArray}; use arrow_schema::Field; + use std::sync::Arc; #[test] fn test_binary_array() { @@ -453,7 +454,7 @@ mod tests { .unwrap(); let binary_array1 = GenericBinaryArray::::from(array_data1); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( Field::new("item", DataType::UInt8, false), )); @@ -503,7 +504,7 @@ mod tests { let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); let null_buffer = Buffer::from_slice_ref([0b101]); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( Field::new("item", DataType::UInt8, false), )); @@ -548,7 +549,7 @@ mod tests { .unwrap(); let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap()); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( Field::new("item", DataType::UInt8, true), )); @@ -641,7 +642,7 @@ mod tests { let offsets: [i32; 4] = [0, 5, 5, 12]; let data_type = - DataType::List(Box::new(Field::new("item", DataType::UInt32, false))); + DataType::List(Arc::new(Field::new("item", DataType::UInt32, false))); let array_data = ArrayData::builder(data_type) .len(3) .add_buffer(Buffer::from_slice_ref(offsets)) diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 75f6bf91442d..fa303b4a8dbc 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -583,7 +583,7 @@ mod tests { // [null, [10, 11, 12, 13]] let array_data = unsafe { ArrayData::builder(DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt8, false)), + Arc::new(Field::new("item", DataType::UInt8, false)), 4, )) .len(2) @@ -619,7 +619,7 @@ mod tests { let array_data = unsafe { ArrayData::builder(DataType::FixedSizeList( - 
Box::new(Field::new("item", DataType::Binary, false)), + Arc::new(Field::new("item", DataType::Binary, false)), 4, )) .len(3) @@ -643,7 +643,7 @@ mod tests { let array_data = unsafe { ArrayData::builder(DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt8, false)), + Arc::new(Field::new("item", DataType::UInt8, false)), 4, )) .len(3) diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 1a421fe53c25..4a592d869437 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -30,6 +30,7 @@ use std::sync::Arc; /// # Example /// /// ``` +/// # use std::sync::Arc; /// # use arrow_array::{Array, FixedSizeListArray, Int32Array}; /// # use arrow_data::ArrayData; /// # use arrow_schema::{DataType, Field}; @@ -41,7 +42,7 @@ use std::sync::Arc; /// .build() /// .unwrap(); /// let list_data_type = DataType::FixedSizeList( -/// Box::new(Field::new("item", DataType::Int32, false)), +/// Arc::new(Field::new("item", DataType::Int32, false)), /// 3, /// ); /// let list_data = ArrayData::builder(list_data_type.clone()) @@ -270,7 +271,7 @@ mod tests { // Construct a list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), + Arc::new(Field::new("item", DataType::Int32, false)), 3, ); let list_data = ArrayData::builder(list_data_type.clone()) @@ -343,7 +344,7 @@ mod tests { // Construct a list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), + Arc::new(Field::new("item", DataType::Int32, false)), 3, ); let list_data = unsafe { @@ -374,7 +375,7 @@ mod tests { // Construct a fixed size list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), + Arc::new(Field::new("item", DataType::Int32, false)), 2, ); let list_data = ArrayData::builder(list_data_type) @@ -435,7 +436,7 @@ mod tests { // Construct a fixed size list array from the above two let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), + Arc::new(Field::new("item", DataType::Int32, false)), 2, ); let list_data = ArrayData::builder(list_data_type) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index af5ce59fe4d8..8961d606e4f7 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -71,7 +71,7 @@ impl GenericListArray { /// The data type constructor of list array. /// The input is the schema of the child array and /// the output is the [`DataType`], List or LargeList. 
- pub const DATA_TYPE_CONSTRUCTOR: fn(Box) -> DataType = if OffsetSize::IS_LARGE + pub const DATA_TYPE_CONSTRUCTOR: fn(Arc) -> DataType = if OffsetSize::IS_LARGE { DataType::LargeList } else { @@ -368,7 +368,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -405,7 +405,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(0) .add_buffer(value_offsets) @@ -432,7 +432,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -522,7 +522,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); + DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -619,7 +619,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -683,7 +683,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); + DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -750,7 +750,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); + DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -778,7 +778,7 @@ mod tests { .build_unchecked() }; let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -798,7 +798,7 @@ mod tests { fn test_list_array_invalid_child_array_len() { let value_offsets = Buffer::from_slice_ref([0, 2, 5, 7]); let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -831,7 +831,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([2, 2, 5, 7]); let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ 
-874,7 +874,7 @@ mod tests { }; let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .add_buffer(buf2) @@ -958,7 +958,7 @@ mod tests { #[test] fn test_empty_offsets() { - let f = Box::new(Field::new("element", DataType::Int32, true)); + let f = Arc::new(Field::new("element", DataType::Int32, true)); let string = ListArray::from( ArrayData::builder(DataType::List(f.clone())) .buffers(vec![Buffer::from(&[])]) diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 112789fd51e8..fd4e2bd593e4 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -193,7 +193,7 @@ impl MapArray { ]); let map_data_type = DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "entries", entry_struct.data_type().clone(), true, @@ -308,7 +308,7 @@ mod tests { // Construct a map array from the above two let map_data_type = DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "entries", entry_struct.data_type().clone(), true, @@ -354,7 +354,7 @@ mod tests { // Construct a map array from the above two let map_data_type = DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "entries", entry_struct.data_type().clone(), true, @@ -483,7 +483,7 @@ mod tests { // Construct a map array from the above two let map_data_type = DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "entries", entry_struct.data_type().clone(), true, diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 9a5172d0deec..ead8b3b99d46 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -762,7 +762,7 @@ mod tests { #[test] fn test_empty_list_primitive() { let data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let array = new_empty_array(&data_type); let a = array.as_any().downcast_ref::().unwrap(); assert_eq!(a.len(), 0); @@ -822,7 +822,7 @@ mod tests { #[test] fn test_null_list_primitive() { let data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let array = new_null_array(&data_type, 9); let a = array.as_any().downcast_ref::().unwrap(); assert_eq!(a.len(), 9); @@ -835,7 +835,7 @@ mod tests { #[test] fn test_null_map() { let data_type = DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "entry", DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), @@ -903,8 +903,8 @@ mod tests { fn test_null_runs() { for r in [DataType::Int16, DataType::Int32, DataType::Int64] { let data_type = DataType::RunEndEncoded( - Box::new(Field::new("run_ends", r, false)), - Box::new(Field::new("values", DataType::Utf8, true)), + Arc::new(Field::new("run_ends", r, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), ); let array = new_null_array(&data_type, 4); diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 3cd5848f1f49..c3c5269374f1 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -98,8 +98,8 @@ impl RunArray { let run_ends_type = run_ends.data_type().clone(); let values_type = values.data_type().clone(); let ree_array_type = DataType::RunEndEncoded( - Box::new(Field::new("run_ends", run_ends_type, false)), - 
Box::new(Field::new("values", values_type, true)), + Arc::new(Field::new("run_ends", run_ends_type, false)), + Arc::new(Field::new("values", values_type, true)), ); let len = RunArray::logical_len(run_ends); let builder = ArrayDataBuilder::new(ree_array_type) diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 2ff1118bc798..f339a616f300 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -263,6 +263,7 @@ mod tests { use crate::types::UInt8Type; use arrow_buffer::Buffer; use arrow_schema::Field; + use std::sync::Arc; #[test] fn test_string_array_from_u8_slice() { @@ -548,7 +549,7 @@ mod tests { let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); let null_buffer = Buffer::from_slice_ref([0b101]); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( Field::new("item", DataType::UInt8, false), )); @@ -596,7 +597,7 @@ mod tests { // It is possible to create a null struct containing a non-nullable child // see https://github.com/apache/arrow-rs/pull/3244 for details - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( Field::new("item", DataType::UInt8, true), )); @@ -632,7 +633,7 @@ mod tests { .unwrap(); let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap()); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( Field::new("item", DataType::UInt16, false), )); diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index f8cd5d15f852..57af768447c8 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -169,7 +169,7 @@ where let null_bit_buffer = self.null_buffer_builder.finish(); let array_data = ArrayData::builder(DataType::FixedSizeList( - Box::new(Field::new("item", values_data.data_type().clone(), true)), + Arc::new(Field::new("item", values_data.data_type().clone(), true)), self.list_len, )) .len(len) @@ -200,7 +200,7 @@ where .as_slice() .map(Buffer::from_slice_ref); let array_data = ArrayData::builder(DataType::FixedSizeList( - Box::new(Field::new("item", values_data.data_type().clone(), true)), + Arc::new(Field::new("item", values_data.data_type().clone(), true)), self.list_len, )) .len(len) diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 719070356a6f..5f726a5b121c 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -233,7 +233,7 @@ where let offset_buffer = self.offsets_builder.finish(); let null_bit_buffer = self.null_buffer_builder.finish(); self.offsets_builder.append(OffsetSize::zero()); - let field = Box::new(Field::new( + let field = Arc::new(Field::new( "item", values_data.data_type().clone(), true, // TODO: find a consistent way of getting this @@ -261,7 +261,7 @@ where .null_buffer_builder .as_slice() .map(Buffer::from_slice_ref); - let field = Box::new(Field::new( + let field = Arc::new(Field::new( "item", values_data.data_type().clone(), true, // TODO: find a consistent way of getting this diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index cb6cd907c77a..72fa1bb919fb 100644 --- 
a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -195,7 +195,7 @@ impl MapBuilder { let struct_array = StructArray::from(vec![(keys_field, keys_arr), (values_field, values_arr)]); - let map_field = Box::new(Field::new( + let map_field = Arc::new(Field::new( self.field_names.entry.as_str(), struct_array.data_type().clone(), false, // always non-nullable diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 41a4d92b0219..b0c0a49886d8 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -121,7 +121,7 @@ //! let string_field = Field::new("i32", DataType::Utf8, false); //! //! let i32_list = Arc::new(self.i32_list.finish()) as ArrayRef; -//! let value_field = Box::new(Field::new("item", DataType::Int32, true)); +//! let value_field = Arc::new(Field::new("item", DataType::Int32, true)); //! let i32_list_field = Field::new("i32_list", DataType::List(value_field), true); //! //! StructArray::from(vec![ diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 7371df3b021c..499ae183f3e9 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -529,7 +529,7 @@ mod tests { )] fn test_struct_array_builder_from_schema_unsupported_type() { let list_type = - DataType::List(Box::new(Field::new("item", DataType::Int64, true))); + DataType::List(Arc::new(Field::new("item", DataType::Int64, true))); let fields = vec![ Field::new("f1", DataType::Int16, false), Field::new("f2", list_type, false), diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index a39ff88c6bcd..feb9167b2981 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -99,6 +99,7 @@ macro_rules! downcast_integer { /// `m` with the corresponding integer [`RunEndIndexType`], followed by any additional arguments /// /// ``` +/// # use std::sync::Arc; /// # use arrow_array::{downcast_primitive, ArrowPrimitiveType, downcast_run_end_index}; /// # use arrow_schema::{DataType, Field}; /// @@ -118,9 +119,9 @@ macro_rules! 
downcast_integer { /// } /// } /// -/// assert_eq!(run_end_index_size(&DataType::RunEndEncoded(Box::new(Field::new("a", DataType::Int32, false)), Box::new(Field::new("b", DataType::Utf8, true)))), 4); -/// assert_eq!(run_end_index_size(&DataType::RunEndEncoded(Box::new(Field::new("a", DataType::Int64, false)), Box::new(Field::new("b", DataType::Utf8, true)))), 8); -/// assert_eq!(run_end_index_size(&DataType::RunEndEncoded(Box::new(Field::new("a", DataType::Int16, false)), Box::new(Field::new("b", DataType::Utf8, true)))), 2); +/// assert_eq!(run_end_index_size(&DataType::RunEndEncoded(Arc::new(Field::new("a", DataType::Int32, false)), Arc::new(Field::new("b", DataType::Utf8, true)))), 4); +/// assert_eq!(run_end_index_size(&DataType::RunEndEncoded(Arc::new(Field::new("a", DataType::Int64, false)), Arc::new(Field::new("b", DataType::Utf8, true)))), 8); +/// assert_eq!(run_end_index_size(&DataType::RunEndEncoded(Arc::new(Field::new("a", DataType::Int16, false)), Arc::new(Field::new("b", DataType::Utf8, true)))), 2); /// ``` /// /// [`DataType`]: arrow_schema::DataType diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 17b1f04e80af..8d4d04f0f525 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -712,7 +712,7 @@ mod tests { Field::new("a1", DataType::Int32, false), Field::new( "a2", - DataType::List(Box::new(Field::new("item", DataType::Int8, false))), + DataType::List(Arc::new(Field::new("item", DataType::Int8, false))), false, ), ]; @@ -721,7 +721,7 @@ mod tests { let a1: ArrayRef = Arc::new(Int32Array::from(vec![1, 2])); let a2_child = Int8Array::from(vec![1, 2, 3, 4]); - let a2 = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + let a2 = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new( "array", DataType::Int8, false, diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index d14c8d2fa4ba..02b87e73114c 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -4667,7 +4667,7 @@ mod tests { let array = Int32Array::from(vec![5, 6, 7, 8, 9]); let b = cast( &array, - &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + &DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), ) .unwrap(); assert_eq!(5, b.len()); @@ -4691,7 +4691,7 @@ mod tests { let array = Int32Array::from(vec![Some(5), None, Some(7), Some(8), Some(9)]); let b = cast( &array, - &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + &DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), ) .unwrap(); assert_eq!(5, b.len()); @@ -4720,7 +4720,7 @@ mod tests { let array = array.slice(2, 4); let b = cast( &array, - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + &DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), ) .unwrap(); assert_eq!(4, b.len()); @@ -4833,7 +4833,7 @@ mod tests { // Construct a list array from the above two // [[0,0,0], [-1, -2, -1], [2, 100000000]] let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -4844,7 +4844,7 @@ mod tests { let cast_array = cast( &list_array, - &DataType::List(Box::new(Field::new("item", DataType::UInt16, true))), + &DataType::List(Arc::new(Field::new("item", DataType::UInt16, true))), ) .unwrap(); @@ -4896,7 +4896,7 @@ mod tests { // Construct a list array from the above two let 
list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -4907,7 +4907,7 @@ mod tests { cast( &list_array, - &DataType::List(Box::new(Field::new( + &DataType::List(Arc::new(Field::new( "item", DataType::Timestamp(TimeUnit::Microsecond, None), true, @@ -7104,7 +7104,7 @@ mod tests { fn test_cast_null_from_and_to_nested_type() { // Cast null from and to map let data_type = DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "entry", DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), @@ -7118,13 +7118,13 @@ mod tests { // Cast null from and to list let data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); cast_from_null_to_other(&data_type); let data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); + DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); cast_from_null_to_other(&data_type); let data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, true)), + Arc::new(Field::new("item", DataType::Int32, true)), 4, ); cast_from_null_to_other(&data_type); @@ -7229,7 +7229,7 @@ mod tests { let array = Arc::new(make_large_list_array()) as ArrayRef; let list_array = cast( &array, - &DataType::List(Box::new(Field::new("", DataType::Int32, false))), + &DataType::List(Arc::new(Field::new("", DataType::Int32, false))), ) .unwrap(); let actual = list_array.as_any().downcast_ref::().unwrap(); @@ -7243,7 +7243,7 @@ mod tests { let array = Arc::new(make_list_array()) as ArrayRef; let large_list_array = cast( &array, - &DataType::LargeList(Box::new(Field::new("", DataType::Int32, false))), + &DataType::LargeList(Arc::new(Field::new("", DataType::Int32, false))), ) .unwrap(); let actual = large_list_array @@ -7271,7 +7271,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -7295,7 +7295,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); + DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -7324,7 +7324,7 @@ mod tests { let array1 = make_list_array().slice(1, 2); let array2 = Arc::new(make_list_array()) as ArrayRef; - let dt = DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); + let dt = DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); let out1 = cast(&array1, &dt).unwrap(); let out2 = cast(&array2, &dt).unwrap(); @@ -7342,7 +7342,7 @@ mod tests { .unwrap(); let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))); + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/arrow-cast/src/pretty.rs b/arrow-cast/src/pretty.rs index 818e9d3c0770..7aa04a2dbcb3 100644 --- a/arrow-cast/src/pretty.rs +++ b/arrow-cast/src/pretty.rs @@ -291,7 +291,7 @@ mod 
tests { fn test_pretty_format_fixed_size_list() { // define a schema. let field_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, true)), + Arc::new(Field::new("item", DataType::Int32, true)), 3, ); let schema = Arc::new(Schema::new(vec![Field::new("d1", field_type, true)])); diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index ccdbaec3b5ea..52ce5ead725c 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -669,10 +669,11 @@ impl<'a> MutableArrayData<'a> { mod test { use super::*; use arrow_schema::Field; + use std::sync::Arc; #[test] fn test_list_append_with_capacities() { - let array = ArrayData::new_empty(&DataType::List(Box::new(Field::new( + let array = ArrayData::new_empty(&DataType::List(Arc::new(Field::new( "element", DataType::Int64, false, diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index 5a5dd67fc7a1..47bacc7cc74b 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -22,7 +22,7 @@ use std::sync::Arc; /// Parse a data type from a JSON representation. pub fn data_type_from_json(json: &serde_json::Value) -> Result { use serde_json::Value; - let default_field = Field::new("", DataType::Boolean, true); + let default_field = Arc::new(Field::new("", DataType::Boolean, true)); match *json { Value::Object(ref map) => match map.get("name") { Some(s) if s == "null" => Ok(DataType::Null), @@ -186,17 +186,17 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { }, Some(s) if s == "list" => { // return a list with any type as its child isn't defined in the map - Ok(DataType::List(Box::new(default_field))) + Ok(DataType::List(default_field)) } Some(s) if s == "largelist" => { // return a largelist with any type as its child isn't defined in the map - Ok(DataType::LargeList(Box::new(default_field))) + Ok(DataType::LargeList(default_field)) } Some(s) if s == "fixedsizelist" => { // return a list with any type as its child isn't defined in the map if let Some(Value::Number(size)) = map.get("listSize") { Ok(DataType::FixedSizeList( - Box::new(default_field), + default_field, size.as_i64().unwrap() as i32, )) } else { @@ -212,7 +212,7 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { Some(s) if s == "map" => { if let Some(Value::Bool(keys_sorted)) = map.get("keysSorted") { // Return a map with an empty type as its children aren't defined in the map - Ok(DataType::Map(Box::new(default_field), *keys_sorted)) + Ok(DataType::Map(default_field, *keys_sorted)) } else { Err(ArrowError::ParseError( "Expecting a keysSorted for map".to_string(), @@ -231,11 +231,10 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { ))); }; if let Some(values) = map.get("typeIds") { - let field = Arc::new(default_field); let values = values.as_array().unwrap(); let fields = values .iter() - .map(|t| (t.as_i64().unwrap() as i8, field.clone())) + .map(|t| (t.as_i64().unwrap() as i8, default_field.clone())) .collect(); Ok(DataType::Union(fields, union_mode)) diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index c714fe4671d6..a0cd4adc83f0 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -126,13 +126,13 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { } match data_type { DataType::List(_) => { - DataType::List(Box::new(field_from_json(&values[0])?)) + 
DataType::List(Arc::new(field_from_json(&values[0])?)) } - DataType::LargeList(_) => DataType::LargeList(Box::new( + DataType::LargeList(_) => DataType::LargeList(Arc::new( field_from_json(&values[0])?, )), DataType::FixedSizeList(_, int) => DataType::FixedSizeList( - Box::new(field_from_json(&values[0])?), + Arc::new(field_from_json(&values[0])?), int, ), _ => unreachable!( @@ -173,7 +173,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { // child must be a struct match child.data_type() { DataType::Struct(map_fields) if map_fields.len() == 2 => { - DataType::Map(Box::new(child), keys_sorted) + DataType::Map(Arc::new(child), keys_sorted) } t => { return Err(ArrowError::ParseError( @@ -354,7 +354,7 @@ mod tests { let f = Field::new( "my_map", DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "my_entries", DataType::Struct(Fields::from(vec![ Field::new("my_keys", DataType::Utf8, false), @@ -518,7 +518,7 @@ mod tests { let expected = Field::new( "my_map", DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "my_entries", DataType::Struct(Fields::from(vec![ Field::new("my_keys", DataType::Utf8, false), diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index 61bcbea5a707..8ee7bc60085e 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -1098,7 +1098,7 @@ mod tests { Field::new("c3", DataType::Utf8, true), Field::new( "c4", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "custom_item", DataType::Int32, false, @@ -1185,7 +1185,7 @@ mod tests { Field::new("utf8s", DataType::Utf8, true), Field::new( "lists", - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), true, ), Field::new( @@ -1260,7 +1260,7 @@ mod tests { let value_data = Int32Array::from(vec![None, Some(2), None, None]); let value_offsets = Buffer::from_slice_ref([0, 3, 4, 4]); let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/arrow-integration-test/src/schema.rs b/arrow-integration-test/src/schema.rs index d640e298c6ad..6e143c2838d9 100644 --- a/arrow-integration-test/src/schema.rs +++ b/arrow-integration-test/src/schema.rs @@ -105,6 +105,7 @@ mod tests { use super::*; use arrow::datatypes::{DataType, Field, Fields, IntervalUnit, TimeUnit}; use serde_json::Value; + use std::sync::Arc; #[test] fn schema_json() { @@ -155,22 +156,22 @@ mod tests { Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false), Field::new( "c22", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), false, ), Field::new( "c23", DataType::FixedSizeList( - Box::new(Field::new("bools", DataType::Boolean, false)), + Arc::new(Field::new("bools", DataType::Boolean, false)), 5, ), false, ), Field::new( "c24", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "inner_list", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "struct", DataType::Struct(Fields::empty()), true, @@ -208,9 +209,9 @@ mod tests { Field::new("c35", DataType::LargeUtf8, true), Field::new( "c36", - DataType::LargeList(Box::new(Field::new( + DataType::LargeList(Arc::new(Field::new( "inner_large_list", - 
DataType::LargeList(Box::new(Field::new( + DataType::LargeList(Arc::new(Field::new( "struct", DataType::Struct(Fields::empty()), false, @@ -222,7 +223,7 @@ mod tests { Field::new( "c37", DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "my_entries", DataType::Struct(Fields::from(vec![ Field::new("my_keys", DataType::Utf8, false), diff --git a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs index 1d65be41c41c..90a2d171d347 100644 --- a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs +++ b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -140,7 +140,7 @@ fn canonicalize_schema(schema: &Schema) -> Schema { Arc::new(Field::new( field.name().as_str(), - DataType::Map(Box::new(child_field), *sorted), + DataType::Map(Arc::new(child_field), *sorted), field.is_nullable(), )) } diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 8ca0d514f462..334b9f65627b 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -22,6 +22,7 @@ use flatbuffers::{ FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset, }; use std::collections::HashMap; +use std::sync::Arc; use crate::{size_prefixed_root_as_message, CONTINUATION_MARKER}; use DataType::*; @@ -337,14 +338,14 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat if children.len() != 1 { panic!("expect a list to have one child") } - DataType::List(Box::new(children.get(0).into())) + DataType::List(Arc::new(children.get(0).into())) } crate::Type::LargeList => { let children = field.children().unwrap(); if children.len() != 1 { panic!("expect a large list to have one child") } - DataType::LargeList(Box::new(children.get(0).into())) + DataType::LargeList(Arc::new(children.get(0).into())) } crate::Type::FixedSizeList => { let children = field.children().unwrap(); @@ -352,7 +353,7 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat panic!("expect a list to have one child") } let fsl = field.type_as_fixed_size_list().unwrap(); - DataType::FixedSizeList(Box::new(children.get(0).into()), fsl.listSize()) + DataType::FixedSizeList(Arc::new(children.get(0).into()), fsl.listSize()) } crate::Type::Struct_ => { let fields = match field.children() { @@ -371,7 +372,7 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat } let run_ends_field = children.get(0).into(); let values_field = children.get(1).into(); - DataType::RunEndEncoded(Box::new(run_ends_field), Box::new(values_field)) + DataType::RunEndEncoded(Arc::new(run_ends_field), Arc::new(values_field)) } crate::Type::Map => { let map = field.type_as_map().unwrap(); @@ -379,7 +380,7 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat if children.len() != 1 { panic!("expect a map to have one child") } - DataType::Map(Box::new(children.get(0).into()), map.keysSorted()) + DataType::Map(Arc::new(children.get(0).into()), map.keysSorted()) } crate::Type::Decimal => { let fsb = field.type_as_decimal().unwrap(); @@ -907,12 +908,12 @@ mod tests { Field::new("binary", DataType::Binary, false), Field::new( "list[u8]", - DataType::List(Box::new(Field::new("item", DataType::UInt8, false))), + DataType::List(Arc::new(Field::new("item", DataType::UInt8, false))), true, ), Field::new( "list[struct]", - List(Box::new(Field::new( + List(Arc::new(Field::new( "struct", Struct(Fields::from(vec![ Field::new("float32", DataType::UInt8, 
false), @@ -938,13 +939,13 @@ mod tests { Field::new("int64", DataType::Int64, true), Field::new( "list[struct]>]", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "struct", DataType::Struct(Fields::from(vec![ Field::new("date32", DataType::Date32, true), Field::new( "list[struct<>]", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "struct", DataType::Struct(Fields::empty()), false, @@ -968,7 +969,7 @@ mod tests { Field::new("int64", DataType::Int64, true), Field::new( "list[union]>]", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "union]>", DataType::Union( UnionFields::new( @@ -981,7 +982,7 @@ mod tests { ), Field::new( "list[union<>]", - DataType::List(Box::new( + DataType::List(Arc::new( Field::new( "union", DataType::Union( diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 4f2e51336e34..c20f7bd012fb 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -1254,10 +1254,10 @@ mod tests { fn create_test_projection_schema() -> Schema { // define field types let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let fixed_size_list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), + Arc::new(Field::new("item", DataType::Int32, false)), 3, ); @@ -1280,15 +1280,15 @@ mod tests { Field::new("id", DataType::Int32, false), Field::new( "list", - DataType::List(Box::new(Field::new("item", DataType::Int8, true))), + DataType::List(Arc::new(Field::new("item", DataType::Int8, true))), false, ), ]); let struct_data_type = DataType::Struct(struct_fields); let run_encoded_data_type = DataType::RunEndEncoded( - Box::new(Field::new("run_ends", DataType::Int16, false)), - Box::new(Field::new("values", DataType::Int32, true)), + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Int32, true)), ); // define schema @@ -1691,7 +1691,7 @@ mod tests { (values_field, make_array(value_dict_array.into_data())), ]); let map_data_type = DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "entries", entry_struct.data_type().clone(), true, @@ -1763,7 +1763,7 @@ mod tests { #[test] fn test_roundtrip_stream_dict_of_list_of_dict() { // list - let list_data_type = DataType::List(Box::new(Field::new_dict( + let list_data_type = DataType::List(Arc::new(Field::new_dict( "item", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), true, @@ -1777,7 +1777,7 @@ mod tests { ); // large list - let list_data_type = DataType::LargeList(Box::new(Field::new_dict( + let list_data_type = DataType::LargeList(Arc::new(Field::new_dict( "item", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), true, @@ -1799,7 +1799,7 @@ mod tests { let dict_data = dict_array.data(); let list_data_type = DataType::FixedSizeList( - Box::new(Field::new_dict( + Arc::new(Field::new_dict( "item", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), true, diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index a567b93c9d0f..c784bd347b4b 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -505,7 +505,7 @@ mod tests { let schema = Arc::new(Schema::new(vec![ Field::new( "list", - DataType::List(Box::new(Field::new("element", DataType::Int32, false))), + DataType::List(Arc::new(Field::new("element", DataType::Int32, false))), true, ), 
Field::new( @@ -520,7 +520,7 @@ mod tests { "nested_list", DataType::Struct(Fields::from(vec![Field::new( "list2", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "element", DataType::Struct( vec![Field::new("c", DataType::Int32, false)].into(), @@ -591,7 +591,7 @@ mod tests { "nested_list", DataType::Struct(Fields::from(vec![Field::new( "list2", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "element", DataType::Struct( vec![Field::new("d", DataType::Int32, true)].into(), @@ -639,13 +639,13 @@ mod tests { {"map": {"a": [null], "b": []}} {"map": {"c": null, "a": ["baz"]}} "#; - let list = DataType::List(Box::new(Field::new("element", DataType::Utf8, true))); + let list = DataType::List(Arc::new(Field::new("element", DataType::Utf8, true))); let entries = DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", list, true), ])); - let map = DataType::Map(Box::new(Field::new("entries", entries, true)), false); + let map = DataType::Map(Arc::new(Field::new("entries", entries, true)), false); let schema = Arc::new(Schema::new(vec![Field::new("map", map, true)])); let batches = do_read(buf, 1024, false, schema); @@ -1023,7 +1023,7 @@ mod tests { DataType::Struct(Fields::from(vec![Field::new( "partitionValues", DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "key_value", DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index c95f7c0be812..df6b998bee04 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -129,14 +129,14 @@ fn coerce_data_type(dt: Vec<&DataType>) -> DataType { (DataType::Float64, DataType::Float64) | (DataType::Float64, DataType::Int64) | (DataType::Int64, DataType::Float64) => DataType::Float64, - (DataType::List(l), DataType::List(r)) => DataType::List(Box::new(Field::new( + (DataType::List(l), DataType::List(r)) => DataType::List(Arc::new(Field::new( "item", coerce_data_type(vec![l.data_type(), r.data_type()]), true, ))), // coerce scalar and scalar array into scalar array (DataType::List(e), not_list) | (not_list, DataType::List(e)) => { - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "item", coerce_data_type(vec![e.data_type(), ¬_list]), true, @@ -150,7 +150,7 @@ fn generate_datatype(t: &InferredType) -> Result { Ok(match t { InferredType::Scalar(hs) => coerce_data_type(hs.iter().collect()), InferredType::Object(spec) => DataType::Struct(generate_fields(spec)?), - InferredType::Array(ele_type) => DataType::List(Box::new(Field::new( + InferredType::Array(ele_type) => DataType::List(Arc::new(Field::new( "item", generate_datatype(ele_type)?, true, @@ -1087,7 +1087,7 @@ impl Decoder { fn build_nested_list_array( &self, rows: &[Value], - list_field: &Field, + list_field: &FieldRef, ) -> Result { // build list offsets let mut cur_offset = OffsetSize::zero(); @@ -1231,7 +1231,7 @@ impl Decoder { } }; // build list - let list_data = ArrayData::builder(DataType::List(Box::new(list_field.clone()))) + let list_data = ArrayData::builder(DataType::List(list_field.clone())) .len(list_len) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_child_data(array_data) @@ -2112,12 +2112,12 @@ mod tests { assert_eq!(&DataType::Int64, a.1.data_type()); let b = schema.column_with_name("b").unwrap(); assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + &DataType::List(Arc::new(Field::new("item", DataType::Float64, 
true))), b.1.data_type() ); let c = schema.column_with_name("c").unwrap(); assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + &DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), c.1.data_type() ); let d = schema.column_with_name("d").unwrap(); @@ -2176,32 +2176,32 @@ mod tests { use arrow_schema::DataType::*; assert_eq!( - List(Box::new(Field::new("item", Float64, true))), + List(Arc::new(Field::new("item", Float64, true))), coerce_data_type(vec![ &Float64, - &List(Box::new(Field::new("item", Float64, true))) + &List(Arc::new(Field::new("item", Float64, true))) ]) ); assert_eq!( - List(Box::new(Field::new("item", Float64, true))), + List(Arc::new(Field::new("item", Float64, true))), coerce_data_type(vec![ &Float64, - &List(Box::new(Field::new("item", Int64, true))) + &List(Arc::new(Field::new("item", Int64, true))) ]) ); assert_eq!( - List(Box::new(Field::new("item", Int64, true))), + List(Arc::new(Field::new("item", Int64, true))), coerce_data_type(vec![ &Int64, - &List(Box::new(Field::new("item", Int64, true))) + &List(Arc::new(Field::new("item", Int64, true))) ]) ); // boolean and number are incompatible, return utf8 assert_eq!( - List(Box::new(Field::new("item", Utf8, true))), + List(Arc::new(Field::new("item", Utf8, true))), coerce_data_type(vec![ &Boolean, - &List(Box::new(Field::new("item", Float64, true))) + &List(Arc::new(Field::new("item", Float64, true))) ]) ); } @@ -2234,17 +2234,17 @@ mod tests { assert_eq!(&DataType::Int64, a.1.data_type()); let b = schema.column_with_name("b").unwrap(); assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + &DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), b.1.data_type() ); let c = schema.column_with_name("c").unwrap(); assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + &DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), c.1.data_type() ); let d = schema.column_with_name("d").unwrap(); assert_eq!( - &DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + &DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), d.1.data_type() ); @@ -2366,7 +2366,7 @@ mod tests { true, ); let a_field = - Field::new("a", DataType::List(Box::new(a_struct_field.clone())), true); + Field::new("a", DataType::List(Arc::new(a_struct_field.clone())), true); let schema = Arc::new(Schema::new(vec![a_field.clone()])); let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); let json_content = r#" @@ -2459,7 +2459,7 @@ mod tests { fn test_map_json_arrays() { let account_field = Field::new("account", DataType::UInt16, false); let value_list_type = - DataType::List(Box::new(Field::new("item", DataType::Utf8, false))); + DataType::List(Arc::new(Field::new("item", DataType::Utf8, false))); let entries_struct_type = DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", value_list_type.clone(), true), @@ -2467,7 +2467,7 @@ mod tests { let stocks_field = Field::new( "stocks", DataType::Map( - Box::new(Field::new("entries", entries_struct_type.clone(), false)), + Arc::new(Field::new("entries", entries_struct_type.clone(), false)), false, ), true, @@ -2712,7 +2712,7 @@ mod tests { fn test_list_of_string_dictionary_from_json() { let schema = Schema::new(vec![Field::new( "events", - List(Box::new(Field::new( + List(Arc::new(Field::new( "item", Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true, @@ -2736,7 
+2736,7 @@ mod tests { let events = schema.column_with_name("events").unwrap(); assert_eq!( - &List(Box::new(Field::new( + &List(Arc::new(Field::new( "item", Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true @@ -2766,7 +2766,7 @@ mod tests { fn test_list_of_string_dictionary_from_json_with_nulls() { let schema = Schema::new(vec![Field::new( "events", - List(Box::new(Field::new( + List(Arc::new(Field::new( "item", Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true, @@ -2792,7 +2792,7 @@ mod tests { let events = schema.column_with_name("events").unwrap(); assert_eq!( - &List(Box::new(Field::new( + &List(Arc::new(Field::new( "item", Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), true @@ -2930,17 +2930,17 @@ mod tests { Field::new("a", DataType::Int64, true), Field::new( "b", - DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), true, ), Field::new( "c", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), true, ), Field::new( "d", - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), true, ), ]); @@ -2997,7 +2997,7 @@ mod tests { let schema = Schema::new(vec![ Field::new( "c1", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "item", DataType::Struct(Fields::from(vec![ Field::new("a", DataType::Utf8, true), @@ -3012,7 +3012,7 @@ mod tests { Field::new( "c3", // empty json array's inner types are inferred as null - DataType::List(Box::new(Field::new("item", DataType::Null, true))), + DataType::List(Arc::new(Field::new("item", DataType::Null, true))), true, ), ]); @@ -3039,9 +3039,9 @@ mod tests { let schema = Schema::new(vec![ Field::new( "c1", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "item", - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), true, ))), true, @@ -3263,9 +3263,9 @@ mod tests { fn test_json_read_nested_list() { let schema = Schema::new(vec![Field::new( "c1", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "item", - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), true, ))), true, @@ -3298,7 +3298,7 @@ mod tests { fn test_json_read_list_of_structs() { let schema = Schema::new(vec![Field::new( "c1", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "item", DataType::Struct(vec![Field::new("a", DataType::Int64, true)].into()), true, diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 534aea91af4e..d3ac46c937b8 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -1057,7 +1057,7 @@ mod tests { fn write_struct_with_list_field() { let field_c1 = Field::new( "c1", - DataType::List(Box::new(Field::new("c_list", DataType::Utf8, false))), + DataType::List(Arc::new(Field::new("c_list", DataType::Utf8, false))), false, ); let field_c2 = Field::new("c2", DataType::Int32, false); @@ -1102,12 +1102,12 @@ mod tests { fn write_nested_list() { let list_inner_type = Field::new( "a", - DataType::List(Box::new(Field::new("b", DataType::Int32, false))), + DataType::List(Arc::new(Field::new("b", DataType::Int32, false))), false, ); let field_c1 = 
Field::new( "c1", - DataType::List(Box::new(list_inner_type.clone())), + DataType::List(Arc::new(list_inner_type.clone())), false, ); let field_c2 = Field::new("c2", DataType::Utf8, true); @@ -1160,7 +1160,7 @@ mod tests { fn write_list_of_struct() { let field_c1 = Field::new( "c1", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "s", DataType::Struct(Fields::from(vec![ Field::new("c11", DataType::Int32, true), @@ -1325,7 +1325,7 @@ mod tests { "#; let ints_struct = DataType::Struct(vec![Field::new("ints", DataType::Int32, true)].into()); - let list_type = DataType::List(Box::new(Field::new("item", ints_struct, true))); + let list_type = DataType::List(Arc::new(Field::new("item", ints_struct, true))); let list_field = Field::new("list", list_type, true); let schema = Arc::new(Schema::new(vec![list_field])); let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); @@ -1379,7 +1379,7 @@ mod tests { ]); let map_data_type = DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "entries", entry_struct.data_type().clone(), true, diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 683fd068af40..e68e064c775d 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -2785,6 +2785,7 @@ mod tests { }; use arrow_buffer::i256; use arrow_schema::Field; + use std::sync::Arc; /// Evaluate `KERNEL` with two vectors as inputs and assert against the expected output. /// `A_VEC` and `B_VEC` can be of type `Vec` or `Vec>` where `T` is the native @@ -3408,7 +3409,7 @@ mod tests { .into_data(); let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 6, 9]); let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); + DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 57a5c68386fc..3ec5597b2854 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -18,8 +18,7 @@ use std::fmt; use std::sync::Arc; -use crate::field::Field; -use crate::{Fields, UnionFields}; +use crate::{FieldRef, Fields, UnionFields}; /// The set of datatypes that are supported by this implementation of Apache Arrow. /// @@ -183,13 +182,13 @@ pub enum DataType { /// A list of some logical data type with variable length. /// /// A single List array can store up to [`i32::MAX`] elements in total - List(Box), + List(FieldRef), /// A list of some logical data type with fixed length. - FixedSizeList(Box, i32), + FixedSizeList(FieldRef, i32), /// A list of some logical data type with variable length and 64-bit offsets. /// /// A single LargeList array can store up to [`i64::MAX`] elements in total - LargeList(Box), + LargeList(FieldRef), /// A nested datatype that contains a number of sub-fields. Struct(Fields), /// A nested datatype that can represent slots of differing types. Components: @@ -249,7 +248,7 @@ pub enum DataType { /// has two children: key type and the second the value type. The names of the /// child fields may be respectively "entries", "key", and "value", but this is /// not enforced. - Map(Box, bool), + Map(FieldRef, bool), /// A run-end encoding (REE) is a variation of run-length encoding (RLE). These /// encodings are well-suited for representing data containing sequences of the /// same value, called runs. 
Each run is represented as a value and an integer giving @@ -261,7 +260,7 @@ pub enum DataType { /// /// These child arrays are prescribed the standard names of "run_ends" and "values" /// respectively. - RunEndEncoded(Box, Box), + RunEndEncoded(FieldRef, FieldRef), } /// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds. @@ -520,6 +519,7 @@ pub const DECIMAL_DEFAULT_SCALE: i8 = 10; #[cfg(test)] mod tests { use super::*; + use crate::Field; #[test] #[cfg(feature = "serde")] @@ -574,21 +574,21 @@ mod tests { #[test] fn test_list_datatype_equality() { // tests that list type equality is checked while ignoring list names - let list_a = DataType::List(Box::new(Field::new("item", DataType::Int32, true))); - let list_b = DataType::List(Box::new(Field::new("array", DataType::Int32, true))); - let list_c = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_d = DataType::List(Box::new(Field::new("item", DataType::UInt32, true))); + let list_a = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_b = DataType::List(Arc::new(Field::new("array", DataType::Int32, true))); + let list_c = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_d = DataType::List(Arc::new(Field::new("item", DataType::UInt32, true))); assert!(list_a.equals_datatype(&list_b)); assert!(!list_a.equals_datatype(&list_c)); assert!(!list_b.equals_datatype(&list_c)); assert!(!list_a.equals_datatype(&list_d)); let list_e = - DataType::FixedSizeList(Box::new(Field::new("item", list_a, false)), 3); + DataType::FixedSizeList(Arc::new(Field::new("item", list_a, false)), 3); let list_f = - DataType::FixedSizeList(Box::new(Field::new("array", list_b, false)), 3); + DataType::FixedSizeList(Arc::new(Field::new("array", list_b, false)), 3); let list_g = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::FixedSizeBinary(3), true)), + Arc::new(Field::new("item", DataType::FixedSizeBinary(3), true)), 3, ); assert!(list_e.equals_datatype(&list_f)); @@ -639,7 +639,7 @@ mod tests { #[test] fn test_nested() { - let list = DataType::List(Box::new(Field::new("foo", DataType::Utf8, true))); + let list = DataType::List(Arc::new(Field::new("foo", DataType::Utf8, true))); assert!(!DataType::is_nested(&DataType::Boolean)); assert!(!DataType::is_nested(&DataType::Int32)); diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 72afc5b0bbcb..9078e35b32e3 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -388,11 +388,11 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { "tDn" => DataType::Duration(TimeUnit::Nanosecond), "+l" => { let c_child = c_schema.child(0); - DataType::List(Box::new(Field::try_from(c_child)?)) + DataType::List(Arc::new(Field::try_from(c_child)?)) } "+L" => { let c_child = c_schema.child(0); - DataType::LargeList(Box::new(Field::try_from(c_child)?)) + DataType::LargeList(Arc::new(Field::try_from(c_child)?)) } "+s" => { let fields = c_schema.children().map(Field::try_from); @@ -401,7 +401,7 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { "+m" => { let c_child = c_schema.child(0); let map_keys_sorted = c_schema.map_keys_sorted(); - DataType::Map(Box::new(Field::try_from(c_child)?), map_keys_sorted) + DataType::Map(Arc::new(Field::try_from(c_child)?), map_keys_sorted) } // Parametrized types, requiring string parse other => { @@ -421,7 +421,7 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { ArrowError::CDataInterface( "The FixedSizeList type requires an integer parameter 
representing number of elements per list".to_string()) })?; - DataType::FixedSizeList(Box::new(Field::try_from(c_child)?), parsed_num_elems) + DataType::FixedSizeList(Arc::new(Field::try_from(c_child)?), parsed_num_elems) }, // Decimal types in format "d:precision,scale" or "d:precision,scale,bitWidth" ["d", extra] => { @@ -772,11 +772,11 @@ mod tests { round_trip_type(DataType::Time64(TimeUnit::Nanosecond)); round_trip_type(DataType::FixedSizeBinary(12)); round_trip_type(DataType::FixedSizeList( - Box::new(Field::new("a", DataType::Int64, false)), + Arc::new(Field::new("a", DataType::Int64, false)), 5, )); round_trip_type(DataType::Utf8); - round_trip_type(DataType::List(Box::new(Field::new( + round_trip_type(DataType::List(Arc::new(Field::new( "a", DataType::Int16, false, @@ -828,7 +828,7 @@ mod tests { // Construct a map array from the above two let map_data_type = - DataType::Map(Box::new(Field::new("entries", entry_struct, true)), true); + DataType::Map(Arc::new(Field::new("entries", entry_struct, true)), true); let arrow_schema = FFI_ArrowSchema::try_from(map_data_type).unwrap(); assert!(arrow_schema.map_keys_sorted()); diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index d68392f51f03..ac02eadd6640 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -454,6 +454,7 @@ mod test { use crate::Fields; use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; + use std::sync::Arc; #[test] fn test_new_with_string() { @@ -502,13 +503,13 @@ mod test { dict1.clone(), Field::new( "list[struct]>]", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "struct]>", DataType::Struct(Fields::from(vec![ dict1.clone(), Field::new( "list[struct]", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "struct", DataType::Struct(vec![dict2.clone()].into()), false, diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index f71a3cbc2ab0..1cab72b6d9f2 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -913,7 +913,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 8, 8]); let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); + DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) @@ -937,7 +937,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0i64, 3, 3]); let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); + DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, false))); let expected = ArrayData::builder(list_data_type) .len(2) .add_buffer(value_offsets) @@ -1291,7 +1291,7 @@ mod tests { .build() .unwrap(); let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), + Arc::new(Field::new("item", DataType::Int32, false)), 3, ); let list_data = ArrayData::builder(list_data_type) @@ -1350,7 +1350,7 @@ mod tests { bit_util::set_bit(&mut null_bits, 4); let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), + Arc::new(Field::new("item", DataType::Int32, false)), 2, ); let list_data = ArrayData::builder(list_data_type) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index cf28c9682ae5..83fe1bb56f35 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -1574,7 +1574,7 @@ mod tests { let value_offsets: 
[$offset_type; 5] = [0, 3, 6, 6, 8]; let value_offsets = Buffer::from_slice_ref(&value_offsets); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(Field::new( + let list_data_type = DataType::$list_data_type(Arc::new(Field::new( "item", DataType::Int32, false, @@ -1646,7 +1646,7 @@ mod tests { let value_offsets: [$offset_type; 5] = [0, 3, 6, 7, 9]; let value_offsets = Buffer::from_slice_ref(&value_offsets); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(Field::new( + let list_data_type = DataType::$list_data_type(Arc::new(Field::new( "item", DataType::Int32, true, @@ -1719,7 +1719,7 @@ mod tests { let value_offsets: [$offset_type; 5] = [0, 3, 6, 6, 8]; let value_offsets = Buffer::from_slice_ref(&value_offsets); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(Field::new( + let list_data_type = DataType::$list_data_type(Arc::new(Field::new( "item", DataType::Int32, true, @@ -1895,7 +1895,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/arrow/benches/json_reader.rs b/arrow/benches/json_reader.rs index 5651813a6403..8ad6cfd3ab48 100644 --- a/arrow/benches/json_reader.rs +++ b/arrow/benches/json_reader.rs @@ -117,22 +117,22 @@ fn small_bench_list(c: &mut Criterion) { let schema = Arc::new(Schema::new(vec![ Field::new( "c1", - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), true, ), Field::new( "c2", - DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), true, ), Field::new( "c3", - DataType::List(Box::new(Field::new("item", DataType::UInt32, true))), + DataType::List(Arc::new(Field::new("item", DataType::UInt32, true))), true, ), Field::new( "c4", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), true, ), ])); diff --git a/arrow/examples/builders.rs b/arrow/examples/builders.rs index 312de11b303d..d0e6b31085e5 100644 --- a/arrow/examples/builders.rs +++ b/arrow/examples/builders.rs @@ -100,7 +100,7 @@ fn main() { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index 5f556dfff587..0249a70d168f 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -207,7 +207,7 @@ mod tests { .add_buffer(Buffer::from_slice_ref(v)) .build()?; let list_data_type = - DataType::FixedSizeList(Box::new(Field::new("f", DataType::Int64, false)), 3); + DataType::FixedSizeList(Arc::new(Field::new("f", DataType::Int64, false)), 3); let list_data = ArrayData::builder(list_data_type) .len(3) .add_child_data(value_data) @@ -232,7 +232,7 @@ mod tests { .add_buffer(Buffer::from_slice_ref(v)) .build()?; let list_data_type = - 
DataType::FixedSizeList(Box::new(Field::new("f", DataType::Int16, false)), 2); + DataType::FixedSizeList(Arc::new(Field::new("f", DataType::Int16, false)), 2); let list_data = ArrayData::builder(list_data_type) .len(8) .null_bit_buffer(Some(Buffer::from(validity_bits))) @@ -255,7 +255,7 @@ mod tests { let offsets: Vec = vec![0, 2, 4, 6, 8, 10, 12, 14, 16]; let value_offsets = Buffer::from_slice_ref(offsets); let inner_list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let inner_list_data = ArrayData::builder(inner_list_data_type.clone()) .len(8) .add_buffer(value_offsets) @@ -267,7 +267,7 @@ mod tests { bit_util::set_bit(&mut validity_bits, 2); let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("f", inner_list_data_type, false)), + Arc::new(Field::new("f", inner_list_data_type, false)), 2, ); let list_data = ArrayData::builder(list_data_type) diff --git a/arrow/src/compute/kernels/limit.rs b/arrow/src/compute/kernels/limit.rs index 357e9b13ae82..74cbd2096bfd 100644 --- a/arrow/src/compute/kernels/limit.rs +++ b/arrow/src/compute/kernels/limit.rs @@ -110,7 +110,7 @@ mod tests { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index fe2e186a72f9..7b26cf7f25a5 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -662,7 +662,7 @@ mod tests { .collect::(); // Construct a list array from the above two - let list_data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + let list_data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( Field::new("item", DataType::Int32, false), )); @@ -921,7 +921,7 @@ mod tests { .build()?; let list_data_type = - DataType::FixedSizeList(Box::new(Field::new("f", DataType::Int32, false)), 3); + DataType::FixedSizeList(Arc::new(Field::new("f", DataType::Int32, false)), 3); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .null_bit_buffer(Some(Buffer::from(validity_bits))) diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 1983ea72d2fb..29e7420f10be 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -273,7 +273,7 @@ mod tests { Field::new("a", DataType::Int32, false), Field::new( "b", - DataType::List(Box::new(Field::new("item", DataType::LargeUtf8, true))), + DataType::List(Arc::new(Field::new("item", DataType::LargeUtf8, true))), false, ), Field::new("a", DataType::Int32, false), @@ -303,9 +303,9 @@ mod tests { Field::new("b", DataType::Boolean, true), Field::new( "c", - DataType::LargeList(Box::new(Field::new( + DataType::LargeList(Arc::new(Field::new( "item", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "item", DataType::FixedSizeBinary(6), true, diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 27fb1dcd232b..2807bbd79b83 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -241,7 +241,7 @@ fn make_fixed_size_list_array() -> FixedSizeListArray { // Construct a fixed size list array from the above two let list_data_type = - DataType::FixedSizeList(Box::new(Field::new("item", DataType::Int32, true)), 2); + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 2); let 
list_data = ArrayData::builder(list_data_type) .len(5) .add_child_data(value_data) @@ -275,7 +275,7 @@ fn make_list_array() -> ListArray { // Construct a list array from the above two let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -299,7 +299,7 @@ fn make_large_list_array() -> LargeListArray { // Construct a list array from the above two let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); + DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -394,12 +394,12 @@ fn get_all_types() -> Vec { LargeBinary, Utf8, LargeUtf8, - List(Box::new(Field::new("item", DataType::Int8, true))), - List(Box::new(Field::new("item", DataType::Utf8, true))), - FixedSizeList(Box::new(Field::new("item", DataType::Int8, true)), 10), - FixedSizeList(Box::new(Field::new("item", DataType::Utf8, false)), 10), - LargeList(Box::new(Field::new("item", DataType::Int8, true))), - LargeList(Box::new(Field::new("item", DataType::Utf8, false))), + List(Arc::new(Field::new("item", DataType::Int8, true))), + List(Arc::new(Field::new("item", DataType::Utf8, true))), + FixedSizeList(Arc::new(Field::new("item", DataType::Int8, true)), 10), + FixedSizeList(Arc::new(Field::new("item", DataType::Utf8, false)), 10), + LargeList(Arc::new(Field::new("item", DataType::Int8, true))), + LargeList(Arc::new(Field::new("item", DataType::Utf8, false))), Struct(Fields::from(vec![ Field::new("f1", DataType::Int32, true), Field::new("f2", DataType::Utf8, true), diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index dbbeb934d37c..37968ec6a055 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -365,7 +365,7 @@ fn test_empty_offsets_list_equal() { let values = Int32Array::from(empty); let empty_offsets: [u8; 0] = []; - let a: ListArray = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + let a: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new( "item", DataType::Int32, true, @@ -378,7 +378,7 @@ fn test_empty_offsets_list_equal() { .unwrap() .into(); - let b: ListArray = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + let b: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new( "item", DataType::Int32, true, @@ -393,7 +393,7 @@ fn test_empty_offsets_list_equal() { test_equal(&a, &b, true); - let c: ListArray = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + let c: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new( "item", DataType::Int32, true, @@ -435,7 +435,7 @@ fn test_list_null() { // a list where the nullness of values is determined by the list's bitmap let c_values = Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]); - let c: ListArray = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + let c: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new( "item", DataType::Int32, true, @@ -458,7 +458,7 @@ fn test_list_null() { None, None, ]); - let d: ListArray = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( + let d: ListArray = ArrayDataBuilder::new(DataType::List(Arc::new(Field::new( "item", DataType::Int32, true, diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 57816306ba4e..97869544ddd0 100644 
--- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -560,7 +560,7 @@ fn test_list_append() { ]); let list_value_offsets = Buffer::from_slice_ref([0i32, 3, 5, 11, 13, 13, 15, 15, 17]); let expected_list_data = ArrayData::try_new( - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), 8, None, 0, @@ -639,7 +639,7 @@ fn test_list_nulls_append() { let list_value_offsets = Buffer::from_slice_ref([0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); let expected_list_data = ArrayData::try_new( - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), 12, Some(Buffer::from(&[0b11011011, 0b1110])), 0, @@ -776,7 +776,7 @@ fn test_map_nulls_append() { let expected_list_data = ArrayData::try_new( DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "entries", DataType::Struct(Fields::from(vec![ Field::new("keys", DataType::Int64, false), @@ -854,7 +854,7 @@ fn test_list_of_strings_append() { ]); let list_value_offsets = Buffer::from_slice_ref([0, 3, 5, 6, 9, 10, 13]); let expected_list_data = ArrayData::try_new( - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), 6, None, 0, @@ -986,7 +986,7 @@ fn test_fixed_size_list_append() -> Result<()> { ]); let expected_list_data = ArrayData::new( DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt16, true)), + Arc::new(Field::new("item", DataType::UInt16, true)), 2, ), 12, diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index ef0d40d64e2d..082d020ca462 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -367,7 +367,7 @@ fn test_validate_fixed_size_list() { // 10 is off the end of the buffer let field = Field::new("field", DataType::Int32, true); ArrayData::try_new( - DataType::FixedSizeList(Box::new(field), 2), + DataType::FixedSizeList(Arc::new(field), 2), 3, None, 0, @@ -715,7 +715,7 @@ fn check_list_offsets(data_type: DataType) { )] fn test_validate_list_offsets() { let field_type = Field::new("f", DataType::Int32, true); - check_list_offsets::(DataType::List(Box::new(field_type))); + check_list_offsets::(DataType::List(Arc::new(field_type))); } #[test] @@ -724,7 +724,7 @@ fn test_validate_list_offsets() { )] fn test_validate_large_list_offsets() { let field_type = Field::new("f", DataType::Int32, true); - check_list_offsets::(DataType::LargeList(Box::new(field_type))); + check_list_offsets::(DataType::LargeList(Arc::new(field_type))); } /// Test that the list of type `data_type` generates correct errors for negative offsets @@ -735,7 +735,7 @@ fn test_validate_large_list_offsets() { fn test_validate_list_negative_offsets() { let values: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); let field_type = Field::new("f", values.data_type().clone(), true); - let data_type = DataType::List(Box::new(field_type)); + let data_type = DataType::List(Arc::new(field_type)); // -1 is an invalid offset any way you look at it let offsets: Vec = vec![0, 2, -1, 4]; @@ -1027,7 +1027,7 @@ fn test_sliced_array_child() { let offsets = Buffer::from_iter([1_i32, 3_i32]); let list_field = Field::new("element", DataType::Int32, false); - let data_type = DataType::List(Box::new(list_field)); + let data_type = DataType::List(Arc::new(list_field)); let data = unsafe { ArrayData::new_unchecked( 
diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index 818fe0b3e49b..a494d9a97791 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -171,17 +171,17 @@ fn create_list_primitive_bench_batch( let fields = vec![ Field::new( "_1", - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), true, ), Field::new( "_2", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), true, ), Field::new( "_3", - DataType::LargeList(Box::new(Field::new("item", DataType::Utf8, true))), + DataType::LargeList(Arc::new(Field::new("item", DataType::Utf8, true))), true, ), ]; @@ -202,17 +202,17 @@ fn create_list_primitive_bench_batch_non_null( let fields = vec![ Field::new( "_1", - DataType::List(Box::new(Field::new("item", DataType::Int32, false))), + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))), false, ), Field::new( "_2", - DataType::List(Box::new(Field::new("item", DataType::Boolean, false))), + DataType::List(Arc::new(Field::new("item", DataType::Boolean, false))), false, ), Field::new( "_3", - DataType::LargeList(Box::new(Field::new("item", DataType::Utf8, false))), + DataType::LargeList(Arc::new(Field::new("item", DataType::Utf8, false))), false, ), ]; @@ -256,9 +256,9 @@ fn _create_nested_bench_batch( ), Field::new( "_2", - DataType::LargeList(Box::new(Field::new( + DataType::LargeList(Arc::new(Field::new( "item", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "item", DataType::Struct(Fields::from(vec![ Field::new( @@ -272,7 +272,7 @@ fn _create_nested_bench_batch( ), Field::new( "_2", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "", DataType::FixedSizeBinary(2), true, diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs index 60cc84f9f8d4..241a5efe078a 100644 --- a/parquet/src/arrow/array_reader/builder.rs +++ b/parquet/src/arrow/array_reader/builder.rs @@ -90,13 +90,13 @@ fn build_map_reader( DataType::Map(map_field, is_sorted) => match map_field.data_type() { DataType::Struct(fields) => { assert_eq!(fields.len(), 2); - let struct_field = map_field.clone().with_data_type( + let struct_field = map_field.as_ref().clone().with_data_type( DataType::Struct(Fields::from(vec![ fields[0].as_ref().clone().with_data_type(key_type), fields[1].as_ref().clone().with_data_type(value_type), ])), ); - DataType::Map(Box::new(struct_field), *is_sorted) + DataType::Map(Arc::new(struct_field), *is_sorted) } _ => unreachable!(), }, @@ -135,11 +135,11 @@ fn build_list_reader( let item_type = item_reader.get_data_type().clone(); let data_type = match &field.arrow_type { DataType::List(f) => { - DataType::List(Box::new(f.clone().with_data_type(item_type))) - } - DataType::LargeList(f) => { - DataType::LargeList(Box::new(f.clone().with_data_type(item_type))) + DataType::List(Arc::new(f.as_ref().clone().with_data_type(item_type))) } + DataType::LargeList(f) => DataType::LargeList(Arc::new( + f.as_ref().clone().with_data_type(item_type), + )), _ => unreachable!(), }; diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index e8d426d3a850..fee032a4d763 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -440,7 +440,7 @@ 
mod tests { let decimals = Decimal128Array::from_iter_values([1, 2, 3, 4, 5, 6, 7, 8]); // [[], [1], [2, 3], null, [4], null, [6, 7, 8]] - let data = ArrayDataBuilder::new(ArrowType::List(Box::new(Field::new( + let data = ArrayDataBuilder::new(ArrowType::List(Arc::new(Field::new( "item", decimals.data_type().clone(), false, diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index 6218a5466da2..504591c0ca89 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -268,7 +268,7 @@ mod tests { data_type: ArrowType, item_nullable: bool, ) -> ArrowType { - let field = Box::new(Field::new("item", data_type, item_nullable)); + let field = Arc::new(Field::new("item", data_type, item_nullable)); GenericListArray::::DATA_TYPE_CONSTRUCTOR(field) } @@ -584,7 +584,7 @@ mod tests { batch.data_type(), &ArrowType::Struct(Fields::from(vec![Field::new( "table_info", - ArrowType::List(Box::new(Field::new( + ArrowType::List(Arc::new(Field::new( "table_info", ArrowType::Struct( vec![Field::new("name", ArrowType::Binary, false)].into() diff --git a/parquet/src/arrow/array_reader/map_array.rs b/parquet/src/arrow/array_reader/map_array.rs index 621292ee7900..d7645a593505 100644 --- a/parquet/src/arrow/array_reader/map_array.rs +++ b/parquet/src/arrow/array_reader/map_array.rs @@ -149,7 +149,7 @@ mod tests { let schema = Schema::new(vec![Field::new( "map", ArrowType::Map( - Box::new(Field::new( + Arc::new(Field::new( "entries", ArrowType::Struct(Fields::from(vec![ Field::new("keys", ArrowType::Utf8, false), diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 8464b959215d..9507967836f1 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1181,7 +1181,7 @@ mod tests { let decimals = Decimal128Array::from_iter_values([1, 2, 3, 4, 5, 6, 7, 8]); // [[], [1], [2, 3], null, [4], null, [6, 7, 8]] - let data = ArrayDataBuilder::new(ArrowDataType::List(Box::new(Field::new( + let data = ArrayDataBuilder::new(ArrowDataType::List(Arc::new(Field::new( "item", decimals.data_type().clone(), false, @@ -2122,7 +2122,7 @@ mod tests { let arrow_field = Field::new( "emptylist", - ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Null, true))), + ArrowDataType::List(Arc::new(Field::new("item", ArrowDataType::Null, true))), true, ); @@ -2236,7 +2236,7 @@ mod tests { fn test_row_group_batch(row_group_size: usize, batch_size: usize) { let schema = Arc::new(Schema::new(vec![Field::new( "list", - ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Int32, true))), + ArrowDataType::List(Arc::new(Field::new("item", ArrowDataType::Int32, true))), true, )])); diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 9a6a97df4467..4239f3fba59b 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -494,9 +494,9 @@ mod tests { // [[a, b, c], [d, e, f, g]], [[h], [i,j]] let leaf_type = Field::new("item", DataType::Int32, false); - let inner_type = DataType::List(Box::new(leaf_type)); + let inner_type = DataType::List(Arc::new(leaf_type)); let inner_field = Field::new("l2", inner_type.clone(), false); - let outer_type = DataType::List(Box::new(inner_field)); + let outer_type = DataType::List(Arc::new(inner_field)); let outer_field = Field::new("l1", outer_type.clone(), false); let primitives = Int32Array::from_iter(0..10); @@ -579,7 
+579,7 @@ mod tests { #[test] fn test_calculate_array_levels_1() { let leaf_field = Field::new("item", DataType::Int32, false); - let list_type = DataType::List(Box::new(leaf_field)); + let list_type = DataType::List(Arc::new(leaf_field)); // if all array values are defined (e.g. batch>) // [[0], [1], [2], [3], [4]] @@ -659,7 +659,7 @@ mod tests { let leaf = Int32Array::from_iter(0..11); let leaf_field = Field::new("leaf", DataType::Int32, false); - let list_type = DataType::List(Box::new(leaf_field)); + let list_type = DataType::List(Arc::new(leaf_field)); let list = ArrayData::builder(list_type.clone()) .len(5) .add_child_data(leaf.into_data()) @@ -700,7 +700,7 @@ mod tests { let leaf = Int32Array::from_iter(100..122); let leaf_field = Field::new("leaf", DataType::Int32, true); - let l1_type = DataType::List(Box::new(leaf_field)); + let l1_type = DataType::List(Arc::new(leaf_field)); let offsets = Buffer::from_iter([0_i32, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22]); let l1 = ArrayData::builder(l1_type.clone()) .len(11) @@ -710,7 +710,7 @@ mod tests { .unwrap(); let l1_field = Field::new("l1", l1_type, true); - let l2_type = DataType::List(Box::new(l1_field)); + let l2_type = DataType::List(Arc::new(l1_field)); let l2 = ArrayData::builder(l2_type) .len(5) .add_child_data(l1) @@ -742,7 +742,7 @@ mod tests { #[test] fn test_calculate_array_levels_nested_list() { let leaf_field = Field::new("leaf", DataType::Int32, false); - let list_type = DataType::List(Box::new(leaf_field)); + let list_type = DataType::List(Arc::new(leaf_field)); // if all array values are defined (e.g. batch>) // The array at this level looks like: @@ -813,7 +813,7 @@ mod tests { let leaf = Int32Array::from_iter(201..216); let leaf_field = Field::new("leaf", DataType::Int32, false); - let list_1_type = DataType::List(Box::new(leaf_field)); + let list_1_type = DataType::List(Arc::new(leaf_field)); let list_1 = ArrayData::builder(list_1_type.clone()) .len(7) .add_buffer(Buffer::from_iter([0_i32, 1, 3, 3, 6, 10, 10, 15])) @@ -822,7 +822,7 @@ mod tests { .unwrap(); let list_1_field = Field::new("l1", list_1_type, true); - let list_2_type = DataType::List(Box::new(list_1_field)); + let list_2_type = DataType::List(Arc::new(list_1_field)); let list_2 = ArrayData::builder(list_2_type.clone()) .len(4) .add_buffer(Buffer::from_iter([0_i32, 0, 3, 5, 7])) @@ -899,7 +899,7 @@ mod tests { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); let a_value_offsets = arrow::buffer::Buffer::from_iter([0_i32, 1, 3, 3, 6, 10]); let a_list_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let a_list_data = ArrayData::builder(a_list_type.clone()) .len(5) .add_buffer(a_value_offsets) @@ -942,7 +942,7 @@ mod tests { let struct_field_f = Field::new("f", DataType::Float32, true); let struct_field_g = Field::new( "g", - DataType::List(Box::new(Field::new("items", DataType::Int16, false))), + DataType::List(Arc::new(Field::new("items", DataType::Int16, false))), false, ); let struct_field_e = Field::new( @@ -1131,7 +1131,7 @@ mod tests { let stocks_field = Field::new( "stocks", DataType::Map( - Box::new(Field::new("entries", entries_struct_type, false)), + Arc::new(Field::new("entries", entries_struct_type, false)), false, ), // not nullable, so the keys have max level = 1 @@ -1186,7 +1186,7 @@ mod tests { let int_field = Field::new("a", DataType::Int32, true); let fields = Fields::from([Arc::new(int_field)]); let item_field = 
Field::new("item", DataType::Struct(fields.clone()), true); - let list_field = Field::new("list", DataType::List(Box::new(item_field)), true); + let list_field = Field::new("list", DataType::List(Arc::new(item_field)), true); let int_builder = Int32Builder::with_capacity(10); let struct_builder = StructBuilder::new(fields, vec![Box::new(int_builder)]); @@ -1336,7 +1336,7 @@ mod tests { let offsets = Buffer::from_iter([0_i32, 0, 2, 2, 3, 5, 5]); let nulls = Buffer::from([0b00111100]); - let list_type = DataType::List(Box::new(Field::new( + let list_type = DataType::List(Arc::new(Field::new( "struct", struct_a.data_type().clone(), true, diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index f594f2f79947..0515ed4e39e2 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -750,7 +750,7 @@ mod tests { // define schema let schema = Schema::new(vec![Field::new( "a", - DataType::List(Box::new(Field::new("item", DataType::Int32, false))), + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))), true, )]); @@ -763,7 +763,7 @@ mod tests { arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); // Construct a list array from the above two - let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new( + let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new( "item", DataType::Int32, false, @@ -791,7 +791,7 @@ mod tests { // define schema let schema = Schema::new(vec![Field::new( "a", - DataType::List(Box::new(Field::new("item", DataType::Int32, false))), + DataType::List(Arc::new(Field::new("item", DataType::Int32, false))), false, )]); @@ -804,7 +804,7 @@ mod tests { arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); // Construct a list array from the above two - let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new( + let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new( "item", DataType::Int32, false, @@ -890,12 +890,12 @@ mod tests { let struct_field_f = Field::new("f", DataType::Float32, true); let struct_field_g = Field::new( "g", - DataType::List(Box::new(Field::new("item", DataType::Int16, true))), + DataType::List(Arc::new(Field::new("item", DataType::Int16, true))), false, ); let struct_field_h = Field::new( "h", - DataType::List(Box::new(Field::new("item", DataType::Int16, false))), + DataType::List(Arc::new(Field::new("item", DataType::Int16, false))), true, ); let struct_field_e = Field::new( @@ -1025,7 +1025,7 @@ mod tests { let stocks_field = Field::new( "stocks", DataType::Map( - Box::new(Field::new("entries", entries_struct_type, false)), + Arc::new(Field::new("entries", entries_struct_type, false)), false, ), true, @@ -1766,14 +1766,14 @@ mod tests { fn null_list_single_column() { let null_field = Field::new("item", DataType::Null, true); let list_field = - Field::new("emptylist", DataType::List(Box::new(null_field)), true); + Field::new("emptylist", DataType::List(Arc::new(null_field)), true); let schema = Schema::new(vec![list_field]); // Build [[], null, [null, null]] let a_values = NullArray::new(2); let a_value_offsets = arrow::buffer::Buffer::from(&[0, 0, 0, 2].to_byte_slice()); - let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new( + let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new( "item", DataType::Null, true, @@ -1804,7 +1804,7 @@ mod tests { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); let a_value_offsets = 
arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); - let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new( + let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new( "item", DataType::Int32, false, @@ -1829,7 +1829,7 @@ mod tests { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); let a_value_offsets = arrow::buffer::Buffer::from(&[0i64, 1, 3, 3, 6, 10].to_byte_slice()); - let a_list_data = ArrayData::builder(DataType::LargeList(Box::new(Field::new( + let a_list_data = ArrayData::builder(DataType::LargeList(Arc::new(Field::new( "large_item", DataType::Int32, true, @@ -2256,7 +2256,7 @@ mod tests { true, ); - let list_a = Field::new("list", DataType::List(Box::new(struct_a)), true); + let list_a = Field::new("list", DataType::List(Arc::new(struct_a)), true); let struct_b = Field::new( "struct_b", DataType::Struct(vec![list_a.clone()].into()), diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index ad6ded1b842f..25227aeeebc8 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -62,7 +62,7 @@ impl ParquetField { rep_level: self.rep_level, def_level: self.def_level, nullable: false, - arrow_type: DataType::List(Box::new(Field::new( + arrow_type: DataType::List(Arc::new(Field::new( name, self.arrow_type.clone(), false, @@ -362,7 +362,7 @@ impl Visitor { rep_level, def_level, nullable, - arrow_type: DataType::Map(Box::new(map_field), sorted), + arrow_type: DataType::Map(Arc::new(map_field), sorted), field_type: ParquetFieldType::Group { children: vec![key, value], }, @@ -479,7 +479,7 @@ impl Visitor { match self.dispatch(item_type, new_context) { Ok(Some(item)) => { - let item_field = Box::new(convert_field(item_type, &item, arrow_field)); + let item_field = Arc::new(convert_field(item_type, &item, arrow_field)); // Use arrow type as hint for index size let arrow_type = match context.data_type { diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index b541a754ba41..81ed5e8177bb 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -711,7 +711,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", DataType::Utf8, true))), + DataType::List(Arc::new(Field::new("element", DataType::Utf8, true))), false, )); } @@ -725,7 +725,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", DataType::Utf8, false))), + DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), true, )); } @@ -744,10 +744,10 @@ mod tests { // } { let arrow_inner_list = - DataType::List(Box::new(Field::new("element", DataType::Int32, false))); + DataType::List(Arc::new(Field::new("element", DataType::Int32, false))); arrow_fields.push(Field::new( "array_of_arrays", - DataType::List(Box::new(Field::new("element", arrow_inner_list, false))), + DataType::List(Arc::new(Field::new("element", arrow_inner_list, false))), true, )); } @@ -761,7 +761,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("str", DataType::Utf8, false))), + DataType::List(Arc::new(Field::new("str", DataType::Utf8, false))), true, )); } @@ -773,7 +773,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", DataType::Int32, false))), + DataType::List(Arc::new(Field::new("element", DataType::Int32, false))), true, )); } @@ -792,7 +792,7 @@ mod tests 
{ ])); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", arrow_struct, false))), + DataType::List(Arc::new(Field::new("element", arrow_struct, false))), true, )); } @@ -809,7 +809,7 @@ mod tests { let arrow_struct = DataType::Struct(fields); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("array", arrow_struct, false))), + DataType::List(Arc::new(Field::new("array", arrow_struct, false))), true, )); } @@ -826,7 +826,7 @@ mod tests { let arrow_struct = DataType::Struct(fields); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "my_list_tuple", arrow_struct, false, @@ -840,7 +840,7 @@ mod tests { { arrow_fields.push(Field::new( "name", - DataType::List(Box::new(Field::new("name", DataType::Int32, false))), + DataType::List(Arc::new(Field::new("name", DataType::Int32, false))), false, )); } @@ -891,7 +891,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list1", - DataType::List(Box::new(Field::new("element", DataType::Utf8, true))), + DataType::List(Arc::new(Field::new("element", DataType::Utf8, true))), false, )); } @@ -905,7 +905,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list2", - DataType::List(Box::new(Field::new("element", DataType::Utf8, false))), + DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), true, )); } @@ -919,7 +919,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list3", - DataType::List(Box::new(Field::new("element", DataType::Utf8, false))), + DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), false, )); } @@ -976,7 +976,7 @@ mod tests { arrow_fields.push(Field::new( "my_map1", DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "key_value", DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), @@ -1001,7 +1001,7 @@ mod tests { arrow_fields.push(Field::new( "my_map2", DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "map", DataType::Struct(Fields::from(vec![ Field::new("str", DataType::Utf8, false), @@ -1026,7 +1026,7 @@ mod tests { arrow_fields.push(Field::new( "my_map3", DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "map", DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), @@ -1201,7 +1201,7 @@ mod tests { let inner_group_list = Field::new( "innerGroup", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "innerGroup", DataType::Struct( vec![Field::new("leaf3", DataType::Int32, true)].into(), @@ -1213,7 +1213,7 @@ mod tests { let outer_group_list = Field::new( "outerGroup", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "outerGroup", DataType::Struct(Fields::from(vec![ Field::new("leaf2", DataType::Int32, true), @@ -1302,7 +1302,7 @@ mod tests { Field::new("string", DataType::Utf8, true), Field::new( "bools", - DataType::List(Box::new(Field::new("bools", DataType::Boolean, false))), + DataType::List(Arc::new(Field::new("bools", DataType::Boolean, false))), false, ), Field::new("date", DataType::Date32, true), @@ -1326,12 +1326,12 @@ mod tests { ), Field::new( "int_list", - DataType::List(Box::new(Field::new("int_list", DataType::Int32, false))), + DataType::List(Arc::new(Field::new("int_list", DataType::Int32, false))), false, ), Field::new( "byte_list", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "byte_list", DataType::Binary, false, @@ -1340,7 +1340,7 @@ mod tests { ), Field::new( "string_list", - 
DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "string_list", DataType::Utf8, false, @@ -1417,12 +1417,12 @@ mod tests { Field::new("string", DataType::Utf8, true), Field::new( "bools", - DataType::List(Box::new(Field::new("element", DataType::Boolean, true))), + DataType::List(Arc::new(Field::new("element", DataType::Boolean, true))), true, ), Field::new( "bools_non_null", - DataType::List(Box::new(Field::new("element", DataType::Boolean, false))), + DataType::List(Arc::new(Field::new("element", DataType::Boolean, false))), false, ), Field::new("date", DataType::Date32, true), @@ -1470,7 +1470,7 @@ mod tests { Field::new("uint32", DataType::UInt32, false), Field::new( "int32", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "element", DataType::Int32, true, @@ -1602,7 +1602,7 @@ mod tests { Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), Field::new( "c21", - DataType::List(Box::new(Field::new("list", DataType::Boolean, true))), + DataType::List(Arc::new(Field::new("list", DataType::Boolean, true))), false, ), // Field::new( @@ -1663,13 +1663,13 @@ mod tests { Field::new( "c39", DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "key_value", DataType::Struct(Fields::from(vec![ Field::new("key", DataType::Utf8, false), Field::new( "value", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "element", DataType::Utf8, true, @@ -1686,13 +1686,13 @@ mod tests { Field::new( "c40", DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "my_entries", DataType::Struct(Fields::from(vec![ Field::new("my_key", DataType::Utf8, false), Field::new( "my_value", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "item", DataType::Utf8, true, @@ -1709,13 +1709,13 @@ mod tests { Field::new( "c41", DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "my_entries", DataType::Struct(Fields::from(vec![ Field::new("my_key", DataType::Utf8, false), Field::new( "my_value", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "item", DataType::Utf8, true, @@ -1762,7 +1762,7 @@ mod tests { vec![ Field::new( "c21", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "array", DataType::Boolean, true, @@ -1772,16 +1772,16 @@ mod tests { Field::new( "c22", DataType::FixedSizeList( - Box::new(Field::new("items", DataType::Boolean, false)), + Arc::new(Field::new("items", DataType::Boolean, false)), 5, ), false, ), Field::new( "c23", - DataType::List(Box::new(Field::new( + DataType::List(Arc::new(Field::new( "items", - DataType::LargeList(Box::new(Field::new( + DataType::LargeList(Arc::new(Field::new( "items", DataType::Struct(Fields::from(vec![ Field::new("a", DataType::Int16, true), From f1d5797fd97a8d5363586d6f471414c7e778105e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 17:19:59 +0100 Subject: [PATCH 0761/1411] Cleanup row count handling in JSON writer (#3934) --- arrow-json/src/writer.rs | 110 ++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 66 deletions(-) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index d3ac46c937b8..5d8abfafc4b1 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -124,21 +124,15 @@ where fn struct_array_to_jsonmap_array( array: &StructArray, - row_count: usize, ) -> Result>, ArrowError> { let inner_col_names = array.column_names(); let mut inner_objs = 
iter::repeat(JsonMap::new()) - .take(row_count) + .take(array.len()) .collect::>>(); for (j, struct_col) in array.columns().iter().enumerate() { - set_column_for_json_rows( - &mut inner_objs, - row_count, - struct_col, - inner_col_names[j], - )? + set_column_for_json_rows(&mut inner_objs, struct_col, inner_col_names[j])? } Ok(inner_objs) } @@ -197,8 +191,7 @@ pub fn array_to_json_array(array: &ArrayRef) -> Result, ArrowError> { }) .collect(), DataType::Struct(_) => { - let jsonmaps = - struct_array_to_jsonmap_array(as_struct_array(array), array.len())?; + let jsonmaps = struct_array_to_jsonmap_array(array.as_struct())?; Ok(jsonmaps.into_iter().map(Value::Object).collect()) } t => Err(ArrowError::JsonError(format!( @@ -208,21 +201,21 @@ pub fn array_to_json_array(array: &ArrayRef) -> Result, ArrowError> { } macro_rules! set_column_by_array_type { - ($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident, $row_count:ident) => { + ($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident) => { let arr = $cast_fn($array); - $rows.iter_mut().zip(arr.iter()).take($row_count).for_each( - |(row, maybe_value)| { + $rows + .iter_mut() + .zip(arr.iter()) + .for_each(|(row, maybe_value)| { if let Some(v) = maybe_value { row.insert($col_name.to_string(), v.into()); } - }, - ); + }); }; } fn set_column_by_primitive_type( rows: &mut [JsonMap], - row_count: usize, array: &ArrayRef, col_name: &str, ) where @@ -233,7 +226,6 @@ fn set_column_by_primitive_type( rows.iter_mut() .zip(primitive_arr.iter()) - .take(row_count) .for_each(|(row, maybe_value)| { // when value is null, we simply skip setting the key if let Some(j) = maybe_value.and_then(|v| v.into_json_value()) { @@ -244,58 +236,51 @@ fn set_column_by_primitive_type( fn set_column_for_json_rows( rows: &mut [JsonMap], - row_count: usize, array: &ArrayRef, col_name: &str, ) -> Result<(), ArrowError> { match array.data_type() { DataType::Int8 => { - set_column_by_primitive_type::(rows, row_count, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::Int16 => { - set_column_by_primitive_type::(rows, row_count, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::Int32 => { - set_column_by_primitive_type::(rows, row_count, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::Int64 => { - set_column_by_primitive_type::(rows, row_count, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::UInt8 => { - set_column_by_primitive_type::(rows, row_count, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::UInt16 => { - set_column_by_primitive_type::(rows, row_count, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::UInt32 => { - set_column_by_primitive_type::(rows, row_count, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::UInt64 => { - set_column_by_primitive_type::(rows, row_count, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::Float32 => { - set_column_by_primitive_type::(rows, row_count, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::Float64 => { - set_column_by_primitive_type::(rows, row_count, array, col_name); + set_column_by_primitive_type::(rows, array, col_name); } DataType::Null => { // when value is null, we simply skip setting the key } DataType::Boolean => { - set_column_by_array_type!(as_boolean_array, col_name, rows, 
array, row_count); + set_column_by_array_type!(as_boolean_array, col_name, rows, array); } DataType::Utf8 => { - set_column_by_array_type!(as_string_array, col_name, rows, array, row_count); + set_column_by_array_type!(as_string_array, col_name, rows, array); } DataType::LargeUtf8 => { - set_column_by_array_type!( - as_largestring_array, - col_name, - rows, - array, - row_count - ); + set_column_by_array_type!(as_largestring_array, col_name, rows, array); } DataType::Date32 | DataType::Date64 @@ -306,23 +291,18 @@ fn set_column_for_json_rows( let options = FormatOptions::default(); let formatter = ArrayFormatter::try_new(array.as_ref(), &options)?; let data = array.data(); - rows.iter_mut() - .take(row_count) - .enumerate() - .for_each(|(idx, row)| { - if data.is_valid(idx) { - row.insert( - col_name.to_string(), - formatter.value(idx).to_string().into(), - ); - } - }); + rows.iter_mut().enumerate().for_each(|(idx, row)| { + if data.is_valid(idx) { + row.insert( + col_name.to_string(), + formatter.value(idx).to_string().into(), + ); + } + }); } DataType::Struct(_) => { - let inner_objs = - struct_array_to_jsonmap_array(as_struct_array(array), row_count)?; + let inner_objs = struct_array_to_jsonmap_array(array.as_struct())?; rows.iter_mut() - .take(row_count) .zip(inner_objs.into_iter()) .for_each(|(row, obj)| { row.insert(col_name.to_string(), Value::Object(obj)); @@ -330,10 +310,8 @@ fn set_column_for_json_rows( } DataType::List(_) => { let listarr = as_list_array(array); - rows.iter_mut() - .zip(listarr.iter()) - .take(row_count) - .try_for_each(|(row, maybe_value)| -> Result<(), ArrowError> { + rows.iter_mut().zip(listarr.iter()).try_for_each( + |(row, maybe_value)| -> Result<(), ArrowError> { if let Some(v) = maybe_value { row.insert( col_name.to_string(), @@ -341,26 +319,25 @@ fn set_column_for_json_rows( ); } Ok(()) - })?; + }, + )?; } DataType::LargeList(_) => { let listarr = as_large_list_array(array); - rows.iter_mut() - .zip(listarr.iter()) - .take(row_count) - .try_for_each(|(row, maybe_value)| -> Result<(), ArrowError> { + rows.iter_mut().zip(listarr.iter()).try_for_each( + |(row, maybe_value)| -> Result<(), ArrowError> { if let Some(v) = maybe_value { let val = array_to_json_array(&v)?; row.insert(col_name.to_string(), Value::Array(val)); } Ok(()) - })?; + }, + )?; } DataType::Dictionary(_, value_type) => { - let slice = array.slice(0, row_count); - let hydrated = arrow_cast::cast::cast(&slice, value_type) + let hydrated = arrow_cast::cast::cast(&array, value_type) .expect("cannot cast dictionary to underlying values"); - set_column_for_json_rows(rows, row_count, &hydrated, col_name)?; + set_column_for_json_rows(rows, &hydrated, col_name)?; } DataType::Map(_, _) => { let maparr = as_map_array(array); @@ -381,7 +358,7 @@ fn set_column_for_json_rows( let mut kv = keys.iter().zip(values.into_iter()); - for (i, row) in rows.iter_mut().take(row_count).enumerate() { + for (i, row) in rows.iter_mut().enumerate() { if maparr.is_null(i) { row.insert(col_name.to_string(), serde_json::Value::Null); continue; @@ -424,9 +401,10 @@ pub fn record_batches_to_json_rows( let mut base = 0; for batch in batches { let row_count = batch.num_rows(); + let row_slice = &mut rows[base..base + batch.num_rows()]; for (j, col) in batch.columns().iter().enumerate() { let col_name = schema.field(j).name(); - set_column_for_json_rows(&mut rows[base..], row_count, col, col_name)? + set_column_for_json_rows(row_slice, col, col_name)? 
} base += row_count; } From cf7f7c0a173591656d5e58d20d9b4099fbcbf68a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 17:47:43 +0100 Subject: [PATCH 0762/1411] Use serde rc (#3980) --- arrow-schema/src/fields.rs | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 1de5e5efdeeb..07dff2aae6bd 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -39,6 +39,8 @@ use std::sync::Arc; /// ``` /// #[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +#[cfg_attr(feature = "serde", serde(transparent))] pub struct Fields(Arc<[FieldRef]>); impl std::fmt::Debug for Fields { @@ -125,32 +127,6 @@ impl<'a> IntoIterator for &'a Fields { } } -// Manually implement to avoid needing serde rc feature -#[cfg(feature = "serde")] -impl serde::Serialize for Fields { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - use serde::ser::SerializeSeq; - let mut seq = serializer.serialize_seq(Some(self.len()))?; - for e in self.iter() { - seq.serialize_element(e.as_ref())?; - } - seq.end() - } -} - -#[cfg(feature = "serde")] -impl<'de> serde::Deserialize<'de> for Fields { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - Ok(Vec::::deserialize(deserializer)?.into()) - } -} - /// A cheaply cloneable, owned collection of [`FieldRef`] and their corresponding type ids #[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] From 20522a8a71279bf46da10c887a448879a62b4284 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 17:47:55 +0100 Subject: [PATCH 0763/1411] Convert string_to_timestamp_nanos to doctest (#3978) * Convert string_to_timestamp_nanos to doctest * Update arrow-cast/src/parse.rs Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- arrow-cast/src/parse.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index e2c7f9bcc2ca..d7e5529bcd42 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -264,8 +264,18 @@ pub fn string_to_datetime( /// This function interprets string without an explicit time zone as timestamps /// relative to UTC, see [`string_to_datetime`] for alternative semantics /// -/// For example, both `1997-01-31 09:26:56.123Z`, `1997-01-31T09:26:56.123`, -/// and `1997-01-31T14:26:56.123+05:00` will be parsed as the same value +/// In particular: +/// +/// ``` +/// # use arrow_cast::parse::string_to_timestamp_nanos; +/// // Note all three of these timestamps are parsed as the same value +/// let a = string_to_timestamp_nanos("1997-01-31 09:26:56.123Z").unwrap(); +/// let b = string_to_timestamp_nanos("1997-01-31T09:26:56.123").unwrap(); +/// let c = string_to_timestamp_nanos("1997-01-31T14:26:56.123+05:00").unwrap(); +/// +/// assert_eq!(a, b); +/// assert_eq!(b, c); +/// ``` /// #[inline] pub fn string_to_timestamp_nanos(s: &str) -> Result { From dc07f9454251b42388c2a7cae8e3d65264d7130b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 18:08:04 +0100 Subject: [PATCH 0764/1411] Add ObjectStore::list_with_offset (#3970) (#3973) * Stub out 
ObjectStore::list_with_offset (#3970) * Add tests and add AWS implementation * Update localstack * Add further implementations --- .github/workflows/object_store.yml | 2 +- object_store/src/aws/client.rs | 27 +++++++-- object_store/src/aws/mod.rs | 19 ++++++- object_store/src/chunked.rs | 8 +++ object_store/src/lib.rs | 91 +++++++++++++++++++++++++++++- object_store/src/limit.rs | 10 ++++ object_store/src/throttle.rs | 61 +++++++++++--------- 7 files changed, 181 insertions(+), 37 deletions(-) diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index f182d21eef13..8e97c4440567 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -106,7 +106,7 @@ jobs: AWS_SECRET_ACCESS_KEY: test AWS_ENDPOINT: http://localhost:4566 run: | - docker run -d -p 4566:4566 localstack/localstack:0.14.4 + docker run -d -p 4566:4566 localstack/localstack:2.0 docker run -d -p 1338:1338 amazon/amazon-ec2-metadata-mock:v1.9.2 --imdsv2 aws --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index bd58d09676aa..7ac4b705b36c 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -382,6 +382,7 @@ impl S3Client { prefix: Option<&str>, delimiter: bool, token: Option<&str>, + offset: Option<&str>, ) -> Result<(ListResult, Option)> { let credential = self.get_credential().await?; let url = self.config.bucket_endpoint.clone(); @@ -403,6 +404,10 @@ impl S3Client { query.push(("prefix", prefix)) } + if let Some(offset) = offset { + query.push(("start-after", offset)) + } + let response = self .client .request(Method::GET, &url) @@ -433,14 +438,24 @@ impl S3Client { &self, prefix: Option<&Path>, delimiter: bool, + offset: Option<&Path>, ) -> BoxStream<'_, Result> { + let offset = offset.map(|x| x.to_string()); let prefix = format_prefix(prefix); - stream_paginated(prefix, move |prefix, token| async move { - let (r, next_token) = self - .list_request(prefix.as_deref(), delimiter, token.as_deref()) - .await?; - Ok((r, prefix, next_token)) - }) + stream_paginated( + (prefix, offset), + move |(prefix, offset), token| async move { + let (r, next_token) = self + .list_request( + prefix.as_deref(), + delimiter, + token.as_deref(), + offset.as_deref(), + ) + .await?; + Ok((r, (prefix, offset), next_token)) + }, + ) .boxed() } diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 752fb2e7df9d..1e302e688978 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -273,7 +273,22 @@ impl ObjectStore for AmazonS3 { ) -> Result>> { let stream = self .client - .list_paginated(prefix, false) + .list_paginated(prefix, false, None) + .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) + .try_flatten() + .boxed(); + + Ok(stream) + } + + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + let stream = self + .client + .list_paginated(prefix, false, Some(offset)) .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() .boxed(); @@ -282,7 +297,7 @@ impl ObjectStore for AmazonS3 { } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - let mut stream = self.client.list_paginated(prefix, true); + let mut stream = self.client.list_paginated(prefix, true, None); let mut common_prefixes = BTreeSet::new(); let mut objects = Vec::new(); diff --git a/object_store/src/chunked.rs b/object_store/src/chunked.rs index 
76865ef96701..aebefec61559 100644 --- a/object_store/src/chunked.rs +++ b/object_store/src/chunked.rs @@ -174,6 +174,14 @@ impl ObjectStore for ChunkedStore { self.inner.list(prefix).await } + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + self.inner.list_with_offset(prefix, offset).await + } + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { self.inner.list_with_delimiter(prefix).await } diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 706cc076672c..5737071286c8 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -258,7 +258,7 @@ use crate::util::{coalesce_ranges, collect_bytes, OBJECT_STORE_COALESCE_DEFAULT} use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; -use futures::{stream::BoxStream, StreamExt}; +use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use snafu::Snafu; use std::fmt::{Debug, Formatter}; #[cfg(not(target_arch = "wasm32"))] @@ -371,11 +371,33 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of /// `foo/bar_baz/x`. + /// + /// Note: the order of returned [`ObjectMeta`] is not guaranteed async fn list( &self, prefix: Option<&Path>, ) -> Result>>; + /// List all the objects with the given prefix and a location greater than `offset` + /// + /// Some stores, such as S3 and GCS, may be able to push `offset` down to reduce + /// the number of network requests required + /// + /// Note: the order of returned [`ObjectMeta`] is not guaranteed + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + let offset = offset.clone(); + let stream = self + .list(prefix) + .await? + .try_filter(move |f| futures::future::ready(f.location > offset)) + .boxed(); + Ok(stream) + } + /// List objects with the given prefix and an implementation specific /// delimiter. Returns common prefixes (directories) in addition to object /// metadata. 
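A minimal usage sketch of the list_with_offset API added above (a hypothetical example, not part of the patch: it assumes the crate's InMemory store, a tokio runtime, and made-up object names). Only entries whose location is strictly greater than the offset are returned and, as the doc comments note, ordering is not guaranteed; stores such as S3 can push the offset down while the default implementation filters client-side.

use futures::TryStreamExt;
use object_store::{memory::InMemory, path::Path, ObjectStore};

#[tokio::main]
async fn main() -> object_store::Result<()> {
    // Populate a hypothetical in-memory store with a few objects under "data/"
    let store = InMemory::new();
    for name in ["data/part-00", "data/part-01", "data/part-02"] {
        store.put(&Path::from(name), "foo".into()).await?;
    }

    // Resume the listing strictly after "data/part-00"
    let offset = Path::from("data/part-00");
    let locations: Vec<_> = store
        .list_with_offset(Some(&Path::from("data")), &offset)
        .await?
        .map_ok(|meta| meta.location)
        .try_collect()
        .await?;

    // Only "data/part-01" and "data/part-02" compare greater than the offset;
    // their relative order is not guaranteed
    assert_eq!(locations.len(), 2);
    Ok(())
}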
@@ -477,6 +499,14 @@ impl ObjectStore for Box { self.as_ref().list(prefix).await } + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + self.as_ref().list_with_offset(prefix, offset).await + } + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { self.as_ref().list_with_delimiter(prefix).await } @@ -926,6 +956,65 @@ mod tests { let files = flatten_list_stream(storage, None).await.unwrap(); assert!(files.is_empty(), "{files:?}"); + + // Test list order + let files = vec![ + Path::from("a a/b.file"), + Path::parse("a%2Fa.file").unwrap(), + Path::from("a/😀.file"), + Path::from("a/a file"), + Path::parse("a/a%2F.file").unwrap(), + Path::from("a/a.file"), + Path::from("a/a/b.file"), + Path::from("a/b.file"), + Path::from("aa/a.file"), + Path::from("ab/a.file"), + ]; + + for file in &files { + storage.put(file, "foo".into()).await.unwrap(); + } + + let cases = [ + (None, Path::from("a")), + (None, Path::from("a/a file")), + (None, Path::from("a/a/b.file")), + (None, Path::from("ab/a.file")), + (None, Path::from("a%2Fa.file")), + (None, Path::from("a/😀.file")), + (Some(Path::from("a")), Path::from("")), + (Some(Path::from("a")), Path::from("a")), + (Some(Path::from("a")), Path::from("a/😀")), + (Some(Path::from("a")), Path::from("a/😀.file")), + (Some(Path::from("a")), Path::from("a/b")), + (Some(Path::from("a")), Path::from("a/a/b.file")), + ]; + + for (prefix, offset) in cases { + let s = storage + .list_with_offset(prefix.as_ref(), &offset) + .await + .unwrap(); + + let mut actual: Vec<_> = + s.map_ok(|x| x.location).try_collect().await.unwrap(); + + actual.sort_unstable(); + + let expected: Vec<_> = files + .iter() + .cloned() + .filter(|x| { + let prefix_match = + prefix.as_ref().map(|p| x.prefix_matches(p)).unwrap_or(true); + prefix_match && x > &offset + }) + .collect(); + + assert_eq!(actual, expected, "{prefix:?} - {offset:?}"); + } + + delete_fixtures(storage).await; } fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index b3e55a918b9a..d0d9f73c5c59 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -147,6 +147,16 @@ impl ObjectStore for LimitStore { Ok(PermitWrapper::new(s, permit).boxed()) } + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); + let s = self.inner.list_with_offset(prefix, offset).await?; + Ok(PermitWrapper::new(s, permit).boxed()) + } + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.list_with_delimiter(prefix).await diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index 6dff64aab69c..e51303114788 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -24,7 +24,7 @@ use crate::MultipartId; use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; use async_trait::async_trait; use bytes::Bytes; -use futures::{stream::BoxStream, StreamExt}; +use futures::{stream::BoxStream, FutureExt, StreamExt}; use std::time::Duration; use tokio::io::AsyncWrite; @@ -185,19 +185,10 @@ impl ObjectStore for ThrottledStore { GetResult::File(_, _) => unimplemented!(), }; - GetResult::Stream( - s.then(move |bytes_result| async move { - match bytes_result { - Ok(bytes) => { - let bytes_len: u32 = usize_to_u32_saturate(bytes.len()); - 
sleep(wait_get_per_byte * bytes_len).await; - Ok(bytes) - } - Err(err) => Err(err), - } - }) - .boxed(), - ) + GetResult::Stream(throttle_stream(s, move |bytes| { + let bytes_len: u32 = usize_to_u32_saturate(bytes.len()); + wait_get_per_byte * bytes_len + })) }) } @@ -247,20 +238,21 @@ impl ObjectStore for ThrottledStore { // need to copy to avoid moving / referencing `self` let wait_list_per_entry = self.config().wait_list_per_entry; + let stream = self.inner.list(prefix).await?; + Ok(throttle_stream(stream, move |_| wait_list_per_entry)) + } - self.inner.list(prefix).await.map(|stream| { - stream - .then(move |result| async move { - match result { - Ok(entry) => { - sleep(wait_list_per_entry).await; - Ok(entry) - } - Err(err) => Err(err), - } - }) - .boxed() - }) + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + sleep(self.config().wait_list_per_call).await; + + // need to copy to avoid moving / referencing `self` + let wait_list_per_entry = self.config().wait_list_per_entry; + let stream = self.inner.list_with_offset(prefix, offset).await?; + Ok(throttle_stream(stream, move |_| wait_list_per_entry)) } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { @@ -307,6 +299,21 @@ fn usize_to_u32_saturate(x: usize) -> u32 { x.try_into().unwrap_or(u32::MAX) } +fn throttle_stream( + stream: BoxStream<'_, Result>, + delay: F, +) -> BoxStream<'_, Result> +where + F: Fn(&T) -> Duration + Send + Sync + 'static, +{ + stream + .then(move |result| { + let delay = result.as_ref().ok().map(&delay).unwrap_or_default(); + sleep(delay).then(|_| futures::future::ready(result)) + }) + .boxed() +} + #[cfg(test)] mod tests { use super::*; From 37758df25147ce36d81824650d24679e6c581b19 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 30 Mar 2023 22:06:57 +0100 Subject: [PATCH 0765/1411] Faster i256 parsing (#3950) * Faster i256 parsing * Fix overflow --- arrow-buffer/Cargo.toml | 5 ++ arrow-buffer/benches/i256.rs | 44 +++++++++++ arrow-buffer/src/bigint.rs | 142 ++++++++++++++++++++++++++++++++--- 3 files changed, 181 insertions(+), 10 deletions(-) create mode 100644 arrow-buffer/benches/i256.rs diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index 3d2fd71c973c..c5b0c6c26b0b 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -38,6 +38,11 @@ num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } [dev-dependencies] +criterion = { version = "0.4", default-features = false } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } [build-dependencies] + +[[bench]] +name = "i256" +harness = false \ No newline at end of file diff --git a/arrow-buffer/benches/i256.rs b/arrow-buffer/benches/i256.rs new file mode 100644 index 000000000000..a04e4cb6cde8 --- /dev/null +++ b/arrow-buffer/benches/i256.rs @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_buffer::i256; +use criterion::*; +use std::str::FromStr; + +fn criterion_benchmark(c: &mut Criterion) { + let numbers = vec![ + i256::ZERO, + i256::ONE, + i256::MINUS_ONE, + i256::from_i128(1233456789), + i256::from_i128(-1233456789), + i256::from_i128(i128::MAX), + i256::from_i128(i128::MIN), + i256::MIN, + i256::MAX, + ]; + + for number in numbers { + let t = black_box(number.to_string()); + c.bench_function(&format!("i256_parse({t})"), |b| { + b.iter(|| i256::from_str(&t).unwrap()); + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index 5abfb7c85230..3a9c4aac8163 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -16,9 +16,28 @@ // under the License. use num::cast::AsPrimitive; -use num::{BigInt, FromPrimitive, Num, ToPrimitive}; +use num::{BigInt, FromPrimitive, ToPrimitive}; use std::cmp::Ordering; -use std::ops::{BitAnd, BitOr, BitXor, Shl, Shr}; +use std::num::ParseIntError; +use std::ops::{BitAnd, BitOr, BitXor, Neg, Shl, Shr}; +use std::str::FromStr; + +/// An opaque error similar to [`std::num::ParseIntError`] +#[derive(Debug)] +pub struct ParseI256Error {} + +impl From for ParseI256Error { + fn from(_: ParseIntError) -> Self { + Self {} + } +} + +impl std::fmt::Display for ParseI256Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Failed to parse as i256") + } +} +impl std::error::Error for ParseI256Error {} /// A signed 256-bit integer #[allow(non_camel_case_types)] @@ -40,6 +59,67 @@ impl std::fmt::Display for i256 { } } +impl FromStr for i256 { + type Err = ParseI256Error; + + fn from_str(s: &str) -> Result { + // i128 can store up to 38 decimal digits + if s.len() <= 38 { + return Ok(Self::from_i128(i128::from_str(s)?)); + } + + let (negative, s) = match s.as_bytes()[0] { + b'-' => (true, &s[1..]), + b'+' => (false, &s[1..]), + _ => (false, s), + }; + + // Trim leading 0s + let s = s.trim_start_matches('0'); + if s.is_empty() { + return Ok(i256::ZERO); + } + + if !s.as_bytes()[0].is_ascii_digit() { + // Ensures no duplicate sign + return Err(ParseI256Error {}); + } + + parse_impl(s, negative) + } +} + +/// Parse `s` with any sign and leading 0s removed +fn parse_impl(s: &str, negative: bool) -> Result { + if s.len() <= 38 { + let low = i128::from_str(s)?; + return Ok(match negative { + true => i256::from_parts(low.neg() as _, -1), + false => i256::from_parts(low as _, 0), + }); + } + + let split = s.len() - 38; + if !s.as_bytes()[split].is_ascii_digit() { + // Ensures not splitting codepoint and no sign + return Err(ParseI256Error {}); + } + let (hs, ls) = s.split_at(split); + + let mut low = i128::from_str(ls)?; + let high = parse_impl(hs, negative)?; + + if negative { + low = -low; + } + + let low = i256::from_i128(low); + + high.checked_mul(i256::from_i128(10_i128.pow(38))) + .and_then(|high| high.checked_add(low)) + .ok_or(ParseI256Error {}) +} + impl PartialOrd for i256 { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -106,14 +186,7 @@ impl 
i256 { /// Create an integer value from its representation as string. #[inline] pub fn from_string(value_str: &str) -> Option { - let numbers = BigInt::from_str_radix(value_str, 10).ok()?; - let (integer, overflow) = Self::from_bigint_with_overflow(numbers); - - if overflow { - None - } else { - Some(integer) - } + value_str.parse().ok() } /// Create an optional i256 from the provided `f64`. Returning `None` @@ -915,4 +988,53 @@ mod tests { let i128 = a.as_i128(); assert_eq!(i128, i128::MAX - 1); } + + #[test] + fn test_string_roundtrip() { + let roundtrip_cases = [ + i256::ZERO, + i256::ONE, + i256::MINUS_ONE, + i256::from_i128(123456789), + i256::from_i128(-123456789), + i256::from_i128(i128::MIN), + i256::from_i128(i128::MAX), + i256::MIN, + i256::MAX, + ]; + for case in roundtrip_cases { + let formatted = case.to_string(); + let back: i256 = formatted.parse().unwrap(); + assert_eq!(case, back); + } + } + + #[test] + fn test_from_string() { + let cases = [ + ( + "000000000000000000000000000000000000000011", + Some(i256::from_i128(11)), + ), + ( + "-000000000000000000000000000000000000000011", + Some(i256::from_i128(-11)), + ), + ( + "-0000000000000000000000000000000000000000123456789", + Some(i256::from_i128(-123456789)), + ), + ("-", None), + ("+", None), + ("--1", None), + ("-+1", None), + ("000000000000000000000000000000000000000", Some(i256::ZERO)), + ("0000000000000000000000000000000000000000-11", None), + ("11-1111111111111111111111111111111111111", None), + ("115792089237316195423570985008687907853269984665640564039457584007913129639936", None) + ]; + for (case, expected) in cases { + assert_eq!(i256::from_string(case), expected) + } + } } From 756505495103ef67ec2c44e78ea117c4936ce959 Mon Sep 17 00:00:00 2001 From: Yang Xiufeng Date: Fri, 31 Mar 2023 18:59:44 +0800 Subject: [PATCH 0766/1411] Fix typos (#3985) * fix typos in comments * fix typos in changelog. * fix typos in readme. * fix typos in string literals. * fix typos in unit tests func names. * fix typos in codes. 
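Returning to the i256 parsing change in PATCH 0765 above: for strings longer than 38 decimal digits, FromStr splits off the trailing 38-digit chunk (which always fits in an i128), recurses on the remaining high-order digits, and recombines the pieces as high * 10^38 + low. A small standalone sketch of that idea, assuming arrow-buffer with this patch applied (the 45-digit literal is arbitrary):

use arrow_buffer::i256;
use std::str::FromStr;

fn main() {
    // 45 decimal digits: too large for i128, so parsing splits off the last
    // 38 digits and recurses on the remaining 7 high-order digits
    let s = "123456789012345678901234567890123456789012345";
    let parsed = i256::from_str(s).unwrap();

    // Recombine the two chunks by hand to illustrate the same split:
    // value = high * 10^38 + low
    let (high, low) = s.split_at(s.len() - 38);
    let expected = i256::from_i128(i128::from_str(high).unwrap())
        .checked_mul(i256::from_i128(10_i128.pow(38)))
        .and_then(|h| h.checked_add(i256::from_i128(i128::from_str(low).unwrap())))
        .unwrap();

    assert_eq!(parsed, expected);
    // Display and FromStr round-trip
    assert_eq!(parsed.to_string(), s);
}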
--- CHANGELOG-old.md | 36 ++++++------ arrow-array/src/array/list_array.rs | 2 +- arrow-array/src/array/map_array.rs | 2 +- arrow-array/src/array/mod.rs | 6 +- arrow-array/src/array/run_array.rs | 2 +- arrow-array/src/array/string_array.rs | 2 +- arrow-array/src/array/union_array.rs | 2 +- arrow-array/src/cast.rs | 2 +- arrow-array/src/types.rs | 4 +- arrow-csv/src/reader/mod.rs | 10 ++-- arrow-data/src/data/mod.rs | 2 +- arrow-data/src/transform/mod.rs | 8 +-- arrow-flight/src/client.rs | 2 +- arrow-flight/src/error.rs | 2 +- arrow-flight/tests/encode_decode.rs | 56 +++++++++---------- arrow-integration-testing/tests/ipc_reader.rs | 2 +- arrow-ipc/src/compression.rs | 2 +- arrow-ipc/src/convert.rs | 2 +- arrow-json/src/reader.rs | 4 +- arrow-json/src/writer.rs | 6 +- arrow-ord/src/sort.rs | 4 +- arrow-select/src/take.rs | 2 +- arrow/examples/README.md | 2 +- arrow/src/util/bench_util.rs | 2 +- arrow/src/util/data_gen.rs | 2 +- dev/release/README.md | 6 +- object_store/CHANGELOG-old.md | 2 +- object_store/src/azure/client.rs | 2 +- parquet/src/arrow/arrow_reader/mod.rs | 2 +- parquet/src/arrow/arrow_reader/selection.rs | 2 +- parquet/src/basic.rs | 2 +- parquet/src/compression.rs | 2 +- parquet/src/data_type.rs | 4 +- parquet/src/file/serialized_reader.rs | 2 +- parquet/src/record/api.rs | 2 +- parquet/src/record/reader.rs | 2 +- parquet/src/record/triplet.rs | 2 +- 37 files changed, 98 insertions(+), 98 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 8ddd7c6b6619..ebdab71b2401 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -159,7 +159,7 @@ - Support UTF8 cast to Timestamp with timezone [\#3664](https://github.com/apache/arrow-rs/issues/3664) - Add modulus\_dyn and modulus\_scalar\_dyn [\#3648](https://github.com/apache/arrow-rs/issues/3648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - A trait for append\_value and append\_null on ArrayBuilders [\#3644](https://github.com/apache/arrow-rs/issues/3644) -- Improve error messge "batches\[0\] schema is different with argument schema" [\#3628](https://github.com/apache/arrow-rs/issues/3628) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve error message "batches\[0\] schema is different with argument schema" [\#3628](https://github.com/apache/arrow-rs/issues/3628) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Specified version of helper function to cast binary to string [\#3623](https://github.com/apache/arrow-rs/issues/3623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Casting generic binary to generic string [\#3606](https://github.com/apache/arrow-rs/issues/3606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Use `array_value_to_string` in `arrow-csv` [\#3483](https://github.com/apache/arrow-rs/issues/3483) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] @@ -201,7 +201,7 @@ - Include line and field number in CSV UTF-8 error \(\#3656\) [\#3657](https://github.com/apache/arrow-rs/pull/3657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Handle non-contiguous type\_ids in UnionArray \(\#3653\) [\#3654](https://github.com/apache/arrow-rs/pull/3654) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Add modulus\_dyn and modulus\_scalar\_dyn [\#3649](https://github.com/apache/arrow-rs/pull/3649) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Improve error messge with 
detailed schema [\#3637](https://github.com/apache/arrow-rs/pull/3637) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Veeupup](https://github.com/Veeupup)) +- Improve error message with detailed schema [\#3637](https://github.com/apache/arrow-rs/pull/3637) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Veeupup](https://github.com/Veeupup)) - Add limit to ArrowReaderBuilder to push limit down to parquet reader [\#3633](https://github.com/apache/arrow-rs/pull/3633) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) - chore: delete wrong comment and refactor set\_metadata in `Field` [\#3630](https://github.com/apache/arrow-rs/pull/3630) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([chunshao90](https://github.com/chunshao90)) - Fix typo in comment [\#3627](https://github.com/apache/arrow-rs/pull/3627) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kjschiroo](https://github.com/kjschiroo)) @@ -245,7 +245,7 @@ - Nullif of NULL Predicate is not NULL [\#3589](https://github.com/apache/arrow-rs/issues/3589) - BooleanBufferBuilder Fails to Clear Set Bits On Truncate [\#3587](https://github.com/apache/arrow-rs/issues/3587) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `nullif` incorrectly calculates `null_count`, sometimes panics with substraction overflow error [\#3579](https://github.com/apache/arrow-rs/issues/3579) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `nullif` incorrectly calculates `null_count`, sometimes panics with subtraction overflow error [\#3579](https://github.com/apache/arrow-rs/issues/3579) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Meet warning when use pyarrow [\#3543](https://github.com/apache/arrow-rs/issues/3543) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Incorrect row group total\_byte\_size written to parquet file [\#3530](https://github.com/apache/arrow-rs/issues/3530) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - Overflow when casting timestamps prior to the epoch [\#3512](https://github.com/apache/arrow-rs/issues/3512) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] @@ -538,7 +538,7 @@ - Update prost-build requirement from =0.11.2 to =0.11.3 [\#3225](https://github.com/apache/arrow-rs/pull/3225) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) - Get the round result for decimal to a decimal with smaller scale [\#3224](https://github.com/apache/arrow-rs/pull/3224) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) - Move tests which require chrono-tz feature from `arrow-cast` to `arrow` [\#3222](https://github.com/apache/arrow-rs/pull/3222) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- add test cases for extracing week with/without timezone [\#3218](https://github.com/apache/arrow-rs/pull/3218) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) +- add test cases for extracting week with/without timezone [\#3218](https://github.com/apache/arrow-rs/pull/3218) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waitingkuo](https://github.com/waitingkuo)) - Use RegexSet for matching DataType [\#3217](https://github.com/apache/arrow-rs/pull/3217) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([askoa](https://github.com/askoa)) - Update tonic-build to 0.8.3 [\#3214](https://github.com/apache/arrow-rs/pull/3214) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) - Support StructArray in Row Format \(\#3159\) [\#3212](https://github.com/apache/arrow-rs/pull/3212) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) @@ -600,7 +600,7 @@ **Fixed bugs:** -- arithmatic overflow leads to segfault in `concat_batches` [\#3123](https://github.com/apache/arrow-rs/issues/3123) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arithmetic overflow leads to segfault in `concat_batches` [\#3123](https://github.com/apache/arrow-rs/issues/3123) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Clippy failing on master : error: use of deprecated associated function chrono::NaiveDate::from\_ymd: use from\_ymd\_opt\(\) instead [\#3097](https://github.com/apache/arrow-rs/issues/3097) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Pretty print for interval types has wrong formatting [\#3092](https://github.com/apache/arrow-rs/issues/3092) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Field is not serializable with binary formats [\#3082](https://github.com/apache/arrow-rs/issues/3082) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] @@ -1142,7 +1142,7 @@ - Use same codebase for boolean kernels [\#2507](https://github.com/apache/arrow-rs/issues/2507) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Use u8 for Decimal Precision and Scale [\#2496](https://github.com/apache/arrow-rs/issues/2496) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Integrate skip row without pageIndex in SerializedPageReader in Fuzz Test [\#2475](https://github.com/apache/arrow-rs/issues/2475) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Avoid unecessary copies in Arrow IPC reader [\#2437](https://github.com/apache/arrow-rs/issues/2437) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Avoid unnecessary copies in Arrow IPC reader [\#2437](https://github.com/apache/arrow-rs/issues/2437) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Add GenericColumnReader::skip\_records Missing OffsetIndex Fallback [\#2433](https://github.com/apache/arrow-rs/issues/2433) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - Support Reading PageIndex with ParquetRecordBatchStream [\#2430](https://github.com/apache/arrow-rs/issues/2430) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - Specialize FixedLenByteArrayReader for Parquet [\#2318](https://github.com/apache/arrow-rs/issues/2318) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] @@ -1151,11 +1151,11 @@ **Fixed bugs:** - Casting timestamp array to string should not ignore timezone [\#2607](https://github.com/apache/arrow-rs/issues/2607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Ilike\_ut8\_scalar kernals have incorrect logic [\#2544](https://github.com/apache/arrow-rs/issues/2544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Ilike\_ut8\_scalar kernels have incorrect logic [\#2544](https://github.com/apache/arrow-rs/issues/2544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Always validate the array data when creating array in IPC reader [\#2541](https://github.com/apache/arrow-rs/issues/2541) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Int96Converter Truncates Timestamps [\#2480](https://github.com/apache/arrow-rs/issues/2480) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - Error Reading Page Index When Not Available [\#2434](https://github.com/apache/arrow-rs/issues/2434) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `ParquetFileArrowReader::get_record_reader[_by_colum]` `batch_size` overallocates [\#2321](https://github.com/apache/arrow-rs/issues/2321) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `ParquetFileArrowReader::get_record_reader[_by_column]` `batch_size` overallocates [\#2321](https://github.com/apache/arrow-rs/issues/2321) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Documentation updates:** @@ -1197,7 +1197,7 @@ - Compare dictionary array with string array [\#2549](https://github.com/apache/arrow-rs/pull/2549) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) - Always validate the array data \(except the `Decimal`\) when creating array in IPC reader [\#2547](https://github.com/apache/arrow-rs/pull/2547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) - MINOR: Fix test\_row\_type\_validation test [\#2546](https://github.com/apache/arrow-rs/pull/2546) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fix ilike\_utf8\_scalar kernals [\#2545](https://github.com/apache/arrow-rs/pull/2545) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Fix ilike\_utf8\_scalar kernels [\#2545](https://github.com/apache/arrow-rs/pull/2545) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) - fix typo [\#2540](https://github.com/apache/arrow-rs/pull/2540) ([00Masato](https://github.com/00Masato)) - Compare dictionary array and primitive array in lt\_dyn, lt\_eq\_dyn, gt\_dyn, gt\_eq\_dyn kernels [\#2539](https://github.com/apache/arrow-rs/pull/2539) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) - \[MINOR\]Avoid large over allocate buffer in async reader [\#2537](https://github.com/apache/arrow-rs/pull/2537) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) @@ -1627,7 +1627,7 @@ - Incorrect `null_count` of DictionaryArray [\#1962](https://github.com/apache/arrow-rs/issues/1962) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Support multi diskRanges for ChunkReader [\#1955](https://github.com/apache/arrow-rs/issues/1955) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - Persisting Arrow timestamps with Parquet produces missing `TIMESTAMP` in schema [\#1920](https://github.com/apache/arrow-rs/issues/1920) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Sperate get\_next\_page\_header from get\_next\_page in PageReader [\#1834](https://github.com/apache/arrow-rs/issues/1834) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Separate get\_next\_page\_header from get\_next\_page in PageReader [\#1834](https://github.com/apache/arrow-rs/issues/1834) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Merged pull requests:** @@ -1684,7 +1684,7 @@ - `PrimitiveArray::from_iter` should omit validity buffer if all values are valid 
[\#1856](https://github.com/apache/arrow-rs/issues/1856) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Add `from(v: Vec>)` and `from(v: Vec<&[u8]>)` for `FixedSizedBInaryArray` [\#1852](https://github.com/apache/arrow-rs/issues/1852) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Add `Vec`-inspired APIs to `BufferBuilder` [\#1850](https://github.com/apache/arrow-rs/issues/1850) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- PyArrow intergation test for C Stream Interface [\#1847](https://github.com/apache/arrow-rs/issues/1847) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- PyArrow integration test for C Stream Interface [\#1847](https://github.com/apache/arrow-rs/issues/1847) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Add `nilike` support in `comparison` [\#1845](https://github.com/apache/arrow-rs/issues/1845) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Split up `arrow::array::builder` module [\#1843](https://github.com/apache/arrow-rs/issues/1843) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Add `quarter` support in `temporal` kernels [\#1835](https://github.com/apache/arrow-rs/issues/1835) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] @@ -2081,7 +2081,7 @@ **Fixed bugs:** -- Error Infering Schema for LogicalType::UNKNOWN [\#1557](https://github.com/apache/arrow-rs/issues/1557) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Error Inferring Schema for LogicalType::UNKNOWN [\#1557](https://github.com/apache/arrow-rs/issues/1557) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - Read dictionary from nested struct in ipc stream reader panics [\#1549](https://github.com/apache/arrow-rs/issues/1549) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - `filter` produces invalid sparse `UnionArray`s [\#1547](https://github.com/apache/arrow-rs/issues/1547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Documentation for `GenericListBuilder` is not exposed. [\#1518](https://github.com/apache/arrow-rs/issues/1518) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] @@ -2607,7 +2607,7 @@ * [094037d418381584178db1d886cad3b5024b414a](https://github.com/apache/arrow-rs/commit/094037d418381584178db1d886cad3b5024b414a) Update comfy-table to 5.0 ([#957](https://github.com/apache/arrow-rs/pull/957)) ([#964](https://github.com/apache/arrow-rs/pull/964)) * [9f635021eee6786c5377c891218c5f88ebce07c3](https://github.com/apache/arrow-rs/commit/9f635021eee6786c5377c891218c5f88ebce07c3) Fix csv writing of timestamps to show timezone. 
([#849](https://github.com/apache/arrow-rs/pull/849)) ([#963](https://github.com/apache/arrow-rs/pull/963)) * [f7deba4c3a050a52608462ee8a827bb8f6364140](https://github.com/apache/arrow-rs/commit/f7deba4c3a050a52608462ee8a827bb8f6364140) Adding ability to parse float from number with leading decimal ([#831](https://github.com/apache/arrow-rs/pull/831)) ([#962](https://github.com/apache/arrow-rs/pull/962)) -* [59f96e842d05b63882f7ba285c66a9739761cf84](https://github.com/apache/arrow-rs/commit/59f96e842d05b63882f7ba285c66a9739761cf84) add ilike comparitor ([#874](https://github.com/apache/arrow-rs/pull/874)) ([#961](https://github.com/apache/arrow-rs/pull/961)) +* [59f96e842d05b63882f7ba285c66a9739761cf84](https://github.com/apache/arrow-rs/commit/59f96e842d05b63882f7ba285c66a9739761cf84) add ilike comparator ([#874](https://github.com/apache/arrow-rs/pull/874)) ([#961](https://github.com/apache/arrow-rs/pull/961)) * [54023c8a5543c9f9fa4955afa01189029f3e96f5](https://github.com/apache/arrow-rs/commit/54023c8a5543c9f9fa4955afa01189029f3e96f5) Remove unpassable cargo publish check from verify-release-candidate.sh ([#882](https://github.com/apache/arrow-rs/pull/882)) ([#949](https://github.com/apache/arrow-rs/pull/949)) @@ -2704,7 +2704,7 @@ **Fixed bugs:** - Converting from string to timestamp uses microseconds instead of milliseconds [\#780](https://github.com/apache/arrow-rs/issues/780) -- Document has no link to `RowColumIter` [\#762](https://github.com/apache/arrow-rs/issues/762) +- Document has no link to `RowColumnIter` [\#762](https://github.com/apache/arrow-rs/issues/762) - length on slices with null doesn't work [\#744](https://github.com/apache/arrow-rs/issues/744) ## [5.4.0](https://github.com/apache/arrow-rs/tree/5.4.0) (2021-09-10) @@ -2762,7 +2762,7 @@ - Remove undefined behavior in `value` method of boolean and primitive arrays [\#645](https://github.com/apache/arrow-rs/issues/645) - Avoid materialization of indices in filter\_record\_batch for single arrays [\#636](https://github.com/apache/arrow-rs/issues/636) - Add a note about arrow crate security / safety [\#627](https://github.com/apache/arrow-rs/issues/627) -- Allow the creation of String arrays from an interator of &Option\<&str\> [\#598](https://github.com/apache/arrow-rs/issues/598) +- Allow the creation of String arrays from an iterator of &Option\<&str\> [\#598](https://github.com/apache/arrow-rs/issues/598) - Support arrow map datatype [\#395](https://github.com/apache/arrow-rs/issues/395) **Fixed bugs:** @@ -2891,7 +2891,7 @@ - Add C data interface for decimal128 and timestamp [\#453](https://github.com/apache/arrow-rs/pull/453) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alippai](https://github.com/alippai)) - Implement the Iterator trait for the json Reader. 
[\#451](https://github.com/apache/arrow-rs/pull/451) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([LaurentMazare](https://github.com/LaurentMazare)) - Update release docs + release email template [\#450](https://github.com/apache/arrow-rs/pull/450) ([alamb](https://github.com/alamb)) -- remove clippy unnecessary wraps suppresions in cast kernel [\#449](https://github.com/apache/arrow-rs/pull/449) ([Jimexist](https://github.com/Jimexist)) +- remove clippy unnecessary wraps suppression in cast kernel [\#449](https://github.com/apache/arrow-rs/pull/449) ([Jimexist](https://github.com/Jimexist)) - Use partition for bool sort [\#448](https://github.com/apache/arrow-rs/pull/448) ([Jimexist](https://github.com/Jimexist)) - remove unnecessary wraps in sort [\#445](https://github.com/apache/arrow-rs/pull/445) ([Jimexist](https://github.com/Jimexist)) - Python FFI bridge for Schema, Field and DataType [\#439](https://github.com/apache/arrow-rs/pull/439) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kszucs](https://github.com/kszucs)) @@ -2964,7 +2964,7 @@ - ARROW-12504: Buffer::from\_slice\_ref set correct capacity [\#18](https://github.com/apache/arrow-rs/pull/18) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Add GitHub templates [\#17](https://github.com/apache/arrow-rs/pull/17) ([andygrove](https://github.com/andygrove)) - ARROW-12493: Add support for writing dictionary arrays to CSV and JSON [\#16](https://github.com/apache/arrow-rs/pull/16) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- ARROW-12426: \[Rust\] Fix concatentation of arrow dictionaries [\#15](https://github.com/apache/arrow-rs/pull/15) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- ARROW-12426: \[Rust\] Fix concatenation of arrow dictionaries [\#15](https://github.com/apache/arrow-rs/pull/15) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Update repository and homepage urls [\#14](https://github.com/apache/arrow-rs/pull/14) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Dandandan](https://github.com/Dandandan)) - Added rebase-needed bot [\#13](https://github.com/apache/arrow-rs/pull/13) ([jorgecarleitao](https://github.com/jorgecarleitao)) - Added Integration tests against arrow [\#10](https://github.com/apache/arrow-rs/pull/10) ([jorgecarleitao](https://github.com/jorgecarleitao)) @@ -3108,7 +3108,7 @@ - Support sort [\#215](https://github.com/apache/arrow-rs/issues/215) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Support stable Rust [\#214](https://github.com/apache/arrow-rs/issues/214) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Remove Rust and point integration tests to arrow-rs repo [\#211](https://github.com/apache/arrow-rs/issues/211) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- ArrayData buffers are inconsistent accross implementations [\#207](https://github.com/apache/arrow-rs/issues/207) +- ArrayData buffers are inconsistent across implementations [\#207](https://github.com/apache/arrow-rs/issues/207) - 3.0.1 patch release [\#204](https://github.com/apache/arrow-rs/issues/204) - Document patch release process [\#202](https://github.com/apache/arrow-rs/issues/202) - 
Simplify Offset [\#186](https://github.com/apache/arrow-rs/issues/186) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 8961d606e4f7..8b314596d959 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -182,7 +182,7 @@ impl GenericListArray { impl From for GenericListArray { fn from(data: ArrayData) -> Self { Self::try_new_from_array_data(data).expect( - "Expected infallable creation of GenericListArray from ArrayDataRef failed", + "Expected infallible creation of GenericListArray from ArrayDataRef failed", ) } } diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index fd4e2bd593e4..3d78387cdf50 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -106,7 +106,7 @@ impl MapArray { impl From for MapArray { fn from(data: ArrayData) -> Self { Self::try_new_from_array_data(data) - .expect("Expected infallable creation of MapArray from ArrayData failed") + .expect("Expected infallible creation of MapArray from ArrayData failed") } } diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index ead8b3b99d46..589cf1eaf4aa 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -266,7 +266,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { #[allow(deprecated)] // (#3880) fn get_array_memory_size(&self) -> usize { // both data.get_array_memory_size and size_of_val(self) include ArrayData fields, - // to only count additional fields of this array substract size_of(ArrayData) + // to only count additional fields of this array subtract size_of(ArrayData) self.data_ref().get_array_memory_size() + std::mem::size_of_val(self) - std::mem::size_of::() } @@ -964,7 +964,7 @@ mod tests { let empty = PrimitiveArray::::from(ArrayData::new_empty(arr.data_type())); - // substract empty array to avoid magic numbers for the size of additional fields + // subtract empty array to avoid magic numbers for the size of additional fields assert_eq!( arr.get_array_memory_size() - empty.get_array_memory_size(), 128 * std::mem::size_of::() @@ -993,7 +993,7 @@ mod tests { empty_with_bitmap.get_array_memory_size() ); - // substract empty array to avoid magic numbers for the size of additional fields + // subtract empty array to avoid magic numbers for the size of additional fields // the size of the validity bitmap is rounded up to 64 bytes assert_eq!( arr.get_array_memory_size() - empty_with_bitmap.get_array_memory_size(), diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index c3c5269374f1..ada34b47f8a5 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -202,7 +202,7 @@ impl RunArray { // to iterate `logical_indices` in sorted order. 
let mut ordered_indices: Vec = (0..indices_len).collect(); - // Instead of sorting `logical_idices` directly, sort the `ordered_indices` + // Instead of sorting `logical_indices` directly, sort the `ordered_indices` // whose values are index of `logical_indices` ordered_indices.sort_unstable_by(|lhs, rhs| { logical_indices[*lhs] diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index f339a616f300..304f0ab3eee9 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -614,7 +614,7 @@ mod tests { #[test] #[should_panic(expected = "The child array cannot contain null values.")] - fn test_stirng_array_from_list_array_with_child_nulls_failed() { + fn test_string_array_from_list_array_with_child_nulls_failed() { _test_generic_string_array_from_list_array_with_child_nulls_failed::(); } diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 335b6b14f8a3..67848b4a85cb 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -957,7 +957,7 @@ mod tests { } #[test] - fn test_union_array_validaty() { + fn test_union_array_validity() { let mut builder = UnionBuilder::new_sparse(); builder.append::("a", 1).unwrap(); builder.append_null::("a").unwrap(); diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index feb9167b2981..21993114ea7d 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -396,7 +396,7 @@ macro_rules! downcast_primitive_array { /// /// let arr: ArrayRef = Arc::new(Int32Array::from(vec![Some(1)])); /// -/// // Downcast an `ArrayRef` to Int32Array / PrimiveArray: +/// // Downcast an `ArrayRef` to Int32Array / PrimitiveArray: /// let primitive_array: &Int32Array = as_primitive_array(&arr); /// /// // Equivalently: diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 60a632a060d7..827729ca682e 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -517,7 +517,7 @@ impl Date32Type { /// # Arguments /// /// * `date` - The date on which to perform the operation - /// * `delta` - The interval to substract + /// * `delta` - The interval to subtract pub fn subtract_year_months( date: ::Native, delta: ::Native, @@ -641,7 +641,7 @@ impl Date64Type { /// # Arguments /// /// * `date` - The date on which to perform the operation - /// * `delta` - The interval to substract + /// * `delta` - The interval to subtract pub fn subtract_year_months( date: ::Native, delta: ::Native, diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 894c113aefc9..7fecc1ad92b4 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -183,7 +183,7 @@ fn infer_file_schema_with_csv_options( /// `max_read_records` controlling the maximum number of records to read. If `max_read_records` is /// not set, all records are read to infer the schema. /// -/// Return infered schema and number of records used for inference. +/// Return inferred schema and number of records used for inference. pub fn infer_reader_schema( reader: R, delimiter: u8, @@ -287,7 +287,7 @@ fn infer_reader_schema_with_csv_options( /// Infer schema from a list of CSV files by reading through first n records /// with `max_read_records` controlling the maximum number of records to read. /// -/// Files will be read in the given order untill n records have been reached. +/// Files will be read in the given order until n records have been reached. 
/// /// If `max_read_records` is not set, all files will be read fully to infer the schema. pub fn infer_schema_from_files( @@ -1048,14 +1048,14 @@ impl ReaderBuilder { } /// Set the datetime regex used to parse the string to Date64Type - /// this regex is used while infering schema + /// this regex is used while inferring schema pub fn with_datetime_re(mut self, datetime_re: Regex) -> Self { self.datetime_re = Some(datetime_re); self } - /// Set the datetime fromat used to parse the string to Date64Type - /// this fromat is used while when the schema wants to parse Date64Type. + /// Set the datetime format used to parse the string to Date64Type + /// this format is used while when the schema wants to parse Date64Type. /// /// For format refer to [chrono docs](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html) /// diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index 581d4a10cc1c..10bf973065a0 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -180,7 +180,7 @@ pub(crate) fn into_buffers( /// # Memory Layout /// /// `ArrayData` has references to one or more underlying data buffers -/// and optional child ArrayDatas, depending on type as illustrated +/// and optional child ArrayData, depending on type as illustrated /// below. Bitmaps are not shown for simplicity but they are stored /// similarly to the buffers. /// diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index 52ce5ead725c..c74875072233 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -418,7 +418,7 @@ impl<'a> MutableArrayData<'a> { | DataType::Interval(_) | DataType::FixedSizeBinary(_) => vec![], DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => { - let childs = arrays + let children = arrays .iter() .map(|array| &array.child_data()[0]) .collect::>(); @@ -435,7 +435,7 @@ impl<'a> MutableArrayData<'a> { }; vec![MutableArrayData::with_capacities( - childs, use_nulls, capacities, + children, use_nulls, capacities, )] } // the dictionary type just appends keys and clones the values. @@ -495,11 +495,11 @@ impl<'a> MutableArrayData<'a> { ] } DataType::FixedSizeList(_, _) => { - let childs = arrays + let children = arrays .iter() .map(|array| &array.child_data()[0]) .collect::>(); - vec![MutableArrayData::new(childs, use_nulls, array_capacity)] + vec![MutableArrayData::new(children, use_nulls, array_capacity)] } DataType::Union(fields, _) => (0..fields.len()) .map(|i| { diff --git a/arrow-flight/src/client.rs b/arrow-flight/src/client.rs index fe1292fcff6e..f843bbf7cd0c 100644 --- a/arrow-flight/src/client.rs +++ b/arrow-flight/src/client.rs @@ -391,7 +391,7 @@ impl FlightClient { } /// Make a `ListFlights` call to the server with the provided - /// critera and returning a [`Stream`](futures::Stream) of [`FlightInfo`]. + /// criteria and returning a [`Stream`](futures::Stream) of [`FlightInfo`]. /// /// # Example: /// ```no_run diff --git a/arrow-flight/src/error.rs b/arrow-flight/src/error.rs index 5524dd1a4654..e054883e965d 100644 --- a/arrow-flight/src/error.rs +++ b/arrow-flight/src/error.rs @@ -30,7 +30,7 @@ pub enum FlightError { Tonic(tonic::Status), /// Some unexpected message was received ProtocolError(String), - /// An error occured during decoding + /// An error occurred during decoding DecodeError(String), /// External error that can provide source of error by calling `Error::source`. 
ExternalError(Box), diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index 2841d4bf5edb..ec86fbcc0bdf 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -38,7 +38,7 @@ async fn test_empty() { #[tokio::test] async fn test_empty_batch() { - let batch = make_primative_batch(5); + let batch = make_primitive_batch(5); let empty = RecordBatch::new_empty(batch.schema()); roundtrip(vec![empty]).await; } @@ -59,13 +59,13 @@ async fn test_error() { } #[tokio::test] -async fn test_primative_one() { - roundtrip(vec![make_primative_batch(5)]).await; +async fn test_primitive_one() { + roundtrip(vec![make_primitive_batch(5)]).await; } #[tokio::test] async fn test_schema_metadata() { - let batch = make_primative_batch(5); + let batch = make_primitive_batch(5); let metadata = HashMap::from([("some_key".to_owned(), "some_value".to_owned())]); // create a batch that has schema level metadata @@ -76,18 +76,18 @@ async fn test_schema_metadata() { } #[tokio::test] -async fn test_primative_many() { +async fn test_primitive_many() { roundtrip(vec![ - make_primative_batch(1), - make_primative_batch(7), - make_primative_batch(32), + make_primitive_batch(1), + make_primitive_batch(7), + make_primitive_batch(32), ]) .await; } #[tokio::test] -async fn test_primative_empty() { - let batch = make_primative_batch(5); +async fn test_primitive_empty() { + let batch = make_primitive_batch(5); let empty = RecordBatch::new_empty(batch.schema()); roundtrip(vec![batch, empty]).await; @@ -137,7 +137,7 @@ async fn test_zero_batches_schema_specified() { } #[tokio::test] -async fn test_zero_batches_dictonary_schema_specified() { +async fn test_zero_batches_dictionary_schema_specified() { let schema = Arc::new(Schema::new(vec![ Field::new("a", DataType::Int64, false), Field::new( @@ -166,7 +166,7 @@ async fn test_zero_batches_dictonary_schema_specified() { #[tokio::test] async fn test_app_metadata() { - let input_batch_stream = futures::stream::iter(vec![Ok(make_primative_batch(78))]); + let input_batch_stream = futures::stream::iter(vec![Ok(make_primitive_batch(78))]); let app_metadata = Bytes::from("My Metadata"); let encoder = FlightDataEncoderBuilder::default().with_metadata(app_metadata.clone()); @@ -196,7 +196,7 @@ async fn test_app_metadata() { #[tokio::test] async fn test_max_message_size() { - let input_batch_stream = futures::stream::iter(vec![Ok(make_primative_batch(5))]); + let input_batch_stream = futures::stream::iter(vec![Ok(make_primitive_batch(5))]); // 5 input rows, with a very small limit should result in 5 batch messages let encoder = FlightDataEncoderBuilder::default().with_max_flight_data_size(1); @@ -223,13 +223,13 @@ async fn test_max_message_size_fuzz() { // send through batches of varying sizes with various max // batch sizes and ensure the data gets through ok let input = vec![ - make_primative_batch(123), - make_primative_batch(17), - make_primative_batch(201), - make_primative_batch(2), - make_primative_batch(1), - make_primative_batch(11), - make_primative_batch(127), + make_primitive_batch(123), + make_primitive_batch(17), + make_primitive_batch(201), + make_primitive_batch(2), + make_primitive_batch(1), + make_primitive_batch(11), + make_primitive_batch(127), ]; for max_message_size_bytes in [10, 1024, 2048, 6400, 3211212] { @@ -257,7 +257,7 @@ async fn test_max_message_size_fuzz() { async fn test_mismatched_record_batch_schema() { // send 2 batches with different schemas let input_batch_stream = 
futures::stream::iter(vec![ - Ok(make_primative_batch(5)), + Ok(make_primitive_batch(5)), Ok(make_dictionary_batch(3)), ]); @@ -274,7 +274,7 @@ async fn test_mismatched_record_batch_schema() { #[tokio::test] async fn test_chained_streams_batch_decoder() { - let batch1 = make_primative_batch(5); + let batch1 = make_primitive_batch(5); let batch2 = make_dictionary_batch(3); // Model sending two flight streams back to back, with different schemas @@ -299,7 +299,7 @@ async fn test_chained_streams_batch_decoder() { #[tokio::test] async fn test_chained_streams_data_decoder() { - let batch1 = make_primative_batch(5); + let batch1 = make_primitive_batch(5); let batch2 = make_dictionary_batch(3); // Model sending two flight streams back to back, with different schemas @@ -363,27 +363,27 @@ async fn test_mismatched_schema_message() { // primitive batch first (has more columns) do_test( - make_primative_batch(5), + make_primitive_batch(5), make_dictionary_batch(3), "Error decoding ipc RecordBatch: Io error: Invalid data for schema", ) .await; - // dictioanry batch first + // dictionary batch first do_test( make_dictionary_batch(3), - make_primative_batch(5), + make_primitive_batch(5), "Error decoding ipc RecordBatch: Invalid argument error", ) .await; } -/// Make a primtive batch for testing +/// Make a primitive batch for testing /// /// Example: /// i: 0, 1, None, 3, 4 /// f: 5.0, 4.0, None, 2.0, 1.0 -fn make_primative_batch(num_rows: usize) -> RecordBatch { +fn make_primitive_batch(num_rows: usize) -> RecordBatch { let i: UInt8Array = (0..num_rows) .map(|i| { if i == num_rows / 2 { diff --git a/arrow-integration-testing/tests/ipc_reader.rs b/arrow-integration-testing/tests/ipc_reader.rs index d6e81cd9883b..9205f4318393 100644 --- a/arrow-integration-testing/tests/ipc_reader.rs +++ b/arrow-integration-testing/tests/ipc_reader.rs @@ -96,7 +96,7 @@ fn read_1_0_0_bigendian() { FileReader::try_new(file, None).unwrap(); // While the the reader doesn't error but the values are not - // read correctly on little endian platforms so verifing the + // read correctly on little endian platforms so verifying the // contents fails // // https://github.com/apache/arrow-rs/issues/3459 diff --git a/arrow-ipc/src/compression.rs b/arrow-ipc/src/compression.rs index e6e203bc0034..dd60bfdeec66 100644 --- a/arrow-ipc/src/compression.rs +++ b/arrow-ipc/src/compression.rs @@ -98,7 +98,7 @@ impl CompressionCodec { // compressed let decompressed_length = read_uncompressed_size(input); let buffer = if decompressed_length == 0 { - // emtpy + // empty Buffer::from([]) } else if decompressed_length == LENGTH_NO_COMPRESSED_DATA { // no compression diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 334b9f65627b..cc2a7786c3ff 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -161,7 +161,7 @@ pub fn try_schema_from_flatbuffer_bytes(bytes: &[u8]) -> Result Result { // There are two protocol types: https://issues.apache.org/jira/browse/ARROW-6313 - // The original protocal is: + // The original protocol is: // 4 bytes - the byte length of the payload // a flatbuffer Message whose header is the Schema // The latest version of protocol is: diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index df6b998bee04..f5bf884fb2ca 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -672,7 +672,7 @@ impl Decoder { } /// Read the next batch of [`serde_json::Value`] records from the - /// interator into a [`RecordBatch`]. + /// iterator into a [`RecordBatch`]. 
/// /// Returns `None` if the input iterator is exhausted. pub fn next_batch( @@ -2172,7 +2172,7 @@ mod tests { } #[test] - fn test_coersion_scalar_and_list() { + fn test_coercion_scalar_and_list() { use arrow_schema::DataType::*; assert_eq!( diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 5d8abfafc4b1..1b950f794275 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -1233,7 +1233,7 @@ mod tests { let expected = read_to_string(test_file).unwrap(); for (r, e) in result.lines().zip(expected.lines()) { let mut expected_json = serde_json::from_str::(e).unwrap(); - // remove null value from object to make comparision consistent: + // remove null value from object to make comparison consistent: if let Value::Object(obj) = expected_json { expected_json = Value::Object( obj.into_iter().filter(|(_, v)| *v != Value::Null).collect(), @@ -1424,7 +1424,7 @@ mod tests { let expected = read_to_string(test_file).unwrap(); for (r, e) in result.lines().zip(expected.lines()) { let mut expected_json = serde_json::from_str::(e).unwrap(); - // remove null value from object to make comparision consistent: + // remove null value from object to make comparison consistent: if let Value::Object(obj) = expected_json { expected_json = Value::Object( obj.into_iter().filter(|(_, v)| *v != Value::Null).collect(), @@ -1468,7 +1468,7 @@ mod tests { let expected = format!("{expected}\n{expected}"); for (r, e) in result.lines().zip(expected.lines()) { let mut expected_json = serde_json::from_str::(e).unwrap(); - // remove null value from object to make comparision consistent: + // remove null value from object to make comparison consistent: if let Value::Object(obj) = expected_json { expected_json = Value::Object( obj.into_iter().filter(|(_, v)| *v != Value::Null).collect(), diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 6e0becc36c67..b4f498813a82 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -383,7 +383,7 @@ pub fn sort_to_indices( DataType::Int64 => sort_run_to_indices::(values, &options, limit), dt => { return Err(ArrowError::ComputeError(format!( - "Inavlid run end data type: {dt}" + "Invalid run end data type: {dt}" ))) } }, @@ -771,7 +771,7 @@ where // Call the consumer using the run length and starting logical index. 
for physical_index in values_indices.values() { // As the values were sliced with offset = start_physical_index, it has to be added back - // before accesing `RunArray::run_ends` + // before accessing `RunArray::run_ends` let physical_index = *physical_index as usize + start_physical_index; // calculate the run length and logical index of sorted values diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 83fe1bb56f35..2befcd05447a 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -36,7 +36,7 @@ use num::{ToPrimitive, Zero}; /// │ A │ │ 0 │ │ A │ /// ├─────────────────┤ ├─────────┤ ├─────────────────┤ /// │ D │ │ 2 │ │ B │ -/// ├─────────────────┤ ├─────────┤ take(values, indicies) ├─────────────────┤ +/// ├─────────────────┤ ├─────────┤ take(values, indices) ├─────────────────┤ /// │ B │ │ 3 │ ─────────────────────────▶ │ C │ /// ├─────────────────┤ ├─────────┤ ├─────────────────┤ /// │ C │ │ 1 │ │ D │ diff --git a/arrow/examples/README.md b/arrow/examples/README.md index 314ce9c620f1..7ec3b008b768 100644 --- a/arrow/examples/README.md +++ b/arrow/examples/README.md @@ -22,6 +22,6 @@ - [`builders.rs`](builders.rs): Using the Builder API - [`collect.rs`](collect.rs): Using the `FromIter` API - [`dynamic_types.rs`](dynamic_types.rs): -- [`read_csv.rs`](read_csv.rs): Reading CSV files with explict schema, pretty printing Arrays +- [`read_csv.rs`](read_csv.rs): Reading CSV files with explicit schema, pretty printing Arrays - [`read_csv_infer_schema.rs`](read_csv_infer_schema.rs): Reading CSV files, pretty printing Arrays - [`tensor_builder.rs`](tensor_builder.rs): Using tensor builder diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index b8199031796e..9bdc24783736 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -179,7 +179,7 @@ pub fn create_primitive_run_array( } /// Create string array to be used by run array builder. 
The string array -/// will result in run array with physial length of `physical_array_len` +/// will result in run array with physical length of `physical_array_len` /// and logical length of `logical_array_len` pub fn create_string_array_for_runs( physical_array_len: usize, diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 29e7420f10be..0b0a06875432 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -78,7 +78,7 @@ pub fn create_random_array( UInt64 => Arc::new(create_primitive_array::(size, null_density)), Float16 => { return Err(ArrowError::NotYetImplemented( - "Float16 is not implememted".to_string(), + "Float16 is not implemented".to_string(), )) } Float32 => Arc::new(create_primitive_array::(size, null_density)), diff --git a/dev/release/README.md b/dev/release/README.md index 11bcbe866e32..c74d7d865dd8 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -79,7 +79,7 @@ git commit -a -m 'Update version' export ARROW_GITHUB_API_TOKEN= -# manully edit ./dev/release/update_change_log.sh to reflect the release version +# manually edit ./dev/release/update_change_log.sh to reflect the release version # create the changelog ./dev/release/update_change_log.sh @@ -94,7 +94,7 @@ python dev/release/label_issues.py git commit -a -m 'Create changelog' -# Manully edit ./dev/release/update_change_log.sh to reflect the release version +# Manually edit ./dev/release/update_change_log.sh to reflect the release version # Create the changelog CHANGELOG_GITHUB_TOKEN= ./dev/release/update_change_log.sh # Review change log / edit issues and labels if needed, rerun @@ -227,7 +227,7 @@ Rust Arrow Crates: ./object_store/dev/release/release-tarball.sh 4.1.0 2 ``` -Congratulations! The release is now offical! +Congratulations! The release is now official! 
### Publish on Crates.io diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index 58fb8a3b9deb..19a2766d2005 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -217,6 +217,6 @@ - Do not pretend to cache rust build artifacts, speed up CI by ~20% [\#2150](https://github.com/apache/arrow-rs/pull/2150) ([alamb](https://github.com/alamb)) - Port `object_store` integration tests, use github actions [\#2148](https://github.com/apache/arrow-rs/pull/2148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) - Port Add stream upload \(multi-part upload\) [\#2147](https://github.com/apache/arrow-rs/pull/2147) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -- Increase upper wait time to reduce flakyness of object store test [\#2142](https://github.com/apache/arrow-rs/pull/2142) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) +- Increase upper wait time to reduce flakiness of object store test [\#2142](https://github.com/apache/arrow-rs/pull/2142) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index c5a5652ab4d1..494303dffd35 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -436,7 +436,7 @@ fn to_list_result(value: ListResultInternal, prefix: Option<&str>) -> Result 0 && obj.location.as_ref().len() > prefix.as_ref().len() { Some(obj) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 9507967836f1..ba322e29d868 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -2642,7 +2642,7 @@ mod tests { } #[test] - fn test_arbitary_decimal() { + fn test_arbitrary_decimal() { let values = [1, 2, 3, 4, 5, 6, 7, 8]; let decimals_19_0 = Decimal128Array::from_iter_values(values) .with_precision_and_scale(19, 0) diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index d3abf968b3b2..76f950620688 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -1116,7 +1116,7 @@ mod tests { RowSelector::select(5), // Skip full page past page boundary RowSelector::skip(12), - // Select to final page bounday + // Select to final page boundary RowSelector::select(12), RowSelector::skip(1), // Skip across final page boundary diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 266c0436bb2c..ec1d4a07ae68 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -2023,7 +2023,7 @@ mod tests { } #[test] - fn test_column_order_get_coverted_type_sort_order() { + fn test_column_order_get_converted_type_sort_order() { // Helper to check the order in a list of values. // Only converted type is checked. 
fn check_sort_order(types: Vec, expected_order: SortOrder) { diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index 4c4057e7a77c..f1831ed48444 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -726,7 +726,7 @@ mod lz4_hadoop_codec { } Err(e) if !self.backward_compatible_lz4 => Err(e.into()), // Fallback done to be backward compatible with older versions of this - // libray and older versions of parquet-cpp. + // library and older versions of parquet-cpp. Err(_) => { // Truncate any inserted element before tryingg next algorithm. output_buf.truncate(output_len); diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 48ee7f89fc5d..2e7f73bf0a4f 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -106,7 +106,7 @@ pub struct ByteArray { data: Option, } -// Special case Debug that prints out byte arrays that are vaid utf8 as &str's +// Special case Debug that prints out byte arrays that are valid utf8 as &str's impl std::fmt::Debug for ByteArray { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut debug_struct = f.debug_struct("ByteArray"); @@ -259,7 +259,7 @@ impl fmt::Display for ByteArray { /// types, although there are code paths in the Rust (and potentially the C++) versions that /// warrant this. /// -/// With this wrapper type the compiler generates more targetted code paths matching the higher +/// With this wrapper type the compiler generates more targeted code paths matching the higher /// level logical types, removing the data-hazard from all decoding and encoding paths. #[repr(transparent)] #[derive(Clone, Debug, Default)] diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index e5ed26e9e812..2ddbf0f7c29b 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1327,7 +1327,7 @@ mod tests { // parquet-tools column-index ./data_index_bloom_encoding_stats.parquet // row group 0: // column index for column String: - // Boudary order: ASCENDING + // Boundary order: ASCENDING // page-0 : // null count min max // 0 Hello today diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index 49fdc3fc71d4..1809e3ace889 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -133,7 +133,7 @@ pub trait RowAccessor { fn get_map(&self, i: usize) -> Result<&Map>; } -/// Trait for formating fields within a Row. +/// Trait for formatting fields within a Row. /// /// # Examples /// diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index eb16c13f6ffe..b7298a45b2e8 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -611,7 +611,7 @@ impl fmt::Display for Reader { // ---------------------------------------------------------------------- // Row iterators -/// The enum Either with variants That represet a reference and a box of +/// The enum Either with variants That represents a reference and a box of /// [`FileReader`](crate::file::reader::FileReader). 
enum Either<'a> { Left(&'a dyn FileReader), diff --git a/parquet/src/record/triplet.rs b/parquet/src/record/triplet.rs index b7318b3d3ac6..14a4a39454fd 100644 --- a/parquet/src/record/triplet.rs +++ b/parquet/src/record/triplet.rs @@ -496,7 +496,7 @@ mod tests { } } - // Check values of a selectd column in a file + // Check values of a selected column in a file fn test_column_in_file( file_name: &str, batch_size: usize, From 569143361e694960ba1c7b29a31aa60de0021fbd Mon Sep 17 00:00:00 2001 From: Yang Xiufeng Date: Fri, 31 Mar 2023 19:00:39 +0800 Subject: [PATCH 0767/1411] fix: remove unused type parameters. (#3986) receive warning from cargo clippy. --- arrow-ord/src/ord.rs | 9 ++--- arrow-ord/src/sort.rs | 88 +++++++++++++++---------------------------- 2 files changed, 34 insertions(+), 63 deletions(-) diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index 66058907f15a..db1fff6d3e2f 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -45,10 +45,7 @@ fn compare_boolean(left: &dyn Array, right: &dyn Array) -> DynComparator { Box::new(move |i, j| left.value(i).cmp(&right.value(j))) } -fn compare_string(left: &dyn Array, right: &dyn Array) -> DynComparator -where - T: OffsetSizeTrait, -{ +fn compare_string(left: &dyn Array, right: &dyn Array) -> DynComparator { let left: StringArray = StringArray::from(left.data().clone()); let right: StringArray = StringArray::from(right.data().clone()); @@ -229,8 +226,8 @@ pub fn build_compare( (Duration(Nanosecond), Duration(Nanosecond)) => { compare_primitives::(left, right) } - (Utf8, Utf8) => compare_string::(left, right), - (LargeUtf8, LargeUtf8) => compare_string::(left, right), + (Utf8, Utf8) => compare_string(left, right), + (LargeUtf8, LargeUtf8) => compare_string(left, right), ( Dictionary(key_type_lhs, value_type_lhs), Dictionary(key_type_rhs, value_type_rhs), diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index b4f498813a82..7661479291c6 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -271,62 +271,38 @@ pub fn sort_to_indices( } DataType::Utf8 => sort_string::(values, v, n, &options, limit), DataType::LargeUtf8 => sort_string::(values, v, n, &options, limit), - DataType::List(field) | DataType::FixedSizeList(field, _) => match field - .data_type() - { - DataType::Int8 => sort_list::(values, v, n, &options, limit), - DataType::Int16 => sort_list::(values, v, n, &options, limit), - DataType::Int32 => sort_list::(values, v, n, &options, limit), - DataType::Int64 => sort_list::(values, v, n, &options, limit), - DataType::UInt8 => sort_list::(values, v, n, &options, limit), - DataType::UInt16 => { - sort_list::(values, v, n, &options, limit) - } - DataType::UInt32 => { - sort_list::(values, v, n, &options, limit) - } - DataType::UInt64 => { - sort_list::(values, v, n, &options, limit) - } - DataType::Float16 => { - sort_list::(values, v, n, &options, limit) - } - DataType::Float32 => { - sort_list::(values, v, n, &options, limit) - } - DataType::Float64 => { - sort_list::(values, v, n, &options, limit) - } - t => { - return Err(ArrowError::ComputeError(format!( - "Sort not supported for list type {t:?}" - ))); + DataType::List(field) | DataType::FixedSizeList(field, _) => { + match field.data_type() { + DataType::Int8 => sort_list::(values, v, n, &options, limit), + DataType::Int16 => sort_list::(values, v, n, &options, limit), + DataType::Int32 => sort_list::(values, v, n, &options, limit), + DataType::Int64 => sort_list::(values, v, n, &options, limit), + DataType::UInt8 => sort_list::(values, v, n, 
&options, limit), + DataType::UInt16 => sort_list::(values, v, n, &options, limit), + DataType::UInt32 => sort_list::(values, v, n, &options, limit), + DataType::UInt64 => sort_list::(values, v, n, &options, limit), + DataType::Float16 => sort_list::(values, v, n, &options, limit), + DataType::Float32 => sort_list::(values, v, n, &options, limit), + DataType::Float64 => sort_list::(values, v, n, &options, limit), + t => { + return Err(ArrowError::ComputeError(format!( + "Sort not supported for list type {t:?}" + ))); + } } - }, + } DataType::LargeList(field) => match field.data_type() { - DataType::Int8 => sort_list::(values, v, n, &options, limit), - DataType::Int16 => sort_list::(values, v, n, &options, limit), - DataType::Int32 => sort_list::(values, v, n, &options, limit), - DataType::Int64 => sort_list::(values, v, n, &options, limit), - DataType::UInt8 => sort_list::(values, v, n, &options, limit), - DataType::UInt16 => { - sort_list::(values, v, n, &options, limit) - } - DataType::UInt32 => { - sort_list::(values, v, n, &options, limit) - } - DataType::UInt64 => { - sort_list::(values, v, n, &options, limit) - } - DataType::Float16 => { - sort_list::(values, v, n, &options, limit) - } - DataType::Float32 => { - sort_list::(values, v, n, &options, limit) - } - DataType::Float64 => { - sort_list::(values, v, n, &options, limit) - } + DataType::Int8 => sort_list::(values, v, n, &options, limit), + DataType::Int16 => sort_list::(values, v, n, &options, limit), + DataType::Int32 => sort_list::(values, v, n, &options, limit), + DataType::Int64 => sort_list::(values, v, n, &options, limit), + DataType::UInt8 => sort_list::(values, v, n, &options, limit), + DataType::UInt16 => sort_list::(values, v, n, &options, limit), + DataType::UInt32 => sort_list::(values, v, n, &options, limit), + DataType::UInt64 => sort_list::(values, v, n, &options, limit), + DataType::Float16 => sort_list::(values, v, n, &options, limit), + DataType::Float32 => sort_list::(values, v, n, &options, limit), + DataType::Float64 => sort_list::(values, v, n, &options, limit), t => { return Err(ArrowError::ComputeError(format!( "Sort not supported for list type {t:?}" @@ -901,7 +877,7 @@ where } } -fn sort_list( +fn sort_list( values: &dyn Array, value_indices: Vec, null_indices: Vec, @@ -910,8 +886,6 @@ fn sort_list( ) -> UInt32Array where S: OffsetSizeTrait, - T: ArrowPrimitiveType, - T::Native: PartialOrd, { sort_list_inner::(values, value_indices, null_indices, options, limit) } From a9805063785ab94a13ea45cb8d5b4ce544fd0434 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 31 Mar 2023 12:02:06 +0100 Subject: [PATCH 0768/1411] Prepare object_store 0.5.6 (#3984) --- object_store/CHANGELOG-old.md | 24 +++++++++++ object_store/CHANGELOG.md | 40 +++++++++++++------ object_store/Cargo.toml | 2 +- object_store/dev/release/update_change_log.sh | 4 +- 4 files changed, 54 insertions(+), 16 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index 19a2766d2005..cc9453b321bc 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -19,6 +19,30 @@ # Historical Changelog +## [object_store_0.5.5](https://github.com/apache/arrow-rs/tree/object_store_0.5.5) (2023-02-27) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.4...object_store_0.5.5) + +**Implemented enhancements:** + +- object\_store: support azure cli credential [\#3697](https://github.com/apache/arrow-rs/issues/3697) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: support encoded path as input [\#3651](https://github.com/apache/arrow-rs/issues/3651) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- object-store: aws\_profile fails to load static credentials [\#3765](https://github.com/apache/arrow-rs/issues/3765) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Inconsistent Behaviour Listing File [\#3712](https://github.com/apache/arrow-rs/issues/3712) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: bearer token is azure is used like access key [\#3696](https://github.com/apache/arrow-rs/issues/3696) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- object-store: fix handling of AWS profile credentials without expiry [\#3766](https://github.com/apache/arrow-rs/pull/3766) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([helmus](https://github.com/helmus)) +- update object\_store deps to patch potential security vulnerabilities [\#3761](https://github.com/apache/arrow-rs/pull/3761) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([spencerbart](https://github.com/spencerbart)) +- Filter exact list prefix matches for azure gen2 accounts [\#3714](https://github.com/apache/arrow-rs/pull/3714) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Filter exact list prefix matches for MemoryStore and HttpStore \(\#3712\) [\#3713](https://github.com/apache/arrow-rs/pull/3713) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: azure cli authorization [\#3698](https://github.com/apache/arrow-rs/pull/3698) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- object\_store: add Path::from\_url\_path [\#3663](https://github.com/apache/arrow-rs/pull/3663) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jychen7](https://github.com/jychen7)) + ## [object_store_0.5.4](https://github.com/apache/arrow-rs/tree/object_store_0.5.4) (2023-01-30) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.3...object_store_0.5.4) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index b8f2fe8fc3f4..b26ae7180004 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,29 +19,43 @@ # Changelog -## [object_store_0.5.5](https://github.com/apache/arrow-rs/tree/object_store_0.5.5) (2023-02-27) +## [object_store_0.5.6](https://github.com/apache/arrow-rs/tree/object_store_0.5.6) (2023-03-30) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.4...object_store_0.5.5) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.5...object_store_0.5.6) **Implemented enhancements:** -- object\_store: support azure cli credential [\#3697](https://github.com/apache/arrow-rs/issues/3697) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: support encoded path as input [\#3651](https://github.com/apache/arrow-rs/issues/3651) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Document ObjectStore::list Ordering [\#3975](https://github.com/apache/arrow-rs/issues/3975) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add option to start listing at a particular key [\#3970](https://github.com/apache/arrow-rs/issues/3970) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Implement `ObjectStore` for trait objects [\#3865](https://github.com/apache/arrow-rs/issues/3865) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add ObjectStore::append [\#3790](https://github.com/apache/arrow-rs/issues/3790) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Make `InMemory` object store track last modified time for each entry [\#3782](https://github.com/apache/arrow-rs/issues/3782) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support Unsigned S3 Payloads [\#3737](https://github.com/apache/arrow-rs/issues/3737) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add Content-MD5 or checksum header for using an Object Locked S3 [\#3725](https://github.com/apache/arrow-rs/issues/3725) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- object-store: aws\_profile fails to load static credentials [\#3765](https://github.com/apache/arrow-rs/issues/3765) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Inconsistent Behaviour Listing File [\#3712](https://github.com/apache/arrow-rs/issues/3712) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: bearer token is azure is used like access key [\#3696](https://github.com/apache/arrow-rs/issues/3696) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- LocalFileSystem::put is not Atomic [\#3780](https://github.com/apache/arrow-rs/issues/3780) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- object-store: fix handling of AWS profile credentials without expiry [\#3766](https://github.com/apache/arrow-rs/pull/3766) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([helmus](https://github.com/helmus)) -- update object\_store deps to patch potential security vulnerabilities [\#3761](https://github.com/apache/arrow-rs/pull/3761) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([spencerbart](https://github.com/spencerbart)) -- Filter exact list prefix matches for azure gen2 accounts [\#3714](https://github.com/apache/arrow-rs/pull/3714) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- Filter exact list prefix matches for MemoryStore and HttpStore \(\#3712\) [\#3713](https://github.com/apache/arrow-rs/pull/3713) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- object\_store: azure cli authorization [\#3698](https://github.com/apache/arrow-rs/pull/3698) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- object\_store: add Path::from\_url\_path [\#3663](https://github.com/apache/arrow-rs/pull/3663) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jychen7](https://github.com/jychen7)) +- Add ObjectStore::list\_with\_offset \(\#3970\) [\#3973](https://github.com/apache/arrow-rs/pull/3973) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Remove incorrect validation logic on S3 
bucket names [\#3947](https://github.com/apache/arrow-rs/pull/3947) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([rtyler](https://github.com/rtyler)) +- Prepare arrow 36 [\#3935](https://github.com/apache/arrow-rs/pull/3935) ([tustvold](https://github.com/tustvold)) +- fix: Specify content length for gcp copy request [\#3921](https://github.com/apache/arrow-rs/pull/3921) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([scsmithr](https://github.com/scsmithr)) +- Revert structured ArrayData \(\#3877\) [\#3894](https://github.com/apache/arrow-rs/pull/3894) ([tustvold](https://github.com/tustvold)) +- Add support for checksum algorithms in AWS [\#3873](https://github.com/apache/arrow-rs/pull/3873) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trueleo](https://github.com/trueleo)) +- Rename PrefixObjectStore to PrefixStore [\#3870](https://github.com/apache/arrow-rs/pull/3870) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Implement append for LimitStore, PrefixObjectStore, ThrottledStore [\#3869](https://github.com/apache/arrow-rs/pull/3869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Supporting metadata fetch without open file read mode [\#3868](https://github.com/apache/arrow-rs/pull/3868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([metesynnada](https://github.com/metesynnada)) +- Impl ObjectStore for trait object [\#3866](https://github.com/apache/arrow-rs/pull/3866) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Kinrany](https://github.com/Kinrany)) +- Update quick-xml requirement from 0.27.0 to 0.28.0 [\#3857](https://github.com/apache/arrow-rs/pull/3857) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update changelog for 35.0.0 [\#3843](https://github.com/apache/arrow-rs/pull/3843) ([tustvold](https://github.com/tustvold)) +- Cleanup ApplicationDefaultCredentials [\#3799](https://github.com/apache/arrow-rs/pull/3799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make InMemory object store track last modified time for each entry [\#3796](https://github.com/apache/arrow-rs/pull/3796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Weijun-H](https://github.com/Weijun-H)) +- Add ObjectStore::append [\#3791](https://github.com/apache/arrow-rs/pull/3791) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make LocalFileSystem::put atomic \(\#3780\) [\#3781](https://github.com/apache/arrow-rs/pull/3781) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add support for unsigned payloads in aws [\#3741](https://github.com/apache/arrow-rs/pull/3741) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trueleo](https://github.com/trueleo)) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index d9b075fcc5cd..bd0bbb760f47 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.5" +version = "0.5.6" edition = { workspace = true } license = "MIT/Apache-2.0" readme = "README.md" diff --git 
a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index de80d0f3eaf3..b69d36f8456c 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.4" -FUTURE_RELEASE="object_store_0.5.5" +SINCE_TAG="object_store_0.5.5" +FUTURE_RELEASE="object_store_0.5.6" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 234b7847ecb737e96df3f4623df7b330b34b3d1b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 31 Mar 2023 12:17:53 +0100 Subject: [PATCH 0769/1411] Revert workspace links for object_store (#3987) --- object_store/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index bd0bbb760f47..9bf104334ed9 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -18,12 +18,12 @@ [package] name = "object_store" version = "0.5.6" -edition = { workspace = true } +edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." keywords = ["object", "storage", "cloud"] -repository = { workspace = true } +repository = "https://github.com/apache/arrow-rs/tree/master/object_store" [package.metadata.docs.rs] all-features = true From 605a7842e87abbbdc26c310a82abb4398000a43d Mon Sep 17 00:00:00 2001 From: bold Date: Fri, 31 Mar 2023 15:50:50 +0200 Subject: [PATCH 0770/1411] Faster decimal parsing (30-60%) (#3939) * Improve decimal parsing * Add edge tests for decimal parsing * Add more decimal parsing tests Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Fix test and improve performance further * Move overflow check out of the loop * Fix "0" parsing * Add failing decimal parsing tests * Fix parse decimal tests --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-cast/src/parse.rs | 159 +++++++++++++++++++++++++++------------- 1 file changed, 107 insertions(+), 52 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index d7e5529bcd42..cc8254916854 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -623,64 +623,64 @@ pub fn parse_decimal( precision: u8, scale: i8, ) -> Result { - let mut seen_dot = false; - let mut seen_sign = false; - let mut negative = false; - let mut result = T::Native::usize_as(0); let mut fractionals = 0; let mut digits = 0; let base = T::Native::usize_as(10); - let mut bs = s.as_bytes().iter(); + + let bs = s.as_bytes(); + let (bs, negative) = match bs.first() { + Some(b'-') => (&bs[1..], true), + Some(b'+') => (&bs[1..], false), + _ => (bs, false), + }; + + if bs.is_empty() { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + + let mut bs = bs.iter(); + // Overflow checks are not required if 10^(precision - 1) <= T::MAX holds. + // Thus, if we validate the precision correctly, we can skip overflow checks. while let Some(b) = bs.next() { match b { b'0'..=b'9' => { - if seen_dot { - if fractionals == scale { - // We have processed and validated the whole part of our decimal (including sign and dot). - // All that is left is to validate the fractional part. 
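// [Editor's illustrative aside, not part of this patch] A minimal sketch of the
// idea behind this patch's change to parse_decimal, using plain i128 rather
// than the arrow native types: once the number of significant digits is capped
// at the declared precision (and 10^precision - 1 fits in the accumulator
// type, which holds for i128 up to 38 digits), per-digit overflow checks are
// redundant and wrapping arithmetic is safe. Names and error handling here are
// hypothetical simplifications, not the library's API.
fn parse_digits_i128(digits: &[u8], precision: u8) -> Option<i128> {
    if precision > 38 || digits.len() > precision as usize {
        return None; // reject up front instead of checking overflow per digit
    }
    let mut acc: i128 = 0;
    for &b in digits {
        if !b.is_ascii_digit() {
            return None;
        }
        // Safe to wrap: at most `precision` <= 38 digits fit in an i128.
        acc = acc.wrapping_mul(10).wrapping_add((b - b'0') as i128);
    }
    Some(acc)
}
// e.g. parse_digits_i128(b"12345", 10) == Some(12345)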
- if bs.any(|b| !b.is_ascii_digit()) { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - break; - } - fractionals += 1; + if digits == 0 && *b == b'0' { + // Ignore leading zeros. + continue; } digits += 1; - if digits > precision { - return Err(ArrowError::ParseError( - "parse decimal overflow".to_string(), - )); - } - result = result.mul_checked(base)?; - result = result.add_checked(T::Native::usize_as((b - b'0') as usize))?; + result = result.mul_wrapping(base); + result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); } b'.' => { - if seen_dot { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - seen_dot = true; - } - b'-' => { - if seen_sign || digits > 0 || seen_dot { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); + for b in bs.by_ref() { + if !b.is_ascii_digit() { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + if fractionals == scale { + // We have processed all the digits that we need. All that + // is left is to validate that the rest of the string contains + // valid digits. + continue; + } + fractionals += 1; + digits += 1; + result = result.mul_wrapping(base); + result = + result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); } - seen_sign = true; - negative = true; - } - b'+' => { - if seen_sign || digits > 0 || seen_dot { + + // Fail on "." + if digits == 0 { return Err(ArrowError::ParseError(format!( "can't parse the string value {s} to decimal" ))); } - seen_sign = true; } _ => { return Err(ArrowError::ParseError(format!( @@ -689,24 +689,20 @@ pub fn parse_decimal( } } } - // Fail on "." - if digits == 0 { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } if fractionals < scale { let exp = scale - fractionals; if exp as u8 + digits > precision { return Err(ArrowError::ParseError("parse decimal overflow".to_string())); } - let mul = base.pow_checked(exp as _)?; - result = result.mul_checked(mul)?; + let mul = base.pow_wrapping(exp as _); + result = result.mul_wrapping(mul); + } else if digits > precision { + return Err(ArrowError::ParseError("parse decimal overflow".to_string())); } Ok(if negative { - result.neg_checked()? 
+ result.neg_wrapping() } else { result }) @@ -1689,6 +1685,7 @@ mod tests { #[test] fn test_parse_decimal_with_parameter() { let tests = [ + ("0", 0i128), ("123.123", 123123i128), ("123.1234", 123123i128), ("123.1", 123100i128), @@ -1717,7 +1714,7 @@ mod tests { let result_256 = parse_decimal::(s, 20, 3); assert_eq!(i256::from_i128(i), result_256.unwrap()); } - let can_not_parse_tests = ["123,123", ".", "123.123.123"]; + let can_not_parse_tests = ["123,123", ".", "123.123.123", "", "+", "-"]; for s in can_not_parse_tests { let result_128 = parse_decimal::(s, 20, 3); assert_eq!( @@ -1750,5 +1747,63 @@ mod tests { "actual: '{actual_256}', expected: '{expected_256}'" ); } + + let edge_tests_128 = [ + ( + "99999999999999999999999999999999999999", + 99999999999999999999999999999999999999i128, + 0, + ), + ( + "999999999999999999999999999999999999.99", + 99999999999999999999999999999999999999i128, + 2, + ), + ( + "9999999999999999999999999.9999999999999", + 99999999999999999999999999999999999999i128, + 13, + ), + ( + "9999999999999999999999999", + 99999999999999999999999990000000000000i128, + 13, + ), + ( + "0.99999999999999999999999999999999999999", + 99999999999999999999999999999999999999i128, + 38, + ), + ]; + for (s, i, scale) in edge_tests_128 { + let result_128 = parse_decimal::(s, 38, scale); + assert_eq!(i, result_128.unwrap()); + } + let edge_tests_256 = [ + ( + "9999999999999999999999999999999999999999999999999999999999999999999999999999", +i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + 0, + ), + ( + "999999999999999999999999999999999999999999999999999999999999999999999999.9999", + i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + 4, + ), + ( + "99999999999999999999999999999999999999999999999999.99999999999999999999999999", + i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + 26, + ), + ( + "99999999999999999999999999999999999999999999999999", + i256::from_string("9999999999999999999999999999999999999999999999999900000000000000000000000000").unwrap(), + 26, + ), + ]; + for (s, i, scale) in edge_tests_256 { + let result = parse_decimal::(s, 76, scale); + assert_eq!(i, result.unwrap()); + } } } From ef5c58ce5d7e8a13cbd8e1fa4d6a5310951a6de0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 31 Mar 2023 19:06:01 +0100 Subject: [PATCH 0771/1411] Add Field Constructors for Complex Fields (#3992) * Improve ergonomics of declaring complex fields * Apply suggestions from code review Co-authored-by: Andrew Lamb * Review feedback --------- Co-authored-by: Andrew Lamb --- arrow-array/src/record_batch.rs | 11 +- arrow-cast/src/cast.rs | 6 +- arrow-cast/src/pretty.rs | 93 +++----- arrow-csv/src/reader/mod.rs | 12 +- arrow-csv/src/writer.rs | 6 +- arrow-flight/tests/encode_decode.rs | 6 +- arrow-integration-test/src/field.rs | 67 +++--- arrow-ipc/src/convert.rs | 116 +++++----- arrow-ipc/src/reader.rs | 13 +- arrow-ipc/src/writer.rs | 19 +- arrow-json/src/raw/mod.rs | 91 ++++---- arrow-json/src/writer.rs | 12 +- arrow-schema/src/field.rs | 113 +++++++++- arrow-schema/src/schema.rs | 65 ++---- parquet/src/arrow/arrow_writer/mod.rs | 30 +-- parquet/src/arrow/schema/complex.rs | 6 +- parquet/src/arrow/schema/mod.rs | 295 ++++++++++---------------- 17 files changed, 424 insertions(+), 537 deletions(-) diff --git a/arrow-array/src/record_batch.rs 
b/arrow-array/src/record_batch.rs index 8d4d04f0f525..db4bb1230ca7 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -708,16 +708,11 @@ mod tests { #[test] fn create_record_batch_field_name_mismatch() { - let struct_fields = vec![ + let fields = vec![ Field::new("a1", DataType::Int32, false), - Field::new( - "a2", - DataType::List(Arc::new(Field::new("item", DataType::Int8, false))), - false, - ), + Field::new_list("a2", Field::new("item", DataType::Int8, false), false), ]; - let struct_type = DataType::Struct(struct_fields.into()); - let schema = Arc::new(Schema::new(vec![Field::new("a", struct_type, true)])); + let schema = Arc::new(Schema::new(vec![Field::new_struct("a", fields, true)])); let a1: ArrayRef = Arc::new(Int32Array::from(vec![1, 2])); let a2_child = Int8Array::from(vec![1, 2, 3, 4]); diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 02b87e73114c..9886decd9ddc 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -7104,12 +7104,12 @@ mod tests { fn test_cast_null_from_and_to_nested_type() { // Cast null from and to map let data_type = DataType::Map( - Arc::new(Field::new( + Arc::new(Field::new_struct( "entry", - DataType::Struct(Fields::from(vec![ + vec![ Field::new("key", DataType::Utf8, false), Field::new("value", DataType::Int32, true), - ])), + ], false, )), false, diff --git a/arrow-cast/src/pretty.rs b/arrow-cast/src/pretty.rs index 7aa04a2dbcb3..c75721ab8517 100644 --- a/arrow-cast/src/pretty.rs +++ b/arrow-cast/src/pretty.rs @@ -257,9 +257,8 @@ mod tests { #[test] fn test_pretty_format_dictionary() { // define a schema. - let field_type = - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); - let schema = Arc::new(Schema::new(vec![Field::new("d1", field_type, true)])); + let field = Field::new_dictionary("d1", DataType::Int32, DataType::Utf8, true); + let schema = Arc::new(Schema::new(vec![field])); let mut builder = StringDictionaryBuilder::::new(); @@ -633,18 +632,16 @@ mod tests { #[test] fn test_pretty_format_struct() { let schema = Schema::new(vec![ - Field::new( + Field::new_struct( "c1", - DataType::Struct(Fields::from(vec![ + vec![ Field::new("c11", DataType::Int32, true), - Field::new( + Field::new_struct( "c12", - DataType::Struct( - vec![Field::new("c121", DataType::Utf8, false)].into(), - ), + vec![Field::new("c121", DataType::Utf8, false)], false, ), - ])), + ], false, ), Field::new("c2", DataType::Utf8, false), @@ -656,11 +653,9 @@ mod tests { Arc::new(Int32Array::from(vec![Some(1), None, Some(5)])) as ArrayRef, ), ( - Field::new( + Field::new_struct( "c12", - DataType::Struct( - vec![Field::new("c121", DataType::Utf8, false)].into(), - ), + vec![Field::new("c121", DataType::Utf8, false)], false, ), Arc::new(StructArray::from(vec![( @@ -700,19 +695,14 @@ mod tests { builder.append_null::("a").unwrap(); let union = builder.build().unwrap(); - let schema = Schema::new(vec![Field::new( + let schema = Schema::new(vec![Field::new_union( "Teamsters", - DataType::Union( - UnionFields::new( - vec![0, 1], - vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Float64, false), - ], - ), - UnionMode::Dense, - ), - false, + vec![0, 1], + vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Float64, false), + ], + UnionMode::Dense, )]); let batch = @@ -742,19 +732,14 @@ mod tests { builder.append_null::("a").unwrap(); let union = builder.build().unwrap(); - let schema = Schema::new(vec![Field::new( + let schema = 
Schema::new(vec![Field::new_union( "Teamsters", - DataType::Union( - UnionFields::new( - vec![0, 1], - vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Float64, false), - ], - ), - UnionMode::Sparse, - ), - false, + vec![0, 1], + vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Float64, false), + ], + UnionMode::Sparse, )]); let batch = @@ -786,19 +771,14 @@ mod tests { builder.append_null::("c").unwrap(); let inner = builder.build().unwrap(); - let inner_field = Field::new( + let inner_field = Field::new_union( "European Union", - DataType::Union( - UnionFields::new( - vec![0, 1], - vec![ - Field::new("b", DataType::Int32, false), - Field::new("c", DataType::Float64, false), - ], - ), - UnionMode::Dense, - ), - false, + vec![0, 1], + vec![ + Field::new("b", DataType::Int32, false), + Field::new("c", DataType::Float64, false), + ], + UnionMode::Dense, ); // Can't use UnionBuilder with non-primitive types, so manually build outer UnionArray @@ -812,16 +792,11 @@ mod tests { let outer = UnionArray::try_new(&[0, 1], type_ids, None, children).unwrap(); - let schema = Schema::new(vec![Field::new( + let schema = Schema::new(vec![Field::new_union( "Teamsters", - DataType::Union( - UnionFields::new( - vec![0, 1], - vec![Field::new("a", DataType::Int32, true), inner_field], - ), - UnionMode::Sparse, - ), - false, + vec![0, 1], + vec![Field::new("a", DataType::Int32, true), inner_field], + UnionMode::Sparse, )]); let batch = diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 7fecc1ad92b4..3fa712819a92 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -1485,11 +1485,7 @@ mod tests { #[test] fn test_csv_with_dictionary() { let schema = Schema::new(vec![ - Field::new( - "city", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, - ), + Field::new_dictionary("city", DataType::Int32, DataType::Utf8, false), Field::new("lat", DataType::Float64, false), Field::new("lng", DataType::Float64, false), ]); @@ -1507,11 +1503,7 @@ mod tests { None, ); let projected_schema = Arc::new(Schema::new(vec![ - Field::new( - "city", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, - ), + Field::new_dictionary("city", DataType::Int32, DataType::Utf8, false), Field::new("lat", DataType::Float64, false), ])); assert_eq!(projected_schema, csv.schema()); diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 946803decf90..90c32832a8f4 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -347,11 +347,7 @@ mod tests { Field::new("c4", DataType::Boolean, true), Field::new("c5", DataType::Timestamp(TimeUnit::Millisecond, None), true), Field::new("c6", DataType::Time32(TimeUnit::Second), false), - Field::new( - "c7", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, - ), + Field::new_dictionary("c7", DataType::Int32, DataType::Utf8, false), ]); let c1 = StringArray::from(vec![ diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index ec86fbcc0bdf..90fa2b7a6832 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -140,11 +140,7 @@ async fn test_zero_batches_schema_specified() { async fn test_zero_batches_dictionary_schema_specified() { let schema = Arc::new(Schema::new(vec![ Field::new("a", DataType::Int64, false), - Field::new( - "b", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - 
false, - ), + Field::new_dictionary("b", DataType::Int32, DataType::Utf8, false), ])); // Expect dictionary to be hydrated in output (#3389) diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index a0cd4adc83f0..f59314ca02db 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -303,17 +303,17 @@ pub fn field_to_json(field: &Field) -> serde_json::Value { #[cfg(test)] mod tests { use super::*; - use arrow::datatypes::{Fields, UnionFields, UnionMode}; + use arrow::datatypes::UnionMode; use serde_json::Value; #[test] fn struct_field_to_json() { - let f = Field::new( + let f = Field::new_struct( "address", - DataType::Struct(Fields::from(vec![ + vec![ Field::new("street", DataType::Utf8, false), Field::new("zip", DataType::UInt16, false), - ])), + ], false, ); let value: Value = serde_json::from_str( @@ -351,19 +351,12 @@ mod tests { #[test] fn map_field_to_json() { - let f = Field::new( + let f = Field::new_map( "my_map", - DataType::Map( - Arc::new(Field::new( - "my_entries", - DataType::Struct(Fields::from(vec![ - Field::new("my_keys", DataType::Utf8, false), - Field::new("my_values", DataType::UInt16, true), - ])), - false, - )), - true, - ), + "my_entries", + Field::new("my_keys", DataType::Utf8, false), + Field::new("my_values", DataType::UInt16, true), + true, false, ); let value: Value = serde_json::from_str( @@ -459,12 +452,12 @@ mod tests { let value: Value = serde_json::from_str(json).unwrap(); let dt = field_from_json(&value).unwrap(); - let expected = Field::new( + let expected = Field::new_struct( "address", - DataType::Struct(Fields::from(vec![ + vec![ Field::new("street", DataType::Utf8, false), Field::new("zip", DataType::UInt16, false), - ])), + ], false, ); @@ -515,19 +508,12 @@ mod tests { let value: Value = serde_json::from_str(json).unwrap(); let dt = field_from_json(&value).unwrap(); - let expected = Field::new( + let expected = Field::new_map( "my_map", - DataType::Map( - Arc::new(Field::new( - "my_entries", - DataType::Struct(Fields::from(vec![ - Field::new("my_keys", DataType::Utf8, false), - Field::new("my_values", DataType::UInt16, true), - ])), - false, - )), - true, - ), + "my_entries", + Field::new("my_keys", DataType::Utf8, false), + Field::new("my_values", DataType::UInt16, true), + true, false, ); @@ -573,19 +559,14 @@ mod tests { let value: Value = serde_json::from_str(json).unwrap(); let dt = field_from_json(&value).unwrap(); - let expected = Field::new( + let expected = Field::new_union( "my_union", - DataType::Union( - UnionFields::new( - vec![5, 7], - vec![ - Field::new("f1", DataType::Int32, true), - Field::new("f2", DataType::Utf8, true), - ], - ), - UnionMode::Sparse, - ), - false, + vec![5, 7], + vec![ + Field::new("f1", DataType::Int32, true), + Field::new("f2", DataType::Utf8, true), + ], + UnionMode::Sparse, ); assert_eq!(expected, dt); diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index cc2a7786c3ff..c7b5559fa49a 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -906,107 +906,91 @@ mod tests { ), Field::new("utf8", DataType::Utf8, false), Field::new("binary", DataType::Binary, false), - Field::new( + Field::new_list( "list[u8]", - DataType::List(Arc::new(Field::new("item", DataType::UInt8, false))), + Field::new("item", DataType::UInt8, false), true, ), - Field::new( + Field::new_list( "list[struct]", - List(Arc::new(Field::new( + Field::new_struct( "struct", - Struct(Fields::from(vec![ - Field::new("float32", DataType::UInt8, 
false), - Field::new("int32", DataType::Int32, true), - Field::new("bool", DataType::Boolean, true), - ])), + vec![ + Field::new("float32", UInt8, false), + Field::new("int32", Int32, true), + Field::new("bool", Boolean, true), + ], true, - ))), + ), false, ), - Field::new( + Field::new_struct( "struct>", - Struct(Fields::from(vec![Field::new( + vec![Field::new( "dictionary", Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), false, - )])), + )], false, ), - Field::new( + Field::new_struct( "struct]>]>", - Struct(Fields::from(vec![ + vec![ Field::new("int64", DataType::Int64, true), - Field::new( + Field::new_list( "list[struct]>]", - DataType::List(Arc::new(Field::new( + Field::new_struct( "struct", - DataType::Struct(Fields::from(vec![ + vec![ Field::new("date32", DataType::Date32, true), - Field::new( + Field::new_list( "list[struct<>]", - DataType::List(Arc::new(Field::new( + Field::new( "struct", DataType::Struct(Fields::empty()), false, - ))), + ), false, ), - ])), + ], false, - ))), + ), false, ), - ])), + ], false, ), - Field::new( + Field::new_union( "union]>]>", - DataType::Union( - UnionFields::new( - vec![0, 1], - vec![ - Field::new("int64", DataType::Int64, true), - Field::new( - "list[union]>]", - DataType::List(Arc::new(Field::new( - "union]>", - DataType::Union( - UnionFields::new( - vec![0, 1], - vec![ - Field::new( - "date32", - DataType::Date32, - true, - ), - Field::new( - "list[union<>]", - DataType::List(Arc::new( - Field::new( - "union", - DataType::Union( - UnionFields::empty(), - UnionMode::Sparse, - ), - false, - ), - )), - false, - ), - ], + vec![0, 1], + vec![ + Field::new("int64", DataType::Int64, true), + Field::new_list( + "list[union]>]", + Field::new_union( + "union]>", + vec![0, 1], + vec![ + Field::new("date32", DataType::Date32, true), + Field::new_list( + "list[union<>]", + Field::new( + "union", + DataType::Union( + UnionFields::empty(), + UnionMode::Sparse, ), - UnionMode::Dense, + false, ), false, - ))), - false, - ), - ], + ), + ], + UnionMode::Dense, + ), + false, ), - UnionMode::Sparse, - ), - false, + ], + UnionMode::Sparse, ), Field::new("struct<>", DataType::Struct(Fields::empty()), true), Field::new( diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index c20f7bd012fb..75d078456f0a 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -1261,11 +1261,6 @@ mod tests { 3, ); - let key_type = DataType::Int8; - let value_type = DataType::Utf8; - let dict_data_type = - DataType::Dictionary(Box::new(key_type), Box::new(value_type)); - let union_fields = UnionFields::new( vec![0, 1], vec![ @@ -1278,11 +1273,7 @@ mod tests { let struct_fields = Fields::from(vec![ Field::new("id", DataType::Int32, false), - Field::new( - "list", - DataType::List(Arc::new(Field::new("item", DataType::Int8, true))), - false, - ), + Field::new_list("list", Field::new("item", DataType::Int8, true), false), ]); let struct_data_type = DataType::Struct(struct_fields); @@ -1305,7 +1296,7 @@ mod tests { Field::new("f9", struct_data_type, false), Field::new("f10", run_encoded_data_type, false), Field::new("f11", DataType::Boolean, false), - Field::new("f12", dict_data_type, false), + Field::new_dictionary("f12", DataType::Int8, DataType::Utf8, false), Field::new("f13", DataType::Utf8, false), ]) } diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 0e999dc72756..12c173f64c0f 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1778,19 +1778,14 @@ mod tests { } fn write_union_file(options: 
IpcWriteOptions) { - let schema = Schema::new(vec![Field::new( + let schema = Schema::new(vec![Field::new_union( "union", - DataType::Union( - UnionFields::new( - vec![0, 1], - vec![ - Field::new("a", DataType::Int32, false), - Field::new("c", DataType::Float64, false), - ], - ), - UnionMode::Sparse, - ), - true, + vec![0, 1], + vec![ + Field::new("a", DataType::Int32, false), + Field::new("c", DataType::Float64, false), + ], + UnionMode::Sparse, )]); let mut builder = UnionBuilder::with_capacity_sparse(5); builder.append::("a", 1).unwrap(); diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index c784bd347b4b..1bae8ac529e7 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -362,7 +362,7 @@ mod tests { use arrow_array::{Array, StructArray}; use arrow_buffer::ArrowNativeType; use arrow_cast::display::{ArrayFormatter, FormatOptions}; - use arrow_schema::{DataType, Field, Fields, Schema}; + use arrow_schema::{DataType, Field, Schema}; use std::fs::File; use std::io::{BufReader, Cursor, Seek}; use std::sync::Arc; @@ -503,32 +503,26 @@ mod tests { "#; let schema = Arc::new(Schema::new(vec![ - Field::new( - "list", - DataType::List(Arc::new(Field::new("element", DataType::Int32, false))), - true, - ), - Field::new( + Field::new_list("list", Field::new("element", DataType::Int32, false), true), + Field::new_struct( "nested", - DataType::Struct(Fields::from(vec![ + vec![ Field::new("a", DataType::Int32, true), Field::new("b", DataType::Int32, true), - ])), + ], true, ), - Field::new( + Field::new_struct( "nested_list", - DataType::Struct(Fields::from(vec![Field::new( + vec![Field::new_list( "list2", - DataType::List(Arc::new(Field::new( + Field::new_struct( "element", - DataType::Struct( - vec![Field::new("c", DataType::Int32, false)].into(), - ), + vec![Field::new("c", DataType::Int32, false)], false, - ))), + ), true, - )])), + )], true, ), ])); @@ -582,24 +576,22 @@ mod tests { "#; let schema = Arc::new(Schema::new(vec![ - Field::new( + Field::new_struct( "nested", - DataType::Struct(vec![Field::new("a", DataType::Int32, false)].into()), + vec![Field::new("a", DataType::Int32, false)], true, ), - Field::new( + Field::new_struct( "nested_list", - DataType::Struct(Fields::from(vec![Field::new( + vec![Field::new_list( "list2", - DataType::List(Arc::new(Field::new( + Field::new_struct( "element", - DataType::Struct( - vec![Field::new("d", DataType::Int32, true)].into(), - ), + vec![Field::new("d", DataType::Int32, true)], false, - ))), + ), true, - )])), + )], true, ), ])); @@ -639,14 +631,16 @@ mod tests { {"map": {"a": [null], "b": []}} {"map": {"c": null, "a": ["baz"]}} "#; - let list = DataType::List(Arc::new(Field::new("element", DataType::Utf8, true))); - let entries = DataType::Struct(Fields::from(vec![ + let map = Field::new_map( + "map", + "entries", Field::new("key", DataType::Utf8, false), - Field::new("value", list, true), - ])); + Field::new_list("value", Field::new("element", DataType::Utf8, true), true), + false, + true, + ); - let map = DataType::Map(Arc::new(Field::new("entries", entries, true)), false); - let schema = Arc::new(Schema::new(vec![Field::new("map", map, true)])); + let schema = Arc::new(Schema::new(vec![map])); let batches = do_read(buf, 1024, false, schema); assert_eq!(batches.len(), 1); @@ -1010,31 +1004,24 @@ mod tests { fn test_delta_checkpoint() { let json = "{\"protocol\":{\"minReaderVersion\":1,\"minWriterVersion\":2}}"; let schema = Arc::new(Schema::new(vec![ - Field::new( + Field::new_struct( "protocol", - 
DataType::Struct(Fields::from(vec![ + vec![ Field::new("minReaderVersion", DataType::Int32, true), Field::new("minWriterVersion", DataType::Int32, true), - ])), + ], true, ), - Field::new( + Field::new_struct( "add", - DataType::Struct(Fields::from(vec![Field::new( + vec![Field::new_map( "partitionValues", - DataType::Map( - Arc::new(Field::new( - "key_value", - DataType::Struct(Fields::from(vec![ - Field::new("key", DataType::Utf8, false), - Field::new("value", DataType::Utf8, true), - ])), - false, - )), - false, - ), + "key_value", + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Utf8, true), + false, false, - )])), + )], true, ), ])); @@ -1056,9 +1043,9 @@ mod tests { let do_test = |child: DataType| { // Test correctly enforced nullability let non_null = r#"{"foo": {}}"#; - let schema = Arc::new(Schema::new(vec![Field::new( + let schema = Arc::new(Schema::new(vec![Field::new_struct( "foo", - DataType::Struct(vec![Field::new("bar", child, false)].into()), + vec![Field::new("bar", child, false)], true, )])); let mut reader = RawReaderBuilder::new(schema.clone()) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 1b950f794275..d66d32017c26 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -680,16 +680,8 @@ mod tests { #[test] fn write_dictionary() { let schema = Schema::new(vec![ - Field::new( - "c1", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - ), - Field::new( - "c2", - DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), - true, - ), + Field::new_dictionary("c1", DataType::Int32, DataType::Utf8, true), + Field::new_dictionary("c2", DataType::Int8, DataType::Utf8, true), ]); let a: DictionaryArray = vec![ diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index ac02eadd6640..1af157e4d212 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -19,12 +19,14 @@ use crate::error::ArrowError; use std::cmp::Ordering; use std::collections::HashMap; use std::hash::{Hash, Hasher}; +use std::sync::Arc; use crate::datatype::DataType; use crate::schema::SchemaBuilder; +use crate::{Fields, UnionFields, UnionMode}; /// A reference counted [`Field`] -pub type FieldRef = std::sync::Arc; +pub type FieldRef = Arc; /// Describes a single column in a [`Schema`](super::Schema). 
/// @@ -145,6 +147,115 @@ impl Field { } } + /// Create a new [`Field`] with [`DataType::Dictionary`] + /// + /// Use [`Self::new_dict`] for more advanced dictionary options + /// + /// # Panics + /// + /// Panics if [`!key.is_dictionary_key_type`][DataType::is_dictionary_key_type] + pub fn new_dictionary( + name: impl Into, + key: DataType, + value: DataType, + nullable: bool, + ) -> Self { + assert!( + key.is_dictionary_key_type(), + "{key} is not a valid dictionary key" + ); + let data_type = DataType::Dictionary(Box::new(key), Box::new(value)); + Self::new(name, data_type, nullable) + } + + /// Create a new [`Field`] with [`DataType::Struct`] + /// + /// - `name`: the name of the [`DataType::List`] field + /// - `fields`: the description of each struct element + /// - `nullable`: if the [`DataType::Struct`] array is nullable + pub fn new_struct( + name: impl Into, + fields: impl Into, + nullable: bool, + ) -> Self { + Self::new(name, DataType::Struct(fields.into()), nullable) + } + + /// Create a new [`Field`] with [`DataType::List`] + /// + /// - `name`: the name of the [`DataType::List`] field + /// - `value`: the description of each list element + /// - `nullable`: if the [`DataType::List`] array is nullable + /// + /// Uses "item" as the name of the child field, this can be overridden with [`Self::new`] + pub fn new_list( + name: impl Into, + value: impl Into, + nullable: bool, + ) -> Self { + Self::new(name, DataType::List(value.into()), nullable) + } + + /// Create a new [`Field`] with [`DataType::LargeList`] + /// + /// - `name`: the name of the [`DataType::LargeList`] field + /// - `value`: the description of each list element + /// - `nullable`: if the [`DataType::LargeList`] array is nullable + pub fn new_large_list( + name: impl Into, + value: impl Into, + nullable: bool, + ) -> Self { + Self::new(name, DataType::LargeList(value.into()), nullable) + } + + /// Create a new [`Field`] with [`DataType::Map`] + /// + /// - `name`: the name of the [`DataType::Map`] field + /// - `entries`: the name of the inner [`DataType::Struct`] field + /// - `keys`: the map keys + /// - `values`: the map values + /// - `sorted`: if the [`DataType::Map`] array is sorted + /// - `nullable`: if the [`DataType::Map`] array is nullable + pub fn new_map( + name: impl Into, + entries: impl Into, + keys: impl Into, + values: impl Into, + sorted: bool, + nullable: bool, + ) -> Self { + let data_type = DataType::Map( + Arc::new(Field::new( + entries.into(), + DataType::Struct(Fields::from([keys.into(), values.into()])), + false, // The inner map field is always non-nullable (#1697), + )), + sorted, + ); + Self::new(name, data_type, nullable) + } + + /// Create a new [`Field`] with [`DataType::Union`] + /// + /// - `name`: the name of the [`DataType::Union`] field + /// - `type_ids`: the union type ids + /// - `fields`: the union fields + /// - `mode`: the union mode + pub fn new_union(name: S, type_ids: T, fields: F, mode: UnionMode) -> Self + where + S: Into, + F: IntoIterator, + F::Item: Into, + T: IntoIterator, + { + Self::new( + name, + DataType::Union(UnionFields::new(type_ids, fields), mode), + false, // Unions cannot be nullable + ) + } + /// Sets the `Field`'s optional custom metadata. 
#[inline] pub fn set_metadata(&mut self, metadata: HashMap) { diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 501c5c7fdd39..2cc892f5a8c2 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -365,7 +365,7 @@ impl Hash for Schema { #[cfg(test)] mod tests { use crate::datatype::DataType; - use crate::{TimeUnit, UnionFields, UnionMode}; + use crate::{TimeUnit, UnionMode}; use super::*; @@ -775,54 +775,35 @@ mod tests { // support merge union fields assert_eq!( Schema::try_merge(vec![ - Schema::new(vec![Field::new( + Schema::new(vec![Field::new_union( "c1", - DataType::Union( - UnionFields::new( - vec![0, 1], - vec![ - Field::new("c11", DataType::Utf8, true), - Field::new("c12", DataType::Utf8, true), - ] - ), - UnionMode::Dense - ), - false + vec![0, 1], + vec![ + Field::new("c11", DataType::Utf8, true), + Field::new("c12", DataType::Utf8, true), + ], + UnionMode::Dense ),]), - Schema::new(vec![Field::new( + Schema::new(vec![Field::new_union( "c1", - DataType::Union( - UnionFields::new( - vec![1, 2], - vec![ - Field::new("c12", DataType::Utf8, true), - Field::new( - "c13", - DataType::Time64(TimeUnit::Second), - true - ), - ] - ), - UnionMode::Dense - ), - false + vec![1, 2], + vec![ + Field::new("c12", DataType::Utf8, true), + Field::new("c13", DataType::Time64(TimeUnit::Second), true), + ], + UnionMode::Dense ),]) ]) .unwrap(), - Schema::new(vec![Field::new( + Schema::new(vec![Field::new_union( "c1", - DataType::Union( - UnionFields::new( - vec![0, 1, 2], - vec![ - Field::new("c11", DataType::Utf8, true), - Field::new("c12", DataType::Utf8, true), - Field::new("c13", DataType::Time64(TimeUnit::Second), true), - ] - ), - UnionMode::Dense - ), - false + vec![0, 1, 2], + vec![ + Field::new("c11", DataType::Utf8, true), + Field::new("c12", DataType::Utf8, true), + Field::new("c13", DataType::Time64(TimeUnit::Second), true), + ], + UnionMode::Dense ),]), ); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 0515ed4e39e2..86f7764ec4cf 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -888,36 +888,28 @@ mod tests { // define schema let struct_field_d = Field::new("d", DataType::Float64, true); let struct_field_f = Field::new("f", DataType::Float32, true); - let struct_field_g = Field::new( - "g", - DataType::List(Arc::new(Field::new("item", DataType::Int16, true))), - false, - ); - let struct_field_h = Field::new( - "h", - DataType::List(Arc::new(Field::new("item", DataType::Int16, false))), - true, - ); - let struct_field_e = Field::new( + let struct_field_g = + Field::new_list("g", Field::new("item", DataType::Int16, true), false); + let struct_field_h = + Field::new_list("h", Field::new("item", DataType::Int16, false), true); + let struct_field_e = Field::new_struct( "e", - DataType::Struct(Fields::from(vec![ + vec![ struct_field_f.clone(), struct_field_g.clone(), struct_field_h.clone(), - ])), + ], false, ); - let schema = Schema::new(Fields::from(vec![ + let schema = Schema::new(vec![ Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, true), - Field::new( + Field::new_struct( "c", - DataType::Struct( - vec![struct_field_d.clone(), struct_field_e.clone()].into(), - ), + vec![struct_field_d.clone(), struct_field_e.clone()], false, ), - ])); + ]); // create some data let a = Int32Array::from(vec![1, 2, 3, 4, 5]); diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 25227aeeebc8..c1699aafcfe8 
100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -24,7 +24,7 @@ use crate::basic::{ConvertedType, Repetition}; use crate::errors::ParquetError; use crate::errors::Result; use crate::schema::types::{SchemaDescriptor, Type, TypePtr}; -use arrow_schema::{DataType, Field, Fields, Schema, SchemaBuilder}; +use arrow_schema::{DataType, Field, Schema, SchemaBuilder}; fn get_repetition(t: &Type) -> Repetition { let info = t.get_basic_info(); @@ -351,9 +351,9 @@ impl Visitor { _ => HashMap::default(), }; - let map_field = Field::new( + let map_field = Field::new_struct( map_key_value.name(), - DataType::Struct(Fields::from([key_field, value_field])), + [key_field, value_field], false, // The inner map field is always non-nullable (#1697) ) .with_metadata(field_metadata); diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 81ed5e8177bb..399dcba9e981 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -709,9 +709,9 @@ mod tests { // } // } { - arrow_fields.push(Field::new( + arrow_fields.push(Field::new_list( "my_list", - DataType::List(Arc::new(Field::new("element", DataType::Utf8, true))), + Field::new("element", DataType::Utf8, true), false, )); } @@ -723,9 +723,9 @@ mod tests { // } // } { - arrow_fields.push(Field::new( + arrow_fields.push(Field::new_list( "my_list", - DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), + Field::new("element", DataType::Utf8, false), true, )); } @@ -743,11 +743,10 @@ mod tests { // } // } { - let arrow_inner_list = - DataType::List(Arc::new(Field::new("element", DataType::Int32, false))); - arrow_fields.push(Field::new( + let arrow_inner_list = Field::new("element", DataType::Int32, false); + arrow_fields.push(Field::new_list( "array_of_arrays", - DataType::List(Arc::new(Field::new("element", arrow_inner_list, false))), + Field::new_list("element", arrow_inner_list, false), true, )); } @@ -759,9 +758,9 @@ mod tests { // }; // } { - arrow_fields.push(Field::new( + arrow_fields.push(Field::new_list( "my_list", - DataType::List(Arc::new(Field::new("str", DataType::Utf8, false))), + Field::new("str", DataType::Utf8, false), true, )); } @@ -771,9 +770,9 @@ mod tests { // repeated int32 element; // } { - arrow_fields.push(Field::new( + arrow_fields.push(Field::new_list( "my_list", - DataType::List(Arc::new(Field::new("element", DataType::Int32, false))), + Field::new("element", DataType::Int32, false), true, )); } @@ -786,13 +785,13 @@ mod tests { // }; // } { - let arrow_struct = DataType::Struct(Fields::from(vec![ + let fields = vec![ Field::new("str", DataType::Utf8, false), Field::new("num", DataType::Int32, false), - ])); - arrow_fields.push(Field::new( + ]; + arrow_fields.push(Field::new_list( "my_list", - DataType::List(Arc::new(Field::new("element", arrow_struct, false))), + Field::new_struct("element", fields, false), true, )); } @@ -805,11 +804,10 @@ mod tests { // } // Special case: group is named array { - let fields = vec![Field::new("str", DataType::Utf8, false)].into(); - let arrow_struct = DataType::Struct(fields); - arrow_fields.push(Field::new( + let fields = vec![Field::new("str", DataType::Utf8, false)]; + arrow_fields.push(Field::new_list( "my_list", - DataType::List(Arc::new(Field::new("array", arrow_struct, false))), + Field::new_struct("array", fields, false), true, )); } @@ -822,15 +820,10 @@ mod tests { // } // Special case: group named ends in _tuple { - let fields = vec![Field::new("str", DataType::Utf8, 
false)].into(); - let arrow_struct = DataType::Struct(fields); - arrow_fields.push(Field::new( + let fields = vec![Field::new("str", DataType::Utf8, false)]; + arrow_fields.push(Field::new_list( "my_list", - DataType::List(Arc::new(Field::new( - "my_list_tuple", - arrow_struct, - false, - ))), + Field::new_struct("my_list_tuple", fields, false), true, )); } @@ -838,9 +831,9 @@ mod tests { // One-level encoding: Only allows required lists with required cells // repeated value_type name { - arrow_fields.push(Field::new( + arrow_fields.push(Field::new_list( "name", - DataType::List(Arc::new(Field::new("name", DataType::Int32, false))), + Field::new("name", DataType::Int32, false), false, )); } @@ -889,9 +882,9 @@ mod tests { // } // } { - arrow_fields.push(Field::new( + arrow_fields.push(Field::new_list( "my_list1", - DataType::List(Arc::new(Field::new("element", DataType::Utf8, true))), + Field::new("element", DataType::Utf8, true), false, )); } @@ -903,9 +896,9 @@ mod tests { // } // } { - arrow_fields.push(Field::new( + arrow_fields.push(Field::new_list( "my_list2", - DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), + Field::new("element", DataType::Utf8, false), true, )); } @@ -917,9 +910,9 @@ mod tests { // } // } { - arrow_fields.push(Field::new( + arrow_fields.push(Field::new_list( "my_list3", - DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), + Field::new("element", DataType::Utf8, false), false, )); } @@ -973,19 +966,12 @@ mod tests { // } // } { - arrow_fields.push(Field::new( + arrow_fields.push(Field::new_map( "my_map1", - DataType::Map( - Arc::new(Field::new( - "key_value", - DataType::Struct(Fields::from(vec![ - Field::new("key", DataType::Utf8, false), - Field::new("value", DataType::Int32, true), - ])), - false, - )), - false, - ), + "key_value", + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Int32, true), + false, false, )); } @@ -998,19 +984,12 @@ mod tests { // } // } { - arrow_fields.push(Field::new( + arrow_fields.push(Field::new_map( "my_map2", - DataType::Map( - Arc::new(Field::new( - "map", - DataType::Struct(Fields::from(vec![ - Field::new("str", DataType::Utf8, false), - Field::new("num", DataType::Int32, false), - ])), - false, // (#1697) - )), - false, - ), + "map", + Field::new("str", DataType::Utf8, false), + Field::new("num", DataType::Int32, false), + false, true, )); } @@ -1023,19 +1002,12 @@ mod tests { // } // } { - arrow_fields.push(Field::new( + arrow_fields.push(Field::new_map( "my_map3", - DataType::Map( - Arc::new(Field::new( - "map", - DataType::Struct(Fields::from(vec![ - Field::new("key", DataType::Utf8, false), - Field::new("value", DataType::Int32, true), - ])), - false, // (#1697) - )), - false, - ), + "map", + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Int32, true), + false, true, )); } @@ -1199,28 +1171,23 @@ mod tests { { arrow_fields.push(Field::new("leaf1", DataType::Int32, true)); - let inner_group_list = Field::new( + let inner_group_list = Field::new_list( "innerGroup", - DataType::List(Arc::new(Field::new( + Field::new_struct( "innerGroup", - DataType::Struct( - vec![Field::new("leaf3", DataType::Int32, true)].into(), - ), + vec![Field::new("leaf3", DataType::Int32, true)], false, - ))), + ), false, ); - let outer_group_list = Field::new( + let outer_group_list = Field::new_list( "outerGroup", - DataType::List(Arc::new(Field::new( + Field::new_struct( "outerGroup", - DataType::Struct(Fields::from(vec![ - Field::new("leaf2", 
DataType::Int32, true), - inner_group_list, - ])), + vec![Field::new("leaf2", DataType::Int32, true), inner_group_list], false, - ))), + ), false, ); arrow_fields.push(outer_group_list); @@ -1300,9 +1267,9 @@ mod tests { Field::new("double", DataType::Float64, true), Field::new("float", DataType::Float32, true), Field::new("string", DataType::Utf8, true), - Field::new( + Field::new_list( "bools", - DataType::List(Arc::new(Field::new("bools", DataType::Boolean, false))), + Field::new("bools", DataType::Boolean, false), false, ), Field::new("date", DataType::Date32, true), @@ -1324,27 +1291,19 @@ mod tests { DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), false, ), - Field::new( + Field::new_list( "int_list", - DataType::List(Arc::new(Field::new("int_list", DataType::Int32, false))), + Field::new("int_list", DataType::Int32, false), false, ), - Field::new( + Field::new_list( "byte_list", - DataType::List(Arc::new(Field::new( - "byte_list", - DataType::Binary, - false, - ))), + Field::new("byte_list", DataType::Binary, false), false, ), - Field::new( + Field::new_list( "string_list", - DataType::List(Arc::new(Field::new( - "string_list", - DataType::Utf8, - false, - ))), + Field::new("string_list", DataType::Utf8, false), false, ), Field::new("decimal_int32", DataType::Decimal128(8, 2), false), @@ -1415,14 +1374,14 @@ mod tests { Field::new("double", DataType::Float64, true), Field::new("float", DataType::Float32, true), Field::new("string", DataType::Utf8, true), - Field::new( + Field::new_list( "bools", - DataType::List(Arc::new(Field::new("element", DataType::Boolean, true))), + Field::new("element", DataType::Boolean, true), true, ), - Field::new( + Field::new_list( "bools_non_null", - DataType::List(Arc::new(Field::new("element", DataType::Boolean, false))), + Field::new("element", DataType::Boolean, false), false, ), Field::new("date", DataType::Date32, true), @@ -1463,26 +1422,23 @@ mod tests { DataType::Timestamp(TimeUnit::Microsecond, Some("+01:00".into())), false, ), - Field::new( + Field::new_struct( "struct", - DataType::Struct(Fields::from(vec![ + vec![ Field::new("bools", DataType::Boolean, false), Field::new("uint32", DataType::UInt32, false), - Field::new( + Field::new_list( "int32", - DataType::List(Arc::new(Field::new( - "element", - DataType::Int32, - true, - ))), + Field::new("element", DataType::Int32, true), false, ), - ])), + ], false, ), - Field::new( + Field::new_dictionary( "dictionary_strings", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + DataType::Int32, + DataType::Utf8, false, ), Field::new("decimal_int32", DataType::Decimal128(8, 2), false), @@ -1600,9 +1556,9 @@ mod tests { ), Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), - Field::new( + Field::new_list( "c21", - DataType::List(Arc::new(Field::new("list", DataType::Boolean, true))), + Field::new("list", DataType::Boolean, true), false, ), // Field::new( @@ -1660,73 +1616,40 @@ mod tests { Field::new("c36", DataType::Decimal128(2, 1), false), Field::new("c37", DataType::Decimal128(50, 20), false), Field::new("c38", DataType::Decimal128(18, 12), true), - Field::new( + Field::new_map( "c39", - DataType::Map( - Arc::new(Field::new( - "key_value", - DataType::Struct(Fields::from(vec![ - Field::new("key", DataType::Utf8, false), - Field::new( - "value", - DataType::List(Arc::new(Field::new( - "element", - DataType::Utf8, - true, - ))), - true, - ), - ])), - false, // #1697 - 
)), - false, // fails to roundtrip keys_sorted + "key_value", + Field::new("key", DataType::Utf8, false), + Field::new_list( + "value", + Field::new("element", DataType::Utf8, true), + true, ), + false, // fails to roundtrip keys_sorted true, ), - Field::new( + Field::new_map( "c40", - DataType::Map( - Arc::new(Field::new( - "my_entries", - DataType::Struct(Fields::from(vec![ - Field::new("my_key", DataType::Utf8, false), - Field::new( - "my_value", - DataType::List(Arc::new(Field::new( - "item", - DataType::Utf8, - true, - ))), - true, - ), - ])), - false, // #1697 - )), - false, // fails to roundtrip keys_sorted + "my_entries", + Field::new("my_key", DataType::Utf8, false), + Field::new_list( + "my_value", + Field::new("item", DataType::Utf8, true), + true, ), + false, // fails to roundtrip keys_sorted true, ), - Field::new( + Field::new_map( "c41", - DataType::Map( - Arc::new(Field::new( - "my_entries", - DataType::Struct(Fields::from(vec![ - Field::new("my_key", DataType::Utf8, false), - Field::new( - "my_value", - DataType::List(Arc::new(Field::new( - "item", - DataType::Utf8, - true, - ))), - true, - ), - ])), - false, - )), - false, // fails to roundtrip keys_sorted + "my_entries", + Field::new("my_key", DataType::Utf8, false), + Field::new_list( + "my_value", + Field::new("item", DataType::Utf8, true), + true, ), + false, // fails to roundtrip keys_sorted false, ), ], @@ -1760,13 +1683,9 @@ mod tests { let schema = Schema::new_with_metadata( vec![ - Field::new( + Field::new_list( "c21", - DataType::List(Arc::new(Field::new( - "array", - DataType::Boolean, - true, - ))), + Field::new("array", DataType::Boolean, true), false, ), Field::new( @@ -1777,20 +1696,20 @@ mod tests { ), false, ), - Field::new( + Field::new_list( "c23", - DataType::List(Arc::new(Field::new( + Field::new_large_list( "items", - DataType::LargeList(Arc::new(Field::new( + Field::new_struct( "items", - DataType::Struct(Fields::from(vec![ + vec![ Field::new("a", DataType::Int16, true), Field::new("b", DataType::Float64, false), - ])), + ], true, - ))), + ), true, - ))), + ), true, ), ], From 27334b27be13188295aac88630d4bf92a66d527a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 31 Mar 2023 19:10:45 +0100 Subject: [PATCH 0772/1411] Update AWS SDK (#3993) --- object_store/Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 9bf104334ed9..fcdbd98ed9bb 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -52,9 +52,9 @@ ring = { version = "0.16", default-features = false, features = ["std"], optiona rustls-pemfile = { version = "1.0", default-features = false, optional = true } # AWS Profile support -aws-types = { version = "0.54", optional = true } -aws-credential-types = { version = "0.54", optional = true } -aws-config = { version = "0.54", optional = true } +aws-types = { version = "0.55", optional = true } +aws-credential-types = { version = "0.55", optional = true } +aws-config = { version = "0.55", optional = true } [target.'cfg(not(target_arch = "wasm32"))'.dependencies] tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util", "fs"] } From 8bac91d732bc3cfc01ecb2869bce013c9a6cf2e5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 1 Apr 2023 11:27:49 +0100 Subject: [PATCH 0773/1411] Panic instead of discarding nulls converting StructArray to RecordBatch - (#3951) 
(#3953) * Don't discard nulls converting StructArray to RecordBatch (#3951) * Add more docs --- arrow-array/src/array/struct_array.rs | 32 ++++++++++++- arrow-array/src/record_batch.rs | 48 ++++++++++++------- arrow/src/ffi_stream.rs | 4 +- .../src/arrow/array_reader/struct_array.rs | 5 +- parquet/src/arrow/arrow_reader/mod.rs | 14 +++--- 5 files changed, 75 insertions(+), 28 deletions(-) diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 0604f71d3294..e31594d4b073 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -18,12 +18,34 @@ use crate::{make_array, Array, ArrayRef, RecordBatch}; use arrow_buffer::{buffer_bin_or, Buffer, NullBuffer}; use arrow_data::ArrayData; -use arrow_schema::{ArrowError, DataType, Field, SchemaBuilder}; +use arrow_schema::{ArrowError, DataType, Field, Fields, SchemaBuilder}; use std::sync::Arc; use std::{any::Any, ops::Index}; /// A nested array type where each child (called *field*) is represented by a separate /// array. +/// +/// +/// # Comparison with [RecordBatch] +/// +/// Both [`RecordBatch`] and [`StructArray`] represent a collection of columns / arrays with the +/// same length. +/// +/// However, there are a couple of key differences: +/// +/// * [`StructArray`] can be nested within other [`Array`], including itself +/// * [`RecordBatch`] can contain top-level metadata on its associated [`Schema`][arrow_schema::Schema] +/// * [`StructArray`] can contain top-level nulls, i.e. `null` +/// * [`RecordBatch`] can only represent nulls in its child columns, i.e. `{"field": null}` +/// +/// [`StructArray`] is therefore a more general data container than [`RecordBatch`], and as such +/// code that needs to handle both will typically share an implementation in terms of +/// [`StructArray`] and convert to/from [`RecordBatch`] as necessary. +/// +/// [`From`] implementations are provided to facilitate this conversion, however, converting +/// from a [`StructArray`] containing top-level nulls to a [`RecordBatch`] will panic, as there +/// is no way to preserve them. +/// /// # Example: Create an array from a vector of fields /// /// ``` @@ -89,6 +111,14 @@ impl StructArray { } } + /// Returns the [`Fields`] of this [`StructArray`] + pub fn fields(&self) -> &Fields { + match self.data_type() { + DataType::Struct(f) => f, + _ => unreachable!(), + } + } + /// Return child array whose field name equals to column_name /// /// Note: A schema can currently have duplicate field names, in which case diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index db4bb1230ca7..081bd55fc650 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -446,23 +446,28 @@ impl Default for RecordBatchOptions { Self::new() } } +impl From for RecordBatch { + fn from(value: StructArray) -> Self { + assert_eq!( + value.null_count(), + 0, + "Cannot convert nullable StructArray to RecordBatch, see StructArray documentation" + ); + let row_count = value.len(); + let schema = Arc::new(Schema::new(value.fields().clone())); + let columns = value.boxed_fields; + + RecordBatch { + schema, + row_count, + columns, + } + } +} + impl From<&StructArray> for RecordBatch { - /// Create a record batch from struct array, where each field of - /// the `StructArray` becomes a `Field` in the schema. 
- /// - /// This currently does not flatten and nested struct types fn from(struct_array: &StructArray) -> Self { - if let DataType::Struct(fields) = struct_array.data_type() { - let schema = Schema::new(fields.clone()); - let columns = struct_array.boxed_fields.clone(); - RecordBatch { - schema: Arc::new(schema), - row_count: struct_array.len(), - columns, - } - } else { - unreachable!("unable to get datatype as struct") - } + struct_array.clone().into() } } @@ -558,7 +563,7 @@ mod tests { BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray, }; use arrow_buffer::{Buffer, ToByteSlice}; - use arrow_data::ArrayDataBuilder; + use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::Fields; #[test] @@ -1046,4 +1051,15 @@ mod tests { assert!(!options.match_field_names); assert_eq!(options.row_count.unwrap(), 20) } + + #[test] + #[should_panic(expected = "Cannot convert nullable StructArray to RecordBatch")] + fn test_from_struct() { + let s = StructArray::from(ArrayData::new_null( + // Note child is not nullable + &DataType::Struct(vec![Field::new("foo", DataType::Int32, false)].into()), + 2, + )); + let _ = RecordBatch::from(s); + } } diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index b1046d142f32..6b3067ab7d75 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -373,7 +373,7 @@ impl Iterator for ArrowArrayStreamReader { .to_data() .ok()?; - let record_batch = RecordBatch::from(&StructArray::from(data)); + let record_batch = RecordBatch::from(StructArray::from(data)); Some(Ok(record_batch)) } else { @@ -492,7 +492,7 @@ mod tests { .to_data() .unwrap(); - let record_batch = RecordBatch::from(&StructArray::from(array)); + let record_batch = RecordBatch::from(StructArray::from(array)); produced_batches.push(record_batch); } diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index 22724ae3f081..0670701a0375 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -217,6 +217,7 @@ mod tests { use crate::arrow::array_reader::ListArrayReader; use arrow::buffer::Buffer; use arrow::datatypes::Field; + use arrow_array::cast::AsArray; use arrow_array::{Array, Int32Array, ListArray}; use arrow_schema::Fields; @@ -252,7 +253,7 @@ mod tests { ); let struct_array = struct_array_reader.next_batch(5).unwrap(); - let struct_array = struct_array.as_any().downcast_ref::().unwrap(); + let struct_array = struct_array.as_struct(); assert_eq!(5, struct_array.len()); assert_eq!( @@ -328,7 +329,7 @@ mod tests { ); let actual = struct_reader.next_batch(1024).unwrap(); - let actual = actual.as_any().downcast_ref::().unwrap(); + let actual = actual.as_struct(); assert_eq!(actual, &expected) } } diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index ba322e29d868..4b88a33f3a25 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -20,7 +20,8 @@ use std::collections::VecDeque; use std::sync::Arc; -use arrow_array::{Array, StructArray}; +use arrow_array::cast::AsArray; +use arrow_array::Array; use arrow_array::{RecordBatch, RecordBatchReader}; use arrow_schema::{ArrowError, DataType as ArrowType, Schema, SchemaRef}; use arrow_select::filter::prep_null_mask_filter; @@ -559,12 +560,11 @@ impl Iterator for ParquetRecordBatchReader { match self.array_reader.consume_batch() { Err(error) => Some(Err(error.into())), Ok(array) => { - let struct_array = - 
array.as_any().downcast_ref::().ok_or_else(|| { - ArrowError::ParquetError( - "Struct array reader should return struct array".to_string(), - ) - }); + let struct_array = array.as_struct_opt().ok_or_else(|| { + ArrowError::ParquetError( + "Struct array reader should return struct array".to_string(), + ) + }); match struct_array { Err(err) => Some(Err(err)), From 4e7bb45050622d5b43505aa64dacf410cb329941 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 1 Apr 2023 11:27:29 -0700 Subject: [PATCH 0774/1411] Handle precision overflow when casting from integer to decimal (#3996) * Handle overflow precision when casting from integer to decimal * fix clippy * Update test * Update test --- arrow-cast/src/cast.rs | 73 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 9886decd9ddc..e4f4370fdde5 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -358,13 +358,29 @@ where let array = if scale < 0 { match cast_options.safe { - true => array.unary_opt::<_, D>(|v| v.as_().div_checked(scale_factor).ok()), - false => array.try_unary::<_, D, _>(|v| v.as_().div_checked(scale_factor))?, + true => array.unary_opt::<_, D>(|v| { + v.as_().div_checked(scale_factor).ok().and_then(|v| { + (D::validate_decimal_precision(v, precision).is_ok()).then_some(v) + }) + }), + false => array.try_unary::<_, D, _>(|v| { + v.as_() + .div_checked(scale_factor) + .and_then(|v| D::validate_decimal_precision(v, precision).map(|_| v)) + })?, } } else { match cast_options.safe { - true => array.unary_opt::<_, D>(|v| v.as_().mul_checked(scale_factor).ok()), - false => array.try_unary::<_, D, _>(|v| v.as_().mul_checked(scale_factor))?, + true => array.unary_opt::<_, D>(|v| { + v.as_().mul_checked(scale_factor).ok().and_then(|v| { + (D::validate_decimal_precision(v, precision).is_ok()).then_some(v) + }) + }), + false => array.try_unary::<_, D, _>(|v| { + v.as_() + .mul_checked(scale_factor) + .and_then(|v| D::validate_decimal_precision(v, precision).map(|_| v)) + })?, } }; @@ -4375,8 +4391,7 @@ mod tests { assert!(casted_array.is_ok()); let array = casted_array.unwrap(); let array: &Decimal128Array = array.as_primitive(); - let err = array.validate_decimal_precision(3); - assert_eq!("Invalid argument error: 1000 is too large to store in a Decimal128 of precision 3. Max is 999", err.unwrap_err().to_string()); + assert!(array.is_null(4)); // test i8 to decimal type with overflow the result type // the 100 will be converted to 1000_i128, but it is out of range for max value in the precision 3. @@ -4385,8 +4400,7 @@ mod tests { assert!(casted_array.is_ok()); let array = casted_array.unwrap(); let array: &Decimal128Array = array.as_primitive(); - let err = array.validate_decimal_precision(3); - assert_eq!("Invalid argument error: 1000 is too large to store in a Decimal128 of precision 3. Max is 999", err.unwrap_err().to_string()); + assert!(array.is_null(4)); // test f32 to decimal type let array = Float32Array::from(vec![ @@ -4544,8 +4558,7 @@ mod tests { assert!(casted_array.is_ok()); let array = casted_array.unwrap(); let array: &Decimal256Array = array.as_primitive(); - let err = array.validate_decimal_precision(3); - assert_eq!("Invalid argument error: 1000 is too large to store in a Decimal256 of precision 3. 
Max is 999", err.unwrap_err().to_string()); + assert!(array.is_null(4)); // test f32 to decimal type let array = Float32Array::from(vec![ @@ -8132,4 +8145,44 @@ mod tests { .unwrap(); assert_eq!(1672531200000000000, c.value(0)); } + + #[test] + fn test_cast_numeric_to_decimal128_precision_overflow() { + let array = Int64Array::from(vec![1234567]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal128(7, 3), + &CastOptions { safe: true }, + ); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + + let err = cast_with_options( + &array, + &DataType::Decimal128(7, 3), + &CastOptions { safe: false }, + ); + assert_eq!("Invalid argument error: 1234567000 is too large to store in a Decimal128 of precision 7. Max is 9999999", err.unwrap_err().to_string()); + } + + #[test] + fn test_cast_numeric_to_decimal256_precision_overflow() { + let array = Int64Array::from(vec![1234567]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal256(7, 3), + &CastOptions { safe: true }, + ); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + + let err = cast_with_options( + &array, + &DataType::Decimal256(7, 3), + &CastOptions { safe: false }, + ); + assert_eq!("Invalid argument error: 1234567000 is too large to store in a Decimal256 of precision 7. Max is 9999999", err.unwrap_err().to_string()); + } } From 591f0ef45c8184c2cbf5d46fc1ddf81c207e55ee Mon Sep 17 00:00:00 2001 From: comphead Date: Sat, 1 Apr 2023 13:45:54 -0700 Subject: [PATCH 0775/1411] Support CAST from Decimal datatype to String (#3994) * Support CAST Decimal to Utf8 * remove null * fmt * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * comments --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies --- arrow-cast/src/cast.rs | 61 +++++++++++++++++++++++++++++++++++++++ arrow/tests/array_cast.rs | 4 +-- 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index e4f4370fdde5..5d7bea0e9d0f 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -145,6 +145,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { // decimal to signed numeric (Decimal128(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) | (Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) => true, + // decimal to Utf8 + (Decimal128(_, _), Utf8 | LargeUtf8) => true, + (Decimal256(_, _), Utf8 | LargeUtf8) => true, // Utf8 to decimal (Utf8 | LargeUtf8, Decimal128(_, _)) => true, (Utf8 | LargeUtf8, Decimal256(_, _)) => true, @@ -826,6 +829,8 @@ pub fn cast_with_options( x as f64 / 10_f64.powi(*scale as i32) }) } + Utf8 => value_to_string::(array), + LargeUtf8 => value_to_string::(array), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported" @@ -893,6 +898,8 @@ pub fn cast_with_options( x.to_f64().unwrap() / 10_f64.powi(*scale as i32) }) } + Utf8 => value_to_string::(array), + LargeUtf8 => value_to_string::(array), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported" @@ -8146,6 +8153,60 @@ mod tests { assert_eq!(1672531200000000000, c.value(0)); } + #[test] + 
fn test_cast_decimal_to_utf8() { + fn test_decimal_to_string( + output_type: DataType, + array: PrimitiveArray, + ) { + let b = cast(&array, &output_type).unwrap(); + + assert_eq!(b.data_type(), &output_type); + let c = b.as_string::(); + + assert_eq!("1123.454", c.value(0)); + assert_eq!("2123.456", c.value(1)); + assert_eq!("-3123.453", c.value(2)); + assert_eq!("-3123.456", c.value(3)); + assert_eq!("0.000", c.value(4)); + assert_eq!("0.123", c.value(5)); + assert_eq!("1234.567", c.value(6)); + assert_eq!("-1234.567", c.value(7)); + assert!(c.is_null(8)); + } + let array128: Vec> = vec![ + Some(1123454), + Some(2123456), + Some(-3123453), + Some(-3123456), + Some(0), + Some(123), + Some(123456789), + Some(-123456789), + None, + ]; + + let array256: Vec> = + array128.iter().map(|v| v.map(i256::from_i128)).collect(); + + test_decimal_to_string::( + DataType::Utf8, + create_decimal_array(array128.clone(), 7, 3).unwrap(), + ); + test_decimal_to_string::( + DataType::LargeUtf8, + create_decimal_array(array128, 7, 3).unwrap(), + ); + test_decimal_to_string::( + DataType::Utf8, + create_decimal256_array(array256.clone(), 7, 3).unwrap(), + ); + test_decimal_to_string::( + DataType::LargeUtf8, + create_decimal256_array(array256, 7, 3).unwrap(), + ); + } + #[test] fn test_cast_numeric_to_decimal128_precision_overflow() { let array = Int64Array::from(vec![1234567]); diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 2807bbd79b83..96a4f2b41f3c 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -185,9 +185,7 @@ fn get_arrays_of_all_types() -> Vec { Arc::new(DurationMillisecondArray::from(vec![1000, 2000])), Arc::new(DurationMicrosecondArray::from(vec![1000, 2000])), Arc::new(DurationNanosecondArray::from(vec![1000, 2000])), - Arc::new( - create_decimal_array(vec![Some(1), Some(2), Some(3), None], 38, 0).unwrap(), - ), + Arc::new(create_decimal_array(vec![Some(1), Some(2), Some(3)], 38, 0).unwrap()), make_dictionary_primitive::(vec![1, 2]), make_dictionary_primitive::(vec![1, 2]), make_dictionary_primitive::(vec![1, 2]), From e90c9b411f8f6be3d69907115ebb486e103e640d Mon Sep 17 00:00:00 2001 From: Tsui Yik Ching Date: Sun, 2 Apr 2023 14:09:50 +0100 Subject: [PATCH 0776/1411] Remove non-existent feature from README (#4001) The feature was removed in PR #674, but the feature list in the README wasn't updated --- arrow/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/arrow/README.md b/arrow/README.md index d7a5877b49fa..1e8da360f443 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -46,7 +46,6 @@ The `arrow` crate provides the following features which may be enabled in your ` - `ipc` (default) - support for reading [Arrow IPC Format](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc), also used as the wire protocol in [arrow-flight](https://crates.io/crates/arrow-flight) - `ipc_compression` - Enables reading and writing compressed IPC streams (also enables `ipc`) - `prettyprint` - support for formatting record batches as textual columns -- `js` - support for building arrow for WebAssembly / JavaScript - `simd` - (_Requires Nightly Rust_) Use alternate hand optimized implementations of some [compute](https://github.com/apache/arrow-rs/tree/master/arrow/src/compute/kernels) kernels using explicit SIMD instructions via [packed_simd_2](https://docs.rs/packed_simd_2/latest/packed_simd_2/). 
From ecd44fd4cd1ab65533979983791d5ef524a2eac6 Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Sun, 2 Apr 2023 19:05:01 +0300 Subject: [PATCH 0777/1411] feat: add the implementation BitXor to BooleanBuffer (#3997) * feat: add the implementation BitXor to BooleanBuffer * feat: add tests to "BooleanBuffer" for BitAnd, BitOr, BitXor, Not --- arrow-buffer/src/buffer/boolean.rs | 87 +++++++++++++++++++++++++++++- arrow-buffer/src/buffer/ops.rs | 25 +++++++++ 2 files changed, 110 insertions(+), 2 deletions(-) diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index c89cfb3324c9..2202e46d2876 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -18,9 +18,10 @@ use crate::bit_chunk_iterator::BitChunks; use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator}; use crate::{ - bit_util, buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer, + bit_util, buffer_bin_and, buffer_bin_or, buffer_bin_xor, buffer_unary_not, Buffer, + MutableBuffer, }; -use std::ops::{BitAnd, BitOr, Not}; +use std::ops::{BitAnd, BitOr, BitXor, Not}; /// A slice-able [`Buffer`] containing bit-packed booleans #[derive(Debug, Clone, Eq)] @@ -236,6 +237,25 @@ impl BitOr<&BooleanBuffer> for &BooleanBuffer { } } +impl BitXor<&BooleanBuffer> for &BooleanBuffer { + type Output = BooleanBuffer; + + fn bitxor(self, rhs: &BooleanBuffer) -> Self::Output { + assert_eq!(self.len, rhs.len); + BooleanBuffer { + buffer: buffer_bin_xor( + &self.buffer, + self.offset, + &rhs.buffer, + rhs.offset, + self.len, + ), + offset: 0, + len: self.len, + } + } +} + impl<'a> IntoIterator for &'a BooleanBuffer { type Item = bool; type IntoIter = BitIterator<'a>; @@ -244,3 +264,66 @@ impl<'a> IntoIterator for &'a BooleanBuffer { BitIterator::new(self.values(), self.offset, self.len) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bitand() { + let offset = 0; + let len = 40; + + let buf1 = Buffer::from(&[0, 1, 1, 0, 0]); + let boolean_buf1 = &BooleanBuffer::new(buf1, offset, len); + + let buf2 = Buffer::from(&[0, 1, 1, 1, 0]); + let boolean_buf2 = &BooleanBuffer::new(buf2, offset, len); + + let expected = BooleanBuffer::new(Buffer::from(&[0, 1, 1, 0, 0]), offset, len); + assert_eq!(boolean_buf1 & boolean_buf2, expected); + } + + #[test] + fn test_bitor() { + let offset = 0; + let len = 40; + + let buf1 = Buffer::from(&[0, 1, 1, 0, 0]); + let boolean_buf1 = &BooleanBuffer::new(buf1, offset, len); + + let buf2 = Buffer::from(&[0, 1, 1, 1, 0]); + let boolean_buf2 = &BooleanBuffer::new(buf2, offset, len); + + let expected = BooleanBuffer::new(Buffer::from(&[0, 1, 1, 1, 0]), offset, len); + assert_eq!(boolean_buf1 | boolean_buf2, expected); + } + + #[test] + fn test_bitxor() { + let offset = 0; + let len = 40; + + let buf1 = Buffer::from(&[0, 1, 1, 0, 0]); + let boolean_buf1 = &BooleanBuffer::new(buf1, offset, len); + + let buf2 = Buffer::from(&[0, 1, 1, 1, 0]); + let boolean_buf2 = &BooleanBuffer::new(buf2, offset, len); + + let expected = BooleanBuffer::new(Buffer::from(&[0, 0, 0, 1, 0]), offset, len); + assert_eq!(boolean_buf1 ^ boolean_buf2, expected); + } + + #[test] + fn test_not() { + let offset = 0; + let len = 40; + + let buf = Buffer::from(&[0, 1, 1, 0, 0]); + let boolean_buf = &BooleanBuffer::new(buf, offset, len); + + let expected = + BooleanBuffer::new(Buffer::from(&[255, 254, 254, 255, 255]), offset, len); + assert_eq!(!boolean_buf, expected); + } +} diff --git a/arrow-buffer/src/buffer/ops.rs 
b/arrow-buffer/src/buffer/ops.rs index 87dc5c003fb2..eccff6280dd8 100644 --- a/arrow-buffer/src/buffer/ops.rs +++ b/arrow-buffer/src/buffer/ops.rs @@ -125,6 +125,8 @@ where result.into() } +/// Apply a bitwise and to two inputs and return the result as a Buffer. +/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. pub fn buffer_bin_and( left: &Buffer, left_offset_in_bits: usize, @@ -142,6 +144,8 @@ pub fn buffer_bin_and( ) } +/// Apply a bitwise or to two inputs and return the result as a Buffer. +/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. pub fn buffer_bin_or( left: &Buffer, left_offset_in_bits: usize, @@ -159,6 +163,27 @@ pub fn buffer_bin_or( ) } +/// Apply a bitwise xor to two inputs and return the result as a Buffer. +/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +pub fn buffer_bin_xor( + left: &Buffer, + left_offset_in_bits: usize, + right: &Buffer, + right_offset_in_bits: usize, + len_in_bits: usize, +) -> Buffer { + bitwise_bin_op_helper( + left, + left_offset_in_bits, + right, + right_offset_in_bits, + len_in_bits, + |a, b| a ^ b, + ) +} + +/// Apply a bitwise not to one input and return the result as a Buffer. +/// The input is treated as a bitmap, meaning that offset and length are specified in number of bits. pub fn buffer_unary_not( left: &Buffer, offset_in_bits: usize, From 533e98325270cc4e399344279a07e397271a1340 Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Mon, 3 Apr 2023 14:23:25 +0300 Subject: [PATCH 0778/1411] Minor: add tests for BooleanBuffer (#4004) * feat: add tests for BooleanBuffer * fix: use "offset" and "len" like methods --- arrow-buffer/src/buffer/boolean.rs | 72 ++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 4 deletions(-) diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 2202e46d2876..ffee13bd4956 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -270,7 +270,71 @@ mod tests { use super::*; #[test] - fn test_bitand() { + fn test_boolean_new() { + let bytes = &[0, 1, 2, 3, 4]; + let buf = Buffer::from(bytes); + let offset = 0; + let len = 24; + + let boolean_buf = BooleanBuffer::new(buf.clone(), offset, len); + assert_eq!(bytes, boolean_buf.values()); + assert_eq!(offset, boolean_buf.offset()); + assert_eq!(len, boolean_buf.len()); + + assert_eq!(2, boolean_buf.count_set_bits()); + assert_eq!(&buf, boolean_buf.inner()); + assert_eq!(buf, boolean_buf.clone().into_inner()); + + assert!(!boolean_buf.is_empty()) + } + + #[test] + fn test_boolean_data_equality() { + let boolean_buf1 = BooleanBuffer::new(Buffer::from(&[0, 1, 4, 3, 5]), 0, 32); + let boolean_buf2 = BooleanBuffer::new(Buffer::from(&[0, 1, 4, 3, 5]), 0, 32); + assert_eq!(boolean_buf1, boolean_buf2); + + // slice with same offset and same length should still preserve equality + let boolean_buf3 = boolean_buf1.slice(8, 16); + assert_ne!(boolean_buf1, boolean_buf3); + let boolean_buf4 = boolean_buf1.slice(0, 32); + assert_eq!(boolean_buf1, boolean_buf4); + + // unequal because of different elements + let boolean_buf2 = BooleanBuffer::new(Buffer::from(&[0, 0, 2, 3, 4]), 0, 32); + assert_ne!(boolean_buf1, boolean_buf2); + + // unequal because of different length + let boolean_buf2 = BooleanBuffer::new(Buffer::from(&[0, 1, 4, 3, 5]), 0, 24); + assert_ne!(boolean_buf1, boolean_buf2); + + // ptr_eq + 
assert!(boolean_buf1.ptr_eq(&boolean_buf1)); + assert!(boolean_buf2.ptr_eq(&boolean_buf2)); + assert!(!boolean_buf1.ptr_eq(&boolean_buf2)); + } + + #[test] + fn test_boolean_slice() { + let bytes = &[0, 3, 2, 6, 2]; + let boolean_buf1 = BooleanBuffer::new(Buffer::from(bytes), 0, 32); + let boolean_buf2 = BooleanBuffer::new(Buffer::from(bytes), 0, 32); + + let boolean_slice1 = boolean_buf1.slice(16, 16); + let boolean_slice2 = boolean_buf2.slice(0, 16); + assert_eq!(boolean_slice1.values(), boolean_slice2.values()); + + assert_eq!(bytes, boolean_slice1.values()); + assert_eq!(16, boolean_slice1.offset); + assert_eq!(16, boolean_slice1.len); + + assert_eq!(bytes, boolean_slice2.values()); + assert_eq!(0, boolean_slice2.offset); + assert_eq!(16, boolean_slice2.len); + } + + #[test] + fn test_boolean_bitand() { let offset = 0; let len = 40; @@ -285,7 +349,7 @@ mod tests { } #[test] - fn test_bitor() { + fn test_boolean_bitor() { let offset = 0; let len = 40; @@ -300,7 +364,7 @@ mod tests { } #[test] - fn test_bitxor() { + fn test_boolean_bitxor() { let offset = 0; let len = 40; @@ -315,7 +379,7 @@ mod tests { } #[test] - fn test_not() { + fn test_boolean_not() { let offset = 0; let len = 40; From 877a3a05a3791cdac14a1a08b23396ac86df4017 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 3 Apr 2023 12:55:22 +0100 Subject: [PATCH 0779/1411] Deprecate combine_option_bitmap (#4005) --- arrow-data/src/bit_mask.rs | 3 +++ arrow-string/src/concat_elements.rs | 14 ++++---------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/arrow-data/src/bit_mask.rs b/arrow-data/src/bit_mask.rs index 94ea57259ac8..d978f2b74618 100644 --- a/arrow-data/src/bit_mask.rs +++ b/arrow-data/src/bit_mask.rs @@ -68,6 +68,7 @@ pub fn set_bits( /// Combines the null bitmaps of multiple arrays using a bitwise `and` operation. /// /// This function is useful when implementing operations on higher level arrays. 
+#[deprecated(note = "Use NullBuffer::union")] pub fn combine_option_bitmap( arrays: &[&ArrayData], len_in_bits: usize, @@ -247,6 +248,7 @@ mod tests { } #[test] + #[allow(deprecated)] fn test_combine_option_bitmap() { let none_bitmap = make_data_with_null_bit_buffer(8, 0, None); let some_bitmap = @@ -298,6 +300,7 @@ mod tests { } #[test] + #[allow(deprecated)] fn test_combine_option_bitmap_with_offsets() { let none_bitmap = make_data_with_null_bit_buffer(8, 0, None); let bitmap0 = diff --git a/arrow-string/src/concat_elements.rs b/arrow-string/src/concat_elements.rs index 4da9e2539e7e..a6e02d04dd3f 100644 --- a/arrow-string/src/concat_elements.rs +++ b/arrow-string/src/concat_elements.rs @@ -21,7 +21,6 @@ use arrow_array::builder::BufferBuilder; use arrow_array::types::ByteArrayType; use arrow_array::*; use arrow_buffer::{ArrowNativeType, NullBuffer}; -use arrow_data::bit_mask::combine_option_bitmap; use arrow_data::ArrayDataBuilder; use arrow_schema::{ArrowError, DataType}; @@ -125,14 +124,9 @@ pub fn concat_elements_utf8_many( ))); } - let output_bitmap = combine_option_bitmap( - arrays - .iter() - .map(|a| a.data()) - .collect::>() - .as_slice(), - size, - ); + let nulls = arrays + .iter() + .fold(None, |acc, a| NullBuffer::union(acc.as_ref(), a.nulls())); let data_values = arrays .iter() @@ -170,7 +164,7 @@ pub fn concat_elements_utf8_many( .len(size) .add_buffer(output_offsets.finish()) .add_buffer(output_values.finish()) - .null_bit_buffer(output_bitmap); + .nulls(nulls); // SAFETY - offsets valid by construction Ok(unsafe { builder.build_unchecked() }.into()) From 5a63a63f8c8a9a46840a072d72786846e7b1bb89 Mon Sep 17 00:00:00 2001 From: Huxley Hu Date: Mon, 3 Apr 2023 19:55:49 +0800 Subject: [PATCH 0780/1411] Support to read/write customized metadata in ipc files (#4003) Test Plan: Pass CI --- arrow-ipc/src/convert.rs | 52 ++++++++++++++++++---------------------- arrow-ipc/src/reader.rs | 38 +++++++++++++++++++++++++++++ arrow-ipc/src/writer.rs | 12 ++++++++++ 3 files changed, 73 insertions(+), 29 deletions(-) diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index c7b5559fa49a..07f716dea843 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -24,7 +24,7 @@ use flatbuffers::{ use std::collections::HashMap; use std::sync::Arc; -use crate::{size_prefixed_root_as_message, CONTINUATION_MARKER}; +use crate::{size_prefixed_root_as_message, KeyValue, CONTINUATION_MARKER}; use DataType::*; /// Serialize a schema in IPC format @@ -38,6 +38,25 @@ pub fn schema_to_fb(schema: &Schema) -> FlatBufferBuilder { fbb } +pub fn metadata_to_fb<'a>( + fbb: &mut FlatBufferBuilder<'a>, + metadata: &HashMap, +) -> WIPOffset>>> { + let custom_metadata = metadata + .iter() + .map(|(k, v)| { + let fb_key_name = fbb.create_string(k); + let fb_val_name = fbb.create_string(v); + + let mut kv_builder = crate::KeyValueBuilder::new(fbb); + kv_builder.add_key(fb_key_name); + kv_builder.add_value(fb_val_name); + kv_builder.finish() + }) + .collect::>(); + fbb.create_vector(&custom_metadata) +} + pub fn schema_to_fb_offset<'a>( fbb: &mut FlatBufferBuilder<'a>, schema: &Schema, @@ -49,24 +68,8 @@ pub fn schema_to_fb_offset<'a>( .collect::>(); let fb_field_list = fbb.create_vector(&fields); - let fb_metadata_list = if !schema.metadata().is_empty() { - let custom_metadata = schema - .metadata() - .iter() - .map(|(k, v)| { - let fb_key_name = fbb.create_string(k); - let fb_val_name = fbb.create_string(v); - - let mut kv_builder = crate::KeyValueBuilder::new(fbb); - 
kv_builder.add_key(fb_key_name); - kv_builder.add_value(fb_val_name); - kv_builder.finish() - }) - .collect::>(); - Some(fbb.create_vector(&custom_metadata)) - } else { - None - }; + let fb_metadata_list = + (!schema.metadata().is_empty()).then(|| metadata_to_fb(fbb, schema.metadata())); let mut builder = crate::SchemaBuilder::new(fbb); builder.add_fields(fb_field_list); @@ -440,16 +443,7 @@ pub(crate) fn build_field<'a>( // Optional custom metadata. let mut fb_metadata = None; if !field.metadata().is_empty() { - let mut kv_vec = vec![]; - for (k, v) in field.metadata() { - let kv_args = crate::KeyValueArgs { - key: Some(fbb.create_string(k.as_str())), - value: Some(fbb.create_string(v.as_str())), - }; - let kv_offset = crate::KeyValue::create(fbb, &kv_args); - kv_vec.push(kv_offset); - } - fb_metadata = Some(fbb.create_vector(&kv_vec)); + fb_metadata = Some(metadata_to_fb(fbb, field.metadata())); }; let fb_field_name = fbb.create_string(field.name().as_str()); diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 75d078456f0a..60633487aeaf 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -792,6 +792,9 @@ pub struct FileReader { /// Metadata version metadata_version: crate::MetadataVersion, + /// User defined metadata + custom_metadata: HashMap, + /// Optional projection and projected_schema projection: Option<(Vec, Schema)>, } @@ -862,6 +865,16 @@ impl FileReader { let ipc_schema = footer.schema().unwrap(); let schema = crate::convert::fb_to_schema(ipc_schema); + let mut custom_metadata = HashMap::new(); + if let Some(fb_custom_metadata) = footer.custom_metadata() { + for kv in fb_custom_metadata.into_iter() { + custom_metadata.insert( + kv.key().unwrap().to_string(), + kv.value().unwrap().to_string(), + ); + } + } + // Create an array of optional dictionary value arrays, one per field. 
let mut dictionaries_by_id = HashMap::new(); if let Some(dictionaries) = footer.dictionaries() { @@ -926,10 +939,16 @@ impl FileReader { total_blocks, dictionaries_by_id, metadata_version: footer.version(), + custom_metadata, projection, }) } + /// Return user defined customized metadata + pub fn custom_metadata(&self) -> &HashMap { + &self.custom_metadata + } + /// Return the number of batches in the file pub fn num_batches(&self) -> usize { self.total_blocks @@ -1522,6 +1541,25 @@ mod tests { reader.next().unwrap().unwrap() } + #[test] + fn test_roundtrip_with_custom_metadata() { + let schema = Schema::new(vec![Field::new("dummy", DataType::Float64, false)]); + let mut buf = Vec::new(); + let mut writer = crate::writer::FileWriter::try_new(&mut buf, &schema).unwrap(); + let mut test_metadata = HashMap::new(); + test_metadata.insert("abc".to_string(), "abc".to_string()); + test_metadata.insert("def".to_string(), "def".to_string()); + for (k, v) in &test_metadata { + writer.write_metadata(k, v); + } + writer.finish().unwrap(); + drop(writer); + + let reader = + crate::reader::FileReader::try_new(std::io::Cursor::new(buf), None).unwrap(); + assert_eq!(reader.custom_metadata(), &test_metadata); + } + #[test] fn test_roundtrip_nested_dict() { let inner: DictionaryArray = vec!["a", "b", "a"].into_iter().collect(); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 12c173f64c0f..7d29f048a762 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -706,6 +706,8 @@ pub struct FileWriter { finished: bool, /// Keeps track of dictionaries that have been written dictionary_tracker: DictionaryTracker, + /// User level customized metadata + custom_metadata: HashMap, data_gen: IpcDataGenerator, } @@ -742,10 +744,15 @@ impl FileWriter { record_blocks: vec![], finished: false, dictionary_tracker: DictionaryTracker::new(true), + custom_metadata: HashMap::new(), data_gen, }) } + pub fn write_metadata(&mut self, key: impl Into, value: impl Into) { + self.custom_metadata.insert(key.into(), value.into()); + } + /// Write a record batch to the file pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { if self.finished { @@ -798,6 +805,8 @@ impl FileWriter { let dictionaries = fbb.create_vector(&self.dictionary_blocks); let record_batches = fbb.create_vector(&self.record_blocks); let schema = crate::convert::schema_to_fb_offset(&mut fbb, &self.schema); + let fb_custom_metadata = (!self.custom_metadata.is_empty()) + .then(|| crate::convert::metadata_to_fb(&mut fbb, &self.custom_metadata)); let root = { let mut footer_builder = crate::FooterBuilder::new(&mut fbb); @@ -805,6 +814,9 @@ impl FileWriter { footer_builder.add_schema(schema); footer_builder.add_dictionaries(dictionaries); footer_builder.add_recordBatches(record_batches); + if let Some(fb_custom_metadata) = fb_custom_metadata { + footer_builder.add_custom_metadata(fb_custom_metadata); + } footer_builder.finish() }; fbb.finish(root, None); From d7bba0ad26f4ef8b5a9bc142c78e84154f208e19 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 3 Apr 2023 13:51:35 +0100 Subject: [PATCH 0781/1411] Cleanup more uses of Array::data (#3880) (#4002) * Cleanup more uses of Array::data (#3880) * Fix failing test * Fix parquet map array * Further cleanup * Further cleanup * More cleanup * Fix test * Clippy --- arrow-array/src/array/binary_array.rs | 4 +- arrow-array/src/array/dictionary_array.rs | 16 +-- .../src/array/fixed_size_binary_array.rs | 2 +- 
.../src/array/fixed_size_list_array.rs | 18 +-- arrow-array/src/array/list_array.rs | 12 +- arrow-array/src/array/map_array.rs | 22 ++- arrow-array/src/array/mod.rs | 4 +- arrow-array/src/array/primitive_array.rs | 23 ++-- arrow-array/src/array/run_array.rs | 6 +- arrow-array/src/array/string_array.rs | 4 +- arrow-array/src/array/struct_array.rs | 10 +- arrow-array/src/array/union_array.rs | 2 +- .../src/builder/generic_list_builder.rs | 56 ++++---- arrow-array/src/builder/struct_builder.rs | 11 +- arrow-cast/src/cast.rs | 8 +- arrow-ipc/src/reader.rs | 10 +- arrow-json/src/reader.rs | 44 ++---- arrow-json/src/writer.rs | 4 +- arrow-ord/src/comparison.rs | 7 +- arrow-ord/src/ord.rs | 45 +++---- arrow-row/src/lib.rs | 25 ++-- arrow-row/src/list.rs | 2 +- arrow-select/src/filter.rs | 46 +++---- arrow-select/src/nullif.rs | 12 +- arrow-select/src/take.rs | 2 +- arrow-string/src/like.rs | 2 +- arrow/benches/array_data_validate.rs | 6 +- arrow/src/array/ffi.rs | 44 +++--- arrow/src/compute/kernels/limit.rs | 4 +- arrow/src/ffi.rs | 4 +- arrow/src/util/data_gen.rs | 4 +- arrow/tests/array_equal.rs | 10 +- arrow/tests/array_transform.rs | 125 +++++++++--------- arrow/tests/array_validation.rs | 10 +- .../array_reader/byte_array_dictionary.rs | 5 +- parquet/src/arrow/array_reader/list_array.rs | 10 +- parquet/src/arrow/array_reader/map_array.rs | 2 +- .../src/arrow/array_reader/struct_array.rs | 4 +- parquet/src/arrow/arrow_reader/mod.rs | 6 +- parquet/src/arrow/arrow_writer/levels.rs | 57 ++++---- parquet/src/arrow/arrow_writer/mod.rs | 76 +++++------ parquet/src/arrow/buffer/dictionary_buffer.rs | 3 +- 42 files changed, 352 insertions(+), 415 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index ccce3cda9989..be861474f659 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -683,7 +683,7 @@ mod tests { let data = vec![None]; let array = BinaryArray::from(data); array - .data() + .into_data() .validate_full() .expect("All null array has valid array data"); } @@ -693,7 +693,7 @@ mod tests { let data = vec![None]; let array = LargeBinaryArray::from(data); array - .data() + .into_data() .validate_full() .expect("All null array has valid array data"); } diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 343fed76846a..dd6213d543ea 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -252,11 +252,11 @@ impl DictionaryArray { // Note: This use the ArrayDataBuilder::build_unchecked and afterwards // call the new function which only validates that the keys are in bounds. 
- let data = keys.data().clone(); + let data = keys.to_data(); let builder = data .into_builder() .data_type(dict_data_type) - .add_child_data(values.data().clone()); + .add_child_data(values.to_data()); // Safety: `validate` ensures key type is correct, and // `validate_values` ensures all offsets are within range @@ -397,7 +397,7 @@ impl DictionaryArray { Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()), )) - .child_data(vec![values.data().clone()]); + .child_data(vec![values.to_data()]); // SAFETY: // Offsets were valid before and verified length is greater than or equal @@ -789,7 +789,7 @@ mod tests { let dict_array = Int16DictionaryArray::from(dict_data); let values = dict_array.values(); - assert_eq!(&value_data, values.data()); + assert_eq!(value_data, values.to_data()); assert_eq!(DataType::Int8, dict_array.value_type()); assert_eq!(3, dict_array.len()); @@ -809,7 +809,7 @@ mod tests { let dict_array = Int16DictionaryArray::from(dict_data); let values = dict_array.values(); - assert_eq!(&value_data, values.data()); + assert_eq!(value_data, values.to_data()); assert_eq!(DataType::Int8, dict_array.value_type()); assert_eq!(2, dict_array.len()); assert_eq!(dict_array.keys(), &Int16Array::from(vec![3_i16, 4])); @@ -911,7 +911,7 @@ mod tests { let test = vec![None, None, None]; let array: DictionaryArray = test.into_iter().collect(); array - .data() + .into_data() .validate_full() .expect("All null array has valid array data"); } @@ -987,7 +987,7 @@ mod tests { assert_eq!(array.keys().data_type(), &DataType::Int32); assert_eq!(array.values().data_type(), &DataType::Utf8); - assert_eq!(array.data().null_count(), 1); + assert_eq!(array.null_count(), 1); assert!(array.keys().is_valid(0)); assert!(array.keys().is_valid(1)); @@ -1076,7 +1076,7 @@ mod tests { let boxed: ArrayRef = Arc::new(dict_array); let col: DictionaryArray = - DictionaryArray::::from(boxed.data().clone()); + DictionaryArray::::from(boxed.to_data()); let err = col.into_primitive_dict_builder::(); let returned = err.unwrap_err(); diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index fa303b4a8dbc..f8d2f04dee69 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -792,7 +792,7 @@ mod tests { FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.into_iter(), 0) .unwrap(); array - .data() + .into_data() .validate_full() .expect("All null array has valid array data"); } diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 4a592d869437..a56bb017f6b0 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -256,6 +256,8 @@ impl std::fmt::Debug for FixedSizeListArray { #[cfg(test)] mod tests { use super::*; + use crate::cast::AsArray; + use crate::types::Int32Type; use crate::Int32Array; use arrow_buffer::{bit_util, Buffer}; use arrow_schema::Field; @@ -281,7 +283,7 @@ mod tests { .unwrap(); let list_array = FixedSizeListArray::from(list_data); - assert_eq!(&value_data, list_array.values().data()); + assert_eq!(value_data, list_array.values().to_data()); assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(3, list_array.len()); assert_eq!(0, list_array.null_count()); @@ -310,19 +312,11 @@ mod tests { .unwrap(); let list_array = FixedSizeListArray::from(list_data); - assert_eq!(&value_data, list_array.values().data()); + assert_eq!(value_data, 
list_array.values().to_data()); assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(3, list_array.len()); assert_eq!(0, list_array.null_count()); - assert_eq!( - 3, - list_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); + assert_eq!(3, list_array.value(0).as_primitive::().value(0)); assert_eq!(6, list_array.value_offset(1)); assert_eq!(3, list_array.value_length()); } @@ -386,7 +380,7 @@ mod tests { .unwrap(); let list_array = FixedSizeListArray::from(list_data); - assert_eq!(&value_data, list_array.values().data()); + assert_eq!(value_data, list_array.values().to_data()); assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(5, list_array.len()); assert_eq!(2, list_array.null_count()); diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 8b314596d959..fb94fe12c87c 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -442,7 +442,7 @@ mod tests { let list_array = ListArray::from(list_data); let values = list_array.values(); - assert_eq!(&value_data, values.data()); + assert_eq!(value_data, values.to_data()); assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(3, list_array.len()); assert_eq!(0, list_array.null_count()); @@ -482,7 +482,7 @@ mod tests { let list_array = ListArray::from(list_data); let values = list_array.values(); - assert_eq!(&value_data, values.data()); + assert_eq!(value_data, values.to_data()); assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(2, list_array.len()); assert_eq!(0, list_array.null_count()); @@ -532,7 +532,7 @@ mod tests { let list_array = LargeListArray::from(list_data); let values = list_array.values(); - assert_eq!(&value_data, values.data()); + assert_eq!(value_data, values.to_data()); assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(3, list_array.len()); assert_eq!(0, list_array.null_count()); @@ -572,7 +572,7 @@ mod tests { let list_array = LargeListArray::from(list_data); let values = list_array.values(); - assert_eq!(&value_data, values.data()); + assert_eq!(value_data, values.to_data()); assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(2, list_array.len()); assert_eq!(0, list_array.null_count()); @@ -630,7 +630,7 @@ mod tests { let list_array = ListArray::from(list_data); let values = list_array.values(); - assert_eq!(&value_data, values.data()); + assert_eq!(value_data, values.to_data()); assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(9, list_array.len()); assert_eq!(4, list_array.null_count()); @@ -694,7 +694,7 @@ mod tests { let list_array = LargeListArray::from(list_data); let values = list_array.values(); - assert_eq!(&value_data, values.data()); + assert_eq!(value_data, values.to_data()); assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(9, list_array.len()); assert_eq!(4, list_array.null_count()); diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 3d78387cdf50..22ebbe533a2f 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -52,6 +52,11 @@ impl MapArray { &self.values } + /// Returns a reference to the [`StructArray`] entries of this map + pub fn entries(&self) -> &ArrayRef { + &self.entries + } + /// Returns the data type of the map's keys. 
pub fn key_type(&self) -> &DataType { self.keys.data_type() @@ -189,7 +194,7 @@ impl MapArray { let entry_struct = StructArray::from(vec![ (keys_field, Arc::new(keys_data) as ArrayRef), - (values_field, make_array(values.data().clone())), + (values_field, make_array(values.to_data())), ]); let map_data_type = DataType::Map( @@ -369,7 +374,7 @@ mod tests { .unwrap(); let map_array = MapArray::from(map_data); - assert_eq!(&value_data, map_array.values().data()); + assert_eq!(value_data, map_array.values().to_data()); assert_eq!(&DataType::UInt32, map_array.value_type()); assert_eq!(3, map_array.len()); assert_eq!(0, map_array.null_count()); @@ -400,16 +405,9 @@ mod tests { } // Now test with a non-zero offset - let map_data = ArrayData::builder(map_array.data_type().clone()) - .len(2) - .offset(1) - .add_buffer(map_array.data().buffers()[0].clone()) - .add_child_data(map_array.data().child_data()[0].clone()) - .build() - .unwrap(); - let map_array = MapArray::from(map_data); + let map_array = map_array.slice(1, 2); - assert_eq!(&value_data, map_array.values().data()); + assert_eq!(value_data, map_array.values().to_data()); assert_eq!(&DataType::UInt32, map_array.value_type()); assert_eq!(2, map_array.len()); assert_eq!(0, map_array.null_count()); @@ -446,7 +444,7 @@ mod tests { let sliced_array = map_array.slice(1, 2); assert_eq!(2, sliced_array.len()); assert_eq!(1, sliced_array.offset()); - let sliced_array_data = sliced_array.data(); + let sliced_array_data = sliced_array.to_data(); for array_data in sliced_array_data.child_data() { assert_eq!(array_data.offset(), 1); } diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 589cf1eaf4aa..41d5c8bebe29 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -290,7 +290,7 @@ impl Array for ArrayRef { } fn into_data(self) -> ArrayData { - self.data().clone() + self.to_data() } #[allow(deprecated)] @@ -357,7 +357,7 @@ impl<'a, T: Array> Array for &'a T { } fn into_data(self) -> ArrayData { - self.data().clone() + self.to_data() } #[allow(deprecated)] diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index f857e26c7f89..5dfcb4da4d16 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -466,9 +466,7 @@ impl PrimitiveArray { O: ArrowPrimitiveType, F: Fn(T::Native) -> O::Native, { - let data = self.data(); - - let nulls = data.nulls().cloned(); + let nulls = self.nulls().cloned(); let values = self.values().iter().map(|v| op(*v)); // JUSTIFICATION // Benefit @@ -593,9 +591,8 @@ impl PrimitiveArray { O: ArrowPrimitiveType, F: Fn(T::Native) -> Option, { - let data = self.data(); - let len = data.len(); - let (nulls, null_count, offset) = match data.nulls() { + let len = self.len(); + let (nulls, null_count, offset) = match self.nulls() { Some(n) => (Some(n.validity()), n.null_count(), n.offset()), None => (None, 0, 0), }; @@ -1185,7 +1182,7 @@ impl PrimitiveArray { pub fn precision(&self) -> u8 { match T::BYTE_LENGTH { 16 => { - if let DataType::Decimal128(p, _) = self.data().data_type() { + if let DataType::Decimal128(p, _) = self.data_type() { *p } else { unreachable!( @@ -1195,7 +1192,7 @@ impl PrimitiveArray { } } 32 => { - if let DataType::Decimal256(p, _) = self.data().data_type() { + if let DataType::Decimal256(p, _) = self.data_type() { *p } else { unreachable!( @@ -1212,7 +1209,7 @@ impl PrimitiveArray { pub fn scale(&self) -> i8 { match T::BYTE_LENGTH { 16 => { - if let DataType::Decimal128(_, s) 
= self.data().data_type() { + if let DataType::Decimal128(_, s) = self.data_type() { *s } else { unreachable!( @@ -1222,7 +1219,7 @@ impl PrimitiveArray { } } 32 => { - if let DataType::Decimal256(_, s) = self.data().data_type() { + if let DataType::Decimal256(_, s) = self.data_type() { *s } else { unreachable!( @@ -1874,7 +1871,7 @@ mod tests { let array = PrimitiveArray::::from(values.clone()); assert_eq!(array.values(), &values); - let array = PrimitiveArray::::from(array.data().clone()); + let array = PrimitiveArray::::from(array.to_data()); assert_eq!(array.values(), &values); } @@ -1894,7 +1891,7 @@ mod tests { let array = PrimitiveArray::::from(values.clone()); assert_eq!(array.values(), &values); - let array = PrimitiveArray::::from(array.data().clone()); + let array = PrimitiveArray::::from(array.to_data()); assert_eq!(array.values(), &values); } @@ -2190,7 +2187,7 @@ mod tests { let boxed: ArrayRef = Arc::new(array); - let col: Int32Array = PrimitiveArray::::from(boxed.data().clone()); + let col: Int32Array = PrimitiveArray::::from(boxed.to_data()); let err = col.into_builder(); match err { diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index ada34b47f8a5..0754913e9d3e 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -104,8 +104,8 @@ impl RunArray { let len = RunArray::logical_len(run_ends); let builder = ArrayDataBuilder::new(ree_array_type) .len(len) - .add_child_data(run_ends.data().clone()) - .add_child_data(values.data().clone()); + .add_child_data(run_ends.to_data()) + .add_child_data(values.to_data()); // `build_unchecked` is used to avoid recursive validation of child arrays. let array_data = unsafe { builder.build_unchecked() }; @@ -665,7 +665,7 @@ mod tests { assert_eq!(ree_array.null_count(), 0); let values = ree_array.values(); - assert_eq!(&value_data.into_data(), values.data()); + assert_eq!(value_data.into_data(), values.to_data()); assert_eq!(&DataType::Int8, values.data_type()); let run_ends = ree_array.run_ends(); diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 304f0ab3eee9..e042f29c22d1 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -456,7 +456,7 @@ mod tests { let data: Vec> = vec![None]; let array = StringArray::from(data); array - .data() + .into_data() .validate_full() .expect("All null array has valid array data"); } @@ -466,7 +466,7 @@ mod tests { let data: Vec> = vec![None]; let array = LargeStringArray::from(data); array - .data() + .into_data() .validate_full() .expect("All null array has valid array data"); } diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index e31594d4b073..27e10a31fd00 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -464,7 +464,7 @@ mod tests { StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) .unwrap(); - let struct_data = arr.data(); + let struct_data = arr.into_data(); assert_eq!(4, struct_data.len()); assert_eq!(1, struct_data.null_count()); assert_eq!( @@ -488,8 +488,8 @@ mod tests { .build() .unwrap(); - assert_eq!(expected_string_data, *arr.column(0).data()); - assert_eq!(expected_int_data, *arr.column(1).data()); + assert_eq!(expected_string_data, struct_data.child_data()[0]); + assert_eq!(expected_int_data, struct_data.child_data()[1]); } #[test] @@ -579,8 +579,8 @@ mod tests { assert!(struct_array.is_valid(2)); 
assert!(struct_array.is_null(3)); assert!(struct_array.is_valid(4)); - assert_eq!(&boolean_data, struct_array.column(0).data()); - assert_eq!(&int_data, struct_array.column(1).data()); + assert_eq!(boolean_data, struct_array.column(0).to_data()); + assert_eq!(int_data, struct_array.column(1).to_data()); let c0 = struct_array.column(0); let c0 = c0.as_any().downcast_ref::().unwrap(); diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 67848b4a85cb..7b818f3130b7 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -219,7 +219,7 @@ impl UnionArray { let new_self = unsafe { Self::new_unchecked(field_type_ids, type_ids, value_offsets, child_arrays) }; - new_self.data().validate()?; + new_self.to_data().validate()?; Ok(new_self) } diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 5f726a5b121c..b6d0707982be 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -17,7 +17,7 @@ use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BufferBuilder}; -use crate::{ArrayRef, GenericListArray, OffsetSizeTrait}; +use crate::{Array, ArrayRef, GenericListArray, OffsetSizeTrait}; use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::Field; @@ -228,7 +228,7 @@ where pub fn finish(&mut self) -> GenericListArray { let len = self.len(); let values_arr = self.values_builder.finish(); - let values_data = values_arr.data(); + let values_data = values_arr.to_data(); let offset_buffer = self.offsets_builder.finish(); let null_bit_buffer = self.null_buffer_builder.finish(); @@ -242,7 +242,7 @@ where let array_data_builder = ArrayData::builder(data_type) .len(len) .add_buffer(offset_buffer) - .add_child_data(values_data.clone()) + .add_child_data(values_data) .null_bit_buffer(null_bit_buffer); let array_data = unsafe { array_data_builder.build_unchecked() }; @@ -254,7 +254,7 @@ where pub fn finish_cloned(&self) -> GenericListArray { let len = self.len(); let values_arr = self.values_builder.finish_cloned(); - let values_data = values_arr.data(); + let values_data = values_arr.to_data(); let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); let null_bit_buffer = self @@ -270,7 +270,7 @@ where let array_data_builder = ArrayData::builder(data_type) .len(len) .add_buffer(offset_buffer) - .add_child_data(values_data.clone()) + .add_child_data(values_data) .null_bit_buffer(null_bit_buffer); let array_data = unsafe { array_data_builder.build_unchecked() }; @@ -311,7 +311,6 @@ mod tests { use crate::cast::AsArray; use crate::types::Int32Type; use crate::{Array, Int32Array}; - use arrow_buffer::Buffer; use arrow_schema::DataType; fn _test_generic_list_array_builder() { @@ -332,12 +331,9 @@ mod tests { builder.append(true); let list_array = builder.finish(); - let values = list_array.values().data().buffers()[0].clone(); - assert_eq!(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7]), values); - assert_eq!( - Buffer::from_slice_ref([0, 3, 6, 8].map(|n| O::from_usize(n).unwrap())), - list_array.data().buffers()[0].clone() - ); + let list_values = list_array.values().as_primitive::(); + assert_eq!(list_values.values(), &[0, 1, 2, 3, 4, 5, 6, 7]); + assert_eq!(list_array.value_offsets(), [0, 3, 6, 8].map(O::usize_as)); assert_eq!(DataType::Int32, list_array.value_type()); assert_eq!(3, list_array.len()); assert_eq!(0, list_array.null_count()); 
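The rewritten assertions above lean on the typed accessors from arrow_array::cast::AsArray instead of poking at raw ArrayData buffers. As a minimal sketch of that idiom, not taken from the patch itself and assuming only the public arrow-array builder and cast APIs of this release line:

    use arrow_array::builder::{Int32Builder, ListBuilder};
    use arrow_array::cast::AsArray;
    use arrow_array::types::Int32Type;

    fn main() {
        // Build the list array [[1, 2], [3]] with the standard builder.
        let mut builder = ListBuilder::new(Int32Builder::new());
        builder.values().append_slice(&[1, 2]);
        builder.append(true);
        builder.values().append_slice(&[3]);
        builder.append(true);
        let list = builder.finish();

        // Inspect offsets and child values through typed views rather than
        // through list.data().buffers()[0] / child_data().
        assert_eq!(list.value_offsets(), &[0, 2, 3]);
        let child = list.values().as_primitive::<Int32Type>();
        assert_eq!(child.values(), &[1, 2, 3]);
    }
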
@@ -469,28 +465,22 @@ mod tests { builder.values().append(true); builder.append(true); - let list_array = builder.finish(); + let l1 = builder.finish(); - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!( - Buffer::from_slice_ref([0, 2, 5, 5, 6]), - list_array.data().buffers()[0].clone() - ); - - assert_eq!(6, list_array.values().data().len()); - assert_eq!(1, list_array.values().data().null_count()); - assert_eq!( - Buffer::from_slice_ref([0, 2, 4, 7, 7, 8, 10]), - list_array.values().data().buffers()[0].clone() - ); - - assert_eq!(10, list_array.values().data().child_data()[0].len()); - assert_eq!(0, list_array.values().data().child_data()[0].null_count()); - assert_eq!( - Buffer::from_slice_ref([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - list_array.values().data().child_data()[0].buffers()[0].clone() - ); + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5, 6]); + let l2 = l1.values().as_list::(); + + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8, 10]); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); } #[test] diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 499ae183f3e9..ebffeafcf75f 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -233,7 +233,7 @@ impl StructBuilder { let mut child_data = Vec::with_capacity(self.field_builders.len()); for f in &mut self.field_builders { let arr = f.finish(); - child_data.push(arr.data().clone()); + child_data.push(arr.to_data()); } let length = self.len(); let null_bit_buffer = self.null_buffer_builder.finish(); @@ -254,7 +254,7 @@ impl StructBuilder { let mut child_data = Vec::with_capacity(self.field_builders.len()); for f in &self.field_builders { let arr = f.finish_cloned(); - child_data.push(arr.data().clone()); + child_data.push(arr.to_data()); } let length = self.len(); let null_bit_buffer = self @@ -330,9 +330,8 @@ mod tests { builder.append_null(); builder.append(true); - let arr = builder.finish(); + let struct_data = builder.finish().into_data(); - let struct_data = arr.data(); assert_eq!(4, struct_data.len()); assert_eq!(1, struct_data.null_count()); assert_eq!(&[11_u8], struct_data.nulls().unwrap().validity()); @@ -352,8 +351,8 @@ mod tests { .build() .unwrap(); - assert_eq!(expected_string_data, *arr.column(0).data()); - assert_eq!(expected_int_data, *arr.column(1).data()); + assert_eq!(expected_string_data, struct_data.child_data()[0]); + assert_eq!(expected_int_data, struct_data.child_data()[1]); } #[test] diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 5d7bea0e9d0f..0ea6332a7ea5 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -634,7 +634,7 @@ pub fn cast_with_options( let from_type = array.data_type(); // clone array if types are the same if from_type == to_type { - return Ok(make_array(array.data().clone())); + return Ok(make_array(array.to_data())); } match (from_type, to_type) { ( @@ -3108,7 +3108,7 @@ fn dictionary_cast( })?; let keys_array: ArrayRef = - Arc::new(PrimitiveArray::::from(dict_array.keys().data().clone())); + Arc::new(PrimitiveArray::::from(dict_array.keys().to_data())); let values_array = dict_array.values(); let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?; let cast_values 
= @@ -3182,7 +3182,7 @@ where // Note take requires first casting the indices to u32 let keys_array: ArrayRef = - Arc::new(PrimitiveArray::::from(dict_array.keys().data().clone())); + Arc::new(PrimitiveArray::::from(dict_array.keys().to_data())); let indices = cast_with_options(&keys_array, &DataType::UInt32, cast_options)?; let u32_indices = indices @@ -3379,7 +3379,7 @@ fn cast_list_inner( to_type: &DataType, cast_options: &CastOptions, ) -> Result { - let data = array.data().clone(); + let data = array.to_data(); let underlying_array = make_array(data.child_data()[0].clone()); let cast_array = cast_with_options(underlying_array.as_ref(), to.data_type(), cast_options)?; diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 60633487aeaf..3d803b62728c 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -1608,7 +1608,7 @@ mod tests { let union1 = rb.column(0); let union2 = rb2.column(0); - assert_eq!(union1.data().buffers(), union2.data().buffers()); + assert_eq!(union1, union2); } #[test] @@ -1762,14 +1762,14 @@ mod tests { let values = StringArray::from(vec![Some("a"), None, Some("c"), None]); let keys = Int8Array::from_iter_values([0, 0, 1, 2, 0, 1, 3]); let dict_array = DictionaryArray::::try_new(&keys, &values).unwrap(); - let dict_data = dict_array.data(); + let dict_data = dict_array.to_data(); let value_offsets = Buffer::from_slice_ref(offsets); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) - .add_child_data(dict_data.clone()) + .add_child_data(dict_data) .build() .unwrap(); let list_array = GenericListArray::::from(list_data); @@ -1825,7 +1825,7 @@ mod tests { let values = StringArray::from(vec![Some("a"), None, Some("c"), None]); let keys = Int8Array::from_iter_values([0, 0, 1, 2, 0, 1, 3, 1, 2]); let dict_array = DictionaryArray::::try_new(&keys, &values).unwrap(); - let dict_data = dict_array.data(); + let dict_data = dict_array.to_data(); let list_data_type = DataType::FixedSizeList( Arc::new(Field::new_dict( @@ -1839,7 +1839,7 @@ mod tests { ); let list_data = ArrayData::builder(list_data_type) .len(3) - .add_child_data(dict_data.clone()) + .add_child_data(dict_data) .build() .unwrap(); let list_array = FixedSizeListArray::from(list_data); diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index f5bf884fb2ca..39f829052f59 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -2257,16 +2257,9 @@ mod tests { assert_eq!(10, bb.len()); assert_eq!(4.0, bb.value(9)); - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::() - .unwrap(); + let cc = batch.column(c.0).as_list::(); // test that the list offsets are correct - assert_eq!( - *cc.data().buffers()[0], - Buffer::from_slice_ref([0i32, 2, 2, 4, 5]) - ); + assert_eq!(cc.value_offsets(), &[0, 2, 2, 4, 5]); let cc = cc.values().as_boolean(); let cc_expected = BooleanArray::from(vec![ Some(false), @@ -2275,18 +2268,11 @@ mod tests { None, Some(false), ]); - assert_eq!(cc.data_ref(), cc_expected.data_ref()); + assert_eq!(cc, &cc_expected); - let dd: &ListArray = batch - .column(d.0) - .as_any() - .downcast_ref::() - .unwrap(); + let dd = batch.column(d.0).as_list::(); // test that the list offsets are correct - assert_eq!( - *dd.data().buffers()[0], - Buffer::from_slice_ref([0i32, 1, 1, 2, 6]) - ); + assert_eq!(dd.value_offsets(), &[0, 1, 1, 2, 6]); let dd = dd.values().as_string::(); // values are 6 because a `d: null` is treated as a null slot @@ -2342,12 +2328,7 @@ mod tests { // compare `a` with result from json reader 
let batch = reader.next().unwrap().unwrap(); let read = batch.column(0); - assert!( - expected.data_ref() == read.data_ref(), - "{:?} != {:?}", - expected.data(), - read.data(), - ); + assert_eq!(&expected, read); } #[test] @@ -2425,12 +2406,9 @@ mod tests { let read = batch.column(0); assert_eq!(read.len(), 6); // compare the arrays the long way around, to better detect differences - let read: &ListArray = read.as_any().downcast_ref::().unwrap(); - let expected = expected.as_any().downcast_ref::().unwrap(); - assert_eq!( - *read.data().buffers()[0], - Buffer::from_slice_ref([0i32, 2, 3, 6, 6, 6, 7]) - ); + let read: &ListArray = read.as_list::(); + let expected = expected.as_list::(); + assert_eq!(read.value_offsets(), &[0, 2, 3, 6, 6, 6, 7]); // compare list null buffers assert_eq!(read.nulls(), expected.nulls()); // build struct from list @@ -2525,10 +2503,10 @@ mod tests { assert_eq!(batch.num_rows(), 3); assert_eq!(batch.num_columns(), 2); let col1 = batch.column(0); - assert_eq!(col1.data(), expected_accounts.data()); + assert_eq!(col1.as_ref(), &expected_accounts); // Compare the map let col2 = batch.column(1); - assert_eq!(col2.data(), expected_stocks.data()); + assert_eq!(col2.as_ref(), &expected_stocks); } #[test] diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index d66d32017c26..cf65e8a9356b 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -290,9 +290,9 @@ fn set_column_for_json_rows( | DataType::Duration(_) => { let options = FormatOptions::default(); let formatter = ArrayFormatter::try_new(array.as_ref(), &options)?; - let data = array.data(); + let nulls = array.nulls(); rows.iter_mut().enumerate().for_each(|(idx, row)| { - if data.is_valid(idx) { + if nulls.map(|x| x.is_valid(idx)).unwrap_or(true) { row.insert( col_name.to_string(), formatter.value(idx).to_string().into(), diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index e68e064c775d..2927354da291 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -1192,7 +1192,7 @@ where { // TODO: Use take_boolean (#2967) let array = take(&dict_comparison, dict.keys(), None)?; - Ok(BooleanArray::from(array.data().clone())) + Ok(BooleanArray::from(array.to_data())) } /// Helper function to perform boolean lambda function on values from two arrays using @@ -3382,10 +3382,7 @@ mod tests { let array_b: PrimitiveArray = vec![2; item_count].into(); let result_mask = gt_eq(&array_a, &array_b).unwrap(); - assert_eq!( - result_mask.data().buffers()[0].len(), - select_mask.data().buffers()[0].len() - ); + assert_eq!(result_mask.values().len(), select_mask.values().len()); } // Expected behaviour: diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index db1fff6d3e2f..bfe74d9e3e7a 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -17,6 +17,7 @@ //! Contains functions and function factories to compare arrays. 
+use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::ArrowNativeType; @@ -33,21 +34,21 @@ fn compare_primitives( where T::Native: ArrowNativeTypeOp, { - let left: PrimitiveArray = PrimitiveArray::from(left.data().clone()); - let right: PrimitiveArray = PrimitiveArray::from(right.data().clone()); + let left: PrimitiveArray = PrimitiveArray::from(left.to_data()); + let right: PrimitiveArray = PrimitiveArray::from(right.to_data()); Box::new(move |i, j| left.value(i).compare(right.value(j))) } fn compare_boolean(left: &dyn Array, right: &dyn Array) -> DynComparator { - let left: BooleanArray = BooleanArray::from(left.data().clone()); - let right: BooleanArray = BooleanArray::from(right.data().clone()); + let left: BooleanArray = BooleanArray::from(left.to_data()); + let right: BooleanArray = BooleanArray::from(right.to_data()); Box::new(move |i, j| left.value(i).cmp(&right.value(j))) } fn compare_string(left: &dyn Array, right: &dyn Array) -> DynComparator { - let left: StringArray = StringArray::from(left.data().clone()); - let right: StringArray = StringArray::from(right.data().clone()); + let left: StringArray = StringArray::from(left.to_data()); + let right: StringArray = StringArray::from(right.to_data()); Box::new(move |i, j| left.value(i).cmp(right.value(j))) } @@ -58,15 +59,13 @@ where V: ArrowPrimitiveType, V::Native: ArrowNativeTypeOp, { - let left = left.as_any().downcast_ref::>().unwrap(); - let right = right.as_any().downcast_ref::>().unwrap(); + let left = left.as_dictionary::(); + let right = right.as_dictionary::(); - let left_keys: PrimitiveArray = PrimitiveArray::from(left.keys().data().clone()); - let right_keys: PrimitiveArray = PrimitiveArray::from(right.keys().data().clone()); - let left_values: PrimitiveArray = - PrimitiveArray::from(left.values().data().clone()); - let right_values: PrimitiveArray = - PrimitiveArray::from(right.values().data().clone()); + let left_keys: PrimitiveArray = PrimitiveArray::from(left.keys().to_data()); + let right_keys: PrimitiveArray = PrimitiveArray::from(right.keys().to_data()); + let left_values: PrimitiveArray = left.values().to_data().into(); + let right_values: PrimitiveArray = right.values().to_data().into(); Box::new(move |i: usize, j: usize| { let key_left = left_keys.value(i).as_usize(); @@ -81,13 +80,13 @@ fn compare_dict_string(left: &dyn Array, right: &dyn Array) -> DynComparator where T: ArrowDictionaryKeyType, { - let left = left.as_any().downcast_ref::>().unwrap(); - let right = right.as_any().downcast_ref::>().unwrap(); + let left = left.as_dictionary::(); + let right = right.as_dictionary::(); - let left_keys: PrimitiveArray = PrimitiveArray::from(left.keys().data().clone()); - let right_keys: PrimitiveArray = PrimitiveArray::from(right.keys().data().clone()); - let left_values = StringArray::from(left.values().data().clone()); - let right_values = StringArray::from(right.values().data().clone()); + let left_keys: PrimitiveArray = PrimitiveArray::from(left.keys().to_data()); + let right_keys: PrimitiveArray = PrimitiveArray::from(right.keys().to_data()); + let left_values = StringArray::from(left.values().to_data()); + let right_values = StringArray::from(right.values().to_data()); Box::new(move |i: usize, j: usize| { let key_left = left_keys.value(i).as_usize(); @@ -264,10 +263,8 @@ pub fn build_compare( } } (FixedSizeBinary(_), FixedSizeBinary(_)) => { - let left: FixedSizeBinaryArray = - FixedSizeBinaryArray::from(left.data().clone()); - let right: FixedSizeBinaryArray = - 
FixedSizeBinaryArray::from(right.data().clone()); + let left: FixedSizeBinaryArray = left.to_data().into(); + let right: FixedSizeBinaryArray = right.to_data().into(); Box::new(move |i, j| left.value(i).cmp(right.value(j))) } diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 56b3ec2b36b0..9cc7b4f301cb 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1651,9 +1651,9 @@ mod tests { DataType::Dictionary(_, v) if !exact => { assert_eq!(a.data_type(), v.as_ref()); let b = arrow_cast::cast(b, v).unwrap(); - assert_eq!(a.data(), b.data()) + assert_eq!(a, b.as_ref()) } - _ => assert_eq!(a.data(), b.data()), + _ => assert_eq!(a, b), } } @@ -1767,8 +1767,7 @@ mod tests { // Test struct nullability let data = s1 - .data() - .clone() + .to_data() .into_builder() .null_bit_buffer(Some(Buffer::from_slice_ref([0b00001010]))) .null_count(2) @@ -1786,7 +1785,7 @@ mod tests { assert_eq!(back.len(), 1); assert_eq!(&back[0], &s2); - back[0].data().validate_full().unwrap(); + back[0].to_data().validate_full().unwrap(); } #[test] @@ -1910,7 +1909,7 @@ mod tests { let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); - back[0].data().validate_full().unwrap(); + back[0].to_data().validate_full().unwrap(); assert_eq!(&back[0], &list); let options = SortOptions { @@ -1930,7 +1929,7 @@ mod tests { let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); - back[0].data().validate_full().unwrap(); + back[0].to_data().validate_full().unwrap(); assert_eq!(&back[0], &list); let options = SortOptions { @@ -1950,7 +1949,7 @@ mod tests { let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); - back[0].data().validate_full().unwrap(); + back[0].to_data().validate_full().unwrap(); assert_eq!(&back[0], &list); let options = SortOptions { @@ -1970,7 +1969,7 @@ mod tests { let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); - back[0].data().validate_full().unwrap(); + back[0].to_data().validate_full().unwrap(); assert_eq!(&back[0], &list); } @@ -2033,7 +2032,7 @@ mod tests { let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); - back[0].data().validate_full().unwrap(); + back[0].to_data().validate_full().unwrap(); assert_eq!(&back[0], &list); let options = SortOptions { @@ -2052,7 +2051,7 @@ mod tests { let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); - back[0].data().validate_full().unwrap(); + back[0].to_data().validate_full().unwrap(); assert_eq!(&back[0], &list); let options = SortOptions { @@ -2071,7 +2070,7 @@ mod tests { let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); - back[0].data().validate_full().unwrap(); + back[0].to_data().validate_full().unwrap(); assert_eq!(&back[0], &list); } @@ -2171,7 +2170,7 @@ mod tests { .into_data() .into_builder() .data_type(data_type) - .add_child_data(values.data().clone()) + .add_child_data(values.to_data()) .build() .unwrap(); diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index e232e717c9e8..e4ff878dd135 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -164,7 +164,7 @@ pub unsafe fn decode( let child = converter.convert_raw(&mut child_rows, validate_utf8)?; assert_eq!(child.len(), 1); - let child_data = child[0].data().clone(); + let child_data = child[0].to_data(); let builder = ArrayDataBuilder::new(field.data_type.clone()) .len(rows.len()) diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 1cab72b6d9f2..ba8fc4a2cc1a 100644 --- 
a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -23,7 +23,7 @@ use arrow_array::builder::BooleanBufferBuilder; use arrow_array::cast::AsArray; use arrow_array::types::{ArrowDictionaryKeyType, ByteArrayType}; use arrow_array::*; -use arrow_buffer::bit_util; +use arrow_buffer::{bit_util, BooleanBuffer, NullBuffer}; use arrow_buffer::{Buffer, MutableBuffer}; use arrow_data::bit_iterator::{BitIndexIterator, BitSliceIterator}; use arrow_data::transform::MutableArrayData; @@ -317,7 +317,7 @@ fn filter_array( match predicate.strategy { IterationStrategy::None => Ok(new_empty_array(values.data_type())), - IterationStrategy::All => Ok(make_array(values.data().slice(0, predicate.count))), + IterationStrategy::All => Ok(values.slice(0, predicate.count)), // actually filter _ => downcast_primitive_array! { values => Ok(Arc::new(filter_primitive(values, predicate))), @@ -386,15 +386,15 @@ fn filter_array( /// in the filtered output, and `null_buffer` is the filtered null buffer /// fn filter_null_mask( - data: &ArrayData, + nulls: Option<&NullBuffer>, predicate: &FilterPredicate, ) -> Option<(usize, Buffer)> { - if data.null_count() == 0 { + let nulls = nulls?; + if nulls.null_count() == 0 { return None; } - let nulls = data.nulls()?; - let nulls = filter_bits(nulls.buffer(), nulls.offset(), predicate); + let nulls = filter_bits(nulls.inner(), predicate); // The filtered `nulls` has a length of `predicate.count` bits and // therefore the null count is this minus the number of valid bits let null_count = predicate.count - nulls.count_set_bits_offset(0, predicate.count); @@ -407,8 +407,9 @@ fn filter_null_mask( } /// Filter the packed bitmask `buffer`, with `predicate` starting at bit offset `offset` -fn filter_bits(buffer: &Buffer, offset: usize, predicate: &FilterPredicate) -> Buffer { - let src = buffer.as_slice(); +fn filter_bits(buffer: &BooleanBuffer, predicate: &FilterPredicate) -> Buffer { + let src = buffer.values(); + let offset = buffer.offset(); match &predicate.strategy { IterationStrategy::IndexIterator => { @@ -447,18 +448,14 @@ fn filter_bits(buffer: &Buffer, offset: usize, predicate: &FilterPredicate) -> B } /// `filter` implementation for boolean buffers -fn filter_boolean(values: &BooleanArray, predicate: &FilterPredicate) -> BooleanArray { - let data = values.data(); - assert_eq!(data.buffers().len(), 1); - assert_eq!(data.child_data().len(), 0); - - let values = filter_bits(data.buffers()[0], data.offset(), predicate); +fn filter_boolean(array: &BooleanArray, predicate: &FilterPredicate) -> BooleanArray { + let values = filter_bits(array.values(), predicate); let mut builder = ArrayDataBuilder::new(DataType::Boolean) .len(predicate.count) .add_buffer(values); - if let Some((null_count, nulls)) = filter_null_mask(data, predicate) { + if let Some((null_count, nulls)) = filter_null_mask(array.nulls(), predicate) { builder = builder.null_count(null_count).null_bit_buffer(Some(nulls)); } @@ -468,17 +465,13 @@ fn filter_boolean(values: &BooleanArray, predicate: &FilterPredicate) -> Boolean /// `filter` implementation for primitive arrays fn filter_primitive( - values: &PrimitiveArray, + array: &PrimitiveArray, predicate: &FilterPredicate, ) -> PrimitiveArray where T: ArrowPrimitiveType, { - let data = values.data(); - assert_eq!(data.buffers().len(), 1); - assert_eq!(data.child_data().len(), 0); - - let values = data.buffer::(0); + let values = array.values(); assert!(values.len() >= predicate.filter.len()); let buffer = match &predicate.strategy { @@ -514,11 +507,11 @@ 
where IterationStrategy::All | IterationStrategy::None => unreachable!(), }; - let mut builder = ArrayDataBuilder::new(data.data_type().clone()) + let mut builder = ArrayDataBuilder::new(array.data_type().clone()) .len(predicate.count) .add_buffer(buffer.into()); - if let Some((null_count, nulls)) = filter_null_mask(data, predicate) { + if let Some((null_count, nulls)) = filter_null_mask(array.nulls(), predicate) { builder = builder.null_count(null_count).null_bit_buffer(Some(nulls)); } @@ -554,7 +547,7 @@ where Self { src_offsets: array.value_offsets(), - src_values: array.data().buffers()[1], + src_values: array.value_data(), dst_offsets, dst_values, cur_offset, @@ -617,9 +610,6 @@ fn filter_bytes( where T: ByteArrayType, { - let data = array.data(); - assert_eq!(data.buffers().len(), 2); - assert_eq!(data.child_data().len(), 0); let mut filter = FilterBytes::new(predicate.count, array); match &predicate.strategy { @@ -639,7 +629,7 @@ where .add_buffer(filter.dst_offsets.into()) .add_buffer(filter.dst_values.into()); - if let Some((null_count, nulls)) = filter_null_mask(data, predicate) { + if let Some((null_count, nulls)) = filter_null_mask(array.nulls(), predicate) { builder = builder.null_count(null_count).null_bit_buffer(Some(nulls)); } diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 0895b99c7f59..6039d53eaedc 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -27,8 +27,8 @@ use arrow_schema::ArrowError; /// /// Typically used to implement NULLIF. pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { - let left_data = left.data(); - let right_data = right.data(); + let left_data = left.to_data(); + let right_data = right.to_data(); if left_data.len() != right_data.len() { return Err(ArrowError::ComputeError( @@ -40,7 +40,7 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result Result>() .unwrap(); - let result_values: StringArray = result.values().data().clone().into(); + let result_values: StringArray = result.values().to_data().into(); // dictionary values should stay the same let expected_values = StringArray::from(vec!["foo", "bar", ""]); diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 7b6c7d50cac3..383ac5fd11c6 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -152,7 +152,7 @@ pub fn $fn_name( let dict_comparison = $fn_name(left.values().as_ref(), right)?; // TODO: Use take_boolean (#2967) let array = take(&dict_comparison, left.keys(), None)?; - Ok(BooleanArray::from(array.data().clone())) + Ok(BooleanArray::from(array.to_data())) } t => Err(ArrowError::ComputeError(format!( "Should be DictionaryArray but got: {}", t diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs index 68fc66a635bc..529205e7e28f 100644 --- a/arrow/benches/array_data_validate.rs +++ b/arrow/benches/array_data_validate.rs @@ -37,8 +37,8 @@ fn create_binary_array_data(length: i32) -> ArrayData { .unwrap() } -fn validate_utf8_array(arr: &StringArray) { - arr.data().validate_values().unwrap(); +fn validate_utf8_array(arr: &ArrayData) { + arr.validate_values().unwrap(); } fn validate_benchmark(c: &mut Criterion) { @@ -48,7 +48,7 @@ fn validate_benchmark(c: &mut Criterion) { }); //Utf8 Array - let str_arr = StringArray::from(vec!["test"; 20000]); + let str_arr = StringArray::from(vec!["test"; 20000]).to_data(); c.bench_function("validate_utf8_array_data 20000", |b| { b.iter(|| validate_utf8_array(&str_arr)) }); diff --git a/arrow/src/array/ffi.rs 
b/arrow/src/array/ffi.rs index 0249a70d168f..d4c284ad2cd1 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -57,8 +57,8 @@ pub unsafe fn export_array_into_raw( out_array: *mut ffi::FFI_ArrowArray, out_schema: *mut ffi::FFI_ArrowSchema, ) -> Result<()> { - let data = src.data(); - let array = ffi::FFI_ArrowArray::new(data); + let data = src.to_data(); + let array = ffi::FFI_ArrowArray::new(&data); let schema = ffi::FFI_ArrowSchema::try_from(data.data_type())?; std::ptr::write_unaligned(out_array, array); @@ -101,22 +101,22 @@ mod tests { #[test] fn test_u32() -> Result<()> { let array = UInt32Array::from(vec![Some(2), None, Some(1), None]); - let data = array.data(); - test_round_trip(data) + let data = array.into_data(); + test_round_trip(&data) } #[test] fn test_u64() -> Result<()> { let array = UInt64Array::from(vec![Some(2), None, Some(1), None]); - let data = array.data(); - test_round_trip(data) + let data = array.into_data(); + test_round_trip(&data) } #[test] fn test_i64() -> Result<()> { let array = Int64Array::from(vec![Some(2), None, Some(1), None]); - let data = array.data(); - test_round_trip(data) + let data = array.into_data(); + test_round_trip(&data) } #[test] @@ -148,8 +148,8 @@ mod tests { Arc::new(UInt32Array::from(vec![42, 28, 19, 31])), ), ]); - let data = array.data(); - test_round_trip(data) + let data = array.into_data(); + test_round_trip(&data) } #[test] @@ -169,8 +169,8 @@ mod tests { ]); let array = DictionaryArray::try_new(&keys, &values)?; - let data = array.data(); - test_round_trip(data) + let data = array.into_data(); + test_round_trip(&data) } #[test] @@ -178,8 +178,8 @@ mod tests { let values = vec![vec![10, 10, 10], vec![20, 20, 20], vec![30, 30, 30]]; let array = FixedSizeBinaryArray::try_from_iter(values.into_iter())?; - let data = array.data(); - test_round_trip(data) + let data = array.into_data(); + test_round_trip(&data) } #[test] @@ -195,8 +195,8 @@ mod tests { let array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?; - let data = array.data(); - test_round_trip(data) + let data = array.into_data(); + test_round_trip(&data) } #[test] @@ -214,8 +214,8 @@ mod tests { .build()?; let array = FixedSizeListArray::from(list_data); - let data = array.data(); - test_round_trip(data) + let data = array.into_data(); + test_round_trip(&data) } #[test] @@ -240,8 +240,8 @@ mod tests { .build()?; let array = FixedSizeListArray::from(list_data); - let data = array.data(); - test_round_trip(data) + let data = array.into_data(); + test_round_trip(&data) } #[test] @@ -278,7 +278,7 @@ mod tests { let array = FixedSizeListArray::from(list_data); - let data = array.data(); - test_round_trip(data) + let data = array.into_data(); + test_round_trip(&data) } } diff --git a/arrow/src/compute/kernels/limit.rs b/arrow/src/compute/kernels/limit.rs index 74cbd2096bfd..097b8e949443 100644 --- a/arrow/src/compute/kernels/limit.rs +++ b/arrow/src/compute/kernels/limit.rs @@ -172,8 +172,8 @@ mod tests { assert_eq!(5, struct_array.len()); assert_eq!(1, struct_array.null_count()); - assert_eq!(&boolean_data, struct_array.column(0).data()); - assert_eq!(&int_data, struct_array.column(1).data()); + assert_eq!(boolean_data, struct_array.column(0).to_data()); + assert_eq!(int_data, struct_array.column(1).to_data()); let array: ArrayRef = Arc::new(struct_array); diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 7b26cf7f25a5..0af1b1111ca4 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -1104,7 +1104,7 @@ mod tests { )]); // 
export it - let array = ArrowArray::try_from(struct_array.data().clone())?; + let array = ArrowArray::try_from(struct_array.to_data())?; // (simulate consumer) import it let data = ArrayData::try_from(array)?; @@ -1128,7 +1128,7 @@ mod tests { let union = builder.build().unwrap(); // export it - let array = ArrowArray::try_from(union.data().clone())?; + let array = ArrowArray::try_from(union.to_data())?; // (simulate consumer) import it let data = ArrayData::try_from(array)?; diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 0b0a06875432..c1094b127bba 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -289,8 +289,8 @@ mod tests { } // Test that the list's child values are non-null let b_array = batch.column(1); - let list_array = b_array.as_any().downcast_ref::().unwrap(); - let child_array = make_array(list_array.data().child_data()[0].clone()); + let list_array = b_array.as_list::(); + let child_array = list_array.values(); assert_eq!(child_array.null_count(), 0); // There should be more values than the list, to show that it's a list assert!(child_array.len() > list_array.len()); diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index 37968ec6a055..93296c3b0e43 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -372,7 +372,7 @@ fn test_empty_offsets_list_equal() { )))) .len(0) .add_buffer(Buffer::from(&empty_offsets)) - .add_child_data(values.data().clone()) + .add_child_data(values.to_data()) .null_bit_buffer(Some(Buffer::from(&empty_offsets))) .build() .unwrap() @@ -385,7 +385,7 @@ fn test_empty_offsets_list_equal() { )))) .len(0) .add_buffer(Buffer::from(&empty_offsets)) - .add_child_data(values.data().clone()) + .add_child_data(values.to_data()) .null_bit_buffer(Some(Buffer::from(&empty_offsets))) .build() .unwrap() @@ -400,11 +400,7 @@ fn test_empty_offsets_list_equal() { )))) .len(0) .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) - .add_child_data( - Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]) - .data() - .clone(), - ) + .add_child_data(Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]).into_data()) .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) .build() .unwrap() diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 97869544ddd0..30a8bad60368 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -59,8 +59,8 @@ fn test_decimal() { fn test_decimal_offset() { let decimal_array = create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); - let decimal_array = decimal_array.slice(1, 3); // 2, null, 3 - let arrays = vec![decimal_array.data()]; + let decimal_array = decimal_array.slice(1, 3).into_data(); // 2, null, 3 + let arrays = vec![&decimal_array]; let mut a = MutableArrayData::new(arrays, true, 2); a.extend(0, 0, 2); // 2, null let result = a.freeze(); @@ -74,8 +74,8 @@ fn test_decimal_offset() { fn test_decimal_null_offset_nulls() { let decimal_array = create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); - let decimal_array = decimal_array.slice(1, 3); // 2, null, 3 - let arrays = vec![decimal_array.data()]; + let decimal_array = decimal_array.slice(1, 3).into_data(); // 2, null, 3 + let arrays = vec![&decimal_array]; let mut a = MutableArrayData::new(arrays, true, 2); a.extend(0, 0, 2); // 2, null a.extend_nulls(3); // 2, null, null, null, null @@ -90,8 +90,8 @@ fn test_decimal_null_offset_nulls() { /// tests extending from a primitive array w/ offset nor nulls 
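The tests in this file all follow the same migration: a concrete array is turned into an owned ArrayData with into_data() (or to_data() when the array is still needed afterwards), and MutableArrayData then borrows those ArrayData values. A minimal sketch of that pattern, assuming only the arrow_array and arrow_data crates from this release (the helper name is illustrative, not part of the diff):

    use arrow_array::{Array, Int32Array};
    use arrow_data::transform::MutableArrayData;

    fn concat_first_two(a: &Int32Array, b: &Int32Array) -> Int32Array {
        let (a, b) = (a.to_data(), b.to_data());
        let mut mutable = MutableArrayData::new(vec![&a, &b], false, 4);
        mutable.extend(0, 0, 2); // copy rows 0..2 from the first source
        mutable.extend(1, 0, 2); // copy rows 0..2 from the second source
        Int32Array::from(mutable.freeze())
    }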
#[test] fn test_primitive() { - let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); - let arrays = vec![b.data()]; + let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]).into_data(); + let arrays = vec![&b]; let mut a = MutableArrayData::new(arrays, false, 3); a.extend(0, 0, 2); let result = a.freeze(); @@ -103,9 +103,9 @@ fn test_primitive() { /// tests extending from a primitive array with offset w/ nulls #[test] fn test_primitive_offset() { - let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); + let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]).into_data(); let b = b.slice(1, 2); - let arrays = vec![b.data()]; + let arrays = vec![&b]; let mut a = MutableArrayData::new(arrays, false, 2); a.extend(0, 0, 2); let result = a.freeze(); @@ -118,8 +118,8 @@ fn test_primitive_offset() { #[test] fn test_primitive_null_offset() { let b = UInt8Array::from(vec![Some(1), None, Some(3)]); - let b = b.slice(1, 2); - let arrays = vec![b.data()]; + let b = b.slice(1, 2).into_data(); + let arrays = vec![&b]; let mut a = MutableArrayData::new(arrays, false, 2); a.extend(0, 0, 2); let result = a.freeze(); @@ -130,9 +130,9 @@ fn test_primitive_null_offset() { #[test] fn test_primitive_null_offset_nulls() { - let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); + let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]).into_data(); let b = b.slice(1, 2); - let arrays = vec![b.data()]; + let arrays = vec![&b]; let mut a = MutableArrayData::new(arrays, true, 2); a.extend(0, 0, 2); a.extend_nulls(3); @@ -153,8 +153,8 @@ fn test_list_null_offset() { builder.append(true); builder.values().append_slice(&[6, 7, 8]); builder.append(true); - let array = builder.finish(); - let arrays = vec![array.data()]; + let array = builder.finish().into_data(); + let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); mutable.extend(0, 0, 1); @@ -174,8 +174,9 @@ fn test_list_null_offset() { /// tests extending from a variable-sized (strings and binary) array w/ offset with nulls #[test] fn test_variable_sized_nulls() { - let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let arrays = vec![array.data()]; + let array = + StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]).into_data(); + let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -193,9 +194,9 @@ fn test_variable_sized_nulls() { #[test] fn test_variable_sized_offsets() { let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let array = array.slice(1, 3); + let array = array.into_data().slice(1, 3); - let arrays = vec![array.data()]; + let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -211,9 +212,9 @@ fn test_variable_sized_offsets() { #[test] fn test_string_offsets() { let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let array = array.slice(1, 3); + let array = array.into_data().slice(1, 3); - let arrays = vec![array.data()]; + let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -228,10 +229,10 @@ fn test_string_offsets() { #[test] fn test_multiple_with_nulls() { - let array1 = StringArray::from(vec!["hello", "world"]); - let array2 = StringArray::from(vec![Some("1"), None]); + let array1 = StringArray::from(vec!["hello", "world"]).into_data(); + let array2 = StringArray::from(vec![Some("1"), None]).into_data(); - let arrays = vec![array1.data(), array2.data()]; + let arrays = vec![&array1, 
&array2]; let mut mutable = MutableArrayData::new(arrays, false, 5); @@ -248,9 +249,9 @@ fn test_multiple_with_nulls() { #[test] fn test_string_null_offset_nulls() { let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let array = array.slice(1, 3); + let array = array.into_data().slice(1, 3); - let arrays = vec![array.data()]; + let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, true, 0); @@ -266,8 +267,9 @@ fn test_string_null_offset_nulls() { #[test] fn test_bool() { - let array = BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]); - let arrays = vec![array.data()]; + let array = + BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]).into_data(); + let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -282,9 +284,9 @@ fn test_bool() { #[test] fn test_null() { - let array1 = NullArray::new(10); - let array2 = NullArray::new(5); - let arrays = vec![array1.data(), array2.data()]; + let array1 = NullArray::new(10).into_data(); + let array2 = NullArray::new(5).into_data(); + let arrays = vec![&array1, &array2]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -352,8 +354,9 @@ fn test_struct() { let array = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - let arrays = vec![array.data()]; + .unwrap() + .into_data(); + let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); mutable.extend(0, 1, 3); @@ -388,8 +391,9 @@ fn test_struct_offset() { let array = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) .unwrap() + .into_data() .slice(1, 3); - let arrays = vec![array.data()]; + let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); mutable.extend(0, 1, 3); @@ -424,8 +428,9 @@ fn test_struct_nulls() { let array = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - let arrays = vec![array.data()]; + .unwrap() + .into_data(); + let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -462,8 +467,9 @@ fn test_struct_many() { let array = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - let arrays = vec![array.data(), array.data()]; + .unwrap() + .into_data(); + let arrays = vec![&array, &array]; let mut mutable = MutableArrayData::new(arrays, false, 0); mutable.extend(0, 1, 3); @@ -488,10 +494,10 @@ fn test_binary_fixed_sized_offsets() { vec![vec![0, 0], vec![0, 1], vec![0, 2]].into_iter(), ) .expect("Failed to create FixedSizeBinaryArray from iterable"); - let array = array.slice(1, 2); + let array = array.slice(1, 2).into_data(); // = [[0, 1], [0, 2]] due to the offset = 1 - let arrays = vec![array.data()]; + let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -517,7 +523,7 @@ fn test_list_append() { builder.values().append_slice(&[6, 7, 8]); builder.values().append_slice(&[9, 10, 11]); builder.append(true); - let a = builder.finish(); + let a = builder.finish().into_data(); let a_builder = Int64Builder::with_capacity(24); let mut a_builder = ListBuilder::::new(a_builder); @@ -526,11 +532,11 @@ fn test_list_append() { a_builder.append(true); a_builder.values().append_slice(&[14, 15]); a_builder.append(true); - let b = a_builder.finish(); + let b = a_builder.finish().into_data(); let c = b.slice(1, 2); - let mut mutable = MutableArrayData::new(vec![a.data(), b.data(), c.data()], false, 1); + let 
mut mutable = MutableArrayData::new(vec![&a, &b, &c], false, 1); mutable.extend(0, 0, a.len()); mutable.extend(1, 0, b.len()); mutable.extend(2, 0, c.len()); @@ -584,8 +590,7 @@ fn test_list_nulls_append() { builder.values().append_null(); builder.values().append_slice(&[9, 10, 11]); builder.append(true); - let a = builder.finish(); - let a = a.data(); + let a = builder.finish().into_data(); let mut builder = ListBuilder::::new(Int64Builder::with_capacity(32)); builder.values().append_slice(&[12, 13]); @@ -596,12 +601,11 @@ fn test_list_nulls_append() { builder.values().append_null(); builder.values().append_slice(&[14, 15]); builder.append(true); - let b = builder.finish(); - let b = b.data(); + let b = builder.finish().into_data(); let c = b.slice(1, 2); let d = b.slice(2, 2); - let mut mutable = MutableArrayData::new(vec![a, b, &c, &d], false, 10); + let mut mutable = MutableArrayData::new(vec![&a, &b, &c, &d], false, 10); mutable.extend(0, 0, a.len()); mutable.extend(1, 0, b.len()); @@ -671,8 +675,7 @@ fn test_map_nulls_append() { builder.values().append_slice(&[9, 10, 11]); builder.append(true).unwrap(); - let a = builder.finish(); - let a = a.data(); + let a = builder.finish().into_data(); let mut builder = MapBuilder::::new( None, @@ -691,12 +694,11 @@ fn test_map_nulls_append() { builder.values().append_slice(&[14, 15]); builder.append(true).unwrap(); - let b = builder.finish(); - let b = b.data(); + let b = builder.finish().into_data(); let c = b.slice(1, 2); let d = b.slice(2, 2); - let mut mutable = MutableArrayData::new(vec![a, b, &c, &d], false, 10); + let mut mutable = MutableArrayData::new(vec![&a, &b, &c, &d], false, 10); mutable.extend(0, 0, a.len()); mutable.extend(1, 0, b.len()); @@ -804,7 +806,7 @@ fn test_list_of_strings_append() { builder.values().append_value("Arrow"); builder.values().append_null(); builder.append(true); - let a = builder.finish(); + let a = builder.finish().into_data(); // [["alpha", "beta"], [None], ["gamma", "delta", None]] let mut builder = ListBuilder::new(StringBuilder::new()); @@ -817,9 +819,9 @@ fn test_list_of_strings_append() { builder.values().append_value("delta"); builder.values().append_null(); builder.append(true); - let b = builder.finish(); + let b = builder.finish().into_data(); - let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10); + let mut mutable = MutableArrayData::new(vec![&a, &b], false, 10); mutable.extend(0, 0, a.len()); mutable.extend(1, 0, b.len()); @@ -869,7 +871,8 @@ fn test_list_of_strings_append() { fn test_fixed_size_binary_append() { let a = vec![Some(vec![1, 2]), Some(vec![3, 4]), Some(vec![5, 6])]; let a = FixedSizeBinaryArray::try_from_sparse_iter_with_size(a.into_iter(), 2) - .expect("Failed to create FixedSizeBinaryArray from iterable"); + .expect("Failed to create FixedSizeBinaryArray from iterable") + .into_data(); let b = vec![ None, @@ -880,9 +883,10 @@ fn test_fixed_size_binary_append() { None, ]; let b = FixedSizeBinaryArray::try_from_sparse_iter_with_size(b.into_iter(), 2) - .expect("Failed to create FixedSizeBinaryArray from iterable"); + .expect("Failed to create FixedSizeBinaryArray from iterable") + .into_data(); - let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10); + let mut mutable = MutableArrayData::new(vec![&a, &b], false, 10); mutable.extend(0, 0, a.len()); mutable.extend(1, 0, b.len()); @@ -913,8 +917,9 @@ fn test_fixed_size_binary_append() { ]; let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size(expected.into_iter(), 2) - 
.expect("Failed to create FixedSizeBinaryArray from iterable"); - assert_eq!(&result, expected.data()); + .expect("Failed to create FixedSizeBinaryArray from iterable") + .into_data(); + assert_eq!(result, expected); } /* diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 082d020ca462..67960ada6c98 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -606,7 +606,7 @@ fn test_validate_dictionary_index_too_large() { 2, None, 0, - vec![keys.data().buffers()[0].clone()], + vec![keys.into_data().buffers()[0].clone()], vec![values.into_data()], ) .unwrap(); @@ -630,7 +630,7 @@ fn test_validate_dictionary_index_negative() { 2, None, 0, - vec![keys.data().buffers()[0].clone()], + vec![keys.into_data().buffers()[0].clone()], vec![values.into_data()], ) .unwrap(); @@ -655,7 +655,7 @@ fn test_validate_dictionary_index_negative_but_not_referenced() { 1, None, 0, - vec![keys.data().buffers()[0].clone()], + vec![keys.into_data().buffers()[0].clone()], vec![values.into_data()], ) .unwrap(); @@ -681,7 +681,7 @@ fn test_validate_dictionary_index_giant_negative() { 2, None, 0, - vec![keys.data().buffers()[0].clone()], + vec![keys.into_data().buffers()[0].clone()], vec![values.into_data()], ) .unwrap(); @@ -1016,7 +1016,7 @@ fn test_decimal_validation() { builder.append_value(20000); let array = builder.finish(); - array.data().validate_full().unwrap(); + array.into_data().validate_full().unwrap(); } #[test] diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs index c4ed7e9070cc..763a6ccee2c3 100644 --- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs +++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs @@ -355,7 +355,8 @@ where assert_eq!(dict.data_type(), &self.value_type); - let dict_buffers = dict.data().buffers(); + let data = dict.to_data(); + let dict_buffers = data.buffers(); let dict_offsets = dict_buffers[0].typed_data::(); let dict_values = dict_buffers[1].as_slice(); @@ -391,8 +392,8 @@ where #[cfg(test)] mod tests { - use arrow_array::{Array, StringArray}; use arrow::compute::cast; + use arrow_array::{Array, StringArray}; use crate::arrow::array_reader::test_util::{ byte_array_all_encodings, encode_dictionary, utf8_column, diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index 504591c0ca89..a6b354f902df 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -143,11 +143,9 @@ impl ArrayReader for ListArrayReader { let mut skipped = 0; // Builder used to construct the filtered child data, skipping empty lists and nulls - let mut child_data_builder = MutableArrayData::new( - vec![next_batch_array.data()], - false, - next_batch_array.len(), - ); + let data = next_batch_array.to_data(); + let mut child_data_builder = + MutableArrayData::new(vec![&data], false, next_batch_array.len()); def_levels.iter().zip(rep_levels).try_for_each(|(d, r)| { match r.cmp(&self.rep_level) { @@ -201,7 +199,7 @@ impl ArrayReader for ListArrayReader { let child_data = if skipped == 0 { // No filtered values - can reuse original array - next_batch_array.data().clone() + next_batch_array.to_data() } else { // One or more filtered values - must build new array if let Some(start) = filter_start.take() { diff --git a/parquet/src/arrow/array_reader/map_array.rs b/parquet/src/arrow/array_reader/map_array.rs index d7645a593505..9bfc047322a7 100644 --- 
a/parquet/src/arrow/array_reader/map_array.rs +++ b/parquet/src/arrow/array_reader/map_array.rs @@ -96,7 +96,7 @@ impl ArrayReader for MapArrayReader { // A MapArray is just a ListArray with a StructArray child // we can therefore just alter the ArrayData let array = self.reader.consume_batch().unwrap(); - let data = array.data().clone(); + let data = array.to_data(); let builder = data.into_builder().data_type(self.data_type.clone()); // SAFETY - we can assume that ListArrayReader produces valid ListArray diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index 0670701a0375..11e019f29a59 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -17,7 +17,7 @@ use crate::arrow::array_reader::ArrayReader; use crate::errors::{ParquetError, Result}; -use arrow_array::{builder::BooleanBufferBuilder, ArrayRef, StructArray}; +use arrow_array::{builder::BooleanBufferBuilder, ArrayRef, StructArray, Array}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType as ArrowType; use std::any::Any; @@ -130,7 +130,7 @@ impl ArrayReader for StructArrayReader { .child_data( children_array .iter() - .map(|x| x.data().clone()) + .map(|x| x.to_data()) .collect::>(), ); diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 4b88a33f3a25..57741283a2f9 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1761,7 +1761,7 @@ mod tests { let b = Arc::clone(batch.column(0)); assert_eq!(a.data_type(), b.data_type()); - assert_eq!(a.data(), b.data(), "{:#?} vs {:#?}", a.data(), b.data()); + assert_eq!(a.to_data(), b.to_data()); assert_eq!( a.as_any().type_id(), b.as_any().type_id(), @@ -1960,7 +1960,7 @@ mod tests { let batch = reader.into_iter().next().unwrap().unwrap(); assert_eq!(batch.schema().as_ref(), &expected_schema); assert_eq!(batch.num_rows(), 4); - assert_eq!(batch.column(0).data().null_count(), 2); + assert_eq!(batch.column(0).null_count(), 2); } #[test] @@ -2077,7 +2077,7 @@ mod tests { ); let get_dict = - |batch: &RecordBatch| batch.column(0).data().child_data()[0].clone(); + |batch: &RecordBatch| batch.column(0).to_data().child_data()[0].clone(); // First and second batch in same row group -> same dictionary assert_eq!(get_dict(&batches[0]), get_dict(&batches[1])); diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 4239f3fba59b..680d31480939 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -41,10 +41,9 @@ //! 
\[1\] [parquet-format#nested-encoding](https://github.com/apache/parquet-format#nested-encoding) use crate::errors::{ParquetError, Result}; -use arrow_array::{ - make_array, Array, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait, StructArray, -}; -use arrow_data::ArrayData; +use arrow_array::cast::AsArray; +use arrow_array::{Array, ArrayRef, OffsetSizeTrait, StructArray}; +use arrow_buffer::NullBuffer; use arrow_schema::{DataType, Field}; use std::ops::Range; @@ -183,29 +182,37 @@ impl LevelInfoBuilder { self.write_leaf(array, range) } DataType::Struct(_) => { - let array = array.as_any().downcast_ref::().unwrap(); + let array = array.as_struct(); self.write_struct(array, range) } DataType::List(_) => { - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - self.write_list(array.value_offsets(), array.data(), range) + let array = array.as_list::(); + self.write_list( + array.value_offsets(), + array.nulls(), + array.values(), + range, + ) } DataType::LargeList(_) => { - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - self.write_list(array.value_offsets(), array.data(), range) + let array = array.as_list::(); + self.write_list( + array.value_offsets(), + array.nulls(), + array.values(), + range, + ) } DataType::Map(_, _) => { - let array = array.as_any().downcast_ref::().unwrap(); + let array = array.as_map(); // A Map is just as ListArray with a StructArray child, we therefore // treat it as such to avoid code duplication - self.write_list(array.value_offsets(), array.data(), range) + self.write_list( + array.value_offsets(), + array.nulls(), + array.entries(), + range, + ) } _ => unreachable!(), } @@ -217,7 +224,8 @@ impl LevelInfoBuilder { fn write_list( &mut self, offsets: &[O], - list_data: &ArrayData, + nulls: Option<&NullBuffer>, + values: &ArrayRef, range: Range, ) { let (child, ctx) = match self { @@ -226,11 +234,10 @@ impl LevelInfoBuilder { }; let offsets = &offsets[range.start..range.end + 1]; - let child_array = make_array(list_data.child_data()[0].clone()); let write_non_null_slice = |child: &mut LevelInfoBuilder, start_idx: usize, end_idx: usize| { - child.write(&child_array, start_idx..end_idx); + child.write(values, start_idx..end_idx); child.visit_leaves(|leaf| { let rep_levels = leaf.rep_levels.as_mut().unwrap(); let mut rev = rep_levels.iter_mut().rev(); @@ -270,7 +277,7 @@ impl LevelInfoBuilder { }) }; - match list_data.nulls() { + match nulls { Some(nulls) => { let null_offset = range.start; // TODO: Faster bitmask iteration (#1757) @@ -485,7 +492,7 @@ mod tests { use arrow_array::*; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_cast::display::array_value_to_string; - use arrow_data::ArrayDataBuilder; + use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{Fields, Schema}; #[test] @@ -1243,7 +1250,7 @@ mod tests { let array = Arc::new(list_builder.finish()); - let values_len = array.data().child_data()[0].len(); + let values_len = array.values().len(); assert_eq!(values_len, 5); let schema = Arc::new(Schema::new(vec![list_field])); @@ -1278,7 +1285,7 @@ mod tests { ]); // This test assumes that nulls don't take up space - assert_eq!(inner.data().child_data()[0].len(), 7); + assert_eq!(inner.values().len(), 7); let field = Field::new("list", inner.data_type().clone(), true); let array = Arc::new(inner) as ArrayRef; diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 86f7764ec4cf..4cf54dc8897e 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ 
b/parquet/src/arrow/arrow_writer/mod.rs @@ -22,7 +22,7 @@ use std::io::Write; use std::sync::Arc; use arrow_array::cast::AsArray; -use arrow_array::types::Decimal128Type; +use arrow_array::types::{Decimal128Type, Int32Type, Int64Type, UInt32Type, UInt64Type}; use arrow_array::{types, Array, ArrayRef, RecordBatch}; use arrow_schema::{DataType as ArrowDataType, IntervalUnit, SchemaRef}; @@ -33,11 +33,12 @@ use super::schema::{ use crate::arrow::arrow_writer::byte_array::ByteArrayWriter; use crate::column::writer::{ColumnWriter, ColumnWriterImpl}; +use crate::data_type::{ByteArray, DataType, FixedLenByteArray}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{KeyValue, RowGroupMetaDataPtr}; use crate::file::properties::WriterProperties; +use crate::file::writer::SerializedFileWriter; use crate::file::writer::SerializedRowGroupWriter; -use crate::{data_type::*, file::writer::SerializedFileWriter}; use levels::{calculate_array_levels, LevelInfo}; mod byte_array; @@ -292,16 +293,21 @@ fn write_leaves( } col_writer.close() } - ArrowDataType::List(_) | ArrowDataType::LargeList(_) => { + ArrowDataType::List(_) => { let arrays: Vec<_> = arrays.iter().map(|array|{ - // write the child list - let data = array.data(); - arrow_array::make_array(data.child_data()[0].clone()) + array.as_list::().values().clone() }).collect(); write_leaves(row_group_writer, &arrays, levels)?; Ok(()) } + ArrowDataType::LargeList(_) => { + let arrays: Vec<_> = arrays.iter().map(|array|{ + array.as_list::().values().clone() + }).collect(); + write_leaves(row_group_writer, &arrays, levels)?; + Ok(()) + } ArrowDataType::Struct(fields) => { // Groups child arrays by field let mut field_arrays = vec![Vec::with_capacity(arrays.len()); fields.len()]; @@ -384,19 +390,15 @@ fn write_leaf( let array = arrow_cast::cast(column, &ArrowDataType::Date32)?; let array = arrow_cast::cast(&array, &ArrowDataType::Int32)?; - let array = array - .as_any() - .downcast_ref::() - .expect("Unable to get int32 array"); + let array = array.as_primitive::(); write_primitive(typed, array.values(), levels)? } ArrowDataType::UInt32 => { - let data = column.data(); - let offset = data.offset(); + let values = column.as_primitive::().values(); // follow C++ implementation and use overflow/reinterpret cast from u32 to i32 which will map // `(i32::MAX as u32)..u32::MAX` to `i32::MIN..0` - let array: &[i32] = data.buffers()[0].typed_data(); - write_primitive(typed, &array[offset..offset + data.len()], levels)? + let array = values.inner().typed_data::(); + write_primitive(typed, array, levels)? } ArrowDataType::Decimal128(_, _) => { // use the int32 to represent the decimal with low precision @@ -407,19 +409,13 @@ fn write_leaf( } _ => { let array = arrow_cast::cast(column, &ArrowDataType::Int32)?; - let array = array - .as_any() - .downcast_ref::() - .expect("Unable to get i32 array"); + let array = array.as_primitive::(); write_primitive(typed, array.values(), levels)? 
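The UInt32 branch above relies on a bitwise reinterpret: casting u32 to i32 in Rust keeps the raw bits, so the upper half of the unsigned range maps into negative i32 values and the cast is reversible on read. A tiny self-contained check of that mapping (illustrative only, not part of this diff):

    #[test]
    fn u32_reinterpret_roundtrip() {
        let original: u32 = 3_000_000_000;
        let stored = original as i32;        // the value written to the INT32 column
        assert_eq!(stored, -1_294_967_296);  // lands in i32::MIN..0 as the comment says
        assert_eq!(stored as u32, original); // a reader recovers the u32 losslessly
    }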
} } } ColumnWriter::BoolColumnWriter(ref mut typed) => { - let array = column - .as_any() - .downcast_ref::() - .expect("Unable to get boolean array"); + let array = column.as_boolean(); typed.write_batch( get_bool_array_slice(array, indices).as_slice(), levels.def_levels(), @@ -429,19 +425,15 @@ fn write_leaf( ColumnWriter::Int64ColumnWriter(ref mut typed) => { match column.data_type() { ArrowDataType::Int64 => { - let array = column - .as_any() - .downcast_ref::() - .expect("Unable to get i64 array"); + let array = column.as_primitive::(); write_primitive(typed, array.values(), levels)? } ArrowDataType::UInt64 => { + let values = column.as_primitive::().values(); // follow C++ implementation and use overflow/reinterpret cast from u64 to i64 which will map // `(i64::MAX as u64)..u64::MAX` to `i64::MIN..0` - let data = column.data(); - let offset = data.offset(); - let array: &[i64] = data.buffers()[0].typed_data(); - write_primitive(typed, &array[offset..offset + data.len()], levels)? + let array = values.inner().typed_data::(); + write_primitive(typed, array, levels)? } ArrowDataType::Decimal128(_, _) => { // use the int64 to represent the decimal with low precision @@ -452,10 +444,7 @@ fn write_leaf( } _ => { let array = arrow_cast::cast(column, &ArrowDataType::Int64)?; - let array = array - .as_any() - .downcast_ref::() - .expect("Unable to get i64 array"); + let array = array.as_primitive::(); write_primitive(typed, array.values(), levels)? } } @@ -642,6 +631,7 @@ mod tests { use arrow_schema::Fields; use crate::basic::Encoding; + use crate::data_type::AsBytes; use crate::file::metadata::ParquetMetaData; use crate::file::page_index::index_reader::read_pages_locations; use crate::file::properties::{ReaderProperties, WriterVersion}; @@ -723,8 +713,8 @@ mod tests { assert_eq!(expected_batch.num_columns(), actual_batch.num_columns()); assert_eq!(expected_batch.num_rows(), actual_batch.num_rows()); for i in 0..expected_batch.num_columns() { - let expected_data = expected_batch.column(i).data().clone(); - let actual_data = actual_batch.column(i).data().clone(); + let expected_data = expected_batch.column(i).to_data(); + let actual_data = actual_batch.column(i).to_data(); assert_eq!(expected_data, actual_data); } @@ -779,7 +769,7 @@ mod tests { // build a record batch let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap(); - assert_eq!(batch.column(0).data().null_count(), 1); + assert_eq!(batch.column(0).null_count(), 1); // This test fails if the max row group size is less than the batch's length // see https://github.com/apache/arrow-rs/issues/518 @@ -821,7 +811,7 @@ mod tests { // This test fails if the max row group size is less than the batch's length // see https://github.com/apache/arrow-rs/issues/518 - assert_eq!(batch.column(0).data().null_count(), 0); + assert_eq!(batch.column(0).null_count(), 0); roundtrip(batch, None); } @@ -928,7 +918,7 @@ mod tests { let g_list_data = ArrayData::builder(struct_field_g.data_type().clone()) .len(5) .add_buffer(g_value_offsets.clone()) - .add_child_data(g_value.data().clone()) + .add_child_data(g_value.to_data()) .build() .unwrap(); let g = ListArray::from(g_list_data); @@ -936,7 +926,7 @@ mod tests { let h_list_data = ArrayData::builder(struct_field_h.data_type().clone()) .len(5) .add_buffer(g_value_offsets) - .add_child_data(g_value.data().clone()) + .add_child_data(g_value.to_data()) .null_bit_buffer(Some(Buffer::from(vec![0b00011011]))) .build() .unwrap(); @@ -1251,9 +1241,9 @@ mod tests { 
assert_eq!(expected_batch.num_columns(), actual_batch.num_columns()); assert_eq!(expected_batch.num_rows(), actual_batch.num_rows()); for i in 0..expected_batch.num_columns() { - let expected_data = expected_batch.column(i).data(); - let actual_data = actual_batch.column(i).data(); - validate(expected_data, actual_data); + let expected_data = expected_batch.column(i).to_data(); + let actual_data = actual_batch.column(i).to_data(); + validate(&expected_data, &actual_data); } file diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs index 23ebea57b5b2..529c28872642 100644 --- a/parquet/src/arrow/buffer/dictionary_buffer.rs +++ b/parquet/src/arrow/buffer/dictionary_buffer.rs @@ -107,7 +107,8 @@ impl Self::Values { values } => Ok(values), Self::Dict { keys, values } => { let mut spilled = OffsetBuffer::default(); - let dict_buffers = values.data().buffers(); + let data = values.to_data(); + let dict_buffers = data.buffers(); let dict_offsets = dict_buffers[0].typed_data::(); let dict_values = dict_buffers[1].as_slice(); From a9b1120eb81f9414aa84cfc97f5133c8bfbae9ca Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Mon, 3 Apr 2023 14:56:47 +0200 Subject: [PATCH 0782/1411] feat: add etag for objectMeta (#3937) * feat: add etag for objectMeta * replace the manual etag in response * fix typo * use option for e_tag * remove useless packages --- object_store/src/aws/client.rs | 3 +++ object_store/src/aws/mod.rs | 7 ++++++- object_store/src/azure/client.rs | 4 +++- object_store/src/azure/mod.rs | 12 +++++++++++- object_store/src/gcp/mod.rs | 5 ++++- object_store/src/http/client.rs | 7 ++++++- object_store/src/lib.rs | 2 ++ object_store/src/local.rs | 4 +++- object_store/src/memory.rs | 4 ++++ object_store/src/prefix.rs | 3 +++ 10 files changed, 45 insertions(+), 6 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 7ac4b705b36c..9634c740d01d 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -164,6 +164,8 @@ pub struct ListContents { pub key: String, pub size: usize, pub last_modified: DateTime, + #[serde(rename = "ETag")] + pub e_tag: Option, } impl TryFrom for ObjectMeta { @@ -174,6 +176,7 @@ impl TryFrom for ObjectMeta { location: Path::parse(value.key)?, last_modified: value.last_modified, size: value.size, + e_tag: value.e_tag, }) } } diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 1e302e688978..f88960b4b338 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -232,7 +232,7 @@ impl ObjectStore for AmazonS3 { } async fn head(&self, location: &Path) -> Result { - use reqwest::header::{CONTENT_LENGTH, LAST_MODIFIED}; + use reqwest::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; // Extract meta from headers // https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadObject.html#API_HeadObject_ResponseSyntax @@ -256,10 +256,15 @@ impl ObjectStore for AmazonS3 { let content_length = content_length .parse() .context(InvalidContentLengthSnafu { content_length })?; + + let e_tag = headers.get(ETAG).context(MissingEtagSnafu)?; + let e_tag = e_tag.to_str().context(BadHeaderSnafu)?; + Ok(ObjectMeta { location: location.clone(), last_modified, size: content_length, + e_tag: Some(e_tag.to_string()), }) } diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 494303dffd35..87432f62b5cd 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -489,6 +489,7 @@ impl 
TryFrom for ObjectMeta { location: Path::parse(value.name)?, last_modified: value.properties.last_modified, size: value.properties.content_length as usize, + e_tag: value.properties.e_tag, }) } } @@ -501,7 +502,6 @@ impl TryFrom for ObjectMeta { struct BlobProperties { #[serde(deserialize_with = "deserialize_rfc1123", rename = "Last-Modified")] pub last_modified: DateTime, - pub etag: String, #[serde(rename = "Content-Length")] pub content_length: u64, #[serde(rename = "Content-Type")] @@ -510,6 +510,8 @@ struct BlobProperties { pub content_encoding: Option, #[serde(rename = "Content-Language")] pub content_language: Option, + #[serde(rename = "Etag")] + pub e_tag: Option, } #[derive(Debug, Clone, PartialEq, Eq)] diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index e5f1465ad682..c2e72f214d73 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -140,6 +140,9 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, + + #[snafu(display("ETag Header missing from response"))] + MissingEtag, } impl From for super::Error { @@ -232,7 +235,7 @@ impl ObjectStore for MicrosoftAzure { } async fn head(&self, location: &Path) -> Result { - use reqwest::header::{CONTENT_LENGTH, LAST_MODIFIED}; + use reqwest::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; // Extract meta from headers // https://docs.microsoft.com/en-us/rest/api/storageservices/get-blob-properties @@ -257,10 +260,17 @@ impl ObjectStore for MicrosoftAzure { .parse() .context(InvalidContentLengthSnafu { content_length })?; + let e_tag = headers + .get(ETAG) + .ok_or(Error::MissingEtag)? + .to_str() + .context(BadHeaderSnafu)?; + Ok(ObjectMeta { location: location.clone(), last_modified, size: content_length, + e_tag: Some(e_tag.to_string()), }) } diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index fe79a6e07ef2..5247693e6585 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -196,6 +196,8 @@ struct Object { name: String, size: String, updated: DateTime, + #[serde(rename = "etag")] + e_tag: Option, } #[derive(serde::Deserialize, Debug)] @@ -209,7 +211,6 @@ struct InitiateMultipartUploadResult { struct MultipartPart { #[serde(rename = "PartNumber")] part_number: usize, - #[serde(rename = "ETag")] e_tag: String, } @@ -1170,11 +1171,13 @@ fn convert_object_meta(object: &Object) -> Result { let location = Path::parse(&object.name)?; let last_modified = object.updated; let size = object.size.parse().context(InvalidSizeSnafu)?; + let e_tag = object.e_tag.clone(); Ok(ObjectMeta { location, last_modified, size, + e_tag, }) } diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index 799c5be0c5eb..5ef272180abc 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -335,10 +335,12 @@ impl MultiStatusResponse { /// Returns this objects metadata as [`ObjectMeta`] pub fn object_meta(&self, base_url: &Url) -> Result { + let last_modified = self.prop_stat.prop.last_modified; Ok(ObjectMeta { location: self.path(base_url)?, - last_modified: self.prop_stat.prop.last_modified, + last_modified, size: self.size()?, + e_tag: self.prop_stat.prop.e_tag.clone(), }) } @@ -364,6 +366,9 @@ pub struct Prop { #[serde(rename = "resourcetype")] resource_type: ResourceType, + + #[serde(rename = "getetag")] + e_tag: Option, } #[derive(Deserialize)] diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 5737071286c8..c31027c0715c 100644 
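This commit threads the new e_tag field through every store implementation; from a user's point of view it simply surfaces on the ObjectMeta returned by head() and list(). A hedged usage sketch, assuming the object_store crate at this revision plus a tokio runtime; the in-memory store leaves the tag as None, while the S3, Azure and GCS backends fill it from the service's ETag:

    use object_store::{memory::InMemory, path::Path, ObjectStore};

    #[tokio::main]
    async fn main() -> Result<(), Box<dyn std::error::Error>> {
        let store = InMemory::new();
        let path = Path::from("data/file.parquet");
        store.put(&path, "hello".into()).await?;

        let meta = store.head(&path).await?;
        // e_tag is Option<String>: None here, Some(..) for the HTTP-backed stores
        println!("{} bytes, etag: {:?}", meta.size, meta.e_tag);
        Ok(())
    }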
--- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -548,6 +548,8 @@ pub struct ObjectMeta { pub last_modified: DateTime, /// The size in bytes of the object pub size: usize, + /// The unique identifier for the object + pub e_tag: Option, } /// Result for a get request diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 9e710c28c072..d2553d46f244 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -23,6 +23,7 @@ use crate::{ }; use async_trait::async_trait; use bytes::Bytes; +use chrono::{DateTime, Utc}; use futures::future::BoxFuture; use futures::FutureExt; use futures::{stream::BoxStream, StreamExt}; @@ -887,7 +888,7 @@ fn convert_entry(entry: DirEntry, location: Path) -> Result { } fn convert_metadata(metadata: std::fs::Metadata, location: Path) -> Result { - let last_modified = metadata + let last_modified: DateTime = metadata .modified() .expect("Modified file time should be supported on this platform") .into(); @@ -900,6 +901,7 @@ fn convert_metadata(metadata: std::fs::Metadata, location: Path) -> Result Result { let entry = self.entry(location).await?; + Ok(ObjectMeta { location: location.clone(), last_modified: entry.1, size: entry.0.len(), + e_tag: None, }) } @@ -185,6 +187,7 @@ impl ObjectStore for InMemory { location: key.clone(), last_modified: value.1, size: value.0.len(), + e_tag: None, }) }) .collect(); @@ -228,6 +231,7 @@ impl ObjectStore for InMemory { location: k.clone(), last_modified: v.1, size: v.0.len(), + e_tag: None, }; objects.push(object); } diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index 7e7e7167bd0b..eba379553733 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -108,6 +108,7 @@ impl ObjectStore for PrefixStore { last_modified: meta.last_modified, size: meta.size, location: self.strip_prefix(&meta.location).unwrap_or(meta.location), + e_tag: meta.e_tag, }) } @@ -128,6 +129,7 @@ impl ObjectStore for PrefixStore { last_modified: meta.last_modified, size: meta.size, location: self.strip_prefix(&meta.location).unwrap_or(meta.location), + e_tag: meta.e_tag, }) .boxed()) } @@ -155,6 +157,7 @@ impl ObjectStore for PrefixStore { last_modified: meta.last_modified, size: meta.size, location: self.strip_prefix(&meta.location)?, + e_tag: meta.e_tag.clone(), }) }) .collect(), From ffa73df0eb12d7e11449c91b1d3f10a3620f575b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Apr 2023 16:09:38 +0100 Subject: [PATCH 0783/1411] Update proc-macro2 requirement from =1.0.54 to =1.0.56 (#4008) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.54...1.0.56) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 02613a85a18d..b336fa589c47 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.54", default-features = false } +proc-macro2 = { version = "=1.0.56", default-features = false } prost-build = { version = "=0.11.8", default-features = false } tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } From 26a8257deb668bd163d00836e77a892dbf338ff1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 3 Apr 2023 16:36:25 +0100 Subject: [PATCH 0784/1411] Cleanup Primitive take (#4006) --- arrow-select/src/take.rs | 92 ++++++++++++---------------------------- 1 file changed, 28 insertions(+), 64 deletions(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index e2f3630bdfbc..01d6148132bd 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -23,7 +23,7 @@ use arrow_array::builder::BufferBuilder; use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; +use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; @@ -36,7 +36,7 @@ use num::{ToPrimitive, Zero}; /// │ A │ │ 0 │ │ A │ /// ├─────────────────┤ ├─────────┤ ├─────────────────┤ /// │ D │ │ 2 │ │ B │ -/// ├─────────────────┤ ├─────────┤ take(values, indices) ├─────────────────┤ +/// ├─────────────────┤ ├─────────┤ take(values, indices) ├─────────────────┤ /// │ B │ │ 3 │ ─────────────────────────▶ │ C │ /// ├─────────────────┤ ├─────────┤ ├─────────────────┤ /// │ C │ │ 1 │ │ D │ @@ -252,19 +252,8 @@ where // take implementation when only values contain nulls fn take_values_nulls( - values: &PrimitiveArray, - indices: &[I], -) -> Result<(Buffer, Option), ArrowError> -where - T: ArrowPrimitiveType, - I: ArrowNativeType, -{ - take_values_nulls_inner(values.data(), values.values(), indices) -} - -fn take_values_nulls_inner( - values_data: &ArrayData, values: &[T], + values_nulls: &NullBuffer, indices: &[I], ) -> Result<(Buffer, Option), ArrowError> where @@ -278,7 +267,7 @@ where let values = indices.iter().enumerate().map(|(i, index)| { let index = maybe_usize::(*index)?; - if values_data.is_null(index) { + if values_nulls.is_null(index) { null_count += 1; bit_util::unset_bit(null_slice, i); } @@ -299,21 +288,9 @@ where // take implementation when only indices contain nulls fn take_indices_nulls( - values: &[T], - indices: &PrimitiveArray, -) -> Result<(Buffer, Option), ArrowError> -where - T: ArrowNativeType, - I: ArrowPrimitiveType, - I::Native: ToPrimitive, -{ - take_indices_nulls_inner(values, indices.values(), indices.data()) -} - -fn take_indices_nulls_inner( values: &[T], indices: &[I], - indices_data: &ArrayData, + indices_nulls: &NullBuffer, ) -> Result<(Buffer, Option), ArrowError> where T: ArrowNativeType, @@ -324,7 +301,7 @@ where Result::<_, ArrowError>::Ok(match values.get(index) { 
Some(value) => *value, None => { - if indices_data.is_null(index) { + if indices_nulls.is_null(index) { T::default() } else { panic!("Out-of-bounds index {index}") @@ -335,33 +312,15 @@ where // Soundness: `slice.map` is `TrustedLen`. let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; - - Ok((buffer, indices_data.nulls().map(|b| b.inner().sliced()))) + Ok((buffer, Some(indices_nulls.inner().sliced()))) } // take implementation when both values and indices contain nulls fn take_values_indices_nulls( - values: &PrimitiveArray, - indices: &PrimitiveArray, -) -> Result<(Buffer, Option), ArrowError> -where - T: ArrowPrimitiveType, - I: ArrowPrimitiveType, - I::Native: ToPrimitive, -{ - take_values_indices_nulls_inner( - values.values(), - values.data(), - indices.values(), - indices.data(), - ) -} - -fn take_values_indices_nulls_inner( values: &[T], - values_data: &ArrayData, + values_nulls: &NullBuffer, indices: &[I], - indices_data: &ArrayData, + indices_nulls: &NullBuffer, ) -> Result<(Buffer, Option), ArrowError> where T: ArrowNativeType, @@ -373,13 +332,13 @@ where let mut null_count = 0; let values = indices.iter().enumerate().map(|(i, &index)| { - if indices_data.is_null(i) { + if indices_nulls.is_null(i) { null_count += 1; bit_util::unset_bit(null_slice, i); Ok(T::default()) } else { let index = maybe_usize::(index)?; - if values_data.is_null(index) { + if values_nulls.is_null(index) { null_count += 1; bit_util::unset_bit(null_slice, i); } @@ -417,31 +376,36 @@ where I: ArrowPrimitiveType, I::Native: ToPrimitive, { - let indices_has_nulls = indices.null_count() > 0; - let values_has_nulls = values.null_count() > 0; + let indices_nulls = indices.nulls().filter(|x| x.null_count() > 0); + let values_nulls = values.nulls().filter(|x| x.null_count() > 0); + // note: this function should only panic when "an index is not null and out of bounds". // if the index is null, its value is undefined and therefore we should not read from it. - - let (buffer, nulls) = match (values_has_nulls, indices_has_nulls) { - (false, false) => { + let (buffer, nulls) = match (values_nulls, indices_nulls) { + (None, None) => { // * no nulls // * all `indices.values()` are valid - take_no_nulls::(values.values(), indices.values())? + take_no_nulls(values.values(), indices.values())? } - (true, false) => { + (Some(values_nulls), None) => { // * nulls come from `values` alone // * all `indices.values()` are valid - take_values_nulls::(values, indices.values())? + take_values_nulls(values.values(), values_nulls, indices.values())? } - (false, true) => { + (None, Some(indices_nulls)) => { // in this branch it is unsound to read and use `index.values()`, // as doing so is UB when they come from a null slot. - take_indices_nulls::(values.values(), indices)? + take_indices_nulls(values.values(), indices.values(), indices_nulls)? } - (true, true) => { + (Some(values_nulls), Some(indices_nulls)) => { // in this branch it is unsound to read and use `index.values()`, // as doing so is UB when they come from a null slot. - take_values_indices_nulls::(values, indices)? + take_values_indices_nulls( + values.values(), + values_nulls, + indices.values(), + indices_nulls, + )? 
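The refactor above only reorganises the internal null handling; the kernel still gathers rows by index exactly as in the doc-comment diagram. A short usage sketch, assuming the arrow_select crate from this release (the same function is re-exported as arrow::compute::take):

    use arrow_array::{Array, StringArray, UInt32Array};
    use arrow_select::take::take;

    fn main() {
        let values = StringArray::from(vec!["A", "D", "B", "C"]);
        let indices = UInt32Array::from(vec![0, 2, 3, 1]);
        let taken = take(&values, &indices, None).unwrap();
        let taken = taken.as_any().downcast_ref::<StringArray>().unwrap();
        assert_eq!(taken.value(1), "B"); // [A, D, B, C] taken at [0, 2, 3, 1] => [A, B, C, D]
    }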
} }; From 901c0614a57e96e9319d1f3ab6a4cc2581df3150 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 3 Apr 2023 17:54:02 +0100 Subject: [PATCH 0785/1411] Update tonic 0.9.1 (#4011) --- arrow-flight/Cargo.toml | 5 +- arrow-flight/gen/Cargo.toml | 2 +- arrow-flight/src/arrow.flight.protocol.rs | 234 +++++++++++++++++----- arrow-integration-testing/Cargo.toml | 2 +- 4 files changed, 192 insertions(+), 51 deletions(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 2f0994e18070..732c24572856 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -34,10 +34,9 @@ arrow-cast = { workspace = true } arrow-ipc = { workspace = true } arrow-schema = { workspace = true } base64 = { version = "0.21", default-features = false, features = ["std"] } -tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } +tonic = { version = "0.9", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } -prost = { version = "0.11", default-features = false } -prost-derive = { version = "0.11", default-features = false } +prost = { version = "0.11", default-features = false, features = ["prost-derive"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } futures = { version = "0.3", default-features = false, features = ["alloc"] } diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index b336fa589c47..08afb572deb0 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -34,4 +34,4 @@ publish = false # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.56", default-features = false } prost-build = { version = "=0.11.8", default-features = false } -tonic-build = { version = "=0.8.4", default-features = false, features = ["transport", "prost"] } +tonic-build = { version = "=0.9.1", default-features = false, features = ["transport", "prost"] } diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs index c79ec65ef921..200c858cf5f1 100644 --- a/arrow-flight/src/arrow.flight.protocol.rs +++ b/arrow-flight/src/arrow.flight.protocol.rs @@ -293,7 +293,7 @@ pub mod flight_service_client { /// Attempt to create a new client by connecting to a given endpoint. pub async fn connect(dst: D) -> Result where - D: std::convert::TryInto, + D: TryInto, D::Error: Into, { let conn = tonic::transport::Endpoint::new(dst)?.connect().await?; @@ -349,6 +349,22 @@ pub mod flight_service_client { self.inner = self.inner.accept_compressed(encoding); self } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_decoding_message_size(limit); + self + } + /// Limits the maximum size of an encoded message. + /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_encoding_message_size(limit); + self + } /// /// Handshake between client and server. 
Depending on the server, the /// handshake may be required to determine the token that should be used for @@ -357,7 +373,7 @@ pub mod flight_service_client { pub async fn handshake( &mut self, request: impl tonic::IntoStreamingRequest, - ) -> Result< + ) -> std::result::Result< tonic::Response>, tonic::Status, > { @@ -374,7 +390,12 @@ pub mod flight_service_client { let path = http::uri::PathAndQuery::from_static( "/arrow.flight.protocol.FlightService/Handshake", ); - self.inner.streaming(request.into_streaming_request(), path, codec).await + let mut req = request.into_streaming_request(); + req.extensions_mut() + .insert( + GrpcMethod::new("arrow.flight.protocol.FlightService", "Handshake"), + ); + self.inner.streaming(req, path, codec).await } /// /// Get a list of available streams given a particular criteria. Most flight @@ -386,7 +407,7 @@ pub mod flight_service_client { pub async fn list_flights( &mut self, request: impl tonic::IntoRequest, - ) -> Result< + ) -> std::result::Result< tonic::Response>, tonic::Status, > { @@ -403,7 +424,12 @@ pub mod flight_service_client { let path = http::uri::PathAndQuery::from_static( "/arrow.flight.protocol.FlightService/ListFlights", ); - self.inner.server_streaming(request.into_request(), path, codec).await + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new("arrow.flight.protocol.FlightService", "ListFlights"), + ); + self.inner.server_streaming(req, path, codec).await } /// /// For a given FlightDescriptor, get information about how the flight can be @@ -419,7 +445,7 @@ pub mod flight_service_client { pub async fn get_flight_info( &mut self, request: impl tonic::IntoRequest, - ) -> Result, tonic::Status> { + ) -> std::result::Result, tonic::Status> { self.inner .ready() .await @@ -433,7 +459,15 @@ pub mod flight_service_client { let path = http::uri::PathAndQuery::from_static( "/arrow.flight.protocol.FlightService/GetFlightInfo", ); - self.inner.unary(request.into_request(), path, codec).await + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "arrow.flight.protocol.FlightService", + "GetFlightInfo", + ), + ); + self.inner.unary(req, path, codec).await } /// /// For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema @@ -443,7 +477,7 @@ pub mod flight_service_client { pub async fn get_schema( &mut self, request: impl tonic::IntoRequest, - ) -> Result, tonic::Status> { + ) -> std::result::Result, tonic::Status> { self.inner .ready() .await @@ -457,7 +491,12 @@ pub mod flight_service_client { let path = http::uri::PathAndQuery::from_static( "/arrow.flight.protocol.FlightService/GetSchema", ); - self.inner.unary(request.into_request(), path, codec).await + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new("arrow.flight.protocol.FlightService", "GetSchema"), + ); + self.inner.unary(req, path, codec).await } /// /// Retrieve a single stream associated with a particular descriptor @@ -467,7 +506,7 @@ pub mod flight_service_client { pub async fn do_get( &mut self, request: impl tonic::IntoRequest, - ) -> Result< + ) -> std::result::Result< tonic::Response>, tonic::Status, > { @@ -484,7 +523,10 @@ pub mod flight_service_client { let path = http::uri::PathAndQuery::from_static( "/arrow.flight.protocol.FlightService/DoGet", ); - self.inner.server_streaming(request.into_request(), path, codec).await + let mut req = request.into_request(); + req.extensions_mut() + 
.insert(GrpcMethod::new("arrow.flight.protocol.FlightService", "DoGet")); + self.inner.server_streaming(req, path, codec).await } /// /// Push a stream to the flight service associated with a particular @@ -496,7 +538,7 @@ pub mod flight_service_client { pub async fn do_put( &mut self, request: impl tonic::IntoStreamingRequest, - ) -> Result< + ) -> std::result::Result< tonic::Response>, tonic::Status, > { @@ -513,7 +555,10 @@ pub mod flight_service_client { let path = http::uri::PathAndQuery::from_static( "/arrow.flight.protocol.FlightService/DoPut", ); - self.inner.streaming(request.into_streaming_request(), path, codec).await + let mut req = request.into_streaming_request(); + req.extensions_mut() + .insert(GrpcMethod::new("arrow.flight.protocol.FlightService", "DoPut")); + self.inner.streaming(req, path, codec).await } /// /// Open a bidirectional data channel for a given descriptor. This @@ -524,7 +569,7 @@ pub mod flight_service_client { pub async fn do_exchange( &mut self, request: impl tonic::IntoStreamingRequest, - ) -> Result< + ) -> std::result::Result< tonic::Response>, tonic::Status, > { @@ -541,7 +586,12 @@ pub mod flight_service_client { let path = http::uri::PathAndQuery::from_static( "/arrow.flight.protocol.FlightService/DoExchange", ); - self.inner.streaming(request.into_streaming_request(), path, codec).await + let mut req = request.into_streaming_request(); + req.extensions_mut() + .insert( + GrpcMethod::new("arrow.flight.protocol.FlightService", "DoExchange"), + ); + self.inner.streaming(req, path, codec).await } /// /// Flight services can support an arbitrary number of simple actions in @@ -553,7 +603,7 @@ pub mod flight_service_client { pub async fn do_action( &mut self, request: impl tonic::IntoRequest, - ) -> Result< + ) -> std::result::Result< tonic::Response>, tonic::Status, > { @@ -570,7 +620,12 @@ pub mod flight_service_client { let path = http::uri::PathAndQuery::from_static( "/arrow.flight.protocol.FlightService/DoAction", ); - self.inner.server_streaming(request.into_request(), path, codec).await + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new("arrow.flight.protocol.FlightService", "DoAction"), + ); + self.inner.server_streaming(req, path, codec).await } /// /// A flight service exposes all of the available action types that it has @@ -579,7 +634,7 @@ pub mod flight_service_client { pub async fn list_actions( &mut self, request: impl tonic::IntoRequest, - ) -> Result< + ) -> std::result::Result< tonic::Response>, tonic::Status, > { @@ -596,7 +651,12 @@ pub mod flight_service_client { let path = http::uri::PathAndQuery::from_static( "/arrow.flight.protocol.FlightService/ListActions", ); - self.inner.server_streaming(request.into_request(), path, codec).await + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new("arrow.flight.protocol.FlightService", "ListActions"), + ); + self.inner.server_streaming(req, path, codec).await } } } @@ -609,7 +669,7 @@ pub mod flight_service_server { pub trait FlightService: Send + Sync + 'static { /// Server streaming response type for the Handshake method. type HandshakeStream: futures_core::Stream< - Item = Result, + Item = std::result::Result, > + Send + 'static; @@ -621,10 +681,10 @@ pub mod flight_service_server { async fn handshake( &self, request: tonic::Request>, - ) -> Result, tonic::Status>; + ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the ListFlights method. 
type ListFlightsStream: futures_core::Stream< - Item = Result, + Item = std::result::Result, > + Send + 'static; @@ -638,7 +698,10 @@ pub mod flight_service_server { async fn list_flights( &self, request: tonic::Request, - ) -> Result, tonic::Status>; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; /// /// For a given FlightDescriptor, get information about how the flight can be /// consumed. This is a useful interface if the consumer of the interface @@ -653,7 +716,7 @@ pub mod flight_service_server { async fn get_flight_info( &self, request: tonic::Request, - ) -> Result, tonic::Status>; + ) -> std::result::Result, tonic::Status>; /// /// For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema /// This is used when a consumer needs the Schema of flight stream. Similar to @@ -662,10 +725,10 @@ pub mod flight_service_server { async fn get_schema( &self, request: tonic::Request, - ) -> Result, tonic::Status>; + ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the DoGet method. type DoGetStream: futures_core::Stream< - Item = Result, + Item = std::result::Result, > + Send + 'static; @@ -677,10 +740,10 @@ pub mod flight_service_server { async fn do_get( &self, request: tonic::Request, - ) -> Result, tonic::Status>; + ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the DoPut method. type DoPutStream: futures_core::Stream< - Item = Result, + Item = std::result::Result, > + Send + 'static; @@ -694,10 +757,10 @@ pub mod flight_service_server { async fn do_put( &self, request: tonic::Request>, - ) -> Result, tonic::Status>; + ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the DoExchange method. type DoExchangeStream: futures_core::Stream< - Item = Result, + Item = std::result::Result, > + Send + 'static; @@ -710,10 +773,10 @@ pub mod flight_service_server { async fn do_exchange( &self, request: tonic::Request>, - ) -> Result, tonic::Status>; + ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the DoAction method. type DoActionStream: futures_core::Stream< - Item = Result, + Item = std::result::Result, > + Send + 'static; @@ -727,10 +790,10 @@ pub mod flight_service_server { async fn do_action( &self, request: tonic::Request, - ) -> Result, tonic::Status>; + ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the ListActions method. type ListActionsStream: futures_core::Stream< - Item = Result, + Item = std::result::Result, > + Send + 'static; @@ -741,7 +804,10 @@ pub mod flight_service_server { async fn list_actions( &self, request: tonic::Request, - ) -> Result, tonic::Status>; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; } /// /// A flight service is an endpoint for retrieving or storing Arrow data. 
A @@ -753,6 +819,8 @@ pub mod flight_service_server { inner: _Inner, accept_compression_encodings: EnabledCompressionEncodings, send_compression_encodings: EnabledCompressionEncodings, + max_decoding_message_size: Option, + max_encoding_message_size: Option, } struct _Inner(Arc); impl FlightServiceServer { @@ -765,6 +833,8 @@ pub mod flight_service_server { inner, accept_compression_encodings: Default::default(), send_compression_encodings: Default::default(), + max_decoding_message_size: None, + max_encoding_message_size: None, } } pub fn with_interceptor( @@ -788,6 +858,22 @@ pub mod flight_service_server { self.send_compression_encodings.enable(encoding); self } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.max_decoding_message_size = Some(limit); + self + } + /// Limits the maximum size of an encoded message. + /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.max_encoding_message_size = Some(limit); + self + } } impl tonic::codegen::Service> for FlightServiceServer where @@ -801,7 +887,7 @@ pub mod flight_service_server { fn poll_ready( &mut self, _cx: &mut Context<'_>, - ) -> Poll> { + ) -> Poll> { Poll::Ready(Ok(())) } fn call(&mut self, req: http::Request) -> Self::Future { @@ -826,13 +912,15 @@ pub mod flight_service_server { tonic::Streaming, >, ) -> Self::Future { - let inner = self.0.clone(); + let inner = Arc::clone(&self.0); let fut = async move { (*inner).handshake(request).await }; Box::pin(fut) } } let accept_compression_encodings = self.accept_compression_encodings; let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { let inner = inner.0; @@ -842,6 +930,10 @@ pub mod flight_service_server { .apply_compression_config( accept_compression_encodings, send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, ); let res = grpc.streaming(method, req).await; Ok(res) @@ -865,7 +957,7 @@ pub mod flight_service_server { &mut self, request: tonic::Request, ) -> Self::Future { - let inner = self.0.clone(); + let inner = Arc::clone(&self.0); let fut = async move { (*inner).list_flights(request).await }; @@ -874,6 +966,8 @@ pub mod flight_service_server { } let accept_compression_encodings = self.accept_compression_encodings; let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { let inner = inner.0; @@ -883,6 +977,10 @@ pub mod flight_service_server { .apply_compression_config( accept_compression_encodings, send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, ); let res = grpc.server_streaming(method, req).await; Ok(res) @@ -905,7 +1003,7 @@ pub mod flight_service_server { &mut self, request: tonic::Request, ) -> Self::Future { - let inner = self.0.clone(); + let inner = Arc::clone(&self.0); let fut = async move { (*inner).get_flight_info(request).await }; @@ -914,6 +1012,8 @@ pub mod flight_service_server { } let accept_compression_encodings = 
self.accept_compression_encodings; let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { let inner = inner.0; @@ -923,6 +1023,10 @@ pub mod flight_service_server { .apply_compression_config( accept_compression_encodings, send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, ); let res = grpc.unary(method, req).await; Ok(res) @@ -945,13 +1049,15 @@ pub mod flight_service_server { &mut self, request: tonic::Request, ) -> Self::Future { - let inner = self.0.clone(); + let inner = Arc::clone(&self.0); let fut = async move { (*inner).get_schema(request).await }; Box::pin(fut) } } let accept_compression_encodings = self.accept_compression_encodings; let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { let inner = inner.0; @@ -961,6 +1067,10 @@ pub mod flight_service_server { .apply_compression_config( accept_compression_encodings, send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, ); let res = grpc.unary(method, req).await; Ok(res) @@ -984,13 +1094,15 @@ pub mod flight_service_server { &mut self, request: tonic::Request, ) -> Self::Future { - let inner = self.0.clone(); + let inner = Arc::clone(&self.0); let fut = async move { (*inner).do_get(request).await }; Box::pin(fut) } } let accept_compression_encodings = self.accept_compression_encodings; let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { let inner = inner.0; @@ -1000,6 +1112,10 @@ pub mod flight_service_server { .apply_compression_config( accept_compression_encodings, send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, ); let res = grpc.server_streaming(method, req).await; Ok(res) @@ -1023,13 +1139,15 @@ pub mod flight_service_server { &mut self, request: tonic::Request>, ) -> Self::Future { - let inner = self.0.clone(); + let inner = Arc::clone(&self.0); let fut = async move { (*inner).do_put(request).await }; Box::pin(fut) } } let accept_compression_encodings = self.accept_compression_encodings; let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { let inner = inner.0; @@ -1039,6 +1157,10 @@ pub mod flight_service_server { .apply_compression_config( accept_compression_encodings, send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, ); let res = grpc.streaming(method, req).await; Ok(res) @@ -1062,13 +1184,15 @@ pub mod flight_service_server { &mut self, request: tonic::Request>, ) -> Self::Future { - let inner = self.0.clone(); + let inner = Arc::clone(&self.0); let fut = async move { (*inner).do_exchange(request).await }; Box::pin(fut) } } let 
accept_compression_encodings = self.accept_compression_encodings; let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { let inner = inner.0; @@ -1078,6 +1202,10 @@ pub mod flight_service_server { .apply_compression_config( accept_compression_encodings, send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, ); let res = grpc.streaming(method, req).await; Ok(res) @@ -1101,13 +1229,15 @@ pub mod flight_service_server { &mut self, request: tonic::Request, ) -> Self::Future { - let inner = self.0.clone(); + let inner = Arc::clone(&self.0); let fut = async move { (*inner).do_action(request).await }; Box::pin(fut) } } let accept_compression_encodings = self.accept_compression_encodings; let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { let inner = inner.0; @@ -1117,6 +1247,10 @@ pub mod flight_service_server { .apply_compression_config( accept_compression_encodings, send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, ); let res = grpc.server_streaming(method, req).await; Ok(res) @@ -1140,7 +1274,7 @@ pub mod flight_service_server { &mut self, request: tonic::Request, ) -> Self::Future { - let inner = self.0.clone(); + let inner = Arc::clone(&self.0); let fut = async move { (*inner).list_actions(request).await }; @@ -1149,6 +1283,8 @@ pub mod flight_service_server { } let accept_compression_encodings = self.accept_compression_encodings; let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { let inner = inner.0; @@ -1158,6 +1294,10 @@ pub mod flight_service_server { .apply_compression_config( accept_compression_encodings, send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, ); let res = grpc.server_streaming(method, req).await; Ok(res) @@ -1186,12 +1326,14 @@ pub mod flight_service_server { inner, accept_compression_encodings: self.accept_compression_encodings, send_compression_encodings: self.send_compression_encodings, + max_decoding_message_size: self.max_decoding_message_size, + max_encoding_message_size: self.max_encoding_message_size, } } } impl Clone for _Inner { fn clone(&self) -> Self { - Self(self.0.clone()) + Self(Arc::clone(&self.0)) } } impl std::fmt::Debug for _Inner { diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 3a65ec41c0f3..7f78cf50a9d7 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -43,7 +43,7 @@ prost = { version = "0.11", default-features = false } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } tokio = { version = "1.0", default-features = false } -tonic = { version = "0.8", default-features = false } +tonic = { version = "0.9", default-features = false } 
tracing-subscriber = { version = "0.3.1", default-features = false, features = ["fmt"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } flate2 = { version = "1", default-features = false, features = ["rust_backend"] } From e3f212cc114743dbdc5852ddd777b19eaba3233e Mon Sep 17 00:00:00 2001 From: Stuart Carnie Date: Tue, 4 Apr 2023 05:10:50 +1000 Subject: [PATCH 0786/1411] feat: Add Commands enum to decode prost messages to strong type (#3887) * feat: Add Commands enum to decode known messages to strong type * chore: paste needs to be a dependency * chore: rustfmt * Add docs and use Commands * chore: Rename to `Command`; impl TryFrom * chore: Add `into_any` and `type_url` API * Tweak documentation * fixup * clippy * feat: Add `Command::Unknown(Any)` variant * Updated `do_get` and `do_put` functions to use `Command` enum * Added test for Unknown variant * chore: placate clippy * chore: combine errors * chore: don't change error code --------- Co-authored-by: Andrew Lamb --- arrow-flight/Cargo.toml | 1 + arrow-flight/src/sql/mod.rs | 132 +++++++++++++++-- arrow-flight/src/sql/server.rs | 262 +++++++++++++-------------------- 3 files changed, 224 insertions(+), 171 deletions(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 732c24572856..e22642b2a727 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -36,6 +36,7 @@ arrow-schema = { workspace = true } base64 = { version = "0.21", default-features = false, features = ["std"] } tonic = { version = "0.9", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } +paste = { version = "1.0" } prost = { version = "0.11", default-features = false, features = ["prost-derive"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } futures = { version = "0.3", default-features = false, features = ["alloc"] } diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index 9ea74c3f35bb..2c26f2bf69b6 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -17,6 +17,7 @@ use arrow_schema::ArrowError; use bytes::Bytes; +use paste::paste; use prost::Message; mod gen { @@ -71,22 +72,110 @@ pub trait ProstMessageExt: prost::Message + Default { fn as_any(&self) -> Any; } +/// Macro to coerce a token to an item, specifically +/// to build the `Commands` enum. +/// +/// See: +macro_rules! as_item { + ($i:item) => { + $i + }; +} + macro_rules! prost_message_ext { - ($($name:ty,)*) => { - $( - impl ProstMessageExt for $name { - fn type_url() -> &'static str { - concat!("type.googleapis.com/arrow.flight.protocol.sql.", stringify!($name)) + ($($name:tt,)*) => { + paste! { + $( + const [<$name:snake:upper _TYPE_URL>]: &'static str = concat!("type.googleapis.com/arrow.flight.protocol.sql.", stringify!($name)); + )* + + as_item! { + /// Helper to convert to/from protobuf [`Any`] + /// to a strongly typed enum. 
+ /// + /// # Example + /// ```rust + /// # use arrow_flight::sql::{Any, CommandStatementQuery, Command}; + /// let flightsql_message = CommandStatementQuery { + /// query: "SELECT * FROM foo".to_string(), + /// }; + /// + /// // Given a packed FlightSQL Any message + /// let any_message = Any::pack(&flightsql_message).unwrap(); + /// + /// // decode it to Command: + /// match Command::try_from(any_message).unwrap() { + /// Command::CommandStatementQuery(decoded) => { + /// assert_eq!(flightsql_message, decoded); + /// } + /// _ => panic!("Unexpected decoded message"), + /// } + /// ``` + #[derive(Clone, Debug, PartialEq)] + pub enum Command { + $($name($name),)* + + /// Any message that is not any FlightSQL command. + Unknown(Any), } + } - fn as_any(&self) -> Any { - Any { - type_url: <$name>::type_url().to_string(), - value: self.encode_to_vec().into(), + impl Command { + /// Convert the command to [`Any`]. + pub fn into_any(self) -> Any { + match self { + $( + Self::$name(cmd) => cmd.as_any(), + )* + Self::Unknown(any) => any, + } + } + + /// Get the URL for the command. + pub fn type_url(&self) -> &str { + match self { + $( + Self::$name(_) => [<$name:snake:upper _TYPE_URL>], + )* + Self::Unknown(any) => any.type_url.as_str(), + } + } + } + + impl TryFrom for Command { + type Error = ArrowError; + + fn try_from(any: Any) -> Result { + match any.type_url.as_str() { + $( + [<$name:snake:upper _TYPE_URL>] + => { + let m: $name = Message::decode(&*any.value).map_err(|err| { + ArrowError::ParseError(format!("Unable to decode Any value: {err}")) + })?; + Ok(Self::$name(m)) + } + )* + _ => Ok(Self::Unknown(any)), } } } - )* + + $( + impl ProstMessageExt for $name { + fn type_url() -> &'static str { + [<$name:snake:upper _TYPE_URL>] + } + + fn as_any(&self) -> Any { + Any { + type_url: <$name>::type_url().to_string(), + value: self.encode_to_vec().into(), + } + } + } + )* + } }; } @@ -190,4 +279,27 @@ mod tests { let unpack_query: CommandStatementQuery = any.unpack().unwrap().unwrap(); assert_eq!(query, unpack_query); } + + #[test] + fn test_command() { + let query = CommandStatementQuery { + query: "select 1".to_string(), + }; + let any = Any::pack(&query).unwrap(); + let cmd: Command = any.try_into().unwrap(); + + assert!(matches!(cmd, Command::CommandStatementQuery(_))); + assert_eq!(cmd.type_url(), COMMAND_STATEMENT_QUERY_TYPE_URL); + + // Unknown variant + + let any = Any { + type_url: "fake_url".to_string(), + value: Default::default(), + }; + + let cmd: Command = any.try_into().unwrap(); + assert!(matches!(cmd, Command::Unknown(_))); + assert_eq!(cmd.type_url(), "fake_url"); + } } diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index 848bfb3852f5..b11fa3e3c3db 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -17,7 +17,7 @@ use std::pin::Pin; -use crate::sql::Any; +use crate::sql::{Any, Command}; use futures::Stream; use prost::Message; use tonic::{Request, Response, Status, Streaming}; @@ -315,90 +315,46 @@ where let message = Any::decode(&*request.get_ref().cmd).map_err(decode_error_to_status)?; - if message.is::() { - let token = message - .unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"); - return self.get_flight_info_statement(token, request).await; - } - if message.is::() { - let handle = message - .unpack() - .map_err(arrow_error_to_status)? 
- .expect("unreachable"); - return self - .get_flight_info_prepared_statement(handle, request) - .await; - } - if message.is::() { - let token = message - .unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"); - return self.get_flight_info_catalogs(token, request).await; - } - if message.is::() { - let token = message - .unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"); - return self.get_flight_info_schemas(token, request).await; - } - if message.is::() { - let token = message - .unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"); - return self.get_flight_info_tables(token, request).await; - } - if message.is::() { - let token = message - .unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"); - return self.get_flight_info_table_types(token, request).await; + match Command::try_from(message).map_err(arrow_error_to_status)? { + Command::CommandStatementQuery(token) => { + self.get_flight_info_statement(token, request).await + } + Command::CommandPreparedStatementQuery(handle) => { + self.get_flight_info_prepared_statement(handle, request) + .await + } + Command::CommandGetCatalogs(token) => { + self.get_flight_info_catalogs(token, request).await + } + Command::CommandGetDbSchemas(token) => { + return self.get_flight_info_schemas(token, request).await + } + Command::CommandGetTables(token) => { + self.get_flight_info_tables(token, request).await + } + Command::CommandGetTableTypes(token) => { + self.get_flight_info_table_types(token, request).await + } + Command::CommandGetSqlInfo(token) => { + self.get_flight_info_sql_info(token, request).await + } + Command::CommandGetPrimaryKeys(token) => { + self.get_flight_info_primary_keys(token, request).await + } + Command::CommandGetExportedKeys(token) => { + self.get_flight_info_exported_keys(token, request).await + } + Command::CommandGetImportedKeys(token) => { + self.get_flight_info_imported_keys(token, request).await + } + Command::CommandGetCrossReference(token) => { + self.get_flight_info_cross_reference(token, request).await + } + cmd => Err(Status::unimplemented(format!( + "get_flight_info: The defined request is invalid: {}", + cmd.type_url() + ))), } - if message.is::() { - let token = message - .unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"); - return self.get_flight_info_sql_info(token, request).await; - } - if message.is::() { - let token = message - .unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"); - return self.get_flight_info_primary_keys(token, request).await; - } - if message.is::() { - let token = message - .unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"); - return self.get_flight_info_exported_keys(token, request).await; - } - if message.is::() { - let token = message - .unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"); - return self.get_flight_info_imported_keys(token, request).await; - } - if message.is::() { - let token = message - .unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"); - return self.get_flight_info_cross_reference(token, request).await; - } - - Err(Status::unimplemented(format!( - "get_flight_info: The defined request is invalid: {}", - message.type_url - ))) } async fn get_schema( @@ -415,47 +371,42 @@ where let msg: Any = Message::decode(&*request.get_ref().ticket) .map_err(decode_error_to_status)?; - fn unpack(msg: Any) -> Result { - msg.unpack() - .map_err(arrow_error_to_status)? 
- .ok_or_else(|| Status::internal("Expected a command, but found none.")) - } - - if msg.is::() { - return self.do_get_statement(unpack(msg)?, request).await; - } - if msg.is::() { - return self.do_get_prepared_statement(unpack(msg)?, request).await; - } - if msg.is::() { - return self.do_get_catalogs(unpack(msg)?, request).await; - } - if msg.is::() { - return self.do_get_schemas(unpack(msg)?, request).await; - } - if msg.is::() { - return self.do_get_tables(unpack(msg)?, request).await; + match Command::try_from(msg).map_err(arrow_error_to_status)? { + Command::TicketStatementQuery(command) => { + self.do_get_statement(command, request).await + } + Command::CommandPreparedStatementQuery(command) => { + self.do_get_prepared_statement(command, request).await + } + Command::CommandGetCatalogs(command) => { + self.do_get_catalogs(command, request).await + } + Command::CommandGetDbSchemas(command) => { + self.do_get_schemas(command, request).await + } + Command::CommandGetTables(command) => { + self.do_get_tables(command, request).await + } + Command::CommandGetTableTypes(command) => { + self.do_get_table_types(command, request).await + } + Command::CommandGetSqlInfo(command) => { + self.do_get_sql_info(command, request).await + } + Command::CommandGetPrimaryKeys(command) => { + self.do_get_primary_keys(command, request).await + } + Command::CommandGetExportedKeys(command) => { + self.do_get_exported_keys(command, request).await + } + Command::CommandGetImportedKeys(command) => { + self.do_get_imported_keys(command, request).await + } + Command::CommandGetCrossReference(command) => { + self.do_get_cross_reference(command, request).await + } + cmd => self.do_get_fallback(request, cmd.into_any()).await, } - if msg.is::() { - return self.do_get_table_types(unpack(msg)?, request).await; - } - if msg.is::() { - return self.do_get_sql_info(unpack(msg)?, request).await; - } - if msg.is::() { - return self.do_get_primary_keys(unpack(msg)?, request).await; - } - if msg.is::() { - return self.do_get_exported_keys(unpack(msg)?, request).await; - } - if msg.is::() { - return self.do_get_imported_keys(unpack(msg)?, request).await; - } - if msg.is::() { - return self.do_get_cross_reference(unpack(msg)?, request).await; - } - - self.do_get_fallback(request, msg).await } async fn do_put( @@ -465,44 +416,33 @@ where let cmd = request.get_mut().message().await?.unwrap(); let message = Any::decode(&*cmd.flight_descriptor.unwrap().cmd) .map_err(decode_error_to_status)?; - if message.is::() { - let token = message - .unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"); - let record_count = self.do_put_statement_update(token, request).await?; - let result = DoPutUpdateResult { record_count }; - let output = futures::stream::iter(vec![Ok(PutResult { - app_metadata: result.as_any().encode_to_vec().into(), - })]); - return Ok(Response::new(Box::pin(output))); - } - if message.is::() { - let token = message - .unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"); - return self.do_put_prepared_statement_query(token, request).await; - } - if message.is::() { - let handle = message - .unpack() - .map_err(arrow_error_to_status)? 
- .expect("unreachable"); - let record_count = self - .do_put_prepared_statement_update(handle, request) - .await?; - let result = DoPutUpdateResult { record_count }; - let output = futures::stream::iter(vec![Ok(PutResult { - app_metadata: result.as_any().encode_to_vec().into(), - })]); - return Ok(Response::new(Box::pin(output))); + match Command::try_from(message).map_err(arrow_error_to_status)? { + Command::CommandStatementUpdate(command) => { + let record_count = self.do_put_statement_update(command, request).await?; + let result = DoPutUpdateResult { record_count }; + let output = futures::stream::iter(vec![Ok(PutResult { + app_metadata: result.as_any().encode_to_vec().into(), + })]); + Ok(Response::new(Box::pin(output))) + } + Command::CommandPreparedStatementQuery(command) => { + self.do_put_prepared_statement_query(command, request).await + } + Command::CommandPreparedStatementUpdate(command) => { + let record_count = self + .do_put_prepared_statement_update(command, request) + .await?; + let result = DoPutUpdateResult { record_count }; + let output = futures::stream::iter(vec![Ok(PutResult { + app_metadata: result.as_any().encode_to_vec().into(), + })]); + Ok(Response::new(Box::pin(output))) + } + cmd => Err(Status::invalid_argument(format!( + "do_put: The defined request is invalid: {}", + cmd.type_url() + ))), } - - Err(Status::invalid_argument(format!( - "do_put: The defined request is invalid: {}", - message.type_url - ))) } async fn list_actions( From 5f27509d98cd401b31edc87fab686531b6ee34e3 Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Tue, 4 Apr 2023 00:36:05 +0300 Subject: [PATCH 0787/1411] Minor: Float16Tensor (#4013) --- arrow/src/tensor.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/arrow/src/tensor.rs b/arrow/src/tensor.rs index a46a1d08df85..b2abffc517c8 100644 --- a/arrow/src/tensor.rs +++ b/arrow/src/tensor.rs @@ -93,6 +93,7 @@ pub type UInt8Tensor<'a> = Tensor<'a, UInt8Type>; pub type UInt16Tensor<'a> = Tensor<'a, UInt16Type>; pub type UInt32Tensor<'a> = Tensor<'a, UInt32Type>; pub type UInt64Tensor<'a> = Tensor<'a, UInt64Type>; +pub type Float16Tensor<'a> = Tensor<'a, Float16Type>; pub type Float32Tensor<'a> = Tensor<'a, Float32Type>; pub type Float64Tensor<'a> = Tensor<'a, Float64Type>; From 789189c1f181d38be04c621a536858012c369da3 Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Tue, 4 Apr 2023 21:47:04 +0300 Subject: [PATCH 0788/1411] fix: f16::ZERO and f16::ONE are mixed up (#4017) --- arrow-array/src/arithmetic.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/arithmetic.rs b/arrow-array/src/arithmetic.rs index dcb6a1be7241..bb809507c2f1 100644 --- a/arrow-array/src/arithmetic.rs +++ b/arrow-array/src/arithmetic.rs @@ -377,6 +377,6 @@ macro_rules! 
native_type_float_op { }; } -native_type_float_op!(f16, f16::ONE, f16::ZERO); +native_type_float_op!(f16, f16::ZERO, f16::ONE); native_type_float_op!(f32, 0., 1.); native_type_float_op!(f64, 0., 1.); From 7bac07aa41e75286d8bb7995be311731b7d07688 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 4 Apr 2023 22:17:07 +0200 Subject: [PATCH 0789/1411] Add FlightSQL module docs and links to `arrow-flight` crates (#4012) * Add FlightSQL module docs and links to `arrow-flight` crates * Updates * Update arrow-flight/src/sql/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Copy editing and improve links --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-flight/src/lib.rs | 5 +++++ arrow-flight/src/sql/client.rs | 2 ++ arrow-flight/src/sql/mod.rs | 25 +++++++++++++++++++++++-- arrow-flight/src/sql/server.rs | 2 ++ 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 7aebd92e2ba2..a80358ff00c5 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -30,6 +30,11 @@ //! //! 2. Low level [tonic] generated [`flight_service_client`] and //! [`flight_service_server`]. +//! +//! 3. Experimental support for [Flight SQL] in [`sql`]. Requires the +//! `flight-sql-experimental` feature of this crate to be activated. +//! +//! [Flight SQL]: https://arrow.apache.org/docs/format/FlightSql.html #![allow(rustdoc::invalid_html_tags)] use arrow_ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index a8868fba1867..d96c90afa806 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! A FlightSQL Client [`FlightSqlServiceClient`] + use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::Bytes; diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index 2c26f2bf69b6..df828c9c08af 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -15,6 +15,27 @@ // specific language governing permissions and limitations // under the License. +//! Support for execute SQL queries using [Apache Arrow] [Flight SQL]. +//! +//! [Flight SQL] is built on top of Arrow Flight RPC framework, by +//! defining specific messages, encoded using the protobuf format, +//! sent in the[`FlightDescriptor::cmd`] field to [`FlightService`] +//! endpoints such as[`get_flight_info`] and [`do_get`]. +//! +//! This module contains: +//! 1. [prost] generated structs for FlightSQL messages such as [`CommandStatementQuery`] +//! 2. Helpers for encoding and decoding FlightSQL messages: [`Any`] and [`Command`] +//! 3. A [`FlightSqlServiceClient`] for interacting with FlightSQL servers. +//! 4. A [`FlightSqlService`] to help building FlightSQL servers from [`FlightService`]. +//! +//! [Flight SQL]: https://arrow.apache.org/docs/format/FlightSql.html +//! [Apache Arrow]: https://arrow.apache.org +//! [`FlightDescriptor::cmd`]: crate::FlightDescriptor::cmd +//! [`FlightService`]: crate::flight_service_server::FlightService +//! [`get_flight_info`]: crate::flight_service_server::FlightService::get_flight_info +//! [`do_get`]: crate::flight_service_server::FlightService::do_get +//! [`FlightSqlServiceClient`]: client::FlightSqlServiceClient +//! 
[`FlightSqlService`]: server::FlightSqlService use arrow_schema::ArrowError; use bytes::Bytes; use paste::paste; @@ -90,8 +111,8 @@ macro_rules! prost_message_ext { )* as_item! { - /// Helper to convert to/from protobuf [`Any`] - /// to a strongly typed enum. + /// Helper to convert to/from protobuf [`Any`] message + /// to a specific FlightSQL command message. /// /// # Example /// ```rust diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index b11fa3e3c3db..f25ddb13db99 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Helper trait [`FlightSqlService`] for implementing a [`FlightService`] that implements FlightSQL. + use std::pin::Pin; use crate::sql::{Any, Command}; From 39a48e18d0b880be9932252dc0755f45cde50188 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 5 Apr 2023 12:03:32 +0100 Subject: [PATCH 0790/1411] Deprecate Array::data (#3880) (#4019) * Deprecate Array::data (#3880) * Review feedback --- arrow-array/src/array/binary_array.rs | 3 -- arrow-array/src/array/mod.rs | 25 +++++++++--- arrow-array/src/array/primitive_array.rs | 3 +- arrow-array/src/array/struct_array.rs | 18 ++++----- arrow-cast/src/cast.rs | 38 +++++++------------ arrow-integration-test/src/lib.rs | 2 +- .../src/bin/arrow-json-integration-test.rs | 4 +- .../integration_test.rs | 4 +- arrow-ipc/src/writer.rs | 25 ++++++------ arrow-row/src/lib.rs | 2 +- arrow-select/src/concat.rs | 32 +++++++++------- arrow-select/src/filter.rs | 4 +- arrow-select/src/interleave.rs | 3 +- arrow-select/src/take.rs | 24 ++++-------- arrow-select/src/zip.rs | 6 +-- arrow-string/src/length.rs | 4 +- arrow/src/ffi_stream.rs | 8 ++-- arrow/src/pyarrow.rs | 2 +- arrow/tests/array_transform.rs | 4 +- 19 files changed, 102 insertions(+), 109 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index be861474f659..3b13a513f646 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -467,9 +467,6 @@ mod tests { let list_array = GenericListArray::::from(array_data2); let binary_array2 = GenericBinaryArray::::from(list_array); - assert_eq!(2, binary_array2.data().buffers().len()); - assert_eq!(0, binary_array2.data().child_data().len()); - assert_eq!(binary_array1.len(), binary_array2.len()); assert_eq!(binary_array1.null_count(), binary_array2.null_count()); assert_eq!(binary_array1.value_offsets(), binary_array2.value_offsets()); diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 41d5c8bebe29..fa6e970b497a 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -95,8 +95,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { fn as_any(&self) -> &dyn Any; /// Returns a reference to the underlying data of this array - /// - /// This will be deprecated in a future release [(#3880)](https://github.com/apache/arrow-rs/issues/3880) + #[deprecated(note = "Use Array::to_data or Array::into_data")] fn data(&self) -> &ArrayData; /// Returns the underlying data of this array @@ -108,9 +107,8 @@ pub trait Array: std::fmt::Debug + Send + Sync { fn into_data(self) -> ArrayData; /// Returns a reference-counted pointer to the underlying data of this array. 
- /// - /// This will be deprecated in a future release [(#3880)](https://github.com/apache/arrow-rs/issues/3880) #[deprecated(note = "Use Array::to_data or Array::into_data")] + #[allow(deprecated)] fn data_ref(&self) -> &ArrayData { self.data() } @@ -281,6 +279,7 @@ impl Array for ArrayRef { self.as_ref().as_any() } + #[allow(deprecated)] fn data(&self) -> &ArrayData { self.as_ref().data() } @@ -348,6 +347,7 @@ impl<'a, T: Array> Array for &'a T { T::as_any(self) } + #[allow(deprecated)] fn data(&self) -> &ArrayData { T::data(self) } @@ -435,78 +435,91 @@ pub trait ArrayAccessor: Array { } impl PartialEq for dyn Array + '_ { + #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { self.data().eq(other.data()) } } impl PartialEq for dyn Array + '_ { + #[allow(deprecated)] fn eq(&self, other: &T) -> bool { self.data().eq(other.data()) } } impl PartialEq for NullArray { + #[allow(deprecated)] fn eq(&self, other: &NullArray) -> bool { self.data().eq(other.data()) } } impl PartialEq for PrimitiveArray { + #[allow(deprecated)] fn eq(&self, other: &PrimitiveArray) -> bool { self.data().eq(other.data()) } } impl PartialEq for DictionaryArray { + #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { self.data().eq(other.data()) } } impl PartialEq for BooleanArray { + #[allow(deprecated)] fn eq(&self, other: &BooleanArray) -> bool { self.data().eq(other.data()) } } impl PartialEq for GenericStringArray { + #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { self.data().eq(other.data()) } } impl PartialEq for GenericBinaryArray { + #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { self.data().eq(other.data()) } } impl PartialEq for FixedSizeBinaryArray { + #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { self.data().eq(other.data()) } } impl PartialEq for GenericListArray { + #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { self.data().eq(other.data()) } } impl PartialEq for MapArray { + #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { self.data().eq(other.data()) } } impl PartialEq for FixedSizeListArray { + #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { self.data().eq(other.data()) } } impl PartialEq for StructArray { + #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { self.data().eq(other.data()) } @@ -865,8 +878,8 @@ mod tests { let null_array = new_null_array(array.data_type(), 9); assert_eq!(&array, &null_array); assert_eq!( - array.data().buffers()[0].len(), - null_array.data().buffers()[0].len() + array.to_data().buffers()[0].len(), + null_array.to_data().buffers()[0].len() ); } diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 5dfcb4da4d16..75bf85b3f2a0 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -519,10 +519,9 @@ impl PrimitiveArray { O: ArrowPrimitiveType, F: Fn(T::Native) -> Result, { - let data = self.data(); let len = self.len(); - let nulls = data.nulls().cloned(); + let nulls = self.nulls().cloned(); let mut buffer = BufferBuilder::::new(len); buffer.append_n_zeroed(len); let slice = buffer.as_slice_mut(); diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 27e10a31fd00..1dccfc7d4ef3 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -174,7 +174,7 @@ impl TryFrom> for StructArray { // null: the null mask of the arrays. 
let mut null: Option = None; for (field_name, array) in values { - let child_datum = array.data(); + let child_datum = array.to_data(); let child_datum_len = child_datum.len(); if let Some(len) = len { if len != child_datum_len { @@ -186,7 +186,6 @@ impl TryFrom> for StructArray { } else { len = Some(child_datum_len) } - child_data.push(child_datum.clone()); fields.push(Arc::new(Field::new( field_name, array.data_type().clone(), @@ -209,6 +208,7 @@ impl TryFrom> for StructArray { // when one of the fields has no nulls, then there is no null in the array null = None; } + child_data.push(child_datum); } let len = len.unwrap(); @@ -385,10 +385,8 @@ mod tests { #[test] fn test_struct_array_builder() { - let array = BooleanArray::from(vec![false, false, true, true]); - let boolean_data = array.data(); - let array = Int64Array::from(vec![42, 28, 19, 31]); - let int_data = array.data(); + let boolean_array = BooleanArray::from(vec![false, false, true, true]); + let int_array = Int64Array::from(vec![42, 28, 19, 31]); let fields = vec![ Field::new("a", DataType::Boolean, false), @@ -396,14 +394,14 @@ mod tests { ]; let struct_array_data = ArrayData::builder(DataType::Struct(fields.into())) .len(4) - .add_child_data(boolean_data.clone()) - .add_child_data(int_data.clone()) + .add_child_data(boolean_array.to_data()) + .add_child_data(int_array.to_data()) .build() .unwrap(); let struct_array = StructArray::from(struct_array_data); - assert_eq!(boolean_data, struct_array.column(0).data()); - assert_eq!(int_data, struct_array.column(1).data()); + assert_eq!(struct_array.column(0).as_ref(), &boolean_array); + assert_eq!(struct_array.column(1).as_ref(), &int_array); } #[test] diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 0ea6332a7ea5..372fcc1a3132 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -2217,9 +2217,9 @@ fn value_to_string( let mut builder = GenericStringBuilder::::new(); let options = FormatOptions::default(); let formatter = ArrayFormatter::try_new(array, &options)?; - let data = array.data(); - for i in 0..data.len() { - match data.is_null(i) { + let nulls = array.nulls(); + for i in 0..array.len() { + match nulls.map(|x| x.is_null(i)).unwrap_or_default() { true => builder.append_null(), false => { formatter.value(i).write(&mut builder)?; @@ -3500,7 +3500,7 @@ where FROM::Offset: OffsetSizeTrait + ToPrimitive, TO::Offset: OffsetSizeTrait + NumCast, { - let data = array.data(); + let data = array.to_data(); assert_eq!(data.data_type(), &FROM::DATA_TYPE); let str_values_buf = data.buffers()[1].clone(); let offsets = data.buffers()[0].typed_data::(); @@ -4844,9 +4844,8 @@ mod tests { #[test] fn test_cast_list_i32_to_list_u16() { - let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 100000000]) - .data() - .clone(); + let value_data = + Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 100000000]).into_data(); let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); @@ -4875,15 +4874,9 @@ mod tests { assert_eq!(0, cast_array.null_count()); // offsets should be the same - assert_eq!( - list_array.data().buffers().to_vec(), - cast_array.data().buffers().to_vec() - ); - let array = cast_array - .as_ref() - .as_any() - .downcast_ref::() - .unwrap(); + let array = cast_array.as_list::(); + assert_eq!(list_array.value_offsets(), array.value_offsets()); + assert_eq!(DataType::UInt16, array.value_type()); assert_eq!(3, array.value_length(0)); assert_eq!(3, array.value_length(1)); @@ -4908,9 +4901,8 @@ mod tests { )] fn test_cast_list_i32_to_list_timestamp() 
{ // Construct a value array - let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 8, 100000000]) - .data() - .clone(); + let value_data = + Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 8, 100000000]).into_data(); let value_offsets = Buffer::from_slice_ref([0, 3, 6, 9]); @@ -7355,11 +7347,7 @@ mod tests { fn test_list_to_string() { let str_array = StringArray::from(vec!["a", "b", "c", "d", "e", "f", "g", "h"]); let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); - let value_data = ArrayData::builder(DataType::Utf8) - .len(str_array.len()) - .buffers(str_array.data().buffers().to_vec()) - .build() - .unwrap(); + let value_data = str_array.into_data(); let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); @@ -8123,7 +8111,7 @@ mod tests { let options = CastOptions { safe: true }; let array = cast_with_options(&s, &DataType::Utf8, &options).unwrap(); let a = array.as_string::(); - a.data().validate_full().unwrap(); + a.to_data().validate_full().unwrap(); assert_eq!(a.null_count(), 1); assert_eq!(a.len(), 2); diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index 8ee7bc60085e..04bbcf3f6f23 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -938,7 +938,7 @@ pub fn dictionary_array_from_json( // convert key and value to dictionary data let dict_data = ArrayData::builder(field.data_type().clone()) .len(keys.len()) - .add_buffer(keys.data().buffers()[0].clone()) + .add_buffer(keys.to_data().buffers()[0].clone()) .null_bit_buffer(Some(null_buf)) .add_child_data(values.into_data()) .build() diff --git a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs index 90a2d171d347..2c36e8d9b8ae 100644 --- a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs +++ b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -200,8 +200,8 @@ fn validate(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()> { for i in 0..num_columns { assert_eq!( - arrow_batch.column(i).data(), - json_batch.column(i).data(), + arrow_batch.column(i).as_ref(), + json_batch.column(i).as_ref(), "Arrow and JSON batch columns not the same" ); } diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index 3c537c5f61d8..a55c2dec0580 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -232,8 +232,8 @@ async fn consume_flight_location( let field = schema.field(i); let field_name = field.name(); - let expected_data = expected_batch.column(i).data(); - let actual_data = actual_batch.column(i).data(); + let expected_data = expected_batch.column(i).as_ref(); + let actual_data = actual_batch.column(i).as_ref(); assert_eq!(expected_data, actual_data, "Data for field {field_name}"); } diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 7d29f048a762..7d44d8f24030 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -220,15 +220,16 @@ impl IpcDataGenerator { } } DataType::RunEndEncoded(_, values) => { - if column.data().child_data().len() != 2 { + let data = column.to_data(); + if data.child_data().len() != 2 { return Err(ArrowError::InvalidArgumentError(format!( "The run encoded array should have exactly two child arrays. 
Found {}", - column.data().child_data().len() + data.child_data().len() ))); } - // The run_ends array is not expected to be dictionoary encoded. Hence encode dictionaries + // The run_ends array is not expected to be dictionary encoded. Hence encode dictionaries // only for values array. - let values_array = make_array(column.data().child_data()[1].clone()); + let values_array = make_array(data.child_data()[1].clone()); self.encode_dictionaries( values, &values_array, @@ -330,7 +331,7 @@ impl IpcDataGenerator { let dict_id = field .dict_id() .expect("All Dictionary types have `dict_id`"); - let dict_data = column.data(); + let dict_data = column.to_data(); let dict_values = &dict_data.child_data()[0]; let values = make_array(dict_data.child_data()[0].clone()); @@ -418,9 +419,9 @@ impl IpcDataGenerator { batch_compression_type.map(TryInto::try_into).transpose()?; for array in batch.columns() { - let array_data = array.data(); + let array_data = array.to_data(); offset = write_array_data( - array_data, + &array_data, &mut buffers, &mut arrow_data, &mut nodes, @@ -631,7 +632,7 @@ fn into_zero_offset_run_array( /// multiple times. Can optionally error if an update to an existing dictionary is attempted, which /// isn't allowed in the `FileWriter`. pub struct DictionaryTracker { - written: HashMap, + written: HashMap, error_on_replacement: bool, } @@ -660,18 +661,18 @@ impl DictionaryTracker { dict_id: i64, column: &ArrayRef, ) -> Result { - let dict_data = column.data(); + let dict_data = column.to_data(); let dict_values = &dict_data.child_data()[0]; // If a dictionary with this id was already emitted, check if it was the same. if let Some(last) = self.written.get(&dict_id) { - if ArrayData::ptr_eq(&last.data().child_data()[0], dict_values) { + if ArrayData::ptr_eq(&last.child_data()[0], dict_values) { // Same dictionary values => no need to emit it again return Ok(false); } if self.error_on_replacement { // If error on replacement perform a logical comparison - if last.data().child_data()[0] == *dict_values { + if last.child_data()[0] == *dict_values { // Same dictionary values => no need to emit it again return Ok(false); } @@ -684,7 +685,7 @@ impl DictionaryTracker { } } - self.written.insert(dict_id, column.clone()); + self.written.insert(dict_id, dict_data); Ok(true) } } diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 9cc7b4f301cb..71e1de416617 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -2308,7 +2308,7 @@ mod tests { let back = converter.convert_rows(&rows).unwrap(); for ((actual, expected), preserve) in back.iter().zip(&arrays).zip(preserve) { - actual.data().validate_full().unwrap(); + actual.to_data().validate_full().unwrap(); dictionary_eq(preserve, actual, expected) } } diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index e34cc9edb884..ed27520cc61d 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -81,7 +81,8 @@ pub fn concat(arrays: &[&dyn Array]) -> Result { _ => Capacities::Array(arrays.iter().map(|a| a.len()).sum()), }; - let array_data = arrays.iter().map(|a| a.data()).collect::>(); + let array_data: Vec<_> = arrays.iter().map(|a| a.to_data()).collect::>(); + let array_data = array_data.iter().collect(); let mut mutable = MutableArrayData::with_capacities(array_data, false, capacity); for (i, a) in arrays.iter().enumerate() { @@ -131,6 +132,7 @@ pub fn concat_batches<'a>( #[cfg(test)] mod tests { use super::*; + use arrow_array::cast::AsArray; use arrow_schema::{Field, Schema}; use 
std::sync::Arc; @@ -527,7 +529,7 @@ mod tests { let arr = concat(&[&a, &b, &c]).unwrap(); // this would have been 1280 if we did not precompute the value lengths. - assert_eq!(arr.data().buffers()[1].capacity(), 960); + assert_eq!(arr.to_data().buffers()[1].capacity(), 960); } #[test] @@ -563,16 +565,20 @@ mod tests { ); // Should have reused the dictionary - assert!(array.data().child_data()[0].ptr_eq(&combined.data().child_data()[0])); - assert!(copy.data().child_data()[0].ptr_eq(&combined.data().child_data()[0])); + assert!(array + .values() + .to_data() + .ptr_eq(&combined.values().to_data())); + assert!(copy.values().to_data().ptr_eq(&combined.values().to_data())); let new: DictionaryArray = vec!["d"].into_iter().collect(); let combined = concat(&[© as _, &array as _, &new as _]).unwrap(); + let com = combined.as_dictionary::(); // Should not have reused the dictionary - assert!(!array.data().child_data()[0].ptr_eq(&combined.data().child_data()[0])); - assert!(!copy.data().child_data()[0].ptr_eq(&combined.data().child_data()[0])); - assert!(!new.data().child_data()[0].ptr_eq(&combined.data().child_data()[0])); + assert!(!array.values().to_data().ptr_eq(&com.values().to_data())); + assert!(!copy.values().to_data().ptr_eq(&com.values().to_data())); + assert!(!new.values().to_data().ptr_eq(&com.values().to_data())); } #[test] @@ -656,12 +662,12 @@ mod tests { let a = Int32Array::from_iter_values(0..100); let b = Int32Array::from_iter_values(10..20); let a = concat(&[&a, &b]).unwrap(); - let data = a.data(); + let data = a.to_data(); assert_eq!(data.buffers()[0].len(), 440); assert_eq!(data.buffers()[0].capacity(), 448); // Nearest multiple of 64 let a = concat(&[&a.slice(10, 20), &b]).unwrap(); - let data = a.data(); + let data = a.to_data(); assert_eq!(data.buffers()[0].len(), 120); assert_eq!(data.buffers()[0].capacity(), 128); // Nearest multiple of 64 @@ -669,7 +675,7 @@ mod tests { let b = StringArray::from(vec!["bingo", "bongo", "lorem", ""]); let a = concat(&[&a, &b]).unwrap(); - let data = a.data(); + let data = a.to_data(); // (100 + 4 + 1) * size_of() assert_eq!(data.buffers()[0].len(), 420); assert_eq!(data.buffers()[0].capacity(), 448); // Nearest multiple of 64 @@ -679,7 +685,7 @@ mod tests { assert_eq!(data.buffers()[1].capacity(), 320); // Nearest multiple of 64 let a = concat(&[&a.slice(10, 40), &b]).unwrap(); - let data = a.data(); + let data = a.to_data(); // (40 + 4 + 5) * size_of() assert_eq!(data.buffers()[0].len(), 180); assert_eq!(data.buffers()[0].capacity(), 192); // Nearest multiple of 64 @@ -693,7 +699,7 @@ mod tests { LargeBinaryArray::from_iter_values(std::iter::repeat(b"cupcakes").take(10)); let a = concat(&[&a, &b]).unwrap(); - let data = a.data(); + let data = a.to_data(); // (100 + 10 + 1) * size_of() assert_eq!(data.buffers()[0].len(), 888); assert_eq!(data.buffers()[0].capacity(), 896); // Nearest multiple of 64 @@ -703,7 +709,7 @@ mod tests { assert_eq!(data.buffers()[1].capacity(), 384); // Nearest multiple of 64 let a = concat(&[&a.slice(10, 40), &b]).unwrap(); - let data = a.data(); + let data = a.to_data(); // (40 + 10 + 1) * size_of() assert_eq!(data.buffers()[0].len(), 408); assert_eq!(data.buffers()[0].capacity(), 448); // Nearest multiple of 64 diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index ba8fc4a2cc1a..06f0833561d6 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -650,7 +650,7 @@ where .into_data() .into_builder() .data_type(array.data_type().clone()) - 
.child_data(array.data().child_data().to_vec()); + .child_data(vec![array.values().to_data()]); // SAFETY: // Keys were valid before, filtered subset is therefore still valid @@ -1433,7 +1433,7 @@ mod tests { builder.append::("A", 3).unwrap(); let expected = builder.build().unwrap(); - assert_eq!(filtered.data(), expected.data()); + assert_eq!(filtered.to_data(), expected.to_data()); } #[test] diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs index f274a3ebc30f..491395d1cc1a 100644 --- a/arrow-select/src/interleave.rs +++ b/arrow-select/src/interleave.rs @@ -193,7 +193,8 @@ fn interleave_fallback( values: &[&dyn Array], indices: &[(usize, usize)], ) -> Result { - let arrays: Vec<_> = values.iter().map(|x| x.data()).collect(); + let arrays: Vec<_> = values.iter().map(|x| x.to_data()).collect(); + let arrays: Vec<_> = arrays.iter().collect(); let mut array_data = MutableArrayData::new(arrays, false, indices.len()); let mut cur_array = indices[0].0; diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 01d6148132bd..3e7432530743 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -1531,9 +1531,8 @@ mod tests { macro_rules! test_take_list { ($offset_type:ty, $list_data_type:ident, $list_array_type:ident) => {{ // Construct a value array, [[0,0,0], [-1,-2,-1], [], [2,3]] - let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 3]) - .data() - .clone(); + let value_data = + Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 3]).into_data(); // Construct offsets let value_offsets: [$offset_type; 5] = [0, 3, 6, 6, 8]; let value_offsets = Buffer::from_slice_ref(&value_offsets); @@ -1570,8 +1569,7 @@ mod tests { Some(0), Some(0), ]) - .data() - .clone(); + .into_data(); // construct offsets let expected_offsets: [$offset_type; 6] = [0, 2, 2, 5, 5, 8]; let expected_offsets = Buffer::from_slice_ref(&expected_offsets); @@ -1604,8 +1602,7 @@ mod tests { Some(5), None, ]) - .data() - .clone(); + .into_data(); // Construct offsets let value_offsets: [$offset_type; 5] = [0, 3, 6, 7, 9]; let value_offsets = Buffer::from_slice_ref(&value_offsets); @@ -1644,8 +1641,7 @@ mod tests { None, Some(0), ]) - .data() - .clone(); + .into_data(); // construct offsets let expected_offsets: [$offset_type; 6] = [0, 1, 1, 4, 6, 9]; let expected_offsets = Buffer::from_slice_ref(&expected_offsets); @@ -1677,8 +1673,7 @@ mod tests { Some(5), None, ]) - .data() - .clone(); + .into_data(); // Construct offsets let value_offsets: [$offset_type; 5] = [0, 3, 6, 6, 8]; let value_offsets = Buffer::from_slice_ref(&value_offsets); @@ -1716,8 +1711,7 @@ mod tests { None, Some(0), ]) - .data() - .clone(); + .into_data(); // construct offsets let expected_offsets: [$offset_type; 6] = [0, 0, 0, 3, 5, 8]; let expected_offsets = Buffer::from_slice_ref(&expected_offsets); @@ -1852,9 +1846,7 @@ mod tests { #[should_panic(expected = "index out of bounds: the len is 4 but the index is 1000")] fn test_take_list_out_of_bounds() { // Construct a value array, [[0,0,0], [-1,-2,-1], [2,3]] - let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 3]) - .data() - .clone(); + let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 3]).into_data(); // Construct offsets let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two diff --git a/arrow-select/src/zip.rs b/arrow-select/src/zip.rs index e5d0f25e8fdb..b5df891544a8 100644 --- a/arrow-select/src/zip.rs +++ b/arrow-select/src/zip.rs @@ -42,10 +42,10 @@ pub fn zip( "all arrays 
should have the same length".into(), )); } - let falsy = falsy.data(); - let truthy = truthy.data(); + let falsy = falsy.to_data(); + let truthy = truthy.to_data(); - let mut mutable = MutableArrayData::new(vec![truthy, falsy], false, truthy.len()); + let mut mutable = MutableArrayData::new(vec![&truthy, &falsy], false, truthy.len()); // the SlicesIterator slices only the true values. So the gaps left by this iterator we need to // fill with falsy values diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index c206fffb9166..bd022532d6e1 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -243,7 +243,7 @@ mod tests { let result = $kernel(&array).unwrap(); let result = result.as_any().downcast_ref::<$result_ty>().unwrap(); let expected: $result_ty = $expected.into(); - assert_eq!(expected.data(), result.data()); + assert_eq!(&expected, result); }}; } @@ -256,7 +256,7 @@ mod tests { let result = length(&array).unwrap(); let result = result.as_any().downcast_ref::<$result_ty>().unwrap(); let expected: $result_ty = $expected.into(); - assert_eq!(expected.data(), result.data()); + assert_eq!(&expected, result); }}; } diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index 6b3067ab7d75..0e358c36a0dc 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -220,7 +220,7 @@ impl ExportedArrayStream { let mut private_data = self.get_private_data(); let reader = &mut private_data.batch_reader; - let ret_code = match reader.next() { + match reader.next() { None => { // Marks ArrowArray released to indicate reaching the end of stream. unsafe { std::ptr::write(out, FFI_ArrowArray::empty()) } @@ -229,7 +229,7 @@ impl ExportedArrayStream { Some(next_batch) => { if let Ok(batch) = next_batch { let struct_array = StructArray::from(batch); - let array = FFI_ArrowArray::new(struct_array.data()); + let array = FFI_ArrowArray::new(&struct_array.to_data()); unsafe { std::ptr::copy(addr_of!(array), out, 1) }; std::mem::forget(array); @@ -240,9 +240,7 @@ impl ExportedArrayStream { get_error_code(err) } } - }; - - ret_code + } } pub fn get_last_error(&mut self) -> &String { diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 8cc08988cbe6..081cc8063366 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -187,7 +187,7 @@ impl PyArrowConvert for RecordBatch { let columns = self.columns().iter(); for array in columns { - py_arrays.push(array.data().to_pyarrow(py)?); + py_arrays.push(array.to_data().to_pyarrow(py)?); } let py_schema = schema.to_pyarrow(py)?; diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 30a8bad60368..7cd0007cce75 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -44,8 +44,8 @@ fn create_decimal_array( #[cfg(not(feature = "force_validate"))] fn test_decimal() { let decimal_array = - create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); - let arrays = vec![Array::data(&decimal_array)]; + create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3).into_data(); + let arrays = vec![&decimal_array]; let mut a = MutableArrayData::new(arrays, true, 3); a.extend(0, 0, 3); a.extend(0, 2, 3); From 2b2ce2f93b6c1798ef04658b286353b71d17682e Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Wed, 5 Apr 2023 18:22:10 +0300 Subject: [PATCH 0791/1411] feat: add tests for ArrowNativeTypeOp (#4018) --- arrow-array/src/arithmetic.rs | 401 ++++++++++++++++++++++++++++++++++ 1 file changed, 401 insertions(+) diff --git 
a/arrow-array/src/arithmetic.rs b/arrow-array/src/arithmetic.rs index bb809507c2f1..abeb46b99688 100644 --- a/arrow-array/src/arithmetic.rs +++ b/arrow-array/src/arithmetic.rs @@ -380,3 +380,404 @@ macro_rules! native_type_float_op { native_type_float_op!(f16, f16::ZERO, f16::ONE); native_type_float_op!(f32, 0., 1.); native_type_float_op!(f64, 0., 1.); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_native_type_is_zero() { + assert!(0_i8.is_zero()); + assert!(0_i16.is_zero()); + assert!(0_i32.is_zero()); + assert!(0_i64.is_zero()); + assert!(0_i128.is_zero()); + assert!(i256::ZERO.is_zero()); + assert!(0_u8.is_zero()); + assert!(0_u16.is_zero()); + assert!(0_u32.is_zero()); + assert!(0_u64.is_zero()); + assert!(f16::ZERO.is_zero()); + assert!(0.0_f32.is_zero()); + assert!(0.0_f64.is_zero()); + } + + #[test] + fn test_native_type_comparison() { + // is_eq + assert!(8_i8.is_eq(8_i8)); + assert!(8_i16.is_eq(8_i16)); + assert!(8_i32.is_eq(8_i32)); + assert!(8_i64.is_eq(8_i64)); + assert!(8_i128.is_eq(8_i128)); + assert!(i256::from_parts(8, 0).is_eq(i256::from_parts(8, 0))); + assert!(8_u8.is_eq(8_u8)); + assert!(8_u16.is_eq(8_u16)); + assert!(8_u32.is_eq(8_u32)); + assert!(8_u64.is_eq(8_u64)); + assert!(f16::from_f32(8.0).is_eq(f16::from_f32(8.0))); + assert!(8.0_f32.is_eq(8.0_f32)); + assert!(8.0_f64.is_eq(8.0_f64)); + + // is_ne + assert!(8_i8.is_ne(1_i8)); + assert!(8_i16.is_ne(1_i16)); + assert!(8_i32.is_ne(1_i32)); + assert!(8_i64.is_ne(1_i64)); + assert!(8_i128.is_ne(1_i128)); + assert!(i256::from_parts(8, 0).is_ne(i256::from_parts(1, 0))); + assert!(8_u8.is_ne(1_u8)); + assert!(8_u16.is_ne(1_u16)); + assert!(8_u32.is_ne(1_u32)); + assert!(8_u64.is_ne(1_u64)); + assert!(f16::from_f32(8.0).is_ne(f16::from_f32(1.0))); + assert!(8.0_f32.is_ne(1.0_f32)); + assert!(8.0_f64.is_ne(1.0_f64)); + + // is_lt + assert!(8_i8.is_lt(10_i8)); + assert!(8_i16.is_lt(10_i16)); + assert!(8_i32.is_lt(10_i32)); + assert!(8_i64.is_lt(10_i64)); + assert!(8_i128.is_lt(10_i128)); + assert!(i256::from_parts(8, 0).is_lt(i256::from_parts(10, 0))); + assert!(8_u8.is_lt(10_u8)); + assert!(8_u16.is_lt(10_u16)); + assert!(8_u32.is_lt(10_u32)); + assert!(8_u64.is_lt(10_u64)); + assert!(f16::from_f32(8.0).is_lt(f16::from_f32(10.0))); + assert!(8.0_f32.is_lt(10.0_f32)); + assert!(8.0_f64.is_lt(10.0_f64)); + + // is_gt + assert!(8_i8.is_gt(1_i8)); + assert!(8_i16.is_gt(1_i16)); + assert!(8_i32.is_gt(1_i32)); + assert!(8_i64.is_gt(1_i64)); + assert!(8_i128.is_gt(1_i128)); + assert!(i256::from_parts(8, 0).is_gt(i256::from_parts(1, 0))); + assert!(8_u8.is_gt(1_u8)); + assert!(8_u16.is_gt(1_u16)); + assert!(8_u32.is_gt(1_u32)); + assert!(8_u64.is_gt(1_u64)); + assert!(f16::from_f32(8.0).is_gt(f16::from_f32(1.0))); + assert!(8.0_f32.is_gt(1.0_f32)); + assert!(8.0_f64.is_gt(1.0_f64)); + } + + #[test] + fn test_native_type_add() { + // add_wrapping + assert_eq!(8_i8.add_wrapping(2_i8), 10_i8); + assert_eq!(8_i16.add_wrapping(2_i16), 10_i16); + assert_eq!(8_i32.add_wrapping(2_i32), 10_i32); + assert_eq!(8_i64.add_wrapping(2_i64), 10_i64); + assert_eq!(8_i128.add_wrapping(2_i128), 10_i128); + assert_eq!( + i256::from_parts(8, 0).add_wrapping(i256::from_parts(2, 0)), + i256::from_parts(10, 0) + ); + assert_eq!(8_u8.add_wrapping(2_u8), 10_u8); + assert_eq!(8_u16.add_wrapping(2_u16), 10_u16); + assert_eq!(8_u32.add_wrapping(2_u32), 10_u32); + assert_eq!(8_u64.add_wrapping(2_u64), 10_u64); + assert_eq!( + f16::from_f32(8.0).add_wrapping(f16::from_f32(2.0)), + f16::from_f32(10.0) + ); + 
assert_eq!(8.0_f32.add_wrapping(2.0_f32), 10_f32); + assert_eq!(8.0_f64.add_wrapping(2.0_f64), 10_f64); + + // add_checked + assert_eq!(8_i8.add_checked(2_i8).unwrap(), 10_i8); + assert_eq!(8_i16.add_checked(2_i16).unwrap(), 10_i16); + assert_eq!(8_i32.add_checked(2_i32).unwrap(), 10_i32); + assert_eq!(8_i64.add_checked(2_i64).unwrap(), 10_i64); + assert_eq!(8_i128.add_checked(2_i128).unwrap(), 10_i128); + assert_eq!( + i256::from_parts(8, 0) + .add_checked(i256::from_parts(2, 0)) + .unwrap(), + i256::from_parts(10, 0) + ); + assert_eq!(8_u8.add_checked(2_u8).unwrap(), 10_u8); + assert_eq!(8_u16.add_checked(2_u16).unwrap(), 10_u16); + assert_eq!(8_u32.add_checked(2_u32).unwrap(), 10_u32); + assert_eq!(8_u64.add_checked(2_u64).unwrap(), 10_u64); + assert_eq!( + f16::from_f32(8.0).add_checked(f16::from_f32(2.0)).unwrap(), + f16::from_f32(10.0) + ); + assert_eq!(8.0_f32.add_checked(2.0_f32).unwrap(), 10_f32); + assert_eq!(8.0_f64.add_checked(2.0_f64).unwrap(), 10_f64); + } + + #[test] + fn test_native_type_sub() { + // sub_wrapping + assert_eq!(8_i8.sub_wrapping(2_i8), 6_i8); + assert_eq!(8_i16.sub_wrapping(2_i16), 6_i16); + assert_eq!(8_i32.sub_wrapping(2_i32), 6_i32); + assert_eq!(8_i64.sub_wrapping(2_i64), 6_i64); + assert_eq!(8_i128.sub_wrapping(2_i128), 6_i128); + assert_eq!( + i256::from_parts(8, 0).sub_wrapping(i256::from_parts(2, 0)), + i256::from_parts(6, 0) + ); + assert_eq!(8_u8.sub_wrapping(2_u8), 6_u8); + assert_eq!(8_u16.sub_wrapping(2_u16), 6_u16); + assert_eq!(8_u32.sub_wrapping(2_u32), 6_u32); + assert_eq!(8_u64.sub_wrapping(2_u64), 6_u64); + assert_eq!( + f16::from_f32(8.0).sub_wrapping(f16::from_f32(2.0)), + f16::from_f32(6.0) + ); + assert_eq!(8.0_f32.sub_wrapping(2.0_f32), 6_f32); + assert_eq!(8.0_f64.sub_wrapping(2.0_f64), 6_f64); + + // sub_checked + assert_eq!(8_i8.sub_checked(2_i8).unwrap(), 6_i8); + assert_eq!(8_i16.sub_checked(2_i16).unwrap(), 6_i16); + assert_eq!(8_i32.sub_checked(2_i32).unwrap(), 6_i32); + assert_eq!(8_i64.sub_checked(2_i64).unwrap(), 6_i64); + assert_eq!(8_i128.sub_checked(2_i128).unwrap(), 6_i128); + assert_eq!( + i256::from_parts(8, 0) + .sub_checked(i256::from_parts(2, 0)) + .unwrap(), + i256::from_parts(6, 0) + ); + assert_eq!(8_u8.sub_checked(2_u8).unwrap(), 6_u8); + assert_eq!(8_u16.sub_checked(2_u16).unwrap(), 6_u16); + assert_eq!(8_u32.sub_checked(2_u32).unwrap(), 6_u32); + assert_eq!(8_u64.sub_checked(2_u64).unwrap(), 6_u64); + assert_eq!( + f16::from_f32(8.0).sub_checked(f16::from_f32(2.0)).unwrap(), + f16::from_f32(6.0) + ); + assert_eq!(8.0_f32.sub_checked(2.0_f32).unwrap(), 6_f32); + assert_eq!(8.0_f64.sub_checked(2.0_f64).unwrap(), 6_f64); + } + + #[test] + fn test_native_type_mul() { + // mul_wrapping + assert_eq!(8_i8.mul_wrapping(2_i8), 16_i8); + assert_eq!(8_i16.mul_wrapping(2_i16), 16_i16); + assert_eq!(8_i32.mul_wrapping(2_i32), 16_i32); + assert_eq!(8_i64.mul_wrapping(2_i64), 16_i64); + assert_eq!(8_i128.mul_wrapping(2_i128), 16_i128); + assert_eq!( + i256::from_parts(8, 0).mul_wrapping(i256::from_parts(2, 0)), + i256::from_parts(16, 0) + ); + assert_eq!(8_u8.mul_wrapping(2_u8), 16_u8); + assert_eq!(8_u16.mul_wrapping(2_u16), 16_u16); + assert_eq!(8_u32.mul_wrapping(2_u32), 16_u32); + assert_eq!(8_u64.mul_wrapping(2_u64), 16_u64); + assert_eq!( + f16::from_f32(8.0).mul_wrapping(f16::from_f32(2.0)), + f16::from_f32(16.0) + ); + assert_eq!(8.0_f32.mul_wrapping(2.0_f32), 16_f32); + assert_eq!(8.0_f64.mul_wrapping(2.0_f64), 16_f64); + + // mul_checked + assert_eq!(8_i8.mul_checked(2_i8).unwrap(), 16_i8); + 
assert_eq!(8_i16.mul_checked(2_i16).unwrap(), 16_i16); + assert_eq!(8_i32.mul_checked(2_i32).unwrap(), 16_i32); + assert_eq!(8_i64.mul_checked(2_i64).unwrap(), 16_i64); + assert_eq!(8_i128.mul_checked(2_i128).unwrap(), 16_i128); + assert_eq!( + i256::from_parts(8, 0) + .mul_checked(i256::from_parts(2, 0)) + .unwrap(), + i256::from_parts(16, 0) + ); + assert_eq!(8_u8.mul_checked(2_u8).unwrap(), 16_u8); + assert_eq!(8_u16.mul_checked(2_u16).unwrap(), 16_u16); + assert_eq!(8_u32.mul_checked(2_u32).unwrap(), 16_u32); + assert_eq!(8_u64.mul_checked(2_u64).unwrap(), 16_u64); + assert_eq!( + f16::from_f32(8.0).mul_checked(f16::from_f32(2.0)).unwrap(), + f16::from_f32(16.0) + ); + assert_eq!(8.0_f32.mul_checked(2.0_f32).unwrap(), 16_f32); + assert_eq!(8.0_f64.mul_checked(2.0_f64).unwrap(), 16_f64); + } + + #[test] + fn test_native_type_div() { + // div_wrapping + assert_eq!(8_i8.div_wrapping(2_i8), 4_i8); + assert_eq!(8_i16.div_wrapping(2_i16), 4_i16); + assert_eq!(8_i32.div_wrapping(2_i32), 4_i32); + assert_eq!(8_i64.div_wrapping(2_i64), 4_i64); + assert_eq!(8_i128.div_wrapping(2_i128), 4_i128); + assert_eq!( + i256::from_parts(8, 0).div_wrapping(i256::from_parts(2, 0)), + i256::from_parts(4, 0) + ); + assert_eq!(8_u8.div_wrapping(2_u8), 4_u8); + assert_eq!(8_u16.div_wrapping(2_u16), 4_u16); + assert_eq!(8_u32.div_wrapping(2_u32), 4_u32); + assert_eq!(8_u64.div_wrapping(2_u64), 4_u64); + assert_eq!( + f16::from_f32(8.0).div_wrapping(f16::from_f32(2.0)), + f16::from_f32(4.0) + ); + assert_eq!(8.0_f32.div_wrapping(2.0_f32), 4_f32); + assert_eq!(8.0_f64.div_wrapping(2.0_f64), 4_f64); + + // div_checked + assert_eq!(8_i8.div_checked(2_i8).unwrap(), 4_i8); + assert_eq!(8_i16.div_checked(2_i16).unwrap(), 4_i16); + assert_eq!(8_i32.div_checked(2_i32).unwrap(), 4_i32); + assert_eq!(8_i64.div_checked(2_i64).unwrap(), 4_i64); + assert_eq!(8_i128.div_checked(2_i128).unwrap(), 4_i128); + assert_eq!( + i256::from_parts(8, 0) + .div_checked(i256::from_parts(2, 0)) + .unwrap(), + i256::from_parts(4, 0) + ); + assert_eq!(8_u8.div_checked(2_u8).unwrap(), 4_u8); + assert_eq!(8_u16.div_checked(2_u16).unwrap(), 4_u16); + assert_eq!(8_u32.div_checked(2_u32).unwrap(), 4_u32); + assert_eq!(8_u64.div_checked(2_u64).unwrap(), 4_u64); + assert_eq!( + f16::from_f32(8.0).div_checked(f16::from_f32(2.0)).unwrap(), + f16::from_f32(4.0) + ); + assert_eq!(8.0_f32.div_checked(2.0_f32).unwrap(), 4_f32); + assert_eq!(8.0_f64.div_checked(2.0_f64).unwrap(), 4_f64); + } + + #[test] + fn test_native_type_mod() { + // mod_wrapping + assert_eq!(9_i8.mod_wrapping(2_i8), 1_i8); + assert_eq!(9_i16.mod_wrapping(2_i16), 1_i16); + assert_eq!(9_i32.mod_wrapping(2_i32), 1_i32); + assert_eq!(9_i64.mod_wrapping(2_i64), 1_i64); + assert_eq!(9_i128.mod_wrapping(2_i128), 1_i128); + assert_eq!( + i256::from_parts(9, 0).mod_wrapping(i256::from_parts(2, 0)), + i256::from_parts(1, 0) + ); + assert_eq!(9_u8.mod_wrapping(2_u8), 1_u8); + assert_eq!(9_u16.mod_wrapping(2_u16), 1_u16); + assert_eq!(9_u32.mod_wrapping(2_u32), 1_u32); + assert_eq!(9_u64.mod_wrapping(2_u64), 1_u64); + assert_eq!( + f16::from_f32(9.0).mod_wrapping(f16::from_f32(2.0)), + f16::from_f32(1.0) + ); + assert_eq!(9.0_f32.mod_wrapping(2.0_f32), 1_f32); + assert_eq!(9.0_f64.mod_wrapping(2.0_f64), 1_f64); + + // mod_checked + assert_eq!(9_i8.mod_checked(2_i8).unwrap(), 1_i8); + assert_eq!(9_i16.mod_checked(2_i16).unwrap(), 1_i16); + assert_eq!(9_i32.mod_checked(2_i32).unwrap(), 1_i32); + assert_eq!(9_i64.mod_checked(2_i64).unwrap(), 1_i64); + assert_eq!(9_i128.mod_checked(2_i128).unwrap(), 
1_i128); + assert_eq!( + i256::from_parts(9, 0) + .mod_checked(i256::from_parts(2, 0)) + .unwrap(), + i256::from_parts(1, 0) + ); + assert_eq!(9_u8.mod_checked(2_u8).unwrap(), 1_u8); + assert_eq!(9_u16.mod_checked(2_u16).unwrap(), 1_u16); + assert_eq!(9_u32.mod_checked(2_u32).unwrap(), 1_u32); + assert_eq!(9_u64.mod_checked(2_u64).unwrap(), 1_u64); + assert_eq!( + f16::from_f32(9.0).mod_checked(f16::from_f32(2.0)).unwrap(), + f16::from_f32(1.0) + ); + assert_eq!(9.0_f32.mod_checked(2.0_f32).unwrap(), 1_f32); + assert_eq!(9.0_f64.mod_checked(2.0_f64).unwrap(), 1_f64); + } + + #[test] + fn test_native_type_neg() { + // neg_wrapping + assert_eq!(8_i8.neg_wrapping(), -8_i8); + assert_eq!(8_i16.neg_wrapping(), -8_i16); + assert_eq!(8_i32.neg_wrapping(), -8_i32); + assert_eq!(8_i64.neg_wrapping(), -8_i64); + assert_eq!(8_i128.neg_wrapping(), -8_i128); + assert_eq!(i256::from_parts(8, 0).neg_wrapping(), i256::from_i128(-8)); + assert_eq!(8_u8.neg_wrapping(), u8::MAX - 7_u8); + assert_eq!(8_u16.neg_wrapping(), u16::MAX - 7_u16); + assert_eq!(8_u32.neg_wrapping(), u32::MAX - 7_u32); + assert_eq!(8_u64.neg_wrapping(), u64::MAX - 7_u64); + assert_eq!(f16::from_f32(8.0).neg_wrapping(), f16::from_f32(-8.0)); + assert_eq!(8.0_f32.neg_wrapping(), -8_f32); + assert_eq!(8.0_f64.neg_wrapping(), -8_f64); + + // neg_checked + assert_eq!(8_i8.neg_checked().unwrap(), -8_i8); + assert_eq!(8_i16.neg_checked().unwrap(), -8_i16); + assert_eq!(8_i32.neg_checked().unwrap(), -8_i32); + assert_eq!(8_i64.neg_checked().unwrap(), -8_i64); + assert_eq!(8_i128.neg_checked().unwrap(), -8_i128); + assert_eq!( + i256::from_parts(8, 0).neg_checked().unwrap(), + i256::from_i128(-8) + ); + assert!(8_u8.neg_checked().is_err()); + assert!(8_u16.neg_checked().is_err()); + assert!(8_u32.neg_checked().is_err()); + assert!(8_u64.neg_checked().is_err()); + assert_eq!( + f16::from_f32(8.0).neg_checked().unwrap(), + f16::from_f32(-8.0) + ); + assert_eq!(8.0_f32.neg_checked().unwrap(), -8_f32); + assert_eq!(8.0_f64.neg_checked().unwrap(), -8_f64); + } + + #[test] + fn test_native_type_pow() { + // pow_wrapping + assert_eq!(8_i8.pow_wrapping(2_u32), 64_i8); + assert_eq!(8_i16.pow_wrapping(2_u32), 64_i16); + assert_eq!(8_i32.pow_wrapping(2_u32), 64_i32); + assert_eq!(8_i64.pow_wrapping(2_u32), 64_i64); + assert_eq!(8_i128.pow_wrapping(2_u32), 64_i128); + assert_eq!( + i256::from_parts(8, 0).pow_wrapping(2_u32), + i256::from_parts(64, 0) + ); + assert_eq!(8_u8.pow_wrapping(2_u32), 64_u8); + assert_eq!(8_u16.pow_wrapping(2_u32), 64_u16); + assert_eq!(8_u32.pow_wrapping(2_u32), 64_u32); + assert_eq!(8_u64.pow_wrapping(2_u32), 64_u64); + assert_eq!(f16::from_f32(8.0).pow_wrapping(2_u32), f16::from_f32(64.0)); + assert_eq!(8.0_f32.pow_wrapping(2_u32), 64_f32); + assert_eq!(8.0_f64.pow_wrapping(2_u32), 64_f64); + + // pow_checked + assert_eq!(8_i8.pow_checked(2_u32).unwrap(), 64_i8); + assert_eq!(8_i16.pow_checked(2_u32).unwrap(), 64_i16); + assert_eq!(8_i32.pow_checked(2_u32).unwrap(), 64_i32); + assert_eq!(8_i64.pow_checked(2_u32).unwrap(), 64_i64); + assert_eq!(8_i128.pow_checked(2_u32).unwrap(), 64_i128); + assert_eq!( + i256::from_parts(8, 0).pow_checked(2_u32).unwrap(), + i256::from_parts(64, 0) + ); + assert_eq!(8_u8.pow_checked(2_u32).unwrap(), 64_u8); + assert_eq!(8_u16.pow_checked(2_u32).unwrap(), 64_u16); + assert_eq!(8_u32.pow_checked(2_u32).unwrap(), 64_u32); + assert_eq!(8_u64.pow_checked(2_u32).unwrap(), 64_u64); + assert_eq!( + f16::from_f32(8.0).pow_checked(2_u32).unwrap(), + f16::from_f32(64.0) + ); + 
assert_eq!(8.0_f32.pow_checked(2_u32).unwrap(), 64_f32); + assert_eq!(8.0_f64.pow_checked(2_u32).unwrap(), 64_f64); + } +} From 2b354a3e8d7b57f2ad5eb12aeb283cc15bc9e170 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 5 Apr 2023 18:35:15 +0100 Subject: [PATCH 0792/1411] Support Rust structures --> `RecordBatch` by adding `Serde` support to `RawDecoder` (#3949) (#3979) * Add serde support to RawDecoder (#3949) * Clippy * More examples * Use BTreeMap for deterministic test output * Use new Field constructors * Review feedback --- arrow-json/Cargo.toml | 2 + arrow-json/src/raw/mod.rs | 181 +++++++++++++ arrow-json/src/raw/serializer.rs | 422 +++++++++++++++++++++++++++++++ arrow-json/src/raw/tape.rs | 23 ++ arrow/Cargo.toml | 1 + arrow/src/lib.rs | 46 ++++ 6 files changed, 675 insertions(+) create mode 100644 arrow-json/src/raw/serializer.rs diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 34bd447da183..453e4aa35182 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -42,6 +42,7 @@ arrow-schema = { workspace = true } half = { version = "2.1", default-features = false } indexmap = { version = "1.9", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } +serde = { version = "1.0", default-features = false } serde_json = { version = "1.0", default-features = false, features = ["std"] } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } lexical-core = { version = "0.8", default-features = false } @@ -49,3 +50,4 @@ lexical-core = { version = "0.8", default-features = false } [dev-dependencies] tempfile = "3.3" flate2 = { version = "1", default-features = false, features = ["rust_backend"] } +serde = { version = "1.0", default-features = false, features = ["derive"] } diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index 1bae8ac529e7..f1f1ffb779d0 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -29,11 +29,13 @@ use crate::raw::struct_array::StructArrayDecoder; use crate::raw::tape::{Tape, TapeDecoder, TapeElement}; use crate::raw::timestamp_array::TimestampArrayDecoder; use arrow_array::timezone::Tz; +use arrow_array::types::Float32Type; use arrow_array::types::*; use arrow_array::{downcast_integer, make_array, RecordBatch, RecordBatchReader}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, SchemaRef, TimeUnit}; use chrono::Utc; +use serde::Serialize; use std::io::BufRead; mod boolean_array; @@ -41,6 +43,7 @@ mod decimal_array; mod list_array; mod map_array; mod primitive_array; +mod serializer; mod string_array; mod struct_array; mod tape; @@ -233,6 +236,184 @@ impl RawDecoder { self.tape_decoder.decode(buf) } + /// Serialize `rows` to this [`RawDecoder`] + /// + /// This provides a simple way to convert [serde]-compatible datastructures into arrow + /// [`RecordBatch`]. 
+ /// + /// Custom conversion logic as described in [arrow_array::builder] will likely outperform this, + /// especially where the schema is known at compile-time, however, this provides a mechanism + /// to get something up and running quickly + /// + /// It can be used with [`serde_json::Value`] + /// + /// ``` + /// # use std::sync::Arc; + /// # use serde_json::{Value, json}; + /// # use arrow_array::cast::AsArray; + /// # use arrow_array::types::Float32Type; + /// # use arrow_json::RawReaderBuilder; + /// # use arrow_schema::{DataType, Field, Schema}; + /// let json = vec![json!({"float": 2.3}), json!({"float": 5.7})]; + /// + /// let schema = Schema::new(vec![Field::new("float", DataType::Float32, true)]); + /// let mut decoder = RawReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap(); + /// + /// decoder.serialize(&json).unwrap(); + /// let batch = decoder.flush().unwrap().unwrap(); + /// assert_eq!(batch.num_rows(), 2); + /// assert_eq!(batch.num_columns(), 1); + /// let values = batch.column(0).as_primitive::().values(); + /// assert_eq!(values, &[2.3, 5.7]) + /// ``` + /// + /// Or with arbitrary [`Serialize`] types + /// + /// ``` + /// # use std::sync::Arc; + /// # use arrow_json::RawReaderBuilder; + /// # use arrow_schema::{DataType, Field, Schema}; + /// # use serde::Serialize; + /// # use arrow_array::cast::AsArray; + /// # use arrow_array::types::{Float32Type, Int32Type}; + /// # + /// #[derive(Serialize)] + /// struct MyStruct { + /// int32: i32, + /// float: f32, + /// } + /// + /// let schema = Schema::new(vec![ + /// Field::new("int32", DataType::Int32, false), + /// Field::new("float", DataType::Float32, false), + /// ]); + /// + /// let rows = vec![ + /// MyStruct{ int32: 0, float: 3. }, + /// MyStruct{ int32: 4, float: 67.53 }, + /// ]; + /// + /// let mut decoder = RawReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap(); + /// decoder.serialize(&rows).unwrap(); + /// + /// let batch = decoder.flush().unwrap().unwrap(); + /// + /// // Expect batch containing two columns + /// let int32 = batch.column(0).as_primitive::(); + /// assert_eq!(int32.values(), &[0, 4]); + /// + /// let float = batch.column(1).as_primitive::(); + /// assert_eq!(float.values(), &[3., 67.53]); + /// ``` + /// + /// Or even complex nested types + /// + /// ``` + /// # use std::collections::BTreeMap; + /// # use std::sync::Arc; + /// # use arrow_array::StructArray; + /// # use arrow_cast::display::{ArrayFormatter, FormatOptions}; + /// # use arrow_json::RawReaderBuilder; + /// # use arrow_schema::{DataType, Field, Fields, Schema}; + /// # use serde::Serialize; + /// # + /// #[derive(Serialize)] + /// struct MyStruct { + /// int32: i32, + /// list: Vec, + /// nested: Vec>, + /// } + /// + /// impl MyStruct { + /// /// Returns the [`Fields`] for [`MyStruct`] + /// fn fields() -> Fields { + /// let nested = DataType::Struct(Nested::fields()); + /// Fields::from([ + /// Arc::new(Field::new("int32", DataType::Int32, false)), + /// Arc::new(Field::new_list( + /// "list", + /// Field::new("element", DataType::Float64, false), + /// false, + /// )), + /// Arc::new(Field::new_list( + /// "nested", + /// Field::new("element", nested, true), + /// true, + /// )), + /// ]) + /// } + /// } + /// + /// #[derive(Serialize)] + /// struct Nested { + /// map: BTreeMap> + /// } + /// + /// impl Nested { + /// /// Returns the [`Fields`] for [`Nested`] + /// fn fields() -> Fields { + /// let element = Field::new("element", DataType::Utf8, false); + /// Fields::from([ + /// Arc::new(Field::new_map( + 
/// "map", + /// "entries", + /// Field::new("key", DataType::Utf8, false), + /// Field::new_list("value", element, false), + /// false, // sorted + /// false, // nullable + /// )) + /// ]) + /// } + /// } + /// + /// let data = vec![ + /// MyStruct { + /// int32: 34, + /// list: vec![1., 2., 34.], + /// nested: vec![ + /// None, + /// Some(Nested { + /// map: vec![ + /// ("key1".to_string(), vec!["foo".to_string(), "bar".to_string()]), + /// ("key2".to_string(), vec!["baz".to_string()]) + /// ].into_iter().collect() + /// }) + /// ] + /// }, + /// MyStruct { + /// int32: 56, + /// list: vec![], + /// nested: vec![] + /// }, + /// MyStruct { + /// int32: 24, + /// list: vec![-1., 245.], + /// nested: vec![None] + /// } + /// ]; + /// + /// let schema = Schema::new(MyStruct::fields()); + /// let mut decoder = RawReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap(); + /// decoder.serialize(&data).unwrap(); + /// let batch = decoder.flush().unwrap().unwrap(); + /// assert_eq!(batch.num_rows(), 3); + /// assert_eq!(batch.num_columns(), 3); + /// + /// // Convert to StructArray to format + /// let s = StructArray::from(batch); + /// let options = FormatOptions::default().with_null("null"); + /// let formatter = ArrayFormatter::try_new(&s, &options).unwrap(); + /// + /// assert_eq!(&formatter.value(0).to_string(), "{int32: 34, list: [1.0, 2.0, 34.0], nested: [null, {map: {key1: [foo, bar], key2: [baz]}}]}"); + /// assert_eq!(&formatter.value(1).to_string(), "{int32: 56, list: [], nested: []}"); + /// assert_eq!(&formatter.value(2).to_string(), "{int32: 24, list: [-1.0, 245.0], nested: [null]}"); + /// ``` + /// + /// Note: this ignores any batch size setting, and always decodes all rows + pub fn serialize(&mut self, rows: &[S]) -> Result<(), ArrowError> { + self.tape_decoder.serialize(rows) + } + /// Flushes the currently buffered data to a [`RecordBatch`] /// /// Returns `Ok(None)` if no buffered data diff --git a/arrow-json/src/raw/serializer.rs b/arrow-json/src/raw/serializer.rs new file mode 100644 index 000000000000..d743b6dba126 --- /dev/null +++ b/arrow-json/src/raw/serializer.rs @@ -0,0 +1,422 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::raw::tape::TapeElement; +use lexical_core::FormattedSize; +use serde::ser::{ + Impossible, SerializeMap, SerializeSeq, SerializeStruct, SerializeTuple, + SerializeTupleStruct, +}; +use serde::{Serialize, Serializer}; + +#[derive(Debug)] +pub struct SerializerError(String); + +impl std::error::Error for SerializerError {} + +impl std::fmt::Display for SerializerError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl serde::ser::Error for SerializerError { + fn custom(msg: T) -> Self + where + T: std::fmt::Display, + { + Self(msg.to_string()) + } +} + +/// [`Serializer`] for [`TapeElement`] +/// +/// Heavily based on +pub struct TapeSerializer<'a> { + elements: &'a mut Vec, + + /// A buffer of parsed string data + bytes: &'a mut Vec, + + /// Offsets into `data` + offsets: &'a mut Vec, +} + +impl<'a> TapeSerializer<'a> { + pub fn new( + elements: &'a mut Vec, + bytes: &'a mut Vec, + offsets: &'a mut Vec, + ) -> Self { + Self { + elements, + bytes, + offsets, + } + } +} + +/// The tape stores all values as strings, and so must serialize numeric types +/// +/// Formatting to a string only to parse it back again is rather wasteful, +/// it may be possible to tweak the tape representation to avoid this +/// +/// Need to use macro as const generic expressions are unstable +/// +macro_rules! serialize_numeric { + ($s:ident, $t:ty, $v:ident) => {{ + let mut buffer = [0_u8; <$t>::FORMATTED_SIZE]; + let s = lexical_core::write($v, &mut buffer); + $s.serialize_bytes(s) + }}; +} + +impl<'a, 'b> Serializer for &'a mut TapeSerializer<'b> { + type Ok = (); + + type Error = SerializerError; + + type SerializeSeq = ListSerializer<'a, 'b>; + type SerializeTuple = ListSerializer<'a, 'b>; + type SerializeTupleStruct = ListSerializer<'a, 'b>; + type SerializeTupleVariant = Impossible<(), SerializerError>; + type SerializeMap = ObjectSerializer<'a, 'b>; + type SerializeStruct = ObjectSerializer<'a, 'b>; + type SerializeStructVariant = Impossible<(), SerializerError>; + + fn serialize_bool(self, v: bool) -> Result<(), SerializerError> { + self.elements.push(match v { + true => TapeElement::True, + false => TapeElement::False, + }); + Ok(()) + } + + fn serialize_i8(self, v: i8) -> Result<(), SerializerError> { + serialize_numeric!(self, i8, v) + } + + fn serialize_i16(self, v: i16) -> Result<(), SerializerError> { + serialize_numeric!(self, i16, v) + } + + fn serialize_i32(self, v: i32) -> Result<(), SerializerError> { + serialize_numeric!(self, i32, v) + } + + fn serialize_i64(self, v: i64) -> Result<(), SerializerError> { + serialize_numeric!(self, i64, v) + } + + fn serialize_u8(self, v: u8) -> Result<(), SerializerError> { + serialize_numeric!(self, u8, v) + } + + fn serialize_u16(self, v: u16) -> Result<(), SerializerError> { + serialize_numeric!(self, u16, v) + } + + fn serialize_u32(self, v: u32) -> Result<(), SerializerError> { + serialize_numeric!(self, u32, v) + } + + fn serialize_u64(self, v: u64) -> Result<(), SerializerError> { + serialize_numeric!(self, u64, v) + } + + fn serialize_f32(self, v: f32) -> Result<(), SerializerError> { + serialize_numeric!(self, f32, v) + } + + fn serialize_f64(self, v: f64) -> Result<(), SerializerError> { + serialize_numeric!(self, f64, v) + } + + fn serialize_char(self, v: char) -> Result<(), SerializerError> { + self.serialize_str(&v.to_string()) + } + + fn serialize_str(self, v: &str) -> Result<(), SerializerError> { + self.serialize_bytes(v.as_bytes()) + } + + fn serialize_bytes(self, v: 
&[u8]) -> Result<(), SerializerError> { + self.bytes.extend_from_slice(v); + let idx = self.offsets.len() - 1; + self.elements.push(TapeElement::String(idx as _)); + self.offsets.push(self.bytes.len()); + Ok(()) + } + + fn serialize_none(self) -> Result<(), SerializerError> { + self.serialize_unit() + } + + fn serialize_some(self, value: &T) -> Result<(), SerializerError> + where + T: ?Sized + Serialize, + { + value.serialize(self) + } + + fn serialize_unit(self) -> Result<(), SerializerError> { + self.elements.push(TapeElement::Null); + Ok(()) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result<(), SerializerError> { + self.serialize_unit() + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + variant: &'static str, + ) -> Result<(), SerializerError> { + self.serialize_str(variant) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T, + ) -> Result<(), SerializerError> + where + T: ?Sized + Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + variant: &'static str, + value: &T, + ) -> Result<(), SerializerError> + where + T: ?Sized + Serialize, + { + let mut serializer = self.serialize_map(Some(1))?; + serializer.serialize_key(variant)?; + serializer.serialize_value(value)?; + serializer.finish(); + Ok(()) + } + + fn serialize_seq( + self, + _len: Option, + ) -> Result { + Ok(ListSerializer::new(self)) + } + + fn serialize_tuple( + self, + len: usize, + ) -> Result { + self.serialize_seq(Some(len)) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + len: usize, + ) -> Result { + self.serialize_seq(Some(len)) + } + + fn serialize_tuple_variant( + self, + name: &'static str, + _variant_index: u32, + variant: &'static str, + _len: usize, + ) -> Result { + Err(SerializerError(format!( + "serializing tuple variants is not currently supported: {name}::{variant}" + ))) + } + + // Maps are represented in JSON as `{ K: V, K: V, ... }`. 
+ fn serialize_map( + self, + _len: Option, + ) -> Result { + Ok(ObjectSerializer::new(self)) + } + + fn serialize_struct( + self, + _name: &'static str, + len: usize, + ) -> Result { + self.serialize_map(Some(len)) + } + + fn serialize_struct_variant( + self, + name: &'static str, + _variant_index: u32, + variant: &'static str, + _len: usize, + ) -> Result { + Err(SerializerError(format!( + "serializing struct variants is not currently supported: {name}::{variant}" + ))) + } +} + +pub struct ObjectSerializer<'a, 'b> { + serializer: &'a mut TapeSerializer<'b>, + start: usize, +} + +impl<'a, 'b> ObjectSerializer<'a, 'b> { + fn new(serializer: &'a mut TapeSerializer<'b>) -> Self { + let start = serializer.elements.len(); + serializer.elements.push(TapeElement::StartObject(0)); + Self { serializer, start } + } + + fn finish(self) { + let end = self.serializer.elements.len() as _; + self.serializer.elements[self.start] = TapeElement::StartObject(end); + + let end = TapeElement::EndObject(self.start as _); + self.serializer.elements.push(end); + } +} + +impl<'a, 'b> SerializeMap for ObjectSerializer<'a, 'b> { + type Ok = (); + type Error = SerializerError; + + fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> + where + T: Serialize, + { + key.serialize(&mut *self.serializer) + } + + fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> + where + T: Serialize, + { + value.serialize(&mut *self.serializer) + } + + fn end(self) -> Result<(), Self::Error> { + self.finish(); + Ok(()) + } +} + +impl<'a, 'b> SerializeStruct for ObjectSerializer<'a, 'b> { + type Ok = (); + type Error = SerializerError; + + fn serialize_field( + &mut self, + key: &'static str, + value: &T, + ) -> Result<(), Self::Error> + where + T: Serialize, + { + key.serialize(&mut *self.serializer)?; + value.serialize(&mut *self.serializer) + } + + fn end(self) -> Result<(), Self::Error> { + self.finish(); + Ok(()) + } +} + +pub struct ListSerializer<'a, 'b> { + serializer: &'a mut TapeSerializer<'b>, + start: usize, +} + +impl<'a, 'b> ListSerializer<'a, 'b> { + fn new(serializer: &'a mut TapeSerializer<'b>) -> Self { + let start = serializer.elements.len(); + serializer.elements.push(TapeElement::StartList(0)); + Self { serializer, start } + } + + fn finish(self) { + let end = self.serializer.elements.len() as _; + self.serializer.elements[self.start] = TapeElement::StartList(end); + + let end = TapeElement::EndList(self.start as _); + self.serializer.elements.push(end); + } +} + +impl<'a, 'b> SerializeSeq for ListSerializer<'a, 'b> { + type Ok = (); + type Error = SerializerError; + + fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> + where + T: Serialize, + { + value.serialize(&mut *self.serializer) + } + + fn end(self) -> Result<(), Self::Error> { + self.finish(); + Ok(()) + } +} + +impl<'a, 'b> SerializeTuple for ListSerializer<'a, 'b> { + type Ok = (); + type Error = SerializerError; + + fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> + where + T: Serialize, + { + value.serialize(&mut *self.serializer) + } + + fn end(self) -> Result<(), Self::Error> { + self.finish(); + Ok(()) + } +} + +impl<'a, 'b> SerializeTupleStruct for ListSerializer<'a, 'b> { + type Ok = (); + type Error = SerializerError; + + fn serialize_field(&mut self, value: &T) -> Result<(), Self::Error> + where + T: Serialize, + { + value.serialize(&mut *self.serializer) + } + + fn end(self) -> Result<(), Self::Error> { + self.finish(); + Ok(()) + } +} diff --git a/arrow-json/src/raw/tape.rs 
b/arrow-json/src/raw/tape.rs index 3f4a317c8700..2720c2502585 100644 --- a/arrow-json/src/raw/tape.rs +++ b/arrow-json/src/raw/tape.rs @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. +use crate::raw::serializer::TapeSerializer; use arrow_schema::ArrowError; +use serde::Serialize; use std::fmt::{Display, Formatter}; /// We decode JSON to a flattened tape representation, @@ -452,6 +454,27 @@ impl TapeDecoder { Ok(buf.len() - iter.len()) } + /// Writes any type that implements [`Serialize`] into this [`TapeDecoder`] + pub fn serialize(&mut self, rows: &[S]) -> Result<(), ArrowError> { + if let Some(b) = self.stack.last() { + return Err(ArrowError::JsonError(format!( + "Cannot serialize to tape containing partial decode state {}", + b.as_str() + ))); + } + + let mut serializer = + TapeSerializer::new(&mut self.elements, &mut self.bytes, &mut self.offsets); + + rows.iter() + .try_for_each(|row| row.serialize(&mut serializer)) + .map_err(|e| ArrowError::JsonError(e.to_string()))?; + + self.num_rows += rows.len(); + + Ok(()) + } + /// Finishes the current [`Tape`] pub fn finish(&self) -> Result, ArrowError> { if let Some(b) = self.stack.last() { diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 2c9bf64eccf1..58fe54fd1f29 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -99,6 +99,7 @@ criterion = { version = "0.4", default-features = false } half = { version = "2.1", default-features = false } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } tempfile = { version = "3", default-features = false } +serde = { version = "1.0", default-features = false, features = ["derive"] } [build-dependencies] diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 40b09a976178..41b846b0475e 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -271,6 +271,52 @@ //! //! Parquet is published as a [separate crate](https://crates.io/crates/parquet) //! +//! # Serde Compatibility +//! +//! [`arrow_json::RawDecoder`] provides a mechanism to convert arbitrary, serde-compatible +//! structures into [`RecordBatch`]. +//! +//! Whilst likely less performant than implementing a custom builder, as described in +//! [arrow_array::builder], this provides a simple mechanism to get up and running quickly +//! +//! ``` +//! # use std::sync::Arc; +//! # use arrow_json::RawReaderBuilder; +//! # use arrow_schema::{DataType, Field, Schema}; +//! # use serde::Serialize; +//! # use arrow_array::cast::AsArray; +//! # use arrow_array::types::{Float32Type, Int32Type}; +//! # +//! #[derive(Serialize)] +//! struct MyStruct { +//! int32: i32, +//! string: String, +//! } +//! +//! let schema = Schema::new(vec![ +//! Field::new("int32", DataType::Int32, false), +//! Field::new("string", DataType::Utf8, false), +//! ]); +//! +//! let rows = vec![ +//! MyStruct{ int32: 5, string: "bar".to_string() }, +//! MyStruct{ int32: 8, string: "foo".to_string() }, +//! ]; +//! +//! let mut decoder = RawReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap(); +//! decoder.serialize(&rows).unwrap(); +//! +//! let batch = decoder.flush().unwrap().unwrap(); +//! +//! // Expect batch containing two columns +//! let int32 = batch.column(0).as_primitive::(); +//! assert_eq!(int32.values(), &[5, 8]); +//! +//! let string = batch.column(1).as_string::(); +//! assert_eq!(string.value(0), "bar"); +//! assert_eq!(string.value(1), "foo"); +//! ``` +//! //! # Memory and Buffers //! //! 
Advanced users may wish to interact with the underlying buffers of an [`Array`], for example, From c26bb810700149e912b020048ed378e09e5267df Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 6 Apr 2023 20:10:32 +0100 Subject: [PATCH 0793/1411] Add Fields::contains (#4026) * Add Fields::contains * Fix typo --- arrow-schema/src/field.rs | 15 +++------------ arrow-schema/src/fields.rs | 17 +++++++++++++++++ arrow-schema/src/schema.rs | 20 ++++++++------------ 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 1af157e4d212..5edd5be7a8e5 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -522,18 +522,9 @@ impl Field { // self need to be nullable or both of them are not nullable && (self.nullable || !other.nullable) // make sure self.metadata is a superset of other.metadata - && match (&self.metadata.is_empty(), &other.metadata.is_empty()) { - (_, true) => true, - (true, false) => false, - (false, false) => { - other.metadata().iter().all(|(k, v)| { - match self.metadata().get(k) { - Some(s) => s == v, - None => false - } - }) - } - } + && other.metadata.iter().all(|(k, v1)| { + self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default() + }) } /// Return size of this instance in bytes. diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 07dff2aae6bd..b93735328ac6 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -66,6 +66,23 @@ impl Fields { pub fn find(&self, name: &str) -> Option<(usize, &FieldRef)> { self.0.iter().enumerate().find(|(_, b)| b.name() == name) } + + /// Check to see if `self` is a superset of `other` + /// + /// In particular returns true if both have the same number of fields, and [`Field::contains`] + /// for each field across self and other + /// + /// In other words, any record that conforms to `other` should also conform to `self` + pub fn contains(&self, other: &Fields) -> bool { + if Arc::ptr_eq(&self.0, &other.0) { + return true; + } + self.len() == other.len() + && self + .iter() + .zip(other.iter()) + .all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b)) + } } impl Default for Fields { diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 2cc892f5a8c2..a00e8a588757 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -314,22 +314,18 @@ impl Schema { Some((idx, field.as_ref())) } - /// Check to see if `self` is a superset of `other` schema. Here are the comparison rules: + /// Check to see if `self` is a superset of `other` schema. /// - /// * `self` and `other` should contain the same number of fields - /// * for every field `f` in `other`, the field in `self` with corresponding index should be a - /// superset of `f`. - /// * self.metadata is a superset of other.metadata + /// In particular returns true if `self.metadata` is a superset of `other.metadata` + /// and [`Fields::contains`] for `self.fields` and `other.fields` /// - /// In other words, any record conforms to `other` should also conform to `self`. + /// In other words, any record that conforms to `other` should also conform to `self`. 
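    A minimal sketch of the superset semantics described in the doc comments above, assuming only the `Schema`, `Field`, and `DataType` constructors from `arrow-schema` that already appear elsewhere in this patch series:

        #[test]
        fn schema_contains_superset_sketch() {
            use arrow_schema::{DataType, Field, Schema};

            // `nullable` relaxes the nullability of `required`, so it accepts a
            // superset of the records that `required` accepts.
            let required = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
            let nullable = Schema::new(vec![Field::new("a", DataType::Int32, true)]);

            assert!(nullable.contains(&required));
            // The reverse does not hold: removing nullability could reject records.
            assert!(!required.contains(&nullable));
        }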
pub fn contains(&self, other: &Schema) -> bool { - self.fields.len() == other.fields.len() - && self.fields.iter().zip(other.fields.iter()).all(|(f1, f2)| f1.contains(f2)) // make sure self.metadata is a superset of other.metadata - && other.metadata.iter().all(|(k, v1)| match self.metadata.get(k) { - Some(v2) => v1 == v2, - _ => false, - }) + self.fields.contains(&other.fields) + && other.metadata.iter().all(|(k, v1)| { + self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default() + }) } } From 9bf99840b135ae9a7ae365e114f4df1d30627998 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 6 Apr 2023 20:10:43 +0100 Subject: [PATCH 0794/1411] Only require compatible batch schema in ArrowWriter (#4027) --- parquet/src/arrow/arrow_writer/mod.rs | 52 ++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 4cf54dc8897e..d026f971e946 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -132,7 +132,10 @@ impl ArrowWriter { /// and drop any fully written `RecordBatch` pub fn write(&mut self, batch: &RecordBatch) -> Result<()> { // validate batch schema against writer's supplied schema - if self.arrow_schema != batch.schema() { + let batch_schema = batch.schema(); + if !(Arc::ptr_eq(&self.arrow_schema, &batch_schema) + || self.arrow_schema.contains(&batch_schema)) + { return Err(ParquetError::ArrowError( "Record batch schema does not match writer schema".to_string(), )); @@ -2358,4 +2361,51 @@ mod tests { let actual = pretty_format_batches(&batches).unwrap().to_string(); assert_eq!(actual, expected); } + + #[test] + fn test_arrow_writer_metadata() { + let batch_schema = Schema::new(vec![Field::new("int32", DataType::Int32, false)]); + let file_schema = batch_schema.clone().with_metadata( + vec![("foo".to_string(), "bar".to_string())] + .into_iter() + .collect(), + ); + + let batch = RecordBatch::try_new( + Arc::new(batch_schema), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as _], + ) + .unwrap(); + + let mut buf = Vec::with_capacity(1024); + let mut writer = + ArrowWriter::try_new(&mut buf, Arc::new(file_schema), None).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + } + + #[test] + fn test_arrow_writer_nullable() { + let batch_schema = Schema::new(vec![Field::new("int32", DataType::Int32, false)]); + let file_schema = Schema::new(vec![Field::new("int32", DataType::Int32, true)]); + let file_schema = Arc::new(file_schema); + + let batch = RecordBatch::try_new( + Arc::new(batch_schema), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as _], + ) + .unwrap(); + + let mut buf = Vec::with_capacity(1024); + let mut writer = + ArrowWriter::try_new(&mut buf, file_schema.clone(), None).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + let mut read = ParquetRecordBatchReader::try_new(Bytes::from(buf), 1024).unwrap(); + let back = read.next().unwrap().unwrap(); + assert_eq!(back.schema(), file_schema); + assert_ne!(back.schema(), batch.schema()); + assert_eq!(back.column(0).as_ref(), batch.column(0).as_ref()); + } } From 2b77f643947084a8c10164dc6dfdfecf6bc1a039 Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Fri, 7 Apr 2023 02:05:21 +0300 Subject: [PATCH 0795/1411] Minor: add methods "is_positive" and "signum" to i256 (#4024) * Minor: add method "is_positive" to i256 * fix: cancel changes to functions "checked_add" and "checked_sub" * 
feat: signum; test for is_positive, is_negative, signum * feat: use i256::ONE, i256::MINUS_ONE, i256::ZERO --- arrow-buffer/src/bigint.rs | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index 3a9c4aac8163..fab75b792abd 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -483,11 +483,31 @@ impl i256 { acc.wrapping_mul(base) } + /// Returns a number [`i256`] representing sign of this [`i256`]. + /// + /// 0 if the number is zero + /// 1 if the number is positive + /// -1 if the number is negative + pub const fn signum(self) -> Self { + if self.is_positive() { + i256::ONE + } else if self.is_negative() { + i256::MINUS_ONE + } else { + i256::ZERO + } + } + /// Returns `true` if this [`i256`] is negative #[inline] pub const fn is_negative(self) -> bool { self.high.is_negative() } + + /// Returns `true` if this [`i256`] is positive + pub const fn is_positive(self) -> bool { + self.high.is_positive() || self.high == 0 && self.low != 0 + } } /// Temporary workaround due to lack of stable const array slicing @@ -917,6 +937,27 @@ mod tests { } } + #[test] + fn test_signed_ops() { + // signum + assert_eq!(i256::from_i128(1).signum(), i256::ONE); + assert_eq!(i256::from_i128(0).signum(), i256::ZERO); + assert_eq!(i256::from_i128(-0).signum(), i256::ZERO); + assert_eq!(i256::from_i128(-1).signum(), i256::MINUS_ONE); + + // is_positive + assert!(i256::from_i128(1).is_positive()); + assert!(!i256::from_i128(0).is_positive()); + assert!(!i256::from_i128(-0).is_positive()); + assert!(!i256::from_i128(-1).is_positive()); + + // is_negative + assert!(!i256::from_i128(1).is_negative()); + assert!(!i256::from_i128(0).is_negative()); + assert!(!i256::from_i128(-0).is_negative()); + assert!(i256::from_i128(-1).is_negative()); + } + #[test] #[cfg_attr(miri, ignore)] fn test_i256_fuzz() { From e45f8b5132c10505a6f2b9d8a8e8e9207049605f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 7 Apr 2023 09:20:10 +0100 Subject: [PATCH 0796/1411] Add RecordBatch::with_schema (#4028) --- arrow-array/src/record_batch.rs | 49 +++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 081bd55fc650..1350285f8b26 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -204,6 +204,25 @@ impl RecordBatch { }) } + /// Override the schema of this [`RecordBatch`] + /// + /// Returns an error if `schema` is not a superset of the current schema + /// as determined by [`Schema::contains`] + pub fn with_schema(self, schema: SchemaRef) -> Result { + if !schema.contains(self.schema.as_ref()) { + return Err(ArrowError::SchemaError(format!( + "{schema} is not a superset of {}", + self.schema + ))); + } + + Ok(Self { + schema, + columns: self.columns, + row_count: self.row_count, + }) + } + /// Returns the [`Schema`](arrow_schema::Schema) of the record batch. 
pub fn schema(&self) -> SchemaRef { self.schema.clone() @@ -1062,4 +1081,34 @@ mod tests { )); let _ = RecordBatch::from(s); } + + #[test] + fn test_with_schema() { + let required_schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + let required_schema = Arc::new(required_schema); + let nullable_schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + let nullable_schema = Arc::new(nullable_schema); + + let batch = RecordBatch::try_new( + required_schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as _], + ) + .unwrap(); + + // Can add nullability + let batch = batch.with_schema(nullable_schema.clone()).unwrap(); + + // Cannot remove nullability + batch.clone().with_schema(required_schema).unwrap_err(); + + // Can add metadata + let metadata = vec![("foo".to_string(), "bar".to_string())] + .into_iter() + .collect(); + let metadata_schema = nullable_schema.as_ref().clone().with_metadata(metadata); + let batch = batch.with_schema(Arc::new(metadata_schema)).unwrap(); + + // Cannot remove metadata + batch.with_schema(nullable_schema).unwrap_err(); + } } From d946cc4f495bc514cc04dc03561d9dc59f339c7d Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner <14581281+iajoiner@users.noreply.github.com> Date: Fri, 7 Apr 2023 08:11:07 -0400 Subject: [PATCH 0797/1411] Prep for 37.0.0 (#4031) * Update version * Update changelog --- CHANGELOG-old.md | 75 +++++++++++++++++ CHANGELOG.md | 137 +++++++++++++++++-------------- Cargo.toml | 32 ++++---- arrow-flight/README.md | 2 +- arrow/README.md | 2 +- dev/release/README.md | 2 +- dev/release/update_change_log.sh | 4 +- parquet_derive/README.md | 4 +- 8 files changed, 174 insertions(+), 84 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index ebdab71b2401..f1219e514675 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,81 @@ # Historical Changelog +## [36.0.0](https://github.com/apache/arrow-rs/tree/36.0.0) (2023-03-24) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/35.0.0...36.0.0) + +**Breaking changes:** + +- Use dyn Array in sort kernels [\#3931](https://github.com/apache/arrow-rs/pull/3931) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Enforce struct nullability in JSON raw reader \(\#3900\) \(\#3904\) [\#3906](https://github.com/apache/arrow-rs/pull/3906) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Return ScalarBuffer from PrimitiveArray::values \(\#3879\) [\#3896](https://github.com/apache/arrow-rs/pull/3896) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use BooleanBuffer in BooleanArray \(\#3879\) [\#3895](https://github.com/apache/arrow-rs/pull/3895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Seal ArrowPrimitiveType [\#3882](https://github.com/apache/arrow-rs/pull/3882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support compression levels [\#3847](https://github.com/apache/arrow-rs/pull/3847) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([spebern](https://github.com/spebern)) + +**Implemented enhancements:** + +- Improve speed of parsing string to Times [\#3919](https://github.com/apache/arrow-rs/issues/3919) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- feat: add comparison/sort support for Float16 
[\#3914](https://github.com/apache/arrow-rs/issues/3914) +- Pinned version in arrow-flight's build-dependencies are causing conflicts [\#3876](https://github.com/apache/arrow-rs/issues/3876) +- Add compression options \(levels\) [\#3844](https://github.com/apache/arrow-rs/issues/3844) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use Unsigned Integer for Fixed Size DataType [\#3815](https://github.com/apache/arrow-rs/issues/3815) +- Common trait for RecordBatch and StructArray [\#3764](https://github.com/apache/arrow-rs/issues/3764) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Allow precision loss on multiplying decimal arrays [\#3689](https://github.com/apache/arrow-rs/issues/3689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Raw JSON Reader Allows Non-Nullable Struct Children to Contain Nulls [\#3904](https://github.com/apache/arrow-rs/issues/3904) +- Nullable field with nested not nullable map in json [\#3900](https://github.com/apache/arrow-rs/issues/3900) +- parquet\_derive doesn't support Vec\ [\#3864](https://github.com/apache/arrow-rs/issues/3864) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- \[REGRESSION\] Parsing timestamps with lower case time separator [\#3863](https://github.com/apache/arrow-rs/issues/3863) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[REGRESSION\] Parsing timestamps with leap seconds [\#3861](https://github.com/apache/arrow-rs/issues/3861) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[REGRESSION\] Parsing timestamps with fractional seconds / microseconds / milliseconds / nanoseconds [\#3859](https://github.com/apache/arrow-rs/issues/3859) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- CSV Reader Doesn't set Timezone [\#3841](https://github.com/apache/arrow-rs/issues/3841) +- PyArrowConvert Leaks Memory [\#3683](https://github.com/apache/arrow-rs/issues/3683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Derive RunArray Clone [\#3932](https://github.com/apache/arrow-rs/pull/3932) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move protoc generation to binary crate, unpin prost/tonic build \(\#3876\) [\#3927](https://github.com/apache/arrow-rs/pull/3927) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Fix JSON Temporal Encoding of Multiple Batches [\#3924](https://github.com/apache/arrow-rs/pull/3924) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([doki23](https://github.com/doki23)) +- Cleanup uses of Array::data\_ref \(\#3880\) [\#3918](https://github.com/apache/arrow-rs/pull/3918) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support microsecond and nanosecond in interval parsing [\#3916](https://github.com/apache/arrow-rs/pull/3916) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- feat: add comparison/sort support for Float16 [\#3915](https://github.com/apache/arrow-rs/pull/3915) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Add AsArray trait for more ergonomic downcasting 
[\#3912](https://github.com/apache/arrow-rs/pull/3912) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add OffsetBuffer::new [\#3910](https://github.com/apache/arrow-rs/pull/3910) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add PrimitiveArray::new \(\#3879\) [\#3909](https://github.com/apache/arrow-rs/pull/3909) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support timezones in CSV reader \(\#3841\) [\#3908](https://github.com/apache/arrow-rs/pull/3908) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve ScalarBuffer debug output [\#3907](https://github.com/apache/arrow-rs/pull/3907) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.52 to =1.0.53 [\#3905](https://github.com/apache/arrow-rs/pull/3905) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Re-export parquet compression level structs [\#3903](https://github.com/apache/arrow-rs/pull/3903) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix parsing timestamps of exactly 32 characters [\#3902](https://github.com/apache/arrow-rs/pull/3902) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add iterators to BooleanBuffer and NullBuffer [\#3901](https://github.com/apache/arrow-rs/pull/3901) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Array equality for &dyn Array \(\#3880\) [\#3899](https://github.com/apache/arrow-rs/pull/3899) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add BooleanArray::new \(\#3879\) [\#3898](https://github.com/apache/arrow-rs/pull/3898) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Revert structured ArrayData \(\#3877\) [\#3894](https://github.com/apache/arrow-rs/pull/3894) ([tustvold](https://github.com/tustvold)) +- Fix pyarrow memory leak \(\#3683\) [\#3893](https://github.com/apache/arrow-rs/pull/3893) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: add examples for `ListBuilder` and `GenericListBuilder` [\#3891](https://github.com/apache/arrow-rs/pull/3891) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Update syn requirement from 1.0 to 2.0 [\#3890](https://github.com/apache/arrow-rs/pull/3890) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Use of `mul_checked` to avoid silent overflow in interval arithmetic [\#3886](https://github.com/apache/arrow-rs/pull/3886) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Flesh out NullBuffer abstraction \(\#3880\) [\#3885](https://github.com/apache/arrow-rs/pull/3885) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement Bit Operations for i256 
[\#3884](https://github.com/apache/arrow-rs/pull/3884) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Flatten arrow\_buffer [\#3883](https://github.com/apache/arrow-rs/pull/3883) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add Array::to\_data and Array::nulls \(\#3880\) [\#3881](https://github.com/apache/arrow-rs/pull/3881) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Added support for byte vectors and slices to parquet\_derive \(\#3864\) [\#3878](https://github.com/apache/arrow-rs/pull/3878) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([waymost](https://github.com/waymost)) +- chore: remove LevelDecoder [\#3872](https://github.com/apache/arrow-rs/pull/3872) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Weijun-H](https://github.com/Weijun-H)) +- Parse timestamps with leap seconds \(\#3861\) [\#3862](https://github.com/apache/arrow-rs/pull/3862) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster time parsing \(~93% faster\) [\#3860](https://github.com/apache/arrow-rs/pull/3860) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Parse timestamps with arbitrary seconds fraction [\#3858](https://github.com/apache/arrow-rs/pull/3858) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add BitIterator [\#3856](https://github.com/apache/arrow-rs/pull/3856) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve decimal parsing performance [\#3854](https://github.com/apache/arrow-rs/pull/3854) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) +- Update proc-macro2 requirement from =1.0.51 to =1.0.52 [\#3853](https://github.com/apache/arrow-rs/pull/3853) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update bitflags requirement from 1.2.1 to 2.0.0 [\#3852](https://github.com/apache/arrow-rs/pull/3852) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add offset pushdown to parquet [\#3848](https://github.com/apache/arrow-rs/pull/3848) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add timezone support to JSON reader [\#3845](https://github.com/apache/arrow-rs/pull/3845) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Allow precision loss on multiplying decimal arrays [\#3690](https://github.com/apache/arrow-rs/pull/3690) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) + ## [35.0.0](https://github.com/apache/arrow-rs/tree/35.0.0) (2023-03-10) [Full Changelog](https://github.com/apache/arrow-rs/compare/34.0.0...35.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b1c59b3089e..18212cbb78e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,80 +19,95 @@ # Changelog -## [36.0.0](https://github.com/apache/arrow-rs/tree/36.0.0) (2023-03-24) +## [37.0.0](https://github.com/apache/arrow-rs/tree/37.0.0) (2023-04-07) -[Full 
Changelog](https://github.com/apache/arrow-rs/compare/35.0.0...36.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/36.0.0...37.0.0) **Breaking changes:** -- Use dyn Array in sort kernels [\#3931](https://github.com/apache/arrow-rs/pull/3931) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Enforce struct nullability in JSON raw reader \(\#3900\) \(\#3904\) [\#3906](https://github.com/apache/arrow-rs/pull/3906) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Return ScalarBuffer from PrimitiveArray::values \(\#3879\) [\#3896](https://github.com/apache/arrow-rs/pull/3896) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use BooleanBuffer in BooleanArray \(\#3879\) [\#3895](https://github.com/apache/arrow-rs/pull/3895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Seal ArrowPrimitiveType [\#3882](https://github.com/apache/arrow-rs/pull/3882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support compression levels [\#3847](https://github.com/apache/arrow-rs/pull/3847) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([spebern](https://github.com/spebern)) +- Update tonic 0.9.1 [\#4011](https://github.com/apache/arrow-rs/pull/4011) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Use FieldRef in DataType \(\#3955\) [\#3983](https://github.com/apache/arrow-rs/pull/3983) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Store Timezone as Arc\ [\#3976](https://github.com/apache/arrow-rs/pull/3976) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Panic instead of discarding nulls converting StructArray to RecordBatch - \(\#3951\) [\#3953](https://github.com/apache/arrow-rs/pull/3953) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix\(flight\_sql\): PreparedStatement has no token for auth. 
[\#3948](https://github.com/apache/arrow-rs/pull/3948) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([youngsofun](https://github.com/youngsofun)) +- Add Strongly Typed Array Slice \(\#3929\) [\#3930](https://github.com/apache/arrow-rs/pull/3930) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add Zero-Copy Conversion between Vec and MutableBuffer [\#3920](https://github.com/apache/arrow-rs/pull/3920) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Improve speed of parsing string to Times [\#3919](https://github.com/apache/arrow-rs/issues/3919) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- feat: add comparison/sort support for Float16 [\#3914](https://github.com/apache/arrow-rs/issues/3914) -- Pinned version in arrow-flight's build-dependencies are causing conflicts [\#3876](https://github.com/apache/arrow-rs/issues/3876) -- Add compression options \(levels\) [\#3844](https://github.com/apache/arrow-rs/issues/3844) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use Unsigned Integer for Fixed Size DataType [\#3815](https://github.com/apache/arrow-rs/issues/3815) -- Common trait for RecordBatch and StructArray [\#3764](https://github.com/apache/arrow-rs/issues/3764) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Allow precision loss on multiplying decimal arrays [\#3689](https://github.com/apache/arrow-rs/issues/3689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Decimals cast to Utf8/LargeUtf [\#3991](https://github.com/apache/arrow-rs/issues/3991) +- Support Date32/Date64 minus Interval [\#3962](https://github.com/apache/arrow-rs/issues/3962) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Reduce Cloning of Field [\#3955](https://github.com/apache/arrow-rs/issues/3955) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Consider renaming rather than removing Decoder [\#3949](https://github.com/apache/arrow-rs/issues/3949) +- Add multiply\_fixed\_point [\#3946](https://github.com/apache/arrow-rs/issues/3946) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Strongly Typed Array Slicing [\#3929](https://github.com/apache/arrow-rs/issues/3929) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make it easier to match FlightSQL messages [\#3874](https://github.com/apache/arrow-rs/issues/3874) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support Casting Between Binary / LargeBinary and FixedSizeBinary [\#3826](https://github.com/apache/arrow-rs/issues/3826) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- Raw JSON Reader Allows Non-Nullable Struct Children to Contain Nulls [\#3904](https://github.com/apache/arrow-rs/issues/3904) -- Nullable field with nested not nullable map in json [\#3900](https://github.com/apache/arrow-rs/issues/3900) -- parquet\_derive doesn't support Vec\ [\#3864](https://github.com/apache/arrow-rs/issues/3864) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- \[REGRESSION\] Parsing timestamps with lower case time separator [\#3863](https://github.com/apache/arrow-rs/issues/3863) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[REGRESSION\] Parsing timestamps with leap seconds [\#3861](https://github.com/apache/arrow-rs/issues/3861) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[REGRESSION\] Parsing timestamps with fractional seconds / microseconds / milliseconds / nanoseconds [\#3859](https://github.com/apache/arrow-rs/issues/3859) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- CSV Reader Doesn't set Timezone [\#3841](https://github.com/apache/arrow-rs/issues/3841) -- PyArrowConvert Leaks Memory [\#3683](https://github.com/apache/arrow-rs/issues/3683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- f16::ZERO and f16::ONE are mixed up [\#4016](https://github.com/apache/arrow-rs/issues/4016) +- Handle overflow precision when casting from integer to decimal [\#3995](https://github.com/apache/arrow-rs/issues/3995) +- PrimitiveDictionaryBuilder.finish should use actual value type [\#3971](https://github.com/apache/arrow-rs/issues/3971) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RecordBatch From StructArray Silently Discards Nulls [\#3952](https://github.com/apache/arrow-rs/issues/3952) +- I256 Checked Subtraction Overflows for i256::MINUS\_ONE [\#3942](https://github.com/apache/arrow-rs/issues/3942) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- I256 Checked Multiply Overflows for i256::MIN [\#3941](https://github.com/apache/arrow-rs/issues/3941) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Remove non-existent `js` feature from README [\#4000](https://github.com/apache/arrow-rs/issues/4000) +- Support take on MapArray [\#3875](https://github.com/apache/arrow-rs/issues/3875) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Derive RunArray Clone [\#3932](https://github.com/apache/arrow-rs/pull/3932) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Move protoc generation to binary crate, unpin prost/tonic build \(\#3876\) [\#3927](https://github.com/apache/arrow-rs/pull/3927) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Fix JSON Temporal Encoding of Multiple Batches [\#3924](https://github.com/apache/arrow-rs/pull/3924) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([doki23](https://github.com/doki23)) -- Cleanup uses of Array::data\_ref \(\#3880\) [\#3918](https://github.com/apache/arrow-rs/pull/3918) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support microsecond and nanosecond in interval parsing [\#3916](https://github.com/apache/arrow-rs/pull/3916) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- feat: add comparison/sort support for Float16 [\#3915](https://github.com/apache/arrow-rs/pull/3915) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Add AsArray trait for more ergonomic downcasting [\#3912](https://github.com/apache/arrow-rs/pull/3912) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add OffsetBuffer::new 
[\#3910](https://github.com/apache/arrow-rs/pull/3910) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add PrimitiveArray::new \(\#3879\) [\#3909](https://github.com/apache/arrow-rs/pull/3909) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support timezones in CSV reader \(\#3841\) [\#3908](https://github.com/apache/arrow-rs/pull/3908) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Improve ScalarBuffer debug output [\#3907](https://github.com/apache/arrow-rs/pull/3907) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update proc-macro2 requirement from =1.0.52 to =1.0.53 [\#3905](https://github.com/apache/arrow-rs/pull/3905) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Re-export parquet compression level structs [\#3903](https://github.com/apache/arrow-rs/pull/3903) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Fix parsing timestamps of exactly 32 characters [\#3902](https://github.com/apache/arrow-rs/pull/3902) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add iterators to BooleanBuffer and NullBuffer [\#3901](https://github.com/apache/arrow-rs/pull/3901) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Array equality for &dyn Array \(\#3880\) [\#3899](https://github.com/apache/arrow-rs/pull/3899) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add BooleanArray::new \(\#3879\) [\#3898](https://github.com/apache/arrow-rs/pull/3898) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Revert structured ArrayData \(\#3877\) [\#3894](https://github.com/apache/arrow-rs/pull/3894) ([tustvold](https://github.com/tustvold)) -- Fix pyarrow memory leak \(\#3683\) [\#3893](https://github.com/apache/arrow-rs/pull/3893) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Minor: add examples for `ListBuilder` and `GenericListBuilder` [\#3891](https://github.com/apache/arrow-rs/pull/3891) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Update syn requirement from 1.0 to 2.0 [\#3890](https://github.com/apache/arrow-rs/pull/3890) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Use of `mul_checked` to avoid silent overflow in interval arithmetic [\#3886](https://github.com/apache/arrow-rs/pull/3886) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- Flesh out NullBuffer abstraction \(\#3880\) [\#3885](https://github.com/apache/arrow-rs/pull/3885) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Implement Bit Operations for i256 [\#3884](https://github.com/apache/arrow-rs/pull/3884) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Flatten arrow\_buffer [\#3883](https://github.com/apache/arrow-rs/pull/3883) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add Array::to\_data and Array::nulls \(\#3880\) [\#3881](https://github.com/apache/arrow-rs/pull/3881) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Added support for byte vectors and slices to parquet\_derive \(\#3864\) [\#3878](https://github.com/apache/arrow-rs/pull/3878) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([waymost](https://github.com/waymost)) -- chore: remove LevelDecoder [\#3872](https://github.com/apache/arrow-rs/pull/3872) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Weijun-H](https://github.com/Weijun-H)) -- Parse timestamps with leap seconds \(\#3861\) [\#3862](https://github.com/apache/arrow-rs/pull/3862) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Faster time parsing \(~93% faster\) [\#3860](https://github.com/apache/arrow-rs/pull/3860) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Parse timestamps with arbitrary seconds fraction [\#3858](https://github.com/apache/arrow-rs/pull/3858) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add BitIterator [\#3856](https://github.com/apache/arrow-rs/pull/3856) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Improve decimal parsing performance [\#3854](https://github.com/apache/arrow-rs/pull/3854) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) -- Update proc-macro2 requirement from =1.0.51 to =1.0.52 [\#3853](https://github.com/apache/arrow-rs/pull/3853) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Update bitflags requirement from 1.2.1 to 2.0.0 [\#3852](https://github.com/apache/arrow-rs/pull/3852) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add offset pushdown to parquet [\#3848](https://github.com/apache/arrow-rs/pull/3848) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add timezone support to JSON reader [\#3845](https://github.com/apache/arrow-rs/pull/3845) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Allow precision loss on multiplying decimal arrays [\#3690](https://github.com/apache/arrow-rs/pull/3690) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Only require compatible batch schema in ArrowWriter [\#4027](https://github.com/apache/arrow-rs/pull/4027) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add Fields::contains [\#4026](https://github.com/apache/arrow-rs/pull/4026) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: add methods "is\_positive" and "signum" to i256 [\#4024](https://github.com/apache/arrow-rs/pull/4024) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Deprecate Array::data \(\#3880\) [\#4019](https://github.com/apache/arrow-rs/pull/4019) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: add tests for ArrowNativeTypeOp [\#4018](https://github.com/apache/arrow-rs/pull/4018) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- fix: f16::ZERO and f16::ONE are mixed up [\#4017](https://github.com/apache/arrow-rs/pull/4017) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Minor: Float16Tensor [\#4013](https://github.com/apache/arrow-rs/pull/4013) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Add FlightSQL module docs and links to `arrow-flight` crates [\#4012](https://github.com/apache/arrow-rs/pull/4012) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Update proc-macro2 requirement from =1.0.54 to =1.0.56 [\#4008](https://github.com/apache/arrow-rs/pull/4008) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Cleanup Primitive take [\#4006](https://github.com/apache/arrow-rs/pull/4006) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Deprecate combine\_option\_bitmap [\#4005](https://github.com/apache/arrow-rs/pull/4005) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: add tests for BooleanBuffer [\#4004](https://github.com/apache/arrow-rs/pull/4004) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- feat: support to read/write customized metadata in ipc files [\#4003](https://github.com/apache/arrow-rs/pull/4003) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([framlog](https://github.com/framlog)) +- Cleanup more uses of Array::data \(\#3880\) [\#4002](https://github.com/apache/arrow-rs/pull/4002) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove js feature from README [\#4001](https://github.com/apache/arrow-rs/pull/4001) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([akazukin5151](https://github.com/akazukin5151)) +- feat: add the implementation BitXor to BooleanBuffer [\#3997](https://github.com/apache/arrow-rs/pull/3997) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Handle precision overflow when casting from integer to decimal [\#3996](https://github.com/apache/arrow-rs/pull/3996) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support CAST from Decimal datatype to String [\#3994](https://github.com/apache/arrow-rs/pull/3994) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Add Field Constructors for Complex Fields [\#3992](https://github.com/apache/arrow-rs/pull/3992) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- fix: remove unused type parameters. 
[\#3986](https://github.com/apache/arrow-rs/pull/3986) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([youngsofun](https://github.com/youngsofun)) +- Add UnionFields \(\#3955\) [\#3981](https://github.com/apache/arrow-rs/pull/3981) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup Fields Serde [\#3980](https://github.com/apache/arrow-rs/pull/3980) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support Rust structures --\> `RecordBatch` by adding `Serde` support to `RawDecoder` \(\#3949\) [\#3979](https://github.com/apache/arrow-rs/pull/3979) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Convert string\_to\_timestamp\_nanos to doctest [\#3978](https://github.com/apache/arrow-rs/pull/3978) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix documentation of string\_to\_timestamp\_nanos [\#3977](https://github.com/apache/arrow-rs/pull/3977) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([byteink](https://github.com/byteink)) +- add Date32/Date64 support to subtract\_dyn [\#3974](https://github.com/apache/arrow-rs/pull/3974) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([SinanGncgl](https://github.com/SinanGncgl)) +- PrimitiveDictionaryBuilder.finish should use actual value type [\#3972](https://github.com/apache/arrow-rs/pull/3972) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update proc-macro2 requirement from =1.0.53 to =1.0.54 [\#3968](https://github.com/apache/arrow-rs/pull/3968) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Async writer tweaks [\#3967](https://github.com/apache/arrow-rs/pull/3967) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix reading ipc files with unordered projections [\#3966](https://github.com/apache/arrow-rs/pull/3966) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([framlog](https://github.com/framlog)) +- Add Fields abstraction \(\#3955\) [\#3965](https://github.com/apache/arrow-rs/pull/3965) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- feat: cast between `Binary`/`LargeBinary` and `FixedSizeBinary` [\#3961](https://github.com/apache/arrow-rs/pull/3961) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- feat: support async writer \(\#1269\) [\#3957](https://github.com/apache/arrow-rs/pull/3957) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([ShiKaiWi](https://github.com/ShiKaiWi)) +- Add ListBuilder::append\_value \(\#3949\) [\#3954](https://github.com/apache/arrow-rs/pull/3954) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve array builder documentation \(\#3949\) [\#3951](https://github.com/apache/arrow-rs/pull/3951) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster i256 parsing 
[\#3950](https://github.com/apache/arrow-rs/pull/3950) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add multiply\_fixed\_point [\#3945](https://github.com/apache/arrow-rs/pull/3945) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- feat: enable metadata import/export through C data interface [\#3944](https://github.com/apache/arrow-rs/pull/3944) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Fix checked i256 arithmetic \(\#3942\) \(\#3941\) [\#3943](https://github.com/apache/arrow-rs/pull/3943) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Avoid memory copies in take\_list [\#3940](https://github.com/apache/arrow-rs/pull/3940) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster decimal parsing \(30-60%\) [\#3939](https://github.com/apache/arrow-rs/pull/3939) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) +- Fix: FlightSqlClient panic when execute\_update. [\#3938](https://github.com/apache/arrow-rs/pull/3938) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([youngsofun](https://github.com/youngsofun)) +- Cleanup row count handling in JSON writer [\#3934](https://github.com/apache/arrow-rs/pull/3934) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add typed buffers to UnionArray \(\#3880\) [\#3933](https://github.com/apache/arrow-rs/pull/3933) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: add take for MapArray [\#3925](https://github.com/apache/arrow-rs/pull/3925) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Deprecate Array::data\_ref \(\#3880\) [\#3923](https://github.com/apache/arrow-rs/pull/3923) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Zero-copy conversion from Vec to PrimitiveArray [\#3917](https://github.com/apache/arrow-rs/pull/3917) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: Add Commands enum to decode prost messages to strong type [\#3887](https://github.com/apache/arrow-rs/pull/3887) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([stuartcarnie](https://github.com/stuartcarnie)) diff --git a/Cargo.toml b/Cargo.toml index a287f66b082c..7d5ff0c7f949 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,7 @@ resolver = "2" exclude = ["arrow-pyarrow-integration-testing"] [workspace.package] -version = "36.0.0" +version = "37.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -72,18 +72,18 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "36.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "36.0.0", path = "./arrow-arith" } -arrow-array = { version = "36.0.0", path = "./arrow-array" } -arrow-buffer = { version = "36.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "36.0.0", path = "./arrow-cast" } -arrow-csv = { version = 
"36.0.0", path = "./arrow-csv" } -arrow-data = { version = "36.0.0", path = "./arrow-data" } -arrow-ipc = { version = "36.0.0", path = "./arrow-ipc" } -arrow-json = { version = "36.0.0", path = "./arrow-json" } -arrow-ord = { version = "36.0.0", path = "./arrow-ord" } -arrow-row = { version = "36.0.0", path = "./arrow-row" } -arrow-schema = { version = "36.0.0", path = "./arrow-schema" } -arrow-select = { version = "36.0.0", path = "./arrow-select" } -arrow-string = { version = "36.0.0", path = "./arrow-string" } -parquet = { version = "36.0.0", path = "./parquet", default-features = false } +arrow = { version = "37.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "37.0.0", path = "./arrow-arith" } +arrow-array = { version = "37.0.0", path = "./arrow-array" } +arrow-buffer = { version = "37.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "37.0.0", path = "./arrow-cast" } +arrow-csv = { version = "37.0.0", path = "./arrow-csv" } +arrow-data = { version = "37.0.0", path = "./arrow-data" } +arrow-ipc = { version = "37.0.0", path = "./arrow-ipc" } +arrow-json = { version = "37.0.0", path = "./arrow-json" } +arrow-ord = { version = "37.0.0", path = "./arrow-ord" } +arrow-row = { version = "37.0.0", path = "./arrow-row" } +arrow-schema = { version = "37.0.0", path = "./arrow-schema" } +arrow-select = { version = "37.0.0", path = "./arrow-select" } +arrow-string = { version = "37.0.0", path = "./arrow-string" } +parquet = { version = "37.0.0", path = "./parquet", default-features = false } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index f8f9e95d8377..7ddc2043465b 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "36.0.0" +arrow-flight = "37.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/arrow/README.md b/arrow/README.md index 1e8da360f443..c7a0416a6747 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `36.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `37.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. ## Feature Flags diff --git a/dev/release/README.md b/dev/release/README.md index c74d7d865dd8..e844e676a5e1 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. 
-sed -i '' -e 's/14.0.0/36.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/37.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 77f9c5f9780b..1293617c6f53 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="35.0.0" -FUTURE_RELEASE="36.0.0" +SINCE_TAG="36.0.0" +FUTURE_RELEASE="37.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet_derive/README.md b/parquet_derive/README.md index 70be54015c56..26112d0097a9 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "36.0.0" -parquet_derive = "36.0.0" +parquet = "37.0.0" +parquet_derive = "37.0.0" ``` and this to your crate root: From bc15cbdfc1ada7b729eda5cdfb09fc7eda0c90ce Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 7 Apr 2023 16:08:54 +0100 Subject: [PATCH 0798/1411] Fix timestamp handling in cast kernel (#1936) (#4033) (#4034) --- arrow-array/src/types.rs | 26 +++++++- arrow-cast/src/cast.rs | 139 +++++++++++++++++++++++++-------------- 2 files changed, 113 insertions(+), 52 deletions(-) diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 827729ca682e..e2d7a2492227 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -26,7 +26,7 @@ use arrow_schema::{ DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, }; -use chrono::{Duration, NaiveDate}; +use chrono::{Duration, NaiveDate, NaiveDateTime}; use half::f16; use std::marker::PhantomData; use std::ops::{Add, Sub}; @@ -311,19 +311,43 @@ pub trait ArrowTimestampType: ArrowTemporalType { fn get_time_unit() -> TimeUnit { Self::UNIT } + + /// Creates a ArrowTimestampType::Native from the provided [`NaiveDateTime`] + /// + /// See [`DataType::Timestamp`] for more information on timezone handling + fn make_value(naive: NaiveDateTime) -> Option; } impl ArrowTimestampType for TimestampSecondType { const UNIT: TimeUnit = TimeUnit::Second; + + fn make_value(naive: NaiveDateTime) -> Option { + Some(naive.timestamp()) + } } impl ArrowTimestampType for TimestampMillisecondType { const UNIT: TimeUnit = TimeUnit::Millisecond; + + fn make_value(naive: NaiveDateTime) -> Option { + let millis = naive.timestamp().checked_mul(1_000)?; + millis.checked_add(naive.timestamp_subsec_millis() as i64) + } } impl ArrowTimestampType for TimestampMicrosecondType { const UNIT: TimeUnit = TimeUnit::Microsecond; + + fn make_value(naive: NaiveDateTime) -> Option { + let micros = naive.timestamp().checked_mul(1_000_000)?; + micros.checked_add(naive.timestamp_subsec_micros() as i64) + } } impl ArrowTimestampType for TimestampNanosecondType { const UNIT: TimeUnit = TimeUnit::Nanosecond; + + fn make_value(naive: NaiveDateTime) -> Option { + let nanos = naive.timestamp().checked_mul(1_000_000_000)?; + nanos.checked_add(naive.timestamp_subsec_nanos() as i64) + } } impl IntervalYearMonthType { diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 372fcc1a3132..05b56a0e8d32 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -35,14 +35,14 @@ //! 
assert_eq!(7.0, c.value(2)); //! ``` -use chrono::{NaiveTime, Timelike}; +use chrono::{NaiveTime, TimeZone, Timelike, Utc}; use std::cmp::Ordering; use std::sync::Arc; use crate::display::{array_value_to_string, ArrayFormatter, FormatOptions}; use crate::parse::{ parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month, - string_to_timestamp_nanos, + string_to_datetime, }; use arrow_array::{ builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *, @@ -1233,16 +1233,16 @@ pub fn cast_with_options( cast_string_to_time64nanosecond::(array, cast_options) } Timestamp(TimeUnit::Second, to_tz) => { - cast_string_to_timestamp::(array, to_tz,cast_options) + cast_string_to_timestamp::(array, to_tz, cast_options) } Timestamp(TimeUnit::Millisecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz,cast_options) + cast_string_to_timestamp::(array, to_tz, cast_options) } Timestamp(TimeUnit::Microsecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz,cast_options) + cast_string_to_timestamp::(array, to_tz, cast_options) } Timestamp(TimeUnit::Nanosecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz,cast_options) + cast_string_to_timestamp::(array, to_tz, cast_options) } Interval(IntervalUnit::YearMonth) => { cast_string_to_year_month_interval::(array, cast_options) @@ -2653,45 +2653,58 @@ fn cast_string_to_time64nanosecond( } /// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.) -fn cast_string_to_timestamp< - Offset: OffsetSizeTrait, - TimestampType: ArrowTimestampType, ->( +fn cast_string_to_timestamp( array: &dyn Array, to_tz: &Option>, cast_options: &CastOptions, ) -> Result { - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let scale_factor = match TimestampType::UNIT { - TimeUnit::Second => 1_000_000_000, - TimeUnit::Millisecond => 1_000_000, - TimeUnit::Microsecond => 1_000, - TimeUnit::Nanosecond => 1, + let array = array.as_string::(); + let out: PrimitiveArray = match to_tz { + Some(tz) => { + let tz: Tz = tz.as_ref().parse()?; + cast_string_to_timestamp_impl(array, &tz, cast_options)? + } + None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?, }; + Ok(Arc::new(out.with_timezone_opt(to_tz.clone()))) +} - let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| string_to_timestamp_nanos(v).ok().map(|t| t / scale_factor)) +fn cast_string_to_timestamp_impl< + O: OffsetSizeTrait, + T: ArrowTimestampType, + Tz: TimeZone, +>( + array: &GenericStringArray, + tz: &Tz, + cast_options: &CastOptions, +) -> Result, ArrowError> { + if cast_options.safe { + let iter = array.iter().map(|v| { + v.and_then(|v| { + let naive = string_to_datetime(tz, v).ok()?.naive_utc(); + T::make_value(naive) + }) }); // Benefit: // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. 
- unsafe { - PrimitiveArray::::from_trusted_len_iter(iter) - .with_timezone_opt(to_tz.clone()) - } + Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) }) } else { - let vec = string_array + let vec = array .iter() .map(|v| { - v.map(|v| string_to_timestamp_nanos(v).map(|t| t / scale_factor)) - .transpose() + v.map(|v| { + let naive = string_to_datetime(tz, v)?.naive_utc(); + T::make_value(naive).ok_or_else(|| { + ArrowError::CastError(format!( + "Overflow converting {naive} to {:?}", + T::UNIT + )) + }) + }) + .transpose() }) .collect::>, _>>()?; @@ -2699,13 +2712,8 @@ fn cast_string_to_timestamp< // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { - PrimitiveArray::::from_trusted_len_iter(vec.iter()) - .with_timezone_opt(to_tz.clone()) - } - }; - - Ok(Arc::new(array) as ArrayRef) + Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) }) + } } fn cast_string_to_year_month_interval( @@ -5018,6 +5026,14 @@ mod tests { } } + #[test] + fn test_cast_string_to_timestamp_overflow() { + let array = StringArray::from(vec!["9800-09-08T12:00:00.123456789"]); + let result = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); + let result = result.as_primitive::(); + assert_eq!(result.values(), &[247112596800]); + } + #[test] fn test_cast_string_to_date32() { let a1 = Arc::new(StringArray::from(vec![ @@ -8079,24 +8095,45 @@ mod tests { let array = Arc::new(valid) as ArrayRef; let b = cast_with_options( &array, - &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)), + &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz.clone())), &CastOptions { safe: false }, ) .unwrap(); - let c = b - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1672574706789000000, c.value(0)); - assert_eq!(1672571106789000000, c.value(1)); - assert_eq!(1672574706789000000, c.value(2)); - assert_eq!(1672574706789000000, c.value(3)); - assert_eq!(1672518906000000000, c.value(4)); - assert_eq!(1672518906000000000, c.value(5)); - assert_eq!(1672545906789000000, c.value(6)); - assert_eq!(1672545906000000000, c.value(7)); - assert_eq!(1672531200000000000, c.value(8)); + let tz = tz.as_ref().parse().unwrap(); + + let as_tz = |v: i64| { + as_datetime_with_timezone::(v, tz).unwrap() + }; + + let as_utc = |v: &i64| as_tz(*v).naive_utc().to_string(); + let as_local = |v: &i64| as_tz(*v).naive_local().to_string(); + + let values = b.as_primitive::().values(); + let utc_results: Vec<_> = values.iter().map(as_utc).collect(); + let local_results: Vec<_> = values.iter().map(as_local).collect(); + + // Absolute timestamps should be parsed preserving the same UTC instant + assert_eq!( + &utc_results[..6], + &[ + "2023-01-01 12:05:06.789".to_string(), + "2023-01-01 11:05:06.789".to_string(), + "2023-01-01 12:05:06.789".to_string(), + "2023-01-01 12:05:06.789".to_string(), + "2022-12-31 20:35:06".to_string(), + "2022-12-31 20:35:06".to_string(), + ] + ); + // Non-absolute timestamps should be parsed preserving the same local instant + assert_eq!( + &local_results[6..], + &[ + "2023-01-01 04:05:06.789".to_string(), + "2023-01-01 04:05:06".to_string(), + "2023-01-01 00:00:00".to_string() + ] + ) } test_tz("+00:00".into()); From 17ef9ad537181709a745943f9ef755b967ff4755 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 7 Apr 2023 16:21:03 +0100 Subject: [PATCH 0799/1411] Final changelog tweaks for arrow 37.0.0 (#4032) * Final changelog tweaks for arrow 37.0.0 * Final update --- 
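The timestamp cast fix above replaces the old parse-to-nanoseconds-then-divide conversion with per-unit `make_value` helpers built on checked arithmetic, which is why `test_cast_string_to_timestamp_overflow` can cast a year-9800 date without overflowing. The standalone sketch below mirrors that idea; it is not arrow-rs source, `make_millis` is an illustrative stand-in for `TimestampMillisecondType::make_value`, and the expected seconds value is taken from the overflow test above.

```rust
// Minimal sketch (assumes a standalone crate with a chrono dependency):
// the former cast path converted every parsed string to i64 nanoseconds and
// divided by a scale factor, so dates beyond ~2262 overflowed before the
// division ever ran. Building the target unit directly avoids that.
use chrono::NaiveDate;

/// Illustrative stand-in mirroring TimestampMillisecondType::make_value.
fn make_millis(naive: chrono::NaiveDateTime) -> Option<i64> {
    let millis = naive.timestamp().checked_mul(1_000)?;
    millis.checked_add(naive.timestamp_subsec_millis() as i64)
}

fn main() {
    // Year 9800 is not representable as i64 nanoseconds since the epoch,
    // but it fits comfortably in seconds or milliseconds.
    let naive = NaiveDate::from_ymd_opt(9800, 9, 8)
        .unwrap()
        .and_hms_opt(12, 0, 0)
        .unwrap();
    // Matches the value asserted in test_cast_string_to_timestamp_overflow.
    assert_eq!(naive.timestamp(), 247_112_596_800);
    assert_eq!(make_millis(naive), Some(247_112_596_800_000));
}
```

The reworked kernel also keeps the requested timezone on the resulting array via `with_timezone_opt`: as the timezone test above asserts, strings carrying an explicit offset preserve the same UTC instant, while zone-less strings are interpreted as local instants in the target timezone.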
CHANGELOG.md | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 18212cbb78e1..df0f088cdd0e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ **Breaking changes:** +- Fix timestamp handling in cast kernel \(\#1936\) \(\#4033\) [\#4034](https://github.com/apache/arrow-rs/pull/4034) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Update tonic 0.9.1 [\#4011](https://github.com/apache/arrow-rs/pull/4011) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) - Use FieldRef in DataType \(\#3955\) [\#3983](https://github.com/apache/arrow-rs/pull/3983) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Store Timezone as Arc\ [\#3976](https://github.com/apache/arrow-rs/pull/3976) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) @@ -35,31 +36,34 @@ **Implemented enhancements:** -- Support Decimals cast to Utf8/LargeUtf [\#3991](https://github.com/apache/arrow-rs/issues/3991) +- Support Decimals cast to Utf8/LargeUtf [\#3991](https://github.com/apache/arrow-rs/issues/3991) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Support Date32/Date64 minus Interval [\#3962](https://github.com/apache/arrow-rs/issues/3962) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Reduce Cloning of Field [\#3955](https://github.com/apache/arrow-rs/issues/3955) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Consider renaming rather than removing Decoder [\#3949](https://github.com/apache/arrow-rs/issues/3949) +- Reduce Cloning of Field [\#3955](https://github.com/apache/arrow-rs/issues/3955) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support Deserializing Serde DataTypes to Arrow [\#3949](https://github.com/apache/arrow-rs/issues/3949) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Add multiply\_fixed\_point [\#3946](https://github.com/apache/arrow-rs/issues/3946) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Strongly Typed Array Slicing [\#3929](https://github.com/apache/arrow-rs/issues/3929) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Make it easier to match FlightSQL messages [\#3874](https://github.com/apache/arrow-rs/issues/3874) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Strongly Typed Array Slicing [\#3929](https://github.com/apache/arrow-rs/issues/3929) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make it easier to match FlightSQL messages [\#3874](https://github.com/apache/arrow-rs/issues/3874) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] - Support Casting Between Binary / LargeBinary and FixedSizeBinary [\#3826](https://github.com/apache/arrow-rs/issues/3826) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- f16::ZERO and f16::ONE are mixed up 
[\#4016](https://github.com/apache/arrow-rs/issues/4016) -- Handle overflow precision when casting from integer to decimal [\#3995](https://github.com/apache/arrow-rs/issues/3995) +- Incorrect Overflow Casting String to Timestamp [\#4033](https://github.com/apache/arrow-rs/issues/4033) +- f16::ZERO and f16::ONE are mixed up [\#4016](https://github.com/apache/arrow-rs/issues/4016) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Handle overflow precision when casting from integer to decimal [\#3995](https://github.com/apache/arrow-rs/issues/3995) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - PrimitiveDictionaryBuilder.finish should use actual value type [\#3971](https://github.com/apache/arrow-rs/issues/3971) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- RecordBatch From StructArray Silently Discards Nulls [\#3952](https://github.com/apache/arrow-rs/issues/3952) +- RecordBatch From StructArray Silently Discards Nulls [\#3952](https://github.com/apache/arrow-rs/issues/3952) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - I256 Checked Subtraction Overflows for i256::MINUS\_ONE [\#3942](https://github.com/apache/arrow-rs/issues/3942) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - I256 Checked Multiply Overflows for i256::MIN [\#3941](https://github.com/apache/arrow-rs/issues/3941) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Closed issues:** -- Remove non-existent `js` feature from README [\#4000](https://github.com/apache/arrow-rs/issues/4000) +- Remove non-existent `js` feature from README [\#4000](https://github.com/apache/arrow-rs/issues/4000) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Support take on MapArray [\#3875](https://github.com/apache/arrow-rs/issues/3875) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** +- Prep for 37.0.0 [\#4031](https://github.com/apache/arrow-rs/pull/4031) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) +- Add RecordBatch::with\_schema [\#4028](https://github.com/apache/arrow-rs/pull/4028) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Only require compatible batch schema in ArrowWriter [\#4027](https://github.com/apache/arrow-rs/pull/4027) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) - Add Fields::contains [\#4026](https://github.com/apache/arrow-rs/pull/4026) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) - Minor: add methods "is\_positive" and "signum" to i256 [\#4024](https://github.com/apache/arrow-rs/pull/4024) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) From 6e9751f6b33e17cb811bd89ed94f29b92707e248 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 7 Apr 2023 17:19:30 +0100 Subject: [PATCH 0800/1411] Split object_store into separate workspace (#4036) --- Cargo.toml | 14 +++++++++----- dev/release/create-tarball.sh | 5 +---- dev/release/verify-release-candidate.sh | 15 ++------------- parquet/Cargo.toml | 3 ++- 4 files changed, 14 insertions(+), 23 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7d5ff0c7f949..34a7951b3937 100644 --- 
a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ # under the License. [workspace] + members = [ "arrow", "arrow-arith", @@ -35,7 +36,6 @@ members = [ "arrow-schema", "arrow-select", "arrow-string", - "object_store", "parquet", "parquet_derive", "parquet_derive_test", @@ -51,10 +51,14 @@ members = [ # resolver = "2" -# this package is excluded because it requires different compilation flags, thereby significantly changing -# how it is compiled within the workspace, causing the whole workspace to be compiled from scratch -# this way, this is a stand-alone package that compiles independently of the others. -exclude = ["arrow-pyarrow-integration-testing"] +exclude = [ + # arrow-pyarrow-integration-testing is excluded because it requires different compilation flags, thereby + # significantly changing how it is compiled within the workspace, causing the whole workspace to be compiled from + # scratch this way, this is a stand-alone package that compiles independently of the others. + "arrow-pyarrow-integration-testing", + # object_store is excluded because it follows a separate release cycle from the other arrow crates + "object_store" +] [workspace.package] version = "37.0.0" diff --git a/dev/release/create-tarball.sh b/dev/release/create-tarball.sh index 0463f89f77ae..a77ddbe75701 100755 --- a/dev/release/create-tarball.sh +++ b/dev/release/create-tarball.sh @@ -117,14 +117,11 @@ echo "---------------------------------------------------------" # create containing the files in git at $release_hash # the files in the tarball are prefixed with {tag} (e.g. 4.0.1) -# use --delete to filter out: -# 1. `object_store` files -# 2. Workspace `Cargo.toml` file (which refers to object_store) +# use --delete to filter out `object_store` files mkdir -p ${distdir} (cd "${SOURCE_TOP_DIR}" && \ git archive ${release_hash} --prefix ${release}/ \ | $tar --delete ${release}/'object_store' \ - | $tar --delete ${release}/'Cargo.toml' \ | gzip > ${tarball}) echo "Running rat license checker on ${tarball}" diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index c42391222fce..2629d362aaff 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -105,10 +105,7 @@ test_source_distribution() { # raises on any formatting errors rustup component add rustfmt --toolchain stable - (cd arrow && cargo fmt --check) - (cd arrow-flight && cargo fmt --check) - (cd parquet && cargo fmt --check) - (cd parquet_derive && cargo fmt --check) + cargo fmt --all -- --check # Clone testing repositories if not cloned already git clone https://github.com/apache/arrow-testing.git arrow-testing-data @@ -116,15 +113,7 @@ test_source_distribution() { export ARROW_TEST_DATA=$PWD/arrow-testing-data/data export PARQUET_TEST_DATA=$PWD/parquet-testing-data/data - (cd arrow && cargo build && cargo test) - (cd arrow-flight && cargo build && cargo test) - # To avoid https://github.com/apache/arrow-rs/issues/3410, - # remove path reference from parquet: - # object_store = { version = "0.5", path = "../object_store", default-features = false, optional = true } - # object_store = { version = "0.5", default-features = false, optional = true } - sed -i -e 's/\(^object_store.*\)\(path = ".*", \)/\1/g' parquet/Cargo.toml - (cd parquet && cargo build && cargo test) - (cd parquet_derive && cargo build && cargo test) + cargo test --all # verify that the leaf crates can be published to crates.io # we can't verify crates that depend on others diff --git a/parquet/Cargo.toml 
b/parquet/Cargo.toml index be61a7cf1435..ef5ea8cd15d9 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -43,7 +43,8 @@ arrow-data = { workspace = true, optional = true } arrow-schema = { workspace = true, optional = true } arrow-select = { workspace = true, optional = true } arrow-ipc = { workspace = true, optional = true } -object_store = { version = "0.5", path = "../object_store", default-features = false, optional = true } +# Intentionally not a path dependency as object_store is released separately +object_store = { version = "0.5", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } thrift = { version = "0.17", default-features = false } From ff670c58c619f058629d3dadc47e60edbe5b9258 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 7 Apr 2023 21:40:53 +0100 Subject: [PATCH 0801/1411] Fix object_store CI (#4037) --- .github/workflows/object_store.yml | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 8e97c4440567..65c78df18466 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -39,6 +39,9 @@ jobs: runs-on: ubuntu-latest container: image: amd64/rust + defaults: + run: + working-directory: object_store steps: - uses: actions/checkout@v3 - name: Setup Rust toolchain @@ -50,19 +53,19 @@ jobs: # features that might be enabled by dev-dependencies of other # targets. - name: Run clippy with default features - run: cargo clippy -p object_store -- -D warnings + run: cargo clippy -- -D warnings - name: Run clippy with aws feature - run: cargo clippy -p object_store --features aws -- -D warnings + run: cargo clippy --features aws -- -D warnings - name: Run clippy with aws_profile feature - run: cargo clippy -p object_store --features aws_profile -- -D warnings + run: cargo clippy --features aws_profile -- -D warnings - name: Run clippy with gcp feature - run: cargo clippy -p object_store --features gcp -- -D warnings + run: cargo clippy --features gcp -- -D warnings - name: Run clippy with azure feature - run: cargo clippy -p object_store --features azure -- -D warnings + run: cargo clippy --features azure -- -D warnings - name: Run clippy with all features - run: cargo clippy -p object_store --all-features -- -D warnings + run: cargo clippy --all-features -- -D warnings - name: Run clippy with all features and all targets - run: cargo clippy -p object_store --all-features --all-targets -- -D warnings + run: cargo clippy --all-features --all-targets -- -D warnings # test the crate # This runs outside a container to workaround lack of support for passing arguments @@ -70,6 +73,9 @@ jobs: linux-test: name: Emulator Tests runs-on: ubuntu-latest + defaults: + run: + working-directory: object_store env: # Disable full debug symbol generation to speed up CI build and keep memory down # "1" means line tables only, which is useful for panic tracebacks. 
@@ -128,7 +134,7 @@ jobs: OBJECT_STORE_AWS_ACCESS_KEY_ID: test OBJECT_STORE_AWS_SECRET_ACCESS_KEY: test OBJECT_STORE_AWS_ENDPOINT: http://localhost:4566 - run: cargo test -p object_store --features=aws,azure,gcp,http + run: cargo test --features=aws,azure,gcp,http # test the object_store crate builds against wasm32 in stable rust wasm32-build: @@ -136,6 +142,9 @@ jobs: runs-on: ubuntu-latest container: image: amd64/rust + defaults: + run: + working-directory: object_store steps: - uses: actions/checkout@v3 with: @@ -145,6 +154,6 @@ jobs: with: target: wasm32-unknown-unknown,wasm32-wasi - name: Build wasm32-unknown-unknown - run: cargo build -p object_store --target wasm32-unknown-unknown + run: cargo build --target wasm32-unknown-unknown - name: Build wasm32-wasi - run: cargo build -p object_store --target wasm32-wasi \ No newline at end of file + run: cargo build --target wasm32-wasi \ No newline at end of file From 2e698480f4ea081a0009b99e8c20c426ee6a7fa3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 9 Apr 2023 20:32:41 +0100 Subject: [PATCH 0802/1411] Use reqwest build_split (#4039) * Use reqwest build_split * Fix typo --- object_store/src/aws/credential.rs | 21 ++++------------ object_store/src/azure/credential.rs | 36 ++++++++++++---------------- 2 files changed, 19 insertions(+), 38 deletions(-) diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index 183e8434650b..c4cb7cfe1a01 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -81,8 +81,6 @@ const HASH_HEADER: &str = "x-amz-content-sha256"; const TOKEN_HEADER: &str = "x-amz-security-token"; const AUTH_HEADER: &str = "authorization"; -const ALL_HEADERS: &[&str; 4] = &[DATE_HEADER, HASH_HEADER, TOKEN_HEADER, AUTH_HEADER]; - impl<'a> RequestSigner<'a> { fn sign(&self, request: &mut Request, pre_calculated_digest: Option>) { if let Some(ref token) = self.credential.token { @@ -175,20 +173,15 @@ pub trait CredentialExt { impl CredentialExt for RequestBuilder { fn with_aws_sigv4( - mut self, + self, credential: &AwsCredential, region: &str, service: &str, sign_payload: bool, payload_sha256: Option>, ) -> Self { - // Hack around lack of access to underlying request - // https://github.com/seanmonstar/reqwest/issues/1212 - let mut request = self - .try_clone() - .expect("not stream") - .build() - .expect("request valid"); + let (client, request) = self.build_split(); + let mut request = request.expect("request valid"); let date = Utc::now(); let signer = RequestSigner { @@ -200,13 +193,7 @@ impl CredentialExt for RequestBuilder { }; signer.sign(&mut request, payload_sha256); - - for header in ALL_HEADERS { - if let Some(val) = request.headers_mut().remove(*header) { - self = self.header(*header, val) - } - } - self + Self::from_parts(client, request) } } diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index 9e072229ffa9..0196d93d8d2a 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -124,16 +124,11 @@ impl CredentialExt for RequestBuilder { .header(DATE, &date_val) .header(&VERSION, &AZURE_VERSION); - // Hack around lack of access to underlying request - // https://github.com/seanmonstar/reqwest/issues/1212 - let request = self - .try_clone() - .expect("not stream") - .build() - .expect("request valid"); - match credential { AzureCredential::AccessKey(key) => { + let (client, request) = self.build_split(); + let mut 
request = request.expect("request valid"); + let signature = generate_authorization( request.headers(), request.url(), @@ -141,22 +136,21 @@ impl CredentialExt for RequestBuilder { account, key.as_str(), ); - self = self - // "signature" is a base 64 encoded string so it should never contain illegal characters. - .header( - AUTHORIZATION, - HeaderValue::from_str(signature.as_str()).unwrap(), - ); + + // "signature" is a base 64 encoded string so it should never + // contain illegal characters + request.headers_mut().append( + AUTHORIZATION, + HeaderValue::from_str(signature.as_str()).unwrap(), + ); + + Self::from_parts(client, request) } AzureCredential::AuthorizationToken(token) => { - self = self.header(AUTHORIZATION, token); + self.header(AUTHORIZATION, token) } - AzureCredential::SASToken(query_pairs) => { - self = self.query(&query_pairs); - } - }; - - self + AzureCredential::SASToken(query_pairs) => self.query(&query_pairs), + } } } From fec282fd43add7df97ca8f58eb5eaa42eb9c928d Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Mon, 10 Apr 2023 08:05:27 +0900 Subject: [PATCH 0803/1411] refactor: refactor infer_json_scheam reader type from BufReader to BufRead (#4041) --- arrow-json/src/reader.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs index 39f829052f59..d343d3ed986d 100644 --- a/arrow-json/src/reader.rs +++ b/arrow-json/src/reader.rs @@ -187,17 +187,17 @@ fn generate_schema(spec: HashMap) -> Result { - reader: &'a mut BufReader, +pub struct ValueIter<'a, R: BufRead> { + reader: &'a mut R, max_read_records: Option, record_count: usize, // reuse line buffer to avoid allocation on each record line_buf: String, } -impl<'a, R: Read> ValueIter<'a, R> { +impl<'a, R: BufRead> ValueIter<'a, R> { /// Creates a new `ValueIter` - pub fn new(reader: &'a mut BufReader, max_read_records: Option) -> Self { + pub fn new(reader: &'a mut R, max_read_records: Option) -> Self { Self { reader, max_read_records, @@ -207,7 +207,7 @@ impl<'a, R: Read> ValueIter<'a, R> { } } -impl<'a, R: Read> Iterator for ValueIter<'a, R> { +impl<'a, R: BufRead> Iterator for ValueIter<'a, R> { type Item = Result; fn next(&mut self) -> Option { @@ -303,8 +303,8 @@ pub fn infer_json_schema_from_seekable( /// // seek back to start so that the original file is usable again /// file.seek(SeekFrom::Start(0)).unwrap(); /// ``` -pub fn infer_json_schema( - reader: &mut BufReader, +pub fn infer_json_schema( + reader: &mut R, max_read_records: Option, ) -> Result { infer_json_schema_from_iterator(ValueIter::new(reader, max_read_records)) From 6c5c34b6fb3d2fa700856077b6ea2555ff5fb598 Mon Sep 17 00:00:00 2001 From: "r.4ntix" Date: Mon, 10 Apr 2023 23:13:00 +0800 Subject: [PATCH 0804/1411] Add get_config_value to AWS/Azure/GCP Builders (#4035) * minor: make struct fields of Builders(S3/Azure/GCS) to pub * minor: use `get_config_value` method instead of public fields * fix clippy error --- object_store/src/aws/checksum.rs | 10 ++- object_store/src/aws/mod.rs | 103 +++++++++++++++++++++++++++++++ object_store/src/azure/mod.rs | 81 ++++++++++++++++++++++++ object_store/src/gcp/mod.rs | 56 +++++++++++++++++ 4 files changed, 249 insertions(+), 1 deletion(-) diff --git a/object_store/src/aws/checksum.rs b/object_store/src/aws/checksum.rs index ae35f0612456..c787c28a8df0 100644 --- a/object_store/src/aws/checksum.rs +++ b/object_store/src/aws/checksum.rs @@ -39,11 +39,19 @@ impl Checksum { } } +impl std::fmt::Display for Checksum { + fn fmt(&self, f: &mut 
std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self { + Self::SHA256 => write!(f, "sha256"), + } + } +} + impl TryFrom<&String> for Checksum { type Error = (); fn try_from(value: &String) -> Result { - match value.as_str() { + match value.to_lowercase().as_str() { "sha256" => Ok(Self::SHA256), _ => Err(()), } diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index f88960b4b338..de62360d0522 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -400,20 +400,35 @@ impl CloudMultiPartUploadImpl for S3MultiPartUpload { /// ``` #[derive(Debug, Default, Clone)] pub struct AmazonS3Builder { + /// Access key id access_key_id: Option, + /// Secret access_key secret_access_key: Option, + /// Region region: Option, + /// Bucket name bucket_name: Option, + /// Endpoint for communicating with AWS S3 endpoint: Option, + /// Token to use for requests token: Option, + /// Url url: Option, + /// Retry config retry_config: RetryConfig, + /// When set to true, fallback to IMDSv1 imdsv1_fallback: bool, + /// When set to true, virtual hosted style request has to be used virtual_hosted_style_request: bool, + /// When set to true, unsigned payload option has to be used unsigned_payload: bool, + /// Checksum algorithm which has to be used for object integrity check during upload checksum_algorithm: Option, + /// Metadata endpoint, see metadata_endpoint: Option, + /// Profile name, see profile: Option, + /// Client options client_options: ClientOptions, } @@ -751,6 +766,38 @@ impl AmazonS3Builder { Ok(self) } + /// Get config value via a [`AmazonS3ConfigKey`]. + /// + /// # Example + /// ``` + /// use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; + /// + /// let builder = AmazonS3Builder::from_env() + /// .with_bucket_name("foo"); + /// let bucket_name = builder.get_config_value(&AmazonS3ConfigKey::Bucket).unwrap_or_default(); + /// assert_eq!("foo", &bucket_name); + /// ``` + pub fn get_config_value(&self, key: &AmazonS3ConfigKey) -> Option { + match key { + AmazonS3ConfigKey::AccessKeyId => self.access_key_id.clone(), + AmazonS3ConfigKey::SecretAccessKey => self.secret_access_key.clone(), + AmazonS3ConfigKey::Region | AmazonS3ConfigKey::DefaultRegion => { + self.region.clone() + } + AmazonS3ConfigKey::Bucket => self.bucket_name.clone(), + AmazonS3ConfigKey::Endpoint => self.endpoint.clone(), + AmazonS3ConfigKey::Token => self.token.clone(), + AmazonS3ConfigKey::ImdsV1Fallback => Some(self.imdsv1_fallback.to_string()), + AmazonS3ConfigKey::VirtualHostedStyleRequest => { + Some(self.virtual_hosted_style_request.to_string()) + } + AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint.clone(), + AmazonS3ConfigKey::Profile => self.profile.clone(), + AmazonS3ConfigKey::UnsignedPayload => Some(self.unsigned_payload.to_string()), + AmazonS3ConfigKey::Checksum => self.checksum_algorithm.map(|v| v.to_string()), + } + } + /// Sets properties on this builder based on a URL /// /// This is a separate member function to allow fallible computation to @@ -1272,6 +1319,62 @@ mod tests { assert!(builder.unsigned_payload); } + #[test] + fn s3_test_config_get_value() { + let aws_access_key_id = "object_store:fake_access_key_id".to_string(); + let aws_secret_access_key = "object_store:fake_secret_key".to_string(); + let aws_default_region = "object_store:fake_default_region".to_string(); + let aws_endpoint = "object_store:fake_endpoint".to_string(); + let aws_session_token = "object_store:fake_session_token".to_string(); + let options = HashMap::from([ + 
(AmazonS3ConfigKey::AccessKeyId, aws_access_key_id.clone()), + ( + AmazonS3ConfigKey::SecretAccessKey, + aws_secret_access_key.clone(), + ), + (AmazonS3ConfigKey::DefaultRegion, aws_default_region.clone()), + (AmazonS3ConfigKey::Endpoint, aws_endpoint.clone()), + (AmazonS3ConfigKey::Token, aws_session_token.clone()), + (AmazonS3ConfigKey::UnsignedPayload, "true".to_string()), + ]); + + let builder = AmazonS3Builder::new().try_with_options(&options).unwrap(); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::AccessKeyId) + .unwrap(), + aws_access_key_id + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::SecretAccessKey) + .unwrap(), + aws_secret_access_key + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::DefaultRegion) + .unwrap(), + aws_default_region + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::Endpoint) + .unwrap(), + aws_endpoint + ); + assert_eq!( + builder.get_config_value(&AmazonS3ConfigKey::Token).unwrap(), + aws_session_token + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::UnsignedPayload) + .unwrap(), + "true" + ); + } + #[test] fn s3_test_config_fallible_options() { let aws_access_key_id = "object_store:fake_access_key_id".to_string(); diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index c2e72f214d73..11350a202c72 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -394,24 +394,43 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { /// ``` #[derive(Default, Clone)] pub struct MicrosoftAzureBuilder { + /// Account name account_name: Option, + /// Access key access_key: Option, + /// Container name container_name: Option, + /// Bearer token bearer_token: Option, + /// Client id client_id: Option, + /// Client secret client_secret: Option, + /// Tenant id tenant_id: Option, + /// Query pairs for shared access signature authorization sas_query_pairs: Option>, + /// Shared access signature sas_key: Option, + /// Authority host authority_host: Option, + /// Url url: Option, + /// When set to true, azurite storage emulator has to be used use_emulator: bool, + /// Msi endpoint for acquiring managed identity token msi_endpoint: Option, + /// Object id for use with managed identity authentication object_id: Option, + /// Msi resource id for use with managed identity authentication msi_resource_id: Option, + /// File containing token for Azure AD workload identity federation federated_token_file: Option, + /// When set to true, azure cli has to be used for acquiring access token use_azure_cli: bool, + /// Retry config retry_config: RetryConfig, + /// Client options client_options: ClientOptions, } @@ -747,6 +766,35 @@ impl MicrosoftAzureBuilder { Ok(self) } + /// Get config value via a [`AzureConfigKey`]. 
+ /// + /// # Example + /// ``` + /// use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; + /// + /// let builder = MicrosoftAzureBuilder::from_env() + /// .with_account("foo"); + /// let account_name = builder.get_config_value(&AzureConfigKey::AccountName).unwrap_or_default(); + /// assert_eq!("foo", &account_name); + /// ``` + pub fn get_config_value(&self, key: &AzureConfigKey) -> Option { + match key { + AzureConfigKey::AccountName => self.account_name.clone(), + AzureConfigKey::AccessKey => self.access_key.clone(), + AzureConfigKey::ClientId => self.client_id.clone(), + AzureConfigKey::ClientSecret => self.client_secret.clone(), + AzureConfigKey::AuthorityId => self.tenant_id.clone(), + AzureConfigKey::SasKey => self.sas_key.clone(), + AzureConfigKey::Token => self.bearer_token.clone(), + AzureConfigKey::UseEmulator => Some(self.use_emulator.to_string()), + AzureConfigKey::MsiEndpoint => self.msi_endpoint.clone(), + AzureConfigKey::ObjectId => self.object_id.clone(), + AzureConfigKey::MsiResourceId => self.msi_resource_id.clone(), + AzureConfigKey::FederatedTokenFile => self.federated_token_file.clone(), + AzureConfigKey::UseAzureCli => Some(self.use_azure_cli.to_string()), + } + } + /// Sets properties on this builder based on a URL /// /// This is a separate member function to allow fallible computation to @@ -1252,6 +1300,39 @@ mod tests { assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); } + #[test] + fn azure_test_config_get_value() { + let azure_client_id = "object_store:fake_access_key_id".to_string(); + let azure_storage_account_name = "object_store:fake_secret_key".to_string(); + let azure_storage_token = "object_store:fake_default_region".to_string(); + let options = HashMap::from([ + (AzureConfigKey::ClientId, azure_client_id.clone()), + ( + AzureConfigKey::AccountName, + azure_storage_account_name.clone(), + ), + (AzureConfigKey::Token, azure_storage_token.clone()), + ]); + + let builder = MicrosoftAzureBuilder::new() + .try_with_options(&options) + .unwrap(); + assert_eq!( + builder.get_config_value(&AzureConfigKey::ClientId).unwrap(), + azure_client_id + ); + assert_eq!( + builder + .get_config_value(&AzureConfigKey::AccountName) + .unwrap(), + azure_storage_account_name + ); + assert_eq!( + builder.get_config_value(&AzureConfigKey::Token).unwrap(), + azure_storage_token + ); + } + #[test] fn azure_test_config_fallible_options() { let azure_client_id = "object_store:fake_access_key_id".to_string(); diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 5247693e6585..a6cf660220bd 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -768,12 +768,19 @@ impl ObjectStore for GoogleCloudStorage { /// ``` #[derive(Debug, Clone)] pub struct GoogleCloudStorageBuilder { + /// Bucket name bucket_name: Option, + /// Url url: Option, + /// Path to the service account file service_account_path: Option, + /// The serialized service account key service_account_key: Option, + /// Path to the application credentials file. application_credentials_path: Option, + /// Retry config retry_config: RetryConfig, + /// Client options client_options: ClientOptions, } @@ -983,6 +990,28 @@ impl GoogleCloudStorageBuilder { Ok(self) } + /// Get config value via a [`GoogleConfigKey`]. 
+ /// + /// # Example + /// ``` + /// use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; + /// + /// let builder = GoogleCloudStorageBuilder::from_env() + /// .with_service_account_key("foo"); + /// let service_account_key = builder.get_config_value(&GoogleConfigKey::ServiceAccountKey).unwrap_or_default(); + /// assert_eq!("foo", &service_account_key); + /// ``` + pub fn get_config_value(&self, key: &GoogleConfigKey) -> Option { + match key { + GoogleConfigKey::ServiceAccount => self.service_account_path.clone(), + GoogleConfigKey::ServiceAccountKey => self.service_account_key.clone(), + GoogleConfigKey::Bucket => self.bucket_name.clone(), + GoogleConfigKey::ApplicationCredentials => { + self.application_credentials_path.clone() + } + } + } + /// Sets properties on this builder based on a URL /// /// This is a separate member function to allow fallible computation to @@ -1452,6 +1481,33 @@ mod test { assert_eq!(builder.bucket_name.unwrap(), google_bucket_name.as_str()); } + #[test] + fn gcs_test_config_get_value() { + let google_service_account = "object_store:fake_service_account".to_string(); + let google_bucket_name = "object_store:fake_bucket".to_string(); + let options = HashMap::from([ + ( + GoogleConfigKey::ServiceAccount, + google_service_account.clone(), + ), + (GoogleConfigKey::Bucket, google_bucket_name.clone()), + ]); + + let builder = GoogleCloudStorageBuilder::new() + .try_with_options(&options) + .unwrap(); + assert_eq!( + builder + .get_config_value(&GoogleConfigKey::ServiceAccount) + .unwrap(), + google_service_account + ); + assert_eq!( + builder.get_config_value(&GoogleConfigKey::Bucket).unwrap(), + google_bucket_name + ); + } + #[test] fn gcs_test_config_fallible_options() { let google_service_account = "object_store:fake_service_account".to_string(); From 9a4b78780ba106ccc73d7b08a7c3be9aab028806 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 10 Apr 2023 21:11:24 +0100 Subject: [PATCH 0805/1411] Fix sparse union array equality (#4044) (#4045) --- arrow-data/src/equal/union.rs | 8 +++++++- arrow/tests/array_equal.rs | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/arrow-data/src/equal/union.rs b/arrow-data/src/equal/union.rs index 4f04bc287aa8..5869afc30dbe 100644 --- a/arrow-data/src/equal/union.rs +++ b/arrow-data/src/equal/union.rs @@ -70,7 +70,13 @@ fn equal_sparse( .iter() .zip(rhs.child_data()) .all(|(lhs_values, rhs_values)| { - equal_range(lhs_values, rhs_values, lhs_start, rhs_start, len) + equal_range( + lhs_values, + rhs_values, + lhs_start + lhs.offset(), + rhs_start + rhs.offset(), + len, + ) }) } diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index 93296c3b0e43..83a280db67b8 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -1155,6 +1155,22 @@ fn test_union_equal_sparse() { test_equal(&union1, &union4, false); } +#[test] +fn test_union_equal_sparse_slice() { + let mut builder = UnionBuilder::new_sparse(); + builder.append::("a", 1).unwrap(); + builder.append::("a", 2).unwrap(); + builder.append::("b", 3).unwrap(); + let a1 = builder.build().unwrap(); + + let mut builder = UnionBuilder::new_sparse(); + builder.append::("a", 2).unwrap(); + builder.append::("b", 3).unwrap(); + let a2 = builder.build().unwrap(); + + test_equal(&a1.slice(1, 2), &a2, true) +} + #[test] fn test_boolean_slice() { let array = BooleanArray::from(vec![true; 32]); From 884ab4edf2c89d527e3408de1661bb1555640e8b Mon Sep 17 
00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 11 Apr 2023 14:12:22 +0100 Subject: [PATCH 0806/1411] Document Async decoder usage (#4043) (#78) (#4046) * Document Async decoder usage (#4043) (#78) * Review feedback * Review feedback --- arrow-csv/Cargo.toml | 3 + arrow-csv/src/reader/mod.rs | 83 +++++++++++++++++++++++++++ arrow-json/Cargo.toml | 3 + arrow-json/src/raw/mod.rs | 108 ++++++++++++++++++++++++++++++++++++ 4 files changed, 197 insertions(+) diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index d4526ba32cf2..1f1a762d5065 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -48,3 +48,6 @@ regex = { version = "1.7.0", default-features = false, features = ["std", "unico [dev-dependencies] tempfile = "3.3" +futures = "0.3" +tokio = { version = "1.27", default-features = false, features = ["io-util"] } +bytes = "1.4" diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 3fa712819a92..5bfcbc6452fb 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -17,6 +17,8 @@ //! CSV Reader //! +//! # Basic Usage +//! //! This CSV reader allows CSV files to be read into the Arrow memory model. Records are //! loaded in batches and are then converted from row-based data to columnar data. //! @@ -39,6 +41,87 @@ //! let mut csv = Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None); //! let batch = csv.next().unwrap().unwrap(); //! ``` +//! +//! # Async Usage +//! +//! The lower-level [`Decoder`] can be integrated with various forms of async data streams, +//! and is designed to be agnostic to the various different kinds of async IO primitives found +//! within the Rust ecosystem. +//! +//! For example, see below for how it can be used with an arbitrary `Stream` of `Bytes` +//! +//! ``` +//! # use std::task::{Poll, ready}; +//! # use bytes::{Buf, Bytes}; +//! # use arrow_schema::ArrowError; +//! # use futures::stream::{Stream, StreamExt}; +//! # use arrow_array::RecordBatch; +//! # use arrow_csv::reader::Decoder; +//! # +//! fn decode_stream + Unpin>( +//! mut decoder: Decoder, +//! mut input: S, +//! ) -> impl Stream> { +//! let mut buffered = Bytes::new(); +//! futures::stream::poll_fn(move |cx| { +//! loop { +//! if buffered.is_empty() { +//! if let Some(b) = ready!(input.poll_next_unpin(cx)) { +//! buffered = b; +//! } +//! // Note: don't break on `None` as the decoder needs +//! // to be called with an empty array to delimit the +//! // final record +//! } +//! let decoded = match decoder.decode(buffered.as_ref()) { +//! Ok(0) => break, +//! Ok(decoded) => decoded, +//! Err(e) => return Poll::Ready(Some(Err(e))), +//! }; +//! buffered.advance(decoded); +//! } +//! +//! Poll::Ready(decoder.flush().transpose()) +//! }) +//! } +//! +//! ``` +//! +//! In a similar vein, it can also be used with tokio-based IO primitives +//! +//! ``` +//! # use std::pin::Pin; +//! # use std::task::{Poll, ready}; +//! # use futures::Stream; +//! # use tokio::io::AsyncBufRead; +//! # use arrow_array::RecordBatch; +//! # use arrow_csv::reader::Decoder; +//! # use arrow_schema::ArrowError; +//! fn decode_stream( +//! mut decoder: Decoder, +//! mut reader: R, +//! ) -> impl Stream> { +//! futures::stream::poll_fn(move |cx| { +//! loop { +//! let b = match ready!(Pin::new(&mut reader).poll_fill_buf(cx)) { +//! Ok(b) => b, +//! Err(e) => return Poll::Ready(Some(Err(e.into()))), +//! }; +//! let decoded = match decoder.decode(b) { +//! 
// Note: the decoder needs to be called with an empty +//! // array to delimit the final record +//! Ok(0) => break, +//! Ok(decoded) => decoded, +//! Err(e) => return Poll::Ready(Some(Err(e))), +//! }; +//! Pin::new(&mut reader).consume(decoded); +//! } +//! +//! Poll::Ready(decoder.flush().transpose()) +//! }) +//! } +//! ``` +//! mod records; diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 453e4aa35182..d9b3a0df9c87 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -51,3 +51,6 @@ lexical-core = { version = "0.8", default-features = false } tempfile = "3.3" flate2 = { version = "1", default-features = false, features = ["rust_backend"] } serde = { version = "1.0", default-features = false, features = ["derive"] } +futures = "0.3" +tokio = { version = "1.27", default-features = false, features = ["io-util"] } +bytes = "1.4" diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index f1f1ffb779d0..c195524766c0 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -18,6 +18,114 @@ //! A faster JSON reader that will eventually replace [`Reader`] //! //! [`Reader`]: crate::reader::Reader +//! +//! # Basic Usage +//! +//! [`RawReader`] can be used directly with synchronous data sources, such as [`std::fs::File`] +//! +//! ``` +//! # use arrow_schema::*; +//! # use std::fs::File; +//! # use std::io::BufReader; +//! # use std::sync::Arc; +//! +//! let schema = Arc::new(Schema::new(vec![ +//! Field::new("a", DataType::Float64, false), +//! Field::new("b", DataType::Float64, false), +//! Field::new("c", DataType::Boolean, true), +//! ])); +//! +//! let file = File::open("test/data/basic.json").unwrap(); +//! +//! let mut json = arrow_json::RawReaderBuilder::new(schema).build(BufReader::new(file)).unwrap(); +//! let batch = json.next().unwrap().unwrap(); +//! ``` +//! +//! # Async Usage +//! +//! The lower-level [`RawDecoder`] can be integrated with various forms of async data streams, +//! and is designed to be agnostic to the various different kinds of async IO primitives found +//! within the Rust ecosystem. +//! +//! For example, see below for how it can be used with an arbitrary `Stream` of `Bytes` +//! +//! ``` +//! # use std::task::{Poll, ready}; +//! # use bytes::{Buf, Bytes}; +//! # use arrow_schema::ArrowError; +//! # use futures::stream::{Stream, StreamExt}; +//! # use arrow_array::RecordBatch; +//! # use arrow_json::RawDecoder; +//! # +//! fn decode_stream + Unpin>( +//! mut decoder: RawDecoder, +//! mut input: S, +//! ) -> impl Stream> { +//! let mut buffered = Bytes::new(); +//! futures::stream::poll_fn(move |cx| { +//! loop { +//! if buffered.is_empty() { +//! buffered = match ready!(input.poll_next_unpin(cx)) { +//! Some(b) => b, +//! None => break, +//! }; +//! } +//! let decoded = match decoder.decode(buffered.as_ref()) { +//! Ok(decoded) => decoded, +//! Err(e) => return Poll::Ready(Some(Err(e))), +//! }; +//! let read = buffered.len(); +//! buffered.advance(decoded); +//! if decoded != read { +//! break +//! } +//! } +//! +//! Poll::Ready(decoder.flush().transpose()) +//! }) +//! } +//! +//! ``` +//! +//! In a similar vein, it can also be used with tokio-based IO primitives +//! +//! ``` +//! # use std::sync::Arc; +//! # use arrow_schema::{DataType, Field, Schema}; +//! # use std::pin::Pin; +//! # use std::task::{Poll, ready}; +//! # use futures::{Stream, TryStreamExt}; +//! # use tokio::io::AsyncBufRead; +//! # use arrow_array::RecordBatch; +//! # use arrow_json::RawDecoder; +//! 
# use arrow_schema::ArrowError; +//! fn decode_stream( +//! mut decoder: RawDecoder, +//! mut reader: R, +//! ) -> impl Stream> { +//! futures::stream::poll_fn(move |cx| { +//! loop { +//! let b = match ready!(Pin::new(&mut reader).poll_fill_buf(cx)) { +//! Ok(b) if b.is_empty() => break, +//! Ok(b) => b, +//! Err(e) => return Poll::Ready(Some(Err(e.into()))), +//! }; +//! let read = b.len(); +//! let decoded = match decoder.decode(b) { +//! Ok(decoded) => decoded, +//! Err(e) => return Poll::Ready(Some(Err(e))), +//! }; +//! Pin::new(&mut reader).consume(decoded); +//! if decoded != read { +//! break; +//! } +//! } +//! +//! Poll::Ready(decoder.flush().transpose()) +//! }) +//! } +//! ``` +//! use crate::raw::boolean_array::BooleanArrayDecoder; use crate::raw::decimal_array::DecimalArrayDecoder; From 768430f17fbab1b29969fab721ca1708b4719957 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 11 Apr 2023 14:17:31 +0100 Subject: [PATCH 0807/1411] Use lexical_core in CSV and JSON parser (#4050) --- arrow-cast/src/parse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index cc8254916854..fd248f2be850 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -452,7 +452,7 @@ macro_rules! parser_primitive { ($t:ty) => { impl Parser for $t { fn parse(string: &str) -> Option { - string.parse::().ok() + lexical_core::parse::(string.as_bytes()).ok() } } }; From 6b17775f37b939221d855514db4ffb3344deb1f4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 11 Apr 2023 14:33:19 +0100 Subject: [PATCH 0808/1411] Fix precision loss in Raw JSON decoder (#4049) (#4051) --- arrow-json/src/raw/mod.rs | 24 ++++++++++++++ arrow-json/src/raw/primitive_array.rs | 47 ++++++++++++++++++++++++--- 2 files changed, 66 insertions(+), 5 deletions(-) diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs index c195524766c0..38b4cce9bd9a 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/raw/mod.rs @@ -1375,4 +1375,28 @@ mod tests { Some("+00:00".into()), )); } + + #[test] + fn test_truncation() { + let buf = r#" + {"i64": 9223372036854775807, "u64": 18446744073709551615 } + {"i64": "9223372036854775807", "u64": "18446744073709551615" } + {"i64": -9223372036854775808, "u64": 0 } + {"i64": "-9223372036854775808", "u64": 0 } + "#; + + let schema = Arc::new(Schema::new(vec![ + Field::new("i64", DataType::Int64, true), + Field::new("u64", DataType::UInt64, true), + ])); + + let batches = do_read(buf, 1024, true, schema); + assert_eq!(batches.len(), 1); + + let i64 = batches[0].column(0).as_primitive::(); + assert_eq!(i64.values(), &[i64::MAX, i64::MAX, i64::MIN, i64::MIN]); + + let u64 = batches[0].column(1).as_primitive::(); + assert_eq!(u64.values(), &[u64::MAX, u64::MAX, u64::MIN, u64::MIN]); + } } diff --git a/arrow-json/src/raw/primitive_array.rs b/arrow-json/src/raw/primitive_array.rs index 72ce30203d01..6985821d65fe 100644 --- a/arrow-json/src/raw/primitive_array.rs +++ b/arrow-json/src/raw/primitive_array.rs @@ -27,6 +27,45 @@ use arrow_schema::{ArrowError, DataType}; use crate::raw::tape::{Tape, TapeElement}; use crate::raw::{tape_error, ArrayDecoder}; +/// A trait for JSON-specific primitive parsing logic +/// +/// According to the specification unquoted fields should be parsed as a double-precision +/// floating point numbers, including scientific representation such as `2e3` +/// +/// In practice, it is 
common to serialize numbers outside the range of an `f64` and expect +/// them to round-trip correctly. As such when parsing integers we first parse as the integer +/// and fallback to parsing as a floating point if this fails +trait ParseJsonNumber: Sized { + fn parse(s: &[u8]) -> Option; +} + +macro_rules! primitive_parse { + ($($t:ty),+) => { + $(impl ParseJsonNumber for $t { + fn parse(s: &[u8]) -> Option { + match lexical_core::parse::(s) { + Ok(f) => Some(f), + Err(_) => lexical_core::parse::(s).ok().and_then(NumCast::from), + } + } + })+ + }; +} + +primitive_parse!(i8, i16, i32, i64, u8, u16, u32, u64); + +impl ParseJsonNumber for f32 { + fn parse(s: &[u8]) -> Option { + lexical_core::parse::(s).ok() + } +} + +impl ParseJsonNumber for f64 { + fn parse(s: &[u8]) -> Option { + lexical_core::parse::(s).ok() + } +} + pub struct PrimitiveArrayDecoder { data_type: DataType, // Invariant and Send @@ -45,7 +84,7 @@ impl PrimitiveArrayDecoder

<P> { impl<P> ArrayDecoder for PrimitiveArrayDecoder<P> where P: ArrowPrimitiveType + Parser, - P::Native: NumCast, + P::Native: ParseJsonNumber, { fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> { let mut builder = PrimitiveBuilder::<P>
::with_capacity(pos.len()) @@ -67,10 +106,8 @@ where } TapeElement::Number(idx) => { let s = tape.get_string(idx); - let value = lexical_core::parse::(s.as_bytes()) - .ok() - .and_then(NumCast::from) - .ok_or_else(|| { + let value = + ParseJsonNumber::parse(s.as_bytes()).ok_or_else(|| { ArrowError::JsonError(format!( "failed to parse {s} as {}", self.data_type From ee4003328d6615e011303ba57c24264a3f454e12 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 11 Apr 2023 17:06:22 +0100 Subject: [PATCH 0809/1411] Add offsets accessors to variable length arrays (#3879) (#4048) * Add offsets accessors to variable length arrays (#3879) * Review feedback --- arrow-array/src/array/byte_array.rs | 18 ++++++++++++++++++ arrow-array/src/array/list_array.rs | 12 +++++++++++- arrow-array/src/array/map_array.rs | 13 +++++++++++-- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 34e7d79ab3e0..f0e43e6949e9 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -67,6 +67,24 @@ impl GenericByteArray { offsets[i + 1] - offsets[i] } + /// Returns a reference to the offsets of this array + /// + /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn offsets(&self) -> &OffsetBuffer { + &self.value_offsets + } + + /// Returns the values of this array + /// + /// Unlike [`Self::value_data`] this returns the [`Buffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn values(&self) -> &Buffer { + &self.value_data + } + /// Returns the raw value data pub fn value_data(&self) -> &[u8] { self.value_data.as_slice() diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index fb94fe12c87c..f47ea80696e7 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -78,7 +78,17 @@ impl GenericListArray { DataType::List }; - /// Returns a reference to the values of this list. + /// Returns a reference to the offsets of this list + /// + /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn offsets(&self) -> &OffsetBuffer { + &self.value_offsets + } + + /// Returns a reference to the values of this list + #[inline] pub fn values(&self) -> &ArrayRef { &self.values } diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 22ebbe533a2f..1629532b8452 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -42,12 +42,21 @@ pub struct MapArray { } impl MapArray { - /// Returns a reference to the keys of this map. + /// Returns a reference to the offsets of this map + /// + /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn offsets(&self) -> &OffsetBuffer { + &self.value_offsets + } + + /// Returns a reference to the keys of this map pub fn keys(&self) -> &ArrayRef { &self.keys } - /// Returns a reference to the values of this map. 
+ /// Returns a reference to the values of this map pub fn values(&self) -> &ArrayRef { &self.values } From 96569c1e9cf89cf4bdb0d9bf6bc30387a3e3555b Mon Sep 17 00:00:00 2001 From: kindly Date: Wed, 12 Apr 2023 04:44:25 +0100 Subject: [PATCH 0810/1411] Shutdown parquet async writer (#4059) --- parquet/src/arrow/async_writer/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet/src/arrow/async_writer/mod.rs b/parquet/src/arrow/async_writer/mod.rs index abfb1c54ed44..339618364324 100644 --- a/parquet/src/arrow/async_writer/mod.rs +++ b/parquet/src/arrow/async_writer/mod.rs @@ -129,6 +129,7 @@ impl AsyncArrowWriter { // Force to flush the remaining data. Self::try_flush(&mut self.shared_buffer, &mut self.async_writer, true).await?; + self.async_writer.shutdown().await?; Ok(metadata) } From eec499db94a5794e2e4fd979177f2d63b112e41c Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Wed, 12 Apr 2023 06:46:36 +0300 Subject: [PATCH 0811/1411] feat: additional data type groups (#4057) --- arrow-schema/src/datatype.rs | 58 ++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 3ec5597b2854..3f684285c067 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -355,14 +355,33 @@ impl DataType { ) } + /// Returns true if this type is floating: (Float*). + pub fn is_floating(&self) -> bool { + use DataType::*; + matches!(self, Float16 | Float32 | Float64) + } + + /// Returns true if this type is integer: (Int*, UInt*). + pub fn is_integer(&self) -> bool { + self.is_signed_integer() || self.is_unsigned_integer() + } + + /// Returns true if this type is signed integer: (Int*). + pub fn is_signed_integer(&self) -> bool { + use DataType::*; + matches!(self, Int8 | Int16 | Int32 | Int64) + } + + /// Returns true if this type is unsigned integer: (UInt*). 
+ pub fn is_unsigned_integer(&self) -> bool { + use DataType::*; + matches!(self, UInt8 | UInt16 | UInt32 | UInt64) + } + /// Returns true if this type is valid as a dictionary key #[inline] pub fn is_dictionary_key_type(&self) -> bool { - use DataType::*; - matches!( - self, - UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 - ) + self.is_integer() } /// Returns true if this type is valid for run-ends array in RunArray @@ -664,6 +683,35 @@ mod tests { ))); } + #[test] + fn test_integer() { + // is_integer + assert!(DataType::is_integer(&DataType::Int32)); + assert!(DataType::is_integer(&DataType::UInt64)); + assert!(!DataType::is_integer(&DataType::Float16)); + + // is_signed_integer + assert!(DataType::is_signed_integer(&DataType::Int32)); + assert!(!DataType::is_signed_integer(&DataType::UInt64)); + assert!(!DataType::is_signed_integer(&DataType::Float16)); + + // is_unsigned_integer + assert!(!DataType::is_unsigned_integer(&DataType::Int32)); + assert!(DataType::is_unsigned_integer(&DataType::UInt64)); + assert!(!DataType::is_unsigned_integer(&DataType::Float16)); + + // is_dictionary_key_type + assert!(DataType::is_dictionary_key_type(&DataType::Int32)); + assert!(DataType::is_dictionary_key_type(&DataType::UInt64)); + assert!(!DataType::is_dictionary_key_type(&DataType::Float16)); + } + + #[test] + fn test_floating() { + assert!(DataType::is_floating(&DataType::Float16)); + assert!(!DataType::is_floating(&DataType::Int32)); + } + #[test] fn size_should_not_regress() { assert_eq!(std::mem::size_of::(), 24); From 6ce332a3d099aaa421676075d4ca8c4644666d14 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 12 Apr 2023 05:08:55 +0100 Subject: [PATCH 0812/1411] Remove old JSON Reader and Decoder (#3610) (#4052) * Remove old JSON Reader and Decoder (#3610) * More tests * Fix doc * Fix test * Fix bench --- arrow-json/src/lib.rs | 14 +- arrow-json/src/reader.rs | 3449 ----------------- .../src/{raw => reader}/boolean_array.rs | 4 +- .../src/{raw => reader}/decimal_array.rs | 4 +- arrow-json/src/{raw => reader}/list_array.rs | 4 +- arrow-json/src/{raw => reader}/map_array.rs | 4 +- arrow-json/src/{raw => reader}/mod.rs | 687 +++- .../src/{raw => reader}/primitive_array.rs | 4 +- arrow-json/src/reader/schema.rs | 710 ++++ arrow-json/src/{raw => reader}/serializer.rs | 2 +- .../src/{raw => reader}/string_array.rs | 4 +- .../src/{raw => reader}/struct_array.rs | 4 +- arrow-json/src/{raw => reader}/tape.rs | 2 +- .../src/{raw => reader}/timestamp_array.rs | 4 +- arrow-json/src/writer.rs | 35 +- arrow/benches/json_reader.rs | 21 +- arrow/src/lib.rs | 6 +- parquet/src/arrow/arrow_writer/levels.rs | 2 +- parquet/src/arrow/arrow_writer/mod.rs | 2 +- 19 files changed, 1341 insertions(+), 3621 deletions(-) delete mode 100644 arrow-json/src/reader.rs rename arrow-json/src/{raw => reader}/boolean_array.rs (94%) rename arrow-json/src/{raw => reader}/decimal_array.rs (96%) rename arrow-json/src/{raw => reader}/list_array.rs (97%) rename arrow-json/src/{raw => reader}/map_array.rs (97%) rename arrow-json/src/{raw => reader}/mod.rs (68%) rename arrow-json/src/{raw => reader}/primitive_array.rs (97%) create mode 100644 arrow-json/src/reader/schema.rs rename arrow-json/src/{raw => reader}/serializer.rs (99%) rename arrow-json/src/{raw => reader}/string_array.rs (97%) rename arrow-json/src/{raw => reader}/struct_array.rs (98%) rename arrow-json/src/{raw => reader}/tape.rs (99%) rename arrow-json/src/{raw => reader}/timestamp_array.rs (97%) 
diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs index 5998bc3a4433..88415ff2ecac 100644 --- a/arrow-json/src/lib.rs +++ b/arrow-json/src/lib.rs @@ -25,10 +25,18 @@ pub mod reader; pub mod writer; -mod raw; +#[doc(hidden)] +#[deprecated(note = "Use Decoder")] +pub type RawDecoder = reader::Decoder; + +#[doc(hidden)] +#[deprecated(note = "Use Reader")] +pub type RawReader = Reader; + +#[doc(hidden)] +#[deprecated(note = "Use ReaderBuilder")] +pub type RawReaderBuilder = ReaderBuilder; -pub use self::raw::{RawDecoder, RawReader, RawReaderBuilder}; -#[allow(deprecated)] pub use self::reader::{Reader, ReaderBuilder}; pub use self::writer::{ArrayWriter, LineDelimitedWriter, Writer}; use half::f16; diff --git a/arrow-json/src/reader.rs b/arrow-json/src/reader.rs deleted file mode 100644 index d343d3ed986d..000000000000 --- a/arrow-json/src/reader.rs +++ /dev/null @@ -1,3449 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! # JSON Reader -//! -//! This JSON reader allows JSON line-delimited files to be read into the Arrow memory -//! model. Records are loaded in batches and are then converted from row-based data to -//! columnar data. -//! -//! Example: -//! -//! ``` -//! # use arrow_schema::*; -//! # use std::fs::File; -//! # use std::io::BufReader; -//! # use std::sync::Arc; -//! -//! let schema = Schema::new(vec![ -//! Field::new("a", DataType::Float64, false), -//! Field::new("b", DataType::Float64, false), -//! Field::new("c", DataType::Float64, true), -//! ]); -//! -//! let file = File::open("test/data/basic.json").unwrap(); -//! -//! let mut json = arrow_json::Reader::new( -//! BufReader::new(file), -//! Arc::new(schema), -//! arrow_json::reader::DecoderOptions::new(), -//! ); -//! -//! let batch = json.next().unwrap().unwrap(); -//! 
``` - -use std::borrow::Borrow; -use std::io::{BufRead, BufReader, Read, Seek}; -use std::sync::Arc; - -use indexmap::map::IndexMap as HashMap; -use indexmap::set::IndexSet as HashSet; -use serde_json::json; -use serde_json::{map::Map as JsonMap, Value}; - -use arrow_array::builder::*; -use arrow_array::types::*; -use arrow_array::*; -use arrow_buffer::{bit_util, i256, Buffer, MutableBuffer}; -use arrow_cast::parse::{parse_decimal, Parser}; -use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::*; - -#[derive(Debug, Clone)] -enum InferredType { - Scalar(HashSet), - Array(Box), - Object(HashMap), - Any, -} - -impl InferredType { - fn merge(&mut self, other: InferredType) -> Result<(), ArrowError> { - match (self, other) { - (InferredType::Array(s), InferredType::Array(o)) => { - s.merge(*o)?; - } - (InferredType::Scalar(self_hs), InferredType::Scalar(other_hs)) => { - other_hs.into_iter().for_each(|v| { - self_hs.insert(v); - }); - } - (InferredType::Object(self_map), InferredType::Object(other_map)) => { - for (k, v) in other_map { - self_map.entry(k).or_insert(InferredType::Any).merge(v)?; - } - } - (s @ InferredType::Any, v) => { - *s = v; - } - (_, InferredType::Any) => {} - // convert a scalar type to a single-item scalar array type. - ( - InferredType::Array(self_inner_type), - other_scalar @ InferredType::Scalar(_), - ) => { - self_inner_type.merge(other_scalar)?; - } - (s @ InferredType::Scalar(_), InferredType::Array(mut other_inner_type)) => { - other_inner_type.merge(s.clone())?; - *s = InferredType::Array(other_inner_type); - } - // incompatible types - (s, o) => { - return Err(ArrowError::JsonError(format!( - "Incompatible type found during schema inference: {s:?} v.s. {o:?}", - ))); - } - } - - Ok(()) - } -} - -/// Coerce data type during inference -/// -/// * `Int64` and `Float64` should be `Float64` -/// * Lists and scalars are coerced to a list of a compatible scalar -/// * All other types are coerced to `Utf8` -fn coerce_data_type(dt: Vec<&DataType>) -> DataType { - let mut dt_iter = dt.into_iter().cloned(); - let dt_init = dt_iter.next().unwrap_or(DataType::Utf8); - - dt_iter.fold(dt_init, |l, r| match (l, r) { - (DataType::Boolean, DataType::Boolean) => DataType::Boolean, - (DataType::Int64, DataType::Int64) => DataType::Int64, - (DataType::Float64, DataType::Float64) - | (DataType::Float64, DataType::Int64) - | (DataType::Int64, DataType::Float64) => DataType::Float64, - (DataType::List(l), DataType::List(r)) => DataType::List(Arc::new(Field::new( - "item", - coerce_data_type(vec![l.data_type(), r.data_type()]), - true, - ))), - // coerce scalar and scalar array into scalar array - (DataType::List(e), not_list) | (not_list, DataType::List(e)) => { - DataType::List(Arc::new(Field::new( - "item", - coerce_data_type(vec![e.data_type(), ¬_list]), - true, - ))) - } - _ => DataType::Utf8, - }) -} - -fn generate_datatype(t: &InferredType) -> Result { - Ok(match t { - InferredType::Scalar(hs) => coerce_data_type(hs.iter().collect()), - InferredType::Object(spec) => DataType::Struct(generate_fields(spec)?), - InferredType::Array(ele_type) => DataType::List(Arc::new(Field::new( - "item", - generate_datatype(ele_type)?, - true, - ))), - InferredType::Any => DataType::Null, - }) -} - -fn generate_fields(spec: &HashMap) -> Result { - spec.iter() - .map(|(k, types)| Ok(Field::new(k, generate_datatype(types)?, true))) - .collect() -} - -/// Generate schema from JSON field names and inferred data types -fn generate_schema(spec: HashMap) -> Result { - 
Ok(Schema::new(generate_fields(&spec)?)) -} - -/// JSON file reader that produces a serde_json::Value iterator from a Read trait -/// -/// # Example -/// -/// ``` -/// use std::fs::File; -/// use std::io::BufReader; -/// use arrow_json::reader::ValueIter; -/// -/// let mut reader = -/// BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); -/// let mut value_reader = ValueIter::new(&mut reader, None); -/// for value in value_reader { -/// println!("JSON value: {}", value.unwrap()); -/// } -/// ``` -#[derive(Debug)] -pub struct ValueIter<'a, R: BufRead> { - reader: &'a mut R, - max_read_records: Option, - record_count: usize, - // reuse line buffer to avoid allocation on each record - line_buf: String, -} - -impl<'a, R: BufRead> ValueIter<'a, R> { - /// Creates a new `ValueIter` - pub fn new(reader: &'a mut R, max_read_records: Option) -> Self { - Self { - reader, - max_read_records, - record_count: 0, - line_buf: String::new(), - } - } -} - -impl<'a, R: BufRead> Iterator for ValueIter<'a, R> { - type Item = Result; - - fn next(&mut self) -> Option { - if let Some(max) = self.max_read_records { - if self.record_count >= max { - return None; - } - } - - loop { - self.line_buf.truncate(0); - match self.reader.read_line(&mut self.line_buf) { - Ok(0) => { - // read_line returns 0 when stream reached EOF - return None; - } - Err(e) => { - return Some(Err(ArrowError::JsonError(format!( - "Failed to read JSON record: {e}" - )))); - } - _ => { - let trimmed_s = self.line_buf.trim(); - if trimmed_s.is_empty() { - // ignore empty lines - continue; - } - - self.record_count += 1; - return Some(serde_json::from_str(trimmed_s).map_err(|e| { - ArrowError::JsonError(format!("Not valid JSON: {e}")) - })); - } - } - } - } -} - -/// Infer the fields of a JSON file by reading the first n records of the file, with -/// `max_read_records` controlling the maximum number of records to read. -/// -/// If `max_read_records` is not set, the whole file is read to infer its field types. -/// -/// Contrary to [`infer_json_schema`], this function will seek back to the start of the `reader`. -/// That way, the `reader` can be used immediately afterwards to create a [`Reader`]. -/// -/// # Examples -/// ``` -/// use std::fs::File; -/// use std::io::BufReader; -/// use arrow_json::reader::infer_json_schema_from_seekable; -/// -/// let file = File::open("test/data/mixed_arrays.json").unwrap(); -/// // file's cursor's offset at 0 -/// let mut reader = BufReader::new(file); -/// let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); -/// // file's cursor's offset automatically set at 0 -/// ``` -pub fn infer_json_schema_from_seekable( - reader: &mut BufReader, - max_read_records: Option, -) -> Result { - let schema = infer_json_schema(reader, max_read_records); - // return the reader seek back to the start - reader.rewind()?; - - schema -} - -/// Infer the fields of a JSON file by reading the first n records of the buffer, with -/// `max_read_records` controlling the maximum number of records to read. -/// -/// If `max_read_records` is not set, the whole file is read to infer its field types. -/// -/// This function will not seek back to the start of the `reader`. The user has to manage the -/// original file's cursor. This function is useful when the `reader`'s cursor is not available -/// (does not implement [`Seek`]), such is the case for compressed streams decoders. 
-/// -/// # Examples -/// ``` -/// use std::fs::File; -/// use std::io::{BufReader, SeekFrom, Seek}; -/// use flate2::read::GzDecoder; -/// use arrow_json::reader::infer_json_schema; -/// -/// let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap(); -/// -/// // file's cursor's offset at 0 -/// let mut reader = BufReader::new(GzDecoder::new(&file)); -/// let inferred_schema = infer_json_schema(&mut reader, None).unwrap(); -/// // cursor's offset at end of file -/// -/// // seek back to start so that the original file is usable again -/// file.seek(SeekFrom::Start(0)).unwrap(); -/// ``` -pub fn infer_json_schema( - reader: &mut R, - max_read_records: Option, -) -> Result { - infer_json_schema_from_iterator(ValueIter::new(reader, max_read_records)) -} - -fn set_object_scalar_field_type( - field_types: &mut HashMap, - key: &str, - ftype: DataType, -) -> Result<(), ArrowError> { - if !field_types.contains_key(key) { - field_types.insert(key.to_string(), InferredType::Scalar(HashSet::new())); - } - - match field_types.get_mut(key).unwrap() { - InferredType::Scalar(hs) => { - hs.insert(ftype); - Ok(()) - } - // in case of column contains both scalar type and scalar array type, we convert type of - // this column to scalar array. - scalar_array @ InferredType::Array(_) => { - let mut hs = HashSet::new(); - hs.insert(ftype); - scalar_array.merge(InferredType::Scalar(hs))?; - Ok(()) - } - t => Err(ArrowError::JsonError(format!( - "Expected scalar or scalar array JSON type, found: {t:?}", - ))), - } -} - -fn infer_scalar_array_type(array: &[Value]) -> Result { - let mut hs = HashSet::new(); - - for v in array { - match v { - Value::Null => {} - Value::Number(n) => { - if n.is_i64() { - hs.insert(DataType::Int64); - } else { - hs.insert(DataType::Float64); - } - } - Value::Bool(_) => { - hs.insert(DataType::Boolean); - } - Value::String(_) => { - hs.insert(DataType::Utf8); - } - Value::Array(_) | Value::Object(_) => { - return Err(ArrowError::JsonError(format!( - "Expected scalar value for scalar array, got: {v:?}" - ))); - } - } - } - - Ok(InferredType::Scalar(hs)) -} - -fn infer_nested_array_type(array: &[Value]) -> Result { - let mut inner_ele_type = InferredType::Any; - - for v in array { - match v { - Value::Array(inner_array) => { - inner_ele_type.merge(infer_array_element_type(inner_array)?)?; - } - x => { - return Err(ArrowError::JsonError(format!( - "Got non array element in nested array: {x:?}" - ))); - } - } - } - - Ok(InferredType::Array(Box::new(inner_ele_type))) -} - -fn infer_struct_array_type(array: &[Value]) -> Result { - let mut field_types = HashMap::new(); - - for v in array { - match v { - Value::Object(map) => { - collect_field_types_from_object(&mut field_types, map)?; - } - _ => { - return Err(ArrowError::JsonError(format!( - "Expected struct value for struct array, got: {v:?}" - ))); - } - } - } - - Ok(InferredType::Object(field_types)) -} - -fn infer_array_element_type(array: &[Value]) -> Result { - match array.iter().take(1).next() { - None => Ok(InferredType::Any), // empty array, return any type that can be updated later - Some(a) => match a { - Value::Array(_) => infer_nested_array_type(array), - Value::Object(_) => infer_struct_array_type(array), - _ => infer_scalar_array_type(array), - }, - } -} - -fn collect_field_types_from_object( - field_types: &mut HashMap, - map: &JsonMap, -) -> Result<(), ArrowError> { - for (k, v) in map { - match v { - Value::Array(array) => { - let ele_type = infer_array_element_type(array)?; - - if !field_types.contains_key(k) { 
- match ele_type { - InferredType::Scalar(_) => { - field_types.insert( - k.to_string(), - InferredType::Array(Box::new(InferredType::Scalar( - HashSet::new(), - ))), - ); - } - InferredType::Object(_) => { - field_types.insert( - k.to_string(), - InferredType::Array(Box::new(InferredType::Object( - HashMap::new(), - ))), - ); - } - InferredType::Any | InferredType::Array(_) => { - // set inner type to any for nested array as well - // so it can be updated properly from subsequent type merges - field_types.insert( - k.to_string(), - InferredType::Array(Box::new(InferredType::Any)), - ); - } - } - } - - match field_types.get_mut(k).unwrap() { - InferredType::Array(inner_type) => { - inner_type.merge(ele_type)?; - } - // in case of column contains both scalar type and scalar array type, we - // convert type of this column to scalar array. - field_type @ InferredType::Scalar(_) => { - field_type.merge(ele_type)?; - *field_type = InferredType::Array(Box::new(field_type.clone())); - } - t => { - return Err(ArrowError::JsonError(format!( - "Expected array json type, found: {t:?}", - ))); - } - } - } - Value::Bool(_) => { - set_object_scalar_field_type(field_types, k, DataType::Boolean)?; - } - Value::Null => { - // do nothing, we treat json as nullable by default when - // inferring - } - Value::Number(n) => { - if n.is_f64() { - set_object_scalar_field_type(field_types, k, DataType::Float64)?; - } else { - // default to i64 - set_object_scalar_field_type(field_types, k, DataType::Int64)?; - } - } - Value::String(_) => { - set_object_scalar_field_type(field_types, k, DataType::Utf8)?; - } - Value::Object(inner_map) => { - if !field_types.contains_key(k) { - field_types - .insert(k.to_string(), InferredType::Object(HashMap::new())); - } - match field_types.get_mut(k).unwrap() { - InferredType::Object(inner_field_types) => { - collect_field_types_from_object(inner_field_types, inner_map)?; - } - t => { - return Err(ArrowError::JsonError(format!( - "Expected object json type, found: {t:?}", - ))); - } - } - } - } - } - - Ok(()) -} - -/// Infer the fields of a JSON file by reading all items from the JSON Value Iterator. -/// -/// The following type coercion logic is implemented: -/// * `Int64` and `Float64` are converted to `Float64` -/// * Lists and scalars are coerced to a list of a compatible scalar -/// * All other cases are coerced to `Utf8` (String) -/// -/// Note that the above coercion logic is different from what Spark has, where it would default to -/// String type in case of List and Scalar values appeared in the same field. -/// -/// The reason we diverge here is because we don't have utilities to deal with JSON data once it's -/// interpreted as Strings. We should match Spark's behavior once we added more JSON parsing -/// kernels in the future. -pub fn infer_json_schema_from_iterator(value_iter: I) -> Result -where - I: Iterator>, - V: Borrow, -{ - let mut field_types: HashMap = HashMap::new(); - - for record in value_iter { - match record?.borrow() { - Value::Object(map) => { - collect_field_types_from_object(&mut field_types, map)?; - } - value => { - return Err(ArrowError::JsonError(format!( - "Expected JSON record to be an object, found {value:?}" - ))); - } - }; - } - - generate_schema(field_types) -} - -/// JSON values to Arrow record batch decoder. -/// -/// A [`Decoder`] decodes arbitrary streams of [`serde_json::Value`]s and -/// converts them to [`RecordBatch`]es. To decode JSON formatted files, -/// see [`Reader`]. 
-/// -/// Note: Consider instead using [`RawDecoder`] which is faster and will -/// eventually replace this implementation as part of [#3610] -/// -/// # Examples -/// ``` -/// use arrow_json::reader::{Decoder, DecoderOptions, ValueIter, infer_json_schema}; -/// use std::fs::File; -/// use std::io::{BufReader, Seek, SeekFrom}; -/// use std::sync::Arc; -/// -/// let mut reader = -/// BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); -/// let inferred_schema = infer_json_schema(&mut reader, None).unwrap(); -/// let options = DecoderOptions::new() -/// .with_batch_size(1024); -/// let decoder = Decoder::new(Arc::new(inferred_schema), options); -/// -/// // seek back to start so that the original file is usable again -/// reader.seek(SeekFrom::Start(0)).unwrap(); -/// let mut value_reader = ValueIter::new(&mut reader, None); -/// let batch = decoder.next_batch(&mut value_reader).unwrap().unwrap(); -/// assert_eq!(4, batch.num_rows()); -/// assert_eq!(4, batch.num_columns()); -/// ``` -/// -/// [`RawDecoder`]: crate::raw::RawDecoder -/// [#3610]: https://github.com/apache/arrow-rs/issues/3610 -#[derive(Debug)] -#[deprecated(note = "Use RawDecoder instead")] -pub struct Decoder { - /// Explicit schema for the JSON file - schema: SchemaRef, - /// This is a collection of options for json decoder - options: DecoderOptions, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -/// Options for JSON decoding -pub struct DecoderOptions { - /// Batch size (number of records to load each time), defaults to 1024 records - batch_size: usize, - /// Optional projection for which columns to load (case-sensitive names) - projection: Option>, - /// optional HashMap of column name to its format string - format_strings: Option>, -} - -impl Default for DecoderOptions { - fn default() -> Self { - Self { - batch_size: 1024, - projection: None, - format_strings: None, - } - } -} - -impl DecoderOptions { - /// Creates a new `DecoderOptions` - pub fn new() -> Self { - Default::default() - } - - /// Set the batch size (number of records to load at one time) - pub fn with_batch_size(mut self, batch_size: usize) -> Self { - self.batch_size = batch_size; - self - } - - /// Set the reader's column projection - pub fn with_projection(mut self, projection: Vec) -> Self { - self.projection = Some(projection); - self - } - - /// Set the decoder's format Strings param - pub fn with_format_strings( - mut self, - format_strings: HashMap, - ) -> Self { - self.format_strings = Some(format_strings); - self - } -} - -#[allow(deprecated)] -impl Decoder { - /// Create a new JSON decoder from some value that implements an - /// iterator over [`serde_json::Value`]s (aka implements the - /// `Iterator>` trait). - pub fn new(schema: SchemaRef, options: DecoderOptions) -> Self { - Self { schema, options } - } - - /// Returns the schema of the reader, useful for getting the schema without reading - /// record batches - pub fn schema(&self) -> SchemaRef { - match &self.options.projection { - Some(projection) => { - let fields = self.schema.fields(); - let projected_fields: Fields = fields - .iter() - .filter_map(|field| { - if projection.contains(field.name()) { - Some(field.clone()) - } else { - None - } - }) - .collect(); - - Arc::new(Schema::new(projected_fields)) - } - None => self.schema.clone(), - } - } - - /// Read the next batch of [`serde_json::Value`] records from the - /// iterator into a [`RecordBatch`]. - /// - /// Returns `None` if the input iterator is exhausted. 
- pub fn next_batch( - &self, - value_iter: &mut I, - ) -> Result, ArrowError> - where - I: Iterator>, - { - let batch_size = self.options.batch_size; - let mut rows: Vec = Vec::with_capacity(batch_size); - - for value in value_iter.by_ref().take(batch_size) { - let v = value?; - match v { - Value::Object(_) => rows.push(v), - _ => { - return Err(ArrowError::JsonError(format!( - "Row needs to be of type object, got: {v:?}" - ))); - } - } - } - if rows.is_empty() { - // reached end of file - return Ok(None); - } - - let rows = &rows[..]; - - let arrays = - self.build_struct_array(rows, self.schema.fields(), &self.options.projection); - - let projected_fields: Fields = match self.options.projection.as_ref() { - Some(projection) => projection - .iter() - .filter_map(|name| Some(self.schema.fields.find(name)?.1.clone())) - .collect(), - None => self.schema.fields.clone(), - }; - let projected_schema = Arc::new(Schema::new(projected_fields)); - - arrays.and_then(|arr| { - RecordBatch::try_new_with_options( - projected_schema, - arr, - &RecordBatchOptions::new() - .with_match_field_names(true) - .with_row_count(Some(rows.len())), - ) - .map(Some) - }) - } - - fn build_wrapped_list_array( - &self, - rows: &[Value], - col_name: &str, - key_type: &DataType, - ) -> Result { - match *key_type { - DataType::Int8 => { - let dtype = DataType::Dictionary( - Box::new(DataType::Int8), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::Int16 => { - let dtype = DataType::Dictionary( - Box::new(DataType::Int16), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::Int32 => { - let dtype = DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::Int64 => { - let dtype = DataType::Dictionary( - Box::new(DataType::Int64), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::UInt8 => { - let dtype = DataType::Dictionary( - Box::new(DataType::UInt8), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::UInt16 => { - let dtype = DataType::Dictionary( - Box::new(DataType::UInt16), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::UInt32 => { - let dtype = DataType::Dictionary( - Box::new(DataType::UInt32), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - DataType::UInt64 => { - let dtype = DataType::Dictionary( - Box::new(DataType::UInt64), - Box::new(DataType::Utf8), - ); - self.list_array_string_array_builder::(&dtype, col_name, rows) - } - ref e => Err(ArrowError::JsonError(format!( - "Data type is currently not supported for dictionaries in list : {e:?}" - ))), - } - } - - #[inline(always)] - fn list_array_string_array_builder
<DT>
( - &self, - data_type: &DataType, - col_name: &str, - rows: &[Value], - ) -> Result - where - DT: ArrowPrimitiveType + ArrowDictionaryKeyType, - { - let mut builder: Box = match data_type { - DataType::Utf8 => { - let values_builder = - StringBuilder::with_capacity(rows.len(), rows.len() * 5); - Box::new(ListBuilder::new(values_builder)) - } - DataType::Dictionary(_, _) => { - let values_builder = - self.build_string_dictionary_builder::
(rows.len() * 5); - Box::new(ListBuilder::new(values_builder)) - } - e => { - return Err(ArrowError::JsonError(format!( - "Nested list data builder type is not supported: {e:?}" - ))) - } - }; - - for row in rows { - if let Some(value) = row.get(col_name) { - // value can be an array or a scalar - let vals: Vec> = if let Value::String(v) = value { - vec![Some(v.to_string())] - } else if let Value::Array(n) = value { - n.iter() - .map(|v: &Value| { - if v.is_string() { - Some(v.as_str().unwrap().to_string()) - } else if v.is_array() || v.is_object() || v.is_null() { - // implicitly drop nested values - // TODO support deep-nesting - None - } else { - Some(v.to_string()) - } - }) - .collect() - } else if let Value::Null = value { - vec![None] - } else if !value.is_object() { - vec![Some(value.to_string())] - } else { - return Err(ArrowError::JsonError( - "Only scalars are currently supported in JSON arrays".to_string(), - )); - }; - - // TODO: ARROW-10335: APIs of dictionary arrays and others are different. Unify - // them. - match data_type { - DataType::Utf8 => { - let builder = builder - .as_any_mut() - .downcast_mut::>() - .ok_or_else(||ArrowError::JsonError( - "Cast failed for ListBuilder during nested data parsing".to_string(), - ))?; - for val in vals { - if let Some(v) = val { - builder.values().append_value(&v); - } else { - builder.values().append_null(); - }; - } - - // Append to the list - builder.append(true); - } - DataType::Dictionary(_, _) => { - let builder = builder.as_any_mut().downcast_mut::>>().ok_or_else(||ArrowError::JsonError( - "Cast failed for ListBuilder during nested data parsing".to_string(), - ))?; - for val in vals { - if let Some(v) = val { - let _ = builder.values().append(&v); - } else { - builder.values().append_null(); - }; - } - - // Append to the list - builder.append(true); - } - e => { - return Err(ArrowError::JsonError(format!( - "Nested list data builder type is not supported: {e:?}" - ))) - } - } - } - } - - Ok(builder.finish() as ArrayRef) - } - - #[inline(always)] - fn build_string_dictionary_builder( - &self, - row_len: usize, - ) -> StringDictionaryBuilder - where - T: ArrowPrimitiveType + ArrowDictionaryKeyType, - { - StringDictionaryBuilder::with_capacity(row_len, row_len, row_len * 5) - } - - #[inline(always)] - fn build_string_dictionary_array( - &self, - rows: &[Value], - col_name: &str, - key_type: &DataType, - value_type: &DataType, - ) -> Result { - if let DataType::Utf8 = *value_type { - match *key_type { - DataType::Int8 => self.build_dictionary_array::(rows, col_name), - DataType::Int16 => { - self.build_dictionary_array::(rows, col_name) - } - DataType::Int32 => { - self.build_dictionary_array::(rows, col_name) - } - DataType::Int64 => { - self.build_dictionary_array::(rows, col_name) - } - DataType::UInt8 => { - self.build_dictionary_array::(rows, col_name) - } - DataType::UInt16 => { - self.build_dictionary_array::(rows, col_name) - } - DataType::UInt32 => { - self.build_dictionary_array::(rows, col_name) - } - DataType::UInt64 => { - self.build_dictionary_array::(rows, col_name) - } - _ => Err(ArrowError::JsonError( - "unsupported dictionary key type".to_string(), - )), - } - } else { - Err(ArrowError::JsonError( - "dictionary types other than UTF-8 not yet supported".to_string(), - )) - } - } - - fn build_boolean_array( - &self, - rows: &[Value], - col_name: &str, - ) -> Result { - let mut builder = BooleanBuilder::with_capacity(rows.len()); - for row in rows { - if let Some(value) = row.get(col_name) { - if let Some(boolean) = 
value.as_bool() { - builder.append_value(boolean); - } else { - builder.append_null(); - } - } else { - builder.append_null(); - } - } - Ok(Arc::new(builder.finish())) - } - - fn build_primitive_array( - &self, - rows: &[Value], - col_name: &str, - ) -> Result - where - T: ArrowPrimitiveType, - T::Native: num::NumCast, - { - let format_string = self - .options - .format_strings - .as_ref() - .and_then(|fmts| fmts.get(col_name)); - Ok(Arc::new( - rows.iter() - .map(|row| { - row.get(col_name).and_then(|value| { - if value.is_i64() { - value.as_i64().and_then(num::cast::cast) - } else if value.is_u64() { - value.as_u64().and_then(num::cast::cast) - } else if value.is_string() { - match format_string { - Some(fmt) => { - T::parse_formatted(value.as_str().unwrap(), fmt) - } - None => T::parse(value.as_str().unwrap()), - } - } else { - value.as_f64().and_then(num::cast::cast) - } - }) - }) - .collect::>(), - )) - } - - fn build_decimal128_array( - &self, - rows: &[Value], - col_name: &str, - precision: u8, - scale: i8, - ) -> Result { - Ok(Arc::new( - rows.iter() - .map(|row| { - row.get(col_name).and_then(|value| { - if value.is_i64() { - let mul = 10i128.pow(scale as _); - value - .as_i64() - .and_then(num::cast::cast) - .map(|v: i128| v * mul) - } else if value.is_u64() { - let mul = 10i128.pow(scale as _); - value - .as_u64() - .and_then(num::cast::cast) - .map(|v: i128| v * mul) - } else if value.is_string() { - value.as_str().and_then(|s| { - parse_decimal::(s, precision, scale).ok() - }) - } else { - let mul = 10_f64.powi(scale as i32); - value.as_f64().map(|f| (f * mul).round() as i128) - } - }) - }) - .collect::() - .with_precision_and_scale(precision, scale)?, - )) - } - - fn build_decimal256_array( - &self, - rows: &[Value], - col_name: &str, - precision: u8, - scale: i8, - ) -> Result { - let mul = 10_f64.powi(scale as i32); - Ok(Arc::new( - rows.iter() - .map(|row| { - row.get(col_name).and_then(|value| { - if value.is_i64() { - let mul = i256::from_i128(10).pow_wrapping(scale as _); - value.as_i64().map(|i| i256::from_i128(i as _) * mul) - } else if value.is_u64() { - let mul = i256::from_i128(10).pow_wrapping(scale as _); - value.as_u64().map(|i| i256::from_i128(i as _) * mul) - } else if value.is_string() { - value.as_str().and_then(|s| { - parse_decimal::(s, precision, scale).ok() - }) - } else { - value.as_f64().and_then(|f| i256::from_f64(f * mul.round())) - } - }) - }) - .collect::() - .with_precision_and_scale(precision, scale)?, - )) - } - - /// Build a nested GenericListArray from a list of unnested `Value`s - fn build_nested_list_array( - &self, - rows: &[Value], - list_field: &FieldRef, - ) -> Result { - // build list offsets - let mut cur_offset = OffsetSize::zero(); - let list_len = rows.len(); - let num_list_bytes = bit_util::ceil(list_len, 8); - let mut offsets = Vec::with_capacity(list_len + 1); - let mut list_nulls = MutableBuffer::from_len_zeroed(num_list_bytes); - let list_nulls = list_nulls.as_slice_mut(); - offsets.push(cur_offset); - rows.iter().enumerate().for_each(|(i, v)| { - if let Value::Array(a) = v { - cur_offset += OffsetSize::from_usize(a.len()).unwrap(); - bit_util::set_bit(list_nulls, i); - } else if let Value::Null = v { - // value is null, not incremented - } else { - cur_offset += OffsetSize::one(); - } - offsets.push(cur_offset); - }); - let valid_len = cur_offset.to_usize().unwrap(); - let array_data = match list_field.data_type() { - DataType::Null => NullArray::new(valid_len).into_data(), - DataType::Boolean => { - let num_bytes = 
bit_util::ceil(valid_len, 8); - let mut bool_values = MutableBuffer::from_len_zeroed(num_bytes); - let mut bool_nulls = - MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - let mut curr_index = 0; - rows.iter().for_each(|v| { - if let Value::Array(vs) = v { - vs.iter().for_each(|value| { - if let Value::Bool(child) = value { - // if valid boolean, append value - if *child { - bit_util::set_bit( - bool_values.as_slice_mut(), - curr_index, - ); - } - } else { - // null slot - bit_util::unset_bit( - bool_nulls.as_slice_mut(), - curr_index, - ); - } - curr_index += 1; - }); - } - }); - unsafe { - ArrayData::builder(list_field.data_type().clone()) - .len(valid_len) - .add_buffer(bool_values.into()) - .null_bit_buffer(Some(bool_nulls.into())) - .build_unchecked() - } - } - DataType::Int8 => self.read_primitive_list_values::(rows), - DataType::Int16 => self.read_primitive_list_values::(rows), - DataType::Int32 => self.read_primitive_list_values::(rows), - DataType::Int64 => self.read_primitive_list_values::(rows), - DataType::UInt8 => self.read_primitive_list_values::(rows), - DataType::UInt16 => self.read_primitive_list_values::(rows), - DataType::UInt32 => self.read_primitive_list_values::(rows), - DataType::UInt64 => self.read_primitive_list_values::(rows), - DataType::Float16 => { - return Err(ArrowError::JsonError("Float16 not supported".to_string())) - } - DataType::Float32 => self.read_primitive_list_values::(rows), - DataType::Float64 => self.read_primitive_list_values::(rows), - DataType::Timestamp(_, _) - | DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) => { - return Err(ArrowError::JsonError( - "Temporal types are not yet supported, see ARROW-4803".to_string(), - )) - } - DataType::Utf8 => flatten_json_string_values(rows) - .into_iter() - .collect::() - .into_data(), - DataType::LargeUtf8 => flatten_json_string_values(rows) - .into_iter() - .collect::() - .into_data(), - DataType::List(field) => { - let child = self - .build_nested_list_array::(&flatten_json_values(rows), field)?; - child.into_data() - } - DataType::LargeList(field) => { - let child = self - .build_nested_list_array::(&flatten_json_values(rows), field)?; - child.into_data() - } - DataType::Struct(fields) => { - // extract list values, with non-lists converted to Value::Null - let array_item_count = cur_offset.to_usize().unwrap(); - let num_bytes = bit_util::ceil(array_item_count, 8); - let mut null_buffer = MutableBuffer::from_len_zeroed(num_bytes); - let mut struct_index = 0; - let rows: Vec = rows - .iter() - .flat_map(|row| match row { - Value::Array(values) if !values.is_empty() => { - values.iter().for_each(|value| { - if !value.is_null() { - bit_util::set_bit( - null_buffer.as_slice_mut(), - struct_index, - ); - } - struct_index += 1; - }); - values.clone() - } - _ => { - vec![] - } - }) - .collect(); - let arrays = self.build_struct_array(rows.as_slice(), fields, &None)?; - let data_type = DataType::Struct(fields.clone()); - let buf = null_buffer.into(); - unsafe { - ArrayDataBuilder::new(data_type) - .len(rows.len()) - .null_bit_buffer(Some(buf)) - .child_data(arrays.into_iter().map(|a| a.into_data()).collect()) - .build_unchecked() - } - } - datatype => { - return Err(ArrowError::JsonError(format!( - "Nested list of {datatype:?} not supported" - ))); - } - }; - // build list - let list_data = ArrayData::builder(DataType::List(list_field.clone())) - .len(list_len) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_child_data(array_data) - 
.null_bit_buffer(Some(list_nulls.into())); - let list_data = unsafe { list_data.build_unchecked() }; - Ok(Arc::new(GenericListArray::::from(list_data))) - } - - /// Builds the child values of a `StructArray`, falling short of constructing the StructArray. - /// The function does not construct the StructArray as some callers would want the child arrays. - /// - /// *Note*: The function is recursive, and will read nested structs. - /// - /// If `projection` is &None, then all values are returned. The first level of projection - /// occurs at the `RecordBatch` level. No further projection currently occurs, but would be - /// useful if plucking values from a struct, e.g. getting `a.b.c.e` from `a.b.c.{d, e}`. - fn build_struct_array( - &self, - rows: &[Value], - struct_fields: &Fields, - projection: &Option>, - ) -> Result, ArrowError> { - let arrays: Result, ArrowError> = struct_fields - .iter() - .filter(|field| { - projection - .as_ref() - .map(|p| p.contains(field.name())) - .unwrap_or(true) - }) - .map(|field| { - match field.data_type() { - DataType::Null => { - Ok(Arc::new(NullArray::new(rows.len())) as ArrayRef) - } - DataType::Boolean => self.build_boolean_array(rows, field.name()), - DataType::Float64 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::Float32 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::Int64 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::Int32 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::Int16 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::Int8 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::UInt64 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::UInt32 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::UInt16 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::UInt8 => { - self.build_primitive_array::(rows, field.name()) - } - // TODO: this is incomplete - DataType::Timestamp(unit, _) => match unit { - TimeUnit::Second => self - .build_primitive_array::( - rows, - field.name(), - ), - TimeUnit::Microsecond => self - .build_primitive_array::( - rows, - field.name(), - ), - TimeUnit::Millisecond => self - .build_primitive_array::( - rows, - field.name(), - ), - TimeUnit::Nanosecond => self - .build_primitive_array::( - rows, - field.name(), - ), - }, - DataType::Date64 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::Date32 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::Time64(unit) => match unit { - TimeUnit::Microsecond => self - .build_primitive_array::( - rows, - field.name(), - ), - TimeUnit::Nanosecond => self - .build_primitive_array::( - rows, - field.name(), - ), - t => Err(ArrowError::JsonError(format!( - "TimeUnit {t:?} not supported with Time64" - ))), - }, - DataType::Time32(unit) => match unit { - TimeUnit::Second => self - .build_primitive_array::( - rows, - field.name(), - ), - TimeUnit::Millisecond => self - .build_primitive_array::( - rows, - field.name(), - ), - t => Err(ArrowError::JsonError(format!( - "TimeUnit {t:?} not supported with Time32" - ))), - }, - DataType::Utf8 => Ok(Arc::new( - rows.iter() - .map(|row| { - let maybe_value = row.get(field.name()); - maybe_value.and_then(|value| value.as_str()) - }) - .collect::(), - ) as ArrayRef), - DataType::Binary => Ok(Arc::new( - rows.iter() - .map(|row| { - let maybe_value = row.get(field.name()); - maybe_value.and_then(|value| 
value.as_str()) - }) - .collect::(), - ) as ArrayRef), - DataType::List(ref list_field) => { - match list_field.data_type() { - DataType::Dictionary(ref key_ty, _) => { - self.build_wrapped_list_array(rows, field.name(), key_ty) - } - _ => { - // extract rows by name - let extracted_rows = rows - .iter() - .map(|row| { - row.get(field.name()) - .cloned() - .unwrap_or(Value::Null) - }) - .collect::>(); - self.build_nested_list_array::( - extracted_rows.as_slice(), - list_field, - ) - } - } - } - DataType::Dictionary(ref key_ty, ref val_ty) => self - .build_string_dictionary_array( - rows, - field.name(), - key_ty, - val_ty, - ), - DataType::Struct(fields) => { - let len = rows.len(); - let num_bytes = bit_util::ceil(len, 8); - let mut null_buffer = MutableBuffer::from_len_zeroed(num_bytes); - let struct_rows = rows - .iter() - .enumerate() - .map(|(i, row)| { - (i, row.as_object().and_then(|v| v.get(field.name()))) - }) - .map(|(i, v)| match v { - // we want the field as an object, if it's not, we treat as null - Some(Value::Object(value)) => { - bit_util::set_bit(null_buffer.as_slice_mut(), i); - Value::Object(value.clone()) - } - _ => Value::Object(Default::default()), - }) - .collect::>(); - let arrays = - self.build_struct_array(&struct_rows, fields, &None)?; - // construct a struct array's data in order to set null buffer - let data_type = DataType::Struct(fields.clone()); - let data = ArrayDataBuilder::new(data_type) - .len(len) - .null_bit_buffer(Some(null_buffer.into())) - .child_data( - arrays.into_iter().map(|a| a.into_data()).collect(), - ); - let data = unsafe { data.build_unchecked() }; - Ok(make_array(data)) - } - DataType::Map(map_field, _) => self.build_map_array( - rows, - field.name(), - field.data_type(), - map_field, - ), - DataType::Decimal128(precision, scale) => self - .build_decimal128_array(rows, field.name(), *precision, *scale), - DataType::Decimal256(precision, scale) => self - .build_decimal256_array(rows, field.name(), *precision, *scale), - _ => Err(ArrowError::JsonError(format!( - "{:?} type is not supported", - field.data_type() - ))), - } - }) - .collect(); - arrays - } - - fn build_map_array( - &self, - rows: &[Value], - field_name: &str, - map_type: &DataType, - struct_field: &Field, - ) -> Result { - // A map has the format {"key": "value"} where key is most commonly a string, - // but could be a string, number or boolean (🤷🏾‍♂️) (e.g. {1: "value"}). - // A map is also represented as a flattened contiguous array, with the number - // of key-value pairs being separated by a list offset. - // If row 1 has 2 key-value pairs, and row 2 has 3, the offsets would be - // [0, 2, 5]. 
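// For instance (illustrative rows, not from the test data):
//   {"m": {"a": 1, "b": 2}}          -> entries ("a", 1), ("b", 2)
//   {"m": {"x": 7, "y": 8, "z": 9}}  -> entries ("x", 7), ("y", 8), ("z", 9)
// flattened keys:   ["a", "b", "x", "y", "z"]
// flattened values: [ 1,   2,   7,   8,   9 ]
// list offsets:     [0, 2, 5]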
- // - // Thus we try to read a map by iterating through the keys and values - - let (key_field, value_field) = - if let DataType::Struct(fields) = struct_field.data_type() { - if fields.len() != 2 { - return Err(ArrowError::InvalidArgumentError(format!( - "DataType::Map expects a struct with 2 fields, found {} fields", - fields.len() - ))); - } - (&fields[0], &fields[1]) - } else { - return Err(ArrowError::InvalidArgumentError(format!( - "JSON map array builder expects a DataType::Map, found {:?}", - struct_field.data_type() - ))); - }; - let value_map_iter = rows.iter().map(|value| { - value - .get(field_name) - .and_then(|v| v.as_object().map(|map| (map, map.len() as i32))) - }); - let rows_len = rows.len(); - let mut list_offsets = Vec::with_capacity(rows_len + 1); - list_offsets.push(0i32); - let mut last_offset = 0; - let num_bytes = bit_util::ceil(rows_len, 8); - let mut list_bitmap = MutableBuffer::from_len_zeroed(num_bytes); - let null_data = list_bitmap.as_slice_mut(); - - let struct_rows = value_map_iter - .enumerate() - .filter_map(|(i, v)| match v { - Some((map, len)) => { - list_offsets.push(last_offset + len); - last_offset += len; - bit_util::set_bit(null_data, i); - Some(map.iter().map(|(k, v)| { - json!({ - key_field.name(): k, - value_field.name(): v - }) - })) - } - None => { - list_offsets.push(last_offset); - None - } - }) - .flatten() - .collect::>(); - - let struct_children = self.build_struct_array( - struct_rows.as_slice(), - &Fields::from([key_field.clone(), value_field.clone()]), - &None, - )?; - - unsafe { - Ok(make_array(ArrayData::new_unchecked( - map_type.clone(), - rows_len, - None, - Some(list_bitmap.into()), - 0, - vec![Buffer::from_slice_ref(&list_offsets)], - vec![ArrayData::new_unchecked( - struct_field.data_type().clone(), - struct_children[0].len(), - None, - None, - 0, - vec![], - struct_children - .into_iter() - .map(|array| array.into_data()) - .collect(), - )], - ))) - } - } - - #[inline(always)] - fn build_dictionary_array( - &self, - rows: &[Value], - col_name: &str, - ) -> Result - where - T::Native: num::NumCast, - T: ArrowPrimitiveType + ArrowDictionaryKeyType, - { - let mut builder: StringDictionaryBuilder = - self.build_string_dictionary_builder(rows.len()); - for row in rows { - if let Some(value) = row.get(col_name) { - if let Some(str_v) = value.as_str() { - builder.append(str_v).map(drop)? - } else { - builder.append_null(); - } - } else { - builder.append_null(); - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) - } - - /// Read the primitive list's values into ArrayData - fn read_primitive_list_values(&self, rows: &[Value]) -> ArrayData - where - T: ArrowPrimitiveType, - T::Native: num::NumCast, - { - let values = rows - .iter() - .flat_map(|row| { - // read values from list - if let Value::Array(values) = row { - values - .iter() - .map(|value| { - let v: Option = - value.as_f64().and_then(num::cast::cast); - v - }) - .collect::>>() - } else if let Value::Number(value) = row { - // handle the scalar number case - let v: Option = value.as_f64().and_then(num::cast::cast); - v.map(|v| vec![Some(v)]).unwrap_or_default() - } else { - vec![] - } - }) - .collect::>>(); - let array = values.iter().collect::>(); - array.into_data() - } -} - -/// Reads a JSON value as a string, regardless of its type. -/// This is useful if the expected datatype is a string, in which case we preserve -/// all the values regardless of they type. 
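/// For example, a JSON `true` becomes `Some("true")`, the number `1.5` becomes
/// `Some("1.5")`, and a JSON string `"text"` yields `Some("text")` without the
/// surrounding quotes, while JSON `null` maps to `None` rather than the string "null".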
-/// -/// Applying `value.to_string()` unfortunately results in an escaped string, which -/// is not what we want. -#[inline(always)] -fn json_value_as_string(value: &Value) -> Option { - match value { - Value::Null => None, - Value::String(string) => Some(string.clone()), - _ => Some(value.to_string()), - } -} - -/// Flattens a list of JSON values, by flattening lists, and treating all other values as -/// single-value lists. -/// This is used to read into nested lists (list of list, list of struct) and non-dictionary lists. -#[inline] -fn flatten_json_values(values: &[Value]) -> Vec { - values - .iter() - .flat_map(|row| { - if let Value::Array(values) = row { - values.clone() - } else if let Value::Null = row { - vec![Value::Null] - } else { - // we interpret a scalar as a single-value list to minimise data loss - vec![row.clone()] - } - }) - .collect() -} - -/// Flattens a list into string values, dropping Value::Null in the process. -/// This is useful for interpreting any JSON array as string, dropping nulls. -/// See `json_value_as_string`. -#[inline] -fn flatten_json_string_values(values: &[Value]) -> Vec> { - values - .iter() - .flat_map(|row| { - if let Value::Array(values) = row { - values - .iter() - .map(json_value_as_string) - .collect::>>() - } else if let Value::Null = row { - vec![] - } else { - vec![json_value_as_string(row)] - } - }) - .collect::>>() -} -/// JSON file reader -/// -/// Note: Consider instead using [`RawReader`] which is faster and will -/// eventually replace this implementation as part of [#3610] -/// -/// [`RawReader`]: crate::raw::RawReader -/// [#3610]: https://github.com/apache/arrow-rs/issues/3610 -#[derive(Debug)] -#[deprecated(note = "Use RawReader instead")] -#[allow(deprecated)] -pub struct Reader { - reader: BufReader, - /// JSON value decoder - decoder: Decoder, -} - -#[allow(deprecated)] -impl Reader { - /// Create a new JSON Reader from any value that implements the `Read` trait. - /// - /// If reading a `File`, you can customise the Reader, such as to enable schema - /// inference, use `ReaderBuilder`. - pub fn new(reader: R, schema: SchemaRef, options: DecoderOptions) -> Self { - Self::from_buf_reader(BufReader::new(reader), schema, options) - } - - /// Create a new JSON Reader from a `BufReader` - /// - /// To customize the schema, such as to enable schema inference, use `ReaderBuilder` - pub fn from_buf_reader( - reader: BufReader, - schema: SchemaRef, - options: DecoderOptions, - ) -> Self { - Self { - reader, - decoder: Decoder::new(schema, options), - } - } - - /// Returns the schema of the reader, useful for getting the schema without reading - /// record batches - pub fn schema(&self) -> SchemaRef { - self.decoder.schema() - } - - /// Read the next batch of records - #[allow(clippy::should_implement_trait)] - pub fn next(&mut self) -> Result, ArrowError> { - self.decoder - .next_batch(&mut ValueIter::new(&mut self.reader, None)) - } -} - -/// JSON file reader builder -/// -/// Note: Consider instead using [`RawReaderBuilder`] which is faster and will -/// eventually replace this implementation as part of [#3610] -/// -/// [`RawReaderBuilder`]: crate::raw::RawReaderBuilder -/// [#3610]: https://github.com/apache/arrow-rs/issues/3610 -/// -#[derive(Debug, Default)] -#[deprecated(note = "Use RawReaderBuilder instead")] -pub struct ReaderBuilder { - /// Optional schema for the JSON file - /// - /// If the schema is not supplied, the reader will try to infer the schema - /// based on the JSON structure. 
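/// Supplying a schema via [`ReaderBuilder::with_schema`] skips inference entirely,
/// while calling [`ReaderBuilder::infer_schema`] clears any previously supplied schema.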
- schema: Option, - /// Optional maximum number of records to read during schema inference - /// - /// If a number is not provided, all the records are read. - max_records: Option, - /// Options for json decoder - options: DecoderOptions, -} - -#[allow(deprecated)] -impl ReaderBuilder { - /// Create a new builder for configuring JSON parsing options. - /// - /// To convert a builder into a reader, call `Reader::from_builder` - /// - /// # Example - /// - /// ``` - /// # use std::fs::File; - /// - /// fn example() -> arrow_json::Reader { - /// let file = File::open("test/data/basic.json").unwrap(); - /// - /// // create a builder, inferring the schema with the first 100 records - /// let builder = arrow_json::ReaderBuilder::new().infer_schema(Some(100)); - /// - /// let reader = builder.build::(file).unwrap(); - /// - /// reader - /// } - /// ``` - pub fn new() -> Self { - Self::default() - } - - /// Set the JSON file's schema - pub fn with_schema(mut self, schema: SchemaRef) -> Self { - self.schema = Some(schema); - self - } - - /// Set the JSON reader to infer the schema of the file - pub fn infer_schema(mut self, max_records: Option) -> Self { - // remove any schema that is set - self.schema = None; - self.max_records = max_records; - self - } - - /// Set the batch size (number of records to load at one time) - pub fn with_batch_size(mut self, batch_size: usize) -> Self { - self.options = self.options.with_batch_size(batch_size); - self - } - - /// Set the reader's column projection - pub fn with_projection(mut self, projection: Vec) -> Self { - self.options = self.options.with_projection(projection); - self - } - - /// Set the decoder's format Strings param - pub fn with_format_strings( - mut self, - format_strings: HashMap, - ) -> Self { - self.options = self.options.with_format_strings(format_strings); - self - } - - /// Create a new `Reader` from the `ReaderBuilder` - pub fn build(self, source: R) -> Result, ArrowError> - where - R: Read + Seek, - { - let mut buf_reader = BufReader::new(source); - - // check if schema should be inferred - let schema = match self.schema { - Some(schema) => schema, - None => Arc::new(infer_json_schema_from_seekable( - &mut buf_reader, - self.max_records, - )?), - }; - - Ok(Reader::from_buf_reader(buf_reader, schema, self.options)) - } -} - -#[allow(deprecated)] -impl Iterator for Reader { - type Item = Result; - - fn next(&mut self) -> Option { - self.next().transpose() - } -} - -#[cfg(test)] -#[allow(deprecated)] -mod tests { - use super::*; - use arrow_array::cast::AsArray; - use arrow_buffer::{ArrowNativeType, ToByteSlice}; - use arrow_schema::DataType::{Dictionary, List}; - use flate2::read::GzDecoder; - use std::fs::File; - use std::io::Cursor; - - #[test] - fn test_json_basic() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(7, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(0, a.0); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!(1, b.0); - assert_eq!(&DataType::Float64, b.1.data_type()); - let c = schema.column_with_name("c").unwrap(); - assert_eq!(2, c.0); - assert_eq!(&DataType::Boolean, c.1.data_type()); - 
let d = schema.column_with_name("d").unwrap(); - assert_eq!(3, d.0); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1, aa.value(0)); - assert_eq!(-10, aa.value(1)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(2.0, bb.value(0)); - assert_eq!(-3.5, bb.value(1)); - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!cc.value(0)); - assert!(cc.value(10)); - let dd = batch - .column(d.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!("4", dd.value(0)); - assert_eq!("text", dd.value(8)); - } - - #[test] - fn test_json_empty_projection() { - let builder = ReaderBuilder::new() - .infer_schema(None) - .with_batch_size(64) - .with_projection(vec![]); - let mut reader: Reader = builder - .build::(File::open("test/data/basic.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(0, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - } - - #[test] - fn test_json_basic_with_nulls() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(4, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!(&DataType::Float64, b.1.data_type()); - let c = schema.column_with_name("c").unwrap(); - assert_eq!(&DataType::Boolean, c.1.data_type()); - let d = schema.column_with_name("d").unwrap(); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(aa.is_valid(0)); - assert!(!aa.is_valid(1)); - assert!(!aa.is_valid(11)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(bb.is_valid(0)); - assert!(!bb.is_valid(2)); - assert!(!bb.is_valid(11)); - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(cc.is_valid(0)); - assert!(!cc.is_valid(4)); - assert!(!cc.is_valid(11)); - let dd = batch - .column(d.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(!dd.is_valid(0)); - assert!(dd.is_valid(1)); - assert!(!dd.is_valid(4)); - assert!(!dd.is_valid(11)); - } - - #[test] - fn test_json_basic_schema() { - let schema = Schema::new(vec![ - Field::new("a", DataType::Int32, true), - Field::new("b", DataType::Float32, false), - Field::new("c", DataType::Boolean, false), - Field::new("d", DataType::Utf8, false), - ]); - - let mut reader: Reader = Reader::new( - File::open("test/data/basic.json").unwrap(), - Arc::new(schema.clone()), - DecoderOptions::new(), - ); - let reader_schema = reader.schema(); - assert_eq!(reader_schema, Arc::new(schema)); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(4, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = batch.schema(); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int32, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!(&DataType::Float32, b.1.data_type()); - let c = schema.column_with_name("c").unwrap(); - assert_eq!(&DataType::Boolean, c.1.data_type()); - 
let d = schema.column_with_name("d").unwrap(); - assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1, aa.value(0)); - // test that a 64bit value is returned as null due to overflowing - assert!(!aa.is_valid(11)); - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(2.0, bb.value(0)); - assert_eq!(-3.5, bb.value(1)); - } - - #[test] - fn test_json_format_strings_for_date() { - let schema = Arc::new(Schema::new(vec![Field::new("e", DataType::Date32, true)])); - let e = schema.column_with_name("e").unwrap(); - assert_eq!(&DataType::Date32, e.1.data_type()); - let mut fmts = HashMap::new(); - let date_format = "%Y-%m-%d".to_string(); - fmts.insert("e".to_string(), date_format.clone()); - - let mut reader: Reader = Reader::new( - File::open("test/data/basic.json").unwrap(), - schema.clone(), - DecoderOptions::new().with_format_strings(fmts), - ); - let reader_schema = reader.schema(); - assert_eq!(reader_schema, schema); - let batch = reader.next().unwrap().unwrap(); - - let ee = batch - .column(e.0) - .as_any() - .downcast_ref::() - .unwrap(); - let dt = Date32Type::parse_formatted("1970-1-2", &date_format).unwrap(); - assert_eq!(dt, ee.value(0)); - let dt = Date32Type::parse_formatted("1969-12-31", &date_format).unwrap(); - assert_eq!(dt, ee.value(1)); - assert!(!ee.is_valid(2)); - } - - #[test] - fn test_json_basic_schema_projection() { - // We test implicit and explicit projection: - // Implicit: omitting fields from a schema - // Explicit: supplying a vec of fields to take - let schema = Schema::new(vec![ - Field::new("a", DataType::Int32, true), - Field::new("b", DataType::Float32, false), - Field::new("c", DataType::Boolean, false), - ]); - - let mut reader: Reader = Reader::new( - File::open("test/data/basic.json").unwrap(), - Arc::new(schema), - DecoderOptions::new().with_projection(vec!["a".to_string(), "c".to_string()]), - ); - let reader_schema = reader.schema(); - let expected_schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int32, true), - Field::new("c", DataType::Boolean, false), - ])); - assert_eq!(reader_schema, expected_schema); - - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(2, batch.num_columns()); - assert_eq!(2, batch.schema().fields().len()); - assert_eq!(12, batch.num_rows()); - - let schema = batch.schema(); - assert_eq!(reader_schema, schema); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(0, a.0); - assert_eq!(&DataType::Int32, a.1.data_type()); - let c = schema.column_with_name("c").unwrap(); - assert_eq!(1, c.0); - assert_eq!(&DataType::Boolean, c.1.data_type()); - } - - #[test] - fn test_json_arrays() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/arrays.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(4, batch.num_columns()); - assert_eq!(3, batch.num_rows()); - - let schema = batch.schema(); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!( - &DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), - b.1.data_type() - ); - let c = schema.column_with_name("c").unwrap(); - assert_eq!( - &DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), - c.1.data_type() - ); - let d = schema.column_with_name("d").unwrap(); 
- assert_eq!(&DataType::Utf8, d.1.data_type()); - - let aa = batch.column(a.0).as_primitive::(); - assert_eq!(1, aa.value(0)); - assert_eq!(-10, aa.value(1)); - assert_eq!(1627668684594000000, aa.value(2)); - let bb = batch.column(b.0).as_list::(); - let bb = bb.values().as_primitive::(); - assert_eq!(9, bb.len()); - assert_eq!(2.0, bb.value(0)); - assert_eq!(-6.1, bb.value(5)); - assert!(!bb.is_valid(7)); - - let cc = batch - .column(c.0) - .as_any() - .downcast_ref::() - .unwrap(); - let cc = cc.values().as_boolean(); - assert_eq!(6, cc.len()); - assert!(!cc.value(0)); - assert!(!cc.value(4)); - assert!(!cc.is_valid(5)); - } - - #[test] - fn test_invalid_json_infer_schema() { - let re = - infer_json_schema_from_seekable(&mut BufReader::new(Cursor::new(b"}")), None); - assert_eq!( - re.err().unwrap().to_string(), - "Json error: Not valid JSON: expected value at line 1 column 1", - ); - } - - #[test] - fn test_invalid_json_read_record() { - let schema = Arc::new(Schema::new(vec![Field::new( - "a", - DataType::Struct(vec![Field::new("a", DataType::Utf8, true)].into()), - true, - )])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader = builder.build(Cursor::new(b"}")).unwrap(); - assert_eq!( - reader.next().err().unwrap().to_string(), - "Json error: Not valid JSON: expected value at line 1 column 1", - ); - } - - #[test] - fn test_coercion_scalar_and_list() { - use arrow_schema::DataType::*; - - assert_eq!( - List(Arc::new(Field::new("item", Float64, true))), - coerce_data_type(vec![ - &Float64, - &List(Arc::new(Field::new("item", Float64, true))) - ]) - ); - assert_eq!( - List(Arc::new(Field::new("item", Float64, true))), - coerce_data_type(vec![ - &Float64, - &List(Arc::new(Field::new("item", Int64, true))) - ]) - ); - assert_eq!( - List(Arc::new(Field::new("item", Int64, true))), - coerce_data_type(vec![ - &Int64, - &List(Arc::new(Field::new("item", Int64, true))) - ]) - ); - // boolean and number are incompatible, return utf8 - assert_eq!( - List(Arc::new(Field::new("item", Utf8, true))), - coerce_data_type(vec![ - &Boolean, - &List(Arc::new(Field::new("item", Float64, true))) - ]) - ); - } - - #[test] - fn test_mixed_json_arrays() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/mixed_arrays.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap(); - let mut reader = BufReader::new(GzDecoder::new(&file)); - let schema = infer_json_schema(&mut reader, None).unwrap(); - file.rewind().unwrap(); - - let reader = BufReader::new(GzDecoder::new(&file)); - let options = DecoderOptions::new().with_batch_size(64); - let mut reader = Reader::from_buf_reader(reader, Arc::new(schema), options); - let batch_gz = reader.next().unwrap().unwrap(); - - for batch in vec![batch, batch_gz] { - assert_eq!(4, batch.num_columns()); - assert_eq!(4, batch.num_rows()); - - let schema = batch.schema(); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, a.1.data_type()); - let b = schema.column_with_name("b").unwrap(); - assert_eq!( - &DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), - b.1.data_type() - ); - let c = schema.column_with_name("c").unwrap(); - assert_eq!( - &DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), - c.1.data_type() - ); - let d = schema.column_with_name("d").unwrap(); - assert_eq!( - 
&DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), - d.1.data_type() - ); - - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::() - .unwrap(); - let bb = bb.values().as_primitive::(); - assert_eq!(10, bb.len()); - assert_eq!(4.0, bb.value(9)); - - let cc = batch.column(c.0).as_list::(); - // test that the list offsets are correct - assert_eq!(cc.value_offsets(), &[0, 2, 2, 4, 5]); - let cc = cc.values().as_boolean(); - let cc_expected = BooleanArray::from(vec![ - Some(false), - Some(true), - Some(false), - None, - Some(false), - ]); - assert_eq!(cc, &cc_expected); - - let dd = batch.column(d.0).as_list::(); - // test that the list offsets are correct - assert_eq!(dd.value_offsets(), &[0, 1, 1, 2, 6]); - - let dd = dd.values().as_string::(); - // values are 6 because a `d: null` is treated as a null slot - // and a list's null slot can be omitted from the child (i.e. same offset) - assert_eq!(6, dd.len()); - assert_eq!("text", dd.value(1)); - assert_eq!("1", dd.value(2)); - assert_eq!("false", dd.value(3)); - assert_eq!("array", dd.value(4)); - assert_eq!("2.4", dd.value(5)); - } - } - - #[test] - fn test_nested_struct_json_arrays() { - let c_field = Field::new( - "c", - DataType::Struct(vec![Field::new("d", DataType::Utf8, true)].into()), - true, - ); - let a_field = Field::new( - "a", - DataType::Struct(Fields::from(vec![ - Field::new("b", DataType::Boolean, true), - c_field.clone(), - ])), - true, - ); - let schema = Arc::new(Schema::new(vec![a_field.clone()])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/nested_structs.json").unwrap()) - .unwrap(); - - // build expected output - let d = StringArray::from(vec![Some("text"), None, Some("text"), None]); - let c = ArrayDataBuilder::new(c_field.data_type().clone()) - .len(4) - .add_child_data(d.into_data()) - .null_bit_buffer(Some(Buffer::from(vec![0b00000101]))) - .build() - .unwrap(); - let b = BooleanArray::from(vec![Some(true), Some(false), Some(true), None]); - let a = ArrayDataBuilder::new(a_field.data_type().clone()) - .len(4) - .add_child_data(b.into_data()) - .add_child_data(c) - .null_bit_buffer(Some(Buffer::from(vec![0b00000111]))) - .build() - .unwrap(); - let expected = make_array(a); - - // compare `a` with result from json reader - let batch = reader.next().unwrap().unwrap(); - let read = batch.column(0); - assert_eq!(&expected, read); - } - - #[test] - fn test_nested_list_json_arrays() { - let c_field = Field::new( - "c", - DataType::Struct(vec![Field::new("d", DataType::Utf8, true)].into()), - true, - ); - let a_struct_field = Field::new( - "a", - DataType::Struct(Fields::from(vec![ - Field::new("b", DataType::Boolean, true), - c_field.clone(), - ])), - true, - ); - let a_field = - Field::new("a", DataType::List(Arc::new(a_struct_field.clone())), true); - let schema = Arc::new(Schema::new(vec![a_field.clone()])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let json_content = r#" - {"a": [{"b": true, "c": {"d": "a_text"}}, {"b": false, "c": {"d": "b_text"}}]} - {"a": [{"b": false, "c": null}]} - {"a": [{"b": true, "c": {"d": "c_text"}}, {"b": null, "c": {"d": "d_text"}}, {"b": true, "c": {"d": null}}]} - {"a": null} - {"a": []} - {"a": [null]} - "#; - let mut reader = builder.build(Cursor::new(json_content)).unwrap(); - - // build expected output - let d = StringArray::from(vec![ - Some("a_text"), - Some("b_text"), - None, - Some("c_text"), - Some("d_text"), - 
None, - None, - ]); - let c = ArrayDataBuilder::new(c_field.data_type().clone()) - .len(7) - .add_child_data(d.to_data()) - .null_bit_buffer(Some(Buffer::from(vec![0b00111011]))) - .build() - .unwrap(); - let b = BooleanArray::from(vec![ - Some(true), - Some(false), - Some(false), - Some(true), - None, - Some(true), - None, - ]); - let a = ArrayDataBuilder::new(a_struct_field.data_type().clone()) - .len(7) - .add_child_data(b.to_data()) - .add_child_data(c.clone()) - .null_bit_buffer(Some(Buffer::from(vec![0b00111111]))) - .build() - .unwrap(); - let a_list = ArrayDataBuilder::new(a_field.data_type().clone()) - .len(6) - .add_buffer(Buffer::from_slice_ref([0i32, 2, 3, 6, 6, 6, 7])) - .add_child_data(a) - .null_bit_buffer(Some(Buffer::from(vec![0b00110111]))) - .build() - .unwrap(); - let expected = make_array(a_list); - - // compare `a` with result from json reader - let batch = reader.next().unwrap().unwrap(); - let read = batch.column(0); - assert_eq!(read.len(), 6); - // compare the arrays the long way around, to better detect differences - let read: &ListArray = read.as_list::(); - let expected = expected.as_list::(); - assert_eq!(read.value_offsets(), &[0, 2, 3, 6, 6, 6, 7]); - // compare list null buffers - assert_eq!(read.nulls(), expected.nulls()); - // build struct from list - let struct_array = read.values().as_struct(); - let expected_struct_array = expected.values().as_struct(); - - assert_eq!(7, struct_array.len()); - assert_eq!(1, struct_array.null_count()); - assert_eq!(7, expected_struct_array.len()); - assert_eq!(1, expected_struct_array.null_count()); - // test struct's nulls - assert_eq!(struct_array.nulls(), expected_struct_array.nulls()); - // test struct's fields - let read_b = struct_array.column(0); - assert_eq!(b.data_ref(), read_b.data_ref()); - let read_c = struct_array.column(1); - assert_eq!(&c, read_c.data_ref()); - let read_c: &StructArray = read_c.as_any().downcast_ref::().unwrap(); - let read_d = read_c.column(0); - assert_eq!(d.data_ref(), read_d.data_ref()); - - assert_eq!(read.data_ref(), expected.data_ref()); - } - - #[test] - fn test_map_json_arrays() { - let account_field = Field::new("account", DataType::UInt16, false); - let value_list_type = - DataType::List(Arc::new(Field::new("item", DataType::Utf8, false))); - let entries_struct_type = DataType::Struct(Fields::from(vec![ - Field::new("key", DataType::Utf8, false), - Field::new("value", value_list_type.clone(), true), - ])); - let stocks_field = Field::new( - "stocks", - DataType::Map( - Arc::new(Field::new("entries", entries_struct_type.clone(), false)), - false, - ), - true, - ); - let schema = Arc::new(Schema::new(vec![account_field, stocks_field.clone()])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - // Note: account 456 has 'long' twice, to show that the JSON reader will overwrite - // existing keys. 
This thus guarantees unique keys for the map - let json_content = r#" - {"account": 123, "stocks":{"long": ["$AAA", "$BBB"], "short": ["$CCC", "$D"]}} - {"account": 456, "stocks":{"long": null, "long": ["$AAA", "$CCC", "$D"], "short": null}} - {"account": 789, "stocks":{"hedged": ["$YYY"], "long": null, "short": ["$D"]}} - "#; - let mut reader = builder.build(Cursor::new(json_content)).unwrap(); - - // build expected output - let expected_accounts = UInt16Array::from(vec![123, 456, 789]); - - let expected_keys = StringArray::from(vec![ - "long", "short", "long", "short", "hedged", "long", "short", - ]) - .into_data(); - let expected_value_array_data = StringArray::from(vec![ - "$AAA", "$BBB", "$CCC", "$D", "$AAA", "$CCC", "$D", "$YYY", "$D", - ]) - .into_data(); - // Create the list that holds ["$_", "$_"] - let expected_values = ArrayDataBuilder::new(value_list_type) - .len(7) - .add_buffer(Buffer::from( - vec![0i32, 2, 4, 7, 7, 8, 8, 9].to_byte_slice(), - )) - .add_child_data(expected_value_array_data) - .null_bit_buffer(Some(Buffer::from(vec![0b01010111]))) - .build() - .unwrap(); - let expected_stocks_entries_data = ArrayDataBuilder::new(entries_struct_type) - .len(7) - .add_child_data(expected_keys) - .add_child_data(expected_values) - .build() - .unwrap(); - let expected_stocks_data = - ArrayDataBuilder::new(stocks_field.data_type().clone()) - .len(3) - .add_buffer(Buffer::from(vec![0i32, 2, 4, 7].to_byte_slice())) - .add_child_data(expected_stocks_entries_data) - .build() - .unwrap(); - - let expected_stocks = make_array(expected_stocks_data); - - // compare with result from json reader - let batch = reader.next().unwrap().unwrap(); - assert_eq!(batch.num_rows(), 3); - assert_eq!(batch.num_columns(), 2); - let col1 = batch.column(0); - assert_eq!(col1.as_ref(), &expected_accounts); - // Compare the map - let col2 = batch.column(1); - assert_eq!(col2.as_ref(), &expected_stocks); - } - - #[test] - fn test_dictionary_from_json_basic_with_nulls() { - let schema = Schema::new(vec![Field::new( - "d", - Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), - true, - )]); - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let d = schema.column_with_name("d").unwrap(); - assert_eq!( - &Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), - d.1.data_type() - ); - - let dd = batch - .column(d.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert!(!dd.is_valid(0)); - assert!(dd.is_valid(1)); - assert!(dd.is_valid(2)); - assert!(!dd.is_valid(11)); - - assert_eq!( - dd.keys(), - &Int16Array::from(vec![ - None, - Some(0), - Some(1), - Some(0), - None, - None, - Some(0), - None, - Some(1), - Some(0), - Some(0), - None - ]) - ); - } - - #[test] - fn test_dictionary_from_json_int8() { - let schema = Schema::new(vec![Field::new( - "d", - Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), - true, - )]); - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, 
batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let d = schema.column_with_name("d").unwrap(); - assert_eq!( - &Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), - d.1.data_type() - ); - } - - #[test] - fn test_dictionary_from_json_int32() { - let schema = Schema::new(vec![Field::new( - "d", - Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - )]); - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let d = schema.column_with_name("d").unwrap(); - assert_eq!( - &Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - d.1.data_type() - ); - } - - #[test] - fn test_dictionary_from_json_int64() { - let schema = Schema::new(vec![Field::new( - "d", - Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)), - true, - )]); - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let d = schema.column_with_name("d").unwrap(); - assert_eq!( - &Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)), - d.1.data_type() - ); - } - - #[test] - fn test_skip_empty_lines() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let json_content = " - {\"a\": 1} - - {\"a\": 2} - - {\"a\": 3}"; - let mut reader = builder.build(Cursor::new(json_content)).unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(3, batch.num_rows()); - - let schema = reader.schema(); - let c = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Int64, c.1.data_type()); - } - - #[test] - fn test_row_type_validation() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(64); - let json_content = " - [1, \"hello\"] - \"world\""; - let re = builder.build(Cursor::new(json_content)); - assert_eq!( - re.err().unwrap().to_string(), - r#"Json error: Expected JSON record to be an object, found Array [Number(1), String("hello")]"#, - ); - } - - #[test] - fn test_list_of_string_dictionary_from_json() { - let schema = Schema::new(vec![Field::new( - "events", - List(Arc::new(Field::new( - "item", - Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), - true, - ))), - true, - )]); - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/list_string_dict_nested.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(3, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let events = schema.column_with_name("events").unwrap(); - 
assert_eq!( - &List(Arc::new(Field::new( - "item", - Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), - true - ))), - events.1.data_type() - ); - - let evs_list = batch - .column(events.0) - .as_any() - .downcast_ref::() - .unwrap(); - let evs_list = evs_list.values().as_dictionary::(); - assert_eq!(6, evs_list.len()); - assert!(evs_list.is_valid(1)); - assert_eq!(DataType::Utf8, evs_list.value_type()); - - // dict from the events list - let dict_el = evs_list.values().as_string::(); - assert_eq!(3, dict_el.len()); - assert_eq!("Elect Leader", dict_el.value(0)); - assert_eq!("Do Ballot", dict_el.value(1)); - assert_eq!("Send Data", dict_el.value(2)); - } - - #[test] - fn test_list_of_string_dictionary_from_json_with_nulls() { - let schema = Schema::new(vec![Field::new( - "events", - List(Arc::new(Field::new( - "item", - Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), - true, - ))), - true, - )]); - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::( - File::open("test/data/list_string_dict_nested_nulls.json").unwrap(), - ) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(3, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let events = schema.column_with_name("events").unwrap(); - assert_eq!( - &List(Arc::new(Field::new( - "item", - Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), - true - ))), - events.1.data_type() - ); - - let evs_list = batch - .column(events.0) - .as_any() - .downcast_ref::() - .unwrap(); - let evs_list = evs_list.values().as_dictionary::(); - assert_eq!(8, evs_list.len()); - assert!(evs_list.is_valid(1)); - assert_eq!(DataType::Utf8, evs_list.value_type()); - - // dict from the events list - let dict_el = evs_list.values(); - let dict_el = dict_el.as_any().downcast_ref::().unwrap(); - assert_eq!(2, evs_list.null_count()); - assert_eq!(3, dict_el.len()); - assert_eq!("Elect Leader", dict_el.value(0)); - assert_eq!("Do Ballot", dict_el.value(1)); - assert_eq!("Send Data", dict_el.value(2)); - } - - #[test] - fn test_dictionary_from_json_uint8() { - let schema = Schema::new(vec![Field::new( - "d", - Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), - true, - )]); - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let d = schema.column_with_name("d").unwrap(); - assert_eq!( - &Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), - d.1.data_type() - ); - } - - #[test] - fn test_dictionary_from_json_uint32() { - let schema = Schema::new(vec![Field::new( - "d", - Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)), - true, - )]); - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = 
reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let d = schema.column_with_name("d").unwrap(); - assert_eq!( - &Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)), - d.1.data_type() - ); - } - - #[test] - fn test_dictionary_from_json_uint64() { - let schema = Schema::new(vec![Field::new( - "d", - Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), - true, - )]); - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let d = schema.column_with_name("d").unwrap(); - assert_eq!( - &Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), - d.1.data_type() - ); - } - - #[test] - fn test_with_multiple_batches() { - let builder = ReaderBuilder::new() - .infer_schema(Some(4)) - .with_batch_size(5); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - - let mut num_records = Vec::new(); - while let Some(rb) = reader.next().unwrap() { - num_records.push(rb.num_rows()); - } - - assert_eq!(vec![5, 5, 2], num_records); - } - - #[test] - fn test_json_infer_schema() { - let schema = Schema::new(vec![ - Field::new("a", DataType::Int64, true), - Field::new( - "b", - DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), - true, - ), - Field::new( - "c", - DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), - true, - ), - Field::new( - "d", - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), - true, - ), - ]); - - let mut reader = - BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); - let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); - - assert_eq!(inferred_schema, schema); - - let file = File::open("test/data/mixed_arrays.json.gz").unwrap(); - let mut reader = BufReader::new(GzDecoder::new(&file)); - let inferred_schema = infer_json_schema(&mut reader, None).unwrap(); - - assert_eq!(inferred_schema, schema); - } - - #[test] - fn test_json_infer_schema_nested_structs() { - let schema = Schema::new(vec![ - Field::new( - "c1", - DataType::Struct(Fields::from(vec![ - Field::new("a", DataType::Boolean, true), - Field::new( - "b", - DataType::Struct( - vec![Field::new("c", DataType::Utf8, true)].into(), - ), - true, - ), - ])), - true, - ), - Field::new("c2", DataType::Int64, true), - Field::new("c3", DataType::Utf8, true), - ]); - - let inferred_schema = infer_json_schema_from_iterator( - vec![ - Ok(serde_json::json!({"c1": {"a": true, "b": {"c": "text"}}, "c2": 1})), - Ok(serde_json::json!({"c1": {"a": false, "b": null}, "c2": 0})), - Ok(serde_json::json!({"c1": {"a": true, "b": {"c": "text"}}, "c3": "ok"})), - ] - .into_iter(), - ) - .unwrap(); - - assert_eq!(inferred_schema, schema); - } - - #[test] - fn test_json_infer_schema_struct_in_list() { - let schema = Schema::new(vec![ - Field::new( - "c1", - DataType::List(Arc::new(Field::new( - "item", - DataType::Struct(Fields::from(vec![ - Field::new("a", DataType::Utf8, true), - Field::new("b", DataType::Int64, true), - Field::new("c", DataType::Boolean, true), - ])), - true, - ))), - true, - ), - Field::new("c2", DataType::Float64, true), - 
Field::new( - "c3", - // empty json array's inner types are inferred as null - DataType::List(Arc::new(Field::new("item", DataType::Null, true))), - true, - ), - ]); - - let inferred_schema = infer_json_schema_from_iterator( - vec![ - Ok(serde_json::json!({ - "c1": [{"a": "foo", "b": 100}], "c2": 1, "c3": [], - })), - Ok(serde_json::json!({ - "c1": [{"a": "bar", "b": 2}, {"a": "foo", "c": true}], "c2": 0, "c3": [], - })), - Ok(serde_json::json!({"c1": [], "c2": 0.5, "c3": []})), - ] - .into_iter(), - ) - .unwrap(); - - assert_eq!(inferred_schema, schema); - } - - #[test] - fn test_json_infer_schema_nested_list() { - let schema = Schema::new(vec![ - Field::new( - "c1", - DataType::List(Arc::new(Field::new( - "item", - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), - true, - ))), - true, - ), - Field::new("c2", DataType::Float64, true), - ]); - - let inferred_schema = infer_json_schema_from_iterator( - vec![ - Ok(serde_json::json!({ - "c1": [], - "c2": 12, - })), - Ok(serde_json::json!({ - "c1": [["a", "b"], ["c"]], - })), - Ok(serde_json::json!({ - "c1": [["foo"]], - "c2": 0.11, - })), - ] - .into_iter(), - ) - .unwrap(); - - assert_eq!(inferred_schema, schema); - } - - #[test] - fn test_timestamp_from_json_seconds() { - let schema = Schema::new(vec![Field::new( - "a", - DataType::Timestamp(TimeUnit::Second, None), - true, - )]); - - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!( - &DataType::Timestamp(TimeUnit::Second, None), - a.1.data_type() - ); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(aa.is_valid(0)); - assert!(!aa.is_valid(1)); - assert!(!aa.is_valid(2)); - assert_eq!(1, aa.value(0)); - assert_eq!(1, aa.value(3)); - assert_eq!(5, aa.value(7)); - } - - #[test] - fn test_timestamp_from_json_milliseconds() { - let schema = Schema::new(vec![Field::new( - "a", - DataType::Timestamp(TimeUnit::Millisecond, None), - true, - )]); - - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!( - &DataType::Timestamp(TimeUnit::Millisecond, None), - a.1.data_type() - ); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(aa.is_valid(0)); - assert!(!aa.is_valid(1)); - assert!(!aa.is_valid(2)); - assert_eq!(1, aa.value(0)); - assert_eq!(1, aa.value(3)); - assert_eq!(5, aa.value(7)); - } - - #[test] - fn test_date_from_json_milliseconds() { - let schema = Schema::new(vec![Field::new("a", DataType::Date64, true)]); - - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let 
batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Date64, a.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(aa.is_valid(0)); - assert!(!aa.is_valid(1)); - assert!(!aa.is_valid(2)); - assert_eq!(1, aa.value(0)); - assert_eq!(1, aa.value(3)); - assert_eq!(5, aa.value(7)); - } - - #[test] - fn test_time_from_json_nanoseconds() { - let schema = Schema::new(vec![Field::new( - "a", - DataType::Time64(TimeUnit::Nanosecond), - true, - )]); - - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(1, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let a = schema.column_with_name("a").unwrap(); - assert_eq!(&DataType::Time64(TimeUnit::Nanosecond), a.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(aa.is_valid(0)); - assert!(!aa.is_valid(1)); - assert!(!aa.is_valid(2)); - assert_eq!(1, aa.value(0)); - assert_eq!(1, aa.value(3)); - assert_eq!(5, aa.value(7)); - } - - #[test] - fn test_time_from_string() { - parse_string_column::(4); - parse_string_column::(4); - parse_string_column::(4); - parse_string_column::(4); - } - - fn parse_string_column(value: T::Native) - where - T: ArrowPrimitiveType, - { - let schema = Schema::new(vec![Field::new("d", T::DATA_TYPE, true)]); - - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic_nulls.json").unwrap()) - .unwrap(); - - let batch = reader.next().unwrap().unwrap(); - let dd = batch - .column(0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!(value, dd.value(1)); - assert!(!dd.is_valid(2)); - } - - #[test] - fn test_json_read_nested_list() { - let schema = Schema::new(vec![Field::new( - "c1", - DataType::List(Arc::new(Field::new( - "item", - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), - true, - ))), - true, - )]); - - let decoder = Decoder::new(Arc::new(schema), DecoderOptions::new()); - let batch = decoder - .next_batch( - &mut vec![ - Ok(serde_json::json!({ - "c1": [], - })), - Ok(serde_json::json!({ - "c1": [["a", "b"], ["c"], ["e", "f"], ["g"], ["h"], ["i"], ["j"], ["k"]], - })), - Ok(serde_json::json!({ - "c1": [["foo"], ["bar"]], - })), - ] - .into_iter(), - ) - .unwrap() - .unwrap(); - - assert_eq!(batch.num_columns(), 1); - assert_eq!(batch.num_rows(), 3); - } - - #[test] - fn test_json_read_list_of_structs() { - let schema = Schema::new(vec![Field::new( - "c1", - DataType::List(Arc::new(Field::new( - "item", - DataType::Struct(vec![Field::new("a", DataType::Int64, true)].into()), - true, - ))), - true, - )]); - - let decoder = Decoder::new(Arc::new(schema), DecoderOptions::new()); - let batch = decoder - .next_batch( - // NOTE: total struct element count needs to be greater than - // bit_util::ceil(array_count, 8) to test validity bit buffer length calculation - // logic - &mut vec![ - Ok(serde_json::json!({ - "c1": [{"a": 
1}], - })), - Ok(serde_json::json!({ - "c1": [{"a": 2}, {"a": 3}, {"a": 4}, {"a": 5}, {"a": 6}, {"a": 7}], - })), - Ok(serde_json::json!({ - "c1": [{"a": 10}, {"a": 11}], - })), - ] - .into_iter(), - ) - .unwrap() - .unwrap(); - - assert_eq!(batch.num_columns(), 1); - assert_eq!(batch.num_rows(), 3); - } - - #[test] - fn test_json_read_binary_structs() { - let schema = Schema::new(vec![Field::new("c1", DataType::Binary, true)]); - let decoder = Decoder::new(Arc::new(schema), DecoderOptions::new()); - let batch = decoder - .next_batch( - &mut vec![ - Ok(serde_json::json!({ - "c1": "₁₂₃", - })), - Ok(serde_json::json!({ - "c1": "foo", - })), - ] - .into_iter(), - ) - .unwrap() - .unwrap(); - let data = batch.columns().iter().collect::>(); - - let schema = Schema::new(vec![Field::new("c1", DataType::Binary, true)]); - let binary_values = BinaryArray::from(vec!["₁₂₃".as_bytes(), "foo".as_bytes()]); - let expected_batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(binary_values)]) - .unwrap(); - let expected_data = expected_batch.columns().iter().collect::>(); - - assert_eq!(data, expected_data); - assert_eq!(batch.num_columns(), 1); - assert_eq!(batch.num_rows(), 2); - } - - #[test] - fn test_json_iterator() { - let builder = ReaderBuilder::new().infer_schema(None).with_batch_size(5); - let reader: Reader = builder - .build::(File::open("test/data/basic.json").unwrap()) - .unwrap(); - let schema = reader.schema(); - let (col_a_index, _) = schema.column_with_name("a").unwrap(); - - let mut sum_num_rows = 0; - let mut num_batches = 0; - let mut sum_a = 0; - for batch in reader { - let batch = batch.unwrap(); - assert_eq!(7, batch.num_columns()); - sum_num_rows += batch.num_rows(); - num_batches += 1; - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - let a_array = batch - .column(col_a_index) - .as_any() - .downcast_ref::() - .unwrap(); - sum_a += (0..a_array.len()).map(|i| a_array.value(i)).sum::(); - } - assert_eq!(12, sum_num_rows); - assert_eq!(3, num_batches); - assert_eq!(100000000000011, sum_a); - } - - #[test] - fn test_options_clone() { - // ensure options have appropriate derivation - let options = DecoderOptions::new().with_batch_size(64); - let cloned = options.clone(); - assert_eq!(options, cloned); - } - - pub fn decimal_json_tests(data_type: DataType) { - let schema = Schema::new(vec![ - Field::new("a", data_type.clone(), true), - Field::new("b", data_type.clone(), true), - Field::new("f", data_type.clone(), true), - ]); - - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/basic.json").unwrap()) - .unwrap(); - let batch = reader.next().unwrap().unwrap(); - - assert_eq!(3, batch.num_columns()); - assert_eq!(12, batch.num_rows()); - - let schema = reader.schema(); - let batch_schema = batch.schema(); - assert_eq!(schema, batch_schema); - - let a = schema.column_with_name("a").unwrap(); - let b = schema.column_with_name("b").unwrap(); - let f = schema.column_with_name("f").unwrap(); - assert_eq!(&data_type, a.1.data_type()); - assert_eq!(&data_type, b.1.data_type()); - assert_eq!(&data_type, f.1.data_type()); - - let aa = batch - .column(a.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!(T::Native::usize_as(100), aa.value(0)); - assert_eq!(T::Native::usize_as(100), aa.value(3)); - assert_eq!(T::Native::usize_as(500), aa.value(7)); - - let bb = batch - .column(b.0) - .as_any() - .downcast_ref::>() - .unwrap(); - 
assert_eq!(T::Native::usize_as(200), bb.value(0)); - assert_eq!(T::Native::usize_as(350).neg_wrapping(), bb.value(1)); - assert_eq!(T::Native::usize_as(60), bb.value(8)); - - let ff = batch - .column(f.0) - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!(T::Native::usize_as(102), ff.value(0)); - assert_eq!(T::Native::usize_as(30).neg_wrapping(), ff.value(1)); - assert_eq!(T::Native::usize_as(137722), ff.value(2)); - - assert_eq!(T::Native::usize_as(133700), ff.value(3)); - assert_eq!(T::Native::usize_as(9999999999), ff.value(7)); - } - - #[test] - fn test_decimal_from_json() { - decimal_json_tests::(DataType::Decimal128(10, 2)); - decimal_json_tests::(DataType::Decimal256(10, 2)); - } -} diff --git a/arrow-json/src/raw/boolean_array.rs b/arrow-json/src/reader/boolean_array.rs similarity index 94% rename from arrow-json/src/raw/boolean_array.rs rename to arrow-json/src/reader/boolean_array.rs index 12917785e5b0..9a7f226805da 100644 --- a/arrow-json/src/raw/boolean_array.rs +++ b/arrow-json/src/reader/boolean_array.rs @@ -20,8 +20,8 @@ use arrow_array::Array; use arrow_data::ArrayData; use arrow_schema::ArrowError; -use crate::raw::tape::{Tape, TapeElement}; -use crate::raw::{tape_error, ArrayDecoder}; +use crate::reader::tape::{Tape, TapeElement}; +use crate::reader::{tape_error, ArrayDecoder}; #[derive(Default)] pub struct BooleanArrayDecoder {} diff --git a/arrow-json/src/raw/decimal_array.rs b/arrow-json/src/reader/decimal_array.rs similarity index 96% rename from arrow-json/src/raw/decimal_array.rs rename to arrow-json/src/reader/decimal_array.rs index 0518b4cef7c4..508409ec75bd 100644 --- a/arrow-json/src/raw/decimal_array.rs +++ b/arrow-json/src/reader/decimal_array.rs @@ -24,8 +24,8 @@ use arrow_cast::parse::parse_decimal; use arrow_data::ArrayData; use arrow_schema::ArrowError; -use crate::raw::tape::{Tape, TapeElement}; -use crate::raw::{tape_error, ArrayDecoder}; +use crate::reader::tape::{Tape, TapeElement}; +use crate::reader::{tape_error, ArrayDecoder}; pub struct DecimalArrayDecoder { precision: u8, diff --git a/arrow-json/src/raw/list_array.rs b/arrow-json/src/reader/list_array.rs similarity index 97% rename from arrow-json/src/raw/list_array.rs rename to arrow-json/src/reader/list_array.rs index a57f4273369b..ac35f998876c 100644 --- a/arrow-json/src/raw/list_array.rs +++ b/arrow-json/src/reader/list_array.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::raw::tape::{Tape, TapeElement}; -use crate::raw::{make_decoder, tape_error, ArrayDecoder}; +use crate::reader::tape::{Tape, TapeElement}; +use crate::reader::{make_decoder, tape_error, ArrayDecoder}; use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; use arrow_array::OffsetSizeTrait; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; diff --git a/arrow-json/src/raw/map_array.rs b/arrow-json/src/reader/map_array.rs similarity index 97% rename from arrow-json/src/raw/map_array.rs rename to arrow-json/src/reader/map_array.rs index dee142bef6db..3662e594ba90 100644 --- a/arrow-json/src/raw/map_array.rs +++ b/arrow-json/src/reader/map_array.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use crate::raw::tape::{Tape, TapeElement}; -use crate::raw::{make_decoder, tape_error, ArrayDecoder}; +use crate::reader::tape::{Tape, TapeElement}; +use crate::reader::{make_decoder, tape_error, ArrayDecoder}; use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_buffer::ArrowNativeType; diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/reader/mod.rs similarity index 68% rename from arrow-json/src/raw/mod.rs rename to arrow-json/src/reader/mod.rs index 38b4cce9bd9a..d36493a47c88 100644 --- a/arrow-json/src/raw/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -15,13 +15,15 @@ // specific language governing permissions and limitations // under the License. -//! A faster JSON reader that will eventually replace [`Reader`] +//! JSON reader //! -//! [`Reader`]: crate::reader::Reader +//! This JSON reader allows JSON line-delimited files to be read into the Arrow memory +//! model. Records are loaded in batches and are then converted from row-based data to +//! columnar data. //! //! # Basic Usage //! -//! [`RawReader`] can be used directly with synchronous data sources, such as [`std::fs::File`] +//! [`Reader`] can be used directly with synchronous data sources, such as [`std::fs::File`] //! //! ``` //! # use arrow_schema::*; @@ -37,13 +39,13 @@ //! //! let file = File::open("test/data/basic.json").unwrap(); //! -//! let mut json = arrow_json::RawReaderBuilder::new(schema).build(BufReader::new(file)).unwrap(); +//! let mut json = arrow_json::ReaderBuilder::new(schema).build(BufReader::new(file)).unwrap(); //! let batch = json.next().unwrap().unwrap(); //! ``` //! //! # Async Usage //! -//! The lower-level [`RawDecoder`] can be integrated with various forms of async data streams, +//! The lower-level [`Decoder`] can be integrated with various forms of async data streams, //! and is designed to be agnostic to the various different kinds of async IO primitives found //! within the Rust ecosystem. //! @@ -55,10 +57,10 @@ //! # use arrow_schema::ArrowError; //! # use futures::stream::{Stream, StreamExt}; //! # use arrow_array::RecordBatch; -//! # use arrow_json::RawDecoder; +//! # use arrow_json::reader::Decoder; //! # //! fn decode_stream + Unpin>( -//! mut decoder: RawDecoder, +//! mut decoder: Decoder, //! mut input: S, //! ) -> impl Stream> { //! let mut buffered = Bytes::new(); @@ -97,10 +99,10 @@ //! # use futures::{Stream, TryStreamExt}; //! # use tokio::io::AsyncBufRead; //! # use arrow_array::RecordBatch; -//! # use arrow_json::RawDecoder; +//! # use arrow_json::reader::Decoder; //! # use arrow_schema::ArrowError; //! fn decode_stream( -//! mut decoder: RawDecoder, +//! mut decoder: Decoder, //! mut reader: R, //! ) -> impl Stream> { //! futures::stream::poll_fn(move |cx| { @@ -127,46 +129,51 @@ //! ``` //! 
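A minimal sketch of the renamed entry point documented above (ReaderBuilder in place of RawReaderBuilder), assuming only the API surface visible in this diff; the schema, field name, and sample rows are illustrative rather than taken from the repository's test data:

```
use std::io::Cursor;
use std::sync::Arc;

use arrow_array::cast::AsArray;
use arrow_array::types::Int64Type;
use arrow_schema::{DataType, Field, Schema};

fn main() {
    // The schema must be known up front; here a single nullable Int64 column "a".
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, true)]));

    // Newline-delimited JSON held in memory; any `BufRead` source works the same way.
    let json = "{\"a\": 1}\n{\"a\": 2}\n{\"a\": null}\n";

    let mut reader = arrow_json::ReaderBuilder::new(schema)
        .with_batch_size(1024)
        .build(Cursor::new(json))
        .unwrap();

    // `Reader` is an iterator of `Result<RecordBatch, ArrowError>`.
    let batch = reader.next().unwrap().unwrap();
    assert_eq!(batch.num_rows(), 3);
    assert_eq!(batch.column(0).as_primitive::<Int64Type>().value(0), 1);
}
```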
-use crate::raw::boolean_array::BooleanArrayDecoder; -use crate::raw::decimal_array::DecimalArrayDecoder; -use crate::raw::list_array::ListArrayDecoder; -use crate::raw::map_array::MapArrayDecoder; -use crate::raw::primitive_array::PrimitiveArrayDecoder; -use crate::raw::string_array::StringArrayDecoder; -use crate::raw::struct_array::StructArrayDecoder; -use crate::raw::tape::{Tape, TapeDecoder, TapeElement}; -use crate::raw::timestamp_array::TimestampArrayDecoder; +use std::io::BufRead; + +use chrono::Utc; +use serde::Serialize; + use arrow_array::timezone::Tz; use arrow_array::types::Float32Type; use arrow_array::types::*; -use arrow_array::{downcast_integer, make_array, RecordBatch, RecordBatchReader}; +use arrow_array::{downcast_integer, RecordBatch, RecordBatchReader, StructArray}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, SchemaRef, TimeUnit}; -use chrono::Utc; -use serde::Serialize; -use std::io::BufRead; +pub use schema::*; + +use crate::reader::boolean_array::BooleanArrayDecoder; +use crate::reader::decimal_array::DecimalArrayDecoder; +use crate::reader::list_array::ListArrayDecoder; +use crate::reader::map_array::MapArrayDecoder; +use crate::reader::primitive_array::PrimitiveArrayDecoder; +use crate::reader::string_array::StringArrayDecoder; +use crate::reader::struct_array::StructArrayDecoder; +use crate::reader::tape::{Tape, TapeDecoder, TapeElement}; +use crate::reader::timestamp_array::TimestampArrayDecoder; mod boolean_array; mod decimal_array; mod list_array; mod map_array; mod primitive_array; +mod schema; mod serializer; mod string_array; mod struct_array; mod tape; mod timestamp_array; -/// A builder for [`RawReader`] and [`RawDecoder`] -pub struct RawReaderBuilder { +/// A builder for [`Reader`] and [`Decoder`] +pub struct ReaderBuilder { batch_size: usize, coerce_primitive: bool, schema: SchemaRef, } -impl RawReaderBuilder { - /// Create a new [`RawReaderBuilder`] with the provided [`SchemaRef`] +impl ReaderBuilder { + /// Create a new [`ReaderBuilder`] with the provided [`SchemaRef`] /// /// This could be obtained using [`infer_json_schema`] if not known /// @@ -194,16 +201,16 @@ impl RawReaderBuilder { } } - /// Create a [`RawReader`] with the provided [`BufRead`] - pub fn build(self, reader: R) -> Result, ArrowError> { - Ok(RawReader { + /// Create a [`Reader`] with the provided [`BufRead`] + pub fn build(self, reader: R) -> Result, ArrowError> { + Ok(Reader { reader, decoder: self.build_decoder()?, }) } - /// Create a [`RawDecoder`] - pub fn build_decoder(self) -> Result { + /// Create a [`Decoder`] + pub fn build_decoder(self) -> Result { let decoder = make_decoder( DataType::Struct(self.schema.fields.clone()), self.coerce_primitive, @@ -211,7 +218,7 @@ impl RawReaderBuilder { )?; let num_fields = self.schema.all_fields().len(); - Ok(RawDecoder { + Ok(Decoder { decoder, tape_decoder: TapeDecoder::new(self.batch_size, num_fields), batch_size: self.batch_size, @@ -222,26 +229,21 @@ impl RawReaderBuilder { /// Reads JSON data with a known schema directly into arrow [`RecordBatch`] /// -/// This is significantly faster than [`Reader`] and eventually intended -/// to replace it ([#3610](https://github.com/apache/arrow-rs/issues/3610)) -/// /// Lines consisting solely of ASCII whitespace are ignored -/// -/// [`Reader`]: crate::reader::Reader -pub struct RawReader { +pub struct Reader { reader: R, - decoder: RawDecoder, + decoder: Decoder, } -impl std::fmt::Debug for RawReader { +impl std::fmt::Debug for Reader { fn fmt(&self, f: &mut 
std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("RawReader") + f.debug_struct("Reader") .field("decoder", &self.decoder) .finish() } } -impl RawReader { +impl Reader { /// Reads the next [`RecordBatch`] returning `Ok(None)` if EOF fn read(&mut self) -> Result, ArrowError> { loop { @@ -261,7 +263,7 @@ impl RawReader { } } -impl Iterator for RawReader { +impl Iterator for Reader { type Item = Result; fn next(&mut self) -> Option { @@ -269,7 +271,7 @@ impl Iterator for RawReader { } } -impl RecordBatchReader for RawReader { +impl RecordBatchReader for Reader { fn schema(&self) -> SchemaRef { self.decoder.schema.clone() } @@ -277,7 +279,7 @@ impl RecordBatchReader for RawReader { /// A low-level interface for reading JSON data from a byte stream /// -/// See [`RawReader`] for a higher-level interface for interface with [`BufRead`] +/// See [`Reader`] for a higher-level interface for interface with [`BufRead`] /// /// The push-based interface facilitates integration with sources that yield arbitrarily /// delimited bytes ranges, such as [`BufRead`], or a chunked byte stream received from @@ -286,17 +288,17 @@ impl RecordBatchReader for RawReader { /// ``` /// # use std::io::BufRead; /// # use arrow_array::RecordBatch; -/// # use arrow_json::{RawDecoder, RawReaderBuilder}; +/// # use arrow_json::reader::{Decoder, ReaderBuilder}; /// # use arrow_schema::{ArrowError, SchemaRef}; /// # /// fn read_from_json( /// mut reader: R, /// schema: SchemaRef, /// ) -> Result>, ArrowError> { -/// let mut decoder = RawReaderBuilder::new(schema).build_decoder()?; +/// let mut decoder = ReaderBuilder::new(schema).build_decoder()?; /// let mut next = move || { /// loop { -/// // RawDecoder is agnostic that buf doesn't contain whole records +/// // Decoder is agnostic that buf doesn't contain whole records /// let buf = reader.fill_buf()?; /// if buf.is_empty() { /// break; // Input exhausted @@ -315,23 +317,23 @@ impl RecordBatchReader for RawReader { /// Ok(std::iter::from_fn(move || next().transpose())) /// } /// ``` -pub struct RawDecoder { +pub struct Decoder { tape_decoder: TapeDecoder, decoder: Box, batch_size: usize, schema: SchemaRef, } -impl std::fmt::Debug for RawDecoder { +impl std::fmt::Debug for Decoder { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("RawDecoder") + f.debug_struct("Decoder") .field("schema", &self.schema) .field("batch_size", &self.batch_size) .finish() } } -impl RawDecoder { +impl Decoder { /// Read JSON objects from `buf`, returning the number of bytes read /// /// This method returns once `batch_size` objects have been parsed since the @@ -344,7 +346,7 @@ impl RawDecoder { self.tape_decoder.decode(buf) } - /// Serialize `rows` to this [`RawDecoder`] + /// Serialize `rows` to this [`Decoder`] /// /// This provides a simple way to convert [serde]-compatible datastructures into arrow /// [`RecordBatch`]. 
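A minimal sketch of driving the push-based Decoder described above with arbitrarily sliced input, assuming only the decode/flush signatures shown in this diff; the field name, chunk size, and explicit batch size are illustrative:

```
use std::sync::Arc;

use arrow_schema::{DataType, Field, Schema};

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int64, true)]));
    let mut decoder = arrow_json::ReaderBuilder::new(schema)
        .with_batch_size(1024)
        .build_decoder()
        .unwrap();

    // Feed the input in arbitrary 3-byte slices; decode() tracks partial records
    // across calls, so slice boundaries do not have to line up with record boundaries.
    let data = b"{\"x\": 1}\n{\"x\": 2}\n";
    for chunk in data.chunks(3) {
        // With a batch size larger than the row count, each slice should be
        // consumed in full; a real caller would re-feed any unconsumed tail.
        let read = decoder.decode(chunk).unwrap();
        assert_eq!(read, chunk.len());
    }

    // flush() returns whatever complete rows have accumulated so far.
    let batch = decoder.flush().unwrap().unwrap();
    assert_eq!(batch.num_rows(), 2);
}
```

Because the decoder buffers partial records between calls, this is the same mechanism the preceding doc example uses with `fill_buf`/`consume`, just driven from pre-sliced byte chunks instead of a `BufRead`.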
@@ -360,12 +362,12 @@ impl RawDecoder { /// # use serde_json::{Value, json}; /// # use arrow_array::cast::AsArray; /// # use arrow_array::types::Float32Type; - /// # use arrow_json::RawReaderBuilder; + /// # use arrow_json::ReaderBuilder; /// # use arrow_schema::{DataType, Field, Schema}; /// let json = vec![json!({"float": 2.3}), json!({"float": 5.7})]; /// /// let schema = Schema::new(vec![Field::new("float", DataType::Float32, true)]); - /// let mut decoder = RawReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap(); + /// let mut decoder = ReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap(); /// /// decoder.serialize(&json).unwrap(); /// let batch = decoder.flush().unwrap().unwrap(); @@ -379,7 +381,7 @@ impl RawDecoder { /// /// ``` /// # use std::sync::Arc; - /// # use arrow_json::RawReaderBuilder; + /// # use arrow_json::ReaderBuilder; /// # use arrow_schema::{DataType, Field, Schema}; /// # use serde::Serialize; /// # use arrow_array::cast::AsArray; @@ -401,7 +403,7 @@ impl RawDecoder { /// MyStruct{ int32: 4, float: 67.53 }, /// ]; /// - /// let mut decoder = RawReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap(); + /// let mut decoder = ReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap(); /// decoder.serialize(&rows).unwrap(); /// /// let batch = decoder.flush().unwrap().unwrap(); @@ -421,7 +423,7 @@ impl RawDecoder { /// # use std::sync::Arc; /// # use arrow_array::StructArray; /// # use arrow_cast::display::{ArrayFormatter, FormatOptions}; - /// # use arrow_json::RawReaderBuilder; + /// # use arrow_json::ReaderBuilder; /// # use arrow_schema::{DataType, Field, Fields, Schema}; /// # use serde::Serialize; /// # @@ -501,7 +503,7 @@ impl RawDecoder { /// ]; /// /// let schema = Schema::new(MyStruct::fields()); - /// let mut decoder = RawReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap(); + /// let mut decoder = ReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap(); /// decoder.serialize(&data).unwrap(); /// let batch = decoder.flush().unwrap().unwrap(); /// assert_eq!(batch.num_rows(), 3); @@ -554,14 +556,8 @@ impl RawDecoder { assert_eq!(decoded.null_count(), 0); assert_eq!(decoded.len(), pos.len()); - // Clear out buffer - let columns = decoded - .child_data() - .iter() - .map(|x| make_array(x.clone())) - .collect(); - - let batch = RecordBatch::try_new(self.schema.clone(), columns)?; + let batch = RecordBatch::from(StructArray::from(decoded)) + .with_schema(self.schema.clone())?; Ok(Some(batch)) } } @@ -641,20 +637,25 @@ fn tape_error(d: TapeElement, expected: &str) -> ArrowError { } #[cfg(test)] -#[allow(deprecated)] mod tests { - use super::*; - use crate::reader::infer_json_schema; - use crate::ReaderBuilder; + use std::fs::File; + use std::io::{BufReader, Cursor, Seek}; + use std::sync::Arc; + use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; - use arrow_array::{Array, StructArray}; - use arrow_buffer::ArrowNativeType; + use arrow_array::{ + make_array, Array, BooleanArray, ListArray, StringArray, StructArray, + }; + use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_cast::display::{ArrayFormatter, FormatOptions}; + use arrow_data::ArrayDataBuilder; use arrow_schema::{DataType, Field, Schema}; - use std::fs::File; - use std::io::{BufReader, Cursor, Seek}; - use std::sync::Arc; + + use crate::reader::infer_json_schema; + use crate::ReaderBuilder; + + use super::*; fn do_read( buf: &str, @@ -666,7 +667,7 @@ mod tests { // Test with different batch sizes to test for boundary conditions for batch_size 
in [1, 3, 100, batch_size] { - unbuffered = RawReaderBuilder::new(schema.clone()) + unbuffered = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) .coerce_primitive(coerce_primitive) .build(Cursor::new(buf.as_bytes())) @@ -680,7 +681,7 @@ mod tests { // Test with different buffer sizes to test for boundary conditions for b in [1, 3, 5] { - let buffered = RawReaderBuilder::new(schema.clone()) + let buffered = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) .coerce_primitive(coerce_primitive) .build(BufReader::with_capacity(b, Cursor::new(buf.as_bytes()))) @@ -956,39 +957,12 @@ mod tests { assert_eq!(formatter.value(2).to_string(), "{c: null, a: [baz]}"); } - #[test] - fn integration_test() { - let files = [ - "test/data/basic.json", - "test/data/basic_nulls.json", - "test/data/list_string_dict_nested_nulls.json", - ]; - - for file in files { - let mut f = BufReader::new(File::open(file).unwrap()); - let schema = Arc::new(infer_json_schema(&mut f, None).unwrap()); - - f.rewind().unwrap(); - let a = ReaderBuilder::new() - .with_schema(schema.clone()) - .build(&mut f) - .unwrap(); - let a_result = a.into_iter().collect::, _>>().unwrap(); - - f.rewind().unwrap(); - let b = RawReaderBuilder::new(schema).build(f).unwrap(); - let b_result = b.into_iter().collect::, _>>().unwrap(); - - assert_eq!(a_result, b_result); - } - } - #[test] fn test_not_coercing_primitive_into_string_without_flag() { let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)])); let buf = r#"{"a": 1}"#; - let result = RawReaderBuilder::new(schema.clone()) + let result = ReaderBuilder::new(schema.clone()) .with_batch_size(1024) .build(Cursor::new(buf.as_bytes())) .unwrap() @@ -1001,7 +975,7 @@ mod tests { ); let buf = r#"{"a": true}"#; - let result = RawReaderBuilder::new(schema) + let result = ReaderBuilder::new(schema) .with_batch_size(1024) .build(Cursor::new(buf.as_bytes())) .unwrap() @@ -1337,20 +1311,20 @@ mod tests { vec![Field::new("bar", child, false)], true, )])); - let mut reader = RawReaderBuilder::new(schema.clone()) + let mut reader = ReaderBuilder::new(schema.clone()) .build(Cursor::new(non_null.as_bytes())) .unwrap(); assert!(reader.next().unwrap().is_err()); // Should error as not nullable let null = r#"{"foo": {bar: null}}"#; - let mut reader = RawReaderBuilder::new(schema.clone()) + let mut reader = ReaderBuilder::new(schema.clone()) .build(Cursor::new(null.as_bytes())) .unwrap(); assert!(reader.next().unwrap().is_err()); // Should error as not nullable // Test nulls in nullable parent can mask nulls in non-nullable child let null = r#"{"foo": null}"#; - let mut reader = RawReaderBuilder::new(schema) + let mut reader = ReaderBuilder::new(schema) .build(Cursor::new(null.as_bytes())) .unwrap(); let batch = reader.next().unwrap().unwrap(); @@ -1399,4 +1373,497 @@ mod tests { let u64 = batches[0].column(1).as_primitive::(); assert_eq!(u64.values(), &[u64::MAX, u64::MAX, u64::MIN, u64::MIN]); } + + fn read_file(path: &str, schema: Option) -> Reader> { + let file = File::open(path).unwrap(); + let mut reader = BufReader::new(file); + let schema = schema.unwrap_or_else(|| { + let schema = infer_json_schema(&mut reader, None).unwrap(); + reader.rewind().unwrap(); + schema + }); + let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(64); + builder.build(reader).unwrap() + } + + #[test] + fn test_json_basic() { + let mut reader = read_file("test/data/basic.json", None); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(7, 
batch.num_columns()); + assert_eq!(12, batch.num_rows()); + + let schema = reader.schema(); + let batch_schema = batch.schema(); + assert_eq!(schema, batch_schema); + + let a = schema.column_with_name("a").unwrap(); + assert_eq!(0, a.0); + assert_eq!(&DataType::Int64, a.1.data_type()); + let b = schema.column_with_name("b").unwrap(); + assert_eq!(1, b.0); + assert_eq!(&DataType::Float64, b.1.data_type()); + let c = schema.column_with_name("c").unwrap(); + assert_eq!(2, c.0); + assert_eq!(&DataType::Boolean, c.1.data_type()); + let d = schema.column_with_name("d").unwrap(); + assert_eq!(3, d.0); + assert_eq!(&DataType::Utf8, d.1.data_type()); + + let aa = batch.column(a.0).as_primitive::(); + assert_eq!(1, aa.value(0)); + assert_eq!(-10, aa.value(1)); + let bb = batch.column(b.0).as_primitive::(); + assert_eq!(2.0, bb.value(0)); + assert_eq!(-3.5, bb.value(1)); + let cc = batch.column(c.0).as_boolean(); + assert!(!cc.value(0)); + assert!(cc.value(10)); + let dd = batch.column(d.0).as_string::(); + assert_eq!("4", dd.value(0)); + assert_eq!("text", dd.value(8)); + } + + #[test] + fn test_json_empty_projection() { + let mut reader = read_file("test/data/basic.json", Some(Schema::empty())); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(0, batch.num_columns()); + assert_eq!(12, batch.num_rows()); + } + + #[test] + fn test_json_basic_with_nulls() { + let mut reader = read_file("test/data/basic_nulls.json", None); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(4, batch.num_columns()); + assert_eq!(12, batch.num_rows()); + + let schema = reader.schema(); + let batch_schema = batch.schema(); + assert_eq!(schema, batch_schema); + + let a = schema.column_with_name("a").unwrap(); + assert_eq!(&DataType::Int64, a.1.data_type()); + let b = schema.column_with_name("b").unwrap(); + assert_eq!(&DataType::Float64, b.1.data_type()); + let c = schema.column_with_name("c").unwrap(); + assert_eq!(&DataType::Boolean, c.1.data_type()); + let d = schema.column_with_name("d").unwrap(); + assert_eq!(&DataType::Utf8, d.1.data_type()); + + let aa = batch.column(a.0).as_primitive::(); + assert!(aa.is_valid(0)); + assert!(!aa.is_valid(1)); + assert!(!aa.is_valid(11)); + let bb = batch.column(b.0).as_primitive::(); + assert!(bb.is_valid(0)); + assert!(!bb.is_valid(2)); + assert!(!bb.is_valid(11)); + let cc = batch.column(c.0).as_boolean(); + assert!(cc.is_valid(0)); + assert!(!cc.is_valid(4)); + assert!(!cc.is_valid(11)); + let dd = batch.column(d.0).as_string::(); + assert!(!dd.is_valid(0)); + assert!(dd.is_valid(1)); + assert!(!dd.is_valid(4)); + assert!(!dd.is_valid(11)); + } + + #[test] + fn test_json_basic_schema() { + let schema = Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Float32, false), + Field::new("c", DataType::Boolean, false), + Field::new("d", DataType::Utf8, false), + ]); + + let mut reader = read_file("test/data/basic.json", Some(schema.clone())); + let reader_schema = reader.schema(); + assert_eq!(reader_schema.as_ref(), &schema); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(4, batch.num_columns()); + assert_eq!(12, batch.num_rows()); + + let schema = batch.schema(); + + let a = schema.column_with_name("a").unwrap(); + assert_eq!(&DataType::Int64, a.1.data_type()); + let b = schema.column_with_name("b").unwrap(); + assert_eq!(&DataType::Float32, b.1.data_type()); + let c = schema.column_with_name("c").unwrap(); + assert_eq!(&DataType::Boolean, c.1.data_type()); + let d = schema.column_with_name("d").unwrap(); + 
assert_eq!(&DataType::Utf8, d.1.data_type()); + + let aa = batch.column(a.0).as_primitive::(); + assert_eq!(1, aa.value(0)); + assert_eq!(100000000000000, aa.value(11)); + let bb = batch.column(b.0).as_primitive::(); + assert_eq!(2.0, bb.value(0)); + assert_eq!(-3.5, bb.value(1)); + } + + #[test] + fn test_json_basic_schema_projection() { + let schema = Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("c", DataType::Boolean, false), + ]); + + let mut reader = read_file("test/data/basic.json", Some(schema.clone())); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(2, batch.num_columns()); + assert_eq!(2, batch.schema().fields().len()); + assert_eq!(12, batch.num_rows()); + + assert_eq!(batch.schema().as_ref(), &schema); + + let a = schema.column_with_name("a").unwrap(); + assert_eq!(0, a.0); + assert_eq!(&DataType::Int64, a.1.data_type()); + let c = schema.column_with_name("c").unwrap(); + assert_eq!(1, c.0); + assert_eq!(&DataType::Boolean, c.1.data_type()); + } + + #[test] + fn test_json_arrays() { + let mut reader = read_file("test/data/arrays.json", None); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(4, batch.num_columns()); + assert_eq!(3, batch.num_rows()); + + let schema = batch.schema(); + + let a = schema.column_with_name("a").unwrap(); + assert_eq!(&DataType::Int64, a.1.data_type()); + let b = schema.column_with_name("b").unwrap(); + assert_eq!( + &DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), + b.1.data_type() + ); + let c = schema.column_with_name("c").unwrap(); + assert_eq!( + &DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), + c.1.data_type() + ); + let d = schema.column_with_name("d").unwrap(); + assert_eq!(&DataType::Utf8, d.1.data_type()); + + let aa = batch.column(a.0).as_primitive::(); + assert_eq!(1, aa.value(0)); + assert_eq!(-10, aa.value(1)); + assert_eq!(1627668684594000000, aa.value(2)); + let bb = batch.column(b.0).as_list::(); + let bb = bb.values().as_primitive::(); + assert_eq!(9, bb.len()); + assert_eq!(2.0, bb.value(0)); + assert_eq!(-6.1, bb.value(5)); + assert!(!bb.is_valid(7)); + + let cc = batch + .column(c.0) + .as_any() + .downcast_ref::() + .unwrap(); + let cc = cc.values().as_boolean(); + assert_eq!(6, cc.len()); + assert!(!cc.value(0)); + assert!(!cc.value(4)); + assert!(!cc.is_valid(5)); + } + + #[test] + fn test_nested_list_json_arrays() { + let c_field = + Field::new_struct("c", vec![Field::new("d", DataType::Utf8, true)], true); + let a_struct_field = Field::new_struct( + "a", + vec![Field::new("b", DataType::Boolean, true), c_field.clone()], + true, + ); + let a_field = + Field::new("a", DataType::List(Arc::new(a_struct_field.clone())), true); + let schema = Arc::new(Schema::new(vec![a_field.clone()])); + let builder = ReaderBuilder::new(schema).with_batch_size(64); + let json_content = r#" + {"a": [{"b": true, "c": {"d": "a_text"}}, {"b": false, "c": {"d": "b_text"}}]} + {"a": [{"b": false, "c": null}]} + {"a": [{"b": true, "c": {"d": "c_text"}}, {"b": null, "c": {"d": "d_text"}}, {"b": true, "c": {"d": null}}]} + {"a": null} + {"a": []} + {"a": [null]} + "#; + let mut reader = builder.build(Cursor::new(json_content)).unwrap(); + + // build expected output + let d = StringArray::from(vec![ + Some("a_text"), + Some("b_text"), + None, + Some("c_text"), + Some("d_text"), + None, + None, + ]); + let c = ArrayDataBuilder::new(c_field.data_type().clone()) + .len(7) + .add_child_data(d.to_data()) + .null_bit_buffer(Some(Buffer::from(vec![0b00111011]))) 
+ .build() + .unwrap(); + let b = BooleanArray::from(vec![ + Some(true), + Some(false), + Some(false), + Some(true), + None, + Some(true), + None, + ]); + let a = ArrayDataBuilder::new(a_struct_field.data_type().clone()) + .len(7) + .add_child_data(b.to_data()) + .add_child_data(c.clone()) + .null_bit_buffer(Some(Buffer::from(vec![0b00111111]))) + .build() + .unwrap(); + let a_list = ArrayDataBuilder::new(a_field.data_type().clone()) + .len(6) + .add_buffer(Buffer::from_slice_ref([0i32, 2, 3, 6, 6, 6, 7])) + .add_child_data(a) + .null_bit_buffer(Some(Buffer::from(vec![0b00110111]))) + .build() + .unwrap(); + let expected = make_array(a_list); + + // compare `a` with result from json reader + let batch = reader.next().unwrap().unwrap(); + let read = batch.column(0); + assert_eq!(read.len(), 6); + // compare the arrays the long way around, to better detect differences + let read: &ListArray = read.as_list::(); + let expected = expected.as_list::(); + assert_eq!(read.value_offsets(), &[0, 2, 3, 6, 6, 6, 7]); + // compare list null buffers + assert_eq!(read.nulls(), expected.nulls()); + // build struct from list + let struct_array = read.values().as_struct(); + let expected_struct_array = expected.values().as_struct(); + + assert_eq!(7, struct_array.len()); + assert_eq!(1, struct_array.null_count()); + assert_eq!(7, expected_struct_array.len()); + assert_eq!(1, expected_struct_array.null_count()); + // test struct's nulls + assert_eq!(struct_array.nulls(), expected_struct_array.nulls()); + // test struct's fields + let read_b = struct_array.column(0); + assert_eq!(read_b.as_ref(), &b); + let read_c = struct_array.column(1); + assert_eq!(read_c.to_data(), c); + let read_c = read_c.as_struct(); + let read_d = read_c.column(0); + assert_eq!(read_d.as_ref(), &d); + + assert_eq!(read, expected); + } + + #[test] + fn test_skip_empty_lines() { + let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]); + let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(64); + let json_content = " + {\"a\": 1} + {\"a\": 2} + {\"a\": 3}"; + let mut reader = builder.build(Cursor::new(json_content)).unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(1, batch.num_columns()); + assert_eq!(3, batch.num_rows()); + + let schema = reader.schema(); + let c = schema.column_with_name("a").unwrap(); + assert_eq!(&DataType::Int64, c.1.data_type()); + } + + #[test] + fn test_with_multiple_batches() { + let file = File::open("test/data/basic_nulls.json").unwrap(); + let mut reader = BufReader::new(file); + let schema = infer_json_schema(&mut reader, None).unwrap(); + reader.rewind().unwrap(); + + let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(5); + let mut reader = builder.build(reader).unwrap(); + + let mut num_records = Vec::new(); + while let Some(rb) = reader.next().transpose().unwrap() { + num_records.push(rb.num_rows()); + } + + assert_eq!(vec![5, 5, 2], num_records); + } + + #[test] + fn test_timestamp_from_json_seconds() { + let schema = Schema::new(vec![Field::new( + "a", + DataType::Timestamp(TimeUnit::Second, None), + true, + )]); + + let mut reader = read_file("test/data/basic_nulls.json", Some(schema)); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(1, batch.num_columns()); + assert_eq!(12, batch.num_rows()); + + let schema = reader.schema(); + let batch_schema = batch.schema(); + assert_eq!(schema, batch_schema); + + let a = schema.column_with_name("a").unwrap(); + assert_eq!( + &DataType::Timestamp(TimeUnit::Second, None), + 
a.1.data_type() + ); + + let aa = batch.column(a.0).as_primitive::(); + assert!(aa.is_valid(0)); + assert!(!aa.is_valid(1)); + assert!(!aa.is_valid(2)); + assert_eq!(1, aa.value(0)); + assert_eq!(1, aa.value(3)); + assert_eq!(5, aa.value(7)); + } + + #[test] + fn test_timestamp_from_json_milliseconds() { + let schema = Schema::new(vec![Field::new( + "a", + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + )]); + + let mut reader = read_file("test/data/basic_nulls.json", Some(schema)); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(1, batch.num_columns()); + assert_eq!(12, batch.num_rows()); + + let schema = reader.schema(); + let batch_schema = batch.schema(); + assert_eq!(schema, batch_schema); + + let a = schema.column_with_name("a").unwrap(); + assert_eq!( + &DataType::Timestamp(TimeUnit::Millisecond, None), + a.1.data_type() + ); + + let aa = batch.column(a.0).as_primitive::(); + assert!(aa.is_valid(0)); + assert!(!aa.is_valid(1)); + assert!(!aa.is_valid(2)); + assert_eq!(1, aa.value(0)); + assert_eq!(1, aa.value(3)); + assert_eq!(5, aa.value(7)); + } + + #[test] + fn test_date_from_json_milliseconds() { + let schema = Schema::new(vec![Field::new("a", DataType::Date64, true)]); + + let mut reader = read_file("test/data/basic_nulls.json", Some(schema)); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(1, batch.num_columns()); + assert_eq!(12, batch.num_rows()); + + let schema = reader.schema(); + let batch_schema = batch.schema(); + assert_eq!(schema, batch_schema); + + let a = schema.column_with_name("a").unwrap(); + assert_eq!(&DataType::Date64, a.1.data_type()); + + let aa = batch.column(a.0).as_primitive::(); + assert!(aa.is_valid(0)); + assert!(!aa.is_valid(1)); + assert!(!aa.is_valid(2)); + assert_eq!(1, aa.value(0)); + assert_eq!(1, aa.value(3)); + assert_eq!(5, aa.value(7)); + } + + #[test] + fn test_time_from_json_nanoseconds() { + let schema = Schema::new(vec![Field::new( + "a", + DataType::Time64(TimeUnit::Nanosecond), + true, + )]); + + let mut reader = read_file("test/data/basic_nulls.json", Some(schema)); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(1, batch.num_columns()); + assert_eq!(12, batch.num_rows()); + + let schema = reader.schema(); + let batch_schema = batch.schema(); + assert_eq!(schema, batch_schema); + + let a = schema.column_with_name("a").unwrap(); + assert_eq!(&DataType::Time64(TimeUnit::Nanosecond), a.1.data_type()); + + let aa = batch.column(a.0).as_primitive::(); + assert!(aa.is_valid(0)); + assert!(!aa.is_valid(1)); + assert!(!aa.is_valid(2)); + assert_eq!(1, aa.value(0)); + assert_eq!(1, aa.value(3)); + assert_eq!(5, aa.value(7)); + } + + #[test] + fn test_json_iterator() { + let file = File::open("test/data/basic.json").unwrap(); + let mut reader = BufReader::new(file); + let schema = infer_json_schema(&mut reader, None).unwrap(); + reader.rewind().unwrap(); + + let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(5); + let reader = builder.build(reader).unwrap(); + let schema = reader.schema(); + let (col_a_index, _) = schema.column_with_name("a").unwrap(); + + let mut sum_num_rows = 0; + let mut num_batches = 0; + let mut sum_a = 0; + for batch in reader { + let batch = batch.unwrap(); + assert_eq!(7, batch.num_columns()); + sum_num_rows += batch.num_rows(); + num_batches += 1; + let batch_schema = batch.schema(); + assert_eq!(schema, batch_schema); + let a_array = batch.column(col_a_index).as_primitive::(); + sum_a += (0..a_array.len()).map(|i| a_array.value(i)).sum::(); + } + 
assert_eq!(12, sum_num_rows); + assert_eq!(3, num_batches); + assert_eq!(100000000000011, sum_a); + } } diff --git a/arrow-json/src/raw/primitive_array.rs b/arrow-json/src/reader/primitive_array.rs similarity index 97% rename from arrow-json/src/raw/primitive_array.rs rename to arrow-json/src/reader/primitive_array.rs index 6985821d65fe..2d45d9c45a3c 100644 --- a/arrow-json/src/raw/primitive_array.rs +++ b/arrow-json/src/reader/primitive_array.rs @@ -24,8 +24,8 @@ use arrow_cast::parse::Parser; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; -use crate::raw::tape::{Tape, TapeElement}; -use crate::raw::{tape_error, ArrayDecoder}; +use crate::reader::tape::{Tape, TapeElement}; +use crate::reader::{tape_error, ArrayDecoder}; /// A trait for JSON-specific primitive parsing logic /// diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs new file mode 100644 index 000000000000..22d25c8be27a --- /dev/null +++ b/arrow-json/src/reader/schema.rs @@ -0,0 +1,710 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_schema::{ArrowError, DataType, Field, Fields, Schema}; +use indexmap::map::IndexMap as HashMap; +use indexmap::set::IndexSet as HashSet; +use serde_json::Value; +use std::borrow::Borrow; +use std::io::{BufRead, BufReader, Read, Seek}; +use std::sync::Arc; + +#[derive(Debug, Clone)] +enum InferredType { + Scalar(HashSet), + Array(Box), + Object(HashMap), + Any, +} + +impl InferredType { + fn merge(&mut self, other: InferredType) -> Result<(), ArrowError> { + match (self, other) { + (InferredType::Array(s), InferredType::Array(o)) => { + s.merge(*o)?; + } + (InferredType::Scalar(self_hs), InferredType::Scalar(other_hs)) => { + other_hs.into_iter().for_each(|v| { + self_hs.insert(v); + }); + } + (InferredType::Object(self_map), InferredType::Object(other_map)) => { + for (k, v) in other_map { + self_map.entry(k).or_insert(InferredType::Any).merge(v)?; + } + } + (s @ InferredType::Any, v) => { + *s = v; + } + (_, InferredType::Any) => {} + // convert a scalar type to a single-item scalar array type. + ( + InferredType::Array(self_inner_type), + other_scalar @ InferredType::Scalar(_), + ) => { + self_inner_type.merge(other_scalar)?; + } + (s @ InferredType::Scalar(_), InferredType::Array(mut other_inner_type)) => { + other_inner_type.merge(s.clone())?; + *s = InferredType::Array(other_inner_type); + } + // incompatible types + (s, o) => { + return Err(ArrowError::JsonError(format!( + "Incompatible type found during schema inference: {s:?} v.s. 
{o:?}", + ))); + } + } + + Ok(()) + } +} + +/// Coerce data type during inference +/// +/// * `Int64` and `Float64` should be `Float64` +/// * Lists and scalars are coerced to a list of a compatible scalar +/// * All other types are coerced to `Utf8` +fn coerce_data_type(dt: Vec<&DataType>) -> DataType { + let mut dt_iter = dt.into_iter().cloned(); + let dt_init = dt_iter.next().unwrap_or(DataType::Utf8); + + dt_iter.fold(dt_init, |l, r| match (l, r) { + (DataType::Boolean, DataType::Boolean) => DataType::Boolean, + (DataType::Int64, DataType::Int64) => DataType::Int64, + (DataType::Float64, DataType::Float64) + | (DataType::Float64, DataType::Int64) + | (DataType::Int64, DataType::Float64) => DataType::Float64, + (DataType::List(l), DataType::List(r)) => DataType::List(Arc::new(Field::new( + "item", + coerce_data_type(vec![l.data_type(), r.data_type()]), + true, + ))), + // coerce scalar and scalar array into scalar array + (DataType::List(e), not_list) | (not_list, DataType::List(e)) => { + DataType::List(Arc::new(Field::new( + "item", + coerce_data_type(vec![e.data_type(), ¬_list]), + true, + ))) + } + _ => DataType::Utf8, + }) +} + +fn generate_datatype(t: &InferredType) -> Result { + Ok(match t { + InferredType::Scalar(hs) => coerce_data_type(hs.iter().collect()), + InferredType::Object(spec) => DataType::Struct(generate_fields(spec)?), + InferredType::Array(ele_type) => DataType::List(Arc::new(Field::new( + "item", + generate_datatype(ele_type)?, + true, + ))), + InferredType::Any => DataType::Null, + }) +} + +fn generate_fields(spec: &HashMap) -> Result { + spec.iter() + .map(|(k, types)| Ok(Field::new(k, generate_datatype(types)?, true))) + .collect() +} + +/// Generate schema from JSON field names and inferred data types +fn generate_schema(spec: HashMap) -> Result { + Ok(Schema::new(generate_fields(&spec)?)) +} + +/// JSON file reader that produces a serde_json::Value iterator from a Read trait +/// +/// # Example +/// +/// ``` +/// use std::fs::File; +/// use std::io::BufReader; +/// use arrow_json::reader::ValueIter; +/// +/// let mut reader = +/// BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); +/// let mut value_reader = ValueIter::new(&mut reader, None); +/// for value in value_reader { +/// println!("JSON value: {}", value.unwrap()); +/// } +/// ``` +#[derive(Debug)] +pub struct ValueIter<'a, R: Read> { + reader: &'a mut BufReader, + max_read_records: Option, + record_count: usize, + // reuse line buffer to avoid allocation on each record + line_buf: String, +} + +impl<'a, R: Read> ValueIter<'a, R> { + /// Creates a new `ValueIter` + pub fn new(reader: &'a mut BufReader, max_read_records: Option) -> Self { + Self { + reader, + max_read_records, + record_count: 0, + line_buf: String::new(), + } + } +} + +impl<'a, R: Read> Iterator for ValueIter<'a, R> { + type Item = Result; + + fn next(&mut self) -> Option { + if let Some(max) = self.max_read_records { + if self.record_count >= max { + return None; + } + } + + loop { + self.line_buf.truncate(0); + match self.reader.read_line(&mut self.line_buf) { + Ok(0) => { + // read_line returns 0 when stream reached EOF + return None; + } + Err(e) => { + return Some(Err(ArrowError::JsonError(format!( + "Failed to read JSON record: {e}" + )))); + } + _ => { + let trimmed_s = self.line_buf.trim(); + if trimmed_s.is_empty() { + // ignore empty lines + continue; + } + + self.record_count += 1; + return Some(serde_json::from_str(trimmed_s).map_err(|e| { + ArrowError::JsonError(format!("Not valid JSON: {e}")) + })); + } + 
} + } + } +} + +/// Infer the fields of a JSON file by reading the first n records of the file, with +/// `max_read_records` controlling the maximum number of records to read. +/// +/// If `max_read_records` is not set, the whole file is read to infer its field types. +/// +/// Contrary to [`infer_json_schema`], this function will seek back to the start of the `reader`. +/// That way, the `reader` can be used immediately afterwards to create a [`Reader`]. +/// +/// # Examples +/// ``` +/// use std::fs::File; +/// use std::io::BufReader; +/// use arrow_json::reader::infer_json_schema_from_seekable; +/// +/// let file = File::open("test/data/mixed_arrays.json").unwrap(); +/// // file's cursor's offset at 0 +/// let mut reader = BufReader::new(file); +/// let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); +/// // file's cursor's offset automatically set at 0 +/// ``` +/// +/// [`Reader`]: super::Reader +pub fn infer_json_schema_from_seekable( + reader: &mut BufReader, + max_read_records: Option, +) -> Result { + let schema = infer_json_schema(reader, max_read_records); + // return the reader seek back to the start + reader.rewind()?; + + schema +} + +/// Infer the fields of a JSON file by reading the first n records of the buffer, with +/// `max_read_records` controlling the maximum number of records to read. +/// +/// If `max_read_records` is not set, the whole file is read to infer its field types. +/// +/// This function will not seek back to the start of the `reader`. The user has to manage the +/// original file's cursor. This function is useful when the `reader`'s cursor is not available +/// (does not implement [`Seek`]), such is the case for compressed streams decoders. +/// +/// # Examples +/// ``` +/// use std::fs::File; +/// use std::io::{BufReader, SeekFrom, Seek}; +/// use flate2::read::GzDecoder; +/// use arrow_json::reader::infer_json_schema; +/// +/// let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap(); +/// +/// // file's cursor's offset at 0 +/// let mut reader = BufReader::new(GzDecoder::new(&file)); +/// let inferred_schema = infer_json_schema(&mut reader, None).unwrap(); +/// // cursor's offset at end of file +/// +/// // seek back to start so that the original file is usable again +/// file.seek(SeekFrom::Start(0)).unwrap(); +/// ``` +pub fn infer_json_schema( + reader: &mut BufReader, + max_read_records: Option, +) -> Result { + infer_json_schema_from_iterator(ValueIter::new(reader, max_read_records)) +} + +fn set_object_scalar_field_type( + field_types: &mut HashMap, + key: &str, + ftype: DataType, +) -> Result<(), ArrowError> { + if !field_types.contains_key(key) { + field_types.insert(key.to_string(), InferredType::Scalar(HashSet::new())); + } + + match field_types.get_mut(key).unwrap() { + InferredType::Scalar(hs) => { + hs.insert(ftype); + Ok(()) + } + // in case of column contains both scalar type and scalar array type, we convert type of + // this column to scalar array. 
+ scalar_array @ InferredType::Array(_) => { + let mut hs = HashSet::new(); + hs.insert(ftype); + scalar_array.merge(InferredType::Scalar(hs))?; + Ok(()) + } + t => Err(ArrowError::JsonError(format!( + "Expected scalar or scalar array JSON type, found: {t:?}", + ))), + } +} + +fn infer_scalar_array_type(array: &[Value]) -> Result { + let mut hs = HashSet::new(); + + for v in array { + match v { + Value::Null => {} + Value::Number(n) => { + if n.is_i64() { + hs.insert(DataType::Int64); + } else { + hs.insert(DataType::Float64); + } + } + Value::Bool(_) => { + hs.insert(DataType::Boolean); + } + Value::String(_) => { + hs.insert(DataType::Utf8); + } + Value::Array(_) | Value::Object(_) => { + return Err(ArrowError::JsonError(format!( + "Expected scalar value for scalar array, got: {v:?}" + ))); + } + } + } + + Ok(InferredType::Scalar(hs)) +} + +fn infer_nested_array_type(array: &[Value]) -> Result { + let mut inner_ele_type = InferredType::Any; + + for v in array { + match v { + Value::Array(inner_array) => { + inner_ele_type.merge(infer_array_element_type(inner_array)?)?; + } + x => { + return Err(ArrowError::JsonError(format!( + "Got non array element in nested array: {x:?}" + ))); + } + } + } + + Ok(InferredType::Array(Box::new(inner_ele_type))) +} + +fn infer_struct_array_type(array: &[Value]) -> Result { + let mut field_types = HashMap::new(); + + for v in array { + match v { + Value::Object(map) => { + collect_field_types_from_object(&mut field_types, map)?; + } + _ => { + return Err(ArrowError::JsonError(format!( + "Expected struct value for struct array, got: {v:?}" + ))); + } + } + } + + Ok(InferredType::Object(field_types)) +} + +fn infer_array_element_type(array: &[Value]) -> Result { + match array.iter().take(1).next() { + None => Ok(InferredType::Any), // empty array, return any type that can be updated later + Some(a) => match a { + Value::Array(_) => infer_nested_array_type(array), + Value::Object(_) => infer_struct_array_type(array), + _ => infer_scalar_array_type(array), + }, + } +} + +fn collect_field_types_from_object( + field_types: &mut HashMap, + map: &serde_json::map::Map, +) -> Result<(), ArrowError> { + for (k, v) in map { + match v { + Value::Array(array) => { + let ele_type = infer_array_element_type(array)?; + + if !field_types.contains_key(k) { + match ele_type { + InferredType::Scalar(_) => { + field_types.insert( + k.to_string(), + InferredType::Array(Box::new(InferredType::Scalar( + HashSet::new(), + ))), + ); + } + InferredType::Object(_) => { + field_types.insert( + k.to_string(), + InferredType::Array(Box::new(InferredType::Object( + HashMap::new(), + ))), + ); + } + InferredType::Any | InferredType::Array(_) => { + // set inner type to any for nested array as well + // so it can be updated properly from subsequent type merges + field_types.insert( + k.to_string(), + InferredType::Array(Box::new(InferredType::Any)), + ); + } + } + } + + match field_types.get_mut(k).unwrap() { + InferredType::Array(inner_type) => { + inner_type.merge(ele_type)?; + } + // in case of column contains both scalar type and scalar array type, we + // convert type of this column to scalar array. 
+ field_type @ InferredType::Scalar(_) => { + field_type.merge(ele_type)?; + *field_type = InferredType::Array(Box::new(field_type.clone())); + } + t => { + return Err(ArrowError::JsonError(format!( + "Expected array json type, found: {t:?}", + ))); + } + } + } + Value::Bool(_) => { + set_object_scalar_field_type(field_types, k, DataType::Boolean)?; + } + Value::Null => { + // do nothing, we treat json as nullable by default when + // inferring + } + Value::Number(n) => { + if n.is_f64() { + set_object_scalar_field_type(field_types, k, DataType::Float64)?; + } else { + // default to i64 + set_object_scalar_field_type(field_types, k, DataType::Int64)?; + } + } + Value::String(_) => { + set_object_scalar_field_type(field_types, k, DataType::Utf8)?; + } + Value::Object(inner_map) => { + if !field_types.contains_key(k) { + field_types + .insert(k.to_string(), InferredType::Object(HashMap::new())); + } + match field_types.get_mut(k).unwrap() { + InferredType::Object(inner_field_types) => { + collect_field_types_from_object(inner_field_types, inner_map)?; + } + t => { + return Err(ArrowError::JsonError(format!( + "Expected object json type, found: {t:?}", + ))); + } + } + } + } + } + + Ok(()) +} + +/// Infer the fields of a JSON file by reading all items from the JSON Value Iterator. +/// +/// The following type coercion logic is implemented: +/// * `Int64` and `Float64` are converted to `Float64` +/// * Lists and scalars are coerced to a list of a compatible scalar +/// * All other cases are coerced to `Utf8` (String) +/// +/// Note that the above coercion logic is different from what Spark has, where it would default to +/// String type in case of List and Scalar values appeared in the same field. +/// +/// The reason we diverge here is because we don't have utilities to deal with JSON data once it's +/// interpreted as Strings. We should match Spark's behavior once we added more JSON parsing +/// kernels in the future. 
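As a quick illustration of the coercion rules described above, the behaviour can be exercised end to end through the public entry point. This is only a sketch: the `arrow_json::reader` import path mirrors the doc examples earlier in this patch, the sample rows are made up, and `serde_json` is assumed to be available as it is in the tests below.

```
use std::sync::Arc;
use arrow_json::reader::infer_json_schema_from_iterator;
use arrow_schema::{DataType, Field, Schema};

fn main() {
    // "a" appears as an integer, a float, and a list of numbers.
    let rows = vec![
        Ok(serde_json::json!({"a": 1})),
        Ok(serde_json::json!({"a": 2.5})),
        Ok(serde_json::json!({"a": [3, 4.0]})),
    ];

    let inferred = infer_json_schema_from_iterator(rows.into_iter()).unwrap();

    // Int64 + Float64 coerce to Float64, and the scalar/list mix coerces
    // to a list of that scalar rather than falling back to Utf8.
    let expected = Schema::new(vec![Field::new(
        "a",
        DataType::List(Arc::new(Field::new("item", DataType::Float64, true))),
        true,
    )]);
    assert_eq!(inferred, expected);
}
```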
+pub fn infer_json_schema_from_iterator(value_iter: I) -> Result +where + I: Iterator>, + V: Borrow, +{ + let mut field_types: HashMap = HashMap::new(); + + for record in value_iter { + match record?.borrow() { + Value::Object(map) => { + collect_field_types_from_object(&mut field_types, map)?; + } + value => { + return Err(ArrowError::JsonError(format!( + "Expected JSON record to be an object, found {value:?}" + ))); + } + }; + } + + generate_schema(field_types) +} + +#[cfg(test)] +mod tests { + use super::*; + use flate2::read::GzDecoder; + use std::fs::File; + use std::io::Cursor; + + #[test] + fn test_json_infer_schema() { + let schema = Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new( + "b", + DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), + true, + ), + Field::new( + "c", + DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), + true, + ), + Field::new( + "d", + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + true, + ), + ]); + + let mut reader = + BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); + let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); + + assert_eq!(inferred_schema, schema); + + let file = File::open("test/data/mixed_arrays.json.gz").unwrap(); + let mut reader = BufReader::new(GzDecoder::new(&file)); + let inferred_schema = infer_json_schema(&mut reader, None).unwrap(); + + assert_eq!(inferred_schema, schema); + } + + #[test] + fn test_json_infer_schema_nested_structs() { + let schema = Schema::new(vec![ + Field::new( + "c1", + DataType::Struct(Fields::from(vec![ + Field::new("a", DataType::Boolean, true), + Field::new( + "b", + DataType::Struct( + vec![Field::new("c", DataType::Utf8, true)].into(), + ), + true, + ), + ])), + true, + ), + Field::new("c2", DataType::Int64, true), + Field::new("c3", DataType::Utf8, true), + ]); + + let inferred_schema = infer_json_schema_from_iterator( + vec![ + Ok(serde_json::json!({"c1": {"a": true, "b": {"c": "text"}}, "c2": 1})), + Ok(serde_json::json!({"c1": {"a": false, "b": null}, "c2": 0})), + Ok(serde_json::json!({"c1": {"a": true, "b": {"c": "text"}}, "c3": "ok"})), + ] + .into_iter(), + ) + .unwrap(); + + assert_eq!(inferred_schema, schema); + } + + #[test] + fn test_json_infer_schema_struct_in_list() { + let schema = Schema::new(vec![ + Field::new( + "c1", + DataType::List(Arc::new(Field::new( + "item", + DataType::Struct(Fields::from(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Int64, true), + Field::new("c", DataType::Boolean, true), + ])), + true, + ))), + true, + ), + Field::new("c2", DataType::Float64, true), + Field::new( + "c3", + // empty json array's inner types are inferred as null + DataType::List(Arc::new(Field::new("item", DataType::Null, true))), + true, + ), + ]); + + let inferred_schema = infer_json_schema_from_iterator( + vec![ + Ok(serde_json::json!({ + "c1": [{"a": "foo", "b": 100}], "c2": 1, "c3": [], + })), + Ok(serde_json::json!({ + "c1": [{"a": "bar", "b": 2}, {"a": "foo", "c": true}], "c2": 0, "c3": [], + })), + Ok(serde_json::json!({"c1": [], "c2": 0.5, "c3": []})), + ] + .into_iter(), + ) + .unwrap(); + + assert_eq!(inferred_schema, schema); + } + + #[test] + fn test_json_infer_schema_nested_list() { + let schema = Schema::new(vec![ + Field::new( + "c1", + DataType::List(Arc::new(Field::new( + "item", + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + true, + ))), + true, + ), + Field::new("c2", DataType::Float64, 
true), + ]); + + let inferred_schema = infer_json_schema_from_iterator( + vec![ + Ok(serde_json::json!({ + "c1": [], + "c2": 12, + })), + Ok(serde_json::json!({ + "c1": [["a", "b"], ["c"]], + })), + Ok(serde_json::json!({ + "c1": [["foo"]], + "c2": 0.11, + })), + ] + .into_iter(), + ) + .unwrap(); + + assert_eq!(inferred_schema, schema); + } + + #[test] + fn test_coercion_scalar_and_list() { + use arrow_schema::DataType::*; + + assert_eq!( + List(Arc::new(Field::new("item", Float64, true))), + coerce_data_type(vec![ + &Float64, + &List(Arc::new(Field::new("item", Float64, true))) + ]) + ); + assert_eq!( + List(Arc::new(Field::new("item", Float64, true))), + coerce_data_type(vec![ + &Float64, + &List(Arc::new(Field::new("item", Int64, true))) + ]) + ); + assert_eq!( + List(Arc::new(Field::new("item", Int64, true))), + coerce_data_type(vec![ + &Int64, + &List(Arc::new(Field::new("item", Int64, true))) + ]) + ); + // boolean and number are incompatible, return utf8 + assert_eq!( + List(Arc::new(Field::new("item", Utf8, true))), + coerce_data_type(vec![ + &Boolean, + &List(Arc::new(Field::new("item", Float64, true))) + ]) + ); + } + + #[test] + fn test_invalid_json_infer_schema() { + let re = + infer_json_schema_from_seekable(&mut BufReader::new(Cursor::new(b"}")), None); + assert_eq!( + re.err().unwrap().to_string(), + "Json error: Not valid JSON: expected value at line 1 column 1", + ); + } +} diff --git a/arrow-json/src/raw/serializer.rs b/arrow-json/src/reader/serializer.rs similarity index 99% rename from arrow-json/src/raw/serializer.rs rename to arrow-json/src/reader/serializer.rs index d743b6dba126..a68d1d5476c4 100644 --- a/arrow-json/src/raw/serializer.rs +++ b/arrow-json/src/reader/serializer.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::raw::tape::TapeElement; +use crate::reader::tape::TapeElement; use lexical_core::FormattedSize; use serde::ser::{ Impossible, SerializeMap, SerializeSeq, SerializeStruct, SerializeTuple, diff --git a/arrow-json/src/raw/string_array.rs b/arrow-json/src/reader/string_array.rs similarity index 97% rename from arrow-json/src/raw/string_array.rs rename to arrow-json/src/reader/string_array.rs index 104e4e83f101..8060804c9ce8 100644 --- a/arrow-json/src/raw/string_array.rs +++ b/arrow-json/src/reader/string_array.rs @@ -21,8 +21,8 @@ use arrow_data::ArrayData; use arrow_schema::ArrowError; use std::marker::PhantomData; -use crate::raw::tape::{Tape, TapeElement}; -use crate::raw::{tape_error, ArrayDecoder}; +use crate::reader::tape::{Tape, TapeElement}; +use crate::reader::{tape_error, ArrayDecoder}; const TRUE: &str = "true"; const FALSE: &str = "false"; diff --git a/arrow-json/src/raw/struct_array.rs b/arrow-json/src/reader/struct_array.rs similarity index 98% rename from arrow-json/src/raw/struct_array.rs rename to arrow-json/src/reader/struct_array.rs index a73bb148621a..013f862c51ad 100644 --- a/arrow-json/src/raw/struct_array.rs +++ b/arrow-json/src/reader/struct_array.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
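These renames fold the former `raw` module into `arrow_json::reader`, so the tape-based decoder is now reached through plain `ReaderBuilder`. A minimal sketch of the post-rename usage, with a hypothetical schema and inline JSON (mirroring the test and benchmark updates later in this patch):

```
use std::io::Cursor;
use std::sync::Arc;
use arrow_json::ReaderBuilder; // formerly arrow_json::RawReaderBuilder
use arrow_schema::{DataType, Field, Schema};

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int64, true)]));
    let json = "{\"x\": 1}\n{\"x\": 2}\n";

    let mut reader = ReaderBuilder::new(schema)
        .with_batch_size(64)
        .build(Cursor::new(json))
        .unwrap();

    let batch = reader.next().unwrap().unwrap();
    assert_eq!(batch.num_rows(), 2);
}
```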
-use crate::raw::tape::{Tape, TapeElement}; -use crate::raw::{make_decoder, tape_error, ArrayDecoder}; +use crate::reader::tape::{Tape, TapeElement}; +use crate::reader::{make_decoder, tape_error, ArrayDecoder}; use arrow_array::builder::BooleanBufferBuilder; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; diff --git a/arrow-json/src/raw/tape.rs b/arrow-json/src/reader/tape.rs similarity index 99% rename from arrow-json/src/raw/tape.rs rename to arrow-json/src/reader/tape.rs index 2720c2502585..885257ed107a 100644 --- a/arrow-json/src/raw/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::raw::serializer::TapeSerializer; +use crate::reader::serializer::TapeSerializer; use arrow_schema::ArrowError; use serde::Serialize; use std::fmt::{Display, Formatter}; diff --git a/arrow-json/src/raw/timestamp_array.rs b/arrow-json/src/reader/timestamp_array.rs similarity index 97% rename from arrow-json/src/raw/timestamp_array.rs rename to arrow-json/src/reader/timestamp_array.rs index 07feaa974ee4..73d1cda9150c 100644 --- a/arrow-json/src/raw/timestamp_array.rs +++ b/arrow-json/src/reader/timestamp_array.rs @@ -26,8 +26,8 @@ use arrow_cast::parse::string_to_datetime; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, TimeUnit}; -use crate::raw::tape::{Tape, TapeElement}; -use crate::raw::{tape_error, ArrayDecoder}; +use crate::reader::tape::{Tape, TapeElement}; +use crate::reader::{tape_error, ArrayDecoder}; /// A specialized [`ArrayDecoder`] for timestamps pub struct TimestampArrayDecoder { diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index cf65e8a9356b..60b212101e58 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -589,11 +589,10 @@ where #[cfg(test)] mod tests { use std::fs::{read_to_string, File}; - use std::io::BufReader; + use std::io::{BufReader, Seek}; use std::sync::Arc; use crate::reader::*; - use crate::RawReaderBuilder; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::ArrayData; use serde_json::json; @@ -1205,14 +1204,14 @@ mod tests { ); } - #[allow(deprecated)] fn test_write_for_file(test_file: &str) { - let builder = ReaderBuilder::new() - .infer_schema(None) - .with_batch_size(1024); - let mut reader: Reader = builder - .build::(File::open(test_file).unwrap()) - .unwrap(); + let file = File::open(test_file).unwrap(); + let mut reader = BufReader::new(file); + let schema = infer_json_schema(&mut reader, None).unwrap(); + reader.rewind().unwrap(); + + let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(1024); + let mut reader = builder.build(reader).unwrap(); let batch = reader.next().unwrap().unwrap(); let mut buf = Vec::new(); @@ -1298,7 +1297,7 @@ mod tests { let list_type = DataType::List(Arc::new(Field::new("item", ints_struct, true))); let list_field = Field::new("list", list_type, true); let schema = Arc::new(Schema::new(vec![list_field])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); + let builder = ReaderBuilder::new(schema).with_batch_size(64); let mut reader = builder.build(std::io::Cursor::new(json_content)).unwrap(); let batch = reader.next().unwrap().unwrap(); @@ -1395,15 +1394,15 @@ mod tests { } #[test] - #[allow(deprecated)] fn test_write_single_batch() { let test_file = "test/data/basic.json"; - let builder = ReaderBuilder::new() - .infer_schema(None) - .with_batch_size(1024); - let mut reader: Reader 
= builder - .build::(File::open(test_file).unwrap()) - .unwrap(); + let file = File::open(test_file).unwrap(); + let mut reader = BufReader::new(file); + let schema = infer_json_schema(&mut reader, None).unwrap(); + reader.rewind().unwrap(); + + let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(1024); + let mut reader = builder.build(reader).unwrap(); let batch = reader.next().unwrap().unwrap(); let mut buf = Vec::new(); @@ -1440,7 +1439,7 @@ mod tests { Field::new("g", DataType::Timestamp(TimeUnit::Millisecond, None), true), ])); - let mut reader = RawReaderBuilder::new(schema.clone()) + let mut reader = ReaderBuilder::new(schema.clone()) .build(BufReader::new(File::open(test_file).unwrap())) .unwrap(); let batch = reader.next().unwrap().unwrap(); diff --git a/arrow/benches/json_reader.rs b/arrow/benches/json_reader.rs index 8ad6cfd3ab48..8cebc42e4cf6 100644 --- a/arrow/benches/json_reader.rs +++ b/arrow/benches/json_reader.rs @@ -22,31 +22,16 @@ use arrow::util::bench_util::{ create_primitive_array, create_string_array, create_string_array_with_len, }; use arrow_array::RecordBatch; -use arrow_json::LineDelimitedWriter; -use arrow_json::RawReaderBuilder; +use arrow_json::{LineDelimitedWriter, ReaderBuilder}; use std::io::Cursor; use std::sync::Arc; #[allow(deprecated)] fn do_bench(c: &mut Criterion, name: &str, json: &str, schema: SchemaRef) { - c.bench_function(&format!("{name} (basic)"), |b| { + c.bench_function(name, |b| { b.iter(|| { let cursor = Cursor::new(black_box(json)); - let builder = arrow_json::ReaderBuilder::new() - .with_schema(schema.clone()) - .with_batch_size(64); - - let mut reader = builder.build(cursor).unwrap(); - while let Some(next) = reader.next().transpose() { - next.unwrap(); - } - }) - }); - - c.bench_function(&format!("{name} (raw)"), |b| { - b.iter(|| { - let cursor = Cursor::new(black_box(json)); - let builder = RawReaderBuilder::new(schema.clone()).with_batch_size(64); + let builder = ReaderBuilder::new(schema.clone()).with_batch_size(64); let reader = builder.build(cursor).unwrap(); for next in reader { next.unwrap(); diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 41b846b0475e..8bad29bf74b7 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -273,7 +273,7 @@ //! //! # Serde Compatibility //! -//! [`arrow_json::RawDecoder`] provides a mechanism to convert arbitrary, serde-compatible +//! [`arrow_json::reader::Decoder`] provides a mechanism to convert arbitrary, serde-compatible //! structures into [`RecordBatch`]. //! //! Whilst likely less performant than implementing a custom builder, as described in @@ -281,7 +281,7 @@ //! //! ``` //! # use std::sync::Arc; -//! # use arrow_json::RawReaderBuilder; +//! # use arrow_json::ReaderBuilder; //! # use arrow_schema::{DataType, Field, Schema}; //! # use serde::Serialize; //! # use arrow_array::cast::AsArray; @@ -303,7 +303,7 @@ //! MyStruct{ int32: 8, string: "foo".to_string() }, //! ]; //! -//! let mut decoder = RawReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap(); +//! let mut decoder = ReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap(); //! decoder.serialize(&rows).unwrap(); //! //! 
let batch = decoder.flush().unwrap().unwrap(); diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 680d31480939..d662a16eaf28 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -1145,7 +1145,7 @@ mod tests { false, ); let schema = Arc::new(Schema::new(vec![stocks_field])); - let builder = arrow::json::RawReaderBuilder::new(schema).with_batch_size(64); + let builder = arrow::json::ReaderBuilder::new(schema).with_batch_size(64); let mut reader = builder.build(std::io::Cursor::new(json_content)).unwrap(); let batch = reader.next().unwrap().unwrap(); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index d026f971e946..3987cccf6c56 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1016,7 +1016,7 @@ mod tests { true, ); let schema = Arc::new(Schema::new(vec![stocks_field])); - let builder = arrow::json::RawReaderBuilder::new(schema).with_batch_size(64); + let builder = arrow::json::ReaderBuilder::new(schema).with_batch_size(64); let mut reader = builder.build(std::io::Cursor::new(json_content)).unwrap(); let batch = reader.next().unwrap().unwrap(); From a35c6c5f4309787a9a2f523920af2efd9b1682b9 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Wed, 12 Apr 2023 11:59:02 +0200 Subject: [PATCH 0813/1411] feat: DataType::contains support nested type (#4042) * feat: DataType::contains support nested type * support recurse * check typeID for Union --- arrow-schema/src/datatype.rs | 26 +++++++++++++++ arrow-schema/src/field.rs | 64 +++++++++++++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 3f684285c067..64e8d0e778b0 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -517,6 +517,32 @@ impl DataType { } } } + + /// Check to see if `self` is a superset of `other` + /// + /// If DataType is a nested type, then it will check to see if the nested type is a superset of the other nested type + /// else it will check to see if the DataType is equal to the other DataType + pub fn contains(&self, other: &DataType) -> bool { + match (self, other) { + (DataType::List(f1), DataType::List(f2)) + | (DataType::LargeList(f1), DataType::LargeList(f2)) => f1.contains(f2), + (DataType::FixedSizeList(f1, s1), DataType::FixedSizeList(f2, s2)) => { + s1 == s2 && f1.contains(f2) + } + (DataType::Map(f1, s1), DataType::Map(f2, s2)) => s1 == s2 && f1.contains(f2), + (DataType::Struct(f1), DataType::Struct(f2)) => f1.contains(f2), + (DataType::Union(f1, s1), DataType::Union(f2, s2)) => { + s1 == s2 + && f1 + .iter() + .all(|f1| f2.iter().any(|f2| f1.0 == f2.0 && f1.1.contains(f2.1))) + } + (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => { + k1.contains(k2) && v1.contains(v2) + } + _ => self == other, + } + } } /// The maximum precision for [DataType::Decimal128] values diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 5edd5be7a8e5..f38e1e26ad26 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -516,7 +516,7 @@ impl Field { /// * all other fields are equal pub fn contains(&self, other: &Field) -> bool { self.name == other.name - && self.data_type == other.data_type + && self.data_type.contains(&other.data_type) && self.dict_id == other.dict_id && self.dict_is_ordered == other.dict_is_ordered // self need to be nullable or both of them are not 
nullable @@ -758,6 +758,68 @@ mod test { assert!(!field1.contains(&field2)); assert!(!field2.contains(&field1)); + + // UnionFields with different type ID + let field1 = Field::new( + "field1", + DataType::Union( + UnionFields::new( + vec![1, 2], + vec![ + Field::new("field1", DataType::UInt8, true), + Field::new("field3", DataType::Utf8, false), + ], + ), + UnionMode::Dense, + ), + true, + ); + let field2 = Field::new( + "field1", + DataType::Union( + UnionFields::new( + vec![1, 3], + vec![ + Field::new("field1", DataType::UInt8, false), + Field::new("field3", DataType::Utf8, false), + ], + ), + UnionMode::Dense, + ), + true, + ); + assert!(!field1.contains(&field2)); + + // UnionFields with same type ID + let field1 = Field::new( + "field1", + DataType::Union( + UnionFields::new( + vec![1, 2], + vec![ + Field::new("field1", DataType::UInt8, true), + Field::new("field3", DataType::Utf8, false), + ], + ), + UnionMode::Dense, + ), + true, + ); + let field2 = Field::new( + "field1", + DataType::Union( + UnionFields::new( + vec![1, 2], + vec![ + Field::new("field1", DataType::UInt8, false), + Field::new("field3", DataType::Utf8, false), + ], + ), + UnionMode::Dense, + ), + true, + ); + assert!(field1.contains(&field2)); } #[cfg(feature = "serde")] From 1e08706b4cd26533928ef65e253257f9410c0d2c Mon Sep 17 00:00:00 2001 From: c-thiel Date: Wed, 12 Apr 2023 19:24:51 +0100 Subject: [PATCH 0814/1411] Add CommandGetXdbcTypeInfo to Flight SQL Server (#4055) * Add CommandGetXdbcTypeInfo to Flight SQL Server * Add CommandGetXdbcTypeInfo in a few moer places * Add get_xdbc_type_info to flight-sql client --------- Co-authored-by: Andrew Lamb --- arrow-flight/examples/flight_sql_server.rs | 26 ++++++++++++++++++--- arrow-flight/src/sql/client.rs | 14 ++++++++--- arrow-flight/src/sql/mod.rs | 2 ++ arrow-flight/src/sql/server.rs | 26 ++++++++++++++++++--- arrow-flight/tests/flight_sql_client_cli.rs | 26 ++++++++++++++++++--- 5 files changed, 82 insertions(+), 12 deletions(-) diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 08744b65f7ac..675692aba6f9 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -44,9 +44,9 @@ use arrow_flight::{ ActionCreatePreparedStatementRequest, CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, - CommandGetTableTypes, CommandGetTables, CommandPreparedStatementQuery, - CommandPreparedStatementUpdate, CommandStatementQuery, CommandStatementUpdate, - TicketStatementQuery, + CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, + CommandPreparedStatementQuery, CommandPreparedStatementUpdate, + CommandStatementQuery, CommandStatementUpdate, TicketStatementQuery, }, FlightDescriptor, FlightInfo, }; @@ -315,6 +315,16 @@ impl FlightSqlService for FlightSqlServiceImpl { )) } + async fn get_flight_info_xdbc_type_info( + &self, + _query: CommandGetXdbcTypeInfo, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_xdbc_type_info not implemented", + )) + } + // do_get async fn do_get_statement( &self, @@ -412,6 +422,16 @@ impl FlightSqlService for FlightSqlServiceImpl { )) } + async fn do_get_xdbc_type_info( + &self, + _query: CommandGetXdbcTypeInfo, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_xdbc_type_info not implemented", + )) + } + // do_put async fn 
do_put_statement_update( &self, diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index d96c90afa806..15a896c109e1 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -31,9 +31,9 @@ use crate::sql::{ ActionCreatePreparedStatementResult, Any, CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, - CommandGetTableTypes, CommandGetTables, CommandPreparedStatementQuery, - CommandStatementQuery, CommandStatementUpdate, DoPutUpdateResult, ProstMessageExt, - SqlInfo, + CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, + CommandPreparedStatementQuery, CommandStatementQuery, CommandStatementUpdate, + DoPutUpdateResult, ProstMessageExt, SqlInfo, }; use crate::{ Action, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, @@ -313,6 +313,14 @@ impl FlightSqlServiceClient { self.get_flight_info_for_command(request).await } + /// Request XDBC SQL information. + pub async fn get_xdbc_type_info( + &mut self, + request: CommandGetXdbcTypeInfo, + ) -> Result { + self.get_flight_info_for_command(request).await + } + /// Create a prepared statement object. pub async fn prepare( &mut self, diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index df828c9c08af..ed26b38751c5 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -58,6 +58,7 @@ pub use gen::CommandGetPrimaryKeys; pub use gen::CommandGetSqlInfo; pub use gen::CommandGetTableTypes; pub use gen::CommandGetTables; +pub use gen::CommandGetXdbcTypeInfo; pub use gen::CommandPreparedStatementQuery; pub use gen::CommandPreparedStatementUpdate; pub use gen::CommandStatementQuery; @@ -214,6 +215,7 @@ prost_message_ext!( CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, + CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index f25ddb13db99..9a0183495434 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -34,9 +34,9 @@ use super::{ ActionCreatePreparedStatementResult, CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, - CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, - CommandStatementUpdate, DoPutUpdateResult, ProstMessageExt, SqlInfo, - TicketStatementQuery, + CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, + CommandPreparedStatementUpdate, CommandStatementQuery, CommandStatementUpdate, + DoPutUpdateResult, ProstMessageExt, SqlInfo, TicketStatementQuery, }; pub(crate) static CREATE_PREPARED_STATEMENT: &str = "CreatePreparedStatement"; @@ -151,6 +151,13 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { request: Request, ) -> Result, Status>; + /// Get a FlightInfo to extract information about the supported XDBC types. + async fn get_flight_info_xdbc_type_info( + &self, + query: CommandGetXdbcTypeInfo, + request: Request, + ) -> Result, Status>; + // do_get /// Get a FlightDataStream containing the query results. @@ -230,6 +237,13 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { request: Request, ) -> Result::DoGetStream>, Status>; + /// Get a FlightDataStream containing the data related to the supported XDBC types. 
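Like the other metadata commands, the new message is registered with the `Any`-based dispatch via `prost_message_ext!`, so it can be packed into and unpacked from a command payload. A rough, self-contained sketch of that round trip follows; it assumes the `flight-sql-experimental` feature and the crate's existing `ProstMessageExt`/`Any` helpers, and in practice a client would simply call `get_xdbc_type_info` as added in `client.rs`.

```
use arrow_flight::sql::{Any, CommandGetXdbcTypeInfo, ProstMessageExt};

fn main() {
    // An empty (default) filter asks the server to describe every supported type.
    let cmd = CommandGetXdbcTypeInfo::default();

    // Pack into the protobuf Any wrapper used by get_flight_info / do_get.
    let any: Any = cmd.as_any();
    assert_eq!(any.type_url, CommandGetXdbcTypeInfo::type_url());
    assert!(any.is::<CommandGetXdbcTypeInfo>());

    // ...and unpack it again, as the server-side dispatch does.
    let unpacked: Option<CommandGetXdbcTypeInfo> = any.unpack().unwrap();
    assert_eq!(unpacked, Some(cmd));
}
```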
+ async fn do_get_xdbc_type_info( + &self, + query: CommandGetXdbcTypeInfo, + request: Request, + ) -> Result::DoGetStream>, Status>; + // do_put /// Execute an update SQL statement. @@ -352,6 +366,9 @@ where Command::CommandGetCrossReference(token) => { self.get_flight_info_cross_reference(token, request).await } + Command::CommandGetXdbcTypeInfo(token) => { + self.get_flight_info_xdbc_type_info(token, request).await + } cmd => Err(Status::unimplemented(format!( "get_flight_info: The defined request is invalid: {}", cmd.type_url() @@ -407,6 +424,9 @@ where Command::CommandGetCrossReference(command) => { self.do_get_cross_reference(command, request).await } + Command::CommandGetXdbcTypeInfo(command) => { + self.do_get_xdbc_type_info(command, request).await + } cmd => self.do_get_fallback(request, cmd.into_any()).await, } } diff --git a/arrow-flight/tests/flight_sql_client_cli.rs b/arrow-flight/tests/flight_sql_client_cli.rs index 2c54bd263fdb..248b3732ff97 100644 --- a/arrow-flight/tests/flight_sql_client_cli.rs +++ b/arrow-flight/tests/flight_sql_client_cli.rs @@ -26,9 +26,9 @@ use arrow_flight::{ CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, - CommandPreparedStatementQuery, CommandPreparedStatementUpdate, - CommandStatementQuery, CommandStatementUpdate, ProstMessageExt, SqlInfo, - TicketStatementQuery, + CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, + CommandPreparedStatementUpdate, CommandStatementQuery, CommandStatementUpdate, + ProstMessageExt, SqlInfo, TicketStatementQuery, }, utils::batches_to_flight_data, Action, FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, @@ -302,6 +302,16 @@ impl FlightSqlService for FlightSqlServiceImpl { )) } + async fn get_flight_info_xdbc_type_info( + &self, + _query: CommandGetXdbcTypeInfo, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_xdbc_type_info not implemented", + )) + } + // do_get async fn do_get_statement( &self, @@ -399,6 +409,16 @@ impl FlightSqlService for FlightSqlServiceImpl { )) } + async fn do_get_xdbc_type_info( + &self, + _query: CommandGetXdbcTypeInfo, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_xdbc_type_info not implemented", + )) + } + // do_put async fn do_put_statement_update( &self, From efd8b9254a7b16427c8ff0374727bb89b5a3a966 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 12 Apr 2023 20:28:23 +0100 Subject: [PATCH 0815/1411] Remove ArrayData from Array (#3880) (#4061) * Remove ArrayData from Array (#3880) * Fix doc * Fix pyarrow-integration-testing * Review feedback --- arrow-arith/src/aggregate.rs | 7 +- arrow-array/src/array/boolean_array.rs | 82 ++++++--- arrow-array/src/array/byte_array.rs | 90 ++++++--- arrow-array/src/array/dictionary_array.rs | 99 +++++++--- .../src/array/fixed_size_binary_array.rs | 101 +++++++--- .../src/array/fixed_size_list_array.rs | 149 ++++++++------- arrow-array/src/array/list_array.rs | 76 ++++++-- arrow-array/src/array/map_array.rs | 67 +++++-- arrow-array/src/array/mod.rs | 119 +++--------- arrow-array/src/array/null_array.rs | 54 ++++-- arrow-array/src/array/primitive_array.rs | 135 ++++++++------ arrow-array/src/array/run_array.rs | 89 +++++++-- arrow-array/src/array/struct_array.rs | 105 ++++++++--- arrow-array/src/array/union_array.rs | 107 
+++++++++-- arrow-array/src/record_batch.rs | 4 +- arrow-cast/src/cast.rs | 2 - arrow-ipc/src/writer.rs | 173 ++++-------------- arrow-pyarrow-integration-testing/src/lib.rs | 10 +- arrow-select/src/nullif.rs | 6 +- arrow/src/lib.rs | 2 +- 20 files changed, 895 insertions(+), 582 deletions(-) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index a9944db13ee1..9ed6dee516a4 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -1241,13 +1241,12 @@ mod tests { .into_iter() .collect(); let sliced_input = sliced_input.slice(4, 2); - let sliced_input = sliced_input.as_boolean(); - assert_eq!(sliced_input, &input); + assert_eq!(sliced_input, input); - let actual = min_boolean(sliced_input); + let actual = min_boolean(&sliced_input); assert_eq!(actual, expected); - let actual = max_boolean(sliced_input); + let actual = max_boolean(&sliced_input); assert_eq!(actual, expected); } diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index dea5c07da281..d03f0fd040f2 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -20,7 +20,7 @@ use crate::builder::BooleanBuilder; use crate::iterator::BooleanIter; use crate::{Array, ArrayAccessor, ArrayRef}; use arrow_buffer::{bit_util, BooleanBuffer, MutableBuffer, NullBuffer}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; @@ -66,8 +66,8 @@ use std::sync::Arc; /// ``` #[derive(Clone)] pub struct BooleanArray { - data: ArrayData, values: BooleanBuffer, + nulls: Option, } impl std::fmt::Debug for BooleanArray { @@ -90,27 +90,25 @@ impl BooleanArray { if let Some(n) = nulls.as_ref() { assert_eq!(values.len(), n.len()); } - - // TODO: Don't store ArrayData inside arrays (#3880) - let data = unsafe { - ArrayData::builder(DataType::Boolean) - .len(values.len()) - .offset(values.offset()) - .nulls(nulls) - .buffers(vec![values.inner().clone()]) - .build_unchecked() - }; - Self { data, values } + Self { values, nulls } } /// Returns the length of this array. pub fn len(&self) -> usize { - self.data.len() + self.values.len() } /// Returns whether this array is empty. pub fn is_empty(&self) -> bool { - self.data.is_empty() + self.values.is_empty() + } + + /// Returns a zero-copy slice of this array with the indicated offset and length. 
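The externally visible contract is unchanged by this rework: construction, slicing, and `ArrayData` conversion behave as before, only the storage moved from an embedded `ArrayData` to the value and null buffers themselves. A small sketch with illustrative values:

```
use arrow_array::{Array, BooleanArray};

fn main() {
    let array = BooleanArray::from(vec![Some(true), None, Some(false), Some(true)]);

    // ArrayData is now only materialized on demand...
    let data = array.to_data();
    assert_eq!(data.len(), 4);

    // ...and slicing operates directly on the value and null buffers,
    // returning a typed BooleanArray.
    let sliced = array.slice(1, 3);
    assert_eq!(sliced.len(), 3);
    assert!(sliced.is_null(0));
    assert_eq!(sliced.values().len(), 3);
}
```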
+ pub fn slice(&self, offset: usize, length: usize) -> Self { + Self { + values: self.values.slice(offset, length), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + } } /// Returns a new boolean array builder @@ -125,7 +123,7 @@ impl BooleanArray { /// Returns the number of non null, true values within this array pub fn true_count(&self) -> usize { - match self.data.nulls() { + match self.nulls() { Some(nulls) => { let null_chunks = nulls.inner().bit_chunks().iter_padded(); let value_chunks = self.values().bit_chunks().iter_padded(); @@ -247,25 +245,48 @@ impl Array for BooleanArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &DataType::Boolean + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { - // TODO: Slice buffers directly (#3880) - Arc::new(Self::from(self.data.slice(offset, length))) + Arc::new(self.slice(offset, length)) + } + + fn len(&self) -> usize { + self.values.len() + } + + fn is_empty(&self) -> bool { + self.values.is_empty() + } + + fn offset(&self) -> usize { + self.values.offset() } fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut sum = self.values.inner().capacity(); + if let Some(x) = &self.nulls { + sum += x.buffer().capacity() + } + sum + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + self.get_buffer_memory_size() } } @@ -324,13 +345,22 @@ impl From for BooleanArray { let values = BooleanBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); - Self { data, values } + Self { + values, + nulls: data.nulls().cloned(), + } } } impl From for ArrayData { fn from(array: BooleanArray) -> Self { - array.data + let builder = ArrayDataBuilder::new(DataType::Boolean) + .len(array.values.len()) + .offset(array.values.offset()) + .nulls(array.nulls) + .buffers(vec![array.values.into_inner()]); + + unsafe { builder.build_unchecked() } } } diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index f0e43e6949e9..e23079ef9be9 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -23,7 +23,7 @@ use crate::types::ByteArrayType; use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait}; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_buffer::{NullBuffer, OffsetBuffer}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; @@ -39,17 +39,19 @@ use std::sync::Arc; /// [`BinaryArray`]: crate::BinaryArray /// [`LargeBinaryArray`]: crate::LargeBinaryArray pub struct GenericByteArray { - data: ArrayData, + data_type: DataType, value_offsets: OffsetBuffer, value_data: Buffer, + nulls: Option, } impl Clone for GenericByteArray { fn clone(&self) -> Self { Self { - data: self.data.clone(), + data_type: self.data_type.clone(), value_offsets: self.value_offsets.clone(), value_data: self.value_data.clone(), + nulls: self.nulls.clone(), } } } @@ -135,7 +137,7 @@ impl GenericByteArray { /// Panics if index `i` is out of bounds. 
pub fn value(&self, i: usize) -> &T::Native { assert!( - i < self.data.len(), + i < self.len(), "Trying to access an element at index {} from a {}{}Array of length {}", i, T::Offset::PREFIX, @@ -154,29 +156,33 @@ impl GenericByteArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + Self { + data_type: self.data_type.clone(), + value_offsets: self.value_offsets.slice(offset, length), + value_data: self.value_data.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + } } /// Returns `GenericByteBuilder` of this byte array for mutating its values if the underlying /// offset and data buffers are not shared by others. pub fn into_builder(self) -> Result, Self> { let len = self.len(); - let null_bit_buffer = self.data.nulls().map(|b| b.inner().sliced()); + let value_len = + T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]); + + let data = self.into_data(); + let null_bit_buffer = data.nulls().map(|b| b.inner().sliced()); let element_len = std::mem::size_of::(); - let offset_buffer = self.data.buffers()[0] - .slice_with_length(self.data.offset() * element_len, (len + 1) * element_len); + let offset_buffer = data.buffers()[0] + .slice_with_length(data.offset() * element_len, (len + 1) * element_len); let element_len = std::mem::size_of::(); - let value_len = - T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]); - let value_buffer = self.data.buffers()[1] - .slice_with_length(self.data.offset() * element_len, value_len * element_len); + let value_buffer = data.buffers()[1] + .slice_with_length(data.offset() * element_len, value_len * element_len); - drop(self.data); - drop(self.value_data); - drop(self.value_offsets); + drop(data); let try_mutable_null_buffer = match null_bit_buffer { None => Ok(None), @@ -258,24 +264,49 @@ impl Array for GenericByteArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.value_offsets.len() - 1 + } + + fn is_empty(&self) -> bool { + self.value_offsets.len() <= 1 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut sum = self.value_offsets.inner().inner().capacity(); + sum += self.value_data.capacity(); + if let Some(x) = &self.nulls { + sum += x.buffer().capacity() + } + sum + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + self.get_buffer_memory_size() } } @@ -313,18 +344,25 @@ impl From for GenericByteArray { let value_offsets = unsafe { get_offsets(&data) }; let value_data = data.buffers()[1].clone(); Self { - data, - // SAFETY: - // ArrayData must be valid, and validated data type above value_offsets, value_data, + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), } } } impl From> for ArrayData { fn from(array: GenericByteArray) -> Self { - array.data + let len = array.len(); + + let offsets = array.value_offsets.into_inner().into_inner(); + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + 
.buffers(vec![offsets, array.value_data]) + .nulls(array.nulls); + + unsafe { builder.build_unchecked() } } } diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index dd6213d543ea..f25a077a81ba 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -208,9 +208,7 @@ pub type UInt64DictionaryArray = DictionaryArray; /// assert_eq!(&array, &expected); /// ``` pub struct DictionaryArray { - /// Data of this dictionary. Note that this is _not_ compatible with the C Data interface, - /// as, in the current implementation, `values` below are the first child of this struct. - data: ArrayData, + data_type: DataType, /// The keys of this dictionary. These are constructed from the /// buffer and null bitmap of `data`. Also, note that these do @@ -228,7 +226,7 @@ pub struct DictionaryArray { impl Clone for DictionaryArray { fn clone(&self) -> Self { Self { - data: self.data.clone(), + data_type: self.data_type.clone(), keys: self.keys.clone(), values: self.values.clone(), is_ordered: self.is_ordered, @@ -325,8 +323,12 @@ impl DictionaryArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + Self { + data_type: self.data_type.clone(), + keys: self.keys.slice(offset, length), + values: self.values.clone(), + is_ordered: self.is_ordered, + } } /// Downcast this dictionary to a [`TypedDictionaryArray`] @@ -390,8 +392,7 @@ impl DictionaryArray { assert!(values.len() >= self.values.len()); let builder = self - .data - .clone() + .to_data() .into_builder() .data_type(DataType::Dictionary( Box::new(K::DATA_TYPE), @@ -419,7 +420,6 @@ impl DictionaryArray { let key_array = self.keys().clone(); let value_array = self.values().as_primitive::().clone(); - drop(self.data); drop(self.keys); drop(self.values); @@ -504,20 +504,22 @@ impl From for DictionaryArray { key_data_type ); + let values = make_array(data.child_data()[0].clone()); + let data_type = data.data_type().clone(); + // create a zero-copy of the keys' data // SAFETY: // ArrayData is valid and verified type above let keys = PrimitiveArray::::from(unsafe { - data.clone() - .into_builder() + data.into_builder() .data_type(T::DATA_TYPE) .child_data(vec![]) .build_unchecked() }); - let values = make_array(data.child_data()[0].clone()); + Self { - data, + data_type, keys, values, is_ordered: false, @@ -530,7 +532,14 @@ impl From for DictionaryArray { impl From> for ArrayData { fn from(array: DictionaryArray) -> Self { - array.data + let builder = array + .keys + .into_data() + .into_builder() + .data_type(array.data_type) + .child_data(vec![array.values.to_data()]); + + unsafe { builder.build_unchecked() } } } @@ -594,24 +603,46 @@ impl Array for DictionaryArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.keys.len() + } + + fn is_empty(&self) -> bool { + self.keys.is_empty() + } + + fn offset(&self) -> usize { + self.keys.offset() + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.keys.nulls() + } + + fn get_buffer_memory_size(&self) 
-> usize { + self.keys.get_buffer_memory_size() + self.values.get_buffer_memory_size() + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + + self.keys.get_buffer_memory_size() + + self.values.get_array_memory_size() } } @@ -685,10 +716,6 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a, self.dictionary } - fn data(&self) -> &ArrayData { - &self.dictionary.data - } - fn to_data(&self) -> ArrayData { self.dictionary.to_data() } @@ -697,13 +724,37 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a, self.dictionary.into_data() } + fn data_type(&self) -> &DataType { + self.dictionary.data_type() + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.dictionary.slice(offset, length)) } + fn len(&self) -> usize { + self.dictionary.len() + } + + fn is_empty(&self) -> bool { + self.dictionary.is_empty() + } + + fn offset(&self) -> usize { + self.dictionary.offset() + } + fn nulls(&self) -> Option<&NullBuffer> { self.dictionary.nulls() } + + fn get_buffer_memory_size(&self) -> usize { + self.dictionary.get_buffer_memory_size() + } + + fn get_array_memory_size(&self) -> usize { + self.dictionary.get_array_memory_size() + } } impl<'a, K, V> IntoIterator for TypedDictionaryArray<'a, K, V> diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index f8d2f04dee69..08ce76c066c3 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -20,7 +20,7 @@ use crate::iterator::FixedSizeBinaryIter; use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray}; use arrow_buffer::buffer::NullBuffer; use arrow_buffer::{bit_util, Buffer, MutableBuffer}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; @@ -51,9 +51,11 @@ use std::sync::Arc; /// #[derive(Clone)] pub struct FixedSizeBinaryArray { - data: ArrayData, + data_type: DataType, // Must be DataType::FixedSizeBinary(value_length) value_data: Buffer, - length: i32, + nulls: Option, + len: usize, + value_length: i32, } impl FixedSizeBinaryArray { @@ -62,12 +64,12 @@ impl FixedSizeBinaryArray { /// Panics if index `i` is out of bounds. pub fn value(&self, i: usize) -> &[u8] { assert!( - i < self.data.len(), + i < self.len(), "Trying to access an element at index {} from a FixedSizeBinaryArray of length {}", i, self.len() ); - let offset = i + self.data.offset(); + let offset = i + self.offset(); unsafe { let pos = self.value_offset_at(offset); std::slice::from_raw_parts( @@ -81,7 +83,7 @@ impl FixedSizeBinaryArray { /// # Safety /// Caller is responsible for ensuring that the index is within the bounds of the array pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { - let offset = i + self.data.offset(); + let offset = i + self.offset(); let pos = self.value_offset_at(offset); std::slice::from_raw_parts( self.value_data.as_ptr().offset(pos as isize), @@ -94,7 +96,7 @@ impl FixedSizeBinaryArray { /// Note this doesn't do any bound checking, for performance reason. #[inline] pub fn value_offset(&self, i: usize) -> i32 { - self.value_offset_at(self.data.offset() + i) + self.value_offset_at(self.offset() + i) } /// Returns the length for an element. @@ -102,18 +104,30 @@ impl FixedSizeBinaryArray { /// All elements have the same length as the array is a fixed size. 
#[inline] pub fn value_length(&self) -> i32 { - self.length + self.value_length } /// Returns a clone of the value data buffer pub fn value_data(&self) -> Buffer { - self.data.buffers()[0].clone() + self.value_data.clone() } /// Returns a zero-copy slice of this array with the indicated offset and length. - pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + pub fn slice(&self, offset: usize, len: usize) -> Self { + assert!( + offset.saturating_add(len) <= self.len, + "the length + offset of the sliced FixedSizeBinaryArray cannot exceed the existing length" + ); + + let size = self.value_length as usize; + + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)), + value_length: self.value_length, + value_data: self.value_data.slice_with_length(offset * size, len * size), + len, + } } /// Create an array from an iterable argument of sparse byte slices. @@ -364,7 +378,7 @@ impl FixedSizeBinaryArray { #[inline] fn value_offset_at(&self, i: usize) -> i32 { - self.length * i as i32 + self.value_length * i as i32 } /// constructs a new iterator @@ -380,22 +394,33 @@ impl From for FixedSizeBinaryArray { 1, "FixedSizeBinaryArray data should contain 1 buffer only (values)" ); - let value_data = data.buffers()[0].clone(); - let length = match data.data_type() { + let value_length = match data.data_type() { DataType::FixedSizeBinary(len) => *len, _ => panic!("Expected data type to be FixedSizeBinary"), }; + + let size = value_length as usize; + let value_data = + data.buffers()[0].slice_with_length(data.offset() * size, data.len() * size); + Self { - data, + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), + len: data.len(), value_data, - length, + value_length, } } } impl From for ArrayData { fn from(array: FixedSizeBinaryArray) -> Self { - array.data + let builder = ArrayDataBuilder::new(array.data_type) + .len(array.len) + .buffers(vec![array.value_data]) + .nulls(array.nulls); + + unsafe { builder.build_unchecked() } } } @@ -468,24 +493,48 @@ impl Array for FixedSizeBinaryArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.len + } + + fn is_empty(&self) -> bool { + self.len == 0 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut sum = self.value_data.capacity(); + if let Some(n) = &self.nulls { + sum += n.buffer().capacity(); + } + sum + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + self.get_buffer_memory_size() } } @@ -566,9 +615,9 @@ mod tests { fixed_size_binary_array.value(1) ); assert_eq!(2, fixed_size_binary_array.len()); - assert_eq!(5, fixed_size_binary_array.value_offset(0)); + assert_eq!(0, fixed_size_binary_array.value_offset(0)); assert_eq!(5, fixed_size_binary_array.value_length()); - assert_eq!(10, fixed_size_binary_array.value_offset(1)); + assert_eq!(5, fixed_size_binary_array.value_offset(1)); } #[test] diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 
a56bb017f6b0..86adafa066f0 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -19,7 +19,7 @@ use crate::array::print_long_array; use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder}; use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType}; use arrow_buffer::buffer::NullBuffer; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; @@ -64,9 +64,11 @@ use std::sync::Arc; /// [crate::array::FixedSizeBinaryArray] #[derive(Clone)] pub struct FixedSizeListArray { - data: ArrayData, + data_type: DataType, // Must be DataType::FixedSizeList(value_length) values: ArrayRef, - length: i32, + nulls: Option, + value_length: i32, + len: usize, } impl FixedSizeListArray { @@ -91,7 +93,7 @@ impl FixedSizeListArray { /// Note this doesn't do any bound checking, for performance reason. #[inline] pub fn value_offset(&self, i: usize) -> i32 { - self.value_offset_at(self.data.offset() + i) + self.value_offset_at(i) } /// Returns the length for an element. @@ -99,18 +101,29 @@ impl FixedSizeListArray { /// All elements have the same length as the array is a fixed size. #[inline] pub const fn value_length(&self) -> i32 { - self.length + self.value_length } #[inline] const fn value_offset_at(&self, i: usize) -> i32 { - i as i32 * self.length + i as i32 * self.value_length } /// Returns a zero-copy slice of this array with the indicated offset and length. - pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + pub fn slice(&self, offset: usize, len: usize) -> Self { + assert!( + offset.saturating_add(len) <= self.len, + "the length + offset of the sliced FixedSizeListArray cannot exceed the existing length" + ); + let size = self.value_length as usize; + + Self { + data_type: self.data_type.clone(), + values: self.values.slice(offset * size, len * size), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)), + value_length: self.value_length, + len, + } } /// Creates a [`FixedSizeListArray`] from an iterator of primitive values @@ -163,45 +176,35 @@ impl FixedSizeListArray { impl From for FixedSizeListArray { fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 0, - "FixedSizeListArray data should not contain a buffer for value offsets" - ); - assert_eq!( - data.child_data().len(), - 1, - "FixedSizeListArray should contain a single child array (values array)" - ); - let values = make_array(data.child_data()[0].clone()); - let length = match data.data_type() { - DataType::FixedSizeList(_, len) => { - if *len > 0 { - // check that child data is multiple of length - assert_eq!( - values.len() % *len as usize, - 0, - "FixedSizeListArray child array length should be a multiple of {len}" - ); - } - - *len - } + let value_length = match data.data_type() { + DataType::FixedSizeList(_, len) => *len, _ => { panic!("FixedSizeListArray data should contain a FixedSizeList data type") } }; + + let size = value_length as usize; + let values = make_array( + data.child_data()[0].slice(data.offset() * size, data.len() * size), + ); Self { - data, + data_type: data.data_type().clone(), values, - length, + nulls: data.nulls().cloned(), + value_length, + len: data.len(), } } } impl From for ArrayData { fn from(array: FixedSizeListArray) -> Self { - array.data + let builder = ArrayDataBuilder::new(array.data_type) + .len(array.len) + 
.nulls(array.nulls) + .child_data(vec![array.values.to_data()]); + + unsafe { builder.build_unchecked() } } } @@ -210,24 +213,52 @@ impl Array for FixedSizeListArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.len + } + + fn is_empty(&self) -> bool { + self.len == 0 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut size = self.values.get_buffer_memory_size(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } + + fn get_array_memory_size(&self) -> usize { + let mut size = std::mem::size_of::() + self.values.get_array_memory_size(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size } } @@ -258,7 +289,6 @@ mod tests { use super::*; use crate::cast::AsArray; use crate::types::Int32Type; - use crate::Int32Array; use arrow_buffer::{bit_util, Buffer}; use arrow_schema::Field; @@ -289,15 +319,7 @@ mod tests { assert_eq!(0, list_array.null_count()); assert_eq!(6, list_array.value_offset(2)); assert_eq!(3, list_array.value_length()); - assert_eq!( - 0, - list_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); + assert_eq!(0, list_array.value(0).as_primitive::().value(0)); for i in 0..3 { assert!(list_array.is_valid(i)); assert!(!list_array.is_null(i)); @@ -305,26 +327,24 @@ mod tests { // Now test with a non-zero offset let list_data = ArrayData::builder(list_data_type) - .len(3) + .len(2) .offset(1) .add_child_data(value_data.clone()) .build() .unwrap(); let list_array = FixedSizeListArray::from(list_data); - assert_eq!(value_data, list_array.values().to_data()); + assert_eq!(value_data.slice(3, 6), list_array.values().to_data()); assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); + assert_eq!(2, list_array.len()); assert_eq!(0, list_array.null_count()); assert_eq!(3, list_array.value(0).as_primitive::().value(0)); - assert_eq!(6, list_array.value_offset(1)); + assert_eq!(3, list_array.value_offset(1)); assert_eq!(3, list_array.value_length()); } #[test] - #[should_panic( - expected = "FixedSizeListArray child array length should be a multiple of 3" - )] + #[should_panic(expected = "assertion failed: (offset + length) <= self.len()")] // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] @@ -389,11 +409,10 @@ mod tests { let sliced_array = list_array.slice(1, 4); assert_eq!(4, sliced_array.len()); - assert_eq!(1, sliced_array.offset()); assert_eq!(2, sliced_array.null_count()); for i in 0..sliced_array.len() { - if bit_util::get_bit(&null_bits, sliced_array.offset() + i) { + if bit_util::get_bit(&null_bits, 1 + i) { assert!(sliced_array.is_valid(i)); } else { assert!(sliced_array.is_null(i)); @@ -406,12 +425,14 @@ mod tests { .downcast_ref::() .unwrap(); assert_eq!(2, sliced_list_array.value_length()); - assert_eq!(6, sliced_list_array.value_offset(2)); - assert_eq!(8, sliced_list_array.value_offset(3)); + assert_eq!(4, sliced_list_array.value_offset(2)); + assert_eq!(6, sliced_list_array.value_offset(3)); } 
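// Illustrative sketch (not part of the patch hunks above): with the new
// field-based representation, slicing a FixedSizeListArray folds the offset
// into the child values and null buffers, so a slice reports offset() == 0 and
// value offsets relative to the slice start. The test name and values below
// are made up; it assumes `Array`, `AsArray` and `Int32Type` are in scope as
// in the surrounding test module.
#[test]
fn sketch_fixed_size_list_slice_has_zero_offset() {
    let list = FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(
        vec![
            Some(vec![Some(0), Some(1)]),
            Some(vec![Some(2), Some(3)]),
            Some(vec![Some(4), Some(5)]),
        ],
        2,
    );
    let sliced = list.slice(1, 2);
    assert_eq!(0, sliced.offset()); // the offset is no longer carried by the array
    assert_eq!(2, sliced.len());
    assert_eq!(2, sliced.value_length());
    assert_eq!(2, sliced.value_offset(1)); // 1 * value_length, relative to the slice
    assert_eq!(2, sliced.value(0).as_primitive::<Int32Type>().value(0));
}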
#[test] - #[should_panic(expected = "assertion failed: (offset + length) <= self.len()")] + #[should_panic( + expected = "the offset of the new Buffer cannot exceed the existing length" + )] fn test_fixed_size_list_array_index_out_of_bound() { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index f47ea80696e7..8e6f84743f2a 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -21,7 +21,7 @@ use crate::{ iterator::GenericListArrayIter, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, }; use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; use num::Integer; use std::any::Any; @@ -52,7 +52,8 @@ impl OffsetSizeTrait for i64 { /// /// For non generic lists, you may wish to consider using [`ListArray`] or [`LargeListArray`]` pub struct GenericListArray { - data: ArrayData, + data_type: DataType, + nulls: Option, values: ArrayRef, value_offsets: OffsetBuffer, } @@ -60,7 +61,8 @@ pub struct GenericListArray { impl Clone for GenericListArray { fn clone(&self) -> Self { Self { - data: self.data.clone(), + data_type: self.data_type.clone(), + nulls: self.nulls.clone(), values: self.values.clone(), value_offsets: self.value_offsets.clone(), } @@ -144,8 +146,12 @@ impl GenericListArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + values: self.values.clone(), + value_offsets: self.value_offsets.slice(offset, length), + } } /// Creates a [`GenericListArray`] from an iterator of primitive values @@ -201,7 +207,14 @@ impl From> for ArrayData { fn from(array: GenericListArray) -> Self { - array.data + let len = array.len(); + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .nulls(array.nulls) + .buffers(vec![array.value_offsets.into_inner().into_inner()]) + .child_data(vec![array.values.to_data()]); + + unsafe { builder.build_unchecked() } } } @@ -244,7 +257,8 @@ impl GenericListArray { let value_offsets = unsafe { get_offsets(&data) }; Ok(Self { - data, + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), values, value_offsets, }) @@ -256,24 +270,54 @@ impl Array for GenericListArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.value_offsets.len() - 1 + } + + fn is_empty(&self) -> bool { + self.value_offsets.len() <= 1 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut size = self.values.get_buffer_memory_size(); + size += self.value_offsets.inner().inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } + + fn get_array_memory_size(&self) -> usize { + let mut size = 
std::mem::size_of::() + self.values.get_array_memory_size(); + size += self.value_offsets.inner().inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size } } @@ -649,11 +693,10 @@ mod tests { let sliced_array = list_array.slice(1, 6); assert_eq!(6, sliced_array.len()); - assert_eq!(1, sliced_array.offset()); assert_eq!(3, sliced_array.null_count()); for i in 0..sliced_array.len() { - if bit_util::get_bit(&null_bits, sliced_array.offset() + i) { + if bit_util::get_bit(&null_bits, 1 + i) { assert!(sliced_array.is_valid(i)); } else { assert!(sliced_array.is_null(i)); @@ -713,11 +756,10 @@ mod tests { let sliced_array = list_array.slice(1, 6); assert_eq!(6, sliced_array.len()); - assert_eq!(1, sliced_array.offset()); assert_eq!(3, sliced_array.null_count()); for i in 0..sliced_array.len() { - if bit_util::get_bit(&null_bits, sliced_array.offset() + i) { + if bit_util::get_bit(&null_bits, 1 + i) { assert!(sliced_array.is_valid(i)); } else { assert!(sliced_array.is_null(i)); diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 1629532b8452..18b3eb3cec32 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -18,7 +18,7 @@ use crate::array::{get_offsets, print_long_array}; use crate::{make_array, Array, ArrayRef, ListArray, StringArray, StructArray}; use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ToByteSlice}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; use std::sync::Arc; @@ -30,7 +30,8 @@ use std::sync::Arc; /// [StructArray] with 2 child fields. #[derive(Clone)] pub struct MapArray { - data: ArrayData, + data_type: DataType, + nulls: Option, /// The [`StructArray`] that is the direct child of this array entries: ArrayRef, /// The first child of `entries`, the "keys" of this MapArray @@ -112,8 +113,14 @@ impl MapArray { /// Returns a zero-copy slice of this array with the indicated offset and length. 
pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + entries: self.entries.clone(), + keys: self.keys.clone(), + values: self.values.clone(), + value_offsets: self.value_offsets.slice(offset, length), + } } } @@ -126,7 +133,14 @@ impl From for MapArray { impl From for ArrayData { fn from(array: MapArray) -> Self { - array.data + let len = array.len(); + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .nulls(array.nulls) + .buffers(vec![array.value_offsets.into_inner().into_inner()]) + .child_data(vec![array.entries.to_data()]); + + unsafe { builder.build_unchecked() } } } @@ -177,7 +191,8 @@ impl MapArray { let value_offsets = unsafe { get_offsets(&data) }; Ok(Self { - data, + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), entries, keys, values, @@ -229,34 +244,54 @@ impl Array for MapArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into_data() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.value_offsets.len() - 1 + } + + fn is_empty(&self) -> bool { + self.value_offsets.len() <= 1 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() } - /// Returns the total number of bytes of memory occupied by the buffers owned by this [MapArray]. fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() + let mut size = self.entries.get_buffer_memory_size(); + size += self.value_offsets.inner().inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size } - /// Returns the total number of bytes of memory occupied physically by this [MapArray]. fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + std::mem::size_of_val(self) + let mut size = std::mem::size_of::() + self.entries.get_array_memory_size(); + size += self.value_offsets.inner().inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size } } diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index fa6e970b497a..e6fd6828bac7 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -94,10 +94,6 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// ``` fn as_any(&self) -> &dyn Any; - /// Returns a reference to the underlying data of this array - #[deprecated(note = "Use Array::to_data or Array::into_data")] - fn data(&self) -> &ArrayData; - /// Returns the underlying data of this array fn to_data(&self) -> ArrayData; @@ -106,13 +102,6 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Unlike [`Array::to_data`] this consumes self, allowing it avoid unnecessary clones fn into_data(self) -> ArrayData; - /// Returns a reference-counted pointer to the underlying data of this array. - #[deprecated(note = "Use Array::to_data or Array::into_data")] - #[allow(deprecated)] - fn data_ref(&self) -> &ArrayData { - self.data() - } - /// Returns a reference to the [`DataType`](arrow_schema::DataType) of this array. 
/// /// # Example: @@ -125,10 +114,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(*array.data_type(), DataType::Int32); /// ``` - #[allow(deprecated)] // (#3880) - fn data_type(&self) -> &DataType { - self.data_ref().data_type() - } + fn data_type(&self) -> &DataType; /// Returns a zero-copy slice of this array with the indicated offset and length. /// @@ -156,10 +142,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(array.len(), 5); /// ``` - #[allow(deprecated)] // (#3880) - fn len(&self) -> usize { - self.data_ref().len() - } + fn len(&self) -> usize; /// Returns whether this array is empty. /// @@ -172,10 +155,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// /// assert_eq!(array.is_empty(), false); /// ``` - #[allow(deprecated)] // (#3880) - fn is_empty(&self) -> bool { - self.data_ref().is_empty() - } + fn is_empty(&self) -> bool; /// Returns the offset into the underlying data used by this array(-slice). /// Note that the underlying data can be shared by many arrays. @@ -184,19 +164,15 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// # Example: /// /// ``` - /// use arrow_array::{Array, Int32Array}; + /// use arrow_array::{Array, BooleanArray}; /// - /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); - /// // Make slice over the values [2, 3, 4] + /// let array = BooleanArray::from(vec![false, false, true, true]); /// let array_slice = array.slice(1, 3); /// /// assert_eq!(array.offset(), 0); /// assert_eq!(array_slice.offset(), 1); /// ``` - #[allow(deprecated)] // (#3880) - fn offset(&self) -> usize { - self.data_ref().offset() - } + fn offset(&self) -> usize; /// Returns the null buffers of this array if any fn nulls(&self) -> Option<&NullBuffer>; @@ -253,21 +229,12 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the total number of bytes of memory pointed to by this array. /// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map. - #[allow(deprecated)] // (#3880) - fn get_buffer_memory_size(&self) -> usize { - self.data_ref().get_buffer_memory_size() - } + fn get_buffer_memory_size(&self) -> usize; /// Returns the total number of bytes of memory occupied physically by this array. /// This value will always be greater than returned by `get_buffer_memory_size()` and /// includes the overhead of the data structures that contain the pointers to the various buffers. - #[allow(deprecated)] // (#3880) - fn get_array_memory_size(&self) -> usize { - // both data.get_array_memory_size and size_of_val(self) include ArrayData fields, - // to only count additional fields of this array subtract size_of(ArrayData) - self.data_ref().get_array_memory_size() + std::mem::size_of_val(self) - - std::mem::size_of::() - } + fn get_array_memory_size(&self) -> usize; } /// A reference-counted reference to a generic `Array`. 
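// Illustrative sketch (not part of the patch hunks above): with data() and
// data_ref() removed, the accessors in the trait definition above are now
// required methods, so a custom array can no longer inherit defaults that went
// through ArrayData. `LoggingArray` is a made-up wrapper that simply delegates
// to an inner ArrayRef; imports are spelled out so the sketch is self-contained.
use std::any::Any;
use arrow_array::{Array, ArrayRef};
use arrow_buffer::NullBuffer;
use arrow_data::ArrayData;
use arrow_schema::DataType;

#[derive(Debug)]
struct LoggingArray {
    inner: ArrayRef,
}

impl Array for LoggingArray {
    fn as_any(&self) -> &dyn Any {
        self
    }
    fn to_data(&self) -> ArrayData {
        self.inner.to_data()
    }
    fn into_data(self) -> ArrayData {
        self.inner.into_data()
    }
    fn data_type(&self) -> &DataType {
        self.inner.data_type()
    }
    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
        // Returns the sliced inner array; the wrapper is dropped for brevity
        self.inner.slice(offset, length)
    }
    fn len(&self) -> usize {
        self.inner.len()
    }
    fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }
    fn offset(&self) -> usize {
        self.inner.offset()
    }
    fn nulls(&self) -> Option<&NullBuffer> {
        self.inner.nulls()
    }
    fn get_buffer_memory_size(&self) -> usize {
        self.inner.get_buffer_memory_size()
    }
    fn get_array_memory_size(&self) -> usize {
        std::mem::size_of::<Self>() + self.inner.get_array_memory_size()
    }
}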
@@ -279,11 +246,6 @@ impl Array for ArrayRef { self.as_ref().as_any() } - #[allow(deprecated)] - fn data(&self) -> &ArrayData { - self.as_ref().data() - } - fn to_data(&self) -> ArrayData { self.as_ref().to_data() } @@ -292,11 +254,6 @@ impl Array for ArrayRef { self.to_data() } - #[allow(deprecated)] - fn data_ref(&self) -> &ArrayData { - self.as_ref().data_ref() - } - fn data_type(&self) -> &DataType { self.as_ref().data_type() } @@ -347,11 +304,6 @@ impl<'a, T: Array> Array for &'a T { T::as_any(self) } - #[allow(deprecated)] - fn data(&self) -> &ArrayData { - T::data(self) - } - fn to_data(&self) -> ArrayData { T::to_data(self) } @@ -360,11 +312,6 @@ impl<'a, T: Array> Array for &'a T { self.to_data() } - #[allow(deprecated)] - fn data_ref(&self) -> &ArrayData { - T::data_ref(self) - } - fn data_type(&self) -> &DataType { T::data_type(self) } @@ -435,93 +382,80 @@ pub trait ArrayAccessor: Array { } impl PartialEq for dyn Array + '_ { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for dyn Array + '_ { - #[allow(deprecated)] fn eq(&self, other: &T) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for NullArray { - #[allow(deprecated)] fn eq(&self, other: &NullArray) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for PrimitiveArray { - #[allow(deprecated)] fn eq(&self, other: &PrimitiveArray) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for DictionaryArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for BooleanArray { - #[allow(deprecated)] fn eq(&self, other: &BooleanArray) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for GenericStringArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for GenericBinaryArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for FixedSizeBinaryArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for GenericListArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for MapArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for FixedSizeListArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } impl PartialEq for StructArray { - #[allow(deprecated)] fn eq(&self, other: &Self) -> bool { - self.data().eq(other.data()) + self.to_data().eq(&other.to_data()) } } @@ -752,7 +686,7 @@ mod tests { use super::*; use crate::cast::{as_union_array, downcast_array}; use crate::downcast_run_array; - use arrow_buffer::{Buffer, MutableBuffer}; + use arrow_buffer::MutableBuffer; use arrow_schema::{Field, Fields, UnionFields, UnionMode}; #[test] @@ -962,13 +896,9 @@ mod tests { assert_eq!(0, null_arr.get_buffer_memory_size()); assert_eq!( - std::mem::size_of::(), + 
std::mem::size_of::(), null_arr.get_array_memory_size() ); - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::(), - ); } #[test] @@ -1001,8 +931,7 @@ mod tests { // which includes the optional validity buffer // plus one buffer on the heap assert_eq!( - std::mem::size_of::>() - + std::mem::size_of::(), + std::mem::size_of::>(), empty_with_bitmap.get_array_memory_size() ); diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index b5d9247a6d7f..c7f61d91da70 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -19,7 +19,7 @@ use crate::{Array, ArrayRef}; use arrow_buffer::buffer::NullBuffer; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; @@ -40,7 +40,7 @@ use std::sync::Arc; /// ``` #[derive(Clone)] pub struct NullArray { - data: ArrayData, + len: usize, } impl NullArray { @@ -50,15 +50,17 @@ impl NullArray { /// other [`DataType`]. /// pub fn new(length: usize) -> Self { - let array_data = ArrayData::builder(DataType::Null).len(length); - let array_data = unsafe { array_data.build_unchecked() }; - NullArray::from(array_data) + Self { len: length } } /// Returns a zero-copy slice of this array with the indicated offset and length. - pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + pub fn slice(&self, offset: usize, len: usize) -> Self { + assert!( + offset.saturating_add(len) <= self.len, + "the length + offset of the sliced BooleanBuffer cannot exceed the existing length" + ); + + Self { len } } } @@ -67,22 +69,34 @@ impl Array for NullArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &DataType::Null + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.len + } + + fn is_empty(&self) -> bool { + self.len == 0 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { None } @@ -104,6 +118,14 @@ impl Array for NullArray { fn null_count(&self) -> usize { self.len() } + + fn get_buffer_memory_size(&self) -> usize { + 0 + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + } } impl From for NullArray { @@ -122,13 +144,14 @@ impl From for NullArray { data.nulls().is_none(), "NullArray data should not contain a null buffer, as no buffers are required" ); - Self { data } + Self { len: data.len() } } } impl From for ArrayData { fn from(array: NullArray) -> Self { - array.data + let builder = ArrayDataBuilder::new(DataType::Null).len(array.len); + unsafe { builder.build_unchecked() } } } @@ -158,7 +181,6 @@ mod tests { let array2 = array1.slice(8, 16); assert_eq!(array2.len(), 16); assert_eq!(array2.null_count(), 16); - assert_eq!(array2.offset(), 8); } #[test] diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 75bf85b3f2a0..3199104382a6 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -29,7 +29,7 @@ use arrow_buffer::{ i256, ArrowNativeType, BooleanBuffer, Buffer, NullBuffer, ScalarBuffer, }; use arrow_data::bit_iterator::try_for_each_valid_idx; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, 
ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime}; use half::f16; @@ -248,17 +248,18 @@ pub use crate::types::ArrowPrimitiveType; /// } /// ``` pub struct PrimitiveArray { - /// Underlying ArrayData - data: ArrayData, + data_type: DataType, /// Values data values: ScalarBuffer, + nulls: Option, } impl Clone for PrimitiveArray { fn clone(&self) -> Self { Self { - data: self.data.clone(), + data_type: self.data_type.clone(), values: self.values.clone(), + nulls: self.nulls.clone(), } } } @@ -281,16 +282,11 @@ impl PrimitiveArray { assert_eq!(values.len(), n.len()); } - // TODO: Don't store ArrayData inside arrays (#3880) - let data = unsafe { - ArrayData::builder(data_type) - .len(values.len()) - .nulls(nulls) - .buffers(vec![values.inner().clone()]) - .build_unchecked() - }; - - Self { data, values } + Self { + data_type, + values, + nulls, + } } /// Asserts that `data_type` is compatible with `Self` @@ -306,12 +302,12 @@ impl PrimitiveArray { /// Returns the length of this array. #[inline] pub fn len(&self) -> usize { - self.data.len() + self.values.len() } /// Returns whether this array is empty. pub fn is_empty(&self) -> bool { - self.data.is_empty() + self.values.is_empty() } /// Returns the values of this array @@ -367,18 +363,12 @@ impl PrimitiveArray { /// Creates a PrimitiveArray based on an iterator of values without nulls pub fn from_iter_values>(iter: I) -> Self { let val_buf: Buffer = iter.into_iter().collect(); - let data = unsafe { - ArrayData::new_unchecked( - T::DATA_TYPE, - val_buf.len() / std::mem::size_of::<::Native>(), - None, - None, - 0, - vec![val_buf], - vec![], - ) - }; - PrimitiveArray::from(data) + let len = val_buf.len() / std::mem::size_of::(); + Self { + data_type: T::DATA_TYPE, + values: ScalarBuffer::new(val_buf, 0, len), + nulls: None, + } } /// Creates a PrimitiveArray based on a constant value with `count` elements @@ -410,8 +400,11 @@ impl PrimitiveArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + Self { + data_type: self.data_type.clone(), + values: self.values.slice(offset, length), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + } } /// Reinterprets this array's contents as a different data type without copying @@ -436,7 +429,7 @@ impl PrimitiveArray { where K: ArrowPrimitiveType, { - let d = self.data.clone().into_builder().data_type(K::DATA_TYPE); + let d = self.to_data().into_builder().data_type(K::DATA_TYPE); // SAFETY: // Native type is the same @@ -629,14 +622,14 @@ impl PrimitiveArray { /// data buffer is not shared by others. 
pub fn into_builder(self) -> Result, Self> { let len = self.len(); - let null_bit_buffer = self.data.nulls().map(|b| b.inner().sliced()); + let data = self.into_data(); + let null_bit_buffer = data.nulls().map(|b| b.inner().sliced()); let element_len = std::mem::size_of::(); - let buffer = self.data.buffers()[0] - .slice_with_length(self.data.offset() * element_len, len * element_len); + let buffer = data.buffers()[0] + .slice_with_length(data.offset() * element_len, len * element_len); - drop(self.data); - drop(self.values); + drop(data); let try_mutable_null_buffer = match null_bit_buffer { None => Ok(None), @@ -686,7 +679,12 @@ impl PrimitiveArray { impl From> for ArrayData { fn from(array: PrimitiveArray) -> Self { - array.data + let builder = ArrayDataBuilder::new(array.data_type) + .len(array.values.len()) + .nulls(array.nulls) + .buffers(vec![array.values.into_inner()]); + + unsafe { builder.build_unchecked() } } } @@ -695,24 +693,48 @@ impl Array for PrimitiveArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.values.len() + } + + fn is_empty(&self) -> bool { + self.values.is_empty() + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut size = self.values.inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + self.get_buffer_memory_size() } } @@ -1061,8 +1083,7 @@ impl PrimitiveArray { /// Construct a timestamp array with an optional timezone pub fn with_timezone_opt>>(&self, timezone: Option) -> Self { let array_data = unsafe { - self.data - .clone() + self.to_data() .into_builder() .data_type(DataType::Timestamp(T::UNIT, timezone.map(Into::into))) .build_unchecked() @@ -1083,7 +1104,11 @@ impl From for PrimitiveArray { let values = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); - Self { data, values } + Self { + data_type: data.data_type().clone(), + values, + nulls: data.nulls().cloned(), + } } } @@ -1108,12 +1133,10 @@ impl PrimitiveArray { self.validate_precision_scale(precision, scale)?; // safety: self.data is valid DataType::Decimal as checked above - let new_data_type = T::TYPE_CONSTRUCTOR(precision, scale); - let data = self.data.into_builder().data_type(new_data_type); - - // SAFETY - // Validated data above - Ok(unsafe { data.build_unchecked().into() }) + Ok(Self { + data_type: T::TYPE_CONSTRUCTOR(precision, scale), + ..self + }) } // validate that the new precision and scale are valid or not @@ -1244,7 +1267,7 @@ mod tests { fn test_primitive_array_from_vec() { let buf = Buffer::from_slice_ref([0, 1, 2, 3, 4]); let arr = Int32Array::from(vec![0, 1, 2, 3, 4]); - assert_eq!(buf, *arr.data.buffers()[0]); + assert_eq!(&buf, arr.values.inner()); assert_eq!(5, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); @@ -1484,7 +1507,6 @@ mod tests { let arr2 = arr.slice(2, 5); assert_eq!(5, arr2.len()); - assert_eq!(2, arr2.offset()); assert_eq!(1, arr2.null_count()); for i in 0..arr2.len() { @@ -1497,7 +1519,6 @@ mod tests { let arr3 = 
arr2.slice(2, 3); assert_eq!(3, arr3.len()); - assert_eq!(4, arr3.offset()); assert_eq!(0, arr3.null_count()); let int_arr3 = arr3.as_any().downcast_ref::().unwrap(); @@ -1742,7 +1763,7 @@ mod tests { fn test_primitive_array_builder() { // Test building a primitive array with ArrayData builder and offset let buf = Buffer::from_slice_ref([0i32, 1, 2, 3, 4, 5, 6]); - let buf2 = buf.clone(); + let buf2 = buf.slice_with_length(8, 20); let data = ArrayData::builder(DataType::Int32) .len(5) .offset(2) @@ -1750,7 +1771,7 @@ mod tests { .build() .unwrap(); let arr = Int32Array::from(data); - assert_eq!(buf2, *arr.data.buffers()[0]); + assert_eq!(&buf2, arr.values.inner()); assert_eq!(5, arr.len()); assert_eq!(0, arr.null_count()); for i in 0..3 { diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 0754913e9d3e..e7e71d3840bb 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -62,7 +62,7 @@ use crate::{ /// ``` pub struct RunArray { - data: ArrayData, + data_type: DataType, run_ends: RunEndBuffer, values: ArrayRef, } @@ -70,7 +70,7 @@ pub struct RunArray { impl Clone for RunArray { fn clone(&self) -> Self { Self { - data: self.data.clone(), + data_type: self.data_type.clone(), run_ends: self.run_ends.clone(), values: self.values.clone(), } @@ -256,8 +256,11 @@ impl RunArray { /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + Self { + data_type: self.data_type.clone(), + run_ends: self.run_ends.slice(offset, length), + values: self.values.clone(), + } } } @@ -282,7 +285,7 @@ impl From for RunArray { let values = make_array(data.child_data()[1].clone()); Self { - data, + data_type: data.data_type().clone(), run_ends, values, } @@ -291,7 +294,21 @@ impl From for RunArray { impl From> for ArrayData { fn from(array: RunArray) -> Self { - array.data + let len = array.run_ends.len(); + let offset = array.run_ends.offset(); + + let run_ends = ArrayDataBuilder::new(R::DATA_TYPE) + .len(array.run_ends.values().len()) + .buffers(vec![array.run_ends.into_inner().into_inner()]); + + let run_ends = unsafe { run_ends.build_unchecked() }; + + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .offset(offset) + .child_data(vec![run_ends, array.values.to_data()]); + + unsafe { builder.build_unchecked() } } } @@ -300,25 +317,47 @@ impl Array for RunArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.run_ends.len() + } + + fn is_empty(&self) -> bool { + self.run_ends.is_empty() + } + + fn offset(&self) -> usize { + self.run_ends.offset() + } + fn nulls(&self) -> Option<&NullBuffer> { None } + + fn get_buffer_memory_size(&self) -> usize { + self.run_ends.inner().inner().capacity() + self.values.get_buffer_memory_size() + } + + fn get_array_memory_size(&self) -> usize { + std::mem::size_of::() + + self.run_ends.inner().inner().capacity() + + self.values.get_array_memory_size() + } } impl std::fmt::Debug for RunArray { @@ -497,10 +536,6 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { self.run_array 
} - fn data(&self) -> &ArrayData { - &self.run_array.data - } - fn to_data(&self) -> ArrayData { self.run_array.to_data() } @@ -509,13 +544,37 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { self.run_array.into_data() } + fn data_type(&self) -> &DataType { + self.run_array.data_type() + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.run_array.slice(offset, length)) } + fn len(&self) -> usize { + self.run_array.len() + } + + fn is_empty(&self) -> bool { + self.run_array.is_empty() + } + + fn offset(&self) -> usize { + self.run_array.offset() + } + fn nulls(&self) -> Option<&NullBuffer> { self.run_array.nulls() } + + fn get_buffer_memory_size(&self) -> usize { + self.run_array.get_buffer_memory_size() + } + + fn get_array_memory_size(&self) -> usize { + self.run_array.get_array_memory_size() + } } // Array accessor converts the index of logical array to the index of the physical array diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 1dccfc7d4ef3..fa43062b77bf 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -17,7 +17,7 @@ use crate::{make_array, Array, ArrayRef, RecordBatch}; use arrow_buffer::{buffer_bin_or, Buffer, NullBuffer}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field, Fields, SchemaBuilder}; use std::sync::Arc; use std::{any::Any, ops::Index}; @@ -74,24 +74,26 @@ use std::{any::Any, ops::Index}; /// ``` #[derive(Clone)] pub struct StructArray { - data: ArrayData, - pub(crate) boxed_fields: Vec, + len: usize, + data_type: DataType, + nulls: Option, + pub(crate) fields: Vec, } impl StructArray { /// Returns the field at `pos`. pub fn column(&self, pos: usize) -> &ArrayRef { - &self.boxed_fields[pos] + &self.fields[pos] } /// Return the number of fields in this struct array pub fn num_columns(&self) -> usize { - self.boxed_fields.len() + self.fields.len() } /// Returns the fields of the struct array pub fn columns(&self) -> &[ArrayRef] { - &self.boxed_fields + &self.fields } /// Returns child array refs of the struct array @@ -102,7 +104,7 @@ impl StructArray { /// Return field names in this struct array pub fn column_names(&self) -> Vec<&str> { - match self.data.data_type() { + match self.data_type() { DataType::Struct(fields) => fields .iter() .map(|f| f.name().as_str()) @@ -132,27 +134,48 @@ impl StructArray { } /// Returns a zero-copy slice of this array with the indicated offset and length. 
- pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + pub fn slice(&self, offset: usize, len: usize) -> Self { + assert!( + offset.saturating_add(len) <= self.len, + "the length + offset of the sliced StructArray cannot exceed the existing length" + ); + + let fields = self.fields.iter().map(|a| a.slice(offset, len)).collect(); + + Self { + len, + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)), + fields, + } } } impl From for StructArray { fn from(data: ArrayData) -> Self { - let boxed_fields = data + let fields = data .child_data() .iter() .map(|cd| make_array(cd.clone())) .collect(); - Self { data, boxed_fields } + Self { + len: data.len(), + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), + fields, + } } } impl From for ArrayData { fn from(array: StructArray) -> Self { - array.data + let builder = ArrayDataBuilder::new(array.data_type) + .len(array.len) + .nulls(array.nulls) + .child_data(array.fields.iter().map(|x| x.to_data()).collect()); + + unsafe { builder.build_unchecked() } } } @@ -228,24 +251,53 @@ impl Array for StructArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.len + } + + fn is_empty(&self) -> bool { + self.len == 0 + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { - self.data.nulls() + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut size = self.fields.iter().map(|a| a.get_buffer_memory_size()).sum(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } + + fn get_array_memory_size(&self) -> usize { + let mut size = self.fields.iter().map(|a| a.get_array_memory_size()).sum(); + size += std::mem::size_of::(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size } } @@ -343,15 +395,11 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray { impl From for StructArray { fn from(value: RecordBatch) -> Self { - // TODO: Don't store ArrayData inside arrays (#3880) - let builder = ArrayData::builder(DataType::Struct(value.schema().fields.clone())) - .child_data(value.columns().iter().map(|x| x.to_data()).collect()) - .len(value.num_rows()); - - // Safety: RecordBatch must be valid Self { - data: unsafe { builder.build_unchecked() }, - boxed_fields: value.columns().to_vec(), + len: value.num_rows(), + data_type: DataType::Struct(value.schema().fields().clone()), + nulls: None, + fields: value.columns().to_vec(), } } } @@ -607,7 +655,6 @@ mod tests { let sliced_array = struct_array.slice(2, 3); let sliced_array = sliced_array.as_any().downcast_ref::().unwrap(); assert_eq!(3, sliced_array.len()); - assert_eq!(2, sliced_array.offset()); assert_eq!(1, sliced_array.null_count()); assert!(sliced_array.is_valid(0)); assert!(sliced_array.is_null(1)); @@ -616,7 +663,6 @@ mod tests { let sliced_c0 = sliced_array.column(0); let sliced_c0 = sliced_c0.as_any().downcast_ref::().unwrap(); assert_eq!(3, sliced_c0.len()); - assert_eq!(2, sliced_c0.offset()); assert!(sliced_c0.is_null(0)); assert!(sliced_c0.is_null(1)); assert!(sliced_c0.is_valid(2)); @@ -625,7 
+671,6 @@ mod tests { let sliced_c1 = sliced_array.column(1); let sliced_c1 = sliced_c1.as_any().downcast_ref::().unwrap(); assert_eq!(3, sliced_c1.len()); - assert_eq!(2, sliced_c1.offset()); assert!(sliced_c1.is_valid(0)); assert_eq!(42, sliced_c1.value(0)); assert!(sliced_c1.is_null(1)); diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 7b818f3130b7..172ae082197c 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -18,7 +18,7 @@ use crate::{make_array, Array, ArrayRef}; use arrow_buffer::buffer::NullBuffer; use arrow_buffer::{Buffer, ScalarBuffer}; -use arrow_data::ArrayData; +use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field, UnionFields, UnionMode}; /// Contains the `UnionArray` type. /// @@ -108,10 +108,10 @@ use std::sync::Arc; /// ``` #[derive(Clone)] pub struct UnionArray { - data: ArrayData, + data_type: DataType, type_ids: ScalarBuffer, offsets: Option>, - boxed_fields: Vec>, + fields: Vec>, } impl UnionArray { @@ -231,7 +231,7 @@ impl UnionArray { /// Panics if the `type_id` provided is less than zero or greater than the number of types /// in the `Union`. pub fn child(&self, type_id: i8) -> &ArrayRef { - let boxed = &self.boxed_fields[type_id as usize]; + let boxed = &self.fields[type_id as usize]; boxed.as_ref().expect("invalid type id") } @@ -279,7 +279,7 @@ impl UnionArray { /// Returns the names of the types in the union. pub fn type_names(&self) -> Vec<&str> { - match self.data.data_type() { + match self.data_type() { DataType::Union(fields, _) => fields .iter() .map(|(_, f)| f.name().as_str()) @@ -290,7 +290,7 @@ impl UnionArray { /// Returns whether the `UnionArray` is dense (or sparse if `false`). fn is_dense(&self) -> bool { - match self.data.data_type() { + match self.data_type() { DataType::Union(_, mode) => mode == &UnionMode::Dense, _ => unreachable!("Union array's data type is not a union!"), } @@ -298,8 +298,26 @@ impl UnionArray { /// Returns a zero-copy slice of this array with the indicated offset and length. 
pub fn slice(&self, offset: usize, length: usize) -> Self { - // TODO: Slice buffers directly (#3880) - self.data.slice(offset, length).into() + let (offsets, fields) = match self.offsets.as_ref() { + // If dense union, slice offsets + Some(offsets) => (Some(offsets.slice(offset, length)), self.fields.clone()), + // Otherwise need to slice sparse children + None => { + let fields = self + .fields + .iter() + .map(|x| x.as_ref().map(|x| x.slice(offset, length))) + .collect(); + (None, fields) + } + }; + + Self { + data_type: self.data_type.clone(), + type_ids: self.type_ids.slice(offset, length), + offsets, + fields, + } } } @@ -330,17 +348,36 @@ impl From for UnionArray { boxed_fields[field_id as usize] = Some(make_array(cd.clone())); } Self { - data, + data_type: data.data_type().clone(), type_ids, offsets, - boxed_fields, + fields: boxed_fields, } } } impl From for ArrayData { fn from(array: UnionArray) -> Self { - array.data + let len = array.len(); + let f = match &array.data_type { + DataType::Union(f, _) => f, + _ => unreachable!(), + }; + let buffers = match array.offsets { + Some(o) => vec![array.type_ids.into_inner(), o.into_inner()], + None => vec![array.type_ids.into_inner()], + }; + + let child = f + .iter() + .map(|(i, _)| array.fields[i as usize].as_ref().unwrap().to_data()) + .collect(); + + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .buffers(buffers) + .child_data(child); + unsafe { builder.build_unchecked() } } } @@ -349,22 +386,34 @@ impl Array for UnionArray { self } - fn data(&self) -> &ArrayData { - &self.data - } - fn to_data(&self) -> ArrayData { - self.data.clone() + self.clone().into() } fn into_data(self) -> ArrayData { self.into() } + fn data_type(&self) -> &DataType { + &self.data_type + } + fn slice(&self, offset: usize, length: usize) -> ArrayRef { Arc::new(self.slice(offset, length)) } + fn len(&self) -> usize { + self.type_ids.len() + } + + fn is_empty(&self) -> bool { + self.type_ids.is_empty() + } + + fn offset(&self) -> usize { + 0 + } + fn nulls(&self) -> Option<&NullBuffer> { None } @@ -386,6 +435,32 @@ impl Array for UnionArray { fn null_count(&self) -> usize { 0 } + + fn get_buffer_memory_size(&self) -> usize { + let mut sum = self.type_ids.inner().capacity(); + if let Some(o) = self.offsets.as_ref() { + sum += o.inner().capacity() + } + self.fields + .iter() + .flat_map(|x| x.as_ref().map(|x| x.get_buffer_memory_size())) + .sum::() + + sum + } + + fn get_array_memory_size(&self) -> usize { + let mut sum = self.type_ids.inner().capacity(); + if let Some(o) = self.offsets.as_ref() { + sum += o.inner().capacity() + } + std::mem::size_of::() + + self + .fields + .iter() + .flat_map(|x| x.as_ref().map(|x| x.get_array_memory_size())) + .sum::() + + sum + } } impl std::fmt::Debug for UnionArray { diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 1350285f8b26..ee61d2da6597 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -474,7 +474,7 @@ impl From for RecordBatch { ); let row_count = value.len(); let schema = Arc::new(Schema::new(value.fields().clone())); - let columns = value.boxed_fields; + let columns = value.fields; RecordBatch { schema, @@ -614,7 +614,7 @@ mod tests { let record_batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) .unwrap(); - assert_eq!(record_batch.get_array_memory_size(), 564); + assert_eq!(record_batch.get_array_memory_size(), 364); } fn check_batch(record_batch: RecordBatch, num_rows: usize) { diff --git 
a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 05b56a0e8d32..2c1dae5187fa 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -4667,10 +4667,8 @@ mod tests { let array = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); assert_eq!(0, array.offset()); let array = array.slice(2, 3); - assert_eq!(2, array.offset()); let b = cast(&array, &DataType::UInt8).unwrap(); assert_eq!(3, b.len()); - assert_eq!(0, b.offset()); let c = b.as_any().downcast_ref::().unwrap(); assert!(!c.is_valid(0)); assert_eq!(8, c.value(1)); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 7d44d8f24030..08ddd1812bb7 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -24,11 +24,11 @@ use std::cmp::min; use std::collections::HashMap; use std::io::{BufWriter, Write}; -use arrow_array::types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}; use flatbuffers::FlatBufferBuilder; use arrow_array::builder::BufferBuilder; use arrow_array::cast::*; +use arrow_array::types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}; use arrow_array::*; use arrow_buffer::bit_util; use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; @@ -1107,94 +1107,33 @@ fn get_buffer_element_width(spec: &BufferSpec) -> usize { } } -/// Returns byte width for binary value_offset buffer spec. -#[inline] -fn get_value_offset_byte_width(data_type: &DataType) -> usize { - match data_type { - DataType::Binary | DataType::Utf8 => 4, - DataType::LargeBinary | DataType::LargeUtf8 => 8, - _ => unreachable!(), +/// Returns the values and offsets [`Buffer`] for a ByteArray with offset type `O` +/// +/// In particular, this handles re-encoding the offsets if they don't start at `0`, +/// slicing the values buffer as appropriate. This helps reduce the encoded +/// size of sliced arrays, as values that have been sliced away are not encoded +fn get_byte_array_buffers(data: &ArrayData) -> (Buffer, Buffer) { + if data.is_empty() { + return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into()); } -} -/// Returns the number of total bytes in base binary arrays. -fn get_binary_buffer_len(array_data: &ArrayData) -> usize { - if array_data.is_empty() { - return 0; - } - match array_data.data_type() { - DataType::Binary => { - let array: BinaryArray = array_data.clone().into(); - let offsets = array.value_offsets(); - (offsets[array_data.len()] - offsets[0]) as usize - } - DataType::LargeBinary => { - let array: LargeBinaryArray = array_data.clone().into(); - let offsets = array.value_offsets(); - (offsets[array_data.len()] - offsets[0]) as usize - } - DataType::Utf8 => { - let array: StringArray = array_data.clone().into(); - let offsets = array.value_offsets(); - (offsets[array_data.len()] - offsets[0]) as usize - } - DataType::LargeUtf8 => { - let array: LargeStringArray = array_data.clone().into(); - let offsets = array.value_offsets(); - (offsets[array_data.len()] - offsets[0]) as usize - } - _ => unreachable!(), - } -} + let buffers = data.buffers(); + let offsets: &[O] = buffers[0].typed_data::(); + let offset_slice = &offsets[data.offset()..data.offset() + data.len() + 1]; -/// Rebase value offsets for given ArrayData to zero-based. 
-fn get_zero_based_value_offsets( - array_data: &ArrayData, -) -> Buffer { - match array_data.data_type() { - DataType::Binary | DataType::LargeBinary => { - let array: GenericBinaryArray = array_data.clone().into(); - let offsets = array.value_offsets(); - let start_offset = offsets[0]; - - let mut builder = BufferBuilder::::new(array_data.len() + 1); - for x in offsets { - builder.append(*x - start_offset); - } + let start_offset = offset_slice.first().unwrap(); + let end_offset = offset_slice.last().unwrap(); - builder.finish() - } - DataType::Utf8 | DataType::LargeUtf8 => { - let array: GenericStringArray = array_data.clone().into(); - let offsets = array.value_offsets(); - let start_offset = offsets[0]; - - let mut builder = BufferBuilder::::new(array_data.len() + 1); - for x in offsets { - builder.append(*x - start_offset); - } - - builder.finish() - } - _ => unreachable!(), - } -} + let offsets = match start_offset.as_usize() { + 0 => buffers[0].clone(), + _ => offset_slice.iter().map(|x| *x - *start_offset).collect(), + }; -/// Returns the start offset of base binary array. -fn get_buffer_offset(array_data: &ArrayData) -> OffsetSize { - match array_data.data_type() { - DataType::Binary | DataType::LargeBinary => { - let array: GenericBinaryArray = array_data.clone().into(); - let offsets = array.value_offsets(); - offsets[0] - } - DataType::Utf8 | DataType::LargeUtf8 => { - let array: GenericStringArray = array_data.clone().into(); - let offsets = array.value_offsets(); - offsets[0] - } - _ => unreachable!(), - } + let values = buffers[1].slice_with_length( + start_offset.as_usize(), + end_offset.as_usize() - start_offset.as_usize(), + ); + (offsets, values) } /// Write array data to a vector of bytes @@ -1241,65 +1180,27 @@ fn write_array_data( } let data_type = array_data.data_type(); - if matches!( - data_type, - DataType::Binary | DataType::LargeBinary | DataType::Utf8 | DataType::LargeUtf8 - ) { - let offset_buffer = &array_data.buffers()[0]; - let value_offset_byte_width = get_value_offset_byte_width(data_type); - let min_length = (array_data.len() + 1) * value_offset_byte_width; - if buffer_need_truncate( - array_data.offset(), - offset_buffer, - &BufferSpec::FixedWidth { - byte_width: value_offset_byte_width, - }, - min_length, - ) { - // Rebase offsets and truncate values - let (new_offsets, byte_offset) = - if matches!(data_type, DataType::Binary | DataType::Utf8) { - ( - get_zero_based_value_offsets::(array_data), - get_buffer_offset::(array_data) as usize, - ) - } else { - ( - get_zero_based_value_offsets::(array_data), - get_buffer_offset::(array_data) as usize, - ) - }; - + if matches!(data_type, DataType::Binary | DataType::Utf8) { + let (offsets, values) = get_byte_array_buffers::(array_data); + for buffer in [offsets, values] { offset = write_buffer( - new_offsets.as_slice(), + buffer.as_slice(), buffers, arrow_data, offset, compression_codec, )?; - - let total_bytes = get_binary_buffer_len(array_data); - let value_buffer = &array_data.buffers()[1]; - let buffer_length = min(total_bytes, value_buffer.len() - byte_offset); - let buffer_slice = - &value_buffer.as_slice()[byte_offset..(byte_offset + buffer_length)]; + } + } else if matches!(data_type, DataType::LargeBinary | DataType::LargeUtf8) { + let (offsets, values) = get_byte_array_buffers::(array_data); + for buffer in [offsets, values] { offset = write_buffer( - buffer_slice, + buffer.as_slice(), buffers, arrow_data, offset, compression_codec, )?; - } else { - for buffer in array_data.buffers() { - offset = 
write_buffer( - buffer.as_slice(), - buffers, - arrow_data, - offset, - compression_codec, - )?; - } } } else if DataType::is_numeric(data_type) || DataType::is_temporal(data_type) @@ -1445,20 +1346,20 @@ fn pad_to_8(len: u32) -> usize { #[cfg(test)] mod tests { - use super::*; - use std::io::Cursor; use std::io::Seek; use std::sync::Arc; - use crate::MetadataVersion; - - use crate::reader::*; use arrow_array::builder::PrimitiveRunBuilder; use arrow_array::builder::UnionBuilder; use arrow_array::types::*; use arrow_schema::DataType; + use crate::reader::*; + use crate::MetadataVersion; + + use super::*; + #[test] #[cfg(feature = "lz4")] fn test_write_empty_record_batch_lz4_compression() { diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index cf94b0dd40af..af400868ffa9 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -52,7 +52,7 @@ fn double(array: &PyAny, py: Python) -> PyResult { let array = kernels::arithmetic::add(array, array).map_err(to_py_err)?; // export - array.data().to_pyarrow(py) + array.to_data().to_pyarrow(py) } /// calls a lambda function that receives and returns an array @@ -64,7 +64,7 @@ fn double_py(lambda: &PyAny, py: Python) -> PyResult { let expected = Arc::new(Int64Array::from(vec![Some(2), None, Some(6)])) as ArrayRef; // to py - let pyarray = array.data().to_pyarrow(py)?; + let pyarray = array.to_data().to_pyarrow(py)?; let pyarray = lambda.call1((pyarray,))?; let array = make_array(ArrayData::from_pyarrow(pyarray)?); @@ -75,7 +75,7 @@ fn double_py(lambda: &PyAny, py: Python) -> PyResult { fn make_empty_array(datatype: PyArrowType, py: Python) -> PyResult { let array = new_empty_array(&datatype.0); - array.data().to_pyarrow(py) + array.to_data().to_pyarrow(py) } /// Returns the substring @@ -90,7 +90,7 @@ fn substring( // substring let array = kernels::substring::substring(array.as_ref(), start, None).map_err(to_py_err)?; - Ok(array.data().to_owned().into()) + Ok(array.to_data().into()) } /// Returns the concatenate @@ -101,7 +101,7 @@ fn concatenate(array: PyArrowType, py: Python) -> PyResult // concat let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).map_err(to_py_err)?; - array.data().to_pyarrow(py) + array.to_data().to_pyarrow(py) } #[pyfunction] diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 6039d53eaedc..aaa3423d69e5 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -208,8 +208,7 @@ mod tests { let s = s.slice(2, 3); let select = select.slice(1, 3); - let select = select.as_boolean(); - let a = nullif(&s, select).unwrap(); + let a = nullif(&s, &select).unwrap(); let r: Vec<_> = a.as_string::().iter().collect(); assert_eq!(r, vec![None, Some("a"), None]); } @@ -509,9 +508,8 @@ mod tests { .map(|_| rng.gen_bool(0.5).then(|| rng.gen_bool(0.5))) .collect(); let b = b.slice(b_start_offset, a_length); - let b = b.as_boolean(); - test_nullif(&a, b); + test_nullif(&a, &b); } } } diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 8bad29bf74b7..27c905ba0cd6 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -322,7 +322,7 @@ //! Advanced users may wish to interact with the underlying buffers of an [`Array`], for example, //! for FFI or high-performance conversion from other formats. This interface is provided by //! [`ArrayData`] which stores the [`Buffer`] comprising an [`Array`], and can be accessed -//! with [`Array::data`](array::Array::data) +//! 
with [`Array::to_data`](array::Array::to_data) //! //! The APIs for constructing [`ArrayData`] come in safe, and unsafe variants, with the former //! performing extensive, but potentially expensive validation to ensure the buffers are well-formed. From f14a6787cae77d9eb7d44a8cbb71d9fcc9f8365a Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Thu, 13 Apr 2023 00:24:43 +0200 Subject: [PATCH 0816/1411] feat: Prevent UnionArray with Repeated Type IDs (#4070) * feat: Prevent UnionArray with Repeated Type IDs * fix format * add tests * use should_panic * fix clippy --- arrow-schema/src/datatype.rs | 16 ++++++++++++++++ arrow-schema/src/fields.rs | 15 ++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 64e8d0e778b0..0bbd64f30abb 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -742,4 +742,20 @@ mod tests { fn size_should_not_regress() { assert_eq!(std::mem::size_of::(), 24); } + + #[test] + #[should_panic(expected = "duplicate type id: 1")] + fn test_union_with_duplicated_type_id() { + let type_ids = vec![1, 1]; + let _union = DataType::Union( + UnionFields::new( + type_ids, + vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Utf8, false), + ], + ), + UnionMode::Dense, + ); + } } diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index b93735328ac6..07e9abeee56a 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -186,7 +186,20 @@ impl UnionFields { T: IntoIterator, { let fields = fields.into_iter().map(Into::into); - type_ids.into_iter().zip(fields).collect() + let mut set = 0_u128; + type_ids + .into_iter() + .map(|idx| { + let mask = 1_u128 << idx; + if (set & mask) != 0 { + panic!("duplicate type id: {}", idx); + } else { + set |= mask; + } + idx + }) + .zip(fields) + .collect() } /// Return size of this instance in bytes. From 512294e954b86853da89f7f25079aa1eeb626893 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 13 Apr 2023 16:18:13 +0100 Subject: [PATCH 0817/1411] Update prost-build requirement from =0.11.8 to =0.11.9 (#4080) Updates the requirements on [prost-build](https://github.com/tokio-rs/prost) to permit the latest version. - [Release notes](https://github.com/tokio-rs/prost/releases) - [Commits](https://github.com/tokio-rs/prost/compare/v0.11.8...v0.11.9) --- updated-dependencies: - dependency-name: prost-build dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 08afb572deb0..85dd3366d2cc 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -33,5 +33,5 @@ publish = false # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.56", default-features = false } -prost-build = { version = "=0.11.8", default-features = false } +prost-build = { version = "=0.11.9", default-features = false } tonic-build = { version = "=0.9.1", default-features = false, features = ["transport", "prost"] } From 485696e3e4b555d9c84300e5fa788df351b83936 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Thu, 13 Apr 2023 22:36:52 +0200 Subject: [PATCH 0818/1411] object_store: fix: Incorrect parsing of https Path Style S3 url (#4082) * fix: parse reagion from path-style urls, not bucket * fix: test * fix: parse s3 bucket from first path segment * test: add test for parsing bucket from path style url --- object_store/src/aws/mod.rs | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index de62360d0522..34d468f395a4 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -805,12 +805,16 @@ impl AmazonS3Builder { fn parse_url(&mut self, url: &str) -> Result<()> { let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; - match parsed.scheme() { "s3" | "s3a" => self.bucket_name = Some(host.to_string()), "https" => match host.splitn(4, '.').collect_tuple() { - Some(("s3", bucket, "amazonaws", "com")) => { - self.bucket_name = Some(bucket.to_string()); + Some(("s3", region, "amazonaws", "com")) => { + self.region = Some(region.to_string()); + if let Some(bucket) = + parsed.path_segments().and_then(|mut path| path.next()) + { + self.bucket_name = Some(bucket.into()); + } } Some((bucket, "s3", region, "amazonaws.com")) => { self.bucket_name = Some(bucket.to_string()); @@ -1519,10 +1523,24 @@ mod tests { let mut builder = AmazonS3Builder::new(); builder - .parse_url("https://s3.bucket.amazonaws.com") + .parse_url("https://s3.region.amazonaws.com") + .unwrap(); + assert_eq!(builder.region, Some("region".to_string())); + + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://s3.region.amazonaws.com/bucket") .unwrap(); + assert_eq!(builder.region, Some("region".to_string())); assert_eq!(builder.bucket_name, Some("bucket".to_string())); + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://s3.region.amazonaws.com/bucket.with.dot/path") + .unwrap(); + assert_eq!(builder.region, Some("region".to_string())); + assert_eq!(builder.bucket_name, Some("bucket.with.dot".to_string())); + let mut builder = AmazonS3Builder::new(); builder .parse_url("https://bucket.s3.region.amazonaws.com") From 0121cdfad5207bf4e8e1c4a7c20775297e4c5ae8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 13 Apr 2023 21:39:38 +0100 Subject: [PATCH 0819/1411] Improve JSON decoder errors (#4076) (#4079) * Improve JSON decoder errors (#4076) * Clippy * Review feedback --- 
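A minimal sketch of the new error reporting, adapted from the tests added in this patch; it assumes the `arrow_json::ReaderBuilder` API exercised by those tests:

use std::io::Cursor;
use std::sync::Arc;

use arrow_json::ReaderBuilder;
use arrow_schema::{DataType, Field, Schema};

fn main() {
    // Field "a" is declared as Utf8, but the JSON document supplies a number.
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)]));
    let err = ReaderBuilder::new(schema)
        .build(Cursor::new(r#"{"a": 1}"#.as_bytes()))
        .unwrap()
        .read()
        .unwrap_err();

    // The error now names the offending field and echoes the offending value,
    // rather than the previous "expected string got number".
    assert_eq!(
        err.to_string(),
        "Json error: whilst decoding field 'a': expected string got 1"
    );
}
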
arrow-json/src/reader/boolean_array.rs | 4 +- arrow-json/src/reader/decimal_array.rs | 4 +- arrow-json/src/reader/list_array.rs | 8 +- arrow-json/src/reader/map_array.rs | 8 +- arrow-json/src/reader/mod.rs | 115 ++++++++++++++++++++--- arrow-json/src/reader/primitive_array.rs | 4 +- arrow-json/src/reader/string_array.rs | 4 +- arrow-json/src/reader/struct_array.rs | 21 +++-- arrow-json/src/reader/tape.rs | 72 +++++++++----- arrow-json/src/reader/timestamp_array.rs | 4 +- 10 files changed, 183 insertions(+), 61 deletions(-) diff --git a/arrow-json/src/reader/boolean_array.rs b/arrow-json/src/reader/boolean_array.rs index 9a7f226805da..9094391cd7dd 100644 --- a/arrow-json/src/reader/boolean_array.rs +++ b/arrow-json/src/reader/boolean_array.rs @@ -21,7 +21,7 @@ use arrow_data::ArrayData; use arrow_schema::ArrowError; use crate::reader::tape::{Tape, TapeElement}; -use crate::reader::{tape_error, ArrayDecoder}; +use crate::reader::ArrayDecoder; #[derive(Default)] pub struct BooleanArrayDecoder {} @@ -34,7 +34,7 @@ impl ArrayDecoder for BooleanArrayDecoder { TapeElement::Null => builder.append_null(), TapeElement::True => builder.append_value(true), TapeElement::False => builder.append_value(false), - d => return Err(tape_error(d, "boolean")), + _ => return Err(tape.error(*p, "boolean")), } } diff --git a/arrow-json/src/reader/decimal_array.rs b/arrow-json/src/reader/decimal_array.rs index 508409ec75bd..fc3c9aaa6b43 100644 --- a/arrow-json/src/reader/decimal_array.rs +++ b/arrow-json/src/reader/decimal_array.rs @@ -25,7 +25,7 @@ use arrow_data::ArrayData; use arrow_schema::ArrowError; use crate::reader::tape::{Tape, TapeElement}; -use crate::reader::{tape_error, ArrayDecoder}; +use crate::reader::ArrayDecoder; pub struct DecimalArrayDecoder { precision: u8, @@ -64,7 +64,7 @@ where let value = parse_decimal::(s, self.precision, self.scale)?; builder.append_value(value) } - d => return Err(tape_error(d, "decimal")), + _ => return Err(tape.error(*p, "decimal")), } } diff --git a/arrow-json/src/reader/list_array.rs b/arrow-json/src/reader/list_array.rs index ac35f998876c..aa3538bd5349 100644 --- a/arrow-json/src/reader/list_array.rs +++ b/arrow-json/src/reader/list_array.rs @@ -16,7 +16,7 @@ // under the License. use crate::reader::tape::{Tape, TapeElement}; -use crate::reader::{make_decoder, tape_error, ArrayDecoder}; +use crate::reader::{make_decoder, ArrayDecoder}; use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; use arrow_array::OffsetSizeTrait; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; @@ -78,7 +78,7 @@ impl ArrayDecoder for ListArrayDecoder { nulls.append(false); *p + 1 } - (d, _) => return Err(tape_error(d, "[")), + _ => return Err(tape.error(*p, "[")), }; let mut cur_idx = *p + 1; @@ -86,9 +86,7 @@ impl ArrayDecoder for ListArrayDecoder { child_pos.push(cur_idx); // Advance to next field - cur_idx = tape - .next(cur_idx) - .map_err(|d| tape_error(d, "list value"))?; + cur_idx = tape.next(cur_idx, "list value")?; } let offset = O::from_usize(child_pos.len()).ok_or_else(|| { diff --git a/arrow-json/src/reader/map_array.rs b/arrow-json/src/reader/map_array.rs index 3662e594ba90..5e800a0d62dd 100644 --- a/arrow-json/src/reader/map_array.rs +++ b/arrow-json/src/reader/map_array.rs @@ -16,7 +16,7 @@ // under the License. 
use crate::reader::tape::{Tape, TapeElement}; -use crate::reader::{make_decoder, tape_error, ArrayDecoder}; +use crate::reader::{make_decoder, ArrayDecoder}; use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_buffer::ArrowNativeType; @@ -104,14 +104,14 @@ impl ArrayDecoder for MapArrayDecoder { nulls.append(false); p + 1 } - (d, _) => return Err(tape_error(d, "{")), + _ => return Err(tape.error(p, "{")), }; let mut cur_idx = p + 1; while cur_idx < end_idx { let key = cur_idx; - let value = tape.next(key).map_err(|d| tape_error(d, "map key"))?; - cur_idx = tape.next(value).map_err(|d| tape_error(d, "map value"))?; + let value = tape.next(key, "map key")?; + cur_idx = tape.next(value, "map value")?; key_pos.push(key); value_pos.push(value); diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index d36493a47c88..51bba322bfa8 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -632,10 +632,6 @@ fn make_decoder( } } -fn tape_error(d: TapeElement, expected: &str) -> ArrowError { - ArrowError::JsonError(format!("expected {expected} got {d}")) -} - #[cfg(test)] mod tests { use std::fs::File; @@ -962,29 +958,29 @@ mod tests { let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)])); let buf = r#"{"a": 1}"#; - let result = ReaderBuilder::new(schema.clone()) + let err = ReaderBuilder::new(schema.clone()) .with_batch_size(1024) .build(Cursor::new(buf.as_bytes())) .unwrap() - .read(); + .read() + .unwrap_err(); - assert!(result.is_err()); assert_eq!( - result.unwrap_err().to_string(), - "Json error: expected string got number".to_string() + err.to_string(), + "Json error: whilst decoding field 'a': expected string got 1" ); let buf = r#"{"a": true}"#; - let result = ReaderBuilder::new(schema) + let err = ReaderBuilder::new(schema) .with_batch_size(1024) .build(Cursor::new(buf.as_bytes())) .unwrap() - .read(); + .read() + .unwrap_err(); - assert!(result.is_err()); assert_eq!( - result.unwrap_err().to_string(), - "Json error: expected string got true".to_string() + err.to_string(), + "Json error: whilst decoding field 'a': expected string got true" ); } @@ -1866,4 +1862,95 @@ mod tests { assert_eq!(3, num_batches); assert_eq!(100000000000011, sum_a); } + + #[test] + fn test_decoder_error() { + let schema = Arc::new(Schema::new(vec![Field::new_struct( + "a", + vec![Field::new("child", DataType::Int32, false)], + true, + )])); + + let parse_err = |s: &str| { + ReaderBuilder::new(schema.clone()) + .build(Cursor::new(s.as_bytes())) + .unwrap() + .next() + .unwrap() + .unwrap_err() + .to_string() + }; + + let err = parse_err(r#"{"a": 123}"#); + assert_eq!( + err, + "Json error: whilst decoding field 'a': expected { got 123" + ); + + let err = parse_err(r#"{"a": ["bar"]}"#); + assert_eq!( + err, + r#"Json error: whilst decoding field 'a': expected { got ["bar"]"# + ); + + let err = parse_err(r#"{"a": []}"#); + assert_eq!( + err, + "Json error: whilst decoding field 'a': expected { got []" + ); + + let err = parse_err(r#"{"a": [{"child": 234}]}"#); + assert_eq!( + err, + r#"Json error: whilst decoding field 'a': expected { got [{"child": 234}]"# + ); + + let err = parse_err(r#"{"a": [{"child": {"foo": [{"foo": ["bar"]}]}}]}"#); + assert_eq!( + err, + r#"Json error: whilst decoding field 'a': expected { got [{"child": {"foo": [{"foo": ["bar"]}]}}]"# + ); + + let err = parse_err(r#"{"a": true}"#); + assert_eq!( + err, + "Json error: whilst decoding field 'a': 
expected { got true" + ); + + let err = parse_err(r#"{"a": false}"#); + assert_eq!( + err, + "Json error: whilst decoding field 'a': expected { got false" + ); + + let err = parse_err(r#"{"a": "foo"}"#); + assert_eq!( + err, + "Json error: whilst decoding field 'a': expected { got \"foo\"" + ); + + let err = parse_err(r#"{"a": {"child": false}}"#); + assert_eq!( + err, + "Json error: whilst decoding field 'a': whilst decoding field 'child': expected primitive got false" + ); + + let err = parse_err(r#"{"a": {"child": []}}"#); + assert_eq!( + err, + "Json error: whilst decoding field 'a': whilst decoding field 'child': expected primitive got []" + ); + + let err = parse_err(r#"{"a": {"child": [123]}}"#); + assert_eq!( + err, + "Json error: whilst decoding field 'a': whilst decoding field 'child': expected primitive got [123]" + ); + + let err = parse_err(r#"{"a": {"child": [123, 3465346]}}"#); + assert_eq!( + err, + "Json error: whilst decoding field 'a': whilst decoding field 'child': expected primitive got [123, 3465346]" + ); + } } diff --git a/arrow-json/src/reader/primitive_array.rs b/arrow-json/src/reader/primitive_array.rs index 2d45d9c45a3c..cde52391f654 100644 --- a/arrow-json/src/reader/primitive_array.rs +++ b/arrow-json/src/reader/primitive_array.rs @@ -25,7 +25,7 @@ use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use crate::reader::tape::{Tape, TapeElement}; -use crate::reader::{tape_error, ArrayDecoder}; +use crate::reader::ArrayDecoder; /// A trait for JSON-specific primitive parsing logic /// @@ -116,7 +116,7 @@ where builder.append_value(value) } - d => return Err(tape_error(d, "primitive")), + _ => return Err(tape.error(*p, "primitive")), } } diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs index 8060804c9ce8..ea9a7157423f 100644 --- a/arrow-json/src/reader/string_array.rs +++ b/arrow-json/src/reader/string_array.rs @@ -22,7 +22,7 @@ use arrow_schema::ArrowError; use std::marker::PhantomData; use crate::reader::tape::{Tape, TapeElement}; -use crate::reader::{tape_error, ArrayDecoder}; +use crate::reader::ArrayDecoder; const TRUE: &str = "true"; const FALSE: &str = "false"; @@ -61,7 +61,7 @@ impl ArrayDecoder for StringArrayDecoder { TapeElement::Number(idx) if coerce_primitive => { data_capacity += tape.get_string(idx).len(); } - d => return Err(tape_error(d, "string")), + _ => return Err(tape.error(*p, "string")), } } diff --git a/arrow-json/src/reader/struct_array.rs b/arrow-json/src/reader/struct_array.rs index 013f862c51ad..6c6f1457bfc2 100644 --- a/arrow-json/src/reader/struct_array.rs +++ b/arrow-json/src/reader/struct_array.rs @@ -16,7 +16,7 @@ // under the License. 
use crate::reader::tape::{Tape, TapeElement}; -use crate::reader::{make_decoder, tape_error, ArrayDecoder}; +use crate::reader::{make_decoder, ArrayDecoder}; use arrow_array::builder::BooleanBufferBuilder; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; @@ -74,7 +74,7 @@ impl ArrayDecoder for StructArrayDecoder { nulls.append(false); continue; } - (d, _) => return Err(tape_error(d, "{")), + _ => return Err(tape.error(*p, "{")), }; let mut cur_idx = *p + 1; @@ -82,7 +82,7 @@ impl ArrayDecoder for StructArrayDecoder { // Read field name let field_name = match tape.get(cur_idx) { TapeElement::String(s) => tape.get_string(s), - d => return Err(tape_error(d, "field name")), + _ => return Err(tape.error(cur_idx, "field name")), }; // Update child pos if match found @@ -93,9 +93,7 @@ impl ArrayDecoder for StructArrayDecoder { } // Advance to next field - cur_idx = tape - .next(cur_idx + 1) - .map_err(|d| tape_error(d, "field value"))?; + cur_idx = tape.next(cur_idx + 1, "field value")?; } } @@ -103,7 +101,16 @@ impl ArrayDecoder for StructArrayDecoder { .decoders .iter_mut() .zip(child_pos) - .map(|(d, pos)| d.decode(tape, &pos)) + .zip(fields) + .map(|((d, pos), f)| { + d.decode(tape, &pos).map_err(|e| match e { + ArrowError::JsonError(s) => ArrowError::JsonError(format!( + "whilst decoding field '{}': {s}", + f.name() + )), + e => e, + }) + }) .collect::, ArrowError>>()?; let nulls = nulls diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index 885257ed107a..5eca7b43dcc7 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -18,7 +18,6 @@ use crate::reader::serializer::TapeSerializer; use arrow_schema::ArrowError; use serde::Serialize; -use std::fmt::{Display, Formatter}; /// We decode JSON to a flattened tape representation, /// allowing for efficient traversal of the JSON data @@ -63,22 +62,6 @@ pub enum TapeElement { Null, } -impl Display for TapeElement { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - TapeElement::StartObject(_) => write!(f, "{{"), - TapeElement::EndObject(_) => write!(f, "}}"), - TapeElement::StartList(_) => write!(f, "["), - TapeElement::EndList(_) => write!(f, "]"), - TapeElement::String(_) => write!(f, "string"), - TapeElement::Number(_) => write!(f, "number"), - TapeElement::True => write!(f, "true"), - TapeElement::False => write!(f, "false"), - TapeElement::Null => write!(f, "null"), - } - } -} - /// A decoded JSON tape /// /// String and numeric data is stored alongside an array of [`TapeElement`] @@ -114,9 +97,8 @@ impl<'a> Tape<'a> { /// Returns the index of the next field at the same level as `cur_idx` /// - /// Return an error containing the [`TapeElement`] at `cur_idx` if it - /// is not the start of a field - pub fn next(&self, cur_idx: u32) -> Result { + /// Return an error if `cur_idx` is not the start of a field + pub fn next(&self, cur_idx: u32, expected: &str) -> Result { match self.get(cur_idx) { TapeElement::String(_) | TapeElement::Number(_) @@ -125,7 +107,7 @@ impl<'a> Tape<'a> { | TapeElement::Null => Ok(cur_idx + 1), TapeElement::StartList(end_idx) => Ok(end_idx + 1), TapeElement::StartObject(end_idx) => Ok(end_idx + 1), - d => Err(d), + _ => Err(self.error(cur_idx, expected)), } } @@ -133,6 +115,54 @@ impl<'a> Tape<'a> { pub fn num_rows(&self) -> usize { self.num_rows } + + /// Serialize the tape element at index `idx` to `out` returning the next field index + fn serialize(&self, out: &mut String, idx: u32) -> 
u32 { + match self.get(idx) { + TapeElement::StartObject(end) => { + out.push('{'); + let mut cur_idx = idx + 1; + while cur_idx < end { + cur_idx = self.serialize(out, cur_idx); + out.push_str(": "); + cur_idx = self.serialize(out, cur_idx); + } + out.push('}'); + return end + 1; + } + TapeElement::EndObject(_) => out.push('}'), + TapeElement::StartList(end) => { + out.push('['); + let mut cur_idx = idx + 1; + while cur_idx < end { + cur_idx = self.serialize(out, cur_idx); + if cur_idx < end { + out.push_str(", "); + } + } + out.push(']'); + return end + 1; + } + TapeElement::EndList(_) => out.push(']'), + TapeElement::String(s) => { + out.push('"'); + out.push_str(self.get_string(s)); + out.push('"') + } + TapeElement::Number(n) => out.push_str(self.get_string(n)), + TapeElement::True => out.push_str("true"), + TapeElement::False => out.push_str("false"), + TapeElement::Null => out.push_str("null"), + } + idx + 1 + } + + /// Returns an error reading index `idx` + pub fn error(&self, idx: u32, expected: &str) -> ArrowError { + let mut out = String::with_capacity(64); + self.serialize(&mut out, idx); + ArrowError::JsonError(format!("expected {expected} got {out}")) + } } /// States based on diff --git a/arrow-json/src/reader/timestamp_array.rs b/arrow-json/src/reader/timestamp_array.rs index 73d1cda9150c..249613d33ad1 100644 --- a/arrow-json/src/reader/timestamp_array.rs +++ b/arrow-json/src/reader/timestamp_array.rs @@ -27,7 +27,7 @@ use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, TimeUnit}; use crate::reader::tape::{Tape, TapeElement}; -use crate::reader::{tape_error, ArrayDecoder}; +use crate::reader::ArrayDecoder; /// A specialized [`ArrayDecoder`] for timestamps pub struct TimestampArrayDecoder { @@ -90,7 +90,7 @@ where builder.append_value(value) } - d => return Err(tape_error(d, "primitive")), + _ => return Err(tape.error(*p, "primitive")), } } From 637d3832e714d0bcd9166761e99b05456e158687 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Apr 2023 07:59:40 +0100 Subject: [PATCH 0820/1411] Store StructArray entries in MapArray (#4085) --- arrow-array/src/array/map_array.rs | 30 +++++++----------------- parquet/src/arrow/arrow_writer/levels.rs | 9 ++++--- 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 18b3eb3cec32..62e12c30e00c 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -33,11 +33,7 @@ pub struct MapArray { data_type: DataType, nulls: Option, /// The [`StructArray`] that is the direct child of this array - entries: ArrayRef, - /// The first child of `entries`, the "keys" of this MapArray - keys: ArrayRef, - /// The second child of `entries`, the "values" of this MapArray - values: ArrayRef, + entries: StructArray, /// The start and end offsets of each entry value_offsets: OffsetBuffer, } @@ -54,35 +50,34 @@ impl MapArray { /// Returns a reference to the keys of this map pub fn keys(&self) -> &ArrayRef { - &self.keys + self.entries.column(0) } /// Returns a reference to the values of this map pub fn values(&self) -> &ArrayRef { - &self.values + self.entries.column(1) } /// Returns a reference to the [`StructArray`] entries of this map - pub fn entries(&self) -> &ArrayRef { + pub fn entries(&self) -> &StructArray { &self.entries } /// Returns the data type of the map's keys. 
pub fn key_type(&self) -> &DataType { - self.keys.data_type() + self.keys().data_type() } /// Returns the data type of the map's values. pub fn value_type(&self) -> &DataType { - self.values.data_type() + self.values().data_type() } /// Returns ith value of this map array. /// - /// This is a [`StructArray`] containing two fields /// # Safety /// Caller must ensure that the index is within the array bounds - pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef { + pub unsafe fn value_unchecked(&self, i: usize) -> StructArray { let end = *self.value_offsets().get_unchecked(i + 1); let start = *self.value_offsets().get_unchecked(i); self.entries @@ -92,7 +87,7 @@ impl MapArray { /// Returns ith value of this map array. /// /// This is a [`StructArray`] containing two fields - pub fn value(&self, i: usize) -> ArrayRef { + pub fn value(&self, i: usize) -> StructArray { let end = self.value_offsets()[i + 1] as usize; let start = self.value_offsets()[i] as usize; self.entries.slice(start, end - start) @@ -117,8 +112,6 @@ impl MapArray { data_type: self.data_type.clone(), nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), entries: self.entries.clone(), - keys: self.keys.clone(), - values: self.values.clone(), value_offsets: self.value_offsets.slice(offset, length), } } @@ -181,10 +174,7 @@ impl MapArray { entries.data_type() ))); } - - let keys = make_array(entries.child_data()[0].clone()); - let values = make_array(entries.child_data()[1].clone()); - let entries = make_array(entries); + let entries = entries.into(); // SAFETY: // ArrayData is valid, and verified type above @@ -194,8 +184,6 @@ impl MapArray { data_type: data.data_type().clone(), nulls: data.nulls().cloned(), entries, - keys, - values, value_offsets, }) } diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index d662a16eaf28..fe6126ba486a 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -175,7 +175,7 @@ impl LevelInfoBuilder { } /// Given an `array`, write the level data for the elements in `range` - fn write(&mut self, array: &ArrayRef, range: Range) { + fn write(&mut self, array: &dyn Array, range: Range) { match array.data_type() { d if is_leaf(d) => self.write_leaf(array, range), DataType::Dictionary(_, v) if is_leaf(v.as_ref()) => { @@ -225,7 +225,7 @@ impl LevelInfoBuilder { &mut self, offsets: &[O], nulls: Option<&NullBuffer>, - values: &ArrayRef, + values: &dyn Array, range: Range, ) { let (child, ctx) = match self { @@ -372,7 +372,7 @@ impl LevelInfoBuilder { } /// Write a primitive array, as defined by [`is_leaf`] - fn write_leaf(&mut self, array: &ArrayRef, range: Range) { + fn write_leaf(&mut self, array: &dyn Array, range: Range) { let info = match self { Self::Primitive(info) => info, _ => unreachable!(), @@ -918,12 +918,11 @@ mod tests { assert_eq!(a_list_data.null_count(), 1); let a = ListArray::from(a_list_data); - let values = Arc::new(a) as _; let item_field = Field::new("item", a_list_type, true); let mut builder = LevelInfoBuilder::try_new(&item_field, Default::default()).unwrap(); - builder.write(&values, 2..4); + builder.write(&a, 2..4); let levels = builder.finish(); assert_eq!(levels.len(), 1); From 98cc3ef757c7a8a89d0b88051f1c4da6f1da7327 Mon Sep 17 00:00:00 2001 From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> Date: Fri, 14 Apr 2023 23:31:58 +0300 Subject: [PATCH 0821/1411] Add hash trait to SortOptions. 
(#4089) --- arrow-schema/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-schema/src/lib.rs b/arrow-schema/src/lib.rs index 0e9edc7b4b26..2d539417f78b 100644 --- a/arrow-schema/src/lib.rs +++ b/arrow-schema/src/lib.rs @@ -33,7 +33,7 @@ use std::ops; pub mod ffi; /// Options that define the sort order of a given column -#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)] +#[derive(Clone, Hash, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)] pub struct SortOptions { /// Whether to sort in descending order pub descending: bool, From 2e9fc22d787b42e4e77b7f8b5d713d7bf9b9e08b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Apr 2023 21:34:07 +0100 Subject: [PATCH 0822/1411] Fix timestamp numeric truncation (#4074) --- arrow-json/src/reader/mod.rs | 23 +++++++++++++++++++++++ arrow-json/src/reader/timestamp_array.rs | 9 ++++----- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index 51bba322bfa8..603a0cd7e602 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -1370,6 +1370,29 @@ mod tests { assert_eq!(u64.values(), &[u64::MAX, u64::MAX, u64::MIN, u64::MIN]); } + #[test] + fn test_timestamp_truncation() { + let buf = r#" + {"time": 9223372036854775807 } + {"time": -9223372036854775808 } + {"time": 9e5 } + "#; + + let schema = Arc::new(Schema::new(vec![Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )])); + + let batches = do_read(buf, 1024, true, schema); + assert_eq!(batches.len(), 1); + + let i64 = batches[0] + .column(0) + .as_primitive::(); + assert_eq!(i64.values(), &[i64::MAX, i64::MIN, 900000]); + } + fn read_file(path: &str, schema: Option) -> Reader> { let file = File::open(path).unwrap(); let mut reader = BufReader::new(file); diff --git a/arrow-json/src/reader/timestamp_array.rs b/arrow-json/src/reader/timestamp_array.rs index 249613d33ad1..ef69deabce2d 100644 --- a/arrow-json/src/reader/timestamp_array.rs +++ b/arrow-json/src/reader/timestamp_array.rs @@ -16,7 +16,6 @@ // under the License. 
use chrono::TimeZone; -use num::NumCast; use std::marker::PhantomData; use arrow_array::builder::PrimitiveBuilder; @@ -78,10 +77,10 @@ where } TapeElement::Number(idx) => { let s = tape.get_string(idx); - let value = lexical_core::parse::(s.as_bytes()) - .ok() - .and_then(NumCast::from) - .ok_or_else(|| { + let b = s.as_bytes(); + let value = lexical_core::parse::(b) + .or_else(|_| lexical_core::parse::(b).map(|x| x as i64)) + .map_err(|_| { ArrowError::JsonError(format!( "failed to parse {s} as {}", self.data_type From 3ce9b9777f5b048586c397148e7c057ab79cb032 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Apr 2023 22:22:15 +0100 Subject: [PATCH 0823/1411] Include byte offsets in parquet-layout (#4086) --- parquet/src/bin/parquet-layout.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/parquet/src/bin/parquet-layout.rs b/parquet/src/bin/parquet-layout.rs index 7278c718c968..5f71551e1f20 100644 --- a/parquet/src/bin/parquet-layout.rs +++ b/parquet/src/bin/parquet-layout.rs @@ -70,6 +70,7 @@ struct Page { compression: Option<&'static str>, encoding: &'static str, page_type: &'static str, + offset: u64, compressed_bytes: i32, uncompressed_bytes: i32, header_bytes: i32, @@ -104,6 +105,7 @@ fn do_layout(reader: &C) -> Result { compression, encoding: encoding(dictionary.encoding), page_type: "dictionary", + offset: start, compressed_bytes: header.compressed_page_size, uncompressed_bytes: header.uncompressed_page_size, header_bytes: header_len as _, @@ -114,6 +116,7 @@ fn do_layout(reader: &C) -> Result { compression, encoding: encoding(data_page.encoding), page_type: "data_page_v1", + offset: start, compressed_bytes: header.compressed_page_size, uncompressed_bytes: header.uncompressed_page_size, header_bytes: header_len as _, @@ -126,6 +129,7 @@ fn do_layout(reader: &C) -> Result { compression: compression.filter(|_| is_compressed), encoding: encoding(data_page.encoding), page_type: "data_page_v2", + offset: start, compressed_bytes: header.compressed_page_size, uncompressed_bytes: header.uncompressed_page_size, header_bytes: header_len as _, From 3582c55d292e5436b184d5238475330d379210ca Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Sat, 15 Apr 2023 00:22:39 +0300 Subject: [PATCH 0824/1411] feat: Support dyn_compare_scalar for Decimal256 (#4084) * feat: Support dyn_compare_scalar for Decimal256 * feat: add test_decimal256_scalar_i256 --- arrow-ord/src/comparison.rs | 128 ++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 2927354da291..b9ffddd5e1b4 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -26,6 +26,7 @@ use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; +use arrow_buffer::i256; use arrow_buffer::{bit_util, BooleanBuffer, Buffer, MutableBuffer, NullBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; @@ -517,6 +518,11 @@ macro_rules! 
dyn_compare_scalar { let left = as_primitive_array::($LEFT); $OP::(left, right) } + DataType::Decimal256(_, _) => { + let right = try_to_type!($RIGHT, to_i128)?; + let left = as_primitive_array::($LEFT); + $OP::(left, i256::from_i128(right)) + } DataType::Date32 => { let right = try_to_type!($RIGHT, to_i32)?; let left = as_primitive_array::($LEFT); @@ -6165,6 +6171,128 @@ mod tests { assert_eq!(e, r); } + #[test] + fn test_decimal256_scalar_i128() { + let a = Decimal256Array::from_iter_values( + [1, 2, 3, 4, 5].into_iter().map(i256::from_i128), + ); + let b = i256::from_i128(3); + // array eq scalar + let e = BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(false), Some(false)], + ); + let r = eq_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = eq_dyn_scalar(&a, b).unwrap(); + assert_eq!(e, r); + + // array neq scalar + let e = BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(true), Some(true)], + ); + let r = neq_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = neq_dyn_scalar(&a, b).unwrap(); + assert_eq!(e, r); + + // array lt scalar + let e = BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(false), Some(false)], + ); + let r = lt_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = lt_dyn_scalar(&a, b).unwrap(); + assert_eq!(e, r); + + // array lt_eq scalar + let e = BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(false), Some(false)], + ); + let r = lt_eq_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = lt_eq_dyn_scalar(&a, b).unwrap(); + assert_eq!(e, r); + + // array gt scalar + let e = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(true), Some(true)], + ); + let r = gt_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = gt_dyn_scalar(&a, b).unwrap(); + assert_eq!(e, r); + + // array gt_eq scalar + let e = BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(true), Some(true)], + ); + let r = gt_eq_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = gt_eq_dyn_scalar(&a, b).unwrap(); + assert_eq!(e, r); + } + + #[test] + fn test_decimal256_scalar_i256() { + let a = Decimal256Array::from_iter_values( + [1, 2, 3, 4, 5].into_iter().map(i256::from_i128), + ); + let b = i256::MAX; + // array eq scalar + let e = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(false), Some(false)], + ); + let r = eq_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = eq_dyn_scalar(&a, b).is_err(); + assert!(r); + + // array neq scalar + let e = BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(true), Some(true)], + ); + let r = neq_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = neq_dyn_scalar(&a, b).is_err(); + assert!(r); + + // array lt scalar + let e = BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(true), Some(true)], + ); + let r = lt_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = lt_dyn_scalar(&a, b).is_err(); + assert!(r); + + // array lt_eq scalar + let e = BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(true), Some(true)], + ); + let r = lt_eq_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = lt_eq_dyn_scalar(&a, b).is_err(); + assert!(r); + + // array gt scalar + let e = BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(false), Some(false)], + ); + let r = gt_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = gt_dyn_scalar(&a, b).is_err(); + assert!(r); + + // array gt_eq scalar + let e = BooleanArray::from( + vec![Some(false), 
Some(false), Some(false), Some(false), Some(false)], + ); + let r = gt_eq_scalar(&a, b).unwrap(); + assert_eq!(e, r); + let r = gt_eq_dyn_scalar(&a, b).is_err(); + assert!(r); + } + #[test] #[cfg(not(feature = "simd"))] fn test_floating_zeros() { From 628920f8e70744c71d8f8978862b316cf9620441 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Apr 2023 22:23:16 +0100 Subject: [PATCH 0825/1411] Relax JSON schema inference generics (#4063) * Relax JSON schema inference generics * Clippy --- arrow-json/src/reader/schema.rs | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs index 22d25c8be27a..427c20e027d6 100644 --- a/arrow-json/src/reader/schema.rs +++ b/arrow-json/src/reader/schema.rs @@ -20,7 +20,7 @@ use indexmap::map::IndexMap as HashMap; use indexmap::set::IndexSet as HashSet; use serde_json::Value; use std::borrow::Borrow; -use std::io::{BufRead, BufReader, Read, Seek}; +use std::io::{BufRead, Seek}; use std::sync::Arc; #[derive(Debug, Clone)] @@ -147,17 +147,17 @@ fn generate_schema(spec: HashMap) -> Result { - reader: &'a mut BufReader, +pub struct ValueIter { + reader: R, max_read_records: Option, record_count: usize, // reuse line buffer to avoid allocation on each record line_buf: String, } -impl<'a, R: Read> ValueIter<'a, R> { +impl ValueIter { /// Creates a new `ValueIter` - pub fn new(reader: &'a mut BufReader, max_read_records: Option) -> Self { + pub fn new(reader: R, max_read_records: Option) -> Self { Self { reader, max_read_records, @@ -167,7 +167,7 @@ impl<'a, R: Read> ValueIter<'a, R> { } } -impl<'a, R: Read> Iterator for ValueIter<'a, R> { +impl Iterator for ValueIter { type Item = Result; fn next(&mut self) -> Option { @@ -228,11 +228,11 @@ impl<'a, R: Read> Iterator for ValueIter<'a, R> { /// ``` /// /// [`Reader`]: super::Reader -pub fn infer_json_schema_from_seekable( - reader: &mut BufReader, +pub fn infer_json_schema_from_seekable( + mut reader: R, max_read_records: Option, ) -> Result { - let schema = infer_json_schema(reader, max_read_records); + let schema = infer_json_schema(&mut reader, max_read_records); // return the reader seek back to the start reader.rewind()?; @@ -265,8 +265,8 @@ pub fn infer_json_schema_from_seekable( /// // seek back to start so that the original file is usable again /// file.seek(SeekFrom::Start(0)).unwrap(); /// ``` -pub fn infer_json_schema( - reader: &mut BufReader, +pub fn infer_json_schema( + reader: R, max_read_records: Option, ) -> Result { infer_json_schema_from_iterator(ValueIter::new(reader, max_read_records)) @@ -515,7 +515,7 @@ mod tests { use super::*; use flate2::read::GzDecoder; use std::fs::File; - use std::io::Cursor; + use std::io::{BufReader, Cursor}; #[test] fn test_json_infer_schema() { @@ -700,8 +700,7 @@ mod tests { #[test] fn test_invalid_json_infer_schema() { - let re = - infer_json_schema_from_seekable(&mut BufReader::new(Cursor::new(b"}")), None); + let re = infer_json_schema_from_seekable(Cursor::new(b"}"), None); assert_eq!( re.err().unwrap().to_string(), "Json error: Not valid JSON: expected value at line 1 column 1", From f24afc8c57141734a14a9c46d6b8dc1610f936f3 Mon Sep 17 00:00:00 2001 From: Benson Muite Date: Sat, 15 Apr 2023 13:38:32 +0300 Subject: [PATCH 0826/1411] Simplify reference to GitHub issues (#4092) Most Arrow projects use GitHub issues --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/README.md b/README.md index 55bdad6cb55c..df05d1463b2f 100644 --- a/README.md +++ b/README.md @@ -58,8 +58,8 @@ a great place to meet other contributors and get guidance on where to contribute 2. the [GitHub Discussions][discussions] 3. the [Discord channel](https://discord.gg/YAb2TdazKQ) -Unlike other parts of the Arrow ecosystem, the Rust implementation uses [GitHub issues][issues] as the system of record for new features -and bug fixes and this plays a critical role in the release process. +The Rust implementation uses [GitHub issues][issues] as the system of record for new features and bug fixes and +this plays a critical role in the release process. For design discussions we generally collaborate on Google documents and file a GitHub issue linking to the document. From 7bdc1df7efe033d6aeea98a1b869c0ab8aff6550 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 15 Apr 2023 17:55:35 -0400 Subject: [PATCH 0827/1411] Increase minimum chrono version to 0.4.24 (#4093) --- arrow-array/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 1b417bb0e858..634a0aa647fb 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -44,7 +44,7 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] arrow-buffer = { workspace = true } arrow-schema = { workspace = true } arrow-data = { workspace = true } -chrono = { version = "0.4.23", default-features = false, features = ["clock"] } +chrono = { version = "0.4.24", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } From 89850e0606ed3ee1a730c6d1a4b22daa266cd365 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Sun, 16 Apr 2023 13:03:53 +0200 Subject: [PATCH 0828/1411] feat: Support Timestamp +/- Interval types (#4038) * feat: Support Timestamp +/- Interval types * fix clippy * fix typos * support timezone * update toml * update tests * update tests * update tests with different non-zero fields --- arrow-arith/src/arithmetic.rs | 734 ++++++++++++++++++++++++++++++++++ arrow-array/src/types.rs | 644 +++++++++++++++++++++++++++++ 2 files changed, 1378 insertions(+) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 6c0fd497efbe..5d2e2a8f5feb 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -739,6 +739,105 @@ pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampSecondType::add_year_months)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::DayTime) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampSecondType::add_day_time)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampSecondType::add_month_day_nano)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } + + 
DataType::Timestamp(TimeUnit::Microsecond, _) => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampMicrosecondType::add_year_months)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::DayTime) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampMicrosecondType::add_day_time)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampMicrosecondType::add_month_day_nano)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } + + DataType::Timestamp(TimeUnit::Millisecond, _) => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampMillisecondType::add_year_months)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::DayTime) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampMillisecondType::add_day_time)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampMillisecondType::add_month_day_nano)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } + + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampNanosecondType::add_year_months)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::DayTime) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampNanosecondType::add_day_time)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampNanosecondType::add_month_day_nano)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } _ => { downcast_primitive_array!( (left, right) => { @@ -971,6 +1070,104 @@ pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampSecondType::subtract_year_months)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::DayTime) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampSecondType::subtract_day_time)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + 
DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampSecondType::subtract_month_day_nano)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } + + DataType::Timestamp(TimeUnit::Microsecond, _) => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampMicrosecondType::subtract_year_months)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::DayTime) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampMicrosecondType::subtract_day_time)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampMicrosecondType::subtract_month_day_nano)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampMillisecondType::subtract_year_months)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::DayTime) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampMillisecondType::subtract_day_time)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampMillisecondType::subtract_month_day_nano)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } + + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Interval(IntervalUnit::YearMonth) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampNanosecondType::subtract_year_months)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::DayTime) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampNanosecondType::subtract_day_time)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let r = right.as_primitive::(); + let res = math_checked_op(l, r, TimestampNanosecondType::subtract_month_day_nano)?; + Ok(Arc::new(res.with_timezone_opt(l.timezone()))) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } _ => { downcast_primitive_array!( (left, right) => { @@ -1758,6 +1955,7 @@ mod tests { use arrow_array::builder::{ BooleanBufferBuilder, BufferBuilder, PrimitiveDictionaryBuilder, }; + use 
arrow_array::temporal_conversions::SECONDS_IN_DAY; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_buffer::i256; use arrow_data::ArrayDataBuilder; @@ -3665,4 +3863,540 @@ mod tests { "1234567890.0000000000000000000000000000" ); } + + #[test] + fn test_timestamp_second_add_interval() { + // timestamp second + interval year month + let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalYearMonthArray::from(vec![ + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + ]); + + let result = add_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampSecondArray::from(vec![ + 1 + SECONDS_IN_DAY * (365 + 31 + 28), + 2 + SECONDS_IN_DAY * (365 + 31 + 28), + 3 + SECONDS_IN_DAY * (365 + 31 + 28), + 4 + SECONDS_IN_DAY * (365 + 31 + 28), + 5 + SECONDS_IN_DAY * (365 + 31 + 28), + ]); + assert_eq!(result, &expected); + + // timestamp second + interval day time + let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalDayTimeArray::from(vec![ + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + ]); + let result = add_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampSecondArray::from(vec![ + 1 + SECONDS_IN_DAY, + 2 + SECONDS_IN_DAY, + 3 + SECONDS_IN_DAY, + 4 + SECONDS_IN_DAY, + 5 + SECONDS_IN_DAY, + ]); + assert_eq!(&expected, result); + + // timestamp second + interval month day nanosecond + let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalMonthDayNanoArray::from(vec![ + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + ]); + let result = add_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampSecondArray::from(vec![ + 1 + SECONDS_IN_DAY, + 2 + SECONDS_IN_DAY, + 3 + SECONDS_IN_DAY, + 4 + SECONDS_IN_DAY, + 5 + SECONDS_IN_DAY, + ]); + assert_eq!(&expected, result); + } + + #[test] + fn test_timestamp_second_subtract_interval() { + // timestamp second + interval year month + let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalYearMonthArray::from(vec![ + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + ]); + + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampSecondArray::from(vec![ + 1 - SECONDS_IN_DAY * (31 + 30 + 365), + 2 - SECONDS_IN_DAY * (31 + 30 + 365), + 3 - SECONDS_IN_DAY * (31 + 30 + 365), + 4 - SECONDS_IN_DAY * (31 + 30 + 365), + 5 - SECONDS_IN_DAY * (31 + 30 + 365), + ]); + assert_eq!(&expected, result); + + // timestamp second + interval day time + let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalDayTimeArray::from(vec![ + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), 
+ Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + ]); + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampSecondArray::from(vec![ + 1 - SECONDS_IN_DAY, + 2 - SECONDS_IN_DAY, + 3 - SECONDS_IN_DAY, + 4 - SECONDS_IN_DAY, + 5 - SECONDS_IN_DAY, + ]); + assert_eq!(&expected, result); + + // timestamp second + interval month day nanosecond + let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalMonthDayNanoArray::from(vec![ + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + ]); + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampSecondArray::from(vec![ + 1 - SECONDS_IN_DAY, + 2 - SECONDS_IN_DAY, + 3 - SECONDS_IN_DAY, + 4 - SECONDS_IN_DAY, + 5 - SECONDS_IN_DAY, + ]); + assert_eq!(&expected, result); + } + + #[test] + fn test_timestamp_millisecond_add_interval() { + // timestamp millisecond + interval year month + let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalYearMonthArray::from(vec![ + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + ]); + + let result = add_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampMillisecondArray::from(vec![ + 1 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000, + 2 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000, + 3 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000, + 4 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000, + 5 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000, + ]); + assert_eq!(result, &expected); + + // timestamp millisecond + interval day time + let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalDayTimeArray::from(vec![ + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + ]); + let result = add_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampMillisecondArray::from(vec![ + 1 + SECONDS_IN_DAY * 1_000, + 2 + SECONDS_IN_DAY * 1_000, + 3 + SECONDS_IN_DAY * 1_000, + 4 + SECONDS_IN_DAY * 1_000, + 5 + SECONDS_IN_DAY * 1_000, + ]); + assert_eq!(&expected, result); + + // timestamp millisecond + interval month day nanosecond + let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalMonthDayNanoArray::from(vec![ + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + ]); + let result = add_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampMillisecondArray::from(vec![ + 1 + SECONDS_IN_DAY * 1_000, + 2 + SECONDS_IN_DAY * 1_000, + 3 + SECONDS_IN_DAY * 1_000, + 4 + SECONDS_IN_DAY * 1_000, + 5 + SECONDS_IN_DAY * 1_000, + ]); + assert_eq!(&expected, 
result); + } + + #[test] + fn test_timestamp_millisecond_subtract_interval() { + // timestamp millisecond + interval year month + let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalYearMonthArray::from(vec![ + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + ]); + + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampMillisecondArray::from(vec![ + 1 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000, + 2 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000, + 3 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000, + 4 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000, + 5 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000, + ]); + assert_eq!(&expected, result); + + // timestamp millisecond + interval day time + let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalDayTimeArray::from(vec![ + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + ]); + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampMillisecondArray::from(vec![ + 1 - SECONDS_IN_DAY * 1_000, + 2 - SECONDS_IN_DAY * 1_000, + 3 - SECONDS_IN_DAY * 1_000, + 4 - SECONDS_IN_DAY * 1_000, + 5 - SECONDS_IN_DAY * 1_000, + ]); + assert_eq!(&expected, result); + + // timestamp millisecond + interval month day nanosecond + let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalMonthDayNanoArray::from(vec![ + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + ]); + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampMillisecondArray::from(vec![ + 1 - SECONDS_IN_DAY * 1_000, + 2 - SECONDS_IN_DAY * 1_000, + 3 - SECONDS_IN_DAY * 1_000, + 4 - SECONDS_IN_DAY * 1_000, + 5 - SECONDS_IN_DAY * 1_000, + ]); + assert_eq!(&expected, result); + } + + #[test] + fn test_timestamp_microsecond_add_interval() { + // timestamp microsecond + interval year month + let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalYearMonthArray::from(vec![ + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + ]); + + let result = add_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampMicrosecondArray::from(vec![ + 1 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000, + 2 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000, + 3 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000, + 4 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000, + 5 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000, + ]); + assert_eq!(result, &expected); + + // timestamp microsecond + interval day time + let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalDayTimeArray::from(vec![ + Some(IntervalDayTimeType::make_value(1, 0)), + 
Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + ]); + let result = add_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampMicrosecondArray::from(vec![ + 1 + SECONDS_IN_DAY * 1_000_000, + 2 + SECONDS_IN_DAY * 1_000_000, + 3 + SECONDS_IN_DAY * 1_000_000, + 4 + SECONDS_IN_DAY * 1_000_000, + 5 + SECONDS_IN_DAY * 1_000_000, + ]); + assert_eq!(&expected, result); + + // timestamp microsecond + interval month day nanosecond + let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalMonthDayNanoArray::from(vec![ + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + ]); + let result = add_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampMicrosecondArray::from(vec![ + 1 + SECONDS_IN_DAY * 1_000_000, + 2 + SECONDS_IN_DAY * 1_000_000, + 3 + SECONDS_IN_DAY * 1_000_000, + 4 + SECONDS_IN_DAY * 1_000_000, + 5 + SECONDS_IN_DAY * 1_000_000, + ]); + assert_eq!(&expected, result); + } + + #[test] + fn test_timestamp_microsecond_subtract_interval() { + // timestamp microsecond + interval year month + let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalYearMonthArray::from(vec![ + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + ]); + + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampMicrosecondArray::from(vec![ + 1 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000, + 2 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000, + 3 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000, + 4 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000, + 5 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000, + ]); + assert_eq!(&expected, result); + + // timestamp microsecond + interval day time + let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalDayTimeArray::from(vec![ + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + ]); + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampMicrosecondArray::from(vec![ + 1 - SECONDS_IN_DAY * 1_000_000, + 2 - SECONDS_IN_DAY * 1_000_000, + 3 - SECONDS_IN_DAY * 1_000_000, + 4 - SECONDS_IN_DAY * 1_000_000, + 5 - SECONDS_IN_DAY * 1_000_000, + ]); + assert_eq!(&expected, result); + + // timestamp microsecond + interval month day nanosecond + let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalMonthDayNanoArray::from(vec![ + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + ]); + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + 
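+        // A hedged note on the arithmetic checked below: the IntervalMonthDayNano values
+        // above encode exactly one day (months = 0, days = 1, nanos = 0), so subtracting
+        // them is expected to move each microsecond timestamp down by
+        // SECONDS_IN_DAY * 1_000_000, which is what the `expected` array asserts.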
+ let expected = TimestampMicrosecondArray::from(vec![ + 1 - SECONDS_IN_DAY * 1_000_000, + 2 - SECONDS_IN_DAY * 1_000_000, + 3 - SECONDS_IN_DAY * 1_000_000, + 4 - SECONDS_IN_DAY * 1_000_000, + 5 - SECONDS_IN_DAY * 1_000_000, + ]); + assert_eq!(&expected, result); + } + + #[test] + fn test_timestamp_nanosecond_add_interval() { + // timestamp nanosecond + interval year month + let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalYearMonthArray::from(vec![ + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + ]); + + let result = add_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampNanosecondArray::from(vec![ + 1 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000_000, + 2 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000_000, + 3 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000_000, + 4 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000_000, + 5 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000_000, + ]); + assert_eq!(result, &expected); + + // timestamp nanosecond + interval day time + let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalDayTimeArray::from(vec![ + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + ]); + let result = add_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampNanosecondArray::from(vec![ + 1 + SECONDS_IN_DAY * 1_000_000_000, + 2 + SECONDS_IN_DAY * 1_000_000_000, + 3 + SECONDS_IN_DAY * 1_000_000_000, + 4 + SECONDS_IN_DAY * 1_000_000_000, + 5 + SECONDS_IN_DAY * 1_000_000_000, + ]); + assert_eq!(&expected, result); + + // timestamp nanosecond + interval month day nanosecond + let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalMonthDayNanoArray::from(vec![ + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + ]); + let result = add_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampNanosecondArray::from(vec![ + 1 + SECONDS_IN_DAY * 1_000_000_000, + 2 + SECONDS_IN_DAY * 1_000_000_000, + 3 + SECONDS_IN_DAY * 1_000_000_000, + 4 + SECONDS_IN_DAY * 1_000_000_000, + 5 + SECONDS_IN_DAY * 1_000_000_000, + ]); + assert_eq!(&expected, result); + } + + #[test] + fn test_timestamp_nanosecond_subtract_interval() { + // timestamp nanosecond + interval year month + let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalYearMonthArray::from(vec![ + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + Some(IntervalYearMonthType::make_value(1, 2)), + ]); + + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampNanosecondArray::from(vec![ + 1 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000_000, + 2 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000_000, + 3 - SECONDS_IN_DAY * (31 + 30 + 365) * 
1_000_000_000, + 4 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000_000, + 5 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000_000, + ]); + assert_eq!(&expected, result); + + // timestamp nanosecond + interval day time + let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalDayTimeArray::from(vec![ + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + Some(IntervalDayTimeType::make_value(1, 0)), + ]); + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampNanosecondArray::from(vec![ + 1 - SECONDS_IN_DAY * 1_000_000_000, + 2 - SECONDS_IN_DAY * 1_000_000_000, + 3 - SECONDS_IN_DAY * 1_000_000_000, + 4 - SECONDS_IN_DAY * 1_000_000_000, + 5 - SECONDS_IN_DAY * 1_000_000_000, + ]); + assert_eq!(&expected, result); + + // timestamp nanosecond + interval month day nanosecond + let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); + let b = IntervalMonthDayNanoArray::from(vec![ + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), + ]); + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + + let expected = TimestampNanosecondArray::from(vec![ + 1 - SECONDS_IN_DAY * 1_000_000_000, + 2 - SECONDS_IN_DAY * 1_000_000_000, + 3 - SECONDS_IN_DAY * 1_000_000_000, + 4 - SECONDS_IN_DAY * 1_000_000_000, + 5 - SECONDS_IN_DAY * 1_000_000_000, + ]); + assert_eq!(&expected, result); + } } diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index e2d7a2492227..cec78db9fc70 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -350,6 +350,650 @@ impl ArrowTimestampType for TimestampNanosecondType { } } +impl TimestampSecondType { + /// Adds the given IntervalYearMonthType to an arrow TimestampSecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn add_year_months( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> { + let prior = NaiveDateTime::from_timestamp_opt(timestamp, 0).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + + let months = IntervalYearMonthType::to_months(delta); + let posterior = shift_months(prior, months); + TimestampSecondType::make_value(posterior) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Adds the given IntervalDayTimeType to an arrow TimestampSecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn add_day_time( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> { + let (days, ms) = IntervalDayTimeType::to_parts(delta); + let res = NaiveDateTime::from_timestamp_opt(timestamp, 0).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::days(days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::milliseconds(ms as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of 
range".to_string()) + })?; + TimestampSecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Adds the given IntervalMonthDayNanoType to an arrow TimestampSecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn add_month_day_nano( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); + let res = NaiveDateTime::from_timestamp_opt(timestamp, 0).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = shift_months(res, months); + let res = res + .checked_add_signed(Duration::days(days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::nanoseconds(nanos)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampSecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Subtracts the given IntervalYearMonthType to an arrow TimestampSecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn subtract_year_months( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> { + let prior = NaiveDateTime::from_timestamp_opt(timestamp, 0).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let months = IntervalYearMonthType::to_months(-delta); + let posterior = shift_months(prior, months); + TimestampSecondType::make_value(posterior) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Subtracts the given IntervalDayTimeType to an arrow TimestampSecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn subtract_day_time( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> { + let (days, ms) = IntervalDayTimeType::to_parts(-delta); + let res = NaiveDateTime::from_timestamp_opt(timestamp, 0).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::days(days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::microseconds(ms as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampSecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Subtracts the given IntervalMonthDayNanoType to an arrow TimestampSecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn subtract_month_day_nano( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); + let res = NaiveDateTime::from_timestamp_opt(timestamp, 0).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = shift_months(res, -months); + let res = res + .checked_add_signed(Duration::days(-days as i64)) + .ok_or_else(|| { + 
ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::nanoseconds(-nanos)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampSecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } +} + +impl TimestampMicrosecondType { + /// Adds the given IntervalYearMonthType to an arrow TimestampMicrosecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn add_year_months( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> + { + let prior = NaiveDateTime::from_timestamp_micros(timestamp).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let months = IntervalYearMonthType::to_months(delta); + let posterior = shift_months(prior, months); + TimestampMicrosecondType::make_value(posterior) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Adds the given IntervalDayTimeType to an arrow TimestampMicrosecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn add_day_time( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> + { + let (days, ms) = IntervalDayTimeType::to_parts(delta); + let res = NaiveDateTime::from_timestamp_micros(timestamp).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::days(days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::milliseconds(ms as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampMicrosecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Adds the given IntervalMonthDayNanoType to an arrow TimestampMicrosecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn add_month_day_nano( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> + { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); + let res = NaiveDateTime::from_timestamp_micros(timestamp).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = shift_months(res, months); + let res = res + .checked_add_signed(Duration::days(days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::nanoseconds(nanos)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampMicrosecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Subtracts the given IntervalYearMonthType to an arrow TimestampMicrosecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn subtract_year_months( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> + { + let prior = NaiveDateTime::from_timestamp_micros(timestamp).ok_or_else(|| { + 
ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let months = IntervalYearMonthType::to_months(-delta); + let posterior = shift_months(prior, months); + TimestampMicrosecondType::make_value(posterior) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Subtracts the given IntervalDayTimeType to an arrow TimestampMicrosecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn subtract_day_time( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> + { + let (days, ms) = IntervalDayTimeType::to_parts(-delta); + let res = NaiveDateTime::from_timestamp_micros(timestamp).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::days(days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::milliseconds(ms as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampMicrosecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Subtracts the given IntervalMonthDayNanoType to an arrow TimestampMicrosecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn subtract_month_day_nano( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> + { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); + let res = NaiveDateTime::from_timestamp_micros(timestamp).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = shift_months(res, -months); + let res = res + .checked_add_signed(Duration::days(-days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::nanoseconds(-nanos)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampMicrosecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } +} + +impl TimestampMillisecondType { + /// Adds the given IntervalYearMonthType to an arrow TimestampMillisecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn add_year_months( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> + { + let prior = NaiveDateTime::from_timestamp_millis(timestamp).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let months = IntervalYearMonthType::to_months(delta); + let posterior = shift_months(prior, months); + TimestampMillisecondType::make_value(posterior) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Adds the given IntervalDayTimeType to an arrow TimestampMillisecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn add_day_time( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> + { + let (days, ms) = IntervalDayTimeType::to_parts(delta); + let res = NaiveDateTime::from_timestamp_millis(timestamp).ok_or_else(|| { + 
ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::days(days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::milliseconds(ms as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampMillisecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Adds the given IntervalMonthDayNanoType to an arrow TimestampMillisecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn add_month_day_nano( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> + { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); + let res = NaiveDateTime::from_timestamp_millis(timestamp).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = shift_months(res, months); + let res = res + .checked_add_signed(Duration::days(days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::nanoseconds(nanos)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampMillisecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Subtracts the given IntervalYearMonthType to an arrow TimestampMillisecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn subtract_year_months( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> + { + let prior = NaiveDateTime::from_timestamp_millis(timestamp).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let months = IntervalYearMonthType::to_months(-delta); + let posterior = shift_months(prior, months); + TimestampMillisecondType::make_value(posterior) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Subtracts the given IntervalDayTimeType to an arrow TimestampMillisecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn subtract_day_time( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> + { + let (days, ms) = IntervalDayTimeType::to_parts(-delta); + let res = NaiveDateTime::from_timestamp_millis(timestamp).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::days(days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::milliseconds(ms as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampMillisecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Subtracts the given IntervalMonthDayNanoType to an arrow TimestampMillisecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn subtract_month_day_nano( + timestamp: ::Native, + delta: 
::Native, + ) -> Result<::Native, ArrowError> + { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); + let res = NaiveDateTime::from_timestamp_millis(timestamp).ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = shift_months(res, -months); + let res = res + .checked_add_signed(Duration::days(-days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::nanoseconds(-nanos)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampMillisecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } +} + +impl TimestampNanosecondType { + /// Adds the given IntervalYearMonthType to an arrow TimestampNanosecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn add_year_months( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> { + let seconds = timestamp / 1_000_000_000; + let nanos = timestamp % 1_000_000_000; + let prior = NaiveDateTime::from_timestamp_opt(seconds, nanos as u32).ok_or_else( + || ArrowError::ComputeError("Timestamp out of range".to_string()), + )?; + let months = IntervalYearMonthType::to_months(delta); + let posterior = shift_months(prior, months); + TimestampNanosecondType::make_value(posterior) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Adds the given IntervalDayTimeType to an arrow TimestampNanosecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn add_day_time( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> { + let (days, ms) = IntervalDayTimeType::to_parts(delta); + let seconds = timestamp / 1_000_000_000; + let nanos = timestamp % 1_000_000_000; + let res = NaiveDateTime::from_timestamp_opt(seconds, nanos as u32).ok_or_else( + || ArrowError::ComputeError("Timestamp out of range".to_string()), + )?; + let res = res + .checked_add_signed(Duration::days(days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::milliseconds(ms as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampNanosecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Adds the given IntervalMonthDayNanoType to an arrow TimestampNanosecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn add_month_day_nano( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> { + let seconds = timestamp / 1_000_000_000; + let nanos = timestamp % 1_000_000_000; + let res = NaiveDateTime::from_timestamp_opt(seconds, nanos as u32).ok_or_else( + || ArrowError::ComputeError("Timestamp out of range".to_string()), + )?; + + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); + let res = shift_months(res, months); + let res = res + .checked_add_signed(Duration::days(days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + 
.checked_add_signed(Duration::nanoseconds(nanos)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampNanosecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Subtracs the given IntervalYearMonthType to an arrow TimestampNanosecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn subtract_year_months( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> { + let seconds = timestamp / 1_000_000_000; + let nanos = timestamp % 1_000_000_000; + let prior = NaiveDateTime::from_timestamp_opt(seconds, nanos as u32).ok_or_else( + || ArrowError::ComputeError("Timestamp out of range".to_string()), + )?; + let months = IntervalYearMonthType::to_months(-delta); + let posterior = shift_months(prior, months); + TimestampNanosecondType::make_value(posterior) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Subtracs the given IntervalDayTimeType to an arrow TimestampNanosecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn subtract_day_time( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> { + let seconds = timestamp / 1_000_000_000; + let nanos = timestamp % 1_000_000_000; + let res = NaiveDateTime::from_timestamp_opt(seconds, nanos as u32).ok_or_else( + || ArrowError::ComputeError("Timestamp out of range".to_string()), + )?; + + let (days, ms) = IntervalDayTimeType::to_parts(-delta); + let res = res + .checked_add_signed(Duration::days(days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::milliseconds(ms as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampNanosecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } + + /// Subtracs the given IntervalMonthDayNanoType to an arrow TimestampNanosecondType + /// + /// # Arguments + /// + /// * `timestamp` - The date on which to perform the operation + /// * `delta` - The interval to add + pub fn subtract_month_day_nano( + timestamp: ::Native, + delta: ::Native, + ) -> Result<::Native, ArrowError> { + let seconds = timestamp / 1_000_000_000; + let nanos = timestamp % 1_000_000_000; + let res = NaiveDateTime::from_timestamp_opt(seconds, nanos as u32).ok_or_else( + || ArrowError::ComputeError("Timestamp out of range".to_string()), + )?; + + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); + let res = shift_months(res, -months); + let res = res + .checked_add_signed(Duration::days(-days as i64)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + let res = res + .checked_add_signed(Duration::nanoseconds(-nanos)) + .ok_or_else(|| { + ArrowError::ComputeError("Timestamp out of range".to_string()) + })?; + TimestampNanosecondType::make_value(res) + .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + } +} + impl IntervalYearMonthType { /// Creates a IntervalYearMonthType::Native /// From 682231c7ff9ae79efa42db88fecf75052f064248 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Sun, 16 Apr 2023 13:15:21 +0200 Subject: [PATCH 0829/1411] feat: cast from/to interval 
and duration (#4020) * feat: cast from/to interval and duration * update can_cast_type * refactor the function * skip the computation when scale is 1 * speed up process * update tests * handle nanosecond overflow * update tests for safe cast_option when overflowing * update tests to check null when overflowing --- arrow-cast/src/cast.rs | 405 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 404 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 2c1dae5187fa..bc37174b94f2 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -303,6 +303,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { IntervalUnit::MonthDayNano => false, } } + (Duration(_), Interval(IntervalUnit::MonthDayNano)) => true, + (Interval(IntervalUnit::MonthDayNano), Duration(_)) => true, (_, _) => false, } } @@ -458,6 +460,122 @@ where } } +/// Cast the array from interval to duration +fn cast_interval_to_duration>( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + let array = array + .as_any() + .downcast_ref::() + .ok_or_else(|| { + ArrowError::ComputeError( + "Internal Error: Cannot cast interval to IntervalArray of expected type" + .to_string(), + ) + })?; + + let scale = match D::DATA_TYPE { + DataType::Duration(TimeUnit::Second) => 1_000_000_000, + DataType::Duration(TimeUnit::Millisecond) => 1_000_000, + DataType::Duration(TimeUnit::Microsecond) => 1_000, + DataType::Duration(TimeUnit::Nanosecond) => 1, + _ => unreachable!(), + }; + + if cast_options.safe { + let iter = array.iter().map(|v| { + v.and_then(|v| { + let v = v / scale; + if v > i64::MAX as i128 { + None + } else { + Some(v as i64) + } + }) + }); + Ok(Arc::new(unsafe { + PrimitiveArray::::from_trusted_len_iter(iter) + })) + } else { + let vec = array + .iter() + .map(|v| { + v.map(|v| { + let v = v / scale; + if v > i64::MAX as i128 { + Err(ArrowError::ComputeError(format!( + "Cannot cast to {:?}. Overflowing on {:?}", + D::DATA_TYPE, + v + ))) + } else { + Ok(v as i64) + } + }) + .transpose() + }) + .collect::, _>>()?; + Ok(Arc::new(unsafe { + PrimitiveArray::::from_trusted_len_iter(vec.iter()) + })) + } +} + +/// Cast the array from duration and interval +fn cast_duration_to_interval>( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + let array = array + .as_any() + .downcast_ref::>() + .ok_or_else(|| { + ArrowError::ComputeError( + "Internal Error: Cannot cast duration to DurationArray of expected type" + .to_string(), + ) + })?; + + let scale = match array.data_type() { + DataType::Duration(TimeUnit::Second) => 1_000_000_000, + DataType::Duration(TimeUnit::Millisecond) => 1_000_000, + DataType::Duration(TimeUnit::Microsecond) => 1_000, + DataType::Duration(TimeUnit::Nanosecond) => 1, + _ => unreachable!(), + }; + + if cast_options.safe { + let iter = array + .iter() + .map(|v| v.and_then(|v| v.checked_mul(scale).map(|v| v as i128))); + Ok(Arc::new(unsafe { + PrimitiveArray::::from_trusted_len_iter(iter) + })) + } else { + let vec = array + .iter() + .map(|v| { + v.map(|v| { + if let Ok(v) = v.mul_checked(scale) { + Ok(v as i128) + } else { + Err(ArrowError::ComputeError(format!( + "Cannot cast to {:?}. 
Overflowing on {:?}", + IntervalMonthDayNanoType::DATA_TYPE, + v + ))) + } + }) + .transpose() + }) + .collect::, _>>()?; + Ok(Arc::new(unsafe { + PrimitiveArray::::from_trusted_len_iter(vec.iter()) + })) + } +} + /// Cast the primitive array using [`PrimitiveArray::reinterpret_cast`] fn cast_reinterpret_arrays< I: ArrowPrimitiveType, @@ -2014,7 +2132,30 @@ pub fn cast_with_options( (Duration(TimeUnit::Nanosecond), Int64) => { cast_reinterpret_arrays::(array) } - + (Duration(TimeUnit::Second), Interval(IntervalUnit::MonthDayNano)) => { + cast_duration_to_interval::(array, cast_options) + } + (Duration(TimeUnit::Millisecond), Interval(IntervalUnit::MonthDayNano)) => { + cast_duration_to_interval::(array, cast_options) + } + (Duration(TimeUnit::Microsecond), Interval(IntervalUnit::MonthDayNano)) => { + cast_duration_to_interval::(array, cast_options) + } + (Duration(TimeUnit::Nanosecond), Interval(IntervalUnit::MonthDayNano)) => { + cast_duration_to_interval::(array, cast_options) + } + (DataType::Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Second)) => { + cast_interval_to_duration::(array, cast_options) + } + (DataType::Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Millisecond)) => { + cast_interval_to_duration::(array, cast_options) + } + (DataType::Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Microsecond)) => { + cast_interval_to_duration::(array, cast_options) + } + (DataType::Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Nanosecond)) => { + cast_interval_to_duration::(array, cast_options) + } (Interval(IntervalUnit::YearMonth), Int64) => { cast_numeric_arrays::(array, cast_options) } @@ -8269,4 +8410,266 @@ mod tests { ); assert_eq!("Invalid argument error: 1234567000 is too large to store in a Decimal256 of precision 7. 
Max is 9999999", err.unwrap_err().to_string()); } + + /// helper function to test casting from duration to interval + fn cast_from_duration_to_interval( + array: Vec, + cast_options: &CastOptions, + ) -> Result, ArrowError> + where + arrow_array::PrimitiveArray: From>, + { + let array = PrimitiveArray::::from(array); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Interval(IntervalUnit::MonthDayNano), + cast_options, + )?; + casted_array + .as_any() + .downcast_ref::() + .ok_or_else(|| { + ArrowError::ComputeError( + "Failed to downcast to IntervalMonthDayNanoArray".to_string(), + ) + }) + .cloned() + } + + #[test] + fn test_cast_from_duration_to_interval() { + // from duration second to interval month day nano + let array = vec![1234567]; + let casted_array = cast_from_duration_to_interval::( + array, + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert_eq!( + casted_array.data_type(), + &DataType::Interval(IntervalUnit::MonthDayNano) + ); + assert_eq!(casted_array.value(0), 1234567000000000); + + let array = vec![i64::MAX]; + let casted_array = cast_from_duration_to_interval::( + array.clone(), + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert!(!casted_array.is_valid(0)); + + let casted_array = cast_from_duration_to_interval::( + array, + &CastOptions { safe: false }, + ); + assert!(casted_array.is_err()); + + // from duration millisecond to interval month day nano + let array = vec![1234567]; + let casted_array = cast_from_duration_to_interval::( + array, + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert_eq!( + casted_array.data_type(), + &DataType::Interval(IntervalUnit::MonthDayNano) + ); + assert_eq!(casted_array.value(0), 1234567000000); + + let array = vec![i64::MAX]; + let casted_array = cast_from_duration_to_interval::( + array.clone(), + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert!(!casted_array.is_valid(0)); + + let casted_array = cast_from_duration_to_interval::( + array, + &CastOptions { safe: false }, + ); + assert!(casted_array.is_err()); + + // from duration microsecond to interval month day nano + let array = vec![1234567]; + let casted_array = cast_from_duration_to_interval::( + array, + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert_eq!( + casted_array.data_type(), + &DataType::Interval(IntervalUnit::MonthDayNano) + ); + assert_eq!(casted_array.value(0), 1234567000); + + let array = vec![i64::MAX]; + let casted_array = cast_from_duration_to_interval::( + array.clone(), + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert!(!casted_array.is_valid(0)); + + let casted_array = cast_from_duration_to_interval::( + array, + &CastOptions { safe: false }, + ); + assert!(casted_array.is_err()); + + // from duration nanosecond to interval month day nano + let array = vec![1234567]; + let casted_array = cast_from_duration_to_interval::( + array, + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert_eq!( + casted_array.data_type(), + &DataType::Interval(IntervalUnit::MonthDayNano) + ); + assert_eq!(casted_array.value(0), 1234567); + + let array = vec![i64::MAX]; + let casted_array = cast_from_duration_to_interval::( + array, + &CastOptions { safe: false }, + ) + .unwrap(); + assert_eq!(casted_array.value(0), 9223372036854775807); + } + + // helper function to test casting from interval to duration + fn cast_from_interval_to_duration( + array: Vec, + cast_options: &CastOptions, + ) -> Result, ArrowError> { + let array = IntervalMonthDayNanoArray::from(array); + let array = Arc::new(array) as ArrayRef; + let casted_array = 
cast_with_options(&array, &T::DATA_TYPE, cast_options)?; + casted_array + .as_any() + .downcast_ref::>() + .ok_or_else(|| { + ArrowError::ComputeError(format!( + "Failed to downcast to {}", + T::DATA_TYPE + )) + }) + .cloned() + } + + #[test] + fn test_cast_from_interval_to_duration() { + // from interval month day nano to duration second + let array = vec![1234567]; + let casted_array = cast_from_interval_to_duration::( + array, + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert_eq!( + casted_array.data_type(), + &DataType::Duration(TimeUnit::Second) + ); + assert_eq!(casted_array.value(0), 0); + + let array = vec![i128::MAX]; + let casted_array = cast_from_interval_to_duration::( + array.clone(), + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert!(!casted_array.is_valid(0)); + + let casted_array = cast_from_interval_to_duration::( + array, + &CastOptions { safe: false }, + ); + assert!(casted_array.is_err()); + + // from interval month day nano to duration millisecond + let array = vec![1234567]; + let casted_array = cast_from_interval_to_duration::( + array, + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert_eq!(casted_array.value(0), 1); + + let array = vec![i128::MAX]; + let casted_array = cast_from_interval_to_duration::( + array.clone(), + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert!(!casted_array.is_valid(0)); + + let casted_array = cast_from_interval_to_duration::( + array, + &CastOptions { safe: false }, + ); + assert!(casted_array.is_err()); + + // from interval month day nano to duration microsecond + let array = vec![1234567]; + let casted_array = cast_from_interval_to_duration::( + array, + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert_eq!( + casted_array.data_type(), + &DataType::Duration(TimeUnit::Microsecond) + ); + assert_eq!(casted_array.value(0), 1234); + + let array = vec![i128::MAX]; + let casted_array = cast_from_interval_to_duration::( + array.clone(), + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert!(!casted_array.is_valid(0)); + + let casted_array = cast_from_interval_to_duration::( + array, + &CastOptions { safe: false }, + ); + assert!(casted_array.is_err()); + + // from interval month day nano to duration nanosecond + let array = vec![1234567]; + let casted_array = cast_from_interval_to_duration::( + array, + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert_eq!( + casted_array.data_type(), + &DataType::Duration(TimeUnit::Nanosecond) + ); + assert_eq!(casted_array.value(0), 1234567); + + let array = vec![i128::MAX]; + let casted_array = cast_from_interval_to_duration::( + array.clone(), + &DEFAULT_CAST_OPTIONS, + ) + .unwrap(); + assert_eq!( + casted_array.data_type(), + &DataType::Duration(TimeUnit::Nanosecond) + ); + assert!(!casted_array.is_valid(0)); + + let casted_array = cast_from_interval_to_duration::( + array, + &CastOptions { safe: false }, + ); + assert!(casted_array.is_err()); + } } From 472c97731603b5156db11c25d8b9d8e8f25856d7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 16 Apr 2023 10:19:41 -0400 Subject: [PATCH 0830/1411] Add ListArray Constructors (#3879) (#4065) * Add ListArray constructors (#3879) * More cleanup * Checked arithmetic * Add try_new * Add tests * Clippy * Update arrow-array/src/array/list_array.rs Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- arrow-array/src/array/list_array.rs | 197 ++++++++++++++++++++++++---- arrow-buffer/src/buffer/offset.rs | 36 ++++- 2 files changed, 204 insertions(+), 29 deletions(-) diff --git 
a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 8e6f84743f2a..d5e0c365b8e6 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -18,11 +18,12 @@ use crate::array::{get_offsets, make_array, print_long_array}; use crate::builder::{GenericListBuilder, PrimitiveBuilder}; use crate::{ - iterator::GenericListArrayIter, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, + iterator::GenericListArrayIter, new_empty_array, Array, ArrayAccessor, ArrayRef, + ArrowPrimitiveType, }; use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{ArrowError, DataType, Field}; +use arrow_schema::{ArrowError, DataType, FieldRef}; use num::Integer; use std::any::Any; use std::sync::Arc; @@ -73,13 +74,114 @@ impl GenericListArray { /// The data type constructor of list array. /// The input is the schema of the child array and /// the output is the [`DataType`], List or LargeList. - pub const DATA_TYPE_CONSTRUCTOR: fn(Arc) -> DataType = if OffsetSize::IS_LARGE - { + pub const DATA_TYPE_CONSTRUCTOR: fn(FieldRef) -> DataType = if OffsetSize::IS_LARGE { DataType::LargeList } else { DataType::List }; + /// Create a new [`GenericListArray`] from the provided parts + /// + /// # Errors + /// + /// Errors if + /// + /// * `offsets.len() - 1 != nulls.len()` + /// * `offsets.last() > values.len()` + /// * `!field.is_nullable() && values.null_count() != 0` + pub fn try_new( + field: FieldRef, + offsets: OffsetBuffer, + values: ArrayRef, + nulls: Option, + ) -> Result { + let len = offsets.len() - 1; // Offsets guaranteed to not be empty + let end_offset = offsets.last().unwrap().as_usize(); + // don't need to check other values of `offsets` because they are checked + // during construction of `OffsetsbBuffer` + if end_offset > values.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Max offset of {end_offset} exceeds length of values {}", + values.len() + ))); + } + + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect number of nulls for {}ListArray, expected {len} got {}", + OffsetSize::PREFIX, + n.len(), + ))); + } + } + if !field.is_nullable() && values.null_count() != 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "Non-nullable field of {}ListArray {:?} cannot contain nulls", + OffsetSize::PREFIX, + field.name() + ))); + } + + if field.data_type() != values.data_type() { + return Err(ArrowError::InvalidArgumentError(format!( + "{}ListArray expected data type {} got {} for {:?}", + OffsetSize::PREFIX, + field.data_type(), + values.data_type(), + field.name() + ))); + } + + Ok(Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field), + nulls, + values, + value_offsets: offsets, + }) + } + + /// Create a new [`GenericListArray`] from the provided parts + /// + /// # Panics + /// + /// Panics if [`Self::try_new`] returns an error + pub fn new( + field: FieldRef, + offsets: OffsetBuffer, + values: ArrayRef, + nulls: Option, + ) -> Self { + Self::try_new(field, offsets, values, nulls).unwrap() + } + + /// Create a new [`GenericListArray`] of length `len` where all values are null + pub fn new_null(field: FieldRef, len: usize) -> Self { + let values = new_empty_array(field.data_type()); + Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field), + nulls: Some(NullBuffer::new_null(len)), + value_offsets: OffsetBuffer::new_zeroed(len), + values, + } + } + + /// Deconstruct this array 
into its constituent parts + pub fn into_parts( + self, + ) -> ( + FieldRef, + OffsetBuffer, + ArrayRef, + Option, + ) { + let f = match self.data_type { + DataType::List(f) | DataType::LargeList(f) => f, + _ => unreachable!(), + }; + (f, self.value_offsets, self.values, self.nulls) + } + /// Returns a reference to the offsets of this list /// /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`] @@ -405,31 +507,16 @@ mod tests { use super::*; use crate::builder::{Int32Builder, ListBuilder}; use crate::types::Int32Type; - use crate::Int32Array; - use arrow_buffer::{bit_util, Buffer, ToByteSlice}; + use crate::{Int32Array, Int64Array}; + use arrow_buffer::{bit_util, Buffer, ScalarBuffer}; + use arrow_schema::Field; fn create_from_buffers() -> ListArray { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice())) - .build() - .unwrap(); - - // Construct a buffer for value offsets, for the nested array: // [[0, 1, 2], [3, 4, 5], [6, 7]] - let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); - - // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build() - .unwrap(); - ListArray::from(list_data) + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); + let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 6, 8])); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + ListArray::new(field, offsets, Arc::new(values), None) } #[test] @@ -1029,4 +1116,62 @@ mod tests { assert_eq!(string.len(), 0); assert_eq!(string.value_offsets(), &[0]); } + + #[test] + fn test_try_new() { + let offsets = OffsetBuffer::new(vec![0, 1, 4, 5].into()); + let values = Int32Array::new(DataType::Int32, vec![1, 2, 3, 4, 5].into(), None); + let values = Arc::new(values) as ArrayRef; + + let field = Arc::new(Field::new("element", DataType::Int32, false)); + ListArray::new(field.clone(), offsets.clone(), values.clone(), None); + + let nulls = NullBuffer::new_null(3); + ListArray::new(field.clone(), offsets, values.clone(), Some(nulls)); + + let nulls = NullBuffer::new_null(3); + let offsets = OffsetBuffer::new(vec![0, 1, 2, 4, 5].into()); + let err = + LargeListArray::try_new(field, offsets.clone(), values.clone(), Some(nulls)) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Incorrect number of nulls for LargeListArray, expected 4 got 3" + ); + + let field = Arc::new(Field::new("element", DataType::Int64, false)); + let err = + LargeListArray::try_new(field.clone(), offsets.clone(), values.clone(), None) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: LargeListArray expected data type Int64 got Int32 for \"element\"" + ); + + let nulls = NullBuffer::new_null(7); + let values = Int64Array::new(DataType::Int64, vec![0; 7].into(), Some(nulls)); + let values = Arc::new(values); + + let err = LargeListArray::try_new(field, offsets.clone(), values.clone(), None) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Non-nullable field of LargeListArray \"element\" cannot contain nulls" + ); + + let field = Arc::new(Field::new("element", DataType::Int64, true)); + LargeListArray::new(field.clone(), offsets.clone(), values, None); + + let values = Int64Array::new(DataType::Int64, 
vec![0; 2].into(), None); + let err = + LargeListArray::try_new(field, offsets, Arc::new(values), None).unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Max offset of 5 exceeds length of values 2" + ); + } } diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index ada290f09286..bfafe3306aed 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -29,10 +29,13 @@ impl OffsetBuffer { /// # Panics /// /// Panics if `buffer` is not a non-empty buffer containing - /// monotonically increasing values greater than zero + /// monotonically increasing values greater than or equal to zero pub fn new(buffer: ScalarBuffer) -> Self { assert!(!buffer.is_empty(), "offsets cannot be empty"); - assert!(buffer[0] > O::usize_as(0), "offsets must be greater than 0"); + assert!( + buffer[0] >= O::usize_as(0), + "offsets must be greater than 0" + ); assert!( buffer.windows(2).all(|w| w[0] <= w[1]), "offsets must be monotonically increasing" @@ -45,7 +48,7 @@ impl OffsetBuffer { /// # Safety /// /// `buffer` must be a non-empty buffer containing monotonically increasing - /// values greater than zero + /// values greater than or equal to zero pub unsafe fn new_unchecked(buffer: ScalarBuffer) -> Self { Self(buffer) } @@ -56,6 +59,16 @@ impl OffsetBuffer { Self(buffer.into_buffer().into()) } + /// Create a new [`OffsetBuffer`] containing `len + 1` `0` values + pub fn new_zeroed(len: usize) -> Self { + let len_bytes = len + .checked_add(1) + .and_then(|o| o.checked_mul(std::mem::size_of::())) + .expect("overflow"); + let buffer = MutableBuffer::from_len_zeroed(len_bytes); + Self(buffer.into_buffer().into()) + } + /// Returns the inner [`ScalarBuffer`] pub fn inner(&self) -> &ScalarBuffer { &self.0 @@ -104,6 +117,23 @@ mod tests { OffsetBuffer::new(vec![-1, 0, 1].into()); } + #[test] + fn offsets() { + OffsetBuffer::new(vec![0, 1, 2, 3].into()); + + let offsets = OffsetBuffer::::new_zeroed(3); + assert_eq!(offsets.as_ref(), &[0; 4]); + + let offsets = OffsetBuffer::::new_zeroed(0); + assert_eq!(offsets.as_ref(), &[0; 1]); + } + + #[test] + #[should_panic(expected = "overflow")] + fn offsets_new_zeroed_overflow() { + OffsetBuffer::::new_zeroed(usize::MAX); + } + #[test] #[should_panic(expected = "offsets must be monotonically increasing")] fn non_monotonic_offsets() { From 295ca863412ae328ad24c88c38f7f9cfd1a5526f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 18 Apr 2023 07:49:44 -0400 Subject: [PATCH 0831/1411] Add DictionaryArray Constructors (#3879) (#4068) --- arrow-arith/src/aggregate.rs | 17 +-- arrow-arith/src/arithmetic.rs | 8 +- arrow-arith/src/arity.rs | 5 +- arrow-arith/src/temporal.rs | 108 ++++++++---------- arrow-array/src/array/dictionary_array.rs | 129 ++++++++++++++-------- arrow-flight/src/encode.rs | 2 +- arrow-ipc/src/reader.rs | 25 ++--- arrow-ipc/src/writer.rs | 2 +- arrow-ord/src/comparison.rs | 94 ++++++++-------- arrow-ord/src/ord.rs | 29 ++--- arrow-ord/src/sort.rs | 7 +- arrow-string/src/length.rs | 2 +- arrow-string/src/substring.rs | 2 +- arrow/src/array/ffi.rs | 2 +- 14 files changed, 218 insertions(+), 214 deletions(-) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 9ed6dee516a4..2833300ddc07 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -759,6 +759,7 @@ mod tests { use super::*; use crate::arithmetic::add; use arrow_array::types::*; + use std::sync::Arc; #[test] fn 
test_primitive_array_sum() { @@ -1142,9 +1143,10 @@ mod tests { #[test] fn test_sum_dyn() { let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); + let values = Arc::new(values) as ArrayRef; let keys = Int8Array::from_iter_values([2_i8, 3, 4]); - let dict_array = DictionaryArray::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, values.clone()); let array = dict_array.downcast_dict::().unwrap(); assert_eq!(39, sum_array::(array).unwrap()); @@ -1152,12 +1154,12 @@ mod tests { assert_eq!(15, sum_array::(&a).unwrap()); let keys = Int8Array::from(vec![Some(2_i8), None, Some(4)]); - let dict_array = DictionaryArray::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, values.clone()); let array = dict_array.downcast_dict::().unwrap(); assert_eq!(26, sum_array::(array).unwrap()); let keys = Int8Array::from(vec![None, None, None]); - let dict_array = DictionaryArray::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, values.clone()); let array = dict_array.downcast_dict::().unwrap(); assert!(sum_array::(array).is_none()); } @@ -1166,8 +1168,9 @@ mod tests { fn test_max_min_dyn() { let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); let keys = Int8Array::from_iter_values([2_i8, 3, 4]); + let values = Arc::new(values) as ArrayRef; - let dict_array = DictionaryArray::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, values.clone()); let array = dict_array.downcast_dict::().unwrap(); assert_eq!(14, max_array::(array).unwrap()); @@ -1179,14 +1182,14 @@ mod tests { assert_eq!(1, min_array::(&a).unwrap()); let keys = Int8Array::from(vec![Some(2_i8), None, Some(7)]); - let dict_array = DictionaryArray::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, values.clone()); let array = dict_array.downcast_dict::().unwrap(); assert_eq!(17, max_array::(array).unwrap()); let array = dict_array.downcast_dict::().unwrap(); assert_eq!(12, min_array::(array).unwrap()); let keys = Int8Array::from(vec![None, None, None]); - let dict_array = DictionaryArray::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, values.clone()); let array = dict_array.downcast_dict::().unwrap(); assert!(max_array::(array).is_none()); let array = dict_array.downcast_dict::().unwrap(); @@ -1198,7 +1201,7 @@ mod tests { let values = Float32Array::from(vec![5.0_f32, 2.0_f32, f32::NAN]); let keys = Int8Array::from_iter_values([0_i8, 1, 2]); - let dict_array = DictionaryArray::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, Arc::new(values)); let array = dict_array.downcast_dict::().unwrap(); assert!(max_array::(array).unwrap().is_nan()); diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 5d2e2a8f5feb..4c1bad4d2e5d 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -3624,11 +3624,11 @@ mod tests { fn test_dict_decimal() { let values = Decimal128Array::from_iter_values([0, 1, 2, 3, 4, 5]); let keys = Int8Array::from_iter_values([1_i8, 2, 5, 4, 3, 0]); - let array1 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array1 = DictionaryArray::new(keys, Arc::new(values)); let values = Decimal128Array::from_iter_values([7, -3, 4, 3, 5]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let 
result = add_dyn(&array1, &array2).unwrap(); let expected = @@ -3650,7 +3650,7 @@ mod tests { ]); let keys = Int8Array::from(vec![Some(1_i8), None, Some(5), Some(4), Some(3), None]); - let array1 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array1 = DictionaryArray::new(keys, Arc::new(values)); let values = Decimal256Array::from_iter_values([ i256::from_i128(7), @@ -3661,7 +3661,7 @@ mod tests { ]); let keys = Int8Array::from(vec![Some(0_i8), Some(0), None, Some(2), Some(3), Some(4)]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let result = add_dyn(&array1, &array2).unwrap(); let expected = Arc::new(Decimal256Array::from(vec![ diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index d69bbde8d056..2f1f6c345b32 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -603,10 +603,7 @@ mod tests { fn test_unary_dict_mut() { let values = Int32Array::from(vec![Some(10), Some(20), None]); let keys = Int8Array::from_iter_values([0, 0, 1, 2]); - let dictionary = DictionaryArray::::try_new(&keys, &values).unwrap(); - - drop(keys); - drop(values); + let dictionary = DictionaryArray::new(keys, Arc::new(values)); let updated = dictionary.unary_mut::<_, Int32Type>(|x| x + 1).unwrap(); let typed = updated.downcast_dict::().unwrap(); diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index ac76358ef2dd..f62e7e9a653a 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -17,9 +17,10 @@ //! Defines temporal kernels for time and date related functions. -use chrono::{DateTime, Datelike, NaiveDateTime, NaiveTime, Offset, Timelike}; use std::sync::Arc; +use chrono::{DateTime, Datelike, NaiveDateTime, NaiveTime, Offset, Timelike}; + use arrow_array::builder::*; use arrow_array::iterator::ArrayIter; use arrow_array::temporal_conversions::{ @@ -970,12 +971,14 @@ mod tests { .with_timezone("+01:00".to_string()); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 1]); - let dict = DictionaryArray::try_new(&keys, &a).unwrap(); + let dict = DictionaryArray::try_new(keys.clone(), Arc::new(a)).unwrap(); let b = hour_dyn(&dict).unwrap(); - let expected_dict = - DictionaryArray::try_new(&keys, &Int32Array::from(vec![11, 21, 7])).unwrap(); + let expected_dict = DictionaryArray::new( + keys.clone(), + Arc::new(Int32Array::from(vec![11, 21, 7])), + ); let expected = Arc::new(expected_dict) as ArrayRef; assert_eq!(&expected, &b); @@ -984,7 +987,7 @@ mod tests { let b_old = minute_dyn(&dict).unwrap(); let expected_dict = - DictionaryArray::try_new(&keys, &Int32Array::from(vec![1, 2, 3])).unwrap(); + DictionaryArray::new(keys.clone(), Arc::new(Int32Array::from(vec![1, 2, 3]))); let expected = Arc::new(expected_dict) as ArrayRef; assert_eq!(&expected, &b); assert_eq!(&expected, &b_old); @@ -994,7 +997,7 @@ mod tests { let b_old = second_dyn(&dict).unwrap(); let expected_dict = - DictionaryArray::try_new(&keys, &Int32Array::from(vec![1, 2, 3])).unwrap(); + DictionaryArray::new(keys.clone(), Arc::new(Int32Array::from(vec![1, 2, 3]))); let expected = Arc::new(expected_dict) as ArrayRef; assert_eq!(&expected, &b); assert_eq!(&expected, &b_old); @@ -1003,8 +1006,7 @@ mod tests { time_fraction_dyn(&dict, "nanosecond", |t| t.nanosecond() as i32).unwrap(); let expected_dict = - DictionaryArray::try_new(&keys, &Int32Array::from(vec![0, 0, 0, 0, 0])) - .unwrap(); + DictionaryArray::new(keys, Arc::new(Int32Array::from(vec![0, 0, 0, 0, 0]))); let expected = 
Arc::new(expected_dict) as ArrayRef; assert_eq!(&expected, &b); } @@ -1015,15 +1017,14 @@ mod tests { vec![Some(1514764800000), Some(1550636625000)].into(); let keys = Int8Array::from_iter_values([0_i8, 1, 1, 0]); - let dict = DictionaryArray::try_new(&keys, &a).unwrap(); + let dict = DictionaryArray::new(keys.clone(), Arc::new(a)); let b = year_dyn(&dict).unwrap(); - let expected_dict = DictionaryArray::try_new( - &keys, - &Int32Array::from(vec![2018, 2019, 2019, 2018]), - ) - .unwrap(); + let expected_dict = DictionaryArray::new( + keys, + Arc::new(Int32Array::from(vec![2018, 2019, 2019, 2018])), + ); let expected = Arc::new(expected_dict) as ArrayRef; assert_eq!(&expected, &b); } @@ -1036,21 +1037,21 @@ mod tests { vec![Some(1514764800000), Some(1566275025000)].into(); let keys = Int8Array::from_iter_values([0_i8, 1, 1, 0]); - let dict = DictionaryArray::try_new(&keys, &a).unwrap(); + let dict = DictionaryArray::new(keys.clone(), Arc::new(a)); let b = quarter_dyn(&dict).unwrap(); - let expected_dict = - DictionaryArray::try_new(&keys, &Int32Array::from(vec![1, 3, 3, 1])).unwrap(); - let expected = Arc::new(expected_dict) as ArrayRef; - assert_eq!(&expected, &b); + let expected = DictionaryArray::new( + keys.clone(), + Arc::new(Int32Array::from(vec![1, 3, 3, 1])), + ); + assert_eq!(b.as_ref(), &expected); let b = month_dyn(&dict).unwrap(); - let expected_dict = - DictionaryArray::try_new(&keys, &Int32Array::from(vec![1, 8, 8, 1])).unwrap(); - let expected = Arc::new(expected_dict) as ArrayRef; - assert_eq!(&expected, &b); + let expected = + DictionaryArray::new(keys, Arc::new(Int32Array::from(vec![1, 8, 8, 1]))); + assert_eq!(b.as_ref(), &expected); } #[test] @@ -1061,57 +1062,37 @@ mod tests { vec![Some(1514764800000), Some(1550636625000)].into(); let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(1), Some(0), None]); - let dict = DictionaryArray::try_new(&keys, &a).unwrap(); + let dict = DictionaryArray::new(keys.clone(), Arc::new(a)); let b = num_days_from_monday_dyn(&dict).unwrap(); - let expected_dict = DictionaryArray::try_new( - &keys, - &Int32Array::from(vec![Some(0), Some(2), Some(2), Some(0), None]), - ) - .unwrap(); - let expected = Arc::new(expected_dict) as ArrayRef; - assert_eq!(&expected, &b); + let a = Int32Array::from(vec![Some(0), Some(2), Some(2), Some(0), None]); + let expected = DictionaryArray::new(keys.clone(), Arc::new(a)); + assert_eq!(b.as_ref(), &expected); let b = num_days_from_sunday_dyn(&dict).unwrap(); - let expected_dict = DictionaryArray::try_new( - &keys, - &Int32Array::from(vec![Some(1), Some(3), Some(3), Some(1), None]), - ) - .unwrap(); - let expected = Arc::new(expected_dict) as ArrayRef; - assert_eq!(&expected, &b); + let a = Int32Array::from(vec![Some(1), Some(3), Some(3), Some(1), None]); + let expected = DictionaryArray::new(keys.clone(), Arc::new(a)); + assert_eq!(b.as_ref(), &expected); let b = day_dyn(&dict).unwrap(); - let expected_dict = DictionaryArray::try_new( - &keys, - &Int32Array::from(vec![Some(1), Some(20), Some(20), Some(1), None]), - ) - .unwrap(); - let expected = Arc::new(expected_dict) as ArrayRef; - assert_eq!(&expected, &b); + let a = Int32Array::from(vec![Some(1), Some(20), Some(20), Some(1), None]); + let expected = DictionaryArray::new(keys.clone(), Arc::new(a)); + assert_eq!(b.as_ref(), &expected); let b = doy_dyn(&dict).unwrap(); - let expected_dict = DictionaryArray::try_new( - &keys, - &Int32Array::from(vec![Some(1), Some(51), Some(51), Some(1), None]), - ) - .unwrap(); - let expected = Arc::new(expected_dict) as 
ArrayRef; - assert_eq!(&expected, &b); + let a = Int32Array::from(vec![Some(1), Some(51), Some(51), Some(1), None]); + let expected = DictionaryArray::new(keys.clone(), Arc::new(a)); + assert_eq!(b.as_ref(), &expected); let b = week_dyn(&dict).unwrap(); - let expected_dict = DictionaryArray::try_new( - &keys, - &Int32Array::from(vec![Some(1), Some(8), Some(8), Some(1), None]), - ) - .unwrap(); - let expected = Arc::new(expected_dict) as ArrayRef; - assert_eq!(&expected, &b); + let a = Int32Array::from(vec![Some(1), Some(8), Some(8), Some(1), None]); + let expected = DictionaryArray::new(keys, Arc::new(a)); + assert_eq!(b.as_ref(), &expected); } #[test] @@ -1129,14 +1110,11 @@ mod tests { assert_eq!(453_000_000, b.value(1)); let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(1)]); - let dict = DictionaryArray::try_new(&keys, &a).unwrap(); + let dict = DictionaryArray::new(keys.clone(), Arc::new(a)); let b = nanosecond_dyn(&dict).unwrap(); - let expected_dict = DictionaryArray::try_new( - &keys, - &Int32Array::from(vec![None, Some(453_000_000)]), - ) - .unwrap(); + let a = Int32Array::from(vec![None, Some(453_000_000)]); + let expected_dict = DictionaryArray::new(keys, Arc::new(a)); let expected = Arc::new(expected_dict) as ArrayRef; assert_eq!(&expected, &b); } diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index f25a077a81ba..75fd4c6d0d68 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -20,8 +20,8 @@ use crate::cast::AsArray; use crate::iterator::ArrayIter; use crate::types::*; use crate::{ - make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, PrimitiveArray, - StringArray, + make_array, Array, ArrayAccessor, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType, + PrimitiveArray, StringArray, }; use arrow_buffer::buffer::NullBuffer; use arrow_buffer::ArrowNativeType; @@ -196,12 +196,13 @@ pub type UInt64DictionaryArray = DictionaryArray; /// Example from existing arrays: /// /// ``` +/// use std::sync::Arc; /// use arrow_array::{DictionaryArray, Int8Array, StringArray, types::Int8Type}; /// // You can form your own DictionaryArray by providing the /// // values (dictionary) and keys (indexes into the dictionary): /// let values = StringArray::from_iter_values(["a", "b", "c"]); /// let keys = Int8Array::from_iter_values([0, 0, 1, 2]); -/// let array = DictionaryArray::::try_new(&keys, &values).unwrap(); +/// let array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); /// let expected: DictionaryArray:: = vec!["a", "a", "b", "c"] /// .into_iter() /// .collect(); @@ -237,33 +238,72 @@ impl Clone for DictionaryArray { impl DictionaryArray { /// Attempt to create a new DictionaryArray with a specified keys /// (indexes into the dictionary) and values (dictionary) - /// array. Returns an error if there are any keys that are outside - /// of the dictionary array. + /// array. + /// + /// # Panics + /// + /// Panics if [`Self::try_new`] returns an error + pub fn new(keys: PrimitiveArray, values: ArrayRef) -> Self { + Self::try_new(keys, values).unwrap() + } + + /// Attempt to create a new DictionaryArray with a specified keys + /// (indexes into the dictionary) and values (dictionary) + /// array. 
+ /// + /// # Errors + /// + /// Returns an error if any `keys[i] >= values.len() || keys[i] < 0` pub fn try_new( - keys: &PrimitiveArray, - values: &dyn Array, + keys: PrimitiveArray, + values: ArrayRef, ) -> Result { - let dict_data_type = DataType::Dictionary( + let data_type = DataType::Dictionary( Box::new(keys.data_type().clone()), Box::new(values.data_type().clone()), ); - // Note: This use the ArrayDataBuilder::build_unchecked and afterwards - // call the new function which only validates that the keys are in bounds. - let data = keys.to_data(); - let builder = data - .into_builder() - .data_type(dict_data_type) - .add_child_data(values.to_data()); + let zero = K::Native::usize_as(0); + let values_len = values.len(); - // Safety: `validate` ensures key type is correct, and - // `validate_values` ensures all offsets are within range - let array = unsafe { builder.build_unchecked() }; + if let Some((idx, v)) = keys.values().iter().enumerate().find(|(idx, v)| { + (v.is_lt(zero) || v.as_usize() >= values_len) && keys.is_valid(*idx) + }) { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid dictionary key {v:?} at index {idx}, expected 0 <= key < {values_len}", + ))); + } - array.validate()?; - array.validate_values()?; + Ok(Self { + data_type, + keys, + values, + is_ordered: false, + }) + } + + /// Create a new [`DictionaryArray`] without performing validation + /// + /// # Safety + /// + /// Safe provided [`Self::try_new`] would not return an error + pub unsafe fn new_unchecked(keys: PrimitiveArray, values: ArrayRef) -> Self { + let data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + Self { + data_type, + keys, + values, + is_ordered: false, + } + } - Ok(array.into()) + /// Deconstruct this array into its constituent parts + pub fn into_parts(self) -> (PrimitiveArray, ArrayRef) { + (self.keys, self.values) } /// Return an array view of the keys of this dictionary as a PrimitiveArray. @@ -362,9 +402,9 @@ impl DictionaryArray { /// Panics if `values` has a length less than the current values /// /// ``` - /// use arrow_array::builder::PrimitiveDictionaryBuilder; - /// use arrow_array::{Int8Array, Int64Array, ArrayAccessor}; - /// use arrow_array::types::{Int32Type, Int8Type}; + /// # use arrow_array::builder::PrimitiveDictionaryBuilder; + /// # use arrow_array::{Int8Array, Int64Array, ArrayAccessor}; + /// # use arrow_array::types::{Int32Type, Int8Type}; /// /// // Construct a Dict(Int32, Int8) /// let mut builder = PrimitiveDictionaryBuilder::::with_capacity(2, 200); @@ -431,13 +471,13 @@ impl DictionaryArray { PrimitiveDictionaryBuilder::new_from_builders(key_builder, value_builder) }), (Err(key_array), Ok(mut value_builder)) => { - Err(Self::try_new(&key_array, &value_builder.finish()).unwrap()) + Err(Self::try_new(key_array, Arc::new(value_builder.finish())).unwrap()) } (Ok(mut key_builder), Err(value_array)) => { - Err(Self::try_new(&key_builder.finish(), &value_array).unwrap()) + Err(Self::try_new(key_builder.finish(), Arc::new(value_array)).unwrap()) } (Err(key_array), Err(value_array)) => { - Err(Self::try_new(&key_array, &value_array).unwrap()) + Err(Self::try_new(key_array, Arc::new(value_array)).unwrap()) } } } @@ -453,13 +493,12 @@ impl DictionaryArray { /// or this function may panic. 
/// # Example /// ``` - /// use arrow_array::{Array, ArrayAccessor, DictionaryArray, StringArray, types::{Int8Type, Int32Type}}; - /// use arrow_array::{Int8Array, Int32Array}; + /// # use std::sync::Arc; + /// # use arrow_array::{Array, ArrayAccessor, DictionaryArray, StringArray, types::{Int8Type, Int32Type}}; + /// # use arrow_array::{Int8Array, Int32Array}; /// let values = Int32Array::from(vec![Some(10), Some(20), None]); /// let keys = Int8Array::from_iter_values([0, 0, 1, 2]); - /// let dictionary = DictionaryArray::::try_new(&keys, &values).unwrap(); - /// drop(keys); - /// drop(values); + /// let dictionary = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); /// let c = dictionary.unary_mut::<_, Int32Type>(|x| x + 1).unwrap(); /// let typed = c.downcast_dict::().unwrap(); /// assert_eq!(typed.value(0), 11); @@ -807,7 +846,7 @@ mod tests { use super::*; use crate::builder::PrimitiveDictionaryBuilder; use crate::cast::as_dictionary_array; - use crate::types::{Int16Type, Int32Type, Int8Type, UInt32Type, UInt8Type}; + use crate::types::{Int32Type, Int8Type, UInt32Type, UInt8Type}; use crate::{Int16Array, Int32Array, Int8Array}; use arrow_buffer::{Buffer, ToByteSlice}; use std::sync::Arc; @@ -974,7 +1013,7 @@ mod tests { let keys = Int16Array::from_iter_values([2_i16, 3, 4]); // Construct a dictionary array from the above two - let dict_array = DictionaryArray::::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, Arc::new(values)); let mut key_iter = dict_array.keys_iter(); assert_eq!(2, key_iter.next().unwrap().unwrap()); @@ -1021,7 +1060,7 @@ mod tests { let keys = Int8Array::from(vec![Some(2), None, Some(1)]); let values = StringArray::from(vec!["foo", "bar", "baz", "blarg"]); - let array = DictionaryArray::try_new(&keys, &values).unwrap(); + let array = DictionaryArray::new(keys, Arc::new(values)); assert_eq!(array.key(0), Some(2)); assert_eq!(array.key(1), None); assert_eq!(array.key(2), Some(1)); @@ -1034,7 +1073,7 @@ mod tests { .collect(); let keys: Int32Array = [Some(0), Some(2), None, Some(1)].into_iter().collect(); - let array = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array = DictionaryArray::new(keys, Arc::new(values)); assert_eq!(array.keys().data_type(), &DataType::Int32); assert_eq!(array.values().data_type(), &DataType::Utf8); @@ -1057,23 +1096,23 @@ mod tests { #[test] #[should_panic( - expected = "Value at position 1 out of bounds: 3 (should be in [0, 1])" + expected = "Invalid dictionary key 3 at index 1, expected 0 <= key < 2" )] fn test_try_new_index_too_large() { let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); // dictionary only has 2 values, so offset 3 is out of bounds let keys: Int32Array = [Some(0), Some(3)].into_iter().collect(); - DictionaryArray::::try_new(&keys, &values).unwrap(); + DictionaryArray::new(keys, Arc::new(values)); } #[test] #[should_panic( - expected = "Value at position 0 out of bounds: -100 (should be in [0, 1])" + expected = "Invalid dictionary key -100 at index 0, expected 0 <= key < 2" )] fn test_try_new_index_too_small() { let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); let keys: Int32Array = [Some(-100)].into_iter().collect(); - DictionaryArray::::try_new(&keys, &values).unwrap(); + DictionaryArray::new(keys, Arc::new(values)); } #[test] @@ -1090,14 +1129,12 @@ mod tests { let values = Int32Array::from_iter_values([10_i32, 12, 15]); let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]); - let dict_array = 
DictionaryArray::::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, Arc::new(values)); let boxed: ArrayRef = Arc::new(dict_array); let col: DictionaryArray = as_dictionary_array(&boxed).clone(); drop(boxed); - drop(keys); - drop(values); let mut builder = col.into_primitive_dict_builder::().unwrap(); @@ -1111,7 +1148,7 @@ mod tests { let values = Int32Array::from_iter_values([4_i32, 2, 1]); let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]); - let expected = DictionaryArray::::try_new(&keys, &values).unwrap(); + let expected = DictionaryArray::new(keys, Arc::new(values)); let new_array = builder.finish(); assert_eq!(expected, new_array); @@ -1122,7 +1159,7 @@ mod tests { let values = Int32Array::from_iter_values([10_i32, 12, 15]); let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]); - let dict_array = DictionaryArray::::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, Arc::new(values)); let boxed: ArrayRef = Arc::new(dict_array); @@ -1135,7 +1172,7 @@ mod tests { let values = Int32Array::from_iter_values([10_i32, 12, 15]); let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]); - let expected = DictionaryArray::::try_new(&keys, &values).unwrap(); + let expected = DictionaryArray::new(keys, Arc::new(values)); assert_eq!(expected, returned); } } diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index f8915a96320d..f97311d6f9e3 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -719,7 +719,7 @@ mod tests { // large dictionary (1024 distinct values) that are used throughout the array let values = StringArray::from_iter_values((0..1024).map(|i| "******".repeat(i))); let keys = Int32Array::from_iter_values((0..3000).map(|i| (3000 - i) % 1024)); - let array = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array = DictionaryArray::new(keys, Arc::new(values)); let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 3d803b62728c..e41119937339 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -1380,8 +1380,7 @@ mod tests { let array12_values = StringArray::from(vec!["x", "yy", "zzz"]); let array12_keys = Int8Array::from_iter_values([1, 1, 2]); - let array12 = - DictionaryArray::::try_new(&array12_keys, &array12_values).unwrap(); + let array12 = DictionaryArray::new(array12_keys, Arc::new(array12_values)); let array13 = StringArray::from(vec!["a", "bb", "ccc"]); @@ -1693,13 +1692,12 @@ mod tests { #[test] fn test_roundtrip_stream_nested_dict_of_map_of_dict() { let values = StringArray::from(vec![Some("a"), None, Some("b"), Some("c")]); + let values = Arc::new(values) as ArrayRef; let value_dict_keys = Int8Array::from_iter_values([0, 1, 1, 2, 3, 1]); - let value_dict_array = - DictionaryArray::::try_new(&value_dict_keys, &values).unwrap(); + let value_dict_array = DictionaryArray::new(value_dict_keys, values.clone()); let key_dict_keys = Int8Array::from_iter_values([0, 0, 2, 1, 1, 3]); - let key_dict_array = - DictionaryArray::::try_new(&key_dict_keys, &values).unwrap(); + let key_dict_array = DictionaryArray::new(key_dict_keys, values); let keys_field = Field::new_dict( "keys", @@ -1738,8 +1736,7 @@ mod tests { let map_array = MapArray::from(map_data); let dict_keys = Int8Array::from_iter_values([0, 1, 1, 2, 2, 1]); - let dict_dict_array = - DictionaryArray::::try_new(&dict_keys, &map_array).unwrap(); + let dict_dict_array = DictionaryArray::new(dict_keys, 
Arc::new(map_array)); let schema = Arc::new(Schema::new(vec![Field::new( "f1", @@ -1761,7 +1758,7 @@ mod tests { ) { let values = StringArray::from(vec![Some("a"), None, Some("c"), None]); let keys = Int8Array::from_iter_values([0, 0, 1, 2, 0, 1, 3]); - let dict_array = DictionaryArray::::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, Arc::new(values)); let dict_data = dict_array.to_data(); let value_offsets = Buffer::from_slice_ref(offsets); @@ -1775,8 +1772,7 @@ mod tests { let list_array = GenericListArray::::from(list_data); let keys_for_dict = Int8Array::from_iter_values([0, 3, 0, 1, 1, 2, 0, 1, 3]); - let dict_dict_array = - DictionaryArray::::try_new(&keys_for_dict, &list_array).unwrap(); + let dict_dict_array = DictionaryArray::new(keys_for_dict, Arc::new(list_array)); let schema = Arc::new(Schema::new(vec![Field::new( "f1", @@ -1824,8 +1820,8 @@ mod tests { fn test_roundtrip_stream_dict_of_fixed_size_list_of_dict() { let values = StringArray::from(vec![Some("a"), None, Some("c"), None]); let keys = Int8Array::from_iter_values([0, 0, 1, 2, 0, 1, 3, 1, 2]); - let dict_array = DictionaryArray::::try_new(&keys, &values).unwrap(); - let dict_data = dict_array.to_data(); + let dict_array = DictionaryArray::new(keys, Arc::new(values)); + let dict_data = dict_array.into_data(); let list_data_type = DataType::FixedSizeList( Arc::new(Field::new_dict( @@ -1845,8 +1841,7 @@ mod tests { let list_array = FixedSizeListArray::from(list_data); let keys_for_dict = Int8Array::from_iter_values([0, 1, 0, 1, 1, 2, 0, 1, 2]); - let dict_dict_array = - DictionaryArray::::try_new(&keys_for_dict, &list_array).unwrap(); + let dict_dict_array = DictionaryArray::new(keys_for_dict, Arc::new(list_array)); let schema = Arc::new(Schema::new(vec![Field::new( "f1", diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 08ddd1812bb7..0c9ca17b7426 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1841,7 +1841,7 @@ mod tests { let keys: Int32Array = [Some(0), Some(2), None, Some(1)].into_iter().collect(); - let array = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array = DictionaryArray::new(keys, Arc::new(values)); let schema = Schema::new(vec![Field::new("dict", array.data_type().clone(), true)]); diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index b9ffddd5e1b4..c771182f7917 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -4622,11 +4622,12 @@ mod tests { fn test_eq_dyn_neq_dyn_dictionary_i8_array() { // Construct a value array let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); + let values = Arc::new(values) as ArrayRef; let keys1 = Int8Array::from_iter_values([2_i8, 3, 4]); let keys2 = Int8Array::from_iter_values([2_i8, 4, 4]); - let dict_array1 = DictionaryArray::try_new(&keys1, &values).unwrap(); - let dict_array2 = DictionaryArray::try_new(&keys2, &values).unwrap(); + let dict_array1 = DictionaryArray::new(keys1, values.clone()); + let dict_array2 = DictionaryArray::new(keys2, values.clone()); let result = eq_dyn(&dict_array1, &dict_array2); assert_eq!(result.unwrap(), BooleanArray::from(vec![true, false, true])); @@ -4642,13 +4643,12 @@ mod tests { #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_u64_array() { let values = UInt64Array::from_iter_values([10_u64, 11, 12, 13, 14, 15, 16, 17]); + let values = Arc::new(values) as ArrayRef; let keys1 = UInt64Array::from_iter_values([1_u64, 3, 4]); let keys2 = 
UInt64Array::from_iter_values([2_u64, 3, 5]); - let dict_array1 = - DictionaryArray::::try_new(&keys1, &values).unwrap(); - let dict_array2 = - DictionaryArray::::try_new(&keys2, &values).unwrap(); + let dict_array1 = DictionaryArray::new(keys1, values.clone()); + let dict_array2 = DictionaryArray::new(keys2, values.clone()); let result = eq_dyn(&dict_array1, &dict_array2); assert_eq!( @@ -4695,13 +4695,12 @@ mod tests { .into_iter() .map(|b| Some(b.as_bytes())) .collect(); + let values = Arc::new(values) as ArrayRef; let keys1 = UInt64Array::from_iter_values([0_u64, 1, 2]); let keys2 = UInt64Array::from_iter_values([0_u64, 2, 1]); - let dict_array1 = - DictionaryArray::::try_new(&keys1, &values).unwrap(); - let dict_array2 = - DictionaryArray::::try_new(&keys2, &values).unwrap(); + let dict_array1 = DictionaryArray::new(keys1, values.clone()); + let dict_array2 = DictionaryArray::new(keys2, values.clone()); let result = eq_dyn(&dict_array1, &dict_array2); assert_eq!( @@ -4717,13 +4716,12 @@ mod tests { #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_interval_array() { let values = IntervalDayTimeArray::from(vec![1, 6, 10, 2, 3, 5]); + let values = Arc::new(values) as ArrayRef; let keys1 = UInt64Array::from_iter_values([1_u64, 0, 3]); let keys2 = UInt64Array::from_iter_values([2_u64, 0, 3]); - let dict_array1 = - DictionaryArray::::try_new(&keys1, &values).unwrap(); - let dict_array2 = - DictionaryArray::::try_new(&keys2, &values).unwrap(); + let dict_array1 = DictionaryArray::new(keys1, values.clone()); + let dict_array2 = DictionaryArray::new(keys2, values.clone()); let result = eq_dyn(&dict_array1, &dict_array2); assert_eq!(result.unwrap(), BooleanArray::from(vec![false, true, true])); @@ -4739,13 +4737,12 @@ mod tests { #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_date_array() { let values = Date32Array::from(vec![1, 6, 10, 2, 3, 5]); + let values = Arc::new(values) as ArrayRef; let keys1 = UInt64Array::from_iter_values([1_u64, 0, 3]); let keys2 = UInt64Array::from_iter_values([2_u64, 0, 3]); - let dict_array1 = - DictionaryArray::::try_new(&keys1, &values).unwrap(); - let dict_array2 = - DictionaryArray::::try_new(&keys2, &values).unwrap(); + let dict_array1 = DictionaryArray::new(keys1, values.clone()); + let dict_array2 = DictionaryArray::new(keys2, values.clone()); let result = eq_dyn(&dict_array1, &dict_array2); assert_eq!(result.unwrap(), BooleanArray::from(vec![false, true, true])); @@ -4761,13 +4758,12 @@ mod tests { #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_bool_array() { let values = BooleanArray::from(vec![true, false]); + let values = Arc::new(values) as ArrayRef; let keys1 = UInt64Array::from_iter_values([1_u64, 1, 1]); let keys2 = UInt64Array::from_iter_values([0_u64, 1, 0]); - let dict_array1 = - DictionaryArray::::try_new(&keys1, &values).unwrap(); - let dict_array2 = - DictionaryArray::::try_new(&keys2, &values).unwrap(); + let dict_array1 = DictionaryArray::new(keys1, values.clone()); + let dict_array2 = DictionaryArray::new(keys2, values.clone()); let result = eq_dyn(&dict_array1, &dict_array2); assert_eq!( @@ -4784,11 +4780,12 @@ mod tests { fn test_lt_dyn_gt_dyn_dictionary_i8_array() { // Construct a value array let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); + let values = Arc::new(values) as ArrayRef; let keys1 = Int8Array::from_iter_values([3_i8, 4, 4]); let keys2 = Int8Array::from_iter_values([4_i8, 3, 4]); - let dict_array1 = DictionaryArray::try_new(&keys1, 
&values).unwrap(); - let dict_array2 = DictionaryArray::try_new(&keys2, &values).unwrap(); + let dict_array1 = DictionaryArray::new(keys1, values.clone()); + let dict_array2 = DictionaryArray::new(keys2, values.clone()); let result = lt_dyn(&dict_array1, &dict_array2); assert_eq!( @@ -4813,13 +4810,12 @@ mod tests { #[cfg(feature = "dyn_cmp_dict")] fn test_lt_dyn_gt_dyn_dictionary_bool_array() { let values = BooleanArray::from(vec![true, false]); + let values = Arc::new(values) as ArrayRef; let keys1 = UInt64Array::from_iter_values([1_u64, 1, 0]); let keys2 = UInt64Array::from_iter_values([0_u64, 1, 1]); - let dict_array1 = - DictionaryArray::::try_new(&keys1, &values).unwrap(); - let dict_array2 = - DictionaryArray::::try_new(&keys2, &values).unwrap(); + let dict_array1 = DictionaryArray::new(keys1, values.clone()); + let dict_array2 = DictionaryArray::new(keys2, values.clone()); let result = lt_dyn(&dict_array1, &dict_array2); assert_eq!( @@ -4858,7 +4854,7 @@ mod tests { let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); let keys = Int8Array::from_iter_values([2_i8, 3, 4]); - let dict_array = DictionaryArray::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, Arc::new(values)); let array = Int8Array::from_iter([Some(12_i8), None, Some(14)]); @@ -4893,7 +4889,7 @@ mod tests { let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); let keys = Int8Array::from_iter_values([2_i8, 3, 4]); - let dict_array = DictionaryArray::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, Arc::new(values)); let array = Int8Array::from_iter([Some(12_i8), None, Some(11)]); @@ -5503,7 +5499,7 @@ mod tests { .collect(); let keys = UInt64Array::from(vec![Some(0_u64), None, Some(2), Some(2)]); - let dict_array = DictionaryArray::::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, Arc::new(values)); let array: BinaryArray = ["hello", "", "parquet", "test"] .into_iter() @@ -5544,7 +5540,7 @@ mod tests { .collect(); let keys = UInt64Array::from(vec![Some(0_u64), None, Some(2), Some(2)]); - let dict_array = DictionaryArray::::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, Arc::new(values)); let array: BinaryArray = ["hello", "", "parquet", "test"] .into_iter() @@ -5610,7 +5606,7 @@ mod tests { let values = Float16Array::from(vec![f16::NAN, f16::from_f32(8.0), f16::from_f32(10.0)]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 1, 2]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let expected = BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(true), Some(true)], @@ -5628,7 +5624,7 @@ mod tests { .collect(); let values = Float32Array::from(vec![f32::NAN, 8.0, 10.0]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 1, 2]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let expected = BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(true), Some(true)], @@ -5646,7 +5642,7 @@ mod tests { .collect(); let values = Float64Array::from(vec![f64::NAN, 8.0, 10.0]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 1, 2]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let expected = BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(true), Some(true)], @@ 
-5668,7 +5664,7 @@ mod tests { .collect(); let values = Float16Array::from(vec![f16::NAN, f16::from_f32(8.0), f16::from_f32(9.0), f16::from_f32(10.0), f16::from_f32(1.0)]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let expected = BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], @@ -5686,7 +5682,7 @@ mod tests { .collect(); let values = Float32Array::from(vec![f32::NAN, 8.0, 9.0, 10.0, 1.0]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let expected = BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], @@ -5704,7 +5700,7 @@ mod tests { .collect(); let values = Float64Array::from(vec![f64::NAN, 8.0, 9.0, 10.0, 1.0]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let expected = BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], @@ -5726,7 +5722,7 @@ mod tests { .collect(); let values = Float16Array::from(vec![f16::NAN, f16::from_f32(8.0), f16::from_f32(9.0), f16::from_f32(10.0), f16::from_f32(1.0)]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], @@ -5744,7 +5740,7 @@ mod tests { .collect(); let values = Float32Array::from(vec![f32::NAN, 8.0, 9.0, 10.0, 1.0]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], @@ -5762,7 +5758,7 @@ mod tests { .collect(); let values = Float64Array::from(vec![f64::NAN, 8.0, 9.0, 10.0, 1.0]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], @@ -5783,7 +5779,7 @@ mod tests { let values = BooleanArray::from(test1); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2]); - let dict_array = DictionaryArray::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, Arc::new(values)); let array: BooleanArray = test2.iter().collect(); @@ -5820,7 +5816,7 @@ mod tests { let values = BooleanArray::from(test1); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2]); - let dict_array = DictionaryArray::try_new(&keys, &values).unwrap(); + let dict_array = DictionaryArray::new(keys, Arc::new(values)); let array: BooleanArray = test2.iter().collect(); @@ -5878,11 +5874,11 @@ mod tests { fn test_cmp_dict_decimal128() { let values = Decimal128Array::from_iter_values([0, 1, 2, 3, 4, 5]); let keys = Int8Array::from_iter_values([1_i8, 2, 5, 4, 3, 0]); - let array1 = DictionaryArray::try_new(&keys, &values).unwrap(); + 
let array1 = DictionaryArray::new(keys, Arc::new(values)); let values = Decimal128Array::from_iter_values([7, -3, 4, 3, 5]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(true), Some(true), Some(false)], @@ -5918,7 +5914,7 @@ mod tests { let values = Decimal128Array::from_iter_values([7, -3, 4, 3, 5]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(true), Some(true), Some(false)], @@ -5953,13 +5949,13 @@ mod tests { [0, 1, 2, 3, 4, 5].into_iter().map(i256::from_i128), ); let keys = Int8Array::from_iter_values([1_i8, 2, 5, 4, 3, 0]); - let array1 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array1 = DictionaryArray::new(keys, Arc::new(values)); let values = Decimal256Array::from_iter_values( [7, -3, 4, 3, 5].into_iter().map(i256::from_i128), ); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(true), Some(true), Some(false)], @@ -5998,7 +5994,7 @@ mod tests { [7, -3, 4, 3, 5].into_iter().map(i256::from_i128), ); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); - let array2 = DictionaryArray::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(true), Some(true), Some(false)], diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index bfe74d9e3e7a..a33ead8ab041 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -283,6 +283,7 @@ pub mod tests { use arrow_buffer::i256; use half::f16; use std::cmp::Ordering; + use std::sync::Arc; #[test] fn test_fixed_size_binary() { @@ -423,11 +424,11 @@ pub mod tests { fn test_primitive_dict() { let values = Int32Array::from(vec![1_i32, 0, 2, 5]); let keys = Int8Array::from_iter_values([0, 0, 1, 3]); - let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array1 = DictionaryArray::new(keys, Arc::new(values)); let values = Int32Array::from(vec![2_i32, 3, 4, 5]); let keys = Int8Array::from_iter_values([0, 1, 1, 3]); - let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let cmp = build_compare(&array1, &array2).unwrap(); @@ -442,11 +443,11 @@ pub mod tests { fn test_float_dict() { let values = Float32Array::from(vec![1.0, 0.5, 2.1, 5.5]); let keys = Int8Array::from_iter_values([0, 0, 1, 3]); - let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array1 = DictionaryArray::try_new(keys, Arc::new(values)).unwrap(); let values = Float32Array::from(vec![1.2, 3.2, 4.0, 5.5]); let keys = Int8Array::from_iter_values([0, 1, 1, 3]); - let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let cmp = build_compare(&array1, &array2).unwrap(); @@ -461,11 +462,11 @@ pub mod tests { fn test_timestamp_dict() { let values = TimestampSecondArray::from(vec![1, 0, 2, 
5]); let keys = Int8Array::from_iter_values([0, 0, 1, 3]); - let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array1 = DictionaryArray::new(keys, Arc::new(values)); let values = TimestampSecondArray::from(vec![2, 3, 4, 5]); let keys = Int8Array::from_iter_values([0, 1, 1, 3]); - let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let cmp = build_compare(&array1, &array2).unwrap(); @@ -480,11 +481,11 @@ pub mod tests { fn test_interval_dict() { let values = IntervalDayTimeArray::from(vec![1, 0, 2, 5]); let keys = Int8Array::from_iter_values([0, 0, 1, 3]); - let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array1 = DictionaryArray::new(keys, Arc::new(values)); let values = IntervalDayTimeArray::from(vec![2, 3, 4, 5]); let keys = Int8Array::from_iter_values([0, 1, 1, 3]); - let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let cmp = build_compare(&array1, &array2).unwrap(); @@ -499,11 +500,11 @@ pub mod tests { fn test_duration_dict() { let values = DurationSecondArray::from(vec![1, 0, 2, 5]); let keys = Int8Array::from_iter_values([0, 0, 1, 3]); - let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array1 = DictionaryArray::new(keys, Arc::new(values)); let values = DurationSecondArray::from(vec![2, 3, 4, 5]); let keys = Int8Array::from_iter_values([0, 1, 1, 3]); - let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let cmp = build_compare(&array1, &array2).unwrap(); @@ -518,11 +519,11 @@ pub mod tests { fn test_decimal_dict() { let values = Decimal128Array::from(vec![1, 0, 2, 5]); let keys = Int8Array::from_iter_values([0, 0, 1, 3]); - let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array1 = DictionaryArray::new(keys, Arc::new(values)); let values = Decimal128Array::from(vec![2, 3, 4, 5]); let keys = Int8Array::from_iter_values([0, 1, 1, 3]); - let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let cmp = build_compare(&array1, &array2).unwrap(); @@ -542,7 +543,7 @@ pub mod tests { i256::from_i128(5), ]); let keys = Int8Array::from_iter_values([0, 0, 1, 3]); - let array1 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array1 = DictionaryArray::new(keys, Arc::new(values)); let values = Decimal256Array::from(vec![ i256::from_i128(2), @@ -551,7 +552,7 @@ pub mod tests { i256::from_i128(5), ]); let keys = Int8Array::from_iter_values([0, 1, 1, 3]); - let array2 = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array2 = DictionaryArray::new(keys, Arc::new(values)); let cmp = build_compare(&array1, &array2).unwrap(); diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 7661479291c6..a44d9a910f5d 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -1480,12 +1480,9 @@ mod tests { ) where PrimitiveArray: From>>, { - let array = DictionaryArray::::try_new(&keys, &values).unwrap(); + let array = DictionaryArray::::new(keys, Arc::new(values)); let array_values = array.values().clone(); - let dict = array_values - .as_any() - .downcast_ref::>() - .expect("Unable to get dictionary values"); + let dict = array_values.as_primitive::(); let sorted = match limit { Some(_) => { diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index bd022532d6e1..90efdd7b67cc 
100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -66,7 +66,7 @@ macro_rules! kernel_dict { stringify!($gt), $array.data_type()) }); let values = $kernel(dict.values())?; - let result = DictionaryArray::try_new(dict.keys(), &values)?; + let result = DictionaryArray::try_new(dict.keys().clone(), values)?; Ok(Arc::new(result)) }, )* diff --git a/arrow-string/src/substring.rs b/arrow-string/src/substring.rs index 997b26361587..a8250c75d287 100644 --- a/arrow-string/src/substring.rs +++ b/arrow-string/src/substring.rs @@ -83,7 +83,7 @@ pub fn substring( stringify!($gt), array.data_type()) }); let values = substring(dict.values(), start, length)?; - let result = DictionaryArray::try_new(dict.keys(), &values)?; + let result = DictionaryArray::try_new(dict.keys().clone(), values)?; Ok(Arc::new(result)) }, )* diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index d4c284ad2cd1..1611dc5303d6 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -167,7 +167,7 @@ mod tests { Some(1), None, ]); - let array = DictionaryArray::try_new(&keys, &values)?; + let array = DictionaryArray::new(keys, Arc::new(values)); let data = array.into_data(); test_round_trip(&data) From 53236791f17b60ede73323bc0092ecf306ab21c2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 18 Apr 2023 08:23:24 -0400 Subject: [PATCH 0832/1411] Add PrimitiveArray::try_new (#3879) (#4067) * Add PrimitiveArray::try_new (#3879) * Add tests * Review feedback --- arrow-array/src/array/primitive_array.rs | 74 +++++++++++++++++++++--- 1 file changed, 67 insertions(+), 7 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 3199104382a6..febafcc6f02a 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -269,24 +269,55 @@ impl PrimitiveArray { /// /// # Panics /// - /// Panics if: - /// - `values.len() != nulls.len()` - /// - `!Self::is_compatible(data_type)` + /// Panics if [`Self::try_new`] returns an error pub fn new( data_type: DataType, values: ScalarBuffer, nulls: Option, ) -> Self { - Self::assert_compatible(&data_type); + Self::try_new(data_type, values, nulls).unwrap() + } + + /// Create a new [`PrimitiveArray`] from the provided data_type, values, nulls + /// + /// # Errors + /// + /// Errors if: + /// - `values.len() != nulls.len()` + /// - `!Self::is_compatible(data_type)` + pub fn try_new( + data_type: DataType, + values: ScalarBuffer, + nulls: Option, + ) -> Result { + if !Self::is_compatible(&data_type) { + return Err(ArrowError::InvalidArgumentError(format!( + "PrimitiveArray expected data type {} got {}", + T::DATA_TYPE, + data_type + ))); + } + if let Some(n) = nulls.as_ref() { - assert_eq!(values.len(), n.len()); + if n.len() != values.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for PrimitiveArray, expected {} got {}", + values.len(), + n.len(), + ))); + } } - Self { + Ok(Self { data_type, values, nulls, - } + }) + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts(self) -> (DataType, ScalarBuffer, Option) { + (self.data_type, self.values, self.nulls) } /// Asserts that `data_type` is compatible with `Self` @@ -2262,4 +2293,33 @@ mod tests { let array = array.with_timezone("+02:00"); assert_eq!(array.timezone(), Some("+02:00")); } + + #[test] + fn test_try_new() { + Int32Array::new(DataType::Int32, vec![1, 2, 3, 4].into(), 
None); + Int32Array::new( + DataType::Int32, + vec![1, 2, 3, 4].into(), + Some(NullBuffer::new_null(4)), + ); + let err = Int32Array::try_new(DataType::Int64, vec![1, 2, 3, 4].into(), None) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: PrimitiveArray expected data type Int32 got Int64" + ); + + let err = Int32Array::try_new( + DataType::Int32, + vec![1, 2, 3, 4].into(), + Some(NullBuffer::new_null(3)), + ) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Incorrect length of null buffer for PrimitiveArray, expected 4 got 3" + ); + } } From 6665512deace028c685660fc2e1f968c0a58b1d0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 18 Apr 2023 09:44:34 -0400 Subject: [PATCH 0833/1411] Use Into> for PrimitiveArray::with_timezone (#4097) * Use Into> for PrimitiveArray::with_timezone * Update test --- arrow-array/src/array/primitive_array.rs | 2 +- arrow-json/src/writer.rs | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index febafcc6f02a..187d7617a0bb 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1102,7 +1102,7 @@ impl PrimitiveArray { } /// Construct a timestamp array with new timezone - pub fn with_timezone(&self, timezone: impl Into) -> Self { + pub fn with_timezone(&self, timezone: impl Into>) -> Self { self.with_timezone_opt(Some(timezone.into())) } diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 60b212101e58..a096590ec058 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -787,12 +787,12 @@ mod tests { let arr_secs = TimestampSecondArray::from(vec![Some(ts_secs), None]); let arr_names = StringArray::from(vec![Some("a"), Some("b")]); - let tz = "+00:00".to_string(); + let tz = "+00:00"; - let arr_nanos = arr_nanos.with_timezone(&tz); - let arr_micros = arr_micros.with_timezone(&tz); - let arr_millis = arr_millis.with_timezone(&tz); - let arr_secs = arr_secs.with_timezone(&tz); + let arr_nanos = arr_nanos.with_timezone(tz); + let arr_micros = arr_micros.with_timezone(tz); + let arr_millis = arr_millis.with_timezone(tz); + let arr_secs = arr_secs.with_timezone(tz); let schema = Schema::new(vec![ Field::new("nanos", arr_nanos.data_type().clone(), true), From 4ec95c2972b2d49421fce66e20dd2df18c559658 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 18 Apr 2023 09:45:35 -0400 Subject: [PATCH 0834/1411] Serialize numeric to tape (#4069) (#4073) --- arrow-json/src/reader/mod.rs | 23 +++++++++++++++++++++++ arrow-json/src/reader/serializer.rs | 10 +++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index 603a0cd7e602..4abcb1ea75ba 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -634,6 +634,7 @@ fn make_decoder( #[cfg(test)] mod tests { + use serde_json::json; use std::fs::File; use std::io::{BufReader, Cursor, Seek}; use std::sync::Arc; @@ -1976,4 +1977,26 @@ mod tests { "Json error: whilst decoding field 'a': whilst decoding field 'child': expected primitive got [123, 3465346]" ); } + + #[test] + fn test_serialize_timestamp() { + let json = vec![ + json!({"timestamp": 1681319393}), + json!({"timestamp": "1970-01-01T00:00:00+02:00"}), + ]; + let schema = Schema::new(vec![Field::new( + "timestamp", + 
DataType::Timestamp(TimeUnit::Second, None), + true, + )]); + let mut decoder = ReaderBuilder::new(Arc::new(schema)) + .build_decoder() + .unwrap(); + decoder.serialize(&json).unwrap(); + let batch = decoder.flush().unwrap().unwrap(); + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 1); + let values = batch.column(0).as_primitive::(); + assert_eq!(values.values(), &[1681319393, -7200]); + } } diff --git a/arrow-json/src/reader/serializer.rs b/arrow-json/src/reader/serializer.rs index a68d1d5476c4..2aa72de943f7 100644 --- a/arrow-json/src/reader/serializer.rs +++ b/arrow-json/src/reader/serializer.rs @@ -68,6 +68,13 @@ impl<'a> TapeSerializer<'a> { offsets, } } + + fn serialize_number(&mut self, v: &[u8]) { + self.bytes.extend_from_slice(v); + let idx = self.offsets.len() - 1; + self.elements.push(TapeElement::Number(idx as _)); + self.offsets.push(self.bytes.len()); + } } /// The tape stores all values as strings, and so must serialize numeric types @@ -81,7 +88,8 @@ macro_rules! serialize_numeric { ($s:ident, $t:ty, $v:ident) => {{ let mut buffer = [0_u8; <$t>::FORMATTED_SIZE]; let s = lexical_core::write($v, &mut buffer); - $s.serialize_bytes(s) + $s.serialize_number(s); + Ok(()) }}; } From c20f637744540843419b7057d1ac4eaccf4cfaff Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Apr 2023 09:02:26 -0700 Subject: [PATCH 0835/1411] Update tonic-build requirement from =0.9.1 to =0.9.2 (#4099) Updates the requirements on [tonic-build](https://github.com/hyperium/tonic) to permit the latest version. - [Release notes](https://github.com/hyperium/tonic/releases) - [Changelog](https://github.com/hyperium/tonic/blob/master/CHANGELOG.md) - [Commits](https://github.com/hyperium/tonic/compare/v0.9.1...v0.9.2) --- updated-dependencies: - dependency-name: tonic-build dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 85dd3366d2cc..59144870a9e7 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -34,4 +34,4 @@ publish = false # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.56", default-features = false } prost-build = { version = "=0.11.9", default-features = false } -tonic-build = { version = "=0.9.1", default-features = false, features = ["transport", "prost"] } +tonic-build = { version = "=0.9.2", default-features = false, features = ["transport", "prost"] } From 2f68c483b936eea9c27fdd3b029894bbe8179e09 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 19 Apr 2023 08:12:34 -0400 Subject: [PATCH 0836/1411] Remove DataType from PrimitiveArray constructors (#4098) --- arrow-arith/src/arithmetic.rs | 4 +- arrow-arith/src/arity.rs | 6 +- arrow-array/src/array/list_array.rs | 6 +- arrow-array/src/array/primitive_array.rs | 76 +++++++++++------------- 4 files changed, 44 insertions(+), 48 deletions(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 4c1bad4d2e5d..acd0b551c173 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -129,7 +129,7 @@ where unsafe { arrow_buffer::Buffer::try_from_trusted_len_iter(values) } }?; - Ok(PrimitiveArray::new(T::DATA_TYPE, buffer.into(), nulls)) + Ok(PrimitiveArray::new(buffer.into(), nulls)) } /// Calculates the modulus operation `left % right` on two SIMD inputs. @@ -356,7 +356,7 @@ where } } - Ok(PrimitiveArray::new(T::DATA_TYPE, result.into(), nulls)) + Ok(PrimitiveArray::new(result.into(), nulls)) } /// Applies $OP to $LEFT and $RIGHT which are two dictionaries which have (the same) key type $KT diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 2f1f6c345b32..ce766aff66f7 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -208,7 +208,7 @@ where // Soundness // `values` is an iterator with a known size from a PrimitiveArray let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - Ok(PrimitiveArray::new(O::DATA_TYPE, buffer.into(), nulls)) + Ok(PrimitiveArray::new(buffer.into(), nulls)) } /// Given two arrays of length `len`, calls `op(a[i], b[i])` for `i` in `0..len`, mutating @@ -312,7 +312,7 @@ where })?; let values = buffer.finish().into(); - Ok(PrimitiveArray::new(O::DATA_TYPE, values, Some(nulls))) + Ok(PrimitiveArray::new(values, Some(nulls))) } } @@ -396,7 +396,7 @@ where buffer.push_unchecked(op(a.value_unchecked(idx), b.value_unchecked(idx))?); }; } - Ok(PrimitiveArray::new(O::DATA_TYPE, buffer.into(), None)) + Ok(PrimitiveArray::new(buffer.into(), None)) } /// This intentional inline(never) attribute helps LLVM optimize the loop. 
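For reference, a minimal sketch of the two-argument constructor shape that the hunks above migrate to, where the data type is implied by the concrete array type. Only `Int32Array::new`, `Int32Array::try_new` and `NullBuffer::new_null` come from the patch itself; the helper function name and the sample values are illustrative.

```
use arrow_array::{Array, Int32Array};
use arrow_buffer::NullBuffer;

fn primitive_new_sketch() {
    // The data type is implied by the concrete array type, so only the value
    // buffer and the optional null buffer are supplied.
    let no_nulls = Int32Array::new(vec![1, 2, 3, 4].into(), None);
    assert_eq!(no_nulls.len(), 4);

    // A supplied null buffer must have the same length as the values;
    // otherwise `try_new` returns an InvalidArgumentError instead of panicking.
    let all_null = Int32Array::new(vec![1, 2, 3, 4].into(), Some(NullBuffer::new_null(4)));
    assert_eq!(all_null.null_count(), 4);
}
```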
diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index d5e0c365b8e6..7f6f54e4ccc3 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -1120,7 +1120,7 @@ mod tests { #[test] fn test_try_new() { let offsets = OffsetBuffer::new(vec![0, 1, 4, 5].into()); - let values = Int32Array::new(DataType::Int32, vec![1, 2, 3, 4, 5].into(), None); + let values = Int32Array::new(vec![1, 2, 3, 4, 5].into(), None); let values = Arc::new(values) as ArrayRef; let field = Arc::new(Field::new("element", DataType::Int32, false)); @@ -1151,7 +1151,7 @@ mod tests { ); let nulls = NullBuffer::new_null(7); - let values = Int64Array::new(DataType::Int64, vec![0; 7].into(), Some(nulls)); + let values = Int64Array::new(vec![0; 7].into(), Some(nulls)); let values = Arc::new(values); let err = LargeListArray::try_new(field, offsets.clone(), values.clone(), None) @@ -1165,7 +1165,7 @@ mod tests { let field = Arc::new(Field::new("element", DataType::Int64, true)); LargeListArray::new(field.clone(), offsets.clone(), values, None); - let values = Int64Array::new(DataType::Int64, vec![0; 2].into(), None); + let values = Int64Array::new(vec![0; 2].into(), None); let err = LargeListArray::try_new(field, offsets, Arc::new(values), None).unwrap_err(); diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 187d7617a0bb..9fb78eb1459d 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -270,12 +270,8 @@ impl PrimitiveArray { /// # Panics /// /// Panics if [`Self::try_new`] returns an error - pub fn new( - data_type: DataType, - values: ScalarBuffer, - nulls: Option, - ) -> Self { - Self::try_new(data_type, values, nulls).unwrap() + pub fn new(values: ScalarBuffer, nulls: Option) -> Self { + Self::try_new(values, nulls).unwrap() } /// Create a new [`PrimitiveArray`] from the provided data_type, values, nulls @@ -284,20 +280,10 @@ impl PrimitiveArray { /// /// Errors if: /// - `values.len() != nulls.len()` - /// - `!Self::is_compatible(data_type)` pub fn try_new( - data_type: DataType, values: ScalarBuffer, nulls: Option, ) -> Result { - if !Self::is_compatible(&data_type) { - return Err(ArrowError::InvalidArgumentError(format!( - "PrimitiveArray expected data type {} got {}", - T::DATA_TYPE, - data_type - ))); - } - if let Some(n) = nulls.as_ref() { if n.len() != values.len() { return Err(ArrowError::InvalidArgumentError(format!( @@ -309,7 +295,7 @@ impl PrimitiveArray { } Ok(Self { - data_type, + data_type: T::DATA_TYPE, values, nulls, }) @@ -320,6 +306,19 @@ impl PrimitiveArray { (self.data_type, self.values, self.nulls) } + /// Overrides the [`DataType`] of this [`PrimitiveArray`] + /// + /// Prefer using [`Self::with_timezone`] or [`Self::with_precision_and_scale`] where + /// the primitive type is suitably constrained, as these cannot panic + /// + /// # Panics + /// + /// Panics if ![Self::is_compatible] + pub fn with_data_type(self, data_type: DataType) -> Self { + Self::assert_compatible(&data_type); + Self { data_type, ..self } + } + /// Asserts that `data_type` is compatible with `Self` fn assert_compatible(data_type: &DataType) { assert!( @@ -406,7 +405,7 @@ impl PrimitiveArray { pub fn from_value(value: T::Native, count: usize) -> Self { unsafe { let val_buf = Buffer::from_trusted_len_iter((0..count).map(|_| value)); - Self::new(T::DATA_TYPE, val_buf.into(), None) + Self::new(val_buf.into(), None) } } @@ -498,7 +497,7 @@ impl PrimitiveArray { // 
Soundness // `values` is an iterator with a known size because arrays are sized. let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - PrimitiveArray::new(O::DATA_TYPE, buffer.into(), nulls) + PrimitiveArray::new(buffer.into(), nulls) } /// Applies an unary and infallible function to a mutable primitive array. @@ -561,7 +560,7 @@ impl PrimitiveArray { } let values = buffer.finish().into(); - Ok(PrimitiveArray::new(O::DATA_TYPE, values, nulls)) + Ok(PrimitiveArray::new(values, nulls)) } /// Applies an unary and fallible function to all valid values in a mutable primitive array. @@ -646,7 +645,7 @@ impl PrimitiveArray { let nulls = BooleanBuffer::new(null_builder.finish(), 0, len); let values = buffer.finish().into(); let nulls = unsafe { NullBuffer::new_unchecked(nulls, out_null_count) }; - PrimitiveArray::new(O::DATA_TYPE, values, Some(nulls)) + PrimitiveArray::new(values, Some(nulls)) } /// Returns `PrimitiveBuilder` of this primitive array for mutating its values if the underlying @@ -1292,6 +1291,7 @@ mod tests { use crate::builder::{Decimal128Builder, Decimal256Builder}; use crate::cast::downcast_array; use crate::{ArrayRef, BooleanArray}; + use arrow_schema::TimeUnit; use std::sync::Arc; #[test] @@ -2296,30 +2296,26 @@ mod tests { #[test] fn test_try_new() { - Int32Array::new(DataType::Int32, vec![1, 2, 3, 4].into(), None); - Int32Array::new( - DataType::Int32, - vec![1, 2, 3, 4].into(), - Some(NullBuffer::new_null(4)), - ); - let err = Int32Array::try_new(DataType::Int64, vec![1, 2, 3, 4].into(), None) - .unwrap_err(); - - assert_eq!( - err.to_string(), - "Invalid argument error: PrimitiveArray expected data type Int32 got Int64" - ); + Int32Array::new(vec![1, 2, 3, 4].into(), None); + Int32Array::new(vec![1, 2, 3, 4].into(), Some(NullBuffer::new_null(4))); - let err = Int32Array::try_new( - DataType::Int32, - vec![1, 2, 3, 4].into(), - Some(NullBuffer::new_null(3)), - ) - .unwrap_err(); + let err = + Int32Array::try_new(vec![1, 2, 3, 4].into(), Some(NullBuffer::new_null(3))) + .unwrap_err(); assert_eq!( err.to_string(), "Invalid argument error: Incorrect length of null buffer for PrimitiveArray, expected 4 got 3" ); + + TimestampNanosecondArray::new(vec![1, 2, 3, 4].into(), None).with_data_type( + DataType::Timestamp(TimeUnit::Nanosecond, Some("03:00".into())), + ); + } + + #[test] + #[should_panic(expected = "PrimitiveArray expected data type Int32 got Date32")] + fn test_with_data_type() { + Int32Array::new(vec![1, 2, 3, 4].into(), None).with_data_type(DataType::Date32); } } From 1097203b1650fc2cfbb822d04b47610d84100481 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 19 Apr 2023 09:05:05 -0400 Subject: [PATCH 0837/1411] Add ByteArray constructors (#3879) (#4081) * Add ByteArray constructors (#3879) * Clippy * Make ListArray error message consistent * Review feedback --- arrow-array/src/array/byte_array.rs | 142 +++++++++++++++++++++++++- arrow-array/src/array/list_array.rs | 4 +- arrow-array/src/array/string_array.rs | 28 +---- arrow-array/src/types.rs | 52 +++++++++- 4 files changed, 197 insertions(+), 29 deletions(-) diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index e23079ef9be9..12f9aab674e8 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -21,10 +21,10 @@ use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; use crate::types::ByteArrayType; use crate::{Array, ArrayAccessor, ArrayRef, 
OffsetSizeTrait}; -use arrow_buffer::{ArrowNativeType, Buffer}; +use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; use arrow_buffer::{NullBuffer, OffsetBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::DataType; +use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; @@ -60,6 +60,87 @@ impl GenericByteArray { /// Data type of the array. pub const DATA_TYPE: DataType = T::DATA_TYPE; + /// Create a new [`GenericByteArray`] from the provided parts, panicking on failure + /// + /// # Panics + /// + /// Panics if [`GenericByteArray::try_new`] returns an error + pub fn new( + offsets: OffsetBuffer, + values: Buffer, + nulls: Option, + ) -> Self { + Self::try_new(offsets, values, nulls).unwrap() + } + + /// Create a new [`GenericByteArray`] from the provided parts, returning an error on failure + /// + /// # Errors + /// + /// * `offsets.len() - 1 != nulls.len()` + /// * Any consecutive pair of `offsets` does not denote a valid slice of `values` + pub fn try_new( + offsets: OffsetBuffer, + values: Buffer, + nulls: Option, + ) -> Result { + let len = offsets.len() - 1; + + // Verify that each pair of offsets is a valid slices of values + T::validate(&offsets, &values)?; + + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for {}{}Array, expected {len} got {}", + T::Offset::PREFIX, + T::PREFIX, + n.len(), + ))); + } + } + + Ok(Self { + data_type: T::DATA_TYPE, + value_offsets: offsets, + value_data: values, + nulls, + }) + } + + /// Create a new [`GenericByteArray`] from the provided parts, without validation + /// + /// # Safety + /// + /// Safe if [`Self::try_new`] would not error + pub fn new_unchecked( + offsets: OffsetBuffer, + values: Buffer, + nulls: Option, + ) -> Self { + Self { + data_type: T::DATA_TYPE, + value_offsets: offsets, + value_data: values, + nulls, + } + } + + /// Create a new [`GenericByteArray`] of length `len` where all values are null + pub fn new_null(len: usize) -> Self { + Self { + data_type: T::DATA_TYPE, + value_offsets: OffsetBuffer::new_zeroed(len), + value_data: MutableBuffer::new(0).into(), + nulls: Some(NullBuffer::new_null(len)), + } + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts(self) -> (OffsetBuffer, Buffer, Option) { + (self.value_offsets, self.value_data, self.nulls) + } + /// Returns the length for value at index `i`. /// # Panics /// Panics if index `i` is out of bounds. 
@@ -374,3 +455,60 @@ impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray { ArrayIter::new(self) } } + +#[cfg(test)] +mod tests { + use crate::{BinaryArray, StringArray}; + use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer}; + + #[test] + fn try_new() { + let data = Buffer::from_slice_ref("helloworld"); + let offsets = OffsetBuffer::new(vec![0, 5, 10].into()); + StringArray::new(offsets.clone(), data.clone(), None); + + let nulls = NullBuffer::new_null(3); + let err = + StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone())) + .unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3"); + + let err = + BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3"); + + let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld"); + let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None) + .unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2"); + + BinaryArray::new(offsets, non_utf8_data, None); + + let offsets = OffsetBuffer::new(vec![0, 5, 11].into()); + let err = StringArray::try_new(offsets.clone(), data.clone(), None).unwrap_err(); + assert_eq!( + err.to_string(), + "Invalid argument error: Offset of 11 exceeds length of values 10" + ); + + let err = BinaryArray::try_new(offsets.clone(), data, None).unwrap_err(); + assert_eq!( + err.to_string(), + "Invalid argument error: Maximum offset of 11 is larger than values of length 10" + ); + + let non_ascii_data = Buffer::from_slice_ref("heìloworld"); + StringArray::new(offsets.clone(), non_ascii_data.clone(), None); + BinaryArray::new(offsets, non_ascii_data.clone(), None); + + let offsets = OffsetBuffer::new(vec![0, 3, 10].into()); + let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None) + .unwrap_err(); + assert_eq!( + err.to_string(), + "Invalid argument error: Split UTF-8 codepoint at offset 3" + ); + + BinaryArray::new(offsets, non_ascii_data, None); + } +} diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 7f6f54e4ccc3..f4e5b4b79c77 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -109,7 +109,7 @@ impl GenericListArray { if let Some(n) = nulls.as_ref() { if n.len() != len { return Err(ArrowError::InvalidArgumentError(format!( - "Incorrect number of nulls for {}ListArray, expected {len} got {}", + "Incorrect length of null buffer for {}ListArray, expected {len} got {}", OffsetSize::PREFIX, n.len(), ))); @@ -1137,7 +1137,7 @@ mod tests { assert_eq!( err.to_string(), - "Invalid argument error: Incorrect number of nulls for LargeListArray, expected 4 got 3" + "Invalid argument error: Incorrect length of null buffer for LargeListArray, expected 4 got 3" ); let field = Arc::new(Field::new("element", DataType::Int64, false)); diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index e042f29c22d1..7c4a375299db 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -16,9 +16,7 @@ // under the License. 
use crate::types::GenericStringType; -use crate::{ - Array, GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait, -}; +use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait}; use arrow_buffer::{bit_util, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; @@ -105,27 +103,8 @@ impl GenericStringArray { pub fn try_from_binary( v: GenericBinaryArray, ) -> Result { - let offsets = v.value_offsets(); - let values = v.value_data(); - - // We only need to validate that all values are valid UTF-8 - let validated = std::str::from_utf8(values).map_err(|e| { - ArrowError::CastError(format!("Encountered non UTF-8 data: {e}")) - })?; - - for offset in offsets.iter() { - let o = offset.as_usize(); - if !validated.is_char_boundary(o) { - return Err(ArrowError::CastError(format!( - "Split UTF-8 codepoint at offset {o}" - ))); - } - } - - let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE); - // SAFETY: - // Validated UTF-8 above - Ok(Self::from(unsafe { builder.build_unchecked() })) + let (offsets, values, nulls) = v.into_parts(); + Self::try_new(offsets, values, nulls) } } @@ -261,6 +240,7 @@ mod tests { use super::*; use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder}; use crate::types::UInt8Type; + use crate::Array; use arrow_buffer::Buffer; use arrow_schema::Field; use std::sync::Arc; diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index cec78db9fc70..b50018ca9751 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -19,7 +19,7 @@ use crate::delta::shift_months; use crate::{ArrowNativeTypeOp, OffsetSizeTrait}; -use arrow_buffer::i256; +use arrow_buffer::{i256, Buffer, OffsetBuffer}; use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision}; use arrow_schema::{ ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, @@ -1526,10 +1526,18 @@ pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed { /// Utf8Array will have native type has &str /// BinaryArray will have type as [u8] type Native: bytes::ByteArrayNativeType + AsRef + AsRef<[u8]> + ?Sized; + /// "Binary" or "String", for use in error messages const PREFIX: &'static str; + /// Datatype of array elements const DATA_TYPE: DataType; + + /// Verifies that every consecutive pair of `offsets` denotes a valid slice of `values` + fn validate( + offsets: &OffsetBuffer, + values: &Buffer, + ) -> Result<(), ArrowError>; } /// [`ByteArrayType`] for string arrays @@ -1547,6 +1555,33 @@ impl ByteArrayType for GenericStringType { } else { DataType::Utf8 }; + + fn validate( + offsets: &OffsetBuffer, + values: &Buffer, + ) -> Result<(), ArrowError> { + // Verify that the slice as a whole is valid UTF-8 + let validated = std::str::from_utf8(values).map_err(|e| { + ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}")) + })?; + + // Verify each offset is at a valid character boundary in this UTF-8 array + for offset in offsets.iter() { + let o = offset.as_usize(); + if !validated.is_char_boundary(o) { + if o < validated.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Split UTF-8 codepoint at offset {o}" + ))); + } + return Err(ArrowError::InvalidArgumentError(format!( + "Offset of {o} exceeds length of values {}", + validated.len() + ))); + } + } + Ok(()) + } } /// An arrow utf8 array with i32 offsets @@ -1569,6 +1604,21 @@ impl ByteArrayType for GenericBinaryType { } else { DataType::Binary }; + + fn validate( + 
offsets: &OffsetBuffer, + values: &Buffer, + ) -> Result<(), ArrowError> { + // offsets are guaranteed to be monotonically increasing and non-empty + let max_offset = offsets.last().unwrap().as_usize(); + if values.len() < max_offset { + return Err(ArrowError::InvalidArgumentError(format!( + "Maximum offset of {max_offset} is larger than values of length {}", + values.len() + ))); + } + Ok(()) + } } /// An arrow binary array with i32 offsets From 93484a10d145617434432d610e241640a06b382f Mon Sep 17 00:00:00 2001 From: Rin Arakaki Date: Thu, 20 Apr 2023 06:33:22 +0900 Subject: [PATCH 0838/1411] Update writer.rs (#4100) --- parquet/src/file/writer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 4983ed55f8f6..9923970bedde 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -544,7 +544,7 @@ pub struct SerializedColumnWriter<'a> { } impl<'a> SerializedColumnWriter<'a> { - /// Create a new [`SerializedColumnWriter`] from a `[`ColumnWriter`] and an + /// Create a new [`SerializedColumnWriter`] from a [`ColumnWriter`] and an /// optional callback to be invoked on [`Self::close`] pub fn new( inner: ColumnWriter<'a>, @@ -563,7 +563,7 @@ impl<'a> SerializedColumnWriter<'a> { get_typed_column_writer_mut(&mut self.inner) } - /// Close this [`SerializedColumnWriter] + /// Close this [`SerializedColumnWriter`] pub fn close(mut self) -> Result<()> { let r = match self.inner { ColumnWriter::BoolColumnWriter(typed) => typed.close()?, From 88906d8b4261d28d4f2c76d163fba14628230400 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Fri, 21 Apr 2023 03:39:21 +0200 Subject: [PATCH 0839/1411] chore: format the test (#4104) * chore: format the test * chore: format the test --- arrow-string/src/like.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 383ac5fd11c6..f896f0c6c7d9 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -1244,7 +1244,7 @@ mod tests { "ffkoß", "😃sadlksFFkoSSsh😃klF", // Original was case insensitive "😃sadlksffkosSsh😃klF" "😱slgFFkoSSsh😃klF", // Original was case insensitive "😱slgffkosSsh😃klF" - "FFkoSS", // "FFKoSS" + "FFkoSS", // "FFKoSS" ], "FFkoSS", contains_utf8_scalar, From bbd57c615213bc6e80fb0192674942f688e5f6a8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 21 Apr 2023 10:34:43 -0400 Subject: [PATCH 0840/1411] Prepare 38.0.0 (#4108) --- CHANGELOG-old.md | 93 +++++++++++++++++++++ CHANGELOG.md | 136 ++++++++++++++----------------- Cargo.toml | 32 ++++---- dev/release/update_change_log.sh | 4 +- 4 files changed, 170 insertions(+), 95 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index f1219e514675..e04f0f5d2762 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,99 @@ # Historical Changelog +## [37.0.0](https://github.com/apache/arrow-rs/tree/37.0.0) (2023-04-07) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/36.0.0...37.0.0) + +**Breaking changes:** + +- Fix timestamp handling in cast kernel \(\#1936\) \(\#4033\) [\#4034](https://github.com/apache/arrow-rs/pull/4034) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update tonic 0.9.1 [\#4011](https://github.com/apache/arrow-rs/pull/4011) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] 
([tustvold](https://github.com/tustvold)) +- Use FieldRef in DataType \(\#3955\) [\#3983](https://github.com/apache/arrow-rs/pull/3983) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Store Timezone as Arc\ [\#3976](https://github.com/apache/arrow-rs/pull/3976) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Panic instead of discarding nulls converting StructArray to RecordBatch - \(\#3951\) [\#3953](https://github.com/apache/arrow-rs/pull/3953) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix\(flight\_sql\): PreparedStatement has no token for auth. [\#3948](https://github.com/apache/arrow-rs/pull/3948) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([youngsofun](https://github.com/youngsofun)) +- Add Strongly Typed Array Slice \(\#3929\) [\#3930](https://github.com/apache/arrow-rs/pull/3930) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add Zero-Copy Conversion between Vec and MutableBuffer [\#3920](https://github.com/apache/arrow-rs/pull/3920) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Support Decimals cast to Utf8/LargeUtf [\#3991](https://github.com/apache/arrow-rs/issues/3991) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Date32/Date64 minus Interval [\#3962](https://github.com/apache/arrow-rs/issues/3962) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Reduce Cloning of Field [\#3955](https://github.com/apache/arrow-rs/issues/3955) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support Deserializing Serde DataTypes to Arrow [\#3949](https://github.com/apache/arrow-rs/issues/3949) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add multiply\_fixed\_point [\#3946](https://github.com/apache/arrow-rs/issues/3946) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Strongly Typed Array Slicing [\#3929](https://github.com/apache/arrow-rs/issues/3929) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Make it easier to match FlightSQL messages [\#3874](https://github.com/apache/arrow-rs/issues/3874) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support Casting Between Binary / LargeBinary and FixedSizeBinary [\#3826](https://github.com/apache/arrow-rs/issues/3826) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Incorrect Overflow Casting String to Timestamp [\#4033](https://github.com/apache/arrow-rs/issues/4033) +- f16::ZERO and f16::ONE are mixed up [\#4016](https://github.com/apache/arrow-rs/issues/4016) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Handle overflow precision when casting from integer to decimal 
[\#3995](https://github.com/apache/arrow-rs/issues/3995) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- PrimitiveDictionaryBuilder.finish should use actual value type [\#3971](https://github.com/apache/arrow-rs/issues/3971) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RecordBatch From StructArray Silently Discards Nulls [\#3952](https://github.com/apache/arrow-rs/issues/3952) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- I256 Checked Subtraction Overflows for i256::MINUS\_ONE [\#3942](https://github.com/apache/arrow-rs/issues/3942) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- I256 Checked Multiply Overflows for i256::MIN [\#3941](https://github.com/apache/arrow-rs/issues/3941) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Remove non-existent `js` feature from README [\#4000](https://github.com/apache/arrow-rs/issues/4000) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support take on MapArray [\#3875](https://github.com/apache/arrow-rs/issues/3875) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Prep for 37.0.0 [\#4031](https://github.com/apache/arrow-rs/pull/4031) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) +- Add RecordBatch::with\_schema [\#4028](https://github.com/apache/arrow-rs/pull/4028) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Only require compatible batch schema in ArrowWriter [\#4027](https://github.com/apache/arrow-rs/pull/4027) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add Fields::contains [\#4026](https://github.com/apache/arrow-rs/pull/4026) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: add methods "is\_positive" and "signum" to i256 [\#4024](https://github.com/apache/arrow-rs/pull/4024) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Deprecate Array::data \(\#3880\) [\#4019](https://github.com/apache/arrow-rs/pull/4019) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: add tests for ArrowNativeTypeOp [\#4018](https://github.com/apache/arrow-rs/pull/4018) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- fix: f16::ZERO and f16::ONE are mixed up [\#4017](https://github.com/apache/arrow-rs/pull/4017) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Minor: Float16Tensor [\#4013](https://github.com/apache/arrow-rs/pull/4013) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Add FlightSQL module docs and links to `arrow-flight` crates [\#4012](https://github.com/apache/arrow-rs/pull/4012) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Update proc-macro2 requirement from =1.0.54 to =1.0.56 [\#4008](https://github.com/apache/arrow-rs/pull/4008) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Cleanup Primitive take [\#4006](https://github.com/apache/arrow-rs/pull/4006) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Deprecate combine\_option\_bitmap [\#4005](https://github.com/apache/arrow-rs/pull/4005) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: add tests for BooleanBuffer [\#4004](https://github.com/apache/arrow-rs/pull/4004) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- feat: support to read/write customized metadata in ipc files [\#4003](https://github.com/apache/arrow-rs/pull/4003) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([framlog](https://github.com/framlog)) +- Cleanup more uses of Array::data \(\#3880\) [\#4002](https://github.com/apache/arrow-rs/pull/4002) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove js feature from README [\#4001](https://github.com/apache/arrow-rs/pull/4001) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([akazukin5151](https://github.com/akazukin5151)) +- feat: add the implementation BitXor to BooleanBuffer [\#3997](https://github.com/apache/arrow-rs/pull/3997) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Handle precision overflow when casting from integer to decimal [\#3996](https://github.com/apache/arrow-rs/pull/3996) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support CAST from Decimal datatype to String [\#3994](https://github.com/apache/arrow-rs/pull/3994) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Add Field Constructors for Complex Fields [\#3992](https://github.com/apache/arrow-rs/pull/3992) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- fix: remove unused type parameters. 
[\#3986](https://github.com/apache/arrow-rs/pull/3986) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([youngsofun](https://github.com/youngsofun)) +- Add UnionFields \(\#3955\) [\#3981](https://github.com/apache/arrow-rs/pull/3981) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup Fields Serde [\#3980](https://github.com/apache/arrow-rs/pull/3980) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support Rust structures --\> `RecordBatch` by adding `Serde` support to `RawDecoder` \(\#3949\) [\#3979](https://github.com/apache/arrow-rs/pull/3979) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Convert string\_to\_timestamp\_nanos to doctest [\#3978](https://github.com/apache/arrow-rs/pull/3978) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix documentation of string\_to\_timestamp\_nanos [\#3977](https://github.com/apache/arrow-rs/pull/3977) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([byteink](https://github.com/byteink)) +- add Date32/Date64 support to subtract\_dyn [\#3974](https://github.com/apache/arrow-rs/pull/3974) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([SinanGncgl](https://github.com/SinanGncgl)) +- PrimitiveDictionaryBuilder.finish should use actual value type [\#3972](https://github.com/apache/arrow-rs/pull/3972) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update proc-macro2 requirement from =1.0.53 to =1.0.54 [\#3968](https://github.com/apache/arrow-rs/pull/3968) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Async writer tweaks [\#3967](https://github.com/apache/arrow-rs/pull/3967) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix reading ipc files with unordered projections [\#3966](https://github.com/apache/arrow-rs/pull/3966) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([framlog](https://github.com/framlog)) +- Add Fields abstraction \(\#3955\) [\#3965](https://github.com/apache/arrow-rs/pull/3965) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- feat: cast between `Binary`/`LargeBinary` and `FixedSizeBinary` [\#3961](https://github.com/apache/arrow-rs/pull/3961) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- feat: support async writer \(\#1269\) [\#3957](https://github.com/apache/arrow-rs/pull/3957) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([ShiKaiWi](https://github.com/ShiKaiWi)) +- Add ListBuilder::append\_value \(\#3949\) [\#3954](https://github.com/apache/arrow-rs/pull/3954) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve array builder documentation \(\#3949\) [\#3951](https://github.com/apache/arrow-rs/pull/3951) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster i256 parsing 
[\#3950](https://github.com/apache/arrow-rs/pull/3950) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add multiply\_fixed\_point [\#3945](https://github.com/apache/arrow-rs/pull/3945) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- feat: enable metadata import/export through C data interface [\#3944](https://github.com/apache/arrow-rs/pull/3944) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Fix checked i256 arithmetic \(\#3942\) \(\#3941\) [\#3943](https://github.com/apache/arrow-rs/pull/3943) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Avoid memory copies in take\_list [\#3940](https://github.com/apache/arrow-rs/pull/3940) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster decimal parsing \(30-60%\) [\#3939](https://github.com/apache/arrow-rs/pull/3939) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) +- Fix: FlightSqlClient panic when execute\_update. [\#3938](https://github.com/apache/arrow-rs/pull/3938) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([youngsofun](https://github.com/youngsofun)) +- Cleanup row count handling in JSON writer [\#3934](https://github.com/apache/arrow-rs/pull/3934) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add typed buffers to UnionArray \(\#3880\) [\#3933](https://github.com/apache/arrow-rs/pull/3933) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: add take for MapArray [\#3925](https://github.com/apache/arrow-rs/pull/3925) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Deprecate Array::data\_ref \(\#3880\) [\#3923](https://github.com/apache/arrow-rs/pull/3923) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Zero-copy conversion from Vec to PrimitiveArray [\#3917](https://github.com/apache/arrow-rs/pull/3917) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: Add Commands enum to decode prost messages to strong type [\#3887](https://github.com/apache/arrow-rs/pull/3887) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([stuartcarnie](https://github.com/stuartcarnie)) ## [36.0.0](https://github.com/apache/arrow-rs/tree/36.0.0) (2023-03-24) [Full Changelog](https://github.com/apache/arrow-rs/compare/35.0.0...36.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index df0f088cdd0e..f5dfa46ea012 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,99 +19,81 @@ # Changelog -## [37.0.0](https://github.com/apache/arrow-rs/tree/37.0.0) (2023-04-07) +## [38.0.0](https://github.com/apache/arrow-rs/tree/38.0.0) (2023-04-21) -[Full Changelog](https://github.com/apache/arrow-rs/compare/36.0.0...37.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/37.0.0...38.0.0) **Breaking changes:** -- Fix timestamp handling in cast kernel \(\#1936\) \(\#4033\) [\#4034](https://github.com/apache/arrow-rs/pull/4034) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) -- Update tonic 0.9.1 [\#4011](https://github.com/apache/arrow-rs/pull/4011) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Use FieldRef in DataType \(\#3955\) [\#3983](https://github.com/apache/arrow-rs/pull/3983) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Store Timezone as Arc\ [\#3976](https://github.com/apache/arrow-rs/pull/3976) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Panic instead of discarding nulls converting StructArray to RecordBatch - \(\#3951\) [\#3953](https://github.com/apache/arrow-rs/pull/3953) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix\(flight\_sql\): PreparedStatement has no token for auth. [\#3948](https://github.com/apache/arrow-rs/pull/3948) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([youngsofun](https://github.com/youngsofun)) -- Add Strongly Typed Array Slice \(\#3929\) [\#3930](https://github.com/apache/arrow-rs/pull/3930) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add Zero-Copy Conversion between Vec and MutableBuffer [\#3920](https://github.com/apache/arrow-rs/pull/3920) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove DataType from PrimitiveArray constructors [\#4098](https://github.com/apache/arrow-rs/pull/4098) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use Into\\> for PrimitiveArray::with\_timezone [\#4097](https://github.com/apache/arrow-rs/pull/4097) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Store StructArray entries in MapArray [\#4085](https://github.com/apache/arrow-rs/pull/4085) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add DictionaryArray Constructors \(\#3879\) [\#4068](https://github.com/apache/arrow-rs/pull/4068) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Relax JSON schema inference generics [\#4063](https://github.com/apache/arrow-rs/pull/4063) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove ArrayData from Array \(\#3880\) [\#4061](https://github.com/apache/arrow-rs/pull/4061) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add CommandGetXdbcTypeInfo to Flight SQL Server [\#4055](https://github.com/apache/arrow-rs/pull/4055) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([c-thiel](https://github.com/c-thiel)) +- Remove old JSON Reader and Decoder \(\#3610\) 
[\#4052](https://github.com/apache/arrow-rs/pull/4052) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use BufRead for JSON Schema Inference [\#4041](https://github.com/apache/arrow-rs/pull/4041) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([WenyXu](https://github.com/WenyXu)) **Implemented enhancements:** -- Support Decimals cast to Utf8/LargeUtf [\#3991](https://github.com/apache/arrow-rs/issues/3991) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support Date32/Date64 minus Interval [\#3962](https://github.com/apache/arrow-rs/issues/3962) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Reduce Cloning of Field [\#3955](https://github.com/apache/arrow-rs/issues/3955) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Support Deserializing Serde DataTypes to Arrow [\#3949](https://github.com/apache/arrow-rs/issues/3949) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add multiply\_fixed\_point [\#3946](https://github.com/apache/arrow-rs/issues/3946) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Strongly Typed Array Slicing [\#3929](https://github.com/apache/arrow-rs/issues/3929) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Make it easier to match FlightSQL messages [\#3874](https://github.com/apache/arrow-rs/issues/3874) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Support Casting Between Binary / LargeBinary and FixedSizeBinary [\#3826](https://github.com/apache/arrow-rs/issues/3826) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support dyn\_compare\_scalar for Decimal256 [\#4083](https://github.com/apache/arrow-rs/issues/4083) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Better JSON Reader Error Messages [\#4076](https://github.com/apache/arrow-rs/issues/4076) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Additional data type groups [\#4056](https://github.com/apache/arrow-rs/issues/4056) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Async JSON reader [\#4043](https://github.com/apache/arrow-rs/issues/4043) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Field::contains Should Recurse into DataType [\#4029](https://github.com/apache/arrow-rs/issues/4029) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Prevent UnionArray with Repeated Type IDs [\#3982](https://github.com/apache/arrow-rs/issues/3982) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `Timestamp` `+`/`-` `Interval` types [\#3963](https://github.com/apache/arrow-rs/issues/3963) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- First-Class Array Abstractions [\#3880](https://github.com/apache/arrow-rs/issues/3880) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Fixed bugs:** -- Incorrect Overflow Casting String to Timestamp [\#4033](https://github.com/apache/arrow-rs/issues/4033) -- f16::ZERO and 
f16::ONE are mixed up [\#4016](https://github.com/apache/arrow-rs/issues/4016) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Handle overflow precision when casting from integer to decimal [\#3995](https://github.com/apache/arrow-rs/issues/3995) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- PrimitiveDictionaryBuilder.finish should use actual value type [\#3971](https://github.com/apache/arrow-rs/issues/3971) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- RecordBatch From StructArray Silently Discards Nulls [\#3952](https://github.com/apache/arrow-rs/issues/3952) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- I256 Checked Subtraction Overflows for i256::MINUS\_ONE [\#3942](https://github.com/apache/arrow-rs/issues/3942) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- I256 Checked Multiply Overflows for i256::MIN [\#3941](https://github.com/apache/arrow-rs/issues/3941) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Update readme to remove reference to Jira [\#4091](https://github.com/apache/arrow-rs/issues/4091) +- OffsetBuffer::new Rejects 0 Offsets [\#4066](https://github.com/apache/arrow-rs/issues/4066) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet AsyncArrowWriter not shutting down inner async writer. [\#4058](https://github.com/apache/arrow-rs/issues/4058) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Flight SQL Server missing command type.googleapis.com/arrow.flight.protocol.sql.CommandGetXdbcTypeInfo [\#4054](https://github.com/apache/arrow-rs/issues/4054) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- RawJsonReader Errors with Empty Schema [\#4053](https://github.com/apache/arrow-rs/issues/4053) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RawJsonReader Integer Truncation [\#4049](https://github.com/apache/arrow-rs/issues/4049) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Sparse UnionArray Equality Incorrect Offset Handling [\#4044](https://github.com/apache/arrow-rs/issues/4044) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Write blog about improvements in JSON and CSV processing [\#4062](https://github.com/apache/arrow-rs/issues/4062) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Closed issues:** -- Remove non-existent `js` feature from README [\#4000](https://github.com/apache/arrow-rs/issues/4000) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support take on MapArray [\#3875](https://github.com/apache/arrow-rs/issues/3875) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet reader of Int96 columns and coercion to timestamps [\#4075](https://github.com/apache/arrow-rs/issues/4075) +- Serializing timestamp from int \(json raw decoder\) [\#4069](https://github.com/apache/arrow-rs/issues/4069) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support casting to/from Interval and Duration [\#3998](https://github.com/apache/arrow-rs/issues/3998) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Prep for 37.0.0 [\#4031](https://github.com/apache/arrow-rs/pull/4031) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) -- Add RecordBatch::with\_schema [\#4028](https://github.com/apache/arrow-rs/pull/4028) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Only require compatible batch schema in ArrowWriter [\#4027](https://github.com/apache/arrow-rs/pull/4027) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add Fields::contains [\#4026](https://github.com/apache/arrow-rs/pull/4026) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Minor: add methods "is\_positive" and "signum" to i256 [\#4024](https://github.com/apache/arrow-rs/pull/4024) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Deprecate Array::data \(\#3880\) [\#4019](https://github.com/apache/arrow-rs/pull/4019) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat: add tests for ArrowNativeTypeOp [\#4018](https://github.com/apache/arrow-rs/pull/4018) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- fix: f16::ZERO and f16::ONE are mixed up [\#4017](https://github.com/apache/arrow-rs/pull/4017) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Minor: Float16Tensor [\#4013](https://github.com/apache/arrow-rs/pull/4013) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Add FlightSQL module docs and links to `arrow-flight` crates [\#4012](https://github.com/apache/arrow-rs/pull/4012) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Update proc-macro2 requirement from =1.0.54 to =1.0.56 [\#4008](https://github.com/apache/arrow-rs/pull/4008) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Cleanup Primitive take [\#4006](https://github.com/apache/arrow-rs/pull/4006) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Deprecate combine\_option\_bitmap [\#4005](https://github.com/apache/arrow-rs/pull/4005) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Minor: add tests for BooleanBuffer [\#4004](https://github.com/apache/arrow-rs/pull/4004) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- feat: support to read/write customized metadata in ipc files [\#4003](https://github.com/apache/arrow-rs/pull/4003) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([framlog](https://github.com/framlog)) -- Cleanup more uses of Array::data \(\#3880\) [\#4002](https://github.com/apache/arrow-rs/pull/4002) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Remove js feature from README [\#4001](https://github.com/apache/arrow-rs/pull/4001) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([akazukin5151](https://github.com/akazukin5151)) -- feat: add the implementation 
BitXor to BooleanBuffer [\#3997](https://github.com/apache/arrow-rs/pull/3997) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Handle precision overflow when casting from integer to decimal [\#3996](https://github.com/apache/arrow-rs/pull/3996) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Support CAST from Decimal datatype to String [\#3994](https://github.com/apache/arrow-rs/pull/3994) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) -- Add Field Constructors for Complex Fields [\#3992](https://github.com/apache/arrow-rs/pull/3992) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- fix: remove unused type parameters. [\#3986](https://github.com/apache/arrow-rs/pull/3986) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([youngsofun](https://github.com/youngsofun)) -- Add UnionFields \(\#3955\) [\#3981](https://github.com/apache/arrow-rs/pull/3981) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Cleanup Fields Serde [\#3980](https://github.com/apache/arrow-rs/pull/3980) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support Rust structures --\> `RecordBatch` by adding `Serde` support to `RawDecoder` \(\#3949\) [\#3979](https://github.com/apache/arrow-rs/pull/3979) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Convert string\_to\_timestamp\_nanos to doctest [\#3978](https://github.com/apache/arrow-rs/pull/3978) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix documentation of string\_to\_timestamp\_nanos [\#3977](https://github.com/apache/arrow-rs/pull/3977) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([byteink](https://github.com/byteink)) -- add Date32/Date64 support to subtract\_dyn [\#3974](https://github.com/apache/arrow-rs/pull/3974) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([SinanGncgl](https://github.com/SinanGncgl)) -- PrimitiveDictionaryBuilder.finish should use actual value type [\#3972](https://github.com/apache/arrow-rs/pull/3972) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Update proc-macro2 requirement from =1.0.53 to =1.0.54 [\#3968](https://github.com/apache/arrow-rs/pull/3968) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Async writer tweaks [\#3967](https://github.com/apache/arrow-rs/pull/3967) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Fix reading ipc files with unordered projections [\#3966](https://github.com/apache/arrow-rs/pull/3966) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([framlog](https://github.com/framlog)) -- Add Fields abstraction \(\#3955\) [\#3965](https://github.com/apache/arrow-rs/pull/3965) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- feat: cast between `Binary`/`LargeBinary` and `FixedSizeBinary` [\#3961](https://github.com/apache/arrow-rs/pull/3961) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- feat: support async writer \(\#1269\) [\#3957](https://github.com/apache/arrow-rs/pull/3957) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([ShiKaiWi](https://github.com/ShiKaiWi)) -- Add ListBuilder::append\_value \(\#3949\) [\#3954](https://github.com/apache/arrow-rs/pull/3954) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Improve array builder documentation \(\#3949\) [\#3951](https://github.com/apache/arrow-rs/pull/3951) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Faster i256 parsing [\#3950](https://github.com/apache/arrow-rs/pull/3950) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add multiply\_fixed\_point [\#3945](https://github.com/apache/arrow-rs/pull/3945) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- feat: enable metadata import/export through C data interface [\#3944](https://github.com/apache/arrow-rs/pull/3944) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) -- Fix checked i256 arithmetic \(\#3942\) \(\#3941\) [\#3943](https://github.com/apache/arrow-rs/pull/3943) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Avoid memory copies in take\_list [\#3940](https://github.com/apache/arrow-rs/pull/3940) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Faster decimal parsing \(30-60%\) [\#3939](https://github.com/apache/arrow-rs/pull/3939) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([spebern](https://github.com/spebern)) -- Fix: FlightSqlClient panic when execute\_update. 
[\#3938](https://github.com/apache/arrow-rs/pull/3938) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([youngsofun](https://github.com/youngsofun)) -- Cleanup row count handling in JSON writer [\#3934](https://github.com/apache/arrow-rs/pull/3934) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add typed buffers to UnionArray \(\#3880\) [\#3933](https://github.com/apache/arrow-rs/pull/3933) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat: add take for MapArray [\#3925](https://github.com/apache/arrow-rs/pull/3925) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) -- Deprecate Array::data\_ref \(\#3880\) [\#3923](https://github.com/apache/arrow-rs/pull/3923) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Zero-copy conversion from Vec to PrimitiveArray [\#3917](https://github.com/apache/arrow-rs/pull/3917) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat: Add Commands enum to decode prost messages to strong type [\#3887](https://github.com/apache/arrow-rs/pull/3887) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([stuartcarnie](https://github.com/stuartcarnie)) +- Fix Docs Typos [\#4100](https://github.com/apache/arrow-rs/pull/4100) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([rnarkk](https://github.com/rnarkk)) +- Update tonic-build requirement from =0.9.1 to =0.9.2 [\#4099](https://github.com/apache/arrow-rs/pull/4099) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Increase minimum chrono version to 0.4.24 [\#4093](https://github.com/apache/arrow-rs/pull/4093) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Simplify reference to GitHub issues [\#4092](https://github.com/apache/arrow-rs/pull/4092) ([bkmgit](https://github.com/bkmgit)) +- \[Minor\]: Add `Hash` trait to SortOptions. 
[\#4089](https://github.com/apache/arrow-rs/pull/4089) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mustafasrepo](https://github.com/mustafasrepo)) +- Include byte offsets in parquet-layout [\#4086](https://github.com/apache/arrow-rs/pull/4086) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- feat: Support dyn\_compare\_scalar for Decimal256 [\#4084](https://github.com/apache/arrow-rs/pull/4084) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Add ByteArray constructors \(\#3879\) [\#4081](https://github.com/apache/arrow-rs/pull/4081) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update prost-build requirement from =0.11.8 to =0.11.9 [\#4080](https://github.com/apache/arrow-rs/pull/4080) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Improve JSON decoder errors \(\#4076\) [\#4079](https://github.com/apache/arrow-rs/pull/4079) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix Timestamp Numeric Truncation in JSON Reader [\#4074](https://github.com/apache/arrow-rs/pull/4074) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Serialize numeric to tape \(\#4069\) [\#4073](https://github.com/apache/arrow-rs/pull/4073) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: Prevent UnionArray with Repeated Type IDs [\#4070](https://github.com/apache/arrow-rs/pull/4070) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Add PrimitiveArray::try\_new \(\#3879\) [\#4067](https://github.com/apache/arrow-rs/pull/4067) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add ListArray Constructors \(\#3879\) [\#4065](https://github.com/apache/arrow-rs/pull/4065) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Shutdown parquet async writer [\#4059](https://github.com/apache/arrow-rs/pull/4059) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kindly](https://github.com/kindly)) +- feat: additional data type groups [\#4057](https://github.com/apache/arrow-rs/pull/4057) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Fix precision loss in Raw JSON decoder \(\#4049\) [\#4051](https://github.com/apache/arrow-rs/pull/4051) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use lexical\_core in CSV and JSON parser \(~25% faster\) [\#4050](https://github.com/apache/arrow-rs/pull/4050) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add offsets accessors to variable length arrays \(\#3879\) [\#4048](https://github.com/apache/arrow-rs/pull/4048) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Document Async decoder usage \(\#4043\) \(\#78\) [\#4046](https://github.com/apache/arrow-rs/pull/4046) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix sparse union array equality 
\(\#4044\) [\#4045](https://github.com/apache/arrow-rs/pull/4045) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: DataType::contains support nested type [\#4042](https://github.com/apache/arrow-rs/pull/4042) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- feat: Support Timestamp +/- Interval types [\#4038](https://github.com/apache/arrow-rs/pull/4038) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Fix object\_store CI [\#4037](https://github.com/apache/arrow-rs/pull/4037) ([tustvold](https://github.com/tustvold)) +- feat: cast from/to interval and duration [\#4020](https://github.com/apache/arrow-rs/pull/4020) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) diff --git a/Cargo.toml b/Cargo.toml index 34a7951b3937..872bb2919f60 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ exclude = [ ] [workspace.package] -version = "37.0.0" +version = "38.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -76,18 +76,18 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "37.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "37.0.0", path = "./arrow-arith" } -arrow-array = { version = "37.0.0", path = "./arrow-array" } -arrow-buffer = { version = "37.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "37.0.0", path = "./arrow-cast" } -arrow-csv = { version = "37.0.0", path = "./arrow-csv" } -arrow-data = { version = "37.0.0", path = "./arrow-data" } -arrow-ipc = { version = "37.0.0", path = "./arrow-ipc" } -arrow-json = { version = "37.0.0", path = "./arrow-json" } -arrow-ord = { version = "37.0.0", path = "./arrow-ord" } -arrow-row = { version = "37.0.0", path = "./arrow-row" } -arrow-schema = { version = "37.0.0", path = "./arrow-schema" } -arrow-select = { version = "37.0.0", path = "./arrow-select" } -arrow-string = { version = "37.0.0", path = "./arrow-string" } -parquet = { version = "37.0.0", path = "./parquet", default-features = false } +arrow = { version = "38.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "38.0.0", path = "./arrow-arith" } +arrow-array = { version = "38.0.0", path = "./arrow-array" } +arrow-buffer = { version = "38.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "38.0.0", path = "./arrow-cast" } +arrow-csv = { version = "38.0.0", path = "./arrow-csv" } +arrow-data = { version = "38.0.0", path = "./arrow-data" } +arrow-ipc = { version = "38.0.0", path = "./arrow-ipc" } +arrow-json = { version = "38.0.0", path = "./arrow-json" } +arrow-ord = { version = "38.0.0", path = "./arrow-ord" } +arrow-row = { version = "38.0.0", path = "./arrow-row" } +arrow-schema = { version = "38.0.0", path = "./arrow-schema" } +arrow-select = { version = "38.0.0", path = "./arrow-select" } +arrow-string = { version = "38.0.0", path = "./arrow-string" } +parquet = { version = "38.0.0", path = "./parquet", default-features = false } diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 1293617c6f53..c1f3167e7934 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="36.0.0" -FUTURE_RELEASE="37.0.0" +SINCE_TAG="37.0.0" +FUTURE_RELEASE="38.0.0" SOURCE_DIR="$(cd "$(dirname 
"${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From b04968593c6ded2138ad8225aec7228225d78676 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 21 Apr 2023 11:03:53 -0400 Subject: [PATCH 0841/1411] Update regex-syntax requirement from 0.6.27 to 0.7.1 (#4107) Updates the requirements on [regex-syntax](https://github.com/rust-lang/regex) to permit the latest version. - [Release notes](https://github.com/rust-lang/regex/releases) - [Changelog](https://github.com/rust-lang/regex/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-lang/regex/commits) --- updated-dependencies: - dependency-name: regex-syntax dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-string/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index f24b17a5c89b..6e16e0163a36 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -40,7 +40,7 @@ arrow-schema = { workspace = true } arrow-array = { workspace = true } arrow-select = { workspace = true } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } -regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } +regex-syntax = { version = "0.7.1", default-features = false, features = ["unicode"] } [package.metadata.docs.rs] all-features = true From b99af3c3c07348e6c1c76095a659d9d0169ae09e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 21 Apr 2023 14:06:48 -0400 Subject: [PATCH 0842/1411] Fix object_store tests with latest aho_corasick (#4109) --- object_store/src/util.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/src/util.rs b/object_store/src/util.rs index 08bfd86d9f67..1ec63f219a20 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -229,7 +229,7 @@ mod tests { #[tokio::test] async fn test_coalesce_ranges() { let fetches = do_fetch(vec![], 0).await; - assert_eq!(fetches, vec![]); + assert!(fetches.is_empty()); let fetches = do_fetch(vec![0..3], 0).await; assert_eq!(fetches, vec![0..3]); From 6099864180b9a42472bd0a0a17d16a1b612902f4 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Mon, 24 Apr 2023 11:15:44 +0800 Subject: [PATCH 0843/1411] optimize cast for same decimal type and same scale (#4088) --- arrow-cast/src/cast.rs | 288 +++++++++++++++++++++++----------- arrow/benches/cast_kernels.rs | 7 + 2 files changed, 201 insertions(+), 94 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index bc37174b94f2..61a296e99a4f 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -851,7 +851,7 @@ pub fn cast_with_options( cast_primitive_to_list::(array, to, to_type, cast_options) } (Decimal128(_, s1), Decimal128(p2, s2)) => { - cast_decimal_to_decimal::( + cast_decimal_to_decimal_same_type::( array.as_primitive(), *s1, *p2, @@ -860,7 +860,7 @@ pub fn cast_with_options( ) } (Decimal256(_, s1), Decimal256(p2, s2)) => { - cast_decimal_to_decimal::( + cast_decimal_to_decimal_same_type::( array.as_primitive(), *s1, *p2, @@ -1292,16 +1292,16 @@ pub fn cast_with_options( cast_string_to_time64nanosecond::(array, cast_options) } Timestamp(TimeUnit::Second, to_tz) => { - cast_string_to_timestamp::(array, to_tz,cast_options) + cast_string_to_timestamp::(array, to_tz, 
cast_options) } Timestamp(TimeUnit::Millisecond, to_tz) => { cast_string_to_timestamp::(array, to_tz, cast_options) } Timestamp(TimeUnit::Microsecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz,cast_options) + cast_string_to_timestamp::(array, to_tz, cast_options) } Timestamp(TimeUnit::Nanosecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz,cast_options) + cast_string_to_timestamp::(array, to_tz, cast_options) } Interval(IntervalUnit::YearMonth) => { cast_string_to_year_month_interval::(array, cast_options) @@ -1385,7 +1385,7 @@ pub fn cast_with_options( cast_byte_container::(array) } FixedSizeBinary(size) => { - cast_binary_to_fixed_size_binary::(array,*size, cast_options) + cast_binary_to_fixed_size_binary::(array, *size, cast_options) } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", @@ -1876,12 +1876,12 @@ pub fn cast_with_options( }) } false => { - array.as_primitive::().try_unary::<_, Date64Type, _>( - |x| { - x.mul_checked(MILLISECONDS) - }, - )? - } + array.as_primitive::().try_unary::<_, Date64Type, _>( + |x| { + x.mul_checked(MILLISECONDS) + }, + )? + } }, )), (Timestamp(TimeUnit::Millisecond, _), Date64) => { @@ -1922,10 +1922,10 @@ pub fn cast_with_options( Ok(Arc::new( array.as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { - Ok(time_to_time64us(as_time_res_with_timezone::< - TimestampMillisecondType, - >(x, tz)?)) - })?, + Ok(time_to_time64us(as_time_res_with_timezone::< + TimestampMillisecondType, + >(x, tz)?)) + })?, )) } (Timestamp(TimeUnit::Millisecond, tz), Time64(TimeUnit::Nanosecond)) => { @@ -1933,10 +1933,10 @@ pub fn cast_with_options( Ok(Arc::new( array.as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { - Ok(time_to_time64ns(as_time_res_with_timezone::< - TimestampMillisecondType, - >(x, tz)?)) - })?, + Ok(time_to_time64ns(as_time_res_with_timezone::< + TimestampMillisecondType, + >(x, tz)?)) + })?, )) } (Timestamp(TimeUnit::Microsecond, tz), Time64(TimeUnit::Microsecond)) => { @@ -1944,10 +1944,10 @@ pub fn cast_with_options( Ok(Arc::new( array.as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { - Ok(time_to_time64us(as_time_res_with_timezone::< - TimestampMicrosecondType, - >(x, tz)?)) - })?, + Ok(time_to_time64us(as_time_res_with_timezone::< + TimestampMicrosecondType, + >(x, tz)?)) + })?, )) } (Timestamp(TimeUnit::Microsecond, tz), Time64(TimeUnit::Nanosecond)) => { @@ -1955,10 +1955,10 @@ pub fn cast_with_options( Ok(Arc::new( array.as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { - Ok(time_to_time64ns(as_time_res_with_timezone::< - TimestampMicrosecondType, - >(x, tz)?)) - })?, + Ok(time_to_time64ns(as_time_res_with_timezone::< + TimestampMicrosecondType, + >(x, tz)?)) + })?, )) } (Timestamp(TimeUnit::Nanosecond, tz), Time64(TimeUnit::Microsecond)) => { @@ -1966,10 +1966,10 @@ pub fn cast_with_options( Ok(Arc::new( array.as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { - Ok(time_to_time64us(as_time_res_with_timezone::< - TimestampNanosecondType, - >(x, tz)?)) - })?, + Ok(time_to_time64us(as_time_res_with_timezone::< + TimestampNanosecondType, + >(x, tz)?)) + })?, )) } (Timestamp(TimeUnit::Nanosecond, tz), Time64(TimeUnit::Nanosecond)) => { @@ -1977,10 +1977,10 @@ pub fn cast_with_options( Ok(Arc::new( array.as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { - Ok(time_to_time64ns(as_time_res_with_timezone::< - TimestampNanosecondType, - >(x, tz)?)) - 
})?, + Ok(time_to_time64ns(as_time_res_with_timezone::< + TimestampNanosecondType, + >(x, tz)?)) + })?, )) } (Timestamp(TimeUnit::Second, tz), Time32(TimeUnit::Second)) => { @@ -2021,10 +2021,10 @@ pub fn cast_with_options( Ok(Arc::new( array.as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { - Ok(time_to_time32ms(as_time_res_with_timezone::< - TimestampMillisecondType, - >(x, tz)?)) - })?, + Ok(time_to_time32ms(as_time_res_with_timezone::< + TimestampMillisecondType, + >(x, tz)?)) + })?, )) } (Timestamp(TimeUnit::Microsecond, tz), Time32(TimeUnit::Second)) => { @@ -2043,10 +2043,10 @@ pub fn cast_with_options( Ok(Arc::new( array.as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { - Ok(time_to_time32ms(as_time_res_with_timezone::< - TimestampMicrosecondType, - >(x, tz)?)) - })?, + Ok(time_to_time32ms(as_time_res_with_timezone::< + TimestampMicrosecondType, + >(x, tz)?)) + })?, )) } (Timestamp(TimeUnit::Nanosecond, tz), Time32(TimeUnit::Second)) => { @@ -2065,10 +2065,10 @@ pub fn cast_with_options( Ok(Arc::new( array.as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { - Ok(time_to_time32ms(as_time_res_with_timezone::< - TimestampNanosecondType, - >(x, tz)?)) - })?, + Ok(time_to_time32ms(as_time_res_with_timezone::< + TimestampNanosecondType, + >(x, tz)?)) + })?, )) } @@ -2222,20 +2222,17 @@ impl DecimalCast for i256 { } } -fn cast_decimal_to_decimal( - array: &PrimitiveArray, - input_scale: i8, +fn cast_decimal_to_decimal_error( output_precision: u8, output_scale: i8, - cast_options: &CastOptions, -) -> Result +) -> impl Fn(::Native) -> ArrowError where I: DecimalType, O: DecimalType, I::Native: DecimalCast + ArrowNativeTypeOp, O::Native: DecimalCast + ArrowNativeTypeOp, { - let error = |x| { + move |x: I::Native| { ArrowError::CastError(format!( "Cannot cast to {}({}, {}). 
Overflowing on {:?}", O::PREFIX, @@ -2243,45 +2240,148 @@ where output_scale, x )) - }; + } +} - let array: PrimitiveArray = if input_scale > output_scale { - let div = I::Native::from_decimal(10_i128) - .unwrap() - .pow_checked((input_scale - output_scale) as u32)?; +fn convert_to_smaller_scale_decimal( + array: &PrimitiveArray, + input_scale: i8, + output_precision: u8, + output_scale: i8, + cast_options: &CastOptions, +) -> Result, ArrowError> +where + I: DecimalType, + O: DecimalType, + I::Native: DecimalCast + ArrowNativeTypeOp, + O::Native: DecimalCast + ArrowNativeTypeOp, +{ + let error = cast_decimal_to_decimal_error::(output_precision, output_scale); + let div = I::Native::from_decimal(10_i128) + .unwrap() + .pow_checked((input_scale - output_scale) as u32)?; - let half = div.div_wrapping(I::Native::from_usize(2).unwrap()); - let half_neg = half.neg_wrapping(); + let half = div.div_wrapping(I::Native::from_usize(2).unwrap()); + let half_neg = half.neg_wrapping(); - let f = |x: I::Native| { - // div is >= 10 and so this cannot overflow - let d = x.div_wrapping(div); - let r = x.mod_wrapping(div); + let f = |x: I::Native| { + // div is >= 10 and so this cannot overflow + let d = x.div_wrapping(div); + let r = x.mod_wrapping(div); - // Round result - let adjusted = match x >= I::Native::ZERO { - true if r >= half => d.add_wrapping(I::Native::ONE), - false if r <= half_neg => d.sub_wrapping(I::Native::ONE), - _ => d, - }; - O::Native::from_decimal(adjusted) + // Round result + let adjusted = match x >= I::Native::ZERO { + true if r >= half => d.add_wrapping(I::Native::ONE), + false if r <= half_neg => d.sub_wrapping(I::Native::ONE), + _ => d, }; + O::Native::from_decimal(adjusted) + }; - match cast_options.safe { - true => array.unary_opt(f), - false => array.try_unary(|x| f(x).ok_or_else(|| error(x)))?, - } - } else { - let mul = O::Native::from_decimal(10_i128) - .unwrap() - .pow_checked((output_scale - input_scale) as u32)?; + Ok(match cast_options.safe { + true => array.unary_opt(f), + false => array.try_unary(|x| f(x).ok_or_else(|| error(x)))?, + }) +} - let f = |x| O::Native::from_decimal(x).and_then(|x| x.mul_checked(mul).ok()); +fn convert_to_bigger_or_equal_scale_decimal( + array: &PrimitiveArray, + input_scale: i8, + output_precision: u8, + output_scale: i8, + cast_options: &CastOptions, +) -> Result, ArrowError> +where + I: DecimalType, + O: DecimalType, + I::Native: DecimalCast + ArrowNativeTypeOp, + O::Native: DecimalCast + ArrowNativeTypeOp, +{ + let error = cast_decimal_to_decimal_error::(output_precision, output_scale); + let mul = O::Native::from_decimal(10_i128) + .unwrap() + .pow_checked((output_scale - input_scale) as u32)?; - match cast_options.safe { - true => array.unary_opt(f), - false => array.try_unary(|x| f(x).ok_or_else(|| error(x)))?, + let f = |x| O::Native::from_decimal(x).and_then(|x| x.mul_checked(mul).ok()); + + Ok(match cast_options.safe { + true => array.unary_opt(f), + false => array.try_unary(|x| f(x).ok_or_else(|| error(x)))?, + }) +} + +// Only support one type of decimal cast operations +fn cast_decimal_to_decimal_same_type( + array: &PrimitiveArray, + input_scale: i8, + output_precision: u8, + output_scale: i8, + cast_options: &CastOptions, +) -> Result +where + T: DecimalType, + T::Native: DecimalCast + ArrowNativeTypeOp, +{ + let array: PrimitiveArray = match input_scale.cmp(&output_scale) { + Ordering::Equal => { + // the scale doesn't change, the native value don't need to be changed + array.clone() } + Ordering::Greater => 
convert_to_smaller_scale_decimal::( + array, + input_scale, + output_precision, + output_scale, + cast_options, + )?, + Ordering::Less => { + // input_scale < output_scale + convert_to_bigger_or_equal_scale_decimal::( + array, + input_scale, + output_precision, + output_scale, + cast_options, + )? + } + }; + + Ok(Arc::new(array.with_precision_and_scale( + output_precision, + output_scale, + )?)) +} + +// Support two different types of decimal cast operations +fn cast_decimal_to_decimal( + array: &PrimitiveArray, + input_scale: i8, + output_precision: u8, + output_scale: i8, + cast_options: &CastOptions, +) -> Result +where + I: DecimalType, + O: DecimalType, + I::Native: DecimalCast + ArrowNativeTypeOp, + O::Native: DecimalCast + ArrowNativeTypeOp, +{ + let array: PrimitiveArray = if input_scale > output_scale { + convert_to_smaller_scale_decimal::( + array, + input_scale, + output_precision, + output_scale, + cast_options, + )? + } else { + convert_to_bigger_or_equal_scale_decimal::( + array, + input_scale, + output_precision, + output_scale, + cast_options, + )? }; Ok(Arc::new(array.with_precision_and_scale( @@ -7821,7 +7921,7 @@ mod tests { Decimal128Type::format_decimal( parse_string_to_decimal_native::("12345", 2).unwrap(), 38, - 2 + 2, ), "12345.00" ); @@ -7829,7 +7929,7 @@ mod tests { Decimal128Type::format_decimal( parse_string_to_decimal_native::("0.12345", 2).unwrap(), 38, - 2 + 2, ), "0.12" ); @@ -7837,7 +7937,7 @@ mod tests { Decimal128Type::format_decimal( parse_string_to_decimal_native::(".12345", 2).unwrap(), 38, - 2 + 2, ), "0.12" ); @@ -7845,7 +7945,7 @@ mod tests { Decimal128Type::format_decimal( parse_string_to_decimal_native::(".1265", 2).unwrap(), 38, - 2 + 2, ), "0.13" ); @@ -7853,7 +7953,7 @@ mod tests { Decimal128Type::format_decimal( parse_string_to_decimal_native::(".1265", 2).unwrap(), 38, - 2 + 2, ), "0.13" ); @@ -7862,7 +7962,7 @@ mod tests { Decimal256Type::format_decimal( parse_string_to_decimal_native::("123.45", 3).unwrap(), 38, - 3 + 3, ), "123.450" ); @@ -7870,7 +7970,7 @@ mod tests { Decimal256Type::format_decimal( parse_string_to_decimal_native::("12345", 3).unwrap(), 38, - 3 + 3, ), "12345.000" ); @@ -7878,7 +7978,7 @@ mod tests { Decimal256Type::format_decimal( parse_string_to_decimal_native::("0.12345", 3).unwrap(), 38, - 3 + 3, ), "0.123" ); @@ -7886,7 +7986,7 @@ mod tests { Decimal256Type::format_decimal( parse_string_to_decimal_native::(".12345", 3).unwrap(), 38, - 3 + 3, ), "0.123" ); @@ -7894,7 +7994,7 @@ mod tests { Decimal256Type::format_decimal( parse_string_to_decimal_native::(".1265", 3).unwrap(), 38, - 3 + 3, ), "0.127" ); diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index 7ef4d1d7e74a..933ddd4a06b4 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -230,6 +230,13 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function("cast decimal256 to decimal256 512", |b| { b.iter(|| cast_array(&decimal256_array, DataType::Decimal256(50, 5))) }); + + c.bench_function("cast decimal128 to decimal128 512 with same scale", |b| { + b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(30, 3))) + }); + c.bench_function("cast decimal256 to decimal256 512 with same scale", |b| { + b.iter(|| cast_array(&decimal256_array, DataType::Decimal256(60, 3))) + }); } criterion_group!(benches, add_benchmark); From abfe184831a105e34d9939070acbaa9fcbfe56f2 Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Tue, 25 Apr 2023 13:54:41 +0300 Subject: [PATCH 0844/1411] Ignore Field Metadata in equals_datatype 
for Dictionary, RunEndEncoded, Map and Union (#4111) * fix: equality of nested data types * fix: cargo clippy * fix: cargo fmt * feat: add tests with differing nullability --- arrow-schema/src/datatype.rs | 152 +++++++++++++++++++++++++++++++++-- 1 file changed, 146 insertions(+), 6 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 0bbd64f30abb..edd1dd09620e 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -431,7 +431,40 @@ impl DataType { ( DataType::Map(a_field, a_is_sorted), DataType::Map(b_field, b_is_sorted), - ) => a_field == b_field && a_is_sorted == b_is_sorted, + ) => { + a_field.is_nullable() == b_field.is_nullable() + && a_field.data_type().equals_datatype(b_field.data_type()) + && a_is_sorted == b_is_sorted + } + ( + DataType::Dictionary(a_key, a_value), + DataType::Dictionary(b_key, b_value), + ) => a_key.equals_datatype(b_key) && a_value.equals_datatype(b_value), + ( + DataType::RunEndEncoded(a_run_ends, a_values), + DataType::RunEndEncoded(b_run_ends, b_values), + ) => { + a_run_ends.is_nullable() == b_run_ends.is_nullable() + && a_run_ends + .data_type() + .equals_datatype(b_run_ends.data_type()) + && a_values.is_nullable() == b_values.is_nullable() + && a_values.data_type().equals_datatype(b_values.data_type()) + } + ( + DataType::Union(a_union_fields, a_union_mode), + DataType::Union(b_union_fields, b_union_mode), + ) => { + a_union_mode == b_union_mode + && a_union_fields.len() == b_union_fields.len() + && a_union_fields.iter().all(|a| { + b_union_fields.iter().any(|b| { + a.0 == b.0 + && a.1.is_nullable() == b.1.is_nullable() + && a.1.data_type().equals_datatype(b.1.data_type()) + }) + }) + } _ => self == other, } } @@ -564,7 +597,7 @@ pub const DECIMAL_DEFAULT_SCALE: i8 = 10; #[cfg(test)] mod tests { use super::*; - use crate::Field; + use crate::{Field, UnionMode}; #[test] #[cfg(feature = "serde")] @@ -628,10 +661,14 @@ mod tests { assert!(!list_b.equals_datatype(&list_c)); assert!(!list_a.equals_datatype(&list_d)); - let list_e = - DataType::FixedSizeList(Arc::new(Field::new("item", list_a, false)), 3); - let list_f = - DataType::FixedSizeList(Arc::new(Field::new("array", list_b, false)), 3); + let list_e = DataType::FixedSizeList( + Arc::new(Field::new("item", list_a.clone(), false)), + 3, + ); + let list_f = DataType::FixedSizeList( + Arc::new(Field::new("array", list_b.clone(), false)), + 3, + ); let list_g = DataType::FixedSizeList( Arc::new(Field::new("item", DataType::FixedSizeBinary(3), true)), 3, @@ -664,6 +701,109 @@ mod tests { assert!(!list_h.equals_datatype(&list_j)); assert!(!list_k.equals_datatype(&list_l)); assert!(list_k.equals_datatype(&list_m)); + + let list_n = + DataType::Map(Arc::new(Field::new("f1", list_a.clone(), true)), true); + let list_o = + DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), true); + let list_p = + DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), false); + let list_q = + DataType::Map(Arc::new(Field::new("f2", list_c.clone(), true)), true); + let list_r = + DataType::Map(Arc::new(Field::new("f1", list_a.clone(), false)), true); + + assert!(list_n.equals_datatype(&list_o)); + assert!(!list_n.equals_datatype(&list_p)); + assert!(!list_n.equals_datatype(&list_q)); + assert!(!list_n.equals_datatype(&list_r)); + + let list_s = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_a)); + let list_t = + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_b.clone())); + let list_u = 
DataType::Dictionary(Box::new(DataType::Int8), Box::new(list_b)); + let list_v = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_c)); + + assert!(list_s.equals_datatype(&list_t)); + assert!(!list_s.equals_datatype(&list_u)); + assert!(!list_s.equals_datatype(&list_v)); + + let union_a = DataType::Union( + UnionFields::new( + vec![1, 2], + vec![ + Field::new("f1", DataType::Utf8, false), + Field::new("f2", DataType::UInt8, false), + ], + ), + UnionMode::Sparse, + ); + let union_b = DataType::Union( + UnionFields::new( + vec![1, 2], + vec![ + Field::new("ff1", DataType::Utf8, false), + Field::new("ff2", DataType::UInt8, false), + ], + ), + UnionMode::Sparse, + ); + let union_c = DataType::Union( + UnionFields::new( + vec![2, 1], + vec![ + Field::new("fff2", DataType::UInt8, false), + Field::new("fff1", DataType::Utf8, false), + ], + ), + UnionMode::Sparse, + ); + let union_d = DataType::Union( + UnionFields::new( + vec![2, 1], + vec![ + Field::new("fff1", DataType::Int8, false), + Field::new("fff2", DataType::UInt8, false), + ], + ), + UnionMode::Sparse, + ); + let union_e = DataType::Union( + UnionFields::new( + vec![1, 2], + vec![ + Field::new("f1", DataType::Utf8, true), + Field::new("f2", DataType::UInt8, false), + ], + ), + UnionMode::Sparse, + ); + + assert!(union_a.equals_datatype(&union_b)); + assert!(union_a.equals_datatype(&union_c)); + assert!(!union_a.equals_datatype(&union_d)); + assert!(!union_a.equals_datatype(&union_e)); + + let list_w = DataType::RunEndEncoded( + Arc::new(Field::new("f1", DataType::Int64, true)), + Arc::new(Field::new("f2", DataType::Utf8, true)), + ); + let list_x = DataType::RunEndEncoded( + Arc::new(Field::new("ff1", DataType::Int64, true)), + Arc::new(Field::new("ff2", DataType::Utf8, true)), + ); + let list_y = DataType::RunEndEncoded( + Arc::new(Field::new("ff1", DataType::UInt16, true)), + Arc::new(Field::new("ff2", DataType::Utf8, true)), + ); + let list_z = DataType::RunEndEncoded( + Arc::new(Field::new("f1", DataType::Int64, false)), + Arc::new(Field::new("f2", DataType::Utf8, true)), + ); + + assert!(list_w.equals_datatype(&list_x)); + assert!(!list_w.equals_datatype(&list_y)); + assert!(!list_w.equals_datatype(&list_z)); } #[test] From 244cd92915901ab9b713182623d139d4aeec993c Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Tue, 25 Apr 2023 21:35:07 +0800 Subject: [PATCH 0845/1411] feat: add get_{ref, mut} to arrow_ipc Reader and Writer (#4122) Signed-off-by: Yilin Chen --- arrow-ipc/src/reader.rs | 28 ++++++++++++++++++++++++++++ arrow-ipc/src/writer.rs | 24 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index e41119937339..16cb99b920d9 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -1038,6 +1038,20 @@ impl FileReader { ))), } } + + /// Gets a reference to the underlying reader. + /// + /// It is inadvisable to directly read from the underlying reader. + pub fn get_ref(&self) -> &R { + self.reader.get_ref() + } + + /// Gets a mutable reference to the underlying reader. + /// + /// It is inadvisable to directly read from the underlying reader. + pub fn get_mut(&mut self) -> &mut R { + self.reader.get_mut() + } } impl Iterator for FileReader { @@ -1243,6 +1257,20 @@ impl StreamReader { )), } } + + /// Gets a reference to the underlying reader. + /// + /// It is inadvisable to directly read from the underlying reader. + pub fn get_ref(&self) -> &R { + self.reader.get_ref() + } + + /// Gets a mutable reference to the underlying reader. 
+ /// + /// It is inadvisable to directly read from the underlying reader. + pub fn get_mut(&mut self) -> &mut R { + self.reader.get_mut() + } } impl Iterator for StreamReader { diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 0c9ca17b7426..abaecea1faf2 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -832,6 +832,18 @@ impl FileWriter { Ok(()) } + /// Gets a reference to the underlying writer. + pub fn get_ref(&self) -> &W { + self.writer.get_ref() + } + + /// Gets a mutable reference to the underlying writer. + /// + /// It is inadvisable to directly write to the underlying writer. + pub fn get_mut(&mut self) -> &mut W { + self.writer.get_mut() + } + /// Unwraps the BufWriter housed in FileWriter.writer, returning the underlying /// writer /// @@ -920,6 +932,18 @@ impl StreamWriter { Ok(()) } + /// Gets a reference to the underlying writer. + pub fn get_ref(&self) -> &W { + self.writer.get_ref() + } + + /// Gets a mutable reference to the underlying writer. + /// + /// It is inadvisable to directly write to the underlying writer. + pub fn get_mut(&mut self) -> &mut W { + self.writer.get_mut() + } + /// Unwraps the BufWriter housed in StreamWriter.writer, returning the underlying /// writer /// From 31b86b1ae38ec0b6258c2a484f29e06a93c51612 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 25 Apr 2023 10:15:37 -0400 Subject: [PATCH 0846/1411] Fix flaky unknown_length_append (#4123) --- object_store/src/local.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/object_store/src/local.rs b/object_store/src/local.rs index d2553d46f244..286853da2eda 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -1406,6 +1406,7 @@ mod not_wasm_tests { let mut writer = integration.append(&location).await.unwrap(); writer.write_all(data.as_ref()).await.unwrap(); + writer.flush().await.unwrap(); let read_data = integration .get(&location) From f3b4a73de2e732445513257edf9d1395f4d9e624 Mon Sep 17 00:00:00 2001 From: zhenxing jiang Date: Tue, 25 Apr 2023 22:29:38 +0800 Subject: [PATCH 0847/1411] add support empty array (#4114) Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-json/src/reader/mod.rs | 61 ++++++++++++++++++++++++++++- arrow-json/src/reader/null_array.rs | 35 +++++++++++++++++ 2 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 arrow-json/src/reader/null_array.rs diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index 4abcb1ea75ba..9541e0372102 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -146,6 +146,7 @@ use crate::reader::boolean_array::BooleanArrayDecoder; use crate::reader::decimal_array::DecimalArrayDecoder; use crate::reader::list_array::ListArrayDecoder; use crate::reader::map_array::MapArrayDecoder; +use crate::reader::null_array::NullArrayDecoder; use crate::reader::primitive_array::PrimitiveArrayDecoder; use crate::reader::string_array::StringArrayDecoder; use crate::reader::struct_array::StructArrayDecoder; @@ -156,6 +157,7 @@ mod boolean_array; mod decimal_array; mod list_array; mod map_array; +mod null_array; mod primitive_array; mod schema; mod serializer; @@ -580,6 +582,7 @@ fn make_decoder( ) -> Result, ArrowError> { downcast_integer! 
{ data_type => (primitive_decoder, data_type), + DataType::Null => Ok(Box::::default()), DataType::Float32 => primitive_decoder!(Float32Type, data_type), DataType::Float64 => primitive_decoder!(Float64Type, data_type), DataType::Timestamp(TimeUnit::Second, None) => { @@ -647,7 +650,7 @@ mod tests { use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_cast::display::{ArrayFormatter, FormatOptions}; use arrow_data::ArrayDataBuilder; - use arrow_schema::{DataType, Field, Schema}; + use arrow_schema::{DataType, Field, FieldRef, Schema}; use crate::reader::infer_json_schema; use crate::ReaderBuilder; @@ -1602,6 +1605,62 @@ mod tests { assert!(!cc.is_valid(5)); } + #[test] + fn test_empty_json_arrays() { + let json_content = r#" + {"items": []} + {"items": null} + {} + "#; + + let schema = Arc::new(Schema::new(vec![Field::new( + "items", + DataType::List(FieldRef::new(Field::new("item", DataType::Null, true))), + true, + )])); + + let batches = do_read(json_content, 1024, false, schema); + assert_eq!(batches.len(), 1); + + let col1 = batches[0].column(0).as_list::(); + assert_eq!(col1.null_count(), 2); + assert!(col1.value(0).is_empty()); + assert_eq!(col1.value(0).data_type(), &DataType::Null); + assert!(col1.is_null(1)); + assert!(col1.is_null(2)); + } + + #[test] + fn test_nested_empty_json_arrays() { + let json_content = r#" + {"items": [[],[]]} + {"items": [[null, null],[null]]} + "#; + + let schema = Arc::new(Schema::new(vec![Field::new( + "items", + DataType::List(FieldRef::new(Field::new( + "item", + DataType::List(FieldRef::new(Field::new("item", DataType::Null, true))), + true, + ))), + true, + )])); + + let batches = do_read(json_content, 1024, false, schema); + assert_eq!(batches.len(), 1); + + let col1 = batches[0].column(0).as_list::(); + assert_eq!(col1.null_count(), 0); + assert_eq!(col1.value(0).len(), 2); + assert!(col1.value(0).as_list::().value(0).is_empty()); + assert!(col1.value(0).as_list::().value(1).is_empty()); + + assert_eq!(col1.value(1).len(), 2); + assert_eq!(col1.value(1).as_list::().value(0).len(), 2); + assert_eq!(col1.value(1).as_list::().value(1).len(), 1); + } + #[test] fn test_nested_list_json_arrays() { let c_field = diff --git a/arrow-json/src/reader/null_array.rs b/arrow-json/src/reader/null_array.rs new file mode 100644 index 000000000000..4270045fb3c2 --- /dev/null +++ b/arrow-json/src/reader/null_array.rs @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::reader::tape::{Tape, TapeElement}; +use crate::reader::ArrayDecoder; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType}; + +#[derive(Default)] +pub struct NullArrayDecoder {} + +impl ArrayDecoder for NullArrayDecoder { + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { + for p in pos { + if !matches!(tape.get(*p), TapeElement::Null) { + return Err(tape.error(*p, "null")); + } + } + ArrayDataBuilder::new(DataType::Null).len(pos.len()).build() + } +} From 0059049768035486470c99ee2b9675f0983ced32 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Tue, 25 Apr 2023 16:40:04 +0200 Subject: [PATCH 0848/1411] feat: set FlightDescriptor on FlightDataEncoderBuilder (#4101) * feat: set FlightDescriptor on FlightDataEncoderBuilder * send a separate descriptor message when the descriptor is provided * include the flight descriptor in the first FlightData --- arrow-flight/src/encode.rs | 26 ++++++++++++++++++++++++-- arrow-flight/tests/encode_decode.rs | 25 +++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index f97311d6f9e3..9650031d8b5f 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -17,7 +17,7 @@ use std::{collections::VecDeque, fmt::Debug, pin::Pin, sync::Arc, task::Poll}; -use crate::{error::Result, FlightData, SchemaAsIpc}; +use crate::{error::Result, FlightData, FlightDescriptor, SchemaAsIpc}; use arrow_array::{ArrayRef, RecordBatch, RecordBatchOptions}; use arrow_ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions}; use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef}; @@ -72,6 +72,8 @@ pub struct FlightDataEncoderBuilder { app_metadata: Bytes, /// Optional schema, if known before data. schema: Option, + /// Optional flight descriptor, if known before data. + descriptor: Option, } /// Default target size for encoded [`FlightData`]. @@ -87,6 +89,7 @@ impl Default for FlightDataEncoderBuilder { options: IpcWriteOptions::default(), app_metadata: Bytes::new(), schema: None, + descriptor: None, } } } @@ -134,6 +137,15 @@ impl FlightDataEncoderBuilder { self } + /// Specify a flight descriptor in the first FlightData message. + pub fn with_flight_descriptor( + mut self, + descriptor: Option, + ) -> Self { + self.descriptor = descriptor; + self + } + /// Return a [`Stream`](futures::Stream) of [`FlightData`], /// consuming self. 
More details on [`FlightDataEncoder`] pub fn build(self, input: S) -> FlightDataEncoder @@ -145,6 +157,7 @@ impl FlightDataEncoderBuilder { options, app_metadata, schema, + descriptor, } = self; FlightDataEncoder::new( @@ -153,6 +166,7 @@ impl FlightDataEncoderBuilder { max_flight_data_size, options, app_metadata, + descriptor, ) } } @@ -176,6 +190,8 @@ pub struct FlightDataEncoder { queue: VecDeque, /// Is this stream done (inner is empty or errored) done: bool, + /// cleared after the first FlightData message is sent + descriptor: Option, } impl FlightDataEncoder { @@ -185,6 +201,7 @@ impl FlightDataEncoder { max_flight_data_size: usize, options: IpcWriteOptions, app_metadata: Bytes, + descriptor: Option, ) -> Self { let mut encoder = Self { inner, @@ -194,17 +211,22 @@ impl FlightDataEncoder { app_metadata: Some(app_metadata), queue: VecDeque::new(), done: false, + descriptor, }; // If schema is known up front, enqueue it immediately if let Some(schema) = schema { encoder.encode_schema(&schema); } + encoder } /// Place the `FlightData` in the queue to send - fn queue_message(&mut self, data: FlightData) { + fn queue_message(&mut self, mut data: FlightData) { + if let Some(descriptor) = self.descriptor.take() { + data.flight_descriptor = Some(descriptor); + } self.queue.push_back(data); } diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index 90fa2b7a6832..4f1a8e667ffc 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -22,6 +22,8 @@ use std::{collections::HashMap, sync::Arc}; use arrow_array::types::Int32Type; use arrow_array::{ArrayRef, DictionaryArray, Float64Array, RecordBatch, UInt8Array}; use arrow_cast::pretty::pretty_format_batches; +use arrow_flight::flight_descriptor::DescriptorType; +use arrow_flight::FlightDescriptor; use arrow_flight::{ decode::{DecodedPayload, FlightDataDecoder, FlightRecordBatchStream}, encode::FlightDataEncoderBuilder, @@ -136,6 +138,29 @@ async fn test_zero_batches_schema_specified() { assert_eq!(decoder.schema(), Some(&schema)); } +#[tokio::test] +async fn test_with_flight_descriptor() { + let stream = futures::stream::iter(vec![Ok(make_dictionary_batch(5))]); + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)])); + + let descriptor = Some(FlightDescriptor { + r#type: DescriptorType::Path.into(), + path: vec!["table_name".to_string()], + cmd: Bytes::default(), + }); + + let encoder = FlightDataEncoderBuilder::default() + .with_schema(schema.clone()) + .with_flight_descriptor(descriptor.clone()); + + let mut encoder = encoder.build(stream); + + // First batch should be the schema + let first_batch = encoder.next().await.unwrap().unwrap(); + + assert_eq!(first_batch.flight_descriptor, descriptor); +} + #[tokio::test] async fn test_zero_batches_dictionary_schema_specified() { let schema = Arc::new(Schema::new(vec![ From b26d943fc637ea6d901d0171d77a0ddb0d95dab5 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Tue, 25 Apr 2023 17:53:36 +0200 Subject: [PATCH 0849/1411] feat: support `Interval` + `Timestamp` and `Interval` + `Date` (#4117) * feat: support Interval + Timestamp and Interval + Date in addition to Timestamp + Interval and Interval + Date * avoid recursing indefinitely --- arrow-arith/src/arithmetic.rs | 88 +++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index acd0b551c173..2b8a2f3b7db2 100644 --- a/arrow-arith/src/arithmetic.rs +++ 
b/arrow-arith/src/arithmetic.rs @@ -838,6 +838,15 @@ pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result + { + add_dyn(right, left) + } _ => { downcast_primitive_array!( (left, right) => { @@ -1987,6 +1996,13 @@ mod tests { c.value(0), Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2001, 3, 1).unwrap()) ); + + let c = add_dyn(&b, &a).unwrap(); + let c = c.as_any().downcast_ref::().unwrap(); + assert_eq!( + c.value(0), + Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2001, 3, 1).unwrap()) + ); } #[test] @@ -2001,6 +2017,13 @@ mod tests { c.value(0), Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) ); + + let c = add_dyn(&b, &a).unwrap(); + let c = c.as_any().downcast_ref::().unwrap(); + assert_eq!( + c.value(0), + Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) + ); } #[test] @@ -2018,6 +2041,13 @@ mod tests { c.value(0), Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) ); + + let c = add_dyn(&b, &a).unwrap(); + let c = c.as_any().downcast_ref::().unwrap(); + assert_eq!( + c.value(0), + Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) + ); } #[test] @@ -2033,6 +2063,13 @@ mod tests { c.value(0), Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2001, 3, 1).unwrap()) ); + + let c = add_dyn(&b, &a).unwrap(); + let c = c.as_any().downcast_ref::().unwrap(); + assert_eq!( + c.value(0), + Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2001, 3, 1).unwrap()) + ); } #[test] @@ -2047,6 +2084,13 @@ mod tests { c.value(0), Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) ); + + let c = add_dyn(&b, &a).unwrap(); + let c = c.as_any().downcast_ref::().unwrap(); + assert_eq!( + c.value(0), + Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) + ); } #[test] @@ -2064,6 +2108,13 @@ mod tests { c.value(0), Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) ); + + let c = add_dyn(&b, &a).unwrap(); + let c = c.as_any().downcast_ref::().unwrap(); + assert_eq!( + c.value(0), + Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) + ); } #[test] @@ -3888,6 +3939,10 @@ mod tests { ]); assert_eq!(result, &expected); + let result = add_dyn(&b, &a).unwrap(); + let result = result.as_primitive::(); + assert_eq!(result, &expected); + // timestamp second + interval day time let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); let b = IntervalDayTimeArray::from(vec![ @@ -3908,6 +3963,9 @@ mod tests { 5 + SECONDS_IN_DAY, ]); assert_eq!(&expected, result); + let result = add_dyn(&b, &a).unwrap(); + let result = result.as_primitive::(); + assert_eq!(result, &expected); // timestamp second + interval month day nanosecond let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); @@ -3929,6 +3987,9 @@ mod tests { 5 + SECONDS_IN_DAY, ]); assert_eq!(&expected, result); + let result = add_dyn(&b, &a).unwrap(); + let result = result.as_primitive::(); + assert_eq!(result, &expected); } #[test] @@ -4021,6 +4082,9 @@ mod tests { 5 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000, ]); assert_eq!(result, &expected); + let result = add_dyn(&b, &a).unwrap(); + let result = result.as_primitive::(); + assert_eq!(result, &expected); // timestamp millisecond + interval day time let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); @@ -4042,6 +4106,9 @@ mod tests { 5 + SECONDS_IN_DAY * 1_000, ]); assert_eq!(&expected, result); + let result = add_dyn(&b, &a).unwrap(); + let result = result.as_primitive::(); + 
assert_eq!(result, &expected); // timestamp millisecond + interval month day nanosecond let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); @@ -4063,6 +4130,9 @@ mod tests { 5 + SECONDS_IN_DAY * 1_000, ]); assert_eq!(&expected, result); + let result = add_dyn(&b, &a).unwrap(); + let result = result.as_primitive::(); + assert_eq!(result, &expected); } #[test] @@ -4155,6 +4225,9 @@ mod tests { 5 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000, ]); assert_eq!(result, &expected); + let result = add_dyn(&b, &a).unwrap(); + let result = result.as_primitive::(); + assert_eq!(result, &expected); // timestamp microsecond + interval day time let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); @@ -4176,6 +4249,9 @@ mod tests { 5 + SECONDS_IN_DAY * 1_000_000, ]); assert_eq!(&expected, result); + let result = add_dyn(&b, &a).unwrap(); + let result = result.as_primitive::(); + assert_eq!(result, &expected); // timestamp microsecond + interval month day nanosecond let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); @@ -4197,6 +4273,9 @@ mod tests { 5 + SECONDS_IN_DAY * 1_000_000, ]); assert_eq!(&expected, result); + let result = add_dyn(&b, &a).unwrap(); + let result = result.as_primitive::(); + assert_eq!(result, &expected); } #[test] @@ -4289,6 +4368,9 @@ mod tests { 5 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000_000, ]); assert_eq!(result, &expected); + let result = add_dyn(&b, &a).unwrap(); + let result = result.as_primitive::(); + assert_eq!(result, &expected); // timestamp nanosecond + interval day time let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); @@ -4310,6 +4392,9 @@ mod tests { 5 + SECONDS_IN_DAY * 1_000_000_000, ]); assert_eq!(&expected, result); + let result = add_dyn(&b, &a).unwrap(); + let result = result.as_primitive::(); + assert_eq!(result, &expected); // timestamp nanosecond + interval month day nanosecond let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); @@ -4331,6 +4416,9 @@ mod tests { 5 + SECONDS_IN_DAY * 1_000_000_000, ]); assert_eq!(&expected, result); + let result = add_dyn(&b, &a).unwrap(); + let result = result.as_primitive::(); + assert_eq!(result, &expected); } #[test] From be33ec5a397b8ece12c9baf87af66a73848a24ab Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Tue, 25 Apr 2023 20:03:03 +0300 Subject: [PATCH 0850/1411] Add Type Declarations for All Primitive Tensors and Buffer Builders (#4113) * feat: support all primitive tensors and buffer builders * fix: cargo fmt --- arrow-array/src/builder/buffer_builder.rs | 10 +++++++++ arrow/src/tensor.rs | 25 ++++++++++++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/builder/buffer_builder.rs b/arrow-array/src/builder/buffer_builder.rs index d4eed0de9de7..f88a6392083e 100644 --- a/arrow-array/src/builder/buffer_builder.rs +++ b/arrow-array/src/builder/buffer_builder.rs @@ -17,6 +17,7 @@ use crate::array::ArrowPrimitiveType; use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; +use half::f16; use std::marker::PhantomData; use crate::types::*; @@ -37,11 +38,20 @@ pub type UInt16BufferBuilder = BufferBuilder; pub type UInt32BufferBuilder = BufferBuilder; /// Buffer builder for usigned 64-bit integer type. pub type UInt64BufferBuilder = BufferBuilder; +/// Buffer builder for 16-bit floating point type. +pub type Float16BufferBuilder = BufferBuilder; /// Buffer builder for 32-bit floating point type. pub type Float32BufferBuilder = BufferBuilder; /// Buffer builder for 64-bit floating point type. 
pub type Float64BufferBuilder = BufferBuilder; +/// Buffer builder for 128-bit decimal type. +pub type Decimal128BufferBuilder = + BufferBuilder<::Native>; +/// Buffer builder for 256-bit decimal type. +pub type Decimal256BufferBuilder = + BufferBuilder<::Native>; + /// Buffer builder for timestamp type of second unit. pub type TimestampSecondBufferBuilder = BufferBuilder<::Native>; diff --git a/arrow/src/tensor.rs b/arrow/src/tensor.rs index b2abffc517c8..299c4f2b8403 100644 --- a/arrow/src/tensor.rs +++ b/arrow/src/tensor.rs @@ -85,17 +85,36 @@ pub struct Tensor<'a, T: ArrowPrimitiveType> { } pub type BooleanTensor<'a> = Tensor<'a, BooleanType>; +pub type Date32Tensor<'a> = Tensor<'a, Date32Type>; +pub type Date64Tensor<'a> = Tensor<'a, Date64Type>; +pub type Decimal128Tensor<'a> = Tensor<'a, Decimal128Type>; +pub type Decimal256Tensor<'a> = Tensor<'a, Decimal256Type>; +pub type DurationMicrosecondTensor<'a> = Tensor<'a, DurationMicrosecondType>; +pub type DurationMillisecondTensor<'a> = Tensor<'a, DurationMillisecondType>; +pub type DurationNanosecondTensor<'a> = Tensor<'a, DurationNanosecondType>; +pub type DurationSecondTensor<'a> = Tensor<'a, DurationSecondType>; +pub type Float16Tensor<'a> = Tensor<'a, Float16Type>; +pub type Float32Tensor<'a> = Tensor<'a, Float32Type>; +pub type Float64Tensor<'a> = Tensor<'a, Float64Type>; pub type Int8Tensor<'a> = Tensor<'a, Int8Type>; pub type Int16Tensor<'a> = Tensor<'a, Int16Type>; pub type Int32Tensor<'a> = Tensor<'a, Int32Type>; pub type Int64Tensor<'a> = Tensor<'a, Int64Type>; +pub type IntervalDayTimeTensor<'a> = Tensor<'a, IntervalDayTimeType>; +pub type IntervalMonthDayNanoTensor<'a> = Tensor<'a, IntervalMonthDayNanoType>; +pub type IntervalYearMonthTensor<'a> = Tensor<'a, IntervalYearMonthType>; +pub type Time32MillisecondTensor<'a> = Tensor<'a, Time32MillisecondType>; +pub type Time32SecondTensor<'a> = Tensor<'a, Time32SecondType>; +pub type Time64MicrosecondTensor<'a> = Tensor<'a, Time64MicrosecondType>; +pub type Time64NanosecondTensor<'a> = Tensor<'a, Time64NanosecondType>; +pub type TimestampMicrosecondTensor<'a> = Tensor<'a, TimestampMicrosecondType>; +pub type TimestampMillisecondTensor<'a> = Tensor<'a, TimestampMillisecondType>; +pub type TimestampNanosecondTensor<'a> = Tensor<'a, TimestampNanosecondType>; +pub type TimestampSecondTensor<'a> = Tensor<'a, TimestampSecondType>; pub type UInt8Tensor<'a> = Tensor<'a, UInt8Type>; pub type UInt16Tensor<'a> = Tensor<'a, UInt16Type>; pub type UInt32Tensor<'a> = Tensor<'a, UInt32Type>; pub type UInt64Tensor<'a> = Tensor<'a, UInt64Type>; -pub type Float16Tensor<'a> = Tensor<'a, Float16Type>; -pub type Float32Tensor<'a> = Tensor<'a, Float32Type>; -pub type Float64Tensor<'a> = Tensor<'a, Float64Type>; impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { /// Creates a new `Tensor` From 8d166a14467ac8e59a47174de676971f9f896e78 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 25 Apr 2023 14:04:27 -0400 Subject: [PATCH 0851/1411] Add StructArray Constructors (#3879) (#4064) * Add StructArray Constructors (#3879) * Fix doc * Add try_new * Update other constructors --- arrow-array/src/array/struct_array.rs | 302 ++++++++++++++------------ arrow-array/src/record_batch.rs | 11 +- arrow-buffer/src/buffer/null.rs | 7 + arrow-json/src/reader/struct_array.rs | 16 +- 4 files changed, 173 insertions(+), 163 deletions(-) diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index fa43062b77bf..a18f38c082c9 100644 --- 
a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::{make_array, Array, ArrayRef, RecordBatch}; -use arrow_buffer::{buffer_bin_or, Buffer, NullBuffer}; +use crate::{make_array, new_null_array, Array, ArrayRef, RecordBatch}; +use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field, Fields, SchemaBuilder}; use std::sync::Arc; @@ -77,10 +77,136 @@ pub struct StructArray { len: usize, data_type: DataType, nulls: Option, - pub(crate) fields: Vec, + fields: Vec, } impl StructArray { + /// Create a new [`StructArray`] from the provided parts, panicking on failure + /// + /// # Panics + /// + /// Panics if [`Self::try_new`] returns an error + pub fn new(fields: Fields, arrays: Vec, nulls: Option) -> Self { + Self::try_new(fields, arrays, nulls).unwrap() + } + + /// Create a new [`StructArray`] from the provided parts, returning an error on failure + /// + /// # Errors + /// + /// Errors if + /// + /// * `fields.len() != arrays.len()` + /// * `fields[i].data_type() != arrays[i].data_type()` + /// * `arrays[i].len() != arrays[j].len()` + /// * `arrays[i].len() != nulls.len()` + /// * `!fields[i].is_nullable() && !nulls.contains(arrays[i].nulls())` + pub fn try_new( + fields: Fields, + arrays: Vec, + nulls: Option, + ) -> Result { + if fields.len() != arrays.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect number of arrays for StructArray fields, expected {} got {}", + fields.len(), + arrays.len() + ))); + } + let len = arrays.first().map(|x| x.len()).unwrap_or_default(); + + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect number of nulls for StructArray, expected {len} got {}", + n.len(), + ))); + } + } + + for (f, a) in fields.iter().zip(&arrays) { + if f.data_type() != a.data_type() { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect datatype for StructArray field {:?}, expected {} got {}", + f.name(), + f.data_type(), + a.data_type() + ))); + } + + if a.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect array length for StructArray field {:?}, expected {} got {}", + f.name(), + len, + a.len() + ))); + } + + if let Some(a) = a.nulls() { + let nulls_valid = f.is_nullable() + || nulls.as_ref().map(|n| n.contains(a)).unwrap_or_default(); + + if !nulls_valid { + return Err(ArrowError::InvalidArgumentError(format!( + "Found unmasked nulls for non-nullable StructArray field {:?}", + f.name() + ))); + } + } + } + + Ok(Self { + len, + data_type: DataType::Struct(fields), + nulls: nulls.filter(|n| n.null_count() > 0), + fields: arrays, + }) + } + + /// Create a new [`StructArray`] of length `len` where all values are null + pub fn new_null(fields: Fields, len: usize) -> Self { + let arrays = fields + .iter() + .map(|f| new_null_array(f.data_type(), len)) + .collect(); + + Self { + len, + data_type: DataType::Struct(fields), + nulls: Some(NullBuffer::new_null(len)), + fields: arrays, + } + } + + /// Create a new [`StructArray`] from the provided parts without validation + /// + /// # Safety + /// + /// Safe if [`Self::new`] would not panic with the given arguments + pub unsafe fn new_unchecked( + fields: Fields, + arrays: Vec, + nulls: Option, + ) -> Self { + let len = arrays.first().map(|x| x.len()).unwrap_or_default(); + 
Self { + len, + data_type: DataType::Struct(fields), + nulls, + fields: arrays, + } + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts(self) -> (Fields, Vec, Option) { + let f = match self.data_type { + DataType::Struct(f) => f, + _ => unreachable!(), + }; + (f, self.fields, self.nulls) + } + /// Returns the field at `pos`. pub fn column(&self, pos: usize) -> &ArrayRef { &self.fields[pos] @@ -183,66 +309,18 @@ impl TryFrom> for StructArray { type Error = ArrowError; /// builds a StructArray from a vector of names and arrays. - /// This errors if the values have a different length. - /// An entry is set to Null when all values are null. fn try_from(values: Vec<(&str, ArrayRef)>) -> Result { - let values_len = values.len(); - - // these will be populated - let mut fields = Vec::with_capacity(values_len); - let mut child_data = Vec::with_capacity(values_len); - - // len: the size of the arrays. - let mut len: Option = None; - // null: the null mask of the arrays. - let mut null: Option = None; - for (field_name, array) in values { - let child_datum = array.to_data(); - let child_datum_len = child_datum.len(); - if let Some(len) = len { - if len != child_datum_len { - return Err(ArrowError::InvalidArgumentError( - format!("Array of field \"{field_name}\" has length {child_datum_len}, but previous elements have length {len}. - All arrays in every entry in a struct array must have the same length.") - )); - } - } else { - len = Some(child_datum_len) - } - fields.push(Arc::new(Field::new( - field_name, - array.data_type().clone(), - child_datum.nulls().is_some(), - ))); - - if let Some(child_nulls) = child_datum.nulls() { - null = Some(if let Some(null_buffer) = &null { - buffer_bin_or( - null_buffer, - 0, - child_nulls.buffer(), - child_nulls.offset(), - child_datum_len, - ) - } else { - child_nulls.inner().sliced() - }); - } else if null.is_some() { - // when one of the fields has no nulls, then there is no null in the array - null = None; - } - child_data.push(child_datum); - } - let len = len.unwrap(); - - let builder = ArrayData::builder(DataType::Struct(fields.into())) - .len(len) - .null_bit_buffer(null) - .child_data(child_data); - - let array_data = unsafe { builder.build_unchecked() }; - - Ok(StructArray::from(array_data)) + let (schema, arrays): (SchemaBuilder, _) = values + .into_iter() + .map(|(name, array)| { + ( + Field::new(name, array.data_type().clone(), array.nulls().is_some()), + array, + ) + }) + .unzip(); + + StructArray::try_new(schema.finish().fields, arrays, None) } } @@ -303,38 +381,8 @@ impl Array for StructArray { impl From> for StructArray { fn from(v: Vec<(Field, ArrayRef)>) -> Self { - let iter = v.into_iter(); - let capacity = iter.size_hint().0; - - let mut len = None; - let mut schema = SchemaBuilder::with_capacity(capacity); - let mut child_data = Vec::with_capacity(capacity); - for (field, array) in iter { - // Check the length of the child arrays - assert_eq!( - *len.get_or_insert(array.len()), - array.len(), - "all child arrays of a StructArray must have the same length" - ); - // Check data types of child arrays - assert_eq!( - field.data_type(), - array.data_type(), - "the field data types must match the array data in a StructArray" - ); - schema.push(field); - child_data.push(array.to_data()); - } - let field_types = schema.finish().fields; - let array_data = ArrayData::builder(DataType::Struct(field_types)) - .child_data(child_data) - .len(len.unwrap_or_default()); - let array_data = unsafe { array_data.build_unchecked() }; 
- - // We must validate nullability - array_data.validate_nulls().unwrap(); - - Self::from(array_data) + let (schema, arrays): (SchemaBuilder, _) = v.into_iter().unzip(); + StructArray::new(schema.finish().fields, arrays, None) } } @@ -359,37 +407,10 @@ impl std::fmt::Debug for StructArray { impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray { fn from(pair: (Vec<(Field, ArrayRef)>, Buffer)) -> Self { - let capacity = pair.0.len(); - let mut len = None; - let mut schema = SchemaBuilder::with_capacity(capacity); - let mut child_data = Vec::with_capacity(capacity); - for (field, array) in pair.0 { - // Check the length of the child arrays - assert_eq!( - *len.get_or_insert(array.len()), - array.len(), - "all child arrays of a StructArray must have the same length" - ); - // Check data types of child arrays - assert_eq!( - field.data_type(), - array.data_type(), - "the field data types must match the array data in a StructArray" - ); - schema.push(field); - child_data.push(array.to_data()); - } - let field_types = schema.finish().fields; - let array_data = ArrayData::builder(DataType::Struct(field_types)) - .null_bit_buffer(Some(pair.1)) - .child_data(child_data) - .len(len.unwrap_or_default()); - let array_data = unsafe { array_data.build_unchecked() }; - - // We must validate nullability - array_data.validate_nulls().unwrap(); - - Self::from(array_data) + let len = pair.0.first().map(|x| x.1.len()).unwrap_or_default(); + let (fields, arrays): (SchemaBuilder, Vec<_>) = pair.0.into_iter().unzip(); + let nulls = NullBuffer::new(BooleanBuffer::new(pair.1, 0, len)); + Self::new(fields.finish().fields, arrays, Some(nulls)) } } @@ -512,12 +533,7 @@ mod tests { let struct_data = arr.into_data(); assert_eq!(4, struct_data.len()); - assert_eq!(1, struct_data.null_count()); - assert_eq!( - // 00001011 - &[11_u8], - struct_data.nulls().unwrap().validity() - ); + assert_eq!(0, struct_data.null_count()); let expected_string_data = ArrayData::builder(DataType::Utf8) .len(4) @@ -549,20 +565,20 @@ mod tests { let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); - let arr = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]); + let err = + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap_err() + .to_string(); - match arr { - Err(ArrowError::InvalidArgumentError(e)) => { - assert!(e.starts_with("Array of field \"f2\" has length 4, but previous elements have length 3.")); - } - _ => panic!("This test got an unexpected error type"), - }; + assert_eq!( + err, + "Invalid argument error: Incorrect array length for StructArray field \"f2\", expected 3 got 4" + ) } #[test] #[should_panic( - expected = "the field data types must match the array data in a StructArray" + expected = "Incorrect datatype for StructArray field \\\"b\\\", expected Int16 got Boolean" )] fn test_struct_array_from_mismatched_types_single() { drop(StructArray::from(vec![( @@ -574,7 +590,7 @@ mod tests { #[test] #[should_panic( - expected = "the field data types must match the array data in a StructArray" + expected = "Incorrect datatype for StructArray field \\\"b\\\", expected Int16 got Boolean" )] fn test_struct_array_from_mismatched_types_multiple() { drop(StructArray::from(vec![ @@ -679,7 +695,7 @@ mod tests { #[test] #[should_panic( - expected = "all child arrays of a StructArray must have the same length" + expected = "Incorrect array length for StructArray field \\\"c\\\", expected 1 got 2" )] fn 
test_invalid_struct_child_array_lengths() { drop(StructArray::from(vec![ @@ -702,7 +718,7 @@ mod tests { #[test] #[should_panic( - expected = "non-nullable child of type Int32 contains nulls not present in parent Struct" + expected = "Found unmasked nulls for non-nullable StructArray field \\\"c\\\"" )] fn test_struct_array_from_mismatched_nullability() { drop(StructArray::from(vec![( diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index ee61d2da6597..8fb08111c846 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -467,17 +467,12 @@ impl Default for RecordBatchOptions { } impl From for RecordBatch { fn from(value: StructArray) -> Self { - assert_eq!( - value.null_count(), - 0, - "Cannot convert nullable StructArray to RecordBatch, see StructArray documentation" - ); let row_count = value.len(); - let schema = Arc::new(Schema::new(value.fields().clone())); - let columns = value.fields; + let (fields, columns, nulls) = value.into_parts(); + assert_eq!(nulls.map(|n| n.null_count()).unwrap_or_default(), 0, "Cannot convert nullable StructArray to RecordBatch, see StructArray documentation"); RecordBatch { - schema, + schema: Arc::new(Schema::new(fields)), row_count, columns, } diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index f088e7fa62e9..cdb0c2aeb824 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -67,6 +67,13 @@ impl NullBuffer { } } + /// Returns true if all nulls in `other` also exist in self + pub fn contains(&self, other: &NullBuffer) -> bool { + let lhs = self.inner().bit_chunks().iter_padded(); + let rhs = other.inner().bit_chunks().iter_padded(); + lhs.zip(rhs).all(|(l, r)| (l & !r) == 0) + } + /// Returns the length of this [`NullBuffer`] #[inline] pub fn len(&self) -> usize { diff --git a/arrow-json/src/reader/struct_array.rs b/arrow-json/src/reader/struct_array.rs index 6c6f1457bfc2..707b56d50eef 100644 --- a/arrow-json/src/reader/struct_array.rs +++ b/arrow-json/src/reader/struct_array.rs @@ -120,19 +120,11 @@ impl ArrayDecoder for StructArrayDecoder { for (c, f) in child_data.iter().zip(fields) { // Sanity check assert_eq!(c.len(), pos.len()); + if let Some(a) = c.nulls() { + let nulls_valid = f.is_nullable() + || nulls.as_ref().map(|n| n.contains(a)).unwrap_or_default(); - if !f.is_nullable() && c.null_count() != 0 { - // Need to verify nulls - let valid = match nulls.as_ref() { - Some(nulls) => { - let lhs = nulls.inner().bit_chunks().iter_padded(); - let rhs = c.nulls().unwrap().inner().bit_chunks().iter_padded(); - lhs.zip(rhs).all(|(l, r)| (l & !r) == 0) - } - None => false, - }; - - if !valid { + if !nulls_valid { return Err(ArrowError::JsonError(format!("Encountered unmasked nulls in non-nullable StructArray child: {f}"))); } } From b9819219c8fb6c7e5c7336e3c2fcd86cb5befd98 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Wed, 26 Apr 2023 06:11:39 +0800 Subject: [PATCH 0852/1411] Display the path in the open GCS credentials error (#4124) --- object_store/src/gcp/credential.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index a8dce7132755..057e013334ed 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -29,14 +29,17 @@ use snafu::{ResultExt, Snafu}; use std::env; use std::fs::File; use std::io::BufReader; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::time::{Duration, Instant}; 
use tracing::info; #[derive(Debug, Snafu)] pub enum Error { - #[snafu(display("Unable to open service account file: {}", source))] - OpenCredentials { source: std::io::Error }, + #[snafu(display("Unable to open service account file from {}: {}", path.display(), source))] + OpenCredentials { + source: std::io::Error, + path: PathBuf, + }, #[snafu(display("Unable to decode service account file: {}", source))] DecodeCredentials { source: serde_json::Error }, @@ -233,7 +236,9 @@ fn read_credentials_file( where T: serde::de::DeserializeOwned, { - let file = File::open(service_account_path).context(OpenCredentialsSnafu)?; + let file = File::open(&service_account_path).context(OpenCredentialsSnafu { + path: service_account_path.as_ref().to_owned(), + })?; let reader = BufReader::new(file); serde_json::from_reader(reader).context(DecodeCredentialsSnafu) } From b8d8cb71af82eb4604a1d2730fe0fc9c7a47d78b Mon Sep 17 00:00:00 2001 From: kindly Date: Wed, 26 Apr 2023 12:07:08 +0100 Subject: [PATCH 0853/1411] Retry when no or partial response from server. (#4120) Retry when server fails unexpectedly, or if there are network issues that are not handled by hyper. --- object_store/Cargo.toml | 3 ++- object_store/src/client/retry.rs | 42 ++++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index fcdbd98ed9bb..b27482bcfabc 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -43,6 +43,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.21", default-features = false, features = ["std"], optional = true } +hyper = { version = "0.14", default-features = false, optional = true } quick-xml = { version = "0.28.0", features = ["serialize"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } @@ -66,7 +67,7 @@ tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-ut nix = "0.26.1" [features] -cloud = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] +cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json","reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] diff --git a/object_store/src/client/retry.rs b/object_store/src/client/retry.rs index e6dd2eb8174b..e6e92f086b2b 100644 --- a/object_store/src/client/retry.rs +++ b/object_store/src/client/retry.rs @@ -24,6 +24,7 @@ use reqwest::header::LOCATION; use reqwest::{Response, StatusCode}; use std::time::{Duration, Instant}; use tracing::info; +use snafu::Error as SnafuError; /// Retry request error #[derive(Debug)] @@ -192,11 +193,29 @@ impl RetryExt for reqwest::RequestBuilder { }, Err(e) => { - return Err(Error{ - retries, - message: "request error".to_string(), - source: Some(e) - }) + let mut do_retry = false; + if let Some(source) = e.source() { + if let Some(e) = source.downcast_ref::() { + if e.is_connect() || e.is_closed() || e.is_incomplete_message() { + do_retry = true; + } + } + } + + if retries == max_retries + || now.elapsed() > retry_timeout + || !do_retry { + + return Err(Error{ + retries, + message: "request error".to_string(), + source: Some(e) + }) + } + let sleep = backoff.next(); + retries += 1; + info!("Encountered request error ({}) backing off for {} seconds, retry {} of {}", e, 
sleep.as_secs_f32(), retries, max_retries); + tokio::time::sleep(sleep).await; } } } @@ -345,6 +364,19 @@ mod tests { assert_eq!(e.retries, retry.max_retries); assert_eq!(e.message, "502 Bad Gateway"); + // Panic results in an incomplete message error in the client + mock.push_fn(|_| {panic!()}); + let r = do_request().await.unwrap(); + assert_eq!(r.status(), StatusCode::OK); + + // Gives up after retrying mulitiple panics + for _ in 0..=retry.max_retries { + mock.push_fn(|_| {panic!()}); + } + let e = do_request().await.unwrap_err(); + assert_eq!(e.retries, retry.max_retries); + assert_eq!(e.message, "request error"); + // Shutdown mock.shutdown().await } From 9cf48c1a7559af150699fdeeb01031e357946d75 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 26 Apr 2023 13:29:47 +0200 Subject: [PATCH 0854/1411] refactor: construct `StructArray` w/ `FieldRef` (#4116) `DataType` uses `Fields`/`FieldRef` internally. Accepting `Field` just to wrap it into an `Arc` is unnecessary expensive, esp. when the `Field` was cloned from an `FieldRef` (happens in some non-test code). I've decided to NOT allow the construction from `Field` anymore because in prod code this is most likely a performance bug. --- arrow-array/src/array/map_array.rs | 22 +++++----- arrow-array/src/array/struct_array.rs | 34 +++++++------- arrow-array/src/builder/map_builder.rs | 8 ++-- arrow-array/src/builder/mod.rs | 6 +-- arrow-array/src/record_batch.rs | 4 +- arrow-cast/src/pretty.rs | 8 ++-- arrow-ipc/src/reader.rs | 19 ++++---- arrow-ipc/src/writer.rs | 13 ++++-- arrow-json/src/writer.rs | 20 ++++----- arrow-row/src/lib.rs | 4 +- arrow-select/src/concat.rs | 4 +- arrow-select/src/take.rs | 8 ++-- arrow/examples/builders.rs | 4 +- arrow/examples/dynamic_types.rs | 6 +-- arrow/src/array/ffi.rs | 10 ++--- arrow/src/ffi.rs | 2 +- arrow/tests/array_cast.rs | 4 +- arrow/tests/array_transform.rs | 4 +- .../src/arrow/array_reader/struct_array.rs | 2 +- parquet/src/arrow/arrow_writer/levels.rs | 32 +++++++------- parquet/src/arrow/arrow_writer/mod.rs | 44 +++++++++++-------- 21 files changed, 136 insertions(+), 122 deletions(-) diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 62e12c30e00c..c53e452a67dd 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -197,12 +197,12 @@ impl MapArray { let entry_offsets_buffer = Buffer::from(entry_offsets.to_byte_slice()); let keys_data = StringArray::from_iter_values(keys); - let keys_field = Field::new("keys", DataType::Utf8, false); - let values_field = Field::new( + let keys_field = Arc::new(Field::new("keys", DataType::Utf8, false)); + let values_field = Arc::new(Field::new( "values", values.data_type().clone(), values.null_count() > 0, - ); + )); let entry_struct = StructArray::from(vec![ (keys_field, Arc::new(keys_data) as ArrayRef), @@ -336,8 +336,8 @@ mod tests { // [[0, 1, 2], [3, 4, 5], [6, 7]] let entry_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); - let keys = Field::new("keys", DataType::Int32, false); - let values = Field::new("values", DataType::UInt32, false); + let keys = Arc::new(Field::new("keys", DataType::Int32, false)); + let values = Arc::new(Field::new("values", DataType::UInt32, false)); let entry_struct = StructArray::from(vec![ (keys, make_array(keys_data)), (values, make_array(values_data)), @@ -382,8 +382,8 @@ mod tests { // [[0, 1, 2], [3, 4, 5], [6, 7]] let entry_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); - let keys_field = Field::new("keys", DataType::Int32, 
false); - let values_field = Field::new("values", DataType::UInt32, true); + let keys_field = Arc::new(Field::new("keys", DataType::Int32, false)); + let values_field = Arc::new(Field::new("values", DataType::UInt32, true)); let entry_struct = StructArray::from(vec![ (keys_field.clone(), make_array(key_data)), (values_field.clone(), make_array(value_data.clone())), @@ -504,8 +504,8 @@ mod tests { // [[3, 4, 5], [6, 7]] let entry_offsets = Buffer::from(&[0, 3, 5].to_byte_slice()); - let keys = Field::new("keys", DataType::Int32, false); - let values = Field::new("values", DataType::UInt32, false); + let keys = Arc::new(Field::new("keys", DataType::Int32, false)); + let values = Arc::new(Field::new("values", DataType::UInt32, false)); let entry_struct = StructArray::from(vec![ (keys, make_array(keys_data)), (values, make_array(values_data)), @@ -582,8 +582,8 @@ mod tests { let key_array = Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef; let value_array = Arc::new(UInt32Array::from(vec![0u32, 10, 20])) as ArrayRef; - let keys_field = Field::new("keys", DataType::Utf8, false); - let values_field = Field::new("values", DataType::UInt32, false); + let keys_field = Arc::new(Field::new("keys", DataType::Utf8, false)); + let values_field = Arc::new(Field::new("values", DataType::UInt32, false)); let struct_array = StructArray::from(vec![(keys_field, key_array), (values_field, value_array)]); assert_eq!( diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index a18f38c082c9..fac947f14bfd 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -18,7 +18,7 @@ use crate::{make_array, new_null_array, Array, ArrayRef, RecordBatch}; use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{ArrowError, DataType, Field, Fields, SchemaBuilder}; +use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, SchemaBuilder}; use std::sync::Arc; use std::{any::Any, ops::Index}; @@ -58,11 +58,11 @@ use std::{any::Any, ops::Index}; /// /// let struct_array = StructArray::from(vec![ /// ( -/// Field::new("b", DataType::Boolean, false), +/// Arc::new(Field::new("b", DataType::Boolean, false)), /// boolean.clone() as ArrayRef, /// ), /// ( -/// Field::new("c", DataType::Int32, false), +/// Arc::new(Field::new("c", DataType::Int32, false)), /// int.clone() as ArrayRef, /// ), /// ]); @@ -379,8 +379,8 @@ impl Array for StructArray { } } -impl From> for StructArray { - fn from(v: Vec<(Field, ArrayRef)>) -> Self { +impl From> for StructArray { + fn from(v: Vec<(FieldRef, ArrayRef)>) -> Self { let (schema, arrays): (SchemaBuilder, _) = v.into_iter().unzip(); StructArray::new(schema.finish().fields, arrays, None) } @@ -405,8 +405,8 @@ impl std::fmt::Debug for StructArray { } } -impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray { - fn from(pair: (Vec<(Field, ArrayRef)>, Buffer)) -> Self { +impl From<(Vec<(FieldRef, ArrayRef)>, Buffer)> for StructArray { + fn from(pair: (Vec<(FieldRef, ArrayRef)>, Buffer)) -> Self { let len = pair.0.first().map(|x| x.1.len()).unwrap_or_default(); let (fields, arrays): (SchemaBuilder, Vec<_>) = pair.0.into_iter().unzip(); let nulls = NullBuffer::new(BooleanBuffer::new(pair.1, 0, len)); @@ -480,11 +480,11 @@ mod tests { let struct_array = StructArray::from(vec![ ( - Field::new("b", DataType::Boolean, false), + Arc::new(Field::new("b", DataType::Boolean, false)), boolean.clone() as ArrayRef, ), ( - Field::new("c", 
DataType::Int32, false), + Arc::new(Field::new("c", DataType::Int32, false)), int.clone() as ArrayRef, ), ]); @@ -503,11 +503,11 @@ mod tests { let struct_array = StructArray::from(vec![ ( - Field::new("b", DataType::Boolean, false), + Arc::new(Field::new("b", DataType::Boolean, false)), boolean.clone() as ArrayRef, ), ( - Field::new("c", DataType::Int32, false), + Arc::new(Field::new("c", DataType::Int32, false)), int.clone() as ArrayRef, ), ]); @@ -582,7 +582,7 @@ mod tests { )] fn test_struct_array_from_mismatched_types_single() { drop(StructArray::from(vec![( - Field::new("b", DataType::Int16, false), + Arc::new(Field::new("b", DataType::Int16, false)), Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc, )])); @@ -595,12 +595,12 @@ mod tests { fn test_struct_array_from_mismatched_types_multiple() { drop(StructArray::from(vec![ ( - Field::new("b", DataType::Int16, false), + Arc::new(Field::new("b", DataType::Int16, false)), Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc, ), ( - Field::new("c", DataType::Utf8, false), + Arc::new(Field::new("c", DataType::Utf8, false)), Arc::new(Int32Array::from(vec![42, 28, 19, 31])), ), ])); @@ -700,11 +700,11 @@ mod tests { fn test_invalid_struct_child_array_lengths() { drop(StructArray::from(vec![ ( - Field::new("b", DataType::Float32, false), + Arc::new(Field::new("b", DataType::Float32, false)), Arc::new(Float32Array::from(vec![1.1])) as Arc, ), ( - Field::new("c", DataType::Float64, false), + Arc::new(Field::new("c", DataType::Float64, false)), Arc::new(Float64Array::from(vec![2.2, 3.3])), ), ])); @@ -722,7 +722,7 @@ mod tests { )] fn test_struct_array_from_mismatched_nullability() { drop(StructArray::from(vec![( - Field::new("c", DataType::Int32, false), + Arc::new(Field::new("c", DataType::Int32, false)), Arc::new(Int32Array::from(vec![Some(42), None, Some(19)])) as ArrayRef, )])); } diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 72fa1bb919fb..db85465c8d5c 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -181,16 +181,16 @@ impl MapBuilder { keys_arr.null_count() ); - let keys_field = Field::new( + let keys_field = Arc::new(Field::new( self.field_names.key.as_str(), keys_arr.data_type().clone(), false, // always non-nullable - ); - let values_field = Field::new( + )); + let values_field = Arc::new(Field::new( self.field_names.value.as_str(), values_arr.data_type().clone(), true, - ); + )); let struct_array = StructArray::from(vec![(keys_field, keys_arr), (values_field, values_arr)]); diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index b0c0a49886d8..081f4d5f41f6 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -115,14 +115,14 @@ //! /// Note: returns StructArray to allow nesting within another array if desired //! fn finish(&mut self) -> StructArray { //! let i32 = Arc::new(self.i32.finish()) as ArrayRef; -//! let i32_field = Field::new("i32", DataType::Int32, false); +//! let i32_field = Arc::new(Field::new("i32", DataType::Int32, false)); //! //! let string = Arc::new(self.string.finish()) as ArrayRef; -//! let string_field = Field::new("i32", DataType::Utf8, false); +//! let string_field = Arc::new(Field::new("i32", DataType::Utf8, false)); //! //! let i32_list = Arc::new(self.i32_list.finish()) as ArrayRef; //! let value_field = Arc::new(Field::new("item", DataType::Int32, true)); -//! 
let i32_list_field = Field::new("i32_list", DataType::List(value_field), true); +//! let i32_list_field = Arc::new(Field::new("i32_list", DataType::List(value_field), true)); //! //! StructArray::from(vec![ //! (i32_field, i32), diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 8fb08111c846..bd1cc65c7341 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -788,11 +788,11 @@ mod tests { let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31])); let struct_array = StructArray::from(vec![ ( - Field::new("b", DataType::Boolean, false), + Arc::new(Field::new("b", DataType::Boolean, false)), boolean.clone() as ArrayRef, ), ( - Field::new("c", DataType::Int32, false), + Arc::new(Field::new("c", DataType::Int32, false)), int.clone() as ArrayRef, ), ]); diff --git a/arrow-cast/src/pretty.rs b/arrow-cast/src/pretty.rs index c75721ab8517..13d1df6a118d 100644 --- a/arrow-cast/src/pretty.rs +++ b/arrow-cast/src/pretty.rs @@ -649,17 +649,17 @@ mod tests { let c1 = StructArray::from(vec![ ( - Field::new("c11", DataType::Int32, true), + Arc::new(Field::new("c11", DataType::Int32, true)), Arc::new(Int32Array::from(vec![Some(1), None, Some(5)])) as ArrayRef, ), ( - Field::new_struct( + Arc::new(Field::new_struct( "c12", vec![Field::new("c121", DataType::Utf8, false)], false, - ), + )), Arc::new(StructArray::from(vec![( - Field::new("c121", DataType::Utf8, false), + Arc::new(Field::new("c121", DataType::Utf8, false)), Arc::new(StringArray::from(vec![Some("e"), Some("f"), Some("g")])) as ArrayRef, )])) as ArrayRef, diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 16cb99b920d9..d198696169b1 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -183,7 +183,7 @@ fn create_array( )?; node_index = triple.1; buffer_index = triple.2; - struct_arrays.push((struct_field.as_ref().clone(), triple.0)); + struct_arrays.push((struct_field.clone(), triple.0)); } let null_count = struct_node.null_count() as usize; let struct_array = if null_count > 0 { @@ -1593,7 +1593,7 @@ mod tests { let array = Arc::new(inner) as ArrayRef; - let dctfield = Field::new("dict", array.data_type().clone(), false); + let dctfield = Arc::new(Field::new("dict", array.data_type().clone(), false)); let s = StructArray::from(vec![(dctfield, array)]); let struct_array = Arc::new(s) as ArrayRef; @@ -1695,9 +1695,12 @@ mod tests { ); let string_array: ArrayRef = Arc::new(StringArray::from(xs.clone())); let struct_array = StructArray::from(vec![ - (Field::new("f2.1", DataType::Utf8, false), string_array), ( - Field::new("f2.2_struct", dict.data_type().clone(), false), + Arc::new(Field::new("f2.1", DataType::Utf8, false)), + string_array, + ), + ( + Arc::new(Field::new("f2.2_struct", dict.data_type().clone(), false)), dict.clone() as ArrayRef, ), ]); @@ -1727,20 +1730,20 @@ mod tests { let key_dict_keys = Int8Array::from_iter_values([0, 0, 2, 1, 1, 3]); let key_dict_array = DictionaryArray::new(key_dict_keys, values); - let keys_field = Field::new_dict( + let keys_field = Arc::new(Field::new_dict( "keys", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), true, 1, false, - ); - let values_field = Field::new_dict( + )); + let values_field = Arc::new(Field::new_dict( "values", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), true, 1, false, - ); + )); let entry_struct = StructArray::from(vec![ (keys_field, make_array(key_dict_array.into_data())), (values_field, make_array(value_dict_array.into_data())), 
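[Editorial note, not part of the patch series: the `FieldRef` change above means callers now pass `Arc<Field>` rather than `Field` when building a `StructArray` from (field, array) pairs. A minimal sketch of post-patch usage, assuming the patched `arrow-array` and `arrow-schema` crates; names here are illustrative only.]

use std::sync::Arc;
use arrow_array::{Array, ArrayRef, BooleanArray, Int32Array, StructArray};
use arrow_schema::{DataType, Field};

fn main() {
    let b: ArrayRef = Arc::new(BooleanArray::from(vec![false, true, true]));
    let c: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    // Fields are wrapped in Arc (FieldRef) up front, so the constructor can reuse
    // the allocation instead of cloning a Field just to wrap it in an Arc.
    let s = StructArray::from(vec![
        (Arc::new(Field::new("b", DataType::Boolean, false)), b),
        (Arc::new(Field::new("c", DataType::Int32, false)), c),
    ]);
    assert_eq!(s.len(), 3);
}

[Per the commit message, constructing from plain `Field` pairs no longer compiles after this patch; that is the intended effect.]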
diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index abaecea1faf2..8f36f8c04dc0 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1692,8 +1692,13 @@ mod tests { let array = Arc::new(inner) as ArrayRef; // Dict field with id 2 - let dctfield = - Field::new_dict("dict", array.data_type().clone(), false, 2, false); + let dctfield = Arc::new(Field::new_dict( + "dict", + array.data_type().clone(), + false, + 2, + false, + )); let s = StructArray::from(vec![(dctfield, array)]); let struct_array = Arc::new(s) as ArrayRef; @@ -1896,11 +1901,11 @@ mod tests { let struct_array = StructArray::from(vec![ ( - Field::new("s", DataType::Utf8, true), + Arc::new(Field::new("s", DataType::Utf8, true)), Arc::new(strings) as ArrayRef, ), ( - Field::new("c", DataType::Int32, true), + Arc::new(Field::new("c", DataType::Int32, true)), Arc::new(ints) as ArrayRef, ), ]); diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index a096590ec058..d610dd9a35b4 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -983,19 +983,19 @@ mod tests { let c1 = StructArray::from(vec![ ( - Field::new("c11", DataType::Int32, true), + Arc::new(Field::new("c11", DataType::Int32, true)), Arc::new(Int32Array::from(vec![Some(1), None, Some(5)])) as ArrayRef, ), ( - Field::new( + Arc::new(Field::new( "c12", DataType::Struct( vec![Field::new("c121", DataType::Utf8, false)].into(), ), false, - ), + )), Arc::new(StructArray::from(vec![( - Field::new("c121", DataType::Utf8, false), + Arc::new(Field::new("c121", DataType::Utf8, false)), Arc::new(StringArray::from(vec![Some("e"), Some("f"), Some("g")])) as ArrayRef, )])) as ArrayRef, @@ -1150,19 +1150,19 @@ mod tests { let struct_values = StructArray::from(vec![ ( - Field::new("c11", DataType::Int32, true), + Arc::new(Field::new("c11", DataType::Int32, true)), Arc::new(Int32Array::from(vec![Some(1), None, Some(5)])) as ArrayRef, ), ( - Field::new( + Arc::new(Field::new( "c12", DataType::Struct( vec![Field::new("c121", DataType::Utf8, false)].into(), ), false, - ), + )), Arc::new(StructArray::from(vec![( - Field::new("c121", DataType::Utf8, false), + Arc::new(Field::new("c121", DataType::Utf8, false)), Arc::new(StringArray::from(vec![Some("e"), Some("f"), Some("g")])) as ArrayRef, )])) as ArrayRef, @@ -1340,8 +1340,8 @@ mod tests { super::StringArray::from(vec!["foo", "bar", "baz", "qux", "quux"]); let values_array = super::Int64Array::from(vec![10, 20, 30, 40, 50]); - let keys = Field::new("keys", DataType::Utf8, false); - let values = Field::new("values", DataType::Int64, false); + let keys = Arc::new(Field::new("keys", DataType::Utf8, false)); + let values = Arc::new(Field::new("values", DataType::Int64, false)); let entry_struct = StructArray::from(vec![ (keys, Arc::new(keys_array) as ArrayRef), (values, Arc::new(values_array) as ArrayRef), diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 71e1de416617..9010c8d9a2a9 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1748,9 +1748,9 @@ mod tests { fn test_struct() { // Test basic let a = Arc::new(Int32Array::from(vec![1, 1, 2, 2])) as ArrayRef; - let a_f = Field::new("int", DataType::Int32, false); + let a_f = Arc::new(Field::new("int", DataType::Int32, false)); let u = Arc::new(StringArray::from(vec!["a", "b", "c", "d"])) as ArrayRef; - let u_f = Field::new("s", DataType::Utf8, false); + let u_f = Arc::new(Field::new("s", DataType::Utf8, false)); let s1 = Arc::new(StructArray::from(vec![(a_f, a), (u_f, u)])) as ArrayRef; let sort_fields = 
vec![SortField::new(s1.data_type().clone())]; diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index ed27520cc61d..0bf4c97ff827 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -326,7 +326,7 @@ mod tests { #[test] fn test_concat_struct_arrays() { - let field = Field::new("field", DataType::Int64, true); + let field = Arc::new(Field::new("field", DataType::Int64, true)); let input_primitive_1: ArrayRef = Arc::new(PrimitiveArray::::from(vec![ Some(-1), @@ -381,7 +381,7 @@ mod tests { #[test] fn test_concat_struct_array_slices() { - let field = Field::new("field", DataType::Int64, true); + let field = Arc::new(Field::new("field", DataType::Int64, true)); let input_primitive_1: ArrayRef = Arc::new(PrimitiveArray::::from(vec![ Some(-1), diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 3e7432530743..5d6507e71526 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -25,7 +25,7 @@ use arrow_array::types::*; use arrow_array::*; use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{ArrowError, DataType, Field}; +use arrow_schema::{ArrowError, DataType, FieldRef}; use num::{ToPrimitive, Zero}; @@ -163,8 +163,8 @@ where .iter() .map(|a| take_impl(a.as_ref(), indices, Some(options.clone()))) .collect::, _>>()?; - let fields: Vec<(Field, ArrayRef)> = - fields.iter().map(|f| f.as_ref().clone()).zip(arrays).collect(); + let fields: Vec<(FieldRef, ArrayRef)> = + fields.iter().cloned().zip(arrays).collect(); // Create the null bit buffer. let is_valid: Buffer = indices @@ -924,7 +924,7 @@ where mod tests { use super::*; use arrow_array::builder::*; - use arrow_schema::{Fields, TimeUnit}; + use arrow_schema::{Field, Fields, TimeUnit}; fn test_take_decimal_arrays( data: Vec>, diff --git a/arrow/examples/builders.rs b/arrow/examples/builders.rs index d0e6b31085e5..a6d8c563b4ca 100644 --- a/arrow/examples/builders.rs +++ b/arrow/examples/builders.rs @@ -119,12 +119,12 @@ fn main() { // helper, which takes the underlying arrays and field types. 
let struct_array = StructArray::from(vec![ ( - Field::new("b", DataType::Boolean, false), + Arc::new(Field::new("b", DataType::Boolean, false)), Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc, ), ( - Field::new("c", DataType::Int32, false), + Arc::new(Field::new("c", DataType::Int32, false)), Arc::new(Int32Array::from(vec![42, 28, 19, 31])), ), ]); diff --git a/arrow/examples/dynamic_types.rs b/arrow/examples/dynamic_types.rs index cb26a0d33f1e..5470131d6d41 100644 --- a/arrow/examples/dynamic_types.rs +++ b/arrow/examples/dynamic_types.rs @@ -49,15 +49,15 @@ fn main() -> Result<()> { let nested = StructArray::from(vec![ ( - Field::new("a", DataType::Utf8, false), + Arc::new(Field::new("a", DataType::Utf8, false)), Arc::new(StringArray::from(vec!["a", "b", "c", "d", "e"])) as Arc, ), ( - Field::new("b", DataType::Float64, false), + Arc::new(Field::new("b", DataType::Float64, false)), Arc::new(Float64Array::from(vec![1.1, 2.2, 3.3, 4.4, 5.5])), ), ( - Field::new("c", DataType::Float64, false), + Arc::new(Field::new("c", DataType::Float64, false)), Arc::new(Float64Array::from(vec![2.2, 3.3, 4.4, 5.5, 6.6])), ), ]); diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index 1611dc5303d6..56b9b6ecf8fd 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -123,28 +123,28 @@ mod tests { fn test_struct() -> Result<()> { let inner = StructArray::from(vec![ ( - Field::new("a1", DataType::Boolean, false), + Arc::new(Field::new("a1", DataType::Boolean, false)), Arc::new(BooleanArray::from(vec![true, true, false, false])) as Arc, ), ( - Field::new("a2", DataType::UInt32, false), + Arc::new(Field::new("a2", DataType::UInt32, false)), Arc::new(UInt32Array::from(vec![1, 2, 3, 4])), ), ]); let array = StructArray::from(vec![ ( - Field::new("a", inner.data_type().clone(), false), + Arc::new(Field::new("a", inner.data_type().clone(), false)), Arc::new(inner) as Arc, ), ( - Field::new("b", DataType::Boolean, false), + Arc::new(Field::new("b", DataType::Boolean, false)), Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc, ), ( - Field::new("c", DataType::UInt32, false), + Arc::new(Field::new("c", DataType::UInt32, false)), Arc::new(UInt32Array::from(vec![42, 28, 19, 31])), ), ]); diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 0af1b1111ca4..d8b5be69a517 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -1099,7 +1099,7 @@ mod tests { let metadata: HashMap = [("Hello".to_string(), "World! 
😊".to_string())].into(); let struct_array = StructArray::from(vec![( - Field::new("a", DataType::Int32, false).with_metadata(metadata), + Arc::new(Field::new("a", DataType::Int32, false).with_metadata(metadata)), Arc::new(Int32Array::from(vec![2, 4, 6])) as Arc, )]); diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 96a4f2b41f3c..bf7e7a326efc 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -127,12 +127,12 @@ fn get_arrays_of_all_types() -> Vec { Arc::new(make_fixed_size_binary_array()), Arc::new(StructArray::from(vec![ ( - Field::new("a", DataType::Boolean, false), + Arc::new(Field::new("a", DataType::Boolean, false)), Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc, ), ( - Field::new("b", DataType::Int32, false), + Arc::new(Field::new("b", DataType::Int32, false)), Arc::new(Int32Array::from(vec![42, 28, 19, 31])), ), ])), diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 7cd0007cce75..40938c80f4c3 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -764,11 +764,11 @@ fn test_map_nulls_append() { let expected_entry_array = StructArray::from(vec![ ( - Field::new("keys", DataType::Int64, false), + Arc::new(Field::new("keys", DataType::Int64, false)), Arc::new(expected_key_array) as ArrayRef, ), ( - Field::new("values", DataType::Int64, true), + Arc::new(Field::new("values", DataType::Int64, true)), Arc::new(expected_value_array) as ArrayRef, ), ]); diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index 11e019f29a59..600fda4fb6c4 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -292,7 +292,7 @@ mod tests { let validity = Buffer::from([0b00000111]); let struct_fields = vec![( - Field::new("foo", expected_l.data_type().clone(), true), + Arc::new(Field::new("foo", expected_l.data_type().clone(), true)), expected_l.clone() as ArrayRef, )]; let expected = StructArray::from((struct_fields, validity)); diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index fe6126ba486a..fc5b9460322a 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -675,7 +675,7 @@ mod tests { .unwrap(); let list = make_array(list); - let list_field = Field::new("list", list_type, true); + let list_field = Arc::new(Field::new("list", list_type, true)); let struct_array = StructArray::from((vec![(list_field, list)], Buffer::from([0b00011010]))); @@ -793,7 +793,7 @@ mod tests { .build() .unwrap(); let list = make_array(list); - let list_field = Field::new("list", list_type, true); + let list_field = Arc::new(Field::new("list", list_type, true)); let struct_array = StructArray::from(vec![(list_field, list)]); let array = Arc::new(struct_array) as ArrayRef; @@ -839,7 +839,7 @@ mod tests { .unwrap(); let list_2 = make_array(list_2); - let list_2_field = Field::new("list_2", list_2_type, true); + let list_2_field = Arc::new(Field::new("list_2", list_2_type, true)); let struct_array = StructArray::from((vec![(list_2_field, list_2)], Buffer::from([0b00001111]))); @@ -871,13 +871,13 @@ mod tests { // - {a: {b: {c: 6}}} let c = Int32Array::from_iter([Some(1), None, Some(3), None, Some(5), Some(6)]); - let c_field = Field::new("c", DataType::Int32, true); + let c_field = Arc::new(Field::new("c", DataType::Int32, true)); let b = StructArray::from(( (vec![(c_field, Arc::new(c) as 
ArrayRef)]), Buffer::from([0b00110111]), )); - let b_field = Field::new("b", b.data_type().clone(), true); + let b_field = Arc::new(Field::new("b", b.data_type().clone(), true)); let a = StructArray::from(( (vec![(b_field, Arc::new(b) as ArrayRef)]), Buffer::from([0b00101111]), @@ -944,18 +944,18 @@ mod tests { // this tests the level generation from the equivalent arrow_writer_complex test // define schema - let struct_field_d = Field::new("d", DataType::Float64, true); - let struct_field_f = Field::new("f", DataType::Float32, true); - let struct_field_g = Field::new( + let struct_field_d = Arc::new(Field::new("d", DataType::Float64, true)); + let struct_field_f = Arc::new(Field::new("f", DataType::Float32, true)); + let struct_field_g = Arc::new(Field::new( "g", DataType::List(Arc::new(Field::new("items", DataType::Int16, false))), false, - ); - let struct_field_e = Field::new( + )); + let struct_field_e = Arc::new(Field::new( "e", DataType::Struct(vec![struct_field_f.clone(), struct_field_g.clone()].into()), true, - ); + )); let schema = Schema::new(vec![ Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, true), @@ -1072,7 +1072,7 @@ mod tests { #[test] fn test_null_vs_nonnull_struct() { // define schema - let offset_field = Field::new("offset", DataType::Int32, true); + let offset_field = Arc::new(Field::new("offset", DataType::Int32, true)); let schema = Schema::new(vec![Field::new( "some_nested_object", DataType::Struct(vec![offset_field.clone()].into()), @@ -1095,7 +1095,7 @@ mod tests { // create second batch // define schema - let offset_field = Field::new("offset", DataType::Int32, true); + let offset_field = Arc::new(Field::new("offset", DataType::Int32, true)); let schema = Schema::new(vec![Field::new( "some_nested_object", DataType::Struct(vec![offset_field.clone()].into()), @@ -1286,7 +1286,7 @@ mod tests { // This test assumes that nulls don't take up space assert_eq!(inner.values().len(), 7); - let field = Field::new("list", inner.data_type().clone(), true); + let field = Arc::new(Field::new("list", inner.data_type().clone(), true)); let array = Arc::new(inner) as ArrayRef; let nulls = Buffer::from([0b01010111]); let struct_a = StructArray::from((vec![(field, array)], nulls)); @@ -1331,8 +1331,8 @@ mod tests { None, ])) as ArrayRef; - let field_a1 = Field::new("list", a1.data_type().clone(), true); - let field_a2 = Field::new("integers", a2.data_type().clone(), true); + let field_a1 = Arc::new(Field::new("list", a1.data_type().clone(), true)); + let field_a2 = Arc::new(Field::new("integers", a2.data_type().clone(), true)); let nulls = Buffer::from([0b00110111]); let struct_a = Arc::new( diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 3987cccf6c56..67fec4489cdd 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -879,13 +879,19 @@ mod tests { #[test] fn arrow_writer_complex() { // define schema - let struct_field_d = Field::new("d", DataType::Float64, true); - let struct_field_f = Field::new("f", DataType::Float32, true); - let struct_field_g = - Field::new_list("g", Field::new("item", DataType::Int16, true), false); - let struct_field_h = - Field::new_list("h", Field::new("item", DataType::Int16, false), true); - let struct_field_e = Field::new_struct( + let struct_field_d = Arc::new(Field::new("d", DataType::Float64, true)); + let struct_field_f = Arc::new(Field::new("f", DataType::Float32, true)); + let struct_field_g = Arc::new(Field::new_list( + "g", + 
Field::new("item", DataType::Int16, true), + false, + )); + let struct_field_h = Arc::new(Field::new_list( + "h", + Field::new("item", DataType::Int16, false), + true, + )); + let struct_field_e = Arc::new(Field::new_struct( "e", vec![ struct_field_f.clone(), @@ -893,7 +899,7 @@ mod tests { struct_field_h.clone(), ], false, - ); + )); let schema = Schema::new(vec![ Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, true), @@ -963,9 +969,9 @@ mod tests { // It was subsequently fixed while investigating https://github.com/apache/arrow-rs/issues/245. // define schema - let offset_field = Field::new("offset", DataType::Int32, false); - let partition_field = Field::new("partition", DataType::Int64, true); - let topic_field = Field::new("topic", DataType::Utf8, true); + let offset_field = Arc::new(Field::new("offset", DataType::Int32, false)); + let partition_field = Arc::new(Field::new("partition", DataType::Int64, true)); + let topic_field = Arc::new(Field::new("topic", DataType::Utf8, true)); let schema = Schema::new(vec![Field::new( "some_nested_object", DataType::Struct(Fields::from(vec![ @@ -1857,7 +1863,7 @@ mod tests { #[test] fn struct_single_column() { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); - let struct_field_a = Field::new("f", DataType::Int32, false); + let struct_field_a = Arc::new(Field::new("f", DataType::Int32, false)); let s = StructArray::from(vec![(struct_field_a, Arc::new(a_values) as ArrayRef)]); let values = Arc::new(s); @@ -2233,20 +2239,20 @@ mod tests { #[test] fn complex_aggregate() { // Tests aggregating nested data - let field_a = Field::new("leaf_a", DataType::Int32, false); - let field_b = Field::new("leaf_b", DataType::Int32, true); - let struct_a = Field::new( + let field_a = Arc::new(Field::new("leaf_a", DataType::Int32, false)); + let field_b = Arc::new(Field::new("leaf_b", DataType::Int32, true)); + let struct_a = Arc::new(Field::new( "struct_a", DataType::Struct(vec![field_a.clone(), field_b.clone()].into()), true, - ); + )); - let list_a = Field::new("list", DataType::List(Arc::new(struct_a)), true); - let struct_b = Field::new( + let list_a = Arc::new(Field::new("list", DataType::List(struct_a), true)); + let struct_b = Arc::new(Field::new( "struct_b", DataType::Struct(vec![list_a.clone()].into()), false, - ); + )); let schema = Arc::new(Schema::new(vec![struct_b])); From f28be8b1a5cce6840aace6a957178dbbe6c41b24 Mon Sep 17 00:00:00 2001 From: SteveLauC Date: Thu, 27 Apr 2023 01:13:26 +0800 Subject: [PATCH 0855/1411] docs: fix the wrong ln command in CONTRIBUTING.md (#4139) --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 67121f6cd5a3..9614ed2e5688 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -150,7 +150,7 @@ If the file already exists, to avoid mistakenly **overriding**, you MAY have to the link source or file content. 
Else if not exist, let's safely soft link [pre-commit.sh](pre-commit.sh) as file `.git/hooks/pre-commit`: ```bash -ln -s ../../rust/pre-commit.sh .git/hooks/pre-commit +ln -s ../../pre-commit.sh .git/hooks/pre-commit ``` If sometimes you want to commit without checking, just run `git commit` with `--no-verify`: From 341f2645962b32015b33e20738d9f803f26bae2b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 26 Apr 2023 14:07:13 -0400 Subject: [PATCH 0856/1411] Return BooleanBuffer from BooleanBufferBuilder (#4140) * Return BooleanBuffer from BooleanBufferBuilder * Clippy --- arrow-arith/src/arithmetic.rs | 9 +--- arrow-array/src/array/primitive_array.rs | 6 +-- .../src/builder/boolean_buffer_builder.rs | 43 +++++++++++-------- arrow-array/src/builder/boolean_builder.rs | 2 +- .../src/builder/null_buffer_builder.rs | 3 +- arrow-integration-test/Cargo.toml | 2 +- arrow-json/src/reader/list_array.rs | 6 +-- arrow-json/src/reader/map_array.rs | 6 +-- arrow-json/src/reader/struct_array.rs | 6 +-- arrow-row/src/dictionary.rs | 4 +- arrow-select/src/filter.rs | 4 +- arrow-select/src/interleave.rs | 2 +- arrow-select/src/nullif.rs | 2 +- arrow-string/src/like.rs | 2 +- arrow-string/src/regexp.rs | 4 +- parquet/src/arrow/array_reader/list_array.rs | 4 +- .../src/arrow/array_reader/primitive_array.rs | 2 +- .../src/arrow/array_reader/struct_array.rs | 4 +- .../arrow/record_reader/definition_levels.rs | 2 +- 19 files changed, 53 insertions(+), 60 deletions(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 2b8a2f3b7db2..7f5a081900df 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -1965,7 +1965,7 @@ mod tests { BooleanBufferBuilder, BufferBuilder, PrimitiveDictionaryBuilder, }; use arrow_array::temporal_conversions::SECONDS_IN_DAY; - use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; + use arrow_buffer::buffer::NullBuffer; use arrow_buffer::i256; use arrow_data::ArrayDataBuilder; use chrono::NaiveDate; @@ -3575,12 +3575,7 @@ mod tests { null_buffer_builder.resize(13); assert_eq!(null_buffer_builder.len(), 13); - let null_buffer = null_buffer_builder.finish(); - - // `count_set_bits_offset` takes len in bits as parameter. 
- assert_eq!(null_buffer.count_set_bits_offset(0, 13), 0); - - let nulls = BooleanBuffer::new(null_buffer, 0, 13); + let nulls = null_buffer_builder.finish(); assert_eq!(nulls.count_set_bits(), 0); let nulls = NullBuffer::new(nulls); assert_eq!(nulls.null_count(), 13); diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 9fb78eb1459d..8c8562b5be38 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -25,9 +25,7 @@ use crate::timezone::Tz; use crate::trusted_len::trusted_len_unzip; use crate::types::*; use crate::{Array, ArrayAccessor, ArrayRef}; -use arrow_buffer::{ - i256, ArrowNativeType, BooleanBuffer, Buffer, NullBuffer, ScalarBuffer, -}; +use arrow_buffer::{i256, ArrowNativeType, Buffer, NullBuffer, ScalarBuffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; @@ -642,7 +640,7 @@ impl PrimitiveArray { Ok::<_, ()>(()) }); - let nulls = BooleanBuffer::new(null_builder.finish(), 0, len); + let nulls = null_builder.finish(); let values = buffer.finish().into(); let nulls = unsafe { NullBuffer::new_unchecked(nulls, out_null_count) }; PrimitiveArray::new(values, Some(nulls)) diff --git a/arrow-array/src/builder/boolean_buffer_builder.rs b/arrow-array/src/builder/boolean_buffer_builder.rs index ac2a96feade0..f721504d08aa 100644 --- a/arrow-array/src/builder/boolean_buffer_builder.rs +++ b/arrow-array/src/builder/boolean_buffer_builder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow_buffer::{bit_util, Buffer, MutableBuffer}; +use arrow_buffer::{bit_util, BooleanBuffer, Buffer, MutableBuffer}; use arrow_data::bit_mask; use std::ops::Range; @@ -214,12 +214,12 @@ impl BooleanBufferBuilder { self.buffer.as_slice_mut() } - /// Creates a [`Buffer`] + /// Creates a [`BooleanBuffer`] #[inline] - pub fn finish(&mut self) -> Buffer { + pub fn finish(&mut self) -> BooleanBuffer { let buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); - self.len = 0; - buf.into() + let len = std::mem::replace(&mut self.len, 0); + BooleanBuffer::new(buf.into(), 0, len) } } @@ -230,6 +230,13 @@ impl From for Buffer { } } +impl From for BooleanBuffer { + #[inline] + fn from(builder: BooleanBufferBuilder) -> Self { + BooleanBuffer::new(builder.buffer.into(), 0, builder.len) + } +} + #[cfg(test)] mod tests { use super::*; @@ -244,7 +251,7 @@ mod tests { assert_eq!(4, b.len()); assert_eq!(512, b.capacity()); let buffer = b.finish(); - assert_eq!(1, buffer.len()); + assert_eq!(4, buffer.len()); // Overallocate capacity let mut b = BooleanBufferBuilder::new(8); @@ -252,7 +259,7 @@ mod tests { assert_eq!(4, b.len()); assert_eq!(512, b.capacity()); let buffer = b.finish(); - assert_eq!(1, buffer.len()); + assert_eq!(4, buffer.len()); } #[test] @@ -264,7 +271,7 @@ mod tests { buffer.append(true); buffer.set_bit(0, false); assert_eq!(buffer.len(), 4); - assert_eq!(buffer.finish().as_slice(), &[0b1010_u8]); + assert_eq!(buffer.finish().values(), &[0b1010_u8]); } #[test] @@ -276,7 +283,7 @@ mod tests { buffer.append(true); buffer.set_bit(3, false); assert_eq!(buffer.len(), 4); - assert_eq!(buffer.finish().as_slice(), &[0b0011_u8]); + assert_eq!(buffer.finish().values(), &[0b0011_u8]); } #[test] @@ -288,7 +295,7 @@ mod tests { buffer.append(true); buffer.set_bit(1, false); assert_eq!(buffer.len(), 4); - assert_eq!(buffer.finish().as_slice(), &[0b1001_u8]); + 
assert_eq!(buffer.finish().values(), &[0b1001_u8]); } #[test] @@ -302,7 +309,7 @@ mod tests { buffer.set_bit(1, false); buffer.set_bit(2, false); assert_eq!(buffer.len(), 5); - assert_eq!(buffer.finish().as_slice(), &[0b10001_u8]); + assert_eq!(buffer.finish().values(), &[0b10001_u8]); } #[test] @@ -313,7 +320,7 @@ mod tests { buffer.set_bit(3, false); buffer.set_bit(9, false); assert_eq!(buffer.len(), 10); - assert_eq!(buffer.finish().as_slice(), &[0b11110110_u8, 0b01_u8]); + assert_eq!(buffer.finish().values(), &[0b11110110_u8, 0b01_u8]); } #[test] @@ -329,7 +336,7 @@ mod tests { buffer.set_bit(14, true); buffer.set_bit(13, false); assert_eq!(buffer.len(), 15); - assert_eq!(buffer.finish().as_slice(), &[0b01010110_u8, 0b1011100_u8]); + assert_eq!(buffer.finish().values(), &[0b01010110_u8, 0b1011100_u8]); } #[test] @@ -394,7 +401,7 @@ mod tests { let start = a.min(b); let end = a.max(b); - buffer.append_packed_range(start..end, compacted_src.as_slice()); + buffer.append_packed_range(start..end, compacted_src.values()); all_bools.extend_from_slice(&src[start..end]); } @@ -430,14 +437,14 @@ mod tests { let mut builder = BooleanBufferBuilder::new_from_buffer(b, 2); builder.advance(2); let finished = builder.finish(); - assert_eq!(finished.as_slice(), &[0b00000011]); + assert_eq!(finished.values(), &[0b00000011]); let mut builder = BooleanBufferBuilder::new(10); builder.append_n(5, true); builder.resize(3); builder.advance(2); let finished = builder.finish(); - assert_eq!(finished.as_slice(), &[0b00000111]); + assert_eq!(finished.values(), &[0b00000111]); let mut builder = BooleanBufferBuilder::new(10); builder.append_n(16, true); @@ -478,7 +485,7 @@ mod tests { } let buf2 = builder.finish(); - assert_eq!(buf.len(), buf2.len()); - assert_eq!(buf.as_slice(), buf2.as_slice()); + assert_eq!(buf.len(), buf2.inner().len()); + assert_eq!(buf.as_slice(), buf2.values()); } } diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index bc3b62f99234..c7974967a700 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -149,7 +149,7 @@ impl BooleanBuilder { let null_bit_buffer = self.null_buffer_builder.finish(); let builder = ArrayData::builder(DataType::Boolean) .len(len) - .add_buffer(self.values_builder.finish()) + .add_buffer(self.values_builder.finish().into_inner()) .null_bit_buffer(null_bit_buffer); let array_data = unsafe { builder.build_unchecked() }; diff --git a/arrow-array/src/builder/null_buffer_builder.rs b/arrow-array/src/builder/null_buffer_builder.rs index 0061f70c7ed4..f37ce3a747ff 100644 --- a/arrow-array/src/builder/null_buffer_builder.rs +++ b/arrow-array/src/builder/null_buffer_builder.rs @@ -129,8 +129,7 @@ impl NullBufferBuilder { /// Builds the null buffer and resets the builder. /// Returns `None` if the builder only contains `true`s. 
pub fn finish(&mut self) -> Option { - let buf = self.bitmap_builder.as_mut().map(|b| b.finish()); - self.bitmap_builder = None; + let buf = self.bitmap_builder.take().map(Into::into); self.len = 0; buf } diff --git a/arrow-integration-test/Cargo.toml b/arrow-integration-test/Cargo.toml index 6ede476eb569..8afbfacff7c3 100644 --- a/arrow-integration-test/Cargo.toml +++ b/arrow-integration-test/Cargo.toml @@ -35,7 +35,7 @@ bench = false [dependencies] arrow = { workspace = true } -arrow-buffer = { workspace = true, path = "../arrow-buffer" } +arrow-buffer = { workspace = true } hex = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-json/src/reader/list_array.rs b/arrow-json/src/reader/list_array.rs index aa3538bd5349..ad27eb516fab 100644 --- a/arrow-json/src/reader/list_array.rs +++ b/arrow-json/src/reader/list_array.rs @@ -19,7 +19,7 @@ use crate::reader::tape::{Tape, TapeElement}; use crate::reader::{make_decoder, ArrayDecoder}; use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; use arrow_array::OffsetSizeTrait; -use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; +use arrow_buffer::buffer::NullBuffer; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use std::marker::PhantomData; @@ -99,9 +99,7 @@ impl ArrayDecoder for ListArrayDecoder { } let child_data = self.decoder.decode(tape, &child_pos)?; - let nulls = nulls - .as_mut() - .map(|x| NullBuffer::new(BooleanBuffer::new(x.finish(), 0, pos.len()))); + let nulls = nulls.as_mut().map(|x| NullBuffer::new(x.finish())); let data = ArrayDataBuilder::new(self.data_type.clone()) .len(pos.len()) diff --git a/arrow-json/src/reader/map_array.rs b/arrow-json/src/reader/map_array.rs index 5e800a0d62dd..2d6fde34d433 100644 --- a/arrow-json/src/reader/map_array.rs +++ b/arrow-json/src/reader/map_array.rs @@ -18,7 +18,7 @@ use crate::reader::tape::{Tape, TapeElement}; use crate::reader::{make_decoder, ArrayDecoder}; use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; -use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; +use arrow_buffer::buffer::NullBuffer; use arrow_buffer::ArrowNativeType; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; @@ -139,9 +139,7 @@ impl ArrayDecoder for MapArrayDecoder { // Valid by construction let struct_data = unsafe { struct_data.build_unchecked() }; - let nulls = nulls - .as_mut() - .map(|x| NullBuffer::new(BooleanBuffer::new(x.finish(), 0, pos.len()))); + let nulls = nulls.as_mut().map(|x| NullBuffer::new(x.finish())); let builder = ArrayDataBuilder::new(self.data_type.clone()) .len(pos.len()) diff --git a/arrow-json/src/reader/struct_array.rs b/arrow-json/src/reader/struct_array.rs index 707b56d50eef..3d24a927d85c 100644 --- a/arrow-json/src/reader/struct_array.rs +++ b/arrow-json/src/reader/struct_array.rs @@ -18,7 +18,7 @@ use crate::reader::tape::{Tape, TapeElement}; use crate::reader::{make_decoder, ArrayDecoder}; use arrow_array::builder::BooleanBufferBuilder; -use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; +use arrow_buffer::buffer::NullBuffer; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Fields}; @@ -113,9 +113,7 @@ impl ArrayDecoder for StructArrayDecoder { }) .collect::, ArrowError>>()?; - let nulls = nulls - .as_mut() - .map(|x| 
NullBuffer::new(BooleanBuffer::new(x.finish(), 0, pos.len()))); + let nulls = nulls.as_mut().map(|x| NullBuffer::new(x.finish())); for (c, f) in child_data.iter().zip(fields) { // Sanity check diff --git a/arrow-row/src/dictionary.rs b/arrow-row/src/dictionary.rs index 273b7439d0d1..d790d951ee3a 100644 --- a/arrow-row/src/dictionary.rs +++ b/arrow-row/src/dictionary.rs @@ -202,7 +202,7 @@ pub unsafe fn decode_dictionary( let builder = ArrayDataBuilder::new(data_type) .len(len) - .null_bit_buffer(Some(null_builder.finish())) + .null_bit_buffer(Some(null_builder.into())) .null_count(null_count) .add_buffer(keys.finish()) .add_child_data(child); @@ -250,7 +250,7 @@ fn decode_bool(values: &[&[u8]]) -> ArrayData { let builder = ArrayDataBuilder::new(DataType::Boolean) .len(values.len()) - .add_buffer(builder.finish()); + .add_buffer(builder.into()); // SAFETY: Buffers correct length unsafe { builder.build_unchecked() } diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 06f0833561d6..c89491944a21 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -433,7 +433,7 @@ fn filter_bits(buffer: &BooleanBuffer, predicate: &FilterPredicate) -> Buffer { for (start, end) in SlicesIterator::new(&predicate.filter) { builder.append_packed_range(start + offset..end + offset, src) } - builder.finish() + builder.into() } IterationStrategy::Slices(slices) => { let mut builder = @@ -441,7 +441,7 @@ fn filter_bits(buffer: &BooleanBuffer, predicate: &FilterPredicate) -> Buffer { for (start, end) in slices { builder.append_packed_range(*start + offset..*end + offset, src) } - builder.finish() + builder.into() } IterationStrategy::All | IterationStrategy::None => unreachable!(), } diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs index 491395d1cc1a..c0d2026808af 100644 --- a/arrow-select/src/interleave.rs +++ b/arrow-select/src/interleave.rs @@ -122,7 +122,7 @@ impl<'a, T: Array + 'static> Interleave<'a, T> { null_count += !v as usize; builder.append(v) } - builder.finish() + builder.into() }); Self { diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index aaa3423d69e5..3d9148016af0 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -105,7 +105,7 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result( let data = unsafe { ArrayDataBuilder::new(DataType::Boolean) .len(array.len()) - .buffers(vec![result.finish()]) + .buffers(vec![result.into()]) .nulls(nulls) .build_unchecked() }; @@ -136,7 +136,7 @@ pub fn regexp_is_match_utf8_scalar( } } - let buffer = result.finish(); + let buffer = result.into(); let data = unsafe { ArrayData::new_unchecked( DataType::Boolean, diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index a6b354f902df..932034417c81 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -220,9 +220,9 @@ impl ArrayReader for ListArrayReader { .add_buffer(value_offsets) .add_child_data(child_data); - if let Some(mut builder) = validity { + if let Some(builder) = validity { assert_eq!(builder.len(), list_offsets.len() - 1); - data_builder = data_builder.null_bit_buffer(Some(builder.finish())) + data_builder = data_builder.null_bit_buffer(Some(builder.into())) } let list_data = unsafe { data_builder.build_unchecked() }; diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index 012cad5c4c69..772026960a3f 100644 
--- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -147,7 +147,7 @@ where for e in record_data.as_slice() { boolean_buffer.append(*e > 0); } - boolean_buffer.finish() + boolean_buffer.into() } PhysicalType::INT96 => { // SAFETY - record_data is an aligned buffer of Int96 diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index 600fda4fb6c4..a147c4e9557e 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -17,7 +17,7 @@ use crate::arrow::array_reader::ArrayReader; use crate::errors::{ParquetError, Result}; -use arrow_array::{builder::BooleanBufferBuilder, ArrayRef, StructArray, Array}; +use arrow_array::{builder::BooleanBufferBuilder, Array, ArrayRef, StructArray}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType as ArrowType; use std::any::Any; @@ -170,7 +170,7 @@ impl ArrayReader for StructArrayReader { } array_data_builder = - array_data_builder.null_bit_buffer(Some(bitmap_builder.finish())); + array_data_builder.null_bit_buffer(Some(bitmap_builder.into())); } let array_data = unsafe { array_data_builder.build_unchecked() }; diff --git a/parquet/src/arrow/record_reader/definition_levels.rs b/parquet/src/arrow/record_reader/definition_levels.rs index 7c27a365fc28..272716caf664 100644 --- a/parquet/src/arrow/record_reader/definition_levels.rs +++ b/parquet/src/arrow/record_reader/definition_levels.rs @@ -123,7 +123,7 @@ impl DefinitionLevelBuffer { // Swap into self self.len = new_builder.len(); - std::mem::replace(old_builder, new_builder).finish() + std::mem::replace(old_builder, new_builder).into() } pub fn nulls(&self) -> &BooleanBufferBuilder { From 83f784da567807d238d810e14d6ee3e646d99577 Mon Sep 17 00:00:00 2001 From: SteveLauC Date: Thu, 27 Apr 2023 02:50:10 +0800 Subject: [PATCH 0857/1411] Infer Float64 for JSON Numerics Beyond Bounds of i64 (#4138) * Prioritize Int64 in infer_json_schema() * add test --- arrow-json/src/reader/schema.rs | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs index 427c20e027d6..c8250ac37716 100644 --- a/arrow-json/src/reader/schema.rs +++ b/arrow-json/src/reader/schema.rs @@ -442,11 +442,10 @@ fn collect_field_types_from_object( // inferring } Value::Number(n) => { - if n.is_f64() { - set_object_scalar_field_type(field_types, k, DataType::Float64)?; - } else { - // default to i64 + if n.is_i64() { set_object_scalar_field_type(field_types, k, DataType::Int64)?; + } else { + set_object_scalar_field_type(field_types, k, DataType::Float64)?; } } Value::String(_) => { @@ -663,6 +662,24 @@ mod tests { assert_eq!(inferred_schema, schema); } + #[test] + fn test_infer_json_schema_bigger_than_i64_max() { + let bigger_than_i64_max = (i64::MAX as i128) + 1; + let smaller_than_i64_min = (i64::MIN as i128) - 1; + let json = format!( + "{{ \"bigger_than_i64_max\": {}, \"smaller_than_i64_min\": {} }}", + bigger_than_i64_max, smaller_than_i64_min + ); + let mut buf_reader = BufReader::new(json.as_bytes()); + let inferred_schema = infer_json_schema(&mut buf_reader, Some(1)).unwrap(); + let fields = inferred_schema.fields(); + + let (_, big_field) = fields.find("bigger_than_i64_max").unwrap(); + assert_eq!(big_field.data_type(), &DataType::Float64); + let (_, small_field) = fields.find("smaller_than_i64_min").unwrap(); + assert_eq!(small_field.data_type(), 
&DataType::Float64); + } + #[test] fn test_coercion_scalar_and_list() { use arrow_schema::DataType::*; From b1642ab150ee61f730b2cda51bb917d42d9aeeb1 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Wed, 26 Apr 2023 21:42:12 +0200 Subject: [PATCH 0858/1411] chore: clean the code by using as_primitive (#4143) --- arrow-cast/src/cast.rs | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 61a296e99a4f..12e80cab4ffe 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -598,7 +598,7 @@ where D: DecimalType + ArrowPrimitiveType, ::Native: ArrowNativeTypeOp + ToPrimitive, { - let array = array.as_any().downcast_ref::>().unwrap(); + let array = array.as_primitive::(); let div: D::Native = base.pow_checked(scale as u32).map_err(|_| { ArrowError::CastError(format!( @@ -655,7 +655,7 @@ fn cast_decimal_to_float( where F: Fn(D::Native) -> T::Native, { - let array = array.as_any().downcast_ref::>().unwrap(); + let array = array.as_primitive::(); let array = array.unary::<_, T>(op); Ok(Arc::new(array)) } @@ -2404,16 +2404,12 @@ where if cast_options.safe { // If the value can't be casted to the `TO::Native`, return null Ok(Arc::new(numeric_cast::( - from.as_any() - .downcast_ref::>() - .unwrap(), + from.as_primitive::(), ))) } else { // If the value can't be casted to the `TO::Native`, return error Ok(Arc::new(try_numeric_cast::( - from.as_any() - .downcast_ref::>() - .unwrap(), + from.as_primitive::(), )?)) } } @@ -3265,12 +3261,8 @@ fn cast_numeric_to_bool(from: &dyn Array) -> Result where FROM: ArrowPrimitiveType, { - numeric_to_bool_cast::( - from.as_any() - .downcast_ref::>() - .unwrap(), - ) - .map(|to| Arc::new(to) as ArrayRef) + numeric_to_bool_cast::(from.as_primitive::()) + .map(|to| Arc::new(to) as ArrayRef) } fn numeric_to_bool_cast(from: &PrimitiveArray) -> Result @@ -3537,10 +3529,7 @@ where { // attempt to cast the source array values to the target value type (the dictionary values type) let cast_values = cast_with_options(array, dict_value_type, cast_options)?; - let values = cast_values - .as_any() - .downcast_ref::>() - .unwrap(); + let values = cast_values.as_primitive::(); let mut b = PrimitiveDictionaryBuilder::::with_capacity(values.len(), values.len()); @@ -7106,7 +7095,7 @@ mod tests { T: ArrowPrimitiveType, { let c = cast(array, dt).unwrap(); - let a = c.as_any().downcast_ref::>().unwrap(); + let a = c.as_primitive::(); let mut v: Vec = vec![]; for i in 0..array.len() { if a.is_null(i) { From d8a3b1c95d88292985d4f2dc306836b3143b8a6b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 27 Apr 2023 07:29:44 -0400 Subject: [PATCH 0859/1411] Don't Duplicate Offset Index on RowGroupMetadata (#4142) * Remove offset index from RowGroupMetadata * Rename index accessors * Update layout test --- parquet/src/arrow/async_reader/mod.rs | 90 +++++++++++---------------- parquet/src/bin/parquet-index.rs | 4 +- parquet/src/file/metadata.rs | 60 +++++++----------- parquet/src/file/serialized_reader.rs | 85 +++++++++++++------------ parquet/tests/arrow_writer_layout.rs | 10 ++- 5 files changed, 112 insertions(+), 137 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 2d39284c763f..a0e7ff72a153 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -84,7 +84,7 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use 
crate::format::OffsetIndex; +use crate::format::{OffsetIndex, PageLocation}; use bytes::{Buf, Bytes}; use futures::future::{BoxFuture, FutureExt}; use futures::ready; @@ -237,10 +237,8 @@ impl ArrowReaderBuilder> { let mut metadata = input.get_metadata().await?; if options.page_index - && metadata - .page_indexes() - .zip(metadata.offset_indexes()) - .is_none() + && metadata.column_index().is_none() + && metadata.offset_index().is_none() { let mut fetch_ranges = vec![]; let mut index_lengths: Vec> = vec![]; @@ -284,7 +282,6 @@ impl ArrowReaderBuilder> { offset_index.push(offset.page_locations); } - rg.set_page_offset(offset_index.clone()); offset_indexes.push(offset_index); let index_data = chunks.next().unwrap(); @@ -399,11 +396,17 @@ where // TODO: calling build_array multiple times is wasteful let meta = self.metadata.row_group(row_group_idx); + let page_locations = self + .metadata + .offset_index() + .map(|x| x[row_group_idx].as_slice()); + let mut row_group = InMemoryRowGroup { metadata: meta, // schema: meta.schema_descr_ptr(), row_count: meta.num_rows() as usize, column_chunks: vec![None; meta.columns().len()], + page_locations, }; if let Some(filter) = self.filter.as_mut() { @@ -614,6 +617,7 @@ where /// An in-memory collection of column chunks struct InMemoryRowGroup<'a> { metadata: &'a RowGroupMetaData, + page_locations: Option<&'a [Vec]>, column_chunks: Vec>>, row_count: usize, } @@ -626,9 +630,7 @@ impl<'a> InMemoryRowGroup<'a> { projection: &ProjectionMask, selection: Option<&RowSelection>, ) -> Result<()> { - if let Some((selection, page_locations)) = - selection.zip(self.metadata.page_offset_index().as_ref()) - { + if let Some((selection, page_locations)) = selection.zip(self.page_locations) { // If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the // `RowSelection` let mut page_start_offsets: Vec> = vec![]; @@ -730,11 +732,7 @@ impl<'a> RowGroupCollection for InMemoryRowGroup<'a> { "Invalid column index {i}, column was not fetched" ))), Some(data) => { - let page_locations = self - .metadata - .page_offset_index() - .as_ref() - .map(|index| index[i].clone()); + let page_locations = self.page_locations.map(|index| index[i].clone()); let page_reader: Box = Box::new(SerializedPageReader::new( data.clone(), @@ -947,19 +945,24 @@ mod tests { let metadata_with_index = builder.metadata(); // Check offset indexes are present for all columns - for rg in metadata_with_index.row_groups() { - let page_locations = - rg.page_offset_index().expect("expected page offset index"); - assert_eq!(page_locations.len(), rg.columns().len()) - } + let offset_index = metadata_with_index.offset_index().unwrap(); + let column_index = metadata_with_index.column_index().unwrap(); + + assert_eq!(offset_index.len(), metadata_with_index.num_row_groups()); + assert_eq!(column_index.len(), metadata_with_index.num_row_groups()); + + let num_columns = metadata_with_index + .file_metadata() + .schema_descr() + .num_columns(); // Check page indexes are present for all columns - let page_indexes = metadata_with_index - .page_indexes() - .expect("expected page indexes"); - for (idx, rg) in metadata_with_index.row_groups().iter().enumerate() { - assert_eq!(page_indexes[idx].len(), rg.columns().len()) - } + offset_index + .iter() + .for_each(|x| assert_eq!(x.len(), num_columns)); + column_index + .iter() + .for_each(|x| assert_eq!(x.len(), num_columns)); let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![1, 2]); let stream = builder @@ -999,29 +1002,9 @@ mod tests { 
requests: Default::default(), }; - let options = ArrowReaderOptions::new().with_page_index(true); - let builder = - ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) - .await - .unwrap(); - - // The builder should have page and offset indexes loaded now - let metadata_with_index = builder.metadata(); - - // Check offset indexes are present for all columns - for rg in metadata_with_index.row_groups() { - let page_locations = - rg.page_offset_index().expect("expected page offset index"); - assert_eq!(page_locations.len(), rg.columns().len()) - } - - // Check page indexes are present for all columns - let page_indexes = metadata_with_index - .page_indexes() - .expect("expected page indexes"); - for (idx, rg) in metadata_with_index.row_groups().iter().enumerate() { - assert_eq!(page_indexes[idx].len(), rg.columns().len()) - } + let builder = ParquetRecordBatchStreamBuilder::new(async_reader) + .await + .unwrap(); let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![1, 2]); let stream = builder @@ -1473,10 +1456,13 @@ mod tests { index_reader::read_pages_locations(&data, metadata.row_group(0).columns()) .expect("reading offset index"); - let mut row_group_meta = metadata.row_group(0).clone(); - row_group_meta.set_page_offset(offset_index.clone()); - let metadata = - ParquetMetaData::new(metadata.file_metadata().clone(), vec![row_group_meta]); + let row_group_meta = metadata.row_group(0).clone(); + let metadata = ParquetMetaData::new_with_page_index( + metadata.file_metadata().clone(), + vec![row_group_meta], + None, + Some(vec![offset_index.clone()]), + ); let metadata = Arc::new(metadata); diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs index d8a72dd796eb..4b82c21967a0 100644 --- a/parquet/src/bin/parquet-index.rs +++ b/parquet/src/bin/parquet-index.rs @@ -70,13 +70,13 @@ impl Args { // Column index data for all row groups and columns let column_index = reader .metadata() - .page_indexes() + .column_index() .ok_or_else(|| ParquetError::General("Column index not found".to_string()))?; // Offset index data for all row groups and columns let offset_index = reader .metadata() - .offset_indexes() + .offset_index() .ok_or_else(|| ParquetError::General("Offset index not found".to_string()))?; // Iterate through each row group diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index a83f02dfdf86..41097e1075a9 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -77,9 +77,9 @@ pub struct ParquetMetaData { file_metadata: FileMetaData, row_groups: Vec, /// Page index for all pages in each column chunk - page_indexes: Option, + column_index: Option, /// Offset index for all pages in each column chunk - offset_indexes: Option, + offset_index: Option, } impl ParquetMetaData { @@ -89,8 +89,8 @@ impl ParquetMetaData { ParquetMetaData { file_metadata, row_groups, - page_indexes: None, - offset_indexes: None, + column_index: None, + offset_index: None, } } @@ -99,14 +99,14 @@ impl ParquetMetaData { pub fn new_with_page_index( file_metadata: FileMetaData, row_groups: Vec, - page_indexes: Option, - offset_indexes: Option, + column_index: Option, + offset_index: Option, ) -> Self { ParquetMetaData { file_metadata, row_groups, - page_indexes, - offset_indexes, + column_index, + offset_index, } } @@ -132,13 +132,25 @@ impl ParquetMetaData { } /// Returns page indexes in this file. 
+ #[deprecated(note = "Use Self::column_index")] pub fn page_indexes(&self) -> Option<&ParquetColumnIndex> { - self.page_indexes.as_ref() + self.column_index.as_ref() } - /// Returns offset indexes in this file. + /// Returns the column index for this file if loaded + pub fn column_index(&self) -> Option<&ParquetColumnIndex> { + self.column_index.as_ref() + } + + /// Returns the offset index for this file if loaded + #[deprecated(note = "Use Self::offset_index")] pub fn offset_indexes(&self) -> Option<&ParquetOffsetIndex> { - self.offset_indexes.as_ref() + self.offset_index.as_ref() + } + + /// Returns offset indexes in this file. + pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> { + self.offset_index.as_ref() } } @@ -252,8 +264,6 @@ pub struct RowGroupMetaData { sorting_columns: Option>, total_byte_size: i64, schema_descr: SchemaDescPtr, - /// `page_offset_index[column_number][page_number]` - page_offset_index: Option>>, } impl RowGroupMetaData { @@ -297,13 +307,6 @@ impl RowGroupMetaData { self.columns.iter().map(|c| c.total_compressed_size).sum() } - /// Returns reference of page offset index of all column in this row group. - /// - /// The returned vector contains `page_offset[column_number][page_number]` - pub fn page_offset_index(&self) -> Option<&Vec>> { - self.page_offset_index.as_ref() - } - /// Returns reference to a schema descriptor. pub fn schema_descr(&self) -> &SchemaDescriptor { self.schema_descr.as_ref() @@ -314,13 +317,6 @@ impl RowGroupMetaData { self.schema_descr.clone() } - /// Sets page offset index for this row group. - /// - /// The vector represents `page_offset[column_number][page_number]` - pub fn set_page_offset(&mut self, page_offset: Vec>) { - self.page_offset_index = Some(page_offset); - } - /// Method to convert from Thrift. pub fn from_thrift( schema_descr: SchemaDescPtr, @@ -341,7 +337,6 @@ impl RowGroupMetaData { sorting_columns, total_byte_size, schema_descr, - page_offset_index: None, }) } @@ -366,7 +361,6 @@ pub struct RowGroupMetaDataBuilder { num_rows: i64, sorting_columns: Option>, total_byte_size: i64, - page_offset_index: Option>>, } impl RowGroupMetaDataBuilder { @@ -378,7 +372,6 @@ impl RowGroupMetaDataBuilder { num_rows: 0, sorting_columns: None, total_byte_size: 0, - page_offset_index: None, } } @@ -406,12 +399,6 @@ impl RowGroupMetaDataBuilder { self } - /// Sets page offset index for this row group. - pub fn set_page_offset(mut self, page_offset: Vec>) -> Self { - self.page_offset_index = Some(page_offset); - self - } - /// Builds row group metadata. pub fn build(self) -> Result { if self.schema_descr.num_columns() != self.columns.len() { @@ -428,7 +415,6 @@ impl RowGroupMetaDataBuilder { sorting_columns: self.sorting_columns, total_byte_size: self.total_byte_size, schema_descr: self.schema_descr, - page_offset_index: self.page_offset_index, }) } } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 2ddbf0f7c29b..7346b1a12b83 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -269,7 +269,6 @@ impl SerializedFileReader { index_reader::read_columns_indexes(&chunk_reader, rg.columns())?; let offset_index = index_reader::read_pages_locations(&chunk_reader, rg.columns())?; - rg.set_page_offset(offset_index.clone()); columns_indexes.push(column_index); offset_indexes.push(offset_index); } @@ -328,9 +327,10 @@ impl FileReader for SerializedFileReader { // Row groups should be processed sequentially. 
let props = Arc::clone(&self.props); let f = Arc::clone(&self.chunk_reader); - Ok(Box::new(SerializedRowGroupReader::new_with_properties( + Ok(Box::new(SerializedRowGroupReader::new( f, row_group_metadata, + self.metadata.offset_index().map(|x| x[i].as_slice()), props, )?)) } @@ -344,15 +344,17 @@ impl FileReader for SerializedFileReader { pub struct SerializedRowGroupReader<'a, R: ChunkReader> { chunk_reader: Arc, metadata: &'a RowGroupMetaData, + page_locations: Option<&'a [Vec]>, props: ReaderPropertiesPtr, bloom_filters: Vec>, } impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { /// Creates new row group reader from a file, row group metadata and custom config. - fn new_with_properties( + fn new( chunk_reader: Arc, metadata: &'a RowGroupMetaData, + page_locations: Option<&'a [Vec]>, props: ReaderPropertiesPtr, ) -> Result { let bloom_filters = if props.read_bloom_filter() { @@ -367,6 +369,7 @@ impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { Ok(Self { chunk_reader, metadata, + page_locations, props, bloom_filters, }) @@ -386,11 +389,7 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<' fn get_column_page_reader(&self, i: usize) -> Result> { let col = self.metadata.column(i); - let page_locations = self - .metadata - .page_offset_index() - .as_ref() - .map(|x| x[i].clone()); + let page_locations = self.page_locations.map(|x| x[i].clone()); let props = Arc::clone(&self.props); Ok(Box::new(SerializedPageReader::new_with_properties( @@ -1350,11 +1349,11 @@ mod tests { let metadata = reader.metadata(); assert_eq!(metadata.num_row_groups(), 1); - let page_indexes = metadata.page_indexes().unwrap(); + let column_index = metadata.column_index().unwrap(); // only one row group - assert_eq!(page_indexes.len(), 1); - let index = if let Index::BYTE_ARRAY(index) = &page_indexes[0][0] { + assert_eq!(column_index.len(), 1); + let index = if let Index::BYTE_ARRAY(index) = &column_index[0][0] { index } else { unreachable!() @@ -1372,7 +1371,7 @@ mod tests { assert_eq!(b"Hello", min.as_bytes()); assert_eq!(b"today", max.as_bytes()); - let offset_indexes = metadata.offset_indexes().unwrap(); + let offset_indexes = metadata.offset_index().unwrap(); // only one row group assert_eq!(offset_indexes.len(), 1); let offset_index = &offset_indexes[0]; @@ -1396,19 +1395,19 @@ mod tests { let metadata = reader.metadata(); assert_eq!(metadata.num_row_groups(), 1); - let page_indexes = metadata.page_indexes().unwrap(); - let row_group_offset_indexes = &metadata.offset_indexes().unwrap()[0]; + let column_index = metadata.column_index().unwrap(); + let row_group_offset_indexes = &metadata.offset_index().unwrap()[0]; // only one row group - assert_eq!(page_indexes.len(), 1); + assert_eq!(column_index.len(), 1); let row_group_metadata = metadata.row_group(0); //col0->id: INT32 UNCOMPRESSED DO:0 FPO:4 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 7299, num_nulls: 0] - assert!(!&page_indexes[0][0].is_sorted()); - let boundary_order = &page_indexes[0][0].get_boundary_order(); + assert!(!&column_index[0][0].is_sorted()); + let boundary_order = &column_index[0][0].get_boundary_order(); assert!(boundary_order.is_some()); matches!(boundary_order.unwrap(), BoundaryOrder::UNORDERED); - if let Index::INT32(index) = &page_indexes[0][0] { + if let Index::INT32(index) = &column_index[0][0] { check_native_page_index( index, 325, @@ -1420,16 +1419,16 @@ mod tests { unreachable!() }; //col1->bool_col:BOOLEAN UNCOMPRESSED DO:0 FPO:37329 SZ:3022/3022/1.00 VC:7300 
ENC:BIT_PACKED,RLE,PLAIN ST:[min: false, max: true, num_nulls: 0] - assert!(&page_indexes[0][1].is_sorted()); - if let Index::BOOLEAN(index) = &page_indexes[0][1] { + assert!(&column_index[0][1].is_sorted()); + if let Index::BOOLEAN(index) = &column_index[0][1] { assert_eq!(index.indexes.len(), 82); assert_eq!(row_group_offset_indexes[1].len(), 82); } else { unreachable!() }; //col2->tinyint_col: INT32 UNCOMPRESSED DO:0 FPO:40351 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] - assert!(&page_indexes[0][2].is_sorted()); - if let Index::INT32(index) = &page_indexes[0][2] { + assert!(&column_index[0][2].is_sorted()); + if let Index::INT32(index) = &column_index[0][2] { check_native_page_index( index, 325, @@ -1441,8 +1440,8 @@ mod tests { unreachable!() }; //col4->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] - assert!(&page_indexes[0][3].is_sorted()); - if let Index::INT32(index) = &page_indexes[0][3] { + assert!(&column_index[0][3].is_sorted()); + if let Index::INT32(index) = &column_index[0][3] { check_native_page_index( index, 325, @@ -1454,8 +1453,8 @@ mod tests { unreachable!() }; //col5->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, num_nulls: 0] - assert!(&page_indexes[0][4].is_sorted()); - if let Index::INT32(index) = &page_indexes[0][4] { + assert!(&column_index[0][4].is_sorted()); + if let Index::INT32(index) = &column_index[0][4] { check_native_page_index( index, 325, @@ -1467,8 +1466,8 @@ mod tests { unreachable!() }; //col6->bigint_col: INT64 UNCOMPRESSED DO:0 FPO:152326 SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 90, num_nulls: 0] - assert!(!&page_indexes[0][5].is_sorted()); - if let Index::INT64(index) = &page_indexes[0][5] { + assert!(!&column_index[0][5].is_sorted()); + if let Index::INT64(index) = &column_index[0][5] { check_native_page_index( index, 528, @@ -1480,8 +1479,8 @@ mod tests { unreachable!() }; //col7->float_col: FLOAT UNCOMPRESSED DO:0 FPO:223924 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max: 9.9, num_nulls: 0] - assert!(&page_indexes[0][6].is_sorted()); - if let Index::FLOAT(index) = &page_indexes[0][6] { + assert!(&column_index[0][6].is_sorted()); + if let Index::FLOAT(index) = &column_index[0][6] { check_native_page_index( index, 325, @@ -1493,8 +1492,8 @@ mod tests { unreachable!() }; //col8->double_col: DOUBLE UNCOMPRESSED DO:0 FPO:261249 SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max: 90.89999999999999, num_nulls: 0] - assert!(!&page_indexes[0][7].is_sorted()); - if let Index::DOUBLE(index) = &page_indexes[0][7] { + assert!(!&column_index[0][7].is_sorted()); + if let Index::DOUBLE(index) = &column_index[0][7] { check_native_page_index( index, 528, @@ -1506,8 +1505,8 @@ mod tests { unreachable!() }; //col9->date_string_col: BINARY UNCOMPRESSED DO:0 FPO:332847 SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 01/01/09, max: 12/31/10, num_nulls: 0] - assert!(!&page_indexes[0][8].is_sorted()); - if let Index::BYTE_ARRAY(index) = &page_indexes[0][8] { + assert!(!&column_index[0][8].is_sorted()); + if let Index::BYTE_ARRAY(index) = &column_index[0][8] { check_native_page_index( index, 974, @@ -1519,8 +1518,8 @@ mod tests { unreachable!() }; //col10->string_col: BINARY UNCOMPRESSED DO:0 FPO:444795 SZ:45298/45298/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9, 
num_nulls: 0] - assert!(&page_indexes[0][9].is_sorted()); - if let Index::BYTE_ARRAY(index) = &page_indexes[0][9] { + assert!(&column_index[0][9].is_sorted()); + if let Index::BYTE_ARRAY(index) = &column_index[0][9] { check_native_page_index( index, 352, @@ -1533,15 +1532,15 @@ mod tests { }; //col11->timestamp_col: INT96 UNCOMPRESSED DO:0 FPO:490093 SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 0, min/max not defined] //Notice: min_max values for each page for this col not exits. - assert!(!&page_indexes[0][10].is_sorted()); - if let Index::NONE = &page_indexes[0][10] { + assert!(!&column_index[0][10].is_sorted()); + if let Index::NONE = &column_index[0][10] { assert_eq!(row_group_offset_indexes[10].len(), 974); } else { unreachable!() }; //col12->year: INT32 UNCOMPRESSED DO:0 FPO:602041 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 2009, max: 2010, num_nulls: 0] - assert!(&page_indexes[0][11].is_sorted()); - if let Index::INT32(index) = &page_indexes[0][11] { + assert!(&column_index[0][11].is_sorted()); + if let Index::INT32(index) = &column_index[0][11] { check_native_page_index( index, 325, @@ -1553,8 +1552,8 @@ mod tests { unreachable!() }; //col13->month: INT32 UNCOMPRESSED DO:0 FPO:639366 SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 1, max: 12, num_nulls: 0] - assert!(!&page_indexes[0][12].is_sorted()); - if let Index::INT32(index) = &page_indexes[0][12] { + assert!(!&column_index[0][12].is_sorted()); + if let Index::INT32(index) = &column_index[0][12] { check_native_page_index( index, 325, @@ -1768,7 +1767,7 @@ mod tests { let b = Bytes::from(out); let options = ReadOptionsBuilder::new().with_page_index().build(); let reader = SerializedFileReader::new_with_options(b, options).unwrap(); - let index = reader.metadata().page_indexes().unwrap(); + let index = reader.metadata().column_index().unwrap(); // 1 row group assert_eq!(index.len(), 1); diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index 0c66fcd1081d..4bf649f245b0 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -78,10 +78,14 @@ fn do_test(test: LayoutTest) { fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { assert_eq!(meta.row_groups().len(), layout.row_groups.len()); - for (row_group, row_group_layout) in meta.row_groups().iter().zip(&layout.row_groups) - { + let iter = meta + .row_groups() + .iter() + .zip(&layout.row_groups) + .zip(meta.offset_index().unwrap()); + + for ((row_group, row_group_layout), offset_index) in iter { // Check against offset index - let offset_index = row_group.page_offset_index().unwrap(); assert_eq!(offset_index.len(), row_group_layout.columns.len()); for (column_index, column_layout) in From 547512172737004321ff5a02145882e15a52df0d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 27 Apr 2023 08:32:53 -0400 Subject: [PATCH 0860/1411] Don't hardcode port in FlightSQL tests (#4145) * Don't hardcode port in FlightSQL tests * Remove sleep --- arrow-flight/examples/flight_sql_server.rs | 47 +++++++++++++--------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 675692aba6f9..43154420d424 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -546,22 +546,31 @@ impl ProstMessageExt for FetchResults { #[cfg(test)] 
mod tests { use super::*; - use futures::TryStreamExt; + use futures::future::BoxFuture; + use futures::{FutureExt, TryStreamExt}; use std::fs; use std::future::Future; + use std::net::SocketAddr; use std::time::Duration; use tempfile::NamedTempFile; - use tokio::net::{UnixListener, UnixStream}; - use tokio::time::sleep; + use tokio::net::{TcpListener, UnixListener, UnixStream}; use tokio_stream::wrappers::UnixListenerStream; use tonic::transport::{Channel, ClientTlsConfig}; use arrow_cast::pretty::pretty_format_batches; use arrow_flight::sql::client::FlightSqlServiceClient; use arrow_flight::utils::flight_data_to_batches; + use tonic::transport::server::TcpIncoming; use tonic::transport::{Certificate, Endpoint}; use tower::service_fn; + async fn bind_tcp() -> (TcpIncoming, SocketAddr) { + let listener = TcpListener::bind("0.0.0.0:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let incoming = TcpIncoming::from_listener(listener, true, None).unwrap(); + (incoming, addr) + } + async fn client_with_uds(path: String) -> FlightSqlServiceClient { let connector = service_fn(move |_| UnixStream::connect(path.clone())); let channel = Endpoint::try_from("http://example.com") @@ -572,7 +581,10 @@ mod tests { FlightSqlServiceClient::new(channel) } - async fn create_https_server() -> Result<(), tonic::transport::Error> { + type ServeFut = BoxFuture<'static, Result<(), tonic::transport::Error>>; + + async fn create_https_server( + ) -> Result<(ServeFut, SocketAddr), tonic::transport::Error> { let cert = std::fs::read_to_string("examples/data/server.pem").unwrap(); let key = std::fs::read_to_string("examples/data/server.key").unwrap(); let client_ca = std::fs::read_to_string("examples/data/client_ca.pem").unwrap(); @@ -581,20 +593,22 @@ mod tests { .identity(Identity::from_pem(&cert, &key)) .client_ca_root(Certificate::from_pem(&client_ca)); - let addr = "0.0.0.0:50051".parse().unwrap(); + let (incoming, addr) = bind_tcp().await; let svc = FlightServiceServer::new(FlightSqlServiceImpl {}); - Server::builder() + let serve = Server::builder() .tls_config(tls_config) .unwrap() .add_service(svc) - .serve(addr) - .await + .serve_with_incoming(incoming) + .boxed(); + + Ok((serve, addr)) } - fn endpoint(addr: String) -> Result { - let endpoint = Endpoint::new(addr) + fn endpoint(uri: String) -> Result { + let endpoint = Endpoint::new(uri) .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? .connect_timeout(Duration::from_secs(20)) .timeout(Duration::from_secs(20)) @@ -609,11 +623,8 @@ mod tests { #[tokio::test] async fn test_select_https() { - tokio::spawn(async { - create_https_server().await.unwrap(); - }); - - sleep(Duration::from_millis(2000)).await; + let (serve, addr) = create_https_server().await.unwrap(); + let uri = format!("https://{}:{}", addr.ip(), addr.port()); let request_future = async { let cert = std::fs::read_to_string("examples/data/client1.pem").unwrap(); @@ -624,10 +635,7 @@ mod tests { .domain_name("localhost") .ca_certificate(Certificate::from_pem(&server_ca)) .identity(Identity::from_pem(cert, key)); - let endpoint = endpoint(String::from("https://127.0.0.1:50051")) - .unwrap() - .tls_config(tls_config) - .unwrap(); + let endpoint = endpoint(uri).unwrap().tls_config(tls_config).unwrap(); let channel = endpoint.connect().await.unwrap(); let mut client = FlightSqlServiceClient::new(channel); let token = client.handshake("admin", "password").await.unwrap(); @@ -652,6 +660,7 @@ mod tests { }; tokio::select! 
{ + _ = serve => panic!("server finished"), _ = request_future => println!("Client finished!"), } } From 9fa8125fbe14a3a85b4995617945bda51ee3b055 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 27 Apr 2023 08:33:12 -0400 Subject: [PATCH 0861/1411] Cleanup CSV schema inference (#4129) (#4130) (#4133) * Cleanup CSV schema inference (#4129) (#4130) * Update tests * Update parquet-fromcsv --- arrow-csv/src/reader/mod.rs | 979 ++++++++++-------------- arrow-csv/src/writer.rs | 18 +- arrow/benches/csv_reader.rs | 3 +- arrow/examples/read_csv.rs | 6 +- arrow/examples/read_csv_infer_schema.rs | 13 +- parquet/src/bin/parquet-fromcsv.rs | 3 +- 6 files changed, 405 insertions(+), 617 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 5bfcbc6452fb..74294f42e8b2 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -26,7 +26,7 @@ //! //! ``` //! # use arrow_schema::*; -//! # use arrow_csv::Reader; +//! # use arrow_csv::{Reader, ReaderBuilder}; //! # use std::fs::File; //! # use std::sync::Arc; //! @@ -38,7 +38,7 @@ //! //! let file = File::open("test/data/uk_cities.csv").unwrap(); //! -//! let mut csv = Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None); +//! let mut csv = ReaderBuilder::new(Arc::new(schema)).build(file).unwrap(); //! let batch = csv.next().unwrap().unwrap(); //! ``` //! @@ -131,8 +131,9 @@ use arrow_array::*; use arrow_cast::parse::{parse_decimal, string_to_datetime, Parser}; use arrow_schema::*; use chrono::{TimeZone, Utc}; +use csv::StringRecord; use lazy_static::lazy_static; -use regex::{Regex, RegexSet}; +use regex::RegexSet; use std::fmt; use std::fs::File; use std::io::{BufRead, BufReader as StdBufReader, Read, Seek, SeekFrom}; @@ -141,7 +142,6 @@ use std::sync::Arc; use crate::map_csv_error; use crate::reader::records::{RecordDecoder, StringRecords}; use arrow_array::timezone::Tz; -use csv::StringRecord; lazy_static! 
{ /// Order should match [`InferredDataType`] @@ -194,32 +194,150 @@ impl InferredDataType { } /// Updates the [`InferredDataType`] with the given string - fn update(&mut self, string: &str, datetime_re: Option<&Regex>) { + fn update(&mut self, string: &str) { self.packed |= if string.starts_with('"') { 1 << 8 // Utf8 } else if let Some(m) = REGEX_SET.matches(string).into_iter().next() { 1 << m } else { - match datetime_re { - // Timestamp(Nanosecond) - Some(d) if d.is_match(string) => 1 << 7, - _ => 1 << 8, // Utf8 - } + 1 << 8 // Utf8 } } } -/// This is a collection of options for csv reader when the builder pattern cannot be used -/// and the parameters need to be passed around -#[derive(Debug, Default, Clone)] -struct ReaderOptions { +/// The format specification for the CSV file +#[derive(Debug, Clone, Default)] +pub struct Format { has_header: bool, delimiter: Option, escape: Option, quote: Option, terminator: Option, - max_read_records: Option, - datetime_re: Option, +} + +impl Format { + pub fn with_header(mut self, has_header: bool) -> Self { + self.has_header = has_header; + self + } + + pub fn with_delimiter(mut self, delimiter: u8) -> Self { + self.delimiter = Some(delimiter); + self + } + + pub fn with_escape(mut self, escape: u8) -> Self { + self.escape = Some(escape); + self + } + + pub fn with_quote(mut self, quote: u8) -> Self { + self.quote = Some(quote); + self + } + + pub fn with_terminator(mut self, terminator: u8) -> Self { + self.terminator = Some(terminator); + self + } + + /// Infer schema of CSV records from the provided `reader` + /// + /// If `max_records` is `None`, all records will be read, otherwise up to `max_records` + /// records are read to infer the schema + /// + /// Returns inferred schema and number of records read + pub fn infer_schema( + &self, + reader: R, + max_records: Option, + ) -> Result<(Schema, usize), ArrowError> { + let mut csv_reader = self.build_reader(reader); + + // get or create header names + // when has_header is false, creates default column names with column_ prefix + let headers: Vec = if self.has_header { + let headers = &csv_reader.headers().map_err(map_csv_error)?.clone(); + headers.iter().map(|s| s.to_string()).collect() + } else { + let first_record_count = &csv_reader.headers().map_err(map_csv_error)?.len(); + (0..*first_record_count) + .map(|i| format!("column_{}", i + 1)) + .collect() + }; + + let header_length = headers.len(); + // keep track of inferred field types + let mut column_types: Vec = + vec![Default::default(); header_length]; + + let mut records_count = 0; + + let mut record = StringRecord::new(); + let max_records = max_records.unwrap_or(usize::MAX); + while records_count < max_records { + if !csv_reader.read_record(&mut record).map_err(map_csv_error)? 
{ + break; + } + records_count += 1; + + // Note since we may be looking at a sample of the data, we make the safe assumption that + // they could be nullable + for (i, column_type) in + column_types.iter_mut().enumerate().take(header_length) + { + if let Some(string) = record.get(i) { + if !string.is_empty() { + column_type.update(string) + } + } + } + } + + // build schema from inference results + let fields: Fields = column_types + .iter() + .zip(&headers) + .map(|(inferred, field_name)| Field::new(field_name, inferred.get(), true)) + .collect(); + + Ok((Schema::new(fields), records_count)) + } + + /// Build a [`csv::Reader`] for this [`Format`] + fn build_reader(&self, reader: R) -> csv::Reader { + let mut builder = csv::ReaderBuilder::new(); + builder.has_headers(self.has_header); + + if let Some(c) = self.delimiter { + builder.delimiter(c); + } + builder.escape(self.escape); + if let Some(c) = self.quote { + builder.quote(c); + } + if let Some(t) = self.terminator { + builder.terminator(csv::Terminator::Any(t)); + } + builder.from_reader(reader) + } + + /// Build a [`csv_core::Reader`] for this [`Format`] + fn build_parser(&self) -> csv_core::Reader { + let mut builder = csv_core::ReaderBuilder::new(); + builder.escape(self.escape); + + if let Some(c) = self.delimiter { + builder.delimiter(c); + } + if let Some(c) = self.quote { + builder.quote(c); + } + if let Some(t) = self.terminator { + builder.terminator(csv_core::Terminator::Any(t)); + } + builder.build() + } } /// Infer the schema of a CSV file by reading through the first n records of the file, @@ -231,34 +349,19 @@ struct ReaderOptions { /// reader cursor offset. /// /// The inferred schema will always have each field set as nullable. +#[deprecated(note = "Use Format::infer_schema")] +#[allow(deprecated)] pub fn infer_file_schema( - reader: R, + mut reader: R, delimiter: u8, max_read_records: Option, has_header: bool, -) -> Result<(Schema, usize), ArrowError> { - let roptions = ReaderOptions { - delimiter: Some(delimiter), - max_read_records, - has_header, - ..Default::default() - }; - - infer_file_schema_with_csv_options(reader, roptions) -} - -fn infer_file_schema_with_csv_options( - mut reader: R, - roptions: ReaderOptions, ) -> Result<(Schema, usize), ArrowError> { let saved_offset = reader.stream_position()?; - - let (schema, records_count) = - infer_reader_schema_with_csv_options(&mut reader, roptions)?; + let r = infer_reader_schema(&mut reader, delimiter, max_read_records, has_header)?; // return the reader seek back to the start reader.seek(SeekFrom::Start(saved_offset))?; - - Ok((schema, records_count)) + Ok(r) } /// Infer schema of CSV records provided by struct that implements `Read` trait. @@ -267,104 +370,19 @@ fn infer_file_schema_with_csv_options( /// not set, all records are read to infer the schema. /// /// Return inferred schema and number of records used for inference. 
+#[deprecated(note = "Use Format::infer_schema")] pub fn infer_reader_schema( reader: R, delimiter: u8, max_read_records: Option, has_header: bool, ) -> Result<(Schema, usize), ArrowError> { - let roptions = ReaderOptions { + let format = Format { delimiter: Some(delimiter), - max_read_records, has_header, ..Default::default() }; - infer_reader_schema_with_csv_options(reader, roptions) -} - -/// Creates a `csv::Reader` -fn build_csv_reader( - reader: R, - has_header: bool, - delimiter: Option, - escape: Option, - quote: Option, - terminator: Option, -) -> csv::Reader { - let mut reader_builder = csv::ReaderBuilder::new(); - reader_builder.has_headers(has_header); - - if let Some(c) = delimiter { - reader_builder.delimiter(c); - } - reader_builder.escape(escape); - if let Some(c) = quote { - reader_builder.quote(c); - } - if let Some(t) = terminator { - reader_builder.terminator(csv::Terminator::Any(t)); - } - reader_builder.from_reader(reader) -} - -fn infer_reader_schema_with_csv_options( - reader: R, - roptions: ReaderOptions, -) -> Result<(Schema, usize), ArrowError> { - let mut csv_reader = build_csv_reader( - reader, - roptions.has_header, - roptions.delimiter, - roptions.escape, - roptions.quote, - roptions.terminator, - ); - - // get or create header names - // when has_header is false, creates default column names with column_ prefix - let headers: Vec = if roptions.has_header { - let headers = &csv_reader.headers().map_err(map_csv_error)?.clone(); - headers.iter().map(|s| s.to_string()).collect() - } else { - let first_record_count = &csv_reader.headers().map_err(map_csv_error)?.len(); - (0..*first_record_count) - .map(|i| format!("column_{}", i + 1)) - .collect() - }; - - let header_length = headers.len(); - // keep track of inferred field types - let mut column_types: Vec = vec![Default::default(); header_length]; - - let mut records_count = 0; - - let mut record = StringRecord::new(); - let max_records = roptions.max_read_records.unwrap_or(usize::MAX); - while records_count < max_records { - if !csv_reader.read_record(&mut record).map_err(map_csv_error)? { - break; - } - records_count += 1; - - // Note since we may be looking at a sample of the data, we make the safe assumption that - // they could be nullable - for (i, column_type) in column_types.iter_mut().enumerate().take(header_length) { - if let Some(string) = record.get(i) { - if !string.is_empty() { - column_type.update(string, roptions.datetime_re.as_ref()) - } - } - } - } - - // build schema from inference results - let fields: Fields = column_types - .iter() - .zip(&headers) - .map(|(inferred, field_name)| Field::new(field_name, inferred.get(), true)) - .collect(); - - Ok((Schema::new(fields), records_count)) + format.infer_schema(reader, max_read_records) } /// Infer schema from a list of CSV files by reading through first n records @@ -381,14 +399,15 @@ pub fn infer_schema_from_files( ) -> Result { let mut schemas = vec![]; let mut records_to_read = max_read_records.unwrap_or(usize::MAX); + let format = Format { + delimiter: Some(delimiter), + has_header, + ..Default::default() + }; for fname in files.iter() { - let (schema, records_read) = infer_file_schema( - &mut File::open(fname)?, - delimiter, - Some(records_to_read), - has_header, - )?; + let f = File::open(fname)?; + let (schema, records_read) = format.infer_schema(f, Some(records_to_read))?; if records_read == 0 { continue; } @@ -429,46 +448,6 @@ where } impl Reader { - /// Create a new CsvReader from any value that implements the `Read` trait. 
- /// - /// If reading a `File` or an input that supports `std::io::Read` and `std::io::Seek`; - /// you can customise the Reader, such as to enable schema inference, use - /// `ReaderBuilder`. - #[allow(clippy::too_many_arguments)] - pub fn new( - reader: R, - schema: SchemaRef, - has_header: bool, - delimiter: Option, - batch_size: usize, - bounds: Bounds, - projection: Option>, - datetime_format: Option, - ) -> Self { - let mut builder = ReaderBuilder::new() - .has_header(has_header) - .with_batch_size(batch_size) - .with_schema(schema); - - if let Some(delimiter) = delimiter { - builder = builder.with_delimiter(delimiter); - } - if let Some((start, end)) = bounds { - builder = builder.with_bounds(start, end); - } - if let Some(projection) = projection { - builder = builder.with_projection(projection) - } - if let Some(format) = datetime_format { - builder = builder.with_datetime_format(format) - } - - Self { - decoder: builder.build_decoder(), - reader: StdBufReader::new(reader), - } - } - /// Returns the schema of the reader, useful for getting the schema without reading /// record batches pub fn schema(&self) -> SchemaRef { @@ -481,34 +460,6 @@ impl Reader { None => self.decoder.schema.clone(), } } - - /// Create a new CsvReader from a Reader - /// - /// This constructor allows you more flexibility in what records are processed by the - /// csv reader. - #[allow(clippy::too_many_arguments)] - #[deprecated(note = "Use Reader::new or ReaderBuilder")] - pub fn from_reader( - reader: R, - schema: SchemaRef, - has_header: bool, - delimiter: Option, - batch_size: usize, - bounds: Bounds, - projection: Option>, - datetime_format: Option, - ) -> Self { - Self::new( - reader, - schema, - has_header, - delimiter, - batch_size, - bounds, - projection, - datetime_format, - ) - } } impl BufReader { @@ -558,8 +509,7 @@ impl Iterator for BufReader { /// schema: SchemaRef, /// batch_size: usize, /// ) -> Result>, ArrowError> { -/// let mut decoder = ReaderBuilder::new() -/// .with_schema(schema) +/// let mut decoder = ReaderBuilder::new(schema) /// .with_batch_size(batch_size) /// .build_decoder(); /// @@ -601,11 +551,6 @@ pub struct Decoder { /// A decoder for [`StringRecords`] record_decoder: RecordDecoder, - - /// datetime format used to parse datetime values, (format understood by chrono) - /// - /// For format refer to [chrono docs](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html) - datetime_format: Option, } impl Decoder { @@ -652,7 +597,6 @@ impl Decoder { Some(self.schema.metadata.clone()), self.projection.as_ref(), self.line_number, - self.datetime_format.as_deref(), )?; self.line_number += rows.len(); Ok(Some(batch)) @@ -671,7 +615,6 @@ fn parse( metadata: Option>, projection: Option<&Vec>, line_number: usize, - datetime_format: Option<&str>, ) -> Result { let projection: Vec = match projection { Some(v) => v.clone(), @@ -703,63 +646,52 @@ fn parse( *scale, ) } - DataType::Int8 => { - build_primitive_array::(line_number, rows, i, None) - } + DataType::Int8 => build_primitive_array::(line_number, rows, i), DataType::Int16 => { - build_primitive_array::(line_number, rows, i, None) + build_primitive_array::(line_number, rows, i) } DataType::Int32 => { - build_primitive_array::(line_number, rows, i, None) + build_primitive_array::(line_number, rows, i) } DataType::Int64 => { - build_primitive_array::(line_number, rows, i, None) + build_primitive_array::(line_number, rows, i) } DataType::UInt8 => { - build_primitive_array::(line_number, rows, i, None) + 
build_primitive_array::(line_number, rows, i) } DataType::UInt16 => { - build_primitive_array::(line_number, rows, i, None) + build_primitive_array::(line_number, rows, i) } DataType::UInt32 => { - build_primitive_array::(line_number, rows, i, None) + build_primitive_array::(line_number, rows, i) } DataType::UInt64 => { - build_primitive_array::(line_number, rows, i, None) + build_primitive_array::(line_number, rows, i) } DataType::Float32 => { - build_primitive_array::(line_number, rows, i, None) + build_primitive_array::(line_number, rows, i) } DataType::Float64 => { - build_primitive_array::(line_number, rows, i, None) + build_primitive_array::(line_number, rows, i) } DataType::Date32 => { - build_primitive_array::(line_number, rows, i, None) + build_primitive_array::(line_number, rows, i) + } + DataType::Date64 => { + build_primitive_array::(line_number, rows, i) } - DataType::Date64 => build_primitive_array::( - line_number, - rows, - i, - datetime_format, - ), DataType::Time32(TimeUnit::Second) => { - build_primitive_array::(line_number, rows, i, None) + build_primitive_array::(line_number, rows, i) + } + DataType::Time32(TimeUnit::Millisecond) => { + build_primitive_array::(line_number, rows, i) + } + DataType::Time64(TimeUnit::Microsecond) => { + build_primitive_array::(line_number, rows, i) + } + DataType::Time64(TimeUnit::Nanosecond) => { + build_primitive_array::(line_number, rows, i) } - DataType::Time32(TimeUnit::Millisecond) => build_primitive_array::< - Time32MillisecondType, - >( - line_number, rows, i, None - ), - DataType::Time64(TimeUnit::Microsecond) => build_primitive_array::< - Time64MicrosecondType, - >( - line_number, rows, i, None - ), - DataType::Time64(TimeUnit::Nanosecond) => build_primitive_array::< - Time64NanosecondType, - >( - line_number, rows, i, None - ), DataType::Timestamp(TimeUnit::Second, tz) => { build_timestamp_array::( line_number, @@ -871,13 +803,6 @@ fn parse( ) }) } -fn parse_item(string: &str) -> Option { - T::parse(string) -} - -fn parse_formatted(string: &str, format: &str) -> Option { - T::parse_formatted(string, format) -} fn parse_bool(string: &str) -> Option { if string.eq_ignore_ascii_case("false") { @@ -928,7 +853,6 @@ fn build_primitive_array( line_number: usize, rows: &StringRecords<'_>, col_idx: usize, - format: Option<&str>, ) -> Result { rows.iter() .enumerate() @@ -938,11 +862,7 @@ fn build_primitive_array( return Ok(None); } - let parsed = match format { - Some(format) => parse_formatted::(s, format), - _ => parse_item::(s), - }; - match parsed { + match T::parse(s) { Some(e) => Ok(Some(e)), None => Err(ArrowError::ParseError(format!( // TODO: we should surface the underlying error here. @@ -1037,28 +957,10 @@ fn build_boolean_array( /// CSV file reader builder #[derive(Debug)] pub struct ReaderBuilder { - /// Optional schema for the CSV file - /// - /// If the schema is not supplied, the reader will try to infer the schema - /// based on the CSV structure. - schema: Option, - /// Whether the file has headers or not - /// - /// If schema inference is run on a file with no headers, default column names - /// are created. - has_header: bool, - /// An optional column delimiter. Defaults to `b','` - delimiter: Option, - /// An optional escape character. Defaults None - escape: Option, - /// An optional quote character. Defaults b'\"' - quote: Option, - /// An optional record terminator. 
Defaults CRLF - terminator: Option, - /// Optional maximum number of records to read during schema inference - /// - /// If a number is not provided, all the records are read. - max_records: Option, + /// Schema of the CSV file + schema: SchemaRef, + /// Format of the CSV file + format: Format, /// Batch size (number of records to load each time) /// /// The default batch size when using the `ReaderBuilder` is 1024 records @@ -1067,29 +969,6 @@ pub struct ReaderBuilder { bounds: Bounds, /// Optional projection for which columns to load (zero-based column indices) projection: Option>, - /// DateTime format to be used while trying to infer datetime format - datetime_re: Option, - /// DateTime format to be used while parsing datetime format - datetime_format: Option, -} - -impl Default for ReaderBuilder { - fn default() -> Self { - Self { - schema: None, - has_header: false, - delimiter: None, - escape: None, - quote: None, - terminator: None, - max_records: None, - batch_size: 1024, - bounds: None, - projection: None, - datetime_re: None, - datetime_format: None, - } - } } impl ReaderBuilder { @@ -1100,79 +979,60 @@ impl ReaderBuilder { /// # Example /// /// ``` - /// use arrow_csv::{Reader, ReaderBuilder}; - /// use std::fs::File; - /// - /// fn example() -> Reader { - /// let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); - /// - /// // create a builder, inferring the schema with the first 100 records - /// let builder = ReaderBuilder::new().infer_schema(Some(100)); - /// - /// let reader = builder.build(file).unwrap(); + /// # use arrow_csv::{Reader, ReaderBuilder}; + /// # use std::fs::File; + /// # use std::io::Seek; + /// # use std::sync::Arc; + /// # use arrow_csv::reader::Format; + /// # + /// let mut file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); + /// // Infer the schema with the first 100 records + /// let (schema, _) = Format::default().infer_schema(&mut file, Some(100)).unwrap(); + /// file.rewind().unwrap(); /// - /// reader - /// } + /// // create a builder + /// ReaderBuilder::new(Arc::new(schema)).build(file).unwrap(); /// ``` - pub fn new() -> ReaderBuilder { - ReaderBuilder::default() - } - - /// Set the CSV file's schema - pub fn with_schema(mut self, schema: SchemaRef) -> Self { - self.schema = Some(schema); - self + pub fn new(schema: SchemaRef) -> ReaderBuilder { + Self { + schema, + format: Format::default(), + batch_size: 1024, + bounds: None, + projection: None, + } } /// Set whether the CSV file has headers pub fn has_header(mut self, has_header: bool) -> Self { - self.has_header = has_header; + self.format.has_header = has_header; self } - /// Set the datetime regex used to parse the string to Date64Type - /// this regex is used while inferring schema - pub fn with_datetime_re(mut self, datetime_re: Regex) -> Self { - self.datetime_re = Some(datetime_re); - self - } - - /// Set the datetime format used to parse the string to Date64Type - /// this format is used while when the schema wants to parse Date64Type. 
- /// - /// For format refer to [chrono docs](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html) - /// - pub fn with_datetime_format(mut self, datetime_format: String) -> Self { - self.datetime_format = Some(datetime_format); + /// Overrides the [`Format`] of this [`ReaderBuilder] + pub fn with_format(mut self, format: Format) -> Self { + self.format = format; self } /// Set the CSV file's column delimiter as a byte character pub fn with_delimiter(mut self, delimiter: u8) -> Self { - self.delimiter = Some(delimiter); + self.format.delimiter = Some(delimiter); self } pub fn with_escape(mut self, escape: u8) -> Self { - self.escape = Some(escape); + self.format.escape = Some(escape); self } pub fn with_quote(mut self, quote: u8) -> Self { - self.quote = Some(quote); + self.format.quote = Some(quote); self } pub fn with_terminator(mut self, terminator: u8) -> Self { - self.terminator = Some(terminator); - self - } - - /// Set the CSV reader to infer the schema of the file - pub fn infer_schema(mut self, max_records: Option) -> Self { - // remove any schema that is set - self.schema = None; - self.max_records = max_records; + self.format.terminator = Some(terminator); self } @@ -1199,32 +1059,15 @@ impl ReaderBuilder { /// /// If `R: BufRead` consider using [`Self::build_buffered`] to avoid unnecessary additional /// buffering, as internally this method wraps `reader` in [`std::io::BufReader`] - pub fn build(self, reader: R) -> Result, ArrowError> { + pub fn build(self, reader: R) -> Result, ArrowError> { self.build_buffered(StdBufReader::new(reader)) } /// Create a new `BufReader` from a buffered reader - pub fn build_buffered( - mut self, - mut reader: R, + pub fn build_buffered( + self, + reader: R, ) -> Result, ArrowError> { - // check if schema should be inferred - if self.schema.is_none() { - let delimiter = self.delimiter.unwrap_or(b','); - let roptions = ReaderOptions { - delimiter: Some(delimiter), - max_read_records: self.max_records, - has_header: self.has_header, - escape: self.escape, - quote: self.quote, - terminator: self.terminator, - datetime_re: self.datetime_re.take(), - }; - let (inferred_schema, _) = - infer_file_schema_with_csv_options(&mut reader, roptions)?; - self.schema = Some(Arc::new(inferred_schema)) - } - Ok(BufReader { reader, decoder: self.build_decoder(), @@ -1232,28 +1075,11 @@ impl ReaderBuilder { } /// Builds a decoder that can be used to decode CSV from an arbitrary byte stream - /// - /// # Panics - /// - /// This method panics if no schema provided pub fn build_decoder(self) -> Decoder { - let schema = self.schema.expect("schema should be provided"); - let mut reader_builder = csv_core::ReaderBuilder::new(); - reader_builder.escape(self.escape); - - if let Some(c) = self.delimiter { - reader_builder.delimiter(c); - } - if let Some(c) = self.quote { - reader_builder.quote(c); - } - if let Some(t) = self.terminator { - reader_builder.terminator(csv_core::Terminator::Any(t)); - } - let delimiter = reader_builder.build(); - let record_decoder = RecordDecoder::new(delimiter, schema.fields().len()); + let delimiter = self.format.build_parser(); + let record_decoder = RecordDecoder::new(delimiter, self.schema.fields().len()); - let header = self.has_header as usize; + let header = self.format.has_header as usize; let (start, end) = match self.bounds { Some((start, end)) => (start + header, end + header), @@ -1261,13 +1087,12 @@ impl ReaderBuilder { }; Decoder { - schema, + schema: self.schema, to_skip: start, record_decoder, line_number: start, 
end, projection: self.projection, - datetime_format: self.datetime_format, batch_size: self.batch_size, } } @@ -1284,74 +1109,46 @@ mod tests { #[test] fn test_csv() { - for format in [None, Some("%Y-%m-%dT%H:%M:%S%.f%:z".to_string())] { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - let mut csv = Reader::new( - file, - Arc::new(schema.clone()), - false, - None, - 1024, - None, - None, - format, - ); - assert_eq!(Arc::new(schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(57.653484, lat.value(0)); + let schema = Arc::new(Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ])); - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); + let file = File::open("test/data/uk_cities.csv").unwrap(); + let mut csv = ReaderBuilder::new(schema.clone()).build(file).unwrap(); + assert_eq!(schema, csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); - } + // access data from a primitive array + let lat = batch.column(1).as_primitive::(); + assert_eq!(57.653484, lat.value(0)); + + // access data from a string array (ListArray) + let city = batch.column(0).as_string::(); + + assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); } #[test] fn test_csv_schema_metadata() { let mut metadata = std::collections::HashMap::new(); metadata.insert("foo".to_owned(), "bar".to_owned()); - let schema = Schema::new_with_metadata( + let schema = Arc::new(Schema::new_with_metadata( vec![ Field::new("city", DataType::Utf8, false), Field::new("lat", DataType::Float64, false), Field::new("lng", DataType::Float64, false), ], metadata.clone(), - ); + )); let file = File::open("test/data/uk_cities.csv").unwrap(); - let mut csv = Reader::new( - file, - Arc::new(schema.clone()), - false, - None, - 1024, - None, - None, - None, - ); - assert_eq!(Arc::new(schema), csv.schema()); + let mut csv = ReaderBuilder::new(schema.clone()).build(file).unwrap(); + assert_eq!(schema, csv.schema()); let batch = csv.next().unwrap().unwrap(); assert_eq!(37, batch.num_rows()); assert_eq!(3, batch.num_columns()); @@ -1361,16 +1158,15 @@ mod tests { #[test] fn test_csv_reader_with_decimal() { - let schema = Schema::new(vec![ + let schema = Arc::new(Schema::new(vec![ Field::new("city", DataType::Utf8, false), Field::new("lat", DataType::Decimal128(38, 6), false), Field::new("lng", DataType::Decimal256(76, 6), false), - ]); + ])); let file = File::open("test/data/decimal_test.csv").unwrap(); - let mut csv = - Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None); + let mut csv = ReaderBuilder::new(schema).build(file).unwrap(); let batch = csv.next().unwrap().unwrap(); // access data from a primitive array let lat = batch @@ -1422,16 +1218,10 @@ mod tests { let both_files = file_with_headers .chain(Cursor::new("\n".to_string())) .chain(file_without_headers); - let mut csv = Reader::new( - both_files, - 
Arc::new(schema), - true, - None, - 1024, - None, - None, - None, - ); + let mut csv = ReaderBuilder::new(Arc::new(schema)) + .has_header(true) + .build(both_files) + .unwrap(); let batch = csv.next().unwrap().unwrap(); assert_eq!(74, batch.num_rows()); assert_eq!(3, batch.num_columns()); @@ -1439,9 +1229,15 @@ mod tests { #[test] fn test_csv_with_schema_inference() { - let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); + let mut file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); - let builder = ReaderBuilder::new().has_header(true).infer_schema(None); + let (schema, _) = Format::default() + .with_header(true) + .infer_schema(&mut file, None) + .unwrap(); + + file.rewind().unwrap(); + let builder = ReaderBuilder::new(Arc::new(schema)).has_header(true); let mut csv = builder.build(file).unwrap(); let expected_schema = Schema::new(vec![ @@ -1474,11 +1270,12 @@ mod tests { #[test] fn test_csv_with_schema_inference_no_headers() { - let file = File::open("test/data/uk_cities.csv").unwrap(); + let mut file = File::open("test/data/uk_cities.csv").unwrap(); - let builder = ReaderBuilder::new().infer_schema(None); + let (schema, _) = Format::default().infer_schema(&mut file, None).unwrap(); + file.rewind().unwrap(); - let mut csv = builder.build(file).unwrap(); + let mut csv = ReaderBuilder::new(Arc::new(schema)).build(file).unwrap(); // csv field names should be 'column_{number}' let schema = csv.schema(); @@ -1512,10 +1309,15 @@ mod tests { #[test] fn test_csv_builder_with_bounds() { - let file = File::open("test/data/uk_cities.csv").unwrap(); + let mut file = File::open("test/data/uk_cities.csv").unwrap(); // Set the bounds to the lines 0, 1 and 2. - let mut csv = ReaderBuilder::new().with_bounds(0, 2).build(file).unwrap(); + let (schema, _) = Format::default().infer_schema(&mut file, None).unwrap(); + file.rewind().unwrap(); + let mut csv = ReaderBuilder::new(Arc::new(schema)) + .with_bounds(0, 2) + .build(file) + .unwrap(); let batch = csv.next().unwrap().unwrap(); // access data from a string array (ListArray) @@ -1536,24 +1338,19 @@ mod tests { #[test] fn test_csv_with_projection() { - let schema = Schema::new(vec![ + let schema = Arc::new(Schema::new(vec![ Field::new("city", DataType::Utf8, false), Field::new("lat", DataType::Float64, false), Field::new("lng", DataType::Float64, false), - ]); + ])); let file = File::open("test/data/uk_cities.csv").unwrap(); - let mut csv = Reader::new( - file, - Arc::new(schema), - false, - None, - 1024, - None, - Some(vec![0, 1]), - None, - ); + let mut csv = ReaderBuilder::new(schema) + .with_projection(vec![0, 1]) + .build(file) + .unwrap(); + let projected_schema = Arc::new(Schema::new(vec![ Field::new("city", DataType::Utf8, false), Field::new("lat", DataType::Float64, false), @@ -1567,24 +1364,19 @@ mod tests { #[test] fn test_csv_with_dictionary() { - let schema = Schema::new(vec![ + let schema = Arc::new(Schema::new(vec![ Field::new_dictionary("city", DataType::Int32, DataType::Utf8, false), Field::new("lat", DataType::Float64, false), Field::new("lng", DataType::Float64, false), - ]); + ])); let file = File::open("test/data/uk_cities.csv").unwrap(); - let mut csv = Reader::new( - file, - Arc::new(schema), - false, - None, - 1024, - None, - Some(vec![0, 1]), - None, - ); + let mut csv = ReaderBuilder::new(schema) + .with_projection(vec![0, 1]) + .build(file) + .unwrap(); + let projected_schema = Arc::new(Schema::new(vec![ Field::new_dictionary("city", DataType::Int32, DataType::Utf8, false), Field::new("lat", 
DataType::Float64, false), @@ -1596,7 +1388,7 @@ mod tests { assert_eq!(2, batch.num_columns()); let strings = arrow_cast::cast(batch.column(0), &DataType::Utf8).unwrap(); - let strings = strings.as_any().downcast_ref::().unwrap(); + let strings = strings.as_string::(); assert_eq!(strings.value(0), "Elgin, Scotland, the UK"); assert_eq!(strings.value(4), "Eastbourne, East Sussex, UK"); @@ -1605,17 +1397,20 @@ mod tests { #[test] fn test_nulls() { - let schema = Schema::new(vec![ + let schema = Arc::new(Schema::new(vec![ Field::new("c_int", DataType::UInt64, false), Field::new("c_float", DataType::Float32, true), Field::new("c_string", DataType::Utf8, false), Field::new("c_bool", DataType::Boolean, false), - ]); + ])); let file = File::open("test/data/null_test.csv").unwrap(); - let mut csv = - Reader::new(file, Arc::new(schema), true, None, 1024, None, None, None); + let mut csv = ReaderBuilder::new(schema) + .has_header(true) + .build(file) + .unwrap(); + let batch = csv.next().unwrap().unwrap(); assert!(!batch.column(1).is_null(0)); @@ -1627,12 +1422,14 @@ mod tests { #[test] fn test_nulls_with_inference() { - let file = File::open("test/data/various_types.csv").unwrap(); + let mut file = File::open("test/data/various_types.csv").unwrap(); + let format = Format::default().with_header(true).with_delimiter(b'|'); - let builder = ReaderBuilder::new() - .infer_schema(None) - .has_header(true) - .with_delimiter(b'|') + let (schema, _) = format.infer_schema(&mut file, None).unwrap(); + file.rewind().unwrap(); + + let builder = ReaderBuilder::new(Arc::new(schema)) + .with_format(format) .with_batch_size(512) .with_projection(vec![0, 1, 2, 3, 4, 5]); @@ -1693,8 +1490,7 @@ mod tests { Field::new("c_bool", DataType::Boolean, false), ]); - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) + let builder = ReaderBuilder::new(Arc::new(schema)) .has_header(true) .with_delimiter(b'|') .with_batch_size(512) @@ -1714,95 +1510,94 @@ mod tests { } /// Infer the data type of a record - fn infer_field_schema(string: &str, datetime_re: Option) -> DataType { + fn infer_field_schema(string: &str) -> DataType { let mut v = InferredDataType::default(); - v.update(string, datetime_re.as_ref()); + v.update(string); v.get() } #[test] fn test_infer_field_schema() { - assert_eq!(infer_field_schema("A", None), DataType::Utf8); - assert_eq!(infer_field_schema("\"123\"", None), DataType::Utf8); - assert_eq!(infer_field_schema("10", None), DataType::Int64); - assert_eq!(infer_field_schema("10.2", None), DataType::Float64); - assert_eq!(infer_field_schema(".2", None), DataType::Float64); - assert_eq!(infer_field_schema("2.", None), DataType::Float64); - assert_eq!(infer_field_schema("true", None), DataType::Boolean); - assert_eq!(infer_field_schema("trUe", None), DataType::Boolean); - assert_eq!(infer_field_schema("false", None), DataType::Boolean); - assert_eq!(infer_field_schema("2020-11-08", None), DataType::Date32); + assert_eq!(infer_field_schema("A"), DataType::Utf8); + assert_eq!(infer_field_schema("\"123\""), DataType::Utf8); + assert_eq!(infer_field_schema("10"), DataType::Int64); + assert_eq!(infer_field_schema("10.2"), DataType::Float64); + assert_eq!(infer_field_schema(".2"), DataType::Float64); + assert_eq!(infer_field_schema("2."), DataType::Float64); + assert_eq!(infer_field_schema("true"), DataType::Boolean); + assert_eq!(infer_field_schema("trUe"), DataType::Boolean); + assert_eq!(infer_field_schema("false"), DataType::Boolean); + assert_eq!(infer_field_schema("2020-11-08"), 
DataType::Date32); assert_eq!( - infer_field_schema("2020-11-08T14:20:01", None), + infer_field_schema("2020-11-08T14:20:01"), DataType::Timestamp(TimeUnit::Second, None) ); assert_eq!( - infer_field_schema("2020-11-08 14:20:01", None), + infer_field_schema("2020-11-08 14:20:01"), DataType::Timestamp(TimeUnit::Second, None) ); - let reg = Regex::new(r"^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d$").ok(); assert_eq!( - infer_field_schema("2020-11-08 14:20:01", reg), + infer_field_schema("2020-11-08 14:20:01"), DataType::Timestamp(TimeUnit::Second, None) ); - assert_eq!(infer_field_schema("-5.13", None), DataType::Float64); - assert_eq!(infer_field_schema("0.1300", None), DataType::Float64); + assert_eq!(infer_field_schema("-5.13"), DataType::Float64); + assert_eq!(infer_field_schema("0.1300"), DataType::Float64); assert_eq!( - infer_field_schema("2021-12-19 13:12:30.921", None), + infer_field_schema("2021-12-19 13:12:30.921"), DataType::Timestamp(TimeUnit::Millisecond, None) ); assert_eq!( - infer_field_schema("2021-12-19T13:12:30.123456789", None), + infer_field_schema("2021-12-19T13:12:30.123456789"), DataType::Timestamp(TimeUnit::Nanosecond, None) ); } #[test] fn parse_date32() { - assert_eq!(parse_item::("1970-01-01").unwrap(), 0); - assert_eq!(parse_item::("2020-03-15").unwrap(), 18336); - assert_eq!(parse_item::("1945-05-08").unwrap(), -9004); + assert_eq!(Date32Type::parse("1970-01-01").unwrap(), 0); + assert_eq!(Date32Type::parse("2020-03-15").unwrap(), 18336); + assert_eq!(Date32Type::parse("1945-05-08").unwrap(), -9004); } #[test] fn parse_time() { assert_eq!( - parse_item::("12:10:01.123456789 AM"), + Time64NanosecondType::parse("12:10:01.123456789 AM"), Some(601_123_456_789) ); assert_eq!( - parse_item::("12:10:01.123456 am"), + Time64MicrosecondType::parse("12:10:01.123456 am"), Some(601_123_456) ); assert_eq!( - parse_item::("2:10:01.12 PM"), + Time32MillisecondType::parse("2:10:01.12 PM"), Some(51_001_120) ); - assert_eq!(parse_item::("2:10:01 pm"), Some(51_001)); + assert_eq!(Time32SecondType::parse("2:10:01 pm"), Some(51_001)); } #[test] fn parse_date64() { - assert_eq!(parse_item::("1970-01-01T00:00:00").unwrap(), 0); + assert_eq!(Date64Type::parse("1970-01-01T00:00:00").unwrap(), 0); assert_eq!( - parse_item::("2018-11-13T17:11:10").unwrap(), + Date64Type::parse("2018-11-13T17:11:10").unwrap(), 1542129070000 ); assert_eq!( - parse_item::("2018-11-13T17:11:10.011").unwrap(), + Date64Type::parse("2018-11-13T17:11:10.011").unwrap(), 1542129070011 ); assert_eq!( - parse_item::("1900-02-28T12:34:56").unwrap(), + Date64Type::parse("1900-02-28T12:34:56").unwrap(), -2203932304000 ); assert_eq!( - parse_formatted::("1900-02-28 12:34:56", "%Y-%m-%d %H:%M:%S") + Date64Type::parse_formatted("1900-02-28 12:34:56", "%Y-%m-%d %H:%M:%S") .unwrap(), -2203932304000 ); assert_eq!( - parse_formatted::( + Date64Type::parse_formatted( "1900-02-28 12:34:56+0030", "%Y-%m-%d %H:%M:%S%z" ) @@ -1821,13 +1616,13 @@ mod tests { "1970-01-01T00:00:00+02:00", ] .join("\n"); - let mut decoder = ReaderBuilder::new() - .with_schema(Arc::new(Schema::new(vec![Field::new( - "field", - DataType::Timestamp(T::UNIT, timezone.clone()), - true, - )]))) - .build_decoder(); + let schema = Arc::new(Schema::new(vec![Field::new( + "field", + DataType::Timestamp(T::UNIT, timezone.clone()), + true, + )])); + + let mut decoder = ReaderBuilder::new(schema).build_decoder(); let decoded = decoder.decode(csv.as_bytes()).unwrap(); assert_eq!(decoded, csv.len()); @@ -1933,17 +1728,12 @@ mod tests { let reader = std::io::Cursor::new(data); - 
let mut csv = Reader::new( - reader, - Arc::new(schema), - false, - None, - 2, - // starting at row 2 and up to row 6. - Some((2, 6)), - Some(vec![0]), - None, - ); + let mut csv = ReaderBuilder::new(Arc::new(schema)) + .with_batch_size(2) + .with_projection(vec![0]) + .with_bounds(2, 6) + .build_buffered(reader) + .unwrap(); let batch = csv.next().unwrap().unwrap(); let a = batch.column(0); @@ -1968,20 +1758,12 @@ mod tests { .map(|x| x.join(",")) .collect::>() .join("\n"); - let data = data.as_bytes(); - let reader = std::io::Cursor::new(data); - - let mut csv = Reader::new( - reader, - Arc::new(schema), - false, - None, - 2, - None, - Some(vec![]), - None, - ); + let mut csv = ReaderBuilder::new(Arc::new(schema)) + .with_batch_size(2) + .with_projection(vec![]) + .build_buffered(Cursor::new(data.as_bytes())) + .unwrap(); let batch = csv.next().unwrap().unwrap(); assert_eq!(batch.columns().len(), 0); @@ -2012,23 +1794,21 @@ mod tests { #[test] fn test_parsing_float() { - assert_eq!(Some(12.34), parse_item::("12.34")); - assert_eq!(Some(-12.34), parse_item::("-12.34")); - assert_eq!(Some(12.0), parse_item::("12")); - assert_eq!(Some(0.0), parse_item::("0")); - assert_eq!(Some(2.0), parse_item::("2.")); - assert_eq!(Some(0.2), parse_item::(".2")); - assert!(parse_item::("nan").unwrap().is_nan()); - assert!(parse_item::("NaN").unwrap().is_nan()); - assert!(parse_item::("inf").unwrap().is_infinite()); - assert!(parse_item::("inf").unwrap().is_sign_positive()); - assert!(parse_item::("-inf").unwrap().is_infinite()); - assert!(parse_item::("-inf") - .unwrap() - .is_sign_negative()); - assert_eq!(None, parse_item::("")); - assert_eq!(None, parse_item::("dd")); - assert_eq!(None, parse_item::("12.34.56")); + assert_eq!(Some(12.34), Float64Type::parse("12.34")); + assert_eq!(Some(-12.34), Float64Type::parse("-12.34")); + assert_eq!(Some(12.0), Float64Type::parse("12")); + assert_eq!(Some(0.0), Float64Type::parse("0")); + assert_eq!(Some(2.0), Float64Type::parse("2.")); + assert_eq!(Some(0.2), Float64Type::parse(".2")); + assert!(Float64Type::parse("nan").unwrap().is_nan()); + assert!(Float64Type::parse("NaN").unwrap().is_nan()); + assert!(Float64Type::parse("inf").unwrap().is_infinite()); + assert!(Float64Type::parse("inf").unwrap().is_sign_positive()); + assert!(Float64Type::parse("-inf").unwrap().is_infinite()); + assert!(Float64Type::parse("-inf").unwrap().is_sign_negative()); + assert_eq!(None, Float64Type::parse("")); + assert_eq!(None, Float64Type::parse("dd")); + assert_eq!(None, Float64Type::parse("12.34.56")); } #[test] @@ -2037,8 +1817,7 @@ mod tests { Field::new("text1", DataType::Utf8, false), Field::new("text2", DataType::Utf8, false), ]); - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) + let builder = ReaderBuilder::new(Arc::new(schema)) .has_header(false) .with_quote(b'~'); // default is ", change to ~ @@ -2070,8 +1849,7 @@ mod tests { Field::new("text1", DataType::Utf8, false), Field::new("text2", DataType::Utf8, false), ]); - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) + let builder = ReaderBuilder::new(Arc::new(schema)) .has_header(false) .with_escape(b'\\'); // default is None, change to \ @@ -2103,8 +1881,7 @@ mod tests { Field::new("text1", DataType::Utf8, false), Field::new("text2", DataType::Utf8, false), ]); - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) + let builder = ReaderBuilder::new(Arc::new(schema)) .has_header(false) .with_terminator(b'\n'); // default is CRLF, change to LF @@ -2141,14 +1918,18 @@ 
mod tests { (Some((0, 4)), true, 4), (Some((1, 4)), true, 3), ]; + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("a", DataType::Utf8, false), + ])); for (idx, (bounds, has_header, expected)) in tests.into_iter().enumerate() { - let mut reader = ReaderBuilder::new().has_header(has_header); + let mut reader = ReaderBuilder::new(schema.clone()).has_header(has_header); if let Some((start, end)) = bounds { reader = reader.with_bounds(start, end); } let b = reader - .build(Cursor::new(csv.as_bytes())) + .build_buffered(Cursor::new(csv.as_bytes())) .unwrap() .next() .unwrap() @@ -2160,7 +1941,12 @@ mod tests { #[test] fn test_null_boolean() { let csv = "true,false\nFalse,True\n,True\nFalse,"; - let b = ReaderBuilder::new() + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Boolean, true), + Field::new("a", DataType::Boolean, true), + ])); + + let b = ReaderBuilder::new(schema) .build_buffered(Cursor::new(csv.as_bytes())) .unwrap() .next() @@ -2194,9 +1980,14 @@ mod tests { ]; for (path, has_header, expected_rows) in tests { + let (schema, _) = Format::default() + .infer_schema(File::open(path).unwrap(), None) + .unwrap(); + let schema = Arc::new(schema); + for batch_size in [1, 4] { for capacity in [1, 3, 7, 100] { - let reader = ReaderBuilder::new() + let reader = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) .has_header(has_header) .build(File::open(path).unwrap()) @@ -2214,7 +2005,7 @@ mod tests { File::open(path).unwrap(), ); - let reader = ReaderBuilder::new() + let reader = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) .has_header(has_header) .build_buffered(buffered) @@ -2233,8 +2024,7 @@ mod tests { Field::new("text2", DataType::Utf8, false), ])); let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv)); - let b = ReaderBuilder::new() - .with_schema(schema) + let b = ReaderBuilder::new(schema) .with_batch_size(2) .build_buffered(buffer) .unwrap(); @@ -2314,8 +2104,7 @@ mod tests { ])); let csv = "foo,bar\nbaz,foo\na,b\nc,d"; let mut read = InstrumentedRead::new(Cursor::new(csv.as_bytes())); - let reader = ReaderBuilder::new() - .with_schema(schema) + let reader = ReaderBuilder::new(schema) .with_batch_size(3) .build_buffered(&mut read) .unwrap(); @@ -2383,7 +2172,7 @@ mod tests { for (values, expected) in cases { let mut t = InferredDataType::default(); for v in *values { - t.update(v, None) + t.update(v) } assert_eq!(&t.get(), expected, "{values:?}") } diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 90c32832a8f4..5f542be30a73 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -331,7 +331,7 @@ impl WriterBuilder { mod tests { use super::*; - use crate::Reader; + use crate::ReaderBuilder; use arrow_array::builder::{Decimal128Builder, Decimal256Builder}; use arrow_array::types::*; use arrow_buffer::i256; @@ -560,17 +560,11 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo } buf.set_position(0); - let mut reader = Reader::new( - buf, - Arc::new(schema), - false, - None, - 3, - // starting at row 2 and up to row 6. 
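// A sketch for illustration (not from this patch): the same write-then-read round
// trip outside of a test. `round_trip` is an invented helper name; `Writer` is the
// CSV writer from this module and `ReaderBuilder` comes from crate::ReaderBuilder.
fn round_trip(batch: &RecordBatch) -> Result<Option<RecordBatch>, ArrowError> {
    let mut buf = std::io::Cursor::new(Vec::<u8>::new());
    {
        let mut writer = Writer::new(&mut buf); // emits a header row by default
        writer.write(batch)?;
    }
    buf.set_position(0);
    let mut reader = ReaderBuilder::new(batch.schema())
        .has_header(true)
        .with_batch_size(batch.num_rows().max(1))
        .build_buffered(buf)?;
    reader.next().transpose()
}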
- None, - None, - None, - ); + let mut reader = ReaderBuilder::new(Arc::new(schema)) + .with_batch_size(3) + .build_buffered(buf) + .unwrap(); + let rb = reader.next().unwrap().unwrap(); let c1 = rb.column(0).as_any().downcast_ref::().unwrap(); let c2 = rb.column(1).as_any().downcast_ref::().unwrap(); diff --git a/arrow/benches/csv_reader.rs b/arrow/benches/csv_reader.rs index 66a956315b29..c2491a5a0b04 100644 --- a/arrow/benches/csv_reader.rs +++ b/arrow/benches/csv_reader.rs @@ -40,8 +40,7 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { c.bench_function(&format!("{name} - {batch_size}"), |b| { b.iter(|| { let cursor = Cursor::new(buf.as_slice()); - let reader = csv::ReaderBuilder::new() - .with_schema(batch.schema()) + let reader = csv::ReaderBuilder::new(batch.schema()) .with_batch_size(batch_size) .has_header(true) .build_buffered(cursor) diff --git a/arrow/examples/read_csv.rs b/arrow/examples/read_csv.rs index efb55c6d2876..60545a6e52d8 100644 --- a/arrow/examples/read_csv.rs +++ b/arrow/examples/read_csv.rs @@ -37,8 +37,10 @@ fn main() { ); let file = File::open(path).unwrap(); - let mut csv = - csv::Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None); + let mut csv = csv::ReaderBuilder::new(Arc::new(schema)) + .build(file) + .unwrap(); + let batch = csv.next().unwrap().unwrap(); print_batches(&[batch]).unwrap(); } diff --git a/arrow/examples/read_csv_infer_schema.rs b/arrow/examples/read_csv_infer_schema.rs index 2a713ba6109c..bd3c1c6a4623 100644 --- a/arrow/examples/read_csv_infer_schema.rs +++ b/arrow/examples/read_csv_infer_schema.rs @@ -19,17 +19,22 @@ extern crate arrow; use arrow::csv; use arrow::util::pretty::print_batches; +use arrow_csv::reader::Format; use std::fs::File; +use std::io::Seek; +use std::sync::Arc; fn main() { let path = format!( "{}/../arrow-csv/test/data/uk_cities_with_headers.csv", env!("CARGO_MANIFEST_DIR") ); - let file = File::open(path).unwrap(); - let builder = csv::ReaderBuilder::new() - .has_header(true) - .infer_schema(Some(100)); + let mut file = File::open(path).unwrap(); + let format = Format::default().with_header(true); + let (schema, _) = format.infer_schema(&mut file, Some(100)).unwrap(); + file.rewind().unwrap(); + + let builder = csv::ReaderBuilder::new(Arc::new(schema)).with_format(format); let mut csv = builder.build(file).unwrap(); let batch = csv.next().unwrap().unwrap(); print_batches(&[batch]).unwrap(); diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index 0a9950e9cfcd..4e96fb87851b 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -314,8 +314,7 @@ fn configure_reader_builder(args: &Args, arrow_schema: Arc) -> ReaderBui } } - let mut builder = ReaderBuilder::new() - .with_schema(arrow_schema) + let mut builder = ReaderBuilder::new(arrow_schema) .with_batch_size(args.batch_size) .has_header(args.has_header) .with_delimiter(args.get_delimiter()); From 67176f0f1ae925a76fdc2b09d541775cb76d1b67 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Fri, 28 Apr 2023 11:46:23 +0200 Subject: [PATCH 0862/1411] feat: support `bitwise` shift left/right (#4148) * feat: support bitwise shift left/right * ignore truncation * add test when shifting by more than the number of bits --- arrow-arith/src/bitwise.rs | 56 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/arrow-arith/src/bitwise.rs b/arrow-arith/src/bitwise.rs index 08cc246b351a..f9f456bf95fc 100644 --- a/arrow-arith/src/bitwise.rs +++ 
b/arrow-arith/src/bitwise.rs @@ -17,7 +17,9 @@ use crate::arity::{binary, unary}; use arrow_array::*; +use arrow_buffer::ArrowNativeType; use arrow_schema::ArrowError; +use num::traits::{WrappingShl, WrappingShr}; use std::ops::{BitAnd, BitOr, BitXor, Not}; // The helper function for bitwise operation with two array @@ -121,6 +123,38 @@ where Ok(unary(array, |value| value ^ scalar)) } +/// Perform bitwise 'left << right' operation on two arrays. If either left or right value is null +/// then the result is also null. +pub fn bitwise_shift_left( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result, ArrowError> +where + T: ArrowNumericType, + T::Native: WrappingShl, +{ + bitwise_op(left, right, |a, b| { + let b = b.as_usize(); + a.wrapping_shl(b as u32) + }) +} + +/// Perform bitwise 'left >> right' operation on two arrays. If either left or right value is null +/// then the result is also null. +pub fn bitwise_shift_right( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result, ArrowError> +where + T: ArrowNumericType, + T::Native: WrappingShr, +{ + bitwise_op(left, right, |a, b| { + let b = b.as_usize(); + a.wrapping_shr(b as u32) + }) +} + #[cfg(test)] mod tests { use super::*; @@ -143,6 +177,28 @@ mod tests { Ok(()) } + #[test] + fn test_bitwise_shift_left() { + let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4), Some(8)]); + let right = + UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12), Some(u64::MAX)]); + let expected = + UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(0)]); + let result = bitwise_shift_left(&left, &right).unwrap(); + assert_eq!(expected, result); + } + + #[test] + fn test_bitwise_shift_right() { + let left = + UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(3)]); + let right = + UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12), Some(65)]); + let expected = UInt64Array::from(vec![Some(1), Some(2), None, Some(4), Some(1)]); + let result = bitwise_shift_right(&left, &right).unwrap(); + assert_eq!(expected, result); + } + #[test] fn test_bitwise_and_array_scalar() { // unsigned value From b717b39393367d1de7577078c13b91c59a62d581 Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Fri, 28 Apr 2023 02:47:17 -0700 Subject: [PATCH 0863/1411] Better flight SQL example codes (#4144) * Better flight sql example codes * Better flight sql example codes * feat: flight sql server enable tcp no deplay * Remove unnecessary doc --------- Co-authored-by: Raphael Taylor-Davies --- arrow-flight/examples/flight_sql_server.rs | 196 +++++++++++---------- 1 file changed, 107 insertions(+), 89 deletions(-) diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 43154420d424..23d71090ae47 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -546,8 +546,7 @@ impl ProstMessageExt for FetchResults { #[cfg(test)] mod tests { use super::*; - use futures::future::BoxFuture; - use futures::{FutureExt, TryStreamExt}; + use futures::TryStreamExt; use std::fs; use std::future::Future; use std::net::SocketAddr; @@ -571,42 +570,6 @@ mod tests { (incoming, addr) } - async fn client_with_uds(path: String) -> FlightSqlServiceClient { - let connector = service_fn(move |_| UnixStream::connect(path.clone())); - let channel = Endpoint::try_from("http://example.com") - .unwrap() - .connect_with_connector(connector) - .await - .unwrap(); - FlightSqlServiceClient::new(channel) - } - - type ServeFut = 
BoxFuture<'static, Result<(), tonic::transport::Error>>; - - async fn create_https_server( - ) -> Result<(ServeFut, SocketAddr), tonic::transport::Error> { - let cert = std::fs::read_to_string("examples/data/server.pem").unwrap(); - let key = std::fs::read_to_string("examples/data/server.key").unwrap(); - let client_ca = std::fs::read_to_string("examples/data/client_ca.pem").unwrap(); - - let tls_config = ServerTlsConfig::new() - .identity(Identity::from_pem(&cert, &key)) - .client_ca_root(Certificate::from_pem(&client_ca)); - - let (incoming, addr) = bind_tcp().await; - - let svc = FlightServiceServer::new(FlightSqlServiceImpl {}); - - let serve = Server::builder() - .tls_config(tls_config) - .unwrap() - .add_service(svc) - .serve_with_incoming(incoming) - .boxed(); - - Ok((serve, addr)) - } - fn endpoint(uri: String) -> Result { let endpoint = Endpoint::new(uri) .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? @@ -621,56 +584,12 @@ mod tests { Ok(endpoint) } - #[tokio::test] - async fn test_select_https() { - let (serve, addr) = create_https_server().await.unwrap(); - let uri = format!("https://{}:{}", addr.ip(), addr.port()); - - let request_future = async { - let cert = std::fs::read_to_string("examples/data/client1.pem").unwrap(); - let key = std::fs::read_to_string("examples/data/client1.key").unwrap(); - let server_ca = std::fs::read_to_string("examples/data/ca.pem").unwrap(); - - let tls_config = ClientTlsConfig::new() - .domain_name("localhost") - .ca_certificate(Certificate::from_pem(&server_ca)) - .identity(Identity::from_pem(cert, key)); - let endpoint = endpoint(uri).unwrap().tls_config(tls_config).unwrap(); - let channel = endpoint.connect().await.unwrap(); - let mut client = FlightSqlServiceClient::new(channel); - let token = client.handshake("admin", "password").await.unwrap(); - client.set_token(String::from_utf8(token.to_vec()).unwrap()); - println!("Auth succeeded with token: {:?}", token); - let mut stmt = client.prepare("select 1;".to_string()).await.unwrap(); - let flight_info = stmt.execute().await.unwrap(); - let ticket = flight_info.endpoint[0].ticket.as_ref().unwrap().clone(); - let flight_data = client.do_get(ticket).await.unwrap(); - let flight_data: Vec = flight_data.try_collect().await.unwrap(); - let batches = flight_data_to_batches(&flight_data).unwrap(); - let res = pretty_format_batches(batches.as_slice()).unwrap(); - let expected = r#" -+-------------------+ -| salutation | -+-------------------+ -| Hello, FlightSQL! | -+-------------------+"# - .trim() - .to_string(); - assert_eq!(res.to_string(), expected); - }; - - tokio::select! 
{ - _ = serve => panic!("server finished"), - _ = request_future => println!("Client finished!"), - } - } - async fn auth_client(client: &mut FlightSqlServiceClient) { let token = client.handshake("admin", "password").await.unwrap(); client.set_token(String::from_utf8(token.to_vec()).unwrap()); } - async fn test_client(f: F) + async fn test_uds_client(f: F) where F: FnOnce(FlightSqlServiceClient) -> C, C: Future, @@ -682,14 +601,91 @@ mod tests { let uds = UnixListener::bind(path.clone()).unwrap(); let stream = UnixListenerStream::new(uds); - // We would just listen on TCP, but it seems impossible to know when tonic is ready to serve let service = FlightSqlServiceImpl {}; let serve_future = Server::builder() .add_service(FlightServiceServer::new(service)) .serve_with_incoming(stream); let request_future = async { - let client = client_with_uds(path).await; + let connector = service_fn(move |_| UnixStream::connect(path.clone())); + let channel = Endpoint::try_from("http://example.com") + .unwrap() + .connect_with_connector(connector) + .await + .unwrap(); + let client = FlightSqlServiceClient::new(channel); + f(client).await + }; + + tokio::select! { + _ = serve_future => panic!("server returned first"), + _ = request_future => println!("Client finished!"), + } + } + + async fn test_http_client(f: F) + where + F: FnOnce(FlightSqlServiceClient) -> C, + C: Future, + { + let (incoming, addr) = bind_tcp().await; + let uri = format!("http://{}:{}", addr.ip(), addr.port()); + + let service = FlightSqlServiceImpl {}; + let serve_future = Server::builder() + .add_service(FlightServiceServer::new(service)) + .serve_with_incoming(incoming); + + let request_future = async { + let endpoint = endpoint(uri).unwrap(); + let channel = endpoint.connect().await.unwrap(); + let client = FlightSqlServiceClient::new(channel); + f(client).await + }; + + tokio::select! 
{ + _ = serve_future => panic!("server returned first"), + _ = request_future => println!("Client finished!"), + } + } + + async fn test_https_client(f: F) + where + F: FnOnce(FlightSqlServiceClient) -> C, + C: Future, + { + let cert = std::fs::read_to_string("examples/data/server.pem").unwrap(); + let key = std::fs::read_to_string("examples/data/server.key").unwrap(); + let client_ca = std::fs::read_to_string("examples/data/client_ca.pem").unwrap(); + + let tls_config = ServerTlsConfig::new() + .identity(Identity::from_pem(&cert, &key)) + .client_ca_root(Certificate::from_pem(&client_ca)); + + let (incoming, addr) = bind_tcp().await; + let uri = format!("https://{}:{}", addr.ip(), addr.port()); + + let svc = FlightServiceServer::new(FlightSqlServiceImpl {}); + + let serve_future = Server::builder() + .tls_config(tls_config) + .unwrap() + .add_service(svc) + .serve_with_incoming(incoming); + + let request_future = async { + let cert = std::fs::read_to_string("examples/data/client1.pem").unwrap(); + let key = std::fs::read_to_string("examples/data/client1.key").unwrap(); + let server_ca = std::fs::read_to_string("examples/data/ca.pem").unwrap(); + + let tls_config = ClientTlsConfig::new() + .domain_name("localhost") + .ca_certificate(Certificate::from_pem(&server_ca)) + .identity(Identity::from_pem(cert, key)); + + let endpoint = endpoint(uri).unwrap().tls_config(tls_config).unwrap(); + let channel = endpoint.connect().await.unwrap(); + let client = FlightSqlServiceClient::new(channel); f(client).await }; @@ -699,16 +695,38 @@ mod tests { } } + async fn test_all_clients(task: F) + where + F: FnOnce(FlightSqlServiceClient) -> C + Copy, + C: Future, + { + println!("testing uds client"); + test_uds_client(task).await; + println!("======="); + + println!("testing http client"); + test_http_client(task).await; + println!("======="); + + println!("testing https client"); + test_https_client(task).await; + println!("======="); + } + #[tokio::test] - async fn test_select_1() { - test_client(|mut client| async move { + async fn test_select() { + test_all_clients(|mut client| async move { auth_client(&mut client).await; + let mut stmt = client.prepare("select 1;".to_string()).await.unwrap(); + let flight_info = stmt.execute().await.unwrap(); + let ticket = flight_info.endpoint[0].ticket.as_ref().unwrap().clone(); let flight_data = client.do_get(ticket).await.unwrap(); let flight_data: Vec = flight_data.try_collect().await.unwrap(); let batches = flight_data_to_batches(&flight_data).unwrap(); + let res = pretty_format_batches(batches.as_slice()).unwrap(); let expected = r#" +-------------------+ @@ -725,7 +743,7 @@ mod tests { #[tokio::test] async fn test_execute_update() { - test_client(|mut client| async move { + test_all_clients(|mut client| async move { auth_client(&mut client).await; let res = client .execute_update("creat table test(a int);".to_string()) @@ -738,7 +756,7 @@ mod tests { #[tokio::test] async fn test_auth() { - test_client(|mut client| async move { + test_all_clients(|mut client| async move { // no handshake assert!(client .prepare("select 1;".to_string()) From 5e5561e619739f383a54e34292d09998a693ad4f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 28 Apr 2023 13:43:36 -0400 Subject: [PATCH 0864/1411] Remove Type from NativeIndex (#4146) * Remove Type from NativeIndex * Review feedback --- parquet/src/file/page_index/index.rs | 16 ++++++---------- parquet/src/file/page_index/index_reader.rs | 18 ++++++++---------- 2 files 
changed, 14 insertions(+), 20 deletions(-) diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 8f9cb66298b5..f3a09046a63c 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -19,7 +19,7 @@ use crate::basic::Type; use crate::data_type::private::ParquetValueType; -use crate::data_type::{ByteArray, Int96}; +use crate::data_type::{ByteArray, FixedLenByteArray, Int96}; use crate::errors::ParquetError; use crate::format::{BoundaryOrder, ColumnIndex}; use crate::util::bit_util::from_le_slice; @@ -73,7 +73,7 @@ pub enum Index { FLOAT(NativeIndex), DOUBLE(NativeIndex), BYTE_ARRAY(NativeIndex), - FIXED_LEN_BYTE_ARRAY(NativeIndex), + FIXED_LEN_BYTE_ARRAY(NativeIndex), } impl Index { @@ -103,11 +103,9 @@ impl Index { } } -/// Stores the [`PageIndex`] for each page of a column with [`Type`] +/// Stores the [`PageIndex`] for each page of a column #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct NativeIndex { - /// The physical type of this column - pub physical_type: Type, /// The indexes, one item per page pub indexes: Vec>, /// If the min/max elements are ordered, and if so in which @@ -118,11 +116,10 @@ pub struct NativeIndex { } impl NativeIndex { + pub const PHYSICAL_TYPE: Type = T::PHYSICAL_TYPE; + /// Creates a new [`NativeIndex`] - pub(crate) fn try_new( - index: ColumnIndex, - physical_type: Type, - ) -> Result { + pub(crate) fn try_new(index: ColumnIndex) -> Result { let len = index.min_values.len(); let null_counts = index @@ -153,7 +150,6 @@ impl NativeIndex { .collect::, ParquetError>>()?; Ok(Self { - physical_type, indexes, boundary_order: index.boundary_order, }) diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 3ae37cf87f8b..27e9a6260090 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -174,17 +174,15 @@ pub(crate) fn deserialize_column_index( let index = ColumnIndex::read_from_in_protocol(&mut prot)?; let index = match column_type { - Type::BOOLEAN => { - Index::BOOLEAN(NativeIndex::::try_new(index, column_type)?) - } - Type::INT32 => Index::INT32(NativeIndex::::try_new(index, column_type)?), - Type::INT64 => Index::INT64(NativeIndex::::try_new(index, column_type)?), - Type::INT96 => Index::INT96(NativeIndex::::try_new(index, column_type)?), - Type::FLOAT => Index::FLOAT(NativeIndex::::try_new(index, column_type)?), - Type::DOUBLE => Index::DOUBLE(NativeIndex::::try_new(index, column_type)?), - Type::BYTE_ARRAY => Index::BYTE_ARRAY(NativeIndex::try_new(index, column_type)?), + Type::BOOLEAN => Index::BOOLEAN(NativeIndex::::try_new(index)?), + Type::INT32 => Index::INT32(NativeIndex::::try_new(index)?), + Type::INT64 => Index::INT64(NativeIndex::::try_new(index)?), + Type::INT96 => Index::INT96(NativeIndex::::try_new(index)?), + Type::FLOAT => Index::FLOAT(NativeIndex::::try_new(index)?), + Type::DOUBLE => Index::DOUBLE(NativeIndex::::try_new(index)?), + Type::BYTE_ARRAY => Index::BYTE_ARRAY(NativeIndex::try_new(index)?), Type::FIXED_LEN_BYTE_ARRAY => { - Index::FIXED_LEN_BYTE_ARRAY(NativeIndex::try_new(index, column_type)?) + Index::FIXED_LEN_BYTE_ARRAY(NativeIndex::try_new(index)?) 
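// A sketch for illustration (not from this patch): with the physical type now exposed
// as the associated constant NativeIndex::<T>::PHYSICAL_TYPE, callers reach the typed
// per-page statistics by matching on Index. `page_mins_i64` is an invented helper.
fn page_mins_i64(index: &Index) -> Vec<Option<i64>> {
    match index {
        // NativeIndex::<i64>::PHYSICAL_TYPE is Type::INT64
        Index::INT64(native) => native.indexes.iter().map(|page| page.min).collect(),
        _ => vec![],
    }
}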
} }; From 521fdb99374cfe856fdb03e84bb56f09b020d785 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Berkay=20=C5=9Eahin?= <124376117+berkaysynnada@users.noreply.github.com> Date: Fri, 28 Apr 2023 21:15:09 +0300 Subject: [PATCH 0865/1411] InMemory append API (#4153) * ready to review * clippy fix * Refactor code to remove byte duplication * simplify shutdown Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: metesynnada <100111937+metesynnada@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/memory.rs | 104 +++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 057a260f757b..b01ffbb02495 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -117,6 +117,17 @@ impl ObjectStore for InMemory { Ok(()) } + async fn append( + &self, + location: &Path, + ) -> Result> { + Ok(Box::new(InMemoryAppend { + location: location.clone(), + data: Vec::::new(), + storage: StorageType::clone(&self.storage), + })) + } + async fn get(&self, location: &Path) -> Result { let data = self.entry(location).await?; @@ -329,8 +340,55 @@ impl AsyncWrite for InMemoryUpload { } } +struct InMemoryAppend { + location: Path, + data: Vec, + storage: StorageType, +} + +impl AsyncWrite for InMemoryAppend { + fn poll_write( + mut self: Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> std::task::Poll> { + self.data.extend_from_slice(buf); + Poll::Ready(Ok(buf.len())) + } + + fn poll_flush( + mut self: Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + let storage = StorageType::clone(&self.storage); + + let mut writer = storage.write(); + + if let Some((bytes, _)) = writer.remove(&self.location) { + let buf = std::mem::take(&mut self.data); + let concat = Bytes::from_iter(bytes.into_iter().chain(buf.into_iter())); + writer.insert(self.location.clone(), (concat, Utc::now())); + } else { + writer.insert( + self.location.clone(), + (Bytes::from(std::mem::take(&mut self.data)), Utc::now()), + ); + }; + Poll::Ready(Ok(())) + } + + fn poll_shutdown( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + self.poll_flush(cx) + } +} + #[cfg(test)] mod tests { + use tokio::io::AsyncWriteExt; + use super::*; use crate::{ @@ -396,4 +454,50 @@ mod tests { panic!("unexpected error type: {err:?}"); } } + + #[tokio::test] + async fn test_append_new() { + let in_memory = InMemory::new(); + let location = Path::from("some_file"); + let data = Bytes::from("arbitrary data"); + let expected_data = data.clone(); + + let mut writer = in_memory.append(&location).await.unwrap(); + writer.write_all(&data).await.unwrap(); + writer.flush().await.unwrap(); + + let read_data = in_memory + .get(&location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } + + #[tokio::test] + async fn test_append_existing() { + let in_memory = InMemory::new(); + let location = Path::from("some_file"); + let data = Bytes::from("arbitrary"); + let data_appended = Bytes::from(" data"); + let expected_data = Bytes::from("arbitrary data"); + + let mut writer = in_memory.append(&location).await.unwrap(); + writer.write_all(&data).await.unwrap(); + writer.flush().await.unwrap(); + + writer.write_all(&data_appended).await.unwrap(); + writer.flush().await.unwrap(); + + let read_data = in_memory + .get(&location) + 
.await + .unwrap() + .bytes() + .await + .unwrap(); + assert_eq!(&*read_data, expected_data); + } } From 1434d1f4ddbe50e7729b7b69bdb8b7e10934f806 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 28 Apr 2023 17:57:30 -0400 Subject: [PATCH 0866/1411] Cleanup reading page index (#4149) (#4090) (#4151) * Cleanup reading page index (#4149) (#4090) * Review feedback * Add test * Review feedback --- parquet/src/arrow/async_reader/mod.rs | 124 ++++++++---------- parquet/src/file/metadata.rs | 15 +++ parquet/src/file/page_index/index_reader.rs | 137 +++++++------------- parquet/src/file/serialized_reader.rs | 25 ++++ 4 files changed, 138 insertions(+), 163 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index a0e7ff72a153..09b107e31f30 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -78,18 +78,16 @@ use std::collections::VecDeque; use std::fmt::Formatter; -use std::io::{Cursor, SeekFrom}; +use std::io::SeekFrom; use std::ops::Range; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use crate::format::{OffsetIndex, PageLocation}; use bytes::{Buf, Bytes}; use futures::future::{BoxFuture, FutureExt}; use futures::ready; use futures::stream::Stream; -use thrift::protocol::{TCompactInputProtocol, TSerializable}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; @@ -109,9 +107,13 @@ use crate::column::page::{PageIterator, PageReader}; use crate::errors::{ParquetError, Result}; use crate::file::footer::{decode_footer, decode_metadata}; use crate::file::metadata::{ParquetMetaData, RowGroupMetaData}; +use crate::file::page_index::index::Index; +use crate::file::page_index::index_reader::{ + acc_range, decode_column_index, decode_offset_index, +}; use crate::file::reader::{ChunkReader, Length, SerializedPageReader}; +use crate::format::PageLocation; -use crate::file::page_index::index_reader; use crate::file::FOOTER_SIZE; use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; @@ -121,6 +123,7 @@ pub use metadata::*; #[cfg(feature = "object_store")] mod store; + #[cfg(feature = "object_store")] pub use store::*; @@ -240,78 +243,53 @@ impl ArrowReaderBuilder> { && metadata.column_index().is_none() && metadata.offset_index().is_none() { - let mut fetch_ranges = vec![]; - let mut index_lengths: Vec> = vec![]; - - for rg in metadata.row_groups() { - let (loc_offset, loc_length) = - index_reader::get_location_offset_and_total_length(rg.columns())?; - - let (idx_offset, idx_lengths) = - index_reader::get_index_offset_and_lengths(rg.columns())?; - let idx_length = idx_lengths.iter().sum::(); - - // If index data is missing, return without any indexes - if loc_length == 0 || idx_length == 0 { - return Self::new_builder(AsyncReader(input), metadata, options); - } - - fetch_ranges.push(loc_offset as usize..loc_offset as usize + loc_length); - fetch_ranges.push(idx_offset as usize..idx_offset as usize + idx_length); - index_lengths.push(idx_lengths); - } - - let mut chunks = input.get_byte_ranges(fetch_ranges).await?.into_iter(); - let mut index_lengths = index_lengths.into_iter(); - - let mut row_groups = metadata.row_groups().to_vec(); - - let mut columns_indexes = vec![]; - let mut offset_indexes = vec![]; - - for rg in row_groups.iter_mut() { - let columns = rg.columns(); - - let location_data = chunks.next().unwrap(); - let mut cursor = Cursor::new(location_data); - let mut offset_index = vec![]; - - for _ in 
0..columns.len() { - let mut prot = TCompactInputProtocol::new(&mut cursor); - let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; - offset_index.push(offset.page_locations); + let fetch = metadata.row_groups().iter().flat_map(|r| r.columns()).fold( + None, + |a, c| { + let a = acc_range(a, c.column_index_range()); + acc_range(a, c.offset_index_range()) + }, + ); + + if let Some(fetch) = fetch { + let bytes = input.get_bytes(fetch.clone()).await?; + let get = |r: Range| { + &bytes[(r.start - fetch.start)..(r.end - fetch.start)] + }; + + let mut offset_index = Vec::with_capacity(metadata.num_row_groups()); + let mut column_index = Vec::with_capacity(metadata.num_row_groups()); + for rg in metadata.row_groups() { + let columns = rg.columns(); + let mut rg_offset_index = Vec::with_capacity(columns.len()); + let mut rg_column_index = Vec::with_capacity(columns.len()); + + for chunk in rg.columns() { + let t = chunk.column_type(); + let c = match chunk.column_index_range() { + Some(range) => decode_column_index(get(range), t)?, + None => Index::NONE, + }; + + let o = match chunk.offset_index_range() { + Some(range) => decode_offset_index(get(range))?, + None => return Err(general_err!("missing offset index")), + }; + + rg_column_index.push(c); + rg_offset_index.push(o); + } + offset_index.push(rg_offset_index); + column_index.push(rg_column_index); } - offset_indexes.push(offset_index); - - let index_data = chunks.next().unwrap(); - let index_lengths = index_lengths.next().unwrap(); - - let mut start = 0; - let data = index_lengths.into_iter().map(|length| { - let r = index_data.slice(start..start + length); - start += length; - r - }); - - let indexes = rg - .columns() - .iter() - .zip(data) - .map(|(column, data)| { - let column_type = column.column_type(); - index_reader::deserialize_column_index(&data, column_type) - }) - .collect::>>()?; - columns_indexes.push(indexes); + metadata = Arc::new(ParquetMetaData::new_with_page_index( + metadata.file_metadata().clone(), + metadata.row_groups().to_vec(), + Some(column_index), + Some(offset_index), + )); } - - metadata = Arc::new(ParquetMetaData::new_with_page_index( - metadata.file_metadata().clone(), - row_groups, - Some(columns_indexes), - Some(offset_indexes), - )); } Self::new_builder(AsyncReader(input), metadata, options) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 41097e1075a9..85287c3e0e85 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -33,6 +33,7 @@ //! [`ColumnChunkMetaData`](struct.ColumnChunkMetaData.html) has information about column //! chunk (primitive leaf column), including encoding/compression, number of values, etc. +use std::ops::Range; use std::sync::Arc; use crate::format::{ @@ -565,6 +566,13 @@ impl ColumnChunkMetaData { self.column_index_length } + /// Returns the range for the offset index if any + pub(crate) fn column_index_range(&self) -> Option> { + let offset = usize::try_from(self.column_index_offset?).ok()?; + let length = usize::try_from(self.column_index_length?).ok()?; + Some(offset..(offset + length)) + } + /// Returns the offset for the offset index. 
pub fn offset_index_offset(&self) -> Option { self.offset_index_offset @@ -575,6 +583,13 @@ impl ColumnChunkMetaData { self.offset_index_length } + /// Returns the range for the offset index if any + pub(crate) fn offset_index_range(&self) -> Option> { + let offset = usize::try_from(self.offset_index_offset?).ok()?; + let length = usize::try_from(self.offset_index_length?).ok()?; + Some(offset..(offset + length)) + } + /// Method to convert from Thrift. pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result { if cc.meta_data.is_none() { diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 27e9a6260090..c36708a59aeb 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -24,9 +24,23 @@ use crate::file::metadata::ColumnChunkMetaData; use crate::file::page_index::index::{Index, NativeIndex}; use crate::file::reader::ChunkReader; use crate::format::{ColumnIndex, OffsetIndex, PageLocation}; -use std::io::{Cursor, Read}; +use std::io::Cursor; +use std::ops::Range; use thrift::protocol::{TCompactInputProtocol, TSerializable}; +/// Computes the covering range of two optional ranges +/// +/// For example `acc_range(Some(7..9), Some(1..3)) = Some(1..9)` +pub(crate) fn acc_range( + a: Option>, + b: Option>, +) -> Option> { + match (a, b) { + (Some(a), Some(b)) => Some(a.start.min(b.start)..a.end.max(b.end)), + (None, x) | (x, None) => x, + } +} + /// Reads per-column [`Index`] for all columns of a row group by /// decoding [`ColumnIndex`] . /// @@ -42,31 +56,23 @@ pub fn read_columns_indexes( reader: &R, chunks: &[ColumnChunkMetaData], ) -> Result, ParquetError> { - let (offset, lengths) = get_index_offset_and_lengths(chunks)?; - let length = lengths.iter().sum::(); - - if length == 0 { - return Ok(vec![Index::NONE; chunks.len()]); - } + let fetch = chunks + .iter() + .fold(None, |range, c| acc_range(range, c.column_index_range())); - //read all need data into buffer - let mut reader = reader.get_read(offset, length)?; - let mut data = vec![0; length]; - reader.read_exact(&mut data)?; + let fetch = match fetch { + Some(r) => r, + None => return Ok(vec![Index::NONE; chunks.len()]), + }; - let mut start = 0; - let data = lengths.into_iter().map(|length| { - let r = &data[start..start + length]; - start += length; - r - }); + let bytes = reader.get_bytes(fetch.start as _, fetch.end - fetch.start)?; + let get = |r: Range| &bytes[(r.start - fetch.start)..(r.end - fetch.start)]; chunks .iter() - .zip(data) - .map(|(chunk, data)| { - let column_type = chunk.column_type(); - deserialize_column_index(data, column_type) + .map(|c| match c.column_index_range() { + Some(r) => decode_column_index(get(r), c.column_type()), + None => Ok(Index::NONE), }) .collect() } @@ -86,88 +92,39 @@ pub fn read_pages_locations( reader: &R, chunks: &[ColumnChunkMetaData], ) -> Result>, ParquetError> { - let (offset, total_length) = get_location_offset_and_total_length(chunks)?; - - if total_length == 0 { - return Ok(vec![]); - } - - //read all need data into buffer - let mut reader = reader.get_read(offset, total_length)?; - let mut data = vec![0; total_length]; - reader.read_exact(&mut data)?; - - let mut d = Cursor::new(data); - let mut result = vec![]; - - for _ in 0..chunks.len() { - let mut prot = TCompactInputProtocol::new(&mut d); - let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; - result.push(offset.page_locations); - } - Ok(result) -} + let fetch = chunks + .iter() + .fold(None, 
|range, c| acc_range(range, c.offset_index_range())); -//Get File offsets of every ColumnChunk's page_index -//If there are invalid offset return a zero offset with empty lengths. -pub(crate) fn get_index_offset_and_lengths( - chunks: &[ColumnChunkMetaData], -) -> Result<(u64, Vec), ParquetError> { - let first_col_metadata = if let Some(chunk) = chunks.first() { - chunk - } else { - return Ok((0, vec![])); + let fetch = match fetch { + Some(r) => r, + None => return Ok(vec![]), }; - let offset: u64 = if let Some(offset) = first_col_metadata.column_index_offset() { - offset.try_into().unwrap() - } else { - return Ok((0, vec![])); - }; + let bytes = reader.get_bytes(fetch.start as _, fetch.end - fetch.start)?; + let get = |r: Range| &bytes[(r.start - fetch.start)..(r.end - fetch.start)]; - let lengths = chunks + chunks .iter() - .map(|x| x.column_index_length()) - .map(|maybe_length| { - let index_length = maybe_length.unwrap_or(0); - Ok(index_length.try_into().unwrap()) + .map(|c| match c.offset_index_range() { + Some(r) => decode_offset_index(get(r)), + None => Err(general_err!("missing offset index")), }) - .collect::, ParquetError>>()?; - - Ok((offset, lengths)) + .collect() } -//Get File offset of ColumnChunk's pages_locations -//If there are invalid offset return a zero offset with zero length. -pub(crate) fn get_location_offset_and_total_length( - chunks: &[ColumnChunkMetaData], -) -> Result<(u64, usize), ParquetError> { - let metadata = if let Some(chunk) = chunks.first() { - chunk - } else { - return Ok((0, 0)); - }; - - let offset: u64 = if let Some(offset) = metadata.offset_index_offset() { - offset.try_into().unwrap() - } else { - return Ok((0, 0)); - }; - - let total_length = chunks - .iter() - .map(|x| x.offset_index_length().unwrap()) - .sum::() as usize; - Ok((offset, total_length)) +pub(crate) fn decode_offset_index( + data: &[u8], +) -> Result, ParquetError> { + let mut prot = TCompactInputProtocol::new(data); + let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; + Ok(offset.page_locations) } -pub(crate) fn deserialize_column_index( +pub(crate) fn decode_column_index( data: &[u8], column_type: Type, ) -> Result { - if data.is_empty() { - return Ok(Index::NONE); - } let mut d = Cursor::new(data); let mut prot = TCompactInputProtocol::new(&mut d); diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 7346b1a12b83..7b08567427e6 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -835,6 +835,9 @@ mod tests { use crate::data_type::private::ParquetValueType; use crate::data_type::{AsBytes, FixedLenByteArrayType}; use crate::file::page_index::index::{Index, NativeIndex}; + use crate::file::page_index::index_reader::{ + read_columns_indexes, read_pages_locations, + }; use crate::file::properties::WriterProperties; use crate::file::writer::SerializedFileWriter; use crate::record::RowAccessor; @@ -1382,6 +1385,28 @@ mod tests { assert_eq!(0, page_offset.first_row_index); } + #[test] + fn test_page_index_reader_out_of_order() { + let test_file = get_test_file("alltypes_tiny_pages_plain.parquet"); + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = SerializedFileReader::new_with_options(test_file, options).unwrap(); + let metadata = reader.metadata(); + + let test_file = get_test_file("alltypes_tiny_pages_plain.parquet"); + let columns = metadata.row_group(0).columns(); + let reversed: Vec<_> = columns.iter().cloned().rev().collect(); + + let a = 
read_columns_indexes(&test_file, columns).unwrap(); + let mut b = read_columns_indexes(&test_file, &reversed).unwrap(); + b.reverse(); + assert_eq!(a, b); + + let a = read_pages_locations(&test_file, columns).unwrap(); + let mut b = read_pages_locations(&test_file, &reversed).unwrap(); + b.reverse(); + assert_eq!(a, b); + } + #[test] fn test_page_index_reader_all_type() { let test_file = get_test_file("alltypes_tiny_pages_plain.parquet"); From 6c3688b2766e02988837f3894cedbfc42bef77ac Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 28 Apr 2023 17:58:11 -0400 Subject: [PATCH 0867/1411] Remove deprecated parquet ArrowReader (#4125) * Remove deprecated parquet ArrowReader * Update doctest --- parquet/src/arrow/arrow_reader/mod.rs | 182 +------------------------- parquet/src/arrow/arrow_writer/mod.rs | 6 +- parquet/src/arrow/mod.rs | 2 - 3 files changed, 7 insertions(+), 183 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 57741283a2f9..c69fa420d564 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -29,12 +29,12 @@ use arrow_select::filter::prep_null_mask_filter; use crate::arrow::array_reader::{ build_array_reader, ArrayReader, FileReaderRowGroupCollection, RowGroupCollection, }; -use crate::arrow::schema::{parquet_to_array_schema_and_fields, parquet_to_arrow_schema}; -use crate::arrow::schema::{parquet_to_arrow_schema_by_columns, ParquetField}; +use crate::arrow::schema::parquet_to_array_schema_and_fields; +use crate::arrow::schema::ParquetField; use crate::arrow::ProjectionMask; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::{KeyValue, ParquetMetaData}; -use crate::file::reader::{ChunkReader, FileReader, SerializedFileReader}; +use crate::file::metadata::ParquetMetaData; +use crate::file::reader::{ChunkReader, SerializedFileReader}; use crate::file::serialized_reader::ReadOptionsBuilder; use crate::schema::types::SchemaDescriptor; @@ -198,43 +198,6 @@ impl ArrowReaderBuilder { } } -/// Arrow reader api. -/// With this api, user can get arrow schema from parquet file, and read parquet data -/// into arrow arrays. -#[deprecated(note = "Use ParquetRecordBatchReaderBuilder instead")] -pub trait ArrowReader { - type RecordReader: RecordBatchReader; - - /// Read parquet schema and convert it into arrow schema. - fn get_schema(&mut self) -> Result; - - /// Read parquet schema and convert it into arrow schema. - /// This schema only includes columns identified by `mask`. - fn get_schema_by_columns(&mut self, mask: ProjectionMask) -> Result; - - /// Returns record batch reader from whole parquet file. - /// - /// # Arguments - /// - /// `batch_size`: The size of each record batch returned from this reader. Only the - /// last batch may contain records less than this size, otherwise record batches - /// returned from this reader should contains exactly `batch_size` elements. - fn get_record_reader(&mut self, batch_size: usize) -> Result; - - /// Returns record batch reader whose record batch contains columns identified by - /// `mask`. - /// - /// # Arguments - /// - /// `mask`: The columns that should be included in record batches. - /// `batch_size`: Please refer to `get_record_reader`. 
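// A sketch for illustration (not from this patch): the replacement for this trait is
// ParquetRecordBatchReaderBuilder. The file path and projected column indices below
// are purely illustrative, and `read_with_builder` is an invented helper name.
fn read_with_builder(path: &str) -> Result<Vec<RecordBatch>, Box<dyn std::error::Error>> {
    let file = std::fs::File::open(path)?;
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
    // project the first two root columns and read 1024 rows per batch
    let mask = ProjectionMask::roots(builder.parquet_schema(), [0, 1]);
    let reader = builder.with_projection(mask).with_batch_size(1024).build()?;
    Ok(reader.collect::<Result<_, _>>()?)
}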
- fn get_record_reader_by_columns( - &mut self, - mask: ProjectionMask, - batch_size: usize, - ) -> Result; -} - /// Options that control how metadata is read for a parquet file /// /// See [`ArrowReaderBuilder`] for how to configure how the column data @@ -273,143 +236,6 @@ impl ArrowReaderOptions { } } -/// An `ArrowReader` that can be used to synchronously read parquet data as [`RecordBatch`] -/// -/// See [`crate::arrow::async_reader`] for an asynchronous interface -#[deprecated(note = "Use ParquetRecordBatchReaderBuilder instead")] -pub struct ParquetFileArrowReader { - file_reader: Arc, - - #[allow(deprecated)] - options: ArrowReaderOptions, -} - -#[allow(deprecated)] -impl ArrowReader for ParquetFileArrowReader { - type RecordReader = ParquetRecordBatchReader; - - fn get_schema(&mut self) -> Result { - let file_metadata = self.file_reader.metadata().file_metadata(); - parquet_to_arrow_schema(file_metadata.schema_descr(), self.get_kv_metadata()) - } - - fn get_schema_by_columns(&mut self, mask: ProjectionMask) -> Result { - let file_metadata = self.file_reader.metadata().file_metadata(); - parquet_to_arrow_schema_by_columns( - file_metadata.schema_descr(), - mask, - self.get_kv_metadata(), - ) - } - - fn get_record_reader( - &mut self, - batch_size: usize, - ) -> Result { - self.get_record_reader_by_columns(ProjectionMask::all(), batch_size) - } - - fn get_record_reader_by_columns( - &mut self, - mask: ProjectionMask, - batch_size: usize, - ) -> Result { - let (_, field) = parquet_to_array_schema_and_fields( - self.parquet_schema(), - mask, - self.get_kv_metadata(), - )?; - let array_reader = build_array_reader( - field.as_ref(), - &ProjectionMask::all(), - &self.file_reader, - )?; - - // Try to avoid allocate large buffer - let batch_size = self.file_reader.num_rows().min(batch_size); - Ok(ParquetRecordBatchReader::new( - batch_size, - array_reader, - None, - )) - } -} - -#[allow(deprecated)] -impl ParquetFileArrowReader { - /// Create a new [`ParquetFileArrowReader`] with the provided [`ChunkReader`] - /// - /// ```no_run - /// # use std::fs::File; - /// # use bytes::Bytes; - /// # use parquet::arrow::ParquetFileArrowReader; - /// - /// let file = File::open("file.parquet").unwrap(); - /// let reader = ParquetFileArrowReader::try_new(file).unwrap(); - /// - /// let bytes = Bytes::from(vec![]); - /// let reader = ParquetFileArrowReader::try_new(bytes).unwrap(); - /// ``` - pub fn try_new(chunk_reader: R) -> Result { - Self::try_new_with_options(chunk_reader, Default::default()) - } - - /// Create a new [`ParquetFileArrowReader`] with the provided [`ChunkReader`] - /// and [`ArrowReaderOptions`] - pub fn try_new_with_options( - chunk_reader: R, - options: ArrowReaderOptions, - ) -> Result { - let file_reader = Arc::new(SerializedFileReader::new(chunk_reader)?); - Ok(Self::new_with_options(file_reader, options)) - } - - /// Create a new [`ParquetFileArrowReader`] with the provided [`Arc`] - pub fn new(file_reader: Arc) -> Self { - Self::new_with_options(file_reader, Default::default()) - } - - /// Create a new [`ParquetFileArrowReader`] with the provided [`Arc`] - /// and [`ArrowReaderOptions`] - pub fn new_with_options( - file_reader: Arc, - options: ArrowReaderOptions, - ) -> Self { - Self { - file_reader, - options, - } - } - - /// Expose the reader metadata - #[deprecated = "use metadata() instead"] - pub fn get_metadata(&mut self) -> ParquetMetaData { - self.file_reader.metadata().clone() - } - - /// Returns the parquet metadata - pub fn metadata(&self) -> &ParquetMetaData { - 
self.file_reader.metadata() - } - - /// Returns the parquet schema - pub fn parquet_schema(&self) -> &SchemaDescriptor { - self.file_reader.metadata().file_metadata().schema_descr() - } - - /// Returns the key value metadata, returns `None` if [`ArrowReaderOptions::skip_arrow_metadata`] - fn get_kv_metadata(&self) -> Option<&Vec> { - if self.options.skip_arrow_metadata { - return None; - } - - self.file_reader - .metadata() - .file_metadata() - .key_value_metadata() - } -} - #[doc(hidden)] /// A newtype used within [`ReaderOptionsBuilder`] to distinguish sync readers from async pub struct SyncReader(SerializedFileReader); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 67fec4489cdd..14eb30f0b9c5 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -56,7 +56,8 @@ mod levels; /// # use bytes::Bytes; /// # use arrow_array::{ArrayRef, Int64Array}; /// # use arrow_array::RecordBatch; -/// # use parquet::arrow::{ArrowReader, ArrowWriter, ParquetFileArrowReader}; +/// # use parquet::arrow::arrow_writer::ArrowWriter; +/// # use parquet::arrow::arrow_reader::ParquetRecordBatchReader; /// let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef; /// let to_write = RecordBatch::try_from_iter([("col", col)]).unwrap(); /// @@ -65,8 +66,7 @@ mod levels; /// writer.write(&to_write).unwrap(); /// writer.close().unwrap(); /// -/// let mut reader = ParquetFileArrowReader::try_new(Bytes::from(buffer)).unwrap(); -/// let mut reader = reader.get_record_reader(1024).unwrap(); +/// let mut reader = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 1024).unwrap(); /// let read = reader.next().unwrap().unwrap(); /// /// assert_eq!(to_write, read); diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 73a4f2350047..da7e850c3d60 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -118,8 +118,6 @@ pub mod async_writer; mod record_reader; experimental!(mod schema); -#[allow(deprecated)] -pub use self::arrow_reader::{ArrowReader, ParquetFileArrowReader}; pub use self::arrow_writer::ArrowWriter; #[cfg(feature = "async")] pub use self::async_reader::ParquetRecordBatchStreamBuilder; From 9d72cc5986ed4673d35e29c9de14097e04f21446 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 29 Apr 2023 09:53:44 -0400 Subject: [PATCH 0868/1411] Make arrow_json::ReaderBuilder method names consistent (#4128) --- arrow-json/src/reader/mod.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index 9541e0372102..5f1a2bb43f8a 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -195,8 +195,16 @@ impl ReaderBuilder { Self { batch_size, ..self } } - /// Sets if the decoder should coerce primitive values (bool and number) into string when the Schema's column is Utf8 or LargeUtf8. + /// Sets if the decoder should coerce primitive values (bool and number) into string + /// when the Schema's column is Utf8 or LargeUtf8. + #[deprecated(note = "Use with_coerce_primitive")] pub fn coerce_primitive(self, coerce_primitive: bool) -> Self { + self.with_coerce_primitive(coerce_primitive) + } + + /// Sets if the decoder should coerce primitive values (bool and number) into string + /// when the Schema's column is Utf8 or LargeUtf8. 
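+    ///
+    /// A minimal, illustrative sketch (assumes the crate-root `ReaderBuilder` re-export and a
+    /// single Utf8 column named "c", so the numeric JSON value below is coerced to a string):
+    ///
+    /// ```ignore
+    /// # use std::io::Cursor;
+    /// # use std::sync::Arc;
+    /// # use arrow_json::ReaderBuilder;
+    /// # use arrow_schema::{DataType, Field, Schema};
+    /// let schema = Arc::new(Schema::new(vec![Field::new("c", DataType::Utf8, true)]));
+    /// let mut reader = ReaderBuilder::new(schema)
+    ///     .with_coerce_primitive(true)
+    ///     .build(Cursor::new(r#"{"c": 1}"#.as_bytes()))
+    ///     .unwrap();
+    /// let batch = reader.next().unwrap().unwrap();
+    /// ```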
+ pub fn with_coerce_primitive(self, coerce_primitive: bool) -> Self { Self { coerce_primitive, ..self @@ -669,7 +677,7 @@ mod tests { for batch_size in [1, 3, 100, batch_size] { unbuffered = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) - .coerce_primitive(coerce_primitive) + .with_coerce_primitive(coerce_primitive) .build(Cursor::new(buf.as_bytes())) .unwrap() .collect::, _>>() @@ -683,7 +691,7 @@ mod tests { for b in [1, 3, 5] { let buffered = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) - .coerce_primitive(coerce_primitive) + .with_coerce_primitive(coerce_primitive) .build(BufReader::with_capacity(b, Cursor::new(buf.as_bytes()))) .unwrap() .collect::, _>>() From 08dc16c9645def758e59651bed55f5aa95e2f42e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 30 Apr 2023 00:49:00 -0700 Subject: [PATCH 0869/1411] Support fixed point multiplication for DictionaryArray of Decimals (#4136) * Add multiply_fixed_point_dyn * Fix clippy * For review --- arrow-arith/src/arithmetic.rs | 252 ++++++++++++++++++++++++++++++---- 1 file changed, 222 insertions(+), 30 deletions(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 7f5a081900df..40ae3255b98c 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -1434,6 +1434,114 @@ pub fn multiply_dyn_checked( } } +#[cfg(feature = "dyn_arith_dict")] +fn get_precision_scale(dt: &DataType) -> Result<(u8, i8), ArrowError> { + match dt { + DataType::Decimal128(precision, scale) => Ok((*precision, *scale)), + _ => Err(ArrowError::ComputeError( + "Cannot get precision and scale from non-decimal type".to_string(), + )), + } +} + +/// Returns the precision and scale of the result of a multiplication of two decimal types, +/// and the divisor for fixed point multiplication. +fn get_fixed_point_info( + left: (u8, i8), + right: (u8, i8), + required_scale: i8, +) -> Result<(u8, i8, i256), ArrowError> { + let product_scale = left.1 + right.1; + let precision = min(left.0 + right.0 + 1, DECIMAL128_MAX_PRECISION); + + if required_scale > product_scale { + return Err(ArrowError::ComputeError(format!( + "Required scale {} is greater than product scale {}", + required_scale, product_scale + ))); + } + + let divisor = + i256::from_i128(10).pow_wrapping((product_scale - required_scale) as u32); + + Ok((precision, product_scale, divisor)) +} + +#[cfg(feature = "dyn_arith_dict")] +/// Perform `left * right` operation on two decimal arrays. If either left or right value is +/// null then the result is also null. +/// +/// This performs decimal multiplication which allows precision loss if an exact representation +/// is not possible for the result, according to the required scale. In the case, the result +/// will be rounded to the required scale. +/// +/// If the required scale is greater than the product scale, an error is returned. +/// +/// This doesn't detect overflow. Once overflowing, the result will wrap around. +/// +/// It is implemented for compatibility with precision loss `multiply` function provided by +/// other data processing engines. For multiplication with precision loss detection, use +/// `multiply_dyn` or `multiply_dyn_checked` instead. 
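+///
+/// A minimal, illustrative sketch (mirrors the tests below: `array1` and `array2` are assumed
+/// to be `DictionaryArray`s whose values are `Decimal128(38, 18)`, and the product is rounded
+/// to a required scale of 28, yielding a `Decimal128(38, 28)` result):
+///
+/// ```ignore
+/// let product = multiply_fixed_point_dyn(&array1, &array2, 28)?;
+/// assert_eq!(product.data_type(), &DataType::Decimal128(38, 28));
+/// ```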
+pub fn multiply_fixed_point_dyn( + left: &dyn Array, + right: &dyn Array, + required_scale: i8, +) -> Result { + match (left.data_type(), right.data_type()) { + ( + DataType::Dictionary(_, lhs_value_type), + DataType::Dictionary(_, rhs_value_type), + ) if matches!(lhs_value_type.as_ref(), &DataType::Decimal128(_, _)) + && matches!(rhs_value_type.as_ref(), &DataType::Decimal128(_, _)) => + { + downcast_dictionary_array!( + left => match left.values().data_type() { + DataType::Decimal128(_, _) => { + let lhs_precision_scale = get_precision_scale(lhs_value_type.as_ref())?; + let rhs_precision_scale = get_precision_scale(rhs_value_type.as_ref())?; + + let (precision, product_scale, divisor) = get_fixed_point_info(lhs_precision_scale, rhs_precision_scale, required_scale)?; + + let right = as_dictionary_array::<_>(right); + + if required_scale == product_scale { + let mul = multiply_dyn(left, right)?; + let array = mul.as_any().downcast_ref::().unwrap(); + let array = array.clone().with_precision_and_scale(precision, required_scale)?; + return Ok(Arc::new(array)) + } + + let array = math_op_dict::<_, Decimal128Type, _>(left, right, |a, b| { + let a = i256::from_i128(a); + let b = i256::from_i128(b); + + let mut mul = a.wrapping_mul(b); + mul = divide_and_round::(mul, divisor); + mul.as_i128() + }).and_then(|a| a.with_precision_and_scale(precision, required_scale))?; + + Ok(Arc::new(array)) + } + t => unreachable!("Unsupported dictionary value type {}", t), + }, + t => unreachable!("Unsupported data type {}", t), + ) + } + (DataType::Decimal128(_, _), DataType::Decimal128(_, _)) => { + let left = left.as_any().downcast_ref::().unwrap(); + let right = right.as_any().downcast_ref::().unwrap(); + + multiply_fixed_point(left, right, required_scale) + .map(|a| Arc::new(a) as ArrayRef) + } + (_, _) => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), + right.data_type() + ))), + } +} + /// Perform `left * right` operation on two decimal arrays. If either left or right value is /// null then the result is also null. /// @@ -1451,27 +1559,17 @@ pub fn multiply_fixed_point_checked( right: &PrimitiveArray, required_scale: i8, ) -> Result, ArrowError> { - let product_scale = left.scale() + right.scale(); - let precision = min( - left.precision() + right.precision() + 1, - DECIMAL128_MAX_PRECISION, - ); + let (precision, product_scale, divisor) = get_fixed_point_info( + (left.precision(), left.scale()), + (right.precision(), right.scale()), + required_scale, + )?; if required_scale == product_scale { return multiply_checked(left, right)? 
.with_precision_and_scale(precision, required_scale); } - if required_scale > product_scale { - return Err(ArrowError::ComputeError(format!( - "Required scale {} is greater than product scale {}", - required_scale, product_scale - ))); - } - - let divisor = - i256::from_i128(10).pow_wrapping((product_scale - required_scale) as u32); - try_binary::<_, _, _, Decimal128Type>(left, right, |a, b| { let a = i256::from_i128(a); let b = i256::from_i128(b); @@ -1505,27 +1603,17 @@ pub fn multiply_fixed_point( right: &PrimitiveArray, required_scale: i8, ) -> Result, ArrowError> { - let product_scale = left.scale() + right.scale(); - let precision = min( - left.precision() + right.precision() + 1, - DECIMAL128_MAX_PRECISION, - ); + let (precision, product_scale, divisor) = get_fixed_point_info( + (left.precision(), left.scale()), + (right.precision(), right.scale()), + required_scale, + )?; if required_scale == product_scale { return multiply(left, right)? .with_precision_and_scale(precision, required_scale); } - if required_scale > product_scale { - return Err(ArrowError::ComputeError(format!( - "Required scale {} is greater than product scale {}", - required_scale, product_scale - ))); - } - - let divisor = - i256::from_i128(10).pow_wrapping((product_scale - required_scale) as u32); - binary::<_, _, _, Decimal128Type>(left, right, |a, b| { let a = i256::from_i128(a); let b = i256::from_i128(b); @@ -3910,6 +3998,110 @@ mod tests { ); } + #[test] + #[cfg(feature = "dyn_arith_dict")] + fn test_decimal_multiply_fixed_point_dyn() { + // [123456789] + let a = Decimal128Array::from(vec![123456789000000000000000000]) + .with_precision_and_scale(38, 18) + .unwrap(); + + // [10] + let b = Decimal128Array::from(vec![10000000000000000000]) + .with_precision_and_scale(38, 18) + .unwrap(); + + // Avoid overflow by reducing the scale. 
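+        // The full product scale would be 18 + 18 = 36, at which the expected value
+        // 1234567890 no longer fits within Decimal128's 38-digit limit; a required
+        // scale of 28 keeps the result representable.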
+ let result = multiply_fixed_point_dyn(&a, &b, 28).unwrap(); + // [1234567890] + let expected = Arc::new( + Decimal128Array::from(vec![12345678900000000000000000000000000000]) + .with_precision_and_scale(38, 28) + .unwrap(), + ) as ArrayRef; + + assert_eq!(&expected, &result); + assert_eq!( + result.as_primitive::().value_as_string(0), + "1234567890.0000000000000000000000000000" + ); + + // [123456789, 10] + let a = Decimal128Array::from(vec![ + 123456789000000000000000000, + 10000000000000000000, + ]) + .with_precision_and_scale(38, 18) + .unwrap(); + + // [10, 123456789, 12] + let b = Decimal128Array::from(vec![ + 10000000000000000000, + 123456789000000000000000000, + 12000000000000000000, + ]) + .with_precision_and_scale(38, 18) + .unwrap(); + + let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(1), None]); + let array1 = DictionaryArray::new(keys, Arc::new(a)); + let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(2), None]); + let array2 = DictionaryArray::new(keys, Arc::new(b)); + + let result = multiply_fixed_point_dyn(&array1, &array2, 28).unwrap(); + let expected = Arc::new( + Decimal128Array::from(vec![ + Some(12345678900000000000000000000000000000), + Some(12345678900000000000000000000000000000), + Some(1200000000000000000000000000000), + None, + ]) + .with_precision_and_scale(38, 28) + .unwrap(), + ) as ArrayRef; + + assert_eq!(&expected, &result); + assert_eq!( + result.as_primitive::().value_as_string(0), + "1234567890.0000000000000000000000000000" + ); + assert_eq!( + result.as_primitive::().value_as_string(1), + "1234567890.0000000000000000000000000000" + ); + assert_eq!( + result.as_primitive::().value_as_string(2), + "120.0000000000000000000000000000" + ); + + // Required scale is same as the product of the input scales. Behavior is same as multiply_dyn. 
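+        // Both inputs below use scale 2, so the product scale is 2 + 2 = 4; passing
+        // required_scale = 4 takes the exact path with no rounding.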
+ let a = Decimal128Array::from(vec![123, 100]) + .with_precision_and_scale(3, 2) + .unwrap(); + + let b = Decimal128Array::from(vec![100, 123, 120]) + .with_precision_and_scale(3, 2) + .unwrap(); + + let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(1), None]); + let array1 = DictionaryArray::new(keys, Arc::new(a)); + let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(2), None]); + let array2 = DictionaryArray::new(keys, Arc::new(b)); + + let result = multiply_fixed_point_dyn(&array1, &array2, 4).unwrap(); + let expected = multiply_dyn(&array1, &array2).unwrap(); + let expected = Arc::new( + expected + .as_any() + .downcast_ref::() + .unwrap() + .clone() + .with_precision_and_scale(7, 4) + .unwrap(), + ) as ArrayRef; + assert_eq!(&expected, &result); + } + #[test] fn test_timestamp_second_add_interval() { // timestamp second + interval year month From eb5ac6953a9c24b8c5d03c95e4c49128b4e3b696 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 2 May 2023 17:41:42 +0100 Subject: [PATCH 0870/1411] Cleanup ChunkReader (#4118) (#4156) * Remove length from ChunkReader (#4118) * Remove ChunkReader::T Send bound * Remove FileSource * Tweak docs --- parquet/src/arrow/async_reader/mod.rs | 39 ++-- parquet/src/bin/parquet-layout.rs | 3 +- parquet/src/file/footer.rs | 2 +- parquet/src/file/reader.rs | 68 ++++++- parquet/src/file/serialized_reader.rs | 63 +------ parquet/src/util/io.rs | 246 -------------------------- parquet/src/util/mod.rs | 1 - 7 files changed, 88 insertions(+), 334 deletions(-) delete mode 100644 parquet/src/util/io.rs diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 09b107e31f30..248d80d1a35b 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -744,6 +744,25 @@ enum ColumnChunkData { Dense { offset: usize, data: Bytes }, } +impl ColumnChunkData { + fn get(&self, start: u64) -> Result { + match &self { + ColumnChunkData::Sparse { data, .. } => data + .binary_search_by_key(&start, |(offset, _)| *offset as u64) + .map(|idx| data[idx].1.clone()) + .map_err(|_| { + ParquetError::General(format!( + "Invalid offset in sparse column chunk data: {start}" + )) + }), + ColumnChunkData::Dense { offset, data } => { + let start = start as usize - *offset; + Ok(data.slice(start..)) + } + } + } +} + impl Length for ColumnChunkData { fn len(&self) -> u64 { match &self { @@ -756,26 +775,12 @@ impl Length for ColumnChunkData { impl ChunkReader for ColumnChunkData { type T = bytes::buf::Reader; - fn get_read(&self, start: u64, length: usize) -> Result { - Ok(self.get_bytes(start, length)?.reader()) + fn get_read(&self, start: u64) -> Result { + Ok(self.get(start)?.reader()) } fn get_bytes(&self, start: u64, length: usize) -> Result { - match &self { - ColumnChunkData::Sparse { data, .. 
} => data - .binary_search_by_key(&start, |(offset, _)| *offset as u64) - .map(|idx| data[idx].1.slice(0..length)) - .map_err(|_| { - ParquetError::General(format!( - "Invalid offset in sparse column chunk data: {start}" - )) - }), - ColumnChunkData::Dense { offset, data } => { - let start = start as usize - *offset; - let end = start + length; - Ok(data.slice(start..end)) - } - } + Ok(self.get(start)?.slice(..length)) } } diff --git a/parquet/src/bin/parquet-layout.rs b/parquet/src/bin/parquet-layout.rs index 5f71551e1f20..d749bb8a4ba7 100644 --- a/parquet/src/bin/parquet-layout.rs +++ b/parquet/src/bin/parquet-layout.rs @@ -175,8 +175,7 @@ fn read_page_header( } } - let len = reader.len().checked_sub(offset).unwrap() as usize; - let input = reader.get_read(offset, len)?; + let input = reader.get_read(offset)?; let mut tracked = TrackedRead(input, 0); let mut prot = TCompactInputProtocol::new(&mut tracked); let header = PageHeader::read_from_in_protocol(&mut prot)?; diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index a14b3ce4d6c5..7cc92afc014a 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -46,7 +46,7 @@ pub fn parse_metadata(chunk_reader: &R) -> Result Result; + type T: Read; + + /// Get a [`Read`] starting at the provided file offset + /// + /// Subsequent or concurrent calls to [`Self::get_read`] or [`Self::get_bytes`] may + /// side-effect on previously returned [`Self::T`]. Care should be taken to avoid this + /// + /// See [`File::try_clone`] for more information + fn get_read(&self, start: u64) -> Result; /// Get a range as bytes - /// This should fail if the exact number of bytes cannot be read + /// + /// Concurrent calls to [`Self::get_bytes`] may result in interleaved output + /// + /// See [`File::try_clone`] for more information + fn get_bytes(&self, start: u64, length: usize) -> Result; +} + +impl Length for File { + fn len(&self) -> u64 { + self.metadata().map(|m| m.len()).unwrap_or(0u64) + } +} + +impl ChunkReader for File { + type T = BufReader; + + fn get_read(&self, start: u64) -> Result { + let mut reader = self.try_clone()?; + reader.seek(SeekFrom::Start(start))?; + Ok(BufReader::new(self.try_clone()?)) + } + fn get_bytes(&self, start: u64, length: usize) -> Result { let mut buffer = Vec::with_capacity(length); - let read = self.get_read(start, length)?.read_to_end(&mut buffer)?; + let mut reader = self.try_clone()?; + reader.seek(SeekFrom::Start(start))?; + let read = reader.take(length as _).read_to_end(&mut buffer)?; if read != length { return Err(eof_err!( @@ -69,6 +99,26 @@ pub trait ChunkReader: Length + Send + Sync { } } +impl Length for Bytes { + fn len(&self) -> u64 { + self.len() as u64 + } +} + +impl ChunkReader for Bytes { + type T = bytes::buf::Reader; + + fn get_read(&self, start: u64) -> Result { + let start = start as usize; + Ok(self.slice(start..).reader()) + } + + fn get_bytes(&self, start: u64, length: usize) -> Result { + let start = start as usize; + Ok(self.slice(start..start + length)) + } +} + // ---------------------------------------------------------------------- // APIs for file & row group readers diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 7b08567427e6..bf843562ed02 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -40,60 +40,8 @@ use crate::format::{PageHeader, PageLocation, PageType}; use crate::record::reader::RowIter; use crate::record::Row; use crate::schema::types::Type as 
SchemaType; -use crate::util::{io::TryClone, memory::ByteBufferPtr}; -use bytes::{Buf, Bytes}; +use crate::util::memory::ByteBufferPtr; use thrift::protocol::{TCompactInputProtocol, TSerializable}; -// export `SliceableCursor` and `FileSource` publicly so clients can -// re-use the logic in their own ParquetFileWriter wrappers -pub use crate::util::io::FileSource; - -// ---------------------------------------------------------------------- -// Implementations of traits facilitating the creation of a new reader - -impl Length for File { - fn len(&self) -> u64 { - self.metadata().map(|m| m.len()).unwrap_or(0u64) - } -} - -impl TryClone for File { - fn try_clone(&self) -> std::io::Result { - self.try_clone() - } -} - -impl ChunkReader for File { - type T = FileSource; - - fn get_read(&self, start: u64, length: usize) -> Result { - Ok(FileSource::new(self, start, length)) - } -} - -impl Length for Bytes { - fn len(&self) -> u64 { - self.len() as u64 - } -} - -impl TryClone for Bytes { - fn try_clone(&self) -> std::io::Result { - Ok(self.clone()) - } -} - -impl ChunkReader for Bytes { - type T = bytes::buf::Reader; - - fn get_read(&self, start: u64, length: usize) -> Result { - Ok(self.get_bytes(start, length)?.reader()) - } - - fn get_bytes(&self, start: u64, length: usize) -> Result { - let start = start as usize; - Ok(self.slice(start..start + length)) - } -} impl TryFrom for SerializedFileReader { type Error = ParquetError; @@ -662,7 +610,7 @@ impl PageReader for SerializedPageReader { return Ok(None); } - let mut read = self.reader.get_read(*offset as u64, *remaining)?; + let mut read = self.reader.get_read(*offset as u64)?; let header = if let Some(header) = next_page_header.take() { *header } else { @@ -752,8 +700,7 @@ impl PageReader for SerializedPageReader { continue; } } else { - let mut read = - self.reader.get_read(*offset as u64, *remaining_bytes)?; + let mut read = self.reader.get_read(*offset as u64)?; let (header_len, header) = read_page_header_len(&mut read)?; *offset += header_len; *remaining_bytes -= header_len; @@ -807,8 +754,7 @@ impl PageReader for SerializedPageReader { *offset += buffered_header.compressed_page_size as usize; *remaining_bytes -= buffered_header.compressed_page_size as usize; } else { - let mut read = - self.reader.get_read(*offset as u64, *remaining_bytes)?; + let mut read = self.reader.get_read(*offset as u64)?; let (header_len, header) = read_page_header_len(&mut read)?; let data_page_size = header.compressed_page_size as usize; *offset += header_len + data_page_size; @@ -827,6 +773,7 @@ impl PageReader for SerializedPageReader { #[cfg(test)] mod tests { + use bytes::Bytes; use std::sync::Arc; use crate::format::BoundaryOrder; diff --git a/parquet/src/util/io.rs b/parquet/src/util/io.rs deleted file mode 100644 index 43d78866d9ef..000000000000 --- a/parquet/src/util/io.rs +++ /dev/null @@ -1,246 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::{cell::RefCell, cmp, fmt, io::*}; - -use crate::file::reader::Length; - -const DEFAULT_BUF_SIZE: usize = 8 * 1024; - -// ---------------------------------------------------------------------- - -/// TryClone tries to clone the type and should maintain the `Seek` position of the given -/// instance. -pub trait TryClone: Sized { - /// Clones the type returning a new instance or an error if it's not possible - /// to clone it. - fn try_clone(&self) -> Result; -} - -/// ParquetReader is the interface which needs to be fulfilled to be able to parse a -/// parquet source. -pub trait ParquetReader: Read + Seek + Length + TryClone {} -impl ParquetReader for T {} - -// Read/Write wrappers for `File`. - -/// Struct that represents a slice of a file data with independent start position and -/// length. Internally clones provided file handle, wraps with a custom implementation -/// of BufReader that resets position before any read. -/// -/// This is workaround and alternative for `file.try_clone()` method. It clones `File` -/// while preserving independent position, which is not available with `try_clone()`. -/// -/// Designed after `arrow::io::RandomAccessFile` and `std::io::BufReader` -pub struct FileSource { - reader: RefCell, - start: u64, // start position in a file - end: u64, // end position in a file - buf: Vec, // buffer where bytes read in advance are stored - buf_pos: usize, // current position of the reader in the buffer - buf_cap: usize, // current number of bytes read into the buffer -} - -impl fmt::Debug for FileSource { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("FileSource") - .field("reader", &"OPAQUE") - .field("start", &self.start) - .field("end", &self.end) - .field("buf.len", &self.buf.len()) - .field("buf_pos", &self.buf_pos) - .field("buf_cap", &self.buf_cap) - .finish() - } -} - -impl FileSource { - /// Creates new file reader with start and length from a file handle - pub fn new(fd: &R, start: u64, length: usize) -> Self { - let reader = RefCell::new(fd.try_clone().unwrap()); - Self { - reader, - start, - end: start + length as u64, - buf: vec![0_u8; DEFAULT_BUF_SIZE], - buf_pos: 0, - buf_cap: 0, - } - } - - fn fill_inner_buf(&mut self) -> Result<&[u8]> { - if self.buf_pos >= self.buf_cap { - // If we've reached the end of our internal buffer then we need to fetch - // some more data from the underlying reader. - // Branch using `>=` instead of the more correct `==` - // to tell the compiler that the pos..cap slice is always valid. 
- debug_assert!(self.buf_pos == self.buf_cap); - let mut reader = self.reader.borrow_mut(); - reader.seek(SeekFrom::Start(self.start))?; // always seek to start before reading - self.buf_cap = reader.read(&mut self.buf)?; - self.buf_pos = 0; - } - Ok(&self.buf[self.buf_pos..self.buf_cap]) - } - - fn skip_inner_buf(&mut self, buf: &mut [u8]) -> Result { - // discard buffer - self.buf_pos = 0; - self.buf_cap = 0; - // read directly into param buffer - let mut reader = self.reader.borrow_mut(); - reader.seek(SeekFrom::Start(self.start))?; // always seek to start before reading - let nread = reader.read(buf)?; - self.start += nread as u64; - Ok(nread) - } -} - -impl Read for FileSource { - fn read(&mut self, buf: &mut [u8]) -> Result { - let bytes_to_read = cmp::min(buf.len(), (self.end - self.start) as usize); - let buf = &mut buf[0..bytes_to_read]; - - // If we don't have any buffered data and we're doing a massive read - // (larger than our internal buffer), bypass our internal buffer - // entirely. - if self.buf_pos == self.buf_cap && buf.len() >= self.buf.len() { - return self.skip_inner_buf(buf); - } - let nread = { - let mut rem = self.fill_inner_buf()?; - // copy the data from the inner buffer to the param buffer - rem.read(buf)? - }; - // consume from buffer - self.buf_pos = cmp::min(self.buf_pos + nread, self.buf_cap); - - self.start += nread as u64; - Ok(nread) - } -} - -impl Length for FileSource { - fn len(&self) -> u64 { - self.end - self.start - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use std::iter; - - use crate::util::test_common::file_util::get_test_file; - - #[test] - fn test_io_read_fully() { - let mut buf = vec![0; 8]; - let mut src = FileSource::new(&get_test_file("alltypes_plain.parquet"), 0, 4); - - let bytes_read = src.read(&mut buf[..]).unwrap(); - assert_eq!(bytes_read, 4); - assert_eq!(buf, vec![b'P', b'A', b'R', b'1', 0, 0, 0, 0]); - } - - #[test] - fn test_io_read_in_chunks() { - let mut buf = vec![0; 4]; - let mut src = FileSource::new(&get_test_file("alltypes_plain.parquet"), 0, 4); - - let bytes_read = src.read(&mut buf[0..2]).unwrap(); - assert_eq!(bytes_read, 2); - let bytes_read = src.read(&mut buf[2..]).unwrap(); - assert_eq!(bytes_read, 2); - assert_eq!(buf, vec![b'P', b'A', b'R', b'1']); - } - - #[test] - fn test_io_read_pos() { - let mut src = FileSource::new(&get_test_file("alltypes_plain.parquet"), 0, 4); - - let _ = src.read(&mut [0; 1]).unwrap(); - assert_eq!(src.start, 1); - - let _ = src.read(&mut [0; 4]).unwrap(); - assert_eq!(src.start, 4); - } - - #[test] - fn test_io_read_over_limit() { - let mut src = FileSource::new(&get_test_file("alltypes_plain.parquet"), 0, 4); - - // Read all bytes from source - let _ = src.read(&mut [0; 128]).unwrap(); - assert_eq!(src.start, 4); - - // Try reading again, should return 0 bytes. 
- let bytes_read = src.read(&mut [0; 128]).unwrap(); - assert_eq!(bytes_read, 0); - assert_eq!(src.start, 4); - } - - #[test] - fn test_io_seek_switch() { - let mut buf = vec![0; 4]; - let mut file = get_test_file("alltypes_plain.parquet"); - let mut src = FileSource::new(&file, 0, 4); - - file.seek(SeekFrom::Start(5_u64)) - .expect("File seek to a position"); - - let bytes_read = src.read(&mut buf[..]).unwrap(); - assert_eq!(bytes_read, 4); - assert_eq!(buf, vec![b'P', b'A', b'R', b'1']); - } - - #[test] - fn test_io_large_read() { - // Generate repeated 'abcdef' pattern and write it into a file - let patterned_data: Vec = iter::repeat(vec![0, 1, 2, 3, 4, 5]) - .flatten() - .take(3 * DEFAULT_BUF_SIZE) - .collect(); - - let mut file = tempfile::tempfile().unwrap(); - file.write_all(&patterned_data).unwrap(); - - // seek the underlying file to the first 'd' - file.seek(SeekFrom::Start(3)).unwrap(); - - // create the FileSource reader that starts at pos 1 ('b') - let mut chunk = FileSource::new(&file, 1, patterned_data.len() - 1); - - // read the 'b' at pos 1 - let mut res = vec![0u8; 1]; - chunk.read_exact(&mut res).unwrap(); - assert_eq!(res, &[1]); - - // the underlying file is sought to 'e' - file.seek(SeekFrom::Start(4)).unwrap(); - - // now read large chunk that starts with 'c' (after 'b') - let mut res = vec![0u8; 2 * DEFAULT_BUF_SIZE]; - chunk.read_exact(&mut res).unwrap(); - assert_eq!( - res, - &patterned_data[2..2 + 2 * DEFAULT_BUF_SIZE], - "read buf and original data are not equal" - ); - } -} diff --git a/parquet/src/util/mod.rs b/parquet/src/util/mod.rs index 5f43023941fd..d96a62a9f363 100644 --- a/parquet/src/util/mod.rs +++ b/parquet/src/util/mod.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-pub mod io; pub mod memory; #[macro_use] pub mod bit_util; From 547cb80ad02b77033f2ef20d21d2f94cd0507096 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E5=B0=8F=E5=88=9A?= <35674070+suxiaogang223@users.noreply.github.com> Date: Thu, 4 May 2023 04:20:10 +0800 Subject: [PATCH 0871/1411] Support Compression in parquet-fromcsv (#4160) * parquet-fromcsv support read compressed * add test * fix for clippy * fix label error * add dependences for parquet_fromcsv * Unified import format of decompression packages --- parquet/Cargo.toml | 2 +- parquet/src/bin/parquet-fromcsv-help.txt | 7 +- parquet/src/bin/parquet-fromcsv.rs | 126 +++++++++++++++++++++-- 3 files changed, 123 insertions(+), 12 deletions(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index ef5ea8cd15d9..84142824e372 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -133,7 +133,7 @@ required-features = ["cli"] [[bin]] name = "parquet-fromcsv" -required-features = ["arrow", "cli"] +required-features = ["arrow", "cli", "snap", "brotli", "flate2", "lz4", "zstd"] [[bin]] name = "parquet-show-bloom-filter" diff --git a/parquet/src/bin/parquet-fromcsv-help.txt b/parquet/src/bin/parquet-fromcsv-help.txt index 44d75f5a036d..ac38c5689279 100644 --- a/parquet/src/bin/parquet-fromcsv-help.txt +++ b/parquet/src/bin/parquet-fromcsv-help.txt @@ -47,8 +47,13 @@ Options: [possible values: true, false] + -C, --csv-compression + compression mode of csv + + [default: UNCOMPRESSED] + -c, --parquet-compression - compression mode + compression mode of parquet [default: SNAPPY] diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index 4e96fb87851b..f2a911c00301 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -61,6 +61,7 @@ //! ```text //! - `-i`, `--input-file` : Path to input CSV file //! - `-f`, `--input-format` : Dialect for input file, `csv` or `tsv`. +//! - `-C`, `--csv-compression` : Compression option for csv, default is UNCOMPRESSED //! - `-d`, `--delimiter : Field delimiter for CSV file, default depends `--input-format` //! - `-e`, `--escape` : Escape character for input file //! 
- `-h`, `--has-header` : Input has header @@ -72,6 +73,7 @@ use std::{ fmt::Display, fs::{read_to_string, File}, + io::Read, path::{Path, PathBuf}, sync::Arc, }; @@ -193,7 +195,10 @@ struct Args { quote_char: Option, #[clap(short('D'), long, help("double quote"))] double_quote: Option, - #[clap(short('c'), long, help("compression mode"), default_value_t=Compression::SNAPPY)] + #[clap(short('C'), long, help("compression mode of csv"), default_value_t=Compression::UNCOMPRESSED)] + #[clap(value_parser=compression_from_str)] + csv_compression: Compression, + #[clap(short('c'), long, help("compression mode of parquet"), default_value_t=Compression::SNAPPY)] #[clap(value_parser=compression_from_str)] parquet_compression: Compression, @@ -368,9 +373,31 @@ fn convert_csv_to_parquet(args: &Args) -> Result<(), ParquetFromCsvError> { &format!("Failed to open input file {:#?}", &args.input_file), ) })?; + + // open input file decoder + let input_file_decoder = match args.csv_compression { + Compression::UNCOMPRESSED => Box::new(input_file) as Box, + Compression::SNAPPY => { + Box::new(snap::read::FrameDecoder::new(input_file)) as Box + } + Compression::GZIP(_) => { + Box::new(flate2::read::GzDecoder::new(input_file)) as Box + } + Compression::BROTLI(_) => { + Box::new(brotli::Decompressor::new(input_file, 0)) as Box + } + Compression::LZ4 => Box::new(lz4::Decoder::new(input_file).map_err(|e| { + ParquetFromCsvError::with_context(e, "Failed to create lz4::Decoder") + })?) as Box, + Compression::ZSTD(_) => Box::new(zstd::Decoder::new(input_file).map_err(|e| { + ParquetFromCsvError::with_context(e, "Failed to create zstd::Decoder") + })?) as Box, + d => unimplemented!("compression type {d}"), + }; + // create input csv reader let builder = configure_reader_builder(args, arrow_schema); - let reader = builder.build(input_file)?; + let reader = builder.build(input_file_decoder)?; for batch_result in reader { let batch = batch_result.map_err(|e| { ParquetFromCsvError::with_context(e, "Failed to read RecordBatch from CSV") @@ -393,13 +420,17 @@ fn main() -> Result<(), ParquetFromCsvError> { #[cfg(test)] mod tests { use std::{ - io::{Seek, Write}, + io::Write, path::{Path, PathBuf}, }; use super::*; use arrow::datatypes::{DataType, Field}; + use brotli::CompressorWriter; use clap::{CommandFactory, Parser}; + use flate2::write::GzEncoder; + use parquet::basic::{BrotliLevel, GzipLevel, ZstdLevel}; + use snap::write::FrameEncoder; use tempfile::NamedTempFile; #[test] @@ -558,6 +589,7 @@ mod tests { escape_char: None, quote_char: None, double_quote: None, + csv_compression: Compression::UNCOMPRESSED, parquet_compression: Compression::SNAPPY, writer_version: None, max_row_group_size: None, @@ -593,6 +625,7 @@ mod tests { escape_char: Some('\\'), quote_char: None, double_quote: None, + csv_compression: Compression::UNCOMPRESSED, parquet_compression: Compression::SNAPPY, writer_version: None, max_row_group_size: None, @@ -616,8 +649,7 @@ mod tests { assert_debug_text(&builder_debug, "escape", "Some(92)"); } - #[test] - fn test_convert_csv_to_parquet() { + fn test_convert_compressed_csv_to_parquet(csv_compression: Compression) { let schema = NamedTempFile::new().unwrap(); let schema_text = r"message schema { optional int32 id; @@ -626,14 +658,71 @@ mod tests { schema.as_file().write_all(schema_text.as_bytes()).unwrap(); let mut input_file = NamedTempFile::new().unwrap(); - { - let csv = input_file.as_file_mut(); + + fn write_tmp_file(w: &mut T) { for index in 1..2000 { - write!(csv, 
"{index},\"name_{index}\"\r\n").unwrap(); + write!(w, "{index},\"name_{index}\"\r\n").unwrap(); } - csv.flush().unwrap(); - csv.rewind().unwrap(); + w.flush().unwrap(); } + + // make sure the input_file's lifetime being long enough + input_file = match csv_compression { + Compression::UNCOMPRESSED => { + write_tmp_file(&mut input_file); + input_file + } + Compression::SNAPPY => { + let mut encoder = FrameEncoder::new(input_file); + write_tmp_file(&mut encoder); + encoder.into_inner().unwrap() + } + Compression::GZIP(level) => { + let mut encoder = GzEncoder::new( + input_file, + flate2::Compression::new(level.compression_level()), + ); + write_tmp_file(&mut encoder); + encoder.finish().unwrap() + } + Compression::BROTLI(level) => { + let mut encoder = + CompressorWriter::new(input_file, 0, level.compression_level(), 0); + write_tmp_file(&mut encoder); + encoder.into_inner() + } + Compression::LZ4 => { + let mut encoder = lz4::EncoderBuilder::new() + .build(input_file) + .map_err(|e| { + ParquetFromCsvError::with_context( + e, + "Failed to create lz4::Encoder", + ) + }) + .unwrap(); + write_tmp_file(&mut encoder); + let (inner, err) = encoder.finish(); + err.unwrap(); + inner + } + + Compression::ZSTD(level) => { + let mut encoder = + zstd::Encoder::new(input_file, level.compression_level()) + .map_err(|e| { + ParquetFromCsvError::with_context( + e, + "Failed to create zstd::Encoder", + ) + }) + .unwrap(); + write_tmp_file(&mut encoder); + encoder.finish().unwrap() + } + d => unimplemented!("compression type {d}"), + }; + let output_parquet = NamedTempFile::new().unwrap(); let args = Args { @@ -648,6 +737,7 @@ mod tests { escape_char: None, quote_char: None, double_quote: None, + csv_compression, parquet_compression: Compression::SNAPPY, writer_version: None, max_row_group_size: None, @@ -657,4 +747,20 @@ mod tests { }; convert_csv_to_parquet(&args).unwrap(); } + + #[test] + fn test_convert_csv_to_parquet() { + test_convert_compressed_csv_to_parquet(Compression::UNCOMPRESSED); + test_convert_compressed_csv_to_parquet(Compression::SNAPPY); + test_convert_compressed_csv_to_parquet(Compression::GZIP( + GzipLevel::try_new(1).unwrap(), + )); + test_convert_compressed_csv_to_parquet(Compression::BROTLI( + BrotliLevel::try_new(2).unwrap(), + )); + test_convert_compressed_csv_to_parquet(Compression::LZ4); + test_convert_compressed_csv_to_parquet(Compression::ZSTD( + ZstdLevel::try_new(1).unwrap(), + )); + } } From c4ac758e13cbd76651d16d06e786bdf7c04f902a Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Thu, 4 May 2023 09:58:43 +0300 Subject: [PATCH 0872/1411] feat: support bitwise shift left/right with scalars (#4159) --- arrow-arith/src/bitwise.rs | 91 ++++++++++++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 19 deletions(-) diff --git a/arrow-arith/src/bitwise.rs b/arrow-arith/src/bitwise.rs index f9f456bf95fc..a5dec4638703 100644 --- a/arrow-arith/src/bitwise.rs +++ b/arrow-arith/src/bitwise.rs @@ -22,7 +22,7 @@ use arrow_schema::ArrowError; use num::traits::{WrappingShl, WrappingShr}; use std::ops::{BitAnd, BitOr, BitXor, Not}; -// The helper function for bitwise operation with two array +/// The helper function for bitwise operation with two array fn bitwise_op( left: &PrimitiveArray, right: &PrimitiveArray, @@ -74,6 +74,38 @@ where bitwise_op(left, right, |a, b| a ^ b) } +/// Perform bitwise `left << right` operation on two arrays. If either left or right value is null +/// then the result is also null. 
+pub fn bitwise_shift_left( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result, ArrowError> +where + T: ArrowNumericType, + T::Native: WrappingShl, +{ + bitwise_op(left, right, |a, b| { + let b = b.as_usize(); + a.wrapping_shl(b as u32) + }) +} + +/// Perform bitwise `left >> right` operation on two arrays. If either left or right value is null +/// then the result is also null. +pub fn bitwise_shift_right( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result, ArrowError> +where + T: ArrowNumericType, + T::Native: WrappingShr, +{ + bitwise_op(left, right, |a, b| { + let b = b.as_usize(); + a.wrapping_shr(b as u32) + }) +} + /// Perform `!array` operation on array. If array value is null /// then the result is also null. pub fn bitwise_not(array: &PrimitiveArray) -> Result, ArrowError> @@ -123,36 +155,36 @@ where Ok(unary(array, |value| value ^ scalar)) } -/// Perform bitwise 'left << right' operation on two arrays. If either left or right value is null -/// then the result is also null. -pub fn bitwise_shift_left( - left: &PrimitiveArray, - right: &PrimitiveArray, +/// Perform bitwise `left << right` every value in an array with the scalar. If any value in the array is null then the +/// result is also null. +pub fn bitwise_shift_left_scalar( + array: &PrimitiveArray, + scalar: T::Native, ) -> Result, ArrowError> where T: ArrowNumericType, T::Native: WrappingShl, { - bitwise_op(left, right, |a, b| { - let b = b.as_usize(); - a.wrapping_shl(b as u32) - }) + Ok(unary(array, |value| { + let scalar = scalar.as_usize(); + value.wrapping_shl(scalar as u32) + })) } -/// Perform bitwise 'left >> right' operation on two arrays. If either left or right value is null -/// then the result is also null. -pub fn bitwise_shift_right( - left: &PrimitiveArray, - right: &PrimitiveArray, +/// Perform bitwise `left >> right` every value in an array with the scalar. If any value in the array is null then the +/// result is also null. 
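+///
+/// A minimal, illustrative sketch (mirrors the test below: 32 >> 2 = 8, 3 >> 2 = 0,
+/// and nulls are preserved):
+///
+/// ```ignore
+/// let array = UInt64Array::from(vec![Some(32), None, Some(3)]);
+/// let shifted = bitwise_shift_right_scalar(&array, 2)?; // [8, None, 0]
+/// ```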
+pub fn bitwise_shift_right_scalar( + array: &PrimitiveArray, + scalar: T::Native, ) -> Result, ArrowError> where T: ArrowNumericType, T::Native: WrappingShr, { - bitwise_op(left, right, |a, b| { - let b = b.as_usize(); - a.wrapping_shr(b as u32) - }) + Ok(unary(array, |value| { + let scalar = scalar.as_usize(); + value.wrapping_shr(scalar as u32) + })) } #[cfg(test)] @@ -188,6 +220,16 @@ mod tests { assert_eq!(expected, result); } + #[test] + fn test_bitwise_shift_left_scalar() { + let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4), Some(8)]); + let scalar = 2; + let expected = + UInt64Array::from(vec![Some(4), Some(8), None, Some(16), Some(32)]); + let result = bitwise_shift_left_scalar(&left, scalar).unwrap(); + assert_eq!(expected, result); + } + #[test] fn test_bitwise_shift_right() { let left = @@ -199,6 +241,17 @@ mod tests { assert_eq!(expected, result); } + #[test] + fn test_bitwise_shift_right_scalar() { + let left = + UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(3)]); + let scalar = 2; + let expected = + UInt64Array::from(vec![Some(8), Some(512), None, Some(4096), Some(0)]); + let result = bitwise_shift_right_scalar(&left, scalar).unwrap(); + assert_eq!(expected, result); + } + #[test] fn test_bitwise_and_array_scalar() { // unsigned value From b6174150399f6c0e13b5dfb8ad283468a2c6c31a Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner <14581281+iajoiner@users.noreply.github.com> Date: Fri, 5 May 2023 09:05:15 -0400 Subject: [PATCH 0873/1411] Prep for 39.0.0 (#4171) * feat: update versions * feat: update versions * feat: update versions * docs: update changelog --- CHANGELOG-old.md | 76 +++++++++++++++++++++++++ CHANGELOG.md | 98 ++++++++++++++------------------ Cargo.toml | 32 +++++------ arrow-flight/README.md | 2 +- arrow/README.md | 2 +- dev/release/README.md | 2 +- dev/release/update_change_log.sh | 4 +- parquet_derive/README.md | 4 +- 8 files changed, 141 insertions(+), 79 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index e04f0f5d2762..fa932b103615 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,82 @@ # Historical Changelog +## [38.0.0](https://github.com/apache/arrow-rs/tree/38.0.0) (2023-04-21) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/37.0.0...38.0.0) + +**Breaking changes:** + +- Remove DataType from PrimitiveArray constructors [\#4098](https://github.com/apache/arrow-rs/pull/4098) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use Into\\> for PrimitiveArray::with\_timezone [\#4097](https://github.com/apache/arrow-rs/pull/4097) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Store StructArray entries in MapArray [\#4085](https://github.com/apache/arrow-rs/pull/4085) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add DictionaryArray Constructors \(\#3879\) [\#4068](https://github.com/apache/arrow-rs/pull/4068) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Relax JSON schema inference generics [\#4063](https://github.com/apache/arrow-rs/pull/4063) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove ArrayData from Array \(\#3880\) 
[\#4061](https://github.com/apache/arrow-rs/pull/4061) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add CommandGetXdbcTypeInfo to Flight SQL Server [\#4055](https://github.com/apache/arrow-rs/pull/4055) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([c-thiel](https://github.com/c-thiel)) +- Remove old JSON Reader and Decoder \(\#3610\) [\#4052](https://github.com/apache/arrow-rs/pull/4052) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use BufRead for JSON Schema Inference [\#4041](https://github.com/apache/arrow-rs/pull/4041) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([WenyXu](https://github.com/WenyXu)) + +**Implemented enhancements:** + +- Support dyn\_compare\_scalar for Decimal256 [\#4083](https://github.com/apache/arrow-rs/issues/4083) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Better JSON Reader Error Messages [\#4076](https://github.com/apache/arrow-rs/issues/4076) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Additional data type groups [\#4056](https://github.com/apache/arrow-rs/issues/4056) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Async JSON reader [\#4043](https://github.com/apache/arrow-rs/issues/4043) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Field::contains Should Recurse into DataType [\#4029](https://github.com/apache/arrow-rs/issues/4029) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Prevent UnionArray with Repeated Type IDs [\#3982](https://github.com/apache/arrow-rs/issues/3982) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `Timestamp` `+`/`-` `Interval` types [\#3963](https://github.com/apache/arrow-rs/issues/3963) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- First-Class Array Abstractions [\#3880](https://github.com/apache/arrow-rs/issues/3880) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Fixed bugs:** + +- Update readme to remove reference to Jira [\#4091](https://github.com/apache/arrow-rs/issues/4091) +- OffsetBuffer::new Rejects 0 Offsets [\#4066](https://github.com/apache/arrow-rs/issues/4066) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet AsyncArrowWriter not shutting down inner async writer. 
[\#4058](https://github.com/apache/arrow-rs/issues/4058) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Flight SQL Server missing command type.googleapis.com/arrow.flight.protocol.sql.CommandGetXdbcTypeInfo [\#4054](https://github.com/apache/arrow-rs/issues/4054) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- RawJsonReader Errors with Empty Schema [\#4053](https://github.com/apache/arrow-rs/issues/4053) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RawJsonReader Integer Truncation [\#4049](https://github.com/apache/arrow-rs/issues/4049) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Sparse UnionArray Equality Incorrect Offset Handling [\#4044](https://github.com/apache/arrow-rs/issues/4044) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Write blog about improvements in JSON and CSV processing [\#4062](https://github.com/apache/arrow-rs/issues/4062) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Parquet reader of Int96 columns and coercion to timestamps [\#4075](https://github.com/apache/arrow-rs/issues/4075) +- Serializing timestamp from int \(json raw decoder\) [\#4069](https://github.com/apache/arrow-rs/issues/4069) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support casting to/from Interval and Duration [\#3998](https://github.com/apache/arrow-rs/issues/3998) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Fix Docs Typos [\#4100](https://github.com/apache/arrow-rs/pull/4100) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([rnarkk](https://github.com/rnarkk)) +- Update tonic-build requirement from =0.9.1 to =0.9.2 [\#4099](https://github.com/apache/arrow-rs/pull/4099) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Increase minimum chrono version to 0.4.24 [\#4093](https://github.com/apache/arrow-rs/pull/4093) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Simplify reference to GitHub issues [\#4092](https://github.com/apache/arrow-rs/pull/4092) ([bkmgit](https://github.com/bkmgit)) +- \[Minor\]: Add `Hash` trait to SortOptions. 
[\#4089](https://github.com/apache/arrow-rs/pull/4089) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mustafasrepo](https://github.com/mustafasrepo)) +- Include byte offsets in parquet-layout [\#4086](https://github.com/apache/arrow-rs/pull/4086) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- feat: Support dyn\_compare\_scalar for Decimal256 [\#4084](https://github.com/apache/arrow-rs/pull/4084) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Add ByteArray constructors \(\#3879\) [\#4081](https://github.com/apache/arrow-rs/pull/4081) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update prost-build requirement from =0.11.8 to =0.11.9 [\#4080](https://github.com/apache/arrow-rs/pull/4080) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Improve JSON decoder errors \(\#4076\) [\#4079](https://github.com/apache/arrow-rs/pull/4079) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix Timestamp Numeric Truncation in JSON Reader [\#4074](https://github.com/apache/arrow-rs/pull/4074) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Serialize numeric to tape \(\#4069\) [\#4073](https://github.com/apache/arrow-rs/pull/4073) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: Prevent UnionArray with Repeated Type IDs [\#4070](https://github.com/apache/arrow-rs/pull/4070) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Add PrimitiveArray::try\_new \(\#3879\) [\#4067](https://github.com/apache/arrow-rs/pull/4067) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add ListArray Constructors \(\#3879\) [\#4065](https://github.com/apache/arrow-rs/pull/4065) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Shutdown parquet async writer [\#4059](https://github.com/apache/arrow-rs/pull/4059) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kindly](https://github.com/kindly)) +- feat: additional data type groups [\#4057](https://github.com/apache/arrow-rs/pull/4057) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Fix precision loss in Raw JSON decoder \(\#4049\) [\#4051](https://github.com/apache/arrow-rs/pull/4051) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use lexical\_core in CSV and JSON parser \(~25% faster\) [\#4050](https://github.com/apache/arrow-rs/pull/4050) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add offsets accessors to variable length arrays \(\#3879\) [\#4048](https://github.com/apache/arrow-rs/pull/4048) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Document Async decoder usage \(\#4043\) \(\#78\) [\#4046](https://github.com/apache/arrow-rs/pull/4046) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix sparse union array equality 
\(\#4044\) [\#4045](https://github.com/apache/arrow-rs/pull/4045) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: DataType::contains support nested type [\#4042](https://github.com/apache/arrow-rs/pull/4042) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- feat: Support Timestamp +/- Interval types [\#4038](https://github.com/apache/arrow-rs/pull/4038) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Fix object\_store CI [\#4037](https://github.com/apache/arrow-rs/pull/4037) ([tustvold](https://github.com/tustvold)) +- feat: cast from/to interval and duration [\#4020](https://github.com/apache/arrow-rs/pull/4020) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) + ## [37.0.0](https://github.com/apache/arrow-rs/tree/37.0.0) (2023-04-07) [Full Changelog](https://github.com/apache/arrow-rs/compare/36.0.0...37.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5dfa46ea012..d5b6293b67b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,81 +19,67 @@ # Changelog -## [38.0.0](https://github.com/apache/arrow-rs/tree/38.0.0) (2023-04-21) +## [39.0.0](https://github.com/apache/arrow-rs/tree/39.0.0) (2023-05-05) -[Full Changelog](https://github.com/apache/arrow-rs/compare/37.0.0...38.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/38.0.0...39.0.0) **Breaking changes:** -- Remove DataType from PrimitiveArray constructors [\#4098](https://github.com/apache/arrow-rs/pull/4098) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use Into\\> for PrimitiveArray::with\_timezone [\#4097](https://github.com/apache/arrow-rs/pull/4097) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Store StructArray entries in MapArray [\#4085](https://github.com/apache/arrow-rs/pull/4085) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add DictionaryArray Constructors \(\#3879\) [\#4068](https://github.com/apache/arrow-rs/pull/4068) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Relax JSON schema inference generics [\#4063](https://github.com/apache/arrow-rs/pull/4063) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Remove ArrayData from Array \(\#3880\) [\#4061](https://github.com/apache/arrow-rs/pull/4061) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add CommandGetXdbcTypeInfo to Flight SQL Server [\#4055](https://github.com/apache/arrow-rs/pull/4055) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([c-thiel](https://github.com/c-thiel)) -- Remove old JSON Reader and Decoder \(\#3610\) [\#4052](https://github.com/apache/arrow-rs/pull/4052) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use BufRead for JSON Schema Inference [\#4041](https://github.com/apache/arrow-rs/pull/4041) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([WenyXu](https://github.com/WenyXu)) +- Cleanup ChunkReader \(\#4118\) [\#4156](https://github.com/apache/arrow-rs/pull/4156) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Remove Type from NativeIndex [\#4146](https://github.com/apache/arrow-rs/pull/4146) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Don't Duplicate Offset Index on RowGroupMetadata [\#4142](https://github.com/apache/arrow-rs/pull/4142) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Return BooleanBuffer from BooleanBufferBuilder [\#4140](https://github.com/apache/arrow-rs/pull/4140) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup CSV schema inference \(\#4129\) \(\#4130\) [\#4133](https://github.com/apache/arrow-rs/pull/4133) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove deprecated parquet ArrowReader [\#4125](https://github.com/apache/arrow-rs/pull/4125) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- refactor: construct `StructArray` w/ `FieldRef` [\#4116](https://github.com/apache/arrow-rs/pull/4116) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Ignore Field Metadata in equals\_datatype for Dictionary, RunEndEncoded, Map and Union [\#4111](https://github.com/apache/arrow-rs/pull/4111) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Add StructArray Constructors \(\#3879\) [\#4064](https://github.com/apache/arrow-rs/pull/4064) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Support dyn\_compare\_scalar for Decimal256 [\#4083](https://github.com/apache/arrow-rs/issues/4083) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Better JSON Reader Error Messages [\#4076](https://github.com/apache/arrow-rs/issues/4076) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Additional data type groups [\#4056](https://github.com/apache/arrow-rs/issues/4056) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Async JSON reader [\#4043](https://github.com/apache/arrow-rs/issues/4043) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Field::contains Should Recurse into DataType [\#4029](https://github.com/apache/arrow-rs/issues/4029) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Prevent UnionArray with Repeated Type IDs [\#3982](https://github.com/apache/arrow-rs/issues/3982) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `Timestamp` `+`/`-` `Interval` types [\#3963](https://github.com/apache/arrow-rs/issues/3963) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- First-Class Array Abstractions [\#3880](https://github.com/apache/arrow-rs/issues/3880) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- InMemory append API [\#4152](https://github.com/apache/arrow-rs/issues/4152) +- Fixed point decimal multiplication for DictionaryArray [\#4135](https://github.com/apache/arrow-rs/issues/4135) +- Remove Seek Requirement from CSV ReaderBuilder [\#4130](https://github.com/apache/arrow-rs/issues/4130) +- Inconsistent CSV Inference and Parsing DateTime Handling [\#4129](https://github.com/apache/arrow-rs/issues/4129) +- Support accessing ipc Reader/Writer inner by reference [\#4121](https://github.com/apache/arrow-rs/issues/4121) +- \[object\_store\] Retry requests on connection error [\#4119](https://github.com/apache/arrow-rs/issues/4119) +- Add Type Declarations for All Primitive Tensors and Buffer Builders [\#4112](https://github.com/apache/arrow-rs/issues/4112) +- Support `Interval + Timestamp` and `Interval + Date` in addition to `Timestamp + Interval` and `Interval + Date` [\#4094](https://github.com/apache/arrow-rs/issues/4094) +- Enable setting FlightDescriptor on FlightDataEncoderBuilder [\#3855](https://github.com/apache/arrow-rs/issues/3855) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Fixed bugs:** -- Update readme to remove reference to Jira [\#4091](https://github.com/apache/arrow-rs/issues/4091) -- OffsetBuffer::new Rejects 0 Offsets [\#4066](https://github.com/apache/arrow-rs/issues/4066) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet AsyncArrowWriter not shutting down inner async writer. [\#4058](https://github.com/apache/arrow-rs/issues/4058) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Flight SQL Server missing command type.googleapis.com/arrow.flight.protocol.sql.CommandGetXdbcTypeInfo [\#4054](https://github.com/apache/arrow-rs/issues/4054) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- RawJsonReader Errors with Empty Schema [\#4053](https://github.com/apache/arrow-rs/issues/4053) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- RawJsonReader Integer Truncation [\#4049](https://github.com/apache/arrow-rs/issues/4049) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Sparse UnionArray Equality Incorrect Offset Handling [\#4044](https://github.com/apache/arrow-rs/issues/4044) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet Page Index Reader Assumes Consecutive Offsets [\#4149](https://github.com/apache/arrow-rs/issues/4149) +- Equality of nested data types [\#4110](https://github.com/apache/arrow-rs/issues/4110) **Documentation updates:** -- Write blog about improvements in JSON and CSV processing [\#4062](https://github.com/apache/arrow-rs/issues/4062) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve Documentation of Parquet ChunkReader [\#4118](https://github.com/apache/arrow-rs/issues/4118) **Closed issues:** -- Parquet reader of Int96 columns and coercion to timestamps [\#4075](https://github.com/apache/arrow-rs/issues/4075) -- Serializing timestamp from int \(json raw decoder\) [\#4069](https://github.com/apache/arrow-rs/issues/4069) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support casting to/from Interval and Duration [\#3998](https://github.com/apache/arrow-rs/issues/3998) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- add specific error log for empty JSON array 
[\#4105](https://github.com/apache/arrow-rs/issues/4105) **Merged pull requests:** -- Fix Docs Typos [\#4100](https://github.com/apache/arrow-rs/pull/4100) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([rnarkk](https://github.com/rnarkk)) -- Update tonic-build requirement from =0.9.1 to =0.9.2 [\#4099](https://github.com/apache/arrow-rs/pull/4099) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Increase minimum chrono version to 0.4.24 [\#4093](https://github.com/apache/arrow-rs/pull/4093) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Simplify reference to GitHub issues [\#4092](https://github.com/apache/arrow-rs/pull/4092) ([bkmgit](https://github.com/bkmgit)) -- \[Minor\]: Add `Hash` trait to SortOptions. [\#4089](https://github.com/apache/arrow-rs/pull/4089) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mustafasrepo](https://github.com/mustafasrepo)) -- Include byte offsets in parquet-layout [\#4086](https://github.com/apache/arrow-rs/pull/4086) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- feat: Support dyn\_compare\_scalar for Decimal256 [\#4084](https://github.com/apache/arrow-rs/pull/4084) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Add ByteArray constructors \(\#3879\) [\#4081](https://github.com/apache/arrow-rs/pull/4081) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update prost-build requirement from =0.11.8 to =0.11.9 [\#4080](https://github.com/apache/arrow-rs/pull/4080) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Improve JSON decoder errors \(\#4076\) [\#4079](https://github.com/apache/arrow-rs/pull/4079) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix Timestamp Numeric Truncation in JSON Reader [\#4074](https://github.com/apache/arrow-rs/pull/4074) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Serialize numeric to tape \(\#4069\) [\#4073](https://github.com/apache/arrow-rs/pull/4073) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat: Prevent UnionArray with Repeated Type IDs [\#4070](https://github.com/apache/arrow-rs/pull/4070) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- Add PrimitiveArray::try\_new \(\#3879\) [\#4067](https://github.com/apache/arrow-rs/pull/4067) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add ListArray Constructors \(\#3879\) [\#4065](https://github.com/apache/arrow-rs/pull/4065) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Shutdown parquet async writer [\#4059](https://github.com/apache/arrow-rs/pull/4059) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([kindly](https://github.com/kindly)) -- feat: additional data type groups [\#4057](https://github.com/apache/arrow-rs/pull/4057) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([izveigor](https://github.com/izveigor)) -- Fix precision loss in Raw JSON decoder \(\#4049\) [\#4051](https://github.com/apache/arrow-rs/pull/4051) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use lexical\_core in CSV and JSON parser \(~25% faster\) [\#4050](https://github.com/apache/arrow-rs/pull/4050) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add offsets accessors to variable length arrays \(\#3879\) [\#4048](https://github.com/apache/arrow-rs/pull/4048) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Document Async decoder usage \(\#4043\) \(\#78\) [\#4046](https://github.com/apache/arrow-rs/pull/4046) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix sparse union array equality \(\#4044\) [\#4045](https://github.com/apache/arrow-rs/pull/4045) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat: DataType::contains support nested type [\#4042](https://github.com/apache/arrow-rs/pull/4042) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- feat: Support Timestamp +/- Interval types [\#4038](https://github.com/apache/arrow-rs/pull/4038) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- Fix object\_store CI [\#4037](https://github.com/apache/arrow-rs/pull/4037) ([tustvold](https://github.com/tustvold)) -- feat: cast from/to interval and duration [\#4020](https://github.com/apache/arrow-rs/pull/4020) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Support Compression in parquet-fromcsv [\#4160](https://github.com/apache/arrow-rs/pull/4160) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([suxiaogang223](https://github.com/suxiaogang223)) +- feat: support bitwise shift left/right with scalars [\#4159](https://github.com/apache/arrow-rs/pull/4159) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Cleanup reading page index \(\#4149\) \(\#4090\) [\#4151](https://github.com/apache/arrow-rs/pull/4151) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- feat: support `bitwise` shift left/right [\#4148](https://github.com/apache/arrow-rs/pull/4148) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Don't hardcode port in FlightSQL tests [\#4145](https://github.com/apache/arrow-rs/pull/4145) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Better flight SQL example codes [\#4144](https://github.com/apache/arrow-rs/pull/4144) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([sundy-li](https://github.com/sundy-li)) +- chore: clean the code by using `as_primitive` [\#4143](https://github.com/apache/arrow-rs/pull/4143) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- docs: fix the wrong ln command in CONTRIBUTING.md [\#4139](https://github.com/apache/arrow-rs/pull/4139) ([SteveLauC](https://github.com/SteveLauC)) +- 
Infer Float64 for JSON Numerics Beyond Bounds of i64 [\#4138](https://github.com/apache/arrow-rs/pull/4138) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([SteveLauC](https://github.com/SteveLauC)) +- Support fixed point multiplication for DictionaryArray of Decimals [\#4136](https://github.com/apache/arrow-rs/pull/4136) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Make arrow\_json::ReaderBuilder method names consistent [\#4128](https://github.com/apache/arrow-rs/pull/4128) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: add get\_{ref, mut} to arrow\_ipc Reader and Writer [\#4122](https://github.com/apache/arrow-rs/pull/4122) ([sticnarf](https://github.com/sticnarf)) +- feat: support `Interval` + `Timestamp` and `Interval` + `Date` [\#4117](https://github.com/apache/arrow-rs/pull/4117) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Support NullArray in JSON Reader [\#4114](https://github.com/apache/arrow-rs/pull/4114) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jiangzhx](https://github.com/jiangzhx)) +- Add Type Declarations for All Primitive Tensors and Buffer Builders [\#4113](https://github.com/apache/arrow-rs/pull/4113) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Update regex-syntax requirement from 0.6.27 to 0.7.1 [\#4107](https://github.com/apache/arrow-rs/pull/4107) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat: set FlightDescriptor on FlightDataEncoderBuilder [\#4101](https://github.com/apache/arrow-rs/pull/4101) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([Weijun-H](https://github.com/Weijun-H)) +- optimize cast for same decimal type and same scale [\#4088](https://github.com/apache/arrow-rs/pull/4088) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) diff --git a/Cargo.toml b/Cargo.toml index 872bb2919f60..1b3a76db9ae5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ exclude = [ ] [workspace.package] -version = "38.0.0" +version = "39.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -76,18 +76,18 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "38.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "38.0.0", path = "./arrow-arith" } -arrow-array = { version = "38.0.0", path = "./arrow-array" } -arrow-buffer = { version = "38.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "38.0.0", path = "./arrow-cast" } -arrow-csv = { version = "38.0.0", path = "./arrow-csv" } -arrow-data = { version = "38.0.0", path = "./arrow-data" } -arrow-ipc = { version = "38.0.0", path = "./arrow-ipc" } -arrow-json = { version = "38.0.0", path = "./arrow-json" } -arrow-ord = { version = "38.0.0", path = "./arrow-ord" } -arrow-row = { version = "38.0.0", path = "./arrow-row" } -arrow-schema = { version = "38.0.0", path = "./arrow-schema" } -arrow-select = { version = "38.0.0", path = "./arrow-select" } -arrow-string = { version = "38.0.0", path = "./arrow-string" } -parquet = { version = "38.0.0", path = "./parquet", default-features = false } +arrow = { version 
= "39.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "39.0.0", path = "./arrow-arith" } +arrow-array = { version = "39.0.0", path = "./arrow-array" } +arrow-buffer = { version = "39.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "39.0.0", path = "./arrow-cast" } +arrow-csv = { version = "39.0.0", path = "./arrow-csv" } +arrow-data = { version = "39.0.0", path = "./arrow-data" } +arrow-ipc = { version = "39.0.0", path = "./arrow-ipc" } +arrow-json = { version = "39.0.0", path = "./arrow-json" } +arrow-ord = { version = "39.0.0", path = "./arrow-ord" } +arrow-row = { version = "39.0.0", path = "./arrow-row" } +arrow-schema = { version = "39.0.0", path = "./arrow-schema" } +arrow-select = { version = "39.0.0", path = "./arrow-select" } +arrow-string = { version = "39.0.0", path = "./arrow-string" } +parquet = { version = "39.0.0", path = "./parquet", default-features = false } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 7ddc2043465b..86ef8f00b70e 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "37.0.0" +arrow-flight = "39.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/arrow/README.md b/arrow/README.md index c7a0416a6747..fde71607246e 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `37.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `39.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. ## Feature Flags diff --git a/dev/release/README.md b/dev/release/README.md index e844e676a5e1..8c699d16374f 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -70,7 +70,7 @@ git pull git checkout -b # Update versions. Make sure to run it before the next step since we do not want CHANGELOG-old.md affected. -sed -i '' -e 's/14.0.0/37.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/39.0.0/g' `find . 
-name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' # Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index c1f3167e7934..0b8ad7052838 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="37.0.0" -FUTURE_RELEASE="38.0.0" +SINCE_TAG="38.0.0" +FUTURE_RELEASE="39.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" diff --git a/parquet_derive/README.md b/parquet_derive/README.md index 26112d0097a9..b20721079c2d 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "37.0.0" -parquet_derive = "37.0.0" +parquet = "39.0.0" +parquet_derive = "39.0.0" ``` and this to your crate root: From a2bd2696435ef818f545bc09a55914bebaf493e6 Mon Sep 17 00:00:00 2001 From: ming08108 Date: Fri, 5 May 2023 08:14:49 -0500 Subject: [PATCH 0874/1411] Allow creating unbuffered streamreader (#4165) * allow creating unbuffered streamreader * remove unneeded annotations * address PR comments --- arrow-ipc/src/reader.rs | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index d198696169b1..162e92914901 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -1075,8 +1075,8 @@ impl RecordBatchReader for FileReader { /// Arrow Stream reader pub struct StreamReader { - /// Buffered stream reader - reader: BufReader, + /// Stream reader + reader: R, /// The schema that is read from the stream's first message schema: SchemaRef, @@ -1107,8 +1107,8 @@ impl fmt::Debug for StreamReader { } } -impl StreamReader { - /// Try to create a new stream reader +impl StreamReader> { + /// Try to create a new stream reader with the reader wrapped in a BufReader /// /// The first message in the stream is the schema, the reader will fail if it does not /// encounter a schema. @@ -1117,7 +1117,18 @@ impl StreamReader { reader: R, projection: Option>, ) -> Result { - let mut reader = BufReader::new(reader); + Self::try_new_unbuffered(BufReader::new(reader), projection) + } +} + +impl StreamReader { + /// Try to create a new stream reader but do not wrap the reader in a BufReader. + /// + /// Unless you need the StreamReader to be unbuffered you likely want to use `StreamReader::try_new` instead. + pub fn try_new_unbuffered( + mut reader: R, + projection: Option>, + ) -> Result, ArrowError> { // determine metadata length let mut meta_size: [u8; 4] = [0; 4]; reader.read_exact(&mut meta_size)?; @@ -1262,14 +1273,14 @@ impl StreamReader { /// /// It is inadvisable to directly read from the underlying reader. pub fn get_ref(&self) -> &R { - self.reader.get_ref() + &self.reader } /// Gets a mutable reference to the underlying reader. /// /// It is inadvisable to directly read from the underlying reader. 
pub fn get_mut(&mut self) -> &mut R { - self.reader.get_mut() + &mut self.reader } } From 575a199fa669d75833c13a2a69d71255b9a9f2e6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 5 May 2023 15:40:40 +0100 Subject: [PATCH 0875/1411] Final changelog tweaks for 39.0.0 (#4175) --- CHANGELOG.md | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d5b6293b67b9..023a65941947 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ **Breaking changes:** +- Allow creating unbuffered streamreader [\#4165](https://github.com/apache/arrow-rs/pull/4165) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ming08108](https://github.com/ming08108)) - Cleanup ChunkReader \(\#4118\) [\#4156](https://github.com/apache/arrow-rs/pull/4156) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) - Remove Type from NativeIndex [\#4146](https://github.com/apache/arrow-rs/pull/4146) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) - Don't Duplicate Offset Index on RowGroupMetadata [\#4142](https://github.com/apache/arrow-rs/pull/4142) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) @@ -37,20 +38,19 @@ **Implemented enhancements:** -- InMemory append API [\#4152](https://github.com/apache/arrow-rs/issues/4152) -- Fixed point decimal multiplication for DictionaryArray [\#4135](https://github.com/apache/arrow-rs/issues/4135) -- Remove Seek Requirement from CSV ReaderBuilder [\#4130](https://github.com/apache/arrow-rs/issues/4130) -- Inconsistent CSV Inference and Parsing DateTime Handling [\#4129](https://github.com/apache/arrow-rs/issues/4129) +- Release 39.0.0 of arrow/arrow-flight/parquet/parquet-derive \(next release after 38.0.0\) [\#4170](https://github.com/apache/arrow-rs/issues/4170) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Fixed point decimal multiplication for DictionaryArray [\#4135](https://github.com/apache/arrow-rs/issues/4135) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Remove Seek Requirement from CSV ReaderBuilder [\#4130](https://github.com/apache/arrow-rs/issues/4130) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Inconsistent CSV Inference and Parsing DateTime Handling [\#4129](https://github.com/apache/arrow-rs/issues/4129) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - Support accessing ipc Reader/Writer inner by reference [\#4121](https://github.com/apache/arrow-rs/issues/4121) -- \[object\_store\] Retry requests on connection error [\#4119](https://github.com/apache/arrow-rs/issues/4119) -- Add Type Declarations for All Primitive Tensors and Buffer Builders [\#4112](https://github.com/apache/arrow-rs/issues/4112) -- Support `Interval + Timestamp` and `Interval + Date` in addition to `Timestamp + Interval` and `Interval + Date` [\#4094](https://github.com/apache/arrow-rs/issues/4094) -- Enable setting FlightDescriptor on FlightDataEncoderBuilder [\#3855](https://github.com/apache/arrow-rs/issues/3855) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Add Type Declarations for All Primitive Tensors and 
Buffer Builders [\#4112](https://github.com/apache/arrow-rs/issues/4112) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `Interval + Timestamp` and `Interval + Date` in addition to `Timestamp + Interval` and `Interval + Date` [\#4094](https://github.com/apache/arrow-rs/issues/4094) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Enable setting FlightDescriptor on FlightDataEncoderBuilder [\#3855](https://github.com/apache/arrow-rs/issues/3855) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Fixed bugs:** -- Parquet Page Index Reader Assumes Consecutive Offsets [\#4149](https://github.com/apache/arrow-rs/issues/4149) -- Equality of nested data types [\#4110](https://github.com/apache/arrow-rs/issues/4110) +- Parquet Page Index Reader Assumes Consecutive Offsets [\#4149](https://github.com/apache/arrow-rs/issues/4149) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Equality of nested data types [\#4110](https://github.com/apache/arrow-rs/issues/4110) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Documentation updates:** @@ -58,10 +58,11 @@ **Closed issues:** -- add specific error log for empty JSON array [\#4105](https://github.com/apache/arrow-rs/issues/4105) +- add specific error log for empty JSON array [\#4105](https://github.com/apache/arrow-rs/issues/4105) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** +- Prep for 39.0.0 [\#4171](https://github.com/apache/arrow-rs/pull/4171) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) - Support Compression in parquet-fromcsv [\#4160](https://github.com/apache/arrow-rs/pull/4160) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([suxiaogang223](https://github.com/suxiaogang223)) - feat: support bitwise shift left/right with scalars [\#4159](https://github.com/apache/arrow-rs/pull/4159) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) - Cleanup reading page index \(\#4149\) \(\#4090\) [\#4151](https://github.com/apache/arrow-rs/pull/4151) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) From 0e9bdd651fffff155f6c38be692e4242d7f58577 Mon Sep 17 00:00:00 2001 From: SteveLauC Date: Sat, 6 May 2023 08:29:55 +0800 Subject: [PATCH 0876/1411] feat: add compression info to print_column_chunk_metadata() (#4176) --- parquet/src/schema/printer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index d90dc423caf7..ad4acb0cb8b1 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -137,6 +137,7 @@ fn print_column_chunk_metadata( writeln!(out, "file path: {file_path_str}"); writeln!(out, "file offset: {}", cc_metadata.file_offset()); writeln!(out, "num of values: {}", cc_metadata.num_values()); + writeln!(out, "compression: {}", cc_metadata.compression()); writeln!( out, "total compressed size (in bytes): {}", From 5e2350a9804201407c33d9ecd5fa90d598b52cfb Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Mon, 8 May 2023 18:53:24 +0300 Subject: [PATCH 0877/1411] Minor: support new types in struct_builder.rs (#4177) --- arrow-array/src/builder/primitive_builder.rs | 4 +++- arrow-array/src/builder/struct_builder.rs | 11 +++++++++++ 2 files 
changed, 14 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 6688d07b7055..8721004d27e4 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -41,6 +41,8 @@ pub type UInt16Builder = PrimitiveBuilder; pub type UInt32Builder = PrimitiveBuilder; /// An usigned 64-bit integer array builder. pub type UInt64Builder = PrimitiveBuilder; +/// A 16-bit floating point array builder. +pub type Float16Builder = PrimitiveBuilder; /// A 32-bit floating point array builder. pub type Float32Builder = PrimitiveBuilder; /// A 64-bit floating point array builder. @@ -180,7 +182,7 @@ impl PrimitiveBuilder { /// data type of the generated array. /// /// This method allows overriding the data type, to allow specifying timezones - /// for [`DataType::Timestamp`] or precision and scale for [`DataType::Decimal128`] + /// for [`DataType::Timestamp`] or precision and scale for [`DataType::Decimal128`] and [`DataType::Decimal256`] /// /// # Panics /// diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index ebffeafcf75f..4702bb734266 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -109,9 +109,13 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box Box::new(UInt16Builder::with_capacity(capacity)), DataType::UInt32 => Box::new(UInt32Builder::with_capacity(capacity)), DataType::UInt64 => Box::new(UInt64Builder::with_capacity(capacity)), + DataType::Float16 => Box::new(Float16Builder::with_capacity(capacity)), DataType::Float32 => Box::new(Float32Builder::with_capacity(capacity)), DataType::Float64 => Box::new(Float64Builder::with_capacity(capacity)), DataType::Binary => Box::new(BinaryBuilder::with_capacity(capacity, 1024)), + DataType::LargeBinary => { + Box::new(LargeBinaryBuilder::with_capacity(capacity, 1024)) + } DataType::FixedSizeBinary(len) => { Box::new(FixedSizeBinaryBuilder::with_capacity(capacity, *len)) } @@ -119,7 +123,14 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box Box::new( + Decimal256Builder::with_capacity(capacity) + .with_data_type(DataType::Decimal256(*p, *s)), + ), DataType::Utf8 => Box::new(StringBuilder::with_capacity(capacity, 1024)), + DataType::LargeUtf8 => { + Box::new(LargeStringBuilder::with_capacity(capacity, 1024)) + } DataType::Date32 => Box::new(Date32Builder::with_capacity(capacity)), DataType::Date64 => Box::new(Date64Builder::with_capacity(capacity)), DataType::Time32(TimeUnit::Second) => { From 62958223fb522945ee466b5ec426cbb3f15f405a Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Tue, 9 May 2023 03:36:50 -0700 Subject: [PATCH 0878/1411] Allow format specification in cast (#4169) * Allow format specification in cast * Add documentation change * Pass format options into value_to_string for Decimal types --- arrow-cast/src/cast.rs | 500 +++++++++++++++++++++++++++++++------- arrow-cast/src/display.rs | 2 +- 2 files changed, 414 insertions(+), 88 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 12e80cab4ffe..d015f4952836 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -56,12 +56,21 @@ use num::{NumCast, ToPrimitive}; /// CastOptions provides a way to override the default cast behaviors #[derive(Debug, Clone, PartialEq, Eq)] -pub struct CastOptions { +pub struct CastOptions<'a> { /// how to handle cast failures, either return NULL (safe=true) or 
return ERR (safe=false) pub safe: bool, + /// Formatting options when casting from temporal types to string + pub format_options: FormatOptions<'a>, } -pub const DEFAULT_CAST_OPTIONS: CastOptions = CastOptions { safe: true }; +impl<'a> Default for CastOptions<'a> { + fn default() -> Self { + Self { + safe: true, + format_options: FormatOptions::default(), + } + } +} /// Return true if a value of type `from_type` can be cast into a /// value of `to_type`. Note that such as cast may be lossy. @@ -334,7 +343,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { /// * List to primitive /// * Interval and duration pub fn cast(array: &dyn Array, to_type: &DataType) -> Result { - cast_with_options(array, to_type, &DEFAULT_CAST_OPTIONS) + cast_with_options(array, to_type, &CastOptions::default()) } fn cast_integer_to_decimal< @@ -947,8 +956,8 @@ pub fn cast_with_options( x as f64 / 10_f64.powi(*scale as i32) }) } - Utf8 => value_to_string::(array), - LargeUtf8 => value_to_string::(array), + Utf8 => value_to_string::(array, Some(&cast_options.format_options)), + LargeUtf8 => value_to_string::(array, Some(&cast_options.format_options)), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported" @@ -1016,8 +1025,8 @@ pub fn cast_with_options( x.to_f64().unwrap() / 10_f64.powi(*scale as i32) }) } - Utf8 => value_to_string::(array), - LargeUtf8 => value_to_string::(array), + Utf8 => value_to_string::(array, Some(&cast_options.format_options)), + LargeUtf8 => value_to_string::(array, Some(&cast_options.format_options)), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported" @@ -1413,8 +1422,8 @@ pub fn cast_with_options( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, - (from_type, LargeUtf8) if from_type.is_primitive() => value_to_string::(array), - (from_type, Utf8) if from_type.is_primitive() => value_to_string::(array), + (from_type, LargeUtf8) if from_type.is_primitive() => value_to_string::(array, Some(&cast_options.format_options)), + (from_type, Utf8) if from_type.is_primitive() => value_to_string::(array, Some(&cast_options.format_options)), // start numeric casts (UInt8, UInt16) => { cast_numeric_arrays::(array, cast_options) @@ -2450,10 +2459,14 @@ where fn value_to_string( array: &dyn Array, + options: Option<&FormatOptions>, ) -> Result { let mut builder = GenericStringBuilder::::new(); - let options = FormatOptions::default(); - let formatter = ArrayFormatter::try_new(array, &options)?; + let mut fmt_options = &FormatOptions::default(); + if let Some(fmt_opts) = options { + fmt_options = fmt_opts; + }; + let formatter = ArrayFormatter::try_new(array, fmt_options)?; let nulls = array.nulls(); for i in 0..array.len() { match nulls.map(|x| x.is_null(i)).unwrap_or_default() { @@ -3871,7 +3884,10 @@ mod tests { } } - let cast_option = CastOptions { safe: false }; + let cast_option = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; let casted_array_with_option = cast_with_options($INPUT_ARRAY, $OUTPUT_TYPE, &cast_option).unwrap(); let result_array = casted_array_with_option @@ -4079,8 +4095,14 @@ mod tests { let array = vec![Some(i128::MAX)]; let array = create_decimal_array(array, 38, 3).unwrap(); - let result = - cast_with_options(&array, &output_type, &CastOptions { safe: false }); + let result = cast_with_options( + &array, + &output_type, + 
&CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); assert_eq!("Cast error: Cannot cast to Decimal128(38, 38). Overflowing on 170141183460469231731687303715884105727", result.unwrap_err().to_string()); } @@ -4093,8 +4115,14 @@ mod tests { let array = vec![Some(i128::MAX)]; let array = create_decimal_array(array, 38, 3).unwrap(); - let result = - cast_with_options(&array, &output_type, &CastOptions { safe: false }); + let result = cast_with_options( + &array, + &output_type, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); assert_eq!("Cast error: Cannot cast to Decimal256(76, 76). Overflowing on 170141183460469231731687303715884105727", result.unwrap_err().to_string()); } @@ -4126,8 +4154,14 @@ mod tests { assert!(can_cast_types(&input_type, &output_type)); let array = vec![Some(i256::from_i128(i128::MAX))]; let array = create_decimal256_array(array, 76, 5).unwrap(); - let result = - cast_with_options(&array, &output_type, &CastOptions { safe: false }); + let result = cast_with_options( + &array, + &output_type, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); assert_eq!("Cast error: Cannot cast to Decimal128(38, 7). Overflowing on 170141183460469231731687303715884105727", result.unwrap_err().to_string()); } @@ -4139,8 +4173,14 @@ mod tests { assert!(can_cast_types(&input_type, &output_type)); let array = vec![Some(i256::from_i128(i128::MAX))]; let array = create_decimal256_array(array, 76, 5).unwrap(); - let result = - cast_with_options(&array, &output_type, &CastOptions { safe: false }); + let result = cast_with_options( + &array, + &output_type, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); assert_eq!("Cast error: Cannot cast to Decimal256(76, 55). 
Overflowing on 170141183460469231731687303715884105727", result.unwrap_err().to_string()); } @@ -4286,30 +4326,54 @@ mod tests { // overflow test: out of range of max u8 let value_array: Vec> = vec![Some(51300)]; let array = create_decimal_array(value_array, 38, 2).unwrap(); - let casted_array = - cast_with_options(&array, &DataType::UInt8, &CastOptions { safe: false }); + let casted_array = cast_with_options( + &array, + &DataType::UInt8, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); assert_eq!( "Cast error: value of 513 is out of range UInt8".to_string(), casted_array.unwrap_err().to_string() ); - let casted_array = - cast_with_options(&array, &DataType::UInt8, &CastOptions { safe: true }); + let casted_array = cast_with_options( + &array, + &DataType::UInt8, + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, + ); assert!(casted_array.is_ok()); assert!(casted_array.unwrap().is_null(0)); // overflow test: out of range of max i8 let value_array: Vec> = vec![Some(24400)]; let array = create_decimal_array(value_array, 38, 2).unwrap(); - let casted_array = - cast_with_options(&array, &DataType::Int8, &CastOptions { safe: false }); + let casted_array = cast_with_options( + &array, + &DataType::Int8, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); assert_eq!( "Cast error: value of 244 is out of range Int8".to_string(), casted_array.unwrap_err().to_string() ); - let casted_array = - cast_with_options(&array, &DataType::Int8, &CastOptions { safe: true }); + let casted_array = cast_with_options( + &array, + &DataType::Int8, + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, + ); assert!(casted_array.is_ok()); assert!(casted_array.unwrap().is_null(0)); @@ -4465,15 +4529,27 @@ mod tests { // overflow test: out of range of max i8 let value_array: Vec> = vec![Some(i256::from_i128(24400))]; let array = create_decimal256_array(value_array, 38, 2).unwrap(); - let casted_array = - cast_with_options(&array, &DataType::Int8, &CastOptions { safe: false }); + let casted_array = cast_with_options( + &array, + &DataType::Int8, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); assert_eq!( "Cast error: value of 244 is out of range Int8".to_string(), casted_array.unwrap_err().to_string() ); - let casted_array = - cast_with_options(&array, &DataType::Int8, &CastOptions { safe: true }); + let casted_array = cast_with_options( + &array, + &DataType::Int8, + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, + ); assert!(casted_array.is_ok()); assert!(casted_array.unwrap().is_null(0)); @@ -4886,7 +4962,10 @@ mod tests { fn test_cast_int32_to_u8_with_error() { let array = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); // overflow with the error - let cast_option = CastOptions { safe: false }; + let cast_option = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; let result = cast_with_options(&array, &DataType::UInt8, &cast_option); assert!(result.is_err()); result.unwrap(); @@ -5010,8 +5089,14 @@ mod tests { #[test] fn test_cast_with_options_utf8_to_i32() { let array = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]); - let result = - cast_with_options(&array, &DataType::Int32, &CastOptions { safe: false }); + let result = cast_with_options( + &array, + &DataType::Int32, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); match result { Ok(_) => 
panic!("expected error"), Err(e) => { @@ -5037,8 +5122,14 @@ mod tests { #[test] fn test_cast_with_options_utf8_to_bool() { let strings = StringArray::from(vec!["true", "false", "invalid", " Y ", ""]); - let casted = - cast_with_options(&strings, &DataType::Boolean, &CastOptions { safe: false }); + let casted = cast_with_options( + &strings, + &DataType::Boolean, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); match casted { Ok(_) => panic!("expected error"), Err(e) => { @@ -5244,7 +5335,10 @@ mod tests { } } - let options = CastOptions { safe: false }; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; let err = cast_with_options(array, &to_type, &options).unwrap_err(); assert_eq!( err.to_string(), @@ -5282,7 +5376,10 @@ mod tests { assert!(c.is_null(1)); assert!(c.is_null(2)); - let options = CastOptions { safe: false }; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; let err = cast_with_options(array, &to_type, &options).unwrap_err(); assert_eq!(err.to_string(), "Cast error: Cannot cast string 'Not a valid date' to value of Date32 type"); } @@ -5314,7 +5411,10 @@ mod tests { assert!(c.is_null(3)); assert!(c.is_null(4)); - let options = CastOptions { safe: false }; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; let err = cast_with_options(array, &to_type, &options).unwrap_err(); assert_eq!(err.to_string(), "Cast error: Cannot cast string '08:08:61.091323414' to value of Time32(Second) type"); } @@ -5346,7 +5446,10 @@ mod tests { assert!(c.is_null(3)); assert!(c.is_null(4)); - let options = CastOptions { safe: false }; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; let err = cast_with_options(array, &to_type, &options).unwrap_err(); assert_eq!(err.to_string(), "Cast error: Cannot cast string '08:08:61.091323414' to value of Time32(Millisecond) type"); } @@ -5372,7 +5475,10 @@ mod tests { assert!(c.is_null(1)); assert!(c.is_null(2)); - let options = CastOptions { safe: false }; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; let err = cast_with_options(array, &to_type, &options).unwrap_err(); assert_eq!(err.to_string(), "Cast error: Cannot cast string 'Not a valid time' to value of Time64(Microsecond) type"); } @@ -5398,7 +5504,10 @@ mod tests { assert!(c.is_null(1)); assert!(c.is_null(2)); - let options = CastOptions { safe: false }; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; let err = cast_with_options(array, &to_type, &options).unwrap_err(); assert_eq!(err.to_string(), "Cast error: Cannot cast string 'Not a valid time' to value of Time64(Nanosecond) type"); } @@ -5424,7 +5533,10 @@ mod tests { assert!(c.is_null(1)); assert!(c.is_null(2)); - let options = CastOptions { safe: false }; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; let err = cast_with_options(array, &to_type, &options).unwrap_err(); assert_eq!(err.to_string(), "Cast error: Cannot cast string 'Not a valid date' to value of Date64 type"); } @@ -5435,7 +5547,10 @@ mod tests { let source_string_array = Arc::new(StringArray::from($data_vec.clone())) as ArrayRef; - let options = CastOptions { safe: true }; + let options = CastOptions { + safe: true, + format_options: FormatOptions::default(), + }; let target_interval_array = cast_with_options( 
&source_string_array.clone(), @@ -5559,7 +5674,10 @@ mod tests { macro_rules! test_unsafe_string_to_interval_err { ($data_vec:expr, $interval_unit:expr, $error_msg:expr) => { let string_array = Arc::new(StringArray::from($data_vec.clone())) as ArrayRef; - let options = CastOptions { safe: false }; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; let arrow_err = cast_with_options( &string_array.clone(), &DataType::Interval($interval_unit), @@ -5659,14 +5777,20 @@ mod tests { let array_ref = cast_with_options( &a1, &DataType::FixedSizeBinary(5), - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); assert!(array_ref.is_err()); let array_ref = cast_with_options( &a2, &DataType::FixedSizeBinary(5), - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); assert!(array_ref.is_err()); } @@ -5751,7 +5875,10 @@ mod tests { assert!(b.is_null(0)); // test overflow, unsafe cast let array = TimestampSecondArray::from(vec![Some(i64::MAX)]); - let options = CastOptions { safe: false }; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; let b = cast_with_options(&array, &DataType::Date64, &options); assert!(b.is_err()); } @@ -6000,6 +6127,136 @@ mod tests { assert_eq!("2018-12-25T00:00:00", c.value(1)); } + #[test] + fn test_cast_timestamp_to_strings() { + // "2018-12-25T00:00:02.001", "1997-05-19T00:00:03.005", None + let array = TimestampMillisecondArray::from(vec![ + Some(864000003005), + Some(1545696002001), + None, + ]); + let out = cast(&array, &DataType::Utf8).unwrap(); + let out = out + .as_any() + .downcast_ref::() + .unwrap() + .into_iter() + .collect::>(); + assert_eq!( + out, + vec![ + Some("1997-05-19T00:00:03.005"), + Some("2018-12-25T00:00:02.001"), + None + ] + ); + let out = cast(&array, &DataType::LargeUtf8).unwrap(); + let out = out + .as_any() + .downcast_ref::() + .unwrap() + .into_iter() + .collect::>(); + assert_eq!( + out, + vec![ + Some("1997-05-19T00:00:03.005"), + Some("2018-12-25T00:00:02.001"), + None + ] + ); + } + + #[test] + fn test_cast_timestamp_to_strings_opt() { + let ts_format = "%Y-%m-%d %H:%M:%S%.6f"; + let tz = "+0545"; // UTC + 0545 is Asia/Kathmandu + let cast_options = CastOptions { + safe: true, + format_options: FormatOptions::default() + .with_timestamp_format(Some(ts_format)) + .with_timestamp_tz_format(Some(ts_format)), + }; + // "2018-12-25T00:00:02.001", "1997-05-19T00:00:03.005", None + let array_without_tz = TimestampMillisecondArray::from(vec![ + Some(864000003005), + Some(1545696002001), + None, + ]); + let out = + cast_with_options(&array_without_tz, &DataType::Utf8, &cast_options).unwrap(); + let out = out + .as_any() + .downcast_ref::() + .unwrap() + .into_iter() + .collect::>(); + assert_eq!( + out, + vec![ + Some("1997-05-19 00:00:03.005000"), + Some("2018-12-25 00:00:02.001000"), + None + ] + ); + let out = + cast_with_options(&array_without_tz, &DataType::LargeUtf8, &cast_options) + .unwrap(); + let out = out + .as_any() + .downcast_ref::() + .unwrap() + .into_iter() + .collect::>(); + assert_eq!( + out, + vec![ + Some("1997-05-19 00:00:03.005000"), + Some("2018-12-25 00:00:02.001000"), + None + ] + ); + + let array_with_tz = TimestampMillisecondArray::from(vec![ + Some(864000003005), + Some(1545696002001), + None, + ]) + .with_timezone(tz.to_string()); + let out = + cast_with_options(&array_with_tz, &DataType::Utf8, 
&cast_options).unwrap(); + let out = out + .as_any() + .downcast_ref::() + .unwrap() + .into_iter() + .collect::>(); + assert_eq!( + out, + vec![ + Some("1997-05-19 05:45:03.005000"), + Some("2018-12-25 05:45:02.001000"), + None + ] + ); + let out = cast_with_options(&array_with_tz, &DataType::LargeUtf8, &cast_options) + .unwrap(); + let out = out + .as_any() + .downcast_ref::() + .unwrap() + .into_iter() + .collect::>(); + assert_eq!( + out, + vec![ + Some("1997-05-19 05:45:03.005000"), + Some("2018-12-25 05:45:02.001000"), + None + ] + ); + } + #[test] fn test_cast_between_timestamps() { let array = TimestampMillisecondArray::from(vec![ @@ -7698,7 +7955,10 @@ mod tests { let casted_array = cast_with_options( &array, &DataType::Decimal128(38, 30), - &CastOptions { safe: true }, + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_ok()); assert!(casted_array.unwrap().is_null(0)); @@ -7706,7 +7966,10 @@ mod tests { let casted_array = cast_with_options( &array, &DataType::Decimal128(38, 30), - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_err()); } @@ -7718,7 +7981,10 @@ mod tests { let casted_array = cast_with_options( &array, &DataType::Decimal256(76, 76), - &CastOptions { safe: true }, + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_ok()); assert!(casted_array.unwrap().is_null(0)); @@ -7726,7 +7992,10 @@ mod tests { let casted_array = cast_with_options( &array, &DataType::Decimal256(76, 76), - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_err()); } @@ -7738,7 +8007,10 @@ mod tests { let casted_array = cast_with_options( &array, &DataType::Decimal128(38, 30), - &CastOptions { safe: true }, + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_ok()); assert!(casted_array.unwrap().is_null(0)); @@ -7746,7 +8018,10 @@ mod tests { let casted_array = cast_with_options( &array, &DataType::Decimal128(38, 30), - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); let err = casted_array.unwrap_err().to_string(); let expected_error = "Cast error: Cannot cast to Decimal128(38, 30)"; @@ -7763,7 +8038,10 @@ mod tests { let casted_array = cast_with_options( &array, &DataType::Decimal256(76, 50), - &CastOptions { safe: true }, + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_ok()); assert!(casted_array.unwrap().is_null(0)); @@ -7771,7 +8049,10 @@ mod tests { let casted_array = cast_with_options( &array, &DataType::Decimal256(76, 50), - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); let err = casted_array.unwrap_err().to_string(); let expected_error = "Cast error: Cannot cast to Decimal256(76, 50)"; @@ -8097,7 +8378,10 @@ mod tests { let output_type = DataType::Decimal128(38, 2); let str_array = StringArray::from(vec!["4.4.5"]); let array = Arc::new(str_array) as ArrayRef; - let option = CastOptions { safe: false }; + let option = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; let casted_err = cast_with_options(&array, &output_type, &option).unwrap_err(); assert!(casted_err .to_string() @@ -8324,7 +8608,10 @@ mod tests { let b = cast_with_options( 
&array, &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz.clone())), - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ) .unwrap(); @@ -8373,7 +8660,10 @@ mod tests { let v1: &[u8] = b"\xFF invalid"; let v2: &[u8] = b"\x00 Foo"; let s = BinaryArray::from(vec![v1, v2]); - let options = CastOptions { safe: true }; + let options = CastOptions { + safe: true, + format_options: FormatOptions::default(), + }; let array = cast_with_options(&s, &DataType::Utf8, &options).unwrap(); let a = array.as_string::(); a.to_data().validate_full().unwrap(); @@ -8467,7 +8757,10 @@ mod tests { let casted_array = cast_with_options( &array, &DataType::Decimal128(7, 3), - &CastOptions { safe: true }, + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_ok()); assert!(casted_array.unwrap().is_null(0)); @@ -8475,7 +8768,10 @@ mod tests { let err = cast_with_options( &array, &DataType::Decimal128(7, 3), - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); assert_eq!("Invalid argument error: 1234567000 is too large to store in a Decimal128 of precision 7. Max is 9999999", err.unwrap_err().to_string()); } @@ -8487,7 +8783,10 @@ mod tests { let casted_array = cast_with_options( &array, &DataType::Decimal256(7, 3), - &CastOptions { safe: true }, + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_ok()); assert!(casted_array.unwrap().is_null(0)); @@ -8495,7 +8794,10 @@ mod tests { let err = cast_with_options( &array, &DataType::Decimal256(7, 3), - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); assert_eq!("Invalid argument error: 1234567000 is too large to store in a Decimal256 of precision 7. 
Max is 9999999", err.unwrap_err().to_string()); } @@ -8532,7 +8834,7 @@ mod tests { let array = vec![1234567]; let casted_array = cast_from_duration_to_interval::( array, - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert_eq!( @@ -8544,14 +8846,17 @@ mod tests { let array = vec![i64::MAX]; let casted_array = cast_from_duration_to_interval::( array.clone(), - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert!(!casted_array.is_valid(0)); let casted_array = cast_from_duration_to_interval::( array, - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_err()); @@ -8559,7 +8864,7 @@ mod tests { let array = vec![1234567]; let casted_array = cast_from_duration_to_interval::( array, - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert_eq!( @@ -8571,14 +8876,17 @@ mod tests { let array = vec![i64::MAX]; let casted_array = cast_from_duration_to_interval::( array.clone(), - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert!(!casted_array.is_valid(0)); let casted_array = cast_from_duration_to_interval::( array, - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_err()); @@ -8586,7 +8894,7 @@ mod tests { let array = vec![1234567]; let casted_array = cast_from_duration_to_interval::( array, - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert_eq!( @@ -8598,14 +8906,17 @@ mod tests { let array = vec![i64::MAX]; let casted_array = cast_from_duration_to_interval::( array.clone(), - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert!(!casted_array.is_valid(0)); let casted_array = cast_from_duration_to_interval::( array, - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_err()); @@ -8613,7 +8924,7 @@ mod tests { let array = vec![1234567]; let casted_array = cast_from_duration_to_interval::( array, - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert_eq!( @@ -8625,7 +8936,10 @@ mod tests { let array = vec![i64::MAX]; let casted_array = cast_from_duration_to_interval::( array, - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ) .unwrap(); assert_eq!(casted_array.value(0), 9223372036854775807); @@ -8657,7 +8971,7 @@ mod tests { let array = vec![1234567]; let casted_array = cast_from_interval_to_duration::( array, - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert_eq!( @@ -8669,14 +8983,17 @@ mod tests { let array = vec![i128::MAX]; let casted_array = cast_from_interval_to_duration::( array.clone(), - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert!(!casted_array.is_valid(0)); let casted_array = cast_from_interval_to_duration::( array, - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_err()); @@ -8684,7 +9001,7 @@ mod tests { let array = vec![1234567]; let casted_array = cast_from_interval_to_duration::( array, - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert_eq!(casted_array.value(0), 1); @@ -8692,14 +9009,17 @@ mod tests { let array = vec![i128::MAX]; let casted_array = cast_from_interval_to_duration::( array.clone(), - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); 
assert!(!casted_array.is_valid(0)); let casted_array = cast_from_interval_to_duration::( array, - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_err()); @@ -8707,7 +9027,7 @@ mod tests { let array = vec![1234567]; let casted_array = cast_from_interval_to_duration::( array, - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert_eq!( @@ -8719,14 +9039,17 @@ mod tests { let array = vec![i128::MAX]; let casted_array = cast_from_interval_to_duration::( array.clone(), - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert!(!casted_array.is_valid(0)); let casted_array = cast_from_interval_to_duration::( array, - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_err()); @@ -8734,7 +9057,7 @@ mod tests { let array = vec![1234567]; let casted_array = cast_from_interval_to_duration::( array, - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert_eq!( @@ -8746,7 +9069,7 @@ mod tests { let array = vec![i128::MAX]; let casted_array = cast_from_interval_to_duration::( array.clone(), - &DEFAULT_CAST_OPTIONS, + &CastOptions::default(), ) .unwrap(); assert_eq!( @@ -8757,7 +9080,10 @@ mod tests { let casted_array = cast_from_interval_to_duration::( array, - &CastOptions { safe: false }, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, ); assert!(casted_array.is_err()); } diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 0bca9ce657b8..1c2ecfc5ed0d 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -39,7 +39,7 @@ type TimeFormat<'a> = Option<&'a str>; /// By default nulls are formatted as `""` and temporal types formatted /// according to RFC3339 /// -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct FormatOptions<'a> { /// If set to `true` any formatting errors will be written to the output /// instead of being converted into a [`std::fmt::Error`] From 51a9d0f9c9f40bbe229b1f85c131ac659b065e43 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 9 May 2023 17:40:19 +0100 Subject: [PATCH 0879/1411] Add Sliced ListArray test (#3748) (#4186) --- arrow-ipc/src/writer.rs | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 8f36f8c04dc0..b2fcec08d845 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1374,8 +1374,8 @@ mod tests { use std::io::Seek; use std::sync::Arc; - use arrow_array::builder::PrimitiveRunBuilder; use arrow_array::builder::UnionBuilder; + use arrow_array::builder::{ListBuilder, PrimitiveRunBuilder, UInt32Builder}; use arrow_array::types::*; use arrow_schema::DataType; @@ -2106,4 +2106,39 @@ mod tests { assert_eq!(expected, actual); } } + + #[test] + fn encode_lists() { + let val_inner = Field::new("item", DataType::UInt32, true); + let val_list_field = Field::new_list("val", val_inner, false); + + let schema = Arc::new(Schema::new(vec![val_list_field])); + + let values = { + let u32 = UInt32Builder::new(); + let mut ls = ListBuilder::new(u32); + + for list in vec![vec![1u32, 2, 3], vec![4, 5, 6], vec![7, 8, 9, 10]] { + for value in list { + ls.values().append_value(value); + } + ls.append(true) + } + + ls.finish() + }; + + let batch = + RecordBatch::try_new(Arc::clone(&schema), 
vec![Arc::new(values)]).unwrap(); + let batch = batch.slice(1, 1); + + let mut writer = FileWriter::try_new(Vec::::new(), &schema).unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); + let data = writer.into_inner().unwrap(); + + let mut reader = FileReader::try_new(Cursor::new(data), None).unwrap(); + let batch2 = reader.next().unwrap().unwrap(); + assert_eq!(batch, batch2); + } } From 6280a709c025e5585ada7fcc27431ca9314bb403 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 9 May 2023 17:43:23 +0100 Subject: [PATCH 0880/1411] Faster prefix match in object_store path handling (#4164) * Faster prefix match * Simplify parts --- object_store/src/path/mod.rs | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index a15f7ca0f0ab..29b134176955 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -227,14 +227,9 @@ impl Path { /// Returns the [`PathPart`] of this [`Path`] pub fn parts(&self) -> impl Iterator> { - match self.raw.is_empty() { - true => itertools::Either::Left(std::iter::empty()), - false => itertools::Either::Right( - self.raw - .split(DELIMITER) - .map(|s| PathPart { raw: s.into() }), - ), - } + self.raw + .split_terminator(DELIMITER) + .map(|s| PathPart { raw: s.into() }) } /// Returns the last path segment containing the filename stored in this [`Path`] @@ -265,20 +260,14 @@ impl Path { &self, prefix: &Self, ) -> Option> + '_> { - let diff = itertools::diff_with(self.parts(), prefix.parts(), |a, b| a == b); - - match diff { - // Both were equal - None => Some(itertools::Either::Left(std::iter::empty())), - // Mismatch or prefix was longer => None - Some( - itertools::Diff::FirstMismatch(_, _, _) | itertools::Diff::Longer(_, _), - ) => None, - // Match with remaining - Some(itertools::Diff::Shorter(_, back)) => { - Some(itertools::Either::Right(back)) - } + let mut stripped = self.raw.strip_prefix(&prefix.raw)?; + if !stripped.is_empty() && !prefix.raw.is_empty() { + stripped = stripped.strip_prefix(DELIMITER)?; } + let iter = stripped + .split_terminator(DELIMITER) + .map(|x| PathPart { raw: x.into() }); + Some(iter) } /// Returns true if this [`Path`] starts with `prefix` @@ -453,6 +442,8 @@ mod tests { let prefix = existing_path.clone(); assert_eq!(existing_path.prefix_match(&prefix).unwrap().count(), 0); + + assert_eq!(Path::default().parts().count(), 0); } #[test] From d67142d43881681a2782476587de69d4e072247b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 9 May 2023 19:23:40 +0100 Subject: [PATCH 0881/1411] Remove powf_scalar kernel (#4187) --- arrow-arith/src/arithmetic.rs | 25 ------------------------- arrow-array/src/numeric.rs | 29 ----------------------------- arrow/src/datatypes/mod.rs | 4 +--- 3 files changed, 1 insertion(+), 57 deletions(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 40ae3255b98c..42f6e3974301 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -29,7 +29,6 @@ use arrow_array::*; use arrow_buffer::i256; use arrow_buffer::ArrowNativeType; use arrow_schema::*; -use num::traits::Pow; use std::cmp::min; use std::sync::Arc; @@ -1342,18 +1341,6 @@ pub fn negate_checked( try_unary(array, |value| value.neg_checked()) } -/// Raise array with floating point values to the power of a scalar. 
-pub fn powf_scalar( - array: &PrimitiveArray, - raise: T::Native, -) -> Result, ArrowError> -where - T: ArrowFloatNumericType, - T::Native: Pow, -{ - Ok(unary(array, |x| x.pow(raise))) -} - /// Perform `left * right` operation on two arrays. If either left or right value is null /// then the result is also null. /// @@ -3217,18 +3204,6 @@ mod tests { assert_eq!(expected, actual); } - #[test] - fn test_primitive_array_raise_power_scalar() { - let a = Float64Array::from(vec![1.0, 2.0, 3.0]); - let actual = powf_scalar(&a, 2.0).unwrap(); - let expected = Float64Array::from(vec![1.0, 4.0, 9.0]); - assert_eq!(expected, actual); - let a = Float64Array::from(vec![Some(1.0), None, Some(3.0)]); - let actual = powf_scalar(&a, 2.0).unwrap(); - let expected = Float64Array::from(vec![Some(1.0), None, Some(9.0)]); - assert_eq!(expected, actual); - } - #[test] fn test_primitive_add_wrapping_overflow() { let a = Int32Array::from(vec![i32::MAX, i32::MIN]); diff --git a/arrow-array/src/numeric.rs b/arrow-array/src/numeric.rs index 9d9048085106..afc0e2c33010 100644 --- a/arrow-array/src/numeric.rs +++ b/arrow-array/src/numeric.rs @@ -558,35 +558,6 @@ impl ArrowNumericType for Decimal256Type { } } -/// A subtype of primitive type that represents numeric float values -#[cfg(feature = "simd")] -pub trait ArrowFloatNumericType: ArrowNumericType { - /// SIMD version of pow - fn pow(base: Self::Simd, raise: Self::Simd) -> Self::Simd; -} - -/// A subtype of primitive type that represents numeric float values -#[cfg(not(feature = "simd"))] -pub trait ArrowFloatNumericType: ArrowNumericType {} - -macro_rules! make_float_numeric_type { - ($impl_ty:ty, $simd_ty:ident) => { - #[cfg(feature = "simd")] - impl ArrowFloatNumericType for $impl_ty { - #[inline] - fn pow(base: Self::Simd, raise: Self::Simd) -> Self::Simd { - base.powf(raise) - } - } - - #[cfg(not(feature = "simd"))] - impl ArrowFloatNumericType for $impl_ty {} - }; -} - -make_float_numeric_type!(Float32Type, f32x16); -make_float_numeric_type!(Float64Type, f64x8); - #[cfg(all(test, feature = "simd"))] mod tests { use super::*; diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 74dad6b4a8c8..840e98ab0ded 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -23,9 +23,7 @@ //! * [`DataType`](crate::datatypes::DataType) to describe the type of a field. pub use arrow_array::types::*; -pub use arrow_array::{ - ArrowFloatNumericType, ArrowNativeTypeOp, ArrowNumericType, ArrowPrimitiveType, -}; +pub use arrow_array::{ArrowNativeTypeOp, ArrowNumericType, ArrowPrimitiveType}; pub use arrow_buffer::{i256, ArrowNativeType, ToByteSlice}; pub use arrow_data::decimal::*; pub use arrow_schema::{ From 9f0fe6b7ecb3c8bd7591f2ccd78dd92b50563988 Mon Sep 17 00:00:00 2001 From: jakevin Date: Wed, 10 May 2023 15:54:24 +0800 Subject: [PATCH 0882/1411] refactor: simplify can_cast_types code. (#4185) --- arrow-cast/src/cast.rs | 184 ++++++++++++++--------------------------- 1 file changed, 62 insertions(+), 122 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index d015f4952836..37fede0a6fe0 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -78,6 +78,8 @@ impl<'a> Default for CastOptions<'a> { /// If this function returns true to stay consistent with the `cast` kernel below. 
pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { use self::DataType::*; + use self::IntervalUnit::*; + use self::TimeUnit::*; if from_type == to_type { return true; } @@ -113,7 +115,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | FixedSizeList(_, _) | Struct(_) | Map(_, _) - | Dictionary(_, _) + | Dictionary(_, _), ) => true, // Dictionary/List conditions should be put in front of others (Dictionary(_, from_value_type), Dictionary(_, to_value_type)) => { @@ -133,7 +135,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (LargeList(list_from), List(list_to)) => { list_from.data_type() == list_to.data_type() } - (List(list_from) | LargeList(list_from), Utf8 | LargeUtf8) => can_cast_types(list_from.data_type(), to_type), + (List(list_from) | LargeList(list_from), Utf8 | LargeUtf8) => { + can_cast_types(list_from.data_type(), to_type) + } (List(_), _) => false, (_, List(list_to)) => can_cast_types(from_type, list_to.data_type()), (_, LargeList(list_to)) => can_cast_types(from_type, list_to.data_type()), @@ -149,114 +153,54 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal128(_, _)) | (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal256(_, _)) | // decimal to unsigned numeric - (Decimal128(_, _), UInt8 | UInt16 | UInt32 | UInt64) | - (Decimal256(_, _), UInt8 | UInt16 | UInt32 | UInt64) | + (Decimal128(_, _) | Decimal256(_, _), UInt8 | UInt16 | UInt32 | UInt64) | // decimal to signed numeric - (Decimal128(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) | - (Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) => true, + (Decimal128(_, _) | Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) => true, // decimal to Utf8 - (Decimal128(_, _), Utf8 | LargeUtf8) => true, - (Decimal256(_, _), Utf8 | LargeUtf8) => true, + (Decimal128(_, _) | Decimal256(_, _), Utf8 | LargeUtf8) => true, // Utf8 to decimal - (Utf8 | LargeUtf8, Decimal128(_, _)) => true, - (Utf8 | LargeUtf8, Decimal256(_, _)) => true, - (Decimal128(_, _), _) => false, - (_, Decimal128(_, _)) => false, - (Decimal256(_, _), _) => false, - (_, Decimal256(_, _)) => false, + (Utf8 | LargeUtf8, Decimal128(_, _) | Decimal256(_, _)) => true, + (Decimal128(_, _) | Decimal256(_, _), _) => false, + (_, Decimal128(_, _) | Decimal256(_, _)) => false, (Struct(_), _) => false, (_, Struct(_)) => false, - (_, Boolean) => DataType::is_numeric(from_type) || from_type == &Utf8 || from_type == &LargeUtf8, - (Boolean, _) => DataType::is_numeric(to_type) || to_type == &Utf8 || to_type == &LargeUtf8, + (_, Boolean) => { + DataType::is_numeric(from_type) + || from_type == &Utf8 + || from_type == &LargeUtf8 + } + (Boolean, _) => { + DataType::is_numeric(to_type) || to_type == &Utf8 || to_type == &LargeUtf8 + } (Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true, (LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true, (FixedSizeBinary(_), Binary | LargeBinary) => true, - (Utf8, - Binary - | LargeBinary - | LargeUtf8 - | Date32 - | Date64 - | Time32(TimeUnit::Second) - | Time32(TimeUnit::Millisecond) - | Time64(TimeUnit::Microsecond) - | Time64(TimeUnit::Nanosecond) - | Timestamp(TimeUnit::Second, _) - | Timestamp(TimeUnit::Millisecond, _) - | Timestamp(TimeUnit::Microsecond, _) - | Timestamp(TimeUnit::Nanosecond, _) - | Interval(_) - ) => true, - (Utf8, _) => to_type.is_numeric() && 
to_type != &Float16, - (LargeUtf8, + ( + Utf8 | LargeUtf8, Binary | LargeBinary | Utf8 + | LargeUtf8 | Date32 | Date64 - | Time32(TimeUnit::Second) - | Time32(TimeUnit::Millisecond) - | Time64(TimeUnit::Microsecond) - | Time64(TimeUnit::Nanosecond) - | Timestamp(TimeUnit::Second, _) - | Timestamp(TimeUnit::Millisecond, _) - | Timestamp(TimeUnit::Microsecond, _) - | Timestamp(TimeUnit::Nanosecond, _) - | Interval(_) + | Time32(Second) + | Time32(Millisecond) + | Time64(Microsecond) + | Time64(Nanosecond) + | Timestamp(Second, _) + | Timestamp(Millisecond, _) + | Timestamp(Microsecond, _) + | Timestamp(Nanosecond, _) + | Interval(_), ) => true, - (LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, + (Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), // start numeric casts ( - UInt8, - UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, - ) => true, - - ( - UInt16, - UInt8 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, - ) => true, - - ( - UInt32, - UInt8 | UInt16 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, - ) => true, - - ( - UInt64, - UInt8 | UInt16 | UInt32 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, - ) => true, - - ( - Int8, - UInt8 | UInt16 | UInt32 | UInt64 | Int16 | Int32 | Int64 | Float32 | Float64, - ) => true, - - ( - Int16, - UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int32 | Int64 | Float32 | Float64, - ) => true, - - ( - Int32, - UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int64 | Float32 | Float64, - ) => true, - - ( - Int64, - UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Float32 | Float64, - ) => true, - - ( - Float32, - UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float64, - ) => true, - - ( - Float64, - UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32, + UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, + UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, ) => true, // end numeric casts @@ -267,53 +211,49 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Int64, Date64 | Date32 | Time64(_)) => true, (Date64, Int64 | Int32) => true, (Time64(_), Int64) => true, - (Date32, Date64) => true, - (Date64, Date32) => true, - (Time32(TimeUnit::Second), Time32(TimeUnit::Millisecond)) => true, - (Time32(TimeUnit::Millisecond), Time32(TimeUnit::Second)) => true, + (Date32 | Date64, Date32 | Date64) => true, + // time casts + (Time32(_), Time32(_)) => true, (Time32(_), Time64(_)) => true, - (Time64(TimeUnit::Microsecond), Time64(TimeUnit::Nanosecond)) => true, - (Time64(TimeUnit::Nanosecond), Time64(TimeUnit::Microsecond)) => true, + (Time64(_), Time64(_)) => true, (Time64(_), Time32(to_unit)) => { - matches!(to_unit, TimeUnit::Second | TimeUnit::Millisecond) + matches!(to_unit, Second | Millisecond) } (Timestamp(_, _), Int64) => true, (Int64, Timestamp(_, _)) => true, (Date64, Timestamp(_, None)) => true, (Date32, Timestamp(_, None)) => true, - (Timestamp(_, _), + ( + Timestamp(_, _), Timestamp(_, _) | Date32 | Date64 - | Time32(TimeUnit::Second) - | Time32(TimeUnit::Millisecond) - | Time64(TimeUnit::Microsecond) - | Time64(TimeUnit::Nanosecond)) => true, + | Time32(Second) + | Time32(Millisecond) + | Time64(Microsecond) + | Time64(Nanosecond), + ) => true, (Int64, Duration(_)) => true, (Duration(_), Int64) => true, (Interval(from_type), Int64) => { match from_type { - 
IntervalUnit::YearMonth => true, - IntervalUnit::DayTime => true, - IntervalUnit::MonthDayNano => false, // Native type is i128 - } - } - (Int32, Interval(to_type)) => { - match to_type { - IntervalUnit::YearMonth => true, - IntervalUnit::DayTime => false, - IntervalUnit::MonthDayNano => false, - } - } - (Int64, Interval(to_type)) => { - match to_type { - IntervalUnit::YearMonth => false, - IntervalUnit::DayTime => true, - IntervalUnit::MonthDayNano => false, + YearMonth => true, + DayTime => true, + MonthDayNano => false, // Native type is i128 } } - (Duration(_), Interval(IntervalUnit::MonthDayNano)) => true, - (Interval(IntervalUnit::MonthDayNano), Duration(_)) => true, + (Int32, Interval(to_type)) => match to_type { + YearMonth => true, + DayTime => false, + MonthDayNano => false, + }, + (Int64, Interval(to_type)) => match to_type { + YearMonth => false, + DayTime => true, + MonthDayNano => false, + }, + (Duration(_), Interval(MonthDayNano)) => true, + (Interval(MonthDayNano), Duration(_)) => true, (_, _) => false, } } From 016e7a07f88ca510efe41269500a9130262a99bb Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Wed, 10 May 2023 01:37:17 -0700 Subject: [PATCH 0883/1411] Object Store (AWS): Support dynamically resolving S3 bucket region (#4188) * feat(object_store): resolve aws region using bucket name * feat(object_store): resolve bucket region as floating fn * fix(object_store): clippy warnings * Cleanup error handling --------- Co-authored-by: Raphael Taylor-Davies --- object_store/src/aws/mod.rs | 73 ++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 34d468f395a4..bc852ed48759 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -38,7 +38,7 @@ use futures::stream::BoxStream; use futures::TryStreamExt; use itertools::Itertools; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; use std::ops::Range; use std::str::FromStr; @@ -144,6 +144,18 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, + + #[snafu(display("Bucket '{}' not found", bucket))] + BucketNotFound { bucket: String }, + + #[snafu(display("Failed to resolve region for bucket '{}'", bucket))] + ResolveRegion { + bucket: String, + source: reqwest::Error, + }, + + #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] + RegionParse { bucket: String }, } impl From for super::Error { @@ -160,6 +172,38 @@ impl From for super::Error { } } +/// Get the bucket region using the [HeadBucket API]. This will fail if the bucket does not exist. +/// [HeadBucket API]: https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadBucket.html +pub async fn resolve_bucket_region( + bucket: &str, + client_options: &ClientOptions, +) -> Result { + use reqwest::StatusCode; + + let endpoint = format!("https://{}.s3.amazonaws.com", bucket); + + let client = client_options.client()?; + + let response = client + .head(&endpoint) + .send() + .await + .context(ResolveRegionSnafu { bucket })?; + + ensure!( + response.status() != StatusCode::NOT_FOUND, + BucketNotFoundSnafu { bucket } + ); + + let region = response + .headers() + .get("x-amz-bucket-region") + .and_then(|x| x.to_str().ok()) + .context(RegionParseSnafu { bucket })?; + + Ok(region.to_string()) +} + /// Interface for [Amazon S3](https://aws.amazon.com/s3/). 
#[derive(Debug)] pub struct AmazonS3 { @@ -1563,3 +1607,30 @@ mod tests { } } } + +#[cfg(test)] +mod s3_resolve_bucket_region_tests { + use super::*; + + #[tokio::test] + async fn test_private_bucket() { + let bucket = "bloxbender"; + + let region = resolve_bucket_region(bucket, &ClientOptions::new()) + .await + .unwrap(); + + let expected = "us-west-2".to_string(); + + assert_eq!(region, expected); + } + + #[tokio::test] + async fn test_bucket_does_not_exist() { + let bucket = "please-dont-exist"; + + let result = resolve_bucket_region(bucket, &ClientOptions::new()).await; + + assert!(result.is_err()); + } +} From 2ec8571e2527e64d3b82e16f03dd538a41be0fe7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 10 May 2023 14:21:55 +0100 Subject: [PATCH 0884/1411] Fix ImdsManagedIdentityProvider (#4096) (#4193) --- object_store/src/azure/credential.rs | 21 +++++++++++++++------ object_store/src/azure/mod.rs | 2 +- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index 0196d93d8d2a..8130df6361fd 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -50,8 +50,17 @@ pub(crate) const RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; const CONTENT_TYPE_JSON: &str = "application/json"; const MSI_SECRET_ENV_KEY: &str = "IDENTITY_HEADER"; const MSI_API_VERSION: &str = "2019-08-01"; + +/// OIDC scope used when interacting with OAuth2 APIs +/// +/// const AZURE_STORAGE_SCOPE: &str = "https://storage.azure.com/.default"; +/// Resource ID used when obtaining an access token from the metadata endpoint +/// +/// +const AZURE_STORAGE_RESOURCE: &str = "https://storage.azure.com"; + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("Error performing token request: {}", source))] @@ -383,7 +392,7 @@ struct MsiTokenResponse { /// This authentication type works in Azure VMs, App Service and Azure Functions applications, as well as the Azure Cloud Shell /// #[derive(Debug)] -pub struct ImdsManagedIdentityOAuthProvider { +pub struct ImdsManagedIdentityProvider { msi_endpoint: String, client_id: Option, object_id: Option, @@ -391,8 +400,8 @@ pub struct ImdsManagedIdentityOAuthProvider { client: Client, } -impl ImdsManagedIdentityOAuthProvider { - /// Create a new [`ImdsManagedIdentityOAuthProvider`] for an azure backed store +impl ImdsManagedIdentityProvider { + /// Create a new [`ImdsManagedIdentityProvider`] for an azure backed store pub fn new( client_id: Option, object_id: Option, @@ -415,7 +424,7 @@ impl ImdsManagedIdentityOAuthProvider { } #[async_trait::async_trait] -impl TokenCredential for ImdsManagedIdentityOAuthProvider { +impl TokenCredential for ImdsManagedIdentityProvider { /// Fetch a token async fn fetch_token( &self, @@ -424,7 +433,7 @@ impl TokenCredential for ImdsManagedIdentityOAuthProvider { ) -> Result> { let mut query_items = vec![ ("api-version", MSI_API_VERSION), - ("resource", AZURE_STORAGE_SCOPE), + ("resource", AZURE_STORAGE_RESOURCE), ]; let mut identity = None; @@ -709,7 +718,7 @@ mod tests { )) }); - let credential = ImdsManagedIdentityOAuthProvider::new( + let credential = ImdsManagedIdentityProvider::new( Some("client_id".into()), None, None, diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 11350a202c72..ddfd02820f1d 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -1035,7 +1035,7 @@ impl MicrosoftAzureBuilder { } else { let client = 
self.client_options.clone().with_allow_http(true).client()?; - let msi_credential = credential::ImdsManagedIdentityOAuthProvider::new( + let msi_credential = credential::ImdsManagedIdentityProvider::new( self.client_id, self.object_id, self.msi_resource_id, From b3a99819f0d0448ddd7b311a8bc5c9625f3544bd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 10 May 2023 14:51:19 +0100 Subject: [PATCH 0885/1411] Simplify ObjectStore configuration pattern (#4189) --- object_store/src/aws/checksum.rs | 17 +++- object_store/src/aws/mod.rs | 155 ++++++++++++------------------- object_store/src/azure/mod.rs | 107 +++++++-------------- object_store/src/client/retry.rs | 6 +- object_store/src/gcp/mod.rs | 112 +++++++--------------- 5 files changed, 139 insertions(+), 258 deletions(-) diff --git a/object_store/src/aws/checksum.rs b/object_store/src/aws/checksum.rs index c787c28a8df0..57762b641ac6 100644 --- a/object_store/src/aws/checksum.rs +++ b/object_store/src/aws/checksum.rs @@ -16,6 +16,7 @@ // under the License. use ring::digest::{self, digest as ring_digest}; +use std::str::FromStr; #[allow(non_camel_case_types)] #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -47,13 +48,21 @@ impl std::fmt::Display for Checksum { } } -impl TryFrom<&String> for Checksum { - type Error = (); +impl FromStr for Checksum { + type Err = (); - fn try_from(value: &String) -> Result { - match value.to_lowercase().as_str() { + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { "sha256" => Ok(Self::SHA256), _ => Err(()), } } } + +impl TryFrom<&String> for Checksum { + type Error = (); + + fn try_from(value: &String) -> Result { + value.parse() + } +} diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index bc852ed48759..5de177afa10a 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -467,7 +467,7 @@ pub struct AmazonS3Builder { /// When set to true, unsigned payload option has to be used unsigned_payload: bool, /// Checksum algorithm which has to be used for object integrity check during upload - checksum_algorithm: Option, + checksum_algorithm: Option, /// Metadata endpoint, see metadata_endpoint: Option, /// Profile name, see @@ -478,30 +478,17 @@ pub struct AmazonS3Builder { /// Configuration keys for [`AmazonS3Builder`] /// -/// Configuration via keys can be dome via the [`try_with_option`](AmazonS3Builder::try_with_option) -/// or [`with_options`](AmazonS3Builder::try_with_options) methods on the builder. 
+/// Configuration via keys can be done via [`AmazonS3Builder::with_config`] /// /// # Example /// ``` -/// use std::collections::HashMap; -/// use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; -/// -/// let options = HashMap::from([ -/// ("aws_access_key_id", "my-access-key-id"), -/// ("aws_secret_access_key", "my-secret-access-key"), -/// ]); -/// let typed_options = vec![ -/// (AmazonS3ConfigKey::DefaultRegion, "my-default-region"), -/// ]; -/// let aws = AmazonS3Builder::new() -/// .try_with_options(options) -/// .unwrap() -/// .try_with_options(typed_options) -/// .unwrap() -/// .try_with_option(AmazonS3ConfigKey::Region, "my-region") -/// .unwrap(); +/// # use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; +/// let builder = AmazonS3Builder::new() +/// .with_config("aws_access_key_id".parse().unwrap(), "my-access-key-id") +/// .with_config(AmazonS3ConfigKey::DefaultRegion, "my-default-region"); /// ``` #[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] +#[non_exhaustive] pub enum AmazonS3ConfigKey { /// AWS Access Key /// @@ -706,7 +693,7 @@ impl AmazonS3Builder { if let Ok(config_key) = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()) { - builder = builder.try_with_option(config_key, value).unwrap(); + builder = builder.with_config(config_key, value); } } } @@ -754,14 +741,12 @@ impl AmazonS3Builder { } /// Set an option on the builder via a key - value pair. - /// - /// This method will return an `UnknownConfigKey` error if key cannot be parsed into [`AmazonS3ConfigKey`]. - pub fn try_with_option( + pub fn with_config( mut self, - key: impl AsRef, + key: AmazonS3ConfigKey, value: impl Into, - ) -> Result { - match AmazonS3ConfigKey::from_str(key.as_ref())? { + ) -> Self { + match key { AmazonS3ConfigKey::AccessKeyId => self.access_key_id = Some(value.into()), AmazonS3ConfigKey::SecretAccessKey => { self.secret_access_key = Some(value.into()) @@ -786,18 +771,28 @@ impl AmazonS3Builder { AmazonS3ConfigKey::UnsignedPayload => { self.unsigned_payload = str_is_truthy(&value.into()) } - AmazonS3ConfigKey::Checksum => { - let algorithm = Checksum::try_from(&value.into()) - .map_err(|_| Error::InvalidChecksumAlgorithm)?; - self.checksum_algorithm = Some(algorithm) - } + AmazonS3ConfigKey::Checksum => self.checksum_algorithm = Some(value.into()), }; - Ok(self) + self + } + + /// Set an option on the builder via a key - value pair. + /// + /// This method will return an `UnknownConfigKey` error if key cannot be parsed into [`AmazonS3ConfigKey`]. + #[deprecated(note = "Use with_config")] + pub fn try_with_option( + self, + key: impl AsRef, + value: impl Into, + ) -> Result { + Ok(self.with_config(key.as_ref().parse()?, value)) } /// Hydrate builder from key value pairs /// /// This method will return an `UnknownConfigKey` error if any key cannot be parsed into [`AmazonS3ConfigKey`]. 
+ #[deprecated(note = "Use with_config")] + #[allow(deprecated)] pub fn try_with_options< I: IntoIterator, impl Into)>, >( @@ -838,7 +833,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint.clone(), AmazonS3ConfigKey::Profile => self.profile.clone(), AmazonS3ConfigKey::UnsignedPayload => Some(self.unsigned_payload.to_string()), - AmazonS3ConfigKey::Checksum => self.checksum_algorithm.map(|v| v.to_string()), + AmazonS3ConfigKey::Checksum => self.checksum_algorithm.clone(), } } @@ -979,7 +974,8 @@ impl AmazonS3Builder { /// /// [checksum algorithm]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html pub fn with_checksum_algorithm(mut self, checksum_algorithm: Checksum) -> Self { - self.checksum_algorithm = Some(checksum_algorithm); + // Convert to String to enable deferred parsing of config + self.checksum_algorithm = Some(checksum_algorithm.to_string()); self } @@ -1032,6 +1028,11 @@ impl AmazonS3Builder { let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; let region = self.region.context(MissingRegionSnafu)?; + let checksum = self + .checksum_algorithm + .map(|c| c.parse()) + .transpose() + .map_err(|_| Error::InvalidChecksumAlgorithm)?; let credentials = match (self.access_key_id, self.secret_access_key, self.token) { (Some(key_id), Some(secret_key), token) => { @@ -1129,7 +1130,7 @@ impl AmazonS3Builder { retry_config: self.retry_config, client_options: self.client_options, sign_payload: !self.unsigned_payload, - checksum: self.checksum_algorithm, + checksum, }; let client = Arc::new(S3Client::new(config)?); @@ -1303,7 +1304,10 @@ mod tests { assert_eq!(builder.token.unwrap(), aws_session_token); let metadata_uri = format!("{METADATA_ENDPOINT}{container_creds_relative_uri}"); assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); - assert_eq!(builder.checksum_algorithm.unwrap(), Checksum::SHA256); + assert_eq!( + builder.checksum_algorithm.unwrap(), + Checksum::SHA256.to_string() + ); assert!(builder.unsigned_payload); } @@ -1324,46 +1328,22 @@ mod tests { ("aws_checksum_algorithm", "sha256".to_string()), ]); - let builder = AmazonS3Builder::new() - .try_with_options(&options) - .unwrap() - .try_with_option("aws_secret_access_key", "new-secret-key") - .unwrap(); - assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); - assert_eq!(builder.secret_access_key.unwrap(), "new-secret-key"); - assert_eq!(builder.region.unwrap(), aws_default_region); - assert_eq!(builder.endpoint.unwrap(), aws_endpoint); - assert_eq!(builder.token.unwrap(), aws_session_token); - assert_eq!(builder.checksum_algorithm.unwrap(), Checksum::SHA256); - assert!(builder.unsigned_payload); - } - - #[test] - fn s3_test_config_from_typed_map() { - let aws_access_key_id = "object_store:fake_access_key_id".to_string(); - let aws_secret_access_key = "object_store:fake_secret_key".to_string(); - let aws_default_region = "object_store:fake_default_region".to_string(); - let aws_endpoint = "object_store:fake_endpoint".to_string(); - let aws_session_token = "object_store:fake_session_token".to_string(); - let options = HashMap::from([ - (AmazonS3ConfigKey::AccessKeyId, aws_access_key_id.clone()), - (AmazonS3ConfigKey::SecretAccessKey, aws_secret_access_key), - (AmazonS3ConfigKey::DefaultRegion, aws_default_region.clone()), - (AmazonS3ConfigKey::Endpoint, aws_endpoint.clone()), - (AmazonS3ConfigKey::Token, aws_session_token.clone()), - (AmazonS3ConfigKey::UnsignedPayload, "true".to_string()), - ]); + let builder = options + 
.into_iter() + .fold(AmazonS3Builder::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }) + .with_config(AmazonS3ConfigKey::SecretAccessKey, "new-secret-key"); - let builder = AmazonS3Builder::new() - .try_with_options(&options) - .unwrap() - .try_with_option(AmazonS3ConfigKey::SecretAccessKey, "new-secret-key") - .unwrap(); assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); assert_eq!(builder.secret_access_key.unwrap(), "new-secret-key"); assert_eq!(builder.region.unwrap(), aws_default_region); assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); + assert_eq!( + builder.checksum_algorithm.unwrap(), + Checksum::SHA256.to_string() + ); assert!(builder.unsigned_payload); } @@ -1374,19 +1354,15 @@ mod tests { let aws_default_region = "object_store:fake_default_region".to_string(); let aws_endpoint = "object_store:fake_endpoint".to_string(); let aws_session_token = "object_store:fake_session_token".to_string(); - let options = HashMap::from([ - (AmazonS3ConfigKey::AccessKeyId, aws_access_key_id.clone()), - ( - AmazonS3ConfigKey::SecretAccessKey, - aws_secret_access_key.clone(), - ), - (AmazonS3ConfigKey::DefaultRegion, aws_default_region.clone()), - (AmazonS3ConfigKey::Endpoint, aws_endpoint.clone()), - (AmazonS3ConfigKey::Token, aws_session_token.clone()), - (AmazonS3ConfigKey::UnsignedPayload, "true".to_string()), - ]); - let builder = AmazonS3Builder::new().try_with_options(&options).unwrap(); + let builder = AmazonS3Builder::new() + .with_config(AmazonS3ConfigKey::AccessKeyId, &aws_access_key_id) + .with_config(AmazonS3ConfigKey::SecretAccessKey, &aws_secret_access_key) + .with_config(AmazonS3ConfigKey::DefaultRegion, &aws_default_region) + .with_config(AmazonS3ConfigKey::Endpoint, &aws_endpoint) + .with_config(AmazonS3ConfigKey::Token, &aws_session_token) + .with_config(AmazonS3ConfigKey::UnsignedPayload, "true"); + assert_eq!( builder .get_config_value(&AmazonS3ConfigKey::AccessKeyId) @@ -1423,19 +1399,6 @@ mod tests { ); } - #[test] - fn s3_test_config_fallible_options() { - let aws_access_key_id = "object_store:fake_access_key_id".to_string(); - let aws_secret_access_key = "object_store:fake_secret_key".to_string(); - let options = HashMap::from([ - ("aws_access_key_id", aws_access_key_id), - ("invalid-key", aws_secret_access_key), - ]); - - let builder = AmazonS3Builder::new().try_with_options(&options); - assert!(builder.is_err()); - } - #[tokio::test] async fn s3_test() { let config = maybe_skip_integration!(); diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index ddfd02820f1d..15033dca7ae5 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -436,30 +436,17 @@ pub struct MicrosoftAzureBuilder { /// Configuration keys for [`MicrosoftAzureBuilder`] /// -/// Configuration via keys can be dome via the [`try_with_option`](MicrosoftAzureBuilder::try_with_option) -/// or [`with_options`](MicrosoftAzureBuilder::try_with_options) methods on the builder. 
+/// Configuration via keys can be done via [`MicrosoftAzureBuilder::with_config`] /// /// # Example /// ``` -/// use std::collections::HashMap; -/// use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; -/// -/// let options = HashMap::from([ -/// ("azure_client_id", "my-client-id"), -/// ("azure_client_secret", "my-account-name"), -/// ]); -/// let typed_options = vec![ -/// (AzureConfigKey::AccountName, "my-account-name"), -/// ]; -/// let azure = MicrosoftAzureBuilder::new() -/// .try_with_options(options) -/// .unwrap() -/// .try_with_options(typed_options) -/// .unwrap() -/// .try_with_option(AzureConfigKey::AuthorityId, "my-tenant-id") -/// .unwrap(); +/// # use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; +/// let builder = MicrosoftAzureBuilder::new() +/// .with_config("azure_client_id".parse().unwrap(), "my-client-id") +/// .with_config(AzureConfigKey::AuthorityId, "my-tenant-id"); /// ``` #[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Deserialize, Serialize)] +#[non_exhaustive] pub enum AzureConfigKey { /// The name of the azure storage account /// @@ -678,7 +665,7 @@ impl MicrosoftAzureBuilder { if let Ok(config_key) = AzureConfigKey::from_str(&key.to_ascii_lowercase()) { - builder = builder.try_with_option(config_key, value).unwrap(); + builder = builder.with_config(config_key, value); } } } @@ -724,12 +711,8 @@ impl MicrosoftAzureBuilder { } /// Set an option on the builder via a key - value pair. - pub fn try_with_option( - mut self, - key: impl AsRef, - value: impl Into, - ) -> Result { - match AzureConfigKey::from_str(key.as_ref())? { + pub fn with_config(mut self, key: AzureConfigKey, value: impl Into) -> Self { + match key { AzureConfigKey::AccessKey => self.access_key = Some(value.into()), AzureConfigKey::AccountName => self.account_name = Some(value.into()), AzureConfigKey::ClientId => self.client_id = Some(value.into()), @@ -750,10 +733,22 @@ impl MicrosoftAzureBuilder { self.use_emulator = str_is_truthy(&value.into()) } }; - Ok(self) + self + } + + /// Set an option on the builder via a key - value pair. 
+ #[deprecated(note = "Use with_config")] + pub fn try_with_option( + self, + key: impl AsRef, + value: impl Into, + ) -> Result { + Ok(self.with_config(key.as_ref().parse()?, value)) } /// Hydrate builder from key value pairs + #[deprecated(note = "Use with_config")] + #[allow(deprecated)] pub fn try_with_options< I: IntoIterator, impl Into)>, >( @@ -1270,31 +1265,11 @@ mod tests { ("azure_storage_token", azure_storage_token), ]); - let builder = MicrosoftAzureBuilder::new() - .try_with_options(options) - .unwrap(); - assert_eq!(builder.client_id.unwrap(), azure_client_id); - assert_eq!(builder.account_name.unwrap(), azure_storage_account_name); - assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); - } - - #[test] - fn azure_test_config_from_typed_map() { - let azure_client_id = "object_store:fake_access_key_id".to_string(); - let azure_storage_account_name = "object_store:fake_secret_key".to_string(); - let azure_storage_token = "object_store:fake_default_region".to_string(); - let options = HashMap::from([ - (AzureConfigKey::ClientId, azure_client_id.clone()), - ( - AzureConfigKey::AccountName, - azure_storage_account_name.clone(), - ), - (AzureConfigKey::Token, azure_storage_token.clone()), - ]); - - let builder = MicrosoftAzureBuilder::new() - .try_with_options(&options) - .unwrap(); + let builder = options + .into_iter() + .fold(MicrosoftAzureBuilder::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }); assert_eq!(builder.client_id.unwrap(), azure_client_id); assert_eq!(builder.account_name.unwrap(), azure_storage_account_name); assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); @@ -1305,18 +1280,11 @@ mod tests { let azure_client_id = "object_store:fake_access_key_id".to_string(); let azure_storage_account_name = "object_store:fake_secret_key".to_string(); let azure_storage_token = "object_store:fake_default_region".to_string(); - let options = HashMap::from([ - (AzureConfigKey::ClientId, azure_client_id.clone()), - ( - AzureConfigKey::AccountName, - azure_storage_account_name.clone(), - ), - (AzureConfigKey::Token, azure_storage_token.clone()), - ]); - let builder = MicrosoftAzureBuilder::new() - .try_with_options(&options) - .unwrap(); + .with_config(AzureConfigKey::ClientId, &azure_client_id) + .with_config(AzureConfigKey::AccountName, &azure_storage_account_name) + .with_config(AzureConfigKey::Token, &azure_storage_token); + assert_eq!( builder.get_config_value(&AzureConfigKey::ClientId).unwrap(), azure_client_id @@ -1333,19 +1301,6 @@ mod tests { ); } - #[test] - fn azure_test_config_fallible_options() { - let azure_client_id = "object_store:fake_access_key_id".to_string(); - let azure_storage_token = "object_store:fake_default_region".to_string(); - let options = HashMap::from([ - ("azure_client_id", azure_client_id), - ("invalid-key", azure_storage_token), - ]); - - let builder = MicrosoftAzureBuilder::new().try_with_options(&options); - assert!(builder.is_err()); - } - #[test] fn azure_test_split_sas() { let raw_sas = "?sv=2021-10-04&st=2023-01-04T17%3A48%3A57Z&se=2023-01-04T18%3A15%3A00Z&sr=c&sp=rcwl&sig=C7%2BZeEOWbrxPA3R0Cw%2Fw1EZz0%2B4KBvQexeKZKe%2BB6h0%3D"; diff --git a/object_store/src/client/retry.rs b/object_store/src/client/retry.rs index e6e92f086b2b..f9c2dd30088d 100644 --- a/object_store/src/client/retry.rs +++ b/object_store/src/client/retry.rs @@ -22,9 +22,9 @@ use futures::future::BoxFuture; use futures::FutureExt; use reqwest::header::LOCATION; use reqwest::{Response, StatusCode}; +use snafu::Error 
as SnafuError; use std::time::{Duration, Instant}; use tracing::info; -use snafu::Error as SnafuError; /// Retry request error #[derive(Debug)] @@ -365,13 +365,13 @@ mod tests { assert_eq!(e.message, "502 Bad Gateway"); // Panic results in an incomplete message error in the client - mock.push_fn(|_| {panic!()}); + mock.push_fn(|_| panic!()); let r = do_request().await.unwrap(); assert_eq!(r.status(), StatusCode::OK); // Gives up after retrying mulitiple panics for _ in 0..=retry.max_retries { - mock.push_fn(|_| {panic!()}); + mock.push_fn(|_| panic!()); } let e = do_request().await.unwrap_err(); assert_eq!(e.retries, retry.max_retries); diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index a6cf660220bd..6f3d53d42f34 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -786,29 +786,17 @@ pub struct GoogleCloudStorageBuilder { /// Configuration keys for [`GoogleCloudStorageBuilder`] /// -/// Configuration via keys can be done via the [`try_with_option`](GoogleCloudStorageBuilder::try_with_option) -/// or [`try_with_options`](GoogleCloudStorageBuilder::try_with_options) methods on the builder. +/// Configuration via keys can be done via [`GoogleCloudStorageBuilder::with_config`] /// /// # Example /// ``` -/// use std::collections::HashMap; -/// use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; -/// -/// let options = HashMap::from([ -/// ("google_service_account", "my-service-account"), -/// ]); -/// let typed_options = vec![ -/// (GoogleConfigKey::Bucket, "my-bucket"), -/// ]; -/// let azure = GoogleCloudStorageBuilder::new() -/// .try_with_options(options) -/// .unwrap() -/// .try_with_options(typed_options) -/// .unwrap() -/// .try_with_option(GoogleConfigKey::Bucket, "my-new-bucket") -/// .unwrap(); +/// # use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; +/// let builder = GoogleCloudStorageBuilder::new() +/// .with_config("google_service_account".parse().unwrap(), "my-service-account") +/// .with_config(GoogleConfigKey::Bucket, "my-bucket"); /// ``` #[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] +#[non_exhaustive] pub enum GoogleConfigKey { /// Path to the service account file /// @@ -926,7 +914,7 @@ impl GoogleCloudStorageBuilder { if let Ok(config_key) = GoogleConfigKey::from_str(&key.to_ascii_lowercase()) { - builder = builder.try_with_option(config_key, value).unwrap(); + builder = builder.with_config(config_key, value); } } } @@ -957,12 +945,8 @@ impl GoogleCloudStorageBuilder { } /// Set an option on the builder via a key - value pair. - pub fn try_with_option( - mut self, - key: impl AsRef, - value: impl Into, - ) -> Result { - match GoogleConfigKey::from_str(key.as_ref())? { + pub fn with_config(mut self, key: GoogleConfigKey, value: impl Into) -> Self { + match key { GoogleConfigKey::ServiceAccount => { self.service_account_path = Some(value.into()) } @@ -974,10 +958,22 @@ impl GoogleCloudStorageBuilder { self.application_credentials_path = Some(value.into()) } }; - Ok(self) + self + } + + /// Set an option on the builder via a key - value pair. 
+ #[deprecated(note = "Use with_config")] + pub fn try_with_option( + self, + key: impl AsRef, + value: impl Into, + ) -> Result { + Ok(self.with_config(key.as_ref().parse()?, value)) } /// Hydrate builder from key value pairs + #[deprecated(note = "Use with_config")] + #[allow(deprecated)] pub fn try_with_options< I: IntoIterator, impl Into)>, >( @@ -1449,31 +1445,12 @@ mod test { ("google_bucket_name", google_bucket_name.clone()), ]); - let builder = GoogleCloudStorageBuilder::new() - .try_with_options(&options) - .unwrap(); - assert_eq!( - builder.service_account_path.unwrap(), - google_service_account.as_str() - ); - assert_eq!(builder.bucket_name.unwrap(), google_bucket_name.as_str()); - } + let builder = options + .iter() + .fold(GoogleCloudStorageBuilder::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }); - #[test] - fn gcs_test_config_from_typed_map() { - let google_service_account = "object_store:fake_service_account".to_string(); - let google_bucket_name = "object_store:fake_bucket".to_string(); - let options = HashMap::from([ - ( - GoogleConfigKey::ServiceAccount, - google_service_account.clone(), - ), - (GoogleConfigKey::Bucket, google_bucket_name.clone()), - ]); - - let builder = GoogleCloudStorageBuilder::new() - .try_with_options(&options) - .unwrap(); assert_eq!( builder.service_account_path.unwrap(), google_service_account.as_str() @@ -1485,17 +1462,10 @@ mod test { fn gcs_test_config_get_value() { let google_service_account = "object_store:fake_service_account".to_string(); let google_bucket_name = "object_store:fake_bucket".to_string(); - let options = HashMap::from([ - ( - GoogleConfigKey::ServiceAccount, - google_service_account.clone(), - ), - (GoogleConfigKey::Bucket, google_bucket_name.clone()), - ]); - let builder = GoogleCloudStorageBuilder::new() - .try_with_options(&options) - .unwrap(); + .with_config(GoogleConfigKey::ServiceAccount, &google_service_account) + .with_config(GoogleConfigKey::Bucket, &google_bucket_name); + assert_eq!( builder .get_config_value(&GoogleConfigKey::ServiceAccount) @@ -1508,19 +1478,6 @@ mod test { ); } - #[test] - fn gcs_test_config_fallible_options() { - let google_service_account = "object_store:fake_service_account".to_string(); - let google_bucket_name = "object_store:fake_bucket".to_string(); - let options = HashMap::from([ - ("google_service_account", google_service_account), - ("invalid-key", google_bucket_name), - ]); - - let builder = GoogleCloudStorageBuilder::new().try_with_options(&options); - assert!(builder.is_err()); - } - #[test] fn gcs_test_config_aliases() { // Service account path @@ -1531,16 +1488,14 @@ mod test { "service_account_path", ] { let builder = GoogleCloudStorageBuilder::new() - .try_with_options([(alias, "/fake/path.json")]) - .unwrap(); + .with_config(alias.parse().unwrap(), "/fake/path.json"); assert_eq!("/fake/path.json", builder.service_account_path.unwrap()); } // Service account key for alias in ["google_service_account_key", "service_account_key"] { let builder = GoogleCloudStorageBuilder::new() - .try_with_options([(alias, FAKE_KEY)]) - .unwrap(); + .with_config(alias.parse().unwrap(), FAKE_KEY); assert_eq!(FAKE_KEY, builder.service_account_key.unwrap()); } @@ -1552,8 +1507,7 @@ mod test { "bucket_name", ] { let builder = GoogleCloudStorageBuilder::new() - .try_with_options([(alias, "fake_bucket")]) - .unwrap(); + .with_config(alias.parse().unwrap(), "fake_bucket"); assert_eq!("fake_bucket", builder.bucket_name.unwrap()); } } From 
615dde061e79652594340bdce648fbec5e15ea96 Mon Sep 17 00:00:00 2001
From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>
Date: Wed, 10 May 2023 18:43:13 +0100
Subject: [PATCH 0886/1411] Recognise R2 URLs (#4190) (#4194)

---
 object_store/src/aws/mod.rs | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs
index 5de177afa10a..6ea24fb70a80 100644
--- a/object_store/src/aws/mod.rs
+++ b/object_store/src/aws/mod.rs
@@ -724,6 +724,7 @@ impl AmazonS3Builder {
     /// - `s3a://<bucket>/<path>`
     /// - `https://s3.<region>.amazonaws.com`
     /// - `https://<bucket>.s3.<region>.amazonaws.com`
+    /// - `https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket`
     ///
     /// Note: Settings derived from the URL will override any others set on this builder
     ///
@@ -849,9 +850,8 @@ impl AmazonS3Builder {
             "https" => match host.splitn(4, '.').collect_tuple() {
                 Some(("s3", region, "amazonaws", "com")) => {
                     self.region = Some(region.to_string());
-                    if let Some(bucket) =
-                        parsed.path_segments().and_then(|mut path| path.next())
-                    {
+                    let bucket = parsed.path_segments().into_iter().flatten().next();
+                    if let Some(bucket) = bucket {
                         self.bucket_name = Some(bucket.into());
                     }
                 }
@@ -860,6 +860,16 @@ impl AmazonS3Builder {
                     self.region = Some(region.to_string());
                     self.virtual_hosted_style_request = true;
                 }
+                Some((account, "r2", "cloudflarestorage", "com")) => {
+                    self.region = Some("auto".to_string());
+                    let endpoint = format!("https://{account}.r2.cloudflarestorage.com");
+                    self.endpoint = Some(endpoint);
+
+                    let bucket = parsed.path_segments().into_iter().flatten().next();
+                    if let Some(bucket) = bucket {
+                        self.bucket_name = Some(bucket.into());
+                    }
+                }
                 _ => return Err(UrlNotRecognisedSnafu { url }.build().into()),
             },
             scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()),
@@ -1556,6 +1566,18 @@ mod tests {
         assert_eq!(builder.region, Some("region".to_string()));
         assert!(builder.virtual_hosted_style_request);
+        let mut builder = AmazonS3Builder::new();
+        builder
+            .parse_url("https://account123.r2.cloudflarestorage.com/bucket-123")
+            .unwrap();
+
+        assert_eq!(builder.bucket_name, Some("bucket-123".to_string()));
+        assert_eq!(builder.region, Some("auto".to_string()));
+        assert_eq!(
+            builder.endpoint,
+            Some("https://account123.r2.cloudflarestorage.com".to_string())
+        );
+
         let err_cases = [
             "mailto://bucket/path",
             "https://s3.bucket.mydomain.com",
From b314118a01250a06ef324f84abe50edd527e7023 Mon Sep 17 00:00:00 2001
From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>
Date: Wed, 10 May 2023 18:44:30 +0100
Subject: [PATCH 0887/1411] Deferred config parsing (#4191) (#4192)

---
 object_store/src/aws/checksum.rs | 10 ++++
 object_store/src/aws/mod.rs | 100 +++++++++++++++++++------------
 object_store/src/azure/mod.rs | 29 +++++----
 object_store/src/client/mod.rs | 31 +++++++++-
 object_store/src/config.rs | 81 +++++++++++++++++++++++++
 object_store/src/lib.rs | 3 +
 object_store/src/util.rs | 9 ---
 7 files changed, 198 insertions(+), 65 deletions(-)
 create mode 100644 object_store/src/config.rs

diff --git a/object_store/src/aws/checksum.rs b/object_store/src/aws/checksum.rs
index 57762b641ac6..a50bd2d18b9c 100644
--- a/object_store/src/aws/checksum.rs
+++ b/object_store/src/aws/checksum.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
+use crate::config::Parse; use ring::digest::{self, digest as ring_digest}; use std::str::FromStr; @@ -66,3 +67,12 @@ impl TryFrom<&String> for Checksum { value.parse() } } + +impl Parse for Checksum { + fn parse(v: &str) -> crate::Result { + v.parse().map_err(|_| crate::Error::Generic { + store: "Config", + source: format!("\"{v}\" is not a valid checksum algorithm").into(), + }) + } +} diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 6ea24fb70a80..fe49471c4907 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -53,8 +53,9 @@ use crate::aws::credential::{ AwsCredential, CredentialProvider, InstanceCredentialProvider, StaticCredentialProvider, WebIdentityProvider, }; +use crate::client::ClientConfigKey; +use crate::config::ConfigValue; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; -use crate::util::str_is_truthy; use crate::{ ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, RetryConfig, StreamExt, @@ -103,9 +104,6 @@ enum Error { source: std::num::ParseIntError, }, - #[snafu(display("Invalid Checksum algorithm"))] - InvalidChecksumAlgorithm, - #[snafu(display("Missing region"))] MissingRegion, @@ -461,13 +459,13 @@ pub struct AmazonS3Builder { /// Retry config retry_config: RetryConfig, /// When set to true, fallback to IMDSv1 - imdsv1_fallback: bool, + imdsv1_fallback: ConfigValue, /// When set to true, virtual hosted style request has to be used - virtual_hosted_style_request: bool, + virtual_hosted_style_request: ConfigValue, /// When set to true, unsigned payload option has to be used - unsigned_payload: bool, + unsigned_payload: ConfigValue, /// Checksum algorithm which has to be used for object integrity check during upload - checksum_algorithm: Option, + checksum_algorithm: Option>, /// Metadata endpoint, see metadata_endpoint: Option, /// Profile name, see @@ -709,8 +707,9 @@ impl AmazonS3Builder { } if let Ok(text) = std::env::var("AWS_ALLOW_HTTP") { - builder.client_options = - builder.client_options.with_allow_http(str_is_truthy(&text)); + builder.client_options = builder + .client_options + .with_config(ClientConfigKey::AllowHttp, text); } builder @@ -756,11 +755,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::Bucket => self.bucket_name = Some(value.into()), AmazonS3ConfigKey::Endpoint => self.endpoint = Some(value.into()), AmazonS3ConfigKey::Token => self.token = Some(value.into()), - AmazonS3ConfigKey::ImdsV1Fallback => { - self.imdsv1_fallback = str_is_truthy(&value.into()) - } + AmazonS3ConfigKey::ImdsV1Fallback => self.imdsv1_fallback.parse(value), AmazonS3ConfigKey::VirtualHostedStyleRequest => { - self.virtual_hosted_style_request = str_is_truthy(&value.into()) + self.virtual_hosted_style_request.parse(value) } AmazonS3ConfigKey::DefaultRegion => { self.region = self.region.or_else(|| Some(value.into())) @@ -769,10 +766,10 @@ impl AmazonS3Builder { self.metadata_endpoint = Some(value.into()) } AmazonS3ConfigKey::Profile => self.profile = Some(value.into()), - AmazonS3ConfigKey::UnsignedPayload => { - self.unsigned_payload = str_is_truthy(&value.into()) + AmazonS3ConfigKey::UnsignedPayload => self.unsigned_payload.parse(value), + AmazonS3ConfigKey::Checksum => { + self.checksum_algorithm = Some(ConfigValue::Deferred(value.into())) } - AmazonS3ConfigKey::Checksum => self.checksum_algorithm = Some(value.into()), }; self } @@ -834,7 +831,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint.clone(), 
AmazonS3ConfigKey::Profile => self.profile.clone(), AmazonS3ConfigKey::UnsignedPayload => Some(self.unsigned_payload.to_string()), - AmazonS3ConfigKey::Checksum => self.checksum_algorithm.clone(), + AmazonS3ConfigKey::Checksum => { + self.checksum_algorithm.as_ref().map(ToString::to_string) + } } } @@ -858,7 +857,7 @@ impl AmazonS3Builder { Some((bucket, "s3", region, "amazonaws.com")) => { self.bucket_name = Some(bucket.to_string()); self.region = Some(region.to_string()); - self.virtual_hosted_style_request = true; + self.virtual_hosted_style_request = true.into(); } Some((account, "r2", "cloudflarestorage", "com")) => { self.region = Some("auto".to_string()); @@ -944,7 +943,7 @@ impl AmazonS3Builder { mut self, virtual_hosted_style_request: bool, ) -> Self { - self.virtual_hosted_style_request = virtual_hosted_style_request; + self.virtual_hosted_style_request = virtual_hosted_style_request.into(); self } @@ -967,7 +966,7 @@ impl AmazonS3Builder { /// [SSRF attack]: https://aws.amazon.com/blogs/security/defense-in-depth-open-firewalls-reverse-proxies-ssrf-vulnerabilities-ec2-instance-metadata-service/ /// pub fn with_imdsv1_fallback(mut self) -> Self { - self.imdsv1_fallback = true; + self.imdsv1_fallback = true.into(); self } @@ -976,7 +975,7 @@ impl AmazonS3Builder { /// * false (default): Signed payload option is used, where the checksum for the request body is computed and included when constructing a canonical request. /// * true: Unsigned payload option is used. `UNSIGNED-PAYLOAD` literal is included when constructing a canonical request, pub fn with_unsigned_payload(mut self, unsigned_payload: bool) -> Self { - self.unsigned_payload = unsigned_payload; + self.unsigned_payload = unsigned_payload.into(); self } @@ -985,7 +984,7 @@ impl AmazonS3Builder { /// [checksum algorithm]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html pub fn with_checksum_algorithm(mut self, checksum_algorithm: Checksum) -> Self { // Convert to String to enable deferred parsing of config - self.checksum_algorithm = Some(checksum_algorithm.to_string()); + self.checksum_algorithm = Some(checksum_algorithm.into()); self } @@ -1038,11 +1037,7 @@ impl AmazonS3Builder { let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; let region = self.region.context(MissingRegionSnafu)?; - let checksum = self - .checksum_algorithm - .map(|c| c.parse()) - .transpose() - .map_err(|_| Error::InvalidChecksumAlgorithm)?; + let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; let credentials = match (self.access_key_id, self.secret_access_key, self.token) { (Some(key_id), Some(secret_key), token) => { @@ -1103,7 +1098,7 @@ impl AmazonS3Builder { cache: Default::default(), client: client_options.client()?, retry_config: self.retry_config.clone(), - imdsv1_fallback: self.imdsv1_fallback, + imdsv1_fallback: self.imdsv1_fallback.get()?, metadata_endpoint: self .metadata_endpoint .unwrap_or_else(|| METADATA_ENDPOINT.into()), @@ -1119,7 +1114,7 @@ impl AmazonS3Builder { // If `endpoint` is provided then its assumed to be consistent with // `virtual_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then // `endpoint` should have bucket name included. - if self.virtual_hosted_style_request { + if self.virtual_hosted_style_request.get()? 
{ endpoint = self .endpoint .unwrap_or_else(|| format!("https://{bucket}.s3.{region}.amazonaws.com")); @@ -1139,7 +1134,7 @@ impl AmazonS3Builder { credentials, retry_config: self.retry_config, client_options: self.client_options, - sign_payload: !self.unsigned_payload, + sign_payload: !self.unsigned_payload.get()?, checksum, }; @@ -1315,10 +1310,10 @@ mod tests { let metadata_uri = format!("{METADATA_ENDPOINT}{container_creds_relative_uri}"); assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); assert_eq!( - builder.checksum_algorithm.unwrap(), - Checksum::SHA256.to_string() + builder.checksum_algorithm.unwrap().get().unwrap(), + Checksum::SHA256 ); - assert!(builder.unsigned_payload); + assert!(builder.unsigned_payload.get().unwrap()); } #[test] @@ -1351,10 +1346,10 @@ mod tests { assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); assert_eq!( - builder.checksum_algorithm.unwrap(), - Checksum::SHA256.to_string() + builder.checksum_algorithm.unwrap().get().unwrap(), + Checksum::SHA256 ); - assert!(builder.unsigned_payload); + assert!(builder.unsigned_payload.get().unwrap()); } #[test] @@ -1564,7 +1559,7 @@ mod tests { .unwrap(); assert_eq!(builder.bucket_name, Some("bucket".to_string())); assert_eq!(builder.region, Some("region".to_string())); - assert!(builder.virtual_hosted_style_request); + assert!(builder.virtual_hosted_style_request.get().unwrap()); let mut builder = AmazonS3Builder::new(); builder @@ -1591,6 +1586,35 @@ mod tests { builder.parse_url(case).unwrap_err(); } } + + #[test] + fn test_invalid_config() { + let err = AmazonS3Builder::new() + .with_config(AmazonS3ConfigKey::ImdsV1Fallback, "enabled") + .with_bucket_name("bucket") + .with_region("region") + .build() + .unwrap_err() + .to_string(); + + assert_eq!( + err, + "Generic Config error: failed to parse \"enabled\" as boolean" + ); + + let err = AmazonS3Builder::new() + .with_config(AmazonS3ConfigKey::Checksum, "md5") + .with_bucket_name("bucket") + .with_region("region") + .build() + .unwrap_err() + .to_string(); + + assert_eq!( + err, + "Generic Config error: \"md5\" is not a valid checksum algorithm" + ); + } } #[cfg(test)] diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 15033dca7ae5..2b5b43adabe0 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -51,7 +51,9 @@ use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; use url::Url; -use crate::util::{str_is_truthy, RFC1123_FMT}; +use crate::client::ClientConfigKey; +use crate::config::ConfigValue; +use crate::util::RFC1123_FMT; pub use credential::authority_hosts; mod client; @@ -417,7 +419,7 @@ pub struct MicrosoftAzureBuilder { /// Url url: Option, /// When set to true, azurite storage emulator has to be used - use_emulator: bool, + use_emulator: ConfigValue, /// Msi endpoint for acquiring managed identity token msi_endpoint: Option, /// Object id for use with managed identity authentication @@ -427,7 +429,7 @@ pub struct MicrosoftAzureBuilder { /// File containing token for Azure AD workload identity federation federated_token_file: Option, /// When set to true, azure cli has to be used for acquiring access token - use_azure_cli: bool, + use_azure_cli: ConfigValue, /// Retry config retry_config: RetryConfig, /// Client options @@ -672,8 +674,9 @@ impl MicrosoftAzureBuilder { } if let Ok(text) = std::env::var("AZURE_ALLOW_HTTP") { - builder.client_options = - builder.client_options.with_allow_http(str_is_truthy(&text)); 
+ builder.client_options = builder + .client_options + .with_config(ClientConfigKey::AllowHttp, text) } if let Ok(text) = std::env::var(MSI_ENDPOINT_ENV_KEY) { @@ -726,12 +729,8 @@ impl MicrosoftAzureBuilder { AzureConfigKey::FederatedTokenFile => { self.federated_token_file = Some(value.into()) } - AzureConfigKey::UseAzureCli => { - self.use_azure_cli = str_is_truthy(&value.into()) - } - AzureConfigKey::UseEmulator => { - self.use_emulator = str_is_truthy(&value.into()) - } + AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), + AzureConfigKey::UseEmulator => self.use_emulator.parse(value), }; self } @@ -898,7 +897,7 @@ impl MicrosoftAzureBuilder { /// Set if the Azure emulator should be used (defaults to false) pub fn with_use_emulator(mut self, use_emulator: bool) -> Self { - self.use_emulator = use_emulator; + self.use_emulator = use_emulator.into(); self } @@ -956,7 +955,7 @@ impl MicrosoftAzureBuilder { /// Set if the Azure Cli should be used for acquiring access token /// pub fn with_use_azure_cli(mut self, use_azure_cli: bool) -> Self { - self.use_azure_cli = use_azure_cli; + self.use_azure_cli = use_azure_cli.into(); self } @@ -969,7 +968,7 @@ impl MicrosoftAzureBuilder { let container = self.container_name.ok_or(Error::MissingContainerName {})?; - let (is_emulator, storage_url, auth, account) = if self.use_emulator { + let (is_emulator, storage_url, auth, account) = if self.use_emulator.get()? { let account_name = self .account_name .unwrap_or_else(|| EMULATOR_ACCOUNT.to_string()); @@ -1022,7 +1021,7 @@ impl MicrosoftAzureBuilder { credential::CredentialProvider::SASToken(query_pairs) } else if let Some(sas) = self.sas_key { credential::CredentialProvider::SASToken(split_sas(&sas)?) - } else if self.use_azure_cli { + } else if self.use_azure_cli.get()? { credential::CredentialProvider::TokenCredential( TokenCache::default(), Box::new(credential::AzureCliCredential::new()), diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index d019e8119ac2..d7b0b86d99e5 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -26,8 +26,10 @@ pub mod retry; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod token; +use crate::config::ConfigValue; use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, ClientBuilder, Proxy}; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::time::Duration; @@ -43,6 +45,14 @@ fn map_client_error(e: reqwest::Error) -> super::Error { static DEFAULT_USER_AGENT: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"),); +/// Configuration keys for [`ClientOptions`] +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Deserialize, Serialize)] +#[non_exhaustive] +pub enum ClientConfigKey { + /// Allow non-TLS, i.e. 
non-HTTPS connections + AllowHttp, +} + /// HTTP client configuration for remote object stores #[derive(Debug, Clone, Default)] pub struct ClientOptions { @@ -51,7 +61,7 @@ pub struct ClientOptions { default_content_type: Option, default_headers: Option, proxy_url: Option, - allow_http: bool, + allow_http: ConfigValue, allow_insecure: bool, timeout: Option, connect_timeout: Option, @@ -70,6 +80,21 @@ impl ClientOptions { Default::default() } + /// Set an option by key + pub fn with_config(mut self, key: ClientConfigKey, value: impl Into) -> Self { + match key { + ClientConfigKey::AllowHttp => self.allow_http.parse(value), + } + self + } + + /// Get an option by key + pub fn get_config_value(&self, key: &ClientConfigKey) -> Option { + match key { + ClientConfigKey::AllowHttp => Some(self.allow_http.to_string()), + } + } + /// Sets the User-Agent header to be used by this client /// /// Default is based on the version of this crate @@ -104,7 +129,7 @@ impl ClientOptions { /// * false (default): Only HTTPS are allowed /// * true: HTTP and HTTPS are allowed pub fn with_allow_http(mut self, allow_http: bool) -> Self { - self.allow_http = allow_http; + self.allow_http = allow_http.into(); self } /// Allows connections to invalid SSL certificates @@ -280,7 +305,7 @@ impl ClientOptions { } builder - .https_only(!self.allow_http) + .https_only(!self.allow_http.get()?) .build() .map_err(map_client_error) } diff --git a/object_store/src/config.rs b/object_store/src/config.rs new file mode 100644 index 000000000000..3ecce2e52bf1 --- /dev/null +++ b/object_store/src/config.rs @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
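The `ClientOptions::with_config` / `get_config_value` pair added in the client module above applies the same deferred pattern to `allow_http`. A short sketch of its behaviour; the `use` path below assumes `ClientConfigKey` is re-exported from the crate root, which this diff does not show (within the crate it is `crate::client::ClientConfigKey`):

```rust
// Sketch only: assumes `ClientConfigKey` is reachable from the crate root.
use object_store::{ClientConfigKey, ClientOptions};

fn main() {
    // The value is stored as an unparsed string; it is only parsed to a bool
    // when the underlying HTTP client is constructed, so a bad value becomes
    // an error at that point instead of silently defaulting to `false`.
    let options = ClientOptions::new().with_config(ClientConfigKey::AllowHttp, "true");

    assert_eq!(
        options.get_config_value(&ClientConfigKey::AllowHttp),
        Some("true".to_string())
    );
}
```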
+ +use crate::{Error, Result}; +use std::fmt::{Debug, Display, Formatter}; + +/// Provides deferred parsing of a value +/// +/// This allows builders to defer fallibility to build +#[derive(Debug, Clone)] +pub enum ConfigValue { + Parsed(T), + Deferred(String), +} + +impl Display for ConfigValue { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Parsed(v) => write!(f, "{v}"), + Self::Deferred(v) => write!(f, "{v}"), + } + } +} + +impl From for ConfigValue { + fn from(value: T) -> Self { + Self::Parsed(value) + } +} + +impl ConfigValue { + pub fn parse(&mut self, v: impl Into) { + *self = Self::Deferred(v.into()) + } + + pub fn get(&self) -> Result { + match self { + Self::Parsed(v) => Ok(v.clone()), + Self::Deferred(v) => T::parse(v), + } + } +} + +impl Default for ConfigValue { + fn default() -> Self { + Self::Parsed(T::default()) + } +} + +/// A value that can be stored in [`ConfigValue`] +pub trait Parse: Sized { + fn parse(v: &str) -> Result; +} + +impl Parse for bool { + fn parse(v: &str) -> Result { + let lower = v.to_ascii_lowercase(); + match lower.as_str() { + "1" | "true" | "on" | "yes" | "y" => Ok(true), + "0" | "false" | "off" | "no" | "n" => Ok(false), + _ => Err(Error::Generic { + store: "Config", + source: format!("failed to parse \"{v}\" as boolean").into(), + }), + } + } +} diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index c31027c0715c..1390a0140d1c 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -247,6 +247,9 @@ mod client; #[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] pub use client::{backoff::BackoffConfig, retry::RetryConfig}; +#[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] +mod config; + #[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] mod multipart; mod util; diff --git a/object_store/src/util.rs b/object_store/src/util.rs index 1ec63f219a20..e5c701dd8b1b 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -185,15 +185,6 @@ fn merge_ranges( ret } -#[allow(dead_code)] -pub(crate) fn str_is_truthy(val: &str) -> bool { - val.eq_ignore_ascii_case("1") - | val.eq_ignore_ascii_case("true") - | val.eq_ignore_ascii_case("on") - | val.eq_ignore_ascii_case("yes") - | val.eq_ignore_ascii_case("y") -} - #[cfg(test)] mod tests { use super::*; From adca63abd47972999e7f69add2b9001dcbc307d0 Mon Sep 17 00:00:00 2001 From: Alexandre Crayssac Date: Wed, 10 May 2023 19:48:14 +0200 Subject: [PATCH 0888/1411] Implement RecordBatchReader for arrow_csv::reader::BufReader (#4195) --- arrow-csv/src/reader/mod.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 74294f42e8b2..0ab1664f5d00 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -490,6 +490,12 @@ impl Iterator for BufReader { } } +impl RecordBatchReader for BufReader { + fn schema(&self) -> SchemaRef { + self.decoder.schema.clone() + } +} + /// A push-based interface for decoding CSV data from an arbitrary byte stream /// /// See [`Reader`] for a higher-level interface for interface with [`Read`] From 378a9fcc9ee31fff4a9a13f5de5a326dc449541e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 11 May 2023 12:24:35 +0100 Subject: [PATCH 0889/1411] Update arrow rustdocs (#4071) (#4197) * Update docs (#4071) * Review feedback --- arrow-array/src/array/binary_array.rs | 9 +- 
arrow-array/src/array/boolean_array.rs | 2 +- arrow-array/src/array/byte_array.rs | 2 +- arrow-array/src/array/dictionary_array.rs | 39 ++--- .../src/array/fixed_size_binary_array.rs | 2 +- .../src/array/fixed_size_list_array.rs | 6 +- arrow-array/src/array/list_array.rs | 24 +-- arrow-array/src/array/map_array.rs | 3 +- arrow-array/src/array/mod.rs | 5 +- arrow-array/src/array/null_array.rs | 2 +- arrow-array/src/array/primitive_array.rs | 101 ++++++++---- arrow-array/src/array/run_array.rs | 17 +- arrow-array/src/array/string_array.rs | 11 +- arrow-array/src/array/struct_array.rs | 4 +- arrow-array/src/array/union_array.rs | 2 +- .../src/builder/boolean_buffer_builder.rs | 2 +- arrow-array/src/builder/boolean_builder.rs | 2 +- .../src/builder/fixed_size_binary_builder.rs | 8 +- .../src/builder/fixed_size_list_builder.rs | 2 +- .../src/builder/generic_byte_run_builder.rs | 13 +- .../src/builder/generic_bytes_builder.rs | 2 +- .../generic_bytes_dictionary_builder.rs | 19 +-- .../src/builder/generic_list_builder.rs | 2 +- arrow-array/src/builder/map_builder.rs | 5 +- arrow-array/src/builder/mod.rs | 28 ++-- arrow-array/src/builder/primitive_builder.rs | 2 +- .../builder/primitive_dictionary_builder.rs | 4 +- .../src/builder/primitive_run_builder.rs | 2 +- arrow-array/src/builder/struct_builder.rs | 2 +- arrow-array/src/builder/union_builder.rs | 2 +- arrow-array/src/lib.rs | 148 ++++++++++-------- arrow-buffer/src/alloc/mod.rs | 3 +- arrow-buffer/src/buffer/mod.rs | 3 +- arrow-buffer/src/buffer/mutable.rs | 2 +- arrow-buffer/src/buffer/null.rs | 7 + arrow-buffer/src/buffer/scalar.rs | 20 ++- arrow-buffer/src/util/bit_iterator.rs | 2 + arrow/src/lib.rs | 17 +- 38 files changed, 283 insertions(+), 243 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 3b13a513f646..a4d64040ceff 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -23,8 +23,7 @@ use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; -/// See [`BinaryArray`] and [`LargeBinaryArray`] for storing -/// binary data. +/// See [`BinaryArray`] and [`LargeBinaryArray`] for storing binary data pub type GenericBinaryArray = GenericByteArray>; impl GenericBinaryArray { @@ -218,7 +217,8 @@ where } } -/// An array where each element contains 0 or more bytes. +/// An array of `[u8]` using `i32` offsets +/// /// The byte length of each element is represented by an i32. /// /// # Examples @@ -258,8 +258,7 @@ where /// pub type BinaryArray = GenericBinaryArray; -/// An array where each element contains 0 or more bytes. -/// The byte length of each element is represented by an i64. 
+/// An array of `[u8]` using `i64` offsets /// /// # Examples /// diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index d03f0fd040f2..9ecdb2c5d24d 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -25,7 +25,7 @@ use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; -/// Array of bools +/// An array of [boolean values](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) /// /// # Example /// diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 12f9aab674e8..629ffd22cdc2 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -28,7 +28,7 @@ use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; -/// Generic struct for variable-size byte arrays +/// An array of [variable length byte arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) /// /// See [`StringArray`] and [`LargeStringArray`] for storing utf8 encoded string data /// diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 75fd4c6d0d68..a319a836a955 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -30,8 +30,7 @@ use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; -/// -/// A dictionary array where each element is a single value indexed by an integer key. +/// A dictionary array indexed by `i8` /// /// # Example: Using `collect` /// ``` @@ -44,8 +43,8 @@ use std::sync::Arc; /// assert_eq!(array.values(), &values); /// ``` pub type Int8DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. + +/// A dictionary array indexed by `i16` /// /// # Example: Using `collect` /// ``` @@ -58,8 +57,8 @@ pub type Int8DictionaryArray = DictionaryArray; /// assert_eq!(array.values(), &values); /// ``` pub type Int16DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. + +/// A dictionary array indexed by `i32` /// /// # Example: Using `collect` /// ``` @@ -72,8 +71,8 @@ pub type Int16DictionaryArray = DictionaryArray; /// assert_eq!(array.values(), &values); /// ``` pub type Int32DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. + +/// A dictionary array indexed by `i64` /// /// # Example: Using `collect` /// ``` @@ -86,8 +85,8 @@ pub type Int32DictionaryArray = DictionaryArray; /// assert_eq!(array.values(), &values); /// ``` pub type Int64DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. + +/// A dictionary array indexed by `u8` /// /// # Example: Using `collect` /// ``` @@ -100,8 +99,8 @@ pub type Int64DictionaryArray = DictionaryArray; /// assert_eq!(array.values(), &values); /// ``` pub type UInt8DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. 
+ +/// A dictionary array indexed by `u16` /// /// # Example: Using `collect` /// ``` @@ -114,8 +113,8 @@ pub type UInt8DictionaryArray = DictionaryArray; /// assert_eq!(array.values(), &values); /// ``` pub type UInt16DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. + +/// A dictionary array indexed by `u32` /// /// # Example: Using `collect` /// ``` @@ -128,8 +127,8 @@ pub type UInt16DictionaryArray = DictionaryArray; /// assert_eq!(array.values(), &values); /// ``` pub type UInt32DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. + +/// A dictionary array indexed by `u64` /// /// # Example: Using `collect` /// ``` @@ -143,7 +142,8 @@ pub type UInt32DictionaryArray = DictionaryArray; /// ``` pub type UInt64DictionaryArray = DictionaryArray; -/// A dictionary array where each element is a single value indexed by an integer key. +/// An array of [dictionary encoded values](https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout) +/// /// This is mostly used to represent strings or a limited set of primitive types as integers, /// for example when doing NLP analysis or representing chromosomes by name. /// @@ -695,8 +695,9 @@ impl std::fmt::Debug for DictionaryArray { } } -/// A strongly-typed wrapper around a [`DictionaryArray`] that implements [`ArrayAccessor`] -/// allowing fast access to its elements +/// A [`DictionaryArray`] typed on its child values array +/// +/// Implements [`ArrayAccessor`] allowing fast access to its elements /// /// ``` /// use arrow_array::{DictionaryArray, StringArray, types::Int32Type}; diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 08ce76c066c3..083d71cd963f 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -25,7 +25,7 @@ use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; -/// An array where each element is a fixed-size sequence of bytes. +/// An array of [fixed size binary arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) /// /// # Examples /// diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 86adafa066f0..18fa9df928ff 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -24,8 +24,7 @@ use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; -/// A list array where each element is a fixed-size sequence of values with the same -/// type whose maximum length is represented by a i32. 
+/// An array of [fixed size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-list-layout) /// /// # Example /// @@ -59,9 +58,6 @@ use std::sync::Arc; /// assert_eq!( &[3, 4, 5], list1.as_any().downcast_ref::().unwrap().values()); /// assert_eq!( &[6, 7, 8], list2.as_any().downcast_ref::().unwrap().values()); /// ``` -/// -/// For non generic lists, you may wish to consider using -/// [crate::array::FixedSizeBinaryArray] #[derive(Clone)] pub struct FixedSizeListArray { data_type: DataType, // Must be DataType::FixedSizeList(value_length) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index f4e5b4b79c77..f4816a61ea82 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -28,7 +28,15 @@ use num::Integer; use std::any::Any; use std::sync::Arc; -/// trait declaring an offset size, relevant for i32 vs i64 array types. +/// A type that can be used within a variable-size array to encode offset information +/// +/// See [`ListArray`], [`LargeListArray`], [`BinaryArray`], [`LargeBinaryArray`], +/// [`StringArray`] and [`LargeStringArray`] +/// +/// [`BinaryArray`]: crate::array::BinaryArray +/// [`LargeBinaryArray`]: crate::array::LargeBinaryArray +/// [`StringArray`]: crate::array::StringArray +/// [`LargeStringArray`]: crate::array::LargeStringArray pub trait OffsetSizeTrait: ArrowNativeType + std::ops::AddAssign + Integer { /// True for 64 bit offset size and false for 32 bit offset size const IS_LARGE: bool; @@ -46,12 +54,9 @@ impl OffsetSizeTrait for i64 { const PREFIX: &'static str = "Large"; } -/// Generic struct for a variable-size list array. +/// An array of [variable length arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout) /// -/// Columnar format in Apache Arrow: -/// -/// -/// For non generic lists, you may wish to consider using [`ListArray`] or [`LargeListArray`]` +/// See [`ListArray`] and [`LargeListArray`]` pub struct GenericListArray { data_type: DataType, nulls: Option, @@ -447,8 +452,7 @@ impl std::fmt::Debug for GenericListArray std::fmt::Debug for GenericListArray; -/// A list array where each element is a variable-sized sequence of values with the same -/// type whose memory offsets between elements are represented by a i64. +/// An array of variable size lists, storing offsets as `i64`. +/// /// # Example /// /// ``` diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index c53e452a67dd..cf0978f05b4e 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -23,7 +23,8 @@ use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; use std::sync::Arc; -/// A nested array type where each record is a key-value map. +/// An array of key-value maps +/// /// Keys should always be non-null, but values can be null. /// /// [MapArray] is physically a [crate::array::ListArray] that has a diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index e6fd6828bac7..9312770644a3 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -67,8 +67,7 @@ pub use union_array::*; mod run_array; pub use run_array::*; -/// Trait for dealing with different types of array at runtime when the type of the -/// array is not known in advance. 
+/// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the array as [`Any`](std::any::Any) so that it can be /// downcasted to a specific implementation. @@ -237,7 +236,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { fn get_array_memory_size(&self) -> usize; } -/// A reference-counted reference to a generic `Array`. +/// A reference-counted reference to a generic `Array` pub type ArrayRef = Arc; /// Ergonomics: Allow use of an ArrayRef as an `&dyn Array` diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index c7f61d91da70..7fdd99a39675 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -24,7 +24,7 @@ use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; -/// An Array where all elements are nulls +/// An array of [null values](https://arrow.apache.org/docs/format/Columnar.html#null-layout) /// /// A `NullArray` is a simplified array where all values are null. /// diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 8c8562b5be38..35202a4c7fd7 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -34,6 +34,7 @@ use half::f16; use std::any::Any; use std::sync::Arc; +/// An array of `i8` /// /// # Example: Using `collect` /// ``` @@ -41,6 +42,8 @@ use std::sync::Arc; /// let arr : Int8Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type Int8Array = PrimitiveArray; + +/// An array of `i16` /// /// # Example: Using `collect` /// ``` @@ -48,6 +51,8 @@ pub type Int8Array = PrimitiveArray; /// let arr : Int16Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type Int16Array = PrimitiveArray; + +/// An array of `i32` /// /// # Example: Using `collect` /// ``` @@ -55,6 +60,8 @@ pub type Int16Array = PrimitiveArray; /// let arr : Int32Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type Int32Array = PrimitiveArray; + +/// An array of `i64` /// /// # Example: Using `collect` /// ``` @@ -62,13 +69,16 @@ pub type Int32Array = PrimitiveArray; /// let arr : Int64Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type Int64Array = PrimitiveArray; -/// + +/// An array of `u8` /// # Example: Using `collect` /// ``` /// # use arrow_array::UInt8Array; /// let arr : UInt8Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type UInt8Array = PrimitiveArray; + +/// An array of `u16` /// /// # Example: Using `collect` /// ``` @@ -76,6 +86,8 @@ pub type UInt8Array = PrimitiveArray; /// let arr : UInt16Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type UInt16Array = PrimitiveArray; + +/// An array of `u32` /// /// # Example: Using `collect` /// ``` @@ -83,6 +95,8 @@ pub type UInt16Array = PrimitiveArray; /// let arr : UInt32Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type UInt32Array = PrimitiveArray; + +/// An array of `u64` /// /// # Example: Using `collect` /// ``` @@ -90,6 +104,8 @@ pub type UInt32Array = PrimitiveArray; /// let arr : UInt64Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type UInt64Array = PrimitiveArray; + +/// An array of `f16` /// /// # Example: Using `collect` /// ``` @@ -98,6 +114,8 @@ pub type UInt64Array = PrimitiveArray; /// let arr : Float16Array = [Some(f16::from_f64(1.0)), Some(f16::from_f64(2.0))].into_iter().collect(); /// ``` pub type Float16Array = PrimitiveArray; + +/// An array 
of `f32` /// /// # Example: Using `collect` /// ``` @@ -105,6 +123,8 @@ pub type Float16Array = PrimitiveArray; /// let arr : Float32Array = [Some(1.0), Some(2.0)].into_iter().collect(); /// ``` pub type Float32Array = PrimitiveArray; + +/// An array of `f64` /// /// # Example: Using `collect` /// ``` @@ -113,8 +133,11 @@ pub type Float32Array = PrimitiveArray; /// ``` pub type Float64Array = PrimitiveArray; +/// An array of seconds since UNIX epoch stored as `i64` +/// +/// This type is similar to the [`chrono::DateTime`] type and can hold +/// values such as `1970-05-09 14:25:11 +01:00` /// -/// A primitive array where each element is of type [TimestampSecondType]. /// See also [`Timestamp`](arrow_schema::DataType::Timestamp). /// /// # Example: UTC timestamps post epoch @@ -157,82 +180,90 @@ pub type Float64Array = PrimitiveArray; /// ``` /// pub type TimestampSecondArray = PrimitiveArray; -/// A primitive array where each element is of type `TimestampMillisecondType.` -/// See examples for [`TimestampSecondArray.`](crate::array::TimestampSecondArray) + +/// An array of milliseconds since UNIX epoch stored as `i64` +/// +/// See examples for [`TimestampSecondArray`] pub type TimestampMillisecondArray = PrimitiveArray; -/// A primitive array where each element is of type `TimestampMicrosecondType.` -/// See examples for [`TimestampSecondArray.`](crate::array::TimestampSecondArray) + +/// An array of microseconds since UNIX epoch stored as `i64` +/// +/// See examples for [`TimestampSecondArray`] pub type TimestampMicrosecondArray = PrimitiveArray; -/// A primitive array where each element is of type `TimestampNanosecondType.` -/// See examples for [`TimestampSecondArray.`](crate::array::TimestampSecondArray) + +/// An array of nanoseconds since UNIX epoch stored as `i64` +/// +/// See examples for [`TimestampSecondArray`] pub type TimestampNanosecondArray = PrimitiveArray; // TODO: give examples for the below types -/// A primitive array where each element is of 32-bit value -/// representing the elapsed time since UNIX epoch in days." +/// An array of days since UNIX epoch stored as `i32` /// /// This type is similar to the [`chrono::NaiveDate`] type and can hold /// values such as `2018-11-13` pub type Date32Array = PrimitiveArray; -/// A primitive array where each element is a 64-bit value -/// representing the elapsed time since the UNIX epoch in milliseconds. + +/// An array of milliseconds since UNIX epoch stored as `i64` /// -/// This type is similar to the [`chrono::NaiveDateTime`] type and can hold -/// values such as `2018-11-13T17:11:10.011` +/// This type is similar to the [`chrono::NaiveDate`] type and can hold +/// values such as `2018-11-13` pub type Date64Array = PrimitiveArray; -/// An array where each element is of 32-bit type representing time elapsed in seconds -/// since midnight. +/// An array of seconds since midnight stored as `i32` /// /// This type is similar to the [`chrono::NaiveTime`] type and can /// hold values such as `00:02:00` pub type Time32SecondArray = PrimitiveArray; -/// An array where each element is of 32-bit type representing time elapsed in milliseconds -/// since midnight. + +/// An array of milliseconds since midnight stored as `i32` /// /// This type is similar to the [`chrono::NaiveTime`] type and can /// hold values such as `00:02:00.123` pub type Time32MillisecondArray = PrimitiveArray; -/// An array where each element is of 64-bit type representing time elapsed in microseconds -/// since midnight. 
+ +/// An array of microseconds since midnight stored as `i64` /// /// This type is similar to the [`chrono::NaiveTime`] type and can /// hold values such as `00:02:00.123456` pub type Time64MicrosecondArray = PrimitiveArray; -/// An array where each element is of 64-bit type representing time elapsed in nanoseconds -/// since midnight. + +/// An array of nanoseconds since midnight stored as `i64` /// /// This type is similar to the [`chrono::NaiveTime`] type and can /// hold values such as `00:02:00.123456789` pub type Time64NanosecondArray = PrimitiveArray; -/// An array where each element is a “calendar” interval in months. +/// An array of “calendar” intervals in months pub type IntervalYearMonthArray = PrimitiveArray; -/// An array where each element is a “calendar” interval days and milliseconds. + +/// An array of “calendar” intervals in days and milliseconds pub type IntervalDayTimeArray = PrimitiveArray; -/// An array where each element is a “calendar” interval in months, days, and nanoseconds. + +/// An array of “calendar” intervals in months, days, and nanoseconds pub type IntervalMonthDayNanoArray = PrimitiveArray; -/// An array where each element is an elapsed time type in seconds. +/// An array of elapsed durations in seconds pub type DurationSecondArray = PrimitiveArray; -/// An array where each element is an elapsed time type in milliseconds. + +/// An array of elapsed durations in milliseconds pub type DurationMillisecondArray = PrimitiveArray; -/// An array where each element is an elapsed time type in microseconds. + +/// An array of elapsed durations in microseconds pub type DurationMicrosecondArray = PrimitiveArray; -/// An array where each element is an elapsed time type in nanoseconds. + +/// An array of elapsed durations in nanoseconds pub type DurationNanosecondArray = PrimitiveArray; -/// An array where each element is a 128-bits decimal with precision in [1, 38] and -/// scale less or equal to 38. +/// An array of 128-bit fixed point decimals pub type Decimal128Array = PrimitiveArray; -/// An array where each element is a 256-bits decimal with precision in [1, 76] and -/// scale less or equal to 76. + +/// An array of 256-bit fixed point decimals pub type Decimal256Array = PrimitiveArray; pub use crate::types::ArrowPrimitiveType; -/// Array whose elements are of primitive types. +/// An array of [primitive values](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) /// /// # Example: From an iterator of values /// @@ -890,6 +921,8 @@ impl<'a, T: ArrowPrimitiveType> PrimitiveArray { } } +/// An optional primitive value +/// /// This struct is used as an adapter when creating `PrimitiveArray` from an iterator. /// `FromIterator` for `PrimitiveArray` takes an iterator where the elements can be `into` /// this struct. So once implementing `From` or `Into` trait for a type, an iterator of diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index e7e71d3840bb..820d5c9ebfc1 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -30,10 +30,10 @@ use crate::{ Array, ArrayAccessor, ArrayRef, PrimitiveArray, }; +/// An array of [run-end encoded values](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout) /// -/// A run-end encoding (REE) is a variation of [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding). -/// -/// This encoding is good for representing data containing same values repeated consecutively. 
+/// This encoding is variation on [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding) +/// and is good for representing data containing same values repeated consecutively. /// /// [`RunArray`] contains `run_ends` array and `values` array of same length. /// The `run_ends` array stores the indexes at which the run ends. The `values` array @@ -428,7 +428,7 @@ impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunArray { } /// -/// A [`RunArray`] array where run ends are stored using `i16` data type. +/// A [`RunArray`] with `i16` run ends /// /// # Example: Using `collect` /// ``` @@ -443,7 +443,7 @@ impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunArray { pub type Int16RunArray = RunArray; /// -/// A [`RunArray`] array where run ends are stored using `i32` data type. +/// A [`RunArray`] with `i32` run ends /// /// # Example: Using `collect` /// ``` @@ -458,7 +458,7 @@ pub type Int16RunArray = RunArray; pub type Int32RunArray = RunArray; /// -/// A [`RunArray`] array where run ends are stored using `i64` data type. +/// A [`RunArray`] with `i64` run ends /// /// # Example: Using `collect` /// ``` @@ -472,8 +472,9 @@ pub type Int32RunArray = RunArray; /// ``` pub type Int64RunArray = RunArray; -/// A strongly-typed wrapper around a [`RunArray`] that implements [`ArrayAccessor`] -/// and [`IntoIterator`] allowing fast access to its elements +/// A [`RunArray`] typed typed on its child values array +/// +/// Implements [`ArrayAccessor`] and [`IntoIterator`] allowing fast access to its elements /// /// ``` /// use arrow_array::{RunArray, StringArray, types::Int32Type}; diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 7c4a375299db..d8f1c5da16c7 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -21,10 +21,7 @@ use arrow_buffer::{bit_util, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; -/// Generic struct for \[Large\]StringArray -/// -/// See [`StringArray`] and [`LargeStringArray`] for storing -/// specific string data. +/// See [`StringArray`] and [`LargeStringArray`] for storing string data pub type GenericStringArray = GenericByteArray>; impl GenericStringArray { @@ -211,8 +208,7 @@ impl From> for GenericStringArray From> for GenericStringArray; -/// An array where each element is a variable-sized sequence of bytes representing a string -/// whose maximum length (in bytes) is represented by a i64. +/// An array of `str` using `i64` offsets /// /// Example /// diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index fac947f14bfd..1a79ebd95f37 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -22,9 +22,9 @@ use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, SchemaBuilder} use std::sync::Arc; use std::{any::Any, ops::Index}; -/// A nested array type where each child (called *field*) is represented by a separate -/// array. +/// An array of [structs](https://arrow.apache.org/docs/format/Columnar.html#struct-layout) /// +/// Each child (called *field*) is represented by a separate array. 
/// /// # Comparison with [RecordBatch] /// diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 172ae082197c..74a5f1efa767 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -25,7 +25,7 @@ use arrow_schema::{ArrowError, DataType, Field, UnionFields, UnionMode}; use std::any::Any; use std::sync::Arc; -/// An Array that can represent slots of varying types. +/// An array of [values of varying types](https://arrow.apache.org/docs/format/Columnar.html#union-layout) /// /// Each slot in a [UnionArray] can have a value chosen from a number /// of types. Each of the possible types are named like the fields of diff --git a/arrow-array/src/builder/boolean_buffer_builder.rs b/arrow-array/src/builder/boolean_buffer_builder.rs index f721504d08aa..1a3473e19a04 100644 --- a/arrow-array/src/builder/boolean_buffer_builder.rs +++ b/arrow-array/src/builder/boolean_buffer_builder.rs @@ -19,7 +19,7 @@ use arrow_buffer::{bit_util, BooleanBuffer, Buffer, MutableBuffer}; use arrow_data::bit_mask; use std::ops::Range; -/// A builder for creating a boolean [`Buffer`] +/// Builder for [`BooleanBuffer`] #[derive(Debug)] pub struct BooleanBufferBuilder { buffer: MutableBuffer, diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index c7974967a700..a35e6f6b97e5 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -24,7 +24,7 @@ use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; -/// Array builder for fixed-width primitive types +/// Builder for [`BooleanArray`] /// /// # Example /// diff --git a/arrow-array/src/builder/fixed_size_binary_builder.rs b/arrow-array/src/builder/fixed_size_binary_builder.rs index 695b553f0eee..a354a1db24e1 100644 --- a/arrow-array/src/builder/fixed_size_binary_builder.rs +++ b/arrow-array/src/builder/fixed_size_binary_builder.rs @@ -24,11 +24,11 @@ use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; -/// A fixed size binary array builder +/// Builder for [`FixedSizeBinaryArray`] /// ``` -/// use arrow_array::builder::FixedSizeBinaryBuilder; -/// use arrow_array::Array; -/// +/// # use arrow_array::builder::FixedSizeBinaryBuilder; +/// # use arrow_array::Array; +/// # /// let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5); /// // [b"hello", null, b"arrow"] /// builder.append_value(b"hello").unwrap(); diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index 57af768447c8..ab9fbf5fa63f 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -24,7 +24,7 @@ use arrow_schema::{DataType, Field}; use std::any::Any; use std::sync::Arc; -/// Array builder for [`FixedSizeListArray`] +/// Builder for [`FixedSizeListArray`] /// ``` /// use arrow_array::{builder::{Int32Builder, FixedSizeListBuilder}, Array, Int32Array}; /// let values_builder = Int32Builder::new(); diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index 9c26d7be6904..97082fe96673 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -30,7 +30,7 @@ use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; use arrow_buffer::ArrowNativeType; -/// Array builder for [`RunArray`] for String and Binary types. 
+/// Builder for [`RunArray`] of [`GenericByteArray`](crate::array::GenericByteArray) /// /// # Example: /// @@ -309,7 +309,7 @@ where } } -/// Array builder for [`RunArray`] that encodes strings ([`Utf8Type`]). +/// Builder for [`RunArray`] of [`StringArray`](crate::array::StringArray) /// /// ``` /// // Create a run-end encoded array with run-end indexes data type as `i16`. @@ -319,7 +319,7 @@ where /// # use arrow_array::{Int16Array, StringArray}; /// # use arrow_array::types::Int16Type; /// # use arrow_array::cast::AsArray; -/// +/// # /// let mut builder = StringRunBuilder::::new(); /// /// // The builder builds the dictionary value by value @@ -342,10 +342,10 @@ where /// ``` pub type StringRunBuilder = GenericByteRunBuilder; -/// Array builder for [`RunArray`] that encodes large strings ([`LargeUtf8Type`]). See [`StringRunBuilder`] for an example. +/// Builder for [`RunArray`] of [`LargeStringArray`](crate::array::LargeStringArray) pub type LargeStringRunBuilder = GenericByteRunBuilder; -/// Array builder for [`RunArray`] that encodes binary values([`BinaryType`]). +/// Builder for [`RunArray`] of [`BinaryArray`](crate::array::BinaryArray) /// /// ``` /// // Create a run-end encoded array with run-end indexes data type as `i16`. @@ -378,8 +378,7 @@ pub type LargeStringRunBuilder = GenericByteRunBuilder; /// ``` pub type BinaryRunBuilder = GenericByteRunBuilder; -/// Array builder for [`RunArray`] that encodes large binary values([`LargeBinaryType`]). -/// See documentation of [`BinaryRunBuilder`] for an example. +/// Builder for [`RunArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray) pub type LargeBinaryRunBuilder = GenericByteRunBuilder; #[cfg(test)] diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index a3598d8bf26d..1887ab36c6d9 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -25,7 +25,7 @@ use std::any::Any; use std::fmt::Write; use std::sync::Arc; -/// Array builder for [`GenericByteArray`] +/// Builder for [`GenericByteArray`] pub struct GenericByteBuilder { value_builder: UInt8BufferBuilder, offsets_builder: BufferBuilder, diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index dd9a70b1d431..d5c62865ff8d 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -27,7 +27,8 @@ use hashbrown::HashMap; use std::any::Any; use std::sync::Arc; -/// Generic array builder for `DictionaryArray` that stores generic byte values. +/// Builder for [`DictionaryArray`] of [`GenericByteArray`] +/// /// For example to map a set of byte indices to String values. Note that /// the use of a `HashMap` here will not scale to very large arrays or /// result in an ordered dictionary. @@ -338,9 +339,7 @@ fn get_bytes(values: &GenericByteBuilder, idx: usize) -> &[ &values[start_offset..end_offset] } -/// Array builder for `DictionaryArray` that stores Strings. For example to map a set of byte indices -/// to String values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. +/// Builder for [`DictionaryArray`] of [`StringArray`](crate::array::StringArray) /// /// ``` /// // Create a dictionary array indexed by bytes whose values are Strings. 
@@ -376,15 +375,11 @@ fn get_bytes(values: &GenericByteBuilder, idx: usize) -> &[ pub type StringDictionaryBuilder = GenericByteDictionaryBuilder>; -/// Array builder for `DictionaryArray` that stores large Strings. For example to map a set of byte indices -/// to String values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. +/// Builder for [`DictionaryArray`] of [`LargeStringArray`](crate::array::LargeStringArray) pub type LargeStringDictionaryBuilder = GenericByteDictionaryBuilder>; -/// Array builder for `DictionaryArray` that stores binary. For example to map a set of byte indices -/// to binary values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. +/// Builder for [`DictionaryArray`] of [`BinaryArray`](crate::array::BinaryArray) /// /// ``` /// // Create a dictionary array indexed by bytes whose values are binary. @@ -420,9 +415,7 @@ pub type LargeStringDictionaryBuilder = pub type BinaryDictionaryBuilder = GenericByteDictionaryBuilder>; -/// Array builder for `DictionaryArray` that stores large binary. For example to map a set of byte indices -/// to binary values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. +/// Builder for [`DictionaryArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray) pub type LargeBinaryDictionaryBuilder = GenericByteDictionaryBuilder>; diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index b6d0707982be..054c87187fbe 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -24,7 +24,7 @@ use arrow_schema::Field; use std::any::Any; use std::sync::Arc; -/// Array builder for [`GenericListArray`]s. +/// Builder for [`GenericListArray`] /// /// Use [`ListBuilder`] to build [`ListArray`]s and [`LargeListBuilder`] to build [`LargeListArray`]s. /// diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index db85465c8d5c..b73e65b117f1 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -24,7 +24,8 @@ use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; use std::sync::Arc; -/// Creates a new `MapBuilder` +/// Builder for [`MapArray`] +/// /// ``` /// # use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder}; /// # use arrow_array::{Int32Array, StringArray}; @@ -62,7 +63,7 @@ pub struct MapBuilder { value_builder: V, } -/// Contains details of the mapping +/// The [`Field`] names for a [`MapArray`] #[derive(Debug, Clone)] pub struct MapFieldNames { /// [`Field`] name for map entries diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 081f4d5f41f6..c4f581fbfb46 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Defines builders that can be used to safely build arrays +//! Defines push-based APIs for constructing arrays //! //! # Basic Usage //! @@ -81,7 +81,9 @@ //! # Custom Builders //! //! It is common to have a collection of statically defined Rust types that -//! you want to convert to Arrow arrays. An example of doing so is below +//! you want to convert to Arrow arrays. +//! +//! An example of doing so is below //! //! ``` //! 
# use std::any::Any; @@ -261,26 +263,20 @@ pub trait ArrayBuilder: Any + Send { fn into_box_any(self: Box) -> Box; } -/// Builder for [`ListArray`]s (i32 offsets) -/// -/// See [`GenericListBuilder`] for usage examples -/// -/// [`ListArray`]: crate::array::ListArray +/// Builder for [`ListArray`](crate::array::ListArray) pub type ListBuilder = GenericListBuilder; -/// Builder for [`LargeListArray`]s (i64 offsets) -/// -/// See [`GenericListBuilder`] for usage examples -/// -/// [`LargeListArray`]: crate::array::LargeListArray +/// Builder for [`LargeListArray`](crate::array::LargeListArray) pub type LargeListBuilder = GenericListBuilder; -/// A binary array builder with i32 offsets +/// Builder for [`BinaryArray`](crate::array::BinaryArray) pub type BinaryBuilder = GenericBinaryBuilder; -/// A binary array builder with i64 offsets + +/// Builder for [`LargeBinaryArray`](crate::array::LargeBinaryArray) pub type LargeBinaryBuilder = GenericBinaryBuilder; -/// A string array builder with i32 offsets +/// Builder for [`StringArray`](crate::array::StringArray) pub type StringBuilder = GenericStringBuilder; -/// A string array builder with i64 offsets + +/// Builder for [`LargeStringArray`](crate::array::LargeStringArray) pub type LargeStringBuilder = GenericStringBuilder; diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 8721004d27e4..440fb8a4bead 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -92,7 +92,7 @@ pub type Decimal128Builder = PrimitiveBuilder; /// A decimal 256 array builder pub type Decimal256Builder = PrimitiveBuilder; -/// Array builder for fixed-width primitive types +/// Builder for [`PrimitiveArray`] #[derive(Debug)] pub struct PrimitiveBuilder { values_builder: BufferBuilder, diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 41880d3a478c..cde1abe22b7b 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -45,9 +45,7 @@ impl PartialEq for Value { impl Eq for Value {} -/// Array builder for `DictionaryArray`. For example to map a set of byte indices -/// to f32 values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. +/// Builder for [`DictionaryArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray) /// /// # Example: /// diff --git a/arrow-array/src/builder/primitive_run_builder.rs b/arrow-array/src/builder/primitive_run_builder.rs index 30750b6f3421..53674a73b172 100644 --- a/arrow-array/src/builder/primitive_run_builder.rs +++ b/arrow-array/src/builder/primitive_run_builder.rs @@ -23,7 +23,7 @@ use super::{ArrayBuilder, PrimitiveBuilder}; use arrow_buffer::ArrowNativeType; -/// Array builder for [`RunArray`] that encodes primitive values. +/// Builder for [`RunArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray) /// /// # Example: /// diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 4702bb734266..41ede9c7a992 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -24,7 +24,7 @@ use arrow_schema::{DataType, Fields, IntervalUnit, TimeUnit}; use std::any::Any; use std::sync::Arc; -/// Array builder for Struct types. 
+/// Builder for [`StructArray`] /// /// Note that callers should make sure that methods of all the child field builders are /// properly called to maintain the consistency of the data structure. diff --git a/arrow-array/src/builder/union_builder.rs b/arrow-array/src/builder/union_builder.rs index 8ca303da8cb4..6461a56aabbe 100644 --- a/arrow-array/src/builder/union_builder.rs +++ b/arrow-array/src/builder/union_builder.rs @@ -99,7 +99,7 @@ impl FieldData { } } -/// Builder type for creating a new `UnionArray`. +/// Builder for [`UnionArray`] /// /// Example: **Dense Memory Layout** /// diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index ff1ddb1f67ce..6ee9f7f1d06f 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -19,42 +19,6 @@ //! all having the same type. This crate provides concrete implementations of each type, as //! well as an [`Array`] trait that can be used for type-erasure. //! -//! # Downcasting an Array -//! -//! Arrays are often passed around as a dynamically typed [`&dyn Array`] or [`ArrayRef`]. -//! For example, [`RecordBatch`](`crate::RecordBatch`) stores columns as [`ArrayRef`]. -//! -//! Whilst these arrays can be passed directly to the [`compute`], [`csv`], [`json`], etc... APIs, -//! it is often the case that you wish to interact with the data directly. -//! -//! This requires downcasting to the concrete type of the array: -//! -//! ``` -//! # use arrow_array::{Array, Float32Array, Int32Array}; -//! -//! fn sum_int32(array: &dyn Array) -> i32 { -//! let integers: &Int32Array = array.as_any().downcast_ref().unwrap(); -//! integers.iter().map(|val| val.unwrap_or_default()).sum() -//! } -//! -//! // Note: the values for positions corresponding to nulls will be arbitrary -//! fn as_f32_slice(array: &dyn Array) -> &[f32] { -//! array.as_any().downcast_ref::().unwrap().values() -//! } -//! ``` -//! -//! The [`cast::AsArray`] extension trait can make this more ergonomic -//! -//! ``` -//! # use arrow_array::Array; -//! # use arrow_array::cast::{AsArray, as_primitive_array}; -//! # use arrow_array::types::Float32Type; -//! -//! fn as_f32_slice(array: &dyn Array) -> &[f32] { -//! array.as_primitive::().values() -//! } -//! ``` - //! # Building an Array //! //! Most [`Array`] implementations can be constructed directly from iterators or [`Vec`] @@ -62,7 +26,7 @@ //! ``` //! # use arrow_array::{Int32Array, ListArray, StringArray}; //! # use arrow_array::types::Int32Type; -//! +//! # //! Int32Array::from(vec![1, 2]); //! Int32Array::from(vec![Some(1), None]); //! Int32Array::from_iter([1, 2, 3, 4]); @@ -91,30 +55,59 @@ //! //! // Append a single primitive value //! builder.append_value(1); -//! //! // Append a null value //! builder.append_null(); -//! //! // Append a slice of primitive values //! builder.append_slice(&[2, 3, 4]); //! //! // Build the array //! let array = builder.finish(); //! -//! assert_eq!( -//! 5, -//! array.len(), -//! "The array has 5 values, counting the null value" -//! ); +//! assert_eq!(5, array.len()); +//! assert_eq!(2, array.value(2)); +//! assert_eq!(&array.values()[3..5], &[3, 4]) +//! ``` //! -//! assert_eq!(2, array.value(2), "Get the value with index 2"); +//! # Low-level API +//! +//! Internally, arrays consist of one or more shared memory regions backed by a [`Buffer`], +//! the number and meaning of which depend on the array’s data type, as documented in +//! the [Arrow specification]. +//! +//! For example, the type [`Int16Array`] represents an array of 16-bit integers and consists of: +//! +//! 
* An optional [`NullBuffer`] identifying any null values +//! * A contiguous [`ScalarBuffer`] of values +//! +//! Similarly, the type [`StringArray`] represents an array of UTF-8 strings and consists of: +//! +//! * An optional [`NullBuffer`] identifying any null values +//! * An offsets [`OffsetBuffer`] identifying valid UTF-8 sequences within the values buffer +//! * A values [`Buffer`] of UTF-8 encoded string data +//! +//! Array constructors such as [`PrimitiveArray::try_new`] provide the ability to cheaply +//! construct an array from these parts, with functions such as [`PrimitiveArray::into_parts`] +//! providing the reverse operation. //! -//! assert_eq!( -//! &array.values()[3..5], -//! &[3, 4], -//! "Get slice of len 2 starting at idx 3" -//! ) //! ``` +//! # use arrow_array::{Array, Int32Array, StringArray}; +//! # use arrow_buffer::OffsetBuffer; +//! # +//! // Create a Int32Array from Vec without copying +//! let array = Int32Array::new(vec![1, 2, 3].into(), None); +//! assert_eq!(array.values(), &[1, 2, 3]); +//! assert_eq!(array.null_count(), 0); +//! +//! // Create a StringArray from parts +//! let offsets = OffsetBuffer::new(vec![0, 5, 10].into()); +//! let array = StringArray::new(offsets, b"helloworld".into(), None); +//! let values: Vec<_> = array.iter().map(|x| x.unwrap()).collect(); +//! assert_eq!(values, &["hello", "world"]); +//! ``` +//! +//! As [`Buffer`], and its derivatives, can be created from [`Vec`] without copying, this provides +//! an efficient way to not only interoperate with other Rust code, but also implement kernels +//! optimised for the arrow data layout - e.g. by handling buffers instead of values. //! //! # Zero-Copy Slicing //! @@ -122,32 +115,57 @@ //! data. Internally this just increments some ref-counts, and so is incredibly cheap //! //! ```rust -//! # use std::sync::Arc; -//! # use arrow_array::{ArrayRef, Int32Array}; -//! let array = Arc::new(Int32Array::from_iter([1, 2, 3])) as ArrayRef; +//! # use arrow_array::Int32Array; +//! let array = Int32Array::from_iter([1, 2, 3]); //! //! // Slice with offset 1 and length 2 //! let sliced = array.slice(1, 2); -//! let ints = sliced.as_any().downcast_ref::().unwrap(); -//! assert_eq!(ints.values(), &[2, 3]); +//! assert_eq!(sliced.values(), &[2, 3]); //! ``` //! -//! # Internal Representation +//! # Downcasting an Array //! -//! Internally, arrays are represented by one or several [`Buffer`], the number and meaning of -//! which depend on the array’s data type, as documented in the [Arrow specification]. +//! Arrays are often passed around as a dynamically typed [`&dyn Array`] or [`ArrayRef`]. +//! For example, [`RecordBatch`](`crate::RecordBatch`) stores columns as [`ArrayRef`]. //! -//! For example, the type [`Int16Array`] represents an array of 16-bit integers and consists of: +//! Whilst these arrays can be passed directly to the [`compute`], [`csv`], [`json`], etc... APIs, +//! it is often the case that you wish to interact with the concrete arrays directly. //! -//! * An optional [`NullBuffer`] identifying any null values -//! * A contiguous [`Buffer`] of 16-bit integers +//! This requires downcasting to the concrete type of the array: //! -//! Similarly, the type [`StringArray`] represents an array of UTF-8 strings and consists of: +//! ``` +//! # use arrow_array::{Array, Float32Array, Int32Array}; //! -//! * An optional [`NullBuffer`] identifying any null values -//! * An offsets [`Buffer`] of 32-bit integers identifying valid UTF-8 sequences within the values buffer -//! 
* A values [`Buffer`] of UTF-8 encoded string data +//! // Safely downcast an `Array` to an `Int32Array` and compute the sum +//! // using native i32 values +//! fn sum_int32(array: &dyn Array) -> i32 { +//! let integers: &Int32Array = array.as_any().downcast_ref().unwrap(); +//! integers.iter().map(|val| val.unwrap_or_default()).sum() +//! } //! +//! // Safely downcasts the array to a `Float32Array` and returns a &[f32] view of the data +//! // Note: the values for positions corresponding to nulls will be arbitrary (but still valid f32) +//! fn as_f32_slice(array: &dyn Array) -> &[f32] { +//! array.as_any().downcast_ref::().unwrap().values() +//! } +//! ``` +//! +//! The [`cast::AsArray`] extension trait can make this more ergonomic +//! +//! ``` +//! # use arrow_array::Array; +//! # use arrow_array::cast::{AsArray, as_primitive_array}; +//! # use arrow_array::types::Float32Type; +//! +//! fn as_f32_slice(array: &dyn Array) -> &[f32] { +//! array.as_primitive::().values() +//! } +//! ``` +//! +//! [`ScalarBuffer`]: arrow_buffer::ScalarBuffer +//! [`ScalarBuffer`]: arrow_buffer::ScalarBuffer +//! [`OffsetBuffer`]: arrow_buffer::OffsetBuffer +//! [`NullBuffer`]: arrow_buffer::NullBuffer //! [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html //! [`&dyn Array`]: Array //! [`NullBuffer`]: arrow_buffer::NullBuffer diff --git a/arrow-buffer/src/alloc/mod.rs b/arrow-buffer/src/alloc/mod.rs index d1236eeaa9a6..a3cb6253f324 100644 --- a/arrow-buffer/src/alloc/mod.rs +++ b/arrow-buffer/src/alloc/mod.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Defines memory-related functions, such as allocate/deallocate/reallocate memory -//! regions, cache and allocation alignments. +//! Defines the low-level [`Allocation`] API for shared memory regions use std::alloc::Layout; use std::fmt::{Debug, Formatter}; diff --git a/arrow-buffer/src/buffer/mod.rs b/arrow-buffer/src/buffer/mod.rs index ed53d3361daa..d33e68795e4e 100644 --- a/arrow-buffer/src/buffer/mod.rs +++ b/arrow-buffer/src/buffer/mod.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! This module contains two main structs: [Buffer] and [MutableBuffer]. A buffer represents -//! a contiguous memory region that can be shared via `offsets`. +//! Types of shared memory region mod offset; pub use offset::*; diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 9a905a3223b6..43c1cd004c92 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -36,7 +36,7 @@ use super::Buffer; /// Use [MutableBuffer::push] to insert an item, [MutableBuffer::extend_from_slice] /// to insert many items, and `into` to convert it to [`Buffer`]. 
/// -/// For a safe, strongly typed API consider using `Vec` +/// For a safe, strongly typed API consider using [`Vec`] and [`ScalarBuffer`](crate::ScalarBuffer) /// /// Note: this may be deprecated in a future release ([#1176](https://github.com/apache/arrow-rs/issues/1176)) /// diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index cdb0c2aeb824..60987be6e415 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -19,6 +19,13 @@ use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator}; use crate::buffer::BooleanBuffer; use crate::{Buffer, MutableBuffer}; +/// A [`BooleanBuffer`] used to encode validity for arrow arrays +/// +/// As per the [Arrow specification], array validity is encoded in a packed bitmask with a +/// `true` value indicating the corresponding slot is not null, and `false` indicating +/// that it is null. +/// +/// [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps #[derive(Debug, Clone, Eq, PartialEq)] pub struct NullBuffer { buffer: BooleanBuffer, diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 1a4680111bd1..40b24e4ebf0f 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -22,12 +22,24 @@ use std::fmt::Formatter; use std::marker::PhantomData; use std::ops::Deref; -/// Provides a safe API for interpreting a [`Buffer`] as a slice of [`ArrowNativeType`] +/// A strongly-typed [`Buffer`] supporting zero-copy cloning and slicing /// -/// # Safety +/// The easiest way to think about `ScalarBuffer` is being equivalent to a `Arc>`, +/// with the following differences: /// -/// All [`ArrowNativeType`] are valid for all possible backing byte representations, and as -/// a result they are "trivially safely transmutable". +/// - slicing and cloning is O(1). +/// - it supports external allocated memory +/// +/// ``` +/// # use arrow_buffer::ScalarBuffer; +/// // Zero-copy conversion from Vec +/// let buffer = ScalarBuffer::from(vec![1, 2, 3]); +/// assert_eq!(&buffer, &[1, 2, 3]); +/// +/// // Zero-copy slicing +/// let sliced = buffer.slice(1, 2); +/// assert_eq!(&sliced, &[2, 3]); +/// ``` #[derive(Clone)] pub struct ScalarBuffer { /// Underlying data buffer diff --git a/arrow-buffer/src/util/bit_iterator.rs b/arrow-buffer/src/util/bit_iterator.rs index 1a8dd9226318..4e24ccdabec0 100644 --- a/arrow-buffer/src/util/bit_iterator.rs +++ b/arrow-buffer/src/util/bit_iterator.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Types for iterating over packed bitmasks + use crate::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator}; use crate::bit_util::{ceil, get_bit_raw}; diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 27c905ba0cd6..af5972acc97e 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -92,7 +92,7 @@ //! assert_eq!(sum(&TimestampNanosecondArray::from(vec![1, 2, 3])), 6); //! ``` //! -//! And the following is generic over all arrays with comparable values +//! And the following is generic over all arrays with comparable values: //! //! ```rust //! # use arrow::array::{ArrayAccessor, ArrayIter, Int32Array, StringArray}; @@ -109,7 +109,7 @@ //! assert_eq!(min(&StringArray::from(vec!["b", "a", "c"])), Some("a")); //! ``` //! -//! For more examples, consult the [arrow_array] docs. +//! For more examples, and details consult the [arrow_array] docs. //! //! # Type Erasure / Trait Objects //! 
@@ -317,19 +317,6 @@ //! assert_eq!(string.value(1), "foo"); //! ``` //! -//! # Memory and Buffers -//! -//! Advanced users may wish to interact with the underlying buffers of an [`Array`], for example, -//! for FFI or high-performance conversion from other formats. This interface is provided by -//! [`ArrayData`] which stores the [`Buffer`] comprising an [`Array`], and can be accessed -//! with [`Array::to_data`](array::Array::to_data) -//! -//! The APIs for constructing [`ArrayData`] come in safe, and unsafe variants, with the former -//! performing extensive, but potentially expensive validation to ensure the buffers are well-formed. -//! -//! An [`ArrayRef`] can be cheaply created from an [`ArrayData`] using [`make_array`], -//! or by using the appropriate [`From`] conversion on the concrete [`Array`] implementation. -//! //! # Safety and Security //! //! Like many crates, this crate makes use of unsafe where prudent. However, it endeavours to be From 1f714fa77dc037e561dcfb701e2e906e01640142 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 11 May 2023 13:19:45 +0100 Subject: [PATCH 0890/1411] Allow setting ClientOptions with Options API (#4202) * Allow setting ClientOptions with options API * More clippy --- object_store/src/aws/mod.rs | 25 +++++++++++++++---------- object_store/src/azure/mod.rs | 25 +++++++++++++++---------- object_store/src/client/mod.rs | 23 +++++++++++++++++++++++ object_store/src/gcp/mod.rs | 18 ++++++++++++++---- 4 files changed, 67 insertions(+), 24 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index fe49471c4907..17d779ff6a51 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -604,6 +604,9 @@ pub enum AmazonS3ConfigKey { /// - `aws_profile` /// - `profile` Profile, + + /// Client options + Client(ClientConfigKey), } impl AsRef for AmazonS3ConfigKey { @@ -622,6 +625,7 @@ impl AsRef for AmazonS3ConfigKey { Self::Profile => "aws_profile", Self::UnsignedPayload => "aws_unsigned_payload", Self::Checksum => "aws_checksum_algorithm", + Self::Client(opt) => opt.as_ref(), } } } @@ -652,7 +656,12 @@ impl FromStr for AmazonS3ConfigKey { "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), "aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), - _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + // Backwards compatibility + "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), + _ => match s.parse() { + Ok(key) => Ok(Self::Client(key)), + Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + }, } } } @@ -688,9 +697,7 @@ impl AmazonS3Builder { for (os_key, os_value) in std::env::vars_os() { if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { if key.starts_with("AWS_") { - if let Ok(config_key) = - AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()) - { + if let Ok(config_key) = key.to_ascii_lowercase().parse() { builder = builder.with_config(config_key, value); } } @@ -706,12 +713,6 @@ impl AmazonS3Builder { Some(format!("{METADATA_ENDPOINT}{metadata_relative_uri}")); } - if let Ok(text) = std::env::var("AWS_ALLOW_HTTP") { - builder.client_options = builder - .client_options - .with_config(ClientConfigKey::AllowHttp, text); - } - builder } @@ -770,6 +771,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::Checksum => { self.checksum_algorithm = 
Some(ConfigValue::Deferred(value.into())) } + AmazonS3ConfigKey::Client(key) => { + self.client_options = self.client_options.with_config(key, value) + } }; self } @@ -834,6 +838,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::Checksum => { self.checksum_algorithm.as_ref().map(ToString::to_string) } + AmazonS3ConfigKey::Client(key) => self.client_options.get_config_value(key), } } diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 2b5b43adabe0..c2cfdfe6af32 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -559,6 +559,9 @@ pub enum AzureConfigKey { /// - `azure_use_azure_cli` /// - `use_azure_cli` UseAzureCli, + + /// Client options + Client(ClientConfigKey), } impl AsRef for AzureConfigKey { @@ -577,6 +580,7 @@ impl AsRef for AzureConfigKey { Self::MsiResourceId => "azure_msi_resource_id", Self::FederatedTokenFile => "azure_federated_token_file", Self::UseAzureCli => "azure_use_azure_cli", + Self::Client(key) => key.as_ref(), } } } @@ -621,7 +625,12 @@ impl FromStr for AzureConfigKey { Ok(Self::FederatedTokenFile) } "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), - _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + // Backwards compatibility + "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), + _ => match s.parse() { + Ok(key) => Ok(Self::Client(key)), + Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + }, } } } @@ -664,21 +673,13 @@ impl MicrosoftAzureBuilder { for (os_key, os_value) in std::env::vars_os() { if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { if key.starts_with("AZURE_") { - if let Ok(config_key) = - AzureConfigKey::from_str(&key.to_ascii_lowercase()) - { + if let Ok(config_key) = key.to_ascii_lowercase().parse() { builder = builder.with_config(config_key, value); } } } } - if let Ok(text) = std::env::var("AZURE_ALLOW_HTTP") { - builder.client_options = builder - .client_options - .with_config(ClientConfigKey::AllowHttp, text) - } - if let Ok(text) = std::env::var(MSI_ENDPOINT_ENV_KEY) { builder = builder.with_msi_endpoint(text); } @@ -731,6 +732,9 @@ impl MicrosoftAzureBuilder { } AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), AzureConfigKey::UseEmulator => self.use_emulator.parse(value), + AzureConfigKey::Client(key) => { + self.client_options = self.client_options.with_config(key, value) + } }; self } @@ -786,6 +790,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::MsiResourceId => self.msi_resource_id.clone(), AzureConfigKey::FederatedTokenFile => self.federated_token_file.clone(), AzureConfigKey::UseAzureCli => Some(self.use_azure_cli.to_string()), + AzureConfigKey::Client(key) => self.client_options.get_config_value(key), } } diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index d7b0b86d99e5..d2242dd41089 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -31,6 +31,7 @@ use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, ClientBuilder, Proxy}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; +use std::str::FromStr; use std::time::Duration; use crate::path::Path; @@ -53,6 +54,28 @@ pub enum ClientConfigKey { AllowHttp, } +impl AsRef for ClientConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::AllowHttp => "allow_http", + } + } +} + +impl FromStr for ClientConfigKey { + type Err = super::Error; + + fn from_str(s: &str) -> Result { + match s { + "allow_http" => Ok(Self::AllowHttp), + _ 
=> Err(super::Error::UnknownConfigurationKey { + store: "HTTP", + key: s.into(), + }), + } + } +} + /// HTTP client configuration for remote object stores #[derive(Debug, Clone, Default)] pub struct ClientOptions { diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 6f3d53d42f34..375b4d8f8c37 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -49,6 +49,7 @@ use url::Url; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; +use crate::client::ClientConfigKey; use crate::{ client::token::TokenCache, multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, @@ -829,6 +830,9 @@ pub enum GoogleConfigKey { /// /// See [`GoogleCloudStorageBuilder::with_application_credentials`]. ApplicationCredentials, + + /// Client options + Client(ClientConfigKey), } impl AsRef for GoogleConfigKey { @@ -838,6 +842,7 @@ impl AsRef for GoogleConfigKey { Self::ServiceAccountKey => "google_service_account_key", Self::Bucket => "google_bucket", Self::ApplicationCredentials => "google_application_credentials", + Self::Client(key) => key.as_ref(), } } } @@ -858,7 +863,10 @@ impl FromStr for GoogleConfigKey { Ok(Self::Bucket) } "google_application_credentials" => Ok(Self::ApplicationCredentials), - _ => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + _ => match s.parse() { + Ok(key) => Ok(Self::Client(key)), + Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + }, } } } @@ -911,9 +919,7 @@ impl GoogleCloudStorageBuilder { for (os_key, os_value) in std::env::vars_os() { if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { if key.starts_with("GOOGLE_") { - if let Ok(config_key) = - GoogleConfigKey::from_str(&key.to_ascii_lowercase()) - { + if let Ok(config_key) = key.to_ascii_lowercase().parse() { builder = builder.with_config(config_key, value); } } @@ -957,6 +963,9 @@ impl GoogleCloudStorageBuilder { GoogleConfigKey::ApplicationCredentials => { self.application_credentials_path = Some(value.into()) } + GoogleConfigKey::Client(key) => { + self.client_options = self.client_options.with_config(key, value) + } }; self } @@ -1005,6 +1014,7 @@ impl GoogleCloudStorageBuilder { GoogleConfigKey::ApplicationCredentials => { self.application_credentials_path.clone() } + GoogleConfigKey::Client(key) => self.client_options.get_config_value(key), } } From d6c3c01aafdaea2b2b1051d76c4e604ffffd72b0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 11 May 2023 18:03:56 +0100 Subject: [PATCH 0891/1411] Skip test_list_root on OS X (#3772) (#4198) * Skip test_list_root if cannot list root filesystem (#3772) * do not run on max * Remove list check --------- Co-authored-by: Andrew Lamb --- object_store/src/local.rs | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 286853da2eda..b40f5a777860 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -1117,19 +1117,23 @@ mod tests { } #[tokio::test] + #[cfg(target_family = "windows")] async fn test_list_root() { - let integration = LocalFileSystem::new(); - let result = integration.list_with_delimiter(None).await; - if cfg!(target_family = "windows") { - let r = result.unwrap_err().to_string(); - assert!( - r.contains("Unable to convert URL \"file:///\" to filesystem path"), - "{}", - r - ); - } else { - result.unwrap(); - } + let fs = LocalFileSystem::new(); + 
let r = fs.list_with_delimiter(None).await.unwrap_err().to_string(); + + assert!( + r.contains("Unable to convert URL \"file:///\" to filesystem path"), + "{}", + r + ); + } + + #[tokio::test] + #[cfg(target_os = "linux")] + async fn test_list_root() { + let fs = LocalFileSystem::new(); + fs.list_with_delimiter(None).await.unwrap(); } async fn check_list( From f875eecf07ab9bc2d0425ca7170aa9178c73bd0b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 11 May 2023 23:41:45 +0100 Subject: [PATCH 0892/1411] Document how to sort a RecordBatch (#4204) --- arrow-ord/src/lib.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arrow-ord/src/lib.rs b/arrow-ord/src/lib.rs index c84db09fd32e..62338c022384 100644 --- a/arrow-ord/src/lib.rs +++ b/arrow-ord/src/lib.rs @@ -16,6 +16,32 @@ // under the License. //! Arrow ordering kernels +//! +//! # Sort RecordBatch +//! +//! ``` +//! # use std::sync::Arc; +//! # use arrow_array::*; +//! # use arrow_array::cast::AsArray; +//! # use arrow_array::types::Int32Type; +//! # use arrow_ord::sort::sort_to_indices; +//! # use arrow_select::take::take; +//! # +//! let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); +//! let b: ArrayRef = Arc::new(StringArray::from(vec!["b", "a", "e", "d"])); +//! let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).unwrap(); +//! +//! // Sort by column 1 +//! let indices = sort_to_indices(batch.column(1), None, None).unwrap(); +//! +//! // Apply indices to batch columns +//! let columns = batch.columns().iter().map(|c| take(&*c, &indices, None).unwrap()).collect(); +//! let sorted = RecordBatch::try_new(batch.schema(), columns).unwrap(); +//! +//! let col1 = sorted.column(0).as_primitive::(); +//! assert_eq!(col1.values(), &[2, 1, 4, 3]); +//! ``` +//! 
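(Editorial aside, not part of the patch above.) The new `arrow-ord` module docs show sorting a `RecordBatch` by a single column via `sort_to_indices` plus `take`. For multi-column ordering the same pattern works with `lexsort_to_indices`; the following is a minimal sketch only, with illustrative column names and values, assuming the `arrow-array`, `arrow-ord` and `arrow-select` crates at the versions in this patch series:

```rust
use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array, RecordBatch, StringArray};
use arrow_ord::sort::{lexsort_to_indices, SortColumn, SortOptions};
use arrow_select::take::take;

fn main() {
    let a: ArrayRef = Arc::new(Int32Array::from(vec![2, 1, 2, 1]));
    let b: ArrayRef = Arc::new(StringArray::from(vec!["b", "a", "a", "d"]));
    let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).unwrap();

    // Sort by column "a" ascending, then by column "b" descending
    let sort_columns = vec![
        SortColumn {
            values: batch.column(0).clone(),
            options: None,
        },
        SortColumn {
            values: batch.column(1).clone(),
            options: Some(SortOptions {
                descending: true,
                nulls_first: false,
            }),
        },
    ];
    let indices = lexsort_to_indices(&sort_columns, None).unwrap();

    // Apply the same permutation to every column, as in the doc example above
    let columns = batch
        .columns()
        .iter()
        .map(|c| take(c.as_ref(), &indices, None).unwrap())
        .collect::<Vec<_>>();
    let sorted = RecordBatch::try_new(batch.schema(), columns).unwrap();
    assert_eq!(sorted.num_rows(), 4);
}
```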
pub mod comparison; pub mod ord; From e1e1c79915948ec2eda256a94b50a315e05620e3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 12 May 2023 12:01:33 +0100 Subject: [PATCH 0893/1411] Create ObjectStore from URL and Options (#4047) (#4200) * Add parse_url function (#4047) * Clippy * Fix copypasta * Fix wasm32 build * More wasm fixes * Return remaining path * Don't use from_env --- object_store/src/aws/mod.rs | 2 +- object_store/src/lib.rs | 3 + object_store/src/parse.rs | 265 ++++++++++++++++++++++++++++++++++++ 3 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 object_store/src/parse.rs diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 17d779ff6a51..6fa5e1c851c7 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -722,7 +722,7 @@ impl AmazonS3Builder { /// /// - `s3:///` /// - `s3a:///` - /// - `https://s3..amazonaws.com` + /// - `https://s3..amazonaws.com/` /// - `https://.s3..amazonaws.com` /// - `https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket` /// diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 1390a0140d1c..2c93802edaa8 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -252,8 +252,11 @@ mod config; #[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] mod multipart; +mod parse; mod util; +pub use parse::{parse_url, parse_url_opts}; + use crate::path::Path; #[cfg(not(target_arch = "wasm32"))] use crate::util::maybe_spawn_blocking; diff --git a/object_store/src/parse.rs b/object_store/src/parse.rs new file mode 100644 index 000000000000..7b89e58e10e7 --- /dev/null +++ b/object_store/src/parse.rs @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#[cfg(not(target_arch = "wasm32"))] +use crate::local::LocalFileSystem; +use crate::memory::InMemory; +use crate::path::Path; +use crate::ObjectStore; +use snafu::Snafu; +use url::Url; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("Unable to convert URL \"{}\" to filesystem path", url))] + InvalidUrl { url: Url }, + + #[snafu(display("Unable to recognise URL \"{}\"", url))] + Unrecognised { url: Url }, + + #[snafu(display("Feature {scheme:?} not enabled"))] + NotEnabled { scheme: ObjectStoreScheme }, + + #[snafu(context(false))] + Path { source: crate::path::Error }, +} + +impl From for super::Error { + fn from(e: Error) -> Self { + Self::Generic { + store: "URL", + source: Box::new(e), + } + } +} + +/// Recognises various URL formats, identifying the relevant [`ObjectStore`](crate::ObjectStore) +#[derive(Debug, Eq, PartialEq)] +enum ObjectStoreScheme { + /// Url corresponding to [`LocalFileSystem`](crate::local::LocalFileSystem) + Local, + /// Url corresponding to [`InMemory`](crate::memory::InMemory) + Memory, + /// Url corresponding to [`AmazonS3`](crate::aws::AmazonS3) + AmazonS3, + /// Url corresponding to [`GoogleCloudStorage`](crate::gcp::GoogleCloudStorage) + GoogleCloudStorage, + /// Url corresponding to [`MicrosoftAzure`](crate::azure::MicrosoftAzure) + MicrosoftAzure, + /// Url corresponding to [`HttpStore`](crate::http::HttpStore) + Http, +} + +impl ObjectStoreScheme { + /// Create an [`ObjectStoreScheme`] from the provided [`Url`] + /// + /// Returns the [`ObjectStoreScheme`] and the remaining [`Path`] + fn parse(url: &Url) -> Result<(Self, Path), Error> { + let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1); + + let (scheme, path) = match (url.scheme(), url.host_str()) { + ("file", None) => (Self::Local, url.path()), + ("memory", None) => (Self::Memory, url.path()), + ("s3" | "s3a", Some(_)) => (Self::AmazonS3, url.path()), + ("gs", Some(_)) => (Self::GoogleCloudStorage, url.path()), + ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => { + (Self::MicrosoftAzure, url.path()) + } + ("http", Some(_)) => (Self::Http, url.path()), + ("https", Some(host)) => { + if host.ends_with("dfs.core.windows.net") + || host.ends_with("blob.core.windows.net") + { + (Self::MicrosoftAzure, url.path()) + } else if host.ends_with("amazonaws.com") { + match host.starts_with("s3") { + true => (Self::AmazonS3, strip_bucket().unwrap_or_default()), + false => (Self::AmazonS3, url.path()), + } + } else if host.ends_with("r2.cloudflarestorage.com") { + (Self::AmazonS3, strip_bucket().unwrap_or_default()) + } else { + (Self::Http, url.path()) + } + } + _ => return Err(Error::Unrecognised { url: url.clone() }), + }; + + let path = Path::parse(path)?; + Ok((scheme, path)) + } +} + +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure", feature = "http"))] +macro_rules! builder_opts { + ($builder:ty, $url:expr, $options:expr) => {{ + let builder = $options.into_iter().fold( + <$builder>::new().with_url($url.as_str()), + |builder, (key, value)| match key.as_ref().parse() { + Ok(k) => builder.with_config(k, value), + Err(_) => builder, + }, + ); + Box::new(builder.build()?) 
as _ + }}; +} + +/// Create an [`ObjectStore`] based on the provided `url` +/// +/// Returns +/// - An [`ObjectStore`] of the corresponding type +/// - The [`Path`] into the [`ObjectStore`] of the addressed resource +pub fn parse_url(url: &Url) -> Result<(Box, Path), super::Error> { + parse_url_opts(url, std::iter::empty::<(&str, &str)>()) +} + +/// Create an [`ObjectStore`] based on the provided `url` and options +/// +/// Returns +/// - An [`ObjectStore`] of the corresponding type +/// - The [`Path`] into the [`ObjectStore`] of the addressed resource +pub fn parse_url_opts( + url: &Url, + options: I, +) -> Result<(Box, Path), super::Error> +where + I: IntoIterator, + K: AsRef, + V: Into, +{ + let _options = options; + let (scheme, path) = ObjectStoreScheme::parse(url)?; + let path = Path::parse(path)?; + + let store = match scheme { + #[cfg(not(target_arch = "wasm32"))] + ObjectStoreScheme::Local => Box::new(LocalFileSystem::new()) as _, + ObjectStoreScheme::Memory => Box::new(InMemory::new()) as _, + #[cfg(feature = "aws")] + ObjectStoreScheme::AmazonS3 => { + builder_opts!(crate::aws::AmazonS3Builder, url, _options) + } + #[cfg(feature = "gcp")] + ObjectStoreScheme::GoogleCloudStorage => { + builder_opts!(crate::gcp::GoogleCloudStorageBuilder, url, _options) + } + #[cfg(feature = "azure")] + ObjectStoreScheme::MicrosoftAzure => { + builder_opts!(crate::azure::MicrosoftAzureBuilder, url, _options) + } + #[cfg(feature = "http")] + ObjectStoreScheme::Http => { + let url = &url[..url::Position::BeforePath]; + Box::new(crate::http::HttpBuilder::new().with_url(url).build()?) as _ + } + #[cfg(not(all( + feature = "aws", + feature = "azure", + feature = "gcp", + feature = "http" + )))] + s => { + return Err(super::Error::Generic { + store: "parse_url", + source: format!("feature for {s:?} not enabled").into(), + }) + } + }; + + Ok((store, path)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse() { + let cases = [ + ("file:/path", (ObjectStoreScheme::Local, "path")), + ("file:///path", (ObjectStoreScheme::Local, "path")), + ("memory:/path", (ObjectStoreScheme::Memory, "path")), + ("memory:///", (ObjectStoreScheme::Memory, "")), + ("s3://bucket/path", (ObjectStoreScheme::AmazonS3, "path")), + ("s3a://bucket/path", (ObjectStoreScheme::AmazonS3, "path")), + ( + "https://s3.region.amazonaws.com/bucket", + (ObjectStoreScheme::AmazonS3, ""), + ), + ( + "https://s3.region.amazonaws.com/bucket/path", + (ObjectStoreScheme::AmazonS3, "path"), + ), + ( + "https://bucket.s3.region.amazonaws.com", + (ObjectStoreScheme::AmazonS3, ""), + ), + ( + "https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket", + (ObjectStoreScheme::AmazonS3, ""), + ), + ( + "https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket/path", + (ObjectStoreScheme::AmazonS3, "path"), + ), + ( + "abfs://container/path", + (ObjectStoreScheme::MicrosoftAzure, "path"), + ), + ( + "abfs://file_system@account_name.dfs.core.windows.net/path", + (ObjectStoreScheme::MicrosoftAzure, "path"), + ), + ( + "abfss://file_system@account_name.dfs.core.windows.net/path", + (ObjectStoreScheme::MicrosoftAzure, "path"), + ), + ( + "https://account.dfs.core.windows.net", + (ObjectStoreScheme::MicrosoftAzure, ""), + ), + ( + "https://account.blob.core.windows.net", + (ObjectStoreScheme::MicrosoftAzure, ""), + ), + ( + "gs://bucket/path", + (ObjectStoreScheme::GoogleCloudStorage, "path"), + ), + ("http://mydomain/path", (ObjectStoreScheme::Http, "path")), + ("https://mydomain/path", (ObjectStoreScheme::Http, "path")), + ]; + + for (s, 
(expected_scheme, expected_path)) in cases { + let url = Url::parse(s).unwrap(); + let (scheme, path) = ObjectStoreScheme::parse(&url).unwrap(); + + assert_eq!(scheme, expected_scheme, "{s}"); + assert_eq!(path, Path::parse(expected_path).unwrap(), "{s}"); + } + + let neg_cases = [ + "unix:/run/foo.socket", + "file://remote/path", + "memory://remote/", + ]; + for s in neg_cases { + let url = Url::parse(s).unwrap(); + assert!(ObjectStoreScheme::parse(&url).is_err()); + } + } +} From d012bb289fefddde8c388f7ddb3cc1d31e5a0ca9 Mon Sep 17 00:00:00 2001 From: Armin Primadi Date: Fri, 12 May 2023 23:54:29 +0700 Subject: [PATCH 0894/1411] Fix incorrect cast Timestamp with Timezone (#4201) * Fix incorrect cast Timestamp with Timezone * Fix incorrect cast timestamp with timezone * Support chrono_tz Timezone * Update arrow-cast/src/cast.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-cast/src/cast.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Move chrono-tz timestamp test to arrow/tests * Fix clippy and cargo fmt * Fix clippy --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-cast/src/cast.rs | 151 +++++++++++++++++++++++++++++++++++++- arrow/tests/array_cast.rs | 92 +++++++++++++++++++++++ 2 files changed, 240 insertions(+), 3 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 37fede0a6fe0..2b286bfa9119 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -35,7 +35,7 @@ //! assert_eq!(7.0, c.value(2)); //! ``` -use chrono::{NaiveTime, TimeZone, Timelike, Utc}; +use chrono::{NaiveTime, Offset, TimeZone, Timelike, Utc}; use std::cmp::Ordering; use std::sync::Arc; @@ -1770,7 +1770,7 @@ pub fn cast_with_options( tz.clone(), )), - (Timestamp(from_unit, _), Timestamp(to_unit, to_tz)) => { + (Timestamp(from_unit, from_tz), Timestamp(to_unit, to_tz)) => { let array = cast_with_options(array, &Int64, cast_options)?; let time_array = array.as_primitive::(); let from_size = time_unit_multiple(from_unit); @@ -1792,8 +1792,52 @@ pub fn cast_with_options( } } }; + // Normalize timezone + let adjusted = match (from_tz, to_tz) { + // Only this case needs to be adjusted because we're casting from + // unknown time offset to some time offset, we want the time to be + // unchanged. + // + // i.e. Timestamp('2001-01-01T00:00', None) -> Timestamp('2001-01-01T00:00', '+0700') + (None, Some(to_tz)) => { + let to_tz: Tz = to_tz.parse()?; + match to_unit { + TimeUnit::Second => { + adjust_timestamp_to_timezone::( + converted, + &to_tz, + cast_options, + )? + } + TimeUnit::Millisecond => { + adjust_timestamp_to_timezone::( + converted, + &to_tz, + cast_options, + )? + } + TimeUnit::Microsecond => { + adjust_timestamp_to_timezone::( + converted, + &to_tz, + cast_options, + )? + } + TimeUnit::Nanosecond => { + adjust_timestamp_to_timezone::( + converted, + &to_tz, + cast_options, + )? 
+ } + } + } + _ => { + converted + } + }; Ok(make_timestamp_array( - &converted, + &adjusted, to_unit.clone(), to_tz.clone(), )) @@ -3005,6 +3049,30 @@ fn cast_string_to_month_day_nano_interval( Ok(Arc::new(interval_array) as ArrayRef) } +fn adjust_timestamp_to_timezone( + array: PrimitiveArray, + to_tz: &Tz, + cast_options: &CastOptions, +) -> Result, ArrowError> { + let adjust = |o| { + let local = as_datetime::(o)?; + let offset = to_tz.offset_from_local_datetime(&local).single()?; + T::make_value(local - offset.fix()) + }; + let adjusted = if cast_options.safe { + array.unary_opt::<_, Int64Type>(adjust) + } else { + array.try_unary::<_, Int64Type, _>(|o| { + adjust(o).ok_or_else(|| { + ArrowError::CastError( + "Cannot cast timezone to different timezone".to_string(), + ) + }) + })? + }; + Ok(adjusted) +} + /// Casts Utf8 to Boolean fn cast_utf8_to_boolean( from: &dyn Array, @@ -5978,6 +6046,83 @@ mod tests { assert!(b.is_err()); } + // Cast Timestamp(_, None) -> Timestamp(_, Some(timezone)) + #[test] + fn test_cast_timestamp_with_timezone_1() { + let string_array: Arc = Arc::new(StringArray::from(vec![ + Some("2000-01-01T00:00:00.123456789"), + Some("2010-01-01T00:00:00.123456789"), + None, + ])); + let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None); + let timestamp_array = cast(&string_array, &to_type).unwrap(); + + let to_type = DataType::Timestamp(TimeUnit::Microsecond, Some("+0700".into())); + let timestamp_array = cast(×tamp_array, &to_type).unwrap(); + + let string_array = cast(×tamp_array, &DataType::Utf8).unwrap(); + let result = string_array.as_string::(); + assert_eq!("2000-01-01T00:00:00.123456+07:00", result.value(0)); + assert_eq!("2010-01-01T00:00:00.123456+07:00", result.value(1)); + assert!(result.is_null(2)); + } + + // Cast Timestamp(_, Some(timezone)) -> Timestamp(_, None) + #[test] + fn test_cast_timestamp_with_timezone_2() { + let string_array: Arc = Arc::new(StringArray::from(vec![ + Some("2000-01-01T07:00:00.123456789"), + Some("2010-01-01T07:00:00.123456789"), + None, + ])); + let to_type = DataType::Timestamp(TimeUnit::Millisecond, Some("+0700".into())); + let timestamp_array = cast(&string_array, &to_type).unwrap(); + + // Check intermediate representation is correct + let string_array = cast(×tamp_array, &DataType::Utf8).unwrap(); + let result = string_array.as_string::(); + assert_eq!("2000-01-01T07:00:00.123+07:00", result.value(0)); + assert_eq!("2010-01-01T07:00:00.123+07:00", result.value(1)); + assert!(result.is_null(2)); + + let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None); + let timestamp_array = cast(×tamp_array, &to_type).unwrap(); + + let string_array = cast(×tamp_array, &DataType::Utf8).unwrap(); + let result = string_array.as_string::(); + assert_eq!("2000-01-01T00:00:00.123", result.value(0)); + assert_eq!("2010-01-01T00:00:00.123", result.value(1)); + assert!(result.is_null(2)); + } + + // Cast Timestamp(_, Some(timezone)) -> Timestamp(_, Some(timezone)) + #[test] + fn test_cast_timestamp_with_timezone_3() { + let string_array: Arc = Arc::new(StringArray::from(vec![ + Some("2000-01-01T07:00:00.123456789"), + Some("2010-01-01T07:00:00.123456789"), + None, + ])); + let to_type = DataType::Timestamp(TimeUnit::Microsecond, Some("+0700".into())); + let timestamp_array = cast(&string_array, &to_type).unwrap(); + + // Check intermediate representation is correct + let string_array = cast(×tamp_array, &DataType::Utf8).unwrap(); + let result = string_array.as_string::(); + assert_eq!("2000-01-01T07:00:00.123456+07:00", 
result.value(0)); + assert_eq!("2010-01-01T07:00:00.123456+07:00", result.value(1)); + assert!(result.is_null(2)); + + let to_type = DataType::Timestamp(TimeUnit::Second, Some("-08:00".into())); + let timestamp_array = cast(×tamp_array, &to_type).unwrap(); + + let string_array = cast(×tamp_array, &DataType::Utf8).unwrap(); + let result = string_array.as_string::(); + assert_eq!("1999-12-31T16:00:00-08:00", result.value(0)); + assert_eq!("2009-12-31T16:00:00-08:00", result.value(1)); + assert!(result.is_null(2)); + } + #[test] fn test_cast_date64_to_timestamp() { let array = diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index bf7e7a326efc..43dc6dd0eb0a 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -18,6 +18,7 @@ use arrow_array::builder::{ PrimitiveDictionaryBuilder, StringDictionaryBuilder, UnionBuilder, }; +use arrow_array::cast::AsArray; use arrow_array::types::{ ArrowDictionaryKeyType, Decimal128Type, Decimal256Type, Int16Type, Int32Type, Int64Type, Int8Type, TimestampMicrosecondType, UInt16Type, UInt32Type, UInt64Type, @@ -64,6 +65,97 @@ fn test_cast_timestamp_to_string() { assert!(c.is_null(2)); } +// See: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones for list of valid +// timezones + +// Cast Timestamp(_, None) -> Timestamp(_, Some(timezone)) +#[test] +fn test_cast_timestamp_with_timezone_daylight_1() { + let string_array: Arc = Arc::new(StringArray::from(vec![ + // This is winter in New York so daylight saving is not in effect + // UTC offset is -05:00 + Some("2000-01-01T00:00:00.123456789"), + // This is summer in New York so daylight saving is in effect + // UTC offset is -04:00 + Some("2010-07-01T00:00:00.123456789"), + None, + ])); + let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None); + let timestamp_array = cast(&string_array, &to_type).unwrap(); + + let to_type = + DataType::Timestamp(TimeUnit::Microsecond, Some("America/New_York".into())); + let timestamp_array = cast(×tamp_array, &to_type).unwrap(); + + let string_array = cast(×tamp_array, &DataType::Utf8).unwrap(); + let result = string_array.as_string::(); + assert_eq!("2000-01-01T00:00:00.123456-05:00", result.value(0)); + assert_eq!("2010-07-01T00:00:00.123456-04:00", result.value(1)); + assert!(result.is_null(2)); +} + +// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, None) +#[test] +fn test_cast_timestamp_with_timezone_daylight_2() { + let string_array: Arc = Arc::new(StringArray::from(vec![ + Some("2000-01-01T07:00:00.123456789"), + Some("2010-07-01T07:00:00.123456789"), + None, + ])); + let to_type = + DataType::Timestamp(TimeUnit::Millisecond, Some("America/New_York".into())); + let timestamp_array = cast(&string_array, &to_type).unwrap(); + + // Check intermediate representation is correct + let string_array = cast(×tamp_array, &DataType::Utf8).unwrap(); + let result = string_array.as_string::(); + assert_eq!("2000-01-01T07:00:00.123-05:00", result.value(0)); + assert_eq!("2010-07-01T07:00:00.123-04:00", result.value(1)); + assert!(result.is_null(2)); + + let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None); + let timestamp_array = cast(×tamp_array, &to_type).unwrap(); + + let string_array = cast(×tamp_array, &DataType::Utf8).unwrap(); + let result = string_array.as_string::(); + assert_eq!("2000-01-01T12:00:00.123", result.value(0)); + assert_eq!("2010-07-01T11:00:00.123", result.value(1)); + assert!(result.is_null(2)); +} + +// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, Some(timezone)) +#[test] +fn 
test_cast_timestamp_with_timezone_daylight_3() { + let string_array: Arc = Arc::new(StringArray::from(vec![ + // Winter in New York, summer in Sydney + // UTC offset is -05:00 (New York) and +11:00 (Sydney) + Some("2000-01-01T00:00:00.123456789"), + // Summer in New York, winter in Sydney + // UTC offset is -04:00 (New York) and +10:00 (Sydney) + Some("2010-07-01T00:00:00.123456789"), + None, + ])); + let to_type = + DataType::Timestamp(TimeUnit::Microsecond, Some("America/New_York".into())); + let timestamp_array = cast(&string_array, &to_type).unwrap(); + + // Check intermediate representation is correct + let string_array = cast(×tamp_array, &DataType::Utf8).unwrap(); + let result = string_array.as_string::(); + assert_eq!("2000-01-01T00:00:00.123456-05:00", result.value(0)); + assert_eq!("2010-07-01T00:00:00.123456-04:00", result.value(1)); + assert!(result.is_null(2)); + + let to_type = DataType::Timestamp(TimeUnit::Second, Some("Australia/Sydney".into())); + let timestamp_array = cast(×tamp_array, &to_type).unwrap(); + + let string_array = cast(×tamp_array, &DataType::Utf8).unwrap(); + let result = string_array.as_string::(); + assert_eq!("2000-01-01T16:00:00+11:00", result.value(0)); + assert_eq!("2010-07-01T14:00:00+10:00", result.value(1)); + assert!(result.is_null(2)); +} + #[test] #[cfg_attr(miri, ignore)] // running forever fn test_can_cast_types() { From 144528f0e2c6c21fcb72ff8b79b01a4c62c06077 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Fri, 12 May 2023 19:00:02 +0200 Subject: [PATCH 0895/1411] feat: extend client option configuration keys (#4208) --- object_store/Cargo.toml | 1 + object_store/src/client/mod.rs | 356 +++++++++++++++++++++++++++++---- object_store/src/config.rs | 55 ++++- 3 files changed, 367 insertions(+), 45 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index b27482bcfabc..e25801b6c92d 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -33,6 +33,7 @@ async-trait = "0.1.53" bytes = "1.0" chrono = { version = "0.4.23", default-features = false, features = ["clock"] } futures = "0.3" +humantime = "2.1" itertools = "0.10.1" parking_lot = { version = "0.12" } percent-encoding = "2.1" diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index d2242dd41089..ccf1b4a3bdc2 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -26,14 +26,15 @@ pub mod retry; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod token; -use crate::config::ConfigValue; -use reqwest::header::{HeaderMap, HeaderValue}; -use reqwest::{Client, ClientBuilder, Proxy}; -use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::str::FromStr; use std::time::Duration; +use reqwest::header::{HeaderMap, HeaderValue}; +use reqwest::{Client, ClientBuilder, Proxy}; +use serde::{Deserialize, Serialize}; + +use crate::config::{fmt_duration, ConfigValue}; use crate::path::Path; fn map_client_error(e: reqwest::Error) -> super::Error { @@ -52,12 +53,64 @@ static DEFAULT_USER_AGENT: &str = pub enum ClientConfigKey { /// Allow non-TLS, i.e. non-HTTPS connections AllowHttp, + /// Skip certificate validation on https connections. + /// + /// # Warning + /// + /// You should think very carefully before using this method. If + /// invalid certificates are trusted, *any* certificate for *any* site + /// will be trusted for use. This includes expired certificates. 
This + /// introduces significant vulnerabilities, and should only be used + /// as a last resort or for testing + AllowInvalidCertificates, + /// Timeout for only the connect phase of a Client + ConnectTimeout, + /// default CONTENT_TYPE for uploads + DefaultContentType, + /// Only use http1 connections + Http1Only, + /// Interval for HTTP2 Ping frames should be sent to keep a connection alive. + Http2KeepAliveInterval, + /// Timeout for receiving an acknowledgement of the keep-alive ping. + Http2KeepAliveTimeout, + /// Enable HTTP2 keep alive pings for idle connections + Http2KeepAliveWhileIdle, + /// Only use http2 connections + Http2Only, + /// The pool max idle timeout + /// + /// This is the length of time an idle connection will be kept alive + PoolIdleTimeout, + /// maximum number of idle connections per host + PoolMaxIdlePerHost, + /// HTTP proxy to use for requests + ProxyUrl, + /// Request timeout + /// + /// The timeout is applied from when the request starts connecting until the + /// response body has finished + Timeout, + /// User-Agent header to be used by this client + UserAgent, } impl AsRef for ClientConfigKey { fn as_ref(&self) -> &str { match self { Self::AllowHttp => "allow_http", + Self::AllowInvalidCertificates => "allow_invalid_certificates", + Self::ConnectTimeout => "connect_timeout", + Self::DefaultContentType => "default_content_type", + Self::Http1Only => "http1_only", + Self::Http2Only => "http2_only", + Self::Http2KeepAliveInterval => "http2_keep_alive_interval", + Self::Http2KeepAliveTimeout => "http2_keep_alive_timeout", + Self::Http2KeepAliveWhileIdle => "http2_keep_alive_while_idle", + Self::PoolIdleTimeout => "pool_idle_timeout", + Self::PoolMaxIdlePerHost => "pool_max_idle_per_host", + Self::ProxyUrl => "proxy_url", + Self::Timeout => "timeout", + Self::UserAgent => "user_agent", } } } @@ -68,6 +121,19 @@ impl FromStr for ClientConfigKey { fn from_str(s: &str) -> Result { match s { "allow_http" => Ok(Self::AllowHttp), + "allow_invalid_certificates" => Ok(Self::AllowInvalidCertificates), + "connect_timeout" => Ok(Self::ConnectTimeout), + "default_content_type" => Ok(Self::DefaultContentType), + "http1_only" => Ok(Self::Http1Only), + "http2_only" => Ok(Self::Http2Only), + "http2_keep_alive_interval" => Ok(Self::Http2KeepAliveInterval), + "http2_keep_alive_timeout" => Ok(Self::Http2KeepAliveTimeout), + "http2_keep_alive_while_idle" => Ok(Self::Http2KeepAliveWhileIdle), + "pool_idle_timeout" => Ok(Self::PoolIdleTimeout), + "pool_max_idle_per_host" => Ok(Self::PoolMaxIdlePerHost), + "proxy_url" => Ok(Self::ProxyUrl), + "timeout" => Ok(Self::Timeout), + "user_agent" => Ok(Self::UserAgent), _ => Err(super::Error::UnknownConfigurationKey { store: "HTTP", key: s.into(), @@ -79,22 +145,22 @@ impl FromStr for ClientConfigKey { /// HTTP client configuration for remote object stores #[derive(Debug, Clone, Default)] pub struct ClientOptions { - user_agent: Option, + user_agent: Option>, content_type_map: HashMap, default_content_type: Option, default_headers: Option, proxy_url: Option, allow_http: ConfigValue, - allow_insecure: bool, - timeout: Option, - connect_timeout: Option, - pool_idle_timeout: Option, - pool_max_idle_per_host: Option, - http2_keep_alive_interval: Option, - http2_keep_alive_timeout: Option, - http2_keep_alive_while_idle: bool, - http1_only: bool, - http2_only: bool, + allow_insecure: ConfigValue, + timeout: Option>, + connect_timeout: Option>, + pool_idle_timeout: Option>, + pool_max_idle_per_host: Option>, + http2_keep_alive_interval: Option>, 
+ http2_keep_alive_timeout: Option>, + http2_keep_alive_while_idle: ConfigValue, + http1_only: ConfigValue, + http2_only: ConfigValue, } impl ClientOptions { @@ -107,6 +173,37 @@ impl ClientOptions { pub fn with_config(mut self, key: ClientConfigKey, value: impl Into) -> Self { match key { ClientConfigKey::AllowHttp => self.allow_http.parse(value), + ClientConfigKey::AllowInvalidCertificates => self.allow_insecure.parse(value), + ClientConfigKey::ConnectTimeout => { + self.connect_timeout = Some(ConfigValue::Deferred(value.into())) + } + ClientConfigKey::DefaultContentType => { + self.default_content_type = Some(value.into()) + } + ClientConfigKey::Http1Only => self.http1_only.parse(value), + ClientConfigKey::Http2Only => self.http2_only.parse(value), + ClientConfigKey::Http2KeepAliveInterval => { + self.http2_keep_alive_interval = Some(ConfigValue::Deferred(value.into())) + } + ClientConfigKey::Http2KeepAliveTimeout => { + self.http2_keep_alive_timeout = Some(ConfigValue::Deferred(value.into())) + } + ClientConfigKey::Http2KeepAliveWhileIdle => { + self.http2_keep_alive_while_idle.parse(value) + } + ClientConfigKey::PoolIdleTimeout => { + self.pool_idle_timeout = Some(ConfigValue::Deferred(value.into())) + } + ClientConfigKey::PoolMaxIdlePerHost => { + self.pool_max_idle_per_host = Some(ConfigValue::Deferred(value.into())) + } + ClientConfigKey::ProxyUrl => self.proxy_url = Some(value.into()), + ClientConfigKey::Timeout => { + self.timeout = Some(ConfigValue::Deferred(value.into())) + } + ClientConfigKey::UserAgent => { + self.user_agent = Some(ConfigValue::Deferred(value.into())) + } } self } @@ -115,6 +212,37 @@ impl ClientOptions { pub fn get_config_value(&self, key: &ClientConfigKey) -> Option { match key { ClientConfigKey::AllowHttp => Some(self.allow_http.to_string()), + ClientConfigKey::AllowInvalidCertificates => { + Some(self.allow_insecure.to_string()) + } + ClientConfigKey::ConnectTimeout => { + self.connect_timeout.as_ref().map(fmt_duration) + } + ClientConfigKey::DefaultContentType => self.default_content_type.clone(), + ClientConfigKey::Http1Only => Some(self.http1_only.to_string()), + ClientConfigKey::Http2KeepAliveInterval => { + self.http2_keep_alive_interval.as_ref().map(fmt_duration) + } + ClientConfigKey::Http2KeepAliveTimeout => { + self.http2_keep_alive_timeout.as_ref().map(fmt_duration) + } + ClientConfigKey::Http2KeepAliveWhileIdle => { + Some(self.http2_keep_alive_while_idle.to_string()) + } + ClientConfigKey::Http2Only => Some(self.http2_only.to_string()), + ClientConfigKey::PoolIdleTimeout => { + self.pool_idle_timeout.as_ref().map(fmt_duration) + } + ClientConfigKey::PoolMaxIdlePerHost => { + self.pool_max_idle_per_host.as_ref().map(|v| v.to_string()) + } + ClientConfigKey::ProxyUrl => self.proxy_url.clone(), + ClientConfigKey::Timeout => self.timeout.as_ref().map(fmt_duration), + ClientConfigKey::UserAgent => self + .user_agent + .as_ref() + .and_then(|v| v.get().ok()) + .and_then(|v| v.to_str().ok().map(|s| s.to_string())), } } @@ -122,7 +250,7 @@ impl ClientOptions { /// /// Default is based on the version of this crate pub fn with_user_agent(mut self, agent: HeaderValue) -> Self { - self.user_agent = Some(agent); + self.user_agent = Some(agent.into()); self } @@ -167,19 +295,19 @@ impl ClientOptions { /// introduces significant vulnerabilities, and should only be used /// as a last resort or for testing pub fn with_allow_invalid_certificates(mut self, allow_insecure: bool) -> Self { - self.allow_insecure = allow_insecure; + self.allow_insecure = 
allow_insecure.into(); self } /// Only use http1 connections pub fn with_http1_only(mut self) -> Self { - self.http1_only = true; + self.http1_only = true.into(); self } /// Only use http2 connections pub fn with_http2_only(mut self) -> Self { - self.http2_only = true; + self.http2_only = true.into(); self } @@ -194,13 +322,13 @@ impl ClientOptions { /// The timeout is applied from when the request starts connecting until the /// response body has finished pub fn with_timeout(mut self, timeout: Duration) -> Self { - self.timeout = Some(timeout); + self.timeout = Some(ConfigValue::Parsed(timeout)); self } /// Set a timeout for only the connect phase of a Client pub fn with_connect_timeout(mut self, timeout: Duration) -> Self { - self.connect_timeout = Some(timeout); + self.connect_timeout = Some(ConfigValue::Parsed(timeout)); self } @@ -210,7 +338,7 @@ impl ClientOptions { /// /// Default is 90 seconds pub fn with_pool_idle_timeout(mut self, timeout: Duration) -> Self { - self.pool_idle_timeout = Some(timeout); + self.pool_idle_timeout = Some(ConfigValue::Parsed(timeout)); self } @@ -218,7 +346,7 @@ impl ClientOptions { /// /// Default is no limit pub fn with_pool_max_idle_per_host(mut self, max: usize) -> Self { - self.pool_max_idle_per_host = Some(max); + self.pool_max_idle_per_host = Some(max.into()); self } @@ -226,7 +354,7 @@ impl ClientOptions { /// /// Default is disabled pub fn with_http2_keep_alive_interval(mut self, interval: Duration) -> Self { - self.http2_keep_alive_interval = Some(interval); + self.http2_keep_alive_interval = Some(ConfigValue::Parsed(interval)); self } @@ -237,7 +365,7 @@ impl ClientOptions { /// /// Default is disabled pub fn with_http2_keep_alive_timeout(mut self, interval: Duration) -> Self { - self.http2_keep_alive_timeout = Some(interval); + self.http2_keep_alive_timeout = Some(ConfigValue::Parsed(interval)); self } @@ -248,7 +376,7 @@ impl ClientOptions { /// /// Default is disabled pub fn with_http2_keep_alive_while_idle(mut self) -> Self { - self.http2_keep_alive_while_idle = true; + self.http2_keep_alive_while_idle = true.into(); self } @@ -274,7 +402,7 @@ impl ClientOptions { let mut builder = ClientBuilder::new(); match &self.user_agent { - Some(user_agent) => builder = builder.user_agent(user_agent), + Some(user_agent) => builder = builder.user_agent(user_agent.get()?), None => builder = builder.user_agent(DEFAULT_USER_AGENT), } @@ -287,44 +415,44 @@ impl ClientOptions { builder = builder.proxy(proxy); } - if let Some(timeout) = self.timeout { - builder = builder.timeout(timeout) + if let Some(timeout) = &self.timeout { + builder = builder.timeout(timeout.get()?) } - if let Some(timeout) = self.connect_timeout { - builder = builder.connect_timeout(timeout) + if let Some(timeout) = &self.connect_timeout { + builder = builder.connect_timeout(timeout.get()?) } - if let Some(timeout) = self.pool_idle_timeout { - builder = builder.pool_idle_timeout(timeout) + if let Some(timeout) = &self.pool_idle_timeout { + builder = builder.pool_idle_timeout(timeout.get()?) } - if let Some(max) = self.pool_max_idle_per_host { - builder = builder.pool_max_idle_per_host(max) + if let Some(max) = &self.pool_max_idle_per_host { + builder = builder.pool_max_idle_per_host(max.get()?) } - if let Some(interval) = self.http2_keep_alive_interval { - builder = builder.http2_keep_alive_interval(interval) + if let Some(interval) = &self.http2_keep_alive_interval { + builder = builder.http2_keep_alive_interval(interval.get()?) 
} - if let Some(interval) = self.http2_keep_alive_timeout { - builder = builder.http2_keep_alive_timeout(interval) + if let Some(interval) = &self.http2_keep_alive_timeout { + builder = builder.http2_keep_alive_timeout(interval.get()?) } - if self.http2_keep_alive_while_idle { + if self.http2_keep_alive_while_idle.get()? { builder = builder.http2_keep_alive_while_idle(true) } - if self.http1_only { + if self.http1_only.get()? { builder = builder.http1_only() } - if self.http2_only { + if self.http2_only.get()? { builder = builder.http2_prior_knowledge() } - if self.allow_insecure { - builder = builder.danger_accept_invalid_certs(self.allow_insecure) + if self.allow_insecure.get()? { + builder = builder.danger_accept_invalid_certs(true) } builder @@ -333,3 +461,143 @@ impl ClientOptions { .map_err(map_client_error) } } + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + #[test] + fn client_test_config_from_map() { + let allow_http = "true".to_string(); + let allow_invalid_certificates = "false".to_string(); + let connect_timeout = "90 seconds".to_string(); + let default_content_type = "object_store:fake_default_content_type".to_string(); + let http1_only = "true".to_string(); + let http2_only = "false".to_string(); + let http2_keep_alive_interval = "90 seconds".to_string(); + let http2_keep_alive_timeout = "91 seconds".to_string(); + let http2_keep_alive_while_idle = "92 seconds".to_string(); + let pool_idle_timeout = "93 seconds".to_string(); + let pool_max_idle_per_host = "94".to_string(); + let proxy_url = "https://fake_proxy_url".to_string(); + let timeout = "95 seconds".to_string(); + let user_agent = "object_store:fake_user_agent".to_string(); + + let options = HashMap::from([ + ("allow_http", allow_http.clone()), + ( + "allow_invalid_certificates", + allow_invalid_certificates.clone(), + ), + ("connect_timeout", connect_timeout.clone()), + ("default_content_type", default_content_type.clone()), + ("http1_only", http1_only.clone()), + ("http2_only", http2_only.clone()), + ( + "http2_keep_alive_interval", + http2_keep_alive_interval.clone(), + ), + ("http2_keep_alive_timeout", http2_keep_alive_timeout.clone()), + ( + "http2_keep_alive_while_idle", + http2_keep_alive_while_idle.clone(), + ), + ("pool_idle_timeout", pool_idle_timeout.clone()), + ("pool_max_idle_per_host", pool_max_idle_per_host.clone()), + ("proxy_url", proxy_url.clone()), + ("timeout", timeout.clone()), + ("user_agent", user_agent.clone()), + ]); + + let builder = options + .into_iter() + .fold(ClientOptions::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }); + + assert_eq!( + builder + .get_config_value(&ClientConfigKey::AllowHttp) + .unwrap(), + allow_http + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::AllowInvalidCertificates) + .unwrap(), + allow_invalid_certificates + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::ConnectTimeout) + .unwrap(), + connect_timeout + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::DefaultContentType) + .unwrap(), + default_content_type + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::Http1Only) + .unwrap(), + http1_only + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::Http2Only) + .unwrap(), + http2_only + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::Http2KeepAliveInterval) + .unwrap(), + http2_keep_alive_interval + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::Http2KeepAliveTimeout) + 
.unwrap(), + http2_keep_alive_timeout + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::Http2KeepAliveWhileIdle) + .unwrap(), + http2_keep_alive_while_idle + ); + + assert_eq!( + builder + .get_config_value(&ClientConfigKey::PoolIdleTimeout) + .unwrap(), + pool_idle_timeout + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::PoolMaxIdlePerHost) + .unwrap(), + pool_max_idle_per_host + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::ProxyUrl) + .unwrap(), + proxy_url + ); + assert_eq!( + builder.get_config_value(&ClientConfigKey::Timeout).unwrap(), + timeout + ); + assert_eq!( + builder + .get_config_value(&ClientConfigKey::UserAgent) + .unwrap(), + user_agent + ); + } +} diff --git a/object_store/src/config.rs b/object_store/src/config.rs index 3ecce2e52bf1..987e6e420eb3 100644 --- a/object_store/src/config.rs +++ b/object_store/src/config.rs @@ -14,9 +14,14 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +use std::fmt::{Debug, Display, Formatter}; +use std::str::FromStr; +use std::time::Duration; + +use humantime::{format_duration, parse_duration}; +use reqwest::header::HeaderValue; use crate::{Error, Result}; -use std::fmt::{Debug, Display, Formatter}; /// Provides deferred parsing of a value /// @@ -79,3 +84,51 @@ impl Parse for bool { } } } + +impl Parse for Duration { + fn parse(v: &str) -> Result { + parse_duration(v).map_err(|_| Error::Generic { + store: "Config", + source: format!("failed to parse \"{v}\" as Duration").into(), + }) + } +} + +impl Parse for usize { + fn parse(v: &str) -> Result { + Self::from_str(v).map_err(|_| Error::Generic { + store: "Config", + source: format!("failed to parse \"{v}\" as usize").into(), + }) + } +} + +impl Parse for HeaderValue { + fn parse(v: &str) -> Result { + Self::from_str(v).map_err(|_| Error::Generic { + store: "Config", + source: format!("failed to parse \"{v}\" as HeaderValue").into(), + }) + } +} + +pub(crate) fn fmt_duration(duration: &ConfigValue) -> String { + match duration { + ConfigValue::Parsed(v) => format_duration(*v).to_string(), + ConfigValue::Deferred(v) => v.clone(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::Duration; + + #[test] + fn test_parse_duration() { + let duration = Duration::from_secs(60); + assert_eq!(Duration::parse("60 seconds").unwrap(), duration); + assert_eq!(Duration::parse("60 s").unwrap(), duration); + assert_eq!(Duration::parse("60s").unwrap(), duration) + } +} From 0190408147a34c6c08fcc9ba57443c629c678ca6 Mon Sep 17 00:00:00 2001 From: Alexandre Crayssac Date: Fri, 12 May 2023 19:25:04 +0200 Subject: [PATCH 0896/1411] Add RecordBatchWriter trait and implement it for CSV, JSON, IPC and Parquet (#4206) Co-authored-by: alexandreyc --- arrow-array/src/lib.rs | 1 + arrow-array/src/record_batch.rs | 6 +++ arrow-csv/src/writer.rs | 6 +++ arrow-ipc/src/writer.rs | 12 ++++++ arrow-json/src/writer.rs | 56 ++++++++++++++++----------- arrow/benches/json_reader.rs | 2 +- parquet/src/arrow/arrow_writer/mod.rs | 10 ++++- 7 files changed, 67 insertions(+), 26 deletions(-) diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 6ee9f7f1d06f..46de381c3244 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -183,6 +183,7 @@ pub use array::*; mod record_batch; pub use record_batch::{ RecordBatch, RecordBatchIterator, RecordBatchOptions, RecordBatchReader, + RecordBatchWriter, }; mod arithmetic; diff --git 
a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index bd1cc65c7341..aea49c04753e 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -43,6 +43,12 @@ pub trait RecordBatchReader: Iterator> { } } +/// Trait for types that can write `RecordBatch`'s. +pub trait RecordBatchWriter { + /// Write a single batch to the writer. + fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError>; +} + /// A two-dimensional batch of column-oriented data with a defined /// [schema](arrow_schema::Schema). /// diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 5f542be30a73..ba2123a09498 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -193,6 +193,12 @@ impl Writer { } } +impl RecordBatchWriter for Writer { + fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { + self.write(batch) + } +} + /// A CSV writer builder #[derive(Clone, Debug)] pub struct WriterBuilder { diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index b2fcec08d845..fcfd4d97ac07 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -857,6 +857,12 @@ impl FileWriter { } } +impl RecordBatchWriter for FileWriter { + fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { + self.write(batch) + } +} + pub struct StreamWriter { /// The object to write to writer: BufWriter, @@ -991,6 +997,12 @@ impl StreamWriter { } } +impl RecordBatchWriter for StreamWriter { + fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { + self.write(batch) + } +} + /// Stores the encoded data, which is an crate::Message, and optional Arrow data pub struct EncodedData { /// An encoded crate::Message diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index d610dd9a35b4..6f241be409dc 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -35,7 +35,7 @@ //! let a = Int32Array::from(vec![1, 2, 3]); //! let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap(); //! -//! let json_rows = arrow_json::writer::record_batches_to_json_rows(&[batch]).unwrap(); +//! let json_rows = arrow_json::writer::record_batches_to_json_rows(&[&batch]).unwrap(); //! assert_eq!( //! serde_json::Value::Object(json_rows[1].clone()), //! serde_json::json!({"a": 2}), @@ -59,7 +59,7 @@ //! // Write the record batch out as JSON //! let buf = Vec::new(); //! let mut writer = arrow_json::LineDelimitedWriter::new(buf); -//! writer.write_batches(&vec![batch]).unwrap(); +//! writer.write_batches(&vec![&batch]).unwrap(); //! writer.finish().unwrap(); //! //! // Get the underlying buffer back, @@ -85,7 +85,7 @@ //! // Write the record batch out as a JSON array //! let buf = Vec::new(); //! let mut writer = arrow_json::ArrayWriter::new(buf); -//! writer.write_batches(&vec![batch]).unwrap(); +//! writer.write_batches(&vec![&batch]).unwrap(); //! writer.finish().unwrap(); //! //! 
// Get the underlying buffer back, @@ -390,7 +390,7 @@ fn set_column_for_json_rows( /// Converts an arrow [`RecordBatch`] into a `Vec` of Serde JSON /// [`JsonMap`]s (objects) pub fn record_batches_to_json_rows( - batches: &[RecordBatch], + batches: &[&RecordBatch], ) -> Result>, ArrowError> { let mut rows: Vec> = iter::repeat(JsonMap::new()) .take(batches.iter().map(|b| b.num_rows()).sum()) @@ -554,7 +554,7 @@ where } /// Convert the `RecordBatch` into JSON rows, and write them to the output - pub fn write(&mut self, batch: RecordBatch) -> Result<(), ArrowError> { + pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { for row in record_batches_to_json_rows(&[batch])? { self.write_row(&Value::Object(row))?; } @@ -562,7 +562,7 @@ where } /// Convert the [`RecordBatch`] into JSON rows, and write them to the output - pub fn write_batches(&mut self, batches: &[RecordBatch]) -> Result<(), ArrowError> { + pub fn write_batches(&mut self, batches: &[&RecordBatch]) -> Result<(), ArrowError> { for row in record_batches_to_json_rows(batches)? { self.write_row(&Value::Object(row))?; } @@ -586,6 +586,16 @@ where } } +impl RecordBatchWriter for Writer +where + W: Write, + F: JsonFormat, +{ + fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { + self.write(batch) + } +} + #[cfg(test)] mod tests { use std::fs::{read_to_string, File}; @@ -631,7 +641,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } assert_json_eq( @@ -662,7 +672,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } assert_json_eq( @@ -704,7 +714,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } assert_json_eq( @@ -759,7 +769,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } assert_json_eq( @@ -818,7 +828,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } assert_json_eq( @@ -864,7 +874,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } assert_json_eq( @@ -907,7 +917,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } assert_json_eq( @@ -950,7 +960,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } assert_json_eq( @@ -1010,7 +1020,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } assert_json_eq( @@ -1053,7 +1063,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } 
assert_json_eq( @@ -1113,7 +1123,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } assert_json_eq( @@ -1192,7 +1202,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } assert_json_eq( @@ -1217,7 +1227,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } let result = String::from_utf8(buf).unwrap(); @@ -1315,7 +1325,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } // NOTE: The last value should technically be {"list": [null]} but it appears @@ -1378,7 +1388,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[batch]).unwrap(); + writer.write_batches(&[&batch]).unwrap(); } assert_json_eq( @@ -1408,7 +1418,7 @@ mod tests { let mut buf = Vec::new(); { let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write(batch).unwrap(); + writer.write(&batch).unwrap(); } let result = String::from_utf8(buf).unwrap(); @@ -1445,7 +1455,7 @@ mod tests { let batch = reader.next().unwrap().unwrap(); // test batches = an empty batch + 2 same batches, finally result should be eq to 2 same batches - let batches = [RecordBatch::new_empty(schema), batch.clone(), batch]; + let batches = [&RecordBatch::new_empty(schema), &batch, &batch]; let mut buf = Vec::new(); { diff --git a/arrow/benches/json_reader.rs b/arrow/benches/json_reader.rs index 8cebc42e4cf6..8f3898c51f9d 100644 --- a/arrow/benches/json_reader.rs +++ b/arrow/benches/json_reader.rs @@ -92,7 +92,7 @@ fn large_bench_primitive(c: &mut Criterion) { .unwrap(); let mut out = Vec::with_capacity(1024); - LineDelimitedWriter::new(&mut out).write(batch).unwrap(); + LineDelimitedWriter::new(&mut out).write(&batch).unwrap(); let json = std::str::from_utf8(&out).unwrap(); do_bench(c, "large_bench_primitive", json, schema) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 14eb30f0b9c5..075ecc034862 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -23,8 +23,8 @@ use std::sync::Arc; use arrow_array::cast::AsArray; use arrow_array::types::{Decimal128Type, Int32Type, Int64Type, UInt32Type, UInt64Type}; -use arrow_array::{types, Array, ArrayRef, RecordBatch}; -use arrow_schema::{DataType as ArrowDataType, IntervalUnit, SchemaRef}; +use arrow_array::{types, Array, ArrayRef, RecordBatch, RecordBatchWriter}; +use arrow_schema::{ArrowError, DataType as ArrowDataType, IntervalUnit, SchemaRef}; use super::schema::{ add_encoded_arrow_schema_to_metadata, arrow_to_parquet_schema, @@ -246,6 +246,12 @@ impl ArrowWriter { } } +impl RecordBatchWriter for ArrowWriter { + fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { + self.write(batch).map_err(|e| e.into()) + } +} + fn write_leaves( row_group_writer: &mut SerializedRowGroupWriter<'_, W>, arrays: &[ArrayRef], From 43028672557d2558509608612c7cfbbd4bcb0dec Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 14 May 2023 16:36:41 +0100 Subject: [PATCH 
0897/1411] Implement list_with_offset for PrefixStore (#4203) --- object_store/src/prefix.rs | 147 +++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 78 deletions(-) diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index eba379553733..94836d33cbc6 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -22,10 +22,7 @@ use std::ops::Range; use tokio::io::AsyncWrite; use crate::path::Path; -use crate::{ - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, - Result as ObjectStoreResult, -}; +use crate::{GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result}; #[doc(hidden)] #[deprecated(note = "Use PrefixStore")] @@ -59,36 +56,63 @@ impl PrefixStore { } /// Strip the constant prefix from a given path - fn strip_prefix(&self, path: &Path) -> Option { - Some(path.prefix_match(&self.prefix)?.collect()) + fn strip_prefix(&self, path: Path) -> Path { + // Note cannot use match because of borrow checker + if let Some(suffix) = path.prefix_match(&self.prefix) { + return suffix.collect(); + } + path + } + + /// Strip the constant prefix from a given ObjectMeta + fn strip_meta(&self, meta: ObjectMeta) -> ObjectMeta { + ObjectMeta { + last_modified: meta.last_modified, + size: meta.size, + location: self.strip_prefix(meta.location), + e_tag: meta.e_tag, + } } } #[async_trait::async_trait] impl ObjectStore for PrefixStore { - async fn put(&self, location: &Path, bytes: Bytes) -> ObjectStoreResult<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { let full_path = self.full_path(location); self.inner.put(&full_path, bytes).await } + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + let full_path = self.full_path(location); + self.inner.put_multipart(&full_path).await + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + let full_path = self.full_path(location); + self.inner.abort_multipart(&full_path, multipart_id).await + } + async fn append( &self, location: &Path, - ) -> ObjectStoreResult> { + ) -> Result> { let full_path = self.full_path(location); self.inner.append(&full_path).await } - async fn get(&self, location: &Path) -> ObjectStoreResult { + async fn get(&self, location: &Path) -> Result { let full_path = self.full_path(location); self.inner.get(&full_path).await } - async fn get_range( - &self, - location: &Path, - range: Range, - ) -> ObjectStoreResult { + async fn get_range(&self, location: &Path, range: Range) -> Result { let full_path = self.full_path(location); self.inner.get_range(&full_path, range).await } @@ -97,22 +121,18 @@ impl ObjectStore for PrefixStore { &self, location: &Path, ranges: &[Range], - ) -> ObjectStoreResult> { + ) -> Result> { let full_path = self.full_path(location); self.inner.get_ranges(&full_path, ranges).await } - async fn head(&self, location: &Path) -> ObjectStoreResult { + async fn head(&self, location: &Path) -> Result { let full_path = self.full_path(location); - self.inner.head(&full_path).await.map(|meta| ObjectMeta { - last_modified: meta.last_modified, - size: meta.size, - location: self.strip_prefix(&meta.location).unwrap_or(meta.location), - e_tag: meta.e_tag, - }) + let meta = self.inner.head(&full_path).await?; + Ok(self.strip_meta(meta)) } - async fn delete(&self, location: &Path) -> ObjectStoreResult<()> { + async fn delete(&self, location: &Path) -> Result<()> { let full_path = self.full_path(location); self.inner.delete(&full_path).await } 
@@ -120,94 +140,65 @@ impl ObjectStore for PrefixStore { async fn list( &self, prefix: Option<&Path>, - ) -> ObjectStoreResult>> { - Ok(self - .inner - .list(Some(&self.full_path(prefix.unwrap_or(&Path::from("/"))))) - .await? - .map_ok(|meta| ObjectMeta { - last_modified: meta.last_modified, - size: meta.size, - location: self.strip_prefix(&meta.location).unwrap_or(meta.location), - e_tag: meta.e_tag, - }) - .boxed()) + ) -> Result>> { + let prefix = self.full_path(prefix.unwrap_or(&Path::default())); + let s = self.inner.list(Some(&prefix)).await?; + Ok(s.map_ok(|meta| self.strip_meta(meta)).boxed()) } - async fn list_with_delimiter( + async fn list_with_offset( &self, prefix: Option<&Path>, - ) -> ObjectStoreResult { + offset: &Path, + ) -> Result>> { + let offset = self.full_path(offset); + let prefix = self.full_path(prefix.unwrap_or(&Path::default())); + let s = self.inner.list_with_offset(Some(&prefix), &offset).await?; + Ok(s.map_ok(|meta| self.strip_meta(meta)).boxed()) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let prefix = self.full_path(prefix.unwrap_or(&Path::default())); self.inner - .list_with_delimiter(Some( - &self.full_path(prefix.unwrap_or(&Path::from("/"))), - )) + .list_with_delimiter(Some(&prefix)) .await .map(|lst| ListResult { common_prefixes: lst .common_prefixes - .iter() - .filter_map(|p| self.strip_prefix(p)) + .into_iter() + .map(|p| self.strip_prefix(p)) .collect(), objects: lst .objects - .iter() - .filter_map(|meta| { - Some(ObjectMeta { - last_modified: meta.last_modified, - size: meta.size, - location: self.strip_prefix(&meta.location)?, - e_tag: meta.e_tag.clone(), - }) - }) + .into_iter() + .map(|meta| self.strip_meta(meta)) .collect(), }) } - async fn copy(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { let full_from = self.full_path(from); let full_to = self.full_path(to); self.inner.copy(&full_from, &full_to).await } - async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { let full_from = self.full_path(from); let full_to = self.full_path(to); - self.inner.copy_if_not_exists(&full_from, &full_to).await + self.inner.rename(&full_from, &full_to).await } - async fn rename(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { let full_from = self.full_path(from); let full_to = self.full_path(to); - self.inner.rename(&full_from, &full_to).await + self.inner.copy_if_not_exists(&full_from, &full_to).await } - async fn rename_if_not_exists( - &self, - from: &Path, - to: &Path, - ) -> ObjectStoreResult<()> { + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { let full_from = self.full_path(from); let full_to = self.full_path(to); self.inner.rename_if_not_exists(&full_from, &full_to).await } - - async fn put_multipart( - &self, - location: &Path, - ) -> ObjectStoreResult<(MultipartId, Box)> { - let full_path = self.full_path(location); - self.inner.put_multipart(&full_path).await - } - - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> ObjectStoreResult<()> { - let full_path = self.full_path(location); - self.inner.abort_multipart(&full_path, multipart_id).await - } } #[cfg(test)] From 0a8913a37dcf2f1c66a00f156f2c3452907e9f9f Mon Sep 17 00:00:00 2001 From: Folyd Date: Mon, 15 May 2023 00:38:30 +0800 Subject: 
[PATCH 0898/1411] Feat docs (#4215) * Feat docs * Cargo fmt --- arrow-ord/src/sort.rs | 4 ++-- parquet/README.md | 2 +- parquet/src/arrow/async_reader/mod.rs | 4 ++-- parquet/src/basic.rs | 2 +- parquet/src/lib.rs | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index a44d9a910f5d..144d078d79e5 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -137,8 +137,8 @@ fn partition_validity(array: &dyn Array) -> (Vec, Vec) { } /// Sort elements from `ArrayRef` into an unsigned integer (`UInt32Array`) of indices. -/// For floating point arrays any NaN values are considered to be greater than any other non-null value -/// limit is an option for partial_sort +/// For floating point arrays any NaN values are considered to be greater than any other non-null value. +/// `limit` is an option for [partial_sort]. pub fn sort_to_indices( values: &dyn Array, options: Option, diff --git a/parquet/README.md b/parquet/README.md index d904fc64e744..d006c47ec148 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -65,7 +65,7 @@ The `parquet` crate provides the following features which may be enabled in your - [x] Primitive column value writers - [ ] Row record writer - [x] Arrow record writer - - [ ] Async support + - [x] Async support - [x] Predicate pushdown - [x] Parquet format 4.0.0 support diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 248d80d1a35b..3d4277a831da 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -482,8 +482,8 @@ impl std::fmt::Debug for StreamState { } } -/// An asynchronous [`Stream`] of [`RecordBatch`] for a parquet file that can be -/// constructed using [`ParquetRecordBatchStreamBuilder`] +/// An asynchronous [`Stream`](https://docs.rs/futures/latest/futures/stream/trait.Stream.html) of [`RecordBatch`] +/// for a parquet file that can be constructed using [`ParquetRecordBatchStreamBuilder`]. pub struct ParquetRecordBatchStream { metadata: Arc, diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index ec1d4a07ae68..cc8d033f42a4 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -16,7 +16,7 @@ // under the License. //! Contains Rust mappings for Thrift definition. -//! Refer to `parquet.thrift` file to see raw definitions. +//! Refer to [`parquet.thrift`](https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift) file to see raw definitions. use std::{fmt, str}; diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 4cdba1dc55ee..2371f8837bb0 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -31,8 +31,8 @@ //! 2. [arrow] for reading and writing parquet files to Arrow //! `RecordBatch`es //! -//! 3. [arrow::async_reader] for `async` reading and writing parquet -//! files to Arrow `RecordBatch`es (requires the `async` feature). +//! 3. [arrow::async_reader] and [arrow::async_writer] for `async` reading +//! and writing parquet files to Arrow `RecordBatch`es (requires the `async` feature). 
/// Defines a an item with an experimental public API /// From bac40c6bfc6b390b3550acf42c9f099f867e1734 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 15 May 2023 12:01:19 +0100 Subject: [PATCH 0899/1411] Add ObjectStore::get_opts (#2241) (#4212) * Add ObjectStore::get_opts (#2241) * Cleanup error handling * Review feedback --- object_store/src/aws/client.rs | 36 ++---- object_store/src/aws/credential.rs | 8 +- object_store/src/aws/mod.rs | 39 +++---- object_store/src/azure/client.rs | 58 ++++------ object_store/src/azure/mod.rs | 46 +++----- object_store/src/chunked.rs | 7 +- object_store/src/client/mod.rs | 37 ++++++- object_store/src/client/retry.rs | 37 ++++++- object_store/src/gcp/mod.rs | 128 ++++++++------------- object_store/src/http/client.rs | 21 ++-- object_store/src/http/mod.rs | 21 +--- object_store/src/lib.rs | 172 ++++++++++++++++++++++++++++- object_store/src/limit.rs | 20 +++- object_store/src/local.rs | 57 ++++++---- object_store/src/memory.rs | 28 ++--- object_store/src/prefix.rs | 19 +++- object_store/src/throttle.rs | 43 +++++--- object_store/src/util.rs | 7 -- 18 files changed, 470 insertions(+), 314 deletions(-) diff --git a/object_store/src/aws/client.rs index 9634c740d01d..b2d01abfb6f3 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -17,27 +17,25 @@ use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt, CredentialProvider}; -use crate::aws::STRICT_PATH_ENCODE_SET; +use crate::aws::{STORE, STRICT_PATH_ENCODE_SET}; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; +use crate::client::GetOptionsExt; use crate::multipart::UploadPart; use crate::path::DELIMITER; -use crate::util::{format_http_range, format_prefix}; +use crate::util::format_prefix; use crate::{ - BoxStream, ClientOptions, ListResult, MultipartId, ObjectMeta, Path, Result, - RetryConfig, StreamExt, + BoxStream, ClientOptions, GetOptions, ListResult, MultipartId, ObjectMeta, Path, + Result, RetryConfig, StreamExt, }; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use percent_encoding::{utf8_percent_encode, PercentEncode}; -use reqwest::{ - header::CONTENT_TYPE, Client as ReqwestClient, Method, Response, StatusCode, -}; +use reqwest::{header::CONTENT_TYPE, Client as ReqwestClient, Method, Response}; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; -use std::ops::Range; use std::sync::Arc; /// A specialized `Error` for object store-related errors @@ -102,16 +100,9 @@ impl From<Error> for crate::Error { Error::GetRequest { source, path } | Error::DeleteRequest { source, path } | Error::CopyRequest { source, path } - | Error::PutRequest { source, path } - if matches!(source.status(), Some(StatusCode::NOT_FOUND)) => - { - Self::NotFound { - path, - source: Box::new(source), - } - } + | Error::PutRequest { source, path } => source.error(STORE, path), _ => Self::Generic { - store: "S3", + store: STORE, source: Box::new(err), }, } @@ -245,11 +236,9 @@ impl S3Client { pub async fn get_request( &self, path: &Path, - range: Option<Range<usize>>, + options: GetOptions, head: bool, ) -> Result<Response> { - use reqwest::header::RANGE; - let credential = self.get_credential().await?; let url = self.config.path_url(path); let method = match head { @@ -257,13 +246,10 @@ impl S3Client { false => Method::GET, }; - let mut builder = self.client.request(method, url); - -
if let Some(range) = range { - builder = builder.header(RANGE, format_http_range(range)); - } + let builder = self.client.request(method, url); let response = builder + .with_get_options(options) .with_aws_sigv4( credential.as_ref(), &self.config.region, diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index c4cb7cfe1a01..16cdf35d0f4a 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::aws::STRICT_ENCODE_SET; +use crate::aws::{STORE, STRICT_ENCODE_SET}; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; use crate::util::hmac_sha256; @@ -330,7 +330,7 @@ impl CredentialProvider for InstanceCredentialProvider { self.imdsv1_fallback, ) .map_err(|source| crate::Error::Generic { - store: "S3", + store: STORE, source, }) })) @@ -363,7 +363,7 @@ impl CredentialProvider for WebIdentityProvider { &self.endpoint, ) .map_err(|source| crate::Error::Generic { - store: "S3", + store: STORE, source, }) })) @@ -552,7 +552,7 @@ mod profile { .provide_credentials() .await .map_err(|source| crate::Error::Generic { - store: "S3", + store: STORE, source: Box::new(source), })?; let t_now = SystemTime::now(); diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 6fa5e1c851c7..3f9b4803fe7d 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -40,7 +40,6 @@ use itertools::Itertools; use serde::{Deserialize, Serialize}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; use std::collections::BTreeSet; -use std::ops::Range; use std::str::FromStr; use std::sync::Arc; use tokio::io::AsyncWrite; @@ -57,8 +56,8 @@ use crate::client::ClientConfigKey; use crate::config::ConfigValue; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; use crate::{ - ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, - Result, RetryConfig, StreamExt, + ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, + ObjectStore, Path, Result, RetryConfig, StreamExt, }; mod checksum; @@ -79,6 +78,8 @@ pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet = /// This struct is used to maintain the URI path encoding const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.remove(b'/'); +const STORE: &str = "S3"; + /// Default metadata endpoint static METADATA_ENDPOINT: &str = "http://169.254.169.254"; @@ -160,10 +161,10 @@ impl From for super::Error { fn from(source: Error) -> Self { match source { Error::UnknownConfigurationKey { key } => { - Self::UnknownConfigurationKey { store: "S3", key } + Self::UnknownConfigurationKey { store: STORE, key } } _ => Self::Generic { - store: "S3", + store: STORE, source: Box::new(source), }, } @@ -246,12 +247,12 @@ impl ObjectStore for AmazonS3 { .await } - async fn get(&self, location: &Path) -> Result { - let response = self.client.get_request(location, None, false).await?; + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let response = self.client.get_request(location, options, false).await?; let stream = response .bytes_stream() .map_err(|source| crate::Error::Generic { - store: "S3", + store: STORE, source: Box::new(source), }) .boxed(); @@ -259,26 +260,13 @@ impl ObjectStore for AmazonS3 { Ok(GetResult::Stream(stream)) } - async fn get_range(&self, location: &Path, range: Range) -> Result { - let bytes = 
self - .client - .get_request(location, Some(range), false) - .await? - .bytes() - .await - .map_err(|source| client::Error::GetResponseBody { - source, - path: location.to_string(), - })?; - Ok(bytes) - } - async fn head(&self, location: &Path) -> Result { use reqwest::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; + let options = GetOptions::default(); // Extract meta from headers // https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadObject.html#API_HeadObject_ResponseSyntax - let response = self.client.get_request(location, None, true).await?; + let response = self.client.get_request(location, options, true).await?; let headers = response.headers(); let last_modified = headers @@ -1169,8 +1157,8 @@ fn profile_credentials( mod tests { use super::*; use crate::tests::{ - get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list_opts, rename_and_copy, stream_get, + get_nonexistent_object, get_opts, list_uses_directories_correctly, + list_with_delimiter, put_get_delete_list_opts, rename_and_copy, stream_get, }; use bytes::Bytes; use std::collections::HashMap; @@ -1417,6 +1405,7 @@ mod tests { // Localstack doesn't support listing with spaces https://github.com/localstack/localstack/issues/6328 put_get_delete_list_opts(&integration, is_local).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 87432f62b5cd..4611986e30d2 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -17,13 +17,15 @@ use super::credential::{AzureCredential, CredentialProvider}; use crate::azure::credential::*; +use crate::azure::STORE; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; +use crate::client::GetOptionsExt; use crate::path::DELIMITER; -use crate::util::{deserialize_rfc1123, format_http_range, format_prefix}; +use crate::util::{deserialize_rfc1123, format_prefix}; use crate::{ - BoxStream, ClientOptions, ListResult, ObjectMeta, Path, Result, RetryConfig, - StreamExt, + BoxStream, ClientOptions, GetOptions, ListResult, ObjectMeta, Path, Result, + RetryConfig, StreamExt, }; use base64::prelude::BASE64_STANDARD; use base64::Engine; @@ -32,13 +34,12 @@ use chrono::{DateTime, Utc}; use itertools::Itertools; use reqwest::header::CONTENT_TYPE; use reqwest::{ - header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH, RANGE}, + header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH}, Client as ReqwestClient, Method, Response, StatusCode, }; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::collections::HashMap; -use std::ops::Range; use url::Url; /// A specialized `Error` for object store-related errors @@ -69,12 +70,6 @@ pub(crate) enum Error { path: String, }, - #[snafu(display("Error performing copy request {}: {}", path, source))] - CopyRequest { - source: crate::client::retry::Error, - path: String, - }, - #[snafu(display("Error performing list request: {}", source))] ListRequest { source: crate::client::retry::Error }, @@ -95,25 +90,9 @@ impl From for crate::Error { match err { Error::GetRequest { source, path } | Error::DeleteRequest { source, path } - | Error::CopyRequest { source, path } - | Error::PutRequest { source, path } - if matches!(source.status(), Some(StatusCode::NOT_FOUND)) => - { - Self::NotFound { - path, - source: Box::new(source), - } - } - Error::CopyRequest 
{ source, path } - if matches!(source.status(), Some(StatusCode::CONFLICT)) => - { - Self::AlreadyExists { - path, - source: Box::new(source), - } - } + | Error::PutRequest { source, path } => source.error(STORE, path), _ => Self::Generic { - store: "MicrosoftAzure", + store: STORE, source: Box::new(err), }, } @@ -175,7 +154,7 @@ impl AzureClient { // and we want to use it in an infallible function HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { crate::Error::Generic { - store: "MicrosoftAzure", + store: STORE, source: Box::new(err), } })?, @@ -193,7 +172,7 @@ impl AzureClient { // and we want to use it in an infallible function HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { crate::Error::Generic { - store: "MicrosoftAzure", + store: STORE, source: Box::new(err), } })?, @@ -253,7 +232,7 @@ impl AzureClient { pub async fn get_request( &self, path: &Path, - range: Option>, + options: GetOptions, head: bool, ) -> Result { let credential = self.get_credential().await?; @@ -263,17 +242,14 @@ impl AzureClient { false => Method::GET, }; - let mut builder = self + let builder = self .client .request(method, url) .header(CONTENT_LENGTH, HeaderValue::from_static("0")) .body(Bytes::new()); - if let Some(range) = range { - builder = builder.header(RANGE, format_http_range(range)); - } - let response = builder + .with_get_options(options) .with_azure_authorization(&credential, &self.config.account) .send_retry(&self.config.retry_config) .await @@ -338,8 +314,12 @@ impl AzureClient { .with_azure_authorization(&credential, &self.config.account) .send_retry(&self.config.retry_config) .await - .context(CopyRequestSnafu { - path: from.as_ref(), + .map_err(|err| match err.status() { + Some(StatusCode::CONFLICT) => crate::Error::AlreadyExists { + source: Box::new(err), + path: to.to_string(), + }, + _ => err.error(STORE, from.to_string()), })?; Ok(()) diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index c2cfdfe6af32..6726241aa868 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -31,8 +31,8 @@ use crate::client::token::TokenCache; use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::Path, - ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, - RetryConfig, + ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, + ObjectStore, Result, RetryConfig, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; @@ -45,7 +45,6 @@ use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::fmt::{Debug, Formatter}; use std::io; -use std::ops::Range; use std::sync::Arc; use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; @@ -59,6 +58,8 @@ pub use credential::authority_hosts; mod client; mod credential; +const STORE: &str = "MicrosoftAzure"; + /// The well-known account used by Azurite and the legacy Azure Storage Emulator. 
/// const EMULATOR_ACCOUNT: &str = "devstoreaccount1"; @@ -150,12 +151,11 @@ enum Error { impl From for super::Error { fn from(source: Error) -> Self { match source { - Error::UnknownConfigurationKey { key } => Self::UnknownConfigurationKey { - store: "MicrosoftAzure", - key, - }, + Error::UnknownConfigurationKey { key } => { + Self::UnknownConfigurationKey { store: STORE, key } + } _ => Self::Generic { - store: "MicrosoftAzure", + store: STORE, source: Box::new(source), }, } @@ -209,12 +209,12 @@ impl ObjectStore for MicrosoftAzure { Ok(()) } - async fn get(&self, location: &Path) -> Result { - let response = self.client.get_request(location, None, false).await?; + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let response = self.client.get_request(location, options, false).await?; let stream = response .bytes_stream() .map_err(|source| crate::Error::Generic { - store: "MicrosoftAzure", + store: STORE, source: Box::new(source), }) .boxed(); @@ -222,26 +222,13 @@ impl ObjectStore for MicrosoftAzure { Ok(GetResult::Stream(stream)) } - async fn get_range(&self, location: &Path, range: Range) -> Result { - let bytes = self - .client - .get_request(location, Some(range), false) - .await? - .bytes() - .await - .map_err(|source| client::Error::GetResponseBody { - source, - path: location.to_string(), - })?; - Ok(bytes) - } - async fn head(&self, location: &Path) -> Result { use reqwest::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; + let options = GetOptions::default(); // Extract meta from headers // https://docs.microsoft.com/en-us/rest/api/storageservices/get-blob-properties - let response = self.client.get_request(location, None, true).await?; + let response = self.client.get_request(location, options, true).await?; let headers = response.headers(); let last_modified = headers @@ -1103,8 +1090,9 @@ fn split_sas(sas: &str) -> Result, Error> { mod tests { use super::*; use crate::tests::{ - copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, put_get_delete_list_opts, rename_and_copy, stream_get, + copy_if_not_exists, get_opts, list_uses_directories_correctly, + list_with_delimiter, put_get_delete_list, put_get_delete_list_opts, + rename_and_copy, stream_get, }; use std::collections::HashMap; use std::env; @@ -1175,6 +1163,7 @@ mod tests { async fn azure_blob_test() { let integration = maybe_skip_integration!().build().unwrap(); put_get_delete_list_opts(&integration, false).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; @@ -1203,6 +1192,7 @@ mod tests { let integration = builder.build().unwrap(); put_get_delete_list(&integration).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/object_store/src/chunked.rs b/object_store/src/chunked.rs index aebefec61559..c639d7e89812 100644 --- a/object_store/src/chunked.rs +++ b/object_store/src/chunked.rs @@ -30,7 +30,7 @@ use tokio::io::AsyncWrite; use crate::path::Path; use crate::util::maybe_spawn_blocking; -use crate::{GetResult, ListResult, ObjectMeta, ObjectStore}; +use crate::{GetOptions, GetResult, ListResult, ObjectMeta, ObjectStore}; use crate::{MultipartId, Result}; /// Wraps a [`ObjectStore`] and makes its get response return chunks @@ -81,8 +81,8 @@ impl ObjectStore for ChunkedStore { 
self.inner.abort_multipart(location, multipart_id).await } - async fn get(&self, location: &Path) -> Result { - match self.inner.get(location).await? { + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + match self.inner.get_opts(location, options).await? { GetResult::File(std_file, ..) => { let reader = BufReader::new(std_file); let chunk_size = self.chunk_size; @@ -245,6 +245,7 @@ mod tests { let integration = ChunkedStore::new(Arc::clone(integration), 100); put_get_delete_list(&integration).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index ccf1b4a3bdc2..be44a9f99b27 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -31,11 +31,12 @@ use std::str::FromStr; use std::time::Duration; use reqwest::header::{HeaderMap, HeaderValue}; -use reqwest::{Client, ClientBuilder, Proxy}; +use reqwest::{Client, ClientBuilder, Proxy, RequestBuilder}; use serde::{Deserialize, Serialize}; use crate::config::{fmt_duration, ConfigValue}; use crate::path::Path; +use crate::GetOptions; fn map_client_error(e: reqwest::Error) -> super::Error { super::Error::Generic { @@ -462,6 +463,40 @@ impl ClientOptions { } } +pub trait GetOptionsExt { + fn with_get_options(self, options: GetOptions) -> Self; +} + +impl GetOptionsExt for RequestBuilder { + fn with_get_options(mut self, options: GetOptions) -> Self { + use hyper::header::*; + + if let Some(range) = options.range { + let range = format!("bytes={}-{}", range.start, range.end.saturating_sub(1)); + self = self.header(RANGE, range); + } + + if let Some(tag) = options.if_match { + self = self.header(IF_MATCH, tag); + } + + if let Some(tag) = options.if_none_match { + self = self.header(IF_NONE_MATCH, tag); + } + + const DATE_FORMAT: &str = "%a, %d %b %Y %H:%M:%S GMT"; + if let Some(date) = options.if_unmodified_since { + self = self.header(IF_UNMODIFIED_SINCE, date.format(DATE_FORMAT).to_string()); + } + + if let Some(date) = options.if_modified_since { + self = self.header(IF_MODIFIED_SINCE, date.format(DATE_FORMAT).to_string()); + } + + self + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/object_store/src/client/retry.rs b/object_store/src/client/retry.rs index f9c2dd30088d..39a913142e09 100644 --- a/object_store/src/client/retry.rs +++ b/object_store/src/client/retry.rs @@ -32,6 +32,7 @@ pub struct Error { retries: usize, message: String, source: Option, + status: Option, } impl std::fmt::Display for Error { @@ -57,7 +58,28 @@ impl std::error::Error for Error { impl Error { /// Returns the status code associated with this error if any pub fn status(&self) -> Option { - self.source.as_ref().and_then(|e| e.status()) + self.status + } + + pub fn error(self, store: &'static str, path: String) -> crate::Error { + match self.status { + Some(StatusCode::NOT_FOUND) => crate::Error::NotFound { + path, + source: Box::new(self), + }, + Some(StatusCode::NOT_MODIFIED) => crate::Error::NotModified { + path, + source: Box::new(self), + }, + Some(StatusCode::PRECONDITION_FAILED) => crate::Error::Precondition { + path, + source: Box::new(self), + }, + _ => crate::Error::Generic { + store, + source: Box::new(self), + }, + } } } @@ -146,6 +168,14 @@ impl RetryExt for reqwest::RequestBuilder { match s.send().await { Ok(r) => match r.error_for_status_ref() { Ok(_) if r.status().is_success() => return Ok(r), + 
Ok(r) if r.status() == StatusCode::NOT_MODIFIED => { + return Err(Error{ + message: "not modified".to_string(), + retries, + status: Some(r.status()), + source: None, + }) + } Ok(r) => { let is_bare_redirect = r.status().is_redirection() && !r.headers().contains_key(LOCATION); let message = match is_bare_redirect { @@ -157,6 +187,7 @@ impl RetryExt for reqwest::RequestBuilder { return Err(Error{ message, retries, + status: Some(r.status()), source: None, }) } @@ -180,6 +211,7 @@ impl RetryExt for reqwest::RequestBuilder { return Err(Error{ message, retries, + status: Some(status), source: Some(e), }) @@ -209,7 +241,8 @@ impl RetryExt for reqwest::RequestBuilder { return Err(Error{ retries, message: "request error".to_string(), - source: Some(e) + status: e.status(), + source: Some(e), }) } let sleep = backoff.next(); diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 375b4d8f8c37..41a91fef84a9 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -31,7 +31,6 @@ //! week. use std::collections::BTreeSet; use std::io; -use std::ops::Range; use std::str::FromStr; use std::sync::Arc; @@ -40,7 +39,6 @@ use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; -use reqwest::header::RANGE; use reqwest::{header, Client, Method, Response, StatusCode}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; @@ -49,14 +47,14 @@ use url::Url; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; -use crate::client::ClientConfigKey; +use crate::client::{ClientConfigKey, GetOptionsExt}; use crate::{ client::token::TokenCache, multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::{Path, DELIMITER}, - util::{format_http_range, format_prefix}, - ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, - RetryConfig, + util::format_prefix, + ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, + ObjectStore, Result, RetryConfig, }; use self::credential::{ @@ -66,6 +64,8 @@ use self::credential::{ mod credential; +const STORE: &str = "GCS"; + #[derive(Debug, Snafu)] enum Error { #[snafu(display("Got invalid XML response for {} {}: {}", method, url, source))] @@ -100,15 +100,12 @@ enum Error { path: String, }, - #[snafu(display("Error performing copy request {}: {}", path, source))] - CopyRequest { + #[snafu(display("Error performing put request {}: {}", path, source))] + PutRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing put request: {}", source))] - PutRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting put response body: {}", source))] PutResponseBody { source: reqwest::Error }, @@ -129,12 +126,6 @@ enum Error { #[snafu(display("GCP credential error: {}", source))] Credential { source: credential::Error }, - #[snafu(display("Already exists: {}", path))] - AlreadyExists { - source: crate::client::retry::Error, - path: String, - }, - #[snafu(display("Unable parse source url. 
Url: {}, Error: {}", url, source))] UnableToParseUrl { source: url::ParseError, @@ -159,23 +150,12 @@ impl From for super::Error { match err { Error::GetRequest { source, path } | Error::DeleteRequest { source, path } - | Error::CopyRequest { source, path } - if matches!(source.status(), Some(StatusCode::NOT_FOUND)) => - { - Self::NotFound { - path, - source: Box::new(source), - } - } - Error::AlreadyExists { source, path } => Self::AlreadyExists { - source: Box::new(source), - path, - }, + | Error::PutRequest { source, path } => source.error(STORE, path), Error::UnknownConfigurationKey { key } => { - Self::UnknownConfigurationKey { store: "GCS", key } + Self::UnknownConfigurationKey { store: STORE, key } } _ => Self::Generic { - store: "GCS", + store: STORE, source: Box::new(err), }, } @@ -280,26 +260,23 @@ impl GoogleCloudStorageClient { async fn get_request( &self, path: &Path, - range: Option>, + options: GetOptions, head: bool, ) -> Result { let token = self.get_token().await?; let url = self.object_url(path); - let mut builder = self.client.request(Method::GET, url); - - if let Some(range) = range { - builder = builder.header(RANGE, format_http_range(range)); - } - let alt = match head { true => "json", false => "media", }; + let builder = self.client.request(Method::GET, url); + let response = builder .bearer_auth(token) .query(&[("alt", alt)]) + .with_get_options(options) .send_retry(&self.retry_config) .await .context(GetRequestSnafu { @@ -331,7 +308,9 @@ impl GoogleCloudStorageClient { .body(payload) .send_retry(&self.retry_config) .await - .context(PutRequestSnafu)?; + .context(PutRequestSnafu { + path: path.as_ref(), + })?; Ok(()) } @@ -355,7 +334,9 @@ impl GoogleCloudStorageClient { .query(&[("uploads", "")]) .send_retry(&self.retry_config) .await - .context(PutRequestSnafu)?; + .context(PutRequestSnafu { + path: path.as_ref(), + })?; let data = response.bytes().await.context(PutResponseBodySnafu)?; let result: InitiateMultipartUploadResult = quick_xml::de::from_reader( @@ -387,7 +368,7 @@ impl GoogleCloudStorageClient { .query(&[("uploadId", multipart_id)]) .send_retry(&self.retry_config) .await - .context(PutRequestSnafu)?; + .context(PutRequestSnafu { path })?; Ok(()) } @@ -444,22 +425,12 @@ impl GoogleCloudStorageClient { .header(header::CONTENT_LENGTH, 0) .send_retry(&self.retry_config) .await - .map_err(|err| { - if err - .status() - .map(|status| status == reqwest::StatusCode::PRECONDITION_FAILED) - .unwrap_or_else(|| false) - { - Error::AlreadyExists { - source: err, - path: to.to_string(), - } - } else { - Error::CopyRequest { - source: err, - path: from.to_string(), - } - } + .map_err(|err| match err.status() { + Some(StatusCode::PRECONDITION_FAILED) => crate::Error::AlreadyExists { + source: Box::new(err), + path: to.to_string(), + }, + _ => err.error(STORE, from.to_string()), })?; Ok(()) @@ -667,12 +638,18 @@ impl ObjectStore for GoogleCloudStorage { Ok(()) } - async fn get(&self, location: &Path) -> Result { - let response = self.client.get_request(location, None, false).await?; + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + if options.if_modified_since.is_some() || options.if_unmodified_since.is_some() { + return Err(super::Error::NotSupported { + source: "ModifiedSince Preconditions not supported by GoogleCloudStorage JSON API".to_string().into(), + }); + } + + let response = self.client.get_request(location, options, false).await?; let stream = response .bytes_stream() .map_err(|source| crate::Error::Generic { - store: "GCS", + 
store: STORE, source: Box::new(source), }) .boxed(); @@ -680,18 +657,9 @@ impl ObjectStore for GoogleCloudStorage { Ok(GetResult::Stream(stream)) } - async fn get_range(&self, location: &Path, range: Range) -> Result { - let response = self - .client - .get_request(location, Some(range), false) - .await?; - Ok(response.bytes().await.context(GetResponseBodySnafu { - path: location.as_ref(), - })?) - } - async fn head(&self, location: &Path) -> Result { - let response = self.client.get_request(location, None, true).await?; + let options = GetOptions::default(); + let response = self.client.get_request(location, options, true).await?; let object = response.json().await.context(GetResponseBodySnafu { path: location.as_ref(), })?; @@ -1224,13 +1192,7 @@ mod test { use std::io::Write; use tempfile::NamedTempFile; - use crate::{ - tests::{ - copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list, rename_and_copy, stream_get, - }, - Error as ObjectStoreError, ObjectStore, - }; + use crate::tests::*; use super::*; @@ -1299,6 +1261,8 @@ mod test { // Fake GCS server does not yet implement XML Multipart uploads // https://github.com/fsouza/fake-gcs-server/issues/852 stream_get(&integration).await; + // Fake GCS server doesn't currently honor preconditions + get_opts(&integration).await; } } @@ -1311,7 +1275,7 @@ mod test { let err = integration.get(&location).await.unwrap_err(); assert!( - matches!(err, ObjectStoreError::NotFound { .. }), + matches!(err, crate::Error::NotFound { .. }), "unexpected error type: {err}" ); } @@ -1330,7 +1294,7 @@ mod test { .unwrap_err(); assert!( - matches!(err, ObjectStoreError::NotFound { .. }), + matches!(err, crate::Error::NotFound { .. }), "unexpected error type: {err}" ); } @@ -1343,7 +1307,7 @@ mod test { let err = integration.delete(&location).await.unwrap_err(); assert!( - matches!(err, ObjectStoreError::NotFound { .. }), + matches!(err, crate::Error::NotFound { .. }), "unexpected error type: {err}" ); } @@ -1359,7 +1323,7 @@ mod test { let err = integration.delete(&location).await.unwrap_err(); assert!( - matches!(err, ObjectStoreError::NotFound { .. }), + matches!(err, crate::Error::NotFound { .. }), "unexpected error type: {err}" ); } diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index 5ef272180abc..4e58eb0b2927 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -16,17 +16,17 @@ // under the License. 
use crate::client::retry::{self, RetryConfig, RetryExt}; +use crate::client::GetOptionsExt; use crate::path::{Path, DELIMITER}; -use crate::util::{deserialize_rfc1123, format_http_range}; -use crate::{ClientOptions, ObjectMeta, Result}; +use crate::util::deserialize_rfc1123; +use crate::{ClientOptions, GetOptions, ObjectMeta, Result}; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use percent_encoding::percent_decode_str; -use reqwest::header::{CONTENT_TYPE, RANGE}; +use reqwest::header::CONTENT_TYPE; use reqwest::{Method, Response, StatusCode}; use serde::Deserialize; use snafu::{OptionExt, ResultExt, Snafu}; -use std::ops::Range; use url::Url; #[derive(Debug, Snafu)] @@ -229,19 +229,12 @@ impl Client { Ok(()) } - pub async fn get( - &self, - location: &Path, - range: Option>, - ) -> Result { + pub async fn get(&self, location: &Path, options: GetOptions) -> Result { let url = self.path_url(location); - let mut builder = self.client.get(url); - - if let Some(range) = range { - builder = builder.header(RANGE, format_http_range(range)); - } + let builder = self.client.get(url); builder + .with_get_options(options) .send_retry(&self.retry_config) .await .map_err(|source| match source.status() { diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index c91faa2358ac..bed19722c83a 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -31,8 +31,6 @@ //! [rfc2518]: https://datatracker.ietf.org/doc/html/rfc2518 //! [WebDAV]: https://en.wikipedia.org/wiki/WebDAV -use std::ops::Range; - use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; @@ -45,8 +43,8 @@ use url::Url; use crate::http::client::Client; use crate::path::Path; use crate::{ - ClientOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, - RetryConfig, + ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, + ObjectStore, Result, RetryConfig, }; mod client; @@ -119,8 +117,8 @@ impl ObjectStore for HttpStore { Err(super::Error::NotImplemented) } - async fn get(&self, location: &Path) -> Result { - let response = self.client.get(location, None).await?; + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let response = self.client.get(location, options).await?; let stream = response .bytes_stream() .map_err(|source| Error::Reqwest { source }.into()) @@ -129,17 +127,6 @@ impl ObjectStore for HttpStore { Ok(GetResult::Stream(stream)) } - async fn get_range(&self, location: &Path, range: Range) -> Result { - let bytes = self - .client - .get(location, Some(range)) - .await? - .bytes() - .await - .context(ReqwestSnafu)?; - Ok(bytes) - } - async fn head(&self, location: &Path) -> Result { let status = self.client.list(Some(location), "0").await?; match status.response.len() { diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 2c93802edaa8..75f9ca7df411 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -346,11 +346,24 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { } /// Return the bytes that are stored at the specified location. 
- async fn get(&self, location: &Path) -> Result; + async fn get(&self, location: &Path) -> Result { + self.get_opts(location, GetOptions::default()).await + } + + /// Perform a get request with options + /// + /// Note: options.range will be ignored if [`GetResult::File`] + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result; /// Return the bytes that are stored at the specified location /// in the given byte range - async fn get_range(&self, location: &Path, range: Range) -> Result; + async fn get_range(&self, location: &Path, range: Range) -> Result { + let options = GetOptions { + range: Some(range), + ..Default::default() + }; + self.get_opts(location, options).await?.bytes().await + } /// Return the bytes that are stored at the specified location /// in the given byte ranges @@ -478,6 +491,10 @@ impl ObjectStore for Box { self.as_ref().get(location).await } + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + self.as_ref().get_opts(location, options).await + } + async fn get_range(&self, location: &Path, range: Range) -> Result { self.as_ref().get_range(location, range).await } @@ -558,6 +575,66 @@ pub struct ObjectMeta { pub e_tag: Option, } +/// Options for a get request, such as range +#[derive(Debug, Default)] +pub struct GetOptions { + /// Request will succeed if the `ObjectMeta::e_tag` matches + /// otherwise returning [`Error::Precondition`] + /// + /// + pub if_match: Option, + /// Request will succeed if the `ObjectMeta::e_tag` does not match + /// otherwise returning [`Error::NotModified`] + /// + /// + pub if_none_match: Option, + /// Request will succeed if the object has been modified since + /// + /// + pub if_modified_since: Option>, + /// Request will succeed if the object has not been modified since + /// otherwise returning [`Error::Precondition`] + /// + /// Some stores, such as S3, will only return `NotModified` for exact + /// timestamp matches, instead of for any timestamp greater than or equal. 
+ /// + /// + pub if_unmodified_since: Option>, + /// Request transfer of only the specified range of bytes + /// otherwise returning [`Error::NotModified`] + /// + /// + pub range: Option>, +} + +impl GetOptions { + /// Returns an error if the modification conditions on this request are not satisfied + fn check_modified( + &self, + location: &Path, + last_modified: DateTime, + ) -> Result<()> { + if let Some(date) = self.if_modified_since { + if last_modified <= date { + return Err(Error::NotModified { + path: location.to_string(), + source: format!("{} >= {}", date, last_modified).into(), + }); + } + } + + if let Some(date) = self.if_unmodified_since { + if last_modified > date { + return Err(Error::Precondition { + path: location.to_string(), + source: format!("{} < {}", date, last_modified).into(), + }); + } + } + Ok(()) + } +} + /// Result for a get request /// /// This special cases the case of a local file, as some systems may @@ -702,6 +779,18 @@ pub enum Error { source: Box, }, + #[snafu(display("Request precondition failure for path {}: {}", path, source))] + Precondition { + path: String, + source: Box, + }, + + #[snafu(display("Object at location {} not modified: {}", path, source))] + NotModified { + path: String, + source: Box, + }, + #[snafu(display("Operation not yet implemented."))] NotImplemented, @@ -1025,6 +1114,85 @@ mod tests { delete_fixtures(storage).await; } + pub(crate) async fn get_opts(storage: &dyn ObjectStore) { + let path = Path::from("test"); + storage.put(&path, "foo".into()).await.unwrap(); + let meta = storage.head(&path).await.unwrap(); + + let options = GetOptions { + if_unmodified_since: Some(meta.last_modified), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Ok(_) | Err(Error::NotSupported { .. }) => {} + Err(e) => panic!("{e}"), + } + + let options = GetOptions { + if_unmodified_since: Some(meta.last_modified + chrono::Duration::hours(10)), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Ok(_) | Err(Error::NotSupported { .. }) => {} + Err(e) => panic!("{e}"), + } + + let options = GetOptions { + if_unmodified_since: Some(meta.last_modified - chrono::Duration::hours(10)), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Err(Error::Precondition { .. } | Error::NotSupported { .. }) => {} + d => panic!("{d:?}"), + } + + let options = GetOptions { + if_modified_since: Some(meta.last_modified), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Err(Error::NotModified { .. } | Error::NotSupported { .. }) => {} + d => panic!("{d:?}"), + } + + let options = GetOptions { + if_modified_since: Some(meta.last_modified - chrono::Duration::hours(10)), + ..GetOptions::default() + }; + match storage.get_opts(&path, options).await { + Ok(_) | Err(Error::NotSupported { .. }) => {} + Err(e) => panic!("{e}"), + } + + if let Some(tag) = meta.e_tag { + let options = GetOptions { + if_match: Some(tag.clone()), + ..GetOptions::default() + }; + storage.get_opts(&path, options).await.unwrap(); + + let options = GetOptions { + if_match: Some("invalid".to_string()), + ..GetOptions::default() + }; + let err = storage.get_opts(&path, options).await.unwrap_err(); + assert!(matches!(err, Error::Precondition { .. }), "{err}"); + + let options = GetOptions { + if_none_match: Some(tag.clone()), + ..GetOptions::default() + }; + let err = storage.get_opts(&path, options).await.unwrap_err(); + assert!(matches!(err, Error::NotModified { .. 
}), "{err}"); + + let options = GetOptions { + if_none_match: Some("invalid".to_string()), + ..GetOptions::default() + }; + storage.get_opts(&path, options).await.unwrap(); + } + } + fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { std::iter::repeat(Bytes::from_iter(std::iter::repeat(b'x').take(chunk_length))) .take(num_chunks) diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index d0d9f73c5c59..e0091115d8f6 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -18,8 +18,8 @@ //! An object store that limits the maximum concurrency of the wrapped implementation use crate::{ - BoxStream, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, - StreamExt, + BoxStream, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, + Path, Result, StreamExt, }; use async_trait::async_trait; use bytes::Bytes; @@ -114,6 +114,16 @@ impl ObjectStore for LimitStore { } } + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); + match self.inner.get_opts(location, options).await? { + r @ GetResult::File(_, _) => Ok(r), + GetResult::Stream(s) => { + Ok(GetResult::Stream(PermitWrapper::new(s, permit).boxed())) + } + } + } + async fn get_range(&self, location: &Path, range: Range) -> Result { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.get_range(location, range).await @@ -251,10 +261,7 @@ impl AsyncWrite for PermitWrapper { mod tests { use crate::limit::LimitStore; use crate::memory::InMemory; - use crate::tests::{ - list_uses_directories_correctly, list_with_delimiter, put_get_delete_list, - rename_and_copy, stream_get, - }; + use crate::tests::*; use crate::ObjectStore; use std::time::Duration; use tokio::time::timeout; @@ -266,6 +273,7 @@ mod tests { let integration = LimitStore::new(memory, max_requests); put_get_delete_list(&integration).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/object_store/src/local.rs b/object_store/src/local.rs index b40f5a777860..26a8bf336873 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -19,7 +19,7 @@ use crate::{ maybe_spawn_blocking, path::{absolute_path_to_url, Path}, - GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -56,7 +56,7 @@ pub(crate) enum Error { }, #[snafu(display("Unable to access metadata for {}: {}", path, source))] - UnableToAccessMetadata { + Metadata { source: Box, path: String, }, @@ -360,10 +360,27 @@ impl ObjectStore for LocalFileSystem { Err(super::Error::NotImplemented) } - async fn get(&self, location: &Path) -> Result { - let path = self.config.path_to_filesystem(location)?; + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + if options.if_match.is_some() || options.if_none_match.is_some() { + return Err(super::Error::NotSupported { + source: "ETags not supported by LocalFileSystem".to_string().into(), + }); + } + + let location = location.clone(); + let path = self.config.path_to_filesystem(&location)?; maybe_spawn_blocking(move || { let file = open_file(&path)?; + if options.if_unmodified_since.is_some() + || options.if_modified_since.is_some() + { + let metadata = file.metadata().map_err(|e| 
Error::Metadata { + source: e.into(), + path: location.to_string(), + })?; + options.check_modified(&location, last_modified(&metadata))?; + } + Ok(GetResult::File(file, path)) }) .await @@ -408,7 +425,7 @@ impl ObjectStore for LocalFileSystem { source: e, } } else { - Error::UnableToAccessMetadata { + Error::Metadata { source: e.into(), path: location.to_string(), } @@ -878,21 +895,22 @@ fn open_file(path: &PathBuf) -> Result { } fn convert_entry(entry: DirEntry, location: Path) -> Result { - let metadata = entry - .metadata() - .map_err(|e| Error::UnableToAccessMetadata { - source: e.into(), - path: location.to_string(), - })?; + let metadata = entry.metadata().map_err(|e| Error::Metadata { + source: e.into(), + path: location.to_string(), + })?; convert_metadata(metadata, location) } -fn convert_metadata(metadata: std::fs::Metadata, location: Path) -> Result { - let last_modified: DateTime = metadata +fn last_modified(metadata: &std::fs::Metadata) -> DateTime { + metadata .modified() .expect("Modified file time should be supported on this platform") - .into(); + .into() +} +fn convert_metadata(metadata: std::fs::Metadata, location: Path) -> Result { + let last_modified = last_modified(&metadata); let size = usize::try_from(metadata.len()).context(FileSizeOverflowedUsizeSnafu { path: location.as_ref(), })?; @@ -956,13 +974,7 @@ fn convert_walkdir_result( mod tests { use super::*; use crate::test_util::flatten_list_stream; - use crate::{ - tests::{ - copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list, rename_and_copy, stream_get, - }, - Error as ObjectStoreError, ObjectStore, - }; + use crate::tests::*; use futures::TryStreamExt; use tempfile::{NamedTempFile, TempDir}; use tokio::io::AsyncWriteExt; @@ -973,6 +985,7 @@ mod tests { let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); put_get_delete_list(&integration).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; @@ -1085,7 +1098,7 @@ mod tests { let err = get_nonexistent_object(&integration, Some(location)) .await .unwrap_err(); - if let ObjectStoreError::NotFound { path, source } = err { + if let crate::Error::NotFound { path, source } = err { let source_variant = source.downcast_ref::(); assert!( matches!(source_variant, Some(std::io::Error { .. }),), diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index b01ffbb02495..82d485997e88 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -16,8 +16,8 @@ // under the License. //! 
An in-memory object store implementation -use crate::MultipartId; use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; +use crate::{GetOptions, MultipartId}; use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; @@ -128,12 +128,17 @@ impl ObjectStore for InMemory { })) } - async fn get(&self, location: &Path) -> Result { - let data = self.entry(location).await?; + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + if options.if_match.is_some() || options.if_none_match.is_some() { + return Err(super::Error::NotSupported { + source: "ETags not supported by InMemory".to_string().into(), + }); + } + let (data, last_modified) = self.entry(location).await?; + options.check_modified(location, last_modified)?; - Ok(GetResult::Stream( - futures::stream::once(async move { Ok(data.0) }).boxed(), - )) + let stream = futures::stream::once(futures::future::ready(Ok(data))); + Ok(GetResult::Stream(stream.boxed())) } async fn get_range(&self, location: &Path, range: Range) -> Result { @@ -391,19 +396,14 @@ mod tests { use super::*; - use crate::{ - tests::{ - copy_if_not_exists, get_nonexistent_object, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list, rename_and_copy, stream_get, - }, - Error as ObjectStoreError, ObjectStore, - }; + use crate::tests::*; #[tokio::test] async fn in_memory_test() { let integration = InMemory::new(); put_get_delete_list(&integration).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; @@ -443,7 +443,7 @@ mod tests { let err = get_nonexistent_object(&integration, Some(location)) .await .unwrap_err(); - if let ObjectStoreError::NotFound { path, source } = err { + if let crate::Error::NotFound { path, source } = err { let source_variant = source.downcast_ref::(); assert!( matches!(source_variant, Some(Error::NoDataInMemory { .. 
}),), diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index 94836d33cbc6..ffe509411911 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -22,7 +22,9 @@ use std::ops::Range; use tokio::io::AsyncWrite; use crate::path::Path; -use crate::{GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result}; +use crate::{ + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, +}; #[doc(hidden)] #[deprecated(note = "Use PrefixStore")] @@ -117,6 +119,15 @@ impl ObjectStore for PrefixStore { self.inner.get_range(&full_path, range).await } + async fn get_opts( + &self, + location: &Path, + options: GetOptions, + ) -> Result { + let full_path = self.full_path(location); + self.inner.get_opts(&full_path, options).await + } + async fn get_ranges( &self, location: &Path, @@ -206,10 +217,7 @@ mod tests { use super::*; use crate::local::LocalFileSystem; use crate::test_util::flatten_list_stream; - use crate::tests::{ - copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, stream_get, - }; + use crate::tests::*; use tempfile::TempDir; @@ -220,6 +228,7 @@ mod tests { let integration = PrefixStore::new(inner, "prefix"); put_get_delete_list(&integration).await; + get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index e51303114788..fb90afcec9fb 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -20,8 +20,8 @@ use parking_lot::Mutex; use std::ops::Range; use std::{convert::TryInto, sync::Arc}; -use crate::MultipartId; use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; +use crate::{GetOptions, MultipartId}; use async_trait::async_trait; use bytes::Bytes; use futures::{stream::BoxStream, FutureExt, StreamExt}; @@ -179,17 +179,18 @@ impl ObjectStore for ThrottledStore { // need to copy to avoid moving / referencing `self` let wait_get_per_byte = self.config().wait_get_per_byte; - self.inner.get(location).await.map(|result| { - let s = match result { - GetResult::Stream(s) => s, - GetResult::File(_, _) => unimplemented!(), - }; + let result = self.inner.get(location).await?; + Ok(throttle_get(result, wait_get_per_byte)) + } - GetResult::Stream(throttle_stream(s, move |bytes| { - let bytes_len: u32 = usize_to_u32_saturate(bytes.len()); - wait_get_per_byte * bytes_len - })) - }) + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + sleep(self.config().wait_get_per_call).await; + + // need to copy to avoid moving / referencing `self` + let wait_get_per_byte = self.config().wait_get_per_byte; + + let result = self.inner.get_opts(location, options).await?; + Ok(throttle_get(result, wait_get_per_byte)) } async fn get_range(&self, location: &Path, range: Range) -> Result { @@ -299,6 +300,18 @@ fn usize_to_u32_saturate(x: usize) -> u32 { x.try_into().unwrap_or(u32::MAX) } +fn throttle_get(result: GetResult, wait_get_per_byte: Duration) -> GetResult { + let s = match result { + GetResult::Stream(s) => s, + GetResult::File(_, _) => unimplemented!(), + }; + + GetResult::Stream(throttle_stream(s, move |bytes| { + let bytes_len: u32 = usize_to_u32_saturate(bytes.len()); + wait_get_per_byte * bytes_len + })) +} + fn throttle_stream( stream: BoxStream<'_, Result>, delay: F, @@ -317,13 +330,7 @@ where #[cfg(test)] mod tests { use 
super::*; - use crate::{ - memory::InMemory, - tests::{ - copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list, rename_and_copy, - }, - }; + use crate::{memory::InMemory, tests::*}; use bytes::Bytes; use futures::TryStreamExt; use tokio::time::Duration; diff --git a/object_store/src/util.rs b/object_store/src/util.rs index e5c701dd8b1b..ba4c68345d73 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -44,13 +44,6 @@ pub fn format_prefix(prefix: Option<&crate::path::Path>) -> Option { .map(|p| format!("{}{}", p.as_ref(), crate::path::DELIMITER)) } -/// Returns a formatted HTTP range header as per -/// -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure", feature = "http"))] -pub fn format_http_range(range: std::ops::Range) -> String { - format!("bytes={}-{}", range.start, range.end.saturating_sub(1)) -} - #[cfg(any(feature = "aws", feature = "azure"))] pub(crate) fn hmac_sha256( secret: impl AsRef<[u8]>, From 4e92f93be050e5cdd633604d64eebeb0c1bfe966 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 May 2023 17:18:51 +0100 Subject: [PATCH 0900/1411] Update proc-macro2 requirement from =1.0.56 to =1.0.57 (#4219) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.56...1.0.57) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 59144870a9e7..293230733a96 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.56", default-features = false } +proc-macro2 = { version = "=1.0.57", default-features = false } prost-build = { version = "=0.11.9", default-features = false } tonic-build = { version = "=0.9.2", default-features = false, features = ["transport", "prost"] } From 108b7a8d0002f9ffc8f3e626f488ec497991b503 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 15 May 2023 18:56:21 +0100 Subject: [PATCH 0901/1411] Consistently use GCP XML API (#4207) * Consistently use GCP XML API * Use updated fake-gcs-server * Review feedback --- .github/workflows/object_store.yml | 3 +- object_store/CONTRIBUTING.md | 2 +- object_store/Cargo.toml | 2 +- object_store/src/aws/client.rs | 69 +------------ object_store/src/aws/mod.rs | 57 ++-------- object_store/src/azure/mod.rs | 60 ++--------- object_store/src/client/header.rs | 83 +++++++++++++++ object_store/src/client/list.rs | 85 +++++++++++++++ object_store/src/client/mod.rs | 6 ++ object_store/src/gcp/mod.rs | 161 +++++++++-------------------- object_store/src/prefix.rs | 6 +- 11 files changed, 247 insertions(+), 287 deletions(-) create mode 100644 object_store/src/client/header.rs create mode 100644 object_store/src/client/list.rs diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 
65c78df18466..df43ae3bf76a 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -95,8 +95,9 @@ jobs: - uses: actions/checkout@v3 - name: Configure Fake GCS Server (GCP emulation) + # Custom image - see fsouza/fake-gcs-server#1164 run: | - docker run -d -p 4443:4443 fsouza/fake-gcs-server -scheme http + docker run -d -p 4443:4443 tustvold/fake-gcs-server -scheme http -backend memory -public-host localhost:4443 # Give the container a moment to start up prior to configuring it sleep 1 curl -v -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" diff --git a/object_store/CONTRIBUTING.md b/object_store/CONTRIBUTING.md index 550640d931b4..47c294022659 100644 --- a/object_store/CONTRIBUTING.md +++ b/object_store/CONTRIBUTING.md @@ -103,7 +103,7 @@ To test the GCS integration, we use [Fake GCS Server](https://github.com/fsouza/ Startup the fake server: ```shell -docker run -p 4443:4443 fsouza/fake-gcs-server -scheme http +docker run -p 4443:4443 tustvold/fake-gcs-server -scheme http ``` Configure the account: diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index e25801b6c92d..c6b89fa23186 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -68,7 +68,7 @@ tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-ut nix = "0.26.1" [features] -cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json","reqwest/stream", "chrono/serde", "base64", "rand", "ring"] +cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index b2d01abfb6f3..1cdf785e5f4d 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -18,6 +18,7 @@ use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt, CredentialProvider}; use crate::aws::{STORE, STRICT_PATH_ENCODE_SET}; +use crate::client::list::ListResponse; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; @@ -25,13 +26,12 @@ use crate::multipart::UploadPart; use crate::path::DELIMITER; use crate::util::format_prefix; use crate::{ - BoxStream, ClientOptions, GetOptions, ListResult, MultipartId, ObjectMeta, Path, - Result, RetryConfig, StreamExt, + BoxStream, ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, + RetryConfig, StreamExt, }; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; -use chrono::{DateTime, Utc}; use percent_encoding::{utf8_percent_encode, PercentEncode}; use reqwest::{header::CONTENT_TYPE, Client as ReqwestClient, Method, Response}; use serde::{Deserialize, Serialize}; @@ -109,69 +109,6 @@ impl From for crate::Error { } } -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -pub struct ListResponse { - #[serde(default)] - pub contents: Vec, - #[serde(default)] - pub common_prefixes: Vec, - #[serde(default)] - pub next_continuation_token: Option, -} - -impl TryFrom for ListResult { - type Error = crate::Error; - - fn try_from(value: ListResponse) -> Result { - let common_prefixes = value - .common_prefixes - .into_iter() - .map(|x| Ok(Path::parse(x.prefix)?)) - .collect::>()?; - - let objects = value - .contents - .into_iter() - .map(TryFrom::try_from) - 
.collect::>()?; - - Ok(Self { - common_prefixes, - objects, - }) - } -} - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -pub struct ListPrefix { - pub prefix: String, -} - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -pub struct ListContents { - pub key: String, - pub size: usize, - pub last_modified: DateTime, - #[serde(rename = "ETag")] - pub e_tag: Option, -} - -impl TryFrom for ObjectMeta { - type Error = crate::Error; - - fn try_from(value: ListContents) -> Result { - Ok(Self { - location: Path::parse(value.key)?, - last_modified: value.last_modified, - size: value.size, - e_tag: value.e_tag, - }) - } -} - #[derive(Debug, Deserialize)] #[serde(rename_all = "PascalCase")] struct InitiateMultipart { diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 3f9b4803fe7d..2c38a9b712c2 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -33,7 +33,6 @@ use async_trait::async_trait; use bytes::Bytes; -use chrono::{DateTime, Utc}; use futures::stream::BoxStream; use futures::TryStreamExt; use itertools::Itertools; @@ -52,6 +51,7 @@ use crate::aws::credential::{ AwsCredential, CredentialProvider, InstanceCredentialProvider, StaticCredentialProvider, WebIdentityProvider, }; +use crate::client::header::header_meta; use crate::client::ClientConfigKey; use crate::config::ConfigValue; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; @@ -87,24 +87,6 @@ static METADATA_ENDPOINT: &str = "http://169.254.169.254"; #[derive(Debug, Snafu)] #[allow(missing_docs)] enum Error { - #[snafu(display("Last-Modified Header missing from response"))] - MissingLastModified, - - #[snafu(display("Content-Length Header missing from response"))] - MissingContentLength, - - #[snafu(display("Invalid last modified '{}': {}", last_modified, source))] - InvalidLastModified { - last_modified: String, - source: chrono::ParseError, - }, - - #[snafu(display("Invalid content length '{}': {}", content_length, source))] - InvalidContentLength { - content_length: String, - source: std::num::ParseIntError, - }, - #[snafu(display("Missing region"))] MissingRegion, @@ -155,6 +137,11 @@ enum Error { #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] RegionParse { bucket: String }, + + #[snafu(display("Failed to parse headers: {}", source))] + Header { + source: crate::client::header::Error, + }, } impl From for super::Error { @@ -261,41 +248,11 @@ impl ObjectStore for AmazonS3 { } async fn head(&self, location: &Path) -> Result { - use reqwest::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; - let options = GetOptions::default(); // Extract meta from headers // https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadObject.html#API_HeadObject_ResponseSyntax let response = self.client.get_request(location, options, true).await?; - let headers = response.headers(); - - let last_modified = headers - .get(LAST_MODIFIED) - .context(MissingLastModifiedSnafu)?; - - let content_length = headers - .get(CONTENT_LENGTH) - .context(MissingContentLengthSnafu)?; - - let last_modified = last_modified.to_str().context(BadHeaderSnafu)?; - let last_modified = DateTime::parse_from_rfc2822(last_modified) - .context(InvalidLastModifiedSnafu { last_modified })? 
- .with_timezone(&Utc); - - let content_length = content_length.to_str().context(BadHeaderSnafu)?; - let content_length = content_length - .parse() - .context(InvalidContentLengthSnafu { content_length })?; - - let e_tag = headers.get(ETAG).context(MissingEtagSnafu)?; - let e_tag = e_tag.to_str().context(BadHeaderSnafu)?; - - Ok(ObjectMeta { - location: location.clone(), - last_modified, - size: content_length, - e_tag: Some(e_tag.to_string()), - }) + Ok(header_meta(location, response.headers()).context(HeaderSnafu)?) } async fn delete(&self, location: &Path) -> Result<()> { diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 6726241aa868..0f8dae00c6c0 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -38,7 +38,6 @@ use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::Bytes; -use chrono::{TimeZone, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use percent_encoding::percent_decode_str; use serde::{Deserialize, Serialize}; @@ -50,9 +49,9 @@ use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; use url::Url; +use crate::client::header::header_meta; use crate::client::ClientConfigKey; use crate::config::ConfigValue; -use crate::util::RFC1123_FMT; pub use credential::authority_hosts; mod client; @@ -75,24 +74,6 @@ const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; #[derive(Debug, Snafu)] #[allow(missing_docs)] enum Error { - #[snafu(display("Last-Modified Header missing from response"))] - MissingLastModified, - - #[snafu(display("Content-Length Header missing from response"))] - MissingContentLength, - - #[snafu(display("Invalid last modified '{}': {}", last_modified, source))] - InvalidLastModified { - last_modified: String, - source: chrono::ParseError, - }, - - #[snafu(display("Invalid content length '{}': {}", content_length, source))] - InvalidContentLength { - content_length: String, - source: std::num::ParseIntError, - }, - #[snafu(display("Received header containing non-ASCII data"))] BadHeader { source: reqwest::header::ToStrError }, @@ -146,6 +127,11 @@ enum Error { #[snafu(display("ETag Header missing from response"))] MissingEtag, + + #[snafu(display("Failed to parse headers: {}", source))] + Header { + source: crate::client::header::Error, + }, } impl From for super::Error { @@ -223,44 +209,12 @@ impl ObjectStore for MicrosoftAzure { } async fn head(&self, location: &Path) -> Result { - use reqwest::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; let options = GetOptions::default(); // Extract meta from headers // https://docs.microsoft.com/en-us/rest/api/storageservices/get-blob-properties let response = self.client.get_request(location, options, true).await?; - let headers = response.headers(); - - let last_modified = headers - .get(LAST_MODIFIED) - .ok_or(Error::MissingLastModified)? - .to_str() - .context(BadHeaderSnafu)?; - let last_modified = Utc - .datetime_from_str(last_modified, RFC1123_FMT) - .context(InvalidLastModifiedSnafu { last_modified })?; - - let content_length = headers - .get(CONTENT_LENGTH) - .ok_or(Error::MissingContentLength)? - .to_str() - .context(BadHeaderSnafu)?; - let content_length = content_length - .parse() - .context(InvalidContentLengthSnafu { content_length })?; - - let e_tag = headers - .get(ETAG) - .ok_or(Error::MissingEtag)? 
- .to_str() - .context(BadHeaderSnafu)?; - - Ok(ObjectMeta { - location: location.clone(), - last_modified, - size: content_length, - e_tag: Some(e_tag.to_string()), - }) + Ok(header_meta(location, response.headers()).context(HeaderSnafu)?) } async fn delete(&self, location: &Path) -> Result<()> { diff --git a/object_store/src/client/header.rs b/object_store/src/client/header.rs new file mode 100644 index 000000000000..cc4f16eaa599 --- /dev/null +++ b/object_store/src/client/header.rs @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Logic for extracting ObjectMeta from headers used by AWS, GCP and Azure + +use crate::path::Path; +use crate::ObjectMeta; +use chrono::{DateTime, Utc}; +use hyper::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; +use hyper::HeaderMap; +use snafu::{OptionExt, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("ETag Header missing from response"))] + MissingEtag, + + #[snafu(display("Received header containing non-ASCII data"))] + BadHeader { source: reqwest::header::ToStrError }, + + #[snafu(display("Last-Modified Header missing from response"))] + MissingLastModified, + + #[snafu(display("Content-Length Header missing from response"))] + MissingContentLength, + + #[snafu(display("Invalid last modified '{}': {}", last_modified, source))] + InvalidLastModified { + last_modified: String, + source: chrono::ParseError, + }, + + #[snafu(display("Invalid content length '{}': {}", content_length, source))] + InvalidContentLength { + content_length: String, + source: std::num::ParseIntError, + }, +} + +/// Extracts [`ObjectMeta`] from the provided [`HeaderMap`] +pub fn header_meta(location: &Path, headers: &HeaderMap) -> Result { + let last_modified = headers + .get(LAST_MODIFIED) + .context(MissingLastModifiedSnafu)?; + + let content_length = headers + .get(CONTENT_LENGTH) + .context(MissingContentLengthSnafu)?; + + let last_modified = last_modified.to_str().context(BadHeaderSnafu)?; + let last_modified = DateTime::parse_from_rfc2822(last_modified) + .context(InvalidLastModifiedSnafu { last_modified })? 
+ .with_timezone(&Utc); + + let content_length = content_length.to_str().context(BadHeaderSnafu)?; + let content_length = content_length + .parse() + .context(InvalidContentLengthSnafu { content_length })?; + + let e_tag = headers.get(ETAG).context(MissingEtagSnafu)?; + let e_tag = e_tag.to_str().context(BadHeaderSnafu)?; + + Ok(ObjectMeta { + location: location.clone(), + last_modified, + size: content_length, + e_tag: Some(e_tag.to_string()), + }) +} diff --git a/object_store/src/client/list.rs b/object_store/src/client/list.rs new file mode 100644 index 000000000000..6a3889e3be5b --- /dev/null +++ b/object_store/src/client/list.rs @@ -0,0 +1,85 @@ +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! The list response format used by GCP and AWS + +use crate::path::Path; +use crate::{ListResult, ObjectMeta, Result}; +use chrono::{DateTime, Utc}; +use serde::Deserialize; + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListResponse { + #[serde(default)] + pub contents: Vec, + #[serde(default)] + pub common_prefixes: Vec, + #[serde(default)] + pub next_continuation_token: Option, +} + +impl TryFrom for ListResult { + type Error = crate::Error; + + fn try_from(value: ListResponse) -> Result { + let common_prefixes = value + .common_prefixes + .into_iter() + .map(|x| Ok(Path::parse(x.prefix)?)) + .collect::>()?; + + let objects = value + .contents + .into_iter() + .map(TryFrom::try_from) + .collect::>()?; + + Ok(Self { + common_prefixes, + objects, + }) + } +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListPrefix { + pub prefix: String, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListContents { + pub key: String, + pub size: usize, + pub last_modified: DateTime, + #[serde(rename = "ETag")] + pub e_tag: Option, +} + +impl TryFrom for ObjectMeta { + type Error = crate::Error; + + fn try_from(value: ListContents) -> Result { + Ok(Self { + location: Path::parse(value.key)?, + last_modified: value.last_modified, + size: value.size, + e_tag: value.e_tag, + }) + } +} diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index be44a9f99b27..c6a73fe7a618 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -26,6 +26,12 @@ pub mod retry; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod token; +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +pub mod header; + +#[cfg(any(feature = "aws", feature = "gcp"))] +pub mod list; + use std::collections::HashMap; use std::str::FromStr; use std::time::Duration; diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 41a91fef84a9..32f4055f1178 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -36,15 +36,16 @@ use 
std::sync::Arc; use async_trait::async_trait; use bytes::{Buf, Bytes}; -use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; -use percent_encoding::{percent_encode, NON_ALPHANUMERIC}; +use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest::{header, Client, Method, Response, StatusCode}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; +use crate::client::header::header_meta; +use crate::client::list::ListResponse; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::client::{ClientConfigKey, GetOptionsExt}; @@ -82,6 +83,9 @@ enum Error { #[snafu(display("Error getting list response body: {}", source))] ListResponseBody { source: reqwest::Error }, + #[snafu(display("Got invalid list response: {}", source))] + InvalidListResponse { source: quick_xml::de::DeError }, + #[snafu(display("Error performing get request {}: {}", path, source))] GetRequest { source: crate::client::retry::Error, @@ -143,6 +147,11 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, + + #[snafu(display("Failed to parse headers: {}", source))] + Header { + source: crate::client::header::Error, + }, } impl From for super::Error { @@ -162,25 +171,6 @@ impl From for super::Error { } } -#[derive(serde::Deserialize, Debug)] -#[serde(rename_all = "camelCase")] -struct ListResponse { - next_page_token: Option, - #[serde(default)] - prefixes: Vec, - #[serde(default)] - items: Vec, -} - -#[derive(serde::Deserialize, Debug)] -struct Object { - name: String, - size: String, - updated: DateTime, - #[serde(rename = "etag")] - e_tag: Option, -} - #[derive(serde::Deserialize, Debug)] #[serde(rename_all = "PascalCase")] struct InitiateMultipartUploadResult { @@ -248,15 +238,11 @@ impl GoogleCloudStorageClient { } fn object_url(&self, path: &Path) -> String { - let encoded = - percent_encoding::utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); - format!( - "{}/storage/v1/b/{}/o/{}", - self.base_url, self.bucket_name_encoded, encoded - ) + let encoded = utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); + format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, encoded) } - /// Perform a get request + /// Perform a get request async fn get_request( &self, path: &Path, @@ -266,16 +252,15 @@ impl GoogleCloudStorageClient { let token = self.get_token().await?; let url = self.object_url(path); - let alt = match head { - true => "json", - false => "media", + let method = match head { + true => Method::HEAD, + false => Method::GET, }; - let builder = self.client.request(Method::GET, url); - - let response = builder + let response = self + .client + .request(method, url) .bearer_auth(token) - .query(&[("alt", alt)]) .with_get_options(options) .send_retry(&self.retry_config) .await @@ -286,13 +271,10 @@ impl GoogleCloudStorageClient { Ok(response) } - /// Perform a put request + /// Perform a put request async fn put_request(&self, path: &Path, payload: Bytes) -> Result<()> { let token = self.get_token().await?; - let url = format!( - "{}/upload/storage/v1/b/{}/o", - self.base_url, self.bucket_name_encoded - ); + let url = self.object_url(path); let content_type = self .client_options @@ -300,11 +282,10 @@ impl GoogleCloudStorageClient { .unwrap_or("application/octet-stream"); self.client - .request(Method::POST, url) + .request(Method::PUT, url) .bearer_auth(token) 
.header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, payload.len()) - .query(&[("uploadType", "media"), ("name", path.as_ref())]) .body(payload) .send_retry(&self.retry_config) .await @@ -373,7 +354,7 @@ impl GoogleCloudStorageClient { Ok(()) } - /// Perform a delete request + /// Perform a delete request async fn delete_request(&self, path: &Path) -> Result<()> { let token = self.get_token().await?; let url = self.object_url(path); @@ -390,7 +371,7 @@ impl GoogleCloudStorageClient { Ok(()) } - /// Perform a copy request + /// Perform a copy request async fn copy_request( &self, from: &Path, @@ -398,24 +379,18 @@ impl GoogleCloudStorageClient { if_not_exists: bool, ) -> Result<()> { let token = self.get_token().await?; + let url = self.object_url(to); - let source = - percent_encoding::utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC); - let destination = - percent_encoding::utf8_percent_encode(to.as_ref(), NON_ALPHANUMERIC); - let url = format!( - "{}/storage/v1/b/{}/o/{}/copyTo/b/{}/o/{}", - self.base_url, - self.bucket_name_encoded, - source, - self.bucket_name_encoded, - destination - ); + let from = utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC); + let source = format!("{}/{}", self.bucket_name_encoded, from); - let mut builder = self.client.request(Method::POST, url); + let mut builder = self + .client + .request(Method::PUT, url) + .header("x-goog-copy-source", source); if if_not_exists { - builder = builder.query(&[("ifGenerationMatch", "0")]); + builder = builder.header("x-goog-if-generation-match", 0); } builder @@ -436,7 +411,7 @@ impl GoogleCloudStorageClient { Ok(()) } - /// Perform a list request + /// Perform a list request async fn list_request( &self, prefix: Option<&str>, @@ -444,13 +419,10 @@ impl GoogleCloudStorageClient { page_token: Option<&str>, ) -> Result { let token = self.get_token().await?; + let url = format!("{}/{}", self.base_url, self.bucket_name_encoded); - let url = format!( - "{}/storage/v1/b/{}/o", - self.base_url, self.bucket_name_encoded - ); - - let mut query = Vec::with_capacity(4); + let mut query = Vec::with_capacity(5); + query.push(("list-type", "2")); if delimiter { query.push(("delimiter", DELIMITER)) } @@ -460,14 +432,14 @@ impl GoogleCloudStorageClient { } if let Some(page_token) = page_token { - query.push(("pageToken", page_token)) + query.push(("continuation-token", page_token)) } if let Some(max_results) = &self.max_list_results { - query.push(("maxResults", max_results)) + query.push(("max-keys", max_results)) } - let response: ListResponse = self + let response = self .client .request(Method::GET, url) .query(&query) @@ -475,10 +447,13 @@ impl GoogleCloudStorageClient { .send_retry(&self.retry_config) .await .context(ListRequestSnafu)? 
- .json() + .bytes() .await .context(ListResponseBodySnafu)?; + let response: ListResponse = quick_xml::de::from_reader(response.reader()) + .context(InvalidListResponseSnafu)?; + Ok(response) } @@ -487,14 +462,14 @@ impl GoogleCloudStorageClient { &self, prefix: Option<&Path>, delimiter: bool, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'_, Result> { let prefix = format_prefix(prefix); stream_paginated(prefix, move |prefix, token| async move { let mut r = self .list_request(prefix.as_deref(), delimiter, token.as_deref()) .await?; - let next_token = r.next_page_token.take(); - Ok((r, prefix, next_token)) + let next_token = r.next_continuation_token.take(); + Ok((r.try_into()?, prefix, next_token)) }) .boxed() } @@ -639,12 +614,6 @@ impl ObjectStore for GoogleCloudStorage { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - if options.if_modified_since.is_some() || options.if_unmodified_since.is_some() { - return Err(super::Error::NotSupported { - source: "ModifiedSince Preconditions not supported by GoogleCloudStorage JSON API".to_string().into(), - }); - } - let response = self.client.get_request(location, options, false).await?; let stream = response .bytes_stream() @@ -660,10 +629,7 @@ impl ObjectStore for GoogleCloudStorage { async fn head(&self, location: &Path) -> Result { let options = GetOptions::default(); let response = self.client.get_request(location, options, true).await?; - let object = response.json().await.context(GetResponseBodySnafu { - path: location.as_ref(), - })?; - convert_object_meta(&object) + Ok(header_meta(location, response.headers()).context(HeaderSnafu)?) } async fn delete(&self, location: &Path) -> Result<()> { @@ -677,11 +643,7 @@ impl ObjectStore for GoogleCloudStorage { let stream = self .client .list_paginated(prefix, false) - .map_ok(|r| { - futures::stream::iter( - r.items.into_iter().map(|x| convert_object_meta(&x)), - ) - }) + .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() .boxed(); @@ -696,15 +658,8 @@ impl ObjectStore for GoogleCloudStorage { while let Some(result) = stream.next().await { let response = result?; - - for p in response.prefixes { - common_prefixes.insert(Path::parse(p)?); - } - - objects.reserve(response.items.len()); - for object in &response.items { - objects.push(convert_object_meta(object)?); - } + common_prefixes.extend(response.common_prefixes.into_iter()); + objects.extend(response.objects.into_iter()); } Ok(ListResult { @@ -1170,20 +1125,6 @@ impl GoogleCloudStorageBuilder { } } -fn convert_object_meta(object: &Object) -> Result { - let location = Path::parse(&object.name)?; - let last_modified = object.updated; - let size = object.size.parse().context(InvalidSizeSnafu)?; - let e_tag = object.e_tag.clone(); - - Ok(ObjectMeta { - location, - last_modified, - size, - e_tag, - }) -} - #[cfg(test)] mod test { use bytes::Bytes; diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index ffe509411911..39585f73b692 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -119,11 +119,7 @@ impl ObjectStore for PrefixStore { self.inner.get_range(&full_path, range).await } - async fn get_opts( - &self, - location: &Path, - options: GetOptions, - ) -> Result { + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let full_path = self.full_path(location); self.inner.get_opts(&full_path, options).await } From 4714b2188f01ae74beae1253660fb2f94942c9ed Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Tue, 16 May 2023 
04:48:12 -0700 Subject: [PATCH 0902/1411] Object Store (AWS): Support region configured via named profile (#4161) * feat(aws_profile): use profile region as fallback * moved ProfileProvider to aws::profile module * added aws::region::RegionProvider * lazy-init profile credential provider * support overriding profile region * tests * fix(aws_profile): clippy & RAT errors * fix(aws_profile): make RegionProvider async * test(aws_profile): use fake config for testing * refactor(aws_profile): remove unnecessary module aws::profile::region -> aws::profile * refactor(aws_profile): tests w/ profile files * fix(object_store): rat + clippy warnings * Don't spawn thread --------- Co-authored-by: Raphael Taylor-Davies --- object_store/src/aws/credential.rs | 62 -------------- object_store/src/aws/mod.rs | 78 +++++++++++++++++- object_store/src/aws/profile.rs | 128 +++++++++++++++++++++++++++++ 3 files changed, 204 insertions(+), 64 deletions(-) create mode 100644 object_store/src/aws/profile.rs diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index 16cdf35d0f4a..9e047941a3c2 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -515,68 +515,6 @@ async fn web_identity( }) } -#[cfg(feature = "aws_profile")] -mod profile { - use super::*; - use aws_config::profile::ProfileFileCredentialsProvider; - use aws_config::provider_config::ProviderConfig; - use aws_credential_types::provider::ProvideCredentials; - use aws_types::region::Region; - use std::time::SystemTime; - - #[derive(Debug)] - pub struct ProfileProvider { - cache: TokenCache>, - credentials: ProfileFileCredentialsProvider, - } - - impl ProfileProvider { - pub fn new(name: String, region: String) -> Self { - let config = ProviderConfig::default().with_region(Some(Region::new(region))); - - Self { - cache: Default::default(), - credentials: ProfileFileCredentialsProvider::builder() - .configure(&config) - .profile_name(name) - .build(), - } - } - } - - impl CredentialProvider for ProfileProvider { - fn get_credential(&self) -> BoxFuture<'_, Result>> { - Box::pin(self.cache.get_or_insert_with(move || async move { - let c = - self.credentials - .provide_credentials() - .await - .map_err(|source| crate::Error::Generic { - store: STORE, - source: Box::new(source), - })?; - let t_now = SystemTime::now(); - let expiry = c - .expiry() - .and_then(|e| e.duration_since(t_now).ok()) - .map(|ttl| Instant::now() + ttl); - - Ok(TemporaryToken { - token: Arc::new(AwsCredential { - key_id: c.access_key_id().to_string(), - secret_key: c.secret_access_key().to_string(), - token: c.session_token().map(ToString::to_string), - }), - expiry, - }) - })) - } - } -} - -#[cfg(feature = "aws_profile")] -pub use profile::ProfileProvider; - #[cfg(test)] mod tests { use super::*; diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 2c38a9b712c2..428e013f4478 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -64,6 +64,9 @@ mod checksum; mod client; mod credential; +#[cfg(feature = "aws_profile")] +mod profile; + // http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html // // Do not URI-encode any of the unreserved characters that RFC 3986 defines: @@ -985,8 +988,14 @@ impl AmazonS3Builder { self.parse_url(&url)?; } + let region = match (self.region.clone(), self.profile.clone()) { + (Some(region), _) => Some(region), + (None, Some(profile)) => profile_region(profile), + (None, None) => None, + }; + let bucket = 
self.bucket_name.context(MissingBucketNameSnafu)?; - let region = self.region.context(MissingRegionSnafu)?; + let region = region.context(MissingRegionSnafu)?; let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; let credentials = match (self.access_key_id, self.secret_access_key, self.token) { @@ -1094,12 +1103,30 @@ impl AmazonS3Builder { } } +#[cfg(feature = "aws_profile")] +fn profile_region(profile: String) -> Option { + use tokio::runtime::Handle; + + let handle = Handle::current(); + let provider = profile::ProfileProvider::new(profile, None); + + handle.block_on(provider.get_region()) +} + #[cfg(feature = "aws_profile")] fn profile_credentials( profile: String, region: String, ) -> Result> { - Ok(Box::new(credential::ProfileProvider::new(profile, region))) + Ok(Box::new(profile::ProfileProvider::new( + profile, + Some(region), + ))) +} + +#[cfg(not(feature = "aws_profile"))] +fn profile_region(_profile: String) -> Option { + None } #[cfg(not(feature = "aws_profile"))] @@ -1594,3 +1621,50 @@ mod s3_resolve_bucket_region_tests { assert!(result.is_err()); } } + +#[cfg(all(test, feature = "aws_profile"))] +mod profile_tests { + use super::*; + use std::env; + + use super::profile::{TEST_PROFILE_NAME, TEST_PROFILE_REGION}; + + #[tokio::test] + async fn s3_test_region_from_profile() { + let s3_url = "s3://bucket/prefix".to_owned(); + + let s3 = AmazonS3Builder::new() + .with_url(s3_url) + .with_profile(TEST_PROFILE_NAME) + .build() + .unwrap(); + + let region = &s3.client.config().region; + + assert_eq!(region, TEST_PROFILE_REGION); + } + + #[test] + fn s3_test_region_override() { + let s3_url = "s3://bucket/prefix".to_owned(); + + let aws_profile = + env::var("AWS_PROFILE").unwrap_or_else(|_| TEST_PROFILE_NAME.into()); + + let aws_region = + env::var("AWS_REGION").unwrap_or_else(|_| "object_store:fake_region".into()); + + env::set_var("AWS_PROFILE", aws_profile); + + let s3 = AmazonS3Builder::from_env() + .with_url(s3_url) + .with_region(aws_region.clone()) + .build() + .unwrap(); + + let actual = &s3.client.config().region; + let expected = &aws_region; + + assert_eq!(actual, expected); + } +} diff --git a/object_store/src/aws/profile.rs b/object_store/src/aws/profile.rs new file mode 100644 index 000000000000..a88824c79f93 --- /dev/null +++ b/object_store/src/aws/profile.rs @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
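An illustrative sketch of the builder-level effect of this patch: with a named profile configured, the region can now be resolved from the profile instead of being passed explicitly. The profile and bucket names below are placeholders, and building this way assumes the crate's "aws_profile" feature is enabled; only with_profile and the profile-region fallback come from this change.

    use object_store::aws::{AmazonS3, AmazonS3Builder};

    fn s3_from_named_profile() -> object_store::Result<AmazonS3> {
        // No .with_region(..) needed: the region configured for "my-profile"
        // in the AWS config files is used as a fallback by build().
        AmazonS3Builder::new()
            .with_profile("my-profile")
            .with_bucket_name("my-bucket")
            .build()
    }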
+ +#![cfg(feature = "aws_profile")] + +use aws_config::meta::region::ProvideRegion; +use aws_config::profile::profile_file::ProfileFiles; +use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::profile::ProfileFileRegionProvider; +use aws_config::provider_config::ProviderConfig; +use aws_credential_types::provider::ProvideCredentials; +use aws_types::region::Region; +use futures::future::BoxFuture; +use std::sync::Arc; +use std::time::Instant; +use std::time::SystemTime; + +use crate::aws::credential::CredentialProvider; +use crate::aws::AwsCredential; +use crate::client::token::{TemporaryToken, TokenCache}; +use crate::Result; + +#[cfg(test)] +pub static TEST_PROFILE_NAME: &str = "object_store:fake_profile"; + +#[cfg(test)] +pub static TEST_PROFILE_REGION: &str = "object_store:fake_region_from_profile"; + +#[derive(Debug)] +pub struct ProfileProvider { + name: String, + region: Option, + cache: TokenCache>, +} + +impl ProfileProvider { + pub fn new(name: String, region: Option) -> Self { + Self { + name, + region, + cache: Default::default(), + } + } + + #[cfg(test)] + fn profile_files(&self) -> ProfileFiles { + use aws_config::profile::profile_file::ProfileFileKind; + + let config = format!( + "[profile {}]\nregion = {}", + TEST_PROFILE_NAME, TEST_PROFILE_REGION + ); + + ProfileFiles::builder() + .with_contents(ProfileFileKind::Config, config) + .build() + } + + #[cfg(not(test))] + fn profile_files(&self) -> ProfileFiles { + ProfileFiles::default() + } + + pub async fn get_region(&self) -> Option { + if let Some(region) = self.region.clone() { + return Some(region); + } + + let provider = ProfileFileRegionProvider::builder() + .profile_files(self.profile_files()) + .profile_name(&self.name) + .build(); + + let region = provider.region().await; + + region.map(|r| r.as_ref().to_owned()) + } +} + +impl CredentialProvider for ProfileProvider { + fn get_credential(&self) -> BoxFuture<'_, Result>> { + Box::pin(self.cache.get_or_insert_with(move || async move { + let region = self.region.clone().map(Region::new); + + let config = ProviderConfig::default().with_region(region); + + let credentials = ProfileFileCredentialsProvider::builder() + .configure(&config) + .profile_name(&self.name) + .build(); + + let c = credentials.provide_credentials().await.map_err(|source| { + crate::Error::Generic { + store: "S3", + source: Box::new(source), + } + })?; + let t_now = SystemTime::now(); + let expiry = c + .expiry() + .and_then(|e| e.duration_since(t_now).ok()) + .map(|ttl| Instant::now() + ttl); + + Ok(TemporaryToken { + token: Arc::new(AwsCredential { + key_id: c.access_key_id().to_string(), + secret_key: c.secret_access_key().to_string(), + token: c.session_token().map(ToString::to_string), + }), + expiry, + }) + })) + } +} From 98867f5b2f61d1ed0539856240f45bbf004423de Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 17 May 2023 08:31:01 +0100 Subject: [PATCH 0903/1411] Return NotFound for directories in Head and Get (#4230) (#4231) * Return NotFound for directories in Head and Get (#4230) * Fix webdav * Fix error message --- object_store/src/azure/client.rs | 14 ++++++++++- object_store/src/http/client.rs | 11 +++++--- object_store/src/http/mod.rs | 20 ++++++--------- object_store/src/lib.rs | 8 ++++++ object_store/src/local.rs | 43 ++++++++++++++++++++------------ 5 files changed, 63 insertions(+), 33 deletions(-) diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 
4611986e30d2..893e261fea64 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -257,7 +257,19 @@ impl AzureClient { path: path.as_ref(), })?; - Ok(response) + match response.headers().get("x-ms-resource-type") { + Some(resource) if resource.as_ref() != b"file" => { + Err(crate::Error::NotFound { + path: path.to_string(), + source: format!( + "Not a file, got x-ms-resource-type: {}", + String::from_utf8_lossy(resource.as_ref()) + ) + .into(), + }) + } + _ => Ok(response), + } } /// Make an Azure Delete request diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index 4e58eb0b2927..6feacbba6c2d 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -238,10 +238,13 @@ impl Client { .send_retry(&self.retry_config) .await .map_err(|source| match source.status() { - Some(StatusCode::NOT_FOUND) => crate::Error::NotFound { - source: Box::new(source), - path: location.to_string(), - }, + // Some stores return METHOD_NOT_ALLOWED for get on directories + Some(StatusCode::NOT_FOUND | StatusCode::METHOD_NOT_ALLOWED) => { + crate::Error::NotFound { + source: Box::new(source), + path: location.to_string(), + } + } _ => Error::Request { source }.into(), }) } diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index bed19722c83a..124b7da2f7e7 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -60,15 +60,6 @@ enum Error { url: String, }, - #[snafu(display("Object is a directory"))] - IsDirectory, - - #[snafu(display("PROPFIND response contained no valid objects"))] - NoObjects, - - #[snafu(display("PROPFIND response contained more than one object"))] - MultipleObjects, - #[snafu(display("Request error: {}", source))] Reqwest { source: reqwest::Error }, } @@ -134,12 +125,17 @@ impl ObjectStore for HttpStore { let response = status.response.into_iter().next().unwrap(); response.check_ok()?; match response.is_dir() { - true => Err(Error::IsDirectory.into()), + true => Err(crate::Error::NotFound { + path: location.to_string(), + source: "Is directory".to_string().into(), + }), false => response.object_meta(self.client.base_url()), } } - 0 => Err(Error::NoObjects.into()), - _ => Err(Error::MultipleObjects.into()), + x => Err(crate::Error::NotFound { + path: location.to_string(), + source: format!("Expected 1 result, got {x}").into(), + }), } } diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 75f9ca7df411..0f3ed809e424 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -880,6 +880,14 @@ mod tests { assert_eq!(result.common_prefixes.len(), 1); assert_eq!(result.common_prefixes[0], Path::from("test_dir")); + // Should return not found + let err = storage.get(&Path::from("test_dir")).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + + // Should return not found + let err = storage.head(&Path::from("test_dir")).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); + // List everything starting with a prefix that should return results let prefix = Path::from("test_dir"); let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap(); diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 26a8bf336873..52719f1cb562 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -419,18 +419,23 @@ impl ObjectStore for LocalFileSystem { maybe_spawn_blocking(move || { let metadata = match metadata(&path) { - Err(e) => Err(if e.kind() == ErrorKind::NotFound { - Error::NotFound { + Err(e) => Err(match e.kind() { + ErrorKind::NotFound => Error::NotFound { path: path.clone(), source: e, - } - } else { - Error::Metadata { + }, + _ => Error::Metadata { source: e.into(), path: location.to_string(), - } + }, }), - Ok(m) => Ok(m), + Ok(m) => match m.is_file() { + true => Ok(m), + false => Err(Error::NotFound { + path, + source: io::Error::new(ErrorKind::NotFound, "is not file"), + }), + }, }?; convert_metadata(metadata, location) }) @@ -878,19 +883,25 @@ fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result Result { - let file = File::open(path).map_err(|e| { - if e.kind() == std::io::ErrorKind::NotFound { - Error::NotFound { + let file = match File::open(path).and_then(|f| Ok((f.metadata()?, f))) { + Err(e) => Err(match e.kind() { + ErrorKind::NotFound => Error::NotFound { path: path.clone(), source: e, - } - } else { - Error::UnableToOpenFile { + }, + _ => Error::UnableToOpenFile { path: path.clone(), source: e, - } - } - })?; + }, + }), + Ok((metadata, file)) => match metadata.is_file() { + true => Ok(file), + false => Err(Error::NotFound { + path: path.clone(), + source: io::Error::new(ErrorKind::NotFound, "not a file"), + }), + }, + }?; Ok(file) } From ff4e8e5c5dee2b4bb5c9f55d15d9a9ddaf5833b2 Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Wed, 17 May 2023 14:11:49 +0300 Subject: [PATCH 0904/1411] Minor: use all primitive types in test_layouts (#4229) --- arrow-array/src/types.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index b50018ca9751..8c19301dc7d0 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -1695,6 +1695,12 @@ mod tests { test_layout::(); test_layout::(); test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); + test_layout::(); test_layout::(); test_layout::(); test_layout::(); @@ -1708,5 +1714,6 @@ mod tests { test_layout::(); test_layout::(); test_layout::(); + test_layout::(); } } From bccbf2354aec165e77592014bd0c487ca24d002f Mon Sep 17 00:00:00 2001 From: Alexandre Crayssac Date: Wed, 17 May 2023 13:12:00 +0200 Subject: [PATCH 0905/1411] Add close method to RecordBatchWriter trait (#4228) * Add finish method to RecordBatchWriter trait and implement it for CSV, JSON, IPC and Parquet * Simplify parquet::ArrowWriter::finish Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Rename finish to close * Remove fully qualified method call --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-array/src/record_batch.rs | 3 +++ arrow-csv/src/writer.rs | 4 ++++ arrow-ipc/src/writer.rs | 8 ++++++++ arrow-json/src/writer.rs | 5 +++++ parquet/src/arrow/arrow_writer/mod.rs | 5 +++++ 5 files changed, 25 insertions(+) diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index aea49c04753e..d2e36780a901 100644 --- 
a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -47,6 +47,9 @@ pub trait RecordBatchReader: Iterator> { pub trait RecordBatchWriter { /// Write a single batch to the writer. fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError>; + + /// Write footer or termination data, then mark the writer as done. + fn close(self) -> Result<(), ArrowError>; } /// A two-dimensional batch of column-oriented data with a defined diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index ba2123a09498..840e8e8a93cc 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -197,6 +197,10 @@ impl RecordBatchWriter for Writer { fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { self.write(batch) } + + fn close(self) -> Result<(), ArrowError> { + Ok(()) + } } /// A CSV writer builder diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index fcfd4d97ac07..59657bc4be09 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -861,6 +861,10 @@ impl RecordBatchWriter for FileWriter { fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { self.write(batch) } + + fn close(mut self) -> Result<(), ArrowError> { + self.finish() + } } pub struct StreamWriter { @@ -1001,6 +1005,10 @@ impl RecordBatchWriter for StreamWriter { fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { self.write(batch) } + + fn close(mut self) -> Result<(), ArrowError> { + self.finish() + } } /// Stores the encoded data, which is an crate::Message, and optional Arrow data diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 6f241be409dc..e6c960aef271 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -594,6 +594,10 @@ where fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { self.write(batch) } + + fn close(mut self) -> Result<(), ArrowError> { + self.finish() + } } #[cfg(test)] @@ -1265,6 +1269,7 @@ mod tests { writer.finish().unwrap(); assert_eq!(String::from_utf8(writer.into_inner()).unwrap(), ""); } + #[test] fn json_writer_one_row() { let mut writer = ArrayWriter::new(vec![] as Vec); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 075ecc034862..af820218255d 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -250,6 +250,11 @@ impl RecordBatchWriter for ArrowWriter { fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { self.write(batch).map_err(|e| e.into()) } + + fn close(self) -> std::result::Result<(), ArrowError> { + self.close()?; + Ok(()) + } } fn write_leaves( From 69535611176f95f302c10e25a98f5b49af683d8b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 17 May 2023 12:13:04 +0100 Subject: [PATCH 0906/1411] Standardise credentials API (#4223) (#4163) (#4225) * Standardise credentials API (#4223) (#4163) * Clippy * Allow HTTP metadata endpoint --- object_store/src/aws/client.rs | 6 +- object_store/src/aws/credential.rs | 91 ++++++------- object_store/src/aws/mod.rs | 60 +++++---- object_store/src/aws/profile.rs | 71 +++++----- object_store/src/azure/client.rs | 52 ++------ object_store/src/azure/credential.rs | 131 ++++++++++--------- object_store/src/azure/mod.rs | 65 +++++----- object_store/src/client/mod.rs | 89 ++++++++++++- object_store/src/gcp/credential.rs | 187 ++++++++++++++------------- object_store/src/gcp/mod.rs | 121 ++++++++--------- 10 files changed, 461 insertions(+), 412 
deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 1cdf785e5f4d..8ce743b31be9 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -16,8 +16,8 @@ // under the License. use crate::aws::checksum::Checksum; -use crate::aws::credential::{AwsCredential, CredentialExt, CredentialProvider}; -use crate::aws::{STORE, STRICT_PATH_ENCODE_SET}; +use crate::aws::credential::{AwsCredential, CredentialExt}; +use crate::aws::{AwsCredentialProvider, STORE, STRICT_PATH_ENCODE_SET}; use crate::client::list::ListResponse; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; @@ -135,7 +135,7 @@ pub struct S3Config { pub endpoint: String, pub bucket: String, pub bucket_endpoint: String, - pub credentials: Box, + pub credentials: AwsCredentialProvider, pub retry_config: RetryConfig, pub client_options: ClientOptions, pub sign_payload: bool, diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index 9e047941a3c2..47d681c631c7 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -18,12 +18,12 @@ use crate::aws::{STORE, STRICT_ENCODE_SET}; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; +use crate::client::TokenProvider; use crate::util::hmac_sha256; use crate::{Result, RetryConfig}; +use async_trait::async_trait; use bytes::Buf; use chrono::{DateTime, Utc}; -use futures::future::BoxFuture; -use futures::TryFutureExt; use percent_encoding::utf8_percent_encode; use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, Method, Request, RequestBuilder, StatusCode}; @@ -41,10 +41,14 @@ static EMPTY_SHA256_HASH: &str = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"; static UNSIGNED_PAYLOAD_LITERAL: &str = "UNSIGNED-PAYLOAD"; -#[derive(Debug)] +/// A set of AWS security credentials +#[derive(Debug, Eq, PartialEq)] pub struct AwsCredential { + /// AWS_ACCESS_KEY_ID pub key_id: String, + /// AWS_SECRET_ACCESS_KEY pub secret_key: String, + /// AWS_SESSION_TOKEN pub token: Option, } @@ -291,49 +295,31 @@ fn canonicalize_headers(header_map: &HeaderMap) -> (String, String) { (signed_headers, canonical_headers) } -/// Provides credentials for use when signing requests -pub trait CredentialProvider: std::fmt::Debug + Send + Sync { - fn get_credential(&self) -> BoxFuture<'_, Result>>; -} - -/// A static set of credentials -#[derive(Debug)] -pub struct StaticCredentialProvider { - pub credential: Arc, -} - -impl CredentialProvider for StaticCredentialProvider { - fn get_credential(&self) -> BoxFuture<'_, Result>> { - Box::pin(futures::future::ready(Ok(Arc::clone(&self.credential)))) - } -} - /// Credentials sourced from the instance metadata service /// /// #[derive(Debug)] pub struct InstanceCredentialProvider { pub cache: TokenCache>, - pub client: Client, - pub retry_config: RetryConfig, pub imdsv1_fallback: bool, pub metadata_endpoint: String, } -impl CredentialProvider for InstanceCredentialProvider { - fn get_credential(&self) -> BoxFuture<'_, Result>> { - Box::pin(self.cache.get_or_insert_with(|| { - instance_creds( - &self.client, - &self.retry_config, - &self.metadata_endpoint, - self.imdsv1_fallback, - ) +#[async_trait] +impl TokenProvider for InstanceCredentialProvider { + type Credential = AwsCredential; + + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result>> { + instance_creds(client, retry, &self.metadata_endpoint, 
self.imdsv1_fallback) + .await .map_err(|source| crate::Error::Generic { store: STORE, source, }) - })) } } @@ -342,31 +328,34 @@ impl CredentialProvider for InstanceCredentialProvider { /// #[derive(Debug)] pub struct WebIdentityProvider { - pub cache: TokenCache>, pub token_path: String, pub role_arn: String, pub session_name: String, pub endpoint: String, - pub client: Client, - pub retry_config: RetryConfig, } -impl CredentialProvider for WebIdentityProvider { - fn get_credential(&self) -> BoxFuture<'_, Result>> { - Box::pin(self.cache.get_or_insert_with(|| { - web_identity( - &self.client, - &self.retry_config, - &self.token_path, - &self.role_arn, - &self.session_name, - &self.endpoint, - ) - .map_err(|source| crate::Error::Generic { - store: STORE, - source, - }) - })) +#[async_trait] +impl TokenProvider for WebIdentityProvider { + type Credential = AwsCredential; + + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result>> { + web_identity( + client, + retry, + &self.token_path, + &self.role_arn, + &self.session_name, + &self.endpoint, + ) + .await + .map_err(|source| crate::Error::Generic { + store: STORE, + source, + }) } } diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 428e013f4478..ddb9dc799501 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -48,11 +48,13 @@ use url::Url; pub use crate::aws::checksum::Checksum; use crate::aws::client::{S3Client, S3Config}; use crate::aws::credential::{ - AwsCredential, CredentialProvider, InstanceCredentialProvider, - StaticCredentialProvider, WebIdentityProvider, + AwsCredential, InstanceCredentialProvider, WebIdentityProvider, }; use crate::client::header::header_meta; -use crate::client::ClientConfigKey; +use crate::client::{ + ClientConfigKey, CredentialProvider, StaticCredentialProvider, + TokenCredentialProvider, +}; use crate::config::ConfigValue; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; use crate::{ @@ -83,6 +85,8 @@ const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.rem const STORE: &str = "S3"; +type AwsCredentialProvider = Arc>; + /// Default metadata endpoint static METADATA_ENDPOINT: &str = "http://169.254.169.254"; @@ -1001,13 +1005,12 @@ impl AmazonS3Builder { let credentials = match (self.access_key_id, self.secret_access_key, self.token) { (Some(key_id), Some(secret_key), token) => { info!("Using Static credential provider"); - Box::new(StaticCredentialProvider { - credential: Arc::new(AwsCredential { - key_id, - secret_key, - token, - }), - }) as _ + let credential = AwsCredential { + key_id, + secret_key, + token, + }; + Arc::new(StaticCredentialProvider::new(credential)) as _ } (None, Some(_), _) => return Err(Error::MissingAccessKeyId.into()), (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), @@ -1031,15 +1034,18 @@ impl AmazonS3Builder { .with_allow_http(false) .client()?; - Box::new(WebIdentityProvider { - cache: Default::default(), + let token = WebIdentityProvider { token_path, session_name, role_arn, endpoint, + }; + + Arc::new(TokenCredentialProvider::new( + token, client, - retry_config: self.retry_config.clone(), - }) as _ + self.retry_config.clone(), + )) as _ } _ => match self.profile { Some(profile) => { @@ -1049,19 +1055,20 @@ impl AmazonS3Builder { None => { info!("Using Instance credential provider"); - // The instance metadata endpoint is access over HTTP - let client_options = - self.client_options.clone().with_allow_http(true); 
- - Box::new(InstanceCredentialProvider { + let token = InstanceCredentialProvider { cache: Default::default(), - client: client_options.client()?, - retry_config: self.retry_config.clone(), imdsv1_fallback: self.imdsv1_fallback.get()?, metadata_endpoint: self .metadata_endpoint .unwrap_or_else(|| METADATA_ENDPOINT.into()), - }) as _ + }; + + Arc::new(TokenCredentialProvider::new( + token, + // The instance metadata endpoint is access over HTTP + self.client_options.clone().with_allow_http(true).client()?, + self.retry_config.clone(), + )) as _ } }, }, @@ -1114,11 +1121,8 @@ fn profile_region(profile: String) -> Option { } #[cfg(feature = "aws_profile")] -fn profile_credentials( - profile: String, - region: String, -) -> Result> { - Ok(Box::new(profile::ProfileProvider::new( +fn profile_credentials(profile: String, region: String) -> Result { + Ok(Arc::new(profile::ProfileProvider::new( profile, Some(region), ))) @@ -1133,7 +1137,7 @@ fn profile_region(_profile: String) -> Option { fn profile_credentials( _profile: String, _region: String, -) -> Result> { +) -> Result { Err(Error::MissingProfileFeature.into()) } diff --git a/object_store/src/aws/profile.rs b/object_store/src/aws/profile.rs index a88824c79f93..3fc08056444e 100644 --- a/object_store/src/aws/profile.rs +++ b/object_store/src/aws/profile.rs @@ -17,6 +17,7 @@ #![cfg(feature = "aws_profile")] +use async_trait::async_trait; use aws_config::meta::region::ProvideRegion; use aws_config::profile::profile_file::ProfileFiles; use aws_config::profile::ProfileFileCredentialsProvider; @@ -24,14 +25,13 @@ use aws_config::profile::ProfileFileRegionProvider; use aws_config::provider_config::ProviderConfig; use aws_credential_types::provider::ProvideCredentials; use aws_types::region::Region; -use futures::future::BoxFuture; use std::sync::Arc; use std::time::Instant; use std::time::SystemTime; -use crate::aws::credential::CredentialProvider; use crate::aws::AwsCredential; use crate::client::token::{TemporaryToken, TokenCache}; +use crate::client::CredentialProvider; use crate::Result; #[cfg(test)] @@ -91,38 +91,43 @@ impl ProfileProvider { } } +#[async_trait] impl CredentialProvider for ProfileProvider { - fn get_credential(&self) -> BoxFuture<'_, Result>> { - Box::pin(self.cache.get_or_insert_with(move || async move { - let region = self.region.clone().map(Region::new); - - let config = ProviderConfig::default().with_region(region); - - let credentials = ProfileFileCredentialsProvider::builder() - .configure(&config) - .profile_name(&self.name) - .build(); - - let c = credentials.provide_credentials().await.map_err(|source| { - crate::Error::Generic { - store: "S3", - source: Box::new(source), - } - })?; - let t_now = SystemTime::now(); - let expiry = c - .expiry() - .and_then(|e| e.duration_since(t_now).ok()) - .map(|ttl| Instant::now() + ttl); - - Ok(TemporaryToken { - token: Arc::new(AwsCredential { - key_id: c.access_key_id().to_string(), - secret_key: c.secret_access_key().to_string(), - token: c.session_token().map(ToString::to_string), - }), - expiry, + type Credential = AwsCredential; + + async fn get_credential(&self) -> Result> { + self.cache + .get_or_insert_with(move || async move { + let region = self.region.clone().map(Region::new); + + let config = ProviderConfig::default().with_region(region); + + let credentials = ProfileFileCredentialsProvider::builder() + .configure(&config) + .profile_name(&self.name) + .build(); + + let c = credentials.provide_credentials().await.map_err(|source| { + crate::Error::Generic { + store: 
"S3", + source: Box::new(source), + } + })?; + let t_now = SystemTime::now(); + let expiry = c + .expiry() + .and_then(|e| e.duration_since(t_now).ok()) + .map(|ttl| Instant::now() + ttl); + + Ok(TemporaryToken { + token: Arc::new(AwsCredential { + key_id: c.access_key_id().to_string(), + secret_key: c.secret_access_key().to_string(), + token: c.session_token().map(ToString::to_string), + }), + expiry, + }) }) - })) + .await } } diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 893e261fea64..5f165c007947 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use super::credential::{AzureCredential, CredentialProvider}; +use super::credential::AzureCredential; use crate::azure::credential::*; -use crate::azure::STORE; +use crate::azure::{AzureCredentialProvider, STORE}; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; @@ -40,6 +40,7 @@ use reqwest::{ use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::collections::HashMap; +use std::sync::Arc; use url::Url; /// A specialized `Error` for object store-related errors @@ -101,10 +102,10 @@ impl From for crate::Error { /// Configuration for [AzureClient] #[derive(Debug)] -pub struct AzureConfig { +pub(crate) struct AzureConfig { pub account: String, pub container: String, - pub credentials: CredentialProvider, + pub credentials: AzureCredentialProvider, pub retry_config: RetryConfig, pub service: Url, pub is_emulator: bool, @@ -143,45 +144,8 @@ impl AzureClient { &self.config } - async fn get_credential(&self) -> Result { - match &self.config.credentials { - CredentialProvider::AccessKey(key) => { - Ok(AzureCredential::AccessKey(key.to_owned())) - } - CredentialProvider::BearerToken(token) => { - Ok(AzureCredential::AuthorizationToken( - // we do the conversion to a HeaderValue here, since it is fallible - // and we want to use it in an infallible function - HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { - crate::Error::Generic { - store: STORE, - source: Box::new(err), - } - })?, - )) - } - CredentialProvider::TokenCredential(cache, cred) => { - let token = cache - .get_or_insert_with(|| { - cred.fetch_token(&self.client, &self.config.retry_config) - }) - .await - .context(AuthorizationSnafu)?; - Ok(AzureCredential::AuthorizationToken( - // we do the conversion to a HeaderValue here, since it is fallible - // and we want to use it in an infallible function - HeaderValue::from_str(&format!("Bearer {token}")).map_err(|err| { - crate::Error::Generic { - store: STORE, - source: Box::new(err), - } - })?, - )) - } - CredentialProvider::SASToken(sas) => { - Ok(AzureCredential::SASToken(sas.clone())) - } - } + async fn get_credential(&self) -> Result> { + self.config.credentials.get_credential().await } /// Make an Azure PUT request @@ -308,7 +272,7 @@ impl AzureClient { // If using SAS authorization must include the headers in the URL // - if let AzureCredential::SASToken(pairs) = &credential { + if let AzureCredential::SASToken(pairs) = credential.as_ref() { source.query_pairs_mut().extend_pairs(pairs); } diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index 8130df6361fd..fd75389249b0 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -15,10 +15,13 @@ // specific language governing 
permissions and limitations // under the License. +use crate::azure::STORE; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; +use crate::client::{CredentialProvider, TokenProvider}; use crate::util::hmac_sha256; use crate::RetryConfig; +use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use chrono::{DateTime, Utc}; @@ -36,6 +39,7 @@ use snafu::{ResultExt, Snafu}; use std::borrow::Cow; use std::process::Command; use std::str; +use std::sync::Arc; use std::time::{Duration, Instant}; use url::Url; @@ -81,19 +85,30 @@ pub enum Error { pub type Result = std::result::Result; -/// Provides credentials for use when signing requests -#[derive(Debug)] -pub enum CredentialProvider { - AccessKey(String), - BearerToken(String), - SASToken(Vec<(String, String)>), - TokenCredential(TokenCache, Box), +impl From for crate::Error { + fn from(value: Error) -> Self { + Self::Generic { + store: STORE, + source: Box::new(value), + } + } } -pub(crate) enum AzureCredential { +/// An Azure storage credential +#[derive(Debug, Eq, PartialEq)] +pub enum AzureCredential { + /// A shared access key + /// + /// AccessKey(String), + /// A shared access signature + /// + /// SASToken(Vec<(String, String)>), - AuthorizationToken(HeaderValue), + /// An authorization token + /// + /// + BearerToken(String), } /// A list of known Azure authority hosts @@ -155,9 +170,7 @@ impl CredentialExt for RequestBuilder { Self::from_parts(client, request) } - AzureCredential::AuthorizationToken(token) => { - self.header(AUTHORIZATION, token) - } + AzureCredential::BearerToken(token) => self.bearer_auth(token), AzureCredential::SASToken(query_pairs) => self.query(&query_pairs), } } @@ -291,15 +304,6 @@ fn lexy_sort<'a>( values } -#[async_trait::async_trait] -pub trait TokenCredential: std::fmt::Debug + Send + Sync + 'static { - async fn fetch_token( - &self, - client: &Client, - retry: &RetryConfig, - ) -> Result>; -} - #[derive(Deserialize, Debug)] struct TokenResponse { access_token: String, @@ -338,13 +342,15 @@ impl ClientSecretOAuthProvider { } #[async_trait::async_trait] -impl TokenCredential for ClientSecretOAuthProvider { +impl TokenProvider for ClientSecretOAuthProvider { + type Credential = AzureCredential; + /// Fetch a token async fn fetch_token( &self, client: &Client, retry: &RetryConfig, - ) -> Result> { + ) -> crate::Result>> { let response: TokenResponse = client .request(Method::POST, &self.token_url) .header(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)) @@ -361,12 +367,10 @@ impl TokenCredential for ClientSecretOAuthProvider { .await .context(TokenResponseBodySnafu)?; - let token = TemporaryToken { - token: response.access_token, + Ok(TemporaryToken { + token: Arc::new(AzureCredential::BearerToken(response.access_token)), expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), - }; - - Ok(token) + }) } } @@ -397,7 +401,6 @@ pub struct ImdsManagedIdentityProvider { client_id: Option, object_id: Option, msi_res_id: Option, - client: Client, } impl ImdsManagedIdentityProvider { @@ -407,7 +410,6 @@ impl ImdsManagedIdentityProvider { object_id: Option, msi_res_id: Option, msi_endpoint: Option, - client: Client, ) -> Self { let msi_endpoint = msi_endpoint.unwrap_or_else(|| { "http://169.254.169.254/metadata/identity/oauth2/token".to_owned() @@ -418,19 +420,20 @@ impl ImdsManagedIdentityProvider { client_id, object_id, msi_res_id, - client, } } } #[async_trait::async_trait] -impl TokenCredential for 
ImdsManagedIdentityProvider { +impl TokenProvider for ImdsManagedIdentityProvider { + type Credential = AzureCredential; + /// Fetch a token async fn fetch_token( &self, - _client: &Client, + client: &Client, retry: &RetryConfig, - ) -> Result> { + ) -> crate::Result>> { let mut query_items = vec![ ("api-version", MSI_API_VERSION), ("resource", AZURE_STORAGE_RESOURCE), @@ -450,8 +453,7 @@ impl TokenCredential for ImdsManagedIdentityProvider { query_items.push((key, value)); } - let mut builder = self - .client + let mut builder = client .request(Method::GET, &self.msi_endpoint) .header("metadata", "true") .query(&query_items); @@ -468,12 +470,10 @@ impl TokenCredential for ImdsManagedIdentityProvider { .await .context(TokenResponseBodySnafu)?; - let token = TemporaryToken { - token: response.access_token, + Ok(TemporaryToken { + token: Arc::new(AzureCredential::BearerToken(response.access_token)), expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), - }; - - Ok(token) + }) } } @@ -511,13 +511,15 @@ impl WorkloadIdentityOAuthProvider { } #[async_trait::async_trait] -impl TokenCredential for WorkloadIdentityOAuthProvider { +impl TokenProvider for WorkloadIdentityOAuthProvider { + type Credential = AzureCredential; + /// Fetch a token async fn fetch_token( &self, client: &Client, retry: &RetryConfig, - ) -> Result> { + ) -> crate::Result>> { let token_str = std::fs::read_to_string(&self.federated_token_file) .map_err(|_| Error::FederatedTokenFile)?; @@ -542,12 +544,10 @@ impl TokenCredential for WorkloadIdentityOAuthProvider { .await .context(TokenResponseBodySnafu)?; - let token = TemporaryToken { - token: response.access_token, + Ok(TemporaryToken { + token: Arc::new(AzureCredential::BearerToken(response.access_token)), expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), - }; - - Ok(token) + }) } } @@ -585,23 +585,16 @@ struct AzureCliTokenResponse { #[derive(Default, Debug)] pub struct AzureCliCredential { - _private: (), + cache: TokenCache>, } impl AzureCliCredential { pub fn new() -> Self { Self::default() } -} -#[async_trait::async_trait] -impl TokenCredential for AzureCliCredential { /// Fetch a token - async fn fetch_token( - &self, - _client: &Client, - _retry: &RetryConfig, - ) -> Result> { + async fn fetch_token(&self) -> Result>> { // on window az is a cmd and it should be called like this // see https://doc.rust-lang.org/nightly/std/process/struct.Command.html let program = if cfg!(target_os = "windows") { @@ -642,7 +635,9 @@ impl TokenCredential for AzureCliCredential { let duration = token_response.expires_on.naive_local() - chrono::Local::now().naive_local(); Ok(TemporaryToken { - token: token_response.access_token, + token: Arc::new(AzureCredential::BearerToken( + token_response.access_token, + )), expiry: Some( Instant::now() + duration.to_std().map_err(|_| Error::AzureCli { @@ -669,6 +664,15 @@ impl TokenCredential for AzureCliCredential { } } +#[async_trait] +impl CredentialProvider for AzureCliCredential { + type Credential = AzureCredential; + + async fn get_credential(&self) -> crate::Result> { + Ok(self.cache.get_or_insert_with(|| self.fetch_token()).await?) 
+ } +} + #[cfg(test)] mod tests { use super::*; @@ -723,7 +727,6 @@ mod tests { None, None, Some(format!("{endpoint}/metadata/identity/oauth2/token")), - client.clone(), ); let token = credential @@ -731,7 +734,10 @@ mod tests { .await .unwrap(); - assert_eq!(&token.token, "TOKEN"); + assert_eq!( + token.token.as_ref(), + &AzureCredential::BearerToken("TOKEN".into()) + ); } #[tokio::test] @@ -779,6 +785,9 @@ mod tests { .await .unwrap(); - assert_eq!(&token.token, "TOKEN"); + assert_eq!( + token.token.as_ref(), + &AzureCredential::BearerToken("TOKEN".into()) + ); } } diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 0f8dae00c6c0..6dc14cfb54e9 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -27,7 +27,6 @@ //! a way to drop old blocks. Instead unused blocks are automatically cleaned up //! after 7 days. use self::client::{BlockId, BlockList}; -use crate::client::token::TokenCache; use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::Path, @@ -49,14 +48,20 @@ use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; use url::Url; +use crate::azure::credential::AzureCredential; use crate::client::header::header_meta; -use crate::client::ClientConfigKey; +use crate::client::{ + ClientConfigKey, CredentialProvider, StaticCredentialProvider, + TokenCredentialProvider, +}; use crate::config::ConfigValue; pub use credential::authority_hosts; mod client; mod credential; +type AzureCredentialProvider = Arc>; + const STORE: &str = "MicrosoftAzure"; /// The well-known account used by Azurite and the legacy Azure Storage Emulator. @@ -101,12 +106,6 @@ enum Error { #[snafu(display("Container name must be specified"))] MissingContainerName {}, - #[snafu(display("At least one authorization option must be specified"))] - MissingCredentials {}, - - #[snafu(display("Azure credential error: {}", source), context(false))] - Credential { source: credential::Error }, - #[snafu(display( "Unknown url scheme cannot be parsed into storage location: {}", scheme @@ -913,6 +912,9 @@ impl MicrosoftAzureBuilder { } let container = self.container_name.ok_or(Error::MissingContainerName {})?; + let static_creds = |credential: AzureCredential| -> AzureCredentialProvider { + Arc::new(StaticCredentialProvider::new(credential)) + }; let (is_emulator, storage_url, auth, account) = if self.use_emulator.get()? 
{ let account_name = self @@ -924,7 +926,8 @@ impl MicrosoftAzureBuilder { let account_key = self .access_key .unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); - let credential = credential::CredentialProvider::AccessKey(account_key); + + let credential = static_creds(AzureCredential::AccessKey(account_key)); self.client_options = self.client_options.with_allow_http(true); (true, url, credential, account_name) @@ -933,10 +936,11 @@ impl MicrosoftAzureBuilder { let account_url = format!("https://{}.blob.core.windows.net", &account_name); let url = Url::parse(&account_url) .context(UnableToParseUrlSnafu { url: account_url })?; + let credential = if let Some(bearer_token) = self.bearer_token { - credential::CredentialProvider::BearerToken(bearer_token) + static_creds(AzureCredential::BearerToken(bearer_token)) } else if let Some(access_key) = self.access_key { - credential::CredentialProvider::AccessKey(access_key) + static_creds(AzureCredential::AccessKey(access_key)) } else if let (Some(client_id), Some(tenant_id), Some(federated_token_file)) = (&self.client_id, &self.tenant_id, self.federated_token_file) { @@ -946,10 +950,11 @@ impl MicrosoftAzureBuilder { tenant_id, self.authority_host, ); - credential::CredentialProvider::TokenCredential( - TokenCache::default(), - Box::new(client_credential), - ) + Arc::new(TokenCredentialProvider::new( + client_credential, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = (&self.client_id, self.client_secret, &self.tenant_id) { @@ -959,33 +964,29 @@ impl MicrosoftAzureBuilder { tenant_id, self.authority_host, ); - credential::CredentialProvider::TokenCredential( - TokenCache::default(), - Box::new(client_credential), - ) + Arc::new(TokenCredentialProvider::new( + client_credential, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ } else if let Some(query_pairs) = self.sas_query_pairs { - credential::CredentialProvider::SASToken(query_pairs) + static_creds(AzureCredential::SASToken(query_pairs)) } else if let Some(sas) = self.sas_key { - credential::CredentialProvider::SASToken(split_sas(&sas)?) + static_creds(AzureCredential::SASToken(split_sas(&sas)?)) } else if self.use_azure_cli.get()? 
{ - credential::CredentialProvider::TokenCredential( - TokenCache::default(), - Box::new(credential::AzureCliCredential::new()), - ) + Arc::new(credential::AzureCliCredential::new()) as _ } else { - let client = - self.client_options.clone().with_allow_http(true).client()?; let msi_credential = credential::ImdsManagedIdentityProvider::new( self.client_id, self.object_id, self.msi_resource_id, self.msi_endpoint, - client, ); - credential::CredentialProvider::TokenCredential( - TokenCache::default(), - Box::new(msi_credential), - ) + Arc::new(TokenCredentialProvider::new( + msi_credential, + self.client_options.clone().with_allow_http(true).client()?, + self.retry_config.clone(), + )) as _ }; (false, url, credential, account_name) }; diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index c6a73fe7a618..292e4678fd69 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -32,17 +32,20 @@ pub mod header; #[cfg(any(feature = "aws", feature = "gcp"))] pub mod list; +use async_trait::async_trait; use std::collections::HashMap; use std::str::FromStr; +use std::sync::Arc; use std::time::Duration; use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, ClientBuilder, Proxy, RequestBuilder}; use serde::{Deserialize, Serialize}; +use crate::client::token::{TemporaryToken, TokenCache}; use crate::config::{fmt_duration, ConfigValue}; use crate::path::Path; -use crate::GetOptions; +use crate::{GetOptions, Result, RetryConfig}; fn map_client_error(e: reqwest::Error) -> super::Error { super::Error::Generic { @@ -503,6 +506,90 @@ impl GetOptionsExt for RequestBuilder { } } +/// Provides credentials for use when signing requests +#[async_trait] +pub trait CredentialProvider: std::fmt::Debug + Send + Sync { + type Credential; + + async fn get_credential(&self) -> Result>; +} + +/// A static set of credentials +#[derive(Debug)] +pub struct StaticCredentialProvider { + credential: Arc, +} + +impl StaticCredentialProvider { + pub fn new(credential: T) -> Self { + Self { + credential: Arc::new(credential), + } + } +} + +#[async_trait] +impl CredentialProvider for StaticCredentialProvider +where + T: std::fmt::Debug + Send + Sync, +{ + type Credential = T; + + async fn get_credential(&self) -> Result> { + Ok(Arc::clone(&self.credential)) + } +} + +#[cfg(any(feature = "aws", feature = "azure", feature = "gcp"))] +mod cloud { + use super::*; + + /// A [`CredentialProvider`] that uses [`Client`] to fetch temporary tokens + #[derive(Debug)] + pub struct TokenCredentialProvider { + inner: T, + client: Client, + retry: RetryConfig, + cache: TokenCache>, + } + + impl TokenCredentialProvider { + pub fn new(inner: T, client: Client, retry: RetryConfig) -> Self { + Self { + inner, + client, + retry, + cache: Default::default(), + } + } + } + + #[async_trait] + impl CredentialProvider for TokenCredentialProvider { + type Credential = T::Credential; + + async fn get_credential(&self) -> Result> { + self.cache + .get_or_insert_with(|| self.inner.fetch_token(&self.client, &self.retry)) + .await + } + } + + #[async_trait] + pub trait TokenProvider: std::fmt::Debug + Send + Sync { + type Credential: std::fmt::Debug + Send + Sync; + + async fn fetch_token( + &self, + client: &Client, + retry: &RetryConfig, + ) -> Result>>; + } +} + +#[cfg(any(feature = "aws", feature = "azure", feature = "gcp"))] +pub use cloud::*; + #[cfg(test)] mod tests { use super::*; diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index 
057e013334ed..ad12855e19ef 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -17,6 +17,9 @@ use crate::client::retry::RetryExt; use crate::client::token::TemporaryToken; +use crate::client::{TokenCredentialProvider, TokenProvider}; +use crate::gcp::credential::Error::UnsupportedCredentialsType; +use crate::gcp::{GcpCredentialProvider, STORE}; use crate::ClientOptions; use crate::RetryConfig; use async_trait::async_trait; @@ -30,6 +33,7 @@ use std::env; use std::fs::File; use std::io::BufReader; use std::path::{Path, PathBuf}; +use std::sync::Arc; use std::time::{Duration, Instant}; use tracing::info; @@ -67,9 +71,21 @@ pub enum Error { #[snafu(display("Unsupported ApplicationCredentials type: {}", type_))] UnsupportedCredentialsType { type_: String }, +} + +impl From for crate::Error { + fn from(value: Error) -> Self { + Self::Generic { + store: STORE, + source: Box::new(value), + } + } +} - #[snafu(display("Error creating client: {}", source))] - Client { source: crate::Error }, +#[derive(Debug, Eq, PartialEq)] +pub struct GcpCredential { + /// An HTTP bearer token + pub bearer: String, } pub type Result = std::result::Result; @@ -127,15 +143,6 @@ struct TokenResponse { expires_in: u64, } -#[async_trait] -pub trait TokenProvider: std::fmt::Debug + Send + Sync { - async fn fetch_token( - &self, - client: &Client, - retry: &RetryConfig, - ) -> Result>; -} - /// Encapsulates the logic to perform an OAuth token challenge #[derive(Debug)] pub struct OAuthProvider { @@ -174,12 +181,14 @@ impl OAuthProvider { #[async_trait] impl TokenProvider for OAuthProvider { + type Credential = GcpCredential; + /// Fetch a fresh token async fn fetch_token( &self, client: &Client, retry: &RetryConfig, - ) -> Result> { + ) -> crate::Result>> { let now = seconds_since_epoch(); let exp = now + 3600; @@ -221,12 +230,12 @@ impl TokenProvider for OAuthProvider { .await .context(TokenResponseBodySnafu)?; - let token = TemporaryToken { - token: response.access_token, + Ok(TemporaryToken { + token: Arc::new(GcpCredential { + bearer: response.access_token, + }), expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), - }; - - Ok(token) + }) } } @@ -281,17 +290,17 @@ impl ServiceAccountCredentials { } /// Create an [`OAuthProvider`] from this credentials struct. - pub fn token_provider( + pub fn oauth_provider( self, scope: &str, audience: &str, - ) -> Result> { - Ok(Box::new(OAuthProvider::new( + ) -> crate::Result { + Ok(OAuthProvider::new( self.client_email, self.private_key, scope.to_string(), audience.to_string(), - )?) as Box) + )?) } } @@ -329,23 +338,14 @@ fn b64_encode_obj(obj: &T) -> Result { #[derive(Debug, Default)] pub struct InstanceCredentialProvider { audience: String, - client: Client, } impl InstanceCredentialProvider { /// Create a new [`InstanceCredentialProvider`], we need to control the client in order to enable http access so save the options. 
- pub fn new>( - audience: T, - client_options: ClientOptions, - ) -> Result { - client_options - .with_allow_http(true) - .client() - .map(|client| Self { - audience: audience.into(), - client, - }) - .context(ClientSnafu) + pub fn new>(audience: T) -> Self { + Self { + audience: audience.into(), + } } } @@ -355,7 +355,7 @@ async fn make_metadata_request( hostname: &str, retry: &RetryConfig, audience: &str, -) -> Result { +) -> crate::Result { let url = format!( "http://{hostname}/computeMetadata/v1/instance/service-accounts/default/token" ); @@ -374,30 +374,29 @@ async fn make_metadata_request( #[async_trait] impl TokenProvider for InstanceCredentialProvider { + type Credential = GcpCredential; + /// Fetch a token from the metadata server. /// Since the connection is local we need to enable http access and don't actually use the client object passed in. async fn fetch_token( &self, - _client: &Client, + client: &Client, retry: &RetryConfig, - ) -> Result> { + ) -> crate::Result>> { const METADATA_IP: &str = "169.254.169.254"; const METADATA_HOST: &str = "metadata"; info!("fetching token from metadata server"); let response = - make_metadata_request(&self.client, METADATA_HOST, retry, &self.audience) + make_metadata_request(client, METADATA_HOST, retry, &self.audience) .or_else(|_| { - make_metadata_request( - &self.client, - METADATA_IP, - retry, - &self.audience, - ) + make_metadata_request(client, METADATA_IP, retry, &self.audience) }) .await?; let token = TemporaryToken { - token: response.access_token, + token: Arc::new(GcpCredential { + bearer: response.access_token, + }), expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), }; Ok(token) @@ -406,31 +405,35 @@ impl TokenProvider for InstanceCredentialProvider { /// ApplicationDefaultCredentials /// -#[derive(Debug)] -pub enum ApplicationDefaultCredentials { - /// - AuthorizedUser { - client_id: String, - client_secret: String, - refresh_token: String, - }, -} - -impl ApplicationDefaultCredentials { - pub fn new(path: Option<&str>) -> Result, Error> { - let file = match ApplicationDefaultCredentialsFile::read(path)? { - Some(f) => f, - None => return Ok(None), - }; - - Ok(Some(match file.type_.as_str() { - "authorized_user" => Self::AuthorizedUser { +pub fn application_default_credentials( + path: Option<&str>, + client: &ClientOptions, + retry: &RetryConfig, +) -> crate::Result> { + let file = match ApplicationDefaultCredentialsFile::read(path)? 
{ + Some(x) => x, + None => return Ok(None), + }; + + match file.type_.as_str() { + // + "authorized_user" => { + let token = AuthorizedUserCredentials { client_id: file.client_id, client_secret: file.client_secret, refresh_token: file.refresh_token, - }, - type_ => return UnsupportedCredentialsTypeSnafu { type_ }.fail(), - })) + }; + + Ok(Some(Arc::new(TokenCredentialProvider::new( + token, + client.client()?, + retry.clone(), + )))) + } + type_ => Err(UnsupportedCredentialsType { + type_: type_.to_string(), + } + .into()), } } @@ -473,41 +476,43 @@ impl ApplicationDefaultCredentialsFile { const DEFAULT_TOKEN_GCP_URI: &str = "https://accounts.google.com/o/oauth2/token"; +/// +#[derive(Debug)] +struct AuthorizedUserCredentials { + client_id: String, + client_secret: String, + refresh_token: String, +} + #[async_trait] -impl TokenProvider for ApplicationDefaultCredentials { +impl TokenProvider for AuthorizedUserCredentials { + type Credential = GcpCredential; + async fn fetch_token( &self, client: &Client, retry: &RetryConfig, - ) -> Result, Error> { - let builder = client.request(Method::POST, DEFAULT_TOKEN_GCP_URI); - let builder = match self { - Self::AuthorizedUser { - client_id, - client_secret, - refresh_token, - } => { - let body = [ - ("grant_type", "refresh_token"), - ("client_id", client_id), - ("client_secret", client_secret), - ("refresh_token", refresh_token), - ]; - builder.form(&body) - } - }; - - let response = builder + ) -> crate::Result>> { + let response = client + .request(Method::POST, DEFAULT_TOKEN_GCP_URI) + .form(&[ + ("grant_type", "refresh_token"), + ("client_id", &self.client_id), + ("client_secret", &self.client_secret), + ("refresh_token", &self.refresh_token), + ]) .send_retry(retry) .await .context(TokenRequestSnafu)? 
.json::() .await .context(TokenResponseBodySnafu)?; - let token = TemporaryToken { - token: response.access_token, + + Ok(TemporaryToken { + token: Arc::new(GcpCredential { + bearer: response.access_token, + }), expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), - }; - Ok(token) + }) } } diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 32f4055f1178..6813bbf6ecf7 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -48,9 +48,12 @@ use crate::client::header::header_meta; use crate::client::list::ListResponse; use crate::client::pagination::stream_paginated; use crate::client::retry::RetryExt; -use crate::client::{ClientConfigKey, GetOptionsExt}; +use crate::client::{ + ClientConfigKey, CredentialProvider, GetOptionsExt, StaticCredentialProvider, + TokenCredentialProvider, +}; +use crate::gcp::credential::{application_default_credentials, GcpCredential}; use crate::{ - client::token::TokenCache, multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::{Path, DELIMITER}, util::format_prefix, @@ -59,14 +62,15 @@ use crate::{ }; use self::credential::{ - default_gcs_base_url, ApplicationDefaultCredentials, InstanceCredentialProvider, - ServiceAccountCredentials, TokenProvider, + default_gcs_base_url, InstanceCredentialProvider, ServiceAccountCredentials, }; mod credential; const STORE: &str = "GCS"; +type GcpCredentialProvider = Arc>; + #[derive(Debug, Snafu)] enum Error { #[snafu(display("Got invalid XML response for {} {}: {}", method, url, source))] @@ -119,9 +123,6 @@ enum Error { #[snafu(display("Missing bucket name"))] MissingBucketName {}, - #[snafu(display("Could not find either metadata credentials or configuration properties to initialize GCS credentials."))] - MissingCredentials, - #[snafu(display( "One of service account path or service account key may be provided." ))] @@ -209,8 +210,7 @@ struct GoogleCloudStorageClient { client: Client, base_url: String, - token_provider: Option>>, - token_cache: TokenCache, + credentials: GcpCredentialProvider, bucket_name: String, bucket_name_encoded: String, @@ -223,18 +223,8 @@ struct GoogleCloudStorageClient { } impl GoogleCloudStorageClient { - async fn get_token(&self) -> Result { - if let Some(token_provider) = &self.token_provider { - Ok(self - .token_cache - .get_or_insert_with(|| { - token_provider.fetch_token(&self.client, &self.retry_config) - }) - .await - .context(CredentialSnafu)?) 
- } else { - Ok("".to_owned()) - } + async fn get_credential(&self) -> Result> { + self.credentials.get_credential().await } fn object_url(&self, path: &Path) -> String { @@ -249,7 +239,7 @@ impl GoogleCloudStorageClient { options: GetOptions, head: bool, ) -> Result { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = self.object_url(path); let method = match head { @@ -260,7 +250,7 @@ impl GoogleCloudStorageClient { let response = self .client .request(method, url) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .with_get_options(options) .send_retry(&self.retry_config) .await @@ -273,7 +263,7 @@ impl GoogleCloudStorageClient { /// Perform a put request async fn put_request(&self, path: &Path, payload: Bytes) -> Result<()> { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = self.object_url(path); let content_type = self @@ -283,7 +273,7 @@ impl GoogleCloudStorageClient { self.client .request(Method::PUT, url) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, payload.len()) .body(payload) @@ -298,7 +288,7 @@ impl GoogleCloudStorageClient { /// Initiate a multi-part upload async fn multipart_initiate(&self, path: &Path) -> Result { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, path); let content_type = self @@ -309,7 +299,7 @@ impl GoogleCloudStorageClient { let response = self .client .request(Method::POST, &url) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, "0") .query(&[("uploads", "")]) @@ -338,12 +328,12 @@ impl GoogleCloudStorageClient { path: &str, multipart_id: &MultipartId, ) -> Result<()> { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, path); self.client .request(Method::DELETE, &url) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .header(header::CONTENT_TYPE, "application/octet-stream") .header(header::CONTENT_LENGTH, "0") .query(&[("uploadId", multipart_id)]) @@ -356,12 +346,12 @@ impl GoogleCloudStorageClient { /// Perform a delete request async fn delete_request(&self, path: &Path) -> Result<()> { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = self.object_url(path); let builder = self.client.request(Method::DELETE, url); builder - .bearer_auth(token) + .bearer_auth(&credential.bearer) .send_retry(&self.retry_config) .await .context(DeleteRequestSnafu { @@ -378,7 +368,7 @@ impl GoogleCloudStorageClient { to: &Path, if_not_exists: bool, ) -> Result<()> { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = self.object_url(to); let from = utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC); @@ -394,7 +384,7 @@ impl GoogleCloudStorageClient { } builder - .bearer_auth(token) + .bearer_auth(&credential.bearer) // Needed if reqwest is compiled with native-tls instead of rustls-tls // See https://github.com/apache/arrow-rs/pull/3921 .header(header::CONTENT_LENGTH, 0) @@ -418,7 +408,7 @@ impl GoogleCloudStorageClient { delimiter: bool, page_token: Option<&str>, ) -> Result { - let token = self.get_token().await?; + let credential = self.get_credential().await?; let url = format!("{}/{}", 
self.base_url, self.bucket_name_encoded); let mut query = Vec::with_capacity(5); @@ -443,7 +433,7 @@ impl GoogleCloudStorageClient { .client .request(Method::GET, url) .query(&query) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .send_retry(&self.retry_config) .await .context(ListRequestSnafu)? @@ -495,9 +485,9 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { self.client.base_url, self.client.bucket_name_encoded, self.encoded_path ); - let token = self + let credential = self .client - .get_token() + .get_credential() .await .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; @@ -505,7 +495,7 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { .client .client .request(Method::PUT, &url) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .query(&[ ("partNumber", format!("{}", part_idx + 1)), ("uploadId", upload_id), @@ -549,9 +539,9 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { }) .collect(); - let token = self + let credential = self .client - .get_token() + .get_credential() .await .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; @@ -567,7 +557,7 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { self.client .client .request(Method::POST, &url) - .bearer_auth(token) + .bearer_auth(&credential.bearer) .query(&[("uploadId", upload_id)]) .body(data) .send_retry(&self.client.retry_config) @@ -1062,10 +1052,11 @@ impl GoogleCloudStorageBuilder { }; // Then try to initialize from the application credentials file, or the environment. - let application_default_credentials = ApplicationDefaultCredentials::new( + let application_default_credentials = application_default_credentials( self.application_credentials_path.as_deref(), - ) - .context(CredentialSnafu)?; + &self.client_options, + &self.retry_config, + )?; let disable_oauth = service_account_credentials .as_ref() @@ -1081,29 +1072,24 @@ impl GoogleCloudStorageBuilder { let scope = "https://www.googleapis.com/auth/devstorage.full_control"; let audience = "https://www.googleapis.com/oauth2/v4/token"; - let token_provider = if disable_oauth { - None + let credentials = if disable_oauth { + Arc::new(StaticCredentialProvider::new(GcpCredential { + bearer: "".to_string(), + })) as _ + } else if let Some(credentials) = service_account_credentials { + Arc::new(TokenCredentialProvider::new( + credentials.oauth_provider(scope, audience)?, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } else if let Some(credentials) = application_default_credentials { + credentials } else { - let best_provider = if let Some(credentials) = service_account_credentials { - Some( - credentials - .token_provider(scope, audience) - .context(CredentialSnafu)?, - ) - } else if let Some(credentials) = application_default_credentials { - Some(Box::new(credentials) as Box) - } else { - Some(Box::new( - InstanceCredentialProvider::new( - audience, - self.client_options.clone(), - ) - .context(CredentialSnafu)?, - ) as Box) - }; - - // A provider is required at this point, bail out if we don't have one. - Some(best_provider.ok_or(Error::MissingCredentials)?) 
+ Arc::new(TokenCredentialProvider::new( + InstanceCredentialProvider::new(audience), + self.client_options.clone().with_allow_http(true).client()?, + self.retry_config.clone(), + )) as _ }; let encoded_bucket_name = @@ -1113,8 +1099,7 @@ impl GoogleCloudStorageBuilder { client: Arc::new(GoogleCloudStorageClient { client, base_url: gcs_base_url, - token_provider: token_provider.map(Arc::new), - token_cache: Default::default(), + credentials, bucket_name, bucket_name_encoded: encoded_bucket_name, retry_config: self.retry_config, From a21ac9c15f3557ba8250720c978586c5294678a1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 17 May 2023 12:49:24 +0100 Subject: [PATCH 0907/1411] Fix clippy lints (#4233) --- arrow-buffer/src/buffer/immutable.rs | 2 +- arrow-buffer/src/buffer/mutable.rs | 3 +-- arrow-buffer/src/native.rs | 4 +--- arrow-data/src/data/mod.rs | 2 ++ arrow-data/src/transform/utils.rs | 2 +- arrow-schema/src/ffi.rs | 17 +++++++++++------ 6 files changed, 17 insertions(+), 13 deletions(-) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 15d9ff7838c6..a4ab64b84e0c 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -80,7 +80,7 @@ impl Buffer { /// Initializes a [Buffer] from a slice of items. pub fn from_slice_ref>(items: T) -> Self { let slice = items.as_ref(); - let capacity = slice.len() * std::mem::size_of::(); + let capacity = std::mem::size_of_val(slice); let mut buffer = MutableBuffer::with_capacity(capacity); buffer.extend_from_slice(slice); buffer.into() diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 43c1cd004c92..3e66e7f23fa2 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -383,8 +383,7 @@ impl MutableBuffer { /// ``` #[inline] pub fn extend_from_slice(&mut self, items: &[T]) { - let len = items.len(); - let additional = len * std::mem::size_of::(); + let additional = mem::size_of_val(items); self.reserve(additional); unsafe { // this assumes that `[ToByteSlice]` can be copied directly diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs index 4ea06974bb0b..8fe6cf2b7894 100644 --- a/arrow-buffer/src/native.rs +++ b/arrow-buffer/src/native.rs @@ -223,9 +223,7 @@ impl ToByteSlice for [T] { #[inline] fn to_byte_slice(&self) -> &[u8] { let raw_ptr = self.as_ptr() as *const T as *const u8; - unsafe { - std::slice::from_raw_parts(raw_ptr, self.len() * std::mem::size_of::()) - } + unsafe { std::slice::from_raw_parts(raw_ptr, std::mem::size_of_val(self)) } } } diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index 10bf973065a0..103161f5a80d 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -1976,6 +1976,7 @@ mod tests { assert!(!int_data.ptr_eq(&float_data)); assert!(int_data.ptr_eq(&int_data)); + #[allow(clippy::redundant_clone)] let int_data_clone = int_data.clone(); assert_eq!(int_data, int_data_clone); assert!(int_data.ptr_eq(&int_data_clone)); @@ -2003,6 +2004,7 @@ mod tests { assert!(string_data.ptr_eq(&string_data)); + #[allow(clippy::redundant_clone)] let string_data_cloned = string_data.clone(); assert!(string_data_cloned.ptr_eq(&string_data)); assert!(string_data.ptr_eq(&string_data_cloned)); diff --git a/arrow-data/src/transform/utils.rs b/arrow-data/src/transform/utils.rs index b1e3388ba84e..17bb87e88a5c 100644 --- a/arrow-data/src/transform/utils.rs +++ 
b/arrow-data/src/transform/utils.rs @@ -32,7 +32,7 @@ pub(super) fn extend_offsets( mut last_offset: T, offsets: &[T], ) { - buffer.reserve(offsets.len() * std::mem::size_of::()); + buffer.reserve(std::mem::size_of_val(offsets)); offsets.windows(2).for_each(|offsets| { // compute the new offset let length = offsets[1] - offsets[0]; diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 9078e35b32e3..cd3c207a56c5 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -37,20 +37,25 @@ use crate::{ ArrowError, DataType, Field, FieldRef, Schema, TimeUnit, UnionFields, UnionMode, }; -use bitflags::bitflags; use std::sync::Arc; use std::{ collections::HashMap, ffi::{c_char, c_void, CStr, CString}, }; -bitflags! { - pub struct Flags: i64 { - const DICTIONARY_ORDERED = 0b00000001; - const NULLABLE = 0b00000010; - const MAP_KEYS_SORTED = 0b00000100; +#[allow(clippy::assign_op_pattern)] +/// Workaround +mod flags { + use bitflags::bitflags; + bitflags! { + pub struct Flags: i64 { + const DICTIONARY_ORDERED = 0b00000001; + const NULLABLE = 0b00000010; + const MAP_KEYS_SORTED = 0b00000100; + } } } +pub use flags::*; /// ABI-compatible struct for `ArrowSchema` from C Data Interface /// See From 8580e858c73eab442deb74d194af31385d78c95c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 17 May 2023 13:00:37 +0100 Subject: [PATCH 0908/1411] Prefetch page index (#4090) (#4216) * Prefetch page index (#4090) * Clippy * Docs * Review feedback * Tweak docs --- parquet/src/arrow/async_reader/metadata.rs | 338 +++++++++++++++++---- parquet/src/arrow/async_reader/mod.rs | 55 +--- parquet/src/arrow/async_reader/store.rs | 55 ++-- parquet/src/file/footer.rs | 13 +- parquet/src/file/metadata.rs | 12 + 5 files changed, 340 insertions(+), 133 deletions(-) diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 7470814faa17..076ae5c54052 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -15,13 +15,216 @@ // specific language governing permissions and limitations // under the License. 
+use crate::arrow::async_reader::AsyncFileReader; use crate::errors::{ParquetError, Result}; -use crate::file::footer::{decode_footer, decode_metadata}; +use crate::file::footer::{decode_footer, read_metadata}; use crate::file::metadata::ParquetMetaData; -use bytes::{BufMut, Bytes, BytesMut}; +use crate::file::page_index::index::Index; +use crate::file::page_index::index_reader::{ + acc_range, decode_column_index, decode_offset_index, +}; +use bytes::Bytes; +use futures::future::BoxFuture; +use futures::FutureExt; use std::future::Future; +use std::io::Read; use std::ops::Range; +/// A data source that can be used with [`MetadataLoader`] to load [`ParquetMetaData`] +pub(crate) trait MetadataFetch { + fn fetch(&mut self, range: Range) -> BoxFuture<'_, Result>; +} + +impl<'a, T: AsyncFileReader> MetadataFetch for &'a mut T { + fn fetch(&mut self, range: Range) -> BoxFuture<'_, Result> { + self.get_bytes(range) + } +} + +/// An asynchronous interface to load [`ParquetMetaData`] from an async source +/// +/// Crate-private until stabilised +pub(crate) struct MetadataLoader { + /// Function that fetches byte ranges asynchronously + fetch: F, + /// The in-progress metadata + metadata: ParquetMetaData, + /// The offset and bytes of remaining unparsed data + remainder: Option<(usize, Bytes)>, +} + +impl MetadataLoader { + /// Create a new [`MetadataLoader`] by reading the footer information + /// + /// See [`fetch_parquet_metadata`] for the meaning of the individual parameters + pub async fn load( + mut fetch: F, + file_size: usize, + prefetch: Option, + ) -> Result { + if file_size < 8 { + return Err(ParquetError::EOF(format!( + "file size of {file_size} is less than footer" + ))); + } + + // If a size hint is provided, read more than the minimum size + // to try and avoid a second fetch. 
+        let footer_start = if let Some(size_hint) = prefetch {
+            file_size.saturating_sub(size_hint)
+        } else {
+            file_size - 8
+        };
+
+        let suffix = fetch.fetch(footer_start..file_size).await?;
+        let suffix_len = suffix.len();
+
+        let mut footer = [0; 8];
+        footer.copy_from_slice(&suffix[suffix_len - 8..suffix_len]);
+
+        let length = decode_footer(&footer)?;
+
+        if file_size < length + 8 {
+            return Err(ParquetError::EOF(format!(
+                "file size of {} is less than footer + metadata {}",
+                file_size,
+                length + 8
+            )));
+        }
+
+        // Did not fetch the entire file metadata in the initial read, need to make a second request
+        let (metadata, remainder) = if length > suffix_len - 8 {
+            let metadata_start = file_size - length - 8;
+            let remaining_metadata = fetch.fetch(metadata_start..footer_start).await?;
+
+            let reader = remaining_metadata.as_ref().chain(&suffix[..suffix_len - 8]);
+            (read_metadata(reader)?, None)
+        } else {
+            let metadata_start = file_size - length - 8 - footer_start;
+
+            let slice = &suffix[metadata_start..suffix_len - 8];
+            (
+                read_metadata(slice)?,
+                Some((footer_start, suffix.slice(..metadata_start))),
+            )
+        };
+
+        Ok(Self {
+            fetch,
+            metadata,
+            remainder,
+        })
+    }
+
+    /// Create a new [`MetadataLoader`] from an existing [`ParquetMetaData`]
+    pub fn new(fetch: F, metadata: ParquetMetaData) -> Self {
+        Self {
+            fetch,
+            metadata,
+            remainder: None,
+        }
+    }
+
+    /// Loads the page index, if any
+    ///
+    /// * `column_index`: if true will load column index
+    /// * `offset_index`: if true will load offset index
+    pub async fn load_page_index(
+        &mut self,
+        column_index: bool,
+        offset_index: bool,
+    ) -> Result<()> {
+        if !column_index && !offset_index {
+            return Ok(());
+        }
+
+        let mut range = None;
+        for c in self.metadata.row_groups().iter().flat_map(|r| r.columns()) {
+            range = acc_range(range, c.column_index_range());
+            range = acc_range(range, c.offset_index_range());
+        }
+        let range = match range {
+            None => return Ok(()),
+            Some(range) => range,
+        };
+
+        let data = match &self.remainder {
+            Some((remainder_start, remainder)) if *remainder_start <= range.start => {
+                let offset = range.start - *remainder_start;
+                remainder.slice(offset..range.end - *remainder_start + offset)
+            }
+            // Note: this will potentially fetch data already in remainder, this keeps things simple
+            _ => self.fetch.fetch(range.start..range.end).await?,
+        };
+
+        // Sanity check
+        assert_eq!(data.len(), range.end - range.start);
+        let offset = range.start;
+
+        if column_index {
+            let index = self
+                .metadata
+                .row_groups()
+                .iter()
+                .map(|x| {
+                    x.columns()
+                        .iter()
+                        .map(|c| match c.column_index_range() {
+                            Some(r) => decode_column_index(
+                                &data[r.start - offset..r.end - offset],
+                                c.column_type(),
+                            ),
+                            None => Ok(Index::NONE),
+                        })
+                        .collect::<Result<Vec<_>>>()
+                })
+                .collect::<Result<Vec<_>>>()?;
+
+            self.metadata.set_column_index(Some(index));
+        }
+
+        if offset_index {
+            let index = self
+                .metadata
+                .row_groups()
+                .iter()
+                .map(|x| {
+                    x.columns()
+                        .iter()
+                        .map(|c| match c.offset_index_range() {
+                            Some(r) => decode_offset_index(
+                                &data[r.start - offset..r.end - offset],
+                            ),
+                            None => Err(general_err!("missing offset index")),
+                        })
+                        .collect::<Result<Vec<_>>>()
+                })
+                .collect::<Result<Vec<_>>>()?;
+
+            self.metadata.set_offset_index(Some(index));
+        }
+
+        Ok(())
+    }
+
+    /// Returns the finished [`ParquetMetaData`]
+    pub fn finish(self) -> ParquetMetaData {
+        self.metadata
+    }
+}
+
+struct MetadataFetchFn<F>(F);
+
+impl<F, Fut> MetadataFetch for MetadataFetchFn<F>
+where
+    F: FnMut(Range<usize>) -> Fut + Send,
+    Fut: Future<Output = Result<Bytes>> + Send,
+{
+    fn fetch(&mut self, range: Range<usize>) -> BoxFuture<'_, Result<Bytes>> {
+        async move { self.0(range).await }.boxed()
+    }
+}
+
 /// Fetches parquet metadata
 ///
 /// Parameters:
@@ -34,67 +237,22 @@ use std::ops::Range;
 /// the last 8 bytes to determine the footer's precise length, before
 /// issuing a second request to fetch the metadata bytes
 ///
-/// If a hint is set, this method will read the specified number of bytes
-/// in the first request, instead of 8, and only issue a second request
-/// if additional bytes are needed. This can therefore eliminate a
-/// potentially costly additional fetch operation
+/// If `prefetch` is `Some`, this will read the specified number of bytes
+/// in the first request, instead of 8, and only issue further requests
+/// if additional bytes are needed. Providing a `prefetch` hint can therefore
+/// significantly reduce the number of `fetch` requests, and consequently latency
 pub async fn fetch_parquet_metadata<F, Fut>(
-    mut fetch: F,
+    fetch: F,
     file_size: usize,
-    footer_size_hint: Option<usize>,
+    prefetch: Option<usize>,
 ) -> Result<ParquetMetaData>
 where
-    F: FnMut(Range<usize>) -> Fut,
-    Fut: Future<Output = Result<Bytes>>,
+    F: FnMut(Range<usize>) -> Fut + Send,
+    Fut: Future<Output = Result<Bytes>> + Send,
 {
-    if file_size < 8 {
-        return Err(ParquetError::EOF(format!(
-            "file size of {file_size} is less than footer"
-        )));
-    }
-
-    // If a size hint is provided, read more than the minimum size
-    // to try and avoid a second fetch.
-    let footer_start = if let Some(size_hint) = footer_size_hint {
-        file_size.saturating_sub(size_hint)
-    } else {
-        file_size - 8
-    };
-
-    let suffix = fetch(footer_start..file_size).await?;
-    let suffix_len = suffix.len();
-
-    let mut footer = [0; 8];
-    footer.copy_from_slice(&suffix[suffix_len - 8..suffix_len]);
-
-    let length = decode_footer(&footer)?;
-
-    if file_size < length + 8 {
-        return Err(ParquetError::EOF(format!(
-            "file size of {} is less than footer + metadata {}",
-            file_size,
-            length + 8
-        )));
-    }
-
-    // Did not fetch the entire file metadata in the initial read, need to make a second request
-    if length > suffix_len - 8 {
-        let metadata_start = file_size - length - 8;
-        let remaining_metadata = fetch(metadata_start..footer_start).await?;
-
-        let mut metadata = BytesMut::with_capacity(length);
-
-        metadata.put(remaining_metadata.as_ref());
-        metadata.put(&suffix[..suffix_len - 8]);
-
-        Ok(decode_metadata(metadata.as_ref())?)
-    } else {
-        let metadata_start = file_size - length - 8;
-
-        Ok(decode_metadata(
-            &suffix[metadata_start - footer_start..suffix_len - 8],
-        )?)
-    }
+    let fetch = MetadataFetchFn(fetch);
+    let loader = MetadataLoader::load(fetch, file_size, prefetch).await?;
+    Ok(loader.finish())
 }
 
 #[cfg(test)]
@@ -104,6 +262,7 @@ mod tests {
     use crate::util::test_common::file_util::get_test_file;
     use std::fs::File;
    use std::io::{Read, Seek, SeekFrom};
+    use std::sync::atomic::{AtomicUsize, Ordering};
 
     fn read_range(file: &mut File, range: Range<usize>) -> Result<Bytes> {
         file.seek(SeekFrom::Start(range.start as _))?;
@@ -120,28 +279,40 @@ mod tests {
         let reader = SerializedFileReader::new(file.try_clone().unwrap()).unwrap();
         let expected = reader.metadata().file_metadata().schema();
 
+        let fetch_count = AtomicUsize::new(0);
+
+        let mut fetch = |range| {
+            fetch_count.fetch_add(1, Ordering::SeqCst);
+            futures::future::ready(read_range(&mut file, range))
+        };
-        let mut fetch = |range| futures::future::ready(read_range(&mut file, range));
         let actual = fetch_parquet_metadata(&mut fetch, len, None).await.unwrap();
         assert_eq!(actual.file_metadata().schema(), expected);
+        assert_eq!(fetch_count.load(Ordering::SeqCst), 2);
 
         // Metadata hint too small
+        fetch_count.store(0, Ordering::SeqCst);
         let actual = fetch_parquet_metadata(&mut fetch, len, Some(10))
             .await
             .unwrap();
         assert_eq!(actual.file_metadata().schema(), expected);
+        assert_eq!(fetch_count.load(Ordering::SeqCst), 2);
 
         // Metadata hint too large
+        fetch_count.store(0, Ordering::SeqCst);
         let actual = fetch_parquet_metadata(&mut fetch, len, Some(500))
             .await
             .unwrap();
         assert_eq!(actual.file_metadata().schema(), expected);
+        assert_eq!(fetch_count.load(Ordering::SeqCst), 1);
 
         // Metadata hint exactly correct
+        fetch_count.store(0, Ordering::SeqCst);
         let actual = fetch_parquet_metadata(&mut fetch, len, Some(428))
             .await
             .unwrap();
         assert_eq!(actual.file_metadata().schema(), expected);
+        assert_eq!(fetch_count.load(Ordering::SeqCst), 1);
 
         let err = fetch_parquet_metadata(&mut fetch, 4, None)
             .await
             .unwrap_err()
@@ -155,4 +326,53 @@ mod tests {
             .to_string();
         assert_eq!(err, "Parquet error: Invalid Parquet file. 
Corrupt footer"); } + + #[tokio::test] + async fn test_page_index() { + let mut file = get_test_file("alltypes_tiny_pages.parquet"); + let len = file.len() as usize; + let fetch_count = AtomicUsize::new(0); + let mut fetch = |range| { + fetch_count.fetch_add(1, Ordering::SeqCst); + futures::future::ready(read_range(&mut file, range)) + }; + + let f = MetadataFetchFn(&mut fetch); + let mut loader = MetadataLoader::load(f, len, None).await.unwrap(); + assert_eq!(fetch_count.load(Ordering::SeqCst), 2); + loader.load_page_index(true, true).await.unwrap(); + assert_eq!(fetch_count.load(Ordering::SeqCst), 3); + let metadata = loader.finish(); + assert!(metadata.offset_index().is_some() && metadata.column_index().is_some()); + + // Prefetch just footer exactly + fetch_count.store(0, Ordering::SeqCst); + let f = MetadataFetchFn(&mut fetch); + let mut loader = MetadataLoader::load(f, len, Some(1729)).await.unwrap(); + assert_eq!(fetch_count.load(Ordering::SeqCst), 1); + loader.load_page_index(true, true).await.unwrap(); + assert_eq!(fetch_count.load(Ordering::SeqCst), 2); + let metadata = loader.finish(); + assert!(metadata.offset_index().is_some() && metadata.column_index().is_some()); + + // Prefetch more than footer but not enough + fetch_count.store(0, Ordering::SeqCst); + let f = MetadataFetchFn(&mut fetch); + let mut loader = MetadataLoader::load(f, len, Some(130649)).await.unwrap(); + assert_eq!(fetch_count.load(Ordering::SeqCst), 1); + loader.load_page_index(true, true).await.unwrap(); + assert_eq!(fetch_count.load(Ordering::SeqCst), 2); + let metadata = loader.finish(); + assert!(metadata.offset_index().is_some() && metadata.column_index().is_some()); + + // Prefetch exactly enough + fetch_count.store(0, Ordering::SeqCst); + let f = MetadataFetchFn(&mut fetch); + let mut loader = MetadataLoader::load(f, len, Some(130650)).await.unwrap(); + assert_eq!(fetch_count.load(Ordering::SeqCst), 1); + loader.load_page_index(true, true).await.unwrap(); + assert_eq!(fetch_count.load(Ordering::SeqCst), 1); + let metadata = loader.finish(); + assert!(metadata.offset_index().is_some() && metadata.column_index().is_some()); + } } diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 3d4277a831da..fb81a2b5d966 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -107,10 +107,6 @@ use crate::column::page::{PageIterator, PageReader}; use crate::errors::{ParquetError, Result}; use crate::file::footer::{decode_footer, decode_metadata}; use crate::file::metadata::{ParquetMetaData, RowGroupMetaData}; -use crate::file::page_index::index::Index; -use crate::file::page_index::index_reader::{ - acc_range, decode_column_index, decode_offset_index, -}; use crate::file::reader::{ChunkReader, Length, SerializedPageReader}; use crate::format::PageLocation; @@ -243,53 +239,10 @@ impl ArrowReaderBuilder> { && metadata.column_index().is_none() && metadata.offset_index().is_none() { - let fetch = metadata.row_groups().iter().flat_map(|r| r.columns()).fold( - None, - |a, c| { - let a = acc_range(a, c.column_index_range()); - acc_range(a, c.offset_index_range()) - }, - ); - - if let Some(fetch) = fetch { - let bytes = input.get_bytes(fetch.clone()).await?; - let get = |r: Range| { - &bytes[(r.start - fetch.start)..(r.end - fetch.start)] - }; - - let mut offset_index = Vec::with_capacity(metadata.num_row_groups()); - let mut column_index = Vec::with_capacity(metadata.num_row_groups()); - for rg in metadata.row_groups() { - let columns = 
rg.columns(); - let mut rg_offset_index = Vec::with_capacity(columns.len()); - let mut rg_column_index = Vec::with_capacity(columns.len()); - - for chunk in rg.columns() { - let t = chunk.column_type(); - let c = match chunk.column_index_range() { - Some(range) => decode_column_index(get(range), t)?, - None => Index::NONE, - }; - - let o = match chunk.offset_index_range() { - Some(range) => decode_offset_index(get(range))?, - None => return Err(general_err!("missing offset index")), - }; - - rg_column_index.push(c); - rg_offset_index.push(o); - } - offset_index.push(rg_offset_index); - column_index.push(rg_column_index); - } - - metadata = Arc::new(ParquetMetaData::new_with_page_index( - metadata.file_metadata().clone(), - metadata.row_groups().to_vec(), - Some(column_index), - Some(offset_index), - )); - } + let m = Arc::try_unwrap(metadata).unwrap_or_else(|e| e.as_ref().clone()); + let mut loader = MetadataLoader::new(&mut input, m); + loader.load_page_index(true, true).await?; + metadata = Arc::new(loader.finish()) } Self::new_builder(AsyncReader(input), metadata, options) diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index eb64b11b9440..40d982cedf40 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -24,7 +24,7 @@ use futures::{FutureExt, TryFutureExt}; use object_store::{ObjectMeta, ObjectStore}; -use crate::arrow::async_reader::{fetch_parquet_metadata, AsyncFileReader}; +use crate::arrow::async_reader::{AsyncFileReader, MetadataLoader}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ParquetMetaData; @@ -34,6 +34,8 @@ pub struct ParquetObjectReader { store: Arc, meta: ObjectMeta, metadata_size_hint: Option, + preload_column_index: bool, + preload_offset_index: bool, } impl ParquetObjectReader { @@ -45,16 +47,35 @@ impl ParquetObjectReader { store, meta, metadata_size_hint: None, + preload_column_index: false, + preload_offset_index: false, } } - /// Provide a hint as to the size of the parquet file's footer, see [fetch_parquet_metadata] + /// Provide a hint as to the size of the parquet file's footer, + /// see [fetch_parquet_metadata](crate::arrow::async_reader::fetch_parquet_metadata) pub fn with_footer_size_hint(self, hint: usize) -> Self { Self { metadata_size_hint: Some(hint), ..self } } + + /// Load the Column Index as part of [`Self::get_metadata`] + pub fn with_preload_column_index(self, preload_column_index: bool) -> Self { + Self { + preload_column_index, + ..self + } + } + + /// Load the Offset Index as part of [`Self::get_metadata`] + pub fn with_preload_offset_index(self, preload_offset_index: bool) -> Self { + Self { + preload_offset_index, + ..self + } + } } impl AsyncFileReader for ParquetObjectReader { @@ -89,21 +110,15 @@ impl AsyncFileReader for ParquetObjectReader { fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { Box::pin(async move { - let metadata = fetch_parquet_metadata( - |range| { - self.store - .get_range(&self.meta.location, range) - .map_err(|e| { - ParquetError::General(format!( - "ParquetObjectReader::get_metadata error: {e}" - )) - }) - }, - self.meta.size, - self.metadata_size_hint, - ) - .await?; - Ok(Arc::new(metadata)) + let preload_column_index = self.preload_column_index; + let preload_offset_index = self.preload_offset_index; + let file_size = self.meta.size; + let prefetch = self.metadata_size_hint; + let mut loader = MetadataLoader::load(self, file_size, prefetch).await?; + loader + 
.load_page_index(preload_column_index, preload_offset_index) + .await?; + Ok(Arc::new(loader.finish())) }) } } @@ -150,7 +165,11 @@ mod tests { Ok(_) => panic!("expected failure"), Err(e) => { let err = e.to_string(); - assert!(err.contains("Parquet error: ParquetObjectReader::get_metadata error: Object at location") && err.contains("not found: No such file or directory (os error 2)"), "{}", err); + assert!( + err.contains("not found: No such file or directory (os error 2)"), + "{}", + err + ); } } } diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 7cc92afc014a..fcd6a300c5fb 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -61,16 +61,19 @@ pub fn parse_metadata(chunk_reader: &R) -> Result Result { + read_metadata(metadata_read) +} + +/// Decodes [`ParquetMetaData`] from the provided [`Read`] +pub(crate) fn read_metadata(read: R) -> Result { // TODO: row group filtering - let mut prot = TCompactInputProtocol::new(metadata_read); + let mut prot = TCompactInputProtocol::new(read); let t_file_metadata: TFileMetaData = TFileMetaData::read_from_in_protocol(&mut prot) .map_err(|e| ParquetError::General(format!("Could not parse metadata: {e}")))?; let schema = types::from_thrift(&t_file_metadata.schema)?; diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 85287c3e0e85..c2961aa76d06 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -153,6 +153,18 @@ impl ParquetMetaData { pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> { self.offset_index.as_ref() } + + /// Override the column index + #[allow(dead_code)] + pub(crate) fn set_column_index(&mut self, index: Option) { + self.column_index = index; + } + + /// Override the offset index + #[allow(dead_code)] + pub(crate) fn set_offset_index(&mut self, index: Option) { + self.offset_index = index; + } } pub type KeyValue = crate::format::KeyValue; From ca0278d5d88bdda71a5b0d67e4e5e0099f17067a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 May 2023 15:45:27 +0100 Subject: [PATCH 0909/1411] Update proc-macro2 requirement from =1.0.57 to =1.0.58 (#4236) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.57...1.0.58) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 293230733a96..ce719d05b698 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.57", default-features = false } +proc-macro2 = { version = "=1.0.58", default-features = false } prost-build = { version = "=0.11.9", default-features = false } tonic-build = { version = "=0.9.2", default-features = false, features = ["transport", "prost"] } From 11d2fe390b7d3ba8c23ac33545dbf75933be9f8b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 17 May 2023 21:13:25 +0100 Subject: [PATCH 0910/1411] Expose credential provider (#4235) --- object_store/src/aws/mod.rs | 159 ++++++++++++++++------------- object_store/src/azure/mod.rs | 25 ++++- object_store/src/client/mod.rs | 2 + object_store/src/gcp/credential.rs | 1 + object_store/src/gcp/mod.rs | 30 +++++- object_store/src/lib.rs | 2 +- 6 files changed, 137 insertions(+), 82 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index ddb9dc799501..a10561ba613b 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -47,9 +47,7 @@ use url::Url; pub use crate::aws::checksum::Checksum; use crate::aws::client::{S3Client, S3Config}; -use crate::aws::credential::{ - AwsCredential, InstanceCredentialProvider, WebIdentityProvider, -}; +use crate::aws::credential::{InstanceCredentialProvider, WebIdentityProvider}; use crate::client::header::header_meta; use crate::client::{ ClientConfigKey, CredentialProvider, StaticCredentialProvider, @@ -85,7 +83,9 @@ const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.rem const STORE: &str = "S3"; -type AwsCredentialProvider = Arc>; +/// [`CredentialProvider`] for [`AmazonS3`] +pub type AwsCredentialProvider = Arc>; +pub use credential::AwsCredential; /// Default metadata endpoint static METADATA_ENDPOINT: &str = "http://169.254.169.254"; @@ -209,6 +209,13 @@ impl std::fmt::Display for AmazonS3 { } } +impl AmazonS3 { + /// Returns the [`AwsCredentialProvider`] used by [`AmazonS3`] + pub fn credentials(&self) -> &AwsCredentialProvider { + &self.client.config().credentials + } +} + #[async_trait] impl ObjectStore for AmazonS3 { async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { @@ -424,6 +431,8 @@ pub struct AmazonS3Builder { profile: Option, /// Client options client_options: ClientOptions, + /// Credentials + credentials: Option, } /// Configuration keys for [`AmazonS3Builder`] @@ -879,6 +888,12 @@ impl AmazonS3Builder { self } + /// Set the credential provider overriding any other options + pub fn with_credentials(mut self, credentials: AwsCredentialProvider) -> Self { + self.credentials = Some(credentials); + self + } + /// Sets what protocol is allowed. 
If `allow_http` is : /// * false (default): Only HTTPS are allowed /// * true: HTTP and HTTPS are allowed @@ -992,7 +1007,7 @@ impl AmazonS3Builder { self.parse_url(&url)?; } - let region = match (self.region.clone(), self.profile.clone()) { + let region = match (self.region, self.profile.clone()) { (Some(region), _) => Some(region), (None, Some(profile)) => profile_region(profile), (None, None) => None, @@ -1002,76 +1017,74 @@ impl AmazonS3Builder { let region = region.context(MissingRegionSnafu)?; let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; - let credentials = match (self.access_key_id, self.secret_access_key, self.token) { - (Some(key_id), Some(secret_key), token) => { - info!("Using Static credential provider"); - let credential = AwsCredential { - key_id, - secret_key, - token, - }; - Arc::new(StaticCredentialProvider::new(credential)) as _ - } - (None, Some(_), _) => return Err(Error::MissingAccessKeyId.into()), - (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), - // TODO: Replace with `AmazonS3Builder::credentials_from_env` - _ => match ( - std::env::var("AWS_WEB_IDENTITY_TOKEN_FILE"), - std::env::var("AWS_ROLE_ARN"), - ) { - (Ok(token_path), Ok(role_arn)) => { - info!("Using WebIdentity credential provider"); - - let session_name = std::env::var("AWS_ROLE_SESSION_NAME") - .unwrap_or_else(|_| "WebIdentitySession".to_string()); - - let endpoint = format!("https://sts.{region}.amazonaws.com"); - - // Disallow non-HTTPs requests - let client = self - .client_options - .clone() - .with_allow_http(false) - .client()?; - - let token = WebIdentityProvider { - token_path, - session_name, - role_arn, - endpoint, - }; - - Arc::new(TokenCredentialProvider::new( + let credentials = if let Some(credentials) = self.credentials { + credentials + } else if self.access_key_id.is_some() || self.secret_access_key.is_some() { + match (self.access_key_id, self.secret_access_key, self.token) { + (Some(key_id), Some(secret_key), token) => { + info!("Using Static credential provider"); + let credential = AwsCredential { + key_id, + secret_key, token, - client, - self.retry_config.clone(), - )) as _ + }; + Arc::new(StaticCredentialProvider::new(credential)) as _ } - _ => match self.profile { - Some(profile) => { - info!("Using profile \"{}\" credential provider", profile); - profile_credentials(profile, region.clone())? 
- } - None => { - info!("Using Instance credential provider"); - - let token = InstanceCredentialProvider { - cache: Default::default(), - imdsv1_fallback: self.imdsv1_fallback.get()?, - metadata_endpoint: self - .metadata_endpoint - .unwrap_or_else(|| METADATA_ENDPOINT.into()), - }; - - Arc::new(TokenCredentialProvider::new( - token, - // The instance metadata endpoint is access over HTTP - self.client_options.clone().with_allow_http(true).client()?, - self.retry_config.clone(), - )) as _ - } - }, - }, + (None, Some(_), _) => return Err(Error::MissingAccessKeyId.into()), + (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), + (None, None, _) => unreachable!(), + } + } else if let (Ok(token_path), Ok(role_arn)) = ( + std::env::var("AWS_WEB_IDENTITY_TOKEN_FILE"), + std::env::var("AWS_ROLE_ARN"), + ) { + // TODO: Replace with `AmazonS3Builder::credentials_from_env` + info!("Using WebIdentity credential provider"); + + let session_name = std::env::var("AWS_ROLE_SESSION_NAME") + .unwrap_or_else(|_| "WebIdentitySession".to_string()); + + let endpoint = format!("https://sts.{region}.amazonaws.com"); + + // Disallow non-HTTPs requests + let client = self + .client_options + .clone() + .with_allow_http(false) + .client()?; + + let token = WebIdentityProvider { + token_path, + session_name, + role_arn, + endpoint, + }; + + Arc::new(TokenCredentialProvider::new( + token, + client, + self.retry_config.clone(), + )) as _ + } else if let Some(profile) = self.profile { + info!("Using profile \"{}\" credential provider", profile); + profile_credentials(profile, region.clone())? + } else { + info!("Using Instance credential provider"); + + let token = InstanceCredentialProvider { + cache: Default::default(), + imdsv1_fallback: self.imdsv1_fallback.get()?, + metadata_endpoint: self + .metadata_endpoint + .unwrap_or_else(|| METADATA_ENDPOINT.into()), + }; + + Arc::new(TokenCredentialProvider::new( + token, + // The instance metadata endpoint is access over HTTP + self.client_options.clone().with_allow_http(true).client()?, + self.retry_config.clone(), + )) as _ }; let endpoint: String; diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 6dc14cfb54e9..069b033d1896 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -48,7 +48,6 @@ use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; use url::Url; -use crate::azure::credential::AzureCredential; use crate::client::header::header_meta; use crate::client::{ ClientConfigKey, CredentialProvider, StaticCredentialProvider, @@ -60,7 +59,10 @@ pub use credential::authority_hosts; mod client; mod credential; -type AzureCredentialProvider = Arc>; +/// [`CredentialProvider`] for [`MicrosoftAzure`] +pub type AzureCredentialProvider = + Arc>; +pub use credential::AzureCredential; const STORE: &str = "MicrosoftAzure"; @@ -153,6 +155,13 @@ pub struct MicrosoftAzure { client: Arc, } +impl MicrosoftAzure { + /// Returns the [`AzureCredentialProvider`] used by [`MicrosoftAzure`] + pub fn credentials(&self) -> &AzureCredentialProvider { + &self.client.config().credentials + } +} + impl std::fmt::Display for MicrosoftAzure { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -374,6 +383,8 @@ pub struct MicrosoftAzureBuilder { retry_config: RetryConfig, /// Client options client_options: ClientOptions, + /// Credentials + credentials: Option, } /// Configuration keys for [`MicrosoftAzureBuilder`] @@ -840,6 +851,12 @@ impl MicrosoftAzureBuilder { self } + 
/// Set the credential provider overriding any other options + pub fn with_credentials(mut self, credentials: AzureCredentialProvider) -> Self { + self.credentials = Some(credentials); + self + } + /// Set if the Azure emulator should be used (defaults to false) pub fn with_use_emulator(mut self, use_emulator: bool) -> Self { self.use_emulator = use_emulator.into(); @@ -937,7 +954,9 @@ impl MicrosoftAzureBuilder { let url = Url::parse(&account_url) .context(UnableToParseUrlSnafu { url: account_url })?; - let credential = if let Some(bearer_token) = self.bearer_token { + let credential = if let Some(credential) = self.credentials { + credential + } else if let Some(bearer_token) = self.bearer_token { static_creds(AzureCredential::BearerToken(bearer_token)) } else if let Some(access_key) = self.access_key { static_creds(AzureCredential::AccessKey(access_key)) diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 292e4678fd69..8c23576994fa 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -509,8 +509,10 @@ impl GetOptionsExt for RequestBuilder { /// Provides credentials for use when signing requests #[async_trait] pub trait CredentialProvider: std::fmt::Debug + Send + Sync { + /// The type of credential returned by this provider type Credential; + /// Return a credential async fn get_credential(&self) -> Result>; } diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index ad12855e19ef..205b805947cc 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -82,6 +82,7 @@ impl From for crate::Error { } } +/// A Google Cloud Storage Credential #[derive(Debug, Eq, PartialEq)] pub struct GcpCredential { /// An HTTP bearer token diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 6813bbf6ecf7..21ba1588fbe8 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -52,7 +52,6 @@ use crate::client::{ ClientConfigKey, CredentialProvider, GetOptionsExt, StaticCredentialProvider, TokenCredentialProvider, }; -use crate::gcp::credential::{application_default_credentials, GcpCredential}; use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::{Path, DELIMITER}, @@ -61,15 +60,18 @@ use crate::{ ObjectStore, Result, RetryConfig, }; -use self::credential::{ - default_gcs_base_url, InstanceCredentialProvider, ServiceAccountCredentials, +use credential::{ + application_default_credentials, default_gcs_base_url, InstanceCredentialProvider, + ServiceAccountCredentials, }; mod credential; const STORE: &str = "GCS"; -type GcpCredentialProvider = Arc>; +/// [`CredentialProvider`] for [`GoogleCloudStorage`] +pub type GcpCredentialProvider = Arc>; +pub use credential::GcpCredential; #[derive(Debug, Snafu)] enum Error { @@ -205,6 +207,13 @@ impl std::fmt::Display for GoogleCloudStorage { } } +impl GoogleCloudStorage { + /// Returns the [`GcpCredentialProvider`] used by [`GoogleCloudStorage`] + pub fn credentials(&self) -> &GcpCredentialProvider { + &self.client.credentials + } +} + #[derive(Debug)] struct GoogleCloudStorageClient { client: Client, @@ -696,6 +705,8 @@ pub struct GoogleCloudStorageBuilder { retry_config: RetryConfig, /// Client options client_options: ClientOptions, + /// Credentials + credentials: Option, } /// Configuration keys for [`GoogleCloudStorageBuilder`] @@ -794,6 +805,7 @@ impl Default for GoogleCloudStorageBuilder { retry_config: Default::default(), client_options: 
ClientOptions::new().with_allow_http(true), url: None, + credentials: None, } } } @@ -1006,6 +1018,12 @@ impl GoogleCloudStorageBuilder { self } + /// Set the credential provider overriding any other options + pub fn with_credentials(mut self, credentials: GcpCredentialProvider) -> Self { + self.credentials = Some(credentials); + self + } + /// Set the retry configuration pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { self.retry_config = retry_config; @@ -1072,7 +1090,9 @@ impl GoogleCloudStorageBuilder { let scope = "https://www.googleapis.com/auth/devstorage.full_control"; let audience = "https://www.googleapis.com/oauth2/v4/token"; - let credentials = if disable_oauth { + let credentials = if let Some(credentials) = self.credentials { + credentials + } else if disable_oauth { Arc::new(StaticCredentialProvider::new(GcpCredential { bearer: "".to_string(), })) as _ diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 0f3ed809e424..7116a8732ba6 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -245,7 +245,7 @@ pub mod throttle; mod client; #[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] -pub use client::{backoff::BackoffConfig, retry::RetryConfig}; +pub use client::{backoff::BackoffConfig, retry::RetryConfig, CredentialProvider}; #[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] mod config; From a5c1a33af88d56bcd9a297d77305f033598e2428 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 18 May 2023 08:50:56 +0100 Subject: [PATCH 0911/1411] Remove AWS_PROFILE support (#4238) --- .github/workflows/object_store.yml | 2 - object_store/Cargo.toml | 8 -- object_store/src/aws/mod.rs | 127 +-------------------------- object_store/src/aws/profile.rs | 133 ----------------------------- 4 files changed, 1 insertion(+), 269 deletions(-) delete mode 100644 object_store/src/aws/profile.rs diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index df43ae3bf76a..5ae9d2d9c83f 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -56,8 +56,6 @@ jobs: run: cargo clippy -- -D warnings - name: Run clippy with aws feature run: cargo clippy --features aws -- -D warnings - - name: Run clippy with aws_profile feature - run: cargo clippy --features aws_profile -- -D warnings - name: Run clippy with gcp feature run: cargo clippy --features gcp -- -D warnings - name: Run clippy with azure feature diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index c6b89fa23186..bd9c973e052a 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -53,11 +53,6 @@ reqwest = { version = "0.11", default-features = false, features = ["rustls-tls" ring = { version = "0.16", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } -# AWS Profile support -aws-types = { version = "0.55", optional = true } -aws-credential-types = { version = "0.55", optional = true } -aws-config = { version = "0.55", optional = true } - [target.'cfg(not(target_arch = "wasm32"))'.dependencies] tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util", "fs"] } @@ -74,9 +69,6 @@ gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] http = ["cloud"] -# Experimental support for AWS_PROFILE -aws_profile = ["aws", "aws-config", "aws-types", "aws-credential-types"] - [dev-dependencies] # 
In alphabetical order dotenv = "0.15.0" tempfile = "3.1.0" diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index a10561ba613b..a7f43d1532ab 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -64,9 +64,6 @@ mod checksum; mod client; mod credential; -#[cfg(feature = "aws_profile")] -mod profile; - // http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html // // Do not URI-encode any of the unreserved characters that RFC 3986 defines: @@ -106,9 +103,6 @@ enum Error { #[snafu(display("Missing SecretAccessKey"))] MissingSecretAccessKey, - #[snafu(display("Profile support requires aws_profile feature"))] - MissingProfileFeature, - #[snafu(display("ETag Header missing from response"))] MissingEtag, @@ -427,8 +421,6 @@ pub struct AmazonS3Builder { checksum_algorithm: Option>, /// Metadata endpoint, see metadata_endpoint: Option, - /// Profile name, see - profile: Option, /// Client options client_options: ClientOptions, /// Credentials @@ -559,13 +551,6 @@ pub enum AmazonS3ConfigKey { /// - `metadata_endpoint` MetadataEndpoint, - /// AWS profile name - /// - /// Supported keys: - /// - `aws_profile` - /// - `profile` - Profile, - /// Client options Client(ClientConfigKey), } @@ -583,7 +568,6 @@ impl AsRef for AmazonS3ConfigKey { Self::VirtualHostedStyleRequest => "aws_virtual_hosted_style_request", Self::DefaultRegion => "aws_default_region", Self::MetadataEndpoint => "aws_metadata_endpoint", - Self::Profile => "aws_profile", Self::UnsignedPayload => "aws_unsigned_payload", Self::Checksum => "aws_checksum_algorithm", Self::Client(opt) => opt.as_ref(), @@ -612,7 +596,6 @@ impl FromStr for AmazonS3ConfigKey { "aws_virtual_hosted_style_request" | "virtual_hosted_style_request" => { Ok(Self::VirtualHostedStyleRequest) } - "aws_profile" | "profile" => Ok(Self::Profile), "aws_imdsv1_fallback" | "imdsv1_fallback" => Ok(Self::ImdsV1Fallback), "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), @@ -643,7 +626,6 @@ impl AmazonS3Builder { /// * `AWS_SESSION_TOKEN` -> token /// * `AWS_CONTAINER_CREDENTIALS_RELATIVE_URI` -> /// * `AWS_ALLOW_HTTP` -> set to "true" to permit HTTP connections without TLS - /// * `AWS_PROFILE` -> set profile name, requires `aws_profile` feature enabled /// # Example /// ``` /// use object_store::aws::AmazonS3Builder; @@ -727,7 +709,6 @@ impl AmazonS3Builder { AmazonS3ConfigKey::MetadataEndpoint => { self.metadata_endpoint = Some(value.into()) } - AmazonS3ConfigKey::Profile => self.profile = Some(value.into()), AmazonS3ConfigKey::UnsignedPayload => self.unsigned_payload.parse(value), AmazonS3ConfigKey::Checksum => { self.checksum_algorithm = Some(ConfigValue::Deferred(value.into())) @@ -794,7 +775,6 @@ impl AmazonS3Builder { Some(self.virtual_hosted_style_request.to_string()) } AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint.clone(), - AmazonS3ConfigKey::Profile => self.profile.clone(), AmazonS3ConfigKey::UnsignedPayload => Some(self.unsigned_payload.to_string()), AmazonS3ConfigKey::Checksum => { self.checksum_algorithm.as_ref().map(ToString::to_string) @@ -982,24 +962,6 @@ impl AmazonS3Builder { self } - /// Set the AWS profile name, see - /// - /// This makes use of [aws-config] to provide credentials and therefore requires - /// the `aws-profile` feature to be enabled - /// - /// It is strongly encouraged that users instead make use of a credential manager - /// such as [aws-vault] not only to 
avoid the significant additional dependencies, - /// but also to avoid storing credentials in [plain text on disk] - /// - /// [aws-config]: https://docs.rs/aws-config - /// [aws-vault]: https://github.com/99designs/aws-vault - /// [plain text on disk]: https://99designs.com.au/blog/engineering/aws-vault/ - #[cfg(feature = "aws_profile")] - pub fn with_profile(mut self, profile: impl Into) -> Self { - self.profile = Some(profile.into()); - self - } - /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. pub fn build(mut self) -> Result { @@ -1007,14 +969,8 @@ impl AmazonS3Builder { self.parse_url(&url)?; } - let region = match (self.region, self.profile.clone()) { - (Some(region), _) => Some(region), - (None, Some(profile)) => profile_region(profile), - (None, None) => None, - }; - let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; - let region = region.context(MissingRegionSnafu)?; + let region = self.region.context(MissingRegionSnafu)?; let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; let credentials = if let Some(credentials) = self.credentials { @@ -1065,9 +1021,6 @@ impl AmazonS3Builder { client, self.retry_config.clone(), )) as _ - } else if let Some(profile) = self.profile { - info!("Using profile \"{}\" credential provider", profile); - profile_credentials(profile, region.clone())? } else { info!("Using Instance credential provider"); @@ -1123,37 +1076,6 @@ impl AmazonS3Builder { } } -#[cfg(feature = "aws_profile")] -fn profile_region(profile: String) -> Option { - use tokio::runtime::Handle; - - let handle = Handle::current(); - let provider = profile::ProfileProvider::new(profile, None); - - handle.block_on(provider.get_region()) -} - -#[cfg(feature = "aws_profile")] -fn profile_credentials(profile: String, region: String) -> Result { - Ok(Arc::new(profile::ProfileProvider::new( - profile, - Some(region), - ))) -} - -#[cfg(not(feature = "aws_profile"))] -fn profile_region(_profile: String) -> Option { - None -} - -#[cfg(not(feature = "aws_profile"))] -fn profile_credentials( - _profile: String, - _region: String, -) -> Result { - Err(Error::MissingProfileFeature.into()) -} - #[cfg(test)] mod tests { use super::*; @@ -1638,50 +1560,3 @@ mod s3_resolve_bucket_region_tests { assert!(result.is_err()); } } - -#[cfg(all(test, feature = "aws_profile"))] -mod profile_tests { - use super::*; - use std::env; - - use super::profile::{TEST_PROFILE_NAME, TEST_PROFILE_REGION}; - - #[tokio::test] - async fn s3_test_region_from_profile() { - let s3_url = "s3://bucket/prefix".to_owned(); - - let s3 = AmazonS3Builder::new() - .with_url(s3_url) - .with_profile(TEST_PROFILE_NAME) - .build() - .unwrap(); - - let region = &s3.client.config().region; - - assert_eq!(region, TEST_PROFILE_REGION); - } - - #[test] - fn s3_test_region_override() { - let s3_url = "s3://bucket/prefix".to_owned(); - - let aws_profile = - env::var("AWS_PROFILE").unwrap_or_else(|_| TEST_PROFILE_NAME.into()); - - let aws_region = - env::var("AWS_REGION").unwrap_or_else(|_| "object_store:fake_region".into()); - - env::set_var("AWS_PROFILE", aws_profile); - - let s3 = AmazonS3Builder::from_env() - .with_url(s3_url) - .with_region(aws_region.clone()) - .build() - .unwrap(); - - let actual = &s3.client.config().region; - let expected = &aws_region; - - assert_eq!(actual, expected); - } -} diff --git a/object_store/src/aws/profile.rs b/object_store/src/aws/profile.rs deleted file mode 100644 index 3fc08056444e..000000000000 --- a/object_store/src/aws/profile.rs +++ 
/dev/null @@ -1,133 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#![cfg(feature = "aws_profile")] - -use async_trait::async_trait; -use aws_config::meta::region::ProvideRegion; -use aws_config::profile::profile_file::ProfileFiles; -use aws_config::profile::ProfileFileCredentialsProvider; -use aws_config::profile::ProfileFileRegionProvider; -use aws_config::provider_config::ProviderConfig; -use aws_credential_types::provider::ProvideCredentials; -use aws_types::region::Region; -use std::sync::Arc; -use std::time::Instant; -use std::time::SystemTime; - -use crate::aws::AwsCredential; -use crate::client::token::{TemporaryToken, TokenCache}; -use crate::client::CredentialProvider; -use crate::Result; - -#[cfg(test)] -pub static TEST_PROFILE_NAME: &str = "object_store:fake_profile"; - -#[cfg(test)] -pub static TEST_PROFILE_REGION: &str = "object_store:fake_region_from_profile"; - -#[derive(Debug)] -pub struct ProfileProvider { - name: String, - region: Option, - cache: TokenCache>, -} - -impl ProfileProvider { - pub fn new(name: String, region: Option) -> Self { - Self { - name, - region, - cache: Default::default(), - } - } - - #[cfg(test)] - fn profile_files(&self) -> ProfileFiles { - use aws_config::profile::profile_file::ProfileFileKind; - - let config = format!( - "[profile {}]\nregion = {}", - TEST_PROFILE_NAME, TEST_PROFILE_REGION - ); - - ProfileFiles::builder() - .with_contents(ProfileFileKind::Config, config) - .build() - } - - #[cfg(not(test))] - fn profile_files(&self) -> ProfileFiles { - ProfileFiles::default() - } - - pub async fn get_region(&self) -> Option { - if let Some(region) = self.region.clone() { - return Some(region); - } - - let provider = ProfileFileRegionProvider::builder() - .profile_files(self.profile_files()) - .profile_name(&self.name) - .build(); - - let region = provider.region().await; - - region.map(|r| r.as_ref().to_owned()) - } -} - -#[async_trait] -impl CredentialProvider for ProfileProvider { - type Credential = AwsCredential; - - async fn get_credential(&self) -> Result> { - self.cache - .get_or_insert_with(move || async move { - let region = self.region.clone().map(Region::new); - - let config = ProviderConfig::default().with_region(region); - - let credentials = ProfileFileCredentialsProvider::builder() - .configure(&config) - .profile_name(&self.name) - .build(); - - let c = credentials.provide_credentials().await.map_err(|source| { - crate::Error::Generic { - store: "S3", - source: Box::new(source), - } - })?; - let t_now = SystemTime::now(); - let expiry = c - .expiry() - .and_then(|e| e.duration_since(t_now).ok()) - .map(|ttl| Instant::now() + ttl); - - Ok(TemporaryToken { - token: Arc::new(AwsCredential { - key_id: c.access_key_id().to_string(), - secret_key: 
c.secret_access_key().to_string(), - token: c.session_token().map(ToString::to_string), - }), - expiry, - }) - }) - .await - } -} From f56690369b7fb7cecd5c57bc274f6560f37de5ca Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 18 May 2023 09:05:41 +0100 Subject: [PATCH 0912/1411] Expose AwsAuthorizer (#4237) * Expose AWSAuthorizer * Review feedback --- object_store/src/aws/client.rs | 3 +- object_store/src/aws/credential.rs | 128 +++++++++++++++++++---------- object_store/src/aws/mod.rs | 3 +- 3 files changed, 86 insertions(+), 48 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 8ce743b31be9..2c45050fad04 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -238,7 +238,7 @@ impl S3Client { &self.config.region, "s3", self.config.sign_payload, - payload_sha256, + payload_sha256.as_deref(), ) .send_retry(&self.config.retry_config) .await @@ -315,7 +315,6 @@ impl S3Client { let mut query = Vec::with_capacity(4); - // Note: the order of these matters to ensure the generated URL is canonical if let Some(token) = token { query.push(("continuation-token", token)) } diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index 47d681c631c7..909dde072193 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::aws::{STORE, STRICT_ENCODE_SET}; +use crate::aws::{STORE, STRICT_ENCODE_SET, STRICT_PATH_ENCODE_SET}; use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; use crate::client::TokenProvider; @@ -39,7 +39,8 @@ type StdError = Box; /// SHA256 hash of empty string static EMPTY_SHA256_HASH: &str = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"; -static UNSIGNED_PAYLOAD_LITERAL: &str = "UNSIGNED-PAYLOAD"; +static UNSIGNED_PAYLOAD: &str = "UNSIGNED-PAYLOAD"; +static STREAMING_PAYLOAD: &str = "STREAMING-AWS4-HMAC-SHA256-PAYLOAD"; /// A set of AWS security credentials #[derive(Debug, Eq, PartialEq)] @@ -72,8 +73,12 @@ impl AwsCredential { } } -struct RequestSigner<'a> { - date: DateTime, +/// Authorize a [`Request`] with an [`AwsCredential`] using [AWS SigV4] +/// +/// [AWS SigV4]: https://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html +#[derive(Debug)] +pub struct AwsAuthorizer<'a> { + date: Option>, credential: &'a AwsCredential, service: &'a str, region: &'a str, @@ -85,39 +90,78 @@ const HASH_HEADER: &str = "x-amz-content-sha256"; const TOKEN_HEADER: &str = "x-amz-security-token"; const AUTH_HEADER: &str = "authorization"; -impl<'a> RequestSigner<'a> { - fn sign(&self, request: &mut Request, pre_calculated_digest: Option>) { +impl<'a> AwsAuthorizer<'a> { + /// Create a new [`AwsAuthorizer`] + pub fn new(credential: &'a AwsCredential, service: &'a str, region: &'a str) -> Self { + Self { + credential, + service, + region, + date: None, + sign_payload: true, + } + } + + /// Controls whether this [`AwsAuthorizer`] will attempt to sign the request payload, + /// the default is `true` + pub fn with_sign_payload(mut self, signed: bool) -> Self { + self.sign_payload = signed; + self + } + + /// Authorize `request` with an optional pre-calculated SHA256 digest by attaching + /// the relevant [AWS SigV4] headers + /// + /// # Payload Signature + /// + /// AWS SigV4 requests must contain the `x-amz-content-sha256` header, it is 
set as follows: + /// + /// * If not configured to sign payloads, it is set to `UNSIGNED-PAYLOAD` + /// * If a `pre_calculated_digest` is provided, it is set to the hex encoding of it + /// * If it is a streaming request, it is set to `STREAMING-AWS4-HMAC-SHA256-PAYLOAD` + /// * Otherwise it is set to the hex encoded SHA256 of the request body + /// + /// [AWS SigV4]: https://docs.aws.amazon.com/IAM/latest/UserGuide/create-signed-request.html + pub fn authorize(&self, request: &mut Request, pre_calculated_digest: Option<&[u8]>) { if let Some(ref token) = self.credential.token { let token_val = HeaderValue::from_str(token).unwrap(); request.headers_mut().insert(TOKEN_HEADER, token_val); } - let host_val = HeaderValue::from_str( - &request.url()[url::Position::BeforeHost..url::Position::AfterPort], - ) - .unwrap(); + let host = &request.url()[url::Position::BeforeHost..url::Position::AfterPort]; + let host_val = HeaderValue::from_str(host).unwrap(); request.headers_mut().insert("host", host_val); - let date_str = self.date.format("%Y%m%dT%H%M%SZ").to_string(); + let date = self.date.unwrap_or_else(Utc::now); + let date_str = date.format("%Y%m%dT%H%M%SZ").to_string(); let date_val = HeaderValue::from_str(&date_str).unwrap(); request.headers_mut().insert(DATE_HEADER, date_val); - let digest = if self.sign_payload { - if let Some(digest) = pre_calculated_digest { - hex_encode(&digest) - } else { - match request.body() { + let digest = match self.sign_payload { + false => UNSIGNED_PAYLOAD.to_string(), + true => match pre_calculated_digest { + Some(digest) => hex_encode(digest), + None => match request.body() { None => EMPTY_SHA256_HASH.to_string(), - Some(body) => hex_digest(body.as_bytes().unwrap()), - } - } - } else { - UNSIGNED_PAYLOAD_LITERAL.to_string() + Some(body) => match body.as_bytes() { + Some(bytes) => hex_digest(bytes), + None => STREAMING_PAYLOAD.to_string(), + }, + }, + }, }; let header_digest = HeaderValue::from_str(&digest).unwrap(); request.headers_mut().insert(HASH_HEADER, header_digest); + // Each path segment must be URI-encoded twice (except for Amazon S3 which only gets URI-encoded once). 
+ // see https://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html + let canonical_uri = match self.service { + "s3" => request.url().path().to_string(), + _ => utf8_percent_encode(request.url().path(), &STRICT_PATH_ENCODE_SET) + .to_string(), + }; + let (signed_headers, canonical_headers) = canonicalize_headers(request.headers()); let canonical_query = canonicalize_query(request.url()); @@ -125,7 +169,7 @@ impl<'a> RequestSigner<'a> { let canonical_request = format!( "{}\n{}\n{}\n{}\n{}\n{}", request.method().as_str(), - request.url().path(), // S3 doesn't percent encode this like other services + canonical_uri, canonical_query, canonical_headers, signed_headers, @@ -135,14 +179,14 @@ impl<'a> RequestSigner<'a> { let hashed_canonical_request = hex_digest(canonical_request.as_bytes()); let scope = format!( "{}/{}/{}/aws4_request", - self.date.format("%Y%m%d"), + date.format("%Y%m%d"), self.region, self.service ); let string_to_sign = format!( "AWS4-HMAC-SHA256\n{}\n{}\n{}", - self.date.format("%Y%m%dT%H%M%SZ"), + date.format("%Y%m%dT%H%M%SZ"), scope, hashed_canonical_request ); @@ -150,7 +194,7 @@ impl<'a> RequestSigner<'a> { // sign the string let signature = self.credential - .sign(&string_to_sign, self.date, self.region, self.service); + .sign(&string_to_sign, date, self.region, self.service); // build the actual auth header let authorisation = format!( @@ -171,7 +215,7 @@ pub trait CredentialExt { region: &str, service: &str, sign_payload: bool, - payload_sha256: Option>, + payload_sha256: Option<&[u8]>, ) -> Self; } @@ -182,21 +226,15 @@ impl CredentialExt for RequestBuilder { region: &str, service: &str, sign_payload: bool, - payload_sha256: Option>, + payload_sha256: Option<&[u8]>, ) -> Self { let (client, request) = self.build_split(); let mut request = request.expect("request valid"); - let date = Utc::now(); - let signer = RequestSigner { - date, - credential, - service, - region, - sign_payload, - }; + AwsAuthorizer::new(credential, service, region) + .with_sign_payload(sign_payload) + .authorize(&mut request, payload_sha256); - signer.sign(&mut request, payload_sha256); Self::from_parts(client, request) } } @@ -539,15 +577,15 @@ mod tests { .build() .unwrap(); - let signer = RequestSigner { - date, + let signer = AwsAuthorizer { + date: Some(date), credential: &credential, service: "ec2", region: "us-east-1", sign_payload: true, }; - signer.sign(&mut request, None); + signer.authorize(&mut request, None); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=a3c787a7ed37f7fdfbfd2d7056a3d7c9d85e6d52a2bfbec73793c0be6e7862d4") } @@ -577,15 +615,15 @@ mod tests { .build() .unwrap(); - let signer = RequestSigner { - date, + let authorizer = AwsAuthorizer { + date: Some(date), credential: &credential, service: "ec2", region: "us-east-1", sign_payload: false, }; - signer.sign(&mut request, None); + authorizer.authorize(&mut request, None); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=653c3d8ea261fd826207df58bc2bb69fbb5003e9eb3c0ef06e4a51f2a81d8699") } @@ -614,15 +652,15 @@ mod tests { .build() .unwrap(); - let signer = RequestSigner { - date, + let authorizer = AwsAuthorizer { + date: Some(date), credential: &credential, service: "s3", region: "us-east-1", 
sign_payload: true, }; - signer.sign(&mut request, None); + authorizer.authorize(&mut request, None); assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=H20ABqCkLZID4rLe/20220809/us-east-1/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=9ebf2f92872066c99ac94e573b4e1b80f4dbb8a32b1e8e23178318746e7d1b4d") } diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index a7f43d1532ab..e71124fbace9 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -82,7 +82,7 @@ const STORE: &str = "S3"; /// [`CredentialProvider`] for [`AmazonS3`] pub type AwsCredentialProvider = Arc>; -pub use credential::AwsCredential; +pub use credential::{AwsAuthorizer, AwsCredential}; /// Default metadata endpoint static METADATA_ENDPOINT: &str = "http://169.254.169.254"; @@ -160,6 +160,7 @@ impl From for super::Error { } /// Get the bucket region using the [HeadBucket API]. This will fail if the bucket does not exist. +/// /// [HeadBucket API]: https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadBucket.html pub async fn resolve_bucket_region( bucket: &str, From fe1b574f7bef356691b1ee22f10f20b1b06d1502 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 18 May 2023 09:18:38 +0100 Subject: [PATCH 0913/1411] Extract Common Listing and Retrieval Functionality (#4220) * Factor out common cloud storage client functionality * Remove format_prefix * Review feedback --- object_store/src/aws/client.rs | 224 +++++++++++------------ object_store/src/aws/mod.rs | 63 +------ object_store/src/azure/client.rs | 104 +++++------ object_store/src/azure/mod.rs | 55 +----- object_store/src/client/get.rs | 70 +++++++ object_store/src/client/list.rs | 162 ++++++++++------ object_store/src/client/list_response.rs | 85 +++++++++ object_store/src/client/mod.rs | 18 +- object_store/src/gcp/mod.rs | 148 ++++++--------- object_store/src/util.rs | 8 - 10 files changed, 497 insertions(+), 440 deletions(-) create mode 100644 object_store/src/client/get.rs create mode 100644 object_store/src/client/list_response.rs diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 2c45050fad04..cfce35254d65 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -18,17 +18,17 @@ use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt}; use crate::aws::{AwsCredentialProvider, STORE, STRICT_PATH_ENCODE_SET}; -use crate::client::list::ListResponse; -use crate::client::pagination::stream_paginated; +use crate::client::get::GetClient; +use crate::client::list::ListClient; +use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; use crate::multipart::UploadPart; use crate::path::DELIMITER; -use crate::util::format_prefix; use crate::{ - BoxStream, ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, - RetryConfig, StreamExt, + ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, RetryConfig, }; +use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; @@ -169,40 +169,6 @@ impl S3Client { self.config.credentials.get_credential().await } - /// Make an S3 GET request - pub async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result { - let credential = self.get_credential().await?; - let url = self.config.path_url(path); - let method = match 
head { - true => Method::HEAD, - false => Method::GET, - }; - - let builder = self.client.request(method, url); - - let response = builder - .with_get_options(options) - .with_aws_sigv4( - credential.as_ref(), - &self.config.region, - "s3", - self.config.sign_payload, - None, - ) - .send_retry(&self.config.retry_config) - .await - .context(GetRequestSnafu { - path: path.as_ref(), - })?; - - Ok(response) - } - /// Make an S3 PUT request pub async fn put_request( &self, @@ -302,88 +268,6 @@ impl S3Client { Ok(()) } - /// Make an S3 List request - async fn list_request( - &self, - prefix: Option<&str>, - delimiter: bool, - token: Option<&str>, - offset: Option<&str>, - ) -> Result<(ListResult, Option)> { - let credential = self.get_credential().await?; - let url = self.config.bucket_endpoint.clone(); - - let mut query = Vec::with_capacity(4); - - if let Some(token) = token { - query.push(("continuation-token", token)) - } - - if delimiter { - query.push(("delimiter", DELIMITER)) - } - - query.push(("list-type", "2")); - - if let Some(prefix) = prefix { - query.push(("prefix", prefix)) - } - - if let Some(offset) = offset { - query.push(("start-after", offset)) - } - - let response = self - .client - .request(Method::GET, &url) - .query(&query) - .with_aws_sigv4( - credential.as_ref(), - &self.config.region, - "s3", - self.config.sign_payload, - None, - ) - .send_retry(&self.config.retry_config) - .await - .context(ListRequestSnafu)? - .bytes() - .await - .context(ListResponseBodySnafu)?; - - let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) - .context(InvalidListResponseSnafu)?; - let token = response.next_continuation_token.take(); - - Ok((response.try_into()?, token)) - } - - /// Perform a list operation automatically handling pagination - pub fn list_paginated( - &self, - prefix: Option<&Path>, - delimiter: bool, - offset: Option<&Path>, - ) -> BoxStream<'_, Result> { - let offset = offset.map(|x| x.to_string()); - let prefix = format_prefix(prefix); - stream_paginated( - (prefix, offset), - move |(prefix, offset), token| async move { - let (r, next_token) = self - .list_request( - prefix.as_deref(), - delimiter, - token.as_deref(), - offset.as_deref(), - ) - .await?; - Ok((r, (prefix, offset), next_token)) - }, - ) - .boxed() - } - pub async fn create_multipart(&self, location: &Path) -> Result { let credential = self.get_credential().await?; let url = format!("{}?uploads=", self.config.path_url(location),); @@ -451,6 +335,104 @@ impl S3Client { } } +#[async_trait] +impl GetClient for S3Client { + const STORE: &'static str = STORE; + + /// Make an S3 GET request + async fn get_request( + &self, + path: &Path, + options: GetOptions, + head: bool, + ) -> Result { + let credential = self.get_credential().await?; + let url = self.config.path_url(path); + let method = match head { + true => Method::HEAD, + false => Method::GET, + }; + + let builder = self.client.request(method, url); + + let response = builder + .with_get_options(options) + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + None, + ) + .send_retry(&self.config.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + + Ok(response) + } +} + +#[async_trait] +impl ListClient for S3Client { + /// Make an S3 List request + async fn list_request( + &self, + prefix: Option<&str>, + delimiter: bool, + token: Option<&str>, + offset: Option<&str>, + ) -> Result<(ListResult, Option)> { + let credential = self.get_credential().await?; 
+ let url = self.config.bucket_endpoint.clone(); + + let mut query = Vec::with_capacity(4); + + if let Some(token) = token { + query.push(("continuation-token", token)) + } + + if delimiter { + query.push(("delimiter", DELIMITER)) + } + + query.push(("list-type", "2")); + + if let Some(prefix) = prefix { + query.push(("prefix", prefix)) + } + + if let Some(offset) = offset { + query.push(("start-after", offset)) + } + + let response = self + .client + .request(Method::GET, &url) + .query(&query) + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + None, + ) + .send_retry(&self.config.retry_config) + .await + .context(ListRequestSnafu)? + .bytes() + .await + .context(ListResponseBodySnafu)?; + + let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) + .context(InvalidListResponseSnafu)?; + let token = response.next_continuation_token.take(); + + Ok((response.try_into()?, token)) + } +} + fn encode_path(path: &Path) -> PercentEncode<'_> { utf8_percent_encode(path.as_ref(), &STRICT_PATH_ENCODE_SET) } diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index e71124fbace9..4c6d346603d5 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -34,11 +34,9 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; -use futures::TryStreamExt; use itertools::Itertools; use serde::{Deserialize, Serialize}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; -use std::collections::BTreeSet; use std::str::FromStr; use std::sync::Arc; use tokio::io::AsyncWrite; @@ -48,7 +46,8 @@ use url::Url; pub use crate::aws::checksum::Checksum; use crate::aws::client::{S3Client, S3Config}; use crate::aws::credential::{InstanceCredentialProvider, WebIdentityProvider}; -use crate::client::header::header_meta; +use crate::client::get::GetClientExt; +use crate::client::list::ListClientExt; use crate::client::{ ClientConfigKey, CredentialProvider, StaticCredentialProvider, TokenCredentialProvider, @@ -57,7 +56,7 @@ use crate::config::ConfigValue; use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; use crate::{ ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, Path, Result, RetryConfig, StreamExt, + ObjectStore, Path, Result, RetryConfig, }; mod checksum; @@ -138,11 +137,6 @@ enum Error { #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] RegionParse { bucket: String }, - - #[snafu(display("Failed to parse headers: {}", source))] - Header { - source: crate::client::header::Error, - }, } impl From for super::Error { @@ -244,24 +238,11 @@ impl ObjectStore for AmazonS3 { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - let response = self.client.get_request(location, options, false).await?; - let stream = response - .bytes_stream() - .map_err(|source| crate::Error::Generic { - store: STORE, - source: Box::new(source), - }) - .boxed(); - - Ok(GetResult::Stream(stream)) + self.client.get_opts(location, options).await } async fn head(&self, location: &Path) -> Result { - let options = GetOptions::default(); - // Extract meta from headers - // https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadObject.html#API_HeadObject_ResponseSyntax - let response = self.client.get_request(location, options, true).await?; - Ok(header_meta(location, response.headers()).context(HeaderSnafu)?) 
+ self.client.head(location).await } async fn delete(&self, location: &Path) -> Result<()> { @@ -272,14 +253,7 @@ impl ObjectStore for AmazonS3 { &self, prefix: Option<&Path>, ) -> Result>> { - let stream = self - .client - .list_paginated(prefix, false, None) - .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) - .try_flatten() - .boxed(); - - Ok(stream) + self.client.list(prefix).await } async fn list_with_offset( @@ -287,32 +261,11 @@ impl ObjectStore for AmazonS3 { prefix: Option<&Path>, offset: &Path, ) -> Result>> { - let stream = self - .client - .list_paginated(prefix, false, Some(offset)) - .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) - .try_flatten() - .boxed(); - - Ok(stream) + self.client.list_with_offset(prefix, offset).await } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - let mut stream = self.client.list_paginated(prefix, true, None); - - let mut common_prefixes = BTreeSet::new(); - let mut objects = Vec::new(); - - while let Some(result) = stream.next().await { - let response = result?; - common_prefixes.extend(response.common_prefixes.into_iter()); - objects.extend(response.objects.into_iter()); - } - - Ok(ListResult { - common_prefixes: common_prefixes.into_iter().collect(), - objects, - }) + self.client.list_with_delimiter(prefix).await } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 5f165c007947..868a803e92d7 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -18,15 +18,16 @@ use super::credential::AzureCredential; use crate::azure::credential::*; use crate::azure::{AzureCredentialProvider, STORE}; -use crate::client::pagination::stream_paginated; +use crate::client::get::GetClient; +use crate::client::list::ListClient; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; use crate::path::DELIMITER; -use crate::util::{deserialize_rfc1123, format_prefix}; +use crate::util::deserialize_rfc1123; use crate::{ - BoxStream, ClientOptions, GetOptions, ListResult, ObjectMeta, Path, Result, - RetryConfig, StreamExt, + ClientOptions, GetOptions, ListResult, ObjectMeta, Path, Result, RetryConfig, }; +use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; @@ -187,40 +188,6 @@ impl AzureClient { path: path.as_ref(), })?; - Ok(response) - } - - /// Make an Azure GET request - /// - /// - pub async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result { - let credential = self.get_credential().await?; - let url = self.config.path_url(path); - let method = match head { - true => Method::HEAD, - false => Method::GET, - }; - - let builder = self - .client - .request(method, url) - .header(CONTENT_LENGTH, HeaderValue::from_static("0")) - .body(Bytes::new()); - - let response = builder - .with_get_options(options) - .with_azure_authorization(&credential, &self.config.account) - .send_retry(&self.config.retry_config) - .await - .context(GetRequestSnafu { - path: path.as_ref(), - })?; - match response.headers().get("x-ms-resource-type") { Some(resource) if resource.as_ref() != b"file" => { Err(crate::Error::NotFound { @@ -300,14 +267,59 @@ impl AzureClient { Ok(()) } +} +#[async_trait] +impl GetClient for AzureClient { + const STORE: &'static str = STORE; + + /// Make an Azure GET request + /// + /// + async fn get_request( + &self, + path: &Path, + options: GetOptions, + head: bool, 
+ ) -> Result { + let credential = self.get_credential().await?; + let url = self.config.path_url(path); + let method = match head { + true => Method::HEAD, + false => Method::GET, + }; + + let builder = self + .client + .request(method, url) + .header(CONTENT_LENGTH, HeaderValue::from_static("0")) + .body(Bytes::new()); + + let response = builder + .with_get_options(options) + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + + Ok(response) + } +} + +#[async_trait] +impl ListClient for AzureClient { /// Make an Azure List request async fn list_request( &self, prefix: Option<&str>, delimiter: bool, token: Option<&str>, + offset: Option<&str>, ) -> Result<(ListResult, Option)> { + assert!(offset.is_none()); // Not yet supported + let credential = self.get_credential().await?; let url = self.config.path_url(&Path::default()); @@ -346,22 +358,6 @@ impl AzureClient { Ok((to_list_result(response, prefix)?, token)) } - - /// Perform a list operation automatically handling pagination - pub fn list_paginated( - &self, - prefix: Option<&Path>, - delimiter: bool, - ) -> BoxStream<'_, Result> { - let prefix = format_prefix(prefix); - stream_paginated(prefix, move |prefix, token| async move { - let (r, next_token) = self - .list_request(prefix.as_deref(), delimiter, token.as_deref()) - .await?; - Ok((r, prefix, next_token)) - }) - .boxed() - } } /// Raw / internal response from list requests diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 069b033d1896..d2735038321b 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -37,18 +37,19 @@ use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::Bytes; -use futures::{stream::BoxStream, StreamExt, TryStreamExt}; +use futures::stream::BoxStream; use percent_encoding::percent_decode_str; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::fmt::{Debug, Formatter}; use std::io; +use std::str::FromStr; use std::sync::Arc; -use std::{collections::BTreeSet, str::FromStr}; use tokio::io::AsyncWrite; use url::Url; -use crate::client::header::header_meta; +use crate::client::get::GetClientExt; +use crate::client::list::ListClientExt; use crate::client::{ ClientConfigKey, CredentialProvider, StaticCredentialProvider, TokenCredentialProvider, @@ -128,11 +129,6 @@ enum Error { #[snafu(display("ETag Header missing from response"))] MissingEtag, - - #[snafu(display("Failed to parse headers: {}", source))] - Header { - source: crate::client::header::Error, - }, } impl From for super::Error { @@ -204,25 +200,11 @@ impl ObjectStore for MicrosoftAzure { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - let response = self.client.get_request(location, options, false).await?; - let stream = response - .bytes_stream() - .map_err(|source| crate::Error::Generic { - store: STORE, - source: Box::new(source), - }) - .boxed(); - - Ok(GetResult::Stream(stream)) + self.client.get_opts(location, options).await } async fn head(&self, location: &Path) -> Result { - let options = GetOptions::default(); - - // Extract meta from headers - // https://docs.microsoft.com/en-us/rest/api/storageservices/get-blob-properties - let response = self.client.get_request(location, options, true).await?; - Ok(header_meta(location, response.headers()).context(HeaderSnafu)?) 
+ self.client.head(location).await } async fn delete(&self, location: &Path) -> Result<()> { @@ -233,32 +215,11 @@ impl ObjectStore for MicrosoftAzure { &self, prefix: Option<&Path>, ) -> Result>> { - let stream = self - .client - .list_paginated(prefix, false) - .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) - .try_flatten() - .boxed(); - - Ok(stream) + self.client.list(prefix).await } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - let mut stream = self.client.list_paginated(prefix, true); - - let mut common_prefixes = BTreeSet::new(); - let mut objects = Vec::new(); - - while let Some(result) = stream.next().await { - let response = result?; - common_prefixes.extend(response.common_prefixes.into_iter()); - objects.extend(response.objects.into_iter()); - } - - Ok(ListResult { - common_prefixes: common_prefixes.into_iter().collect(), - objects, - }) + self.client.list_with_delimiter(prefix).await } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { diff --git a/object_store/src/client/get.rs b/object_store/src/client/get.rs new file mode 100644 index 000000000000..3c66a72d82ed --- /dev/null +++ b/object_store/src/client/get.rs @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::header::header_meta; +use crate::path::Path; +use crate::Result; +use crate::{Error, GetOptions, GetResult, ObjectMeta}; +use async_trait::async_trait; +use futures::{StreamExt, TryStreamExt}; +use reqwest::Response; + +/// A client that can perform a get request +#[async_trait] +pub trait GetClient: Send + Sync + 'static { + const STORE: &'static str; + + async fn get_request( + &self, + path: &Path, + options: GetOptions, + head: bool, + ) -> Result; +} + +/// Extension trait for [`GetClient`] that adds common retrieval functionality +#[async_trait] +pub trait GetClientExt { + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result; + + async fn head(&self, location: &Path) -> Result; +} + +#[async_trait] +impl GetClientExt for T { + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let response = self.get_request(location, options, false).await?; + let stream = response + .bytes_stream() + .map_err(|source| Error::Generic { + store: T::STORE, + source: Box::new(source), + }) + .boxed(); + + Ok(GetResult::Stream(stream)) + } + + async fn head(&self, location: &Path) -> Result { + let options = GetOptions::default(); + let response = self.get_request(location, options, true).await?; + header_meta(location, response.headers()).map_err(|e| Error::Generic { + store: T::STORE, + source: Box::new(e), + }) + } +} diff --git a/object_store/src/client/list.rs b/object_store/src/client/list.rs index 6a3889e3be5b..b2dbee27f14d 100644 --- a/object_store/src/client/list.rs +++ b/object_store/src/client/list.rs @@ -1,3 +1,4 @@ +// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file @@ -14,72 +15,123 @@ // specific language governing permissions and limitations // under the License. -//! 
The list response format used by GCP and AWS - +use crate::client::pagination::stream_paginated; use crate::path::Path; -use crate::{ListResult, ObjectMeta, Result}; -use chrono::{DateTime, Utc}; -use serde::Deserialize; - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -pub struct ListResponse { - #[serde(default)] - pub contents: Vec, - #[serde(default)] - pub common_prefixes: Vec, - #[serde(default)] - pub next_continuation_token: Option, +use crate::Result; +use crate::{ListResult, ObjectMeta}; +use async_trait::async_trait; +use futures::stream::BoxStream; +use futures::{StreamExt, TryStreamExt}; +use std::collections::BTreeSet; + +/// A client that can perform paginated list requests +#[async_trait] +pub trait ListClient: Send + Sync + 'static { + async fn list_request( + &self, + prefix: Option<&str>, + delimiter: bool, + token: Option<&str>, + offset: Option<&str>, + ) -> Result<(ListResult, Option)>; } -impl TryFrom for ListResult { - type Error = crate::Error; +/// Extension trait for [`ListClient`] that adds common listing functionality +#[async_trait] +pub trait ListClientExt { + fn list_paginated( + &self, + prefix: Option<&Path>, + delimiter: bool, + offset: Option<&Path>, + ) -> BoxStream<'_, Result>; - fn try_from(value: ListResponse) -> Result { - let common_prefixes = value - .common_prefixes - .into_iter() - .map(|x| Ok(Path::parse(x.prefix)?)) - .collect::>()?; + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>>; - let objects = value - .contents - .into_iter() - .map(TryFrom::try_from) - .collect::>()?; + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>>; - Ok(Self { - common_prefixes, - objects, - }) - } + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result; } -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -pub struct ListPrefix { - pub prefix: String, -} +#[async_trait] +impl ListClientExt for T { + fn list_paginated( + &self, + prefix: Option<&Path>, + delimiter: bool, + offset: Option<&Path>, + ) -> BoxStream<'_, Result> { + let offset = offset.map(|x| x.to_string()); + let prefix = prefix + .filter(|x| !x.as_ref().is_empty()) + .map(|p| format!("{}{}", p.as_ref(), crate::path::DELIMITER)); -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -pub struct ListContents { - pub key: String, - pub size: usize, - pub last_modified: DateTime, - #[serde(rename = "ETag")] - pub e_tag: Option, -} + stream_paginated( + (prefix, offset), + move |(prefix, offset), token| async move { + let (r, next_token) = self + .list_request( + prefix.as_deref(), + delimiter, + token.as_deref(), + offset.as_deref(), + ) + .await?; + Ok((r, (prefix, offset), next_token)) + }, + ) + .boxed() + } -impl TryFrom for ObjectMeta { - type Error = crate::Error; + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + let stream = self + .list_paginated(prefix, false, None) + .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) + .try_flatten() + .boxed(); - fn try_from(value: ListContents) -> Result { - Ok(Self { - location: Path::parse(value.key)?, - last_modified: value.last_modified, - size: value.size, - e_tag: value.e_tag, + Ok(stream) + } + + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + let stream = self + .list_paginated(prefix, false, Some(offset)) + .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) + .try_flatten() + .boxed(); + + Ok(stream) + } + + async fn 
list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let mut stream = self.list_paginated(prefix, true, None); + + let mut common_prefixes = BTreeSet::new(); + let mut objects = Vec::new(); + + while let Some(result) = stream.next().await { + let response = result?; + common_prefixes.extend(response.common_prefixes.into_iter()); + objects.extend(response.objects.into_iter()); + } + + Ok(ListResult { + common_prefixes: common_prefixes.into_iter().collect(), + objects, }) } } diff --git a/object_store/src/client/list_response.rs b/object_store/src/client/list_response.rs new file mode 100644 index 000000000000..6a3889e3be5b --- /dev/null +++ b/object_store/src/client/list_response.rs @@ -0,0 +1,85 @@ +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! The list response format used by GCP and AWS + +use crate::path::Path; +use crate::{ListResult, ObjectMeta, Result}; +use chrono::{DateTime, Utc}; +use serde::Deserialize; + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListResponse { + #[serde(default)] + pub contents: Vec, + #[serde(default)] + pub common_prefixes: Vec, + #[serde(default)] + pub next_continuation_token: Option, +} + +impl TryFrom for ListResult { + type Error = crate::Error; + + fn try_from(value: ListResponse) -> Result { + let common_prefixes = value + .common_prefixes + .into_iter() + .map(|x| Ok(Path::parse(x.prefix)?)) + .collect::>()?; + + let objects = value + .contents + .into_iter() + .map(TryFrom::try_from) + .collect::>()?; + + Ok(Self { + common_prefixes, + objects, + }) + } +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListPrefix { + pub prefix: String, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct ListContents { + pub key: String, + pub size: usize, + pub last_modified: DateTime, + #[serde(rename = "ETag")] + pub e_tag: Option, +} + +impl TryFrom for ObjectMeta { + type Error = crate::Error; + + fn try_from(value: ListContents) -> Result { + Ok(Self { + location: Path::parse(value.key)?, + last_modified: value.last_modified, + size: value.size, + e_tag: value.e_tag, + }) + } +} diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 8c23576994fa..5f3a042be46a 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -20,9 +20,18 @@ pub mod backoff; #[cfg(test)] pub mod mock_server; + +pub mod retry; + #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod pagination; -pub mod retry; + +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +pub mod get; + +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +pub mod list; + #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod token; @@ -30,7 +39,7 @@ pub mod token; pub mod header; 
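[Editorial note, not part of the patch] With the new `get.rs`, `list.rs` and `list_response.rs` above, a backend only implements the one-shot `get_request`/`list_request`; the blanket `GetClientExt`/`ListClientExt` impls then supply `get_opts`, `head`, `list`, `list_with_offset` and `list_with_delimiter`. A minimal sketch of that split, using a hypothetical `MockClient` invented here for illustration; assuming the `client` module stays crate-private, this would have to live inside `object_store`, e.g. as a test in `src/client/list.rs`:

// Hypothetical in-crate mock, not part of this patch. It implements only the
// single-page `list_request`; pagination and aggregation come from the blanket
// `impl<T: ListClient> ListClientExt for T` added in this change.
use crate::client::list::{ListClient, ListClientExt};
use crate::path::Path;
use crate::{ListResult, Result};
use async_trait::async_trait;

#[derive(Debug)]
struct MockClient;

#[async_trait]
impl ListClient for MockClient {
    async fn list_request(
        &self,
        _prefix: Option<&str>,
        _delimiter: bool,
        _token: Option<&str>,
        _offset: Option<&str>,
    ) -> Result<(ListResult, Option<String>)> {
        // One empty page and no continuation token, so pagination stops after
        // a single request.
        Ok((ListResult { common_prefixes: vec![], objects: vec![] }, None))
    }
}

#[tokio::test]
async fn mock_client_lists() -> Result<()> {
    // `list_with_delimiter` is provided by `ListClientExt`, not by `MockClient`.
    let listed = MockClient.list_with_delimiter(Some(&Path::from("data"))).await?;
    assert!(listed.objects.is_empty() && listed.common_prefixes.is_empty());
    Ok(())
}

The S3, Azure and GCS clients below follow exactly this pattern, which is why their `get_opts`, `head` and `list*` methods in `ObjectStore` collapse to one-line delegations.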
#[cfg(any(feature = "aws", feature = "gcp"))] -pub mod list; +pub mod list_response; use async_trait::async_trait; use std::collections::HashMap; @@ -42,10 +51,9 @@ use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{Client, ClientBuilder, Proxy, RequestBuilder}; use serde::{Deserialize, Serialize}; -use crate::client::token::{TemporaryToken, TokenCache}; use crate::config::{fmt_duration, ConfigValue}; use crate::path::Path; -use crate::{GetOptions, Result, RetryConfig}; +use crate::{GetOptions, Result}; fn map_client_error(e: reqwest::Error) -> super::Error { super::Error::Generic { @@ -545,6 +553,8 @@ where #[cfg(any(feature = "aws", feature = "azure", feature = "gcp"))] mod cloud { use super::*; + use crate::client::token::{TemporaryToken, TokenCache}; + use crate::RetryConfig; /// A [`CredentialProvider`] that uses [`Client`] to fetch temporary tokens #[derive(Debug)] diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 21ba1588fbe8..7b1127354ccb 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -29,14 +29,13 @@ //! to abort the upload and drop those unneeded parts. In addition, you may wish to //! consider implementing automatic clean up of unused parts that are older than one //! week. -use std::collections::BTreeSet; use std::io; use std::str::FromStr; use std::sync::Arc; use async_trait::async_trait; use bytes::{Buf, Bytes}; -use futures::{stream::BoxStream, StreamExt, TryStreamExt}; +use futures::stream::BoxStream; use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest::{header, Client, Method, Response, StatusCode}; use serde::{Deserialize, Serialize}; @@ -44,9 +43,9 @@ use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; -use crate::client::header::header_meta; -use crate::client::list::ListResponse; -use crate::client::pagination::stream_paginated; +use crate::client::get::{GetClient, GetClientExt}; +use crate::client::list::{ListClient, ListClientExt}; +use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; use crate::client::{ ClientConfigKey, CredentialProvider, GetOptionsExt, StaticCredentialProvider, @@ -55,7 +54,6 @@ use crate::client::{ use crate::{ multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, path::{Path, DELIMITER}, - util::format_prefix, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, }; @@ -150,11 +148,6 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, - - #[snafu(display("Failed to parse headers: {}", source))] - Header { - source: crate::client::header::Error, - }, } impl From for super::Error { @@ -241,35 +234,6 @@ impl GoogleCloudStorageClient { format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, encoded) } - /// Perform a get request - async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result { - let credential = self.get_credential().await?; - let url = self.object_url(path); - - let method = match head { - true => Method::HEAD, - false => Method::GET, - }; - - let response = self - .client - .request(method, url) - .bearer_auth(&credential.bearer) - .with_get_options(options) - .send_retry(&self.retry_config) - .await - .context(GetRequestSnafu { - path: path.as_ref(), - })?; - - Ok(response) - } - /// Perform a put request async fn put_request(&self, path: &Path, payload: Bytes) -> Result<()> { let 
credential = self.get_credential().await?; @@ -409,14 +373,54 @@ impl GoogleCloudStorageClient { Ok(()) } +} +#[async_trait] +impl GetClient for GoogleCloudStorageClient { + const STORE: &'static str = STORE; + + /// Perform a get request + async fn get_request( + &self, + path: &Path, + options: GetOptions, + head: bool, + ) -> Result { + let credential = self.get_credential().await?; + let url = self.object_url(path); + + let method = match head { + true => Method::HEAD, + false => Method::GET, + }; + + let response = self + .client + .request(method, url) + .bearer_auth(&credential.bearer) + .with_get_options(options) + .send_retry(&self.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + + Ok(response) + } +} + +#[async_trait] +impl ListClient for GoogleCloudStorageClient { /// Perform a list request async fn list_request( &self, prefix: Option<&str>, delimiter: bool, page_token: Option<&str>, - ) -> Result { + offset: Option<&str>, + ) -> Result<(ListResult, Option)> { + assert!(offset.is_none()); // Not yet supported + let credential = self.get_credential().await?; let url = format!("{}/{}", self.base_url, self.bucket_name_encoded); @@ -450,27 +454,11 @@ impl GoogleCloudStorageClient { .await .context(ListResponseBodySnafu)?; - let response: ListResponse = quick_xml::de::from_reader(response.reader()) + let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) .context(InvalidListResponseSnafu)?; - Ok(response) - } - - /// Perform a list operation automatically handling pagination - fn list_paginated( - &self, - prefix: Option<&Path>, - delimiter: bool, - ) -> BoxStream<'_, Result> { - let prefix = format_prefix(prefix); - stream_paginated(prefix, move |prefix, token| async move { - let mut r = self - .list_request(prefix.as_deref(), delimiter, token.as_deref()) - .await?; - let next_token = r.next_continuation_token.take(); - Ok((r.try_into()?, prefix, next_token)) - }) - .boxed() + let token = response.next_continuation_token.take(); + Ok((response.try_into()?, token)) } } @@ -613,22 +601,11 @@ impl ObjectStore for GoogleCloudStorage { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - let response = self.client.get_request(location, options, false).await?; - let stream = response - .bytes_stream() - .map_err(|source| crate::Error::Generic { - store: STORE, - source: Box::new(source), - }) - .boxed(); - - Ok(GetResult::Stream(stream)) + self.client.get_opts(location, options).await } async fn head(&self, location: &Path) -> Result { - let options = GetOptions::default(); - let response = self.client.get_request(location, options, true).await?; - Ok(header_meta(location, response.headers()).context(HeaderSnafu)?) 
+ self.client.head(location).await } async fn delete(&self, location: &Path) -> Result<()> { @@ -639,32 +616,11 @@ impl ObjectStore for GoogleCloudStorage { &self, prefix: Option<&Path>, ) -> Result>> { - let stream = self - .client - .list_paginated(prefix, false) - .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) - .try_flatten() - .boxed(); - - Ok(stream) + self.client.list(prefix).await } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - let mut stream = self.client.list_paginated(prefix, true); - - let mut common_prefixes = BTreeSet::new(); - let mut objects = Vec::new(); - - while let Some(result) = stream.next().await { - let response = result?; - common_prefixes.extend(response.common_prefixes.into_iter()); - objects.extend(response.objects.into_iter()); - } - - Ok(ListResult { - common_prefixes: common_prefixes.into_iter().collect(), - objects, - }) + self.client.list_with_delimiter(prefix).await } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { diff --git a/object_store/src/util.rs b/object_store/src/util.rs index ba4c68345d73..79ca4bb7a834 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -36,14 +36,6 @@ where .map_err(serde::de::Error::custom) } -/// Returns the prefix to be passed to an object store -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] -pub fn format_prefix(prefix: Option<&crate::path::Path>) -> Option { - prefix - .filter(|x| !x.as_ref().is_empty()) - .map(|p| format!("{}{}", p.as_ref(), crate::path::DELIMITER)) -} - #[cfg(any(feature = "aws", feature = "azure"))] pub(crate) fn hmac_sha256( secret: impl AsRef<[u8]>, From 077328c5cacc843cbbdb8757e67dbc57b338d5f2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 18 May 2023 09:51:03 +0100 Subject: [PATCH 0914/1411] Prepare object_store 0.6.0 (#4241) --- object_store/CHANGELOG-old.md | 38 +++++++++ object_store/CHANGELOG.md | 80 ++++++++++++------- object_store/Cargo.toml | 2 +- object_store/dev/release/update_change_log.sh | 4 +- 4 files changed, 94 insertions(+), 30 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index cc9453b321bc..c9c4e28dca05 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -19,6 +19,44 @@ # Historical Changelog +## [object_store_0.5.6](https://github.com/apache/arrow-rs/tree/object_store_0.5.6) (2023-03-30) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.5...object_store_0.5.6) + +**Implemented enhancements:** + +- Document ObjectStore::list Ordering [\#3975](https://github.com/apache/arrow-rs/issues/3975) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add option to start listing at a particular key [\#3970](https://github.com/apache/arrow-rs/issues/3970) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Implement `ObjectStore` for trait objects [\#3865](https://github.com/apache/arrow-rs/issues/3865) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add ObjectStore::append [\#3790](https://github.com/apache/arrow-rs/issues/3790) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Make `InMemory` object store track last modified time for each entry [\#3782](https://github.com/apache/arrow-rs/issues/3782) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support Unsigned S3 Payloads 
[\#3737](https://github.com/apache/arrow-rs/issues/3737) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add Content-MD5 or checksum header for using an Object Locked S3 [\#3725](https://github.com/apache/arrow-rs/issues/3725) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- LocalFileSystem::put is not Atomic [\#3780](https://github.com/apache/arrow-rs/issues/3780) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Add ObjectStore::list\_with\_offset \(\#3970\) [\#3973](https://github.com/apache/arrow-rs/pull/3973) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Remove incorrect validation logic on S3 bucket names [\#3947](https://github.com/apache/arrow-rs/pull/3947) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([rtyler](https://github.com/rtyler)) +- Prepare arrow 36 [\#3935](https://github.com/apache/arrow-rs/pull/3935) ([tustvold](https://github.com/tustvold)) +- fix: Specify content length for gcp copy request [\#3921](https://github.com/apache/arrow-rs/pull/3921) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([scsmithr](https://github.com/scsmithr)) +- Revert structured ArrayData \(\#3877\) [\#3894](https://github.com/apache/arrow-rs/pull/3894) ([tustvold](https://github.com/tustvold)) +- Add support for checksum algorithms in AWS [\#3873](https://github.com/apache/arrow-rs/pull/3873) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trueleo](https://github.com/trueleo)) +- Rename PrefixObjectStore to PrefixStore [\#3870](https://github.com/apache/arrow-rs/pull/3870) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Implement append for LimitStore, PrefixObjectStore, ThrottledStore [\#3869](https://github.com/apache/arrow-rs/pull/3869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Supporting metadata fetch without open file read mode [\#3868](https://github.com/apache/arrow-rs/pull/3868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([metesynnada](https://github.com/metesynnada)) +- Impl ObjectStore for trait object [\#3866](https://github.com/apache/arrow-rs/pull/3866) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Kinrany](https://github.com/Kinrany)) +- Update quick-xml requirement from 0.27.0 to 0.28.0 [\#3857](https://github.com/apache/arrow-rs/pull/3857) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update changelog for 35.0.0 [\#3843](https://github.com/apache/arrow-rs/pull/3843) ([tustvold](https://github.com/tustvold)) +- Cleanup ApplicationDefaultCredentials [\#3799](https://github.com/apache/arrow-rs/pull/3799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make InMemory object store track last modified time for each entry [\#3796](https://github.com/apache/arrow-rs/pull/3796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Weijun-H](https://github.com/Weijun-H)) +- Add ObjectStore::append [\#3791](https://github.com/apache/arrow-rs/pull/3791) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([tustvold](https://github.com/tustvold)) +- Make LocalFileSystem::put atomic \(\#3780\) [\#3781](https://github.com/apache/arrow-rs/pull/3781) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add support for unsigned payloads in aws [\#3741](https://github.com/apache/arrow-rs/pull/3741) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trueleo](https://github.com/trueleo)) + ## [object_store_0.5.5](https://github.com/apache/arrow-rs/tree/object_store_0.5.5) (2023-02-27) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.4...object_store_0.5.5) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index b26ae7180004..bde0f752323e 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,43 +19,69 @@ # Changelog -## [object_store_0.5.6](https://github.com/apache/arrow-rs/tree/object_store_0.5.6) (2023-03-30) +## [object_store_0.6.0](https://github.com/apache/arrow-rs/tree/object_store_0.6.0) (2023-05-18) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.5...object_store_0.5.6) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.6...object_store_0.6.0) + +**Breaking changes:** + +- Add ObjectStore::get\_opts \(\#2241\) [\#4212](https://github.com/apache/arrow-rs/pull/4212) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Simplify ObjectStore configuration pattern [\#4189](https://github.com/apache/arrow-rs/pull/4189) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: fix: Incorrect parsing of https Path Style S3 url [\#4082](https://github.com/apache/arrow-rs/pull/4082) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- feat: add etag for objectMeta [\#3937](https://github.com/apache/arrow-rs/pull/3937) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Weijun-H](https://github.com/Weijun-H)) **Implemented enhancements:** -- Document ObjectStore::list Ordering [\#3975](https://github.com/apache/arrow-rs/issues/3975) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add option to start listing at a particular key [\#3970](https://github.com/apache/arrow-rs/issues/3970) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Implement `ObjectStore` for trait objects [\#3865](https://github.com/apache/arrow-rs/issues/3865) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add ObjectStore::append [\#3790](https://github.com/apache/arrow-rs/issues/3790) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Make `InMemory` object store track last modified time for each entry [\#3782](https://github.com/apache/arrow-rs/issues/3782) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support Unsigned S3 Payloads [\#3737](https://github.com/apache/arrow-rs/issues/3737) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add Content-MD5 or checksum header for using an Object Locked S3 [\#3725](https://github.com/apache/arrow-rs/issues/3725) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Object Store Authorization [\#4223](https://github.com/apache/arrow-rs/issues/4223) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Use XML API for GCS [\#4209](https://github.com/apache/arrow-rs/issues/4209) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ObjectStore with\_url Should Handle Path [\#4199](https://github.com/apache/arrow-rs/issues/4199) +- Return Error on Invalid Config Value [\#4191](https://github.com/apache/arrow-rs/issues/4191) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Extensible ObjectStore Authentication [\#4163](https://github.com/apache/arrow-rs/issues/4163) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: When using an AWS profile, obtain the default AWS region from the active profile [\#4158](https://github.com/apache/arrow-rs/issues/4158) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- InMemory append API [\#4152](https://github.com/apache/arrow-rs/issues/4152) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support accessing ipc Reader/Writer inner by reference [\#4121](https://github.com/apache/arrow-rs/issues/4121) +- \[object\_store\] Retry requests on connection error [\#4119](https://github.com/apache/arrow-rs/issues/4119) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Instantiate object store from provided url with store options [\#4047](https://github.com/apache/arrow-rs/issues/4047) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Builders \(S3/Azure/GCS\) are missing the `get method` to get the actual configuration information [\#4021](https://github.com/apache/arrow-rs/issues/4021) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- LocalFileSystem::put is not Atomic [\#3780](https://github.com/apache/arrow-rs/issues/3780) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ObjectStore::head Returns Directory for LocalFileSystem and Hierarchical Azure [\#4230](https://github.com/apache/arrow-rs/issues/4230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: different behavior from aws cli for default profile [\#4137](https://github.com/apache/arrow-rs/issues/4137) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ImdsManagedIdentityOAuthProvider should send resource ID instead of OIDC scope [\#4096](https://github.com/apache/arrow-rs/issues/4096) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Update readme to remove reference to Jira [\#4091](https://github.com/apache/arrow-rs/issues/4091) +- object\_store: Incorrect parsing of https Path Style S3 url [\#4078](https://github.com/apache/arrow-rs/issues/4078) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] `local::tests::test_list_root` test fails during release verification [\#3772](https://github.com/apache/arrow-rs/issues/3772) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Add ObjectStore::list\_with\_offset \(\#3970\) [\#3973](https://github.com/apache/arrow-rs/pull/3973) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Remove incorrect validation logic on S3 bucket names [\#3947](https://github.com/apache/arrow-rs/pull/3947) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([rtyler](https://github.com/rtyler)) -- Prepare arrow 36 [\#3935](https://github.com/apache/arrow-rs/pull/3935) ([tustvold](https://github.com/tustvold)) -- fix: Specify content length for gcp copy request [\#3921](https://github.com/apache/arrow-rs/pull/3921) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([scsmithr](https://github.com/scsmithr)) -- Revert structured ArrayData \(\#3877\) [\#3894](https://github.com/apache/arrow-rs/pull/3894) ([tustvold](https://github.com/tustvold)) -- Add support for checksum algorithms in AWS [\#3873](https://github.com/apache/arrow-rs/pull/3873) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trueleo](https://github.com/trueleo)) -- Rename PrefixObjectStore to PrefixStore [\#3870](https://github.com/apache/arrow-rs/pull/3870) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Implement append for LimitStore, PrefixObjectStore, ThrottledStore [\#3869](https://github.com/apache/arrow-rs/pull/3869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Supporting metadata fetch without open file read mode [\#3868](https://github.com/apache/arrow-rs/pull/3868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([metesynnada](https://github.com/metesynnada)) -- Impl ObjectStore for trait object [\#3866](https://github.com/apache/arrow-rs/pull/3866) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Kinrany](https://github.com/Kinrany)) -- Update quick-xml requirement from 0.27.0 to 0.28.0 [\#3857](https://github.com/apache/arrow-rs/pull/3857) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Update changelog for 35.0.0 [\#3843](https://github.com/apache/arrow-rs/pull/3843) ([tustvold](https://github.com/tustvold)) -- Cleanup ApplicationDefaultCredentials [\#3799](https://github.com/apache/arrow-rs/pull/3799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Make InMemory object store track last modified time for each entry [\#3796](https://github.com/apache/arrow-rs/pull/3796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Weijun-H](https://github.com/Weijun-H)) -- Add ObjectStore::append [\#3791](https://github.com/apache/arrow-rs/pull/3791) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Make LocalFileSystem::put atomic \(\#3780\) [\#3781](https://github.com/apache/arrow-rs/pull/3781) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Add support for unsigned payloads in aws [\#3741](https://github.com/apache/arrow-rs/pull/3741) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([trueleo](https://github.com/trueleo)) +- Remove AWS\_PROFILE support [\#4238](https://github.com/apache/arrow-rs/pull/4238) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Expose AwsAuthorizer [\#4237](https://github.com/apache/arrow-rs/pull/4237) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Expose 
CredentialProvider [\#4235](https://github.com/apache/arrow-rs/pull/4235) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Return NotFound for directories in Head and Get \(\#4230\) [\#4231](https://github.com/apache/arrow-rs/pull/4231) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Standardise credentials API \(\#4223\) \(\#4163\) [\#4225](https://github.com/apache/arrow-rs/pull/4225) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Extract Common Listing and Retrieval Functionality [\#4220](https://github.com/apache/arrow-rs/pull/4220) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat\(object-store\): extend Options API for http client [\#4208](https://github.com/apache/arrow-rs/pull/4208) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Consistently use GCP XML API [\#4207](https://github.com/apache/arrow-rs/pull/4207) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Implement list\_with\_offset for PrefixStore [\#4203](https://github.com/apache/arrow-rs/pull/4203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Allow setting ClientOptions with Options API [\#4202](https://github.com/apache/arrow-rs/pull/4202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Create ObjectStore from URL and Options \(\#4047\) [\#4200](https://github.com/apache/arrow-rs/pull/4200) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Skip test\_list\_root on OS X \(\#3772\) [\#4198](https://github.com/apache/arrow-rs/pull/4198) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Recognise R2 URLs for S3 object store \(\#4190\) [\#4194](https://github.com/apache/arrow-rs/pull/4194) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix ImdsManagedIdentityProvider \(\#4096\) [\#4193](https://github.com/apache/arrow-rs/pull/4193) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Deffered Object Store Config Parsing \(\#4191\) [\#4192](https://github.com/apache/arrow-rs/pull/4192) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Object Store \(AWS\): Support dynamically resolving S3 bucket region [\#4188](https://github.com/apache/arrow-rs/pull/4188) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mr-brobot](https://github.com/mr-brobot)) +- Faster prefix match in object\_store path handling [\#4164](https://github.com/apache/arrow-rs/pull/4164) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Object Store \(AWS\): Support region configured via named profile [\#4161](https://github.com/apache/arrow-rs/pull/4161) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mr-brobot](https://github.com/mr-brobot)) +- InMemory append 
API [\#4153](https://github.com/apache/arrow-rs/pull/4153) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([berkaysynnada](https://github.com/berkaysynnada)) +- docs: fix the wrong ln command in CONTRIBUTING.md [\#4139](https://github.com/apache/arrow-rs/pull/4139) ([SteveLauC](https://github.com/SteveLauC)) +- Display the file path in the error message when failed to open credentials file for GCS [\#4124](https://github.com/apache/arrow-rs/pull/4124) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([haoxins](https://github.com/haoxins)) +- Retry on Connection Errors [\#4120](https://github.com/apache/arrow-rs/pull/4120) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kindly](https://github.com/kindly)) +- Simplify reference to GitHub issues [\#4092](https://github.com/apache/arrow-rs/pull/4092) ([bkmgit](https://github.com/bkmgit)) +- Use reqwest build\_split [\#4039](https://github.com/apache/arrow-rs/pull/4039) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix object\_store CI [\#4037](https://github.com/apache/arrow-rs/pull/4037) ([tustvold](https://github.com/tustvold)) +- Add get\_config\_value to AWS/Azure/GCP Builders [\#4035](https://github.com/apache/arrow-rs/pull/4035) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([r4ntix](https://github.com/r4ntix)) +- Update AWS SDK [\#3993](https://github.com/apache/arrow-rs/pull/3993) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index bd9c973e052a..1fb988642dda 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.5.6" +version = "0.6.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index b69d36f8456c..60906307ecf7 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.5" -FUTURE_RELEASE="object_store_0.5.6" +SINCE_TAG="object_store_0.5.6" +FUTURE_RELEASE="object_store_0.6.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From ec7706c1f2aeef5a289e46d1df7785e5c93e6bfb Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 18 May 2023 10:14:50 +0100 Subject: [PATCH 0915/1411] Fix merge conflict from #4220 (#4242) --- object_store/src/azure/client.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 868a803e92d7..5ed6f2443f32 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -188,19 +188,7 @@ impl AzureClient { path: path.as_ref(), })?; - match response.headers().get("x-ms-resource-type") { - Some(resource) if resource.as_ref() != b"file" => { - Err(crate::Error::NotFound { - path: path.to_string(), - source: format!( - "Not a file, got x-ms-resource-type: {}", - String::from_utf8_lossy(resource.as_ref()) - ) - .into(), - }) - } - _ => Ok(response), - } + Ok(response) } /// Make an Azure Delete request @@ -304,7 +292,19 @@ impl GetClient for AzureClient { path: 
path.as_ref(), })?; - Ok(response) + match response.headers().get("x-ms-resource-type") { + Some(resource) if resource.as_ref() != b"file" => { + Err(crate::Error::NotFound { + path: path.to_string(), + source: format!( + "Not a file, got x-ms-resource-type: {}", + String::from_utf8_lossy(resource.as_ref()) + ) + .into(), + }) + } + _ => Ok(response), + } } } From 5d8b39273f96e643739991dbc46e4fb71b462156 Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Fri, 19 May 2023 11:24:44 +0300 Subject: [PATCH 0916/1411] feat: Support bitwise and boolean aggregate functions (#4210) * feat: Support bitwise and boolean aggregate functions * fix: clippy * feat: macro_rules for bit operations * fix: SIMD and docs --- arrow-arith/src/aggregate.rs | 201 +++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 2833300ddc07..4961d7efc0f2 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -24,6 +24,7 @@ use arrow_buffer::ArrowNativeType; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_schema::ArrowError; use arrow_schema::*; +use std::ops::{BitAnd, BitOr, BitXor}; /// Generic test for NaN, the optimizer should be able to remove this for integer types. #[inline] @@ -326,6 +327,115 @@ where } } +macro_rules! bit_operation { + ($NAME:ident, $OP:ident, $NATIVE:ident, $DEFAULT:expr, $DOC:expr) => { + #[doc = $DOC] + /// + /// Returns `None` if the array is empty or only contains null values. + pub fn $NAME(array: &PrimitiveArray) -> Option + where + T: ArrowNumericType, + T::Native: $NATIVE + ArrowNativeTypeOp, + { + let default; + if $DEFAULT == -1 { + default = T::Native::ONE.neg_wrapping(); + } else { + default = T::default_value(); + } + + let null_count = array.null_count(); + + if null_count == array.len() { + return None; + } + + let data: &[T::Native] = array.values(); + + match array.nulls() { + None => { + let result = data + .iter() + .fold(default, |accumulator, value| accumulator.$OP(*value)); + + Some(result) + } + Some(nulls) => { + let mut result = default; + let data_chunks = data.chunks_exact(64); + let remainder = data_chunks.remainder(); + + let bit_chunks = nulls.inner().bit_chunks(); + data_chunks + .zip(bit_chunks.iter()) + .for_each(|(chunk, mask)| { + // index_mask has value 1 << i in the loop + let mut index_mask = 1; + chunk.iter().for_each(|value| { + if (mask & index_mask) != 0 { + result = result.$OP(*value); + } + index_mask <<= 1; + }); + }); + + let remainder_bits = bit_chunks.remainder_bits(); + + remainder.iter().enumerate().for_each(|(i, value)| { + if remainder_bits & (1 << i) != 0 { + result = result.$OP(*value); + } + }); + + Some(result) + } + } + } + }; +} + +bit_operation!( + bit_and, + bitand, + BitAnd, + -1, + "Returns the bitwise and of all non-null input values." +); +bit_operation!( + bit_or, + bitor, + BitOr, + 0, + "Returns the bitwise or of all non-null input values." +); +bit_operation!( + bit_xor, + bitxor, + BitXor, + 0, + "Returns the bitwise xor of all non-null input values." +); + +/// Returns true if all non-null input values are true, otherwise false. +/// +/// Returns `None` if the array is empty or only contains null values. +pub fn bool_and(array: &BooleanArray) -> Option { + if array.null_count() == array.len() { + return None; + } + Some(array.false_count() == 0) +} + +/// Returns true if any non-null input value is true, otherwise false. +/// +/// Returns `None` if the array is empty or only contains null values. 
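[Editorial note, not part of the patch] The new `bit_and`, `bit_or`, `bit_xor`, `bool_and` and `bool_or` kernels (the definition of `bool_or` continues just below) skip null slots and return `None` for empty or all-null input. A minimal usage sketch, assuming the `arrow-arith` and `arrow-array` crates from this tree:

// Illustrative only; mirrors the behaviour exercised by the tests added in
// this commit.
use arrow_arith::aggregate::{bit_and, bit_or, bit_xor, bool_and, bool_or};
use arrow_array::{BooleanArray, Int32Array};

fn main() {
    // Null slots are skipped; only non-null values participate.
    let ints = Int32Array::from(vec![Some(0b1100), None, Some(0b1010)]);
    assert_eq!(bit_and(&ints), Some(0b1000));
    assert_eq!(bit_or(&ints), Some(0b1110));
    assert_eq!(bit_xor(&ints), Some(0b0110));

    let bools = BooleanArray::from(vec![Some(true), None, Some(false)]);
    assert_eq!(bool_and(&bools), Some(false)); // one false => false
    assert_eq!(bool_or(&bools), Some(true));   // one true  => true

    // All-null (or empty) input yields None for every kernel.
    let all_null = Int32Array::from(vec![None, None]);
    assert_eq!(bit_xor(&all_null), None);
}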
+pub fn bool_or(array: &BooleanArray) -> Option { + if array.null_count() == array.len() { + return None; + } + Some(array.true_count() != 0) +} + /// Returns the sum of values in the primitive array. /// /// Returns `Ok(None)` if the array is empty or only contains null values. @@ -838,6 +948,97 @@ mod tests { assert_eq!(Some((1..=100).filter(|i| i % 33 == 0).sum()), sum(&c)); } + #[test] + fn test_primitive_array_bit_and() { + let a = Int32Array::from(vec![1, 2, 3, 4, 5]); + assert_eq!(0, bit_and(&a).unwrap()); + } + + #[test] + fn test_primitive_array_bit_and_with_nulls() { + let a = Int32Array::from(vec![None, Some(2), Some(3), None, None]); + assert_eq!(2, bit_and(&a).unwrap()); + } + + #[test] + fn test_primitive_array_bit_and_all_nulls() { + let a = Int32Array::from(vec![None, None, None]); + assert_eq!(None, bit_and(&a)); + } + + #[test] + fn test_primitive_array_bit_or() { + let a = Int32Array::from(vec![1, 2, 3, 4, 5]); + assert_eq!(7, bit_or(&a).unwrap()); + } + + #[test] + fn test_primitive_array_bit_or_with_nulls() { + let a = Int32Array::from(vec![None, Some(2), Some(3), None, Some(5)]); + assert_eq!(7, bit_or(&a).unwrap()); + } + + #[test] + fn test_primitive_array_bit_or_all_nulls() { + let a = Int32Array::from(vec![None, None, None]); + assert_eq!(None, bit_or(&a)); + } + + #[test] + fn test_primitive_array_bit_xor() { + let a = Int32Array::from(vec![1, 2, 3, 4, 5]); + assert_eq!(1, bit_xor(&a).unwrap()); + } + + #[test] + fn test_primitive_array_bit_xor_with_nulls() { + let a = Int32Array::from(vec![None, Some(2), Some(3), None, Some(5)]); + assert_eq!(4, bit_xor(&a).unwrap()); + } + + #[test] + fn test_primitive_array_bit_xor_all_nulls() { + let a = Int32Array::from(vec![None, None, None]); + assert_eq!(None, bit_xor(&a)); + } + + #[test] + fn test_primitive_array_bool_and() { + let a = BooleanArray::from(vec![true, false, true, false, true]); + assert!(!bool_and(&a).unwrap()); + } + + #[test] + fn test_primitive_array_bool_and_with_nulls() { + let a = BooleanArray::from(vec![None, Some(true), Some(true), None, Some(true)]); + assert!(bool_and(&a).unwrap()); + } + + #[test] + fn test_primitive_array_bool_and_all_nulls() { + let a = BooleanArray::from(vec![None, None, None]); + assert_eq!(None, bool_and(&a)); + } + + #[test] + fn test_primitive_array_bool_or() { + let a = BooleanArray::from(vec![true, false, true, false, true]); + assert!(bool_or(&a).unwrap()); + } + + #[test] + fn test_primitive_array_bool_or_with_nulls() { + let a = + BooleanArray::from(vec![None, Some(false), Some(false), None, Some(false)]); + assert!(!bool_or(&a).unwrap()); + } + + #[test] + fn test_primitive_array_bool_or_all_nulls() { + let a = BooleanArray::from(vec![None, None, None]); + assert_eq!(None, bool_or(&a)); + } + #[test] fn test_primitive_array_min_max() { let a = Int32Array::from(vec![5, 6, 7, 8, 9]); From ed16d9f8c0dc29b1019d20cfde8b874c22dd838d Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Fri, 19 May 2023 01:25:39 -0700 Subject: [PATCH 0917/1411] Arrow Arithmetic: Subtract timestamps (#4244) * feat(arith): subtract timestamps * feat(arith): checked and unchecked subtraction for timestamps * feat(arith): use closure for ts sub --- arrow-arith/src/arithmetic.rs | 218 +++++++++++++++++++++++++++++++++- 1 file changed, 216 insertions(+), 2 deletions(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 42f6e3974301..c3c5cb864ed2 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -1096,13 +1096,17 @@ pub fn 
subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { + let r = right.as_primitive::(); + let res: PrimitiveArray = binary(l, r, |a, b| a.wrapping_sub(b))?; + Ok(Arc::new(res)) + } _ => Err(ArrowError::CastError(format!( "Cannot perform arithmetic operation between array of type {} and array of type {}", left.data_type(), right.data_type() ))), } } - DataType::Timestamp(TimeUnit::Microsecond, _) => { let l = left.as_primitive::(); match right.data_type() { @@ -1121,6 +1125,11 @@ pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { + let r = right.as_primitive::(); + let res: PrimitiveArray = binary(l, r, |a, b| a.wrapping_sub(b))?; + Ok(Arc::new(res)) + } _ => Err(ArrowError::CastError(format!( "Cannot perform arithmetic operation between array of type {} and array of type {}", left.data_type(), right.data_type() @@ -1145,13 +1154,17 @@ pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { + let r = right.as_primitive::(); + let res: PrimitiveArray = binary(l, r, |a, b| a.wrapping_sub(b))?; + Ok(Arc::new(res)) + } _ => Err(ArrowError::CastError(format!( "Cannot perform arithmetic operation between array of type {} and array of type {}", left.data_type(), right.data_type() ))), } } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { let l = left.as_primitive::(); match right.data_type() { @@ -1170,6 +1183,11 @@ pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { + let r = right.as_primitive::(); + let res: PrimitiveArray = binary(l, r, |a, b| a.wrapping_sub(b))?; + Ok(Arc::new(res)) + } _ => Err(ArrowError::CastError(format!( "Cannot perform arithmetic operation between array of type {} and array of type {}", left.data_type(), right.data_type() @@ -1256,6 +1274,62 @@ pub fn subtract_dyn_checked( ))), } } + DataType::Timestamp(TimeUnit::Second, _) => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Timestamp(TimeUnit::Second, _) => { + let r = right.as_primitive::(); + let res: PrimitiveArray = try_binary(l, r, |a, b| a.sub_checked(b))?; + Ok(Arc::new(res)) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Timestamp(TimeUnit::Microsecond, _) => { + let r = right.as_primitive::(); + let res: PrimitiveArray = try_binary(l, r, |a, b| a.sub_checked(b))?; + Ok(Arc::new(res)) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Timestamp(TimeUnit::Millisecond, _) => { + let r = right.as_primitive::(); + let res: PrimitiveArray = try_binary(l, r, |a, b| a.sub_checked(b))?; + Ok(Arc::new(res)) + } + _ => Err(ArrowError::CastError(format!( + "Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + let l = left.as_primitive::(); + match right.data_type() { + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + let r = right.as_primitive::(); + let res: PrimitiveArray = try_binary(l, r, |a, b| a.sub_checked(b))?; + Ok(Arc::new(res)) + } + _ => Err(ArrowError::CastError(format!( + 
"Cannot perform arithmetic operation between array of type {} and array of type {}", + left.data_type(), right.data_type() + ))), + } + } _ => { downcast_primitive_array!( (left, right) => { @@ -4649,4 +4723,144 @@ mod tests { ]); assert_eq!(&expected, result); } + + #[test] + fn test_timestamp_second_subtract_timestamp() { + let a = TimestampSecondArray::from(vec![0, 2, 4, 6, 8]); + let b = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); + let expected = DurationSecondArray::from(vec![-1, 0, 1, 2, 3]); + + // unchecked + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + assert_eq!(&expected, result); + + // checked + let result = subtract_dyn_checked(&a, &b).unwrap(); + let result = result.as_primitive::(); + assert_eq!(&expected, result); + } + + #[test] + fn test_timestamp_second_subtract_timestamp_overflow() { + let a = TimestampSecondArray::from(vec![ + ::Native::MAX, + ]); + let b = TimestampSecondArray::from(vec![ + ::Native::MIN, + ]); + + // unchecked + let result = subtract_dyn(&a, &b); + assert!(!&result.is_err()); + + // checked + let result = subtract_dyn_checked(&a, &b); + assert!(&result.is_err()); + } + + #[test] + fn test_timestamp_microsecond_subtract_timestamp() { + let a = TimestampMicrosecondArray::from(vec![0, 2, 4, 6, 8]); + let b = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); + let expected = DurationMicrosecondArray::from(vec![-1, 0, 1, 2, 3]); + + // unchecked + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + assert_eq!(&expected, result); + + // checked + let result = subtract_dyn_checked(&a, &b).unwrap(); + let result = result.as_primitive::(); + assert_eq!(&expected, result); + } + + #[test] + fn test_timestamp_microsecond_subtract_timestamp_overflow() { + let a = TimestampMicrosecondArray::from(vec![ + ::Native::MAX, + ]); + let b = TimestampMicrosecondArray::from(vec![ + ::Native::MIN, + ]); + + // unchecked + let result = subtract_dyn(&a, &b); + assert!(!&result.is_err()); + + // checked + let result = subtract_dyn_checked(&a, &b); + assert!(&result.is_err()); + } + + #[test] + fn test_timestamp_millisecond_subtract_timestamp() { + let a = TimestampMillisecondArray::from(vec![0, 2, 4, 6, 8]); + let b = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); + let expected = DurationMillisecondArray::from(vec![-1, 0, 1, 2, 3]); + + // unchecked + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + assert_eq!(&expected, result); + + // checked + let result = subtract_dyn_checked(&a, &b).unwrap(); + let result = result.as_primitive::(); + assert_eq!(&expected, result); + } + + #[test] + fn test_timestamp_millisecond_subtract_timestamp_overflow() { + let a = TimestampMillisecondArray::from(vec![ + ::Native::MAX, + ]); + let b = TimestampMillisecondArray::from(vec![ + ::Native::MIN, + ]); + + // unchecked + let result = subtract_dyn(&a, &b); + assert!(!&result.is_err()); + + // checked + let result = subtract_dyn_checked(&a, &b); + assert!(&result.is_err()); + } + + #[test] + fn test_timestamp_nanosecond_subtract_timestamp() { + let a = TimestampNanosecondArray::from(vec![0, 2, 4, 6, 8]); + let b = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); + let expected = DurationNanosecondArray::from(vec![-1, 0, 1, 2, 3]); + + // unchecked + let result = subtract_dyn(&a, &b).unwrap(); + let result = result.as_primitive::(); + assert_eq!(&expected, result); + + // checked + let result = subtract_dyn_checked(&a, &b).unwrap(); + let result = 
result.as_primitive::(); + assert_eq!(&expected, result); + } + + #[test] + fn test_timestamp_nanosecond_subtract_timestamp_overflow() { + let a = TimestampNanosecondArray::from(vec![ + ::Native::MAX, + ]); + let b = TimestampNanosecondArray::from(vec![ + ::Native::MIN, + ]); + + // unchecked + let result = subtract_dyn(&a, &b); + assert!(!&result.is_err()); + + // checked + let result = subtract_dyn_checked(&a, &b); + assert!(&result.is_err()); + } } From 25bfccca58ff219d9f59ba9f4d75550493238a4f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 19 May 2023 13:39:20 +0100 Subject: [PATCH 0918/1411] Prepare 40.0.0 release (#4245) --- CHANGELOG-old.md | 63 ++++++++++++++++++++++++++ CHANGELOG.md | 77 ++++++++++++-------------------- Cargo.toml | 32 ++++++------- dev/release/update_change_log.sh | 4 +- 4 files changed, 109 insertions(+), 67 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index fa932b103615..da72626d86cf 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,69 @@ # Historical Changelog +## [39.0.0](https://github.com/apache/arrow-rs/tree/39.0.0) (2023-05-05) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/38.0.0...39.0.0) + +**Breaking changes:** + +- Allow creating unbuffered streamreader [\#4165](https://github.com/apache/arrow-rs/pull/4165) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ming08108](https://github.com/ming08108)) +- Cleanup ChunkReader \(\#4118\) [\#4156](https://github.com/apache/arrow-rs/pull/4156) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Remove Type from NativeIndex [\#4146](https://github.com/apache/arrow-rs/pull/4146) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Don't Duplicate Offset Index on RowGroupMetadata [\#4142](https://github.com/apache/arrow-rs/pull/4142) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Return BooleanBuffer from BooleanBufferBuilder [\#4140](https://github.com/apache/arrow-rs/pull/4140) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup CSV schema inference \(\#4129\) \(\#4130\) [\#4133](https://github.com/apache/arrow-rs/pull/4133) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Remove deprecated parquet ArrowReader [\#4125](https://github.com/apache/arrow-rs/pull/4125) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- refactor: construct `StructArray` w/ `FieldRef` [\#4116](https://github.com/apache/arrow-rs/pull/4116) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Ignore Field Metadata in equals\_datatype for Dictionary, RunEndEncoded, Map and Union [\#4111](https://github.com/apache/arrow-rs/pull/4111) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Add StructArray Constructors \(\#3879\) [\#4064](https://github.com/apache/arrow-rs/pull/4064) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + 
+**Implemented enhancements:** + +- Release 39.0.0 of arrow/arrow-flight/parquet/parquet-derive \(next release after 38.0.0\) [\#4170](https://github.com/apache/arrow-rs/issues/4170) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Fixed point decimal multiplication for DictionaryArray [\#4135](https://github.com/apache/arrow-rs/issues/4135) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Remove Seek Requirement from CSV ReaderBuilder [\#4130](https://github.com/apache/arrow-rs/issues/4130) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Inconsistent CSV Inference and Parsing DateTime Handling [\#4129](https://github.com/apache/arrow-rs/issues/4129) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support accessing ipc Reader/Writer inner by reference [\#4121](https://github.com/apache/arrow-rs/issues/4121) +- Add Type Declarations for All Primitive Tensors and Buffer Builders [\#4112](https://github.com/apache/arrow-rs/issues/4112) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `Interval + Timestamp` and `Interval + Date` in addition to `Timestamp + Interval` and `Interval + Date` [\#4094](https://github.com/apache/arrow-rs/issues/4094) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Enable setting FlightDescriptor on FlightDataEncoderBuilder [\#3855](https://github.com/apache/arrow-rs/issues/3855) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Fixed bugs:** + +- Parquet Page Index Reader Assumes Consecutive Offsets [\#4149](https://github.com/apache/arrow-rs/issues/4149) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Equality of nested data types [\#4110](https://github.com/apache/arrow-rs/issues/4110) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Improve Documentation of Parquet ChunkReader [\#4118](https://github.com/apache/arrow-rs/issues/4118) + +**Closed issues:** + +- add specific error log for empty JSON array [\#4105](https://github.com/apache/arrow-rs/issues/4105) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Prep for 39.0.0 [\#4171](https://github.com/apache/arrow-rs/pull/4171) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([iajoiner](https://github.com/iajoiner)) +- Support Compression in parquet-fromcsv [\#4160](https://github.com/apache/arrow-rs/pull/4160) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([suxiaogang223](https://github.com/suxiaogang223)) +- feat: support bitwise shift left/right with scalars [\#4159](https://github.com/apache/arrow-rs/pull/4159) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Cleanup reading page index \(\#4149\) \(\#4090\) [\#4151](https://github.com/apache/arrow-rs/pull/4151) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- feat: support `bitwise` shift left/right [\#4148](https://github.com/apache/arrow-rs/pull/4148) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Don't hardcode port in 
FlightSQL tests [\#4145](https://github.com/apache/arrow-rs/pull/4145) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Better flight SQL example codes [\#4144](https://github.com/apache/arrow-rs/pull/4144) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([sundy-li](https://github.com/sundy-li)) +- chore: clean the code by using `as_primitive` [\#4143](https://github.com/apache/arrow-rs/pull/4143) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- docs: fix the wrong ln command in CONTRIBUTING.md [\#4139](https://github.com/apache/arrow-rs/pull/4139) ([SteveLauC](https://github.com/SteveLauC)) +- Infer Float64 for JSON Numerics Beyond Bounds of i64 [\#4138](https://github.com/apache/arrow-rs/pull/4138) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([SteveLauC](https://github.com/SteveLauC)) +- Support fixed point multiplication for DictionaryArray of Decimals [\#4136](https://github.com/apache/arrow-rs/pull/4136) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Make arrow\_json::ReaderBuilder method names consistent [\#4128](https://github.com/apache/arrow-rs/pull/4128) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: add get\_{ref, mut} to arrow\_ipc Reader and Writer [\#4122](https://github.com/apache/arrow-rs/pull/4122) ([sticnarf](https://github.com/sticnarf)) +- feat: support `Interval` + `Timestamp` and `Interval` + `Date` [\#4117](https://github.com/apache/arrow-rs/pull/4117) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Support NullArray in JSON Reader [\#4114](https://github.com/apache/arrow-rs/pull/4114) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jiangzhx](https://github.com/jiangzhx)) +- Add Type Declarations for All Primitive Tensors and Buffer Builders [\#4113](https://github.com/apache/arrow-rs/pull/4113) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Update regex-syntax requirement from 0.6.27 to 0.7.1 [\#4107](https://github.com/apache/arrow-rs/pull/4107) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat: set FlightDescriptor on FlightDataEncoderBuilder [\#4101](https://github.com/apache/arrow-rs/pull/4101) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([Weijun-H](https://github.com/Weijun-H)) +- optimize cast for same decimal type and same scale [\#4088](https://github.com/apache/arrow-rs/pull/4088) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) + ## [38.0.0](https://github.com/apache/arrow-rs/tree/38.0.0) (2023-04-21) [Full Changelog](https://github.com/apache/arrow-rs/compare/37.0.0...38.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 023a65941947..9b6e88f30c15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,68 +19,47 @@ # Changelog -## [39.0.0](https://github.com/apache/arrow-rs/tree/39.0.0) (2023-05-05) +## [40.0.0](https://github.com/apache/arrow-rs/tree/40.0.0) (2023-05-19) -[Full 
Changelog](https://github.com/apache/arrow-rs/compare/38.0.0...39.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/39.0.0...40.0.0) **Breaking changes:** -- Allow creating unbuffered streamreader [\#4165](https://github.com/apache/arrow-rs/pull/4165) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ming08108](https://github.com/ming08108)) -- Cleanup ChunkReader \(\#4118\) [\#4156](https://github.com/apache/arrow-rs/pull/4156) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Remove Type from NativeIndex [\#4146](https://github.com/apache/arrow-rs/pull/4146) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Don't Duplicate Offset Index on RowGroupMetadata [\#4142](https://github.com/apache/arrow-rs/pull/4142) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Return BooleanBuffer from BooleanBufferBuilder [\#4140](https://github.com/apache/arrow-rs/pull/4140) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Cleanup CSV schema inference \(\#4129\) \(\#4130\) [\#4133](https://github.com/apache/arrow-rs/pull/4133) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Remove deprecated parquet ArrowReader [\#4125](https://github.com/apache/arrow-rs/pull/4125) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- refactor: construct `StructArray` w/ `FieldRef` [\#4116](https://github.com/apache/arrow-rs/pull/4116) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) -- Ignore Field Metadata in equals\_datatype for Dictionary, RunEndEncoded, Map and Union [\#4111](https://github.com/apache/arrow-rs/pull/4111) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Add StructArray Constructors \(\#3879\) [\#4064](https://github.com/apache/arrow-rs/pull/4064) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Prefetch page index \(\#4090\) [\#4216](https://github.com/apache/arrow-rs/pull/4216) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add RecordBatchWriter trait and implement it for CSV, JSON, IPC and P… [\#4206](https://github.com/apache/arrow-rs/pull/4206) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) +- Remove powf\_scalar kernel [\#4187](https://github.com/apache/arrow-rs/pull/4187) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Allow format specification in cast [\#4169](https://github.com/apache/arrow-rs/pull/4169) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([parthchandra](https://github.com/parthchandra)) **Implemented enhancements:** -- Release 39.0.0 of arrow/arrow-flight/parquet/parquet-derive \(next release after 38.0.0\) [\#4170](https://github.com/apache/arrow-rs/issues/4170) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Fixed point decimal multiplication for DictionaryArray [\#4135](https://github.com/apache/arrow-rs/issues/4135) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Remove Seek Requirement from CSV ReaderBuilder [\#4130](https://github.com/apache/arrow-rs/issues/4130) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Inconsistent CSV Inference and Parsing DateTime Handling [\#4129](https://github.com/apache/arrow-rs/issues/4129) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support accessing ipc Reader/Writer inner by reference [\#4121](https://github.com/apache/arrow-rs/issues/4121) -- Add Type Declarations for All Primitive Tensors and Buffer Builders [\#4112](https://github.com/apache/arrow-rs/issues/4112) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `Interval + Timestamp` and `Interval + Date` in addition to `Timestamp + Interval` and `Interval + Date` [\#4094](https://github.com/apache/arrow-rs/issues/4094) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Enable setting FlightDescriptor on FlightDataEncoderBuilder [\#3855](https://github.com/apache/arrow-rs/issues/3855) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- ObjectStore with\_url Should Handle Path [\#4199](https://github.com/apache/arrow-rs/issues/4199) +- Support `Interval` +/- `Interval` [\#4178](https://github.com/apache/arrow-rs/issues/4178) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[parquet\] add compression info to `print_column_chunk_metadata()` [\#4172](https://github.com/apache/arrow-rs/issues/4172) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Allow cast to take in a format specification [\#4168](https://github.com/apache/arrow-rs/issues/4168) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support extended pow arithmetic [\#4166](https://github.com/apache/arrow-rs/issues/4166) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Preload page index for async ParquetObjectReader [\#4090](https://github.com/apache/arrow-rs/issues/4090) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Fixed bugs:** -- Parquet Page Index Reader Assumes Consecutive Offsets [\#4149](https://github.com/apache/arrow-rs/issues/4149) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Equality of nested data types [\#4110](https://github.com/apache/arrow-rs/issues/4110) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -**Documentation updates:** - -- Improve Documentation of Parquet ChunkReader [\#4118](https://github.com/apache/arrow-rs/issues/4118) - -**Closed issues:** - -- add specific error log for empty JSON array [\#4105](https://github.com/apache/arrow-rs/issues/4105) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Subtracting `Timestamp` from `Timestamp` should produce a `Duration` \(not `Timestamp`\) [\#3964](https://github.com/apache/arrow-rs/issues/3964) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Prep for 39.0.0 [\#4171](https://github.com/apache/arrow-rs/pull/4171) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] 
([iajoiner](https://github.com/iajoiner)) -- Support Compression in parquet-fromcsv [\#4160](https://github.com/apache/arrow-rs/pull/4160) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([suxiaogang223](https://github.com/suxiaogang223)) -- feat: support bitwise shift left/right with scalars [\#4159](https://github.com/apache/arrow-rs/pull/4159) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Cleanup reading page index \(\#4149\) \(\#4090\) [\#4151](https://github.com/apache/arrow-rs/pull/4151) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- feat: support `bitwise` shift left/right [\#4148](https://github.com/apache/arrow-rs/pull/4148) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- Don't hardcode port in FlightSQL tests [\#4145](https://github.com/apache/arrow-rs/pull/4145) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Better flight SQL example codes [\#4144](https://github.com/apache/arrow-rs/pull/4144) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([sundy-li](https://github.com/sundy-li)) -- chore: clean the code by using `as_primitive` [\#4143](https://github.com/apache/arrow-rs/pull/4143) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- docs: fix the wrong ln command in CONTRIBUTING.md [\#4139](https://github.com/apache/arrow-rs/pull/4139) ([SteveLauC](https://github.com/SteveLauC)) -- Infer Float64 for JSON Numerics Beyond Bounds of i64 [\#4138](https://github.com/apache/arrow-rs/pull/4138) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([SteveLauC](https://github.com/SteveLauC)) -- Support fixed point multiplication for DictionaryArray of Decimals [\#4136](https://github.com/apache/arrow-rs/pull/4136) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Make arrow\_json::ReaderBuilder method names consistent [\#4128](https://github.com/apache/arrow-rs/pull/4128) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat: add get\_{ref, mut} to arrow\_ipc Reader and Writer [\#4122](https://github.com/apache/arrow-rs/pull/4122) ([sticnarf](https://github.com/sticnarf)) -- feat: support `Interval` + `Timestamp` and `Interval` + `Date` [\#4117](https://github.com/apache/arrow-rs/pull/4117) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- Support NullArray in JSON Reader [\#4114](https://github.com/apache/arrow-rs/pull/4114) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jiangzhx](https://github.com/jiangzhx)) -- Add Type Declarations for All Primitive Tensors and Buffer Builders [\#4113](https://github.com/apache/arrow-rs/pull/4113) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Update regex-syntax requirement from 0.6.27 to 0.7.1 [\#4107](https://github.com/apache/arrow-rs/pull/4107) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- feat: set FlightDescriptor on FlightDataEncoderBuilder 
[\#4101](https://github.com/apache/arrow-rs/pull/4101) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([Weijun-H](https://github.com/Weijun-H)) -- optimize cast for same decimal type and same scale [\#4088](https://github.com/apache/arrow-rs/pull/4088) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Arrow Arithmetic: Subtract timestamps [\#4244](https://github.com/apache/arrow-rs/pull/4244) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mr-brobot](https://github.com/mr-brobot)) +- Update proc-macro2 requirement from =1.0.57 to =1.0.58 [\#4236](https://github.com/apache/arrow-rs/pull/4236) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix Nightly Clippy Lints [\#4233](https://github.com/apache/arrow-rs/pull/4233) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: use all primitive types in test\_layouts [\#4229](https://github.com/apache/arrow-rs/pull/4229) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Add close method to RecordBatchWriter trait [\#4228](https://github.com/apache/arrow-rs/pull/4228) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) +- Update proc-macro2 requirement from =1.0.56 to =1.0.57 [\#4219](https://github.com/apache/arrow-rs/pull/4219) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Feat docs [\#4215](https://github.com/apache/arrow-rs/pull/4215) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Folyd](https://github.com/Folyd)) +- feat: Support bitwise and boolean aggregate functions [\#4210](https://github.com/apache/arrow-rs/pull/4210) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Document how to sort a RecordBatch [\#4204](https://github.com/apache/arrow-rs/pull/4204) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix incorrect cast Timestamp with Timezone [\#4201](https://github.com/apache/arrow-rs/pull/4201) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([aprimadi](https://github.com/aprimadi)) +- Add implementation of `RecordBatchReader` for CSV reader [\#4195](https://github.com/apache/arrow-rs/pull/4195) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) +- Add Sliced ListArray test \(\#3748\) [\#4186](https://github.com/apache/arrow-rs/pull/4186) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- refactor: simplify can\_cast\_types code. 
[\#4185](https://github.com/apache/arrow-rs/pull/4185) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Minor: support new types in struct\_builder.rs [\#4177](https://github.com/apache/arrow-rs/pull/4177) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- feat: add compression info to print\_column\_chunk\_metadata\(\) [\#4176](https://github.com/apache/arrow-rs/pull/4176) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([SteveLauC](https://github.com/SteveLauC)) diff --git a/Cargo.toml b/Cargo.toml index 1b3a76db9ae5..bf311bd05edc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ exclude = [ ] [workspace.package] -version = "39.0.0" +version = "40.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -76,18 +76,18 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "39.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "39.0.0", path = "./arrow-arith" } -arrow-array = { version = "39.0.0", path = "./arrow-array" } -arrow-buffer = { version = "39.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "39.0.0", path = "./arrow-cast" } -arrow-csv = { version = "39.0.0", path = "./arrow-csv" } -arrow-data = { version = "39.0.0", path = "./arrow-data" } -arrow-ipc = { version = "39.0.0", path = "./arrow-ipc" } -arrow-json = { version = "39.0.0", path = "./arrow-json" } -arrow-ord = { version = "39.0.0", path = "./arrow-ord" } -arrow-row = { version = "39.0.0", path = "./arrow-row" } -arrow-schema = { version = "39.0.0", path = "./arrow-schema" } -arrow-select = { version = "39.0.0", path = "./arrow-select" } -arrow-string = { version = "39.0.0", path = "./arrow-string" } -parquet = { version = "39.0.0", path = "./parquet", default-features = false } +arrow = { version = "40.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "40.0.0", path = "./arrow-arith" } +arrow-array = { version = "40.0.0", path = "./arrow-array" } +arrow-buffer = { version = "40.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "40.0.0", path = "./arrow-cast" } +arrow-csv = { version = "40.0.0", path = "./arrow-csv" } +arrow-data = { version = "40.0.0", path = "./arrow-data" } +arrow-ipc = { version = "40.0.0", path = "./arrow-ipc" } +arrow-json = { version = "40.0.0", path = "./arrow-json" } +arrow-ord = { version = "40.0.0", path = "./arrow-ord" } +arrow-row = { version = "40.0.0", path = "./arrow-row" } +arrow-schema = { version = "40.0.0", path = "./arrow-schema" } +arrow-select = { version = "40.0.0", path = "./arrow-select" } +arrow-string = { version = "40.0.0", path = "./arrow-string" } +parquet = { version = "40.0.0", path = "./parquet", default-features = false } diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 0b8ad7052838..299fa45d3584 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="38.0.0" -FUTURE_RELEASE="39.0.0" +SINCE_TAG="39.0.0" +FUTURE_RELEASE="40.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From e2b9b1afebd45ab0dcdef260d662733bb3ac7d82 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 22 May 2023 11:53:12 +0100 Subject: [PATCH 0919/1411] Support Absolute 
Timestamps in CSV Schema Inference (#4131) (#4217) --- arrow-csv/src/reader/mod.rs | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 0ab1664f5d00..328c2cd41f3b 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -150,10 +150,10 @@ lazy_static! { r"^-?(\d+)$", //INTEGER r"^-?((\d*\.\d+|\d+\.\d*)([eE]-?\d+)?|\d+([eE]-?\d+))$", //DECIMAL r"^\d{4}-\d\d-\d\d$", //DATE32 - r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d$", //Timestamp(Second) - r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d.\d{1,3}$", //Timestamp(Millisecond) - r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d.\d{1,6}$", //Timestamp(Microsecond) - r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d.\d{1,9}$", //Timestamp(Nanosecond) + r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d(?:[^\d\.].*)?$", //Timestamp(Second) + r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,3}(?:[^\d].*)?$", //Timestamp(Millisecond) + r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,6}(?:[^\d].*)?$", //Timestamp(Microsecond) + r"^\d{4}-\d\d-\d\d[T ]\d\d:\d\d:\d\d\.\d{1,9}(?:[^\d].*)?$", //Timestamp(Nanosecond) ]).unwrap(); } @@ -2165,6 +2165,19 @@ mod tests { ], DataType::Timestamp(TimeUnit::Microsecond, None), ), + ( + &["2020-03-19 02:00:00+02:00", "2020-03-19 02:00:00Z"], + DataType::Timestamp(TimeUnit::Second, None), + ), + ( + &[ + "2020-03-19", + "2020-03-19 02:00:00+02:00", + "2020-03-19 02:00:00Z", + "2020-03-19 02:00:00.12Z", + ], + DataType::Timestamp(TimeUnit::Millisecond, None), + ), ( &[ "2020-03-19", From a9b9c609d9db5c6dd2f20e92856d048bc20b7f14 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 May 2023 16:33:07 +0100 Subject: [PATCH 0920/1411] Update object_store requirement from 0.5 to 0.6 (#4258) Updates the requirements on [object_store](https://github.com/apache/arrow-rs) to permit the latest version. - [Changelog](https://github.com/apache/arrow-rs/blob/master/CHANGELOG-old.md) - [Commits](https://github.com/apache/arrow-rs/compare/object_store_0.5.0...object_store_0.6.0) --- updated-dependencies: - dependency-name: object_store dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- parquet/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 84142824e372..f04e1df1d7a7 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -44,7 +44,7 @@ arrow-schema = { workspace = true, optional = true } arrow-select = { workspace = true, optional = true } arrow-ipc = { workspace = true, optional = true } # Intentionally not a path dependency as object_store is released separately -object_store = { version = "0.5", default-features = false, optional = true } +object_store = { version = "0.6", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } thrift = { version = "0.17", default-features = false } From df691d5be14ea334e1d541697457291ba0796c52 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 23 May 2023 11:38:36 +0100 Subject: [PATCH 0921/1411] Extract IPC ArrayReader struct (#4259) * Extract IPC ArrayReader struct * Review feedback --- arrow-ipc/src/reader.rs | 512 ++++++++++++++-------------------------- 1 file changed, 175 insertions(+), 337 deletions(-) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 162e92914901..cabf81fc245e 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -21,6 +21,7 @@ //! however the `FileReader` expects a reader that supports `Seek`ing use arrow_buffer::i256; +use flatbuffers::VectorIter; use std::collections::HashMap; use std::fmt; use std::io::{BufReader, Read, Seek, SeekFrom}; @@ -33,7 +34,7 @@ use arrow_data::ArrayData; use arrow_schema::*; use crate::compression::CompressionCodec; -use crate::CONTINUATION_MARKER; +use crate::{FieldNode, MetadataVersion, CONTINUATION_MARKER}; use DataType::*; /// Read a buffer based on offset and length @@ -48,7 +49,7 @@ use DataType::*; fn read_buffer( buf: &crate::Buffer, a_data: &Buffer, - compression_codec: &Option, + compression_codec: Option, ) -> Result { let start_offset = buf.offset() as usize; let buf_data = a_data.slice_with_length(start_offset, buf.length() as usize); @@ -68,122 +69,46 @@ fn read_buffer( /// - check if the bit width of non-64-bit numbers is 64, and /// - read the buffer as 64-bit (signed integer or float), and /// - cast the 64-bit array to the appropriate data type -#[allow(clippy::too_many_arguments)] -fn create_array( - nodes: flatbuffers::Vector<'_, crate::FieldNode>, - field: &Field, - data: &Buffer, - buffers: flatbuffers::Vector<'_, crate::Buffer>, - dictionaries_by_id: &HashMap, - mut node_index: usize, - mut buffer_index: usize, - compression_codec: &Option, - metadata: &crate::MetadataVersion, -) -> Result<(ArrayRef, usize, usize), ArrowError> { +fn create_array(reader: &mut ArrayReader, field: &Field) -> Result { let data_type = field.data_type(); - let array = match data_type { - Utf8 | Binary | LargeBinary | LargeUtf8 => { - let array = create_primitive_array( - nodes.get(node_index), - data_type, - &[ - read_buffer(buffers.get(buffer_index), data, compression_codec)?, - read_buffer(buffers.get(buffer_index + 1), data, compression_codec)?, - read_buffer(buffers.get(buffer_index + 2), data, compression_codec)?, - ], - )?; - node_index += 1; - buffer_index += 3; - array - } - FixedSizeBinary(_) => { - let array = create_primitive_array( - nodes.get(node_index), - data_type, - &[ - read_buffer(buffers.get(buffer_index), data, 
compression_codec)?, - read_buffer(buffers.get(buffer_index + 1), data, compression_codec)?, - ], - )?; - node_index += 1; - buffer_index += 2; - array - } + match data_type { + Utf8 | Binary | LargeBinary | LargeUtf8 => create_primitive_array( + reader.next_node(field)?, + data_type, + &[ + reader.next_buffer()?, + reader.next_buffer()?, + reader.next_buffer()?, + ], + ), + FixedSizeBinary(_) => create_primitive_array( + reader.next_node(field)?, + data_type, + &[reader.next_buffer()?, reader.next_buffer()?], + ), List(ref list_field) | LargeList(ref list_field) | Map(ref list_field, _) => { - let list_node = nodes.get(node_index); - let list_buffers = [ - read_buffer(buffers.get(buffer_index), data, compression_codec)?, - read_buffer(buffers.get(buffer_index + 1), data, compression_codec)?, - ]; - node_index += 1; - buffer_index += 2; - let triple = create_array( - nodes, - list_field, - data, - buffers, - dictionaries_by_id, - node_index, - buffer_index, - compression_codec, - metadata, - )?; - node_index = triple.1; - buffer_index = triple.2; - - create_list_array(list_node, data_type, &list_buffers, triple.0)? + let list_node = reader.next_node(field)?; + let list_buffers = [reader.next_buffer()?, reader.next_buffer()?]; + let values = create_array(reader, list_field)?; + create_list_array(list_node, data_type, &list_buffers, values) } FixedSizeList(ref list_field, _) => { - let list_node = nodes.get(node_index); - let list_buffers = [read_buffer( - buffers.get(buffer_index), - data, - compression_codec, - )?]; - node_index += 1; - buffer_index += 1; - let triple = create_array( - nodes, - list_field, - data, - buffers, - dictionaries_by_id, - node_index, - buffer_index, - compression_codec, - metadata, - )?; - node_index = triple.1; - buffer_index = triple.2; - - create_list_array(list_node, data_type, &list_buffers, triple.0)? 
+ let list_node = reader.next_node(field)?; + let list_buffers = [reader.next_buffer()?]; + let values = create_array(reader, list_field)?; + create_list_array(list_node, data_type, &list_buffers, values) } Struct(struct_fields) => { - let struct_node = nodes.get(node_index); - let null_buffer = - read_buffer(buffers.get(buffer_index), data, compression_codec)?; - node_index += 1; - buffer_index += 1; + let struct_node = reader.next_node(field)?; + let null_buffer = reader.next_buffer()?; // read the arrays for each field let mut struct_arrays = vec![]; // TODO investigate whether just knowing the number of buffers could // still work for struct_field in struct_fields { - let triple = create_array( - nodes, - struct_field, - data, - buffers, - dictionaries_by_id, - node_index, - buffer_index, - compression_codec, - metadata, - )?; - node_index = triple.1; - buffer_index = triple.2; - struct_arrays.push((struct_field.clone(), triple.0)); + let child = create_array(reader, struct_field)?; + struct_arrays.push((struct_field.clone(), child)); } let null_count = struct_node.null_count() as usize; let struct_array = if null_count > 0 { @@ -192,101 +117,61 @@ fn create_array( } else { StructArray::from(struct_arrays) }; - Arc::new(struct_array) + Ok(Arc::new(struct_array)) } RunEndEncoded(run_ends_field, values_field) => { - let run_node = nodes.get(node_index); - node_index += 1; - - let run_ends_triple = create_array( - nodes, - run_ends_field, - data, - buffers, - dictionaries_by_id, - node_index, - buffer_index, - compression_codec, - metadata, - )?; - node_index = run_ends_triple.1; - buffer_index = run_ends_triple.2; - - let values_triple = create_array( - nodes, - values_field, - data, - buffers, - dictionaries_by_id, - node_index, - buffer_index, - compression_codec, - metadata, - )?; - node_index = values_triple.1; - buffer_index = values_triple.2; + let run_node = reader.next_node(field)?; + let run_ends = create_array(reader, run_ends_field)?; + let values = create_array(reader, values_field)?; let run_array_length = run_node.length() as usize; let data = ArrayData::builder(data_type.clone()) .len(run_array_length) .offset(0) - .add_child_data(run_ends_triple.0.into_data()) - .add_child_data(values_triple.0.into_data()) + .add_child_data(run_ends.into_data()) + .add_child_data(values.into_data()) .build()?; - make_array(data) + Ok(make_array(data)) } // Create dictionary array from RecordBatch Dictionary(_, _) => { - let index_node = nodes.get(node_index); - let index_buffers = [ - read_buffer(buffers.get(buffer_index), data, compression_codec)?, - read_buffer(buffers.get(buffer_index + 1), data, compression_codec)?, - ]; + let index_node = reader.next_node(field)?; + let index_buffers = [reader.next_buffer()?, reader.next_buffer()?]; let dict_id = field.dict_id().ok_or_else(|| { ArrowError::IoError(format!("Field {field} does not have dict id")) })?; - let value_array = dictionaries_by_id.get(&dict_id).ok_or_else(|| { - ArrowError::IoError(format!( - "Cannot find a dictionary batch with dict id: {dict_id}" - )) - })?; - node_index += 1; - buffer_index += 2; + let value_array = + reader.dictionaries_by_id.get(&dict_id).ok_or_else(|| { + ArrowError::IoError(format!( + "Cannot find a dictionary batch with dict id: {dict_id}" + )) + })?; create_dictionary_array( index_node, data_type, &index_buffers, value_array.clone(), - )? 
+ ) } Union(fields, mode) => { - let union_node = nodes.get(node_index); - node_index += 1; - + let union_node = reader.next_node(field)?; let len = union_node.length() as usize; // In V4, union types has validity bitmap // In V5 and later, union types have no validity bitmap - if metadata < &crate::MetadataVersion::V5 { - read_buffer(buffers.get(buffer_index), data, compression_codec)?; - buffer_index += 1; + if reader.version < MetadataVersion::V5 { + reader.next_buffer()?; } - let type_ids: Buffer = - read_buffer(buffers.get(buffer_index), data, compression_codec)?[..len] - .into(); - - buffer_index += 1; + let type_ids: Buffer = reader.next_buffer()?[..len].into(); let value_offsets = match mode { UnionMode::Dense => { - let buffer = - read_buffer(buffers.get(buffer_index), data, compression_codec)?; - buffer_index += 1; + let buffer = reader.next_buffer()?; Some(buffer[..len * 4].into()) } UnionMode::Sparse => None, @@ -296,30 +181,16 @@ fn create_array( let mut ids = Vec::with_capacity(fields.len()); for (id, field) in fields.iter() { - let triple = create_array( - nodes, - field, - data, - buffers, - dictionaries_by_id, - node_index, - buffer_index, - compression_codec, - metadata, - )?; - - node_index = triple.1; - buffer_index = triple.2; - - children.push((field.as_ref().clone(), triple.0)); + let child = create_array(reader, field)?; + children.push((field.as_ref().clone(), child)); ids.push(id); } let array = UnionArray::try_new(&ids, type_ids, value_offsets, children)?; - Arc::new(array) + Ok(Arc::new(array)) } Null => { - let node = nodes.get(node_index); + let node = reader.next_node(field)?; let length = node.length(); let null_count = node.null_count(); @@ -334,125 +205,21 @@ fn create_array( .offset(0) .build() .unwrap(); - node_index += 1; // no buffer increases - make_array(data) - } - _ => { - if nodes.len() <= node_index { - return Err(ArrowError::IoError(format!( - "Invalid data for schema. {} refers to node index {} but only {} in schema", - field, node_index, nodes.len() - ))); - } - let array = create_primitive_array( - nodes.get(node_index), - data_type, - &[ - read_buffer(buffers.get(buffer_index), data, compression_codec)?, - read_buffer(buffers.get(buffer_index + 1), data, compression_codec)?, - ], - )?; - node_index += 1; - buffer_index += 2; - array - } - }; - Ok((array, node_index, buffer_index)) -} - -/// Skip fields based on data types to advance `node_index` and `buffer_index`. -/// This function should be called when doing projection in fn `read_record_batch`. -/// The advancement logic references fn `create_array`. 
-fn skip_field( - data_type: &DataType, - mut node_index: usize, - mut buffer_index: usize, -) -> Result<(usize, usize), ArrowError> { - match data_type { - Utf8 | Binary | LargeBinary | LargeUtf8 => { - node_index += 1; - buffer_index += 3; - } - FixedSizeBinary(_) => { - node_index += 1; - buffer_index += 2; - } - List(ref list_field) | LargeList(ref list_field) | Map(ref list_field, _) => { - node_index += 1; - buffer_index += 2; - let tuple = skip_field(list_field.data_type(), node_index, buffer_index)?; - node_index = tuple.0; - buffer_index = tuple.1; + Ok(Arc::new(NullArray::from(data))) } - FixedSizeList(ref list_field, _) => { - node_index += 1; - buffer_index += 1; - let tuple = skip_field(list_field.data_type(), node_index, buffer_index)?; - node_index = tuple.0; - buffer_index = tuple.1; - } - Struct(struct_fields) => { - node_index += 1; - buffer_index += 1; - - // skip for each field - for struct_field in struct_fields { - let tuple = - skip_field(struct_field.data_type(), node_index, buffer_index)?; - node_index = tuple.0; - buffer_index = tuple.1; - } - } - RunEndEncoded(run_ends_field, values_field) => { - node_index += 1; - - let tuple = skip_field(run_ends_field.data_type(), node_index, buffer_index)?; - node_index = tuple.0; - buffer_index = tuple.1; - - let tuple = skip_field(values_field.data_type(), node_index, buffer_index)?; - node_index = tuple.0; - buffer_index = tuple.1; - } - Dictionary(_, _) => { - node_index += 1; - buffer_index += 2; - } - Union(fields, mode) => { - node_index += 1; - buffer_index += 1; - - match mode { - UnionMode::Dense => { - buffer_index += 1; - } - UnionMode::Sparse => {} - }; - - for (_, field) in fields.iter() { - let tuple = skip_field(field.data_type(), node_index, buffer_index)?; - - node_index = tuple.0; - buffer_index = tuple.1; - } - } - Null => { - node_index += 1; - // no buffer increases - } - _ => { - node_index += 1; - buffer_index += 2; - } - }; - Ok((node_index, buffer_index)) + _ => create_primitive_array( + reader.next_node(field)?, + data_type, + &[reader.next_buffer()?, reader.next_buffer()?], + ), + } } /// Reads the correct number of buffers based on data type and null_count, and creates a /// primitive array ref fn create_primitive_array( - field_node: &crate::FieldNode, + field_node: &FieldNode, data_type: &DataType, buffers: &[Buffer], ) -> Result { @@ -628,6 +395,100 @@ fn create_dictionary_array( } } +/// State for decoding arrays from an encoded [`RecordBatch`] +struct ArrayReader<'a> { + /// Decoded dictionaries indexed by dictionary id + dictionaries_by_id: &'a HashMap, + /// Optional compression codec + compression: Option, + /// The format version + version: MetadataVersion, + /// The raw data buffer + data: &'a Buffer, + /// The fields comprising this array + nodes: VectorIter<'a, FieldNode>, + /// The buffers comprising this array + buffers: VectorIter<'a, crate::Buffer>, +} + +impl<'a> ArrayReader<'a> { + fn next_buffer(&mut self) -> Result { + read_buffer(self.buffers.next().unwrap(), self.data, self.compression) + } + + fn skip_buffer(&mut self) { + self.buffers.next().unwrap(); + } + + fn next_node(&mut self, field: &Field) -> Result<&'a FieldNode, ArrowError> { + self.nodes.next().ok_or_else(|| { + ArrowError::IoError(format!( + "Invalid data for schema. 
{} refers to node not found in schema", + field + )) + }) + } + + fn skip_field(&mut self, field: &Field) -> Result<(), ArrowError> { + self.next_node(field)?; + + match field.data_type() { + Utf8 | Binary | LargeBinary | LargeUtf8 => { + for _ in 0..3 { + self.skip_buffer() + } + } + FixedSizeBinary(_) => { + self.skip_buffer(); + self.skip_buffer(); + } + List(list_field) | LargeList(list_field) | Map(list_field, _) => { + self.skip_buffer(); + self.skip_buffer(); + self.skip_field(list_field)?; + } + FixedSizeList(list_field, _) => { + self.skip_buffer(); + self.skip_field(list_field)?; + } + Struct(struct_fields) => { + self.skip_buffer(); + + // skip for each field + for struct_field in struct_fields { + self.skip_field(struct_field)? + } + } + RunEndEncoded(run_ends_field, values_field) => { + self.skip_field(run_ends_field)?; + self.skip_field(values_field)?; + } + Dictionary(_, _) => { + self.skip_buffer(); // Nulls + self.skip_buffer(); // Indices + } + Union(fields, mode) => { + self.skip_buffer(); // Nulls + + match mode { + UnionMode::Dense => self.skip_buffer(), + UnionMode::Sparse => {} + }; + + for (_, field) in fields.iter() { + self.skip_field(field)? + } + } + Null => {} // No buffer increases + _ => { + self.skip_buffer(); + self.skip_buffer(); + } + }; + Ok(()) + } +} + /// Creates a record batch from binary data using the `crate::RecordBatch` indexes and the `Schema` pub fn read_record_batch( buf: &Buffer, @@ -635,7 +496,7 @@ pub fn read_record_batch( schema: SchemaRef, dictionaries_by_id: &HashMap, projection: Option<&[usize]>, - metadata: &crate::MetadataVersion, + metadata: &MetadataVersion, ) -> Result { let buffers = batch.buffers().ok_or_else(|| { ArrowError::IoError("Unable to get buffers from IPC RecordBatch".to_string()) @@ -644,13 +505,18 @@ pub fn read_record_batch( ArrowError::IoError("Unable to get field nodes from IPC RecordBatch".to_string()) })?; let batch_compression = batch.compression(); - let compression_codec: Option = batch_compression + let compression = batch_compression .map(|batch_compression| batch_compression.codec().try_into()) .transpose()?; - // keep track of buffer and node index, the functions that create arrays mutate these - let mut buffer_index = 0; - let mut node_index = 0; + let mut reader = ArrayReader { + dictionaries_by_id, + compression, + version: *metadata, + data: buf, + nodes: field_nodes.iter(), + buffers: buffers.iter(), + }; let options = RecordBatchOptions::new().with_row_count(Some(batch.length() as usize)); @@ -660,26 +526,10 @@ pub fn read_record_batch( for (idx, field) in schema.fields().iter().enumerate() { // Create array for projected field if let Some(proj_idx) = projection.iter().position(|p| p == &idx) { - let triple = create_array( - field_nodes, - field, - buf, - buffers, - dictionaries_by_id, - node_index, - buffer_index, - &compression_codec, - metadata, - )?; - node_index = triple.1; - buffer_index = triple.2; - arrays.push((proj_idx, triple.0)); + let child = create_array(&mut reader, field)?; + arrays.push((proj_idx, child)); } else { - // Skip field. - // This must be called to advance `node_index` and `buffer_index`. 
- let tuple = skip_field(field.data_type(), node_index, buffer_index)?; - node_index = tuple.0; - buffer_index = tuple.1; + reader.skip_field(field)?; } } arrays.sort_by_key(|t| t.0); @@ -689,25 +539,13 @@ pub fn read_record_batch( &options, ) } else { - let mut arrays = vec![]; + let mut children = vec![]; // keep track of index as lists require more than one node for field in schema.fields() { - let triple = create_array( - field_nodes, - field, - buf, - buffers, - dictionaries_by_id, - node_index, - buffer_index, - &compression_codec, - metadata, - )?; - node_index = triple.1; - buffer_index = triple.2; - arrays.push(triple.0); + let child = create_array(&mut reader, field)?; + children.push(child); } - RecordBatch::try_new_with_options(schema, arrays, &options) + RecordBatch::try_new_with_options(schema, children, &options) } } From 5752997527f15357a0b7ed4cdfaa3307e10e05f3 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Tue, 23 May 2023 16:07:59 +0200 Subject: [PATCH 0922/1411] feat: update flight-sql to latest specs (#4250) * feat: update flight-sql to latest specs * fix: pr feedback --- arrow-flight/examples/flight_sql_server.rs | 100 +- arrow-flight/src/arrow.flight.protocol.rs | 21 +- arrow-flight/src/bin/flight_sql_client.rs | 5 +- arrow-flight/src/lib.rs | 2 + .../src/sql/arrow.flight.protocol.sql.rs | 526 ++- arrow-flight/src/sql/client.rs | 28 +- arrow-flight/src/sql/mod.rs | 23 + arrow-flight/src/sql/server.rs | 239 +- arrow-flight/tests/client.rs | 1 + arrow-flight/tests/flight_sql_client_cli.rs | 90 +- .../integration_test.rs | 1 + format/Flight.proto | 706 ++-- format/FlightSql.proto | 3366 +++++++++-------- 13 files changed, 3150 insertions(+), 1958 deletions(-) diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 23d71090ae47..01632285cf66 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -18,7 +18,11 @@ use arrow_array::builder::StringBuilder; use arrow_array::{ArrayRef, RecordBatch}; use arrow_flight::sql::{ - ActionCreatePreparedStatementResult, Any, ProstMessageExt, SqlInfo, + ActionBeginSavepointRequest, ActionBeginSavepointResult, + ActionBeginTransactionResult, ActionCancelQueryRequest, ActionCancelQueryResult, + ActionCreatePreparedStatementResult, ActionEndSavepointRequest, + ActionEndTransactionRequest, Any, CommandStatementSubstraitPlan, ProstMessageExt, + SqlInfo, }; use arrow_flight::{ Action, FlightData, FlightEndpoint, HandshakeRequest, HandshakeResponse, IpcMessage, @@ -40,8 +44,9 @@ use arrow_flight::{ flight_service_server::FlightService, flight_service_server::FlightServiceServer, sql::{ - server::FlightSqlService, ActionClosePreparedStatementRequest, - ActionCreatePreparedStatementRequest, CommandGetCatalogs, + server::FlightSqlService, ActionBeginTransactionRequest, + ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, + ActionCreatePreparedSubstraitPlanRequest, CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, @@ -177,6 +182,16 @@ impl FlightSqlService for FlightSqlServiceImpl { )) } + async fn get_flight_info_substrait_plan( + &self, + _query: CommandStatementSubstraitPlan, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_substrait_plan not implemented", + )) + } 
+ async fn get_flight_info_prepared_statement( &self, cmd: CommandPreparedStatementQuery, @@ -220,6 +235,7 @@ impl FlightSqlService for FlightSqlServiceImpl { endpoint: endpoints, total_records: num_rows as i64, total_bytes: num_bytes as i64, + ordered: false, }; let resp = Response::new(info); Ok(resp) @@ -441,6 +457,16 @@ impl FlightSqlService for FlightSqlServiceImpl { Ok(FAKE_UPDATE_RESULT) } + async fn do_put_substrait_plan( + &self, + _ticket: CommandStatementSubstraitPlan, + _request: Request>, + ) -> Result { + Err(Status::unimplemented( + "do_put_substrait_plan not implemented", + )) + } + async fn do_put_prepared_statement_query( &self, _query: CommandPreparedStatementQuery, @@ -486,8 +512,62 @@ impl FlightSqlService for FlightSqlServiceImpl { &self, _query: ActionClosePreparedStatementRequest, _request: Request, - ) { - unimplemented!("Implement do_action_close_prepared_statement") + ) -> Result<(), Status> { + Err(Status::unimplemented( + "Implement do_action_close_prepared_statement", + )) + } + + async fn do_action_create_prepared_substrait_plan( + &self, + _query: ActionCreatePreparedSubstraitPlanRequest, + _request: Request, + ) -> Result { + Err(Status::unimplemented( + "Implement do_action_create_prepared_substrait_plan", + )) + } + + async fn do_action_begin_transaction( + &self, + _query: ActionBeginTransactionRequest, + _request: Request, + ) -> Result { + Err(Status::unimplemented( + "Implement do_action_begin_transaction", + )) + } + + async fn do_action_end_transaction( + &self, + _query: ActionEndTransactionRequest, + _request: Request, + ) -> Result<(), Status> { + Err(Status::unimplemented("Implement do_action_end_transaction")) + } + + async fn do_action_begin_savepoint( + &self, + _query: ActionBeginSavepointRequest, + _request: Request, + ) -> Result { + Err(Status::unimplemented("Implement do_action_begin_savepoint")) + } + + async fn do_action_end_savepoint( + &self, + _query: ActionEndSavepointRequest, + _request: Request, + ) -> Result<(), Status> { + Err(Status::unimplemented("Implement do_action_end_savepoint")) + } + + async fn do_action_cancel_query( + &self, + _query: ActionCancelQueryRequest, + _request: Request, + ) -> Result { + Err(Status::unimplemented("Implement do_action_cancel_query")) } async fn register_sql_info(&self, _id: i32, _result: &SqlInfo) {} @@ -718,7 +798,7 @@ mod tests { test_all_clients(|mut client| async move { auth_client(&mut client).await; - let mut stmt = client.prepare("select 1;".to_string()).await.unwrap(); + let mut stmt = client.prepare("select 1;".to_string(), None).await.unwrap(); let flight_info = stmt.execute().await.unwrap(); @@ -746,7 +826,7 @@ mod tests { test_all_clients(|mut client| async move { auth_client(&mut client).await; let res = client - .execute_update("creat table test(a int);".to_string()) + .execute_update("creat table test(a int);".to_string(), None) .await .unwrap(); assert_eq!(res, FAKE_UPDATE_RESULT); @@ -759,7 +839,7 @@ mod tests { test_all_clients(|mut client| async move { // no handshake assert!(client - .prepare("select 1;".to_string()) + .prepare("select 1;".to_string(), None) .await .unwrap_err() .to_string() @@ -776,7 +856,7 @@ mod tests { // forget to set_token client.handshake("admin", "password").await.unwrap(); assert!(client - .prepare("select 1;".to_string()) + .prepare("select 1;".to_string(), None) .await .unwrap_err() .to_string() @@ -786,7 +866,7 @@ mod tests { client.handshake("admin", "password").await.unwrap(); client.set_token("wrong token".to_string()); assert!(client - 
.prepare("select 1;".to_string()) + .prepare("select 1;".to_string(), None) .await .unwrap_err() .to_string() diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs index 200c858cf5f1..10dc7ace0356 100644 --- a/arrow-flight/src/arrow.flight.protocol.rs +++ b/arrow-flight/src/arrow.flight.protocol.rs @@ -183,8 +183,21 @@ pub struct FlightInfo { /// In other words, an application can use multiple endpoints to /// represent partitioned data. /// - /// There is no ordering defined on endpoints. Hence, if the returned - /// data has an ordering, it should be returned in a single endpoint. + /// If the returned data has an ordering, an application can use + /// "FlightInfo.ordered = true" or should return the all data in a + /// single endpoint. Otherwise, there is no ordering defined on + /// endpoints or the data within. + /// + /// A client can read ordered data by reading data from returned + /// endpoints, in order, from front to back. + /// + /// Note that a client may ignore "FlightInfo.ordered = true". If an + /// ordering is important for an application, an application must + /// choose one of them: + /// + /// * An application requires that all clients must read data in + /// returned endpoints order. + /// * An application must return the all data in a single endpoint. #[prost(message, repeated, tag = "3")] pub endpoint: ::prost::alloc::vec::Vec, /// Set these to -1 if unknown. @@ -192,6 +205,10 @@ pub struct FlightInfo { pub total_records: i64, #[prost(int64, tag = "5")] pub total_bytes: i64, + /// + /// FlightEndpoints are in the same order as the data. + #[prost(bool, tag = "6")] + pub ordered: bool, } /// /// A particular stream or split associated with a flight. diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs index 1891a331be96..a787989bf6b4 100644 --- a/arrow-flight/src/bin/flight_sql_client.rs +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -108,7 +108,10 @@ async fn main() { setup_logging(); let mut client = setup_client(args.client_args).await.expect("setup client"); - let info = client.execute(args.query).await.expect("prepare statement"); + let info = client + .execute(args.query, None) + .await + .expect("prepare statement"); info!("got flight info"); let schema = Arc::new(Schema::try_from(info.clone()).expect("valid schema")); diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index a80358ff00c5..4960912ef8af 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -424,6 +424,7 @@ impl FlightInfo { endpoint: Vec, total_records: i64, total_bytes: i64, + ordered: bool, ) -> Self { let IpcMessage(vals) = message; FlightInfo { @@ -432,6 +433,7 @@ impl FlightInfo { endpoint, total_records, total_bytes, + ordered, } } diff --git a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs index 080156cce88e..b2137d8543d3 100644 --- a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs +++ b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs @@ -51,12 +51,12 @@ pub struct CommandGetSqlInfo { /// The returned schema will be: /// < /// type_name: utf8 not null (The name of the data type, for example: VARCHAR, INTEGER, etc), -/// data_type: int not null (The SQL data type), -/// column_size: int (The maximum size supported by that column. -/// In case of exact numeric types, this represents the maximum precision. -/// In case of string types, this represents the character length. 
-/// In case of datetime data types, this represents the length in characters of the string representation. -/// NULL is returned for data types where column size is not applicable.), +/// data_type: int32 not null (The SQL data type), +/// column_size: int32 (The maximum size supported by that column. +/// In case of exact numeric types, this represents the maximum precision. +/// In case of string types, this represents the character length. +/// In case of datetime data types, this represents the length in characters of the string representation. +/// NULL is returned for data types where column size is not applicable.), /// literal_prefix: utf8 (Character or characters used to prefix a literal, NULL is returned for /// data types where a literal prefix is not applicable.), /// literal_suffix: utf8 (Character or characters used to terminate a literal, @@ -65,11 +65,11 @@ pub struct CommandGetSqlInfo { /// (A list of keywords corresponding to which parameters can be used when creating /// a column for that specific type. /// NULL is returned if there are no parameters for the data type definition.), -/// nullable: int not null (Shows if the data type accepts a NULL value. The possible values can be seen in the -/// Nullable enum.), +/// nullable: int32 not null (Shows if the data type accepts a NULL value. The possible values can be seen in the +/// Nullable enum.), /// case_sensitive: bool not null (Shows if a character data type is case-sensitive in collations and comparisons), -/// searchable: int not null (Shows how the data type is used in a WHERE clause. The possible values can be seen in the -/// Searchable enum.), +/// searchable: int32 not null (Shows how the data type is used in a WHERE clause. The possible values can be seen in the +/// Searchable enum.), /// unsigned_attribute: bool (Shows if the data type is unsigned. NULL is returned if the attribute is /// not applicable to the data type or the data type is not numeric.), /// fixed_prec_scale: bool not null (Shows if the data type has predefined fixed precision and scale.), @@ -77,26 +77,26 @@ pub struct CommandGetSqlInfo { /// is not applicable to the data type or the data type is not numeric.), /// local_type_name: utf8 (Localized version of the data source-dependent name of the data type. NULL /// is returned if a localized name is not supported by the data source), -/// minimum_scale: int (The minimum scale of the data type on the data source. -/// If a data type has a fixed scale, the MINIMUM_SCALE and MAXIMUM_SCALE -/// columns both contain this value. NULL is returned if scale is not applicable.), -/// maximum_scale: int (The maximum scale of the data type on the data source. -/// NULL is returned if scale is not applicable.), -/// sql_data_type: int not null (The value of the SQL DATA TYPE which has the same values -/// as data_type value. Except for interval and datetime, which -/// uses generic values. More info about those types can be -/// obtained through datetime_subcode. The possible values can be seen -/// in the XdbcDataType enum.), -/// datetime_subcode: int (Only used when the SQL DATA TYPE is interval or datetime. It contains -/// its sub types. For type different from interval and datetime, this value -/// is NULL. The possible values can be seen in the XdbcDatetimeSubcode enum.), -/// num_prec_radix: int (If the data type is an approximate numeric type, this column contains -/// the value 2 to indicate that COLUMN_SIZE specifies a number of bits. 
For -/// exact numeric types, this column contains the value 10 to indicate that -/// column size specifies a number of decimal digits. Otherwise, this column is NULL.), -/// interval_precision: int (If the data type is an interval data type, then this column contains the value -/// of the interval leading precision. Otherwise, this column is NULL. This fields -/// is only relevant to be used by ODBC). +/// minimum_scale: int32 (The minimum scale of the data type on the data source. +/// If a data type has a fixed scale, the MINIMUM_SCALE and MAXIMUM_SCALE +/// columns both contain this value. NULL is returned if scale is not applicable.), +/// maximum_scale: int32 (The maximum scale of the data type on the data source. +/// NULL is returned if scale is not applicable.), +/// sql_data_type: int32 not null (The value of the SQL DATA TYPE which has the same values +/// as data_type value. Except for interval and datetime, which +/// uses generic values. More info about those types can be +/// obtained through datetime_subcode. The possible values can be seen +/// in the XdbcDataType enum.), +/// datetime_subcode: int32 (Only used when the SQL DATA TYPE is interval or datetime. It contains +/// its sub types. For type different from interval and datetime, this value +/// is NULL. The possible values can be seen in the XdbcDatetimeSubcode enum.), +/// num_prec_radix: int32 (If the data type is an approximate numeric type, this column contains +/// the value 2 to indicate that COLUMN_SIZE specifies a number of bits. For +/// exact numeric types, this column contains the value 10 to indicate that +/// column size specifies a number of decimal digits. Otherwise, this column is NULL.), +/// interval_precision: int32 (If the data type is an interval data type, then this column contains the value +/// of the interval leading precision. Otherwise, this column is NULL. This fields +/// is only relevant to be used by ODBC). /// > /// The returned data should be ordered by data_type and then by type_name. #[allow(clippy::derive_partial_eq_without_eq)] @@ -246,7 +246,7 @@ pub struct CommandGetTableTypes {} /// table_name: utf8 not null, /// column_name: utf8 not null, /// key_name: utf8, -/// key_sequence: int not null +/// key_sequence: int32 not null /// > /// The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. #[allow(clippy::derive_partial_eq_without_eq)] @@ -285,11 +285,11 @@ pub struct CommandGetPrimaryKeys { /// fk_db_schema_name: utf8, /// fk_table_name: utf8 not null, /// fk_column_name: utf8 not null, -/// key_sequence: int not null, +/// key_sequence: int32 not null, /// fk_key_name: utf8, /// pk_key_name: utf8, -/// update_rule: uint1 not null, -/// delete_rule: uint1 not null +/// update_rule: uint8 not null, +/// delete_rule: uint8 not null /// > /// The returned data should be ordered by fk_catalog_name, fk_db_schema_name, fk_table_name, fk_key_name, then key_sequence. /// update_rule and delete_rule returns a byte that is equivalent to actions declared on UpdateDeleteRules enum. 
@@ -328,11 +328,11 @@ pub struct CommandGetExportedKeys { /// fk_db_schema_name: utf8, /// fk_table_name: utf8 not null, /// fk_column_name: utf8 not null, -/// key_sequence: int not null, +/// key_sequence: int32 not null, /// fk_key_name: utf8, /// pk_key_name: utf8, -/// update_rule: uint1 not null, -/// delete_rule: uint1 not null +/// update_rule: uint8 not null, +/// delete_rule: uint8 not null /// > /// The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. /// update_rule and delete_rule returns a byte that is equivalent to actions: @@ -378,11 +378,11 @@ pub struct CommandGetImportedKeys { /// fk_db_schema_name: utf8, /// fk_table_name: utf8 not null, /// fk_column_name: utf8 not null, -/// key_sequence: int not null, +/// key_sequence: int32 not null, /// fk_key_name: utf8, /// pk_key_name: utf8, -/// update_rule: uint1 not null, -/// delete_rule: uint1 not null +/// update_rule: uint8 not null, +/// delete_rule: uint8 not null /// > /// The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. /// update_rule and delete_rule returns a byte that is equivalent to actions: @@ -435,13 +435,49 @@ pub struct ActionCreatePreparedStatementRequest { /// The valid SQL string to create a prepared statement for. #[prost(string, tag = "1")] pub query: ::prost::alloc::string::String, + /// Create/execute the prepared statement as part of this transaction (if + /// unset, executions of the prepared statement will be auto-committed). + #[prost(bytes = "bytes", optional, tag = "2")] + pub transaction_id: ::core::option::Option<::prost::bytes::Bytes>, } /// -/// Wrap the result of a "GetPreparedStatement" action. +/// An embedded message describing a Substrait plan to execute. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct SubstraitPlan { + /// The serialized substrait.Plan to create a prepared statement for. + /// XXX(ARROW-16902): this is bytes instead of an embedded message + /// because Protobuf does not really support one DLL using Protobuf + /// definitions from another DLL. + #[prost(bytes = "bytes", tag = "1")] + pub plan: ::prost::bytes::Bytes, + /// The Substrait release, e.g. "0.12.0". This information is not + /// tracked in the plan itself, so this is the only way for consumers + /// to potentially know if they can handle the plan. + #[prost(string, tag = "2")] + pub version: ::prost::alloc::string::String, +} +/// +/// Request message for the "CreatePreparedSubstraitPlan" action on a Flight SQL enabled backend. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ActionCreatePreparedSubstraitPlanRequest { + /// The serialized substrait.Plan to create a prepared statement for. + #[prost(message, optional, tag = "1")] + pub plan: ::core::option::Option, + /// Create/execute the prepared statement as part of this transaction (if + /// unset, executions of the prepared statement will be auto-committed). + #[prost(bytes = "bytes", optional, tag = "2")] + pub transaction_id: ::core::option::Option<::prost::bytes::Bytes>, +} +/// +/// Wrap the result of a "CreatePreparedStatement" or "CreatePreparedSubstraitPlan" action. /// /// The resultant PreparedStatement can be closed either: /// - Manually, through the "ClosePreparedStatement" action; /// - Automatically, by a server timeout. 
+/// +/// The result should be wrapped in a google.protobuf.Any message. #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionCreatePreparedStatementResult { @@ -468,6 +504,182 @@ pub struct ActionClosePreparedStatementRequest { pub prepared_statement_handle: ::prost::bytes::Bytes, } /// +/// Request message for the "BeginTransaction" action. +/// Begins a transaction. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ActionBeginTransactionRequest {} +/// +/// Request message for the "BeginSavepoint" action. +/// Creates a savepoint within a transaction. +/// +/// Only supported if FLIGHT_SQL_TRANSACTION is +/// FLIGHT_SQL_TRANSACTION_SUPPORT_SAVEPOINT. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ActionBeginSavepointRequest { + /// The transaction to which a savepoint belongs. + #[prost(bytes = "bytes", tag = "1")] + pub transaction_id: ::prost::bytes::Bytes, + /// Name for the savepoint. + #[prost(string, tag = "2")] + pub name: ::prost::alloc::string::String, +} +/// +/// The result of a "BeginTransaction" action. +/// +/// The transaction can be manipulated with the "EndTransaction" action, or +/// automatically via server timeout. If the transaction times out, then it is +/// automatically rolled back. +/// +/// The result should be wrapped in a google.protobuf.Any message. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ActionBeginTransactionResult { + /// Opaque handle for the transaction on the server. + #[prost(bytes = "bytes", tag = "1")] + pub transaction_id: ::prost::bytes::Bytes, +} +/// +/// The result of a "BeginSavepoint" action. +/// +/// The transaction can be manipulated with the "EndSavepoint" action. +/// If the associated transaction is committed, rolled back, or times +/// out, then the savepoint is also invalidated. +/// +/// The result should be wrapped in a google.protobuf.Any message. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ActionBeginSavepointResult { + /// Opaque handle for the savepoint on the server. + #[prost(bytes = "bytes", tag = "1")] + pub savepoint_id: ::prost::bytes::Bytes, +} +/// +/// Request message for the "EndTransaction" action. +/// +/// Commit (COMMIT) or rollback (ROLLBACK) the transaction. +/// +/// If the action completes successfully, the transaction handle is +/// invalidated, as are all associated savepoints. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ActionEndTransactionRequest { + /// Opaque handle for the transaction on the server. + #[prost(bytes = "bytes", tag = "1")] + pub transaction_id: ::prost::bytes::Bytes, + /// Whether to commit/rollback the given transaction. + #[prost(enumeration = "action_end_transaction_request::EndTransaction", tag = "2")] + pub action: i32, +} +/// Nested message and enum types in `ActionEndTransactionRequest`. +pub mod action_end_transaction_request { + #[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + ::prost::Enumeration + )] + #[repr(i32)] + pub enum EndTransaction { + Unspecified = 0, + /// Commit the transaction. + Commit = 1, + /// Roll back the transaction. + Rollback = 2, + } + impl EndTransaction { + /// String value of the enum field names used in the ProtoBuf definition. 
+ /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + EndTransaction::Unspecified => "END_TRANSACTION_UNSPECIFIED", + EndTransaction::Commit => "END_TRANSACTION_COMMIT", + EndTransaction::Rollback => "END_TRANSACTION_ROLLBACK", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "END_TRANSACTION_UNSPECIFIED" => Some(Self::Unspecified), + "END_TRANSACTION_COMMIT" => Some(Self::Commit), + "END_TRANSACTION_ROLLBACK" => Some(Self::Rollback), + _ => None, + } + } + } +} +/// +/// Request message for the "EndSavepoint" action. +/// +/// Release (RELEASE) the savepoint or rollback (ROLLBACK) to the +/// savepoint. +/// +/// Releasing a savepoint invalidates that savepoint. Rolling back to +/// a savepoint does not invalidate the savepoint, but invalidates all +/// savepoints created after the current savepoint. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ActionEndSavepointRequest { + /// Opaque handle for the savepoint on the server. + #[prost(bytes = "bytes", tag = "1")] + pub savepoint_id: ::prost::bytes::Bytes, + /// Whether to rollback/release the given savepoint. + #[prost(enumeration = "action_end_savepoint_request::EndSavepoint", tag = "2")] + pub action: i32, +} +/// Nested message and enum types in `ActionEndSavepointRequest`. +pub mod action_end_savepoint_request { + #[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + ::prost::Enumeration + )] + #[repr(i32)] + pub enum EndSavepoint { + Unspecified = 0, + /// Release the savepoint. + Release = 1, + /// Roll back to a savepoint. + Rollback = 2, + } + impl EndSavepoint { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + EndSavepoint::Unspecified => "END_SAVEPOINT_UNSPECIFIED", + EndSavepoint::Release => "END_SAVEPOINT_RELEASE", + EndSavepoint::Rollback => "END_SAVEPOINT_ROLLBACK", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "END_SAVEPOINT_UNSPECIFIED" => Some(Self::Unspecified), + "END_SAVEPOINT_RELEASE" => Some(Self::Release), + "END_SAVEPOINT_ROLLBACK" => Some(Self::Rollback), + _ => None, + } + } + } +} +/// /// Represents a SQL query. Used in the command member of FlightDescriptor /// for the following RPC calls: /// - GetSchema: return the Arrow schema of the query. @@ -489,6 +701,36 @@ pub struct CommandStatementQuery { /// The SQL syntax. #[prost(string, tag = "1")] pub query: ::prost::alloc::string::String, + /// Include the query as part of this transaction (if unset, the query is auto-committed). + #[prost(bytes = "bytes", optional, tag = "2")] + pub transaction_id: ::core::option::Option<::prost::bytes::Bytes>, +} +/// +/// Represents a Substrait plan. Used in the command member of FlightDescriptor +/// for the following RPC calls: +/// - GetSchema: return the Arrow schema of the query. 
+/// Fields on this schema may contain the following metadata: +/// - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name +/// - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name +/// - ARROW:FLIGHT:SQL:TABLE_NAME - Table name +/// - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. +/// - ARROW:FLIGHT:SQL:PRECISION - Column precision/size +/// - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable +/// - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. +/// - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. +/// - GetFlightInfo: execute the query. +/// - DoPut: execute the query. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct CommandStatementSubstraitPlan { + /// A serialized substrait.Plan + #[prost(message, optional, tag = "1")] + pub plan: ::core::option::Option, + /// Include the query as part of this transaction (if unset, the query is auto-committed). + #[prost(bytes = "bytes", optional, tag = "2")] + pub transaction_id: ::core::option::Option<::prost::bytes::Bytes>, } /// * /// Represents a ticket resulting from GetFlightInfo with a CommandStatementQuery. @@ -533,6 +775,9 @@ pub struct CommandStatementUpdate { /// The SQL syntax. #[prost(string, tag = "1")] pub query: ::prost::alloc::string::String, + /// Include the query as part of this transaction (if unset, the query is auto-committed). + #[prost(bytes = "bytes", optional, tag = "2")] + pub transaction_id: ::core::option::Option<::prost::bytes::Bytes>, } /// /// Represents a SQL update query. Used in the command member of FlightDescriptor @@ -557,6 +802,93 @@ pub struct DoPutUpdateResult { #[prost(int64, tag = "1")] pub record_count: i64, } +/// +/// Request message for the "CancelQuery" action. +/// +/// Explicitly cancel a running query. +/// +/// This lets a single client explicitly cancel work, no matter how many clients +/// are involved/whether the query is distributed or not, given server support. +/// The transaction/statement is not rolled back; it is the application's job to +/// commit or rollback as appropriate. This only indicates the client no longer +/// wishes to read the remainder of the query results or continue submitting +/// data. +/// +/// This command is idempotent. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ActionCancelQueryRequest { + /// The result of the GetFlightInfo RPC that initiated the query. + /// XXX(ARROW-16902): this must be a serialized FlightInfo, but is + /// rendered as bytes because Protobuf does not really support one + /// DLL using Protobuf definitions from another DLL. + #[prost(bytes = "bytes", tag = "1")] + pub info: ::prost::bytes::Bytes, +} +/// +/// The result of cancelling a query. +/// +/// The result should be wrapped in a google.protobuf.Any message. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ActionCancelQueryResult { + #[prost(enumeration = "action_cancel_query_result::CancelResult", tag = "1")] + pub result: i32, +} +/// Nested message and enum types in `ActionCancelQueryResult`. 
+pub mod action_cancel_query_result { + #[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + ::prost::Enumeration + )] + #[repr(i32)] + pub enum CancelResult { + /// The cancellation status is unknown. Servers should avoid using + /// this value (send a NOT_FOUND error if the requested query is + /// not known). Clients can retry the request. + Unspecified = 0, + /// The cancellation request is complete. Subsequent requests with + /// the same payload may return CANCELLED or a NOT_FOUND error. + Cancelled = 1, + /// The cancellation request is in progress. The client may retry + /// the cancellation request. + Cancelling = 2, + /// The query is not cancellable. The client should not retry the + /// cancellation request. + NotCancellable = 3, + } + impl CancelResult { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + CancelResult::Unspecified => "CANCEL_RESULT_UNSPECIFIED", + CancelResult::Cancelled => "CANCEL_RESULT_CANCELLED", + CancelResult::Cancelling => "CANCEL_RESULT_CANCELLING", + CancelResult::NotCancellable => "CANCEL_RESULT_NOT_CANCELLABLE", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "CANCEL_RESULT_UNSPECIFIED" => Some(Self::Unspecified), + "CANCEL_RESULT_CANCELLED" => Some(Self::Cancelled), + "CANCEL_RESULT_CANCELLING" => Some(Self::Cancelling), + "CANCEL_RESULT_NOT_CANCELLABLE" => Some(Self::NotCancellable), + _ => None, + } + } + } +} /// Options for CommandGetSqlInfo. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] @@ -575,6 +907,49 @@ pub enum SqlInfo { /// - true: if read only FlightSqlServerReadOnly = 3, /// + /// Retrieves a boolean value indicating whether the Flight SQL Server supports executing + /// SQL queries. + /// + /// Note that the absence of this info (as opposed to a false value) does not necessarily + /// mean that SQL is not supported, as this property was not originally defined. + FlightSqlServerSql = 4, + /// + /// Retrieves a boolean value indicating whether the Flight SQL Server supports executing + /// Substrait plans. + FlightSqlServerSubstrait = 5, + /// + /// Retrieves a string value indicating the minimum supported Substrait version, or null + /// if Substrait is not supported. + FlightSqlServerSubstraitMinVersion = 6, + /// + /// Retrieves a string value indicating the maximum supported Substrait version, or null + /// if Substrait is not supported. + FlightSqlServerSubstraitMaxVersion = 7, + /// + /// Retrieves an int32 indicating whether the Flight SQL Server supports the + /// BeginTransaction/EndTransaction/BeginSavepoint/EndSavepoint actions. + /// + /// Even if this is not supported, the database may still support explicit "BEGIN + /// TRANSACTION"/"COMMIT" SQL statements (see SQL_TRANSACTIONS_SUPPORTED); this property + /// is only about whether the server implements the Flight SQL API endpoints. + /// + /// The possible values are listed in `SqlSupportedTransaction`. + FlightSqlServerTransaction = 8, + /// + /// Retrieves a boolean value indicating whether the Flight SQL Server supports explicit + /// query cancellation (the CancelQuery action). 
+ FlightSqlServerCancel = 9, + /// + /// Retrieves an int32 indicating the timeout (in milliseconds) for prepared statement handles. + /// + /// If 0, there is no timeout. Servers should reset the timeout when the handle is used in a command. + FlightSqlServerStatementTimeout = 100, + /// + /// Retrieves an int32 indicating the timeout (in milliseconds) for transactions, since transactions are not tied to a connection. + /// + /// If 0, there is no timeout. Servers should reset the timeout when the handle is used in a command. + FlightSqlServerTransactionTimeout = 101, + /// /// Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of catalogs. /// /// Returns: @@ -1125,6 +1500,22 @@ impl SqlInfo { SqlInfo::FlightSqlServerVersion => "FLIGHT_SQL_SERVER_VERSION", SqlInfo::FlightSqlServerArrowVersion => "FLIGHT_SQL_SERVER_ARROW_VERSION", SqlInfo::FlightSqlServerReadOnly => "FLIGHT_SQL_SERVER_READ_ONLY", + SqlInfo::FlightSqlServerSql => "FLIGHT_SQL_SERVER_SQL", + SqlInfo::FlightSqlServerSubstrait => "FLIGHT_SQL_SERVER_SUBSTRAIT", + SqlInfo::FlightSqlServerSubstraitMinVersion => { + "FLIGHT_SQL_SERVER_SUBSTRAIT_MIN_VERSION" + } + SqlInfo::FlightSqlServerSubstraitMaxVersion => { + "FLIGHT_SQL_SERVER_SUBSTRAIT_MAX_VERSION" + } + SqlInfo::FlightSqlServerTransaction => "FLIGHT_SQL_SERVER_TRANSACTION", + SqlInfo::FlightSqlServerCancel => "FLIGHT_SQL_SERVER_CANCEL", + SqlInfo::FlightSqlServerStatementTimeout => { + "FLIGHT_SQL_SERVER_STATEMENT_TIMEOUT" + } + SqlInfo::FlightSqlServerTransactionTimeout => { + "FLIGHT_SQL_SERVER_TRANSACTION_TIMEOUT" + } SqlInfo::SqlDdlCatalog => "SQL_DDL_CATALOG", SqlInfo::SqlDdlSchema => "SQL_DDL_SCHEMA", SqlInfo::SqlDdlTable => "SQL_DDL_TABLE", @@ -1241,6 +1632,22 @@ impl SqlInfo { "FLIGHT_SQL_SERVER_VERSION" => Some(Self::FlightSqlServerVersion), "FLIGHT_SQL_SERVER_ARROW_VERSION" => Some(Self::FlightSqlServerArrowVersion), "FLIGHT_SQL_SERVER_READ_ONLY" => Some(Self::FlightSqlServerReadOnly), + "FLIGHT_SQL_SERVER_SQL" => Some(Self::FlightSqlServerSql), + "FLIGHT_SQL_SERVER_SUBSTRAIT" => Some(Self::FlightSqlServerSubstrait), + "FLIGHT_SQL_SERVER_SUBSTRAIT_MIN_VERSION" => { + Some(Self::FlightSqlServerSubstraitMinVersion) + } + "FLIGHT_SQL_SERVER_SUBSTRAIT_MAX_VERSION" => { + Some(Self::FlightSqlServerSubstraitMaxVersion) + } + "FLIGHT_SQL_SERVER_TRANSACTION" => Some(Self::FlightSqlServerTransaction), + "FLIGHT_SQL_SERVER_CANCEL" => Some(Self::FlightSqlServerCancel), + "FLIGHT_SQL_SERVER_STATEMENT_TIMEOUT" => { + Some(Self::FlightSqlServerStatementTimeout) + } + "FLIGHT_SQL_SERVER_TRANSACTION_TIMEOUT" => { + Some(Self::FlightSqlServerTransactionTimeout) + } "SQL_DDL_CATALOG" => Some(Self::SqlDdlCatalog), "SQL_DDL_SCHEMA" => Some(Self::SqlDdlSchema), "SQL_DDL_TABLE" => Some(Self::SqlDdlTable), @@ -1354,6 +1761,43 @@ impl SqlInfo { } } } +/// The level of support for Flight SQL transaction RPCs. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] +#[repr(i32)] +pub enum SqlSupportedTransaction { + /// Unknown/not indicated/no support + None = 0, + /// Transactions, but not savepoints. + /// A savepoint is a mark within a transaction that can be individually + /// rolled back to. Not all databases support savepoints. + Transaction = 1, + /// Transactions and savepoints + Savepoint = 2, +} +impl SqlSupportedTransaction { + /// String value of the enum field names used in the ProtoBuf definition. 
+ /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SqlSupportedTransaction::None => "SQL_SUPPORTED_TRANSACTION_NONE", + SqlSupportedTransaction::Transaction => { + "SQL_SUPPORTED_TRANSACTION_TRANSACTION" + } + SqlSupportedTransaction::Savepoint => "SQL_SUPPORTED_TRANSACTION_SAVEPOINT", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SQL_SUPPORTED_TRANSACTION_NONE" => Some(Self::None), + "SQL_SUPPORTED_TRANSACTION_TRANSACTION" => Some(Self::Transaction), + "SQL_SUPPORTED_TRANSACTION_SAVEPOINT" => Some(Self::Savepoint), + _ => None, + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlSupportedCaseSensitivity { diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index 15a896c109e1..c9adc2b98b12 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -116,8 +116,15 @@ impl FlightSqlServiceClient { } /// Execute a query on the server. - pub async fn execute(&mut self, query: String) -> Result { - let cmd = CommandStatementQuery { query }; + pub async fn execute( + &mut self, + query: String, + transaction_id: Option, + ) -> Result { + let cmd = CommandStatementQuery { + query, + transaction_id, + }; self.get_flight_info_for_command(cmd).await } @@ -170,8 +177,15 @@ impl FlightSqlServiceClient { } /// Execute a update query on the server, and return the number of records affected - pub async fn execute_update(&mut self, query: String) -> Result { - let cmd = CommandStatementUpdate { query }; + pub async fn execute_update( + &mut self, + query: String, + transaction_id: Option, + ) -> Result { + let cmd = CommandStatementUpdate { + query, + transaction_id, + }; let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); let req = self.set_request_headers( stream::iter(vec![FlightData { @@ -325,8 +339,12 @@ impl FlightSqlServiceClient { pub async fn prepare( &mut self, query: String, + transaction_id: Option, ) -> Result, ArrowError> { - let cmd = ActionCreatePreparedStatementRequest { query }; + let cmd = ActionCreatePreparedStatementRequest { + query, + transaction_id, + }; let action = Action { r#type: CREATE_PREPARED_STATEMENT.to_string(), body: cmd.as_any().encode_to_vec().into(), diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index ed26b38751c5..797ddfc9e4a6 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -46,9 +46,18 @@ mod gen { include!("arrow.flight.protocol.sql.rs"); } +pub use gen::ActionBeginSavepointRequest; +pub use gen::ActionBeginSavepointResult; +pub use gen::ActionBeginTransactionRequest; +pub use gen::ActionBeginTransactionResult; +pub use gen::ActionCancelQueryRequest; +pub use gen::ActionCancelQueryResult; pub use gen::ActionClosePreparedStatementRequest; pub use gen::ActionCreatePreparedStatementRequest; pub use gen::ActionCreatePreparedStatementResult; +pub use gen::ActionCreatePreparedSubstraitPlanRequest; +pub use gen::ActionEndSavepointRequest; +pub use gen::ActionEndTransactionRequest; pub use gen::CommandGetCatalogs; pub use gen::CommandGetCrossReference; pub use gen::CommandGetDbSchemas; @@ -62,6 +71,7 @@ pub use gen::CommandGetXdbcTypeInfo; pub use gen::CommandPreparedStatementQuery; pub 
use gen::CommandPreparedStatementUpdate; pub use gen::CommandStatementQuery; +pub use gen::CommandStatementSubstraitPlan; pub use gen::CommandStatementUpdate; pub use gen::DoPutUpdateResult; pub use gen::SqlInfo; @@ -120,6 +130,7 @@ macro_rules! prost_message_ext { /// # use arrow_flight::sql::{Any, CommandStatementQuery, Command}; /// let flightsql_message = CommandStatementQuery { /// query: "SELECT * FROM foo".to_string(), + /// transaction_id: None, /// }; /// /// // Given a packed FlightSQL Any message @@ -203,9 +214,18 @@ macro_rules! prost_message_ext { // Implement ProstMessageExt for all structs defined in FlightSql.proto prost_message_ext!( + ActionBeginSavepointRequest, + ActionBeginSavepointResult, + ActionBeginTransactionRequest, + ActionBeginTransactionResult, + ActionCancelQueryRequest, + ActionCancelQueryResult, ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, + ActionCreatePreparedSubstraitPlanRequest, + ActionEndSavepointRequest, + ActionEndTransactionRequest, CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, @@ -219,6 +239,7 @@ prost_message_ext!( CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, + CommandStatementSubstraitPlan, CommandStatementUpdate, DoPutUpdateResult, TicketStatementQuery, @@ -296,6 +317,7 @@ mod tests { fn test_prost_any_pack_unpack() { let query = CommandStatementQuery { query: "select 1".to_string(), + transaction_id: None, }; let any = Any::pack(&query).unwrap(); assert!(any.is::()); @@ -307,6 +329,7 @@ mod tests { fn test_command() { let query = CommandStatementQuery { query: "select 1".to_string(), + transaction_id: None, }; let any = Any::pack(&query).unwrap(); let cmd: Command = any.try_into().unwrap(); diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index 9a0183495434..89eb70e23b35 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -30,17 +30,28 @@ use super::{ FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, }, + ActionBeginSavepointRequest, ActionBeginSavepointResult, + ActionBeginTransactionRequest, ActionBeginTransactionResult, + ActionCancelQueryRequest, ActionCancelQueryResult, ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, - ActionCreatePreparedStatementResult, CommandGetCatalogs, CommandGetCrossReference, - CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, - CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, - CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, - CommandPreparedStatementUpdate, CommandStatementQuery, CommandStatementUpdate, - DoPutUpdateResult, ProstMessageExt, SqlInfo, TicketStatementQuery, + ActionCreatePreparedStatementResult, ActionCreatePreparedSubstraitPlanRequest, + ActionEndSavepointRequest, ActionEndTransactionRequest, CommandGetCatalogs, + CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, + CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, + CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, + CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, + CommandStatementSubstraitPlan, CommandStatementUpdate, DoPutUpdateResult, + ProstMessageExt, SqlInfo, TicketStatementQuery, }; pub(crate) static CREATE_PREPARED_STATEMENT: &str = "CreatePreparedStatement"; pub(crate) static CLOSE_PREPARED_STATEMENT: &str = 
"ClosePreparedStatement"; +pub(crate) static CREATE_PREPARED_SUBSTRAIT_PLAN: &str = "CreatePreparedSubstraitPlan"; +pub(crate) static BEGIN_TRANSACTION: &str = "BeginTransaction"; +pub(crate) static END_TRANSACTION: &str = "EndTransaction"; +pub(crate) static BEGIN_SAVEPOINT: &str = "BeginSavepoint"; +pub(crate) static END_SAVEPOINT: &str = "EndSavepoint"; +pub(crate) static CANCEL_QUERY: &str = "CancelQuery"; /// Implements FlightSqlService to handle the flight sql protocol #[tonic::async_trait] @@ -81,6 +92,13 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { request: Request, ) -> Result, Status>; + /// Get a FlightInfo for executing a substrait plan. + async fn get_flight_info_substrait_plan( + &self, + query: CommandStatementSubstraitPlan, + request: Request, + ) -> Result, Status>; + /// Get a FlightInfo for executing an already created prepared statement. async fn get_flight_info_prepared_statement( &self, @@ -267,6 +285,13 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { request: Request>, ) -> Result; + /// Execute a substrait plan + async fn do_put_substrait_plan( + &self, + query: CommandStatementSubstraitPlan, + request: Request>, + ) -> Result; + // do_action /// Create a prepared statement from given SQL statement. @@ -281,7 +306,49 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { &self, query: ActionClosePreparedStatementRequest, request: Request, - ); + ) -> Result<(), Status>; + + /// Create a prepared substrait plan. + async fn do_action_create_prepared_substrait_plan( + &self, + query: ActionCreatePreparedSubstraitPlanRequest, + request: Request, + ) -> Result; + + /// Begin a transaction + async fn do_action_begin_transaction( + &self, + query: ActionBeginTransactionRequest, + request: Request, + ) -> Result; + + /// End a transaction + async fn do_action_end_transaction( + &self, + query: ActionEndTransactionRequest, + request: Request, + ) -> Result<(), Status>; + + /// Begin a savepoint + async fn do_action_begin_savepoint( + &self, + query: ActionBeginSavepointRequest, + request: Request, + ) -> Result; + + /// End a savepoint + async fn do_action_end_savepoint( + &self, + query: ActionEndSavepointRequest, + request: Request, + ) -> Result<(), Status>; + + /// Cancel a query + async fn do_action_cancel_query( + &self, + query: ActionCancelQueryRequest, + request: Request, + ) -> Result; /// Register a new SqlInfo result, making it available when calling GetSqlInfo. 
async fn register_sql_info(&self, id: i32, result: &SqlInfo); @@ -339,6 +406,9 @@ where self.get_flight_info_prepared_statement(handle, request) .await } + Command::CommandStatementSubstraitPlan(handle) => { + self.get_flight_info_substrait_plan(handle, request).await + } Command::CommandGetCatalogs(token) => { self.get_flight_info_catalogs(token, request).await } @@ -450,6 +520,14 @@ where Command::CommandPreparedStatementQuery(command) => { self.do_put_prepared_statement_query(command, request).await } + Command::CommandStatementSubstraitPlan(command) => { + let record_count = self.do_put_substrait_plan(command, request).await?; + let result = DoPutUpdateResult { record_count }; + let output = futures::stream::iter(vec![Ok(PutResult { + app_metadata: result.as_any().encode_to_vec().into(), + })]); + Ok(Response::new(Box::pin(output))) + } Command::CommandPreparedStatementUpdate(command) => { let record_count = self .do_put_prepared_statement_update(command, request) @@ -485,9 +563,58 @@ where Response Message: N/A" .into(), }; + let create_prepared_substrait_plan_action_type = ActionType { + r#type: CREATE_PREPARED_SUBSTRAIT_PLAN.to_string(), + description: + "Creates a reusable prepared substrait plan resource on the server.\n + Request Message: ActionCreatePreparedSubstraitPlanRequest\n + Response Message: ActionCreatePreparedStatementResult" + .into(), + }; + let begin_transaction_action_type = ActionType { + r#type: BEGIN_TRANSACTION.to_string(), + description: "Begins a transaction.\n + Request Message: ActionBeginTransactionRequest\n + Response Message: ActionBeginTransactionResult" + .into(), + }; + let end_transaction_action_type = ActionType { + r#type: END_TRANSACTION.to_string(), + description: "Ends a transaction\n + Request Message: ActionEndTransactionRequest\n + Response Message: N/A" + .into(), + }; + let begin_savepoint_action_type = ActionType { + r#type: BEGIN_SAVEPOINT.to_string(), + description: "Begins a savepoint.\n + Request Message: ActionBeginSavepointRequest\n + Response Message: ActionBeginSavepointResult" + .into(), + }; + let end_savepoint_action_type = ActionType { + r#type: END_SAVEPOINT.to_string(), + description: "Ends a savepoint\n + Request Message: ActionEndSavepointRequest\n + Response Message: N/A" + .into(), + }; + let cancel_query_action_type = ActionType { + r#type: CANCEL_QUERY.to_string(), + description: "Cancels a query\n + Request Message: ActionCancelQueryRequest\n + Response Message: ActionCancelQueryResult" + .into(), + }; let actions: Vec> = vec![ Ok(create_prepared_statement_action_type), Ok(close_prepared_statement_action_type), + Ok(create_prepared_substrait_plan_action_type), + Ok(begin_transaction_action_type), + Ok(end_transaction_action_type), + Ok(begin_savepoint_action_type), + Ok(end_savepoint_action_type), + Ok(cancel_query_action_type), ]; let output = futures::stream::iter(actions); Ok(Response::new(Box::pin(output) as Self::ListActionsStream)) @@ -516,8 +643,7 @@ where body: stmt.as_any().encode_to_vec().into(), })]); return Ok(Response::new(Box::pin(output))); - } - if request.get_ref().r#type == CLOSE_PREPARED_STATEMENT { + } else if request.get_ref().r#type == CLOSE_PREPARED_STATEMENT { let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; @@ -529,8 +655,101 @@ where "Unable to unpack ActionClosePreparedStatementRequest.", ) })?; - self.do_action_close_prepared_statement(cmd, request).await; + self.do_action_close_prepared_statement(cmd, request) + .await?; + return 
Ok(Response::new(Box::pin(futures::stream::empty()))); + } else if request.get_ref().r#type == CREATE_PREPARED_SUBSTRAIT_PLAN { + let any = + Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + + let cmd: ActionCreatePreparedSubstraitPlanRequest = any + .unpack() + .map_err(arrow_error_to_status)? + .ok_or_else(|| { + Status::invalid_argument( + "Unable to unpack ActionCreatePreparedSubstraitPlanRequest.", + ) + })?; + self.do_action_create_prepared_substrait_plan(cmd, request) + .await?; + return Ok(Response::new(Box::pin(futures::stream::empty()))); + } else if request.get_ref().r#type == BEGIN_TRANSACTION { + let any = + Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + + let cmd: ActionBeginTransactionRequest = any + .unpack() + .map_err(arrow_error_to_status)? + .ok_or_else(|| { + Status::invalid_argument( + "Unable to unpack ActionBeginTransactionRequest.", + ) + })?; + let stmt = self.do_action_begin_transaction(cmd, request).await?; + let output = futures::stream::iter(vec![Ok(super::super::gen::Result { + body: stmt.as_any().encode_to_vec().into(), + })]); + return Ok(Response::new(Box::pin(output))); + } else if request.get_ref().r#type == END_TRANSACTION { + let any = + Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + + let cmd: ActionEndTransactionRequest = any + .unpack() + .map_err(arrow_error_to_status)? + .ok_or_else(|| { + Status::invalid_argument( + "Unable to unpack ActionEndTransactionRequest.", + ) + })?; + self.do_action_end_transaction(cmd, request).await?; return Ok(Response::new(Box::pin(futures::stream::empty()))); + } else if request.get_ref().r#type == BEGIN_SAVEPOINT { + let any = + Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + + let cmd: ActionBeginSavepointRequest = any + .unpack() + .map_err(arrow_error_to_status)? + .ok_or_else(|| { + Status::invalid_argument( + "Unable to unpack ActionBeginSavepointRequest.", + ) + })?; + let stmt = self.do_action_begin_savepoint(cmd, request).await?; + let output = futures::stream::iter(vec![Ok(super::super::gen::Result { + body: stmt.as_any().encode_to_vec().into(), + })]); + return Ok(Response::new(Box::pin(output))); + } else if request.get_ref().r#type == END_SAVEPOINT { + let any = + Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + + let cmd: ActionEndSavepointRequest = any + .unpack() + .map_err(arrow_error_to_status)? + .ok_or_else(|| { + Status::invalid_argument( + "Unable to unpack ActionEndSavepointRequest.", + ) + })?; + self.do_action_end_savepoint(cmd, request).await?; + return Ok(Response::new(Box::pin(futures::stream::empty()))); + } else if request.get_ref().r#type == CANCEL_QUERY { + let any = + Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + + let cmd: ActionCancelQueryRequest = any + .unpack() + .map_err(arrow_error_to_status)? 
+ .ok_or_else(|| { + Status::invalid_argument("Unable to unpack ActionCancelQueryRequest.") + })?; + let stmt = self.do_action_cancel_query(cmd, request).await?; + let output = futures::stream::iter(vec![Ok(super::super::gen::Result { + body: stmt.as_any().encode_to_vec().into(), + })]); + return Ok(Response::new(Box::pin(output))); } Err(Status::invalid_argument(format!( diff --git a/arrow-flight/tests/client.rs b/arrow-flight/tests/client.rs index ed928a52c99a..8ea542879a27 100644 --- a/arrow-flight/tests/client.rs +++ b/arrow-flight/tests/client.rs @@ -104,6 +104,7 @@ fn test_flight_info(request: &FlightDescriptor) -> FlightInfo { flight_descriptor: Some(request.clone()), total_bytes: 123, total_records: 456, + ordered: false, } } diff --git a/arrow-flight/tests/flight_sql_client_cli.rs b/arrow-flight/tests/flight_sql_client_cli.rs index 248b3732ff97..9b3baca9ba6c 100644 --- a/arrow-flight/tests/flight_sql_client_cli.rs +++ b/arrow-flight/tests/flight_sql_client_cli.rs @@ -21,13 +21,17 @@ use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray}; use arrow_flight::{ flight_service_server::{FlightService, FlightServiceServer}, sql::{ - server::FlightSqlService, ActionClosePreparedStatementRequest, - ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, Any, - CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, - CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, - CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, - CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, - CommandPreparedStatementUpdate, CommandStatementQuery, CommandStatementUpdate, + server::FlightSqlService, ActionBeginSavepointRequest, + ActionBeginSavepointResult, ActionBeginTransactionRequest, + ActionBeginTransactionResult, ActionCancelQueryRequest, ActionCancelQueryResult, + ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, + ActionCreatePreparedStatementResult, ActionCreatePreparedSubstraitPlanRequest, + ActionEndSavepointRequest, ActionEndTransactionRequest, Any, CommandGetCatalogs, + CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, + CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, + CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, + CommandPreparedStatementQuery, CommandPreparedStatementUpdate, + CommandStatementQuery, CommandStatementSubstraitPlan, CommandStatementUpdate, ProstMessageExt, SqlInfo, TicketStatementQuery, }, utils::batches_to_flight_data, @@ -197,6 +201,7 @@ impl FlightSqlService for FlightSqlServiceImpl { ], total_records: batch.num_rows() as i64, total_bytes: batch.get_array_memory_size() as i64, + ordered: false, }; let resp = Response::new(info); Ok(resp) @@ -212,6 +217,16 @@ impl FlightSqlService for FlightSqlServiceImpl { )) } + async fn get_flight_info_substrait_plan( + &self, + _query: CommandStatementSubstraitPlan, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_substrait_plan not implemented", + )) + } + async fn get_flight_info_catalogs( &self, _query: CommandGetCatalogs, @@ -430,6 +445,16 @@ impl FlightSqlService for FlightSqlServiceImpl { )) } + async fn do_put_substrait_plan( + &self, + _ticket: CommandStatementSubstraitPlan, + _request: Request>, + ) -> Result { + Err(Status::unimplemented( + "do_put_substrait_plan not implemented", + )) + } + async fn do_put_prepared_statement_query( &self, _query: CommandPreparedStatementQuery, @@ -464,7 +489,56 @@ impl FlightSqlService for 
FlightSqlServiceImpl { &self, _query: ActionClosePreparedStatementRequest, _request: Request, - ) { + ) -> Result<(), Status> { + unimplemented!("Implement do_action_close_prepared_statement") + } + + async fn do_action_create_prepared_substrait_plan( + &self, + _query: ActionCreatePreparedSubstraitPlanRequest, + _request: Request, + ) -> Result { + unimplemented!("Implement do_action_create_prepared_substrait_plan") + } + + async fn do_action_begin_transaction( + &self, + _query: ActionBeginTransactionRequest, + _request: Request, + ) -> Result { + unimplemented!("Implement do_action_begin_transaction") + } + + async fn do_action_end_transaction( + &self, + _query: ActionEndTransactionRequest, + _request: Request, + ) -> Result<(), Status> { + unimplemented!("Implement do_action_end_transaction") + } + + async fn do_action_begin_savepoint( + &self, + _query: ActionBeginSavepointRequest, + _request: Request, + ) -> Result { + unimplemented!("Implement do_action_begin_savepoint") + } + + async fn do_action_end_savepoint( + &self, + _query: ActionEndSavepointRequest, + _request: Request, + ) -> Result<(), Status> { + unimplemented!("Implement do_action_end_savepoint") + } + + async fn do_action_cancel_query( + &self, + _query: ActionCancelQueryRequest, + _request: Request, + ) -> Result { + unimplemented!("Implement do_action_cancel_query") } async fn register_sql_info(&self, _id: i32, _result: &SqlInfo) {} diff --git a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs index 51d08d94313c..e2c4cb5d88f3 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs @@ -200,6 +200,7 @@ impl FlightService for FlightServiceImpl { endpoint: vec![endpoint], total_records: total_records as i64, total_bytes: -1, + ordered: false, }; Ok(Response::new(info)) diff --git a/format/Flight.proto b/format/Flight.proto index 635b1793d2ba..9b44331a5765 100644 --- a/format/Flight.proto +++ b/format/Flight.proto @@ -16,347 +16,365 @@ * limitations under the License. */ -syntax = "proto3"; - -option java_package = "org.apache.arrow.flight.impl"; -option go_package = "github.com/apache/arrow/go/arrow/flight/internal/flight"; -option csharp_namespace = "Apache.Arrow.Flight.Protocol"; - -package arrow.flight.protocol; - -/* - * A flight service is an endpoint for retrieving or storing Arrow data. A - * flight service can expose one or more predefined endpoints that can be - * accessed using the Arrow Flight Protocol. Additionally, a flight service - * can expose a set of actions that are available. - */ -service FlightService { - - /* - * Handshake between client and server. Depending on the server, the - * handshake may be required to determine the token that should be used for - * future operations. Both request and response are streams to allow multiple - * round-trips depending on auth mechanism. - */ - rpc Handshake(stream HandshakeRequest) returns (stream HandshakeResponse) {} - - /* - * Get a list of available streams given a particular criteria. Most flight - * services will expose one or more streams that are readily available for - * retrieval. This api allows listing the streams available for - * consumption. A user can also provide a criteria. The criteria can limit - * the subset of streams that can be listed via this interface. 
Each flight - * service allows its own definition of how to consume criteria. - */ - rpc ListFlights(Criteria) returns (stream FlightInfo) {} - - /* - * For a given FlightDescriptor, get information about how the flight can be - * consumed. This is a useful interface if the consumer of the interface - * already can identify the specific flight to consume. This interface can - * also allow a consumer to generate a flight stream through a specified - * descriptor. For example, a flight descriptor might be something that - * includes a SQL statement or a Pickled Python operation that will be - * executed. In those cases, the descriptor will not be previously available - * within the list of available streams provided by ListFlights but will be - * available for consumption for the duration defined by the specific flight - * service. - */ - rpc GetFlightInfo(FlightDescriptor) returns (FlightInfo) {} - - /* - * For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema - * This is used when a consumer needs the Schema of flight stream. Similar to - * GetFlightInfo this interface may generate a new flight that was not previously - * available in ListFlights. - */ - rpc GetSchema(FlightDescriptor) returns (SchemaResult) {} - - /* - * Retrieve a single stream associated with a particular descriptor - * associated with the referenced ticket. A Flight can be composed of one or - * more streams where each stream can be retrieved using a separate opaque - * ticket that the flight service uses for managing a collection of streams. - */ - rpc DoGet(Ticket) returns (stream FlightData) {} - - /* - * Push a stream to the flight service associated with a particular - * flight stream. This allows a client of a flight service to upload a stream - * of data. Depending on the particular flight service, a client consumer - * could be allowed to upload a single stream per descriptor or an unlimited - * number. In the latter, the service might implement a 'seal' action that - * can be applied to a descriptor once all streams are uploaded. - */ - rpc DoPut(stream FlightData) returns (stream PutResult) {} - - /* - * Open a bidirectional data channel for a given descriptor. This - * allows clients to send and receive arbitrary Arrow data and - * application-specific metadata in a single logical stream. In - * contrast to DoGet/DoPut, this is more suited for clients - * offloading computation (rather than storage) to a Flight service. - */ - rpc DoExchange(stream FlightData) returns (stream FlightData) {} - - /* - * Flight services can support an arbitrary number of simple actions in - * addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut - * operations that are potentially available. DoAction allows a flight client - * to do a specific action against a flight service. An action includes - * opaque request and response objects that are specific to the type action - * being undertaken. - */ - rpc DoAction(Action) returns (stream Result) {} - - /* - * A flight service exposes all of the available action types that it has - * along with descriptions. This allows different flight consumers to - * understand the capabilities of the flight service. - */ - rpc ListActions(Empty) returns (stream ActionType) {} - -} - -/* - * The request that a client provides to a server on handshake. - */ -message HandshakeRequest { - - /* - * A defined protocol version - */ - uint64 protocol_version = 1; - - /* - * Arbitrary auth/handshake info. 
- */ - bytes payload = 2; -} - -message HandshakeResponse { - - /* - * A defined protocol version - */ - uint64 protocol_version = 1; - - /* - * Arbitrary auth/handshake info. - */ - bytes payload = 2; -} - -/* - * A message for doing simple auth. - */ -message BasicAuth { - string username = 2; - string password = 3; -} - -message Empty {} - -/* - * Describes an available action, including both the name used for execution - * along with a short description of the purpose of the action. - */ -message ActionType { - string type = 1; - string description = 2; -} - -/* - * A service specific expression that can be used to return a limited set - * of available Arrow Flight streams. - */ -message Criteria { - bytes expression = 1; -} - -/* - * An opaque action specific for the service. - */ -message Action { - string type = 1; - bytes body = 2; -} - -/* - * An opaque result returned after executing an action. - */ -message Result { - bytes body = 1; -} - -/* - * Wrap the result of a getSchema call - */ -message SchemaResult { - // The schema of the dataset in its IPC form: - // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix - // 4 bytes - the byte length of the payload - // a flatbuffer Message whose header is the Schema - bytes schema = 1; -} - -/* - * The name or tag for a Flight. May be used as a way to retrieve or generate - * a flight or be used to expose a set of previously defined flights. - */ -message FlightDescriptor { - - /* - * Describes what type of descriptor is defined. - */ - enum DescriptorType { - - // Protobuf pattern, not used. - UNKNOWN = 0; - - /* - * A named path that identifies a dataset. A path is composed of a string - * or list of strings describing a particular dataset. This is conceptually - * similar to a path inside a filesystem. - */ - PATH = 1; - - /* - * An opaque command to generate a dataset. - */ - CMD = 2; - } - - DescriptorType type = 1; - - /* - * Opaque value used to express a command. Should only be defined when - * type = CMD. - */ - bytes cmd = 2; - - /* - * List of strings identifying a particular dataset. Should only be defined - * when type = PATH. - */ - repeated string path = 3; -} - -/* - * The access coordinates for retrieval of a dataset. With a FlightInfo, a - * consumer is able to determine how to retrieve a dataset. - */ -message FlightInfo { - // The schema of the dataset in its IPC form: - // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix - // 4 bytes - the byte length of the payload - // a flatbuffer Message whose header is the Schema - bytes schema = 1; - - /* - * The descriptor associated with this info. - */ - FlightDescriptor flight_descriptor = 2; - - /* - * A list of endpoints associated with the flight. To consume the - * whole flight, all endpoints (and hence all Tickets) must be - * consumed. Endpoints can be consumed in any order. - * - * In other words, an application can use multiple endpoints to - * represent partitioned data. - * - * There is no ordering defined on endpoints. Hence, if the returned - * data has an ordering, it should be returned in a single endpoint. - */ - repeated FlightEndpoint endpoint = 3; - - // Set these to -1 if unknown. - int64 total_records = 4; - int64 total_bytes = 5; -} - -/* - * A particular stream or split associated with a flight. - */ -message FlightEndpoint { - - /* - * Token used to retrieve this stream. - */ - Ticket ticket = 1; - - /* - * A list of URIs where this ticket can be redeemed via DoGet(). 
- * - * If the list is empty, the expectation is that the ticket can only - * be redeemed on the current service where the ticket was - * generated. - * - * If the list is not empty, the expectation is that the ticket can - * be redeemed at any of the locations, and that the data returned - * will be equivalent. In this case, the ticket may only be redeemed - * at one of the given locations, and not (necessarily) on the - * current service. - * - * In other words, an application can use multiple locations to - * represent redundant and/or load balanced services. - */ - repeated Location location = 2; -} - -/* - * A location where a Flight service will accept retrieval of a particular - * stream given a ticket. - */ -message Location { - string uri = 1; -} - -/* - * An opaque identifier that the service can use to retrieve a particular - * portion of a stream. - * - * Tickets are meant to be single use. It is an error/application-defined - * behavior to reuse a ticket. - */ -message Ticket { - bytes ticket = 1; -} - -/* - * A batch of Arrow data as part of a stream of batches. - */ -message FlightData { - - /* - * The descriptor of the data. This is only relevant when a client is - * starting a new DoPut stream. - */ - FlightDescriptor flight_descriptor = 1; - - /* - * Header for message data as described in Message.fbs::Message. - */ - bytes data_header = 2; - - /* - * Application-defined metadata. - */ - bytes app_metadata = 3; - - /* - * The actual batch of Arrow data. Preferably handled with minimal-copies - * coming last in the definition to help with sidecar patterns (it is - * expected that some implementations will fetch this field off the wire - * with specialized code to avoid extra memory copies). - */ - bytes data_body = 1000; -} - -/** - * The response message associated with the submission of a DoPut. - */ -message PutResult { - bytes app_metadata = 1; -} + syntax = "proto3"; + + option java_package = "org.apache.arrow.flight.impl"; + option go_package = "github.com/apache/arrow/go/arrow/flight/internal/flight"; + option csharp_namespace = "Apache.Arrow.Flight.Protocol"; + + package arrow.flight.protocol; + + /* + * A flight service is an endpoint for retrieving or storing Arrow data. A + * flight service can expose one or more predefined endpoints that can be + * accessed using the Arrow Flight Protocol. Additionally, a flight service + * can expose a set of actions that are available. + */ + service FlightService { + + /* + * Handshake between client and server. Depending on the server, the + * handshake may be required to determine the token that should be used for + * future operations. Both request and response are streams to allow multiple + * round-trips depending on auth mechanism. + */ + rpc Handshake(stream HandshakeRequest) returns (stream HandshakeResponse) {} + + /* + * Get a list of available streams given a particular criteria. Most flight + * services will expose one or more streams that are readily available for + * retrieval. This api allows listing the streams available for + * consumption. A user can also provide a criteria. The criteria can limit + * the subset of streams that can be listed via this interface. Each flight + * service allows its own definition of how to consume criteria. + */ + rpc ListFlights(Criteria) returns (stream FlightInfo) {} + + /* + * For a given FlightDescriptor, get information about how the flight can be + * consumed. 
This is a useful interface if the consumer of the interface + * already can identify the specific flight to consume. This interface can + * also allow a consumer to generate a flight stream through a specified + * descriptor. For example, a flight descriptor might be something that + * includes a SQL statement or a Pickled Python operation that will be + * executed. In those cases, the descriptor will not be previously available + * within the list of available streams provided by ListFlights but will be + * available for consumption for the duration defined by the specific flight + * service. + */ + rpc GetFlightInfo(FlightDescriptor) returns (FlightInfo) {} + + /* + * For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema + * This is used when a consumer needs the Schema of flight stream. Similar to + * GetFlightInfo this interface may generate a new flight that was not previously + * available in ListFlights. + */ + rpc GetSchema(FlightDescriptor) returns (SchemaResult) {} + + /* + * Retrieve a single stream associated with a particular descriptor + * associated with the referenced ticket. A Flight can be composed of one or + * more streams where each stream can be retrieved using a separate opaque + * ticket that the flight service uses for managing a collection of streams. + */ + rpc DoGet(Ticket) returns (stream FlightData) {} + + /* + * Push a stream to the flight service associated with a particular + * flight stream. This allows a client of a flight service to upload a stream + * of data. Depending on the particular flight service, a client consumer + * could be allowed to upload a single stream per descriptor or an unlimited + * number. In the latter, the service might implement a 'seal' action that + * can be applied to a descriptor once all streams are uploaded. + */ + rpc DoPut(stream FlightData) returns (stream PutResult) {} + + /* + * Open a bidirectional data channel for a given descriptor. This + * allows clients to send and receive arbitrary Arrow data and + * application-specific metadata in a single logical stream. In + * contrast to DoGet/DoPut, this is more suited for clients + * offloading computation (rather than storage) to a Flight service. + */ + rpc DoExchange(stream FlightData) returns (stream FlightData) {} + + /* + * Flight services can support an arbitrary number of simple actions in + * addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut + * operations that are potentially available. DoAction allows a flight client + * to do a specific action against a flight service. An action includes + * opaque request and response objects that are specific to the type action + * being undertaken. + */ + rpc DoAction(Action) returns (stream Result) {} + + /* + * A flight service exposes all of the available action types that it has + * along with descriptions. This allows different flight consumers to + * understand the capabilities of the flight service. + */ + rpc ListActions(Empty) returns (stream ActionType) {} + + } + + /* + * The request that a client provides to a server on handshake. + */ + message HandshakeRequest { + + /* + * A defined protocol version + */ + uint64 protocol_version = 1; + + /* + * Arbitrary auth/handshake info. + */ + bytes payload = 2; + } + + message HandshakeResponse { + + /* + * A defined protocol version + */ + uint64 protocol_version = 1; + + /* + * Arbitrary auth/handshake info. + */ + bytes payload = 2; + } + + /* + * A message for doing simple auth. 
+ */ + message BasicAuth { + string username = 2; + string password = 3; + } + + message Empty {} + + /* + * Describes an available action, including both the name used for execution + * along with a short description of the purpose of the action. + */ + message ActionType { + string type = 1; + string description = 2; + } + + /* + * A service specific expression that can be used to return a limited set + * of available Arrow Flight streams. + */ + message Criteria { + bytes expression = 1; + } + + /* + * An opaque action specific for the service. + */ + message Action { + string type = 1; + bytes body = 2; + } + + /* + * An opaque result returned after executing an action. + */ + message Result { + bytes body = 1; + } + + /* + * Wrap the result of a getSchema call + */ + message SchemaResult { + // The schema of the dataset in its IPC form: + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema + bytes schema = 1; + } + + /* + * The name or tag for a Flight. May be used as a way to retrieve or generate + * a flight or be used to expose a set of previously defined flights. + */ + message FlightDescriptor { + + /* + * Describes what type of descriptor is defined. + */ + enum DescriptorType { + + // Protobuf pattern, not used. + UNKNOWN = 0; + + /* + * A named path that identifies a dataset. A path is composed of a string + * or list of strings describing a particular dataset. This is conceptually + * similar to a path inside a filesystem. + */ + PATH = 1; + + /* + * An opaque command to generate a dataset. + */ + CMD = 2; + } + + DescriptorType type = 1; + + /* + * Opaque value used to express a command. Should only be defined when + * type = CMD. + */ + bytes cmd = 2; + + /* + * List of strings identifying a particular dataset. Should only be defined + * when type = PATH. + */ + repeated string path = 3; + } + + /* + * The access coordinates for retrieval of a dataset. With a FlightInfo, a + * consumer is able to determine how to retrieve a dataset. + */ + message FlightInfo { + // The schema of the dataset in its IPC form: + // 4 bytes - an optional IPC_CONTINUATION_TOKEN prefix + // 4 bytes - the byte length of the payload + // a flatbuffer Message whose header is the Schema + bytes schema = 1; + + /* + * The descriptor associated with this info. + */ + FlightDescriptor flight_descriptor = 2; + + /* + * A list of endpoints associated with the flight. To consume the + * whole flight, all endpoints (and hence all Tickets) must be + * consumed. Endpoints can be consumed in any order. + * + * In other words, an application can use multiple endpoints to + * represent partitioned data. + * + * If the returned data has an ordering, an application can use + * "FlightInfo.ordered = true" or should return the all data in a + * single endpoint. Otherwise, there is no ordering defined on + * endpoints or the data within. + * + * A client can read ordered data by reading data from returned + * endpoints, in order, from front to back. + * + * Note that a client may ignore "FlightInfo.ordered = true". If an + * ordering is important for an application, an application must + * choose one of them: + * + * * An application requires that all clients must read data in + * returned endpoints order. + * * An application must return the all data in a single endpoint. + */ + repeated FlightEndpoint endpoint = 3; + + // Set these to -1 if unknown. 
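An illustrative sketch, not taken from this patch: a consumer of the prost-generated FlightInfo type could honor the new `ordered` flag roughly as follows. `read_endpoint` is a hypothetical helper standing in for a DoGet call, and the `futures` dependency is assumed.

    use arrow_flight::{FlightEndpoint, FlightInfo};
    use futures::future::join_all;

    // Hypothetical helper: redeem the endpoint's ticket via DoGet and report
    // how many record batches it produced.
    async fn read_endpoint(_endpoint: &FlightEndpoint) -> usize {
        0
    }

    async fn consume(info: &FlightInfo) -> usize {
        if info.ordered {
            // The endpoints are in the same order as the data, so read them
            // sequentially from front to back.
            let mut total = 0;
            for endpoint in &info.endpoint {
                total += read_endpoint(endpoint).await;
            }
            total
        } else {
            // No ordering is defined, so the endpoints may be fetched
            // concurrently (or in any order).
            join_all(info.endpoint.iter().map(read_endpoint)).await.iter().sum()
        }
    }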
+ int64 total_records = 4; + int64 total_bytes = 5; + + /* + * FlightEndpoints are in the same order as the data. + */ + bool ordered = 6; + } + + /* + * A particular stream or split associated with a flight. + */ + message FlightEndpoint { + + /* + * Token used to retrieve this stream. + */ + Ticket ticket = 1; + + /* + * A list of URIs where this ticket can be redeemed via DoGet(). + * + * If the list is empty, the expectation is that the ticket can only + * be redeemed on the current service where the ticket was + * generated. + * + * If the list is not empty, the expectation is that the ticket can + * be redeemed at any of the locations, and that the data returned + * will be equivalent. In this case, the ticket may only be redeemed + * at one of the given locations, and not (necessarily) on the + * current service. + * + * In other words, an application can use multiple locations to + * represent redundant and/or load balanced services. + */ + repeated Location location = 2; + } + + /* + * A location where a Flight service will accept retrieval of a particular + * stream given a ticket. + */ + message Location { + string uri = 1; + } + + /* + * An opaque identifier that the service can use to retrieve a particular + * portion of a stream. + * + * Tickets are meant to be single use. It is an error/application-defined + * behavior to reuse a ticket. + */ + message Ticket { + bytes ticket = 1; + } + + /* + * A batch of Arrow data as part of a stream of batches. + */ + message FlightData { + + /* + * The descriptor of the data. This is only relevant when a client is + * starting a new DoPut stream. + */ + FlightDescriptor flight_descriptor = 1; + + /* + * Header for message data as described in Message.fbs::Message. + */ + bytes data_header = 2; + + /* + * Application-defined metadata. + */ + bytes app_metadata = 3; + + /* + * The actual batch of Arrow data. Preferably handled with minimal-copies + * coming last in the definition to help with sidecar patterns (it is + * expected that some implementations will fetch this field off the wire + * with specialized code to avoid extra memory copies). + */ + bytes data_body = 1000; + } + + /** + * The response message associated with the submission of a DoPut. + */ + message PutResult { + bytes app_metadata = 1; + } \ No newline at end of file diff --git a/format/FlightSql.proto b/format/FlightSql.proto index 859427b68804..0acf647e1045 100644 --- a/format/FlightSql.proto +++ b/format/FlightSql.proto @@ -16,1540 +16,1832 @@ * limitations under the License. */ -syntax = "proto3"; -import "google/protobuf/descriptor.proto"; - -option java_package = "org.apache.arrow.flight.sql.impl"; -option go_package = "github.com/apache/arrow/go/arrow/flight/internal/flight"; -package arrow.flight.protocol.sql; - -/* - * Represents a metadata request. Used in the command member of FlightDescriptor - * for the following RPC calls: - * - GetSchema: return the Arrow schema of the query. - * - GetFlightInfo: execute the metadata request. - * - * The returned Arrow schema will be: - * < - * info_name: uint32 not null, - * value: dense_union< - * string_value: utf8, - * bool_value: bool, - * bigint_value: int64, - * int32_bitmask: int32, - * string_list: list - * int32_to_int32_list_map: map> - * > - * where there is one row per requested piece of metadata information. - */ -message CommandGetSqlInfo { - option (experimental) = true; - - /* - * Values are modelled after ODBC's SQLGetInfo() function. 
This information is intended to provide - * Flight SQL clients with basic, SQL syntax and SQL functions related information. - * More information types can be added in future releases. - * E.g. more SQL syntax support types, scalar functions support, type conversion support etc. - * - * Note that the set of metadata may expand. - * - * Initially, Flight SQL will support the following information types: - * - Server Information - Range [0-500) - * - Syntax Information - Range [500-1000) - * Range [0-10,000) is reserved for defaults (see SqlInfo enum for default options). - * Custom options should start at 10,000. - * - * If omitted, then all metadata will be retrieved. - * Flight SQL Servers may choose to include additional metadata above and beyond the specified set, however they must - * at least return the specified set. IDs ranging from 0 to 10,000 (exclusive) are reserved for future use. - * If additional metadata is included, the metadata IDs should start from 10,000. - */ - repeated uint32 info = 1; -} - -// Options for CommandGetSqlInfo. -enum SqlInfo { - - // Server Information [0-500): Provides basic information about the Flight SQL Server. - - // Retrieves a UTF-8 string with the name of the Flight SQL Server. - FLIGHT_SQL_SERVER_NAME = 0; - - // Retrieves a UTF-8 string with the native version of the Flight SQL Server. - FLIGHT_SQL_SERVER_VERSION = 1; - - // Retrieves a UTF-8 string with the Arrow format version of the Flight SQL Server. - FLIGHT_SQL_SERVER_ARROW_VERSION = 2; - - /* - * Retrieves a boolean value indicating whether the Flight SQL Server is read only. - * - * Returns: - * - false: if read-write - * - true: if read only - */ - FLIGHT_SQL_SERVER_READ_ONLY = 3; - - - // SQL Syntax Information [500-1000): provides information about SQL syntax supported by the Flight SQL Server. - - /* - * Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of catalogs. - * - * Returns: - * - false: if it doesn't support CREATE and DROP of catalogs. - * - true: if it supports CREATE and DROP of catalogs. - */ - SQL_DDL_CATALOG = 500; - - /* - * Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of schemas. - * - * Returns: - * - false: if it doesn't support CREATE and DROP of schemas. - * - true: if it supports CREATE and DROP of schemas. - */ - SQL_DDL_SCHEMA = 501; - - /* - * Indicates whether the Flight SQL Server supports CREATE and DROP of tables. - * - * Returns: - * - false: if it doesn't support CREATE and DROP of tables. - * - true: if it supports CREATE and DROP of tables. - */ - SQL_DDL_TABLE = 502; - - /* - * Retrieves a int32 ordinal representing the case sensitivity of catalog, table, schema and table names. - * - * The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. - */ - SQL_IDENTIFIER_CASE = 503; - - // Retrieves a UTF-8 string with the supported character(s) used to surround a delimited identifier. - SQL_IDENTIFIER_QUOTE_CHAR = 504; - - /* - * Retrieves a int32 describing the case sensitivity of quoted identifiers. - * - * The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. - */ - SQL_QUOTED_IDENTIFIER_CASE = 505; - - /* - * Retrieves a boolean value indicating whether all tables are selectable. - * - * Returns: - * - false: if not all tables are selectable or if none are; - * - true: if all tables are selectable. - */ - SQL_ALL_TABLES_ARE_SELECTABLE = 506; - - /* - * Retrieves the null ordering. 
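A hedged aside rather than part of the patch: with the prost-generated types that arrow-flight re-exports from `arrow_flight::sql`, a client could ask for just a couple of these info values; prost's CamelCase variant names are assumed here.

    use arrow_flight::sql::{CommandGetSqlInfo, SqlInfo};

    fn server_info_request() -> CommandGetSqlInfo {
        // Ask only for the server name and the read-only flag; an empty `info`
        // vector would instead request every piece of metadata the server has.
        CommandGetSqlInfo {
            info: vec![
                SqlInfo::FlightSqlServerName as u32,
                SqlInfo::FlightSqlServerReadOnly as u32,
            ],
        }
    }

The resulting message would then be packed into the command member of a FlightDescriptor for GetSchema or GetFlightInfo, as the comment above describes.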
- * - * Returns a int32 ordinal for the null ordering being used, as described in - * `arrow.flight.protocol.sql.SqlNullOrdering`. - */ - SQL_NULL_ORDERING = 507; - - // Retrieves a UTF-8 string list with values of the supported keywords. - SQL_KEYWORDS = 508; - - // Retrieves a UTF-8 string list with values of the supported numeric functions. - SQL_NUMERIC_FUNCTIONS = 509; - - // Retrieves a UTF-8 string list with values of the supported string functions. - SQL_STRING_FUNCTIONS = 510; - - // Retrieves a UTF-8 string list with values of the supported system functions. - SQL_SYSTEM_FUNCTIONS = 511; - - // Retrieves a UTF-8 string list with values of the supported datetime functions. - SQL_DATETIME_FUNCTIONS = 512; - - /* - * Retrieves the UTF-8 string that can be used to escape wildcard characters. - * This is the string that can be used to escape '_' or '%' in the catalog search parameters that are a pattern - * (and therefore use one of the wildcard characters). - * The '_' character represents any single character; the '%' character represents any sequence of zero or more - * characters. - */ - SQL_SEARCH_STRING_ESCAPE = 513; - - /* - * Retrieves a UTF-8 string with all the "extra" characters that can be used in unquoted identifier names - * (those beyond a-z, A-Z, 0-9 and _). - */ - SQL_EXTRA_NAME_CHARACTERS = 514; - - /* - * Retrieves a boolean value indicating whether column aliasing is supported. - * If so, the SQL AS clause can be used to provide names for computed columns or to provide alias names for columns - * as required. - * - * Returns: - * - false: if column aliasing is unsupported; - * - true: if column aliasing is supported. - */ - SQL_SUPPORTS_COLUMN_ALIASING = 515; - - /* - * Retrieves a boolean value indicating whether concatenations between null and non-null values being - * null are supported. - * - * - Returns: - * - false: if concatenations between null and non-null values being null are unsupported; - * - true: if concatenations between null and non-null values being null are supported. - */ - SQL_NULL_PLUS_NULL_IS_NULL = 516; - - /* - * Retrieves a map where the key is the type to convert from and the value is a list with the types to convert to, - * indicating the supported conversions. Each key and each item on the list value is a value to a predefined type on - * SqlSupportsConvert enum. - * The returned map will be: map> - */ - SQL_SUPPORTS_CONVERT = 517; - - /* - * Retrieves a boolean value indicating whether, when table correlation names are supported, - * they are restricted to being different from the names of the tables. - * - * Returns: - * - false: if table correlation names are unsupported; - * - true: if table correlation names are supported. - */ - SQL_SUPPORTS_TABLE_CORRELATION_NAMES = 518; - - /* - * Retrieves a boolean value indicating whether, when table correlation names are supported, - * they are restricted to being different from the names of the tables. - * - * Returns: - * - false: if different table correlation names are unsupported; - * - true: if different table correlation names are supported - */ - SQL_SUPPORTS_DIFFERENT_TABLE_CORRELATION_NAMES = 519; - - /* - * Retrieves a boolean value indicating whether expressions in ORDER BY lists are supported. 
- * - * Returns: - * - false: if expressions in ORDER BY are unsupported; - * - true: if expressions in ORDER BY are supported; - */ - SQL_SUPPORTS_EXPRESSIONS_IN_ORDER_BY = 520; - - /* - * Retrieves a boolean value indicating whether using a column that is not in the SELECT statement in a GROUP BY - * clause is supported. - * - * Returns: - * - false: if using a column that is not in the SELECT statement in a GROUP BY clause is unsupported; - * - true: if using a column that is not in the SELECT statement in a GROUP BY clause is supported. - */ - SQL_SUPPORTS_ORDER_BY_UNRELATED = 521; - - /* - * Retrieves the supported GROUP BY commands; - * - * Returns an int32 bitmask value representing the supported commands. - * The returned bitmask should be parsed in order to retrieve the supported commands. - * - * For instance: - * - return 0 (\b0) => [] (GROUP BY is unsupported); - * - return 1 (\b1) => [SQL_GROUP_BY_UNRELATED]; - * - return 2 (\b10) => [SQL_GROUP_BY_BEYOND_SELECT]; - * - return 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]. - * Valid GROUP BY types are described under `arrow.flight.protocol.sql.SqlSupportedGroupBy`. - */ - SQL_SUPPORTED_GROUP_BY = 522; - - /* - * Retrieves a boolean value indicating whether specifying a LIKE escape clause is supported. - * - * Returns: - * - false: if specifying a LIKE escape clause is unsupported; - * - true: if specifying a LIKE escape clause is supported. - */ - SQL_SUPPORTS_LIKE_ESCAPE_CLAUSE = 523; - - /* - * Retrieves a boolean value indicating whether columns may be defined as non-nullable. - * - * Returns: - * - false: if columns cannot be defined as non-nullable; - * - true: if columns may be defined as non-nullable. - */ - SQL_SUPPORTS_NON_NULLABLE_COLUMNS = 524; - - /* - * Retrieves the supported SQL grammar level as per the ODBC specification. - * - * Returns an int32 bitmask value representing the supported SQL grammar level. - * The returned bitmask should be parsed in order to retrieve the supported grammar levels. - * - * For instance: - * - return 0 (\b0) => [] (SQL grammar is unsupported); - * - return 1 (\b1) => [SQL_MINIMUM_GRAMMAR]; - * - return 2 (\b10) => [SQL_CORE_GRAMMAR]; - * - return 3 (\b11) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR]; - * - return 4 (\b100) => [SQL_EXTENDED_GRAMMAR]; - * - return 5 (\b101) => [SQL_MINIMUM_GRAMMAR, SQL_EXTENDED_GRAMMAR]; - * - return 6 (\b110) => [SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]; - * - return 7 (\b111) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]. - * Valid SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedSqlGrammar`. - */ - SQL_SUPPORTED_GRAMMAR = 525; - - /* - * Retrieves the supported ANSI92 SQL grammar level. - * - * Returns an int32 bitmask value representing the supported ANSI92 SQL grammar level. - * The returned bitmask should be parsed in order to retrieve the supported commands. - * - * For instance: - * - return 0 (\b0) => [] (ANSI92 SQL grammar is unsupported); - * - return 1 (\b1) => [ANSI92_ENTRY_SQL]; - * - return 2 (\b10) => [ANSI92_INTERMEDIATE_SQL]; - * - return 3 (\b11) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL]; - * - return 4 (\b100) => [ANSI92_FULL_SQL]; - * - return 5 (\b101) => [ANSI92_ENTRY_SQL, ANSI92_FULL_SQL]; - * - return 6 (\b110) => [ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]; - * - return 7 (\b111) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]. - * Valid ANSI92 SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedAnsi92SqlGrammarLevel`. 
- */ - SQL_ANSI92_SUPPORTED_LEVEL = 526; - - /* - * Retrieves a boolean value indicating whether the SQL Integrity Enhancement Facility is supported. - * - * Returns: - * - false: if the SQL Integrity Enhancement Facility is supported; - * - true: if the SQL Integrity Enhancement Facility is supported. - */ - SQL_SUPPORTS_INTEGRITY_ENHANCEMENT_FACILITY = 527; - - /* - * Retrieves the support level for SQL OUTER JOINs. - * - * Returns a int32 ordinal for the SQL ordering being used, as described in - * `arrow.flight.protocol.sql.SqlOuterJoinsSupportLevel`. - */ - SQL_OUTER_JOINS_SUPPORT_LEVEL = 528; - - // Retrieves a UTF-8 string with the preferred term for "schema". - SQL_SCHEMA_TERM = 529; - - // Retrieves a UTF-8 string with the preferred term for "procedure". - SQL_PROCEDURE_TERM = 530; - - /* - * Retrieves a UTF-8 string with the preferred term for "catalog". - * If a empty string is returned its assumed that the server does NOT supports catalogs. - */ - SQL_CATALOG_TERM = 531; - - /* - * Retrieves a boolean value indicating whether a catalog appears at the start of a fully qualified table name. - * - * - false: if a catalog does not appear at the start of a fully qualified table name; - * - true: if a catalog appears at the start of a fully qualified table name. - */ - SQL_CATALOG_AT_START = 532; - - /* - * Retrieves the supported actions for a SQL schema. - * - * Returns an int32 bitmask value representing the supported actions for a SQL schema. - * The returned bitmask should be parsed in order to retrieve the supported actions for a SQL schema. - * - * For instance: - * - return 0 (\b0) => [] (no supported actions for SQL schema); - * - return 1 (\b1) => [SQL_ELEMENT_IN_PROCEDURE_CALLS]; - * - return 2 (\b10) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS]; - * - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; - * - return 4 (\b100) => [SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - * - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - * - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - * - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. - * Valid actions for a SQL schema described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. - */ - SQL_SCHEMAS_SUPPORTED_ACTIONS = 533; - - /* - * Retrieves the supported actions for a SQL schema. - * - * Returns an int32 bitmask value representing the supported actions for a SQL catalog. - * The returned bitmask should be parsed in order to retrieve the supported actions for a SQL catalog. - * - * For instance: - * - return 0 (\b0) => [] (no supported actions for SQL catalog); - * - return 1 (\b1) => [SQL_ELEMENT_IN_PROCEDURE_CALLS]; - * - return 2 (\b10) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS]; - * - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; - * - return 4 (\b100) => [SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - * - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - * - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - * - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. - * Valid actions for a SQL catalog are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. 
- */ - SQL_CATALOGS_SUPPORTED_ACTIONS = 534; - - /* - * Retrieves the supported SQL positioned commands. - * - * Returns an int32 bitmask value representing the supported SQL positioned commands. - * The returned bitmask should be parsed in order to retrieve the supported SQL positioned commands. - * - * For instance: - * - return 0 (\b0) => [] (no supported SQL positioned commands); - * - return 1 (\b1) => [SQL_POSITIONED_DELETE]; - * - return 2 (\b10) => [SQL_POSITIONED_UPDATE]; - * - return 3 (\b11) => [SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE]. - * Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedPositionedCommands`. - */ - SQL_SUPPORTED_POSITIONED_COMMANDS = 535; - - /* - * Retrieves a boolean value indicating whether SELECT FOR UPDATE statements are supported. - * - * Returns: - * - false: if SELECT FOR UPDATE statements are unsupported; - * - true: if SELECT FOR UPDATE statements are supported. - */ - SQL_SELECT_FOR_UPDATE_SUPPORTED = 536; - - /* - * Retrieves a boolean value indicating whether stored procedure calls that use the stored procedure escape syntax - * are supported. - * - * Returns: - * - false: if stored procedure calls that use the stored procedure escape syntax are unsupported; - * - true: if stored procedure calls that use the stored procedure escape syntax are supported. - */ - SQL_STORED_PROCEDURES_SUPPORTED = 537; - - /* - * Retrieves the supported SQL subqueries. - * - * Returns an int32 bitmask value representing the supported SQL subqueries. - * The returned bitmask should be parsed in order to retrieve the supported SQL subqueries. - * - * For instance: - * - return 0 (\b0) => [] (no supported SQL subqueries); - * - return 1 (\b1) => [SQL_SUBQUERIES_IN_COMPARISONS]; - * - return 2 (\b10) => [SQL_SUBQUERIES_IN_EXISTS]; - * - return 3 (\b11) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS]; - * - return 4 (\b100) => [SQL_SUBQUERIES_IN_INS]; - * - return 5 (\b101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS]; - * - return 6 (\b110) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_EXISTS]; - * - return 7 (\b111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS]; - * - return 8 (\b1000) => [SQL_SUBQUERIES_IN_QUANTIFIEDS]; - * - return 9 (\b1001) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - * - return 10 (\b1010) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - * - return 11 (\b1011) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - * - return 12 (\b1100) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - * - return 13 (\b1101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - * - return 14 (\b1110) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - * - return 15 (\b1111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - * - ... - * Valid SQL subqueries are described under `arrow.flight.protocol.sql.SqlSupportedSubqueries`. - */ - SQL_SUPPORTED_SUBQUERIES = 538; - - /* - * Retrieves a boolean value indicating whether correlated subqueries are supported. - * - * Returns: - * - false: if correlated subqueries are unsupported; - * - true: if correlated subqueries are supported. - */ - SQL_CORRELATED_SUBQUERIES_SUPPORTED = 539; - - /* - * Retrieves the supported SQL UNIONs. 
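An illustrative sketch, not from the patch itself: decoding one of the int32 bitmask infos described above (here SQL_SUPPORTED_GROUP_BY), on the assumption that the bit position equals the ordinal of the corresponding enum variant and that the generated enum is re-exported from `arrow_flight::sql`.

    use arrow_flight::sql::SqlSupportedGroupBy;

    /// True if the given GROUP BY capability bit is set in the bitmask
    /// returned for the SQL_SUPPORTED_GROUP_BY info value.
    fn supports(bitmask: i32, capability: SqlSupportedGroupBy) -> bool {
        bitmask & (1 << capability as i32) != 0
    }

    fn main() {
        // A returned value of 3 (0b11) means both GROUP BY capabilities are
        // supported, per the enumeration in the comment above.
        assert!(supports(3, SqlSupportedGroupBy::SqlGroupByUnrelated));
        assert!(supports(3, SqlSupportedGroupBy::SqlGroupByBeyondSelect));
        assert!(!supports(2, SqlSupportedGroupBy::SqlGroupByUnrelated));
    }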
- * - * Returns an int32 bitmask value representing the supported SQL UNIONs. - * The returned bitmask should be parsed in order to retrieve the supported SQL UNIONs. - * - * For instance: - * - return 0 (\b0) => [] (no supported SQL positioned commands); - * - return 1 (\b1) => [SQL_UNION]; - * - return 2 (\b10) => [SQL_UNION_ALL]; - * - return 3 (\b11) => [SQL_UNION, SQL_UNION_ALL]. - * Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedUnions`. - */ - SQL_SUPPORTED_UNIONS = 540; - - // Retrieves a int64 value representing the maximum number of hex characters allowed in an inline binary literal. - SQL_MAX_BINARY_LITERAL_LENGTH = 541; - - // Retrieves a int64 value representing the maximum number of characters allowed for a character literal. - SQL_MAX_CHAR_LITERAL_LENGTH = 542; - - // Retrieves a int64 value representing the maximum number of characters allowed for a column name. - SQL_MAX_COLUMN_NAME_LENGTH = 543; - - // Retrieves a int64 value representing the the maximum number of columns allowed in a GROUP BY clause. - SQL_MAX_COLUMNS_IN_GROUP_BY = 544; - - // Retrieves a int64 value representing the maximum number of columns allowed in an index. - SQL_MAX_COLUMNS_IN_INDEX = 545; - - // Retrieves a int64 value representing the maximum number of columns allowed in an ORDER BY clause. - SQL_MAX_COLUMNS_IN_ORDER_BY = 546; - - // Retrieves a int64 value representing the maximum number of columns allowed in a SELECT list. - SQL_MAX_COLUMNS_IN_SELECT = 547; - - // Retrieves a int64 value representing the maximum number of columns allowed in a table. - SQL_MAX_COLUMNS_IN_TABLE = 548; - - // Retrieves a int64 value representing the maximum number of concurrent connections possible. - SQL_MAX_CONNECTIONS = 549; - - // Retrieves a int64 value the maximum number of characters allowed in a cursor name. - SQL_MAX_CURSOR_NAME_LENGTH = 550; - - /* - * Retrieves a int64 value representing the maximum number of bytes allowed for an index, - * including all of the parts of the index. - */ - SQL_MAX_INDEX_LENGTH = 551; - - // Retrieves a int64 value representing the maximum number of characters allowed in a schema name. - SQL_DB_SCHEMA_NAME_LENGTH = 552; - - // Retrieves a int64 value representing the maximum number of characters allowed in a procedure name. - SQL_MAX_PROCEDURE_NAME_LENGTH = 553; - - // Retrieves a int64 value representing the maximum number of characters allowed in a catalog name. - SQL_MAX_CATALOG_NAME_LENGTH = 554; - - // Retrieves a int64 value representing the maximum number of bytes allowed in a single row. - SQL_MAX_ROW_SIZE = 555; - - /* - * Retrieves a boolean indicating whether the return value for the JDBC method getMaxRowSize includes the SQL - * data types LONGVARCHAR and LONGVARBINARY. - * - * Returns: - * - false: if return value for the JDBC method getMaxRowSize does - * not include the SQL data types LONGVARCHAR and LONGVARBINARY; - * - true: if return value for the JDBC method getMaxRowSize includes - * the SQL data types LONGVARCHAR and LONGVARBINARY. - */ - SQL_MAX_ROW_SIZE_INCLUDES_BLOBS = 556; - - /* - * Retrieves a int64 value representing the maximum number of characters allowed for an SQL statement; - * a result of 0 (zero) means that there is no limit or the limit is not known. - */ - SQL_MAX_STATEMENT_LENGTH = 557; - - // Retrieves a int64 value representing the maximum number of active statements that can be open at the same time. 
- SQL_MAX_STATEMENTS = 558; - - // Retrieves a int64 value representing the maximum number of characters allowed in a table name. - SQL_MAX_TABLE_NAME_LENGTH = 559; - - // Retrieves a int64 value representing the maximum number of tables allowed in a SELECT statement. - SQL_MAX_TABLES_IN_SELECT = 560; - - // Retrieves a int64 value representing the maximum number of characters allowed in a user name. - SQL_MAX_USERNAME_LENGTH = 561; - - /* - * Retrieves this database's default transaction isolation level as described in - * `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. - * - * Returns a int32 ordinal for the SQL transaction isolation level. - */ - SQL_DEFAULT_TRANSACTION_ISOLATION = 562; - - /* - * Retrieves a boolean value indicating whether transactions are supported. If not, invoking the method commit is a - * noop, and the isolation level is `arrow.flight.protocol.sql.SqlTransactionIsolationLevel.TRANSACTION_NONE`. - * - * Returns: - * - false: if transactions are unsupported; - * - true: if transactions are supported. - */ - SQL_TRANSACTIONS_SUPPORTED = 563; - - /* - * Retrieves the supported transactions isolation levels. - * - * Returns an int32 bitmask value representing the supported transactions isolation levels. - * The returned bitmask should be parsed in order to retrieve the supported transactions isolation levels. - * - * For instance: - * - return 0 (\b0) => [] (no supported SQL transactions isolation levels); - * - return 1 (\b1) => [SQL_TRANSACTION_NONE]; - * - return 2 (\b10) => [SQL_TRANSACTION_READ_UNCOMMITTED]; - * - return 3 (\b11) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED]; - * - return 4 (\b100) => [SQL_TRANSACTION_REPEATABLE_READ]; - * - return 5 (\b101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; - * - return 6 (\b110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - * - return 7 (\b111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - * - return 8 (\b1000) => [SQL_TRANSACTION_REPEATABLE_READ]; - * - return 9 (\b1001) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; - * - return 10 (\b1010) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - * - return 11 (\b1011) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - * - return 12 (\b1100) => [SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - * - return 13 (\b1101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - * - return 14 (\b1110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - * - return 15 (\b1111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - * - return 16 (\b10000) => [SQL_TRANSACTION_SERIALIZABLE]; - * - ... - * Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. - */ - SQL_SUPPORTED_TRANSACTIONS_ISOLATION_LEVELS = 564; - - /* - * Retrieves a boolean value indicating whether a data definition statement within a transaction forces - * the transaction to commit. - * - * Returns: - * - false: if a data definition statement within a transaction does not force the transaction to commit; - * - true: if a data definition statement within a transaction forces the transaction to commit. 
- */ - SQL_DATA_DEFINITION_CAUSES_TRANSACTION_COMMIT = 565; - - /* - * Retrieves a boolean value indicating whether a data definition statement within a transaction is ignored. - * - * Returns: - * - false: if a data definition statement within a transaction is taken into account; - * - true: a data definition statement within a transaction is ignored. - */ - SQL_DATA_DEFINITIONS_IN_TRANSACTIONS_IGNORED = 566; - - /* - * Retrieves an int32 bitmask value representing the supported result set types. - * The returned bitmask should be parsed in order to retrieve the supported result set types. - * - * For instance: - * - return 0 (\b0) => [] (no supported result set types); - * - return 1 (\b1) => [SQL_RESULT_SET_TYPE_UNSPECIFIED]; - * - return 2 (\b10) => [SQL_RESULT_SET_TYPE_FORWARD_ONLY]; - * - return 3 (\b11) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY]; - * - return 4 (\b100) => [SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - * - return 5 (\b101) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - * - return 6 (\b110) => [SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - * - return 7 (\b111) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - * - return 8 (\b1000) => [SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE]; - * - ... - * Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetType`. - */ - SQL_SUPPORTED_RESULT_SET_TYPES = 567; - - /* - * Returns an int32 bitmask value concurrency types supported for - * `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_UNSPECIFIED`. - * - * For instance: - * - return 0 (\b0) => [] (no supported concurrency types for this result set type) - * - return 1 (\b1) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED] - * - return 2 (\b10) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - * - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - * - return 4 (\b100) => [SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. - */ - SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_UNSPECIFIED = 568; - - /* - * Returns an int32 bitmask value concurrency types supported for - * `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_FORWARD_ONLY`. 
- * - * For instance: - * - return 0 (\b0) => [] (no supported concurrency types for this result set type) - * - return 1 (\b1) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED] - * - return 2 (\b10) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - * - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - * - return 4 (\b100) => [SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. - */ - SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_FORWARD_ONLY = 569; - - /* - * Returns an int32 bitmask value concurrency types supported for - * `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE`. - * - * For instance: - * - return 0 (\b0) => [] (no supported concurrency types for this result set type) - * - return 1 (\b1) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED] - * - return 2 (\b10) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - * - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - * - return 4 (\b100) => [SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. - */ - SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_SENSITIVE = 570; - - /* - * Returns an int32 bitmask value concurrency types supported for - * `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE`. - * - * For instance: - * - return 0 (\b0) => [] (no supported concurrency types for this result set type) - * - return 1 (\b1) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED] - * - return 2 (\b10) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - * - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - * - return 4 (\b100) => [SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - * Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. - */ - SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_INSENSITIVE = 571; - - /* - * Retrieves a boolean value indicating whether this database supports batch updates. - * - * - false: if this database does not support batch updates; - * - true: if this database supports batch updates. - */ - SQL_BATCH_UPDATES_SUPPORTED = 572; - - /* - * Retrieves a boolean value indicating whether this database supports savepoints. 
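Not part of the patch, only a sketch of what a server that does support transactions might do with the new hook: the handler mirrors the trait method exercised in the test above, and the `transaction_id` field name (a bytes handle, per the Flight SQL spec) is an assumption about the generated result type.

    use arrow_flight::sql::{ActionBeginTransactionRequest, ActionBeginTransactionResult};
    use tonic::Status;

    // Shaped like the new FlightSqlService::do_action_begin_transaction hook,
    // minus the &self receiver and the tonic Request wrapper.
    async fn begin_transaction(
        _query: ActionBeginTransactionRequest,
    ) -> Result<ActionBeginTransactionResult, Status> {
        // A real server would allocate and track a handle in its session
        // state; a fixed placeholder stands in for that bookkeeping here.
        let handle: Vec<u8> = b"txn-0001".to_vec();
        Ok(ActionBeginTransactionResult {
            transaction_id: handle.into(),
        })
    }

The client side would wrap the corresponding request in a DoAction call, much as the dispatch code earlier in the patch does for ActionCancelQueryRequest.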
- * - * Returns: - * - false: if this database does not support savepoints; - * - true: if this database supports savepoints. - */ - SQL_SAVEPOINTS_SUPPORTED = 573; - - /* - * Retrieves a boolean value indicating whether named parameters are supported in callable statements. - * - * Returns: - * - false: if named parameters in callable statements are unsupported; - * - true: if named parameters in callable statements are supported. - */ - SQL_NAMED_PARAMETERS_SUPPORTED = 574; - - /* - * Retrieves a boolean value indicating whether updates made to a LOB are made on a copy or directly to the LOB. - * - * Returns: - * - false: if updates made to a LOB are made directly to the LOB; - * - true: if updates made to a LOB are made on a copy. - */ - SQL_LOCATORS_UPDATE_COPY = 575; - - /* - * Retrieves a boolean value indicating whether invoking user-defined or vendor functions - * using the stored procedure escape syntax is supported. - * - * Returns: - * - false: if invoking user-defined or vendor functions using the stored procedure escape syntax is unsupported; - * - true: if invoking user-defined or vendor functions using the stored procedure escape syntax is supported. - */ - SQL_STORED_FUNCTIONS_USING_CALL_SYNTAX_SUPPORTED = 576; -} - -enum SqlSupportedCaseSensitivity { - SQL_CASE_SENSITIVITY_UNKNOWN = 0; - SQL_CASE_SENSITIVITY_CASE_INSENSITIVE = 1; - SQL_CASE_SENSITIVITY_UPPERCASE = 2; - SQL_CASE_SENSITIVITY_LOWERCASE = 3; -} - -enum SqlNullOrdering { - SQL_NULLS_SORTED_HIGH = 0; - SQL_NULLS_SORTED_LOW = 1; - SQL_NULLS_SORTED_AT_START = 2; - SQL_NULLS_SORTED_AT_END = 3; -} - -enum SupportedSqlGrammar { - SQL_MINIMUM_GRAMMAR = 0; - SQL_CORE_GRAMMAR = 1; - SQL_EXTENDED_GRAMMAR = 2; -} - -enum SupportedAnsi92SqlGrammarLevel { - ANSI92_ENTRY_SQL = 0; - ANSI92_INTERMEDIATE_SQL = 1; - ANSI92_FULL_SQL = 2; -} - -enum SqlOuterJoinsSupportLevel { - SQL_JOINS_UNSUPPORTED = 0; - SQL_LIMITED_OUTER_JOINS = 1; - SQL_FULL_OUTER_JOINS = 2; -} - -enum SqlSupportedGroupBy { - SQL_GROUP_BY_UNRELATED = 0; - SQL_GROUP_BY_BEYOND_SELECT = 1; -} - -enum SqlSupportedElementActions { - SQL_ELEMENT_IN_PROCEDURE_CALLS = 0; - SQL_ELEMENT_IN_INDEX_DEFINITIONS = 1; - SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS = 2; -} - -enum SqlSupportedPositionedCommands { - SQL_POSITIONED_DELETE = 0; - SQL_POSITIONED_UPDATE = 1; -} - -enum SqlSupportedSubqueries { - SQL_SUBQUERIES_IN_COMPARISONS = 0; - SQL_SUBQUERIES_IN_EXISTS = 1; - SQL_SUBQUERIES_IN_INS = 2; - SQL_SUBQUERIES_IN_QUANTIFIEDS = 3; -} - -enum SqlSupportedUnions { - SQL_UNION = 0; - SQL_UNION_ALL = 1; -} - -enum SqlTransactionIsolationLevel { - SQL_TRANSACTION_NONE = 0; - SQL_TRANSACTION_READ_UNCOMMITTED = 1; - SQL_TRANSACTION_READ_COMMITTED = 2; - SQL_TRANSACTION_REPEATABLE_READ = 3; - SQL_TRANSACTION_SERIALIZABLE = 4; -} - -enum SqlSupportedTransactions { - SQL_TRANSACTION_UNSPECIFIED = 0; - SQL_DATA_DEFINITION_TRANSACTIONS = 1; - SQL_DATA_MANIPULATION_TRANSACTIONS = 2; -} - -enum SqlSupportedResultSetType { - SQL_RESULT_SET_TYPE_UNSPECIFIED = 0; - SQL_RESULT_SET_TYPE_FORWARD_ONLY = 1; - SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE = 2; - SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE = 3; -} - -enum SqlSupportedResultSetConcurrency { - SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED = 0; - SQL_RESULT_SET_CONCURRENCY_READ_ONLY = 1; - SQL_RESULT_SET_CONCURRENCY_UPDATABLE = 2; -} - -enum SqlSupportsConvert { - SQL_CONVERT_BIGINT = 0; - SQL_CONVERT_BINARY = 1; - SQL_CONVERT_BIT = 2; - SQL_CONVERT_CHAR = 3; - SQL_CONVERT_DATE = 4; - SQL_CONVERT_DECIMAL = 5; - SQL_CONVERT_FLOAT = 6; - SQL_CONVERT_INTEGER = 
7; - SQL_CONVERT_INTERVAL_DAY_TIME = 8; - SQL_CONVERT_INTERVAL_YEAR_MONTH = 9; - SQL_CONVERT_LONGVARBINARY = 10; - SQL_CONVERT_LONGVARCHAR = 11; - SQL_CONVERT_NUMERIC = 12; - SQL_CONVERT_REAL = 13; - SQL_CONVERT_SMALLINT = 14; - SQL_CONVERT_TIME = 15; - SQL_CONVERT_TIMESTAMP = 16; - SQL_CONVERT_TINYINT = 17; - SQL_CONVERT_VARBINARY = 18; - SQL_CONVERT_VARCHAR = 19; -} - -/** - * The JDBC/ODBC-defined type of any object. - * All the values here are the sames as in the JDBC and ODBC specs. - */ -enum XdbcDataType { - XDBC_UNKNOWN_TYPE = 0; - XDBC_CHAR = 1; - XDBC_NUMERIC = 2; - XDBC_DECIMAL = 3; - XDBC_INTEGER = 4; - XDBC_SMALLINT = 5; - XDBC_FLOAT = 6; - XDBC_REAL = 7; - XDBC_DOUBLE = 8; - XDBC_DATETIME = 9; - XDBC_INTERVAL = 10; - XDBC_VARCHAR = 12; - XDBC_DATE = 91; - XDBC_TIME = 92; - XDBC_TIMESTAMP = 93; - XDBC_LONGVARCHAR = -1; - XDBC_BINARY = -2; - XDBC_VARBINARY = -3; - XDBC_LONGVARBINARY = -4; - XDBC_BIGINT = -5; - XDBC_TINYINT = -6; - XDBC_BIT = -7; - XDBC_WCHAR = -8; - XDBC_WVARCHAR = -9; -} - -/** - * Detailed subtype information for XDBC_TYPE_DATETIME and XDBC_TYPE_INTERVAL. - */ -enum XdbcDatetimeSubcode { - option allow_alias = true; - XDBC_SUBCODE_UNKNOWN = 0; - XDBC_SUBCODE_YEAR = 1; - XDBC_SUBCODE_DATE = 1; - XDBC_SUBCODE_TIME = 2; - XDBC_SUBCODE_MONTH = 2; - XDBC_SUBCODE_TIMESTAMP = 3; - XDBC_SUBCODE_DAY = 3; - XDBC_SUBCODE_TIME_WITH_TIMEZONE = 4; - XDBC_SUBCODE_HOUR = 4; - XDBC_SUBCODE_TIMESTAMP_WITH_TIMEZONE = 5; - XDBC_SUBCODE_MINUTE = 5; - XDBC_SUBCODE_SECOND = 6; - XDBC_SUBCODE_YEAR_TO_MONTH = 7; - XDBC_SUBCODE_DAY_TO_HOUR = 8; - XDBC_SUBCODE_DAY_TO_MINUTE = 9; - XDBC_SUBCODE_DAY_TO_SECOND = 10; - XDBC_SUBCODE_HOUR_TO_MINUTE = 11; - XDBC_SUBCODE_HOUR_TO_SECOND = 12; - XDBC_SUBCODE_MINUTE_TO_SECOND = 13; - XDBC_SUBCODE_INTERVAL_YEAR = 101; - XDBC_SUBCODE_INTERVAL_MONTH = 102; - XDBC_SUBCODE_INTERVAL_DAY = 103; - XDBC_SUBCODE_INTERVAL_HOUR = 104; - XDBC_SUBCODE_INTERVAL_MINUTE = 105; - XDBC_SUBCODE_INTERVAL_SECOND = 106; - XDBC_SUBCODE_INTERVAL_YEAR_TO_MONTH = 107; - XDBC_SUBCODE_INTERVAL_DAY_TO_HOUR = 108; - XDBC_SUBCODE_INTERVAL_DAY_TO_MINUTE = 109; - XDBC_SUBCODE_INTERVAL_DAY_TO_SECOND = 110; - XDBC_SUBCODE_INTERVAL_HOUR_TO_MINUTE = 111; - XDBC_SUBCODE_INTERVAL_HOUR_TO_SECOND = 112; - XDBC_SUBCODE_INTERVAL_MINUTE_TO_SECOND = 113; -} - -enum Nullable { - /** - * Indicates that the fields does not allow the use of null values. - */ - NULLABILITY_NO_NULLS = 0; - - /** - * Indicates that the fields allow the use of null values. - */ - NULLABILITY_NULLABLE = 1; - - /** - * Indicates that nullability of the fields can not be determined. - */ - NULLABILITY_UNKNOWN = 2; -} - -enum Searchable { - /** - * Indicates that column can not be used in a WHERE clause. - */ - SEARCHABLE_NONE = 0; - - /** - * Indicates that the column can be used in a WHERE clause if it is using a - * LIKE operator. - */ - SEARCHABLE_CHAR = 1; - - /** - * Indicates that the column can be used In a WHERE clause with any - * operator other than LIKE. - * - * - Allowed operators: comparison, quantified comparison, BETWEEN, - * DISTINCT, IN, MATCH, and UNIQUE. - */ - SEARCHABLE_BASIC = 2; - - /** - * Indicates that the column can be used in a WHERE clause using any operator. - */ - SEARCHABLE_FULL = 3; -} - -/* - * Represents a request to retrieve information about data type supported on a Flight SQL enabled backend. - * Used in the command member of FlightDescriptor for the following RPC calls: - * - GetSchema: return the schema of the query. - * - GetFlightInfo: execute the catalog metadata request. 
- * - * The returned schema will be: - * < - * type_name: utf8 not null (The name of the data type, for example: VARCHAR, INTEGER, etc), - * data_type: int not null (The SQL data type), - * column_size: int (The maximum size supported by that column. - * In case of exact numeric types, this represents the maximum precision. - * In case of string types, this represents the character length. - * In case of datetime data types, this represents the length in characters of the string representation. - * NULL is returned for data types where column size is not applicable.), - * literal_prefix: utf8 (Character or characters used to prefix a literal, NULL is returned for - * data types where a literal prefix is not applicable.), - * literal_suffix: utf8 (Character or characters used to terminate a literal, - * NULL is returned for data types where a literal suffix is not applicable.), - * create_params: list - * (A list of keywords corresponding to which parameters can be used when creating - * a column for that specific type. - * NULL is returned if there are no parameters for the data type definition.), - * nullable: int not null (Shows if the data type accepts a NULL value. The possible values can be seen in the - * Nullable enum.), - * case_sensitive: bool not null (Shows if a character data type is case-sensitive in collations and comparisons), - * searchable: int not null (Shows how the data type is used in a WHERE clause. The possible values can be seen in the - * Searchable enum.), - * unsigned_attribute: bool (Shows if the data type is unsigned. NULL is returned if the attribute is - * not applicable to the data type or the data type is not numeric.), - * fixed_prec_scale: bool not null (Shows if the data type has predefined fixed precision and scale.), - * auto_increment: bool (Shows if the data type is auto incremental. NULL is returned if the attribute - * is not applicable to the data type or the data type is not numeric.), - * local_type_name: utf8 (Localized version of the data source-dependent name of the data type. NULL - * is returned if a localized name is not supported by the data source), - * minimum_scale: int (The minimum scale of the data type on the data source. - * If a data type has a fixed scale, the MINIMUM_SCALE and MAXIMUM_SCALE - * columns both contain this value. NULL is returned if scale is not applicable.), - * maximum_scale: int (The maximum scale of the data type on the data source. - * NULL is returned if scale is not applicable.), - * sql_data_type: int not null (The value of the SQL DATA TYPE which has the same values - * as data_type value. Except for interval and datetime, which - * uses generic values. More info about those types can be - * obtained through datetime_subcode. The possible values can be seen - * in the XdbcDataType enum.), - * datetime_subcode: int (Only used when the SQL DATA TYPE is interval or datetime. It contains - * its sub types. For type different from interval and datetime, this value - * is NULL. The possible values can be seen in the XdbcDatetimeSubcode enum.), - * num_prec_radix: int (If the data type is an approximate numeric type, this column contains - * the value 2 to indicate that COLUMN_SIZE specifies a number of bits. For - * exact numeric types, this column contains the value 10 to indicate that - * column size specifies a number of decimal digits. 
Otherwise, this column is NULL.), - * interval_precision: int (If the data type is an interval data type, then this column contains the value - * of the interval leading precision. Otherwise, this column is NULL. This fields - * is only relevant to be used by ODBC). - * > - * The returned data should be ordered by data_type and then by type_name. - */ -message CommandGetXdbcTypeInfo { - option (experimental) = true; - - /* - * Specifies the data type to search for the info. - */ - optional int32 data_type = 1; -} - -/* - * Represents a request to retrieve the list of catalogs on a Flight SQL enabled backend. - * The definition of a catalog depends on vendor/implementation. It is usually the database itself - * Used in the command member of FlightDescriptor for the following RPC calls: - * - GetSchema: return the Arrow schema of the query. - * - GetFlightInfo: execute the catalog metadata request. - * - * The returned Arrow schema will be: - * < - * catalog_name: utf8 not null - * > - * The returned data should be ordered by catalog_name. - */ -message CommandGetCatalogs { - option (experimental) = true; -} - -/* - * Represents a request to retrieve the list of database schemas on a Flight SQL enabled backend. - * The definition of a database schema depends on vendor/implementation. It is usually a collection of tables. - * Used in the command member of FlightDescriptor for the following RPC calls: - * - GetSchema: return the Arrow schema of the query. - * - GetFlightInfo: execute the catalog metadata request. - * - * The returned Arrow schema will be: - * < - * catalog_name: utf8, - * db_schema_name: utf8 not null - * > - * The returned data should be ordered by catalog_name, then db_schema_name. - */ -message CommandGetDbSchemas { - option (experimental) = true; - - /* - * Specifies the Catalog to search for the tables. - * An empty string retrieves those without a catalog. - * If omitted the catalog name should not be used to narrow the search. - */ - optional string catalog = 1; - - /* - * Specifies a filter pattern for schemas to search for. - * When no db_schema_filter_pattern is provided, the pattern will not be used to narrow the search. - * In the pattern string, two special characters can be used to denote matching rules: - * - "%" means to match any substring with 0 or more characters. - * - "_" means to match any one character. - */ - optional string db_schema_filter_pattern = 2; -} - -/* - * Represents a request to retrieve the list of tables, and optionally their schemas, on a Flight SQL enabled backend. - * Used in the command member of FlightDescriptor for the following RPC calls: - * - GetSchema: return the Arrow schema of the query. - * - GetFlightInfo: execute the catalog metadata request. - * - * The returned Arrow schema will be: - * < - * catalog_name: utf8, - * db_schema_name: utf8, - * table_name: utf8 not null, - * table_type: utf8 not null, - * [optional] table_schema: bytes not null (schema of the table as described in Schema.fbs::Schema, - * it is serialized as an IPC message.) - * > - * Fields on table_schema may contain the following metadata: - * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name - * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name - * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name - * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. 
- * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size - * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable - * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. - * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. - * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. - * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. - * The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. - */ -message CommandGetTables { - option (experimental) = true; - - /* - * Specifies the Catalog to search for the tables. - * An empty string retrieves those without a catalog. - * If omitted the catalog name should not be used to narrow the search. - */ - optional string catalog = 1; - - /* - * Specifies a filter pattern for schemas to search for. - * When no db_schema_filter_pattern is provided, all schemas matching other filters are searched. - * In the pattern string, two special characters can be used to denote matching rules: - * - "%" means to match any substring with 0 or more characters. - * - "_" means to match any one character. - */ - optional string db_schema_filter_pattern = 2; - - /* - * Specifies a filter pattern for tables to search for. - * When no table_name_filter_pattern is provided, all tables matching other filters are searched. - * In the pattern string, two special characters can be used to denote matching rules: - * - "%" means to match any substring with 0 or more characters. - * - "_" means to match any one character. - */ - optional string table_name_filter_pattern = 3; - - /* - * Specifies a filter of table types which must match. - * The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. - * TABLE, VIEW, and SYSTEM TABLE are commonly supported. - */ - repeated string table_types = 4; - - // Specifies if the Arrow schema should be returned for found tables. - bool include_schema = 5; -} - -/* - * Represents a request to retrieve the list of table types on a Flight SQL enabled backend. - * The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. - * TABLE, VIEW, and SYSTEM TABLE are commonly supported. - * Used in the command member of FlightDescriptor for the following RPC calls: - * - GetSchema: return the Arrow schema of the query. - * - GetFlightInfo: execute the catalog metadata request. - * - * The returned Arrow schema will be: - * < - * table_type: utf8 not null - * > - * The returned data should be ordered by table_type. - */ -message CommandGetTableTypes { - option (experimental) = true; -} - -/* - * Represents a request to retrieve the primary keys of a table on a Flight SQL enabled backend. - * Used in the command member of FlightDescriptor for the following RPC calls: - * - GetSchema: return the Arrow schema of the query. - * - GetFlightInfo: execute the catalog metadata request. - * - * The returned Arrow schema will be: - * < - * catalog_name: utf8, - * db_schema_name: utf8, - * table_name: utf8 not null, - * column_name: utf8 not null, - * key_name: utf8, - * key_sequence: int not null - * > - * The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. 
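The "%" / "_" filter-pattern semantics used by db_schema_filter_pattern and table_name_filter_pattern above can also be mirrored on the client side when post-filtering results. A minimal, purely illustrative Rust matcher (not part of the generated crate; no escape handling):

// LIKE-style match: '%' matches zero or more characters, '_' matches exactly one.
fn like_match(pattern: &str, input: &str) -> bool {
    let p: Vec<char> = pattern.chars().collect();
    let s: Vec<char> = input.chars().collect();
    fn go(p: &[char], s: &[char]) -> bool {
        match (p.first().copied(), s.first().copied()) {
            (None, None) => true,
            (Some('%'), _) => go(&p[1..], s) || (!s.is_empty() && go(p, &s[1..])),
            (Some('_'), Some(_)) => go(&p[1..], &s[1..]),
            (Some(pc), Some(sc)) if pc == sc => go(&p[1..], &s[1..]),
            _ => false,
        }
    }
    go(&p, &s)
}

fn main() {
    assert!(like_match("fl_ght%", "flight_sql"));
    assert!(!like_match("flight", "flightsql"));
}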
- */ -message CommandGetPrimaryKeys { - option (experimental) = true; - - /* - * Specifies the catalog to search for the table. - * An empty string retrieves those without a catalog. - * If omitted the catalog name should not be used to narrow the search. - */ - optional string catalog = 1; - - /* - * Specifies the schema to search for the table. - * An empty string retrieves those without a schema. - * If omitted the schema name should not be used to narrow the search. - */ - optional string db_schema = 2; - - // Specifies the table to get the primary keys for. - string table = 3; -} - -enum UpdateDeleteRules { - CASCADE = 0; - RESTRICT = 1; - SET_NULL = 2; - NO_ACTION = 3; - SET_DEFAULT = 4; -} - -/* - * Represents a request to retrieve a description of the foreign key columns that reference the given table's - * primary key columns (the foreign keys exported by a table) of a table on a Flight SQL enabled backend. - * Used in the command member of FlightDescriptor for the following RPC calls: - * - GetSchema: return the Arrow schema of the query. - * - GetFlightInfo: execute the catalog metadata request. - * - * The returned Arrow schema will be: - * < - * pk_catalog_name: utf8, - * pk_db_schema_name: utf8, - * pk_table_name: utf8 not null, - * pk_column_name: utf8 not null, - * fk_catalog_name: utf8, - * fk_db_schema_name: utf8, - * fk_table_name: utf8 not null, - * fk_column_name: utf8 not null, - * key_sequence: int not null, - * fk_key_name: utf8, - * pk_key_name: utf8, - * update_rule: uint1 not null, - * delete_rule: uint1 not null - * > - * The returned data should be ordered by fk_catalog_name, fk_db_schema_name, fk_table_name, fk_key_name, then key_sequence. - * update_rule and delete_rule returns a byte that is equivalent to actions declared on UpdateDeleteRules enum. - */ -message CommandGetExportedKeys { - option (experimental) = true; - - /* - * Specifies the catalog to search for the foreign key table. - * An empty string retrieves those without a catalog. - * If omitted the catalog name should not be used to narrow the search. - */ - optional string catalog = 1; - - /* - * Specifies the schema to search for the foreign key table. - * An empty string retrieves those without a schema. - * If omitted the schema name should not be used to narrow the search. - */ - optional string db_schema = 2; - - // Specifies the foreign key table to get the foreign keys for. - string table = 3; -} - -/* - * Represents a request to retrieve the foreign keys of a table on a Flight SQL enabled backend. - * Used in the command member of FlightDescriptor for the following RPC calls: - * - GetSchema: return the Arrow schema of the query. - * - GetFlightInfo: execute the catalog metadata request. - * - * The returned Arrow schema will be: - * < - * pk_catalog_name: utf8, - * pk_db_schema_name: utf8, - * pk_table_name: utf8 not null, - * pk_column_name: utf8 not null, - * fk_catalog_name: utf8, - * fk_db_schema_name: utf8, - * fk_table_name: utf8 not null, - * fk_column_name: utf8 not null, - * key_sequence: int not null, - * fk_key_name: utf8, - * pk_key_name: utf8, - * update_rule: uint1 not null, - * delete_rule: uint1 not null - * > - * The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. 
- * update_rule and delete_rule returns a byte that is equivalent to actions: - * - 0 = CASCADE - * - 1 = RESTRICT - * - 2 = SET NULL - * - 3 = NO ACTION - * - 4 = SET DEFAULT - */ -message CommandGetImportedKeys { - option (experimental) = true; - - /* - * Specifies the catalog to search for the primary key table. - * An empty string retrieves those without a catalog. - * If omitted the catalog name should not be used to narrow the search. - */ - optional string catalog = 1; - - /* - * Specifies the schema to search for the primary key table. - * An empty string retrieves those without a schema. - * If omitted the schema name should not be used to narrow the search. - */ - optional string db_schema = 2; - - // Specifies the primary key table to get the foreign keys for. - string table = 3; -} - -/* - * Represents a request to retrieve a description of the foreign key columns in the given foreign key table that - * reference the primary key or the columns representing a unique constraint of the parent table (could be the same - * or a different table) on a Flight SQL enabled backend. - * Used in the command member of FlightDescriptor for the following RPC calls: - * - GetSchema: return the Arrow schema of the query. - * - GetFlightInfo: execute the catalog metadata request. - * - * The returned Arrow schema will be: - * < - * pk_catalog_name: utf8, - * pk_db_schema_name: utf8, - * pk_table_name: utf8 not null, - * pk_column_name: utf8 not null, - * fk_catalog_name: utf8, - * fk_db_schema_name: utf8, - * fk_table_name: utf8 not null, - * fk_column_name: utf8 not null, - * key_sequence: int not null, - * fk_key_name: utf8, - * pk_key_name: utf8, - * update_rule: uint1 not null, - * delete_rule: uint1 not null - * > - * The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. - * update_rule and delete_rule returns a byte that is equivalent to actions: - * - 0 = CASCADE - * - 1 = RESTRICT - * - 2 = SET NULL - * - 3 = NO ACTION - * - 4 = SET DEFAULT - */ -message CommandGetCrossReference { - option (experimental) = true; - - /** - * The catalog name where the parent table is. - * An empty string retrieves those without a catalog. - * If omitted the catalog name should not be used to narrow the search. - */ - optional string pk_catalog = 1; - - /** - * The Schema name where the parent table is. - * An empty string retrieves those without a schema. - * If omitted the schema name should not be used to narrow the search. - */ - optional string pk_db_schema = 2; - - /** - * The parent table name. It cannot be null. - */ - string pk_table = 3; - - /** - * The catalog name where the foreign table is. - * An empty string retrieves those without a catalog. - * If omitted the catalog name should not be used to narrow the search. - */ - optional string fk_catalog = 4; - - /** - * The schema name where the foreign table is. - * An empty string retrieves those without a schema. - * If omitted the schema name should not be used to narrow the search. - */ - optional string fk_db_schema = 5; - - /** - * The foreign table name. It cannot be null. - */ - string fk_table = 6; -} - -// SQL Execution Action Messages - -/* - * Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. - */ -message ActionCreatePreparedStatementRequest { - option (experimental) = true; - - // The valid SQL string to create a prepared statement for. - string query = 1; -} - -/* - * Wrap the result of a "GetPreparedStatement" action. 
- * - * The resultant PreparedStatement can be closed either: - * - Manually, through the "ClosePreparedStatement" action; - * - Automatically, by a server timeout. - */ -message ActionCreatePreparedStatementResult { - option (experimental) = true; - - // Opaque handle for the prepared statement on the server. - bytes prepared_statement_handle = 1; - - // If a result set generating query was provided, dataset_schema contains the - // schema of the dataset as described in Schema.fbs::Schema, it is serialized as an IPC message. - bytes dataset_schema = 2; - - // If the query provided contained parameters, parameter_schema contains the - // schema of the expected parameters as described in Schema.fbs::Schema, it is serialized as an IPC message. - bytes parameter_schema = 3; -} - -/* - * Request message for the "ClosePreparedStatement" action on a Flight SQL enabled backend. - * Closes server resources associated with the prepared statement handle. - */ -message ActionClosePreparedStatementRequest { - option (experimental) = true; - - // Opaque handle for the prepared statement on the server. - bytes prepared_statement_handle = 1; -} - - -// SQL Execution Messages. - -/* - * Represents a SQL query. Used in the command member of FlightDescriptor - * for the following RPC calls: - * - GetSchema: return the Arrow schema of the query. - * Fields on this schema may contain the following metadata: - * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name - * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name - * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name - * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. - * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size - * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable - * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. - * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. - * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. - * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. - * - GetFlightInfo: execute the query. - */ -message CommandStatementQuery { - option (experimental) = true; - - // The SQL syntax. - string query = 1; -} - -/** - * Represents a ticket resulting from GetFlightInfo with a CommandStatementQuery. - * This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this. - */ -message TicketStatementQuery { - option (experimental) = true; - - // Unique identifier for the instance of the statement to execute. - bytes statement_handle = 1; -} - -/* - * Represents an instance of executing a prepared statement. Used in the command member of FlightDescriptor for - * the following RPC calls: - * - GetSchema: return the Arrow schema of the query. - * Fields on this schema may contain the following metadata: - * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name - * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name - * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name - * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. - * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size - * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable - * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. 
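Taken together, the prepared-statement messages above imply a simple client lifecycle: create the statement via DoAction, bind parameters and execute via DoPut/GetFlightInfo with CommandPreparedStatementQuery, and release it via another DoAction. A hedged Rust sketch of building the two actions follows; the arrow_flight::sql module path, the action-type strings, and the google.protobuf.Any wrapping are assumptions about how the generated code is typically used, not guarantees:

// Sketch only: assumes the generated Flight SQL types are exposed under
// arrow_flight::sql and that action bodies are wrapped in google.protobuf.Any.
use arrow_flight::sql::{ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest};
use arrow_flight::Action;
use prost::Message;

fn create_prepared_statement_action(query: &str) -> Action {
    let req = ActionCreatePreparedStatementRequest { query: query.to_string() };
    let any = prost_types::Any {
        // type_url is an assumption following the usual protobuf Any convention.
        type_url: "type.googleapis.com/arrow.flight.protocol.sql.ActionCreatePreparedStatementRequest".to_string(),
        value: req.encode_to_vec(),
    };
    Action { r#type: "CreatePreparedStatement".to_string(), body: any.encode_to_vec() }
}

fn close_prepared_statement_action(handle: Vec<u8>) -> Action {
    let req = ActionClosePreparedStatementRequest { prepared_statement_handle: handle };
    let any = prost_types::Any {
        type_url: "type.googleapis.com/arrow.flight.protocol.sql.ActionClosePreparedStatementRequest".to_string(),
        value: req.encode_to_vec(),
    };
    Action { r#type: "ClosePreparedStatement".to_string(), body: any.encode_to_vec() }
}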
- * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. - * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. - * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. - * - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. - * - GetFlightInfo: execute the prepared statement instance. - */ -message CommandPreparedStatementQuery { - option (experimental) = true; - - // Opaque handle for the prepared statement on the server. - bytes prepared_statement_handle = 1; -} - -/* - * Represents a SQL update query. Used in the command member of FlightDescriptor - * for the the RPC call DoPut to cause the server to execute the included SQL update. - */ -message CommandStatementUpdate { - option (experimental) = true; - - // The SQL syntax. - string query = 1; -} - -/* - * Represents a SQL update query. Used in the command member of FlightDescriptor - * for the the RPC call DoPut to cause the server to execute the included - * prepared statement handle as an update. - */ -message CommandPreparedStatementUpdate { - option (experimental) = true; - - // Opaque handle for the prepared statement on the server. - bytes prepared_statement_handle = 1; -} - -/* - * Returned from the RPC call DoPut when a CommandStatementUpdate - * CommandPreparedStatementUpdate was in the request, containing - * results from the update. - */ -message DoPutUpdateResult { - option (experimental) = true; - - // The number of records updated. A return value of -1 represents - // an unknown updated record count. - int64 record_count = 1; -} - -extend google.protobuf.MessageOptions { - bool experimental = 1000; -} + syntax = "proto3"; + import "google/protobuf/descriptor.proto"; + + option java_package = "org.apache.arrow.flight.sql.impl"; + option go_package = "github.com/apache/arrow/go/arrow/flight/internal/flight"; + package arrow.flight.protocol.sql; + + /* + * Represents a metadata request. Used in the command member of FlightDescriptor + * for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the metadata request. + * + * The returned Arrow schema will be: + * < + * info_name: uint32 not null, + * value: dense_union< + * string_value: utf8, + * bool_value: bool, + * bigint_value: int64, + * int32_bitmask: int32, + * string_list: list + * int32_to_int32_list_map: map> + * > + * where there is one row per requested piece of metadata information. + */ + message CommandGetSqlInfo { + option (experimental) = true; + + /* + * Values are modelled after ODBC's SQLGetInfo() function. This information is intended to provide + * Flight SQL clients with basic, SQL syntax and SQL functions related information. + * More information types can be added in future releases. + * E.g. more SQL syntax support types, scalar functions support, type conversion support etc. + * + * Note that the set of metadata may expand. + * + * Initially, Flight SQL will support the following information types: + * - Server Information - Range [0-500) + * - Syntax Information - Range [500-1000) + * Range [0-10,000) is reserved for defaults (see SqlInfo enum for default options). + * Custom options should start at 10,000. + * + * If omitted, then all metadata will be retrieved. 
+ * Flight SQL Servers may choose to include additional metadata above and beyond the specified set, however they must + * at least return the specified set. IDs ranging from 0 to 10,000 (exclusive) are reserved for future use. + * If additional metadata is included, the metadata IDs should start from 10,000. + */ + repeated uint32 info = 1; + } + + // Options for CommandGetSqlInfo. + enum SqlInfo { + + // Server Information [0-500): Provides basic information about the Flight SQL Server. + + // Retrieves a UTF-8 string with the name of the Flight SQL Server. + FLIGHT_SQL_SERVER_NAME = 0; + + // Retrieves a UTF-8 string with the native version of the Flight SQL Server. + FLIGHT_SQL_SERVER_VERSION = 1; + + // Retrieves a UTF-8 string with the Arrow format version of the Flight SQL Server. + FLIGHT_SQL_SERVER_ARROW_VERSION = 2; + + /* + * Retrieves a boolean value indicating whether the Flight SQL Server is read only. + * + * Returns: + * - false: if read-write + * - true: if read only + */ + FLIGHT_SQL_SERVER_READ_ONLY = 3; + + /* + * Retrieves a boolean value indicating whether the Flight SQL Server supports executing + * SQL queries. + * + * Note that the absence of this info (as opposed to a false value) does not necessarily + * mean that SQL is not supported, as this property was not originally defined. + */ + FLIGHT_SQL_SERVER_SQL = 4; + + /* + * Retrieves a boolean value indicating whether the Flight SQL Server supports executing + * Substrait plans. + */ + FLIGHT_SQL_SERVER_SUBSTRAIT = 5; + + /* + * Retrieves a string value indicating the minimum supported Substrait version, or null + * if Substrait is not supported. + */ + FLIGHT_SQL_SERVER_SUBSTRAIT_MIN_VERSION = 6; + + /* + * Retrieves a string value indicating the maximum supported Substrait version, or null + * if Substrait is not supported. + */ + FLIGHT_SQL_SERVER_SUBSTRAIT_MAX_VERSION = 7; + + /* + * Retrieves an int32 indicating whether the Flight SQL Server supports the + * BeginTransaction/EndTransaction/BeginSavepoint/EndSavepoint actions. + * + * Even if this is not supported, the database may still support explicit "BEGIN + * TRANSACTION"/"COMMIT" SQL statements (see SQL_TRANSACTIONS_SUPPORTED); this property + * is only about whether the server implements the Flight SQL API endpoints. + * + * The possible values are listed in `SqlSupportedTransaction`. + */ + FLIGHT_SQL_SERVER_TRANSACTION = 8; + + /* + * Retrieves a boolean value indicating whether the Flight SQL Server supports explicit + * query cancellation (the CancelQuery action). + */ + FLIGHT_SQL_SERVER_CANCEL = 9; + + /* + * Retrieves an int32 indicating the timeout (in milliseconds) for prepared statement handles. + * + * If 0, there is no timeout. Servers should reset the timeout when the handle is used in a command. + */ + FLIGHT_SQL_SERVER_STATEMENT_TIMEOUT = 100; + + /* + * Retrieves an int32 indicating the timeout (in milliseconds) for transactions, since transactions are not tied to a connection. + * + * If 0, there is no timeout. Servers should reset the timeout when the handle is used in a command. + */ + FLIGHT_SQL_SERVER_TRANSACTION_TIMEOUT = 101; + + // SQL Syntax Information [500-1000): provides information about SQL syntax supported by the Flight SQL Server. + + /* + * Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of catalogs. + * + * Returns: + * - false: if it doesn't support CREATE and DROP of catalogs. + * - true: if it supports CREATE and DROP of catalogs. 
+ */ + SQL_DDL_CATALOG = 500; + + /* + * Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of schemas. + * + * Returns: + * - false: if it doesn't support CREATE and DROP of schemas. + * - true: if it supports CREATE and DROP of schemas. + */ + SQL_DDL_SCHEMA = 501; + + /* + * Indicates whether the Flight SQL Server supports CREATE and DROP of tables. + * + * Returns: + * - false: if it doesn't support CREATE and DROP of tables. + * - true: if it supports CREATE and DROP of tables. + */ + SQL_DDL_TABLE = 502; + + /* + * Retrieves a int32 ordinal representing the case sensitivity of catalog, table, schema and table names. + * + * The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. + */ + SQL_IDENTIFIER_CASE = 503; + + // Retrieves a UTF-8 string with the supported character(s) used to surround a delimited identifier. + SQL_IDENTIFIER_QUOTE_CHAR = 504; + + /* + * Retrieves a int32 describing the case sensitivity of quoted identifiers. + * + * The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. + */ + SQL_QUOTED_IDENTIFIER_CASE = 505; + + /* + * Retrieves a boolean value indicating whether all tables are selectable. + * + * Returns: + * - false: if not all tables are selectable or if none are; + * - true: if all tables are selectable. + */ + SQL_ALL_TABLES_ARE_SELECTABLE = 506; + + /* + * Retrieves the null ordering. + * + * Returns a int32 ordinal for the null ordering being used, as described in + * `arrow.flight.protocol.sql.SqlNullOrdering`. + */ + SQL_NULL_ORDERING = 507; + + // Retrieves a UTF-8 string list with values of the supported keywords. + SQL_KEYWORDS = 508; + + // Retrieves a UTF-8 string list with values of the supported numeric functions. + SQL_NUMERIC_FUNCTIONS = 509; + + // Retrieves a UTF-8 string list with values of the supported string functions. + SQL_STRING_FUNCTIONS = 510; + + // Retrieves a UTF-8 string list with values of the supported system functions. + SQL_SYSTEM_FUNCTIONS = 511; + + // Retrieves a UTF-8 string list with values of the supported datetime functions. + SQL_DATETIME_FUNCTIONS = 512; + + /* + * Retrieves the UTF-8 string that can be used to escape wildcard characters. + * This is the string that can be used to escape '_' or '%' in the catalog search parameters that are a pattern + * (and therefore use one of the wildcard characters). + * The '_' character represents any single character; the '%' character represents any sequence of zero or more + * characters. + */ + SQL_SEARCH_STRING_ESCAPE = 513; + + /* + * Retrieves a UTF-8 string with all the "extra" characters that can be used in unquoted identifier names + * (those beyond a-z, A-Z, 0-9 and _). + */ + SQL_EXTRA_NAME_CHARACTERS = 514; + + /* + * Retrieves a boolean value indicating whether column aliasing is supported. + * If so, the SQL AS clause can be used to provide names for computed columns or to provide alias names for columns + * as required. + * + * Returns: + * - false: if column aliasing is unsupported; + * - true: if column aliasing is supported. + */ + SQL_SUPPORTS_COLUMN_ALIASING = 515; + + /* + * Retrieves a boolean value indicating whether concatenations between null and non-null values being + * null are supported. + * + * - Returns: + * - false: if concatenations between null and non-null values being null are unsupported; + * - true: if concatenations between null and non-null values being null are supported. 
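SQL_SEARCH_STRING_ESCAPE above reports the escape string a client should use when a literal '%' or '_' must appear inside a search pattern. A small, hypothetical helper illustrating that usage (not part of the crate):

// Escape pattern wildcards in a user-supplied value using the server-reported
// escape string, so the value can be embedded in a filter pattern verbatim.
fn escape_pattern(value: &str, escape: &str) -> String {
    let mut out = String::with_capacity(value.len());
    for c in value.chars() {
        if c == '%' || c == '_' {
            out.push_str(escape);
        }
        out.push(c);
    }
    out
}

fn main() {
    assert_eq!(escape_pattern("my_table", "\\"), "my\\_table");
}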
+ */ + SQL_NULL_PLUS_NULL_IS_NULL = 516; + + /* + * Retrieves a map where the key is the type to convert from and the value is a list with the types to convert to, + * indicating the supported conversions. Each key and each item on the list value is a value to a predefined type on + * SqlSupportsConvert enum. + * The returned map will be: map> + */ + SQL_SUPPORTS_CONVERT = 517; + + /* + * Retrieves a boolean value indicating whether, when table correlation names are supported, + * they are restricted to being different from the names of the tables. + * + * Returns: + * - false: if table correlation names are unsupported; + * - true: if table correlation names are supported. + */ + SQL_SUPPORTS_TABLE_CORRELATION_NAMES = 518; + + /* + * Retrieves a boolean value indicating whether, when table correlation names are supported, + * they are restricted to being different from the names of the tables. + * + * Returns: + * - false: if different table correlation names are unsupported; + * - true: if different table correlation names are supported + */ + SQL_SUPPORTS_DIFFERENT_TABLE_CORRELATION_NAMES = 519; + + /* + * Retrieves a boolean value indicating whether expressions in ORDER BY lists are supported. + * + * Returns: + * - false: if expressions in ORDER BY are unsupported; + * - true: if expressions in ORDER BY are supported; + */ + SQL_SUPPORTS_EXPRESSIONS_IN_ORDER_BY = 520; + + /* + * Retrieves a boolean value indicating whether using a column that is not in the SELECT statement in a GROUP BY + * clause is supported. + * + * Returns: + * - false: if using a column that is not in the SELECT statement in a GROUP BY clause is unsupported; + * - true: if using a column that is not in the SELECT statement in a GROUP BY clause is supported. + */ + SQL_SUPPORTS_ORDER_BY_UNRELATED = 521; + + /* + * Retrieves the supported GROUP BY commands; + * + * Returns an int32 bitmask value representing the supported commands. + * The returned bitmask should be parsed in order to retrieve the supported commands. + * + * For instance: + * - return 0 (\b0) => [] (GROUP BY is unsupported); + * - return 1 (\b1) => [SQL_GROUP_BY_UNRELATED]; + * - return 2 (\b10) => [SQL_GROUP_BY_BEYOND_SELECT]; + * - return 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]. + * Valid GROUP BY types are described under `arrow.flight.protocol.sql.SqlSupportedGroupBy`. + */ + SQL_SUPPORTED_GROUP_BY = 522; + + /* + * Retrieves a boolean value indicating whether specifying a LIKE escape clause is supported. + * + * Returns: + * - false: if specifying a LIKE escape clause is unsupported; + * - true: if specifying a LIKE escape clause is supported. + */ + SQL_SUPPORTS_LIKE_ESCAPE_CLAUSE = 523; + + /* + * Retrieves a boolean value indicating whether columns may be defined as non-nullable. + * + * Returns: + * - false: if columns cannot be defined as non-nullable; + * - true: if columns may be defined as non-nullable. + */ + SQL_SUPPORTS_NON_NULLABLE_COLUMNS = 524; + + /* + * Retrieves the supported SQL grammar level as per the ODBC specification. + * + * Returns an int32 bitmask value representing the supported SQL grammar level. + * The returned bitmask should be parsed in order to retrieve the supported grammar levels. 
+ *
+ * For instance:
+ * - return 0 (\b0) => [] (SQL grammar is unsupported);
+ * - return 1 (\b1) => [SQL_MINIMUM_GRAMMAR];
+ * - return 2 (\b10) => [SQL_CORE_GRAMMAR];
+ * - return 3 (\b11) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR];
+ * - return 4 (\b100) => [SQL_EXTENDED_GRAMMAR];
+ * - return 5 (\b101) => [SQL_MINIMUM_GRAMMAR, SQL_EXTENDED_GRAMMAR];
+ * - return 6 (\b110) => [SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR];
+ * - return 7 (\b111) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR].
+ * Valid SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedSqlGrammar`.
+ */
+ SQL_SUPPORTED_GRAMMAR = 525;
+
+ /*
+ * Retrieves the supported ANSI92 SQL grammar level.
+ *
+ * Returns an int32 bitmask value representing the supported ANSI92 SQL grammar level.
+ * The returned bitmask should be parsed in order to retrieve the supported commands.
+ *
+ * For instance:
+ * - return 0 (\b0) => [] (ANSI92 SQL grammar is unsupported);
+ * - return 1 (\b1) => [ANSI92_ENTRY_SQL];
+ * - return 2 (\b10) => [ANSI92_INTERMEDIATE_SQL];
+ * - return 3 (\b11) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL];
+ * - return 4 (\b100) => [ANSI92_FULL_SQL];
+ * - return 5 (\b101) => [ANSI92_ENTRY_SQL, ANSI92_FULL_SQL];
+ * - return 6 (\b110) => [ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL];
+ * - return 7 (\b111) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL].
+ * Valid ANSI92 SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedAnsi92SqlGrammarLevel`.
+ */
+ SQL_ANSI92_SUPPORTED_LEVEL = 526;
+
+ /*
+ * Retrieves a boolean value indicating whether the SQL Integrity Enhancement Facility is supported.
+ *
+ * Returns:
+ * - false: if the SQL Integrity Enhancement Facility is unsupported;
+ * - true: if the SQL Integrity Enhancement Facility is supported.
+ */
+ SQL_SUPPORTS_INTEGRITY_ENHANCEMENT_FACILITY = 527;
+
+ /*
+ * Retrieves the support level for SQL OUTER JOINs.
+ *
+ * Returns an int32 ordinal for the level of OUTER JOIN support, as described in
+ * `arrow.flight.protocol.sql.SqlOuterJoinsSupportLevel`.
+ */
+ SQL_OUTER_JOINS_SUPPORT_LEVEL = 528;
+
+ // Retrieves a UTF-8 string with the preferred term for "schema".
+ SQL_SCHEMA_TERM = 529;
+
+ // Retrieves a UTF-8 string with the preferred term for "procedure".
+ SQL_PROCEDURE_TERM = 530;
+
+ /*
+ * Retrieves a UTF-8 string with the preferred term for "catalog".
+ * If an empty string is returned, it is assumed that the server does NOT support catalogs.
+ */
+ SQL_CATALOG_TERM = 531;
+
+ /*
+ * Retrieves a boolean value indicating whether a catalog appears at the start of a fully qualified table name.
+ *
+ * - false: if a catalog does not appear at the start of a fully qualified table name;
+ * - true: if a catalog appears at the start of a fully qualified table name.
+ */
+ SQL_CATALOG_AT_START = 532;
+
+ /*
+ * Retrieves the supported actions for a SQL schema.
+ *
+ * Returns an int32 bitmask value representing the supported actions for a SQL schema.
+ * The returned bitmask should be parsed in order to retrieve the supported actions for a SQL schema.
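All of the bitmask-style values in this enum follow the same convention spelled out in the examples: bit i set means the variant with ordinal i is supported. A minimal Rust sketch of decoding such a value (the helper and the variant-name list are illustrative only, not part of the generated crate):

// Decode an int32 bitmask (e.g. SQL_SUPPORTED_GROUP_BY, SQL_SUPPORTED_GRAMMAR):
// bit i set means the variant whose ordinal is i is supported.
fn decode_bitmask(bitmask: i32, variant_names: &[&str]) -> Vec<String> {
    (0..variant_names.len())
        .filter(|&i| bitmask & (1i32 << i) != 0)
        .map(|i| variant_names[i].to_string())
        .collect()
}

fn main() {
    // 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]
    let supported = decode_bitmask(3, &["SQL_GROUP_BY_UNRELATED", "SQL_GROUP_BY_BEYOND_SELECT"]);
    assert_eq!(supported, ["SQL_GROUP_BY_UNRELATED", "SQL_GROUP_BY_BEYOND_SELECT"]);
}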
+ * + * For instance: + * - return 0 (\b0) => [] (no supported actions for SQL schema); + * - return 1 (\b1) => [SQL_ELEMENT_IN_PROCEDURE_CALLS]; + * - return 2 (\b10) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + * - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + * - return 4 (\b100) => [SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + * - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + * - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + * - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. + * Valid actions for a SQL schema described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. + */ + SQL_SCHEMAS_SUPPORTED_ACTIONS = 533; + + /* + * Retrieves the supported actions for a SQL schema. + * + * Returns an int32 bitmask value representing the supported actions for a SQL catalog. + * The returned bitmask should be parsed in order to retrieve the supported actions for a SQL catalog. + * + * For instance: + * - return 0 (\b0) => [] (no supported actions for SQL catalog); + * - return 1 (\b1) => [SQL_ELEMENT_IN_PROCEDURE_CALLS]; + * - return 2 (\b10) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + * - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + * - return 4 (\b100) => [SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + * - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + * - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + * - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. + * Valid actions for a SQL catalog are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. + */ + SQL_CATALOGS_SUPPORTED_ACTIONS = 534; + + /* + * Retrieves the supported SQL positioned commands. + * + * Returns an int32 bitmask value representing the supported SQL positioned commands. + * The returned bitmask should be parsed in order to retrieve the supported SQL positioned commands. + * + * For instance: + * - return 0 (\b0) => [] (no supported SQL positioned commands); + * - return 1 (\b1) => [SQL_POSITIONED_DELETE]; + * - return 2 (\b10) => [SQL_POSITIONED_UPDATE]; + * - return 3 (\b11) => [SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE]. + * Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedPositionedCommands`. + */ + SQL_SUPPORTED_POSITIONED_COMMANDS = 535; + + /* + * Retrieves a boolean value indicating whether SELECT FOR UPDATE statements are supported. + * + * Returns: + * - false: if SELECT FOR UPDATE statements are unsupported; + * - true: if SELECT FOR UPDATE statements are supported. + */ + SQL_SELECT_FOR_UPDATE_SUPPORTED = 536; + + /* + * Retrieves a boolean value indicating whether stored procedure calls that use the stored procedure escape syntax + * are supported. + * + * Returns: + * - false: if stored procedure calls that use the stored procedure escape syntax are unsupported; + * - true: if stored procedure calls that use the stored procedure escape syntax are supported. + */ + SQL_STORED_PROCEDURES_SUPPORTED = 537; + + /* + * Retrieves the supported SQL subqueries. + * + * Returns an int32 bitmask value representing the supported SQL subqueries. + * The returned bitmask should be parsed in order to retrieve the supported SQL subqueries. 
+ * + * For instance: + * - return 0 (\b0) => [] (no supported SQL subqueries); + * - return 1 (\b1) => [SQL_SUBQUERIES_IN_COMPARISONS]; + * - return 2 (\b10) => [SQL_SUBQUERIES_IN_EXISTS]; + * - return 3 (\b11) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS]; + * - return 4 (\b100) => [SQL_SUBQUERIES_IN_INS]; + * - return 5 (\b101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS]; + * - return 6 (\b110) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_EXISTS]; + * - return 7 (\b111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS]; + * - return 8 (\b1000) => [SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 9 (\b1001) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 10 (\b1010) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 11 (\b1011) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 12 (\b1100) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 13 (\b1101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 14 (\b1110) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - return 15 (\b1111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + * - ... + * Valid SQL subqueries are described under `arrow.flight.protocol.sql.SqlSupportedSubqueries`. + */ + SQL_SUPPORTED_SUBQUERIES = 538; + + /* + * Retrieves a boolean value indicating whether correlated subqueries are supported. + * + * Returns: + * - false: if correlated subqueries are unsupported; + * - true: if correlated subqueries are supported. + */ + SQL_CORRELATED_SUBQUERIES_SUPPORTED = 539; + + /* + * Retrieves the supported SQL UNIONs. + * + * Returns an int32 bitmask value representing the supported SQL UNIONs. + * The returned bitmask should be parsed in order to retrieve the supported SQL UNIONs. + * + * For instance: + * - return 0 (\b0) => [] (no supported SQL positioned commands); + * - return 1 (\b1) => [SQL_UNION]; + * - return 2 (\b10) => [SQL_UNION_ALL]; + * - return 3 (\b11) => [SQL_UNION, SQL_UNION_ALL]. + * Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedUnions`. + */ + SQL_SUPPORTED_UNIONS = 540; + + // Retrieves a int64 value representing the maximum number of hex characters allowed in an inline binary literal. + SQL_MAX_BINARY_LITERAL_LENGTH = 541; + + // Retrieves a int64 value representing the maximum number of characters allowed for a character literal. + SQL_MAX_CHAR_LITERAL_LENGTH = 542; + + // Retrieves a int64 value representing the maximum number of characters allowed for a column name. + SQL_MAX_COLUMN_NAME_LENGTH = 543; + + // Retrieves a int64 value representing the the maximum number of columns allowed in a GROUP BY clause. + SQL_MAX_COLUMNS_IN_GROUP_BY = 544; + + // Retrieves a int64 value representing the maximum number of columns allowed in an index. + SQL_MAX_COLUMNS_IN_INDEX = 545; + + // Retrieves a int64 value representing the maximum number of columns allowed in an ORDER BY clause. + SQL_MAX_COLUMNS_IN_ORDER_BY = 546; + + // Retrieves a int64 value representing the maximum number of columns allowed in a SELECT list. + SQL_MAX_COLUMNS_IN_SELECT = 547; + + // Retrieves a int64 value representing the maximum number of columns allowed in a table. 
+ SQL_MAX_COLUMNS_IN_TABLE = 548; + + // Retrieves a int64 value representing the maximum number of concurrent connections possible. + SQL_MAX_CONNECTIONS = 549; + + // Retrieves a int64 value the maximum number of characters allowed in a cursor name. + SQL_MAX_CURSOR_NAME_LENGTH = 550; + + /* + * Retrieves a int64 value representing the maximum number of bytes allowed for an index, + * including all of the parts of the index. + */ + SQL_MAX_INDEX_LENGTH = 551; + + // Retrieves a int64 value representing the maximum number of characters allowed in a schema name. + SQL_DB_SCHEMA_NAME_LENGTH = 552; + + // Retrieves a int64 value representing the maximum number of characters allowed in a procedure name. + SQL_MAX_PROCEDURE_NAME_LENGTH = 553; + + // Retrieves a int64 value representing the maximum number of characters allowed in a catalog name. + SQL_MAX_CATALOG_NAME_LENGTH = 554; + + // Retrieves a int64 value representing the maximum number of bytes allowed in a single row. + SQL_MAX_ROW_SIZE = 555; + + /* + * Retrieves a boolean indicating whether the return value for the JDBC method getMaxRowSize includes the SQL + * data types LONGVARCHAR and LONGVARBINARY. + * + * Returns: + * - false: if return value for the JDBC method getMaxRowSize does + * not include the SQL data types LONGVARCHAR and LONGVARBINARY; + * - true: if return value for the JDBC method getMaxRowSize includes + * the SQL data types LONGVARCHAR and LONGVARBINARY. + */ + SQL_MAX_ROW_SIZE_INCLUDES_BLOBS = 556; + + /* + * Retrieves a int64 value representing the maximum number of characters allowed for an SQL statement; + * a result of 0 (zero) means that there is no limit or the limit is not known. + */ + SQL_MAX_STATEMENT_LENGTH = 557; + + // Retrieves a int64 value representing the maximum number of active statements that can be open at the same time. + SQL_MAX_STATEMENTS = 558; + + // Retrieves a int64 value representing the maximum number of characters allowed in a table name. + SQL_MAX_TABLE_NAME_LENGTH = 559; + + // Retrieves a int64 value representing the maximum number of tables allowed in a SELECT statement. + SQL_MAX_TABLES_IN_SELECT = 560; + + // Retrieves a int64 value representing the maximum number of characters allowed in a user name. + SQL_MAX_USERNAME_LENGTH = 561; + + /* + * Retrieves this database's default transaction isolation level as described in + * `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. + * + * Returns a int32 ordinal for the SQL transaction isolation level. + */ + SQL_DEFAULT_TRANSACTION_ISOLATION = 562; + + /* + * Retrieves a boolean value indicating whether transactions are supported. If not, invoking the method commit is a + * noop, and the isolation level is `arrow.flight.protocol.sql.SqlTransactionIsolationLevel.TRANSACTION_NONE`. + * + * Returns: + * - false: if transactions are unsupported; + * - true: if transactions are supported. + */ + SQL_TRANSACTIONS_SUPPORTED = 563; + + /* + * Retrieves the supported transactions isolation levels. + * + * Returns an int32 bitmask value representing the supported transactions isolation levels. + * The returned bitmask should be parsed in order to retrieve the supported transactions isolation levels. 
+ * + * For instance: + * - return 0 (\b0) => [] (no supported SQL transactions isolation levels); + * - return 1 (\b1) => [SQL_TRANSACTION_NONE]; + * - return 2 (\b10) => [SQL_TRANSACTION_READ_UNCOMMITTED]; + * - return 3 (\b11) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED]; + * - return 4 (\b100) => [SQL_TRANSACTION_REPEATABLE_READ]; + * - return 5 (\b101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 6 (\b110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 7 (\b111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 8 (\b1000) => [SQL_TRANSACTION_REPEATABLE_READ]; + * - return 9 (\b1001) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 10 (\b1010) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 11 (\b1011) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 12 (\b1100) => [SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 13 (\b1101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 14 (\b1110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 15 (\b1111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + * - return 16 (\b10000) => [SQL_TRANSACTION_SERIALIZABLE]; + * - ... + * Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. + */ + SQL_SUPPORTED_TRANSACTIONS_ISOLATION_LEVELS = 564; + + /* + * Retrieves a boolean value indicating whether a data definition statement within a transaction forces + * the transaction to commit. + * + * Returns: + * - false: if a data definition statement within a transaction does not force the transaction to commit; + * - true: if a data definition statement within a transaction forces the transaction to commit. + */ + SQL_DATA_DEFINITION_CAUSES_TRANSACTION_COMMIT = 565; + + /* + * Retrieves a boolean value indicating whether a data definition statement within a transaction is ignored. + * + * Returns: + * - false: if a data definition statement within a transaction is taken into account; + * - true: a data definition statement within a transaction is ignored. + */ + SQL_DATA_DEFINITIONS_IN_TRANSACTIONS_IGNORED = 566; + + /* + * Retrieves an int32 bitmask value representing the supported result set types. + * The returned bitmask should be parsed in order to retrieve the supported result set types. + * + * For instance: + * - return 0 (\b0) => [] (no supported result set types); + * - return 1 (\b1) => [SQL_RESULT_SET_TYPE_UNSPECIFIED]; + * - return 2 (\b10) => [SQL_RESULT_SET_TYPE_FORWARD_ONLY]; + * - return 3 (\b11) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY]; + * - return 4 (\b100) => [SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + * - return 5 (\b101) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + * - return 6 (\b110) => [SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + * - return 7 (\b111) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + * - return 8 (\b1000) => [SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE]; + * - ... 
+ * Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetType`. + */ + SQL_SUPPORTED_RESULT_SET_TYPES = 567; + + /* + * Returns an int32 bitmask value concurrency types supported for + * `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_UNSPECIFIED`. + * + * For instance: + * - return 0 (\b0) => [] (no supported concurrency types for this result set type) + * - return 1 (\b1) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED] + * - return 2 (\b10) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + * - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + * - return 4 (\b100) => [SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. + */ + SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_UNSPECIFIED = 568; + + /* + * Returns an int32 bitmask value concurrency types supported for + * `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_FORWARD_ONLY`. + * + * For instance: + * - return 0 (\b0) => [] (no supported concurrency types for this result set type) + * - return 1 (\b1) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED] + * - return 2 (\b10) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + * - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + * - return 4 (\b100) => [SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. + */ + SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_FORWARD_ONLY = 569; + + /* + * Returns an int32 bitmask value concurrency types supported for + * `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE`. + * + * For instance: + * - return 0 (\b0) => [] (no supported concurrency types for this result set type) + * - return 1 (\b1) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED] + * - return 2 (\b10) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + * - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + * - return 4 (\b100) => [SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. 
+ */ + SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_SENSITIVE = 570; + + /* + * Returns an int32 bitmask value concurrency types supported for + * `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE`. + * + * For instance: + * - return 0 (\b0) => [] (no supported concurrency types for this result set type) + * - return 1 (\b1) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED] + * - return 2 (\b10) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + * - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + * - return 4 (\b100) => [SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + * Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. + */ + SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_INSENSITIVE = 571; + + /* + * Retrieves a boolean value indicating whether this database supports batch updates. + * + * - false: if this database does not support batch updates; + * - true: if this database supports batch updates. + */ + SQL_BATCH_UPDATES_SUPPORTED = 572; + + /* + * Retrieves a boolean value indicating whether this database supports savepoints. + * + * Returns: + * - false: if this database does not support savepoints; + * - true: if this database supports savepoints. + */ + SQL_SAVEPOINTS_SUPPORTED = 573; + + /* + * Retrieves a boolean value indicating whether named parameters are supported in callable statements. + * + * Returns: + * - false: if named parameters in callable statements are unsupported; + * - true: if named parameters in callable statements are supported. + */ + SQL_NAMED_PARAMETERS_SUPPORTED = 574; + + /* + * Retrieves a boolean value indicating whether updates made to a LOB are made on a copy or directly to the LOB. + * + * Returns: + * - false: if updates made to a LOB are made directly to the LOB; + * - true: if updates made to a LOB are made on a copy. + */ + SQL_LOCATORS_UPDATE_COPY = 575; + + /* + * Retrieves a boolean value indicating whether invoking user-defined or vendor functions + * using the stored procedure escape syntax is supported. + * + * Returns: + * - false: if invoking user-defined or vendor functions using the stored procedure escape syntax is unsupported; + * - true: if invoking user-defined or vendor functions using the stored procedure escape syntax is supported. + */ + SQL_STORED_FUNCTIONS_USING_CALL_SYNTAX_SUPPORTED = 576; + } + + // The level of support for Flight SQL transaction RPCs. + enum SqlSupportedTransaction { + // Unknown/not indicated/no support + SQL_SUPPORTED_TRANSACTION_NONE = 0; + // Transactions, but not savepoints. + // A savepoint is a mark within a transaction that can be individually + // rolled back to. Not all databases support savepoints. 
+ SQL_SUPPORTED_TRANSACTION_TRANSACTION = 1; + // Transactions and savepoints + SQL_SUPPORTED_TRANSACTION_SAVEPOINT = 2; + } + + enum SqlSupportedCaseSensitivity { + SQL_CASE_SENSITIVITY_UNKNOWN = 0; + SQL_CASE_SENSITIVITY_CASE_INSENSITIVE = 1; + SQL_CASE_SENSITIVITY_UPPERCASE = 2; + SQL_CASE_SENSITIVITY_LOWERCASE = 3; + } + + enum SqlNullOrdering { + SQL_NULLS_SORTED_HIGH = 0; + SQL_NULLS_SORTED_LOW = 1; + SQL_NULLS_SORTED_AT_START = 2; + SQL_NULLS_SORTED_AT_END = 3; + } + + enum SupportedSqlGrammar { + SQL_MINIMUM_GRAMMAR = 0; + SQL_CORE_GRAMMAR = 1; + SQL_EXTENDED_GRAMMAR = 2; + } + + enum SupportedAnsi92SqlGrammarLevel { + ANSI92_ENTRY_SQL = 0; + ANSI92_INTERMEDIATE_SQL = 1; + ANSI92_FULL_SQL = 2; + } + + enum SqlOuterJoinsSupportLevel { + SQL_JOINS_UNSUPPORTED = 0; + SQL_LIMITED_OUTER_JOINS = 1; + SQL_FULL_OUTER_JOINS = 2; + } + + enum SqlSupportedGroupBy { + SQL_GROUP_BY_UNRELATED = 0; + SQL_GROUP_BY_BEYOND_SELECT = 1; + } + + enum SqlSupportedElementActions { + SQL_ELEMENT_IN_PROCEDURE_CALLS = 0; + SQL_ELEMENT_IN_INDEX_DEFINITIONS = 1; + SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS = 2; + } + + enum SqlSupportedPositionedCommands { + SQL_POSITIONED_DELETE = 0; + SQL_POSITIONED_UPDATE = 1; + } + + enum SqlSupportedSubqueries { + SQL_SUBQUERIES_IN_COMPARISONS = 0; + SQL_SUBQUERIES_IN_EXISTS = 1; + SQL_SUBQUERIES_IN_INS = 2; + SQL_SUBQUERIES_IN_QUANTIFIEDS = 3; + } + + enum SqlSupportedUnions { + SQL_UNION = 0; + SQL_UNION_ALL = 1; + } + + enum SqlTransactionIsolationLevel { + SQL_TRANSACTION_NONE = 0; + SQL_TRANSACTION_READ_UNCOMMITTED = 1; + SQL_TRANSACTION_READ_COMMITTED = 2; + SQL_TRANSACTION_REPEATABLE_READ = 3; + SQL_TRANSACTION_SERIALIZABLE = 4; + } + + enum SqlSupportedTransactions { + SQL_TRANSACTION_UNSPECIFIED = 0; + SQL_DATA_DEFINITION_TRANSACTIONS = 1; + SQL_DATA_MANIPULATION_TRANSACTIONS = 2; + } + + enum SqlSupportedResultSetType { + SQL_RESULT_SET_TYPE_UNSPECIFIED = 0; + SQL_RESULT_SET_TYPE_FORWARD_ONLY = 1; + SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE = 2; + SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE = 3; + } + + enum SqlSupportedResultSetConcurrency { + SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED = 0; + SQL_RESULT_SET_CONCURRENCY_READ_ONLY = 1; + SQL_RESULT_SET_CONCURRENCY_UPDATABLE = 2; + } + + enum SqlSupportsConvert { + SQL_CONVERT_BIGINT = 0; + SQL_CONVERT_BINARY = 1; + SQL_CONVERT_BIT = 2; + SQL_CONVERT_CHAR = 3; + SQL_CONVERT_DATE = 4; + SQL_CONVERT_DECIMAL = 5; + SQL_CONVERT_FLOAT = 6; + SQL_CONVERT_INTEGER = 7; + SQL_CONVERT_INTERVAL_DAY_TIME = 8; + SQL_CONVERT_INTERVAL_YEAR_MONTH = 9; + SQL_CONVERT_LONGVARBINARY = 10; + SQL_CONVERT_LONGVARCHAR = 11; + SQL_CONVERT_NUMERIC = 12; + SQL_CONVERT_REAL = 13; + SQL_CONVERT_SMALLINT = 14; + SQL_CONVERT_TIME = 15; + SQL_CONVERT_TIMESTAMP = 16; + SQL_CONVERT_TINYINT = 17; + SQL_CONVERT_VARBINARY = 18; + SQL_CONVERT_VARCHAR = 19; + } + + /** + * The JDBC/ODBC-defined type of any object. + * All the values here are the sames as in the JDBC and ODBC specs. 
+ */ + enum XdbcDataType { + XDBC_UNKNOWN_TYPE = 0; + XDBC_CHAR = 1; + XDBC_NUMERIC = 2; + XDBC_DECIMAL = 3; + XDBC_INTEGER = 4; + XDBC_SMALLINT = 5; + XDBC_FLOAT = 6; + XDBC_REAL = 7; + XDBC_DOUBLE = 8; + XDBC_DATETIME = 9; + XDBC_INTERVAL = 10; + XDBC_VARCHAR = 12; + XDBC_DATE = 91; + XDBC_TIME = 92; + XDBC_TIMESTAMP = 93; + XDBC_LONGVARCHAR = -1; + XDBC_BINARY = -2; + XDBC_VARBINARY = -3; + XDBC_LONGVARBINARY = -4; + XDBC_BIGINT = -5; + XDBC_TINYINT = -6; + XDBC_BIT = -7; + XDBC_WCHAR = -8; + XDBC_WVARCHAR = -9; + } + + /** + * Detailed subtype information for XDBC_TYPE_DATETIME and XDBC_TYPE_INTERVAL. + */ + enum XdbcDatetimeSubcode { + option allow_alias = true; + XDBC_SUBCODE_UNKNOWN = 0; + XDBC_SUBCODE_YEAR = 1; + XDBC_SUBCODE_DATE = 1; + XDBC_SUBCODE_TIME = 2; + XDBC_SUBCODE_MONTH = 2; + XDBC_SUBCODE_TIMESTAMP = 3; + XDBC_SUBCODE_DAY = 3; + XDBC_SUBCODE_TIME_WITH_TIMEZONE = 4; + XDBC_SUBCODE_HOUR = 4; + XDBC_SUBCODE_TIMESTAMP_WITH_TIMEZONE = 5; + XDBC_SUBCODE_MINUTE = 5; + XDBC_SUBCODE_SECOND = 6; + XDBC_SUBCODE_YEAR_TO_MONTH = 7; + XDBC_SUBCODE_DAY_TO_HOUR = 8; + XDBC_SUBCODE_DAY_TO_MINUTE = 9; + XDBC_SUBCODE_DAY_TO_SECOND = 10; + XDBC_SUBCODE_HOUR_TO_MINUTE = 11; + XDBC_SUBCODE_HOUR_TO_SECOND = 12; + XDBC_SUBCODE_MINUTE_TO_SECOND = 13; + XDBC_SUBCODE_INTERVAL_YEAR = 101; + XDBC_SUBCODE_INTERVAL_MONTH = 102; + XDBC_SUBCODE_INTERVAL_DAY = 103; + XDBC_SUBCODE_INTERVAL_HOUR = 104; + XDBC_SUBCODE_INTERVAL_MINUTE = 105; + XDBC_SUBCODE_INTERVAL_SECOND = 106; + XDBC_SUBCODE_INTERVAL_YEAR_TO_MONTH = 107; + XDBC_SUBCODE_INTERVAL_DAY_TO_HOUR = 108; + XDBC_SUBCODE_INTERVAL_DAY_TO_MINUTE = 109; + XDBC_SUBCODE_INTERVAL_DAY_TO_SECOND = 110; + XDBC_SUBCODE_INTERVAL_HOUR_TO_MINUTE = 111; + XDBC_SUBCODE_INTERVAL_HOUR_TO_SECOND = 112; + XDBC_SUBCODE_INTERVAL_MINUTE_TO_SECOND = 113; + } + + enum Nullable { + /** + * Indicates that the fields does not allow the use of null values. + */ + NULLABILITY_NO_NULLS = 0; + + /** + * Indicates that the fields allow the use of null values. + */ + NULLABILITY_NULLABLE = 1; + + /** + * Indicates that nullability of the fields can not be determined. + */ + NULLABILITY_UNKNOWN = 2; + } + + enum Searchable { + /** + * Indicates that column can not be used in a WHERE clause. + */ + SEARCHABLE_NONE = 0; + + /** + * Indicates that the column can be used in a WHERE clause if it is using a + * LIKE operator. + */ + SEARCHABLE_CHAR = 1; + + /** + * Indicates that the column can be used In a WHERE clause with any + * operator other than LIKE. + * + * - Allowed operators: comparison, quantified comparison, BETWEEN, + * DISTINCT, IN, MATCH, and UNIQUE. + */ + SEARCHABLE_BASIC = 2; + + /** + * Indicates that the column can be used in a WHERE clause using any operator. + */ + SEARCHABLE_FULL = 3; + } + + /* + * Represents a request to retrieve information about data type supported on a Flight SQL enabled backend. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned schema will be: + * < + * type_name: utf8 not null (The name of the data type, for example: VARCHAR, INTEGER, etc), + * data_type: int32 not null (The SQL data type), + * column_size: int32 (The maximum size supported by that column. + * In case of exact numeric types, this represents the maximum precision. + * In case of string types, this represents the character length. 
+ * In case of datetime data types, this represents the length in characters of the string representation. + * NULL is returned for data types where column size is not applicable.), + * literal_prefix: utf8 (Character or characters used to prefix a literal, NULL is returned for + * data types where a literal prefix is not applicable.), + * literal_suffix: utf8 (Character or characters used to terminate a literal, + * NULL is returned for data types where a literal suffix is not applicable.), + * create_params: list + * (A list of keywords corresponding to which parameters can be used when creating + * a column for that specific type. + * NULL is returned if there are no parameters for the data type definition.), + * nullable: int32 not null (Shows if the data type accepts a NULL value. The possible values can be seen in the + * Nullable enum.), + * case_sensitive: bool not null (Shows if a character data type is case-sensitive in collations and comparisons), + * searchable: int32 not null (Shows how the data type is used in a WHERE clause. The possible values can be seen in the + * Searchable enum.), + * unsigned_attribute: bool (Shows if the data type is unsigned. NULL is returned if the attribute is + * not applicable to the data type or the data type is not numeric.), + * fixed_prec_scale: bool not null (Shows if the data type has predefined fixed precision and scale.), + * auto_increment: bool (Shows if the data type is auto incremental. NULL is returned if the attribute + * is not applicable to the data type or the data type is not numeric.), + * local_type_name: utf8 (Localized version of the data source-dependent name of the data type. NULL + * is returned if a localized name is not supported by the data source), + * minimum_scale: int32 (The minimum scale of the data type on the data source. + * If a data type has a fixed scale, the MINIMUM_SCALE and MAXIMUM_SCALE + * columns both contain this value. NULL is returned if scale is not applicable.), + * maximum_scale: int32 (The maximum scale of the data type on the data source. + * NULL is returned if scale is not applicable.), + * sql_data_type: int32 not null (The value of the SQL DATA TYPE which has the same values + * as data_type value. Except for interval and datetime, which + * uses generic values. More info about those types can be + * obtained through datetime_subcode. The possible values can be seen + * in the XdbcDataType enum.), + * datetime_subcode: int32 (Only used when the SQL DATA TYPE is interval or datetime. It contains + * its sub types. For type different from interval and datetime, this value + * is NULL. The possible values can be seen in the XdbcDatetimeSubcode enum.), + * num_prec_radix: int32 (If the data type is an approximate numeric type, this column contains + * the value 2 to indicate that COLUMN_SIZE specifies a number of bits. For + * exact numeric types, this column contains the value 10 to indicate that + * column size specifies a number of decimal digits. Otherwise, this column is NULL.), + * interval_precision: int32 (If the data type is an interval data type, then this column contains the value + * of the interval leading precision. Otherwise, this column is NULL. This fields + * is only relevant to be used by ODBC). + * > + * The returned data should be ordered by data_type and then by type_name. + */ + message CommandGetXdbcTypeInfo { + option (experimental) = true; + + /* + * Specifies the data type to search for the info. 
+ */ + optional int32 data_type = 1; + } + + /* + * Represents a request to retrieve the list of catalogs on a Flight SQL enabled backend. + * The definition of a catalog depends on vendor/implementation. It is usually the database itself + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned Arrow schema will be: + * < + * catalog_name: utf8 not null + * > + * The returned data should be ordered by catalog_name. + */ + message CommandGetCatalogs { + option (experimental) = true; + } + + /* + * Represents a request to retrieve the list of database schemas on a Flight SQL enabled backend. + * The definition of a database schema depends on vendor/implementation. It is usually a collection of tables. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned Arrow schema will be: + * < + * catalog_name: utf8, + * db_schema_name: utf8 not null + * > + * The returned data should be ordered by catalog_name, then db_schema_name. + */ + message CommandGetDbSchemas { + option (experimental) = true; + + /* + * Specifies the Catalog to search for the tables. + * An empty string retrieves those without a catalog. + * If omitted the catalog name should not be used to narrow the search. + */ + optional string catalog = 1; + + /* + * Specifies a filter pattern for schemas to search for. + * When no db_schema_filter_pattern is provided, the pattern will not be used to narrow the search. + * In the pattern string, two special characters can be used to denote matching rules: + * - "%" means to match any substring with 0 or more characters. + * - "_" means to match any one character. + */ + optional string db_schema_filter_pattern = 2; + } + + /* + * Represents a request to retrieve the list of tables, and optionally their schemas, on a Flight SQL enabled backend. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned Arrow schema will be: + * < + * catalog_name: utf8, + * db_schema_name: utf8, + * table_name: utf8 not null, + * table_type: utf8 not null, + * [optional] table_schema: bytes not null (schema of the table as described in Schema.fbs::Schema, + * it is serialized as an IPC message.) + * > + * Fields on table_schema may contain the following metadata: + * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name + * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name + * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name + * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. + * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size + * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable + * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. 
+ * The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. + */ + message CommandGetTables { + option (experimental) = true; + + /* + * Specifies the Catalog to search for the tables. + * An empty string retrieves those without a catalog. + * If omitted the catalog name should not be used to narrow the search. + */ + optional string catalog = 1; + + /* + * Specifies a filter pattern for schemas to search for. + * When no db_schema_filter_pattern is provided, all schemas matching other filters are searched. + * In the pattern string, two special characters can be used to denote matching rules: + * - "%" means to match any substring with 0 or more characters. + * - "_" means to match any one character. + */ + optional string db_schema_filter_pattern = 2; + + /* + * Specifies a filter pattern for tables to search for. + * When no table_name_filter_pattern is provided, all tables matching other filters are searched. + * In the pattern string, two special characters can be used to denote matching rules: + * - "%" means to match any substring with 0 or more characters. + * - "_" means to match any one character. + */ + optional string table_name_filter_pattern = 3; + + /* + * Specifies a filter of table types which must match. + * The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. + * TABLE, VIEW, and SYSTEM TABLE are commonly supported. + */ + repeated string table_types = 4; + + // Specifies if the Arrow schema should be returned for found tables. + bool include_schema = 5; + } + + /* + * Represents a request to retrieve the list of table types on a Flight SQL enabled backend. + * The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. + * TABLE, VIEW, and SYSTEM TABLE are commonly supported. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned Arrow schema will be: + * < + * table_type: utf8 not null + * > + * The returned data should be ordered by table_type. + */ + message CommandGetTableTypes { + option (experimental) = true; + } + + /* + * Represents a request to retrieve the primary keys of a table on a Flight SQL enabled backend. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned Arrow schema will be: + * < + * catalog_name: utf8, + * db_schema_name: utf8, + * table_name: utf8 not null, + * column_name: utf8 not null, + * key_name: utf8, + * key_sequence: int32 not null + * > + * The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. + */ + message CommandGetPrimaryKeys { + option (experimental) = true; + + /* + * Specifies the catalog to search for the table. + * An empty string retrieves those without a catalog. + * If omitted the catalog name should not be used to narrow the search. + */ + optional string catalog = 1; + + /* + * Specifies the schema to search for the table. + * An empty string retrieves those without a schema. + * If omitted the schema name should not be used to narrow the search. + */ + optional string db_schema = 2; + + // Specifies the table to get the primary keys for. 
+ string table = 3; + } + + enum UpdateDeleteRules { + CASCADE = 0; + RESTRICT = 1; + SET_NULL = 2; + NO_ACTION = 3; + SET_DEFAULT = 4; + } + + /* + * Represents a request to retrieve a description of the foreign key columns that reference the given table's + * primary key columns (the foreign keys exported by a table) of a table on a Flight SQL enabled backend. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned Arrow schema will be: + * < + * pk_catalog_name: utf8, + * pk_db_schema_name: utf8, + * pk_table_name: utf8 not null, + * pk_column_name: utf8 not null, + * fk_catalog_name: utf8, + * fk_db_schema_name: utf8, + * fk_table_name: utf8 not null, + * fk_column_name: utf8 not null, + * key_sequence: int32 not null, + * fk_key_name: utf8, + * pk_key_name: utf8, + * update_rule: uint8 not null, + * delete_rule: uint8 not null + * > + * The returned data should be ordered by fk_catalog_name, fk_db_schema_name, fk_table_name, fk_key_name, then key_sequence. + * update_rule and delete_rule returns a byte that is equivalent to actions declared on UpdateDeleteRules enum. + */ + message CommandGetExportedKeys { + option (experimental) = true; + + /* + * Specifies the catalog to search for the foreign key table. + * An empty string retrieves those without a catalog. + * If omitted the catalog name should not be used to narrow the search. + */ + optional string catalog = 1; + + /* + * Specifies the schema to search for the foreign key table. + * An empty string retrieves those without a schema. + * If omitted the schema name should not be used to narrow the search. + */ + optional string db_schema = 2; + + // Specifies the foreign key table to get the foreign keys for. + string table = 3; + } + + /* + * Represents a request to retrieve the foreign keys of a table on a Flight SQL enabled backend. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned Arrow schema will be: + * < + * pk_catalog_name: utf8, + * pk_db_schema_name: utf8, + * pk_table_name: utf8 not null, + * pk_column_name: utf8 not null, + * fk_catalog_name: utf8, + * fk_db_schema_name: utf8, + * fk_table_name: utf8 not null, + * fk_column_name: utf8 not null, + * key_sequence: int32 not null, + * fk_key_name: utf8, + * pk_key_name: utf8, + * update_rule: uint8 not null, + * delete_rule: uint8 not null + * > + * The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. + * update_rule and delete_rule returns a byte that is equivalent to actions: + * - 0 = CASCADE + * - 1 = RESTRICT + * - 2 = SET NULL + * - 3 = NO ACTION + * - 4 = SET DEFAULT + */ + message CommandGetImportedKeys { + option (experimental) = true; + + /* + * Specifies the catalog to search for the primary key table. + * An empty string retrieves those without a catalog. + * If omitted the catalog name should not be used to narrow the search. + */ + optional string catalog = 1; + + /* + * Specifies the schema to search for the primary key table. + * An empty string retrieves those without a schema. + * If omitted the schema name should not be used to narrow the search. + */ + optional string db_schema = 2; + + // Specifies the primary key table to get the foreign keys for. 
+ string table = 3; + } + + /* + * Represents a request to retrieve a description of the foreign key columns in the given foreign key table that + * reference the primary key or the columns representing a unique constraint of the parent table (could be the same + * or a different table) on a Flight SQL enabled backend. + * Used in the command member of FlightDescriptor for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * - GetFlightInfo: execute the catalog metadata request. + * + * The returned Arrow schema will be: + * < + * pk_catalog_name: utf8, + * pk_db_schema_name: utf8, + * pk_table_name: utf8 not null, + * pk_column_name: utf8 not null, + * fk_catalog_name: utf8, + * fk_db_schema_name: utf8, + * fk_table_name: utf8 not null, + * fk_column_name: utf8 not null, + * key_sequence: int32 not null, + * fk_key_name: utf8, + * pk_key_name: utf8, + * update_rule: uint8 not null, + * delete_rule: uint8 not null + * > + * The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. + * update_rule and delete_rule returns a byte that is equivalent to actions: + * - 0 = CASCADE + * - 1 = RESTRICT + * - 2 = SET NULL + * - 3 = NO ACTION + * - 4 = SET DEFAULT + */ + message CommandGetCrossReference { + option (experimental) = true; + + /** + * The catalog name where the parent table is. + * An empty string retrieves those without a catalog. + * If omitted the catalog name should not be used to narrow the search. + */ + optional string pk_catalog = 1; + + /** + * The Schema name where the parent table is. + * An empty string retrieves those without a schema. + * If omitted the schema name should not be used to narrow the search. + */ + optional string pk_db_schema = 2; + + /** + * The parent table name. It cannot be null. + */ + string pk_table = 3; + + /** + * The catalog name where the foreign table is. + * An empty string retrieves those without a catalog. + * If omitted the catalog name should not be used to narrow the search. + */ + optional string fk_catalog = 4; + + /** + * The schema name where the foreign table is. + * An empty string retrieves those without a schema. + * If omitted the schema name should not be used to narrow the search. + */ + optional string fk_db_schema = 5; + + /** + * The foreign table name. It cannot be null. + */ + string fk_table = 6; + } + + // Query Execution Action Messages + + /* + * Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. + */ + message ActionCreatePreparedStatementRequest { + option (experimental) = true; + + // The valid SQL string to create a prepared statement for. + string query = 1; + // Create/execute the prepared statement as part of this transaction (if + // unset, executions of the prepared statement will be auto-committed). + optional bytes transaction_id = 2; + } + + /* + * An embedded message describing a Substrait plan to execute. + */ + message SubstraitPlan { + option (experimental) = true; + + // The serialized substrait.Plan to create a prepared statement for. + // XXX(ARROW-16902): this is bytes instead of an embedded message + // because Protobuf does not really support one DLL using Protobuf + // definitions from another DLL. + bytes plan = 1; + // The Substrait release, e.g. "0.12.0". This information is not + // tracked in the plan itself, so this is the only way for consumers + // to potentially know if they can handle the plan. 
+ string version = 2; + } + + /* + * Request message for the "CreatePreparedSubstraitPlan" action on a Flight SQL enabled backend. + */ + message ActionCreatePreparedSubstraitPlanRequest { + option (experimental) = true; + + // The serialized substrait.Plan to create a prepared statement for. + SubstraitPlan plan = 1; + // Create/execute the prepared statement as part of this transaction (if + // unset, executions of the prepared statement will be auto-committed). + optional bytes transaction_id = 2; + } + + /* + * Wrap the result of a "CreatePreparedStatement" or "CreatePreparedSubstraitPlan" action. + * + * The resultant PreparedStatement can be closed either: + * - Manually, through the "ClosePreparedStatement" action; + * - Automatically, by a server timeout. + * + * The result should be wrapped in a google.protobuf.Any message. + */ + message ActionCreatePreparedStatementResult { + option (experimental) = true; + + // Opaque handle for the prepared statement on the server. + bytes prepared_statement_handle = 1; + + // If a result set generating query was provided, dataset_schema contains the + // schema of the dataset as described in Schema.fbs::Schema, it is serialized as an IPC message. + bytes dataset_schema = 2; + + // If the query provided contained parameters, parameter_schema contains the + // schema of the expected parameters as described in Schema.fbs::Schema, it is serialized as an IPC message. + bytes parameter_schema = 3; + } + + /* + * Request message for the "ClosePreparedStatement" action on a Flight SQL enabled backend. + * Closes server resources associated with the prepared statement handle. + */ + message ActionClosePreparedStatementRequest { + option (experimental) = true; + + // Opaque handle for the prepared statement on the server. + bytes prepared_statement_handle = 1; + } + + /* + * Request message for the "BeginTransaction" action. + * Begins a transaction. + */ + message ActionBeginTransactionRequest { + option (experimental) = true; + } + + /* + * Request message for the "BeginSavepoint" action. + * Creates a savepoint within a transaction. + * + * Only supported if FLIGHT_SQL_TRANSACTION is + * FLIGHT_SQL_TRANSACTION_SUPPORT_SAVEPOINT. + */ + message ActionBeginSavepointRequest { + option (experimental) = true; + + // The transaction to which a savepoint belongs. + bytes transaction_id = 1; + // Name for the savepoint. + string name = 2; + } + + /* + * The result of a "BeginTransaction" action. + * + * The transaction can be manipulated with the "EndTransaction" action, or + * automatically via server timeout. If the transaction times out, then it is + * automatically rolled back. + * + * The result should be wrapped in a google.protobuf.Any message. + */ + message ActionBeginTransactionResult { + option (experimental) = true; + + // Opaque handle for the transaction on the server. + bytes transaction_id = 1; + } + + /* + * The result of a "BeginSavepoint" action. + * + * The transaction can be manipulated with the "EndSavepoint" action. + * If the associated transaction is committed, rolled back, or times + * out, then the savepoint is also invalidated. + * + * The result should be wrapped in a google.protobuf.Any message. + */ + message ActionBeginSavepointResult { + option (experimental) = true; + + // Opaque handle for the savepoint on the server. + bytes savepoint_id = 1; + } + + /* + * Request message for the "EndTransaction" action. + * + * Commit (COMMIT) or rollback (ROLLBACK) the transaction. 
+ * + * If the action completes successfully, the transaction handle is + * invalidated, as are all associated savepoints. + */ + message ActionEndTransactionRequest { + option (experimental) = true; + + enum EndTransaction { + END_TRANSACTION_UNSPECIFIED = 0; + // Commit the transaction. + END_TRANSACTION_COMMIT = 1; + // Roll back the transaction. + END_TRANSACTION_ROLLBACK = 2; + } + // Opaque handle for the transaction on the server. + bytes transaction_id = 1; + // Whether to commit/rollback the given transaction. + EndTransaction action = 2; + } + + /* + * Request message for the "EndSavepoint" action. + * + * Release (RELEASE) the savepoint or rollback (ROLLBACK) to the + * savepoint. + * + * Releasing a savepoint invalidates that savepoint. Rolling back to + * a savepoint does not invalidate the savepoint, but invalidates all + * savepoints created after the current savepoint. + */ + message ActionEndSavepointRequest { + option (experimental) = true; + + enum EndSavepoint { + END_SAVEPOINT_UNSPECIFIED = 0; + // Release the savepoint. + END_SAVEPOINT_RELEASE = 1; + // Roll back to a savepoint. + END_SAVEPOINT_ROLLBACK = 2; + } + // Opaque handle for the savepoint on the server. + bytes savepoint_id = 1; + // Whether to rollback/release the given savepoint. + EndSavepoint action = 2; + } + + // Query Execution Messages. + + /* + * Represents a SQL query. Used in the command member of FlightDescriptor + * for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * Fields on this schema may contain the following metadata: + * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name + * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name + * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name + * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. + * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size + * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable + * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. + * - GetFlightInfo: execute the query. + */ + message CommandStatementQuery { + option (experimental) = true; + + // The SQL syntax. + string query = 1; + // Include the query as part of this transaction (if unset, the query is auto-committed). + optional bytes transaction_id = 2; + } + + /* + * Represents a Substrait plan. Used in the command member of FlightDescriptor + * for the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * Fields on this schema may contain the following metadata: + * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name + * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name + * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name + * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. + * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size + * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable + * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. 
+ * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. + * - GetFlightInfo: execute the query. + * - DoPut: execute the query. + */ + message CommandStatementSubstraitPlan { + option (experimental) = true; + + // A serialized substrait.Plan + SubstraitPlan plan = 1; + // Include the query as part of this transaction (if unset, the query is auto-committed). + optional bytes transaction_id = 2; + } + + /** + * Represents a ticket resulting from GetFlightInfo with a CommandStatementQuery. + * This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this. + */ + message TicketStatementQuery { + option (experimental) = true; + + // Unique identifier for the instance of the statement to execute. + bytes statement_handle = 1; + } + + /* + * Represents an instance of executing a prepared statement. Used in the command member of FlightDescriptor for + * the following RPC calls: + * - GetSchema: return the Arrow schema of the query. + * Fields on this schema may contain the following metadata: + * - ARROW:FLIGHT:SQL:CATALOG_NAME - Table's catalog name + * - ARROW:FLIGHT:SQL:DB_SCHEMA_NAME - Database schema name + * - ARROW:FLIGHT:SQL:TABLE_NAME - Table name + * - ARROW:FLIGHT:SQL:TYPE_NAME - The data source-specific name for the data type of the column. + * - ARROW:FLIGHT:SQL:PRECISION - Column precision/size + * - ARROW:FLIGHT:SQL:SCALE - Column scale/decimal digits if applicable + * - ARROW:FLIGHT:SQL:IS_AUTO_INCREMENT - "1" indicates if the column is auto incremented, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_CASE_SENSITIVE - "1" indicates if the column is case sensitive, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_READ_ONLY - "1" indicates if the column is read only, "0" otherwise. + * - ARROW:FLIGHT:SQL:IS_SEARCHABLE - "1" indicates if the column is searchable via WHERE clause, "0" otherwise. + * - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. + * - GetFlightInfo: execute the prepared statement instance. + */ + message CommandPreparedStatementQuery { + option (experimental) = true; + + // Opaque handle for the prepared statement on the server. + bytes prepared_statement_handle = 1; + } + + /* + * Represents a SQL update query. Used in the command member of FlightDescriptor + * for the the RPC call DoPut to cause the server to execute the included SQL update. + */ + message CommandStatementUpdate { + option (experimental) = true; + + // The SQL syntax. + string query = 1; + // Include the query as part of this transaction (if unset, the query is auto-committed). + optional bytes transaction_id = 2; + } + + /* + * Represents a SQL update query. Used in the command member of FlightDescriptor + * for the the RPC call DoPut to cause the server to execute the included + * prepared statement handle as an update. + */ + message CommandPreparedStatementUpdate { + option (experimental) = true; + + // Opaque handle for the prepared statement on the server. + bytes prepared_statement_handle = 1; + } + + /* + * Returned from the RPC call DoPut when a CommandStatementUpdate + * CommandPreparedStatementUpdate was in the request, containing + * results from the update. + */ + message DoPutUpdateResult { + option (experimental) = true; + + // The number of records updated. A return value of -1 represents + // an unknown updated record count. 
+ int64 record_count = 1; + } + + /* + * Request message for the "CancelQuery" action. + * + * Explicitly cancel a running query. + * + * This lets a single client explicitly cancel work, no matter how many clients + * are involved/whether the query is distributed or not, given server support. + * The transaction/statement is not rolled back; it is the application's job to + * commit or rollback as appropriate. This only indicates the client no longer + * wishes to read the remainder of the query results or continue submitting + * data. + * + * This command is idempotent. + */ + message ActionCancelQueryRequest { + option (experimental) = true; + + // The result of the GetFlightInfo RPC that initiated the query. + // XXX(ARROW-16902): this must be a serialized FlightInfo, but is + // rendered as bytes because Protobuf does not really support one + // DLL using Protobuf definitions from another DLL. + bytes info = 1; + } + + /* + * The result of cancelling a query. + * + * The result should be wrapped in a google.protobuf.Any message. + */ + message ActionCancelQueryResult { + option (experimental) = true; + + enum CancelResult { + // The cancellation status is unknown. Servers should avoid using + // this value (send a NOT_FOUND error if the requested query is + // not known). Clients can retry the request. + CANCEL_RESULT_UNSPECIFIED = 0; + // The cancellation request is complete. Subsequent requests with + // the same payload may return CANCELLED or a NOT_FOUND error. + CANCEL_RESULT_CANCELLED = 1; + // The cancellation request is in progress. The client may retry + // the cancellation request. + CANCEL_RESULT_CANCELLING = 2; + // The query is not cancellable. The client should not retry the + // cancellation request. + CANCEL_RESULT_NOT_CANCELLABLE = 3; + } + + CancelResult result = 1; + } + + extend google.protobuf.MessageOptions { + bool experimental = 1000; + } \ No newline at end of file From 3adca539ad9e1b27892a5ef38ac2780aff4c0bff Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 23 May 2023 18:15:00 +0100 Subject: [PATCH 0923/1411] Convert parquet metadata back to builders (#4265) --- parquet/src/file/metadata.rs | 139 ++++++++++++----------------------- 1 file changed, 46 insertions(+), 93 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index c2961aa76d06..40f6cf3123c7 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -365,78 +365,69 @@ impl RowGroupMetaData { ordinal: None, } } + + /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`] + pub fn into_builder(self) -> RowGroupMetaDataBuilder { + RowGroupMetaDataBuilder(self) + } } /// Builder for row group metadata. -pub struct RowGroupMetaDataBuilder { - columns: Vec, - schema_descr: SchemaDescPtr, - num_rows: i64, - sorting_columns: Option>, - total_byte_size: i64, -} +pub struct RowGroupMetaDataBuilder(RowGroupMetaData); impl RowGroupMetaDataBuilder { /// Creates new builder from schema descriptor. fn new(schema_descr: SchemaDescPtr) -> Self { - Self { + Self(RowGroupMetaData { columns: Vec::with_capacity(schema_descr.num_columns()), schema_descr, num_rows: 0, sorting_columns: None, total_byte_size: 0, - } + }) } /// Sets number of rows in this row group. 
pub fn set_num_rows(mut self, value: i64) -> Self { - self.num_rows = value; + self.0.num_rows = value; self } /// Sets the sorting order for columns pub fn set_sorting_columns(mut self, value: Option>) -> Self { - self.sorting_columns = value; + self.0.sorting_columns = value; self } /// Sets total size in bytes for this row group. pub fn set_total_byte_size(mut self, value: i64) -> Self { - self.total_byte_size = value; + self.0.total_byte_size = value; self } /// Sets column metadata for this row group. pub fn set_column_metadata(mut self, value: Vec) -> Self { - self.columns = value; + self.0.columns = value; self } /// Builds row group metadata. pub fn build(self) -> Result { - if self.schema_descr.num_columns() != self.columns.len() { + if self.0.schema_descr.num_columns() != self.0.columns.len() { return Err(general_err!( "Column length mismatch: {} != {}", - self.schema_descr.num_columns(), - self.columns.len() + self.0.schema_descr.num_columns(), + self.0.columns.len() )); } - Ok(RowGroupMetaData { - columns: self.columns, - num_rows: self.num_rows, - sorting_columns: self.sorting_columns, - total_byte_size: self.total_byte_size, - schema_descr: self.schema_descr, - }) + Ok(self.0) } } /// Metadata for a column chunk. #[derive(Debug, Clone, PartialEq)] pub struct ColumnChunkMetaData { - column_type: Type, - column_path: ColumnPath, column_descr: ColumnDescPtr, encodings: Vec, file_path: Option, @@ -479,12 +470,12 @@ impl ColumnChunkMetaData { /// Type of this column. Must be primitive. pub fn column_type(&self) -> Type { - self.column_type + self.column_descr.physical_type() } /// Path (or identifier) of this column. pub fn column_path(&self) -> &ColumnPath { - &self.column_path + self.column_descr.path() } /// Descriptor for this column. @@ -609,7 +600,6 @@ impl ColumnChunkMetaData { } let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap(); let column_type = Type::try_from(col_metadata.type_)?; - let column_path = ColumnPath::new(col_metadata.path_in_schema); let encodings = col_metadata .encodings .drain(0..) @@ -641,8 +631,6 @@ impl ColumnChunkMetaData { let column_index_length = cc.column_index_length; let result = ColumnChunkMetaData { - column_type, - column_path, column_descr, encodings, file_path, @@ -685,9 +673,9 @@ impl ColumnChunkMetaData { /// Method to convert to Thrift `ColumnMetaData` pub fn to_column_metadata_thrift(&self) -> ColumnMetaData { ColumnMetaData { - type_: self.column_type.into(), + type_: self.column_type().into(), encodings: self.encodings().iter().map(|&v| v.into()).collect(), - path_in_schema: Vec::from(self.column_path.as_ref()), + path_in_schema: self.column_path().as_ref().to_vec(), codec: self.compression.into(), num_values: self.num_values, total_uncompressed_size: self.total_uncompressed_size, @@ -704,34 +692,20 @@ impl ColumnChunkMetaData { bloom_filter_offset: self.bloom_filter_offset, } } + + /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`] + pub fn into_builder(self) -> ColumnChunkMetaDataBuilder { + ColumnChunkMetaDataBuilder(self) + } } /// Builder for column chunk metadata. 
-pub struct ColumnChunkMetaDataBuilder { - column_descr: ColumnDescPtr, - encodings: Vec, - file_path: Option, - file_offset: i64, - num_values: i64, - compression: Compression, - total_compressed_size: i64, - total_uncompressed_size: i64, - data_page_offset: i64, - index_page_offset: Option, - dictionary_page_offset: Option, - statistics: Option, - encoding_stats: Option>, - bloom_filter_offset: Option, - offset_index_offset: Option, - offset_index_length: Option, - column_index_offset: Option, - column_index_length: Option, -} +pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData); impl ColumnChunkMetaDataBuilder { /// Creates new column chunk metadata builder. fn new(column_descr: ColumnDescPtr) -> Self { - Self { + Self(ColumnChunkMetaData { column_descr, encodings: Vec::new(), file_path: None, @@ -750,135 +724,114 @@ impl ColumnChunkMetaDataBuilder { offset_index_length: None, column_index_offset: None, column_index_length: None, - } + }) } /// Sets list of encodings for this column chunk. pub fn set_encodings(mut self, encodings: Vec) -> Self { - self.encodings = encodings; + self.0.encodings = encodings; self } /// Sets optional file path for this column chunk. pub fn set_file_path(mut self, value: String) -> Self { - self.file_path = Some(value); + self.0.file_path = Some(value); self } /// Sets file offset in bytes. pub fn set_file_offset(mut self, value: i64) -> Self { - self.file_offset = value; + self.0.file_offset = value; self } /// Sets number of values. pub fn set_num_values(mut self, value: i64) -> Self { - self.num_values = value; + self.0.num_values = value; self } /// Sets compression. pub fn set_compression(mut self, value: Compression) -> Self { - self.compression = value; + self.0.compression = value; self } /// Sets total compressed size in bytes. pub fn set_total_compressed_size(mut self, value: i64) -> Self { - self.total_compressed_size = value; + self.0.total_compressed_size = value; self } /// Sets total uncompressed size in bytes. pub fn set_total_uncompressed_size(mut self, value: i64) -> Self { - self.total_uncompressed_size = value; + self.0.total_uncompressed_size = value; self } /// Sets data page offset in bytes. pub fn set_data_page_offset(mut self, value: i64) -> Self { - self.data_page_offset = value; + self.0.data_page_offset = value; self } /// Sets optional dictionary page ofset in bytes. pub fn set_dictionary_page_offset(mut self, value: Option) -> Self { - self.dictionary_page_offset = value; + self.0.dictionary_page_offset = value; self } /// Sets optional index page offset in bytes. pub fn set_index_page_offset(mut self, value: Option) -> Self { - self.index_page_offset = value; + self.0.index_page_offset = value; self } /// Sets statistics for this column chunk. pub fn set_statistics(mut self, value: Statistics) -> Self { - self.statistics = Some(value); + self.0.statistics = Some(value); self } /// Sets page encoding stats for this column chunk. pub fn set_page_encoding_stats(mut self, value: Vec) -> Self { - self.encoding_stats = Some(value); + self.0.encoding_stats = Some(value); self } /// Sets optional bloom filter offset in bytes. pub fn set_bloom_filter_offset(mut self, value: Option) -> Self { - self.bloom_filter_offset = value; + self.0.bloom_filter_offset = value; self } /// Sets optional offset index offset in bytes. pub fn set_offset_index_offset(mut self, value: Option) -> Self { - self.offset_index_offset = value; + self.0.offset_index_offset = value; self } /// Sets optional offset index length in bytes. 
pub fn set_offset_index_length(mut self, value: Option) -> Self { - self.offset_index_length = value; + self.0.offset_index_length = value; self } /// Sets optional column index offset in bytes. pub fn set_column_index_offset(mut self, value: Option) -> Self { - self.column_index_offset = value; + self.0.column_index_offset = value; self } /// Sets optional column index length in bytes. pub fn set_column_index_length(mut self, value: Option) -> Self { - self.column_index_length = value; + self.0.column_index_length = value; self } /// Builds column chunk metadata. pub fn build(self) -> Result { - Ok(ColumnChunkMetaData { - column_type: self.column_descr.physical_type(), - column_path: self.column_descr.path().clone(), - column_descr: self.column_descr, - encodings: self.encodings, - file_path: self.file_path, - file_offset: self.file_offset, - num_values: self.num_values, - compression: self.compression, - total_compressed_size: self.total_compressed_size, - total_uncompressed_size: self.total_uncompressed_size, - data_page_offset: self.data_page_offset, - index_page_offset: self.index_page_offset, - dictionary_page_offset: self.dictionary_page_offset, - statistics: self.statistics, - encoding_stats: self.encoding_stats, - bloom_filter_offset: self.bloom_filter_offset, - offset_index_offset: self.offset_index_offset, - offset_index_length: self.offset_index_length, - column_index_offset: self.column_index_offset, - column_index_length: self.column_index_length, - }) + Ok(self.0) } } From 58e2c1c1691a9bd5e81dbe3fea99d9eb949e4c7a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 24 May 2023 14:40:20 +0100 Subject: [PATCH 0924/1411] Add splice column API (#4155) (#4269) * Add splice column API (#4155) * Review feedback * Re-encode offset index --- parquet/src/arrow/arrow_writer/byte_array.rs | 4 +- parquet/src/file/writer.rs | 233 ++++++++++++++++--- 2 files changed, 201 insertions(+), 36 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index 24dae4f20d64..77f9598b23fe 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -104,12 +104,12 @@ impl<'a> ByteArrayWriter<'a> { /// Returns a new [`ByteArrayWriter`] pub fn new( descr: ColumnDescPtr, - props: &'a WriterPropertiesPtr, + props: WriterPropertiesPtr, page_writer: Box, on_close: OnCloseColumnChunk<'a>, ) -> Result { Ok(Self { - writer: GenericColumnWriter::new(descr, props.clone(), page_writer), + writer: GenericColumnWriter::new(descr, props, page_writer), on_close: Some(on_close), }) } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 9923970bedde..b4ae777bb131 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -21,7 +21,7 @@ use crate::bloom_filter::Sbbf; use crate::format as parquet; use crate::format::{ColumnIndex, OffsetIndex, RowGroup}; -use std::io::{BufWriter, IoSlice}; +use std::io::{BufWriter, IoSlice, Read}; use std::{io::Write, sync::Arc}; use thrift::protocol::{TCompactOutputProtocol, TSerializable}; @@ -35,6 +35,7 @@ use crate::column::{ }; use crate::data_type::DataType; use crate::errors::{ParquetError, Result}; +use crate::file::reader::ChunkReader; use crate::file::{ metadata::*, properties::WriterPropertiesPtr, statistics::to_thrift as statistics_to_thrift, PARQUET_MAGIC, @@ -423,27 +424,15 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { } } - /// Returns the next column writer, if 
available, using the factory function; - /// otherwise returns `None`. - pub(crate) fn next_column_with_factory<'b, F, C>( - &'b mut self, - factory: F, - ) -> Result> - where - F: FnOnce( - ColumnDescPtr, - &'b WriterPropertiesPtr, - Box, - OnCloseColumnChunk<'b>, - ) -> Result, - { - self.assert_previous_writer_closed()?; - - if self.column_index >= self.descr.num_columns() { - return Ok(None); - } - let page_writer = Box::new(SerializedPageWriter::new(self.buf)); + /// Advance `self.column_index` returning the next [`ColumnDescPtr`] if any + fn next_column_desc(&mut self) -> Option { + let ret = self.descr.columns().get(self.column_index)?.clone(); + self.column_index += 1; + Some(ret) + } + /// Returns [`OnCloseColumnChunk`] for the next writer + fn get_on_close(&mut self) -> (&mut TrackedWrite, OnCloseColumnChunk<'_>) { let total_bytes_written = &mut self.total_bytes_written; let total_uncompressed_bytes = &mut self.total_uncompressed_bytes; let total_rows_written = &mut self.total_rows_written; @@ -475,16 +464,33 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { Ok(()) }; + (self.buf, Box::new(on_close)) + } - let column = self.descr.column(self.column_index); - self.column_index += 1; - - Ok(Some(factory( - column, - &self.props, - page_writer, - Box::new(on_close), - )?)) + /// Returns the next column writer, if available, using the factory function; + /// otherwise returns `None`. + pub(crate) fn next_column_with_factory<'b, F, C>( + &'b mut self, + factory: F, + ) -> Result> + where + F: FnOnce( + ColumnDescPtr, + WriterPropertiesPtr, + Box, + OnCloseColumnChunk<'b>, + ) -> Result, + { + self.assert_previous_writer_closed()?; + Ok(match self.next_column_desc() { + Some(column) => { + let props = self.props.clone(); + let (buf, on_close) = self.get_on_close(); + let page_writer = Box::new(SerializedPageWriter::new(buf)); + Some(factory(column, props, page_writer, Box::new(on_close))?) + } + None => None, + }) } /// Returns the next column writer, if available; otherwise returns `None`. @@ -492,11 +498,81 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { /// closed returns `Err`. 
pub fn next_column(&mut self) -> Result>> { self.next_column_with_factory(|descr, props, page_writer, on_close| { - let column_writer = get_column_writer(descr, props.clone(), page_writer); + let column_writer = get_column_writer(descr, props, page_writer); Ok(SerializedColumnWriter::new(column_writer, Some(on_close))) }) } + /// Append an encoded column chunk from another source without decoding it + /// + /// This can be used for efficiently concatenating or projecting parquet data, + /// or encoding parquet data to temporary in-memory buffers + /// + /// See [`Self::next_column`] for writing data that isn't already encoded + pub fn append_column( + &mut self, + reader: &R, + mut close: ColumnCloseResult, + ) -> Result<()> { + self.assert_previous_writer_closed()?; + let desc = self.next_column_desc().ok_or_else(|| { + general_err!("exhausted columns in SerializedRowGroupWriter") + })?; + + let metadata = close.metadata; + + if metadata.column_descr() != desc.as_ref() { + return Err(general_err!( + "column descriptor mismatch, expected {:?} got {:?}", + desc, + metadata.column_descr() + )); + } + + let src_dictionary_offset = metadata.dictionary_page_offset(); + let src_data_offset = metadata.data_page_offset(); + let src_offset = src_dictionary_offset.unwrap_or(src_data_offset); + let src_length = metadata.compressed_size(); + + let write_offset = self.buf.bytes_written(); + let mut read = reader.get_read(src_offset as _)?.take(src_length as _); + let write_length = std::io::copy(&mut read, &mut self.buf)?; + + if src_length as u64 != write_length { + return Err(general_err!( + "Failed to splice column data, expected {read_length} got {write_length}" + )); + } + + let file_offset = self.buf.bytes_written() as i64; + + let map_offset = |x| x - src_offset + write_offset as i64; + let mut builder = ColumnChunkMetaData::builder(metadata.column_descr_ptr()) + .set_compression(metadata.compression()) + .set_encodings(metadata.encodings().clone()) + .set_file_offset(file_offset) + .set_total_compressed_size(metadata.compressed_size()) + .set_total_uncompressed_size(metadata.uncompressed_size()) + .set_num_values(metadata.num_values()) + .set_data_page_offset(map_offset(src_data_offset)) + .set_dictionary_page_offset(src_dictionary_offset.map(map_offset)); + + if let Some(statistics) = metadata.statistics() { + builder = builder.set_statistics(statistics.clone()) + } + close.metadata = builder.build()?; + + if let Some(offsets) = close.offset_index.as_mut() { + for location in &mut offsets.page_locations { + location.offset = map_offset(location.offset) + } + } + + SerializedPageWriter::new(self.buf).write_metadata(&metadata)?; + let (_, on_close) = self.get_on_close(); + on_close(close) + } + /// Closes this row group writer and returns row group metadata. pub fn close(mut self) -> Result { if self.row_group_metadata.is_none() { @@ -516,9 +592,9 @@ impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { if let Some(on_close) = self.on_close.take() { on_close( metadata, - self.bloom_filters.clone(), - self.column_indexes.clone(), - self.offset_indexes.clone(), + self.bloom_filters, + self.column_indexes, + self.offset_indexes, )? 
} } @@ -720,9 +796,11 @@ mod tests { use crate::basic::{Compression, Encoding, LogicalType, Repetition, Type}; use crate::column::page::PageReader; + use crate::column::reader::get_typed_column_reader; use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; use crate::data_type::{BoolType, Int32Type}; use crate::file::reader::ChunkReader; + use crate::file::serialized_reader::ReadOptionsBuilder; use crate::file::{ properties::{ReaderProperties, WriterProperties, WriterVersion}, reader::{FileReader, SerializedFileReader, SerializedPageReader}, @@ -1540,4 +1618,91 @@ mod tests { assert_eq!(s.min_value.as_deref(), Some(1_i32.to_le_bytes().as_ref())); assert_eq!(s.max_value.as_deref(), Some(3_i32.to_le_bytes().as_ref())); } + + #[test] + fn test_spliced_write() { + let message_type = " + message test_schema { + REQUIRED INT32 i32 (INTEGER(32,true)); + REQUIRED INT32 u32 (INTEGER(32,false)); + } + "; + let schema = Arc::new(parse_message_type(message_type).unwrap()); + let props = Arc::new(WriterProperties::builder().build()); + + let mut file = Vec::with_capacity(1024); + let mut file_writer = + SerializedFileWriter::new(&mut file, schema, props.clone()).unwrap(); + + let columns = file_writer.descr.columns(); + let mut column_state: Vec<(_, Option)> = columns + .iter() + .map(|_| (TrackedWrite::new(Vec::with_capacity(1024)), None)) + .collect(); + + let mut column_state_slice = column_state.as_mut_slice(); + let mut column_writers = Vec::with_capacity(columns.len()); + for c in columns { + let ((buf, out), tail) = column_state_slice.split_first_mut().unwrap(); + column_state_slice = tail; + + let page_writer = Box::new(SerializedPageWriter::new(buf)); + let col_writer = get_column_writer(c.clone(), props.clone(), page_writer); + column_writers.push(SerializedColumnWriter::new( + col_writer, + Some(Box::new(|on_close| { + *out = Some(on_close); + Ok(()) + })), + )); + } + + let column_data = [[1, 2, 3, 4], [7, 3, 7, 3]]; + + // Interleaved writing to the column writers + for (writer, batch) in column_writers.iter_mut().zip(column_data) { + let writer = writer.typed::(); + writer.write_batch(&batch, None, None).unwrap(); + } + + // Close the column writers + for writer in column_writers { + writer.close().unwrap() + } + + // Splice column data into a row group + let mut row_group_writer = file_writer.next_row_group().unwrap(); + for (write, close) in column_state { + let buf = Bytes::from(write.into_inner().unwrap()); + row_group_writer + .append_column(&buf, close.unwrap()) + .unwrap(); + } + row_group_writer.close().unwrap(); + file_writer.close().unwrap(); + + // Check data was written correctly + let file = Bytes::from(file); + let test_read = |reader: SerializedFileReader| { + let row_group = reader.get_row_group(0).unwrap(); + + let mut out = [0; 4]; + let c1 = row_group.get_column_reader(0).unwrap(); + let mut c1 = get_typed_column_reader::(c1); + c1.read_batch(4, None, None, &mut out).unwrap(); + assert_eq!(out, column_data[0]); + + let c2 = row_group.get_column_reader(1).unwrap(); + let mut c2 = get_typed_column_reader::(c2); + c2.read_batch(4, None, None, &mut out).unwrap(); + assert_eq!(out, column_data[1]); + }; + + let reader = SerializedFileReader::new(file.clone()).unwrap(); + test_read(reader); + + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = SerializedFileReader::new_with_options(file, options).unwrap(); + test_read(reader); + } } From dbcf22870d2811f5a29c75c50f826de8b7f3bdb4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 24 May 2023 15:37:49 +0100 Subject: [PATCH 0925/1411] Update criterion requirement from 0.4 to 0.5 (#4275) Updates the requirements on [criterion](https://github.com/bheisler/criterion.rs) to permit the latest version. - [Changelog](https://github.com/bheisler/criterion.rs/blob/master/CHANGELOG.md) - [Commits](https://github.com/bheisler/criterion.rs/compare/0.4.0...0.5.0) --- updated-dependencies: - dependency-name: criterion dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-buffer/Cargo.toml | 2 +- arrow-cast/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- parquet/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index c5b0c6c26b0b..1db388db8398 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -38,7 +38,7 @@ num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } [dev-dependencies] -criterion = { version = "0.4", default-features = false } +criterion = { version = "0.5", default-features = false } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } [build-dependencies] diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index e42e75b838ce..a999fe51739d 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -51,7 +51,7 @@ lexical-core = { version = "^0.8", default-features = false, features = ["write- comfy-table = { version = "6.0", optional = true, default-features = false } [dev-dependencies] -criterion = { version = "0.4", default-features = false } +criterion = { version = "0.5", default-features = false } half = { version = "2.1", default-features = false } [build-dependencies] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 58fe54fd1f29..5de03666251b 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -95,7 +95,7 @@ chrono-tz = ["arrow-array/chrono-tz"] [dev-dependencies] chrono = { version = "0.4.23", default-features = false, features = ["clock"] } -criterion = { version = "0.4", default-features = false } +criterion = { version = "0.5", default-features = false } half = { version = "2.1", default-features = false } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } tempfile = { version = "3", default-features = false } diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index f04e1df1d7a7..29f7cda1360f 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -69,7 +69,7 @@ paste = { version = "1.0" } [dev-dependencies] base64 = { version = "0.21", default-features = false, features = ["std"] } -criterion = { version = "0.4", default-features = false } +criterion = { version = "0.5", default-features = false } snap = { version = "1.0", default-features = false } tempfile = { version = "3.0", default-features = false } brotli = { version = "3.3", default-features = false, features = ["std"] } From 136445f5e63501e26e6be21e68a56a5f8e8b1e2e Mon Sep 17 00:00:00 2001 From: Marko Mikulicic Date: Wed, 24 May 2023 19:14:30 +0200 Subject: [PATCH 0926/1411] Strip leading whitespace from flight_sql_client custom header values (#4271) * Strip leading whitespace from flight_sql_client custom header values * Also strip trailing whitespace from flight_sql_client custom header values --- arrow-flight/src/bin/flight_sql_client.rs | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs index a787989bf6b4..e5aacc2e779a 100644 --- a/arrow-flight/src/bin/flight_sql_client.rs +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -49,7 +49,7 @@ where match parts.as_slice() { [key, value] => { let key = K::from_str(key).map_err(|e| e.to_string())?; - let value = V::from_str(value).map_err(|e| e.to_string())?; + let value = V::from_str(value.trim()).map_err(|e| e.to_string())?; Ok(Self { key, value }) } _ => Err(format!( From 741244da7846fdcb0b34d24ea90e77025863a88c Mon Sep 17 00:00:00 2001 From: Dexter Duckworth Date: Wed, 24 May 2023 13:17:22 -0400 Subject: [PATCH 0927/1411] Parquet Reader/writer for fixed-size list arrays (#4267) * Initial implementation for writing fixed-size lists to Parquet. The implementation still needs tests. The implementation uses a new `write_fixed_size_list` method instead of `write_list`. This is done to avoid the overhead of needlessly calculating list offsets. * Initial implementation for reading fixed-size lists from Parquet. The implementation still needs tests. * Added tests for fixed-size list writer. Fixed bugs in implementation found via tests. * Added tests for fixed-size list reader. Fixed bugs in implementation found via tests. * Added correct behavior for writing empty fixed-length lists. Writer now emits the correct definition levels for empty lists. Added empty list unit test. * Added correct behavior for reading empty fixed-length lists. Reader now handles empty list definition levels correctly. Added empty list unit test. * Fixed linter warnings. * Added license header to fixed_size_list_array.rs * Added fixed-size list reader tests from PR review. * Added fixed-size reader row length sanity checks. * Simplified fixed-size list case in LevelInfoBuilder constructor. * Removed dynamic dispatch inside fixed-length list writer. * Expanded list of structs test for fixed-size list writer. * Reverted expected levels in fixed-size list writer test. * Fixed linter warnings. * Updated list size check in fixed-size list reader. Converted the check to return an error instead of panicking. * Small tweak to row length check in fixed-size list reader. * Fixed bug in fixed-size list level encoding. Writer now correctly handles child arrays with variable row length. Added new unit test to verify the new behavior is correct. * Added fixed-size list reader test. Test verifies that reader handles child arrays with variable length correctly. 
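
For context, a minimal sketch of the round trip this change enables, written against the public `ArrowWriter` / `ParquetRecordBatchReader` APIs and modelled on the new tests; the column name, sample values, and error handling are illustrative only and are not part of the change:

```rust
// Illustrative only: round-trip a FixedSizeListArray column through Parquet,
// relying on the fixed-size list read/write support added by this patch.
use std::sync::Arc;

use arrow_array::{types::Int32Type, FixedSizeListArray, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use bytes::Bytes;
use parquet::arrow::{arrow_reader::ParquetRecordBatchReader, ArrowWriter};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Two rows, each a fixed-size list of three nullable Int32 values
    let list = FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(
        vec![
            Some(vec![Some(1), None, Some(3)]),
            Some(vec![Some(4), Some(5), Some(6)]),
        ],
        3,
    );

    let schema = Arc::new(Schema::new(vec![Field::new(
        "list",
        DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 3),
        true,
    )]));
    let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(list.clone())])?;

    // Write the batch to an in-memory Parquet file
    let mut buffer = Vec::new();
    let mut writer = ArrowWriter::try_new(&mut buffer, schema, None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Read it back and check the fixed-size list column survived the round trip
    let mut reader = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 1024)?;
    let read = reader.next().expect("expected one batch")?;
    let read_list = read
        .column(0)
        .as_any()
        .downcast_ref::<FixedSizeListArray>()
        .expect("expected a FixedSizeListArray");
    assert_eq!(read_list, &list);
    Ok(())
}
```

As the new `test_read_as_dyn_list` below demonstrates, the same data can instead be read back as a variable-length `ListArray` when the embedded Arrow schema is ignored.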
--- parquet/src/arrow/array_reader/builder.rs | 44 +- .../array_reader/fixed_size_list_array.rs | 688 ++++++++++++++++++ parquet/src/arrow/array_reader/mod.rs | 2 + parquet/src/arrow/arrow_writer/levels.rs | 368 +++++++++- parquet/src/arrow/arrow_writer/mod.rs | 16 +- 5 files changed, 1112 insertions(+), 6 deletions(-) create mode 100644 parquet/src/arrow/array_reader/fixed_size_list_array.rs diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs index 241a5efe078a..5e0d05e8953c 100644 --- a/parquet/src/arrow/array_reader/builder.rs +++ b/parquet/src/arrow/array_reader/builder.rs @@ -23,8 +23,8 @@ use crate::arrow::array_reader::empty_array::make_empty_array_reader; use crate::arrow::array_reader::fixed_len_byte_array::make_fixed_len_byte_array_reader; use crate::arrow::array_reader::{ make_byte_array_dictionary_reader, make_byte_array_reader, ArrayReader, - ListArrayReader, MapArrayReader, NullArrayReader, PrimitiveArrayReader, - RowGroupCollection, StructArrayReader, + FixedSizeListArrayReader, ListArrayReader, MapArrayReader, NullArrayReader, + PrimitiveArrayReader, RowGroupCollection, StructArrayReader, }; use crate::arrow::schema::{ParquetField, ParquetFieldType}; use crate::arrow::ProjectionMask; @@ -63,6 +63,9 @@ fn build_reader( DataType::Struct(_) => build_struct_reader(field, mask, row_groups), DataType::List(_) => build_list_reader(field, mask, false, row_groups), DataType::LargeList(_) => build_list_reader(field, mask, true, row_groups), + DataType::FixedSizeList(_, _) => { + build_fixed_size_list_reader(field, mask, row_groups) + } d => unimplemented!("reading group type {} not implemented", d), }, } @@ -166,6 +169,43 @@ fn build_list_reader( Ok(reader) } +/// Build array reader for fixed-size list type. +fn build_fixed_size_list_reader( + field: &ParquetField, + mask: &ProjectionMask, + row_groups: &dyn RowGroupCollection, +) -> Result>> { + let children = field.children().unwrap(); + assert_eq!(children.len(), 1); + + let reader = match build_reader(&children[0], mask, row_groups)? { + Some(item_reader) => { + let item_type = item_reader.get_data_type().clone(); + let reader = match &field.arrow_type { + &DataType::FixedSizeList(ref f, size) => { + let data_type = DataType::FixedSizeList( + Arc::new(f.as_ref().clone().with_data_type(item_type)), + size, + ); + + Box::new(FixedSizeListArrayReader::new( + item_reader, + size as usize, + data_type, + field.def_level, + field.rep_level, + field.nullable, + )) as _ + } + _ => unimplemented!(), + }; + Some(reader) + } + None => None, + }; + Ok(reader) +} + /// Creates primitive array reader for each primitive type. fn build_primitive_reader( field: &ParquetField, diff --git a/parquet/src/arrow/array_reader/fixed_size_list_array.rs b/parquet/src/arrow/array_reader/fixed_size_list_array.rs new file mode 100644 index 000000000000..4cf68a06601c --- /dev/null +++ b/parquet/src/arrow/array_reader/fixed_size_list_array.rs @@ -0,0 +1,688 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::cmp::Ordering; +use std::sync::Arc; + +use crate::arrow::array_reader::ArrayReader; +use crate::errors::ParquetError; +use crate::errors::Result; +use arrow_array::FixedSizeListArray; +use arrow_array::{builder::BooleanBufferBuilder, new_empty_array, Array, ArrayRef}; +use arrow_data::{transform::MutableArrayData, ArrayData}; +use arrow_schema::DataType as ArrowType; + +/// Implementation of fixed-size list array reader. +pub struct FixedSizeListArrayReader { + item_reader: Box, + /// The number of child items in each row of the list array + fixed_size: usize, + data_type: ArrowType, + /// The definition level at which this list is not null + def_level: i16, + /// The repetition level that corresponds to a new value in this array + rep_level: i16, + /// If the list is nullable + nullable: bool, +} + +impl FixedSizeListArrayReader { + /// Construct fixed-size list array reader. + pub fn new( + item_reader: Box, + fixed_size: usize, + data_type: ArrowType, + def_level: i16, + rep_level: i16, + nullable: bool, + ) -> Self { + Self { + item_reader, + fixed_size, + data_type, + def_level, + rep_level, + nullable, + } + } +} + +impl ArrayReader for FixedSizeListArrayReader { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn get_data_type(&self) -> &ArrowType { + &self.data_type + } + + fn read_records(&mut self, batch_size: usize) -> Result { + let size = self.item_reader.read_records(batch_size)?; + Ok(size) + } + + fn consume_batch(&mut self) -> Result { + let next_batch_array = self.item_reader.consume_batch()?; + if next_batch_array.len() == 0 { + return Ok(new_empty_array(&self.data_type)); + } + + let def_levels = self + .get_def_levels() + .ok_or_else(|| general_err!("item_reader def levels are None"))?; + let rep_levels = self + .get_rep_levels() + .ok_or_else(|| general_err!("item_reader rep levels are None"))?; + + if !rep_levels.is_empty() && rep_levels[0] != 0 { + // This implies either the source data was invalid, or the leaf column + // reader did not correctly delimit semantic records + return Err(general_err!("first repetition level of batch must be 0")); + } + + let mut validity = self + .nullable + .then(|| BooleanBufferBuilder::new(next_batch_array.len())); + + let data = next_batch_array.to_data(); + let mut child_data_builder = + MutableArrayData::new(vec![&data], true, next_batch_array.len()); + + // The current index into the child array entries + let mut child_idx = 0; + // The total number of rows (valid and invalid) in the list array + let mut list_len = 0; + // Start of the current run of valid values + let mut start_idx = None; + let mut row_len = 0; + + def_levels.iter().zip(rep_levels).try_for_each(|(d, r)| { + match r.cmp(&self.rep_level) { + Ordering::Greater => { + // Repetition level greater than current => already handled by inner array + if *d < self.def_level { + return Err(general_err!( + "Encountered repetition level too large for definition level" + )); + } + } + Ordering::Equal => { + // Item inside of the current list + child_idx += 1; + row_len += 1; + } + Ordering::Less => { + // Start of new list row + list_len 
+= 1; + + // Length of the previous row should be equal to: + // - the list's fixed size (valid entries) + // - zero (null entries, start of array) + // Any other length indicates invalid data + if start_idx.is_some() && row_len != self.fixed_size { + return Err(general_err!( + "Encountered misaligned row with length {} (expected length {})", + row_len, + self.fixed_size + )) + } + row_len = 0; + + if *d >= self.def_level { + row_len += 1; + + // Valid list entry + if let Some(validity) = validity.as_mut() { + validity.append(true); + } + // Start a run of valid rows if not already inside of one + start_idx.get_or_insert(child_idx); + } else { + // Null list entry + + if let Some(start) = start_idx.take() { + // Flush pending child items + child_data_builder.extend(0, start, child_idx); + } + // Pad list with nulls + child_data_builder.extend_nulls(self.fixed_size); + + if let Some(validity) = validity.as_mut() { + // Valid if empty list + validity.append(*d + 1 == self.def_level); + } + } + child_idx += 1; + } + } + Ok(()) + })?; + + let child_data = match start_idx { + Some(0) => { + // No null entries - can reuse original array + next_batch_array.to_data() + } + Some(start) => { + // Flush pending child items + child_data_builder.extend(0, start, child_idx); + child_data_builder.freeze() + } + None => child_data_builder.freeze(), + }; + + // Verify total number of elements is aligned with fixed list size + if list_len * self.fixed_size != child_data.len() { + return Err(general_err!( + "fixed-size list length must be a multiple of {} but array contains {} elements", + self.fixed_size, + child_data.len() + )); + } + + let mut list_builder = ArrayData::builder(self.get_data_type().clone()) + .len(list_len) + .add_child_data(child_data); + + if let Some(builder) = validity { + list_builder = list_builder.null_bit_buffer(Some(builder.into())); + } + + let list_data = unsafe { list_builder.build_unchecked() }; + + let result_array = FixedSizeListArray::from(list_data); + Ok(Arc::new(result_array)) + } + + fn skip_records(&mut self, num_records: usize) -> Result { + self.item_reader.skip_records(num_records) + } + + fn get_def_levels(&self) -> Option<&[i16]> { + self.item_reader.get_def_levels() + } + + fn get_rep_levels(&self) -> Option<&[i16]> { + self.item_reader.get_rep_levels() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::arrow::{ + array_reader::{test_util::InMemoryArrayReader, ListArrayReader}, + arrow_reader::{ + ArrowReaderBuilder, ArrowReaderOptions, ParquetRecordBatchReader, + }, + ArrowWriter, + }; + use arrow::datatypes::{Field, Int32Type}; + use arrow_array::{ + builder::{FixedSizeListBuilder, Int32Builder, ListBuilder}, + cast::AsArray, + FixedSizeListArray, ListArray, PrimitiveArray, RecordBatch, + }; + use arrow_buffer::Buffer; + use arrow_data::ArrayDataBuilder; + use arrow_schema::Schema; + use bytes::Bytes; + + #[test] + fn test_nullable_list() { + // [null, [1, null, 2], null, [3, 4, 5], [null, null, null]] + let expected = FixedSizeListArray::from_iter_primitive::( + vec![ + None, + Some([Some(1), None, Some(2)]), + None, + Some([Some(3), Some(4), Some(5)]), + Some([None, None, None]), + ], + 3, + ); + + let array = Arc::new(PrimitiveArray::::from(vec![ + None, + Some(1), + None, + Some(2), + None, + Some(3), + Some(4), + Some(5), + None, + None, + None, + ])); + let item_array_reader = InMemoryArrayReader::new( + ArrowType::Int32, + array, + Some(vec![0, 3, 2, 3, 0, 3, 3, 3, 2, 2, 2]), + Some(vec![0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1]), + ); + + let mut 
list_array_reader = FixedSizeListArrayReader::new( + Box::new(item_array_reader), + 3, + ArrowType::FixedSizeList( + Arc::new(Field::new("item", ArrowType::Int32, true)), + 3, + ), + 2, + 1, + true, + ); + let actual = list_array_reader.next_batch(1024).unwrap(); + let actual = actual + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(&expected, actual) + } + + #[test] + fn test_required_list() { + // [[1, null], [2, 3], [null, null], [4, 5]] + let expected = FixedSizeListArray::from_iter_primitive::( + vec![ + Some([Some(1), None]), + Some([Some(2), Some(3)]), + Some([None, None]), + Some([Some(4), Some(5)]), + ], + 2, + ); + + let array = Arc::new(PrimitiveArray::::from(vec![ + Some(1), + None, + Some(2), + Some(3), + None, + None, + Some(4), + Some(5), + ])); + let item_array_reader = InMemoryArrayReader::new( + ArrowType::Int32, + array, + Some(vec![2, 1, 2, 2, 1, 1, 2, 2]), + Some(vec![0, 1, 0, 1, 0, 1, 0, 1]), + ); + + let mut list_array_reader = FixedSizeListArrayReader::new( + Box::new(item_array_reader), + 2, + ArrowType::FixedSizeList( + Arc::new(Field::new("item", ArrowType::Int32, true)), + 2, + ), + 1, + 1, + false, + ); + let actual = list_array_reader.next_batch(1024).unwrap(); + let actual = actual + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(&expected, actual) + } + + #[test] + fn test_nested_list() { + // [ + // null, + // [[1, 2]], + // [[null, 3]], + // null, + // [[4, 5]], + // [[null, null]], + // ] + let l2_type = ArrowType::FixedSizeList( + Arc::new(Field::new("item", ArrowType::Int32, true)), + 2, + ); + let l1_type = ArrowType::FixedSizeList( + Arc::new(Field::new("item", l2_type.clone(), false)), + 1, + ); + + let array = PrimitiveArray::::from(vec![ + None, + None, + Some(1), + Some(2), + None, + Some(3), + None, + None, + Some(4), + Some(5), + None, + None, + ]); + + let l2 = ArrayDataBuilder::new(l2_type.clone()) + .len(6) + .add_child_data(array.into_data()) + .build() + .unwrap(); + + let l1 = ArrayDataBuilder::new(l1_type.clone()) + .len(6) + .add_child_data(l2) + .null_bit_buffer(Some(Buffer::from([0b110110]))) + .build() + .unwrap(); + + let expected = FixedSizeListArray::from(l1); + + let values = Arc::new(PrimitiveArray::::from(vec![ + None, + Some(1), + Some(2), + None, + Some(3), + None, + Some(4), + Some(5), + None, + None, + ])); + + let item_array_reader = InMemoryArrayReader::new( + ArrowType::Int32, + values, + Some(vec![0, 5, 5, 4, 5, 0, 5, 5, 4, 4]), + Some(vec![0, 0, 2, 0, 2, 0, 0, 2, 0, 2]), + ); + + let l2 = FixedSizeListArrayReader::new( + Box::new(item_array_reader), + 2, + l2_type, + 4, + 2, + false, + ); + let mut l1 = FixedSizeListArrayReader::new(Box::new(l2), 1, l1_type, 3, 1, true); + + let expected_1 = expected.slice(0, 2); + let expected_2 = expected.slice(2, 4); + + let actual = l1.next_batch(2).unwrap(); + assert_eq!(actual.as_ref(), &expected_1); + + let actual = l1.next_batch(1024).unwrap(); + assert_eq!(actual.as_ref(), &expected_2); + } + + #[test] + fn test_empty_list() { + // [null, [], null, []] + let expected = FixedSizeListArray::from_iter_primitive::( + vec![None, Some([]), None, Some([])], + 0, + ); + + let array = Arc::new(PrimitiveArray::::from(vec![ + None, None, None, None, + ])); + let item_array_reader = InMemoryArrayReader::new( + ArrowType::Int32, + array, + Some(vec![0, 1, 0, 1]), + Some(vec![0, 0, 0, 0]), + ); + + let mut list_array_reader = FixedSizeListArrayReader::new( + Box::new(item_array_reader), + 0, + ArrowType::FixedSizeList( + Arc::new(Field::new("item", ArrowType::Int32, true)), 
+ 0, + ), + 2, + 1, + true, + ); + let actual = list_array_reader.next_batch(1024).unwrap(); + let actual = actual + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(&expected, actual) + } + + #[test] + fn test_nested_var_list() { + // [[[1, null, 3], null], [[4], []], [[5, 6], [null, null]], null] + let mut builder = + FixedSizeListBuilder::new(ListBuilder::new(Int32Builder::new()), 2); + builder.values().append_value([Some(1), None, Some(3)]); + builder.values().append_null(); + builder.append(true); + builder.values().append_value([Some(4)]); + builder.values().append_value([]); + builder.append(true); + builder.values().append_value([Some(5), Some(6)]); + builder.values().append_value([None, None]); + builder.append(true); + builder.values().append_null(); + builder.values().append_null(); + builder.append(false); + let expected = builder.finish(); + + let array = Arc::new(PrimitiveArray::::from(vec![ + Some(1), + None, + Some(3), + None, + Some(4), + None, + Some(5), + Some(6), + None, + None, + None, + ])); + + let inner_type = + ArrowType::List(Arc::new(Field::new("item", ArrowType::Int32, true))); + let list_type = ArrowType::FixedSizeList( + Arc::new(Field::new("item", inner_type.clone(), true)), + 2, + ); + + let item_array_reader = InMemoryArrayReader::new( + ArrowType::Int32, + array, + Some(vec![5, 4, 5, 2, 5, 3, 5, 5, 4, 4, 0]), + Some(vec![0, 2, 2, 1, 0, 1, 0, 2, 1, 2, 0]), + ); + + let inner_array_reader = ListArrayReader::::new( + Box::new(item_array_reader), + inner_type, + 4, + 2, + true, + ); + + let mut list_array_reader = FixedSizeListArrayReader::new( + Box::new(inner_array_reader), + 2, + list_type, + 2, + 1, + true, + ); + let actual = list_array_reader.next_batch(1024).unwrap(); + let actual = actual + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(&expected, actual) + } + + #[test] + fn test_read_list_column() { + // This test writes a Parquet file containing a fixed-length array column and a primitive column, + // then reads the columns back from the file. 
+ + // [ + // [1, 2, 3, null], + // [5, 6, 7, 8], + // null, + // [9, null, 11, 12], + // ] + let list = FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(1), Some(2), Some(3), None]), + Some(vec![Some(5), Some(6), Some(7), Some(8)]), + None, + Some(vec![Some(9), None, Some(11), Some(12)]), + Some(vec![None, None, None, None]), + ], + 4, + ); + + // [null, 2, 3, null, 5] + let primitive = PrimitiveArray::::from_iter(vec![ + None, + Some(2), + Some(3), + None, + Some(5), + ]); + + let schema = Arc::new(Schema::new(vec![ + Field::new( + "list", + ArrowType::FixedSizeList( + Arc::new(Field::new("item", ArrowType::Int32, true)), + 4, + ), + true, + ), + Field::new("primitive", ArrowType::Int32, true), + ])); + + // Create record batch with a fixed-length array column and a primitive column + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(list.clone()), Arc::new(primitive.clone())], + ) + .expect("unable to create record batch"); + + // Write record batch to Parquet + let mut buffer = Vec::with_capacity(1024); + let mut writer = ArrowWriter::try_new(&mut buffer, schema.clone(), None) + .expect("unable to create parquet writer"); + writer.write(&batch).expect("unable to write record batch"); + writer.close().expect("unable to close parquet writer"); + + // Read record batch from Parquet + let reader = Bytes::from(buffer); + let mut batch_reader = ParquetRecordBatchReader::try_new(reader, 1024) + .expect("unable to create parquet reader"); + let actual = batch_reader + .next() + .expect("missing record batch") + .expect("unable to read record batch"); + + // Verify values of both read columns match + assert_eq!(schema, actual.schema()); + let actual_list = actual + .column(0) + .as_any() + .downcast_ref::() + .expect("unable to cast array to FixedSizeListArray"); + let actual_primitive = actual.column(1).as_primitive::(); + assert_eq!(actual_list, &list); + assert_eq!(actual_primitive, &primitive); + } + + #[test] + fn test_read_as_dyn_list() { + // This test verifies that fixed-size list arrays can be read from Parquet + // as variable-length list arrays. 
+ + // [ + // [1, 2, 3, null], + // [5, 6, 7, 8], + // null, + // [9, null, 11, 12], + // ] + let list = FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(1), Some(2), Some(3), None]), + Some(vec![Some(5), Some(6), Some(7), Some(8)]), + None, + Some(vec![Some(9), None, Some(11), Some(12)]), + Some(vec![None, None, None, None]), + ], + 4, + ); + + let schema = Arc::new(Schema::new(vec![Field::new( + "list", + ArrowType::FixedSizeList( + Arc::new(Field::new("item", ArrowType::Int32, true)), + 4, + ), + true, + )])); + + // Create record batch with a single fixed-length array column + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(list)]).unwrap(); + + // Write record batch to Parquet + let mut buffer = Vec::with_capacity(1024); + let mut writer = ArrowWriter::try_new(&mut buffer, schema, None) + .expect("unable to create parquet writer"); + writer.write(&batch).expect("unable to write record batch"); + writer.close().expect("unable to close parquet writer"); + + // Read record batch from Parquet - ignoring arrow metadata + let reader = Bytes::from(buffer); + let mut batch_reader = ArrowReaderBuilder::try_new_with_options( + reader, + ArrowReaderOptions::new().with_skip_arrow_metadata(true), + ) + .expect("unable to create reader builder") + .build() + .expect("unable to create parquet reader"); + let actual = batch_reader + .next() + .expect("missing record batch") + .expect("unable to read record batch"); + + // Verify the read column is a variable length list with values that match the input + let col = actual.column(0).as_list::(); + let expected = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2), Some(3), None]), + Some(vec![Some(5), Some(6), Some(7), Some(8)]), + None, + Some(vec![Some(9), None, Some(11), Some(12)]), + Some(vec![None, None, None, None]), + ]); + assert_eq!(col, &expected); + } +} diff --git a/parquet/src/arrow/array_reader/mod.rs b/parquet/src/arrow/array_reader/mod.rs index f46f6073a714..823084b43207 100644 --- a/parquet/src/arrow/array_reader/mod.rs +++ b/parquet/src/arrow/array_reader/mod.rs @@ -35,6 +35,7 @@ mod byte_array; mod byte_array_dictionary; mod empty_array; mod fixed_len_byte_array; +mod fixed_size_list_array; mod list_array; mod map_array; mod null_array; @@ -48,6 +49,7 @@ pub use builder::build_array_reader; pub use byte_array::make_byte_array_reader; pub use byte_array_dictionary::make_byte_array_dictionary_reader; pub use fixed_len_byte_array::make_fixed_len_byte_array_reader; +pub use fixed_size_list_array::FixedSizeListArrayReader; pub use list_array::ListArrayReader; pub use map_array::MapArrayReader; pub use null_array::NullArrayReader; diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index fc5b9460322a..21b3e7dff88d 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -42,7 +42,7 @@ use crate::errors::{ParquetError, Result}; use arrow_array::cast::AsArray; -use arrow_array::{Array, ArrayRef, OffsetSizeTrait, StructArray}; +use arrow_array::{Array, ArrayRef, FixedSizeListArray, OffsetSizeTrait, StructArray}; use arrow_buffer::NullBuffer; use arrow_schema::{DataType, Field}; use std::ops::Range; @@ -144,7 +144,8 @@ impl LevelInfoBuilder { } DataType::List(child) | DataType::LargeList(child) - | DataType::Map(child, _) => { + | DataType::Map(child, _) + | DataType::FixedSizeList(child, _) => { let def_level = match field.is_nullable() { true => parent_ctx.def_level + 2, false => parent_ctx.def_level + 
1, @@ -214,6 +215,19 @@ impl LevelInfoBuilder { range, ) } + &DataType::FixedSizeList(_, size) => { + let array = array + .as_any() + .downcast_ref::() + .expect("unable to get fixed-size list array"); + + self.write_fixed_size_list( + size as usize, + array.nulls(), + array.values(), + range, + ) + } _ => unreachable!(), } } @@ -371,6 +385,100 @@ impl LevelInfoBuilder { } } + /// Write `range` elements from FixedSizeListArray with child data `values` and null bitmap `nulls`. + fn write_fixed_size_list( + &mut self, + fixed_size: usize, + nulls: Option<&NullBuffer>, + values: &dyn Array, + range: Range, + ) { + let (child, ctx) = match self { + Self::List(child, ctx) => (child, ctx), + _ => unreachable!(), + }; + + let write_non_null = + |child: &mut LevelInfoBuilder, start_idx: usize, end_idx: usize| { + let values_start = start_idx * fixed_size; + let values_end = end_idx * fixed_size; + child.write(values, values_start..values_end); + + child.visit_leaves(|leaf| { + let rep_levels = leaf.rep_levels.as_mut().unwrap(); + + let row_indices = (0..fixed_size) + .rev() + .cycle() + .take(values_end - values_start); + + // Step backward over the child rep levels and mark the start of each list + rep_levels + .iter_mut() + .rev() + // Filter out reps from nested children + .filter(|&&mut r| r == ctx.rep_level) + .zip(row_indices) + .for_each(|(r, idx)| { + if idx == 0 { + *r = ctx.rep_level - 1; + } + }); + }) + }; + + // If list size is 0, ignore values and just write rep/def levels. + let write_empty = + |child: &mut LevelInfoBuilder, start_idx: usize, end_idx: usize| { + let len = end_idx - start_idx; + child.visit_leaves(|leaf| { + let rep_levels = leaf.rep_levels.as_mut().unwrap(); + rep_levels.extend(std::iter::repeat(ctx.rep_level - 1).take(len)); + let def_levels = leaf.def_levels.as_mut().unwrap(); + def_levels.extend(std::iter::repeat(ctx.def_level - 1).take(len)); + }) + }; + + let write_rows = + |child: &mut LevelInfoBuilder, start_idx: usize, end_idx: usize| { + if fixed_size > 0 { + write_non_null(child, start_idx, end_idx) + } else { + write_empty(child, start_idx, end_idx) + } + }; + + match nulls { + Some(nulls) => { + let mut start_idx = None; + for idx in range.clone() { + if nulls.is_valid(idx) { + // Start a run of valid rows if not already inside of one + start_idx.get_or_insert(idx); + } else { + // Write out any pending valid rows + if let Some(start) = start_idx.take() { + write_rows(child, start, idx); + } + // Add null row + child.visit_leaves(|leaf| { + let rep_levels = leaf.rep_levels.as_mut().unwrap(); + rep_levels.push(ctx.rep_level - 1); + let def_levels = leaf.def_levels.as_mut().unwrap(); + def_levels.push(ctx.def_level - 2); + }) + } + } + // Write out any remaining valid rows + if let Some(start) = start_idx.take() { + write_rows(child, start, range.end); + } + } + // If all rows are valid then write the whole array + None => write_rows(child, range.start, range.end), + } + } + /// Write a primitive array, as defined by [`is_leaf`] fn write_leaf(&mut self, array: &dyn Array, range: Range) { let info = match self { @@ -1397,4 +1505,260 @@ mod tests { assert_eq!(&levels[1], &expected_level); } + + #[test] + fn test_fixed_size_list() { + // [[1, 2], null, null, [7, 8], null] + let mut builder = FixedSizeListBuilder::new(Int32Builder::new(), 2); + builder.values().append_slice(&[1, 2]); + builder.append(true); + builder.values().append_slice(&[3, 4]); + builder.append(false); + builder.values().append_slice(&[5, 6]); + builder.append(false); + 
builder.values().append_slice(&[7, 8]); + builder.append(true); + builder.values().append_slice(&[9, 10]); + builder.append(false); + let a = builder.finish(); + + let item_field = Field::new("item", a.data_type().clone(), true); + let mut builder = + LevelInfoBuilder::try_new(&item_field, Default::default()).unwrap(); + builder.write(&a, 1..4); + let levels = builder.finish(); + + assert_eq!(levels.len(), 1); + + let list_level = levels.get(0).unwrap(); + + let expected_level = LevelInfo { + def_levels: Some(vec![0, 0, 3, 3]), + rep_levels: Some(vec![0, 0, 0, 1]), + non_null_indices: vec![6, 7], + max_def_level: 3, + max_rep_level: 1, + }; + assert_eq!(list_level, &expected_level); + } + + #[test] + fn test_fixed_size_list_of_struct() { + // define schema + let field_a = Field::new("a", DataType::Int32, true); + let field_b = Field::new("b", DataType::Int64, false); + let fields = Fields::from([Arc::new(field_a), Arc::new(field_b)]); + let item_field = Field::new("item", DataType::Struct(fields.clone()), true); + let list_field = Field::new( + "list", + DataType::FixedSizeList(Arc::new(item_field), 2), + true, + ); + + let builder_a = Int32Builder::with_capacity(10); + let builder_b = Int64Builder::with_capacity(10); + let struct_builder = + StructBuilder::new(fields, vec![Box::new(builder_a), Box::new(builder_b)]); + let mut list_builder = FixedSizeListBuilder::new(struct_builder, 2); + + // [ + // [{a: 1, b: 2}, null], + // null, + // [null, null], + // [{a: null, b: 3}, {a: 2, b: 4}] + // ] + + // [{a: 1, b: 2}, null] + let values = list_builder.values(); + // {a: 1, b: 2} + values + .field_builder::(0) + .unwrap() + .append_value(1); + values + .field_builder::(1) + .unwrap() + .append_value(2); + values.append(true); + // null + values + .field_builder::(0) + .unwrap() + .append_null(); + values + .field_builder::(1) + .unwrap() + .append_value(0); + values.append(false); + list_builder.append(true); + + // null + let values = list_builder.values(); + // null + values + .field_builder::(0) + .unwrap() + .append_null(); + values + .field_builder::(1) + .unwrap() + .append_value(0); + values.append(false); + // null + values + .field_builder::(0) + .unwrap() + .append_null(); + values + .field_builder::(1) + .unwrap() + .append_value(0); + values.append(false); + list_builder.append(false); + + // [null, null] + let values = list_builder.values(); + // null + values + .field_builder::(0) + .unwrap() + .append_null(); + values + .field_builder::(1) + .unwrap() + .append_value(0); + values.append(false); + // null + values + .field_builder::(0) + .unwrap() + .append_null(); + values + .field_builder::(1) + .unwrap() + .append_value(0); + values.append(false); + list_builder.append(true); + + // [{a: null, b: 3}, {a: 2, b: 4}] + let values = list_builder.values(); + // {a: null, b: 3} + values + .field_builder::(0) + .unwrap() + .append_null(); + values + .field_builder::(1) + .unwrap() + .append_value(3); + values.append(true); + // {a: 2, b: 4} + values + .field_builder::(0) + .unwrap() + .append_value(2); + values + .field_builder::(1) + .unwrap() + .append_value(4); + values.append(true); + list_builder.append(true); + + let array = Arc::new(list_builder.finish()); + + assert_eq!(array.values().len(), 8); + assert_eq!(array.len(), 4); + + let schema = Arc::new(Schema::new(vec![list_field])); + let rb = RecordBatch::try_new(schema, vec![array]).unwrap(); + + let levels = calculate_array_levels(rb.column(0), rb.schema().field(0)).unwrap(); + let a_levels = &levels[0]; + let b_levels = 
&levels[1]; + + // [[{a: 1}, null], null, [null, null], [{a: null}, {a: 2}]] + let expected_a = LevelInfo { + def_levels: Some(vec![4, 2, 0, 2, 2, 3, 4]), + rep_levels: Some(vec![0, 1, 0, 0, 1, 0, 1]), + non_null_indices: vec![0, 7], + max_def_level: 4, + max_rep_level: 1, + }; + // [[{b: 2}, null], null, [null, null], [{b: 3}, {b: 4}]] + let expected_b = LevelInfo { + def_levels: Some(vec![3, 2, 0, 2, 2, 3, 3]), + rep_levels: Some(vec![0, 1, 0, 0, 1, 0, 1]), + non_null_indices: vec![0, 6, 7], + max_def_level: 3, + max_rep_level: 1, + }; + + assert_eq!(a_levels, &expected_a); + assert_eq!(b_levels, &expected_b); + } + + #[test] + fn test_fixed_size_list_empty() { + let mut builder = FixedSizeListBuilder::new(Int32Builder::new(), 0); + builder.append(true); + builder.append(false); + builder.append(true); + let a = builder.finish(); + + let item_field = Field::new("item", a.data_type().clone(), true); + let mut builder = + LevelInfoBuilder::try_new(&item_field, Default::default()).unwrap(); + builder.write(&a, 0..3); + let levels = builder.finish(); + + assert_eq!(levels.len(), 1); + + let list_level = levels.get(0).unwrap(); + + let expected_level = LevelInfo { + def_levels: Some(vec![1, 0, 1]), + rep_levels: Some(vec![0, 0, 0]), + non_null_indices: vec![], + max_def_level: 3, + max_rep_level: 1, + }; + assert_eq!(list_level, &expected_level); + } + + #[test] + fn test_fixed_size_list_of_var_lists() { + // [[[1, null, 3], null], [[4], []], [[5, 6], [null, null]], null] + let mut builder = + FixedSizeListBuilder::new(ListBuilder::new(Int32Builder::new()), 2); + builder.values().append_value([Some(1), None, Some(3)]); + builder.values().append_null(); + builder.append(true); + builder.values().append_value([Some(4)]); + builder.values().append_value([]); + builder.append(true); + builder.values().append_value([Some(5), Some(6)]); + builder.values().append_value([None, None]); + builder.append(true); + builder.values().append_null(); + builder.values().append_null(); + builder.append(false); + let a = builder.finish(); + + let item_field = Field::new("item", a.data_type().clone(), true); + let mut builder = + LevelInfoBuilder::try_new(&item_field, Default::default()).unwrap(); + builder.write(&a, 0..4); + let levels = builder.finish(); + + let list_level = levels.get(0).unwrap(); + let expected_level = LevelInfo { + def_levels: Some(vec![5, 4, 5, 2, 5, 3, 5, 5, 4, 4, 0]), + rep_levels: Some(vec![0, 2, 2, 1, 0, 1, 0, 2, 1, 2, 0]), + non_null_indices: vec![0, 2, 3, 4, 5], + max_def_level: 5, + max_rep_level: 2, + }; + + assert_eq!(list_level, &expected_level); + } } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index af820218255d..cfad15550bcf 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -23,7 +23,9 @@ use std::sync::Arc; use arrow_array::cast::AsArray; use arrow_array::types::{Decimal128Type, Int32Type, Int64Type, UInt32Type, UInt64Type}; -use arrow_array::{types, Array, ArrayRef, RecordBatch, RecordBatchWriter}; +use arrow_array::{ + types, Array, ArrayRef, FixedSizeListArray, RecordBatch, RecordBatchWriter, +}; use arrow_schema::{ArrowError, DataType as ArrowDataType, IntervalUnit, SchemaRef}; use super::schema::{ @@ -380,7 +382,17 @@ fn write_leaves( ArrowDataType::Float16 => Err(ParquetError::ArrowError( "Float16 arrays not supported".to_string(), )), - ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_, _) | ArrowDataType::RunEndEncoded(_, _) => { + ArrowDataType::FixedSizeList(_, _) 
=> { + let arrays: Vec<_> = arrays.iter().map(|array|{ + array.as_any().downcast_ref::() + .expect("unable to get fixed-size list array") + .values() + .clone() + }).collect(); + write_leaves(row_group_writer, &arrays, levels)?; + Ok(()) + }, + ArrowDataType::Union(_, _) | ArrowDataType::RunEndEncoded(_, _) => { Err(ParquetError::NYI( format!( "Attempting to write an Arrow type {data_type:?} to parquet that is not yet implemented" From ccc5497b2d0b9984e0b372e2d741ac67c8f3ad1e Mon Sep 17 00:00:00 2001 From: Alexandre Crayssac Date: Wed, 24 May 2023 23:38:17 +0200 Subject: [PATCH 0928/1411] Expose RecordBatchWriter in arrow crate (#4277) --- arrow/src/lib.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index af5972acc97e..70e615e88c73 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -376,7 +376,9 @@ pub use arrow_json as json; pub mod pyarrow; pub mod record_batch { - pub use arrow_array::{RecordBatch, RecordBatchOptions, RecordBatchReader}; + pub use arrow_array::{ + RecordBatch, RecordBatchOptions, RecordBatchReader, RecordBatchWriter, + }; } pub use arrow_array::temporal_conversions; pub use arrow_row as row; From 6a39e22b6bbfbdbebc4f57fbfeff3a3deda7c79a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 24 May 2023 22:38:38 +0100 Subject: [PATCH 0929/1411] Convert FixedSizeListArray to GenericListArray (#4273) --- arrow-array/src/array/list_array.rs | 48 +++++++++++++++++++++++--- arrow-buffer/src/buffer/offset.rs | 52 +++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 5 deletions(-) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index f4816a61ea82..d016afccbfe5 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -19,7 +19,7 @@ use crate::array::{get_offsets, make_array, print_long_array}; use crate::builder::{GenericListBuilder, PrimitiveBuilder}; use crate::{ iterator::GenericListArrayIter, new_empty_array, Array, ArrayAccessor, ArrayRef, - ArrowPrimitiveType, + ArrowPrimitiveType, FixedSizeListArray, }; use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; @@ -310,9 +310,7 @@ impl From for GenericListArray From> - for ArrayData -{ +impl From> for ArrayData { fn from(array: GenericListArray) -> Self { let len = array.len(); let builder = ArrayDataBuilder::new(array.data_type) @@ -325,6 +323,27 @@ impl From> } } +impl From + for GenericListArray +{ + fn from(value: FixedSizeListArray) -> Self { + let (field, size) = match value.data_type() { + DataType::FixedSizeList(f, size) => (f, *size as usize), + _ => unreachable!(), + }; + + let offsets = + OffsetBuffer::from_lengths(std::iter::repeat(size).take(value.len())); + + Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()), + nulls: value.nulls().cloned(), + values: value.values().clone(), + value_offsets: offsets, + } + } +} + impl GenericListArray { fn try_new_from_array_data(data: ArrayData) -> Result { if data.buffers().len() != 1 { @@ -509,7 +528,8 @@ pub type LargeListArray = GenericListArray; #[cfg(test)] mod tests { use super::*; - use crate::builder::{Int32Builder, ListBuilder}; + use crate::builder::{FixedSizeListBuilder, Int32Builder, ListBuilder}; + use crate::cast::AsArray; use crate::types::Int32Type; use crate::{Int32Array, Int64Array}; use arrow_buffer::{bit_util, Buffer, ScalarBuffer}; @@ -1178,4 +1198,22 @@ mod tests { "Invalid argument 
error: Max offset of 5 exceeds length of values 2" ); } + + #[test] + fn test_from_fixed_size_list() { + let mut builder = FixedSizeListBuilder::new(Int32Builder::new(), 3); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[0, 0, 0]); + builder.append(false); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true); + let list: ListArray = builder.finish().into(); + + let values: Vec<_> = list + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect(); + assert_eq!(values, vec![Some(vec![1, 2, 3]), None, Some(vec![4, 5, 6])]) + } } diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index bfafe3306aed..0111d12fbab1 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -69,6 +69,35 @@ impl OffsetBuffer { Self(buffer.into_buffer().into()) } + /// Create a new [`OffsetBuffer`] from the iterator of slice lengths + /// + /// ``` + /// # use arrow_buffer::OffsetBuffer; + /// let offsets = OffsetBuffer::::from_lengths([1, 3, 5]); + /// assert_eq!(offsets.as_ref(), &[0, 1, 4, 9]); + /// ``` + /// + /// # Panics + /// + /// Panics on overflow + pub fn from_lengths(lengths: I) -> Self + where + I: IntoIterator, + { + let iter = lengths.into_iter(); + let mut out = Vec::with_capacity(iter.size_hint().0 + 1); + out.push(O::usize_as(0)); + + let mut acc = 0_usize; + for length in iter { + acc = acc.checked_add(length).expect("usize overflow"); + out.push(O::usize_as(acc)) + } + // Check for overflow + O::from_usize(acc).expect("offset overflow"); + Self(out.into()) + } + /// Returns the inner [`ScalarBuffer`] pub fn inner(&self) -> &ScalarBuffer { &self.0 @@ -139,4 +168,27 @@ mod tests { fn non_monotonic_offsets() { OffsetBuffer::new(vec![1, 2, 0].into()); } + + #[test] + fn from_lengths() { + let buffer = OffsetBuffer::::from_lengths([2, 6, 3, 7, 2]); + assert_eq!(buffer.as_ref(), &[0, 2, 8, 11, 18, 20]); + + let half_max = i32::MAX / 2; + let buffer = + OffsetBuffer::::from_lengths([half_max as usize, half_max as usize]); + assert_eq!(buffer.as_ref(), &[0, half_max, half_max * 2]); + } + + #[test] + #[should_panic(expected = "offset overflow")] + fn from_lengths_offset_overflow() { + OffsetBuffer::::from_lengths([i32::MAX as usize, 1]); + } + + #[test] + #[should_panic(expected = "usize overflow")] + fn from_lengths_usize_overflow() { + OffsetBuffer::::from_lengths([usize::MAX, 1]); + } } From 98919ff2f368e1c1115e5b15583258367876ac6b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 24 May 2023 23:03:02 +0100 Subject: [PATCH 0930/1411] Add parquet-concat (#4274) --- parquet/Cargo.toml | 4 + parquet/src/bin/parquet-concat.rs | 118 ++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 parquet/src/bin/parquet-concat.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 29f7cda1360f..cc48424a6b05 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -131,6 +131,10 @@ required-features = ["cli"] name = "parquet-rowcount" required-features = ["cli"] +[[bin]] +name = "parquet-concat" +required-features = ["cli"] + [[bin]] name = "parquet-fromcsv" required-features = ["arrow", "cli", "snap", "brotli", "flate2", "lz4", "zstd"] diff --git a/parquet/src/bin/parquet-concat.rs b/parquet/src/bin/parquet-concat.rs new file mode 100644 index 000000000000..9cbdf8e7b399 --- /dev/null +++ b/parquet/src/bin/parquet-concat.rs @@ -0,0 +1,118 @@ +// Licensed to the 
Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Binary that concatenates the column data of one or more parquet files +//! +//! # Install +//! +//! `parquet-concat` can be installed using `cargo`: +//! ``` +//! cargo install parquet --features=cli +//! ``` +//! After this `parquet-concat` should be available: +//! ``` +//! parquet-concat out.parquet a.parquet b.parquet +//! ``` +//! +//! The binary can also be built from the source code and run as follows: +//! ``` +//! cargo run --features=cli --bin parquet-concat out.parquet a.parquet b.parquet +//! ``` +//! +//! Note: this does not currently support preserving the page index or bloom filters +//! + +use clap::Parser; +use parquet::column::writer::ColumnCloseResult; +use parquet::errors::{ParquetError, Result}; +use parquet::file::properties::WriterProperties; +use parquet::file::writer::SerializedFileWriter; +use std::fs::File; +use std::sync::Arc; + +#[derive(Debug, Parser)] +#[clap(author, version)] +/// Concatenates one or more parquet files +struct Args { + /// Path to output + output: String, + + /// Path to input files + input: Vec, +} + +impl Args { + fn run(&self) -> Result<()> { + if self.input.is_empty() { + return Err(ParquetError::General( + "Must provide at least one input file".into(), + )); + } + + let output = File::create(&self.output)?; + + let inputs = self + .input + .iter() + .map(|x| { + let reader = File::open(x)?; + let metadata = parquet::file::footer::parse_metadata(&reader)?; + Ok((reader, metadata)) + }) + .collect::>>()?; + + let expected = inputs[0].1.file_metadata().schema(); + for (_, metadata) in inputs.iter().skip(1) { + let actual = metadata.file_metadata().schema(); + if expected != actual { + return Err(ParquetError::General(format!( + "inputs must have the same schema, {expected:#?} vs {actual:#?}" + ))); + } + } + + let props = Arc::new(WriterProperties::builder().build()); + let schema = inputs[0].1.file_metadata().schema_descr().root_schema_ptr(); + let mut writer = SerializedFileWriter::new(output, schema, props)?; + + for (input, metadata) in inputs { + for rg in metadata.row_groups() { + let mut rg_out = writer.next_row_group()?; + for column in rg.columns() { + let result = ColumnCloseResult { + bytes_written: column.compressed_size() as _, + rows_written: rg.num_rows() as _, + metadata: column.clone(), + bloom_filter: None, + column_index: None, + offset_index: None, + }; + rg_out.append_column(&input, result)?; + } + rg_out.close()?; + } + } + + writer.close()?; + + Ok(()) + } +} + +fn main() -> Result<()> { + Args::parse().run() +} From 56437ccca1f55b7096ea9b94c5edcc353c528479 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 24 May 2023 23:03:31 +0100 Subject: [PATCH 0931/1411] Derive Default for 
WriterProperties (#4268) * Derive Default for WriterProperties * Review feedback --- parquet/src/arrow/arrow_reader/mod.rs | 5 ++--- parquet/src/arrow/arrow_writer/mod.rs | 2 +- parquet/src/arrow/mod.rs | 5 +---- parquet/src/column/mod.rs | 3 +-- parquet/src/column/writer/mod.rs | 30 +++++++++++++-------------- parquet/src/file/mod.rs | 3 +-- parquet/src/file/properties.rs | 17 +++++++++++++-- parquet/src/file/serialized_reader.rs | 10 +++------ parquet/src/file/writer.rs | 8 +++---- parquet_derive/src/lib.rs | 3 +-- parquet_derive_test/src/lib.rs | 2 +- 11 files changed, 45 insertions(+), 43 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index c69fa420d564..4b14a54c531b 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1746,11 +1746,10 @@ mod tests { { // Write using low-level parquet API (#1167) - let writer_props = Arc::new(WriterProperties::builder().build()); let mut writer = SerializedFileWriter::new( file.try_clone().unwrap(), schema, - writer_props, + Default::default(), ) .unwrap(); @@ -2288,7 +2287,7 @@ mod tests { } "; let schema = Arc::new(parse_message_type(MESSAGE_TYPE).unwrap()); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut buf = Vec::with_capacity(1024); let mut writer = SerializedFileWriter::new(&mut buf, schema, props).unwrap(); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index cfad15550bcf..2feb0bdbfafc 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -105,7 +105,7 @@ impl ArrowWriter { ) -> Result { let schema = arrow_to_parquet_schema(&arrow_schema)?; // add serialized arrow schema - let mut props = props.unwrap_or_else(|| WriterProperties::builder().build()); + let mut props = props.unwrap_or_default(); add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut props); let max_row_group_size = props.max_row_group_size(); diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index da7e850c3d60..e5211ec23931 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -40,10 +40,7 @@ //! //! let file = File::create("data.parquet").unwrap(); //! -//! // Default writer properties -//! let props = WriterProperties::builder().build(); -//! -//! let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(props)).unwrap(); +//! let mut writer = ArrowWriter::try_new(file, batch.schema(), None).unwrap(); //! //! writer.write(&batch).expect("Writing batch"); //! diff --git a/parquet/src/column/mod.rs b/parquet/src/column/mod.rs index cb0c035dd6e2..a68127a4ef05 100644 --- a/parquet/src/column/mod.rs +++ b/parquet/src/column/mod.rs @@ -63,9 +63,8 @@ //! } //! "; //! let schema = Arc::new(parse_message_type(message_type).unwrap()); -//! let props = Arc::new(WriterProperties::builder().build()); //! let file = fs::File::create(path).unwrap(); -//! let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); +//! let mut writer = SerializedFileWriter::new(file, schema, Default::default()).unwrap(); //! //! let mut row_group_writer = writer.next_row_group().unwrap(); //! 
while let Some(mut col_writer) = row_group_writer.next_column().unwrap() { diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 51e2614993e1..137893092405 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -1131,7 +1131,7 @@ mod tests { #[test] fn test_column_writer_inconsistent_def_rep_length() { let page_writer = get_test_page_writer(); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = get_test_column_writer::(page_writer, 1, 1, props); let res = writer.write_batch(&[1, 2, 3, 4], Some(&[1, 1, 1]), Some(&[0, 0])); assert!(res.is_err()); @@ -1146,7 +1146,7 @@ mod tests { #[test] fn test_column_writer_invalid_def_levels() { let page_writer = get_test_page_writer(); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = get_test_column_writer::(page_writer, 1, 0, props); let res = writer.write_batch(&[1, 2, 3, 4], None, None); assert!(res.is_err()); @@ -1161,7 +1161,7 @@ mod tests { #[test] fn test_column_writer_invalid_rep_levels() { let page_writer = get_test_page_writer(); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = get_test_column_writer::(page_writer, 0, 1, props); let res = writer.write_batch(&[1, 2, 3, 4], None, None); assert!(res.is_err()); @@ -1176,7 +1176,7 @@ mod tests { #[test] fn test_column_writer_not_enough_values_to_write() { let page_writer = get_test_page_writer(); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = get_test_column_writer::(page_writer, 1, 0, props); let res = writer.write_batch(&[1, 2], Some(&[1, 1, 1, 1]), None); assert!(res.is_err()); @@ -1191,7 +1191,7 @@ mod tests { #[test] fn test_column_writer_write_only_one_dictionary_page() { let page_writer = get_test_page_writer(); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = get_test_column_writer::(page_writer, 0, 0, props); writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); // First page should be correctly written. 
@@ -1499,7 +1499,7 @@ mod tests { #[test] fn test_column_writer_check_metadata() { let page_writer = get_test_page_writer(); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = get_test_column_writer::(page_writer, 0, 0, props); writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); @@ -1535,7 +1535,7 @@ mod tests { #[test] fn test_column_writer_check_byte_array_min_max() { let page_writer = get_test_page_writer(); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = get_test_decimals_column_writer::(page_writer, 0, 0, props); writer @@ -1591,7 +1591,7 @@ mod tests { #[test] fn test_column_writer_uint32_converted_type_min_max() { let page_writer = get_test_page_writer(); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = get_test_unsigned_int_given_as_converted_column_writer::< Int32Type, >(page_writer, 0, 0, props); @@ -1664,7 +1664,7 @@ mod tests { let mut buf = Vec::with_capacity(100); let mut write = TrackedWrite::new(&mut buf); let page_writer = Box::new(SerializedPageWriter::new(&mut write)); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = get_test_column_writer::(page_writer, 0, 0, props); writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); @@ -1772,25 +1772,25 @@ mod tests { #[test] fn test_column_writer_empty_column_roundtrip() { - let props = WriterProperties::builder().build(); + let props = Default::default(); column_roundtrip::(props, &[], None, None); } #[test] fn test_column_writer_non_nullable_values_roundtrip() { - let props = WriterProperties::builder().build(); + let props = Default::default(); column_roundtrip_random::(props, 1024, i32::MIN, i32::MAX, 0, 0); } #[test] fn test_column_writer_nullable_non_repeated_values_roundtrip() { - let props = WriterProperties::builder().build(); + let props = Default::default(); column_roundtrip_random::(props, 1024, i32::MIN, i32::MAX, 10, 0); } #[test] fn test_column_writer_nullable_repeated_values_roundtrip() { - let props = WriterProperties::builder().build(); + let props = Default::default(); column_roundtrip_random::(props, 1024, i32::MIN, i32::MAX, 10, 10); } @@ -2121,7 +2121,7 @@ mod tests { // write data // and check the offset index and column index let page_writer = get_test_page_writer(); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = get_test_column_writer::(page_writer, 0, 0, props); writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); // first page @@ -2433,7 +2433,7 @@ mod tests { /// Write data into parquet using [`get_test_page_writer`] and [`get_test_column_writer`] and returns generated statistics. fn statistics_roundtrip(values: &[::T]) -> Statistics { let page_writer = get_test_page_writer(); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = get_test_column_writer::(page_writer, 0, 0, props); writer.write_batch(values, None, None).unwrap(); diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs index 66d8ce48e0a7..fffe383c57ae 100644 --- a/parquet/src/file/mod.rs +++ b/parquet/src/file/mod.rs @@ -45,9 +45,8 @@ //! } //! "; //! let schema = Arc::new(parse_message_type(message_type).unwrap()); -//! let props = Arc::new(WriterProperties::builder().build()); //! let file = fs::File::create(&path).unwrap(); -//! 
let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); +//! let mut writer = SerializedFileWriter::new(file, schema, Default::default()).unwrap(); //! let mut row_group_writer = writer.next_row_group().unwrap(); //! while let Some(mut col_writer) = row_group_writer.next_column().unwrap() { //! // ... write values to a column writer diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 1d6f38dcd3c4..c09503987a00 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -27,7 +27,7 @@ //! }; //! //! // Create properties with default configuration. -//! let props = WriterProperties::builder().build(); +//! let props = WriterProperties::default(); //! //! // Use properties builder to set certain options and assemble the configuration. //! let props = WriterProperties::builder() @@ -130,7 +130,20 @@ pub struct WriterProperties { sorting_columns: Option>, } +impl Default for WriterProperties { + fn default() -> Self { + Self::builder().build() + } +} + impl WriterProperties { + /// Create a new [`WriterProperties`] with the default settings + /// + /// See [`WriterProperties::builder`] for customising settings + pub fn new() -> Self { + Self::default() + } + /// Returns builder for writer properties with default values. pub fn builder() -> WriterPropertiesBuilder { WriterPropertiesBuilder::with_defaults() @@ -836,7 +849,7 @@ mod tests { #[test] fn test_writer_properties_default_settings() { - let props = WriterProperties::builder().build(); + let props = WriterProperties::default(); assert_eq!(props.data_pagesize_limit(), DEFAULT_PAGE_SIZE); assert_eq!( props.dictionary_pagesize_limit(), diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index bf843562ed02..782394942df4 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -785,7 +785,6 @@ mod tests { use crate::file::page_index::index_reader::{ read_columns_indexes, read_pages_locations, }; - use crate::file::properties::WriterProperties; use crate::file::writer::SerializedFileWriter; use crate::record::RowAccessor; use crate::schema::parser::parse_message_type; @@ -1716,12 +1715,9 @@ mod tests { let schema = parse_message_type(message_type).unwrap(); let mut out = Vec::with_capacity(1024); - let mut writer = SerializedFileWriter::new( - &mut out, - Arc::new(schema), - Arc::new(WriterProperties::builder().build()), - ) - .unwrap(); + let mut writer = + SerializedFileWriter::new(&mut out, Arc::new(schema), Default::default()) + .unwrap(); let mut r = writer.next_row_group().unwrap(); let mut c = r.next_column().unwrap().unwrap(); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index b4ae777bb131..9d0b7e677d10 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -825,7 +825,7 @@ mod tests { .build() .unwrap(), ); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); let row_group_writer = writer.next_row_group().unwrap(); let res = row_group_writer.close(); @@ -860,7 +860,7 @@ mod tests { .build() .unwrap(), ); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); let mut row_group_writer = writer.next_row_group().unwrap(); @@ -898,7 +898,7 @@ mod tests { .build() .unwrap(), ); - let props = 
Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let writer = SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); writer.close().unwrap(); @@ -1575,7 +1575,7 @@ mod tests { "; let schema = Arc::new(parse_message_type(message_type).unwrap()); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = SerializedFileWriter::new(vec![], schema, props).unwrap(); let mut row_group_writer = writer.next_row_group().unwrap(); diff --git a/parquet_derive/src/lib.rs b/parquet_derive/src/lib.rs index 6525513cbaa1..a09b3b65233b 100644 --- a/parquet_derive/src/lib.rs +++ b/parquet_derive/src/lib.rs @@ -65,8 +65,7 @@ mod parquet_field; /// /// let schema = samples.as_slice().schema(); /// -/// let props = Arc::new(WriterProperties::builder().build()); -/// let mut writer = SerializedFileWriter::new(file, schema, props).unwrap(); +/// let mut writer = SerializedFileWriter::new(file, schema, Default::default()).unwrap(); /// /// let mut row_group = writer.next_row_group().unwrap(); /// samples.as_slice().write_to_row_group(&mut row_group).unwrap(); diff --git a/parquet_derive_test/src/lib.rs b/parquet_derive_test/src/lib.rs index 2aa174974aba..d2cf9efb1db6 100644 --- a/parquet_derive_test/src/lib.rs +++ b/parquet_derive_test/src/lib.rs @@ -139,7 +139,7 @@ mod tests { assert_eq!(&schema, &generated_schema); - let props = Arc::new(WriterProperties::builder().build()); + let props = Default::default(); let mut writer = SerializedFileWriter::new(file, generated_schema, props).unwrap(); From 17ca4d51d0490f9c65f5adde144f677dbc8300e7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 25 May 2023 07:39:56 -0400 Subject: [PATCH 0932/1411] Add `Debug` impls for `ArrowWriter` and `SerializedFileWriter` (#4278) * Add `Debug` impls for writers * Improve display --- parquet/src/arrow/arrow_writer/mod.rs | 25 +++++++++++++++++++++++++ parquet/src/file/writer.rs | 13 +++++++++++++ 2 files changed, 38 insertions(+) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 2feb0bdbfafc..08cfc7ea3ebf 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -18,6 +18,7 @@ //! Contains writer which writes arrow data into parquet data. 
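The change above makes default writer properties available through the standard `Default` machinery; a minimal sketch of the now-equivalent spellings (paths assume the public `parquet::file::properties` module):

```
use parquet::file::properties::WriterProperties;

// All three produce the same default configuration
let a = WriterProperties::default();
let b = WriterProperties::new();
let c = WriterProperties::builder().build();
assert_eq!(a.data_pagesize_limit(), b.data_pagesize_limit());
assert_eq!(b.data_pagesize_limit(), c.data_pagesize_limit());
```

At the call sites rewritten above, `Default::default()` resolves to the `WriterPropertiesPtr` alias (`Arc<WriterProperties>`), which is why the explicit `Arc::new(...)` wrapper can be dropped: `Arc<T>` is `Default` whenever `T` is.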
use std::collections::VecDeque; +use std::fmt::Debug; use std::io::Write; use std::sync::Arc; @@ -92,6 +93,30 @@ pub struct ArrowWriter { max_row_group_size: usize, } +impl Debug for ArrowWriter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let buffered_batches = self.buffer.len(); + let mut buffered_memory = 0; + + for batch in self.buffer.iter() { + for arr in batch.iter() { + buffered_memory += arr.get_array_memory_size() + } + } + + f.debug_struct("ArrowWriter") + .field("writer", &self.writer) + .field( + "buffer", + &format!("{buffered_batches} , {buffered_memory} bytes"), + ) + .field("buffered_rows", &self.buffered_rows) + .field("arrow_schema", &self.arrow_schema) + .field("max_row_group_size", &self.max_row_group_size) + .finish() + } +} + impl ArrowWriter { /// Try to create a new Arrow writer /// diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 9d0b7e677d10..4b1c4bad92e1 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -21,6 +21,7 @@ use crate::bloom_filter::Sbbf; use crate::format as parquet; use crate::format::{ColumnIndex, OffsetIndex, RowGroup}; +use std::fmt::Debug; use std::io::{BufWriter, IoSlice, Read}; use std::{io::Write, sync::Arc}; use thrift::protocol::{TCompactOutputProtocol, TSerializable}; @@ -147,6 +148,18 @@ pub struct SerializedFileWriter { kv_metadatas: Vec, } +impl Debug for SerializedFileWriter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // implement Debug so this can be used with #[derive(Debug)] + // in client code rather than actually listing all the fields + f.debug_struct("SerializedFileWriter") + .field("descr", &self.descr) + .field("row_group_index", &self.row_group_index) + .field("kv_metadatas", &self.kv_metadatas) + .finish_non_exhaustive() + } +} + impl SerializedFileWriter { /// Creates new file writer. 
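A rough sketch of what the new `Debug` impls enable; the wrapper struct below is purely illustrative, the point being that client code holding a writer can now `#[derive(Debug)]` without requiring the byte sink to be `Debug`:

```
use parquet::file::writer::SerializedFileWriter;

#[derive(Debug)]
struct Exporter {
    writer: SerializedFileWriter<Vec<u8>>,
}

// Formatting an Exporter prints the writer's descr, row_group_index and
// kv_metadatas, ending with `..` because the impl uses finish_non_exhaustive()
```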
pub fn new(buf: W, schema: TypePtr, properties: WriterPropertiesPtr) -> Result { From cb5d45879087fda7b369e565e17545bf3e7e93f2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 25 May 2023 15:05:14 +0100 Subject: [PATCH 0933/1411] Add constructors for FixedSize array types (#3879) (#4263) * Add constructors for FixedSize array types (#3879) * Clippy --- .../src/array/fixed_size_binary_array.rs | 161 +++++++++++++---- .../src/array/fixed_size_list_array.rs | 162 +++++++++++++++++- arrow-buffer/src/buffer/null.rs | 26 +++ arrow-data/src/data/mod.rs | 61 ++----- 4 files changed, 328 insertions(+), 82 deletions(-) diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 083d71cd963f..74a7c4c7a84a 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -19,7 +19,7 @@ use crate::array::print_long_array; use crate::iterator::FixedSizeBinaryIter; use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray}; use arrow_buffer::buffer::NullBuffer; -use arrow_buffer::{bit_util, Buffer, MutableBuffer}; +use arrow_buffer::{bit_util, ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use std::any::Any; @@ -59,6 +59,78 @@ pub struct FixedSizeBinaryArray { } impl FixedSizeBinaryArray { + /// Create a new [`FixedSizeBinaryArray`] with `size` element size, panicking on failure + /// + /// # Panics + /// + /// Panics if [`Self::try_new`] returns an error + pub fn new(size: i32, values: Buffer, nulls: Option) -> Self { + Self::try_new(size, values, nulls).unwrap() + } + + /// Create a new [`FixedSizeBinaryArray`] from the provided parts, returning an error on failure + /// + /// # Errors + /// + /// * `size < 0` + /// * `values.len() / size != nulls.len()` + pub fn try_new( + size: i32, + values: Buffer, + nulls: Option, + ) -> Result { + let data_type = DataType::FixedSizeBinary(size); + let s = size.to_usize().ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Size cannot be negative, got {}", + size + )) + })?; + + let len = values.len() / s; + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for FixedSizeBinaryArray, expected {} got {}", + len, + n.len(), + ))); + } + } + + Ok(Self { + data_type, + value_data: values, + value_length: size, + nulls, + len, + }) + } + + /// Create a new [`FixedSizeBinaryArray`] of length `len` where all values are null + /// + /// # Panics + /// + /// Panics if + /// + /// * `size < 0` + /// * `size * len` would overflow `usize` + pub fn new_null(size: i32, len: usize) -> Self { + let capacity = size.to_usize().unwrap().checked_mul(len).unwrap(); + Self { + data_type: DataType::FixedSizeBinary(size), + value_data: MutableBuffer::new(capacity).into(), + nulls: Some(NullBuffer::new_null(len)), + value_length: size, + len, + } + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts(self) -> (i32, Buffer, Option) { + (self.value_length, self.value_data, self.nulls) + } + /// Returns the element at index `i` as a byte slice. /// # Panics /// Panics if index `i` is out of bounds. 
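A minimal sketch of the constructors added above, mirroring the new tests at the end of this file: ten bytes with an element width of two yield five values, and `try_new` rejects inconsistent arguments:

```
use arrow_array::{Array, FixedSizeBinaryArray};
use arrow_buffer::{Buffer, NullBuffer};

let buffer = Buffer::from_vec(vec![0_u8; 10]);
let array = FixedSizeBinaryArray::new(2, buffer.clone(), None);
assert_eq!(array.len(), 5);

// Negative sizes and mismatched null buffer lengths are rejected
assert!(FixedSizeBinaryArray::try_new(-1, buffer.clone(), None).is_err());
assert!(FixedSizeBinaryArray::try_new(2, buffer, Some(NullBuffer::new_null(3))).is_err());

// An all-null array of three elements, each four bytes wide
let all_null = FixedSizeBinaryArray::new_null(4, 3);
assert_eq!(all_null.null_count(), 3);
```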
@@ -215,19 +287,17 @@ impl FixedSizeBinaryArray { )); } - let size = size.unwrap_or(0); - let array_data = unsafe { - ArrayData::new_unchecked( - DataType::FixedSizeBinary(size as i32), - len, - None, - Some(null_buf.into()), - 0, - vec![buffer.into()], - vec![], - ) - }; - Ok(FixedSizeBinaryArray::from(array_data)) + let null_buf = BooleanBuffer::new(null_buf.into(), 0, len); + let nulls = Some(NullBuffer::new(null_buf)).filter(|n| n.null_count() > 0); + + let size = size.unwrap_or(0) as i32; + Ok(Self { + data_type: DataType::FixedSizeBinary(size), + value_data: buffer.into(), + nulls, + value_length: size, + len, + }) } /// Create an array from an iterable argument of sparse byte slices. @@ -298,18 +368,16 @@ impl FixedSizeBinaryArray { Ok(()) })?; - let array_data = unsafe { - ArrayData::new_unchecked( - DataType::FixedSizeBinary(size), - len, - None, - Some(null_buf.into()), - 0, - vec![buffer.into()], - vec![], - ) - }; - Ok(FixedSizeBinaryArray::from(array_data)) + let null_buf = BooleanBuffer::new(null_buf.into(), 0, len); + let nulls = Some(NullBuffer::new(null_buf)).filter(|n| n.null_count() > 0); + + Ok(Self { + data_type: DataType::FixedSizeBinary(size), + value_data: buffer.into(), + nulls, + len, + value_length: size, + }) } /// Create an array from an iterable argument of byte slices. @@ -368,12 +436,14 @@ impl FixedSizeBinaryArray { )); } - let size = size.unwrap_or(0); - let array_data = ArrayData::builder(DataType::FixedSizeBinary(size as i32)) - .len(len) - .add_buffer(buffer.into()); - let array_data = unsafe { array_data.build_unchecked() }; - Ok(FixedSizeBinaryArray::from(array_data)) + let size = size.unwrap_or(0).try_into().unwrap(); + Ok(Self { + data_type: DataType::FixedSizeBinary(size), + value_data: buffer.into(), + nulls: None, + value_length: size, + len, + }) } #[inline] @@ -873,4 +943,31 @@ mod tests { array.value(4); } + + #[test] + fn test_constructors() { + let buffer = Buffer::from_vec(vec![0_u8; 10]); + let a = FixedSizeBinaryArray::new(2, buffer.clone(), None); + assert_eq!(a.len(), 5); + + let nulls = NullBuffer::new_null(5); + FixedSizeBinaryArray::new(2, buffer.clone(), Some(nulls)); + + let a = FixedSizeBinaryArray::new(3, buffer.clone(), None); + assert_eq!(a.len(), 3); + + let nulls = NullBuffer::new_null(3); + FixedSizeBinaryArray::new(3, buffer.clone(), Some(nulls)); + + let err = FixedSizeBinaryArray::try_new(-1, buffer.clone(), None).unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Size cannot be negative, got -1" + ); + + let nulls = NullBuffer::new_null(3); + let err = FixedSizeBinaryArray::try_new(2, buffer, Some(nulls)).unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for FixedSizeBinaryArray, expected 5 got 3"); + } } diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 18fa9df928ff..3df108ced04f 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -19,8 +19,9 @@ use crate::array::print_long_array; use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder}; use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType}; use arrow_buffer::buffer::NullBuffer; +use arrow_buffer::ArrowNativeType; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::DataType; +use arrow_schema::{ArrowError, DataType, FieldRef}; use std::any::Any; use std::sync::Arc; @@ -68,6 +69,114 @@ pub struct FixedSizeListArray { } impl 
FixedSizeListArray { + /// Create a new [`FixedSizeListArray`] with `size` element size, panicking on failure + /// + /// # Panics + /// + /// Panics if [`Self::try_new`] returns an error + pub fn new( + field: FieldRef, + size: i32, + values: ArrayRef, + nulls: Option, + ) -> Self { + Self::try_new(field, size, values, nulls).unwrap() + } + + /// Create a new [`FixedSizeListArray`] from the provided parts, returning an error on failure + /// + /// # Errors + /// + /// * `size < 0` + /// * `values.len() / size != nulls.len()` + /// * `values.data_type() != field.data_type()` + /// * `!field.is_nullable() && !nulls.expand(size).contains(values.nulls())` + pub fn try_new( + field: FieldRef, + size: i32, + values: ArrayRef, + nulls: Option, + ) -> Result { + let s = size.to_usize().ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Size cannot be negative, got {}", + size + )) + })?; + + let len = values.len() / s; + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for FixedSizeListArray, expected {} got {}", + len, + n.len(), + ))); + } + } + + if field.data_type() != values.data_type() { + return Err(ArrowError::InvalidArgumentError(format!( + "FixedSizeListArray expected data type {} got {} for {:?}", + field.data_type(), + values.data_type(), + field.name() + ))); + } + + if let Some(a) = values.nulls() { + let nulls_valid = field.is_nullable() + || nulls + .as_ref() + .map(|n| n.expand(size as _).contains(a)) + .unwrap_or_default(); + + if !nulls_valid { + return Err(ArrowError::InvalidArgumentError(format!( + "Found unmasked nulls for non-nullable FixedSizeListArray field {:?}", + field.name() + ))); + } + } + + let data_type = DataType::FixedSizeList(field, size); + Ok(Self { + data_type, + values, + value_length: size, + nulls, + len, + }) + } + + /// Create a new [`FixedSizeListArray`] of length `len` where all values are null + /// + /// # Panics + /// + /// Panics if + /// + /// * `size < 0` + /// * `size * len` would overflow `usize` + pub fn new_null(field: FieldRef, size: i32, len: usize) -> Self { + let capacity = size.to_usize().unwrap().checked_mul(len).unwrap(); + Self { + values: make_array(ArrayData::new_null(field.data_type(), capacity)), + data_type: DataType::FixedSizeList(field, size), + nulls: Some(NullBuffer::new_null(len)), + value_length: size, + len, + } + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts(self) -> (FieldRef, i32, ArrayRef, Option) { + let f = match self.data_type { + DataType::FixedSizeList(f, _) => f, + _ => unreachable!(), + }; + (f, self.value_length, self.values, self.nulls) + } + /// Returns a reference to the values of this list. 
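A corresponding sketch for the `FixedSizeListArray` constructors above (it mirrors the tests added further down): six child values with a list size of two produce three lists, and a field whose data type does not match the child array is rejected:

```
use std::sync::Arc;
use arrow_array::{Array, ArrayRef, FixedSizeListArray, Int32Array};
use arrow_schema::{DataType, Field};

let values: ArrayRef = Arc::new(Int32Array::from_iter([
    Some(1), Some(2), None, None, Some(3), Some(4),
]));
let field = Arc::new(Field::new("item", DataType::Int32, true));

let list = FixedSizeListArray::new(field, 2, values.clone(), None);
assert_eq!(list.len(), 3);

// A field with a mismatched data type is rejected
let wrong = Arc::new(Field::new("item", DataType::Int64, true));
assert!(FixedSizeListArray::try_new(wrong, 2, values, None).is_err());
```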
pub fn values(&self) -> &ArrayRef { &self.values @@ -285,7 +394,8 @@ mod tests { use super::*; use crate::cast::AsArray; use crate::types::Int32Type; - use arrow_buffer::{bit_util, Buffer}; + use crate::Int32Array; + use arrow_buffer::{bit_util, BooleanBuffer, Buffer}; use arrow_schema::Field; #[test] @@ -460,4 +570,52 @@ mod tests { list_array.value(10); } + + #[test] + fn test_fixed_size_list_constructors() { + let values = Arc::new(Int32Array::from_iter([ + Some(1), + Some(2), + None, + None, + Some(3), + Some(4), + ])); + + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let list = FixedSizeListArray::new(field.clone(), 2, values.clone(), None); + assert_eq!(list.len(), 3); + + let nulls = NullBuffer::new_null(3); + let list = FixedSizeListArray::new(field.clone(), 2, values.clone(), Some(nulls)); + assert_eq!(list.len(), 3); + + let list = FixedSizeListArray::new(field.clone(), 4, values.clone(), None); + assert_eq!(list.len(), 1); + + let err = FixedSizeListArray::try_new(field.clone(), -1, values.clone(), None) + .unwrap_err(); + assert_eq!( + err.to_string(), + "Invalid argument error: Size cannot be negative, got -1" + ); + + let nulls = NullBuffer::new_null(2); + let err = FixedSizeListArray::try_new(field, 2, values.clone(), Some(nulls)) + .unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for FixedSizeListArray, expected 3 got 2"); + + let field = Arc::new(Field::new("item", DataType::Int32, false)); + let err = FixedSizeListArray::try_new(field.clone(), 2, values.clone(), None) + .unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: Found unmasked nulls for non-nullable FixedSizeListArray field \"item\""); + + // Valid as nulls in child masked by parent + let nulls = NullBuffer::new(BooleanBuffer::new(vec![0b0000101].into(), 0, 3)); + FixedSizeListArray::new(field, 2, values.clone(), Some(nulls)); + + let field = Arc::new(Field::new("item", DataType::Int64, true)); + let err = FixedSizeListArray::try_new(field, 2, values, None).unwrap_err(); + assert_eq!(err.to_string(), "Invalid argument error: FixedSizeListArray expected data type Int64 got Int32 for \"item\""); + } } diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index 60987be6e415..008d1f04fe85 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -76,11 +76,37 @@ impl NullBuffer { /// Returns true if all nulls in `other` also exist in self pub fn contains(&self, other: &NullBuffer) -> bool { + if other.null_count == 0 { + return true; + } let lhs = self.inner().bit_chunks().iter_padded(); let rhs = other.inner().bit_chunks().iter_padded(); lhs.zip(rhs).all(|(l, r)| (l & !r) == 0) } + /// Returns a new [`NullBuffer`] where each bit in the current null buffer + /// is repeated `count` times. 
This is useful for masking the nulls of + /// the child of a FixedSizeListArray based on its parent + pub fn expand(&self, count: usize) -> Self { + let capacity = self.buffer.len().checked_mul(count).unwrap(); + let mut buffer = MutableBuffer::new_null(capacity); + + // Expand each bit within `null_mask` into `element_len` + // bits, constructing the implicit mask of the child elements + for i in 0..self.buffer.len() { + if self.is_null(i) { + continue; + } + for j in 0..count { + crate::bit_util::set_bit(buffer.as_mut(), i * count + j) + } + } + Self { + buffer: BooleanBuffer::new(buffer.into(), 0, capacity), + null_count: self.null_count * count, + } + } + /// Returns the length of this [`NullBuffer`] #[inline] pub fn len(&self) -> usize { diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index 103161f5a80d..32aae1e92a51 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -19,7 +19,6 @@ //! common attributes and operations for Arrow array. use crate::bit_iterator::BitSliceIterator; -use arrow_buffer::bit_chunk_iterator::BitChunks; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; use arrow_schema::{ArrowError, DataType, UnionMode}; @@ -1143,7 +1142,7 @@ impl ArrayData { match &self.data_type { DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => { if !f.is_nullable() { - self.validate_non_nullable(None, 0, &self.child_data[0])? + self.validate_non_nullable(None, &self.child_data[0])? } } DataType::FixedSizeList(field, len) => { @@ -1152,40 +1151,17 @@ impl ArrayData { match &self.nulls { Some(nulls) => { let element_len = *len as usize; - let mut buffer = - MutableBuffer::new_null(element_len * self.len); - - // Expand each bit within `null_mask` into `element_len` - // bits, constructing the implicit mask of the child elements - for i in 0..self.len { - if nulls.is_null(i) { - continue; - } - for j in 0..element_len { - bit_util::set_bit( - buffer.as_mut(), - i * element_len + j, - ) - } - } - let mask = buffer.into(); - self.validate_non_nullable(Some(&mask), 0, child)?; + let expanded = nulls.expand(element_len); + self.validate_non_nullable(Some(&expanded), child)?; } - None => self.validate_non_nullable(None, 0, child)?, + None => self.validate_non_nullable(None, child)?, } } } DataType::Struct(fields) => { for (field, child) in fields.iter().zip(&self.child_data) { if !field.is_nullable() { - match &self.nulls { - Some(n) => self.validate_non_nullable( - Some(n.buffer()), - n.offset(), - child, - )?, - None => self.validate_non_nullable(None, 0, child)?, - } + self.validate_non_nullable(self.nulls(), child)? 
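`NullBuffer::expand` is the building block used above: each validity bit of the parent is repeated `count` times to obtain the implicit mask of the child values. A short sketch (bit layout as in the tests elsewhere in this patch, lowest bit first):

```
use arrow_buffer::{BooleanBuffer, NullBuffer};

// Validity 1, 0, 1 for three list slots
let nulls = NullBuffer::new(BooleanBuffer::new(vec![0b0000_0101_u8].into(), 0, 3));
assert_eq!(nulls.null_count(), 1);

// Each bit repeated twice gives the mask of the six child values
let expanded = nulls.expand(2);
assert_eq!(expanded.len(), 6);
assert_eq!(expanded.null_count(), 2);
```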
} } } @@ -1198,12 +1174,11 @@ impl ArrayData { /// Verifies that `child` contains no nulls not present in `mask` fn validate_non_nullable( &self, - mask: Option<&Buffer>, - offset: usize, + mask: Option<&NullBuffer>, child: &ArrayData, ) -> Result<(), ArrowError> { let mask = match mask { - Some(mask) => mask.as_ref(), + Some(mask) => mask, None => return match child.null_count() { 0 => Ok(()), _ => Err(ArrowError::InvalidArgumentError(format!( @@ -1215,23 +1190,13 @@ impl ArrayData { }; match child.nulls() { - Some(nulls) => { - let mask = BitChunks::new(mask, offset, child.len); - let nulls = BitChunks::new(nulls.validity(), nulls.offset(), child.len); - mask - .iter_padded() - .zip(nulls.iter_padded()) - .try_for_each(|(m, c)| { - if (m & !c) != 0 { - return Err(ArrowError::InvalidArgumentError(format!( - "non-nullable child of type {} contains nulls not present in parent", - child.data_type - ))) - } - Ok(()) - }) + Some(nulls) if !mask.contains(nulls) => { + Err(ArrowError::InvalidArgumentError(format!( + "non-nullable child of type {} contains nulls not present in parent", + child.data_type + ))) } - None => Ok(()), + _ => Ok(()), } } From 3fd744b6913d12a9079db0bae1199cc80caff5d9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 25 May 2023 14:14:42 -0400 Subject: [PATCH 0934/1411] Minor: Add more docstrings in arrow-flight (#4279) * Minor: Add more docstrings in arrow-flight * Apply suggestions from code review Co-authored-by: Liang-Chi Hsieh --------- Co-authored-by: Liang-Chi Hsieh --- arrow-flight/src/lib.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 4960912ef8af..f7df32a20002 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -383,6 +383,15 @@ impl TryFrom for Schema { // FlightData, FlightDescriptor, etc.. impl FlightData { + /// Create a new [`FlightData`]. + /// + /// # See Also + /// + /// See [`FlightDataEncoderBuilder`] for a higher level API to + /// convert a stream of [`RecordBatch`]es to [`FlightData`]s + /// + /// [`FlightDataEncoderBuilder`]: crate::encode::FlightDataEncoderBuilder + /// [`RecordBatch`]: arrow_array::RecordBatch pub fn new( flight_descriptor: Option, message: IpcMessage, @@ -400,6 +409,9 @@ impl FlightData { } impl FlightDescriptor { + /// Create a new opaque command [`CMD`] `FlightDescriptor` to generate a dataset. + /// + /// [`CMD`]: https://github.com/apache/arrow/blob/6bd31f37ae66bd35594b077cb2f830be57e08acd/format/Flight.proto#L224-L227 pub fn new_cmd(cmd: impl Into) -> Self { FlightDescriptor { r#type: DescriptorType::Cmd.into(), @@ -408,6 +420,9 @@ impl FlightDescriptor { } } + /// Create a new named path [`PATH`] `FlightDescriptor` that identifies a dataset + /// + /// [`PATH`]: https://github.com/apache/arrow/blob/6bd31f37ae66bd35594b077cb2f830be57e08acd/format/Flight.proto#L217-L222 pub fn new_path(path: Vec) -> Self { FlightDescriptor { r#type: DescriptorType::Path.into(), @@ -418,6 +433,8 @@ impl FlightDescriptor { } impl FlightInfo { + /// Create a new [`FlightInfo`] that describes the access + /// coordinates for retrieval of a dataset. 
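The newly documented constructors in use, as a brief hedged sketch (the command string and path segments are illustrative only):

```
use arrow_flight::FlightDescriptor;

// An opaque command descriptor (type = CMD)
let cmd = FlightDescriptor::new_cmd("SELECT * FROM flights");

// A named path descriptor (type = PATH)
let path = FlightDescriptor::new_path(vec!["catalog".into(), "table".into()]);
```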
pub fn new( message: IpcMessage, flight_descriptor: Option, From 6959b4b08a78dd924d0044c64ac3b3a9b9fd3d2e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 26 May 2023 13:05:30 +0100 Subject: [PATCH 0935/1411] Only increment metrics for data pages (#4285) --- parquet/src/column/writer/mod.rs | 6 +++--- parquet/src/file/writer.rs | 6 +----- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 137893092405..3fcfe6c1972a 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -915,11 +915,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { fn update_metrics_for_page(&mut self, page_spec: PageWriteSpec) { self.column_metrics.total_uncompressed_size += page_spec.uncompressed_size as u64; self.column_metrics.total_compressed_size += page_spec.compressed_size as u64; - self.column_metrics.total_num_values += page_spec.num_values as u64; self.column_metrics.total_bytes_written += page_spec.bytes_written; match page_spec.page_type { PageType::DATA_PAGE | PageType::DATA_PAGE_V2 => { + self.column_metrics.total_num_values += page_spec.num_values as u64; if self.column_metrics.data_page_offset.is_none() { self.column_metrics.data_page_offset = Some(page_spec.offset); } @@ -1512,7 +1512,7 @@ mod tests { metadata.encodings(), &vec![Encoding::PLAIN, Encoding::RLE, Encoding::RLE_DICTIONARY] ); - assert_eq!(metadata.num_values(), 8); // dictionary + value indexes + assert_eq!(metadata.num_values(), 4); assert_eq!(metadata.compressed_size(), 20); assert_eq!(metadata.uncompressed_size(), 20); assert_eq!(metadata.data_page_offset(), 0); @@ -1639,7 +1639,7 @@ mod tests { metadata.encodings(), &vec![Encoding::PLAIN, Encoding::RLE, Encoding::RLE_DICTIONARY] ); - assert_eq!(metadata.num_values(), 8); // dictionary + value indexes + assert_eq!(metadata.num_values(), 4); assert_eq!(metadata.compressed_size(), 20); assert_eq!(metadata.uncompressed_size(), 20); assert_eq!(metadata.data_page_offset(), 0); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 4b1c4bad92e1..c1c8db955d6e 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -26,7 +26,6 @@ use std::io::{BufWriter, IoSlice, Read}; use std::{io::Write, sync::Arc}; use thrift::protocol::{TCompactOutputProtocol, TSerializable}; -use crate::basic::PageType; use crate::column::writer::{ get_typed_column_writer_mut, ColumnCloseResult, ColumnWriterImpl, }; @@ -778,10 +777,7 @@ impl<'a, W: Write> PageWriter for SerializedPageWriter<'a, W> { spec.compressed_size = compressed_size + header_size; spec.offset = start_pos; spec.bytes_written = self.sink.bytes_written() as u64 - start_pos; - // Number of values is incremented for data pages only - if page_type == PageType::DATA_PAGE || page_type == PageType::DATA_PAGE_V2 { - spec.num_values = num_values; - } + spec.num_values = num_values; Ok(spec) } From 18f91a20ab7078f82434cab45e2ae0791d639c5e Mon Sep 17 00:00:00 2001 From: Johann Fuechsl Date: Fri, 26 May 2023 14:30:34 +0200 Subject: [PATCH 0936/1411] Set ECS specific metadata endpoint if AWS_CONTAINER_CREDENTIALS_RELATIVE_URI is set (#4288) Co-authored-by: Johann Fuechsl --- object_store/src/aws/mod.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 4c6d346603d5..fac6165b5147 100644 --- a/object_store/src/aws/mod.rs +++ 
b/object_store/src/aws/mod.rs @@ -84,7 +84,10 @@ pub type AwsCredentialProvider = Arc Date: Fri, 26 May 2023 10:50:49 -0400 Subject: [PATCH 0937/1411] Update proc-macro2 requirement from =1.0.58 to =1.0.59 (#4290) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.58...1.0.59) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index ce719d05b698..5f2f756b6237 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.58", default-features = false } +proc-macro2 = { version = "=1.0.59", default-features = false } prost-build = { version = "=0.11.9", default-features = false } tonic-build = { version = "=0.9.2", default-features = false, features = ["transport", "prost"] } From aa799f0d03c42b59d5accacf87b6bda4cd36ceae Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 26 May 2023 23:04:53 +0100 Subject: [PATCH 0938/1411] Make GenericColumnWriter Send (#4287) --- parquet/src/arrow/arrow_writer/mod.rs | 6 +++--- parquet/src/column/page.rs | 2 +- parquet/src/column/writer/mod.rs | 6 ++++++ parquet/src/encodings/encoding/mod.rs | 2 +- parquet/src/file/writer.rs | 10 +++++----- parquet/src/record/record_writer.rs | 2 +- parquet_derive/src/lib.rs | 2 +- parquet_derive_test/src/lib.rs | 3 +-- 8 files changed, 19 insertions(+), 14 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 08cfc7ea3ebf..616968bf6407 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -117,7 +117,7 @@ impl Debug for ArrowWriter { } } -impl ArrowWriter { +impl ArrowWriter { /// Try to create a new Arrow writer /// /// The writer will fail if: @@ -273,7 +273,7 @@ impl ArrowWriter { } } -impl RecordBatchWriter for ArrowWriter { +impl RecordBatchWriter for ArrowWriter { fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { self.write(batch).map_err(|e| e.into()) } @@ -284,7 +284,7 @@ impl RecordBatchWriter for ArrowWriter { } } -fn write_leaves( +fn write_leaves( row_group_writer: &mut SerializedRowGroupWriter<'_, W>, arrays: &[ArrayRef], levels: &mut [Vec], diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index bd3568d13cee..f854e5caca80 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -248,7 +248,7 @@ pub trait PageReader: Iterator> + Send { /// /// It is reasonable to assume that all pages will be written in the correct order, e.g. /// dictionary page followed by data pages, or a set of data pages, etc. -pub trait PageWriter { +pub trait PageWriter: Send { /// Writes a page into the output stream/sink. /// Returns `PageWriteSpec` that contains information about written page metrics, /// including number of bytes, size, number of values, offset, etc. 
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 3fcfe6c1972a..bf77b2b325c1 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -2174,6 +2174,12 @@ mod tests { ); } + #[test] + fn test_send() { + fn test() {} + test::>(); + } + /// Performs write-read roundtrip with randomly generated values and levels. /// `max_size` is maximum number of values or levels (if `max_def_level` > 0) to write /// for a column. diff --git a/parquet/src/encodings/encoding/mod.rs b/parquet/src/encodings/encoding/mod.rs index b7e30c4ecf08..3088f332183b 100644 --- a/parquet/src/encodings/encoding/mod.rs +++ b/parquet/src/encodings/encoding/mod.rs @@ -40,7 +40,7 @@ mod dict_encoder; /// /// Currently this allocates internal buffers for the encoded values. After done putting /// values, caller should call `flush_buffer()` to get an immutable buffer pointer. -pub trait Encoder { +pub trait Encoder: Send { /// Encodes data from `values`. fn put(&mut self, values: &[T::T]) -> Result<()>; diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index c1c8db955d6e..4f15c9f4ba02 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -159,7 +159,7 @@ impl Debug for SerializedFileWriter { } } -impl SerializedFileWriter { +impl SerializedFileWriter { /// Creates new file writer. pub fn new(buf: W, schema: TypePtr, properties: WriterPropertiesPtr) -> Result { let mut buf = TrackedWrite::new(buf); @@ -405,7 +405,7 @@ pub struct SerializedRowGroupWriter<'a, W: Write> { on_close: Option>, } -impl<'a, W: Write> SerializedRowGroupWriter<'a, W> { +impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { /// Creates a new `SerializedRowGroupWriter` with: /// /// - `schema_descr` - the schema to write @@ -699,7 +699,7 @@ impl<'a, W: Write> SerializedPageWriter<'a, W> { } } -impl<'a, W: Write> PageWriter for SerializedPageWriter<'a, W> { +impl<'a, W: Write + Send> PageWriter for SerializedPageWriter<'a, W> { fn write_page(&mut self, page: CompressedPage) -> Result { let uncompressed_size = page.uncompressed_size(); let compressed_size = page.compressed_size(); @@ -1332,7 +1332,7 @@ mod tests { compression: Compression, ) -> crate::format::FileMetaData where - W: Write, + W: Write + Send, R: ChunkReader + From + 'static, { test_roundtrip::( @@ -1352,7 +1352,7 @@ mod tests { compression: Compression, ) -> crate::format::FileMetaData where - W: Write, + W: Write + Send, R: ChunkReader + From + 'static, D: DataType, F: Fn(Row) -> D::T, diff --git a/parquet/src/record/record_writer.rs b/parquet/src/record/record_writer.rs index fe803a7ff4ef..62099051f513 100644 --- a/parquet/src/record/record_writer.rs +++ b/parquet/src/record/record_writer.rs @@ -21,7 +21,7 @@ use super::super::errors::ParquetError; use super::super::file::writer::SerializedRowGroupWriter; pub trait RecordWriter { - fn write_to_row_group( + fn write_to_row_group( &self, row_group_writer: &mut SerializedRowGroupWriter, ) -> Result<(), ParquetError>; diff --git a/parquet_derive/src/lib.rs b/parquet_derive/src/lib.rs index a09b3b65233b..0f875401f0e9 100644 --- a/parquet_derive/src/lib.rs +++ b/parquet_derive/src/lib.rs @@ -96,7 +96,7 @@ pub fn parquet_record_writer(input: proc_macro::TokenStream) -> proc_macro::Toke (quote! 
{ impl #generics ::parquet::record::RecordWriter<#derived_for #generics> for &[#derived_for #generics] { - fn write_to_row_group( + fn write_to_row_group( &self, row_group_writer: &mut ::parquet::file::writer::SerializedRowGroupWriter<'_, W> ) -> Result<(), ::parquet::errors::ParquetError> { diff --git a/parquet_derive_test/src/lib.rs b/parquet_derive_test/src/lib.rs index d2cf9efb1db6..f4f8be1e0d8c 100644 --- a/parquet_derive_test/src/lib.rs +++ b/parquet_derive_test/src/lib.rs @@ -56,8 +56,7 @@ mod tests { use std::{env, fs, io::Write, sync::Arc}; use parquet::{ - file::{properties::WriterProperties, writer::SerializedFileWriter}, - record::RecordWriter, + file::writer::SerializedFileWriter, record::RecordWriter, schema::parser::parse_message_type, }; From fb5c41860341ff5f49145de85c261d537aa20150 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Sat, 27 May 2023 10:44:28 +0200 Subject: [PATCH 0939/1411] fix: arrow_row docs.rs links (#4292) --- arrow-ord/src/sort.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 144d078d79e5..1d96532598ca 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -1061,7 +1061,7 @@ pub struct SortColumn { /// assert!(sorted_columns[0].is_null(0)); /// ``` /// -/// Note: for multi-column sorts without a limit, using the [row format](https://docs.rs/arrow/latest/arrow/row/) +/// Note: for multi-column sorts without a limit, using the [row format](https://docs.rs/arrow-row/latest/arrow_row/) /// may be significantly faster /// pub fn lexsort( @@ -1078,7 +1078,7 @@ pub fn lexsort( /// Sort elements lexicographically from a list of `ArrayRef` into an unsigned integer /// (`UInt32Array`) of indices. 
/// -/// Note: for multi-column sorts without a limit, using the [row format](https://docs.rs/arrow/latest/arrow/row/) +/// Note: for multi-column sorts without a limit, using the [row format](https://docs.rs/arrow-row/latest/arrow_row/) /// may be significantly faster pub fn lexsort_to_indices( columns: &[SortColumn], From 3e5b07aa4a9cdfa0f71cd7794c6e56532d12679e Mon Sep 17 00:00:00 2001 From: Will Jones Date: Sat, 27 May 2023 02:30:35 -0700 Subject: [PATCH 0940/1411] feat(api!): make ArrowArrayStreamReader Send (#4232) * feat(api make ArrowArrayStreamReader Send * simplify ptr handling * rename pyarrow traits to conform to guidelines * pr feedback * remove dangling Box::from_raw --- arrow-pyarrow-integration-testing/src/lib.rs | 10 +- arrow/src/ffi_stream.rs | 99 +++++++------------- arrow/src/pyarrow.rs | 93 ++++++++++++------ arrow/tests/pyarrow.rs | 2 +- 4 files changed, 103 insertions(+), 101 deletions(-) diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index af400868ffa9..730409b3777e 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -24,12 +24,12 @@ use arrow::array::new_empty_array; use pyo3::prelude::*; use pyo3::wrap_pyfunction; -use arrow::array::{Array, ArrayData, ArrayRef, Int64Array, make_array}; +use arrow::array::{make_array, Array, ArrayData, ArrayRef, Int64Array}; use arrow::compute::kernels; use arrow::datatypes::{DataType, Field, Schema}; use arrow::error::ArrowError; use arrow::ffi_stream::ArrowArrayStreamReader; -use arrow::pyarrow::{PyArrowConvert, PyArrowException, PyArrowType}; +use arrow::pyarrow::{FromPyArrow, PyArrowException, PyArrowType, ToPyArrow}; use arrow::record_batch::RecordBatch; fn to_py_err(err: ArrowError) -> PyErr { @@ -88,7 +88,8 @@ fn substring( let array = make_array(array.0); // substring - let array = kernels::substring::substring(array.as_ref(), start, None).map_err(to_py_err)?; + let array = + kernels::substring::substring(array.as_ref(), start, None).map_err(to_py_err)?; Ok(array.to_data().into()) } @@ -99,7 +100,8 @@ fn concatenate(array: PyArrowType, py: Python) -> PyResult let array = make_array(array.0); // concat - let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).map_err(to_py_err)?; + let array = + kernels::concat::concat(&[array.as_ref(), array.as_ref()]).map_err(to_py_err)?; array.to_data().to_pyarrow(py) } diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index 0e358c36a0dc..cfda4c88b4b9 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -37,25 +37,19 @@ //! let reader = Box::new(FileReader::try_new(file).unwrap()); //! //! // export it -//! let stream = Box::new(FFI_ArrowArrayStream::empty()); -//! let stream_ptr = Box::into_raw(stream) as *mut FFI_ArrowArrayStream; -//! unsafe { export_reader_into_raw(reader, stream_ptr) }; +//! let mut stream = FFI_ArrowArrayStream::empty(); +//! unsafe { export_reader_into_raw(reader, &mut stream) }; //! //! // consumed and used by something else... //! //! // import it -//! let stream_reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr).unwrap() }; +//! let stream_reader = unsafe { ArrowArrayStreamReader::from_raw(&mut stream).unwrap() }; //! let imported_schema = stream_reader.schema(); //! //! let mut produced_batches = vec![]; //! for batch in stream_reader { //! produced_batches.push(batch.unwrap()); //! } -//! -//! // (drop/release) -//! unsafe { -//! Box::from_raw(stream_ptr); -//! } //! Ok(()) //! } //! 
``` @@ -105,6 +99,8 @@ pub struct FFI_ArrowArrayStream { pub private_data: *mut c_void, } +unsafe impl Send for FFI_ArrowArrayStream {} + // callback used to drop [FFI_ArrowArrayStream] when it is exported. unsafe extern "C" fn release_stream(stream: *mut FFI_ArrowArrayStream) { if stream.is_null() { @@ -231,8 +227,7 @@ impl ExportedArrayStream { let struct_array = StructArray::from(batch); let array = FFI_ArrowArray::new(&struct_array.to_data()); - unsafe { std::ptr::copy(addr_of!(array), out, 1) }; - std::mem::forget(array); + unsafe { std::ptr::write_unaligned(out, array) }; 0 } else { let err = &next_batch.unwrap_err(); @@ -261,24 +256,21 @@ fn get_error_code(err: &ArrowError) -> i32 { /// Struct used to fetch `RecordBatch` from the C Stream Interface. /// Its main responsibility is to expose `RecordBatchReader` functionality /// that requires [FFI_ArrowArrayStream]. -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct ArrowArrayStreamReader { - stream: Arc, + stream: FFI_ArrowArrayStream, schema: SchemaRef, } /// Gets schema from a raw pointer of `FFI_ArrowArrayStream`. This is used when constructing /// `ArrowArrayStreamReader` to cache schema. fn get_stream_schema(stream_ptr: *mut FFI_ArrowArrayStream) -> Result { - let empty_schema = Arc::new(FFI_ArrowSchema::empty()); - let schema_ptr = Arc::into_raw(empty_schema) as *mut FFI_ArrowSchema; + let mut schema = FFI_ArrowSchema::empty(); - let ret_code = unsafe { (*stream_ptr).get_schema.unwrap()(stream_ptr, schema_ptr) }; - - let ffi_schema = unsafe { Arc::from_raw(schema_ptr) }; + let ret_code = unsafe { (*stream_ptr).get_schema.unwrap()(stream_ptr, &mut schema) }; if ret_code == 0 { - let schema = Schema::try_from(ffi_schema.as_ref()).unwrap(); + let schema = Schema::try_from(&schema).unwrap(); Ok(Arc::new(schema)) } else { Err(ArrowError::CDataInterface(format!( @@ -291,21 +283,16 @@ impl ArrowArrayStreamReader { /// Creates a new `ArrowArrayStreamReader` from a `FFI_ArrowArrayStream`. /// This is used to import from the C Stream Interface. #[allow(dead_code)] - pub fn try_new(stream: FFI_ArrowArrayStream) -> Result { + pub fn try_new(mut stream: FFI_ArrowArrayStream) -> Result { if stream.release.is_none() { return Err(ArrowError::CDataInterface( "input stream is already released".to_string(), )); } - let stream_ptr = Arc::into_raw(Arc::new(stream)) as *mut FFI_ArrowArrayStream; - - let schema = get_stream_schema(stream_ptr)?; + let schema = get_stream_schema(&mut stream)?; - Ok(Self { - stream: unsafe { Arc::from_raw(stream_ptr) }, - schema, - }) + Ok(Self { stream, schema }) } /// Creates a new `ArrowArrayStreamReader` from a raw pointer of `FFI_ArrowArrayStream`. 
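Because `FFI_ArrowArrayStream` and `ArrowArrayStreamReader` are now `Send`, an imported stream can be consumed on another thread. A rough sketch, assuming `reader` is a `Box<dyn RecordBatchReader + Send>` as in the module docs above:

```
use arrow::ffi_stream::{export_reader_into_raw, ArrowArrayStreamReader, FFI_ArrowArrayStream};

let mut stream = FFI_ArrowArrayStream::empty();
unsafe { export_reader_into_raw(reader, &mut stream) };

let stream_reader = ArrowArrayStreamReader::try_new(stream).unwrap();
std::thread::spawn(move || {
    for batch in stream_reader {
        println!("{} rows", batch.unwrap().num_rows());
    }
})
.join()
.unwrap();
```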
@@ -324,13 +311,12 @@ impl ArrowArrayStreamReader { } /// Get the last error from `ArrowArrayStreamReader` - fn get_stream_last_error(&self) -> Option { + fn get_stream_last_error(&mut self) -> Option { self.stream.get_last_error?; - let stream_ptr = Arc::as_ptr(&self.stream) as *mut FFI_ArrowArrayStream; - let error_str = unsafe { - let c_str = self.stream.get_last_error.unwrap()(stream_ptr) as *mut c_char; + let c_str = + self.stream.get_last_error.unwrap()(&mut self.stream) as *mut c_char; CString::from_raw(c_str).into_string() }; @@ -346,18 +332,14 @@ impl Iterator for ArrowArrayStreamReader { type Item = Result; fn next(&mut self) -> Option { - let stream_ptr = Arc::as_ptr(&self.stream) as *mut FFI_ArrowArrayStream; - - let empty_array = Arc::new(FFI_ArrowArray::empty()); - let array_ptr = Arc::into_raw(empty_array) as *mut FFI_ArrowArray; + let mut array = FFI_ArrowArray::empty(); - let ret_code = unsafe { self.stream.get_next.unwrap()(stream_ptr, array_ptr) }; + let ret_code = + unsafe { self.stream.get_next.unwrap()(&mut self.stream, &mut array) }; if ret_code == 0 { - let ffi_array = unsafe { Arc::from_raw(array_ptr) }; - // The end of stream has been reached - if ffi_array.is_released() { + if array.is_released() { return None; } @@ -365,7 +347,7 @@ impl Iterator for ArrowArrayStreamReader { let schema = FFI_ArrowSchema::try_from(schema_ref.as_ref()).ok()?; let data = ArrowArray { - array: ffi_array, + array: Arc::new(array), schema: Arc::new(schema), } .to_data() @@ -375,8 +357,6 @@ impl Iterator for ArrowArrayStreamReader { Some(Ok(record_batch)) } else { - unsafe { Arc::from_raw(array_ptr) }; - let last_error = self.get_stream_last_error(); let err = ArrowError::CDataInterface(last_error.unwrap()); Some(Err(err)) @@ -451,40 +431,33 @@ mod tests { let reader = TestRecordBatchReader::new(schema.clone(), iter); // Export a `RecordBatchReader` through `FFI_ArrowArrayStream` - let stream = Arc::new(FFI_ArrowArrayStream::empty()); - let stream_ptr = Arc::into_raw(stream) as *mut FFI_ArrowArrayStream; - - unsafe { export_reader_into_raw(reader, stream_ptr) }; - - let empty_schema = Arc::new(FFI_ArrowSchema::empty()); - let schema_ptr = Arc::into_raw(empty_schema) as *mut FFI_ArrowSchema; + let mut ffi_stream = FFI_ArrowArrayStream::empty(); + unsafe { export_reader_into_raw(reader, &mut ffi_stream) }; // Get schema from `FFI_ArrowArrayStream` - let ret_code = unsafe { get_schema(stream_ptr, schema_ptr) }; + let mut ffi_schema = FFI_ArrowSchema::empty(); + let ret_code = unsafe { get_schema(&mut ffi_stream, &mut ffi_schema) }; assert_eq!(ret_code, 0); - let ffi_schema = unsafe { Arc::from_raw(schema_ptr) }; - - let exported_schema = Schema::try_from(ffi_schema.as_ref()).unwrap(); + let exported_schema = Schema::try_from(&ffi_schema).unwrap(); assert_eq!(&exported_schema, schema.as_ref()); + let ffi_schema = Arc::new(ffi_schema); + // Get array from `FFI_ArrowArrayStream` let mut produced_batches = vec![]; loop { - let empty_array = Arc::new(FFI_ArrowArray::empty()); - let array_ptr = Arc::into_raw(empty_array.clone()) as *mut FFI_ArrowArray; - - let ret_code = unsafe { get_next(stream_ptr, array_ptr) }; + let mut ffi_array = FFI_ArrowArray::empty(); + let ret_code = unsafe { get_next(&mut ffi_stream, &mut ffi_array) }; assert_eq!(ret_code, 0); // The end of stream has been reached - let ffi_array = unsafe { Arc::from_raw(array_ptr) }; if ffi_array.is_released() { break; } let array = ArrowArray { - array: ffi_array, + array: Arc::new(ffi_array), schema: ffi_schema.clone(), } 
.to_data() @@ -496,7 +469,6 @@ mod tests { assert_eq!(produced_batches, vec![batch.clone(), batch]); - unsafe { Arc::from_raw(stream_ptr) }; Ok(()) } @@ -512,10 +484,8 @@ mod tests { let reader = TestRecordBatchReader::new(schema.clone(), iter); // Import through `FFI_ArrowArrayStream` as `ArrowArrayStreamReader` - let stream = Arc::new(FFI_ArrowArrayStream::new(reader)); - let stream_ptr = Arc::into_raw(stream) as *mut FFI_ArrowArrayStream; - let stream_reader = - unsafe { ArrowArrayStreamReader::from_raw(stream_ptr).unwrap() }; + let stream = FFI_ArrowArrayStream::new(reader); + let stream_reader = ArrowArrayStreamReader::try_new(stream).unwrap(); let imported_schema = stream_reader.schema(); assert_eq!(imported_schema, schema); @@ -527,7 +497,6 @@ mod tests { assert_eq!(produced_batches, vec![batch.clone(), batch]); - unsafe { Arc::from_raw(stream_ptr) }; Ok(()) } diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 081cc8063366..ba8d606f2e1f 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -15,13 +15,16 @@ // specific language governing permissions and limitations // under the License. -//! This module demonstrates a minimal usage of Rust's C data interface to pass -//! arrays from and to Python. +//! Pass Arrow objects from and to Python, using Arrow's +//! [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) +//! and [pyo3](https://docs.rs/pyo3/latest/pyo3/). +//! For underlying implementation, see the [ffi] module. use std::convert::{From, TryFrom}; use std::ptr::{addr_of, addr_of_mut}; use std::sync::Arc; +use pyo3::exceptions::PyValueError; use pyo3::ffi::Py_uintptr_t; use pyo3::import_exception; use pyo3::prelude::*; @@ -44,12 +47,27 @@ fn to_py_err(err: ArrowError) -> PyErr { PyArrowException::new_err(err.to_string()) } -pub trait PyArrowConvert: Sized { +pub trait FromPyArrow: Sized { fn from_pyarrow(value: &PyAny) -> PyResult; +} + +/// Create a new PyArrow object from a arrow-rs type. +pub trait ToPyArrow { fn to_pyarrow(&self, py: Python) -> PyResult; } -impl PyArrowConvert for DataType { +/// Convert an arrow-rs type into a PyArrow object. 
+pub trait IntoPyArrow { + fn into_pyarrow(self, py: Python) -> PyResult; +} + +impl IntoPyArrow for T { + fn into_pyarrow(self, py: Python) -> PyResult { + self.to_pyarrow(py) + } +} + +impl FromPyArrow for DataType { fn from_pyarrow(value: &PyAny) -> PyResult { let c_schema = FFI_ArrowSchema::empty(); let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; @@ -57,7 +75,9 @@ impl PyArrowConvert for DataType { let dtype = DataType::try_from(&c_schema).map_err(to_py_err)?; Ok(dtype) } +} +impl ToPyArrow for DataType { fn to_pyarrow(&self, py: Python) -> PyResult { let c_schema = FFI_ArrowSchema::try_from(self).map_err(to_py_err)?; let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; @@ -69,7 +89,7 @@ impl PyArrowConvert for DataType { } } -impl PyArrowConvert for Field { +impl FromPyArrow for Field { fn from_pyarrow(value: &PyAny) -> PyResult { let c_schema = FFI_ArrowSchema::empty(); let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; @@ -77,7 +97,9 @@ impl PyArrowConvert for Field { let field = Field::try_from(&c_schema).map_err(to_py_err)?; Ok(field) } +} +impl ToPyArrow for Field { fn to_pyarrow(&self, py: Python) -> PyResult { let c_schema = FFI_ArrowSchema::try_from(self).map_err(to_py_err)?; let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; @@ -89,7 +111,7 @@ impl PyArrowConvert for Field { } } -impl PyArrowConvert for Schema { +impl FromPyArrow for Schema { fn from_pyarrow(value: &PyAny) -> PyResult { let c_schema = FFI_ArrowSchema::empty(); let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; @@ -97,7 +119,9 @@ impl PyArrowConvert for Schema { let schema = Schema::try_from(&c_schema).map_err(to_py_err)?; Ok(schema) } +} +impl ToPyArrow for Schema { fn to_pyarrow(&self, py: Python) -> PyResult { let c_schema = FFI_ArrowSchema::try_from(self).map_err(to_py_err)?; let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; @@ -109,7 +133,7 @@ impl PyArrowConvert for Schema { } } -impl PyArrowConvert for ArrayData { +impl FromPyArrow for ArrayData { fn from_pyarrow(value: &PyAny) -> PyResult { // prepare a pointer to receive the Array struct let mut array = FFI_ArrowArray::empty(); @@ -131,7 +155,9 @@ impl PyArrowConvert for ArrayData { Ok(data) } +} +impl ToPyArrow for ArrayData { fn to_pyarrow(&self, py: Python) -> PyResult { let array = FFI_ArrowArray::new(self); let schema = FFI_ArrowSchema::try_from(self.data_type()).map_err(to_py_err)?; @@ -149,12 +175,14 @@ impl PyArrowConvert for ArrayData { } } -impl PyArrowConvert for Vec { +impl FromPyArrow for Vec { fn from_pyarrow(value: &PyAny) -> PyResult { let list = value.downcast::()?; - list.iter().map(|x| T::from_pyarrow(&x)).collect() + list.iter().map(|x| T::from_pyarrow(x)).collect() } +} +impl ToPyArrow for Vec { fn to_pyarrow(&self, py: Python) -> PyResult { let values = self .iter() @@ -164,7 +192,7 @@ impl PyArrowConvert for Vec { } } -impl PyArrowConvert for RecordBatch { +impl FromPyArrow for RecordBatch { fn from_pyarrow(value: &PyAny) -> PyResult { // TODO(kszucs): implement the FFI conversions in arrow-rs for RecordBatches let schema = value.getattr("schema")?; @@ -179,7 +207,9 @@ impl PyArrowConvert for RecordBatch { let batch = RecordBatch::try_new(schema, arrays).map_err(to_py_err)?; Ok(batch) } +} +impl ToPyArrow for RecordBatch { fn to_pyarrow(&self, py: Python) -> PyResult { let mut py_arrays = vec![]; @@ -203,38 +233,36 @@ impl PyArrowConvert for RecordBatch { } } -impl PyArrowConvert for ArrowArrayStreamReader { +impl FromPyArrow for ArrowArrayStreamReader { fn from_pyarrow(value: &PyAny) -> PyResult 
{ // prepare a pointer to receive the stream struct - let stream = Box::new(FFI_ArrowArrayStream::empty()); - let stream_ptr = Box::into_raw(stream) as *mut FFI_ArrowArrayStream; + let mut stream = FFI_ArrowArrayStream::empty(); + let stream_ptr = &mut stream as *mut FFI_ArrowArrayStream; // make the conversion through PyArrow's private API // this changes the pointer's memory and is thus unsafe. // In particular, `_export_to_c` can go out of bounds - let args = PyTuple::new(value.py(), &[stream_ptr as Py_uintptr_t]); + let args = PyTuple::new(value.py(), [stream_ptr as Py_uintptr_t]); value.call_method1("_export_to_c", args)?; - let stream_reader = - unsafe { ArrowArrayStreamReader::from_raw(stream_ptr).unwrap() }; - - unsafe { - drop(Box::from_raw(stream_ptr)); - } + let stream_reader = ArrowArrayStreamReader::try_new(stream) + .map_err(|err| PyValueError::new_err(err.to_string()))?; Ok(stream_reader) } +} - fn to_pyarrow(&self, py: Python) -> PyResult { - let stream = Box::new(FFI_ArrowArrayStream::empty()); - let stream_ptr = Box::into_raw(stream) as *mut FFI_ArrowArrayStream; - - unsafe { export_reader_into_raw(Box::new(self.clone()), stream_ptr) }; +impl IntoPyArrow for ArrowArrayStreamReader { + fn into_pyarrow(self, py: Python) -> PyResult { + let mut stream = FFI_ArrowArrayStream::empty(); + unsafe { export_reader_into_raw(Box::new(self), &mut stream) }; + let stream_ptr = (&mut stream) as *mut FFI_ArrowArrayStream; let module = py.import("pyarrow")?; let class = module.getattr("RecordBatchReader")?; - let args = PyTuple::new(py, &[stream_ptr as Py_uintptr_t]); + let args = PyTuple::new(py, [stream_ptr as Py_uintptr_t]); let reader = class.call_method1("_import_from_c", args)?; + Ok(PyObject::from(reader)) } } @@ -242,21 +270,24 @@ impl PyArrowConvert for ArrowArrayStreamReader { /// A newtype wrapper around a `T: PyArrowConvert` that implements /// [`FromPyObject`] and [`IntoPy`] allowing usage with pyo3 macros #[derive(Debug)] -pub struct PyArrowType(pub T); +pub struct PyArrowType(pub T); -impl<'source, T: PyArrowConvert> FromPyObject<'source> for PyArrowType { +impl<'source, T: FromPyArrow + IntoPyArrow> FromPyObject<'source> for PyArrowType { fn extract(value: &'source PyAny) -> PyResult { Ok(Self(T::from_pyarrow(value)?)) } } -impl<'a, T: PyArrowConvert> IntoPy for PyArrowType { +impl IntoPy for PyArrowType { fn into_py(self, py: Python) -> PyObject { - self.0.to_pyarrow(py).unwrap() + match self.0.into_pyarrow(py) { + Ok(obj) => obj, + Err(err) => err.to_object(py), + } } } -impl From for PyArrowType { +impl From for PyArrowType { fn from(s: T) -> Self { Self(s) } diff --git a/arrow/tests/pyarrow.rs b/arrow/tests/pyarrow.rs index 4b1226c738f5..4b6991da0063 100644 --- a/arrow/tests/pyarrow.rs +++ b/arrow/tests/pyarrow.rs @@ -16,7 +16,7 @@ // under the License. 
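For downstream users the trait split is mechanical; a hedged migration sketch (only the import and the method names change):

```
// Before: use arrow::pyarrow::PyArrowConvert;
use arrow::pyarrow::{FromPyArrow, IntoPyArrow, ToPyArrow};

// Importing still goes through from_pyarrow; exporting goes through
// to_pyarrow for types converted by reference, or into_pyarrow for owned
// values such as ArrowArrayStreamReader, which can only be exported once.
```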
use arrow::array::{ArrayRef, Int32Array, StringArray}; -use arrow::pyarrow::PyArrowConvert; +use arrow::pyarrow::{FromPyArrow, ToPyArrow}; use arrow::record_batch::RecordBatch; use pyo3::Python; use std::sync::Arc; From 770e241ceac89d693dd7577c72266f6dad48c9e2 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Sat, 27 May 2023 02:31:34 -0700 Subject: [PATCH 0941/1411] feat: support bulk deletes in object_store (#4060) * feat: support bulk deletes * fix: make NotFound reporting consistent * fix http store * fix aws support * remove unnecessary flag * fix: make AWS S3 compatible * pr feedback: use simpler API * pr feedback: test paths and ordering * Update object_store/src/limit.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * take fallible stream * final pr feedback --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/CONTRIBUTING.md | 14 ++- object_store/Cargo.toml | 2 +- object_store/src/aws/client.rs | 175 ++++++++++++++++++++++++++++++++ object_store/src/aws/mod.rs | 21 ++++ object_store/src/http/client.rs | 8 +- object_store/src/lib.rs | 123 +++++++++++++++++++++- object_store/src/limit.rs | 7 ++ object_store/src/local.rs | 9 +- 8 files changed, 345 insertions(+), 14 deletions(-) diff --git a/object_store/CONTRIBUTING.md b/object_store/CONTRIBUTING.md index 47c294022659..aeb38e13a51c 100644 --- a/object_store/CONTRIBUTING.md +++ b/object_store/CONTRIBUTING.md @@ -39,7 +39,8 @@ To test the S3 integration against [localstack](https://localstack.cloud/) First start up a container running localstack ``` -$ podman run --rm -it -e PROVIDER_OVERRIDE_S3=asf -p 4566:4566 -p 4510-4559:4510-4559 localstack/localstack +$ podman run -d -p 4566:4566 localstack/localstack:2.0 +$ podman run -d -p 1338:1338 amazon/amazon-ec2-metadata-mock:v1.9.2 --imdsv2 ``` Setup environment @@ -87,13 +88,18 @@ $ podman run -p 10000:10000 -p 10001:10001 -p 10002:10002 mcr.microsoft.com/azur Create a bucket ``` -$ podman run --net=host mcr.microsoft.com/azure-cli az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://128.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://128.0.0.1:10001/devstoreaccount1;' +$ podman run --net=host mcr.microsoft.com/azure-cli az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' ``` Run tests -``` -$ cargo test --features azure +```shell +AZURE_USE_EMULATOR=1 \ +TEST_INTEGRATION=1 \ +OBJECT_STORE_BUCKET=test-bucket \ +AZURE_STORAGE_ACCOUNT=devstoreaccount1 \ +AZURE_STORAGE_ACCESS_KEY=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== \ +cargo test --features azure ``` ### GCP diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 1fb988642dda..28bf29f7f1e0 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -45,7 +45,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.21", default-features = false, features = ["std"], optional = true } hyper = { version = "0.14", default-features = false, optional = true } -quick-xml = { version = "0.28.0", features = 
["serialize"], optional = true } +quick-xml = { version = "0.28.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index cfce35254d65..0c2493651000 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -32,7 +32,9 @@ use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; +use itertools::Itertools; use percent_encoding::{utf8_percent_encode, PercentEncode}; +use quick_xml::events::{self as xml_events}; use reqwest::{header::CONTENT_TYPE, Client as ReqwestClient, Method, Response}; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; @@ -66,6 +68,29 @@ pub(crate) enum Error { path: String, }, + #[snafu(display("Error performing DeleteObjects request: {}", source))] + DeleteObjectsRequest { source: crate::client::retry::Error }, + + #[snafu(display( + "DeleteObjects request failed for key {}: {} (code: {})", + path, + message, + code + ))] + DeleteFailed { + path: String, + code: String, + message: String, + }, + + #[snafu(display("Error getting DeleteObjects response body: {}", source))] + DeleteObjectsResponse { source: reqwest::Error }, + + #[snafu(display("Got invalid DeleteObjects response: {}", source))] + InvalidDeleteObjectsResponse { + source: Box, + }, + #[snafu(display("Error performing copy request {}: {}", path, source))] CopyRequest { source: crate::client::retry::Error, @@ -129,6 +154,44 @@ struct MultipartPart { part_number: usize, } +#[derive(Deserialize)] +#[serde(rename_all = "PascalCase", rename = "DeleteResult")] +struct BatchDeleteResponse { + #[serde(rename = "$value")] + content: Vec, +} + +#[derive(Deserialize)] +enum DeleteObjectResult { + Deleted(DeletedObject), + Error(DeleteError), +} + +#[derive(Deserialize)] +#[serde(rename_all = "PascalCase", rename = "Deleted")] +struct DeletedObject { + #[allow(dead_code)] + key: String, +} + +#[derive(Deserialize)] +#[serde(rename_all = "PascalCase", rename = "Error")] +struct DeleteError { + key: String, + code: String, + message: String, +} + +impl From for Error { + fn from(err: DeleteError) -> Self { + Self::DeleteFailed { + path: err.key, + code: err.code, + message: err.message, + } + } +} + #[derive(Debug)] pub struct S3Config { pub region: String, @@ -243,6 +306,118 @@ impl S3Client { Ok(()) } + /// Make an S3 Delete Objects request + /// + /// Produces a vector of results, one for each path in the input vector. If + /// the delete was successful, the path is returned in the `Ok` variant. If + /// there was an error for a certain path, the error will be returned in the + /// vector. If there was an issue with making the overall request, an error + /// will be returned at the top level. 
+ pub async fn bulk_delete_request( + &self, + paths: Vec, + ) -> Result>> { + if paths.is_empty() { + return Ok(Vec::new()); + } + + let credential = self.get_credential().await?; + let url = format!("{}?delete", self.config.bucket_endpoint); + + let mut buffer = Vec::new(); + let mut writer = quick_xml::Writer::new(&mut buffer); + writer + .write_event(xml_events::Event::Start( + xml_events::BytesStart::new("Delete").with_attributes([( + "xmlns", + "http://s3.amazonaws.com/doc/2006-03-01/", + )]), + )) + .unwrap(); + for path in &paths { + // {path} + writer + .write_event(xml_events::Event::Start(xml_events::BytesStart::new( + "Object", + ))) + .unwrap(); + writer + .write_event(xml_events::Event::Start(xml_events::BytesStart::new("Key"))) + .unwrap(); + writer + .write_event(xml_events::Event::Text(xml_events::BytesText::new( + path.as_ref(), + ))) + .map_err(|err| crate::Error::Generic { + store: STORE, + source: Box::new(err), + })?; + writer + .write_event(xml_events::Event::End(xml_events::BytesEnd::new("Key"))) + .unwrap(); + writer + .write_event(xml_events::Event::End(xml_events::BytesEnd::new("Object"))) + .unwrap(); + } + writer + .write_event(xml_events::Event::End(xml_events::BytesEnd::new("Delete"))) + .unwrap(); + + let body = Bytes::from(buffer); + + let mut builder = self.client.request(Method::POST, url); + + // Compute checksum - S3 *requires* this for DeleteObjects requests, so we default to + // their algorithm if the user hasn't specified one. + let checksum = self.config().checksum.unwrap_or(Checksum::SHA256); + let digest = checksum.digest(&body); + builder = builder.header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); + let payload_sha256 = if checksum == Checksum::SHA256 { + Some(digest) + } else { + None + }; + + let response = builder + .header(CONTENT_TYPE, "application/xml") + .body(body) + .with_aws_sigv4( + credential.as_ref(), + &self.config.region, + "s3", + self.config.sign_payload, + payload_sha256.as_deref(), + ) + .send_retry(&self.config.retry_config) + .await + .context(DeleteObjectsRequestSnafu {})? + .bytes() + .await + .context(DeleteObjectsResponseSnafu {})?; + + let response: BatchDeleteResponse = quick_xml::de::from_reader(response.reader()) + .map_err(|err| Error::InvalidDeleteObjectsResponse { + source: Box::new(err), + })?; + + // Assume all were ok, then fill in errors. This guarantees output order + // matches input order. 
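// [Editorial note] Worked example of the ordering guarantee: for input paths
// [a, b, c], if the service reports a delete error only for `b`, the returned
// vector is [Ok(a), Err(..), Ok(c)], with each error slotted back into the
// position of the path it refers to.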
+ let mut results: Vec> = paths.iter().cloned().map(Ok).collect(); + for content in response.content.into_iter() { + if let DeleteObjectResult::Error(error) = content { + let path = Path::parse(&error.key).map_err(|err| { + Error::InvalidDeleteObjectsResponse { + source: Box::new(err), + } + })?; + let i = paths.iter().find_position(|&p| p == &path).unwrap().0; + results[i] = Err(Error::from(error).into()); + } + } + + Ok(results) + } + /// Make an S3 Copy request pub async fn copy_request(&self, from: &Path, to: &Path) -> Result<()> { let credential = self.get_credential().await?; diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index fac6165b5147..3696e4ad4eb2 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -34,6 +34,7 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; +use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; use serde::{Deserialize, Serialize}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; @@ -252,6 +253,26 @@ impl ObjectStore for AmazonS3 { self.client.delete_request(location, &()).await } + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + locations + .try_chunks(1_000) + .map(move |locations| async { + // Early return the error. We ignore the paths that have already been + // collected into the chunk. + let locations = locations.map_err(|e| e.1)?; + self.client + .bulk_delete_request(locations) + .await + .map(futures::stream::iter) + }) + .buffered(20) + .try_flatten() + .boxed() + } + async fn list( &self, prefix: Option<&Path>, diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index 6feacbba6c2d..1d3df34db9d1 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -225,7 +225,13 @@ impl Client { .delete(url) .send_retry(&self.retry_config) .await - .context(RequestSnafu)?; + .map_err(|source| match source.status() { + Some(StatusCode::NOT_FOUND) => crate::Error::NotFound { + source: Box::new(source), + path: path.to_string(), + }, + _ => Error::Request { source }.into(), + })?; Ok(()) } diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 7116a8732ba6..c5bf40cc4882 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -386,6 +386,63 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Delete the object at the specified location. async fn delete(&self, location: &Path) -> Result<()>; + /// Delete all the objects at the specified locations + /// + /// When supported, this method will use bulk operations that delete more + /// than one object per a request. The default implementation will call + /// the single object delete method for each location, but with up to 10 + /// concurrent requests. + /// + /// The returned stream yields the results of the delete operations in the + /// same order as the input locations. However, some errors will be from + /// an overall call to a bulk delete operation, and not from a specific + /// location. + /// + /// If the object did not exist, the result may be an error or a success, + /// depending on the behavior of the underlying store. For example, local + /// filesystems, GCP, and Azure return an error, while S3 and in-memory will + /// return Ok. If it is an error, it will be [`Error::NotFound`]. 
+ /// + /// ``` + /// # use object_store::local::LocalFileSystem; + /// # async fn example() -> Result<(), Box> { + /// # let root = tempfile::TempDir::new().unwrap(); + /// # let store = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + /// use object_store::{ObjectStore, ObjectMeta}; + /// use object_store::path::Path; + /// use futures::{StreamExt, TryStreamExt}; + /// use bytes::Bytes; + /// + /// // Create two objects + /// store.put(&Path::from("foo"), Bytes::from("foo")).await?; + /// store.put(&Path::from("bar"), Bytes::from("bar")).await?; + /// + /// // List object + /// let locations = store.list(None).await? + /// .map(|meta: Result| meta.map(|m| m.location)) + /// .boxed(); + /// + /// // Delete them + /// store.delete_stream(locations).try_collect::>().await?; + /// # Ok(()) + /// # } + /// # let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + /// # rt.block_on(example()).unwrap(); + /// ``` + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + locations + .map(|location| async { + let location = location?; + self.delete(&location).await?; + Ok(location) + }) + .buffered(10) + .boxed() + } + /// List all the objects with the given prefix. /// /// Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of `foo/bar/x` but not of @@ -515,6 +572,13 @@ impl ObjectStore for Box { self.as_ref().delete(location).await } + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + self.as_ref().delete_stream(locations) + } + async fn list( &self, prefix: Option<&Path>, @@ -1119,6 +1183,49 @@ mod tests { assert_eq!(actual, expected, "{prefix:?} - {offset:?}"); } + // Test bulk delete + let paths = vec![ + Path::from("a/a.file"), + Path::from("a/a/b.file"), + Path::from("aa/a.file"), + Path::from("does_not_exist"), + Path::from("I'm a < & weird path"), + Path::from("ab/a.file"), + Path::from("a/😀.file"), + ]; + + storage.put(&paths[4], "foo".into()).await.unwrap(); + + let out_paths = storage + .delete_stream(futures::stream::iter(paths.clone()).map(Ok).boxed()) + .collect::>() + .await; + + assert_eq!(out_paths.len(), paths.len()); + + let expect_errors = [3]; + + for (i, input_path) in paths.iter().enumerate() { + let err = storage.head(input_path).await.unwrap_err(); + assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err); + + if expect_errors.contains(&i) { + // Some object stores will report NotFound, but others (such as S3) will + // report success regardless. + match &out_paths[i] { + Err(Error::NotFound { path: out_path, .. 
}) => { + assert!(out_path.ends_with(&input_path.to_string())); + } + Ok(out_path) => { + assert_eq!(out_path, input_path); + } + _ => panic!("unexpected error"), + } + } else { + assert_eq!(out_paths[i].as_ref().unwrap(), input_path); + } + } + delete_fixtures(storage).await; } @@ -1471,11 +1578,17 @@ mod tests { } async fn delete_fixtures(storage: &DynObjectStore) { - let paths = flatten_list_stream(storage, None).await.unwrap(); - - for f in &paths { - storage.delete(f).await.unwrap(); - } + let paths = storage + .list(None) + .await + .unwrap() + .map_ok(|meta| meta.location) + .boxed(); + storage + .delete_stream(paths) + .try_collect::>() + .await + .unwrap(); } /// Test that the returned stream does not borrow the lifetime of Path diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index e0091115d8f6..630fd145b72c 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -148,6 +148,13 @@ impl ObjectStore for LimitStore { self.inner.delete(location).await } + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + self.inner.delete_stream(locations) + } + async fn list( &self, prefix: Option<&Path>, diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 52719f1cb562..bbd54db2ea19 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -444,9 +444,12 @@ impl ObjectStore for LocalFileSystem { async fn delete(&self, location: &Path) -> Result<()> { let path = self.config.path_to_filesystem(location)?; - maybe_spawn_blocking(move || { - std::fs::remove_file(&path).context(UnableToDeleteFileSnafu { path })?; - Ok(()) + maybe_spawn_blocking(move || match std::fs::remove_file(&path) { + Ok(_) => Ok(()), + Err(e) => Err(match e.kind() { + ErrorKind::NotFound => Error::NotFound { path, source: e }.into(), + _ => Error::UnableToDeleteFile { path, source: e }.into(), + }), }) .await } From 77aa8f5b2645a91724048f5c1d644c6b52880028 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Sat, 27 May 2023 12:14:39 +0200 Subject: [PATCH 0942/1411] feat(flight): add sql-info helpers (#4266) * feat: baseline sql-info helpers * chore: clippy * chore: add license to files * docs: add some basic docstrings * Update arrow-flight/src/sql/sql_info.rs Co-authored-by: Andrew Lamb * fix: move flight info * test: add simple filter test * fix: docs link * fix: one more docs link * fix: one more one more docs link --------- Co-authored-by: Andrew Lamb --- arrow-flight/Cargo.toml | 8 +- arrow-flight/examples/flight_sql_server.rs | 104 ++++-- arrow-flight/src/sql/mod.rs | 4 + arrow-flight/src/sql/server.rs | 19 +- arrow-flight/src/sql/sql_info.rs | 376 +++++++++++++++++++++ 5 files changed, 464 insertions(+), 47 deletions(-) create mode 100644 arrow-flight/src/sql/sql_info.rs diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index e22642b2a727..206cc6505c4b 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -31,15 +31,17 @@ arrow-array = { workspace = true } arrow-buffer = { workspace = true } # Cast is needed to work around https://github.com/apache/arrow-rs/issues/3389 arrow-cast = { workspace = true } +arrow-data = { workspace = true } arrow-ipc = { workspace = true } arrow-schema = { workspace = true } base64 = { version = "0.21", default-features = false, features = ["std"] } -tonic = { version = "0.9", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } 
+futures = { version = "0.3", default-features = false, features = ["alloc"] } +once_cell = { version = "1", optional = true } paste = { version = "1.0" } prost = { version = "0.11", default-features = false, features = ["prost-derive"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } -futures = { version = "0.3", default-features = false, features = ["alloc"] } +tonic = { version = "0.9", default-features = false, features = ["transport", "codegen", "prost"] } # CLI-related dependencies clap = { version = "4.1", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } @@ -51,7 +53,7 @@ all-features = true [features] default = [] -flight-sql-experimental = [] +flight-sql-experimental = ["once_cell"] tls = ["tonic/tls"] # Enable CLI tools diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 01632285cf66..27ae5d85434c 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -15,22 +15,10 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::builder::StringBuilder; -use arrow_array::{ArrayRef, RecordBatch}; -use arrow_flight::sql::{ - ActionBeginSavepointRequest, ActionBeginSavepointResult, - ActionBeginTransactionResult, ActionCancelQueryRequest, ActionCancelQueryResult, - ActionCreatePreparedStatementResult, ActionEndSavepointRequest, - ActionEndTransactionRequest, Any, CommandStatementSubstraitPlan, ProstMessageExt, - SqlInfo, -}; -use arrow_flight::{ - Action, FlightData, FlightEndpoint, HandshakeRequest, HandshakeResponse, IpcMessage, - Location, SchemaAsIpc, Ticket, -}; use base64::prelude::BASE64_STANDARD; use base64::Engine; -use futures::{stream, Stream}; +use futures::{stream, Stream, TryStreamExt}; +use once_cell::sync::Lazy; use prost::Message; use std::pin::Pin; use std::sync::Arc; @@ -38,22 +26,30 @@ use tonic::transport::Server; use tonic::transport::{Certificate, Identity, ServerTlsConfig}; use tonic::{Request, Response, Status, Streaming}; +use arrow_array::builder::StringBuilder; +use arrow_array::{ArrayRef, RecordBatch}; +use arrow_flight::encode::FlightDataEncoderBuilder; use arrow_flight::flight_descriptor::DescriptorType; +use arrow_flight::sql::sql_info::SqlInfoList; +use arrow_flight::sql::{ + server::FlightSqlService, ActionBeginSavepointRequest, ActionBeginSavepointResult, + ActionBeginTransactionRequest, ActionBeginTransactionResult, + ActionCancelQueryRequest, ActionCancelQueryResult, + ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, + ActionCreatePreparedStatementResult, ActionCreatePreparedSubstraitPlanRequest, + ActionEndSavepointRequest, ActionEndTransactionRequest, Any, CommandGetCatalogs, + CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, + CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, + CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, + CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, + CommandStatementSubstraitPlan, CommandStatementUpdate, ProstMessageExt, SqlInfo, + TicketStatementQuery, +}; use arrow_flight::utils::batches_to_flight_data; use arrow_flight::{ - flight_service_server::FlightService, - flight_service_server::FlightServiceServer, - sql::{ - server::FlightSqlService, ActionBeginTransactionRequest, - ActionClosePreparedStatementRequest, 
ActionCreatePreparedStatementRequest, - ActionCreatePreparedSubstraitPlanRequest, CommandGetCatalogs, - CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, - CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, - CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, - CommandPreparedStatementQuery, CommandPreparedStatementUpdate, - CommandStatementQuery, CommandStatementUpdate, TicketStatementQuery, - }, - FlightDescriptor, FlightInfo, + flight_service_server::FlightService, flight_service_server::FlightServiceServer, + Action, FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, + HandshakeResponse, IpcMessage, Location, SchemaAsIpc, Ticket, }; use arrow_ipc::writer::IpcWriteOptions; use arrow_schema::{ArrowError, DataType, Field, Schema}; @@ -68,6 +64,15 @@ const FAKE_TOKEN: &str = "uuid_token"; const FAKE_HANDLE: &str = "uuid_handle"; const FAKE_UPDATE_RESULT: i64 = 1; +static INSTANCE_SQL_INFO: Lazy = Lazy::new(|| { + SqlInfoList::new() + // Server information + .with_sql_info(SqlInfo::FlightSqlServerName, "Example Flight SQL Server") + .with_sql_info(SqlInfo::FlightSqlServerVersion, "1") + // 1.3 comes from https://github.com/apache/arrow/blob/f9324b79bf4fc1ec7e97b32e3cce16e75ef0f5e3/format/Schema.fbs#L24 + .with_sql_info(SqlInfo::FlightSqlServerArrowVersion, "1.3") +}); + #[derive(Clone)] pub struct FlightSqlServiceImpl {} @@ -283,12 +288,38 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn get_flight_info_sql_info( &self, - _query: CommandGetSqlInfo, - _request: Request, + query: CommandGetSqlInfo, + request: Request, ) -> Result, Status> { - Err(Status::unimplemented( - "get_flight_info_sql_info not implemented", - )) + let flight_descriptor = request.into_inner(); + let ticket = Ticket { + ticket: query.encode_to_vec().into(), + }; + + let options = IpcWriteOptions::default(); + + // encode the schema into the correct form + let IpcMessage(schema) = SchemaAsIpc::new(SqlInfoList::schema(), &options) + .try_into() + .expect("valid sql_info schema"); + + let endpoint = vec![FlightEndpoint { + ticket: Some(ticket), + // we assume users wnating to use this helper would reasonably + // never need to be distributed across multile endpoints? 
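// [Editorial note] Per the Flight specification, an empty `location` list means
// the ticket can only be redeemed against the same service that returned this
// FlightInfo, which is exactly what this single-node example server intends.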
+ location: vec![], + }]; + + let flight_info = FlightInfo { + schema, + flight_descriptor: Some(flight_descriptor), + endpoint, + total_records: -1, + total_bytes: -1, + ordered: false, + }; + + Ok(tonic::Response::new(flight_info)) } async fn get_flight_info_primary_keys( @@ -394,10 +425,15 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_get_sql_info( &self, - _query: CommandGetSqlInfo, + query: CommandGetSqlInfo, _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("do_get_sql_info not implemented")) + let batch = INSTANCE_SQL_INFO.filter(&query.info).encode(); + let stream = FlightDataEncoderBuilder::new() + .with_schema(Arc::new(SqlInfoList::schema().clone())) + .build(futures::stream::once(async { batch })) + .map_err(Status::from); + Ok(Response::new(Box::pin(stream))) } async fn do_get_primary_keys( diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index 797ddfc9e4a6..2c193b78bed1 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -84,6 +84,7 @@ pub use gen::SqlSupportedPositionedCommands; pub use gen::SqlSupportedResultSetConcurrency; pub use gen::SqlSupportedResultSetType; pub use gen::SqlSupportedSubqueries; +pub use gen::SqlSupportedTransaction; pub use gen::SqlSupportedTransactions; pub use gen::SqlSupportedUnions; pub use gen::SqlSupportsConvert; @@ -92,8 +93,11 @@ pub use gen::SupportedSqlGrammar; pub use gen::TicketStatementQuery; pub use gen::UpdateDeleteRules; +pub use sql_info::SqlInfoList; + pub mod client; pub mod server; +pub mod sql_info; /// ProstMessageExt are useful utility methods for prost::Message types pub trait ProstMessageExt: prost::Message + Default { diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index 89eb70e23b35..a33b5b92d01e 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -19,30 +19,29 @@ use std::pin::Pin; -use crate::sql::{Any, Command}; use futures::Stream; use prost::Message; use tonic::{Request, Response, Status, Streaming}; use super::{ - super::{ - flight_service_server::FlightService, Action, ActionType, Criteria, Empty, - FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, - PutResult, SchemaResult, Ticket, - }, ActionBeginSavepointRequest, ActionBeginSavepointResult, ActionBeginTransactionRequest, ActionBeginTransactionResult, ActionCancelQueryRequest, ActionCancelQueryResult, ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, ActionCreatePreparedSubstraitPlanRequest, - ActionEndSavepointRequest, ActionEndTransactionRequest, CommandGetCatalogs, - CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, - CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, - CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, + ActionEndSavepointRequest, ActionEndTransactionRequest, Any, Command, + CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, + CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, + CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, CommandStatementSubstraitPlan, CommandStatementUpdate, DoPutUpdateResult, ProstMessageExt, SqlInfo, TicketStatementQuery, }; +use crate::{ + flight_service_server::FlightService, Action, ActionType, Criteria, Empty, + FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, 
HandshakeResponse, + PutResult, SchemaResult, Ticket, +}; pub(crate) static CREATE_PREPARED_STATEMENT: &str = "CreatePreparedStatement"; pub(crate) static CLOSE_PREPARED_STATEMENT: &str = "ClosePreparedStatement"; diff --git a/arrow-flight/src/sql/sql_info.rs b/arrow-flight/src/sql/sql_info.rs new file mode 100644 index 000000000000..e0b7df70cb37 --- /dev/null +++ b/arrow-flight/src/sql/sql_info.rs @@ -0,0 +1,376 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Auxiliary module to handle [`crate::sql::CommandGetSqlInfo`] queries. +//! +//! [`crate::sql::CommandGetSqlInfo`] represents metadata requests againsts the Flight SQL server. +//! Via this mechanism, the server can communicate supported capabilities to generic +//! Flight SQL clients. +//! +//! Servers construct a [`SqlInfoList`] by adding infos via `with_sql_info`. +//! The availabe configuration options are defined in the [Flight SQL protos][protos]. +//! +//! [protos]: https://github.com/apache/arrow/blob/6d3d2fca2c9693231fa1e52c142ceef563fc23f9/format/FlightSql.proto#L71-L820 + +use std::{borrow::Cow, collections::BTreeMap, sync::Arc}; + +use arrow_array::array::{Array, UnionArray}; +use arrow_array::builder::{ + ArrayBuilder, BooleanBuilder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, + StringBuilder, UInt32Builder, +}; +use arrow_array::RecordBatch; +use arrow_data::ArrayData; +use arrow_schema::{DataType, Field, Schema, UnionFields, UnionMode}; +use once_cell::sync::Lazy; + +use super::SqlInfo; +use crate::error::Result; + +/// Represents a dynamic value +#[derive(Debug, Clone, PartialEq)] +pub enum SqlInfoValue { + String(String), + Bool(bool), + BigInt(i64), + Bitmask(i32), + StringList(Vec), + // TODO support more exotic metadata that requires the map of lists + //ListMap(BTreeMap>), +} + +impl From<&str> for SqlInfoValue { + fn from(value: &str) -> Self { + Self::String(value.to_string()) + } +} + +impl From for SqlInfoValue { + fn from(value: bool) -> Self { + Self::Bool(value) + } +} + +impl From for SqlInfoValue { + fn from(value: i32) -> Self { + Self::Bitmask(value) + } +} + +impl From for SqlInfoValue { + fn from(value: i64) -> Self { + Self::BigInt(value) + } +} + +impl From<&[&str]> for SqlInfoValue { + fn from(values: &[&str]) -> Self { + let values = values.iter().map(|s| s.to_string()).collect(); + Self::StringList(values) + } +} + +/// Something that can be converted into u32 (the represenation of a [`SqlInfo`] name) +pub trait SqlInfoName { + fn as_u32(&self) -> u32; +} + +impl SqlInfoName for SqlInfo { + fn as_u32(&self) -> u32 { + // SqlInfos are u32 in the flight spec, but for some reason + // SqlInfo repr is an i32, so convert between them + u32::try_from(i32::from(*self)).expect("SqlInfo fit into u32") + } +} + +// Allow passing u32 
directly into to with_sql_info +impl SqlInfoName for u32 { + fn as_u32(&self) -> u32 { + *self + } +} + +/// Handles creating the dense [`UnionArray`] described by [flightsql] +/// +/// +/// NOT YET COMPLETE: The int32_to_int32_list_map +/// +/// ```text +/// * value: dense_union< +/// * string_value: utf8, +/// * bool_value: bool, +/// * bigint_value: int64, +/// * int32_bitmask: int32, +/// * string_list: list +/// * int32_to_int32_list_map: map> +/// * > +/// ``` +///[flightsql]: (https://github.com/apache/arrow/blob/f9324b79bf4fc1ec7e97b32e3cce16e75ef0f5e3/format/FlightSql.proto#L32-L43 +struct SqlInfoUnionBuilder { + // Values for each child type + string_values: StringBuilder, + bool_values: BooleanBuilder, + bigint_values: Int64Builder, + int32_bitmask_values: Int32Builder, + string_list_values: ListBuilder, + + /// incrementally build types/offset of the dense union, + /// + /// See [Union Spec] for details. + /// + /// [Union Spec]: https://arrow.apache.org/docs/format/Columnar.html#dense-union + type_ids: Int8Builder, + offsets: Int32Builder, +} + +/// [`DataType`] for the output union array +static UNION_TYPE: Lazy = Lazy::new(|| { + let fields = vec![ + Field::new("string_value", DataType::Utf8, false), + Field::new("bool_value", DataType::Boolean, false), + Field::new("bigint_value", DataType::Int64, false), + Field::new("int32_bitmask", DataType::Int32, false), + // treat list as nullable b/c that is what the builders make + Field::new( + "string_list", + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + true, + ), + ]; + + // create "type ids", one for each type, assume they go from 0 .. num_fields + let type_ids: Vec = (0..fields.len()).map(|v| v as i8).collect(); + + DataType::Union(UnionFields::new(type_ids, fields), UnionMode::Dense) +}); + +impl SqlInfoUnionBuilder { + pub fn new() -> Self { + Self { + string_values: StringBuilder::new(), + bool_values: BooleanBuilder::new(), + bigint_values: Int64Builder::new(), + int32_bitmask_values: Int32Builder::new(), + string_list_values: ListBuilder::new(StringBuilder::new()), + type_ids: Int8Builder::new(), + offsets: Int32Builder::new(), + } + } + + /// Returns the DataType created by this builder + pub fn schema() -> &'static DataType { + &UNION_TYPE + } + + /// Append the specified value to this builder + pub fn append_value(&mut self, v: &SqlInfoValue) { + // typeid is which child and len is the child array's length + // *after* adding the value + let (type_id, len) = match v { + SqlInfoValue::String(v) => { + self.string_values.append_value(v); + (0, self.string_values.len()) + } + SqlInfoValue::Bool(v) => { + self.bool_values.append_value(*v); + (1, self.bool_values.len()) + } + SqlInfoValue::BigInt(v) => { + self.bigint_values.append_value(*v); + (2, self.bigint_values.len()) + } + SqlInfoValue::Bitmask(v) => { + self.int32_bitmask_values.append_value(*v); + (3, self.int32_bitmask_values.len()) + } + SqlInfoValue::StringList(values) => { + // build list + for v in values { + self.string_list_values.values().append_value(v); + } + // complete the list + self.string_list_values.append(true); + (4, self.string_list_values.len()) + } + }; + + self.type_ids.append_value(type_id); + let len = i32::try_from(len).expect("offset fit in i32"); + self.offsets.append_value(len - 1); + } + + /// Complete the construction and build the [`UnionArray`] + pub fn finish(self) -> UnionArray { + let Self { + mut string_values, + mut bool_values, + mut bigint_values, + mut int32_bitmask_values, + mut string_list_values, + 
mut type_ids, + mut offsets, + } = self; + let type_ids = type_ids.finish(); + let offsets = offsets.finish(); + + // form the correct ArrayData + + let len = offsets.len(); + let null_bit_buffer = None; + let offset = 0; + + let buffers = vec![ + type_ids.into_data().buffers()[0].clone(), + offsets.into_data().buffers()[0].clone(), + ]; + + let child_data = vec![ + string_values.finish().into_data(), + bool_values.finish().into_data(), + bigint_values.finish().into_data(), + int32_bitmask_values.finish().into_data(), + string_list_values.finish().into_data(), + ]; + + let data = ArrayData::try_new( + UNION_TYPE.clone(), + len, + null_bit_buffer, + offset, + buffers, + child_data, + ) + .expect("Correctly created UnionArray"); + + UnionArray::from(data) + } +} + +/// A list of SQL info names and valies +#[derive(Debug, Clone, PartialEq)] +pub struct SqlInfoList { + /// Use BTreeMap to ensure the values are sorted by value as + /// to make output consistent + /// + /// Use u32 to support "custom" sql info values that are not + /// part of the SqlInfo enum + infos: BTreeMap, +} + +impl Default for SqlInfoList { + fn default() -> Self { + Self::new() + } +} + +impl SqlInfoList { + pub fn new() -> Self { + Self { + infos: BTreeMap::new(), + } + } + + /// register the specific sql metadata item + pub fn with_sql_info( + mut self, + name: impl SqlInfoName, + value: impl Into, + ) -> Self { + self.infos.insert(name.as_u32(), value.into()); + self + } + + /// Filter this info list keeping only the info values specified + /// in `infos`. + /// + /// Returns self if infos is empty (no filtering) + pub fn filter(&self, info: &[u32]) -> Cow<'_, Self> { + if info.is_empty() { + Cow::Borrowed(self) + } else { + let infos: BTreeMap<_, _> = info + .iter() + .filter_map(|name| self.infos.get(name).map(|v| (*name, v.clone()))) + .collect(); + Cow::Owned(Self { infos }) + } + } + + /// Encode the contents of this info list according to the FlightSQL spec + pub fn encode(&self) -> Result { + let mut name_builder = UInt32Builder::new(); + let mut value_builder = SqlInfoUnionBuilder::new(); + + for (&name, value) in self.infos.iter() { + name_builder.append_value(name); + value_builder.append_value(value) + } + + let batch = RecordBatch::try_from_iter(vec![ + ("info_name", Arc::new(name_builder.finish()) as _), + ("value", Arc::new(value_builder.finish()) as _), + ])?; + Ok(batch) + } + + /// Return the [`Schema`] for a GetSchema RPC call with [`crate::sql::CommandGetSqlInfo`] + pub fn schema() -> &'static Schema { + // It is always the same + &SQL_INFO_SCHEMA + } +} + +// The schema produced by [`SqlInfoList`] +static SQL_INFO_SCHEMA: Lazy = Lazy::new(|| { + Schema::new(vec![ + Field::new("info_name", DataType::UInt32, false), + Field::new("value", SqlInfoUnionBuilder::schema().clone(), false), + ]) +}); + +#[cfg(test)] +mod tests { + use super::SqlInfoList; + use crate::sql::{SqlInfo, SqlSupportedTransaction}; + + #[test] + fn test_filter_sql_infos() { + let info_list = SqlInfoList::new() + .with_sql_info(SqlInfo::FlightSqlServerName, "server name") + .with_sql_info( + SqlInfo::FlightSqlServerTransaction, + SqlSupportedTransaction::Transaction as i32, + ); + + let batch = info_list.encode().unwrap(); + assert_eq!(batch.num_rows(), 2); + + let batch = info_list + .filter(&[SqlInfo::FlightSqlServerTransaction as u32]) + .encode() + .unwrap(); + let ref_batch = SqlInfoList::new() + .with_sql_info( + SqlInfo::FlightSqlServerTransaction, + SqlSupportedTransaction::Transaction as i32, + ) + .encode() + .unwrap(); + 
+ assert_eq!(batch, ref_batch); + } +} From 8311818c25a5b056c696ae2edb111be0cdcf09b9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 29 May 2023 07:08:07 -0400 Subject: [PATCH 0943/1411] Improve docs and tests for `SqlInfoList (#4293) * Improve docs and tests for SqlInfoList * Add an example/ * Update arrow-flight/src/sql/sql_info.rs Co-authored-by: Liang-Chi Hsieh --------- Co-authored-by: Liang-Chi Hsieh --- arrow-flight/src/sql/mod.rs | 1 + arrow-flight/src/sql/sql_info.rs | 95 ++++++++++++++++++++++++++++---- 2 files changed, 84 insertions(+), 12 deletions(-) diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index 2c193b78bed1..74d3176c67b1 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -27,6 +27,7 @@ //! 2. Helpers for encoding and decoding FlightSQL messages: [`Any`] and [`Command`] //! 3. A [`FlightSqlServiceClient`] for interacting with FlightSQL servers. //! 4. A [`FlightSqlService`] to help building FlightSQL servers from [`FlightService`]. +//! 5. Structures to build responses for FlightSQL metadata APIs: [`SqlInfoList`] //! //! [Flight SQL]: https://arrow.apache.org/docs/format/FlightSql.html //! [Apache Arrow]: https://arrow.apache.org diff --git a/arrow-flight/src/sql/sql_info.rs b/arrow-flight/src/sql/sql_info.rs index e0b7df70cb37..717f1393c879 100644 --- a/arrow-flight/src/sql/sql_info.rs +++ b/arrow-flight/src/sql/sql_info.rs @@ -15,16 +15,9 @@ // specific language governing permissions and limitations // under the License. -//! Auxiliary module to handle [`crate::sql::CommandGetSqlInfo`] queries. +//! [`SqlInfoList`] for building responses to [`CommandGetSqlInfo`] queries. //! -//! [`crate::sql::CommandGetSqlInfo`] represents metadata requests againsts the Flight SQL server. -//! Via this mechanism, the server can communicate supported capabilities to generic -//! Flight SQL clients. -//! -//! Servers construct a [`SqlInfoList`] by adding infos via `with_sql_info`. -//! The availabe configuration options are defined in the [Flight SQL protos][protos]. -//! -//! [protos]: https://github.com/apache/arrow/blob/6d3d2fca2c9693231fa1e52c142ceef563fc23f9/format/FlightSql.proto#L71-L820 +//! [`CommandGetSqlInfo`]: crate::sql::CommandGetSqlInfo use std::{borrow::Cow, collections::BTreeMap, sync::Arc}; @@ -260,7 +253,37 @@ impl SqlInfoUnionBuilder { } } -/// A list of SQL info names and valies +/// A list of FlightSQL server capabilties. +/// +/// [`CommandGetSqlInfo`] are metadata requests used by a Flight SQL +/// server to communicate supported capabilities to Flight SQL +/// clients. +/// +/// Servers construct a [`SqlInfoList`] by adding infos via +/// [`with_sql_info`] and build the response using [`encode`]. +/// +/// The available configuration options are defined in the [Flight SQL protos][protos]. +/// +/// # Example +/// ``` +/// # use arrow_flight::sql::{SqlInfoList, SqlInfo, SqlSupportedTransaction}; +/// // Create the list of metadata describing the server +/// let info_list = SqlInfoList::new() +/// .with_sql_info(SqlInfo::FlightSqlServerName, "server name") +/// // ... add other SqlInfo here .. 
+/// .with_sql_info( +/// SqlInfo::FlightSqlServerTransaction, +/// SqlSupportedTransaction::Transaction as i32, +/// ); +/// +/// // Create the batch to send back to the client +/// let batch = info_list.encode().unwrap(); +/// ``` +/// +/// [protos]: https://github.com/apache/arrow/blob/6d3d2fca2c9693231fa1e52c142ceef563fc23f9/format/FlightSql.proto#L71-L820 +/// [`CommandGetSqlInfo`]: crate::sql::CommandGetSqlInfo +/// [`with_sql_info`]: SqlInfoList::with_sql_info +/// [`encode`]: SqlInfoList::encode #[derive(Debug, Clone, PartialEq)] pub struct SqlInfoList { /// Use BTreeMap to ensure the values are sorted by value as @@ -310,7 +333,9 @@ impl SqlInfoList { } } - /// Encode the contents of this info list according to the FlightSQL spec + /// Encode the contents of this list according to the [FlightSQL spec] + /// + /// [FlightSQL spec]: (https://github.com/apache/arrow/blob/f9324b79bf4fc1ec7e97b32e3cce16e75ef0f5e3/format/FlightSql.proto#L32-L43 pub fn encode(&self) -> Result { let mut name_builder = UInt32Builder::new(); let mut value_builder = SqlInfoUnionBuilder::new(); @@ -345,7 +370,53 @@ static SQL_INFO_SCHEMA: Lazy = Lazy::new(|| { #[cfg(test)] mod tests { use super::SqlInfoList; - use crate::sql::{SqlInfo, SqlSupportedTransaction}; + use crate::sql::{SqlInfo, SqlNullOrdering, SqlSupportedTransaction}; + use arrow_array::RecordBatch; + use arrow_cast::pretty::pretty_format_batches; + + fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) { + let formatted = pretty_format_batches(batches).unwrap().to_string(); + let actual_lines: Vec<_> = formatted.trim().lines().collect(); + assert_eq!( + &actual_lines, expected_lines, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected_lines, actual_lines + ); + } + + #[test] + fn test_sql_infos() { + let batch = SqlInfoList::new() + // str + .with_sql_info(SqlInfo::SqlIdentifierQuoteChar, r#"""#) + // bool + .with_sql_info(SqlInfo::SqlDdlCatalog, false) + // i32 + .with_sql_info( + SqlInfo::SqlNullOrdering, + SqlNullOrdering::SqlNullsSortedHigh as i32, + ) + // i64 + .with_sql_info(SqlInfo::SqlMaxBinaryLiteralLength, i32::MAX as i64) + // [str] + .with_sql_info(SqlInfo::SqlKeywords, &["SELECT", "DELETE"] as &[&str]) + .encode() + .unwrap(); + + let expected = vec![ + "+-----------+--------------------------------+", + "| info_name | value |", + "+-----------+--------------------------------+", + "| 500 | {bool_value=false} |", + "| 504 | {string_value=\"} |", + "| 507 | {int32_bitmask=0} |", + "| 508 | {string_list=[SELECT, DELETE]} |", + "| 541 | {bigint_value=2147483647} |", + "+-----------+--------------------------------+", + ]; + + assert_batches_eq(&[batch], &expected); + } #[test] fn test_filter_sql_infos() { From ea008922445d84d957cf3f89df793187c22d82d8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 29 May 2023 13:04:46 +0100 Subject: [PATCH 0944/1411] Improve `ArrowWriter` memory usage: Buffer Pages in ArrowWriter instead of RecordBatch (#3871) (#4280) * Buffer Pages in ArrowWriter instead of RecordBatch (#3871) * Review feedback * Improved memory accounting * Clippy --- parquet/src/arrow/arrow_writer/byte_array.rs | 57 +- parquet/src/arrow/arrow_writer/mod.rs | 673 +++++++++++-------- parquet/src/column/page.rs | 69 ++ parquet/src/column/writer/encoder.rs | 2 +- parquet/src/column/writer/mod.rs | 42 ++ parquet/src/file/writer.rs | 102 +-- parquet/src/util/memory.rs | 6 + 7 files changed, 535 insertions(+), 416 deletions(-) diff --git 
a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index 77f9598b23fe..6dbc83dd05c4 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -15,25 +15,21 @@ // specific language governing permissions and limitations // under the License. -use crate::arrow::arrow_writer::levels::LevelInfo; use crate::basic::Encoding; use crate::bloom_filter::Sbbf; -use crate::column::page::PageWriter; use crate::column::writer::encoder::{ ColumnValueEncoder, DataPageValues, DictionaryPage, }; -use crate::column::writer::GenericColumnWriter; use crate::data_type::{AsBytes, ByteArray, Int32Type}; use crate::encodings::encoding::{DeltaBitPackEncoder, Encoder}; use crate::encodings::rle::RleEncoder; use crate::errors::{ParquetError, Result}; -use crate::file::properties::{WriterProperties, WriterPropertiesPtr, WriterVersion}; -use crate::file::writer::OnCloseColumnChunk; +use crate::file::properties::{WriterProperties, WriterVersion}; use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::num_required_bits; use crate::util::interner::{Interner, Storage}; use arrow_array::{ - Array, ArrayAccessor, ArrayRef, BinaryArray, DictionaryArray, LargeBinaryArray, + Array, ArrayAccessor, BinaryArray, DictionaryArray, LargeBinaryArray, LargeStringArray, StringArray, }; use arrow_schema::DataType; @@ -94,49 +90,6 @@ macro_rules! downcast_op { }; } -/// A writer for byte array types -pub(super) struct ByteArrayWriter<'a> { - writer: GenericColumnWriter<'a, ByteArrayEncoder>, - on_close: Option>, -} - -impl<'a> ByteArrayWriter<'a> { - /// Returns a new [`ByteArrayWriter`] - pub fn new( - descr: ColumnDescPtr, - props: WriterPropertiesPtr, - page_writer: Box, - on_close: OnCloseColumnChunk<'a>, - ) -> Result { - Ok(Self { - writer: GenericColumnWriter::new(descr, props, page_writer), - on_close: Some(on_close), - }) - } - - pub fn write(&mut self, array: &ArrayRef, levels: LevelInfo) -> Result<()> { - self.writer.write_batch_internal( - array, - Some(levels.non_null_indices()), - levels.def_levels(), - levels.rep_levels(), - None, - None, - None, - )?; - Ok(()) - } - - pub fn close(self) -> Result<()> { - let r = self.writer.close()?; - - if let Some(on_close) = self.on_close { - on_close(r)?; - } - Ok(()) - } -} - /// A fallback encoder, i.e. non-dictionary, for [`ByteArray`] struct FallbackEncoder { encoder: FallbackEncoderImpl, @@ -427,7 +380,7 @@ impl DictEncoder { } } -struct ByteArrayEncoder { +pub struct ByteArrayEncoder { fallback: FallbackEncoder, dict_encoder: Option, min_value: Option, @@ -437,11 +390,11 @@ struct ByteArrayEncoder { impl ColumnValueEncoder for ByteArrayEncoder { type T = ByteArray; - type Values = ArrayRef; + type Values = dyn Array; fn min_max( &self, - values: &ArrayRef, + values: &dyn Array, value_indices: Option<&[usize]>, ) -> Option<(Self::T, Self::T)> { match value_indices { diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 616968bf6407..bde21ae856d0 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -17,16 +17,21 @@ //! Contains writer which writes arrow data into parquet data. 
-use std::collections::VecDeque; +use bytes::Bytes; use std::fmt::Debug; -use std::io::Write; -use std::sync::Arc; +use std::io::{Read, Write}; +use std::iter::Peekable; +use std::slice::Iter; +use std::sync::{Arc, Mutex}; +use std::vec::IntoIter; +use thrift::protocol::{TCompactOutputProtocol, TSerializable}; use arrow_array::cast::AsArray; -use arrow_array::types::{Decimal128Type, Int32Type, Int64Type, UInt32Type, UInt64Type}; -use arrow_array::{ - types, Array, ArrayRef, FixedSizeListArray, RecordBatch, RecordBatchWriter, +use arrow_array::types::{ + Decimal128Type, Float32Type, Float64Type, Int32Type, Int64Type, UInt32Type, + UInt64Type, }; +use arrow_array::{Array, FixedSizeListArray, RecordBatch, RecordBatchWriter}; use arrow_schema::{ArrowError, DataType as ArrowDataType, IntervalUnit, SchemaRef}; use super::schema::{ @@ -34,14 +39,19 @@ use super::schema::{ decimal_length_from_precision, }; -use crate::arrow::arrow_writer::byte_array::ByteArrayWriter; -use crate::column::writer::{ColumnWriter, ColumnWriterImpl}; -use crate::data_type::{ByteArray, DataType, FixedLenByteArray}; +use crate::arrow::arrow_writer::byte_array::ByteArrayEncoder; +use crate::column::page::{CompressedPage, PageWriteSpec, PageWriter}; +use crate::column::writer::encoder::ColumnValueEncoder; +use crate::column::writer::{ + get_column_writer, ColumnCloseResult, ColumnWriter, GenericColumnWriter, +}; +use crate::data_type::{ByteArray, FixedLenByteArray}; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::{KeyValue, RowGroupMetaDataPtr}; -use crate::file::properties::WriterProperties; +use crate::file::metadata::{ColumnChunkMetaData, KeyValue, RowGroupMetaDataPtr}; +use crate::file::properties::{WriterProperties, WriterPropertiesPtr}; +use crate::file::reader::{ChunkReader, Length}; use crate::file::writer::SerializedFileWriter; -use crate::file::writer::SerializedRowGroupWriter; +use crate::schema::types::{ColumnDescPtr, SchemaDescriptor}; use levels::{calculate_array_levels, LevelInfo}; mod byte_array; @@ -49,8 +59,8 @@ mod levels; /// Arrow writer /// -/// Writes Arrow `RecordBatch`es to a Parquet writer, buffering up `RecordBatch` in order -/// to produce row groups with `max_row_group_size` rows. Any remaining rows will be +/// Writes Arrow `RecordBatch`es to a Parquet writer. Multiple [`RecordBatch`] will be encoded +/// to the same row group, up to `max_row_group_size` rows. Any remaining rows will be /// flushed on close, leading the final row group in the output file to potentially /// contain fewer than `max_row_group_size` rows /// @@ -78,11 +88,8 @@ pub struct ArrowWriter { /// Underlying Parquet writer writer: SerializedFileWriter, - /// For each column, maintain an ordered queue of arrays to write - buffer: Vec>, - - /// The total number of rows currently buffered - buffered_rows: usize, + /// The in-progress row group if any + in_progress: Option, /// A copy of the Arrow schema. 
/// @@ -93,24 +100,13 @@ pub struct ArrowWriter { max_row_group_size: usize, } -impl Debug for ArrowWriter { +impl Debug for ArrowWriter { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let buffered_batches = self.buffer.len(); - let mut buffered_memory = 0; - - for batch in self.buffer.iter() { - for arr in batch.iter() { - buffered_memory += arr.get_array_memory_size() - } - } - + let buffered_memory = self.in_progress_size(); f.debug_struct("ArrowWriter") .field("writer", &self.writer) - .field( - "buffer", - &format!("{buffered_batches} , {buffered_memory} bytes"), - ) - .field("buffered_rows", &self.buffered_rows) + .field("in_progress_size", &format_args!("{buffered_memory} bytes")) + .field("in_progress_rows", &self.in_progress_rows()) .field("arrow_schema", &self.arrow_schema) .field("max_row_group_size", &self.max_row_group_size) .finish() @@ -140,8 +136,7 @@ impl ArrowWriter { Ok(Self { writer: file_writer, - buffer: vec![Default::default(); arrow_schema.fields().len()], - buffered_rows: 0, + in_progress: None, arrow_schema, max_row_group_size, }) @@ -152,43 +147,75 @@ impl ArrowWriter { self.writer.flushed_row_groups() } - /// Enqueues the provided `RecordBatch` to be written + /// Returns the estimated length in bytes of the current in progress row group + pub fn in_progress_size(&self) -> usize { + match &self.in_progress { + Some(in_progress) => in_progress + .writers + .iter() + .map(|(_, x)| x.get_estimated_total_bytes() as usize) + .sum(), + None => 0, + } + } + + /// Returns the number of rows buffered in the in progress row group + pub fn in_progress_rows(&self) -> usize { + self.in_progress + .as_ref() + .map(|x| x.buffered_rows) + .unwrap_or_default() + } + + /// Encodes the provided [`RecordBatch`] /// - /// If following this there are more than `max_row_group_size` rows buffered, - /// this will flush out one or more row groups with `max_row_group_size` rows, - /// and drop any fully written `RecordBatch` + /// If this would cause the current row group to exceed [`WriterProperties::max_row_group_size`] + /// rows, the contents of `batch` will be written to one or more row groups such that all but + /// the final row group in the file contain [`WriterProperties::max_row_group_size`] rows pub fn write(&mut self, batch: &RecordBatch) -> Result<()> { - // validate batch schema against writer's supplied schema - let batch_schema = batch.schema(); - if !(Arc::ptr_eq(&self.arrow_schema, &batch_schema) - || self.arrow_schema.contains(&batch_schema)) - { - return Err(ParquetError::ArrowError( - "Record batch schema does not match writer schema".to_string(), - )); + if batch.num_rows() == 0 { + return Ok(()); } - for (buffer, column) in self.buffer.iter_mut().zip(batch.columns()) { - buffer.push_back(column.clone()) - } + let in_progress = match &mut self.in_progress { + Some(in_progress) => in_progress, + x => x.insert(ArrowRowGroupWriter::new( + self.writer.schema_descr(), + self.writer.properties(), + &self.arrow_schema, + )?), + }; - self.buffered_rows += batch.num_rows(); - self.flush_completed()?; + // If would exceed max_row_group_size, split batch + if in_progress.buffered_rows + batch.num_rows() > self.max_row_group_size { + let to_write = self.max_row_group_size - in_progress.buffered_rows; + let a = batch.slice(0, to_write); + let b = batch.slice(to_write, batch.num_rows() - to_write); + self.write(&a)?; + return self.write(&b); + } - Ok(()) - } + in_progress.write(batch)?; - /// Flushes buffered data until there are less than 
`max_row_group_size` rows buffered - fn flush_completed(&mut self) -> Result<()> { - while self.buffered_rows >= self.max_row_group_size { - self.flush_rows(self.max_row_group_size)?; + if in_progress.buffered_rows >= self.max_row_group_size { + self.flush()? } Ok(()) } /// Flushes all buffered rows into a new row group pub fn flush(&mut self) -> Result<()> { - self.flush_rows(self.buffered_rows) + let in_progress = match self.in_progress.take() { + Some(in_progress) => in_progress, + None => return Ok(()), + }; + + let mut row_group_writer = self.writer.next_row_group()?; + for (chunk, close) in in_progress.close()? { + row_group_writer.append_column(&chunk, close)?; + } + row_group_writer.close()?; + Ok(()) } /// Additional [`KeyValue`] metadata to be written in addition to those from [`WriterProperties`] @@ -198,68 +225,6 @@ impl ArrowWriter { self.writer.append_key_value_metadata(kv_metadata) } - /// Flushes `num_rows` from the buffer into a new row group - fn flush_rows(&mut self, num_rows: usize) -> Result<()> { - if num_rows == 0 { - return Ok(()); - } - - assert!( - num_rows <= self.buffered_rows, - "cannot flush {} rows only have {}", - num_rows, - self.buffered_rows - ); - - assert!( - num_rows <= self.max_row_group_size, - "cannot flush {} rows would exceed max row group size of {}", - num_rows, - self.max_row_group_size - ); - - let mut row_group_writer = self.writer.next_row_group()?; - - for (col_buffer, field) in self.buffer.iter_mut().zip(self.arrow_schema.fields()) - { - // Collect the number of arrays to append - let mut remaining = num_rows; - let mut arrays = Vec::with_capacity(col_buffer.len()); - while remaining != 0 { - match col_buffer.pop_front() { - Some(next) if next.len() > remaining => { - col_buffer - .push_front(next.slice(remaining, next.len() - remaining)); - arrays.push(next.slice(0, remaining)); - remaining = 0; - } - Some(next) => { - remaining -= next.len(); - arrays.push(next); - } - _ => break, - } - } - - let mut levels = arrays - .iter() - .map(|array| { - let mut levels = calculate_array_levels(array, field)?; - // Reverse levels as we pop() them when writing arrays - levels.reverse(); - Ok(levels) - }) - .collect::>>()?; - - write_leaves(&mut row_group_writer, &arrays, &mut levels)?; - } - - row_group_writer.close()?; - self.buffered_rows -= num_rows; - - Ok(()) - } - /// Flushes any outstanding data and returns the underlying writer. 
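// [Editorial sketch, not part of the patch] The new in-progress accounting lets a
// caller bound writer memory instead of relying only on row counts. `writer` is
// assumed to be an ArrowWriter created elsewhere; the 64 MiB threshold is an
// arbitrary illustrative value.
//
// writer.write(&batch)?;
// if writer.in_progress_size() > 64 * 1024 * 1024 {
//     writer.flush()?;
// }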
pub fn into_inner(mut self) -> Result { self.flush()?; @@ -284,156 +249,284 @@ impl RecordBatchWriter for ArrowWriter { } } -fn write_leaves( - row_group_writer: &mut SerializedRowGroupWriter<'_, W>, - arrays: &[ArrayRef], - levels: &mut [Vec], -) -> Result<()> { - assert_eq!(arrays.len(), levels.len()); - assert!(!arrays.is_empty()); - - let data_type = arrays.first().unwrap().data_type().clone(); - assert!(arrays.iter().all(|a| a.data_type() == &data_type)); - - match &data_type { - ArrowDataType::Null - | ArrowDataType::Boolean - | ArrowDataType::Int8 - | ArrowDataType::Int16 - | ArrowDataType::Int32 - | ArrowDataType::Int64 - | ArrowDataType::UInt8 - | ArrowDataType::UInt16 - | ArrowDataType::UInt32 - | ArrowDataType::UInt64 - | ArrowDataType::Float32 - | ArrowDataType::Float64 - | ArrowDataType::Timestamp(_, _) - | ArrowDataType::Date32 - | ArrowDataType::Date64 - | ArrowDataType::Time32(_) - | ArrowDataType::Time64(_) - | ArrowDataType::Duration(_) - | ArrowDataType::Interval(_) - | ArrowDataType::Decimal128(_, _) - | ArrowDataType::Decimal256(_, _) - | ArrowDataType::FixedSizeBinary(_) => { - let mut col_writer = row_group_writer.next_column()?.unwrap(); - for (array, levels) in arrays.iter().zip(levels.iter_mut()) { - write_leaf(col_writer.untyped(), array, levels.pop().expect("Levels exhausted"))?; - } - col_writer.close() - } - ArrowDataType::LargeBinary - | ArrowDataType::Binary - | ArrowDataType::Utf8 - | ArrowDataType::LargeUtf8 => { - let mut col_writer = row_group_writer.next_column_with_factory(ByteArrayWriter::new)?.unwrap(); - for (array, levels) in arrays.iter().zip(levels.iter_mut()) { - col_writer.write(array, levels.pop().expect("Levels exhausted"))?; +/// A list of [`Bytes`] comprising a single column chunk +#[derive(Default)] +struct ArrowColumnChunk { + length: usize, + data: Vec, +} + +impl Length for ArrowColumnChunk { + fn len(&self) -> u64 { + self.length as _ + } +} + +impl ChunkReader for ArrowColumnChunk { + type T = ChainReader; + + fn get_read(&self, start: u64) -> Result { + assert_eq!(start, 0); // Assume append_column writes all data in one-shot + Ok(ChainReader(self.data.clone().into_iter().peekable())) + } + + fn get_bytes(&self, _start: u64, _length: usize) -> Result { + unimplemented!() + } +} + +/// A [`Read`] for an iterator of [`Bytes`] +struct ChainReader(Peekable>); + +impl Read for ChainReader { + fn read(&mut self, out: &mut [u8]) -> std::io::Result { + let buffer = loop { + match self.0.peek_mut() { + Some(b) if b.is_empty() => { + self.0.next(); + continue; + } + Some(b) => break b, + None => return Ok(0), } - col_writer.close() + }; + + let len = buffer.len().min(out.len()); + let b = buffer.split_to(len); + out[..len].copy_from_slice(&b); + Ok(len) + } +} + +/// A shared [`ArrowColumnChunk`] +/// +/// This allows it to be owned by [`ArrowPageWriter`] whilst allowing access via +/// [`ArrowRowGroupWriter`] on flush, without requiring self-referential borrows +type SharedColumnChunk = Arc>; + +#[derive(Default)] +struct ArrowPageWriter { + buffer: SharedColumnChunk, +} + +impl PageWriter for ArrowPageWriter { + fn write_page(&mut self, page: CompressedPage) -> Result { + let mut buf = self.buffer.try_lock().unwrap(); + let page_header = page.to_thrift_header(); + let header = { + let mut header = Vec::with_capacity(1024); + let mut protocol = TCompactOutputProtocol::new(&mut header); + page_header.write_to_out_protocol(&mut protocol)?; + Bytes::from(header) + }; + + let data = page.compressed_page().buffer().clone(); + let compressed_size = 
data.len() + header.len(); + + let mut spec = PageWriteSpec::new(); + spec.page_type = page.page_type(); + spec.num_values = page.num_values(); + spec.uncompressed_size = page.uncompressed_size() + header.len(); + spec.offset = buf.length as u64; + spec.compressed_size = compressed_size; + spec.bytes_written = compressed_size as u64; + + buf.length += compressed_size; + buf.data.push(header); + buf.data.push(data.into()); + + Ok(spec) + } + + fn write_metadata(&mut self, _metadata: &ColumnChunkMetaData) -> Result<()> { + // Skip writing metadata as won't be copied anyway + Ok(()) + } + + fn close(&mut self) -> Result<()> { + Ok(()) + } +} + +/// Encodes a leaf column to [`ArrowPageWriter`] +enum ArrowColumnWriter { + ByteArray(GenericColumnWriter<'static, ByteArrayEncoder>), + Column(ColumnWriter<'static>), +} + +impl ArrowColumnWriter { + /// Returns the estimated total bytes for this column writer + fn get_estimated_total_bytes(&self) -> u64 { + match self { + ArrowColumnWriter::ByteArray(c) => c.get_estimated_total_bytes(), + ArrowColumnWriter::Column(c) => c.get_estimated_total_bytes(), } - ArrowDataType::List(_) => { - let arrays: Vec<_> = arrays.iter().map(|array|{ - array.as_list::().values().clone() - }).collect(); + } +} + +/// Encodes [`RecordBatch`] to a parquet row group +struct ArrowRowGroupWriter { + writers: Vec<(SharedColumnChunk, ArrowColumnWriter)>, + schema: SchemaRef, + buffered_rows: usize, +} - write_leaves(row_group_writer, &arrays, levels)?; - Ok(()) +impl ArrowRowGroupWriter { + fn new( + parquet: &SchemaDescriptor, + props: &WriterPropertiesPtr, + arrow: &SchemaRef, + ) -> Result { + let mut writers = Vec::with_capacity(arrow.fields.len()); + let mut leaves = parquet.columns().iter(); + for field in &arrow.fields { + get_arrow_column_writer(field.data_type(), props, &mut leaves, &mut writers)?; } - ArrowDataType::LargeList(_) => { - let arrays: Vec<_> = arrays.iter().map(|array|{ - array.as_list::().values().clone() - }).collect(); - write_leaves(row_group_writer, &arrays, levels)?; - Ok(()) + Ok(Self { + writers, + schema: arrow.clone(), + buffered_rows: 0, + }) + } + + fn write(&mut self, batch: &RecordBatch) -> Result<()> { + self.buffered_rows += batch.num_rows(); + let mut writers = self.writers.iter_mut().map(|(_, x)| x); + for (array, field) in batch.columns().iter().zip(&self.schema.fields) { + let mut levels = calculate_array_levels(array, field)?.into_iter(); + write_leaves(&mut writers, &mut levels, array.as_ref())?; } - ArrowDataType::Struct(fields) => { - // Groups child arrays by field - let mut field_arrays = vec![Vec::with_capacity(arrays.len()); fields.len()]; + Ok(()) + } - for array in arrays { - let struct_array: &arrow_array::StructArray = array - .as_any() - .downcast_ref::() - .expect("Unable to get struct array"); + fn close(self) -> Result> { + self.writers + .into_iter() + .map(|(chunk, writer)| { + let close_result = match writer { + ArrowColumnWriter::ByteArray(c) => c.close()?, + ArrowColumnWriter::Column(c) => c.close()?, + }; + + let chunk = Arc::try_unwrap(chunk).ok().unwrap().into_inner().unwrap(); + Ok((chunk, close_result)) + }) + .collect() + } +} - assert_eq!(struct_array.columns().len(), fields.len()); +/// Get an [`ArrowColumnWriter`] along with a reference to its [`SharedColumnChunk`] +fn get_arrow_column_writer( + data_type: &ArrowDataType, + props: &WriterPropertiesPtr, + leaves: &mut Iter<'_, ColumnDescPtr>, + out: &mut Vec<(SharedColumnChunk, ArrowColumnWriter)>, +) -> Result<()> { + let col = |desc: &ColumnDescPtr| { + 
let page_writer = Box::::default(); + let chunk = page_writer.buffer.clone(); + let writer = get_column_writer(desc.clone(), props.clone(), page_writer); + (chunk, ArrowColumnWriter::Column(writer)) + }; - for (child_array, field) in field_arrays.iter_mut().zip(struct_array.columns()) { - child_array.push(field.clone()) - } - } + let bytes = |desc: &ColumnDescPtr| { + let page_writer = Box::::default(); + let chunk = page_writer.buffer.clone(); + let writer = GenericColumnWriter::new(desc.clone(), props.clone(), page_writer); + (chunk, ArrowColumnWriter::ByteArray(writer)) + }; - for field in field_arrays { - write_leaves(row_group_writer, &field, levels)?; + match data_type { + _ if data_type.is_primitive() => out.push(col(leaves.next().unwrap())), + ArrowDataType::FixedSizeBinary(_) | ArrowDataType::Boolean | ArrowDataType::Null => out.push(col(leaves.next().unwrap())), + ArrowDataType::LargeBinary + | ArrowDataType::Binary + | ArrowDataType::Utf8 + | ArrowDataType::LargeUtf8 => { + out.push(bytes(leaves.next().unwrap())) + } + ArrowDataType::List(f) + | ArrowDataType::LargeList(f) + | ArrowDataType::FixedSizeList(f, _) => { + get_arrow_column_writer(f.data_type(), props, leaves, out)? + } + ArrowDataType::Struct(fields) => { + for field in fields { + get_arrow_column_writer(field.data_type(), props, leaves, out)? } - - Ok(()) } - ArrowDataType::Map(_, _) => { - let mut keys = Vec::with_capacity(arrays.len()); - let mut values = Vec::with_capacity(arrays.len()); - for array in arrays { - let map_array: &arrow_array::MapArray = array - .as_any() - .downcast_ref::() - .expect("Unable to get map array"); - keys.push(map_array.keys().clone()); - values.push(map_array.values().clone()); + ArrowDataType::Map(f, _) => match f.data_type() { + ArrowDataType::Struct(f) => { + get_arrow_column_writer(f[0].data_type(), props, leaves, out)?; + get_arrow_column_writer(f[1].data_type(), props, leaves, out)? } - - write_leaves(row_group_writer, &keys, levels)?; - write_leaves(row_group_writer, &values, levels)?; - Ok(()) + _ => unreachable!("invalid map type"), } ArrowDataType::Dictionary(_, value_type) => match value_type.as_ref() { ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Binary | ArrowDataType::LargeBinary => { - let mut col_writer = row_group_writer.next_column_with_factory(ByteArrayWriter::new)?.unwrap(); - for (array, levels) in arrays.iter().zip(levels.iter_mut()) { - col_writer.write(array, levels.pop().expect("Levels exhausted"))?; - } - col_writer.close() + out.push(bytes(leaves.next().unwrap())) } _ => { - let mut col_writer = row_group_writer.next_column()?.unwrap(); - for (array, levels) in arrays.iter().zip(levels.iter_mut()) { - write_leaf(col_writer.untyped(), array, levels.pop().expect("Levels exhausted"))?; - } - col_writer.close() + out.push(col(leaves.next().unwrap())) } } - ArrowDataType::Float16 => Err(ParquetError::ArrowError( - "Float16 arrays not supported".to_string(), - )), + _ => return Err(ParquetError::NYI( + format!( + "Attempting to write an Arrow type {data_type:?} to parquet that is not yet implemented" + ) + )) + } + Ok(()) +} + +/// Write the leaves of `array` in depth-first order to `writers` with `levels` +fn write_leaves<'a, W>( + writers: &mut W, + levels: &mut IntoIter, + array: &(dyn Array + 'static), +) -> Result<()> +where + W: Iterator, +{ + match array.data_type() { + ArrowDataType::List(_) => { + write_leaves(writers, levels, array.as_list::().values().as_ref())? 
+ } + ArrowDataType::LargeList(_) => { + write_leaves(writers, levels, array.as_list::().values().as_ref())? + } ArrowDataType::FixedSizeList(_, _) => { - let arrays: Vec<_> = arrays.iter().map(|array|{ - array.as_any().downcast_ref::() - .expect("unable to get fixed-size list array") - .values() - .clone() - }).collect(); - write_leaves(row_group_writer, &arrays, levels)?; - Ok(()) - }, - ArrowDataType::Union(_, _) | ArrowDataType::RunEndEncoded(_, _) => { - Err(ParquetError::NYI( - format!( - "Attempting to write an Arrow type {data_type:?} to parquet that is not yet implemented" - ) - )) + let array = array.as_any().downcast_ref::().unwrap(); + write_leaves(writers, levels, array.values().as_ref())? + } + ArrowDataType::Struct(_) => { + for column in array.as_struct().columns() { + write_leaves(writers, levels, column.as_ref())? + } + } + ArrowDataType::Map(_, _) => { + let map = array.as_map(); + write_leaves(writers, levels, map.keys().as_ref())?; + write_leaves(writers, levels, map.values().as_ref())? + } + _ => { + let levels = levels.next().unwrap(); + match writers.next().unwrap() { + ArrowColumnWriter::Column(c) => write_leaf(c, array, levels)?, + ArrowColumnWriter::ByteArray(c) => write_primitive(c, array, levels)?, + }; } } + Ok(()) } fn write_leaf( writer: &mut ColumnWriter<'_>, - column: &ArrayRef, + column: &dyn Array, levels: LevelInfo, -) -> Result { +) -> Result { let indices = levels.non_null_indices(); - let written = match writer { + match writer { ColumnWriter::Int32ColumnWriter(ref mut typed) => { match column.data_type() { ArrowDataType::Date64 => { @@ -442,26 +535,26 @@ fn write_leaf( let array = arrow_cast::cast(&array, &ArrowDataType::Int32)?; let array = array.as_primitive::(); - write_primitive(typed, array.values(), levels)? + write_primitive(typed, array.values(), levels) } ArrowDataType::UInt32 => { let values = column.as_primitive::().values(); // follow C++ implementation and use overflow/reinterpret cast from u32 to i32 which will map // `(i32::MAX as u32)..u32::MAX` to `i32::MIN..0` let array = values.inner().typed_data::(); - write_primitive(typed, array, levels)? + write_primitive(typed, array, levels) } ArrowDataType::Decimal128(_, _) => { // use the int32 to represent the decimal with low precision let array = column .as_primitive::() - .unary::<_, types::Int32Type>(|v| v as i32); - write_primitive(typed, array.values(), levels)? + .unary::<_, Int32Type>(|v| v as i32); + write_primitive(typed, array.values(), levels) } _ => { let array = arrow_cast::cast(column, &ArrowDataType::Int32)?; let array = array.as_primitive::(); - write_primitive(typed, array.values(), levels)? + write_primitive(typed, array.values(), levels) } } } @@ -471,32 +564,32 @@ fn write_leaf( get_bool_array_slice(array, indices).as_slice(), levels.def_levels(), levels.rep_levels(), - )? + ) } ColumnWriter::Int64ColumnWriter(ref mut typed) => { match column.data_type() { ArrowDataType::Int64 => { let array = column.as_primitive::(); - write_primitive(typed, array.values(), levels)? + write_primitive(typed, array.values(), levels) } ArrowDataType::UInt64 => { let values = column.as_primitive::().values(); // follow C++ implementation and use overflow/reinterpret cast from u64 to i64 which will map // `(i64::MAX as u64)..u64::MAX` to `i64::MIN..0` let array = values.inner().typed_data::(); - write_primitive(typed, array, levels)? 
+ write_primitive(typed, array, levels) } ArrowDataType::Decimal128(_, _) => { // use the int64 to represent the decimal with low precision let array = column .as_primitive::() - .unary::<_, types::Int64Type>(|v| v as i64); - write_primitive(typed, array.values(), levels)? + .unary::<_, Int64Type>(|v| v as i64); + write_primitive(typed, array.values(), levels) } _ => { let array = arrow_cast::cast(column, &ArrowDataType::Int64)?; let array = array.as_primitive::(); - write_primitive(typed, array.values(), levels)? + write_primitive(typed, array.values(), levels) } } } @@ -504,18 +597,12 @@ fn write_leaf( unreachable!("Currently unreachable because data type not supported") } ColumnWriter::FloatColumnWriter(ref mut typed) => { - let array = column - .as_any() - .downcast_ref::() - .expect("Unable to get Float32 array"); - write_primitive(typed, array.values(), levels)? + let array = column.as_primitive::(); + write_primitive(typed, array.values(), levels) } ColumnWriter::DoubleColumnWriter(ref mut typed) => { - let array = column - .as_any() - .downcast_ref::() - .expect("Unable to get Float64 array"); - write_primitive(typed, array.values(), levels)? + let array = column.as_primitive::(); + write_primitive(typed, array.values(), levels) } ColumnWriter::ByteArrayColumnWriter(_) => { unreachable!("should use ByteArrayWriter") @@ -553,10 +640,7 @@ fn write_leaf( get_fsb_array_slice(array, indices) } ArrowDataType::Decimal128(_, _) => { - let array = column - .as_any() - .downcast_ref::() - .unwrap(); + let array = column.as_primitive::(); get_decimal_array_slice(array, indices) } _ => { @@ -566,19 +650,14 @@ fn write_leaf( )); } }; - typed.write_batch( - bytes.as_slice(), - levels.def_levels(), - levels.rep_levels(), - )? + typed.write_batch(bytes.as_slice(), levels.def_levels(), levels.rep_levels()) } - }; - Ok(written as i64) + } } -fn write_primitive( - writer: &mut ColumnWriterImpl<'_, T>, - values: &[T::T], +fn write_primitive( + writer: &mut GenericColumnWriter, + values: &E::Values, levels: LevelInfo, ) -> Result { writer.write_batch_internal( @@ -2462,4 +2541,40 @@ mod tests { assert_ne!(back.schema(), batch.schema()); assert_eq!(back.column(0).as_ref(), batch.column(0).as_ref()); } + + #[test] + fn in_progress_accounting() { + // define schema + let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + + // create some data + let a = Int32Array::from(vec![1, 2, 3, 4, 5]); + + // build a record batch + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap(); + + let mut writer = ArrowWriter::try_new(vec![], batch.schema(), None).unwrap(); + + // starts empty + assert_eq!(writer.in_progress_size(), 0); + assert_eq!(writer.in_progress_rows(), 0); + writer.write(&batch).unwrap(); + + // updated on write + let initial_size = writer.in_progress_size(); + assert!(initial_size > 0); + assert_eq!(writer.in_progress_rows(), 5); + + // updated on second write + writer.write(&batch).unwrap(); + assert!(writer.in_progress_size() > initial_size); + assert_eq!(writer.in_progress_rows(), 10); + + // cleared on flush + writer.flush().unwrap(); + assert_eq!(writer.in_progress_size(), 0); + assert_eq!(writer.in_progress_rows(), 0); + + writer.close().unwrap(); + } } diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index f854e5caca80..57a0278e23c4 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -162,6 +162,75 @@ impl CompressedPage { pub fn data(&self) -> &[u8] { self.compressed_page.buffer().data() } + + /// Returns 
the thrift page header + pub(crate) fn to_thrift_header(&self) -> PageHeader { + let uncompressed_size = self.uncompressed_size(); + let compressed_size = self.compressed_size(); + let num_values = self.num_values(); + let encoding = self.encoding(); + let page_type = self.page_type(); + + let mut page_header = PageHeader { + type_: page_type.into(), + uncompressed_page_size: uncompressed_size as i32, + compressed_page_size: compressed_size as i32, + // TODO: Add support for crc checksum + crc: None, + data_page_header: None, + index_page_header: None, + dictionary_page_header: None, + data_page_header_v2: None, + }; + + match self.compressed_page { + Page::DataPage { + def_level_encoding, + rep_level_encoding, + ref statistics, + .. + } => { + let data_page_header = crate::format::DataPageHeader { + num_values: num_values as i32, + encoding: encoding.into(), + definition_level_encoding: def_level_encoding.into(), + repetition_level_encoding: rep_level_encoding.into(), + statistics: crate::file::statistics::to_thrift(statistics.as_ref()), + }; + page_header.data_page_header = Some(data_page_header); + } + Page::DataPageV2 { + num_nulls, + num_rows, + def_levels_byte_len, + rep_levels_byte_len, + is_compressed, + ref statistics, + .. + } => { + let data_page_header_v2 = crate::format::DataPageHeaderV2 { + num_values: num_values as i32, + num_nulls: num_nulls as i32, + num_rows: num_rows as i32, + encoding: encoding.into(), + definition_levels_byte_length: def_levels_byte_len as i32, + repetition_levels_byte_length: rep_levels_byte_len as i32, + is_compressed: Some(is_compressed), + statistics: crate::file::statistics::to_thrift(statistics.as_ref()), + }; + page_header.data_page_header_v2 = Some(data_page_header_v2); + } + Page::DictionaryPage { is_sorted, .. } => { + let dictionary_page_header = crate::format::DictionaryPageHeader { + num_values: num_values as i32, + encoding: encoding.into(), + is_sorted: Some(is_sorted), + }; + page_header.dictionary_page_header = Some(dictionary_page_header); + } + } + page_header + } } /// Contains page write metrics. diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index c343f1d6c824..fb5889b785a8 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -36,7 +36,7 @@ pub trait ColumnValues { } #[cfg(feature = "arrow")] -impl ColumnValues for T { +impl ColumnValues for dyn arrow_array::Array { fn len(&self) -> usize { arrow_array::Array::len(self) } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index bf77b2b325c1..5e623d281157 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -43,6 +43,21 @@ use crate::util::memory::ByteBufferPtr; pub(crate) mod encoder; +macro_rules! downcast_writer { + ($e:expr, $i:ident, $b:expr) => { + match $e { + Self::BoolColumnWriter($i) => $b, + Self::Int32ColumnWriter($i) => $b, + Self::Int64ColumnWriter($i) => $b, + Self::Int96ColumnWriter($i) => $b, + Self::FloatColumnWriter($i) => $b, + Self::DoubleColumnWriter($i) => $b, + Self::ByteArrayColumnWriter($i) => $b, + Self::FixedLenByteArrayColumnWriter($i) => $b, + } + }; +} + /// Column writer for a Parquet type. 
pub enum ColumnWriter<'a> { BoolColumnWriter(ColumnWriterImpl<'a, BoolType>), @@ -55,6 +70,19 @@ pub enum ColumnWriter<'a> { FixedLenByteArrayColumnWriter(ColumnWriterImpl<'a, FixedLenByteArrayType>), } +impl<'a> ColumnWriter<'a> { + /// Returns the estimated total bytes for this column writer + #[cfg(feature = "arrow")] + pub(crate) fn get_estimated_total_bytes(&self) -> u64 { + downcast_writer!(self, typed, typed.get_estimated_total_bytes()) + } + + /// Close this [`ColumnWriter`] + pub fn close(self) -> Result { + downcast_writer!(self, typed, typed.close()) + } +} + pub enum Level { Page, Column, @@ -421,10 +449,24 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// Returns total number of bytes written by this column writer so far. /// This value is also returned when column writer is closed. + /// + /// Note: this value does not include any buffered data that has not + /// yet been flushed to a page. pub fn get_total_bytes_written(&self) -> u64 { self.column_metrics.total_bytes_written } + /// Returns the estimated total bytes for this column writer + /// + /// Unlike [`Self::get_total_bytes_written`] this includes an estimate + /// of any data that has not yet been flushed to a page + #[cfg(feature = "arrow")] + pub(crate) fn get_estimated_total_bytes(&self) -> u64 { + self.column_metrics.total_bytes_written + + self.encoder.estimated_data_page_size() as u64 + + self.encoder.estimated_dict_page_size().unwrap_or_default() as u64 + } + /// Returns total number of rows written by this column writer so far. /// This value is also returned when column writer is closed. pub fn get_total_rows_written(&self) -> u64 { diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 4f15c9f4ba02..defdaad321d8 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -30,16 +30,13 @@ use crate::column::writer::{ get_typed_column_writer_mut, ColumnCloseResult, ColumnWriterImpl, }; use crate::column::{ - page::{CompressedPage, Page, PageWriteSpec, PageWriter}, + page::{CompressedPage, PageWriteSpec, PageWriter}, writer::{get_column_writer, ColumnWriter}, }; use crate::data_type::DataType; use crate::errors::{ParquetError, Result}; use crate::file::reader::ChunkReader; -use crate::file::{ - metadata::*, properties::WriterPropertiesPtr, - statistics::to_thrift as statistics_to_thrift, PARQUET_MAGIC, -}; +use crate::file::{metadata::*, properties::WriterPropertiesPtr, PARQUET_MAGIC}; use crate::schema::types::{ self, ColumnDescPtr, SchemaDescPtr, SchemaDescriptor, TypePtr, }; @@ -370,6 +367,16 @@ impl SerializedFileWriter { self.kv_metadatas.push(kv_metadata); } + /// Returns a reference to schema descriptor. + pub fn schema_descr(&self) -> &SchemaDescriptor { + &self.descr + } + + /// Returns a reference to the writer properties + pub fn properties(&self) -> &WriterPropertiesPtr { + &self.props + } + /// Writes the file footer and returns the underlying writer. 
pub fn into_inner(mut self) -> Result { self.assert_previous_writer_closed()?; @@ -653,17 +660,7 @@ impl<'a> SerializedColumnWriter<'a> { /// Close this [`SerializedColumnWriter`] pub fn close(mut self) -> Result<()> { - let r = match self.inner { - ColumnWriter::BoolColumnWriter(typed) => typed.close()?, - ColumnWriter::Int32ColumnWriter(typed) => typed.close()?, - ColumnWriter::Int64ColumnWriter(typed) => typed.close()?, - ColumnWriter::Int96ColumnWriter(typed) => typed.close()?, - ColumnWriter::FloatColumnWriter(typed) => typed.close()?, - ColumnWriter::DoubleColumnWriter(typed) => typed.close()?, - ColumnWriter::ByteArrayColumnWriter(typed) => typed.close()?, - ColumnWriter::FixedLenByteArrayColumnWriter(typed) => typed.close()?, - }; - + let r = self.inner.close()?; if let Some(on_close) = self.on_close.take() { on_close(r)? } @@ -701,83 +698,20 @@ impl<'a, W: Write> SerializedPageWriter<'a, W> { impl<'a, W: Write + Send> PageWriter for SerializedPageWriter<'a, W> { fn write_page(&mut self, page: CompressedPage) -> Result { - let uncompressed_size = page.uncompressed_size(); - let compressed_size = page.compressed_size(); - let num_values = page.num_values(); - let encoding = page.encoding(); let page_type = page.page_type(); - - let mut page_header = parquet::PageHeader { - type_: page_type.into(), - uncompressed_page_size: uncompressed_size as i32, - compressed_page_size: compressed_size as i32, - // TODO: Add support for crc checksum - crc: None, - data_page_header: None, - index_page_header: None, - dictionary_page_header: None, - data_page_header_v2: None, - }; - - match *page.compressed_page() { - Page::DataPage { - def_level_encoding, - rep_level_encoding, - ref statistics, - .. - } => { - let data_page_header = parquet::DataPageHeader { - num_values: num_values as i32, - encoding: encoding.into(), - definition_level_encoding: def_level_encoding.into(), - repetition_level_encoding: rep_level_encoding.into(), - statistics: statistics_to_thrift(statistics.as_ref()), - }; - page_header.data_page_header = Some(data_page_header); - } - Page::DataPageV2 { - num_nulls, - num_rows, - def_levels_byte_len, - rep_levels_byte_len, - is_compressed, - ref statistics, - .. - } => { - let data_page_header_v2 = parquet::DataPageHeaderV2 { - num_values: num_values as i32, - num_nulls: num_nulls as i32, - num_rows: num_rows as i32, - encoding: encoding.into(), - definition_levels_byte_length: def_levels_byte_len as i32, - repetition_levels_byte_length: rep_levels_byte_len as i32, - is_compressed: Some(is_compressed), - statistics: statistics_to_thrift(statistics.as_ref()), - }; - page_header.data_page_header_v2 = Some(data_page_header_v2); - } - Page::DictionaryPage { is_sorted, .. 
} => { - let dictionary_page_header = parquet::DictionaryPageHeader { - num_values: num_values as i32, - encoding: encoding.into(), - is_sorted: Some(is_sorted), - }; - page_header.dictionary_page_header = Some(dictionary_page_header); - } - } - let start_pos = self.sink.bytes_written() as u64; + let page_header = page.to_thrift_header(); let header_size = self.serialize_page_header(page_header)?; self.sink.write_all(page.data())?; let mut spec = PageWriteSpec::new(); spec.page_type = page_type; - spec.uncompressed_size = uncompressed_size + header_size; - spec.compressed_size = compressed_size + header_size; + spec.uncompressed_size = page.uncompressed_size() + header_size; + spec.compressed_size = page.compressed_size() + header_size; spec.offset = start_pos; spec.bytes_written = self.sink.bytes_written() as u64 - start_pos; - spec.num_values = num_values; + spec.num_values = page.num_values(); Ok(spec) } @@ -804,7 +738,7 @@ mod tests { use std::fs::File; use crate::basic::{Compression, Encoding, LogicalType, Repetition, Type}; - use crate::column::page::PageReader; + use crate::column::page::{Page, PageReader}; use crate::column::reader::get_typed_column_reader; use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; use crate::data_type::{BoolType, Int32Type}; diff --git a/parquet/src/util/memory.rs b/parquet/src/util/memory.rs index 909878a6d538..25d15dd4ff73 100644 --- a/parquet/src/util/memory.rs +++ b/parquet/src/util/memory.rs @@ -114,6 +114,12 @@ impl From for ByteBufferPtr { } } +impl From for Bytes { + fn from(value: ByteBufferPtr) -> Self { + value.data + } +} + #[cfg(test)] mod tests { use super::*; From 04f67908fb1f85eb2dbcd129b2d435d189cd658d Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Tue, 30 May 2023 13:15:02 +0200 Subject: [PATCH 0945/1411] feat: support 'Decimal256' for parquet (#4272) Co-authored-by: Raphael Taylor-Davies --- parquet/src/arrow/array_reader/byte_array.rs | 24 ++++++++-- .../array_reader/fixed_len_byte_array.rs | 32 ++++++++++--- .../src/arrow/array_reader/primitive_array.rs | 31 ++++++++++++- parquet/src/arrow/arrow_reader/mod.rs | 6 ++- parquet/src/arrow/arrow_writer/levels.rs | 1 + parquet/src/arrow/arrow_writer/mod.rs | 44 +++++++++++++++--- parquet/src/arrow/schema/mod.rs | 6 +-- parquet/src/arrow/schema/primitive.rs | 46 +++++++++++++++---- 8 files changed, 158 insertions(+), 32 deletions(-) diff --git a/parquet/src/arrow/array_reader/byte_array.rs b/parquet/src/arrow/array_reader/byte_array.rs index 22fa0ab45a20..43db658d9324 100644 --- a/parquet/src/arrow/array_reader/byte_array.rs +++ b/parquet/src/arrow/array_reader/byte_array.rs @@ -30,8 +30,10 @@ use crate::encodings::decoding::{Decoder, DeltaBitPackDecoder}; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use crate::util::memory::ByteBufferPtr; -use arrow_array::{Array, ArrayRef, BinaryArray, Decimal128Array, OffsetSizeTrait}; -use arrow_buffer::Buffer; +use arrow_array::{ + Array, ArrayRef, BinaryArray, Decimal128Array, Decimal256Array, OffsetSizeTrait, +}; +use arrow_buffer::{i256, Buffer}; use arrow_schema::DataType as ArrowType; use std::any::Any; use std::ops::Range; @@ -52,7 +54,10 @@ pub fn make_byte_array_reader( }; match data_type { - ArrowType::Binary | ArrowType::Utf8 | ArrowType::Decimal128(_, _) => { + ArrowType::Binary + | ArrowType::Utf8 + | ArrowType::Decimal128(_, _) + | ArrowType::Decimal256(_, _) => { let reader = GenericRecordReader::new(column_desc); Ok(Box::new(ByteArrayReader::::new( pages, data_type, reader, @@ 
-119,7 +124,7 @@ impl ArrayReader for ByteArrayReader { self.rep_levels_buffer = self.record_reader.consume_rep_levels(); self.record_reader.reset(); - let array = match self.data_type { + let array: ArrayRef = match self.data_type { ArrowType::Decimal128(p, s) => { let array = buffer.into_array(null_buffer, ArrowType::Binary); let binary = array.as_any().downcast_ref::().unwrap(); @@ -131,6 +136,17 @@ impl ArrayReader for ByteArrayReader { Arc::new(decimal) } + ArrowType::Decimal256(p, s) => { + let array = buffer.into_array(null_buffer, ArrowType::Binary); + let binary = array.as_any().downcast_ref::().unwrap(); + let decimal = binary + .iter() + .map(|opt| Some(i256::from_be_bytes(sign_extend_be(opt?)))) + .collect::() + .with_precision_and_scale(p, s)?; + + Arc::new(decimal) + } _ => buffer.into_array(null_buffer, self.data_type.clone()), }; diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index fee032a4d763..47bd03a735e1 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -28,10 +28,10 @@ use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use crate::util::memory::ByteBufferPtr; use arrow_array::{ - ArrayRef, Decimal128Array, FixedSizeBinaryArray, IntervalDayTimeArray, - IntervalYearMonthArray, + ArrayRef, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, + IntervalDayTimeArray, IntervalYearMonthArray, }; -use arrow_buffer::Buffer; +use arrow_buffer::{i256, Buffer}; use arrow_data::ArrayDataBuilder; use arrow_schema::{DataType as ArrowType, IntervalUnit}; use std::any::Any; @@ -61,7 +61,6 @@ pub fn make_fixed_len_byte_array_reader( )) } }; - match &data_type { ArrowType::FixedSizeBinary(_) => {} ArrowType::Decimal128(_, _) => { @@ -72,6 +71,14 @@ pub fn make_fixed_len_byte_array_reader( )); } } + ArrowType::Decimal256(_, _) => { + if byte_length > 32 { + return Err(general_err!( + "decimal 256 type too large, must be less than 32 bytes, got {}", + byte_length + )); + } + } ArrowType::Interval(_) => { if byte_length != 12 { // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#interval @@ -155,7 +162,7 @@ impl ArrayReader for FixedLenByteArrayReader { let binary = FixedSizeBinaryArray::from(unsafe { array_data.build_unchecked() }); // TODO: An improvement might be to do this conversion on read - let array = match &self.data_type { + let array: ArrayRef = match &self.data_type { ArrowType::Decimal128(p, s) => { let decimal = binary .iter() @@ -165,6 +172,15 @@ impl ArrayReader for FixedLenByteArrayReader { Arc::new(decimal) } + ArrowType::Decimal256(p, s) => { + let decimal = binary + .iter() + .map(|opt| Some(i256::from_be_bytes(sign_extend_be(opt?)))) + .collect::() + .with_precision_and_scale(*p, *s)?; + + Arc::new(decimal) + } ArrowType::Interval(unit) => { // An interval is stored as 3x 32-bit unsigned integers storing months, days, // and milliseconds @@ -428,16 +444,18 @@ mod tests { use super::*; use crate::arrow::arrow_reader::ParquetRecordBatchReader; use crate::arrow::ArrowWriter; - use arrow_array::{Array, Decimal128Array, ListArray}; use arrow::datatypes::Field; use arrow::error::Result as ArrowResult; use arrow_array::RecordBatch; + use arrow_array::{Array, ListArray}; use bytes::Bytes; use std::sync::Arc; #[test] fn test_decimal_list() { - let decimals = Decimal128Array::from_iter_values([1, 2, 3, 4, 5, 6, 7, 8]); + let decimals = 
Decimal256Array::from_iter_values( + [1, 2, 3, 4, 5, 6, 7, 8].into_iter().map(i256::from_i128), + ); // [[], [1], [2, 3], null, [4], null, [6, 7, 8]] let data = ArrayDataBuilder::new(ArrowType::List(Arc::new(Field::new( diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index 772026960a3f..1e2720a4a4df 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -24,12 +24,13 @@ use crate::column::page::PageIterator; use crate::data_type::{DataType, Int96}; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; +use arrow_array::Decimal256Array; use arrow_array::{ builder::{BooleanBufferBuilder, TimestampNanosecondBufferBuilder}, ArrayRef, BooleanArray, Decimal128Array, Float32Array, Float64Array, Int32Array, Int64Array, TimestampNanosecondArray, UInt32Array, UInt64Array, }; -use arrow_buffer::Buffer; +use arrow_buffer::{i256, Buffer}; use arrow_data::ArrayDataBuilder; use arrow_schema::{DataType as ArrowType, TimeUnit}; use std::any::Any; @@ -237,6 +238,34 @@ where Arc::new(array) as ArrayRef } + ArrowType::Decimal256(p, s) => { + let array = match array.data_type() { + ArrowType::Int32 => array + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .map(|v| v.map(|v| i256::from_i128(v as i128))) + .collect::(), + + ArrowType::Int64 => array + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .map(|v| v.map(|v| i256::from_i128(v as i128))) + .collect::(), + _ => { + return Err(arrow_err!( + "Cannot convert {:?} to decimal", + array.data_type() + )); + } + } + .with_precision_and_scale(*p, *s)?; + + Arc::new(array) as ArrayRef + } _ => arrow_cast::cast(&array, target_type)?, }; diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 4b14a54c531b..819e96c0a3d1 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -528,7 +528,7 @@ mod tests { use arrow_array::builder::*; use arrow_array::*; use arrow_array::{RecordBatch, RecordBatchReader}; - use arrow_buffer::Buffer; + use arrow_buffer::{i256, Buffer}; use arrow_data::ArrayDataBuilder; use arrow_schema::{DataType as ArrowDataType, Field, Fields, Schema}; @@ -928,7 +928,9 @@ mod tests { #[test] fn test_decimal_nullable_struct() { - let decimals = Decimal128Array::from_iter_values([1, 2, 3, 4, 5, 6, 7, 8]); + let decimals = Decimal256Array::from_iter_values( + [1, 2, 3, 4, 5, 6, 7, 8].into_iter().map(i256::from_i128), + ); let data = ArrayDataBuilder::new(ArrowDataType::Struct(Fields::from(vec![Field::new( diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 21b3e7dff88d..47b01890301e 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -88,6 +88,7 @@ fn is_leaf(data_type: &DataType) -> bool { | DataType::Binary | DataType::LargeBinary | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) | DataType::FixedSizeBinary(_) ) } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index bde21ae856d0..5f2750a55009 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -27,10 +27,7 @@ use std::vec::IntoIter; use thrift::protocol::{TCompactOutputProtocol, TSerializable}; use arrow_array::cast::AsArray; -use arrow_array::types::{ - Decimal128Type, Float32Type, Float64Type, Int32Type, Int64Type, UInt32Type, - UInt64Type, -}; +use 
arrow_array::types::*; use arrow_array::{Array, FixedSizeListArray, RecordBatch, RecordBatchWriter}; use arrow_schema::{ArrowError, DataType as ArrowDataType, IntervalUnit, SchemaRef}; @@ -551,6 +548,13 @@ fn write_leaf( .unary::<_, Int32Type>(|v| v as i32); write_primitive(typed, array.values(), levels) } + ArrowDataType::Decimal256(_, _) => { + // use the int32 to represent the decimal with low precision + let array = column + .as_primitive::() + .unary::<_, Int32Type>(|v| v.as_i128() as i32); + write_primitive(typed, array.values(), levels) + } _ => { let array = arrow_cast::cast(column, &ArrowDataType::Int32)?; let array = array.as_primitive::(); @@ -586,6 +590,13 @@ fn write_leaf( .unary::<_, Int64Type>(|v| v as i64); write_primitive(typed, array.values(), levels) } + ArrowDataType::Decimal256(_, _) => { + // use the int64 to represent the decimal with low precision + let array = column + .as_primitive::() + .unary::<_, Int64Type>(|v| v.as_i128() as i64); + write_primitive(typed, array.values(), levels) + } _ => { let array = arrow_cast::cast(column, &ArrowDataType::Int64)?; let array = array.as_primitive::(); @@ -641,7 +652,14 @@ fn write_leaf( } ArrowDataType::Decimal128(_, _) => { let array = column.as_primitive::(); - get_decimal_array_slice(array, indices) + get_decimal_128_array_slice(array, indices) + } + ArrowDataType::Decimal256(_, _) => { + let array = column + .as_any() + .downcast_ref::() + .unwrap(); + get_decimal_256_array_slice(array, indices) } _ => { return Err(ParquetError::NYI( @@ -715,7 +733,7 @@ fn get_interval_dt_array_slice( values } -fn get_decimal_array_slice( +fn get_decimal_128_array_slice( array: &arrow_array::Decimal128Array, indices: &[usize], ) -> Vec { @@ -729,6 +747,20 @@ fn get_decimal_array_slice( values } +fn get_decimal_256_array_slice( + array: &arrow_array::Decimal256Array, + indices: &[usize], +) -> Vec { + let mut values = Vec::with_capacity(indices.len()); + let size = decimal_length_from_precision(array.precision()); + for i in indices { + let as_be_bytes = array.value(*i).to_be_bytes(); + let resized_value = as_be_bytes[(32 - size)..].to_vec(); + values.push(FixedLenByteArray::from(ByteArray::from(resized_value))); + } + values +} + fn get_fsb_array_slice( array: &arrow_array::FixedSizeBinaryArray, indices: &[usize], diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 399dcba9e981..ffae1eae54aa 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -589,7 +589,7 @@ mod tests { let arrow_fields = Fields::from(vec![ Field::new("decimal1", DataType::Decimal128(4, 2), false), Field::new("decimal2", DataType::Decimal128(12, 2), false), - Field::new("decimal3", DataType::Decimal128(30, 2), false), + Field::new("decimal3", DataType::Decimal256(30, 2), false), Field::new("decimal4", DataType::Decimal128(33, 2), false), ]); assert_eq!(&arrow_fields, converted_arrow_schema.fields()); @@ -1443,7 +1443,7 @@ mod tests { ), Field::new("decimal_int32", DataType::Decimal128(8, 2), false), Field::new("decimal_int64", DataType::Decimal128(16, 2), false), - Field::new("decimal_fix_length", DataType::Decimal128(30, 2), false), + Field::new("decimal_fix_length", DataType::Decimal256(30, 2), false), ]; let arrow_schema = Schema::new(arrow_fields); let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema).unwrap(); @@ -1614,7 +1614,7 @@ mod tests { // ), Field::new("c35", DataType::Null, true), Field::new("c36", DataType::Decimal128(2, 1), false), - Field::new("c37", DataType::Decimal128(50, 
20), false), + Field::new("c37", DataType::Decimal256(50, 20), false), Field::new("c38", DataType::Decimal128(18, 12), true), Field::new_map( "c39", diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index 6565f7eaeefb..d4db28915f2f 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -103,7 +103,7 @@ fn from_parquet(parquet_type: &Type) -> Result { } } -fn decimal_type(scale: i32, precision: i32) -> Result { +fn decimal_128_type(scale: i32, precision: i32) -> Result { let scale = scale .try_into() .map_err(|_| arrow_err!("scale cannot be negative: {}", scale))?; @@ -115,6 +115,18 @@ fn decimal_type(scale: i32, precision: i32) -> Result { Ok(DataType::Decimal128(precision, scale)) } +fn decimal_256_type(scale: i32, precision: i32) -> Result { + let scale = scale + .try_into() + .map_err(|_| arrow_err!("scale cannot be negative: {}", scale))?; + + let precision = precision + .try_into() + .map_err(|_| arrow_err!("precision cannot be negative: {}", precision))?; + + Ok(DataType::Decimal256(precision, scale)) +} + fn from_int32(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result { match (info.logical_type(), info.converted_type()) { (None, ConvertedType::NONE) => Ok(DataType::Int32), @@ -136,7 +148,7 @@ fn from_int32(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result Err(arrow_err!("Cannot create INT32 physical type from {:?}", t)), }, (Some(LogicalType::Decimal { scale, precision }), _) => { - decimal_type(scale, precision) + decimal_128_type(scale, precision) } (Some(LogicalType::Date), _) => Ok(DataType::Date32), (Some(LogicalType::Time { unit, .. }), _) => match unit { @@ -156,7 +168,7 @@ fn from_int32(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result Ok(DataType::Int32), (None, ConvertedType::DATE) => Ok(DataType::Date32), (None, ConvertedType::TIME_MILLIS) => Ok(DataType::Time32(TimeUnit::Millisecond)), - (None, ConvertedType::DECIMAL) => decimal_type(scale, precision), + (None, ConvertedType::DECIMAL) => decimal_128_type(scale, precision), (logical, converted) => Err(arrow_err!( "Unable to convert parquet INT32 logical type {:?} or converted type {}", logical, @@ -213,9 +225,9 @@ fn from_int64(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result { - decimal_type(scale, precision) + decimal_128_type(scale, precision) } - (None, ConvertedType::DECIMAL) => decimal_type(scale, precision), + (None, ConvertedType::DECIMAL) => decimal_128_type(scale, precision), (logical, converted) => Err(arrow_err!( "Unable to convert parquet INT64 logical type {:?} or converted type {}", logical, @@ -235,8 +247,14 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32, scale: i32) -> Result Ok(DataType::Binary), (None, ConvertedType::ENUM) => Ok(DataType::Binary), (None, ConvertedType::UTF8) => Ok(DataType::Utf8), - (Some(LogicalType::Decimal { scale: s, precision: p }), _) => decimal_type(s, p), - (None, ConvertedType::DECIMAL) => decimal_type(scale, precision), + ( + Some(LogicalType::Decimal { + scale: s, + precision: p, + }), + _, + ) => decimal_128_type(s, p), + (None, ConvertedType::DECIMAL) => decimal_128_type(scale, precision), (logical, converted) => Err(arrow_err!( "Unable to convert parquet BYTE_ARRAY logical type {:?} or converted type {}", logical, @@ -254,9 +272,19 @@ fn from_fixed_len_byte_array( // TODO: This should check the type length for the decimal and interval types match (info.logical_type(), info.converted_type()) { (Some(LogicalType::Decimal { scale, precision 
}), _) => { - decimal_type(scale, precision) + if type_length < 16 { + decimal_128_type(scale, precision) + } else { + decimal_256_type(scale, precision) + } + } + (None, ConvertedType::DECIMAL) => { + if type_length < 16 { + decimal_128_type(scale, precision) + } else { + decimal_256_type(scale, precision) + } } - (None, ConvertedType::DECIMAL) => decimal_type(scale, precision), (None, ConvertedType::INTERVAL) => { // There is currently no reliable way of determining which IntervalUnit // to return. Thus without the original Arrow schema, the results From 1b409a184f114c0b4a21f6f50081310d30ed0eaf Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 30 May 2023 08:53:20 -0400 Subject: [PATCH 0946/1411] Add Builder style APIs and docs for `FlightData`,` FlightInfo`, `FlightEndpoint`, `Locaation` and `Ticket` (#4294) * Add Builder style APIs and docs for FlightData, FlightInfo, FlightEndpoint, Locaation and Ticket * fix and clippy * Rename `FlightInfo::with_schema` to `FlightInfo::try_with_schema` --- arrow-flight/examples/flight_sql_server.rs | 62 ++---- arrow-flight/src/lib.rs | 221 +++++++++++++++++--- arrow-flight/tests/flight_sql_client_cli.rs | 64 +++--- 3 files changed, 232 insertions(+), 115 deletions(-) diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 27ae5d85434c..783e0bf5bdf6 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -29,7 +29,6 @@ use tonic::{Request, Response, Status, Streaming}; use arrow_array::builder::StringBuilder; use arrow_array::{ArrayRef, RecordBatch}; use arrow_flight::encode::FlightDataEncoderBuilder; -use arrow_flight::flight_descriptor::DescriptorType; use arrow_flight::sql::sql_info::SqlInfoList; use arrow_flight::sql::{ server::FlightSqlService, ActionBeginSavepointRequest, ActionBeginSavepointResult, @@ -222,26 +221,15 @@ impl FlightSqlService for FlightSqlServiceImpl { ticket: Some(ticket), location: vec![loc], }; - let endpoints = vec![endpoint]; + let info = FlightInfo::new() + .try_with_schema(&schema) + .map_err(|e| status!("Unable to serialize schema", e))? + .with_descriptor(FlightDescriptor::new_cmd(vec![])) + .with_endpoint(endpoint) + .with_total_records(num_rows as i64) + .with_total_bytes(num_bytes as i64) + .with_ordered(false); - let message = SchemaAsIpc::new(&schema, &IpcWriteOptions::default()) - .try_into() - .map_err(|e| status!("Unable to serialize schema", e))?; - let IpcMessage(schema_bytes) = message; - - let flight_desc = FlightDescriptor { - r#type: DescriptorType::Cmd.into(), - cmd: Default::default(), - path: vec![], - }; - let info = FlightInfo { - schema: schema_bytes, - flight_descriptor: Some(flight_desc), - endpoint: endpoints, - total_records: num_rows as i64, - total_bytes: num_bytes as i64, - ordered: false, - }; let resp = Response::new(info); Ok(resp) } @@ -292,32 +280,14 @@ impl FlightSqlService for FlightSqlServiceImpl { request: Request, ) -> Result, Status> { let flight_descriptor = request.into_inner(); - let ticket = Ticket { - ticket: query.encode_to_vec().into(), - }; - - let options = IpcWriteOptions::default(); - - // encode the schema into the correct form - let IpcMessage(schema) = SchemaAsIpc::new(SqlInfoList::schema(), &options) - .try_into() - .expect("valid sql_info schema"); - - let endpoint = vec![FlightEndpoint { - ticket: Some(ticket), - // we assume users wnating to use this helper would reasonably - // never need to be distributed across multile endpoints? 
- location: vec![], - }]; - - let flight_info = FlightInfo { - schema, - flight_descriptor: Some(flight_descriptor), - endpoint, - total_records: -1, - total_bytes: -1, - ordered: false, - }; + let ticket = Ticket::new(query.encode_to_vec()); + let endpoint = FlightEndpoint::new().with_ticket(ticket); + + let flight_info = FlightInfo::new() + .try_with_schema(SqlInfoList::schema()) + .map_err(|e| status!("Unable to encode schema", e))? + .with_endpoint(endpoint) + .with_descriptor(flight_descriptor); Ok(tonic::Response::new(flight_info)) } diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index f7df32a20002..4163f2ceaa27 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -26,7 +26,8 @@ //! This crate contains: //! //! 1. Low level [prost] generated structs -//! for Flight gRPC protobuf messages, such as [`FlightData`]. +//! for Flight gRPC protobuf messages, such as [`FlightData`], [`FlightInfo`], +//! [`Location`] and [`Ticket`]. //! //! 2. Low level [tonic] generated [`flight_service_client`] and //! [`flight_service_server`]. @@ -390,21 +391,51 @@ impl FlightData { /// See [`FlightDataEncoderBuilder`] for a higher level API to /// convert a stream of [`RecordBatch`]es to [`FlightData`]s /// + /// # Example: + /// + /// ``` + /// # use bytes::Bytes; + /// # use arrow_flight::{FlightData, FlightDescriptor}; + /// # fn encode_data() -> Bytes { Bytes::new() } // dummy data + /// // Get encoded Arrow IPC data: + /// let data_body: Bytes = encode_data(); + /// // Create the FlightData message + /// let flight_data = FlightData::new() + /// .with_descriptor(FlightDescriptor::new_cmd("the command")) + /// .with_app_metadata("My apps metadata") + /// .with_data_body(data_body); + /// ``` + /// /// [`FlightDataEncoderBuilder`]: crate::encode::FlightDataEncoderBuilder /// [`RecordBatch`]: arrow_array::RecordBatch - pub fn new( - flight_descriptor: Option, - message: IpcMessage, - app_metadata: impl Into, - data_body: impl Into, - ) -> Self { - let IpcMessage(vals) = message; - FlightData { - flight_descriptor, - data_header: vals, - app_metadata: app_metadata.into(), - data_body: data_body.into(), - } + pub fn new() -> Self { + Default::default() + } + + /// Add a [`FlightDescriptor`] describing the data + pub fn with_descriptor(mut self, flight_descriptor: FlightDescriptor) -> Self { + self.flight_descriptor = Some(flight_descriptor); + self + } + + /// Add a data header + pub fn with_data_header(mut self, data_header: impl Into) -> Self { + self.data_header = data_header.into(); + self + } + + /// Add a data body. See [`IpcDataGenerator`] to create this data. + /// + /// [`IpcDataGenerator`]: arrow_ipc::writer::IpcDataGenerator + pub fn with_data_body(mut self, data_body: impl Into) -> Self { + self.data_body = data_body.into(); + self + } + + /// Add optional application specific metadata to the message + pub fn with_app_metadata(mut self, app_metadata: impl Into) -> Self { + self.app_metadata = app_metadata.into(); + self } } @@ -433,24 +464,45 @@ impl FlightDescriptor { } impl FlightInfo { - /// Create a new [`FlightInfo`] that describes the access - /// coordinates for retrieval of a dataset. 
- pub fn new( - message: IpcMessage, - flight_descriptor: Option, - endpoint: Vec, - total_records: i64, - total_bytes: i64, - ordered: bool, - ) -> Self { - let IpcMessage(vals) = message; + /// Create a new, empty `FlightInfo`, describing where to fetch flight data + /// + /// + /// # Example: + /// ``` + /// # use arrow_flight::{FlightInfo, Ticket, FlightDescriptor, FlightEndpoint}; + /// # use arrow_schema::{Schema, Field, DataType}; + /// # fn get_schema() -> Schema { + /// # Schema::new(vec![ + /// # Field::new("a", DataType::Utf8, false), + /// # ]) + /// # } + /// # + /// // Create a new FlightInfo + /// let flight_info = FlightInfo::new() + /// // Encode the Arrow schema + /// .try_with_schema(&get_schema()) + /// .expect("encoding failed") + /// .with_descriptor( + /// FlightDescriptor::new_cmd("a command") + /// ) + /// .with_endpoint( + /// FlightEndpoint::new() + /// .with_ticket(Ticket::new("ticket contents") + /// ) + /// ) + /// .with_descriptor(FlightDescriptor::new_cmd("RUN QUERY")); + /// ``` + pub fn new() -> FlightInfo { FlightInfo { - schema: vals, - flight_descriptor, - endpoint, - total_records, - total_bytes, - ordered, + schema: Bytes::new(), + flight_descriptor: None, + endpoint: vec![], + ordered: false, + // Flight says "Set these to -1 if unknown." + // + // https://github.com/apache/arrow-rs/blob/17ca4d51d0490f9c65f5adde144f677dbc8300e7/format/Flight.proto#L287-L289 + total_records: -1, + total_bytes: -1, } } @@ -459,6 +511,51 @@ impl FlightInfo { let msg = IpcMessage(self.schema); msg.try_into() } + + /// Specify the schema for the response. + /// + /// Note this takes the arrow [`Schema`] (not the IPC schema) and + /// encodes it using the default IPC options. + /// + /// Returns an error if `schema` can not be encoded into IPC form. + pub fn try_with_schema(mut self, schema: &Schema) -> ArrowResult { + let options = IpcWriteOptions::default(); + let IpcMessage(schema) = SchemaAsIpc::new(schema, &options).try_into()?; + self.schema = schema; + Ok(self) + } + + /// Add specific a endpoint for fetching the data + pub fn with_endpoint(mut self, endpoint: FlightEndpoint) -> Self { + self.endpoint.push(endpoint); + self + } + + /// Add a [`FlightDescriptor`] describing what this data is + pub fn with_descriptor(mut self, flight_descriptor: FlightDescriptor) -> Self { + self.flight_descriptor = Some(flight_descriptor); + self + } + + /// Set the number of records in the result, if known + pub fn with_total_records(mut self, total_records: i64) -> Self { + self.total_records = total_records; + self + } + + /// Set the number of bytes in the result, if known + pub fn with_total_bytes(mut self, total_bytes: i64) -> Self { + self.total_bytes = total_bytes; + self + } + + /// Specify if the response is [ordered] across endpoints + /// + /// [ordered]: https://github.com/apache/arrow-rs/blob/17ca4d51d0490f9c65f5adde144f677dbc8300e7/format/Flight.proto#L269-L275 + pub fn with_ordered(mut self, ordered: bool) -> Self { + self.ordered = ordered; + self + } } impl<'a> SchemaAsIpc<'a> { @@ -486,6 +583,68 @@ impl Result { } } +impl Ticket { + /// Create a new `Ticket` + /// + /// # Example + /// + /// ``` + /// # use arrow_flight::Ticket; + /// let ticket = Ticket::new("SELECT * from FOO"); + /// ``` + pub fn new(ticket: impl Into) -> Self { + Self { + ticket: ticket.into(), + } + } +} + +impl FlightEndpoint { + /// Create a new, empty `FlightEndpoint` that represents a location + /// to retrieve Flight results. 
+ /// + /// # Example + /// ``` + /// # use arrow_flight::{FlightEndpoint, Ticket}; + /// # + /// // Specify the client should fetch results from this server + /// let endpoint = FlightEndpoint::new() + /// .with_ticket(Ticket::new("the ticket")); + /// + /// // Specify the client should fetch results from either + /// // `http://example.com` or `https://example.com` + /// let endpoint = FlightEndpoint::new() + /// .with_ticket(Ticket::new("the ticket")) + /// .with_location("http://example.com") + /// .with_location("https://example.com"); + /// ``` + pub fn new() -> FlightEndpoint { + Default::default() + } + + /// Set the [`Ticket`] used to retrieve data from the endpoint + pub fn with_ticket(mut self, ticket: Ticket) -> Self { + self.ticket = Some(ticket); + self + } + + /// Add a location `uri` to this endpoint. Note each endpoint can + /// have multiple locations. + /// + /// If no `uri` is specified, the [Flight Spec] says: + /// + /// ```text + /// * If the list is empty, the expectation is that the ticket can only + /// * be redeemed on the current service where the ticket was + /// * generated. + /// ``` + /// [Flight Spec]: https://github.com/apache/arrow-rs/blob/17ca4d51d0490f9c65f5adde144f677dbc8300e7/format/Flight.proto#L307C2-L312 + pub fn with_location(mut self, uri: impl Into) -> Self { + self.location.push(Location { uri: uri.into() }); + self + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-flight/tests/flight_sql_client_cli.rs b/arrow-flight/tests/flight_sql_client_cli.rs index 9b3baca9ba6c..c4ae9280c898 100644 --- a/arrow-flight/tests/flight_sql_client_cli.rs +++ b/arrow-flight/tests/flight_sql_client_cli.rs @@ -36,9 +36,8 @@ use arrow_flight::{ }, utils::batches_to_flight_data, Action, FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, - HandshakeResponse, IpcMessage, SchemaAsIpc, Ticket, + HandshakeResponse, Ticket, }; -use arrow_ipc::writer::IpcWriteOptions; use arrow_schema::{ArrowError, DataType, Field, Schema}; use assert_cmd::Command; use futures::Stream; @@ -167,42 +166,31 @@ impl FlightSqlService for FlightSqlServiceImpl { let batch = Self::fake_result().unwrap(); - let IpcMessage(schema_bytes) = - SchemaAsIpc::new(batch.schema().as_ref(), &IpcWriteOptions::default()) - .try_into() - .unwrap(); - - let info = FlightInfo { - schema: schema_bytes, - flight_descriptor: None, - endpoint: vec![ - FlightEndpoint { - ticket: Some(Ticket { - ticket: FetchResults { - handle: String::from("part_1"), - } - .as_any() - .encode_to_vec() - .into(), - }), - location: vec![], - }, - FlightEndpoint { - ticket: Some(Ticket { - ticket: FetchResults { - handle: String::from("part_2"), - } - .as_any() - .encode_to_vec() - .into(), - }), - location: vec![], - }, - ], - total_records: batch.num_rows() as i64, - total_bytes: batch.get_array_memory_size() as i64, - ordered: false, - }; + let info = FlightInfo::new() + .try_with_schema(&batch.schema()) + .expect("encoding schema") + .with_endpoint( + FlightEndpoint::new().with_ticket(Ticket::new( + FetchResults { + handle: String::from("part_1"), + } + .as_any() + .encode_to_vec(), + )), + ) + .with_endpoint( + FlightEndpoint::new().with_ticket(Ticket::new( + FetchResults { + handle: String::from("part_2"), + } + .as_any() + .encode_to_vec(), + )), + ) + .with_total_records(batch.num_rows() as i64) + .with_total_bytes(batch.get_array_memory_size() as i64) + .with_ordered(false); + let resp = Response::new(info); Ok(resp) } From d5ba15aa6ef291a429aa02981cb09d300f378b3b Mon Sep 17 00:00:00 2001 
From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Tue, 30 May 2023 15:18:14 +0200 Subject: [PATCH 0947/1411] feat(flight): support int32_to_int32_list_map in sql infos (#4300) * Add an example/ * feat(flight): support int32_to_int32_list_map in sql infos * fix: use enum variants in test * chore: cleanup * fix: remove merge artifact --------- Co-authored-by: Andrew Lamb --- arrow-flight/src/sql/sql_info.rs | 133 +++++++++++++++++++++++++------ 1 file changed, 108 insertions(+), 25 deletions(-) diff --git a/arrow-flight/src/sql/sql_info.rs b/arrow-flight/src/sql/sql_info.rs index 717f1393c879..f0d14ff8a741 100644 --- a/arrow-flight/src/sql/sql_info.rs +++ b/arrow-flight/src/sql/sql_info.rs @@ -19,16 +19,18 @@ //! //! [`CommandGetSqlInfo`]: crate::sql::CommandGetSqlInfo -use std::{borrow::Cow, collections::BTreeMap, sync::Arc}; +use std::borrow::Cow; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; use arrow_array::array::{Array, UnionArray}; use arrow_array::builder::{ ArrayBuilder, BooleanBuilder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, - StringBuilder, UInt32Builder, + MapBuilder, StringBuilder, UInt32Builder, }; use arrow_array::RecordBatch; use arrow_data::ArrayData; -use arrow_schema::{DataType, Field, Schema, UnionFields, UnionMode}; +use arrow_schema::{DataType, Field, Fields, Schema, UnionFields, UnionMode}; use once_cell::sync::Lazy; use super::SqlInfo; @@ -42,8 +44,7 @@ pub enum SqlInfoValue { BigInt(i64), Bitmask(i32), StringList(Vec), - // TODO support more exotic metadata that requires the map of lists - //ListMap(BTreeMap>), + ListMap(BTreeMap>), } impl From<&str> for SqlInfoValue { @@ -77,6 +78,35 @@ impl From<&[&str]> for SqlInfoValue { } } +impl From> for SqlInfoValue { + fn from(values: Vec) -> Self { + Self::StringList(values) + } +} + +impl From>> for SqlInfoValue { + fn from(value: BTreeMap>) -> Self { + Self::ListMap(value) + } +} + +impl From>> for SqlInfoValue { + fn from(value: HashMap>) -> Self { + Self::ListMap(value.into_iter().collect()) + } +} + +impl From<&HashMap>> for SqlInfoValue { + fn from(value: &HashMap>) -> Self { + Self::ListMap( + value + .iter() + .map(|(k, v)| (k.to_owned(), v.to_owned())) + .collect(), + ) + } +} + /// Something that can be converted into u32 (the represenation of a [`SqlInfo`] name) pub trait SqlInfoName { fn as_u32(&self) -> u32; @@ -99,8 +129,7 @@ impl SqlInfoName for u32 { /// Handles creating the dense [`UnionArray`] described by [flightsql] /// -/// -/// NOT YET COMPLETE: The int32_to_int32_list_map +/// incrementally build types/offset of the dense union. See [Union Spec] for details. /// /// ```text /// * value: dense_union< @@ -113,6 +142,7 @@ impl SqlInfoName for u32 { /// * > /// ``` ///[flightsql]: (https://github.com/apache/arrow/blob/f9324b79bf4fc1ec7e97b32e3cce16e75ef0f5e3/format/FlightSql.proto#L32-L43 +///[Union Spec]: https://arrow.apache.org/docs/format/Columnar.html#dense-union struct SqlInfoUnionBuilder { // Values for each child type string_values: StringBuilder, @@ -120,12 +150,7 @@ struct SqlInfoUnionBuilder { bigint_values: Int64Builder, int32_bitmask_values: Int32Builder, string_list_values: ListBuilder, - - /// incrementally build types/offset of the dense union, - /// - /// See [Union Spec] for details. 
- /// - /// [Union Spec]: https://arrow.apache.org/docs/format/Columnar.html#dense-union + int32_to_int32_list_map_values: MapBuilder>, type_ids: Int8Builder, offsets: Int32Builder, } @@ -143,6 +168,29 @@ static UNION_TYPE: Lazy = Lazy::new(|| { DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), true, ), + Field::new( + "int32_to_int32_list_map", + DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("keys", DataType::Int32, false), + Field::new( + "values", + DataType::List(Arc::new(Field::new( + "item", + DataType::Int32, + true, + ))), + true, + ), + ])), + false, + )), + false, + ), + true, + ), ]; // create "type ids", one for each type, assume they go from 0 .. num_fields @@ -159,6 +207,11 @@ impl SqlInfoUnionBuilder { bigint_values: Int64Builder::new(), int32_bitmask_values: Int32Builder::new(), string_list_values: ListBuilder::new(StringBuilder::new()), + int32_to_int32_list_map_values: MapBuilder::new( + None, + Int32Builder::new(), + ListBuilder::new(Int32Builder::new()), + ), type_ids: Int8Builder::new(), offsets: Int32Builder::new(), } @@ -170,7 +223,7 @@ impl SqlInfoUnionBuilder { } /// Append the specified value to this builder - pub fn append_value(&mut self, v: &SqlInfoValue) { + pub fn append_value(&mut self, v: &SqlInfoValue) -> Result<()> { // typeid is which child and len is the child array's length // *after* adding the value let (type_id, len) = match v { @@ -199,11 +252,24 @@ impl SqlInfoUnionBuilder { self.string_list_values.append(true); (4, self.string_list_values.len()) } + SqlInfoValue::ListMap(values) => { + // build map + for (k, v) in values.clone() { + self.int32_to_int32_list_map_values.keys().append_value(k); + self.int32_to_int32_list_map_values + .values() + .append_value(v.into_iter().map(Some)); + } + // complete the list + self.int32_to_int32_list_map_values.append(true)?; + (5, self.int32_to_int32_list_map_values.len()) + } }; self.type_ids.append_value(type_id); let len = i32::try_from(len).expect("offset fit in i32"); self.offsets.append_value(len - 1); + Ok(()) } /// Complete the construction and build the [`UnionArray`] @@ -214,6 +280,7 @@ impl SqlInfoUnionBuilder { mut bigint_values, mut int32_bitmask_values, mut string_list_values, + mut int32_to_int32_list_map_values, mut type_ids, mut offsets, } = self; @@ -237,6 +304,7 @@ impl SqlInfoUnionBuilder { bigint_values.finish().into_data(), int32_bitmask_values.finish().into_data(), string_list_values.finish().into_data(), + int32_to_int32_list_map_values.finish().into_data(), ]; let data = ArrayData::try_new( @@ -342,7 +410,7 @@ impl SqlInfoList { for (&name, value) in self.infos.iter() { name_builder.append_value(name); - value_builder.append_value(value) + value_builder.append_value(value)? 
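        // Editor's note: an illustrative sketch, not part of the original patch.
        // With the new `ListMap` variant, a map of i32 keys to i32 lists can be
        // passed straight to `with_sql_info`, mirroring the test added below:
        //
        //     let mut convert: HashMap<i32, Vec<i32>> = HashMap::new();
        //     convert.insert(
        //         SqlSupportsConvert::SqlConvertInteger as i32,
        //         vec![SqlSupportsConvert::SqlConvertFloat as i32],
        //     );
        //     let batch = SqlInfoList::new()
        //         .with_sql_info(SqlInfo::SqlSupportsConvert, &convert)
        //         .encode()?;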
} let batch = RecordBatch::try_from_iter(vec![ @@ -369,8 +437,12 @@ static SQL_INFO_SCHEMA: Lazy = Lazy::new(|| { #[cfg(test)] mod tests { + use std::collections::HashMap; + use super::SqlInfoList; - use crate::sql::{SqlInfo, SqlNullOrdering, SqlSupportedTransaction}; + use crate::sql::{ + SqlInfo, SqlNullOrdering, SqlSupportedTransaction, SqlSupportsConvert, + }; use arrow_array::RecordBatch; use arrow_cast::pretty::pretty_format_batches; @@ -386,6 +458,15 @@ mod tests { #[test] fn test_sql_infos() { + let mut convert: HashMap> = HashMap::new(); + convert.insert( + SqlSupportsConvert::SqlConvertInteger as i32, + vec![ + SqlSupportsConvert::SqlConvertFloat as i32, + SqlSupportsConvert::SqlConvertReal as i32, + ], + ); + let batch = SqlInfoList::new() // str .with_sql_info(SqlInfo::SqlIdentifierQuoteChar, r#"""#) @@ -400,19 +481,21 @@ mod tests { .with_sql_info(SqlInfo::SqlMaxBinaryLiteralLength, i32::MAX as i64) // [str] .with_sql_info(SqlInfo::SqlKeywords, &["SELECT", "DELETE"] as &[&str]) + .with_sql_info(SqlInfo::SqlSupportsConvert, &convert) .encode() .unwrap(); let expected = vec![ - "+-----------+--------------------------------+", - "| info_name | value |", - "+-----------+--------------------------------+", - "| 500 | {bool_value=false} |", - "| 504 | {string_value=\"} |", - "| 507 | {int32_bitmask=0} |", - "| 508 | {string_list=[SELECT, DELETE]} |", - "| 541 | {bigint_value=2147483647} |", - "+-----------+--------------------------------+", + "+-----------+----------------------------------------+", + "| info_name | value |", + "+-----------+----------------------------------------+", + "| 500 | {bool_value=false} |", + "| 504 | {string_value=\"} |", + "| 507 | {int32_bitmask=0} |", + "| 508 | {string_list=[SELECT, DELETE]} |", + "| 517 | {int32_to_int32_list_map={7: [6, 13]}} |", + "| 541 | {bigint_value=2147483647} |", + "+-----------+----------------------------------------+", ]; assert_batches_eq(&[batch], &expected); From 6dd0378e33926f756d142f04d3e5cbc984ad258d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 30 May 2023 15:15:52 +0100 Subject: [PATCH 0948/1411] Simplify parquet PageIterator (#4306) --- parquet/benches/arrow_reader.rs | 43 +++---------------- .../src/arrow/array_reader/primitive_array.rs | 25 +++-------- parquet/src/arrow/array_reader/test_util.rs | 25 ++--------- parquet/src/arrow/async_reader/mod.rs | 16 +------ parquet/src/column/page.rs | 9 +--- parquet/src/file/reader.rs | 16 +------ parquet/src/util/test_common/page_util.rs | 22 ++-------- 7 files changed, 24 insertions(+), 132 deletions(-) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index f6f65bea8f2c..3dda6304d122 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -76,7 +76,6 @@ pub fn seedable_rng() -> StdRng { // support byte array for decimal fn build_encoded_decimal_bytes_page_iterator( - schema: SchemaDescPtr, column_desc: ColumnDescPtr, null_density: f32, encoding: Encoding, @@ -136,11 +135,10 @@ where } pages.push(column_chunk_pages); } - InMemoryPageIterator::new(schema, column_desc, pages) + InMemoryPageIterator::new(pages) } fn build_encoded_primitive_page_iterator( - schema: SchemaDescPtr, column_desc: ColumnDescPtr, null_density: f32, encoding: Encoding, @@ -185,11 +183,10 @@ where pages.push(column_chunk_pages); } - InMemoryPageIterator::new(schema, column_desc, pages) + InMemoryPageIterator::new(pages) } fn build_dictionary_encoded_primitive_page_iterator( - 
schema: SchemaDescPtr, column_desc: ColumnDescPtr, null_density: f32, ) -> impl PageIterator + Clone @@ -254,11 +251,10 @@ where pages.push(column_chunk_pages.into()); } - InMemoryPageIterator::new(schema, column_desc, pages) + InMemoryPageIterator::new(pages) } fn build_plain_encoded_string_page_iterator( - schema: SchemaDescPtr, column_desc: ColumnDescPtr, null_density: f32, ) -> impl PageIterator + Clone { @@ -297,11 +293,10 @@ fn build_plain_encoded_string_page_iterator( pages.push(column_chunk_pages); } - InMemoryPageIterator::new(schema, column_desc, pages) + InMemoryPageIterator::new(pages) } fn build_dictionary_encoded_string_page_iterator( - schema: SchemaDescPtr, column_desc: ColumnDescPtr, null_density: f32, ) -> impl PageIterator + Clone { @@ -363,7 +358,7 @@ fn build_dictionary_encoded_string_page_iterator( pages.push(column_chunk_pages.into()); } - InMemoryPageIterator::new(schema, column_desc, pages) + InMemoryPageIterator::new(pages) } fn bench_array_reader(mut array_reader: Box) -> usize { @@ -471,7 +466,6 @@ fn create_string_byte_array_dictionary_reader( fn bench_byte_decimal( group: &mut BenchmarkGroup, - schema: &SchemaDescPtr, mandatory_column_desc: &ColumnDescPtr, optional_column_desc: &ColumnDescPtr, min: i128, @@ -485,7 +479,6 @@ fn bench_byte_decimal( // plain encoded, no NULLs let data = build_encoded_decimal_bytes_page_iterator::( - schema.clone(), mandatory_column_desc.clone(), 0.0, Encoding::PLAIN, @@ -504,7 +497,6 @@ fn bench_byte_decimal( }); let data = build_encoded_decimal_bytes_page_iterator::( - schema.clone(), optional_column_desc.clone(), 0.0, Encoding::PLAIN, @@ -524,7 +516,6 @@ fn bench_byte_decimal( // half null let data = build_encoded_decimal_bytes_page_iterator::( - schema.clone(), optional_column_desc.clone(), 0.5, Encoding::PLAIN, @@ -545,7 +536,6 @@ fn bench_byte_decimal( fn bench_primitive( group: &mut BenchmarkGroup, - schema: &SchemaDescPtr, mandatory_column_desc: &ColumnDescPtr, optional_column_desc: &ColumnDescPtr, min: usize, @@ -558,7 +548,6 @@ fn bench_primitive( // plain encoded, no NULLs let data = build_encoded_primitive_page_iterator::( - schema.clone(), mandatory_column_desc.clone(), 0.0, Encoding::PLAIN, @@ -577,7 +566,6 @@ fn bench_primitive( }); let data = build_encoded_primitive_page_iterator::( - schema.clone(), optional_column_desc.clone(), 0.0, Encoding::PLAIN, @@ -595,7 +583,6 @@ fn bench_primitive( // plain encoded, half NULLs let data = build_encoded_primitive_page_iterator::( - schema.clone(), optional_column_desc.clone(), 0.5, Encoding::PLAIN, @@ -613,7 +600,6 @@ fn bench_primitive( // binary packed, no NULLs let data = build_encoded_primitive_page_iterator::( - schema.clone(), mandatory_column_desc.clone(), 0.0, Encoding::DELTA_BINARY_PACKED, @@ -632,7 +618,6 @@ fn bench_primitive( }); let data = build_encoded_primitive_page_iterator::( - schema.clone(), optional_column_desc.clone(), 0.0, Encoding::DELTA_BINARY_PACKED, @@ -650,7 +635,6 @@ fn bench_primitive( // binary packed skip , no NULLs let data = build_encoded_primitive_page_iterator::( - schema.clone(), mandatory_column_desc.clone(), 0.0, Encoding::DELTA_BINARY_PACKED, @@ -669,7 +653,6 @@ fn bench_primitive( }); let data = build_encoded_primitive_page_iterator::( - schema.clone(), optional_column_desc.clone(), 0.0, Encoding::DELTA_BINARY_PACKED, @@ -687,7 +670,6 @@ fn bench_primitive( // binary packed, half NULLs let data = build_encoded_primitive_page_iterator::( - schema.clone(), optional_column_desc.clone(), 0.5, Encoding::DELTA_BINARY_PACKED, @@ -705,7 +687,6 
@@ fn bench_primitive( // dictionary encoded, no NULLs let data = build_dictionary_encoded_primitive_page_iterator::( - schema.clone(), mandatory_column_desc.clone(), 0.0, ); @@ -721,7 +702,6 @@ fn bench_primitive( }); let data = build_dictionary_encoded_primitive_page_iterator::( - schema.clone(), optional_column_desc.clone(), 0.0, ); @@ -736,7 +716,6 @@ fn bench_primitive( // dictionary encoded, half NULLs let data = build_dictionary_encoded_primitive_page_iterator::( - schema.clone(), optional_column_desc.clone(), 0.5, ); @@ -758,7 +737,6 @@ fn decimal_benches(c: &mut Criterion) { let mut group = c.benchmark_group("arrow_array_reader/INT32/Decimal128Array"); bench_primitive::( &mut group, - &schema, &mandatory_decimal1_leaf_desc, &optional_decimal1_leaf_desc, // precision is 8: the max is 99999999 @@ -773,7 +751,6 @@ fn decimal_benches(c: &mut Criterion) { let optional_decimal2_leaf_desc = schema.column(9); bench_primitive::( &mut group, - &schema, &mandatory_decimal2_leaf_desc, &optional_decimal2_leaf_desc, // precision is 16: the max is 9999999999999999 @@ -788,7 +765,6 @@ fn decimal_benches(c: &mut Criterion) { let optional_decimal3_leaf_desc = schema.column(11); bench_byte_decimal::( &mut group, - &schema, &mandatory_decimal3_leaf_desc, &optional_decimal3_leaf_desc, // precision is 16: the max is 9999999999999999 @@ -803,7 +779,6 @@ fn decimal_benches(c: &mut Criterion) { let optional_decimal4_leaf_desc = schema.column(13); bench_byte_decimal::( &mut group, - &schema, &mandatory_decimal4_leaf_desc, &optional_decimal4_leaf_desc, // precision is 16: the max is 9999999999999999 @@ -829,7 +804,6 @@ fn add_benches(c: &mut Criterion) { let mut group = c.benchmark_group("arrow_array_reader/Int32Array"); bench_primitive::( &mut group, - &schema, &mandatory_int32_column_desc, &optional_int32_column_desc, 0, @@ -843,7 +817,6 @@ fn add_benches(c: &mut Criterion) { let mut group = c.benchmark_group("arrow_array_reader/Int64Array"); bench_primitive::( &mut group, - &schema, &mandatory_int64_column_desc, &optional_int64_column_desc, 0, @@ -858,7 +831,6 @@ fn add_benches(c: &mut Criterion) { // string, plain encoded, no NULLs let plain_string_no_null_data = build_plain_encoded_string_page_iterator( - schema.clone(), mandatory_string_column_desc.clone(), 0.0, ); @@ -874,7 +846,6 @@ fn add_benches(c: &mut Criterion) { }); let plain_string_no_null_data = build_plain_encoded_string_page_iterator( - schema.clone(), optional_string_column_desc.clone(), 0.0, ); @@ -891,7 +862,6 @@ fn add_benches(c: &mut Criterion) { // string, plain encoded, half NULLs let plain_string_half_null_data = build_plain_encoded_string_page_iterator( - schema.clone(), optional_string_column_desc.clone(), 0.5, ); @@ -908,7 +878,6 @@ fn add_benches(c: &mut Criterion) { // string, dictionary encoded, no NULLs let dictionary_string_no_null_data = build_dictionary_encoded_string_page_iterator( - schema.clone(), mandatory_string_column_desc.clone(), 0.0, ); @@ -924,7 +893,6 @@ fn add_benches(c: &mut Criterion) { }); let dictionary_string_no_null_data = build_dictionary_encoded_string_page_iterator( - schema.clone(), optional_string_column_desc.clone(), 0.0, ); @@ -941,7 +909,6 @@ fn add_benches(c: &mut Criterion) { // string, dictionary encoded, half NULLs let dictionary_string_half_null_data = build_dictionary_encoded_string_page_iterator( - schema.clone(), optional_string_column_desc.clone(), 0.5, ); diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index 
1e2720a4a4df..bef27dc7aae1 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -363,12 +363,9 @@ mod tests { .map(|t| Arc::new(SchemaDescriptor::new(Arc::new(t)))) .unwrap(); - let column_desc = schema.column(0); - let page_iterator = EmptyPageIterator::new(schema); - let mut array_reader = PrimitiveArrayReader::::new( - Box::new(page_iterator), - column_desc, + Box::::default(), + schema.column(0), None, ) .unwrap(); @@ -410,8 +407,7 @@ mod tests { true, 2, ); - let page_iterator = - InMemoryPageIterator::new(schema, column_desc.clone(), page_lists); + let page_iterator = InMemoryPageIterator::new(page_lists); let mut array_reader = PrimitiveArrayReader::::new( Box::new(page_iterator), @@ -474,11 +470,7 @@ mod tests { true, 2, ); - let page_iterator = InMemoryPageIterator::new( - schema.clone(), - column_desc.clone(), - page_lists, - ); + let page_iterator = InMemoryPageIterator::new(page_lists); let mut array_reader = PrimitiveArrayReader::<$arrow_parquet_type>::new( Box::new(page_iterator), column_desc.clone(), @@ -610,8 +602,7 @@ mod tests { 2, ); - let page_iterator = - InMemoryPageIterator::new(schema, column_desc.clone(), page_lists); + let page_iterator = InMemoryPageIterator::new(page_lists); let mut array_reader = PrimitiveArrayReader::::new( Box::new(page_iterator), @@ -690,8 +681,7 @@ mod tests { true, 2, ); - let page_iterator = - InMemoryPageIterator::new(schema, column_desc.clone(), page_lists); + let page_iterator = InMemoryPageIterator::new(page_lists); let mut array_reader = PrimitiveArrayReader::::new( Box::new(page_iterator), @@ -753,8 +743,7 @@ mod tests { true, 2, ); - let page_iterator = - InMemoryPageIterator::new(schema, column_desc.clone(), page_lists); + let page_iterator = InMemoryPageIterator::new(page_lists); let mut array_reader = PrimitiveArrayReader::::new( Box::new(page_iterator), diff --git a/parquet/src/arrow/array_reader/test_util.rs b/parquet/src/arrow/array_reader/test_util.rs index 6585d46146e2..7e66efead2e5 100644 --- a/parquet/src/arrow/array_reader/test_util.rs +++ b/parquet/src/arrow/array_reader/test_util.rs @@ -26,9 +26,7 @@ use crate::column::page::{PageIterator, PageReader}; use crate::data_type::{ByteArray, ByteArrayType}; use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder}; use crate::errors::Result; -use crate::schema::types::{ - ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, Type, -}; +use crate::schema::types::{ColumnDescPtr, ColumnDescriptor, ColumnPath, Type}; use crate::util::memory::ByteBufferPtr; /// Returns a descriptor for a UTF-8 column @@ -197,15 +195,8 @@ impl ArrayReader for InMemoryArrayReader { } /// Iterator for testing reading empty columns -pub struct EmptyPageIterator { - schema: SchemaDescPtr, -} - -impl EmptyPageIterator { - pub fn new(schema: SchemaDescPtr) -> Self { - EmptyPageIterator { schema } - } -} +#[derive(Default)] +pub struct EmptyPageIterator {} impl Iterator for EmptyPageIterator { type Item = Result>; @@ -215,12 +206,4 @@ impl Iterator for EmptyPageIterator { } } -impl PageIterator for EmptyPageIterator { - fn schema(&mut self) -> Result { - Ok(self.schema.clone()) - } - - fn column_schema(&mut self) -> Result { - Ok(self.schema.column(0)) - } -} +impl PageIterator for EmptyPageIterator {} diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index fb81a2b5d966..c11033eaeb7d 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ 
-112,7 +112,7 @@ use crate::format::PageLocation; use crate::file::FOOTER_SIZE; -use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; +use crate::schema::types::SchemaDescPtr; mod metadata; pub use metadata::*; @@ -673,8 +673,6 @@ impl<'a> RowGroupCollection for InMemoryRowGroup<'a> { )?); Ok(Box::new(ColumnChunkIterator { - schema: self.metadata.schema_descr_ptr(), - column_schema: self.metadata.schema_descr_ptr().columns()[i].clone(), reader: Some(Ok(page_reader)), })) } @@ -739,8 +737,6 @@ impl ChunkReader for ColumnChunkData { /// Implements [`PageIterator`] for a single column chunk, yielding a single [`PageReader`] struct ColumnChunkIterator { - schema: SchemaDescPtr, - column_schema: ColumnDescPtr, reader: Option>>, } @@ -752,15 +748,7 @@ impl Iterator for ColumnChunkIterator { } } -impl PageIterator for ColumnChunkIterator { - fn schema(&mut self) -> Result { - Ok(self.schema.clone()) - } - - fn column_schema(&mut self) -> Result { - Ok(self.column_schema.clone()) - } -} +impl PageIterator for ColumnChunkIterator {} #[cfg(test)] mod tests { diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index 57a0278e23c4..3b19734a2218 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -21,7 +21,6 @@ use crate::basic::{Encoding, PageType}; use crate::errors::{ParquetError, Result}; use crate::file::{metadata::ColumnChunkMetaData, statistics::Statistics}; use crate::format::PageHeader; -use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; use crate::util::memory::ByteBufferPtr; /// Parquet Page definition. @@ -338,13 +337,7 @@ pub trait PageWriter: Send { } /// An iterator over pages of one specific column in a parquet file. -pub trait PageIterator: Iterator>> + Send { - /// Get schema of parquet file. - fn schema(&mut self) -> Result; - - /// Get column schema of this page iterator. - fn column_schema(&mut self) -> Result; -} +pub trait PageIterator: Iterator>> + Send {} #[cfg(test)] mod tests { diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs index 6a7bbc78f306..7d2d7ea153d8 100644 --- a/parquet/src/file/reader.rs +++ b/parquet/src/file/reader.rs @@ -31,7 +31,7 @@ use crate::errors::{ParquetError, Result}; use crate::file::metadata::*; pub use crate::file::serialized_reader::{SerializedFileReader, SerializedPageReader}; use crate::record::reader::RowIter; -use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, Type as SchemaType}; +use crate::schema::types::Type as SchemaType; use crate::basic::Type; @@ -264,16 +264,4 @@ impl Iterator for FilePageIterator { } } -impl PageIterator for FilePageIterator { - fn schema(&mut self) -> Result { - Ok(self - .file_reader - .metadata() - .file_metadata() - .schema_descr_ptr()) - } - - fn column_schema(&mut self) -> Result { - self.schema().map(|s| s.column(self.column_index)) - } -} +impl PageIterator for FilePageIterator {} diff --git a/parquet/src/util/test_common/page_util.rs b/parquet/src/util/test_common/page_util.rs index 243fb6f8b897..ab5287462c8c 100644 --- a/parquet/src/util/test_common/page_util.rs +++ b/parquet/src/util/test_common/page_util.rs @@ -22,7 +22,7 @@ use crate::data_type::DataType; use crate::encodings::encoding::{get_encoder, Encoder}; use crate::encodings::levels::LevelEncoder; use crate::errors::Result; -use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; +use crate::schema::types::ColumnDescPtr; use crate::util::memory::ByteBufferPtr; use std::iter::Peekable; use std::mem; @@ -204,20 +204,12 @@ impl + Send> Iterator for InMemoryPageReader
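// Editor's note: an illustrative sketch, not part of the original patch. With
// `schema()`/`column_schema()` removed, `PageIterator` becomes a marker trait,
// so any `Send` iterator of page readers can opt in with an empty impl, e.g.
// for a hypothetical wrapper around a Vec of readers:
//
//     struct VecPages(std::vec::IntoIter<Result<Box<dyn PageReader>>>);
//
//     impl Iterator for VecPages {
//         type Item = Result<Box<dyn PageReader>>;
//         fn next(&mut self) -> Option<Self::Item> {
//             self.0.next()
//         }
//     }
//
//     impl PageIterator for VecPages {}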

{ /// A utility page iterator which stores page readers in memory, used for tests. #[derive(Clone)] pub struct InMemoryPageIterator>> { - schema: SchemaDescPtr, - column_desc: ColumnDescPtr, page_reader_iter: I, } impl>> InMemoryPageIterator { - pub fn new( - schema: SchemaDescPtr, - column_desc: ColumnDescPtr, - pages: impl IntoIterator, IntoIter = I>, - ) -> Self { + pub fn new(pages: impl IntoIterator, IntoIter = I>) -> Self { Self { - schema, - column_desc, page_reader_iter: pages.into_iter(), } } @@ -233,12 +225,4 @@ impl>> Iterator for InMemoryPageIterator { } } -impl> + Send> PageIterator for InMemoryPageIterator { - fn schema(&mut self) -> Result { - Ok(self.schema.clone()) - } - - fn column_schema(&mut self) -> Result { - Ok(self.column_desc.clone()) - } -} +impl> + Send> PageIterator for InMemoryPageIterator {} From 04ca2f2b0ad964ce8962b8b362da0df932c88091 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 30 May 2023 13:52:45 -0700 Subject: [PATCH 0949/1411] feat: use exactly equal parts in multipart upload (#4305) * refactor: use exactly equal parts in multipart upload * Improve test * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Fix lifetime --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/lib.rs | 18 ++++++++++-- object_store/src/multipart.rs | 52 +++++++++++++++++++++++------------ 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index c5bf40cc4882..98bbb7adceb9 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -898,6 +898,8 @@ mod test_util { mod tests { use super::*; use crate::test_util::flatten_list_stream; + use bytes::{BufMut, BytesMut}; + use itertools::Itertools; use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { @@ -1308,8 +1310,18 @@ mod tests { } } + fn get_random_bytes(len: usize) -> Bytes { + use rand::Rng; + let mut rng = rand::thread_rng(); + let mut bytes = BytesMut::with_capacity(len); + for _ in 0..len { + bytes.put_u8(rng.gen()); + } + bytes.freeze() + } + fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { - std::iter::repeat(Bytes::from_iter(std::iter::repeat(b'x').take(chunk_length))) + std::iter::repeat(get_random_bytes(chunk_length)) .take(num_chunks) .collect() } @@ -1344,8 +1356,8 @@ mod tests { assert_eq!(bytes_expected, bytes_written); // Can overwrite some storage - // Sizes carefully chosen to exactly hit min limit of 5 MiB - let data = get_vec_of_bytes(242_880, 22); + // Sizes chosen to ensure we write three parts + let data = (0..7).map(|_| get_random_bytes(3_200_000)).collect_vec(); let bytes_expected = data.concat(); let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); for chunk in &data { diff --git a/object_store/src/multipart.rs b/object_store/src/multipart.rs index 0606fb51eb1c..26580307053e 100644 --- a/object_store/src/multipart.rs +++ b/object_store/src/multipart.rs @@ -60,8 +60,11 @@ where max_concurrency: usize, /// Buffer that will be sent in next upload. current_buffer: Vec, - /// Minimum size of a part in bytes - min_part_size: usize, + /// Size of each part. + /// + /// While S3 and Minio support variable part sizes, R2 requires they all be + /// exactly the same size. 
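    // Editor's note: a worked example, not part of the original patch. With the
    // default `part_size` of 10 MiB set below, a single 25 MiB write is consumed
    // as two full 10 MiB parts; the remaining 5 MiB stays in `current_buffer`
    // until later writes fill another part or the upload completes, at which
    // point it is presumably flushed as the final (smaller) part.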
+ part_size: usize, /// Index of current part current_part_idx: usize, /// The completion task @@ -85,12 +88,21 @@ where // Minimum size of 5 MiB // https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html // https://cloud.google.com/storage/quotas#requests - min_part_size: 5_242_880, + part_size: 10 * 1024 * 1024, current_part_idx: 0, completion_task: None, } } + // Add data to the current buffer, returning the number of bytes added + fn add_to_buffer(mut self: Pin<&mut Self>, buf: &[u8], offset: usize) -> usize { + let remaining_capacity = self.part_size - self.current_buffer.len(); + let to_copy = std::cmp::min(remaining_capacity, buf.len() - offset); + self.current_buffer + .extend_from_slice(&buf[offset..offset + to_copy]); + to_copy + } + pub fn poll_tasks( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, @@ -158,15 +170,21 @@ where // Poll current tasks self.as_mut().poll_tasks(cx)?; - // If adding buf to pending buffer would trigger send, check - // whether we have capacity for another task. - let enough_to_send = - (buf.len() + self.current_buffer.len()) >= self.min_part_size; - if enough_to_send && self.tasks.len() < self.max_concurrency { - // If we do, copy into the buffer and submit the task, and return ready. - self.current_buffer.extend_from_slice(buf); + let mut offset = 0; + + loop { + // Fill up current buffer + offset += self.as_mut().add_to_buffer(buf, offset); - let out_buffer = std::mem::take(&mut self.current_buffer); + // If we don't have a full buffer or we have too many tasks, break + if self.current_buffer.len() < self.part_size + || self.tasks.len() >= self.max_concurrency + { + break; + } + + let new_buffer = Vec::with_capacity(self.part_size); + let out_buffer = std::mem::replace(&mut self.current_buffer, new_buffer); let inner = Arc::clone(&self.inner); let part_idx = self.current_part_idx; self.tasks.push(Box::pin(async move { @@ -177,14 +195,14 @@ where // We need to poll immediately after adding to setup waker self.as_mut().poll_tasks(cx)?; + } - Poll::Ready(Ok(buf.len())) - } else if !enough_to_send { - self.current_buffer.extend_from_slice(buf); - Poll::Ready(Ok(buf.len())) - } else { - // Waker registered by call to poll_tasks at beginning + // If offset is zero, then we didn't write anything because we didn't + // have capacity for more tasks and our buffer is full. + if offset == 0 && !buf.is_empty() { Poll::Pending + } else { + Poll::Ready(Ok(offset)) } } From d8a4f984783653462d324530ccbf95400a703a54 Mon Sep 17 00:00:00 2001 From: Sergii Mikhtoniuk Date: Wed, 31 May 2023 03:04:43 -0700 Subject: [PATCH 0950/1411] Treat legacy TIMSETAMP_X converted types as UTC (#4309) --- .../src/arrow/array_reader/primitive_array.rs | 24 +++++++++++++++---- parquet/src/arrow/schema/mod.rs | 4 ++-- parquet/src/arrow/schema/primitive.rs | 14 ++++++----- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index bef27dc7aae1..ec0d29e8babc 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -438,7 +438,15 @@ mod tests { } macro_rules! 
test_primitive_array_reader_one_type { - ($arrow_parquet_type:ty, $physical_type:expr, $converted_type_str:expr, $result_arrow_type:ty, $result_arrow_cast_type:ty, $result_primitive_type:ty) => {{ + ( + $arrow_parquet_type:ty, + $physical_type:expr, + $converted_type_str:expr, + $result_arrow_type:ty, + $result_arrow_cast_type:ty, + $result_primitive_type:ty + $(, $timezone:expr)? + ) => {{ let message_type = format!( " message test_schema {{ @@ -493,7 +501,9 @@ mod tests { result_data_type ) .as_str(), - ); + ) + $(.with_timezone($timezone))? + ; // create expected array as primitive, and cast to result type let expected = PrimitiveArray::<$result_arrow_cast_type>::from( @@ -516,7 +526,9 @@ mod tests { result_data_type ) .as_str(), - ); + ) + $(.with_timezone($timezone))? + ; assert_eq!(expected, array); } }}; @@ -554,7 +566,8 @@ mod tests { "TIMESTAMP_MILLIS", arrow::datatypes::TimestampMillisecondType, arrow::datatypes::Int64Type, - i64 + i64, + "UTC" ); test_primitive_array_reader_one_type!( crate::data_type::Int64Type, @@ -562,7 +575,8 @@ mod tests { "TIMESTAMP_MICROS", arrow::datatypes::TimestampMicrosecondType, arrow::datatypes::Int64Type, - i64 + i64, + "UTC" ); } diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index ffae1eae54aa..a80d4add3d06 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -1278,12 +1278,12 @@ mod tests { Field::new("time_nano", DataType::Time64(TimeUnit::Nanosecond), true), Field::new( "ts_milli", - DataType::Timestamp(TimeUnit::Millisecond, None), + DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".into())), true, ), Field::new( "ts_micro", - DataType::Timestamp(TimeUnit::Microsecond, None), + DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())), false, ), Field::new( diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index d4db28915f2f..62133f157f37 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -218,12 +218,14 @@ fn from_int64(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result Ok(DataType::Int64), (None, ConvertedType::UINT_64) => Ok(DataType::UInt64), (None, ConvertedType::TIME_MICROS) => Ok(DataType::Time64(TimeUnit::Microsecond)), - (None, ConvertedType::TIMESTAMP_MILLIS) => { - Ok(DataType::Timestamp(TimeUnit::Millisecond, None)) - } - (None, ConvertedType::TIMESTAMP_MICROS) => { - Ok(DataType::Timestamp(TimeUnit::Microsecond, None)) - } + (None, ConvertedType::TIMESTAMP_MILLIS) => Ok(DataType::Timestamp( + TimeUnit::Millisecond, + Some("UTC".into()), + )), + (None, ConvertedType::TIMESTAMP_MICROS) => Ok(DataType::Timestamp( + TimeUnit::Microsecond, + Some("UTC".into()), + )), (Some(LogicalType::Decimal { scale, precision }), _) => { decimal_128_type(scale, precision) } From 768e726df3077403f4653dffdc961328a22a5ef4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 31 May 2023 06:49:02 -0400 Subject: [PATCH 0951/1411] Use `page_size` consistently, deprecate `pagesize` in parquet WriterProperties (#4313) * Use `page_size` consistently, deprecate `pagesize` * doc tweaks * Update parquet/src/file/properties.rs Co-authored-by: Liang-Chi Hsieh --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Liang-Chi Hsieh --- parquet/src/arrow/arrow_reader/mod.rs | 4 +- parquet/src/arrow/arrow_writer/mod.rs | 10 +-- parquet/src/bin/parquet-rewrite.rs | 12 ++-- parquet/src/column/writer/mod.rs | 11 +-- parquet/src/file/properties.rs | 
99 ++++++++++++++++++++------- parquet/tests/arrow_writer_layout.rs | 20 +++--- 6 files changed, 104 insertions(+), 52 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 819e96c0a3d1..f3e178bdf71c 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1240,7 +1240,7 @@ mod tests { fn writer_props(&self) -> WriterProperties { let builder = WriterProperties::builder() - .set_data_pagesize_limit(self.max_data_page_size) + .set_data_page_size_limit(self.max_data_page_size) .set_write_batch_size(self.write_batch_size) .set_writer_version(self.writer_version) .set_statistics_enabled(self.enabled_statistics); @@ -1248,7 +1248,7 @@ mod tests { let builder = match self.encoding { Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => builder .set_dictionary_enabled(true) - .set_dictionary_pagesize_limit(self.max_dict_page_size), + .set_dictionary_page_size_limit(self.max_dict_page_size), _ => builder .set_dictionary_enabled(false) .set_encoding(self.encoding), diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 5f2750a55009..0aca77f5b572 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1320,8 +1320,8 @@ mod tests { // Set everything very low so we fallback to PLAIN encoding after the first row let props = WriterProperties::builder() - .set_data_pagesize_limit(1) - .set_dictionary_pagesize_limit(1) + .set_data_page_size_limit(1) + .set_dictionary_page_size_limit(1) .set_write_batch_size(1) .build(); @@ -1494,7 +1494,7 @@ mod tests { .set_writer_version(version) .set_max_row_group_size(row_group_size) .set_dictionary_enabled(dictionary_size != 0) - .set_dictionary_pagesize_limit(dictionary_size.max(1)) + .set_dictionary_page_size_limit(dictionary_size.max(1)) .set_encoding(*encoding) .set_bloom_filter_enabled(bloom_filter) .build(); @@ -2043,7 +2043,7 @@ mod tests { let expected_batch = RecordBatch::try_new(schema, vec![values]).unwrap(); let row_group_sizes = [1024, SMALL_SIZE, SMALL_SIZE / 2, SMALL_SIZE / 2 + 1, 10]; - let data_pagesize_limit: usize = 32; + let data_page_size_limit: usize = 32; let write_batch_size: usize = 16; for encoding in &encodings { @@ -2053,7 +2053,7 @@ mod tests { .set_max_row_group_size(row_group_size) .set_dictionary_enabled(false) .set_encoding(*encoding) - .set_data_pagesize_limit(data_pagesize_limit) + .set_data_page_size_limit(data_page_size_limit) .set_write_batch_size(write_batch_size) .build(); diff --git a/parquet/src/bin/parquet-rewrite.rs b/parquet/src/bin/parquet-rewrite.rs index 57e8885c3ed1..e4a80e7af354 100644 --- a/parquet/src/bin/parquet-rewrite.rs +++ b/parquet/src/bin/parquet-rewrite.rs @@ -164,7 +164,7 @@ struct Args { /// Sets best effort maximum size of a data page in bytes. #[clap(long)] - data_pagesize_limit: Option, + data_page_size_limit: Option, /// Sets max statistics size for any column. /// @@ -174,7 +174,7 @@ struct Args { /// Sets best effort maximum dictionary page size, in bytes. #[clap(long)] - dictionary_pagesize_limit: Option, + dictionary_page_size_limit: Option, /// Sets whether bloom filter is enabled for any column. 
#[clap(long)] @@ -237,13 +237,13 @@ fn main() { writer_properties_builder = writer_properties_builder.set_data_page_row_count_limit(value); } - if let Some(value) = args.data_pagesize_limit { + if let Some(value) = args.data_page_size_limit { writer_properties_builder = - writer_properties_builder.set_data_pagesize_limit(value); + writer_properties_builder.set_data_page_size_limit(value); } - if let Some(value) = args.dictionary_pagesize_limit { + if let Some(value) = args.dictionary_page_size_limit { writer_properties_builder = - writer_properties_builder.set_dictionary_pagesize_limit(value); + writer_properties_builder.set_dictionary_page_size_limit(value); } if let Some(value) = args.max_statistics_size { writer_properties_builder = diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 5e623d281157..310519f4a39c 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -609,7 +609,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { #[inline] fn should_dict_fallback(&self) -> bool { match self.encoder.estimated_dict_page_size() { - Some(size) => size >= self.props.dictionary_pagesize_limit(), + Some(size) => size >= self.props.dictionary_page_size_limit(), None => false, } } @@ -627,7 +627,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { self.page_metrics.num_buffered_rows as usize >= self.props.data_page_row_count_limit() - || self.encoder.estimated_data_page_size() >= self.props.data_pagesize_limit() + || self.encoder.estimated_data_page_size() + >= self.props.data_page_size_limit() } /// Performs dictionary fallback. @@ -1839,8 +1840,8 @@ mod tests { #[test] fn test_column_writer_dictionary_fallback_small_data_page() { let props = WriterProperties::builder() - .set_dictionary_pagesize_limit(32) - .set_data_pagesize_limit(32) + .set_dictionary_page_size_limit(32) + .set_data_page_size_limit(32) .build(); column_roundtrip_random::(props, 1024, i32::MIN, i32::MAX, 10, 10); } @@ -1899,7 +1900,7 @@ mod tests { let page_writer = Box::new(SerializedPageWriter::new(&mut write)); let props = Arc::new( WriterProperties::builder() - .set_data_pagesize_limit(10) + .set_data_page_size_limit(10) .set_write_batch_size(3) // write 3 values at a time .build(), ); diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index c09503987a00..66690463aa3c 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -117,8 +117,8 @@ pub type WriterPropertiesPtr = Arc; /// use [`WriterPropertiesBuilder`] to assemble these properties. #[derive(Debug, Clone)] pub struct WriterProperties { - data_pagesize_limit: usize, - dictionary_pagesize_limit: usize, + data_page_size_limit: usize, + dictionary_page_size_limit: usize, data_page_row_count_limit: usize, write_batch_size: usize, max_row_group_size: usize, @@ -152,23 +152,42 @@ impl WriterProperties { /// Returns data page size limit. /// /// Note: this is a best effort limit based on the write batch size + #[deprecated(since = "41.0.0", note = "Use data_page_size_limit")] pub fn data_pagesize_limit(&self) -> usize { - self.data_pagesize_limit + self.data_page_size_limit + } + + /// Returns data page size limit. + /// + /// Note: this is a best effort limit based on the write batch size + /// + /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`] + pub fn data_page_size_limit(&self) -> usize { + self.data_page_size_limit } /// Returns dictionary page size limit. 
/// /// Note: this is a best effort limit based on the write batch size + #[deprecated(since = "41.0.0", note = "Use dictionary_page_size_limit")] pub fn dictionary_pagesize_limit(&self) -> usize { - self.dictionary_pagesize_limit + self.dictionary_page_size_limit } - /// Returns the maximum page row count + /// Returns dictionary page size limit. /// - /// This can be used to limit the number of rows within a page to - /// yield better page pruning + /// Note: this is a best effort limit based on the write batch size + /// + /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`] + pub fn dictionary_page_size_limit(&self) -> usize { + self.dictionary_page_size_limit + } + + /// Returns the maximum page row count /// /// Note: this is a best effort limit based on the write batch size + /// + /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`] pub fn data_page_row_count_limit(&self) -> usize { self.data_page_row_count_limit } @@ -290,8 +309,8 @@ impl WriterProperties { /// Writer properties builder. pub struct WriterPropertiesBuilder { - data_pagesize_limit: usize, - dictionary_pagesize_limit: usize, + data_page_size_limit: usize, + dictionary_page_size_limit: usize, data_page_row_count_limit: usize, write_batch_size: usize, max_row_group_size: usize, @@ -307,8 +326,8 @@ impl WriterPropertiesBuilder { /// Returns default state of the builder. fn with_defaults() -> Self { Self { - data_pagesize_limit: DEFAULT_PAGE_SIZE, - dictionary_pagesize_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT, + data_page_size_limit: DEFAULT_PAGE_SIZE, + dictionary_page_size_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT, data_page_row_count_limit: usize::MAX, write_batch_size: DEFAULT_WRITE_BATCH_SIZE, max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE, @@ -324,8 +343,8 @@ impl WriterPropertiesBuilder { /// Finalizes the configuration and returns immutable writer properties struct. pub fn build(self) -> WriterProperties { WriterProperties { - data_pagesize_limit: self.data_pagesize_limit, - dictionary_pagesize_limit: self.dictionary_pagesize_limit, + data_page_size_limit: self.data_page_size_limit, + dictionary_page_size_limit: self.dictionary_page_size_limit, data_page_row_count_limit: self.data_page_row_count_limit, write_batch_size: self.write_batch_size, max_row_group_size: self.max_row_group_size, @@ -351,16 +370,32 @@ impl WriterPropertiesBuilder { /// /// Note: this is a best effort limit based on value of /// [`set_write_batch_size`](Self::set_write_batch_size). + #[deprecated(since = "41.0.0", note = "Use set_data_page_size_limit")] pub fn set_data_pagesize_limit(mut self, value: usize) -> Self { - self.data_pagesize_limit = value; + self.data_page_size_limit = value; self } - /// Sets best effort maximum number of rows in a data page. + /// Sets best effort maximum size of a data page in bytes. /// + /// The parquet writer will attempt to limit the sizes of each + /// `DataPage` to this many bytes. Reducing this value will result + /// in larger parquet files, but may improve the effectiveness of + /// page index based predicate pushdown during reading. /// - /// This can be used to limit the number of rows within a page to - /// yield better page pruning. + /// Note: this is a best effort limit based on value of + /// [`set_write_batch_size`](Self::set_write_batch_size). + pub fn set_data_page_size_limit(mut self, value: usize) -> Self { + self.data_page_size_limit = value; + self + } + + /// Sets best effort maximum number of rows in a data page. 
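    // Editor's note: an illustrative sketch, not part of the original patch.
    // The renamed setters compose on the builder as before, e.g.:
    //
    //     let props = WriterProperties::builder()
    //         .set_data_page_size_limit(1024 * 1024)
    //         .set_data_page_row_count_limit(20_000)
    //         .set_dictionary_page_size_limit(2 * 1024 * 1024)
    //         .build();
    //
    // while the old `*_pagesize_*` methods remain as deprecated aliases.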
+ /// + /// The parquet writer will attempt to limit the number of rows in + /// each `DataPage` to this value. Reducing this value will result + /// in larger parquet files, but may improve the effectiveness of + /// page index based predicate pushdown during reading. /// /// Note: this is a best effort limit based on value of /// [`set_write_batch_size`](Self::set_write_batch_size). @@ -373,8 +408,24 @@ impl WriterPropertiesBuilder { /// /// Note: this is a best effort limit based on value of /// [`set_write_batch_size`](Self::set_write_batch_size). + #[deprecated(since = "41.0.0", note = "Use set_dictionary_page_size_limit")] pub fn set_dictionary_pagesize_limit(mut self, value: usize) -> Self { - self.dictionary_pagesize_limit = value; + self.dictionary_page_size_limit = value; + self + } + + /// Sets best effort maximum dictionary page size, in bytes. + /// + /// The parquet writer will attempt to limit the size of each + /// `DataPage` used to store dictionaries to this many + /// bytes. Reducing this value will result in larger parquet + /// files, but may improve the effectiveness of page index based + /// predicate pushdown during reading. + /// + /// Note: this is a best effort limit based on value of + /// [`set_write_batch_size`](Self::set_write_batch_size). + pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self { + self.dictionary_page_size_limit = value; self } @@ -850,9 +901,9 @@ mod tests { #[test] fn test_writer_properties_default_settings() { let props = WriterProperties::default(); - assert_eq!(props.data_pagesize_limit(), DEFAULT_PAGE_SIZE); + assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE); assert_eq!( - props.dictionary_pagesize_limit(), + props.dictionary_page_size_limit(), DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT ); assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE); @@ -939,8 +990,8 @@ mod tests { let props = WriterProperties::builder() // file settings .set_writer_version(WriterVersion::PARQUET_2_0) - .set_data_pagesize_limit(10) - .set_dictionary_pagesize_limit(20) + .set_data_page_size_limit(10) + .set_dictionary_page_size_limit(20) .set_write_batch_size(30) .set_max_row_group_size(40) .set_created_by("default".to_owned()) @@ -969,8 +1020,8 @@ mod tests { .build(); assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0); - assert_eq!(props.data_pagesize_limit(), 10); - assert_eq!(props.dictionary_pagesize_limit(), 20); + assert_eq!(props.data_page_size_limit(), 10); + assert_eq!(props.dictionary_page_size_limit(), 20); assert_eq!(props.write_batch_size(), 30); assert_eq!(props.max_row_group_size(), 40); assert_eq!(props.created_by(), "default"); diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index 4bf649f245b0..142112b7b686 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -175,7 +175,7 @@ fn test_primitive() { let batch = RecordBatch::try_from_iter([("col", array)]).unwrap(); let props = WriterProperties::builder() .set_dictionary_enabled(false) - .set_data_pagesize_limit(1000) + .set_data_page_size_limit(1000) .set_write_batch_size(10) .build(); @@ -204,8 +204,8 @@ fn test_primitive() { // Test spill dictionary let props = WriterProperties::builder() .set_dictionary_enabled(true) - .set_dictionary_pagesize_limit(1000) - .set_data_pagesize_limit(10000) + .set_dictionary_page_size_limit(1000) + .set_data_page_size_limit(10000) .set_write_batch_size(10) .build(); @@ -246,8 +246,8 @@ fn test_primitive() { // Test spill dictionary 
encoded pages let props = WriterProperties::builder() .set_dictionary_enabled(true) - .set_dictionary_pagesize_limit(10000) - .set_data_pagesize_limit(500) + .set_dictionary_page_size_limit(10000) + .set_data_page_size_limit(500) .set_write_batch_size(10) .build(); @@ -350,7 +350,7 @@ fn test_string() { let batch = RecordBatch::try_from_iter([("col", array)]).unwrap(); let props = WriterProperties::builder() .set_dictionary_enabled(false) - .set_data_pagesize_limit(1000) + .set_data_page_size_limit(1000) .set_write_batch_size(10) .build(); @@ -386,8 +386,8 @@ fn test_string() { // Test spill dictionary let props = WriterProperties::builder() .set_dictionary_enabled(true) - .set_dictionary_pagesize_limit(1000) - .set_data_pagesize_limit(10000) + .set_dictionary_page_size_limit(1000) + .set_data_page_size_limit(10000) .set_write_batch_size(10) .build(); @@ -435,8 +435,8 @@ fn test_string() { // Test spill dictionary encoded pages let props = WriterProperties::builder() .set_dictionary_enabled(true) - .set_dictionary_pagesize_limit(20000) - .set_data_pagesize_limit(500) + .set_dictionary_page_size_limit(20000) + .set_data_page_size_limit(500) .set_write_batch_size(10) .build(); From bbb47a69023094809b69110c79a8f18d191e54f1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 31 May 2023 11:49:48 +0100 Subject: [PATCH 0952/1411] Expose page-level arrow reader API (#4298) (#4307) * Expose page-level arrow reader API (#4298) * Make scan_ranges public * Review feedback --- parquet/src/arrow/array_reader/builder.rs | 20 +++---- parquet/src/arrow/array_reader/list_array.rs | 4 +- parquet/src/arrow/array_reader/mod.rs | 26 +++------ parquet/src/arrow/arrow_reader/mod.rs | 39 ++++++++++--- parquet/src/arrow/arrow_reader/selection.rs | 11 +++- parquet/src/arrow/async_reader/mod.rs | 16 ++---- parquet/src/arrow/mod.rs | 3 +- parquet/src/arrow/schema/complex.rs | 12 ++-- parquet/src/arrow/schema/mod.rs | 60 +++++++++++++++++--- 9 files changed, 122 insertions(+), 69 deletions(-) diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs index 5e0d05e8953c..bb3f403358ee 100644 --- a/parquet/src/arrow/array_reader/builder.rs +++ b/parquet/src/arrow/array_reader/builder.rs @@ -24,7 +24,7 @@ use crate::arrow::array_reader::fixed_len_byte_array::make_fixed_len_byte_array_ use crate::arrow::array_reader::{ make_byte_array_dictionary_reader, make_byte_array_reader, ArrayReader, FixedSizeListArrayReader, ListArrayReader, MapArrayReader, NullArrayReader, - PrimitiveArrayReader, RowGroupCollection, StructArrayReader, + PrimitiveArrayReader, RowGroups, StructArrayReader, }; use crate::arrow::schema::{ParquetField, ParquetFieldType}; use crate::arrow::ProjectionMask; @@ -39,7 +39,7 @@ use crate::schema::types::{ColumnDescriptor, ColumnPath, Type}; pub fn build_array_reader( field: Option<&ParquetField>, mask: &ProjectionMask, - row_groups: &dyn RowGroupCollection, + row_groups: &dyn RowGroups, ) -> Result> { let reader = field .and_then(|field| build_reader(field, mask, row_groups).transpose()) @@ -52,7 +52,7 @@ pub fn build_array_reader( fn build_reader( field: &ParquetField, mask: &ProjectionMask, - row_groups: &dyn RowGroupCollection, + row_groups: &dyn RowGroups, ) -> Result>> { match field.field_type { ParquetFieldType::Primitive { .. 
} => { @@ -75,7 +75,7 @@ fn build_reader( fn build_map_reader( field: &ParquetField, mask: &ProjectionMask, - row_groups: &dyn RowGroupCollection, + row_groups: &dyn RowGroups, ) -> Result>> { let children = field.children().unwrap(); assert_eq!(children.len(), 2); @@ -127,7 +127,7 @@ fn build_list_reader( field: &ParquetField, mask: &ProjectionMask, is_large: bool, - row_groups: &dyn RowGroupCollection, + row_groups: &dyn RowGroups, ) -> Result>> { let children = field.children().unwrap(); assert_eq!(children.len(), 1); @@ -173,7 +173,7 @@ fn build_list_reader( fn build_fixed_size_list_reader( field: &ParquetField, mask: &ProjectionMask, - row_groups: &dyn RowGroupCollection, + row_groups: &dyn RowGroups, ) -> Result>> { let children = field.children().unwrap(); assert_eq!(children.len(), 1); @@ -210,7 +210,7 @@ fn build_fixed_size_list_reader( fn build_primitive_reader( field: &ParquetField, mask: &ProjectionMask, - row_groups: &dyn RowGroupCollection, + row_groups: &dyn RowGroups, ) -> Result>> { let (col_idx, primitive_type) = match &field.field_type { ParquetFieldType::Primitive { @@ -301,7 +301,7 @@ fn build_primitive_reader( fn build_struct_reader( field: &ParquetField, mask: &ProjectionMask, - row_groups: &dyn RowGroupCollection, + row_groups: &dyn RowGroups, ) -> Result>> { let arrow_fields = match &field.arrow_type { DataType::Struct(children) => children, @@ -338,7 +338,7 @@ fn build_struct_reader( #[cfg(test)] mod tests { use super::*; - use crate::arrow::schema::parquet_to_array_schema_and_fields; + use crate::arrow::schema::parquet_to_arrow_schema_and_fields; use crate::file::reader::{FileReader, SerializedFileReader}; use crate::util::test_common::file_util::get_test_file; use arrow::datatypes::Field; @@ -352,7 +352,7 @@ mod tests { let file_metadata = file_reader.metadata().file_metadata(); let mask = ProjectionMask::leaves(file_metadata.schema_descr(), [0]); - let (_, fields) = parquet_to_array_schema_and_fields( + let (_, fields) = parquet_to_arrow_schema_and_fields( file_metadata.schema_descr(), ProjectionMask::all(), file_metadata.key_value_metadata(), diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index 932034417c81..7c66c5c23112 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -250,7 +250,7 @@ mod tests { use crate::arrow::array_reader::build_array_reader; use crate::arrow::array_reader::list_array::ListArrayReader; use crate::arrow::array_reader::test_util::InMemoryArrayReader; - use crate::arrow::schema::parquet_to_array_schema_and_fields; + use crate::arrow::schema::parquet_to_arrow_schema_and_fields; use crate::arrow::{parquet_to_arrow_schema, ArrowWriter, ProjectionMask}; use crate::file::properties::WriterProperties; use crate::file::reader::{FileReader, SerializedFileReader}; @@ -566,7 +566,7 @@ mod tests { let file_metadata = file_reader.metadata().file_metadata(); let schema = file_metadata.schema_descr(); let mask = ProjectionMask::leaves(schema, vec![0]); - let (_, fields) = parquet_to_array_schema_and_fields( + let (_, fields) = parquet_to_arrow_schema_and_fields( schema, ProjectionMask::all(), file_metadata.key_value_metadata(), diff --git a/parquet/src/arrow/array_reader/mod.rs b/parquet/src/arrow/array_reader/mod.rs index 823084b43207..1e781fb73ce5 100644 --- a/parquet/src/arrow/array_reader/mod.rs +++ b/parquet/src/arrow/array_reader/mod.rs @@ -28,7 +28,6 @@ use crate::arrow::record_reader::GenericRecordReader; use 
crate::column::page::PageIterator; use crate::column::reader::decoder::ColumnValueDecoder; use crate::file::reader::{FilePageIterator, FileReader}; -use crate::schema::types::SchemaDescPtr; mod builder; mod byte_array; @@ -100,22 +99,15 @@ pub trait ArrayReader: Send { } /// A collection of row groups -pub trait RowGroupCollection { - /// Get schema of parquet file. - fn schema(&self) -> SchemaDescPtr; - +pub trait RowGroups { /// Get the number of rows in this collection fn num_rows(&self) -> usize; - /// Returns an iterator over the column chunks for particular column + /// Returns a [`PageIterator`] for the column chunks with the given leaf column index fn column_chunks(&self, i: usize) -> Result>; } -impl RowGroupCollection for Arc { - fn schema(&self) -> SchemaDescPtr { - self.metadata().file_metadata().schema_descr_ptr() - } - +impl RowGroups for Arc { fn num_rows(&self) -> usize { self.metadata().file_metadata().num_rows() as usize } @@ -126,26 +118,22 @@ impl RowGroupCollection for Arc { } } -pub(crate) struct FileReaderRowGroupCollection { +pub(crate) struct FileReaderRowGroups { /// The underling file reader reader: Arc, /// Optional list of row group indices to scan row_groups: Option>, } -impl FileReaderRowGroupCollection { - /// Creates a new [`RowGroupCollection`] from a `FileReader` and an optional +impl FileReaderRowGroups { + /// Creates a new [`RowGroups`] from a `FileReader` and an optional /// list of row group indexes to scan pub fn new(reader: Arc, row_groups: Option>) -> Self { Self { reader, row_groups } } } -impl RowGroupCollection for FileReaderRowGroupCollection { - fn schema(&self) -> SchemaDescPtr { - self.reader.metadata().file_metadata().schema_descr_ptr() - } - +impl RowGroups for FileReaderRowGroups { fn num_rows(&self) -> usize { match &self.row_groups { None => self.reader.metadata().file_metadata().num_rows() as usize, diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index f3e178bdf71c..9cb09c9a5d7d 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -26,12 +26,9 @@ use arrow_array::{RecordBatch, RecordBatchReader}; use arrow_schema::{ArrowError, DataType as ArrowType, Schema, SchemaRef}; use arrow_select::filter::prep_null_mask_filter; -use crate::arrow::array_reader::{ - build_array_reader, ArrayReader, FileReaderRowGroupCollection, RowGroupCollection, -}; -use crate::arrow::schema::parquet_to_array_schema_and_fields; -use crate::arrow::schema::ParquetField; -use crate::arrow::ProjectionMask; +use crate::arrow::array_reader::{build_array_reader, ArrayReader, FileReaderRowGroups}; +use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField}; +use crate::arrow::{FieldLevels, ProjectionMask}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ParquetMetaData; use crate::file::reader::{ChunkReader, SerializedFileReader}; @@ -41,6 +38,7 @@ use crate::schema::types::SchemaDescriptor; mod filter; mod selection; +pub use crate::arrow::array_reader::RowGroups; pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter}; pub use selection::{RowSelection, RowSelector}; @@ -87,7 +85,7 @@ impl ArrowReaderBuilder { false => metadata.file_metadata().key_value_metadata(), }; - let (schema, fields) = parquet_to_array_schema_and_fields( + let (schema, fields) = parquet_to_arrow_schema_and_fields( metadata.file_metadata().schema_descr(), ProjectionMask::all(), kv_metadata, @@ -269,8 +267,7 @@ impl ArrowReaderBuilder> { /// /// Note: this will 
eagerly evaluate any `RowFilter` before returning pub fn build(self) -> Result { - let reader = - FileReaderRowGroupCollection::new(Arc::new(self.input.0), self.row_groups); + let reader = FileReaderRowGroups::new(Arc::new(self.input.0), self.row_groups); let mut filter = self.filter; let mut selection = self.selection; @@ -420,6 +417,30 @@ impl ParquetRecordBatchReader { .build() } + /// Create a new [`ParquetRecordBatchReader`] from the provided [`RowGroups`] + /// + /// Note: this is a low-level interface see [`ParquetRecordBatchReader::try_new`] for a + /// higher-level interface for reading parquet data from a file + pub fn try_new_with_row_groups( + levels: &FieldLevels, + row_groups: &dyn RowGroups, + batch_size: usize, + selection: Option, + ) -> Result { + let array_reader = build_array_reader( + levels.levels.as_ref(), + &ProjectionMask::all(), + row_groups, + )?; + + Ok(Self { + batch_size, + array_reader, + schema: Arc::new(Schema::new(levels.fields.clone())), + selection: selection.map(|s| s.trim().into()), + }) + } + /// Create a new [`ParquetRecordBatchReader`] that will read at most `batch_size` rows at /// a time from [`ArrayReader`] based on the configured `selection`. If `selection` is `None` /// all rows will be returned diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 76f950620688..a558f893c43e 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -173,9 +173,14 @@ impl RowSelection { } } - /// Given an offset index, return the offset ranges for all data pages selected by `self` - #[cfg(any(test, feature = "async"))] - pub(crate) fn scan_ranges( + /// Given an offset index, return the byte ranges for all data pages selected by `self` + /// + /// This is useful for determining what byte ranges to fetch from underlying storage + /// + /// Note: this method does not make any effort to combine consecutive ranges, nor coalesce + /// ranges that are close together. This is instead delegated to the IO subsystem to optimise, + /// e.g. 
[`ObjectStore::get_ranges`](object_store::ObjectStore::get_ranges) + pub fn scan_ranges( &self, page_locations: &[crate::format::PageLocation], ) -> Vec> { diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index c11033eaeb7d..f17fb0751d52 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -94,12 +94,11 @@ use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; use arrow_array::RecordBatch; use arrow_schema::SchemaRef; -use crate::arrow::array_reader::{build_array_reader, RowGroupCollection}; +use crate::arrow::array_reader::{build_array_reader, RowGroups}; use crate::arrow::arrow_reader::{ apply_range, evaluate_predicate, selects_any, ArrowReaderBuilder, ArrowReaderOptions, ParquetRecordBatchReader, RowFilter, RowSelection, }; -use crate::arrow::schema::ParquetField; use crate::arrow::ProjectionMask; use crate::column::page::{PageIterator, PageReader}; @@ -112,14 +111,13 @@ use crate::format::PageLocation; use crate::file::FOOTER_SIZE; -use crate::schema::types::SchemaDescPtr; - mod metadata; pub use metadata::*; #[cfg(feature = "object_store")] mod store; +use crate::arrow::schema::ParquetField; #[cfg(feature = "object_store")] pub use store::*; @@ -648,11 +646,7 @@ impl<'a> InMemoryRowGroup<'a> { } } -impl<'a> RowGroupCollection for InMemoryRowGroup<'a> { - fn schema(&self) -> SchemaDescPtr { - self.metadata.schema_descr_ptr() - } - +impl<'a> RowGroups for InMemoryRowGroup<'a> { fn num_rows(&self) -> usize { self.row_count } @@ -756,7 +750,7 @@ mod tests { use crate::arrow::arrow_reader::{ ArrowPredicateFn, ParquetRecordBatchReaderBuilder, RowSelector, }; - use crate::arrow::schema::parquet_to_array_schema_and_fields; + use crate::arrow::schema::parquet_to_arrow_schema_and_fields; use crate::arrow::ArrowWriter; use crate::file::footer::parse_metadata; use crate::file::page_index::index_reader; @@ -1401,7 +1395,7 @@ mod tests { }; let requests = async_reader.requests.clone(); - let (_, fields) = parquet_to_array_schema_and_fields( + let (_, fields) = parquet_to_arrow_schema_and_fields( metadata.file_metadata().schema_descr(), ProjectionMask::all(), None, diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index e5211ec23931..aad4925c7c70 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -123,7 +123,8 @@ pub use self::async_writer::AsyncArrowWriter; use crate::schema::types::SchemaDescriptor; pub use self::schema::{ - arrow_to_parquet_schema, parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns, + arrow_to_parquet_schema, parquet_to_arrow_field_levels, parquet_to_arrow_schema, + parquet_to_arrow_schema_by_columns, FieldLevels, }; /// Schema metadata key used to store serialized Arrow IPC schema diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index c1699aafcfe8..0d19875d97de 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -24,7 +24,7 @@ use crate::basic::{ConvertedType, Repetition}; use crate::errors::ParquetError; use crate::errors::Result; use crate::schema::types::{SchemaDescriptor, Type, TypePtr}; -use arrow_schema::{DataType, Field, Schema, SchemaBuilder}; +use arrow_schema::{DataType, Field, Fields, SchemaBuilder}; fn get_repetition(t: &Type) -> Repetition { let info = t.get_basic_info(); @@ -34,7 +34,8 @@ fn get_repetition(t: &Type) -> Repetition { } } -/// Representation of a parquet file, in terms of arrow schema elements +/// Representation of a parquet 
schema element, in terms of arrow schema elements +#[derive(Debug, Clone)] pub struct ParquetField { /// The level which represents an insertion into the current list /// i.e. guaranteed to be > 0 for a list type @@ -82,6 +83,7 @@ impl ParquetField { } } +#[derive(Debug, Clone)] pub enum ParquetFieldType { Primitive { /// The index of the column in parquet @@ -554,13 +556,13 @@ fn convert_field( /// Computes the [`ParquetField`] for the provided [`SchemaDescriptor`] with `leaf_columns` listing /// the indexes of leaf columns to project, and `embedded_arrow_schema` the optional -/// [`Schema`] embedded in the parquet metadata +/// [`Fields`] embedded in the parquet metadata /// /// Note: This does not support out of order column projection pub fn convert_schema( schema: &SchemaDescriptor, mask: ProjectionMask, - embedded_arrow_schema: Option<&Schema>, + embedded_arrow_schema: Option<&Fields>, ) -> Result> { let mut visitor = Visitor { next_col_idx: 0, @@ -570,7 +572,7 @@ pub fn convert_schema( let context = VisitorContext { rep_level: 0, def_level: 0, - data_type: embedded_arrow_schema.map(|s| DataType::Struct(s.fields().clone())), + data_type: embedded_arrow_schema.map(|fields| DataType::Struct(fields.clone())), }; visitor.dispatch(&schema.root_schema_ptr(), context) diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index a80d4add3d06..3b969104424d 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -45,7 +45,8 @@ mod primitive; use crate::arrow::ProjectionMask; pub(crate) use complex::{ParquetField, ParquetFieldType}; -/// Convert Parquet schema to Arrow schema including optional metadata. +/// Convert Parquet schema to Arrow schema including optional metadata +/// /// Attempts to decode any existing Arrow schema metadata, falling back /// to converting the Parquet schema column-wise pub fn parquet_to_arrow_schema( @@ -66,11 +67,11 @@ pub fn parquet_to_arrow_schema_by_columns( mask: ProjectionMask, key_value_metadata: Option<&Vec>, ) -> Result { - Ok(parquet_to_array_schema_and_fields(parquet_schema, mask, key_value_metadata)?.0) + Ok(parquet_to_arrow_schema_and_fields(parquet_schema, mask, key_value_metadata)?.0) } /// Extracts the arrow metadata -pub(crate) fn parquet_to_array_schema_and_fields( +pub(crate) fn parquet_to_arrow_schema_and_fields( parquet_schema: &SchemaDescriptor, mask: ProjectionMask, key_value_metadata: Option<&Vec>, @@ -88,15 +89,56 @@ pub(crate) fn parquet_to_array_schema_and_fields( }); } - match complex::convert_schema(parquet_schema, mask, maybe_schema.as_ref())? { + let hint = maybe_schema.as_ref().map(|s| s.fields()); + let field_levels = parquet_to_arrow_field_levels(parquet_schema, mask, hint)?; + let schema = Schema::new_with_metadata(field_levels.fields, metadata); + Ok((schema, field_levels.levels)) +} + +/// Schema information necessary to decode a parquet file as arrow [`Fields`] +/// +/// In particular this stores the dremel-level information necessary to correctly +/// interpret the encoded definition and repetition levels +/// +/// Note: this is an opaque container intended to be used with lower-level APIs +/// within this crate +#[derive(Debug, Clone)] +pub struct FieldLevels { + pub(crate) fields: Fields, + pub(crate) levels: Option, +} + +/// Convert a parquet [`SchemaDescriptor`] to [`FieldLevels`] +/// +/// Columns not included within [`ProjectionMask`] will be ignored. 
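// A minimal usage sketch of the API documented above (an illustrative aside,
// not part of the diff): convert a projection of a file's parquet schema to
// arrow `FieldLevels` without supplying an embedded-schema hint. It assumes a
// `ParquetMetaData` loaded elsewhere and a file with at least two leaf
// columns; the helper name `projected_field_levels` is invented for the sketch.
use parquet::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask};
use parquet::errors::Result;
use parquet::file::metadata::ParquetMetaData;

fn projected_field_levels(metadata: &ParquetMetaData) -> Result<FieldLevels> {
    let schema = metadata.file_metadata().schema_descr();
    // Keep only the first two leaf columns; everything else is ignored
    let mask = ProjectionMask::leaves(schema, [0, 1]);
    parquet_to_arrow_field_levels(schema, mask, None)
}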
+/// +/// Where a field type in `hint` is compatible with the corresponding parquet type in `schema`, it +/// will be used, otherwise the default arrow type for the given parquet column type will be used. +/// +/// This is to accommodate arrow types that cannot be round-tripped through parquet natively. +/// Depending on the parquet writer, this can lead to a mismatch between a file's parquet schema +/// and its embedded arrow schema. The parquet `schema` must be treated as authoritative in such +/// an event. See [#1663](https://github.com/apache/arrow-rs/issues/1663) for more information +/// +/// Note: this is a low-level API, most users will want to make use of the higher-level +/// [`parquet_to_arrow_schema`] for decoding metadata from a parquet file. +pub fn parquet_to_arrow_field_levels( + schema: &SchemaDescriptor, + mask: ProjectionMask, + hint: Option<&Fields>, +) -> Result { + match complex::convert_schema(schema, mask, hint)? { Some(field) => match &field.arrow_type { - DataType::Struct(fields) => Ok(( - Schema::new_with_metadata(fields.clone(), metadata), - Some(field), - )), + DataType::Struct(fields) => Ok(FieldLevels { + fields: fields.clone(), + levels: Some(field), + }), _ => unreachable!(), }, - None => Ok((Schema::new_with_metadata(Fields::empty(), metadata), None)), + None => Ok(FieldLevels { + fields: Fields::empty(), + levels: None, + }), } } From 0783cf92f1000b2f0da5e3a1c6c5d18f92af6d66 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 31 May 2023 03:57:30 -0700 Subject: [PATCH 0953/1411] Speed up i256 division and remainder operations (#4303) * Get rid of BigInt division * Add test * Add benchmark * Fix merging conflicts * Add comment * Fix clippy * Fix clippy * Fix MIN case * For review * Move tests into test_ops * Fix doc --- arrow-buffer/benches/i256.rs | 37 ++++++++ arrow-buffer/src/bigint.rs | 167 ++++++++++++++++++++++++++++------- 2 files changed, 174 insertions(+), 30 deletions(-) diff --git a/arrow-buffer/benches/i256.rs b/arrow-buffer/benches/i256.rs index a04e4cb6cde8..2c43e0e91070 100644 --- a/arrow-buffer/benches/i256.rs +++ b/arrow-buffer/benches/i256.rs @@ -17,8 +17,23 @@ use arrow_buffer::i256; use criterion::*; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; use std::str::FromStr; +/// Returns fixed seedable RNG +fn seedable_rng() -> StdRng { + StdRng::seed_from_u64(42) +} + +fn create_i256_vec(size: usize) -> Vec { + let mut rng = seedable_rng(); + + (0..size) + .map(|_| i256::from_i128(rng.gen::())) + .collect() +} + fn criterion_benchmark(c: &mut Criterion) { let numbers = vec![ i256::ZERO, @@ -38,6 +53,28 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| i256::from_str(&t).unwrap()); }); } + + c.bench_function("i256_div", |b| { + b.iter(|| { + for number_a in create_i256_vec(10) { + for number_b in create_i256_vec(5) { + number_a.checked_div(number_b); + number_a.wrapping_div(number_b); + } + } + }); + }); + + c.bench_function("i256_rem", |b| { + b.iter(|| { + for number_a in create_i256_vec(10) { + for number_b in create_i256_vec(5) { + number_a.checked_rem(number_b); + number_a.wrapping_rem(number_b); + } + } + }); + }); } criterion_group!(benches, criterion_benchmark); diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index fab75b792abd..3b3994689566 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -22,22 +22,33 @@ use std::num::ParseIntError; use std::ops::{BitAnd, BitOr, BitXor, Neg, Shl, Shr}; use std::str::FromStr; -/// An opaque error similar to 
[`std::num::ParseIntError`] +/// [`i256`] operations return this error type. #[derive(Debug)] -pub struct ParseI256Error {} +pub enum I256Error { + /// An opaque error similar to [`std::num::ParseIntError`] + ParseError, + /// Division by zero + DivideByZero, + /// Division overflow + DivideOverflow, +} -impl From for ParseI256Error { +impl From for I256Error { fn from(_: ParseIntError) -> Self { - Self {} + I256Error::ParseError } } -impl std::fmt::Display for ParseI256Error { +impl std::fmt::Display for I256Error { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Failed to parse as i256") + match self { + I256Error::ParseError => write!(f, "Failed to parse as i256"), + I256Error::DivideByZero => write!(f, "Division by zero"), + I256Error::DivideOverflow => write!(f, "Division overflow"), + } } } -impl std::error::Error for ParseI256Error {} +impl std::error::Error for I256Error {} /// A signed 256-bit integer #[allow(non_camel_case_types)] @@ -60,7 +71,7 @@ impl std::fmt::Display for i256 { } impl FromStr for i256 { - type Err = ParseI256Error; + type Err = I256Error; fn from_str(s: &str) -> Result { // i128 can store up to 38 decimal digits @@ -82,7 +93,7 @@ impl FromStr for i256 { if !s.as_bytes()[0].is_ascii_digit() { // Ensures no duplicate sign - return Err(ParseI256Error {}); + return Err(I256Error::ParseError); } parse_impl(s, negative) @@ -90,7 +101,7 @@ impl FromStr for i256 { } /// Parse `s` with any sign and leading 0s removed -fn parse_impl(s: &str, negative: bool) -> Result { +fn parse_impl(s: &str, negative: bool) -> Result { if s.len() <= 38 { let low = i128::from_str(s)?; return Ok(match negative { @@ -102,7 +113,7 @@ fn parse_impl(s: &str, negative: bool) -> Result { let split = s.len() - 38; if !s.as_bytes()[split].is_ascii_digit() { // Ensures not splitting codepoint and no sign - return Err(ParseI256Error {}); + return Err(I256Error::ParseError); } let (hs, ls) = s.split_at(split); @@ -117,7 +128,7 @@ fn parse_impl(s: &str, negative: bool) -> Result { high.checked_mul(i256::from_i128(10_i128.pow(38))) .and_then(|high| high.checked_add(low)) - .ok_or(ParseI256Error {}) + .ok_or(I256Error::ParseError) } impl PartialOrd for i256 { @@ -396,42 +407,101 @@ impl i256 { .then_some(Self { low, high }) } + /// Return the least number of bits needed to represent the number + #[inline] + fn bits_required(&self) -> usize { + let le_bytes = self.to_le_bytes(); + let arr: [u128; 2] = [ + u128::from_le_bytes(le_bytes[0..16].try_into().unwrap()), + u128::from_le_bytes(le_bytes[16..32].try_into().unwrap()), + ]; + + let iter = arr.iter().rev().take(2 - 1); + if self.is_negative() { + let ctr = iter.take_while(|&&b| b == ::core::u128::MAX).count(); + (128 * (2 - ctr)) + 1 - (!arr[2 - ctr - 1]).leading_zeros() as usize + } else { + let ctr = iter.take_while(|&&b| b == ::core::u128::MIN).count(); + (128 * (2 - ctr)) + 1 - arr[2 - ctr - 1].leading_zeros() as usize + } + } + + /// Division operation, returns (quotient, remainder). 
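// For intuition, the same shift-and-subtract long division sketched on plain
// u64 values (an illustrative, simplified standalone analogue of the unsigned
// core of the i256 routine below, not part of the diff):
fn long_div_rem(mut dividend: u64, divisor: u64) -> (u64, u64) {
    assert_ne!(divisor, 0, "division by zero");
    if dividend < divisor {
        return (0, dividend);
    }
    // Align the divisor's highest set bit with the dividend's highest set bit
    let shift = divisor.leading_zeros() - dividend.leading_zeros();
    let mut shifted = divisor << shift;
    let mut quotient = 0u64;
    for i in (0..=shift).rev() {
        if dividend >= shifted {
            quotient |= 1u64 << i;
            dividend -= shifted;
        }
        shifted >>= 1;
    }
    (quotient, dividend) // dividend now holds the remainder
}
// e.g. long_div_rem(100, 7) == (14, 2)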
+ /// This basically implements [Long division]: `` + #[inline] + fn div_rem(self, other: Self) -> Result<(Self, Self), I256Error> { + if other == Self::ZERO { + return Err(I256Error::DivideByZero); + } + if other == Self::MINUS_ONE && self == Self::MIN { + return Err(I256Error::DivideOverflow); + } + + if self == Self::MIN || other == Self::MIN { + let l = BigInt::from_signed_bytes_le(&self.to_le_bytes()); + let r = BigInt::from_signed_bytes_le(&other.to_le_bytes()); + let d = i256::from_bigint_with_overflow(&l / &r).0; + let r = i256::from_bigint_with_overflow(&l % &r).0; + return Ok((d, r)); + } + + let mut me = self.checked_abs().unwrap(); + let mut you = other.checked_abs().unwrap(); + let mut ret = [0u128; 2]; + if me < you { + return Ok((Self::from_parts(ret[0], ret[1] as i128), self)); + } + + let shift = me.bits_required() - you.bits_required(); + you = you.shl(shift as u8); + for i in (0..=shift).rev() { + if me >= you { + ret[i / 128] |= 1 << (i % 128); + me = me.checked_sub(you).unwrap(); + } + you = you.shr(1); + } + + Ok(( + if self.is_negative() == other.is_negative() { + Self::from_parts(ret[0], ret[1] as i128) + } else { + -Self::from_parts(ret[0], ret[1] as i128) + }, + if self.is_negative() { -me } else { me }, + )) + } + /// Performs wrapping division #[inline] pub fn wrapping_div(self, other: Self) -> Self { - let l = BigInt::from_signed_bytes_le(&self.to_le_bytes()); - let r = BigInt::from_signed_bytes_le(&other.to_le_bytes()); - Self::from_bigint_with_overflow(l / r).0 + match self.div_rem(other) { + Ok((v, _)) => v, + Err(I256Error::DivideByZero) => panic!("attempt to divide by zero"), + Err(_) => Self::MIN, + } } /// Performs checked division #[inline] pub fn checked_div(self, other: Self) -> Option { - let l = BigInt::from_signed_bytes_le(&self.to_le_bytes()); - let r = BigInt::from_signed_bytes_le(&other.to_le_bytes()); - let (val, overflow) = Self::from_bigint_with_overflow(l / r); - (!overflow).then_some(val) + self.div_rem(other).map(|(v, _)| v).ok() } /// Performs wrapping remainder #[inline] pub fn wrapping_rem(self, other: Self) -> Self { - let l = BigInt::from_signed_bytes_le(&self.to_le_bytes()); - let r = BigInt::from_signed_bytes_le(&other.to_le_bytes()); - Self::from_bigint_with_overflow(l % r).0 + match self.div_rem(other) { + Ok((_, v)) => v, + Err(I256Error::DivideByZero) => panic!("attempt to divide by zero"), + Err(_) => Self::ZERO, + } } /// Performs checked remainder #[inline] pub fn checked_rem(self, other: Self) -> Option { - if other == Self::ZERO { - return None; - } - - let l = BigInt::from_signed_bytes_le(&self.to_le_bytes()); - let r = BigInt::from_signed_bytes_le(&other.to_le_bytes()); - let (val, overflow) = Self::from_bigint_with_overflow(l % r); - (!overflow).then_some(val) + self.div_rem(other).map(|(_, v)| v).ok() } /// Performs checked exponentiation @@ -853,6 +923,43 @@ mod tests { ), } + // Division + if ir != i256::ZERO { + let actual = il.wrapping_div(ir); + let expected = bl.clone() / br.clone(); + let checked = il.checked_div(ir); + + if ir == i256::MINUS_ONE && il == i256::MIN { + // BigInt produces an integer over i256::MAX + assert_eq!(actual, i256::MIN); + assert!(checked.is_none()); + } else { + assert_eq!(actual.to_string(), expected.to_string()); + assert_eq!(checked.unwrap().to_string(), expected.to_string()); + } + } else { + // `wrapping_div` panics on division by zero + assert!(il.checked_div(ir).is_none()); + } + + // Remainder + if ir != i256::ZERO { + let actual = il.wrapping_rem(ir); + let expected = bl.clone() 
% br.clone(); + let checked = il.checked_rem(ir); + + assert_eq!(actual.to_string(), expected.to_string()); + + if ir == i256::MINUS_ONE && il == i256::MIN { + assert!(checked.is_none()); + } else { + assert_eq!(checked.unwrap().to_string(), expected.to_string()); + } + } else { + // `wrapping_rem` panics on division by zero + assert!(il.checked_rem(ir).is_none()); + } + // Exponentiation for exp in vec![0, 1, 2, 3, 8, 100].into_iter() { let actual = il.wrapping_pow(exp); From c295b172b37902d5fa41ef275ff5b86caf9fde75 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 31 May 2023 15:03:25 +0100 Subject: [PATCH 0954/1411] Short-circuit on exhausted page in skip_records (#4320) --- parquet/src/column/reader.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index f63b1e60a03e..0bb6e002462a 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -333,6 +333,14 @@ where None => (to_read, to_read), }; + self.num_decoded_values += rep_levels_read as u32; + remaining -= records_read; + + if self.num_buffered_values == self.num_decoded_values { + // Exhausted buffered page - no need to advance other decoders + continue; + } + let (values_read, def_levels_read) = match self.def_level_decoder.as_mut() { Some(decoder) => decoder .skip_def_levels(rep_levels_read, self.descr.max_def_level())?, @@ -355,9 +363,6 @@ where values_read )); } - - self.num_decoded_values += rep_levels_read as u32; - remaining -= records_read; } Ok(num_records - remaining) } From 30196d89bfab698c50bcde6c304f0599011a1100 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 31 May 2023 16:17:53 +0100 Subject: [PATCH 0955/1411] Revert error handling changes from #4303 (#4318) --- arrow-buffer/src/bigint.rs | 53 ++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index 3b3994689566..b34dcdfa5c27 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -22,33 +22,30 @@ use std::num::ParseIntError; use std::ops::{BitAnd, BitOr, BitXor, Neg, Shl, Shr}; use std::str::FromStr; -/// [`i256`] operations return this error type. 
+/// An opaque error similar to [`std::num::ParseIntError`] #[derive(Debug)] -pub enum I256Error { - /// An opaque error similar to [`std::num::ParseIntError`] - ParseError, - /// Division by zero - DivideByZero, - /// Division overflow - DivideOverflow, -} +pub struct ParseI256Error {} -impl From for I256Error { +impl From for ParseI256Error { fn from(_: ParseIntError) -> Self { - I256Error::ParseError + Self {} } } -impl std::fmt::Display for I256Error { +impl std::fmt::Display for ParseI256Error { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - I256Error::ParseError => write!(f, "Failed to parse as i256"), - I256Error::DivideByZero => write!(f, "Division by zero"), - I256Error::DivideOverflow => write!(f, "Division overflow"), - } + write!(f, "Failed to parse as i256") } } -impl std::error::Error for I256Error {} +impl std::error::Error for ParseI256Error {} + +/// Error returned by i256::DivRem +enum DivRemError { + /// Division by zero + DivideByZero, + /// Division overflow + DivideOverflow, +} /// A signed 256-bit integer #[allow(non_camel_case_types)] @@ -71,7 +68,7 @@ impl std::fmt::Display for i256 { } impl FromStr for i256 { - type Err = I256Error; + type Err = ParseI256Error; fn from_str(s: &str) -> Result { // i128 can store up to 38 decimal digits @@ -93,7 +90,7 @@ impl FromStr for i256 { if !s.as_bytes()[0].is_ascii_digit() { // Ensures no duplicate sign - return Err(I256Error::ParseError); + return Err(ParseI256Error {}); } parse_impl(s, negative) @@ -101,7 +98,7 @@ impl FromStr for i256 { } /// Parse `s` with any sign and leading 0s removed -fn parse_impl(s: &str, negative: bool) -> Result { +fn parse_impl(s: &str, negative: bool) -> Result { if s.len() <= 38 { let low = i128::from_str(s)?; return Ok(match negative { @@ -113,7 +110,7 @@ fn parse_impl(s: &str, negative: bool) -> Result { let split = s.len() - 38; if !s.as_bytes()[split].is_ascii_digit() { // Ensures not splitting codepoint and no sign - return Err(I256Error::ParseError); + return Err(ParseI256Error {}); } let (hs, ls) = s.split_at(split); @@ -128,7 +125,7 @@ fn parse_impl(s: &str, negative: bool) -> Result { high.checked_mul(i256::from_i128(10_i128.pow(38))) .and_then(|high| high.checked_add(low)) - .ok_or(I256Error::ParseError) + .ok_or(ParseI256Error {}) } impl PartialOrd for i256 { @@ -429,12 +426,12 @@ impl i256 { /// Division operation, returns (quotient, remainder). 
/// This basically implements [Long division]: `` #[inline] - fn div_rem(self, other: Self) -> Result<(Self, Self), I256Error> { + fn div_rem(self, other: Self) -> Result<(Self, Self), DivRemError> { if other == Self::ZERO { - return Err(I256Error::DivideByZero); + return Err(DivRemError::DivideByZero); } if other == Self::MINUS_ONE && self == Self::MIN { - return Err(I256Error::DivideOverflow); + return Err(DivRemError::DivideOverflow); } if self == Self::MIN || other == Self::MIN { @@ -477,7 +474,7 @@ impl i256 { pub fn wrapping_div(self, other: Self) -> Self { match self.div_rem(other) { Ok((v, _)) => v, - Err(I256Error::DivideByZero) => panic!("attempt to divide by zero"), + Err(DivRemError::DivideByZero) => panic!("attempt to divide by zero"), Err(_) => Self::MIN, } } @@ -493,7 +490,7 @@ impl i256 { pub fn wrapping_rem(self, other: Self) -> Self { match self.div_rem(other) { Ok((_, v)) => v, - Err(I256Error::DivideByZero) => panic!("attempt to divide by zero"), + Err(DivRemError::DivideByZero) => panic!("attempt to divide by zero"), Err(_) => Self::ZERO, } } From d1fde40b7854500d540d1671ba36c151d14071e6 Mon Sep 17 00:00:00 2001 From: comphead Date: Wed, 31 May 2023 14:11:10 -0700 Subject: [PATCH 0956/1411] fix date conversion if timestamp below unixtimestamp (#4323) * fix date conversion if timestamp below unixtimestamp * comments --- arrow-cast/src/cast.rs | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 2b286bfa9119..9652047c749e 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -1853,7 +1853,7 @@ pub fn cast_with_options( if time_array.is_null(i) { b.append_null(); } else { - b.append_value((time_array.value(i) / from_size) as i32); + b.append_value(num::integer::div_floor::(time_array.value(i), from_size) as i32); } } @@ -9172,4 +9172,41 @@ mod tests { ); assert!(casted_array.is_err()); } + + #[test] + fn test_cast_below_unixtimestamp() { + let valid = StringArray::from(vec![ + "1900-01-03 23:59:59", + "1969-12-31 00:00:01", + "1989-12-31 00:00:01", + ]); + + let array = Arc::new(valid) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())), + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ) + .unwrap(); + + let ts_array = casted_array + .as_primitive::() + .values() + .iter() + .map(|ts| ts / 1_000_000) + .collect::>(); + + let array = + TimestampMillisecondArray::from(ts_array).with_timezone("UTC".to_string()); + let casted_array = cast(&array, &DataType::Date32).unwrap(); + let date_array = casted_array.as_primitive::(); + let casted_array = cast(&date_array, &DataType::Utf8).unwrap(); + let string_array = casted_array.as_string::(); + assert_eq!("1900-01-03", string_array.value(0)); + assert_eq!("1969-12-31", string_array.value(1)); + assert_eq!("1989-12-31", string_array.value(2)); + } } From deb38964be1440ae2f3477900663bb29c171817f Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Thu, 1 Jun 2023 04:29:54 -0700 Subject: [PATCH 0957/1411] feat: add read parquet by custom rowgroup examples (#4332) * feat: add parquet read by custom rowgroup example * feat: add parquet read by custom rowgroup example * address comments * address comments * address comments --- .github/workflows/parquet.yml | 6 + parquet/Cargo.toml | 5 + parquet/examples/read_with_rowgroup.rs | 185 +++++++++++++++++++++++++ 3 files changed, 196 insertions(+) 
create mode 100644 parquet/examples/read_with_rowgroup.rs diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index ee5813f567bb..55599b776c32 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -60,6 +60,12 @@ jobs: run: cargo test -p parquet - name: Test --all-features run: cargo test -p parquet --all-features + - name: Run examples + run: | + # Test parquet examples + cargo run -p parquet --example read_parquet + cargo run -p parquet --example async_read_parquet --features="async" + cargo run -p parquet --example read_with_rowgroup --features="async" # test compilation linux-features: diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index cc48424a6b05..adcbe82a7bbd 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -111,6 +111,11 @@ name = "async_read_parquet" required-features = ["arrow", "async"] path = "./examples/async_read_parquet.rs" +[[example]] +name = "read_with_rowgroup" +required-features = ["arrow", "async"] +path = "./examples/read_with_rowgroup.rs" + [[test]] name = "arrow_writer_layout" required-features = ["arrow"] diff --git a/parquet/examples/read_with_rowgroup.rs b/parquet/examples/read_with_rowgroup.rs new file mode 100644 index 000000000000..b2d113d50529 --- /dev/null +++ b/parquet/examples/read_with_rowgroup.rs @@ -0,0 +1,185 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::util::pretty::print_batches; +use bytes::{Buf, Bytes}; +use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowGroups, RowSelection}; +use parquet::arrow::async_reader::AsyncFileReader; +use parquet::arrow::{parquet_to_arrow_field_levels, ProjectionMask}; +use parquet::column::page::{PageIterator, PageReader}; +use parquet::errors::{ParquetError, Result}; +use parquet::file::metadata::RowGroupMetaData; +use parquet::file::reader::{ChunkReader, Length}; +use parquet::file::serialized_reader::SerializedPageReader; +use std::sync::Arc; +use tokio::fs::File; + +#[tokio::main(flavor = "current_thread")] +async fn main() -> Result<()> { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/alltypes_plain.parquet"); + let mut file = File::open(&path).await.unwrap(); + + // The metadata could be cached in other places, this example only shows how to read + let metadata = file.get_metadata().await?; + + for rg in metadata.row_groups() { + let mut rowgroup = InMemoryRowGroup::create(rg.clone(), ProjectionMask::all()); + rowgroup.async_fetch_data(&mut file, None).await?; + let reader = rowgroup.build_reader(1024, None)?; + + for batch in reader { + let batch = batch?; + print_batches(&[batch])?; + } + } + + Ok(()) +} + +/// Implements [`PageIterator`] for a single column chunk, yielding a single [`PageReader`] +struct ColumnChunkIterator { + reader: Option>>, +} + +impl Iterator for ColumnChunkIterator { + type Item = Result>; + + fn next(&mut self) -> Option { + self.reader.take() + } +} + +impl PageIterator for ColumnChunkIterator {} + +/// An in-memory column chunk +#[derive(Clone)] +pub struct ColumnChunkData { + offset: usize, + data: Bytes, +} + +impl ColumnChunkData { + fn get(&self, start: u64) -> Result { + let start = start as usize - self.offset; + Ok(self.data.slice(start..)) + } +} + +impl Length for ColumnChunkData { + fn len(&self) -> u64 { + self.data.len() as u64 + } +} + +impl ChunkReader for ColumnChunkData { + type T = bytes::buf::Reader; + + fn get_read(&self, start: u64) -> Result { + Ok(self.get(start)?.reader()) + } + + fn get_bytes(&self, start: u64, length: usize) -> Result { + Ok(self.get(start)?.slice(..length)) + } +} + +#[derive(Clone)] +pub struct InMemoryRowGroup { + pub metadata: RowGroupMetaData, + mask: ProjectionMask, + column_chunks: Vec>>, +} + +impl RowGroups for InMemoryRowGroup { + fn num_rows(&self) -> usize { + self.metadata.num_rows() as usize + } + + fn column_chunks(&self, i: usize) -> Result> { + match &self.column_chunks[i] { + None => Err(ParquetError::General(format!( + "Invalid column index {i}, column was not fetched" + ))), + Some(data) => { + let page_reader: Box = + Box::new(SerializedPageReader::new( + data.clone(), + self.metadata.column(i), + self.num_rows(), + None, + )?); + + Ok(Box::new(ColumnChunkIterator { + reader: Some(Ok(page_reader)), + })) + } + } + } +} + +impl InMemoryRowGroup { + pub fn create(metadata: RowGroupMetaData, mask: ProjectionMask) -> Self { + let column_chunks = metadata.columns().iter().map(|_| None).collect::>(); + + Self { + metadata, + mask, + column_chunks, + } + } + + pub fn build_reader( + &self, + batch_size: usize, + selection: Option, + ) -> Result { + let levels = parquet_to_arrow_field_levels( + &self.metadata.schema_descr_ptr(), + self.mask.clone(), + None, + )?; + + ParquetRecordBatchReader::try_new_with_row_groups( + &levels, self, batch_size, selection, + ) + } + + /// fetch data from a reader in sync mode + pub async fn async_fetch_data( + 
&mut self, + reader: &mut R, + _selection: Option<&RowSelection>, + ) -> Result<()> { + let mut vs = std::mem::take(&mut self.column_chunks); + for (leaf_idx, meta) in self.metadata.columns().iter().enumerate() { + if self.mask.leaf_included(leaf_idx) { + let (start, len) = meta.byte_range(); + let data = reader + .get_bytes(start as usize..(start + len) as usize) + .await?; + + vs[leaf_idx] = Some(Arc::new(ColumnChunkData { + offset: start as usize, + data, + })); + } + } + self.column_chunks = std::mem::take(&mut vs); + Ok(()) + } +} From 23607fe669da1939eedfb043e8bc5ade657cfee0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 1 Jun 2023 13:13:14 +0100 Subject: [PATCH 0958/1411] Don't split record across pages (#3680) (#4327) --- parquet/src/column/writer/mod.rs | 64 +++++++++++++++------------- parquet/tests/arrow_writer_layout.rs | 43 +++++++++++++++++++ 2 files changed, 77 insertions(+), 30 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 310519f4a39c..fc5e29b03256 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -308,6 +308,17 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { max: Option<&E::T>, distinct_count: Option, ) -> Result { + // Check if number of definition levels is the same as number of repetition levels. + if let (Some(def), Some(rep)) = (def_levels, rep_levels) { + if def.len() != rep.len() { + return Err(general_err!( + "Inconsistent length of definition and repetition levels: {} != {}", + def.len(), + rep.len() + )); + } + } + // We check for DataPage limits only after we have inserted the values. If a user // writes a large number of values, the DataPage size can be well above the limit. // @@ -323,10 +334,6 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { None => values.len(), }; - // Find out number of batches to process. - let write_batch_size = self.props.write_batch_size(); - let num_batches = num_levels / write_batch_size; - // If only computing chunk-level statistics compute them here, page-level statistics // are computed in [`Self::write_mini_batch`] and used to update chunk statistics in // [`Self::add_data_page`] @@ -374,27 +381,28 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { let mut values_offset = 0; let mut levels_offset = 0; - for _ in 0..num_batches { + let base_batch_size = self.props.write_batch_size(); + while levels_offset < num_levels { + let mut end_offset = num_levels.min(levels_offset + base_batch_size); + + // Split at record boundary + if let Some(r) = rep_levels { + while end_offset < r.len() && r[end_offset] != 0 { + end_offset += 1; + } + } + values_offset += self.write_mini_batch( values, values_offset, value_indices, - write_batch_size, - def_levels.map(|lv| &lv[levels_offset..levels_offset + write_batch_size]), - rep_levels.map(|lv| &lv[levels_offset..levels_offset + write_batch_size]), + end_offset - levels_offset, + def_levels.map(|lv| &lv[levels_offset..end_offset]), + rep_levels.map(|lv| &lv[levels_offset..end_offset]), )?; - levels_offset += write_batch_size; + levels_offset = end_offset; } - values_offset += self.write_mini_batch( - values, - values_offset, - value_indices, - num_levels - levels_offset, - def_levels.map(|lv| &lv[levels_offset..]), - rep_levels.map(|lv| &lv[levels_offset..]), - )?; - // Return total number of values processed. 
Ok(values_offset) } @@ -522,18 +530,6 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { def_levels: Option<&[i16]>, rep_levels: Option<&[i16]>, ) -> Result { - // Check if number of definition levels is the same as number of repetition - // levels. - if let (Some(def), Some(rep)) = (def_levels, rep_levels) { - if def.len() != rep.len() { - return Err(general_err!( - "Inconsistent length of definition and repetition levels: {} != {}", - def.len(), - rep.len() - )); - } - } - // Process definition levels and determine how many values to write. let values_to_write = if self.descr.max_def_level() > 0 { let levels = def_levels.ok_or_else(|| { @@ -569,6 +565,13 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { ) })?; + if !levels.is_empty() && levels[0] != 0 { + return Err(general_err!( + "Write must start at a record boundary, got non-zero repetition level of {}", + levels[0] + )); + } + // Count the occasions where we start a new row for &level in levels { self.page_metrics.num_buffered_rows += (level == 0) as u32 @@ -2255,6 +2258,7 @@ mod tests { let mut buf: Vec = Vec::new(); let rep_levels = if max_rep_level > 0 { random_numbers_range(max_size, 0, max_rep_level + 1, &mut buf); + buf[0] = 0; // Must start on record boundary Some(&buf[..]) } else { None diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index 142112b7b686..3142c8c52063 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -19,6 +19,7 @@ use arrow::array::{Int32Array, StringArray}; use arrow::record_batch::RecordBatch; +use arrow_array::builder::{Int32Builder, ListBuilder}; use bytes::Bytes; use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder}; use parquet::arrow::ArrowWriter; @@ -502,3 +503,45 @@ fn test_string() { }, }); } + +#[test] +fn test_list() { + let mut list = ListBuilder::new(Int32Builder::new()); + for _ in 0..200 { + let values = list.values(); + for i in 0..8 { + values.append_value(i); + } + list.append(true); + } + let array = Arc::new(list.finish()) as _; + + let batch = RecordBatch::try_from_iter([("col", array)]).unwrap(); + let props = WriterProperties::builder() + .set_dictionary_enabled(false) + .set_data_page_row_count_limit(20) + .set_write_batch_size(3) + .build(); + + // Test rows not split across pages + do_test(LayoutTest { + props, + batches: vec![batch], + layout: Layout { + row_groups: vec![RowGroup { + columns: vec![ColumnChunk { + pages: (0..10) + .map(|_| Page { + rows: 20, + page_header_size: 34, + compressed_size: 672, + encoding: Encoding::PLAIN, + page_type: PageType::DATA_PAGE, + }) + .collect(), + dictionary_page: None, + }], + }], + }, + }); +} From 0baf99a2244d39ff910ec09a0bc3a30b1138a577 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 1 Jun 2023 13:13:30 +0100 Subject: [PATCH 0959/1411] Handle trailing padding when skipping repetition levels (#3911) (#4319) --- parquet/src/column/reader.rs | 32 +++++++----- parquet/src/column/reader/decoder.rs | 75 +++++++++++++++++++++------- 2 files changed, 76 insertions(+), 31 deletions(-) diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 0bb6e002462a..3434eba69e50 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -296,11 +296,11 @@ where /// /// Returns the number of records skipped pub fn skip_records(&mut self, num_records: usize) -> Result { - let mut remaining = num_records; - while 
remaining != 0 { + let mut remaining_records = num_records; + while remaining_records != 0 { if self.num_buffered_values == self.num_decoded_values { let metadata = match self.page_reader.peek_next_page()? { - None => return Ok(num_records - remaining), + None => return Ok(num_records - remaining_records), Some(metadata) => metadata, }; @@ -312,29 +312,37 @@ where // If page has less rows than the remaining records to // be skipped, skip entire page - if metadata.num_rows <= remaining { + if metadata.num_rows <= remaining_records { self.page_reader.skip_next_page()?; - remaining -= metadata.num_rows; + remaining_records -= metadata.num_rows; continue; }; // because self.num_buffered_values == self.num_decoded_values means // we need reads a new page and set up the decoders for levels if !self.read_new_page()? { - return Ok(num_records - remaining); + return Ok(num_records - remaining_records); } } // start skip values in page level - let to_read = remaining - .min((self.num_buffered_values - self.num_decoded_values) as usize); + + // The number of levels in the current data page + let buffered_levels = + (self.num_buffered_values - self.num_decoded_values) as usize; let (records_read, rep_levels_read) = match self.rep_level_decoder.as_mut() { - Some(decoder) => decoder.skip_rep_levels(to_read)?, - None => (to_read, to_read), + Some(decoder) => { + decoder.skip_rep_levels(remaining_records, buffered_levels)? + } + None => { + // No repetition levels, so each level corresponds to a row + let levels = buffered_levels.min(remaining_records); + (levels, levels) + } }; self.num_decoded_values += rep_levels_read as u32; - remaining -= records_read; + remaining_records -= records_read; if self.num_buffered_values == self.num_decoded_values { // Exhausted buffered page - no need to advance other decoders @@ -364,7 +372,7 @@ where )); } } - Ok(num_records - remaining) + Ok(num_records - remaining_records) } /// Read the next page as a dictionary page. 
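// An illustrative aside (not part of the diff) on the record/level distinction
// the reworked `skip_records` above relies on: a new record starts wherever the
// repetition level is 0, so a data page of a repeated column can hold more
// levels (and values) than rows.
fn count_records(rep_levels: &[i16]) -> usize {
    rep_levels.iter().filter(|&&level| level == 0).count()
}
// e.g. a list column page with levels [0, 1, 1, 0, 0, 1] has 6 levels but only
// 3 records: count_records(&[0, 1, 1, 0, 0, 1]) == 3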
If the next page is not a dictionary page, diff --git a/parquet/src/column/reader/decoder.rs b/parquet/src/column/reader/decoder.rs index f57b3e16d5d0..3a6795c8cbf8 100644 --- a/parquet/src/column/reader/decoder.rs +++ b/parquet/src/column/reader/decoder.rs @@ -82,11 +82,15 @@ pub trait ColumnLevelDecoder { } pub trait RepetitionLevelDecoder: ColumnLevelDecoder { - /// Skips over repetition level corresponding to `num_records` records, where a record - /// is delimited by a repetition level of 0 + /// Skips over up to `num_levels` repetition levels corresponding to `num_records` records, + /// where a record is delimited by a repetition level of 0 /// /// Returns the number of records skipped, and the number of levels skipped - fn skip_rep_levels(&mut self, num_records: usize) -> Result<(usize, usize)>; + fn skip_rep_levels( + &mut self, + num_records: usize, + num_levels: usize, + ) -> Result<(usize, usize)>; } pub trait DefinitionLevelDecoder: ColumnLevelDecoder { @@ -395,22 +399,30 @@ impl DefinitionLevelDecoder for ColumnLevelDecoderImpl { } impl RepetitionLevelDecoder for ColumnLevelDecoderImpl { - fn skip_rep_levels(&mut self, num_records: usize) -> Result<(usize, usize)> { + fn skip_rep_levels( + &mut self, + num_records: usize, + num_levels: usize, + ) -> Result<(usize, usize)> { let mut level_skip = 0; let mut record_skip = 0; - loop { + while level_skip < num_levels { + let remaining_levels = num_levels - level_skip; + if self.buffer.is_empty() { - // Read SKIP_BUFFER_SIZE as we don't know how many to read - self.read_to_buffer(SKIP_BUFFER_SIZE)?; + // Only read number of needed values + self.read_to_buffer(remaining_levels.min(SKIP_BUFFER_SIZE))?; if self.buffer.is_empty() { // Reached end of page break; } } + let max_skip = self.buffer.len().min(remaining_levels); + let mut to_skip = 0; - while to_skip < self.buffer.len() && record_skip != num_records { + while to_skip < max_skip && record_skip != num_records { if self.buffer[to_skip] == 0 { record_skip += 1; } @@ -418,12 +430,12 @@ impl RepetitionLevelDecoder for ColumnLevelDecoderImpl { } // Find end of record - while to_skip < self.buffer.len() && self.buffer[to_skip] != 0 { + while to_skip < max_skip && self.buffer[to_skip] != 0 { to_skip += 1; } level_skip += to_skip; - if to_skip >= self.buffer.len() { + if to_skip == self.buffer.len() { // Need to to read more values self.buffer.clear(); continue; @@ -473,17 +485,39 @@ mod tests { } #[test] - fn test_skip() { - let mut rng = thread_rng(); - let total_len = 10000; - let encoded: Vec = (0..total_len).map(|_| rng.gen_range(0..5)).collect(); - let mut encoder = RleEncoder::new(3, 1024); - for v in &encoded { - encoder.put(*v as _) - } + fn test_skip_padding() { + let mut encoder = RleEncoder::new(1, 1024); + encoder.put(0); + (0..3).for_each(|_| encoder.put(1)); let data = ByteBufferPtr::new(encoder.consume()); + let mut decoder = ColumnLevelDecoderImpl::new(1); + decoder.set_data(Encoding::RLE, data.clone()); + let (records, levels) = decoder.skip_rep_levels(100, 4).unwrap(); + assert_eq!(records, 1); + assert_eq!(levels, 4); + + // The length of the final bit packed run is ambiguous, so without the correct + // levels limit, it will decode zero padding + let mut decoder = ColumnLevelDecoderImpl::new(1); + decoder.set_data(Encoding::RLE, data); + let (records, levels) = decoder.skip_rep_levels(100, 6).unwrap(); + assert_eq!(records, 3); + assert_eq!(levels, 6); + } + + #[test] + fn test_skip() { for _ in 0..10 { + let mut rng = thread_rng(); + let total_len = 10000_usize; + 
let encoded: Vec = (0..total_len).map(|_| rng.gen_range(0..5)).collect(); + let mut encoder = RleEncoder::new(3, 1024); + for v in &encoded { + encoder.put(*v as _) + } + let data = ByteBufferPtr::new(encoder.consume()); + test_skip_levels(&encoded, data.clone(), |decoder, read, to_read| { let (values_skipped, levels_skipped) = decoder.skip_def_levels(to_read, 5).unwrap(); @@ -497,8 +531,11 @@ mod tests { }); test_skip_levels(&encoded, data.clone(), |decoder, read, to_read| { + let remaining_levels = total_len - *read; let (records_skipped, levels_skipped) = - decoder.skip_rep_levels(to_read).unwrap(); + decoder.skip_rep_levels(to_read, remaining_levels).unwrap(); + + assert!(levels_skipped <= remaining_levels); // If not run out of values if levels_skipped + *read != encoded.len() { From b78d99d8b6a45fef5ca998c86c3f774cc6fce644 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 1 Jun 2023 14:37:30 +0100 Subject: [PATCH 0960/1411] Add separate row_count and level_count to PageMetadata (#4321) (#4326) --- parquet/src/arrow/arrow_reader/mod.rs | 32 +++++++++++++++++++++ parquet/src/column/page.rs | 34 +++++++++++++++-------- parquet/src/column/reader.rs | 26 +++++++++-------- parquet/src/file/serialized_reader.rs | 14 ++++++---- parquet/src/util/test_common/page_util.rs | 15 +++++++--- 5 files changed, 88 insertions(+), 33 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 9cb09c9a5d7d..deca0c719551 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -2522,4 +2522,36 @@ mod tests { assert_eq!(&written.slice(0, 8), &read[0]); } + + #[test] + fn test_list_skip() { + let mut list = ListBuilder::new(Int32Builder::new()); + list.append_value([Some(1), Some(2)]); + list.append_value([Some(3)]); + list.append_value([Some(4)]); + let list = list.finish(); + let batch = RecordBatch::try_from_iter([("l", Arc::new(list) as _)]).unwrap(); + + // First page contains 2 values but only 1 row + let props = WriterProperties::builder() + .set_data_page_row_count_limit(1) + .set_write_batch_size(2) + .build(); + + let mut buffer = Vec::with_capacity(1024); + let mut writer = + ArrowWriter::try_new(&mut buffer, batch.schema(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + let selection = vec![RowSelector::skip(2), RowSelector::select(1)]; + let mut reader = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(buffer)) + .unwrap() + .with_row_selection(selection.into()) + .build() + .unwrap(); + let out = reader.next().unwrap().unwrap(); + assert_eq!(out.num_rows(), 1); + assert_eq!(out, batch.slice(2, 1)); + } } diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index 3b19734a2218..654cd0816039 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -265,9 +265,10 @@ impl PageWriteSpec { /// Contains metadata for a page #[derive(Clone)] pub struct PageMetadata { - /// The number of rows in this page - pub num_rows: usize, - + /// The number of rows within the page if known + pub num_rows: Option, + /// The number of levels within the page if known + pub num_levels: Option, /// Returns true if the page is a dictionary page pub is_dict: bool, } @@ -277,18 +278,27 @@ impl TryFrom<&PageHeader> for PageMetadata { fn try_from(value: &PageHeader) -> std::result::Result { match value.type_ { - crate::format::PageType::DATA_PAGE => Ok(PageMetadata { - num_rows: 
value.data_page_header.as_ref().unwrap().num_values as usize, - is_dict: false, - }), + crate::format::PageType::DATA_PAGE => { + let header = value.data_page_header.as_ref().unwrap(); + Ok(PageMetadata { + num_rows: None, + num_levels: Some(header.num_values as _), + is_dict: false, + }) + } crate::format::PageType::DICTIONARY_PAGE => Ok(PageMetadata { - num_rows: usize::MIN, + num_rows: None, + num_levels: None, is_dict: true, }), - crate::format::PageType::DATA_PAGE_V2 => Ok(PageMetadata { - num_rows: value.data_page_header_v2.as_ref().unwrap().num_rows as usize, - is_dict: false, - }), + crate::format::PageType::DATA_PAGE_V2 => { + let header = value.data_page_header_v2.as_ref().unwrap(); + Ok(PageMetadata { + num_rows: Some(header.num_rows as _), + num_levels: Some(header.num_values as _), + is_dict: false, + }) + } other => Err(ParquetError::General(format!( "page type {other:?} cannot be converted to PageMetadata" ))), diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 3434eba69e50..991ec2c545a4 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -312,11 +312,20 @@ where // If page has less rows than the remaining records to // be skipped, skip entire page - if metadata.num_rows <= remaining_records { - self.page_reader.skip_next_page()?; - remaining_records -= metadata.num_rows; - continue; - }; + let rows = metadata.num_rows.or_else(|| { + // If no repetition levels, num_levels == num_rows + self.rep_level_decoder + .is_none() + .then_some(metadata.num_levels)? + }); + + if let Some(rows) = rows { + if rows <= remaining_records { + self.page_reader.skip_next_page()?; + remaining_records -= rows; + continue; + } + } // because self.num_buffered_values == self.num_decoded_values means // we need reads a new page and set up the decoders for levels if !self.read_new_page()? { @@ -533,12 +542,7 @@ where if self.num_buffered_values == 0 || self.num_buffered_values == self.num_decoded_values { - // TODO: should we return false if read_new_page() = true and - // num_buffered_values = 0? - match self.page_reader.peek_next_page()? { - Some(next_page) => Ok(next_page.num_rows != 0), - None => Ok(false), - } + Ok(self.page_reader.peek_next_page()?.is_some()) } else { Ok(true) } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 782394942df4..2b3536904bb9 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -722,7 +722,8 @@ impl PageReader for SerializedPageReader { } => { if dictionary_page.is_some() { Ok(Some(PageMetadata { - num_rows: 0, + num_rows: None, + num_levels: None, is_dict: true, })) } else if let Some(page) = page_locations.front() { @@ -732,7 +733,8 @@ impl PageReader for SerializedPageReader { .unwrap_or(*total_rows); Ok(Some(PageMetadata { - num_rows: next_rows - page.first_row_index as usize, + num_rows: Some(next_rows - page.first_row_index as usize), + num_levels: None, is_dict: false, })) } else { @@ -1644,11 +1646,11 @@ mod tests { // have checked with `parquet-tools column-index -c string_col ./alltypes_tiny_pages.parquet` // page meta has two scenarios(21, 20) of num_rows expect last page has 11 rows. if i != 351 { - assert!((meta.num_rows == 21) || (meta.num_rows == 20)); + assert!((meta.num_rows == Some(21)) || (meta.num_rows == Some(20))); } else { // last page first row index is 7290, total row count is 7300 // because first row start with zero, last page row count should be 10. 
- assert_eq!(meta.num_rows, 10); + assert_eq!(meta.num_rows, Some(10)); } assert!(!meta.is_dict); vec.push(meta); @@ -1686,11 +1688,11 @@ mod tests { // have checked with `parquet-tools column-index -c string_col ./alltypes_tiny_pages.parquet` // page meta has two scenarios(21, 20) of num_rows expect last page has 11 rows. if i != 351 { - assert!((meta.num_rows == 21) || (meta.num_rows == 20)); + assert!((meta.num_levels == Some(21)) || (meta.num_levels == Some(20))); } else { // last page first row index is 7290, total row count is 7300 // because first row start with zero, last page row count should be 10. - assert_eq!(meta.num_rows, 10); + assert_eq!(meta.num_levels, Some(10)); } assert!(!meta.is_dict); vec.push(meta); diff --git a/parquet/src/util/test_common/page_util.rs b/parquet/src/util/test_common/page_util.rs index ab5287462c8c..c51c5158cd42 100644 --- a/parquet/src/util/test_common/page_util.rs +++ b/parquet/src/util/test_common/page_util.rs @@ -170,15 +170,22 @@ impl + Send> PageReader for InMemoryPageReader

{ if let Some(x) = self.page_iter.peek() { match x { Page::DataPage { num_values, .. } => Ok(Some(PageMetadata { - num_rows: *num_values as usize, + num_rows: None, + num_levels: Some(*num_values as _), is_dict: false, })), - Page::DataPageV2 { num_rows, .. } => Ok(Some(PageMetadata { - num_rows: *num_rows as usize, + Page::DataPageV2 { + num_rows, + num_values, + .. + } => Ok(Some(PageMetadata { + num_rows: Some(*num_rows as _), + num_levels: Some(*num_values as _), is_dict: false, })), Page::DictionaryPage { .. } => Ok(Some(PageMetadata { - num_rows: 0, + num_rows: None, + num_levels: None, is_dict: true, })), } From f2610515e03a72afbe8c017683867ee9f921fffa Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 1 Jun 2023 15:30:19 +0100 Subject: [PATCH 0961/1411] Skip unnecessary null checks in MutableArrayData (#4333) --- arrow-data/src/transform/fixed_binary.rs | 32 ++------ arrow-data/src/transform/fixed_size_list.rs | 40 ++-------- arrow-data/src/transform/list.rs | 81 ++++++--------------- arrow-data/src/transform/structure.rs | 44 +++-------- arrow-data/src/transform/variable_size.rs | 61 ++++------------ 5 files changed, 59 insertions(+), 199 deletions(-) diff --git a/arrow-data/src/transform/fixed_binary.rs b/arrow-data/src/transform/fixed_binary.rs index a20901014c5d..44c6f46ebf7e 100644 --- a/arrow-data/src/transform/fixed_binary.rs +++ b/arrow-data/src/transform/fixed_binary.rs @@ -26,32 +26,12 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { }; let values = &array.buffers()[0].as_slice()[array.offset() * size..]; - if array.null_count() == 0 { - // fast case where we can copy regions without null issues - Box::new( - move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { - let buffer = &mut mutable.buffer1; - buffer.extend_from_slice(&values[start * size..(start + len) * size]); - }, - ) - } else { - Box::new( - move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { - // nulls present: append item by item, ignoring null entries - let values_buffer = &mut mutable.buffer1; - - (start..start + len).for_each(|i| { - if array.is_valid(i) { - // append value - let bytes = &values[i * size..(i + 1) * size]; - values_buffer.extend_from_slice(bytes); - } else { - values_buffer.extend_zeros(size); - } - }) - }, - ) - } + Box::new( + move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { + let buffer = &mut mutable.buffer1; + buffer.extend_from_slice(&values[start * size..(start + len) * size]); + }, + ) } pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { diff --git a/arrow-data/src/transform/fixed_size_list.rs b/arrow-data/src/transform/fixed_size_list.rs index ad369c2be8a0..8eef7bce9bb3 100644 --- a/arrow-data/src/transform/fixed_size_list.rs +++ b/arrow-data/src/transform/fixed_size_list.rs @@ -26,38 +26,14 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend { _ => unreachable!(), }; - if array.null_count() == 0 { - Box::new( - move |mutable: &mut _MutableArrayData, - index: usize, - start: usize, - len: usize| { - mutable.child_data.iter_mut().for_each(|child| { - child.extend(index, start * size, (start + len) * size) - }) - }, - ) - } else { - Box::new( - move |mutable: &mut _MutableArrayData, - index: usize, - start: usize, - len: usize| { - (start..start + len).for_each(|i| { - if array.is_valid(i) { - mutable.child_data.iter_mut().for_each(|child| { - child.extend(index, i * size, (i + 1) * size) - }) - } else { - mutable - .child_data - 
.iter_mut() - .for_each(|child| child.extend_nulls(size)) - } - }) - }, - ) - } + Box::new( + move |mutable: &mut _MutableArrayData, index: usize, start: usize, len: usize| { + mutable + .child_data + .iter_mut() + .for_each(|child| child.extend(index, start * size, (start + len) * size)) + }, + ) } pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { diff --git a/arrow-data/src/transform/list.rs b/arrow-data/src/transform/list.rs index 76a845958da8..9d5d8330cb1e 100644 --- a/arrow-data/src/transform/list.rs +++ b/arrow-data/src/transform/list.rs @@ -27,66 +27,27 @@ pub(super) fn build_extend( array: &ArrayData, ) -> Extend { let offsets = array.buffer::(0); - if array.null_count() == 0 { - // fast case where we can copy regions without nullability checks - Box::new( - move |mutable: &mut _MutableArrayData, - index: usize, - start: usize, - len: usize| { - let offset_buffer = &mut mutable.buffer1; - - // this is safe due to how offset is built. See details on `get_last_offset` - let last_offset: T = unsafe { get_last_offset(offset_buffer) }; - - // offsets - extend_offsets::( - offset_buffer, - last_offset, - &offsets[start..start + len + 1], - ); - - mutable.child_data[0].extend( - index, - offsets[start].as_usize(), - offsets[start + len].as_usize(), - ) - }, - ) - } else { - // nulls present: append item by item, ignoring null entries - Box::new( - move |mutable: &mut _MutableArrayData, - index: usize, - start: usize, - len: usize| { - let offset_buffer = &mut mutable.buffer1; - - // this is safe due to how offset is built. See details on `get_last_offset` - let mut last_offset: T = unsafe { get_last_offset(offset_buffer) }; - - let delta_len = array.len() - array.null_count(); - offset_buffer.reserve(delta_len * std::mem::size_of::()); - - let child = &mut mutable.child_data[0]; - (start..start + len).for_each(|i| { - if array.is_valid(i) { - // compute the new offset - last_offset = last_offset + offsets[i + 1] - offsets[i]; - - // append value - child.extend( - index, - offsets[i].as_usize(), - offsets[i + 1].as_usize(), - ); - } - // append offset - offset_buffer.push(last_offset); - }) - }, - ) - } + Box::new( + move |mutable: &mut _MutableArrayData, index: usize, start: usize, len: usize| { + let offset_buffer = &mut mutable.buffer1; + + // this is safe due to how offset is built. 
See details on `get_last_offset` + let last_offset: T = unsafe { get_last_offset(offset_buffer) }; + + // offsets + extend_offsets::( + offset_buffer, + last_offset, + &offsets[start..start + len + 1], + ); + + mutable.child_data[0].extend( + index, + offsets[start].as_usize(), + offsets[start + len].as_usize(), + ) + }, + ) } pub(super) fn extend_nulls( diff --git a/arrow-data/src/transform/structure.rs b/arrow-data/src/transform/structure.rs index c6841da4d83c..7330dcaa3705 100644 --- a/arrow-data/src/transform/structure.rs +++ b/arrow-data/src/transform/structure.rs @@ -18,41 +18,15 @@ use super::{Extend, _MutableArrayData}; use crate::ArrayData; -pub(super) fn build_extend(array: &ArrayData) -> Extend { - if array.null_count() == 0 { - Box::new( - move |mutable: &mut _MutableArrayData, - index: usize, - start: usize, - len: usize| { - mutable - .child_data - .iter_mut() - .for_each(|child| child.extend(index, start, start + len)) - }, - ) - } else { - Box::new( - move |mutable: &mut _MutableArrayData, - index: usize, - start: usize, - len: usize| { - (start..start + len).for_each(|i| { - if array.is_valid(i) { - mutable - .child_data - .iter_mut() - .for_each(|child| child.extend(index, i, i + 1)) - } else { - mutable - .child_data - .iter_mut() - .for_each(|child| child.extend_nulls(1)) - } - }) - }, - ) - } +pub(super) fn build_extend(_: &ArrayData) -> Extend { + Box::new( + move |mutable: &mut _MutableArrayData, index: usize, start: usize, len: usize| { + mutable + .child_data + .iter_mut() + .for_each(|child| child.extend(index, start, start + len)) + }, + ) } pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { diff --git a/arrow-data/src/transform/variable_size.rs b/arrow-data/src/transform/variable_size.rs index ce62459aef09..597a8b2b6645 100644 --- a/arrow-data/src/transform/variable_size.rs +++ b/arrow-data/src/transform/variable_size.rs @@ -46,54 +46,23 @@ pub(super) fn build_extend< ) -> Extend { let offsets = array.buffer::(0); let values = array.buffers()[1].as_slice(); - if array.null_count() == 0 { - // fast case where we can copy regions without null issues - Box::new( - move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { - let offset_buffer = &mut mutable.buffer1; - let values_buffer = &mut mutable.buffer2; + Box::new( + move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { + let offset_buffer = &mut mutable.buffer1; + let values_buffer = &mut mutable.buffer2; - // this is safe due to how offset is built. See details on `get_last_offset` - let last_offset = unsafe { get_last_offset(offset_buffer) }; + // this is safe due to how offset is built. See details on `get_last_offset` + let last_offset = unsafe { get_last_offset(offset_buffer) }; - extend_offsets::( - offset_buffer, - last_offset, - &offsets[start..start + len + 1], - ); - // values - extend_offset_values::(values_buffer, offsets, values, start, len); - }, - ) - } else { - Box::new( - move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { - let offset_buffer = &mut mutable.buffer1; - let values_buffer = &mut mutable.buffer2; - - // this is safe due to how offset is built. 
See details on `get_last_offset` - let mut last_offset: T = unsafe { get_last_offset(offset_buffer) }; - - // nulls present: append item by item, ignoring null entries - offset_buffer.reserve(len * std::mem::size_of::()); - - (start..start + len).for_each(|i| { - if array.is_valid(i) { - // compute the new offset - let length = offsets[i + 1] - offsets[i]; - last_offset = last_offset + length; - - // append value - let bytes = &values[offsets[i].to_usize().unwrap() - ..offsets[i + 1].to_usize().unwrap()]; - values_buffer.extend_from_slice(bytes); - } - // offsets are always present - offset_buffer.push(last_offset); - }) - }, - ) - } + extend_offsets::( + offset_buffer, + last_offset, + &offsets[start..start + len + 1], + ); + // values + extend_offset_values::(values_buffer, offsets, values, start, len); + }, + ) } pub(super) fn extend_nulls( From b5a7481ac592c3bebe92adea1b7d50672f479439 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Jun 2023 16:20:51 +0100 Subject: [PATCH 0962/1411] Update pyo3 requirement from 0.18 to 0.19 (#4335) * Update pyo3 requirement from 0.18 to 0.19 Updates the requirements on [pyo3](https://github.com/pyo3/pyo3) to permit the latest version. - [Release notes](https://github.com/pyo3/pyo3/releases) - [Changelog](https://github.com/PyO3/pyo3/blob/main/CHANGELOG.md) - [Commits](https://github.com/pyo3/pyo3/compare/v0.18.0...v0.19.0) --- updated-dependencies: - dependency-name: pyo3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] * Update integration-test --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies --- arrow-pyarrow-integration-testing/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 5809e935ec16..50987b03ca9e 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -34,4 +34,4 @@ crate-type = ["cdylib"] [dependencies] arrow = { path = "../arrow", features = ["pyarrow"] } -pyo3 = { version = "0.18", features = ["extension-module"] } +pyo3 = { version = "0.19", features = ["extension-module"] } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 5de03666251b..998d077fa105 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -60,7 +60,7 @@ arrow-select = { workspace = true } arrow-string = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } -pyo3 = { version = "0.18", default-features = false, optional = true } +pyo3 = { version = "0.19", default-features = false, optional = true } [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "dyn_cmp_dict", "dyn_arith_dict", "ffi", "pyarrow"] From 1e0bf6aa6027363765327f9cb4929881c99f4d34 Mon Sep 17 00:00:00 2001 From: Chunchun Ye <14298407+appletreeisyellow@users.noreply.github.com> Date: Thu, 1 Jun 2023 14:53:07 -0500 Subject: [PATCH 0963/1411] chore: expose Xdbc related enums (#4340) --- arrow-flight/src/sql/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index 74d3176c67b1..d73dc2809615 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -75,6 +75,8 @@ pub use gen::CommandStatementQuery; pub use 
gen::CommandStatementSubstraitPlan; pub use gen::CommandStatementUpdate; pub use gen::DoPutUpdateResult; +pub use gen::Nullable; +pub use gen::Searchable; pub use gen::SqlInfo; pub use gen::SqlNullOrdering; pub use gen::SqlOuterJoinsSupportLevel; @@ -93,6 +95,8 @@ pub use gen::SqlTransactionIsolationLevel; pub use gen::SupportedSqlGrammar; pub use gen::TicketStatementQuery; pub use gen::UpdateDeleteRules; +pub use gen::XdbcDataType; +pub use gen::XdbcDatetimeSubcode; pub use sql_info::SqlInfoList; From a9c5c97a5d42b2c9af34b449bf1d6f42c7f27d16 Mon Sep 17 00:00:00 2001 From: Bo Lu Date: Fri, 2 Jun 2023 06:29:49 +1000 Subject: [PATCH 0964/1411] fix: make SerializedRowGroupReader::new() public (#4331) --- parquet/src/file/serialized_reader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 2b3536904bb9..2ed9b1653fdd 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -299,7 +299,7 @@ pub struct SerializedRowGroupReader<'a, R: ChunkReader> { impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { /// Creates new row group reader from a file, row group metadata and custom config. - fn new( + pub fn new( chunk_reader: Arc, metadata: &'a RowGroupMetaData, page_locations: Option<&'a [Vec]>, From dde65391263039f1431df9eca6a429de0b9457b3 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Thu, 1 Jun 2023 22:34:03 +0200 Subject: [PATCH 0965/1411] feat(flight): add helpers to handle `CommandGetCatalogs`, `CommandGetSchemas`, and `CommandGetTables` requests (#4296) * feat: add get catalog helpers * fix: clippy in arrow_row example * feat: add db schemas helpers * chore: cleanup db schemas helpers * feat: add table schema hlpers * test: add tests and docs * docs: add table queries to example server * docs: improve builder docs * fix: docs links * Apply suggestions from code review Co-authored-by: Andrew Lamb * Improve docs and tests for `SqlInfoList (#4293) * Improve docs and tests for SqlInfoList * Add an example/ * Update arrow-flight/src/sql/sql_info.rs Co-authored-by: Liang-Chi Hsieh --------- Co-authored-by: Liang-Chi Hsieh * fix: use FlightInfo builders * chore: clippy * fmt * feat: add filters to GetTablesBuilder * fix: clippy * feat: more consistent builder apis * chore: cleanup --------- Co-authored-by: Andrew Lamb Co-authored-by: Liang-Chi Hsieh --- arrow-flight/Cargo.toml | 9 +- arrow-flight/examples/flight_sql_server.rs | 140 +++++- arrow-flight/src/sql/catalogs/db_schemas.rs | 284 ++++++++++++ arrow-flight/src/sql/catalogs/mod.rs | 123 ++++++ arrow-flight/src/sql/catalogs/tables.rs | 466 ++++++++++++++++++++ arrow-flight/src/sql/mod.rs | 1 + arrow-row/src/lib.rs | 2 +- 7 files changed, 1001 insertions(+), 24 deletions(-) create mode 100644 arrow-flight/src/sql/catalogs/db_schemas.rs create mode 100644 arrow-flight/src/sql/catalogs/mod.rs create mode 100644 arrow-flight/src/sql/catalogs/tables.rs diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 206cc6505c4b..ae9759b6685f 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -27,13 +27,18 @@ repository = { workspace = true } license = { workspace = true } [dependencies] +arrow-arith = { workspace = true, optional = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } # Cast is needed to work around https://github.com/apache/arrow-rs/issues/3389 arrow-cast = { workspace = true } -arrow-data = { 
workspace = true } +arrow-data = { workspace = true, optional = true } arrow-ipc = { workspace = true } +arrow-ord = { workspace = true, optional = true } +arrow-row = { workspace = true, optional = true } +arrow-select = { workspace = true, optional = true } arrow-schema = { workspace = true } +arrow-string = { workspace = true, optional = true } base64 = { version = "0.21", default-features = false, features = ["std"] } bytes = { version = "1", default-features = false } futures = { version = "0.3", default-features = false, features = ["alloc"] } @@ -53,7 +58,7 @@ all-features = true [features] default = [] -flight-sql-experimental = ["once_cell"] +flight-sql-experimental = ["arrow-arith", "arrow-data", "arrow-ord", "arrow-row", "arrow-select", "arrow-string", "once_cell"] tls = ["tonic/tls"] # Enable CLI tools diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 783e0bf5bdf6..6b92621a564d 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -20,6 +20,7 @@ use base64::Engine; use futures::{stream, Stream, TryStreamExt}; use once_cell::sync::Lazy; use prost::Message; +use std::collections::HashSet; use std::pin::Pin; use std::sync::Arc; use tonic::transport::Server; @@ -29,6 +30,9 @@ use tonic::{Request, Response, Status, Streaming}; use arrow_array::builder::StringBuilder; use arrow_array::{ArrayRef, RecordBatch}; use arrow_flight::encode::FlightDataEncoderBuilder; +use arrow_flight::sql::catalogs::{ + get_catalogs_schema, get_db_schemas_schema, get_tables_schema, +}; use arrow_flight::sql::sql_info::SqlInfoList; use arrow_flight::sql::{ server::FlightSqlService, ActionBeginSavepointRequest, ActionBeginSavepointResult, @@ -72,6 +76,8 @@ static INSTANCE_SQL_INFO: Lazy = Lazy::new(|| { .with_sql_info(SqlInfo::FlightSqlServerArrowVersion, "1.3") }); +static TABLES: Lazy> = Lazy::new(|| vec!["flight_sql.example.table"]); + #[derive(Clone)] pub struct FlightSqlServiceImpl {} @@ -236,32 +242,62 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn get_flight_info_catalogs( &self, - _query: CommandGetCatalogs, - _request: Request, + query: CommandGetCatalogs, + request: Request, ) -> Result, Status> { - Err(Status::unimplemented( - "get_flight_info_catalogs not implemented", - )) + let flight_descriptor = request.into_inner(); + let ticket = Ticket { + ticket: query.encode_to_vec().into(), + }; + let endpoint = FlightEndpoint::new().with_ticket(ticket); + + let flight_info = FlightInfo::new() + .try_with_schema(get_catalogs_schema()) + .map_err(|e| status!("Unable to encode schema", e))? + .with_endpoint(endpoint) + .with_descriptor(flight_descriptor); + + Ok(tonic::Response::new(flight_info)) } async fn get_flight_info_schemas( &self, - _query: CommandGetDbSchemas, - _request: Request, + query: CommandGetDbSchemas, + request: Request, ) -> Result, Status> { - Err(Status::unimplemented( - "get_flight_info_schemas not implemented", - )) + let flight_descriptor = request.into_inner(); + let ticket = Ticket { + ticket: query.encode_to_vec().into(), + }; + let endpoint = FlightEndpoint::new().with_ticket(ticket); + + let flight_info = FlightInfo::new() + .try_with_schema(get_db_schemas_schema().as_ref()) + .map_err(|e| status!("Unable to encode schema", e))? 
+ .with_endpoint(endpoint) + .with_descriptor(flight_descriptor); + + Ok(tonic::Response::new(flight_info)) } async fn get_flight_info_tables( &self, - _query: CommandGetTables, - _request: Request, + query: CommandGetTables, + request: Request, ) -> Result, Status> { - Err(Status::unimplemented( - "get_flight_info_tables not implemented", - )) + let flight_descriptor = request.into_inner(); + let ticket = Ticket { + ticket: query.encode_to_vec().into(), + }; + let endpoint = FlightEndpoint::new().with_ticket(ticket); + + let flight_info = FlightInfo::new() + .try_with_schema(get_tables_schema(query.include_schema).as_ref()) + .map_err(|e| status!("Unable to encode schema", e))? + .with_endpoint(endpoint) + .with_descriptor(flight_descriptor); + + Ok(tonic::Response::new(flight_info)) } async fn get_flight_info_table_types( @@ -363,26 +399,88 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_get_catalogs( &self, - _query: CommandGetCatalogs, + query: CommandGetCatalogs, _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("do_get_catalogs not implemented")) + let catalog_names = TABLES + .iter() + .map(|full_name| full_name.split('.').collect::>()[0].to_string()) + .collect::>(); + let mut builder = query.into_builder(); + for catalog_name in catalog_names { + builder.append(catalog_name); + } + let batch = builder.build(); + let stream = FlightDataEncoderBuilder::new() + .with_schema(Arc::new(get_catalogs_schema().clone())) + .build(futures::stream::once(async { batch })) + .map_err(Status::from); + Ok(Response::new(Box::pin(stream))) } async fn do_get_schemas( &self, - _query: CommandGetDbSchemas, + query: CommandGetDbSchemas, _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("do_get_schemas not implemented")) + let schemas = TABLES + .iter() + .map(|full_name| { + let parts = full_name.split('.').collect::>(); + (parts[0].to_string(), parts[1].to_string()) + }) + .collect::>(); + + let mut builder = query.into_builder(); + for (catalog_name, schema_name) in schemas { + builder.append(catalog_name, schema_name); + } + + let batch = builder.build(); + let stream = FlightDataEncoderBuilder::new() + .with_schema(get_db_schemas_schema()) + .build(futures::stream::once(async { batch })) + .map_err(Status::from); + Ok(Response::new(Box::pin(stream))) } async fn do_get_tables( &self, - _query: CommandGetTables, + query: CommandGetTables, _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented("do_get_tables not implemented")) + let tables = TABLES + .iter() + .map(|full_name| { + let parts = full_name.split('.').collect::>(); + ( + parts[0].to_string(), + parts[1].to_string(), + parts[2].to_string(), + ) + }) + .collect::>(); + + let dummy_schema = Schema::empty(); + let mut builder = query.into_builder(); + for (catalog_name, schema_name, table_name) in tables { + builder + .append( + catalog_name, + schema_name, + table_name, + "TABLE", + &dummy_schema, + ) + .map_err(Status::from)?; + } + + let batch = builder.build(); + let stream = FlightDataEncoderBuilder::new() + .with_schema(get_db_schemas_schema()) + .build(futures::stream::once(async { batch })) + .map_err(Status::from); + Ok(Response::new(Box::pin(stream))) } async fn do_get_table_types( diff --git a/arrow-flight/src/sql/catalogs/db_schemas.rs b/arrow-flight/src/sql/catalogs/db_schemas.rs new file mode 100644 index 000000000000..76c5499c89cc --- /dev/null +++ b/arrow-flight/src/sql/catalogs/db_schemas.rs @@ -0,0 +1,284 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`GetSchemasBuilder`] for building responses to [`CommandGetDbSchemas`] queries. +//! +//! [`CommandGetDbSchemas`]: crate::sql::CommandGetDbSchemas + +use std::sync::Arc; + +use arrow_arith::boolean::and; +use arrow_array::{builder::StringBuilder, ArrayRef, RecordBatch}; +use arrow_ord::comparison::eq_utf8_scalar; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use arrow_select::{filter::filter_record_batch, take::take}; +use arrow_string::like::like_utf8_scalar; +use once_cell::sync::Lazy; + +use super::lexsort_to_indices; +use crate::error::*; +use crate::sql::CommandGetDbSchemas; + +/// Return the schema of the RecordBatch that will be returned from [`CommandGetDbSchemas`] +/// +/// [`CommandGetDbSchemas`]: crate::sql::CommandGetDbSchemas +pub fn get_db_schemas_schema() -> SchemaRef { + Arc::clone(&GET_DB_SCHEMAS_SCHEMA) +} + +/// The schema for GetDbSchemas +static GET_DB_SCHEMAS_SCHEMA: Lazy = Lazy::new(|| { + Arc::new(Schema::new(vec![ + Field::new("catalog_name", DataType::Utf8, false), + Field::new("db_schema_name", DataType::Utf8, false), + ])) +}); + +/// Builds rows like this: +/// +/// * catalog_name: utf8, +/// * db_schema_name: utf8, +pub struct GetSchemasBuilder { + // Specifies the Catalog to search for the tables. + // - An empty string retrieves those without a catalog. + // - If omitted the catalog name is not used to narrow the search. + catalog_filter: Option, + // Optional filters to apply + db_schema_filter_pattern: Option, + // array builder for catalog names + catalog_name: StringBuilder, + // array builder for schema names + db_schema_name: StringBuilder, +} + +impl CommandGetDbSchemas { + pub fn into_builder(self) -> GetSchemasBuilder { + self.into() + } +} + +impl From for GetSchemasBuilder { + fn from(value: CommandGetDbSchemas) -> Self { + Self::new(value.catalog, value.db_schema_filter_pattern) + } +} + +impl GetSchemasBuilder { + /// Create a new instance of [`GetSchemasBuilder`] + /// + /// # Parameters + /// + /// - `catalog`: Specifies the Catalog to search for the tables. + /// - An empty string retrieves those without a catalog. + /// - If omitted the catalog name is not used to narrow the search. + /// - `db_schema_filter_pattern`: Specifies a filter pattern for schemas to search for. + /// When no pattern is provided, the pattern will not be used to narrow the search. + /// In the pattern string, two special characters can be used to denote matching rules: + /// - "%" means to match any substring with 0 or more characters. + /// - "_" means to match any one character. 
+ /// + /// [`CommandGetDbSchemas`]: crate::sql::CommandGetDbSchemas + pub fn new( + catalog: Option>, + db_schema_filter_pattern: Option>, + ) -> Self { + Self { + catalog_filter: catalog.map(|v| v.into()), + db_schema_filter_pattern: db_schema_filter_pattern.map(|v| v.into()), + catalog_name: StringBuilder::new(), + db_schema_name: StringBuilder::new(), + } + } + + /// Append a row + /// + /// In case the catalog should be considered as empty, pass in an empty string '""'. + pub fn append( + &mut self, + catalog_name: impl AsRef, + schema_name: impl AsRef, + ) { + self.catalog_name.append_value(catalog_name); + self.db_schema_name.append_value(schema_name); + } + + /// builds a `RecordBatch` with the correct schema for a `CommandGetDbSchemas` response + pub fn build(self) -> Result { + let Self { + catalog_filter, + db_schema_filter_pattern, + mut catalog_name, + mut db_schema_name, + } = self; + + // Make the arrays + let catalog_name = catalog_name.finish(); + let db_schema_name = db_schema_name.finish(); + + let mut filters = vec![]; + + if let Some(db_schema_filter_pattern) = db_schema_filter_pattern { + // use like kernel to get wildcard matching + filters.push(like_utf8_scalar( + &db_schema_name, + &db_schema_filter_pattern, + )?) + } + + if let Some(catalog_filter_name) = catalog_filter { + filters.push(eq_utf8_scalar(&catalog_name, &catalog_filter_name)?); + } + + // `AND` any filters together + let mut total_filter = None; + while let Some(filter) = filters.pop() { + let new_filter = match total_filter { + Some(total_filter) => and(&total_filter, &filter)?, + None => filter, + }; + total_filter = Some(new_filter); + } + + let batch = RecordBatch::try_new( + get_db_schemas_schema(), + vec![ + Arc::new(catalog_name) as ArrayRef, + Arc::new(db_schema_name) as ArrayRef, + ], + )?; + + // Apply the filters if needed + let filtered_batch = if let Some(filter) = total_filter { + filter_record_batch(&batch, &filter)? + } else { + batch + }; + + // Order filtered results by catalog_name, then db_schema_name + let indices = lexsort_to_indices(filtered_batch.columns()); + let columns = filtered_batch + .columns() + .iter() + .map(|c| take(c, &indices, None)) + .collect::, _>>()?; + + Ok(RecordBatch::try_new(get_db_schemas_schema(), columns)?) 
+ } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{StringArray, UInt32Array}; + + fn get_ref_batch() -> RecordBatch { + RecordBatch::try_new( + get_db_schemas_schema(), + vec![ + Arc::new(StringArray::from(vec![ + "a_catalog", + "a_catalog", + "b_catalog", + "b_catalog", + ])) as ArrayRef, + Arc::new(StringArray::from(vec![ + "a_schema", "b_schema", "a_schema", "b_schema", + ])) as ArrayRef, + ], + ) + .unwrap() + } + + #[test] + fn test_schemas_are_filtered() { + let ref_batch = get_ref_batch(); + + let mut builder = GetSchemasBuilder::new(None::, None::); + builder.append("a_catalog", "a_schema"); + builder.append("a_catalog", "b_schema"); + builder.append("b_catalog", "a_schema"); + builder.append("b_catalog", "b_schema"); + let schema_batch = builder.build().unwrap(); + + assert_eq!(schema_batch, ref_batch); + + let mut builder = GetSchemasBuilder::new(None::, Some("a%")); + builder.append("a_catalog", "a_schema"); + builder.append("a_catalog", "b_schema"); + builder.append("b_catalog", "a_schema"); + builder.append("b_catalog", "b_schema"); + let schema_batch = builder.build().unwrap(); + + let indices = UInt32Array::from(vec![0, 2]); + let ref_filtered = RecordBatch::try_new( + get_db_schemas_schema(), + ref_batch + .columns() + .iter() + .map(|c| take(c, &indices, None)) + .collect::, _>>() + .unwrap(), + ) + .unwrap(); + + assert_eq!(schema_batch, ref_filtered); + } + + #[test] + fn test_schemas_are_sorted() { + let ref_batch = get_ref_batch(); + + let mut builder = GetSchemasBuilder::new(None::, None::); + builder.append("a_catalog", "b_schema"); + builder.append("b_catalog", "a_schema"); + builder.append("a_catalog", "a_schema"); + builder.append("b_catalog", "b_schema"); + let schema_batch = builder.build().unwrap(); + + assert_eq!(schema_batch, ref_batch) + } + + #[test] + fn test_builder_from_query() { + let ref_batch = get_ref_batch(); + let query = CommandGetDbSchemas { + catalog: Some("a_catalog".into()), + db_schema_filter_pattern: Some("b%".into()), + }; + + let mut builder = query.into_builder(); + builder.append("a_catalog", "a_schema"); + builder.append("a_catalog", "b_schema"); + builder.append("b_catalog", "a_schema"); + builder.append("b_catalog", "b_schema"); + let schema_batch = builder.build().unwrap(); + + let indices = UInt32Array::from(vec![1]); + let ref_filtered = RecordBatch::try_new( + get_db_schemas_schema(), + ref_batch + .columns() + .iter() + .map(|c| take(c, &indices, None)) + .collect::, _>>() + .unwrap(), + ) + .unwrap(); + + assert_eq!(schema_batch, ref_filtered); + } +} diff --git a/arrow-flight/src/sql/catalogs/mod.rs b/arrow-flight/src/sql/catalogs/mod.rs new file mode 100644 index 000000000000..e4cbb6fedc45 --- /dev/null +++ b/arrow-flight/src/sql/catalogs/mod.rs @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Builders and functions for building responses to information schema requests +//! +//! - [`get_catalogs_batch`] and [`get_catalogs_schema`] for building responses to [`CommandGetCatalogs`] queries. +//! - [`GetSchemasBuilder`] and [`get_db_schemas_schema`] for building responses to [`CommandGetDbSchemas`] queries. +//! - [`GetTablesBuilder`] and [`get_tables_schema`] for building responses to [`CommandGetTables`] queries. +//! +//! [`CommandGetCatalogs`]: crate::sql::CommandGetCatalogs +//! [`CommandGetDbSchemas`]: crate::sql::CommandGetDbSchemas +//! [`CommandGetTables`]: crate::sql::CommandGetTables + +use std::sync::Arc; + +use arrow_array::{ArrayRef, RecordBatch, StringArray, UInt32Array}; +use arrow_row::{RowConverter, SortField}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use once_cell::sync::Lazy; + +use crate::error::Result; +use crate::sql::CommandGetCatalogs; + +pub use db_schemas::{get_db_schemas_schema, GetSchemasBuilder}; +pub use tables::{get_tables_schema, GetTablesBuilder}; + +mod db_schemas; +mod tables; + +pub struct GetCatalogsBuilder { + catalogs: Vec<String>, +} + +impl CommandGetCatalogs { + pub fn into_builder(self) -> GetCatalogsBuilder { + self.into() + } +} + +impl From<CommandGetCatalogs> for GetCatalogsBuilder { + fn from(_: CommandGetCatalogs) -> Self { + Self::new() + } +} + +impl Default for GetCatalogsBuilder { + fn default() -> Self { + Self::new() + } +} + +impl GetCatalogsBuilder { + /// Create a new instance of [`GetCatalogsBuilder`] + pub fn new() -> Self { + Self { + catalogs: Vec::new(), + } + } + + /// Append a row + pub fn append(&mut self, catalog_name: impl Into<String>) { + self.catalogs.push(catalog_name.into()); + } + + /// builds a `RecordBatch` with the correct schema for a `CommandGetCatalogs` response + pub fn build(self) -> Result<RecordBatch> { + get_catalogs_batch(self.catalogs) + } +} + +/// Returns the RecordBatch for `CommandGetCatalogs` +pub fn get_catalogs_batch(mut catalog_names: Vec<String>) -> Result<RecordBatch> { + catalog_names.sort_unstable(); + + let batch = RecordBatch::try_new( + Arc::clone(&GET_CATALOG_SCHEMA), + vec![Arc::new(StringArray::from_iter_values(catalog_names)) as _], + )?; + + Ok(batch) +} + +/// Returns the schema that will result from [`CommandGetCatalogs`] +/// +/// [`CommandGetCatalogs`]: crate::sql::CommandGetCatalogs +pub fn get_catalogs_schema() -> &'static Schema { + &GET_CATALOG_SCHEMA +} + +/// The schema for GetCatalogs +static GET_CATALOG_SCHEMA: Lazy<SchemaRef> = Lazy::new(|| { + Arc::new(Schema::new(vec![Field::new( + "catalog_name", + DataType::Utf8, + false, + )])) +}); + +fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array { + let fields = arrays + .iter() + .map(|a| SortField::new(a.data_type().clone())) + .collect(); + let mut converter = RowConverter::new(fields).unwrap(); + let rows = converter.convert_columns(arrays).unwrap(); + let mut sort: Vec<_> = rows.iter().enumerate().collect(); + sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); + UInt32Array::from_iter_values(sort.iter().map(|(i, _)| *i as u32)) +} diff --git a/arrow-flight/src/sql/catalogs/tables.rs b/arrow-flight/src/sql/catalogs/tables.rs new file mode 100644 index 000000000000..fcdc0dbb7447 --- /dev/null +++ b/arrow-flight/src/sql/catalogs/tables.rs @@ -0,0 +1,466 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`GetTablesBuilder`] for building responses to [`CommandGetTables`] queries. +//! +//! [`CommandGetTables`]: crate::sql::CommandGetTables + +use std::sync::Arc; + +use arrow_arith::boolean::{and, or}; +use arrow_array::builder::{BinaryBuilder, StringBuilder}; +use arrow_array::{ArrayRef, RecordBatch}; +use arrow_ord::comparison::eq_utf8_scalar; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use arrow_select::{filter::filter_record_batch, take::take}; +use arrow_string::like::like_utf8_scalar; +use once_cell::sync::Lazy; + +use super::lexsort_to_indices; +use crate::error::*; +use crate::sql::CommandGetTables; +use crate::{IpcMessage, IpcWriteOptions, SchemaAsIpc}; + +/// Return the schema of the RecordBatch that will be returned from [`CommandGetTables`] +/// +/// Note the schema differs based on the value of `include_schema` +/// +/// [`CommandGetTables`]: crate::sql::CommandGetTables +pub fn get_tables_schema(include_schema: bool) -> SchemaRef { + if include_schema { + Arc::clone(&GET_TABLES_SCHEMA_WITH_TABLE_SCHEMA) + } else { + Arc::clone(&GET_TABLES_SCHEMA_WITHOUT_TABLE_SCHEMA) + } +} + +/// Builds rows like this: +/// +/// * catalog_name: utf8, +/// * db_schema_name: utf8, +/// * table_name: utf8 not null, +/// * table_type: utf8 not null, +/// * (optional) table_schema: bytes not null (schema of the table as described +/// in Schema.fbs::Schema; it is serialized as an IPC message.) +pub struct GetTablesBuilder { + catalog_filter: Option<String>, + table_types_filter: Vec<String>, + // Optional filters to apply to schemas + db_schema_filter_pattern: Option<String>, + // Optional filters to apply to tables + table_name_filter_pattern: Option<String>, + // array builder for catalog names + catalog_name: StringBuilder, + // array builder for db schema names + db_schema_name: StringBuilder, + // array builder for table names + table_name: StringBuilder, + // array builder for table types + table_type: StringBuilder, + // array builder for table schemas + table_schema: Option<BinaryBuilder>, +} + +impl CommandGetTables { + pub fn into_builder(self) -> GetTablesBuilder { + self.into() + } +} + +impl From<CommandGetTables> for GetTablesBuilder { + fn from(value: CommandGetTables) -> Self { + Self::new( + value.catalog, + value.db_schema_filter_pattern, + value.table_name_filter_pattern, + value.table_types, + value.include_schema, + ) + } +} + +impl GetTablesBuilder { + /// Create a new instance of [`GetTablesBuilder`] + /// + /// # Parameters + /// + /// - `catalog`: Specifies the Catalog to search for the tables. + /// - An empty string retrieves those without a catalog. + /// - If omitted the catalog name is not used to narrow the search. + /// - `db_schema_filter_pattern`: Specifies a filter pattern for schemas to search for. + /// When no pattern is provided, the pattern will not be used to narrow the search.
+ /// In the pattern string, two special characters can be used to denote matching rules: + /// - "%" means to match any substring with 0 or more characters. + /// - "_" means to match any one character. + /// - `table_name_filter_pattern`: Specifies a filter pattern for tables to search for. + /// When no pattern is provided, all tables matching other filters are searched. + /// In the pattern string, two special characters can be used to denote matching rules: + /// - "%" means to match any substring with 0 or more characters. + /// - "_" means to match any one character. + /// - `table_types`: Specifies a filter of table types which must match. + /// An empty Vec matches all table types. + /// - `include_schema`: Specifies if the Arrow schema should be returned for found tables. + /// + /// [`CommandGetTables`]: crate::sql::CommandGetTables + pub fn new( + catalog: Option<impl Into<String>>, + db_schema_filter_pattern: Option<impl Into<String>>, + table_name_filter_pattern: Option<impl Into<String>>, + table_types: impl IntoIterator<Item = impl Into<String>>, + include_schema: bool, + ) -> Self { + let table_schema = if include_schema { + Some(BinaryBuilder::new()) + } else { + None + }; + Self { + catalog_filter: catalog.map(|s| s.into()), + table_types_filter: table_types.into_iter().map(|tt| tt.into()).collect(), + db_schema_filter_pattern: db_schema_filter_pattern.map(|s| s.into()), + table_name_filter_pattern: table_name_filter_pattern.map(|t| t.into()), + catalog_name: StringBuilder::new(), + db_schema_name: StringBuilder::new(), + table_name: StringBuilder::new(), + table_type: StringBuilder::new(), + table_schema, + } + } + + /// Append a row + pub fn append( + &mut self, + catalog_name: impl AsRef<str>, + schema_name: impl AsRef<str>, + table_name: impl AsRef<str>, + table_type: impl AsRef<str>, + table_schema: &Schema, + ) -> Result<()> { + self.catalog_name.append_value(catalog_name); + self.db_schema_name.append_value(schema_name); + self.table_name.append_value(table_name); + self.table_type.append_value(table_type); + if let Some(self_table_schema) = self.table_schema.as_mut() { + let options = IpcWriteOptions::default(); + // encode the schema into the correct form + let message: std::result::Result<IpcMessage, ArrowError> = + SchemaAsIpc::new(table_schema, &options).try_into(); + let IpcMessage(schema) = message?; + self_table_schema.append_value(schema); + } + + Ok(()) + } + + /// builds a `RecordBatch` for `CommandGetTables` + pub fn build(self) -> Result<RecordBatch> { + let Self { + catalog_filter, + table_types_filter, + db_schema_filter_pattern, + table_name_filter_pattern, + + mut catalog_name, + mut db_schema_name, + mut table_name, + mut table_type, + table_schema, + } = self; + + // Make the arrays + let catalog_name = catalog_name.finish(); + let db_schema_name = db_schema_name.finish(); + let table_name = table_name.finish(); + let table_type = table_type.finish(); + let table_schema = table_schema.map(|mut table_schema| table_schema.finish()); + + // apply any filters, getting a BooleanArray that represents + // the rows that passed the filter + let mut filters = vec![]; + + if let Some(catalog_filter_name) = catalog_filter { + filters.push(eq_utf8_scalar(&catalog_name, &catalog_filter_name)?); + } + + let tt_filter = table_types_filter + .into_iter() + .map(|tt| eq_utf8_scalar(&table_type, &tt)) + .collect::<Result<Vec<_>, _>>()?
+ .into_iter() + // We know the arrays are of the same length as they are produced from the same root array + .reduce(|filter, arr| or(&filter, &arr).unwrap()); + if let Some(filter) = tt_filter { + filters.push(filter); + } + + if let Some(db_schema_filter_pattern) = db_schema_filter_pattern { + // use like kernel to get wildcard matching + filters.push(like_utf8_scalar( + &db_schema_name, + &db_schema_filter_pattern, + )?) + } + + if let Some(table_name_filter_pattern) = table_name_filter_pattern { + // use like kernel to get wildcard matching + filters.push(like_utf8_scalar(&table_name, &table_name_filter_pattern)?) + } + + let include_schema = table_schema.is_some(); + let batch = if let Some(table_schema) = table_schema { + RecordBatch::try_new( + get_tables_schema(include_schema), + vec![ + Arc::new(catalog_name) as ArrayRef, + Arc::new(db_schema_name) as ArrayRef, + Arc::new(table_name) as ArrayRef, + Arc::new(table_type) as ArrayRef, + Arc::new(table_schema) as ArrayRef, + ], + ) + } else { + RecordBatch::try_new( + get_tables_schema(include_schema), + vec![ + Arc::new(catalog_name) as ArrayRef, + Arc::new(db_schema_name) as ArrayRef, + Arc::new(table_name) as ArrayRef, + Arc::new(table_type) as ArrayRef, + ], + ) + }?; + + // `AND` any filters together + let mut total_filter = None; + while let Some(filter) = filters.pop() { + let new_filter = match total_filter { + Some(total_filter) => and(&total_filter, &filter)?, + None => filter, + }; + total_filter = Some(new_filter); + } + + // Apply the filters if needed + let filtered_batch = if let Some(total_filter) = total_filter { + filter_record_batch(&batch, &total_filter)? + } else { + batch + }; + + // Order filtered results by catalog_name, then db_schema_name, then table_name, then table_type + // https://github.com/apache/arrow/blob/130f9e981aa98c25de5f5bfe55185db270cec313/format/FlightSql.proto#LL1202C1-L1202C1 + let sort_cols = filtered_batch.project(&[0, 1, 2, 3])?; + let indices = lexsort_to_indices(sort_cols.columns()); + let columns = filtered_batch + .columns() + .iter() + .map(|c| take(c, &indices, None)) + .collect::<Result<Vec<_>, _>>()?; + + Ok(RecordBatch::try_new( + get_tables_schema(include_schema), + columns, + )?)
+ } +} + +/// The schema for GetTables without `table_schema` column +static GET_TABLES_SCHEMA_WITHOUT_TABLE_SCHEMA: Lazy = Lazy::new(|| { + Arc::new(Schema::new(vec![ + Field::new("catalog_name", DataType::Utf8, false), + Field::new("db_schema_name", DataType::Utf8, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("table_type", DataType::Utf8, false), + ])) +}); + +/// The schema for GetTables with `table_schema` column +static GET_TABLES_SCHEMA_WITH_TABLE_SCHEMA: Lazy = Lazy::new(|| { + Arc::new(Schema::new(vec![ + Field::new("catalog_name", DataType::Utf8, false), + Field::new("db_schema_name", DataType::Utf8, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("table_type", DataType::Utf8, false), + Field::new("table_schema", DataType::Binary, false), + ])) +}); + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{StringArray, UInt32Array}; + + fn get_ref_batch() -> RecordBatch { + RecordBatch::try_new( + get_tables_schema(false), + vec![ + Arc::new(StringArray::from(vec![ + "a_catalog", + "a_catalog", + "a_catalog", + "a_catalog", + "b_catalog", + "b_catalog", + "b_catalog", + "b_catalog", + ])) as ArrayRef, + Arc::new(StringArray::from(vec![ + "a_schema", "a_schema", "b_schema", "b_schema", "a_schema", + "a_schema", "b_schema", "b_schema", + ])) as ArrayRef, + Arc::new(StringArray::from(vec![ + "a_table", "b_table", "a_table", "b_table", "a_table", "a_table", + "b_table", "b_table", + ])) as ArrayRef, + Arc::new(StringArray::from(vec![ + "TABLE", "TABLE", "TABLE", "TABLE", "TABLE", "VIEW", "TABLE", "VIEW", + ])) as ArrayRef, + ], + ) + .unwrap() + } + + fn get_ref_builder( + catalog: Option<&str>, + db_schema_filter_pattern: Option<&str>, + table_name_filter_pattern: Option<&str>, + table_types: Vec<&str>, + include_schema: bool, + ) -> GetTablesBuilder { + let dummy_schema = Schema::empty(); + let tables = [ + ("a_catalog", "a_schema", "a_table", "TABLE"), + ("a_catalog", "a_schema", "b_table", "TABLE"), + ("a_catalog", "b_schema", "a_table", "TABLE"), + ("a_catalog", "b_schema", "b_table", "TABLE"), + ("b_catalog", "a_schema", "a_table", "TABLE"), + ("b_catalog", "a_schema", "a_table", "VIEW"), + ("b_catalog", "b_schema", "b_table", "TABLE"), + ("b_catalog", "b_schema", "b_table", "VIEW"), + ]; + let mut builder = GetTablesBuilder::new( + catalog, + db_schema_filter_pattern, + table_name_filter_pattern, + table_types, + include_schema, + ); + for (catalog_name, schema_name, table_name, table_type) in tables { + builder + .append( + catalog_name, + schema_name, + table_name, + table_type, + &dummy_schema, + ) + .unwrap(); + } + builder + } + + #[test] + fn test_tables_are_filtered() { + let ref_batch = get_ref_batch(); + + let builder = get_ref_builder(None, None, None, Vec::new(), false); + let table_batch = builder.build().unwrap(); + assert_eq!(table_batch, ref_batch); + + let builder = get_ref_builder(None, Some("a%"), Some("a%"), Vec::new(), false); + let table_batch = builder.build().unwrap(); + let indices = UInt32Array::from(vec![0, 4, 5]); + let ref_filtered = RecordBatch::try_new( + get_tables_schema(false), + ref_batch + .columns() + .iter() + .map(|c| take(c, &indices, None)) + .collect::, _>>() + .unwrap(), + ) + .unwrap(); + assert_eq!(table_batch, ref_filtered); + + let builder = get_ref_builder(Some("a_catalog"), None, None, Vec::new(), false); + let table_batch = builder.build().unwrap(); + let indices = UInt32Array::from(vec![0, 1, 2, 3]); + let ref_filtered = RecordBatch::try_new( + get_tables_schema(false), + 
ref_batch + .columns() + .iter() + .map(|c| take(c, &indices, None)) + .collect::, _>>() + .unwrap(), + ) + .unwrap(); + assert_eq!(table_batch, ref_filtered); + + let builder = get_ref_builder(None, None, None, vec!["VIEW"], false); + let table_batch = builder.build().unwrap(); + let indices = UInt32Array::from(vec![5, 7]); + let ref_filtered = RecordBatch::try_new( + get_tables_schema(false), + ref_batch + .columns() + .iter() + .map(|c| take(c, &indices, None)) + .collect::, _>>() + .unwrap(), + ) + .unwrap(); + assert_eq!(table_batch, ref_filtered); + } + + #[test] + fn test_tables_are_sorted() { + let ref_batch = get_ref_batch(); + let dummy_schema = Schema::empty(); + + let tables = [ + ("b_catalog", "a_schema", "a_table", "TABLE"), + ("b_catalog", "b_schema", "b_table", "TABLE"), + ("b_catalog", "b_schema", "b_table", "VIEW"), + ("b_catalog", "a_schema", "a_table", "VIEW"), + ("a_catalog", "a_schema", "a_table", "TABLE"), + ("a_catalog", "b_schema", "a_table", "TABLE"), + ("a_catalog", "b_schema", "b_table", "TABLE"), + ("a_catalog", "a_schema", "b_table", "TABLE"), + ]; + let mut builder = GetTablesBuilder::new( + None::, + None::, + None::, + None::, + false, + ); + for (catalog_name, schema_name, table_name, table_type) in tables { + builder + .append( + catalog_name, + schema_name, + table_name, + table_type, + &dummy_schema, + ) + .unwrap(); + } + let table_batch = builder.build().unwrap(); + assert_eq!(table_batch, ref_batch); + } +} diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index d73dc2809615..212655d66d01 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -100,6 +100,7 @@ pub use gen::XdbcDatetimeSubcode; pub use sql_info::SqlInfoList; +pub mod catalogs; pub mod client; pub mod server; pub mod sql_info; diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 9010c8d9a2a9..5b9a1bb88078 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -110,7 +110,7 @@ //! .map(|a| SortField::new(a.data_type().clone())) //! .collect(); //! let mut converter = RowConverter::new(fields).unwrap(); -//! let rows = converter.convert_columns(&arrays).unwrap(); +//! let rows = converter.convert_columns(arrays).unwrap(); //! let mut sort: Vec<_> = rows.iter().enumerate().collect(); //! sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); //! 
UInt32Array::from_iter_values(sort.iter().map(|(i, _)| *i as u32)) From a121e0969ee83d3396c59603717333864acc52fa Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 01:23:18 +0100 Subject: [PATCH 0966/1411] Move BooleanBufferBuilder and NullBufferBuilder to arrow_buffer (#4338) * Move BooleanBufferBuilder and NullBufferBuilder to arrow_buffer * Clippy --- arrow-array/src/builder/boolean_builder.rs | 11 +- .../src/builder/fixed_size_binary_builder.rs | 10 +- .../src/builder/fixed_size_list_builder.rs | 14 +- .../src/builder/generic_bytes_builder.rs | 10 +- .../src/builder/generic_list_builder.rs | 11 +- arrow-array/src/builder/map_builder.rs | 14 +- arrow-array/src/builder/mod.rs | 4 +- arrow-array/src/builder/primitive_builder.rs | 13 +- arrow-array/src/builder/struct_builder.rs | 14 +- arrow-array/src/builder/union_builder.rs | 4 +- .../src/builder/boolean.rs | 8 +- arrow-buffer/src/builder/mod.rs | 23 +++ .../src/builder/null.rs | 22 +-- arrow-buffer/src/lib.rs | 3 + .../src => arrow-buffer/src/util}/bit_mask.rs | 151 +----------------- arrow-buffer/src/util/mod.rs | 1 + arrow-data/src/lib.rs | 3 +- 17 files changed, 86 insertions(+), 230 deletions(-) rename arrow-array/src/builder/boolean_buffer_builder.rs => arrow-buffer/src/builder/boolean.rs (98%) create mode 100644 arrow-buffer/src/builder/mod.rs rename arrow-array/src/builder/null_buffer_builder.rs => arrow-buffer/src/builder/null.rs (91%) rename {arrow-data/src => arrow-buffer/src/util}/bit_mask.rs (54%) diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index a35e6f6b97e5..0def0ec48e3b 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BooleanBufferBuilder}; use crate::{ArrayRef, BooleanArray}; use arrow_buffer::Buffer; +use arrow_buffer::NullBufferBuilder; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use std::any::Any; @@ -150,7 +150,7 @@ impl BooleanBuilder { let builder = ArrayData::builder(DataType::Boolean) .len(len) .add_buffer(self.values_builder.finish().into_inner()) - .null_bit_buffer(null_bit_buffer); + .nulls(null_bit_buffer); let array_data = unsafe { builder.build_unchecked() }; BooleanArray::from(array_data) @@ -159,15 +159,12 @@ impl BooleanBuilder { /// Builds the [BooleanArray] without resetting the builder. pub fn finish_cloned(&self) -> BooleanArray { let len = self.len(); - let null_bit_buffer = self - .null_buffer_builder - .as_slice() - .map(Buffer::from_slice_ref); + let nulls = self.null_buffer_builder.finish_cloned(); let value_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); let builder = ArrayData::builder(DataType::Boolean) .len(len) .add_buffer(value_buffer) - .null_bit_buffer(null_bit_buffer); + .nulls(nulls); let array_data = unsafe { builder.build_unchecked() }; BooleanArray::from(array_data) diff --git a/arrow-array/src/builder/fixed_size_binary_builder.rs b/arrow-array/src/builder/fixed_size_binary_builder.rs index a354a1db24e1..a213b3bbf87d 100644 --- a/arrow-array/src/builder/fixed_size_binary_builder.rs +++ b/arrow-array/src/builder/fixed_size_binary_builder.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, UInt8BufferBuilder}; use crate::{ArrayRef, FixedSizeBinaryArray}; use arrow_buffer::Buffer; +use arrow_buffer::NullBufferBuilder; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use std::any::Any; @@ -98,7 +98,7 @@ impl FixedSizeBinaryBuilder { let array_data_builder = ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) .add_buffer(self.values_builder.finish()) - .null_bit_buffer(self.null_buffer_builder.finish()) + .nulls(self.null_buffer_builder.finish()) .len(array_length); let array_data = unsafe { array_data_builder.build_unchecked() }; FixedSizeBinaryArray::from(array_data) @@ -111,11 +111,7 @@ impl FixedSizeBinaryBuilder { let array_data_builder = ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) .add_buffer(values_buffer) - .null_bit_buffer( - self.null_buffer_builder - .as_slice() - .map(Buffer::from_slice_ref), - ) + .nulls(self.null_buffer_builder.finish_cloned()) .len(array_length); let array_data = unsafe { array_data_builder.build_unchecked() }; FixedSizeBinaryArray::from(array_data) diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index ab9fbf5fa63f..0dd58044305e 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -15,10 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::ArrayBuilder; use crate::{ArrayRef, FixedSizeListArray}; -use arrow_buffer::Buffer; +use arrow_buffer::NullBufferBuilder; use arrow_data::ArrayData; use arrow_schema::{DataType, Field}; use std::any::Any; @@ -167,14 +166,14 @@ where len, ); - let null_bit_buffer = self.null_buffer_builder.finish(); + let nulls = self.null_buffer_builder.finish(); let array_data = ArrayData::builder(DataType::FixedSizeList( Arc::new(Field::new("item", values_data.data_type().clone(), true)), self.list_len, )) .len(len) .add_child_data(values_data) - .null_bit_buffer(null_bit_buffer); + .nulls(nulls); let array_data = unsafe { array_data.build_unchecked() }; @@ -195,17 +194,14 @@ where len, ); - let null_bit_buffer = self - .null_buffer_builder - .as_slice() - .map(Buffer::from_slice_ref); + let nulls = self.null_buffer_builder.finish_cloned(); let array_data = ArrayData::builder(DataType::FixedSizeList( Arc::new(Field::new("item", values_data.data_type().clone(), true)), self.list_len, )) .len(len) .add_child_data(values_data) - .null_bit_buffer(null_bit_buffer); + .nulls(nulls); let array_data = unsafe { array_data.build_unchecked() }; diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index 1887ab36c6d9..f77940055bf1 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder}; use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType}; use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait}; +use arrow_buffer::NullBufferBuilder; use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::ArrayDataBuilder; use std::any::Any; @@ -123,7 +123,7 @@ impl GenericByteBuilder { .len(self.len()) .add_buffer(self.offsets_builder.finish()) .add_buffer(self.value_builder.finish()) - .null_bit_buffer(self.null_buffer_builder.finish()); + .nulls(self.null_buffer_builder.finish()); self.offsets_builder.append(self.next_offset()); let array_data = unsafe { array_builder.build_unchecked() }; @@ -139,11 +139,7 @@ impl GenericByteBuilder { .len(self.len()) .add_buffer(offset_buffer) .add_buffer(value_buffer) - .null_bit_buffer( - self.null_buffer_builder - .as_slice() - .map(Buffer::from_slice_ref), - ); + .nulls(self.null_buffer_builder.finish_cloned()); let array_data = unsafe { array_builder.build_unchecked() }; GenericByteArray::from(array_data) diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 054c87187fbe..99e15d10f3a5 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BufferBuilder}; use crate::{Array, ArrayRef, GenericListArray, OffsetSizeTrait}; use arrow_buffer::Buffer; +use arrow_buffer::NullBufferBuilder; use arrow_data::ArrayData; use arrow_schema::Field; use std::any::Any; @@ -243,7 +243,7 @@ where .len(len) .add_buffer(offset_buffer) .add_child_data(values_data) - .null_bit_buffer(null_bit_buffer); + .nulls(null_bit_buffer); let array_data = unsafe { array_data_builder.build_unchecked() }; @@ -257,10 +257,7 @@ where let values_data = values_arr.to_data(); let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); - let null_bit_buffer = self - .null_buffer_builder - .as_slice() - .map(Buffer::from_slice_ref); + let nulls = self.null_buffer_builder.finish_cloned(); let field = Arc::new(Field::new( "item", values_data.data_type().clone(), @@ -271,7 +268,7 @@ where .len(len) .add_buffer(offset_buffer) .add_child_data(values_data) - .null_bit_buffer(null_bit_buffer); + .nulls(nulls); let array_data = unsafe { array_data_builder.build_unchecked() }; diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index b73e65b117f1..56b5619ceab1 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BufferBuilder}; use crate::{Array, ArrayRef, MapArray, StructArray}; use arrow_buffer::Buffer; +use arrow_buffer::{NullBuffer, NullBufferBuilder}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; @@ -160,12 +160,8 @@ impl MapBuilder { let keys_arr = self.key_builder.finish_cloned(); let values_arr = self.value_builder.finish_cloned(); let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice()); - let null_bit_buffer = self - .null_buffer_builder - .as_slice() - .map(Buffer::from_slice_ref); - - self.finish_helper(keys_arr, values_arr, offset_buffer, null_bit_buffer, len) + let nulls = self.null_buffer_builder.finish_cloned(); + self.finish_helper(keys_arr, values_arr, offset_buffer, nulls, len) } fn finish_helper( @@ -173,7 +169,7 @@ impl MapBuilder { keys_arr: Arc, values_arr: Arc, offset_buffer: Buffer, - null_bit_buffer: Option, + nulls: Option, len: usize, ) -> MapArray { assert!( @@ -205,7 +201,7 @@ impl MapBuilder { .len(len) .add_buffer(offset_buffer) .add_child_data(struct_array.into_data()) - .null_bit_buffer(null_bit_buffer); + .nulls(nulls); let array_data = unsafe { array_data.build_unchecked() }; diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index c4f581fbfb46..91df8c27ce47 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -148,8 +148,7 @@ //! } //! ``` -mod boolean_buffer_builder; -pub use boolean_buffer_builder::*; +pub use arrow_buffer::BooleanBufferBuilder; mod boolean_builder; pub use boolean_builder::*; @@ -165,7 +164,6 @@ mod generic_list_builder; pub use generic_list_builder::*; mod map_builder; pub use map_builder::*; -mod null_buffer_builder; mod primitive_builder; pub use primitive_builder::*; mod primitive_dictionary_builder; diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 440fb8a4bead..f064519e4f94 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::{ArrayBuilder, BufferBuilder}; use crate::types::*; use crate::{ArrayRef, ArrowPrimitiveType, PrimitiveArray}; +use arrow_buffer::NullBufferBuilder; use arrow_buffer::{Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; @@ -278,11 +278,11 @@ impl PrimitiveBuilder { /// Builds the [`PrimitiveArray`] and reset this builder. pub fn finish(&mut self) -> PrimitiveArray { let len = self.len(); - let null_bit_buffer = self.null_buffer_builder.finish(); + let nulls = self.null_buffer_builder.finish(); let builder = ArrayData::builder(self.data_type.clone()) .len(len) .add_buffer(self.values_builder.finish()) - .null_bit_buffer(null_bit_buffer); + .nulls(nulls); let array_data = unsafe { builder.build_unchecked() }; PrimitiveArray::::from(array_data) @@ -291,15 +291,12 @@ impl PrimitiveBuilder { /// Builds the [`PrimitiveArray`] without resetting the builder. 
pub fn finish_cloned(&self) -> PrimitiveArray { let len = self.len(); - let null_bit_buffer = self - .null_buffer_builder - .as_slice() - .map(Buffer::from_slice_ref); + let nulls = self.null_buffer_builder.finish_cloned(); let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); let builder = ArrayData::builder(self.data_type.clone()) .len(len) .add_buffer(values_buffer) - .null_bit_buffer(null_bit_buffer); + .nulls(nulls); let array_data = unsafe { builder.build_unchecked() }; PrimitiveArray::::from(array_data) diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 41ede9c7a992..f5e3f2806507 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -15,10 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::*; use crate::{Array, ArrayRef, StructArray}; -use arrow_buffer::Buffer; +use arrow_buffer::NullBufferBuilder; use arrow_data::ArrayData; use arrow_schema::{DataType, Fields, IntervalUnit, TimeUnit}; use std::any::Any; @@ -247,12 +246,12 @@ impl StructBuilder { child_data.push(arr.to_data()); } let length = self.len(); - let null_bit_buffer = self.null_buffer_builder.finish(); + let nulls = self.null_buffer_builder.finish(); let builder = ArrayData::builder(DataType::Struct(self.fields.clone())) .len(length) .child_data(child_data) - .null_bit_buffer(null_bit_buffer); + .nulls(nulls); let array_data = unsafe { builder.build_unchecked() }; StructArray::from(array_data) @@ -268,15 +267,12 @@ impl StructBuilder { child_data.push(arr.to_data()); } let length = self.len(); - let null_bit_buffer = self - .null_buffer_builder - .as_slice() - .map(Buffer::from_slice_ref); + let nulls = self.null_buffer_builder.finish_cloned(); let builder = ArrayData::builder(DataType::Struct(self.fields.clone())) .len(length) .child_data(child_data) - .null_bit_buffer(null_bit_buffer); + .nulls(nulls); let array_data = unsafe { builder.build_unchecked() }; StructArray::from(array_data) diff --git a/arrow-array/src/builder/union_builder.rs b/arrow-array/src/builder/union_builder.rs index 6461a56aabbe..f74afb2aa9aa 100644 --- a/arrow-array/src/builder/union_builder.rs +++ b/arrow-array/src/builder/union_builder.rs @@ -16,9 +16,9 @@ // under the License. 
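// Not part of the patch: a condensed sketch of the pattern these builder hunks converge on -
// ArrayDataBuilder::nulls(Option<NullBuffer>) fed by NullBufferBuilder::finish(), replacing the
// old null_bit_buffer(Option<Buffer>). The NullBufferBuilder::new(capacity) constructor is
// assumed here; the remaining calls appear in the diffs above.
use arrow_buffer::{Buffer, NullBufferBuilder};
use arrow_data::ArrayData;
use arrow_schema::DataType;

fn nulls_sketch() -> ArrayData {
    let mut nulls = NullBufferBuilder::new(3);
    nulls.append_slice(&[true, false, true]); // second slot is null

    ArrayData::builder(DataType::Int32)
        .len(3)
        .add_buffer(Buffer::from_slice_ref(&[1i32, 0, 3]))
        .nulls(nulls.finish())
        .build()
        .unwrap()
}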
use crate::builder::buffer_builder::{Int32BufferBuilder, Int8BufferBuilder}; -use crate::builder::null_buffer_builder::NullBufferBuilder; use crate::builder::BufferBuilder; use crate::{make_array, ArrowPrimitiveType, UnionArray}; +use arrow_buffer::NullBufferBuilder; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_data::ArrayDataBuilder; use arrow_schema::{ArrowError, DataType, Field}; @@ -292,7 +292,7 @@ impl UnionBuilder { let arr_data_builder = ArrayDataBuilder::new(data_type.clone()) .add_buffer(buffer) .len(slots) - .null_bit_buffer(bitmap_builder.finish()); + .nulls(bitmap_builder.finish()); let arr_data_ref = unsafe { arr_data_builder.build_unchecked() }; let array_ref = make_array(arr_data_ref); diff --git a/arrow-array/src/builder/boolean_buffer_builder.rs b/arrow-buffer/src/builder/boolean.rs similarity index 98% rename from arrow-array/src/builder/boolean_buffer_builder.rs rename to arrow-buffer/src/builder/boolean.rs index 1a3473e19a04..f84cfa79c2dc 100644 --- a/arrow-array/src/builder/boolean_buffer_builder.rs +++ b/arrow-buffer/src/builder/boolean.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow_buffer::{bit_util, BooleanBuffer, Buffer, MutableBuffer}; -use arrow_data::bit_mask; +use crate::{bit_mask, bit_util, BooleanBuffer, Buffer, MutableBuffer}; use std::ops::Range; /// Builder for [`BooleanBuffer`] @@ -221,6 +220,11 @@ impl BooleanBufferBuilder { let len = std::mem::replace(&mut self.len, 0); BooleanBuffer::new(buf.into(), 0, len) } + + /// Builds the [BooleanBuffer] without resetting the builder. + pub fn finish_cloned(&self) -> BooleanBuffer { + BooleanBuffer::new(Buffer::from_slice_ref(self.as_slice()), 0, self.len) + } } impl From for Buffer { diff --git a/arrow-buffer/src/builder/mod.rs b/arrow-buffer/src/builder/mod.rs new file mode 100644 index 000000000000..f9d2d0935300 --- /dev/null +++ b/arrow-buffer/src/builder/mod.rs @@ -0,0 +1,23 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Buffer builders + +mod boolean; +pub use boolean::*; +mod null; +pub use null::*; diff --git a/arrow-array/src/builder/null_buffer_builder.rs b/arrow-buffer/src/builder/null.rs similarity index 91% rename from arrow-array/src/builder/null_buffer_builder.rs rename to arrow-buffer/src/builder/null.rs index f37ce3a747ff..d805b79f09e6 100644 --- a/arrow-array/src/builder/null_buffer_builder.rs +++ b/arrow-buffer/src/builder/null.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::builder::BooleanBufferBuilder; -use arrow_buffer::{Buffer, MutableBuffer}; +use crate::{BooleanBufferBuilder, MutableBuffer, NullBuffer}; /// Builder for creating the null bit buffer. 
/// This builder only materializes the buffer when we append `false`. @@ -24,7 +23,7 @@ use arrow_buffer::{Buffer, MutableBuffer}; /// `None` when calling [`finish`](#method.finish). /// This optimization is **very** important for the performance. #[derive(Debug)] -pub(super) struct NullBufferBuilder { +pub struct NullBufferBuilder { bitmap_builder: Option, /// Store the length of the buffer before materializing. len: usize, @@ -128,10 +127,15 @@ impl NullBufferBuilder { /// Builds the null buffer and resets the builder. /// Returns `None` if the builder only contains `true`s. - pub fn finish(&mut self) -> Option { - let buf = self.bitmap_builder.take().map(Into::into); + pub fn finish(&mut self) -> Option { self.len = 0; - buf + Some(NullBuffer::new(self.bitmap_builder.take()?.finish())) + } + + /// Builds the [NullBuffer] without resetting the builder. + pub fn finish_cloned(&self) -> Option { + let buffer = self.bitmap_builder.as_ref()?.finish_cloned(); + Some(NullBuffer::new(buffer)) } /// Returns the inner bitmap builder as slice @@ -187,7 +191,7 @@ mod tests { assert_eq!(6, builder.len()); let buf = builder.finish().unwrap(); - assert_eq!(Buffer::from(&[0b110010_u8]), buf); + assert_eq!(&[0b110010_u8], buf.validity()); } #[test] @@ -199,7 +203,7 @@ mod tests { assert_eq!(6, builder.len()); let buf = builder.finish().unwrap(); - assert_eq!(Buffer::from(&[0b0_u8]), buf); + assert_eq!(&[0b0_u8], buf.validity()); } #[test] @@ -228,6 +232,6 @@ mod tests { builder.append_slice(&[true, true, false, true]); let buf = builder.finish().unwrap(); - assert_eq!(Buffer::from(&[0b1011_u8]), buf); + assert_eq!(&[0b1011_u8], buf.validity()); } } diff --git a/arrow-buffer/src/lib.rs b/arrow-buffer/src/lib.rs index 364e92db229c..90b801c4ae29 100644 --- a/arrow-buffer/src/lib.rs +++ b/arrow-buffer/src/lib.rs @@ -21,6 +21,9 @@ pub mod alloc; pub mod buffer; pub use buffer::*; +pub mod builder; +pub use builder::*; + mod bigint; mod bytes; mod native; diff --git a/arrow-data/src/bit_mask.rs b/arrow-buffer/src/util/bit_mask.rs similarity index 54% rename from arrow-data/src/bit_mask.rs rename to arrow-buffer/src/util/bit_mask.rs index d978f2b74618..2af24b782632 100644 --- a/arrow-data/src/bit_mask.rs +++ b/arrow-buffer/src/util/bit_mask.rs @@ -17,11 +17,8 @@ //! Utils for working with packed bit masks -use crate::ArrayData; -use arrow_buffer::bit_chunk_iterator::BitChunks; -use arrow_buffer::bit_util::{ceil, get_bit, set_bit}; -use arrow_buffer::buffer::buffer_bin_and; -use arrow_buffer::Buffer; +use crate::bit_chunk_iterator::BitChunks; +use crate::bit_util::{ceil, get_bit, set_bit}; /// Sets all bits on `write_data` in the range `[offset_write..offset_write+len]` to be equal to the /// bits in `data` in the range `[offset_read..offset_read+len]` @@ -65,45 +62,9 @@ pub fn set_bits( null_count as usize } -/// Combines the null bitmaps of multiple arrays using a bitwise `and` operation. -/// -/// This function is useful when implementing operations on higher level arrays. 
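// Not part of the patch: illustrates the lazy materialization described in the doc comment
// above - a builder that only ever sees valid (`true`) slots yields no validity buffer at all,
// which is the fast path the comment calls out. NullBufferBuilder::new(capacity) is assumed;
// append_slice and finish appear in the hunks above, null_count is the standard NullBuffer
// accessor.
use arrow_buffer::NullBufferBuilder;

fn lazy_nulls_sketch() {
    let mut all_valid = NullBufferBuilder::new(8);
    all_valid.append_slice(&[true; 8]);
    assert!(all_valid.finish().is_none()); // nothing was ever materialized

    let mut with_null = NullBufferBuilder::new(8);
    with_null.append_slice(&[true, false, true]);
    let nulls = with_null.finish().unwrap(); // materialized once a `false` arrived
    assert_eq!(nulls.null_count(), 1);
}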
-#[deprecated(note = "Use NullBuffer::union")] -pub fn combine_option_bitmap( - arrays: &[&ArrayData], - len_in_bits: usize, -) -> Option { - let (buffer, offset) = arrays - .iter() - .map(|array| match array.nulls() { - Some(n) => (Some(n.buffer().clone()), n.offset()), - None => (None, 0), - }) - .reduce(|acc, buffer_and_offset| match (acc, buffer_and_offset) { - ((None, _), (None, _)) => (None, 0), - ((Some(buffer), offset), (None, _)) | ((None, _), (Some(buffer), offset)) => { - (Some(buffer), offset) - } - ((Some(buffer_left), offset_left), (Some(buffer_right), offset_right)) => ( - Some(buffer_bin_and( - &buffer_left, - offset_left, - &buffer_right, - offset_right, - len_in_bits, - )), - 0, - ), - })?; - - Some(buffer?.bit_slice(offset, len_in_bits)) -} - #[cfg(test)] mod tests { use super::*; - use arrow_schema::DataType; - use std::sync::Arc; #[test] fn test_set_bits_aligned() { @@ -226,112 +187,4 @@ mod tests { assert_eq!(destination, expected_data); assert_eq!(result, expected_null_count); } - - fn make_data_with_null_bit_buffer( - len: usize, - offset: usize, - null_bit_buffer: Option, - ) -> Arc { - let buffer = Buffer::from(&vec![11; len + offset]); - - Arc::new( - ArrayData::try_new( - DataType::UInt8, - len, - null_bit_buffer, - offset, - vec![buffer], - vec![], - ) - .unwrap(), - ) - } - - #[test] - #[allow(deprecated)] - fn test_combine_option_bitmap() { - let none_bitmap = make_data_with_null_bit_buffer(8, 0, None); - let some_bitmap = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b01001010]))); - let inverse_bitmap = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b10110101]))); - let some_other_bitmap = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b11010111]))); - assert_eq!(None, combine_option_bitmap(&[], 8)); - assert_eq!( - Some(Buffer::from([0b01001010])), - combine_option_bitmap(&[&some_bitmap], 8) - ); - assert_eq!( - None, - combine_option_bitmap(&[&none_bitmap, &none_bitmap], 8) - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - combine_option_bitmap(&[&some_bitmap, &none_bitmap], 8) - ); - assert_eq!( - Some(Buffer::from([0b11010111])), - combine_option_bitmap(&[&none_bitmap, &some_other_bitmap], 8) - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - combine_option_bitmap(&[&some_bitmap, &some_bitmap], 8,) - ); - assert_eq!( - Some(Buffer::from([0b0])), - combine_option_bitmap(&[&some_bitmap, &inverse_bitmap], 8,) - ); - assert_eq!( - Some(Buffer::from([0b01000010])), - combine_option_bitmap(&[&some_bitmap, &some_other_bitmap, &none_bitmap], 8,) - ); - assert_eq!( - Some(Buffer::from([0b00001001])), - combine_option_bitmap( - &[ - &some_bitmap.slice(3, 5), - &inverse_bitmap.slice(2, 5), - &some_other_bitmap.slice(1, 5) - ], - 5, - ) - ); - } - - #[test] - #[allow(deprecated)] - fn test_combine_option_bitmap_with_offsets() { - let none_bitmap = make_data_with_null_bit_buffer(8, 0, None); - let bitmap0 = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b10101010]))); - let bitmap1 = - make_data_with_null_bit_buffer(8, 1, Some(Buffer::from([0b01010100, 0b1]))); - let bitmap2 = - make_data_with_null_bit_buffer(8, 2, Some(Buffer::from([0b10101000, 0b10]))); - assert_eq!( - Some(Buffer::from([0b10101010])), - combine_option_bitmap(&[&bitmap1], 8) - ); - assert_eq!( - Some(Buffer::from([0b10101010])), - combine_option_bitmap(&[&bitmap2], 8) - ); - assert_eq!( - Some(Buffer::from([0b10101010])), - combine_option_bitmap(&[&bitmap1, &none_bitmap], 8) - ); - assert_eq!( - Some(Buffer::from([0b10101010])), - 
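// Not part of the patch: the deprecation note above points callers at NullBuffer::union as the
// replacement for combine_option_bitmap. A hedged sketch of that replacement - the union
// signature (Option<&NullBuffer> operands, AND-ing validity, None meaning "all valid") is
// assumed here rather than shown in the diff; the builder calls appear in the hunks above.
use arrow_buffer::{BooleanBufferBuilder, NullBuffer};

fn union_sketch() {
    let mut a = BooleanBufferBuilder::new(4);
    a.append_slice(&[true, false, true, true]);
    let a = NullBuffer::new(a.finish());

    let mut b = BooleanBufferBuilder::new(4);
    b.append_slice(&[true, true, false, true]);
    let b = NullBuffer::new(b.finish());

    let combined = NullBuffer::union(Some(&a), Some(&b)).unwrap();
    assert_eq!(combined.null_count(), 2); // null wherever either input was null
}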
combine_option_bitmap(&[&none_bitmap, &bitmap2], 8) - ); - assert_eq!( - Some(Buffer::from([0b10101010])), - combine_option_bitmap(&[&bitmap0, &bitmap1], 8) - ); - assert_eq!( - Some(Buffer::from([0b10101010])), - combine_option_bitmap(&[&bitmap1, &bitmap2], 8) - ); - } } diff --git a/arrow-buffer/src/util/mod.rs b/arrow-buffer/src/util/mod.rs index 0f1825eae9d4..9023fe4a035d 100644 --- a/arrow-buffer/src/util/mod.rs +++ b/arrow-buffer/src/util/mod.rs @@ -17,4 +17,5 @@ pub mod bit_chunk_iterator; pub mod bit_iterator; +pub mod bit_mask; pub mod bit_util; diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs index 15f6acd2c97d..b864b786051a 100644 --- a/arrow-data/src/lib.rs +++ b/arrow-data/src/lib.rs @@ -23,8 +23,7 @@ pub use data::*; mod equal; pub mod transform; -pub use arrow_buffer::bit_iterator; -pub mod bit_mask; +pub use arrow_buffer::{bit_iterator, bit_mask}; pub mod decimal; #[cfg(feature = "ffi")] From 795259502d8d19f1e929d8ebf1b2819b6ab145c4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 11:05:59 +0100 Subject: [PATCH 0967/1411] Rename list contains kernels to in_list (#4289) (#4342) * Rename list contains kernels to in_list (#4289) * Fix other clippy lints --- arrow-ord/src/comparison.rs | 12 ++++++------ arrow/examples/builders.rs | 3 +-- arrow/examples/collect.rs | 5 +++-- arrow/examples/dynamic_types.rs | 2 +- arrow/examples/tensor_builder.rs | 3 +-- arrow/tests/array_validation.rs | 3 +-- parquet/src/data_type.rs | 6 +++--- 7 files changed, 16 insertions(+), 18 deletions(-) diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index c771182f7917..b9274f0eaefb 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -2700,7 +2700,7 @@ where } /// Checks if a [`GenericListArray`] contains a value in the [`PrimitiveArray`] -pub fn contains( +pub fn in_list( left: &PrimitiveArray, right: &GenericListArray, ) -> Result @@ -2742,7 +2742,7 @@ where } /// Checks if a [`GenericListArray`] contains a value in the [`GenericStringArray`] -pub fn contains_utf8( +pub fn in_list_utf8( left: &GenericStringArray, right: &ListArray, ) -> Result @@ -3425,7 +3425,7 @@ mod tests { let list_array = LargeListArray::from(list_data); let nulls = Int32Array::from(vec![None, None, None, None]); - let nulls_result = contains(&nulls, &list_array).unwrap(); + let nulls_result = in_list(&nulls, &list_array).unwrap(); assert_eq!( nulls_result .as_any() @@ -3435,7 +3435,7 @@ mod tests { ); let values = Int32Array::from(vec![Some(0), Some(0), Some(0), Some(0)]); - let values_result = contains(&values, &list_array).unwrap(); + let values_result = in_list(&values, &list_array).unwrap(); assert_eq!( values_result .as_any() @@ -3695,7 +3695,7 @@ mod tests { let v: Vec> = vec![None, None, None, None]; let nulls = StringArray::from(v); - let nulls_result = contains_utf8(&nulls, &list_array).unwrap(); + let nulls_result = in_list_utf8(&nulls, &list_array).unwrap(); assert_eq!( nulls_result .as_any() @@ -3710,7 +3710,7 @@ mod tests { Some("Lorem"), Some("Lorem"), ]); - let values_result = contains_utf8(&values, &list_array).unwrap(); + let values_result = in_list_utf8(&values, &list_array).unwrap(); assert_eq!( values_result .as_any() diff --git a/arrow/examples/builders.rs b/arrow/examples/builders.rs index a6d8c563b4ca..250f5c39af10 100644 --- a/arrow/examples/builders.rs +++ b/arrow/examples/builders.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the 
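// Not part of the patch: a hedged usage sketch of the kernels renamed by the commit above
// (`contains` -> `in_list`, `contains_utf8` -> `in_list_utf8`). The arrow_ord::comparison module
// path and the ListArray::from_iter_primitive construction are assumptions for illustration;
// the in_list signature itself is taken from the hunk above.
use arrow_array::types::Int32Type;
use arrow_array::{Int32Array, ListArray};
use arrow_ord::comparison::in_list;

fn in_list_sketch() {
    // One list row per needle row: is 2 contained in [1, 2, 3]?
    let lists = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![Some(vec![
        Some(1),
        Some(2),
        Some(3),
    ])]);
    let needles = Int32Array::from(vec![Some(2)]);
    let found = in_list(&needles, &lists).unwrap();
    assert!(found.value(0));
}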
License. -///! Many builders are available to easily create different types of arrow arrays -extern crate arrow; +//! Many builders are available to easily create different types of arrow arrays use std::sync::Arc; diff --git a/arrow/examples/collect.rs b/arrow/examples/collect.rs index 5581186dbe7a..ced4640d600f 100644 --- a/arrow/examples/collect.rs +++ b/arrow/examples/collect.rs @@ -15,8 +15,9 @@ // specific language governing permissions and limitations // under the License. -///! `FromIterator` API is implemented for different array types to easily create them -/// from values. +//! `FromIterator` API is implemented for different array types to easily create them +//! from values. + use arrow::array::Array; use arrow_array::types::Int32Type; use arrow_array::{Float32Array, Int32Array, Int8Array, ListArray}; diff --git a/arrow/examples/dynamic_types.rs b/arrow/examples/dynamic_types.rs index 5470131d6d41..8ec473c76d56 100644 --- a/arrow/examples/dynamic_types.rs +++ b/arrow/examples/dynamic_types.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -///! This example demonstrates dealing with mixed types dynamically at runtime +//! This example demonstrates dealing with mixed types dynamically at runtime use std::sync::Arc; extern crate arrow; diff --git a/arrow/examples/tensor_builder.rs b/arrow/examples/tensor_builder.rs index ca31679e250d..90ad1b4868f7 100644 --- a/arrow/examples/tensor_builder.rs +++ b/arrow/examples/tensor_builder.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -///! Tensor builder example -extern crate arrow; +//! Tensor builder example use arrow::array::*; //{Int32BufferBuilder, Float32BufferBuilder}; use arrow::buffer::Buffer; diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 67960ada6c98..0d3652a0473a 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -948,8 +948,7 @@ fn test_try_new_sliced_struct() { let struct_array = builder.finish(); let struct_array_slice = struct_array.slice(1, 3); - let cloned = struct_array_slice.clone(); - assert_eq!(&struct_array_slice, &cloned); + assert_eq!(struct_array_slice, struct_array_slice); } #[test] diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 2e7f73bf0a4f..67d0bad98202 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -483,7 +483,7 @@ macro_rules! gen_as_bytes { unsafe { std::slice::from_raw_parts( self_.as_ptr() as *const u8, - std::mem::size_of::<$source_ty>() * self_.len(), + std::mem::size_of_val(self_), ) } } @@ -493,7 +493,7 @@ macro_rules! 
gen_as_bytes { unsafe fn slice_as_bytes_mut(self_: &mut [Self]) -> &mut [u8] { std::slice::from_raw_parts_mut( self_.as_mut_ptr() as *mut u8, - std::mem::size_of::<$source_ty>() * self_.len(), + std::mem::size_of_val(self_), ) } } @@ -735,7 +735,7 @@ pub(crate) mod private { let raw = unsafe { std::slice::from_raw_parts( values.as_ptr() as *const u8, - std::mem::size_of::<$ty>() * values.len(), + std::mem::size_of_val(values), ) }; writer.write_all(raw)?; From 66ba303bc06c56596cbea45eb6bdc0ba5ba7c640 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 11:26:28 +0100 Subject: [PATCH 0968/1411] Fix clippy for object_store (#4344) --- object_store/src/aws/mod.rs | 18 ++++++++---------- object_store/src/path/mod.rs | 9 +++------ 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 3696e4ad4eb2..8de4b7c6afa1 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -1127,23 +1127,21 @@ mod tests { ) .with_allow_http(true); - let config = - if let Some(endpoint) = env::var("OBJECT_STORE_AWS_ENDPOINT").ok() { - config.with_endpoint(endpoint) - } else { - config - }; + let config = if let Ok(endpoint) = env::var("OBJECT_STORE_AWS_ENDPOINT") { + config.with_endpoint(endpoint) + } else { + config + }; - let config = if let Some(token) = - env::var("OBJECT_STORE_AWS_SESSION_TOKEN").ok() + let config = if let Ok(token) = env::var("OBJECT_STORE_AWS_SESSION_TOKEN") { config.with_token(token) } else { config }; - let config = if let Some(virtual_hosted_style_request) = - env::var("OBJECT_STORE_VIRTUAL_HOSTED_STYLE_REQUEST").ok() + let config = if let Ok(virtual_hosted_style_request) = + env::var("OBJECT_STORE_VIRTUAL_HOSTED_STYLE_REQUEST") { config.with_virtual_hosted_style_request( virtual_hosted_style_request.trim().parse().unwrap(), diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index 29b134176955..ab30e0ed04cc 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -438,18 +438,15 @@ mod tests { assert!(existing_path.prefix_match(&prefix).is_none()); // Prefix matches but there aren't any parts after it - let existing_path = Path::from("apple/bear/cow/dog"); - - let prefix = existing_path.clone(); - assert_eq!(existing_path.prefix_match(&prefix).unwrap().count(), 0); + let existing = Path::from("apple/bear/cow/dog"); + assert_eq!(existing.prefix_match(&existing).unwrap().count(), 0); assert_eq!(Path::default().parts().count(), 0); } #[test] fn prefix_matches() { let haystack = Path::from_iter(["foo/bar", "baz%2Ftest", "something"]); - let needle = haystack.clone(); // self starts with self assert!( haystack.prefix_matches(&haystack), @@ -457,7 +454,7 @@ mod tests { ); // a longer prefix doesn't match - let needle = needle.child("longer now"); + let needle = haystack.child("longer now"); assert!( !haystack.prefix_matches(&needle), "{haystack:?} shouldn't have started with {needle:?}" From 0a3259091cc4e74b59fe22d40fc84474492112b4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 2 Jun 2023 07:06:44 -0400 Subject: [PATCH 0969/1411] Update FlightSQL metadata locations, names and docs (#4341) * Update FlightSQL metadata locations, names and docs * update * fix typo * fix CI, unify interface --- arrow-flight/examples/flight_sql_server.rs | 20 +-- arrow-flight/src/sql/catalogs/mod.rs | 123 ------------------ arrow-flight/src/sql/metadata/catalogs.rs | 100 ++++++++++++++ .../sql/{catalogs => 
metadata}/db_schemas.rs | 57 ++++---- arrow-flight/src/sql/metadata/mod.rs | 55 ++++++++ .../src/sql/{ => metadata}/sql_info.rs | 6 +- .../src/sql/{catalogs => metadata}/tables.rs | 50 ++++--- arrow-flight/src/sql/mod.rs | 8 +- 8 files changed, 233 insertions(+), 186 deletions(-) delete mode 100644 arrow-flight/src/sql/catalogs/mod.rs create mode 100644 arrow-flight/src/sql/metadata/catalogs.rs rename arrow-flight/src/sql/{catalogs => metadata}/db_schemas.rs (89%) create mode 100644 arrow-flight/src/sql/metadata/mod.rs rename arrow-flight/src/sql/{ => metadata}/sql_info.rs (99%) rename arrow-flight/src/sql/{catalogs => metadata}/tables.rs (95%) diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 6b92621a564d..ecd8db76bba9 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -30,10 +30,7 @@ use tonic::{Request, Response, Status, Streaming}; use arrow_array::builder::StringBuilder; use arrow_array::{ArrayRef, RecordBatch}; use arrow_flight::encode::FlightDataEncoderBuilder; -use arrow_flight::sql::catalogs::{ - get_catalogs_schema, get_db_schemas_schema, get_tables_schema, -}; -use arrow_flight::sql::sql_info::SqlInfoList; +use arrow_flight::sql::metadata::SqlInfoList; use arrow_flight::sql::{ server::FlightSqlService, ActionBeginSavepointRequest, ActionBeginSavepointResult, ActionBeginTransactionRequest, ActionBeginTransactionResult, @@ -252,7 +249,7 @@ impl FlightSqlService for FlightSqlServiceImpl { let endpoint = FlightEndpoint::new().with_ticket(ticket); let flight_info = FlightInfo::new() - .try_with_schema(get_catalogs_schema()) + .try_with_schema(&query.into_builder().schema()) .map_err(|e| status!("Unable to encode schema", e))? .with_endpoint(endpoint) .with_descriptor(flight_descriptor); @@ -272,7 +269,7 @@ impl FlightSqlService for FlightSqlServiceImpl { let endpoint = FlightEndpoint::new().with_ticket(ticket); let flight_info = FlightInfo::new() - .try_with_schema(get_db_schemas_schema().as_ref()) + .try_with_schema(&query.into_builder().schema()) .map_err(|e| status!("Unable to encode schema", e))? .with_endpoint(endpoint) .with_descriptor(flight_descriptor); @@ -292,7 +289,7 @@ impl FlightSqlService for FlightSqlServiceImpl { let endpoint = FlightEndpoint::new().with_ticket(ticket); let flight_info = FlightInfo::new() - .try_with_schema(get_tables_schema(query.include_schema).as_ref()) + .try_with_schema(&query.into_builder().schema()) .map_err(|e| status!("Unable to encode schema", e))? 
.with_endpoint(endpoint) .with_descriptor(flight_descriptor); @@ -410,9 +407,10 @@ impl FlightSqlService for FlightSqlServiceImpl { for catalog_name in catalog_names { builder.append(catalog_name); } + let schema = builder.schema(); let batch = builder.build(); let stream = FlightDataEncoderBuilder::new() - .with_schema(Arc::new(get_catalogs_schema().clone())) + .with_schema(schema) .build(futures::stream::once(async { batch })) .map_err(Status::from); Ok(Response::new(Box::pin(stream))) @@ -436,9 +434,10 @@ impl FlightSqlService for FlightSqlServiceImpl { builder.append(catalog_name, schema_name); } + let schema = builder.schema(); let batch = builder.build(); let stream = FlightDataEncoderBuilder::new() - .with_schema(get_db_schemas_schema()) + .with_schema(schema) .build(futures::stream::once(async { batch })) .map_err(Status::from); Ok(Response::new(Box::pin(stream))) @@ -475,9 +474,10 @@ impl FlightSqlService for FlightSqlServiceImpl { .map_err(Status::from)?; } + let schema = builder.schema(); let batch = builder.build(); let stream = FlightDataEncoderBuilder::new() - .with_schema(get_db_schemas_schema()) + .with_schema(schema) .build(futures::stream::once(async { batch })) .map_err(Status::from); Ok(Response::new(Box::pin(stream))) diff --git a/arrow-flight/src/sql/catalogs/mod.rs b/arrow-flight/src/sql/catalogs/mod.rs deleted file mode 100644 index e4cbb6fedc45..000000000000 --- a/arrow-flight/src/sql/catalogs/mod.rs +++ /dev/null @@ -1,123 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Builders and function for building responses to information schema requests -//! -//! - [`get_catalogs_batch`] and [`get_catalogs_schema`] for building responses to [`CommandGetCatalogs`] queries. -//! - [`GetSchemasBuilder`] and [`get_db_schemas_schema`] for building responses to [`CommandGetDbSchemas`] queries. -//! - [`GetTablesBuilder`] and [`get_tables_schema`] for building responses to [`CommandGetTables`] queries. -//! -//! [`CommandGetCatalogs`]: crate::sql::CommandGetCatalogs -//! [`CommandGetDbSchemas`]: crate::sql::CommandGetDbSchemas -//! 
[`CommandGetTables`]: crate::sql::CommandGetTables - -use std::sync::Arc; - -use arrow_array::{ArrayRef, RecordBatch, StringArray, UInt32Array}; -use arrow_row::{RowConverter, SortField}; -use arrow_schema::{DataType, Field, Schema, SchemaRef}; -use once_cell::sync::Lazy; - -use crate::error::Result; -use crate::sql::CommandGetCatalogs; - -pub use db_schemas::{get_db_schemas_schema, GetSchemasBuilder}; -pub use tables::{get_tables_schema, GetTablesBuilder}; - -mod db_schemas; -mod tables; - -pub struct GetCatalogsBuilder { - catalogs: Vec, -} - -impl CommandGetCatalogs { - pub fn into_builder(self) -> GetCatalogsBuilder { - self.into() - } -} - -impl From for GetCatalogsBuilder { - fn from(_: CommandGetCatalogs) -> Self { - Self::new() - } -} - -impl Default for GetCatalogsBuilder { - fn default() -> Self { - Self::new() - } -} - -impl GetCatalogsBuilder { - /// Create a new instance of [`GetCatalogsBuilder`] - pub fn new() -> Self { - Self { - catalogs: Vec::new(), - } - } - - /// Append a row - pub fn append(&mut self, catalog_name: impl Into) { - self.catalogs.push(catalog_name.into()); - } - - /// builds a `RecordBatch` with the correct schema for a `CommandGetCatalogs` response - pub fn build(self) -> Result { - get_catalogs_batch(self.catalogs) - } -} - -/// Returns the RecordBatch for `CommandGetCatalogs` -pub fn get_catalogs_batch(mut catalog_names: Vec) -> Result { - catalog_names.sort_unstable(); - - let batch = RecordBatch::try_new( - Arc::clone(&GET_CATALOG_SCHEMA), - vec![Arc::new(StringArray::from_iter_values(catalog_names)) as _], - )?; - - Ok(batch) -} - -/// Returns the schema that will result from [`CommandGetCatalogs`] -/// -/// [`CommandGetCatalogs`]: crate::sql::CommandGetCatalogs -pub fn get_catalogs_schema() -> &'static Schema { - &GET_CATALOG_SCHEMA -} - -/// The schema for GetCatalogs -static GET_CATALOG_SCHEMA: Lazy = Lazy::new(|| { - Arc::new(Schema::new(vec![Field::new( - "catalog_name", - DataType::Utf8, - false, - )])) -}); - -fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array { - let fields = arrays - .iter() - .map(|a| SortField::new(a.data_type().clone())) - .collect(); - let mut converter = RowConverter::new(fields).unwrap(); - let rows = converter.convert_columns(arrays).unwrap(); - let mut sort: Vec<_> = rows.iter().enumerate().collect(); - sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); - UInt32Array::from_iter_values(sort.iter().map(|(i, _)| *i as u32)) -} diff --git a/arrow-flight/src/sql/metadata/catalogs.rs b/arrow-flight/src/sql/metadata/catalogs.rs new file mode 100644 index 000000000000..327fed81077b --- /dev/null +++ b/arrow-flight/src/sql/metadata/catalogs.rs @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::sync::Arc; + +use arrow_array::{RecordBatch, StringArray}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use once_cell::sync::Lazy; + +use crate::error::Result; +use crate::sql::CommandGetCatalogs; + +/// A builder for a [`CommandGetCatalogs`] response. +/// +/// Builds rows like this: +/// +/// * catalog_name: utf8, +pub struct GetCatalogsBuilder { + catalogs: Vec, +} + +impl CommandGetCatalogs { + /// Create a builder suitable for constructing a response + pub fn into_builder(self) -> GetCatalogsBuilder { + self.into() + } +} + +impl From for GetCatalogsBuilder { + fn from(_: CommandGetCatalogs) -> Self { + Self::new() + } +} + +impl Default for GetCatalogsBuilder { + fn default() -> Self { + Self::new() + } +} + +impl GetCatalogsBuilder { + /// Create a new instance of [`GetCatalogsBuilder`] + pub fn new() -> Self { + Self { + catalogs: Vec::new(), + } + } + + /// Append a row + pub fn append(&mut self, catalog_name: impl Into) { + self.catalogs.push(catalog_name.into()); + } + + /// builds a `RecordBatch` with the correct schema for a + /// [`CommandGetCatalogs`] response + pub fn build(self) -> Result { + let Self { catalogs } = self; + + let batch = RecordBatch::try_new( + Arc::clone(&GET_CATALOG_SCHEMA), + vec![Arc::new(StringArray::from_iter_values(catalogs)) as _], + )?; + + Ok(batch) + } + + /// Returns the schema that will result from [`CommandGetCatalogs`] + /// + /// [`CommandGetCatalogs`]: crate::sql::CommandGetCatalogs + pub fn schema(&self) -> SchemaRef { + get_catalogs_schema() + } +} + +fn get_catalogs_schema() -> SchemaRef { + Arc::clone(&GET_CATALOG_SCHEMA) +} + +/// The schema for GetCatalogs +static GET_CATALOG_SCHEMA: Lazy = Lazy::new(|| { + Arc::new(Schema::new(vec![Field::new( + "catalog_name", + DataType::Utf8, + false, + )])) +}); diff --git a/arrow-flight/src/sql/catalogs/db_schemas.rs b/arrow-flight/src/sql/metadata/db_schemas.rs similarity index 89% rename from arrow-flight/src/sql/catalogs/db_schemas.rs rename to arrow-flight/src/sql/metadata/db_schemas.rs index 76c5499c89cc..7b10e1c14299 100644 --- a/arrow-flight/src/sql/catalogs/db_schemas.rs +++ b/arrow-flight/src/sql/metadata/db_schemas.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! [`GetSchemasBuilder`] for building responses to [`CommandGetDbSchemas`] queries. +//! [`GetDbSchemasBuilder`] for building responses to [`CommandGetDbSchemas`] queries. //! //! [`CommandGetDbSchemas`]: crate::sql::CommandGetDbSchemas @@ -33,26 +33,13 @@ use super::lexsort_to_indices; use crate::error::*; use crate::sql::CommandGetDbSchemas; -/// Return the schema of the RecordBatch that will be returned from [`CommandGetDbSchemas`] +/// A builder for a [`CommandGetDbSchemas`] response. /// -/// [`CommandGetDbSchemas`]: crate::sql::CommandGetDbSchemas -pub fn get_db_schemas_schema() -> SchemaRef { - Arc::clone(&GET_DB_SCHEMAS_SCHEMA) -} - -/// The schema for GetDbSchemas -static GET_DB_SCHEMAS_SCHEMA: Lazy = Lazy::new(|| { - Arc::new(Schema::new(vec![ - Field::new("catalog_name", DataType::Utf8, false), - Field::new("db_schema_name", DataType::Utf8, false), - ])) -}); - /// Builds rows like this: /// /// * catalog_name: utf8, /// * db_schema_name: utf8, -pub struct GetSchemasBuilder { +pub struct GetDbSchemasBuilder { // Specifies the Catalog to search for the tables. // - An empty string retrieves those without a catalog. // - If omitted the catalog name is not used to narrow the search. 
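// Not part of the patch: how a FlightSQL server might drive the relocated builder shown in the
// new metadata/catalogs.rs above when answering a CommandGetCatalogs request. The import
// follows the `arrow_flight::sql::metadata` module added by this commit; every call used here
// (new, append, schema, build) appears in the file above, and the catalog names are
// illustrative.
use arrow_flight::sql::metadata::GetCatalogsBuilder;

fn catalogs_response_sketch() {
    let mut builder = GetCatalogsBuilder::new();
    builder.append("catalog_a");
    builder.append("catalog_b");

    let schema = builder.schema(); // schema advertised via get_flight_info_catalogs
    let batch = builder.build().unwrap(); // single `catalog_name` column, one row per catalog
    assert_eq!(batch.schema(), schema);
    assert_eq!(batch.num_rows(), 2);
}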
@@ -66,19 +53,20 @@ pub struct GetSchemasBuilder { } impl CommandGetDbSchemas { - pub fn into_builder(self) -> GetSchemasBuilder { + /// Create a builder suitable for constructing a response + pub fn into_builder(self) -> GetDbSchemasBuilder { self.into() } } -impl From for GetSchemasBuilder { +impl From for GetDbSchemasBuilder { fn from(value: CommandGetDbSchemas) -> Self { Self::new(value.catalog, value.db_schema_filter_pattern) } } -impl GetSchemasBuilder { - /// Create a new instance of [`GetSchemasBuilder`] +impl GetDbSchemasBuilder { + /// Create a new instance of [`GetDbSchemasBuilder`] /// /// # Parameters /// @@ -118,6 +106,7 @@ impl GetSchemasBuilder { /// builds a `RecordBatch` with the correct schema for a `CommandGetDbSchemas` response pub fn build(self) -> Result { + let schema = self.schema(); let Self { catalog_filter, db_schema_filter_pattern, @@ -154,7 +143,7 @@ impl GetSchemasBuilder { } let batch = RecordBatch::try_new( - get_db_schemas_schema(), + schema, vec![ Arc::new(catalog_name) as ArrayRef, Arc::new(db_schema_name) as ArrayRef, @@ -176,10 +165,28 @@ impl GetSchemasBuilder { .map(|c| take(c, &indices, None)) .collect::, _>>()?; - Ok(RecordBatch::try_new(get_db_schemas_schema(), columns)?) + Ok(RecordBatch::try_new(filtered_batch.schema(), columns)?) } + + /// Return the schema of the RecordBatch that will be returned + /// from [`CommandGetDbSchemas`] + pub fn schema(&self) -> SchemaRef { + get_db_schemas_schema() + } +} + +fn get_db_schemas_schema() -> SchemaRef { + Arc::clone(&GET_DB_SCHEMAS_SCHEMA) } +/// The schema for GetDbSchemas +static GET_DB_SCHEMAS_SCHEMA: Lazy = Lazy::new(|| { + Arc::new(Schema::new(vec![ + Field::new("catalog_name", DataType::Utf8, false), + Field::new("db_schema_name", DataType::Utf8, false), + ])) +}); + #[cfg(test)] mod tests { use super::*; @@ -207,7 +214,7 @@ mod tests { fn test_schemas_are_filtered() { let ref_batch = get_ref_batch(); - let mut builder = GetSchemasBuilder::new(None::, None::); + let mut builder = GetDbSchemasBuilder::new(None::, None::); builder.append("a_catalog", "a_schema"); builder.append("a_catalog", "b_schema"); builder.append("b_catalog", "a_schema"); @@ -216,7 +223,7 @@ mod tests { assert_eq!(schema_batch, ref_batch); - let mut builder = GetSchemasBuilder::new(None::, Some("a%")); + let mut builder = GetDbSchemasBuilder::new(None::, Some("a%")); builder.append("a_catalog", "a_schema"); builder.append("a_catalog", "b_schema"); builder.append("b_catalog", "a_schema"); @@ -242,7 +249,7 @@ mod tests { fn test_schemas_are_sorted() { let ref_batch = get_ref_batch(); - let mut builder = GetSchemasBuilder::new(None::, None::); + let mut builder = GetDbSchemasBuilder::new(None::, None::); builder.append("a_catalog", "b_schema"); builder.append("b_catalog", "a_schema"); builder.append("a_catalog", "a_schema"); diff --git a/arrow-flight/src/sql/metadata/mod.rs b/arrow-flight/src/sql/metadata/mod.rs new file mode 100644 index 000000000000..9d3810806ab4 --- /dev/null +++ b/arrow-flight/src/sql/metadata/mod.rs @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Builders and function for building responses to FlightSQL metadata +//! / information schema requests. +//! +//! - [`GetCatalogsBuilder`] for building responses to [`CommandGetCatalogs`] queries. +//! - [`GetDbSchemasBuilder`] for building responses to [`CommandGetDbSchemas`] queries. +//! - [`GetTablesBuilder`]for building responses to [`CommandGetTables`] queries. +//! +//! [`CommandGetCatalogs`]: crate::sql::CommandGetCatalogs +//! [`CommandGetDbSchemas`]: crate::sql::CommandGetDbSchemas +//! [`CommandGetTables`]: crate::sql::CommandGetTables + +mod catalogs; +mod db_schemas; +mod sql_info; +mod tables; + +pub use catalogs::GetCatalogsBuilder; +pub use db_schemas::GetDbSchemasBuilder; +pub use sql_info::SqlInfoList; +pub use tables::GetTablesBuilder; + +use arrow_array::ArrayRef; +use arrow_array::UInt32Array; +use arrow_row::RowConverter; +use arrow_row::SortField; + +/// Helper function to sort all the columns in an array +fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array { + let fields = arrays + .iter() + .map(|a| SortField::new(a.data_type().clone())) + .collect(); + let mut converter = RowConverter::new(fields).unwrap(); + let rows = converter.convert_columns(arrays).unwrap(); + let mut sort: Vec<_> = rows.iter().enumerate().collect(); + sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); + UInt32Array::from_iter_values(sort.iter().map(|(i, _)| *i as u32)) +} diff --git a/arrow-flight/src/sql/sql_info.rs b/arrow-flight/src/sql/metadata/sql_info.rs similarity index 99% rename from arrow-flight/src/sql/sql_info.rs rename to arrow-flight/src/sql/metadata/sql_info.rs index f0d14ff8a741..3dcee1e58c3b 100644 --- a/arrow-flight/src/sql/sql_info.rs +++ b/arrow-flight/src/sql/metadata/sql_info.rs @@ -33,8 +33,8 @@ use arrow_data::ArrayData; use arrow_schema::{DataType, Field, Fields, Schema, UnionFields, UnionMode}; use once_cell::sync::Lazy; -use super::SqlInfo; use crate::error::Result; +use crate::sql::SqlInfo; /// Represents a dynamic value #[derive(Debug, Clone, PartialEq)] @@ -321,7 +321,7 @@ impl SqlInfoUnionBuilder { } } -/// A list of FlightSQL server capabilties. +/// A builder for [`CommandGetSqlInfo`] response. 
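// Not part of the patch: with the sql_info.rs move above, servers now import SqlInfoList from
// the new metadata module - only the path changes. A hedged sketch; the values are
// illustrative and FlightSqlServerVersion is assumed to be one of the generated SqlInfo
// variants.
use arrow_flight::sql::metadata::SqlInfoList;
use arrow_flight::sql::SqlInfo;

fn sql_info_sketch() -> SqlInfoList {
    SqlInfoList::new()
        .with_sql_info(SqlInfo::FlightSqlServerName, "example server")
        .with_sql_info(SqlInfo::FlightSqlServerVersion, "0.1.0")
}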
/// /// [`CommandGetSqlInfo`] are metadata requests used by a Flight SQL /// server to communicate supported capabilities to Flight SQL @@ -334,7 +334,7 @@ impl SqlInfoUnionBuilder { /// /// # Example /// ``` -/// # use arrow_flight::sql::{SqlInfoList, SqlInfo, SqlSupportedTransaction}; +/// # use arrow_flight::sql::{metadata::SqlInfoList, SqlInfo, SqlSupportedTransaction}; /// // Create the list of metadata describing the server /// let info_list = SqlInfoList::new() /// .with_sql_info(SqlInfo::FlightSqlServerName, "server name") diff --git a/arrow-flight/src/sql/catalogs/tables.rs b/arrow-flight/src/sql/metadata/tables.rs similarity index 95% rename from arrow-flight/src/sql/catalogs/tables.rs rename to arrow-flight/src/sql/metadata/tables.rs index fcdc0dbb7447..67193969d46d 100644 --- a/arrow-flight/src/sql/catalogs/tables.rs +++ b/arrow-flight/src/sql/metadata/tables.rs @@ -35,19 +35,8 @@ use crate::error::*; use crate::sql::CommandGetTables; use crate::{IpcMessage, IpcWriteOptions, SchemaAsIpc}; -/// Return the schema of the RecordBatch that will be returned from [`CommandGetTables`] +/// A builder for a [`CommandGetTables`] response. /// -/// Note the schema differs based on the values of `include_schema -/// -/// [`CommandGetTables`]: crate::sql::CommandGetTables -pub fn get_tables_schema(include_schema: bool) -> SchemaRef { - if include_schema { - Arc::clone(&GET_TABLES_SCHEMA_WITH_TABLE_SCHEMA) - } else { - Arc::clone(&GET_TABLES_SCHEMA_WITHOUT_TABLE_SCHEMA) - } -} - /// Builds rows like this: /// /// * catalog_name: utf8, @@ -76,6 +65,7 @@ pub struct GetTablesBuilder { } impl CommandGetTables { + /// Create a builder suitable for constructing a response pub fn into_builder(self) -> GetTablesBuilder { self.into() } @@ -96,7 +86,7 @@ impl From for GetTablesBuilder { impl GetTablesBuilder { /// Create a new instance of [`GetTablesBuilder`] /// - /// # Paramneters + /// # Parameters /// /// - `catalog`: Specifies the Catalog to search for the tables. /// - An empty string retrieves those without a catalog. @@ -168,6 +158,7 @@ impl GetTablesBuilder { /// builds a `RecordBatch` for `CommandGetTables` pub fn build(self) -> Result { + let schema = self.schema(); let Self { catalog_filter, table_types_filter, @@ -220,10 +211,9 @@ impl GetTablesBuilder { filters.push(like_utf8_scalar(&table_name, &table_name_filter_pattern)?) } - let include_schema = table_schema.is_some(); let batch = if let Some(table_schema) = table_schema { RecordBatch::try_new( - get_tables_schema(include_schema), + schema, vec![ Arc::new(catalog_name) as ArrayRef, Arc::new(db_schema_name) as ArrayRef, @@ -234,7 +224,8 @@ impl GetTablesBuilder { ) } else { RecordBatch::try_new( - get_tables_schema(include_schema), + // schema is different if table_schema is none + schema, vec![ Arc::new(catalog_name) as ArrayRef, Arc::new(db_schema_name) as ArrayRef, @@ -271,10 +262,29 @@ impl GetTablesBuilder { .map(|c| take(c, &indices, None)) .collect::, _>>()?; - Ok(RecordBatch::try_new( - get_tables_schema(include_schema), - columns, - )?) + Ok(RecordBatch::try_new(filtered_batch.schema(), columns)?) 
+ } + + /// Return the schema of the RecordBatch that will be returned from [`CommandGetTables`] + /// + /// Note the schema differs based on the values of `include_schema + /// + /// [`CommandGetTables`]: crate::sql::CommandGetTables + pub fn schema(&self) -> SchemaRef { + get_tables_schema(self.include_schema()) + } + + /// Should the "schema" column be included + pub fn include_schema(&self) -> bool { + self.table_schema.is_some() + } +} + +fn get_tables_schema(include_schema: bool) -> SchemaRef { + if include_schema { + Arc::clone(&GET_TABLES_SCHEMA_WITH_TABLE_SCHEMA) + } else { + Arc::clone(&GET_TABLES_SCHEMA_WITHOUT_TABLE_SCHEMA) } } diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index 212655d66d01..4bb8ce8b36e5 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -27,7 +27,7 @@ //! 2. Helpers for encoding and decoding FlightSQL messages: [`Any`] and [`Command`] //! 3. A [`FlightSqlServiceClient`] for interacting with FlightSQL servers. //! 4. A [`FlightSqlService`] to help building FlightSQL servers from [`FlightService`]. -//! 5. Structures to build responses for FlightSQL metadata APIs: [`SqlInfoList`] +//! 5. Helpers to build responses for FlightSQL metadata APIs: [`metadata`] //! //! [Flight SQL]: https://arrow.apache.org/docs/format/FlightSql.html //! [Apache Arrow]: https://arrow.apache.org @@ -37,6 +37,7 @@ //! [`do_get`]: crate::flight_service_server::FlightService::do_get //! [`FlightSqlServiceClient`]: client::FlightSqlServiceClient //! [`FlightSqlService`]: server::FlightSqlService +//! [`metadata`]: crate::sql::metadata use arrow_schema::ArrowError; use bytes::Bytes; use paste::paste; @@ -98,12 +99,9 @@ pub use gen::UpdateDeleteRules; pub use gen::XdbcDataType; pub use gen::XdbcDatetimeSubcode; -pub use sql_info::SqlInfoList; - -pub mod catalogs; pub mod client; +pub mod metadata; pub mod server; -pub mod sql_info; /// ProstMessageExt are useful utility methods for prost::Message types pub trait ProstMessageExt: prost::Message + Default { From 8924d82a86aa450fd16d0a12db95d3c19d4d38e1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 14:28:35 +0100 Subject: [PATCH 0970/1411] Don't exclude FIFO files from LocalFileSystem (#4345) --- object_store/src/local.rs | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/object_store/src/local.rs b/object_store/src/local.rs index bbd54db2ea19..6039f8dbadf3 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -429,11 +429,11 @@ impl ObjectStore for LocalFileSystem { path: location.to_string(), }, }), - Ok(m) => match m.is_file() { + Ok(m) => match !m.is_dir() { true => Ok(m), false => Err(Error::NotFound { path, - source: io::Error::new(ErrorKind::NotFound, "is not file"), + source: io::Error::new(ErrorKind::NotFound, "is directory"), }), }, }?; @@ -897,11 +897,11 @@ fn open_file(path: &PathBuf) -> Result { source: e, }, }), - Ok((metadata, file)) => match metadata.is_file() { + Ok((metadata, file)) => match !metadata.is_dir() { true => Ok(file), false => Err(Error::NotFound { path: path.clone(), - source: io::Error::new(ErrorKind::NotFound, "not a file"), + source: io::Error::new(ErrorKind::NotFound, "is directory"), }), }, }?; @@ -1491,21 +1491,26 @@ mod unix_test { use crate::{ObjectStore, Path}; use nix::sys::stat; use nix::unistd; - use std::time::Duration; + use std::fs::OpenOptions; use tempfile::TempDir; - use tokio::time::timeout; 
#[tokio::test] - async fn test_head_fifo() { + async fn test_fifo() { let filename = "some_file"; let root = TempDir::new().unwrap(); let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - unistd::mkfifo(&root.path().join(filename), stat::Mode::S_IRWXU).unwrap(); + let path = root.path().join(filename); + unistd::mkfifo(&path, stat::Mode::S_IRWXU).unwrap(); + let location = Path::from(filename); - if (timeout(Duration::from_millis(10), integration.head(&location)).await) - .is_err() - { - panic!("Did not receive value within 10 ms"); - } + integration.head(&location).await.unwrap(); + + // Need to open read and write side in parallel + let spawned = tokio::task::spawn_blocking(|| { + OpenOptions::new().write(true).open(path).unwrap(); + }); + + integration.get(&location).await.unwrap(); + spawned.await.unwrap(); } } From a7164849c56be041fc9ade8f9a55efac40e91f99 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 14:29:00 +0100 Subject: [PATCH 0971/1411] Add roundtrip tests for Decimal256 and fix issues (#4264) (#4311) * Add roundtrip tests for Decimal256 and fix issues (#4264) * Review feedback --- parquet/src/arrow/arrow_reader/mod.rs | 58 ++++++++++++++++++++++++++- parquet/src/arrow/schema/mod.rs | 16 +------- parquet/src/arrow/schema/primitive.rs | 17 ++++++-- 3 files changed, 73 insertions(+), 18 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index deca0c719551..432b003990e5 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -543,13 +543,15 @@ mod tests { use std::sync::Arc; use bytes::Bytes; + use num::PrimInt; use rand::{thread_rng, Rng, RngCore}; use tempfile::tempfile; use arrow_array::builder::*; + use arrow_array::types::{Decimal128Type, Decimal256Type, DecimalType}; use arrow_array::*; use arrow_array::{RecordBatch, RecordBatchReader}; - use arrow_buffer::{i256, Buffer}; + use arrow_buffer::{i256, ArrowNativeType, Buffer}; use arrow_data::ArrayDataBuilder; use arrow_schema::{DataType as ArrowDataType, Field, Fields, Schema}; @@ -2554,4 +2556,58 @@ mod tests { assert_eq!(out.num_rows(), 1); assert_eq!(out, batch.slice(2, 1)); } + + fn test_decimal_roundtrip() { + // Precision <= 9 -> INT32 + // Precision <= 18 -> INT64 + // Precision > 18 -> FIXED_LEN_BYTE_ARRAY + + let d = |values: Vec, p: u8| { + let iter = values.into_iter().map(T::Native::usize_as); + PrimitiveArray::::from_iter_values(iter) + .with_precision_and_scale(p, 2) + .unwrap() + }; + + let d1 = d(vec![1, 2, 3, 4, 5], 9); + let d2 = d(vec![1, 2, 3, 4, 10.pow(10) - 1], 10); + let d3 = d(vec![1, 2, 3, 4, 10.pow(18) - 1], 18); + let d4 = d(vec![1, 2, 3, 4, 10.pow(19) - 1], 19); + + let batch = RecordBatch::try_from_iter([ + ("d1", Arc::new(d1) as ArrayRef), + ("d2", Arc::new(d2) as ArrayRef), + ("d3", Arc::new(d3) as ArrayRef), + ("d4", Arc::new(d4) as ArrayRef), + ]) + .unwrap(); + + let mut buffer = Vec::with_capacity(1024); + let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + let builder = + ParquetRecordBatchReaderBuilder::try_new(Bytes::from(buffer)).unwrap(); + let t1 = builder.parquet_schema().columns()[0].physical_type(); + assert_eq!(t1, PhysicalType::INT32); + let t2 = builder.parquet_schema().columns()[1].physical_type(); + assert_eq!(t2, PhysicalType::INT64); + let t3 = builder.parquet_schema().columns()[2].physical_type(); 
+ assert_eq!(t3, PhysicalType::INT64); + let t4 = builder.parquet_schema().columns()[3].physical_type(); + assert_eq!(t4, PhysicalType::FIXED_LEN_BYTE_ARRAY); + + let mut reader = builder.build().unwrap(); + assert_eq!(batch.schema(), reader.schema()); + + let out = reader.next().unwrap().unwrap(); + assert_eq!(batch, out); + } + + #[test] + fn test_decimal() { + test_decimal_roundtrip::(); + test_decimal_roundtrip::(); + } } diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 3b969104424d..7469d86dc667 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -443,7 +443,8 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_length(*length) .build() } - DataType::Decimal128(precision, scale) => { + DataType::Decimal128(precision, scale) + | DataType::Decimal256(precision, scale) => { // Decimal precision determines the Parquet physical type to use. // Following the: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal let (physical_type, length) = if *precision > 1 && *precision <= 9 { @@ -467,19 +468,6 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_scale(*scale as i32) .build() } - DataType::Decimal256(precision, scale) => { - // For the decimal256, use the fixed length byte array to store the data - Type::primitive_type_builder(name, PhysicalType::FIXED_LEN_BYTE_ARRAY) - .with_repetition(repetition) - .with_length(decimal_length_from_precision(*precision) as i32) - .with_logical_type(Some(LogicalType::Decimal { - scale: *scale as i32, - precision: *precision as i32, - })) - .with_precision(*precision as i32) - .with_scale(*scale as i32) - .build() - } DataType::Utf8 | DataType::LargeUtf8 => { Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) .with_logical_type(Some(LogicalType::String)) diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index 62133f157f37..c67f78076350 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -20,7 +20,7 @@ use crate::basic::{ }; use crate::errors::{ParquetError, Result}; use crate::schema::types::{BasicTypeInfo, Type}; -use arrow_schema::{DataType, IntervalUnit, TimeUnit}; +use arrow_schema::{DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION}; /// Converts [`Type`] to [`DataType`] with an optional `arrow_type_hint` /// provided by the arrow schema @@ -62,6 +62,9 @@ fn apply_hint(parquet: DataType, hint: DataType) -> DataType { // Determine interval time unit (#1666) (DataType::Interval(_), DataType::Interval(_)) => hint, + // Promote to Decimal256 + (DataType::Decimal128(_, _), DataType::Decimal256(_, _)) => hint, + // Potentially preserve dictionary encoding (_, DataType::Dictionary(_, value)) => { // Apply hint to inner type @@ -103,6 +106,14 @@ fn from_parquet(parquet_type: &Type) -> Result { } } +fn decimal_type(scale: i32, precision: i32) -> Result { + if precision <= DECIMAL128_MAX_PRECISION as _ { + decimal_128_type(scale, precision) + } else { + decimal_256_type(scale, precision) + } +} + fn decimal_128_type(scale: i32, precision: i32) -> Result { let scale = scale .try_into() @@ -255,8 +266,8 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32, scale: i32) -> Result decimal_128_type(s, p), - (None, ConvertedType::DECIMAL) => decimal_128_type(scale, precision), + ) => decimal_type(s, p), + (None, ConvertedType::DECIMAL) => decimal_type(scale, precision), (logical, converted) => Err(arrow_err!( "Unable to convert parquet BYTE_ARRAY 
logical type {:?} or converted type {}", logical, From 863d59981bb015570e0ed3a9618f3807750942ab Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Fri, 2 Jun 2023 16:32:20 +0300 Subject: [PATCH 0972/1411] feat: cast between `Intervals` (#4182) * feat: cast between Intervals * feat: cast: IntervalUnit::YearMonth -> IntervalUnit::MonthDayNano and IntervalUnit::DayTime -> IntervalUnit::MonthDayNano * refactoring --- arrow-cast/src/cast.rs | 117 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 112 insertions(+), 5 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 9652047c749e..839326d089fb 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -254,6 +254,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { }, (Duration(_), Interval(MonthDayNano)) => true, (Interval(MonthDayNano), Duration(_)) => true, + (Interval(IntervalUnit::YearMonth), Interval(IntervalUnit::MonthDayNano)) => true, + (Interval(IntervalUnit::DayTime), Interval(IntervalUnit::MonthDayNano)) => true, (_, _) => false, } } @@ -409,6 +411,33 @@ where } } +/// Cast the array from interval year month to month day nano +fn cast_interval_year_month_to_interval_month_day_nano( + array: &dyn Array, + _cast_options: &CastOptions, +) -> Result { + let array = array.as_primitive::(); + + Ok(Arc::new(array.unary::<_, IntervalMonthDayNanoType>(|v| { + let months = IntervalYearMonthType::to_months(v); + IntervalMonthDayNanoType::make_value(months, 0, 0) + }))) +} + +/// Cast the array from interval day time to month day nano +fn cast_interval_day_time_to_interval_month_day_nano( + array: &dyn Array, + _cast_options: &CastOptions, +) -> Result { + let array = array.as_primitive::(); + let mul = 1_000_000; + + Ok(Arc::new(array.unary::<_, IntervalMonthDayNanoType>(|v| { + let (days, ms) = IntervalDayTimeType::to_parts(v); + IntervalMonthDayNanoType::make_value(0, days, ms as i64 * mul) + }))) +} + /// Cast the array from interval to duration fn cast_interval_to_duration>( array: &dyn Array, @@ -2137,18 +2166,24 @@ pub fn cast_with_options( (Duration(TimeUnit::Nanosecond), Interval(IntervalUnit::MonthDayNano)) => { cast_duration_to_interval::(array, cast_options) } - (DataType::Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Second)) => { + (Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Second)) => { cast_interval_to_duration::(array, cast_options) } - (DataType::Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Millisecond)) => { + (Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Millisecond)) => { cast_interval_to_duration::(array, cast_options) } - (DataType::Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Microsecond)) => { + (Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Microsecond)) => { cast_interval_to_duration::(array, cast_options) } - (DataType::Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Nanosecond)) => { + (Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Nanosecond)) => { cast_interval_to_duration::(array, cast_options) } + (Interval(IntervalUnit::YearMonth), Interval(IntervalUnit::MonthDayNano)) => { + cast_interval_year_month_to_interval_month_day_nano(array, cast_options) + } + (Interval(IntervalUnit::DayTime), Interval(IntervalUnit::MonthDayNano)) => { + cast_interval_day_time_to_interval_month_day_nano(array, cast_options) + } (Interval(IntervalUnit::YearMonth), Int64) => { 
cast_numeric_arrays::(array, cast_options) } @@ -9030,7 +9065,7 @@ mod tests { assert_eq!(casted_array.value(0), 9223372036854775807); } - // helper function to test casting from interval to duration + /// helper function to test casting from interval to duration fn cast_from_interval_to_duration( array: Vec, cast_options: &CastOptions, @@ -9173,6 +9208,78 @@ mod tests { assert!(casted_array.is_err()); } + /// helper function to test casting from interval year month to interval month day nano + fn cast_from_interval_year_month_to_interval_month_day_nano( + array: Vec, + cast_options: &CastOptions, + ) -> Result, ArrowError> { + let array = PrimitiveArray::::from(array); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Interval(IntervalUnit::MonthDayNano), + cast_options, + )?; + casted_array + .as_any() + .downcast_ref::() + .ok_or_else(|| { + ArrowError::ComputeError( + "Failed to downcast to IntervalMonthDayNanoArray".to_string(), + ) + }) + .cloned() + } + + #[test] + fn test_cast_from_interval_year_month_to_interval_month_day_nano() { + // from interval year month to interval month day nano + let array = vec![1234567]; + let casted_array = cast_from_interval_year_month_to_interval_month_day_nano( + array, + &CastOptions::default(), + ) + .unwrap(); + assert_eq!( + casted_array.data_type(), + &DataType::Interval(IntervalUnit::MonthDayNano) + ); + assert_eq!(casted_array.value(0), 97812474910747780469848774134464512); + } + + /// helper function to test casting from interval day time to interval month day nano + fn cast_from_interval_day_time_to_interval_month_day_nano( + array: Vec, + cast_options: &CastOptions, + ) -> Result, ArrowError> { + let array = PrimitiveArray::::from(array); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Interval(IntervalUnit::MonthDayNano), + cast_options, + )?; + Ok(casted_array + .as_primitive::() + .clone()) + } + + #[test] + fn test_cast_from_interval_day_time_to_interval_month_day_nano() { + // from interval day time to interval month day nano + let array = vec![123]; + let casted_array = cast_from_interval_day_time_to_interval_month_day_nano( + array, + &CastOptions::default(), + ) + .unwrap(); + assert_eq!( + casted_array.data_type(), + &DataType::Interval(IntervalUnit::MonthDayNano) + ); + assert_eq!(casted_array.value(0), 123000000); + } + #[test] fn test_cast_below_unixtimestamp() { let valid = StringArray::from(vec![ From fdf37a08e961117857933d016960b37d304a6910 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 15:40:28 +0100 Subject: [PATCH 0973/1411] Fix support for ECS IAM credentials (#4310) --- object_store/src/aws/credential.rs | 45 +++++++++++++++++++++++++- object_store/src/aws/mod.rs | 51 ++++++++++++++++++++---------- 2 files changed, 79 insertions(+), 17 deletions(-) diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index 909dde072193..be0ffa578d13 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -20,7 +20,7 @@ use crate::client::retry::RetryExt; use crate::client::token::{TemporaryToken, TokenCache}; use crate::client::TokenProvider; use crate::util::hmac_sha256; -use crate::{Result, RetryConfig}; +use crate::{CredentialProvider, Result, RetryConfig}; use async_trait::async_trait; use bytes::Buf; use chrono::{DateTime, Utc}; @@ -542,6 +542,49 @@ async fn web_identity( }) } +/// 
Credentials sourced from a task IAM role +/// +/// +#[derive(Debug)] +pub struct TaskCredentialProvider { + pub url: String, + pub retry: RetryConfig, + pub client: Client, + pub cache: TokenCache>, +} + +#[async_trait] +impl CredentialProvider for TaskCredentialProvider { + type Credential = AwsCredential; + + async fn get_credential(&self) -> Result> { + self.cache + .get_or_insert_with(|| task_credential(&self.client, &self.retry, &self.url)) + .await + .map_err(|source| crate::Error::Generic { + store: STORE, + source, + }) + } +} + +/// +async fn task_credential( + client: &Client, + retry: &RetryConfig, + url: &str, +) -> Result>, StdError> { + let creds: InstanceCredentials = + client.get(url).send_retry(retry).await?.json().await?; + + let now = Utc::now(); + let ttl = (creds.expiration - now).to_std().unwrap_or_default(); + Ok(TemporaryToken { + token: Arc::new(creds.into()), + expiry: Some(Instant::now() + ttl), + }) +} + #[cfg(test)] mod tests { use super::*; diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 8de4b7c6afa1..8a486f986792 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -46,7 +46,9 @@ use url::Url; pub use crate::aws::checksum::Checksum; use crate::aws::client::{S3Client, S3Config}; -use crate::aws::credential::{InstanceCredentialProvider, WebIdentityProvider}; +use crate::aws::credential::{ + InstanceCredentialProvider, TaskCredentialProvider, WebIdentityProvider, +}; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; use crate::client::{ @@ -87,9 +89,6 @@ pub use credential::{AwsAuthorizer, AwsCredential}; /// Default metadata endpoint static DEFAULT_METADATA_ENDPOINT: &str = "http://169.254.169.254"; -/// ECS metadata endpoint -static ECS_METADATA_ENDPOINT: &str = "http://169.254.170.2"; - /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -399,6 +398,8 @@ pub struct AmazonS3Builder { checksum_algorithm: Option>, /// Metadata endpoint, see metadata_endpoint: Option, + /// Container credentials URL, see + container_credentials_relative_uri: Option, /// Client options client_options: ClientOptions, /// Credentials @@ -529,6 +530,11 @@ pub enum AmazonS3ConfigKey { /// - `metadata_endpoint` MetadataEndpoint, + /// Set the container credentials relative URI + /// + /// + ContainerCredentialsRelativeUri, + /// Client options Client(ClientConfigKey), } @@ -548,6 +554,9 @@ impl AsRef for AmazonS3ConfigKey { Self::MetadataEndpoint => "aws_metadata_endpoint", Self::UnsignedPayload => "aws_unsigned_payload", Self::Checksum => "aws_checksum_algorithm", + Self::ContainerCredentialsRelativeUri => { + "aws_container_credentials_relative_uri" + } Self::Client(opt) => opt.as_ref(), } } @@ -578,6 +587,9 @@ impl FromStr for AmazonS3ConfigKey { "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), "aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), + "aws_container_credentials_relative_uri" => { + Ok(Self::ContainerCredentialsRelativeUri) + } // Backwards compatibility "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), _ => match s.parse() { @@ -625,15 +637,6 @@ impl AmazonS3Builder { } } - // This env var is set in ECS - // https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-iam-roles.html - if let Ok(metadata_relative_uri) = - std::env::var("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") - { - 
builder.metadata_endpoint = - Some(format!("{ECS_METADATA_ENDPOINT}{metadata_relative_uri}")); - } - builder } @@ -691,6 +694,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::Checksum => { self.checksum_algorithm = Some(ConfigValue::Deferred(value.into())) } + AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { + self.container_credentials_relative_uri = Some(value.into()) + } AmazonS3ConfigKey::Client(key) => { self.client_options = self.client_options.with_config(key, value) } @@ -758,6 +764,9 @@ impl AmazonS3Builder { self.checksum_algorithm.as_ref().map(ToString::to_string) } AmazonS3ConfigKey::Client(key) => self.client_options.get_config_value(key), + AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { + self.container_credentials_relative_uri.clone() + } } } @@ -999,6 +1008,15 @@ impl AmazonS3Builder { client, self.retry_config.clone(), )) as _ + } else if let Some(uri) = self.container_credentials_relative_uri { + info!("Using Task credential provider"); + Arc::new(TaskCredentialProvider { + url: format!("http://169.254.170.2{uri}"), + retry: self.retry_config.clone(), + // The instance metadata endpoint is access over HTTP + client: self.client_options.clone().with_allow_http(true).client()?, + cache: Default::default(), + }) as _ } else { info!("Using Instance credential provider"); @@ -1199,9 +1217,10 @@ mod tests { assert_eq!(builder.endpoint.unwrap(), aws_endpoint); assert_eq!(builder.token.unwrap(), aws_session_token); - let metadata_uri = - format!("{ECS_METADATA_ENDPOINT}{container_creds_relative_uri}"); - assert_eq!(builder.metadata_endpoint.unwrap(), metadata_uri); + assert_eq!( + builder.container_credentials_relative_uri.unwrap(), + container_creds_relative_uri + ); assert_eq!( builder.checksum_algorithm.unwrap().get().unwrap(), Checksum::SHA256 From b46ea46aa65149fac763671b0adcb9c4e406ec11 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 16:46:32 +0100 Subject: [PATCH 0974/1411] Fix MutableArrayData::extend_nulls (#1230) (#4343) --- arrow-data/src/transform/mod.rs | 56 ++++++++++++++++++++------------- arrow/tests/array_transform.rs | 23 ++++++++++++++ 2 files changed, 58 insertions(+), 21 deletions(-) diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index c74875072233..f4b2b46d1723 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -53,7 +53,7 @@ struct _MutableArrayData<'a> { pub null_count: usize, pub len: usize, - pub null_buffer: MutableBuffer, + pub null_buffer: Option, // arrow specification only allows up to 3 buffers (2 ignoring the nulls above). // Thus, we place them in the stack to avoid bound checks and greater data locality. 
@@ -63,6 +63,12 @@ struct _MutableArrayData<'a> { } impl<'a> _MutableArrayData<'a> { + fn null_buffer(&mut self) -> &mut MutableBuffer { + self.null_buffer + .as_mut() + .expect("MutableArrayData not nullable") + } + fn freeze(self, dictionary: Option) -> ArrayDataBuilder { let buffers = into_buffers(&self.data_type, self.buffer1, self.buffer2); @@ -77,10 +83,13 @@ impl<'a> _MutableArrayData<'a> { } }; - let nulls = (self.null_count > 0).then(|| { - let bools = BooleanBuffer::new(self.null_buffer.into(), 0, self.len); - unsafe { NullBuffer::new_unchecked(bools, self.null_count) } - }); + let nulls = self + .null_buffer + .map(|nulls| { + let bools = BooleanBuffer::new(nulls.into(), 0, self.len); + unsafe { NullBuffer::new_unchecked(bools, self.null_count) } + }) + .filter(|n| n.null_count() > 0); ArrayDataBuilder::new(self.data_type) .offset(0) @@ -95,22 +104,25 @@ fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits if let Some(nulls) = array.nulls() { let bytes = nulls.validity(); Box::new(move |mutable, start, len| { - utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); + let mutable_len = mutable.len; + let out = mutable.null_buffer(); + utils::resize_for_bits(out, mutable_len + len); mutable.null_count += set_bits( - mutable.null_buffer.as_slice_mut(), + out.as_slice_mut(), bytes, - mutable.len, + mutable_len, nulls.offset() + start, len, ); }) } else if use_nulls { Box::new(|mutable, _, len| { - utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); - let write_data = mutable.null_buffer.as_slice_mut(); - let offset = mutable.len; + let mutable_len = mutable.len; + let out = mutable.null_buffer(); + utils::resize_for_bits(out, mutable_len + len); + let write_data = out.as_slice_mut(); (0..len).for_each(|i| { - bit_util::set_bit(write_data, offset + i); + bit_util::set_bit(write_data, mutable_len + i); }); }) } else { @@ -555,13 +567,10 @@ impl<'a> MutableArrayData<'a> { .map(|array| build_extend_null_bits(array, use_nulls)) .collect(); - let null_buffer = if use_nulls { + let null_buffer = use_nulls.then(|| { let null_bytes = bit_util::ceil(array_capacity, 8); MutableBuffer::from_len_zeroed(null_bytes) - } else { - // create 0 capacity mutable buffer with the intention that it won't be used - MutableBuffer::with_capacity(0) - }; + }); let extend_values = match &data_type { DataType::Dictionary(_, _) => { @@ -624,13 +633,18 @@ impl<'a> MutableArrayData<'a> { } /// Extends this [MutableArrayData] with null elements, disregarding the bound arrays + /// + /// # Panics + /// + /// Panics if [`MutableArrayData`] not created with `use_nulls` or nullable source arrays + /// pub fn extend_nulls(&mut self, len: usize) { - // TODO: null_buffer should probably be extended here as well - // otherwise is_valid() could later panic - // add test to confirm + self.data.len += len; + let bit_len = bit_util::ceil(self.data.len, 8); + let nulls = self.data.null_buffer(); + nulls.resize(bit_len, 0); self.data.null_count += len; (self.extend_nulls)(&mut self.data, len); - self.data.len += len; } /// Returns the current length diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 40938c80f4c3..ebbadc00aecd 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -922,6 +922,29 @@ fn test_fixed_size_binary_append() { assert_eq!(result, expected); } +#[test] +fn test_extend_nulls() { + let int = Int32Array::from(vec![1, 2, 3, 4]).into_data(); + let mut mutable = MutableArrayData::new(vec![&int], 
true, 4); + mutable.extend(0, 2, 3); + mutable.extend_nulls(2); + + let data = mutable.freeze(); + data.validate_full().unwrap(); + let out = Int32Array::from(data); + + assert_eq!(out.null_count(), 2); + assert_eq!(out.iter().collect::>(), vec![Some(3), None, None]); +} + +#[test] +#[should_panic(expected = "MutableArrayData not nullable")] +fn test_extend_nulls_panic() { + let int = Int32Array::from(vec![1, 2, 3, 4]).into_data(); + let mut mutable = MutableArrayData::new(vec![&int], false, 4); + mutable.extend_nulls(2); +} + /* // this is an old test used on a meanwhile removed dead code // that is still useful when `MutableArrayData` supports fixed-size lists. From 008cf9c27424d581a67ba97f338a22b6eace9cc1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:12:37 +0100 Subject: [PATCH 0975/1411] Don't infer 16-byte decimal as decimal256 (#4349) --- parquet/src/arrow/schema/mod.rs | 14 ++++++++++++-- parquet/src/arrow/schema/primitive.rs | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 7469d86dc667..cd6e8046cc63 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -607,6 +607,9 @@ mod tests { REQUIRED INT64 decimal2 (DECIMAL(12,2)); REQUIRED FIXED_LEN_BYTE_ARRAY (16) decimal3 (DECIMAL(30,2)); REQUIRED BYTE_ARRAY decimal4 (DECIMAL(33,2)); + REQUIRED BYTE_ARRAY decimal5 (DECIMAL(38,2)); + REQUIRED FIXED_LEN_BYTE_ARRAY (17) decimal6 (DECIMAL(39,2)); + REQUIRED BYTE_ARRAY decimal7 (DECIMAL(39,2)); } "; @@ -619,8 +622,11 @@ mod tests { let arrow_fields = Fields::from(vec![ Field::new("decimal1", DataType::Decimal128(4, 2), false), Field::new("decimal2", DataType::Decimal128(12, 2), false), - Field::new("decimal3", DataType::Decimal256(30, 2), false), + Field::new("decimal3", DataType::Decimal128(30, 2), false), Field::new("decimal4", DataType::Decimal128(33, 2), false), + Field::new("decimal5", DataType::Decimal128(38, 2), false), + Field::new("decimal6", DataType::Decimal256(39, 2), false), + Field::new("decimal7", DataType::Decimal256(39, 2), false), ]); assert_eq!(&arrow_fields, converted_arrow_schema.fields()); } @@ -1389,6 +1395,8 @@ mod tests { REQUIRED INT32 decimal_int32 (DECIMAL(8,2)); REQUIRED INT64 decimal_int64 (DECIMAL(16,2)); REQUIRED FIXED_LEN_BYTE_ARRAY (13) decimal_fix_length (DECIMAL(30,2)); + REQUIRED FIXED_LEN_BYTE_ARRAY (16) decimal128 (DECIMAL(38,2)); + REQUIRED FIXED_LEN_BYTE_ARRAY (17) decimal256 (DECIMAL(39,2)); } "; let parquet_group_type = parse_message_type(message_type).unwrap(); @@ -1473,7 +1481,9 @@ mod tests { ), Field::new("decimal_int32", DataType::Decimal128(8, 2), false), Field::new("decimal_int64", DataType::Decimal128(16, 2), false), - Field::new("decimal_fix_length", DataType::Decimal256(30, 2), false), + Field::new("decimal_fix_length", DataType::Decimal128(30, 2), false), + Field::new("decimal128", DataType::Decimal128(38, 2), false), + Field::new("decimal256", DataType::Decimal256(39, 2), false), ]; let arrow_schema = Schema::new(arrow_fields); let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema).unwrap(); diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index c67f78076350..83d84b77ec06 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -285,14 +285,14 @@ fn from_fixed_len_byte_array( // TODO: This should check the type length for the decimal and interval 
types match (info.logical_type(), info.converted_type()) { (Some(LogicalType::Decimal { scale, precision }), _) => { - if type_length < 16 { + if type_length <= 16 { decimal_128_type(scale, precision) } else { decimal_256_type(scale, precision) } } (None, ConvertedType::DECIMAL) => { - if type_length < 16 { + if type_length <= 16 { decimal_128_type(scale, precision) } else { decimal_256_type(scale, precision) From ec0f75aeabd07d06395b70ce2e4e3573da520ba7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 18:39:59 +0100 Subject: [PATCH 0976/1411] Fix ObjectStore::get_range for GetResult::File (#4350) (#4351) * Fix ObjectStore::get_range for GetResult::File (#4350) * Review feedback --- object_store/Cargo.toml | 4 + object_store/src/lib.rs | 14 +++- object_store/src/local.rs | 2 +- object_store/tests/get_range_file.rs | 116 +++++++++++++++++++++++++++ 4 files changed, 133 insertions(+), 3 deletions(-) create mode 100644 object_store/tests/get_range_file.rs diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 28bf29f7f1e0..4002a1865fa6 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -75,3 +75,7 @@ tempfile = "3.1.0" futures-test = "0.3" rand = "0.8" hyper = { version = "0.14.24", features = ["server"] } + +[[test]] +name = "get_range_file" +path = "tests/get_range_file.rs" diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 98bbb7adceb9..864cabc4a8c0 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -359,10 +359,20 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// in the given byte range async fn get_range(&self, location: &Path, range: Range) -> Result { let options = GetOptions { - range: Some(range), + range: Some(range.clone()), ..Default::default() }; - self.get_opts(location, options).await?.bytes().await + // Temporary until GetResult::File supports range (#4352) + match self.get_opts(location, options).await? { + GetResult::Stream(s) => collect_bytes(s, None).await, + #[cfg(not(target_arch = "wasm32"))] + GetResult::File(mut file, path) => { + maybe_spawn_blocking(move || local::read_range(&mut file, &path, range)) + .await + } + #[cfg(target_arch = "wasm32")] + _ => unimplemented!("File IO not implemented on wasm32."), + } } /// Return the bytes that are stored at the specified location diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 6039f8dbadf3..ffff6a5739d5 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -863,7 +863,7 @@ impl AsyncWrite for LocalUpload { } } -fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { +pub(crate) fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { let to_read = range.end - range.start; file.seek(SeekFrom::Start(range.start as u64)) .context(SeekSnafu { path })?; diff --git a/object_store/tests/get_range_file.rs b/object_store/tests/get_range_file.rs new file mode 100644 index 000000000000..f926e3b07f2a --- /dev/null +++ b/object_store/tests/get_range_file.rs @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests the default implementation of get_range handles GetResult::File correctly (#4350) + +use async_trait::async_trait; +use bytes::Bytes; +use futures::stream::BoxStream; +use object_store::local::LocalFileSystem; +use object_store::path::Path; +use object_store::{ + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, +}; +use std::fmt::Formatter; +use tempfile::tempdir; +use tokio::io::AsyncWrite; + +#[derive(Debug)] +struct MyStore(LocalFileSystem); + +impl std::fmt::Display for MyStore { + fn fmt(&self, _: &mut Formatter<'_>) -> std::fmt::Result { + todo!() + } +} + +#[async_trait] +impl ObjectStore for MyStore { + async fn put(&self, path: &Path, data: Bytes) -> object_store::Result<()> { + self.0.put(path, data).await + } + + async fn put_multipart( + &self, + _: &Path, + ) -> object_store::Result<(MultipartId, Box)> { + todo!() + } + + async fn abort_multipart( + &self, + _: &Path, + _: &MultipartId, + ) -> object_store::Result<()> { + todo!() + } + + async fn get_opts( + &self, + location: &Path, + options: GetOptions, + ) -> object_store::Result { + self.0.get_opts(location, options).await + } + + async fn head(&self, _: &Path) -> object_store::Result { + todo!() + } + + async fn delete(&self, _: &Path) -> object_store::Result<()> { + todo!() + } + + async fn list( + &self, + _: Option<&Path>, + ) -> object_store::Result>> { + todo!() + } + + async fn list_with_delimiter( + &self, + _: Option<&Path>, + ) -> object_store::Result { + todo!() + } + + async fn copy(&self, _: &Path, _: &Path) -> object_store::Result<()> { + todo!() + } + + async fn copy_if_not_exists(&self, _: &Path, _: &Path) -> object_store::Result<()> { + todo!() + } +} + +#[tokio::test] +async fn test_get_range() { + let tmp = tempdir().unwrap(); + let store = MyStore(LocalFileSystem::new_with_prefix(tmp.path()).unwrap()); + let path = Path::from("foo"); + + let expected = Bytes::from_static(b"hello world"); + store.put(&path, expected.clone()).await.unwrap(); + let fetched = store.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(expected, fetched); + + for range in [0..10, 3..5, 0..expected.len()] { + let data = store.get_range(&path, range.clone()).await.unwrap(); + assert_eq!(&data[..], &expected[range]) + } +} From f323097584eaa8edb1193b4fb67bccadd39594f6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 19:01:44 +0100 Subject: [PATCH 0977/1411] Prepare object_store 0.6.1 (#4348) * Prepare object_store 0.6.1 * Final tweaks --- object_store/CHANGELOG-old.md | 64 +++++++++++++++++ object_store/CHANGELOG.md | 68 ++++--------------- object_store/Cargo.toml | 2 +- object_store/dev/release/update_change_log.sh | 4 +- 4 files changed, 81 insertions(+), 57 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index c9c4e28dca05..3880205bc05e 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -19,6 +19,70 @@ # Historical Changelog +## [object_store_0.6.0](https://github.com/apache/arrow-rs/tree/object_store_0.6.0) (2023-05-18) + 
+[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.6...object_store_0.6.0) + +**Breaking changes:** + +- Add ObjectStore::get\_opts \(\#2241\) [\#4212](https://github.com/apache/arrow-rs/pull/4212) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Simplify ObjectStore configuration pattern [\#4189](https://github.com/apache/arrow-rs/pull/4189) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: fix: Incorrect parsing of https Path Style S3 url [\#4082](https://github.com/apache/arrow-rs/pull/4082) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- feat: add etag for objectMeta [\#3937](https://github.com/apache/arrow-rs/pull/3937) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Weijun-H](https://github.com/Weijun-H)) + +**Implemented enhancements:** + +- Object Store Authorization [\#4223](https://github.com/apache/arrow-rs/issues/4223) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Use XML API for GCS [\#4209](https://github.com/apache/arrow-rs/issues/4209) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ObjectStore with\_url Should Handle Path [\#4199](https://github.com/apache/arrow-rs/issues/4199) +- Return Error on Invalid Config Value [\#4191](https://github.com/apache/arrow-rs/issues/4191) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Extensible ObjectStore Authentication [\#4163](https://github.com/apache/arrow-rs/issues/4163) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: When using an AWS profile, obtain the default AWS region from the active profile [\#4158](https://github.com/apache/arrow-rs/issues/4158) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- InMemory append API [\#4152](https://github.com/apache/arrow-rs/issues/4152) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support accessing ipc Reader/Writer inner by reference [\#4121](https://github.com/apache/arrow-rs/issues/4121) +- \[object\_store\] Retry requests on connection error [\#4119](https://github.com/apache/arrow-rs/issues/4119) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Instantiate object store from provided url with store options [\#4047](https://github.com/apache/arrow-rs/issues/4047) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Builders \(S3/Azure/GCS\) are missing the `get method` to get the actual configuration information [\#4021](https://github.com/apache/arrow-rs/issues/4021) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- ObjectStore::head Returns Directory for LocalFileSystem and Hierarchical Azure [\#4230](https://github.com/apache/arrow-rs/issues/4230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: different behavior from aws cli for default profile [\#4137](https://github.com/apache/arrow-rs/issues/4137) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ImdsManagedIdentityOAuthProvider should send resource ID instead of OIDC scope [\#4096](https://github.com/apache/arrow-rs/issues/4096) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Update readme to remove reference to Jira [\#4091](https://github.com/apache/arrow-rs/issues/4091) +- object\_store: Incorrect parsing of https Path Style S3 url [\#4078](https://github.com/apache/arrow-rs/issues/4078) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] `local::tests::test_list_root` test fails during release verification [\#3772](https://github.com/apache/arrow-rs/issues/3772) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Remove AWS\_PROFILE support [\#4238](https://github.com/apache/arrow-rs/pull/4238) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Expose AwsAuthorizer [\#4237](https://github.com/apache/arrow-rs/pull/4237) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Expose CredentialProvider [\#4235](https://github.com/apache/arrow-rs/pull/4235) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Return NotFound for directories in Head and Get \(\#4230\) [\#4231](https://github.com/apache/arrow-rs/pull/4231) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Standardise credentials API \(\#4223\) \(\#4163\) [\#4225](https://github.com/apache/arrow-rs/pull/4225) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Extract Common Listing and Retrieval Functionality [\#4220](https://github.com/apache/arrow-rs/pull/4220) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat\(object-store\): extend Options API for http client [\#4208](https://github.com/apache/arrow-rs/pull/4208) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) +- Consistently use GCP XML API [\#4207](https://github.com/apache/arrow-rs/pull/4207) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Implement list\_with\_offset for PrefixStore [\#4203](https://github.com/apache/arrow-rs/pull/4203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Allow setting ClientOptions with Options API [\#4202](https://github.com/apache/arrow-rs/pull/4202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Create ObjectStore from URL and Options \(\#4047\) [\#4200](https://github.com/apache/arrow-rs/pull/4200) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Skip test\_list\_root on OS X \(\#3772\) [\#4198](https://github.com/apache/arrow-rs/pull/4198) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Recognise R2 URLs for S3 object store \(\#4190\) [\#4194](https://github.com/apache/arrow-rs/pull/4194) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix ImdsManagedIdentityProvider \(\#4096\) [\#4193](https://github.com/apache/arrow-rs/pull/4193) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Deffered Object Store Config Parsing \(\#4191\) [\#4192](https://github.com/apache/arrow-rs/pull/4192) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Object Store \(AWS\): Support dynamically resolving S3 bucket region [\#4188](https://github.com/apache/arrow-rs/pull/4188) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mr-brobot](https://github.com/mr-brobot)) +- Faster prefix match in object\_store path handling [\#4164](https://github.com/apache/arrow-rs/pull/4164) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Object Store \(AWS\): Support region configured via named profile [\#4161](https://github.com/apache/arrow-rs/pull/4161) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mr-brobot](https://github.com/mr-brobot)) +- InMemory append API [\#4153](https://github.com/apache/arrow-rs/pull/4153) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([berkaysynnada](https://github.com/berkaysynnada)) +- docs: fix the wrong ln command in CONTRIBUTING.md [\#4139](https://github.com/apache/arrow-rs/pull/4139) ([SteveLauC](https://github.com/SteveLauC)) +- Display the file path in the error message when failed to open credentials file for GCS [\#4124](https://github.com/apache/arrow-rs/pull/4124) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([haoxins](https://github.com/haoxins)) +- Retry on Connection Errors [\#4120](https://github.com/apache/arrow-rs/pull/4120) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kindly](https://github.com/kindly)) +- Simplify reference to GitHub issues [\#4092](https://github.com/apache/arrow-rs/pull/4092) ([bkmgit](https://github.com/bkmgit)) +- Use reqwest build\_split [\#4039](https://github.com/apache/arrow-rs/pull/4039) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix object\_store CI [\#4037](https://github.com/apache/arrow-rs/pull/4037) ([tustvold](https://github.com/tustvold)) +- Add get\_config\_value to AWS/Azure/GCP Builders [\#4035](https://github.com/apache/arrow-rs/pull/4035) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([r4ntix](https://github.com/r4ntix)) +- Update AWS SDK [\#3993](https://github.com/apache/arrow-rs/pull/3993) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + ## [object_store_0.5.6](https://github.com/apache/arrow-rs/tree/object_store_0.5.6) (2023-03-30) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.5...object_store_0.5.6) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index bde0f752323e..fe25e23fb768 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,70 +19,30 @@ # Changelog -## [object_store_0.6.0](https://github.com/apache/arrow-rs/tree/object_store_0.6.0) (2023-05-18) +## [object_store_0.6.1](https://github.com/apache/arrow-rs/tree/object_store_0.6.1) (2023-06-02) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.6...object_store_0.6.0) - -**Breaking changes:** - -- Add ObjectStore::get\_opts \(\#2241\) [\#4212](https://github.com/apache/arrow-rs/pull/4212) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Simplify ObjectStore configuration pattern [\#4189](https://github.com/apache/arrow-rs/pull/4189) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- object\_store: fix: Incorrect parsing of https Path Style S3 url [\#4082](https://github.com/apache/arrow-rs/pull/4082) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- feat: add etag for objectMeta [\#3937](https://github.com/apache/arrow-rs/pull/3937) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Weijun-H](https://github.com/Weijun-H)) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.6.0...object_store_0.6.1) **Implemented enhancements:** -- Object Store Authorization [\#4223](https://github.com/apache/arrow-rs/issues/4223) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Use XML API for GCS [\#4209](https://github.com/apache/arrow-rs/issues/4209) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- ObjectStore with\_url Should Handle Path [\#4199](https://github.com/apache/arrow-rs/issues/4199) -- Return Error on Invalid Config Value [\#4191](https://github.com/apache/arrow-rs/issues/4191) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Extensible ObjectStore Authentication [\#4163](https://github.com/apache/arrow-rs/issues/4163) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: When using an AWS profile, obtain the default AWS region from the active profile [\#4158](https://github.com/apache/arrow-rs/issues/4158) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- InMemory append API [\#4152](https://github.com/apache/arrow-rs/issues/4152) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support accessing ipc Reader/Writer inner by reference [\#4121](https://github.com/apache/arrow-rs/issues/4121) -- \[object\_store\] Retry requests on connection error [\#4119](https://github.com/apache/arrow-rs/issues/4119) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Instantiate object store from provided url with store options [\#4047](https://github.com/apache/arrow-rs/issues/4047) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Builders \(S3/Azure/GCS\) are missing the `get method` to get the actual configuration information [\#4021](https://github.com/apache/arrow-rs/issues/4021) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support multipart upload in R2 [\#4304](https://github.com/apache/arrow-rs/issues/4304) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- ObjectStore::head Returns Directory for LocalFileSystem and Hierarchical Azure [\#4230](https://github.com/apache/arrow-rs/issues/4230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: different behavior from aws cli for default profile [\#4137](https://github.com/apache/arrow-rs/issues/4137) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- ImdsManagedIdentityOAuthProvider should send resource ID instead of OIDC scope [\#4096](https://github.com/apache/arrow-rs/issues/4096) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Update readme to remove reference to Jira [\#4091](https://github.com/apache/arrow-rs/issues/4091) -- object\_store: Incorrect parsing of https Path Style S3 url [\#4078](https://github.com/apache/arrow-rs/issues/4078) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- \[object\_store\] `local::tests::test_list_root` test fails during release verification [\#3772](https://github.com/apache/arrow-rs/issues/3772) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Default ObjectStore::get\_range Doesn't Apply Range to GetResult::File [\#4350](https://github.com/apache/arrow-rs/issues/4350) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -**Merged pull requests:** +**Closed issues:** -- Remove AWS\_PROFILE support [\#4238](https://github.com/apache/arrow-rs/pull/4238) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Expose AwsAuthorizer [\#4237](https://github.com/apache/arrow-rs/pull/4237) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Expose CredentialProvider [\#4235](https://github.com/apache/arrow-rs/pull/4235) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Return NotFound for directories in Head and Get \(\#4230\) [\#4231](https://github.com/apache/arrow-rs/pull/4231) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Standardise credentials API \(\#4223\) \(\#4163\) [\#4225](https://github.com/apache/arrow-rs/pull/4225) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Extract Common Listing and Retrieval Functionality [\#4220](https://github.com/apache/arrow-rs/pull/4220) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- feat\(object-store\): extend Options API for http client [\#4208](https://github.com/apache/arrow-rs/pull/4208) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([roeap](https://github.com/roeap)) -- Consistently use GCP XML API [\#4207](https://github.com/apache/arrow-rs/pull/4207) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Implement list\_with\_offset for PrefixStore [\#4203](https://github.com/apache/arrow-rs/pull/4203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Allow setting ClientOptions with Options API [\#4202](https://github.com/apache/arrow-rs/pull/4202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Create ObjectStore from URL and Options \(\#4047\) [\#4200](https://github.com/apache/arrow-rs/pull/4200) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Skip test\_list\_root on OS X \(\#3772\) [\#4198](https://github.com/apache/arrow-rs/pull/4198) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Recognise R2 URLs for S3 object store \(\#4190\) [\#4194](https://github.com/apache/arrow-rs/pull/4194) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix ImdsManagedIdentityProvider \(\#4096\) [\#4193](https://github.com/apache/arrow-rs/pull/4193) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Deffered Object Store Config Parsing \(\#4191\) [\#4192](https://github.com/apache/arrow-rs/pull/4192) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Object Store \(AWS\): Support dynamically resolving S3 bucket region [\#4188](https://github.com/apache/arrow-rs/pull/4188) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mr-brobot](https://github.com/mr-brobot)) -- Faster prefix match in object\_store path handling [\#4164](https://github.com/apache/arrow-rs/pull/4164) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Object Store \(AWS\): Support region configured via named profile [\#4161](https://github.com/apache/arrow-rs/pull/4161) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mr-brobot](https://github.com/mr-brobot)) -- InMemory append API [\#4153](https://github.com/apache/arrow-rs/pull/4153) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([berkaysynnada](https://github.com/berkaysynnada)) -- docs: fix the wrong ln command in CONTRIBUTING.md [\#4139](https://github.com/apache/arrow-rs/pull/4139) ([SteveLauC](https://github.com/SteveLauC)) -- Display the file path in the error message when failed to open credentials file for GCS [\#4124](https://github.com/apache/arrow-rs/pull/4124) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([haoxins](https://github.com/haoxins)) -- Retry on Connection Errors [\#4120](https://github.com/apache/arrow-rs/pull/4120) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kindly](https://github.com/kindly)) -- Simplify reference to GitHub issues [\#4092](https://github.com/apache/arrow-rs/pull/4092) ([bkmgit](https://github.com/bkmgit)) -- Use reqwest build\_split [\#4039](https://github.com/apache/arrow-rs/pull/4039) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix object\_store CI [\#4037](https://github.com/apache/arrow-rs/pull/4037) ([tustvold](https://github.com/tustvold)) -- Add get\_config\_value to AWS/Azure/GCP Builders [\#4035](https://github.com/apache/arrow-rs/pull/4035) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([r4ntix](https://github.com/r4ntix)) -- Update AWS SDK [\#3993](https://github.com/apache/arrow-rs/pull/3993) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- \[object\_store - AmazonS3Builder\] incorrect metadata\_endpoint set in `from_env` in an ECS environment [\#4283](https://github.com/apache/arrow-rs/issues/4283) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +**Merged pull requests:** +- Fix ObjectStore::get\_range for GetResult::File \(\#4350\) [\#4351](https://github.com/apache/arrow-rs/pull/4351) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Don't exclude FIFO files from LocalFileSystem [\#4345](https://github.com/apache/arrow-rs/pull/4345) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix support for ECS IAM credentials [\#4310](https://github.com/apache/arrow-rs/pull/4310) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- feat: use exactly equal parts in multipart upload [\#4305](https://github.com/apache/arrow-rs/pull/4305) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- Set ECS specific metadata endpoint [\#4288](https://github.com/apache/arrow-rs/pull/4288) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jfuechsl](https://github.com/jfuechsl)) +- Prepare 40.0.0 release [\#4245](https://github.com/apache/arrow-rs/pull/4245) ([tustvold](https://github.com/tustvold)) +- feat: support bulk deletes in object\_store [\#4060](https://github.com/apache/arrow-rs/pull/4060) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 4002a1865fa6..5e2009d07013 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.6.0" +version = "0.6.1" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index 60906307ecf7..3e9f8bdba859 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.5.6" -FUTURE_RELEASE="object_store_0.6.0" +SINCE_TAG="object_store_0.6.0" +FUTURE_RELEASE="object_store_0.6.1" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From e1badc0542ca82e2304cc3f51a9d25ea2dbb74eb Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 2 Jun 2023 19:46:49 +0100 Subject: [PATCH 0978/1411] Prepare arrow 41 (#4354) --- CHANGELOG-old.md | 41 +++++++++++++ CHANGELOG.md | 100 ++++++++++++++++++++++--------- Cargo.toml | 32 +++++----- dev/release/update_change_log.sh | 4 +- 4 files changed, 131 insertions(+), 46 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index da72626d86cf..1475230a7c59 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,47 @@ # Historical Changelog +## [40.0.0](https://github.com/apache/arrow-rs/tree/40.0.0) (2023-05-19) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/39.0.0...40.0.0) + +**Breaking changes:** + +- Prefetch page index \(\#4090\) [\#4216](https://github.com/apache/arrow-rs/pull/4216) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add RecordBatchWriter trait and implement it for CSV, JSON, IPC and P… [\#4206](https://github.com/apache/arrow-rs/pull/4206) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) +- Remove powf\_scalar kernel [\#4187](https://github.com/apache/arrow-rs/pull/4187) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) +- Allow format specification in cast [\#4169](https://github.com/apache/arrow-rs/pull/4169) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([parthchandra](https://github.com/parthchandra)) + +**Implemented enhancements:** + +- ObjectStore with\_url Should Handle Path [\#4199](https://github.com/apache/arrow-rs/issues/4199) +- Support `Interval` +/- `Interval` [\#4178](https://github.com/apache/arrow-rs/issues/4178) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[parquet\] add compression info to `print_column_chunk_metadata()` [\#4172](https://github.com/apache/arrow-rs/issues/4172) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Allow cast to take in a format specification [\#4168](https://github.com/apache/arrow-rs/issues/4168) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support extended pow arithmetic [\#4166](https://github.com/apache/arrow-rs/issues/4166) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Preload page index for async ParquetObjectReader [\#4090](https://github.com/apache/arrow-rs/issues/4090) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Fixed bugs:** + +- Subtracting `Timestamp` from `Timestamp` should produce a `Duration` \(not `Timestamp`\) [\#3964](https://github.com/apache/arrow-rs/issues/3964) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Arrow Arithmetic: Subtract timestamps [\#4244](https://github.com/apache/arrow-rs/pull/4244) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mr-brobot](https://github.com/mr-brobot)) +- Update proc-macro2 requirement from =1.0.57 to =1.0.58 [\#4236](https://github.com/apache/arrow-rs/pull/4236) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix Nightly Clippy Lints [\#4233](https://github.com/apache/arrow-rs/pull/4233) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: use all primitive types in test\_layouts [\#4229](https://github.com/apache/arrow-rs/pull/4229) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Add close method to RecordBatchWriter trait [\#4228](https://github.com/apache/arrow-rs/pull/4228) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) +- Update proc-macro2 requirement from =1.0.56 to =1.0.57 [\#4219](https://github.com/apache/arrow-rs/pull/4219) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Feat docs [\#4215](https://github.com/apache/arrow-rs/pull/4215) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Folyd](https://github.com/Folyd)) +- feat: Support bitwise and boolean aggregate functions [\#4210](https://github.com/apache/arrow-rs/pull/4210) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Document how to sort a RecordBatch [\#4204](https://github.com/apache/arrow-rs/pull/4204) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) +- Fix incorrect cast Timestamp with Timezone [\#4201](https://github.com/apache/arrow-rs/pull/4201) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([aprimadi](https://github.com/aprimadi)) +- Add implementation of `RecordBatchReader` for CSV reader [\#4195](https://github.com/apache/arrow-rs/pull/4195) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) +- Add Sliced ListArray test \(\#3748\) [\#4186](https://github.com/apache/arrow-rs/pull/4186) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- refactor: simplify can\_cast\_types code. [\#4185](https://github.com/apache/arrow-rs/pull/4185) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Minor: support new types in struct\_builder.rs [\#4177](https://github.com/apache/arrow-rs/pull/4177) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- feat: add compression info to print\_column\_chunk\_metadata\(\) [\#4176](https://github.com/apache/arrow-rs/pull/4176) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([SteveLauC](https://github.com/SteveLauC)) ## [39.0.0](https://github.com/apache/arrow-rs/tree/39.0.0) (2023-05-05) [Full Changelog](https://github.com/apache/arrow-rs/compare/38.0.0...39.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b6e88f30c15..3620e86f1e49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,47 +19,91 @@ # Changelog -## [40.0.0](https://github.com/apache/arrow-rs/tree/40.0.0) (2023-05-19) +## [41.0.0](https://github.com/apache/arrow-rs/tree/41.0.0) (2023-06-02) -[Full Changelog](https://github.com/apache/arrow-rs/compare/39.0.0...40.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/40.0.0...41.0.0) **Breaking changes:** -- Prefetch page index \(\#4090\) [\#4216](https://github.com/apache/arrow-rs/pull/4216) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add RecordBatchWriter trait and implement it for CSV, JSON, IPC and P… [\#4206](https://github.com/apache/arrow-rs/pull/4206) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) -- Remove powf\_scalar kernel [\#4187](https://github.com/apache/arrow-rs/pull/4187) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Allow format specification in cast [\#4169](https://github.com/apache/arrow-rs/pull/4169) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([parthchandra](https://github.com/parthchandra)) +- Rename list contains kernels to in\_list \(\#4289\) [\#4342](https://github.com/apache/arrow-rs/pull/4342) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move BooleanBufferBuilder and NullBufferBuilder to arrow\_buffer [\#4338](https://github.com/apache/arrow-rs/pull/4338) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add separate row\_count and level\_count to PageMetadata \(\#4321\) [\#4326](https://github.com/apache/arrow-rs/pull/4326) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Treat legacy 
TIMSETAMP\_X converted types as UTC [\#4309](https://github.com/apache/arrow-rs/pull/4309) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([sergiimk](https://github.com/sergiimk)) +- Simplify parquet PageIterator [\#4306](https://github.com/apache/arrow-rs/pull/4306) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add Builder style APIs and docs for `FlightData`,` FlightInfo`, `FlightEndpoint`, `Locaation` and `Ticket` [\#4294](https://github.com/apache/arrow-rs/pull/4294) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Make GenericColumnWriter Send [\#4287](https://github.com/apache/arrow-rs/pull/4287) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- feat: update flight-sql to latest specs [\#4250](https://github.com/apache/arrow-rs/pull/4250) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) +- feat\(api!\): make ArrowArrayStreamReader Send [\#4232](https://github.com/apache/arrow-rs/pull/4232) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) **Implemented enhancements:** -- ObjectStore with\_url Should Handle Path [\#4199](https://github.com/apache/arrow-rs/issues/4199) -- Support `Interval` +/- `Interval` [\#4178](https://github.com/apache/arrow-rs/issues/4178) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[parquet\] add compression info to `print_column_chunk_metadata()` [\#4172](https://github.com/apache/arrow-rs/issues/4172) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Allow cast to take in a format specification [\#4168](https://github.com/apache/arrow-rs/issues/4168) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support extended pow arithmetic [\#4166](https://github.com/apache/arrow-rs/issues/4166) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Preload page index for async ParquetObjectReader [\#4090](https://github.com/apache/arrow-rs/issues/4090) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Make SerializedRowGroupReader::new\(\) Public [\#4330](https://github.com/apache/arrow-rs/issues/4330) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Speed up i256 division and remainder operations [\#4302](https://github.com/apache/arrow-rs/issues/4302) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- export function parquet\_to\_array\_schema\_and\_fields [\#4298](https://github.com/apache/arrow-rs/issues/4298) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- FLightSQL: add helpers to create `CommandGetCatalogs`, `CommandGetSchemas`, and `CommandGetTables` requests [\#4295](https://github.com/apache/arrow-rs/issues/4295) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Make ColumnWriter Send [\#4286](https://github.com/apache/arrow-rs/issues/4286) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add Builder for `FlightInfo` to make it easier to create new requests [\#4281](https://github.com/apache/arrow-rs/issues/4281) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support Writing/Reading Decimal256 to/from Parquet [\#4264](https://github.com/apache/arrow-rs/issues/4264) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- FlightSQL: Add helpers to create `CommandGetSqlInfo` responses \(`SqlInfoValue` and builders\) [\#4256](https://github.com/apache/arrow-rs/issues/4256) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Update flight-sql implementation to latest specs [\#4249](https://github.com/apache/arrow-rs/issues/4249) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Make ArrowArrayStreamReader Send [\#4222](https://github.com/apache/arrow-rs/issues/4222) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support writing FixedSizeList to Parquet [\#4214](https://github.com/apache/arrow-rs/issues/4214) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Cast between `Intervals` [\#4181](https://github.com/apache/arrow-rs/issues/4181) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Splice Parquet Data [\#4155](https://github.com/apache/arrow-rs/issues/4155) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- CSV Schema More Flexible Timestamp Inference [\#4131](https://github.com/apache/arrow-rs/issues/4131) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- Subtracting `Timestamp` from `Timestamp` should produce a `Duration` \(not `Timestamp`\) [\#3964](https://github.com/apache/arrow-rs/issues/3964) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Doc for arrow\_flight::sql is missing enums that are Xdbc related [\#4339](https://github.com/apache/arrow-rs/issues/4339) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- concat\_batches panics with total\_len \<= bit\_len assertion for records with lists [\#4324](https://github.com/apache/arrow-rs/issues/4324) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Incorrect PageMetadata Row Count returned for V1 DataPage [\#4321](https://github.com/apache/arrow-rs/issues/4321) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- \[parquet\] Not following the spec for TIMESTAMP\_MILLIS legacy converted types [\#4308](https://github.com/apache/arrow-rs/issues/4308) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- ambiguous glob re-exports of contains\_utf8 [\#4289](https://github.com/apache/arrow-rs/issues/4289) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- flight\_sql\_client --header "key: value" yields a value with a leading whitespace [\#4270](https://github.com/apache/arrow-rs/issues/4270) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Casting Timestamp to date is off by one day for dates before 1970-01-01 [\#4211](https://github.com/apache/arrow-rs/issues/4211) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Arrow Arithmetic: Subtract timestamps [\#4244](https://github.com/apache/arrow-rs/pull/4244) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mr-brobot](https://github.com/mr-brobot)) -- Update proc-macro2 
requirement from =1.0.57 to =1.0.58 [\#4236](https://github.com/apache/arrow-rs/pull/4236) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Fix Nightly Clippy Lints [\#4233](https://github.com/apache/arrow-rs/pull/4233) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Minor: use all primitive types in test\_layouts [\#4229](https://github.com/apache/arrow-rs/pull/4229) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Add close method to RecordBatchWriter trait [\#4228](https://github.com/apache/arrow-rs/pull/4228) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) -- Update proc-macro2 requirement from =1.0.56 to =1.0.57 [\#4219](https://github.com/apache/arrow-rs/pull/4219) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Feat docs [\#4215](https://github.com/apache/arrow-rs/pull/4215) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Folyd](https://github.com/Folyd)) -- feat: Support bitwise and boolean aggregate functions [\#4210](https://github.com/apache/arrow-rs/pull/4210) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Document how to sort a RecordBatch [\#4204](https://github.com/apache/arrow-rs/pull/4204) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix incorrect cast Timestamp with Timezone [\#4201](https://github.com/apache/arrow-rs/pull/4201) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([aprimadi](https://github.com/aprimadi)) -- Add implementation of `RecordBatchReader` for CSV reader [\#4195](https://github.com/apache/arrow-rs/pull/4195) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) -- Add Sliced ListArray test \(\#3748\) [\#4186](https://github.com/apache/arrow-rs/pull/4186) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- refactor: simplify can\_cast\_types code. 
[\#4185](https://github.com/apache/arrow-rs/pull/4185) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- Minor: support new types in struct\_builder.rs [\#4177](https://github.com/apache/arrow-rs/pull/4177) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- feat: add compression info to print\_column\_chunk\_metadata\(\) [\#4176](https://github.com/apache/arrow-rs/pull/4176) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([SteveLauC](https://github.com/SteveLauC)) +- Don't infer 16-byte decimal as decimal256 [\#4349](https://github.com/apache/arrow-rs/pull/4349) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix MutableArrayData::extend\_nulls \(\#1230\) [\#4343](https://github.com/apache/arrow-rs/pull/4343) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update FlightSQL metadata locations, names and docs [\#4341](https://github.com/apache/arrow-rs/pull/4341) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- chore: expose Xdbc related FlightSQL enums [\#4340](https://github.com/apache/arrow-rs/pull/4340) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([appletreeisyellow](https://github.com/appletreeisyellow)) +- Update pyo3 requirement from 0.18 to 0.19 [\#4335](https://github.com/apache/arrow-rs/pull/4335) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Skip unnecessary null checks in MutableArrayData [\#4333](https://github.com/apache/arrow-rs/pull/4333) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: add read parquet by custom rowgroup examples [\#4332](https://github.com/apache/arrow-rs/pull/4332) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([sundy-li](https://github.com/sundy-li)) +- Make SerializedRowGroupReader::new\(\) public [\#4331](https://github.com/apache/arrow-rs/pull/4331) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([burmecia](https://github.com/burmecia)) +- Don't split record across pages \(\#3680\) [\#4327](https://github.com/apache/arrow-rs/pull/4327) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- fix date conversion if timestamp below unixtimestamp [\#4323](https://github.com/apache/arrow-rs/pull/4323) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Short-circuit on exhausted page in skip\_records [\#4320](https://github.com/apache/arrow-rs/pull/4320) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Handle trailing padding when skipping repetition levels \(\#3911\) [\#4319](https://github.com/apache/arrow-rs/pull/4319) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Use `page_size` consistently, deprecate `pagesize` in parquet WriterProperties [\#4313](https://github.com/apache/arrow-rs/pull/4313) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Add roundtrip tests for 
Decimal256 and fix issues \(\#4264\) [\#4311](https://github.com/apache/arrow-rs/pull/4311) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Expose page-level arrow reader API \(\#4298\) [\#4307](https://github.com/apache/arrow-rs/pull/4307) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Speed up i256 division and remainder operations [\#4303](https://github.com/apache/arrow-rs/pull/4303) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- feat\(flight\): support int32\_to\_int32\_list\_map in sql infos [\#4300](https://github.com/apache/arrow-rs/pull/4300) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) +- feat\(flight\): add helpers to handle `CommandGetCatalogs`, `CommandGetSchemas`, and `CommandGetTables` requests [\#4296](https://github.com/apache/arrow-rs/pull/4296) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) +- Improve docs and tests for `SqlInfoList [\#4293](https://github.com/apache/arrow-rs/pull/4293) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- minor: fix arrow\_row docs.rs links [\#4292](https://github.com/apache/arrow-rs/pull/4292) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([roeap](https://github.com/roeap)) +- Update proc-macro2 requirement from =1.0.58 to =1.0.59 [\#4290](https://github.com/apache/arrow-rs/pull/4290) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Improve `ArrowWriter` memory usage: Buffer Pages in ArrowWriter instead of RecordBatch \(\#3871\) [\#4280](https://github.com/apache/arrow-rs/pull/4280) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Minor: Add more docstrings in arrow-flight [\#4279](https://github.com/apache/arrow-rs/pull/4279) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Add `Debug` impls for `ArrowWriter` and `SerializedFileWriter` [\#4278](https://github.com/apache/arrow-rs/pull/4278) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Expose `RecordBatchWriter` to `arrow` crate [\#4277](https://github.com/apache/arrow-rs/pull/4277) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) +- Update criterion requirement from 0.4 to 0.5 [\#4275](https://github.com/apache/arrow-rs/pull/4275) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add parquet-concat [\#4274](https://github.com/apache/arrow-rs/pull/4274) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Convert FixedSizeListArray to GenericListArray [\#4273](https://github.com/apache/arrow-rs/pull/4273) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: support 'Decimal256' for parquet [\#4272](https://github.com/apache/arrow-rs/pull/4272) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Weijun-H](https://github.com/Weijun-H)) +- Strip leading whitespace from flight\_sql\_client custom header values [\#4271](https://github.com/apache/arrow-rs/pull/4271) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([mkmik](https://github.com/mkmik)) +- Add Append Column API \(\#4155\) [\#4269](https://github.com/apache/arrow-rs/pull/4269) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Derive Default for WriterProperties [\#4268](https://github.com/apache/arrow-rs/pull/4268) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Parquet Reader/writer for fixed-size list arrays [\#4267](https://github.com/apache/arrow-rs/pull/4267) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dexterduck](https://github.com/dexterduck)) +- feat\(flight\): add sql-info helpers [\#4266](https://github.com/apache/arrow-rs/pull/4266) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) +- Convert parquet metadata back to builders [\#4265](https://github.com/apache/arrow-rs/pull/4265) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add constructors for FixedSize array types \(\#3879\) [\#4263](https://github.com/apache/arrow-rs/pull/4263) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Extract IPC ArrayReader struct [\#4259](https://github.com/apache/arrow-rs/pull/4259) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update object\_store requirement from 0.5 to 0.6 [\#4258](https://github.com/apache/arrow-rs/pull/4258) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support Absolute Timestamps in CSV Schema Inference \(\#4131\) [\#4217](https://github.com/apache/arrow-rs/pull/4217) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: cast between `Intervals` [\#4182](https://github.com/apache/arrow-rs/pull/4182) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) diff --git a/Cargo.toml b/Cargo.toml index bf311bd05edc..bca0f70ef339 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ exclude = [ ] [workspace.package] -version = "40.0.0" +version = "41.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -76,18 +76,18 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "40.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "40.0.0", path = "./arrow-arith" } -arrow-array = { version = "40.0.0", path = "./arrow-array" } -arrow-buffer = { version = "40.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "40.0.0", path = "./arrow-cast" } -arrow-csv = { version = "40.0.0", path = "./arrow-csv" } -arrow-data = { version = "40.0.0", 
path = "./arrow-data" } -arrow-ipc = { version = "40.0.0", path = "./arrow-ipc" } -arrow-json = { version = "40.0.0", path = "./arrow-json" } -arrow-ord = { version = "40.0.0", path = "./arrow-ord" } -arrow-row = { version = "40.0.0", path = "./arrow-row" } -arrow-schema = { version = "40.0.0", path = "./arrow-schema" } -arrow-select = { version = "40.0.0", path = "./arrow-select" } -arrow-string = { version = "40.0.0", path = "./arrow-string" } -parquet = { version = "40.0.0", path = "./parquet", default-features = false } +arrow = { version = "41.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "41.0.0", path = "./arrow-arith" } +arrow-array = { version = "41.0.0", path = "./arrow-array" } +arrow-buffer = { version = "41.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "41.0.0", path = "./arrow-cast" } +arrow-csv = { version = "41.0.0", path = "./arrow-csv" } +arrow-data = { version = "41.0.0", path = "./arrow-data" } +arrow-ipc = { version = "41.0.0", path = "./arrow-ipc" } +arrow-json = { version = "41.0.0", path = "./arrow-json" } +arrow-ord = { version = "41.0.0", path = "./arrow-ord" } +arrow-row = { version = "41.0.0", path = "./arrow-row" } +arrow-schema = { version = "41.0.0", path = "./arrow-schema" } +arrow-select = { version = "41.0.0", path = "./arrow-select" } +arrow-string = { version = "41.0.0", path = "./arrow-string" } +parquet = { version = "41.0.0", path = "./parquet", default-features = false } diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 299fa45d3584..7881ad02c06e 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="39.0.0" -FUTURE_RELEASE="40.0.0" +SINCE_TAG="40.0.0" +FUTURE_RELEASE="41.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 383d7933097c238e325e76490c66dd0f79c52fcc Mon Sep 17 00:00:00 2001 From: jakevin Date: Sat, 3 Jun 2023 18:05:42 +0800 Subject: [PATCH 0979/1411] minor: remove useless prefix and fix typo. 
(#4357) --- .../src/builder/generic_byte_run_builder.rs | 24 +++++++++---------- arrow-cast/src/cast.rs | 12 +++++----- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index 97082fe96673..4e3f36889a1b 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -391,7 +391,7 @@ mod tests { use crate::GenericByteArray; use crate::Int16RunArray; - fn test_bytes_run_buider(values: Vec<&T::Native>) + fn test_bytes_run_builder(values: Vec<&T::Native>) where T: ByteArrayType, ::Native: PartialEq, @@ -428,21 +428,21 @@ mod tests { } #[test] - fn test_string_run_buider() { - test_bytes_run_buider::(vec!["abc", "def", "ghi"]); + fn test_string_run_builder() { + test_bytes_run_builder::(vec!["abc", "def", "ghi"]); } #[test] - fn test_string_run_buider_with_empty_strings() { - test_bytes_run_buider::(vec!["abc", "", "ghi"]); + fn test_string_run_builder_with_empty_strings() { + test_bytes_run_builder::(vec!["abc", "", "ghi"]); } #[test] - fn test_binary_run_buider() { - test_bytes_run_buider::(vec![b"abc", b"def", b"ghi"]); + fn test_binary_run_builder() { + test_bytes_run_builder::(vec![b"abc", b"def", b"ghi"]); } - fn test_bytes_run_buider_finish_cloned(values: Vec<&T::Native>) + fn test_bytes_run_builder_finish_cloned(values: Vec<&T::Native>) where T: ByteArrayType, ::Native: PartialEq, @@ -498,13 +498,13 @@ mod tests { } #[test] - fn test_string_run_buider_finish_cloned() { - test_bytes_run_buider_finish_cloned::(vec!["abc", "def", "ghi"]); + fn test_string_run_builder_finish_cloned() { + test_bytes_run_builder_finish_cloned::(vec!["abc", "def", "ghi"]); } #[test] - fn test_binary_run_buider_finish_cloned() { - test_bytes_run_buider_finish_cloned::(vec![b"abc", b"def", b"ghi"]); + fn test_binary_run_builder_finish_cloned() { + test_bytes_run_builder_finish_cloned::(vec![b"abc", b"def", b"ghi"]); } #[test] diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 839326d089fb..1e491d8447ef 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -254,8 +254,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { }, (Duration(_), Interval(MonthDayNano)) => true, (Interval(MonthDayNano), Duration(_)) => true, - (Interval(IntervalUnit::YearMonth), Interval(IntervalUnit::MonthDayNano)) => true, - (Interval(IntervalUnit::DayTime), Interval(IntervalUnit::MonthDayNano)) => true, + (Interval(YearMonth), Interval(MonthDayNano)) => true, + (Interval(DayTime), Interval(MonthDayNano)) => true, (_, _) => false, } } @@ -2166,16 +2166,16 @@ pub fn cast_with_options( (Duration(TimeUnit::Nanosecond), Interval(IntervalUnit::MonthDayNano)) => { cast_duration_to_interval::(array, cast_options) } - (Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Second)) => { + (Interval(IntervalUnit::MonthDayNano), Duration(TimeUnit::Second)) => { cast_interval_to_duration::(array, cast_options) } - (Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Millisecond)) => { + (Interval(IntervalUnit::MonthDayNano), Duration(TimeUnit::Millisecond)) => { cast_interval_to_duration::(array, cast_options) } - (Interval(IntervalUnit::MonthDayNano), DataType::Duration(TimeUnit::Microsecond)) => { + (Interval(IntervalUnit::MonthDayNano), Duration(TimeUnit::Microsecond)) => { cast_interval_to_duration::(array, cast_options) } - (Interval(IntervalUnit::MonthDayNano), 
DataType::Duration(TimeUnit::Nanosecond)) => { + (Interval(IntervalUnit::MonthDayNano), Duration(TimeUnit::Nanosecond)) => { cast_interval_to_duration::(array, cast_options) } (Interval(IntervalUnit::YearMonth), Interval(IntervalUnit::MonthDayNano)) => { From 7c3faa5bf65de7a7bcefb7970ff3692790083ed5 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Sun, 4 Jun 2023 00:00:43 +0200 Subject: [PATCH 0980/1411] refact: use as_primitive in cast.rs test (#4360) * refact: use as_primitive in cast.rs test * fix doctest --- arrow-cast/src/cast.rs | 95 +++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 1e491d8447ef..752915f34ccc 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -25,11 +25,13 @@ //! use arrow_cast::cast; //! use arrow_schema::DataType; //! use std::sync::Arc; +//! use arrow_array::types::Float64Type; +//! use arrow_array::cast::AsArray; //! //! let a = Int32Array::from(vec![5, 6, 7]); //! let array = Arc::new(a) as ArrayRef; //! let b = cast(&array, &DataType::Float64).unwrap(); -//! let c = b.as_any().downcast_ref::().unwrap(); +//! let c = b.as_primitive::(); //! assert_eq!(5.0, c.value(0)); //! assert_eq!(6.0, c.value(1)); //! assert_eq!(7.0, c.value(2)); @@ -4979,7 +4981,7 @@ mod tests { fn test_cast_i32_to_f64() { let array = Int32Array::from(vec![5, 6, 7, 8, 9]); let b = cast(&array, &DataType::Float64).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(5.0, c.value(0)); assert_eq!(6.0, c.value(1)); assert_eq!(7.0, c.value(2)); @@ -4991,7 +4993,7 @@ mod tests { fn test_cast_i32_to_u8() { let array = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); let b = cast(&array, &DataType::UInt8).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert!(!c.is_valid(0)); assert_eq!(6, c.value(1)); assert!(!c.is_valid(2)); @@ -5021,7 +5023,7 @@ mod tests { let array = array.slice(2, 3); let b = cast(&array, &DataType::UInt8).unwrap(); assert_eq!(3, b.len()); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert!(!c.is_valid(0)); assert_eq!(8, c.value(1)); // overflows return None @@ -5032,7 +5034,7 @@ mod tests { fn test_cast_i32_to_i32() { let array = Int32Array::from(vec![5, 6, 7, 8, 9]); let b = cast(&array, &DataType::Int32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(5, c.value(0)); assert_eq!(6, c.value(1)); assert_eq!(7, c.value(2)); @@ -5121,7 +5123,7 @@ mod tests { fn test_cast_utf8_to_i32() { let array = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]); let b = cast(&array, &DataType::Int32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(5, c.value(0)); assert_eq!(6, c.value(1)); assert!(!c.is_valid(2)); @@ -5187,7 +5189,7 @@ mod tests { fn test_cast_bool_to_i32() { let array = BooleanArray::from(vec![Some(true), Some(false), None]); let b = cast(&array, &DataType::Int32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(1, c.value(0)); assert_eq!(0, c.value(1)); assert!(!c.is_valid(2)); @@ -5197,7 +5199,7 @@ mod tests { fn test_cast_bool_to_f64() { let array = BooleanArray::from(vec![Some(true), Some(false), None]); let b = cast(&array, &DataType::Float64).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(1.0, c.value(0)); 
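// A minimal, standalone sketch (not taken from the diff itself) of the refactor
// this commit applies throughout the cast tests: replacing
// `as_any().downcast_ref::<...>()` with the `AsArray::as_primitive` helper.
// Crate names are assumed from this patch series; the data is illustrative only.
use std::sync::Arc;

use arrow_array::cast::AsArray;
use arrow_array::types::Float64Type;
use arrow_array::{Array, ArrayRef, Float64Array, Int32Array};
use arrow_cast::cast;
use arrow_schema::DataType;

fn main() {
    let array: ArrayRef = Arc::new(Int32Array::from(vec![5, 6, 7]));
    let casted = cast(&array, &DataType::Float64).unwrap();

    // Old style: downcast through `Any` and unwrap by hand.
    let via_downcast = casted.as_any().downcast_ref::<Float64Array>().unwrap();

    // New style used in this commit: `as_primitive` is keyed by the primitive
    // *type* and panics with a clearer message on a type mismatch.
    let via_as_primitive = casted.as_primitive::<Float64Type>();

    assert_eq!(via_downcast.value(1), 6.0);
    assert_eq!(via_as_primitive.value(1), 6.0);
}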
assert_eq!(0.0, c.value(1)); assert!(!c.is_valid(2)); @@ -5303,7 +5305,7 @@ mod tests { let a = Date32Array::from(vec![10000, 17890]); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Date64).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(864000000000, c.value(0)); assert_eq!(1545696000000, c.value(1)); } @@ -5313,7 +5315,7 @@ mod tests { let a = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Date32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(10000, c.value(0)); assert_eq!(17890, c.value(1)); assert!(c.is_null(2)); @@ -5343,8 +5345,7 @@ mod tests { match time_unit { TimeUnit::Second => { - let c = - b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(1599566400, c.value(0)); assert!(c.is_null(1)); assert!(c.is_null(2)); @@ -5414,7 +5415,7 @@ mod tests { for array in &[a1, a2] { let to_type = DataType::Date32; let b = cast(array, &to_type).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(17890, c.value(0)); assert!(c.is_null(1)); assert!(c.is_null(2)); @@ -5447,7 +5448,7 @@ mod tests { for array in &[a1, a2] { let to_type = DataType::Time32(TimeUnit::Second); let b = cast(array, &to_type).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(29315, c.value(0)); assert_eq!(29340, c.value(1)); assert!(c.is_null(2)); @@ -5482,7 +5483,7 @@ mod tests { for array in &[a1, a2] { let to_type = DataType::Time32(TimeUnit::Millisecond); let b = cast(array, &to_type).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(29315091, c.value(0)); assert_eq!(29340091, c.value(1)); assert!(c.is_null(2)); @@ -5513,7 +5514,7 @@ mod tests { for array in &[a1, a2] { let to_type = DataType::Time64(TimeUnit::Microsecond); let b = cast(array, &to_type).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(29315091323, c.value(0)); assert!(c.is_null(1)); assert!(c.is_null(2)); @@ -5542,7 +5543,7 @@ mod tests { for array in &[a1, a2] { let to_type = DataType::Time64(TimeUnit::Nanosecond); let b = cast(array, &to_type).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(29315091323414, c.value(0)); assert!(c.is_null(1)); assert!(c.is_null(2)); @@ -5571,7 +5572,7 @@ mod tests { for array in &[a1, a2] { let to_type = DataType::Date64; let b = cast(array, &to_type).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(1599566400000, c.value(0)); assert!(c.is_null(1)); assert!(c.is_null(2)); @@ -5863,7 +5864,7 @@ mod tests { fn test_cast_date32_to_int32() { let array = Date32Array::from(vec![10000, 17890]); let b = cast(&array, &DataType::Int32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(10000, c.value(0)); assert_eq!(17890, c.value(1)); } @@ -5872,7 +5873,7 @@ mod tests { fn test_cast_int32_to_date32() { let array = Int32Array::from(vec![10000, 17890]); let b = cast(&array, &DataType::Date32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(10000, c.value(0)); assert_eq!(17890, c.value(1)); } @@ -5886,7 +5887,7 @@ mod tests { ]) .with_timezone("UTC".to_string()); let b = cast(&array, 
&DataType::Date32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(10000, c.value(0)); assert_eq!(17890, c.value(1)); assert!(c.is_null(2)); @@ -5900,7 +5901,7 @@ mod tests { None, ]); let b = cast(&array, &DataType::Date64).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(864000000005, c.value(0)); assert_eq!(1545696000001, c.value(1)); assert!(c.is_null(2)); @@ -5908,7 +5909,7 @@ mod tests { let array = TimestampSecondArray::from(vec![Some(864000000005), Some(1545696000001)]); let b = cast(&array, &DataType::Date64).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(864000000005000, c.value(0)); assert_eq!(1545696000001000, c.value(1)); @@ -5932,12 +5933,12 @@ mod tests { let array = TimestampSecondArray::from(vec![Some(86405), Some(1), None]) .with_timezone("+01:00".to_string()); let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605000000, c.value(0)); assert_eq!(3601000000, c.value(1)); assert!(c.is_null(2)); let b = cast(&array, &DataType::Time64(TimeUnit::Nanosecond)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605000000000, c.value(0)); assert_eq!(3601000000000, c.value(1)); assert!(c.is_null(2)); @@ -5947,12 +5948,12 @@ mod tests { .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605000000, c.value(0)); assert_eq!(3601000000, c.value(1)); assert!(c.is_null(2)); let b = cast(&array, &DataType::Time64(TimeUnit::Nanosecond)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605000000000, c.value(0)); assert_eq!(3601000000000, c.value(1)); assert!(c.is_null(2)); @@ -5963,12 +5964,12 @@ mod tests { .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605000000, c.value(0)); assert_eq!(3601000000, c.value(1)); assert!(c.is_null(2)); let b = cast(&array, &DataType::Time64(TimeUnit::Nanosecond)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605000000000, c.value(0)); assert_eq!(3601000000000, c.value(1)); assert!(c.is_null(2)); @@ -5982,12 +5983,12 @@ mod tests { .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605000000, c.value(0)); assert_eq!(3601000000, c.value(1)); assert!(c.is_null(2)); let b = cast(&array, &DataType::Time64(TimeUnit::Nanosecond)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605000000000, c.value(0)); assert_eq!(3601000000000, c.value(1)); assert!(c.is_null(2)); @@ -6011,12 +6012,12 @@ mod tests { .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time32(TimeUnit::Second)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605, c.value(0)); 
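// A minimal, standalone sketch (not taken from the diff itself) of the behaviour
// the timezone-aware tests above exercise: casting a timestamp tagged with an
// offset to a wall-clock Time32. API names are assumed from the surrounding
// hunks and the values mirror the test data.
use arrow_array::cast::AsArray;
use arrow_array::types::Time32SecondType;
use arrow_array::TimestampSecondArray;
use arrow_cast::cast;
use arrow_schema::{DataType, TimeUnit};

fn main() {
    // 86405 s and 1 s after the epoch, stored with a +01:00 timezone.
    let ts = TimestampSecondArray::from(vec![Some(86405), Some(1)])
        .with_timezone("+01:00".to_string());

    let time = cast(&ts, &DataType::Time32(TimeUnit::Second)).unwrap();
    let time = time.as_primitive::<Time32SecondType>();

    // The offset is applied before the time of day is extracted:
    // 86405 s UTC is 01:00:05 at +01:00 (3605 s); 1 s UTC is 01:00:01 (3601 s).
    assert_eq!(time.value(0), 3605);
    assert_eq!(time.value(1), 3601);
}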
assert_eq!(3601, c.value(1)); assert!(c.is_null(2)); let b = cast(&array, &DataType::Time32(TimeUnit::Millisecond)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605000, c.value(0)); assert_eq!(3601000, c.value(1)); assert!(c.is_null(2)); @@ -6026,12 +6027,12 @@ mod tests { .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time32(TimeUnit::Second)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605, c.value(0)); assert_eq!(3601, c.value(1)); assert!(c.is_null(2)); let b = cast(&array, &DataType::Time32(TimeUnit::Millisecond)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605000, c.value(0)); assert_eq!(3601000, c.value(1)); assert!(c.is_null(2)); @@ -6042,12 +6043,12 @@ mod tests { .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time32(TimeUnit::Second)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605, c.value(0)); assert_eq!(3601, c.value(1)); assert!(c.is_null(2)); let b = cast(&array, &DataType::Time32(TimeUnit::Millisecond)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605000, c.value(0)); assert_eq!(3601000, c.value(1)); assert!(c.is_null(2)); @@ -6061,12 +6062,12 @@ mod tests { .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time32(TimeUnit::Second)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605, c.value(0)); assert_eq!(3601, c.value(1)); assert!(c.is_null(2)); let b = cast(&array, &DataType::Time32(TimeUnit::Millisecond)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(3605000, c.value(0)); assert_eq!(3601000, c.value(1)); assert!(c.is_null(2)); @@ -6163,7 +6164,7 @@ mod tests { let array = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(864000000, c.value(0)); assert_eq!(1545696000, c.value(1)); assert!(c.is_null(2)); @@ -6220,7 +6221,7 @@ mod tests { ]) .with_timezone("UTC".to_string()); let b = cast(&array, &DataType::Int64).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(&DataType::Int64, c.data_type()); assert_eq!(864000000005, c.value(0)); assert_eq!(1545696000001, c.value(1)); @@ -6385,7 +6386,7 @@ mod tests { None, ]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(864000003, c.value(0)); assert_eq!(1545696002, c.value(1)); assert!(c.is_null(2)); @@ -6405,7 +6406,7 @@ mod tests { for arr in duration_arrays { assert!(can_cast_types(arr.data_type(), &DataType::Int64)); let result = cast(&arr, &DataType::Int64).unwrap(); - let result = result.as_any().downcast_ref::().unwrap(); + let result = result.as_primitive::(); assert_eq!(base.as_slice(), result.values()); } } @@ -6424,7 +6425,7 @@ mod tests { for arr in interval_arrays { assert!(can_cast_types(arr.data_type(), &DataType::Int64)); let result = cast(&arr, &DataType::Int64).unwrap(); - let result = 
result.as_any().downcast_ref::().unwrap(); + let result = result.as_primitive::(); assert_eq!(base.as_slice(), result.values()); } } @@ -7803,7 +7804,7 @@ mod tests { ]); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Date32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); // test valid inputs let date_value = since( @@ -7840,7 +7841,7 @@ mod tests { ]); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Date64).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); // test valid inputs assert!(c.is_valid(0)); // "2000-01-01T12:00:00" @@ -8627,7 +8628,7 @@ mod tests { let a = Date32Array::from(vec![Some(18628), Some(18993), None]); // 2021-1-1, 2022-1-1 let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); + let c = b.as_primitive::(); assert_eq!(1609459200, c.value(0)); assert_eq!(1640995200, c.value(1)); assert!(c.is_null(2)); From d8d5fca516a8f947d30b4e0d854710ee691a96b7 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Sun, 4 Jun 2023 08:11:33 -0700 Subject: [PATCH 0981/1411] Raise TypeError on PyArrow import (#4316) * type error on PyArrow import * fix error message --- .../tests/test_sql.py | 22 ++++++++++++++ arrow/src/pyarrow.rs | 30 ++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index f631f67cbfea..a7c6b34a4474 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -408,3 +408,25 @@ def test_record_batch_reader(): assert b.schema == schema got_batches = list(b) assert got_batches == batches + +def test_reject_other_classes(): + # Arbitrary type that is not a PyArrow type + not_pyarrow = ["hello"] + + with pytest.raises(TypeError, match="Expected instance of pyarrow.lib.Array, got builtins.list"): + rust.round_trip_array(not_pyarrow) + + with pytest.raises(TypeError, match="Expected instance of pyarrow.lib.Schema, got builtins.list"): + rust.round_trip_schema(not_pyarrow) + + with pytest.raises(TypeError, match="Expected instance of pyarrow.lib.Field, got builtins.list"): + rust.round_trip_field(not_pyarrow) + + with pytest.raises(TypeError, match="Expected instance of pyarrow.lib.DataType, got builtins.list"): + rust.round_trip_type(not_pyarrow) + + with pytest.raises(TypeError, match="Expected instance of pyarrow.lib.RecordBatch, got builtins.list"): + rust.round_trip_record_batch(not_pyarrow) + + with pytest.raises(TypeError, match="Expected instance of pyarrow.lib.RecordBatchReader, got builtins.list"): + rust.round_trip_record_batch_reader(not_pyarrow) diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index ba8d606f2e1f..98e27ab30e09 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -24,7 +24,7 @@ use std::convert::{From, TryFrom}; use std::ptr::{addr_of, addr_of_mut}; use std::sync::Arc; -use pyo3::exceptions::PyValueError; +use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::ffi::Py_uintptr_t; use pyo3::import_exception; use pyo3::prelude::*; @@ -67,8 +67,27 @@ impl IntoPyArrow for T { } } +fn validate_class(expected: &str, value: &PyAny) -> PyResult<()> { + let pyarrow = PyModule::import(value.py(), "pyarrow")?; + let class = pyarrow.getattr(expected)?; + if !value.is_instance(class)? 
{ + let expected_module = class.getattr("__module__")?.extract::<&str>()?; + let expected_name = class.getattr("__name__")?.extract::<&str>()?; + let found_class = value.get_type(); + let found_module = found_class.getattr("__module__")?.extract::<&str>()?; + let found_name = found_class.getattr("__name__")?.extract::<&str>()?; + return Err(PyTypeError::new_err(format!( + "Expected instance of {}.{}, got {}.{}", + expected_module, expected_name, found_module, found_name + ))); + } + Ok(()) +} + impl FromPyArrow for DataType { fn from_pyarrow(value: &PyAny) -> PyResult { + validate_class("DataType", value)?; + let c_schema = FFI_ArrowSchema::empty(); let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; value.call_method1("_export_to_c", (c_schema_ptr as Py_uintptr_t,))?; @@ -91,6 +110,8 @@ impl ToPyArrow for DataType { impl FromPyArrow for Field { fn from_pyarrow(value: &PyAny) -> PyResult { + validate_class("Field", value)?; + let c_schema = FFI_ArrowSchema::empty(); let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; value.call_method1("_export_to_c", (c_schema_ptr as Py_uintptr_t,))?; @@ -113,6 +134,8 @@ impl ToPyArrow for Field { impl FromPyArrow for Schema { fn from_pyarrow(value: &PyAny) -> PyResult { + validate_class("Schema", value)?; + let c_schema = FFI_ArrowSchema::empty(); let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; value.call_method1("_export_to_c", (c_schema_ptr as Py_uintptr_t,))?; @@ -135,6 +158,8 @@ impl ToPyArrow for Schema { impl FromPyArrow for ArrayData { fn from_pyarrow(value: &PyAny) -> PyResult { + validate_class("Array", value)?; + // prepare a pointer to receive the Array struct let mut array = FFI_ArrowArray::empty(); let mut schema = FFI_ArrowSchema::empty(); @@ -194,6 +219,7 @@ impl ToPyArrow for Vec { impl FromPyArrow for RecordBatch { fn from_pyarrow(value: &PyAny) -> PyResult { + validate_class("RecordBatch", value)?; // TODO(kszucs): implement the FFI conversions in arrow-rs for RecordBatches let schema = value.getattr("schema")?; let schema = Arc::new(Schema::from_pyarrow(schema)?); @@ -235,6 +261,8 @@ impl ToPyArrow for RecordBatch { impl FromPyArrow for ArrowArrayStreamReader { fn from_pyarrow(value: &PyAny) -> PyResult { + validate_class("RecordBatchReader", value)?; + // prepare a pointer to receive the stream struct let mut stream = FFI_ArrowArrayStream::empty(); let stream_ptr = &mut stream as *mut FFI_ArrowArrayStream; From dc18d4f588695bd9e40e3d5f1c5404718faca770 Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Sun, 4 Jun 2023 21:26:24 +0300 Subject: [PATCH 0982/1411] deprecate: as_decimal_array (#4363) * remove: as_decimal_array * feat: deprecated --- arrow-array/src/cast.rs | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 21993114ea7d..af7e7d606020 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -676,7 +676,12 @@ array_downcast_fn!(as_null_array, NullArray); array_downcast_fn!(as_struct_array, StructArray); array_downcast_fn!(as_union_array, UnionArray); array_downcast_fn!(as_map_array, MapArray); -array_downcast_fn!(as_decimal_array, Decimal128Array); + +/// Force downcast of an Array, such as an ArrayRef to Decimal128Array, panic’ing on failure. 
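// A minimal, standalone sketch (not taken from the diff itself) of the
// replacement suggested by this deprecation: use `AsArray::as_primitive` keyed
// by `Decimal128Type` instead of the old `as_decimal_array` helper. Assumes the
// arrow_array crate from this patch series; the values are illustrative.
use std::sync::Arc;

use arrow_array::cast::AsArray;
use arrow_array::types::Decimal128Type;
use arrow_array::{ArrayRef, Decimal128Array};

fn main() {
    let array: ArrayRef = Arc::new(
        Decimal128Array::from(vec![Some(123), None, Some(1111)])
            .with_precision_and_scale(10, 2)
            .unwrap(),
    );

    // Equivalent to the deprecated `as_decimal_array(&array)`.
    let decimals = array.as_primitive::<Decimal128Type>();
    assert_eq!(decimals.value(0), 123);
}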
+#[deprecated(note = "please use `as_primitive_array::` instead")] +pub fn as_decimal_array(arr: &dyn Array) -> &PrimitiveArray { + as_primitive_array::(arr) +} /// Downcasts a `dyn Array` to a concrete type /// @@ -876,18 +881,6 @@ mod tests { use super::*; - #[test] - fn test_as_decimal_array_ref() { - let array: Decimal128Array = vec![Some(123), None, Some(1111)] - .into_iter() - .collect::() - .with_precision_and_scale(10, 2) - .unwrap(); - assert!(!as_decimal_array(&array).is_empty()); - let result_decimal = as_decimal_array(&array); - assert_eq!(result_decimal, &array); - } - #[test] fn test_as_primitive_array_ref() { let array: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect(); From 5976ae45e7fc3e4bed2ea0916162432d96e97f96 Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Sun, 4 Jun 2023 22:58:05 +0300 Subject: [PATCH 0983/1411] Minor: float16 to json (#4358) * Minor: float16 to json * feat: Float16 JSON Reader * fix: clippy * fix: cargo fmt --- arrow-cast/Cargo.toml | 1 + arrow-cast/src/parse.rs | 9 +++++++++ arrow-json/src/reader/mod.rs | 5 +++-- arrow-json/src/reader/primitive_array.rs | 7 +++++++ arrow-json/src/writer.rs | 5 +++++ arrow-json/test/data/basic.json | 4 ++-- 6 files changed, 27 insertions(+), 4 deletions(-) diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index a999fe51739d..ebfadeb99f1a 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -46,6 +46,7 @@ arrow-data = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } chrono = { version = "0.4.23", default-features = false, features = ["clock"] } +half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } comfy-table = { version = "6.0", optional = true, default-features = false } diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index fd248f2be850..fa0ed9979d8e 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -21,6 +21,7 @@ use arrow_array::{ArrowNativeTypeOp, ArrowPrimitiveType}; use arrow_buffer::ArrowNativeType; use arrow_schema::ArrowError; use chrono::prelude::*; +use half::f16; use std::str::FromStr; /// Parse nanoseconds from the first `N` values in digits, subtracting the offset `O` @@ -436,6 +437,14 @@ pub trait Parser: ArrowPrimitiveType { } } +impl Parser for Float16Type { + fn parse(string: &str) -> Option { + lexical_core::parse(string.as_bytes()) + .ok() + .map(f16::from_f32) + } +} + impl Parser for Float32Type { fn parse(string: &str) -> Option { lexical_core::parse(string.as_bytes()).ok() diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index 5f1a2bb43f8a..dd58e1e1a4d9 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -591,6 +591,7 @@ fn make_decoder( downcast_integer! 
{ data_type => (primitive_decoder, data_type), DataType::Null => Ok(Box::::default()), + DataType::Float16 => primitive_decoder!(Float16Type, data_type), DataType::Float32 => primitive_decoder!(Float32Type, data_type), DataType::Float64 => primitive_decoder!(Float64Type, data_type), DataType::Timestamp(TimeUnit::Second, None) => { @@ -1422,7 +1423,7 @@ mod tests { let mut reader = read_file("test/data/basic.json", None); let batch = reader.next().unwrap().unwrap(); - assert_eq!(7, batch.num_columns()); + assert_eq!(8, batch.num_columns()); assert_eq!(12, batch.num_rows()); let schema = reader.schema(); @@ -1941,7 +1942,7 @@ mod tests { let mut sum_a = 0; for batch in reader { let batch = batch.unwrap(); - assert_eq!(7, batch.num_columns()); + assert_eq!(8, batch.num_columns()); sum_num_rows += batch.num_rows(); num_batches += 1; let batch_schema = batch.schema(); diff --git a/arrow-json/src/reader/primitive_array.rs b/arrow-json/src/reader/primitive_array.rs index cde52391f654..c78e4d914060 100644 --- a/arrow-json/src/reader/primitive_array.rs +++ b/arrow-json/src/reader/primitive_array.rs @@ -23,6 +23,7 @@ use arrow_array::{Array, ArrowPrimitiveType}; use arrow_cast::parse::Parser; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; +use half::f16; use crate::reader::tape::{Tape, TapeElement}; use crate::reader::ArrayDecoder; @@ -54,6 +55,12 @@ macro_rules! primitive_parse { primitive_parse!(i8, i16, i32, i64, u8, u16, u32, u64); +impl ParseJsonNumber for f16 { + fn parse(s: &[u8]) -> Option { + lexical_core::parse::(s).ok().map(f16::from_f32) + } +} + impl ParseJsonNumber for f32 { fn parse(s: &[u8]) -> Option { lexical_core::parse::(s).ok() diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index e6c960aef271..d00662a7228d 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -174,6 +174,7 @@ pub fn array_to_json_array(array: &ArrayRef) -> Result, ArrowError> { DataType::UInt16 => primitive_array_to_json::(array), DataType::UInt32 => primitive_array_to_json::(array), DataType::UInt64 => primitive_array_to_json::(array), + DataType::Float16 => primitive_array_to_json::(array), DataType::Float32 => primitive_array_to_json::(array), DataType::Float64 => primitive_array_to_json::(array), DataType::List(_) => as_list_array(array) @@ -264,6 +265,9 @@ fn set_column_for_json_rows( DataType::UInt64 => { set_column_by_primitive_type::(rows, array, col_name); } + DataType::Float16 => { + set_column_by_primitive_type::(rows, array, col_name); + } DataType::Float32 => { set_column_by_primitive_type::(rows, array, col_name); } @@ -1452,6 +1456,7 @@ mod tests { Field::new("e", DataType::Utf8, true), Field::new("f", DataType::Utf8, true), Field::new("g", DataType::Timestamp(TimeUnit::Millisecond, None), true), + Field::new("h", DataType::Float16, true), ])); let mut reader = ReaderBuilder::new(schema.clone()) diff --git a/arrow-json/test/data/basic.json b/arrow-json/test/data/basic.json index 598838dfc536..a6a8766bf97c 100644 --- a/arrow-json/test/data/basic.json +++ b/arrow-json/test/data/basic.json @@ -1,5 +1,5 @@ -{"a":1, "b":2.0, "c":false, "d":"4", "e":"1970-1-2", "f": "1.02", "g": "2012-04-23T18:25:43.511"} -{"a":-10, "b":-3.5, "c":true, "d":"4", "e": "1969-12-31", "f": "-0.3", "g": "2016-04-23T18:25:43.511"} +{"a":1, "b":2.0, "c":false, "d":"4", "e":"1970-1-2", "f": "1.02", "g": "2012-04-23T18:25:43.511", "h": 1.1} +{"a":-10, "b":-3.5, "c":true, "d":"4", "e": "1969-12-31", "f": "-0.3", "g": "2016-04-23T18:25:43.511", "h": 3.141} {"a":2, "b":0.6, 
"c":false, "d":"text", "e": "1970-01-02 11:11:11", "f": "1377.223"} {"a":1, "b":2.0, "c":false, "d":"4", "f": "1337.009"} {"a":7, "b":-3.5, "c":true, "d":"4", "f": "1"} From ac9c6fa134280581c7e19750910b6c74153a75d4 Mon Sep 17 00:00:00 2001 From: dadepo Date: Mon, 5 Jun 2023 13:52:24 +0400 Subject: [PATCH 0984/1411] Add support for FixedSizeList in array_to_json_array (#4361) * Add support for FixedSizeList in array_to_json_array * Properly named the test function * Added iter implementation FixedSizeListArray. Updated array_to_json_array to use it --- .../src/array/fixed_size_list_array.rs | 26 ++++++++++++-- arrow-array/src/cast.rs | 9 +++++ arrow-array/src/iterator.rs | 3 ++ arrow-json/src/writer.rs | 36 +++++++++++++++++-- 4 files changed, 69 insertions(+), 5 deletions(-) diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 3df108ced04f..6c1598ce90df 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -17,6 +17,7 @@ use crate::array::print_long_array; use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder}; +use crate::iterator::FixedSizeListIter; use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType}; use arrow_buffer::buffer::NullBuffer; use arrow_buffer::ArrowNativeType; @@ -277,6 +278,11 @@ impl FixedSizeListArray { } builder.finish() } + + /// constructs a new iterator + pub fn iter(&self) -> FixedSizeListIter<'_> { + FixedSizeListIter::new(self) + } } impl From for FixedSizeListArray { @@ -389,14 +395,28 @@ impl std::fmt::Debug for FixedSizeListArray { } } +impl<'a> ArrayAccessor for &'a FixedSizeListArray { + type Item = ArrayRef; + + fn value(&self, index: usize) -> Self::Item { + FixedSizeListArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + FixedSizeListArray::value(self, index) + } +} + #[cfg(test)] mod tests { - use super::*; + use arrow_buffer::{bit_util, BooleanBuffer, Buffer}; + use arrow_schema::Field; + use crate::cast::AsArray; use crate::types::Int32Type; use crate::Int32Array; - use arrow_buffer::{bit_util, BooleanBuffer, Buffer}; - use arrow_schema::Field; + + use super::*; #[test] fn test_fixed_size_list_array() { diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index af7e7d606020..1f8bb6587e58 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -593,6 +593,15 @@ pub fn as_list_array(arr: &dyn Array) -> &ListArray { as_generic_list_array::(arr) } +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`FixedSizeListArray`], panic'ing on failure. +#[inline] +pub fn as_fixed_size_list_array(arr: &dyn Array) -> &FixedSizeListArray { + arr.as_any() + .downcast_ref::() + .expect("Unable to downcast to fixed size list array") +} + /// Force downcast of an [`Array`], such as an [`ArrayRef`] to /// [`LargeListArray`], panic'ing on failure. 
#[inline] diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index ff99233129cf..fa76e09b2883 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -21,6 +21,7 @@ use crate::array::{ ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray, GenericStringArray, PrimitiveArray, }; +use crate::FixedSizeListArray; /// An iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`] /// @@ -124,6 +125,8 @@ pub type GenericStringIter<'a, T> = ArrayIter<&'a GenericStringArray>; pub type GenericBinaryIter<'a, T> = ArrayIter<&'a GenericBinaryArray>; /// an iterator that returns Some(T) or None, that can be used on any FixedSizeBinaryArray pub type FixedSizeBinaryIter<'a> = ArrayIter<&'a FixedSizeBinaryArray>; +/// an iterator that returns Some(T) or None, that can be used on any FixedSizeListArray +pub type FixedSizeListIter<'a> = ArrayIter<&'a FixedSizeListArray>; /// an iterator that returns Some(T) or None, that can be used on any ListArray pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray>; diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index d00662a7228d..d2365118a31d 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -191,6 +191,13 @@ pub fn array_to_json_array(array: &ArrayRef) -> Result, ArrowError> { None => Ok(Value::Null), }) .collect(), + DataType::FixedSizeList(_, _) => as_fixed_size_list_array(array) + .iter() + .map(|maybe_value| match maybe_value { + Some(v) => Ok(Value::Array(array_to_json_array(&v)?)), + None => Ok(Value::Null), + }) + .collect(), DataType::Struct(_) => { let jsonmaps = struct_array_to_jsonmap_array(array.as_struct())?; Ok(jsonmaps.into_iter().map(Value::Object).collect()) @@ -610,10 +617,12 @@ mod tests { use std::io::{BufReader, Seek}; use std::sync::Arc; - use crate::reader::*; + use serde_json::json; + use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::ArrayData; - use serde_json::json; + + use crate::reader::*; use super::*; @@ -1488,4 +1497,27 @@ mod tests { assert_eq!(serde_json::from_str::(r).unwrap(), expected_json,); } } + + #[test] + fn test_array_to_json_array_for_fixed_size_list_array() { + let expected_json = vec![ + json!([0, 1, 2]), + json!(null), + json!([3, null, 5]), + json!([6, 7, 45]), + ]; + + let data = vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7), Some(45)]), + ]; + + let list_array = + FixedSizeListArray::from_iter_primitive::(data, 3); + let list_array = Arc::new(list_array) as ArrayRef; + + assert_eq!(array_to_json_array(&list_array).unwrap(), expected_json); + } } From 72f84b21d5826238392a1c03518e3dd625288ef1 Mon Sep 17 00:00:00 2001 From: Josh Wiley Date: Mon, 5 Jun 2023 09:26:23 -0700 Subject: [PATCH 0985/1411] Arrow Cast: Fixed Point Arithmetic for Interval Parsing (#4291) * refactor(arrow-cast): interval parsing * fix(arrow-cast): clippy * feat(arrow-cast): use fixed point arith in interval parsing wip * feat(arrow-cast): tests for fixed point parsing, conversion, arith * feat(arrow-cast): FixedMonthDayNano impl wip * feat(arrow-cast): trunc/fract for fixed point numbers * fix(arrow-cast): better error messages with overflow errors during parsing * refactor(arrow-cast): fixed point number impls * feat(arrow-cast): add checked conversion from fixed point interval * feat(arrow-cast): use i64 for interval constants * feat(arrow-cast): fixed interval amounts wip * fix(arrow-cast): simplify interval add 
according to postgres logic * feat(arrow-cast): impl interval parsing * docs(arrow-cast): better parse doc comments * fix(arrow-cast): interval cast tests * fix(arrow-cast): clippy * fix(arrow-cast): remove interval compaction, interval amount parsing --- arrow-cast/src/cast.rs | 7 +- arrow-cast/src/parse.rs | 4023 +++++++++++++++++++++------------------ 2 files changed, 2210 insertions(+), 1820 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 752915f34ccc..ec8559d962e3 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -5769,7 +5769,10 @@ mod tests { i64::MAX - 2 ))], IntervalUnit::DayTime, - r#"Parser error: Parsed interval field value out of range: 11068046444225730000000 months 331764692165666300000000 days 28663672503769583000000000000000000000 nanos"# + format!( + "Compute error: Overflow happened on: {} * 100", + i64::MAX - 2 + ) ); test_unsafe_string_to_interval_err!( vec![Some(format!( @@ -5779,7 +5782,7 @@ mod tests { i64::MAX - 2 ))], IntervalUnit::MonthDayNano, - r#"Parser error: Parsed interval field value out of range: 110680464442257310000 months 3043712772162076000000 days 262179884170819100000000000000000000 nanos"# + format!("Compute error: Overflow happened on: {} * 12", i64::MAX - 2) ); } diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index fa0ed9979d8e..accce99b4649 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -1,1818 +1,2205 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
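// A minimal, standalone sketch (not taken from the diff itself) of the
// user-facing effect of moving interval parsing to fixed-point arithmetic:
// string components are parsed exactly, and a component that overflows now
// surfaces as a compute overflow error (or, it is assumed here, as a null under
// the default safe cast) instead of losing precision through f64. Crate names
// are assumed from this patch series.
use std::sync::Arc;

use arrow_array::cast::AsArray;
use arrow_array::types::IntervalMonthDayNanoType;
use arrow_array::{Array, ArrayRef, StringArray};
use arrow_cast::cast;
use arrow_schema::{DataType, IntervalUnit};

fn main() {
    let huge = format!("{} century", i64::MAX);
    let strings: ArrayRef = Arc::new(StringArray::from(vec![
        "1 year 2 months 3 days",
        huge.as_str(),
    ]));

    let parsed =
        cast(&strings, &DataType::Interval(IntervalUnit::MonthDayNano)).unwrap();
    let parsed = parsed.as_primitive::<IntervalMonthDayNanoType>();

    // "1 year 2 months" folds into 14 months; "3 days" fills the day field.
    assert_eq!(parsed.value(0), IntervalMonthDayNanoType::make_value(14, 3, 0));

    // i64::MAX centuries overflows the month computation, so the default
    // (safe) cast is expected to yield a null for that slot.
    assert!(parsed.is_null(1));
}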
- -use arrow_array::timezone::Tz; -use arrow_array::types::*; -use arrow_array::{ArrowNativeTypeOp, ArrowPrimitiveType}; -use arrow_buffer::ArrowNativeType; -use arrow_schema::ArrowError; -use chrono::prelude::*; -use half::f16; -use std::str::FromStr; - -/// Parse nanoseconds from the first `N` values in digits, subtracting the offset `O` -#[inline] -fn parse_nanos(digits: &[u8]) -> u32 { - digits[..N] - .iter() - .fold(0_u32, |acc, v| acc * 10 + v.wrapping_sub(O) as u32) - * 10_u32.pow((9 - N) as _) -} - -/// Helper for parsing timestamps -struct TimestampParser { - /// The timestamp bytes to parse minus `b'0'` - /// - /// This makes interpretation as an integer inexpensive - digits: [u8; 32], - /// A mask containing a `1` bit where the corresponding byte is a valid ASCII digit - mask: u32, -} - -impl TimestampParser { - fn new(bytes: &[u8]) -> Self { - let mut digits = [0; 32]; - let mut mask = 0; - - // Treating all bytes the same way, helps LLVM vectorise this correctly - for (idx, (o, i)) in digits.iter_mut().zip(bytes).enumerate() { - *o = i.wrapping_sub(b'0'); - mask |= ((*o < 10) as u32) << idx - } - - Self { digits, mask } - } - - /// Returns true if the byte at `idx` in the original string equals `b` - fn test(&self, idx: usize, b: u8) -> bool { - self.digits[idx] == b.wrapping_sub(b'0') - } - - /// Parses a date of the form `1997-01-31` - fn date(&self) -> Option { - if self.mask & 0b1111111111 != 0b1101101111 - || !self.test(4, b'-') - || !self.test(7, b'-') - { - return None; - } - - let year = self.digits[0] as u16 * 1000 - + self.digits[1] as u16 * 100 - + self.digits[2] as u16 * 10 - + self.digits[3] as u16; - - let month = self.digits[5] * 10 + self.digits[6]; - let day = self.digits[8] * 10 + self.digits[9]; - - NaiveDate::from_ymd_opt(year as _, month as _, day as _) - } - - /// Parses a time of any of forms - /// - `09:26:56` - /// - `09:26:56.123` - /// - `09:26:56.123456` - /// - `09:26:56.123456789` - /// - `092656` - /// - /// Returning the end byte offset - fn time(&self) -> Option<(NaiveTime, usize)> { - // Make a NaiveTime handling leap seconds - let time = |hour, min, sec, nano| match sec { - 60 => { - let nano = 1_000_000_000 + nano; - NaiveTime::from_hms_nano_opt(hour as _, min as _, 59, nano) - } - _ => NaiveTime::from_hms_nano_opt(hour as _, min as _, sec as _, nano), - }; - - match (self.mask >> 11) & 0b11111111 { - // 09:26:56 - 0b11011011 if self.test(13, b':') && self.test(16, b':') => { - let hour = self.digits[11] * 10 + self.digits[12]; - let minute = self.digits[14] * 10 + self.digits[15]; - let second = self.digits[17] * 10 + self.digits[18]; - - match self.test(19, b'.') { - true => { - let digits = (self.mask >> 20).trailing_ones(); - let nanos = match digits { - 0 => return None, - 1 => parse_nanos::<1, 0>(&self.digits[20..21]), - 2 => parse_nanos::<2, 0>(&self.digits[20..22]), - 3 => parse_nanos::<3, 0>(&self.digits[20..23]), - 4 => parse_nanos::<4, 0>(&self.digits[20..24]), - 5 => parse_nanos::<5, 0>(&self.digits[20..25]), - 6 => parse_nanos::<6, 0>(&self.digits[20..26]), - 7 => parse_nanos::<7, 0>(&self.digits[20..27]), - 8 => parse_nanos::<8, 0>(&self.digits[20..28]), - _ => parse_nanos::<9, 0>(&self.digits[20..29]), - }; - Some((time(hour, minute, second, nanos)?, 20 + digits as usize)) - } - false => Some((time(hour, minute, second, 0)?, 19)), - } - } - // 092656 - 0b111111 => { - let hour = self.digits[11] * 10 + self.digits[12]; - let minute = self.digits[13] * 10 + self.digits[14]; - let second = self.digits[15] * 10 + self.digits[16]; 
- let time = time(hour, minute, second, 0)?; - Some((time, 17)) - } - _ => None, - } - } -} - -/// Accepts a string and parses it relative to the provided `timezone` -/// -/// In addition to RFC3339 / ISO8601 standard timestamps, it also -/// accepts strings that use a space ` ` to separate the date and time -/// as well as strings that have no explicit timezone offset. -/// -/// Examples of accepted inputs: -/// * `1997-01-31T09:26:56.123Z` # RCF3339 -/// * `1997-01-31T09:26:56.123-05:00` # RCF3339 -/// * `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a space rather than T -/// * `2023-01-01 04:05:06.789 -08` # close to RCF3339, no fractional seconds or time separator -/// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified -/// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset -/// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds -/// * `1997-01-31 092656` # close to RCF3339, no fractional seconds -/// * `1997-01-31 092656+04:00` # close to RCF3339, no fractional seconds or time separator -/// * `1997-01-31` # close to RCF3339, only date no time -/// -/// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled -/// -/// * `2023-01-01 040506 America/Los_Angeles` -/// -/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error -/// will be returned -/// -/// Some formats supported by PostgresSql -/// are not supported, like -/// -/// * "2023-01-01 04:05:06.789 +07:30:00", -/// * "2023-01-01 040506 +07:30:00", -/// * "2023-01-01 04:05:06.789 PST", -/// -/// [IANA timezones]: https://www.iana.org/time-zones -pub fn string_to_datetime( - timezone: &T, - s: &str, -) -> Result, ArrowError> { - let err = |ctx: &str| { - ArrowError::ParseError(format!("Error parsing timestamp from '{s}': {ctx}")) - }; - - let bytes = s.as_bytes(); - if bytes.len() < 10 { - return Err(err("timestamp must contain at least 10 characters")); - } - - let parser = TimestampParser::new(bytes); - let date = parser.date().ok_or_else(|| err("error parsing date"))?; - if bytes.len() == 10 { - let offset = timezone.offset_from_local_date(&date); - let offset = offset - .single() - .ok_or_else(|| err("error computing timezone offset"))?; - - let time = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); - return Ok(DateTime::from_local(date.and_time(time), offset)); - } - - if !parser.test(10, b'T') && !parser.test(10, b't') && !parser.test(10, b' ') { - return Err(err("invalid timestamp separator")); - } - - let (time, mut tz_offset) = parser.time().ok_or_else(|| err("error parsing time"))?; - let datetime = date.and_time(time); - - if tz_offset == 32 { - // Decimal overrun - while tz_offset < bytes.len() && bytes[tz_offset].is_ascii_digit() { - tz_offset += 1; - } - } - - if bytes.len() <= tz_offset { - let offset = timezone.offset_from_local_datetime(&datetime); - let offset = offset - .single() - .ok_or_else(|| err("error computing timezone offset"))?; - return Ok(DateTime::from_local(datetime, offset)); - } - - if bytes[tz_offset] == b'z' || bytes[tz_offset] == b'Z' { - let offset = timezone.offset_from_local_datetime(&datetime); - let offset = offset - .single() - .ok_or_else(|| err("error computing timezone offset"))?; - return Ok(DateTime::from_utc(datetime, offset)); - } - - // Parse remainder of string as timezone - let parsed_tz: Tz = s[tz_offset..].trim_start().parse()?; - let offset = parsed_tz.offset_from_local_datetime(&datetime); - let offset = offset - .single() 
- .ok_or_else(|| err("error computing timezone offset"))?; - Ok(DateTime::::from_local(datetime, offset).with_timezone(timezone)) -} - -/// Accepts a string in RFC3339 / ISO8601 standard format and some -/// variants and converts it to a nanosecond precision timestamp. -/// -/// See [`string_to_datetime`] for the full set of supported formats -/// -/// Implements the `to_timestamp` function to convert a string to a -/// timestamp, following the model of spark SQL’s to_`timestamp`. -/// -/// Internally, this function uses the `chrono` library for the -/// datetime parsing -/// -/// We hope to extend this function in the future with a second -/// parameter to specifying the format string. -/// -/// ## Timestamp Precision -/// -/// Function uses the maximum precision timestamps supported by -/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This -/// means the range of dates that timestamps can represent is ~1677 AD -/// to 2262 AM -/// -/// ## Timezone / Offset Handling -/// -/// Numerical values of timestamps are stored compared to offset UTC. -/// -/// This function interprets string without an explicit time zone as timestamps -/// relative to UTC, see [`string_to_datetime`] for alternative semantics -/// -/// In particular: -/// -/// ``` -/// # use arrow_cast::parse::string_to_timestamp_nanos; -/// // Note all three of these timestamps are parsed as the same value -/// let a = string_to_timestamp_nanos("1997-01-31 09:26:56.123Z").unwrap(); -/// let b = string_to_timestamp_nanos("1997-01-31T09:26:56.123").unwrap(); -/// let c = string_to_timestamp_nanos("1997-01-31T14:26:56.123+05:00").unwrap(); -/// -/// assert_eq!(a, b); -/// assert_eq!(b, c); -/// ``` -/// -#[inline] -pub fn string_to_timestamp_nanos(s: &str) -> Result { - to_timestamp_nanos(string_to_datetime(&Utc, s)?.naive_utc()) -} - -/// Defensive check to prevent chrono-rs panics when nanosecond conversion happens on non-supported dates -#[inline] -fn to_timestamp_nanos(dt: NaiveDateTime) -> Result { - if dt.timestamp().checked_mul(1_000_000_000).is_none() { - return Err(ArrowError::ParseError( - ERR_NANOSECONDS_NOT_SUPPORTED.to_string(), - )); - } - - Ok(dt.timestamp_nanos()) -} - -/// Accepts a string in ISO8601 standard format and some -/// variants and converts it to nanoseconds since midnight. -/// -/// Examples of accepted inputs: -/// * `09:26:56.123 AM` -/// * `23:59:59` -/// * `6:00 pm` -// -/// Internally, this function uses the `chrono` library for the -/// time parsing -/// -/// ## Timezone / Offset Handling -/// -/// This function does not support parsing strings with a timezone -/// or offset specified, as it considers only time since midnight. -pub fn string_to_time_nanoseconds(s: &str) -> Result { - let nt = string_to_time(s).ok_or_else(|| { - ArrowError::ParseError(format!("Failed to parse \'{s}\' as time")) - })?; - Ok(nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64) -} - -fn string_to_time(s: &str) -> Option { - let bytes = s.as_bytes(); - if bytes.len() < 4 { - return None; - } - - let (am, bytes) = match bytes.get(bytes.len() - 3..) { - Some(b" AM" | b" am" | b" Am" | b" aM") => { - (Some(true), &bytes[..bytes.len() - 3]) - } - Some(b" PM" | b" pm" | b" pM" | b" Pm") => { - (Some(false), &bytes[..bytes.len() - 3]) - } - _ => (None, bytes), - }; - - if bytes.len() < 4 { - return None; - } - - let mut digits = [b'0'; 6]; - - // Extract hour - let bytes = match (bytes[1], bytes[2]) { - (b':', _) => { - digits[1] = bytes[0]; - &bytes[2..] 
- } - (_, b':') => { - digits[0] = bytes[0]; - digits[1] = bytes[1]; - &bytes[3..] - } - _ => return None, - }; - - if bytes.len() < 2 { - return None; // Minutes required - } - - // Extract minutes - digits[2] = bytes[0]; - digits[3] = bytes[1]; - - let nanoseconds = match bytes.get(2) { - Some(b':') => { - if bytes.len() < 5 { - return None; - } - - // Extract seconds - digits[4] = bytes[3]; - digits[5] = bytes[4]; - - // Extract sub-seconds if any - match bytes.get(5) { - Some(b'.') => { - let decimal = &bytes[6..]; - if decimal.iter().any(|x| !x.is_ascii_digit()) { - return None; - } - match decimal.len() { - 0 => return None, - 1 => parse_nanos::<1, b'0'>(decimal), - 2 => parse_nanos::<2, b'0'>(decimal), - 3 => parse_nanos::<3, b'0'>(decimal), - 4 => parse_nanos::<4, b'0'>(decimal), - 5 => parse_nanos::<5, b'0'>(decimal), - 6 => parse_nanos::<6, b'0'>(decimal), - 7 => parse_nanos::<7, b'0'>(decimal), - 8 => parse_nanos::<8, b'0'>(decimal), - _ => parse_nanos::<9, b'0'>(decimal), - } - } - Some(_) => return None, - None => 0, - } - } - Some(_) => return None, - None => 0, - }; - - digits.iter_mut().for_each(|x| *x = x.wrapping_sub(b'0')); - if digits.iter().any(|x| *x > 9) { - return None; - } - - let hour = match (digits[0] * 10 + digits[1], am) { - (12, Some(true)) => 0, // 12:00 AM -> 00:00 - (h @ 1..=11, Some(true)) => h, // 1:00 AM -> 01:00 - (12, Some(false)) => 12, // 12:00 PM -> 12:00 - (h @ 1..=11, Some(false)) => h + 12, // 1:00 PM -> 13:00 - (_, Some(_)) => return None, - (h, None) => h, - }; - - // Handle leap second - let (second, nanoseconds) = match digits[4] * 10 + digits[5] { - 60 => (59, nanoseconds + 1_000_000_000), - s => (s, nanoseconds), - }; - - NaiveTime::from_hms_nano_opt( - hour as _, - (digits[2] * 10 + digits[3]) as _, - second as _, - nanoseconds, - ) -} - -/// Specialized parsing implementations -/// used by csv and json reader -pub trait Parser: ArrowPrimitiveType { - fn parse(string: &str) -> Option; - - fn parse_formatted(string: &str, _format: &str) -> Option { - Self::parse(string) - } -} - -impl Parser for Float16Type { - fn parse(string: &str) -> Option { - lexical_core::parse(string.as_bytes()) - .ok() - .map(f16::from_f32) - } -} - -impl Parser for Float32Type { - fn parse(string: &str) -> Option { - lexical_core::parse(string.as_bytes()).ok() - } -} - -impl Parser for Float64Type { - fn parse(string: &str) -> Option { - lexical_core::parse(string.as_bytes()).ok() - } -} - -macro_rules! 
parser_primitive { - ($t:ty) => { - impl Parser for $t { - fn parse(string: &str) -> Option { - lexical_core::parse::(string.as_bytes()).ok() - } - } - }; -} -parser_primitive!(UInt64Type); -parser_primitive!(UInt32Type); -parser_primitive!(UInt16Type); -parser_primitive!(UInt8Type); -parser_primitive!(Int64Type); -parser_primitive!(Int32Type); -parser_primitive!(Int16Type); -parser_primitive!(Int8Type); - -impl Parser for TimestampNanosecondType { - fn parse(string: &str) -> Option { - string_to_timestamp_nanos(string).ok() - } -} - -impl Parser for TimestampMicrosecondType { - fn parse(string: &str) -> Option { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1000) - } -} - -impl Parser for TimestampMillisecondType { - fn parse(string: &str) -> Option { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1_000_000) - } -} - -impl Parser for TimestampSecondType { - fn parse(string: &str) -> Option { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1_000_000_000) - } -} - -impl Parser for Time64NanosecondType { - // Will truncate any fractions of a nanosecond - fn parse(string: &str) -> Option { - string_to_time_nanoseconds(string) - .ok() - .or_else(|| string.parse::().ok()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i64 * 1_000_000_000 - + nt.nanosecond() as i64, - ) - } -} - -impl Parser for Time64MicrosecondType { - // Will truncate any fractions of a microsecond - fn parse(string: &str) -> Option { - string_to_time_nanoseconds(string) - .ok() - .map(|nanos| nanos / 1_000) - .or_else(|| string.parse::().ok()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i64 * 1_000_000 - + nt.nanosecond() as i64 / 1_000, - ) - } -} - -impl Parser for Time32MillisecondType { - // Will truncate any fractions of a millisecond - fn parse(string: &str) -> Option { - string_to_time_nanoseconds(string) - .ok() - .map(|nanos| (nanos / 1_000_000) as i32) - .or_else(|| string.parse::().ok()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i32 * 1_000 - + nt.nanosecond() as i32 / 1_000_000, - ) - } -} - -impl Parser for Time32SecondType { - // Will truncate any fractions of a second - fn parse(string: &str) -> Option { - string_to_time_nanoseconds(string) - .ok() - .map(|nanos| (nanos / 1_000_000_000) as i32) - .or_else(|| string.parse::().ok()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i32 - + nt.nanosecond() as i32 / 1_000_000_000, - ) - } -} - -/// Number of days between 0001-01-01 and 1970-01-01 -const EPOCH_DAYS_FROM_CE: i32 = 719_163; - -/// Error message if nanosecond conversion request beyond supported interval -const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; - -impl Parser for Date32Type { - fn parse(string: &str) -> Option { - let parser = TimestampParser::new(string.as_bytes()); - let date = parser.date()?; - Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - } - - fn parse_formatted(string: &str, format: &str) 
-> Option { - let date = NaiveDate::parse_from_str(string, format).ok()?; - Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - } -} - -impl Parser for Date64Type { - fn parse(string: &str) -> Option { - let date_time = string_to_datetime(&Utc, string).ok()?; - Some(date_time.timestamp_millis()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - use chrono::format::Fixed; - use chrono::format::StrftimeItems; - let fmt = StrftimeItems::new(format); - let has_zone = fmt.into_iter().any(|item| match item { - chrono::format::Item::Fixed(fixed_item) => matches!( - fixed_item, - Fixed::RFC2822 - | Fixed::RFC3339 - | Fixed::TimezoneName - | Fixed::TimezoneOffsetColon - | Fixed::TimezoneOffsetColonZ - | Fixed::TimezoneOffset - | Fixed::TimezoneOffsetZ - ), - _ => false, - }); - if has_zone { - let date_time = chrono::DateTime::parse_from_str(string, format).ok()?; - Some(date_time.timestamp_millis()) - } else { - let date_time = NaiveDateTime::parse_from_str(string, format).ok()?; - Some(date_time.timestamp_millis()) - } - } -} - -/// Parse the string format decimal value to i128/i256 format and checking the precision and scale. -/// The result value can't be out of bounds. -pub fn parse_decimal( - s: &str, - precision: u8, - scale: i8, -) -> Result { - let mut result = T::Native::usize_as(0); - let mut fractionals = 0; - let mut digits = 0; - let base = T::Native::usize_as(10); - - let bs = s.as_bytes(); - let (bs, negative) = match bs.first() { - Some(b'-') => (&bs[1..], true), - Some(b'+') => (&bs[1..], false), - _ => (bs, false), - }; - - if bs.is_empty() { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - - let mut bs = bs.iter(); - // Overflow checks are not required if 10^(precision - 1) <= T::MAX holds. - // Thus, if we validate the precision correctly, we can skip overflow checks. - while let Some(b) = bs.next() { - match b { - b'0'..=b'9' => { - if digits == 0 && *b == b'0' { - // Ignore leading zeros. - continue; - } - digits += 1; - result = result.mul_wrapping(base); - result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); - } - b'.' => { - for b in bs.by_ref() { - if !b.is_ascii_digit() { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - if fractionals == scale { - // We have processed all the digits that we need. All that - // is left is to validate that the rest of the string contains - // valid digits. - continue; - } - fractionals += 1; - digits += 1; - result = result.mul_wrapping(base); - result = - result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); - } - - // Fail on "." 
- if digits == 0 { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - } - _ => { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - } - } - - if fractionals < scale { - let exp = scale - fractionals; - if exp as u8 + digits > precision { - return Err(ArrowError::ParseError("parse decimal overflow".to_string())); - } - let mul = base.pow_wrapping(exp as _); - result = result.mul_wrapping(mul); - } else if digits > precision { - return Err(ArrowError::ParseError("parse decimal overflow".to_string())); - } - - Ok(if negative { - result.neg_wrapping() - } else { - result - }) -} - -pub fn parse_interval_year_month( - value: &str, -) -> Result<::Native, ArrowError> { - let (result_months, result_days, result_nanos) = parse_interval("years", value)?; - if result_days != 0 || result_nanos != 0 { - return Err(ArrowError::CastError(format!( - "Cannot cast {value} to IntervalYearMonth. Only year and month fields are allowed." - ))); - } - Ok(IntervalYearMonthType::make_value(0, result_months)) -} - -pub fn parse_interval_day_time( - value: &str, -) -> Result<::Native, ArrowError> { - let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?; - if result_nanos % 1_000_000 != 0 { - return Err(ArrowError::CastError(format!( - "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds" - ))); - } - result_days += result_months * 30; - Ok(IntervalDayTimeType::make_value( - result_days, - (result_nanos / 1_000_000) as i32, - )) -} - -pub fn parse_interval_month_day_nano( - value: &str, -) -> Result<::Native, ArrowError> { - let (result_months, result_days, result_nanos) = parse_interval("months", value)?; - Ok(IntervalMonthDayNanoType::make_value( - result_months, - result_days, - result_nanos, - )) -} - -const SECONDS_PER_HOUR: f64 = 3_600_f64; -const NANOS_PER_MILLIS: f64 = 1_000_000_f64; -const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS; -#[cfg(test)] -const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND; -#[cfg(test)] -const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE; -#[cfg(test)] -const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR; - -#[rustfmt::skip] -#[derive(Clone, Copy)] -#[repr(u16)] -enum IntervalType { - Century = 0b_0000_0000_0001, - Decade = 0b_0000_0000_0010, - Year = 0b_0000_0000_0100, - Month = 0b_0000_0000_1000, - Week = 0b_0000_0001_0000, - Day = 0b_0000_0010_0000, - Hour = 0b_0000_0100_0000, - Minute = 0b_0000_1000_0000, - Second = 0b_0001_0000_0000, - Millisecond = 0b_0010_0000_0000, - Microsecond = 0b_0100_0000_0000, - Nanosecond = 0b_1000_0000_0000, -} - -impl FromStr for IntervalType { - type Err = ArrowError; - - fn from_str(s: &str) -> Result { - match s.to_lowercase().as_str() { - "century" | "centuries" => Ok(Self::Century), - "decade" | "decades" => Ok(Self::Decade), - "year" | "years" => Ok(Self::Year), - "month" | "months" => Ok(Self::Month), - "week" | "weeks" => Ok(Self::Week), - "day" | "days" => Ok(Self::Day), - "hour" | "hours" => Ok(Self::Hour), - "minute" | "minutes" => Ok(Self::Minute), - "second" | "seconds" => Ok(Self::Second), - "millisecond" | "milliseconds" => Ok(Self::Millisecond), - "microsecond" | "microseconds" => Ok(Self::Microsecond), - "nanosecond" | "nanoseconds" => Ok(Self::Nanosecond), - _ => Err(ArrowError::NotYetImplemented(format!( - "Unknown interval type: {s}" - ))), - } - } -} - -pub type MonthDayNano = (i32, i32, i64); - -/// parse string value to a triple of 
aligned months, days, nanos. -/// leading field is the default unit. e.g. `INTERVAL 1` represents `INTERVAL 1 SECOND` when leading_filed = 'second' -fn parse_interval(leading_field: &str, value: &str) -> Result { - let mut used_interval_types = 0; - - let mut calculate_from_part = |interval_period_str: &str, - interval_type: &str| - -> Result<(i32, i32, i64), ArrowError> { - // TODO: Use fixed-point arithmetic to avoid truncation and rounding errors (#3809) - let interval_period = match f64::from_str(interval_period_str) { - Ok(n) => n, - Err(_) => { - return Err(ArrowError::NotYetImplemented(format!( - "Unsupported Interval Expression with value {value:?}" - ))); - } - }; - - if interval_period > (i64::MAX as f64) { - return Err(ArrowError::ParseError(format!( - "Interval field value out of range: {value:?}" - ))); - } - - let it = IntervalType::from_str(interval_type).map_err(|_| { - ArrowError::ParseError(format!( - "Invalid input syntax for type interval: {value:?}" - )) - })?; - - // Disallow duplicate interval types - if used_interval_types & (it as u16) != 0 { - return Err(ArrowError::ParseError(format!( - "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'" - ))); - } else { - used_interval_types |= it as u16; - } - - match it { - IntervalType::Century => { - align_interval_parts(interval_period.mul_checked(1200_f64)?, 0.0, 0.0) - } - IntervalType::Decade => { - align_interval_parts(interval_period.mul_checked(120_f64)?, 0.0, 0.0) - } - IntervalType::Year => { - align_interval_parts(interval_period.mul_checked(12_f64)?, 0.0, 0.0) - } - IntervalType::Month => align_interval_parts(interval_period, 0.0, 0.0), - IntervalType::Week => align_interval_parts(0.0, interval_period * 7_f64, 0.0), - IntervalType::Day => align_interval_parts(0.0, interval_period, 0.0), - IntervalType::Hour => Ok(( - 0, - 0, - (interval_period.mul_checked(SECONDS_PER_HOUR * NANOS_PER_SECOND))? - as i64, - )), - IntervalType::Minute => Ok(( - 0, - 0, - (interval_period.mul_checked(60_f64 * NANOS_PER_SECOND))? as i64, - )), - IntervalType::Second => Ok(( - 0, - 0, - (interval_period.mul_checked(NANOS_PER_SECOND))? as i64, - )), - IntervalType::Millisecond => { - Ok((0, 0, (interval_period.mul_checked(1_000_000f64))? as i64)) - } - IntervalType::Microsecond => { - Ok((0, 0, (interval_period.mul_checked(1_000f64)?) as i64)) - } - IntervalType::Nanosecond => Ok((0, 0, interval_period as i64)), - } - }; - - let mut result_month: i32 = 0; - let mut result_days: i32 = 0; - let mut result_nanos: i64 = 0; - - let mut parts = value.split_whitespace(); - - while let Some(interval_period_str) = parts.next() { - let unit = parts.next().unwrap_or(leading_field); - - let (diff_month, diff_days, diff_nanos) = - calculate_from_part(interval_period_str, unit)?; - - result_month = - result_month - .checked_add(diff_month) - .ok_or(ArrowError::ParseError(format!( - "Interval field value out of range: {value:?}" - )))?; - - result_days = - result_days - .checked_add(diff_days) - .ok_or(ArrowError::ParseError(format!( - "Interval field value out of range: {value:?}" - )))?; - - result_nanos = - result_nanos - .checked_add(diff_nanos) - .ok_or(ArrowError::ParseError(format!( - "Interval field value out of range: {value:?}" - )))?; - } - - Ok((result_month, result_days, result_nanos)) -} - -/// The fractional units must be spilled to smaller units. 
-/// [reference Postgresql doc](https://www.postgresql.org/docs/15/datatype-datetime.html#DATATYPE-INTERVAL-INPUT:~:text=Field%20values%20can,fractional%20on%20output.) -/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days -/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours -fn align_interval_parts( - month_part: f64, - mut day_part: f64, - mut nanos_part: f64, -) -> Result<(i32, i32, i64), ArrowError> { - // Convert fractional month to days, It's not supported by Arrow types, but anyway - day_part += (month_part - (month_part as i64) as f64) * 30_f64; - - // Convert fractional days to hours - nanos_part += (day_part - ((day_part as i64) as f64)) - * 24_f64 - * SECONDS_PER_HOUR - * NANOS_PER_SECOND; - - if month_part > i32::MAX as f64 - || month_part < i32::MIN as f64 - || day_part > i32::MAX as f64 - || day_part < i32::MIN as f64 - || nanos_part > i64::MAX as f64 - || nanos_part < i64::MIN as f64 - { - return Err(ArrowError::ParseError(format!( - "Parsed interval field value out of range: {month_part} months {day_part} days {nanos_part} nanos" - ))); - } - - Ok((month_part as i32, day_part as i32, nanos_part as i64)) -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::timezone::Tz; - use arrow_buffer::i256; - - #[test] - fn test_parse_nanos() { - assert_eq!(parse_nanos::<3, 0>(&[1, 2, 3]), 123_000_000); - assert_eq!(parse_nanos::<5, 0>(&[1, 2, 3, 4, 5]), 123_450_000); - assert_eq!(parse_nanos::<6, b'0'>(b"123456"), 123_456_000); - } - - #[test] - fn string_to_timestamp_timezone() { - // Explicit timezone - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855+00:00").unwrap() - ); - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855Z").unwrap() - ); - assert_eq!( - 1599572549000000000, - parse_timestamp("2020-09-08T13:42:29Z").unwrap() - ); // no fractional part - assert_eq!( - 1599590549190855000, - parse_timestamp("2020-09-08T13:42:29.190855-05:00").unwrap() - ); - } - - #[test] - fn string_to_timestamp_timezone_space() { - // Ensure space rather than T between time and date is accepted - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855+00:00").unwrap() - ); - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855Z").unwrap() - ); - assert_eq!( - 1599572549000000000, - parse_timestamp("2020-09-08 13:42:29Z").unwrap() - ); // no fractional part - assert_eq!( - 1599590549190855000, - parse_timestamp("2020-09-08 13:42:29.190855-05:00").unwrap() - ); - } - - #[test] - #[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function: mktime - fn string_to_timestamp_no_timezone() { - // This test is designed to succeed in regardless of the local - // timezone the test machine is running. 
Thus it is still - // somewhat susceptible to bugs in the use of chrono - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29.190855").unwrap() - ); - - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29.190855").unwrap() - ); - - // Also ensure that parsing timestamps with no fractional - // second part works as well - let naive_datetime_whole_secs = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_opt(13, 42, 29).unwrap(), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime_whole_secs.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29").unwrap() - ); - - assert_eq!( - naive_datetime_whole_secs.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29").unwrap() - ); - - // ensure without time work - // no time, should be the nano second at - // 2020-09-08 0:0:0 - let naive_datetime_no_time = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_opt(0, 0, 0).unwrap(), - ); - - assert_eq!( - naive_datetime_no_time.timestamp_nanos(), - parse_timestamp("2020-09-08").unwrap() - ) - } - - #[test] - fn string_to_timestamp_chrono() { - let cases = [ - "2020-09-08T13:42:29Z", - "1969-01-01T00:00:00.1Z", - "2020-09-08T12:00:12.12345678+00:00", - "2020-09-08T12:00:12+00:00", - "2020-09-08T12:00:12.1+00:00", - "2020-09-08T12:00:12.12+00:00", - "2020-09-08T12:00:12.123+00:00", - "2020-09-08T12:00:12.1234+00:00", - "2020-09-08T12:00:12.12345+00:00", - "2020-09-08T12:00:12.123456+00:00", - "2020-09-08T12:00:12.1234567+00:00", - "2020-09-08T12:00:12.12345678+00:00", - "2020-09-08T12:00:12.123456789+00:00", - "2020-09-08T12:00:12.12345678912z", - "2020-09-08T12:00:12.123456789123Z", - "2020-09-08T12:00:12.123456789123+02:00", - "2020-09-08T12:00:12.12345678912345Z", - "2020-09-08T12:00:12.1234567891234567+02:00", - "2020-09-08T12:00:60Z", - "2020-09-08T12:00:60.123Z", - "2020-09-08T12:00:60.123456+02:00", - "2020-09-08T12:00:60.1234567891234567+02:00", - "2020-09-08T12:00:60.999999999+02:00", - "2020-09-08t12:00:12.12345678+00:00", - "2020-09-08t12:00:12+00:00", - "2020-09-08t12:00:12Z", - ]; - - for case in cases { - let chrono = DateTime::parse_from_rfc3339(case).unwrap(); - let chrono_utc = chrono.with_timezone(&Utc); - - let custom = string_to_datetime(&Utc, case).unwrap(); - assert_eq!(chrono_utc, custom) - } - } - - #[test] - fn string_to_timestamp_naive() { - let cases = [ - "2018-11-13T17:11:10.011375885995", - "2030-12-04T17:11:10.123", - "2030-12-04T17:11:10.1234", - "2030-12-04T17:11:10.123456", - ]; - for case in cases { - let chrono = - NaiveDateTime::parse_from_str(case, "%Y-%m-%dT%H:%M:%S%.f").unwrap(); - let custom = string_to_datetime(&Utc, case).unwrap(); - assert_eq!(chrono, custom.naive_utc()) - } - } - - #[test] - fn string_to_timestamp_invalid() { - // Test parsing invalid formats - let cases = [ - ("", "timestamp must contain at least 10 characters"), - ("SS", "timestamp must contain at least 10 characters"), - ("Wed, 18 Feb 2015 23:16:09 GMT", "error parsing date"), - ("1997-01-31H09:26:56.123Z", "invalid timestamp separator"), - ("1997-01-31 09:26:56.123Z", "error parsing time"), - ("1997:01:31T09:26:56.123Z", "error parsing date"), - ("1997:1:31T09:26:56.123Z", "error parsing date"), - 
("1997-01-32T09:26:56.123Z", "error parsing date"), - ("1997-13-32T09:26:56.123Z", "error parsing date"), - ("1997-02-29T09:26:56.123Z", "error parsing date"), - ("2015-02-30T17:35:20-08:00", "error parsing date"), - ("1997-01-10T9:26:56.123Z", "error parsing time"), - ("2015-01-20T25:35:20-08:00", "error parsing time"), - ("1997-01-10T09:61:56.123Z", "error parsing time"), - ("1997-01-10T09:61:90.123Z", "error parsing time"), - ("1997-01-10T12:00:6.123Z", "error parsing time"), - ("1997-01-31T092656.123Z", "error parsing time"), - ("1997-01-10T12:00:06.", "error parsing time"), - ("1997-01-10T12:00:06. ", "error parsing time"), - ]; - - for (s, ctx) in cases { - let expected = - format!("Parser error: Error parsing timestamp from '{s}': {ctx}"); - let actual = string_to_datetime(&Utc, s).unwrap_err().to_string(); - assert_eq!(actual, expected) - } - } - - // Parse a timestamp to timestamp int with a useful human readable error message - fn parse_timestamp(s: &str) -> Result { - let result = string_to_timestamp_nanos(s); - if let Err(e) = &result { - eprintln!("Error parsing timestamp '{s}': {e:?}"); - } - result - } - - #[test] - fn string_without_timezone_to_timestamp() { - // string without timezone should always output the same regardless the local or session timezone - - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29.190855").unwrap() - ); - - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29.190855").unwrap() - ); - - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_nano_opt(13, 42, 29, 0).unwrap(), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29").unwrap() - ); - - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29").unwrap() - ); - - let tz: Tz = "+02:00".parse().unwrap(); - let date = string_to_datetime(&tz, "2020-09-08 13:42:29").unwrap(); - let utc = date.naive_utc().to_string(); - assert_eq!(utc, "2020-09-08 11:42:29"); - let local = date.naive_local().to_string(); - assert_eq!(local, "2020-09-08 13:42:29"); - - let date = string_to_datetime(&tz, "2020-09-08 13:42:29Z").unwrap(); - let utc = date.naive_utc().to_string(); - assert_eq!(utc, "2020-09-08 13:42:29"); - let local = date.naive_local().to_string(); - assert_eq!(local, "2020-09-08 15:42:29"); - - let dt = - NaiveDateTime::parse_from_str("2020-09-08T13:42:29Z", "%Y-%m-%dT%H:%M:%SZ") - .unwrap(); - let local: Tz = "+08:00".parse().unwrap(); - - // Parsed as offset from UTC - let date = string_to_datetime(&local, "2020-09-08T13:42:29Z").unwrap(); - assert_eq!(dt, date.naive_utc()); - assert_ne!(dt, date.naive_local()); - - // Parsed as offset from local - let date = string_to_datetime(&local, "2020-09-08 13:42:29").unwrap(); - assert_eq!(dt, date.naive_local()); - assert_ne!(dt, date.naive_utc()); - } - - #[test] - fn parse_time64_nanos() { - assert_eq!( - Time64NanosecondType::parse("02:10:01.1234567899999999"), - Some(7_801_123_456_789) - ); - assert_eq!( - Time64NanosecondType::parse("02:10:01.1234567"), - Some(7_801_123_456_700) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01.1234567"), - Some(7_801_123_456_700) - ); - assert_eq!( - 
Time64NanosecondType::parse("12:10:01.123456789 AM"), - Some(601_123_456_789) - ); - assert_eq!( - Time64NanosecondType::parse("12:10:01.123456789 am"), - Some(601_123_456_789) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01.12345678 PM"), - Some(51_001_123_456_780) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01.12345678 pm"), - Some(51_001_123_456_780) - ); - assert_eq!( - Time64NanosecondType::parse("02:10:01"), - Some(7_801_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01"), - Some(7_801_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("12:10:01 AM"), - Some(601_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("12:10:01 am"), - Some(601_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01 PM"), - Some(51_001_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01 pm"), - Some(51_001_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("02:10"), - Some(7_800_000_000_000) - ); - assert_eq!(Time64NanosecondType::parse("2:10"), Some(7_800_000_000_000)); - assert_eq!( - Time64NanosecondType::parse("12:10 AM"), - Some(600_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("12:10 am"), - Some(600_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10 PM"), - Some(51_000_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10 pm"), - Some(51_000_000_000_000) - ); - - // parse directly as nanoseconds - assert_eq!(Time64NanosecondType::parse("1"), Some(1)); - - // leap second - assert_eq!( - Time64NanosecondType::parse("23:59:60"), - Some(86_400_000_000_000) - ); - - // custom format - assert_eq!( - Time64NanosecondType::parse_formatted( - "02 - 10 - 01 - .1234567", - "%H - %M - %S - %.f" - ), - Some(7_801_123_456_700) - ); - } - - #[test] - fn parse_time64_micros() { - // expected formats - assert_eq!( - Time64MicrosecondType::parse("02:10:01.1234"), - Some(7_801_123_400) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01.1234"), - Some(7_801_123_400) - ); - assert_eq!( - Time64MicrosecondType::parse("12:10:01.123456 AM"), - Some(601_123_456) - ); - assert_eq!( - Time64MicrosecondType::parse("12:10:01.123456 am"), - Some(601_123_456) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01.12345 PM"), - Some(51_001_123_450) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01.12345 pm"), - Some(51_001_123_450) - ); - assert_eq!( - Time64MicrosecondType::parse("02:10:01"), - Some(7_801_000_000) - ); - assert_eq!(Time64MicrosecondType::parse("2:10:01"), Some(7_801_000_000)); - assert_eq!( - Time64MicrosecondType::parse("12:10:01 AM"), - Some(601_000_000) - ); - assert_eq!( - Time64MicrosecondType::parse("12:10:01 am"), - Some(601_000_000) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01 PM"), - Some(51_001_000_000) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01 pm"), - Some(51_001_000_000) - ); - assert_eq!(Time64MicrosecondType::parse("02:10"), Some(7_800_000_000)); - assert_eq!(Time64MicrosecondType::parse("2:10"), Some(7_800_000_000)); - assert_eq!(Time64MicrosecondType::parse("12:10 AM"), Some(600_000_000)); - assert_eq!(Time64MicrosecondType::parse("12:10 am"), Some(600_000_000)); - assert_eq!( - Time64MicrosecondType::parse("2:10 PM"), - Some(51_000_000_000) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10 pm"), - Some(51_000_000_000) - ); - - // parse directly as microseconds - assert_eq!(Time64MicrosecondType::parse("1"), Some(1)); - - // leap second - assert_eq!( - 
Time64MicrosecondType::parse("23:59:60"), - Some(86_400_000_000) - ); - - // custom format - assert_eq!( - Time64MicrosecondType::parse_formatted( - "02 - 10 - 01 - .1234", - "%H - %M - %S - %.f" - ), - Some(7_801_123_400) - ); - } - - #[test] - fn parse_time32_millis() { - // expected formats - assert_eq!(Time32MillisecondType::parse("02:10:01.1"), Some(7_801_100)); - assert_eq!(Time32MillisecondType::parse("2:10:01.1"), Some(7_801_100)); - assert_eq!( - Time32MillisecondType::parse("12:10:01.123 AM"), - Some(601_123) - ); - assert_eq!( - Time32MillisecondType::parse("12:10:01.123 am"), - Some(601_123) - ); - assert_eq!( - Time32MillisecondType::parse("2:10:01.12 PM"), - Some(51_001_120) - ); - assert_eq!( - Time32MillisecondType::parse("2:10:01.12 pm"), - Some(51_001_120) - ); - assert_eq!(Time32MillisecondType::parse("02:10:01"), Some(7_801_000)); - assert_eq!(Time32MillisecondType::parse("2:10:01"), Some(7_801_000)); - assert_eq!(Time32MillisecondType::parse("12:10:01 AM"), Some(601_000)); - assert_eq!(Time32MillisecondType::parse("12:10:01 am"), Some(601_000)); - assert_eq!(Time32MillisecondType::parse("2:10:01 PM"), Some(51_001_000)); - assert_eq!(Time32MillisecondType::parse("2:10:01 pm"), Some(51_001_000)); - assert_eq!(Time32MillisecondType::parse("02:10"), Some(7_800_000)); - assert_eq!(Time32MillisecondType::parse("2:10"), Some(7_800_000)); - assert_eq!(Time32MillisecondType::parse("12:10 AM"), Some(600_000)); - assert_eq!(Time32MillisecondType::parse("12:10 am"), Some(600_000)); - assert_eq!(Time32MillisecondType::parse("2:10 PM"), Some(51_000_000)); - assert_eq!(Time32MillisecondType::parse("2:10 pm"), Some(51_000_000)); - - // parse directly as milliseconds - assert_eq!(Time32MillisecondType::parse("1"), Some(1)); - - // leap second - assert_eq!(Time32MillisecondType::parse("23:59:60"), Some(86_400_000)); - - // custom format - assert_eq!( - Time32MillisecondType::parse_formatted( - "02 - 10 - 01 - .1", - "%H - %M - %S - %.f" - ), - Some(7_801_100) - ); - } - - #[test] - fn parse_time32_secs() { - // expected formats - assert_eq!(Time32SecondType::parse("02:10:01.1"), Some(7_801)); - assert_eq!(Time32SecondType::parse("02:10:01"), Some(7_801)); - assert_eq!(Time32SecondType::parse("2:10:01"), Some(7_801)); - assert_eq!(Time32SecondType::parse("12:10:01 AM"), Some(601)); - assert_eq!(Time32SecondType::parse("12:10:01 am"), Some(601)); - assert_eq!(Time32SecondType::parse("2:10:01 PM"), Some(51_001)); - assert_eq!(Time32SecondType::parse("2:10:01 pm"), Some(51_001)); - assert_eq!(Time32SecondType::parse("02:10"), Some(7_800)); - assert_eq!(Time32SecondType::parse("2:10"), Some(7_800)); - assert_eq!(Time32SecondType::parse("12:10 AM"), Some(600)); - assert_eq!(Time32SecondType::parse("12:10 am"), Some(600)); - assert_eq!(Time32SecondType::parse("2:10 PM"), Some(51_000)); - assert_eq!(Time32SecondType::parse("2:10 pm"), Some(51_000)); - - // parse directly as seconds - assert_eq!(Time32SecondType::parse("1"), Some(1)); - - // leap second - assert_eq!(Time32SecondType::parse("23:59:60"), Some(86400)); - - // custom format - assert_eq!( - Time32SecondType::parse_formatted("02 - 10 - 01", "%H - %M - %S"), - Some(7_801) - ); - } - - #[test] - fn test_string_to_time_invalid() { - let cases = [ - "25:00", - "9:00:", - "009:00", - "09:0:00", - "25:00:00", - "13:00 AM", - "13:00 PM", - "12:00. 
AM", - "09:0:00", - "09:01:0", - "09:01:1", - "9:1:0", - "09:01:0", - "1:00.123", - "1:00:00.123f", - " 9:00:00", - ":09:00", - "T9:00:00", - "AM", - ]; - for case in cases { - assert!(string_to_time(case).is_none(), "{case}"); - } - } - - #[test] - fn test_string_to_time_chrono() { - let cases = [ - ("1:00", "%H:%M"), - ("12:00", "%H:%M"), - ("13:00", "%H:%M"), - ("24:00", "%H:%M"), - ("1:00:00", "%H:%M:%S"), - ("12:00:30", "%H:%M:%S"), - ("13:00:59", "%H:%M:%S"), - ("24:00:60", "%H:%M:%S"), - ("09:00:00", "%H:%M:%S%.f"), - ("0:00:30.123456", "%H:%M:%S%.f"), - ("0:00 AM", "%I:%M %P"), - ("1:00 AM", "%I:%M %P"), - ("12:00 AM", "%I:%M %P"), - ("13:00 AM", "%I:%M %P"), - ("0:00 PM", "%I:%M %P"), - ("1:00 PM", "%I:%M %P"), - ("12:00 PM", "%I:%M %P"), - ("13:00 PM", "%I:%M %P"), - ("1:00 pM", "%I:%M %P"), - ("1:00 Pm", "%I:%M %P"), - ("1:00 aM", "%I:%M %P"), - ("1:00 Am", "%I:%M %P"), - ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), - ("1:00:30.123456789 PM", "%I:%M:%S%.f %P"), - ("1:00:30.123456789123 PM", "%I:%M:%S%.f %P"), - ("1:00:30.1234 PM", "%I:%M:%S%.f %P"), - ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), - ("1:00:30.123456789123456789 PM", "%I:%M:%S%.f %P"), - ("1:00:30.12F456 PM", "%I:%M:%S%.f %P"), - ]; - for (s, format) in cases { - let chrono = NaiveTime::parse_from_str(s, format).ok(); - let custom = string_to_time(s); - assert_eq!(chrono, custom, "{s}"); - } - } - - #[test] - fn test_parse_interval() { - assert_eq!( - (1i32, 0i32, 0i64), - parse_interval("months", "1 month").unwrap(), - ); - - assert_eq!( - (2i32, 0i32, 0i64), - parse_interval("months", "2 month").unwrap(), - ); - - assert_eq!( - (-1i32, -18i32, (-0.2 * NANOS_PER_DAY) as i64), - parse_interval("months", "-1.5 months -3.2 days").unwrap(), - ); - - assert_eq!( - (2i32, 10i32, (9.0 * NANOS_PER_HOUR) as i64), - parse_interval("months", "2.1 months 7.25 days 3 hours").unwrap(), - ); - - assert_eq!( - parse_interval("months", "1 centurys 1 month") - .unwrap_err() - .to_string(), - r#"Parser error: Invalid input syntax for type interval: "1 centurys 1 month""# - ); - - assert_eq!( - (37i32, 0i32, 0i64), - parse_interval("months", "3 year 1 month").unwrap(), - ); - - assert_eq!( - (35i32, 0i32, 0i64), - parse_interval("months", "3 year -1 month").unwrap(), - ); - - assert_eq!( - (-37i32, 0i32, 0i64), - parse_interval("months", "-3 year -1 month").unwrap(), - ); - - assert_eq!( - (-35i32, 0i32, 0i64), - parse_interval("months", "-3 year 1 month").unwrap(), - ); - - assert_eq!( - (0i32, 5i32, 0i64), - parse_interval("months", "5 days").unwrap(), - ); - - assert_eq!( - (0i32, 7i32, (3f64 * NANOS_PER_HOUR) as i64), - parse_interval("months", "7 days 3 hours").unwrap(), - ); - - assert_eq!( - (0i32, 7i32, (5f64 * NANOS_PER_MINUTE) as i64), - parse_interval("months", "7 days 5 minutes").unwrap(), - ); - - assert_eq!( - (0i32, 7i32, (-5f64 * NANOS_PER_MINUTE) as i64), - parse_interval("months", "7 days -5 minutes").unwrap(), - ); - - assert_eq!( - (0i32, -7i32, (5f64 * NANOS_PER_HOUR) as i64), - parse_interval("months", "-7 days 5 hours").unwrap(), - ); - - assert_eq!( - ( - 0i32, - -7i32, - (-5f64 * NANOS_PER_HOUR - - 5f64 * NANOS_PER_MINUTE - - 5f64 * NANOS_PER_SECOND) as i64 - ), - parse_interval("months", "-7 days -5 hours -5 minutes -5 seconds").unwrap(), - ); - - assert_eq!( - (12i32, 0i32, (25f64 * NANOS_PER_MILLIS) as i64), - parse_interval("months", "1 year 25 millisecond").unwrap(), - ); - - assert_eq!( - (12i32, 1i32, (0.000000001 * NANOS_PER_SECOND) as i64), - parse_interval("months", "1 year 1 day 0.000000001 
seconds").unwrap(), - ); - - assert_eq!( - (12i32, 1i32, (0.1 * NANOS_PER_MILLIS) as i64), - parse_interval("months", "1 year 1 day 0.1 milliseconds").unwrap(), - ); - - assert_eq!( - (12i32, 1i32, 1000i64), - parse_interval("months", "1 year 1 day 1 microsecond").unwrap(), - ); - - assert_eq!( - (12i32, 1i32, 1i64), - parse_interval("months", "1 year 1 day 1 nanoseconds").unwrap(), - ); - - assert_eq!( - (1i32, 0i32, (-NANOS_PER_SECOND) as i64), - parse_interval("months", "1 month -1 second").unwrap(), - ); - - assert_eq!( - (-13i32, -8i32, (- NANOS_PER_HOUR - NANOS_PER_MINUTE - NANOS_PER_SECOND - 1.11 * NANOS_PER_MILLIS) as i64), - parse_interval("months", "-1 year -1 month -1 week -1 day -1 hour -1 minute -1 second -1.11 millisecond").unwrap(), - ); - } - - #[test] - fn test_duplicate_interval_type() { - let err = parse_interval("months", "1 month 1 second 1 second") - .expect_err("parsing interval should have failed"); - assert_eq!( - r#"ParseError("Invalid input syntax for type interval: \"1 month 1 second 1 second\". Repeated type 'second'")"#, - format!("{err:?}") - ); - } - - #[test] - fn string_to_timestamp_old() { - parse_timestamp("1677-06-14T07:29:01.256") - .map_err(|e| assert!(e.to_string().ends_with(ERR_NANOSECONDS_NOT_SUPPORTED))) - .unwrap_err(); - } - - #[test] - fn test_parse_decimal_with_parameter() { - let tests = [ - ("0", 0i128), - ("123.123", 123123i128), - ("123.1234", 123123i128), - ("123.1", 123100i128), - ("123", 123000i128), - ("-123.123", -123123i128), - ("-123.1234", -123123i128), - ("-123.1", -123100i128), - ("-123", -123000i128), - ("0.0000123", 0i128), - ("12.", 12000i128), - ("-12.", -12000i128), - ("00.1", 100i128), - ("-00.1", -100i128), - ("12345678912345678.1234", 12345678912345678123i128), - ("-12345678912345678.1234", -12345678912345678123i128), - ("99999999999999999.999", 99999999999999999999i128), - ("-99999999999999999.999", -99999999999999999999i128), - (".123", 123i128), - ("-.123", -123i128), - ("123.", 123000i128), - ("-123.", -123000i128), - ]; - for (s, i) in tests { - let result_128 = parse_decimal::(s, 20, 3); - assert_eq!(i, result_128.unwrap()); - let result_256 = parse_decimal::(s, 20, 3); - assert_eq!(i256::from_i128(i), result_256.unwrap()); - } - let can_not_parse_tests = ["123,123", ".", "123.123.123", "", "+", "-"]; - for s in can_not_parse_tests { - let result_128 = parse_decimal::(s, 20, 3); - assert_eq!( - format!("Parser error: can't parse the string value {s} to decimal"), - result_128.unwrap_err().to_string() - ); - let result_256 = parse_decimal::(s, 20, 3); - assert_eq!( - format!("Parser error: can't parse the string value {s} to decimal"), - result_256.unwrap_err().to_string() - ); - } - let overflow_parse_tests = ["12345678", "12345678.9", "99999999.99"]; - for s in overflow_parse_tests { - let result_128 = parse_decimal::(s, 10, 3); - let expected_128 = "Parser error: parse decimal overflow"; - let actual_128 = result_128.unwrap_err().to_string(); - - assert!( - actual_128.contains(expected_128), - "actual: '{actual_128}', expected: '{expected_128}'" - ); - - let result_256 = parse_decimal::(s, 10, 3); - let expected_256 = "Parser error: parse decimal overflow"; - let actual_256 = result_256.unwrap_err().to_string(); - - assert!( - actual_256.contains(expected_256), - "actual: '{actual_256}', expected: '{expected_256}'" - ); - } - - let edge_tests_128 = [ - ( - "99999999999999999999999999999999999999", - 99999999999999999999999999999999999999i128, - 0, - ), - ( - "999999999999999999999999999999999999.99", - 
99999999999999999999999999999999999999i128, - 2, - ), - ( - "9999999999999999999999999.9999999999999", - 99999999999999999999999999999999999999i128, - 13, - ), - ( - "9999999999999999999999999", - 99999999999999999999999990000000000000i128, - 13, - ), - ( - "0.99999999999999999999999999999999999999", - 99999999999999999999999999999999999999i128, - 38, - ), - ]; - for (s, i, scale) in edge_tests_128 { - let result_128 = parse_decimal::(s, 38, scale); - assert_eq!(i, result_128.unwrap()); - } - let edge_tests_256 = [ - ( - "9999999999999999999999999999999999999999999999999999999999999999999999999999", -i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), - 0, - ), - ( - "999999999999999999999999999999999999999999999999999999999999999999999999.9999", - i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), - 4, - ), - ( - "99999999999999999999999999999999999999999999999999.99999999999999999999999999", - i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), - 26, - ), - ( - "99999999999999999999999999999999999999999999999999", - i256::from_string("9999999999999999999999999999999999999999999999999900000000000000000000000000").unwrap(), - 26, - ), - ]; - for (s, i, scale) in edge_tests_256 { - let result = parse_decimal::(s, 76, scale); - assert_eq!(i, result.unwrap()); - } - } -} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
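// Editor's sketch, not part of the patch: `parse_decimal` (visible in the removed
// half of this hunk) is unrelated to the interval rework; the removed tests above
// exercise its behaviour — digits are rescaled to the requested scale and rejected
// when they exceed the declared precision. Illustrative only, assuming the function
// is carried over by this rewrite; `Decimal128Type` comes from `arrow_array::types`.
fn parse_decimal_sketch() {
    use arrow_array::types::Decimal128Type;
    use arrow_cast::parse::parse_decimal;

    // Scale 3 pads "123.1" out to three fractional digits: 123_100.
    assert_eq!(
        parse_decimal::<Decimal128Type>("123.1", 20, 3).unwrap(),
        123_100_i128
    );

    // Eight integer digits plus scale 3 exceed precision 10, so parsing overflows.
    assert!(parse_decimal::<Decimal128Type>("12345678", 10, 3).is_err());
}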
+ +use arrow_array::timezone::Tz; +use arrow_array::types::*; +use arrow_array::{ArrowNativeTypeOp, ArrowPrimitiveType}; +use arrow_buffer::ArrowNativeType; +use arrow_schema::ArrowError; +use chrono::prelude::*; +use half::f16; +use std::str::FromStr; + +/// Parse nanoseconds from the first `N` values in digits, subtracting the offset `O` +#[inline] +fn parse_nanos(digits: &[u8]) -> u32 { + digits[..N] + .iter() + .fold(0_u32, |acc, v| acc * 10 + v.wrapping_sub(O) as u32) + * 10_u32.pow((9 - N) as _) +} + +/// Helper for parsing timestamps +struct TimestampParser { + /// The timestamp bytes to parse minus `b'0'` + /// + /// This makes interpretation as an integer inexpensive + digits: [u8; 32], + /// A mask containing a `1` bit where the corresponding byte is a valid ASCII digit + mask: u32, +} + +impl TimestampParser { + fn new(bytes: &[u8]) -> Self { + let mut digits = [0; 32]; + let mut mask = 0; + + // Treating all bytes the same way, helps LLVM vectorise this correctly + for (idx, (o, i)) in digits.iter_mut().zip(bytes).enumerate() { + *o = i.wrapping_sub(b'0'); + mask |= ((*o < 10) as u32) << idx + } + + Self { digits, mask } + } + + /// Returns true if the byte at `idx` in the original string equals `b` + fn test(&self, idx: usize, b: u8) -> bool { + self.digits[idx] == b.wrapping_sub(b'0') + } + + /// Parses a date of the form `1997-01-31` + fn date(&self) -> Option { + if self.mask & 0b1111111111 != 0b1101101111 + || !self.test(4, b'-') + || !self.test(7, b'-') + { + return None; + } + + let year = self.digits[0] as u16 * 1000 + + self.digits[1] as u16 * 100 + + self.digits[2] as u16 * 10 + + self.digits[3] as u16; + + let month = self.digits[5] * 10 + self.digits[6]; + let day = self.digits[8] * 10 + self.digits[9]; + + NaiveDate::from_ymd_opt(year as _, month as _, day as _) + } + + /// Parses a time of any of forms + /// - `09:26:56` + /// - `09:26:56.123` + /// - `09:26:56.123456` + /// - `09:26:56.123456789` + /// - `092656` + /// + /// Returning the end byte offset + fn time(&self) -> Option<(NaiveTime, usize)> { + // Make a NaiveTime handling leap seconds + let time = |hour, min, sec, nano| match sec { + 60 => { + let nano = 1_000_000_000 + nano; + NaiveTime::from_hms_nano_opt(hour as _, min as _, 59, nano) + } + _ => NaiveTime::from_hms_nano_opt(hour as _, min as _, sec as _, nano), + }; + + match (self.mask >> 11) & 0b11111111 { + // 09:26:56 + 0b11011011 if self.test(13, b':') && self.test(16, b':') => { + let hour = self.digits[11] * 10 + self.digits[12]; + let minute = self.digits[14] * 10 + self.digits[15]; + let second = self.digits[17] * 10 + self.digits[18]; + + match self.test(19, b'.') { + true => { + let digits = (self.mask >> 20).trailing_ones(); + let nanos = match digits { + 0 => return None, + 1 => parse_nanos::<1, 0>(&self.digits[20..21]), + 2 => parse_nanos::<2, 0>(&self.digits[20..22]), + 3 => parse_nanos::<3, 0>(&self.digits[20..23]), + 4 => parse_nanos::<4, 0>(&self.digits[20..24]), + 5 => parse_nanos::<5, 0>(&self.digits[20..25]), + 6 => parse_nanos::<6, 0>(&self.digits[20..26]), + 7 => parse_nanos::<7, 0>(&self.digits[20..27]), + 8 => parse_nanos::<8, 0>(&self.digits[20..28]), + _ => parse_nanos::<9, 0>(&self.digits[20..29]), + }; + Some((time(hour, minute, second, nanos)?, 20 + digits as usize)) + } + false => Some((time(hour, minute, second, 0)?, 19)), + } + } + // 092656 + 0b111111 => { + let hour = self.digits[11] * 10 + self.digits[12]; + let minute = self.digits[13] * 10 + self.digits[14]; + let second = self.digits[15] * 10 + self.digits[16]; 
+ let time = time(hour, minute, second, 0)?; + Some((time, 17)) + } + _ => None, + } + } +} + +/// Accepts a string and parses it relative to the provided `timezone` +/// +/// In addition to RFC3339 / ISO8601 standard timestamps, it also +/// accepts strings that use a space ` ` to separate the date and time +/// as well as strings that have no explicit timezone offset. +/// +/// Examples of accepted inputs: +/// * `1997-01-31T09:26:56.123Z` # RCF3339 +/// * `1997-01-31T09:26:56.123-05:00` # RCF3339 +/// * `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a space rather than T +/// * `2023-01-01 04:05:06.789 -08` # close to RCF3339, no fractional seconds or time separator +/// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified +/// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset +/// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds +/// * `1997-01-31 092656` # close to RCF3339, no fractional seconds +/// * `1997-01-31 092656+04:00` # close to RCF3339, no fractional seconds or time separator +/// * `1997-01-31` # close to RCF3339, only date no time +/// +/// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled +/// +/// * `2023-01-01 040506 America/Los_Angeles` +/// +/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error +/// will be returned +/// +/// Some formats supported by PostgresSql +/// are not supported, like +/// +/// * "2023-01-01 04:05:06.789 +07:30:00", +/// * "2023-01-01 040506 +07:30:00", +/// * "2023-01-01 04:05:06.789 PST", +/// +/// [IANA timezones]: https://www.iana.org/time-zones +pub fn string_to_datetime( + timezone: &T, + s: &str, +) -> Result, ArrowError> { + let err = |ctx: &str| { + ArrowError::ParseError(format!("Error parsing timestamp from '{s}': {ctx}")) + }; + + let bytes = s.as_bytes(); + if bytes.len() < 10 { + return Err(err("timestamp must contain at least 10 characters")); + } + + let parser = TimestampParser::new(bytes); + let date = parser.date().ok_or_else(|| err("error parsing date"))?; + if bytes.len() == 10 { + let offset = timezone.offset_from_local_date(&date); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; + + let time = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); + return Ok(DateTime::from_local(date.and_time(time), offset)); + } + + if !parser.test(10, b'T') && !parser.test(10, b't') && !parser.test(10, b' ') { + return Err(err("invalid timestamp separator")); + } + + let (time, mut tz_offset) = parser.time().ok_or_else(|| err("error parsing time"))?; + let datetime = date.and_time(time); + + if tz_offset == 32 { + // Decimal overrun + while tz_offset < bytes.len() && bytes[tz_offset].is_ascii_digit() { + tz_offset += 1; + } + } + + if bytes.len() <= tz_offset { + let offset = timezone.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; + return Ok(DateTime::from_local(datetime, offset)); + } + + if bytes[tz_offset] == b'z' || bytes[tz_offset] == b'Z' { + let offset = timezone.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; + return Ok(DateTime::from_utc(datetime, offset)); + } + + // Parse remainder of string as timezone + let parsed_tz: Tz = s[tz_offset..].trim_start().parse()?; + let offset = parsed_tz.offset_from_local_datetime(&datetime); + let offset = offset + .single() 
+ .ok_or_else(|| err("error computing timezone offset"))?; + Ok(DateTime::::from_local(datetime, offset).with_timezone(timezone)) +} + +/// Accepts a string in RFC3339 / ISO8601 standard format and some +/// variants and converts it to a nanosecond precision timestamp. +/// +/// See [`string_to_datetime`] for the full set of supported formats +/// +/// Implements the `to_timestamp` function to convert a string to a +/// timestamp, following the model of spark SQL’s to_`timestamp`. +/// +/// Internally, this function uses the `chrono` library for the +/// datetime parsing +/// +/// We hope to extend this function in the future with a second +/// parameter to specifying the format string. +/// +/// ## Timestamp Precision +/// +/// Function uses the maximum precision timestamps supported by +/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This +/// means the range of dates that timestamps can represent is ~1677 AD +/// to 2262 AM +/// +/// ## Timezone / Offset Handling +/// +/// Numerical values of timestamps are stored compared to offset UTC. +/// +/// This function interprets string without an explicit time zone as timestamps +/// relative to UTC, see [`string_to_datetime`] for alternative semantics +/// +/// In particular: +/// +/// ``` +/// # use arrow_cast::parse::string_to_timestamp_nanos; +/// // Note all three of these timestamps are parsed as the same value +/// let a = string_to_timestamp_nanos("1997-01-31 09:26:56.123Z").unwrap(); +/// let b = string_to_timestamp_nanos("1997-01-31T09:26:56.123").unwrap(); +/// let c = string_to_timestamp_nanos("1997-01-31T14:26:56.123+05:00").unwrap(); +/// +/// assert_eq!(a, b); +/// assert_eq!(b, c); +/// ``` +/// +#[inline] +pub fn string_to_timestamp_nanos(s: &str) -> Result { + to_timestamp_nanos(string_to_datetime(&Utc, s)?.naive_utc()) +} + +/// Defensive check to prevent chrono-rs panics when nanosecond conversion happens on non-supported dates +#[inline] +fn to_timestamp_nanos(dt: NaiveDateTime) -> Result { + if dt.timestamp().checked_mul(1_000_000_000).is_none() { + return Err(ArrowError::ParseError( + ERR_NANOSECONDS_NOT_SUPPORTED.to_string(), + )); + } + + Ok(dt.timestamp_nanos()) +} + +/// Accepts a string in ISO8601 standard format and some +/// variants and converts it to nanoseconds since midnight. +/// +/// Examples of accepted inputs: +/// * `09:26:56.123 AM` +/// * `23:59:59` +/// * `6:00 pm` +// +/// Internally, this function uses the `chrono` library for the +/// time parsing +/// +/// ## Timezone / Offset Handling +/// +/// This function does not support parsing strings with a timezone +/// or offset specified, as it considers only time since midnight. +pub fn string_to_time_nanoseconds(s: &str) -> Result { + let nt = string_to_time(s).ok_or_else(|| { + ArrowError::ParseError(format!("Failed to parse \'{s}\' as time")) + })?; + Ok(nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64) +} + +fn string_to_time(s: &str) -> Option { + let bytes = s.as_bytes(); + if bytes.len() < 4 { + return None; + } + + let (am, bytes) = match bytes.get(bytes.len() - 3..) { + Some(b" AM" | b" am" | b" Am" | b" aM") => { + (Some(true), &bytes[..bytes.len() - 3]) + } + Some(b" PM" | b" pm" | b" pM" | b" Pm") => { + (Some(false), &bytes[..bytes.len() - 3]) + } + _ => (None, bytes), + }; + + if bytes.len() < 4 { + return None; + } + + let mut digits = [b'0'; 6]; + + // Extract hour + let bytes = match (bytes[1], bytes[2]) { + (b':', _) => { + digits[1] = bytes[0]; + &bytes[2..] 
+ } + (_, b':') => { + digits[0] = bytes[0]; + digits[1] = bytes[1]; + &bytes[3..] + } + _ => return None, + }; + + if bytes.len() < 2 { + return None; // Minutes required + } + + // Extract minutes + digits[2] = bytes[0]; + digits[3] = bytes[1]; + + let nanoseconds = match bytes.get(2) { + Some(b':') => { + if bytes.len() < 5 { + return None; + } + + // Extract seconds + digits[4] = bytes[3]; + digits[5] = bytes[4]; + + // Extract sub-seconds if any + match bytes.get(5) { + Some(b'.') => { + let decimal = &bytes[6..]; + if decimal.iter().any(|x| !x.is_ascii_digit()) { + return None; + } + match decimal.len() { + 0 => return None, + 1 => parse_nanos::<1, b'0'>(decimal), + 2 => parse_nanos::<2, b'0'>(decimal), + 3 => parse_nanos::<3, b'0'>(decimal), + 4 => parse_nanos::<4, b'0'>(decimal), + 5 => parse_nanos::<5, b'0'>(decimal), + 6 => parse_nanos::<6, b'0'>(decimal), + 7 => parse_nanos::<7, b'0'>(decimal), + 8 => parse_nanos::<8, b'0'>(decimal), + _ => parse_nanos::<9, b'0'>(decimal), + } + } + Some(_) => return None, + None => 0, + } + } + Some(_) => return None, + None => 0, + }; + + digits.iter_mut().for_each(|x| *x = x.wrapping_sub(b'0')); + if digits.iter().any(|x| *x > 9) { + return None; + } + + let hour = match (digits[0] * 10 + digits[1], am) { + (12, Some(true)) => 0, // 12:00 AM -> 00:00 + (h @ 1..=11, Some(true)) => h, // 1:00 AM -> 01:00 + (12, Some(false)) => 12, // 12:00 PM -> 12:00 + (h @ 1..=11, Some(false)) => h + 12, // 1:00 PM -> 13:00 + (_, Some(_)) => return None, + (h, None) => h, + }; + + // Handle leap second + let (second, nanoseconds) = match digits[4] * 10 + digits[5] { + 60 => (59, nanoseconds + 1_000_000_000), + s => (s, nanoseconds), + }; + + NaiveTime::from_hms_nano_opt( + hour as _, + (digits[2] * 10 + digits[3]) as _, + second as _, + nanoseconds, + ) +} + +/// Specialized parsing implementations +/// used by csv and json reader +pub trait Parser: ArrowPrimitiveType { + fn parse(string: &str) -> Option; + + fn parse_formatted(string: &str, _format: &str) -> Option { + Self::parse(string) + } +} + +impl Parser for Float16Type { + fn parse(string: &str) -> Option { + lexical_core::parse(string.as_bytes()) + .ok() + .map(f16::from_f32) + } +} + +impl Parser for Float32Type { + fn parse(string: &str) -> Option { + lexical_core::parse(string.as_bytes()).ok() + } +} + +impl Parser for Float64Type { + fn parse(string: &str) -> Option { + lexical_core::parse(string.as_bytes()).ok() + } +} + +macro_rules! 
parser_primitive { + ($t:ty) => { + impl Parser for $t { + fn parse(string: &str) -> Option { + lexical_core::parse::(string.as_bytes()).ok() + } + } + }; +} +parser_primitive!(UInt64Type); +parser_primitive!(UInt32Type); +parser_primitive!(UInt16Type); +parser_primitive!(UInt8Type); +parser_primitive!(Int64Type); +parser_primitive!(Int32Type); +parser_primitive!(Int16Type); +parser_primitive!(Int8Type); + +impl Parser for TimestampNanosecondType { + fn parse(string: &str) -> Option { + string_to_timestamp_nanos(string).ok() + } +} + +impl Parser for TimestampMicrosecondType { + fn parse(string: &str) -> Option { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1000) + } +} + +impl Parser for TimestampMillisecondType { + fn parse(string: &str) -> Option { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1_000_000) + } +} + +impl Parser for TimestampSecondType { + fn parse(string: &str) -> Option { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1_000_000_000) + } +} + +impl Parser for Time64NanosecondType { + // Will truncate any fractions of a nanosecond + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + + nt.nanosecond() as i64, + ) + } +} + +impl Parser for Time64MicrosecondType { + // Will truncate any fractions of a microsecond + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .map(|nanos| nanos / 1_000) + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i64 * 1_000_000 + + nt.nanosecond() as i64 / 1_000, + ) + } +} + +impl Parser for Time32MillisecondType { + // Will truncate any fractions of a millisecond + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .map(|nanos| (nanos / 1_000_000) as i32) + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i32 * 1_000 + + nt.nanosecond() as i32 / 1_000_000, + ) + } +} + +impl Parser for Time32SecondType { + // Will truncate any fractions of a second + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .map(|nanos| (nanos / 1_000_000_000) as i32) + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i32 + + nt.nanosecond() as i32 / 1_000_000_000, + ) + } +} + +/// Number of days between 0001-01-01 and 1970-01-01 +const EPOCH_DAYS_FROM_CE: i32 = 719_163; + +/// Error message if nanosecond conversion request beyond supported interval +const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; + +impl Parser for Date32Type { + fn parse(string: &str) -> Option { + let parser = TimestampParser::new(string.as_bytes()); + let date = parser.date()?; + Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + } + + fn parse_formatted(string: &str, format: &str) 
-> Option { + let date = NaiveDate::parse_from_str(string, format).ok()?; + Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + } +} + +impl Parser for Date64Type { + fn parse(string: &str) -> Option { + let date_time = string_to_datetime(&Utc, string).ok()?; + Some(date_time.timestamp_millis()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + use chrono::format::Fixed; + use chrono::format::StrftimeItems; + let fmt = StrftimeItems::new(format); + let has_zone = fmt.into_iter().any(|item| match item { + chrono::format::Item::Fixed(fixed_item) => matches!( + fixed_item, + Fixed::RFC2822 + | Fixed::RFC3339 + | Fixed::TimezoneName + | Fixed::TimezoneOffsetColon + | Fixed::TimezoneOffsetColonZ + | Fixed::TimezoneOffset + | Fixed::TimezoneOffsetZ + ), + _ => false, + }); + if has_zone { + let date_time = chrono::DateTime::parse_from_str(string, format).ok()?; + Some(date_time.timestamp_millis()) + } else { + let date_time = NaiveDateTime::parse_from_str(string, format).ok()?; + Some(date_time.timestamp_millis()) + } + } +} + +/// Parse the string format decimal value to i128/i256 format and checking the precision and scale. +/// The result value can't be out of bounds. +pub fn parse_decimal( + s: &str, + precision: u8, + scale: i8, +) -> Result { + let mut result = T::Native::usize_as(0); + let mut fractionals = 0; + let mut digits = 0; + let base = T::Native::usize_as(10); + + let bs = s.as_bytes(); + let (bs, negative) = match bs.first() { + Some(b'-') => (&bs[1..], true), + Some(b'+') => (&bs[1..], false), + _ => (bs, false), + }; + + if bs.is_empty() { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + + let mut bs = bs.iter(); + // Overflow checks are not required if 10^(precision - 1) <= T::MAX holds. + // Thus, if we validate the precision correctly, we can skip overflow checks. + while let Some(b) = bs.next() { + match b { + b'0'..=b'9' => { + if digits == 0 && *b == b'0' { + // Ignore leading zeros. + continue; + } + digits += 1; + result = result.mul_wrapping(base); + result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); + } + b'.' => { + for b in bs.by_ref() { + if !b.is_ascii_digit() { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + if fractionals == scale { + // We have processed all the digits that we need. All that + // is left is to validate that the rest of the string contains + // valid digits. + continue; + } + fractionals += 1; + digits += 1; + result = result.mul_wrapping(base); + result = + result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); + } + + // Fail on "." 
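+ // (reaching this point with no digits consumed means the input contained only a
+ // sign and/or a decimal point, e.g. "." or "-.", so it is rejected)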
+ if digits == 0 { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + } + _ => { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + } + } + + if fractionals < scale { + let exp = scale - fractionals; + if exp as u8 + digits > precision { + return Err(ArrowError::ParseError("parse decimal overflow".to_string())); + } + let mul = base.pow_wrapping(exp as _); + result = result.mul_wrapping(mul); + } else if digits > precision { + return Err(ArrowError::ParseError("parse decimal overflow".to_string())); + } + + Ok(if negative { + result.neg_wrapping() + } else { + result + }) +} + +pub fn parse_interval_year_month( + value: &str, +) -> Result<::Native, ArrowError> { + let config = IntervalParseConfig::new(IntervalUnit::Year); + let interval = Interval::parse(value, &config)?; + + let months = interval.to_year_months().map_err(|_| ArrowError::CastError(format!( + "Cannot cast {value} to IntervalYearMonth. Only year and month fields are allowed." + )))?; + + Ok(IntervalYearMonthType::make_value(0, months)) +} + +pub fn parse_interval_day_time( + value: &str, +) -> Result<::Native, ArrowError> { + let config = IntervalParseConfig::new(IntervalUnit::Day); + let interval = Interval::parse(value, &config)?; + + let (days, millis) = interval.to_day_time().map_err(|_| ArrowError::CastError(format!( + "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds" + )))?; + + Ok(IntervalDayTimeType::make_value(days, millis)) +} + +pub fn parse_interval_month_day_nano( + value: &str, +) -> Result<::Native, ArrowError> { + let config = IntervalParseConfig::new(IntervalUnit::Month); + let interval = Interval::parse(value, &config)?; + + let (months, days, nanos) = interval.to_month_day_nanos(); + + Ok(IntervalMonthDayNanoType::make_value(months, days, nanos)) +} + +const NANOS_PER_MILLIS: i64 = 1_000_000; +const NANOS_PER_SECOND: i64 = 1_000 * NANOS_PER_MILLIS; +const NANOS_PER_MINUTE: i64 = 60 * NANOS_PER_SECOND; +const NANOS_PER_HOUR: i64 = 60 * NANOS_PER_MINUTE; +#[cfg(test)] +const NANOS_PER_DAY: i64 = 24 * NANOS_PER_HOUR; + +#[rustfmt::skip] +#[derive(Clone, Copy)] +#[repr(u16)] +enum IntervalUnit { + Century = 0b_0000_0000_0001, + Decade = 0b_0000_0000_0010, + Year = 0b_0000_0000_0100, + Month = 0b_0000_0000_1000, + Week = 0b_0000_0001_0000, + Day = 0b_0000_0010_0000, + Hour = 0b_0000_0100_0000, + Minute = 0b_0000_1000_0000, + Second = 0b_0001_0000_0000, + Millisecond = 0b_0010_0000_0000, + Microsecond = 0b_0100_0000_0000, + Nanosecond = 0b_1000_0000_0000, +} + +impl FromStr for IntervalUnit { + type Err = ArrowError; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "century" | "centuries" => Ok(Self::Century), + "decade" | "decades" => Ok(Self::Decade), + "year" | "years" => Ok(Self::Year), + "month" | "months" => Ok(Self::Month), + "week" | "weeks" => Ok(Self::Week), + "day" | "days" => Ok(Self::Day), + "hour" | "hours" => Ok(Self::Hour), + "minute" | "minutes" => Ok(Self::Minute), + "second" | "seconds" => Ok(Self::Second), + "millisecond" | "milliseconds" => Ok(Self::Millisecond), + "microsecond" | "microseconds" => Ok(Self::Microsecond), + "nanosecond" | "nanoseconds" => Ok(Self::Nanosecond), + _ => Err(ArrowError::NotYetImplemented(format!( + "Unknown interval type: {s}" + ))), + } + } +} + +pub type MonthDayNano = (i32, i32, i64); + +/// Chosen based on the number of decimal digits in 1 week in nanoseconds +const 
INTERVAL_PRECISION: u32 = 15; + +#[derive(Clone, Copy, Debug, PartialEq)] +struct IntervalAmount { + /// The integer component of the interval amount + integer: i64, + /// The fractional component multiplied by 10^INTERVAL_PRECISION + frac: i64, +} + +#[cfg(test)] +impl IntervalAmount { + fn new(integer: i64, frac: i64) -> Self { + Self { integer, frac } + } +} + +impl FromStr for IntervalAmount { + type Err = ArrowError; + + fn from_str(s: &str) -> Result { + match s.split_once('.') { + Some((integer, frac)) + if frac.len() <= INTERVAL_PRECISION as usize + && !integer.is_empty() + && !frac.is_empty() + && !frac.starts_with('-') => + { + let integer = integer.parse::().map_err(|_| { + ArrowError::ParseError(format!( + "Failed to parse {s} as interval amount" + )) + })?; + + let frac_unscaled = frac.parse::().map_err(|_| { + ArrowError::ParseError(format!( + "Failed to parse {s} as interval amount" + )) + })?; + + // scale fractional part by interval precision + let frac = + frac_unscaled * 10_i64.pow(INTERVAL_PRECISION - frac.len() as u32); + + // propagate the sign of the integer part to the fractional part + let frac = if integer < 0 { -frac } else { frac }; + + let result = Self { integer, frac }; + + Ok(result) + } + Some((_, frac)) if frac.starts_with('-') => Err(ArrowError::ParseError( + format!("Failed to parse {s} as interval amount"), + )), + Some((_, frac)) if frac.len() > INTERVAL_PRECISION as usize => { + Err(ArrowError::ParseError(format!( + "{s} exceeds the precision available for interval amount" + ))) + } + Some(_) | None => { + let integer = s.parse::().map_err(|_| { + ArrowError::ParseError(format!( + "Failed to parse {s} as interval amount" + )) + })?; + + let result = Self { integer, frac: 0 }; + Ok(result) + } + } + } +} + +#[derive(Debug, Default, PartialEq)] +struct Interval { + months: i32, + days: i32, + nanos: i64, +} + +impl Interval { + fn new(months: i32, days: i32, nanos: i64) -> Self { + Self { + months, + days, + nanos, + } + } + + fn to_year_months(&self) -> Result { + match (self.months, self.days, self.nanos) { + (months, days, nanos) if days == 0 && nanos == 0 => Ok(months), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Unable to represent interval with days and nanos as year-months: {:?}", + self + ))), + } + } + + fn to_day_time(&self) -> Result<(i32, i32), ArrowError> { + let days = self.months.mul_checked(30)?.add_checked(self.days)?; + + match self.nanos { + nanos if nanos % NANOS_PER_MILLIS == 0 => { + let millis = (self.nanos / 1_000_000).try_into().map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "Unable to represent {} nanos as milliseconds in a signed 32-bit integer", + self.nanos + )) + })?; + + Ok((days, millis)) + } + nanos => Err(ArrowError::InvalidArgumentError(format!( + "Unable to represent {nanos} as milliseconds" + ))), + } + } + + fn to_month_day_nanos(&self) -> (i32, i32, i64) { + (self.months, self.days, self.nanos) + } + + /// Parse string value in traditional Postgres format (e.g. 1 year 2 months 3 days 4 hours 5 minutes 6 seconds) + fn parse(value: &str, config: &IntervalParseConfig) -> Result { + let components = parse_interval_components(value, config)?; + + let result = components.into_iter().fold( + Ok(Self::default()), + |result, (amount, unit)| match result { + Ok(result) => result.add(amount, unit), + Err(e) => Err(e), + }, + )?; + + Ok(result) + } + + /// Interval addition following Postgres behavior. Fractional units will be spilled into smaller units. 
+ /// When the interval unit is larger than months, the result is rounded to total months and not spilled to days/nanos. + /// Fractional parts of weeks and days are represented using days and nanoseconds. + /// e.g. INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days + /// e.g. INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours + /// [Postgres reference](https://www.postgresql.org/docs/15/datatype-datetime.html#DATATYPE-INTERVAL-INPUT:~:text=Field%20values%20can,fractional%20on%20output.) + fn add( + &self, + amount: IntervalAmount, + unit: IntervalUnit, + ) -> Result { + let result = match unit { + IntervalUnit::Century => { + let months_int = amount.integer.mul_checked(100)?.mul_checked(12)?; + let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 2); + let months = + months_int + .add_checked(month_frac)? + .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} centuries as months in a signed 32-bit integer", + &amount.integer + )) + })?; + + Self::new(self.months.add_checked(months)?, self.days, self.nanos) + } + IntervalUnit::Decade => { + let months_int = amount.integer.mul_checked(10)?.mul_checked(12)?; + + let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 1); + let months = + months_int + .add_checked(month_frac)? + .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} decades as months in a signed 32-bit integer", + &amount.integer + )) + })?; + + Self::new(self.months.add_checked(months)?, self.days, self.nanos) + } + IntervalUnit::Year => { + let months_int = amount.integer.mul_checked(12)?; + let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION); + let months = + months_int + .add_checked(month_frac)? 
+ .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} years as months in a signed 32-bit integer", + &amount.integer + )) + })?; + + Self::new(self.months.add_checked(months)?, self.days, self.nanos) + } + IntervalUnit::Month => { + let months = amount.integer.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} months in a signed 32-bit integer", + &amount.integer + )) + })?; + + let days = amount.frac * 3 / 10_i64.pow(INTERVAL_PRECISION - 1); + let days = days.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} months as days in a signed 32-bit integer", + amount.frac / 10_i64.pow(INTERVAL_PRECISION) + )) + })?; + + Self::new( + self.months.add_checked(months)?, + self.days.add_checked(days)?, + self.nanos, + ) + } + IntervalUnit::Week => { + let days = amount.integer.mul_checked(7)?.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} weeks as days in a signed 32-bit integer", + &amount.integer + )) + })?; + + let nanos = + amount.frac * 7 * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + + Self::new( + self.months, + self.days.add_checked(days)?, + self.nanos.add_checked(nanos)?, + ) + } + IntervalUnit::Day => { + let days = amount.integer.try_into().map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "Unable to represent {} days in a signed 32-bit integer", + amount.integer + )) + })?; + + let nanos = + amount.frac * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + + Self::new( + self.months, + self.days.add_checked(days)?, + self.nanos.add_checked(nanos)?, + ) + } + IntervalUnit::Hour => { + let nanos_int = amount.integer.mul_checked(NANOS_PER_HOUR)?; + let nanos_frac = + amount.frac * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Minute => { + let nanos_int = amount.integer.mul_checked(NANOS_PER_MINUTE)?; + let nanos_frac = amount.frac * 6 / 10_i64.pow(INTERVAL_PRECISION - 10); + + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Second => { + let nanos_int = amount.integer.mul_checked(NANOS_PER_SECOND)?; + let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 9); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Millisecond => { + let nanos_int = amount.integer.mul_checked(NANOS_PER_MILLIS)?; + let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 6); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Microsecond => { + let nanos_int = amount.integer.mul_checked(1_000)?; + let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 3); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Nanosecond => { + let nanos_int = amount.integer; + let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + }; + + Ok(result) + } +} + +struct IntervalParseConfig { + /// The default unit to use if none is specified + /// e.g. 
`INTERVAL 1` represents `INTERVAL 1 SECOND` when default_unit = IntervalType::Second + default_unit: IntervalUnit, +} + +impl IntervalParseConfig { + fn new(default_unit: IntervalUnit) -> Self { + Self { default_unit } + } +} + +/// parse the string into a vector of interval components i.e. (amount, unit) tuples +fn parse_interval_components( + value: &str, + config: &IntervalParseConfig, +) -> Result, ArrowError> { + let parts = value.split_whitespace(); + + let raw_amounts = parts.clone().step_by(2); + let raw_units = parts.skip(1).step_by(2); + + // parse amounts + let (amounts, invalid_amounts) = raw_amounts + .map(IntervalAmount::from_str) + .partition::, _>(Result::is_ok); + + // invalid amounts? + if !invalid_amounts.is_empty() { + return Err(ArrowError::NotYetImplemented(format!( + "Unsupported Interval Expression with value {value:?}" + ))); + } + + // parse units + let (units, invalid_units): (Vec<_>, Vec<_>) = raw_units + .clone() + .map(IntervalUnit::from_str) + .partition(Result::is_ok); + + // invalid units? + if !invalid_units.is_empty() { + return Err(ArrowError::ParseError(format!( + "Invalid input syntax for type interval: {value:?}" + ))); + } + + // collect parsed results + let amounts = amounts.into_iter().map(Result::unwrap).collect::>(); + let units = units.into_iter().map(Result::unwrap).collect::>(); + + // if only an amount is specified, use the default unit + if amounts.len() == 1 && units.is_empty() { + return Ok(vec![(amounts[0], config.default_unit)]); + }; + + // duplicate units? + let mut observed_interval_types = 0; + for (unit, raw_unit) in units.iter().zip(raw_units) { + if observed_interval_types & (*unit as u16) != 0 { + return Err(ArrowError::ParseError(format!( + "Invalid input syntax for type interval: {value:?}. 
Repeated type '{raw_unit}'", + ))); + } + + observed_interval_types |= *unit as u16; + } + + let result = amounts.iter().copied().zip(units.iter().copied()); + + Ok(result.collect::>()) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::timezone::Tz; + use arrow_buffer::i256; + + #[test] + fn test_parse_nanos() { + assert_eq!(parse_nanos::<3, 0>(&[1, 2, 3]), 123_000_000); + assert_eq!(parse_nanos::<5, 0>(&[1, 2, 3, 4, 5]), 123_450_000); + assert_eq!(parse_nanos::<6, b'0'>(b"123456"), 123_456_000); + } + + #[test] + fn string_to_timestamp_timezone() { + // Explicit timezone + assert_eq!( + 1599572549190855000, + parse_timestamp("2020-09-08T13:42:29.190855+00:00").unwrap() + ); + assert_eq!( + 1599572549190855000, + parse_timestamp("2020-09-08T13:42:29.190855Z").unwrap() + ); + assert_eq!( + 1599572549000000000, + parse_timestamp("2020-09-08T13:42:29Z").unwrap() + ); // no fractional part + assert_eq!( + 1599590549190855000, + parse_timestamp("2020-09-08T13:42:29.190855-05:00").unwrap() + ); + } + + #[test] + fn string_to_timestamp_timezone_space() { + // Ensure space rather than T between time and date is accepted + assert_eq!( + 1599572549190855000, + parse_timestamp("2020-09-08 13:42:29.190855+00:00").unwrap() + ); + assert_eq!( + 1599572549190855000, + parse_timestamp("2020-09-08 13:42:29.190855Z").unwrap() + ); + assert_eq!( + 1599572549000000000, + parse_timestamp("2020-09-08 13:42:29Z").unwrap() + ); // no fractional part + assert_eq!( + 1599590549190855000, + parse_timestamp("2020-09-08 13:42:29.190855-05:00").unwrap() + ); + } + + #[test] + #[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function: mktime + fn string_to_timestamp_no_timezone() { + // This test is designed to succeed in regardless of the local + // timezone the test machine is running. 
Thus it is still + // somewhat susceptible to bugs in the use of chrono + let naive_datetime = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29.190855").unwrap() + ); + + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29.190855").unwrap() + ); + + // Also ensure that parsing timestamps with no fractional + // second part works as well + let naive_datetime_whole_secs = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_opt(13, 42, 29).unwrap(), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime_whole_secs.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29").unwrap() + ); + + assert_eq!( + naive_datetime_whole_secs.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29").unwrap() + ); + + // ensure without time work + // no time, should be the nano second at + // 2020-09-08 0:0:0 + let naive_datetime_no_time = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_opt(0, 0, 0).unwrap(), + ); + + assert_eq!( + naive_datetime_no_time.timestamp_nanos(), + parse_timestamp("2020-09-08").unwrap() + ) + } + + #[test] + fn string_to_timestamp_chrono() { + let cases = [ + "2020-09-08T13:42:29Z", + "1969-01-01T00:00:00.1Z", + "2020-09-08T12:00:12.12345678+00:00", + "2020-09-08T12:00:12+00:00", + "2020-09-08T12:00:12.1+00:00", + "2020-09-08T12:00:12.12+00:00", + "2020-09-08T12:00:12.123+00:00", + "2020-09-08T12:00:12.1234+00:00", + "2020-09-08T12:00:12.12345+00:00", + "2020-09-08T12:00:12.123456+00:00", + "2020-09-08T12:00:12.1234567+00:00", + "2020-09-08T12:00:12.12345678+00:00", + "2020-09-08T12:00:12.123456789+00:00", + "2020-09-08T12:00:12.12345678912z", + "2020-09-08T12:00:12.123456789123Z", + "2020-09-08T12:00:12.123456789123+02:00", + "2020-09-08T12:00:12.12345678912345Z", + "2020-09-08T12:00:12.1234567891234567+02:00", + "2020-09-08T12:00:60Z", + "2020-09-08T12:00:60.123Z", + "2020-09-08T12:00:60.123456+02:00", + "2020-09-08T12:00:60.1234567891234567+02:00", + "2020-09-08T12:00:60.999999999+02:00", + "2020-09-08t12:00:12.12345678+00:00", + "2020-09-08t12:00:12+00:00", + "2020-09-08t12:00:12Z", + ]; + + for case in cases { + let chrono = DateTime::parse_from_rfc3339(case).unwrap(); + let chrono_utc = chrono.with_timezone(&Utc); + + let custom = string_to_datetime(&Utc, case).unwrap(); + assert_eq!(chrono_utc, custom) + } + } + + #[test] + fn string_to_timestamp_naive() { + let cases = [ + "2018-11-13T17:11:10.011375885995", + "2030-12-04T17:11:10.123", + "2030-12-04T17:11:10.1234", + "2030-12-04T17:11:10.123456", + ]; + for case in cases { + let chrono = + NaiveDateTime::parse_from_str(case, "%Y-%m-%dT%H:%M:%S%.f").unwrap(); + let custom = string_to_datetime(&Utc, case).unwrap(); + assert_eq!(chrono, custom.naive_utc()) + } + } + + #[test] + fn string_to_timestamp_invalid() { + // Test parsing invalid formats + let cases = [ + ("", "timestamp must contain at least 10 characters"), + ("SS", "timestamp must contain at least 10 characters"), + ("Wed, 18 Feb 2015 23:16:09 GMT", "error parsing date"), + ("1997-01-31H09:26:56.123Z", "invalid timestamp separator"), + ("1997-01-31 09:26:56.123Z", "error parsing time"), + ("1997:01:31T09:26:56.123Z", "error parsing date"), + ("1997:1:31T09:26:56.123Z", "error parsing date"), + 
("1997-01-32T09:26:56.123Z", "error parsing date"), + ("1997-13-32T09:26:56.123Z", "error parsing date"), + ("1997-02-29T09:26:56.123Z", "error parsing date"), + ("2015-02-30T17:35:20-08:00", "error parsing date"), + ("1997-01-10T9:26:56.123Z", "error parsing time"), + ("2015-01-20T25:35:20-08:00", "error parsing time"), + ("1997-01-10T09:61:56.123Z", "error parsing time"), + ("1997-01-10T09:61:90.123Z", "error parsing time"), + ("1997-01-10T12:00:6.123Z", "error parsing time"), + ("1997-01-31T092656.123Z", "error parsing time"), + ("1997-01-10T12:00:06.", "error parsing time"), + ("1997-01-10T12:00:06. ", "error parsing time"), + ]; + + for (s, ctx) in cases { + let expected = + format!("Parser error: Error parsing timestamp from '{s}': {ctx}"); + let actual = string_to_datetime(&Utc, s).unwrap_err().to_string(); + assert_eq!(actual, expected) + } + } + + // Parse a timestamp to timestamp int with a useful human readable error message + fn parse_timestamp(s: &str) -> Result { + let result = string_to_timestamp_nanos(s); + if let Err(e) = &result { + eprintln!("Error parsing timestamp '{s}': {e:?}"); + } + result + } + + #[test] + fn string_without_timezone_to_timestamp() { + // string without timezone should always output the same regardless the local or session timezone + + let naive_datetime = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29.190855").unwrap() + ); + + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29.190855").unwrap() + ); + + let naive_datetime = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_nano_opt(13, 42, 29, 0).unwrap(), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29").unwrap() + ); + + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29").unwrap() + ); + + let tz: Tz = "+02:00".parse().unwrap(); + let date = string_to_datetime(&tz, "2020-09-08 13:42:29").unwrap(); + let utc = date.naive_utc().to_string(); + assert_eq!(utc, "2020-09-08 11:42:29"); + let local = date.naive_local().to_string(); + assert_eq!(local, "2020-09-08 13:42:29"); + + let date = string_to_datetime(&tz, "2020-09-08 13:42:29Z").unwrap(); + let utc = date.naive_utc().to_string(); + assert_eq!(utc, "2020-09-08 13:42:29"); + let local = date.naive_local().to_string(); + assert_eq!(local, "2020-09-08 15:42:29"); + + let dt = + NaiveDateTime::parse_from_str("2020-09-08T13:42:29Z", "%Y-%m-%dT%H:%M:%SZ") + .unwrap(); + let local: Tz = "+08:00".parse().unwrap(); + + // Parsed as offset from UTC + let date = string_to_datetime(&local, "2020-09-08T13:42:29Z").unwrap(); + assert_eq!(dt, date.naive_utc()); + assert_ne!(dt, date.naive_local()); + + // Parsed as offset from local + let date = string_to_datetime(&local, "2020-09-08 13:42:29").unwrap(); + assert_eq!(dt, date.naive_local()); + assert_ne!(dt, date.naive_utc()); + } + + #[test] + fn parse_time64_nanos() { + assert_eq!( + Time64NanosecondType::parse("02:10:01.1234567899999999"), + Some(7_801_123_456_789) + ); + assert_eq!( + Time64NanosecondType::parse("02:10:01.1234567"), + Some(7_801_123_456_700) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01.1234567"), + Some(7_801_123_456_700) + ); + assert_eq!( + 
Time64NanosecondType::parse("12:10:01.123456789 AM"), + Some(601_123_456_789) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01.123456789 am"), + Some(601_123_456_789) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01.12345678 PM"), + Some(51_001_123_456_780) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01.12345678 pm"), + Some(51_001_123_456_780) + ); + assert_eq!( + Time64NanosecondType::parse("02:10:01"), + Some(7_801_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01"), + Some(7_801_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01 AM"), + Some(601_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01 am"), + Some(601_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01 PM"), + Some(51_001_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01 pm"), + Some(51_001_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("02:10"), + Some(7_800_000_000_000) + ); + assert_eq!(Time64NanosecondType::parse("2:10"), Some(7_800_000_000_000)); + assert_eq!( + Time64NanosecondType::parse("12:10 AM"), + Some(600_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("12:10 am"), + Some(600_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10 PM"), + Some(51_000_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10 pm"), + Some(51_000_000_000_000) + ); + + // parse directly as nanoseconds + assert_eq!(Time64NanosecondType::parse("1"), Some(1)); + + // leap second + assert_eq!( + Time64NanosecondType::parse("23:59:60"), + Some(86_400_000_000_000) + ); + + // custom format + assert_eq!( + Time64NanosecondType::parse_formatted( + "02 - 10 - 01 - .1234567", + "%H - %M - %S - %.f" + ), + Some(7_801_123_456_700) + ); + } + + #[test] + fn parse_time64_micros() { + // expected formats + assert_eq!( + Time64MicrosecondType::parse("02:10:01.1234"), + Some(7_801_123_400) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01.1234"), + Some(7_801_123_400) + ); + assert_eq!( + Time64MicrosecondType::parse("12:10:01.123456 AM"), + Some(601_123_456) + ); + assert_eq!( + Time64MicrosecondType::parse("12:10:01.123456 am"), + Some(601_123_456) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01.12345 PM"), + Some(51_001_123_450) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01.12345 pm"), + Some(51_001_123_450) + ); + assert_eq!( + Time64MicrosecondType::parse("02:10:01"), + Some(7_801_000_000) + ); + assert_eq!(Time64MicrosecondType::parse("2:10:01"), Some(7_801_000_000)); + assert_eq!( + Time64MicrosecondType::parse("12:10:01 AM"), + Some(601_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("12:10:01 am"), + Some(601_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01 PM"), + Some(51_001_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01 pm"), + Some(51_001_000_000) + ); + assert_eq!(Time64MicrosecondType::parse("02:10"), Some(7_800_000_000)); + assert_eq!(Time64MicrosecondType::parse("2:10"), Some(7_800_000_000)); + assert_eq!(Time64MicrosecondType::parse("12:10 AM"), Some(600_000_000)); + assert_eq!(Time64MicrosecondType::parse("12:10 am"), Some(600_000_000)); + assert_eq!( + Time64MicrosecondType::parse("2:10 PM"), + Some(51_000_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10 pm"), + Some(51_000_000_000) + ); + + // parse directly as microseconds + assert_eq!(Time64MicrosecondType::parse("1"), Some(1)); + + // leap second + assert_eq!( + 
Time64MicrosecondType::parse("23:59:60"), + Some(86_400_000_000) + ); + + // custom format + assert_eq!( + Time64MicrosecondType::parse_formatted( + "02 - 10 - 01 - .1234", + "%H - %M - %S - %.f" + ), + Some(7_801_123_400) + ); + } + + #[test] + fn parse_time32_millis() { + // expected formats + assert_eq!(Time32MillisecondType::parse("02:10:01.1"), Some(7_801_100)); + assert_eq!(Time32MillisecondType::parse("2:10:01.1"), Some(7_801_100)); + assert_eq!( + Time32MillisecondType::parse("12:10:01.123 AM"), + Some(601_123) + ); + assert_eq!( + Time32MillisecondType::parse("12:10:01.123 am"), + Some(601_123) + ); + assert_eq!( + Time32MillisecondType::parse("2:10:01.12 PM"), + Some(51_001_120) + ); + assert_eq!( + Time32MillisecondType::parse("2:10:01.12 pm"), + Some(51_001_120) + ); + assert_eq!(Time32MillisecondType::parse("02:10:01"), Some(7_801_000)); + assert_eq!(Time32MillisecondType::parse("2:10:01"), Some(7_801_000)); + assert_eq!(Time32MillisecondType::parse("12:10:01 AM"), Some(601_000)); + assert_eq!(Time32MillisecondType::parse("12:10:01 am"), Some(601_000)); + assert_eq!(Time32MillisecondType::parse("2:10:01 PM"), Some(51_001_000)); + assert_eq!(Time32MillisecondType::parse("2:10:01 pm"), Some(51_001_000)); + assert_eq!(Time32MillisecondType::parse("02:10"), Some(7_800_000)); + assert_eq!(Time32MillisecondType::parse("2:10"), Some(7_800_000)); + assert_eq!(Time32MillisecondType::parse("12:10 AM"), Some(600_000)); + assert_eq!(Time32MillisecondType::parse("12:10 am"), Some(600_000)); + assert_eq!(Time32MillisecondType::parse("2:10 PM"), Some(51_000_000)); + assert_eq!(Time32MillisecondType::parse("2:10 pm"), Some(51_000_000)); + + // parse directly as milliseconds + assert_eq!(Time32MillisecondType::parse("1"), Some(1)); + + // leap second + assert_eq!(Time32MillisecondType::parse("23:59:60"), Some(86_400_000)); + + // custom format + assert_eq!( + Time32MillisecondType::parse_formatted( + "02 - 10 - 01 - .1", + "%H - %M - %S - %.f" + ), + Some(7_801_100) + ); + } + + #[test] + fn parse_time32_secs() { + // expected formats + assert_eq!(Time32SecondType::parse("02:10:01.1"), Some(7_801)); + assert_eq!(Time32SecondType::parse("02:10:01"), Some(7_801)); + assert_eq!(Time32SecondType::parse("2:10:01"), Some(7_801)); + assert_eq!(Time32SecondType::parse("12:10:01 AM"), Some(601)); + assert_eq!(Time32SecondType::parse("12:10:01 am"), Some(601)); + assert_eq!(Time32SecondType::parse("2:10:01 PM"), Some(51_001)); + assert_eq!(Time32SecondType::parse("2:10:01 pm"), Some(51_001)); + assert_eq!(Time32SecondType::parse("02:10"), Some(7_800)); + assert_eq!(Time32SecondType::parse("2:10"), Some(7_800)); + assert_eq!(Time32SecondType::parse("12:10 AM"), Some(600)); + assert_eq!(Time32SecondType::parse("12:10 am"), Some(600)); + assert_eq!(Time32SecondType::parse("2:10 PM"), Some(51_000)); + assert_eq!(Time32SecondType::parse("2:10 pm"), Some(51_000)); + + // parse directly as seconds + assert_eq!(Time32SecondType::parse("1"), Some(1)); + + // leap second + assert_eq!(Time32SecondType::parse("23:59:60"), Some(86400)); + + // custom format + assert_eq!( + Time32SecondType::parse_formatted("02 - 10 - 01", "%H - %M - %S"), + Some(7_801) + ); + } + + #[test] + fn test_string_to_time_invalid() { + let cases = [ + "25:00", + "9:00:", + "009:00", + "09:0:00", + "25:00:00", + "13:00 AM", + "13:00 PM", + "12:00. 
AM", + "09:0:00", + "09:01:0", + "09:01:1", + "9:1:0", + "09:01:0", + "1:00.123", + "1:00:00.123f", + " 9:00:00", + ":09:00", + "T9:00:00", + "AM", + ]; + for case in cases { + assert!(string_to_time(case).is_none(), "{case}"); + } + } + + #[test] + fn test_string_to_time_chrono() { + let cases = [ + ("1:00", "%H:%M"), + ("12:00", "%H:%M"), + ("13:00", "%H:%M"), + ("24:00", "%H:%M"), + ("1:00:00", "%H:%M:%S"), + ("12:00:30", "%H:%M:%S"), + ("13:00:59", "%H:%M:%S"), + ("24:00:60", "%H:%M:%S"), + ("09:00:00", "%H:%M:%S%.f"), + ("0:00:30.123456", "%H:%M:%S%.f"), + ("0:00 AM", "%I:%M %P"), + ("1:00 AM", "%I:%M %P"), + ("12:00 AM", "%I:%M %P"), + ("13:00 AM", "%I:%M %P"), + ("0:00 PM", "%I:%M %P"), + ("1:00 PM", "%I:%M %P"), + ("12:00 PM", "%I:%M %P"), + ("13:00 PM", "%I:%M %P"), + ("1:00 pM", "%I:%M %P"), + ("1:00 Pm", "%I:%M %P"), + ("1:00 aM", "%I:%M %P"), + ("1:00 Am", "%I:%M %P"), + ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456789 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456789123 PM", "%I:%M:%S%.f %P"), + ("1:00:30.1234 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456789123456789 PM", "%I:%M:%S%.f %P"), + ("1:00:30.12F456 PM", "%I:%M:%S%.f %P"), + ]; + for (s, format) in cases { + let chrono = NaiveTime::parse_from_str(s, format).ok(); + let custom = string_to_time(s); + assert_eq!(chrono, custom, "{s}"); + } + } + + #[test] + fn test_parse_interval() { + let config = IntervalParseConfig::new(IntervalUnit::Month); + + assert_eq!( + Interval::new(1i32, 0i32, 0i64), + Interval::parse("1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(2i32, 0i32, 0i64), + Interval::parse("2 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(-1i32, -18i32, -(NANOS_PER_DAY / 5)), + Interval::parse("-1.5 months -3.2 days", &config).unwrap(), + ); + + assert_eq!( + Interval::new(2i32, 10i32, 9 * NANOS_PER_HOUR), + Interval::parse("2.1 months 7.25 days 3 hours", &config).unwrap(), + ); + + assert_eq!( + Interval::parse("1 centurys 1 month", &config) + .unwrap_err() + .to_string(), + r#"Parser error: Invalid input syntax for type interval: "1 centurys 1 month""# + ); + + assert_eq!( + Interval::new(37i32, 0i32, 0i64), + Interval::parse("3 year 1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(35i32, 0i32, 0i64), + Interval::parse("3 year -1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(-37i32, 0i32, 0i64), + Interval::parse("-3 year -1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(-35i32, 0i32, 0i64), + Interval::parse("-3 year 1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 5i32, 0i64), + Interval::parse("5 days", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 7i32, 3 * NANOS_PER_HOUR), + Interval::parse("7 days 3 hours", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 7i32, 5 * NANOS_PER_MINUTE), + Interval::parse("7 days 5 minutes", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 7i32, -5 * NANOS_PER_MINUTE), + Interval::parse("7 days -5 minutes", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, -7i32, 5 * NANOS_PER_HOUR), + Interval::parse("-7 days 5 hours", &config).unwrap(), + ); + + assert_eq!( + Interval::new( + 0i32, + -7i32, + -5 * NANOS_PER_HOUR - 5 * NANOS_PER_MINUTE - 5 * NANOS_PER_SECOND + ), + Interval::parse("-7 days -5 hours -5 minutes -5 seconds", &config).unwrap(), + ); + + assert_eq!( + Interval::new(12i32, 0i32, 25 * NANOS_PER_MILLIS), + Interval::parse("1 year 25 millisecond", 
&config).unwrap(), + ); + + assert_eq!( + Interval::new( + 12i32, + 1i32, + (NANOS_PER_SECOND as f64 * 0.000000001_f64) as i64 + ), + Interval::parse("1 year 1 day 0.000000001 seconds", &config).unwrap(), + ); + + assert_eq!( + Interval::new(12i32, 1i32, NANOS_PER_MILLIS / 10), + Interval::parse("1 year 1 day 0.1 milliseconds", &config).unwrap(), + ); + + assert_eq!( + Interval::new(12i32, 1i32, 1000i64), + Interval::parse("1 year 1 day 1 microsecond", &config).unwrap(), + ); + + assert_eq!( + Interval::new(12i32, 1i32, 1i64), + Interval::parse("1 year 1 day 1 nanoseconds", &config).unwrap(), + ); + + assert_eq!( + Interval::new(1i32, 0i32, -NANOS_PER_SECOND), + Interval::parse("1 month -1 second", &config).unwrap(), + ); + + assert_eq!( + Interval::new(-13i32, -8i32, -NANOS_PER_HOUR - NANOS_PER_MINUTE - NANOS_PER_SECOND - (1.11_f64 * NANOS_PER_MILLIS as f64) as i64), + Interval::parse("-1 year -1 month -1 week -1 day -1 hour -1 minute -1 second -1.11 millisecond", &config).unwrap(), + ); + } + + #[test] + fn test_duplicate_interval_type() { + let config = IntervalParseConfig::new(IntervalUnit::Month); + + let err = Interval::parse("1 month 1 second 1 second", &config) + .expect_err("parsing interval should have failed"); + assert_eq!( + r#"ParseError("Invalid input syntax for type interval: \"1 month 1 second 1 second\". Repeated type 'second'")"#, + format!("{err:?}") + ); + + // test with singular and plural forms + let err = Interval::parse("1 century 2 centuries", &config) + .expect_err("parsing interval should have failed"); + assert_eq!( + r#"ParseError("Invalid input syntax for type interval: \"1 century 2 centuries\". Repeated type 'centuries'")"#, + format!("{err:?}") + ); + } + + #[test] + fn test_interval_amount_parsing() { + // integer + let result = IntervalAmount::from_str("123").unwrap(); + let expected = IntervalAmount::new(123, 0); + + assert_eq!(result, expected); + + // positive w/ fractional + let result = IntervalAmount::from_str("0.3").unwrap(); + let expected = IntervalAmount::new(0, 3 * 10_i64.pow(INTERVAL_PRECISION - 1)); + + assert_eq!(result, expected); + + // negative w/ fractional + let result = IntervalAmount::from_str("-3.5").unwrap(); + let expected = IntervalAmount::new(-3, -5 * 10_i64.pow(INTERVAL_PRECISION - 1)); + + assert_eq!(result, expected); + + // invalid: missing integer + let result = IntervalAmount::from_str(".5"); + assert!(result.is_err()); + + // invalid: missing fractional + let result = IntervalAmount::from_str("3."); + assert!(result.is_err()); + + // invalid: sign in fractional + let result = IntervalAmount::from_str("3.-5"); + assert!(result.is_err()); + } + + #[test] + fn test_interval_precision() { + let config = IntervalParseConfig::new(IntervalUnit::Month); + + let result = Interval::parse("100000.1 days", &config).unwrap(); + let expected = Interval::new(0_i32, 100_000_i32, NANOS_PER_DAY / 10); + + assert_eq!(result, expected); + } + + #[test] + fn test_interval_addition() { + // add 4.1 centuries + let start = Interval::new(1, 2, 3); + let expected = Interval::new(4921, 2, 3); + + let result = start + .add( + IntervalAmount::new(4, 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Century, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add 10.25 decades + let start = Interval::new(1, 2, 3); + let expected = Interval::new(1231, 2, 3); + + let result = start + .add( + IntervalAmount::new(10, 25 * 10_i64.pow(INTERVAL_PRECISION - 2)), + IntervalUnit::Decade, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add 
30.3 years (reminder: Postgres logic does not spill to days/nanos when interval is larger than a month) + let start = Interval::new(1, 2, 3); + let expected = Interval::new(364, 2, 3); + + let result = start + .add( + IntervalAmount::new(30, 3 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Year, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add 1.5 months + let start = Interval::new(1, 2, 3); + let expected = Interval::new(2, 17, 3); + + let result = start + .add( + IntervalAmount::new(1, 5 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Month, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add -2 weeks + let start = Interval::new(1, 25, 3); + let expected = Interval::new(1, 11, 3); + + let result = start + .add(IntervalAmount::new(-2, 0), IntervalUnit::Week) + .unwrap(); + + assert_eq!(result, expected); + + // add 2.2 days + let start = Interval::new(12, 15, 3); + let expected = Interval::new(12, 17, 3 + 17_280 * NANOS_PER_SECOND); + + let result = start + .add( + IntervalAmount::new(2, 2 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Day, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add 12.5 hours + let start = Interval::new(1, 2, 3); + let expected = Interval::new(1, 2, 3 + 45_000 * NANOS_PER_SECOND); + + let result = start + .add( + IntervalAmount::new(12, 5 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Hour, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add -1.5 minutes + let start = Interval::new(0, 0, -3); + let expected = Interval::new(0, 0, -90_000_000_000 - 3); + + let result = start + .add( + IntervalAmount::new(-1, -5 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Minute, + ) + .unwrap(); + + assert_eq!(result, expected); + } + + #[test] + fn string_to_timestamp_old() { + parse_timestamp("1677-06-14T07:29:01.256") + .map_err(|e| assert!(e.to_string().ends_with(ERR_NANOSECONDS_NOT_SUPPORTED))) + .unwrap_err(); + } + + #[test] + fn test_parse_decimal_with_parameter() { + let tests = [ + ("0", 0i128), + ("123.123", 123123i128), + ("123.1234", 123123i128), + ("123.1", 123100i128), + ("123", 123000i128), + ("-123.123", -123123i128), + ("-123.1234", -123123i128), + ("-123.1", -123100i128), + ("-123", -123000i128), + ("0.0000123", 0i128), + ("12.", 12000i128), + ("-12.", -12000i128), + ("00.1", 100i128), + ("-00.1", -100i128), + ("12345678912345678.1234", 12345678912345678123i128), + ("-12345678912345678.1234", -12345678912345678123i128), + ("99999999999999999.999", 99999999999999999999i128), + ("-99999999999999999.999", -99999999999999999999i128), + (".123", 123i128), + ("-.123", -123i128), + ("123.", 123000i128), + ("-123.", -123000i128), + ]; + for (s, i) in tests { + let result_128 = parse_decimal::(s, 20, 3); + assert_eq!(i, result_128.unwrap()); + let result_256 = parse_decimal::(s, 20, 3); + assert_eq!(i256::from_i128(i), result_256.unwrap()); + } + let can_not_parse_tests = ["123,123", ".", "123.123.123", "", "+", "-"]; + for s in can_not_parse_tests { + let result_128 = parse_decimal::(s, 20, 3); + assert_eq!( + format!("Parser error: can't parse the string value {s} to decimal"), + result_128.unwrap_err().to_string() + ); + let result_256 = parse_decimal::(s, 20, 3); + assert_eq!( + format!("Parser error: can't parse the string value {s} to decimal"), + result_256.unwrap_err().to_string() + ); + } + let overflow_parse_tests = ["12345678", "12345678.9", "99999999.99"]; + for s in overflow_parse_tests { + let result_128 = parse_decimal::(s, 10, 3); + let expected_128 = "Parser error: parse 
decimal overflow"; + let actual_128 = result_128.unwrap_err().to_string(); + + assert!( + actual_128.contains(expected_128), + "actual: '{actual_128}', expected: '{expected_128}'" + ); + + let result_256 = parse_decimal::(s, 10, 3); + let expected_256 = "Parser error: parse decimal overflow"; + let actual_256 = result_256.unwrap_err().to_string(); + + assert!( + actual_256.contains(expected_256), + "actual: '{actual_256}', expected: '{expected_256}'" + ); + } + + let edge_tests_128 = [ + ( + "99999999999999999999999999999999999999", + 99999999999999999999999999999999999999i128, + 0, + ), + ( + "999999999999999999999999999999999999.99", + 99999999999999999999999999999999999999i128, + 2, + ), + ( + "9999999999999999999999999.9999999999999", + 99999999999999999999999999999999999999i128, + 13, + ), + ( + "9999999999999999999999999", + 99999999999999999999999990000000000000i128, + 13, + ), + ( + "0.99999999999999999999999999999999999999", + 99999999999999999999999999999999999999i128, + 38, + ), + ]; + for (s, i, scale) in edge_tests_128 { + let result_128 = parse_decimal::(s, 38, scale); + assert_eq!(i, result_128.unwrap()); + } + let edge_tests_256 = [ + ( + "9999999999999999999999999999999999999999999999999999999999999999999999999999", +i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + 0, + ), + ( + "999999999999999999999999999999999999999999999999999999999999999999999999.9999", + i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + 4, + ), + ( + "99999999999999999999999999999999999999999999999999.99999999999999999999999999", + i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + 26, + ), + ( + "99999999999999999999999999999999999999999999999999", + i256::from_string("9999999999999999999999999999999999999999999999999900000000000000000000000000").unwrap(), + 26, + ), + ]; + for (s, i, scale) in edge_tests_256 { + let result = parse_decimal::(s, 76, scale); + assert_eq!(i, result.unwrap()); + } + } +} From 3e6cf98a7b1e54a87a32083a09a351476286e5d4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 5 Jun 2023 19:43:31 +0100 Subject: [PATCH 0986/1411] Make PrimitiveArray::with_timezone consuming (#4366) --- arrow-array/src/array/primitive_array.rs | 17 +++++++---------- .../src/arrow/array_reader/primitive_array.rs | 4 ++-- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 35202a4c7fd7..3fa011f8e127 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1132,24 +1132,21 @@ impl PrimitiveArray { } /// Construct a timestamp array with new timezone - pub fn with_timezone(&self, timezone: impl Into>) -> Self { + pub fn with_timezone(self, timezone: impl Into>) -> Self { self.with_timezone_opt(Some(timezone.into())) } /// Construct a timestamp array with UTC - pub fn with_timezone_utc(&self) -> Self { + pub fn with_timezone_utc(self) -> Self { self.with_timezone("+00:00") } /// Construct a timestamp array with an optional timezone - pub fn with_timezone_opt>>(&self, timezone: Option) -> Self { - let array_data = unsafe { - self.to_data() - .into_builder() - .data_type(DataType::Timestamp(T::UNIT, timezone.map(Into::into))) - .build_unchecked() - }; - PrimitiveArray::from(array_data) + pub fn with_timezone_opt>>(self, 
timezone: Option) -> Self { + Self { + data_type: DataType::Timestamp(T::UNIT, timezone.map(Into::into)), + ..self + } } } diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index ec0d29e8babc..f833eccecb4c 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -502,7 +502,7 @@ mod tests { ) .as_str(), ) - $(.with_timezone($timezone))? + $(.clone().with_timezone($timezone))? ; // create expected array as primitive, and cast to result type @@ -527,7 +527,7 @@ mod tests { ) .as_str(), ) - $(.with_timezone($timezone))? + $(.clone().with_timezone($timezone))? ; assert_eq!(expected, array); } From 7c406e4465ced4b3a69ef8ba4d1dbb341ce8d0c7 Mon Sep 17 00:00:00 2001 From: dadepo Date: Mon, 5 Jun 2023 22:43:45 +0400 Subject: [PATCH 0987/1411] Have array_to_json_array support MapArray (#4364) * Have array_to_json_array support MapArray * Have the iterator for MapArray return StructArray --- arrow-array/src/array/map_array.rs | 22 ++++++++++- arrow-array/src/iterator.rs | 4 +- arrow-json/src/writer.rs | 59 ++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index cf0978f05b4e..c98bca9505c3 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -16,7 +16,10 @@ // under the License. use crate::array::{get_offsets, print_long_array}; -use crate::{make_array, Array, ArrayRef, ListArray, StringArray, StructArray}; +use crate::iterator::MapArrayIter; +use crate::{ + make_array, Array, ArrayAccessor, ArrayRef, ListArray, StringArray, StructArray, +}; use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; @@ -116,6 +119,11 @@ impl MapArray { value_offsets: self.value_offsets.slice(offset, length), } } + + /// constructs a new iterator + pub fn iter(&self) -> MapArrayIter<'_> { + MapArrayIter::new(self) + } } impl From for MapArray { @@ -284,6 +292,18 @@ impl Array for MapArray { } } +impl<'a> ArrayAccessor for &'a MapArray { + type Item = StructArray; + + fn value(&self, index: usize) -> Self::Item { + MapArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + MapArray::value(self, index) + } +} + impl std::fmt::Debug for MapArray { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "MapArray\n[\n")?; diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index fa76e09b2883..86f5d991288a 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -21,7 +21,7 @@ use crate::array::{ ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray, GenericStringArray, PrimitiveArray, }; -use crate::FixedSizeListArray; +use crate::{FixedSizeListArray, MapArray}; /// An iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`] /// @@ -129,6 +129,8 @@ pub type FixedSizeBinaryIter<'a> = ArrayIter<&'a FixedSizeBinaryArray>; pub type FixedSizeListIter<'a> = ArrayIter<&'a FixedSizeListArray>; /// an iterator that returns Some(T) or None, that can be used on any ListArray pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray>; +/// an iterator that returns Some(T) or None, that can be used on any MapArray +pub type MapArrayIter<'a> = ArrayIter<&'a MapArray>; 
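The new `MapArray::iter` / `MapArrayIter` surface walks a map column one entry set at a time, each item being an optional `StructArray` of key/value pairs. A minimal sketch of how it might be used, built with the same `MapBuilder` calls the writer test below relies on:

```rust
use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder};
use arrow_array::Array;

fn main() {
    let mut builder = MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
    builder.keys().append_value("joe");
    builder.values().append_value(1);
    builder.append(true).unwrap(); // {"joe": 1}
    builder.append(false).unwrap(); // null map value
    let map = builder.finish();

    // Each item is Option<StructArray> holding the entries of one map value
    for entry in map.iter() {
        match entry {
            Some(entries) => println!("map with {} entries", entries.len()),
            None => println!("null map"),
        }
    }
}
```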
#[cfg(test)] mod tests { diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index d2365118a31d..6fed0f747c1b 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -94,6 +94,7 @@ //! ``` use std::iter; +use std::sync::Arc; use std::{fmt::Debug, io::Write}; use serde_json::map::Map as JsonMap; @@ -202,6 +203,15 @@ pub fn array_to_json_array(array: &ArrayRef) -> Result, ArrowError> { let jsonmaps = struct_array_to_jsonmap_array(array.as_struct())?; Ok(jsonmaps.into_iter().map(Value::Object).collect()) } + DataType::Map(_, _) => as_map_array(array) + .iter() + .map(|maybe_value| match maybe_value { + Some(v) => Ok(Value::Array(array_to_json_array( + &(Arc::new(v) as ArrayRef), + )?)), + None => Ok(Value::Null), + }) + .collect(), t => Err(ArrowError::JsonError(format!( "data type {t:?} not supported" ))), @@ -617,6 +627,7 @@ mod tests { use std::io::{BufReader, Seek}; use std::sync::Arc; + use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder}; use serde_json::json; use arrow_buffer::{Buffer, ToByteSlice}; @@ -1520,4 +1531,52 @@ mod tests { assert_eq!(array_to_json_array(&list_array).unwrap(), expected_json); } + + #[test] + fn test_array_to_json_array_for_map_array() { + let expected_json = serde_json::from_value::>(json!([ + [ + { + "keys": "joe", + "values": 1 + } + ], + [ + { + "keys": "blogs", + "values": 2 + }, + { + "keys": "foo", + "values": 4 + } + ], + [], + null + ])) + .unwrap(); + + let string_builder = StringBuilder::new(); + let int_builder = Int32Builder::with_capacity(4); + + let mut builder = MapBuilder::new(None, string_builder, int_builder); + + builder.keys().append_value("joe"); + builder.values().append_value(1); + builder.append(true).unwrap(); + + builder.keys().append_value("blogs"); + builder.values().append_value(2); + builder.keys().append_value("foo"); + builder.values().append_value(4); + builder.append(true).unwrap(); + builder.append(true).unwrap(); + builder.append(false).unwrap(); + + let array = builder.finish(); + + let map_array = Arc::new(array) as ArrayRef; + + assert_eq!(array_to_json_array(&map_array).unwrap(), expected_json); + } } From 6c742c7d8a7168a302f6d4d356a345755368f553 Mon Sep 17 00:00:00 2001 From: dadepo Date: Tue, 6 Jun 2023 10:11:29 +0400 Subject: [PATCH 0988/1411] Changed array_to_json_array to take &dyn Array (#4370) * changed array_to_json_array to take &dyn Arra * Fix formatting * Update documentation --- arrow-json/src/writer.rs | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 6fed0f747c1b..571e95a1a4ec 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -94,7 +94,6 @@ //! 
``` use std::iter; -use std::sync::Arc; use std::{fmt::Debug, io::Write}; use serde_json::map::Map as JsonMap; @@ -108,7 +107,7 @@ use arrow_schema::*; use arrow_cast::display::{ArrayFormatter, FormatOptions}; -fn primitive_array_to_json(array: &ArrayRef) -> Result, ArrowError> +fn primitive_array_to_json(array: &dyn Array) -> Result, ArrowError> where T: ArrowPrimitiveType, T::Native: JsonSerializable, @@ -138,8 +137,8 @@ fn struct_array_to_jsonmap_array( Ok(inner_objs) } -/// Converts an arrow [`ArrayRef`] into a `Vec` of Serde JSON [`serde_json::Value`]'s -pub fn array_to_json_array(array: &ArrayRef) -> Result, ArrowError> { +/// Converts an arrow [`Array`] into a `Vec` of Serde JSON [`serde_json::Value`]'s +pub fn array_to_json_array(array: &dyn Array) -> Result, ArrowError> { match array.data_type() { DataType::Null => Ok(iter::repeat(Value::Null).take(array.len()).collect()), DataType::Boolean => Ok(array @@ -206,9 +205,7 @@ pub fn array_to_json_array(array: &ArrayRef) -> Result, ArrowError> { DataType::Map(_, _) => as_map_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array( - &(Arc::new(v) as ArrayRef), - )?)), + Some(v) => Ok(Value::Array(array_to_json_array(&v)?)), None => Ok(Value::Null), }) .collect(), @@ -627,9 +624,9 @@ mod tests { use std::io::{BufReader, Seek}; use std::sync::Arc; - use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder}; use serde_json::json; + use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::ArrayData; From 077058d55c1e27e30f9affdc7c8ac9eef651eccc Mon Sep 17 00:00:00 2001 From: jakevin Date: Tue, 6 Jun 2023 14:31:14 +0800 Subject: [PATCH 0989/1411] minor: remove useless prefix (#4367) --- arrow-ord/src/comparison.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index b9274f0eaefb..4f8b9a322620 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -1196,9 +1196,10 @@ where K: ArrowDictionaryKeyType, K::Native: num::ToPrimitive, { - // TODO: Use take_boolean (#2967) - let array = take(&dict_comparison, dict.keys(), None)?; - Ok(BooleanArray::from(array.to_data())) + let array = take(&dict_comparison, dict.keys(), None)? 
+ .as_boolean() + .clone(); + Ok(array) } /// Helper function to perform boolean lambda function on values from two arrays using From d48391d208b7cb6c0ae7f2511c69cdb2b40bb76d Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Tue, 6 Jun 2023 12:21:04 +0200 Subject: [PATCH 0990/1411] feat(flight): add xdbc type info helpers (#4359) * feat(flight): add xdbc type info helpers * test: add test for creating record batch * test: filter record batch * docs: add some basic docstrings and examples * fix: xdbc info example imports * fix: actually fix example * docs: fix link * fix: rename structs and add GetXdbcTypeInfoBuilder * fix: clippy * Update arrow-flight/src/sql/metadata/xdbc_info.rs Co-authored-by: Andrew Lamb * fix: pr feedback * fix: docs * fix: missed one name * fix: example --------- Co-authored-by: Andrew Lamb --- arrow-flight/examples/flight_sql_server.rs | 66 +++- arrow-flight/src/sql/metadata/mod.rs | 17 + arrow-flight/src/sql/metadata/sql_info.rs | 13 +- arrow-flight/src/sql/metadata/xdbc_info.rs | 433 +++++++++++++++++++++ 4 files changed, 505 insertions(+), 24 deletions(-) create mode 100644 arrow-flight/src/sql/metadata/xdbc_info.rs diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index ecd8db76bba9..e9dba08f0dcd 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -30,7 +30,9 @@ use tonic::{Request, Response, Status, Streaming}; use arrow_array::builder::StringBuilder; use arrow_array::{ArrayRef, RecordBatch}; use arrow_flight::encode::FlightDataEncoderBuilder; -use arrow_flight::sql::metadata::SqlInfoList; +use arrow_flight::sql::metadata::{ + SqlInfoList, XdbcTypeInfo, XdbcTypeInfoData, XdbcTypeInfoDataBuilder, +}; use arrow_flight::sql::{ server::FlightSqlService, ActionBeginSavepointRequest, ActionBeginSavepointResult, ActionBeginTransactionRequest, ActionBeginTransactionResult, @@ -42,8 +44,8 @@ use arrow_flight::sql::{ CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, - CommandStatementSubstraitPlan, CommandStatementUpdate, ProstMessageExt, SqlInfo, - TicketStatementQuery, + CommandStatementSubstraitPlan, CommandStatementUpdate, Nullable, ProstMessageExt, + Searchable, SqlInfo, TicketStatementQuery, XdbcDataType, }; use arrow_flight::utils::batches_to_flight_data; use arrow_flight::{ @@ -73,6 +75,32 @@ static INSTANCE_SQL_INFO: Lazy = Lazy::new(|| { .with_sql_info(SqlInfo::FlightSqlServerArrowVersion, "1.3") }); +static INSTANCE_XBDC_DATA: Lazy = Lazy::new(|| { + let mut builder = XdbcTypeInfoDataBuilder::new(); + builder.append(XdbcTypeInfo { + type_name: "INTEGER".into(), + data_type: XdbcDataType::XdbcInteger, + column_size: Some(32), + literal_prefix: None, + literal_suffix: None, + create_params: None, + nullable: Nullable::NullabilityNullable, + case_sensitive: false, + searchable: Searchable::Full, + unsigned_attribute: Some(false), + fixed_prec_scale: false, + auto_increment: Some(false), + local_type_name: Some("INTEGER".into()), + minimum_scale: None, + maximum_scale: None, + sql_data_type: XdbcDataType::XdbcInteger, + datetime_subcode: None, + num_prec_radix: Some(2), + interval_precision: None, + }); + builder.build().unwrap() +}); + static TABLES: Lazy> = Lazy::new(|| vec!["flight_sql.example.table"]); #[derive(Clone)] @@ -367,12 +395,20 @@ impl 
FlightSqlService for FlightSqlServiceImpl { async fn get_flight_info_xdbc_type_info( &self, - _query: CommandGetXdbcTypeInfo, - _request: Request, + query: CommandGetXdbcTypeInfo, + request: Request, ) -> Result, Status> { - Err(Status::unimplemented( - "get_flight_info_xdbc_type_info not implemented", - )) + let flight_descriptor = request.into_inner(); + let ticket = Ticket::new(query.encode_to_vec()); + let endpoint = FlightEndpoint::new().with_ticket(ticket); + + let flight_info = FlightInfo::new() + .try_with_schema(query.into_builder(&INSTANCE_XBDC_DATA).schema().as_ref()) + .map_err(|e| status!("Unable to encode schema", e))? + .with_endpoint(endpoint) + .with_descriptor(flight_descriptor); + + Ok(tonic::Response::new(flight_info)) } // do_get @@ -544,12 +580,18 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_get_xdbc_type_info( &self, - _query: CommandGetXdbcTypeInfo, + query: CommandGetXdbcTypeInfo, _request: Request, ) -> Result::DoGetStream>, Status> { - Err(Status::unimplemented( - "do_get_xdbc_type_info not implemented", - )) + // create a builder with pre-defined Xdbc data: + let builder = query.into_builder(&INSTANCE_XBDC_DATA); + let schema = builder.schema(); + let batch = builder.build(); + let stream = FlightDataEncoderBuilder::new() + .with_schema(schema) + .build(futures::stream::once(async { batch })) + .map_err(Status::from); + Ok(Response::new(Box::pin(stream))) } // do_put diff --git a/arrow-flight/src/sql/metadata/mod.rs b/arrow-flight/src/sql/metadata/mod.rs index 9d3810806ab4..b823c1f4a643 100644 --- a/arrow-flight/src/sql/metadata/mod.rs +++ b/arrow-flight/src/sql/metadata/mod.rs @@ -30,11 +30,13 @@ mod catalogs; mod db_schemas; mod sql_info; mod tables; +mod xdbc_info; pub use catalogs::GetCatalogsBuilder; pub use db_schemas::GetDbSchemasBuilder; pub use sql_info::SqlInfoList; pub use tables::GetTablesBuilder; +pub use xdbc_info::{XdbcTypeInfo, XdbcTypeInfoData, XdbcTypeInfoDataBuilder}; use arrow_array::ArrayRef; use arrow_array::UInt32Array; @@ -53,3 +55,18 @@ fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array { sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); UInt32Array::from_iter_values(sort.iter().map(|(i, _)| *i as u32)) } + +#[cfg(test)] +mod tests { + use arrow_array::RecordBatch; + use arrow_cast::pretty::pretty_format_batches; + pub fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) { + let formatted = pretty_format_batches(batches).unwrap().to_string(); + let actual_lines: Vec<_> = formatted.trim().lines().collect(); + assert_eq!( + &actual_lines, expected_lines, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected_lines, actual_lines + ); + } +} diff --git a/arrow-flight/src/sql/metadata/sql_info.rs b/arrow-flight/src/sql/metadata/sql_info.rs index 3dcee1e58c3b..4b4604078359 100644 --- a/arrow-flight/src/sql/metadata/sql_info.rs +++ b/arrow-flight/src/sql/metadata/sql_info.rs @@ -440,21 +440,10 @@ mod tests { use std::collections::HashMap; use super::SqlInfoList; + use crate::sql::metadata::tests::assert_batches_eq; use crate::sql::{ SqlInfo, SqlNullOrdering, SqlSupportedTransaction, SqlSupportsConvert, }; - use arrow_array::RecordBatch; - use arrow_cast::pretty::pretty_format_batches; - - fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) { - let formatted = pretty_format_batches(batches).unwrap().to_string(); - let actual_lines: Vec<_> = formatted.trim().lines().collect(); - assert_eq!( - &actual_lines, expected_lines, - "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", - 
expected_lines, actual_lines - ); - } #[test] fn test_sql_infos() { diff --git a/arrow-flight/src/sql/metadata/xdbc_info.rs b/arrow-flight/src/sql/metadata/xdbc_info.rs new file mode 100644 index 000000000000..cecef1b49e8b --- /dev/null +++ b/arrow-flight/src/sql/metadata/xdbc_info.rs @@ -0,0 +1,433 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Helpers for [`CommandGetXdbcTypeInfo`] metadata requests. +//! +//! - [`XdbcTypeInfo`] - a typed struct that holds the xdbc info corresponding to expected schema. +//! - [`XdbcTypeInfoDataBuilder`] - a builder for collecting type infos +//! and building a conformant `RecordBatch`. +//! - [`XdbcTypeInfoData`] - a helper type wrapping a `RecordBatch` +//! used for storing xdbc server metadata. +//! - [`GetXdbcTypeInfoBuilder`] - a builder for consructing [`CommandGetXdbcTypeInfo`] responses. +//! +use std::sync::Arc; + +use arrow_array::builder::{BooleanBuilder, Int32Builder, ListBuilder, StringBuilder}; +use arrow_array::cast::downcast_array; +use arrow_array::{ArrayRef, Int32Array, ListArray, RecordBatch}; +use arrow_ord::comparison::eq_scalar; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use arrow_select::filter::filter_record_batch; +use arrow_select::take::take; +use once_cell::sync::Lazy; + +use super::lexsort_to_indices; +use crate::error::*; +use crate::sql::{ + CommandGetXdbcTypeInfo, Nullable, Searchable, XdbcDataType, XdbcDatetimeSubcode, +}; + +/// Data structure representing type information for xdbc types. +#[derive(Debug, Clone, Default)] +pub struct XdbcTypeInfo { + pub type_name: String, + pub data_type: XdbcDataType, + pub column_size: Option, + pub literal_prefix: Option, + pub literal_suffix: Option, + pub create_params: Option>, + pub nullable: Nullable, + pub case_sensitive: bool, + pub searchable: Searchable, + pub unsigned_attribute: Option, + pub fixed_prec_scale: bool, + pub auto_increment: Option, + pub local_type_name: Option, + pub minimum_scale: Option, + pub maximum_scale: Option, + pub sql_data_type: XdbcDataType, + pub datetime_subcode: Option, + pub num_prec_radix: Option, + pub interval_precision: Option, +} + +/// Helper to create [`CommandGetXdbcTypeInfo`] responses. +/// +/// [`CommandGetXdbcTypeInfo`] are metadata requests used by a Flight SQL +/// server to communicate supported capabilities to Flight SQL clients. +/// +/// Servers constuct - usually static - [`XdbcTypeInfoData`] via the [XdbcTypeInfoDataBuilder`], +/// and build responses by passing the [`GetXdbcTypeInfoBuilder`]. 
+pub struct XdbcTypeInfoData { + batch: RecordBatch, +} + +impl XdbcTypeInfoData { + /// Return the raw (not encoded) RecordBatch that will be returned + /// from [`CommandGetXdbcTypeInfo`] + pub fn record_batch(&self, data_type: impl Into>) -> Result { + if let Some(dt) = data_type.into() { + let arr: Int32Array = downcast_array(self.batch.column(1).as_ref()); + let filter = eq_scalar(&arr, dt)?; + Ok(filter_record_batch(&self.batch, &filter)?) + } else { + Ok(self.batch.clone()) + } + } + + /// Return the schema of the RecordBatch that will be returned + /// from [`CommandGetXdbcTypeInfo`] + pub fn schema(&self) -> SchemaRef { + self.batch.schema() + } +} + +pub struct XdbcTypeInfoDataBuilder { + infos: Vec, +} + +impl Default for XdbcTypeInfoDataBuilder { + fn default() -> Self { + Self::new() + } +} + +/// A builder for [`XdbcTypeInfoData`] which is used to create [`CommandGetXdbcTypeInfo`] responses. +/// +/// # Example +/// ``` +/// use arrow_flight::sql::{Nullable, Searchable, XdbcDataType}; +/// use arrow_flight::sql::metadata::{XdbcTypeInfo, XdbcTypeInfoDataBuilder}; +/// // Create the list of metadata describing the server. Since this would not change at +/// // runtime, using once_cell::Lazy or similar patterns to constuct the list is a common approach. +/// let mut builder = XdbcTypeInfoDataBuilder::new(); +/// builder.append(XdbcTypeInfo { +/// type_name: "INTEGER".into(), +/// data_type: XdbcDataType::XdbcInteger, +/// column_size: Some(32), +/// literal_prefix: None, +/// literal_suffix: None, +/// create_params: None, +/// nullable: Nullable::NullabilityNullable, +/// case_sensitive: false, +/// searchable: Searchable::Full, +/// unsigned_attribute: Some(false), +/// fixed_prec_scale: false, +/// auto_increment: Some(false), +/// local_type_name: Some("INTEGER".into()), +/// minimum_scale: None, +/// maximum_scale: None, +/// sql_data_type: XdbcDataType::XdbcInteger, +/// datetime_subcode: None, +/// num_prec_radix: Some(2), +/// interval_precision: None, +/// }); +/// let info_list = builder.build().unwrap(); +/// +/// // to access the underlying record batch +/// let batch = info_list.record_batch(None); +/// ``` +impl XdbcTypeInfoDataBuilder { + /// Create a new instance of [`XdbcTypeInfoDataBuilder`]. + pub fn new() -> Self { + Self { infos: Vec::new() } + } + + /// Append a new row + pub fn append(&mut self, info: XdbcTypeInfo) { + self.infos.push(info); + } + + /// Create helper structure for handling xdbc metadata requests. 
+ pub fn build(self) -> Result { + let mut type_name_builder = StringBuilder::new(); + let mut data_type_builder = Int32Builder::new(); + let mut column_size_builder = Int32Builder::new(); + let mut literal_prefix_builder = StringBuilder::new(); + let mut literal_suffix_builder = StringBuilder::new(); + let mut create_params_builder = ListBuilder::new(StringBuilder::new()); + let mut nullable_builder = Int32Builder::new(); + let mut case_sensitive_builder = BooleanBuilder::new(); + let mut searchable_builder = Int32Builder::new(); + let mut unsigned_attribute_builder = BooleanBuilder::new(); + let mut fixed_prec_scale_builder = BooleanBuilder::new(); + let mut auto_increment_builder = BooleanBuilder::new(); + let mut local_type_name_builder = StringBuilder::new(); + let mut minimum_scale_builder = Int32Builder::new(); + let mut maximum_scale_builder = Int32Builder::new(); + let mut sql_data_type_builder = Int32Builder::new(); + let mut datetime_subcode_builder = Int32Builder::new(); + let mut num_prec_radix_builder = Int32Builder::new(); + let mut interval_precision_builder = Int32Builder::new(); + + self.infos.into_iter().for_each(|info| { + type_name_builder.append_value(info.type_name); + data_type_builder.append_value(info.data_type as i32); + column_size_builder.append_option(info.column_size); + literal_prefix_builder.append_option(info.literal_prefix); + literal_suffix_builder.append_option(info.literal_suffix); + if let Some(params) = info.create_params { + if !params.is_empty() { + for param in params { + create_params_builder.values().append_value(param); + } + create_params_builder.append(true); + } else { + create_params_builder.append_null(); + } + } else { + create_params_builder.append_null(); + } + nullable_builder.append_value(info.nullable as i32); + case_sensitive_builder.append_value(info.case_sensitive); + searchable_builder.append_value(info.searchable as i32); + unsigned_attribute_builder.append_option(info.unsigned_attribute); + fixed_prec_scale_builder.append_value(info.fixed_prec_scale); + auto_increment_builder.append_option(info.auto_increment); + local_type_name_builder.append_option(info.local_type_name); + minimum_scale_builder.append_option(info.minimum_scale); + maximum_scale_builder.append_option(info.maximum_scale); + sql_data_type_builder.append_value(info.sql_data_type as i32); + datetime_subcode_builder + .append_option(info.datetime_subcode.map(|code| code as i32)); + num_prec_radix_builder.append_option(info.num_prec_radix); + interval_precision_builder.append_option(info.interval_precision); + }); + + let type_name = Arc::new(type_name_builder.finish()); + let data_type = Arc::new(data_type_builder.finish()); + let column_size = Arc::new(column_size_builder.finish()); + let literal_prefix = Arc::new(literal_prefix_builder.finish()); + let literal_suffix = Arc::new(literal_suffix_builder.finish()); + let (field, offsets, values, nulls) = create_params_builder.finish().into_parts(); + // Re-defined the field to be non-nullable + let new_field = Arc::new(field.as_ref().clone().with_nullable(false)); + let create_params = + Arc::new(ListArray::new(new_field, offsets, values, nulls)) as ArrayRef; + let nullable = Arc::new(nullable_builder.finish()); + let case_sensitive = Arc::new(case_sensitive_builder.finish()); + let searchable = Arc::new(searchable_builder.finish()); + let unsigned_attribute = Arc::new(unsigned_attribute_builder.finish()); + let fixed_prec_scale = Arc::new(fixed_prec_scale_builder.finish()); + let auto_increment = 
Arc::new(auto_increment_builder.finish()); + let local_type_name = Arc::new(local_type_name_builder.finish()); + let minimum_scale = Arc::new(minimum_scale_builder.finish()); + let maximum_scale = Arc::new(maximum_scale_builder.finish()); + let sql_data_type = Arc::new(sql_data_type_builder.finish()); + let datetime_subcode = Arc::new(datetime_subcode_builder.finish()); + let num_prec_radix = Arc::new(num_prec_radix_builder.finish()); + let interval_precision = Arc::new(interval_precision_builder.finish()); + + let batch = RecordBatch::try_new( + Arc::clone(&GET_XDBC_INFO_SCHEMA), + vec![ + type_name, + data_type, + column_size, + literal_prefix, + literal_suffix, + create_params, + nullable, + case_sensitive, + searchable, + unsigned_attribute, + fixed_prec_scale, + auto_increment, + local_type_name, + minimum_scale, + maximum_scale, + sql_data_type, + datetime_subcode, + num_prec_radix, + interval_precision, + ], + )?; + + // Order batch by data_type and then by type_name + let sort_cols = batch.project(&[1, 0])?; + let indices = lexsort_to_indices(sort_cols.columns()); + let columns = batch + .columns() + .iter() + .map(|c| take(c, &indices, None)) + .collect::, _>>()?; + + Ok(XdbcTypeInfoData { + batch: RecordBatch::try_new(batch.schema(), columns)?, + }) + } + + /// Return the [`Schema`] for a GetSchema RPC call with [`CommandGetXdbcTypeInfo`] + pub fn schema(&self) -> SchemaRef { + Arc::clone(&GET_XDBC_INFO_SCHEMA) + } +} + +/// A builder for a [`CommandGetXdbcTypeInfo`] response. +pub struct GetXdbcTypeInfoBuilder<'a> { + data_type: Option, + infos: &'a XdbcTypeInfoData, +} + +impl CommandGetXdbcTypeInfo { + /// Create a builder suitable for constructing a response + pub fn into_builder(self, infos: &XdbcTypeInfoData) -> GetXdbcTypeInfoBuilder { + GetXdbcTypeInfoBuilder { + data_type: self.data_type, + infos, + } + } +} + +impl GetXdbcTypeInfoBuilder<'_> { + /// Builds a `RecordBatch` with the correct schema for a [`CommandGetXdbcTypeInfo`] response + pub fn build(self) -> Result { + self.infos.record_batch(self.data_type) + } + + /// Return the schema of the RecordBatch that will be returned + /// from [`CommandGetXdbcTypeInfo`] + pub fn schema(&self) -> SchemaRef { + self.infos.schema() + } +} + +/// The schema for GetXdbcTypeInfo +static GET_XDBC_INFO_SCHEMA: Lazy = Lazy::new(|| { + Arc::new(Schema::new(vec![ + Field::new("type_name", DataType::Utf8, false), + Field::new("data_type", DataType::Int32, false), + Field::new("column_size", DataType::Int32, true), + Field::new("literal_prefix", DataType::Utf8, true), + Field::new("literal_suffix", DataType::Utf8, true), + Field::new( + "create_params", + DataType::List(Arc::new(Field::new("item", DataType::Utf8, false))), + true, + ), + Field::new("nullable", DataType::Int32, false), + Field::new("case_sensitive", DataType::Boolean, false), + Field::new("searchable", DataType::Int32, false), + Field::new("unsigned_attribute", DataType::Boolean, true), + Field::new("fixed_prec_scale", DataType::Boolean, false), + Field::new("auto_increment", DataType::Boolean, true), + Field::new("local_type_name", DataType::Utf8, true), + Field::new("minimum_scale", DataType::Int32, true), + Field::new("maximum_scale", DataType::Int32, true), + Field::new("sql_data_type", DataType::Int32, false), + Field::new("datetime_subcode", DataType::Int32, true), + Field::new("num_prec_radix", DataType::Int32, true), + Field::new("interval_precision", DataType::Int32, true), + ])) +}); + +#[cfg(test)] +mod tests { + use super::*; + use 
crate::sql::metadata::tests::assert_batches_eq; + + #[test] + fn test_create_batch() { + let mut builder = XdbcTypeInfoDataBuilder::new(); + builder.append(XdbcTypeInfo { + type_name: "VARCHAR".into(), + data_type: XdbcDataType::XdbcVarchar, + column_size: Some(i32::MAX), + literal_prefix: Some("'".into()), + literal_suffix: Some("'".into()), + create_params: Some(vec!["length".into()]), + nullable: Nullable::NullabilityNullable, + case_sensitive: true, + searchable: Searchable::Full, + unsigned_attribute: None, + fixed_prec_scale: false, + auto_increment: None, + local_type_name: Some("VARCHAR".into()), + minimum_scale: None, + maximum_scale: None, + sql_data_type: XdbcDataType::XdbcVarchar, + datetime_subcode: None, + num_prec_radix: None, + interval_precision: None, + }); + builder.append(XdbcTypeInfo { + type_name: "INTEGER".into(), + data_type: XdbcDataType::XdbcInteger, + column_size: Some(32), + literal_prefix: None, + literal_suffix: None, + create_params: None, + nullable: Nullable::NullabilityNullable, + case_sensitive: false, + searchable: Searchable::Full, + unsigned_attribute: Some(false), + fixed_prec_scale: false, + auto_increment: Some(false), + local_type_name: Some("INTEGER".into()), + minimum_scale: None, + maximum_scale: None, + sql_data_type: XdbcDataType::XdbcInteger, + datetime_subcode: None, + num_prec_radix: Some(2), + interval_precision: None, + }); + builder.append(XdbcTypeInfo { + type_name: "INTERVAL".into(), + data_type: XdbcDataType::XdbcInterval, + column_size: Some(i32::MAX), + literal_prefix: Some("'".into()), + literal_suffix: Some("'".into()), + create_params: None, + nullable: Nullable::NullabilityNullable, + case_sensitive: false, + searchable: Searchable::Full, + unsigned_attribute: None, + fixed_prec_scale: false, + auto_increment: None, + local_type_name: Some("INTERVAL".into()), + minimum_scale: None, + maximum_scale: None, + sql_data_type: XdbcDataType::XdbcInterval, + datetime_subcode: Some(XdbcDatetimeSubcode::XdbcSubcodeUnknown), + num_prec_radix: None, + interval_precision: None, + }); + let infos = builder.build().unwrap(); + + let batch = infos.record_batch(None).unwrap(); + let expected = vec![ + "+-----------+-----------+-------------+----------------+----------------+---------------+----------+----------------+------------+--------------------+------------------+----------------+-----------------+---------------+---------------+---------------+------------------+----------------+--------------------+", + "| type_name | data_type | column_size | literal_prefix | literal_suffix | create_params | nullable | case_sensitive | searchable | unsigned_attribute | fixed_prec_scale | auto_increment | local_type_name | minimum_scale | maximum_scale | sql_data_type | datetime_subcode | num_prec_radix | interval_precision |", + "+-----------+-----------+-------------+----------------+----------------+---------------+----------+----------------+------------+--------------------+------------------+----------------+-----------------+---------------+---------------+---------------+------------------+----------------+--------------------+", + "| INTEGER | 4 | 32 | | | | 1 | false | 3 | false | false | false | INTEGER | | | 4 | | 2 | |", + "| INTERVAL | 10 | 2147483647 | ' | ' | | 1 | false | 3 | | false | | INTERVAL | | | 10 | 0 | | |", + "| VARCHAR | 12 | 2147483647 | ' | ' | [length] | 1 | true | 3 | | false | | VARCHAR | | | 12 | | | |", + 
"+-----------+-----------+-------------+----------------+----------------+---------------+----------+----------------+------------+--------------------+------------------+----------------+-----------------+---------------+---------------+---------------+------------------+----------------+--------------------+", + ]; + assert_batches_eq(&[batch], &expected); + + let batch = infos.record_batch(Some(10)).unwrap(); + let expected = vec![ + "+-----------+-----------+-------------+----------------+----------------+---------------+----------+----------------+------------+--------------------+------------------+----------------+-----------------+---------------+---------------+---------------+------------------+----------------+--------------------+", + "| type_name | data_type | column_size | literal_prefix | literal_suffix | create_params | nullable | case_sensitive | searchable | unsigned_attribute | fixed_prec_scale | auto_increment | local_type_name | minimum_scale | maximum_scale | sql_data_type | datetime_subcode | num_prec_radix | interval_precision |", + "+-----------+-----------+-------------+----------------+----------------+---------------+----------+----------------+------------+--------------------+------------------+----------------+-----------------+---------------+---------------+---------------+------------------+----------------+--------------------+", + "| INTERVAL | 10 | 2147483647 | ' | ' | | 1 | false | 3 | | false | | INTERVAL | | | 10 | 0 | | |", + "+-----------+-----------+-------------+----------------+----------------+---------------+----------+----------------+------------+--------------------+------------------+----------------+-----------------+---------------+---------------+---------------+------------------+----------------+--------------------+", + ]; + assert_batches_eq(&[batch], &expected); + } +} From 0c002e20f66c3ab53b883f3b7fb1a72a2e903a73 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Jun 2023 20:24:07 +0200 Subject: [PATCH 0991/1411] Update hashbrown requirement from 0.13 to 0.14 (#4373) Updates the requirements on [hashbrown](https://github.com/rust-lang/hashbrown) to permit the latest version. - [Changelog](https://github.com/rust-lang/hashbrown/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-lang/hashbrown/compare/v0.13.1...v0.14.0) --- updated-dependencies: - dependency-name: hashbrown dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-array/Cargo.toml | 2 +- arrow-row/Cargo.toml | 2 +- parquet/Cargo.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 634a0aa647fb..f2703bb6fca0 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -48,7 +48,7 @@ chrono = { version = "0.4.24", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } -hashbrown = { version = "0.13", default-features = false } +hashbrown = { version = "0.14", default-features = false } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } [features] diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 8f5de1177288..dcf624f8b7d1 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -46,7 +46,7 @@ arrow-data = { workspace = true } arrow-schema = { workspace = true } half = { version = "2.1", default-features = false } -hashbrown = { version = "0.13", default-features = false } +hashbrown = { version = "0.14", default-features = false } [dev-dependencies] arrow-cast = { workspace = true } diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index adcbe82a7bbd..52b0f049752c 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -63,7 +63,7 @@ serde_json = { version = "1.0", default-features = false, features = ["std"], op seq-macro = { version = "0.3", default-features = false } futures = { version = "0.3", default-features = false, features = ["std"], optional = true } tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "rt", "io-util"] } -hashbrown = { version = "0.13", default-features = false } +hashbrown = { version = "0.14", default-features = false } twox-hash = { version = "1.6", default-features = false } paste = { version = "1.0" } From a88dc75100dd682ca873850a434e476b7d7f8404 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 7 Jun 2023 16:50:55 +0100 Subject: [PATCH 0992/1411] Update comfy-table requirement from 6.0 to 7.0 (#4377) Updates the requirements on [comfy-table](https://github.com/nukesor/comfy-table) to permit the latest version. - [Release notes](https://github.com/nukesor/comfy-table/releases) - [Changelog](https://github.com/Nukesor/comfy-table/blob/main/CHANGELOG.md) - [Commits](https://github.com/nukesor/comfy-table/compare/v6.0.0...v7.0.0) --- updated-dependencies: - dependency-name: comfy-table dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-cast/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index ebfadeb99f1a..494ad104b11c 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -49,7 +49,7 @@ chrono = { version = "0.4.23", default-features = false, features = ["clock"] } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } -comfy-table = { version = "6.0", optional = true, default-features = false } +comfy-table = { version = "7.0", optional = true, default-features = false } [dev-dependencies] criterion = { version = "0.5", default-features = false } From bd07067390bf13b55ea43b523390eafd1865e606 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 7 Jun 2023 17:40:03 +0100 Subject: [PATCH 0993/1411] Add ListArrayReader benchmarks (#4378) --- parquet/benches/arrow_reader.rs | 106 +++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 2 deletions(-) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 3dda6304d122..825c7f00f905 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -17,15 +17,16 @@ use arrow::array::Array; use arrow::datatypes::DataType; +use arrow_schema::Field; use criterion::measurement::WallTime; use criterion::{criterion_group, criterion_main, BenchmarkGroup, Criterion}; use num::FromPrimitive; use num_bigint::BigInt; use parquet::arrow::array_reader::{ - make_byte_array_reader, make_fixed_len_byte_array_reader, + make_byte_array_reader, make_fixed_len_byte_array_reader, ListArrayReader, }; use parquet::basic::Type; -use parquet::data_type::FixedLenByteArrayType; +use parquet::data_type::{ByteArray, FixedLenByteArrayType}; use parquet::util::{DataPageBuilder, DataPageBuilderImpl, InMemoryPageIterator}; use parquet::{ arrow::array_reader::ArrayReader, @@ -56,6 +57,11 @@ fn build_test_schema() -> SchemaDescPtr { OPTIONAL BYTE_ARRAY optional_decimal3_leaf (DECIMAL(16,2)); REQUIRED FIXED_LEN_BYTE_ARRAY (16) mandatory_decimal4_leaf (DECIMAL(16,2)); OPTIONAL FIXED_LEN_BYTE_ARRAY (16) optional_decimal4_leaf (DECIMAL(16,2)); + OPTIONAL GROUP string_list (LIST) { + repeated group list { + optional BYTE_ARRAY element (UTF8); + } + } } "; parse_message_type(message_type) @@ -68,6 +74,7 @@ const NUM_ROW_GROUPS: usize = 1; const PAGES_PER_GROUP: usize = 2; const VALUES_PER_PAGE: usize = 10_000; const BATCH_SIZE: usize = 8192; +const MAX_LIST_LEN: usize = 10; const EXPECTED_VALUE_COUNT: usize = NUM_ROW_GROUPS * PAGES_PER_GROUP * VALUES_PER_PAGE; pub fn seedable_rng() -> StdRng { @@ -361,6 +368,66 @@ fn build_dictionary_encoded_string_page_iterator( InMemoryPageIterator::new(pages) } +fn build_string_list_page_iterator( + column_desc: ColumnDescPtr, + null_density: f32, +) -> impl PageIterator + Clone { + let max_def_level = column_desc.max_def_level(); + let max_rep_level = column_desc.max_rep_level(); + assert_eq!(max_def_level, 3); + assert_eq!(max_rep_level, 1); + + let mut rng = seedable_rng(); + let mut pages: Vec> = Vec::new(); + for i in 0..NUM_ROW_GROUPS { + let mut column_chunk_pages = Vec::new(); + for j in 0..PAGES_PER_GROUP { + // generate page + let mut 
values: Vec = + Vec::with_capacity(VALUES_PER_PAGE * MAX_LIST_LEN); + let mut def_levels = Vec::with_capacity(VALUES_PER_PAGE * MAX_LIST_LEN); + let mut rep_levels = Vec::with_capacity(VALUES_PER_PAGE * MAX_LIST_LEN); + for k in 0..VALUES_PER_PAGE { + rep_levels.push(0); + if rng.gen::() < null_density { + // Null list + def_levels.push(0); + continue; + } + let len = rng.gen_range(0..MAX_LIST_LEN); + if len == 0 { + // Empty list + def_levels.push(1); + continue; + } + + (1..len).for_each(|_| rep_levels.push(1)); + + for l in 0..len { + if rng.gen::() < null_density { + // Null element + def_levels.push(2); + } else { + def_levels.push(3); + let value = + format!("Test value {k}[{l}], row group: {i}, page: {j}"); + values.push(value.as_str().into()); + } + } + } + let mut page_builder = + DataPageBuilderImpl::new(column_desc.clone(), values.len() as u32, true); + page_builder.add_rep_levels(max_rep_level, &rep_levels); + page_builder.add_def_levels(max_def_level, &def_levels); + page_builder.add_values::(Encoding::PLAIN, &values); + column_chunk_pages.push(page_builder.consume()); + } + pages.push(column_chunk_pages); + } + + InMemoryPageIterator::new(pages) +} + fn bench_array_reader(mut array_reader: Box) -> usize { // test procedure: read data in batches of 8192 until no more data let mut total_count = 0; @@ -464,6 +531,16 @@ fn create_string_byte_array_dictionary_reader( .unwrap() } +fn create_string_list_reader( + page_iterator: impl PageIterator + 'static, + column_desc: ColumnDescPtr, +) -> Box { + let items = create_string_byte_array_reader(page_iterator, column_desc); + let field = Field::new("item", DataType::Utf8, true); + let data_type = DataType::List(Arc::new(field)); + Box::new(ListArrayReader::::new(items, data_type, 2, 1, true)) +} + fn bench_byte_decimal( group: &mut BenchmarkGroup, mandatory_column_desc: &ColumnDescPtr, @@ -798,6 +875,8 @@ fn add_benches(c: &mut Criterion) { let optional_string_column_desc = schema.column(3); let mandatory_int64_column_desc = schema.column(4); let optional_int64_column_desc = schema.column(5); + let string_list_desc = schema.column(14); + // primitive / int32 benchmarks // ============================= @@ -964,6 +1043,29 @@ fn add_benches(c: &mut Criterion) { }); group.finish(); + + // list benchmarks + //============================== + + let list_data = build_string_list_page_iterator(string_list_desc.clone(), 0.); + let mut group = c.benchmark_group("arrow_array_reader/ListArray"); + group.bench_function("plain encoded optional strings no NULLs", |b| { + b.iter(|| { + let reader = + create_string_list_reader(list_data.clone(), string_list_desc.clone()); + count = bench_array_reader(reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + let list_data = build_string_list_page_iterator(string_list_desc.clone(), 0.5); + group.bench_function("plain encoded optional strings half NULLs", |b| { + b.iter(|| { + let reader = + create_string_list_reader(list_data.clone(), string_list_desc.clone()); + count = bench_array_reader(reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); } criterion_group!(benches, add_benches, decimal_benches,); From 1cfbe1f06a43754c53987a2dabc56fe73843cc00 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 Jun 2023 16:17:17 -0400 Subject: [PATCH 0994/1411] Add more examples of constructing Boolean, Primitive, String, and Decimal Arrays, and From impl for i256 (#4379) * Add more examples of constructing Boolean, PrimitiveArray, StringArray and Decimal*Array, and impl for i256 * fix example 
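The `From` impls for `i256` added below let small integers convert without going through `from_i128`. A minimal sketch (not part of the patch) of the resulting ergonomics when constructing a `Decimal256Array`:

```
use arrow_array::{Array, Decimal256Array};
use arrow_buffer::i256;

fn main() {
    // With From<i64>, plain integers convert straight into i256
    let arr = Decimal256Array::from(vec![
        i256::from(1i64),
        i256::from(-5i64),
        i256::from(42i64),
    ]);
    assert_eq!(arr.len(), 3);

    // Equivalent to the pre-existing explicit constructor
    assert_eq!(i256::from(42i64), i256::from_i128(42));
}
```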
--- arrow-array/src/array/boolean_array.rs | 70 +++++----- arrow-array/src/array/primitive_array.rs | 167 ++++++++++++++++++++--- arrow-array/src/array/string_array.rs | 34 ++++- arrow-buffer/src/bigint.rs | 24 ++++ 4 files changed, 240 insertions(+), 55 deletions(-) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 9ecdb2c5d24d..6905baa806de 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -27,42 +27,50 @@ use std::sync::Arc; /// An array of [boolean values](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) /// -/// # Example +/// # Examples +/// +/// Construction +/// +/// ``` +///# use arrow_array::{Array, BooleanArray}; +/// // Create from Vec> +/// let arr = BooleanArray::from(vec![Some(false), Some(true), None, Some(true)]); +/// // Create from Vec +/// let arr = BooleanArray::from(vec![false, true, true]); +/// // Create from iter/collect +/// let arr: BooleanArray = std::iter::repeat(Some(true)).take(10).collect(); +/// ``` +/// +/// Construction and Access /// /// ``` -/// use arrow_array::{Array, BooleanArray}; -/// let arr = BooleanArray::from(vec![Some(false), Some(true), None, Some(true)]); -/// assert_eq!(4, arr.len()); -/// assert_eq!(1, arr.null_count()); -/// assert!(arr.is_valid(0)); -/// assert!(!arr.is_null(0)); -/// assert_eq!(false, arr.value(0)); -/// assert!(arr.is_valid(1)); -/// assert!(!arr.is_null(1)); -/// assert_eq!(true, arr.value(1)); -/// assert!(!arr.is_valid(2)); -/// assert!(arr.is_null(2)); -/// assert!(arr.is_valid(3)); -/// assert!(!arr.is_null(3)); -/// assert_eq!(true, arr.value(3)); +/// use arrow_array::{Array, BooleanArray}; +/// let arr = BooleanArray::from(vec![Some(false), Some(true), None, Some(true)]); +/// assert_eq!(4, arr.len()); +/// assert_eq!(1, arr.null_count()); +/// assert!(arr.is_valid(0)); +/// assert!(!arr.is_null(0)); +/// assert_eq!(false, arr.value(0)); +/// assert!(!arr.is_valid(2)); +/// assert!(arr.is_null(2)); /// ``` /// -/// Using `from_iter` +/// Using `collect` /// ``` -/// use arrow_array::{Array, BooleanArray}; -/// let v = vec![Some(false), Some(true), Some(false), Some(true)]; -/// let arr = v.into_iter().collect::(); -/// assert_eq!(4, arr.len()); -/// assert_eq!(0, arr.offset()); -/// assert_eq!(0, arr.null_count()); -/// assert!(arr.is_valid(0)); -/// assert_eq!(false, arr.value(0)); -/// assert!(arr.is_valid(1)); -/// assert_eq!(true, arr.value(1)); -/// assert!(arr.is_valid(2)); -/// assert_eq!(false, arr.value(2)); -/// assert!(arr.is_valid(3)); -/// assert_eq!(true, arr.value(3)); +/// use arrow_array::{Array, BooleanArray}; +/// let v = vec![Some(false), Some(true), Some(false), Some(true)]; +/// let arr = v.into_iter().collect::(); +/// assert_eq!(4, arr.len()); +/// assert_eq!(0, arr.offset()); +/// assert_eq!(0, arr.null_count()); +/// assert!(arr.is_valid(0)); +/// assert_eq!(false, arr.value(0)); +/// assert!(arr.is_valid(1)); +/// assert_eq!(true, arr.value(1)); +/// assert!(arr.is_valid(2)); +/// assert_eq!(false, arr.value(2)); +/// assert!(arr.is_valid(3)); +/// assert_eq!(true, arr.value(3)); /// ``` #[derive(Clone)] pub struct BooleanArray { diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 3fa011f8e127..ce526a274fcf 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -36,77 +36,157 @@ use std::sync::Arc; /// An array of `i8` /// -/// # Example: Using `collect` +/// 
# Examples +/// +/// Construction +/// /// ``` /// # use arrow_array::Int8Array; -/// let arr : Int8Array = [Some(1), Some(2)].into_iter().collect(); +/// // Create from Vec> +/// let arr = Int8Array::from(vec![Some(1), None, Some(2)]); +/// // Create from Vec +/// let arr = Int8Array::from(vec![1, 2, 3]); +/// // Create iter/collect +/// let arr: Int8Array = std::iter::repeat(42).take(10).collect(); /// ``` pub type Int8Array = PrimitiveArray; /// An array of `i16` /// -/// # Example: Using `collect` +/// # Examples +/// +/// Construction +/// /// ``` /// # use arrow_array::Int16Array; -/// let arr : Int16Array = [Some(1), Some(2)].into_iter().collect(); +/// // Create from Vec> +/// let arr = Int16Array::from(vec![Some(1), None, Some(2)]); +/// // Create from Vec +/// let arr = Int16Array::from(vec![1, 2, 3]); +/// // Create iter/collect +/// let arr: Int16Array = std::iter::repeat(42).take(10).collect(); /// ``` pub type Int16Array = PrimitiveArray; /// An array of `i32` /// -/// # Example: Using `collect` +/// # Examples +/// +/// Construction +/// /// ``` /// # use arrow_array::Int32Array; -/// let arr : Int32Array = [Some(1), Some(2)].into_iter().collect(); +/// // Create from Vec> +/// let arr = Int32Array::from(vec![Some(1), None, Some(2)]); +/// // Create from Vec +/// let arr = Int32Array::from(vec![1, 2, 3]); +/// // Create iter/collect +/// let arr: Int32Array = std::iter::repeat(42).take(10).collect(); /// ``` pub type Int32Array = PrimitiveArray; /// An array of `i64` /// -/// # Example: Using `collect` +/// # Examples +/// +/// Construction +/// /// ``` /// # use arrow_array::Int64Array; -/// let arr : Int64Array = [Some(1), Some(2)].into_iter().collect(); +/// // Create from Vec> +/// let arr = Int64Array::from(vec![Some(1), None, Some(2)]); +/// // Create from Vec +/// let arr = Int64Array::from(vec![1, 2, 3]); +/// // Create iter/collect +/// let arr: Int64Array = std::iter::repeat(42).take(10).collect(); /// ``` pub type Int64Array = PrimitiveArray; /// An array of `u8` -/// # Example: Using `collect` +/// +/// # Examples +/// +/// Construction +/// /// ``` /// # use arrow_array::UInt8Array; -/// let arr : UInt8Array = [Some(1), Some(2)].into_iter().collect(); +/// // Create from Vec> +/// let arr = UInt8Array::from(vec![Some(1), None, Some(2)]); +/// // Create from Vec +/// let arr = UInt8Array::from(vec![1, 2, 3]); +/// // Create iter/collect +/// let arr: UInt8Array = std::iter::repeat(42).take(10).collect(); /// ``` pub type UInt8Array = PrimitiveArray; /// An array of `u16` /// -/// # Example: Using `collect` +/// # Examples +/// +/// Construction +/// /// ``` /// # use arrow_array::UInt16Array; -/// let arr : UInt16Array = [Some(1), Some(2)].into_iter().collect(); +/// // Create from Vec> +/// let arr = UInt16Array::from(vec![Some(1), None, Some(2)]); +/// // Create from Vec +/// let arr = UInt16Array::from(vec![1, 2, 3]); +/// // Create iter/collect +/// let arr: UInt16Array = std::iter::repeat(42).take(10).collect(); /// ``` pub type UInt16Array = PrimitiveArray; /// An array of `u32` /// -/// # Example: Using `collect` +/// # Examples +/// +/// Construction +/// /// ``` /// # use arrow_array::UInt32Array; -/// let arr : UInt32Array = [Some(1), Some(2)].into_iter().collect(); +/// // Create from Vec> +/// let arr = UInt32Array::from(vec![Some(1), None, Some(2)]); +/// // Create from Vec +/// let arr = UInt32Array::from(vec![1, 2, 3]); +/// // Create iter/collect +/// let arr: UInt32Array = std::iter::repeat(42).take(10).collect(); /// ``` pub type UInt32Array = 
PrimitiveArray; /// An array of `u64` /// -/// # Example: Using `collect` +/// # Examples +/// +/// Construction +/// /// ``` /// # use arrow_array::UInt64Array; -/// let arr : UInt64Array = [Some(1), Some(2)].into_iter().collect(); +/// // Create from Vec> +/// let arr = UInt64Array::from(vec![Some(1), None, Some(2)]); +/// // Create from Vec +/// let arr = UInt64Array::from(vec![1, 2, 3]); +/// // Create iter/collect +/// let arr: UInt64Array = std::iter::repeat(42).take(10).collect(); /// ``` pub type UInt64Array = PrimitiveArray; /// An array of `f16` /// +/// # Examples +/// +/// Construction +/// +/// ``` +/// # use arrow_array::Float16Array; +/// use half::f16; +/// // Create from Vec> +/// let arr = Float16Array::from(vec![Some(f16::from_f64(1.0)), Some(f16::from_f64(2.0))]); +/// // Create from Vec +/// let arr = Float16Array::from(vec![f16::from_f64(1.0), f16::from_f64(2.0), f16::from_f64(3.0)]); +/// // Create iter/collect +/// let arr: Float16Array = std::iter::repeat(f16::from_f64(1.0)).take(10).collect(); +/// ``` +/// /// # Example: Using `collect` /// ``` /// # use arrow_array::Float16Array; @@ -117,19 +197,35 @@ pub type Float16Array = PrimitiveArray; /// An array of `f32` /// -/// # Example: Using `collect` +/// # Examples +/// +/// Construction +/// /// ``` /// # use arrow_array::Float32Array; -/// let arr : Float32Array = [Some(1.0), Some(2.0)].into_iter().collect(); +/// // Create from Vec> +/// let arr = Float32Array::from(vec![Some(1.0), None, Some(2.0)]); +/// // Create from Vec +/// let arr = Float32Array::from(vec![1.0, 2.0, 3.0]); +/// // Create iter/collect +/// let arr: Float32Array = std::iter::repeat(42.0).take(10).collect(); /// ``` pub type Float32Array = PrimitiveArray; /// An array of `f64` /// -/// # Example: Using `collect` +/// # Examples +/// +/// Construction +/// /// ``` -/// # use arrow_array::Float64Array; -/// let arr : Float64Array = [Some(1.0), Some(2.0)].into_iter().collect(); +/// # use arrow_array::Float32Array; +/// // Create from Vec> +/// let arr = Float32Array::from(vec![Some(1.0), None, Some(2.0)]); +/// // Create from Vec +/// let arr = Float32Array::from(vec![1.0, 2.0, 3.0]); +/// // Create iter/collect +/// let arr: Float32Array = std::iter::repeat(42.0).take(10).collect(); /// ``` pub type Float64Array = PrimitiveArray; @@ -256,9 +352,38 @@ pub type DurationMicrosecondArray = PrimitiveArray; pub type DurationNanosecondArray = PrimitiveArray; /// An array of 128-bit fixed point decimals +/// +/// # Examples +/// +/// Construction +/// +/// ``` +/// # use arrow_array::Decimal128Array; +/// // Create from Vec> +/// let arr = Decimal128Array::from(vec![Some(1), None, Some(2)]); +/// // Create from Vec +/// let arr = Decimal128Array::from(vec![1, 2, 3]); +/// // Create iter/collect +/// let arr: Decimal128Array = std::iter::repeat(42).take(10).collect(); +/// ``` pub type Decimal128Array = PrimitiveArray; /// An array of 256-bit fixed point decimals +/// +/// # Examples +/// +/// Construction +/// +/// ``` +/// # use arrow_array::Decimal256Array; +/// use arrow_buffer::i256; +/// // Create from Vec> +/// let arr = Decimal256Array::from(vec![Some(i256::from(1)), None, Some(i256::from(2))]); +/// // Create from Vec +/// let arr = Decimal256Array::from(vec![i256::from(1), i256::from(2), i256::from(3)]); +/// // Create iter/collect +/// let arr: Decimal256Array = std::iter::repeat(i256::from(42)).take(10).collect(); +/// ``` pub type Decimal256Array = PrimitiveArray; pub use crate::types::ArrowPrimitiveType; diff --git 
a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index d8f1c5da16c7..8a1c0bd150d8 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -210,10 +210,24 @@ impl From> for GenericStringArray> +/// let arr = StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]); +/// // Create from Vec<&str> +/// let arr = StringArray::from(vec!["foo", "bar", "baz"]); +/// // Create from iter/collect (requires Option<&str>) +/// let arr: StringArray = std::iter::repeat(Some("foo")).take(10).collect(); +/// ``` +/// +/// Construction and Access /// /// ``` -/// use arrow_array::StringArray; +/// # use arrow_array::StringArray; /// let array = StringArray::from(vec![Some("foo"), None, Some("bar")]); /// assert_eq!(array.value(0), "foo"); /// ``` @@ -221,7 +235,21 @@ pub type StringArray = GenericStringArray; /// An array of `str` using `i64` offsets /// -/// Example +/// # Examples +/// +/// Construction +/// +/// ``` +/// # use arrow_array::LargeStringArray; +/// // Create from Vec> +/// let arr = LargeStringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]); +/// // Create from Vec<&str> +/// let arr = LargeStringArray::from(vec!["foo", "bar", "baz"]); +/// // Create from iter/collect (requires Option<&str>) +/// let arr: LargeStringArray = std::iter::repeat(Some("foo")).take(10).collect(); +/// ``` +/// +/// Constructon and Access /// /// ``` /// use arrow_array::LargeStringArray; diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs index b34dcdfa5c27..86150e67fd91 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint.rs @@ -97,6 +97,30 @@ impl FromStr for i256 { } } +impl From for i256 { + fn from(value: i8) -> Self { + Self::from_i128(value.into()) + } +} + +impl From for i256 { + fn from(value: i16) -> Self { + Self::from_i128(value.into()) + } +} + +impl From for i256 { + fn from(value: i32) -> Self { + Self::from_i128(value.into()) + } +} + +impl From for i256 { + fn from(value: i64) -> Self { + Self::from_i128(value.into()) + } +} + /// Parse `s` with any sign and leading 0s removed fn parse_impl(s: &str, negative: bool) -> Result { if s.len() <= 38 { From e121d5058a8f49caf51b22a9b5e93b788140715f Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Thu, 8 Jun 2023 15:56:07 +0300 Subject: [PATCH 0995/1411] feat: microsecond and millisecond (#4375) --- arrow-arith/src/temporal.rs | 71 +++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index f62e7e9a653a..0a313718c907 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -432,6 +432,41 @@ pub fn nanosecond_dyn(array: &dyn Array) -> Result { time_fraction_dyn(array, "nanosecond", |t| t.nanosecond() as i32) } +/// Extracts the microseconds of a given temporal primitive array as an array of integers +pub fn microsecond(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + time_fraction_internal(array, "microsecond", |t| (t.nanosecond() / 1_000) as i32) +} + +/// Extracts the microseconds of a given temporal primitive array as an array of integers. +/// If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. 
+pub fn microsecond_dyn(array: &dyn Array) -> Result { + time_fraction_dyn(array, "microsecond", |t| (t.nanosecond() / 1_000) as i32) +} + +/// Extracts the milliseconds of a given temporal primitive array as an array of integers +pub fn millisecond(array: &PrimitiveArray) -> Result +where + T: ArrowTemporalType + ArrowNumericType, + i64: From, +{ + time_fraction_internal(array, "millisecond", |t| { + (t.nanosecond() / 1_000_000) as i32 + }) +} +/// Extracts the milliseconds of a given temporal primitive array as an array of integers. +/// If the given array isn't temporal primitive or dictionary array, +/// an `Err` will be returned. +pub fn millisecond_dyn(array: &dyn Array) -> Result { + time_fraction_dyn(array, "millisecond", |t| { + (t.nanosecond() / 1_000_000) as i32 + }) +} + /// Extracts the time fraction of a given temporal array as an array of integers fn time_fraction_dyn( array: &dyn Array, @@ -1118,4 +1153,40 @@ mod tests { let expected = Arc::new(expected_dict) as ArrayRef; assert_eq!(&expected, &b); } + + #[test] + fn test_temporal_array_date64_microsecond() { + let a: PrimitiveArray = vec![None, Some(1667328721453)].into(); + + let b = microsecond(&a).unwrap(); + assert!(!b.is_valid(0)); + assert_eq!(453_000, b.value(1)); + + let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(1)]); + let dict = DictionaryArray::new(keys.clone(), Arc::new(a)); + let b = microsecond_dyn(&dict).unwrap(); + + let a = Int32Array::from(vec![None, Some(453_000)]); + let expected_dict = DictionaryArray::new(keys, Arc::new(a)); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); + } + + #[test] + fn test_temporal_array_date64_millisecond() { + let a: PrimitiveArray = vec![None, Some(1667328721453)].into(); + + let b = millisecond(&a).unwrap(); + assert!(!b.is_valid(0)); + assert_eq!(453, b.value(1)); + + let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(1)]); + let dict = DictionaryArray::new(keys.clone(), Arc::new(a)); + let b = millisecond_dyn(&dict).unwrap(); + + let a = Int32Array::from(vec![None, Some(453)]); + let expected_dict = DictionaryArray::new(keys, Arc::new(a)); + let expected = Arc::new(expected_dict) as ArrayRef; + assert_eq!(&expected, &b); + } } From ad218fa99f4f6f0bc8b58ee95ad9f389c5f23b21 Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Thu, 8 Jun 2023 15:51:10 +0200 Subject: [PATCH 0996/1411] feat(flight): harmonize server metadata APIs (#4384) * feat(flight): harmonize server metadata APIs * Apply suggestions from code review Co-authored-by: Andrew Lamb * chore: fmt --------- Co-authored-by: Andrew Lamb --- arrow-flight/examples/flight_sql_server.rs | 26 ++- arrow-flight/src/sql/metadata/mod.rs | 2 +- arrow-flight/src/sql/metadata/sql_info.rs | 249 ++++++++++++--------- 3 files changed, 165 insertions(+), 112 deletions(-) diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index e9dba08f0dcd..f717d9b621b2 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -31,7 +31,8 @@ use arrow_array::builder::StringBuilder; use arrow_array::{ArrayRef, RecordBatch}; use arrow_flight::encode::FlightDataEncoderBuilder; use arrow_flight::sql::metadata::{ - SqlInfoList, XdbcTypeInfo, XdbcTypeInfoData, XdbcTypeInfoDataBuilder, + SqlInfoData, SqlInfoDataBuilder, XdbcTypeInfo, XdbcTypeInfoData, + XdbcTypeInfoDataBuilder, }; use arrow_flight::sql::{ server::FlightSqlService, ActionBeginSavepointRequest, 
ActionBeginSavepointResult, @@ -66,13 +67,14 @@ const FAKE_TOKEN: &str = "uuid_token"; const FAKE_HANDLE: &str = "uuid_handle"; const FAKE_UPDATE_RESULT: i64 = 1; -static INSTANCE_SQL_INFO: Lazy = Lazy::new(|| { - SqlInfoList::new() - // Server information - .with_sql_info(SqlInfo::FlightSqlServerName, "Example Flight SQL Server") - .with_sql_info(SqlInfo::FlightSqlServerVersion, "1") - // 1.3 comes from https://github.com/apache/arrow/blob/f9324b79bf4fc1ec7e97b32e3cce16e75ef0f5e3/format/Schema.fbs#L24 - .with_sql_info(SqlInfo::FlightSqlServerArrowVersion, "1.3") +static INSTANCE_SQL_DATA: Lazy = Lazy::new(|| { + let mut builder = SqlInfoDataBuilder::new(); + // Server information + builder.append(SqlInfo::FlightSqlServerName, "Example Flight SQL Server"); + builder.append(SqlInfo::FlightSqlServerVersion, "1"); + // 1.3 comes from https://github.com/apache/arrow/blob/f9324b79bf4fc1ec7e97b32e3cce16e75ef0f5e3/format/Schema.fbs#L24 + builder.append(SqlInfo::FlightSqlServerArrowVersion, "1.3"); + builder.build().unwrap() }); static INSTANCE_XBDC_DATA: Lazy = Lazy::new(|| { @@ -345,7 +347,7 @@ impl FlightSqlService for FlightSqlServiceImpl { let endpoint = FlightEndpoint::new().with_ticket(ticket); let flight_info = FlightInfo::new() - .try_with_schema(SqlInfoList::schema()) + .try_with_schema(query.into_builder(&INSTANCE_SQL_DATA).schema().as_ref()) .map_err(|e| status!("Unable to encode schema", e))? .with_endpoint(endpoint) .with_descriptor(flight_descriptor); @@ -532,9 +534,11 @@ impl FlightSqlService for FlightSqlServiceImpl { query: CommandGetSqlInfo, _request: Request, ) -> Result::DoGetStream>, Status> { - let batch = INSTANCE_SQL_INFO.filter(&query.info).encode(); + let builder = query.into_builder(&INSTANCE_SQL_DATA); + let schema = builder.schema(); + let batch = builder.build(); let stream = FlightDataEncoderBuilder::new() - .with_schema(Arc::new(SqlInfoList::schema().clone())) + .with_schema(schema) .build(futures::stream::once(async { batch })) .map_err(Status::from); Ok(Response::new(Box::pin(stream))) diff --git a/arrow-flight/src/sql/metadata/mod.rs b/arrow-flight/src/sql/metadata/mod.rs index b823c1f4a643..72c882f385d3 100644 --- a/arrow-flight/src/sql/metadata/mod.rs +++ b/arrow-flight/src/sql/metadata/mod.rs @@ -34,7 +34,7 @@ mod xdbc_info; pub use catalogs::GetCatalogsBuilder; pub use db_schemas::GetDbSchemasBuilder; -pub use sql_info::SqlInfoList; +pub use sql_info::{SqlInfoData, SqlInfoDataBuilder}; pub use tables::GetTablesBuilder; pub use xdbc_info::{XdbcTypeInfo, XdbcTypeInfoData, XdbcTypeInfoDataBuilder}; diff --git a/arrow-flight/src/sql/metadata/sql_info.rs b/arrow-flight/src/sql/metadata/sql_info.rs index 4b4604078359..d0c9cedbcf7c 100644 --- a/arrow-flight/src/sql/metadata/sql_info.rs +++ b/arrow-flight/src/sql/metadata/sql_info.rs @@ -15,26 +15,34 @@ // specific language governing permissions and limitations // under the License. -//! [`SqlInfoList`] for building responses to [`CommandGetSqlInfo`] queries. +//! Helpers for building responses to [`CommandGetSqlInfo`] metadata requests. +//! +//! - [`SqlInfoDataBuilder`] - a builder for collecting sql infos +//! and building a conformant `RecordBatch` with sql info server metadata. +//! - [`SqlInfoData`] - a helper type wrapping a `RecordBatch` +//! used for storing sql info server metadata. +//! - [`GetSqlInfoBuilder`] - a builder for consructing [`CommandGetSqlInfo`] responses. //! -//! 
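// Illustrative sketch of the flow described above (not part of the upstream
// diff): collect infos with `SqlInfoDataBuilder`, build a reusable
// `SqlInfoData`, then answer a `CommandGetSqlInfo` request via `into_builder`.
use arrow_flight::sql::metadata::SqlInfoDataBuilder;
use arrow_flight::sql::{CommandGetSqlInfo, SqlInfo};

fn sql_info_example() -> Result<(), Box<dyn std::error::Error>> {
    // Typically built once and stored in a `static` on the server
    let mut builder = SqlInfoDataBuilder::new();
    builder.append(SqlInfo::FlightSqlServerName, "Example Flight SQL Server");
    builder.append(SqlInfo::FlightSqlServerVersion, "1");
    let info_data = builder.build()?;

    // Per request: an empty `info` list means "return all infos"
    let query = CommandGetSqlInfo { info: vec![] };
    let response = query.into_builder(&info_data);
    let schema = response.schema();
    let batch = response.build()?;
    assert_eq!(batch.schema(), schema);
    Ok(())
}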
[`CommandGetSqlInfo`]: crate::sql::CommandGetSqlInfo -use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; -use arrow_array::array::{Array, UnionArray}; +use arrow_arith::boolean::or; +use arrow_array::array::{Array, UInt32Array, UnionArray}; use arrow_array::builder::{ ArrayBuilder, BooleanBuilder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, MapBuilder, StringBuilder, UInt32Builder, }; +use arrow_array::cast::downcast_array; use arrow_array::RecordBatch; use arrow_data::ArrayData; -use arrow_schema::{DataType, Field, Fields, Schema, UnionFields, UnionMode}; +use arrow_ord::comparison::eq_scalar; +use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef, UnionFields, UnionMode}; +use arrow_select::filter::filter_record_batch; use once_cell::sync::Lazy; use crate::error::Result; -use crate::sql::SqlInfo; +use crate::sql::{CommandGetSqlInfo, SqlInfo}; /// Represents a dynamic value #[derive(Debug, Clone, PartialEq)] @@ -321,39 +329,15 @@ impl SqlInfoUnionBuilder { } } -/// A builder for [`CommandGetSqlInfo`] response. +/// Helper to create [`CommandGetSqlInfo`] responses. /// /// [`CommandGetSqlInfo`] are metadata requests used by a Flight SQL -/// server to communicate supported capabilities to Flight SQL -/// clients. -/// -/// Servers construct a [`SqlInfoList`] by adding infos via -/// [`with_sql_info`] and build the response using [`encode`]. -/// -/// The available configuration options are defined in the [Flight SQL protos][protos]. -/// -/// # Example -/// ``` -/// # use arrow_flight::sql::{metadata::SqlInfoList, SqlInfo, SqlSupportedTransaction}; -/// // Create the list of metadata describing the server -/// let info_list = SqlInfoList::new() -/// .with_sql_info(SqlInfo::FlightSqlServerName, "server name") -/// // ... add other SqlInfo here .. -/// .with_sql_info( -/// SqlInfo::FlightSqlServerTransaction, -/// SqlSupportedTransaction::Transaction as i32, -/// ); +/// server to communicate supported capabilities to Flight SQL clients. /// -/// // Create the batch to send back to the client -/// let batch = info_list.encode().unwrap(); -/// ``` -/// -/// [protos]: https://github.com/apache/arrow/blob/6d3d2fca2c9693231fa1e52c142ceef563fc23f9/format/FlightSql.proto#L71-L820 -/// [`CommandGetSqlInfo`]: crate::sql::CommandGetSqlInfo -/// [`with_sql_info`]: SqlInfoList::with_sql_info -/// [`encode`]: SqlInfoList::encode +/// Servers constuct - usually static - [`SqlInfoData`] via the [SqlInfoDataBuilder`], +/// and build responses by passing the [`GetSqlInfoBuilder`]. #[derive(Debug, Clone, PartialEq)] -pub struct SqlInfoList { +pub struct SqlInfoDataBuilder { /// Use BTreeMap to ensure the values are sorted by value as /// to make output consistent /// @@ -362,13 +346,13 @@ pub struct SqlInfoList { infos: BTreeMap, } -impl Default for SqlInfoList { +impl Default for SqlInfoDataBuilder { fn default() -> Self { Self::new() } } -impl SqlInfoList { +impl SqlInfoDataBuilder { pub fn new() -> Self { Self { infos: BTreeMap::new(), @@ -376,40 +360,23 @@ impl SqlInfoList { } /// register the specific sql metadata item - pub fn with_sql_info( - mut self, - name: impl SqlInfoName, - value: impl Into, - ) -> Self { + pub fn append(&mut self, name: impl SqlInfoName, value: impl Into) { self.infos.insert(name.as_u32(), value.into()); - self - } - - /// Filter this info list keeping only the info values specified - /// in `infos`. 
- /// - /// Returns self if infos is empty (no filtering) - pub fn filter(&self, info: &[u32]) -> Cow<'_, Self> { - if info.is_empty() { - Cow::Borrowed(self) - } else { - let infos: BTreeMap<_, _> = info - .iter() - .filter_map(|name| self.infos.get(name).map(|v| (*name, v.clone()))) - .collect(); - Cow::Owned(Self { infos }) - } } /// Encode the contents of this list according to the [FlightSQL spec] /// /// [FlightSQL spec]: (https://github.com/apache/arrow/blob/f9324b79bf4fc1ec7e97b32e3cce16e75ef0f5e3/format/FlightSql.proto#L32-L43 - pub fn encode(&self) -> Result { + pub fn build(self) -> Result { let mut name_builder = UInt32Builder::new(); let mut value_builder = SqlInfoUnionBuilder::new(); - for (&name, value) in self.infos.iter() { - name_builder.append_value(name); + let mut names: Vec<_> = self.infos.keys().cloned().collect(); + names.sort_unstable(); + + for key in names { + let (name, value) = self.infos.get_key_value(&key).unwrap(); + name_builder.append_value(*name); value_builder.append_value(value)? } @@ -417,7 +384,8 @@ impl SqlInfoList { ("info_name", Arc::new(name_builder.finish()) as _), ("value", Arc::new(value_builder.finish()) as _), ])?; - Ok(batch) + + Ok(SqlInfoData { batch }) } /// Return the [`Schema`] for a GetSchema RPC call with [`crate::sql::CommandGetSqlInfo`] @@ -427,7 +395,89 @@ impl SqlInfoList { } } -// The schema produced by [`SqlInfoList`] +/// A builder for [`SqlInfoData`] which is used to create [`CommandGetSqlInfo`] responses. +/// +/// # Example +/// ``` +/// # use arrow_flight::sql::{metadata::SqlInfoDataBuilder, SqlInfo, SqlSupportedTransaction}; +/// // Create the list of metadata describing the server +/// let mut builder = SqlInfoDataBuilder::new(); +/// builder.append(SqlInfo::FlightSqlServerName, "server name"); +/// // ... add other SqlInfo here .. +/// builder.append( +/// SqlInfo::FlightSqlServerTransaction, +/// SqlSupportedTransaction::Transaction as i32, +/// ); +/// +/// // Create the batch to send back to the client +/// let info_data = builder.build().unwrap(); +/// ``` +/// +/// [protos]: https://github.com/apache/arrow/blob/6d3d2fca2c9693231fa1e52c142ceef563fc23f9/format/FlightSql.proto#L71-L820 +pub struct SqlInfoData { + batch: RecordBatch, +} + +impl SqlInfoData { + /// Return a [`RecordBatch`] containing only the requested `u32`, if any + /// from [`CommandGetSqlInfo`] + pub fn record_batch( + &self, + info: impl IntoIterator, + ) -> Result { + let arr: UInt32Array = downcast_array(self.batch.column(0).as_ref()); + let type_filter = info + .into_iter() + .map(|tt| eq_scalar(&arr, tt)) + .collect::, _>>()? + .into_iter() + // We know the arrays are of same length as they are produced fromn the same root array + .reduce(|filter, arr| or(&filter, &arr).unwrap()); + if let Some(filter) = type_filter { + Ok(filter_record_batch(&self.batch, &filter)?) + } else { + Ok(self.batch.clone()) + } + } + + /// Return the schema of the RecordBatch that will be returned + /// from [`CommandGetSqlInfo`] + pub fn schema(&self) -> SchemaRef { + self.batch.schema() + } +} + +/// A builder for a [`CommandGetSqlInfo`] response. +pub struct GetSqlInfoBuilder<'a> { + /// requested `SqlInfo`s. If empty means return all infos. 
+ info: Vec, + infos: &'a SqlInfoData, +} + +impl CommandGetSqlInfo { + /// Create a builder suitable for constructing a response + pub fn into_builder(self, infos: &SqlInfoData) -> GetSqlInfoBuilder { + GetSqlInfoBuilder { + info: self.info, + infos, + } + } +} + +impl GetSqlInfoBuilder<'_> { + /// Builds a `RecordBatch` with the correct schema for a [`CommandGetSqlInfo`] response + pub fn build(self) -> Result { + self.infos.record_batch(self.info) + } + + /// Return the schema of the RecordBatch that will be returned + /// from [`CommandGetSqlInfo`] + pub fn schema(&self) -> SchemaRef { + self.infos.schema() + } +} + +// The schema produced by [`SqlInfoData`] static SQL_INFO_SCHEMA: Lazy = Lazy::new(|| { Schema::new(vec![ Field::new("info_name", DataType::UInt32, false), @@ -439,7 +489,7 @@ static SQL_INFO_SCHEMA: Lazy = Lazy::new(|| { mod tests { use std::collections::HashMap; - use super::SqlInfoList; + use super::SqlInfoDataBuilder; use crate::sql::metadata::tests::assert_batches_eq; use crate::sql::{ SqlInfo, SqlNullOrdering, SqlSupportedTransaction, SqlSupportsConvert, @@ -456,23 +506,23 @@ mod tests { ], ); - let batch = SqlInfoList::new() - // str - .with_sql_info(SqlInfo::SqlIdentifierQuoteChar, r#"""#) - // bool - .with_sql_info(SqlInfo::SqlDdlCatalog, false) - // i32 - .with_sql_info( - SqlInfo::SqlNullOrdering, - SqlNullOrdering::SqlNullsSortedHigh as i32, - ) - // i64 - .with_sql_info(SqlInfo::SqlMaxBinaryLiteralLength, i32::MAX as i64) - // [str] - .with_sql_info(SqlInfo::SqlKeywords, &["SELECT", "DELETE"] as &[&str]) - .with_sql_info(SqlInfo::SqlSupportsConvert, &convert) - .encode() - .unwrap(); + let mut builder = SqlInfoDataBuilder::new(); + // str + builder.append(SqlInfo::SqlIdentifierQuoteChar, r#"""#); + // bool + builder.append(SqlInfo::SqlDdlCatalog, false); + // i32 + builder.append( + SqlInfo::SqlNullOrdering, + SqlNullOrdering::SqlNullsSortedHigh as i32, + ); + // i64 + builder.append(SqlInfo::SqlMaxBinaryLiteralLength, i32::MAX as i64); + // [str] + builder.append(SqlInfo::SqlKeywords, &["SELECT", "DELETE"] as &[&str]); + builder.append(SqlInfo::SqlSupportsConvert, &convert); + + let batch = builder.build().unwrap().record_batch(None).unwrap(); let expected = vec![ "+-----------+----------------------------------------+", @@ -492,27 +542,26 @@ mod tests { #[test] fn test_filter_sql_infos() { - let info_list = SqlInfoList::new() - .with_sql_info(SqlInfo::FlightSqlServerName, "server name") - .with_sql_info( - SqlInfo::FlightSqlServerTransaction, - SqlSupportedTransaction::Transaction as i32, - ); - - let batch = info_list.encode().unwrap(); + let mut builder = SqlInfoDataBuilder::new(); + builder.append(SqlInfo::FlightSqlServerName, "server name"); + builder.append( + SqlInfo::FlightSqlServerTransaction, + SqlSupportedTransaction::Transaction as i32, + ); + let data = builder.build().unwrap(); + + let batch = data.record_batch(None).unwrap(); assert_eq!(batch.num_rows(), 2); - let batch = info_list - .filter(&[SqlInfo::FlightSqlServerTransaction as u32]) - .encode() - .unwrap(); - let ref_batch = SqlInfoList::new() - .with_sql_info( - SqlInfo::FlightSqlServerTransaction, - SqlSupportedTransaction::Transaction as i32, - ) - .encode() + let batch = data + .record_batch([SqlInfo::FlightSqlServerTransaction as u32]) .unwrap(); + let mut ref_builder = SqlInfoDataBuilder::new(); + ref_builder.append( + SqlInfo::FlightSqlServerTransaction, + SqlSupportedTransaction::Transaction as i32, + ); + let ref_batch = ref_builder.build().unwrap().record_batch(None).unwrap(); 
assert_eq!(batch, ref_batch); } From fac00bf5b03448db224fadb3965fb422a4394a4e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 8 Jun 2023 16:23:38 +0100 Subject: [PATCH 0997/1411] Add MapArray constructors and doc example (#4382) * Add MapArray constructors * Clippy * Review feedback * Link to builder (#4385) * Clippy * Further docs tweaks --- arrow-array/src/array/list_array.rs | 3 +- arrow-array/src/array/map_array.rs | 192 +++++++++++++++++++++++++++- 2 files changed, 191 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index d016afccbfe5..abb5ba5e3c0b 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -94,6 +94,7 @@ impl GenericListArray { /// * `offsets.len() - 1 != nulls.len()` /// * `offsets.last() > values.len()` /// * `!field.is_nullable() && values.null_count() != 0` + /// * `field.data_type() != values.data_type()` pub fn try_new( field: FieldRef, offsets: OffsetBuffer, @@ -103,7 +104,7 @@ impl GenericListArray { let len = offsets.len() - 1; // Offsets guaranteed to not be empty let end_offset = offsets.last().unwrap().as_usize(); // don't need to check other values of `offsets` because they are checked - // during construction of `OffsetsbBuffer` + // during construction of `OffsetBuffer` if end_offset > values.len() { return Err(ArrowError::InvalidArgumentError(format!( "Max offset of {end_offset} exceeds length of values {}", diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index c98bca9505c3..fca49cd7836f 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -22,7 +22,7 @@ use crate::{ }; use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{ArrowError, DataType, Field}; +use arrow_schema::{ArrowError, DataType, Field, FieldRef}; use std::any::Any; use std::sync::Arc; @@ -30,8 +30,10 @@ use std::sync::Arc; /// /// Keys should always be non-null, but values can be null. /// -/// [MapArray] is physically a [crate::array::ListArray] that has a -/// [StructArray] with 2 child fields. +/// [`MapArray`] is physically a [`ListArray`] of key values pairs stored as an `entries` +/// [`StructArray`] with 2 child fields. 
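// Illustrative sketch of the `MapArray::try_new` constructor documented below
// (not part of the upstream diff); it reuses the shapes from the added tests:
// three map entries with offsets [0, 1, 4, 5] into five key/value pairs.
use std::sync::Arc;
use arrow_array::{Array, Int32Array, MapArray, StructArray};
use arrow_buffer::OffsetBuffer;
use arrow_schema::{DataType, Field, Fields};

fn map_array_example() {
    let fields = Fields::from(vec![
        Field::new("key", DataType::Int32, false),
        Field::new("values", DataType::Int32, false),
    ]);
    let columns = vec![
        Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as _,
        Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as _,
    ];
    let entries = StructArray::new(fields.clone(), columns, None);
    let field = Arc::new(Field::new("entries", DataType::Struct(fields), false));
    let offsets = OffsetBuffer::new(vec![0, 1, 4, 5].into());

    // `try_new` validates offsets, nullability, entry types and column count
    let map = MapArray::try_new(field, offsets, entries, None, false).unwrap();
    assert_eq!(map.len(), 3);
}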
+/// +/// See [`MapBuilder`](crate::builder::MapBuilder) for how to construct a [`MapArray`] #[derive(Clone)] pub struct MapArray { data_type: DataType, @@ -43,6 +45,112 @@ pub struct MapArray { } impl MapArray { + /// Create a new [`MapArray`] from the provided parts + /// + /// See [`MapBuilder`](crate::builder::MapBuilder) for a higher-level interface + /// to construct a [`MapArray`] + /// + /// # Errors + /// + /// Errors if + /// + /// * `offsets.len() - 1 != nulls.len()` + /// * `offsets.last() > entries.len()` + /// * `field.is_nullable()` + /// * `entries.null_count() != 0` + /// * `entries.columns().len() != 2` + /// * `field.data_type() != entries.data_type()` + pub fn try_new( + field: FieldRef, + offsets: OffsetBuffer, + entries: StructArray, + nulls: Option, + ordered: bool, + ) -> Result { + let len = offsets.len() - 1; // Offsets guaranteed to not be empty + let end_offset = offsets.last().unwrap().as_usize(); + // don't need to check other values of `offsets` because they are checked + // during construction of `OffsetBuffer` + if end_offset > entries.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Max offset of {end_offset} exceeds length of entries {}", + entries.len() + ))); + } + + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for MapArray, expected {len} got {}", + n.len(), + ))); + } + } + if field.is_nullable() || entries.null_count() != 0 { + return Err(ArrowError::InvalidArgumentError( + "MapArray entries cannot contain nulls".to_string(), + )); + } + + if field.data_type() != entries.data_type() { + return Err(ArrowError::InvalidArgumentError(format!( + "MapArray expected data type {} got {} for {:?}", + field.data_type(), + entries.data_type(), + field.name() + ))); + } + + if entries.columns().len() != 2 { + return Err(ArrowError::InvalidArgumentError(format!( + "MapArray entries must contain two children, got {}", + entries.columns().len() + ))); + } + + Ok(Self { + data_type: DataType::Map(field, ordered), + nulls, + entries, + value_offsets: offsets, + }) + } + + /// Create a new [`MapArray`] from the provided parts + /// + /// See [`MapBuilder`](crate::builder::MapBuilder) for a higher-level interface + /// to construct a [`MapArray`] + /// + /// # Panics + /// + /// Panics if [`Self::try_new`] returns an error + pub fn new( + field: FieldRef, + offsets: OffsetBuffer, + entries: StructArray, + nulls: Option, + ordered: bool, + ) -> Self { + Self::try_new(field, offsets, entries, nulls, ordered).unwrap() + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts( + self, + ) -> ( + FieldRef, + OffsetBuffer, + StructArray, + Option, + bool, + ) { + let (f, ordered) = match self.data_type { + DataType::Map(f, ordered) => (f, ordered), + _ => unreachable!(), + }; + (f, self.value_offsets, self.entries, self.nulls, ordered) + } + /// Returns a reference to the offsets of this map /// /// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`] @@ -623,4 +731,82 @@ mod tests { assert!(!map_array.is_null(i)); } } + + #[test] + fn test_try_new() { + let offsets = OffsetBuffer::new(vec![0, 1, 4, 5].into()); + let fields = Fields::from(vec![ + Field::new("key", DataType::Int32, false), + Field::new("values", DataType::Int32, false), + ]); + let columns = vec![ + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as _, + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as _, + ]; + + let entries = 
StructArray::new(fields.clone(), columns, None); + let field = Arc::new(Field::new("entries", DataType::Struct(fields), false)); + + MapArray::new(field.clone(), offsets.clone(), entries.clone(), None, false); + + let nulls = NullBuffer::new_null(3); + MapArray::new(field.clone(), offsets, entries.clone(), Some(nulls), false); + + let nulls = NullBuffer::new_null(3); + let offsets = OffsetBuffer::new(vec![0, 1, 2, 4, 5].into()); + let err = MapArray::try_new( + field.clone(), + offsets.clone(), + entries.clone(), + Some(nulls), + false, + ) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Incorrect length of null buffer for MapArray, expected 4 got 3" + ); + + let err = + MapArray::try_new(field, offsets.clone(), entries.slice(0, 2), None, false) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Max offset of 5 exceeds length of entries 2" + ); + + let field = Arc::new(Field::new("element", DataType::Int64, false)); + let err = MapArray::try_new(field, offsets.clone(), entries, None, false) + .unwrap_err() + .to_string(); + + assert!( + err.starts_with( + "Invalid argument error: MapArray expected data type Int64 got Struct" + ), + "{err}" + ); + + let fields = Fields::from(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + Field::new("c", DataType::Int32, false), + ]); + let columns = vec![ + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as _, + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as _, + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as _, + ]; + + let s = StructArray::new(fields.clone(), columns, None); + let field = Arc::new(Field::new("entries", DataType::Struct(fields), false)); + let err = MapArray::try_new(field, offsets, s, None, false).unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: MapArray entries must contain two children, got 3" + ); + } } From b780a0bb22048d0e28b684ed8b3cff1bc789d394 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 8 Jun 2023 16:24:24 +0100 Subject: [PATCH 0998/1411] Update proc-macro2 requirement from =1.0.59 to =1.0.60 (#4388) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.59...1.0.60) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 5f2f756b6237..d3a9c4e42de0 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.59", default-features = false } +proc-macro2 = { version = "=1.0.60", default-features = false } prost-build = { version = "=0.11.9", default-features = false } tonic-build = { version = "=0.9.2", default-features = false, features = ["transport", "prost"] } From 7fa82d68118aee447ad5aa23511e6fc06706951d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 8 Jun 2023 19:06:55 +0100 Subject: [PATCH 0999/1411] Add NullBuffer and BooleanBuffer From conversions (#4380) * Add NullBuffer and BooleanBuffer From conversions * Review feedback --- arrow-array/src/array/primitive_array.rs | 20 +++++++++++++++-- arrow-buffer/src/buffer/boolean.rs | 28 ++++++++++++++++++++++-- arrow-buffer/src/buffer/null.rs | 24 ++++++++++++++++++++ 3 files changed, 68 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index ce526a274fcf..7220aca8f44b 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -419,16 +419,32 @@ impl Clone for PrimitiveArray { } impl PrimitiveArray { - /// Create a new [`PrimitiveArray`] from the provided data_type, values, nulls + /// Create a new [`PrimitiveArray`] from the provided values and nulls /// /// # Panics /// /// Panics if [`Self::try_new`] returns an error + /// + /// # Example + /// + /// Creating a [`PrimitiveArray`] directly from a [`ScalarBuffer`] and [`NullBuffer`] using + /// this constructor is the most performant approach, avoiding any additional allocations + /// + /// ``` + /// # use arrow_array::Int32Array; + /// # use arrow_array::types::Int32Type; + /// # use arrow_buffer::NullBuffer; + /// // [1, 2, 3, 4] + /// let array = Int32Array::new(vec![1, 2, 3, 4].into(), None); + /// // [1, null, 3, 4] + /// let nulls = NullBuffer::from(vec![true, false, true, true]); + /// let array = Int32Array::new(vec![1, 2, 3, 4].into(), Some(nulls)); + /// ``` pub fn new(values: ScalarBuffer, nulls: Option) -> Self { Self::try_new(values, nulls).unwrap() } - /// Create a new [`PrimitiveArray`] from the provided data_type, values, nulls + /// Create a new [`PrimitiveArray`] from the provided values and nulls /// /// # Errors /// diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index ffee13bd4956..9098926c5a60 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -18,8 +18,8 @@ use crate::bit_chunk_iterator::BitChunks; use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator}; use crate::{ - bit_util, buffer_bin_and, buffer_bin_or, buffer_bin_xor, buffer_unary_not, Buffer, - MutableBuffer, + bit_util, buffer_bin_and, buffer_bin_or, buffer_bin_xor, buffer_unary_not, + BooleanBufferBuilder, Buffer, MutableBuffer, }; use std::ops::{BitAnd, BitOr, BitXor, Not}; @@ -265,6 +265,30 @@ impl<'a> IntoIterator for &'a BooleanBuffer { } } 
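// Illustrative sketch of the conversions introduced below (not part of the
// upstream diff): `BooleanBuffer` and `NullBuffer` can be built directly from
// bool vectors, slices and iterators.
use arrow_buffer::{BooleanBuffer, NullBuffer};

fn buffer_from_bools_example() {
    let bools = BooleanBuffer::from(vec![true, false, true, true]);
    assert_eq!(bools.len(), 4);

    // FromIterator<bool> is also provided
    let collected: BooleanBuffer = (0..4).map(|i| i % 2 == 0).collect();
    assert_eq!(collected.count_set_bits(), 2);

    // For a NullBuffer, `false` marks a null slot
    let nulls = NullBuffer::from(vec![true, false, true, true]);
    assert_eq!(nulls.null_count(), 1);
}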
+impl From<&[bool]> for BooleanBuffer { + fn from(value: &[bool]) -> Self { + let mut builder = BooleanBufferBuilder::new(value.len()); + builder.append_slice(value); + builder.finish() + } +} + +impl From> for BooleanBuffer { + fn from(value: Vec) -> Self { + value.as_slice().into() + } +} + +impl FromIterator for BooleanBuffer { + fn from_iter>(iter: T) -> Self { + let iter = iter.into_iter(); + let (hint, _) = iter.size_hint(); + let mut builder = BooleanBufferBuilder::new(hint); + iter.for_each(|b| builder.append(b)); + builder.finish() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index 008d1f04fe85..260f5d78de33 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -218,6 +218,30 @@ impl<'a> IntoIterator for &'a NullBuffer { } } +impl From for NullBuffer { + fn from(value: BooleanBuffer) -> Self { + Self::new(value) + } +} + +impl From<&[bool]> for NullBuffer { + fn from(value: &[bool]) -> Self { + BooleanBuffer::from(value).into() + } +} + +impl From> for NullBuffer { + fn from(value: Vec) -> Self { + BooleanBuffer::from(value).into() + } +} + +impl FromIterator for NullBuffer { + fn from_iter>(iter: T) -> Self { + BooleanBuffer::from_iter(iter).into() + } +} + #[cfg(test)] mod tests { use super::*; From ab56693985826bb8caea30558b8c25db286a5e37 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 8 Jun 2023 19:07:50 +0100 Subject: [PATCH 1000/1411] Move record delimiting into ColumnReader (#4365) (#4376) * Move record delimiting into ColumnReader (#4365) * Misc tweaks * More tests * Clippy * Review feedback --- .../array_reader/fixed_len_byte_array.rs | 4 +- parquet/src/arrow/arrow_reader/mod.rs | 182 +++++++- parquet/src/arrow/buffer/dictionary_buffer.rs | 30 +- parquet/src/arrow/buffer/offset_buffer.rs | 29 +- parquet/src/arrow/record_reader/buffer.rs | 38 +- .../arrow/record_reader/definition_levels.rs | 88 +--- parquet/src/arrow/record_reader/mod.rs | 209 ++------- parquet/src/column/mod.rs | 11 +- parquet/src/column/reader.rs | 240 +++++----- parquet/src/column/reader/decoder.rs | 439 ++++++++++-------- parquet/src/column/writer/mod.rs | 6 +- parquet/src/file/writer.rs | 4 +- parquet/src/record/triplet.rs | 11 +- 13 files changed, 649 insertions(+), 642 deletions(-) diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index 47bd03a735e1..b06091b6b57a 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -247,8 +247,8 @@ impl BufferQueue for FixedLenByteArrayBuffer { type Output = Buffer; type Slice = Self; - fn split_off(&mut self, len: usize) -> Self::Output { - self.buffer.split_off(len * self.byte_length) + fn consume(&mut self) -> Self::Output { + self.buffer.consume() } fn spare_capacity_mut(&mut self, _batch_size: usize) -> &mut Self::Slice { diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 432b003990e5..988738dac6ac 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -548,12 +548,14 @@ mod tests { use tempfile::tempfile; use arrow_array::builder::*; + use arrow_array::cast::AsArray; use arrow_array::types::{Decimal128Type, Decimal256Type, DecimalType}; use arrow_array::*; use arrow_array::{RecordBatch, RecordBatchReader}; use arrow_buffer::{i256, ArrowNativeType, 
Buffer}; use arrow_data::ArrayDataBuilder; use arrow_schema::{DataType as ArrowDataType, Field, Fields, Schema}; + use arrow_select::concat::concat_batches; use crate::arrow::arrow_reader::{ ArrowPredicateFn, ArrowReaderOptions, ParquetRecordBatchReader, @@ -562,6 +564,7 @@ mod tests { use crate::arrow::schema::add_encoded_arrow_schema_to_metadata; use crate::arrow::{ArrowWriter, ProjectionMask}; use crate::basic::{ConvertedType, Encoding, Repetition, Type as PhysicalType}; + use crate::column::reader::decoder::REPETITION_LEVELS_BATCH_SIZE; use crate::data_type::{ BoolType, ByteArray, ByteArrayType, DataType, FixedLenByteArray, FixedLenByteArrayType, Int32Type, Int64Type, Int96Type, @@ -2131,15 +2134,15 @@ mod tests { #[test] fn test_row_group_exact_multiple() { - use crate::arrow::record_reader::MIN_BATCH_SIZE; + const BATCH_SIZE: usize = REPETITION_LEVELS_BATCH_SIZE; test_row_group_batch(8, 8); test_row_group_batch(10, 8); test_row_group_batch(8, 10); - test_row_group_batch(MIN_BATCH_SIZE, MIN_BATCH_SIZE); - test_row_group_batch(MIN_BATCH_SIZE + 1, MIN_BATCH_SIZE); - test_row_group_batch(MIN_BATCH_SIZE, MIN_BATCH_SIZE + 1); - test_row_group_batch(MIN_BATCH_SIZE, MIN_BATCH_SIZE - 1); - test_row_group_batch(MIN_BATCH_SIZE - 1, MIN_BATCH_SIZE); + test_row_group_batch(BATCH_SIZE, BATCH_SIZE); + test_row_group_batch(BATCH_SIZE + 1, BATCH_SIZE); + test_row_group_batch(BATCH_SIZE, BATCH_SIZE + 1); + test_row_group_batch(BATCH_SIZE, BATCH_SIZE - 1); + test_row_group_batch(BATCH_SIZE - 1, BATCH_SIZE); } /// Given a RecordBatch containing all the column data, return the expected batches given @@ -2610,4 +2613,171 @@ mod tests { test_decimal_roundtrip::(); test_decimal_roundtrip::(); } + + #[test] + fn test_list_selection() { + let schema = Arc::new(Schema::new(vec![Field::new_list( + "list", + Field::new("item", ArrowDataType::Utf8, true), + false, + )])); + let mut buf = Vec::with_capacity(1024); + + let mut writer = ArrowWriter::try_new(&mut buf, schema.clone(), None).unwrap(); + + for i in 0..2 { + let mut list_a_builder = ListBuilder::new(StringBuilder::new()); + for j in 0..1024 { + list_a_builder.values().append_value(format!("{i} {j}")); + list_a_builder.append(true); + } + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(list_a_builder.finish())], + ) + .unwrap(); + writer.write(&batch).unwrap(); + } + let _metadata = writer.close().unwrap(); + + let buf = Bytes::from(buf); + let reader = ParquetRecordBatchReaderBuilder::try_new(buf) + .unwrap() + .with_row_selection(RowSelection::from(vec![ + RowSelector::skip(100), + RowSelector::select(924), + RowSelector::skip(100), + RowSelector::select(924), + ])) + .build() + .unwrap(); + + let batches = reader.collect::, _>>().unwrap(); + let batch = concat_batches(&schema, &batches).unwrap(); + + assert_eq!(batch.num_rows(), 924 * 2); + let list = batch.column(0).as_list::(); + + for w in list.value_offsets().windows(2) { + assert_eq!(w[0] + 1, w[1]) + } + let mut values = list.values().as_string::().iter(); + + for i in 0..2 { + for j in 100..1024 { + let expected = format!("{i} {j}"); + assert_eq!(values.next().unwrap().unwrap(), &expected); + } + } + } + + #[test] + fn test_list_selection_fuzz() { + let mut rng = thread_rng(); + let schema = Arc::new(Schema::new(vec![Field::new_list( + "list", + Field::new_list("item", Field::new("item", ArrowDataType::Int32, true), true), + true, + )])); + let mut buf = Vec::with_capacity(1024); + let mut writer = ArrowWriter::try_new(&mut buf, schema.clone(), None).unwrap(); + + let mut 
list_a_builder = ListBuilder::new(ListBuilder::new(Int32Builder::new())); + + for _ in 0..2048 { + if rng.gen_bool(0.2) { + list_a_builder.append(false); + continue; + } + + let list_a_len = rng.gen_range(0..10); + let list_b_builder = list_a_builder.values(); + + for _ in 0..list_a_len { + if rng.gen_bool(0.2) { + list_b_builder.append(false); + continue; + } + + let list_b_len = rng.gen_range(0..10); + let int_builder = list_b_builder.values(); + for _ in 0..list_b_len { + match rng.gen_bool(0.2) { + true => int_builder.append_null(), + false => int_builder.append_value(rng.gen()), + } + } + list_b_builder.append(true) + } + list_a_builder.append(true); + } + + let array = Arc::new(list_a_builder.finish()); + let batch = RecordBatch::try_new(schema, vec![array]).unwrap(); + + writer.write(&batch).unwrap(); + let _metadata = writer.close().unwrap(); + + let buf = Bytes::from(buf); + + let cases = [ + vec![ + RowSelector::skip(100), + RowSelector::select(924), + RowSelector::skip(100), + RowSelector::select(924), + ], + vec![ + RowSelector::select(924), + RowSelector::skip(100), + RowSelector::select(924), + RowSelector::skip(100), + ], + vec![ + RowSelector::skip(1023), + RowSelector::select(1), + RowSelector::skip(1023), + RowSelector::select(1), + ], + vec![ + RowSelector::select(1), + RowSelector::skip(1023), + RowSelector::select(1), + RowSelector::skip(1023), + ], + ]; + + for batch_size in [100, 1024, 2048] { + for selection in &cases { + let selection = RowSelection::from(selection.clone()); + let reader = ParquetRecordBatchReaderBuilder::try_new(buf.clone()) + .unwrap() + .with_row_selection(selection.clone()) + .with_batch_size(batch_size) + .build() + .unwrap(); + + let batches = reader.collect::, _>>().unwrap(); + let actual = concat_batches(&batch.schema(), &batches).unwrap(); + assert_eq!(actual.num_rows(), selection.row_count()); + + let mut batch_offset = 0; + let mut actual_offset = 0; + for selector in selection.iter() { + if selector.skip { + batch_offset += selector.row_count; + continue; + } + + assert_eq!( + batch.slice(batch_offset, selector.row_count), + actual.slice(actual_offset, selector.row_count) + ); + + batch_offset += selector.row_count; + actual_offset += selector.row_count; + } + } + } + } } diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs index 529c28872642..6344d9dd3145 100644 --- a/parquet/src/arrow/buffer/dictionary_buffer.rs +++ b/parquet/src/arrow/buffer/dictionary_buffer.rs @@ -227,14 +227,14 @@ impl BufferQueue type Output = Self; type Slice = Self; - fn split_off(&mut self, len: usize) -> Self::Output { + fn consume(&mut self) -> Self::Output { match self { Self::Dict { keys, values } => Self::Dict { - keys: keys.take(len), + keys: std::mem::take(keys), values: values.clone(), }, Self::Values { values } => Self::Values { - values: values.split_off(len), + values: values.consume(), }, } } @@ -275,20 +275,6 @@ mod tests { let valid_buffer = Buffer::from_iter(valid.iter().cloned()); buffer.pad_nulls(0, values.len(), valid.len(), valid_buffer.as_slice()); - // Split off some data - - let split = buffer.split_off(4); - let null_buffer = Buffer::from_iter(valid.drain(0..4)); - let array = split.into_array(Some(null_buffer), &dict_type).unwrap(); - assert_eq!(array.data_type(), &dict_type); - - let strings = cast(&array, &ArrowType::Utf8).unwrap(); - let strings = strings.as_any().downcast_ref::().unwrap(); - assert_eq!( - strings.iter().collect::>(), - vec![None, None, Some("world"), 
Some("hello")] - ); - // Read some data not preserving the dictionary let values = buffer.spill_values().unwrap(); @@ -300,8 +286,8 @@ mod tests { let null_buffer = Buffer::from_iter(valid.iter().cloned()); buffer.pad_nulls(read_offset, 2, 5, null_buffer.as_slice()); - assert_eq!(buffer.len(), 9); - let split = buffer.split_off(9); + assert_eq!(buffer.len(), 13); + let split = buffer.consume(); let array = split.into_array(Some(null_buffer), &dict_type).unwrap(); assert_eq!(array.data_type(), &dict_type); @@ -311,6 +297,10 @@ mod tests { assert_eq!( strings.iter().collect::>(), vec![ + None, + None, + Some("world"), + Some("hello"), None, Some("a"), Some(""), @@ -332,7 +322,7 @@ mod tests { .unwrap() .extend_from_slice(&[0, 1, 0, 1]); - let array = buffer.split_off(4).into_array(None, &dict_type).unwrap(); + let array = buffer.consume().into_array(None, &dict_type).unwrap(); assert_eq!(array.data_type(), &dict_type); let strings = cast(&array, &ArrowType::Utf8).unwrap(); diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs index df96996e3cbc..c8732bc4ed13 100644 --- a/parquet/src/arrow/buffer/offset_buffer.rs +++ b/parquet/src/arrow/buffer/offset_buffer.rs @@ -151,25 +151,8 @@ impl BufferQueue for OffsetBuffer { type Output = Self; type Slice = Self; - fn split_off(&mut self, len: usize) -> Self::Output { - assert!(self.offsets.len() > len, "{} > {}", self.offsets.len(), len); - let remaining_offsets = self.offsets.len() - len - 1; - let offsets = self.offsets.as_slice(); - - let end_offset = offsets[len]; - - let mut new_offsets = ScalarBuffer::new(); - new_offsets.reserve(remaining_offsets + 1); - for v in &offsets[len..] { - new_offsets.push(*v - end_offset) - } - - self.offsets.resize(len + 1); - - Self { - offsets: std::mem::replace(&mut self.offsets, new_offsets), - values: self.values.take(end_offset.as_usize()), - } + fn consume(&mut self) -> Self::Output { + std::mem::take(self) } fn spare_capacity_mut(&mut self, _batch_size: usize) -> &mut Self::Slice { @@ -267,18 +250,18 @@ mod tests { } #[test] - fn test_offset_buffer_split() { + fn test_offset_buffer() { let mut buffer = OffsetBuffer::::default(); for v in ["hello", "world", "cupcakes", "a", "b", "c"] { buffer.try_push(v.as_bytes(), false).unwrap() } - let split = buffer.split_off(3); + let split = buffer.consume(); let array = split.into_array(None, ArrowType::Utf8); let strings = array.as_any().downcast_ref::().unwrap(); assert_eq!( strings.iter().map(|x| x.unwrap()).collect::>(), - vec!["hello", "world", "cupcakes"] + vec!["hello", "world", "cupcakes", "a", "b", "c"] ); buffer.try_push("test".as_bytes(), false).unwrap(); @@ -286,7 +269,7 @@ mod tests { let strings = array.as_any().downcast_ref::().unwrap(); assert_eq!( strings.iter().map(|x| x.unwrap()).collect::>(), - vec!["a", "b", "c", "test"] + vec!["test"] ); } diff --git a/parquet/src/arrow/record_reader/buffer.rs b/parquet/src/arrow/record_reader/buffer.rs index 404989493883..4a0fc2a2f2eb 100644 --- a/parquet/src/arrow/record_reader/buffer.rs +++ b/parquet/src/arrow/record_reader/buffer.rs @@ -30,13 +30,8 @@ pub trait BufferQueue: Sized { type Slice: ?Sized; - /// Split out the first `len` items - /// - /// # Panics - /// - /// Implementations must panic if `len` is beyond the length of [`BufferQueue`] - /// - fn split_off(&mut self, len: usize) -> Self::Output; + /// Consumes the contents of this [`BufferQueue`] + fn consume(&mut self) -> Self::Output; /// Returns a [`Self::Slice`] with at least `batch_size` capacity 
that can be used /// to append data to the end of this [`BufferQueue`] @@ -146,31 +141,6 @@ impl ScalarBuffer { assert!(prefix.is_empty() && suffix.is_empty()); buf } - - pub fn take(&mut self, len: usize) -> Self { - assert!(len <= self.len); - - let num_bytes = len * std::mem::size_of::(); - let remaining_bytes = self.buffer.len() - num_bytes; - // TODO: Optimize to reduce the copy - // create an empty buffer, as it will be resized below - let mut remaining = MutableBuffer::new(0); - remaining.resize(remaining_bytes, 0); - - let new_records = remaining.as_slice_mut(); - - new_records[0..remaining_bytes] - .copy_from_slice(&self.buffer.as_slice()[num_bytes..]); - - self.buffer.resize(num_bytes, 0); - self.len -= len; - - Self { - buffer: std::mem::replace(&mut self.buffer, remaining), - len, - _phantom: Default::default(), - } - } } impl ScalarBuffer { @@ -196,8 +166,8 @@ impl BufferQueue for ScalarBuffer { type Slice = [T]; - fn split_off(&mut self, len: usize) -> Self::Output { - self.take(len).into() + fn consume(&mut self) -> Self::Output { + std::mem::take(self).into() } fn spare_capacity_mut(&mut self, batch_size: usize) -> &mut Self::Slice { diff --git a/parquet/src/arrow/record_reader/definition_levels.rs b/parquet/src/arrow/record_reader/definition_levels.rs index 272716caf664..5be0ac84dea2 100644 --- a/parquet/src/arrow/record_reader/definition_levels.rs +++ b/parquet/src/arrow/record_reader/definition_levels.rs @@ -22,16 +22,16 @@ use arrow_buffer::bit_chunk_iterator::UnalignedBitChunk; use arrow_buffer::Buffer; use crate::arrow::buffer::bit_util::count_set_bits; -use crate::arrow::record_reader::buffer::BufferQueue; use crate::basic::Encoding; use crate::column::reader::decoder::{ - ColumnLevelDecoder, ColumnLevelDecoderImpl, DefinitionLevelDecoder, LevelsBufferSlice, + ColumnLevelDecoder, DefinitionLevelDecoder, DefinitionLevelDecoderImpl, + LevelsBufferSlice, }; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use crate::util::memory::ByteBufferPtr; -use super::{buffer::ScalarBuffer, MIN_BATCH_SIZE}; +use super::buffer::ScalarBuffer; enum BufferInner { /// Compute levels and null mask @@ -87,13 +87,10 @@ impl DefinitionLevelBuffer { Self { inner, len: 0 } } - pub fn split_levels(&mut self, len: usize) -> Option { + /// Returns the built level data + pub fn consume_levels(&mut self) -> Option { match &mut self.inner { - BufferInner::Full { levels, .. } => { - let out = levels.split_off(len); - self.len = levels.len(); - Some(out) - } + BufferInner::Full { levels, .. } => Some(std::mem::take(levels).into()), BufferInner::Mask { .. } => None, } } @@ -103,27 +100,13 @@ impl DefinitionLevelBuffer { self.len = len; } - /// Split `len` levels out of `self` - pub fn split_bitmask(&mut self, len: usize) -> Buffer { - let old_builder = match &mut self.inner { - BufferInner::Full { nulls, .. 
} => nulls, - BufferInner::Mask { nulls } => nulls, - }; - - // Compute the number of values left behind - let num_left_values = old_builder.len() - len; - let mut new_builder = - BooleanBufferBuilder::new(MIN_BATCH_SIZE.max(num_left_values)); - - // Copy across remaining values - new_builder.append_packed_range(len..old_builder.len(), old_builder.as_slice()); - - // Truncate buffer - old_builder.resize(len); - - // Swap into self - self.len = new_builder.len(); - std::mem::replace(old_builder, new_builder).into() + /// Returns the built null bitmask + pub fn consume_bitmask(&mut self) -> Buffer { + self.len = 0; + match &mut self.inner { + BufferInner::Full { nulls, .. } => nulls.finish().into_inner(), + BufferInner::Mask { nulls } => nulls.finish().into_inner(), + } } pub fn nulls(&self) -> &BooleanBufferBuilder { @@ -148,7 +131,7 @@ impl LevelsBufferSlice for DefinitionLevelBuffer { enum MaybePacked { Packed(PackedDecoder), - Fallback(ColumnLevelDecoderImpl), + Fallback(DefinitionLevelDecoderImpl), } pub struct DefinitionLevelBufferDecoder { @@ -160,7 +143,7 @@ impl DefinitionLevelBufferDecoder { pub fn new(max_level: i16, packed: bool) -> Self { let decoder = match packed { true => MaybePacked::Packed(PackedDecoder::new()), - false => MaybePacked::Fallback(ColumnLevelDecoderImpl::new(max_level)), + false => MaybePacked::Fallback(DefinitionLevelDecoderImpl::new(max_level)), }; Self { max_level, decoder } @@ -176,8 +159,14 @@ impl ColumnLevelDecoder for DefinitionLevelBufferDecoder { MaybePacked::Fallback(d) => d.set_data(encoding, data), } } +} - fn read(&mut self, writer: &mut Self::Slice, range: Range) -> Result { +impl DefinitionLevelDecoder for DefinitionLevelBufferDecoder { + fn read_def_levels( + &mut self, + writer: &mut Self::Slice, + range: Range, + ) -> Result { match (&mut writer.inner, &mut self.decoder) { ( BufferInner::Full { @@ -193,7 +182,7 @@ impl ColumnLevelDecoder for DefinitionLevelBufferDecoder { levels.resize(range.end + writer.len); let slice = &mut levels.as_slice_mut()[writer.len..]; - let levels_read = decoder.read(slice, range.clone())?; + let levels_read = decoder.read_def_levels(slice, range.clone())?; nulls.reserve(levels_read); for i in &slice[range.start..range.start + levels_read] { @@ -211,9 +200,7 @@ impl ColumnLevelDecoder for DefinitionLevelBufferDecoder { _ => unreachable!("inconsistent null mask"), } } -} -impl DefinitionLevelDecoder for DefinitionLevelBufferDecoder { fn skip_def_levels( &mut self, num_levels: usize, @@ -391,11 +378,8 @@ impl PackedDecoder { #[cfg(test)] mod tests { use super::*; - use std::sync::Arc; - use crate::basic::Type as PhysicalType; use crate::encodings::rle::RleEncoder; - use crate::schema::types::{ColumnDescriptor, ColumnPath, Type}; use rand::{thread_rng, Rng}; #[test] @@ -492,30 +476,4 @@ mod tests { assert_eq!(read_level + skip_level, len); assert_eq!(read_value + skip_value, total_value); } - - #[test] - fn test_split_off() { - let t = Type::primitive_type_builder("col", PhysicalType::INT32) - .build() - .unwrap(); - - let descriptor = Arc::new(ColumnDescriptor::new( - Arc::new(t), - 1, - 0, - ColumnPath::new(vec![]), - )); - - let mut buffer = DefinitionLevelBuffer::new(&descriptor, true); - match &mut buffer.inner { - BufferInner::Mask { nulls } => nulls.append_n(100, false), - _ => unreachable!(), - }; - - let bitmap = buffer.split_bitmask(19); - - // Should have split off 19 records leaving, 81 behind - assert_eq!(bitmap.len(), 3); // Note: bitmask only tracks bytes not bits - assert_eq!(buffer.nulls().len(), 
81); - } } diff --git a/parquet/src/arrow/record_reader/mod.rs b/parquet/src/arrow/record_reader/mod.rs index e47bdee1c38a..35933e6e15d9 100644 --- a/parquet/src/arrow/record_reader/mod.rs +++ b/parquet/src/arrow/record_reader/mod.rs @@ -15,18 +15,17 @@ // specific language governing permissions and limitations // under the License. -use std::cmp::{max, min}; - use arrow_buffer::Buffer; use crate::arrow::record_reader::{ buffer::{BufferQueue, ScalarBuffer, ValuesBuffer}, definition_levels::{DefinitionLevelBuffer, DefinitionLevelBufferDecoder}, }; +use crate::column::reader::decoder::RepetitionLevelDecoderImpl; use crate::column::{ page::PageReader, reader::{ - decoder::{ColumnLevelDecoderImpl, ColumnValueDecoder, ColumnValueDecoderImpl}, + decoder::{ColumnValueDecoder, ColumnValueDecoderImpl}, GenericColumnReader, }, }; @@ -37,15 +36,12 @@ use crate::schema::types::ColumnDescPtr; pub(crate) mod buffer; mod definition_levels; -/// The minimum number of levels read when reading a repeated field -pub(crate) const MIN_BATCH_SIZE: usize = 1024; - /// A `RecordReader` is a stateful column reader that delimits semantic records. pub type RecordReader = GenericRecordReader::T>, ColumnValueDecoderImpl>; pub(crate) type ColumnReader = - GenericColumnReader; + GenericColumnReader; /// A generic stateful column reader that delimits semantic records /// @@ -55,19 +51,14 @@ pub(crate) type ColumnReader = pub struct GenericRecordReader { column_desc: ColumnDescPtr, - records: V, + values: V, def_levels: Option, rep_levels: Option>, column_reader: Option>, - - /// Number of records accumulated in records - num_records: usize, - - /// Number of values `num_records` contains. + /// Number of buffered levels / null-padded values num_values: usize, - - /// Starts from 1, number of values have been written to buffer - values_written: usize, + /// Number of buffered records + num_records: usize, } impl GenericRecordReader @@ -93,14 +84,13 @@ where let rep_levels = (desc.max_rep_level() > 0).then(ScalarBuffer::new); Self { - records, + values: records, def_levels, rep_levels, column_reader: None, column_desc: desc, - num_records: 0, num_values: 0, - values_written: 0, + num_records: 0, } } @@ -117,7 +107,7 @@ where }); let rep_level_decoder = (descr.max_rep_level() != 0) - .then(|| ColumnLevelDecoderImpl::new(descr.max_rep_level())); + .then(|| RepetitionLevelDecoderImpl::new(descr.max_rep_level())); self.column_reader = Some(GenericColumnReader::new_with_decoders( self.column_desc.clone(), @@ -142,56 +132,14 @@ where let mut records_read = 0; loop { - // Try to find some records from buffers that has been read into memory - // but not counted as seen records. - - // Check to see if the column is exhausted. Only peek the next page since in - // case we are reading to a page boundary and do not actually need to read - // the next page. - let end_of_column = !self.column_reader.as_mut().unwrap().peek_next()?; - - let (record_count, value_count) = - self.count_records(num_records - records_read, end_of_column); - - self.num_records += record_count; - self.num_values += value_count; - records_read += record_count; - + let records_to_read = num_records - records_read; + records_read += self.read_one_batch(records_to_read)?; if records_read == num_records || !self.column_reader.as_mut().unwrap().has_next()? 
{ break; } - - // If repetition levels present, we don't know how much more to read - // in order to read the requested number of records, therefore read at least - // MIN_BATCH_SIZE, otherwise read **exactly** what was requested. This helps - // to avoid a degenerate case where the buffers are never fully drained. - // - // Consider the scenario where the user is requesting batches of MIN_BATCH_SIZE. - // - // When transitioning across a row group boundary, this will read some remainder - // from the row group `r`, before reading MIN_BATCH_SIZE from the next row group, - // leaving `MIN_BATCH_SIZE + r` in the buffer. - // - // The client will then only split off the `MIN_BATCH_SIZE` they actually wanted, - // leaving behind `r`. This will continue indefinitely. - // - // Aside from wasting cycles splitting and shuffling buffers unnecessarily, this - // prevents dictionary preservation from functioning correctly as the buffer - // will never be emptied, allowing a new dictionary to be registered. - // - // This degenerate case can still occur for repeated fields, but - // it is avoided for the more common case of a non-repeated field - let batch_size = match &self.rep_levels { - Some(_) => max(num_records - records_read, MIN_BATCH_SIZE), - None => num_records - records_read, - }; - - // Try to more value from parquet pages - self.read_one_batch(batch_size)?; } - Ok(records_read) } @@ -201,31 +149,10 @@ where /// /// Number of records skipped pub fn skip_records(&mut self, num_records: usize) -> Result { - // First need to clear the buffer - let end_of_column = match self.column_reader.as_mut() { - Some(reader) => !reader.peek_next()?, - None => return Ok(0), - }; - - let (buffered_records, buffered_values) = - self.count_records(num_records, end_of_column); - - self.num_records += buffered_records; - self.num_values += buffered_values; - - let remaining = num_records - buffered_records; - - if remaining == 0 { - return Ok(buffered_records); + match self.column_reader.as_mut() { + Some(reader) => reader.skip_records(num_records), + None => Ok(0), } - - let skipped = self - .column_reader - .as_mut() - .unwrap() - .skip_records(remaining)?; - - Ok(skipped + buffered_records) } /// Returns number of records stored in buffer. @@ -246,25 +173,19 @@ where /// definition level values that have already been read into memory but not counted /// as record values, e.g. those from `self.num_values` to `self.values_written`. pub fn consume_def_levels(&mut self) -> Option { - match self.def_levels.as_mut() { - Some(x) => x.split_levels(self.num_values), - None => None, - } + self.def_levels.as_mut().and_then(|x| x.consume_levels()) } /// Return repetition level data. /// The side effect is similar to `consume_def_levels`. pub fn consume_rep_levels(&mut self) -> Option { - match self.rep_levels.as_mut() { - Some(x) => Some(x.split_off(self.num_values)), - None => None, - } + self.rep_levels.as_mut().map(|x| x.consume()) } /// Returns currently stored buffer data. /// The side effect is similar to `consume_def_levels`. pub fn consume_record_data(&mut self) -> V::Output { - self.records.split_off(self.num_values) + self.values.consume() } /// Returns currently stored null bitmap data. @@ -277,34 +198,31 @@ where /// Should be called after consuming data, e.g. `consume_rep_levels`, /// `consume_rep_levels`, `consume_record_data` and `consume_bitmap_buffer`. 
pub fn reset(&mut self) { - self.values_written -= self.num_values; - self.num_records = 0; self.num_values = 0; + self.num_records = 0; } /// Returns bitmap data. pub fn consume_bitmap(&mut self) -> Option { self.def_levels .as_mut() - .map(|levels| levels.split_bitmask(self.num_values)) + .map(|levels| levels.consume_bitmask()) } - /// Try to read one batch of data. + /// Try to read one batch of data returning the number of records read fn read_one_batch(&mut self, batch_size: usize) -> Result { let rep_levels = self .rep_levels .as_mut() .map(|levels| levels.spare_capacity_mut(batch_size)); - let def_levels = self.def_levels.as_mut(); + let values = self.values.spare_capacity_mut(batch_size); - let values = self.records.spare_capacity_mut(batch_size); - - let (values_read, levels_read) = self + let (records_read, values_read, levels_read) = self .column_reader .as_mut() .unwrap() - .read_batch(batch_size, def_levels, rep_levels, values)?; + .read_records(batch_size, def_levels, rep_levels, values)?; if values_read < levels_read { let def_levels = self.def_levels.as_ref().ok_or_else(|| { @@ -313,90 +231,29 @@ where ) })?; - self.records.pad_nulls( - self.values_written, + self.values.pad_nulls( + self.num_values, values_read, levels_read, def_levels.nulls().as_slice(), ); } - let values_read = max(levels_read, values_read); - self.set_values_written(self.values_written + values_read); - Ok(values_read) - } - - /// Inspects the buffered repetition levels in the range `self.num_values..self.values_written` - /// and returns the number of "complete" records along with the corresponding number of values - /// - /// If `end_of_column` is true it indicates that there are no further values for this - /// column chunk beyond what is currently in the buffers - /// - /// A "complete" record is one where the buffer contains a subsequent repetition level of 0 - fn count_records( - &self, - records_to_read: usize, - end_of_column: bool, - ) -> (usize, usize) { - match self.rep_levels.as_ref() { - Some(buf) => { - let buf = buf.as_slice(); - - let mut records_read = 0; - let mut end_of_last_record = self.num_values; - - for (current, item) in buf - .iter() - .enumerate() - .take(self.values_written) - .skip(self.num_values) - { - if *item == 0 && current != self.num_values { - records_read += 1; - end_of_last_record = current; - - if records_read == records_to_read { - break; - } - } - } - - // If reached end of column chunk => end of a record - if records_read != records_to_read - && end_of_column - && self.values_written != self.num_values - { - records_read += 1; - end_of_last_record = self.values_written; - } - - (records_read, end_of_last_record - self.num_values) - } - None => { - let records_read = - min(records_to_read, self.values_written - self.num_values); - - (records_read, records_read) - } - } - } - - fn set_values_written(&mut self, new_values_written: usize) { - self.values_written = new_values_written; - self.records.set_len(self.values_written); - + self.num_records += records_read; + self.num_values += levels_read; + self.values.set_len(self.num_values); if let Some(ref mut buf) = self.rep_levels { - buf.set_len(self.values_written) + buf.set_len(self.num_values) }; - if let Some(ref mut buf) = self.def_levels { - buf.set_len(self.values_written) + buf.set_len(self.num_values) }; + Ok(records_read) } } /// Returns true if we do not need to unpack the nullability for this column, this is -/// only possible if the max defiition level is 1, and corresponds to nulls at the +/// only 
possible if the max definition level is 1, and corresponds to nulls at the /// leaf level, as opposed to a nullable parent nested type fn packed_null_mask(descr: &ColumnDescPtr) -> bool { descr.max_def_level() == 1 diff --git a/parquet/src/column/mod.rs b/parquet/src/column/mod.rs index a68127a4ef05..c81d6290abc2 100644 --- a/parquet/src/column/mod.rs +++ b/parquet/src/column/mod.rs @@ -84,7 +84,6 @@ //! let reader = SerializedFileReader::new(file).unwrap(); //! let metadata = reader.metadata(); //! -//! let mut res = Ok((0, 0)); //! let mut values = vec![0; 8]; //! let mut def_levels = vec![0; 8]; //! let mut rep_levels = vec![0; 8]; @@ -98,19 +97,21 @@ //! match column_reader { //! // You can also use `get_typed_column_reader` method to extract typed reader. //! ColumnReader::Int32ColumnReader(ref mut typed_reader) => { -//! res = typed_reader.read_batch( -//! 8, // batch size +//! let (records, values, levels) = typed_reader.read_records( +//! 8, // maximum records to read //! Some(&mut def_levels), //! Some(&mut rep_levels), //! &mut values, -//! ); +//! ).unwrap(); +//! assert_eq!(records, 2); +//! assert_eq!(levels, 5); +//! assert_eq!(values, 3); //! } //! _ => {} //! } //! } //! } //! -//! assert_eq!(res.unwrap(), (3, 5)); //! assert_eq!(values, vec![1, 2, 3, 0, 0, 0, 0, 0]); //! assert_eq!(def_levels, vec![3, 3, 3, 2, 2, 0, 0, 0]); //! assert_eq!(rep_levels, vec![0, 1, 0, 1, 1, 0, 0, 0]); diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 991ec2c545a4..88967e179271 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -17,13 +17,12 @@ //! Contains column reader API. -use std::cmp::min; - use super::page::{Page, PageReader}; use crate::basic::*; use crate::column::reader::decoder::{ - ColumnLevelDecoderImpl, ColumnValueDecoder, ColumnValueDecoderImpl, - DefinitionLevelDecoder, LevelsBufferSlice, RepetitionLevelDecoder, ValuesBufferSlice, + ColumnValueDecoder, ColumnValueDecoderImpl, DefinitionLevelDecoder, + DefinitionLevelDecoderImpl, LevelsBufferSlice, RepetitionLevelDecoder, + RepetitionLevelDecoderImpl, ValuesBufferSlice, }; use crate::data_type::*; use crate::errors::{ParquetError, Result}; @@ -103,8 +102,8 @@ pub fn get_typed_column_reader( /// Typed value reader for a particular primitive column. pub type ColumnReaderImpl = GenericColumnReader< - ColumnLevelDecoderImpl, - ColumnLevelDecoderImpl, + RepetitionLevelDecoderImpl, + DefinitionLevelDecoderImpl, ColumnValueDecoderImpl, >; @@ -119,11 +118,14 @@ pub struct GenericColumnReader { page_reader: Box, /// The total number of values stored in the data page. - num_buffered_values: u32, + num_buffered_values: usize, /// The number of values from the current data page that has been decoded into memory /// so far. 
- num_decoded_values: u32, + num_decoded_values: usize, + + /// True if the end of the current data page denotes the end of a record + has_record_delimiter: bool, /// The decoder for the definition levels if any def_level_decoder: Option, @@ -135,7 +137,7 @@ pub struct GenericColumnReader { values_decoder: V, } -impl GenericColumnReader +impl GenericColumnReader where V: ColumnValueDecoder, { @@ -144,10 +146,10 @@ where let values_decoder = V::new(&descr); let def_level_decoder = (descr.max_def_level() != 0) - .then(|| ColumnLevelDecoderImpl::new(descr.max_def_level())); + .then(|| DefinitionLevelDecoderImpl::new(descr.max_def_level())); let rep_level_decoder = (descr.max_rep_level() != 0) - .then(|| ColumnLevelDecoderImpl::new(descr.max_rep_level())); + .then(|| RepetitionLevelDecoderImpl::new(descr.max_rep_level())); Self::new_with_decoders( descr, @@ -180,6 +182,7 @@ where num_buffered_values: 0, num_decoded_values: 0, values_decoder, + has_record_delimiter: false, } } @@ -195,99 +198,126 @@ where /// /// `values` will be contiguously populated with the non-null values. Note that if the column /// is not required, this may be less than either `batch_size` or the number of levels read + #[deprecated(note = "Use read_records")] pub fn read_batch( &mut self, batch_size: usize, - mut def_levels: Option<&mut D::Slice>, - mut rep_levels: Option<&mut R::Slice>, + def_levels: Option<&mut D::Slice>, + rep_levels: Option<&mut R::Slice>, values: &mut V::Slice, ) -> Result<(usize, usize)> { - let mut values_read = 0; - let mut levels_read = 0; + let (_, values, levels) = + self.read_records(batch_size, def_levels, rep_levels, values)?; + + Ok((values, levels)) + } - // Compute the smallest batch size we can read based on provided slices - let mut batch_size = min(batch_size, values.capacity()); + /// Read up to `num_records` returning the number of complete records, non-null + /// values and levels decoded + /// + /// If the max definition level is 0, `def_levels` will be ignored, otherwise it will be + /// populated with the number of levels read, with an error returned if it is `None`. + /// + /// If the max repetition level is 0, `rep_levels` will be ignored, otherwise it will be + /// populated with the number of levels read, with an error returned if it is `None`. + /// + /// `values` will be contiguously populated with the non-null values. Note that if the column + /// is not required, this may be less than either `max_records` or the number of levels read + pub fn read_records( + &mut self, + max_records: usize, + mut def_levels: Option<&mut D::Slice>, + mut rep_levels: Option<&mut R::Slice>, + values: &mut V::Slice, + ) -> Result<(usize, usize, usize)> { + let mut max_levels = values.capacity().min(max_records); if let Some(ref levels) = def_levels { - batch_size = min(batch_size, levels.capacity()); + max_levels = max_levels.min(levels.capacity()); } if let Some(ref levels) = rep_levels { - batch_size = min(batch_size, levels.capacity()); + max_levels = max_levels.min(levels.capacity()) } - // Read exhaustively all pages until we read all batch_size values/levels - // or there are no more values/levels to read. - while levels_read < batch_size { - if !self.has_next()? 
{ - break; - } + let mut total_records_read = 0; + let mut total_levels_read = 0; + let mut total_values_read = 0; - // Batch size for the current iteration - let iter_batch_size = (batch_size - levels_read) - .min((self.num_buffered_values - self.num_decoded_values) as usize); + while total_records_read < max_records + && total_levels_read < max_levels + && self.has_next()? + { + let remaining_records = max_records - total_records_read; + let remaining_levels = self.num_buffered_values - self.num_decoded_values; + let levels_to_read = remaining_levels.min(max_levels - total_levels_read); - // If the field is required and non-repeated, there are no definition levels - let null_count = match self.descr.max_def_level() > 0 { - true => { - let levels = def_levels + let (records_read, levels_read) = match self.rep_level_decoder.as_mut() { + Some(reader) => { + let out = rep_levels .as_mut() - .ok_or_else(|| general_err!("must specify definition levels"))?; + .ok_or_else(|| general_err!("must specify repetition levels"))?; + + let (mut records_read, levels_read) = reader.read_rep_levels( + out, + total_levels_read..total_levels_read + levels_to_read, + remaining_records, + )?; + + if levels_read == remaining_levels && self.has_record_delimiter { + // Reached end of page, which implies records_read < remaining_records + // as otherwise would have stopped reading before reaching the end + assert!(records_read < remaining_records); // Sanity check + records_read += 1; + } + (records_read, levels_read) + } + None => { + let min = remaining_records.min(levels_to_read); + (min, min) + } + }; - let num_def_levels = self - .def_level_decoder + let values_to_read = match self.def_level_decoder.as_mut() { + Some(reader) => { + let out = def_levels .as_mut() - .expect("def_level_decoder be set") - .read(levels, levels_read..levels_read + iter_batch_size)?; + .ok_or_else(|| general_err!("must specify definition levels"))?; + + let read = reader.read_def_levels( + out, + total_levels_read..total_levels_read + levels_read, + )?; - if num_def_levels != iter_batch_size { - return Err(general_err!("insufficient definition levels read from column - expected {}, got {}", iter_batch_size, num_def_levels)); + if read != levels_read { + return Err(general_err!("insufficient definition levels read from column - expected {rep_levels}, got {read}")); } - levels.count_nulls( - levels_read..levels_read + num_def_levels, + let null_count = out.count_nulls( + total_levels_read..total_levels_read + read, self.descr.max_def_level(), - ) + ); + levels_read - null_count } - false => 0, + None => levels_read, }; - if self.descr.max_rep_level() > 0 { - let levels = rep_levels - .as_mut() - .ok_or_else(|| general_err!("must specify repetition levels"))?; - - let rep_levels = self - .rep_level_decoder - .as_mut() - .expect("rep_level_decoder be set") - .read(levels, levels_read..levels_read + iter_batch_size)?; - - if rep_levels != iter_batch_size { - return Err(general_err!("insufficient repetition levels read from column - expected {}, got {}", iter_batch_size, rep_levels)); - } - } - - let values_to_read = iter_batch_size - null_count; - let curr_values_read = self - .values_decoder - .read(values, values_read..values_read + values_to_read)?; + let values_read = self.values_decoder.read( + values, + total_values_read..total_values_read + values_to_read, + )?; - if curr_values_read != values_to_read { + if values_read != values_to_read { return Err(general_err!( - "insufficient values read from column - expected: {}, got: {}", - 
values_to_read, - curr_values_read + "insufficient values read from column - expected: {values_to_read}, got: {values_read}", )); } - // Update all "return" counters and internal state. - - // This is to account for when def or rep levels are not provided - self.num_decoded_values += iter_batch_size as u32; - levels_read += iter_batch_size; - values_read += curr_values_read; + self.num_decoded_values += levels_read; + total_records_read += records_read; + total_levels_read += levels_read; + total_values_read += values_read; } - Ok((values_read, levels_read)) + Ok((total_records_read, total_values_read, total_levels_read)) } /// Skips over `num_records` records, where records are delimited by repetition levels of 0 @@ -336,21 +366,30 @@ where // start skip values in page level // The number of levels in the current data page - let buffered_levels = - (self.num_buffered_values - self.num_decoded_values) as usize; + let remaining_levels = self.num_buffered_values - self.num_decoded_values; let (records_read, rep_levels_read) = match self.rep_level_decoder.as_mut() { Some(decoder) => { - decoder.skip_rep_levels(remaining_records, buffered_levels)? + let (mut records_read, levels_read) = + decoder.skip_rep_levels(remaining_records, remaining_levels)?; + + if levels_read == remaining_levels && self.has_record_delimiter { + // Reached end of page, which implies records_read < remaining_records + // as otherwise would have stopped reading before reaching the end + assert!(records_read < remaining_records); // Sanity check + records_read += 1; + } + + (records_read, levels_read) } None => { // No repetition levels, so each level corresponds to a row - let levels = buffered_levels.min(remaining_records); + let levels = remaining_levels.min(remaining_records); (levels, levels) } }; - self.num_decoded_values += rep_levels_read as u32; + self.num_decoded_values += rep_levels_read; remaining_records -= records_read; if self.num_buffered_values == self.num_decoded_values { @@ -431,7 +470,7 @@ where rep_level_encoding, statistics: _, } => { - self.num_buffered_values = num_values; + self.num_buffered_values = num_values as _; self.num_decoded_values = 0; let max_rep_level = self.descr.max_rep_level(); @@ -448,6 +487,9 @@ where )?; offset += bytes_read; + self.has_record_delimiter = + self.page_reader.peek_next_page()?.is_none(); + self.rep_level_decoder .as_mut() .unwrap() @@ -493,12 +535,18 @@ where return Err(general_err!("more nulls than values in page, contained {} values and {} nulls", num_values, num_nulls)); } - self.num_buffered_values = num_values; + self.num_buffered_values = num_values as _; self.num_decoded_values = 0; // DataPage v2 only supports RLE encoding for repetition // levels if self.descr.max_rep_level() > 0 { + // Technically a DataPage v2 should not write a record + // across multiple pages, however, the parquet writer + // used to do this so we preserve backwards compatibility + self.has_record_delimiter = + self.page_reader.peek_next_page()?.is_none(); + self.rep_level_decoder.as_mut().unwrap().set_data( Encoding::RLE, buf.range(0, rep_levels_byte_len as usize), @@ -533,21 +581,6 @@ where } } - /// Check whether there is more data to read from this column, - /// If the current page is fully decoded, this will NOT load the next page - /// into the buffer - #[inline] - #[cfg(feature = "arrow")] - pub(crate) fn peek_next(&mut self) -> Result { - if self.num_buffered_values == 0 - || self.num_buffered_values == self.num_decoded_values - { - 
Ok(self.page_reader.peek_next_page()?.is_some()) - } else { - Ok(true) - } - } - /// Check whether there is more data to read from this column, /// If the current page is fully decoded, this will load the next page /// (if it exists) into the buffer @@ -1359,15 +1392,14 @@ mod tests { let mut curr_values_read = 0; let mut curr_levels_read = 0; - let mut done = false; - while !done { + loop { let actual_def_levels = def_levels.as_mut().map(|vec| &mut vec[curr_levels_read..]); let actual_rep_levels = rep_levels.as_mut().map(|vec| &mut vec[curr_levels_read..]); - let (values_read, levels_read) = typed_column_reader - .read_batch( + let (_, values_read, levels_read) = typed_column_reader + .read_records( batch_size, actual_def_levels, actual_rep_levels, @@ -1375,12 +1407,12 @@ mod tests { ) .expect("read_batch() should be OK"); - if values_read == 0 && levels_read == 0 { - done = true; - } - curr_values_read += values_read; curr_levels_read += levels_read; + + if values_read == 0 && levels_read == 0 { + break; + } } assert!( diff --git a/parquet/src/column/reader/decoder.rs b/parquet/src/column/reader/decoder.rs index 3a6795c8cbf8..369b335dc98f 100644 --- a/parquet/src/column/reader/decoder.rs +++ b/parquet/src/column/reader/decoder.rs @@ -68,24 +68,35 @@ pub trait ColumnLevelDecoder { /// Set data for this [`ColumnLevelDecoder`] fn set_data(&mut self, encoding: Encoding, data: ByteBufferPtr); +} - /// Read level data into `out[range]` returning the number of levels read +pub trait RepetitionLevelDecoder: ColumnLevelDecoder { + /// Read up to `max_records` of repetition level data into `out[range]` returning the number + /// of complete records and levels read /// /// `range` is provided by the caller to allow for types such as default-initialized `[T]` /// that only track capacity and not length /// + /// A record only ends when the data contains a subsequent repetition level of 0, + /// it is therefore left to the caller to delimit the final record in a column + /// /// # Panics /// /// Implementations may panic if `range` overlaps with already written data - /// - fn read(&mut self, out: &mut Self::Slice, range: Range) -> Result; -} + fn read_rep_levels( + &mut self, + out: &mut Self::Slice, + range: Range, + max_records: usize, + ) -> Result<(usize, usize)>; -pub trait RepetitionLevelDecoder: ColumnLevelDecoder { /// Skips over up to `num_levels` repetition levels corresponding to `num_records` records, /// where a record is delimited by a repetition level of 0 /// /// Returns the number of records skipped, and the number of levels skipped + /// + /// A record only ends when the data contains a subsequent repetition level of 0, + /// it is therefore left to the caller to delimit the final record in a column fn skip_rep_levels( &mut self, num_records: usize, @@ -94,6 +105,22 @@ pub trait RepetitionLevelDecoder: ColumnLevelDecoder { } pub trait DefinitionLevelDecoder: ColumnLevelDecoder { + /// Read definition level data into `out[range]` returning the number of levels read + /// + /// `range` is provided by the caller to allow for types such as default-initialized `[T]` + /// that only track capacity and not length + /// + /// # Panics + /// + /// Implementations may panic if `range` overlaps with already written data + /// + // TODO: Should this return the number of nulls + fn read_def_levels( + &mut self, + out: &mut Self::Slice, + range: Range, + ) -> Result; + /// Skips over `num_levels` definition levels /// /// Returns the number of values skipped, and the number of levels skipped @@ 
-270,101 +297,67 @@ impl ColumnValueDecoder for ColumnValueDecoderImpl { const SKIP_BUFFER_SIZE: usize = 1024; -/// An implementation of [`ColumnLevelDecoder`] for `[i16]` -pub struct ColumnLevelDecoderImpl { - decoder: Option, - /// Temporary buffer populated when skipping values - buffer: Vec, - bit_width: u8, +enum LevelDecoder { + Packed(BitReader, u8), + Rle(RleDecoder), } -impl ColumnLevelDecoderImpl { - pub fn new(max_level: i16) -> Self { - let bit_width = num_required_bits(max_level as u64); - Self { - decoder: None, - buffer: vec![], - bit_width, +impl LevelDecoder { + fn new(encoding: Encoding, data: ByteBufferPtr, bit_width: u8) -> Self { + match encoding { + Encoding::RLE => { + let mut decoder = RleDecoder::new(bit_width); + decoder.set_data(data); + Self::Rle(decoder) + } + Encoding::BIT_PACKED => Self::Packed(BitReader::new(data), bit_width), + _ => unreachable!("invalid level encoding: {}", encoding), } } - /// Drops the first `len` values from the internal buffer - fn split_off_buffer(&mut self, len: usize) { - match self.buffer.len() == len { - true => self.buffer.clear(), - false => { - // Move to_read elements to end of slice - self.buffer.rotate_left(len); - // Truncate buffer - self.buffer.truncate(self.buffer.len() - len); + fn read(&mut self, out: &mut [i16]) -> Result { + match self { + Self::Packed(reader, bit_width) => { + Ok(reader.get_batch::(out, *bit_width as usize)) } + Self::Rle(reader) => Ok(reader.get_batch(out)?), } } +} - /// Reads up to `to_read` values to the internal buffer - fn read_to_buffer(&mut self, to_read: usize) -> Result<()> { - let mut buf = std::mem::take(&mut self.buffer); - - // Repopulate buffer - buf.resize(to_read, 0); - let actual = self.read(&mut buf, 0..to_read)?; - buf.truncate(actual); - - self.buffer = buf; - Ok(()) - } +/// An implementation of [`DefinitionLevelDecoder`] for `[i16]` +pub struct DefinitionLevelDecoderImpl { + decoder: Option, + bit_width: u8, } -enum LevelDecoderInner { - Packed(BitReader, u8), - Rle(RleDecoder), +impl DefinitionLevelDecoderImpl { + pub fn new(max_level: i16) -> Self { + let bit_width = num_required_bits(max_level as u64); + Self { + decoder: None, + bit_width, + } + } } -impl ColumnLevelDecoder for ColumnLevelDecoderImpl { +impl ColumnLevelDecoder for DefinitionLevelDecoderImpl { type Slice = [i16]; fn set_data(&mut self, encoding: Encoding, data: ByteBufferPtr) { - self.buffer.clear(); - match encoding { - Encoding::RLE => { - let mut decoder = RleDecoder::new(self.bit_width); - decoder.set_data(data); - self.decoder = Some(LevelDecoderInner::Rle(decoder)); - } - Encoding::BIT_PACKED => { - self.decoder = Some(LevelDecoderInner::Packed( - BitReader::new(data), - self.bit_width, - )); - } - _ => unreachable!("invalid level encoding: {}", encoding), - } + self.decoder = Some(LevelDecoder::new(encoding, data, self.bit_width)) } +} - fn read(&mut self, out: &mut Self::Slice, mut range: Range) -> Result { - let read_from_buffer = match self.buffer.is_empty() { - true => 0, - false => { - let read_from_buffer = self.buffer.len().min(range.end - range.start); - out[range.start..range.start + read_from_buffer] - .copy_from_slice(&self.buffer[0..read_from_buffer]); - self.split_off_buffer(read_from_buffer); - read_from_buffer - } - }; - range.start += read_from_buffer; - - match self.decoder.as_mut().unwrap() { - LevelDecoderInner::Packed(reader, bit_width) => Ok(read_from_buffer - + reader.get_batch::(&mut out[range], *bit_width as usize)), - LevelDecoderInner::Rle(reader) => { - Ok(read_from_buffer 
+ reader.get_batch(&mut out[range])?) - } - } +impl DefinitionLevelDecoder for DefinitionLevelDecoderImpl { + fn read_def_levels( + &mut self, + out: &mut Self::Slice, + range: Range, + ) -> Result { + self.decoder.as_mut().unwrap().read(&mut out[range]) } -} -impl DefinitionLevelDecoder for ColumnLevelDecoderImpl { fn skip_def_levels( &mut self, num_levels: usize, @@ -372,80 +365,159 @@ impl DefinitionLevelDecoder for ColumnLevelDecoderImpl { ) -> Result<(usize, usize)> { let mut level_skip = 0; let mut value_skip = 0; + let mut buf: Vec = vec![]; while level_skip < num_levels { let remaining_levels = num_levels - level_skip; - if self.buffer.is_empty() { - // Only read number of needed values - self.read_to_buffer(remaining_levels.min(SKIP_BUFFER_SIZE))?; - if self.buffer.is_empty() { - // Reached end of page - break; - } + let to_read = remaining_levels.min(SKIP_BUFFER_SIZE); + buf.resize(to_read, 0); + let read = self.read_def_levels(&mut buf, 0..to_read)?; + if read == 0 { + // Reached end of page + break; } - let to_read = self.buffer.len().min(remaining_levels); - level_skip += to_read; - value_skip += self.buffer[..to_read] - .iter() - .filter(|x| **x == max_def_level) - .count(); - - self.split_off_buffer(to_read) + level_skip += read; + value_skip += buf[..read].iter().filter(|x| **x == max_def_level).count(); } Ok((value_skip, level_skip)) } } -impl RepetitionLevelDecoder for ColumnLevelDecoderImpl { - fn skip_rep_levels( +pub(crate) const REPETITION_LEVELS_BATCH_SIZE: usize = 1024; + +/// An implementation of [`RepetitionLevelDecoder`] for `[i16]` +pub struct RepetitionLevelDecoderImpl { + decoder: Option, + bit_width: u8, + buffer: Box<[i16; REPETITION_LEVELS_BATCH_SIZE]>, + buffer_len: usize, + buffer_offset: usize, + has_partial: bool, +} + +impl RepetitionLevelDecoderImpl { + pub fn new(max_level: i16) -> Self { + let bit_width = num_required_bits(max_level as u64); + Self { + decoder: None, + bit_width, + buffer: Box::new([0; REPETITION_LEVELS_BATCH_SIZE]), + buffer_offset: 0, + buffer_len: 0, + has_partial: false, + } + } + + fn fill_buf(&mut self) -> Result<()> { + let read = self.decoder.as_mut().unwrap().read(self.buffer.as_mut())?; + self.buffer_offset = 0; + self.buffer_len = read; + Ok(()) + } + + /// Inspects the buffered repetition levels in the range `self.buffer_offset..self.buffer_len` + /// and returns the number of "complete" records along with the corresponding number of values + /// + /// A "complete" record is one where the buffer contains a subsequent repetition level of 0 + fn count_records( &mut self, - num_records: usize, + records_to_read: usize, num_levels: usize, - ) -> Result<(usize, usize)> { - let mut level_skip = 0; - let mut record_skip = 0; + ) -> (bool, usize, usize) { + let mut records_read = 0; - while level_skip < num_levels { - let remaining_levels = num_levels - level_skip; + let levels = num_levels.min(self.buffer_len - self.buffer_offset); + let buf = self.buffer.iter().skip(self.buffer_offset); + for (idx, item) in buf.take(levels).enumerate() { + if *item == 0 && (idx != 0 || self.has_partial) { + records_read += 1; - if self.buffer.is_empty() { - // Only read number of needed values - self.read_to_buffer(remaining_levels.min(SKIP_BUFFER_SIZE))?; - if self.buffer.is_empty() { - // Reached end of page - break; + if records_read == records_to_read { + return (false, records_read, idx); } } + } + // Either ran out of space in `num_levels` or data in `self.buffer` + (true, records_read, levels) + } +} + +impl ColumnLevelDecoder for 
RepetitionLevelDecoderImpl { + type Slice = [i16]; - let max_skip = self.buffer.len().min(remaining_levels); + fn set_data(&mut self, encoding: Encoding, data: ByteBufferPtr) { + self.decoder = Some(LevelDecoder::new(encoding, data, self.bit_width)); + self.buffer_len = 0; + self.buffer_offset = 0; + } +} - let mut to_skip = 0; - while to_skip < max_skip && record_skip != num_records { - if self.buffer[to_skip] == 0 { - record_skip += 1; +impl RepetitionLevelDecoder for RepetitionLevelDecoderImpl { + fn read_rep_levels( + &mut self, + out: &mut Self::Slice, + range: Range, + max_records: usize, + ) -> Result<(usize, usize)> { + let output = &mut out[range]; + let max_levels = output.len(); + let mut total_records_read = 0; + let mut total_levels_read = 0; + + while total_records_read < max_records && total_levels_read < max_levels { + if self.buffer_len == self.buffer_offset { + self.fill_buf()?; + if self.buffer_len == 0 { + break; } - to_skip += 1; } - // Find end of record - while to_skip < max_skip && self.buffer[to_skip] != 0 { - to_skip += 1; - } + let (partial, records_read, levels_read) = self.count_records( + max_records - total_records_read, + max_levels - total_levels_read, + ); - level_skip += to_skip; - if to_skip == self.buffer.len() { - // Need to to read more values - self.buffer.clear(); - continue; - } + output[total_levels_read..total_levels_read + levels_read].copy_from_slice( + &self.buffer[self.buffer_offset..self.buffer_offset + levels_read], + ); - self.split_off_buffer(to_skip); - break; + total_levels_read += levels_read; + total_records_read += records_read; + self.buffer_offset += levels_read; + self.has_partial = partial; } + Ok((total_records_read, total_levels_read)) + } + + fn skip_rep_levels( + &mut self, + num_records: usize, + num_levels: usize, + ) -> Result<(usize, usize)> { + let mut total_records_read = 0; + let mut total_levels_read = 0; - Ok((record_skip, level_skip)) + while total_records_read < num_records && total_levels_read < num_levels { + if self.buffer_len == self.buffer_offset { + self.fill_buf()?; + if self.buffer_len == 0 { + break; + } + } + + let (partial, records_read, levels_read) = self.count_records( + num_records - total_records_read, + num_levels - total_levels_read, + ); + + total_levels_read += levels_read; + total_records_read += records_read; + self.buffer_offset += levels_read; + self.has_partial = partial; + } + Ok((total_records_read, total_levels_read)) } } @@ -455,35 +527,6 @@ mod tests { use crate::encodings::rle::RleEncoder; use rand::prelude::*; - fn test_skip_levels(encoded: &[i16], data: ByteBufferPtr, skip: F) - where - F: Fn(&mut ColumnLevelDecoderImpl, &mut usize, usize), - { - let mut rng = thread_rng(); - let mut decoder = ColumnLevelDecoderImpl::new(5); - decoder.set_data(Encoding::RLE, data); - - let mut read = 0; - let mut decoded = vec![]; - let mut expected = vec![]; - while read < encoded.len() { - let to_read = rng.gen_range(0..(encoded.len() - read).min(100)) + 1; - - if rng.gen_bool(0.5) { - skip(&mut decoder, &mut read, to_read) - } else { - let start = decoded.len(); - let end = decoded.len() + to_read; - decoded.resize(end, 0); - let actual_read = decoder.read(&mut decoded, start..end).unwrap(); - assert_eq!(actual_read, to_read); - expected.extend_from_slice(&encoded[read..read + to_read]); - read += to_read; - } - } - assert_eq!(decoded, expected); - } - #[test] fn test_skip_padding() { let mut encoder = RleEncoder::new(1, 1024); @@ -491,67 +534,67 @@ mod tests { (0..3).for_each(|_| 
encoder.put(1)); let data = ByteBufferPtr::new(encoder.consume()); - let mut decoder = ColumnLevelDecoderImpl::new(1); + let mut decoder = RepetitionLevelDecoderImpl::new(1); decoder.set_data(Encoding::RLE, data.clone()); - let (records, levels) = decoder.skip_rep_levels(100, 4).unwrap(); - assert_eq!(records, 1); + let (_, levels) = decoder.skip_rep_levels(100, 4).unwrap(); assert_eq!(levels, 4); // The length of the final bit packed run is ambiguous, so without the correct // levels limit, it will decode zero padding - let mut decoder = ColumnLevelDecoderImpl::new(1); + let mut decoder = RepetitionLevelDecoderImpl::new(1); decoder.set_data(Encoding::RLE, data); - let (records, levels) = decoder.skip_rep_levels(100, 6).unwrap(); - assert_eq!(records, 3); + let (_, levels) = decoder.skip_rep_levels(100, 6).unwrap(); assert_eq!(levels, 6); } #[test] - fn test_skip() { + fn test_skip_rep_levels() { for _ in 0..10 { let mut rng = thread_rng(); let total_len = 10000_usize; - let encoded: Vec = (0..total_len).map(|_| rng.gen_range(0..5)).collect(); + let mut encoded: Vec = + (0..total_len).map(|_| rng.gen_range(0..5)).collect(); + encoded[0] = 0; let mut encoder = RleEncoder::new(3, 1024); for v in &encoded { encoder.put(*v as _) } let data = ByteBufferPtr::new(encoder.consume()); - test_skip_levels(&encoded, data.clone(), |decoder, read, to_read| { - let (values_skipped, levels_skipped) = - decoder.skip_def_levels(to_read, 5).unwrap(); - assert_eq!(levels_skipped, to_read); - - let expected = &encoded[*read..*read + to_read]; - let expected_values_skipped = - expected.iter().filter(|x| **x == 5).count(); - assert_eq!(values_skipped, expected_values_skipped); - *read += to_read; - }); - - test_skip_levels(&encoded, data.clone(), |decoder, read, to_read| { - let remaining_levels = total_len - *read; - let (records_skipped, levels_skipped) = - decoder.skip_rep_levels(to_read, remaining_levels).unwrap(); - - assert!(levels_skipped <= remaining_levels); - - // If not run out of values - if levels_skipped + *read != encoded.len() { - // Should have read correct number of records - assert_eq!(records_skipped, to_read); - // Next value should be start of record - assert_eq!(encoded[levels_skipped + *read], 0); + let mut decoder = RepetitionLevelDecoderImpl::new(5); + decoder.set_data(Encoding::RLE, data); + + let total_records = encoded.iter().filter(|x| **x == 0).count(); + let mut remaining_records = total_records; + let mut remaining_levels = encoded.len(); + loop { + let skip = rng.gen_bool(0.5); + let records = rng.gen_range(1..=remaining_records.min(5)); + let (records_read, levels_read) = if skip { + decoder.skip_rep_levels(records, remaining_levels).unwrap() + } else { + let mut decoded = vec![0; remaining_levels]; + let (records_read, levels_read) = decoder + .read_rep_levels(&mut decoded, 0..remaining_levels, records) + .unwrap(); + + assert_eq!( + decoded[..levels_read], + encoded[encoded.len() - remaining_levels..][..levels_read] + ); + (records_read, levels_read) + }; + + remaining_levels = remaining_levels.checked_sub(levels_read).unwrap(); + if remaining_levels == 0 { + assert_eq!(records_read + 1, records); + assert_eq!(records, remaining_records); + break; } - - let expected = &encoded[*read..*read + levels_skipped]; - let expected_records_skipped = - expected.iter().filter(|x| **x == 0).count(); - assert_eq!(records_skipped, expected_records_skipped); - - *read += levels_skipped; - }); + assert_eq!(records_read, records); + remaining_records -= records; + 
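// An illustrative aside on the record-delimiting rule exercised by this test:
// as documented on `RepetitionLevelDecoder`, a record only ends once a
// subsequent repetition level of 0 (or the end of the column) is seen, so a
// trailing run of levels is still a "partial" record from the decoder's point
// of view. The sketch below is hypothetical and loosely mirrors the
// fresh-buffer case of `count_records`; it is not part of the patch.
fn complete_records_sketch() {
    let rep_levels = [0i16, 1, 1, 0, 1];
    // A zero that is not the first buffered level closes the preceding record.
    let complete = rep_levels
        .iter()
        .enumerate()
        .filter(|&(idx, &level)| level == 0 && idx != 0)
        .count();
    // Only [0, 1, 1] is known to be complete; the trailing [0, 1] may continue
    // on a later page or be closed by the end of the column.
    assert_eq!(complete, 1);
}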
assert_ne!(remaining_records, 0); + } } } } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index fc5e29b03256..93dff1b46f42 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -2332,7 +2332,7 @@ mod tests { let mut actual_def_levels = def_levels.map(|_| vec![0i16; max_batch_size]); let mut actual_rep_levels = rep_levels.map(|_| vec![0i16; max_batch_size]); - let (values_read, levels_read) = read_fully( + let (_, values_read, levels_read) = read_fully( reader, max_batch_size, actual_def_levels.as_mut(), @@ -2409,11 +2409,11 @@ mod tests { mut def_levels: Option<&mut Vec>, mut rep_levels: Option<&mut Vec>, values: &mut [T::T], - ) -> (usize, usize) { + ) -> (usize, usize, usize) { let actual_def_levels = def_levels.as_mut().map(|vec| &mut vec[..]); let actual_rep_levels = rep_levels.as_mut().map(|vec| &mut vec[..]); reader - .read_batch(batch_size, actual_def_levels, actual_rep_levels, values) + .read_records(batch_size, actual_def_levels, actual_rep_levels, values) .unwrap() } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index defdaad321d8..15240e33c514 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1632,12 +1632,12 @@ mod tests { let mut out = [0; 4]; let c1 = row_group.get_column_reader(0).unwrap(); let mut c1 = get_typed_column_reader::(c1); - c1.read_batch(4, None, None, &mut out).unwrap(); + c1.read_records(4, None, None, &mut out).unwrap(); assert_eq!(out, column_data[0]); let c2 = row_group.get_column_reader(1).unwrap(); let mut c2 = get_typed_column_reader::(c2); - c2.read_batch(4, None, None, &mut out).unwrap(); + c2.read_records(4, None, None, &mut out).unwrap(); assert_eq!(out, column_data[1]); }; diff --git a/parquet/src/record/triplet.rs b/parquet/src/record/triplet.rs index 14a4a39454fd..67c407b3a05c 100644 --- a/parquet/src/record/triplet.rs +++ b/parquet/src/record/triplet.rs @@ -295,8 +295,11 @@ impl TypedTripletIter { fn read_next(&mut self) -> Result { self.curr_triplet_index += 1; - if self.curr_triplet_index >= self.triplets_left { - let (values_read, levels_read) = { + // A loop is required to handle the case of a batch size of 1, as in such a case + // on reaching the end of a record, read_records will return `Ok((1, 0, 0))` + // and therefore not advance `self.triplets_left` + while self.curr_triplet_index >= self.triplets_left { + let (records_read, values_read, levels_read) = { // Get slice of definition levels, if available let def_levels = self.def_levels.as_mut().map(|vec| &mut vec[..]); @@ -304,7 +307,7 @@ impl TypedTripletIter { let rep_levels = self.rep_levels.as_mut().map(|vec| &mut vec[..]); // Buffer triplets - self.reader.read_batch( + self.reader.read_records( self.batch_size, def_levels, rep_levels, @@ -313,7 +316,7 @@ impl TypedTripletIter { }; // No more values or levels to read - if values_read == 0 && levels_read == 0 { + if records_read == 0 && values_read == 0 && levels_read == 0 { self.has_next = false; return Ok(false); } From 2846cde87de2d51afbe4ab642b31c861f152de6f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 9 Jun 2023 09:43:34 +0100 Subject: [PATCH 1001/1411] Consolidate ByteArray::from_iterator (#4386) --- arrow-array/src/array/binary_array.rs | 46 +------------------- arrow-array/src/array/byte_array.rs | 23 ++++++++++ arrow-array/src/array/string_array.rs | 61 +-------------------------- 3 files changed, 26 insertions(+), 104 deletions(-) diff --git 
a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index a4d64040ceff..e809d3a6d615 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -19,7 +19,7 @@ use crate::types::{ByteArrayType, GenericBinaryType}; use crate::{ Array, GenericByteArray, GenericListArray, GenericStringArray, OffsetSizeTrait, }; -use arrow_buffer::{bit_util, Buffer, MutableBuffer}; +use arrow_buffer::MutableBuffer; use arrow_data::ArrayData; use arrow_schema::DataType; @@ -174,49 +174,6 @@ impl From> } } -impl FromIterator> - for GenericBinaryArray -where - Ptr: AsRef<[u8]>, -{ - fn from_iter>>(iter: I) -> Self { - let iter = iter.into_iter(); - let (_, data_len) = iter.size_hint(); - let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. - - let mut offsets = Vec::with_capacity(data_len + 1); - let mut values = Vec::new(); - let mut null_buf = MutableBuffer::new_null(data_len); - let mut length_so_far: OffsetSize = OffsetSize::zero(); - offsets.push(length_so_far); - - { - let null_slice = null_buf.as_slice_mut(); - - for (i, s) in iter.enumerate() { - if let Some(s) = s { - let s = s.as_ref(); - bit_util::set_bit(null_slice, i); - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - values.extend_from_slice(s); - } - // always add an element in offsets - offsets.push(length_so_far); - } - } - - // calculate actual data_len, which may be different from the iterator's upper bound - let data_len = offsets.len() - 1; - let array_data = ArrayData::builder(Self::DATA_TYPE) - .len(data_len) - .add_buffer(Buffer::from_vec(offsets)) - .add_buffer(Buffer::from_vec(values)) - .null_bit_buffer(Some(null_buf.into())); - let array_data = unsafe { array_data.build_unchecked() }; - Self::from(array_data) - } -} - /// An array of `[u8]` using `i32` offsets /// /// The byte length of each element is represented by an i32. 
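// An illustrative sketch of the behaviour preserved by the consolidation in
// this commit (#4386): `FromIterator` for the binary and string arrays now
// routes through the shared `GenericByteArray`/`GenericByteBuilder`
// implementation, so collecting optional values still works unchanged. The
// function below is hypothetical and not part of the diff.
fn from_iterator_sketch() {
    use arrow_array::{Array, BinaryArray, StringArray};

    let strings: StringArray =
        vec![Some("hello"), None, Some("world")].into_iter().collect();
    assert_eq!(strings.len(), 3);
    assert_eq!(strings.null_count(), 1);

    let bytes: BinaryArray =
        vec![Some(&b"hello"[..]), None].into_iter().collect();
    assert_eq!(bytes.len(), 2);
    assert_eq!(bytes.null_count(), 1);
}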
@@ -301,6 +258,7 @@ pub type LargeBinaryArray = GenericBinaryArray; mod tests { use super::*; use crate::{ListArray, StringArray}; + use arrow_buffer::Buffer; use arrow_schema::Field; use std::sync::Arc; diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 629ffd22cdc2..563e965e5e45 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -456,6 +456,29 @@ impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray { } } +impl<'a, Ptr, T: ByteArrayType> FromIterator<&'a Option> for GenericByteArray +where + Ptr: AsRef + 'a, +{ + fn from_iter>>(iter: I) -> Self { + iter.into_iter() + .map(|o| o.as_ref().map(|p| p.as_ref())) + .collect() + } +} + +impl FromIterator> for GenericByteArray +where + Ptr: AsRef, +{ + fn from_iter>>(iter: I) -> Self { + let iter = iter.into_iter(); + let mut builder = GenericByteBuilder::with_capacity(iter.size_hint().0, 1024); + builder.extend(iter); + builder.finish() + } +} + #[cfg(test)] mod tests { use crate::{BinaryArray, StringArray}; diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 8a1c0bd150d8..ecc3e3eaba23 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -17,7 +17,7 @@ use crate::types::GenericStringType; use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait}; -use arrow_buffer::{bit_util, MutableBuffer}; +use arrow_buffer::MutableBuffer; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; @@ -105,65 +105,6 @@ impl GenericStringArray { } } -impl<'a, Ptr, OffsetSize: OffsetSizeTrait> FromIterator<&'a Option> - for GenericStringArray -where - Ptr: AsRef + 'a, -{ - /// Creates a [`GenericStringArray`] based on an iterator of `Option` references. - fn from_iter>>(iter: I) -> Self { - // Convert each owned Ptr into &str and wrap in an owned `Option` - let iter = iter.into_iter().map(|o| o.as_ref().map(|p| p.as_ref())); - // Build a `GenericStringArray` with the resulting iterator - iter.collect::>() - } -} - -impl FromIterator> - for GenericStringArray -where - Ptr: AsRef, -{ - /// Creates a [`GenericStringArray`] based on an iterator of [`Option`]s - fn from_iter>>(iter: I) -> Self { - let iter = iter.into_iter(); - let (_, data_len) = iter.size_hint(); - let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. 
- - let offset_size = std::mem::size_of::(); - let mut offsets = MutableBuffer::new((data_len + 1) * offset_size); - let mut values = MutableBuffer::new(0); - let mut null_buf = MutableBuffer::new_null(data_len); - let null_slice = null_buf.as_slice_mut(); - let mut length_so_far = OffsetSize::zero(); - offsets.push(length_so_far); - - for (i, s) in iter.enumerate() { - let value_bytes = if let Some(ref s) = s { - // set null bit - bit_util::set_bit(null_slice, i); - let s_bytes = s.as_ref().as_bytes(); - length_so_far += OffsetSize::from_usize(s_bytes.len()).unwrap(); - s_bytes - } else { - b"" - }; - values.extend_from_slice(value_bytes); - offsets.push(length_so_far); - } - - // calculate actual data_len, which may be different from the iterator's upper bound - let data_len = (offsets.len() / offset_size) - 1; - let array_data = ArrayData::builder(Self::DATA_TYPE) - .len(data_len) - .add_buffer(offsets.into()) - .add_buffer(values.into()) - .null_bit_buffer(Some(null_buf.into())); - let array_data = unsafe { array_data.build_unchecked() }; - Self::from(array_data) - } -} - impl From> for GenericStringArray { From 13abf8803567f974c056c50322e0e557aa853678 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 9 Jun 2023 15:32:46 +0100 Subject: [PATCH 1002/1411] Cleanup downcast macros (#4391) --- arrow-array/src/cast.rs | 56 ++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 1f8bb6587e58..e92e19eb3c7f 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -64,7 +64,7 @@ macro_rules! repeat_pat { /// [`DataType`]: arrow_schema::DataType #[macro_export] macro_rules! downcast_integer { - ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($($p:pat),+ => $fallback:expr $(,)*)*) => { + ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($p:pat => $fallback:expr $(,)*)*) => { match ($($data_type),+) { $crate::repeat_pat!(arrow_schema::DataType::Int8, $($data_type),+) => { $m!($crate::types::Int8Type $(, $args)*) @@ -90,7 +90,7 @@ macro_rules! downcast_integer { $crate::repeat_pat!(arrow_schema::DataType::UInt64, $($data_type),+) => { $m!($crate::types::UInt64Type $(, $args)*) } - $(($($p),+) => $fallback,)* + $($p => $fallback,)* } }; } @@ -127,7 +127,7 @@ macro_rules! downcast_integer { /// [`DataType`]: arrow_schema::DataType #[macro_export] macro_rules! downcast_run_end_index { - ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($($p:pat),+ => $fallback:expr $(,)*)*) => { + ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($p:pat => $fallback:expr $(,)*)*) => { match ($($data_type),+) { $crate::repeat_pat!(arrow_schema::DataType::Int16, $($data_type),+) => { $m!($crate::types::Int16Type $(, $args)*) @@ -138,7 +138,7 @@ macro_rules! downcast_run_end_index { $crate::repeat_pat!(arrow_schema::DataType::Int64, $($data_type),+) => { $m!($crate::types::Int64Type $(, $args)*) } - $(($($p),+) => $fallback,)* + $($p => $fallback,)* } }; } @@ -170,7 +170,7 @@ macro_rules! downcast_run_end_index { /// [`DataType`]: arrow_schema::DataType #[macro_export] macro_rules! 
downcast_temporal { - ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($($p:pat),+ => $fallback:expr $(,)*)*) => { + ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($p:pat => $fallback:expr $(,)*)*) => { match ($($data_type),+) { $crate::repeat_pat!(arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second), $($data_type),+) => { $m!($crate::types::Time32SecondType $(, $args)*) @@ -202,7 +202,7 @@ macro_rules! downcast_temporal { $crate::repeat_pat!(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _), $($data_type),+) => { $m!($crate::types::TimestampNanosecondType $(, $args)*) } - $(($($p),+) => $fallback,)* + $($p => $fallback,)* } }; } @@ -237,16 +237,16 @@ macro_rules! downcast_temporal_array { ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { $crate::downcast_temporal_array!($values => {$e} $($p => $fallback)*) }; - (($($values:ident),+) => $e:block $($($p:pat),+ => $fallback:expr $(,)*)*) => { - $crate::downcast_temporal_array!($($values),+ => $e $($($p),+ => $fallback)*) + (($($values:ident),+) => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { + $crate::downcast_temporal_array!($($values),+ => {$e} $($p => $fallback)*) }; - (($($values:ident),+) => $e:block $(($($p:pat),+) => $fallback:expr $(,)*)*) => { - $crate::downcast_temporal_array!($($values),+ => $e $($($p),+ => $fallback)*) + ($($values:ident),+ => $e:block $($p:pat => $fallback:expr $(,)*)*) => { + $crate::downcast_temporal_array!(($($values),+) => $e $($p => $fallback)*) }; - ($($values:ident),+ => $e:block $($($p:pat),+ => $fallback:expr $(,)*)*) => { + (($($values:ident),+) => $e:block $($p:pat => $fallback:expr $(,)*)*) => { $crate::downcast_temporal!{ $($values.data_type()),+ => ($crate::downcast_primitive_array_helper, $($values),+, $e), - $($($p),+ => $fallback,)* + $($p => $fallback,)* } }; } @@ -281,7 +281,7 @@ macro_rules! downcast_temporal_array { /// [`DataType`]: arrow_schema::DataType #[macro_export] macro_rules! downcast_primitive { - ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($($p:pat),+ => $fallback:expr $(,)*)*) => { + ($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($p:pat => $fallback:expr $(,)*)*) => { $crate::downcast_integer! { $($data_type),+ => ($m $(, $args)*), $crate::repeat_pat!(arrow_schema::DataType::Float16, $($data_type),+) => { @@ -323,7 +323,7 @@ macro_rules! downcast_primitive { _ => { $crate::downcast_temporal! { $($data_type),+ => ($m $(, $args)*), - $($($p),+ => $fallback,)* + $($p => $fallback,)* } } } @@ -369,16 +369,16 @@ macro_rules! 
downcast_primitive_array { ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { $crate::downcast_primitive_array!($values => {$e} $($p => $fallback)*) }; - (($($values:ident),+) => $e:block $($($p:pat),+ => $fallback:expr $(,)*)*) => { - $crate::downcast_primitive_array!($($values),+ => $e $($($p),+ => $fallback)*) + (($($values:ident),+) => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { + $crate::downcast_primitive_array!($($values),+ => {$e} $($p => $fallback)*) }; - (($($values:ident),+) => $e:block $(($($p:pat),+) => $fallback:expr $(,)*)*) => { - $crate::downcast_primitive_array!($($values),+ => $e $($($p),+ => $fallback)*) + ($($values:ident),+ => $e:block $($p:pat => $fallback:expr $(,)*)*) => { + $crate::downcast_primitive_array!(($($values),+) => $e $($p => $fallback)*) }; - ($($values:ident),+ => $e:block $($($p:pat),+ => $fallback:expr $(,)*)*) => { + (($($values:ident),+) => $e:block $($p:pat => $fallback:expr $(,)*)*) => { $crate::downcast_primitive!{ $($values.data_type()),+ => ($crate::downcast_primitive_array_helper, $($values),+, $e), - $($($p),+ => $fallback,)* + $($p => $fallback,)* } }; } @@ -577,7 +577,7 @@ macro_rules! downcast_run_array { } /// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`GenericListArray`], panic'ing on failure. +/// [`GenericListArray`], panicking on failure. pub fn as_generic_list_array( arr: &dyn Array, ) -> &GenericListArray { @@ -587,14 +587,14 @@ pub fn as_generic_list_array( } /// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`ListArray`], panic'ing on failure. +/// [`ListArray`], panicking on failure. #[inline] pub fn as_list_array(arr: &dyn Array) -> &ListArray { as_generic_list_array::(arr) } /// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`FixedSizeListArray`], panic'ing on failure. +/// [`FixedSizeListArray`], panicking on failure. #[inline] pub fn as_fixed_size_list_array(arr: &dyn Array) -> &FixedSizeListArray { arr.as_any() @@ -603,14 +603,14 @@ pub fn as_fixed_size_list_array(arr: &dyn Array) -> &FixedSizeListArray { } /// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`LargeListArray`], panic'ing on failure. +/// [`LargeListArray`], panicking on failure. #[inline] pub fn as_large_list_array(arr: &dyn Array) -> &LargeListArray { as_generic_list_array::(arr) } /// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`GenericBinaryArray`], panic'ing on failure. +/// [`GenericBinaryArray`], panicking on failure. #[inline] pub fn as_generic_binary_array( arr: &dyn Array, @@ -621,7 +621,7 @@ pub fn as_generic_binary_array( } /// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`StringArray`], panic'ing on failure. +/// [`StringArray`], panicking on failure. /// /// # Example /// @@ -640,7 +640,7 @@ pub fn as_string_array(arr: &dyn Array) -> &StringArray { } /// Force downcast of an [`Array`], such as an [`ArrayRef`] to -/// [`BooleanArray`], panic'ing on failure. +/// [`BooleanArray`], panicking on failure. /// /// # Example /// @@ -675,7 +675,7 @@ macro_rules! 
array_downcast_fn { array_downcast_fn!( $name, $arrty, - concat!("[`", stringify!($arrty), "`], panic'ing on failure.") + concat!("[`", stringify!($arrty), "`], panicking on failure.") ); }; } From 9b2b4cae81f55114e04b170f26e8a9b84bbb62f2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 9 Jun 2023 15:52:34 +0100 Subject: [PATCH 1003/1411] Documentation Improvements (#4381) * Reorder crate documentation * Add backlinks * More examples * Link from typedefs to generic types * Restore examples * Example cleanup * Review feedback --- arrow-array/src/array/binary_array.rs | 8 +- arrow-array/src/array/boolean_array.rs | 60 ++++----- arrow-array/src/array/byte_array.rs | 46 +++++++ arrow-array/src/array/dictionary_array.rs | 67 +++++++--- arrow-array/src/array/list_array.rs | 54 +-------- arrow-array/src/array/primitive_array.rs | 141 ++++++++++++++++------ arrow-array/src/array/string_array.rs | 10 +- arrow-buffer/src/lib.rs | 2 +- arrow-data/src/lib.rs | 4 +- arrow/src/lib.rs | 68 +++++------ 10 files changed, 283 insertions(+), 177 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index e809d3a6d615..54839604d192 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -23,7 +23,7 @@ use arrow_buffer::MutableBuffer; use arrow_data::ArrayData; use arrow_schema::DataType; -/// See [`BinaryArray`] and [`LargeBinaryArray`] for storing binary data +/// A [`GenericBinaryArray`] for storing `[u8]` pub type GenericBinaryArray = GenericByteArray>; impl GenericBinaryArray { @@ -174,7 +174,7 @@ impl From> } } -/// An array of `[u8]` using `i32` offsets +/// A [`GenericBinaryArray`] of `[u8]` using `i32` offsets /// /// The byte length of each element is represented by an i32. 
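// An illustrative usage sketch for the `downcast_primitive_array!` macro whose
// fallback-arm patterns were simplified in the previous commit (#4391); the
// helper function below is hypothetical and not part of either diff.
fn print_primitive_sketch(array: &dyn arrow_array::Array) {
    use arrow_array::{downcast_primitive_array, Array};
    downcast_primitive_array!(
        array => {
            // `array` is rebound here to the concrete PrimitiveArray type
            for v in array {
                println!("{v:?}");
            }
        }
        t => println!("unsupported datatype {t}"),
    )
}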
/// @@ -213,9 +213,10 @@ impl From> /// assert!(!array.is_null(4)); /// ``` /// +/// See [`GenericByteArray`] for more information and examples pub type BinaryArray = GenericBinaryArray; -/// An array of `[u8]` using `i64` offsets +/// A [`GenericBinaryArray`] of `[u8]` using `i64` offsets /// /// # Examples /// @@ -252,6 +253,7 @@ pub type BinaryArray = GenericBinaryArray; /// assert!(!array.is_null(4)); /// ``` /// +/// See [`GenericByteArray`] for more information and examples pub type LargeBinaryArray = GenericBinaryArray; #[cfg(test)] diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 6905baa806de..e99b71b1846e 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -27,51 +27,43 @@ use std::sync::Arc; /// An array of [boolean values](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) /// -/// # Examples +/// # Example: From a Vec /// -/// Construction +/// ``` +/// # use arrow_array::{Array, BooleanArray}; +/// let arr: BooleanArray = vec![true, true, false].into(); +/// ``` +/// +/// # Example: From an optional Vec /// /// ``` -///# use arrow_array::{Array, BooleanArray}; -/// // Create from Vec> -/// let arr = BooleanArray::from(vec![Some(false), Some(true), None, Some(true)]); -/// // Create from Vec -/// let arr = BooleanArray::from(vec![false, true, true]); -/// // Create from iter/collect -/// let arr: BooleanArray = std::iter::repeat(Some(true)).take(10).collect(); +/// # use arrow_array::{Array, BooleanArray}; +/// let arr: BooleanArray = vec![Some(true), None, Some(false)].into(); /// ``` /// -/// Construction and Access +/// # Example: From an iterator /// /// ``` -/// use arrow_array::{Array, BooleanArray}; -/// let arr = BooleanArray::from(vec![Some(false), Some(true), None, Some(true)]); -/// assert_eq!(4, arr.len()); -/// assert_eq!(1, arr.null_count()); -/// assert!(arr.is_valid(0)); -/// assert!(!arr.is_null(0)); -/// assert_eq!(false, arr.value(0)); -/// assert!(!arr.is_valid(2)); -/// assert!(arr.is_null(2)); +/// # use arrow_array::{Array, BooleanArray}; +/// let arr: BooleanArray = (0..5).map(|x| (x % 2 == 0).then(|| x % 3 == 0)).collect(); +/// let values: Vec<_> = arr.iter().collect(); +/// assert_eq!(&values, &[Some(true), None, Some(false), None, Some(false)]) /// ``` /// -/// Using `collect` +/// # Example: Using Builder +/// /// ``` -/// use arrow_array::{Array, BooleanArray}; -/// let v = vec![Some(false), Some(true), Some(false), Some(true)]; -/// let arr = v.into_iter().collect::(); -/// assert_eq!(4, arr.len()); -/// assert_eq!(0, arr.offset()); -/// assert_eq!(0, arr.null_count()); -/// assert!(arr.is_valid(0)); -/// assert_eq!(false, arr.value(0)); -/// assert!(arr.is_valid(1)); -/// assert_eq!(true, arr.value(1)); -/// assert!(arr.is_valid(2)); -/// assert_eq!(false, arr.value(2)); -/// assert!(arr.is_valid(3)); -/// assert_eq!(true, arr.value(3)); +/// # use arrow_array::Array; +/// # use arrow_array::builder::BooleanBuilder; +/// let mut builder = BooleanBuilder::new(); +/// builder.append_value(true); +/// builder.append_null(); +/// builder.append_value(false); +/// let array = builder.finish(); +/// let values: Vec<_> = array.iter().collect(); +/// assert_eq!(&values, &[Some(true), None, Some(false)]) /// ``` +/// #[derive(Clone)] pub struct BooleanArray { values: BooleanBuffer, diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 563e965e5e45..0a18062d9ae1 100644 --- 
a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -34,6 +34,52 @@ use std::sync::Arc; /// /// See [`BinaryArray`] and [`LargeBinaryArray`] for storing arbitrary bytes /// +/// # Example: From a Vec +/// +/// ``` +/// # use arrow_array::{Array, GenericByteArray, types::Utf8Type}; +/// let arr: GenericByteArray = vec!["hello", "world", ""].into(); +/// assert_eq!(arr.value_data(), b"helloworld"); +/// assert_eq!(arr.value_offsets(), &[0, 5, 10, 10]); +/// let values: Vec<_> = arr.iter().collect(); +/// assert_eq!(values, &[Some("hello"), Some("world"), Some("")]); +/// ``` +/// +/// # Example: From an optional Vec +/// +/// ``` +/// # use arrow_array::{Array, GenericByteArray, types::Utf8Type}; +/// let arr: GenericByteArray = vec![Some("hello"), Some("world"), Some(""), None].into(); +/// assert_eq!(arr.value_data(), b"helloworld"); +/// assert_eq!(arr.value_offsets(), &[0, 5, 10, 10, 10]); +/// let values: Vec<_> = arr.iter().collect(); +/// assert_eq!(values, &[Some("hello"), Some("world"), Some(""), None]); +/// ``` +/// +/// # Example: From an iterator of option +/// +/// ``` +/// # use arrow_array::{Array, GenericByteArray, types::Utf8Type}; +/// let arr: GenericByteArray = (0..5).map(|x| (x % 2 == 0).then(|| x.to_string())).collect(); +/// let values: Vec<_> = arr.iter().collect(); +/// assert_eq!(values, &[Some("0"), None, Some("2"), None, Some("4")]); +/// ``` +/// +/// # Example: Using Builder +/// +/// ``` +/// # use arrow_array::Array; +/// # use arrow_array::builder::GenericByteBuilder; +/// # use arrow_array::types::Utf8Type; +/// let mut builder = GenericByteBuilder::::new(); +/// builder.append_value("hello"); +/// builder.append_null(); +/// builder.append_value("world"); +/// let array = builder.finish(); +/// let values: Vec<_> = array.iter().collect(); +/// assert_eq!(values, &[Some("hello"), None, Some("world")]); +/// ``` +/// /// [`StringArray`]: crate::StringArray /// [`LargeStringArray`]: crate::LargeStringArray /// [`BinaryArray`]: crate::BinaryArray diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index a319a836a955..b9112d103a89 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -30,7 +30,7 @@ use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; -/// A dictionary array indexed by `i8` +/// A [`DictionaryArray`] indexed by `i8` /// /// # Example: Using `collect` /// ``` @@ -42,9 +42,11 @@ use std::sync::Arc; /// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2])); /// assert_eq!(array.values(), &values); /// ``` +/// +/// See [`DictionaryArray`] for more information and examples pub type Int8DictionaryArray = DictionaryArray; -/// A dictionary array indexed by `i16` +/// A [`DictionaryArray`] indexed by `i16` /// /// # Example: Using `collect` /// ``` @@ -56,9 +58,11 @@ pub type Int8DictionaryArray = DictionaryArray; /// assert_eq!(array.keys(), &Int16Array::from(vec![0, 0, 1, 2])); /// assert_eq!(array.values(), &values); /// ``` +/// +/// See [`DictionaryArray`] for more information and examples pub type Int16DictionaryArray = DictionaryArray; -/// A dictionary array indexed by `i32` +/// A [`DictionaryArray`] indexed by `i32` /// /// # Example: Using `collect` /// ``` @@ -70,9 +74,11 @@ pub type Int16DictionaryArray = DictionaryArray; /// assert_eq!(array.keys(), &Int32Array::from(vec![0, 0, 1, 2])); /// assert_eq!(array.values(), &values); /// ``` +/// +/// See [`DictionaryArray`] for more 
information and examples pub type Int32DictionaryArray = DictionaryArray; -/// A dictionary array indexed by `i64` +/// A [`DictionaryArray`] indexed by `i64` /// /// # Example: Using `collect` /// ``` @@ -84,9 +90,11 @@ pub type Int32DictionaryArray = DictionaryArray; /// assert_eq!(array.keys(), &Int64Array::from(vec![0, 0, 1, 2])); /// assert_eq!(array.values(), &values); /// ``` +/// +/// See [`DictionaryArray`] for more information and examples pub type Int64DictionaryArray = DictionaryArray; -/// A dictionary array indexed by `u8` +/// A [`DictionaryArray`] indexed by `u8` /// /// # Example: Using `collect` /// ``` @@ -98,9 +106,11 @@ pub type Int64DictionaryArray = DictionaryArray; /// assert_eq!(array.keys(), &UInt8Array::from(vec![0, 0, 1, 2])); /// assert_eq!(array.values(), &values); /// ``` +/// +/// See [`DictionaryArray`] for more information and examples pub type UInt8DictionaryArray = DictionaryArray; -/// A dictionary array indexed by `u16` +/// A [`DictionaryArray`] indexed by `u16` /// /// # Example: Using `collect` /// ``` @@ -112,9 +122,11 @@ pub type UInt8DictionaryArray = DictionaryArray; /// assert_eq!(array.keys(), &UInt16Array::from(vec![0, 0, 1, 2])); /// assert_eq!(array.values(), &values); /// ``` +/// +/// See [`DictionaryArray`] for more information and examples pub type UInt16DictionaryArray = DictionaryArray; -/// A dictionary array indexed by `u32` +/// A [`DictionaryArray`] indexed by `u32` /// /// # Example: Using `collect` /// ``` @@ -126,9 +138,11 @@ pub type UInt16DictionaryArray = DictionaryArray; /// assert_eq!(array.keys(), &UInt32Array::from(vec![0, 0, 1, 2])); /// assert_eq!(array.values(), &values); /// ``` +/// +/// See [`DictionaryArray`] for more information and examples pub type UInt32DictionaryArray = DictionaryArray; -/// A dictionary array indexed by `u64` +/// A [`DictionaryArray`] indexed by `u64` /// /// # Example: Using `collect` /// ``` @@ -140,6 +154,8 @@ pub type UInt32DictionaryArray = DictionaryArray; /// assert_eq!(array.keys(), &UInt64Array::from(vec![0, 0, 1, 2])); /// assert_eq!(array.values(), &values); /// ``` +/// +/// See [`DictionaryArray`] for more information and examples pub type UInt64DictionaryArray = DictionaryArray; /// An array of [dictionary encoded values](https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout) @@ -175,39 +191,54 @@ pub type UInt64DictionaryArray = DictionaryArray; /// length = 6 /// ``` /// -/// Example **with nullable** data: +/// # Example: From Nullable Data /// /// ``` -/// use arrow_array::{DictionaryArray, Int8Array, types::Int8Type}; +/// # use arrow_array::{DictionaryArray, Int8Array, types::Int8Type}; /// let test = vec!["a", "a", "b", "c"]; /// let array : DictionaryArray = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect(); /// assert_eq!(array.keys(), &Int8Array::from(vec![Some(0), Some(0), None, Some(1)])); /// ``` /// -/// Example **without nullable** data: +/// # Example: From Non-Nullable Data /// /// ``` -/// use arrow_array::{DictionaryArray, Int8Array, types::Int8Type}; +/// # use arrow_array::{DictionaryArray, Int8Array, types::Int8Type}; /// let test = vec!["a", "a", "b", "c"]; /// let array : DictionaryArray = test.into_iter().collect(); /// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2])); /// ``` /// -/// Example from existing arrays: +/// # Example: From Existing Arrays /// /// ``` -/// use std::sync::Arc; -/// use arrow_array::{DictionaryArray, Int8Array, StringArray, types::Int8Type}; +/// # use std::sync::Arc; +/// # 
use arrow_array::{DictionaryArray, Int8Array, StringArray, types::Int8Type}; /// // You can form your own DictionaryArray by providing the /// // values (dictionary) and keys (indexes into the dictionary): /// let values = StringArray::from_iter_values(["a", "b", "c"]); /// let keys = Int8Array::from_iter_values([0, 0, 1, 2]); /// let array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); -/// let expected: DictionaryArray:: = vec!["a", "a", "b", "c"] -/// .into_iter() -/// .collect(); +/// let expected: DictionaryArray:: = vec!["a", "a", "b", "c"].into_iter().collect(); /// assert_eq!(&array, &expected); /// ``` +/// +/// # Example: Using Builder +/// +/// ``` +/// # use arrow_array::{Array, StringArray}; +/// # use arrow_array::builder::StringDictionaryBuilder; +/// # use arrow_array::types::Int32Type; +/// let mut builder = StringDictionaryBuilder::::new(); +/// builder.append_value("a"); +/// builder.append_null(); +/// builder.append_value("a"); +/// builder.append_value("b"); +/// let array = builder.finish(); +/// +/// let values: Vec<_> = array.downcast_dict::().unwrap().into_iter().collect(); +/// assert_eq!(&values, &[Some("a"), None, Some("a"), Some("b")]); +/// ``` pub struct DictionaryArray { data_type: DataType, diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index abb5ba5e3c0b..2205d846ea34 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -57,6 +57,8 @@ impl OffsetSizeTrait for i64 { /// An array of [variable length arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout) /// /// See [`ListArray`] and [`LargeListArray`]` +/// +/// See [`GenericListBuilder`](crate::builder::GenericListBuilder) for how to construct a [`GenericListArray`] pub struct GenericListArray { data_type: DataType, nulls: Option, @@ -472,58 +474,14 @@ impl std::fmt::Debug for GenericListArray(data); -/// -/// assert_eq!(false, list_array.is_valid(1)); -/// -/// let list0 = list_array.value(0); -/// let list2 = list_array.value(2); -/// let list3 = list_array.value(3); +/// A [`GenericListArray`] of variable size lists, storing offsets as `i32`. /// -/// assert_eq!(&[] as &[i32], list0.as_any().downcast_ref::().unwrap().values()); -/// assert_eq!(false, list2.as_any().downcast_ref::().unwrap().is_valid(1)); -/// assert_eq!(&[6, 7], list3.as_any().downcast_ref::().unwrap().values()); -/// ``` +// See [`ListBuilder`](crate::builder::ListBuilder) for how to construct a [`ListArray`] pub type ListArray = GenericListArray; -/// An array of variable size lists, storing offsets as `i64`. -/// -/// # Example -/// -/// ``` -/// # use arrow_array::{Array, LargeListArray, Int32Array, types::Int32Type}; -/// # use arrow_schema::DataType; -/// let data = vec![ -/// Some(vec![]), -/// None, -/// Some(vec![Some(3), None, Some(5), Some(19)]), -/// Some(vec![Some(6), Some(7)]), -/// ]; -/// let list_array = LargeListArray::from_iter_primitive::(data); -/// -/// assert_eq!(false, list_array.is_valid(1)); -/// -/// let list0 = list_array.value(0); -/// let list2 = list_array.value(2); -/// let list3 = list_array.value(3); +/// A [`GenericListArray`] of variable size lists, storing offsets as `i64`. 
/// -/// assert_eq!(&[] as &[i32], list0.as_any().downcast_ref::().unwrap().values()); -/// assert_eq!(false, list2.as_any().downcast_ref::().unwrap().is_valid(1)); -/// assert_eq!(&[6, 7], list3.as_any().downcast_ref::().unwrap().values()); -/// ``` +// See [`LargeListBuilder`](crate::builder::LargeListBuilder) for how to construct a [`LargeListArray`] pub type LargeListArray = GenericListArray; #[cfg(test)] diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 7220aca8f44b..b821ad1b4422 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -34,7 +34,7 @@ use half::f16; use std::any::Any; use std::sync::Arc; -/// An array of `i8` +/// A [`PrimitiveArray`] of `i8` /// /// # Examples /// @@ -49,9 +49,11 @@ use std::sync::Arc; /// // Create iter/collect /// let arr: Int8Array = std::iter::repeat(42).take(10).collect(); /// ``` +/// +/// See [`PrimitiveArray`] for more information and examples pub type Int8Array = PrimitiveArray; -/// An array of `i16` +/// A [`PrimitiveArray`] of `i16` /// /// # Examples /// @@ -66,9 +68,11 @@ pub type Int8Array = PrimitiveArray; /// // Create iter/collect /// let arr: Int16Array = std::iter::repeat(42).take(10).collect(); /// ``` +/// +/// See [`PrimitiveArray`] for more information and examples pub type Int16Array = PrimitiveArray; -/// An array of `i32` +/// A [`PrimitiveArray`] of `i32` /// /// # Examples /// @@ -83,9 +87,11 @@ pub type Int16Array = PrimitiveArray; /// // Create iter/collect /// let arr: Int32Array = std::iter::repeat(42).take(10).collect(); /// ``` +/// +/// See [`PrimitiveArray`] for more information and examples pub type Int32Array = PrimitiveArray; -/// An array of `i64` +/// A [`PrimitiveArray`] of `i64` /// /// # Examples /// @@ -100,9 +106,11 @@ pub type Int32Array = PrimitiveArray; /// // Create iter/collect /// let arr: Int64Array = std::iter::repeat(42).take(10).collect(); /// ``` +/// +/// See [`PrimitiveArray`] for more information and examples pub type Int64Array = PrimitiveArray; -/// An array of `u8` +/// A [`PrimitiveArray`] of `u8` /// /// # Examples /// @@ -117,9 +125,11 @@ pub type Int64Array = PrimitiveArray; /// // Create iter/collect /// let arr: UInt8Array = std::iter::repeat(42).take(10).collect(); /// ``` +/// +/// See [`PrimitiveArray`] for more information and examples pub type UInt8Array = PrimitiveArray; -/// An array of `u16` +/// A [`PrimitiveArray`] of `u16` /// /// # Examples /// @@ -134,9 +144,11 @@ pub type UInt8Array = PrimitiveArray; /// // Create iter/collect /// let arr: UInt16Array = std::iter::repeat(42).take(10).collect(); /// ``` +/// +/// See [`PrimitiveArray`] for more information and examples pub type UInt16Array = PrimitiveArray; -/// An array of `u32` +/// A [`PrimitiveArray`] of `u32` /// /// # Examples /// @@ -151,9 +163,11 @@ pub type UInt16Array = PrimitiveArray; /// // Create iter/collect /// let arr: UInt32Array = std::iter::repeat(42).take(10).collect(); /// ``` +/// +/// See [`PrimitiveArray`] for more information and examples pub type UInt32Array = PrimitiveArray; -/// An array of `u64` +/// A [`PrimitiveArray`] of `u64` /// /// # Examples /// @@ -168,9 +182,11 @@ pub type UInt32Array = PrimitiveArray; /// // Create iter/collect /// let arr: UInt64Array = std::iter::repeat(42).take(10).collect(); /// ``` +/// +/// See [`PrimitiveArray`] for more information and examples pub type UInt64Array = PrimitiveArray; -/// An array of `f16` +/// A [`PrimitiveArray`] of `f16` /// /// # Examples 
/// @@ -193,9 +209,11 @@ pub type UInt64Array = PrimitiveArray; /// use half::f16; /// let arr : Float16Array = [Some(f16::from_f64(1.0)), Some(f16::from_f64(2.0))].into_iter().collect(); /// ``` +/// +/// See [`PrimitiveArray`] for more information and examples pub type Float16Array = PrimitiveArray; -/// An array of `f32` +/// A [`PrimitiveArray`] of `f32` /// /// # Examples /// @@ -210,9 +228,11 @@ pub type Float16Array = PrimitiveArray; /// // Create iter/collect /// let arr: Float32Array = std::iter::repeat(42.0).take(10).collect(); /// ``` +/// +/// See [`PrimitiveArray`] for more information and examples pub type Float32Array = PrimitiveArray; -/// An array of `f64` +/// A [`PrimitiveArray`] of `f64` /// /// # Examples /// @@ -227,9 +247,11 @@ pub type Float32Array = PrimitiveArray; /// // Create iter/collect /// let arr: Float32Array = std::iter::repeat(42.0).take(10).collect(); /// ``` +/// +/// See [`PrimitiveArray`] for more information and examples pub type Float64Array = PrimitiveArray; -/// An array of seconds since UNIX epoch stored as `i64` +/// A [`PrimitiveArray`] of seconds since UNIX epoch stored as `i64` /// /// This type is similar to the [`chrono::DateTime`] type and can hold /// values such as `1970-05-09 14:25:11 +01:00` @@ -275,83 +297,82 @@ pub type Float64Array = PrimitiveArray; /// assert_eq!(arr.value_as_datetime_with_tz(0, sydney_tz).map(|v| v.to_string()).unwrap(), "1970-05-10 00:25:11 +10:00") /// ``` /// +/// See [`PrimitiveArray`] for more information and examples pub type TimestampSecondArray = PrimitiveArray; -/// An array of milliseconds since UNIX epoch stored as `i64` +/// A [`PrimitiveArray`] of milliseconds since UNIX epoch stored as `i64` /// /// See examples for [`TimestampSecondArray`] pub type TimestampMillisecondArray = PrimitiveArray; -/// An array of microseconds since UNIX epoch stored as `i64` +/// A [`PrimitiveArray`] of microseconds since UNIX epoch stored as `i64` /// /// See examples for [`TimestampSecondArray`] pub type TimestampMicrosecondArray = PrimitiveArray; -/// An array of nanoseconds since UNIX epoch stored as `i64` +/// A [`PrimitiveArray`] of nanoseconds since UNIX epoch stored as `i64` /// /// See examples for [`TimestampSecondArray`] pub type TimestampNanosecondArray = PrimitiveArray; -// TODO: give examples for the below types - -/// An array of days since UNIX epoch stored as `i32` +/// A [`PrimitiveArray`] of days since UNIX epoch stored as `i32` /// /// This type is similar to the [`chrono::NaiveDate`] type and can hold /// values such as `2018-11-13` pub type Date32Array = PrimitiveArray; -/// An array of milliseconds since UNIX epoch stored as `i64` +/// A [`PrimitiveArray`] of milliseconds since UNIX epoch stored as `i64` /// /// This type is similar to the [`chrono::NaiveDate`] type and can hold /// values such as `2018-11-13` pub type Date64Array = PrimitiveArray; -/// An array of seconds since midnight stored as `i32` +/// A [`PrimitiveArray`] of seconds since midnight stored as `i32` /// /// This type is similar to the [`chrono::NaiveTime`] type and can /// hold values such as `00:02:00` pub type Time32SecondArray = PrimitiveArray; -/// An array of milliseconds since midnight stored as `i32` +/// A [`PrimitiveArray`] of milliseconds since midnight stored as `i32` /// /// This type is similar to the [`chrono::NaiveTime`] type and can /// hold values such as `00:02:00.123` pub type Time32MillisecondArray = PrimitiveArray; -/// An array of microseconds since midnight stored as `i64` +/// A [`PrimitiveArray`] of 
microseconds since midnight stored as `i64` /// /// This type is similar to the [`chrono::NaiveTime`] type and can /// hold values such as `00:02:00.123456` pub type Time64MicrosecondArray = PrimitiveArray; -/// An array of nanoseconds since midnight stored as `i64` +/// A [`PrimitiveArray`] of nanoseconds since midnight stored as `i64` /// /// This type is similar to the [`chrono::NaiveTime`] type and can /// hold values such as `00:02:00.123456789` pub type Time64NanosecondArray = PrimitiveArray; -/// An array of “calendar” intervals in months +/// A [`PrimitiveArray`] of “calendar” intervals in months pub type IntervalYearMonthArray = PrimitiveArray; -/// An array of “calendar” intervals in days and milliseconds +/// A [`PrimitiveArray`] of “calendar” intervals in days and milliseconds pub type IntervalDayTimeArray = PrimitiveArray; -/// An array of “calendar” intervals in months, days, and nanoseconds +/// A [`PrimitiveArray`] of “calendar” intervals in months, days, and nanoseconds pub type IntervalMonthDayNanoArray = PrimitiveArray; -/// An array of elapsed durations in seconds +/// A [`PrimitiveArray`] of elapsed durations in seconds pub type DurationSecondArray = PrimitiveArray; -/// An array of elapsed durations in milliseconds +/// A [`PrimitiveArray`] of elapsed durations in milliseconds pub type DurationMillisecondArray = PrimitiveArray; -/// An array of elapsed durations in microseconds +/// A [`PrimitiveArray`] of elapsed durations in microseconds pub type DurationMicrosecondArray = PrimitiveArray; -/// An array of elapsed durations in nanoseconds +/// A [`PrimitiveArray`] of elapsed durations in nanoseconds pub type DurationNanosecondArray = PrimitiveArray; -/// An array of 128-bit fixed point decimals +/// A [`PrimitiveArray`] of 128-bit fixed point decimals /// /// # Examples /// @@ -366,9 +387,11 @@ pub type DurationNanosecondArray = PrimitiveArray; /// // Create iter/collect /// let arr: Decimal128Array = std::iter::repeat(42).take(10).collect(); /// ``` +/// +/// See [`PrimitiveArray`] for more information and examples pub type Decimal128Array = PrimitiveArray; -/// An array of 256-bit fixed point decimals +/// A [`PrimitiveArray`] of 256-bit fixed point decimals /// /// # Examples /// @@ -384,23 +407,73 @@ pub type Decimal128Array = PrimitiveArray; /// // Create iter/collect /// let arr: Decimal256Array = std::iter::repeat(i256::from(42)).take(10).collect(); /// ``` +/// +/// See [`PrimitiveArray`] for more information and examples pub type Decimal256Array = PrimitiveArray; pub use crate::types::ArrowPrimitiveType; /// An array of [primitive values](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) /// +/// # Example: From a Vec +/// +/// ``` +/// # use arrow_array::{Array, PrimitiveArray, types::Int32Type}; +/// let arr: PrimitiveArray = vec![1, 2, 3, 4].into(); +/// assert_eq!(4, arr.len()); +/// assert_eq!(0, arr.null_count()); +/// assert_eq!(arr.values(), &[1, 2, 3, 4]) +/// ``` +/// +/// # Example: From an optional Vec +/// +/// ``` +/// # use arrow_array::{Array, PrimitiveArray, types::Int32Type}; +/// let arr: PrimitiveArray = vec![Some(1), None, Some(3), None].into(); +/// assert_eq!(4, arr.len()); +/// assert_eq!(2, arr.null_count()); +/// // Note: values for null indexes are arbitrary +/// assert_eq!(arr.values(), &[1, 0, 3, 0]) +/// ``` +/// /// # Example: From an iterator of values /// /// ``` -/// use arrow_array::{Array, PrimitiveArray, types::Int32Type}; -/// let arr: PrimitiveArray = 
PrimitiveArray::from_iter_values((0..10).map(|x| x + 1)); +/// # use arrow_array::{Array, PrimitiveArray, types::Int32Type}; +/// let arr: PrimitiveArray = (0..10).map(|x| x + 1).collect(); /// assert_eq!(10, arr.len()); /// assert_eq!(0, arr.null_count()); /// for i in 0..10i32 { /// assert_eq!(i + 1, arr.value(i as usize)); /// } /// ``` +/// +/// # Example: From an iterator of option +/// +/// ``` +/// # use arrow_array::{Array, PrimitiveArray, types::Int32Type}; +/// let arr: PrimitiveArray = (0..10).map(|x| (x % 2 == 0).then_some(x)).collect(); +/// assert_eq!(10, arr.len()); +/// assert_eq!(5, arr.null_count()); +/// // Note: values for null indexes are arbitrary +/// assert_eq!(arr.values(), &[0, 0, 2, 0, 4, 0, 6, 0, 8, 0]) +/// ``` +/// +/// # Example: Using Builder +/// +/// ``` +/// # use arrow_array::Array; +/// # use arrow_array::builder::PrimitiveBuilder; +/// # use arrow_array::types::Int32Type; +/// let mut builder = PrimitiveBuilder::::new(); +/// builder.append_value(1); +/// builder.append_null(); +/// builder.append_value(2); +/// let array = builder.finish(); +/// // Note: values for null indexes are arbitrary +/// assert_eq!(array.values(), &[1, 0, 2]); +/// assert!(array.is_null(1)); +/// ``` pub struct PrimitiveArray { data_type: DataType, /// Values data diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index ecc3e3eaba23..f9a3a5fbd095 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -21,7 +21,7 @@ use arrow_buffer::MutableBuffer; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; -/// See [`StringArray`] and [`LargeStringArray`] for storing string data +/// A [`GenericByteArray`] for storing `str` pub type GenericStringArray = GenericByteArray>; impl GenericStringArray { @@ -149,7 +149,7 @@ impl From> for GenericStringArray From> for GenericStringArray; -/// An array of `str` using `i64` offsets +/// A [`GenericStringArray`] of `str` using `i64` offsets /// /// # Examples /// @@ -197,6 +199,8 @@ pub type StringArray = GenericStringArray; /// let array = LargeStringArray::from(vec![Some("foo"), None, Some("bar")]); /// assert_eq!(array.value(2), "bar"); /// ``` +/// +/// See [`GenericByteArray`] for more information and examples pub type LargeStringArray = GenericStringArray; #[cfg(test)] diff --git a/arrow-buffer/src/lib.rs b/arrow-buffer/src/lib.rs index 90b801c4ae29..cbcdb979e693 100644 --- a/arrow-buffer/src/lib.rs +++ b/arrow-buffer/src/lib.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Buffer abstractions for [Apache Arrow](https://docs.rs/arrow) +//! Low-level buffer abstractions for [Apache Arrow Rust](https://docs.rs/arrow) pub mod alloc; pub mod buffer; diff --git a/arrow-data/src/lib.rs b/arrow-data/src/lib.rs index b864b786051a..cfa0dba66c35 100644 --- a/arrow-data/src/lib.rs +++ b/arrow-data/src/lib.rs @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. -//! Array data abstractions for [Apache Arrow](https://docs.rs/arrow) +//! Low-level array data abstractions for [Apache Arrow Rust](https://docs.rs/arrow) +//! +//! For a higher-level, strongly-typed interface see [arrow_array](https://docs.rs/arrow_array) mod data; pub use data::*; diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 70e615e88c73..bf39bae530b9 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -21,38 +21,6 @@ //! 
Please see the [arrow crates.io](https://crates.io/crates/arrow) //! page for feature flags and tips to improve performance. //! -//! # Crate Topology -//! -//! The [`arrow`] project is implemented as multiple sub-crates, which are then re-exported by -//! this top-level crate. -//! -//! Crate authors can choose to depend on this top-level crate, or just -//! the sub-crates they need. -//! -//! The current list of sub-crates is: -//! -//! * [`arrow-arith`][arrow_arith] - arithmetic kernels -//! * [`arrow-array`][arrow_array] - type-safe arrow array abstractions -//! * [`arrow-buffer`][arrow_buffer] - buffer abstractions for arrow arrays -//! * [`arrow-cast`][arrow_cast] - cast kernels for arrow arrays -//! * [`arrow-csv`][arrow_csv] - read/write CSV to arrow format -//! * [`arrow-data`][arrow_data] - the underlying data of arrow arrays -//! * [`arrow-ipc`][arrow_ipc] - read/write IPC to arrow format -//! * [`arrow-json`][arrow_json] - read/write JSON to arrow format -//! * [`arrow-ord`][arrow_ord] - ordering kernels for arrow arrays -//! * [`arrow-row`][arrow_row] - comparable row format -//! * [`arrow-schema`][arrow_schema] - the logical types for arrow arrays -//! * [`arrow-select`][arrow_select] - selection kernels for arrow arrays -//! * [`arrow-string`][arrow_string] - string kernels for arrow arrays -//! -//! _This list is likely to grow as further functionality is split out from the top-level crate_ -//! -//! Some functionality is also distributed independently of this crate: -//! -//! * [`arrow-flight`] - support for [Arrow Flight RPC] -//! * [`arrow-integration-test`] - support for [Arrow JSON Test Format] -//! * [`parquet`](https://docs.rs/parquet/latest/parquet/) - support for [Apache Parquet] -//! //! # Columnar Format //! //! The [`array`] module provides statically typed implementations of all the array types as defined @@ -73,7 +41,7 @@ //! ``` //! //! It is also possible to write generic code. For example, the following is generic over -//! all primitively typed arrays: +//! all primitively typed arrays //! //! ```rust //! # use std::iter::Sum; @@ -92,7 +60,7 @@ //! assert_eq!(sum(&TimestampNanosecondArray::from(vec![1, 2, 3])), 6); //! ``` //! -//! And the following is generic over all arrays with comparable values: +//! And the following is generic over all arrays with comparable values //! //! ```rust //! # use arrow::array::{ArrayAccessor, ArrayIter, Int32Array, StringArray}; @@ -109,7 +77,7 @@ //! assert_eq!(min(&StringArray::from(vec!["b", "a", "c"])), Some("a")); //! ``` //! -//! For more examples, and details consult the [arrow_array] docs. +//! **For more examples, and details consult the [arrow_array] docs.** //! //! # Type Erasure / Trait Objects //! @@ -317,6 +285,36 @@ //! assert_eq!(string.value(1), "foo"); //! ``` //! +//! # Crate Topology +//! +//! The [`arrow`] project is implemented as multiple sub-crates, which are then re-exported by +//! this top-level crate. +//! +//! Crate authors can choose to depend on this top-level crate, or just +//! the sub-crates they need. +//! +//! The current list of sub-crates is: +//! +//! * [`arrow-arith`][arrow_arith] - arithmetic kernels +//! * [`arrow-array`][arrow_array] - type-safe arrow array abstractions +//! * [`arrow-buffer`][arrow_buffer] - buffer abstractions for arrow arrays +//! * [`arrow-cast`][arrow_cast] - cast kernels for arrow arrays +//! * [`arrow-csv`][arrow_csv] - read/write CSV to arrow format +//! * [`arrow-data`][arrow_data] - the underlying data of arrow arrays +//! 
* [`arrow-ipc`][arrow_ipc] - read/write IPC to arrow format +//! * [`arrow-json`][arrow_json] - read/write JSON to arrow format +//! * [`arrow-ord`][arrow_ord] - ordering kernels for arrow arrays +//! * [`arrow-row`][arrow_row] - comparable row format +//! * [`arrow-schema`][arrow_schema] - the logical types for arrow arrays +//! * [`arrow-select`][arrow_select] - selection kernels for arrow arrays +//! * [`arrow-string`][arrow_string] - string kernels for arrow arrays +//! +//! Some functionality is also distributed independently of this crate: +//! +//! * [`arrow-flight`] - support for [Arrow Flight RPC] +//! * [`arrow-integration-test`] - support for [Arrow JSON Test Format] +//! * [`parquet`](https://docs.rs/parquet/latest/parquet/) - support for [Apache Parquet] +//! //! # Safety and Security //! //! Like many crates, this crate makes use of unsafe where prudent. However, it endeavours to be From 7bedb0a3ac642395daaa5318cb71f8f5c994657b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 9 Jun 2023 13:18:00 -0400 Subject: [PATCH 1004/1411] Improve parquet WriterProperites and ReaderProperties docs (#4392) --- parquet/src/file/properties.rs | 139 ++++++++++++++++++--------------- 1 file changed, 74 insertions(+), 65 deletions(-) diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 66690463aa3c..9724fd7f4cde 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -15,55 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! [`WriterProperties`] -//! -//! # Usage -//! -//! ```rust -//! use parquet::{ -//! basic::{Compression, Encoding}, -//! file::properties::*, -//! schema::types::ColumnPath, -//! }; -//! -//! // Create properties with default configuration. -//! let props = WriterProperties::default(); -//! -//! // Use properties builder to set certain options and assemble the configuration. -//! let props = WriterProperties::builder() -//! .set_writer_version(WriterVersion::PARQUET_1_0) -//! .set_encoding(Encoding::PLAIN) -//! .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED) -//! .set_compression(Compression::SNAPPY) -//! .build(); -//! -//! assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0); -//! assert_eq!( -//! props.encoding(&ColumnPath::from("col1")), -//! Some(Encoding::DELTA_BINARY_PACKED) -//! ); -//! assert_eq!( -//! props.encoding(&ColumnPath::from("col2")), -//! Some(Encoding::PLAIN) -//! ); -//! ``` -//! -//! Reader properties. -//! -//! # Usage -//! -//! ```rust -//! use parquet::file::properties::ReaderProperties; -//! -//! // Create properties with default configuration. -//! let props = ReaderProperties::builder().build(); -//! -//! // Use properties builder to set certain options and assemble the configuration. -//! let props = ReaderProperties::builder() -//! .set_backward_compatible_lz4(false) -//! .build(); -//! ``` - +//! 
Configuration via [`WriterProperties`] and [`ReaderProperties`] use std::{collections::HashMap, sync::Arc}; use crate::basic::{Compression, Encoding}; @@ -72,20 +24,30 @@ use crate::file::metadata::KeyValue; use crate::format::SortingColumn; use crate::schema::types::ColumnPath; -const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; -const DEFAULT_WRITE_BATCH_SIZE: usize = 1024; -const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0; -const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED; -const DEFAULT_DICTIONARY_ENABLED: bool = true; -const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE; -const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page; -const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; -const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; -const DEFAULT_CREATED_BY: &str = +/// Default value for [`WriterProperties::data_page_size_limit`] +pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; +/// Default value for [`WriterProperties::write_batch_size`] +pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024; +/// Default value for [`WriterProperties::writer_version`] +pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0; +/// Default value for [`WriterProperties::compression`] +pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED; +/// Default value for [`WriterProperties::dictionary_enabled`] +pub const DEFAULT_DICTIONARY_ENABLED: bool = true; +/// Default value for [`WriterProperties::dictionary_page_size_limit`] +pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE; +/// Default value for [`WriterProperties::statistics_enabled`] +pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page; +/// Default value for [`WriterProperties::max_statistics_size`] +pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; +/// Default value for [`WriterProperties::max_row_group_size`] +pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; +/// Default value for [`WriterProperties::created_by`] +pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION")); -/// default value for the false positive probability used in a bloom filter. +/// Default value for [`BloomFilterProperties::fpp`] pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05; -/// default value for the expected number of distinct values used in a bloom filter. +/// Default value for [`BloomFilterProperties::ndv`] pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64; /// Parquet writer version. @@ -111,10 +73,41 @@ impl WriterVersion { /// Reference counted writer properties. pub type WriterPropertiesPtr = Arc; -/// Writer properties. +/// Configuration settings for writing parquet files. /// /// All properties except the key-value metadata are immutable, /// use [`WriterPropertiesBuilder`] to assemble these properties. +/// +/// # Example +/// +/// ```rust +/// use parquet::{ +/// basic::{Compression, Encoding}, +/// file::properties::*, +/// schema::types::ColumnPath, +/// }; +/// +/// // Create properties with default configuration. +/// let props = WriterProperties::default(); +/// +/// // Use properties builder to set certain options and assemble the configuration. 
+/// let props = WriterProperties::builder() +/// .set_writer_version(WriterVersion::PARQUET_1_0) +/// .set_encoding(Encoding::PLAIN) +/// .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED) +/// .set_compression(Compression::SNAPPY) +/// .build(); +/// +/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0); +/// assert_eq!( +/// props.encoding(&ColumnPath::from("col1")), +/// Some(Encoding::DELTA_BINARY_PACKED) +/// ); +/// assert_eq!( +/// props.encoding(&ColumnPath::from("col2")), +/// Some(Encoding::PLAIN) +/// ); +/// ``` #[derive(Debug, Clone)] pub struct WriterProperties { data_page_size_limit: usize, @@ -307,7 +300,8 @@ impl WriterProperties { } } -/// Writer properties builder. +/// Builder for parquet file writer configuration. See example on +/// [`WriterProperties`] pub struct WriterPropertiesBuilder { data_page_size_limit: usize, dictionary_page_size_limit: usize, @@ -809,10 +803,24 @@ pub type ReaderPropertiesPtr = Arc; const DEFAULT_READ_BLOOM_FILTER: bool = false; -/// Reader properties. +/// Configuration settings for reading parquet files. /// /// All properties are immutable and `Send` + `Sync`. /// Use [`ReaderPropertiesBuilder`] to assemble these properties. +/// +/// # Example +/// +/// ```rust +/// use parquet::file::properties::ReaderProperties; +/// +/// // Create properties with default configuration. +/// let props = ReaderProperties::builder().build(); +/// +/// // Use properties builder to set certain options and assemble the configuration. +/// let props = ReaderProperties::builder() +/// .set_backward_compatible_lz4(false) +/// .build(); +/// ``` pub struct ReaderProperties { codec_options: CodecOptions, read_bloom_filter: bool, @@ -835,7 +843,8 @@ impl ReaderProperties { } } -/// Reader properties builder. +/// Builder for parquet file reader configuration. See example on +/// [`ReaderProperties`] pub struct ReaderPropertiesBuilder { codec_options_builder: CodecOptionsBuilder, read_bloom_filter: Option, From 44bf3e77677e22dcb984cff68c41658e85febb01 Mon Sep 17 00:00:00 2001 From: Okue Date: Sun, 11 Jun 2023 02:19:40 +0900 Subject: [PATCH 1005/1411] Fix typo in README (#4394) --- arrow-flight/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 86ef8f00b70e..d4fddba70b7c 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -33,7 +33,7 @@ arrow-flight = "39.0.0" Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. This crate provides a Rust implementation of the -[Flight.proto](../../format/Flight.proto) gRPC protocol and +[Flight.proto](../format/Flight.proto) gRPC protocol and [examples](https://github.com/apache/arrow-rs/tree/master/arrow-flight/examples) that demonstrate how to build a Flight server implemented with [tonic](https://docs.rs/crate/tonic/latest). 
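The properties patch above documents the writer defaults and makes the `DEFAULT_*` constants public. A minimal sketch of how downstream code could rely on them, assuming the `max_row_group_size` and `data_page_size_limit` getters that the new doc links point at:

```rust
use parquet::file::properties::{
    WriterProperties, DEFAULT_MAX_ROW_GROUP_SIZE, DEFAULT_PAGE_SIZE,
};

// A default-constructed WriterProperties should report the newly public constants
// (the getter names here are assumed from the doc links in the patch above).
let props = WriterProperties::default();
assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
```
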
From 2462d3604593c3df5f9c815dbc2486ebaaf3a596 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Sun, 11 Jun 2023 21:11:25 +0300 Subject: [PATCH 1006/1411] Truncate Min/Max values in the Column Index (#4389) * Initial work * Slight rename * Update parquet/src/column/writer/mod.rs Co-authored-by: Will Jones * Handle utf8 vs binary truncation, include increases * Small cleanup * Update parquet/src/file/properties.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update parquet/src/file/properties.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update parquet/src/column/writer/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Review notes * Added handeling for some more cases - including not truncating non-BinaryArray data * Update parquet/src/column/writer/mod.rs Co-authored-by: Will Jones * Handels increment better and some refactoring * Nicer handeling of physical type * More review notes --------- Co-authored-by: Will Jones Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- parquet/src/column/writer/mod.rs | 347 +++++++++++++++++++++++++++++-- parquet/src/file/metadata.rs | 8 +- parquet/src/file/properties.rs | 24 +++ 3 files changed, 359 insertions(+), 20 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 93dff1b46f42..7a84680fabb0 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -20,6 +20,7 @@ use crate::bloom_filter::Sbbf; use crate::format::{ColumnIndex, OffsetIndex}; use std::collections::{BTreeSet, VecDeque}; +use std::str; use crate::basic::{Compression, ConvertedType, Encoding, LogicalType, PageType, Type}; use crate::column::page::{CompressedPage, Page, PageWriteSpec, PageWriter}; @@ -656,8 +657,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { if null_page && self.column_index_builder.valid() { self.column_index_builder.append( null_page, - &[0; 1], - &[0; 1], + vec![0; 1], + vec![0; 1], self.page_metrics.num_page_nulls as i64, ); } else if self.column_index_builder.valid() { @@ -668,19 +669,54 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { self.column_index_builder.to_invalid(); } Some(stat) => { - self.column_index_builder.append( - null_page, - stat.min_bytes(), - stat.max_bytes(), - self.page_metrics.num_page_nulls as i64, - ); + // We only truncate if the data is represented as binary + match self.descr.physical_type() { + Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => { + self.column_index_builder.append( + null_page, + self.truncate_min_value(stat.min_bytes()), + self.truncate_max_value(stat.max_bytes()), + self.page_metrics.num_page_nulls as i64, + ); + } + _ => { + self.column_index_builder.append( + null_page, + stat.min_bytes().to_vec(), + stat.max_bytes().to_vec(), + self.page_metrics.num_page_nulls as i64, + ); + } + } } } + + // update the offset index + self.offset_index_builder + .append_row_count(self.page_metrics.num_buffered_rows as i64); } + } - // update the offset index - self.offset_index_builder - .append_row_count(self.page_metrics.num_buffered_rows as i64); + fn truncate_min_value(&self, data: &[u8]) -> Vec { + self.props + .column_index_truncate_length() + .filter(|l| data.len() > *l) + .and_then(|l| match str::from_utf8(data) { + Ok(str_data) => truncate_utf8(str_data, l), + Err(_) => truncate_binary(data, l), + }) + .unwrap_or_else(|| data.to_vec()) + } + + fn 
truncate_max_value(&self, data: &[u8]) -> Vec { + self.props + .column_index_truncate_length() + .filter(|l| data.len() > *l) + .and_then(|l| match str::from_utf8(data) { + Ok(str_data) => truncate_utf8(str_data, l).and_then(increment_utf8), + Err(_) => truncate_binary(data, l).and_then(increment), + }) + .unwrap_or_else(|| data.to_vec()) } /// Adds data page. @@ -1152,9 +1188,76 @@ fn compare_greater_byte_array_decimals(a: &[u8], b: &[u8]) -> bool { (a[1..]) > (b[1..]) } +/// Truncate a UTF8 slice to the longest prefix that is still a valid UTF8 string, while being less than `length` bytes. +fn truncate_utf8(data: &str, length: usize) -> Option> { + // We return values like that at an earlier stage in the process. + assert!(data.len() >= length); + let mut char_indices = data.char_indices(); + + // We know `data` is a valid UTF8 encoded string, which means it has at least one valid UTF8 byte, which will make this loop exist. + while let Some((idx, c)) = char_indices.next_back() { + let split_point = idx + c.len_utf8(); + if split_point <= length { + return data.as_bytes()[0..split_point].to_vec().into(); + } + } + + None +} + +/// Truncate a binary slice to make sure its length is less than `length` +fn truncate_binary(data: &[u8], length: usize) -> Option> { + // We return values like that at an earlier stage in the process. + assert!(data.len() >= length); + // If all bytes are already maximal, no need to truncate + + data[0..length].to_vec().into() +} + +/// Try and increment the bytes from right to left. +/// +/// Returns `None` if all bytes are set to `u8::MAX`. +fn increment(mut data: Vec) -> Option> { + for byte in data.iter_mut().rev() { + let (incremented, overflow) = byte.overflowing_add(1); + *byte = incremented; + + if !overflow { + return Some(data); + } + } + + None +} + +/// Try and increment the the string's bytes from right to left, returning when the result is a valid UTF8 string. +/// Returns `None` when it can't increment any byte. +fn increment_utf8(mut data: Vec) -> Option> { + for idx in (0..data.len()).rev() { + let original = data[idx]; + let (mut byte, mut overflow) = data[idx].overflowing_add(1); + + // Until overflow: 0xFF -> 0x00 + while !overflow { + data[idx] = byte; + + if str::from_utf8(&data).is_ok() { + return Some(data); + } + (byte, overflow) = data[idx].overflowing_add(1); + } + + data[idx] = original; + } + + None +} + #[cfg(test)] mod tests { - use crate::format::BoundaryOrder; + use crate::{ + file::properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, format::BoundaryOrder, + }; use bytes::Bytes; use rand::distributions::uniform::SampleUniform; use std::sync::Arc; @@ -2197,11 +2300,9 @@ mod tests { if let Statistics::Int32(stats) = stats { // first page is [1,2,3,4] // second page is [-5,2,4,8] + // note that we don't increment here, as this is a non BinaryArray type. 
assert_eq!(stats.min_bytes(), column_index.min_values[1].as_slice()); - assert_eq!( - stats.max_bytes(), - column_index.max_values.get(1).unwrap().as_slice() - ); + assert_eq!(stats.max_bytes(), column_index.max_values.get(1).unwrap()); } else { panic!("expecting Statistics::Int32"); } @@ -2220,12 +2321,226 @@ mod tests { ); } + /// Verify min/max value truncation in the column index works as expected + #[test] + fn test_column_offset_index_metadata_truncating() { + // write data + // and check the offset index and column index + let page_writer = get_test_page_writer(); + let props = Default::default(); + let mut writer = + get_test_column_writer::(page_writer, 0, 0, props); + + let mut data = vec![FixedLenByteArray::default(); 3]; + // This is the expected min value - "aaa..." + data[0].set_data(ByteBufferPtr::new(vec![97_u8; 200])); + // This is the expected max value - "ZZZ..." + data[1].set_data(ByteBufferPtr::new(vec![112_u8; 200])); + data[2].set_data(ByteBufferPtr::new(vec![98_u8; 200])); + + writer.write_batch(&data, None, None).unwrap(); + + writer.flush_data_pages().unwrap(); + + let r = writer.close().unwrap(); + let column_index = r.column_index.unwrap(); + let offset_index = r.offset_index.unwrap(); + + assert_eq!(3, r.rows_written); + + // column index + assert_eq!(1, column_index.null_pages.len()); + assert_eq!(1, offset_index.page_locations.len()); + assert_eq!(BoundaryOrder::UNORDERED, column_index.boundary_order); + assert!(!column_index.null_pages[0]); + assert_eq!(0, column_index.null_counts.as_ref().unwrap()[0]); + + if let Some(stats) = r.metadata.statistics() { + assert!(stats.has_min_max_set()); + assert_eq!(stats.null_count(), 0); + assert_eq!(stats.distinct_count(), None); + if let Statistics::FixedLenByteArray(stats) = stats { + let column_index_min_value = column_index.min_values.get(0).unwrap(); + let column_index_max_value = column_index.max_values.get(0).unwrap(); + + // Column index stats are truncated, while the column chunk's aren't. 
+ assert_ne!(stats.min_bytes(), column_index_min_value.as_slice()); + assert_ne!(stats.max_bytes(), column_index_max_value.as_slice()); + + assert_eq!( + column_index_min_value.len(), + DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH.unwrap() + ); + assert_eq!(column_index_min_value.as_slice(), &[97_u8; 64]); + assert_eq!( + column_index_max_value.len(), + DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH.unwrap() + ); + + // We expect the last byte to be incremented + assert_eq!( + *column_index_max_value.last().unwrap(), + *column_index_max_value.first().unwrap() + 1 + ); + } else { + panic!("expecting Statistics::FixedLenByteArray"); + } + } else { + panic!("metadata missing statistics"); + } + } + + #[test] + fn test_column_offset_index_truncating_spec_example() { + // write data + // and check the offset index and column index + let page_writer = get_test_page_writer(); + + // Truncate values at 1 byte + let builder = + WriterProperties::builder().set_column_index_truncate_length(Some(1)); + let props = Arc::new(builder.build()); + let mut writer = + get_test_column_writer::(page_writer, 0, 0, props); + + let mut data = vec![FixedLenByteArray::default(); 1]; + // This is the expected min value + data[0].set_data(ByteBufferPtr::new( + String::from("Blart Versenwald III").into_bytes(), + )); + + writer.write_batch(&data, None, None).unwrap(); + + writer.flush_data_pages().unwrap(); + + let r = writer.close().unwrap(); + let column_index = r.column_index.unwrap(); + let offset_index = r.offset_index.unwrap(); + + assert_eq!(1, r.rows_written); + + // column index + assert_eq!(1, column_index.null_pages.len()); + assert_eq!(1, offset_index.page_locations.len()); + assert_eq!(BoundaryOrder::UNORDERED, column_index.boundary_order); + assert!(!column_index.null_pages[0]); + assert_eq!(0, column_index.null_counts.as_ref().unwrap()[0]); + + if let Some(stats) = r.metadata.statistics() { + assert!(stats.has_min_max_set()); + assert_eq!(stats.null_count(), 0); + assert_eq!(stats.distinct_count(), None); + if let Statistics::FixedLenByteArray(_stats) = stats { + let column_index_min_value = column_index.min_values.get(0).unwrap(); + let column_index_max_value = column_index.max_values.get(0).unwrap(); + + assert_eq!(column_index_min_value.len(), 1); + assert_eq!(column_index_max_value.len(), 1); + + assert_eq!("B".as_bytes(), column_index_min_value.as_slice()); + assert_eq!("C".as_bytes(), column_index_max_value.as_slice()); + + assert_ne!(column_index_min_value, stats.min_bytes()); + assert_ne!(column_index_max_value, stats.max_bytes()); + } else { + panic!("expecting Statistics::FixedLenByteArray"); + } + } else { + panic!("metadata missing statistics"); + } + } + #[test] fn test_send() { fn test() {} test::>(); } + #[test] + fn test_increment() { + let v = increment(vec![0, 0, 0]).unwrap(); + assert_eq!(&v, &[0, 0, 1]); + + // Handle overflow + let v = increment(vec![0, 255, 255]).unwrap(); + assert_eq!(&v, &[1, 0, 0]); + + // Return `None` if all bytes are u8::MAX + let v = increment(vec![255, 255, 255]); + assert!(v.is_none()); + } + + #[test] + fn test_increment_utf8() { + // Basic ASCII case + let v = increment_utf8("hello".as_bytes().to_vec()).unwrap(); + assert_eq!(&v, "hellp".as_bytes()); + + // Also show that BinaryArray level comparison works here + let mut greater = ByteArray::new(); + greater.set_data(ByteBufferPtr::new(v)); + let mut original = ByteArray::new(); + original.set_data(ByteBufferPtr::new("hello".as_bytes().to_vec())); + assert!(greater > original); + + // UTF8 string + let s = "❤️🧡💛💚💙💜"; + 
let v = increment_utf8(s.as_bytes().to_vec()).unwrap(); + + if let Ok(new) = String::from_utf8(v) { + assert_ne!(&new, s); + assert_eq!(new, "❤️🧡💛💚💙💝"); + assert!(new.as_bytes().last().unwrap() > s.as_bytes().last().unwrap()); + } else { + panic!("Expected incremented UTF8 string to also be valid.") + } + + // Max UTF8 character - should be a No-Op + let s = char::MAX.to_string(); + assert_eq!(s.len(), 4); + let v = increment_utf8(s.as_bytes().to_vec()); + assert!(v.is_none()); + + // Handle multi-byte UTF8 characters + let s = "a\u{10ffff}"; + let v = increment_utf8(s.as_bytes().to_vec()); + assert_eq!(&v.unwrap(), "b\u{10ffff}".as_bytes()); + } + + #[test] + fn test_truncate_utf8() { + // No-op + let data = "❤️🧡💛💚💙💜"; + let r = truncate_utf8(data, data.as_bytes().len()).unwrap(); + assert_eq!(r.len(), data.as_bytes().len()); + assert_eq!(&r, data.as_bytes()); + println!("len is {}", data.len()); + + // We slice it away from the UTF8 boundary + let r = truncate_utf8(data, 13).unwrap(); + assert_eq!(r.len(), 10); + assert_eq!(&r, "❤️🧡".as_bytes()); + + // One multi-byte code point, and a length shorter than it, so we can't slice it + let r = truncate_utf8("\u{0836}", 1); + assert!(r.is_none()); + } + + #[test] + fn test_truncate_max_binary_chars() { + let r = + truncate_binary(&[0xFF, 0xFE, 0xFD, 0xFF, 0xFF, 0xFF], 5).and_then(increment); + + assert_eq!(&r.unwrap(), &[0xFF, 0xFE, 0xFE, 0x00, 0x00]); + + // We can truncate this slice, but increment it will fail + let truncated = truncate_binary(&[0xFF, 0xFF, 0xFF, 0xFF], 3); + assert!(truncated.is_some()); + + let incremented = truncated.and_then(increment); + assert!(incremented.is_none()) + } + /// Performs write-read roundtrip with randomly generated values and levels. /// `max_size` is maximum number of values or levels (if `max_def_level` > 0) to write /// for a column. 
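The truncation logic above is driven by a new writer property added to `properties.rs` below. A minimal sketch of opting into a tighter limit, or disabling truncation entirely, using the `set_column_index_truncate_length` builder method and `column_index_truncate_length` getter introduced in this patch:

```rust
use parquet::file::properties::WriterProperties;

// Cap column index min/max values at 16 bytes instead of the default 64 bytes.
let props = WriterProperties::builder()
    .set_column_index_truncate_length(Some(16))
    .build();
assert_eq!(props.column_index_truncate_length(), Some(16));

// Passing `None` disables truncation and keeps the full statistics values.
let untruncated = WriterProperties::builder()
    .set_column_index_truncate_length(None)
    .build();
assert_eq!(untruncated.column_index_truncate_length(), None);
```
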
diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 40f6cf3123c7..bb8346306cf9 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -868,13 +868,13 @@ impl ColumnIndexBuilder { pub fn append( &mut self, null_page: bool, - min_value: &[u8], - max_value: &[u8], + min_value: Vec, + max_value: Vec, null_count: i64, ) { self.null_pages.push(null_page); - self.min_values.push(min_value.to_vec()); - self.max_values.push(max_value.to_vec()); + self.min_values.push(min_value); + self.max_values.push(max_value); self.null_counts.push(null_count); } diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 9724fd7f4cde..3d6390c036ae 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -45,6 +45,8 @@ pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; /// Default value for [`WriterProperties::created_by`] pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION")); +/// Default value for [`WriterProperties::column_index_truncate_length`] +pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option = Some(64); /// Default value for [`BloomFilterProperties::fpp`] pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05; /// Default value for [`BloomFilterProperties::ndv`] @@ -121,6 +123,7 @@ pub struct WriterProperties { default_column_properties: ColumnProperties, column_properties: HashMap, sorting_columns: Option>, + column_index_truncate_length: Option, } impl Default for WriterProperties { @@ -219,6 +222,13 @@ impl WriterProperties { self.sorting_columns.as_ref() } + /// Returns the maximum length of truncated min/max values in the column index. + /// + /// `None` if truncation is disabled, must be greater than 0 otherwise. + pub fn column_index_truncate_length(&self) -> Option { + self.column_index_truncate_length + } + /// Returns encoding for a data page, when dictionary encoding is enabled. /// This is not configurable. #[inline] @@ -314,6 +324,7 @@ pub struct WriterPropertiesBuilder { default_column_properties: ColumnProperties, column_properties: HashMap, sorting_columns: Option>, + column_index_truncate_length: Option, } impl WriterPropertiesBuilder { @@ -331,6 +342,7 @@ impl WriterPropertiesBuilder { default_column_properties: Default::default(), column_properties: HashMap::new(), sorting_columns: None, + column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, } } @@ -348,6 +360,7 @@ impl WriterPropertiesBuilder { default_column_properties: self.default_column_properties, column_properties: self.column_properties, sorting_columns: self.sorting_columns, + column_index_truncate_length: self.column_index_truncate_length, } } @@ -620,6 +633,17 @@ impl WriterPropertiesBuilder { self.get_mut_props(col).set_bloom_filter_ndv(value); self } + + /// Sets the max length of min/max value fields in the column index. Must be greater than 0. + /// If set to `None` - there's no effective limit. + pub fn set_column_index_truncate_length(mut self, max_length: Option) -> Self { + if let Some(value) = max_length { + assert!(value > 0, "Cannot have a 0 column index truncate length. 
If you wish to disable min/max value truncation, set it to `None`."); + } + + self.column_index_truncate_length = max_length; + self + } } /// Controls the level of statistics to be computed by the writer From 83adf96af2fd21ea5d659da4c90f53de3e4dea2f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 11 Jun 2023 17:08:11 -0400 Subject: [PATCH 1007/1411] Minor: Derive `Hash` impls for `CastOptions` and `FormatOptions` (#4395) --- arrow-cast/src/cast.rs | 2 +- arrow-cast/src/display.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index ec8559d962e3..32f422768dc3 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -57,7 +57,7 @@ use num::cast::AsPrimitive; use num::{NumCast, ToPrimitive}; /// CastOptions provides a way to override the default cast behaviors -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct CastOptions<'a> { /// how to handle cast failures, either return NULL (safe=true) or return ERR (safe=false) pub safe: bool, diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 1c2ecfc5ed0d..07e78f8984f9 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -39,7 +39,7 @@ type TimeFormat<'a> = Option<&'a str>; /// By default nulls are formatted as `""` and temporal types formatted /// according to RFC3339 /// -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct FormatOptions<'a> { /// If set to `true` any formatting errors will be written to the output /// instead of being converted into a [`std::fmt::Error`] From b12a8e890e22bc851f56b0ecf9f0b606acb5c76f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 12 Jun 2023 12:39:26 +0100 Subject: [PATCH 1008/1411] Faster UTF-8 truncation (#4399) --- parquet/src/column/writer/mod.rs | 58 ++++++++------------------------ 1 file changed, 14 insertions(+), 44 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 7a84680fabb0..4aefef98fd4e 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -703,7 +703,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .filter(|l| data.len() > *l) .and_then(|l| match str::from_utf8(data) { Ok(str_data) => truncate_utf8(str_data, l), - Err(_) => truncate_binary(data, l), + Err(_) => Some(data[..l].to_vec()), }) .unwrap_or_else(|| data.to_vec()) } @@ -714,7 +714,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .filter(|l| data.len() > *l) .and_then(|l| match str::from_utf8(data) { Ok(str_data) => truncate_utf8(str_data, l).and_then(increment_utf8), - Err(_) => truncate_binary(data, l).and_then(increment), + Err(_) => increment(data[..l].to_vec()), }) .unwrap_or_else(|| data.to_vec()) } @@ -1188,30 +1188,11 @@ fn compare_greater_byte_array_decimals(a: &[u8], b: &[u8]) -> bool { (a[1..]) > (b[1..]) } -/// Truncate a UTF8 slice to the longest prefix that is still a valid UTF8 string, while being less than `length` bytes. +/// Truncate a UTF8 slice to the longest prefix that is still a valid UTF8 string, +/// while being less than `length` bytes and non-empty fn truncate_utf8(data: &str, length: usize) -> Option> { - // We return values like that at an earlier stage in the process. 
- assert!(data.len() >= length); - let mut char_indices = data.char_indices(); - - // We know `data` is a valid UTF8 encoded string, which means it has at least one valid UTF8 byte, which will make this loop exist. - while let Some((idx, c)) = char_indices.next_back() { - let split_point = idx + c.len_utf8(); - if split_point <= length { - return data.as_bytes()[0..split_point].to_vec().into(); - } - } - - None -} - -/// Truncate a binary slice to make sure its length is less than `length` -fn truncate_binary(data: &[u8], length: usize) -> Option> { - // We return values like that at an earlier stage in the process. - assert!(data.len() >= length); - // If all bytes are already maximal, no need to truncate - - data[0..length].to_vec().into() + let split = (1..=length).rfind(|x| data.is_char_boundary(*x))?; + Some(data.as_bytes()[..split].to_vec()) } /// Try and increment the bytes from right to left. @@ -1230,24 +1211,19 @@ fn increment(mut data: Vec) -> Option> { None } -/// Try and increment the the string's bytes from right to left, returning when the result is a valid UTF8 string. -/// Returns `None` when it can't increment any byte. +/// Try and increment the the string's bytes from right to left, returning when the result +/// is a valid UTF8 string. Returns `None` when it can't increment any byte. fn increment_utf8(mut data: Vec) -> Option> { for idx in (0..data.len()).rev() { let original = data[idx]; - let (mut byte, mut overflow) = data[idx].overflowing_add(1); - - // Until overflow: 0xFF -> 0x00 - while !overflow { + let (byte, overflow) = original.overflowing_add(1); + if !overflow { data[idx] = byte; - if str::from_utf8(&data).is_ok() { return Some(data); } - (byte, overflow) = data[idx].overflowing_add(1); + data[idx] = original; } - - data[idx] = original; } None @@ -2527,17 +2503,11 @@ mod tests { } #[test] - fn test_truncate_max_binary_chars() { - let r = - truncate_binary(&[0xFF, 0xFE, 0xFD, 0xFF, 0xFF, 0xFF], 5).and_then(increment); - + fn test_increment_max_binary_chars() { + let r = increment(vec![0xFF, 0xFE, 0xFD, 0xFF, 0xFF]); assert_eq!(&r.unwrap(), &[0xFF, 0xFE, 0xFE, 0x00, 0x00]); - // We can truncate this slice, but increment it will fail - let truncated = truncate_binary(&[0xFF, 0xFF, 0xFF, 0xFF], 3); - assert!(truncated.is_some()); - - let incremented = truncated.and_then(increment); + let incremented = increment(vec![0xFF, 0xFF, 0xFF]); assert!(incremented.is_none()) } From 481c197a8e131fbf447246d2b91575f03a295666 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 12 Jun 2023 12:39:35 +0100 Subject: [PATCH 1009/1411] StructBuilder validate child types (#4397) (#4400) --- arrow-array/src/builder/struct_builder.rs | 62 +++++++++++------------ 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index f5e3f2806507..04dc5ba7319e 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -16,9 +16,8 @@ // under the License. 
use crate::builder::*; -use crate::{Array, ArrayRef, StructArray}; +use crate::{ArrayRef, StructArray}; use arrow_buffer::NullBufferBuilder; -use arrow_data::ArrayData; use arrow_schema::{DataType, Fields, IntervalUnit, TimeUnit}; use std::any::Any; use std::sync::Arc; @@ -240,42 +239,24 @@ impl StructBuilder { pub fn finish(&mut self) -> StructArray { self.validate_content(); - let mut child_data = Vec::with_capacity(self.field_builders.len()); - for f in &mut self.field_builders { - let arr = f.finish(); - child_data.push(arr.to_data()); - } - let length = self.len(); + let arrays = self.field_builders.iter_mut().map(|f| f.finish()).collect(); let nulls = self.null_buffer_builder.finish(); - - let builder = ArrayData::builder(DataType::Struct(self.fields.clone())) - .len(length) - .child_data(child_data) - .nulls(nulls); - - let array_data = unsafe { builder.build_unchecked() }; - StructArray::from(array_data) + StructArray::new(self.fields.clone(), arrays, nulls) } /// Builds the `StructArray` without resetting the builder. pub fn finish_cloned(&self) -> StructArray { self.validate_content(); - let mut child_data = Vec::with_capacity(self.field_builders.len()); - for f in &self.field_builders { - let arr = f.finish_cloned(); - child_data.push(arr.to_data()); - } - let length = self.len(); - let nulls = self.null_buffer_builder.finish_cloned(); + let arrays = self + .field_builders + .iter() + .map(|f| f.finish_cloned()) + .collect(); - let builder = ArrayData::builder(DataType::Struct(self.fields.clone())) - .len(length) - .child_data(child_data) - .nulls(nulls); + let nulls = self.null_buffer_builder.finish_cloned(); - let array_data = unsafe { builder.build_unchecked() }; - StructArray::from(array_data) + StructArray::new(self.fields.clone(), arrays, nulls) } /// Constructs and validates contents in the builder to ensure that @@ -295,6 +276,7 @@ impl StructBuilder { mod tests { use super::*; use arrow_buffer::Buffer; + use arrow_data::ArrayData; use arrow_schema::Field; use crate::array::Array; @@ -305,8 +287,8 @@ mod tests { let int_builder = Int32Builder::new(); let fields = vec![ - Field::new("f1", DataType::Utf8, false), - Field::new("f2", DataType::Int32, false), + Field::new("f1", DataType::Utf8, true), + Field::new("f2", DataType::Int32, true), ]; let field_builders = vec![ Box::new(string_builder) as Box, @@ -596,4 +578,22 @@ mod tests { let mut builder = StructBuilder::new(fields, field_builders); builder.finish(); } + + #[test] + #[should_panic( + expected = "Incorrect datatype for StructArray field \\\"timestamp\\\", expected Timestamp(Nanosecond, Some(\\\"UTC\\\")) got Timestamp(Nanosecond, None)" + )] + fn test_struct_array_mismatch_builder() { + let fields = vec![Field::new( + "timestamp", + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".to_owned().into())), + false, + )]; + + let field_builders: Vec> = + vec![Box::new(TimestampNanosecondBuilder::new())]; + + let mut sa = StructBuilder::new(fields, field_builders); + sa.finish(); + } } From c1283f1805f1691b10d9505dedb745687946eb1c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 12 Jun 2023 12:39:41 +0100 Subject: [PATCH 1010/1411] Add PrimitiveBuilder type constructors (#4401) --- arrow-array/src/array/primitive_array.rs | 51 ++------------------ arrow-array/src/builder/primitive_builder.rs | 32 +++++++++++- arrow-array/src/types.rs | 40 +++++++++++++++ 3 files changed, 74 insertions(+), 49 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs 
b/arrow-array/src/array/primitive_array.rs index b821ad1b4422..576f645b0375 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1388,64 +1388,19 @@ impl PrimitiveArray { /// Returns a Decimal array with the same data as self, with the /// specified precision and scale. /// - /// Returns an Error if: - /// - `precision` is zero - /// - `precision` is larger than `T:MAX_PRECISION` - /// - `scale` is larger than `T::MAX_SCALE` - /// - `scale` is > `precision` + /// See [`validate_decimal_precision_and_scale`] pub fn with_precision_and_scale( self, precision: u8, scale: i8, - ) -> Result - where - Self: Sized, - { - // validate precision and scale - self.validate_precision_scale(precision, scale)?; - - // safety: self.data is valid DataType::Decimal as checked above + ) -> Result { + validate_decimal_precision_and_scale::(precision, scale)?; Ok(Self { data_type: T::TYPE_CONSTRUCTOR(precision, scale), ..self }) } - // validate that the new precision and scale are valid or not - fn validate_precision_scale( - &self, - precision: u8, - scale: i8, - ) -> Result<(), ArrowError> { - if precision == 0 { - return Err(ArrowError::InvalidArgumentError(format!( - "precision cannot be 0, has to be between [1, {}]", - T::MAX_PRECISION - ))); - } - if precision > T::MAX_PRECISION { - return Err(ArrowError::InvalidArgumentError(format!( - "precision {} is greater than max {}", - precision, - T::MAX_PRECISION - ))); - } - if scale > T::MAX_SCALE { - return Err(ArrowError::InvalidArgumentError(format!( - "scale {} is greater than max {}", - scale, - T::MAX_SCALE - ))); - } - if scale > 0 && scale as u8 > precision { - return Err(ArrowError::InvalidArgumentError(format!( - "scale {scale} is greater than precision {precision}" - ))); - } - - Ok(()) - } - /// Validates values in this array can be properly interpreted /// with the specified precision. pub fn validate_decimal_precision(&self, precision: u8) -> Result<(), ArrowError> { diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index f064519e4f94..3e31b1d05576 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -21,7 +21,7 @@ use crate::{ArrayRef, ArrowPrimitiveType, PrimitiveArray}; use arrow_buffer::NullBufferBuilder; use arrow_buffer::{Buffer, MutableBuffer}; use arrow_data::ArrayData; -use arrow_schema::DataType; +use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; @@ -331,6 +331,36 @@ impl PrimitiveBuilder { } } +impl PrimitiveBuilder

{ + /// Sets the precision and scale + pub fn with_precision_and_scale( + self, + precision: u8, + scale: i8, + ) -> Result<Self, ArrowError> { + validate_decimal_precision_and_scale::<P>(precision, scale)?; + Ok(Self { + data_type: P::TYPE_CONSTRUCTOR(precision, scale), + ..self + }) + } +} + +impl<P: ArrowTimestampType> PrimitiveBuilder<P> { + /// Sets the timezone + pub fn with_timezone(self, timezone: impl Into<Arc<str>>) -> Self { + self.with_timezone_opt(Some(timezone.into())) + } + + /// Sets an optional timezone + pub fn with_timezone_opt<S: Into<Arc<str>>>(self, timezone: Option<S>) -> Self { + Self { + data_type: DataType::Timestamp(P::UNIT, timezone.map(Into::into)), + ..self + } + } +} + + impl<P: ArrowPrimitiveType> Extend<Option<P::Native>> for PrimitiveBuilder<P>

{ #[inline] fn extend>>(&mut self, iter: T) { diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 8c19301dc7d0..f99e6a8f6f81 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -1403,6 +1403,46 @@ pub trait DecimalType: ) -> Result<(), ArrowError>; } +/// Validate that `precision` and `scale` are valid for `T` +/// +/// Returns an Error if: +/// - `precision` is zero +/// - `precision` is larger than `T:MAX_PRECISION` +/// - `scale` is larger than `T::MAX_SCALE` +/// - `scale` is > `precision` +pub fn validate_decimal_precision_and_scale( + precision: u8, + scale: i8, +) -> Result<(), ArrowError> { + if precision == 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "precision cannot be 0, has to be between [1, {}]", + T::MAX_PRECISION + ))); + } + if precision > T::MAX_PRECISION { + return Err(ArrowError::InvalidArgumentError(format!( + "precision {} is greater than max {}", + precision, + T::MAX_PRECISION + ))); + } + if scale > T::MAX_SCALE { + return Err(ArrowError::InvalidArgumentError(format!( + "scale {} is greater than max {}", + scale, + T::MAX_SCALE + ))); + } + if scale > 0 && scale as u8 > precision { + return Err(ArrowError::InvalidArgumentError(format!( + "scale {scale} is greater than precision {precision}" + ))); + } + + Ok(()) +} + /// The decimal type for a Decimal128Array #[derive(Debug)] pub struct Decimal128Type {} From 2c71135a683844d51779f94cfd8e5fc35a2624e4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 12 Jun 2023 14:26:48 +0100 Subject: [PATCH 1011/1411] Further buffer constructors (#4402) --- arrow-arith/src/boolean.rs | 36 +++++--------------------- arrow-array/src/array/boolean_array.rs | 8 ++++++ arrow-buffer/src/buffer/boolean.rs | 17 ++++++++++++ arrow-buffer/src/buffer/null.rs | 14 +++++++--- 4 files changed, 43 insertions(+), 32 deletions(-) diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index 258d683ad71a..04c9fb229034 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -23,11 +23,9 @@ //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. 
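A brief usage sketch of the `PrimitiveBuilder` constructors added in #4401 above. The sketch is not part of any patch; the builder aliases and the illustrative values are assumptions based on the arrow-array API at this point in the series:

```rust
use arrow_array::builder::{Decimal128Builder, TimestampNanosecondBuilder};

// Precision and scale are validated through the shared
// `validate_decimal_precision_and_scale` helper introduced above.
let mut decimal = Decimal128Builder::new()
    .with_precision_and_scale(10, 2)
    .unwrap();
decimal.append_value(12345); // interpreted as 123.45 with scale 2
let decimal_array = decimal.finish();

// Timestamp builders can now carry a timezone up front instead of
// re-typing the finished array afterwards.
let mut ts = TimestampNanosecondBuilder::new().with_timezone("UTC");
ts.append_value(1_000_000_000);
let ts_array = ts.finish();
```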
use arrow_array::*; -use arrow_buffer::bit_util::ceil; use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper}; -use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer}; -use arrow_data::ArrayData; -use arrow_schema::{ArrowError, DataType}; +use arrow_buffer::{BooleanBuffer, NullBuffer}; +use arrow_schema::ArrowError; /// Logical 'and' boolean values with Kleene logic /// @@ -314,7 +312,7 @@ pub fn not(left: &BooleanArray) -> Result { /// ``` pub fn is_null(input: &dyn Array) -> Result { let values = match input.nulls() { - None => NullBuffer::new_null(input.len()).into_inner(), + None => BooleanBuffer::new_unset(input.len()), Some(nulls) => !nulls.inner(), }; @@ -333,31 +331,11 @@ pub fn is_null(input: &dyn Array) -> Result { /// assert_eq!(a_is_not_null, BooleanArray::from(vec![true, true, false])); /// ``` pub fn is_not_null(input: &dyn Array) -> Result { - let len = input.len(); - - let output = match input.nulls() { - None => { - let len_bytes = ceil(len, 8); - MutableBuffer::new(len_bytes) - .with_bitset(len_bytes, true) - .into() - } - Some(nulls) => nulls.inner().sliced(), - }; - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - len, - None, - None, - 0, - vec![output], - vec![], - ) + let values = match input.nulls() { + None => BooleanBuffer::new_set(input.len()), + Some(n) => n.inner().clone(), }; - - Ok(BooleanArray::from(data)) + Ok(BooleanArray::new(values, None)) } #[cfg(test)] diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index e99b71b1846e..14fa87e138eb 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -93,6 +93,14 @@ impl BooleanArray { Self { values, nulls } } + /// Create a new [`BooleanArray`] with length `len` consisting only of nulls + pub fn new_null(len: usize) -> Self { + Self { + values: BooleanBuffer::new_unset(len), + nulls: Some(NullBuffer::new_null(len)), + } + } + /// Returns the length of this array. 
pub fn len(&self) -> usize { self.values.len() diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 9098926c5a60..9cc2bc262bda 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -60,6 +60,23 @@ impl BooleanBuffer { } } + /// Create a new [`BooleanBuffer`] of `length` where all values are `true` + pub fn new_set(length: usize) -> Self { + let mut builder = BooleanBufferBuilder::new(length); + builder.append_n(length, true); + builder.finish() + } + + /// Create a new [`BooleanBuffer`] of `length` where all values are `false` + pub fn new_unset(length: usize) -> Self { + let buffer = MutableBuffer::new_null(length).into_buffer(); + Self { + buffer, + offset: 0, + len: length, + } + } + /// Invokes `f` with indexes `0..len` collecting the boolean results into a new `BooleanBuffer` pub fn collect_bool bool>(len: usize, f: F) -> Self { let buffer = MutableBuffer::collect_bool(len, f); diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index 260f5d78de33..e0c7d9ef8f49 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -41,14 +41,22 @@ impl NullBuffer { /// Create a new [`NullBuffer`] of length `len` where all values are null pub fn new_null(len: usize) -> Self { - let buffer = MutableBuffer::new_null(len).into_buffer(); - let buffer = BooleanBuffer::new(buffer, 0, len); Self { - buffer, + buffer: BooleanBuffer::new_unset(len), null_count: len, } } + /// Create a new [`NullBuffer`] of length `len` where all values are valid + /// + /// Note: it is more efficient to not set the null buffer if it is known to be all valid + pub fn new_valid(len: usize) -> Self { + Self { + buffer: BooleanBuffer::new_set(len), + null_count: 0, + } + } + /// Create a new [`NullBuffer`] with the provided `buffer` and `null_count` /// /// # Safety From 4fc05636b10de39a3167819f235768820837bac3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 13 Jun 2023 13:03:22 +0100 Subject: [PATCH 1012/1411] More take benchmarks (#4403) --- arrow/benches/take_kernels.rs | 43 ++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/arrow/benches/take_kernels.rs b/arrow/benches/take_kernels.rs index 731426031193..362b3f5cbf3c 100644 --- a/arrow/benches/take_kernels.rs +++ b/arrow/benches/take_kernels.rs @@ -56,11 +56,26 @@ fn add_benchmark(c: &mut Criterion) { let values = create_primitive_array::(512, 0.0); let indices = create_random_index(512, 0.0); c.bench_function("take i32 512", |b| b.iter(|| bench_take(&values, &indices))); + let values = create_primitive_array::(1024, 0.0); let indices = create_random_index(1024, 0.0); c.bench_function("take i32 1024", |b| { b.iter(|| bench_take(&values, &indices)) }); + let indices = create_random_index(1024, 0.5); + c.bench_function("take i32 null indices 1024", |b| { + b.iter(|| bench_take(&values, &indices)) + }); + + let values = create_primitive_array::(1024, 0.5); + let indices = create_random_index(1024, 0.0); + c.bench_function("take i32 null values 1024", |b| { + b.iter(|| bench_take(&values, &indices)) + }); + let indices = create_random_index(1024, 0.5); + c.bench_function("take i32 null values null indices 1024", |b| { + b.iter(|| bench_take(&values, &indices)) + }); let values = create_primitive_array::(512, 0.0); let indices = create_random_index(512, 0.0); @@ -73,35 +88,32 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| 
bench_take_bounds_check(&values, &indices)) }); - let indices = create_random_index(512, 0.5); - c.bench_function("take i32 nulls 512", |b| { - b.iter(|| bench_take(&values, &indices)) - }); - let values = create_primitive_array::(1024, 0.0); - let indices = create_random_index(1024, 0.5); - c.bench_function("take i32 nulls 1024", |b| { - b.iter(|| bench_take(&values, &indices)) - }); - let values = create_boolean_array(512, 0.0, 0.5); let indices = create_random_index(512, 0.0); c.bench_function("take bool 512", |b| { b.iter(|| bench_take(&values, &indices)) }); + let values = create_boolean_array(1024, 0.0, 0.5); let indices = create_random_index(1024, 0.0); c.bench_function("take bool 1024", |b| { b.iter(|| bench_take(&values, &indices)) }); - let values = create_boolean_array(512, 0.0, 0.5); - let indices = create_random_index(512, 0.5); - c.bench_function("take bool nulls 512", |b| { + let indices = create_random_index(1024, 0.5); + c.bench_function("take bool null indices 1024", |b| { b.iter(|| bench_take(&values, &indices)) }); - let values = create_boolean_array(1024, 0.0, 0.5); + + let values = create_boolean_array(1024, 0.5, 0.5); + let indices = create_random_index(1024, 0.0); + c.bench_function("take bool null values 1024", |b| { + b.iter(|| bench_take(&values, &indices)) + }); + + let values = create_boolean_array(1024, 0.5, 0.5); let indices = create_random_index(1024, 0.5); - c.bench_function("take bool nulls 1024", |b| { + c.bench_function("take bool null values null indices 1024", |b| { b.iter(|| bench_take(&values, &indices)) }); @@ -128,7 +140,6 @@ fn add_benchmark(c: &mut Criterion) { }); let values = create_string_array::(1024, 0.5); - let indices = create_random_index(1024, 0.0); c.bench_function("take str null values 1024", |b| { b.iter(|| bench_take(&values, &indices)) From 700bd334ad8be53455f5dd80023b6c8c237559a7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 13 Jun 2023 17:00:07 +0100 Subject: [PATCH 1013/1411] Improve `take` kernel performance on primitive arrays, fix bad null index handling (#4404) (#4405) * Improve take primitive performance (#4404) * Remove unnecessary trait bounds --- arrow-buffer/src/buffer/boolean.rs | 1 + arrow-buffer/src/buffer/scalar.rs | 6 + arrow-select/src/take.rs | 434 ++++++++--------------------- 3 files changed, 121 insertions(+), 320 deletions(-) diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 9cc2bc262bda..577c716e4bea 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -128,6 +128,7 @@ impl BooleanBuffer { /// # Panics /// /// Panics if `i >= self.len()` + #[inline] pub fn value(&self, idx: usize) -> bool { assert!(idx < self.len); unsafe { self.value_unchecked(idx) } diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 40b24e4ebf0f..920463b365a5 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -140,6 +140,12 @@ impl From> for ScalarBuffer { } } +impl FromIterator for ScalarBuffer { + fn from_iter>(iter: I) -> Self { + iter.into_iter().collect::>().into() + } +} + impl<'a, T: ArrowNativeType> IntoIterator for &'a ScalarBuffer { type Item = &'a T; type IntoIter = std::slice::Iter<'a, T>; diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 5d6507e71526..4d599369ca27 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -23,11 +23,14 @@ use arrow_array::builder::BufferBuilder; 
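Before the `take.rs` rewrite below, a small sketch of the null-index behaviour that #4405 fixes; it mirrors the new `test_take_null_indices` test added at the end of this patch and is not itself part of the diff:

```rust
use arrow_array::{cast::AsArray, types::Int32Type, Int32Array};
use arrow_buffer::NullBuffer;
use arrow_select::take::take;

// The last two indices are null, so their out-of-range value (400) must be
// ignored and the corresponding output slots must simply be null.
let indices = Int32Array::new(
    vec![1, 2, 400, 400].into(),
    Some(NullBuffer::from(vec![true, true, false, false])),
);
let values = Int32Array::from(vec![1, 23, 4, 5]);
let taken = take(&values, &indices, None).unwrap();
assert_eq!(
    taken.as_primitive::<Int32Type>().iter().collect::<Vec<_>>(),
    vec![Some(23), Some(4), None, None]
);
```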
use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer, NullBuffer}; +use arrow_buffer::{ + bit_util, ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer, NullBuffer, + ScalarBuffer, +}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, FieldRef}; -use num::{ToPrimitive, Zero}; +use num::Zero; /// Take elements by index from [Array], creating a new [Array] from those indexes. /// @@ -59,44 +62,36 @@ use num::{ToPrimitive, Zero}; /// /// # Examples /// ``` -/// # use arrow_array::{StringArray, UInt32Array}; +/// # use arrow_array::{StringArray, UInt32Array, cast::AsArray}; /// # use arrow_select::take::take; /// let values = StringArray::from(vec!["zero", "one", "two"]); /// /// // Take items at index 2, and 1: /// let indices = UInt32Array::from(vec![2, 1]); /// let taken = take(&values, &indices, None).unwrap(); -/// let taken = taken.as_any().downcast_ref::().unwrap(); +/// let taken = taken.as_string::(); /// /// assert_eq!(*taken, StringArray::from(vec!["two", "one"])); /// ``` -pub fn take( +pub fn take( values: &dyn Array, indices: &PrimitiveArray, options: Option, -) -> Result -where - IndexType: ArrowPrimitiveType, - IndexType::Native: ToPrimitive, -{ +) -> Result { take_impl(values, indices, options) } -fn take_impl( +fn take_impl( values: &dyn Array, indices: &PrimitiveArray, options: Option, -) -> Result -where - IndexType: ArrowPrimitiveType, - IndexType::Native: ToPrimitive, -{ +) -> Result { let options = options.unwrap_or_default(); if options.check_bounds { let len = values.len(); if indices.null_count() > 0 { indices.iter().flatten().try_for_each(|index| { - let ix = ToPrimitive::to_usize(&index).ok_or_else(|| { + let ix = index.to_usize().ok_or_else(|| { ArrowError::ComputeError("Cast to usize failed".to_string()) })?; if ix >= len { @@ -108,7 +103,7 @@ where })?; } else { indices.values().iter().try_for_each(|index| { - let ix = ToPrimitive::to_usize(index).ok_or_else(|| { + let ix = index.to_usize().ok_or_else(|| { ArrowError::ComputeError("Cast to usize failed".to_string()) })?; if ix >= len { @@ -125,7 +120,7 @@ where values => Ok(Arc::new(take_primitive(values, indices)?)), DataType::Boolean => { let values = values.as_any().downcast_ref::().unwrap(); - Ok(Arc::new(take_boolean(values, indices)?)) + Ok(Arc::new(take_boolean(values, indices))) } DataType::Utf8 => { Ok(Arc::new(take_bytes(values.as_string::(), indices)?)) @@ -232,132 +227,6 @@ fn maybe_usize(index: I) -> Result { .ok_or_else(|| ArrowError::ComputeError("Cast to usize failed".to_string())) } -// take implementation when neither values nor indices contain nulls -fn take_no_nulls( - values: &[T], - indices: &[I], -) -> Result<(Buffer, Option), ArrowError> -where - T: ArrowNativeType, - I: ArrowNativeType, -{ - let values = indices - .iter() - .map(|index| Result::<_, ArrowError>::Ok(values[maybe_usize::(*index)?])); - // Soundness: `slice.map` is `TrustedLen`. - let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? 
}; - - Ok((buffer, None)) -} - -// take implementation when only values contain nulls -fn take_values_nulls( - values: &[T], - values_nulls: &NullBuffer, - indices: &[I], -) -> Result<(Buffer, Option), ArrowError> -where - T: ArrowNativeType, - I: ArrowNativeType, -{ - let num_bytes = bit_util::ceil(indices.len(), 8); - let mut nulls = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - let null_slice = nulls.as_slice_mut(); - let mut null_count = 0; - - let values = indices.iter().enumerate().map(|(i, index)| { - let index = maybe_usize::(*index)?; - if values_nulls.is_null(index) { - null_count += 1; - bit_util::unset_bit(null_slice, i); - } - Result::<_, ArrowError>::Ok(values[index]) - }); - // Soundness: `slice.map` is `TrustedLen`. - let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; - - let nulls = if null_count == 0 { - // if only non-null values were taken - None - } else { - Some(nulls.into()) - }; - - Ok((buffer, nulls)) -} - -// take implementation when only indices contain nulls -fn take_indices_nulls( - values: &[T], - indices: &[I], - indices_nulls: &NullBuffer, -) -> Result<(Buffer, Option), ArrowError> -where - T: ArrowNativeType, - I: ArrowNativeType, -{ - let values = indices.iter().map(|index| { - let index = maybe_usize::(*index)?; - Result::<_, ArrowError>::Ok(match values.get(index) { - Some(value) => *value, - None => { - if indices_nulls.is_null(index) { - T::default() - } else { - panic!("Out-of-bounds index {index}") - } - } - }) - }); - - // Soundness: `slice.map` is `TrustedLen`. - let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; - Ok((buffer, Some(indices_nulls.inner().sliced()))) -} - -// take implementation when both values and indices contain nulls -fn take_values_indices_nulls( - values: &[T], - values_nulls: &NullBuffer, - indices: &[I], - indices_nulls: &NullBuffer, -) -> Result<(Buffer, Option), ArrowError> -where - T: ArrowNativeType, - I: ArrowNativeType, -{ - let num_bytes = bit_util::ceil(indices.len(), 8); - let mut nulls = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - let null_slice = nulls.as_slice_mut(); - let mut null_count = 0; - - let values = indices.iter().enumerate().map(|(i, &index)| { - if indices_nulls.is_null(i) { - null_count += 1; - bit_util::unset_bit(null_slice, i); - Ok(T::default()) - } else { - let index = maybe_usize::(index)?; - if values_nulls.is_null(index) { - null_count += 1; - bit_util::unset_bit(null_slice, i); - } - Result::<_, ArrowError>::Ok(values[index]) - } - }); - // Soundness: `slice.map` is `TrustedLen`. - let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; - - let nulls = if null_count == 0 { - // if only non-null values were taken - None - } else { - Some(nulls.into()) - }; - - Ok((buffer, nulls)) -} - /// `take` implementation for all primitive arrays /// /// This checks if an `indices` slot is populated, and gets the value from `values` @@ -374,148 +243,91 @@ fn take_primitive( where T: ArrowPrimitiveType, I: ArrowPrimitiveType, - I::Native: ToPrimitive, { - let indices_nulls = indices.nulls().filter(|x| x.null_count() > 0); - let values_nulls = values.nulls().filter(|x| x.null_count() > 0); - - // note: this function should only panic when "an index is not null and out of bounds". - // if the index is null, its value is undefined and therefore we should not read from it. 
- let (buffer, nulls) = match (values_nulls, indices_nulls) { - (None, None) => { - // * no nulls - // * all `indices.values()` are valid - take_no_nulls(values.values(), indices.values())? - } - (Some(values_nulls), None) => { - // * nulls come from `values` alone - // * all `indices.values()` are valid - take_values_nulls(values.values(), values_nulls, indices.values())? - } - (None, Some(indices_nulls)) => { - // in this branch it is unsound to read and use `index.values()`, - // as doing so is UB when they come from a null slot. - take_indices_nulls(values.values(), indices.values(), indices_nulls)? - } - (Some(values_nulls), Some(indices_nulls)) => { - // in this branch it is unsound to read and use `index.values()`, - // as doing so is UB when they come from a null slot. - take_values_indices_nulls( - values.values(), - values_nulls, - indices.values(), - indices_nulls, - )? - } - }; - - let data = unsafe { - ArrayData::new_unchecked( - values.data_type().clone(), - indices.len(), - None, - nulls, - 0, - vec![buffer], - vec![], - ) - }; - Ok(PrimitiveArray::::from(data)) + let values_buf = take_native(values.values(), indices); + let nulls = take_nulls(values.nulls(), indices); + Ok(PrimitiveArray::new(values_buf, nulls).with_data_type(values.data_type().clone())) } -fn take_bits( - values: &Buffer, - values_offset: usize, - indices: &PrimitiveArray, -) -> Result -where - IndexType: ArrowPrimitiveType, - IndexType::Native: ToPrimitive, -{ - let len = indices.len(); - let values_slice = values.as_slice(); - let mut output_buffer = MutableBuffer::new_null(len); - let output_slice = output_buffer.as_slice_mut(); - - let indices_has_nulls = indices.null_count() > 0; +#[inline(never)] +fn take_nulls( + values: Option<&NullBuffer>, + indices: &PrimitiveArray, +) -> Option { + match values.filter(|n| n.null_count() > 0) { + Some(n) => { + let buffer = take_bits(n.inner(), indices); + Some(NullBuffer::new(buffer)).filter(|n| n.null_count() > 0) + } + None => indices.nulls().cloned(), + } +} - if indices_has_nulls { - indices +#[inline(never)] +fn take_native( + values: &[T], + indices: &PrimitiveArray, +) -> ScalarBuffer { + match indices.nulls().filter(|n| n.null_count() > 0) { + Some(n) => indices + .values() .iter() .enumerate() - .try_for_each::<_, Result<(), ArrowError>>(|(i, index)| { - if let Some(index) = index { - let index = ToPrimitive::to_usize(&index).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - - if bit_util::get_bit(values_slice, values_offset + index) { - bit_util::set_bit(output_slice, i); - } - } - - Ok(()) - })?; - } else { - indices + .map(|(idx, index)| match values.get(index.as_usize()) { + Some(v) => *v, + None => match n.is_null(idx) { + true => T::default(), + false => panic!("Out-of-bounds index {index:?}"), + }, + }) + .collect(), + None => indices .values() .iter() - .enumerate() - .try_for_each::<_, Result<(), ArrowError>>(|(i, index)| { - let index = ToPrimitive::to_usize(index).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; + .map(|index| values[index.as_usize()]) + .collect(), + } +} - if bit_util::get_bit(values_slice, values_offset + index) { - bit_util::set_bit(output_slice, i); - } - Ok(()) - })?; +#[inline(never)] +fn take_bits( + values: &BooleanBuffer, + indices: &PrimitiveArray, +) -> BooleanBuffer { + let len = indices.len(); + let mut output_buffer = MutableBuffer::new_null(len); + let output_slice = output_buffer.as_slice_mut(); + + match indices.nulls().filter(|n| 
n.null_count() > 0) { + Some(nulls) => nulls.valid_indices().for_each(|idx| { + if values.value(indices.value(idx).as_usize()) { + bit_util::set_bit(output_slice, idx); + } + }), + None => indices.values().iter().enumerate().for_each(|(i, index)| { + if values.value(index.as_usize()) { + bit_util::set_bit(output_slice, i); + } + }), } - Ok(output_buffer.into()) + BooleanBuffer::new(output_buffer.into(), 0, indices.len()) } /// `take` implementation for boolean arrays -fn take_boolean( +fn take_boolean( values: &BooleanArray, indices: &PrimitiveArray, -) -> Result -where - IndexType: ArrowPrimitiveType, - IndexType::Native: ToPrimitive, -{ - let val_buf = take_bits(values.values().inner(), values.offset(), indices)?; - let null_buf = match values.nulls() { - Some(nulls) if nulls.null_count() > 0 => { - Some(take_bits(nulls.buffer(), nulls.offset(), indices)?) - } - _ => indices.nulls().map(|b| b.inner().sliced()), - }; - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - indices.len(), - None, - null_buf, - 0, - vec![val_buf], - vec![], - ) - }; - Ok(BooleanArray::from(data)) +) -> BooleanArray { + let val_buf = take_bits(values.values(), indices); + let null_buf = take_nulls(values.nulls(), indices); + BooleanArray::new(val_buf, null_buf) } /// `take` implementation for string arrays -fn take_bytes( +fn take_bytes( array: &GenericByteArray, indices: &PrimitiveArray, -) -> Result, ArrowError> -where - T: ByteArrayType, - IndexType: ArrowPrimitiveType, - IndexType::Native: ToPrimitive, -{ +) -> Result, ArrowError> { let data_len = indices.len(); let bytes_offset = (data_len + 1) * std::mem::size_of::(); @@ -529,7 +341,7 @@ where let nulls; if array.null_count() == 0 && indices.null_count() == 0 { for (i, offset) in offsets.iter_mut().skip(1).enumerate() { - let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { + let index = indices.value(i).to_usize().ok_or_else(|| { ArrowError::ComputeError("Cast to usize failed".to_string()) })?; @@ -548,7 +360,7 @@ where let null_slice = null_buf.as_slice_mut(); for (i, offset) in offsets.iter_mut().skip(1).enumerate() { - let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { + let index = indices.value(i).to_usize().ok_or_else(|| { ArrowError::ComputeError("Cast to usize failed".to_string()) })?; @@ -566,10 +378,9 @@ where } else if array.null_count() == 0 { for (i, offset) in offsets.iter_mut().skip(1).enumerate() { if indices.is_valid(i) { - let index = - ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; + let index = indices.value(i).to_usize().ok_or_else(|| { + ArrowError::ComputeError("Cast to usize failed".to_string()) + })?; let s: &[u8] = array.value(index).as_ref(); @@ -586,7 +397,7 @@ where let null_slice = null_buf.as_slice_mut(); for (i, offset) in offsets.iter_mut().skip(1).enumerate() { - let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { + let index = indices.value(i).to_usize().ok_or_else(|| { ArrowError::ComputeError("Cast to usize failed".to_string()) })?; @@ -627,9 +438,8 @@ fn take_list( ) -> Result, ArrowError> where IndexType: ArrowPrimitiveType, - IndexType::Native: ToPrimitive, OffsetType: ArrowPrimitiveType, - OffsetType::Native: ToPrimitive + OffsetSizeTrait, + OffsetType::Native: OffsetSizeTrait, PrimitiveArray: From>, { // TODO: Some optimizations can be done here such as if it is @@ -657,15 +467,11 @@ where /// Calculates the index and indexed offset for the inner array, /// 
applying `take` on the inner array, then reconstructing a list array /// with the indexed offsets -fn take_fixed_size_list( +fn take_fixed_size_list( values: &FixedSizeListArray, indices: &PrimitiveArray, length: ::Native, -) -> Result -where - IndexType: ArrowPrimitiveType, - IndexType::Native: ToPrimitive, -{ +) -> Result { let list_indices = take_value_indices_from_fixed_size_list(values, indices, length)?; let taken = take_impl::(values.values().as_ref(), &list_indices, None)?; @@ -675,7 +481,7 @@ where let null_slice = null_buf.as_slice_mut(); for i in 0..indices.len() { - let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { + let index = indices.value(i).to_usize().ok_or_else(|| { ArrowError::ComputeError("Cast to usize failed".to_string()) })?; if !indices.is_valid(i) || values.is_null(index) { @@ -694,15 +500,11 @@ where Ok(FixedSizeListArray::from(list_data)) } -fn take_fixed_size_binary( +fn take_fixed_size_binary( values: &FixedSizeBinaryArray, indices: &PrimitiveArray, size: i32, -) -> Result -where - IndexType: ArrowPrimitiveType, - IndexType::Native: ToPrimitive, -{ +) -> Result { let nulls = values.nulls(); let array_iter = indices .values() @@ -725,26 +527,12 @@ where /// /// applies `take` to the keys of the dictionary array and returns a new dictionary array /// with the same dictionary values and reordered keys -fn take_dict( +fn take_dict( values: &DictionaryArray, indices: &PrimitiveArray, -) -> Result, ArrowError> -where - T: ArrowDictionaryKeyType, - T::Native: num::Num, - I: ArrowPrimitiveType, - I::Native: ToPrimitive, -{ - let new_keys = take_primitive::(values.keys(), indices)?; - let builder = new_keys - .into_data() - .into_builder() - .data_type(values.data_type().clone()) - .child_data(vec![values.values().to_data()]); - - // Safety: Indices were valid before - let data = unsafe { builder.build_unchecked() }; - Ok(DictionaryArray::::from(data)) +) -> Result, ArrowError> { + let new_keys = take_primitive(values.keys(), indices)?; + Ok(unsafe { DictionaryArray::new_unchecked(new_keys, values.values().clone()) }) } /// `take` implementation for run arrays @@ -755,16 +543,10 @@ where /// For e.g. an input `RunArray{ run_ends = [2,4,6,8], values=[1,2,1,2] }` and `logical_indices=[2,3,6,7]` /// would be converted to `physical_indices=[1,1,3,3]` which will be used to build /// output `RunArray{ run_ends=[2,4], values=[2,2] }`. 
-fn take_run( +fn take_run( run_array: &RunArray, logical_indices: &PrimitiveArray, -) -> Result, ArrowError> -where - T: RunEndIndexType, - T::Native: num::Num, - I: ArrowPrimitiveType, - I::Native: ToPrimitive, -{ +) -> Result, ArrowError> { // get physical indices for the input logical indices let physical_indices = run_array.get_physical_indices(logical_indices.values())?; @@ -840,7 +622,6 @@ fn take_value_indices_from_list( > where IndexType: ArrowPrimitiveType, - IndexType::Native: ToPrimitive, OffsetType: ArrowPrimitiveType, OffsetType::Native: OffsetSizeTrait + std::ops::Add + num::Zero + num::One, PrimitiveArray: From>, @@ -862,7 +643,7 @@ where // compute the value indices, and set offsets accordingly for i in 0..indices.len() { if indices.is_valid(i) { - let ix = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { + let ix = indices.value(i).to_usize().ok_or_else(|| { ArrowError::ComputeError("Cast to usize failed".to_string()) })?; let start = offsets[ix]; @@ -901,13 +682,12 @@ fn take_value_indices_from_fixed_size_list( ) -> Result, ArrowError> where IndexType: ArrowPrimitiveType, - IndexType::Native: ToPrimitive, { let mut values = vec![]; for i in 0..indices.len() { if indices.is_valid(i) { - let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { + let index = indices.value(i).to_usize().ok_or_else(|| { ArrowError::ComputeError("Cast to usize failed".to_string()) })?; let start = @@ -1008,7 +788,6 @@ mod tests { T: ArrowPrimitiveType, PrimitiveArray: From>>, I: ArrowPrimitiveType, - I::Native: ToPrimitive, { let output = PrimitiveArray::::from(data); let expected = PrimitiveArray::::from(expected_data); @@ -2155,4 +1934,19 @@ mod tests { UInt32Array::from(vec![9, 10, 11, 6, 7, 8, 3, 4, 5, 6, 7, 8, 0, 1, 2]) ); } + + #[test] + fn test_take_null_indices() { + let indices = Int32Array::new( + vec![1, 2, 400, 400].into(), + Some(NullBuffer::from(vec![true, true, false, false])), + ); + let values = Int32Array::from(vec![1, 23, 4, 5]); + let r = take(&values, &indices, None).unwrap(); + let values = r + .as_primitive::() + .into_iter() + .collect::>(); + assert_eq!(&values, &[Some(23), Some(4), None, None]) + } } From 23177ee8f6b6e0f83730c556be54713cd7f4e323 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 14 Jun 2023 15:14:12 +0100 Subject: [PATCH 1014/1411] Cleanup nullif kernel (#4416) --- arrow-select/src/nullif.rs | 61 ++++++++++++-------------------------- 1 file changed, 19 insertions(+), 42 deletions(-) diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 3d9148016af0..ab68e8c2f097 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -15,11 +15,9 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::builder::BooleanBufferBuilder; use arrow_array::{make_array, Array, ArrayRef, BooleanArray}; -use arrow_buffer::buffer::{ - bitwise_bin_op_helper, bitwise_unary_op_helper, buffer_bin_and, -}; +use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_unary_op_helper}; +use arrow_buffer::{BooleanBuffer, NullBuffer}; use arrow_schema::ArrowError; /// Copies original array, setting validity bit to false if a secondary comparison @@ -28,16 +26,14 @@ use arrow_schema::ArrowError; /// Typically used to implement NULLIF. 
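A short usage sketch of the `nullif` kernel whose implementation the hunk below cleans up. The sketch is not part of the patch and the values are illustrative only:

```rust
use arrow_array::{cast::AsArray, types::Int32Type, BooleanArray, Int32Array};
use arrow_select::nullif::nullif;

let values = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]);
// Wherever the mask is true, the corresponding output slot becomes null;
// everything else is copied through, including pre-existing nulls.
let mask = BooleanArray::from(vec![true, false, false, true]);
let result = nullif(&values, &mask).unwrap();
assert_eq!(
    result.as_primitive::<Int32Type>().iter().collect::<Vec<_>>(),
    vec![None, Some(2), None, None]
);
```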
pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { let left_data = left.to_data(); - let right_data = right.to_data(); - if left_data.len() != right_data.len() { + if left_data.len() != right.len() { return Err(ArrowError::ComputeError( "Cannot perform comparison operation on arrays of different length" .to_string(), )); } let len = left_data.len(); - let l_offset = left_data.offset(); if len == 0 { return Ok(make_array(left_data)); @@ -53,18 +49,9 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result ( - buffer_bin_and( - right_data.buffers()[0], - right_data.offset(), - nulls.buffer(), - nulls.offset(), - len, - ), - 0, - ), - None => (right_data.buffers()[0].clone(), right_data.offset()), + let right = match right.nulls() { + Some(nulls) => right.values() & nulls.inner(), + None => right.values().clone(), }; // Compute left null bitmap & !right @@ -75,8 +62,8 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result Result { let mut null_count = 0; - let buffer = bitwise_unary_op_helper(&right, r_offset, len, |b| { - let t = !b; - null_count += t.count_zeros() as usize; - t - }); + let buffer = + bitwise_unary_op_helper(right.inner(), right.offset(), len, |b| { + let t = !b; + null_count += t.count_zeros() as usize; + t + }); (buffer, null_count) } }; - // Need to construct null buffer with offset of left - let null_buffer = match left_data.offset() { - 0 => combined, - _ => { - let mut builder = BooleanBufferBuilder::new(len + l_offset); - // Pad with 0s up to offset - builder.resize(l_offset); - builder.append_packed_range(0..len, &combined); - builder.into() - } - }; - - let data = left_data - .into_builder() - .null_bit_buffer(Some(null_buffer)) - .null_count(null_count); + let combined = BooleanBuffer::new(combined, 0, len); + // Safety: + // Counted nulls whilst computing + let nulls = unsafe { NullBuffer::new_unchecked(combined, null_count) }; + let data = left_data.into_builder().nulls(Some(nulls)); // SAFETY: // Only altered null mask From 9d09fe562c65d4f52cdccb253690b5533f6cc23f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 14 Jun 2023 15:14:26 +0100 Subject: [PATCH 1015/1411] Faster unpacking of Int32Type dictionary (#4406) --- arrow-cast/src/cast.rs | 43 +++++++++++++++--------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 32f422768dc3..dea3f2acfaf8 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -49,7 +49,7 @@ use crate::parse::{ use arrow_array::{ builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *, }; -use arrow_buffer::{i256, ArrowNativeType, Buffer, MutableBuffer}; +use arrow_buffer::{i256, ArrowNativeType, Buffer, MutableBuffer, ScalarBuffer}; use arrow_data::ArrayData; use arrow_schema::*; use arrow_select::take::take; @@ -3466,34 +3466,21 @@ fn unpack_dictionary( where K: ArrowDictionaryKeyType, { - let dict_array = array - .as_any() - .downcast_ref::>() - .ok_or_else(|| { - ArrowError::ComputeError( - "Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(), - ) - })?; - - // attempt to cast the dict values to the target type - // use the take kernel to expand out the dictionary + let dict_array = array.as_dictionary::(); let cast_dict_values = cast_with_options(dict_array.values(), to_type, cast_options)?; - - // Note take requires first casting the indices to u32 - let keys_array: ArrayRef = - 
Arc::new(PrimitiveArray::::from(dict_array.keys().to_data())); - let indices = cast_with_options(&keys_array, &DataType::UInt32, cast_options)?; - let u32_indices = - indices - .as_any() - .downcast_ref::() - .ok_or_else(|| { - ArrowError::ComputeError( - "Internal Error: Cannot cast dict indices to UInt32".to_string(), - ) - })?; - - take(cast_dict_values.as_ref(), u32_indices, None) + let keys = dict_array.keys(); + match K::DATA_TYPE { + DataType::Int32 => { + // Dictionary guarantees all non-null keys >= 0 + let buffer = ScalarBuffer::new(keys.values().inner().clone(), 0, keys.len()); + let indices = PrimitiveArray::new(buffer, keys.nulls().cloned()); + take::(cast_dict_values.as_ref(), &indices, None) + } + _ => { + let indices = cast_with_options(keys, &DataType::UInt32, cast_options)?; + take::(cast_dict_values.as_ref(), indices.as_primitive(), None) + } + } } /// Attempts to encode an array into an `ArrayDictionary` with index From a57d718ed8b2518d0fa920a781977a7bc0b1bcc7 Mon Sep 17 00:00:00 2001 From: ming08108 Date: Thu, 15 Jun 2023 14:06:05 -0500 Subject: [PATCH 1016/1411] fix create_primitive_array (#4412) --- arrow-ipc/src/reader.rs | 56 +++++++++++++---------------------------- 1 file changed, 17 insertions(+), 39 deletions(-) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index cabf81fc245e..92a7a0dcc318 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -234,14 +234,6 @@ fn create_primitive_array( .null_bit_buffer(null_buffer) .build()? } - FixedSizeBinary(_) => { - // read 2 buffers: null buffer (optional) and data buffer - ArrayData::builder(data_type.clone()) - .len(length) - .add_buffer(buffers[1].clone()) - .null_bit_buffer(null_buffer) - .build()? - } Int8 | Int16 | Int32 @@ -250,24 +242,23 @@ fn create_primitive_array( | UInt32 | Time32(_) | Date32 - | Interval(IntervalUnit::YearMonth) => { - if buffers[1].len() / 8 == length && length != 1 { - // interpret as a signed i64, and cast appropriately - let data = ArrayData::builder(DataType::Int64) - .len(length) - .add_buffer(buffers[1].clone()) - .null_bit_buffer(null_buffer) - .build()?; - let values = Arc::new(Int64Array::from(data)) as ArrayRef; - let casted = cast(&values, data_type)?; - casted.into_data() - } else { - ArrayData::builder(data_type.clone()) - .len(length) - .add_buffer(buffers[1].clone()) - .null_bit_buffer(null_buffer) - .build()? - } + | Interval(IntervalUnit::YearMonth) + | Interval(IntervalUnit::DayTime) + | FixedSizeBinary(_) + | Boolean + | Int64 + | UInt64 + | Float64 + | Time64(_) + | Timestamp(_, _) + | Date64 + | Duration(_) => { + // read 2 buffers: null buffer (optional) and data buffer + ArrayData::builder(data_type.clone()) + .len(length) + .add_buffer(buffers[1].clone()) + .null_bit_buffer(null_buffer) + .build()? } Float32 => { if buffers[1].len() / 8 == length && length != 1 { @@ -288,19 +279,6 @@ fn create_primitive_array( .build()? } } - Boolean - | Int64 - | UInt64 - | Float64 - | Time64(_) - | Timestamp(_, _) - | Date64 - | Duration(_) - | Interval(IntervalUnit::DayTime) => ArrayData::builder(data_type.clone()) - .len(length) - .add_buffer(buffers[1].clone()) - .null_bit_buffer(null_buffer) - .build()?, Interval(IntervalUnit::MonthDayNano) | Decimal128(_, _) => { let buffer = get_aligned_buffer::(&buffers[1], length); From 993a7cc4688b380e1c4d2af44903ecd0673f4645 Mon Sep 17 00:00:00 2001 From: Gert Hulselmans Date: Fri, 16 Jun 2023 07:55:38 +0200 Subject: [PATCH 1017/1411] Fix reading gzip file with multiple gzip headers in parquet-fromcsv. 
(#4419) Fix reading gzip file with multiple gzip headers in parquet-fromcsv. Closes: #4173 --- parquet/src/bin/parquet-fromcsv.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index f2a911c00301..1ff6fecf5a81 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -381,7 +381,7 @@ fn convert_csv_to_parquet(args: &Args) -> Result<(), ParquetFromCsvError> { Box::new(snap::read::FrameDecoder::new(input_file)) as Box } Compression::GZIP(_) => { - Box::new(flate2::read::GzDecoder::new(input_file)) as Box + Box::new(flate2::read::MultiGzDecoder::new(input_file)) as Box } Compression::BROTLI(_) => { Box::new(brotli::Decompressor::new(input_file, 0)) as Box From e5c9e0d8b2029da0a018c80c8dba4d2e951aa7a7 Mon Sep 17 00:00:00 2001 From: Li wen Date: Sat, 17 Jun 2023 00:20:42 +0800 Subject: [PATCH 1018/1411] Fix bug in IPC logic that determines if the buffer should be compressed or not (#4411) The wrongly calculated compressed length included the full original buffer length, which will decline almost all the compressable data. Suppose original buffer len is *a*, incoming data len is *b*, compressed data len is *c*, the code should compare *b* and *c* instead of *b* and *a+c* --- arrow-ipc/src/compression.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-ipc/src/compression.rs b/arrow-ipc/src/compression.rs index dd60bfdeec66..db05e9a6a6c6 100644 --- a/arrow-ipc/src/compression.rs +++ b/arrow-ipc/src/compression.rs @@ -69,7 +69,7 @@ impl CompressionCodec { output.extend_from_slice(&uncompressed_data_len.to_le_bytes()); self.compress(input, output)?; - let compression_len = output.len(); + let compression_len = output.len() - original_output_len; if compression_len > uncompressed_data_len { // length of compressed data was larger than // uncompressed data, use the uncompressed data with From 2c7b4efc1701d9db5a0cc6decacf1df22123645f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 16 Jun 2023 14:23:49 -0400 Subject: [PATCH 1019/1411] Prepare for the `42.0.0` release (#4423) * Update version to `42.0.0` * Update version + readme * Initial Changelog * update changelog --- CHANGELOG-old.md | 85 +++++++++++++++++++ CHANGELOG.md | 138 +++++++++++++++---------------- Cargo.toml | 32 +++---- dev/release/README.md | 3 - dev/release/update_change_log.sh | 4 +- 5 files changed, 169 insertions(+), 93 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 1475230a7c59..97d96882a3f5 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,91 @@ # Historical Changelog +## [41.0.0](https://github.com/apache/arrow-rs/tree/41.0.0) (2023-06-02) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/40.0.0...41.0.0) + +**Breaking changes:** + +- Rename list contains kernels to in\_list \(\#4289\) [\#4342](https://github.com/apache/arrow-rs/pull/4342) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move BooleanBufferBuilder and NullBufferBuilder to arrow\_buffer [\#4338](https://github.com/apache/arrow-rs/pull/4338) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add separate row\_count and level\_count to PageMetadata \(\#4321\) [\#4326](https://github.com/apache/arrow-rs/pull/4326) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] 
([tustvold](https://github.com/tustvold)) +- Treat legacy TIMSETAMP\_X converted types as UTC [\#4309](https://github.com/apache/arrow-rs/pull/4309) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([sergiimk](https://github.com/sergiimk)) +- Simplify parquet PageIterator [\#4306](https://github.com/apache/arrow-rs/pull/4306) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add Builder style APIs and docs for `FlightData`,` FlightInfo`, `FlightEndpoint`, `Locaation` and `Ticket` [\#4294](https://github.com/apache/arrow-rs/pull/4294) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Make GenericColumnWriter Send [\#4287](https://github.com/apache/arrow-rs/pull/4287) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- feat: update flight-sql to latest specs [\#4250](https://github.com/apache/arrow-rs/pull/4250) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) +- feat\(api!\): make ArrowArrayStreamReader Send [\#4232](https://github.com/apache/arrow-rs/pull/4232) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) + +**Implemented enhancements:** + +- Make SerializedRowGroupReader::new\(\) Public [\#4330](https://github.com/apache/arrow-rs/issues/4330) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Speed up i256 division and remainder operations [\#4302](https://github.com/apache/arrow-rs/issues/4302) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- export function parquet\_to\_array\_schema\_and\_fields [\#4298](https://github.com/apache/arrow-rs/issues/4298) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- FLightSQL: add helpers to create `CommandGetCatalogs`, `CommandGetSchemas`, and `CommandGetTables` requests [\#4295](https://github.com/apache/arrow-rs/issues/4295) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Make ColumnWriter Send [\#4286](https://github.com/apache/arrow-rs/issues/4286) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add Builder for `FlightInfo` to make it easier to create new requests [\#4281](https://github.com/apache/arrow-rs/issues/4281) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support Writing/Reading Decimal256 to/from Parquet [\#4264](https://github.com/apache/arrow-rs/issues/4264) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- FlightSQL: Add helpers to create `CommandGetSqlInfo` responses \(`SqlInfoValue` and builders\) [\#4256](https://github.com/apache/arrow-rs/issues/4256) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Update flight-sql implementation to latest specs [\#4249](https://github.com/apache/arrow-rs/issues/4249) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Make ArrowArrayStreamReader Send [\#4222](https://github.com/apache/arrow-rs/issues/4222) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support writing FixedSizeList to Parquet [\#4214](https://github.com/apache/arrow-rs/issues/4214) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Cast between `Intervals` [\#4181](https://github.com/apache/arrow-rs/issues/4181) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Splice Parquet Data [\#4155](https://github.com/apache/arrow-rs/issues/4155) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- CSV Schema More Flexible Timestamp Inference [\#4131](https://github.com/apache/arrow-rs/issues/4131) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Doc for arrow\_flight::sql is missing enums that are Xdbc related [\#4339](https://github.com/apache/arrow-rs/issues/4339) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- concat\_batches panics with total\_len \<= bit\_len assertion for records with lists [\#4324](https://github.com/apache/arrow-rs/issues/4324) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Incorrect PageMetadata Row Count returned for V1 DataPage [\#4321](https://github.com/apache/arrow-rs/issues/4321) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- \[parquet\] Not following the spec for TIMESTAMP\_MILLIS legacy converted types [\#4308](https://github.com/apache/arrow-rs/issues/4308) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- ambiguous glob re-exports of contains\_utf8 [\#4289](https://github.com/apache/arrow-rs/issues/4289) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- flight\_sql\_client --header "key: value" yields a value with a leading whitespace [\#4270](https://github.com/apache/arrow-rs/issues/4270) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Casting Timestamp to date is off by one day for dates before 1970-01-01 [\#4211](https://github.com/apache/arrow-rs/issues/4211) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Don't infer 16-byte decimal as decimal256 [\#4349](https://github.com/apache/arrow-rs/pull/4349) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Fix MutableArrayData::extend\_nulls \(\#1230\) [\#4343](https://github.com/apache/arrow-rs/pull/4343) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update FlightSQL metadata locations, names and docs [\#4341](https://github.com/apache/arrow-rs/pull/4341) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- chore: expose Xdbc related FlightSQL enums [\#4340](https://github.com/apache/arrow-rs/pull/4340) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([appletreeisyellow](https://github.com/appletreeisyellow)) +- Update pyo3 requirement from 0.18 to 0.19 [\#4335](https://github.com/apache/arrow-rs/pull/4335) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Skip unnecessary null checks in MutableArrayData 
[\#4333](https://github.com/apache/arrow-rs/pull/4333) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: add read parquet by custom rowgroup examples [\#4332](https://github.com/apache/arrow-rs/pull/4332) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([sundy-li](https://github.com/sundy-li)) +- Make SerializedRowGroupReader::new\(\) public [\#4331](https://github.com/apache/arrow-rs/pull/4331) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([burmecia](https://github.com/burmecia)) +- Don't split record across pages \(\#3680\) [\#4327](https://github.com/apache/arrow-rs/pull/4327) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- fix date conversion if timestamp below unixtimestamp [\#4323](https://github.com/apache/arrow-rs/pull/4323) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) +- Short-circuit on exhausted page in skip\_records [\#4320](https://github.com/apache/arrow-rs/pull/4320) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Handle trailing padding when skipping repetition levels \(\#3911\) [\#4319](https://github.com/apache/arrow-rs/pull/4319) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Use `page_size` consistently, deprecate `pagesize` in parquet WriterProperties [\#4313](https://github.com/apache/arrow-rs/pull/4313) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Add roundtrip tests for Decimal256 and fix issues \(\#4264\) [\#4311](https://github.com/apache/arrow-rs/pull/4311) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Expose page-level arrow reader API \(\#4298\) [\#4307](https://github.com/apache/arrow-rs/pull/4307) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Speed up i256 division and remainder operations [\#4303](https://github.com/apache/arrow-rs/pull/4303) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- feat\(flight\): support int32\_to\_int32\_list\_map in sql infos [\#4300](https://github.com/apache/arrow-rs/pull/4300) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) +- feat\(flight\): add helpers to handle `CommandGetCatalogs`, `CommandGetSchemas`, and `CommandGetTables` requests [\#4296](https://github.com/apache/arrow-rs/pull/4296) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) +- Improve docs and tests for `SqlInfoList [\#4293](https://github.com/apache/arrow-rs/pull/4293) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- minor: fix arrow\_row docs.rs links [\#4292](https://github.com/apache/arrow-rs/pull/4292) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([roeap](https://github.com/roeap)) +- Update proc-macro2 requirement from =1.0.58 to =1.0.59 [\#4290](https://github.com/apache/arrow-rs/pull/4290) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Improve `ArrowWriter` memory usage: Buffer Pages in ArrowWriter instead of RecordBatch \(\#3871\) [\#4280](https://github.com/apache/arrow-rs/pull/4280) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Minor: Add more docstrings in arrow-flight [\#4279](https://github.com/apache/arrow-rs/pull/4279) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Add `Debug` impls for `ArrowWriter` and `SerializedFileWriter` [\#4278](https://github.com/apache/arrow-rs/pull/4278) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Expose `RecordBatchWriter` to `arrow` crate [\#4277](https://github.com/apache/arrow-rs/pull/4277) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) +- Update criterion requirement from 0.4 to 0.5 [\#4275](https://github.com/apache/arrow-rs/pull/4275) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add parquet-concat [\#4274](https://github.com/apache/arrow-rs/pull/4274) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Convert FixedSizeListArray to GenericListArray [\#4273](https://github.com/apache/arrow-rs/pull/4273) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: support 'Decimal256' for parquet [\#4272](https://github.com/apache/arrow-rs/pull/4272) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Weijun-H](https://github.com/Weijun-H)) +- Strip leading whitespace from flight\_sql\_client custom header values [\#4271](https://github.com/apache/arrow-rs/pull/4271) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([mkmik](https://github.com/mkmik)) +- Add Append Column API \(\#4155\) [\#4269](https://github.com/apache/arrow-rs/pull/4269) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Derive Default for WriterProperties [\#4268](https://github.com/apache/arrow-rs/pull/4268) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Parquet Reader/writer for fixed-size list arrays [\#4267](https://github.com/apache/arrow-rs/pull/4267) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dexterduck](https://github.com/dexterduck)) +- feat\(flight\): add sql-info helpers [\#4266](https://github.com/apache/arrow-rs/pull/4266) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) +- Convert parquet metadata back to builders [\#4265](https://github.com/apache/arrow-rs/pull/4265) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add constructors for FixedSize array types \(\#3879\) [\#4263](https://github.com/apache/arrow-rs/pull/4263) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Extract IPC ArrayReader struct [\#4259](https://github.com/apache/arrow-rs/pull/4259) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update object\_store requirement from 0.5 to 0.6 [\#4258](https://github.com/apache/arrow-rs/pull/4258) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support Absolute Timestamps in CSV Schema Inference \(\#4131\) [\#4217](https://github.com/apache/arrow-rs/pull/4217) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: cast between `Intervals` [\#4182](https://github.com/apache/arrow-rs/pull/4182) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) ## [40.0.0](https://github.com/apache/arrow-rs/tree/40.0.0) (2023-05-19) [Full Changelog](https://github.com/apache/arrow-rs/compare/39.0.0...40.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3620e86f1e49..22ae78b516e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,91 +19,85 @@ # Changelog -## [41.0.0](https://github.com/apache/arrow-rs/tree/41.0.0) (2023-06-02) +## [42.0.0](https://github.com/apache/arrow-rs/tree/42.0.0) (2023-06-16) -[Full Changelog](https://github.com/apache/arrow-rs/compare/40.0.0...41.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/41.0.0...42.0.0) **Breaking changes:** -- Rename list contains kernels to in\_list \(\#4289\) [\#4342](https://github.com/apache/arrow-rs/pull/4342) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Move BooleanBufferBuilder and NullBufferBuilder to arrow\_buffer [\#4338](https://github.com/apache/arrow-rs/pull/4338) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add separate row\_count and level\_count to PageMetadata \(\#4321\) [\#4326](https://github.com/apache/arrow-rs/pull/4326) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Treat legacy TIMSETAMP\_X converted types as UTC [\#4309](https://github.com/apache/arrow-rs/pull/4309) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([sergiimk](https://github.com/sergiimk)) -- Simplify parquet PageIterator [\#4306](https://github.com/apache/arrow-rs/pull/4306) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add Builder style APIs and docs for `FlightData`,` FlightInfo`, `FlightEndpoint`, `Locaation` and `Ticket` [\#4294](https://github.com/apache/arrow-rs/pull/4294) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Make GenericColumnWriter Send [\#4287](https://github.com/apache/arrow-rs/pull/4287) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- feat: update flight-sql to latest specs [\#4250](https://github.com/apache/arrow-rs/pull/4250) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) -- feat\(api!\): make ArrowArrayStreamReader Send 
[\#4232](https://github.com/apache/arrow-rs/pull/4232) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Remove 64-bit to 32-bit Cast from IPC Reader [\#4412](https://github.com/apache/arrow-rs/pull/4412) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ming08108](https://github.com/ming08108)) +- Truncate Min/Max values in the Column Index [\#4389](https://github.com/apache/arrow-rs/pull/4389) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([AdamGS](https://github.com/AdamGS)) +- feat\(flight\): harmonize server metadata APIs [\#4384](https://github.com/apache/arrow-rs/pull/4384) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) +- Move record delimiting into ColumnReader \(\#4365\) [\#4376](https://github.com/apache/arrow-rs/pull/4376) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Changed array\_to\_json\_array to take &dyn Array [\#4370](https://github.com/apache/arrow-rs/pull/4370) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dadepo](https://github.com/dadepo)) +- Make PrimitiveArray::with\_timezone consuming [\#4366](https://github.com/apache/arrow-rs/pull/4366) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Make SerializedRowGroupReader::new\(\) Public [\#4330](https://github.com/apache/arrow-rs/issues/4330) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Speed up i256 division and remainder operations [\#4302](https://github.com/apache/arrow-rs/issues/4302) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- export function parquet\_to\_array\_schema\_and\_fields [\#4298](https://github.com/apache/arrow-rs/issues/4298) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- FLightSQL: add helpers to create `CommandGetCatalogs`, `CommandGetSchemas`, and `CommandGetTables` requests [\#4295](https://github.com/apache/arrow-rs/issues/4295) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Make ColumnWriter Send [\#4286](https://github.com/apache/arrow-rs/issues/4286) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Add Builder for `FlightInfo` to make it easier to create new requests [\#4281](https://github.com/apache/arrow-rs/issues/4281) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Support Writing/Reading Decimal256 to/from Parquet [\#4264](https://github.com/apache/arrow-rs/issues/4264) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- FlightSQL: Add helpers to create `CommandGetSqlInfo` responses \(`SqlInfoValue` and builders\) [\#4256](https://github.com/apache/arrow-rs/issues/4256) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Update flight-sql implementation to latest specs [\#4249](https://github.com/apache/arrow-rs/issues/4249) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Make ArrowArrayStreamReader Send 
[\#4222](https://github.com/apache/arrow-rs/issues/4222) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support writing FixedSizeList to Parquet [\#4214](https://github.com/apache/arrow-rs/issues/4214) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Cast between `Intervals` [\#4181](https://github.com/apache/arrow-rs/issues/4181) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Splice Parquet Data [\#4155](https://github.com/apache/arrow-rs/issues/4155) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- CSV Schema More Flexible Timestamp Inference [\#4131](https://github.com/apache/arrow-rs/issues/4131) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add doc example of constructing a MapArray [\#4385](https://github.com/apache/arrow-rs/issues/4385) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `millisecond` and `microsecond` functions [\#4374](https://github.com/apache/arrow-rs/issues/4374) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Changed array\_to\_json\_array to take &dyn Array [\#4369](https://github.com/apache/arrow-rs/issues/4369) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- compute::ord kernel for getting min and max of two scalar/array values [\#4347](https://github.com/apache/arrow-rs/issues/4347) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Release 41.0.0 of arrow/arrow-flight/parquet/parquet-derive [\#4346](https://github.com/apache/arrow-rs/issues/4346) +- Refactor CAST tests to use new cast array syntax [\#4336](https://github.com/apache/arrow-rs/issues/4336) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- pass bytes directly to parquet's KeyValue [\#4317](https://github.com/apache/arrow-rs/issues/4317) +- PyArrow conversions could return TypeError if provided incorrect Python type [\#4312](https://github.com/apache/arrow-rs/issues/4312) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Have array\_to\_json\_array support Map [\#4297](https://github.com/apache/arrow-rs/issues/4297) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- FlightSQL: Add helpers to create `CommandGetXdbcTypeInfo` responses \(`XdbcInfoValue` and builders\) [\#4257](https://github.com/apache/arrow-rs/issues/4257) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Have array\_to\_json\_array support FixedSizeList [\#4248](https://github.com/apache/arrow-rs/issues/4248) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Truncate ColumnIndex ByteArray Statistics [\#4126](https://github.com/apache/arrow-rs/issues/4126) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Arrow compute kernel regards selection vector [\#4095](https://github.com/apache/arrow-rs/issues/4095) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- Doc for arrow\_flight::sql is missing enums that are Xdbc related [\#4339](https://github.com/apache/arrow-rs/issues/4339) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- concat\_batches panics with total\_len \<= bit\_len assertion for records with lists [\#4324](https://github.com/apache/arrow-rs/issues/4324) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Incorrect PageMetadata Row Count returned for V1 DataPage [\#4321](https://github.com/apache/arrow-rs/issues/4321) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- \[parquet\] Not following the spec for TIMESTAMP\_MILLIS legacy converted types [\#4308](https://github.com/apache/arrow-rs/issues/4308) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- ambiguous glob re-exports of contains\_utf8 [\#4289](https://github.com/apache/arrow-rs/issues/4289) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- flight\_sql\_client --header "key: value" yields a value with a leading whitespace [\#4270](https://github.com/apache/arrow-rs/issues/4270) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Casting Timestamp to date is off by one day for dates before 1970-01-01 [\#4211](https://github.com/apache/arrow-rs/issues/4211) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Wrongly calculated data compressed length in IPC writer [\#4410](https://github.com/apache/arrow-rs/issues/4410) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Take Kernel Handles Nullable Indices Incorrectly [\#4404](https://github.com/apache/arrow-rs/issues/4404) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- StructBuilder::new Doesn't Validate Builder DataTypes [\#4397](https://github.com/apache/arrow-rs/issues/4397) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet error: Not all children array length are the same! when using RowSelection to read a parquet file [\#4396](https://github.com/apache/arrow-rs/issues/4396) +- RecordReader::skip\_records Is Incorrect for Repeated Columns [\#4368](https://github.com/apache/arrow-rs/issues/4368) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- List-of-String Array panics in the presence of row filters [\#4365](https://github.com/apache/arrow-rs/issues/4365) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Fail to read block compressed gzip files with parquet-fromcsv [\#4173](https://github.com/apache/arrow-rs/issues/4173) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Closed issues:** + +- Have a parquet file not able to be deduped via arrow-rs, complains about Decimal precision? [\#4356](https://github.com/apache/arrow-rs/issues/4356) +- Question: Could we move `dict_id, dict_is_ordered` into DataType? 
[\#4325](https://github.com/apache/arrow-rs/issues/4325) **Merged pull requests:** -- Don't infer 16-byte decimal as decimal256 [\#4349](https://github.com/apache/arrow-rs/pull/4349) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Fix MutableArrayData::extend\_nulls \(\#1230\) [\#4343](https://github.com/apache/arrow-rs/pull/4343) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update FlightSQL metadata locations, names and docs [\#4341](https://github.com/apache/arrow-rs/pull/4341) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- chore: expose Xdbc related FlightSQL enums [\#4340](https://github.com/apache/arrow-rs/pull/4340) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([appletreeisyellow](https://github.com/appletreeisyellow)) -- Update pyo3 requirement from 0.18 to 0.19 [\#4335](https://github.com/apache/arrow-rs/pull/4335) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Skip unnecessary null checks in MutableArrayData [\#4333](https://github.com/apache/arrow-rs/pull/4333) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat: add read parquet by custom rowgroup examples [\#4332](https://github.com/apache/arrow-rs/pull/4332) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([sundy-li](https://github.com/sundy-li)) -- Make SerializedRowGroupReader::new\(\) public [\#4331](https://github.com/apache/arrow-rs/pull/4331) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([burmecia](https://github.com/burmecia)) -- Don't split record across pages \(\#3680\) [\#4327](https://github.com/apache/arrow-rs/pull/4327) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- fix date conversion if timestamp below unixtimestamp [\#4323](https://github.com/apache/arrow-rs/pull/4323) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([comphead](https://github.com/comphead)) -- Short-circuit on exhausted page in skip\_records [\#4320](https://github.com/apache/arrow-rs/pull/4320) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Handle trailing padding when skipping repetition levels \(\#3911\) [\#4319](https://github.com/apache/arrow-rs/pull/4319) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Use `page_size` consistently, deprecate `pagesize` in parquet WriterProperties [\#4313](https://github.com/apache/arrow-rs/pull/4313) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Add roundtrip tests for Decimal256 and fix issues \(\#4264\) [\#4311](https://github.com/apache/arrow-rs/pull/4311) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Expose page-level arrow reader API \(\#4298\) [\#4307](https://github.com/apache/arrow-rs/pull/4307) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Speed up i256 division and remainder operations [\#4303](https://github.com/apache/arrow-rs/pull/4303) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- feat\(flight\): support int32\_to\_int32\_list\_map in sql infos [\#4300](https://github.com/apache/arrow-rs/pull/4300) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) -- feat\(flight\): add helpers to handle `CommandGetCatalogs`, `CommandGetSchemas`, and `CommandGetTables` requests [\#4296](https://github.com/apache/arrow-rs/pull/4296) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) -- Improve docs and tests for `SqlInfoList [\#4293](https://github.com/apache/arrow-rs/pull/4293) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- minor: fix arrow\_row docs.rs links [\#4292](https://github.com/apache/arrow-rs/pull/4292) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([roeap](https://github.com/roeap)) -- Update proc-macro2 requirement from =1.0.58 to =1.0.59 [\#4290](https://github.com/apache/arrow-rs/pull/4290) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Improve `ArrowWriter` memory usage: Buffer Pages in ArrowWriter instead of RecordBatch \(\#3871\) [\#4280](https://github.com/apache/arrow-rs/pull/4280) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Minor: Add more docstrings in arrow-flight [\#4279](https://github.com/apache/arrow-rs/pull/4279) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Add `Debug` impls for `ArrowWriter` and `SerializedFileWriter` [\#4278](https://github.com/apache/arrow-rs/pull/4278) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Expose `RecordBatchWriter` to `arrow` crate [\#4277](https://github.com/apache/arrow-rs/pull/4277) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) -- Update criterion requirement from 0.4 to 0.5 [\#4275](https://github.com/apache/arrow-rs/pull/4275) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add parquet-concat [\#4274](https://github.com/apache/arrow-rs/pull/4274) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Convert FixedSizeListArray to GenericListArray [\#4273](https://github.com/apache/arrow-rs/pull/4273) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat: support 'Decimal256' for parquet [\#4272](https://github.com/apache/arrow-rs/pull/4272) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Weijun-H](https://github.com/Weijun-H)) -- Strip leading whitespace from flight\_sql\_client custom header values [\#4271](https://github.com/apache/arrow-rs/pull/4271) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([mkmik](https://github.com/mkmik)) -- Add Append Column API \(\#4155\) [\#4269](https://github.com/apache/arrow-rs/pull/4269) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Derive Default for WriterProperties [\#4268](https://github.com/apache/arrow-rs/pull/4268) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Parquet Reader/writer for fixed-size list arrays [\#4267](https://github.com/apache/arrow-rs/pull/4267) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dexterduck](https://github.com/dexterduck)) -- feat\(flight\): add sql-info helpers [\#4266](https://github.com/apache/arrow-rs/pull/4266) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) -- Convert parquet metadata back to builders [\#4265](https://github.com/apache/arrow-rs/pull/4265) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add constructors for FixedSize array types \(\#3879\) [\#4263](https://github.com/apache/arrow-rs/pull/4263) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Extract IPC ArrayReader struct [\#4259](https://github.com/apache/arrow-rs/pull/4259) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update object\_store requirement from 0.5 to 0.6 [\#4258](https://github.com/apache/arrow-rs/pull/4258) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Support Absolute Timestamps in CSV Schema Inference \(\#4131\) [\#4217](https://github.com/apache/arrow-rs/pull/4217) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat: cast between `Intervals` [\#4182](https://github.com/apache/arrow-rs/pull/4182) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Fix reading gzip file with multiple gzip headers in parquet-fromcsv. 
[\#4419](https://github.com/apache/arrow-rs/pull/4419) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([ghuls](https://github.com/ghuls)) +- Cleanup nullif kernel [\#4416](https://github.com/apache/arrow-rs/pull/4416) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix bug in IPC logic that determines if the buffer should be compressed or not [\#4411](https://github.com/apache/arrow-rs/pull/4411) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([lwpyr](https://github.com/lwpyr)) +- Faster unpacking of Int32Type dictionary [\#4406](https://github.com/apache/arrow-rs/pull/4406) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve `take` kernel performance on primitive arrays, fix bad null index handling \(\#4404\) [\#4405](https://github.com/apache/arrow-rs/pull/4405) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- More take benchmarks [\#4403](https://github.com/apache/arrow-rs/pull/4403) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add `BooleanBuffer::new_unset` and `BooleanBuffer::new_set` and `BooleanArray::new_null` constructors [\#4402](https://github.com/apache/arrow-rs/pull/4402) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add PrimitiveBuilder type constructors [\#4401](https://github.com/apache/arrow-rs/pull/4401) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- StructBuilder Validate Child Data \(\#4397\) [\#4400](https://github.com/apache/arrow-rs/pull/4400) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster UTF-8 truncation [\#4399](https://github.com/apache/arrow-rs/pull/4399) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Minor: Derive `Hash` impls for `CastOptions` and `FormatOptions` [\#4395](https://github.com/apache/arrow-rs/pull/4395) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Fix typo in README [\#4394](https://github.com/apache/arrow-rs/pull/4394) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([okue](https://github.com/okue)) +- Improve parquet `WriterProperites` and `ReaderProperties` docs [\#4392](https://github.com/apache/arrow-rs/pull/4392) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Cleanup downcast macros [\#4391](https://github.com/apache/arrow-rs/pull/4391) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.59 to =1.0.60 [\#4388](https://github.com/apache/arrow-rs/pull/4388) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Consolidate ByteArray::from\_iterator [\#4386](https://github.com/apache/arrow-rs/pull/4386) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add MapArray constructors and doc example [\#4382](https://github.com/apache/arrow-rs/pull/4382) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Documentation Improvements [\#4381](https://github.com/apache/arrow-rs/pull/4381) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add NullBuffer and BooleanBuffer From conversions [\#4380](https://github.com/apache/arrow-rs/pull/4380) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add more examples of constructing Boolean, Primitive, String, and Decimal Arrays, and From impl for i256 [\#4379](https://github.com/apache/arrow-rs/pull/4379) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add ListArrayReader benchmarks [\#4378](https://github.com/apache/arrow-rs/pull/4378) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update comfy-table requirement from 6.0 to 7.0 [\#4377](https://github.com/apache/arrow-rs/pull/4377) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat: Add`microsecond` and `millisecond` kernels [\#4375](https://github.com/apache/arrow-rs/pull/4375) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Update hashbrown requirement from 0.13 to 0.14 [\#4373](https://github.com/apache/arrow-rs/pull/4373) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- minor: use as\_boolean to resolve TODO [\#4367](https://github.com/apache/arrow-rs/pull/4367) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Have array\_to\_json\_array support MapArray [\#4364](https://github.com/apache/arrow-rs/pull/4364) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dadepo](https://github.com/dadepo)) +- deprecate: as\_decimal\_array [\#4363](https://github.com/apache/arrow-rs/pull/4363) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Add support for FixedSizeList in array\_to\_json\_array [\#4361](https://github.com/apache/arrow-rs/pull/4361) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dadepo](https://github.com/dadepo)) +- refact: use as\_primitive in cast.rs test [\#4360](https://github.com/apache/arrow-rs/pull/4360) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- feat\(flight\): add xdbc type info helpers [\#4359](https://github.com/apache/arrow-rs/pull/4359) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) +- Minor: float16 to json [\#4358](https://github.com/apache/arrow-rs/pull/4358) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Raise TypeError on PyArrow import [\#4316](https://github.com/apache/arrow-rs/pull/4316) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Arrow Cast: Fixed Point Arithmetic for Interval Parsing [\#4291](https://github.com/apache/arrow-rs/pull/4291) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mr-brobot](https://github.com/mr-brobot)) diff --git a/Cargo.toml 
b/Cargo.toml index bca0f70ef339..0b67ed91b1f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ exclude = [ ] [workspace.package] -version = "41.0.0" +version = "42.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -76,18 +76,18 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "41.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "41.0.0", path = "./arrow-arith" } -arrow-array = { version = "41.0.0", path = "./arrow-array" } -arrow-buffer = { version = "41.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "41.0.0", path = "./arrow-cast" } -arrow-csv = { version = "41.0.0", path = "./arrow-csv" } -arrow-data = { version = "41.0.0", path = "./arrow-data" } -arrow-ipc = { version = "41.0.0", path = "./arrow-ipc" } -arrow-json = { version = "41.0.0", path = "./arrow-json" } -arrow-ord = { version = "41.0.0", path = "./arrow-ord" } -arrow-row = { version = "41.0.0", path = "./arrow-row" } -arrow-schema = { version = "41.0.0", path = "./arrow-schema" } -arrow-select = { version = "41.0.0", path = "./arrow-select" } -arrow-string = { version = "41.0.0", path = "./arrow-string" } -parquet = { version = "41.0.0", path = "./parquet", default-features = false } +arrow = { version = "42.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "42.0.0", path = "./arrow-arith" } +arrow-array = { version = "42.0.0", path = "./arrow-array" } +arrow-buffer = { version = "42.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "42.0.0", path = "./arrow-cast" } +arrow-csv = { version = "42.0.0", path = "./arrow-csv" } +arrow-data = { version = "42.0.0", path = "./arrow-data" } +arrow-ipc = { version = "42.0.0", path = "./arrow-ipc" } +arrow-json = { version = "42.0.0", path = "./arrow-json" } +arrow-ord = { version = "42.0.0", path = "./arrow-ord" } +arrow-row = { version = "42.0.0", path = "./arrow-row" } +arrow-schema = { version = "42.0.0", path = "./arrow-schema" } +arrow-select = { version = "42.0.0", path = "./arrow-select" } +arrow-string = { version = "42.0.0", path = "./arrow-string" } +parquet = { version = "42.0.0", path = "./parquet", default-features = false } diff --git a/dev/release/README.md b/dev/release/README.md index 8c699d16374f..30b3a4a8a569 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -73,12 +73,9 @@ git checkout -b sed -i '' -e 's/14.0.0/39.0.0/g' `find . 
-name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' -# Copy the content of CHANGELOG.md to the beginning of CHANGELOG-old.md - # ensure your github token is available export ARROW_GITHUB_API_TOKEN= - # manually edit ./dev/release/update_change_log.sh to reflect the release version # create the changelog ./dev/release/update_change_log.sh diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 7881ad02c06e..0833c66c428d 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="40.0.0" -FUTURE_RELEASE="41.0.0" +SINCE_TAG="41.0.0" +FUTURE_RELEASE="42.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 15e0e76bfb6500799a43991f1339e69464c513f8 Mon Sep 17 00:00:00 2001 From: ming08108 Date: Sat, 17 Jun 2023 14:55:52 -0500 Subject: [PATCH 1020/1411] fix float cast (#4427) Co-authored-by: Guoming Yang --- arrow-ipc/src/reader.rs | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 92a7a0dcc318..0908d580d59a 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -29,7 +29,6 @@ use std::sync::Arc; use arrow_array::*; use arrow_buffer::{Buffer, MutableBuffer}; -use arrow_cast::cast; use arrow_data::ArrayData; use arrow_schema::*; @@ -248,6 +247,7 @@ fn create_primitive_array( | Boolean | Int64 | UInt64 + | Float32 | Float64 | Time64(_) | Timestamp(_, _) @@ -260,25 +260,6 @@ fn create_primitive_array( .null_bit_buffer(null_buffer) .build()? } - Float32 => { - if buffers[1].len() / 8 == length && length != 1 { - // interpret as a f64, and cast appropriately - let data = ArrayData::builder(DataType::Float64) - .len(length) - .add_buffer(buffers[1].clone()) - .null_bit_buffer(null_buffer) - .build()?; - let values = Arc::new(Float64Array::from(data)) as ArrayRef; - let casted = cast(&values, data_type)?; - casted.into_data() - } else { - ArrayData::builder(data_type.clone()) - .len(length) - .add_buffer(buffers[1].clone()) - .null_bit_buffer(null_buffer) - .build()? 
- } - } Interval(IntervalUnit::MonthDayNano) | Decimal128(_, _) => { let buffer = get_aligned_buffer::(&buffers[1], length); From c41dc7f204087045343449cea4382f7e936e8ee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Brochet?= Date: Wed, 21 Jun 2023 00:52:26 +0200 Subject: [PATCH 1021/1411] feat: add strict mode to json reader (#4421) When strict mode is enabled, the parser will return an error if it encounters a column not present in the schema --- arrow-json/src/reader/list_array.rs | 2 + arrow-json/src/reader/map_array.rs | 3 + arrow-json/src/reader/mod.rs | 148 ++++++++++++++++++++++---- arrow-json/src/reader/struct_array.rs | 24 ++++- 4 files changed, 152 insertions(+), 25 deletions(-) diff --git a/arrow-json/src/reader/list_array.rs b/arrow-json/src/reader/list_array.rs index ad27eb516fab..d6f7670f2dc9 100644 --- a/arrow-json/src/reader/list_array.rs +++ b/arrow-json/src/reader/list_array.rs @@ -35,6 +35,7 @@ impl ListArrayDecoder { pub fn new( data_type: DataType, coerce_primitive: bool, + strict_mode: bool, is_nullable: bool, ) -> Result { let field = match &data_type { @@ -45,6 +46,7 @@ impl ListArrayDecoder { let decoder = make_decoder( field.data_type().clone(), coerce_primitive, + strict_mode, field.is_nullable(), )?; diff --git a/arrow-json/src/reader/map_array.rs b/arrow-json/src/reader/map_array.rs index 2d6fde34d433..a1f7e5ace66e 100644 --- a/arrow-json/src/reader/map_array.rs +++ b/arrow-json/src/reader/map_array.rs @@ -34,6 +34,7 @@ impl MapArrayDecoder { pub fn new( data_type: DataType, coerce_primitive: bool, + strict_mode: bool, is_nullable: bool, ) -> Result { let fields = match &data_type { @@ -56,11 +57,13 @@ impl MapArrayDecoder { let keys = make_decoder( fields[0].data_type().clone(), coerce_primitive, + strict_mode, fields[0].is_nullable(), )?; let values = make_decoder( fields[1].data_type().clone(), coerce_primitive, + strict_mode, fields[1].is_nullable(), )?; diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index dd58e1e1a4d9..4e98e2fd873a 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -170,6 +170,7 @@ mod timestamp_array; pub struct ReaderBuilder { batch_size: usize, coerce_primitive: bool, + strict_mode: bool, schema: SchemaRef, } @@ -179,13 +180,15 @@ impl ReaderBuilder { /// /// This could be obtained using [`infer_json_schema`] if not known /// - /// Any columns not present in `schema` will be ignored + /// Any columns not present in `schema` will be ignored, unless `strict_mode` is set to true. + /// In this case, an error is returned when a column is missing from `schema`. /// /// [`infer_json_schema`]: crate::reader::infer_json_schema pub fn new(schema: SchemaRef) -> Self { Self { batch_size: 1024, coerce_primitive: false, + strict_mode: false, schema, } } @@ -211,6 +214,15 @@ impl ReaderBuilder { } } + /// Sets if the decoder should return an error if it encounters a column not present + /// in `schema` + pub fn with_strict_mode(self, strict_mode: bool) -> Self { + Self { + strict_mode, + ..self + } + } + /// Create a [`Reader`] with the provided [`BufRead`] pub fn build(self, reader: R) -> Result, ArrowError> { Ok(Reader { @@ -224,6 +236,7 @@ impl ReaderBuilder { let decoder = make_decoder( DataType::Struct(self.schema.fields.clone()), self.coerce_primitive, + self.strict_mode, false, )?; let num_fields = self.schema.all_fields().len(); @@ -586,6 +599,7 @@ macro_rules! 
primitive_decoder { fn make_decoder( data_type: DataType, coerce_primitive: bool, + strict_mode: bool, is_nullable: bool, ) -> Result, ArrowError> { downcast_integer! { @@ -633,13 +647,13 @@ fn make_decoder( DataType::Boolean => Ok(Box::::default()), DataType::Utf8 => Ok(Box::new(StringArrayDecoder::::new(coerce_primitive))), DataType::LargeUtf8 => Ok(Box::new(StringArrayDecoder::::new(coerce_primitive))), - DataType::List(_) => Ok(Box::new(ListArrayDecoder::::new(data_type, coerce_primitive, is_nullable)?)), - DataType::LargeList(_) => Ok(Box::new(ListArrayDecoder::::new(data_type, coerce_primitive, is_nullable)?)), - DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(data_type, coerce_primitive, is_nullable)?)), + DataType::List(_) => Ok(Box::new(ListArrayDecoder::::new(data_type, coerce_primitive, strict_mode, is_nullable)?)), + DataType::LargeList(_) => Ok(Box::new(ListArrayDecoder::::new(data_type, coerce_primitive, strict_mode, is_nullable)?)), + DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(data_type, coerce_primitive, strict_mode, is_nullable)?)), DataType::Binary | DataType::LargeBinary | DataType::FixedSizeBinary(_) => { Err(ArrowError::JsonError(format!("{data_type} is not supported by JSON"))) } - DataType::Map(_, _) => Ok(Box::new(MapArrayDecoder::new(data_type, coerce_primitive, is_nullable)?)), + DataType::Map(_, _) => Ok(Box::new(MapArrayDecoder::new(data_type, coerce_primitive, strict_mode, is_nullable)?)), d => Err(ArrowError::NotYetImplemented(format!("Support for {d} in JSON reader"))) } } @@ -670,6 +684,7 @@ mod tests { buf: &str, batch_size: usize, coerce_primitive: bool, + strict_mode: bool, schema: SchemaRef, ) -> Vec { let mut unbuffered = vec![]; @@ -693,6 +708,7 @@ mod tests { let buffered = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) .with_coerce_primitive(coerce_primitive) + .with_strict_mode(strict_mode) .build(BufReader::with_capacity(b, Cursor::new(buf.as_bytes()))) .unwrap() .collect::, _>>() @@ -724,7 +740,7 @@ mod tests { Field::new("e", DataType::Date64, true), ])); - let batches = do_read(buf, 1024, false, schema); + let batches = do_read(buf, 1024, false, false, schema); assert_eq!(batches.len(), 1); let col1 = batches[0].column(0).as_primitive::(); @@ -763,7 +779,7 @@ mod tests { {"a": "1", "b": "2"} {"a": "hello", "b": "shoo"} {"b": "\t😁foo", "a": "\nfoobar\ud83d\ude00\u0061\u0073\u0066\u0067\u00FF"} - + {"b": null} {"b": "", "a": null} @@ -773,7 +789,7 @@ mod tests { Field::new("b", DataType::LargeUtf8, true), ])); - let batches = do_read(buf, 1024, false, schema); + let batches = do_read(buf, 1024, false, false, schema); assert_eq!(batches.len(), 1); let col1 = batches[0].column(0).as_string::(); @@ -826,7 +842,7 @@ mod tests { ), ])); - let batches = do_read(buf, 1024, false, schema); + let batches = do_read(buf, 1024, false, false, schema); assert_eq!(batches.len(), 1); let list = batches[0].column(0).as_list::(); @@ -895,7 +911,7 @@ mod tests { ), ])); - let batches = do_read(buf, 1024, false, schema); + let batches = do_read(buf, 1024, false, false, schema); assert_eq!(batches.len(), 1); let nested = batches[0].column(0).as_struct(); @@ -941,7 +957,7 @@ mod tests { let schema = Arc::new(Schema::new(vec![map])); - let batches = do_read(buf, 1024, false, schema); + let batches = do_read(buf, 1024, false, false, schema); assert_eq!(batches.len(), 1); let map = batches[0].column(0).as_map(); @@ -1015,7 +1031,7 @@ mod tests { Field::new("c", DataType::Utf8, true), ])); - let batches = do_read(buf, 1024, true, 
schema); + let batches = do_read(buf, 1024, true, false, schema); assert_eq!(batches.len(), 1); let col1 = batches[0].column(0).as_string::(); @@ -1063,7 +1079,7 @@ mod tests { Field::new("c", data_type, true), ])); - let batches = do_read(buf, 1024, true, schema); + let batches = do_read(buf, 1024, true, false, schema); assert_eq!(batches.len(), 1); let col1 = batches[0].column(0).as_primitive::(); @@ -1121,7 +1137,7 @@ mod tests { Field::new("d", with_timezone, true), ])); - let batches = do_read(buf, 1024, true, schema); + let batches = do_read(buf, 1024, true, false, schema); assert_eq!(batches.len(), 1); let unit_in_nanos: i64 = match T::UNIT { @@ -1221,7 +1237,7 @@ mod tests { Field::new("c", T::DATA_TYPE, true), ])); - let batches = do_read(buf, 1024, true, schema); + let batches = do_read(buf, 1024, true, false, schema); assert_eq!(batches.len(), 1); let col1 = batches[0].column(0).as_primitive::(); @@ -1298,7 +1314,7 @@ mod tests { ), ])); - let batches = do_read(json, 1024, true, schema); + let batches = do_read(json, 1024, true, false, schema); assert_eq!(batches.len(), 1); let s: StructArray = batches.into_iter().next().unwrap().into(); @@ -1373,7 +1389,7 @@ mod tests { Field::new("u64", DataType::UInt64, true), ])); - let batches = do_read(buf, 1024, true, schema); + let batches = do_read(buf, 1024, true, false, schema); assert_eq!(batches.len(), 1); let i64 = batches[0].column(0).as_primitive::(); @@ -1397,7 +1413,7 @@ mod tests { true, )])); - let batches = do_read(buf, 1024, true, schema); + let batches = do_read(buf, 1024, true, false, schema); assert_eq!(batches.len(), 1); let i64 = batches[0] @@ -1406,6 +1422,98 @@ mod tests { assert_eq!(i64.values(), &[i64::MAX, i64::MIN, 900000]); } + #[test] + fn test_strict_mode_no_missing_columns_in_schema() { + let buf = r#" + {"a": 1, "b": "2", "c": true} + {"a": 2E0, "b": "4", "c": false} + "#; + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int16, false), + Field::new("b", DataType::Utf8, false), + Field::new("c", DataType::Boolean, false), + ])); + + let batches = do_read(buf, 1024, true, true, schema); + assert_eq!(batches.len(), 1); + + let buf = r#" + {"a": 1, "b": "2", "c": {"a": true, "b": 1}} + {"a": 2E0, "b": "4", "c": {"a": false, "b": 2}} + "#; + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int16, false), + Field::new("b", DataType::Utf8, false), + Field::new_struct( + "c", + vec![ + Field::new("a", DataType::Boolean, false), + Field::new("b", DataType::Int16, false), + ], + false, + ), + ])); + + let batches = do_read(buf, 1024, true, true, schema); + assert_eq!(batches.len(), 1); + } + + #[test] + fn test_strict_mode_missing_columns_in_schema() { + let buf = r#" + {"a": 1, "b": "2", "c": true} + {"a": 2E0, "b": "4", "c": false} + "#; + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int16, true), + Field::new("c", DataType::Boolean, true), + ])); + + let err = ReaderBuilder::new(schema) + .with_batch_size(1024) + .with_strict_mode(true) + .build(Cursor::new(buf.as_bytes())) + .unwrap() + .read() + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Json error: column 'b' missing from schema" + ); + + let buf = r#" + {"a": 1, "b": "2", "c": {"a": true, "b": 1}} + {"a": 2E0, "b": "4", "c": {"a": false, "b": 2}} + "#; + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int16, false), + Field::new("b", DataType::Utf8, false), + Field::new_struct( + "c", + vec![Field::new("a", DataType::Boolean, false)], + false, + ), + ])); + + 
let err = ReaderBuilder::new(schema) + .with_batch_size(1024) + .with_strict_mode(true) + .build(Cursor::new(buf.as_bytes())) + .unwrap() + .read() + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Json error: whilst decoding field 'c': column 'b' missing from schema" + ); + } + fn read_file(path: &str, schema: Option) -> Reader> { let file = File::open(path).unwrap(); let mut reader = BufReader::new(file); @@ -1628,7 +1736,7 @@ mod tests { true, )])); - let batches = do_read(json_content, 1024, false, schema); + let batches = do_read(json_content, 1024, false, false, schema); assert_eq!(batches.len(), 1); let col1 = batches[0].column(0).as_list::(); @@ -1656,7 +1764,7 @@ mod tests { true, )])); - let batches = do_read(json_content, 1024, false, schema); + let batches = do_read(json_content, 1024, false, false, schema); assert_eq!(batches.len(), 1); let col1 = batches[0].column(0).as_list::(); diff --git a/arrow-json/src/reader/struct_array.rs b/arrow-json/src/reader/struct_array.rs index 3d24a927d85c..77d7e170d07c 100644 --- a/arrow-json/src/reader/struct_array.rs +++ b/arrow-json/src/reader/struct_array.rs @@ -25,6 +25,7 @@ use arrow_schema::{ArrowError, DataType, Fields}; pub struct StructArrayDecoder { data_type: DataType, decoders: Vec>, + strict_mode: bool, is_nullable: bool, } @@ -32,6 +33,7 @@ impl StructArrayDecoder { pub fn new( data_type: DataType, coerce_primitive: bool, + strict_mode: bool, is_nullable: bool, ) -> Result { let decoders = struct_fields(&data_type) @@ -41,13 +43,19 @@ impl StructArrayDecoder { // StructArrayDecoder::decode verifies that if the child is not nullable // it doesn't contain any nulls not masked by its parent let nullable = f.is_nullable() || is_nullable; - make_decoder(f.data_type().clone(), coerce_primitive, nullable) + make_decoder( + f.data_type().clone(), + coerce_primitive, + strict_mode, + nullable, + ) }) .collect::, ArrowError>>()?; Ok(Self { data_type, decoders, + strict_mode, is_nullable, }) } @@ -86,10 +94,16 @@ impl ArrayDecoder for StructArrayDecoder { }; // Update child pos if match found - if let Some(field_idx) = - fields.iter().position(|x| x.name() == field_name) - { - child_pos[field_idx][row] = cur_idx + 1; + match fields.iter().position(|x| x.name() == field_name) { + Some(field_idx) => child_pos[field_idx][row] = cur_idx + 1, + None => { + if self.strict_mode { + return Err(ArrowError::JsonError(format!( + "column '{}' missing from schema", + field_name + ))); + } + } } // Advance to next field From 0bcf200ce22bd9a767dbe4c0fefcda6176e66f6e Mon Sep 17 00:00:00 2001 From: Vaibhav Rabber Date: Wed, 21 Jun 2023 04:23:14 +0530 Subject: [PATCH 1022/1411] gcp: Exclude authorization header when bearer empty (#4418) GCP tries to authorize when there's the authorization header. If the bearer is empty, exclude the header since this doesn't let us get a public object. 
Signed-off-by: Vaibhav --- object_store/src/gcp/mod.rs | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 7b1127354ccb..d4d370373d0d 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -394,16 +394,19 @@ impl GetClient for GoogleCloudStorageClient { false => Method::GET, }; - let response = self - .client - .request(method, url) - .bearer_auth(&credential.bearer) - .with_get_options(options) - .send_retry(&self.retry_config) - .await - .context(GetRequestSnafu { - path: path.as_ref(), - })?; + let mut request = self.client.request(method, url).with_get_options(options); + + if !credential.bearer.is_empty() { + request = request.bearer_auth(&credential.bearer); + } + + let response = + request + .send_retry(&self.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })?; Ok(response) } From 4b6c4d23478a1548e269f5eb9fbd53a9140e9261 Mon Sep 17 00:00:00 2001 From: jakevin Date: Thu, 22 Jun 2023 22:47:03 +0800 Subject: [PATCH 1023/1411] minor: remove useless mut (#4443) --- arrow-buffer/src/util/bit_util.rs | 2 +- arrow-select/src/take.rs | 6 +++--- parquet/src/schema/types.rs | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arrow-buffer/src/util/bit_util.rs b/arrow-buffer/src/util/bit_util.rs index de4bc96f9daf..b27931f4cc85 100644 --- a/arrow-buffer/src/util/bit_util.rs +++ b/arrow-buffer/src/util/bit_util.rs @@ -278,7 +278,7 @@ mod tests { } #[test] - #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")))] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn test_ceil() { assert_eq!(ceil(0, 1), 0); assert_eq!(ceil(1, 1), 1); diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 4d599369ca27..0f5689ff9990 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -30,7 +30,7 @@ use arrow_buffer::{ use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, FieldRef}; -use num::Zero; +use num::{One, Zero}; /// Take elements by index from [Array], creating a new [Array] from those indexes. /// @@ -623,7 +623,7 @@ fn take_value_indices_from_list( where IndexType: ArrowPrimitiveType, OffsetType: ArrowPrimitiveType, - OffsetType::Native: OffsetSizeTrait + std::ops::Add + num::Zero + num::One, + OffsetType::Native: OffsetSizeTrait + std::ops::Add + Zero + One, PrimitiveArray: From>, { // TODO: benchmark this function, there might be a faster unsafe alternative @@ -656,7 +656,7 @@ where // if start == end, this slot is empty while curr < end { values.push(curr); - curr += num::One::one(); + curr += One::one(); } if !list.is_valid(ix) { bit_util::unset_bit(null_slice, i); diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 151f2b69f31e..fd22cedeacaa 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -2085,7 +2085,7 @@ mod tests { let expected_schema = parse_message_type(message_type).unwrap(); let mut thrift_schema = to_thrift(&expected_schema).unwrap(); // Change all of None to Some(0) - for mut elem in &mut thrift_schema[..] { + for elem in &mut thrift_schema[..] 
{ if elem.num_children.is_none() { elem.num_children = Some(0); } From 6e975a4d62699996061fff0e9e1971ea5dd40ddd Mon Sep 17 00:00:00 2001 From: Ze'ev Maor Date: Thu, 22 Jun 2023 18:15:15 +0300 Subject: [PATCH 1024/1411] cleanup some `unwrap`() into proper Result propagation, impl `PartialEq` for `ParquetError` (#4428) * cleanup some unwrap() into proper Result propagation * cleanup some unwrap() into proper Result propagation * cleanup some unwrap() into proper Result propagation * cleanup some unwrap() into proper Result propagation * Fix doc examples * fix parquet-read compilation --------- Co-authored-by: Ze'ev Maor Co-authored-by: Andrew Lamb --- parquet/src/bin/parquet-read.rs | 4 +- parquet/src/errors.rs | 17 +++++ parquet/src/file/mod.rs | 2 +- parquet/src/file/serialized_reader.rs | 6 +- parquet/src/file/writer.rs | 2 +- parquet/src/record/api.rs | 41 +++++++---- parquet/src/record/reader.rs | 98 +++++++++++++++------------ parquet/src/record/triplet.rs | 15 ++-- 8 files changed, 113 insertions(+), 72 deletions(-) diff --git a/parquet/src/bin/parquet-read.rs b/parquet/src/bin/parquet-read.rs index a8a835ab870d..392697e6c619 100644 --- a/parquet/src/bin/parquet-read.rs +++ b/parquet/src/bin/parquet-read.rs @@ -91,9 +91,9 @@ fn main() { while all_records || start < end { match iter.next() { - Some(row) => print_row(&row, json), + Some(row) => print_row(&row.unwrap(), json), None => break, - } + }; start += 1; } } diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index 62f7656f14b5..f9e3d17c92dd 100644 --- a/parquet/src/errors.rs +++ b/parquet/src/errors.rs @@ -44,6 +44,23 @@ pub enum ParquetError { External(Box), } +impl PartialEq for ParquetError { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::General(l0), Self::General(r0)) => l0 == r0, + (Self::NYI(l0), Self::NYI(r0)) => l0 == r0, + (Self::EOF(l0), Self::EOF(r0)) => l0 == r0, + #[cfg(feature = "arrow")] + (Self::ArrowError(l0), Self::ArrowError(r0)) => l0 == r0, + (Self::IndexOutOfBound(l0, l1), Self::IndexOutOfBound(r0, r1)) => { + l0 == r0 && l1 == r1 + } + (Self::External(l0), Self::External(r0)) => l0.to_string() == r0.to_string(), + _ => false, + } + } +} + impl std::fmt::Display for ParquetError { fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { match &self { diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs index fffe383c57ae..c20fd38c7f8b 100644 --- a/parquet/src/file/mod.rs +++ b/parquet/src/file/mod.rs @@ -91,7 +91,7 @@ //! .flat_map(|r| r.into_iter()); //! //! for row in rows { -//! println!("{}", row); +//! println!("{}", row.unwrap()); //! } //! ``` pub mod footer; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 2ed9b1653fdd..d0e5420a1030 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -79,7 +79,7 @@ impl<'a> TryFrom<&'a str> for SerializedFileReader { /// Conversion into a [`RowIter`](crate::record::reader::RowIter) /// using the full file schema over all row groups. 
impl IntoIterator for SerializedFileReader { - type Item = Row; + type Item = Result; type IntoIter = RowIter<'static>; fn into_iter(self) -> Self::IntoIter { @@ -854,7 +854,7 @@ mod tests { .iter() .map(|p| SerializedFileReader::try_from(p.as_path()).unwrap()) .flat_map(|r| r.into_iter()) - .flat_map(|r| r.get_int(0)) + .flat_map(|r| r.unwrap().get_int(0)) .collect::>(); // rows in the parquet file are not sorted by "id" @@ -874,7 +874,7 @@ mod tests { r.into_iter().project(proj).unwrap() }) - .map(|r| format!("{r}")) + .map(|r| format!("{}", r.unwrap())) .collect::>() .join(","); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 15240e33c514..bde350a1ea42 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1337,7 +1337,7 @@ mod tests { for (i, item) in data.iter().enumerate().take(reader.num_row_groups()) { let row_group_reader = reader.get_row_group(i).unwrap(); let iter = row_group_reader.get_row_iter(None).unwrap(); - let res: Vec<_> = iter.map(&value).collect(); + let res: Vec<_> = iter.map(|row| row.unwrap()).map(&value).collect(); let row_group_size = row_group_reader.metadata().total_byte_size(); let uncompressed_size: i64 = row_group_reader .metadata() diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index 1809e3ace889..ccff233c21db 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -66,7 +66,7 @@ impl Row { /// /// let file = File::open("/path/to/file").unwrap(); /// let reader = SerializedFileReader::new(file).unwrap(); - /// let row: Row = reader.get_row_iter(None).unwrap().next().unwrap(); + /// let row: Row = reader.get_row_iter(None).unwrap().next().unwrap().unwrap(); /// for (idx, (name, field)) in row.get_column_iter().enumerate() { /// println!("column index: {}, column name: {}, column value: {}", idx, name, field); /// } @@ -146,7 +146,7 @@ pub trait RowAccessor { /// /// if let Ok(file) = File::open(&Path::new("test.parquet")) { /// let reader = SerializedFileReader::new(file).unwrap(); -/// let row = reader.get_row_iter(None).unwrap().next().unwrap(); +/// let row = reader.get_row_iter(None).unwrap().next().unwrap().unwrap(); /// println!("column 0: {}, column 1: {}", row.fmt(0), row.fmt(1)); /// } /// ``` @@ -639,11 +639,17 @@ impl Field { /// Converts Parquet BYTE_ARRAY type with converted type into either UTF8 string or /// array of bytes. #[inline] - pub fn convert_byte_array(descr: &ColumnDescPtr, value: ByteArray) -> Self { - match descr.physical_type() { + pub fn convert_byte_array(descr: &ColumnDescPtr, value: ByteArray) -> Result { + let field = match descr.physical_type() { PhysicalType::BYTE_ARRAY => match descr.converted_type() { ConvertedType::UTF8 | ConvertedType::ENUM | ConvertedType::JSON => { - let value = String::from_utf8(value.data().to_vec()).unwrap(); + let value = + String::from_utf8(value.data().to_vec()).map_err(|e| { + general_err!( + "Error reading BYTE_ARRAY as String. 
Bytes: {:?} Error: {:?}", + value.data(), e + ) + })?; Field::Str(value) } ConvertedType::BSON | ConvertedType::NONE => Field::Bytes(value), @@ -664,7 +670,8 @@ impl Field { _ => nyi!(descr, value), }, _ => nyi!(descr, value), - } + }; + Ok(field) } #[cfg(any(feature = "json", test))] @@ -1020,38 +1027,41 @@ mod tests { let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::UTF8]; let value = ByteArray::from(vec![b'A', b'B', b'C', b'D']); let row = Field::convert_byte_array(&descr, value); - assert_eq!(row, Field::Str("ABCD".to_string())); + assert_eq!(row.unwrap(), Field::Str("ABCD".to_string())); // ENUM let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::ENUM]; let value = ByteArray::from(vec![b'1', b'2', b'3']); let row = Field::convert_byte_array(&descr, value); - assert_eq!(row, Field::Str("123".to_string())); + assert_eq!(row.unwrap(), Field::Str("123".to_string())); // JSON let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::JSON]; let value = ByteArray::from(vec![b'{', b'"', b'a', b'"', b':', b'1', b'}']); let row = Field::convert_byte_array(&descr, value); - assert_eq!(row, Field::Str("{\"a\":1}".to_string())); + assert_eq!(row.unwrap(), Field::Str("{\"a\":1}".to_string())); // NONE let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::NONE]; let value = ByteArray::from(vec![1, 2, 3, 4, 5]); let row = Field::convert_byte_array(&descr, value.clone()); - assert_eq!(row, Field::Bytes(value)); + assert_eq!(row.unwrap(), Field::Bytes(value)); // BSON let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::BSON]; let value = ByteArray::from(vec![1, 2, 3, 4, 5]); let row = Field::convert_byte_array(&descr, value.clone()); - assert_eq!(row, Field::Bytes(value)); + assert_eq!(row.unwrap(), Field::Bytes(value)); // DECIMAL let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::DECIMAL, 0, 8, 2]; let value = ByteArray::from(vec![207, 200]); let row = Field::convert_byte_array(&descr, value.clone()); - assert_eq!(row, Field::Decimal(Decimal::from_bytes(value, 8, 2))); + assert_eq!( + row.unwrap(), + Field::Decimal(Decimal::from_bytes(value, 8, 2)) + ); // DECIMAL (FIXED_LEN_BYTE_ARRAY) let descr = make_column_descr![ @@ -1063,7 +1073,10 @@ mod tests { ]; let value = ByteArray::from(vec![0, 0, 0, 0, 0, 4, 147, 224]); let row = Field::convert_byte_array(&descr, value.clone()); - assert_eq!(row, Field::Decimal(Decimal::from_bytes(value, 17, 5))); + assert_eq!( + row.unwrap(), + Field::Decimal(Decimal::from_bytes(value, 17, 5)) + ); // NONE (FIXED_LEN_BYTE_ARRAY) let descr = make_column_descr![ @@ -1075,7 +1088,7 @@ mod tests { ]; let value = ByteArray::from(vec![1, 2, 3, 4, 5, 6]); let row = Field::convert_byte_array(&descr, value.clone()); - assert_eq!(row, Field::Bytes(value)); + assert_eq!(row.unwrap(), Field::Bytes(value)); } #[test] diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index b7298a45b2e8..780e9822488d 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -65,7 +65,7 @@ impl TreeBuilder { &self, descr: SchemaDescPtr, row_group_reader: &dyn RowGroupReader, - ) -> Reader { + ) -> Result { // Prepare lookup table of column path -> original column index // This allows to prune columns and map schema leaf nodes to the column readers let mut paths: HashMap = HashMap::new(); @@ -89,13 +89,13 @@ impl TreeBuilder { 0, &paths, row_group_reader, - ); + )?; readers.push(reader); } // Return group reader for message type, // it is 
always required with definition level 0 - Reader::GroupReader(None, 0, readers) + Ok(Reader::GroupReader(None, 0, readers)) } /// Creates iterator of `Row`s directly from schema descriptor and row group. @@ -103,9 +103,12 @@ impl TreeBuilder { &self, descr: SchemaDescPtr, row_group_reader: &dyn RowGroupReader, - ) -> ReaderIter { + ) -> Result { let num_records = row_group_reader.metadata().num_rows() as usize; - ReaderIter::new(self.build(descr, row_group_reader), num_records) + Ok(ReaderIter::new( + self.build(descr, row_group_reader)?, + num_records, + )) } /// Builds tree of readers for the current schema recursively. @@ -117,7 +120,7 @@ impl TreeBuilder { mut curr_rep_level: i16, paths: &HashMap, row_group_reader: &dyn RowGroupReader, - ) -> Reader { + ) -> Result { assert!(field.get_basic_info().has_repetition()); // Update current definition and repetition levels for this type let repetition = field.get_basic_info().repetition(); @@ -135,12 +138,14 @@ impl TreeBuilder { path.push(String::from(field.name())); let reader = if field.is_primitive() { let col_path = ColumnPath::new(path.to_vec()); - let orig_index = *paths.get(&col_path).unwrap(); + let orig_index = *paths + .get(&col_path) + .ok_or(general_err!("Path {:?} not found", col_path))?; let col_descr = row_group_reader .metadata() .column(orig_index) .column_descr_ptr(); - let col_reader = row_group_reader.get_column_reader(orig_index).unwrap(); + let col_reader = row_group_reader.get_column_reader(orig_index)?; let column = TripletIter::new(col_descr, col_reader, self.batch_size); Reader::PrimitiveReader(field, Box::new(column)) } else { @@ -169,7 +174,7 @@ impl TreeBuilder { curr_rep_level, paths, row_group_reader, - ); + )?; Reader::RepeatedReader( field, @@ -189,7 +194,7 @@ impl TreeBuilder { curr_rep_level + 1, paths, row_group_reader, - ); + )?; path.pop(); @@ -239,7 +244,7 @@ impl TreeBuilder { curr_rep_level + 1, paths, row_group_reader, - ); + )?; let value_type = &key_value_type.get_fields()[1]; let value_reader = self.reader_tree( @@ -249,7 +254,7 @@ impl TreeBuilder { curr_rep_level + 1, paths, row_group_reader, - ); + )?; path.pop(); @@ -270,8 +275,7 @@ impl TreeBuilder { .with_repetition(Repetition::REQUIRED) .with_converted_type(field.get_basic_info().converted_type()) .with_fields(&mut Vec::from(field.get_fields())) - .build() - .unwrap(); + .build()?; path.pop(); @@ -282,7 +286,7 @@ impl TreeBuilder { curr_rep_level, paths, row_group_reader, - ); + )?; Reader::RepeatedReader( field, @@ -302,7 +306,7 @@ impl TreeBuilder { curr_rep_level, paths, row_group_reader, - ); + )?; readers.push(reader); } Reader::GroupReader(Some(field), curr_def_level, readers) @@ -311,7 +315,7 @@ impl TreeBuilder { }; path.pop(); - Reader::option(repetition, curr_def_level, reader) + Ok(Reader::option(repetition, curr_def_level, reader)) } } @@ -395,14 +399,15 @@ impl Reader { /// Automatically advances all necessary readers. /// This must be called on the root level reader (i.e., for Message type). /// Otherwise, it will panic. - fn read(&mut self) -> Row { + fn read(&mut self) -> Result { match *self { Reader::GroupReader(_, _, ref mut readers) => { let mut fields = Vec::new(); for reader in readers { - fields.push((String::from(reader.field_name()), reader.read_field())); + fields + .push((String::from(reader.field_name()), reader.read_field()?)); } - make_row(fields) + Ok(make_row(fields)) } _ => panic!("Cannot call read() on {self}"), } @@ -410,16 +415,16 @@ impl Reader { /// Reads current record as `Field` from the reader tree. 
/// Automatically advances all necessary readers. - fn read_field(&mut self) -> Field { - match *self { + fn read_field(&mut self) -> Result { + let field = match *self { Reader::PrimitiveReader(_, ref mut column) => { - let value = column.current_value(); - column.read_next().unwrap(); + let value = column.current_value()?; + column.read_next()?; value } Reader::OptionReader(def_level, ref mut reader) => { if reader.current_def_level() > def_level { - reader.read_field() + reader.read_field()? } else { reader.advance_columns(); Field::Null @@ -433,7 +438,7 @@ impl Reader { { fields.push(( String::from(reader.field_name()), - reader.read_field(), + reader.read_field()?, )); } else { reader.advance_columns(); @@ -447,7 +452,7 @@ impl Reader { let mut elements = Vec::new(); loop { if reader.current_def_level() > def_level { - elements.push(reader.read_field()); + elements.push(reader.read_field()?); } else { reader.advance_columns(); // If the current definition level is equal to the definition @@ -476,7 +481,7 @@ impl Reader { let mut pairs = Vec::new(); loop { if keys.current_def_level() > def_level { - pairs.push((keys.read_field(), values.read_field())); + pairs.push((keys.read_field()?, values.read_field()?)); } else { keys.advance_columns(); values.advance_columns(); @@ -497,7 +502,8 @@ impl Reader { Field::MapInternal(make_map(pairs)) } - } + }; + Ok(field) } /// Returns field name for the current reader. @@ -681,7 +687,7 @@ impl<'a> RowIter<'a> { ) -> Result { let descr = Self::get_proj_descr(proj, reader.metadata().schema_descr_ptr())?; let tree_builder = Self::tree_builder(); - let row_iter = tree_builder.as_iter(descr.clone(), reader); + let row_iter = tree_builder.as_iter(descr.clone(), reader)?; // For row group we need to set `current_row_group` >= `num_row_groups`, because // we only have one row group and can't buffer more. 
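Since row reading is now fallible end to end, callers have to unwrap or propagate per-row errors instead of assuming infallible iteration. A minimal sketch of the adjusted calling pattern (the path argument and the function name are placeholders, not part of this patch):

use std::fs::File;
use parquet::errors::Result;
use parquet::file::reader::{FileReader, SerializedFileReader};
use parquet::record::Row;

fn read_all_rows(path: &str) -> Result<Vec<Row>> {
    let reader = SerializedFileReader::new(File::open(path).expect("open file"))?;
    // Each item is now a Result<Row>, so collect() propagates the first read error.
    reader.get_row_iter(None)?.collect()
}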
@@ -751,9 +757,9 @@ impl<'a> RowIter<'a> { } impl<'a> Iterator for RowIter<'a> { - type Item = Row; + type Item = Result; - fn next(&mut self) -> Option { + fn next(&mut self) -> Option> { let mut row = None; if let Some(ref mut iter) = self.row_iter { row = iter.next(); @@ -768,14 +774,18 @@ impl<'a> Iterator for RowIter<'a> { .get_row_group(self.current_row_group) .expect("Row group is required to advance"); - let mut iter = self + match self .tree_builder - .as_iter(self.descr.clone(), row_group_reader); - - row = iter.next(); + .as_iter(self.descr.clone(), row_group_reader) + { + Ok(mut iter) => { + row = iter.next(); - self.current_row_group += 1; - self.row_iter = Some(iter); + self.current_row_group += 1; + self.row_iter = Some(iter); + } + Err(e) => return Some(Err(e)), + } } } @@ -801,9 +811,9 @@ impl ReaderIter { } impl Iterator for ReaderIter { - type Item = Row; + type Item = Result; - fn next(&mut self) -> Option { + fn next(&mut self) -> Option> { if self.records_left > 0 { self.records_left -= 1; Some(self.root_reader.read()) @@ -1495,7 +1505,7 @@ mod tests { .iter() .map(|p| SerializedFileReader::try_from(p.as_path()).unwrap()) .flat_map(|r| RowIter::from_file_into(Box::new(r))) - .flat_map(|r| r.get_int(0)) + .flat_map(|r| r.unwrap().get_int(0)) .collect::>(); assert_eq!(vec, vec![4, 5, 6, 7, 2, 3, 0, 1]); @@ -1513,7 +1523,7 @@ mod tests { RowIter::from_file_into(Box::new(r)).project(proj).unwrap() }) - .map(|r| format!("id:{}", r.fmt(0))) + .map(|r| format!("id:{}", r.unwrap().fmt(0))) .collect::>() .join(", "); @@ -1618,7 +1628,7 @@ mod tests { let file = get_test_file(file_name); let file_reader: Box = Box::new(SerializedFileReader::new(file)?); let iter = file_reader.get_row_iter(schema)?; - Ok(iter.collect()) + Ok(iter.map(|row| row.unwrap()).collect()) } fn test_row_group_rows(file_name: &str, schema: Option) -> Result> { @@ -1628,6 +1638,6 @@ mod tests { // group let row_group_reader = file_reader.get_row_group(0).unwrap(); let iter = row_group_reader.get_row_iter(schema)?; - Ok(iter.collect()) + Ok(iter.map(|row| row.unwrap()).collect()) } } diff --git a/parquet/src/record/triplet.rs b/parquet/src/record/triplet.rs index 67c407b3a05c..1d3488bf2d63 100644 --- a/parquet/src/record/triplet.rs +++ b/parquet/src/record/triplet.rs @@ -136,11 +136,11 @@ impl TripletIter { } /// Updates non-null value for current row. - pub fn current_value(&self) -> Field { + pub fn current_value(&self) -> Result { if self.is_null() { - return Field::Null; + return Ok(Field::Null); } - match *self { + let field = match *self { TripletIter::BoolTripletIter(ref typed) => { Field::convert_bool(typed.column_descr(), *typed.current_value()) } @@ -162,14 +162,15 @@ impl TripletIter { TripletIter::ByteArrayTripletIter(ref typed) => Field::convert_byte_array( typed.column_descr(), typed.current_value().clone(), - ), + )?, TripletIter::FixedLenByteArrayTripletIter(ref typed) => { Field::convert_byte_array( typed.column_descr(), typed.current_value().clone().into(), - ) + )? 
} - } + }; + Ok(field) } } @@ -553,7 +554,7 @@ mod tests { while let Ok(true) = iter.read_next() { assert!(iter.has_next()); if !iter.is_null() { - values.push(iter.current_value()); + values.push(iter.current_value().unwrap()); } def_levels.push(iter.current_def_level()); rep_levels.push(iter.current_rep_level()); From a3b5b193380cc4fbafa823e5b0c998b5f8e5cae3 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Fri, 23 Jun 2023 15:49:02 +0800 Subject: [PATCH 1025/1411] Casting fixedsizelist to list/largelist (#4433) * fixedsizelist to list/largelist Signed-off-by: jayzhan211 * address comment Signed-off-by: jayzhan211 * remove typehint Signed-off-by: jayzhan211 * addres ci Signed-off-by: jayzhan211 * address clippy Signed-off-by: jayzhan211 --------- Signed-off-by: jayzhan211 --- arrow-array/src/cast.rs | 17 +++++ arrow-cast/src/cast.rs | 141 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index e92e19eb3c7f..bee8823d1f59 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -799,6 +799,15 @@ pub trait AsArray: private::Sealed { self.as_list_opt().expect("list array") } + /// Downcast this to a [`FixedSizeListArray`] returning `None` if not possible + fn as_fixed_size_list_opt(&self) -> Option<&FixedSizeListArray>; + + /// Downcast this to a [`FixedSizeListArray`] panicking if not possible + fn as_fixed_size_list(&self) -> &FixedSizeListArray { + self.as_fixed_size_list_opt() + .expect("fixed size list array") + } + /// Downcast this to a [`MapArray`] returning `None` if not possible fn as_map_opt(&self) -> Option<&MapArray>; @@ -839,6 +848,10 @@ impl AsArray for dyn Array + '_ { self.as_any().downcast_ref() } + fn as_fixed_size_list_opt(&self) -> Option<&FixedSizeListArray> { + self.as_any().downcast_ref() + } + fn as_map_opt(&self) -> Option<&MapArray> { self.as_any().downcast_ref() } @@ -872,6 +885,10 @@ impl AsArray for ArrayRef { self.as_ref().as_list_opt() } + fn as_fixed_size_list_opt(&self) -> Option<&FixedSizeListArray> { + self.as_ref().as_fixed_size_list_opt() + } + fn as_map_opt(&self) -> Option<&MapArray> { self.as_any().downcast_ref() } diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index dea3f2acfaf8..95c0a63a3a4e 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -141,6 +141,12 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { can_cast_types(list_from.data_type(), to_type) } (List(_), _) => false, + (FixedSizeList(list_from,_), List(list_to)) => { + list_from.data_type() == list_to.data_type() + } + (FixedSizeList(list_from,_), LargeList(list_to)) => { + list_from.data_type() == list_to.data_type() + } (_, List(list_to)) => can_cast_types(from_type, list_to.data_type()), (_, LargeList(list_to)) => can_cast_types(from_type, list_to.data_type()), // cast one decimal type to another decimal type @@ -824,6 +830,25 @@ pub fn cast_with_options( "Cannot cast list to non-list data types".to_string(), )), }, + (FixedSizeList(list_from, _), List(list_to)) => { + if list_to.data_type() != list_from.data_type() { + Err(ArrowError::CastError( + "cannot cast fixed-size-list to list with different child data".into(), + )) + } else { + cast_fixed_size_list_to_list::(array) + } + } + (FixedSizeList(list_from, _), LargeList(list_to)) => { + if list_to.data_type() != list_from.data_type() { + Err(ArrowError::CastError( + "cannot cast fixed-size-list to largelist with different child data".into(), + )) + } else { + 
cast_fixed_size_list_to_list::(array) + } + } + (_, List(ref to)) => { cast_primitive_to_list::(array, to, to_type, cast_options) } @@ -3822,6 +3847,17 @@ where Ok(Arc::new(GenericByteArray::::from(array_data))) } +fn cast_fixed_size_list_to_list( + array: &dyn Array, +) -> Result +where + OffsetSize: OffsetSizeTrait, +{ + let fixed_size_list: &FixedSizeListArray = array.as_fixed_size_list(); + let list: GenericListArray = fixed_size_list.clone().into(); + Ok(Arc::new(list)) +} + /// Cast the container type of List/Largelist array but not the inner types. /// This function can leave the value data intact and only has to cast the offset dtypes. fn cast_list_container( @@ -7847,6 +7883,71 @@ mod tests { assert!(!c.is_valid(5)); // "2000-01-01" } + #[test] + fn test_can_cast_types_fixed_size_list_to_list() { + // DataType::List + let array1 = Arc::new(make_fixed_size_list_array()) as ArrayRef; + assert!(can_cast_types( + array1.data_type(), + &DataType::List(Arc::new(Field::new("", DataType::Int32, false))) + )); + + // DataType::LargeList + let array2 = Arc::new(make_fixed_size_list_array_for_large_list()) as ArrayRef; + assert!(can_cast_types( + array2.data_type(), + &DataType::LargeList(Arc::new(Field::new("", DataType::Int64, false))) + )); + } + + #[test] + fn test_cast_fixed_size_list_to_list() { + // DataType::List + let array1 = Arc::new(make_fixed_size_list_array()) as ArrayRef; + let list_array1 = cast( + &array1, + &DataType::List(Arc::new(Field::new("", DataType::Int32, false))), + ) + .unwrap(); + let actual = list_array1.as_any().downcast_ref::().unwrap(); + let expected = array1 + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(expected.values(), actual.values()); + assert_eq!(expected.len(), actual.len()); + + // DataType::LargeList + let array2 = Arc::new(make_fixed_size_list_array_for_large_list()) as ArrayRef; + let list_array2 = cast( + &array2, + &DataType::LargeList(Arc::new(Field::new("", DataType::Int64, false))), + ) + .unwrap(); + let actual = list_array2 + .as_any() + .downcast_ref::() + .unwrap(); + let expected = array2 + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(expected.values(), actual.values()); + assert_eq!(expected.len(), actual.len()); + + // Cast previous LargeList to List + let array3 = Arc::new(actual.clone()) as ArrayRef; + let list_array3 = cast( + &array3, + &DataType::List(Arc::new(Field::new("", DataType::Int64, false))), + ) + .unwrap(); + let actual = list_array3.as_any().downcast_ref::().unwrap(); + let expected = array3.as_any().downcast_ref::().unwrap(); + assert_eq!(expected.values(), actual.values()); + } + #[test] fn test_cast_list_containers() { // large-list to list @@ -7929,6 +8030,46 @@ mod tests { LargeListArray::from(list_data) } + fn make_fixed_size_list_array() -> FixedSizeListArray { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + let list_data_type = DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Int32, true)), + 4, + ); + let list_data = ArrayData::builder(list_data_type) + .len(2) + .add_child_data(value_data) + .build() + .unwrap(); + FixedSizeListArray::from(list_data) + } + + fn make_fixed_size_list_array_for_large_list() -> FixedSizeListArray { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int64) + .len(8) + .add_buffer(Buffer::from_slice_ref([0i64, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + let list_data_type 
= DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Int64, true)), + 4, + ); + let list_data = ArrayData::builder(list_data_type) + .len(2) + .add_child_data(value_data) + .build() + .unwrap(); + FixedSizeListArray::from(list_data) + } + #[test] fn test_utf8_cast_offsets() { // test if offset of the array is taken into account during cast From b163b19d213c57170789f32a2011cbadf9ab4120 Mon Sep 17 00:00:00 2001 From: Igor Izvekov Date: Fri, 23 Jun 2023 12:57:36 +0300 Subject: [PATCH 1026/1411] feat: support `NullBuilder` (#4430) * feat: NullBuilder * fix: docs * refactor: use method ArrayData::new_null --- arrow-array/src/array/null_array.rs | 6 + arrow-array/src/builder/mod.rs | 2 + arrow-array/src/builder/null_builder.rs | 184 ++++++++++++++++++++++ arrow-array/src/builder/struct_builder.rs | 2 +- 4 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 arrow-array/src/builder/null_builder.rs diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index 7fdd99a39675..c054c890431b 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -17,6 +17,7 @@ //! Contains the `NullArray` type. +use crate::builder::NullBuilder; use crate::{Array, ArrayRef}; use arrow_buffer::buffer::NullBuffer; use arrow_data::{ArrayData, ArrayDataBuilder}; @@ -62,6 +63,11 @@ impl NullArray { Self { len } } + + /// Returns a new null array builder + pub fn builder(capacity: usize) -> NullBuilder { + NullBuilder::with_capacity(capacity) + } } impl Array for NullArray { diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 91df8c27ce47..1e5e6426be09 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -164,6 +164,8 @@ mod generic_list_builder; pub use generic_list_builder::*; mod map_builder; pub use map_builder::*; +mod null_builder; +pub use null_builder::*; mod primitive_builder; pub use primitive_builder::*; mod primitive_dictionary_builder; diff --git a/arrow-array/src/builder/null_builder.rs b/arrow-array/src/builder/null_builder.rs new file mode 100644 index 000000000000..0b4345006993 --- /dev/null +++ b/arrow-array/src/builder/null_builder.rs @@ -0,0 +1,184 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
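One consequence of the new builder that the doc example below does not exercise: `make_builder` (see the struct_builder.rs hunk at the end of this patch) no longer panics for `DataType::Null`, so dynamically constructed builders can include null fields. A minimal sketch, assuming only the APIs added or touched by this change:

use arrow_array::builder::{make_builder, ArrayBuilder, NullBuilder};
use arrow_array::Array;
use arrow_schema::DataType;

fn main() {
    // `make_builder` previously reached `unimplemented!()` for DataType::Null;
    // with this change it hands back a NullBuilder like any other type.
    let mut builder = make_builder(&DataType::Null, 0);
    let null_builder = builder
        .as_any_mut()
        .downcast_mut::<NullBuilder>()
        .expect("DataType::Null maps to NullBuilder");
    null_builder.append_nulls(3);
    assert_eq!(null_builder.finish().len(), 3);
}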
+ +use crate::builder::ArrayBuilder; +use crate::{ArrayRef, NullArray}; +use arrow_data::ArrayData; +use arrow_schema::DataType; +use std::any::Any; +use std::sync::Arc; + +/// Builder for [`NullArray`] +/// +/// # Example +/// +/// Create a `NullArray` from a `NullBuilder` +/// +/// ``` +/// +/// # use arrow_array::{Array, NullArray, builder::NullBuilder}; +/// +/// let mut b = NullBuilder::new(); +/// b.append_empty_value(); +/// b.append_null(); +/// b.append_nulls(3); +/// b.append_empty_values(3); +/// let arr = b.finish(); +/// +/// assert_eq!(8, arr.len()); +/// assert_eq!(8, arr.null_count()); +/// ``` +#[derive(Debug)] +pub struct NullBuilder { + len: usize, +} + +impl Default for NullBuilder { + fn default() -> Self { + Self::new() + } +} + +impl NullBuilder { + /// Creates a new null builder + pub fn new() -> Self { + Self { len: 0 } + } + + /// Creates a new null builder with space for `capacity` elements without re-allocating + pub fn with_capacity(capacity: usize) -> Self { + Self { len: capacity } + } + + /// Returns the capacity of this builder measured in slots of type `T` + pub fn capacity(&self) -> usize { + self.len + } + + /// Appends a null slot into the builder + #[inline] + pub fn append_null(&mut self) { + self.len += 1; + } + + /// Appends `n` `null`s into the builder. + #[inline] + pub fn append_nulls(&mut self, n: usize) { + self.len += n; + } + + /// Appends a null slot into the builder + #[inline] + pub fn append_empty_value(&mut self) { + self.append_null(); + } + + /// Appends `n` `null`s into the builder. + #[inline] + pub fn append_empty_values(&mut self, n: usize) { + self.append_nulls(n); + } + + /// Builds the [NullArray] and reset this builder. + pub fn finish(&mut self) -> NullArray { + let len = self.len(); + let builder = ArrayData::new_null(&DataType::Null, len).into_builder(); + + let array_data = unsafe { builder.build_unchecked() }; + NullArray::from(array_data) + } + + /// Builds the [NullArray] without resetting the builder. + pub fn finish_cloned(&self) -> NullArray { + let len = self.len(); + let builder = ArrayData::new_null(&DataType::Null, len).into_builder(); + + let array_data = unsafe { builder.build_unchecked() }; + NullArray::from(array_data) + } +} + +impl ArrayBuilder for NullBuilder { + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.len + } + + /// Returns whether the number of array slots is zero + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. 
+ fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Array; + + #[test] + fn test_null_array_builder() { + let mut builder = NullArray::builder(10); + builder.append_null(); + builder.append_nulls(4); + builder.append_empty_value(); + builder.append_empty_values(4); + + let arr = builder.finish(); + assert_eq!(20, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(20, arr.null_count()); + } + + #[test] + fn test_null_array_builder_finish_cloned() { + let mut builder = NullArray::builder(16); + builder.append_null(); + builder.append_empty_value(); + builder.append_empty_values(3); + let mut array = builder.finish_cloned(); + assert_eq!(21, array.null_count()); + + builder.append_empty_values(5); + array = builder.finish(); + assert_eq!(26, array.null_count()); + } +} diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 04dc5ba7319e..88a23db6d10e 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -97,7 +97,7 @@ impl ArrayBuilder for StructBuilder { pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { use crate::builder::*; match datatype { - DataType::Null => unimplemented!(), + DataType::Null => Box::new(NullBuilder::with_capacity(capacity)), DataType::Boolean => Box::new(BooleanBuilder::with_capacity(capacity)), DataType::Int8 => Box::new(Int8Builder::with_capacity(capacity)), DataType::Int16 => Box::new(Int16Builder::with_capacity(capacity)), From 23465ec49962eba398394fdc2a2ee2baa11cc1e4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 26 Jun 2023 07:16:00 -0400 Subject: [PATCH 1027/1411] Parse intervals like `.5` the same as `0.5` (#4425) * Allow intervals like .5 * Support -.5 and -0.5 intervals as well --- arrow-cast/src/parse.rs | 51 +++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index accce99b4649..67477c57d519 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -830,15 +830,21 @@ impl FromStr for IntervalAmount { match s.split_once('.') { Some((integer, frac)) if frac.len() <= INTERVAL_PRECISION as usize - && !integer.is_empty() && !frac.is_empty() && !frac.starts_with('-') => { - let integer = integer.parse::().map_err(|_| { - ArrowError::ParseError(format!( - "Failed to parse {s} as interval amount" - )) - })?; + // integer will be "" for values like ".5" + // and "-" for values like "-.5" + let explicit_neg = integer.starts_with('-'); + let integer = if integer.is_empty() || integer == "-" { + Ok(0) + } else { + integer.parse::().map_err(|_| { + ArrowError::ParseError(format!( + "Failed to parse {s} as interval amount" + )) + }) + }?; let frac_unscaled = frac.parse::().map_err(|_| { ArrowError::ParseError(format!( @@ -851,7 +857,11 @@ impl FromStr for IntervalAmount { frac_unscaled * 10_i64.pow(INTERVAL_PRECISION - frac.len() as u32); // propagate the sign of the integer part to the fractional part - let frac = if integer < 0 { -frac } else { frac }; + let frac = if integer < 0 || explicit_neg { + -frac + } else { + frac + }; let result = Self { integer, frac }; @@ -929,7 +939,8 @@ impl Interval { (self.months, self.days, self.nanos) } - /// Parse string value in traditional Postgres format (e.g. 
1 year 2 months 3 days 4 hours 5 minutes 6 seconds) + /// Parse string value in traditional Postgres format such as + /// `1 year 2 months 3 days 4 hours 5 minutes 6 seconds` fn parse(value: &str, config: &IntervalParseConfig) -> Result { let components = parse_interval_components(value, config)?; @@ -1798,6 +1809,26 @@ mod tests { Interval::parse("-1.5 months -3.2 days", &config).unwrap(), ); + assert_eq!( + Interval::new(0i32, 15i32, 0), + Interval::parse("0.5 months", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 15i32, 0), + Interval::parse(".5 months", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, -15i32, 0), + Interval::parse("-0.5 months", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, -15i32, 0), + Interval::parse("-.5 months", &config).unwrap(), + ); + assert_eq!( Interval::new(2i32, 10i32, 9 * NANOS_PER_HOUR), Interval::parse("2.1 months 7.25 days 3 hours", &config).unwrap(), @@ -1944,10 +1975,6 @@ mod tests { assert_eq!(result, expected); - // invalid: missing integer - let result = IntervalAmount::from_str(".5"); - assert!(result.is_err()); - // invalid: missing fractional let result = IntervalAmount::from_str("3."); assert!(result.is_err()); From 8e65b5803dd6c457e18b24c13dac9a13bcc4d4cf Mon Sep 17 00:00:00 2001 From: jakevin Date: Mon, 26 Jun 2023 19:35:24 +0800 Subject: [PATCH 1028/1411] unify substring for binary&utf8 (#4442) --- arrow-string/Cargo.toml | 1 + arrow-string/src/substring.rs | 171 +++++++++++++--------------------- 2 files changed, 67 insertions(+), 105 deletions(-) diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index 6e16e0163a36..0f88ffbac923 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -41,6 +41,7 @@ arrow-array = { workspace = true } arrow-select = { workspace = true } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.7.1", default-features = false, features = ["unicode"] } +num = { version = "0.4", default-features = false, features = ["std"] } [package.metadata.docs.rs] all-features = true diff --git a/arrow-string/src/substring.rs b/arrow-string/src/substring.rs index a8250c75d287..1075d106911e 100644 --- a/arrow-string/src/substring.rs +++ b/arrow-string/src/substring.rs @@ -25,6 +25,7 @@ use arrow_array::*; use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; +use num::Zero; use std::cmp::Ordering; use std::sync::Arc; @@ -106,7 +107,7 @@ pub fn substring( UInt64: UInt64Type ) } - DataType::LargeBinary => binary_substring( + DataType::LargeBinary => byte_substring( array .as_any() .downcast_ref::() @@ -114,7 +115,7 @@ pub fn substring( start, length.map(|e| e as i64), ), - DataType::Binary => binary_substring( + DataType::Binary => byte_substring( array .as_any() .downcast_ref::() @@ -131,7 +132,7 @@ pub fn substring( start as i32, length.map(|e| e as i32), ), - DataType::LargeUtf8 => utf8_substring( + DataType::LargeUtf8 => byte_substring( array .as_any() .downcast_ref::() @@ -139,7 +140,7 @@ pub fn substring( start, length.map(|e| e as i64), ), - DataType::Utf8 => utf8_substring( + DataType::Utf8 => byte_substring( array .as_any() .downcast_ref::() @@ -246,36 +247,61 @@ fn get_start_end_offset( (start_offset, end_offset) } -fn binary_substring( - array: &GenericBinaryArray, - start: OffsetSize, - length: Option, -) -> Result { +fn byte_substring( + array: &GenericByteArray, + start: T::Offset, + length: Option, +) -> 
Result +where + ::Native: PartialEq, +{ let offsets = array.value_offsets(); let data = array.value_data(); - let zero = OffsetSize::zero(); + let zero = ::zero(); + + // When array is [Large]StringArray, we will check whether `offset` is at a valid char boundary. + let check_char_boundary = { + |offset: T::Offset| { + if !matches!(T::DATA_TYPE, DataType::Utf8 | DataType::LargeUtf8) { + return Ok(offset); + } + // Safety: a StringArray must contain valid UTF8 data + let data_str = unsafe { std::str::from_utf8_unchecked(data) }; + let offset_usize = offset.as_usize(); + if data_str.is_char_boundary(offset_usize) { + Ok(offset) + } else { + Err(ArrowError::ComputeError(format!( + "The offset {offset_usize} is at an invalid utf-8 boundary." + ))) + } + } + }; // start and end offsets of all substrings - let mut new_starts_ends: Vec<(OffsetSize, OffsetSize)> = + let mut new_starts_ends: Vec<(T::Offset, T::Offset)> = Vec::with_capacity(array.len()); - let mut new_offsets: Vec = Vec::with_capacity(array.len() + 1); + let mut new_offsets: Vec = Vec::with_capacity(array.len() + 1); let mut len_so_far = zero; new_offsets.push(zero); - offsets.windows(2).for_each(|pair| { - let new_start = match start.cmp(&zero) { - Ordering::Greater => (pair[0] + start).min(pair[1]), - Ordering::Equal => pair[0], - Ordering::Less => (pair[1] + start).max(pair[0]), - }; - let new_end = match length { - Some(length) => (length + new_start).min(pair[1]), - None => pair[1], - }; - len_so_far += new_end - new_start; - new_starts_ends.push((new_start, new_end)); - new_offsets.push(len_so_far); - }); + offsets + .windows(2) + .try_for_each(|pair| -> Result<(), ArrowError> { + let new_start = match start.cmp(&zero) { + Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?, + Ordering::Equal => pair[0], + Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?, + }; + let new_end = match length { + Some(length) => check_char_boundary((length + new_start).min(pair[1]))?, + None => pair[1], + }; + len_so_far += new_end - new_start; + new_starts_ends.push((new_start, new_end)); + new_offsets.push(len_so_far); + Ok(()) + })?; // concatenate substrings into a buffer let mut new_values = MutableBuffer::new(new_offsets.last().unwrap().as_usize()); @@ -291,7 +317,7 @@ fn binary_substring( let data = unsafe { ArrayData::new_unchecked( - GenericBinaryArray::::DATA_TYPE, + GenericByteArray::::DATA_TYPE, array.len(), None, array.nulls().map(|b| b.inner().sliced()), @@ -349,84 +375,6 @@ fn fixed_size_binary_substring( Ok(make_array(array_data)) } -/// substring by byte -fn utf8_substring( - array: &GenericStringArray, - start: OffsetSize, - length: Option, -) -> Result { - let offsets = array.value_offsets(); - let data = array.value_data(); - let zero = OffsetSize::zero(); - - // Check if `offset` is at a valid char boundary. - // If yes, return `offset`, else return error - let check_char_boundary = { - // Safety: a StringArray must contain valid UTF8 data - let data_str = unsafe { std::str::from_utf8_unchecked(data) }; - |offset: OffsetSize| { - let offset_usize = offset.as_usize(); - if data_str.is_char_boundary(offset_usize) { - Ok(offset) - } else { - Err(ArrowError::ComputeError(format!( - "The offset {offset_usize} is at an invalid utf-8 boundary." 
- ))) - } - } - }; - - // start and end offsets of all substrings - let mut new_starts_ends: Vec<(OffsetSize, OffsetSize)> = - Vec::with_capacity(array.len()); - let mut new_offsets: Vec = Vec::with_capacity(array.len() + 1); - let mut len_so_far = zero; - new_offsets.push(zero); - - offsets - .windows(2) - .try_for_each(|pair| -> Result<(), ArrowError> { - let new_start = match start.cmp(&zero) { - Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?, - Ordering::Equal => pair[0], - Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?, - }; - let new_end = match length { - Some(length) => check_char_boundary((length + new_start).min(pair[1]))?, - None => pair[1], - }; - len_so_far += new_end - new_start; - new_starts_ends.push((new_start, new_end)); - new_offsets.push(len_so_far); - Ok(()) - })?; - - // concatenate substrings into a buffer - let mut new_values = MutableBuffer::new(new_offsets.last().unwrap().as_usize()); - - new_starts_ends - .iter() - .map(|(start, end)| { - let start = start.as_usize(); - let end = end.as_usize(); - &data[start..end] - }) - .for_each(|slice| new_values.extend_from_slice(slice)); - - let data = unsafe { - ArrayData::new_unchecked( - GenericStringArray::::DATA_TYPE, - array.len(), - None, - array.nulls().map(|b| b.inner().sliced()), - 0, - vec![Buffer::from_vec(new_offsets), new_values.into()], - vec![], - ) - }; - Ok(make_array(data)) -} - #[cfg(test)] mod tests { use super::*; @@ -1020,4 +968,17 @@ mod tests { let err = substring(&array, 0, Some(5)).unwrap_err().to_string(); assert!(err.contains("invalid utf-8 boundary")); } + + #[test] + fn non_utf8_bytes() { + // non-utf8 bytes + let bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD, 0xE8, 0xAF, 0xAD]; + let array = BinaryArray::from(vec![Some(bytes)]); + let arr = substring(&array, 0, Some(5)).unwrap(); + let actual = arr.as_any().downcast_ref::().unwrap(); + + let expected_bytes: &[u8] = &[0xE4, 0xBD, 0xA0, 0xE5, 0xA5]; + let expected = BinaryArray::from(vec![Some(expected_bytes)]); + assert_eq!(expected, *actual); + } } From 9c0cae5301e0d254af4d64096ae95c2547a7f4a0 Mon Sep 17 00:00:00 2001 From: xxchan Date: Mon, 26 Jun 2023 14:34:59 +0200 Subject: [PATCH 1029/1411] doc: deploy crate docs to GitHub pages (#4436) * deploy docs * Only deploy if a push to master * update docs --- .github/workflows/docs.yml | 29 ++++++++++++++++++++++++++++- README.md | 2 ++ arrow/README.md | 2 +- parquet/README.md | 2 +- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index bf1bf7aad880..a7b9458a1333 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -41,7 +41,7 @@ jobs: container: image: ${{ matrix.arch }}/rust env: - RUSTDOCFLAGS: "-Dwarnings" + RUSTDOCFLAGS: "-Dwarnings --enable-index-page -Zunstable-options" steps: - uses: actions/checkout@v3 with: @@ -56,3 +56,30 @@ jobs: rust-version: ${{ matrix.rust }} - name: Run cargo doc run: cargo doc --document-private-items --no-deps --workspace --all-features + - name: Fix file permissions + shell: sh + run: | + chmod -c -R +rX "target/doc" | + while read line; do + echo "::warning title=Invalid file permissions automatically fixed::$line" + done + - name: Upload artifacts + uses: actions/upload-pages-artifact@v1 + with: + path: target/doc + + deploy: + # Only deploy if a push to master + if: github.ref_name == 'master' && github.event_name == 'push' + needs: docs + permissions: + pages: write # to deploy to Pages + id-token: write # to 
verify the deployment originates from an appropriate source + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v1 diff --git a/README.md b/README.md index df05d1463b2f..41cf9604af76 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,8 @@ This repo contains the following main components: | arrow-flight | Support for Arrow-Flight IPC protocol | [(README)][flight-readme] | | object-store | Support for object store interactions (aws, azure, gcp, local, in-memory) | [(README)][objectstore-readme] | +See the list of all crates in this repo and their rustdocs [here](https://apache.github.io/arrow-rs). + There are two related crates in a different repository | Crate | Description | Documentation | diff --git a/arrow/README.md b/arrow/README.md index fde71607246e..eef7db4d2c67 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -24,7 +24,7 @@ This crate contains the official Native Rust implementation of [Apache Arrow][arrow] in memory format, governed by the Apache Software Foundation. -The [crate documentation](https://docs.rs/arrow/latest/arrow/) contains examples and full API. +The [crate documentation](https://apache.github.io/arrow-rs/arrow/index.html) contains examples and full API. There are several [examples](https://github.com/apache/arrow-rs/tree/master/arrow/examples) to start from as well. ## Rust Version Compatibility diff --git a/parquet/README.md b/parquet/README.md index d006c47ec148..bb2f96418d30 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -24,7 +24,7 @@ This crate contains the official Native Rust implementation of [Apache Parquet](https://parquet.apache.org/), which is part of the [Apache Arrow](https://arrow.apache.org/) project. -See [crate documentation](https://docs.rs/parquet/latest/parquet/) for examples and the full API. +See [crate documentation](https://apache.github.io/arrow-rs/parquet/index.html) for examples and the full API. ## Rust Version Compatibility From f2cb3d076cb4356153f6d619bd0c0ca5266fb450 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Jun 2023 16:22:26 +0100 Subject: [PATCH 1030/1411] Update indexmap requirement from 1.9 to 2.0 (#4451) Updates the requirements on [indexmap](https://github.com/bluss/indexmap) to permit the latest version. - [Changelog](https://github.com/bluss/indexmap/blob/master/RELEASES.md) - [Commits](https://github.com/bluss/indexmap/compare/1.9.0...2.0.0) --- updated-dependencies: - dependency-name: indexmap dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-json/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index d9b3a0df9c87..137d53557790 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -40,7 +40,7 @@ arrow-cast = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } half = { version = "2.1", default-features = false } -indexmap = { version = "1.9", default-features = false, features = ["std"] } +indexmap = { version = "2.0", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false } serde_json = { version = "1.0", default-features = false, features = ["std"] } From 2d8996bf785a7994981af6832569d243c8b592ec Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Jun 2023 16:22:35 +0100 Subject: [PATCH 1031/1411] Update proc-macro2 requirement from =1.0.60 to =1.0.63 (#4450) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.60...1.0.63) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index d3a9c4e42de0..743df85dc800 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.60", default-features = false } +proc-macro2 = { version = "=1.0.63", default-features = false } prost-build = { version = "=0.11.9", default-features = false } tonic-build = { version = "=0.9.2", default-features = false, features = ["transport", "prost"] } From 1fba5f5a5fec46c0b930e446218c43339ec1b66f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Jun 2023 16:22:46 +0100 Subject: [PATCH 1032/1411] Bump actions/deploy-pages from 1 to 2 (#4449) Bumps [actions/deploy-pages](https://github.com/actions/deploy-pages) from 1 to 2. - [Release notes](https://github.com/actions/deploy-pages/releases) - [Commits](https://github.com/actions/deploy-pages/compare/v1...v2) --- updated-dependencies: - dependency-name: actions/deploy-pages dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index a7b9458a1333..b46e6be3cc10 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -82,4 +82,4 @@ jobs: steps: - name: Deploy to GitHub Pages id: deployment - uses: actions/deploy-pages@v1 + uses: actions/deploy-pages@v2 From 7b6896feecabfad97899fe8562f3f098608fbdde Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 27 Jun 2023 00:04:58 -0700 Subject: [PATCH 1033/1411] Revise error message in From for ScalarBuffer (#4446) * Revise error message in From for ScalaBuffer * Update arrow-buffer/src/buffer/scalar.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Fix tests --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-array/src/array/list_array.rs | 8 ++++++-- arrow-buffer/src/buffer/immutable.rs | 4 ++++ arrow-buffer/src/buffer/scalar.rs | 20 ++++++++++++++------ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 2205d846ea34..0c1fea6f4161 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -968,7 +968,9 @@ mod tests { } #[test] - #[should_panic(expected = "memory is not aligned")] + #[should_panic( + expected = "Memory pointer is not aligned with the specified scalar type" + )] fn test_primitive_array_alignment() { let buf = Buffer::from_slice_ref([0_u64]); let buf2 = buf.slice(1); @@ -980,7 +982,9 @@ mod tests { } #[test] - #[should_panic(expected = "memory is not aligned")] + #[should_panic( + expected = "Memory pointer is not aligned with the specified scalar type" + )] // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index a4ab64b84e0c..2ecd3b41913a 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -165,6 +165,10 @@ impl Buffer { unsafe { std::slice::from_raw_parts(self.ptr, self.length) } } + pub(crate) fn deallocation(&self) -> &Deallocation { + self.data.deallocation() + } + /// Returns a new [Buffer] that is a slice of this buffer starting at `offset`. /// Doing so allows the same memory region to be shared between buffers. /// # Panics diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 920463b365a5..70c86f11866d 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
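The scalar.rs hunk below rewords the alignment assertion and distinguishes native allocations from FFI-imported ones. For reference, a minimal sketch of the condition the revised message covers; slicing by a single byte mirrors the updated list_array.rs tests above:

use arrow_buffer::{Buffer, ScalarBuffer};

fn main() {
    // Slicing by one byte breaks the 4-byte alignment required for i32.
    let unaligned = Buffer::from_slice_ref([0_u64]).slice(1);
    // Panics: "Memory pointer is not aligned with the specified scalar type".
    let _buffer: ScalarBuffer<i32> = unaligned.into();
}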
+use crate::alloc::Deallocation; use crate::buffer::Buffer; use crate::native::ArrowNativeType; use crate::MutableBuffer; @@ -118,11 +119,16 @@ impl From for ScalarBuffer { impl From for ScalarBuffer { fn from(buffer: Buffer) -> Self { let align = std::mem::align_of::(); - assert_eq!( - buffer.as_ptr().align_offset(align), - 0, - "memory is not aligned" - ); + let is_aligned = buffer.as_ptr().align_offset(align) == 0; + + match buffer.deallocation() { + Deallocation::Standard(_) => assert!( + is_aligned, + "Memory pointer is not aligned with the specified scalar type" + ), + Deallocation::Custom(_) => + assert!(is_aligned, "Memory pointer from external source (e.g, FFI) is not aligned with the specified scalar type. Before importing buffer through FFI, please make sure the allocation is aligned."), + } Self { buffer, @@ -207,7 +213,9 @@ mod tests { } #[test] - #[should_panic(expected = "memory is not aligned")] + #[should_panic( + expected = "Memory pointer is not aligned with the specified scalar type" + )] fn test_unaligned() { let expected = [0_i32, 1, 2]; let buffer = Buffer::from_iter(expected.iter().cloned()); From 45cc770a96c6501b8f0c233f5cf301507c0f28c3 Mon Sep 17 00:00:00 2001 From: Virgiel <35613972+Virgiel@users.noreply.github.com> Date: Tue, 27 Jun 2023 09:49:47 +0200 Subject: [PATCH 1034/1411] Simplify ffi import/export (#4447) Co-authored-by: Virgiel <> --- arrow/src/array/ffi.rs | 30 +--- arrow/src/ffi.rs | 295 +++++++++++++++------------------------- arrow/src/ffi_stream.rs | 16 +-- arrow/src/pyarrow.rs | 5 +- 4 files changed, 117 insertions(+), 229 deletions(-) diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index 56b9b6ecf8fd..639ff980ebc5 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -19,29 +19,9 @@ use std::convert::TryFrom; -use crate::{ - error::{ArrowError, Result}, - ffi, - ffi::ArrowArrayRef, -}; +use crate::{error::Result, ffi}; -use super::{ArrayData, ArrayRef}; - -impl TryFrom for ArrayData { - type Error = ArrowError; - - fn try_from(value: ffi::ArrowArray) -> Result { - value.to_data() - } -} - -impl TryFrom for ffi::ArrowArray { - type Error = ArrowError; - - fn try_from(value: ArrayData) -> Result { - ffi::ArrowArray::try_new(value) - } -} +use super::ArrayRef; /// Exports an array to raw pointers of the C Data Interface provided by the consumer. /// # Safety @@ -79,7 +59,7 @@ mod tests { StructArray, UInt32Array, UInt64Array, }, datatypes::{DataType, Field}, - ffi::{ArrowArray, FFI_ArrowArray, FFI_ArrowSchema}, + ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema}, }; use std::convert::TryFrom; use std::sync::Arc; @@ -90,9 +70,7 @@ mod tests { let schema = FFI_ArrowSchema::try_from(expected.data_type())?; // simulate an external consumer by being the consumer - let d1 = ArrowArray::new(array, schema); - - let result = &ArrayData::try_from(d1)?; + let result = &from_ffi(array, &schema)?; assert_eq!(result, expected); Ok(()) diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index d8b5be69a517..12aa1309c552 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -22,7 +22,7 @@ //! This is handled by [FFI_ArrowSchema] and [FFI_ArrowArray]. //! //! The second interface maps native Rust types to the Rust-specific implementation of Arrow such as `format` to `Datatype`, -//! `Buffer`, etc. This is handled by `ArrowArray`. +//! `Buffer`, etc. This is handled by `from_ffi` and `to_ffi`. //! //! //! Export to FFI @@ -32,19 +32,18 @@ //! # use arrow::array::{Int32Array, Array, ArrayData, make_array}; //! # use arrow::error::Result; //! 
# use arrow::compute::kernels::arithmetic; -//! # use arrow::ffi::{ArrowArray, FFI_ArrowArray, FFI_ArrowSchema}; +//! # use arrow::ffi::{to_ffi, from_ffi}; //! # fn main() -> Result<()> { //! // create an array natively //! let array = Int32Array::from(vec![Some(1), None, Some(3)]); //! let data = array.into_data(); //! //! // Export it -//! let out_array = FFI_ArrowArray::new(&data); -//! let out_schema = FFI_ArrowSchema::try_from(data.data_type())?; +//! let (out_array, out_schema) = to_ffi(&data)?; //! //! // import it -//! let array = ArrowArray::new(out_array, out_schema); -//! let array = Int32Array::from(ArrayData::try_from(array)?); +//! let data = from_ffi(out_array, &out_schema)?; +//! let array = Int32Array::from(data); //! //! // perform some operation //! let array = arithmetic::add(&array, &array)?; @@ -60,7 +59,7 @@ //! //! ``` //! # use std::ptr::addr_of_mut; -//! # use arrow::ffi::{ArrowArray, FFI_ArrowArray, FFI_ArrowSchema}; +//! # use arrow::ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema}; //! # use arrow_array::{ArrayRef, make_array}; //! # use arrow_schema::ArrowError; //! # @@ -80,7 +79,7 @@ //! let mut schema = FFI_ArrowSchema::empty(); //! let mut array = FFI_ArrowArray::empty(); //! foreign.export_to_c(addr_of_mut!(array), addr_of_mut!(schema)); -//! Ok(make_array(ArrowArray::new(array, schema).try_into()?)) +//! Ok(make_array(from_ffi(array, &schema)?)) //! } //! ``` @@ -222,15 +221,44 @@ unsafe fn create_buffer( .map(|ptr| Buffer::from_custom_allocation(ptr, len, owner)) } -pub trait ArrowArrayRef { - fn to_data(&self) -> Result { - let data_type = self.data_type()?; - let len = self.array().len(); - let offset = self.array().offset(); - let null_count = self.array().null_count(); +/// Export to the C Data Interface +pub fn to_ffi(data: &ArrayData) -> Result<(FFI_ArrowArray, FFI_ArrowSchema)> { + let array = FFI_ArrowArray::new(data); + let schema = FFI_ArrowSchema::try_from(data.data_type())?; + Ok((array, schema)) +} + +/// Import [ArrayData] from the C Data Interface +/// +/// # Safety +/// +/// This struct assumes that the incoming data agrees with the C data interface. +pub fn from_ffi(array: FFI_ArrowArray, schema: &FFI_ArrowSchema) -> Result { + let array = Arc::new(array); + let tmp = ArrowArray { + array: &array, + schema, + owner: &array, + }; + tmp.consume() +} + +#[derive(Debug)] +struct ArrowArray<'a> { + array: &'a FFI_ArrowArray, + schema: &'a FFI_ArrowSchema, + owner: &'a Arc, +} - let data_layout = layout(&data_type); - let buffers = self.buffers(data_layout.can_contain_null_mask)?; +impl<'a> ArrowArray<'a> { + fn consume(self) -> Result { + let dt = DataType::try_from(self.schema)?; + let len = self.array.len(); + let offset = self.array.offset(); + let null_count = self.array.null_count(); + + let data_layout = layout(&dt); + let buffers = self.buffers(data_layout.can_contain_null_mask, &dt)?; let null_bit_buffer = if data_layout.can_contain_null_mask { self.null_bit_buffer() @@ -238,25 +266,24 @@ pub trait ArrowArrayRef { None }; - let mut child_data: Vec = (0..self.array().num_children()) + let mut child_data = (0..self.array.num_children()) .map(|i| { let child = self.child(i); - child.to_data() + child.consume() }) - .map(|d| d.unwrap()) - .collect(); + .collect::>>()?; if let Some(d) = self.dictionary() { // For dictionary type there should only be a single child, so we don't need to worry if // there are other children added above. 
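// (Per the C Data Interface, a dictionary-encoded array keeps its keys in its own
// buffers and exposes its values through the separate `dictionary` field, so the
// resulting ArrayData ends up with exactly one child: the dictionary values.)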
assert!(child_data.is_empty()); - child_data.push(d.to_data()?); + child_data.push(d.consume()?); } // Should FFI be checking validity? Ok(unsafe { ArrayData::new_unchecked( - data_type, + dt, len, Some(null_count), null_bit_buffer, @@ -269,16 +296,15 @@ pub trait ArrowArrayRef { /// returns all buffers, as organized by Rust (i.e. null buffer is skipped if it's present /// in the spec of the type) - fn buffers(&self, can_contain_null_mask: bool) -> Result> { + fn buffers(&self, can_contain_null_mask: bool, dt: &DataType) -> Result> { // + 1: skip null buffer let buffer_begin = can_contain_null_mask as usize; - (buffer_begin..self.array().num_buffers()) + (buffer_begin..self.array.num_buffers()) .map(|index| { - let len = self.buffer_len(index)?; + let len = self.buffer_len(index, dt)?; - match unsafe { - create_buffer(self.owner().clone(), self.array(), index, len) - } { + match unsafe { create_buffer(self.owner.clone(), self.array, index, len) } + { Some(buf) => Ok(buf), None if len == 0 => { // Null data buffer, which Rust doesn't allow. So create @@ -297,17 +323,16 @@ pub trait ArrowArrayRef { /// Rust implementation uses fixed-sized buffers, which require knowledge of their `len`. /// for variable-sized buffers, such as the second buffer of a stringArray, we need /// to fetch offset buffer's len to build the second buffer. - fn buffer_len(&self, i: usize) -> Result { + fn buffer_len(&self, i: usize, dt: &DataType) -> Result { // Special handling for dictionary type as we only care about the key type in the case. - let t = self.data_type()?; - let data_type = match &t { + let data_type = match dt { DataType::Dictionary(key_data_type, _) => key_data_type.as_ref(), dt => dt, }; // `ffi::ArrowArray` records array offset, we need to add it back to the // buffer length to get the actual buffer length. - let length = self.array().len() + self.array().offset(); + let length = self.array.len() + self.array.offset(); // Inner type is not important for buffer length. Ok(match (&data_type, i) { @@ -325,21 +350,21 @@ pub trait ArrowArrayRef { } (DataType::Utf8, 2) | (DataType::Binary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) - let len = self.buffer_len(1)?; + let len = self.buffer_len(1, dt)?; // first buffer is the null buffer => add(1) // we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets. #[allow(clippy::cast_ptr_alignment)] - let offset_buffer = self.array().buffer(1) as *const i32; + let offset_buffer = self.array.buffer(1) as *const i32; // get last offset (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize } (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) - let len = self.buffer_len(1)?; + let len = self.buffer_len(1, dt)?; // first buffer is the null buffer => add(1) // we assume that pointer is aligned for `i64`, as Large uses `i64` offsets. #[allow(clippy::cast_ptr_alignment)] - let offset_buffer = self.array().buffer(1) as *const i64; + let offset_buffer = self.array.buffer(1) as *const i64; // get last offset (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize } @@ -358,30 +383,26 @@ pub trait ArrowArrayRef { // similar to `self.buffer_len(0)`, but without `Result`. // `ffi::ArrowArray` records array offset, we need to add it back to the // buffer length to get the actual buffer length. 
- let length = self.array().len() + self.array().offset(); + let length = self.array.len() + self.array.offset(); let buffer_len = bit_util::ceil(length, 8); - unsafe { create_buffer(self.owner().clone(), self.array(), 0, buffer_len) } + unsafe { create_buffer(self.owner.clone(), self.array, 0, buffer_len) } } - fn child(&self, index: usize) -> ArrowArrayChild { - ArrowArrayChild { - array: self.array().child(index), - schema: self.schema().child(index), - owner: self.owner(), + fn child(&self, index: usize) -> ArrowArray { + ArrowArray { + array: self.array.child(index), + schema: self.schema.child(index), + owner: self.owner, } } - fn owner(&self) -> &Arc; - fn array(&self) -> &FFI_ArrowArray; - fn schema(&self) -> &FFI_ArrowSchema; - fn data_type(&self) -> Result; - fn dictionary(&self) -> Option { - match (self.array().dictionary(), self.schema().dictionary()) { - (Some(array), Some(schema)) => Some(ArrowArrayChild { + fn dictionary(&self) -> Option { + match (self.array.dictionary(), self.schema.dictionary()) { + (Some(array), Some(schema)) => Some(ArrowArray { array, schema, - owner: self.owner(), + owner: self.owner, }), (None, None) => None, _ => panic!("Dictionary should both be set or not set in FFI_ArrowArray and FFI_ArrowSchema") @@ -389,101 +410,6 @@ pub trait ArrowArrayRef { } } -#[allow(rustdoc::private_intra_doc_links)] -/// Struct used to move an Array from and to the C Data Interface. -/// Its main responsibility is to expose functionality that requires -/// both [FFI_ArrowArray] and [FFI_ArrowSchema]. -/// -/// ## Import from the C Data Interface -/// * [ArrowArray::new] to create an array from [`FFI_ArrowArray`] and [`FFI_ArrowSchema`] -/// -/// ## Export to the C Data Interface -/// * Use [`FFI_ArrowArray`] and [`FFI_ArrowSchema`] directly -/// -/// # Safety -/// -/// This struct assumes that the incoming data agrees with the C data interface. -#[derive(Debug)] -pub struct ArrowArray { - pub(crate) array: Arc, - pub(crate) schema: Arc, -} - -#[derive(Debug)] -pub struct ArrowArrayChild<'a> { - array: &'a FFI_ArrowArray, - schema: &'a FFI_ArrowSchema, - owner: &'a Arc, -} - -impl ArrowArrayRef for ArrowArray { - /// the data_type as declared in the schema - fn data_type(&self) -> Result { - DataType::try_from(self.schema.as_ref()) - } - - fn array(&self) -> &FFI_ArrowArray { - self.array.as_ref() - } - - fn schema(&self) -> &FFI_ArrowSchema { - self.schema.as_ref() - } - - fn owner(&self) -> &Arc { - &self.array - } -} - -impl<'a> ArrowArrayRef for ArrowArrayChild<'a> { - /// the data_type as declared in the schema - fn data_type(&self) -> Result { - DataType::try_from(self.schema) - } - - fn array(&self) -> &FFI_ArrowArray { - self.array - } - - fn schema(&self) -> &FFI_ArrowSchema { - self.schema - } - - fn owner(&self) -> &Arc { - self.owner - } -} - -impl ArrowArray { - /// Creates a new [`ArrowArray`] from the provided array and schema - pub fn new(array: FFI_ArrowArray, schema: FFI_ArrowSchema) -> Self { - Self { - array: Arc::new(array), - schema: Arc::new(schema), - } - } - - /// creates a new `ArrowArray`. This is used to export to the C Data Interface. - /// - /// # Memory Leaks - /// This method releases `buffers`. Consumers of this struct *must* call `release` before - /// releasing this struct, or contents in `buffers` leak. 
- pub fn try_new(data: ArrayData) -> Result { - let array = Arc::new(FFI_ArrowArray::new(&data)); - let schema = Arc::new(FFI_ArrowSchema::try_from(data.data_type())?); - Ok(ArrowArray { array, schema }) - } - - /// creates a new empty [ArrowArray]. Used to import from the C Data Interface. - /// # Safety - /// See safety of [ArrowArray] - pub unsafe fn empty() -> Self { - let schema = Arc::new(FFI_ArrowSchema::empty()); - let array = Arc::new(FFI_ArrowArray::empty()); - ArrowArray { array, schema } - } -} - #[cfg(test)] mod tests { use super::*; @@ -510,10 +436,10 @@ mod tests { let array = Int32Array::from(vec![1, 2, 3]); // export it - let array = ArrowArray::try_from(array.into_data()).unwrap(); + let (array, schema) = to_ffi(&array.into_data()).unwrap(); // (simulate consumer) import it - let array = Int32Array::from(ArrayData::try_from(array).unwrap()); + let array = Int32Array::from(from_ffi(array, &schema).unwrap()); let array = kernels::arithmetic::add(&array, &array).unwrap(); // verify @@ -539,11 +465,11 @@ mod tests { // We can read them back to memory // SAFETY: // Pointers are aligned and valid - let array = unsafe { - ArrowArray::new(std::ptr::read(array_ptr), std::ptr::read(schema_ptr)) + let data = unsafe { + from_ffi(std::ptr::read(array_ptr), &std::ptr::read(schema_ptr)).unwrap() }; - let array = Int32Array::from(ArrayData::try_from(array).unwrap()); + let array = Int32Array::from(data); assert_eq!(array, Int32Array::from(vec![1, 2, 3])); } @@ -555,10 +481,10 @@ mod tests { let array = array.slice(1, 2); // export it - let array = ArrowArray::try_from(array.into_data())?; + let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // perform some operation @@ -585,10 +511,10 @@ mod tests { .unwrap(); // export it - let array = ArrowArray::try_from(Array::to_data(&original_array))?; + let (array, schema) = to_ffi(&original_array.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // perform some operation @@ -608,10 +534,10 @@ mod tests { GenericStringArray::::from(vec![Some("a"), None, Some("aaa")]); // export it - let array = ArrowArray::try_from(array.into_data())?; + let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // perform some operation @@ -677,10 +603,10 @@ mod tests { let array = GenericListArray::::from(list_data.clone()); // export it - let array = ArrowArray::try_from(array.into_data())?; + let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // downcast @@ -717,10 +643,10 @@ mod tests { let array = GenericBinaryArray::::from(array); // export it - let array = ArrowArray::try_from(array.into_data())?; + let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // perform some operation @@ -762,10 +688,10 @@ mod tests { let array = BooleanArray::from(vec![None, Some(true), Some(false)]); // export it - let array = ArrowArray::try_from(array.into_data())?; + let (array, schema) = to_ffi(&array.to_data())?; // 
(simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // perform some operation @@ -788,10 +714,10 @@ mod tests { let array = Time32MillisecondArray::from(vec![None, Some(1), Some(2)]); // export it - let array = ArrowArray::try_from(array.into_data())?; + let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // perform some operation @@ -824,10 +750,10 @@ mod tests { let array = TimestampMillisecondArray::from(vec![None, Some(1), Some(2)]); // export it - let array = ArrowArray::try_from(array.into_data())?; + let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // perform some operation @@ -868,10 +794,10 @@ mod tests { FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?; // export it - let array = ArrowArray::try_from(array.into_data())?; + let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // perform some operation @@ -929,10 +855,10 @@ mod tests { .build()?; // export it - let array = ArrowArray::try_from(list_data)?; + let (array, schema) = to_ffi(&list_data)?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // perform some operation @@ -974,10 +900,10 @@ mod tests { let dict_array: DictionaryArray = values.into_iter().collect(); // export it - let array = ArrowArray::try_from(dict_array.into_data())?; + let (array, schema) = to_ffi(&dict_array.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // perform some operation @@ -1015,8 +941,7 @@ mod tests { } // (simulate consumer) import it - let array = ArrowArray::new(out_array, out_schema); - let data = ArrayData::try_from(array)?; + let data = from_ffi(out_array, &out_schema)?; let array = make_array(data); // perform some operation @@ -1034,10 +959,10 @@ mod tests { let array = DurationSecondArray::from(vec![None, Some(1), Some(2)]); // export it - let array = ArrowArray::try_from(array.into_data())?; + let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // perform some operation @@ -1081,10 +1006,10 @@ mod tests { .unwrap(); // export it - let array = ArrowArray::try_from(map_array.to_data())?; + let (array, schema) = to_ffi(&map_array.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // perform some operation @@ -1104,10 +1029,10 @@ mod tests { )]); // export it - let array = ArrowArray::try_from(struct_array.to_data())?; + let (array, schema) = to_ffi(&struct_array.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); // perform some operation @@ -1128,10 +1053,10 @@ mod tests { let union = builder.build().unwrap(); // export it - let array = 
ArrowArray::try_from(union.to_data())?; + let (array, schema) = to_ffi(&union.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = make_array(data); let array = array.as_any().downcast_ref::().unwrap(); @@ -1189,10 +1114,10 @@ mod tests { let union = builder.build().unwrap(); // export it - let array = ArrowArray::try_from(union.to_data())?; + let (array, schema) = to_ffi(&union.to_data())?; // (simulate consumer) import it - let data = ArrayData::try_from(array)?; + let data = from_ffi(array, &schema)?; let array = UnionArray::from(data); let expected_type_ids = vec![0_i8, 0, 1, 0]; diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index cfda4c88b4b9..5fb1c107350a 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -346,12 +346,7 @@ impl Iterator for ArrowArrayStreamReader { let schema_ref = self.schema(); let schema = FFI_ArrowSchema::try_from(schema_ref.as_ref()).ok()?; - let data = ArrowArray { - array: Arc::new(array), - schema: Arc::new(schema), - } - .to_data() - .ok()?; + let data = from_ffi(array, &schema).ok()?; let record_batch = RecordBatch::from(StructArray::from(data)); @@ -442,8 +437,6 @@ mod tests { let exported_schema = Schema::try_from(&ffi_schema).unwrap(); assert_eq!(&exported_schema, schema.as_ref()); - let ffi_schema = Arc::new(ffi_schema); - // Get array from `FFI_ArrowArrayStream` let mut produced_batches = vec![]; loop { @@ -456,12 +449,7 @@ mod tests { break; } - let array = ArrowArray { - array: Arc::new(ffi_array), - schema: ffi_schema.clone(), - } - .to_data() - .unwrap(); + let array = from_ffi(ffi_array, &ffi_schema).unwrap(); let record_batch = RecordBatch::from(StructArray::from(array)); produced_batches.push(record_batch); diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 98e27ab30e09..54a247d53e6d 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -175,10 +175,7 @@ impl FromPyArrow for ArrayData { ), )?; - let ffi_array = ffi::ArrowArray::new(array, schema); - let data = ArrayData::try_from(ffi_array).map_err(to_py_err)?; - - Ok(data) + ffi::from_ffi(array, &schema).map_err(to_py_err) } } From c1656ffea5bba726d7af892e013b6c5b184dd3b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Tue, 27 Jun 2023 19:03:43 +0200 Subject: [PATCH 1035/1411] Convince the compiler to auto-vectorize the range check in parquet DictionaryBuffer (#4453) --- parquet/src/arrow/buffer/dictionary_buffer.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs index 6344d9dd3145..a0a47e3b98f7 100644 --- a/parquet/src/arrow/buffer/dictionary_buffer.rs +++ b/parquet/src/arrow/buffer/dictionary_buffer.rs @@ -152,8 +152,15 @@ impl let min = K::from_usize(0).unwrap(); let max = K::from_usize(values.len()).unwrap(); - // It may be possible to use SIMD here - if keys.as_slice().iter().any(|x| *x < min || *x >= max) { + // using copied and fold gets auto-vectorized since rust 1.70 + // all/any would allow early exit on invalid values + // but in the happy case all values have to be checked anyway + if !keys + .as_slice() + .iter() + .copied() + .fold(true, |a, x| a && x >= min && x < max) + { return Err(general_err!( "dictionary key beyond bounds of dictionary: 0..{}", values.len() From c8fb540a7c10c5fdc4e159a1549f14332287adcc Mon Sep 17 00:00:00 2001 From: xxchan Date: Tue, 27 Jun 2023 23:38:35 +0200 Subject: 
[PATCH 1036/1411] fix gh-pages deployment (#4452) Mainly refered to opendal's setting https://github.com/apache/incubator-opendal/blob/7ede1bdc0849905a5b140cf984481adca05dc1b3/.github/workflows/docs.yml --- .asf.yaml | 8 +++++++- .github/workflows/docs.yml | 28 ++++++++++++++++++++-------- README.md | 2 +- arrow/README.md | 2 +- parquet/README.md | 2 +- 5 files changed, 30 insertions(+), 12 deletions(-) diff --git a/.asf.yaml b/.asf.yaml index 968c6779215a..9541db89daf8 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -38,4 +38,10 @@ github: # require branches to be up-to-date before merging strict: true # don't require any jobs to pass - contexts: [] \ No newline at end of file + contexts: [] + +# publishes the content of the `asf-site` branch to +# https://arrow.apache.org/rust/ +publish: + whoami: asf-site + subdir: rust diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b46e6be3cc10..f94071fa9a4d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -66,6 +66,7 @@ jobs: - name: Upload artifacts uses: actions/upload-pages-artifact@v1 with: + name: crate-docs path: target/doc deploy: @@ -73,13 +74,24 @@ jobs: if: github.ref_name == 'master' && github.event_name == 'push' needs: docs permissions: - pages: write # to deploy to Pages - id-token: write # to verify the deployment originates from an appropriate source - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} + contents: write runs-on: ubuntu-latest steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v2 + - uses: actions/checkout@v3 + - name: Download crate docs + uses: actions/download-artifact@v3 + with: + name: crate-docs + path: website/build + - name: Prepare website + run: | + tar -xf website/build/artifact.tar -C website/build + rm website/build/artifact.tar + cp .asf.yaml ./website/build/.asf.yaml + - name: Deploy to gh-pages + uses: peaceiris/actions-gh-pages@v3.9.2 + if: github.event_name == 'push' && github.ref_name == 'master' + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: website/build + publish_branch: asf-site diff --git a/README.md b/README.md index 41cf9604af76..53220620a304 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ This repo contains the following main components: | arrow-flight | Support for Arrow-Flight IPC protocol | [(README)][flight-readme] | | object-store | Support for object store interactions (aws, azure, gcp, local, in-memory) | [(README)][objectstore-readme] | -See the list of all crates in this repo and their rustdocs [here](https://apache.github.io/arrow-rs). +See the list of all crates in this repo and their rustdocs [here](https://arrow.apache.org/rust). There are two related crates in a different repository diff --git a/arrow/README.md b/arrow/README.md index eef7db4d2c67..4d5206cba0a6 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -24,7 +24,7 @@ This crate contains the official Native Rust implementation of [Apache Arrow][arrow] in memory format, governed by the Apache Software Foundation. -The [crate documentation](https://apache.github.io/arrow-rs/arrow/index.html) contains examples and full API. +The [crate documentation](https://arrow.apache.org/rust/arrow/index.html) contains examples and full API. There are several [examples](https://github.com/apache/arrow-rs/tree/master/arrow/examples) to start from as well. 
## Rust Version Compatibility diff --git a/parquet/README.md b/parquet/README.md index bb2f96418d30..19f34fd877fa 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -24,7 +24,7 @@ This crate contains the official Native Rust implementation of [Apache Parquet](https://parquet.apache.org/), which is part of the [Apache Arrow](https://arrow.apache.org/) project. -See [crate documentation](https://apache.github.io/arrow-rs/parquet/index.html) for examples and the full API. +See [crate documentation](https://arrow.apache.org/rust/parquet/index.html) for examples and the full API. ## Rust Version Compatibility From 762873f9f42643c5e2d367fe8b7f9f8de948ff1d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 28 Jun 2023 19:20:49 +0100 Subject: [PATCH 1037/1411] Fix error message copypasta (#4458) --- parquet/src/arrow/array_reader/struct_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index a147c4e9557e..4af194774bfb 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -70,7 +70,7 @@ impl ArrayReader for StructArrayReader { Some(expected) => { if expected != child_read { return Err(general_err!( - "StructArrayReader out of sync in read_records, expected {} skipped, got {}", + "StructArrayReader out of sync in read_records, expected {} read, got {}", expected, child_read )); From 554aebe3b523737b3aaf6109846f4735110b26f8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 28 Jun 2023 19:53:51 +0100 Subject: [PATCH 1038/1411] Fix empty offset index for all null columns (#4459) (#4460) --- parquet/src/arrow/arrow_writer/mod.rs | 27 +++++++++++++++++++++++++++ parquet/src/column/writer/mod.rs | 7 +++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 0aca77f5b572..ccec4ffb20c0 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -790,6 +790,7 @@ mod tests { use arrow::util::pretty::pretty_format_batches; use arrow::{array::*, buffer::Buffer}; use arrow_array::RecordBatch; + use arrow_buffer::NullBuffer; use arrow_schema::Fields; use crate::basic::Encoding; @@ -2609,4 +2610,30 @@ mod tests { writer.close().unwrap(); } + + #[test] + fn test_writer_all_null() { + let a = Int32Array::from(vec![1, 2, 3, 4, 5]); + let b = Int32Array::new(vec![0; 5].into(), Some(NullBuffer::new_null(5))); + let batch = RecordBatch::try_from_iter(vec![ + ("a", Arc::new(a) as ArrayRef), + ("b", Arc::new(b) as ArrayRef), + ]) + .unwrap(); + + let mut buf = Vec::with_capacity(1024); + let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), None).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + let bytes = Bytes::from(buf); + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = SerializedFileReader::new_with_options(bytes, options).unwrap(); + let index = reader.metadata().offset_index().unwrap(); + + assert_eq!(index.len(), 1); + assert_eq!(index[0].len(), 2); // 2 columns + assert_eq!(index[0][0].len(), 1); // 1 page + assert_eq!(index[0][1].len(), 1); // 1 page + } } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 4aefef98fd4e..1cacfe793328 100644 --- a/parquet/src/column/writer/mod.rs 
+++ b/parquet/src/column/writer/mod.rs @@ -690,11 +690,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } } } - - // update the offset index - self.offset_index_builder - .append_row_count(self.page_metrics.num_buffered_rows as i64); } + // update the offset index + self.offset_index_builder + .append_row_count(self.page_metrics.num_buffered_rows as i64); } fn truncate_min_value(&self, data: &[u8]) -> Vec { From 92a78942b6c75d316ab6ed2b6877723bf58ea388 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 28 Jun 2023 19:54:08 +0100 Subject: [PATCH 1039/1411] Bump peaceiris/actions-gh-pages from 3.9.2 to 3.9.3 (#4455) Bumps [peaceiris/actions-gh-pages](https://github.com/peaceiris/actions-gh-pages) from 3.9.2 to 3.9.3. - [Release notes](https://github.com/peaceiris/actions-gh-pages/releases) - [Changelog](https://github.com/peaceiris/actions-gh-pages/blob/main/CHANGELOG.md) - [Commits](https://github.com/peaceiris/actions-gh-pages/compare/v3.9.2...v3.9.3) --- updated-dependencies: - dependency-name: peaceiris/actions-gh-pages dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index f94071fa9a4d..7e80aea6b978 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -89,7 +89,7 @@ jobs: rm website/build/artifact.tar cp .asf.yaml ./website/build/.asf.yaml - name: Deploy to gh-pages - uses: peaceiris/actions-gh-pages@v3.9.2 + uses: peaceiris/actions-gh-pages@v3.9.3 if: github.event_name == 'push' && github.ref_name == 'master' with: github_token: ${{ secrets.GITHUB_TOKEN }} From 0d4e6a727f113f42d58650d2dbecab89b22d4e28 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 29 Jun 2023 14:47:02 -0400 Subject: [PATCH 1040/1411] Bump actions/labeler from 4.1.0 to 4.2.0 (#4464) Bumps [actions/labeler](https://github.com/actions/labeler) from 4.1.0 to 4.2.0. - [Release notes](https://github.com/actions/labeler/releases) - [Commits](https://github.com/actions/labeler/compare/4.1.0...v4.2.0) --- updated-dependencies: - dependency-name: actions/labeler dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dev_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index c1492580cd39..daa5d6a76c52 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -44,7 +44,7 @@ jobs: github.event_name == 'pull_request_target' && (github.event.action == 'opened' || github.event.action == 'synchronize') - uses: actions/labeler@4.1.0 + uses: actions/labeler@v4.2.0 with: repo-token: ${{ secrets.GITHUB_TOKEN }} configuration-path: .github/workflows/dev_pr/labeler.yml From a11b975d6081106615cee3fbdf8a89a6643271a1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 30 Jun 2023 10:44:41 +0100 Subject: [PATCH 1041/1411] Remove Binary Dictionary Arithmetic Support (#4407) * Remove Binary Dictionary Arithmetic Support * Clippy --- .github/workflows/arrow.yml | 8 +- arrow-arith/Cargo.toml | 4 - arrow-arith/src/arithmetic.rs | 1044 +++------------------------------ arrow/Cargo.toml | 5 +- arrow/README.md | 1 - 5 files changed, 71 insertions(+), 991 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 35e70c8f070c..279e276a7912 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -83,7 +83,7 @@ jobs: - name: Test arrow-ord with all features except SIMD run: cargo test -p arrow-ord --features dyn_cmp_dict - name: Test arrow-arith with all features except SIMD - run: cargo test -p arrow-arith --features dyn_arith_dict + run: cargo test -p arrow-arith - name: Test arrow-row with all features run: cargo test -p arrow-row --all-features - name: Test arrow-integration-test with all features @@ -91,7 +91,7 @@ jobs: - name: Test arrow with default features run: cargo test -p arrow - name: Test arrow with all features apart from simd - run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,dyn_arith_dict,chrono-tz + run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,chrono-tz - name: Run examples run: | # Test arrow examples @@ -209,11 +209,11 @@ jobs: - name: Clippy arrow-ord with all features except SIMD run: cargo clippy -p arrow-ord --all-targets --features dyn_cmp_dict -- -D warnings - name: Clippy arrow-arith with all features except SIMD - run: cargo clippy -p arrow-arith --all-targets --features dyn_arith_dict -- -D warnings + run: cargo clippy -p arrow-arith --all-targets -- -D warnings - name: Clippy arrow-row with all features run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings - name: Clippy arrow with all features except SIMD - run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings + run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,chrono-tz --all-targets -- -D warnings - name: Clippy arrow-integration-test with all features run: cargo clippy -p arrow-integration-test --all-targets --all-features -- -D warnings - name: Clippy arrow-integration-testing with all features diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index 4460d116b466..b5ea2e3c4354 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -44,9 +44,5 @@ num = { version = "0.4", default-features = false, features = 
["std"] } [dev-dependencies] -[package.metadata.docs.rs] -features = ["dyn_arith_dict"] - [features] -dyn_arith_dict = [] simd = ["arrow-array/simd"] diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index c3c5cb864ed2..8e7ab44042cf 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -89,48 +89,6 @@ where math_checked_op(left, right, op) } -/// Helper function for operations where a valid `0` on the right array should -/// result in an [ArrowError::DivideByZero], namely the division and modulo operations -/// -/// # Errors -/// -/// This function errors if: -/// * the arrays have different lengths -/// * there is an element where both left and right values are valid and the right value is `0` -#[cfg(feature = "dyn_arith_dict")] -fn math_checked_divide_op_on_iters( - left: impl Iterator>, - right: impl Iterator>, - op: F, - nulls: Option, -) -> Result, ArrowError> -where - T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> Result, -{ - let buffer = if nulls.is_some() { - let values = left.zip(right).map(|(left, right)| { - if let (Some(l), Some(r)) = (left, right) { - op(l, r) - } else { - Ok(T::default_value()) - } - }); - // Safety: Iterator comes from a PrimitiveArray which reports its size correctly - unsafe { arrow_buffer::Buffer::try_from_trusted_len_iter(values) } - } else { - // no value is null - let values = left - .map(|l| l.unwrap()) - .zip(right.map(|r| r.unwrap())) - .map(|(left, right)| op(left, right)); - // Safety: Iterator comes from a PrimitiveArray which reports its size correctly - unsafe { arrow_buffer::Buffer::try_from_trusted_len_iter(values) } - }?; - - Ok(PrimitiveArray::new(buffer.into(), nulls)) -} - /// Calculates the modulus operation `left % right` on two SIMD inputs. /// The lower-most bits of `valid_mask` specify which vector lanes are considered as valid. /// @@ -358,290 +316,6 @@ where Ok(PrimitiveArray::new(result.into(), nulls)) } -/// Applies $OP to $LEFT and $RIGHT which are two dictionaries which have (the same) key type $KT -#[cfg(feature = "dyn_arith_dict")] -macro_rules! 
typed_dict_op { - ($LEFT: expr, $RIGHT: expr, $OP: expr, $KT: tt, $MATH_OP: ident) => {{ - match ($LEFT.value_type(), $RIGHT.value_type()) { - (DataType::Int8, DataType::Int8) => { - let array = $MATH_OP::<$KT, Int8Type, _>($LEFT, $RIGHT, $OP)?; - Ok(Arc::new(array)) - } - (DataType::Int16, DataType::Int16) => { - let array = $MATH_OP::<$KT, Int16Type, _>($LEFT, $RIGHT, $OP)?; - Ok(Arc::new(array)) - } - (DataType::Int32, DataType::Int32) => { - let array = $MATH_OP::<$KT, Int32Type, _>($LEFT, $RIGHT, $OP)?; - Ok(Arc::new(array)) - } - (DataType::Int64, DataType::Int64) => { - let array = $MATH_OP::<$KT, Int64Type, _>($LEFT, $RIGHT, $OP)?; - Ok(Arc::new(array)) - } - (DataType::UInt8, DataType::UInt8) => { - let array = $MATH_OP::<$KT, UInt8Type, _>($LEFT, $RIGHT, $OP)?; - Ok(Arc::new(array)) - } - (DataType::UInt16, DataType::UInt16) => { - let array = $MATH_OP::<$KT, UInt16Type, _>($LEFT, $RIGHT, $OP)?; - Ok(Arc::new(array)) - } - (DataType::UInt32, DataType::UInt32) => { - let array = $MATH_OP::<$KT, UInt32Type, _>($LEFT, $RIGHT, $OP)?; - Ok(Arc::new(array)) - } - (DataType::UInt64, DataType::UInt64) => { - let array = $MATH_OP::<$KT, UInt64Type, _>($LEFT, $RIGHT, $OP)?; - Ok(Arc::new(array)) - } - (DataType::Float32, DataType::Float32) => { - let array = $MATH_OP::<$KT, Float32Type, _>($LEFT, $RIGHT, $OP)?; - Ok(Arc::new(array)) - } - (DataType::Float64, DataType::Float64) => { - let array = $MATH_OP::<$KT, Float64Type, _>($LEFT, $RIGHT, $OP)?; - Ok(Arc::new(array)) - } - (DataType::Decimal128(_, s1), DataType::Decimal128(_, s2)) if s1 == s2 => { - let array = $MATH_OP::<$KT, Decimal128Type, _>($LEFT, $RIGHT, $OP)?; - Ok(Arc::new(array)) - } - (DataType::Decimal256(_, s1), DataType::Decimal256(_, s2)) if s1 == s2 => { - let array = $MATH_OP::<$KT, Decimal256Type, _>($LEFT, $RIGHT, $OP)?; - Ok(Arc::new(array)) - } - (t1, t2) => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation on two dictionary arrays of different value types ({} and {})", - t1, t2 - ))), - } - }}; -} - -#[cfg(feature = "dyn_arith_dict")] -macro_rules! 
typed_dict_math_op { - // Applies `LEFT OP RIGHT` when `LEFT` and `RIGHT` both are `DictionaryArray` - ($LEFT: expr, $RIGHT: expr, $OP: expr, $MATH_OP: ident) => {{ - match ($LEFT.data_type(), $RIGHT.data_type()) { - (DataType::Dictionary(left_key_type, _), DataType::Dictionary(right_key_type, _))=> { - match (left_key_type.as_ref(), right_key_type.as_ref()) { - (DataType::Int8, DataType::Int8) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_op!(left, right, $OP, Int8Type, $MATH_OP) - } - (DataType::Int16, DataType::Int16) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_op!(left, right, $OP, Int16Type, $MATH_OP) - } - (DataType::Int32, DataType::Int32) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_op!(left, right, $OP, Int32Type, $MATH_OP) - } - (DataType::Int64, DataType::Int64) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_op!(left, right, $OP, Int64Type, $MATH_OP) - } - (DataType::UInt8, DataType::UInt8) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_op!(left, right, $OP, UInt8Type, $MATH_OP) - } - (DataType::UInt16, DataType::UInt16) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_op!(left, right, $OP, UInt16Type, $MATH_OP) - } - (DataType::UInt32, DataType::UInt32) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_op!(left, right, $OP, UInt32Type, $MATH_OP) - } - (DataType::UInt64, DataType::UInt64) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_op!(left, right, $OP, UInt64Type, $MATH_OP) - } - (t1, t2) => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation on two dictionary arrays of different key types ({} and {})", - t1, t2 - ))), - } - } - (t1, t2) => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation on dictionary array with non-dictionary array ({} and {})", - t1, t2 - ))), - } - }}; -} - -#[cfg(not(feature = "dyn_arith_dict"))] -macro_rules! typed_dict_math_op { - // Applies `LEFT OP RIGHT` when `LEFT` and `RIGHT` both are `DictionaryArray` - ($LEFT: expr, $RIGHT: expr, $OP: expr, $MATH_OP: ident) => {{ - Err(ArrowError::CastError(format!( - "Arithmetic on arrays of type {} with array of type {} requires \"dyn_arith_dict\" feature", - $LEFT.data_type(), $RIGHT.data_type() - ))) - }}; -} - -/// Perform given operation on two `DictionaryArray`s. 
-/// Returns an error if the two arrays have different value type -#[cfg(feature = "dyn_arith_dict")] -fn math_op_dict( - left: &DictionaryArray, - right: &DictionaryArray, - op: F, -) -> Result, ArrowError> -where - K: ArrowDictionaryKeyType + ArrowNumericType, - T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> T::Native, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError(format!( - "Cannot perform operation on arrays of different length ({}, {})", - left.len(), - right.len() - ))); - } - - // Safety justification: Since the inputs are valid Arrow arrays, all values are - // valid indexes into the dictionary (which is verified during construction) - - let left_iter = unsafe { - left.values() - .as_any() - .downcast_ref::>() - .unwrap() - .take_iter_unchecked(left.keys_iter()) - }; - - let right_iter = unsafe { - right - .values() - .as_any() - .downcast_ref::>() - .unwrap() - .take_iter_unchecked(right.keys_iter()) - }; - - let result = left_iter - .zip(right_iter) - .map(|(left_value, right_value)| { - if let (Some(left), Some(right)) = (left_value, right_value) { - Some(op(left, right)) - } else { - None - } - }) - .collect(); - - Ok(result) -} - -/// Perform given operation on two `DictionaryArray`s. -/// Returns an error if the two arrays have different value type -#[cfg(feature = "dyn_arith_dict")] -fn math_checked_op_dict( - left: &DictionaryArray, - right: &DictionaryArray, - op: F, -) -> Result, ArrowError> -where - K: ArrowDictionaryKeyType + ArrowNumericType, - T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> Result, -{ - // left and right's value types are supposed to be same as guaranteed by the caller macro now. - if left.value_type() != T::DATA_TYPE { - return Err(ArrowError::NotYetImplemented(format!( - "Cannot perform provided operation on dictionary array of value type {}", - left.value_type() - ))); - } - - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - try_binary(left, right, op) -} - -/// Helper function for operations where a valid `0` on the right array should -/// result in an [ArrowError::DivideByZero], namely the division and modulo operations -/// -/// # Errors -/// -/// This function errors if: -/// * the arrays have different lengths -/// * there is an element where both left and right values are valid and the right value is `0` -#[cfg(feature = "dyn_arith_dict")] -fn math_divide_checked_op_dict( - left: &DictionaryArray, - right: &DictionaryArray, - op: F, -) -> Result, ArrowError> -where - K: ArrowDictionaryKeyType + ArrowNumericType, - T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> Result, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError(format!( - "Cannot perform operation on arrays of different length ({}, {})", - left.len(), - right.len() - ))); - } - - let nulls = arrow_buffer::NullBuffer::union(left.nulls(), right.nulls()); - - // Safety justification: Since the inputs are valid Arrow arrays, all values are - // valid indexes into the dictionary (which is verified during construction) - - let left_iter = unsafe { - left.values() - .as_any() - .downcast_ref::>() - .unwrap() - .take_iter_unchecked(left.keys_iter()) - }; - - let right_iter = unsafe { - right - .values() - .as_any() - .downcast_ref::>() - .unwrap() - .take_iter_unchecked(right.keys_iter()) - }; - - math_checked_divide_op_on_iters(left_iter, right_iter, op, nulls) -} - -#[cfg(feature = "dyn_arith_dict")] -fn math_divide_safe_op_dict( - left: &DictionaryArray, - right: 
&DictionaryArray, - op: F, -) -> Result -where - K: ArrowDictionaryKeyType + ArrowNumericType, - T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> Option, -{ - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - let array: PrimitiveArray = binary_opt::<_, _, _, T>(left, right, op)?; - Ok(Arc::new(array) as ArrayRef) -} - fn math_safe_divide_op( left: &PrimitiveArray, right: &PrimitiveArray, @@ -687,9 +361,6 @@ pub fn add_checked( /// For an overflow-checking variant, use `add_dyn_checked` instead. pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { - DataType::Dictionary(_, _) => { - typed_dict_math_op!(left, right, |a, b| a.add_wrapping(b), math_op_dict) - } DataType::Date32 => { let l = left.as_primitive::(); match right.data_type() { @@ -870,14 +541,6 @@ pub fn add_dyn_checked( right: &dyn Array, ) -> Result { match left.data_type() { - DataType::Dictionary(_, _) => { - typed_dict_math_op!( - left, - right, - |a, b| a.add_checked(b), - math_checked_op_dict - ) - } DataType::Date32 => { let l = left.as_primitive::(); match right.data_type() { @@ -1027,9 +690,6 @@ pub fn subtract_checked( /// For an overflow-checking variant, use `subtract_dyn_checked` instead. pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { - DataType::Dictionary(_, _) => { - typed_dict_math_op!(left, right, |a, b| a.sub_wrapping(b), math_op_dict) - } DataType::Date32 => { let l = left.as_primitive::(); match right.data_type() { @@ -1218,14 +878,6 @@ pub fn subtract_dyn_checked( right: &dyn Array, ) -> Result { match left.data_type() { - DataType::Dictionary(_, _) => { - typed_dict_math_op!( - left, - right, - |a, b| a.sub_checked(b), - math_checked_op_dict - ) - } DataType::Date32 => { let l = left.as_primitive::(); match right.data_type() { @@ -1445,22 +1097,15 @@ pub fn multiply_checked( /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `multiply_dyn_checked` instead. pub fn multiply_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match left.data_type() { - DataType::Dictionary(_, _) => { - typed_dict_math_op!(left, right, |a, b| a.mul_wrapping(b), math_op_dict) - } - _ => { - downcast_primitive_array!( - (left, right) => { - math_op(left, right, |a, b| a.mul_wrapping(b)).map(|a| Arc::new(a) as ArrayRef) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) + downcast_primitive_array!( + (left, right) => { + math_op(left, right, |a, b| a.mul_wrapping(b)).map(|a| Arc::new(a) as ArrayRef) } - } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) } /// Perform `left * right` operation on two arrays. 
If either left or right value is null @@ -1472,37 +1117,15 @@ pub fn multiply_dyn_checked( left: &dyn Array, right: &dyn Array, ) -> Result { - match left.data_type() { - DataType::Dictionary(_, _) => { - typed_dict_math_op!( - left, - right, - |a, b| a.mul_checked(b), - math_checked_op_dict - ) - } - _ => { - downcast_primitive_array!( - (left, right) => { - math_checked_op(left, right, |a, b| a.mul_checked(b)).map(|a| Arc::new(a) as ArrayRef) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) + downcast_primitive_array!( + (left, right) => { + math_checked_op(left, right, |a, b| a.mul_checked(b)).map(|a| Arc::new(a) as ArrayRef) } - } -} - -#[cfg(feature = "dyn_arith_dict")] -fn get_precision_scale(dt: &DataType) -> Result<(u8, i8), ArrowError> { - match dt { - DataType::Decimal128(precision, scale) => Ok((*precision, *scale)), - _ => Err(ArrowError::ComputeError( - "Cannot get precision and scale from non-decimal type".to_string(), - )), - } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) } /// Returns the precision and scale of the result of a multiplication of two decimal types, @@ -1528,7 +1151,6 @@ fn get_fixed_point_info( Ok((precision, product_scale, divisor)) } -#[cfg(feature = "dyn_arith_dict")] /// Perform `left * right` operation on two decimal arrays. If either left or right value is /// null then the result is also null. /// @@ -1549,45 +1171,6 @@ pub fn multiply_fixed_point_dyn( required_scale: i8, ) -> Result { match (left.data_type(), right.data_type()) { - ( - DataType::Dictionary(_, lhs_value_type), - DataType::Dictionary(_, rhs_value_type), - ) if matches!(lhs_value_type.as_ref(), &DataType::Decimal128(_, _)) - && matches!(rhs_value_type.as_ref(), &DataType::Decimal128(_, _)) => - { - downcast_dictionary_array!( - left => match left.values().data_type() { - DataType::Decimal128(_, _) => { - let lhs_precision_scale = get_precision_scale(lhs_value_type.as_ref())?; - let rhs_precision_scale = get_precision_scale(rhs_value_type.as_ref())?; - - let (precision, product_scale, divisor) = get_fixed_point_info(lhs_precision_scale, rhs_precision_scale, required_scale)?; - - let right = as_dictionary_array::<_>(right); - - if required_scale == product_scale { - let mul = multiply_dyn(left, right)?; - let array = mul.as_any().downcast_ref::().unwrap(); - let array = array.clone().with_precision_and_scale(precision, required_scale)?; - return Ok(Arc::new(array)) - } - - let array = math_op_dict::<_, Decimal128Type, _>(left, right, |a, b| { - let a = i256::from_i128(a); - let b = i256::from_i128(b); - - let mut mul = a.wrapping_mul(b); - mul = divide_and_round::(mul, divisor); - mul.as_i128() - }).and_then(|a| a.with_precision_and_scale(precision, required_scale))?; - - Ok(Arc::new(array)) - } - t => unreachable!("Unsupported dictionary value type {}", t), - }, - t => unreachable!("Unsupported data type {}", t), - ) - } (DataType::Decimal128(_, _), DataType::Decimal128(_, _)) => { let left = left.as_any().downcast_ref::().unwrap(); let right = right.as_any().downcast_ref::().unwrap(); @@ -1782,39 +1365,21 @@ pub fn modulus( /// then the result is also null. If any right hand value is zero then the result of this /// operation will be `Err(ArrowError::DivideByZero)`. 
pub fn modulus_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match left.data_type() { - DataType::Dictionary(_, _) => { - typed_dict_math_op!( - left, - right, - |a, b| { - if b.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(a.mod_wrapping(b)) - } - }, - math_divide_checked_op_dict - ) + downcast_primitive_array!( + (left, right) => { + math_checked_divide_op(left, right, |a, b| { + if b.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(a.mod_wrapping(b)) + } + }).map(|a| Arc::new(a) as ArrayRef) } - _ => { - downcast_primitive_array!( - (left, right) => { - math_checked_divide_op(left, right, |a, b| { - if b.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(a.mod_wrapping(b)) - } - }).map(|a| Arc::new(a) as ArrayRef) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) - } - } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) } /// Perform `left / right` operation on two arrays. If either left or right value is null @@ -1869,39 +1434,21 @@ pub fn divide_opt( /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `divide_dyn_checked` instead. pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match left.data_type() { - DataType::Dictionary(_, _) => { - typed_dict_math_op!( - left, - right, - |a, b| { - if b.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(a.div_wrapping(b)) - } - }, - math_divide_checked_op_dict - ) + downcast_primitive_array!( + (left, right) => { + math_checked_divide_op(left, right, |a, b| { + if b.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(a.div_wrapping(b)) + } + }).map(|a| Arc::new(a) as ArrayRef) } - _ => { - downcast_primitive_array!( - (left, right) => { - math_checked_divide_op(left, right, |a, b| { - if b.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(a.div_wrapping(b)) - } - }).map(|a| Arc::new(a) as ArrayRef) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) - } - } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) } /// Perform `left / right` operation on two arrays. If either left or right value is null @@ -1914,27 +1461,15 @@ pub fn divide_dyn_checked( left: &dyn Array, right: &dyn Array, ) -> Result { - match left.data_type() { - DataType::Dictionary(_, _) => { - typed_dict_math_op!( - left, - right, - |a, b| a.div_checked(b), - math_divide_checked_op_dict - ) + downcast_primitive_array!( + (left, right) => { + math_checked_divide_op(left, right, |a, b| a.div_checked(b)).map(|a| Arc::new(a) as ArrayRef) } - _ => { - downcast_primitive_array!( - (left, right) => { - math_checked_divide_op(left, right, |a, b| a.div_checked(b)).map(|a| Arc::new(a) as ArrayRef) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) - } - } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) } /// Perform `left / right` operation on two arrays. 
If either left or right value is null @@ -1950,39 +1485,21 @@ pub fn divide_dyn_opt( left: &dyn Array, right: &dyn Array, ) -> Result { - match left.data_type() { - DataType::Dictionary(_, _) => { - typed_dict_math_op!( - left, - right, - |a, b| { - if b.is_zero() { - None - } else { - Some(a.div_wrapping(b)) - } - }, - math_divide_safe_op_dict - ) + downcast_primitive_array!( + (left, right) => { + math_safe_divide_op(left, right, |a, b| { + if b.is_zero() { + None + } else { + Some(a.div_wrapping(b)) + } + }) } - _ => { - downcast_primitive_array!( - (left, right) => { - math_safe_divide_op(left, right, |a, b| { - if b.is_zero() { - None - } else { - Some(a.div_wrapping(b)) - } - }) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) - } - } + _ => Err(ArrowError::CastError(format!( + "Unsupported data type {}, {}", + left.data_type(), right.data_type() + ))) + ) } /// Perform `left / right` operation on two arrays without checking for @@ -2279,34 +1796,6 @@ mod tests { assert_eq!(17, c.value(4)); } - #[test] - #[cfg(feature = "dyn_arith_dict")] - fn test_primitive_array_add_dyn_dict() { - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(5).unwrap(); - builder.append(6).unwrap(); - builder.append(7).unwrap(); - builder.append(8).unwrap(); - builder.append(9).unwrap(); - let a = builder.finish(); - - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(6).unwrap(); - builder.append(7).unwrap(); - builder.append(8).unwrap(); - builder.append_null(); - builder.append(10).unwrap(); - let b = builder.finish(); - - let c = add_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!(11, c.value(0)); - assert_eq!(13, c.value(1)); - assert_eq!(15, c.value(2)); - assert!(c.is_null(3)); - assert_eq!(19, c.value(4)); - } - #[test] fn test_primitive_array_add_scalar_dyn() { let a = Int32Array::from(vec![Some(5), Some(6), Some(7), None, Some(9)]); @@ -2452,34 +1941,6 @@ mod tests { ); } - #[test] - #[cfg(feature = "dyn_arith_dict")] - fn test_primitive_array_subtract_dyn_dict() { - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(15).unwrap(); - builder.append(8).unwrap(); - builder.append(7).unwrap(); - builder.append(8).unwrap(); - builder.append(20).unwrap(); - let a = builder.finish(); - - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(6).unwrap(); - builder.append(7).unwrap(); - builder.append(8).unwrap(); - builder.append_null(); - builder.append(10).unwrap(); - let b = builder.finish(); - - let c = subtract_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!(9, c.value(0)); - assert_eq!(1, c.value(1)); - assert_eq!(-1, c.value(2)); - assert!(c.is_null(3)); - assert_eq!(10, c.value(4)); - } - #[test] fn test_primitive_array_subtract_scalar_dyn() { let a = Int32Array::from(vec![Some(5), Some(6), Some(7), None, Some(9)]); @@ -2531,34 +1992,6 @@ mod tests { assert_eq!(72, c.value(4)); } - #[test] - #[cfg(feature = "dyn_arith_dict")] - fn test_primitive_array_multiply_dyn_dict() { - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(5).unwrap(); - builder.append(6).unwrap(); - builder.append(7).unwrap(); - builder.append(8).unwrap(); - builder.append(9).unwrap(); - let a = builder.finish(); - - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(6).unwrap(); - builder.append(7).unwrap(); - builder.append(8).unwrap(); - 
builder.append_null(); - builder.append(10).unwrap(); - let b = builder.finish(); - - let c = multiply_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!(30, c.value(0)); - assert_eq!(42, c.value(1)); - assert_eq!(56, c.value(2)); - assert!(c.is_null(3)); - assert_eq!(90, c.value(4)); - } - #[test] fn test_primitive_array_divide_dyn() { let a = Int32Array::from(vec![Some(15), Some(6), Some(1), Some(8), Some(9)]); @@ -2572,34 +2005,6 @@ mod tests { assert_eq!(3, c.value(4)); } - #[test] - #[cfg(feature = "dyn_arith_dict")] - fn test_primitive_array_divide_dyn_dict() { - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(15).unwrap(); - builder.append(6).unwrap(); - builder.append(1).unwrap(); - builder.append(8).unwrap(); - builder.append(9).unwrap(); - let a = builder.finish(); - - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(5).unwrap(); - builder.append(3).unwrap(); - builder.append(1).unwrap(); - builder.append_null(); - builder.append(3).unwrap(); - let b = builder.finish(); - - let c = divide_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!(3, c.value(0)); - assert_eq!(2, c.value(1)); - assert_eq!(1, c.value(2)); - assert!(c.is_null(3)); - assert_eq!(3, c.value(4)); - } - #[test] fn test_primitive_array_multiply_scalar_dyn() { let a = Int32Array::from(vec![Some(5), Some(6), Some(7), None, Some(9)]); @@ -3154,40 +2559,6 @@ mod tests { divide_dyn(&a, &b).unwrap(); } - #[test] - #[should_panic(expected = "DivideByZero")] - #[cfg(feature = "dyn_arith_dict")] - fn test_int_array_divide_dyn_by_zero_dict() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 1); - builder.append(15).unwrap(); - let a = builder.finish(); - - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 1); - builder.append(0).unwrap(); - let b = builder.finish(); - - divide_dyn(&a, &b).unwrap(); - } - - #[test] - #[should_panic(expected = "DivideByZero")] - #[cfg(feature = "dyn_arith_dict")] - fn test_f32_dict_array_divide_dyn_by_zero() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 1); - builder.append(1.5).unwrap(); - let a = builder.finish(); - - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 1); - builder.append(0.0).unwrap(); - let b = builder.finish(); - - divide_dyn(&a, &b).unwrap(); - } - #[test] #[should_panic(expected = "DivideByZero")] fn test_i32_array_modulus_by_zero() { @@ -3449,30 +2820,6 @@ mod tests { overflow.expect_err("overflow should be detected"); } - #[test] - #[cfg(feature = "dyn_arith_dict")] - fn test_dictionary_add_dyn_wrapping_overflow() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(2, 2); - builder.append(i32::MAX).unwrap(); - builder.append(i32::MIN).unwrap(); - let a = builder.finish(); - - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(2, 2); - builder.append(1).unwrap(); - builder.append(1).unwrap(); - let b = builder.finish(); - - let wrapped = add_dyn(&a, &b).unwrap(); - let expected = - Arc::new(Int32Array::from(vec![-2147483648, -2147483647])) as ArrayRef; - assert_eq!(&expected, &wrapped); - - let overflow = add_dyn_checked(&a, &b); - overflow.expect_err("overflow should be detected"); - } - #[test] fn test_primitive_subtract_dyn_wrapping_overflow() { let a = Int32Array::from(vec![-2]); @@ -3486,27 +2833,6 @@ mod tests { overflow.expect_err("overflow should be detected"); } - #[test] - #[cfg(feature = "dyn_arith_dict")] - fn 
test_dictionary_subtract_dyn_wrapping_overflow() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 1); - builder.append(-2).unwrap(); - let a = builder.finish(); - - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 1); - builder.append(i32::MAX).unwrap(); - let b = builder.finish(); - - let wrapped = subtract_dyn(&a, &b).unwrap(); - let expected = Arc::new(Int32Array::from(vec![i32::MAX])) as ArrayRef; - assert_eq!(&expected, &wrapped); - - let overflow = subtract_dyn_checked(&a, &b); - overflow.expect_err("overflow should be detected"); - } - #[test] fn test_primitive_mul_dyn_wrapping_overflow() { let a = Int32Array::from(vec![10]); @@ -3520,27 +2846,6 @@ mod tests { overflow.expect_err("overflow should be detected"); } - #[test] - #[cfg(feature = "dyn_arith_dict")] - fn test_dictionary_mul_dyn_wrapping_overflow() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 1); - builder.append(10).unwrap(); - let a = builder.finish(); - - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 1); - builder.append(i32::MAX).unwrap(); - let b = builder.finish(); - - let wrapped = multiply_dyn(&a, &b).unwrap(); - let expected = Arc::new(Int32Array::from(vec![-10])) as ArrayRef; - assert_eq!(&expected, &wrapped); - - let overflow = multiply_dyn_checked(&a, &b); - overflow.expect_err("overflow should be detected"); - } - #[test] fn test_primitive_div_dyn_wrapping_overflow() { let a = Int32Array::from(vec![i32::MIN]); @@ -3616,51 +2921,6 @@ mod tests { assert_eq!(e, r); } - #[test] - #[cfg(feature = "dyn_arith_dict")] - fn test_dictionary_div_dyn_wrapping_overflow() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 1); - builder.append(i32::MIN).unwrap(); - let a = builder.finish(); - - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 1); - builder.append(-1).unwrap(); - let b = builder.finish(); - - let wrapped = divide_dyn(&a, &b).unwrap(); - let expected = Arc::new(Int32Array::from(vec![-2147483648])) as ArrayRef; - assert_eq!(&expected, &wrapped); - - let overflow = divide_dyn_checked(&a, &b); - overflow.expect_err("overflow should be detected"); - } - - #[test] - #[cfg(feature = "dyn_arith_dict")] - fn test_div_dyn_opt_overflow_division_by_zero() { - let a = Int32Array::from(vec![i32::MIN]); - let b = Int32Array::from(vec![0]); - - let division_by_zero = divide_dyn_opt(&a, &b); - let expected = Arc::new(Int32Array::from(vec![None])) as ArrayRef; - assert_eq!(&expected, &division_by_zero.unwrap()); - - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 1); - builder.append(i32::MIN).unwrap(); - let a = builder.finish(); - - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 1); - builder.append(0).unwrap(); - let b = builder.finish(); - - let division_by_zero = divide_dyn_opt(&a, &b); - assert_eq!(&expected, &division_by_zero.unwrap()); - } - #[test] fn test_div_scalar_dyn_opt_overflow_division_by_zero() { let a = Int32Array::from(vec![i32::MIN]); @@ -3802,74 +3062,6 @@ mod tests { let _ = overflow.unwrap().expect_err("overflow should be detected"); } - #[test] - #[cfg(feature = "dyn_arith_dict")] - fn test_dict_decimal() { - let values = Decimal128Array::from_iter_values([0, 1, 2, 3, 4, 5]); - let keys = Int8Array::from_iter_values([1_i8, 2, 5, 4, 3, 0]); - let array1 = DictionaryArray::new(keys, Arc::new(values)); - - let values = Decimal128Array::from_iter_values([7, -3, 4, 3, 5]); - let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); - let 
array2 = DictionaryArray::new(keys, Arc::new(values)); - - let result = add_dyn(&array1, &array2).unwrap(); - let expected = - Arc::new(Decimal128Array::from(vec![8, 9, 2, 8, 6, 5])) as ArrayRef; - assert_eq!(&result, &expected); - - let result = subtract_dyn(&array1, &array2).unwrap(); - let expected = - Arc::new(Decimal128Array::from(vec![-6, -5, 8, 0, 0, -5])) as ArrayRef; - assert_eq!(&result, &expected); - - let values = Decimal256Array::from_iter_values([ - i256::from_i128(0), - i256::from_i128(1), - i256::from_i128(2), - i256::from_i128(3), - i256::from_i128(4), - i256::from_i128(5), - ]); - let keys = - Int8Array::from(vec![Some(1_i8), None, Some(5), Some(4), Some(3), None]); - let array1 = DictionaryArray::new(keys, Arc::new(values)); - - let values = Decimal256Array::from_iter_values([ - i256::from_i128(7), - i256::from_i128(-3), - i256::from_i128(4), - i256::from_i128(3), - i256::from_i128(5), - ]); - let keys = - Int8Array::from(vec![Some(0_i8), Some(0), None, Some(2), Some(3), Some(4)]); - let array2 = DictionaryArray::new(keys, Arc::new(values)); - - let result = add_dyn(&array1, &array2).unwrap(); - let expected = Arc::new(Decimal256Array::from(vec![ - Some(i256::from_i128(8)), - None, - None, - Some(i256::from_i128(8)), - Some(i256::from_i128(6)), - None, - ])) as ArrayRef; - - assert_eq!(&result, &expected); - - let result = subtract_dyn(&array1, &array2).unwrap(); - let expected = Arc::new(Decimal256Array::from(vec![ - Some(i256::from_i128(-6)), - None, - None, - Some(i256::from_i128(0)), - Some(i256::from_i128(0)), - None, - ])) as ArrayRef; - assert_eq!(&result, &expected); - } - #[test] fn test_decimal_add_scalar_dyn() { let a = Decimal128Array::from(vec![100, 210, 320]) @@ -4047,110 +3239,6 @@ mod tests { ); } - #[test] - #[cfg(feature = "dyn_arith_dict")] - fn test_decimal_multiply_fixed_point_dyn() { - // [123456789] - let a = Decimal128Array::from(vec![123456789000000000000000000]) - .with_precision_and_scale(38, 18) - .unwrap(); - - // [10] - let b = Decimal128Array::from(vec![10000000000000000000]) - .with_precision_and_scale(38, 18) - .unwrap(); - - // Avoid overflow by reducing the scale. 
- let result = multiply_fixed_point_dyn(&a, &b, 28).unwrap(); - // [1234567890] - let expected = Arc::new( - Decimal128Array::from(vec![12345678900000000000000000000000000000]) - .with_precision_and_scale(38, 28) - .unwrap(), - ) as ArrayRef; - - assert_eq!(&expected, &result); - assert_eq!( - result.as_primitive::().value_as_string(0), - "1234567890.0000000000000000000000000000" - ); - - // [123456789, 10] - let a = Decimal128Array::from(vec![ - 123456789000000000000000000, - 10000000000000000000, - ]) - .with_precision_and_scale(38, 18) - .unwrap(); - - // [10, 123456789, 12] - let b = Decimal128Array::from(vec![ - 10000000000000000000, - 123456789000000000000000000, - 12000000000000000000, - ]) - .with_precision_and_scale(38, 18) - .unwrap(); - - let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(1), None]); - let array1 = DictionaryArray::new(keys, Arc::new(a)); - let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(2), None]); - let array2 = DictionaryArray::new(keys, Arc::new(b)); - - let result = multiply_fixed_point_dyn(&array1, &array2, 28).unwrap(); - let expected = Arc::new( - Decimal128Array::from(vec![ - Some(12345678900000000000000000000000000000), - Some(12345678900000000000000000000000000000), - Some(1200000000000000000000000000000), - None, - ]) - .with_precision_and_scale(38, 28) - .unwrap(), - ) as ArrayRef; - - assert_eq!(&expected, &result); - assert_eq!( - result.as_primitive::().value_as_string(0), - "1234567890.0000000000000000000000000000" - ); - assert_eq!( - result.as_primitive::().value_as_string(1), - "1234567890.0000000000000000000000000000" - ); - assert_eq!( - result.as_primitive::().value_as_string(2), - "120.0000000000000000000000000000" - ); - - // Required scale is same as the product of the input scales. Behavior is same as multiply_dyn. 
- let a = Decimal128Array::from(vec![123, 100]) - .with_precision_and_scale(3, 2) - .unwrap(); - - let b = Decimal128Array::from(vec![100, 123, 120]) - .with_precision_and_scale(3, 2) - .unwrap(); - - let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(1), None]); - let array1 = DictionaryArray::new(keys, Arc::new(a)); - let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(2), None]); - let array2 = DictionaryArray::new(keys, Arc::new(b)); - - let result = multiply_fixed_point_dyn(&array1, &array2, 4).unwrap(); - let expected = multiply_dyn(&array1, &array2).unwrap(); - let expected = Arc::new( - expected - .as_any() - .downcast_ref::() - .unwrap() - .clone() - .with_precision_and_scale(7, 4) - .unwrap(), - ) as ArrayRef; - assert_eq!(&expected, &result); - } - #[test] fn test_timestamp_second_add_interval() { // timestamp second + interval year month diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 998d077fa105..bc126a2f4c2d 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -63,7 +63,7 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" pyo3 = { version = "0.19", default-features = false, optional = true } [package.metadata.docs.rs] -features = ["prettyprint", "ipc_compression", "dyn_cmp_dict", "dyn_arith_dict", "ffi", "pyarrow"] +features = ["prettyprint", "ipc_compression", "dyn_cmp_dict", "ffi", "pyarrow"] [features] default = ["csv", "ipc", "json"] @@ -88,9 +88,6 @@ ffi = ["arrow-schema/ffi", "arrow-data/ffi"] # Enable dyn-comparison of dictionary arrays with other arrays # Note: this does not impact comparison against scalars dyn_cmp_dict = ["arrow-string/dyn_cmp_dict", "arrow-ord/dyn_cmp_dict"] -# Enable dyn-arithmetic kernels for dictionary arrays -# Note: this does not impact arithmetic with scalars -dyn_arith_dict = ["arrow-arith/dyn_arith_dict"] chrono-tz = ["arrow-array/chrono-tz"] [dev-dependencies] diff --git a/arrow/README.md b/arrow/README.md index 4d5206cba0a6..adf32ff61bf2 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -53,7 +53,6 @@ The `arrow` crate provides the following features which may be enabled in your ` - `ffi` - bindings for the Arrow C [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) - `pyarrow` - bindings for pyo3 to call arrow-rs from python - `dyn_cmp_dict` - enables comparison of dictionary arrays within dyn comparison kernels -- `dyn_arith_dict` - enables arithmetic on dictionary arrays within dyn arithmetic kernels ## Arrow Feature Status From 6667646660fb0f3248b3cb02e77d89f1ac4f3cf2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 30 Jun 2023 07:15:21 -0400 Subject: [PATCH 1042/1411] Docs: Add clearer API doc links (#4461) * Docs: Add clearer API doc links * prettier --- README.md | 16 ++++++++-------- arrow-flight/README.md | 4 ++++ arrow/README.md | 4 +++- parquet/README.md | 4 +++- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 53220620a304..c3108917e87a 100644 --- a/README.md +++ b/README.md @@ -25,14 +25,14 @@ Welcome to the implementation of Arrow, the popular in-memory columnar format, i This repo contains the following main components: -| Crate | Description | Documentation | -| ------------ | ------------------------------------------------------------------------- | ------------------------------ | -| arrow | Core functionality (memory layout, arrays, low level computations) | [(README)][arrow-readme] | -| parquet | Support for Parquet columnar file format | [(README)][parquet-readme] | -| arrow-flight | 
Support for Arrow-Flight IPC protocol | [(README)][flight-readme] |
-| object-store | Support for object store interactions (aws, azure, gcp, local, in-memory) | [(README)][objectstore-readme] |
-
-See the list of all crates in this repo and their rustdocs [here](https://arrow.apache.org/rust).
+| Crate | Description | Latest API Docs | README |
+| ------------ | ------------------------------------------------------------------------- | ---------------------------------------------- | ------------------------------ |
+| arrow | Core functionality (memory layout, arrays, low level computations) | [docs.rs](https://docs.rs/arrow/latest) | [(README)][arrow-readme] |
+| parquet | Support for Parquet columnar file format | [docs.rs](https://docs.rs/parquet/latest) | [(README)][parquet-readme] |
+| arrow-flight | Support for Arrow-Flight IPC protocol | [docs.rs](https://docs.rs/arrow-flight/latest) | [(README)][flight-readme] |
+| object-store | Support for object store interactions (aws, azure, gcp, local, in-memory) | [docs.rs](https://docs.rs/object_store/latest) | [(README)][objectstore-readme] |
+
+The current development version of the API documentation in this repo can be found [here](https://arrow.apache.org/rust).
 
 There are two related crates in a different repository
 
diff --git a/arrow-flight/README.md b/arrow-flight/README.md
index d4fddba70b7c..9194b209fe72 100644
--- a/arrow-flight/README.md
+++ b/arrow-flight/README.md
@@ -21,6 +21,10 @@
 
 [![Crates.io](https://img.shields.io/crates/v/arrow-flight.svg)](https://crates.io/crates/arrow-flight)
 
+See the [API documentation](https://docs.rs/arrow_flight/latest) for examples and the full API.
+
+The API documentation for the most recent, unreleased code is available [here](https://arrow.apache.org/rust/arrow_flight/index.html).
+
 ## Usage
 
 Add this to your Cargo.toml:
diff --git a/arrow/README.md b/arrow/README.md
index adf32ff61bf2..fb2119e3bc15 100644
--- a/arrow/README.md
+++ b/arrow/README.md
@@ -24,9 +24,11 @@
 This crate contains the official Native Rust implementation of
 [Apache Arrow][arrow] in memory format, governed by the Apache Software Foundation.
 
-The [crate documentation](https://arrow.apache.org/rust/arrow/index.html) contains examples and full API.
+The [API documentation](https://docs.rs/arrow/latest) contains examples and full API.
 There are several [examples](https://github.com/apache/arrow-rs/tree/master/arrow/examples) to start from as well.
 
+The API documentation for the most recent, unreleased code is available [here](https://arrow.apache.org/rust/arrow/index.html).
+
 ## Rust Version Compatibility
 
 This crate is tested with the latest stable version of Rust. We do not currently test against other, older versions.
 
diff --git a/parquet/README.md b/parquet/README.md
index 19f34fd877fa..86c7ee2c35d0 100644
--- a/parquet/README.md
+++ b/parquet/README.md
@@ -24,7 +24,9 @@
 This crate contains the official Native Rust implementation of
 [Apache Parquet](https://parquet.apache.org/), which is part of
 the [Apache Arrow](https://arrow.apache.org/) project.
 
-See [crate documentation](https://arrow.apache.org/rust/parquet/index.html) for examples and the full API.
+See the [API documentation](https://docs.rs/parquet/latest) for examples and the full API.
+
+The API documentation for the most recent, unreleased code is available [here](https://arrow.apache.org/rust/parquet/index.html).
## Rust Version Compatibility From 3354a4c5d5b5b6fd3ec55c49fd2f0930b935d07e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 30 Jun 2023 12:45:16 +0100 Subject: [PATCH 1043/1411] Add DictionaryArray::occupancy (#4415) --- arrow-array/Cargo.toml | 5 ++ arrow-array/benches/occupancy.rs | 57 +++++++++++++++++++++++ arrow-array/src/array/dictionary_array.rs | 48 ++++++++++++++++++- 3 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 arrow-array/benches/occupancy.rs diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index f2703bb6fca0..d4f0f9fa0d47 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -56,5 +56,10 @@ simd = ["packed_simd"] [dev-dependencies] rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } +criterion = { version = "0.5", default-features = false } [build-dependencies] + +[[bench]] +name = "occupancy" +harness = false diff --git a/arrow-array/benches/occupancy.rs b/arrow-array/benches/occupancy.rs new file mode 100644 index 000000000000..ed4b94351c28 --- /dev/null +++ b/arrow-array/benches/occupancy.rs @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow_array::types::Int32Type; +use arrow_array::{DictionaryArray, Int32Array}; +use arrow_buffer::NullBuffer; +use criterion::*; +use rand::{thread_rng, Rng}; +use std::sync::Arc; + +fn gen_dict( + len: usize, + values_len: usize, + occupancy: f64, + null_percent: f64, +) -> DictionaryArray { + let mut rng = thread_rng(); + let values = Int32Array::from(vec![0; values_len]); + let max_key = (values_len as f64 * occupancy) as i32; + let keys = (0..len).map(|_| rng.gen_range(0..max_key)).collect(); + let nulls = (0..len).map(|_| !rng.gen_bool(null_percent)).collect(); + + let keys = Int32Array::new(keys, Some(NullBuffer::new(nulls))); + DictionaryArray::new(keys, Arc::new(values)) +} + +fn criterion_benchmark(c: &mut Criterion) { + for values in [10, 100, 512] { + for occupancy in [1., 0.5, 0.1] { + for null_percent in [0.0, 0.1, 0.5, 0.9] { + let dict = gen_dict(1024, values, occupancy, null_percent); + c.bench_function(&format!("occupancy(values: {values}, occupancy: {occupancy}, null_percent: {null_percent})"), |b| { + b.iter(|| { + black_box(&dict).occupancy() + }); + }); + } + } + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index b9112d103a89..5a2f439a8e0f 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -23,8 +23,9 @@ use crate::{ make_array, Array, ArrayAccessor, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType, PrimitiveArray, StringArray, }; +use arrow_buffer::bit_util::set_bit; use arrow_buffer::buffer::NullBuffer; -use arrow_buffer::ArrowNativeType; +use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use std::any::Any; @@ -549,6 +550,29 @@ impl DictionaryArray { .for_each(|v| *v = op(*v)); Ok(builder.finish()) } + + /// Computes an occupancy mask for this dictionary's values + /// + /// For each value in [`Self::values`] the corresponding bit will be set in the + /// returned mask if it is referenced by a key in this [`DictionaryArray`] + pub fn occupancy(&self) -> BooleanBuffer { + let len = self.values.len(); + let mut builder = BooleanBufferBuilder::new(len); + builder.resize(len); + let slice = builder.as_slice_mut(); + match self.keys.nulls().filter(|n| n.null_count() > 0) { + Some(n) => { + let v = self.keys.values(); + n.valid_indices() + .for_each(|idx| set_bit(slice, v[idx].as_usize())) + } + None => { + let v = self.keys.values(); + v.iter().for_each(|v| set_bit(slice, v.as_usize())) + } + } + builder.finish() + } } /// Constructs a `DictionaryArray` from an array data reference. 
@@ -1207,4 +1231,26 @@ mod tests { let expected = DictionaryArray::new(keys, Arc::new(values)); assert_eq!(expected, returned); } + + #[test] + fn test_occupancy() { + let keys = Int32Array::new((100..200).collect(), None); + let values = Int32Array::from(vec![0; 1024]); + let dict = DictionaryArray::new(keys, Arc::new(values)); + for (idx, v) in dict.occupancy().iter().enumerate() { + let expected = (100..200).contains(&idx); + assert_eq!(v, expected, "{idx}"); + } + + let keys = Int32Array::new( + (0..100).collect(), + Some((0..100).map(|x| x % 4 == 0).collect()), + ); + let values = Int32Array::from(vec![0; 1024]); + let dict = DictionaryArray::new(keys, Arc::new(values)); + for (idx, v) in dict.occupancy().iter().enumerate() { + let expected = idx % 4 == 0 && idx < 100; + assert_eq!(v, expected, "{idx}"); + } + } } From 11758dfadfee3fcc167f60a458dac136fa3abd58 Mon Sep 17 00:00:00 2001 From: Andre Martins <38951957+amartins23@users.noreply.github.com> Date: Fri, 30 Jun 2023 13:08:01 +0100 Subject: [PATCH 1044/1411] feat(flight-sql): Allow implementations of FlightSqlService to handle custom actions and commands (#4463) * feat(flight-sql): Allow implementations of FlightSqlService to handle custom actions and commands * rust fmt * fix missing awaits/warnings --- arrow-flight/src/sql/server.rs | 59 +++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index a33b5b92d01e..f599fbca46a5 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -263,6 +263,18 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { // do_put + /// Implementors may override to handle additional calls to do_put() + async fn do_put_fallback( + &self, + _request: Request>, + message: Any, + ) -> Result::DoPutStream>, Status> { + Err(Status::unimplemented(format!( + "do_put: The defined request is invalid: {}", + message.type_url + ))) + } + /// Execute an update SQL statement. async fn do_put_statement_update( &self, @@ -293,6 +305,22 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { // do_action + /// Implementors may override to handle additional calls to do_action() + async fn do_action_fallback( + &self, + request: Request, + ) -> Result::DoActionStream>, Status> { + Err(Status::invalid_argument(format!( + "do_action: The defined request is invalid: {:?}", + request.get_ref().r#type + ))) + } + + /// Add custom actions to list_actions() result + async fn list_custom_actions(&self) -> Option>> { + None + } + /// Create a prepared statement from given SQL statement. async fn do_action_create_prepared_statement( &self, @@ -349,6 +377,16 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { request: Request, ) -> Result; + /// do_exchange + + /// Implementors may override to handle additional calls to do_exchange() + async fn do_exchange_fallback( + &self, + _request: Request>, + ) -> Result::DoExchangeStream>, Status> { + Err(Status::unimplemented("Not yet implemented")) + } + /// Register a new SqlInfo result, making it available when calling GetSqlInfo. 
async fn register_sql_info(&self, id: i32, result: &SqlInfo); } @@ -537,10 +575,7 @@ where })]); Ok(Response::new(Box::pin(output))) } - cmd => Err(Status::invalid_argument(format!( - "do_put: The defined request is invalid: {}", - cmd.type_url() - ))), + cmd => self.do_put_fallback(request, cmd.into_any()).await, } } @@ -605,7 +640,7 @@ where Response Message: ActionCancelQueryResult" .into(), }; - let actions: Vec> = vec![ + let mut actions: Vec> = vec![ Ok(create_prepared_statement_action_type), Ok(close_prepared_statement_action_type), Ok(create_prepared_substrait_plan_action_type), @@ -615,6 +650,11 @@ where Ok(end_savepoint_action_type), Ok(cancel_query_action_type), ]; + + if let Some(mut custom_actions) = self.list_custom_actions().await { + actions.append(&mut custom_actions); + } + let output = futures::stream::iter(actions); Ok(Response::new(Box::pin(output) as Self::ListActionsStream)) } @@ -751,17 +791,14 @@ where return Ok(Response::new(Box::pin(output))); } - Err(Status::invalid_argument(format!( - "do_action: The defined request is invalid: {:?}", - request.get_ref().r#type - ))) + self.do_action_fallback(request).await } async fn do_exchange( &self, - _request: Request>, + request: Request>, ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) + self.do_exchange_fallback(request).await } } From 946890548127198f4afeebf637803ecce202e0e5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 30 Jun 2023 15:33:41 +0100 Subject: [PATCH 1045/1411] Revert ParquetError: PartialEq (#4469) * Revert ParquetError: PartialEq * Review feedback --- parquet/src/errors.rs | 20 +++----------------- parquet/src/file/serialized_reader.rs | 4 +++- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index f9e3d17c92dd..0b70266b3012 100644 --- a/parquet/src/errors.rs +++ b/parquet/src/errors.rs @@ -23,6 +23,9 @@ use std::{cell, io, result, str}; #[cfg(feature = "arrow")] use arrow_schema::ArrowError; +/// Parquet error enumeration +// Note: we don't implement PartialEq as the semantics for the +// external variant are not well defined (#4469) #[derive(Debug)] pub enum ParquetError { /// General Parquet error. 
@@ -44,23 +47,6 @@ pub enum ParquetError { External(Box), } -impl PartialEq for ParquetError { - fn eq(&self, other: &Self) -> bool { - match (self, other) { - (Self::General(l0), Self::General(r0)) => l0 == r0, - (Self::NYI(l0), Self::NYI(r0)) => l0 == r0, - (Self::EOF(l0), Self::EOF(r0)) => l0 == r0, - #[cfg(feature = "arrow")] - (Self::ArrowError(l0), Self::ArrowError(r0)) => l0 == r0, - (Self::IndexOutOfBound(l0, l1), Self::IndexOutOfBound(r0, r1)) => { - l0 == r0 && l1 == r1 - } - (Self::External(l0), Self::External(r0)) => l0.to_string() == r0.to_string(), - _ => false, - } - } -} - impl std::fmt::Display for ParquetError { fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { match &self { diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index d0e5420a1030..f685f14bd92f 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -810,7 +810,9 @@ mod tests { let file_iter = read_from_file.get_row_iter(None).unwrap(); let cursor_iter = read_from_cursor.get_row_iter(None).unwrap(); - assert!(file_iter.eq(cursor_iter)); + for (a, b) in file_iter.zip(cursor_iter) { + assert_eq!(a.unwrap(), b.unwrap()) + } } #[test] From d7fa775cf76c7cd54c6d2a86542115599d8f53ee Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 30 Jun 2023 15:45:52 +0100 Subject: [PATCH 1046/1411] Append Row to Rows (#4466) (#4470) * Append Row to Rows (#4466) * Tweak docs * Pass slices to encode * Clippy --- arrow-row/src/dictionary.rs | 22 ++++---- arrow-row/src/fixed.rs | 20 ++++--- arrow-row/src/lib.rs | 110 ++++++++++++++++++++++++++++-------- arrow-row/src/list.rs | 10 ++-- arrow-row/src/variable.rs | 9 +-- 5 files changed, 118 insertions(+), 53 deletions(-) diff --git a/arrow-row/src/dictionary.rs b/arrow-row/src/dictionary.rs index d790d951ee3a..6c3ee9e18ced 100644 --- a/arrow-row/src/dictionary.rs +++ b/arrow-row/src/dictionary.rs @@ -58,18 +58,19 @@ pub fn compute_dictionary_mapping( /// Encode dictionary values not preserving the dictionary encoding pub fn encode_dictionary_values( - out: &mut Rows, + data: &mut [u8], + offsets: &mut [usize], column: &DictionaryArray, values: &Rows, null: &Row<'_>, ) { - for (offset, k) in out.offsets.iter_mut().skip(1).zip(column.keys()) { + for (offset, k) in offsets.iter_mut().skip(1).zip(column.keys()) { let row = match k { Some(k) => values.row(k.as_usize()).data, None => null.data, }; let end_offset = *offset + row.len(); - out.buffer[*offset..end_offset].copy_from_slice(row); + data[*offset..end_offset].copy_from_slice(row); *offset = end_offset; } } @@ -79,27 +80,26 @@ pub fn encode_dictionary_values( /// - single `0_u8` if null /// - the bytes of the corresponding normalized key including the null terminator pub fn encode_dictionary( - out: &mut Rows, + data: &mut [u8], + offsets: &mut [usize], column: &DictionaryArray, normalized_keys: &[Option<&[u8]>], opts: SortOptions, ) { - for (offset, k) in out.offsets.iter_mut().skip(1).zip(column.keys()) { + for (offset, k) in offsets.iter_mut().skip(1).zip(column.keys()) { match k.and_then(|k| normalized_keys[k.as_usize()]) { Some(normalized_key) => { let end_offset = *offset + 1 + normalized_key.len(); - out.buffer[*offset] = 1; - out.buffer[*offset + 1..end_offset].copy_from_slice(normalized_key); + data[*offset] = 1; + data[*offset + 1..end_offset].copy_from_slice(normalized_key); // Negate if descending if opts.descending { - out.buffer[*offset..end_offset] - .iter_mut() - 
.for_each(|v| *v = !*v) + data[*offset..end_offset].iter_mut().for_each(|v| *v = !*v) } *offset = end_offset; } None => { - out.buffer[*offset] = null_sentinel(opts); + data[*offset] = null_sentinel(opts); *offset += 1; } } diff --git a/arrow-row/src/fixed.rs b/arrow-row/src/fixed.rs index d4b82c2a3989..831105bd5f15 100644 --- a/arrow-row/src/fixed.rs +++ b/arrow-row/src/fixed.rs @@ -16,7 +16,7 @@ // under the License. use crate::array::PrimitiveArray; -use crate::{null_sentinel, Rows}; +use crate::null_sentinel; use arrow_array::builder::BufferBuilder; use arrow_array::{ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray}; use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; @@ -177,14 +177,15 @@ where /// - 1 byte `0` if null or `1` if valid /// - bytes of [`FixedLengthEncoding`] pub fn encode>>( - out: &mut Rows, + data: &mut [u8], + offsets: &mut [usize], i: I, opts: SortOptions, ) { - for (offset, maybe_val) in out.offsets.iter_mut().skip(1).zip(i) { + for (offset, maybe_val) in offsets.iter_mut().skip(1).zip(i) { let end_offset = *offset + T::ENCODED_LEN; if let Some(val) = maybe_val { - let to_write = &mut out.buffer[*offset..end_offset]; + let to_write = &mut data[*offset..end_offset]; to_write[0] = 1; let mut encoded = val.encode(); if opts.descending { @@ -193,22 +194,23 @@ pub fn encode>>( } to_write[1..].copy_from_slice(encoded.as_ref()) } else { - out.buffer[*offset] = null_sentinel(opts); + data[*offset] = null_sentinel(opts); } *offset = end_offset; } } pub fn encode_fixed_size_binary( - out: &mut Rows, + data: &mut [u8], + offsets: &mut [usize], array: &FixedSizeBinaryArray, opts: SortOptions, ) { let len = array.value_length() as usize; - for (offset, maybe_val) in out.offsets.iter_mut().skip(1).zip(array.iter()) { + for (offset, maybe_val) in offsets.iter_mut().skip(1).zip(array.iter()) { let end_offset = *offset + len + 1; if let Some(val) = maybe_val { - let to_write = &mut out.buffer[*offset..end_offset]; + let to_write = &mut data[*offset..end_offset]; to_write[0] = 1; to_write[1..].copy_from_slice(&val[..len]); if opts.descending { @@ -216,7 +218,7 @@ pub fn encode_fixed_size_binary( to_write[1..1 + len].iter_mut().for_each(|v| *v = !*v) } } else { - out.buffer[*offset] = null_sentinel(opts); + data[*offset] = null_sentinel(opts); } *offset = end_offset; } diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 5b9a1bb88078..e8c5ff708d55 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -458,7 +458,7 @@ impl Codec { let nulls = converter.convert_columns(&[null_array])?; let owned = OwnedRow { - data: nulls.buffer, + data: nulls.buffer.into(), config: nulls.config, }; Ok(Self::DictionaryValues(converter, owned)) @@ -496,7 +496,7 @@ impl Codec { let nulls = converter.convert_columns(&nulls)?; let owned = OwnedRow { - data: nulls.buffer, + data: nulls.buffer.into(), config: nulls.config, }; @@ -715,7 +715,13 @@ impl RowConverter { columns.iter().zip(self.fields.iter()).zip(encoders) { // We encode a column at a time to minimise dispatch overheads - encode_column(&mut rows, column.as_ref(), field.options, &encoder) + encode_column( + &mut rows.buffer, + &mut rows.offsets, + column.as_ref(), + field.options, + &encoder, + ) } if cfg!(debug_assertions) { @@ -756,6 +762,48 @@ impl RowConverter { unsafe { self.convert_raw(&mut rows, validate_utf8) } } + /// Returns an empty [`Rows`] with capacity for `row_capacity` rows with + /// a total length of `data_capacity` + /// + /// This can be used to buffer a selection of [`Row`] + 
/// + /// ``` + /// # use std::sync::Arc; + /// # use std::collections::HashSet; + /// # use arrow_array::cast::AsArray; + /// # use arrow_array::StringArray; + /// # use arrow_row::{Row, RowConverter, SortField}; + /// # use arrow_schema::DataType; + /// # + /// let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap(); + /// let array = StringArray::from(vec!["hello", "world", "a", "a", "hello"]); + /// + /// // Convert to row format and deduplicate + /// let converted = converter.convert_columns(&[Arc::new(array)]).unwrap(); + /// let mut distinct_rows = converter.empty_rows(3, 100); + /// let mut dedup: HashSet = HashSet::with_capacity(3); + /// converted.iter().filter(|row| dedup.insert(*row)).for_each(|row| distinct_rows.push(row)); + /// + /// // Note: we could skip buffering and feed the filtered iterator directly + /// // into convert_rows, this is done for demonstration purposes only + /// let distinct = converter.convert_rows(&distinct_rows).unwrap(); + /// let values: Vec<_> = distinct[0].as_string::().iter().map(Option::unwrap).collect(); + /// assert_eq!(&values, &["hello", "world", "a"]); + /// ``` + pub fn empty_rows(&self, row_capacity: usize, data_capacity: usize) -> Rows { + let mut offsets = Vec::with_capacity(row_capacity.saturating_add(1)); + offsets.push(0); + + Rows { + offsets, + buffer: Vec::with_capacity(data_capacity), + config: RowConfig { + fields: self.fields.clone(), + validate_utf8: false, + }, + } + } + /// Convert raw bytes into [`ArrayRef`] /// /// # Safety @@ -832,14 +880,25 @@ struct RowConfig { #[derive(Debug)] pub struct Rows { /// Underlying row bytes - buffer: Box<[u8]>, + buffer: Vec, /// Row `i` has data `&buffer[offsets[i]..offsets[i+1]]` - offsets: Box<[usize]>, + offsets: Vec, /// The config for these rows config: RowConfig, } impl Rows { + /// Append a [`Row`] to this [`Rows`] + pub fn push(&mut self, row: Row<'_>) { + assert!( + Arc::ptr_eq(&row.config.fields, &self.config.fields), + "row was not produced by this RowConverter" + ); + self.config.validate_utf8 |= row.config.validate_utf8; + self.buffer.extend_from_slice(row.data); + self.offsets.push(self.buffer.len()) + } + pub fn row(&self, row: usize) -> Row<'_> { let end = self.offsets[row + 1]; let start = self.offsets[row]; @@ -1171,15 +1230,16 @@ fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig) -> let buffer = vec![0_u8; cur_offset]; Rows { - buffer: buffer.into(), - offsets: offsets.into(), + buffer, + offsets, config, } } /// Encodes a column to the provided [`Rows`] incrementing the offsets as it progresses fn encode_column( - out: &mut Rows, + data: &mut [u8], + offsets: &mut [usize], column: &dyn Array, opts: SortOptions, encoder: &Encoder<'_>, @@ -1187,22 +1247,22 @@ fn encode_column( match encoder { Encoder::Stateless => { downcast_primitive_array! 
{ - column => fixed::encode(out, column, opts), + column => fixed::encode(data, offsets, column, opts), DataType::Null => {} - DataType::Boolean => fixed::encode(out, column.as_boolean(), opts), + DataType::Boolean => fixed::encode(data, offsets, column.as_boolean(), opts), DataType::Binary => { - variable::encode(out, as_generic_binary_array::(column).iter(), opts) + variable::encode(data, offsets, as_generic_binary_array::(column).iter(), opts) } DataType::LargeBinary => { - variable::encode(out, as_generic_binary_array::(column).iter(), opts) + variable::encode(data, offsets, as_generic_binary_array::(column).iter(), opts) } DataType::Utf8 => variable::encode( - out, + data, offsets, column.as_string::().iter().map(|x| x.map(|x| x.as_bytes())), opts, ), DataType::LargeUtf8 => variable::encode( - out, + data, offsets, column.as_string::() .iter() .map(|x| x.map(|x| x.as_bytes())), @@ -1210,27 +1270,27 @@ fn encode_column( ), DataType::FixedSizeBinary(_) => { let array = column.as_any().downcast_ref().unwrap(); - fixed::encode_fixed_size_binary(out, array, opts) + fixed::encode_fixed_size_binary(data, offsets, array, opts) } _ => unreachable!(), } } Encoder::Dictionary(dict) => { downcast_dictionary_array! { - column => encode_dictionary(out, column, dict, opts), + column => encode_dictionary(data, offsets, column, dict, opts), _ => unreachable!() } } Encoder::DictionaryValues(values, nulls) => { downcast_dictionary_array! { - column => encode_dictionary_values(out, column, values, nulls), + column => encode_dictionary_values(data, offsets, column, values, nulls), _ => unreachable!() } } Encoder::Struct(rows, null) => { let array = as_struct_array(column); let null_sentinel = null_sentinel(opts); - out.offsets + offsets .iter_mut() .skip(1) .enumerate() @@ -1240,15 +1300,17 @@ fn encode_column( false => (*null, null_sentinel), }; let end_offset = *offset + 1 + row.as_ref().len(); - out.buffer[*offset] = sentinel; - out.buffer[*offset + 1..end_offset].copy_from_slice(row.as_ref()); + data[*offset] = sentinel; + data[*offset + 1..end_offset].copy_from_slice(row.as_ref()); *offset = end_offset; }) } Encoder::List(rows) => match column.data_type() { - DataType::List(_) => list::encode(out, rows, opts, as_list_array(column)), + DataType::List(_) => { + list::encode(data, offsets, rows, opts, as_list_array(column)) + } DataType::LargeList(_) => { - list::encode(out, rows, opts, as_large_list_array(column)) + list::encode(data, offsets, rows, opts, as_large_list_array(column)) } _ => unreachable!(), }, @@ -1384,9 +1446,9 @@ mod tests { .unwrap(); let rows = converter.convert_columns(&cols).unwrap(); - assert_eq!(rows.offsets.as_ref(), &[0, 8, 16, 24, 32, 40, 48, 56]); + assert_eq!(rows.offsets, &[0, 8, 16, 24, 32, 40, 48, 56]); assert_eq!( - rows.buffer.as_ref(), + rows.buffer, &[ 1, 128, 1, // 1, 191, 166, 102, 102, // diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index e4ff878dd135..73c4b6fbfda5 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -57,23 +57,23 @@ fn encoded_len(rows: &Rows, range: Option>) -> usize { /// /// `rows` should contain the encoded child elements pub fn encode( - out: &mut Rows, + data: &mut [u8], + offsets: &mut [usize], rows: &Rows, opts: SortOptions, array: &GenericListArray, ) { let mut temporary = vec![]; - let offsets = array.value_offsets().windows(2); - out.offsets + offsets .iter_mut() .skip(1) - .zip(offsets) + .zip(array.value_offsets().windows(2)) .enumerate() .for_each(|(idx, (offset, offsets))| { let start = 
offsets[0].as_usize(); let end = offsets[1].as_usize(); let range = array.is_valid(idx).then_some(start..end); - let out = &mut out.buffer[*offset..]; + let out = &mut data[*offset..]; *offset += encode_one(out, &mut temporary, rows, range, opts) }); } diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index c927f76963ab..e9f6160bf43c 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::{null_sentinel, Rows}; +use crate::null_sentinel; use arrow_array::builder::BufferBuilder; use arrow_array::*; use arrow_buffer::bit_util::ceil; @@ -62,12 +62,13 @@ pub fn padded_length(a: Option) -> usize { /// - `0xFF_u8` if this is not the last block for this string /// - otherwise the length of the block as a `u8` pub fn encode<'a, I: Iterator>>( - out: &mut Rows, + data: &mut [u8], + offsets: &mut [usize], i: I, opts: SortOptions, ) { - for (offset, maybe_val) in out.offsets.iter_mut().skip(1).zip(i) { - *offset += encode_one(&mut out.buffer[*offset..], maybe_val, opts); + for (offset, maybe_val) in offsets.iter_mut().skip(1).zip(i) { + *offset += encode_one(&mut data[*offset..], maybe_val, opts); } } From 414235e7630d05cccf0b9f5032ebfc0858b8ae5b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 30 Jun 2023 16:22:33 +0100 Subject: [PATCH 1047/1411] Prepare arrow 43 (#4471) --- CHANGELOG-old.md | 79 ++++++++++++++++++++++++++ CHANGELOG.md | 96 +++++++++++--------------------- Cargo.toml | 32 +++++------ dev/release/update_change_log.sh | 4 +- 4 files changed, 129 insertions(+), 82 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 97d96882a3f5..295728a67d3a 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,85 @@ # Historical Changelog +## [42.0.0](https://github.com/apache/arrow-rs/tree/42.0.0) (2023-06-16) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/41.0.0...42.0.0) + +**Breaking changes:** + +- Remove 64-bit to 32-bit Cast from IPC Reader [\#4412](https://github.com/apache/arrow-rs/pull/4412) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ming08108](https://github.com/ming08108)) +- Truncate Min/Max values in the Column Index [\#4389](https://github.com/apache/arrow-rs/pull/4389) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([AdamGS](https://github.com/AdamGS)) +- feat\(flight\): harmonize server metadata APIs [\#4384](https://github.com/apache/arrow-rs/pull/4384) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) +- Move record delimiting into ColumnReader \(\#4365\) [\#4376](https://github.com/apache/arrow-rs/pull/4376) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Changed array\_to\_json\_array to take &dyn Array [\#4370](https://github.com/apache/arrow-rs/pull/4370) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dadepo](https://github.com/dadepo)) +- Make PrimitiveArray::with\_timezone consuming [\#4366](https://github.com/apache/arrow-rs/pull/4366) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Add doc example of constructing a MapArray 
[\#4385](https://github.com/apache/arrow-rs/issues/4385) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `millisecond` and `microsecond` functions [\#4374](https://github.com/apache/arrow-rs/issues/4374) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Changed array\_to\_json\_array to take &dyn Array [\#4369](https://github.com/apache/arrow-rs/issues/4369) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- compute::ord kernel for getting min and max of two scalar/array values [\#4347](https://github.com/apache/arrow-rs/issues/4347) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Release 41.0.0 of arrow/arrow-flight/parquet/parquet-derive [\#4346](https://github.com/apache/arrow-rs/issues/4346) +- Refactor CAST tests to use new cast array syntax [\#4336](https://github.com/apache/arrow-rs/issues/4336) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- pass bytes directly to parquet's KeyValue [\#4317](https://github.com/apache/arrow-rs/issues/4317) +- PyArrow conversions could return TypeError if provided incorrect Python type [\#4312](https://github.com/apache/arrow-rs/issues/4312) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Have array\_to\_json\_array support Map [\#4297](https://github.com/apache/arrow-rs/issues/4297) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- FlightSQL: Add helpers to create `CommandGetXdbcTypeInfo` responses \(`XdbcInfoValue` and builders\) [\#4257](https://github.com/apache/arrow-rs/issues/4257) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Have array\_to\_json\_array support FixedSizeList [\#4248](https://github.com/apache/arrow-rs/issues/4248) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Truncate ColumnIndex ByteArray Statistics [\#4126](https://github.com/apache/arrow-rs/issues/4126) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Arrow compute kernel regards selection vector [\#4095](https://github.com/apache/arrow-rs/issues/4095) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Wrongly calculated data compressed length in IPC writer [\#4410](https://github.com/apache/arrow-rs/issues/4410) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Take Kernel Handles Nullable Indices Incorrectly [\#4404](https://github.com/apache/arrow-rs/issues/4404) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- StructBuilder::new Doesn't Validate Builder DataTypes [\#4397](https://github.com/apache/arrow-rs/issues/4397) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet error: Not all children array length are the same! when using RowSelection to read a parquet file [\#4396](https://github.com/apache/arrow-rs/issues/4396) +- RecordReader::skip\_records Is Incorrect for Repeated Columns [\#4368](https://github.com/apache/arrow-rs/issues/4368) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- List-of-String Array panics in the presence of row filters [\#4365](https://github.com/apache/arrow-rs/issues/4365) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Fail to read block compressed gzip files with parquet-fromcsv [\#4173](https://github.com/apache/arrow-rs/issues/4173) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Closed issues:** + +- Have a parquet file not able to be deduped via arrow-rs, complains about Decimal precision? 
[\#4356](https://github.com/apache/arrow-rs/issues/4356) +- Question: Could we move `dict_id, dict_is_ordered` into DataType? [\#4325](https://github.com/apache/arrow-rs/issues/4325) + +**Merged pull requests:** + +- Fix reading gzip file with multiple gzip headers in parquet-fromcsv. [\#4419](https://github.com/apache/arrow-rs/pull/4419) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([ghuls](https://github.com/ghuls)) +- Cleanup nullif kernel [\#4416](https://github.com/apache/arrow-rs/pull/4416) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix bug in IPC logic that determines if the buffer should be compressed or not [\#4411](https://github.com/apache/arrow-rs/pull/4411) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([lwpyr](https://github.com/lwpyr)) +- Faster unpacking of Int32Type dictionary [\#4406](https://github.com/apache/arrow-rs/pull/4406) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve `take` kernel performance on primitive arrays, fix bad null index handling \(\#4404\) [\#4405](https://github.com/apache/arrow-rs/pull/4405) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- More take benchmarks [\#4403](https://github.com/apache/arrow-rs/pull/4403) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add `BooleanBuffer::new_unset` and `BooleanBuffer::new_set` and `BooleanArray::new_null` constructors [\#4402](https://github.com/apache/arrow-rs/pull/4402) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add PrimitiveBuilder type constructors [\#4401](https://github.com/apache/arrow-rs/pull/4401) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- StructBuilder Validate Child Data \(\#4397\) [\#4400](https://github.com/apache/arrow-rs/pull/4400) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Faster UTF-8 truncation [\#4399](https://github.com/apache/arrow-rs/pull/4399) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Minor: Derive `Hash` impls for `CastOptions` and `FormatOptions` [\#4395](https://github.com/apache/arrow-rs/pull/4395) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Fix typo in README [\#4394](https://github.com/apache/arrow-rs/pull/4394) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([okue](https://github.com/okue)) +- Improve parquet `WriterProperites` and `ReaderProperties` docs [\#4392](https://github.com/apache/arrow-rs/pull/4392) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Cleanup downcast macros [\#4391](https://github.com/apache/arrow-rs/pull/4391) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.59 to =1.0.60 [\#4388](https://github.com/apache/arrow-rs/pull/4388) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Consolidate ByteArray::from\_iterator 
[\#4386](https://github.com/apache/arrow-rs/pull/4386) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add MapArray constructors and doc example [\#4382](https://github.com/apache/arrow-rs/pull/4382) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Documentation Improvements [\#4381](https://github.com/apache/arrow-rs/pull/4381) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add NullBuffer and BooleanBuffer From conversions [\#4380](https://github.com/apache/arrow-rs/pull/4380) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add more examples of constructing Boolean, Primitive, String, and Decimal Arrays, and From impl for i256 [\#4379](https://github.com/apache/arrow-rs/pull/4379) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add ListArrayReader benchmarks [\#4378](https://github.com/apache/arrow-rs/pull/4378) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update comfy-table requirement from 6.0 to 7.0 [\#4377](https://github.com/apache/arrow-rs/pull/4377) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat: Add`microsecond` and `millisecond` kernels [\#4375](https://github.com/apache/arrow-rs/pull/4375) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Update hashbrown requirement from 0.13 to 0.14 [\#4373](https://github.com/apache/arrow-rs/pull/4373) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- minor: use as\_boolean to resolve TODO [\#4367](https://github.com/apache/arrow-rs/pull/4367) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Have array\_to\_json\_array support MapArray [\#4364](https://github.com/apache/arrow-rs/pull/4364) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dadepo](https://github.com/dadepo)) +- deprecate: as\_decimal\_array [\#4363](https://github.com/apache/arrow-rs/pull/4363) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Add support for FixedSizeList in array\_to\_json\_array [\#4361](https://github.com/apache/arrow-rs/pull/4361) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dadepo](https://github.com/dadepo)) +- refact: use as\_primitive in cast.rs test [\#4360](https://github.com/apache/arrow-rs/pull/4360) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- feat\(flight\): add xdbc type info helpers [\#4359](https://github.com/apache/arrow-rs/pull/4359) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) +- Minor: float16 to json [\#4358](https://github.com/apache/arrow-rs/pull/4358) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Raise TypeError on PyArrow import [\#4316](https://github.com/apache/arrow-rs/pull/4316) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([wjones127](https://github.com/wjones127)) +- Arrow Cast: Fixed Point Arithmetic for Interval Parsing [\#4291](https://github.com/apache/arrow-rs/pull/4291) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mr-brobot](https://github.com/mr-brobot)) ## [41.0.0](https://github.com/apache/arrow-rs/tree/41.0.0) (2023-06-02) [Full Changelog](https://github.com/apache/arrow-rs/compare/40.0.0...41.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 22ae78b516e8..6ed2f1420684 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,85 +19,53 @@ # Changelog -## [42.0.0](https://github.com/apache/arrow-rs/tree/42.0.0) (2023-06-16) +## [43.0.0](https://github.com/apache/arrow-rs/tree/43.0.0) (2023-06-30) -[Full Changelog](https://github.com/apache/arrow-rs/compare/41.0.0...42.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/42.0.0...43.0.0) **Breaking changes:** -- Remove 64-bit to 32-bit Cast from IPC Reader [\#4412](https://github.com/apache/arrow-rs/pull/4412) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ming08108](https://github.com/ming08108)) -- Truncate Min/Max values in the Column Index [\#4389](https://github.com/apache/arrow-rs/pull/4389) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([AdamGS](https://github.com/AdamGS)) -- feat\(flight\): harmonize server metadata APIs [\#4384](https://github.com/apache/arrow-rs/pull/4384) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) -- Move record delimiting into ColumnReader \(\#4365\) [\#4376](https://github.com/apache/arrow-rs/pull/4376) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Changed array\_to\_json\_array to take &dyn Array [\#4370](https://github.com/apache/arrow-rs/pull/4370) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dadepo](https://github.com/dadepo)) -- Make PrimitiveArray::with\_timezone consuming [\#4366](https://github.com/apache/arrow-rs/pull/4366) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Simplify ffi import/export [\#4447](https://github.com/apache/arrow-rs/pull/4447) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Virgiel](https://github.com/Virgiel)) +- Return Result from Parquet Row APIs [\#4428](https://github.com/apache/arrow-rs/pull/4428) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) +- Remove Binary Dictionary Arithmetic Support [\#4407](https://github.com/apache/arrow-rs/pull/4407) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Add doc example of constructing a MapArray [\#4385](https://github.com/apache/arrow-rs/issues/4385) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `millisecond` and `microsecond` functions [\#4374](https://github.com/apache/arrow-rs/issues/4374) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Changed array\_to\_json\_array to take &dyn Array [\#4369](https://github.com/apache/arrow-rs/issues/4369) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- compute::ord kernel for getting min and max of two scalar/array values [\#4347](https://github.com/apache/arrow-rs/issues/4347) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Release 41.0.0 of arrow/arrow-flight/parquet/parquet-derive [\#4346](https://github.com/apache/arrow-rs/issues/4346) -- Refactor CAST tests to use new cast array syntax [\#4336](https://github.com/apache/arrow-rs/issues/4336) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- pass bytes directly to parquet's KeyValue [\#4317](https://github.com/apache/arrow-rs/issues/4317) -- PyArrow conversions could return TypeError if provided incorrect Python type [\#4312](https://github.com/apache/arrow-rs/issues/4312) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Have array\_to\_json\_array support Map [\#4297](https://github.com/apache/arrow-rs/issues/4297) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- FlightSQL: Add helpers to create `CommandGetXdbcTypeInfo` responses \(`XdbcInfoValue` and builders\) [\#4257](https://github.com/apache/arrow-rs/issues/4257) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Have array\_to\_json\_array support FixedSizeList [\#4248](https://github.com/apache/arrow-rs/issues/4248) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Truncate ColumnIndex ByteArray Statistics [\#4126](https://github.com/apache/arrow-rs/issues/4126) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Arrow compute kernel regards selection vector [\#4095](https://github.com/apache/arrow-rs/issues/4095) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Request: a way to copy a `Row` to `Rows` [\#4466](https://github.com/apache/arrow-rs/issues/4466) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Reuse schema when importing from FFI [\#4444](https://github.com/apache/arrow-rs/issues/4444) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[FlightSQL\] Allow implementations of `FlightSqlService` to handle custom actions and commands [\#4439](https://github.com/apache/arrow-rs/issues/4439) +- Support `NullBuilder` [\#4429](https://github.com/apache/arrow-rs/issues/4429) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- Wrongly calculated data compressed length in IPC writer [\#4410](https://github.com/apache/arrow-rs/issues/4410) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Take Kernel Handles Nullable Indices Incorrectly [\#4404](https://github.com/apache/arrow-rs/issues/4404) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- StructBuilder::new Doesn't Validate Builder DataTypes [\#4397](https://github.com/apache/arrow-rs/issues/4397) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet error: Not all children array length are the same! 
when using RowSelection to read a parquet file [\#4396](https://github.com/apache/arrow-rs/issues/4396) -- RecordReader::skip\_records Is Incorrect for Repeated Columns [\#4368](https://github.com/apache/arrow-rs/issues/4368) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- List-of-String Array panics in the presence of row filters [\#4365](https://github.com/apache/arrow-rs/issues/4365) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Fail to read block compressed gzip files with parquet-fromcsv [\#4173](https://github.com/apache/arrow-rs/issues/4173) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Regression in in parquet `42.0.0` : Bad parquet column indexes for All Null Columns, resulting in `Parquet error: StructArrayReader out of sync` on read [\#4459](https://github.com/apache/arrow-rs/issues/4459) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Regression in 42.0.0: Parsing fractional intervals without leading 0 is not supported [\#4424](https://github.com/apache/arrow-rs/issues/4424) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -**Closed issues:** +**Documentation updates:** -- Have a parquet file not able to be deduped via arrow-rs, complains about Decimal precision? [\#4356](https://github.com/apache/arrow-rs/issues/4356) -- Question: Could we move `dict_id, dict_is_ordered` into DataType? [\#4325](https://github.com/apache/arrow-rs/issues/4325) +- doc: deploy crate docs to GitHub pages [\#4436](https://github.com/apache/arrow-rs/pull/4436) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xxchan](https://github.com/xxchan)) **Merged pull requests:** -- Fix reading gzip file with multiple gzip headers in parquet-fromcsv. 
[\#4419](https://github.com/apache/arrow-rs/pull/4419) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([ghuls](https://github.com/ghuls)) -- Cleanup nullif kernel [\#4416](https://github.com/apache/arrow-rs/pull/4416) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix bug in IPC logic that determines if the buffer should be compressed or not [\#4411](https://github.com/apache/arrow-rs/pull/4411) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([lwpyr](https://github.com/lwpyr)) -- Faster unpacking of Int32Type dictionary [\#4406](https://github.com/apache/arrow-rs/pull/4406) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Improve `take` kernel performance on primitive arrays, fix bad null index handling \(\#4404\) [\#4405](https://github.com/apache/arrow-rs/pull/4405) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- More take benchmarks [\#4403](https://github.com/apache/arrow-rs/pull/4403) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add `BooleanBuffer::new_unset` and `BooleanBuffer::new_set` and `BooleanArray::new_null` constructors [\#4402](https://github.com/apache/arrow-rs/pull/4402) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add PrimitiveBuilder type constructors [\#4401](https://github.com/apache/arrow-rs/pull/4401) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- StructBuilder Validate Child Data \(\#4397\) [\#4400](https://github.com/apache/arrow-rs/pull/4400) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Faster UTF-8 truncation [\#4399](https://github.com/apache/arrow-rs/pull/4399) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Minor: Derive `Hash` impls for `CastOptions` and `FormatOptions` [\#4395](https://github.com/apache/arrow-rs/pull/4395) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Fix typo in README [\#4394](https://github.com/apache/arrow-rs/pull/4394) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([okue](https://github.com/okue)) -- Improve parquet `WriterProperites` and `ReaderProperties` docs [\#4392](https://github.com/apache/arrow-rs/pull/4392) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) -- Cleanup downcast macros [\#4391](https://github.com/apache/arrow-rs/pull/4391) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update proc-macro2 requirement from =1.0.59 to =1.0.60 [\#4388](https://github.com/apache/arrow-rs/pull/4388) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Consolidate ByteArray::from\_iterator [\#4386](https://github.com/apache/arrow-rs/pull/4386) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add MapArray constructors and doc example [\#4382](https://github.com/apache/arrow-rs/pull/4382) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Documentation Improvements [\#4381](https://github.com/apache/arrow-rs/pull/4381) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add NullBuffer and BooleanBuffer From conversions [\#4380](https://github.com/apache/arrow-rs/pull/4380) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add more examples of constructing Boolean, Primitive, String, and Decimal Arrays, and From impl for i256 [\#4379](https://github.com/apache/arrow-rs/pull/4379) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Add ListArrayReader benchmarks [\#4378](https://github.com/apache/arrow-rs/pull/4378) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Update comfy-table requirement from 6.0 to 7.0 [\#4377](https://github.com/apache/arrow-rs/pull/4377) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- feat: Add`microsecond` and `millisecond` kernels [\#4375](https://github.com/apache/arrow-rs/pull/4375) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Update hashbrown requirement from 0.13 to 0.14 [\#4373](https://github.com/apache/arrow-rs/pull/4373) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- minor: use as\_boolean to resolve TODO [\#4367](https://github.com/apache/arrow-rs/pull/4367) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- Have array\_to\_json\_array support MapArray [\#4364](https://github.com/apache/arrow-rs/pull/4364) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dadepo](https://github.com/dadepo)) -- deprecate: as\_decimal\_array [\#4363](https://github.com/apache/arrow-rs/pull/4363) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Add support for FixedSizeList in array\_to\_json\_array [\#4361](https://github.com/apache/arrow-rs/pull/4361) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dadepo](https://github.com/dadepo)) -- refact: use as\_primitive in cast.rs test [\#4360](https://github.com/apache/arrow-rs/pull/4360) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) -- feat\(flight\): add xdbc type info helpers [\#4359](https://github.com/apache/arrow-rs/pull/4359) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([roeap](https://github.com/roeap)) -- Minor: float16 to json [\#4358](https://github.com/apache/arrow-rs/pull/4358) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Raise TypeError on PyArrow import [\#4316](https://github.com/apache/arrow-rs/pull/4316) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) -- Arrow Cast: Fixed Point Arithmetic for Interval Parsing [\#4291](https://github.com/apache/arrow-rs/pull/4291) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mr-brobot](https://github.com/mr-brobot)) +- Append Row to Rows 
\(\#4466\) [\#4470](https://github.com/apache/arrow-rs/pull/4470) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat\(flight-sql\): Allow implementations of FlightSqlService to handle custom actions and commands [\#4463](https://github.com/apache/arrow-rs/pull/4463) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([amartins23](https://github.com/amartins23)) +- Docs: Add clearer API doc links [\#4461](https://github.com/apache/arrow-rs/pull/4461) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Fix empty offset index for all null columns \(\#4459\) [\#4460](https://github.com/apache/arrow-rs/pull/4460) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Bump peaceiris/actions-gh-pages from 3.9.2 to 3.9.3 [\#4455](https://github.com/apache/arrow-rs/pull/4455) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Convince the compiler to auto-vectorize the range check in parquet DictionaryBuffer [\#4453](https://github.com/apache/arrow-rs/pull/4453) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jhorstmann](https://github.com/jhorstmann)) +- fix docs deployment [\#4452](https://github.com/apache/arrow-rs/pull/4452) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xxchan](https://github.com/xxchan)) +- Update indexmap requirement from 1.9 to 2.0 [\#4451](https://github.com/apache/arrow-rs/pull/4451) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update proc-macro2 requirement from =1.0.60 to =1.0.63 [\#4450](https://github.com/apache/arrow-rs/pull/4450) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bump actions/deploy-pages from 1 to 2 [\#4449](https://github.com/apache/arrow-rs/pull/4449) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Revise error message in From\ for ScalarBuffer [\#4446](https://github.com/apache/arrow-rs/pull/4446) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- minor: remove useless mut [\#4443](https://github.com/apache/arrow-rs/pull/4443) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- unify substring for binary&utf8 [\#4442](https://github.com/apache/arrow-rs/pull/4442) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Casting fixedsizelist to list/largelist [\#4433](https://github.com/apache/arrow-rs/pull/4433) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211)) +- feat: support `NullBuilder` [\#4430](https://github.com/apache/arrow-rs/pull/4430) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Remove Float64 -\> Float32 cast in IPC Reader [\#4427](https://github.com/apache/arrow-rs/pull/4427) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ming08108](https://github.com/ming08108)) +- Parse intervals like `.5` the same as `0.5` [\#4425](https://github.com/apache/arrow-rs/pull/4425) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- feat: add strict mode to json reader [\#4421](https://github.com/apache/arrow-rs/pull/4421) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([blinkseb](https://github.com/blinkseb)) +- Add DictionaryArray::occupancy [\#4415](https://github.com/apache/arrow-rs/pull/4415) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) diff --git a/Cargo.toml b/Cargo.toml index 0b67ed91b1f1..173bafc6e08a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ exclude = [ ] [workspace.package] -version = "42.0.0" +version = "43.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -76,18 +76,18 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "42.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "42.0.0", path = "./arrow-arith" } -arrow-array = { version = "42.0.0", path = "./arrow-array" } -arrow-buffer = { version = "42.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "42.0.0", path = "./arrow-cast" } -arrow-csv = { version = "42.0.0", path = "./arrow-csv" } -arrow-data = { version = "42.0.0", path = "./arrow-data" } -arrow-ipc = { version = "42.0.0", path = "./arrow-ipc" } -arrow-json = { version = "42.0.0", path = "./arrow-json" } -arrow-ord = { version = "42.0.0", path = "./arrow-ord" } -arrow-row = { version = "42.0.0", path = "./arrow-row" } -arrow-schema = { version = "42.0.0", path = "./arrow-schema" } -arrow-select = { version = "42.0.0", path = "./arrow-select" } -arrow-string = { version = "42.0.0", path = "./arrow-string" } -parquet = { version = "42.0.0", path = "./parquet", default-features = false } +arrow = { version = "43.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "43.0.0", path = "./arrow-arith" } +arrow-array = { version = "43.0.0", path = "./arrow-array" } +arrow-buffer = { version = "43.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "43.0.0", path = "./arrow-cast" } +arrow-csv = { version = "43.0.0", path = "./arrow-csv" } +arrow-data = { version = "43.0.0", path = "./arrow-data" } +arrow-ipc = { version = "43.0.0", path = "./arrow-ipc" } +arrow-json = { version = "43.0.0", path = "./arrow-json" } +arrow-ord = { version = "43.0.0", path = "./arrow-ord" } +arrow-row = { version = "43.0.0", path = "./arrow-row" } +arrow-schema = { version = "43.0.0", path = "./arrow-schema" } +arrow-select = { version = "43.0.0", path = "./arrow-select" } +arrow-string = { version = "43.0.0", path = "./arrow-string" } +parquet = { version = "43.0.0", path = "./parquet", default-features = false } diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 0833c66c428d..6b4b0a56c4bc 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="41.0.0" -FUTURE_RELEASE="42.0.0" +SINCE_TAG="42.0.0" +FUTURE_RELEASE="43.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 5ea197dd9de7cd2c5ad9a37e36a24ddf3ac5688f Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 3 Jul 2023 07:00:11 -0700 Subject: 
[PATCH 1048/1411] feat: support RecordBatchReader on boxed trait objects (#4475) * Impl RBR for Box * Require send to create a FFI stream --- arrow-array/src/record_batch.rs | 24 ++++++++++++++++++++++++ arrow/src/ffi_stream.rs | 10 +++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index d2e36780a901..3134c9ecbd14 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -43,6 +43,12 @@ pub trait RecordBatchReader: Iterator> { } } +impl RecordBatchReader for Box { + fn schema(&self) -> SchemaRef { + self.as_ref().schema() + } +} + /// Trait for types that can write `RecordBatch`'s. pub trait RecordBatchWriter { /// Write a single batch to the writer. @@ -1115,4 +1121,22 @@ mod tests { // Cannot remove metadata batch.with_schema(nullable_schema).unwrap_err(); } + + #[test] + fn test_boxed_reader() { + // Make sure we can pass a boxed reader to a function generic over + // RecordBatchReader. + let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + let schema = Arc::new(schema); + + let reader = RecordBatchIterator::new(std::iter::empty(), schema); + let reader: Box = Box::new(reader); + + fn get_size(reader: impl RecordBatchReader) -> usize { + reader.size_hint().0 + } + + let size = get_size(reader); + assert_eq!(size, 0); + } } diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index 5fb1c107350a..83d4eead30d6 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -119,7 +119,7 @@ unsafe extern "C" fn release_stream(stream: *mut FFI_ArrowArrayStream) { } struct StreamPrivateData { - batch_reader: Box, + batch_reader: Box, last_error: String, } @@ -157,7 +157,7 @@ impl Drop for FFI_ArrowArrayStream { impl FFI_ArrowArrayStream { /// Creates a new [`FFI_ArrowArrayStream`]. - pub fn new(batch_reader: Box) -> Self { + pub fn new(batch_reader: Box) -> Self { let private_data = Box::new(StreamPrivateData { batch_reader, last_error: String::new(), @@ -371,7 +371,7 @@ impl RecordBatchReader for ArrowArrayStreamReader { /// Assumes that the pointer represents valid C Stream Interfaces, both in memory /// representation and lifetime via the `release` mechanism. 
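// A minimal usage sketch for the `Send` requirement introduced in this patch
// (assuming the usual `arrow_array` / `arrow_schema` imports; the empty iterator is
// purely illustrative): the reader handed to `FFI_ArrowArrayStream::new` must now be
// a `Box<dyn RecordBatchReader + Send>`.
//
//     let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
//     let reader: Box<dyn RecordBatchReader + Send> =
//         Box::new(RecordBatchIterator::new(std::iter::empty(), schema));
//     let stream = FFI_ArrowArrayStream::new(reader);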
pub unsafe fn export_reader_into_raw( - reader: Box, + reader: Box, out_stream: *mut FFI_ArrowArrayStream, ) { let stream = FFI_ArrowArrayStream::new(reader); @@ -388,13 +388,13 @@ mod tests { struct TestRecordBatchReader { schema: SchemaRef, - iter: Box>>, + iter: Box> + Send>, } impl TestRecordBatchReader { pub fn new( schema: SchemaRef, - iter: Box>>, + iter: Box> + Send>, ) -> Box { Box::new(TestRecordBatchReader { schema, iter }) } From 9ee36b216c3f7dcbaae520f451194acd4f55b98e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 4 Jul 2023 10:06:38 +0100 Subject: [PATCH 1049/1411] Add Scalar/Datum abstraction (#1047) (#4393) * Add Scalar/Datum abstraction (#1047) * Add dyn Array --- arrow-array/src/lib.rs | 3 + arrow-array/src/scalar.rs | 116 +++++++++++++++++++++++++++++++++++++ arrow-select/src/filter.rs | 10 ---- 3 files changed, 119 insertions(+), 10 deletions(-) create mode 100644 arrow-array/src/scalar.rs diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 46de381c3244..afb7ec5e6e44 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -192,6 +192,9 @@ pub use arithmetic::ArrowNativeTypeOp; mod numeric; pub use numeric::*; +mod scalar; +pub use scalar::*; + pub mod builder; pub mod cast; mod delta; diff --git a/arrow-array/src/scalar.rs b/arrow-array/src/scalar.rs new file mode 100644 index 000000000000..e54a999f9980 --- /dev/null +++ b/arrow-array/src/scalar.rs @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::Array; + +/// A possibly [`Scalar`] [`Array`] +/// +/// This allows optimised binary kernels where one or more arguments are constant +/// +/// ``` +/// # use arrow_array::*; +/// # use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer}; +/// # use arrow_schema::ArrowError; +/// # +/// fn eq_impl( +/// a: &PrimitiveArray, +/// a_scalar: bool, +/// b: &PrimitiveArray, +/// b_scalar: bool, +/// ) -> BooleanArray { +/// let (array, scalar) = match (a_scalar, b_scalar) { +/// (true, true) | (false, false) => { +/// let len = a.len().min(b.len()); +/// let nulls = NullBuffer::union(a.nulls(), b.nulls()); +/// let buffer = BooleanBuffer::collect_bool(len, |idx| a.value(idx) == b.value(idx)); +/// return BooleanArray::new(buffer, nulls); +/// } +/// (true, false) => (b, (a.null_count() == 0).then(|| a.value(0))), +/// (false, true) => (a, (b.null_count() == 0).then(|| b.value(0))), +/// }; +/// match scalar { +/// Some(v) => { +/// let len = array.len(); +/// let nulls = array.nulls().cloned(); +/// let buffer = BooleanBuffer::collect_bool(len, |idx| array.value(idx) == v); +/// BooleanArray::new(buffer, nulls) +/// } +/// None => BooleanArray::new_null(array.len()), +/// } +/// } +/// +/// pub fn eq(l: &dyn Datum, r: &dyn Datum) -> Result { +/// let (l_array, l_scalar) = l.get(); +/// let (r_array, r_scalar) = r.get(); +/// downcast_primitive_array!( +/// (l_array, r_array) => Ok(eq_impl(l_array, l_scalar, r_array, r_scalar)), +/// (a, b) => Err(ArrowError::NotYetImplemented(format!("{a} == {b}"))), +/// ) +/// } +/// +/// // Comparison of two arrays +/// let a = Int32Array::from(vec![1, 2, 3, 4, 5]); +/// let b = Int32Array::from(vec![1, 2, 4, 7, 3]); +/// let r = eq(&a, &b).unwrap(); +/// let values: Vec<_> = r.values().iter().collect(); +/// assert_eq!(values, &[true, true, false, false, false]); +/// +/// // Comparison of an array and a scalar +/// let a = Int32Array::from(vec![1, 2, 3, 4, 5]); +/// let b = Int32Array::from(vec![1]); +/// let r = eq(&a, &Scalar::new(&b)).unwrap(); +/// let values: Vec<_> = r.values().iter().collect(); +/// assert_eq!(values, &[true, false, false, false, false]); +pub trait Datum { + /// Returns the value for this [`Datum`] and a boolean indicating if the value is scalar + fn get(&self) -> (&dyn Array, bool); +} + +impl Datum for T { + fn get(&self) -> (&dyn Array, bool) { + (self, false) + } +} + +impl Datum for dyn Array { + fn get(&self) -> (&dyn Array, bool) { + (self, false) + } +} + +/// A wrapper around a single value [`Array`] indicating kernels should treat it as a scalar value +/// +/// See [`Datum`] for more information +pub struct Scalar<'a>(&'a dyn Array); + +impl<'a> Scalar<'a> { + /// Create a new [`Scalar`] from an [`Array`] + /// + /// # Panics + /// + /// Panics if `array.len() != 1` + pub fn new(array: &'a dyn Array) -> Self { + assert_eq!(array.len(), 1); + Self(array) + } +} + +impl<'a> Datum for Scalar<'a> { + fn get(&self) -> (&dyn Array, bool) { + (self.0, true) + } +} diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index c89491944a21..94afd2df376b 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -321,16 +321,6 @@ fn filter_array( // actually filter _ => downcast_primitive_array! 
{ values => Ok(Arc::new(filter_primitive(values, predicate))), - DataType::Decimal128(p, s) => { - let values = values.as_any().downcast_ref::().unwrap(); - let filtered = filter_primitive(values, predicate); - Ok(Arc::new(filtered.with_precision_and_scale(*p, *s).unwrap())) - } - DataType::Decimal256(p, s) => { - let values = values.as_any().downcast_ref::().unwrap(); - let filtered = filter_primitive(values, predicate); - Ok(Arc::new(filtered.with_precision_and_scale(*p, *s).unwrap())) - } DataType::Boolean => { let values = values.as_any().downcast_ref::().unwrap(); Ok(Arc::new(filter_boolean(values, predicate))) From aac3aa99398c4f4fe59c60d1839d3a8ab60d00f3 Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Tue, 4 Jul 2023 21:34:52 +0530 Subject: [PATCH 1050/1411] Improve in-place primitive sorts by 13-67% (#4473) * Adding sort_primitives benchmark * Adding sort_primitives improvements * Fix lints * Remove all unsafe code and handle offset cases * Incorporate review comments * Remove unneeded returns --- arrow-ord/src/sort.rs | 72 +++++++++++++++++++++++-- arrow/Cargo.toml | 5 ++ arrow/benches/sort_kernel_primitives.rs | 59 ++++++++++++++++++++ 3 files changed, 132 insertions(+), 4 deletions(-) create mode 100644 arrow/benches/sort_kernel_primitives.rs diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 1d96532598ca..147af1e301d6 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -22,6 +22,7 @@ use arrow_array::builder::BufferBuilder; use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; +use arrow_buffer::BooleanBufferBuilder; use arrow_buffer::{ArrowNativeType, MutableBuffer, NullBuffer}; use arrow_data::ArrayData; use arrow_data::ArrayDataBuilder; @@ -57,11 +58,74 @@ pub fn sort( values: &dyn Array, options: Option, ) -> Result { - if let DataType::RunEndEncoded(_, _) = values.data_type() { - return sort_run(values, options, None); + downcast_primitive_array!( + values => sort_native_type(values, options), + DataType::RunEndEncoded(_, _) => sort_run(values, options, None), + _ => { + let indices = sort_to_indices(values, options, None)?; + take(values, &indices, None) + } + ) +} + +fn sort_native_type( + primitive_values: &PrimitiveArray, + options: Option, +) -> Result +where + T: ArrowPrimitiveType, +{ + let sort_options = options.unwrap_or_default(); + + let mut mutable_buffer = vec![T::default_value(); primitive_values.len()]; + let mutable_slice = &mut mutable_buffer; + + let input_values = primitive_values.values().as_ref(); + + let nulls_count = primitive_values.null_count(); + let valid_count = primitive_values.len() - nulls_count; + + let null_bit_buffer = match nulls_count > 0 { + true => { + let mut validity_buffer = BooleanBufferBuilder::new(primitive_values.len()); + if sort_options.nulls_first { + validity_buffer.append_n(nulls_count, false); + validity_buffer.append_n(valid_count, true); + } else { + validity_buffer.append_n(valid_count, true); + validity_buffer.append_n(nulls_count, false); + } + Some(validity_buffer.finish().into()) + } + false => None, + }; + + if let Some(nulls) = primitive_values.nulls().filter(|n| n.null_count() > 0) { + let values_slice = match sort_options.nulls_first { + true => &mut mutable_slice[nulls_count..], + false => &mut mutable_slice[..valid_count], + }; + + for (write_index, index) in nulls.valid_indices().enumerate() { + values_slice[write_index] = primitive_values.value(index); + } + + values_slice.sort_unstable_by(|a, b| a.compare(*b)); + if sort_options.descending { + 
values_slice.reverse(); + } + } else { + mutable_slice.copy_from_slice(input_values); + mutable_slice.sort_unstable_by(|a, b| a.compare(*b)); + if sort_options.descending { + mutable_slice.reverse(); + } } - let indices = sort_to_indices(values, options, None)?; - take(values, &indices, None) + + Ok(Arc::new( + PrimitiveArray::::new(mutable_buffer.into(), null_bit_buffer) + .with_data_type(primitive_values.data_type().clone()), + )) } /// Sort the `ArrayRef` partially. diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index bc126a2f4c2d..ed4786fb3172 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -185,6 +185,11 @@ name = "sort_kernel" harness = false required-features = ["test_utils"] +[[bench]] +name = "sort_kernel_primitives" +harness = false +required-features = ["test_utils"] + [[bench]] name = "partition_kernels" harness = false diff --git a/arrow/benches/sort_kernel_primitives.rs b/arrow/benches/sort_kernel_primitives.rs new file mode 100644 index 000000000000..ca9183580bd2 --- /dev/null +++ b/arrow/benches/sort_kernel_primitives.rs @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[macro_use] +extern crate criterion; +use arrow_ord::sort::sort; +use criterion::Criterion; + +use std::sync::Arc; + +extern crate arrow; + +use arrow::util::bench_util::*; +use arrow::{array::*, datatypes::Int64Type}; + +fn create_i64_array(size: usize, with_nulls: bool) -> ArrayRef { + let null_density = if with_nulls { 0.5 } else { 0.0 }; + let array = create_primitive_array::(size, null_density); + Arc::new(array) +} + +fn bench_sort(array: &ArrayRef) { + criterion::black_box(sort(criterion::black_box(array), None).unwrap()); +} + +fn add_benchmark(c: &mut Criterion) { + let arr_a = create_i64_array(2u64.pow(10) as usize, false); + + c.bench_function("sort 2^10", |b| b.iter(|| bench_sort(&arr_a))); + + let arr_a = create_i64_array(2u64.pow(12) as usize, false); + + c.bench_function("sort 2^12", |b| b.iter(|| bench_sort(&arr_a))); + + let arr_a = create_i64_array(2u64.pow(10) as usize, true); + + c.bench_function("sort nulls 2^10", |b| b.iter(|| bench_sort(&arr_a))); + + let arr_a = create_i64_array(2u64.pow(12) as usize, true); + + c.bench_function("sort nulls 2^12", |b| b.iter(|| bench_sort(&arr_a))); +} + +criterion_group!(benches, add_benchmark); +criterion_main!(benches); From 08a57e9b3b1f954338ba3b27289935e00bfc6ba4 Mon Sep 17 00:00:00 2001 From: Ross Jones Date: Sat, 8 Jul 2023 15:29:02 +0100 Subject: [PATCH 1051/1411] Add default implementations to the FlightSqlService trait (#4485) The trait currently does not have many default implementations, but it does have a lot of methods. 
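With these defaults in place, an implementation only needs to override the endpoints it actually supports; everything else falls back to a Status::unimplemented error. A rough sketch follows (the service type and the chosen override are assumptions, and any trait items still lacking defaults in this version must also be provided):

    use arrow_flight::sql::server::FlightSqlService;
    use arrow_flight::sql::{CommandStatementQuery, SqlInfo};
    use arrow_flight::{FlightDescriptor, FlightInfo};
    use tonic::{Request, Response, Status};

    struct MyService; // hypothetical service type

    #[tonic::async_trait]
    impl FlightSqlService for MyService {
        type FlightService = MyService;

        // Only this endpoint is overridden; every other method uses the new default.
        async fn get_flight_info_statement(
            &self,
            query: CommandStatementQuery,
            _request: Request<FlightDescriptor>,
        ) -> Result<Response<FlightInfo>, Status> {
            let _sql = query.query;
            // ... build and return a FlightInfo for the query here ...
            Err(Status::unimplemented("sketch only"))
        }

        // register_sql_info has no default and must still be provided.
        async fn register_sql_info(&self, _id: i32, _info: &SqlInfo) {}
    }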
This PR adds default implementations for all methods returning Status::unimplemented to fix #4372 --- arrow-flight/src/sql/server.rs | 370 +++++++++++++++++++++++---------- 1 file changed, 259 insertions(+), 111 deletions(-) diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index f599fbca46a5..102d97105a2e 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -87,179 +87,279 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { /// Get a FlightInfo for executing a SQL query. async fn get_flight_info_statement( &self, - query: CommandStatementQuery, - request: Request, - ) -> Result, Status>; + _query: CommandStatementQuery, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_statement has no default implementation", + )) + } /// Get a FlightInfo for executing a substrait plan. async fn get_flight_info_substrait_plan( &self, - query: CommandStatementSubstraitPlan, - request: Request, - ) -> Result, Status>; + _query: CommandStatementSubstraitPlan, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_substrait_plan has no default implementation", + )) + } /// Get a FlightInfo for executing an already created prepared statement. async fn get_flight_info_prepared_statement( &self, - query: CommandPreparedStatementQuery, - request: Request, - ) -> Result, Status>; + _query: CommandPreparedStatementQuery, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_prepared_statement has no default implementation", + )) + } /// Get a FlightInfo for listing catalogs. async fn get_flight_info_catalogs( &self, - query: CommandGetCatalogs, - request: Request, - ) -> Result, Status>; + _query: CommandGetCatalogs, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_catalogs has no default implementation", + )) + } /// Get a FlightInfo for listing schemas. async fn get_flight_info_schemas( &self, - query: CommandGetDbSchemas, - request: Request, - ) -> Result, Status>; + _query: CommandGetDbSchemas, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_schemas has no default implementation", + )) + } /// Get a FlightInfo for listing tables. async fn get_flight_info_tables( &self, - query: CommandGetTables, - request: Request, - ) -> Result, Status>; + _query: CommandGetTables, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_tables has no default implementation", + )) + } /// Get a FlightInfo to extract information about the table types. async fn get_flight_info_table_types( &self, - query: CommandGetTableTypes, - request: Request, - ) -> Result, Status>; + _query: CommandGetTableTypes, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_table_types has no default implementation", + )) + } /// Get a FlightInfo for retrieving other information (See SqlInfo). async fn get_flight_info_sql_info( &self, - query: CommandGetSqlInfo, - request: Request, - ) -> Result, Status>; + _query: CommandGetSqlInfo, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_sql_info has no default implementation", + )) + } /// Get a FlightInfo to extract information about primary and foreign keys. 
async fn get_flight_info_primary_keys( &self, - query: CommandGetPrimaryKeys, - request: Request, - ) -> Result, Status>; + _query: CommandGetPrimaryKeys, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_primary_keys has no default implementation", + )) + } /// Get a FlightInfo to extract information about exported keys. async fn get_flight_info_exported_keys( &self, - query: CommandGetExportedKeys, - request: Request, - ) -> Result, Status>; + _query: CommandGetExportedKeys, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_exported_keys has no default implementation", + )) + } /// Get a FlightInfo to extract information about imported keys. async fn get_flight_info_imported_keys( &self, - query: CommandGetImportedKeys, - request: Request, - ) -> Result, Status>; + _query: CommandGetImportedKeys, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_imported_keys has no default implementation", + )) + } /// Get a FlightInfo to extract information about cross reference. async fn get_flight_info_cross_reference( &self, - query: CommandGetCrossReference, - request: Request, - ) -> Result, Status>; + _query: CommandGetCrossReference, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_cross_reference has no default implementation", + )) + } /// Get a FlightInfo to extract information about the supported XDBC types. async fn get_flight_info_xdbc_type_info( &self, - query: CommandGetXdbcTypeInfo, - request: Request, - ) -> Result, Status>; + _query: CommandGetXdbcTypeInfo, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "get_flight_info_xdbc_type_info has no default implementation", + )) + } // do_get /// Get a FlightDataStream containing the query results. async fn do_get_statement( &self, - ticket: TicketStatementQuery, - request: Request, - ) -> Result::DoGetStream>, Status>; + _ticket: TicketStatementQuery, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_statement has no default implementation", + )) + } /// Get a FlightDataStream containing the prepared statement query results. async fn do_get_prepared_statement( &self, - query: CommandPreparedStatementQuery, - request: Request, - ) -> Result::DoGetStream>, Status>; + _query: CommandPreparedStatementQuery, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_prepared_statement has no default implementation", + )) + } /// Get a FlightDataStream containing the list of catalogs. async fn do_get_catalogs( &self, - query: CommandGetCatalogs, - request: Request, - ) -> Result::DoGetStream>, Status>; + _query: CommandGetCatalogs, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_catalogs has no default implementation", + )) + } /// Get a FlightDataStream containing the list of schemas. async fn do_get_schemas( &self, - query: CommandGetDbSchemas, - request: Request, - ) -> Result::DoGetStream>, Status>; + _query: CommandGetDbSchemas, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_schemas has no default implementation", + )) + } /// Get a FlightDataStream containing the list of tables. 
async fn do_get_tables( &self, - query: CommandGetTables, - request: Request, - ) -> Result::DoGetStream>, Status>; + _query: CommandGetTables, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_tables has no default implementation", + )) + } /// Get a FlightDataStream containing the data related to the table types. async fn do_get_table_types( &self, - query: CommandGetTableTypes, - request: Request, - ) -> Result::DoGetStream>, Status>; + _query: CommandGetTableTypes, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_table_types has no default implementation", + )) + } /// Get a FlightDataStream containing the list of SqlInfo results. async fn do_get_sql_info( &self, - query: CommandGetSqlInfo, - request: Request, - ) -> Result::DoGetStream>, Status>; + _query: CommandGetSqlInfo, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_sql_info has no default implementation", + )) + } /// Get a FlightDataStream containing the data related to the primary and foreign keys. async fn do_get_primary_keys( &self, - query: CommandGetPrimaryKeys, - request: Request, - ) -> Result::DoGetStream>, Status>; + _query: CommandGetPrimaryKeys, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_primary_keys has no default implementation", + )) + } /// Get a FlightDataStream containing the data related to the exported keys. async fn do_get_exported_keys( &self, - query: CommandGetExportedKeys, - request: Request, - ) -> Result::DoGetStream>, Status>; + _query: CommandGetExportedKeys, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_exported_keys has no default implementation", + )) + } /// Get a FlightDataStream containing the data related to the imported keys. async fn do_get_imported_keys( &self, - query: CommandGetImportedKeys, - request: Request, - ) -> Result::DoGetStream>, Status>; + _query: CommandGetImportedKeys, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_imported_keys has no default implementation", + )) + } /// Get a FlightDataStream containing the data related to the cross reference. async fn do_get_cross_reference( &self, - query: CommandGetCrossReference, - request: Request, - ) -> Result::DoGetStream>, Status>; + _query: CommandGetCrossReference, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_cross_reference has no default implementation", + )) + } /// Get a FlightDataStream containing the data related to the supported XDBC types. async fn do_get_xdbc_type_info( &self, - query: CommandGetXdbcTypeInfo, - request: Request, - ) -> Result::DoGetStream>, Status>; + _query: CommandGetXdbcTypeInfo, + _request: Request, + ) -> Result::DoGetStream>, Status> { + Err(Status::unimplemented( + "do_get_xdbc_type_info has no default implementation", + )) + } // do_put @@ -278,30 +378,46 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { /// Execute an update SQL statement. async fn do_put_statement_update( &self, - ticket: CommandStatementUpdate, - request: Request>, - ) -> Result; + _ticket: CommandStatementUpdate, + _request: Request>, + ) -> Result { + Err(Status::unimplemented( + "do_put_statement_update has no default implementation", + )) + } /// Bind parameters to given prepared statement. 
async fn do_put_prepared_statement_query( &self, - query: CommandPreparedStatementQuery, - request: Request>, - ) -> Result::DoPutStream>, Status>; + _query: CommandPreparedStatementQuery, + _request: Request>, + ) -> Result::DoPutStream>, Status> { + Err(Status::unimplemented( + "do_put_prepared_statement_query has no default implementation", + )) + } /// Execute an update SQL prepared statement. async fn do_put_prepared_statement_update( &self, - query: CommandPreparedStatementUpdate, - request: Request>, - ) -> Result; + _query: CommandPreparedStatementUpdate, + _request: Request>, + ) -> Result { + Err(Status::unimplemented( + "do_put_prepared_statement_update has no default implementation", + )) + } /// Execute a substrait plan async fn do_put_substrait_plan( &self, - query: CommandStatementSubstraitPlan, - request: Request>, - ) -> Result; + _query: CommandStatementSubstraitPlan, + _request: Request>, + ) -> Result { + Err(Status::unimplemented( + "do_put_substrait_plan has no default implementation", + )) + } // do_action @@ -324,58 +440,90 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { /// Create a prepared statement from given SQL statement. async fn do_action_create_prepared_statement( &self, - query: ActionCreatePreparedStatementRequest, - request: Request, - ) -> Result; + _query: ActionCreatePreparedStatementRequest, + _request: Request, + ) -> Result { + Err(Status::unimplemented( + "do_action_create_prepared_statement has no default implementation", + )) + } /// Close a prepared statement. async fn do_action_close_prepared_statement( &self, - query: ActionClosePreparedStatementRequest, - request: Request, - ) -> Result<(), Status>; + _query: ActionClosePreparedStatementRequest, + _request: Request, + ) -> Result<(), Status> { + Err(Status::unimplemented( + "do_action_close_prepared_statement has no default implementation", + )) + } /// Create a prepared substrait plan. 
async fn do_action_create_prepared_substrait_plan( &self, - query: ActionCreatePreparedSubstraitPlanRequest, - request: Request, - ) -> Result; + _query: ActionCreatePreparedSubstraitPlanRequest, + _request: Request, + ) -> Result { + Err(Status::unimplemented( + "do_action_create_prepared_substrait_plan has no default implementation", + )) + } /// Begin a transaction async fn do_action_begin_transaction( &self, - query: ActionBeginTransactionRequest, - request: Request, - ) -> Result; + _query: ActionBeginTransactionRequest, + _request: Request, + ) -> Result { + Err(Status::unimplemented( + "do_action_begin_transaction has no default implementation", + )) + } /// End a transaction async fn do_action_end_transaction( &self, - query: ActionEndTransactionRequest, - request: Request, - ) -> Result<(), Status>; + _query: ActionEndTransactionRequest, + _request: Request, + ) -> Result<(), Status> { + Err(Status::unimplemented( + "do_action_end_transaction has no default implementation", + )) + } /// Begin a savepoint async fn do_action_begin_savepoint( &self, - query: ActionBeginSavepointRequest, - request: Request, - ) -> Result; + _query: ActionBeginSavepointRequest, + _request: Request, + ) -> Result { + Err(Status::unimplemented( + "do_action_begin_savepoint has no default implementation", + )) + } /// End a savepoint async fn do_action_end_savepoint( &self, - query: ActionEndSavepointRequest, - request: Request, - ) -> Result<(), Status>; + _query: ActionEndSavepointRequest, + _request: Request, + ) -> Result<(), Status> { + Err(Status::unimplemented( + "do_action_end_savepoint has no default implementation", + )) + } /// Cancel a query async fn do_action_cancel_query( &self, - query: ActionCancelQueryRequest, - request: Request, - ) -> Result; + _query: ActionCancelQueryRequest, + _request: Request, + ) -> Result { + Err(Status::unimplemented( + "do_action_cancel_query has no default implementation", + )) + } /// do_exchange From ee2c29236077094724a8031c17af2562e96dbd07 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 8 Jul 2023 14:05:26 -0400 Subject: [PATCH 1052/1411] Add Datum based arithmetic kernels (#3999) (#4465) * Add Datum based arithmetic kernels (#3999) * Fix benchmark * Review feedback --- arrow-arith/src/aggregate.rs | 39 +- arrow-arith/src/arithmetic.rs | 766 +++-------------------- arrow-arith/src/lib.rs | 2 + arrow-arith/src/numeric.rs | 672 ++++++++++++++++++++ arrow-array/src/array/primitive_array.rs | 9 + arrow-array/src/scalar.rs | 6 + arrow/benches/arithmetic_kernels.rs | 40 +- arrow/src/compute/kernels/mod.rs | 4 +- arrow/src/ffi.rs | 34 +- 9 files changed, 830 insertions(+), 742 deletions(-) create mode 100644 arrow-arith/src/numeric.rs diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 4961d7efc0f2..04417c666c85 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -867,8 +867,8 @@ where #[cfg(test)] mod tests { use super::*; - use crate::arithmetic::add; use arrow_array::types::*; + use arrow_buffer::NullBuffer; use std::sync::Arc; #[test] @@ -897,54 +897,35 @@ mod tests { #[test] fn test_primitive_array_sum_large_64() { - let a: Int64Array = (1..=100) - .map(|i| if i % 3 == 0 { Some(i) } else { None }) - .collect(); - let b: Int64Array = (1..=100) - .map(|i| if i % 3 == 0 { Some(0) } else { Some(i) }) - .collect(); // create an array that actually has non-zero values at the invalid indices - let c = add(&a, &b).unwrap(); + let validity = 
NullBuffer::new((1..=100).map(|x| x % 3 == 0).collect()); + let c = Int64Array::new((1..=100).collect(), Some(validity)); + assert_eq!(Some((1..=100).filter(|i| i % 3 == 0).sum()), sum(&c)); } #[test] fn test_primitive_array_sum_large_32() { - let a: Int32Array = (1..=100) - .map(|i| if i % 3 == 0 { Some(i) } else { None }) - .collect(); - let b: Int32Array = (1..=100) - .map(|i| if i % 3 == 0 { Some(0) } else { Some(i) }) - .collect(); // create an array that actually has non-zero values at the invalid indices - let c = add(&a, &b).unwrap(); + let validity = NullBuffer::new((1..=100).map(|x| x % 3 == 0).collect()); + let c = Int32Array::new((1..=100).collect(), Some(validity)); assert_eq!(Some((1..=100).filter(|i| i % 3 == 0).sum()), sum(&c)); } #[test] fn test_primitive_array_sum_large_16() { - let a: Int16Array = (1..=100) - .map(|i| if i % 3 == 0 { Some(i) } else { None }) - .collect(); - let b: Int16Array = (1..=100) - .map(|i| if i % 3 == 0 { Some(0) } else { Some(i) }) - .collect(); // create an array that actually has non-zero values at the invalid indices - let c = add(&a, &b).unwrap(); + let validity = NullBuffer::new((1..=100).map(|x| x % 3 == 0).collect()); + let c = Int16Array::new((1..=100).collect(), Some(validity)); assert_eq!(Some((1..=100).filter(|i| i % 3 == 0).sum()), sum(&c)); } #[test] fn test_primitive_array_sum_large_8() { // include fewer values than other large tests so the result does not overflow the u8 - let a: UInt8Array = (1..=100) - .map(|i| if i % 33 == 0 { Some(i) } else { None }) - .collect(); - let b: UInt8Array = (1..=100) - .map(|i| if i % 33 == 0 { Some(0) } else { Some(i) }) - .collect(); // create an array that actually has non-zero values at the invalid indices - let c = add(&a, &b).unwrap(); + let validity = NullBuffer::new((1..=100).map(|x| x % 33 == 0).collect()); + let c = UInt8Array::new((1..=100).collect(), Some(validity)); assert_eq!(Some((1..=100).filter(|i| i % 33 == 0).sum()), sum(&c)); } diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 8e7ab44042cf..4f6ecc78dc58 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -23,7 +23,6 @@ //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. use crate::arity::*; -use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::i256; @@ -39,6 +38,7 @@ use std::sync::Arc; /// # Errors /// /// This function errors if the arrays have different lengths +#[deprecated(note = "Use arrow_arith::arity::binary")] pub fn math_op( left: &PrimitiveArray, right: &PrimitiveArray, @@ -52,43 +52,6 @@ where binary(left, right, op) } -/// This is similar to `math_op` as it performs given operation between two input primitive arrays. -/// But the given operation can return `Err` if overflow is detected. For the case, this function -/// returns an `Err`. 
-fn math_checked_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - op: F, -) -> Result, ArrowError> -where - LT: ArrowNumericType, - RT: ArrowNumericType, - F: Fn(LT::Native, RT::Native) -> Result, -{ - try_binary(left, right, op) -} - -/// Helper function for operations where a valid `0` on the right array should -/// result in an [ArrowError::DivideByZero], namely the division and modulo operations -/// -/// # Errors -/// -/// This function errors if: -/// * the arrays have different lengths -/// * there is an element where both left and right values are valid and the right value is `0` -fn math_checked_divide_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - op: F, -) -> Result, ArrowError> -where - LT: ArrowNumericType, - RT: ArrowNumericType, - F: Fn(LT::Native, RT::Native) -> Result, -{ - math_checked_op(left, right, op) -} - /// Calculates the modulus operation `left % right` on two SIMD inputs. /// The lower-most bits of `valid_mask` specify which vector lanes are considered as valid. /// @@ -335,11 +298,12 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `add_checked` instead. +#[deprecated(note = "Use arrow_arith::numeric::add_wrapping")] pub fn add( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result, ArrowError> { - math_op(left, right, |a, b| a.add_wrapping(b)) + binary(left, right, |a, b| a.add_wrapping(b)) } /// Perform `left + right` operation on two arrays. If either left or right value is null @@ -347,11 +311,12 @@ pub fn add( /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `add` instead. +#[deprecated(note = "Use arrow_arith::numeric::add")] pub fn add_checked( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result, ArrowError> { - math_checked_op(left, right, |a, b| a.add_checked(b)) + try_binary(left, right, |a, b| a.add_checked(b)) } /// Perform `left + right` operation on two arrays. If either left or right value is null @@ -359,176 +324,9 @@ pub fn add_checked( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `add_dyn_checked` instead. 
+#[deprecated(note = "Use arrow_arith::numeric::add_wrapping")] pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match left.data_type() { - DataType::Date32 => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date32Type::add_year_months)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date32Type::add_day_time)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date32Type::add_month_day_nano)?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - DataType::Date64 => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date64Type::add_year_months)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date64Type::add_day_time)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date64Type::add_month_day_nano)?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - DataType::Timestamp(TimeUnit::Second, _) => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampSecondType::add_year_months)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampSecondType::add_day_time)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampSecondType::add_month_day_nano)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - - DataType::Timestamp(TimeUnit::Microsecond, _) => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampMicrosecondType::add_year_months)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampMicrosecondType::add_day_time)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampMicrosecondType::add_month_day_nano)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), 
right.data_type() - ))), - } - } - - DataType::Timestamp(TimeUnit::Millisecond, _) => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampMillisecondType::add_year_months)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampMillisecondType::add_day_time)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampMillisecondType::add_month_day_nano)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampNanosecondType::add_year_months)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampNanosecondType::add_day_time)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampNanosecondType::add_month_day_nano)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - - DataType::Interval(_) - if matches!( - right.data_type(), - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _) - ) => - { - add_dyn(right, left) - } - _ => { - downcast_primitive_array!( - (left, right) => { - math_op(left, right, |a, b| a.add_wrapping(b)).map(|a| Arc::new(a) as ArrayRef) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) - } - } + crate::numeric::add_wrapping(&left, &right) } /// Perform `left + right` operation on two arrays. 
If either left or right value is null @@ -536,71 +334,12 @@ pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result Result { - match left.data_type() { - DataType::Date32 => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date32Type::add_year_months)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date32Type::add_day_time)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date32Type::add_month_day_nano)?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - DataType::Date64 => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date64Type::add_year_months)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date64Type::add_day_time)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date64Type::add_month_day_nano)?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - _ => { - downcast_primitive_array!( - (left, right) => { - math_checked_op(left, right, |a, b| a.add_checked(b)).map(|a| Arc::new(a) as ArrayRef) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) - } - } + crate::numeric::add(&left, &right) } /// Add every value in an array by a scalar. If any value in the array is null then the @@ -608,6 +347,7 @@ pub fn add_dyn_checked( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `add_scalar_checked` instead. +#[deprecated(note = "Use arrow_arith::numeric::add_wrapping")] pub fn add_scalar( array: &PrimitiveArray, scalar: T::Native, @@ -620,6 +360,7 @@ pub fn add_scalar( /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `add_scalar` instead. +#[deprecated(note = "Use arrow_arith::numeric::add")] pub fn add_scalar_checked( array: &PrimitiveArray, scalar: T::Native, @@ -635,6 +376,7 @@ pub fn add_scalar_checked( /// For an overflow-checking variant, use `add_scalar_checked_dyn` instead. /// /// This returns an `Err` when the input array is not supported for adding operation. +#[deprecated(note = "Use arrow_arith::numeric::add_wrapping")] pub fn add_scalar_dyn( array: &dyn Array, scalar: T::Native, @@ -651,6 +393,7 @@ pub fn add_scalar_dyn( /// /// As this kernel has the branching costs and also prevents LLVM from vectorising it correctly, /// it is usually much slower than non-checking variant. +#[deprecated(note = "Use arrow_arith::numeric::add")] pub fn add_scalar_checked_dyn( array: &dyn Array, scalar: T::Native, @@ -664,11 +407,12 @@ pub fn add_scalar_checked_dyn( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. 
/// For an overflow-checking variant, use `subtract_checked` instead. +#[deprecated(note = "Use arrow_arith::numeric::sub_wrapping")] pub fn subtract( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result, ArrowError> { - math_op(left, right, |a, b| a.sub_wrapping(b)) + binary(left, right, |a, b| a.sub_wrapping(b)) } /// Perform `left - right` operation on two arrays. If either left or right value is null @@ -676,11 +420,12 @@ pub fn subtract( /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `subtract` instead. +#[deprecated(note = "Use arrow_arith::numeric::sub")] pub fn subtract_checked( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result, ArrowError> { - math_checked_op(left, right, |a, b| a.sub_checked(b)) + try_binary(left, right, |a, b| a.sub_checked(b)) } /// Perform `left - right` operation on two arrays. If either left or right value is null @@ -688,184 +433,9 @@ pub fn subtract_checked( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `subtract_dyn_checked` instead. +#[deprecated(note = "Use arrow_arith::numeric::sub_wrapping")] pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match left.data_type() { - DataType::Date32 => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date32Type::subtract_year_months)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date32Type::subtract_day_time)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date32Type::subtract_month_day_nano)?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - DataType::Date64 => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date64Type::subtract_year_months)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date64Type::subtract_day_time)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date64Type::subtract_month_day_nano)?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - DataType::Timestamp(TimeUnit::Second, _) => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampSecondType::subtract_year_months)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampSecondType::subtract_day_time)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, 
TimestampSecondType::subtract_month_day_nano)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Timestamp(TimeUnit::Second, _) => { - let r = right.as_primitive::(); - let res: PrimitiveArray = binary(l, r, |a, b| a.wrapping_sub(b))?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampMicrosecondType::subtract_year_months)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampMicrosecondType::subtract_day_time)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampMicrosecondType::subtract_month_day_nano)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - let r = right.as_primitive::(); - let res: PrimitiveArray = binary(l, r, |a, b| a.wrapping_sub(b))?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampMillisecondType::subtract_year_months)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampMillisecondType::subtract_day_time)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampMillisecondType::subtract_month_day_nano)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - let r = right.as_primitive::(); - let res: PrimitiveArray = binary(l, r, |a, b| a.wrapping_sub(b))?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampNanosecondType::subtract_year_months)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampNanosecondType::subtract_day_time)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_checked_op(l, r, TimestampNanosecondType::subtract_month_day_nano)?; - Ok(Arc::new(res.with_timezone_opt(l.timezone()))) - } - 
DataType::Timestamp(TimeUnit::Nanosecond, _) => { - let r = right.as_primitive::(); - let res: PrimitiveArray = binary(l, r, |a, b| a.wrapping_sub(b))?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - _ => { - downcast_primitive_array!( - (left, right) => { - math_op(left, right, |a, b| a.sub_wrapping(b)).map(|a| Arc::new(a) as ArrayRef) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) - } - } + crate::numeric::sub_wrapping(&left, &right) } /// Perform `left - right` operation on two arrays. If either left or right value is null @@ -873,127 +443,12 @@ pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result Result { - match left.data_type() { - DataType::Date32 => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date32Type::subtract_year_months)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date32Type::subtract_day_time)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date32Type::subtract_month_day_nano)?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - DataType::Date64 => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Interval(IntervalUnit::YearMonth) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date64Type::subtract_year_months)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::DayTime) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date64Type::subtract_day_time)?; - Ok(Arc::new(res)) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let r = right.as_primitive::(); - let res = math_op(l, r, Date64Type::subtract_month_day_nano)?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - DataType::Timestamp(TimeUnit::Second, _) => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Timestamp(TimeUnit::Second, _) => { - let r = right.as_primitive::(); - let res: PrimitiveArray = try_binary(l, r, |a, b| a.sub_checked(b))?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Timestamp(TimeUnit::Microsecond, _) => { - let r = right.as_primitive::(); - let res: PrimitiveArray = try_binary(l, r, |a, b| a.sub_checked(b))?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - let l = left.as_primitive::(); - match right.data_type() { - 
DataType::Timestamp(TimeUnit::Millisecond, _) => { - let r = right.as_primitive::(); - let res: PrimitiveArray = try_binary(l, r, |a, b| a.sub_checked(b))?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - let l = left.as_primitive::(); - match right.data_type() { - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - let r = right.as_primitive::(); - let res: PrimitiveArray = try_binary(l, r, |a, b| a.sub_checked(b))?; - Ok(Arc::new(res)) - } - _ => Err(ArrowError::CastError(format!( - "Cannot perform arithmetic operation between array of type {} and array of type {}", - left.data_type(), right.data_type() - ))), - } - } - _ => { - downcast_primitive_array!( - (left, right) => { - math_checked_op(left, right, |a, b| a.sub_checked(b)).map(|a| Arc::new(a) as ArrayRef) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) - } - } + crate::numeric::sub(&left, &right) } /// Subtract every value in an array by a scalar. If any value in the array is null then the @@ -1001,6 +456,7 @@ pub fn subtract_dyn_checked( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `subtract_scalar_checked` instead. +#[deprecated(note = "Use arrow_arith::numeric::sub_wrapping")] pub fn subtract_scalar( array: &PrimitiveArray, scalar: T::Native, @@ -1013,6 +469,7 @@ pub fn subtract_scalar( /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `subtract_scalar` instead. +#[deprecated(note = "Use arrow_arith::numeric::sub")] pub fn subtract_scalar_checked( array: &PrimitiveArray, scalar: T::Native, @@ -1026,6 +483,7 @@ pub fn subtract_scalar_checked( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `subtract_scalar_checked_dyn` instead. +#[deprecated(note = "Use arrow_arith::numeric::sub_wrapping")] pub fn subtract_scalar_dyn( array: &dyn Array, scalar: T::Native, @@ -1039,6 +497,7 @@ pub fn subtract_scalar_dyn( /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `subtract_scalar_dyn` instead. +#[deprecated(note = "Use arrow_arith::numeric::sub")] pub fn subtract_scalar_checked_dyn( array: &dyn Array, scalar: T::Native, @@ -1072,11 +531,12 @@ pub fn negate_checked( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `multiply_check` instead. +#[deprecated(note = "Use arrow_arith::numeric::mul_wrapping")] pub fn multiply( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result, ArrowError> { - math_op(left, right, |a, b| a.mul_wrapping(b)) + binary(left, right, |a, b| a.mul_wrapping(b)) } /// Perform `left * right` operation on two arrays. If either left or right value is null @@ -1084,11 +544,12 @@ pub fn multiply( /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `multiply` instead. 
+#[deprecated(note = "Use arrow_arith::numeric::mul")] pub fn multiply_checked( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result, ArrowError> { - math_checked_op(left, right, |a, b| a.mul_checked(b)) + try_binary(left, right, |a, b| a.mul_checked(b)) } /// Perform `left * right` operation on two arrays. If either left or right value is null @@ -1096,16 +557,9 @@ pub fn multiply_checked( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `multiply_dyn_checked` instead. +#[deprecated(note = "Use arrow_arith::numeric::mul_wrapping")] pub fn multiply_dyn(left: &dyn Array, right: &dyn Array) -> Result { - downcast_primitive_array!( - (left, right) => { - math_op(left, right, |a, b| a.mul_wrapping(b)).map(|a| Arc::new(a) as ArrayRef) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) + crate::numeric::mul_wrapping(&left, &right) } /// Perform `left * right` operation on two arrays. If either left or right value is null @@ -1113,19 +567,12 @@ pub fn multiply_dyn(left: &dyn Array, right: &dyn Array) -> Result Result { - downcast_primitive_array!( - (left, right) => { - math_checked_op(left, right, |a, b| a.mul_checked(b)).map(|a| Arc::new(a) as ArrayRef) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) + crate::numeric::mul(&left, &right) } /// Returns the precision and scale of the result of a multiplication of two decimal types, @@ -1210,8 +657,10 @@ pub fn multiply_fixed_point_checked( )?; if required_scale == product_scale { - return multiply_checked(left, right)? - .with_precision_and_scale(precision, required_scale); + return try_binary::<_, _, _, Decimal128Type>(left, right, |a, b| { + a.mul_checked(b) + })? + .with_precision_and_scale(precision, required_scale); } try_binary::<_, _, _, Decimal128Type>(left, right, |a, b| { @@ -1254,7 +703,7 @@ pub fn multiply_fixed_point( )?; if required_scale == product_scale { - return multiply(left, right)? + return binary(left, right, |a, b| a.mul_wrapping(b))? .with_precision_and_scale(precision, required_scale); } @@ -1294,6 +743,7 @@ where /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `multiply_scalar_checked` instead. +#[deprecated(note = "Use arrow_arith::numeric::mul_wrapping")] pub fn multiply_scalar( array: &PrimitiveArray, scalar: T::Native, @@ -1306,6 +756,7 @@ pub fn multiply_scalar( /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `multiply_scalar` instead. +#[deprecated(note = "Use arrow_arith::numeric::mul")] pub fn multiply_scalar_checked( array: &PrimitiveArray, scalar: T::Native, @@ -1319,6 +770,7 @@ pub fn multiply_scalar_checked( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `multiply_scalar_checked_dyn` instead. +#[deprecated(note = "Use arrow_arith::numeric::mul_wrapping")] pub fn multiply_scalar_dyn( array: &dyn Array, scalar: T::Native, @@ -1332,6 +784,7 @@ pub fn multiply_scalar_dyn( /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `multiply_scalar_dyn` instead. 
+#[deprecated(note = "Use arrow_arith::numeric::mul")] pub fn multiply_scalar_checked_dyn( array: &dyn Array, scalar: T::Native, @@ -1343,6 +796,7 @@ pub fn multiply_scalar_checked_dyn( /// Perform `left % right` operation on two arrays. If either left or right value is null /// then the result is also null. If any right hand value is zero then the result of this /// operation will be `Err(ArrowError::DivideByZero)`. +#[deprecated(note = "Use arrow_arith::numeric::rem")] pub fn modulus( left: &PrimitiveArray, right: &PrimitiveArray, @@ -1364,22 +818,9 @@ pub fn modulus( /// Perform `left % right` operation on two arrays. If either left or right value is null /// then the result is also null. If any right hand value is zero then the result of this /// operation will be `Err(ArrowError::DivideByZero)`. +#[deprecated(note = "Use arrow_arith::numeric::rem")] pub fn modulus_dyn(left: &dyn Array, right: &dyn Array) -> Result { - downcast_primitive_array!( - (left, right) => { - math_checked_divide_op(left, right, |a, b| { - if b.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(a.mod_wrapping(b)) - } - }).map(|a| Arc::new(a) as ArrayRef) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) + crate::numeric::rem(&left, &right) } /// Perform `left / right` operation on two arrays. If either left or right value is null @@ -1388,6 +829,7 @@ pub fn modulus_dyn(left: &dyn Array, right: &dyn Array) -> Result( left: &PrimitiveArray, right: &PrimitiveArray, @@ -1397,7 +839,7 @@ pub fn divide_checked( a.div_wrapping(b) }); #[cfg(not(feature = "simd"))] - return math_checked_divide_op(left, right, |a, b| a.div_checked(b)); + return try_binary(left, right, |a, b| a.div_checked(b)); } /// Perform `left / right` operation on two arrays. If either left or right value is null @@ -1414,6 +856,7 @@ pub fn divide_checked( /// /// For integer types overflow will wrap around. /// +#[deprecated(note = "Use arrow_arith::numeric::div")] pub fn divide_opt( left: &PrimitiveArray, right: &PrimitiveArray, @@ -1433,17 +876,23 @@ pub fn divide_opt( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `divide_dyn_checked` instead. +#[deprecated(note = "Use arrow_arith::numeric::div")] pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result { + fn divide_op( + left: &PrimitiveArray, + right: &PrimitiveArray, + ) -> Result, ArrowError> { + try_binary(left, right, |a, b| { + if b.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(a.div_wrapping(b)) + } + }) + } + downcast_primitive_array!( - (left, right) => { - math_checked_divide_op(left, right, |a, b| { - if b.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(a.div_wrapping(b)) - } - }).map(|a| Arc::new(a) as ArrayRef) - } + (left, right) => divide_op(left, right).map(|a| Arc::new(a) as ArrayRef), _ => Err(ArrowError::CastError(format!( "Unsupported data type {}, {}", left.data_type(), right.data_type() @@ -1457,19 +906,12 @@ pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result Result { - downcast_primitive_array!( - (left, right) => { - math_checked_divide_op(left, right, |a, b| a.div_checked(b)).map(|a| Arc::new(a) as ArrayRef) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) + crate::numeric::div(&left, &right) } /// Perform `left / right` operation on two arrays. 
If either left or right value is null @@ -1481,6 +923,7 @@ pub fn divide_dyn_checked( /// Unlike `divide_dyn` or `divide_dyn_checked`, division by zero will get a null value instead /// returning an `Err`, this also doesn't check overflowing, overflowing will just wrap /// the result around. +#[deprecated(note = "Use arrow_arith::numeric::div")] pub fn divide_dyn_opt( left: &dyn Array, right: &dyn Array, @@ -1513,18 +956,20 @@ pub fn divide_dyn_opt( /// If either left or right value is null then the result is also null. /// /// For an overflow-checking variant, use `divide_checked` instead. +#[deprecated(note = "Use arrow_arith::numeric::div")] pub fn divide( left: &PrimitiveArray, right: &PrimitiveArray, ) -> Result, ArrowError> { // TODO: This is incorrect as div_wrapping has side-effects for integer types // and so may panic on null values (#2647) - math_op(left, right, |a, b| a.div_wrapping(b)) + binary(left, right, |a, b| a.div_wrapping(b)) } /// Modulus every value in an array by a scalar. If any value in the array is null then the /// result is also null. If the scalar is zero then the result of this operation will be /// `Err(ArrowError::DivideByZero)`. +#[deprecated(note = "Use arrow_arith::numeric::rem")] pub fn modulus_scalar( array: &PrimitiveArray, modulo: T::Native, @@ -1539,6 +984,7 @@ pub fn modulus_scalar( /// Modulus every value in an array by a scalar. If any value in the array is null then the /// result is also null. If the scalar is zero then the result of this operation will be /// `Err(ArrowError::DivideByZero)`. +#[deprecated(note = "Use arrow_arith::numeric::rem")] pub fn modulus_scalar_dyn( array: &dyn Array, modulo: T::Native, @@ -1552,6 +998,7 @@ pub fn modulus_scalar_dyn( /// Divide every value in an array by a scalar. If any value in the array is null then the /// result is also null. If the scalar is zero then the result of this operation will be /// `Err(ArrowError::DivideByZero)`. +#[deprecated(note = "Use arrow_arith::numeric::div")] pub fn divide_scalar( array: &PrimitiveArray, divisor: T::Native, @@ -1569,6 +1016,7 @@ pub fn divide_scalar( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `divide_scalar_checked_dyn` instead. +#[deprecated(note = "Use arrow_arith::numeric::div")] pub fn divide_scalar_dyn( array: &dyn Array, divisor: T::Native, @@ -1586,6 +1034,7 @@ pub fn divide_scalar_dyn( /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `divide_scalar_dyn` instead. +#[deprecated(note = "Use arrow_arith::numeric::div")] pub fn divide_scalar_checked_dyn( array: &dyn Array, divisor: T::Native, @@ -1608,6 +1057,7 @@ pub fn divide_scalar_checked_dyn( /// Unlike `divide_scalar_dyn` or `divide_scalar_checked_dyn`, division by zero will get a /// null value instead returning an `Err`, this also doesn't check overflowing, overflowing /// will just wrap the result around. 
+#[deprecated(note = "Use arrow_arith::numeric::div")] pub fn divide_scalar_opt_dyn( array: &dyn Array, divisor: T::Native, @@ -1625,11 +1075,13 @@ pub fn divide_scalar_opt_dyn( } #[cfg(test)] +#[allow(deprecated)] mod tests { use super::*; use arrow_array::builder::{ BooleanBufferBuilder, BufferBuilder, PrimitiveDictionaryBuilder, }; + use arrow_array::cast::AsArray; use arrow_array::temporal_conversions::SECONDS_IN_DAY; use arrow_buffer::buffer::NullBuffer; use arrow_buffer::i256; @@ -1678,16 +1130,14 @@ mod tests { )]); let b = IntervalDayTimeArray::from(vec![IntervalDayTimeType::make_value(1, 2)]); let c = add_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( - c.value(0), + c.as_primitive::().value(0), Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) ); let c = add_dyn(&b, &a).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( - c.value(0), + c.as_primitive::().value(0), Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) ); } @@ -1702,16 +1152,14 @@ mod tests { 1, 2, 3, )]); let c = add_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( - c.value(0), + c.as_primitive::().value(0), Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) ); let c = add_dyn(&b, &a).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( - c.value(0), + c.as_primitive::().value(0), Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) ); } @@ -1724,16 +1172,14 @@ mod tests { let b = IntervalYearMonthArray::from(vec![IntervalYearMonthType::make_value(1, 2)]); let c = add_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( - c.value(0), + c.as_primitive::().value(0), Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2001, 3, 1).unwrap()) ); let c = add_dyn(&b, &a).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( - c.value(0), + c.as_primitive::().value(0), Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2001, 3, 1).unwrap()) ); } @@ -1745,16 +1191,14 @@ mod tests { )]); let b = IntervalDayTimeArray::from(vec![IntervalDayTimeType::make_value(1, 2)]); let c = add_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( - c.value(0), + c.as_primitive::().value(0), Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) ); let c = add_dyn(&b, &a).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( - c.value(0), + c.as_primitive::().value(0), Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) ); } @@ -1769,16 +1213,14 @@ mod tests { 1, 2, 3, )]); let c = add_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( - c.value(0), + c.as_primitive::().value(0), Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) ); let c = add_dyn(&b, &a).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( - c.value(0), + c.as_primitive::().value(0), Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) ); } @@ -2584,11 +2026,11 @@ mod tests { } #[test] - #[should_panic(expected = "DivideByZero")] fn test_f32_array_modulus_dyn_by_zero() { let a = Float32Array::from(vec![1.5]); let b = Float32Array::from(vec![0.0]); - modulus_dyn(&a, &b).unwrap(); + let result = modulus_dyn(&a, &b).unwrap(); + assert!(result.as_primitive::().value(0).is_nan()); } #[test] @@ -3838,10 +3280,6 @@ mod tests { ::Native::MIN, ]); - // unchecked - let 
result = subtract_dyn(&a, &b);
-        assert!(!&result.is_err());
-
         // checked
         let result = subtract_dyn_checked(&a, &b);
         assert!(&result.is_err());
diff --git a/arrow-arith/src/lib.rs b/arrow-arith/src/lib.rs
index 60d31c972b66..2d5451e04dd2 100644
--- a/arrow-arith/src/lib.rs
+++ b/arrow-arith/src/lib.rs
@@ -18,8 +18,10 @@
 //! Arrow arithmetic and aggregation kernels

 pub mod aggregate;
+#[doc(hidden)] // Kernels to be removed in a future release
 pub mod arithmetic;
 pub mod arity;
 pub mod bitwise;
 pub mod boolean;
+pub mod numeric;
 pub mod temporal;
diff --git a/arrow-arith/src/numeric.rs b/arrow-arith/src/numeric.rs
new file mode 100644
index 000000000000..816fcaa944f5
--- /dev/null
+++ b/arrow-arith/src/numeric.rs
@@ -0,0 +1,672 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//!
Defines numeric arithmetic kernels on [`PrimitiveArray`], such as [`add`]
+
+use std::cmp::Ordering;
+use std::sync::Arc;
+
+use arrow_array::cast::AsArray;
+use arrow_array::types::*;
+use arrow_array::*;
+use arrow_buffer::ArrowNativeType;
+use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit};
+
+use crate::arity::{binary, try_binary};
+
+/// Perform `lhs + rhs`, returning an error on overflow
+pub fn add(lhs: &dyn Datum, rhs: &dyn Datum) -> Result<ArrayRef, ArrowError> {
+    arithmetic_op(Op::Add, lhs, rhs)
+}
+
+/// Perform `lhs + rhs`, wrapping on overflow for [`DataType::is_integer`]
+pub fn add_wrapping(lhs: &dyn Datum, rhs: &dyn Datum) -> Result<ArrayRef, ArrowError> {
+    arithmetic_op(Op::AddWrapping, lhs, rhs)
+}
+
+/// Perform `lhs - rhs`, returning an error on overflow
+pub fn sub(lhs: &dyn Datum, rhs: &dyn Datum) -> Result<ArrayRef, ArrowError> {
+    arithmetic_op(Op::Sub, lhs, rhs)
+}
+
+/// Perform `lhs - rhs`, wrapping on overflow for [`DataType::is_integer`]
+pub fn sub_wrapping(lhs: &dyn Datum, rhs: &dyn Datum) -> Result<ArrayRef, ArrowError> {
+    arithmetic_op(Op::SubWrapping, lhs, rhs)
+}
+
+/// Perform `lhs * rhs`, returning an error on overflow
+pub fn mul(lhs: &dyn Datum, rhs: &dyn Datum) -> Result<ArrayRef, ArrowError> {
+    arithmetic_op(Op::Mul, lhs, rhs)
+}
+
+/// Perform `lhs * rhs`, wrapping on overflow for [`DataType::is_integer`]
+pub fn mul_wrapping(lhs: &dyn Datum, rhs: &dyn Datum) -> Result<ArrayRef, ArrowError> {
+    arithmetic_op(Op::MulWrapping, lhs, rhs)
+}
+
+/// Perform `lhs / rhs`
+///
+/// Overflow or division by zero will result in an error, with exception to
+/// floating point numbers, which instead follow the IEEE 754 rules
+pub fn div(lhs: &dyn Datum, rhs: &dyn Datum) -> Result<ArrayRef, ArrowError> {
+    arithmetic_op(Op::Div, lhs, rhs)
+}
+
+/// Perform `lhs % rhs`
+///
+/// Overflow or division by zero will result in an error, with exception to
+/// floating point numbers, which instead follow the IEEE 754 rules
+pub fn rem(lhs: &dyn Datum, rhs: &dyn Datum) -> Result<ArrayRef, ArrowError> {
+    arithmetic_op(Op::Rem, lhs, rhs)
+}
+
+/// An enumeration of arithmetic operations
+///
+/// This allows sharing the type dispatch logic across the various kernels
+#[derive(Debug, Copy, Clone)]
+enum Op {
+    AddWrapping,
+    Add,
+    SubWrapping,
+    Sub,
+    MulWrapping,
+    Mul,
+    Div,
+    Rem,
+}
+
+impl Op {
+    fn commutative(&self) -> bool {
+        matches!(self, Self::Add | Self::AddWrapping)
+    }
+}
+
+/// Dispatch the given `op` to the appropriate specialized kernel
+fn arithmetic_op(
+    op: Op,
+    lhs: &dyn Datum,
+    rhs: &dyn Datum,
+) -> Result<ArrayRef, ArrowError> {
+    use DataType::*;
+    use IntervalUnit::*;
+    use TimeUnit::*;
+
+    macro_rules! integer_helper {
+        ($t:ty, $op:ident, $l:ident, $l_scalar:ident, $r:ident, $r_scalar:ident) => {
+            integer_op::<$t>($op, $l, $l_scalar, $r, $r_scalar)
+        };
+    }
+
+    let (l, l_scalar) = lhs.get();
+    let (r, r_scalar) = rhs.get();
+    downcast_integer! {
+        l.data_type(), r.data_type() => (integer_helper, op, l, l_scalar, r, r_scalar),
+        (Float16, Float16) => float_op::<Float16Type>(op, l, l_scalar, r, r_scalar),
+        (Float32, Float32) => float_op::<Float32Type>(op, l, l_scalar, r, r_scalar),
+        (Float64, Float64) => float_op::<Float64Type>(op, l, l_scalar, r, r_scalar),
+        (Timestamp(Second, _), _) => timestamp_op::<TimestampSecondType>(op, l, l_scalar, r, r_scalar),
+        (Timestamp(Millisecond, _), _) => timestamp_op::<TimestampMillisecondType>(op, l, l_scalar, r, r_scalar),
+        (Timestamp(Microsecond, _), _) => timestamp_op::<TimestampMicrosecondType>(op, l, l_scalar, r, r_scalar),
+        (Timestamp(Nanosecond, _), _) => timestamp_op::<TimestampNanosecondType>(op, l, l_scalar, r, r_scalar),
+        (Duration(Second), Duration(Second)) => duration_op::<DurationSecondType>(op, l, l_scalar, r, r_scalar),
+        (Duration(Millisecond), Duration(Millisecond)) => duration_op::<DurationMillisecondType>(op, l, l_scalar, r, r_scalar),
+        (Duration(Microsecond), Duration(Microsecond)) => duration_op::<DurationMicrosecondType>(op, l, l_scalar, r, r_scalar),
+        (Duration(Nanosecond), Duration(Nanosecond)) => duration_op::<DurationNanosecondType>(op, l, l_scalar, r, r_scalar),
+        (Interval(YearMonth), Interval(YearMonth)) => interval_op::<IntervalYearMonthType>(op, l, l_scalar, r, r_scalar),
+        (Interval(DayTime), Interval(DayTime)) => interval_op::<IntervalDayTimeType>(op, l, l_scalar, r, r_scalar),
+        (Interval(MonthDayNano), Interval(MonthDayNano)) => interval_op::<IntervalMonthDayNanoType>(op, l, l_scalar, r, r_scalar),
+        (Date32, _) => date_op::<Date32Type>(op, l, l_scalar, r, r_scalar),
+        (Date64, _) => date_op::<Date64Type>(op, l, l_scalar, r, r_scalar),
+        (Decimal128(_, _), Decimal128(_, _)) => decimal_op::<Decimal128Type>(op, l, l_scalar, r, r_scalar),
+        (Decimal256(_, _), Decimal256(_, _)) => decimal_op::<Decimal256Type>(op, l, l_scalar, r, r_scalar),
+        (l_t, r_t) => match (l_t, r_t) {
+            (Duration(_) | Interval(_), Date32 | Date64 | Timestamp(_, _)) if op.commutative() => {
+                arithmetic_op(op, rhs, lhs)
+            }
+            _ => Err(ArrowError::InvalidArgumentError(
+                format!("Invalid arithmetic operation: {l_t} {op:?} {r_t}")
+            ))
+        }
+    }
+}
+
+/// Perform an infallible binary operation on potentially scalar inputs
+macro_rules! op {
+    ($l:ident, $l_s:expr, $r:ident, $r_s:expr, $op:expr) => {
+        match ($l_s, $r_s) {
+            (true, true) | (false, false) => binary($l, $r, |$l, $r| $op)?,
+            (true, false) => match ($l.null_count() == 0).then(|| $l.value(0)) {
+                None => PrimitiveArray::new_null($r.len()),
+                Some($l) => $r.unary(|$r| $op),
+            },
+            (false, true) => match ($r.null_count() == 0).then(|| $r.value(0)) {
+                None => PrimitiveArray::new_null($l.len()),
+                Some($r) => $l.unary(|$l| $op),
+            },
+        }
+    };
+}
+
+/// Same as `op` but with a type hint for the returned array
+macro_rules! op_ref {
+    ($t:ty, $l:ident, $l_s:expr, $r:ident, $r_s:expr, $op:expr) => {{
+        let array: PrimitiveArray<$t> = op!($l, $l_s, $r, $r_s, $op);
+        Arc::new(array)
+    }};
+}
+
+/// Perform a fallible binary operation on potentially scalar inputs
+macro_rules! try_op {
+    ($l:ident, $l_s:expr, $r:ident, $r_s:expr, $op:expr) => {
+        match ($l_s, $r_s) {
+            (true, true) | (false, false) => try_binary($l, $r, |$l, $r| $op)?,
+            (true, false) => match ($l.null_count() == 0).then(|| $l.value(0)) {
+                None => PrimitiveArray::new_null($r.len()),
+                Some($l) => $r.try_unary(|$r| $op)?,
+            },
+            (false, true) => match ($r.null_count() == 0).then(|| $r.value(0)) {
+                None => PrimitiveArray::new_null($l.len()),
+                Some($r) => $l.try_unary(|$l| $op)?,
+            },
+        }
+    };
+}
+
+/// Same as `try_op` but with a type hint for the returned array
+macro_rules!
try_op_ref { + ($t:ty, $l:ident, $l_s:expr, $r:ident, $r_s:expr, $op:expr) => {{ + let array: PrimitiveArray<$t> = try_op!($l, $l_s, $r, $r_s, $op); + Arc::new(array) + }}; +} + +/// Perform an arithmetic operation on integers +fn integer_op( + op: Op, + l: &dyn Array, + l_s: bool, + r: &dyn Array, + r_s: bool, +) -> Result { + let l = l.as_primitive::(); + let r = r.as_primitive::(); + let array: PrimitiveArray = match op { + Op::AddWrapping => op!(l, l_s, r, r_s, l.add_wrapping(r)), + Op::Add => try_op!(l, l_s, r, r_s, l.add_checked(r)), + Op::SubWrapping => op!(l, l_s, r, r_s, l.sub_wrapping(r)), + Op::Sub => try_op!(l, l_s, r, r_s, l.sub_checked(r)), + Op::MulWrapping => op!(l, l_s, r, r_s, l.mul_wrapping(r)), + Op::Mul => try_op!(l, l_s, r, r_s, l.mul_checked(r)), + Op::Div => try_op!(l, l_s, r, r_s, l.div_checked(r)), + Op::Rem => try_op!(l, l_s, r, r_s, l.mod_checked(r)), + }; + Ok(Arc::new(array)) +} + +/// Perform an arithmetic operation on floats +fn float_op( + op: Op, + l: &dyn Array, + l_s: bool, + r: &dyn Array, + r_s: bool, +) -> Result { + let l = l.as_primitive::(); + let r = r.as_primitive::(); + let array: PrimitiveArray = match op { + Op::AddWrapping | Op::Add => op!(l, l_s, r, r_s, l.add_wrapping(r)), + Op::SubWrapping | Op::Sub => op!(l, l_s, r, r_s, l.sub_wrapping(r)), + Op::MulWrapping | Op::Mul => op!(l, l_s, r, r_s, l.mul_wrapping(r)), + Op::Div => op!(l, l_s, r, r_s, l.div_wrapping(r)), + Op::Rem => op!(l, l_s, r, r_s, l.mod_wrapping(r)), + }; + Ok(Arc::new(array)) +} + +/// Arithmetic trait for timestamp arrays +trait TimestampOp: ArrowTimestampType { + type Duration: ArrowPrimitiveType; + + fn add_year_month(timestamp: i64, delta: i32) -> Result; + fn add_day_time(timestamp: i64, delta: i64) -> Result; + fn add_month_day_nano(timestamp: i64, delta: i128) -> Result; + + fn sub_year_month(timestamp: i64, delta: i32) -> Result; + fn sub_day_time(timestamp: i64, delta: i64) -> Result; + fn sub_month_day_nano(timestamp: i64, delta: i128) -> Result; +} + +macro_rules! 
timestamp { + ($t:ty, $d:ty) => { + impl TimestampOp for $t { + type Duration = $d; + + fn add_year_month(left: i64, right: i32) -> Result { + Self::add_year_months(left, right) + } + + fn add_day_time(left: i64, right: i64) -> Result { + Self::add_day_time(left, right) + } + + fn add_month_day_nano(left: i64, right: i128) -> Result { + Self::add_month_day_nano(left, right) + } + + fn sub_year_month(left: i64, right: i32) -> Result { + Self::subtract_year_months(left, right) + } + + fn sub_day_time(left: i64, right: i64) -> Result { + Self::subtract_day_time(left, right) + } + + fn sub_month_day_nano(left: i64, right: i128) -> Result { + Self::subtract_month_day_nano(left, right) + } + } + }; +} +timestamp!(TimestampSecondType, DurationSecondType); +timestamp!(TimestampMillisecondType, DurationMillisecondType); +timestamp!(TimestampMicrosecondType, DurationMicrosecondType); +timestamp!(TimestampNanosecondType, DurationNanosecondType); + +/// Perform arithmetic operation on a timestamp array +fn timestamp_op( + op: Op, + l: &dyn Array, + l_s: bool, + r: &dyn Array, + r_s: bool, +) -> Result { + use DataType::*; + use IntervalUnit::*; + + // Note: interval arithmetic should account for timezones (#4457) + let l = l.as_primitive::(); + let array: PrimitiveArray = match (op, r.data_type()) { + (Op::Sub | Op::SubWrapping, Timestamp(unit, _)) if unit == &T::UNIT => { + let r = r.as_primitive::(); + return Ok(try_op_ref!(T::Duration, l, l_s, r, r_s, l.sub_checked(r))); + } + + (Op::Add | Op::AddWrapping, Duration(unit)) if unit == &T::UNIT => { + let r = r.as_primitive::(); + try_op!(l, l_s, r, r_s, l.add_checked(r)) + } + (Op::Sub | Op::SubWrapping, Duration(unit)) if unit == &T::UNIT => { + let r = r.as_primitive::(); + try_op!(l, l_s, r, r_s, l.sub_checked(r)) + } + + (Op::Add | Op::AddWrapping, Interval(YearMonth)) => { + let r = r.as_primitive::(); + try_op!(l, l_s, r, r_s, T::add_year_month(l, r)) + } + (Op::Sub | Op::SubWrapping, Interval(YearMonth)) => { + let r = r.as_primitive::(); + try_op!(l, l_s, r, r_s, T::sub_year_month(l, r)) + } + + (Op::Add | Op::AddWrapping, Interval(DayTime)) => { + let r = r.as_primitive::(); + try_op!(l, l_s, r, r_s, T::add_day_time(l, r)) + } + (Op::Sub | Op::SubWrapping, Interval(DayTime)) => { + let r = r.as_primitive::(); + try_op!(l, l_s, r, r_s, T::sub_day_time(l, r)) + } + + (Op::Add | Op::AddWrapping, Interval(MonthDayNano)) => { + let r = r.as_primitive::(); + try_op!(l, l_s, r, r_s, T::add_month_day_nano(l, r)) + } + (Op::Sub | Op::SubWrapping, Interval(MonthDayNano)) => { + let r = r.as_primitive::(); + try_op!(l, l_s, r, r_s, T::sub_month_day_nano(l, r)) + } + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid timestamp arithmetic operation: {} {op:?} {}", + l.data_type(), + r.data_type() + ))) + } + }; + Ok(Arc::new(array.with_timezone_opt(l.timezone()))) +} + +/// Arithmetic trait for date arrays +/// +/// Note: these should be fallible (#4456) +trait DateOp: ArrowTemporalType { + fn add_year_month(timestamp: Self::Native, delta: i32) -> Self::Native; + fn add_day_time(timestamp: Self::Native, delta: i64) -> Self::Native; + fn add_month_day_nano(timestamp: Self::Native, delta: i128) -> Self::Native; + + fn sub_year_month(timestamp: Self::Native, delta: i32) -> Self::Native; + fn sub_day_time(timestamp: Self::Native, delta: i64) -> Self::Native; + fn sub_month_day_nano(timestamp: Self::Native, delta: i128) -> Self::Native; +} + +macro_rules! 
date { + ($t:ty) => { + impl DateOp for $t { + fn add_year_month(left: Self::Native, right: i32) -> Self::Native { + Self::add_year_months(left, right) + } + + fn add_day_time(left: Self::Native, right: i64) -> Self::Native { + Self::add_day_time(left, right) + } + + fn add_month_day_nano(left: Self::Native, right: i128) -> Self::Native { + Self::add_month_day_nano(left, right) + } + + fn sub_year_month(left: Self::Native, right: i32) -> Self::Native { + Self::subtract_year_months(left, right) + } + + fn sub_day_time(left: Self::Native, right: i64) -> Self::Native { + Self::subtract_day_time(left, right) + } + + fn sub_month_day_nano(left: Self::Native, right: i128) -> Self::Native { + Self::subtract_month_day_nano(left, right) + } + } + }; +} +date!(Date32Type); +date!(Date64Type); + +/// Arithmetic trait for interval arrays +trait IntervalOp: ArrowPrimitiveType { + fn add(left: Self::Native, right: Self::Native) -> Result; + fn sub(left: Self::Native, right: Self::Native) -> Result; +} + +impl IntervalOp for IntervalYearMonthType { + fn add(left: Self::Native, right: Self::Native) -> Result { + left.add_checked(right) + } + + fn sub(left: Self::Native, right: Self::Native) -> Result { + left.sub_checked(right) + } +} + +impl IntervalOp for IntervalDayTimeType { + fn add(left: Self::Native, right: Self::Native) -> Result { + let (l_days, l_ms) = Self::to_parts(left); + let (r_days, r_ms) = Self::to_parts(right); + let days = l_days.add_checked(r_days)?; + let ms = l_ms.add_checked(r_ms)?; + Ok(Self::make_value(days, ms)) + } + + fn sub(left: Self::Native, right: Self::Native) -> Result { + let (l_days, l_ms) = Self::to_parts(left); + let (r_days, r_ms) = Self::to_parts(right); + let days = l_days.sub_checked(r_days)?; + let ms = l_ms.sub_checked(r_ms)?; + Ok(Self::make_value(days, ms)) + } +} + +impl IntervalOp for IntervalMonthDayNanoType { + fn add(left: Self::Native, right: Self::Native) -> Result { + let (l_months, l_days, l_nanos) = Self::to_parts(left); + let (r_months, r_days, r_nanos) = Self::to_parts(right); + let months = l_months.add_checked(r_months)?; + let days = l_days.add_checked(r_days)?; + let nanos = l_nanos.add_checked(r_nanos)?; + Ok(Self::make_value(months, days, nanos)) + } + + fn sub(left: Self::Native, right: Self::Native) -> Result { + let (l_months, l_days, l_nanos) = Self::to_parts(left); + let (r_months, r_days, r_nanos) = Self::to_parts(right); + let months = l_months.sub_checked(r_months)?; + let days = l_days.sub_checked(r_days)?; + let nanos = l_nanos.sub_checked(r_nanos)?; + Ok(Self::make_value(months, days, nanos)) + } +} + +/// Perform arithmetic operation on an interval array +fn interval_op( + op: Op, + l: &dyn Array, + l_s: bool, + r: &dyn Array, + r_s: bool, +) -> Result { + let l = l.as_primitive::(); + let r = r.as_primitive::(); + match op { + Op::Add | Op::AddWrapping => Ok(try_op_ref!(T, l, l_s, r, r_s, T::add(l, r))), + Op::Sub | Op::SubWrapping => Ok(try_op_ref!(T, l, l_s, r, r_s, T::sub(l, r))), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Invalid interval arithmetic operation: {} {op:?} {}", + l.data_type(), + r.data_type() + ))), + } +} + +fn duration_op( + op: Op, + l: &dyn Array, + l_s: bool, + r: &dyn Array, + r_s: bool, +) -> Result { + let l = l.as_primitive::(); + let r = r.as_primitive::(); + match op { + Op::Add | Op::AddWrapping => Ok(try_op_ref!(T, l, l_s, r, r_s, l.add_checked(r))), + Op::Sub | Op::SubWrapping => Ok(try_op_ref!(T, l, l_s, r, r_s, l.sub_checked(r))), + _ => Err(ArrowError::InvalidArgumentError(format!( 
+ "Invalid duration arithmetic operation: {} {op:?} {}", + l.data_type(), + r.data_type() + ))), + } +} + +/// Perform arithmetic operation on a date array +fn date_op( + op: Op, + l: &dyn Array, + l_s: bool, + r: &dyn Array, + r_s: bool, +) -> Result { + use DataType::*; + use IntervalUnit::*; + + // Note: interval arithmetic should account for timezones (#4457) + let l = l.as_primitive::(); + match (op, r.data_type()) { + (Op::Add | Op::AddWrapping, Interval(YearMonth)) => { + let r = r.as_primitive::(); + Ok(op_ref!(T, l, l_s, r, r_s, T::add_year_month(l, r))) + } + (Op::Sub | Op::SubWrapping, Interval(YearMonth)) => { + let r = r.as_primitive::(); + Ok(op_ref!(T, l, l_s, r, r_s, T::sub_year_month(l, r))) + } + + (Op::Add | Op::AddWrapping, Interval(DayTime)) => { + let r = r.as_primitive::(); + Ok(op_ref!(T, l, l_s, r, r_s, T::add_day_time(l, r))) + } + (Op::Sub | Op::SubWrapping, Interval(DayTime)) => { + let r = r.as_primitive::(); + Ok(op_ref!(T, l, l_s, r, r_s, T::sub_day_time(l, r))) + } + + (Op::Add | Op::AddWrapping, Interval(MonthDayNano)) => { + let r = r.as_primitive::(); + Ok(op_ref!(T, l, l_s, r, r_s, T::add_month_day_nano(l, r))) + } + (Op::Sub | Op::SubWrapping, Interval(MonthDayNano)) => { + let r = r.as_primitive::(); + Ok(op_ref!(T, l, l_s, r, r_s, T::sub_month_day_nano(l, r))) + } + + _ => Err(ArrowError::InvalidArgumentError(format!( + "Invalid date arithmetic operation: {} {op:?} {}", + l.data_type(), + r.data_type() + ))), + } +} + +/// Perform arithmetic operation on decimal arrays +fn decimal_op( + op: Op, + l: &dyn Array, + l_s: bool, + r: &dyn Array, + r_s: bool, +) -> Result { + let l = l.as_primitive::(); + let r = r.as_primitive::(); + + let (p1, s1, p2, s2) = match (l.data_type(), r.data_type()) { + (DataType::Decimal128(p1, s1), DataType::Decimal128(p2, s2)) => (p1, s1, p2, s2), + (DataType::Decimal256(p1, s1), DataType::Decimal256(p2, s2)) => (p1, s1, p2, s2), + _ => unreachable!(), + }; + + // Follow the Hive decimal arithmetic rules + // https://cwiki.apache.org/confluence/download/attachments/27362075/Hive_Decimal_Precision_Scale_Support.pdf + let array: PrimitiveArray = match op { + Op::Add | Op::AddWrapping | Op::Sub | Op::SubWrapping => { + // max(s1, s2) + let result_scale = *s1.max(s2); + + // max(s1, s2) + max(p1-s1, p2-s2) + 1 + let result_precision = + (result_scale.saturating_add((*p1 as i8 - s1).max(*p2 as i8 - s2)) as u8) + .saturating_add(1) + .min(T::MAX_PRECISION); + + let l_mul = T::Native::usize_as(10).pow_wrapping((result_scale - s1) as _); + let r_mul = T::Native::usize_as(10).pow_wrapping((result_scale - s2) as _); + + match op { + Op::Add | Op::AddWrapping => { + try_op!( + l, + l_s, + r, + r_s, + l.mul_checked(l_mul)?.add_checked(r.mul_checked(r_mul)?) + ) + } + Op::Sub | Op::SubWrapping => { + try_op!( + l, + l_s, + r, + r_s, + l.mul_checked(l_mul)?.sub_checked(r.mul_checked(r_mul)?) + ) + } + _ => unreachable!(), + } + .with_precision_and_scale(result_precision, result_scale)? 
+ } + Op::Mul | Op::MulWrapping => { + let result_precision = p1.saturating_add(p2 + 1).min(T::MAX_PRECISION); + let result_scale = s1.saturating_add(*s2); + if result_scale > T::MAX_SCALE { + // SQL standard says that if the resulting scale of a multiply operation goes + // beyond the maximum, rounding is not acceptable and thus an error occurs + return Err(ArrowError::InvalidArgumentError(format!( + "Output scale of {} {op:?} {} would exceed max scale of {}", + l.data_type(), + r.data_type(), + T::MAX_SCALE + ))); + } + + try_op!(l, l_s, r, r_s, l.mul_checked(r)) + .with_precision_and_scale(result_precision, result_scale)? + } + + Op::Div => { + // Follow postgres and MySQL adding a fixed scale increment of 4 + // s1 + 4 + let result_scale = s1.saturating_add(4).min(T::MAX_SCALE); + let mul_pow = result_scale - s1 + s2; + + // p1 - s1 + s2 + result_scale + let result_precision = + (mul_pow.saturating_add(*p1 as i8) as u8).min(T::MAX_PRECISION); + + let (l_mul, r_mul) = match mul_pow.cmp(&0) { + Ordering::Greater => ( + T::Native::usize_as(10).pow_wrapping(mul_pow as _), + T::Native::ONE, + ), + Ordering::Equal => (T::Native::ONE, T::Native::ONE), + Ordering::Less => ( + T::Native::ONE, + T::Native::usize_as(10).pow_wrapping(mul_pow.neg_wrapping() as _), + ), + }; + + try_op!( + l, + l_s, + r, + r_s, + l.mul_checked(l_mul)?.div_checked(r.mul_checked(r_mul)?) + ) + .with_precision_and_scale(result_precision, result_scale)? + } + + Op::Rem => { + // max(s1, s2) + let result_scale = *s1.max(s2); + // min(p1-s1, p2 -s2) + max( s1,s2 ) + let result_precision = + (result_scale.saturating_add((*p1 as i8 - s1).min(*p2 as i8 - s2)) as u8) + .min(T::MAX_PRECISION); + + let l_mul = T::Native::usize_as(10).pow_wrapping((result_scale - s1) as _); + let r_mul = T::Native::usize_as(10).pow_wrapping((result_scale - s2) as _); + + try_op!( + l, + l_s, + r, + r_s, + l.mul_checked(l_mul)?.mod_checked(r.mul_checked(r_mul)?) + ) + .with_precision_and_scale(result_precision, result_scale)? + } + }; + + Ok(Arc::new(array)) +} diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 576f645b0375..8337326370dd 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -517,6 +517,15 @@ impl PrimitiveArray { Self::try_new(values, nulls).unwrap() } + /// Create a new [`PrimitiveArray`] of the given length where all values are null + pub fn new_null(length: usize) -> Self { + Self { + data_type: T::DATA_TYPE, + values: vec![T::Native::usize_as(0); length].into(), + nulls: Some(NullBuffer::new_null(length)), + } + } + /// Create a new [`PrimitiveArray`] from the provided values and nulls /// /// # Errors diff --git a/arrow-array/src/scalar.rs b/arrow-array/src/scalar.rs index e54a999f9980..c142107c5cf3 100644 --- a/arrow-array/src/scalar.rs +++ b/arrow-array/src/scalar.rs @@ -92,6 +92,12 @@ impl Datum for dyn Array { } } +impl Datum for &dyn Array { + fn get(&self) -> (&dyn Array, bool) { + (*self, false) + } +} + /// A wrapper around a single value [`Array`] indicating kernels should treat it as a scalar value /// /// See [`Datum`] for more information diff --git a/arrow/benches/arithmetic_kernels.rs b/arrow/benches/arithmetic_kernels.rs index 4ed197783b07..e982b0eb4b5f 100644 --- a/arrow/benches/arithmetic_kernels.rs +++ b/arrow/benches/arithmetic_kernels.rs @@ -15,65 +15,61 @@ // specific language governing permissions and limitations // under the License. 
-#[macro_use] -extern crate criterion; -use criterion::Criterion; -use rand::Rng; +use criterion::*; extern crate arrow; +use arrow::compute::kernels::numeric::*; use arrow::datatypes::Float32Type; use arrow::util::bench_util::*; -use arrow::{compute::kernels::arithmetic::*, util::test_util::seedable_rng}; +use arrow_array::Scalar; fn add_benchmark(c: &mut Criterion) { const BATCH_SIZE: usize = 64 * 1024; for null_density in [0., 0.1, 0.5, 0.9, 1.0] { let arr_a = create_primitive_array::(BATCH_SIZE, null_density); let arr_b = create_primitive_array::(BATCH_SIZE, null_density); - let scalar = seedable_rng().gen(); + let scalar_a = create_primitive_array::(1, 0.); + let scalar = Scalar::new(&scalar_a); c.bench_function(&format!("add({null_density})"), |b| { - b.iter(|| criterion::black_box(add(&arr_a, &arr_b).unwrap())) + b.iter(|| criterion::black_box(add_wrapping(&arr_a, &arr_b).unwrap())) }); c.bench_function(&format!("add_checked({null_density})"), |b| { - b.iter(|| criterion::black_box(add_checked(&arr_a, &arr_b).unwrap())) + b.iter(|| criterion::black_box(add(&arr_a, &arr_b).unwrap())) }); c.bench_function(&format!("add_scalar({null_density})"), |b| { - b.iter(|| criterion::black_box(add_scalar(&arr_a, scalar).unwrap())) + b.iter(|| criterion::black_box(add_wrapping(&arr_a, &scalar).unwrap())) }); c.bench_function(&format!("subtract({null_density})"), |b| { - b.iter(|| criterion::black_box(subtract(&arr_a, &arr_b).unwrap())) + b.iter(|| criterion::black_box(sub_wrapping(&arr_a, &arr_b).unwrap())) }); c.bench_function(&format!("subtract_checked({null_density})"), |b| { - b.iter(|| criterion::black_box(subtract_checked(&arr_a, &arr_b).unwrap())) + b.iter(|| criterion::black_box(sub(&arr_a, &arr_b).unwrap())) }); c.bench_function(&format!("subtract_scalar({null_density})"), |b| { - b.iter(|| criterion::black_box(subtract_scalar(&arr_a, scalar).unwrap())) + b.iter(|| criterion::black_box(sub_wrapping(&arr_a, &scalar).unwrap())) }); c.bench_function(&format!("multiply({null_density})"), |b| { - b.iter(|| criterion::black_box(multiply(&arr_a, &arr_b).unwrap())) + b.iter(|| criterion::black_box(mul_wrapping(&arr_a, &arr_b).unwrap())) }); c.bench_function(&format!("multiply_checked({null_density})"), |b| { - b.iter(|| criterion::black_box(multiply_checked(&arr_a, &arr_b).unwrap())) + b.iter(|| criterion::black_box(mul(&arr_a, &arr_b).unwrap())) }); c.bench_function(&format!("multiply_scalar({null_density})"), |b| { - b.iter(|| criterion::black_box(multiply_scalar(&arr_a, scalar).unwrap())) + b.iter(|| criterion::black_box(mul_wrapping(&arr_a, &scalar).unwrap())) }); c.bench_function(&format!("divide({null_density})"), |b| { - b.iter(|| criterion::black_box(divide(&arr_a, &arr_b).unwrap())) - }); - c.bench_function(&format!("divide_checked({null_density})"), |b| { - b.iter(|| criterion::black_box(divide_checked(&arr_a, &arr_b).unwrap())) + b.iter(|| criterion::black_box(div(&arr_a, &arr_b).unwrap())) }); c.bench_function(&format!("divide_scalar({null_density})"), |b| { - b.iter(|| criterion::black_box(divide_scalar(&arr_a, scalar).unwrap())) + b.iter(|| criterion::black_box(div(&arr_a, &scalar).unwrap())) }); c.bench_function(&format!("modulo({null_density})"), |b| { - b.iter(|| criterion::black_box(modulus(&arr_a, &arr_b).unwrap())) + b.iter(|| criterion::black_box(rem(&arr_a, &arr_b).unwrap())) }); c.bench_function(&format!("modulo_scalar({null_density})"), |b| { - b.iter(|| criterion::black_box(modulus_scalar(&arr_a, scalar).unwrap())) + b.iter(|| criterion::black_box(rem(&arr_a, 
&scalar).unwrap())) }); } } diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs index d9c948c607bd..49eae6d3ade5 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels/mod.rs @@ -19,7 +19,9 @@ pub mod limit; -pub use arrow_arith::{aggregate, arithmetic, arity, bitwise, boolean, temporal}; +pub use arrow_arith::{ + aggregate, arithmetic, arity, bitwise, boolean, numeric, temporal, +}; pub use arrow_cast::cast; pub use arrow_cast::parse as cast_utils; pub use arrow_ord::{partition, sort}; diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 12aa1309c552..a392d1deec86 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -105,6 +105,8 @@ To export an array, create an `ArrowArray` using [ArrowArray::try_new]. use std::{mem::size_of, ptr::NonNull, sync::Arc}; +pub use arrow_data::ffi::FFI_ArrowArray; +pub use arrow_schema::ffi::{FFI_ArrowSchema, Flags}; use arrow_schema::UnionMode; use crate::array::{layout, ArrayData}; @@ -113,9 +115,6 @@ use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; use crate::util::bit_util; -pub use arrow_data::ffi::FFI_ArrowArray; -pub use arrow_schema::ffi::{FFI_ArrowSchema, Flags}; - // returns the number of bits that buffer `i` (in the C data interface) is expected to have. // This is set by the Arrow specification fn bit_width(data_type: &DataType, i: usize) -> Result { @@ -412,7 +411,16 @@ impl<'a> ArrowArray<'a> { #[cfg(test)] mod tests { - use super::*; + use std::collections::HashMap; + use std::convert::TryFrom; + use std::mem::ManuallyDrop; + use std::ptr::addr_of_mut; + + use arrow_array::builder::UnionBuilder; + use arrow_array::cast::AsArray; + use arrow_array::types::{Float64Type, Int32Type}; + use arrow_array::{StructArray, UnionArray}; + use crate::array::{ make_array, Array, ArrayData, BooleanArray, Decimal128Array, DictionaryArray, DurationSecondArray, FixedSizeBinaryArray, FixedSizeListArray, @@ -421,14 +429,8 @@ mod tests { }; use crate::compute::kernels; use crate::datatypes::{Field, Int8Type}; - use arrow_array::builder::UnionBuilder; - use arrow_array::cast::AsArray; - use arrow_array::types::{Float64Type, Int32Type}; - use arrow_array::{StructArray, UnionArray}; - use std::collections::HashMap; - use std::convert::TryFrom; - use std::mem::ManuallyDrop; - use std::ptr::addr_of_mut; + + use super::*; #[test] fn test_round_trip() { @@ -440,10 +442,10 @@ mod tests { // (simulate consumer) import it let array = Int32Array::from(from_ffi(array, &schema).unwrap()); - let array = kernels::arithmetic::add(&array, &array).unwrap(); + let array = kernels::numeric::add(&array, &array).unwrap(); // verify - assert_eq!(array, Int32Array::from(vec![2, 4, 6])); + assert_eq!(array.as_ref(), &Int32Array::from(vec![2, 4, 6])); } #[test] @@ -491,10 +493,10 @@ mod tests { let array = array.as_any().downcast_ref::().unwrap(); assert_eq!(array, &Int32Array::from(vec![Some(2), None])); - let array = kernels::arithmetic::add(array, array).unwrap(); + let array = kernels::numeric::add(array, array).unwrap(); // verify - assert_eq!(array, Int32Array::from(vec![Some(4), None])); + assert_eq!(array.as_ref(), &Int32Array::from(vec![Some(4), None])); // (drop/release) Ok(()) From 9649d081497f7ef22725bb430b67db8d75ebbbec Mon Sep 17 00:00:00 2001 From: Max Burke Date: Sat, 8 Jul 2023 14:18:11 -0700 Subject: [PATCH 1053/1411] support FixedSizeBinary types in eq_dyn_binary_scalar/neq_dyn_binary_scalar (#4492) --- arrow-ord/src/comparison.rs | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 
insertions(+), 2 deletions(-) diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 4f8b9a322620..d18b0e36e930 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -841,9 +841,13 @@ pub fn eq_dyn_binary_scalar( ) -> Result { match left.data_type() { DataType::Binary => eq_binary_scalar(left.as_binary::(), right), + DataType::FixedSizeBinary(_) => { + let left = left.as_any().downcast_ref::().unwrap(); + compare_op_scalar(left, |a| a == right) + } DataType::LargeBinary => eq_binary_scalar(left.as_binary::(), right), _ => Err(ArrowError::ComputeError( - "eq_dyn_binary_scalar only supports Binary or LargeBinary arrays".to_string(), + "eq_dyn_binary_scalar only supports Binary / FixedSizeBinary / LargeBinary arrays".to_string(), )), } } @@ -857,8 +861,12 @@ pub fn neq_dyn_binary_scalar( match left.data_type() { DataType::Binary => neq_binary_scalar(left.as_binary::(), right), DataType::LargeBinary => neq_binary_scalar(left.as_binary::(), right), + DataType::FixedSizeBinary(_) => { + let left = left.as_any().downcast_ref::().unwrap(); + compare_op_scalar(left, |a| a != right) + } _ => Err(ArrowError::ComputeError( - "neq_dyn_binary_scalar only supports Binary or LargeBinary arrays" + "neq_dyn_binary_scalar only supports Binary / FixedSizeBinary / LargeBinary arrays" .to_string(), )), } @@ -4276,6 +4284,15 @@ mod tests { eq_dyn_binary_scalar(&large_array, scalar).unwrap(), expected ); + + let fsb_array = FixedSizeBinaryArray::try_from_iter( + vec![vec![0u8], vec![0u8], vec![0u8], vec![1u8]].into_iter(), + ) + .unwrap(); + let scalar = &[1u8]; + let expected = + BooleanArray::from(vec![Some(false), Some(false), Some(false), Some(true)]); + assert_eq!(eq_dyn_binary_scalar(&fsb_array, scalar).unwrap(), expected); } #[test] @@ -4293,6 +4310,15 @@ mod tests { neq_dyn_binary_scalar(&large_array, scalar).unwrap(), expected ); + + let fsb_array = FixedSizeBinaryArray::try_from_iter( + vec![vec![0u8], vec![0u8], vec![0u8], vec![1u8]].into_iter(), + ) + .unwrap(); + let scalar = &[1u8]; + let expected = + BooleanArray::from(vec![Some(true), Some(true), Some(true), Some(false)]); + assert_eq!(neq_dyn_binary_scalar(&fsb_array, scalar).unwrap(), expected); } #[test] From 66aab5a6227f08bf1c51ac4a69703b346cd29a44 Mon Sep 17 00:00:00 2001 From: Martin Date: Sat, 8 Jul 2023 17:36:43 -0400 Subject: [PATCH 1054/1411] add num-complex requirement (#4482) --- arrow-array/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index d4f0f9fa0d47..1990abfd2e35 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -47,6 +47,7 @@ arrow-data = { workspace = true } chrono = { version = "0.4.24", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4", default-features = false, features = ["std"] } +num-complex = "0.4.2" half = { version = "2.1", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", default-features = false } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } From 1f56959cd82718a7aa1fbc69e27275f87ab8d1ab Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Jul 2023 09:39:24 -0400 Subject: [PATCH 1055/1411] Update proc-macro2 requirement from =1.0.63 to =1.0.64 (#4500) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. 
- [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.63...1.0.64) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 743df85dc800..8700d9524dc1 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.63", default-features = false } +proc-macro2 = { version = "=1.0.64", default-features = false } prost-build = { version = "=0.11.9", default-features = false } tonic-build = { version = "=0.9.2", default-features = false, features = ["transport", "prost"] } From e5378be5cfd0648814f71e6bfd639ff2531b435e Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Mon, 10 Jul 2023 15:39:58 +0200 Subject: [PATCH 1056/1411] object_store/InMemory: Add `fork()` fn and deprecate `clone()` fn (#4499) --- object_store/src/memory.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 82d485997e88..98b3a15eecbd 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -287,14 +287,18 @@ impl InMemory { Self::default() } - /// Creates a clone of the store - pub async fn clone(&self) -> Self { + /// Creates a fork of the store, with the current content copied into the + /// new store. + pub fn fork(&self) -> Self { let storage = self.storage.read(); - let storage = storage.clone(); + let storage = Arc::new(RwLock::new(storage.clone())); + Self { storage } + } - Self { - storage: Arc::new(RwLock::new(storage)), - } + /// Creates a clone of the store + #[deprecated(note = "Use fork() instead")] + pub async fn clone(&self) -> Self { + self.fork() } async fn entry(&self, location: &Path) -> Result<(Bytes, DateTime)> { From 20af94b0acf8632e6512fad04b92e0602275d6ee Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 10 Jul 2023 09:59:08 -0400 Subject: [PATCH 1057/1411] Add negate kernels (#4488) (#4494) * Add negate kernels (#4488) * Fix doc * Add Inteval tests * Review feedback --- arrow-arith/src/arithmetic.rs | 2 + arrow-arith/src/numeric.rs | 236 ++++++++++++++++++++++++++++++++++ arrow-array/src/types.rs | 6 + 3 files changed, 244 insertions(+) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 4f6ecc78dc58..4566afc2e5c8 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -510,6 +510,7 @@ pub fn subtract_scalar_checked_dyn( /// /// This doesn't detect overflow. Once overflowing, the result will wrap around. /// For an overflow-checking variant, use `negate_checked` instead. +#[deprecated(note = "Use arrow_arith::numeric::neg_wrapping")] pub fn negate( array: &PrimitiveArray, ) -> Result, ArrowError> { @@ -520,6 +521,7 @@ pub fn negate( /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `negate` instead. 
+#[deprecated(note = "Use arrow_arith::numeric::neg")] pub fn negate_checked( array: &PrimitiveArray, ) -> Result, ArrowError> { diff --git a/arrow-arith/src/numeric.rs b/arrow-arith/src/numeric.rs index 816fcaa944f5..c2e867dc91c9 100644 --- a/arrow-arith/src/numeric.rs +++ b/arrow-arith/src/numeric.rs @@ -74,6 +74,97 @@ pub fn rem(lhs: &dyn Datum, rhs: &dyn Datum) -> Result { arithmetic_op(Op::Rem, lhs, rhs) } +macro_rules! neg_checked { + ($t:ty, $a:ident) => {{ + let array = $a + .as_primitive::<$t>() + .try_unary::<_, $t, _>(|x| x.neg_checked())?; + Ok(Arc::new(array)) + }}; +} + +macro_rules! neg_wrapping { + ($t:ty, $a:ident) => {{ + let array = $a.as_primitive::<$t>().unary::<_, $t>(|x| x.neg_wrapping()); + Ok(Arc::new(array)) + }}; +} + +/// Negates each element of `array`, returning an error on overflow +/// +/// Note: negation of unsigned arrays is not supported and will return in an error, +/// for wrapping unsigned negation consider using [`neg_wrapping`][neg_wrapping()] +pub fn neg(array: &dyn Array) -> Result { + use DataType::*; + use IntervalUnit::*; + use TimeUnit::*; + + match array.data_type() { + Int8 => neg_checked!(Int8Type, array), + Int16 => neg_checked!(Int16Type, array), + Int32 => neg_checked!(Int32Type, array), + Int64 => neg_checked!(Int64Type, array), + Float16 => neg_wrapping!(Float16Type, array), + Float32 => neg_wrapping!(Float32Type, array), + Float64 => neg_wrapping!(Float64Type, array), + Decimal128(p, s) => { + let a = array + .as_primitive::() + .try_unary::<_, Decimal128Type, _>(|x| x.neg_checked())?; + + Ok(Arc::new(a.with_precision_and_scale(*p, *s)?)) + } + Decimal256(p, s) => { + let a = array + .as_primitive::() + .try_unary::<_, Decimal256Type, _>(|x| x.neg_checked())?; + + Ok(Arc::new(a.with_precision_and_scale(*p, *s)?)) + } + Duration(Second) => neg_checked!(DurationSecondType, array), + Duration(Millisecond) => neg_checked!(DurationMillisecondType, array), + Duration(Microsecond) => neg_checked!(DurationMicrosecondType, array), + Duration(Nanosecond) => neg_checked!(DurationNanosecondType, array), + Interval(YearMonth) => neg_checked!(IntervalYearMonthType, array), + Interval(DayTime) => { + let a = array + .as_primitive::() + .try_unary::<_, IntervalDayTimeType, ArrowError>(|x| { + let (days, ms) = IntervalDayTimeType::to_parts(x); + Ok(IntervalDayTimeType::make_value( + days.neg_checked()?, + ms.neg_checked()?, + )) + })?; + Ok(Arc::new(a)) + } + Interval(MonthDayNano) => { + let a = array + .as_primitive::() + .try_unary::<_, IntervalMonthDayNanoType, ArrowError>(|x| { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(x); + Ok(IntervalMonthDayNanoType::make_value( + months.neg_checked()?, + days.neg_checked()?, + nanos.neg_checked()?, + )) + })?; + Ok(Arc::new(a)) + } + t => Err(ArrowError::InvalidArgumentError(format!( + "Invalid arithmetic operation: !{t}" + ))), + } +} + +/// Negates each element of `array`, wrapping on overflow for [`DataType::is_integer`] +pub fn neg_wrapping(array: &dyn Array) -> Result { + downcast_integer! 
{ + array.data_type() => (neg_wrapping, array), + _ => neg(array), + } +} + /// An enumeration of arithmetic operations /// /// This allows sharing the type dispatch logic across the various kernels @@ -670,3 +761,148 @@ fn decimal_op( Ok(Arc::new(array)) } + +#[cfg(test)] +mod tests { + use super::*; + use arrow_buffer::{i256, ScalarBuffer}; + + fn test_neg_primitive( + input: &[T::Native], + out: Result<&[T::Native], &str>, + ) { + let a = PrimitiveArray::::new(ScalarBuffer::from(input.to_vec()), None); + match out { + Ok(expected) => { + let result = neg(&a).unwrap(); + assert_eq!(result.as_primitive::().values(), expected); + } + Err(e) => { + let err = neg(&a).unwrap_err().to_string(); + assert_eq!(e, err); + } + } + } + + #[test] + fn test_neg() { + let input = &[1, -5, 2, 693, 3929]; + let output = &[-1, 5, -2, -693, -3929]; + test_neg_primitive::(input, Ok(output)); + + let input = &[1, -5, 2, 693, 3929]; + let output = &[-1, 5, -2, -693, -3929]; + test_neg_primitive::(input, Ok(output)); + test_neg_primitive::(input, Ok(output)); + test_neg_primitive::(input, Ok(output)); + test_neg_primitive::(input, Ok(output)); + test_neg_primitive::(input, Ok(output)); + + let input = &[f32::MAX, f32::MIN, f32::INFINITY, 1.3, 0.5]; + let output = &[f32::MIN, f32::MAX, f32::NEG_INFINITY, -1.3, -0.5]; + test_neg_primitive::(input, Ok(output)); + + test_neg_primitive::( + &[i32::MIN], + Err("Compute error: Overflow happened on: -2147483648"), + ); + test_neg_primitive::( + &[i64::MIN], + Err("Compute error: Overflow happened on: -9223372036854775808"), + ); + test_neg_primitive::( + &[i64::MIN], + Err("Compute error: Overflow happened on: -9223372036854775808"), + ); + + let r = neg_wrapping(&Int32Array::from(vec![i32::MIN])).unwrap(); + assert_eq!(r.as_primitive::().value(0), i32::MIN); + + let r = neg_wrapping(&Int64Array::from(vec![i64::MIN])).unwrap(); + assert_eq!(r.as_primitive::().value(0), i64::MIN); + + let err = neg_wrapping(&DurationSecondArray::from(vec![i64::MIN])) + .unwrap_err() + .to_string(); + + assert_eq!( + err, + "Compute error: Overflow happened on: -9223372036854775808" + ); + + let a = Decimal128Array::from(vec![1, 3, -44, 2, 4]) + .with_precision_and_scale(9, 6) + .unwrap(); + + let r = neg(&a).unwrap(); + assert_eq!(r.data_type(), a.data_type()); + assert_eq!( + r.as_primitive::().values(), + &[-1, -3, 44, -2, -4] + ); + + let a = Decimal256Array::from(vec![ + i256::from_i128(342), + i256::from_i128(-4949), + i256::from_i128(3), + ]) + .with_precision_and_scale(9, 6) + .unwrap(); + + let r = neg(&a).unwrap(); + assert_eq!(r.data_type(), a.data_type()); + assert_eq!( + r.as_primitive::().values(), + &[ + i256::from_i128(-342), + i256::from_i128(4949), + i256::from_i128(-3), + ] + ); + + let a = IntervalYearMonthArray::from(vec![ + IntervalYearMonthType::make_value(2, 4), + IntervalYearMonthType::make_value(2, -4), + IntervalYearMonthType::make_value(-3, -5), + ]); + let r = neg(&a).unwrap(); + assert_eq!( + r.as_primitive::().values(), + &[ + IntervalYearMonthType::make_value(-2, -4), + IntervalYearMonthType::make_value(-2, 4), + IntervalYearMonthType::make_value(3, 5), + ] + ); + + let a = IntervalDayTimeArray::from(vec![ + IntervalDayTimeType::make_value(2, 4), + IntervalDayTimeType::make_value(2, -4), + IntervalDayTimeType::make_value(-3, -5), + ]); + let r = neg(&a).unwrap(); + assert_eq!( + r.as_primitive::().values(), + &[ + IntervalDayTimeType::make_value(-2, -4), + IntervalDayTimeType::make_value(-2, 4), + IntervalDayTimeType::make_value(3, 5), + ] + ); + + let a 
= IntervalMonthDayNanoArray::from(vec![ + IntervalMonthDayNanoType::make_value(2, 4, 5953394), + IntervalMonthDayNanoType::make_value(2, -4, -45839), + IntervalMonthDayNanoType::make_value(-3, -5, 6944), + ]); + let r = neg(&a).unwrap(); + assert_eq!( + r.as_primitive::().values(), + &[ + IntervalMonthDayNanoType::make_value(-2, -4, -5953394), + IntervalMonthDayNanoType::make_value(-2, 4, 45839), + IntervalMonthDayNanoType::make_value(3, 5, -6944), + ] + ); + } +} diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index f99e6a8f6f81..0a65c64ad746 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -1001,6 +1001,7 @@ impl IntervalYearMonthType { /// /// * `years` - The number of years (+/-) represented in this interval /// * `months` - The number of months (+/-) represented in this interval + #[inline] pub fn make_value( years: i32, months: i32, @@ -1015,6 +1016,7 @@ impl IntervalYearMonthType { /// # Arguments /// /// * `i` - The IntervalYearMonthType::Native to convert + #[inline] pub fn to_months(i: ::Native) -> i32 { i } @@ -1027,6 +1029,7 @@ impl IntervalDayTimeType { /// /// * `days` - The number of days (+/-) represented in this interval /// * `millis` - The number of milliseconds (+/-) represented in this interval + #[inline] pub fn make_value( days: i32, millis: i32, @@ -1053,6 +1056,7 @@ impl IntervalDayTimeType { /// # Arguments /// /// * `i` - The IntervalDayTimeType to convert + #[inline] pub fn to_parts( i: ::Native, ) -> (i32, i32) { @@ -1070,6 +1074,7 @@ impl IntervalMonthDayNanoType { /// * `months` - The number of months (+/-) represented in this interval /// * `days` - The number of days (+/-) represented in this interval /// * `nanos` - The number of nanoseconds (+/-) represented in this interval + #[inline] pub fn make_value( months: i32, days: i32, @@ -1098,6 +1103,7 @@ impl IntervalMonthDayNanoType { /// # Arguments /// /// * `i` - The IntervalMonthDayNanoType to convert + #[inline] pub fn to_parts( i: ::Native, ) -> (i32, i32, i64) { From 29d4ae3c32d760b4a936734c18965b528baea4ca Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Jul 2023 10:01:51 -0400 Subject: [PATCH 1058/1411] Bump actions/labeler from 4.2.0 to 4.3.0 (#4501) Bumps [actions/labeler](https://github.com/actions/labeler) from 4.2.0 to 4.3.0. - [Release notes](https://github.com/actions/labeler/releases) - [Commits](https://github.com/actions/labeler/compare/v4.2.0...v4.3.0) --- updated-dependencies: - dependency-name: actions/labeler dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dev_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index daa5d6a76c52..bb88e9dcd3f5 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -44,7 +44,7 @@ jobs: github.event_name == 'pull_request_target' && (github.event.action == 'opened' || github.event.action == 'synchronize') - uses: actions/labeler@v4.2.0 + uses: actions/labeler@v4.3.0 with: repo-token: ${{ secrets.GITHUB_TOKEN }} configuration-path: .github/workflows/dev_pr/labeler.yml From 72b2ec53aa2d5f1d80605127b7e0611522ef9d71 Mon Sep 17 00:00:00 2001 From: eitsupi <50911393+eitsupi@users.noreply.github.com> Date: Mon, 10 Jul 2023 23:03:48 +0900 Subject: [PATCH 1059/1411] ci: verify MSRV on CI (#4490) --- .github/workflows/rust.yml | 24 ++++++++++++++++++++++++ arrow-flight/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- object_store/Cargo.toml | 1 + parquet/Cargo.toml | 2 +- 5 files changed, 28 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index e09e898fe160..6b316fd6bc43 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -100,3 +100,27 @@ jobs: run: rustup component add rustfmt - name: Run run: cargo fmt --all -- --check + + msrv: + name: Verify MSRV + runs-on: ubuntu-latest + container: + image: amd64/rust + steps: + - uses: actions/checkout@v3 + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + - name: Install cargo-msrv + run: cargo install cargo-msrv + - name: Check arrow + working-directory: arrow + run: cargo msrv verify + - name: Check parquet + working-directory: parquet + run: cargo msrv verify + - name: Check arrow-flight + working-directory: arrow-flight + run: cargo msrv verify + - name: Check object_store + working-directory: object_store + run: cargo msrv verify diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index ae9759b6685f..3ed426a21fab 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -20,7 +20,7 @@ name = "arrow-flight" description = "Apache Arrow Flight" version = { workspace = true } edition = { workspace = true } -rust-version = { workspace = true } +rust-version = "1.70.0" authors = { workspace = true } homepage = { workspace = true } repository = { workspace = true } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index ed4786fb3172..2b502f4a3b61 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -31,7 +31,7 @@ include = [ "Cargo.toml", ] edition = { workspace = true } -rust-version = { workspace = true } +rust-version = "1.70.0" [lib] name = "arrow" diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 5e2009d07013..255b972e32d8 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -24,6 +24,7 @@ readme = "README.md" description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." 
keywords = ["object", "storage", "cloud"] repository = "https://github.com/apache/arrow-rs/tree/master/object_store" +rust-version = "1.62.1" [package.metadata.docs.rs] all-features = true diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 52b0f049752c..a570e5f64b04 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -26,7 +26,7 @@ authors = { workspace = true } keywords = ["arrow", "parquet", "hadoop"] readme = "README.md" edition = { workspace = true } -rust-version = { workspace = true } +rust-version = "1.70.0" [target.'cfg(target_arch = "wasm32")'.dependencies] ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } From 6f2231353f92dbffea7d5fd9c93db2641c7ffc41 Mon Sep 17 00:00:00 2001 From: Zhang Li Date: Mon, 10 Jul 2023 23:10:07 +0800 Subject: [PATCH 1060/1411] fix incorrect buffer size limiting in parquet async writer (#4478) * fix incorrect buffer size limiting in parquet async writer * Format * Review feedback --------- Co-authored-by: zhangli20 Co-authored-by: Raphael Taylor-Davies --- parquet/src/arrow/async_writer/mod.rs | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/parquet/src/arrow/async_writer/mod.rs b/parquet/src/arrow/async_writer/mod.rs index 339618364324..4d8cf1b90640 100644 --- a/parquet/src/arrow/async_writer/mod.rs +++ b/parquet/src/arrow/async_writer/mod.rs @@ -77,12 +77,16 @@ pub struct AsyncArrowWriter { /// The inner buffer shared by the `sync_writer` and the `async_writer` shared_buffer: SharedBuffer, + + /// Trigger forced flushing once buffer size reaches this value + buffer_size: usize, } impl AsyncArrowWriter { /// Try to create a new Async Arrow Writer. /// - /// `buffer_size` determines the initial size of the intermediate buffer. + /// `buffer_size` determines the number of bytes to buffer before flushing + /// to the underlying [`AsyncWrite`] /// /// The intermediate buffer will automatically be resized if necessary /// @@ -102,6 +106,7 @@ impl AsyncArrowWriter { sync_writer, async_writer: writer, shared_buffer, + buffer_size, }) } @@ -111,7 +116,12 @@ impl AsyncArrowWriter { /// checked and flush if at least half full pub async fn write(&mut self, batch: &RecordBatch) -> Result<()> { self.sync_writer.write(batch)?; - Self::try_flush(&mut self.shared_buffer, &mut self.async_writer, false).await + Self::try_flush( + &mut self.shared_buffer, + &mut self.async_writer, + self.buffer_size, + ) + .await } /// Append [`KeyValue`] metadata in addition to those in [`WriterProperties`] @@ -128,7 +138,7 @@ impl AsyncArrowWriter { let metadata = self.sync_writer.close()?; // Force to flush the remaining data. 
- Self::try_flush(&mut self.shared_buffer, &mut self.async_writer, true).await?; + Self::try_flush(&mut self.shared_buffer, &mut self.async_writer, 0).await?; self.async_writer.shutdown().await?; Ok(metadata) @@ -139,10 +149,10 @@ impl AsyncArrowWriter { async fn try_flush( shared_buffer: &mut SharedBuffer, async_writer: &mut W, - force: bool, + buffer_size: usize, ) -> Result<()> { let mut buffer = shared_buffer.buffer.try_lock().unwrap(); - if !force && buffer.len() < buffer.capacity() / 2 { + if buffer.is_empty() || buffer.len() < buffer_size { // no need to flush return Ok(()); } From 6bbf2f0ce3e69de3ce1494917458cef56e9b6a8c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 10 Jul 2023 12:54:33 -0400 Subject: [PATCH 1061/1411] Add Datum Arithmetic tests, Fix Interval Substraction (#4480) (#4493) * Add arithmetic tests (#4480) * Fix test * Review feedback --- arrow-arith/src/arithmetic.rs | 4 +- arrow-arith/src/numeric.rs | 539 +++++++++++++++++++++++++++++++++- arrow-array/src/arithmetic.rs | 5 +- arrow-array/src/types.rs | 60 ++-- 4 files changed, 564 insertions(+), 44 deletions(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 4566afc2e5c8..f8c855af0183 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -1317,7 +1317,7 @@ mod tests { let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( c.value(0), - Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2023, 3, 27).unwrap()) + Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2023, 3, 28).unwrap()) ); } @@ -1364,7 +1364,7 @@ mod tests { let c = c.as_any().downcast_ref::().unwrap(); assert_eq!( c.value(0), - Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2023, 3, 27).unwrap()) + Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2023, 3, 28).unwrap()) ); } diff --git a/arrow-arith/src/numeric.rs b/arrow-arith/src/numeric.rs index c2e867dc91c9..9816d3e3d556 100644 --- a/arrow-arith/src/numeric.rs +++ b/arrow-arith/src/numeric.rs @@ -18,6 +18,7 @@ //! 
Defines numeric arithmetic kernels on [`PrimitiveArray`], such as [`add`] use std::cmp::Ordering; +use std::fmt::Formatter; use std::sync::Arc; use arrow_array::cast::AsArray; @@ -180,6 +181,18 @@ enum Op { Rem, } +impl std::fmt::Display for Op { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Op::AddWrapping | Op::Add => write!(f, "+"), + Op::SubWrapping | Op::Sub => write!(f, "-"), + Op::MulWrapping | Op::Mul => write!(f, "*"), + Op::Div => write!(f, "/"), + Op::Rem => write!(f, "%"), + } + } +} + impl Op { fn commutative(&self) -> bool { matches!(self, Self::Add | Self::AddWrapping) @@ -229,7 +242,7 @@ fn arithmetic_op( arithmetic_op(op, rhs, lhs) } _ => Err(ArrowError::InvalidArgumentError( - format!("Invalid arithmetic operation: {l_t} {op:?} {r_t}") + format!("Invalid arithmetic operation: {l_t} {op} {r_t}") )) } } @@ -433,7 +446,7 @@ fn timestamp_op( } _ => { return Err(ArrowError::InvalidArgumentError(format!( - "Invalid timestamp arithmetic operation: {} {op:?} {}", + "Invalid timestamp arithmetic operation: {} {op} {}", l.data_type(), r.data_type() ))) @@ -555,7 +568,7 @@ fn interval_op( Op::Add | Op::AddWrapping => Ok(try_op_ref!(T, l, l_s, r, r_s, T::add(l, r))), Op::Sub | Op::SubWrapping => Ok(try_op_ref!(T, l, l_s, r, r_s, T::sub(l, r))), _ => Err(ArrowError::InvalidArgumentError(format!( - "Invalid interval arithmetic operation: {} {op:?} {}", + "Invalid interval arithmetic operation: {} {op} {}", l.data_type(), r.data_type() ))), @@ -575,7 +588,7 @@ fn duration_op( Op::Add | Op::AddWrapping => Ok(try_op_ref!(T, l, l_s, r, r_s, l.add_checked(r))), Op::Sub | Op::SubWrapping => Ok(try_op_ref!(T, l, l_s, r, r_s, l.sub_checked(r))), _ => Err(ArrowError::InvalidArgumentError(format!( - "Invalid duration arithmetic operation: {} {op:?} {}", + "Invalid duration arithmetic operation: {} {op} {}", l.data_type(), r.data_type() ))), @@ -593,7 +606,6 @@ fn date_op( use DataType::*; use IntervalUnit::*; - // Note: interval arithmetic should account for timezones (#4457) let l = l.as_primitive::(); match (op, r.data_type()) { (Op::Add | Op::AddWrapping, Interval(YearMonth)) => { @@ -624,7 +636,7 @@ fn date_op( } _ => Err(ArrowError::InvalidArgumentError(format!( - "Invalid date arithmetic operation: {} {op:?} {}", + "Invalid date arithmetic operation: {} {op} {}", l.data_type(), r.data_type() ))), @@ -661,8 +673,8 @@ fn decimal_op( .saturating_add(1) .min(T::MAX_PRECISION); - let l_mul = T::Native::usize_as(10).pow_wrapping((result_scale - s1) as _); - let r_mul = T::Native::usize_as(10).pow_wrapping((result_scale - s2) as _); + let l_mul = T::Native::usize_as(10).pow_checked((result_scale - s1) as _)?; + let r_mul = T::Native::usize_as(10).pow_checked((result_scale - s2) as _)?; match op { Op::Add | Op::AddWrapping => { @@ -694,7 +706,7 @@ fn decimal_op( // SQL standard says that if the resulting scale of a multiply operation goes // beyond the maximum, rounding is not acceptable and thus an error occurs return Err(ArrowError::InvalidArgumentError(format!( - "Output scale of {} {op:?} {} would exceed max scale of {}", + "Output scale of {} {op} {} would exceed max scale of {}", l.data_type(), r.data_type(), T::MAX_SCALE @@ -717,13 +729,13 @@ fn decimal_op( let (l_mul, r_mul) = match mul_pow.cmp(&0) { Ordering::Greater => ( - T::Native::usize_as(10).pow_wrapping(mul_pow as _), + T::Native::usize_as(10).pow_checked(mul_pow as _)?, T::Native::ONE, ), Ordering::Equal => (T::Native::ONE, T::Native::ONE), Ordering::Less => ( T::Native::ONE, - 
T::Native::usize_as(10).pow_wrapping(mul_pow.neg_wrapping() as _), + T::Native::usize_as(10).pow_checked(mul_pow.neg_wrapping() as _)?, ), }; @@ -765,7 +777,9 @@ fn decimal_op( #[cfg(test)] mod tests { use super::*; + use arrow_array::temporal_conversions::{as_date, as_datetime}; use arrow_buffer::{i256, ScalarBuffer}; + use chrono::{DateTime, NaiveDate}; fn test_neg_primitive( input: &[T::Native], @@ -905,4 +919,507 @@ mod tests { ] ); } + + #[test] + fn test_integer() { + let a = Int32Array::from(vec![4, 3, 5, -6, 100]); + let b = Int32Array::from(vec![6, 2, 5, -7, 3]); + let result = add(&a, &b).unwrap(); + assert_eq!( + result.as_ref(), + &Int32Array::from(vec![10, 5, 10, -13, 103]) + ); + let result = sub(&a, &b).unwrap(); + assert_eq!(result.as_ref(), &Int32Array::from(vec![-2, 1, 0, 1, 97])); + let result = div(&a, &b).unwrap(); + assert_eq!(result.as_ref(), &Int32Array::from(vec![0, 1, 1, 0, 33])); + let result = mul(&a, &b).unwrap(); + assert_eq!(result.as_ref(), &Int32Array::from(vec![24, 6, 25, 42, 300])); + let result = rem(&a, &b).unwrap(); + assert_eq!(result.as_ref(), &Int32Array::from(vec![4, 1, 0, -6, 1])); + + let a = Int8Array::from(vec![Some(2), None, Some(45)]); + let b = Int8Array::from(vec![Some(5), Some(3), None]); + let result = add(&a, &b).unwrap(); + assert_eq!(result.as_ref(), &Int8Array::from(vec![Some(7), None, None])); + + let a = UInt8Array::from(vec![56, 5, 3]); + let b = UInt8Array::from(vec![200, 2, 5]); + let err = add(&a, &b).unwrap_err().to_string(); + assert_eq!(err, "Compute error: Overflow happened on: 56 + 200"); + let result = add_wrapping(&a, &b).unwrap(); + assert_eq!(result.as_ref(), &UInt8Array::from(vec![0, 7, 8])); + + let a = UInt8Array::from(vec![34, 5, 3]); + let b = UInt8Array::from(vec![200, 2, 5]); + let err = sub(&a, &b).unwrap_err().to_string(); + assert_eq!(err, "Compute error: Overflow happened on: 34 - 200"); + let result = sub_wrapping(&a, &b).unwrap(); + assert_eq!(result.as_ref(), &UInt8Array::from(vec![90, 3, 254])); + + let a = UInt8Array::from(vec![34, 5, 3]); + let b = UInt8Array::from(vec![200, 2, 5]); + let err = mul(&a, &b).unwrap_err().to_string(); + assert_eq!(err, "Compute error: Overflow happened on: 34 * 200"); + let result = mul_wrapping(&a, &b).unwrap(); + assert_eq!(result.as_ref(), &UInt8Array::from(vec![144, 10, 15])); + + let a = Int16Array::from(vec![i16::MIN]); + let b = Int16Array::from(vec![-1]); + let err = div(&a, &b).unwrap_err().to_string(); + assert_eq!(err, "Compute error: Overflow happened on: -32768 / -1"); + + let a = Int16Array::from(vec![21]); + let b = Int16Array::from(vec![0]); + let err = div(&a, &b).unwrap_err().to_string(); + assert_eq!(err, "Divide by zero error"); + + let a = Int16Array::from(vec![21]); + let b = Int16Array::from(vec![0]); + let err = rem(&a, &b).unwrap_err().to_string(); + assert_eq!(err, "Divide by zero error"); + } + + #[test] + fn test_float() { + let a = Float32Array::from(vec![1., f32::MAX, 6., -4., -1., 0.]); + let b = Float32Array::from(vec![1., f32::MAX, f32::MAX, -3., 45., 0.]); + let result = add(&a, &b).unwrap(); + assert_eq!( + result.as_ref(), + &Float32Array::from(vec![2., f32::INFINITY, f32::MAX, -7., 44.0, 0.]) + ); + + let result = sub(&a, &b).unwrap(); + assert_eq!( + result.as_ref(), + &Float32Array::from(vec![0., 0., f32::MIN, -1., -46., 0.]) + ); + + let result = mul(&a, &b).unwrap(); + assert_eq!( + result.as_ref(), + &Float32Array::from(vec![1., f32::INFINITY, f32::INFINITY, 12., -45., 0.]) + ); + + let result = div(&a, &b).unwrap(); + let r = 
result.as_primitive::(); + assert_eq!(r.value(0), 1.); + assert_eq!(r.value(1), 1.); + assert!(r.value(2) < f32::EPSILON); + assert_eq!(r.value(3), -4. / -3.); + assert!(r.value(5).is_nan()); + + let result = rem(&a, &b).unwrap(); + let r = result.as_primitive::(); + assert_eq!(&r.values()[..5], &[0., 0., 6., -1., -1.]); + assert!(r.value(5).is_nan()); + } + + #[test] + fn test_decimal() { + // 0.015 7.842 -0.577 0.334 -0.078 0.003 + let a = Decimal128Array::from(vec![15, 0, -577, 334, -78, 3]) + .with_precision_and_scale(12, 3) + .unwrap(); + + // 5.4 0 -35.6 0.3 0.6 7.45 + let b = Decimal128Array::from(vec![54, 34, -356, 3, 6, 745]) + .with_precision_and_scale(12, 1) + .unwrap(); + + let result = add(&a, &b).unwrap(); + assert_eq!(result.data_type(), &DataType::Decimal128(15, 3)); + assert_eq!( + result.as_primitive::().values(), + &[5415, 3400, -36177, 634, 522, 74503] + ); + + let result = sub(&a, &b).unwrap(); + assert_eq!(result.data_type(), &DataType::Decimal128(15, 3)); + assert_eq!( + result.as_primitive::().values(), + &[-5385, -3400, 35023, 34, -678, -74497] + ); + + let result = mul(&a, &b).unwrap(); + assert_eq!(result.data_type(), &DataType::Decimal128(25, 4)); + assert_eq!( + result.as_primitive::().values(), + &[810, 0, 205412, 1002, -468, 2235] + ); + + let result = div(&a, &b).unwrap(); + assert_eq!(result.data_type(), &DataType::Decimal128(17, 7)); + assert_eq!( + result.as_primitive::().values(), + &[27777, 0, 162078, 11133333, -1300000, 402] + ); + + let result = rem(&a, &b).unwrap(); + assert_eq!(result.data_type(), &DataType::Decimal128(12, 3)); + assert_eq!( + result.as_primitive::().values(), + &[15, 0, -577, 34, -78, 3] + ); + + let a = Decimal128Array::from(vec![1]) + .with_precision_and_scale(3, 3) + .unwrap(); + let b = Decimal128Array::from(vec![1]) + .with_precision_and_scale(37, 37) + .unwrap(); + let err = mul(&a, &b).unwrap_err().to_string(); + assert_eq!(err, "Invalid argument error: Output scale of Decimal128(3, 3) * Decimal128(37, 37) would exceed max scale of 38"); + + let a = Decimal128Array::from(vec![1]) + .with_precision_and_scale(3, -2) + .unwrap(); + let err = add(&a, &b).unwrap_err().to_string(); + assert_eq!(err, "Compute error: Overflow happened on: 10 ^ 39"); + + let a = Decimal128Array::from(vec![10]) + .with_precision_and_scale(3, -1) + .unwrap(); + let err = add(&a, &b).unwrap_err().to_string(); + assert_eq!(err, "Compute error: Overflow happened on: 10 * 100000000000000000000000000000000000000"); + + let b = Decimal128Array::from(vec![0]) + .with_precision_and_scale(1, 1) + .unwrap(); + let err = div(&a, &b).unwrap_err().to_string(); + assert_eq!(err, "Divide by zero error"); + let err = rem(&a, &b).unwrap_err().to_string(); + assert_eq!(err, "Divide by zero error"); + } + + fn test_timestamp_impl() { + let a = PrimitiveArray::::new(vec![2000000, 434030324, 53943340].into(), None); + let b = PrimitiveArray::::new(vec![329593, 59349, 694994].into(), None); + + let result = sub(&a, &b).unwrap(); + assert_eq!( + result.as_primitive::().values(), + &[1670407, 433970975, 53248346] + ); + + let r2 = add(&b, &result.as_ref()).unwrap(); + assert_eq!(r2.as_ref(), &a); + + let r3 = add(&result.as_ref(), &b).unwrap(); + assert_eq!(r3.as_ref(), &a); + + let format_array = |x: &dyn Array| -> Vec { + x.as_primitive::() + .values() + .into_iter() + .map(|x| as_datetime::(*x).unwrap().to_string()) + .collect() + }; + + let values = vec![ + "1970-01-01T00:00:00Z", + "2010-04-01T04:00:20Z", + "1960-01-30T04:23:20Z", + ] + .into_iter() + .map(|x| { + 
T::make_value(DateTime::parse_from_rfc3339(x).unwrap().naive_utc()).unwrap() + }) + .collect(); + + let a = PrimitiveArray::::new(values, None); + let b = IntervalYearMonthArray::from(vec![ + IntervalYearMonthType::make_value(5, 34), + IntervalYearMonthType::make_value(-2, 4), + IntervalYearMonthType::make_value(7, -4), + ]); + let r4 = add(&a, &b).unwrap(); + assert_eq!( + &format_array(r4.as_ref()), + &[ + "1977-11-01 00:00:00".to_string(), + "2008-08-01 04:00:20".to_string(), + "1966-09-30 04:23:20".to_string() + ] + ); + + let r5 = sub(&r4, &b).unwrap(); + assert_eq!(r5.as_ref(), &a); + + let b = IntervalDayTimeArray::from(vec![ + IntervalDayTimeType::make_value(5, 454000), + IntervalDayTimeType::make_value(-34, 0), + IntervalDayTimeType::make_value(7, -4000), + ]); + let r6 = add(&a, &b).unwrap(); + assert_eq!( + &format_array(r6.as_ref()), + &[ + "1970-01-06 00:07:34".to_string(), + "2010-02-26 04:00:20".to_string(), + "1960-02-06 04:23:16".to_string() + ] + ); + + let r7 = sub(&r6, &b).unwrap(); + assert_eq!(r7.as_ref(), &a); + + let b = IntervalMonthDayNanoArray::from(vec![ + IntervalMonthDayNanoType::make_value(344, 34, -43_000_000_000), + IntervalMonthDayNanoType::make_value(-593, -33, 13_000_000_000), + IntervalMonthDayNanoType::make_value(5, 2, 493_000_000_000), + ]); + let r8 = add(&a, &b).unwrap(); + assert_eq!( + &format_array(r8.as_ref()), + &[ + "1998-10-04 23:59:17".to_string(), + "1960-09-29 04:00:33".to_string(), + "1960-07-02 04:31:33".to_string() + ] + ); + + let r9 = sub(&r8, &b).unwrap(); + // Note: subtraction is not the inverse of addition for intervals + assert_eq!( + &format_array(r9.as_ref()), + &[ + "1970-01-02 00:00:00".to_string(), + "2010-04-02 04:00:20".to_string(), + "1960-01-31 04:23:20".to_string() + ] + ); + } + + #[test] + fn test_timestamp() { + test_timestamp_impl::(); + test_timestamp_impl::(); + test_timestamp_impl::(); + test_timestamp_impl::(); + } + + #[test] + fn test_interval() { + let a = IntervalYearMonthArray::from(vec![ + IntervalYearMonthType::make_value(32, 4), + IntervalYearMonthType::make_value(32, 4), + ]); + let b = IntervalYearMonthArray::from(vec![ + IntervalYearMonthType::make_value(-4, 6), + IntervalYearMonthType::make_value(-3, 23), + ]); + let result = add(&a, &b).unwrap(); + assert_eq!( + result.as_ref(), + &IntervalYearMonthArray::from(vec![ + IntervalYearMonthType::make_value(28, 10), + IntervalYearMonthType::make_value(29, 27) + ]) + ); + let result = sub(&a, &b).unwrap(); + assert_eq!( + result.as_ref(), + &IntervalYearMonthArray::from(vec![ + IntervalYearMonthType::make_value(36, -2), + IntervalYearMonthType::make_value(35, -19) + ]) + ); + + let a = IntervalDayTimeArray::from(vec![ + IntervalDayTimeType::make_value(32, 4), + IntervalDayTimeType::make_value(32, 4), + ]); + let b = IntervalDayTimeArray::from(vec![ + IntervalDayTimeType::make_value(-4, 6), + IntervalDayTimeType::make_value(-3, 23), + ]); + let result = add(&a, &b).unwrap(); + assert_eq!( + result.as_ref(), + &IntervalDayTimeArray::from(vec![ + IntervalDayTimeType::make_value(28, 10), + IntervalDayTimeType::make_value(29, 27) + ]) + ); + let result = sub(&a, &b).unwrap(); + assert_eq!( + result.as_ref(), + &IntervalDayTimeArray::from(vec![ + IntervalDayTimeType::make_value(36, -2), + IntervalDayTimeType::make_value(35, -19) + ]) + ); + let a = IntervalMonthDayNanoArray::from(vec![ + IntervalMonthDayNanoType::make_value(32, 4, 4000000000000), + IntervalMonthDayNanoType::make_value(32, 4, 45463000000000000), + ]); + let b = 
IntervalMonthDayNanoArray::from(vec![ + IntervalMonthDayNanoType::make_value(-4, 6, 46000000000000), + IntervalMonthDayNanoType::make_value(-3, 23, 3564000000000000), + ]); + let result = add(&a, &b).unwrap(); + assert_eq!( + result.as_ref(), + &IntervalMonthDayNanoArray::from(vec![ + IntervalMonthDayNanoType::make_value(28, 10, 50000000000000), + IntervalMonthDayNanoType::make_value(29, 27, 49027000000000000) + ]) + ); + let result = sub(&a, &b).unwrap(); + assert_eq!( + result.as_ref(), + &IntervalMonthDayNanoArray::from(vec![ + IntervalMonthDayNanoType::make_value(36, -2, -42000000000000), + IntervalMonthDayNanoType::make_value(35, -19, 41899000000000000) + ]) + ); + let a = IntervalMonthDayNanoArray::from(vec![i64::MAX as i128]); + let b = IntervalMonthDayNanoArray::from(vec![1]); + let err = add(&a, &b).unwrap_err().to_string(); + assert_eq!( + err, + "Compute error: Overflow happened on: 9223372036854775807 + 1" + ); + } + + fn test_duration_impl>() { + let a = PrimitiveArray::::new(vec![1000, 4394, -3944].into(), None); + let b = PrimitiveArray::::new(vec![4, -5, -243].into(), None); + + let result = add(&a, &b).unwrap(); + assert_eq!(result.as_primitive::().values(), &[1004, 4389, -4187]); + let result = sub(&a, &b).unwrap(); + assert_eq!(result.as_primitive::().values(), &[996, 4399, -3701]); + + let err = mul(&a, &b).unwrap_err().to_string(); + assert!( + err.contains("Invalid duration arithmetic operation"), + "{err}" + ); + + let err = div(&a, &b).unwrap_err().to_string(); + assert!( + err.contains("Invalid duration arithmetic operation"), + "{err}" + ); + + let err = rem(&a, &b).unwrap_err().to_string(); + assert!( + err.contains("Invalid duration arithmetic operation"), + "{err}" + ); + + let a = PrimitiveArray::::new(vec![i64::MAX].into(), None); + let b = PrimitiveArray::::new(vec![1].into(), None); + let err = add(&a, &b).unwrap_err().to_string(); + assert_eq!( + err, + "Compute error: Overflow happened on: 9223372036854775807 + 1" + ); + } + + #[test] + fn test_duration() { + test_duration_impl::(); + test_duration_impl::(); + test_duration_impl::(); + test_duration_impl::(); + } + + fn test_date_impl(f: F) + where + F: Fn(NaiveDate) -> T::Native, + T::Native: TryInto, + { + let a = PrimitiveArray::::new( + vec![ + f(NaiveDate::from_ymd_opt(1979, 1, 30).unwrap()), + f(NaiveDate::from_ymd_opt(2010, 4, 3).unwrap()), + f(NaiveDate::from_ymd_opt(2008, 2, 29).unwrap()), + ] + .into(), + None, + ); + + let b = IntervalYearMonthArray::from(vec![ + IntervalYearMonthType::make_value(34, 2), + IntervalYearMonthType::make_value(3, -3), + IntervalYearMonthType::make_value(-12, 4), + ]); + + let format_array = |x: &dyn Array| -> Vec { + x.as_primitive::() + .values() + .into_iter() + .map(|x| { + as_date::((*x).try_into().ok().unwrap()) + .unwrap() + .to_string() + }) + .collect() + }; + + let result = add(&a, &b).unwrap(); + assert_eq!( + &format_array(result.as_ref()), + &[ + "2013-03-30".to_string(), + "2013-01-03".to_string(), + "1996-06-29".to_string(), + ] + ); + let result = sub(&result, &b).unwrap(); + assert_eq!(result.as_ref(), &a); + + let b = IntervalDayTimeArray::from(vec![ + IntervalDayTimeType::make_value(34, 2), + IntervalDayTimeType::make_value(3, -3), + IntervalDayTimeType::make_value(-12, 4), + ]); + + let result = add(&a, &b).unwrap(); + assert_eq!( + &format_array(result.as_ref()), + &[ + "1979-03-05".to_string(), + "2010-04-06".to_string(), + "2008-02-17".to_string(), + ] + ); + let result = sub(&result, &b).unwrap(); + assert_eq!(result.as_ref(), &a); + + let b 
= IntervalMonthDayNanoArray::from(vec![ + IntervalMonthDayNanoType::make_value(34, 2, -34353534), + IntervalMonthDayNanoType::make_value(3, -3, 2443), + IntervalMonthDayNanoType::make_value(-12, 4, 2323242423232), + ]); + + let result = add(&a, &b).unwrap(); + assert_eq!( + &format_array(result.as_ref()), + &[ + "1981-12-02".to_string(), + "2010-06-30".to_string(), + "2007-03-04".to_string(), + ] + ); + let result = sub(&result, &b).unwrap(); + assert_eq!( + &format_array(result.as_ref()), + &[ + "1979-01-31".to_string(), + "2010-04-02".to_string(), + "2008-02-29".to_string(), + ] + ); + } + + #[test] + fn test_date() { + test_date_impl::(Date32Type::from_naive_date); + test_date_impl::(Date64Type::from_naive_date); + } } diff --git a/arrow-array/src/arithmetic.rs b/arrow-array/src/arithmetic.rs index abeb46b99688..b0ecef70ee19 100644 --- a/arrow-array/src/arithmetic.rs +++ b/arrow-array/src/arithmetic.rs @@ -229,7 +229,10 @@ macro_rules! native_type_op { #[inline] fn pow_checked(self, exp: u32) -> Result { self.checked_pow(exp).ok_or_else(|| { - ArrowError::ComputeError(format!("Overflow happened on: {:?}", self)) + ArrowError::ComputeError(format!( + "Overflow happened on: {:?} ^ {exp:?}", + self + )) }) } diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 0a65c64ad746..3d14cff384b8 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -457,17 +457,17 @@ impl TimestampSecondType { timestamp: ::Native, delta: ::Native, ) -> Result<::Native, ArrowError> { - let (days, ms) = IntervalDayTimeType::to_parts(-delta); + let (days, ms) = IntervalDayTimeType::to_parts(delta); let res = NaiveDateTime::from_timestamp_opt(timestamp, 0).ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; let res = res - .checked_add_signed(Duration::days(days as i64)) + .checked_sub_signed(Duration::days(days as i64)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; let res = res - .checked_add_signed(Duration::microseconds(ms as i64)) + .checked_sub_signed(Duration::milliseconds(ms as i64)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; @@ -491,12 +491,12 @@ impl TimestampSecondType { })?; let res = shift_months(res, -months); let res = res - .checked_add_signed(Duration::days(-days as i64)) + .checked_sub_signed(Duration::days(days as i64)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; let res = res - .checked_add_signed(Duration::nanoseconds(-nanos)) + .checked_sub_signed(Duration::nanoseconds(nanos)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; @@ -616,17 +616,17 @@ impl TimestampMicrosecondType { delta: ::Native, ) -> Result<::Native, ArrowError> { - let (days, ms) = IntervalDayTimeType::to_parts(-delta); + let (days, ms) = IntervalDayTimeType::to_parts(delta); let res = NaiveDateTime::from_timestamp_micros(timestamp).ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; let res = res - .checked_add_signed(Duration::days(days as i64)) + .checked_sub_signed(Duration::days(days as i64)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; let res = res - .checked_add_signed(Duration::milliseconds(ms as i64)) + .checked_sub_signed(Duration::milliseconds(ms as i64)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; @@ -651,12 +651,12 @@ impl TimestampMicrosecondType { })?; let res = shift_months(res, 
-months); let res = res - .checked_add_signed(Duration::days(-days as i64)) + .checked_sub_signed(Duration::days(days as i64)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; let res = res - .checked_add_signed(Duration::nanoseconds(-nanos)) + .checked_sub_signed(Duration::nanoseconds(nanos)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; @@ -776,17 +776,17 @@ impl TimestampMillisecondType { delta: ::Native, ) -> Result<::Native, ArrowError> { - let (days, ms) = IntervalDayTimeType::to_parts(-delta); + let (days, ms) = IntervalDayTimeType::to_parts(delta); let res = NaiveDateTime::from_timestamp_millis(timestamp).ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; let res = res - .checked_add_signed(Duration::days(days as i64)) + .checked_sub_signed(Duration::days(days as i64)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; let res = res - .checked_add_signed(Duration::milliseconds(ms as i64)) + .checked_sub_signed(Duration::milliseconds(ms as i64)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; @@ -811,12 +811,12 @@ impl TimestampMillisecondType { })?; let res = shift_months(res, -months); let res = res - .checked_add_signed(Duration::days(-days as i64)) + .checked_sub_signed(Duration::days(days as i64)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; let res = res - .checked_add_signed(Duration::nanoseconds(-nanos)) + .checked_sub_signed(Duration::nanoseconds(nanos)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; @@ -946,14 +946,14 @@ impl TimestampNanosecondType { || ArrowError::ComputeError("Timestamp out of range".to_string()), )?; - let (days, ms) = IntervalDayTimeType::to_parts(-delta); + let (days, ms) = IntervalDayTimeType::to_parts(delta); let res = res - .checked_add_signed(Duration::days(days as i64)) + .checked_sub_signed(Duration::days(days as i64)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; let res = res - .checked_add_signed(Duration::milliseconds(ms as i64)) + .checked_sub_signed(Duration::milliseconds(ms as i64)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; @@ -980,12 +980,12 @@ impl TimestampNanosecondType { let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); let res = shift_months(res, -months); let res = res - .checked_add_signed(Duration::days(-days as i64)) + .checked_sub_signed(Duration::days(days as i64)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; let res = res - .checked_add_signed(Duration::nanoseconds(-nanos)) + .checked_sub_signed(Duration::nanoseconds(nanos)) .ok_or_else(|| { ArrowError::ComputeError("Timestamp out of range".to_string()) })?; @@ -1212,10 +1212,10 @@ impl Date32Type { date: ::Native, delta: ::Native, ) -> ::Native { - let (days, ms) = IntervalDayTimeType::to_parts(-delta); + let (days, ms) = IntervalDayTimeType::to_parts(delta); let res = Date32Type::to_naive_date(date); - let res = res.add(Duration::days(days as i64)); - let res = res.add(Duration::milliseconds(ms as i64)); + let res = res.sub(Duration::days(days as i64)); + let res = res.sub(Duration::milliseconds(ms as i64)); Date32Type::from_naive_date(res) } @@ -1232,8 +1232,8 @@ impl Date32Type { let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); let res = 
Date32Type::to_naive_date(date); let res = shift_months(res, -months); - let res = res.add(Duration::days(-days as i64)); - let res = res.add(Duration::nanoseconds(-nanos)); + let res = res.sub(Duration::days(days as i64)); + let res = res.sub(Duration::nanoseconds(nanos)); Date32Type::from_naive_date(res) } } @@ -1336,10 +1336,10 @@ impl Date64Type { date: ::Native, delta: ::Native, ) -> ::Native { - let (days, ms) = IntervalDayTimeType::to_parts(-delta); + let (days, ms) = IntervalDayTimeType::to_parts(delta); let res = Date64Type::to_naive_date(date); - let res = res.add(Duration::days(days as i64)); - let res = res.add(Duration::milliseconds(ms as i64)); + let res = res.sub(Duration::days(days as i64)); + let res = res.sub(Duration::milliseconds(ms as i64)); Date64Type::from_naive_date(res) } @@ -1356,8 +1356,8 @@ impl Date64Type { let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); let res = Date64Type::to_naive_date(date); let res = shift_months(res, -months); - let res = res.add(Duration::days(-days as i64)); - let res = res.add(Duration::nanoseconds(-nanos)); + let res = res.sub(Duration::days(days as i64)); + let res = res.sub(Duration::nanoseconds(nanos)); Date64Type::from_naive_date(res) } } From 8da2f97bfd9a613c02acbd4b329d11937ca6257f Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Mon, 10 Jul 2023 20:04:50 +0200 Subject: [PATCH 1062/1411] object_store: Implement `ObjectStore` for `Arc` (#4502) * object_store: Add `Box` tests * object_store: Extract `as_ref_impl!()` macro * object_store: Implement `ObjectStore` for `Arc` --- object_store/src/lib.rs | 179 ++++++++++++++++++++----------------- object_store/src/memory.rs | 26 ++++++ 2 files changed, 125 insertions(+), 80 deletions(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 864cabc4a8c0..97e6aae97139 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -270,6 +270,7 @@ use std::fmt::{Debug, Formatter}; #[cfg(not(target_arch = "wasm32"))] use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; +use std::sync::Arc; use tokio::io::AsyncWrite; #[cfg(any(feature = "azure", feature = "aws", feature = "gcp", feature = "http"))] @@ -526,105 +527,123 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { } } -#[async_trait] -impl ObjectStore for Box { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.as_ref().put(location, bytes).await - } +macro_rules! 
as_ref_impl { + ($type:ty) => { + #[async_trait] + impl ObjectStore for $type { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + self.as_ref().put(location, bytes).await + } - async fn put_multipart( - &self, - location: &Path, - ) -> Result<(MultipartId, Box)> { - self.as_ref().put_multipart(location).await - } + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + self.as_ref().put_multipart(location).await + } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { - self.as_ref().abort_multipart(location, multipart_id).await - } + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + self.as_ref().abort_multipart(location, multipart_id).await + } - async fn append( - &self, - location: &Path, - ) -> Result> { - self.as_ref().append(location).await - } + async fn append( + &self, + location: &Path, + ) -> Result> { + self.as_ref().append(location).await + } - async fn get(&self, location: &Path) -> Result { - self.as_ref().get(location).await - } + async fn get(&self, location: &Path) -> Result { + self.as_ref().get(location).await + } - async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - self.as_ref().get_opts(location, options).await - } + async fn get_opts( + &self, + location: &Path, + options: GetOptions, + ) -> Result { + self.as_ref().get_opts(location, options).await + } - async fn get_range(&self, location: &Path, range: Range) -> Result { - self.as_ref().get_range(location, range).await - } + async fn get_range( + &self, + location: &Path, + range: Range, + ) -> Result { + self.as_ref().get_range(location, range).await + } - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { - self.as_ref().get_ranges(location, ranges).await - } + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range], + ) -> Result> { + self.as_ref().get_ranges(location, ranges).await + } - async fn head(&self, location: &Path) -> Result { - self.as_ref().head(location).await - } + async fn head(&self, location: &Path) -> Result { + self.as_ref().head(location).await + } - async fn delete(&self, location: &Path) -> Result<()> { - self.as_ref().delete(location).await - } + async fn delete(&self, location: &Path) -> Result<()> { + self.as_ref().delete(location).await + } - fn delete_stream<'a>( - &'a self, - locations: BoxStream<'a, Result>, - ) -> BoxStream<'a, Result> { - self.as_ref().delete_stream(locations) - } + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + self.as_ref().delete_stream(locations) + } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - self.as_ref().list(prefix).await - } + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result>> { + self.as_ref().list(prefix).await + } - async fn list_with_offset( - &self, - prefix: Option<&Path>, - offset: &Path, - ) -> Result>> { - self.as_ref().list_with_offset(prefix, offset).await - } + async fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> Result>> { + self.as_ref().list_with_offset(prefix, offset).await + } - async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - self.as_ref().list_with_delimiter(prefix).await - } + async fn list_with_delimiter( + &self, + prefix: Option<&Path>, + ) -> Result { + self.as_ref().list_with_delimiter(prefix).await + } - async fn copy(&self, from: &Path, to: 
&Path) -> Result<()> { - self.as_ref().copy(from, to).await - } + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().copy(from, to).await + } - async fn rename(&self, from: &Path, to: &Path) -> Result<()> { - self.as_ref().rename(from, to).await - } + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().rename(from, to).await + } - async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - self.as_ref().copy_if_not_exists(from, to).await - } + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().copy_if_not_exists(from, to).await + } - async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - self.as_ref().rename_if_not_exists(from, to).await - } + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.as_ref().rename_if_not_exists(from, to).await + } + } + }; } +as_ref_impl!(Arc); +as_ref_impl!(Box); + /// Result of a list call that includes objects, prefixes (directories) and a /// token for the next set of results. Individual result sets may be limited to /// 1,000 objects based on the underlying object storage's limitations. diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 98b3a15eecbd..cfc2ac823036 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -415,6 +415,32 @@ mod tests { stream_get(&integration).await; } + #[tokio::test] + async fn box_test() { + let integration: Box = Box::new(InMemory::new()); + + put_get_delete_list(&integration).await; + get_opts(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; + } + + #[tokio::test] + async fn arc_test() { + let integration: Arc = Arc::new(InMemory::new()); + + put_get_delete_list(&integration).await; + get_opts(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; + } + #[tokio::test] async fn unknown_length() { let integration = InMemory::new(); From 6e0faaf24d84cded196be3abfccd80a154a92740 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 11 Jul 2023 10:59:46 -0400 Subject: [PATCH 1063/1411] Bump actions/upload-pages-artifact from 1 to 2 (#4508) Bumps [actions/upload-pages-artifact](https://github.com/actions/upload-pages-artifact) from 1 to 2. - [Release notes](https://github.com/actions/upload-pages-artifact/releases) - [Commits](https://github.com/actions/upload-pages-artifact/compare/v1...v2) --- updated-dependencies: - dependency-name: actions/upload-pages-artifact dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 7e80aea6b978..4ca71f464591 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -64,7 +64,7 @@ jobs: echo "::warning title=Invalid file permissions automatically fixed::$line" done - name: Upload artifacts - uses: actions/upload-pages-artifact@v1 + uses: actions/upload-pages-artifact@v2 with: name: crate-docs path: target/doc From 8bcb3fc4ec458f7fdf4a98de199620e1164281df Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 12 Jul 2023 12:45:44 -0400 Subject: [PATCH 1064/1411] Support Date - Date (#4383) (#4504) * Support Date - Date (#4383) * Review feedback --- arrow-arith/src/numeric.rs | 52 +++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/arrow-arith/src/numeric.rs b/arrow-arith/src/numeric.rs index 9816d3e3d556..b0bbb75c129b 100644 --- a/arrow-arith/src/numeric.rs +++ b/arrow-arith/src/numeric.rs @@ -606,8 +606,34 @@ fn date_op( use DataType::*; use IntervalUnit::*; + const NUM_SECONDS_IN_DAY: i64 = 60 * 60 * 24; + + let r_t = r.data_type(); + match (T::DATA_TYPE, op, r_t) { + (Date32, Op::Sub | Op::SubWrapping, Date32) => { + let l = l.as_primitive::(); + let r = r.as_primitive::(); + return Ok(op_ref!( + DurationSecondType, + l, + l_s, + r, + r_s, + ((l as i64) - (r as i64)) * NUM_SECONDS_IN_DAY + )); + } + (Date64, Op::Sub | Op::SubWrapping, Date64) => { + let l = l.as_primitive::(); + let r = r.as_primitive::(); + let result = + try_op_ref!(DurationMillisecondType, l, l_s, r, r_s, l.sub_checked(r)); + return Ok(result); + } + _ => {} + } + let l = l.as_primitive::(); - match (op, r.data_type()) { + match (op, r_t) { (Op::Add | Op::AddWrapping, Interval(YearMonth)) => { let r = r.as_primitive::(); Ok(op_ref!(T, l, l_s, r, r_s, T::add_year_month(l, r))) @@ -1421,5 +1447,29 @@ mod tests { fn test_date() { test_date_impl::(Date32Type::from_naive_date); test_date_impl::(Date64Type::from_naive_date); + + let a = Date32Array::from(vec![i32::MIN, i32::MAX, 23, 7684]); + let b = Date32Array::from(vec![i32::MIN, i32::MIN, -2, 45]); + let result = sub(&a, &b).unwrap(); + assert_eq!( + result.as_primitive::().values(), + &[0, 371085174288000, 2160000, 660009600] + ); + + let a = Date64Array::from(vec![4343, 76676, 3434]); + let b = Date64Array::from(vec![3, -5, 5]); + let result = sub(&a, &b).unwrap(); + assert_eq!( + result.as_primitive::().values(), + &[4340, 76681, 3429] + ); + + let a = Date64Array::from(vec![i64::MAX]); + let b = Date64Array::from(vec![-1]); + let err = sub(&a, &b).unwrap_err().to_string(); + assert_eq!( + err, + "Compute error: Overflow happened on: 9223372036854775807 - -1" + ); } } From e169355220b67a7c47f92b5d5c7dc6996b38c905 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 12 Jul 2023 18:42:45 -0400 Subject: [PATCH 1065/1411] Cleanup cast_primitive_to_list (#4511) * Cleanup cast_primitive_to_list * Review feedback --- arrow-cast/src/cast.rs | 67 ++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 39 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 95c0a63a3a4e..814ad6589f2d 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -49,7 +49,7 @@ use 
crate::parse::{ use arrow_array::{ builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *, }; -use arrow_buffer::{i256, ArrowNativeType, Buffer, MutableBuffer, ScalarBuffer}; +use arrow_buffer::{i256, ArrowNativeType, Buffer, OffsetBuffer, ScalarBuffer}; use arrow_data::ArrayData; use arrow_schema::*; use arrow_select::take::take; @@ -849,12 +849,8 @@ pub fn cast_with_options( } } - (_, List(ref to)) => { - cast_primitive_to_list::(array, to, to_type, cast_options) - } - (_, LargeList(ref to)) => { - cast_primitive_to_list::(array, to, to_type, cast_options) - } + (_, List(ref to)) => cast_values_to_list::(array, to, cast_options), + (_, LargeList(ref to)) => cast_values_to_list::(array, to, cast_options), (Decimal128(_, s1), Decimal128(p2, s2)) => { cast_decimal_to_decimal_same_type::( array.as_primitive(), @@ -3645,39 +3641,15 @@ where } /// Helper function that takes a primitive array and casts to a (generic) list array. -fn cast_primitive_to_list( +fn cast_values_to_list( array: &dyn Array, - to: &Field, - to_type: &DataType, + to: &FieldRef, cast_options: &CastOptions, ) -> Result { - // cast primitive to list's primitive - let cast_array = cast_with_options(array, to.data_type(), cast_options)?; - // create offsets, where if array.len() = 2, we have [0,1,2] - // Safety: - // Length of range can be trusted. - // Note: could not yet create a generic range in stable Rust. - let offsets = unsafe { - MutableBuffer::from_trusted_len_iter( - (0..=array.len()).map(|i| OffsetSize::from(i).expect("integer")), - ) - }; - - let list_data = unsafe { - ArrayData::new_unchecked( - to_type.clone(), - array.len(), - Some(cast_array.null_count()), - cast_array.nulls().map(|b| b.inner().sliced()), - 0, - vec![offsets.into()], - vec![cast_array.into_data()], - ) - }; - let list_array = - Arc::new(GenericListArray::::from(list_data)) as ArrayRef; - - Ok(list_array) + let values = cast_with_options(array, to.data_type(), cast_options)?; + let offsets = OffsetBuffer::from_lengths(std::iter::repeat(1).take(values.len())); + let list = GenericListArray::::new(to.clone(), offsets, values, None); + Ok(Arc::new(list)) } /// Helper function that takes an Generic list container and casts the inner datatype. 
@@ -5098,7 +5070,7 @@ mod tests { ) .unwrap(); assert_eq!(5, b.len()); - assert_eq!(1, b.null_count()); + assert_eq!(0, b.null_count()); let arr = b.as_list::(); assert_eq!(&[0, 1, 2, 3, 4, 5], arr.value_offsets()); assert_eq!(1, arr.value_length(0)); @@ -5127,7 +5099,7 @@ mod tests { ) .unwrap(); assert_eq!(4, b.len()); - assert_eq!(1, b.null_count()); + assert_eq!(0, b.null_count()); let arr = b.as_list::(); assert_eq!(&[0, 1, 2, 3, 4], arr.value_offsets()); assert_eq!(1, arr.value_length(0)); @@ -9448,4 +9420,21 @@ mod tests { assert_eq!("1969-12-31", string_array.value(1)); assert_eq!("1989-12-31", string_array.value(2)); } + + #[test] + fn test_nested_list() { + let mut list = ListBuilder::new(Int32Builder::new()); + list.append_value([Some(1), Some(2), Some(3)]); + list.append_value([Some(4), None, Some(6)]); + let list = list.finish(); + + let to_field = Field::new("nested", list.data_type().clone(), false); + let to = DataType::List(Arc::new(to_field)); + let out = cast(&list, &to).unwrap(); + let opts = FormatOptions::default().with_null("null"); + let formatted = ArrayFormatter::try_new(out.as_ref(), &opts).unwrap(); + + assert_eq!(formatted.value(0).to_string(), "[[1], [2], [3]]"); + assert_eq!(formatted.value(1).to_string(), "[[4], [null], [6]]"); + } } From c0444642b4a5f60a6621497d1de3316442c77d3b Mon Sep 17 00:00:00 2001 From: Ahmad Sattar Date: Thu, 13 Jul 2023 18:58:41 +0200 Subject: [PATCH 1066/1411] object_store: Export `ClientConfigKey` and add `HttpBuilder::with_config` (#4516) * object_store: Export `ClientConfigKey` * object_store: Add `HttpBuilder::with_config` --- object_store/src/http/mod.rs | 10 ++++++++-- object_store/src/lib.rs | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index 124b7da2f7e7..bc01c174f339 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -43,8 +43,8 @@ use url::Url; use crate::http::client::Client; use crate::path::Path; use crate::{ - ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, Result, RetryConfig, + ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, + ObjectMeta, ObjectStore, Result, RetryConfig, }; mod client; @@ -231,6 +231,12 @@ impl HttpBuilder { self } + /// Set individual client configuration without overriding the entire config + pub fn with_config(mut self, key: ClientConfigKey, value: impl Into) -> Self { + self.client_options = self.client_options.with_config(key, value); + self + } + /// Sets the client options, overriding any already set pub fn with_client_options(mut self, options: ClientOptions) -> Self { self.client_options = options; diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 97e6aae97139..4867d485d182 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -274,7 +274,7 @@ use std::sync::Arc; use tokio::io::AsyncWrite; #[cfg(any(feature = "azure", feature = "aws", feature = "gcp", feature = "http"))] -pub use client::ClientOptions; +pub use client::{ClientConfigKey, ClientOptions}; /// An alias for a dynamically dispatched object store implementation. 
pub type DynObjectStore = dyn ObjectStore; From 43712ac843850d59af8bc996462af8e38f8991a8 Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Fri, 14 Jul 2023 01:51:25 +0200 Subject: [PATCH 1067/1411] fix: clippy problem (#4521) --- arrow/src/ffi_stream.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index 83d4eead30d6..7d6689a89058 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -194,7 +194,7 @@ impl ExportedArrayStream { } pub fn get_schema(&mut self, out: *mut FFI_ArrowSchema) -> i32 { - let mut private_data = self.get_private_data(); + let private_data = self.get_private_data(); let reader = &private_data.batch_reader; let schema = FFI_ArrowSchema::try_from(reader.schema().as_ref()); @@ -213,7 +213,7 @@ impl ExportedArrayStream { } pub fn get_next(&mut self, out: *mut FFI_ArrowArray) -> i32 { - let mut private_data = self.get_private_data(); + let private_data = self.get_private_data(); let reader = &mut private_data.batch_reader; match reader.next() { From edeb7bbd92b1e8069ce1a031a4e23f7bfbc4f1d6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Jul 2023 12:52:20 -0400 Subject: [PATCH 1068/1411] Handle empty S3 payloads (#4514) (#4518) --- object_store/src/aws/client.rs | 27 ++++++++++++++++----------- object_store/src/aws/mod.rs | 4 ++-- object_store/src/azure/client.rs | 2 +- object_store/src/lib.rs | 9 +++++++++ 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 0c2493651000..971d2c60862e 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -35,7 +35,10 @@ use bytes::{Buf, Bytes}; use itertools::Itertools; use percent_encoding::{utf8_percent_encode, PercentEncode}; use quick_xml::events::{self as xml_events}; -use reqwest::{header::CONTENT_TYPE, Client as ReqwestClient, Method, Response}; +use reqwest::{ + header::{CONTENT_LENGTH, CONTENT_TYPE}, + Client as ReqwestClient, Method, Response, +}; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::sync::Arc; @@ -236,7 +239,7 @@ impl S3Client { pub async fn put_request( &self, path: &Path, - bytes: Option, + bytes: Bytes, query: &T, ) -> Result { let credential = self.get_credential().await?; @@ -244,18 +247,20 @@ impl S3Client { let mut builder = self.client.request(Method::PUT, url); let mut payload_sha256 = None; - if let Some(bytes) = bytes { - if let Some(checksum) = self.config().checksum { - let digest = checksum.digest(&bytes); - builder = builder - .header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); - if checksum == Checksum::SHA256 { - payload_sha256 = Some(digest); - } + if let Some(checksum) = self.config().checksum { + let digest = checksum.digest(&bytes); + builder = + builder.header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); + if checksum == Checksum::SHA256 { + payload_sha256 = Some(digest); } - builder = builder.body(bytes); } + builder = match bytes.is_empty() { + true => builder.header(CONTENT_LENGTH, 0), // Handle empty uploads (#4514) + false => builder.body(bytes), + }; + if let Some(value) = self.config().client_options.get_content_type(path) { builder = builder.header(CONTENT_TYPE, value); } diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 8a486f986792..e74e6f2dfc3e 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -211,7 +211,7 @@ impl 
AmazonS3 { #[async_trait] impl ObjectStore for AmazonS3 { async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.client.put_request(location, Some(bytes), &()).await?; + self.client.put_request(location, bytes, &()).await?; Ok(()) } @@ -321,7 +321,7 @@ impl CloudMultiPartUploadImpl for S3MultiPartUpload { .client .put_request( &self.location, - Some(buf.into()), + buf.into(), &[("partNumber", &part), ("uploadId", &self.upload_id)], ) .await?; diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 5ed6f2443f32..e18135c2c77c 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -387,7 +387,7 @@ fn to_list_result(value: ListResultInternal, prefix: Option<&str>) -> Result 0 && obj.location.as_ref().len() > prefix.as_ref().len() { diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 4867d485d182..94261e7d421c 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -1258,6 +1258,15 @@ mod tests { } delete_fixtures(storage).await; + + let path = Path::from("empty"); + storage.put(&path, Bytes::new()).await.unwrap(); + let meta = storage.head(&path).await.unwrap(); + assert_eq!(meta.size, 0); + let data = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(data.len(), 0); + + storage.delete(&path).await.unwrap(); } pub(crate) async fn get_opts(storage: &dyn ObjectStore) { From f951c8f6e39aa36e2be43532b895a0b75b231b7a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Jul 2023 13:00:55 -0400 Subject: [PATCH 1069/1411] Fix AsyncArrowWriter flush for large buffer sizes (#4526) (#4527) --- parquet/src/arrow/async_writer/mod.rs | 32 +++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/async_writer/mod.rs b/parquet/src/arrow/async_writer/mod.rs index 4d8cf1b90640..0957b58697d7 100644 --- a/parquet/src/arrow/async_writer/mod.rs +++ b/parquet/src/arrow/async_writer/mod.rs @@ -158,7 +158,7 @@ impl AsyncArrowWriter { } async_writer - .write(buffer.as_slice()) + .write_all(buffer.as_slice()) .await .map_err(|e| ParquetError::External(Box::new(e)))?; @@ -207,7 +207,7 @@ impl Write for SharedBuffer { #[cfg(test)] mod tests { - use arrow_array::{ArrayRef, Int64Array, RecordBatchReader}; + use arrow_array::{ArrayRef, BinaryArray, Int64Array, RecordBatchReader}; use bytes::Bytes; use tokio::pin; @@ -374,4 +374,32 @@ mod tests { async_writer.close().await.unwrap(); } } + + #[tokio::test] + async fn test_async_writer_file() { + let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef; + let col2 = Arc::new(BinaryArray::from_iter_values(vec![ + vec![0; 500000], + vec![0; 500000], + vec![0; 500000], + ])) as ArrayRef; + let to_write = + RecordBatch::try_from_iter([("col", col), ("col2", col2)]).unwrap(); + + let temp = tempfile::tempfile().unwrap(); + + let file = tokio::fs::File::from_std(temp.try_clone().unwrap()); + let mut writer = + AsyncArrowWriter::try_new(file, to_write.schema(), 0, None).unwrap(); + writer.write(&to_write).await.unwrap(); + writer.close().await.unwrap(); + + let mut reader = ParquetRecordBatchReaderBuilder::try_new(temp) + .unwrap() + .build() + .unwrap(); + let read = reader.next().unwrap().unwrap(); + + assert_eq!(to_write, read); + } } From 6909db8b76cb047f46a8759035b578c9417bb8c7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Jul 2023 13:25:27 -0400 Subject: [PATCH 
1070/1411] Use Parser for cast kernel (#4512) (#4513) * Use Parser for cast kernel (#4512) * Support parsing unpadded dates * Review feedback * Tweak bound --- arrow-cast/Cargo.toml | 4 + arrow-cast/benches/parse_date.rs | 34 + arrow-cast/src/cast.rs | 515 +--- arrow-cast/src/parse.rs | 4554 +++++++++++++++--------------- 4 files changed, 2416 insertions(+), 2691 deletions(-) create mode 100644 arrow-cast/benches/parse_date.rs diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 494ad104b11c..2758a4817814 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -65,6 +65,10 @@ harness = false name = "parse_time" harness = false +[[bench]] +name = "parse_date" +harness = false + [[bench]] name = "parse_decimal" harness = false diff --git a/arrow-cast/benches/parse_date.rs b/arrow-cast/benches/parse_date.rs new file mode 100644 index 000000000000..e05d38d2f853 --- /dev/null +++ b/arrow-cast/benches/parse_date.rs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::types::Date32Type; +use arrow_cast::parse::Parser; +use criterion::*; + +fn criterion_benchmark(c: &mut Criterion) { + let timestamps = ["2020-09-08", "2020-9-8", "2020-09-8", "2020-9-08"]; + + for timestamp in timestamps { + let t = black_box(timestamp); + c.bench_function(t, |b| { + b.iter(|| Date32Type::parse(t).unwrap()); + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 814ad6589f2d..3a5c27fb6082 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -37,14 +37,14 @@ //! assert_eq!(7.0, c.value(2)); //! 
``` -use chrono::{NaiveTime, Offset, TimeZone, Timelike, Utc}; +use chrono::{NaiveTime, Offset, TimeZone, Utc}; use std::cmp::Ordering; use std::sync::Arc; use crate::display::{array_value_to_string, ArrayFormatter, FormatOptions}; use crate::parse::{ parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month, - string_to_datetime, + string_to_datetime, Parser, }; use arrow_array::{ builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *, @@ -1262,36 +1262,28 @@ pub fn cast_with_options( ))), }, (Utf8, _) => match to_type { - UInt8 => cast_string_to_numeric::(array, cast_options), - UInt16 => cast_string_to_numeric::(array, cast_options), - UInt32 => cast_string_to_numeric::(array, cast_options), - UInt64 => cast_string_to_numeric::(array, cast_options), - Int8 => cast_string_to_numeric::(array, cast_options), - Int16 => cast_string_to_numeric::(array, cast_options), - Int32 => cast_string_to_numeric::(array, cast_options), - Int64 => cast_string_to_numeric::(array, cast_options), - Float32 => cast_string_to_numeric::(array, cast_options), - Float64 => cast_string_to_numeric::(array, cast_options), - Date32 => cast_string_to_date32::(array, cast_options), - Date64 => cast_string_to_date64::(array, cast_options), + UInt8 => parse_string::(array, cast_options), + UInt16 => parse_string::(array, cast_options), + UInt32 => parse_string::(array, cast_options), + UInt64 => parse_string::(array, cast_options), + Int8 => parse_string::(array, cast_options), + Int16 => parse_string::(array, cast_options), + Int32 => parse_string::(array, cast_options), + Int64 => parse_string::(array, cast_options), + Float32 => parse_string::(array, cast_options), + Float64 => parse_string::(array, cast_options), + Date32 => parse_string::(array, cast_options), + Date64 => parse_string::(array, cast_options), Binary => Ok(Arc::new(BinaryArray::from(array.as_string::().clone()))), LargeBinary => { let binary = BinaryArray::from(array.as_string::().clone()); cast_byte_container::(&binary) } LargeUtf8 => cast_byte_container::(array), - Time32(TimeUnit::Second) => { - cast_string_to_time32second::(array, cast_options) - } - Time32(TimeUnit::Millisecond) => { - cast_string_to_time32millisecond::(array, cast_options) - } - Time64(TimeUnit::Microsecond) => { - cast_string_to_time64microsecond::(array, cast_options) - } - Time64(TimeUnit::Nanosecond) => { - cast_string_to_time64nanosecond::(array, cast_options) - } + Time32(TimeUnit::Second) => parse_string::(array, cast_options), + Time32(TimeUnit::Millisecond) => parse_string::(array, cast_options), + Time64(TimeUnit::Microsecond) => parse_string::(array, cast_options), + Time64(TimeUnit::Nanosecond) => parse_string::(array, cast_options), Timestamp(TimeUnit::Second, to_tz) => { cast_string_to_timestamp::(array, to_tz, cast_options) } @@ -1318,18 +1310,18 @@ pub fn cast_with_options( ))), }, (LargeUtf8, _) => match to_type { - UInt8 => cast_string_to_numeric::(array, cast_options), - UInt16 => cast_string_to_numeric::(array, cast_options), - UInt32 => cast_string_to_numeric::(array, cast_options), - UInt64 => cast_string_to_numeric::(array, cast_options), - Int8 => cast_string_to_numeric::(array, cast_options), - Int16 => cast_string_to_numeric::(array, cast_options), - Int32 => cast_string_to_numeric::(array, cast_options), - Int64 => cast_string_to_numeric::(array, cast_options), - Float32 => cast_string_to_numeric::(array, cast_options), - Float64 => cast_string_to_numeric::(array, cast_options), - Date32 => 
cast_string_to_date32::(array, cast_options), - Date64 => cast_string_to_date64::(array, cast_options), + UInt8 => parse_string::(array, cast_options), + UInt16 => parse_string::(array, cast_options), + UInt32 => parse_string::(array, cast_options), + UInt64 => parse_string::(array, cast_options), + Int8 => parse_string::(array, cast_options), + Int16 => parse_string::(array, cast_options), + Int32 => parse_string::(array, cast_options), + Int64 => parse_string::(array, cast_options), + Float32 => parse_string::(array, cast_options), + Float64 => parse_string::(array, cast_options), + Date32 => parse_string::(array, cast_options), + Date64 => parse_string::(array, cast_options), Utf8 => cast_byte_container::(array), Binary => { let large_binary = @@ -1339,18 +1331,10 @@ pub fn cast_with_options( LargeBinary => Ok(Arc::new(LargeBinaryArray::from( array.as_string::().clone(), ))), - Time32(TimeUnit::Second) => { - cast_string_to_time32second::(array, cast_options) - } - Time32(TimeUnit::Millisecond) => { - cast_string_to_time32millisecond::(array, cast_options) - } - Time64(TimeUnit::Microsecond) => { - cast_string_to_time64microsecond::(array, cast_options) - } - Time64(TimeUnit::Nanosecond) => { - cast_string_to_time64nanosecond::(array, cast_options) - } + Time32(TimeUnit::Second) => parse_string::(array, cast_options), + Time32(TimeUnit::Millisecond) => parse_string::(array, cast_options), + Time64(TimeUnit::Microsecond) => parse_string::(array, cast_options), + Time64(TimeUnit::Nanosecond) => parse_string::(array, cast_options), Timestamp(TimeUnit::Second, to_tz) => { cast_string_to_timestamp::(array, to_tz, cast_options) } @@ -2523,422 +2507,35 @@ fn value_to_string( Ok(Arc::new(builder.finish())) } -/// Cast numeric types to Utf8 -fn cast_string_to_numeric( - from: &dyn Array, - cast_options: &CastOptions, -) -> Result -where - T: ArrowPrimitiveType, - ::Native: lexical_core::FromLexical, -{ - Ok(Arc::new(string_to_numeric_cast::( - from.as_any() - .downcast_ref::>() - .unwrap(), - cast_options, - )?)) -} - -fn string_to_numeric_cast( - from: &GenericStringArray, - cast_options: &CastOptions, -) -> Result, ArrowError> -where - T: ArrowPrimitiveType, - ::Native: lexical_core::FromLexical, -{ - if cast_options.safe { - let iter = from - .iter() - .map(|v| v.and_then(|v| lexical_core::parse(v.as_bytes()).ok())); - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - Ok(unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }) - } else { - let vec = from - .iter() - .map(|v| { - v.map(|v| { - lexical_core::parse(v.as_bytes()).map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - T::DATA_TYPE, - )) - }) - }) - .transpose() - }) - .collect::, _>>()?; - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. 
- Ok(unsafe { PrimitiveArray::::from_trusted_len_iter(vec.iter()) }) - } -} - -/// Casts generic string arrays to Date32Array -fn cast_string_to_date32( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - use chrono::Datelike; - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| { - v.parse::() - .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - .ok() - }) - }); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Date32Array::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| { - v.map(|v| { - v.parse::() - .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - DataType::Date32 - )) - }) - }) - .transpose() - }) - .collect::>, _>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Date32Array::from_trusted_len_iter(vec.iter()) } - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Casts generic string arrays to Date64Array -fn cast_string_to_date64( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| { - v.parse::() - .map(|datetime| datetime.timestamp_millis()) - .ok() - }) - }); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Date64Array::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| { - v.map(|v| { - v.parse::() - .map(|datetime| datetime.timestamp_millis()) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - DataType::Date64 - )) - }) - }) - .transpose() - }) - .collect::>, _>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Date64Array::from_trusted_len_iter(vec.iter()) } - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Casts generic string arrays to `Time32SecondArray` -fn cast_string_to_time32second( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - /// The number of nanoseconds per millisecond. - const NANOS_PER_SEC: u32 = 1_000_000_000; - - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| { - v.parse::() - .map(|time| { - (time.num_seconds_from_midnight() - + time.nanosecond() / NANOS_PER_SEC) - as i32 - }) - .ok() - }) - }); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. 
- unsafe { Time32SecondArray::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| { - v.map(|v| { - v.parse::() - .map(|time| { - (time.num_seconds_from_midnight() - + time.nanosecond() / NANOS_PER_SEC) - as i32 - }) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - DataType::Time32(TimeUnit::Second) - )) - }) - }) - .transpose() - }) - .collect::>, _>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time32SecondArray::from_trusted_len_iter(vec.iter()) } - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Casts generic string arrays to `Time32MillisecondArray` -fn cast_string_to_time32millisecond( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - /// The number of nanoseconds per millisecond. - const NANOS_PER_MILLI: u32 = 1_000_000; - /// The number of milliseconds per second. - const MILLIS_PER_SEC: u32 = 1_000; - - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| { - v.parse::() - .map(|time| { - (time.num_seconds_from_midnight() * MILLIS_PER_SEC - + time.nanosecond() / NANOS_PER_MILLI) - as i32 - }) - .ok() - }) - }); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time32MillisecondArray::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| { - v.map(|v| { - v.parse::() - .map(|time| { - (time.num_seconds_from_midnight() * MILLIS_PER_SEC - + time.nanosecond() / NANOS_PER_MILLI) - as i32 - }) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - DataType::Time32(TimeUnit::Millisecond) - )) - }) - }) - .transpose() - }) - .collect::>, _>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time32MillisecondArray::from_trusted_len_iter(vec.iter()) } - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Casts generic string arrays to `Time64MicrosecondArray` -fn cast_string_to_time64microsecond( +/// Parse UTF-8 +fn parse_string( array: &dyn Array, cast_options: &CastOptions, ) -> Result { - /// The number of nanoseconds per microsecond. - const NANOS_PER_MICRO: i64 = 1_000; - /// The number of microseconds per second. - const MICROS_PER_SEC: i64 = 1_000_000; - - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| { - v.parse::() - .map(|time| { - time.num_seconds_from_midnight() as i64 * MICROS_PER_SEC - + time.nanosecond() as i64 / NANOS_PER_MICRO - }) - .ok() - }) - }); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. 
- unsafe { Time64MicrosecondArray::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| { - v.map(|v| { - v.parse::() - .map(|time| { - time.num_seconds_from_midnight() as i64 * MICROS_PER_SEC - + time.nanosecond() as i64 / NANOS_PER_MICRO - }) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - DataType::Time64(TimeUnit::Microsecond) - )) - }) - }) - .transpose() - }) - .collect::>, _>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time64MicrosecondArray::from_trusted_len_iter(vec.iter()) } - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Casts generic string arrays to `Time64NanosecondArray` -fn cast_string_to_time64nanosecond( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - /// The number of nanoseconds per second. - const NANOS_PER_SEC: i64 = 1_000_000_000; - - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - + let string_array = array.as_string::(); let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| { - v.parse::() - .map(|time| { - time.num_seconds_from_midnight() as i64 * NANOS_PER_SEC - + time.nanosecond() as i64 - }) - .ok() - }) - }); + let iter = string_array.iter().map(|x| x.and_then(P::parse)); // Benefit: // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time64NanosecondArray::from_trusted_len_iter(iter) } + unsafe { PrimitiveArray::
<P>
::from_trusted_len_iter(iter) } } else { - let vec = string_array + let v = string_array .iter() - .map(|v| { - v.map(|v| { - v.parse::() - .map(|time| { - time.num_seconds_from_midnight() as i64 * NANOS_PER_SEC - + time.nanosecond() as i64 - }) - .map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - DataType::Time64(TimeUnit::Nanosecond) - )) - }) - }) - .transpose() + .map(|x| match x { + Some(v) => P::parse(v).ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast string '{}' to value of {:?} type", + v, + P::DATA_TYPE + )) + }), + None => Ok(P::Native::default()), }) - .collect::>, _>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Time64NanosecondArray::from_trusted_len_iter(vec.iter()) } + .collect::, ArrowError>>()?; + PrimitiveArray::new(v.into(), string_array.nulls().cloned()) }; Ok(Arc::new(array) as ArrayRef) @@ -7846,13 +7443,13 @@ mod tests { assert_eq!(946728000000, c.value(0)); assert!(c.is_valid(1)); // "2020-12-15T12:34:56" assert_eq!(1608035696000, c.value(1)); - assert!(c.is_valid(2)); // "2020-2-2T12:34:56" - assert_eq!(1580646896000, c.value(2)); + assert!(!c.is_valid(2)); // "2020-2-2T12:34:56" - // test invalid inputs assert!(!c.is_valid(3)); // "2000-00-00T12:00:00" - assert!(!c.is_valid(4)); // "2000-01-01 12:00:00" - assert!(!c.is_valid(5)); // "2000-01-01" + assert!(c.is_valid(4)); // "2000-01-01 12:00:00" + assert_eq!(946728000000, c.value(4)); + assert!(c.is_valid(5)); // "2000-01-01" + assert_eq!(946684800000, c.value(5)); } #[test] diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 67477c57d519..50bfca0f84bd 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -1,2232 +1,2322 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
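The removal of the hand-rolled string-to-date/time/numeric casts above, and the large rewrite of `parse.rs` that follows, both hinge on the `Parser` trait that `parse_string` now dispatches to. A hedged orientation sketch (not from the patch) of that entry point; the concrete literals are illustrative:

```rust
use arrow_array::types::{Date32Type, TimestampNanosecondType};
use arrow_cast::parse::Parser;

fn main() {
    // Unpadded date components are now accepted (see the new parse_date
    // benchmark), and both spellings resolve to the same day.
    assert!(Date32Type::parse("2020-9-8").is_some());
    assert_eq!(Date32Type::parse("2020-09-08"), Date32Type::parse("2020-9-8"));

    // Invalid input yields None, which the safe cast path turns into a null.
    assert_eq!(TimestampNanosecondType::parse("not a timestamp"), None);
}
```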
- -use arrow_array::timezone::Tz; -use arrow_array::types::*; -use arrow_array::{ArrowNativeTypeOp, ArrowPrimitiveType}; -use arrow_buffer::ArrowNativeType; -use arrow_schema::ArrowError; -use chrono::prelude::*; -use half::f16; -use std::str::FromStr; - -/// Parse nanoseconds from the first `N` values in digits, subtracting the offset `O` -#[inline] -fn parse_nanos(digits: &[u8]) -> u32 { - digits[..N] - .iter() - .fold(0_u32, |acc, v| acc * 10 + v.wrapping_sub(O) as u32) - * 10_u32.pow((9 - N) as _) -} - -/// Helper for parsing timestamps -struct TimestampParser { - /// The timestamp bytes to parse minus `b'0'` - /// - /// This makes interpretation as an integer inexpensive - digits: [u8; 32], - /// A mask containing a `1` bit where the corresponding byte is a valid ASCII digit - mask: u32, -} - -impl TimestampParser { - fn new(bytes: &[u8]) -> Self { - let mut digits = [0; 32]; - let mut mask = 0; - - // Treating all bytes the same way, helps LLVM vectorise this correctly - for (idx, (o, i)) in digits.iter_mut().zip(bytes).enumerate() { - *o = i.wrapping_sub(b'0'); - mask |= ((*o < 10) as u32) << idx - } - - Self { digits, mask } - } - - /// Returns true if the byte at `idx` in the original string equals `b` - fn test(&self, idx: usize, b: u8) -> bool { - self.digits[idx] == b.wrapping_sub(b'0') - } - - /// Parses a date of the form `1997-01-31` - fn date(&self) -> Option { - if self.mask & 0b1111111111 != 0b1101101111 - || !self.test(4, b'-') - || !self.test(7, b'-') - { - return None; - } - - let year = self.digits[0] as u16 * 1000 - + self.digits[1] as u16 * 100 - + self.digits[2] as u16 * 10 - + self.digits[3] as u16; - - let month = self.digits[5] * 10 + self.digits[6]; - let day = self.digits[8] * 10 + self.digits[9]; - - NaiveDate::from_ymd_opt(year as _, month as _, day as _) - } - - /// Parses a time of any of forms - /// - `09:26:56` - /// - `09:26:56.123` - /// - `09:26:56.123456` - /// - `09:26:56.123456789` - /// - `092656` - /// - /// Returning the end byte offset - fn time(&self) -> Option<(NaiveTime, usize)> { - // Make a NaiveTime handling leap seconds - let time = |hour, min, sec, nano| match sec { - 60 => { - let nano = 1_000_000_000 + nano; - NaiveTime::from_hms_nano_opt(hour as _, min as _, 59, nano) - } - _ => NaiveTime::from_hms_nano_opt(hour as _, min as _, sec as _, nano), - }; - - match (self.mask >> 11) & 0b11111111 { - // 09:26:56 - 0b11011011 if self.test(13, b':') && self.test(16, b':') => { - let hour = self.digits[11] * 10 + self.digits[12]; - let minute = self.digits[14] * 10 + self.digits[15]; - let second = self.digits[17] * 10 + self.digits[18]; - - match self.test(19, b'.') { - true => { - let digits = (self.mask >> 20).trailing_ones(); - let nanos = match digits { - 0 => return None, - 1 => parse_nanos::<1, 0>(&self.digits[20..21]), - 2 => parse_nanos::<2, 0>(&self.digits[20..22]), - 3 => parse_nanos::<3, 0>(&self.digits[20..23]), - 4 => parse_nanos::<4, 0>(&self.digits[20..24]), - 5 => parse_nanos::<5, 0>(&self.digits[20..25]), - 6 => parse_nanos::<6, 0>(&self.digits[20..26]), - 7 => parse_nanos::<7, 0>(&self.digits[20..27]), - 8 => parse_nanos::<8, 0>(&self.digits[20..28]), - _ => parse_nanos::<9, 0>(&self.digits[20..29]), - }; - Some((time(hour, minute, second, nanos)?, 20 + digits as usize)) - } - false => Some((time(hour, minute, second, 0)?, 19)), - } - } - // 092656 - 0b111111 => { - let hour = self.digits[11] * 10 + self.digits[12]; - let minute = self.digits[13] * 10 + self.digits[14]; - let second = self.digits[15] * 10 + self.digits[16]; 
- let time = time(hour, minute, second, 0)?; - Some((time, 17)) - } - _ => None, - } - } -} - -/// Accepts a string and parses it relative to the provided `timezone` -/// -/// In addition to RFC3339 / ISO8601 standard timestamps, it also -/// accepts strings that use a space ` ` to separate the date and time -/// as well as strings that have no explicit timezone offset. -/// -/// Examples of accepted inputs: -/// * `1997-01-31T09:26:56.123Z` # RCF3339 -/// * `1997-01-31T09:26:56.123-05:00` # RCF3339 -/// * `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a space rather than T -/// * `2023-01-01 04:05:06.789 -08` # close to RCF3339, no fractional seconds or time separator -/// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified -/// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset -/// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds -/// * `1997-01-31 092656` # close to RCF3339, no fractional seconds -/// * `1997-01-31 092656+04:00` # close to RCF3339, no fractional seconds or time separator -/// * `1997-01-31` # close to RCF3339, only date no time -/// -/// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled -/// -/// * `2023-01-01 040506 America/Los_Angeles` -/// -/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error -/// will be returned -/// -/// Some formats supported by PostgresSql -/// are not supported, like -/// -/// * "2023-01-01 04:05:06.789 +07:30:00", -/// * "2023-01-01 040506 +07:30:00", -/// * "2023-01-01 04:05:06.789 PST", -/// -/// [IANA timezones]: https://www.iana.org/time-zones -pub fn string_to_datetime( - timezone: &T, - s: &str, -) -> Result, ArrowError> { - let err = |ctx: &str| { - ArrowError::ParseError(format!("Error parsing timestamp from '{s}': {ctx}")) - }; - - let bytes = s.as_bytes(); - if bytes.len() < 10 { - return Err(err("timestamp must contain at least 10 characters")); - } - - let parser = TimestampParser::new(bytes); - let date = parser.date().ok_or_else(|| err("error parsing date"))?; - if bytes.len() == 10 { - let offset = timezone.offset_from_local_date(&date); - let offset = offset - .single() - .ok_or_else(|| err("error computing timezone offset"))?; - - let time = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); - return Ok(DateTime::from_local(date.and_time(time), offset)); - } - - if !parser.test(10, b'T') && !parser.test(10, b't') && !parser.test(10, b' ') { - return Err(err("invalid timestamp separator")); - } - - let (time, mut tz_offset) = parser.time().ok_or_else(|| err("error parsing time"))?; - let datetime = date.and_time(time); - - if tz_offset == 32 { - // Decimal overrun - while tz_offset < bytes.len() && bytes[tz_offset].is_ascii_digit() { - tz_offset += 1; - } - } - - if bytes.len() <= tz_offset { - let offset = timezone.offset_from_local_datetime(&datetime); - let offset = offset - .single() - .ok_or_else(|| err("error computing timezone offset"))?; - return Ok(DateTime::from_local(datetime, offset)); - } - - if bytes[tz_offset] == b'z' || bytes[tz_offset] == b'Z' { - let offset = timezone.offset_from_local_datetime(&datetime); - let offset = offset - .single() - .ok_or_else(|| err("error computing timezone offset"))?; - return Ok(DateTime::from_utc(datetime, offset)); - } - - // Parse remainder of string as timezone - let parsed_tz: Tz = s[tz_offset..].trim_start().parse()?; - let offset = parsed_tz.offset_from_local_datetime(&datetime); - let offset = offset - .single() 
- .ok_or_else(|| err("error computing timezone offset"))?; - Ok(DateTime::::from_local(datetime, offset).with_timezone(timezone)) -} - -/// Accepts a string in RFC3339 / ISO8601 standard format and some -/// variants and converts it to a nanosecond precision timestamp. -/// -/// See [`string_to_datetime`] for the full set of supported formats -/// -/// Implements the `to_timestamp` function to convert a string to a -/// timestamp, following the model of spark SQL’s to_`timestamp`. -/// -/// Internally, this function uses the `chrono` library for the -/// datetime parsing -/// -/// We hope to extend this function in the future with a second -/// parameter to specifying the format string. -/// -/// ## Timestamp Precision -/// -/// Function uses the maximum precision timestamps supported by -/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This -/// means the range of dates that timestamps can represent is ~1677 AD -/// to 2262 AM -/// -/// ## Timezone / Offset Handling -/// -/// Numerical values of timestamps are stored compared to offset UTC. -/// -/// This function interprets string without an explicit time zone as timestamps -/// relative to UTC, see [`string_to_datetime`] for alternative semantics -/// -/// In particular: -/// -/// ``` -/// # use arrow_cast::parse::string_to_timestamp_nanos; -/// // Note all three of these timestamps are parsed as the same value -/// let a = string_to_timestamp_nanos("1997-01-31 09:26:56.123Z").unwrap(); -/// let b = string_to_timestamp_nanos("1997-01-31T09:26:56.123").unwrap(); -/// let c = string_to_timestamp_nanos("1997-01-31T14:26:56.123+05:00").unwrap(); -/// -/// assert_eq!(a, b); -/// assert_eq!(b, c); -/// ``` -/// -#[inline] -pub fn string_to_timestamp_nanos(s: &str) -> Result { - to_timestamp_nanos(string_to_datetime(&Utc, s)?.naive_utc()) -} - -/// Defensive check to prevent chrono-rs panics when nanosecond conversion happens on non-supported dates -#[inline] -fn to_timestamp_nanos(dt: NaiveDateTime) -> Result { - if dt.timestamp().checked_mul(1_000_000_000).is_none() { - return Err(ArrowError::ParseError( - ERR_NANOSECONDS_NOT_SUPPORTED.to_string(), - )); - } - - Ok(dt.timestamp_nanos()) -} - -/// Accepts a string in ISO8601 standard format and some -/// variants and converts it to nanoseconds since midnight. -/// -/// Examples of accepted inputs: -/// * `09:26:56.123 AM` -/// * `23:59:59` -/// * `6:00 pm` -// -/// Internally, this function uses the `chrono` library for the -/// time parsing -/// -/// ## Timezone / Offset Handling -/// -/// This function does not support parsing strings with a timezone -/// or offset specified, as it considers only time since midnight. -pub fn string_to_time_nanoseconds(s: &str) -> Result { - let nt = string_to_time(s).ok_or_else(|| { - ArrowError::ParseError(format!("Failed to parse \'{s}\' as time")) - })?; - Ok(nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64) -} - -fn string_to_time(s: &str) -> Option { - let bytes = s.as_bytes(); - if bytes.len() < 4 { - return None; - } - - let (am, bytes) = match bytes.get(bytes.len() - 3..) { - Some(b" AM" | b" am" | b" Am" | b" aM") => { - (Some(true), &bytes[..bytes.len() - 3]) - } - Some(b" PM" | b" pm" | b" pM" | b" Pm") => { - (Some(false), &bytes[..bytes.len() - 3]) - } - _ => (None, bytes), - }; - - if bytes.len() < 4 { - return None; - } - - let mut digits = [b'0'; 6]; - - // Extract hour - let bytes = match (bytes[1], bytes[2]) { - (b':', _) => { - digits[1] = bytes[0]; - &bytes[2..] 
- } - (_, b':') => { - digits[0] = bytes[0]; - digits[1] = bytes[1]; - &bytes[3..] - } - _ => return None, - }; - - if bytes.len() < 2 { - return None; // Minutes required - } - - // Extract minutes - digits[2] = bytes[0]; - digits[3] = bytes[1]; - - let nanoseconds = match bytes.get(2) { - Some(b':') => { - if bytes.len() < 5 { - return None; - } - - // Extract seconds - digits[4] = bytes[3]; - digits[5] = bytes[4]; - - // Extract sub-seconds if any - match bytes.get(5) { - Some(b'.') => { - let decimal = &bytes[6..]; - if decimal.iter().any(|x| !x.is_ascii_digit()) { - return None; - } - match decimal.len() { - 0 => return None, - 1 => parse_nanos::<1, b'0'>(decimal), - 2 => parse_nanos::<2, b'0'>(decimal), - 3 => parse_nanos::<3, b'0'>(decimal), - 4 => parse_nanos::<4, b'0'>(decimal), - 5 => parse_nanos::<5, b'0'>(decimal), - 6 => parse_nanos::<6, b'0'>(decimal), - 7 => parse_nanos::<7, b'0'>(decimal), - 8 => parse_nanos::<8, b'0'>(decimal), - _ => parse_nanos::<9, b'0'>(decimal), - } - } - Some(_) => return None, - None => 0, - } - } - Some(_) => return None, - None => 0, - }; - - digits.iter_mut().for_each(|x| *x = x.wrapping_sub(b'0')); - if digits.iter().any(|x| *x > 9) { - return None; - } - - let hour = match (digits[0] * 10 + digits[1], am) { - (12, Some(true)) => 0, // 12:00 AM -> 00:00 - (h @ 1..=11, Some(true)) => h, // 1:00 AM -> 01:00 - (12, Some(false)) => 12, // 12:00 PM -> 12:00 - (h @ 1..=11, Some(false)) => h + 12, // 1:00 PM -> 13:00 - (_, Some(_)) => return None, - (h, None) => h, - }; - - // Handle leap second - let (second, nanoseconds) = match digits[4] * 10 + digits[5] { - 60 => (59, nanoseconds + 1_000_000_000), - s => (s, nanoseconds), - }; - - NaiveTime::from_hms_nano_opt( - hour as _, - (digits[2] * 10 + digits[3]) as _, - second as _, - nanoseconds, - ) -} - -/// Specialized parsing implementations -/// used by csv and json reader -pub trait Parser: ArrowPrimitiveType { - fn parse(string: &str) -> Option; - - fn parse_formatted(string: &str, _format: &str) -> Option { - Self::parse(string) - } -} - -impl Parser for Float16Type { - fn parse(string: &str) -> Option { - lexical_core::parse(string.as_bytes()) - .ok() - .map(f16::from_f32) - } -} - -impl Parser for Float32Type { - fn parse(string: &str) -> Option { - lexical_core::parse(string.as_bytes()).ok() - } -} - -impl Parser for Float64Type { - fn parse(string: &str) -> Option { - lexical_core::parse(string.as_bytes()).ok() - } -} - -macro_rules! 
parser_primitive { - ($t:ty) => { - impl Parser for $t { - fn parse(string: &str) -> Option { - lexical_core::parse::(string.as_bytes()).ok() - } - } - }; -} -parser_primitive!(UInt64Type); -parser_primitive!(UInt32Type); -parser_primitive!(UInt16Type); -parser_primitive!(UInt8Type); -parser_primitive!(Int64Type); -parser_primitive!(Int32Type); -parser_primitive!(Int16Type); -parser_primitive!(Int8Type); - -impl Parser for TimestampNanosecondType { - fn parse(string: &str) -> Option { - string_to_timestamp_nanos(string).ok() - } -} - -impl Parser for TimestampMicrosecondType { - fn parse(string: &str) -> Option { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1000) - } -} - -impl Parser for TimestampMillisecondType { - fn parse(string: &str) -> Option { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1_000_000) - } -} - -impl Parser for TimestampSecondType { - fn parse(string: &str) -> Option { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1_000_000_000) - } -} - -impl Parser for Time64NanosecondType { - // Will truncate any fractions of a nanosecond - fn parse(string: &str) -> Option { - string_to_time_nanoseconds(string) - .ok() - .or_else(|| string.parse::().ok()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i64 * 1_000_000_000 - + nt.nanosecond() as i64, - ) - } -} - -impl Parser for Time64MicrosecondType { - // Will truncate any fractions of a microsecond - fn parse(string: &str) -> Option { - string_to_time_nanoseconds(string) - .ok() - .map(|nanos| nanos / 1_000) - .or_else(|| string.parse::().ok()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i64 * 1_000_000 - + nt.nanosecond() as i64 / 1_000, - ) - } -} - -impl Parser for Time32MillisecondType { - // Will truncate any fractions of a millisecond - fn parse(string: &str) -> Option { - string_to_time_nanoseconds(string) - .ok() - .map(|nanos| (nanos / 1_000_000) as i32) - .or_else(|| string.parse::().ok()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i32 * 1_000 - + nt.nanosecond() as i32 / 1_000_000, - ) - } -} - -impl Parser for Time32SecondType { - // Will truncate any fractions of a second - fn parse(string: &str) -> Option { - string_to_time_nanoseconds(string) - .ok() - .map(|nanos| (nanos / 1_000_000_000) as i32) - .or_else(|| string.parse::().ok()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i32 - + nt.nanosecond() as i32 / 1_000_000_000, - ) - } -} - -/// Number of days between 0001-01-01 and 1970-01-01 -const EPOCH_DAYS_FROM_CE: i32 = 719_163; - -/// Error message if nanosecond conversion request beyond supported interval -const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; - -impl Parser for Date32Type { - fn parse(string: &str) -> Option { - let parser = TimestampParser::new(string.as_bytes()); - let date = parser.date()?; - Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - } - - fn parse_formatted(string: &str, format: &str) 
-> Option { - let date = NaiveDate::parse_from_str(string, format).ok()?; - Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - } -} - -impl Parser for Date64Type { - fn parse(string: &str) -> Option { - let date_time = string_to_datetime(&Utc, string).ok()?; - Some(date_time.timestamp_millis()) - } - - fn parse_formatted(string: &str, format: &str) -> Option { - use chrono::format::Fixed; - use chrono::format::StrftimeItems; - let fmt = StrftimeItems::new(format); - let has_zone = fmt.into_iter().any(|item| match item { - chrono::format::Item::Fixed(fixed_item) => matches!( - fixed_item, - Fixed::RFC2822 - | Fixed::RFC3339 - | Fixed::TimezoneName - | Fixed::TimezoneOffsetColon - | Fixed::TimezoneOffsetColonZ - | Fixed::TimezoneOffset - | Fixed::TimezoneOffsetZ - ), - _ => false, - }); - if has_zone { - let date_time = chrono::DateTime::parse_from_str(string, format).ok()?; - Some(date_time.timestamp_millis()) - } else { - let date_time = NaiveDateTime::parse_from_str(string, format).ok()?; - Some(date_time.timestamp_millis()) - } - } -} - -/// Parse the string format decimal value to i128/i256 format and checking the precision and scale. -/// The result value can't be out of bounds. -pub fn parse_decimal( - s: &str, - precision: u8, - scale: i8, -) -> Result { - let mut result = T::Native::usize_as(0); - let mut fractionals = 0; - let mut digits = 0; - let base = T::Native::usize_as(10); - - let bs = s.as_bytes(); - let (bs, negative) = match bs.first() { - Some(b'-') => (&bs[1..], true), - Some(b'+') => (&bs[1..], false), - _ => (bs, false), - }; - - if bs.is_empty() { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - - let mut bs = bs.iter(); - // Overflow checks are not required if 10^(precision - 1) <= T::MAX holds. - // Thus, if we validate the precision correctly, we can skip overflow checks. - while let Some(b) = bs.next() { - match b { - b'0'..=b'9' => { - if digits == 0 && *b == b'0' { - // Ignore leading zeros. - continue; - } - digits += 1; - result = result.mul_wrapping(base); - result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); - } - b'.' => { - for b in bs.by_ref() { - if !b.is_ascii_digit() { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - if fractionals == scale { - // We have processed all the digits that we need. All that - // is left is to validate that the rest of the string contains - // valid digits. - continue; - } - fractionals += 1; - digits += 1; - result = result.mul_wrapping(base); - result = - result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); - } - - // Fail on "." 
- if digits == 0 { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - } - _ => { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - } - } - - if fractionals < scale { - let exp = scale - fractionals; - if exp as u8 + digits > precision { - return Err(ArrowError::ParseError("parse decimal overflow".to_string())); - } - let mul = base.pow_wrapping(exp as _); - result = result.mul_wrapping(mul); - } else if digits > precision { - return Err(ArrowError::ParseError("parse decimal overflow".to_string())); - } - - Ok(if negative { - result.neg_wrapping() - } else { - result - }) -} - -pub fn parse_interval_year_month( - value: &str, -) -> Result<::Native, ArrowError> { - let config = IntervalParseConfig::new(IntervalUnit::Year); - let interval = Interval::parse(value, &config)?; - - let months = interval.to_year_months().map_err(|_| ArrowError::CastError(format!( - "Cannot cast {value} to IntervalYearMonth. Only year and month fields are allowed." - )))?; - - Ok(IntervalYearMonthType::make_value(0, months)) -} - -pub fn parse_interval_day_time( - value: &str, -) -> Result<::Native, ArrowError> { - let config = IntervalParseConfig::new(IntervalUnit::Day); - let interval = Interval::parse(value, &config)?; - - let (days, millis) = interval.to_day_time().map_err(|_| ArrowError::CastError(format!( - "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds" - )))?; - - Ok(IntervalDayTimeType::make_value(days, millis)) -} - -pub fn parse_interval_month_day_nano( - value: &str, -) -> Result<::Native, ArrowError> { - let config = IntervalParseConfig::new(IntervalUnit::Month); - let interval = Interval::parse(value, &config)?; - - let (months, days, nanos) = interval.to_month_day_nanos(); - - Ok(IntervalMonthDayNanoType::make_value(months, days, nanos)) -} - -const NANOS_PER_MILLIS: i64 = 1_000_000; -const NANOS_PER_SECOND: i64 = 1_000 * NANOS_PER_MILLIS; -const NANOS_PER_MINUTE: i64 = 60 * NANOS_PER_SECOND; -const NANOS_PER_HOUR: i64 = 60 * NANOS_PER_MINUTE; -#[cfg(test)] -const NANOS_PER_DAY: i64 = 24 * NANOS_PER_HOUR; - -#[rustfmt::skip] -#[derive(Clone, Copy)] -#[repr(u16)] -enum IntervalUnit { - Century = 0b_0000_0000_0001, - Decade = 0b_0000_0000_0010, - Year = 0b_0000_0000_0100, - Month = 0b_0000_0000_1000, - Week = 0b_0000_0001_0000, - Day = 0b_0000_0010_0000, - Hour = 0b_0000_0100_0000, - Minute = 0b_0000_1000_0000, - Second = 0b_0001_0000_0000, - Millisecond = 0b_0010_0000_0000, - Microsecond = 0b_0100_0000_0000, - Nanosecond = 0b_1000_0000_0000, -} - -impl FromStr for IntervalUnit { - type Err = ArrowError; - - fn from_str(s: &str) -> Result { - match s.to_lowercase().as_str() { - "century" | "centuries" => Ok(Self::Century), - "decade" | "decades" => Ok(Self::Decade), - "year" | "years" => Ok(Self::Year), - "month" | "months" => Ok(Self::Month), - "week" | "weeks" => Ok(Self::Week), - "day" | "days" => Ok(Self::Day), - "hour" | "hours" => Ok(Self::Hour), - "minute" | "minutes" => Ok(Self::Minute), - "second" | "seconds" => Ok(Self::Second), - "millisecond" | "milliseconds" => Ok(Self::Millisecond), - "microsecond" | "microseconds" => Ok(Self::Microsecond), - "nanosecond" | "nanoseconds" => Ok(Self::Nanosecond), - _ => Err(ArrowError::NotYetImplemented(format!( - "Unknown interval type: {s}" - ))), - } - } -} - -pub type MonthDayNano = (i32, i32, i64); - -/// Chosen based on the number of decimal digits in 1 week in nanoseconds -const 
INTERVAL_PRECISION: u32 = 15; - -#[derive(Clone, Copy, Debug, PartialEq)] -struct IntervalAmount { - /// The integer component of the interval amount - integer: i64, - /// The fractional component multiplied by 10^INTERVAL_PRECISION - frac: i64, -} - -#[cfg(test)] -impl IntervalAmount { - fn new(integer: i64, frac: i64) -> Self { - Self { integer, frac } - } -} - -impl FromStr for IntervalAmount { - type Err = ArrowError; - - fn from_str(s: &str) -> Result { - match s.split_once('.') { - Some((integer, frac)) - if frac.len() <= INTERVAL_PRECISION as usize - && !frac.is_empty() - && !frac.starts_with('-') => - { - // integer will be "" for values like ".5" - // and "-" for values like "-.5" - let explicit_neg = integer.starts_with('-'); - let integer = if integer.is_empty() || integer == "-" { - Ok(0) - } else { - integer.parse::().map_err(|_| { - ArrowError::ParseError(format!( - "Failed to parse {s} as interval amount" - )) - }) - }?; - - let frac_unscaled = frac.parse::().map_err(|_| { - ArrowError::ParseError(format!( - "Failed to parse {s} as interval amount" - )) - })?; - - // scale fractional part by interval precision - let frac = - frac_unscaled * 10_i64.pow(INTERVAL_PRECISION - frac.len() as u32); - - // propagate the sign of the integer part to the fractional part - let frac = if integer < 0 || explicit_neg { - -frac - } else { - frac - }; - - let result = Self { integer, frac }; - - Ok(result) - } - Some((_, frac)) if frac.starts_with('-') => Err(ArrowError::ParseError( - format!("Failed to parse {s} as interval amount"), - )), - Some((_, frac)) if frac.len() > INTERVAL_PRECISION as usize => { - Err(ArrowError::ParseError(format!( - "{s} exceeds the precision available for interval amount" - ))) - } - Some(_) | None => { - let integer = s.parse::().map_err(|_| { - ArrowError::ParseError(format!( - "Failed to parse {s} as interval amount" - )) - })?; - - let result = Self { integer, frac: 0 }; - Ok(result) - } - } - } -} - -#[derive(Debug, Default, PartialEq)] -struct Interval { - months: i32, - days: i32, - nanos: i64, -} - -impl Interval { - fn new(months: i32, days: i32, nanos: i64) -> Self { - Self { - months, - days, - nanos, - } - } - - fn to_year_months(&self) -> Result { - match (self.months, self.days, self.nanos) { - (months, days, nanos) if days == 0 && nanos == 0 => Ok(months), - _ => Err(ArrowError::InvalidArgumentError(format!( - "Unable to represent interval with days and nanos as year-months: {:?}", - self - ))), - } - } - - fn to_day_time(&self) -> Result<(i32, i32), ArrowError> { - let days = self.months.mul_checked(30)?.add_checked(self.days)?; - - match self.nanos { - nanos if nanos % NANOS_PER_MILLIS == 0 => { - let millis = (self.nanos / 1_000_000).try_into().map_err(|_| { - ArrowError::InvalidArgumentError(format!( - "Unable to represent {} nanos as milliseconds in a signed 32-bit integer", - self.nanos - )) - })?; - - Ok((days, millis)) - } - nanos => Err(ArrowError::InvalidArgumentError(format!( - "Unable to represent {nanos} as milliseconds" - ))), - } - } - - fn to_month_day_nanos(&self) -> (i32, i32, i64) { - (self.months, self.days, self.nanos) - } - - /// Parse string value in traditional Postgres format such as - /// `1 year 2 months 3 days 4 hours 5 minutes 6 seconds` - fn parse(value: &str, config: &IntervalParseConfig) -> Result { - let components = parse_interval_components(value, config)?; - - let result = components.into_iter().fold( - Ok(Self::default()), - |result, (amount, unit)| match result { - Ok(result) => result.add(amount, unit), - 
Err(e) => Err(e), - }, - )?; - - Ok(result) - } - - /// Interval addition following Postgres behavior. Fractional units will be spilled into smaller units. - /// When the interval unit is larger than months, the result is rounded to total months and not spilled to days/nanos. - /// Fractional parts of weeks and days are represented using days and nanoseconds. - /// e.g. INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days - /// e.g. INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours - /// [Postgres reference](https://www.postgresql.org/docs/15/datatype-datetime.html#DATATYPE-INTERVAL-INPUT:~:text=Field%20values%20can,fractional%20on%20output.) - fn add( - &self, - amount: IntervalAmount, - unit: IntervalUnit, - ) -> Result { - let result = match unit { - IntervalUnit::Century => { - let months_int = amount.integer.mul_checked(100)?.mul_checked(12)?; - let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 2); - let months = - months_int - .add_checked(month_frac)? - .try_into() - .map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} centuries as months in a signed 32-bit integer", - &amount.integer - )) - })?; - - Self::new(self.months.add_checked(months)?, self.days, self.nanos) - } - IntervalUnit::Decade => { - let months_int = amount.integer.mul_checked(10)?.mul_checked(12)?; - - let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 1); - let months = - months_int - .add_checked(month_frac)? - .try_into() - .map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} decades as months in a signed 32-bit integer", - &amount.integer - )) - })?; - - Self::new(self.months.add_checked(months)?, self.days, self.nanos) - } - IntervalUnit::Year => { - let months_int = amount.integer.mul_checked(12)?; - let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION); - let months = - months_int - .add_checked(month_frac)? 
- .try_into() - .map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} years as months in a signed 32-bit integer", - &amount.integer - )) - })?; - - Self::new(self.months.add_checked(months)?, self.days, self.nanos) - } - IntervalUnit::Month => { - let months = amount.integer.try_into().map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} months in a signed 32-bit integer", - &amount.integer - )) - })?; - - let days = amount.frac * 3 / 10_i64.pow(INTERVAL_PRECISION - 1); - let days = days.try_into().map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} months as days in a signed 32-bit integer", - amount.frac / 10_i64.pow(INTERVAL_PRECISION) - )) - })?; - - Self::new( - self.months.add_checked(months)?, - self.days.add_checked(days)?, - self.nanos, - ) - } - IntervalUnit::Week => { - let days = amount.integer.mul_checked(7)?.try_into().map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} weeks as days in a signed 32-bit integer", - &amount.integer - )) - })?; - - let nanos = - amount.frac * 7 * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); - - Self::new( - self.months, - self.days.add_checked(days)?, - self.nanos.add_checked(nanos)?, - ) - } - IntervalUnit::Day => { - let days = amount.integer.try_into().map_err(|_| { - ArrowError::InvalidArgumentError(format!( - "Unable to represent {} days in a signed 32-bit integer", - amount.integer - )) - })?; - - let nanos = - amount.frac * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); - - Self::new( - self.months, - self.days.add_checked(days)?, - self.nanos.add_checked(nanos)?, - ) - } - IntervalUnit::Hour => { - let nanos_int = amount.integer.mul_checked(NANOS_PER_HOUR)?; - let nanos_frac = - amount.frac * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); - let nanos = nanos_int.add_checked(nanos_frac)?; - - Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) - } - IntervalUnit::Minute => { - let nanos_int = amount.integer.mul_checked(NANOS_PER_MINUTE)?; - let nanos_frac = amount.frac * 6 / 10_i64.pow(INTERVAL_PRECISION - 10); - - let nanos = nanos_int.add_checked(nanos_frac)?; - - Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) - } - IntervalUnit::Second => { - let nanos_int = amount.integer.mul_checked(NANOS_PER_SECOND)?; - let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 9); - let nanos = nanos_int.add_checked(nanos_frac)?; - - Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) - } - IntervalUnit::Millisecond => { - let nanos_int = amount.integer.mul_checked(NANOS_PER_MILLIS)?; - let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 6); - let nanos = nanos_int.add_checked(nanos_frac)?; - - Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) - } - IntervalUnit::Microsecond => { - let nanos_int = amount.integer.mul_checked(1_000)?; - let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 3); - let nanos = nanos_int.add_checked(nanos_frac)?; - - Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) - } - IntervalUnit::Nanosecond => { - let nanos_int = amount.integer; - let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION); - let nanos = nanos_int.add_checked(nanos_frac)?; - - Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) - } - }; - - Ok(result) - } -} - -struct IntervalParseConfig { - /// The default unit to use if none is specified - /// e.g. 
`INTERVAL 1` represents `INTERVAL 1 SECOND` when default_unit = IntervalType::Second - default_unit: IntervalUnit, -} - -impl IntervalParseConfig { - fn new(default_unit: IntervalUnit) -> Self { - Self { default_unit } - } -} - -/// parse the string into a vector of interval components i.e. (amount, unit) tuples -fn parse_interval_components( - value: &str, - config: &IntervalParseConfig, -) -> Result, ArrowError> { - let parts = value.split_whitespace(); - - let raw_amounts = parts.clone().step_by(2); - let raw_units = parts.skip(1).step_by(2); - - // parse amounts - let (amounts, invalid_amounts) = raw_amounts - .map(IntervalAmount::from_str) - .partition::, _>(Result::is_ok); - - // invalid amounts? - if !invalid_amounts.is_empty() { - return Err(ArrowError::NotYetImplemented(format!( - "Unsupported Interval Expression with value {value:?}" - ))); - } - - // parse units - let (units, invalid_units): (Vec<_>, Vec<_>) = raw_units - .clone() - .map(IntervalUnit::from_str) - .partition(Result::is_ok); - - // invalid units? - if !invalid_units.is_empty() { - return Err(ArrowError::ParseError(format!( - "Invalid input syntax for type interval: {value:?}" - ))); - } - - // collect parsed results - let amounts = amounts.into_iter().map(Result::unwrap).collect::>(); - let units = units.into_iter().map(Result::unwrap).collect::>(); - - // if only an amount is specified, use the default unit - if amounts.len() == 1 && units.is_empty() { - return Ok(vec![(amounts[0], config.default_unit)]); - }; - - // duplicate units? - let mut observed_interval_types = 0; - for (unit, raw_unit) in units.iter().zip(raw_units) { - if observed_interval_types & (*unit as u16) != 0 { - return Err(ArrowError::ParseError(format!( - "Invalid input syntax for type interval: {value:?}. 
Repeated type '{raw_unit}'", - ))); - } - - observed_interval_types |= *unit as u16; - } - - let result = amounts.iter().copied().zip(units.iter().copied()); - - Ok(result.collect::>()) -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::timezone::Tz; - use arrow_buffer::i256; - - #[test] - fn test_parse_nanos() { - assert_eq!(parse_nanos::<3, 0>(&[1, 2, 3]), 123_000_000); - assert_eq!(parse_nanos::<5, 0>(&[1, 2, 3, 4, 5]), 123_450_000); - assert_eq!(parse_nanos::<6, b'0'>(b"123456"), 123_456_000); - } - - #[test] - fn string_to_timestamp_timezone() { - // Explicit timezone - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855+00:00").unwrap() - ); - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855Z").unwrap() - ); - assert_eq!( - 1599572549000000000, - parse_timestamp("2020-09-08T13:42:29Z").unwrap() - ); // no fractional part - assert_eq!( - 1599590549190855000, - parse_timestamp("2020-09-08T13:42:29.190855-05:00").unwrap() - ); - } - - #[test] - fn string_to_timestamp_timezone_space() { - // Ensure space rather than T between time and date is accepted - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855+00:00").unwrap() - ); - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855Z").unwrap() - ); - assert_eq!( - 1599572549000000000, - parse_timestamp("2020-09-08 13:42:29Z").unwrap() - ); // no fractional part - assert_eq!( - 1599590549190855000, - parse_timestamp("2020-09-08 13:42:29.190855-05:00").unwrap() - ); - } - - #[test] - #[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function: mktime - fn string_to_timestamp_no_timezone() { - // This test is designed to succeed in regardless of the local - // timezone the test machine is running. 
Thus it is still - // somewhat susceptible to bugs in the use of chrono - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29.190855").unwrap() - ); - - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29.190855").unwrap() - ); - - // Also ensure that parsing timestamps with no fractional - // second part works as well - let naive_datetime_whole_secs = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_opt(13, 42, 29).unwrap(), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime_whole_secs.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29").unwrap() - ); - - assert_eq!( - naive_datetime_whole_secs.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29").unwrap() - ); - - // ensure without time work - // no time, should be the nano second at - // 2020-09-08 0:0:0 - let naive_datetime_no_time = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_opt(0, 0, 0).unwrap(), - ); - - assert_eq!( - naive_datetime_no_time.timestamp_nanos(), - parse_timestamp("2020-09-08").unwrap() - ) - } - - #[test] - fn string_to_timestamp_chrono() { - let cases = [ - "2020-09-08T13:42:29Z", - "1969-01-01T00:00:00.1Z", - "2020-09-08T12:00:12.12345678+00:00", - "2020-09-08T12:00:12+00:00", - "2020-09-08T12:00:12.1+00:00", - "2020-09-08T12:00:12.12+00:00", - "2020-09-08T12:00:12.123+00:00", - "2020-09-08T12:00:12.1234+00:00", - "2020-09-08T12:00:12.12345+00:00", - "2020-09-08T12:00:12.123456+00:00", - "2020-09-08T12:00:12.1234567+00:00", - "2020-09-08T12:00:12.12345678+00:00", - "2020-09-08T12:00:12.123456789+00:00", - "2020-09-08T12:00:12.12345678912z", - "2020-09-08T12:00:12.123456789123Z", - "2020-09-08T12:00:12.123456789123+02:00", - "2020-09-08T12:00:12.12345678912345Z", - "2020-09-08T12:00:12.1234567891234567+02:00", - "2020-09-08T12:00:60Z", - "2020-09-08T12:00:60.123Z", - "2020-09-08T12:00:60.123456+02:00", - "2020-09-08T12:00:60.1234567891234567+02:00", - "2020-09-08T12:00:60.999999999+02:00", - "2020-09-08t12:00:12.12345678+00:00", - "2020-09-08t12:00:12+00:00", - "2020-09-08t12:00:12Z", - ]; - - for case in cases { - let chrono = DateTime::parse_from_rfc3339(case).unwrap(); - let chrono_utc = chrono.with_timezone(&Utc); - - let custom = string_to_datetime(&Utc, case).unwrap(); - assert_eq!(chrono_utc, custom) - } - } - - #[test] - fn string_to_timestamp_naive() { - let cases = [ - "2018-11-13T17:11:10.011375885995", - "2030-12-04T17:11:10.123", - "2030-12-04T17:11:10.1234", - "2030-12-04T17:11:10.123456", - ]; - for case in cases { - let chrono = - NaiveDateTime::parse_from_str(case, "%Y-%m-%dT%H:%M:%S%.f").unwrap(); - let custom = string_to_datetime(&Utc, case).unwrap(); - assert_eq!(chrono, custom.naive_utc()) - } - } - - #[test] - fn string_to_timestamp_invalid() { - // Test parsing invalid formats - let cases = [ - ("", "timestamp must contain at least 10 characters"), - ("SS", "timestamp must contain at least 10 characters"), - ("Wed, 18 Feb 2015 23:16:09 GMT", "error parsing date"), - ("1997-01-31H09:26:56.123Z", "invalid timestamp separator"), - ("1997-01-31 09:26:56.123Z", "error parsing time"), - ("1997:01:31T09:26:56.123Z", "error parsing date"), - ("1997:1:31T09:26:56.123Z", "error parsing date"), - 
("1997-01-32T09:26:56.123Z", "error parsing date"), - ("1997-13-32T09:26:56.123Z", "error parsing date"), - ("1997-02-29T09:26:56.123Z", "error parsing date"), - ("2015-02-30T17:35:20-08:00", "error parsing date"), - ("1997-01-10T9:26:56.123Z", "error parsing time"), - ("2015-01-20T25:35:20-08:00", "error parsing time"), - ("1997-01-10T09:61:56.123Z", "error parsing time"), - ("1997-01-10T09:61:90.123Z", "error parsing time"), - ("1997-01-10T12:00:6.123Z", "error parsing time"), - ("1997-01-31T092656.123Z", "error parsing time"), - ("1997-01-10T12:00:06.", "error parsing time"), - ("1997-01-10T12:00:06. ", "error parsing time"), - ]; - - for (s, ctx) in cases { - let expected = - format!("Parser error: Error parsing timestamp from '{s}': {ctx}"); - let actual = string_to_datetime(&Utc, s).unwrap_err().to_string(); - assert_eq!(actual, expected) - } - } - - // Parse a timestamp to timestamp int with a useful human readable error message - fn parse_timestamp(s: &str) -> Result { - let result = string_to_timestamp_nanos(s); - if let Err(e) = &result { - eprintln!("Error parsing timestamp '{s}': {e:?}"); - } - result - } - - #[test] - fn string_without_timezone_to_timestamp() { - // string without timezone should always output the same regardless the local or session timezone - - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29.190855").unwrap() - ); - - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29.190855").unwrap() - ); - - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), - NaiveTime::from_hms_nano_opt(13, 42, 29, 0).unwrap(), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08T13:42:29").unwrap() - ); - - assert_eq!( - naive_datetime.timestamp_nanos(), - parse_timestamp("2020-09-08 13:42:29").unwrap() - ); - - let tz: Tz = "+02:00".parse().unwrap(); - let date = string_to_datetime(&tz, "2020-09-08 13:42:29").unwrap(); - let utc = date.naive_utc().to_string(); - assert_eq!(utc, "2020-09-08 11:42:29"); - let local = date.naive_local().to_string(); - assert_eq!(local, "2020-09-08 13:42:29"); - - let date = string_to_datetime(&tz, "2020-09-08 13:42:29Z").unwrap(); - let utc = date.naive_utc().to_string(); - assert_eq!(utc, "2020-09-08 13:42:29"); - let local = date.naive_local().to_string(); - assert_eq!(local, "2020-09-08 15:42:29"); - - let dt = - NaiveDateTime::parse_from_str("2020-09-08T13:42:29Z", "%Y-%m-%dT%H:%M:%SZ") - .unwrap(); - let local: Tz = "+08:00".parse().unwrap(); - - // Parsed as offset from UTC - let date = string_to_datetime(&local, "2020-09-08T13:42:29Z").unwrap(); - assert_eq!(dt, date.naive_utc()); - assert_ne!(dt, date.naive_local()); - - // Parsed as offset from local - let date = string_to_datetime(&local, "2020-09-08 13:42:29").unwrap(); - assert_eq!(dt, date.naive_local()); - assert_ne!(dt, date.naive_utc()); - } - - #[test] - fn parse_time64_nanos() { - assert_eq!( - Time64NanosecondType::parse("02:10:01.1234567899999999"), - Some(7_801_123_456_789) - ); - assert_eq!( - Time64NanosecondType::parse("02:10:01.1234567"), - Some(7_801_123_456_700) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01.1234567"), - Some(7_801_123_456_700) - ); - assert_eq!( - 
Time64NanosecondType::parse("12:10:01.123456789 AM"), - Some(601_123_456_789) - ); - assert_eq!( - Time64NanosecondType::parse("12:10:01.123456789 am"), - Some(601_123_456_789) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01.12345678 PM"), - Some(51_001_123_456_780) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01.12345678 pm"), - Some(51_001_123_456_780) - ); - assert_eq!( - Time64NanosecondType::parse("02:10:01"), - Some(7_801_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01"), - Some(7_801_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("12:10:01 AM"), - Some(601_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("12:10:01 am"), - Some(601_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01 PM"), - Some(51_001_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10:01 pm"), - Some(51_001_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("02:10"), - Some(7_800_000_000_000) - ); - assert_eq!(Time64NanosecondType::parse("2:10"), Some(7_800_000_000_000)); - assert_eq!( - Time64NanosecondType::parse("12:10 AM"), - Some(600_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("12:10 am"), - Some(600_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10 PM"), - Some(51_000_000_000_000) - ); - assert_eq!( - Time64NanosecondType::parse("2:10 pm"), - Some(51_000_000_000_000) - ); - - // parse directly as nanoseconds - assert_eq!(Time64NanosecondType::parse("1"), Some(1)); - - // leap second - assert_eq!( - Time64NanosecondType::parse("23:59:60"), - Some(86_400_000_000_000) - ); - - // custom format - assert_eq!( - Time64NanosecondType::parse_formatted( - "02 - 10 - 01 - .1234567", - "%H - %M - %S - %.f" - ), - Some(7_801_123_456_700) - ); - } - - #[test] - fn parse_time64_micros() { - // expected formats - assert_eq!( - Time64MicrosecondType::parse("02:10:01.1234"), - Some(7_801_123_400) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01.1234"), - Some(7_801_123_400) - ); - assert_eq!( - Time64MicrosecondType::parse("12:10:01.123456 AM"), - Some(601_123_456) - ); - assert_eq!( - Time64MicrosecondType::parse("12:10:01.123456 am"), - Some(601_123_456) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01.12345 PM"), - Some(51_001_123_450) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01.12345 pm"), - Some(51_001_123_450) - ); - assert_eq!( - Time64MicrosecondType::parse("02:10:01"), - Some(7_801_000_000) - ); - assert_eq!(Time64MicrosecondType::parse("2:10:01"), Some(7_801_000_000)); - assert_eq!( - Time64MicrosecondType::parse("12:10:01 AM"), - Some(601_000_000) - ); - assert_eq!( - Time64MicrosecondType::parse("12:10:01 am"), - Some(601_000_000) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01 PM"), - Some(51_001_000_000) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10:01 pm"), - Some(51_001_000_000) - ); - assert_eq!(Time64MicrosecondType::parse("02:10"), Some(7_800_000_000)); - assert_eq!(Time64MicrosecondType::parse("2:10"), Some(7_800_000_000)); - assert_eq!(Time64MicrosecondType::parse("12:10 AM"), Some(600_000_000)); - assert_eq!(Time64MicrosecondType::parse("12:10 am"), Some(600_000_000)); - assert_eq!( - Time64MicrosecondType::parse("2:10 PM"), - Some(51_000_000_000) - ); - assert_eq!( - Time64MicrosecondType::parse("2:10 pm"), - Some(51_000_000_000) - ); - - // parse directly as microseconds - assert_eq!(Time64MicrosecondType::parse("1"), Some(1)); - - // leap second - assert_eq!( - 
Time64MicrosecondType::parse("23:59:60"), - Some(86_400_000_000) - ); - - // custom format - assert_eq!( - Time64MicrosecondType::parse_formatted( - "02 - 10 - 01 - .1234", - "%H - %M - %S - %.f" - ), - Some(7_801_123_400) - ); - } - - #[test] - fn parse_time32_millis() { - // expected formats - assert_eq!(Time32MillisecondType::parse("02:10:01.1"), Some(7_801_100)); - assert_eq!(Time32MillisecondType::parse("2:10:01.1"), Some(7_801_100)); - assert_eq!( - Time32MillisecondType::parse("12:10:01.123 AM"), - Some(601_123) - ); - assert_eq!( - Time32MillisecondType::parse("12:10:01.123 am"), - Some(601_123) - ); - assert_eq!( - Time32MillisecondType::parse("2:10:01.12 PM"), - Some(51_001_120) - ); - assert_eq!( - Time32MillisecondType::parse("2:10:01.12 pm"), - Some(51_001_120) - ); - assert_eq!(Time32MillisecondType::parse("02:10:01"), Some(7_801_000)); - assert_eq!(Time32MillisecondType::parse("2:10:01"), Some(7_801_000)); - assert_eq!(Time32MillisecondType::parse("12:10:01 AM"), Some(601_000)); - assert_eq!(Time32MillisecondType::parse("12:10:01 am"), Some(601_000)); - assert_eq!(Time32MillisecondType::parse("2:10:01 PM"), Some(51_001_000)); - assert_eq!(Time32MillisecondType::parse("2:10:01 pm"), Some(51_001_000)); - assert_eq!(Time32MillisecondType::parse("02:10"), Some(7_800_000)); - assert_eq!(Time32MillisecondType::parse("2:10"), Some(7_800_000)); - assert_eq!(Time32MillisecondType::parse("12:10 AM"), Some(600_000)); - assert_eq!(Time32MillisecondType::parse("12:10 am"), Some(600_000)); - assert_eq!(Time32MillisecondType::parse("2:10 PM"), Some(51_000_000)); - assert_eq!(Time32MillisecondType::parse("2:10 pm"), Some(51_000_000)); - - // parse directly as milliseconds - assert_eq!(Time32MillisecondType::parse("1"), Some(1)); - - // leap second - assert_eq!(Time32MillisecondType::parse("23:59:60"), Some(86_400_000)); - - // custom format - assert_eq!( - Time32MillisecondType::parse_formatted( - "02 - 10 - 01 - .1", - "%H - %M - %S - %.f" - ), - Some(7_801_100) - ); - } - - #[test] - fn parse_time32_secs() { - // expected formats - assert_eq!(Time32SecondType::parse("02:10:01.1"), Some(7_801)); - assert_eq!(Time32SecondType::parse("02:10:01"), Some(7_801)); - assert_eq!(Time32SecondType::parse("2:10:01"), Some(7_801)); - assert_eq!(Time32SecondType::parse("12:10:01 AM"), Some(601)); - assert_eq!(Time32SecondType::parse("12:10:01 am"), Some(601)); - assert_eq!(Time32SecondType::parse("2:10:01 PM"), Some(51_001)); - assert_eq!(Time32SecondType::parse("2:10:01 pm"), Some(51_001)); - assert_eq!(Time32SecondType::parse("02:10"), Some(7_800)); - assert_eq!(Time32SecondType::parse("2:10"), Some(7_800)); - assert_eq!(Time32SecondType::parse("12:10 AM"), Some(600)); - assert_eq!(Time32SecondType::parse("12:10 am"), Some(600)); - assert_eq!(Time32SecondType::parse("2:10 PM"), Some(51_000)); - assert_eq!(Time32SecondType::parse("2:10 pm"), Some(51_000)); - - // parse directly as seconds - assert_eq!(Time32SecondType::parse("1"), Some(1)); - - // leap second - assert_eq!(Time32SecondType::parse("23:59:60"), Some(86400)); - - // custom format - assert_eq!( - Time32SecondType::parse_formatted("02 - 10 - 01", "%H - %M - %S"), - Some(7_801) - ); - } - - #[test] - fn test_string_to_time_invalid() { - let cases = [ - "25:00", - "9:00:", - "009:00", - "09:0:00", - "25:00:00", - "13:00 AM", - "13:00 PM", - "12:00. 
AM", - "09:0:00", - "09:01:0", - "09:01:1", - "9:1:0", - "09:01:0", - "1:00.123", - "1:00:00.123f", - " 9:00:00", - ":09:00", - "T9:00:00", - "AM", - ]; - for case in cases { - assert!(string_to_time(case).is_none(), "{case}"); - } - } - - #[test] - fn test_string_to_time_chrono() { - let cases = [ - ("1:00", "%H:%M"), - ("12:00", "%H:%M"), - ("13:00", "%H:%M"), - ("24:00", "%H:%M"), - ("1:00:00", "%H:%M:%S"), - ("12:00:30", "%H:%M:%S"), - ("13:00:59", "%H:%M:%S"), - ("24:00:60", "%H:%M:%S"), - ("09:00:00", "%H:%M:%S%.f"), - ("0:00:30.123456", "%H:%M:%S%.f"), - ("0:00 AM", "%I:%M %P"), - ("1:00 AM", "%I:%M %P"), - ("12:00 AM", "%I:%M %P"), - ("13:00 AM", "%I:%M %P"), - ("0:00 PM", "%I:%M %P"), - ("1:00 PM", "%I:%M %P"), - ("12:00 PM", "%I:%M %P"), - ("13:00 PM", "%I:%M %P"), - ("1:00 pM", "%I:%M %P"), - ("1:00 Pm", "%I:%M %P"), - ("1:00 aM", "%I:%M %P"), - ("1:00 Am", "%I:%M %P"), - ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), - ("1:00:30.123456789 PM", "%I:%M:%S%.f %P"), - ("1:00:30.123456789123 PM", "%I:%M:%S%.f %P"), - ("1:00:30.1234 PM", "%I:%M:%S%.f %P"), - ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), - ("1:00:30.123456789123456789 PM", "%I:%M:%S%.f %P"), - ("1:00:30.12F456 PM", "%I:%M:%S%.f %P"), - ]; - for (s, format) in cases { - let chrono = NaiveTime::parse_from_str(s, format).ok(); - let custom = string_to_time(s); - assert_eq!(chrono, custom, "{s}"); - } - } - - #[test] - fn test_parse_interval() { - let config = IntervalParseConfig::new(IntervalUnit::Month); - - assert_eq!( - Interval::new(1i32, 0i32, 0i64), - Interval::parse("1 month", &config).unwrap(), - ); - - assert_eq!( - Interval::new(2i32, 0i32, 0i64), - Interval::parse("2 month", &config).unwrap(), - ); - - assert_eq!( - Interval::new(-1i32, -18i32, -(NANOS_PER_DAY / 5)), - Interval::parse("-1.5 months -3.2 days", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, 15i32, 0), - Interval::parse("0.5 months", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, 15i32, 0), - Interval::parse(".5 months", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, -15i32, 0), - Interval::parse("-0.5 months", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, -15i32, 0), - Interval::parse("-.5 months", &config).unwrap(), - ); - - assert_eq!( - Interval::new(2i32, 10i32, 9 * NANOS_PER_HOUR), - Interval::parse("2.1 months 7.25 days 3 hours", &config).unwrap(), - ); - - assert_eq!( - Interval::parse("1 centurys 1 month", &config) - .unwrap_err() - .to_string(), - r#"Parser error: Invalid input syntax for type interval: "1 centurys 1 month""# - ); - - assert_eq!( - Interval::new(37i32, 0i32, 0i64), - Interval::parse("3 year 1 month", &config).unwrap(), - ); - - assert_eq!( - Interval::new(35i32, 0i32, 0i64), - Interval::parse("3 year -1 month", &config).unwrap(), - ); - - assert_eq!( - Interval::new(-37i32, 0i32, 0i64), - Interval::parse("-3 year -1 month", &config).unwrap(), - ); - - assert_eq!( - Interval::new(-35i32, 0i32, 0i64), - Interval::parse("-3 year 1 month", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, 5i32, 0i64), - Interval::parse("5 days", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, 7i32, 3 * NANOS_PER_HOUR), - Interval::parse("7 days 3 hours", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, 7i32, 5 * NANOS_PER_MINUTE), - Interval::parse("7 days 5 minutes", &config).unwrap(), - ); - - assert_eq!( - Interval::new(0i32, 7i32, -5 * NANOS_PER_MINUTE), - Interval::parse("7 days -5 minutes", &config).unwrap(), - ); - - assert_eq!( - 
Interval::new(0i32, -7i32, 5 * NANOS_PER_HOUR), - Interval::parse("-7 days 5 hours", &config).unwrap(), - ); - - assert_eq!( - Interval::new( - 0i32, - -7i32, - -5 * NANOS_PER_HOUR - 5 * NANOS_PER_MINUTE - 5 * NANOS_PER_SECOND - ), - Interval::parse("-7 days -5 hours -5 minutes -5 seconds", &config).unwrap(), - ); - - assert_eq!( - Interval::new(12i32, 0i32, 25 * NANOS_PER_MILLIS), - Interval::parse("1 year 25 millisecond", &config).unwrap(), - ); - - assert_eq!( - Interval::new( - 12i32, - 1i32, - (NANOS_PER_SECOND as f64 * 0.000000001_f64) as i64 - ), - Interval::parse("1 year 1 day 0.000000001 seconds", &config).unwrap(), - ); - - assert_eq!( - Interval::new(12i32, 1i32, NANOS_PER_MILLIS / 10), - Interval::parse("1 year 1 day 0.1 milliseconds", &config).unwrap(), - ); - - assert_eq!( - Interval::new(12i32, 1i32, 1000i64), - Interval::parse("1 year 1 day 1 microsecond", &config).unwrap(), - ); - - assert_eq!( - Interval::new(12i32, 1i32, 1i64), - Interval::parse("1 year 1 day 1 nanoseconds", &config).unwrap(), - ); - - assert_eq!( - Interval::new(1i32, 0i32, -NANOS_PER_SECOND), - Interval::parse("1 month -1 second", &config).unwrap(), - ); - - assert_eq!( - Interval::new(-13i32, -8i32, -NANOS_PER_HOUR - NANOS_PER_MINUTE - NANOS_PER_SECOND - (1.11_f64 * NANOS_PER_MILLIS as f64) as i64), - Interval::parse("-1 year -1 month -1 week -1 day -1 hour -1 minute -1 second -1.11 millisecond", &config).unwrap(), - ); - } - - #[test] - fn test_duplicate_interval_type() { - let config = IntervalParseConfig::new(IntervalUnit::Month); - - let err = Interval::parse("1 month 1 second 1 second", &config) - .expect_err("parsing interval should have failed"); - assert_eq!( - r#"ParseError("Invalid input syntax for type interval: \"1 month 1 second 1 second\". Repeated type 'second'")"#, - format!("{err:?}") - ); - - // test with singular and plural forms - let err = Interval::parse("1 century 2 centuries", &config) - .expect_err("parsing interval should have failed"); - assert_eq!( - r#"ParseError("Invalid input syntax for type interval: \"1 century 2 centuries\". 
Repeated type 'centuries'")"#, - format!("{err:?}") - ); - } - - #[test] - fn test_interval_amount_parsing() { - // integer - let result = IntervalAmount::from_str("123").unwrap(); - let expected = IntervalAmount::new(123, 0); - - assert_eq!(result, expected); - - // positive w/ fractional - let result = IntervalAmount::from_str("0.3").unwrap(); - let expected = IntervalAmount::new(0, 3 * 10_i64.pow(INTERVAL_PRECISION - 1)); - - assert_eq!(result, expected); - - // negative w/ fractional - let result = IntervalAmount::from_str("-3.5").unwrap(); - let expected = IntervalAmount::new(-3, -5 * 10_i64.pow(INTERVAL_PRECISION - 1)); - - assert_eq!(result, expected); - - // invalid: missing fractional - let result = IntervalAmount::from_str("3."); - assert!(result.is_err()); - - // invalid: sign in fractional - let result = IntervalAmount::from_str("3.-5"); - assert!(result.is_err()); - } - - #[test] - fn test_interval_precision() { - let config = IntervalParseConfig::new(IntervalUnit::Month); - - let result = Interval::parse("100000.1 days", &config).unwrap(); - let expected = Interval::new(0_i32, 100_000_i32, NANOS_PER_DAY / 10); - - assert_eq!(result, expected); - } - - #[test] - fn test_interval_addition() { - // add 4.1 centuries - let start = Interval::new(1, 2, 3); - let expected = Interval::new(4921, 2, 3); - - let result = start - .add( - IntervalAmount::new(4, 10_i64.pow(INTERVAL_PRECISION - 1)), - IntervalUnit::Century, - ) - .unwrap(); - - assert_eq!(result, expected); - - // add 10.25 decades - let start = Interval::new(1, 2, 3); - let expected = Interval::new(1231, 2, 3); - - let result = start - .add( - IntervalAmount::new(10, 25 * 10_i64.pow(INTERVAL_PRECISION - 2)), - IntervalUnit::Decade, - ) - .unwrap(); - - assert_eq!(result, expected); - - // add 30.3 years (reminder: Postgres logic does not spill to days/nanos when interval is larger than a month) - let start = Interval::new(1, 2, 3); - let expected = Interval::new(364, 2, 3); - - let result = start - .add( - IntervalAmount::new(30, 3 * 10_i64.pow(INTERVAL_PRECISION - 1)), - IntervalUnit::Year, - ) - .unwrap(); - - assert_eq!(result, expected); - - // add 1.5 months - let start = Interval::new(1, 2, 3); - let expected = Interval::new(2, 17, 3); - - let result = start - .add( - IntervalAmount::new(1, 5 * 10_i64.pow(INTERVAL_PRECISION - 1)), - IntervalUnit::Month, - ) - .unwrap(); - - assert_eq!(result, expected); - - // add -2 weeks - let start = Interval::new(1, 25, 3); - let expected = Interval::new(1, 11, 3); - - let result = start - .add(IntervalAmount::new(-2, 0), IntervalUnit::Week) - .unwrap(); - - assert_eq!(result, expected); - - // add 2.2 days - let start = Interval::new(12, 15, 3); - let expected = Interval::new(12, 17, 3 + 17_280 * NANOS_PER_SECOND); - - let result = start - .add( - IntervalAmount::new(2, 2 * 10_i64.pow(INTERVAL_PRECISION - 1)), - IntervalUnit::Day, - ) - .unwrap(); - - assert_eq!(result, expected); - - // add 12.5 hours - let start = Interval::new(1, 2, 3); - let expected = Interval::new(1, 2, 3 + 45_000 * NANOS_PER_SECOND); - - let result = start - .add( - IntervalAmount::new(12, 5 * 10_i64.pow(INTERVAL_PRECISION - 1)), - IntervalUnit::Hour, - ) - .unwrap(); - - assert_eq!(result, expected); - - // add -1.5 minutes - let start = Interval::new(0, 0, -3); - let expected = Interval::new(0, 0, -90_000_000_000 - 3); - - let result = start - .add( - IntervalAmount::new(-1, -5 * 10_i64.pow(INTERVAL_PRECISION - 1)), - IntervalUnit::Minute, - ) - .unwrap(); - - assert_eq!(result, expected); - } - - 
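For reference, the spill rule these interval-addition tests exercise can be seen in a minimal sketch (illustrative only; `Interval`, `IntervalParseConfig`, `IntervalUnit` and `NANOS_PER_HOUR` are the private helpers defined in this file): a fractional day spills into nanoseconds rather than rounding.

    let config = IntervalParseConfig::new(IntervalUnit::Month);
    let half_day = Interval::parse("0.5 days", &config).unwrap();
    assert_eq!(half_day, Interval::new(0, 0, 12 * NANOS_PER_HOUR)); // 0.5 day spills to 12 hours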
#[test] - fn string_to_timestamp_old() { - parse_timestamp("1677-06-14T07:29:01.256") - .map_err(|e| assert!(e.to_string().ends_with(ERR_NANOSECONDS_NOT_SUPPORTED))) - .unwrap_err(); - } - - #[test] - fn test_parse_decimal_with_parameter() { - let tests = [ - ("0", 0i128), - ("123.123", 123123i128), - ("123.1234", 123123i128), - ("123.1", 123100i128), - ("123", 123000i128), - ("-123.123", -123123i128), - ("-123.1234", -123123i128), - ("-123.1", -123100i128), - ("-123", -123000i128), - ("0.0000123", 0i128), - ("12.", 12000i128), - ("-12.", -12000i128), - ("00.1", 100i128), - ("-00.1", -100i128), - ("12345678912345678.1234", 12345678912345678123i128), - ("-12345678912345678.1234", -12345678912345678123i128), - ("99999999999999999.999", 99999999999999999999i128), - ("-99999999999999999.999", -99999999999999999999i128), - (".123", 123i128), - ("-.123", -123i128), - ("123.", 123000i128), - ("-123.", -123000i128), - ]; - for (s, i) in tests { - let result_128 = parse_decimal::(s, 20, 3); - assert_eq!(i, result_128.unwrap()); - let result_256 = parse_decimal::(s, 20, 3); - assert_eq!(i256::from_i128(i), result_256.unwrap()); - } - let can_not_parse_tests = ["123,123", ".", "123.123.123", "", "+", "-"]; - for s in can_not_parse_tests { - let result_128 = parse_decimal::(s, 20, 3); - assert_eq!( - format!("Parser error: can't parse the string value {s} to decimal"), - result_128.unwrap_err().to_string() - ); - let result_256 = parse_decimal::(s, 20, 3); - assert_eq!( - format!("Parser error: can't parse the string value {s} to decimal"), - result_256.unwrap_err().to_string() - ); - } - let overflow_parse_tests = ["12345678", "12345678.9", "99999999.99"]; - for s in overflow_parse_tests { - let result_128 = parse_decimal::(s, 10, 3); - let expected_128 = "Parser error: parse decimal overflow"; - let actual_128 = result_128.unwrap_err().to_string(); - - assert!( - actual_128.contains(expected_128), - "actual: '{actual_128}', expected: '{expected_128}'" - ); - - let result_256 = parse_decimal::(s, 10, 3); - let expected_256 = "Parser error: parse decimal overflow"; - let actual_256 = result_256.unwrap_err().to_string(); - - assert!( - actual_256.contains(expected_256), - "actual: '{actual_256}', expected: '{expected_256}'" - ); - } - - let edge_tests_128 = [ - ( - "99999999999999999999999999999999999999", - 99999999999999999999999999999999999999i128, - 0, - ), - ( - "999999999999999999999999999999999999.99", - 99999999999999999999999999999999999999i128, - 2, - ), - ( - "9999999999999999999999999.9999999999999", - 99999999999999999999999999999999999999i128, - 13, - ), - ( - "9999999999999999999999999", - 99999999999999999999999990000000000000i128, - 13, - ), - ( - "0.99999999999999999999999999999999999999", - 99999999999999999999999999999999999999i128, - 38, - ), - ]; - for (s, i, scale) in edge_tests_128 { - let result_128 = parse_decimal::(s, 38, scale); - assert_eq!(i, result_128.unwrap()); - } - let edge_tests_256 = [ - ( - "9999999999999999999999999999999999999999999999999999999999999999999999999999", -i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), - 0, - ), - ( - "999999999999999999999999999999999999999999999999999999999999999999999999.9999", - i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), - 4, - ), - ( - "99999999999999999999999999999999999999999999999999.99999999999999999999999999", - 
i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), - 26, - ), - ( - "99999999999999999999999999999999999999999999999999", - i256::from_string("9999999999999999999999999999999999999999999999999900000000000000000000000000").unwrap(), - 26, - ), - ]; - for (s, i, scale) in edge_tests_256 { - let result = parse_decimal::(s, 76, scale); - assert_eq!(i, result.unwrap()); - } - } -} +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::timezone::Tz; +use arrow_array::types::*; +use arrow_array::{ArrowNativeTypeOp, ArrowPrimitiveType}; +use arrow_buffer::ArrowNativeType; +use arrow_schema::ArrowError; +use chrono::prelude::*; +use half::f16; +use std::str::FromStr; + +/// Parse nanoseconds from the first `N` values in digits, subtracting the offset `O` +#[inline] +fn parse_nanos(digits: &[u8]) -> u32 { + digits[..N] + .iter() + .fold(0_u32, |acc, v| acc * 10 + v.wrapping_sub(O) as u32) + * 10_u32.pow((9 - N) as _) +} + +/// Helper for parsing RFC3339 timestamps +struct TimestampParser { + /// The timestamp bytes to parse minus `b'0'` + /// + /// This makes interpretation as an integer inexpensive + digits: [u8; 32], + /// A mask containing a `1` bit where the corresponding byte is a valid ASCII digit + mask: u32, +} + +impl TimestampParser { + fn new(bytes: &[u8]) -> Self { + let mut digits = [0; 32]; + let mut mask = 0; + + // Treating all bytes the same way, helps LLVM vectorise this correctly + for (idx, (o, i)) in digits.iter_mut().zip(bytes).enumerate() { + *o = i.wrapping_sub(b'0'); + mask |= ((*o < 10) as u32) << idx + } + + Self { digits, mask } + } + + /// Returns true if the byte at `idx` in the original string equals `b` + fn test(&self, idx: usize, b: u8) -> bool { + self.digits[idx] == b.wrapping_sub(b'0') + } + + /// Parses a date of the form `1997-01-31` + fn date(&self) -> Option { + if self.mask & 0b1111111111 != 0b1101101111 + || !self.test(4, b'-') + || !self.test(7, b'-') + { + return None; + } + + let year = self.digits[0] as u16 * 1000 + + self.digits[1] as u16 * 100 + + self.digits[2] as u16 * 10 + + self.digits[3] as u16; + + let month = self.digits[5] * 10 + self.digits[6]; + let day = self.digits[8] * 10 + self.digits[9]; + + NaiveDate::from_ymd_opt(year as _, month as _, day as _) + } + + /// Parses a time of any of forms + /// - `09:26:56` + /// - `09:26:56.123` + /// - `09:26:56.123456` + /// - `09:26:56.123456789` + /// - `092656` + /// + /// Returning the end byte offset + fn time(&self) -> Option<(NaiveTime, usize)> { + // Make a NaiveTime handling leap seconds + let time = |hour, min, sec, nano| match sec { + 60 => { + let nano = 1_000_000_000 + nano; + NaiveTime::from_hms_nano_opt(hour as _, min as _, 59, nano) + } + _ => 
NaiveTime::from_hms_nano_opt(hour as _, min as _, sec as _, nano), + }; + + match (self.mask >> 11) & 0b11111111 { + // 09:26:56 + 0b11011011 if self.test(13, b':') && self.test(16, b':') => { + let hour = self.digits[11] * 10 + self.digits[12]; + let minute = self.digits[14] * 10 + self.digits[15]; + let second = self.digits[17] * 10 + self.digits[18]; + + match self.test(19, b'.') { + true => { + let digits = (self.mask >> 20).trailing_ones(); + let nanos = match digits { + 0 => return None, + 1 => parse_nanos::<1, 0>(&self.digits[20..21]), + 2 => parse_nanos::<2, 0>(&self.digits[20..22]), + 3 => parse_nanos::<3, 0>(&self.digits[20..23]), + 4 => parse_nanos::<4, 0>(&self.digits[20..24]), + 5 => parse_nanos::<5, 0>(&self.digits[20..25]), + 6 => parse_nanos::<6, 0>(&self.digits[20..26]), + 7 => parse_nanos::<7, 0>(&self.digits[20..27]), + 8 => parse_nanos::<8, 0>(&self.digits[20..28]), + _ => parse_nanos::<9, 0>(&self.digits[20..29]), + }; + Some((time(hour, minute, second, nanos)?, 20 + digits as usize)) + } + false => Some((time(hour, minute, second, 0)?, 19)), + } + } + // 092656 + 0b111111 => { + let hour = self.digits[11] * 10 + self.digits[12]; + let minute = self.digits[13] * 10 + self.digits[14]; + let second = self.digits[15] * 10 + self.digits[16]; + let time = time(hour, minute, second, 0)?; + Some((time, 17)) + } + _ => None, + } + } +} + +/// Accepts a string and parses it relative to the provided `timezone` +/// +/// In addition to RFC3339 / ISO8601 standard timestamps, it also +/// accepts strings that use a space ` ` to separate the date and time +/// as well as strings that have no explicit timezone offset. +/// +/// Examples of accepted inputs: +/// * `1997-01-31T09:26:56.123Z` # RFC3339 +/// * `1997-01-31T09:26:56.123-05:00` # RFC3339 +/// * `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T +/// * `2023-01-01 04:05:06.789 -08` # close to RFC3339, space separator and hour-only offset +/// * `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified +/// * `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset +/// * `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds +/// * `1997-01-31 092656` # close to RFC3339, no fractional seconds +/// * `1997-01-31 092656+04:00` # close to RFC3339, no fractional seconds or time separator +/// * `1997-01-31` # close to RFC3339, only a date, no time +/// +/// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled +/// +/// * `2023-01-01 040506 America/Los_Angeles` +/// +/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error +/// will be returned +/// +/// Some formats supported by PostgreSQL +/// are not supported, such as +/// +/// * "2023-01-01 04:05:06.789 +07:30:00", +/// * "2023-01-01 040506 +07:30:00", +/// * "2023-01-01 04:05:06.789 PST", +/// +/// [IANA timezones]: https://www.iana.org/time-zones +pub fn string_to_datetime<T: TimeZone>( + timezone: &T, + s: &str, +) -> Result<DateTime<T>, ArrowError> { + let err = |ctx: &str| { + ArrowError::ParseError(format!("Error parsing timestamp from '{s}': {ctx}")) + }; + + let bytes = s.as_bytes(); + if bytes.len() < 10 { + return Err(err("timestamp must contain at least 10 characters")); + } + + let parser = TimestampParser::new(bytes); + let date = parser.date().ok_or_else(|| err("error parsing date"))?; + if bytes.len() == 10 { + let offset = timezone.offset_from_local_date(&date); + let offset = offset + .single() + .ok_or_else(|| err("error computing
timezone offset"))?; + + let time = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); + return Ok(DateTime::from_local(date.and_time(time), offset)); + } + + if !parser.test(10, b'T') && !parser.test(10, b't') && !parser.test(10, b' ') { + return Err(err("invalid timestamp separator")); + } + + let (time, mut tz_offset) = parser.time().ok_or_else(|| err("error parsing time"))?; + let datetime = date.and_time(time); + + if tz_offset == 32 { + // Decimal overrun + while tz_offset < bytes.len() && bytes[tz_offset].is_ascii_digit() { + tz_offset += 1; + } + } + + if bytes.len() <= tz_offset { + let offset = timezone.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; + return Ok(DateTime::from_local(datetime, offset)); + } + + if bytes[tz_offset] == b'z' || bytes[tz_offset] == b'Z' { + let offset = timezone.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; + return Ok(DateTime::from_utc(datetime, offset)); + } + + // Parse remainder of string as timezone + let parsed_tz: Tz = s[tz_offset..].trim_start().parse()?; + let offset = parsed_tz.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing timezone offset"))?; + Ok(DateTime::<Tz>::from_local(datetime, offset).with_timezone(timezone)) +} + +/// Accepts a string in RFC3339 / ISO8601 standard format and some +/// variants and converts it to a nanosecond precision timestamp. +/// +/// See [`string_to_datetime`] for the full set of supported formats +/// +/// Implements the `to_timestamp` function to convert a string to a +/// timestamp, following the model of Spark SQL's `to_timestamp`. +/// +/// Internally, this function uses the `chrono` library for the +/// datetime parsing +/// +/// We hope to extend this function in the future with a second +/// parameter specifying the format string. +/// +/// ## Timestamp Precision +/// +/// This function uses the maximum precision supported by Arrow +/// timestamps (nanoseconds stored as a 64-bit integer). This +/// means the range of dates that timestamps can represent is ~1677 AD +/// to 2262 AD +/// +/// ## Timezone / Offset Handling +/// +/// Numerical values of timestamps are stored as an offset from the UTC epoch.
+/// +/// This function interprets string without an explicit time zone as timestamps +/// relative to UTC, see [`string_to_datetime`] for alternative semantics +/// +/// In particular: +/// +/// ``` +/// # use arrow_cast::parse::string_to_timestamp_nanos; +/// // Note all three of these timestamps are parsed as the same value +/// let a = string_to_timestamp_nanos("1997-01-31 09:26:56.123Z").unwrap(); +/// let b = string_to_timestamp_nanos("1997-01-31T09:26:56.123").unwrap(); +/// let c = string_to_timestamp_nanos("1997-01-31T14:26:56.123+05:00").unwrap(); +/// +/// assert_eq!(a, b); +/// assert_eq!(b, c); +/// ``` +/// +#[inline] +pub fn string_to_timestamp_nanos(s: &str) -> Result { + to_timestamp_nanos(string_to_datetime(&Utc, s)?.naive_utc()) +} + +/// Defensive check to prevent chrono-rs panics when nanosecond conversion happens on non-supported dates +#[inline] +fn to_timestamp_nanos(dt: NaiveDateTime) -> Result { + if dt.timestamp().checked_mul(1_000_000_000).is_none() { + return Err(ArrowError::ParseError( + ERR_NANOSECONDS_NOT_SUPPORTED.to_string(), + )); + } + + Ok(dt.timestamp_nanos()) +} + +/// Accepts a string in ISO8601 standard format and some +/// variants and converts it to nanoseconds since midnight. +/// +/// Examples of accepted inputs: +/// * `09:26:56.123 AM` +/// * `23:59:59` +/// * `6:00 pm` +// +/// Internally, this function uses the `chrono` library for the +/// time parsing +/// +/// ## Timezone / Offset Handling +/// +/// This function does not support parsing strings with a timezone +/// or offset specified, as it considers only time since midnight. +pub fn string_to_time_nanoseconds(s: &str) -> Result { + let nt = string_to_time(s).ok_or_else(|| { + ArrowError::ParseError(format!("Failed to parse \'{s}\' as time")) + })?; + Ok(nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64) +} + +fn string_to_time(s: &str) -> Option { + let bytes = s.as_bytes(); + if bytes.len() < 4 { + return None; + } + + let (am, bytes) = match bytes.get(bytes.len() - 3..) { + Some(b" AM" | b" am" | b" Am" | b" aM") => { + (Some(true), &bytes[..bytes.len() - 3]) + } + Some(b" PM" | b" pm" | b" pM" | b" Pm") => { + (Some(false), &bytes[..bytes.len() - 3]) + } + _ => (None, bytes), + }; + + if bytes.len() < 4 { + return None; + } + + let mut digits = [b'0'; 6]; + + // Extract hour + let bytes = match (bytes[1], bytes[2]) { + (b':', _) => { + digits[1] = bytes[0]; + &bytes[2..] + } + (_, b':') => { + digits[0] = bytes[0]; + digits[1] = bytes[1]; + &bytes[3..] 
+ } + _ => return None, + }; + + if bytes.len() < 2 { + return None; // Minutes required + } + + // Extract minutes + digits[2] = bytes[0]; + digits[3] = bytes[1]; + + let nanoseconds = match bytes.get(2) { + Some(b':') => { + if bytes.len() < 5 { + return None; + } + + // Extract seconds + digits[4] = bytes[3]; + digits[5] = bytes[4]; + + // Extract sub-seconds if any + match bytes.get(5) { + Some(b'.') => { + let decimal = &bytes[6..]; + if decimal.iter().any(|x| !x.is_ascii_digit()) { + return None; + } + match decimal.len() { + 0 => return None, + 1 => parse_nanos::<1, b'0'>(decimal), + 2 => parse_nanos::<2, b'0'>(decimal), + 3 => parse_nanos::<3, b'0'>(decimal), + 4 => parse_nanos::<4, b'0'>(decimal), + 5 => parse_nanos::<5, b'0'>(decimal), + 6 => parse_nanos::<6, b'0'>(decimal), + 7 => parse_nanos::<7, b'0'>(decimal), + 8 => parse_nanos::<8, b'0'>(decimal), + _ => parse_nanos::<9, b'0'>(decimal), + } + } + Some(_) => return None, + None => 0, + } + } + Some(_) => return None, + None => 0, + }; + + digits.iter_mut().for_each(|x| *x = x.wrapping_sub(b'0')); + if digits.iter().any(|x| *x > 9) { + return None; + } + + let hour = match (digits[0] * 10 + digits[1], am) { + (12, Some(true)) => 0, // 12:00 AM -> 00:00 + (h @ 1..=11, Some(true)) => h, // 1:00 AM -> 01:00 + (12, Some(false)) => 12, // 12:00 PM -> 12:00 + (h @ 1..=11, Some(false)) => h + 12, // 1:00 PM -> 13:00 + (_, Some(_)) => return None, + (h, None) => h, + }; + + // Handle leap second + let (second, nanoseconds) = match digits[4] * 10 + digits[5] { + 60 => (59, nanoseconds + 1_000_000_000), + s => (s, nanoseconds), + }; + + NaiveTime::from_hms_nano_opt( + hour as _, + (digits[2] * 10 + digits[3]) as _, + second as _, + nanoseconds, + ) +} + +/// Specialized parsing implementations +/// used by csv and json reader +pub trait Parser: ArrowPrimitiveType { + fn parse(string: &str) -> Option; + + fn parse_formatted(string: &str, _format: &str) -> Option { + Self::parse(string) + } +} + +impl Parser for Float16Type { + fn parse(string: &str) -> Option { + lexical_core::parse(string.as_bytes()) + .ok() + .map(f16::from_f32) + } +} + +impl Parser for Float32Type { + fn parse(string: &str) -> Option { + lexical_core::parse(string.as_bytes()).ok() + } +} + +impl Parser for Float64Type { + fn parse(string: &str) -> Option { + lexical_core::parse(string.as_bytes()).ok() + } +} + +macro_rules! 
parser_primitive { + ($t:ty) => { + impl Parser for $t { + fn parse(string: &str) -> Option { + lexical_core::parse::(string.as_bytes()).ok() + } + } + }; +} +parser_primitive!(UInt64Type); +parser_primitive!(UInt32Type); +parser_primitive!(UInt16Type); +parser_primitive!(UInt8Type); +parser_primitive!(Int64Type); +parser_primitive!(Int32Type); +parser_primitive!(Int16Type); +parser_primitive!(Int8Type); + +impl Parser for TimestampNanosecondType { + fn parse(string: &str) -> Option { + string_to_timestamp_nanos(string).ok() + } +} + +impl Parser for TimestampMicrosecondType { + fn parse(string: &str) -> Option { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1000) + } +} + +impl Parser for TimestampMillisecondType { + fn parse(string: &str) -> Option { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1_000_000) + } +} + +impl Parser for TimestampSecondType { + fn parse(string: &str) -> Option { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1_000_000_000) + } +} + +impl Parser for Time64NanosecondType { + // Will truncate any fractions of a nanosecond + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + + nt.nanosecond() as i64, + ) + } +} + +impl Parser for Time64MicrosecondType { + // Will truncate any fractions of a microsecond + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .map(|nanos| nanos / 1_000) + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i64 * 1_000_000 + + nt.nanosecond() as i64 / 1_000, + ) + } +} + +impl Parser for Time32MillisecondType { + // Will truncate any fractions of a millisecond + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .map(|nanos| (nanos / 1_000_000) as i32) + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i32 * 1_000 + + nt.nanosecond() as i32 / 1_000_000, + ) + } +} + +impl Parser for Time32SecondType { + // Will truncate any fractions of a second + fn parse(string: &str) -> Option { + string_to_time_nanoseconds(string) + .ok() + .map(|nanos| (nanos / 1_000_000_000) as i32) + .or_else(|| string.parse::().ok()) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let nt = NaiveTime::parse_from_str(string, format).ok()?; + Some( + nt.num_seconds_from_midnight() as i32 + + nt.nanosecond() as i32 / 1_000_000_000, + ) + } +} + +/// Number of days between 0001-01-01 and 1970-01-01 +const EPOCH_DAYS_FROM_CE: i32 = 719_163; + +/// Error message if nanosecond conversion request beyond supported interval +const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; + +fn parse_date(string: &str) -> Option { + if string.len() > 10 { + return None; + } + let mut digits = [0; 10]; + let mut mask = 0; + + // Treating all bytes the same way, helps LLVM vectorise this correctly + for (idx, (o, i)) in 
digits.iter_mut().zip(string.bytes()).enumerate() { + *o = i.wrapping_sub(b'0'); + mask |= ((*o < 10) as u16) << idx + } + + const HYPHEN: u8 = b'-'.wrapping_sub(b'0'); + + if digits[4] != HYPHEN { + return None; + } + + let (month, day) = match mask { + 0b1101101111 => { + if digits[7] != HYPHEN { + return None; + } + (digits[5] * 10 + digits[6], digits[8] * 10 + digits[9]) + } + 0b101101111 => { + if digits[7] != HYPHEN { + return None; + } + (digits[5] * 10 + digits[6], digits[8]) + } + 0b110101111 => { + if digits[6] != HYPHEN { + return None; + } + (digits[5], digits[7] * 10 + digits[8]) + } + 0b10101111 => { + if digits[6] != HYPHEN { + return None; + } + (digits[5], digits[7]) + } + _ => return None, + }; + + let year = digits[0] as u16 * 1000 + + digits[1] as u16 * 100 + + digits[2] as u16 * 10 + + digits[3] as u16; + + NaiveDate::from_ymd_opt(year as _, month as _, day as _) +} + +impl Parser for Date32Type { + fn parse(string: &str) -> Option { + let date = parse_date(string)?; + Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + } + + fn parse_formatted(string: &str, format: &str) -> Option { + let date = NaiveDate::parse_from_str(string, format).ok()?; + Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + } +} + +impl Parser for Date64Type { + fn parse(string: &str) -> Option { + if string.len() <= 10 { + let date = parse_date(string)?; + Some(NaiveDateTime::new(date, NaiveTime::default()).timestamp_millis()) + } else { + let date_time = string_to_datetime(&Utc, string).ok()?; + Some(date_time.timestamp_millis()) + } + } + + fn parse_formatted(string: &str, format: &str) -> Option { + use chrono::format::Fixed; + use chrono::format::StrftimeItems; + let fmt = StrftimeItems::new(format); + let has_zone = fmt.into_iter().any(|item| match item { + chrono::format::Item::Fixed(fixed_item) => matches!( + fixed_item, + Fixed::RFC2822 + | Fixed::RFC3339 + | Fixed::TimezoneName + | Fixed::TimezoneOffsetColon + | Fixed::TimezoneOffsetColonZ + | Fixed::TimezoneOffset + | Fixed::TimezoneOffsetZ + ), + _ => false, + }); + if has_zone { + let date_time = chrono::DateTime::parse_from_str(string, format).ok()?; + Some(date_time.timestamp_millis()) + } else { + let date_time = NaiveDateTime::parse_from_str(string, format).ok()?; + Some(date_time.timestamp_millis()) + } + } +} + +/// Parse the string format decimal value to i128/i256 format and checking the precision and scale. +/// The result value can't be out of bounds. +pub fn parse_decimal( + s: &str, + precision: u8, + scale: i8, +) -> Result { + let mut result = T::Native::usize_as(0); + let mut fractionals = 0; + let mut digits = 0; + let base = T::Native::usize_as(10); + + let bs = s.as_bytes(); + let (bs, negative) = match bs.first() { + Some(b'-') => (&bs[1..], true), + Some(b'+') => (&bs[1..], false), + _ => (bs, false), + }; + + if bs.is_empty() { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + + let mut bs = bs.iter(); + // Overflow checks are not required if 10^(precision - 1) <= T::MAX holds. + // Thus, if we validate the precision correctly, we can skip overflow checks. + while let Some(b) = bs.next() { + match b { + b'0'..=b'9' => { + if digits == 0 && *b == b'0' { + // Ignore leading zeros. + continue; + } + digits += 1; + result = result.mul_wrapping(base); + result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); + } + b'.' 
=> { + for b in bs.by_ref() { + if !b.is_ascii_digit() { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + if fractionals == scale { + // We have processed all the digits that we need. All that + // is left is to validate that the rest of the string contains + // valid digits. + continue; + } + fractionals += 1; + digits += 1; + result = result.mul_wrapping(base); + result = + result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); + } + + // Fail on "." + if digits == 0 { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + } + _ => { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } + } + } + + if fractionals < scale { + let exp = scale - fractionals; + if exp as u8 + digits > precision { + return Err(ArrowError::ParseError("parse decimal overflow".to_string())); + } + let mul = base.pow_wrapping(exp as _); + result = result.mul_wrapping(mul); + } else if digits > precision { + return Err(ArrowError::ParseError("parse decimal overflow".to_string())); + } + + Ok(if negative { + result.neg_wrapping() + } else { + result + }) +} + +pub fn parse_interval_year_month( + value: &str, +) -> Result<::Native, ArrowError> { + let config = IntervalParseConfig::new(IntervalUnit::Year); + let interval = Interval::parse(value, &config)?; + + let months = interval.to_year_months().map_err(|_| ArrowError::CastError(format!( + "Cannot cast {value} to IntervalYearMonth. Only year and month fields are allowed." + )))?; + + Ok(IntervalYearMonthType::make_value(0, months)) +} + +pub fn parse_interval_day_time( + value: &str, +) -> Result<::Native, ArrowError> { + let config = IntervalParseConfig::new(IntervalUnit::Day); + let interval = Interval::parse(value, &config)?; + + let (days, millis) = interval.to_day_time().map_err(|_| ArrowError::CastError(format!( + "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds" + )))?; + + Ok(IntervalDayTimeType::make_value(days, millis)) +} + +pub fn parse_interval_month_day_nano( + value: &str, +) -> Result<::Native, ArrowError> { + let config = IntervalParseConfig::new(IntervalUnit::Month); + let interval = Interval::parse(value, &config)?; + + let (months, days, nanos) = interval.to_month_day_nanos(); + + Ok(IntervalMonthDayNanoType::make_value(months, days, nanos)) +} + +const NANOS_PER_MILLIS: i64 = 1_000_000; +const NANOS_PER_SECOND: i64 = 1_000 * NANOS_PER_MILLIS; +const NANOS_PER_MINUTE: i64 = 60 * NANOS_PER_SECOND; +const NANOS_PER_HOUR: i64 = 60 * NANOS_PER_MINUTE; +#[cfg(test)] +const NANOS_PER_DAY: i64 = 24 * NANOS_PER_HOUR; + +#[rustfmt::skip] +#[derive(Clone, Copy)] +#[repr(u16)] +enum IntervalUnit { + Century = 0b_0000_0000_0001, + Decade = 0b_0000_0000_0010, + Year = 0b_0000_0000_0100, + Month = 0b_0000_0000_1000, + Week = 0b_0000_0001_0000, + Day = 0b_0000_0010_0000, + Hour = 0b_0000_0100_0000, + Minute = 0b_0000_1000_0000, + Second = 0b_0001_0000_0000, + Millisecond = 0b_0010_0000_0000, + Microsecond = 0b_0100_0000_0000, + Nanosecond = 0b_1000_0000_0000, +} + +impl FromStr for IntervalUnit { + type Err = ArrowError; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "century" | "centuries" => Ok(Self::Century), + "decade" | "decades" => Ok(Self::Decade), + "year" | "years" => Ok(Self::Year), + "month" | "months" => Ok(Self::Month), + "week" | "weeks" => Ok(Self::Week), + "day" | "days" => Ok(Self::Day), + "hour" | "hours" 
=> Ok(Self::Hour), + "minute" | "minutes" => Ok(Self::Minute), + "second" | "seconds" => Ok(Self::Second), + "millisecond" | "milliseconds" => Ok(Self::Millisecond), + "microsecond" | "microseconds" => Ok(Self::Microsecond), + "nanosecond" | "nanoseconds" => Ok(Self::Nanosecond), + _ => Err(ArrowError::NotYetImplemented(format!( + "Unknown interval type: {s}" + ))), + } + } +} + +pub type MonthDayNano = (i32, i32, i64); + +/// Chosen based on the number of decimal digits in 1 week in nanoseconds +const INTERVAL_PRECISION: u32 = 15; + +#[derive(Clone, Copy, Debug, PartialEq)] +struct IntervalAmount { + /// The integer component of the interval amount + integer: i64, + /// The fractional component multiplied by 10^INTERVAL_PRECISION + frac: i64, +} + +#[cfg(test)] +impl IntervalAmount { + fn new(integer: i64, frac: i64) -> Self { + Self { integer, frac } + } +} + +impl FromStr for IntervalAmount { + type Err = ArrowError; + + fn from_str(s: &str) -> Result { + match s.split_once('.') { + Some((integer, frac)) + if frac.len() <= INTERVAL_PRECISION as usize + && !frac.is_empty() + && !frac.starts_with('-') => + { + // integer will be "" for values like ".5" + // and "-" for values like "-.5" + let explicit_neg = integer.starts_with('-'); + let integer = if integer.is_empty() || integer == "-" { + Ok(0) + } else { + integer.parse::().map_err(|_| { + ArrowError::ParseError(format!( + "Failed to parse {s} as interval amount" + )) + }) + }?; + + let frac_unscaled = frac.parse::().map_err(|_| { + ArrowError::ParseError(format!( + "Failed to parse {s} as interval amount" + )) + })?; + + // scale fractional part by interval precision + let frac = + frac_unscaled * 10_i64.pow(INTERVAL_PRECISION - frac.len() as u32); + + // propagate the sign of the integer part to the fractional part + let frac = if integer < 0 || explicit_neg { + -frac + } else { + frac + }; + + let result = Self { integer, frac }; + + Ok(result) + } + Some((_, frac)) if frac.starts_with('-') => Err(ArrowError::ParseError( + format!("Failed to parse {s} as interval amount"), + )), + Some((_, frac)) if frac.len() > INTERVAL_PRECISION as usize => { + Err(ArrowError::ParseError(format!( + "{s} exceeds the precision available for interval amount" + ))) + } + Some(_) | None => { + let integer = s.parse::().map_err(|_| { + ArrowError::ParseError(format!( + "Failed to parse {s} as interval amount" + )) + })?; + + let result = Self { integer, frac: 0 }; + Ok(result) + } + } + } +} + +#[derive(Debug, Default, PartialEq)] +struct Interval { + months: i32, + days: i32, + nanos: i64, +} + +impl Interval { + fn new(months: i32, days: i32, nanos: i64) -> Self { + Self { + months, + days, + nanos, + } + } + + fn to_year_months(&self) -> Result { + match (self.months, self.days, self.nanos) { + (months, days, nanos) if days == 0 && nanos == 0 => Ok(months), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Unable to represent interval with days and nanos as year-months: {:?}", + self + ))), + } + } + + fn to_day_time(&self) -> Result<(i32, i32), ArrowError> { + let days = self.months.mul_checked(30)?.add_checked(self.days)?; + + match self.nanos { + nanos if nanos % NANOS_PER_MILLIS == 0 => { + let millis = (self.nanos / 1_000_000).try_into().map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "Unable to represent {} nanos as milliseconds in a signed 32-bit integer", + self.nanos + )) + })?; + + Ok((days, millis)) + } + nanos => Err(ArrowError::InvalidArgumentError(format!( + "Unable to represent {nanos} as milliseconds" + ))), + } + } 
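+
+    // A minimal sketch of how the conversions above behave (illustrative only; these
+    // are private helpers in this file, not public API). `to_day_time` folds whole
+    // months into days at 30 days per month and requires the nanosecond component to
+    // be a whole number of milliseconds:
+    //
+    //     let i = Interval::new(1, 2, 5 * NANOS_PER_MILLIS);      // 1 month, 2 days, 5 ms
+    //     assert_eq!(i.to_day_time().unwrap(), (32, 5));          // 1 * 30 + 2 days, 5 millis
+    //     assert!(Interval::new(0, 0, 1).to_day_time().is_err()); // sub-millisecond nanos fail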
+ + fn to_month_day_nanos(&self) -> (i32, i32, i64) { + (self.months, self.days, self.nanos) + } + + /// Parse string value in traditional Postgres format such as + /// `1 year 2 months 3 days 4 hours 5 minutes 6 seconds` + fn parse(value: &str, config: &IntervalParseConfig) -> Result { + let components = parse_interval_components(value, config)?; + + let result = components.into_iter().fold( + Ok(Self::default()), + |result, (amount, unit)| match result { + Ok(result) => result.add(amount, unit), + Err(e) => Err(e), + }, + )?; + + Ok(result) + } + + /// Interval addition following Postgres behavior. Fractional units will be spilled into smaller units. + /// When the interval unit is larger than months, the result is rounded to total months and not spilled to days/nanos. + /// Fractional parts of weeks and days are represented using days and nanoseconds. + /// e.g. INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days + /// e.g. INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours + /// [Postgres reference](https://www.postgresql.org/docs/15/datatype-datetime.html#DATATYPE-INTERVAL-INPUT:~:text=Field%20values%20can,fractional%20on%20output.) + fn add( + &self, + amount: IntervalAmount, + unit: IntervalUnit, + ) -> Result { + let result = match unit { + IntervalUnit::Century => { + let months_int = amount.integer.mul_checked(100)?.mul_checked(12)?; + let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 2); + let months = + months_int + .add_checked(month_frac)? + .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} centuries as months in a signed 32-bit integer", + &amount.integer + )) + })?; + + Self::new(self.months.add_checked(months)?, self.days, self.nanos) + } + IntervalUnit::Decade => { + let months_int = amount.integer.mul_checked(10)?.mul_checked(12)?; + + let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 1); + let months = + months_int + .add_checked(month_frac)? + .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} decades as months in a signed 32-bit integer", + &amount.integer + )) + })?; + + Self::new(self.months.add_checked(months)?, self.days, self.nanos) + } + IntervalUnit::Year => { + let months_int = amount.integer.mul_checked(12)?; + let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION); + let months = + months_int + .add_checked(month_frac)? 
+ .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} years as months in a signed 32-bit integer", + &amount.integer + )) + })?; + + Self::new(self.months.add_checked(months)?, self.days, self.nanos) + } + IntervalUnit::Month => { + let months = amount.integer.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} months in a signed 32-bit integer", + &amount.integer + )) + })?; + + let days = amount.frac * 3 / 10_i64.pow(INTERVAL_PRECISION - 1); + let days = days.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} months as days in a signed 32-bit integer", + amount.frac / 10_i64.pow(INTERVAL_PRECISION) + )) + })?; + + Self::new( + self.months.add_checked(months)?, + self.days.add_checked(days)?, + self.nanos, + ) + } + IntervalUnit::Week => { + let days = amount.integer.mul_checked(7)?.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} weeks as days in a signed 32-bit integer", + &amount.integer + )) + })?; + + let nanos = + amount.frac * 7 * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + + Self::new( + self.months, + self.days.add_checked(days)?, + self.nanos.add_checked(nanos)?, + ) + } + IntervalUnit::Day => { + let days = amount.integer.try_into().map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "Unable to represent {} days in a signed 32-bit integer", + amount.integer + )) + })?; + + let nanos = + amount.frac * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + + Self::new( + self.months, + self.days.add_checked(days)?, + self.nanos.add_checked(nanos)?, + ) + } + IntervalUnit::Hour => { + let nanos_int = amount.integer.mul_checked(NANOS_PER_HOUR)?; + let nanos_frac = + amount.frac * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Minute => { + let nanos_int = amount.integer.mul_checked(NANOS_PER_MINUTE)?; + let nanos_frac = amount.frac * 6 / 10_i64.pow(INTERVAL_PRECISION - 10); + + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Second => { + let nanos_int = amount.integer.mul_checked(NANOS_PER_SECOND)?; + let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 9); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Millisecond => { + let nanos_int = amount.integer.mul_checked(NANOS_PER_MILLIS)?; + let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 6); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Microsecond => { + let nanos_int = amount.integer.mul_checked(1_000)?; + let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION - 3); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + IntervalUnit::Nanosecond => { + let nanos_int = amount.integer; + let nanos_frac = amount.frac / 10_i64.pow(INTERVAL_PRECISION); + let nanos = nanos_int.add_checked(nanos_frac)?; + + Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) + } + }; + + Ok(result) + } +} + +struct IntervalParseConfig { + /// The default unit to use if none is specified + /// e.g. 
`INTERVAL 1` represents `INTERVAL 1 SECOND` when default_unit = IntervalType::Second + default_unit: IntervalUnit, +} + +impl IntervalParseConfig { + fn new(default_unit: IntervalUnit) -> Self { + Self { default_unit } + } +} + +/// parse the string into a vector of interval components i.e. (amount, unit) tuples +fn parse_interval_components( + value: &str, + config: &IntervalParseConfig, +) -> Result, ArrowError> { + let parts = value.split_whitespace(); + + let raw_amounts = parts.clone().step_by(2); + let raw_units = parts.skip(1).step_by(2); + + // parse amounts + let (amounts, invalid_amounts) = raw_amounts + .map(IntervalAmount::from_str) + .partition::, _>(Result::is_ok); + + // invalid amounts? + if !invalid_amounts.is_empty() { + return Err(ArrowError::NotYetImplemented(format!( + "Unsupported Interval Expression with value {value:?}" + ))); + } + + // parse units + let (units, invalid_units): (Vec<_>, Vec<_>) = raw_units + .clone() + .map(IntervalUnit::from_str) + .partition(Result::is_ok); + + // invalid units? + if !invalid_units.is_empty() { + return Err(ArrowError::ParseError(format!( + "Invalid input syntax for type interval: {value:?}" + ))); + } + + // collect parsed results + let amounts = amounts.into_iter().map(Result::unwrap).collect::>(); + let units = units.into_iter().map(Result::unwrap).collect::>(); + + // if only an amount is specified, use the default unit + if amounts.len() == 1 && units.is_empty() { + return Ok(vec![(amounts[0], config.default_unit)]); + }; + + // duplicate units? + let mut observed_interval_types = 0; + for (unit, raw_unit) in units.iter().zip(raw_units) { + if observed_interval_types & (*unit as u16) != 0 { + return Err(ArrowError::ParseError(format!( + "Invalid input syntax for type interval: {value:?}. 
Repeated type '{raw_unit}'", + ))); + } + + observed_interval_types |= *unit as u16; + } + + let result = amounts.iter().copied().zip(units.iter().copied()); + + Ok(result.collect::>()) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::temporal_conversions::date32_to_datetime; + use arrow_array::timezone::Tz; + use arrow_buffer::i256; + + #[test] + fn test_parse_nanos() { + assert_eq!(parse_nanos::<3, 0>(&[1, 2, 3]), 123_000_000); + assert_eq!(parse_nanos::<5, 0>(&[1, 2, 3, 4, 5]), 123_450_000); + assert_eq!(parse_nanos::<6, b'0'>(b"123456"), 123_456_000); + } + + #[test] + fn string_to_timestamp_timezone() { + // Explicit timezone + assert_eq!( + 1599572549190855000, + parse_timestamp("2020-09-08T13:42:29.190855+00:00").unwrap() + ); + assert_eq!( + 1599572549190855000, + parse_timestamp("2020-09-08T13:42:29.190855Z").unwrap() + ); + assert_eq!( + 1599572549000000000, + parse_timestamp("2020-09-08T13:42:29Z").unwrap() + ); // no fractional part + assert_eq!( + 1599590549190855000, + parse_timestamp("2020-09-08T13:42:29.190855-05:00").unwrap() + ); + } + + #[test] + fn string_to_timestamp_timezone_space() { + // Ensure space rather than T between time and date is accepted + assert_eq!( + 1599572549190855000, + parse_timestamp("2020-09-08 13:42:29.190855+00:00").unwrap() + ); + assert_eq!( + 1599572549190855000, + parse_timestamp("2020-09-08 13:42:29.190855Z").unwrap() + ); + assert_eq!( + 1599572549000000000, + parse_timestamp("2020-09-08 13:42:29Z").unwrap() + ); // no fractional part + assert_eq!( + 1599590549190855000, + parse_timestamp("2020-09-08 13:42:29.190855-05:00").unwrap() + ); + } + + #[test] + #[cfg_attr(miri, ignore)] // unsupported operation: can't call foreign function: mktime + fn string_to_timestamp_no_timezone() { + // This test is designed to succeed in regardless of the local + // timezone the test machine is running. 
Thus it is still + // somewhat susceptible to bugs in the use of chrono + let naive_datetime = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29.190855").unwrap() + ); + + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29.190855").unwrap() + ); + + // Also ensure that parsing timestamps with no fractional + // second part works as well + let naive_datetime_whole_secs = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_opt(13, 42, 29).unwrap(), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime_whole_secs.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29").unwrap() + ); + + assert_eq!( + naive_datetime_whole_secs.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29").unwrap() + ); + + // ensure without time work + // no time, should be the nano second at + // 2020-09-08 0:0:0 + let naive_datetime_no_time = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_opt(0, 0, 0).unwrap(), + ); + + assert_eq!( + naive_datetime_no_time.timestamp_nanos(), + parse_timestamp("2020-09-08").unwrap() + ) + } + + #[test] + fn string_to_timestamp_chrono() { + let cases = [ + "2020-09-08T13:42:29Z", + "1969-01-01T00:00:00.1Z", + "2020-09-08T12:00:12.12345678+00:00", + "2020-09-08T12:00:12+00:00", + "2020-09-08T12:00:12.1+00:00", + "2020-09-08T12:00:12.12+00:00", + "2020-09-08T12:00:12.123+00:00", + "2020-09-08T12:00:12.1234+00:00", + "2020-09-08T12:00:12.12345+00:00", + "2020-09-08T12:00:12.123456+00:00", + "2020-09-08T12:00:12.1234567+00:00", + "2020-09-08T12:00:12.12345678+00:00", + "2020-09-08T12:00:12.123456789+00:00", + "2020-09-08T12:00:12.12345678912z", + "2020-09-08T12:00:12.123456789123Z", + "2020-09-08T12:00:12.123456789123+02:00", + "2020-09-08T12:00:12.12345678912345Z", + "2020-09-08T12:00:12.1234567891234567+02:00", + "2020-09-08T12:00:60Z", + "2020-09-08T12:00:60.123Z", + "2020-09-08T12:00:60.123456+02:00", + "2020-09-08T12:00:60.1234567891234567+02:00", + "2020-09-08T12:00:60.999999999+02:00", + "2020-09-08t12:00:12.12345678+00:00", + "2020-09-08t12:00:12+00:00", + "2020-09-08t12:00:12Z", + ]; + + for case in cases { + let chrono = DateTime::parse_from_rfc3339(case).unwrap(); + let chrono_utc = chrono.with_timezone(&Utc); + + let custom = string_to_datetime(&Utc, case).unwrap(); + assert_eq!(chrono_utc, custom) + } + } + + #[test] + fn string_to_timestamp_naive() { + let cases = [ + "2018-11-13T17:11:10.011375885995", + "2030-12-04T17:11:10.123", + "2030-12-04T17:11:10.1234", + "2030-12-04T17:11:10.123456", + ]; + for case in cases { + let chrono = + NaiveDateTime::parse_from_str(case, "%Y-%m-%dT%H:%M:%S%.f").unwrap(); + let custom = string_to_datetime(&Utc, case).unwrap(); + assert_eq!(chrono, custom.naive_utc()) + } + } + + #[test] + fn string_to_timestamp_invalid() { + // Test parsing invalid formats + let cases = [ + ("", "timestamp must contain at least 10 characters"), + ("SS", "timestamp must contain at least 10 characters"), + ("Wed, 18 Feb 2015 23:16:09 GMT", "error parsing date"), + ("1997-01-31H09:26:56.123Z", "invalid timestamp separator"), + ("1997-01-31 09:26:56.123Z", "error parsing time"), + ("1997:01:31T09:26:56.123Z", "error parsing date"), + ("1997:1:31T09:26:56.123Z", "error parsing date"), + 
("1997-01-32T09:26:56.123Z", "error parsing date"), + ("1997-13-32T09:26:56.123Z", "error parsing date"), + ("1997-02-29T09:26:56.123Z", "error parsing date"), + ("2015-02-30T17:35:20-08:00", "error parsing date"), + ("1997-01-10T9:26:56.123Z", "error parsing time"), + ("2015-01-20T25:35:20-08:00", "error parsing time"), + ("1997-01-10T09:61:56.123Z", "error parsing time"), + ("1997-01-10T09:61:90.123Z", "error parsing time"), + ("1997-01-10T12:00:6.123Z", "error parsing time"), + ("1997-01-31T092656.123Z", "error parsing time"), + ("1997-01-10T12:00:06.", "error parsing time"), + ("1997-01-10T12:00:06. ", "error parsing time"), + ]; + + for (s, ctx) in cases { + let expected = + format!("Parser error: Error parsing timestamp from '{s}': {ctx}"); + let actual = string_to_datetime(&Utc, s).unwrap_err().to_string(); + assert_eq!(actual, expected) + } + } + + // Parse a timestamp to timestamp int with a useful human readable error message + fn parse_timestamp(s: &str) -> Result { + let result = string_to_timestamp_nanos(s); + if let Err(e) = &result { + eprintln!("Error parsing timestamp '{s}': {e:?}"); + } + result + } + + #[test] + fn string_without_timezone_to_timestamp() { + // string without timezone should always output the same regardless the local or session timezone + + let naive_datetime = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_nano_opt(13, 42, 29, 190855000).unwrap(), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29.190855").unwrap() + ); + + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29.190855").unwrap() + ); + + let naive_datetime = NaiveDateTime::new( + NaiveDate::from_ymd_opt(2020, 9, 8).unwrap(), + NaiveTime::from_hms_nano_opt(13, 42, 29, 0).unwrap(), + ); + + // Ensure both T and ' ' variants work + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08T13:42:29").unwrap() + ); + + assert_eq!( + naive_datetime.timestamp_nanos(), + parse_timestamp("2020-09-08 13:42:29").unwrap() + ); + + let tz: Tz = "+02:00".parse().unwrap(); + let date = string_to_datetime(&tz, "2020-09-08 13:42:29").unwrap(); + let utc = date.naive_utc().to_string(); + assert_eq!(utc, "2020-09-08 11:42:29"); + let local = date.naive_local().to_string(); + assert_eq!(local, "2020-09-08 13:42:29"); + + let date = string_to_datetime(&tz, "2020-09-08 13:42:29Z").unwrap(); + let utc = date.naive_utc().to_string(); + assert_eq!(utc, "2020-09-08 13:42:29"); + let local = date.naive_local().to_string(); + assert_eq!(local, "2020-09-08 15:42:29"); + + let dt = + NaiveDateTime::parse_from_str("2020-09-08T13:42:29Z", "%Y-%m-%dT%H:%M:%SZ") + .unwrap(); + let local: Tz = "+08:00".parse().unwrap(); + + // Parsed as offset from UTC + let date = string_to_datetime(&local, "2020-09-08T13:42:29Z").unwrap(); + assert_eq!(dt, date.naive_utc()); + assert_ne!(dt, date.naive_local()); + + // Parsed as offset from local + let date = string_to_datetime(&local, "2020-09-08 13:42:29").unwrap(); + assert_eq!(dt, date.naive_local()); + assert_ne!(dt, date.naive_utc()); + } + + #[test] + fn parse_date32() { + let cases = [ + "2020-09-08", + "2020-9-8", + "2020-09-8", + "2020-9-08", + "2020-12-1", + "1690-2-5", + ]; + for case in cases { + let v = date32_to_datetime(Date32Type::parse(case).unwrap()).unwrap(); + let expected: NaiveDate = case.parse().unwrap(); + assert_eq!(v.date(), expected); + } + + let err_cases = [ + "", + "80-01-01", 
+ "342", + "Foo", + "2020-09-08-03", + "2020--04-03", + "2020--", + ]; + for case in err_cases { + assert_eq!(Date32Type::parse(case), None); + } + } + + #[test] + fn parse_time64_nanos() { + assert_eq!( + Time64NanosecondType::parse("02:10:01.1234567899999999"), + Some(7_801_123_456_789) + ); + assert_eq!( + Time64NanosecondType::parse("02:10:01.1234567"), + Some(7_801_123_456_700) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01.1234567"), + Some(7_801_123_456_700) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01.123456789 AM"), + Some(601_123_456_789) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01.123456789 am"), + Some(601_123_456_789) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01.12345678 PM"), + Some(51_001_123_456_780) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01.12345678 pm"), + Some(51_001_123_456_780) + ); + assert_eq!( + Time64NanosecondType::parse("02:10:01"), + Some(7_801_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01"), + Some(7_801_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01 AM"), + Some(601_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("12:10:01 am"), + Some(601_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01 PM"), + Some(51_001_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10:01 pm"), + Some(51_001_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("02:10"), + Some(7_800_000_000_000) + ); + assert_eq!(Time64NanosecondType::parse("2:10"), Some(7_800_000_000_000)); + assert_eq!( + Time64NanosecondType::parse("12:10 AM"), + Some(600_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("12:10 am"), + Some(600_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10 PM"), + Some(51_000_000_000_000) + ); + assert_eq!( + Time64NanosecondType::parse("2:10 pm"), + Some(51_000_000_000_000) + ); + + // parse directly as nanoseconds + assert_eq!(Time64NanosecondType::parse("1"), Some(1)); + + // leap second + assert_eq!( + Time64NanosecondType::parse("23:59:60"), + Some(86_400_000_000_000) + ); + + // custom format + assert_eq!( + Time64NanosecondType::parse_formatted( + "02 - 10 - 01 - .1234567", + "%H - %M - %S - %.f" + ), + Some(7_801_123_456_700) + ); + } + + #[test] + fn parse_time64_micros() { + // expected formats + assert_eq!( + Time64MicrosecondType::parse("02:10:01.1234"), + Some(7_801_123_400) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01.1234"), + Some(7_801_123_400) + ); + assert_eq!( + Time64MicrosecondType::parse("12:10:01.123456 AM"), + Some(601_123_456) + ); + assert_eq!( + Time64MicrosecondType::parse("12:10:01.123456 am"), + Some(601_123_456) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01.12345 PM"), + Some(51_001_123_450) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01.12345 pm"), + Some(51_001_123_450) + ); + assert_eq!( + Time64MicrosecondType::parse("02:10:01"), + Some(7_801_000_000) + ); + assert_eq!(Time64MicrosecondType::parse("2:10:01"), Some(7_801_000_000)); + assert_eq!( + Time64MicrosecondType::parse("12:10:01 AM"), + Some(601_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("12:10:01 am"), + Some(601_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01 PM"), + Some(51_001_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10:01 pm"), + Some(51_001_000_000) + ); + assert_eq!(Time64MicrosecondType::parse("02:10"), Some(7_800_000_000)); + assert_eq!(Time64MicrosecondType::parse("2:10"), 
Some(7_800_000_000)); + assert_eq!(Time64MicrosecondType::parse("12:10 AM"), Some(600_000_000)); + assert_eq!(Time64MicrosecondType::parse("12:10 am"), Some(600_000_000)); + assert_eq!( + Time64MicrosecondType::parse("2:10 PM"), + Some(51_000_000_000) + ); + assert_eq!( + Time64MicrosecondType::parse("2:10 pm"), + Some(51_000_000_000) + ); + + // parse directly as microseconds + assert_eq!(Time64MicrosecondType::parse("1"), Some(1)); + + // leap second + assert_eq!( + Time64MicrosecondType::parse("23:59:60"), + Some(86_400_000_000) + ); + + // custom format + assert_eq!( + Time64MicrosecondType::parse_formatted( + "02 - 10 - 01 - .1234", + "%H - %M - %S - %.f" + ), + Some(7_801_123_400) + ); + } + + #[test] + fn parse_time32_millis() { + // expected formats + assert_eq!(Time32MillisecondType::parse("02:10:01.1"), Some(7_801_100)); + assert_eq!(Time32MillisecondType::parse("2:10:01.1"), Some(7_801_100)); + assert_eq!( + Time32MillisecondType::parse("12:10:01.123 AM"), + Some(601_123) + ); + assert_eq!( + Time32MillisecondType::parse("12:10:01.123 am"), + Some(601_123) + ); + assert_eq!( + Time32MillisecondType::parse("2:10:01.12 PM"), + Some(51_001_120) + ); + assert_eq!( + Time32MillisecondType::parse("2:10:01.12 pm"), + Some(51_001_120) + ); + assert_eq!(Time32MillisecondType::parse("02:10:01"), Some(7_801_000)); + assert_eq!(Time32MillisecondType::parse("2:10:01"), Some(7_801_000)); + assert_eq!(Time32MillisecondType::parse("12:10:01 AM"), Some(601_000)); + assert_eq!(Time32MillisecondType::parse("12:10:01 am"), Some(601_000)); + assert_eq!(Time32MillisecondType::parse("2:10:01 PM"), Some(51_001_000)); + assert_eq!(Time32MillisecondType::parse("2:10:01 pm"), Some(51_001_000)); + assert_eq!(Time32MillisecondType::parse("02:10"), Some(7_800_000)); + assert_eq!(Time32MillisecondType::parse("2:10"), Some(7_800_000)); + assert_eq!(Time32MillisecondType::parse("12:10 AM"), Some(600_000)); + assert_eq!(Time32MillisecondType::parse("12:10 am"), Some(600_000)); + assert_eq!(Time32MillisecondType::parse("2:10 PM"), Some(51_000_000)); + assert_eq!(Time32MillisecondType::parse("2:10 pm"), Some(51_000_000)); + + // parse directly as milliseconds + assert_eq!(Time32MillisecondType::parse("1"), Some(1)); + + // leap second + assert_eq!(Time32MillisecondType::parse("23:59:60"), Some(86_400_000)); + + // custom format + assert_eq!( + Time32MillisecondType::parse_formatted( + "02 - 10 - 01 - .1", + "%H - %M - %S - %.f" + ), + Some(7_801_100) + ); + } + + #[test] + fn parse_time32_secs() { + // expected formats + assert_eq!(Time32SecondType::parse("02:10:01.1"), Some(7_801)); + assert_eq!(Time32SecondType::parse("02:10:01"), Some(7_801)); + assert_eq!(Time32SecondType::parse("2:10:01"), Some(7_801)); + assert_eq!(Time32SecondType::parse("12:10:01 AM"), Some(601)); + assert_eq!(Time32SecondType::parse("12:10:01 am"), Some(601)); + assert_eq!(Time32SecondType::parse("2:10:01 PM"), Some(51_001)); + assert_eq!(Time32SecondType::parse("2:10:01 pm"), Some(51_001)); + assert_eq!(Time32SecondType::parse("02:10"), Some(7_800)); + assert_eq!(Time32SecondType::parse("2:10"), Some(7_800)); + assert_eq!(Time32SecondType::parse("12:10 AM"), Some(600)); + assert_eq!(Time32SecondType::parse("12:10 am"), Some(600)); + assert_eq!(Time32SecondType::parse("2:10 PM"), Some(51_000)); + assert_eq!(Time32SecondType::parse("2:10 pm"), Some(51_000)); + + // parse directly as seconds + assert_eq!(Time32SecondType::parse("1"), Some(1)); + + // leap second + assert_eq!(Time32SecondType::parse("23:59:60"), Some(86400)); + + // custom 
format + assert_eq!( + Time32SecondType::parse_formatted("02 - 10 - 01", "%H - %M - %S"), + Some(7_801) + ); + } + + #[test] + fn test_string_to_time_invalid() { + let cases = [ + "25:00", + "9:00:", + "009:00", + "09:0:00", + "25:00:00", + "13:00 AM", + "13:00 PM", + "12:00. AM", + "09:0:00", + "09:01:0", + "09:01:1", + "9:1:0", + "09:01:0", + "1:00.123", + "1:00:00.123f", + " 9:00:00", + ":09:00", + "T9:00:00", + "AM", + ]; + for case in cases { + assert!(string_to_time(case).is_none(), "{case}"); + } + } + + #[test] + fn test_string_to_time_chrono() { + let cases = [ + ("1:00", "%H:%M"), + ("12:00", "%H:%M"), + ("13:00", "%H:%M"), + ("24:00", "%H:%M"), + ("1:00:00", "%H:%M:%S"), + ("12:00:30", "%H:%M:%S"), + ("13:00:59", "%H:%M:%S"), + ("24:00:60", "%H:%M:%S"), + ("09:00:00", "%H:%M:%S%.f"), + ("0:00:30.123456", "%H:%M:%S%.f"), + ("0:00 AM", "%I:%M %P"), + ("1:00 AM", "%I:%M %P"), + ("12:00 AM", "%I:%M %P"), + ("13:00 AM", "%I:%M %P"), + ("0:00 PM", "%I:%M %P"), + ("1:00 PM", "%I:%M %P"), + ("12:00 PM", "%I:%M %P"), + ("13:00 PM", "%I:%M %P"), + ("1:00 pM", "%I:%M %P"), + ("1:00 Pm", "%I:%M %P"), + ("1:00 aM", "%I:%M %P"), + ("1:00 Am", "%I:%M %P"), + ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456789 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456789123 PM", "%I:%M:%S%.f %P"), + ("1:00:30.1234 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456 PM", "%I:%M:%S%.f %P"), + ("1:00:30.123456789123456789 PM", "%I:%M:%S%.f %P"), + ("1:00:30.12F456 PM", "%I:%M:%S%.f %P"), + ]; + for (s, format) in cases { + let chrono = NaiveTime::parse_from_str(s, format).ok(); + let custom = string_to_time(s); + assert_eq!(chrono, custom, "{s}"); + } + } + + #[test] + fn test_parse_interval() { + let config = IntervalParseConfig::new(IntervalUnit::Month); + + assert_eq!( + Interval::new(1i32, 0i32, 0i64), + Interval::parse("1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(2i32, 0i32, 0i64), + Interval::parse("2 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(-1i32, -18i32, -(NANOS_PER_DAY / 5)), + Interval::parse("-1.5 months -3.2 days", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 15i32, 0), + Interval::parse("0.5 months", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 15i32, 0), + Interval::parse(".5 months", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, -15i32, 0), + Interval::parse("-0.5 months", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, -15i32, 0), + Interval::parse("-.5 months", &config).unwrap(), + ); + + assert_eq!( + Interval::new(2i32, 10i32, 9 * NANOS_PER_HOUR), + Interval::parse("2.1 months 7.25 days 3 hours", &config).unwrap(), + ); + + assert_eq!( + Interval::parse("1 centurys 1 month", &config) + .unwrap_err() + .to_string(), + r#"Parser error: Invalid input syntax for type interval: "1 centurys 1 month""# + ); + + assert_eq!( + Interval::new(37i32, 0i32, 0i64), + Interval::parse("3 year 1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(35i32, 0i32, 0i64), + Interval::parse("3 year -1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(-37i32, 0i32, 0i64), + Interval::parse("-3 year -1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(-35i32, 0i32, 0i64), + Interval::parse("-3 year 1 month", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 5i32, 0i64), + Interval::parse("5 days", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 7i32, 3 * NANOS_PER_HOUR), + Interval::parse("7 days 3 hours", &config).unwrap(), + ); + + assert_eq!( 
+ Interval::new(0i32, 7i32, 5 * NANOS_PER_MINUTE), + Interval::parse("7 days 5 minutes", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, 7i32, -5 * NANOS_PER_MINUTE), + Interval::parse("7 days -5 minutes", &config).unwrap(), + ); + + assert_eq!( + Interval::new(0i32, -7i32, 5 * NANOS_PER_HOUR), + Interval::parse("-7 days 5 hours", &config).unwrap(), + ); + + assert_eq!( + Interval::new( + 0i32, + -7i32, + -5 * NANOS_PER_HOUR - 5 * NANOS_PER_MINUTE - 5 * NANOS_PER_SECOND + ), + Interval::parse("-7 days -5 hours -5 minutes -5 seconds", &config).unwrap(), + ); + + assert_eq!( + Interval::new(12i32, 0i32, 25 * NANOS_PER_MILLIS), + Interval::parse("1 year 25 millisecond", &config).unwrap(), + ); + + assert_eq!( + Interval::new( + 12i32, + 1i32, + (NANOS_PER_SECOND as f64 * 0.000000001_f64) as i64 + ), + Interval::parse("1 year 1 day 0.000000001 seconds", &config).unwrap(), + ); + + assert_eq!( + Interval::new(12i32, 1i32, NANOS_PER_MILLIS / 10), + Interval::parse("1 year 1 day 0.1 milliseconds", &config).unwrap(), + ); + + assert_eq!( + Interval::new(12i32, 1i32, 1000i64), + Interval::parse("1 year 1 day 1 microsecond", &config).unwrap(), + ); + + assert_eq!( + Interval::new(12i32, 1i32, 1i64), + Interval::parse("1 year 1 day 1 nanoseconds", &config).unwrap(), + ); + + assert_eq!( + Interval::new(1i32, 0i32, -NANOS_PER_SECOND), + Interval::parse("1 month -1 second", &config).unwrap(), + ); + + assert_eq!( + Interval::new(-13i32, -8i32, -NANOS_PER_HOUR - NANOS_PER_MINUTE - NANOS_PER_SECOND - (1.11_f64 * NANOS_PER_MILLIS as f64) as i64), + Interval::parse("-1 year -1 month -1 week -1 day -1 hour -1 minute -1 second -1.11 millisecond", &config).unwrap(), + ); + } + + #[test] + fn test_duplicate_interval_type() { + let config = IntervalParseConfig::new(IntervalUnit::Month); + + let err = Interval::parse("1 month 1 second 1 second", &config) + .expect_err("parsing interval should have failed"); + assert_eq!( + r#"ParseError("Invalid input syntax for type interval: \"1 month 1 second 1 second\". Repeated type 'second'")"#, + format!("{err:?}") + ); + + // test with singular and plural forms + let err = Interval::parse("1 century 2 centuries", &config) + .expect_err("parsing interval should have failed"); + assert_eq!( + r#"ParseError("Invalid input syntax for type interval: \"1 century 2 centuries\". 
Repeated type 'centuries'")"#, + format!("{err:?}") + ); + } + + #[test] + fn test_interval_amount_parsing() { + // integer + let result = IntervalAmount::from_str("123").unwrap(); + let expected = IntervalAmount::new(123, 0); + + assert_eq!(result, expected); + + // positive w/ fractional + let result = IntervalAmount::from_str("0.3").unwrap(); + let expected = IntervalAmount::new(0, 3 * 10_i64.pow(INTERVAL_PRECISION - 1)); + + assert_eq!(result, expected); + + // negative w/ fractional + let result = IntervalAmount::from_str("-3.5").unwrap(); + let expected = IntervalAmount::new(-3, -5 * 10_i64.pow(INTERVAL_PRECISION - 1)); + + assert_eq!(result, expected); + + // invalid: missing fractional + let result = IntervalAmount::from_str("3."); + assert!(result.is_err()); + + // invalid: sign in fractional + let result = IntervalAmount::from_str("3.-5"); + assert!(result.is_err()); + } + + #[test] + fn test_interval_precision() { + let config = IntervalParseConfig::new(IntervalUnit::Month); + + let result = Interval::parse("100000.1 days", &config).unwrap(); + let expected = Interval::new(0_i32, 100_000_i32, NANOS_PER_DAY / 10); + + assert_eq!(result, expected); + } + + #[test] + fn test_interval_addition() { + // add 4.1 centuries + let start = Interval::new(1, 2, 3); + let expected = Interval::new(4921, 2, 3); + + let result = start + .add( + IntervalAmount::new(4, 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Century, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add 10.25 decades + let start = Interval::new(1, 2, 3); + let expected = Interval::new(1231, 2, 3); + + let result = start + .add( + IntervalAmount::new(10, 25 * 10_i64.pow(INTERVAL_PRECISION - 2)), + IntervalUnit::Decade, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add 30.3 years (reminder: Postgres logic does not spill to days/nanos when interval is larger than a month) + let start = Interval::new(1, 2, 3); + let expected = Interval::new(364, 2, 3); + + let result = start + .add( + IntervalAmount::new(30, 3 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Year, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add 1.5 months + let start = Interval::new(1, 2, 3); + let expected = Interval::new(2, 17, 3); + + let result = start + .add( + IntervalAmount::new(1, 5 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Month, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add -2 weeks + let start = Interval::new(1, 25, 3); + let expected = Interval::new(1, 11, 3); + + let result = start + .add(IntervalAmount::new(-2, 0), IntervalUnit::Week) + .unwrap(); + + assert_eq!(result, expected); + + // add 2.2 days + let start = Interval::new(12, 15, 3); + let expected = Interval::new(12, 17, 3 + 17_280 * NANOS_PER_SECOND); + + let result = start + .add( + IntervalAmount::new(2, 2 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Day, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add 12.5 hours + let start = Interval::new(1, 2, 3); + let expected = Interval::new(1, 2, 3 + 45_000 * NANOS_PER_SECOND); + + let result = start + .add( + IntervalAmount::new(12, 5 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Hour, + ) + .unwrap(); + + assert_eq!(result, expected); + + // add -1.5 minutes + let start = Interval::new(0, 0, -3); + let expected = Interval::new(0, 0, -90_000_000_000 - 3); + + let result = start + .add( + IntervalAmount::new(-1, -5 * 10_i64.pow(INTERVAL_PRECISION - 1)), + IntervalUnit::Minute, + ) + .unwrap(); + + assert_eq!(result, expected); + } + + 
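To make the spill-over rule from the `add` doc comment concrete, a short sketch (again assuming this file's private `Interval`, `IntervalParseConfig` and `IntervalUnit` items): fractional units larger than a month round to whole months, while fractional months and smaller units spill downward into days and nanoseconds.

    let config = IntervalParseConfig::new(IntervalUnit::Second);

    // 1.5 years rounds to 18 months; nothing spills into days or nanos
    assert_eq!(Interval::parse("1.5 year", &config).unwrap(), Interval::new(18, 0, 0));

    // 1.5 months keeps 1 whole month and spills the fraction into 15 days
    assert_eq!(Interval::parse("1.5 months", &config).unwrap(), Interval::new(1, 15, 0));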
#[test] + fn string_to_timestamp_old() { + parse_timestamp("1677-06-14T07:29:01.256") + .map_err(|e| assert!(e.to_string().ends_with(ERR_NANOSECONDS_NOT_SUPPORTED))) + .unwrap_err(); + } + + #[test] + fn test_parse_decimal_with_parameter() { + let tests = [ + ("0", 0i128), + ("123.123", 123123i128), + ("123.1234", 123123i128), + ("123.1", 123100i128), + ("123", 123000i128), + ("-123.123", -123123i128), + ("-123.1234", -123123i128), + ("-123.1", -123100i128), + ("-123", -123000i128), + ("0.0000123", 0i128), + ("12.", 12000i128), + ("-12.", -12000i128), + ("00.1", 100i128), + ("-00.1", -100i128), + ("12345678912345678.1234", 12345678912345678123i128), + ("-12345678912345678.1234", -12345678912345678123i128), + ("99999999999999999.999", 99999999999999999999i128), + ("-99999999999999999.999", -99999999999999999999i128), + (".123", 123i128), + ("-.123", -123i128), + ("123.", 123000i128), + ("-123.", -123000i128), + ]; + for (s, i) in tests { + let result_128 = parse_decimal::(s, 20, 3); + assert_eq!(i, result_128.unwrap()); + let result_256 = parse_decimal::(s, 20, 3); + assert_eq!(i256::from_i128(i), result_256.unwrap()); + } + let can_not_parse_tests = ["123,123", ".", "123.123.123", "", "+", "-"]; + for s in can_not_parse_tests { + let result_128 = parse_decimal::(s, 20, 3); + assert_eq!( + format!("Parser error: can't parse the string value {s} to decimal"), + result_128.unwrap_err().to_string() + ); + let result_256 = parse_decimal::(s, 20, 3); + assert_eq!( + format!("Parser error: can't parse the string value {s} to decimal"), + result_256.unwrap_err().to_string() + ); + } + let overflow_parse_tests = ["12345678", "12345678.9", "99999999.99"]; + for s in overflow_parse_tests { + let result_128 = parse_decimal::(s, 10, 3); + let expected_128 = "Parser error: parse decimal overflow"; + let actual_128 = result_128.unwrap_err().to_string(); + + assert!( + actual_128.contains(expected_128), + "actual: '{actual_128}', expected: '{expected_128}'" + ); + + let result_256 = parse_decimal::(s, 10, 3); + let expected_256 = "Parser error: parse decimal overflow"; + let actual_256 = result_256.unwrap_err().to_string(); + + assert!( + actual_256.contains(expected_256), + "actual: '{actual_256}', expected: '{expected_256}'" + ); + } + + let edge_tests_128 = [ + ( + "99999999999999999999999999999999999999", + 99999999999999999999999999999999999999i128, + 0, + ), + ( + "999999999999999999999999999999999999.99", + 99999999999999999999999999999999999999i128, + 2, + ), + ( + "9999999999999999999999999.9999999999999", + 99999999999999999999999999999999999999i128, + 13, + ), + ( + "9999999999999999999999999", + 99999999999999999999999990000000000000i128, + 13, + ), + ( + "0.99999999999999999999999999999999999999", + 99999999999999999999999999999999999999i128, + 38, + ), + ]; + for (s, i, scale) in edge_tests_128 { + let result_128 = parse_decimal::(s, 38, scale); + assert_eq!(i, result_128.unwrap()); + } + let edge_tests_256 = [ + ( + "9999999999999999999999999999999999999999999999999999999999999999999999999999", +i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + 0, + ), + ( + "999999999999999999999999999999999999999999999999999999999999999999999999.9999", + i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + 4, + ), + ( + "99999999999999999999999999999999999999999999999999.99999999999999999999999999", + 
i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + 26, + ), + ( + "99999999999999999999999999999999999999999999999999", + i256::from_string("9999999999999999999999999999999999999999999999999900000000000000000000000000").unwrap(), + 26, + ), + ]; + for (s, i, scale) in edge_tests_256 { + let result = parse_decimal::(s, 76, scale); + assert_eq!(i, result.unwrap()); + } + } +} From 8f44472e5c773f0daec1965253143e94d14c55e5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 14 Jul 2023 13:33:15 -0400 Subject: [PATCH 1071/1411] Prepare arrow 44.0.0 (#4528) * Prepare arrow 44.0.0 * Final tweaks --- CHANGELOG-old.md | 47 +++++++++++++++++++++++ CHANGELOG.md | 64 +++++++++++++++----------------- Cargo.toml | 32 ++++++++-------- dev/release/update_change_log.sh | 4 +- 4 files changed, 95 insertions(+), 52 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 295728a67d3a..1d732ce6c022 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,53 @@ # Historical Changelog +## [43.0.0](https://github.com/apache/arrow-rs/tree/43.0.0) (2023-06-30) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/42.0.0...43.0.0) + +**Breaking changes:** + +- Simplify ffi import/export [\#4447](https://github.com/apache/arrow-rs/pull/4447) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Virgiel](https://github.com/Virgiel)) +- Return Result from Parquet Row APIs [\#4428](https://github.com/apache/arrow-rs/pull/4428) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) +- Remove Binary Dictionary Arithmetic Support [\#4407](https://github.com/apache/arrow-rs/pull/4407) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Request: a way to copy a `Row` to `Rows` [\#4466](https://github.com/apache/arrow-rs/issues/4466) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Reuse schema when importing from FFI [\#4444](https://github.com/apache/arrow-rs/issues/4444) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[FlightSQL\] Allow implementations of `FlightSqlService` to handle custom actions and commands [\#4439](https://github.com/apache/arrow-rs/issues/4439) +- Support `NullBuilder` [\#4429](https://github.com/apache/arrow-rs/issues/4429) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Regression in in parquet `42.0.0` : Bad parquet column indexes for All Null Columns, resulting in `Parquet error: StructArrayReader out of sync` on read [\#4459](https://github.com/apache/arrow-rs/issues/4459) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Regression in 42.0.0: Parsing fractional intervals without leading 0 is not supported [\#4424](https://github.com/apache/arrow-rs/issues/4424) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- doc: deploy crate docs to GitHub pages [\#4436](https://github.com/apache/arrow-rs/pull/4436) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xxchan](https://github.com/xxchan)) + +**Merged pull requests:** + +- Append Row to Rows \(\#4466\) [\#4470](https://github.com/apache/arrow-rs/pull/4470) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- 
feat\(flight-sql\): Allow implementations of FlightSqlService to handle custom actions and commands [\#4463](https://github.com/apache/arrow-rs/pull/4463) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([amartins23](https://github.com/amartins23)) +- Docs: Add clearer API doc links [\#4461](https://github.com/apache/arrow-rs/pull/4461) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Fix empty offset index for all null columns \(\#4459\) [\#4460](https://github.com/apache/arrow-rs/pull/4460) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Bump peaceiris/actions-gh-pages from 3.9.2 to 3.9.3 [\#4455](https://github.com/apache/arrow-rs/pull/4455) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Convince the compiler to auto-vectorize the range check in parquet DictionaryBuffer [\#4453](https://github.com/apache/arrow-rs/pull/4453) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jhorstmann](https://github.com/jhorstmann)) +- fix docs deployment [\#4452](https://github.com/apache/arrow-rs/pull/4452) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xxchan](https://github.com/xxchan)) +- Update indexmap requirement from 1.9 to 2.0 [\#4451](https://github.com/apache/arrow-rs/pull/4451) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update proc-macro2 requirement from =1.0.60 to =1.0.63 [\#4450](https://github.com/apache/arrow-rs/pull/4450) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bump actions/deploy-pages from 1 to 2 [\#4449](https://github.com/apache/arrow-rs/pull/4449) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Revise error message in From\ for ScalarBuffer [\#4446](https://github.com/apache/arrow-rs/pull/4446) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- minor: remove useless mut [\#4443](https://github.com/apache/arrow-rs/pull/4443) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- unify substring for binary&utf8 [\#4442](https://github.com/apache/arrow-rs/pull/4442) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Casting fixedsizelist to list/largelist [\#4433](https://github.com/apache/arrow-rs/pull/4433) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211)) +- feat: support `NullBuilder` [\#4430](https://github.com/apache/arrow-rs/pull/4430) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) +- Remove Float64 -\> Float32 cast in IPC Reader [\#4427](https://github.com/apache/arrow-rs/pull/4427) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ming08108](https://github.com/ming08108)) +- Parse intervals like `.5` the same as `0.5` [\#4425](https://github.com/apache/arrow-rs/pull/4425) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- feat: add strict mode to json reader [\#4421](https://github.com/apache/arrow-rs/pull/4421) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([blinkseb](https://github.com/blinkseb)) +- Add DictionaryArray::occupancy [\#4415](https://github.com/apache/arrow-rs/pull/4415) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) ## [42.0.0](https://github.com/apache/arrow-rs/tree/42.0.0) (2023-06-16) [Full Changelog](https://github.com/apache/arrow-rs/compare/41.0.0...42.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ed2f1420684..bef7a7c5cf43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,53 +19,49 @@ # Changelog -## [43.0.0](https://github.com/apache/arrow-rs/tree/43.0.0) (2023-06-30) +## [44.0.0](https://github.com/apache/arrow-rs/tree/44.0.0) (2023-07-14) -[Full Changelog](https://github.com/apache/arrow-rs/compare/42.0.0...43.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/43.0.0...44.0.0) **Breaking changes:** -- Simplify ffi import/export [\#4447](https://github.com/apache/arrow-rs/pull/4447) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Virgiel](https://github.com/Virgiel)) -- Return Result from Parquet Row APIs [\#4428](https://github.com/apache/arrow-rs/pull/4428) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) -- Remove Binary Dictionary Arithmetic Support [\#4407](https://github.com/apache/arrow-rs/pull/4407) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use Parser for cast kernel \(\#4512\) [\#4513](https://github.com/apache/arrow-rs/pull/4513) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add Datum based arithmetic kernels \(\#3999\) [\#4465](https://github.com/apache/arrow-rs/pull/4465) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Request: a way to copy a `Row` to `Rows` [\#4466](https://github.com/apache/arrow-rs/issues/4466) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Reuse schema when importing from FFI [\#4444](https://github.com/apache/arrow-rs/issues/4444) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[FlightSQL\] Allow implementations of `FlightSqlService` to handle custom actions and commands [\#4439](https://github.com/apache/arrow-rs/issues/4439) -- Support `NullBuilder` [\#4429](https://github.com/apache/arrow-rs/issues/4429) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- eq\_dyn\_binary\_scalar should support FixedSizeBinary types [\#4491](https://github.com/apache/arrow-rs/issues/4491) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Port Tests from Deprecated Arithmetic Kernels [\#4480](https://github.com/apache/arrow-rs/issues/4480) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement RecordBatchReader for Boxed trait object [\#4474](https://github.com/apache/arrow-rs/issues/4474) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `Date` - `Date` kernel [\#4383](https://github.com/apache/arrow-rs/issues/4383) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Default FlightSqlService Implementations [\#4372](https://github.com/apache/arrow-rs/issues/4372) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Fixed bugs:** -- Regression in in parquet `42.0.0` : Bad parquet column indexes for All Null Columns, resulting in `Parquet error: StructArrayReader out of sync` on read [\#4459](https://github.com/apache/arrow-rs/issues/4459) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Regression in 42.0.0: Parsing fractional intervals without leading 0 is not supported [\#4424](https://github.com/apache/arrow-rs/issues/4424) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -**Documentation updates:** - -- doc: deploy crate docs to GitHub pages [\#4436](https://github.com/apache/arrow-rs/pull/4436) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xxchan](https://github.com/xxchan)) +- Parquet: `AsyncArrowWriter` to a file corrupts the footer for large columns [\#4526](https://github.com/apache/arrow-rs/issues/4526) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- \[object\_store\] Failure to send bytes to azure [\#4522](https://github.com/apache/arrow-rs/issues/4522) +- Cannot cast string '2021-01-02' to value of Date64 type [\#4512](https://github.com/apache/arrow-rs/issues/4512) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Incorrect Interval Subtraction [\#4489](https://github.com/apache/arrow-rs/issues/4489) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Interval Negation Incorrect [\#4488](https://github.com/apache/arrow-rs/issues/4488) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet: AsyncArrowWriter inner buffer is not correctly limited and causes OOM [\#4477](https://github.com/apache/arrow-rs/issues/4477) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Merged pull requests:** -- Append Row to Rows \(\#4466\) [\#4470](https://github.com/apache/arrow-rs/pull/4470) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- feat\(flight-sql\): Allow implementations of FlightSqlService to handle custom actions and commands [\#4463](https://github.com/apache/arrow-rs/pull/4463) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([amartins23](https://github.com/amartins23)) -- Docs: Add clearer API doc links [\#4461](https://github.com/apache/arrow-rs/pull/4461) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Fix empty offset index for all null columns \(\#4459\) [\#4460](https://github.com/apache/arrow-rs/pull/4460) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Bump peaceiris/actions-gh-pages from 3.9.2 to 3.9.3 [\#4455](https://github.com/apache/arrow-rs/pull/4455) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Convince the compiler to auto-vectorize the range check in parquet DictionaryBuffer [\#4453](https://github.com/apache/arrow-rs/pull/4453) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jhorstmann](https://github.com/jhorstmann)) -- fix docs deployment [\#4452](https://github.com/apache/arrow-rs/pull/4452) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xxchan](https://github.com/xxchan)) -- Update indexmap requirement from 1.9 to 2.0 [\#4451](https://github.com/apache/arrow-rs/pull/4451) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Update proc-macro2 requirement from =1.0.60 to =1.0.63 [\#4450](https://github.com/apache/arrow-rs/pull/4450) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Bump actions/deploy-pages from 1 to 2 [\#4449](https://github.com/apache/arrow-rs/pull/4449) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Revise error message in From\ for ScalarBuffer [\#4446](https://github.com/apache/arrow-rs/pull/4446) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- minor: remove useless mut [\#4443](https://github.com/apache/arrow-rs/pull/4443) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- unify substring for binary&utf8 [\#4442](https://github.com/apache/arrow-rs/pull/4442) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- Casting fixedsizelist to list/largelist [\#4433](https://github.com/apache/arrow-rs/pull/4433) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211)) -- feat: support `NullBuilder` [\#4430](https://github.com/apache/arrow-rs/pull/4430) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([izveigor](https://github.com/izveigor)) -- Remove Float64 -\> Float32 cast in IPC Reader [\#4427](https://github.com/apache/arrow-rs/pull/4427) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ming08108](https://github.com/ming08108)) -- Parse intervals like `.5` the same as `0.5` [\#4425](https://github.com/apache/arrow-rs/pull/4425) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- feat: add strict mode to json reader [\#4421](https://github.com/apache/arrow-rs/pull/4421) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([blinkseb](https://github.com/blinkseb)) -- Add DictionaryArray::occupancy [\#4415](https://github.com/apache/arrow-rs/pull/4415) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix AsyncArrowWriter flush for large buffer sizes \(\#4526\) [\#4527](https://github.com/apache/arrow-rs/pull/4527) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Cleanup cast\_primitive\_to\_list [\#4511](https://github.com/apache/arrow-rs/pull/4511) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Bump actions/upload-pages-artifact from 1 to 2 [\#4508](https://github.com/apache/arrow-rs/pull/4508) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support Date - Date \(\#4383\) [\#4504](https://github.com/apache/arrow-rs/pull/4504) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Bump actions/labeler from 4.2.0 to 4.3.0 [\#4501](https://github.com/apache/arrow-rs/pull/4501) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update proc-macro2 requirement from 
=1.0.63 to =1.0.64 [\#4500](https://github.com/apache/arrow-rs/pull/4500) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add negate kernels \(\#4488\) [\#4494](https://github.com/apache/arrow-rs/pull/4494) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add Datum Arithmetic tests, Fix Interval Substraction \(\#4480\) [\#4493](https://github.com/apache/arrow-rs/pull/4493) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- support FixedSizeBinary types in eq\_dyn\_binary\_scalar/neq\_dyn\_binary\_scalar [\#4492](https://github.com/apache/arrow-rs/pull/4492) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([maxburke](https://github.com/maxburke)) +- Add default implementations to the FlightSqlService trait [\#4485](https://github.com/apache/arrow-rs/pull/4485) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([rossjones](https://github.com/rossjones)) +- add num-complex requirement [\#4482](https://github.com/apache/arrow-rs/pull/4482) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mwlon](https://github.com/mwlon)) +- fix incorrect buffer size limiting in parquet async writer [\#4478](https://github.com/apache/arrow-rs/pull/4478) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([richox](https://github.com/richox)) +- feat: support RecordBatchReader on boxed trait objects [\#4475](https://github.com/apache/arrow-rs/pull/4475) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Improve in-place primitive sorts by 13-67% [\#4473](https://github.com/apache/arrow-rs/pull/4473) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Add Scalar/Datum abstraction \(\#1047\) [\#4393](https://github.com/apache/arrow-rs/pull/4393) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) diff --git a/Cargo.toml b/Cargo.toml index 173bafc6e08a..ea21b97c7058 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ exclude = [ ] [workspace.package] -version = "43.0.0" +version = "44.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -76,18 +76,18 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "43.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "43.0.0", path = "./arrow-arith" } -arrow-array = { version = "43.0.0", path = "./arrow-array" } -arrow-buffer = { version = "43.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "43.0.0", path = "./arrow-cast" } -arrow-csv = { version = "43.0.0", path = "./arrow-csv" } -arrow-data = { version = "43.0.0", path = "./arrow-data" } -arrow-ipc = { version = "43.0.0", path = "./arrow-ipc" } -arrow-json = { version = "43.0.0", path = "./arrow-json" } -arrow-ord = { version = "43.0.0", path = "./arrow-ord" } -arrow-row = { version = "43.0.0", path = "./arrow-row" } -arrow-schema = { version = "43.0.0", path = "./arrow-schema" } -arrow-select = { version = "43.0.0", path = "./arrow-select" } -arrow-string = { version = "43.0.0", path = "./arrow-string" } -parquet = { version = "43.0.0", path = "./parquet", 
default-features = false } +arrow = { version = "44.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "44.0.0", path = "./arrow-arith" } +arrow-array = { version = "44.0.0", path = "./arrow-array" } +arrow-buffer = { version = "44.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "44.0.0", path = "./arrow-cast" } +arrow-csv = { version = "44.0.0", path = "./arrow-csv" } +arrow-data = { version = "44.0.0", path = "./arrow-data" } +arrow-ipc = { version = "44.0.0", path = "./arrow-ipc" } +arrow-json = { version = "44.0.0", path = "./arrow-json" } +arrow-ord = { version = "44.0.0", path = "./arrow-ord" } +arrow-row = { version = "44.0.0", path = "./arrow-row" } +arrow-schema = { version = "44.0.0", path = "./arrow-schema" } +arrow-select = { version = "44.0.0", path = "./arrow-select" } +arrow-string = { version = "44.0.0", path = "./arrow-string" } +parquet = { version = "44.0.0", path = "./parquet", default-features = false } diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 6b4b0a56c4bc..6a0fee19b1ef 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="42.0.0" -FUTURE_RELEASE="43.0.0" +SINCE_TAG="43.0.0" +FUTURE_RELEASE="44.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 13fdfa472b8fd1599064b060be04904b7966278b Mon Sep 17 00:00:00 2001 From: Martin Date: Sun, 16 Jul 2023 10:13:20 -0400 Subject: [PATCH 1072/1411] use new num version instead of explicit num-complex dependency (#4532) --- arrow-array/Cargo.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 1990abfd2e35..4236da6d656b 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -46,8 +46,7 @@ arrow-schema = { workspace = true } arrow-data = { workspace = true } chrono = { version = "0.4.24", default-features = false, features = ["clock"] } chrono-tz = { version = "0.8", optional = true } -num = { version = "0.4", default-features = false, features = ["std"] } -num-complex = "0.4.2" +num = { version = "0.4.1", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", default-features = false } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } From be8bd13aedaf4d5880192ddcb40742035d6701c5 Mon Sep 17 00:00:00 2001 From: Christian Beilschmidt Date: Tue, 18 Jul 2023 15:42:33 +0200 Subject: [PATCH 1073/1411] add a validity slice access for boolean array builders (#4536) --- arrow-array/src/builder/boolean_builder.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index 0def0ec48e3b..b4283775e6ae 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -169,6 +169,11 @@ impl BooleanBuilder { let array_data = unsafe { builder.build_unchecked() }; BooleanArray::from(array_data) } + + /// Returns the current null buffer as a slice + pub fn validity_slice(&self) -> Option<&[u8]> { + self.null_buffer_builder.as_slice() + } } impl ArrayBuilder for BooleanBuilder { From b71c0d952777f869cf40a3458413345b4ce93b23 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Jul 2023 09:42:51 -0400 
Subject: [PATCH 1074/1411] Update proc-macro2 requirement from =1.0.64 to =1.0.66 (#4537) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.64...1.0.66) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 8700d9524dc1..8f889c0a7cb9 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.64", default-features = false } +proc-macro2 = { version = "=1.0.66", default-features = false } prost-build = { version = "=0.11.9", default-features = false } tonic-build = { version = "=0.9.2", default-features = false, features = ["transport", "prost"] } From 276d8c5770a3734faf5ed32c2391783d53305de7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 18 Jul 2023 16:50:09 -0400 Subject: [PATCH 1075/1411] Add RowConverter::append (#4479) (#4541) * Add RowConverter::append (#4479) * Add overwrite test --- arrow-row/src/lib.rs | 147 +++++++++++++++++++++++++++++++------------ 1 file changed, 107 insertions(+), 40 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index e8c5ff708d55..31942cb7e8bb 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -680,6 +680,52 @@ impl RowConverter { /// /// Panics if the schema of `columns` does not match that provided to [`RowConverter::new`] pub fn convert_columns(&mut self, columns: &[ArrayRef]) -> Result { + let num_rows = columns.first().map(|x| x.len()).unwrap_or(0); + let mut rows = self.empty_rows(num_rows, 0); + self.append(&mut rows, columns)?; + Ok(rows) + } + + /// Convert [`ArrayRef`] columns appending to an existing [`Rows`] + /// + /// See [`Row`] for information on when [`Row`] can be compared + /// + /// # Panics + /// + /// Panics if + /// * The schema of `columns` does not match that provided to [`RowConverter::new`] + /// * The provided [`Rows`] were not created by this [`RowConverter`] + /// + /// ``` + /// # use std::sync::Arc; + /// # use std::collections::HashSet; + /// # use arrow_array::cast::AsArray; + /// # use arrow_array::StringArray; + /// # use arrow_row::{Row, RowConverter, SortField}; + /// # use arrow_schema::DataType; + /// # + /// let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap(); + /// let a1 = StringArray::from(vec!["hello", "world"]); + /// let a2 = StringArray::from(vec!["a", "a", "hello"]); + /// + /// let mut rows = converter.empty_rows(5, 128); + /// converter.append(&mut rows, &[Arc::new(a1)]).unwrap(); + /// converter.append(&mut rows, &[Arc::new(a2)]).unwrap(); + /// + /// let back = converter.convert_rows(&rows).unwrap(); + /// let values: Vec<_> = back[0].as_string::().iter().map(Option::unwrap).collect(); + /// assert_eq!(&values, &["hello", "world", "a", "a", "hello"]); + /// ``` + pub fn append( + &mut self, + rows: &mut Rows, + columns: &[ArrayRef], + ) -> 
Result<(), ArrowError> { + assert!( + Arc::ptr_eq(&rows.config.fields, &self.fields), + "rows were not produced by this RowConverter" + ); + if columns.len() != self.fields.len() { return Err(ArrowError::InvalidArgumentError(format!( "Incorrect number of arrays provided to RowConverter, expected {} got {}", @@ -704,12 +750,35 @@ impl RowConverter { }) .collect::, _>>()?; - let config = RowConfig { - fields: Arc::clone(&self.fields), - // Don't need to validate UTF-8 as came from arrow array - validate_utf8: false, - }; - let mut rows = new_empty_rows(columns, &encoders, config); + let write_offset = rows.num_rows(); + let lengths = row_lengths(columns, &encoders); + + // We initialize the offsets shifted down by one row index. + // + // As the rows are appended to the offsets will be incremented to match + // + // For example, consider the case of 3 rows of length 3, 4, and 6 respectively. + // The offsets would be initialized to `0, 0, 3, 7` + // + // Writing the first row entirely would yield `0, 3, 3, 7` + // The second, `0, 3, 7, 7` + // The third, `0, 3, 7, 13` + // + // This would be the final offsets for reading + // + // In this way offsets tracks the position during writing whilst eventually serving + // as identifying the offsets of the written rows + rows.offsets.reserve(lengths.len()); + let mut cur_offset = rows.offsets[write_offset]; + for l in lengths { + rows.offsets.push(cur_offset); + cur_offset = cur_offset.checked_add(l).expect("overflow"); + } + + // Note this will not zero out any trailing data in `rows.buffer`, + // e.g. resulting from a call to `Rows::clear`, relying instead on the + // encoders not assuming a zero-initialized buffer + rows.buffer.resize(cur_offset, 0); for ((column, field), encoder) in columns.iter().zip(self.fields.iter()).zip(encoders) @@ -717,7 +786,7 @@ impl RowConverter { // We encode a column at a time to minimise dispatch overheads encode_column( &mut rows.buffer, - &mut rows.offsets, + &mut rows.offsets[write_offset..], column.as_ref(), field.options, &encoder, @@ -731,7 +800,7 @@ impl RowConverter { .for_each(|w| assert!(w[0] <= w[1], "offsets should be monotonic")); } - Ok(rows) + Ok(()) } /// Convert [`Rows`] columns into [`ArrayRef`] @@ -899,6 +968,7 @@ impl Rows { self.offsets.push(self.buffer.len()) } + /// Returns the row at index `row` pub fn row(&self, row: usize) -> Row<'_> { let end = self.offsets[row + 1]; let start = self.offsets[row]; @@ -908,10 +978,17 @@ impl Rows { } } + /// Sets the length of this [`Rows`] to 0 + pub fn clear(&mut self) { + self.offsets.truncate(1); + } + + /// Returns the number of [`Row`] in this [`Rows`] pub fn num_rows(&self) -> usize { self.offsets.len() - 1 } + /// Returns an iterator over the [`Row`] in this [`Rows`] pub fn iter(&self) -> RowsIter<'_> { self.into_iter() } @@ -1116,7 +1193,7 @@ fn null_sentinel(options: SortOptions) -> u8 { } /// Computes the length of each encoded [`Rows`] and returns an empty [`Rows`] -fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig) -> Rows { +fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> Vec { use fixed::FixedLengthEncoding; let num_rows = cols.first().map(|x| x.len()).unwrap_or(0); @@ -1203,37 +1280,7 @@ fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig) -> } } - let mut offsets = Vec::with_capacity(num_rows + 1); - offsets.push(0); - - // We initialize the offsets shifted down by one row index. 
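As a concrete reference for the offset bookkeeping described in the comment above, the following is a minimal, self-contained sketch (plain Rust, not the library code) of the shifted-offsets scheme: the offsets vector starts one slot behind, and writing each row advances its slot to that row's end, so the same vector acts first as a write cursor and then as the final read offsets.

// Sketch only: simulates the offset updates that `encode_column` performs.
fn main() {
    let lengths = [3usize, 4, 6];

    // Initialize shifted down by one row index: [0, 0, 3, 7]
    let mut offsets = vec![0usize];
    let mut cur = 0;
    for l in &lengths {
        offsets.push(cur);
        cur += l;
    }
    assert_eq!(offsets, [0, 0, 3, 7]);

    // "Writing" row i advances offsets[i + 1] to the end of row i
    for (row, l) in lengths.iter().enumerate() {
        offsets[row + 1] += l;
    }
    assert_eq!(offsets, [0, 3, 7, 13]); // final offsets, ready for reading
}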
- // - // As the rows are appended to the offsets will be incremented to match - // - // For example, consider the case of 3 rows of length 3, 4, and 6 respectively. - // The offsets would be initialized to `0, 0, 3, 7` - // - // Writing the first row entirely would yield `0, 3, 3, 7` - // The second, `0, 3, 7, 7` - // The third, `0, 3, 7, 13` - // - // This would be the final offsets for reading - // - // In this way offsets tracks the position during writing whilst eventually serving - // as identifying the offsets of the written rows - let mut cur_offset = 0_usize; - for l in lengths { - offsets.push(cur_offset); - cur_offset = cur_offset.checked_add(l).expect("overflow"); - } - - let buffer = vec![0_u8; cur_offset]; - - Rows { - buffer, - offsets, - config, - } + lengths } /// Encodes a column to the provided [`Rows`] incrementing the offsets as it progresses @@ -2375,4 +2422,24 @@ mod tests { } } } + + #[test] + fn test_clear() { + let mut converter = + RowConverter::new(vec![SortField::new(DataType::Int32)]).unwrap(); + let mut rows = converter.empty_rows(3, 128); + + let arrays = [ + Int32Array::from(vec![None, Some(2), Some(4)]), + Int32Array::from(vec![Some(2), None, Some(4)]), + ]; + + for array in arrays { + rows.clear(); + let array = Arc::new(array) as ArrayRef; + converter.append(&mut rows, &[array.clone()]).unwrap(); + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(&back[0], &array); + } + } } From 47f288c166458f42d9f488bd48b91e077bc6459f Mon Sep 17 00:00:00 2001 From: Leslie Zhai Date: Wed, 19 Jul 2023 04:50:23 +0800 Subject: [PATCH 1076/1411] Initial loongarch port (#4538) --- arrow-buffer/src/alloc/alignment.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arrow-buffer/src/alloc/alignment.rs b/arrow-buffer/src/alloc/alignment.rs index 7978baa2bbd8..b3979e1d6a06 100644 --- a/arrow-buffer/src/alloc/alignment.rs +++ b/arrow-buffer/src/alloc/alignment.rs @@ -117,3 +117,7 @@ pub const ALIGNMENT: usize = 1 << 7; /// Cache and allocation multiple alignment size #[cfg(target_arch = "aarch64")] pub const ALIGNMENT: usize = 1 << 6; + +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "loongarch64")] +pub const ALIGNMENT: usize = 1 << 6; From 730941f09a88abfdce6e8c3f24cb57e5a97859ad Mon Sep 17 00:00:00 2001 From: Alex Huang Date: Wed, 19 Jul 2023 22:34:16 +0800 Subject: [PATCH 1077/1411] feat: Support `FixedSizedListArray` for `length` kernel (#4520) * feat: Support FixedSizedListArray for length kernel * fix clippy * update comment * avoid unsafe * reduce useless trait * remove T --- arrow-string/src/length.rs | 45 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index 90efdd7b67cc..25d6414ec8e6 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -17,8 +17,8 @@ //! 
Defines kernel for length of string arrays and binary arrays -use arrow_array::types::*; use arrow_array::*; +use arrow_array::{cast::AsArray, types::*}; use arrow_buffer::Buffer; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; @@ -88,6 +88,14 @@ where unary_offsets!(array, T::DATA_TYPE, |x| x) } +fn length_list_fixed_size(array: &dyn Array, length: i32) -> ArrayRef { + let array = array.as_fixed_size_list(); + let length_list = array.len(); + let buffer = Buffer::from_vec(vec![length; length_list]); + let data = Int32Array::new(buffer.into(), array.nulls().cloned()); + Arc::new(data) +} + fn length_binary(array: &dyn Array) -> ArrayRef where O: OffsetSizeTrait, @@ -146,7 +154,7 @@ where /// For list array, length is the number of elements in each list. /// For string array and binary array, length is the number of bytes of each value. /// -/// * this only accepts ListArray/LargeListArray, StringArray/LargeStringArray and BinaryArray/LargeBinaryArray, +/// * this only accepts ListArray/LargeListArray, StringArray/LargeStringArray, BinaryArray/LargeBinaryArray, and FixedSizeListArray, /// or DictionaryArray with above Arrays as values /// * length of null is null. pub fn length(array: &dyn Array) -> Result { @@ -172,6 +180,7 @@ pub fn length(array: &dyn Array) -> Result { DataType::LargeUtf8 => Ok(length_string::(array)), DataType::Binary => Ok(length_binary::(array)), DataType::LargeBinary => Ok(length_binary::(array)), + DataType::FixedSizeList(_, len) => Ok(length_list_fixed_size(array, *len)), other => Err(ArrowError::ComputeError(format!( "length not supported for {other:?}" ))), @@ -215,6 +224,8 @@ pub fn bit_length(array: &dyn Array) -> Result { mod tests { use super::*; use arrow_array::cast::AsArray; + use arrow_buffer::NullBuffer; + use arrow_schema::Field; fn double_vec(v: Vec) -> Vec { [&v[..], &v[..]].concat() @@ -696,4 +707,34 @@ mod tests { assert_eq!(expected[i], actual[i],); } } + + #[test] + fn test_fixed_size_list_length() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(9) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8])) + .build() + .unwrap(); + let list_data_type = DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Int32, false)), + 3, + ); + let nulls = NullBuffer::from(vec![true, false, true]); + let list_data = ArrayData::builder(list_data_type) + .len(3) + .add_child_data(value_data) + .nulls(Some(nulls)) + .build() + .unwrap(); + let list_array = FixedSizeListArray::from(list_data); + + let lengths = length(&list_array).unwrap(); + let lengths = lengths.as_any().downcast_ref::().unwrap(); + + assert_eq!(lengths.len(), 3); + assert_eq!(lengths.value(0), 3); + assert!(lengths.is_null(1)); + assert_eq!(lengths.value(2), 3); + } } From 72cafde586af831d911473c6d1bbd56d2482cfdb Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Wed, 19 Jul 2023 18:11:48 +0200 Subject: [PATCH 1078/1411] Provide default `is_empty` impl for `arrow::array::ArrayBuilder` (#4543) --- arrow-array/src/builder/boolean_builder.rs | 5 ----- arrow-array/src/builder/fixed_size_binary_builder.rs | 5 ----- arrow-array/src/builder/fixed_size_list_builder.rs | 5 ----- arrow-array/src/builder/generic_byte_run_builder.rs | 5 ----- arrow-array/src/builder/generic_bytes_builder.rs | 5 ----- arrow-array/src/builder/generic_bytes_dictionary_builder.rs | 5 ----- arrow-array/src/builder/generic_list_builder.rs | 5 ----- arrow-array/src/builder/map_builder.rs | 4 ---- arrow-array/src/builder/mod.rs | 4 +++- 
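The test above drives the new FixedSizeList support through ArrayData; as a usage-level illustration, here is a short sketch assuming the public `arrow_string::length::length` entry point (re-exported through the `arrow` facade) and the `FixedSizeListArray::from_iter_primitive` constructor: the kernel now reports the declared width for every slot and keeps null slots null.

use arrow_array::types::Int32Type;
use arrow_array::{Array, FixedSizeListArray, Int32Array};
use arrow_string::length::length;

fn main() {
    // Two lists of width 3; the second entry is null
    let list = FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(
        vec![Some(vec![Some(1), Some(2), Some(3)]), None],
        3,
    );

    let lens = length(&list).unwrap();
    let lens = lens.as_any().downcast_ref::<Int32Array>().unwrap();

    assert_eq!(lens.value(0), 3); // the fixed width, not a byte count
    assert!(lens.is_null(1)); // nulls are propagated
}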
arrow-array/src/builder/null_builder.rs | 5 ----- arrow-array/src/builder/primitive_builder.rs | 5 ----- arrow-array/src/builder/primitive_dictionary_builder.rs | 5 ----- arrow-array/src/builder/primitive_run_builder.rs | 5 ----- arrow-array/src/builder/struct_builder.rs | 5 ----- 14 files changed, 3 insertions(+), 65 deletions(-) diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index b4283775e6ae..5f0013269677 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -197,11 +197,6 @@ impl ArrayBuilder for BooleanBuilder { self.values_builder.len() } - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.values_builder.is_empty() - } - /// Builds the array and reset this builder. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) diff --git a/arrow-array/src/builder/fixed_size_binary_builder.rs b/arrow-array/src/builder/fixed_size_binary_builder.rs index a213b3bbf87d..180150e988f3 100644 --- a/arrow-array/src/builder/fixed_size_binary_builder.rs +++ b/arrow-array/src/builder/fixed_size_binary_builder.rs @@ -139,11 +139,6 @@ impl ArrayBuilder for FixedSizeBinaryBuilder { self.null_buffer_builder.len() } - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.null_buffer_builder.is_empty() - } - /// Builds the array and reset this builder. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index 0dd58044305e..f7e8999099ae 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -113,11 +113,6 @@ where self.null_buffer_builder.len() } - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.null_buffer_builder.is_empty() - } - /// Builds the array and reset this builder. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index 4e3f36889a1b..41165208de55 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -150,11 +150,6 @@ where self.current_run_end_index } - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.current_run_end_index == 0 - } - /// Builds the array and reset this builder. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index f77940055bf1..d84be8c2fca6 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -189,11 +189,6 @@ impl ArrayBuilder for GenericByteBuilder { self.null_buffer_builder.len() } - /// Returns whether the number of binary slots is zero - fn is_empty(&self) -> bool { - self.null_buffer_builder.is_empty() - } - /// Builds the array and reset this builder. 
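For context on the refactor above, a minimal sketch of the provided-method pattern (trait and type names here are purely illustrative): once the trait ships a default `is_empty` defined in terms of `len`, each concrete builder's hand-written copy becomes redundant, which is why this patch simply deletes them.

trait Builder {
    fn len(&self) -> usize;

    /// Provided default: implementors only need to supply `len`
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

struct Dummy(Vec<i32>);

impl Builder for Dummy {
    fn len(&self) -> usize {
        self.0.len()
    }
}

fn main() {
    assert!(Dummy(vec![]).is_empty());
    assert!(!Dummy(vec![1]).is_empty());
}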
fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index d5c62865ff8d..282f423fa6d1 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -193,11 +193,6 @@ where self.keys_builder.len() } - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.keys_builder.is_empty() - } - /// Builds the array and reset this builder. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 99e15d10f3a5..b31814615fc9 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -91,11 +91,6 @@ where self.null_buffer_builder.len() } - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.null_buffer_builder.is_empty() - } - /// Builds the array and reset this builder. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 56b5619ceab1..4e3ec4a7944d 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -214,10 +214,6 @@ impl ArrayBuilder for MapBuilder { self.null_buffer_builder.len() } - fn is_empty(&self) -> bool { - self.len() == 0 - } - fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) } diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 1e5e6426be09..38a7500dd55f 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -237,7 +237,9 @@ pub trait ArrayBuilder: Any + Send { fn len(&self) -> usize; /// Returns whether number of array slots is zero - fn is_empty(&self) -> bool; + fn is_empty(&self) -> bool { + self.len() == 0 + } /// Builds the array fn finish(&mut self) -> ArrayRef; diff --git a/arrow-array/src/builder/null_builder.rs b/arrow-array/src/builder/null_builder.rs index 0b4345006993..94cb7f5cc281 100644 --- a/arrow-array/src/builder/null_builder.rs +++ b/arrow-array/src/builder/null_builder.rs @@ -133,11 +133,6 @@ impl ArrayBuilder for NullBuilder { self.len } - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.len() == 0 - } - /// Builds the array and reset this builder. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 3e31b1d05576..b23d6bba36c4 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -121,11 +121,6 @@ impl ArrayBuilder for PrimitiveBuilder { self.values_builder.len() } - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.values_builder.is_empty() - } - /// Builds the array and reset this builder. 
fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index cde1abe22b7b..7323ee57627d 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -194,11 +194,6 @@ where self.keys_builder.len() } - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.keys_builder.is_empty() - } - /// Builds the array and reset this builder. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) diff --git a/arrow-array/src/builder/primitive_run_builder.rs b/arrow-array/src/builder/primitive_run_builder.rs index 53674a73b172..01a989199b58 100644 --- a/arrow-array/src/builder/primitive_run_builder.rs +++ b/arrow-array/src/builder/primitive_run_builder.rs @@ -136,11 +136,6 @@ where self.current_run_end_index } - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.current_run_end_index == 0 - } - /// Builds the array and reset this builder. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 88a23db6d10e..0c878e621056 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -52,11 +52,6 @@ impl ArrayBuilder for StructBuilder { self.null_buffer_builder.len() } - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.len() == 0 - } - /// Builds the array. fn finish(&mut self) -> ArrayRef { Arc::new(self.finish()) From 4e8e1b38901a0e8371054752d31129b3aa62ed5b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 21 Jul 2023 13:37:11 -0400 Subject: [PATCH 1079/1411] Minor: Make arrow blog link about row format more discoverable (#4551) I couldn't find the link while talking with @JayjeetAtGithub today, so I propose making it easier to find --- arrow-row/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 31942cb7e8bb..83ed812df551 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -21,7 +21,7 @@ //! using [`memcmp`] under the hood, or used in [non-comparison sorts] such as [radix sort]. //! This makes the row format ideal for implementing efficient multi-column sorting, //! grouping, aggregation, windowing and more, as described in more detail -//! [here](https://arrow.apache.org/blog/2022/11/07/multi-column-sorts-in-arrow-rust-part-1/). +//! [in this blog post](https://arrow.apache.org/blog/2022/11/07/multi-column-sorts-in-arrow-rust-part-1/). //! //! For example, given three input [`Array`], [`RowConverter`] creates byte //! sequences that [compare] the same as when using [`lexsort`]. From 6e4d7008db49e51efb3f75e1cac986ad114fe6a2 Mon Sep 17 00:00:00 2001 From: SteveLauC Date: Sat, 22 Jul 2023 01:38:21 +0800 Subject: [PATCH 1080/1411] docs: fix wrong inline code snippet in parquet document (#4550) --- parquet/src/file/metadata.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index bb8346306cf9..4cb2e9ab2a6a 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -230,7 +230,9 @@ impl FileMetaData { self.key_value_metadata.as_ref() } - /// Returns Parquet ['Type`] that describes schema in this file. 
+ /// Returns Parquet [`Type`] that describes schema in this file. + /// + /// [`Type`]: crate::schema::types::Type pub fn schema(&self) -> &SchemaType { self.schema_descr.root_schema() } From 6ee30a57e9935ddd3fb7828062e3dfbfacf574a4 Mon Sep 17 00:00:00 2001 From: Remco Verhoef Date: Fri, 21 Jul 2023 20:51:31 +0200 Subject: [PATCH 1081/1411] fix multiline likes (#4548) we will ignore new lines in case of more complex likes, which are translated to regexes --- arrow-string/src/like.rs | 136 ++++++++++++++++++++++++++------------- 1 file changed, 90 insertions(+), 46 deletions(-) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 6b4aea7e8e64..1223280e3769 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -266,7 +266,7 @@ fn like<'a, S: ArrayAccessor>( right: S, ) -> Result { regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("^{re_pattern}$")).map_err(|e| { + Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from LIKE pattern: {e}" )) @@ -312,7 +312,7 @@ fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor>( })) } else { let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("^{re_pattern}$")).map_err(|e| { + let re = Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from LIKE pattern: {e}" )) @@ -395,7 +395,7 @@ fn nlike<'a, S: ArrayAccessor>( right: S, ) -> Result { regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("^{re_pattern}$")).map_err(|e| { + Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from LIKE pattern: {e}" )) @@ -442,7 +442,7 @@ fn ilike<'a, S: ArrayAccessor>( right: S, ) -> Result { regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?i)^{re_pattern}$")).map_err(|e| { + Regex::new(&format!("(?is)^{re_pattern}$")).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from ILIKE pattern: {e}" )) @@ -487,7 +487,7 @@ fn ilike_scalar_op bool>( } let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?i)^{re_pattern}$")).map_err(|e| { + let re = Regex::new(&format!("(?is)^{re_pattern}$")).map_err(|e| { ArrowError::ComputeError(format!("Unable to build regex from ILIKE pattern: {e}")) })?; @@ -530,7 +530,7 @@ fn nilike<'a, S: ArrayAccessor>( right: S, ) -> Result { regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?i)^{re_pattern}$")).map_err(|e| { + Regex::new(&format!("(?is)^{re_pattern}$")).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from ILIKE pattern: {e}" )) @@ -1368,6 +1368,7 @@ mod tests { Some("Air"), None, Some("Air"), + Some("bbbbb\nAir"), ]; let dict_array: DictionaryArray = data.into_iter().collect(); @@ -1380,7 +1381,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(false), ]), ); @@ -1392,7 +1394,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(false), ]), ); @@ -1404,7 +1407,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1416,7 +1420,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1428,7 +1433,8 @@ mod tests { Some(true), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); @@ -1440,7 +1446,8 @@ mod tests { Some(true), Some(true), None, - Some(true) + Some(true), + Some(true), 
]), ); @@ -1452,7 +1459,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); @@ -1464,7 +1472,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); @@ -1476,7 +1485,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1488,7 +1498,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); } @@ -1502,6 +1513,7 @@ mod tests { Some("Air"), None, Some("Air"), + Some("bbbbb\nAir"), ]; let dict_array: DictionaryArray = data.into_iter().collect(); @@ -1514,7 +1526,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(true), ]), ); @@ -1526,7 +1539,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(true), ]), ); @@ -1538,7 +1552,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); @@ -1550,7 +1565,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); @@ -1562,7 +1578,8 @@ mod tests { Some(false), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1574,7 +1591,8 @@ mod tests { Some(false), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1586,7 +1604,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1598,7 +1617,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1610,7 +1630,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); @@ -1622,7 +1643,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); } @@ -1636,6 +1658,7 @@ mod tests { Some("Air"), None, Some("Air"), + Some("bbbbb\nAir"), ]; let dict_array: DictionaryArray = data.into_iter().collect(); @@ -1648,7 +1671,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(false), ]), ); @@ -1660,7 +1684,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(false), ]), ); @@ -1672,7 +1697,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1684,7 +1710,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1696,7 +1723,8 @@ mod tests { Some(true), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); @@ -1708,7 +1736,8 @@ mod tests { Some(true), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); @@ -1720,7 +1749,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); @@ -1732,7 +1762,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); @@ -1744,7 +1775,8 @@ mod tests { Some(true), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); @@ -1756,7 +1788,8 @@ mod tests { Some(true), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); } @@ -1770,6 +1803,7 @@ mod tests { Some("Air"), None, Some("Air"), + Some("bbbbb\nAir"), ]; let dict_array: DictionaryArray = data.into_iter().collect(); @@ -1782,7 +1816,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(true), ]), ); @@ -1794,7 +1829,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(true), ]), ); @@ -1806,7 +1842,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); 
@@ -1818,7 +1855,8 @@ mod tests { Some(false), Some(true), None, - Some(true) + Some(true), + Some(true), ]), ); @@ -1830,7 +1868,8 @@ mod tests { Some(false), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1842,7 +1881,8 @@ mod tests { Some(false), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1854,7 +1894,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1866,7 +1907,8 @@ mod tests { Some(true), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1878,7 +1920,8 @@ mod tests { Some(false), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); @@ -1890,7 +1933,8 @@ mod tests { Some(false), Some(false), None, - Some(false) + Some(false), + Some(false), ]), ); } From 48cd0cfaae66dc8316b89179d5d1ec7ddca91cfb Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 22 Jul 2023 00:03:44 -0400 Subject: [PATCH 1082/1411] Fix FixedSizeListBuilder capacity (#4549) (#4552) --- arrow-array/src/builder/fixed_size_list_builder.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index f7e8999099ae..0fe779d5c1a2 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -73,7 +73,11 @@ impl FixedSizeListBuilder { /// Creates a new [`FixedSizeListBuilder`] from a given values array builder /// `value_length` is the number of values within each array pub fn new(values_builder: T, value_length: i32) -> Self { - let capacity = values_builder.len(); + let capacity = values_builder + .len() + .checked_div(value_length as _) + .unwrap_or_default(); + Self::with_capacity(values_builder, value_length, capacity) } From 92e1e9ad80224287c332561101a548a48fd7053b Mon Sep 17 00:00:00 2001 From: SteveLauC Date: Sat, 22 Jul 2023 23:58:43 +0800 Subject: [PATCH 1083/1411] docs: fix wrong parameter name (#4559) --- parquet/src/column/reader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 88967e179271..13af8233d422 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -212,7 +212,7 @@ where Ok((values, levels)) } - /// Read up to `num_records` returning the number of complete records, non-null + /// Read up to `max_records` returning the number of complete records, non-null /// values and levels decoded /// /// If the max definition level is 0, `def_levels` will be ignored, otherwise it will be From fc21cfbb9c4b1238570430a848fa4cd71adb84cb Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 22 Jul 2023 12:19:24 -0400 Subject: [PATCH 1084/1411] Clarify GenericColumnReader::read_records (#4540) * Clarify GenericColumnReader::read_records * Review feedback --- parquet/src/column/reader.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 13af8233d422..3ce00622e953 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -212,14 +212,17 @@ where Ok((values, levels)) } - /// Read up to `max_records` returning the number of complete records, non-null - /// values and levels decoded + /// Read up to `max_records` whole records, returning the number of complete + /// 
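To make the motivation for the `(?s)` and `(?is)` flags concrete, here is a self-contained sketch (the pattern and value are illustrative, mirroring the `bbbbb\nAir` test data added above): without the flag, the `.` produced when translating a LIKE wildcard refuses to cross a newline, so multiline values failed to match.

use regex::Regex;

fn main() {
    let value = "bbbbb\nAir";

    // LIKE '%Air' is translated to the regex `.*Air`
    let without_flag = Regex::new("^.*Air$").unwrap();
    let with_flag = Regex::new("(?s)^.*Air$").unwrap();

    assert!(!without_flag.is_match(value)); // `.` stops at the newline
    assert!(with_flag.is_match(value)); // `(?s)` lets `.` span lines
}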
records, non-null values and levels decoded. All levels for a given record + /// will be read, i.e. the next repetition level, if any, will be 0 /// - /// If the max definition level is 0, `def_levels` will be ignored, otherwise it will be + /// If the max definition level is 0, `def_levels` will be ignored and the number of records, + /// non-null values and levels decoded will all be equal, otherwise `def_levels` will be /// populated with the number of levels read, with an error returned if it is `None`. /// - /// If the max repetition level is 0, `rep_levels` will be ignored, otherwise it will be - /// populated with the number of levels read, with an error returned if it is `None`. + /// If the max repetition level is 0, `rep_levels` will be ignored and the number of records + /// and levels decoded will both be equal, otherwise `rep_levels` will be populated with + /// the number of levels read, with an error returned if it is `None`. /// /// `values` will be contiguously populated with the non-null values. Note that if the column /// is not required, this may be less than either `max_records` or the number of levels read From d9719d1f2d21ed0c75ffd5c8e8c4fb15ee5ebe54 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 22 Jul 2023 12:30:16 -0400 Subject: [PATCH 1085/1411] Fix field docs (#4563) --- arrow-schema/src/field.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index f38e1e26ad26..00deecf06283 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -170,7 +170,7 @@ impl Field { /// Create a new [`Field`] with [`DataType::Struct`] /// - /// - `name`: the name of the [`DataType::List`] field + /// - `name`: the name of the [`DataType::Struct`] field /// - `fields`: the description of each struct element /// - `nullable`: if the [`DataType::Struct`] array is nullable pub fn new_struct( @@ -186,8 +186,6 @@ impl Field { /// - `name`: the name of the [`DataType::List`] field /// - `value`: the description of each list element /// - `nullable`: if the [`DataType::List`] array is nullable - /// - /// Uses "item" as the name of the child field, this can be overridden with [`Self::new`] pub fn new_list( name: impl Into, value: impl Into, From 918959b4ac08d190f8a87fdd18d74ae37c3c22bf Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 23 Jul 2023 19:10:59 -0400 Subject: [PATCH 1086/1411] Remove Sync from arrow-flight example (#4564) * Remove Sync from arrow-flight example * Update arrow-flight/examples/server.rs Co-authored-by: Liang-Chi Hsieh * Update arrow-flight/examples/server.rs Co-authored-by: Liang-Chi Hsieh --------- Co-authored-by: Liang-Chi Hsieh --- arrow-flight/examples/server.rs | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/arrow-flight/examples/server.rs b/arrow-flight/examples/server.rs index 1d473103af8e..1ed21acef9b8 100644 --- a/arrow-flight/examples/server.rs +++ b/arrow-flight/examples/server.rs @@ -15,9 +15,7 @@ // specific language governing permissions and limitations // under the License. 
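As background for the change below: `futures::stream::BoxStream<'a, T>` is an alias for `Pin<Box<dyn Stream<Item = T> + Send + 'a>>`, so switching the associated stream types to it keeps the `Send` bound while dropping the unneeded `Sync`. A tiny sketch (the function name is illustrative) of producing one:

use futures::stream::{self, BoxStream, StreamExt};

fn numbers() -> BoxStream<'static, i32> {
    // `StreamExt::boxed` pins and boxes any `Send + 'static` stream
    stream::iter(vec![1, 2, 3]).boxed()
}

fn main() {
    let collected = futures::executor::block_on(numbers().collect::<Vec<_>>());
    assert_eq!(collected, vec![1, 2, 3]);
}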
-use std::pin::Pin; - -use futures::Stream; +use futures::stream::BoxStream; use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; @@ -32,27 +30,13 @@ pub struct FlightServiceImpl {} #[tonic::async_trait] impl FlightService for FlightServiceImpl { - type HandshakeStream = Pin< - Box> + Send + Sync + 'static>, - >; - type ListFlightsStream = - Pin> + Send + Sync + 'static>>; - type DoGetStream = - Pin> + Send + Sync + 'static>>; - type DoPutStream = - Pin> + Send + Sync + 'static>>; - type DoActionStream = Pin< - Box< - dyn Stream> - + Send - + Sync - + 'static, - >, - >; - type ListActionsStream = - Pin> + Send + Sync + 'static>>; - type DoExchangeStream = - Pin> + Send + Sync + 'static>>; + type HandshakeStream = BoxStream<'static, Result>; + type ListFlightsStream = BoxStream<'static, Result>; + type DoGetStream = BoxStream<'static, Result>; + type DoPutStream = BoxStream<'static, Result>; + type DoActionStream = BoxStream<'static, Result>; + type ListActionsStream = BoxStream<'static, Result>; + type DoExchangeStream = BoxStream<'static, Result>; async fn handshake( &self, From dfb642809e93c2c1b8343692f4e4b3080000f988 Mon Sep 17 00:00:00 2001 From: Miklos Szots Date: Tue, 25 Jul 2023 14:36:54 +0200 Subject: [PATCH 1087/1411] support NullArray un arith/boolean kernel (#4566) * support NullArray un arith/boolean kernel * prettify based on feedback --- arrow-arith/src/boolean.rs | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index 04c9fb229034..61e591d51634 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -25,7 +25,7 @@ use arrow_array::*; use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper}; use arrow_buffer::{BooleanBuffer, NullBuffer}; -use arrow_schema::ArrowError; +use arrow_schema::{ArrowError, DataType}; /// Logical 'and' boolean values with Kleene logic /// @@ -312,6 +312,10 @@ pub fn not(left: &BooleanArray) -> Result { /// ``` pub fn is_null(input: &dyn Array) -> Result { let values = match input.nulls() { + // NullArray has no nulls buffer yet all values are null + None if input.data_type() == &DataType::Null => { + BooleanBuffer::new_set(input.len()) + } None => BooleanBuffer::new_unset(input.len()), Some(nulls) => !nulls.inner(), }; @@ -332,6 +336,10 @@ pub fn is_null(input: &dyn Array) -> Result { /// ``` pub fn is_not_null(input: &dyn Array) -> Result { let values = match input.nulls() { + // NullArray has no nulls buffer yet all values are null + None if input.data_type() == &DataType::Null => { + BooleanBuffer::new_unset(input.len()) + } None => BooleanBuffer::new_set(input.len()), Some(n) => n.inner().clone(), }; @@ -871,4 +879,28 @@ mod tests { assert_eq!(expected, res); assert!(res.nulls().is_none()); } + + #[test] + fn test_null_array_is_null() { + let a = NullArray::new(3); + + let res = is_null(&a).unwrap(); + + let expected = BooleanArray::from(vec![true, true, true]); + + assert_eq!(expected, res); + assert!(res.nulls().is_none()); + } + + #[test] + fn test_null_array_is_not_null() { + let a = NullArray::new(3); + + let res = is_not_null(&a).unwrap(); + + let expected = BooleanArray::from(vec![false, false, false]); + + assert_eq!(expected, res); + assert!(res.nulls().is_none()); + } } From 0b75e8fbb1f20fb14c4aefae953f535e5be9bdbd Mon Sep 17 00:00:00 2001 From: Alexandre Crayssac Date: Wed, 26 Jul 2023 17:37:29 +0200 Subject: [PATCH 1088/1411] Fix timezoned timestamp arithmetic 
(#4546) * Fix arithmetic for timezone-aware timestamp arrays * Remove debug test from issue * Update to pass timezone by value instead of reference because it's smaller * Use as_datetime_with_timezone instead of manual conversion * Add support for string tz, fix bug and refactor the whole thing * Use checked arithmetic for months and days and use Self instead of type name * Refactor tests * Add DST test case * Add new tz parameter to docstring * Add checked negations and refactor substraction * Add tests for interval overflow * Remove checked negation and use absolute unsigned instead * Use Option instead of Result for overflow errors * Move arithmetic functions to TimestampOp trait * Revert "Move arithmetic functions to TimestampOp trait" This reverts commit 8ec4be477e71531c6238ccf48a695a6b7e6a8798. --- arrow-arith/Cargo.toml | 1 + arrow-arith/src/numeric.rs | 250 +++++++++++++-- arrow-array/src/delta.rs | 72 ++++- arrow-array/src/types.rs | 626 +++++++++++++------------------------ 4 files changed, 508 insertions(+), 441 deletions(-) diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index b5ea2e3c4354..6da472be6601 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -46,3 +46,4 @@ num = { version = "0.4", default-features = false, features = ["std"] } [features] simd = ["arrow-array/simd"] +chrono-tz = ["arrow-array/chrono-tz"] diff --git a/arrow-arith/src/numeric.rs b/arrow-arith/src/numeric.rs index b0bbb75c129b..7862fe2f9bea 100644 --- a/arrow-arith/src/numeric.rs +++ b/arrow-arith/src/numeric.rs @@ -22,6 +22,7 @@ use std::fmt::Formatter; use std::sync::Arc; use arrow_array::cast::AsArray; +use arrow_array::timezone::Tz; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::ArrowNativeType; @@ -345,13 +346,13 @@ fn float_op( trait TimestampOp: ArrowTimestampType { type Duration: ArrowPrimitiveType; - fn add_year_month(timestamp: i64, delta: i32) -> Result; - fn add_day_time(timestamp: i64, delta: i64) -> Result; - fn add_month_day_nano(timestamp: i64, delta: i128) -> Result; + fn add_year_month(timestamp: i64, delta: i32, tz: Tz) -> Option; + fn add_day_time(timestamp: i64, delta: i64, tz: Tz) -> Option; + fn add_month_day_nano(timestamp: i64, delta: i128, tz: Tz) -> Option; - fn sub_year_month(timestamp: i64, delta: i32) -> Result; - fn sub_day_time(timestamp: i64, delta: i64) -> Result; - fn sub_month_day_nano(timestamp: i64, delta: i128) -> Result; + fn sub_year_month(timestamp: i64, delta: i32, tz: Tz) -> Option; + fn sub_day_time(timestamp: i64, delta: i64, tz: Tz) -> Option; + fn sub_month_day_nano(timestamp: i64, delta: i128, tz: Tz) -> Option; } macro_rules! timestamp { @@ -359,28 +360,28 @@ macro_rules! 
timestamp { impl TimestampOp for $t { type Duration = $d; - fn add_year_month(left: i64, right: i32) -> Result { - Self::add_year_months(left, right) + fn add_year_month(left: i64, right: i32, tz: Tz) -> Option { + Self::add_year_months(left, right, tz) } - fn add_day_time(left: i64, right: i64) -> Result { - Self::add_day_time(left, right) + fn add_day_time(left: i64, right: i64, tz: Tz) -> Option { + Self::add_day_time(left, right, tz) } - fn add_month_day_nano(left: i64, right: i128) -> Result { - Self::add_month_day_nano(left, right) + fn add_month_day_nano(left: i64, right: i128, tz: Tz) -> Option { + Self::add_month_day_nano(left, right, tz) } - fn sub_year_month(left: i64, right: i32) -> Result { - Self::subtract_year_months(left, right) + fn sub_year_month(left: i64, right: i32, tz: Tz) -> Option { + Self::subtract_year_months(left, right, tz) } - fn sub_day_time(left: i64, right: i64) -> Result { - Self::subtract_day_time(left, right) + fn sub_day_time(left: i64, right: i64, tz: Tz) -> Option { + Self::subtract_day_time(left, right, tz) } - fn sub_month_day_nano(left: i64, right: i128) -> Result { - Self::subtract_month_day_nano(left, right) + fn sub_month_day_nano(left: i64, right: i128, tz: Tz) -> Option { + Self::subtract_month_day_nano(left, right, tz) } } }; @@ -401,8 +402,9 @@ fn timestamp_op( use DataType::*; use IntervalUnit::*; - // Note: interval arithmetic should account for timezones (#4457) let l = l.as_primitive::(); + let l_tz: Tz = l.timezone().unwrap_or("+00:00").parse()?; + let array: PrimitiveArray = match (op, r.data_type()) { (Op::Sub | Op::SubWrapping, Timestamp(unit, _)) if unit == &T::UNIT => { let r = r.as_primitive::(); @@ -420,29 +422,77 @@ fn timestamp_op( (Op::Add | Op::AddWrapping, Interval(YearMonth)) => { let r = r.as_primitive::(); - try_op!(l, l_s, r, r_s, T::add_year_month(l, r)) + try_op!( + l, + l_s, + r, + r_s, + T::add_year_month(l, r, l_tz).ok_or(ArrowError::ComputeError( + "Timestamp out of range".to_string() + )) + ) } (Op::Sub | Op::SubWrapping, Interval(YearMonth)) => { let r = r.as_primitive::(); - try_op!(l, l_s, r, r_s, T::sub_year_month(l, r)) + try_op!( + l, + l_s, + r, + r_s, + T::sub_year_month(l, r, l_tz).ok_or(ArrowError::ComputeError( + "Timestamp out of range".to_string() + )) + ) } (Op::Add | Op::AddWrapping, Interval(DayTime)) => { let r = r.as_primitive::(); - try_op!(l, l_s, r, r_s, T::add_day_time(l, r)) + try_op!( + l, + l_s, + r, + r_s, + T::add_day_time(l, r, l_tz).ok_or(ArrowError::ComputeError( + "Timestamp out of range".to_string() + )) + ) } (Op::Sub | Op::SubWrapping, Interval(DayTime)) => { let r = r.as_primitive::(); - try_op!(l, l_s, r, r_s, T::sub_day_time(l, r)) + try_op!( + l, + l_s, + r, + r_s, + T::sub_day_time(l, r, l_tz).ok_or(ArrowError::ComputeError( + "Timestamp out of range".to_string() + )) + ) } (Op::Add | Op::AddWrapping, Interval(MonthDayNano)) => { let r = r.as_primitive::(); - try_op!(l, l_s, r, r_s, T::add_month_day_nano(l, r)) + try_op!( + l, + l_s, + r, + r_s, + T::add_month_day_nano(l, r, l_tz).ok_or(ArrowError::ComputeError( + "Timestamp out of range".to_string() + )) + ) } (Op::Sub | Op::SubWrapping, Interval(MonthDayNano)) => { let r = r.as_primitive::(); - try_op!(l, l_s, r, r_s, T::sub_month_day_nano(l, r)) + try_op!( + l, + l_s, + r, + r_s, + T::sub_month_day_nano(l, r, l_tz).ok_or(ArrowError::ComputeError( + "Timestamp out of range".to_string() + )) + ) } _ => { return Err(ArrowError::InvalidArgumentError(format!( @@ -803,9 +853,11 @@ fn decimal_op( #[cfg(test)] mod tests { use 
super::*; - use arrow_array::temporal_conversions::{as_date, as_datetime}; + use arrow_array::temporal_conversions::{ + as_date, as_datetime, as_datetime_with_timezone, + }; use arrow_buffer::{i256, ScalarBuffer}; - use chrono::{DateTime, NaiveDate}; + use chrono::{DateTime, NaiveDate, TimeZone}; fn test_neg_primitive( input: &[T::Native], @@ -1472,4 +1524,148 @@ mod tests { "Compute error: Overflow happened on: 9223372036854775807 - -1" ); } + + fn test_timestamp_with_timezone_impl(tz_str: &str) { + let tz: Tz = tz_str.parse().unwrap(); + + let transform_array = |x: &dyn Array| -> Vec> { + x.as_primitive::() + .values() + .into_iter() + .map(|x| as_datetime_with_timezone::(*x, tz).unwrap()) + .collect() + }; + + let values = vec![ + tz.with_ymd_and_hms(1970, 1, 28, 23, 0, 0) + .unwrap() + .naive_utc(), + tz.with_ymd_and_hms(1970, 1, 1, 0, 0, 0) + .unwrap() + .naive_utc(), + tz.with_ymd_and_hms(2010, 4, 1, 4, 0, 20) + .unwrap() + .naive_utc(), + tz.with_ymd_and_hms(1960, 1, 30, 4, 23, 20) + .unwrap() + .naive_utc(), + tz.with_ymd_and_hms(2023, 3, 25, 14, 0, 0) + .unwrap() + .naive_utc(), + ] + .into_iter() + .map(|x| T::make_value(x).unwrap()) + .collect(); + + let a = PrimitiveArray::::new(values, None).with_timezone(tz_str); + + // IntervalYearMonth + let b = IntervalYearMonthArray::from(vec![ + IntervalYearMonthType::make_value(0, 1), + IntervalYearMonthType::make_value(5, 34), + IntervalYearMonthType::make_value(-2, 4), + IntervalYearMonthType::make_value(7, -4), + IntervalYearMonthType::make_value(0, 1), + ]); + let r1 = add(&a, &b).unwrap(); + assert_eq!( + &transform_array(r1.as_ref()), + &[ + tz.with_ymd_and_hms(1970, 2, 28, 23, 0, 0).unwrap(), + tz.with_ymd_and_hms(1977, 11, 1, 0, 0, 0).unwrap(), + tz.with_ymd_and_hms(2008, 8, 1, 4, 0, 20).unwrap(), + tz.with_ymd_and_hms(1966, 9, 30, 4, 23, 20).unwrap(), + tz.with_ymd_and_hms(2023, 4, 25, 14, 0, 0).unwrap(), + ] + ); + + let r2 = sub(&r1, &b).unwrap(); + assert_eq!(r2.as_ref(), &a); + + // IntervalDayTime + let b = IntervalDayTimeArray::from(vec![ + IntervalDayTimeType::make_value(0, 0), + IntervalDayTimeType::make_value(5, 454000), + IntervalDayTimeType::make_value(-34, 0), + IntervalDayTimeType::make_value(7, -4000), + IntervalDayTimeType::make_value(1, 0), + ]); + let r3 = add(&a, &b).unwrap(); + assert_eq!( + &transform_array(r3.as_ref()), + &[ + tz.with_ymd_and_hms(1970, 1, 28, 23, 0, 0).unwrap(), + tz.with_ymd_and_hms(1970, 1, 6, 0, 7, 34).unwrap(), + tz.with_ymd_and_hms(2010, 2, 26, 4, 0, 20).unwrap(), + tz.with_ymd_and_hms(1960, 2, 6, 4, 23, 16).unwrap(), + tz.with_ymd_and_hms(2023, 3, 26, 14, 0, 0).unwrap(), + ] + ); + + let r4 = sub(&r3, &b).unwrap(); + assert_eq!(r4.as_ref(), &a); + + // IntervalMonthDayNano + let b = IntervalMonthDayNanoArray::from(vec![ + IntervalMonthDayNanoType::make_value(1, 0, 0), + IntervalMonthDayNanoType::make_value(344, 34, -43_000_000_000), + IntervalMonthDayNanoType::make_value(-593, -33, 13_000_000_000), + IntervalMonthDayNanoType::make_value(5, 2, 493_000_000_000), + IntervalMonthDayNanoType::make_value(1, 0, 0), + ]); + let r5 = add(&a, &b).unwrap(); + assert_eq!( + &transform_array(r5.as_ref()), + &[ + tz.with_ymd_and_hms(1970, 2, 28, 23, 0, 0).unwrap(), + tz.with_ymd_and_hms(1998, 10, 4, 23, 59, 17).unwrap(), + tz.with_ymd_and_hms(1960, 9, 29, 4, 0, 33).unwrap(), + tz.with_ymd_and_hms(1960, 7, 2, 4, 31, 33).unwrap(), + tz.with_ymd_and_hms(2023, 4, 25, 14, 0, 0).unwrap(), + ] + ); + + let r6 = sub(&r5, &b).unwrap(); + assert_eq!( + &transform_array(r6.as_ref()), + &[ + 
tz.with_ymd_and_hms(1970, 1, 28, 23, 0, 0).unwrap(), + tz.with_ymd_and_hms(1970, 1, 2, 0, 0, 0).unwrap(), + tz.with_ymd_and_hms(2010, 4, 2, 4, 0, 20).unwrap(), + tz.with_ymd_and_hms(1960, 1, 31, 4, 23, 20).unwrap(), + tz.with_ymd_and_hms(2023, 3, 25, 14, 0, 0).unwrap(), + ] + ); + } + + #[cfg(not(feature = "chrono-tz"))] + #[test] + fn test_timestamp_with_timezone() { + let timezones = ["+00:00", "+01:00", "-01:00", "+03:30"]; + for timezone in timezones { + test_timestamp_with_timezone_impl::(timezone); + test_timestamp_with_timezone_impl::(timezone); + test_timestamp_with_timezone_impl::(timezone); + test_timestamp_with_timezone_impl::(timezone); + } + } + + #[cfg(feature = "chrono-tz")] + #[test] + fn test_timestamp_with_timezone() { + let timezones = [ + "Europe/Paris", + "Europe/London", + "Africa/Bamako", + "America/Dominica", + "Asia/Seoul", + "Asia/Shanghai", + ]; + for timezone in timezones { + test_timestamp_with_timezone_impl::(timezone); + test_timestamp_with_timezone_impl::(timezone); + test_timestamp_with_timezone_impl::(timezone); + test_timestamp_with_timezone_impl::(timezone); + } + } } diff --git a/arrow-array/src/delta.rs b/arrow-array/src/delta.rs index 029168242b90..bf9ee5ca685f 100644 --- a/arrow-array/src/delta.rs +++ b/arrow-array/src/delta.rs @@ -23,22 +23,74 @@ // Copied from chronoutil crate //! Contains utility functions for shifting Date objects. -use chrono::{Datelike, Months}; +use chrono::{DateTime, Datelike, Days, Months, TimeZone}; use std::cmp::Ordering; /// Shift a date by the given number of months. -pub(crate) fn shift_months< - D: Datelike - + std::ops::Add - + std::ops::Sub, ->( - date: D, - months: i32, -) -> D { +pub(crate) fn shift_months(date: D, months: i32) -> D +where + D: Datelike + std::ops::Add + std::ops::Sub, +{ match months.cmp(&0) { Ordering::Equal => date, Ordering::Greater => date + Months::new(months as u32), - Ordering::Less => date - Months::new(-months as u32), + Ordering::Less => date - Months::new(months.unsigned_abs()), + } +} + +/// Add the given number of months to the given datetime. +/// +/// Returns `None` when it will result in overflow. +pub(crate) fn add_months_datetime( + dt: DateTime, + months: i32, +) -> Option> { + match months.cmp(&0) { + Ordering::Equal => Some(dt), + Ordering::Greater => dt.checked_add_months(Months::new(months as u32)), + Ordering::Less => dt.checked_sub_months(Months::new(months.unsigned_abs())), + } +} + +/// Add the given number of days to the given datetime. +/// +/// Returns `None` when it will result in overflow. +pub(crate) fn add_days_datetime( + dt: DateTime, + days: i32, +) -> Option> { + match days.cmp(&0) { + Ordering::Equal => Some(dt), + Ordering::Greater => dt.checked_add_days(Days::new(days as u64)), + Ordering::Less => dt.checked_sub_days(Days::new(days.unsigned_abs() as u64)), + } +} + +/// Substract the given number of months to the given datetime. +/// +/// Returns `None` when it will result in overflow. +pub(crate) fn sub_months_datetime( + dt: DateTime, + months: i32, +) -> Option> { + match months.cmp(&0) { + Ordering::Equal => Some(dt), + Ordering::Greater => dt.checked_sub_months(Months::new(months as u32)), + Ordering::Less => dt.checked_add_months(Months::new(months.unsigned_abs())), + } +} + +/// Substract the given number of days to the given datetime. +/// +/// Returns `None` when it will result in overflow. 
+pub(crate) fn sub_days_datetime( + dt: DateTime, + days: i32, +) -> Option> { + match days.cmp(&0) { + Ordering::Equal => Some(dt), + Ordering::Greater => dt.checked_sub_days(Days::new(days as u64)), + Ordering::Less => dt.checked_add_days(Days::new(days.unsigned_abs() as u64)), } } diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 3d14cff384b8..769dbf974b93 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -17,7 +17,12 @@ //! Zero-sized types used to parameterize generic array implementations -use crate::delta::shift_months; +use crate::delta::{ + add_days_datetime, add_months_datetime, shift_months, sub_days_datetime, + sub_months_datetime, +}; +use crate::temporal_conversions::as_datetime_with_timezone; +use crate::timezone::Tz; use crate::{ArrowNativeTypeOp, OffsetSizeTrait}; use arrow_buffer::{i256, Buffer, OffsetBuffer}; use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision}; @@ -350,158 +355,184 @@ impl ArrowTimestampType for TimestampNanosecondType { } } +fn add_year_months( + timestamp: ::Native, + delta: ::Native, + tz: Tz, +) -> Option<::Native> { + let months = IntervalYearMonthType::to_months(delta); + let res = as_datetime_with_timezone::(timestamp, tz)?; + let res = add_months_datetime(res, months)?; + let res = res.naive_utc(); + T::make_value(res) +} + +fn add_day_time( + timestamp: ::Native, + delta: ::Native, + tz: Tz, +) -> Option<::Native> { + let (days, ms) = IntervalDayTimeType::to_parts(delta); + let res = as_datetime_with_timezone::(timestamp, tz)?; + let res = add_days_datetime(res, days)?; + let res = res.checked_add_signed(Duration::milliseconds(ms as i64))?; + let res = res.naive_utc(); + T::make_value(res) +} + +fn add_month_day_nano( + timestamp: ::Native, + delta: ::Native, + tz: Tz, +) -> Option<::Native> { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); + let res = as_datetime_with_timezone::(timestamp, tz)?; + let res = add_months_datetime(res, months)?; + let res = add_days_datetime(res, days)?; + let res = res.checked_add_signed(Duration::nanoseconds(nanos))?; + let res = res.naive_utc(); + T::make_value(res) +} + +fn subtract_year_months( + timestamp: ::Native, + delta: ::Native, + tz: Tz, +) -> Option<::Native> { + let months = IntervalYearMonthType::to_months(delta); + let res = as_datetime_with_timezone::(timestamp, tz)?; + let res = sub_months_datetime(res, months)?; + let res = res.naive_utc(); + T::make_value(res) +} + +fn subtract_day_time( + timestamp: ::Native, + delta: ::Native, + tz: Tz, +) -> Option<::Native> { + let (days, ms) = IntervalDayTimeType::to_parts(delta); + let res = as_datetime_with_timezone::(timestamp, tz)?; + let res = sub_days_datetime(res, days)?; + let res = res.checked_sub_signed(Duration::milliseconds(ms as i64))?; + let res = res.naive_utc(); + T::make_value(res) +} + +fn subtract_month_day_nano( + timestamp: ::Native, + delta: ::Native, + tz: Tz, +) -> Option<::Native> { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); + let res = as_datetime_with_timezone::(timestamp, tz)?; + let res = sub_months_datetime(res, months)?; + let res = sub_days_datetime(res, days)?; + let res = res.checked_sub_signed(Duration::nanoseconds(nanos))?; + let res = res.naive_utc(); + T::make_value(res) +} + impl TimestampSecondType { - /// Adds the given IntervalYearMonthType to an arrow TimestampSecondType + /// Adds the given IntervalYearMonthType to an arrow TimestampSecondType. 
+ /// + /// Returns `None` when it will result in overflow. /// /// # Arguments /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn add_year_months( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> { - let prior = NaiveDateTime::from_timestamp_opt(timestamp, 0).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - - let months = IntervalYearMonthType::to_months(delta); - let posterior = shift_months(prior, months); - TimestampSecondType::make_value(posterior) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + add_year_months::(timestamp, delta, tz) } - /// Adds the given IntervalDayTimeType to an arrow TimestampSecondType + /// Adds the given IntervalDayTimeType to an arrow TimestampSecondType. + /// + /// Returns `None` when it will result in overflow. /// /// # Arguments /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn add_day_time( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> { - let (days, ms) = IntervalDayTimeType::to_parts(delta); - let res = NaiveDateTime::from_timestamp_opt(timestamp, 0).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_add_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_add_signed(Duration::milliseconds(ms as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampSecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + add_day_time::(timestamp, delta, tz) } /// Adds the given IntervalMonthDayNanoType to an arrow TimestampSecondType /// + /// Returns `None` when it will result in overflow. /// # Arguments /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn add_month_day_nano( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> { - let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); - let res = NaiveDateTime::from_timestamp_opt(timestamp, 0).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = shift_months(res, months); - let res = res - .checked_add_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_add_signed(Duration::nanoseconds(nanos)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampSecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + add_month_day_nano::(timestamp, delta, tz) } /// Subtracts the given IntervalYearMonthType to an arrow TimestampSecondType /// + /// Returns `None` when it will result in overflow. 
+ /// /// # Arguments /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn subtract_year_months( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> { - let prior = NaiveDateTime::from_timestamp_opt(timestamp, 0).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let months = IntervalYearMonthType::to_months(-delta); - let posterior = shift_months(prior, months); - TimestampSecondType::make_value(posterior) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + subtract_year_months::(timestamp, delta, tz) } /// Subtracts the given IntervalDayTimeType to an arrow TimestampSecondType /// + /// Returns `None` when it will result in overflow. + /// /// # Arguments /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn subtract_day_time( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> { - let (days, ms) = IntervalDayTimeType::to_parts(delta); - let res = NaiveDateTime::from_timestamp_opt(timestamp, 0).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_sub_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_sub_signed(Duration::milliseconds(ms as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampSecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + subtract_day_time::(timestamp, delta, tz) } /// Subtracts the given IntervalMonthDayNanoType to an arrow TimestampSecondType /// + /// Returns `None` when it will result in overflow. 
+ /// /// # Arguments /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn subtract_month_day_nano( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> { - let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); - let res = NaiveDateTime::from_timestamp_opt(timestamp, 0).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = shift_months(res, -months); - let res = res - .checked_sub_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_sub_signed(Duration::nanoseconds(nanos)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampSecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + subtract_month_day_nano::(timestamp, delta, tz) } } @@ -512,18 +543,13 @@ impl TimestampMicrosecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn add_year_months( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> - { - let prior = NaiveDateTime::from_timestamp_micros(timestamp).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let months = IntervalYearMonthType::to_months(delta); - let posterior = shift_months(prior, months); - TimestampMicrosecondType::make_value(posterior) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + add_year_months::(timestamp, delta, tz) } /// Adds the given IntervalDayTimeType to an arrow TimestampMicrosecondType @@ -532,27 +558,13 @@ impl TimestampMicrosecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn add_day_time( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> - { - let (days, ms) = IntervalDayTimeType::to_parts(delta); - let res = NaiveDateTime::from_timestamp_micros(timestamp).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_add_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_add_signed(Duration::milliseconds(ms as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampMicrosecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + add_day_time::(timestamp, delta, tz) } /// Adds the given IntervalMonthDayNanoType to an arrow TimestampMicrosecondType @@ -561,28 +573,13 @@ impl TimestampMicrosecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn add_month_day_nano( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> - { - let (months, days, nanos) = 
IntervalMonthDayNanoType::to_parts(delta); - let res = NaiveDateTime::from_timestamp_micros(timestamp).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = shift_months(res, months); - let res = res - .checked_add_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_add_signed(Duration::nanoseconds(nanos)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampMicrosecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + add_month_day_nano::(timestamp, delta, tz) } /// Subtracts the given IntervalYearMonthType to an arrow TimestampMicrosecondType @@ -591,18 +588,13 @@ impl TimestampMicrosecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn subtract_year_months( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> - { - let prior = NaiveDateTime::from_timestamp_micros(timestamp).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let months = IntervalYearMonthType::to_months(-delta); - let posterior = shift_months(prior, months); - TimestampMicrosecondType::make_value(posterior) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + subtract_year_months::(timestamp, delta, tz) } /// Subtracts the given IntervalDayTimeType to an arrow TimestampMicrosecondType @@ -611,27 +603,13 @@ impl TimestampMicrosecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn subtract_day_time( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> - { - let (days, ms) = IntervalDayTimeType::to_parts(delta); - let res = NaiveDateTime::from_timestamp_micros(timestamp).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_sub_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_sub_signed(Duration::milliseconds(ms as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampMicrosecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + subtract_day_time::(timestamp, delta, tz) } /// Subtracts the given IntervalMonthDayNanoType to an arrow TimestampMicrosecondType @@ -640,28 +618,13 @@ impl TimestampMicrosecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn subtract_month_day_nano( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> - { - let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); - let res = NaiveDateTime::from_timestamp_micros(timestamp).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = shift_months(res, -months); - let res = res - 
.checked_sub_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_sub_signed(Duration::nanoseconds(nanos)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampMicrosecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + subtract_month_day_nano::(timestamp, delta, tz) } } @@ -672,18 +635,13 @@ impl TimestampMillisecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn add_year_months( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> - { - let prior = NaiveDateTime::from_timestamp_millis(timestamp).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let months = IntervalYearMonthType::to_months(delta); - let posterior = shift_months(prior, months); - TimestampMillisecondType::make_value(posterior) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + add_year_months::(timestamp, delta, tz) } /// Adds the given IntervalDayTimeType to an arrow TimestampMillisecondType @@ -692,27 +650,13 @@ impl TimestampMillisecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn add_day_time( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> - { - let (days, ms) = IntervalDayTimeType::to_parts(delta); - let res = NaiveDateTime::from_timestamp_millis(timestamp).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_add_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_add_signed(Duration::milliseconds(ms as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampMillisecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + add_day_time::(timestamp, delta, tz) } /// Adds the given IntervalMonthDayNanoType to an arrow TimestampMillisecondType @@ -721,28 +665,13 @@ impl TimestampMillisecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn add_month_day_nano( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> - { - let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); - let res = NaiveDateTime::from_timestamp_millis(timestamp).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = shift_months(res, months); - let res = res - .checked_add_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_add_signed(Duration::nanoseconds(nanos)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampMillisecondType::make_value(res) - .ok_or_else(|| 
ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + add_month_day_nano::(timestamp, delta, tz) } /// Subtracts the given IntervalYearMonthType to an arrow TimestampMillisecondType @@ -751,18 +680,13 @@ impl TimestampMillisecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn subtract_year_months( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> - { - let prior = NaiveDateTime::from_timestamp_millis(timestamp).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let months = IntervalYearMonthType::to_months(-delta); - let posterior = shift_months(prior, months); - TimestampMillisecondType::make_value(posterior) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + subtract_year_months::(timestamp, delta, tz) } /// Subtracts the given IntervalDayTimeType to an arrow TimestampMillisecondType @@ -771,27 +695,13 @@ impl TimestampMillisecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn subtract_day_time( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> - { - let (days, ms) = IntervalDayTimeType::to_parts(delta); - let res = NaiveDateTime::from_timestamp_millis(timestamp).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_sub_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_sub_signed(Duration::milliseconds(ms as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampMillisecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + subtract_day_time::(timestamp, delta, tz) } /// Subtracts the given IntervalMonthDayNanoType to an arrow TimestampMillisecondType @@ -800,28 +710,13 @@ impl TimestampMillisecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn subtract_month_day_nano( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> - { - let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); - let res = NaiveDateTime::from_timestamp_millis(timestamp).ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = shift_months(res, -months); - let res = res - .checked_sub_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_sub_signed(Duration::nanoseconds(nanos)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampMillisecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + subtract_month_day_nano::(timestamp, delta, tz) } } @@ -832,19 +727,13 @@ impl TimestampNanosecondType { /// /// * `timestamp` - The date on which to perform 
the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn add_year_months( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> { - let seconds = timestamp / 1_000_000_000; - let nanos = timestamp % 1_000_000_000; - let prior = NaiveDateTime::from_timestamp_opt(seconds, nanos as u32).ok_or_else( - || ArrowError::ComputeError("Timestamp out of range".to_string()), - )?; - let months = IntervalYearMonthType::to_months(delta); - let posterior = shift_months(prior, months); - TimestampNanosecondType::make_value(posterior) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + add_year_months::(timestamp, delta, tz) } /// Adds the given IntervalDayTimeType to an arrow TimestampNanosecondType @@ -853,28 +742,13 @@ impl TimestampNanosecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn add_day_time( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> { - let (days, ms) = IntervalDayTimeType::to_parts(delta); - let seconds = timestamp / 1_000_000_000; - let nanos = timestamp % 1_000_000_000; - let res = NaiveDateTime::from_timestamp_opt(seconds, nanos as u32).ok_or_else( - || ArrowError::ComputeError("Timestamp out of range".to_string()), - )?; - let res = res - .checked_add_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_add_signed(Duration::milliseconds(ms as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampNanosecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + add_day_time::(timestamp, delta, tz) } /// Adds the given IntervalMonthDayNanoType to an arrow TimestampNanosecondType @@ -883,114 +757,58 @@ impl TimestampNanosecondType { /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn add_month_day_nano( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> { - let seconds = timestamp / 1_000_000_000; - let nanos = timestamp % 1_000_000_000; - let res = NaiveDateTime::from_timestamp_opt(seconds, nanos as u32).ok_or_else( - || ArrowError::ComputeError("Timestamp out of range".to_string()), - )?; + tz: Tz, + ) -> Option<::Native> { + add_month_day_nano::(timestamp, delta, tz) + } - let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); - let res = shift_months(res, months); - let res = res - .checked_add_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_add_signed(Duration::nanoseconds(nanos)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampNanosecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) - } - - /// Subtracs the given IntervalYearMonthType to an arrow TimestampNanosecondType + /// Subtracts the given IntervalYearMonthType to an arrow TimestampNanosecondType /// /// # Arguments /// /// * `timestamp` - The 
date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn subtract_year_months( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> { - let seconds = timestamp / 1_000_000_000; - let nanos = timestamp % 1_000_000_000; - let prior = NaiveDateTime::from_timestamp_opt(seconds, nanos as u32).ok_or_else( - || ArrowError::ComputeError("Timestamp out of range".to_string()), - )?; - let months = IntervalYearMonthType::to_months(-delta); - let posterior = shift_months(prior, months); - TimestampNanosecondType::make_value(posterior) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + subtract_year_months::(timestamp, delta, tz) } - /// Subtracs the given IntervalDayTimeType to an arrow TimestampNanosecondType + /// Subtracts the given IntervalDayTimeType to an arrow TimestampNanosecondType /// /// # Arguments /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn subtract_day_time( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> { - let seconds = timestamp / 1_000_000_000; - let nanos = timestamp % 1_000_000_000; - let res = NaiveDateTime::from_timestamp_opt(seconds, nanos as u32).ok_or_else( - || ArrowError::ComputeError("Timestamp out of range".to_string()), - )?; + tz: Tz, + ) -> Option<::Native> { + subtract_day_time::(timestamp, delta, tz) + } - let (days, ms) = IntervalDayTimeType::to_parts(delta); - let res = res - .checked_sub_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_sub_signed(Duration::milliseconds(ms as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampNanosecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) - } - - /// Subtracs the given IntervalMonthDayNanoType to an arrow TimestampNanosecondType + /// Subtracts the given IntervalMonthDayNanoType to an arrow TimestampNanosecondType /// /// # Arguments /// /// * `timestamp` - The date on which to perform the operation /// * `delta` - The interval to add + /// * `tz` - The timezone in which to interpret `timestamp` pub fn subtract_month_day_nano( - timestamp: ::Native, + timestamp: ::Native, delta: ::Native, - ) -> Result<::Native, ArrowError> { - let seconds = timestamp / 1_000_000_000; - let nanos = timestamp % 1_000_000_000; - let res = NaiveDateTime::from_timestamp_opt(seconds, nanos as u32).ok_or_else( - || ArrowError::ComputeError("Timestamp out of range".to_string()), - )?; - - let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(delta); - let res = shift_months(res, -months); - let res = res - .checked_sub_signed(Duration::days(days as i64)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - let res = res - .checked_sub_signed(Duration::nanoseconds(nanos)) - .ok_or_else(|| { - ArrowError::ComputeError("Timestamp out of range".to_string()) - })?; - TimestampNanosecondType::make_value(res) - .ok_or_else(|| ArrowError::ComputeError("Timestamp out of range".to_string())) + tz: Tz, + ) -> Option<::Native> { + subtract_month_day_nano::(timestamp, delta, tz) } } From 
bff6155d38e19bfe62a776731b78b435560f2c8e Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Wed, 26 Jul 2023 08:55:30 -0700 Subject: [PATCH 1089/1411] Make object_store::multipart public (#4570) * Make object_store::multipart public * one more public * docs * doc * more docs * derive debug * debug --- object_store/src/lib.rs | 4 ++-- object_store/src/multipart.rs | 30 +++++++++++++++++++++++++++--- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 94261e7d421c..082dca293571 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -250,8 +250,8 @@ pub use client::{backoff::BackoffConfig, retry::RetryConfig, CredentialProvider} #[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] mod config; -#[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))] -mod multipart; +#[cfg(feature = "cloud")] +pub mod multipart; mod parse; mod util; diff --git a/object_store/src/multipart.rs b/object_store/src/multipart.rs index 26580307053e..5f9b7e67488f 100644 --- a/object_store/src/multipart.rs +++ b/object_store/src/multipart.rs @@ -15,6 +15,12 @@ // specific language governing permissions and limitations // under the License. +//! Cloud Multipart Upload +//! +//! This crate provides an asynchronous interface for multipart file uploads to cloud storage services. +//! It's designed to offer efficient, non-blocking operations, +//! especially useful when dealing with large files or high-throughput systems. + use async_trait::async_trait; use futures::{stream::FuturesUnordered, Future, StreamExt}; use std::{io, pin::Pin, sync::Arc, task::Poll}; @@ -28,7 +34,7 @@ type BoxedTryFuture = Pin> + Sen /// and used in combination with [`CloudMultiPartUpload`] to provide /// multipart upload support #[async_trait] -pub(crate) trait CloudMultiPartUploadImpl: 'static { +pub trait CloudMultiPartUploadImpl: 'static { /// Upload a single part async fn put_multipart_part( &self, @@ -42,12 +48,15 @@ pub(crate) trait CloudMultiPartUploadImpl: 'static { async fn complete(&self, completed_parts: Vec) -> Result<(), io::Error>; } +/// Represents a part of a file that has been successfully uploaded in a multipart upload process. #[derive(Debug, Clone)] -pub(crate) struct UploadPart { +pub struct UploadPart { + /// Id of this part pub content_id: String, } -pub(crate) struct CloudMultiPartUpload +/// Struct that manages and controls multipart uploads to a cloud storage service. 
+pub struct CloudMultiPartUpload where T: CloudMultiPartUploadImpl, { @@ -75,6 +84,7 @@ impl CloudMultiPartUpload where T: CloudMultiPartUploadImpl, { + /// Create a new multipart upload with the implementation and the given maximum concurrency pub fn new(inner: T, max_concurrency: usize) -> Self { Self { inner: Arc::new(inner), @@ -103,6 +113,7 @@ where to_copy } + /// Poll current tasks pub fn poll_tasks( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, @@ -257,3 +268,16 @@ where Pin::new(completion_task).poll(cx) } } + +impl std::fmt::Debug for CloudMultiPartUpload { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CloudMultiPartUpload") + .field("completed_parts", &self.completed_parts) + .field("tasks", &self.tasks) + .field("max_concurrency", &self.max_concurrency) + .field("current_buffer", &self.current_buffer) + .field("part_size", &self.part_size) + .field("current_part_idx", &self.current_part_idx) + .finish() + } +} From 96886303dfa681fa0ae0910c651d50b2cf15015b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 26 Jul 2023 13:27:00 -0400 Subject: [PATCH 1090/1411] Fix interval to duration casting (#4553) (#4562) * Fix interval to duration casting (#4553) * Clippy * Review feedback --- arrow-cast/src/cast.rs | 231 ++++++++++++++--------------------------- 1 file changed, 78 insertions(+), 153 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 3a5c27fb6082..2ee8c51b0aa6 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -447,20 +447,11 @@ fn cast_interval_day_time_to_interval_month_day_nano( } /// Cast the array from interval to duration -fn cast_interval_to_duration>( +fn cast_month_day_nano_to_duration>( array: &dyn Array, cast_options: &CastOptions, ) -> Result { - let array = array - .as_any() - .downcast_ref::() - .ok_or_else(|| { - ArrowError::ComputeError( - "Internal Error: Cannot cast interval to IntervalArray of expected type" - .to_string(), - ) - })?; - + let array = array.as_primitive::(); let scale = match D::DATA_TYPE { DataType::Duration(TimeUnit::Second) => 1_000_000_000, DataType::Duration(TimeUnit::Millisecond) => 1_000_000, @@ -470,16 +461,9 @@ fn cast_interval_to_duration>( }; if cast_options.safe { - let iter = array.iter().map(|v| { - v.and_then(|v| { - let v = v / scale; - if v > i64::MAX as i128 { - None - } else { - Some(v as i64) - } - }) - }); + let iter = array + .iter() + .map(|v| v.and_then(|v| (v >> 64 == 0).then_some((v as i64) / scale))); Ok(Arc::new(unsafe { PrimitiveArray::::from_trusted_len_iter(iter) })) @@ -487,17 +471,9 @@ fn cast_interval_to_duration>( let vec = array .iter() .map(|v| { - v.map(|v| { - let v = v / scale; - if v > i64::MAX as i128 { - Err(ArrowError::ComputeError(format!( - "Cannot cast to {:?}. 
Overflowing on {:?}", - D::DATA_TYPE, - v - ))) - } else { - Ok(v as i64) - } + v.map(|v| match v >> 64 { + 0 => Ok((v as i64) / scale), + _ => Err(ArrowError::ComputeError("Cannot convert interval containing non-zero months or days to duration".to_string())) }) .transpose() }) @@ -2174,16 +2150,16 @@ pub fn cast_with_options( cast_duration_to_interval::(array, cast_options) } (Interval(IntervalUnit::MonthDayNano), Duration(TimeUnit::Second)) => { - cast_interval_to_duration::(array, cast_options) + cast_month_day_nano_to_duration::(array, cast_options) } (Interval(IntervalUnit::MonthDayNano), Duration(TimeUnit::Millisecond)) => { - cast_interval_to_duration::(array, cast_options) + cast_month_day_nano_to_duration::(array, cast_options) } (Interval(IntervalUnit::MonthDayNano), Duration(TimeUnit::Microsecond)) => { - cast_interval_to_duration::(array, cast_options) + cast_month_day_nano_to_duration::(array, cast_options) } (Interval(IntervalUnit::MonthDayNano), Duration(TimeUnit::Nanosecond)) => { - cast_interval_to_duration::(array, cast_options) + cast_month_day_nano_to_duration::(array, cast_options) } (Interval(IntervalUnit::YearMonth), Interval(IntervalUnit::MonthDayNano)) => { cast_interval_year_month_to_interval_month_day_nano(array, cast_options) @@ -8624,29 +8600,16 @@ mod tests { } /// helper function to test casting from duration to interval - fn cast_from_duration_to_interval( + fn cast_from_duration_to_interval>( array: Vec, cast_options: &CastOptions, - ) -> Result, ArrowError> - where - arrow_array::PrimitiveArray: From>, - { - let array = PrimitiveArray::::from(array); + ) -> Result, ArrowError> { + let array = PrimitiveArray::::new(array.into(), None); let array = Arc::new(array) as ArrayRef; - let casted_array = cast_with_options( - &array, - &DataType::Interval(IntervalUnit::MonthDayNano), - cast_options, - )?; - casted_array - .as_any() - .downcast_ref::() - .ok_or_else(|| { - ArrowError::ComputeError( - "Failed to downcast to IntervalMonthDayNanoArray".to_string(), - ) - }) - .cloned() + let interval = DataType::Interval(IntervalUnit::MonthDayNano); + let out = cast_with_options(&array, &interval, cast_options)?; + let out = out.as_primitive::().clone(); + Ok(out) } #[test] @@ -8768,11 +8731,9 @@ mod tests { /// helper function to test casting from interval to duration fn cast_from_interval_to_duration( - array: Vec, + array: &IntervalMonthDayNanoArray, cast_options: &CastOptions, ) -> Result, ArrowError> { - let array = IntervalMonthDayNanoArray::from(array); - let array = Arc::new(array) as ArrayRef; let casted_array = cast_with_options(&array, &T::DATA_TYPE, cast_options)?; casted_array .as_any() @@ -8788,125 +8749,89 @@ mod tests { #[test] fn test_cast_from_interval_to_duration() { + let nullable = CastOptions::default(); + let fallible = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; + // from interval month day nano to duration second - let array = vec![1234567]; - let casted_array = cast_from_interval_to_duration::( - array, - &CastOptions::default(), - ) - .unwrap(); - assert_eq!( - casted_array.data_type(), - &DataType::Duration(TimeUnit::Second) - ); + let array = vec![1234567].into(); + let casted_array: DurationSecondArray = + cast_from_interval_to_duration(&array, &nullable).unwrap(); assert_eq!(casted_array.value(0), 0); - let array = vec![i128::MAX]; - let casted_array = cast_from_interval_to_duration::( - array.clone(), - &CastOptions::default(), - ) - .unwrap(); + let array = vec![i128::MAX].into(); + let casted_array: 
DurationSecondArray = + cast_from_interval_to_duration(&array, &nullable).unwrap(); assert!(!casted_array.is_valid(0)); - let casted_array = cast_from_interval_to_duration::( - array, - &CastOptions { - safe: false, - format_options: FormatOptions::default(), - }, - ); - assert!(casted_array.is_err()); + let res = cast_from_interval_to_duration::(&array, &fallible); + assert!(res.is_err()); // from interval month day nano to duration millisecond - let array = vec![1234567]; - let casted_array = cast_from_interval_to_duration::( - array, - &CastOptions::default(), - ) - .unwrap(); + let array = vec![1234567].into(); + let casted_array: DurationMillisecondArray = + cast_from_interval_to_duration(&array, &nullable).unwrap(); assert_eq!(casted_array.value(0), 1); - let array = vec![i128::MAX]; - let casted_array = cast_from_interval_to_duration::( - array.clone(), - &CastOptions::default(), - ) - .unwrap(); + let array = vec![i128::MAX].into(); + let casted_array: DurationMillisecondArray = + cast_from_interval_to_duration(&array, &nullable).unwrap(); assert!(!casted_array.is_valid(0)); - let casted_array = cast_from_interval_to_duration::( - array, - &CastOptions { - safe: false, - format_options: FormatOptions::default(), - }, - ); - assert!(casted_array.is_err()); + let res = + cast_from_interval_to_duration::(&array, &fallible); + assert!(res.is_err()); // from interval month day nano to duration microsecond - let array = vec![1234567]; - let casted_array = cast_from_interval_to_duration::( - array, - &CastOptions::default(), - ) - .unwrap(); - assert_eq!( - casted_array.data_type(), - &DataType::Duration(TimeUnit::Microsecond) - ); + let array = vec![1234567].into(); + let casted_array: DurationMicrosecondArray = + cast_from_interval_to_duration(&array, &nullable).unwrap(); assert_eq!(casted_array.value(0), 1234); - let array = vec![i128::MAX]; - let casted_array = cast_from_interval_to_duration::( - array.clone(), - &CastOptions::default(), - ) - .unwrap(); + let array = vec![i128::MAX].into(); + let casted_array = + cast_from_interval_to_duration::(&array, &nullable) + .unwrap(); assert!(!casted_array.is_valid(0)); - let casted_array = cast_from_interval_to_duration::( - array, - &CastOptions { - safe: false, - format_options: FormatOptions::default(), - }, - ); + let casted_array = + cast_from_interval_to_duration::(&array, &fallible); assert!(casted_array.is_err()); // from interval month day nano to duration nanosecond - let array = vec![1234567]; - let casted_array = cast_from_interval_to_duration::( - array, - &CastOptions::default(), - ) - .unwrap(); - assert_eq!( - casted_array.data_type(), - &DataType::Duration(TimeUnit::Nanosecond) - ); + let array = vec![1234567].into(); + let casted_array: DurationNanosecondArray = + cast_from_interval_to_duration(&array, &nullable).unwrap(); assert_eq!(casted_array.value(0), 1234567); - let array = vec![i128::MAX]; - let casted_array = cast_from_interval_to_duration::( - array.clone(), - &CastOptions::default(), - ) - .unwrap(); - assert_eq!( - casted_array.data_type(), - &DataType::Duration(TimeUnit::Nanosecond) - ); + let array = vec![i128::MAX].into(); + let casted_array: DurationNanosecondArray = + cast_from_interval_to_duration(&array, &nullable).unwrap(); assert!(!casted_array.is_valid(0)); - let casted_array = cast_from_interval_to_duration::( - array, - &CastOptions { - safe: false, - format_options: FormatOptions::default(), - }, - ); + let casted_array = + cast_from_interval_to_duration::(&array, &fallible); 
assert!(casted_array.is_err()); + + let array = vec![ + IntervalMonthDayNanoType::make_value(0, 1, 0), + IntervalMonthDayNanoType::make_value(-1, 0, 0), + IntervalMonthDayNanoType::make_value(1, 1, 0), + IntervalMonthDayNanoType::make_value(1, 0, 1), + IntervalMonthDayNanoType::make_value(0, 0, -1), + ] + .into(); + let casted_array = + cast_from_interval_to_duration::(&array, &nullable) + .unwrap(); + assert!(!casted_array.is_valid(0)); + assert!(!casted_array.is_valid(1)); + assert!(!casted_array.is_valid(2)); + assert!(!casted_array.is_valid(3)); + assert!(casted_array.is_valid(4)); + assert_eq!(casted_array.value(4), -1); } /// helper function to test casting from interval year month to interval month day nano From b27dc7e7419c26f460a851b129d5a26567114897 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 26 Jul 2023 16:19:09 -0400 Subject: [PATCH 1091/1411] Move chrono-tz arithmetic tests to integration (#4571) * Move chrono-tz arithmetic tests to integration * Clippy --- arrow-arith/Cargo.toml | 1 - arrow-arith/src/numeric.rs | 150 +--------------------------- arrow-arith/src/temporal.rs | 31 ------ arrow/Cargo.toml | 4 + arrow/tests/arithmetic.rs | 188 ++++++++++++++++++++++++++++++++++++ 5 files changed, 194 insertions(+), 180 deletions(-) create mode 100644 arrow/tests/arithmetic.rs diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index 6da472be6601..b5ea2e3c4354 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -46,4 +46,3 @@ num = { version = "0.4", default-features = false, features = ["std"] } [features] simd = ["arrow-array/simd"] -chrono-tz = ["arrow-array/chrono-tz"] diff --git a/arrow-arith/src/numeric.rs b/arrow-arith/src/numeric.rs index 7862fe2f9bea..c47731ed5125 100644 --- a/arrow-arith/src/numeric.rs +++ b/arrow-arith/src/numeric.rs @@ -853,11 +853,9 @@ fn decimal_op( #[cfg(test)] mod tests { use super::*; - use arrow_array::temporal_conversions::{ - as_date, as_datetime, as_datetime_with_timezone, - }; + use arrow_array::temporal_conversions::{as_date, as_datetime}; use arrow_buffer::{i256, ScalarBuffer}; - use chrono::{DateTime, NaiveDate, TimeZone}; + use chrono::{DateTime, NaiveDate}; fn test_neg_primitive( input: &[T::Native], @@ -1524,148 +1522,4 @@ mod tests { "Compute error: Overflow happened on: 9223372036854775807 - -1" ); } - - fn test_timestamp_with_timezone_impl(tz_str: &str) { - let tz: Tz = tz_str.parse().unwrap(); - - let transform_array = |x: &dyn Array| -> Vec> { - x.as_primitive::() - .values() - .into_iter() - .map(|x| as_datetime_with_timezone::(*x, tz).unwrap()) - .collect() - }; - - let values = vec![ - tz.with_ymd_and_hms(1970, 1, 28, 23, 0, 0) - .unwrap() - .naive_utc(), - tz.with_ymd_and_hms(1970, 1, 1, 0, 0, 0) - .unwrap() - .naive_utc(), - tz.with_ymd_and_hms(2010, 4, 1, 4, 0, 20) - .unwrap() - .naive_utc(), - tz.with_ymd_and_hms(1960, 1, 30, 4, 23, 20) - .unwrap() - .naive_utc(), - tz.with_ymd_and_hms(2023, 3, 25, 14, 0, 0) - .unwrap() - .naive_utc(), - ] - .into_iter() - .map(|x| T::make_value(x).unwrap()) - .collect(); - - let a = PrimitiveArray::::new(values, None).with_timezone(tz_str); - - // IntervalYearMonth - let b = IntervalYearMonthArray::from(vec![ - IntervalYearMonthType::make_value(0, 1), - IntervalYearMonthType::make_value(5, 34), - IntervalYearMonthType::make_value(-2, 4), - IntervalYearMonthType::make_value(7, -4), - IntervalYearMonthType::make_value(0, 1), - ]); - let r1 = add(&a, &b).unwrap(); - assert_eq!( - 
&transform_array(r1.as_ref()), - &[ - tz.with_ymd_and_hms(1970, 2, 28, 23, 0, 0).unwrap(), - tz.with_ymd_and_hms(1977, 11, 1, 0, 0, 0).unwrap(), - tz.with_ymd_and_hms(2008, 8, 1, 4, 0, 20).unwrap(), - tz.with_ymd_and_hms(1966, 9, 30, 4, 23, 20).unwrap(), - tz.with_ymd_and_hms(2023, 4, 25, 14, 0, 0).unwrap(), - ] - ); - - let r2 = sub(&r1, &b).unwrap(); - assert_eq!(r2.as_ref(), &a); - - // IntervalDayTime - let b = IntervalDayTimeArray::from(vec![ - IntervalDayTimeType::make_value(0, 0), - IntervalDayTimeType::make_value(5, 454000), - IntervalDayTimeType::make_value(-34, 0), - IntervalDayTimeType::make_value(7, -4000), - IntervalDayTimeType::make_value(1, 0), - ]); - let r3 = add(&a, &b).unwrap(); - assert_eq!( - &transform_array(r3.as_ref()), - &[ - tz.with_ymd_and_hms(1970, 1, 28, 23, 0, 0).unwrap(), - tz.with_ymd_and_hms(1970, 1, 6, 0, 7, 34).unwrap(), - tz.with_ymd_and_hms(2010, 2, 26, 4, 0, 20).unwrap(), - tz.with_ymd_and_hms(1960, 2, 6, 4, 23, 16).unwrap(), - tz.with_ymd_and_hms(2023, 3, 26, 14, 0, 0).unwrap(), - ] - ); - - let r4 = sub(&r3, &b).unwrap(); - assert_eq!(r4.as_ref(), &a); - - // IntervalMonthDayNano - let b = IntervalMonthDayNanoArray::from(vec![ - IntervalMonthDayNanoType::make_value(1, 0, 0), - IntervalMonthDayNanoType::make_value(344, 34, -43_000_000_000), - IntervalMonthDayNanoType::make_value(-593, -33, 13_000_000_000), - IntervalMonthDayNanoType::make_value(5, 2, 493_000_000_000), - IntervalMonthDayNanoType::make_value(1, 0, 0), - ]); - let r5 = add(&a, &b).unwrap(); - assert_eq!( - &transform_array(r5.as_ref()), - &[ - tz.with_ymd_and_hms(1970, 2, 28, 23, 0, 0).unwrap(), - tz.with_ymd_and_hms(1998, 10, 4, 23, 59, 17).unwrap(), - tz.with_ymd_and_hms(1960, 9, 29, 4, 0, 33).unwrap(), - tz.with_ymd_and_hms(1960, 7, 2, 4, 31, 33).unwrap(), - tz.with_ymd_and_hms(2023, 4, 25, 14, 0, 0).unwrap(), - ] - ); - - let r6 = sub(&r5, &b).unwrap(); - assert_eq!( - &transform_array(r6.as_ref()), - &[ - tz.with_ymd_and_hms(1970, 1, 28, 23, 0, 0).unwrap(), - tz.with_ymd_and_hms(1970, 1, 2, 0, 0, 0).unwrap(), - tz.with_ymd_and_hms(2010, 4, 2, 4, 0, 20).unwrap(), - tz.with_ymd_and_hms(1960, 1, 31, 4, 23, 20).unwrap(), - tz.with_ymd_and_hms(2023, 3, 25, 14, 0, 0).unwrap(), - ] - ); - } - - #[cfg(not(feature = "chrono-tz"))] - #[test] - fn test_timestamp_with_timezone() { - let timezones = ["+00:00", "+01:00", "-01:00", "+03:30"]; - for timezone in timezones { - test_timestamp_with_timezone_impl::(timezone); - test_timestamp_with_timezone_impl::(timezone); - test_timestamp_with_timezone_impl::(timezone); - test_timestamp_with_timezone_impl::(timezone); - } - } - - #[cfg(feature = "chrono-tz")] - #[test] - fn test_timestamp_with_timezone() { - let timezones = [ - "Europe/Paris", - "Europe/London", - "Africa/Bamako", - "America/Dominica", - "Asia/Seoul", - "Asia/Shanghai", - ]; - for timezone in timezones { - test_timestamp_with_timezone_impl::(timezone); - test_timestamp_with_timezone_impl::(timezone); - test_timestamp_with_timezone_impl::(timezone); - test_timestamp_with_timezone_impl::(timezone); - } - } } diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index 0a313718c907..4d713161a771 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -940,37 +940,6 @@ mod tests { assert!(err.contains("Invalid timezone"), "{}", err); } - #[cfg(feature = "chrono-tz")] - #[test] - fn test_temporal_array_timestamp_hour_with_timezone_using_chrono_tz() { - let a = TimestampSecondArray::from(vec![60 * 60 * 10]) - .with_timezone("Asia/Kolkata".to_string()); 
- let b = hour(&a).unwrap(); - assert_eq!(15, b.value(0)); - } - - #[cfg(feature = "chrono-tz")] - #[test] - fn test_temporal_array_timestamp_hour_with_dst_timezone_using_chrono_tz() { - // - // 1635577147 converts to 2021-10-30 17:59:07 in time zone Australia/Sydney (AEDT) - // The offset (difference to UTC) is +11:00. Note that daylight savings is in effect on 2021-10-30. - // When daylight savings is not in effect, Australia/Sydney has an offset difference of +10:00. - - let a = TimestampMillisecondArray::from(vec![Some(1635577147000)]) - .with_timezone("Australia/Sydney".to_string()); - let b = hour(&a).unwrap(); - assert_eq!(17, b.value(0)); - } - - #[cfg(not(feature = "chrono-tz"))] - #[test] - fn test_temporal_array_timestamp_hour_with_timezone_using_chrono_tz() { - let a = TimestampSecondArray::from(vec![60 * 60 * 10]) - .with_timezone("Asia/Kolkatta".to_string()); - assert!(matches!(hour(&a), Err(ArrowError::ParseError(_)))) - } - #[test] fn test_temporal_array_timestamp_week_without_timezone() { // 1970-01-01T00:00:00 -> 1970-01-01T00:00:00 Thursday (week 1) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 2b502f4a3b61..32f11af541fa 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -300,3 +300,7 @@ required-features = ["chrono-tz", "prettyprint"] [[test]] name = "timezone" required-features = ["chrono-tz"] + +[[test]] +name = "arithmetic" +required-features = ["chrono-tz"] diff --git a/arrow/tests/arithmetic.rs b/arrow/tests/arithmetic.rs new file mode 100644 index 000000000000..982420902cc3 --- /dev/null +++ b/arrow/tests/arithmetic.rs @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_arith::numeric::{add, sub}; +use arrow_arith::temporal::hour; +use arrow_array::cast::AsArray; +use arrow_array::temporal_conversions::as_datetime_with_timezone; +use arrow_array::timezone::Tz; +use arrow_array::types::*; +use arrow_array::*; +use chrono::{DateTime, TimeZone}; + +#[test] +fn test_temporal_array_timestamp_hour_with_timezone_using_chrono_tz() { + let a = TimestampSecondArray::from(vec![60 * 60 * 10]) + .with_timezone("Asia/Kolkata".to_string()); + let b = hour(&a).unwrap(); + assert_eq!(15, b.value(0)); +} + +#[test] +fn test_temporal_array_timestamp_hour_with_dst_timezone_using_chrono_tz() { + // + // 1635577147 converts to 2021-10-30 17:59:07 in time zone Australia/Sydney (AEDT) + // The offset (difference to UTC) is +11:00. Note that daylight savings is in effect on 2021-10-30. + // When daylight savings is not in effect, Australia/Sydney has an offset difference of +10:00. 
+ + let a = TimestampMillisecondArray::from(vec![Some(1635577147000)]) + .with_timezone("Australia/Sydney".to_string()); + let b = hour(&a).unwrap(); + assert_eq!(17, b.value(0)); +} + +fn test_timestamp_with_timezone_impl(tz_str: &str) { + let tz: Tz = tz_str.parse().unwrap(); + + let transform_array = |x: &dyn Array| -> Vec> { + x.as_primitive::() + .values() + .into_iter() + .map(|x| as_datetime_with_timezone::(*x, tz).unwrap()) + .collect() + }; + + let values = vec![ + tz.with_ymd_and_hms(1970, 1, 28, 23, 0, 0) + .unwrap() + .naive_utc(), + tz.with_ymd_and_hms(1970, 1, 1, 0, 0, 0) + .unwrap() + .naive_utc(), + tz.with_ymd_and_hms(2010, 4, 1, 4, 0, 20) + .unwrap() + .naive_utc(), + tz.with_ymd_and_hms(1960, 1, 30, 4, 23, 20) + .unwrap() + .naive_utc(), + tz.with_ymd_and_hms(2023, 3, 25, 14, 0, 0) + .unwrap() + .naive_utc(), + ] + .into_iter() + .map(|x| T::make_value(x).unwrap()) + .collect(); + + let a = PrimitiveArray::::new(values, None).with_timezone(tz_str); + + // IntervalYearMonth + let b = IntervalYearMonthArray::from(vec![ + IntervalYearMonthType::make_value(0, 1), + IntervalYearMonthType::make_value(5, 34), + IntervalYearMonthType::make_value(-2, 4), + IntervalYearMonthType::make_value(7, -4), + IntervalYearMonthType::make_value(0, 1), + ]); + let r1 = add(&a, &b).unwrap(); + assert_eq!( + &transform_array(r1.as_ref()), + &[ + tz.with_ymd_and_hms(1970, 2, 28, 23, 0, 0).unwrap(), + tz.with_ymd_and_hms(1977, 11, 1, 0, 0, 0).unwrap(), + tz.with_ymd_and_hms(2008, 8, 1, 4, 0, 20).unwrap(), + tz.with_ymd_and_hms(1966, 9, 30, 4, 23, 20).unwrap(), + tz.with_ymd_and_hms(2023, 4, 25, 14, 0, 0).unwrap(), + ] + ); + + let r2 = sub(&r1, &b).unwrap(); + assert_eq!(r2.as_ref(), &a); + + // IntervalDayTime + let b = IntervalDayTimeArray::from(vec![ + IntervalDayTimeType::make_value(0, 0), + IntervalDayTimeType::make_value(5, 454000), + IntervalDayTimeType::make_value(-34, 0), + IntervalDayTimeType::make_value(7, -4000), + IntervalDayTimeType::make_value(1, 0), + ]); + let r3 = add(&a, &b).unwrap(); + assert_eq!( + &transform_array(r3.as_ref()), + &[ + tz.with_ymd_and_hms(1970, 1, 28, 23, 0, 0).unwrap(), + tz.with_ymd_and_hms(1970, 1, 6, 0, 7, 34).unwrap(), + tz.with_ymd_and_hms(2010, 2, 26, 4, 0, 20).unwrap(), + tz.with_ymd_and_hms(1960, 2, 6, 4, 23, 16).unwrap(), + tz.with_ymd_and_hms(2023, 3, 26, 14, 0, 0).unwrap(), + ] + ); + + let r4 = sub(&r3, &b).unwrap(); + assert_eq!(r4.as_ref(), &a); + + // IntervalMonthDayNano + let b = IntervalMonthDayNanoArray::from(vec![ + IntervalMonthDayNanoType::make_value(1, 0, 0), + IntervalMonthDayNanoType::make_value(344, 34, -43_000_000_000), + IntervalMonthDayNanoType::make_value(-593, -33, 13_000_000_000), + IntervalMonthDayNanoType::make_value(5, 2, 493_000_000_000), + IntervalMonthDayNanoType::make_value(1, 0, 0), + ]); + let r5 = add(&a, &b).unwrap(); + assert_eq!( + &transform_array(r5.as_ref()), + &[ + tz.with_ymd_and_hms(1970, 2, 28, 23, 0, 0).unwrap(), + tz.with_ymd_and_hms(1998, 10, 4, 23, 59, 17).unwrap(), + tz.with_ymd_and_hms(1960, 9, 29, 4, 0, 33).unwrap(), + tz.with_ymd_and_hms(1960, 7, 2, 4, 31, 33).unwrap(), + tz.with_ymd_and_hms(2023, 4, 25, 14, 0, 0).unwrap(), + ] + ); + + let r6 = sub(&r5, &b).unwrap(); + assert_eq!( + &transform_array(r6.as_ref()), + &[ + tz.with_ymd_and_hms(1970, 1, 28, 23, 0, 0).unwrap(), + tz.with_ymd_and_hms(1970, 1, 2, 0, 0, 0).unwrap(), + tz.with_ymd_and_hms(2010, 4, 2, 4, 0, 20).unwrap(), + tz.with_ymd_and_hms(1960, 1, 31, 4, 23, 20).unwrap(), + tz.with_ymd_and_hms(2023, 3, 25, 14, 0, 0).unwrap(), + ] + ); +} 
+ +#[test] +fn test_timestamp_with_offset_timezone() { + let timezones = ["+00:00", "+01:00", "-01:00", "+03:30"]; + for timezone in timezones { + test_timestamp_with_timezone_impl::(timezone); + test_timestamp_with_timezone_impl::(timezone); + test_timestamp_with_timezone_impl::(timezone); + test_timestamp_with_timezone_impl::(timezone); + } +} + +#[test] +fn test_timestamp_with_timezone() { + let timezones = [ + "Europe/Paris", + "Europe/London", + "Africa/Bamako", + "America/Dominica", + "Asia/Seoul", + "Asia/Shanghai", + ]; + for timezone in timezones { + test_timestamp_with_timezone_impl::(timezone); + test_timestamp_with_timezone_impl::(timezone); + test_timestamp_with_timezone_impl::(timezone); + test_timestamp_with_timezone_impl::(timezone); + } +} From fba19b0142daed54c181cdb8f634f29cf7d37f8d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 27 Jul 2023 02:32:07 -0400 Subject: [PATCH 1092/1411] Cleanup multipart upload trait (#4572) * Cleanup multipart upload trait * Update object_store/src/multipart.rs Co-authored-by: Liang-Chi Hsieh --------- Co-authored-by: Liang-Chi Hsieh --- object_store/src/aws/client.rs | 4 +- object_store/src/aws/mod.rs | 30 ++++--------- object_store/src/azure/mod.rs | 17 +++----- object_store/src/gcp/mod.rs | 77 ++++++++++++++++------------------ object_store/src/multipart.rs | 50 +++++++--------------- 5 files changed, 69 insertions(+), 109 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 971d2c60862e..188897620b91 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -23,7 +23,7 @@ use crate::client::list::ListClient; use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; -use crate::multipart::UploadPart; +use crate::multipart::PartId; use crate::path::DELIMITER; use crate::{ ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, RetryConfig, @@ -479,7 +479,7 @@ impl S3Client { &self, location: &Path, upload_id: &str, - parts: Vec, + parts: Vec, ) -> Result<()> { let parts = parts .into_iter() diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index e74e6f2dfc3e..5a29bd0fc6c7 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -56,7 +56,7 @@ use crate::client::{ TokenCredentialProvider, }; use crate::config::ConfigValue; -use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}; +use crate::multipart::{PartId, PutPart, WriteMultiPart}; use crate::{ ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, RetryConfig, @@ -227,7 +227,7 @@ impl ObjectStore for AmazonS3 { client: Arc::clone(&self.client), }; - Ok((id, Box::new(CloudMultiPartUpload::new(upload, 8)))) + Ok((id, Box::new(WriteMultiPart::new(upload, 8)))) } async fn abort_multipart( @@ -308,12 +308,8 @@ struct S3MultiPartUpload { } #[async_trait] -impl CloudMultiPartUploadImpl for S3MultiPartUpload { - async fn put_multipart_part( - &self, - buf: Vec, - part_idx: usize, - ) -> Result { +impl PutPart for S3MultiPartUpload { + async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { use reqwest::header::ETAG; let part = (part_idx + 1).to_string(); @@ -326,26 +322,16 @@ impl CloudMultiPartUploadImpl for S3MultiPartUpload { ) .await?; - let etag = response - .headers() - .get(ETAG) - .context(MissingEtagSnafu) - .map_err(crate::Error::from)?; + let etag = 
response.headers().get(ETAG).context(MissingEtagSnafu)?; - let etag = etag - .to_str() - .context(BadHeaderSnafu) - .map_err(crate::Error::from)?; + let etag = etag.to_str().context(BadHeaderSnafu)?; - Ok(UploadPart { + Ok(PartId { content_id: etag.to_string(), }) } - async fn complete( - &self, - completed_parts: Vec, - ) -> Result<(), std::io::Error> { + async fn complete(&self, completed_parts: Vec) -> Result<()> { self.client .complete_multipart(&self.location, &self.upload_id, completed_parts) .await?; diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index d2735038321b..8619319a5b25 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -28,7 +28,7 @@ //! after 7 days. use self::client::{BlockId, BlockList}; use crate::{ - multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, + multipart::{PartId, PutPart, WriteMultiPart}, path::Path, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, @@ -42,7 +42,6 @@ use percent_encoding::percent_decode_str; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::fmt::{Debug, Formatter}; -use std::io; use std::str::FromStr; use std::sync::Arc; use tokio::io::AsyncWrite; @@ -186,7 +185,7 @@ impl ObjectStore for MicrosoftAzure { client: Arc::clone(&self.client), location: location.to_owned(), }; - Ok((String::new(), Box::new(CloudMultiPartUpload::new(inner, 8)))) + Ok((String::new(), Box::new(WriteMultiPart::new(inner, 8)))) } async fn abort_multipart( @@ -243,12 +242,8 @@ struct AzureMultiPartUpload { } #[async_trait] -impl CloudMultiPartUploadImpl for AzureMultiPartUpload { - async fn put_multipart_part( - &self, - buf: Vec, - part_idx: usize, - ) -> Result { +impl PutPart for AzureMultiPartUpload { + async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { let content_id = format!("{part_idx:20}"); let block_id: BlockId = content_id.clone().into(); @@ -264,10 +259,10 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload { ) .await?; - Ok(UploadPart { content_id }) + Ok(PartId { content_id }) } - async fn complete(&self, completed_parts: Vec) -> Result<(), io::Error> { + async fn complete(&self, completed_parts: Vec) -> Result<()> { let blocks = completed_parts .into_iter() .map(|part| BlockId::from(part.content_id)) diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index d4d370373d0d..d98e6b068d4f 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -29,7 +29,6 @@ //! to abort the upload and drop those unneeded parts. In addition, you may wish to //! consider implementing automatic clean up of unused parts that are older than one //! week. 
-use std::io; use std::str::FromStr; use std::sync::Arc; @@ -52,7 +51,7 @@ use crate::client::{ TokenCredentialProvider, }; use crate::{ - multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart}, + multipart::{PartId, PutPart, WriteMultiPart}, path::{Path, DELIMITER}, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, @@ -117,6 +116,15 @@ enum Error { #[snafu(display("Error getting put response body: {}", source))] PutResponseBody { source: reqwest::Error }, + #[snafu(display("Got invalid put response: {}", source))] + InvalidPutResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Error performing post request {}: {}", path, source))] + PostRequest { + source: crate::client::retry::Error, + path: String, + }, + #[snafu(display("Error decoding object size: {}", source))] InvalidSize { source: std::num::ParseIntError }, @@ -148,6 +156,12 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, + + #[snafu(display("ETag Header missing from response"))] + MissingEtag, + + #[snafu(display("Received header containing non-ASCII data"))] + BadHeader { source: header::ToStrError }, } impl From for super::Error { @@ -283,14 +297,9 @@ impl GoogleCloudStorageClient { })?; let data = response.bytes().await.context(PutResponseBodySnafu)?; - let result: InitiateMultipartUploadResult = quick_xml::de::from_reader( - data.as_ref().reader(), - ) - .context(InvalidXMLResponseSnafu { - method: "POST".to_string(), - url, - data, - })?; + let result: InitiateMultipartUploadResult = + quick_xml::de::from_reader(data.as_ref().reader()) + .context(InvalidPutResponseSnafu)?; Ok(result.upload_id) } @@ -472,24 +481,16 @@ struct GCSMultipartUpload { } #[async_trait] -impl CloudMultiPartUploadImpl for GCSMultipartUpload { +impl PutPart for GCSMultipartUpload { /// Upload an object part - async fn put_multipart_part( - &self, - buf: Vec, - part_idx: usize, - ) -> Result { + async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { let upload_id = self.multipart_id.clone(); let url = format!( "{}/{}/{}", self.client.base_url, self.client.bucket_name_encoded, self.encoded_path ); - let credential = self - .client - .get_credential() - .await - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + let credential = self.client.get_credential().await?; let response = self .client @@ -504,26 +505,24 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { .header(header::CONTENT_LENGTH, format!("{}", buf.len())) .body(buf) .send_retry(&self.client.retry_config) - .await?; + .await + .context(PutRequestSnafu { + path: &self.encoded_path, + })?; let content_id = response .headers() .get("ETag") - .ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - "response headers missing ETag", - ) - })? + .context(MissingEtagSnafu)? .to_str() - .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))? + .context(BadHeaderSnafu)? 
.to_string(); - Ok(UploadPart { content_id }) + Ok(PartId { content_id }) } /// Complete a multipart upload - async fn complete(&self, completed_parts: Vec) -> Result<(), io::Error> { + async fn complete(&self, completed_parts: Vec) -> Result<()> { let upload_id = self.multipart_id.clone(); let url = format!( "{}/{}/{}", @@ -539,16 +538,11 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { }) .collect(); - let credential = self - .client - .get_credential() - .await - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; - + let credential = self.client.get_credential().await?; let upload_info = CompleteMultipartUpload { parts }; let data = quick_xml::se::to_string(&upload_info) - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))? + .context(InvalidPutResponseSnafu)? // We cannot disable the escaping that transforms "/" to ""e;" :( // https://github.com/tafia/quick-xml/issues/362 // https://github.com/tafia/quick-xml/issues/350 @@ -561,7 +555,10 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload { .query(&[("uploadId", upload_id)]) .body(data) .send_retry(&self.client.retry_config) - .await?; + .await + .context(PostRequestSnafu { + path: &self.encoded_path, + })?; Ok(()) } @@ -588,7 +585,7 @@ impl ObjectStore for GoogleCloudStorage { multipart_id: upload_id.clone(), }; - Ok((upload_id, Box::new(CloudMultiPartUpload::new(inner, 8)))) + Ok((upload_id, Box::new(WriteMultiPart::new(inner, 8)))) } async fn abort_multipart( diff --git a/object_store/src/multipart.rs b/object_store/src/multipart.rs index 5f9b7e67488f..d4c911fceab4 100644 --- a/object_store/src/multipart.rs +++ b/object_store/src/multipart.rs @@ -31,40 +31,33 @@ use crate::Result; type BoxedTryFuture = Pin> + Send>>; /// A trait that can be implemented by cloud-based object stores -/// and used in combination with [`CloudMultiPartUpload`] to provide +/// and used in combination with [`WriteMultiPart`] to provide /// multipart upload support #[async_trait] -pub trait CloudMultiPartUploadImpl: 'static { +pub trait PutPart: Send + Sync + 'static { /// Upload a single part - async fn put_multipart_part( - &self, - buf: Vec, - part_idx: usize, - ) -> Result; + async fn put_part(&self, buf: Vec, part_idx: usize) -> Result; /// Complete the upload with the provided parts /// /// `completed_parts` is in order of part number - async fn complete(&self, completed_parts: Vec) -> Result<(), io::Error>; + async fn complete(&self, completed_parts: Vec) -> Result<()>; } /// Represents a part of a file that has been successfully uploaded in a multipart upload process. #[derive(Debug, Clone)] -pub struct UploadPart { +pub struct PartId { /// Id of this part pub content_id: String, } -/// Struct that manages and controls multipart uploads to a cloud storage service. -pub struct CloudMultiPartUpload -where - T: CloudMultiPartUploadImpl, -{ +/// Wrapper around a [`PutPart`] that implements [`AsyncWrite`] +pub struct WriteMultiPart { inner: Arc, /// A list of completed parts, in sequential order. - completed_parts: Vec>, + completed_parts: Vec>, /// Part upload tasks currently running - tasks: FuturesUnordered>, + tasks: FuturesUnordered>, /// Maximum number of upload tasks to run concurrently max_concurrency: usize, /// Buffer that will be sent in next upload. 
@@ -80,10 +73,7 @@ where completion_task: Option>, } -impl CloudMultiPartUpload -where - T: CloudMultiPartUploadImpl, -{ +impl WriteMultiPart { /// Create a new multipart upload with the implementation and the given maximum concurrency pub fn new(inner: T, max_concurrency: usize) -> Self { Self { @@ -114,7 +104,7 @@ where } /// Poll current tasks - pub fn poll_tasks( + fn poll_tasks( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, ) -> Result<(), io::Error> { @@ -130,12 +120,7 @@ where } Ok(()) } -} -impl CloudMultiPartUpload -where - T: CloudMultiPartUploadImpl + Send + Sync, -{ // The `poll_flush` function will only flush the in-progress tasks. // The `final_flush` method called during `poll_shutdown` will flush // the `current_buffer` along with in-progress tasks. @@ -153,7 +138,7 @@ where let inner = Arc::clone(&self.inner); let part_idx = self.current_part_idx; self.tasks.push(Box::pin(async move { - let upload_part = inner.put_multipart_part(out_buffer, part_idx).await?; + let upload_part = inner.put_part(out_buffer, part_idx).await?; Ok((part_idx, upload_part)) })); } @@ -169,10 +154,7 @@ where } } -impl AsyncWrite for CloudMultiPartUpload -where - T: CloudMultiPartUploadImpl + Send + Sync, -{ +impl AsyncWrite for WriteMultiPart { fn poll_write( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, @@ -199,7 +181,7 @@ where let inner = Arc::clone(&self.inner); let part_idx = self.current_part_idx; self.tasks.push(Box::pin(async move { - let upload_part = inner.put_multipart_part(out_buffer, part_idx).await?; + let upload_part = inner.put_part(out_buffer, part_idx).await?; Ok((part_idx, upload_part)) })); self.current_part_idx += 1; @@ -269,9 +251,9 @@ where } } -impl std::fmt::Debug for CloudMultiPartUpload { +impl std::fmt::Debug for WriteMultiPart { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("CloudMultiPartUpload") + f.debug_struct("WriteMultiPart") .field("completed_parts", &self.completed_parts) .field("tasks", &self.tasks) .field("max_concurrency", &self.max_concurrency) From 8c85d34869e0742b7e9db41a98f0b499f1014830 Mon Sep 17 00:00:00 2001 From: lee <690585471@qq.com> Date: Fri, 28 Jul 2023 18:21:23 +0800 Subject: [PATCH 1093/1411] Write Page Offset Index For All-Nan Pages (#4567) * fix offset index none * add test * add test * Cleanup --------- Co-authored-by: guojie.lgj Co-authored-by: Raphael Taylor-Davies --- parquet/src/arrow/arrow_writer/mod.rs | 21 +++++++++++++++++++++ parquet/src/column/writer/mod.rs | 13 +++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index ccec4ffb20c0..d3d4e2626fe3 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1650,6 +1650,27 @@ mod tests { writer.close().unwrap(); } + #[test] + fn check_page_offset_index_with_nan() { + let values = Arc::new(Float64Array::from(vec![f64::NAN; 10])); + let schema = Schema::new(vec![Field::new("col", DataType::Float64, true)]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap(); + + let mut out = Vec::with_capacity(1024); + let mut writer = ArrowWriter::try_new(&mut out, batch.schema(), None) + .expect("Unable to write file"); + writer.write(&batch).unwrap(); + let file_meta_data = writer.close().unwrap(); + for row_group in file_meta_data.row_groups { + for column in row_group.columns { + assert!(column.offset_index_offset.is_some()); + 
assert!(column.offset_index_length.is_some()); + assert!(column.column_index_offset.is_none()); + assert!(column.column_index_length.is_none()); + } + } + } + #[test] fn i8_single_column() { required_and_optional::(0..SMALL_SIZE as i8); diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 1cacfe793328..3d8ce283ae64 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -500,14 +500,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { let metadata = self.write_column_metadata()?; self.page_writer.close()?; - let (column_index, offset_index) = if self.column_index_builder.valid() { - // build the column and offset index - let column_index = self.column_index_builder.build_to_thrift(); - let offset_index = self.offset_index_builder.build_to_thrift(); - (Some(column_index), Some(offset_index)) - } else { - (None, None) - }; + let column_index = self + .column_index_builder + .valid() + .then(|| self.column_index_builder.build_to_thrift()); + let offset_index = Some(self.offset_index_builder.build_to_thrift()); Ok(ColumnCloseResult { bytes_written: self.column_metrics.total_bytes_written, From 18385e56343c64bbbc76f271c5fbb4f27b5e7e8d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 29 Jul 2023 15:24:53 +0100 Subject: [PATCH 1094/1411] Fix take_bytes Null and Overflow Handling (#4576) (#4579) * Cleanup take_bytes * Use extend * Tweak * Review feedback --- arrow-select/src/take.rs | 85 +++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 48 deletions(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 0f5689ff9990..cee9cbaf84df 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -331,94 +331,70 @@ fn take_bytes( let data_len = indices.len(); let bytes_offset = (data_len + 1) * std::mem::size_of::(); - let mut offsets_buffer = MutableBuffer::from_len_zeroed(bytes_offset); + let mut offsets = MutableBuffer::new(bytes_offset); + offsets.push(T::Offset::default()); - let offsets = offsets_buffer.typed_data_mut(); let mut values = MutableBuffer::new(0); - let mut length_so_far = T::Offset::from_usize(0).unwrap(); - offsets[0] = length_so_far; let nulls; if array.null_count() == 0 && indices.null_count() == 0 { - for (i, offset) in offsets.iter_mut().skip(1).enumerate() { - let index = indices.value(i).to_usize().ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - - let s = array.value(index); - - let s: &[u8] = s.as_ref(); - length_so_far += T::Offset::from_usize(s.len()).unwrap(); + offsets.extend(indices.values().iter().map(|index| { + let s: &[u8] = array.value(index.as_usize()).as_ref(); values.extend_from_slice(s); - *offset = length_so_far; - } + T::Offset::usize_as(values.len()) + })); nulls = None } else if indices.null_count() == 0 { let num_bytes = bit_util::ceil(data_len, 8); let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); let null_slice = null_buf.as_slice_mut(); - - for (i, offset) in offsets.iter_mut().skip(1).enumerate() { - let index = indices.value(i).to_usize().ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - + offsets.extend(indices.values().iter().enumerate().map(|(i, index)| { + let index = index.as_usize(); if array.is_valid(index) { let s: &[u8] = array.value(index).as_ref(); - - length_so_far += T::Offset::from_usize(s.len()).unwrap(); values.extend_from_slice(s.as_ref()); 
} else { bit_util::unset_bit(null_slice, i); } - *offset = length_so_far; - } + T::Offset::usize_as(values.len()) + })); nulls = Some(null_buf.into()); } else if array.null_count() == 0 { - for (i, offset) in offsets.iter_mut().skip(1).enumerate() { + offsets.extend(indices.values().iter().enumerate().map(|(i, index)| { if indices.is_valid(i) { - let index = indices.value(i).to_usize().ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - - let s: &[u8] = array.value(index).as_ref(); - - length_so_far += T::Offset::from_usize(s.len()).unwrap(); + let s: &[u8] = array.value(index.as_usize()).as_ref(); values.extend_from_slice(s); } - *offset = length_so_far; - } + T::Offset::usize_as(values.len()) + })); nulls = indices.nulls().map(|b| b.inner().sliced()); } else { let num_bytes = bit_util::ceil(data_len, 8); let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); let null_slice = null_buf.as_slice_mut(); - - for (i, offset) in offsets.iter_mut().skip(1).enumerate() { - let index = indices.value(i).to_usize().ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - - if array.is_valid(index) && indices.is_valid(i) { + offsets.extend(indices.values().iter().enumerate().map(|(i, index)| { + // check index is valid before using index. The value in + // NULL index slots may not be within bounds of array + let index = index.as_usize(); + if indices.is_valid(i) && array.is_valid(index) { let s: &[u8] = array.value(index).as_ref(); - - length_so_far += T::Offset::from_usize(s.len()).unwrap(); values.extend_from_slice(s); } else { // set null bit bit_util::unset_bit(null_slice, i); } - *offset = length_so_far; - } - + T::Offset::usize_as(values.len()) + })); nulls = Some(null_buf.into()) } + T::Offset::from_usize(values.len()).expect("offset overflow"); + let array_data = ArrayData::builder(T::DATA_TYPE) .len(data_len) - .add_buffer(offsets_buffer.into()) + .add_buffer(offsets.into()) .add_buffer(values.into()) .null_bit_buffer(nulls); @@ -1937,6 +1913,7 @@ mod tests { #[test] fn test_take_null_indices() { + // Build indices with values that are out of bounds, but masked by null mask let indices = Int32Array::new( vec![1, 2, 400, 400].into(), Some(NullBuffer::from(vec![true, true, false, false])), @@ -1949,4 +1926,16 @@ mod tests { .collect::>(); assert_eq!(&values, &[Some(23), Some(4), None, None]) } + + #[test] + fn test_take_bytes_null_indices() { + let indices = Int32Array::new( + vec![0, 1, 400, 400].into(), + Some(NullBuffer::from_iter(vec![true, true, false, false])), + ); + let values = StringArray::from(vec![Some("foo"), None]); + let r = take(&values, &indices, None).unwrap(); + let values = r.as_string::().iter().collect::>(); + assert_eq!(&values, &[Some("foo"), None, None, None]) + } } From a31005605ead4b70bd89fa29bd09d7b1613636dc Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 30 Jul 2023 09:31:15 +0100 Subject: [PATCH 1095/1411] Mark GenericByteArray::new_unchecked unsafe (#4584) --- arrow-array/src/array/byte_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 0a18062d9ae1..be10a4508db1 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -159,7 +159,7 @@ impl GenericByteArray { /// # Safety /// /// Safe if [`Self::try_new`] would not error - pub fn new_unchecked( + pub unsafe fn new_unchecked( 
offsets: OffsetBuffer, values: Buffer, nulls: Option, From 2adb64d113a031432cb4e9e0e37c071ce85ca6d6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 30 Jul 2023 14:14:18 +0100 Subject: [PATCH 1096/1411] Move from_iter_values to GenericByteArray (#4586) --- arrow-array/src/array/binary_array.rs | 37 ------------------------- arrow-array/src/array/byte_array.rs | 35 ++++++++++++++++++++++++ arrow-array/src/array/string_array.rs | 39 +-------------------------- 3 files changed, 36 insertions(+), 75 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 54839604d192..67be3768cc80 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -19,7 +19,6 @@ use crate::types::{ByteArrayType, GenericBinaryType}; use crate::{ Array, GenericByteArray, GenericListArray, GenericStringArray, OffsetSizeTrait, }; -use arrow_buffer::MutableBuffer; use arrow_data::ArrayData; use arrow_schema::DataType; @@ -83,42 +82,6 @@ impl GenericBinaryArray { Self::from(data) } - /// Creates a [`GenericBinaryArray`] based on an iterator of values without nulls - pub fn from_iter_values(iter: I) -> Self - where - Ptr: AsRef<[u8]>, - I: IntoIterator, - { - let iter = iter.into_iter(); - let (_, data_len) = iter.size_hint(); - let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. - - let mut offsets = - MutableBuffer::new((data_len + 1) * std::mem::size_of::()); - let mut values = MutableBuffer::new(0); - - let mut length_so_far = OffsetSize::zero(); - offsets.push(length_so_far); - - for s in iter { - let s = s.as_ref(); - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - offsets.push(length_so_far); - values.extend_from_slice(s); - } - - // iterator size hint may not be correct so compute the actual number of offsets - assert!(!offsets.is_empty()); // wrote at least one - let actual_len = (offsets.len() / std::mem::size_of::()) - 1; - - let array_data = ArrayData::builder(Self::DATA_TYPE) - .len(actual_len) - .add_buffer(offsets.into()) - .add_buffer(values.into()); - let array_data = unsafe { array_data.build_unchecked() }; - Self::from(array_data) - } - /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i` pub fn take_iter<'a>( &'a self, diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index be10a4508db1..f694aa32e507 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -182,6 +182,41 @@ impl GenericByteArray { } } + /// Creates a [`GenericByteArray`] based on an iterator of values without nulls + pub fn from_iter_values(iter: I) -> Self + where + Ptr: AsRef, + I: IntoIterator, + { + let iter = iter.into_iter(); + let (_, data_len) = iter.size_hint(); + let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. 
+ + let mut offsets = + MutableBuffer::new((data_len + 1) * std::mem::size_of::()); + offsets.push(T::Offset::usize_as(0)); + + let mut values = MutableBuffer::new(0); + for s in iter { + let s: &[u8] = s.as_ref().as_ref(); + values.extend_from_slice(s); + offsets.push(T::Offset::usize_as(values.len())); + } + + T::Offset::from_usize(values.len()).expect("offset overflow"); + let offsets = Buffer::from(offsets); + + // Safety: valid by construction + let value_offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + + Self { + data_type: T::DATA_TYPE, + value_data: values.into(), + value_offsets, + nulls: None, + } + } + /// Deconstruct this array into its constituent parts pub fn into_parts(self) -> (OffsetBuffer, Buffer, Option) { (self.value_offsets, self.value_data, self.nulls) diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index f9a3a5fbd095..4c40e8b90ce2 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -17,8 +17,6 @@ use crate::types::GenericStringType; use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait}; -use arrow_buffer::MutableBuffer; -use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; /// A [`GenericByteArray`] for storing `str` @@ -40,42 +38,6 @@ impl GenericStringArray { self.value(i).chars().count() } - /// Creates a [`GenericStringArray`] based on an iterator of values without nulls - pub fn from_iter_values(iter: I) -> Self - where - Ptr: AsRef, - I: IntoIterator, - { - let iter = iter.into_iter(); - let (_, data_len) = iter.size_hint(); - let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. - - let mut offsets = - MutableBuffer::new((data_len + 1) * std::mem::size_of::()); - let mut values = MutableBuffer::new(0); - - let mut length_so_far = OffsetSize::zero(); - offsets.push(length_so_far); - - for i in iter { - let s = i.as_ref(); - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - offsets.push(length_so_far); - values.extend_from_slice(s.as_bytes()); - } - - // iterator size hint may not be correct so compute the actual number of offsets - assert!(!offsets.is_empty()); // wrote at least one - let actual_len = (offsets.len() / std::mem::size_of::()) - 1; - - let array_data = ArrayData::builder(Self::DATA_TYPE) - .len(actual_len) - .add_buffer(offsets.into()) - .add_buffer(values.into()); - let array_data = unsafe { array_data.build_unchecked() }; - Self::from(array_data) - } - /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i` pub fn take_iter<'a>( &'a self, @@ -210,6 +172,7 @@ mod tests { use crate::types::UInt8Type; use crate::Array; use arrow_buffer::Buffer; + use arrow_data::ArrayData; use arrow_schema::Field; use std::sync::Arc; From 1e0f02ffc0619cbdc2cfd184573dc1eecd0f61a2 Mon Sep 17 00:00:00 2001 From: jakevin Date: Sun, 30 Jul 2023 22:11:11 +0800 Subject: [PATCH 1097/1411] refactor: simplify hour_dyn() with time_fraction_dyn() (#4588) --- arrow-arith/src/temporal.rs | 21 +-------------------- arrow-array/src/array/string_array.rs | 2 +- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index 4d713161a771..ef551ceeddb7 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -181,26 +181,7 @@ pub fn using_chrono_tz_and_utc_naive_date_time( /// the range of [0, 23]. 
If the given array isn't temporal primitive or dictionary array, /// an `Err` will be returned. pub fn hour_dyn(array: &dyn Array) -> Result { - match array.data_type().clone() { - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - array => { - let hour_values = hour_dyn(array.values())?; - Ok(Arc::new(array.with_values(&hour_values))) - } - dt => return_compute_error_with!("hour does not support", dt), - ) - } - _ => { - downcast_temporal_array!( - array => { - hour(array) - .map(|a| Arc::new(a) as ArrayRef) - } - dt => return_compute_error_with!("hour does not support", dt), - ) - } - } + time_fraction_dyn(array, "hour", |t| t.hour() as i32) } /// Extracts the hours of a given temporal primitive array as an array of integers within diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 4c40e8b90ce2..9694cd2d4eec 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -154,7 +154,7 @@ pub type StringArray = GenericStringArray; /// let arr: LargeStringArray = std::iter::repeat(Some("foo")).take(10).collect(); /// ``` /// -/// Constructon and Access +/// Construction and Access /// /// ``` /// use arrow_array::LargeStringArray; From fb926a4ff9f84fcfb5c853b6f3cb5d2d11bdf916 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 30 Jul 2023 16:20:19 +0100 Subject: [PATCH 1098/1411] Configurable Duration Display (#4581) * Make FormatOptions const (#4580) * Add non-ISO duration display (#4554) * Review feedback --- arrow-cast/src/cast.rs | 10 ++ arrow-cast/src/display.rs | 260 ++++++++++++++++++++++++++++++++------ 2 files changed, 234 insertions(+), 36 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 2ee8c51b0aa6..e7ca2d0ed4ca 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -8959,4 +8959,14 @@ mod tests { assert_eq!(formatted.value(0).to_string(), "[[1], [2], [3]]"); assert_eq!(formatted.value(1).to_string(), "[[4], [null], [6]]"); } + + const CAST_OPTIONS: CastOptions<'static> = CastOptions { + safe: true, + format_options: FormatOptions::new(), + }; + + #[test] + fn test_const_options() { + assert!(CAST_OPTIONS.safe) + } } diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 07e78f8984f9..b373891ecb2a 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -34,6 +34,16 @@ use lexical_core::FormattedSize; type TimeFormat<'a> = Option<&'a str>; +/// Format for displaying durations +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +#[non_exhaustive] +pub enum DurationFormat { + /// ISO 8601 - `P198DT72932.972880S` + ISO8601, + /// A human readable representation - `198 days 16 hours 34 mins 15.407810000 secs` + Pretty, +} + /// Options for formatting arrays /// /// By default nulls are formatted as `""` and temporal types formatted @@ -56,10 +66,18 @@ pub struct FormatOptions<'a> { timestamp_tz_format: TimeFormat<'a>, /// Time format for time arrays time_format: TimeFormat<'a>, + /// Duration format + duration_format: DurationFormat, } impl<'a> Default for FormatOptions<'a> { fn default() -> Self { + Self::new() + } +} + +impl<'a> FormatOptions<'a> { + pub const fn new() -> Self { Self { safe: true, null: "", @@ -68,14 +86,13 @@ impl<'a> Default for FormatOptions<'a> { timestamp_format: None, timestamp_tz_format: None, time_format: None, + duration_format: DurationFormat::ISO8601, } } -} -impl<'a> FormatOptions<'a> { /// If set to `true` any formatting 
errors will be written to the output /// instead of being converted into a [`std::fmt::Error`] - pub fn with_display_error(mut self, safe: bool) -> Self { + pub const fn with_display_error(mut self, safe: bool) -> Self { self.safe = safe; self } @@ -83,12 +100,12 @@ impl<'a> FormatOptions<'a> { /// Overrides the string used to represent a null /// /// Defaults to `""` - pub fn with_null(self, null: &'a str) -> Self { + pub const fn with_null(self, null: &'a str) -> Self { Self { null, ..self } } /// Overrides the format used for [`DataType::Date32`] columns - pub fn with_date_format(self, date_format: Option<&'a str>) -> Self { + pub const fn with_date_format(self, date_format: Option<&'a str>) -> Self { Self { date_format, ..self @@ -96,7 +113,7 @@ impl<'a> FormatOptions<'a> { } /// Overrides the format used for [`DataType::Date64`] columns - pub fn with_datetime_format(self, datetime_format: Option<&'a str>) -> Self { + pub const fn with_datetime_format(self, datetime_format: Option<&'a str>) -> Self { Self { datetime_format, ..self @@ -104,7 +121,7 @@ impl<'a> FormatOptions<'a> { } /// Overrides the format used for [`DataType::Timestamp`] columns without a timezone - pub fn with_timestamp_format(self, timestamp_format: Option<&'a str>) -> Self { + pub const fn with_timestamp_format(self, timestamp_format: Option<&'a str>) -> Self { Self { timestamp_format, ..self @@ -112,7 +129,10 @@ impl<'a> FormatOptions<'a> { } /// Overrides the format used for [`DataType::Timestamp`] columns with a timezone - pub fn with_timestamp_tz_format(self, timestamp_tz_format: Option<&'a str>) -> Self { + pub const fn with_timestamp_tz_format( + self, + timestamp_tz_format: Option<&'a str>, + ) -> Self { Self { timestamp_tz_format, ..self @@ -120,12 +140,22 @@ impl<'a> FormatOptions<'a> { } /// Overrides the format used for [`DataType::Time32`] and [`DataType::Time64`] columns - pub fn with_time_format(self, time_format: Option<&'a str>) -> Self { + pub const fn with_time_format(self, time_format: Option<&'a str>) -> Self { Self { time_format, ..self } } + + /// Overrides the format used for duration columns + /// + /// Defaults to [`DurationFormat::ISO8601`] + pub const fn with_duration_format(self, duration_format: DurationFormat) -> Self { + Self { + duration_format, + ..self + } + } } /// Implements [`Display`] for a specific array value @@ -534,20 +564,82 @@ temporal_display!(time64us_to_time, time_format, Time64MicrosecondType); temporal_display!(time64ns_to_time, time_format, Time64NanosecondType); macro_rules! 
duration_display { - ($convert:ident, $t:ty) => { - impl<'a> DisplayIndex for &'a PrimitiveArray<$t> { - fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { - write!(f, "{}", $convert(self.value(idx)))?; + ($convert:ident, $t:ty, $scale:tt) => { + impl<'a> DisplayIndexState<'a> for &'a PrimitiveArray<$t> { + type State = DurationFormat; + + fn prepare( + &self, + options: &FormatOptions<'a>, + ) -> Result { + Ok(options.duration_format) + } + + fn write( + &self, + fmt: &Self::State, + idx: usize, + f: &mut dyn Write, + ) -> FormatResult { + let v = self.value(idx); + match fmt { + DurationFormat::ISO8601 => write!(f, "{}", $convert(v))?, + DurationFormat::Pretty => duration_fmt!(f, v, $scale)?, + } Ok(()) } } }; } -duration_display!(duration_s_to_duration, DurationSecondType); -duration_display!(duration_ms_to_duration, DurationMillisecondType); -duration_display!(duration_us_to_duration, DurationMicrosecondType); -duration_display!(duration_ns_to_duration, DurationNanosecondType); +macro_rules! duration_fmt { + ($f:ident, $v:expr, 0) => {{ + let secs = $v; + let mins = secs / 60; + let hours = mins / 60; + let days = hours / 24; + + let secs = secs - (mins * 60); + let mins = mins - (hours * 60); + write!($f, "{days} days {hours} hours {mins} mins {secs} secs") + }}; + ($f:ident, $v:expr, $scale:tt) => {{ + let subsec = $v; + let secs = subsec / 10_i64.pow($scale); + let mins = secs / 60; + let hours = mins / 60; + let days = hours / 24; + + let subsec = subsec - (secs * 10_i64.pow($scale)); + let secs = secs - (mins * 60); + let mins = mins - (hours * 60); + match subsec.is_negative() { + true => { + write!( + $f, + concat!("{} days {} hours {} mins -{}.{:0", $scale, "} secs"), + days, + hours, + mins, + secs.abs(), + subsec.abs() + ) + } + false => { + write!( + $f, + concat!("{} days {} hours {} mins {}.{:0", $scale, "} secs"), + days, hours, mins, secs, subsec + ) + } + } + }}; +} + +duration_display!(duration_s_to_duration, DurationSecondType, 0); +duration_display!(duration_ms_to_duration, DurationMillisecondType, 3); +duration_display!(duration_us_to_duration, DurationMicrosecondType, 6); +duration_display!(duration_ns_to_duration, DurationNanosecondType, 9); impl<'a> DisplayIndex for &'a PrimitiveArray { fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { @@ -866,8 +958,18 @@ pub fn lexical_to_string(n: N) -> String { mod tests { use super::*; + /// Test to verify options can be constant. 
See #4580 + const TEST_CONST_OPTIONS: FormatOptions<'static> = FormatOptions::new() + .with_date_format(Some("foo")) + .with_timestamp_format(Some("404")); + + #[test] + fn test_const_options() { + assert_eq!(TEST_CONST_OPTIONS.date_format, Some("foo")); + } + #[test] - fn test_map_arry_to_string() { + fn test_map_array_to_string() { let keys = vec!["a", "b", "c", "d", "e", "f", "g", "h"]; let values_data = UInt32Array::from(vec![0u32, 10, 20, 30, 40, 50, 60, 70]); @@ -887,25 +989,111 @@ mod tests { ); } + fn format_array(array: &dyn Array, fmt: &FormatOptions) -> Vec { + let fmt = ArrayFormatter::try_new(array, fmt).unwrap(); + (0..array.len()).map(|x| fmt.value(x).to_string()).collect() + } + #[test] fn test_array_value_to_string_duration() { - let ns_array = DurationNanosecondArray::from(vec![Some(1), None]); - assert_eq!( - array_value_to_string(&ns_array, 0).unwrap(), - "PT0.000000001S" - ); - assert_eq!(array_value_to_string(&ns_array, 1).unwrap(), ""); - - let us_array = DurationMicrosecondArray::from(vec![Some(1), None]); - assert_eq!(array_value_to_string(&us_array, 0).unwrap(), "PT0.000001S"); - assert_eq!(array_value_to_string(&us_array, 1).unwrap(), ""); - - let ms_array = DurationMillisecondArray::from(vec![Some(1), None]); - assert_eq!(array_value_to_string(&ms_array, 0).unwrap(), "PT0.001S"); - assert_eq!(array_value_to_string(&ms_array, 1).unwrap(), ""); - - let s_array = DurationSecondArray::from(vec![Some(1), None]); - assert_eq!(array_value_to_string(&s_array, 0).unwrap(), "PT1S"); - assert_eq!(array_value_to_string(&s_array, 1).unwrap(), ""); + let iso_fmt = FormatOptions::new(); + let pretty_fmt = + FormatOptions::new().with_duration_format(DurationFormat::Pretty); + + let array = DurationNanosecondArray::from(vec![ + 1, + -1, + 1000, + -1000, + (45 * 60 * 60 * 24 + 14 * 60 * 60 + 2 * 60 + 34) * 1_000_000_000 + 123456789, + -(45 * 60 * 60 * 24 + 14 * 60 * 60 + 2 * 60 + 34) * 1_000_000_000 - 123456789, + ]); + let iso = format_array(&array, &iso_fmt); + let pretty = format_array(&array, &pretty_fmt); + + assert_eq!(iso[0], "PT0.000000001S"); + assert_eq!(pretty[0], "0 days 0 hours 0 mins 0.000000001 secs"); + assert_eq!(iso[1], "-PT0.000000001S"); + assert_eq!(pretty[1], "0 days 0 hours 0 mins -0.000000001 secs"); + assert_eq!(iso[2], "PT0.000001S"); + assert_eq!(pretty[2], "0 days 0 hours 0 mins 0.000001000 secs"); + assert_eq!(iso[3], "-PT0.000001S"); + assert_eq!(pretty[3], "0 days 0 hours 0 mins -0.000001000 secs"); + assert_eq!(iso[4], "P45DT50554.123456789S"); + assert_eq!(pretty[4], "45 days 1094 hours 2 mins 34.123456789 secs"); + assert_eq!(iso[5], "-P45DT50554.123456789S"); + assert_eq!(pretty[5], "-45 days -1094 hours -2 mins -34.123456789 secs"); + + let array = DurationMicrosecondArray::from(vec![ + 1, + -1, + 1000, + -1000, + (45 * 60 * 60 * 24 + 14 * 60 * 60 + 2 * 60 + 34) * 1_000_000 + 123456, + -(45 * 60 * 60 * 24 + 14 * 60 * 60 + 2 * 60 + 34) * 1_000_000 - 123456, + ]); + let iso = format_array(&array, &iso_fmt); + let pretty = format_array(&array, &pretty_fmt); + + assert_eq!(iso[0], "PT0.000001S"); + assert_eq!(pretty[0], "0 days 0 hours 0 mins 0.000001 secs"); + assert_eq!(iso[1], "-PT0.000001S"); + assert_eq!(pretty[1], "0 days 0 hours 0 mins -0.000001 secs"); + assert_eq!(iso[2], "PT0.001S"); + assert_eq!(pretty[2], "0 days 0 hours 0 mins 0.001000 secs"); + assert_eq!(iso[3], "-PT0.001S"); + assert_eq!(pretty[3], "0 days 0 hours 0 mins -0.001000 secs"); + assert_eq!(iso[4], "P45DT50554.123456S"); + assert_eq!(pretty[4], "45 days 1094 hours 2 mins 
34.123456 secs"); + assert_eq!(iso[5], "-P45DT50554.123456S"); + assert_eq!(pretty[5], "-45 days -1094 hours -2 mins -34.123456 secs"); + + let array = DurationMillisecondArray::from(vec![ + 1, + -1, + 1000, + -1000, + (45 * 60 * 60 * 24 + 14 * 60 * 60 + 2 * 60 + 34) * 1_000 + 123, + -(45 * 60 * 60 * 24 + 14 * 60 * 60 + 2 * 60 + 34) * 1_000 - 123, + ]); + let iso = format_array(&array, &iso_fmt); + let pretty = format_array(&array, &pretty_fmt); + + assert_eq!(iso[0], "PT0.001S"); + assert_eq!(pretty[0], "0 days 0 hours 0 mins 0.001 secs"); + assert_eq!(iso[1], "-PT0.001S"); + assert_eq!(pretty[1], "0 days 0 hours 0 mins -0.001 secs"); + assert_eq!(iso[2], "PT1S"); + assert_eq!(pretty[2], "0 days 0 hours 0 mins 1.000 secs"); + assert_eq!(iso[3], "-PT1S"); + assert_eq!(pretty[3], "0 days 0 hours 0 mins -1.000 secs"); + assert_eq!(iso[4], "P45DT50554.123S"); + assert_eq!(pretty[4], "45 days 1094 hours 2 mins 34.123 secs"); + assert_eq!(iso[5], "-P45DT50554.123S"); + assert_eq!(pretty[5], "-45 days -1094 hours -2 mins -34.123 secs"); + + let array = DurationSecondArray::from(vec![ + 1, + -1, + 1000, + -1000, + 45 * 60 * 60 * 24 + 14 * 60 * 60 + 2 * 60 + 34, + -45 * 60 * 60 * 24 - 14 * 60 * 60 - 2 * 60 - 34, + ]); + let iso = format_array(&array, &iso_fmt); + let pretty = format_array(&array, &pretty_fmt); + + assert_eq!(iso[0], "PT1S"); + assert_eq!(pretty[0], "0 days 0 hours 0 mins 1 secs"); + assert_eq!(iso[1], "-PT1S"); + assert_eq!(pretty[1], "0 days 0 hours 0 mins -1 secs"); + assert_eq!(iso[2], "PT1000S"); + assert_eq!(pretty[2], "0 days 0 hours 16 mins 40 secs"); + assert_eq!(iso[3], "-PT1000S"); + assert_eq!(pretty[3], "0 days 0 hours -16 mins -40 secs"); + assert_eq!(iso[4], "P45DT50554S"); + assert_eq!(pretty[4], "45 days 1094 hours 2 mins 34 secs"); + assert_eq!(iso[5], "-P45DT50554S"); + assert_eq!(pretty[5], "-45 days -1094 hours -2 mins -34 secs"); } } From 2950d8b8fef2fb776905bcb35ac9c10aa5151aca Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 30 Jul 2023 17:15:38 +0100 Subject: [PATCH 1099/1411] Fix pretty hours duration display (#4591) --- arrow-cast/src/display.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index b373891ecb2a..d15d57cf3c05 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -601,6 +601,7 @@ macro_rules! duration_fmt { let secs = secs - (mins * 60); let mins = mins - (hours * 60); + let hours = hours - (days * 24); write!($f, "{days} days {hours} hours {mins} mins {secs} secs") }}; ($f:ident, $v:expr, $scale:tt) => {{ @@ -613,6 +614,7 @@ macro_rules! 
duration_fmt { let subsec = subsec - (secs * 10_i64.pow($scale)); let secs = secs - (mins * 60); let mins = mins - (hours * 60); + let hours = hours - (days * 24); match subsec.is_negative() { true => { write!( @@ -1020,9 +1022,9 @@ mod tests { assert_eq!(iso[3], "-PT0.000001S"); assert_eq!(pretty[3], "0 days 0 hours 0 mins -0.000001000 secs"); assert_eq!(iso[4], "P45DT50554.123456789S"); - assert_eq!(pretty[4], "45 days 1094 hours 2 mins 34.123456789 secs"); + assert_eq!(pretty[4], "45 days 14 hours 2 mins 34.123456789 secs"); assert_eq!(iso[5], "-P45DT50554.123456789S"); - assert_eq!(pretty[5], "-45 days -1094 hours -2 mins -34.123456789 secs"); + assert_eq!(pretty[5], "-45 days -14 hours -2 mins -34.123456789 secs"); let array = DurationMicrosecondArray::from(vec![ 1, @@ -1044,9 +1046,9 @@ mod tests { assert_eq!(iso[3], "-PT0.001S"); assert_eq!(pretty[3], "0 days 0 hours 0 mins -0.001000 secs"); assert_eq!(iso[4], "P45DT50554.123456S"); - assert_eq!(pretty[4], "45 days 1094 hours 2 mins 34.123456 secs"); + assert_eq!(pretty[4], "45 days 14 hours 2 mins 34.123456 secs"); assert_eq!(iso[5], "-P45DT50554.123456S"); - assert_eq!(pretty[5], "-45 days -1094 hours -2 mins -34.123456 secs"); + assert_eq!(pretty[5], "-45 days -14 hours -2 mins -34.123456 secs"); let array = DurationMillisecondArray::from(vec![ 1, @@ -1068,9 +1070,9 @@ mod tests { assert_eq!(iso[3], "-PT1S"); assert_eq!(pretty[3], "0 days 0 hours 0 mins -1.000 secs"); assert_eq!(iso[4], "P45DT50554.123S"); - assert_eq!(pretty[4], "45 days 1094 hours 2 mins 34.123 secs"); + assert_eq!(pretty[4], "45 days 14 hours 2 mins 34.123 secs"); assert_eq!(iso[5], "-P45DT50554.123S"); - assert_eq!(pretty[5], "-45 days -1094 hours -2 mins -34.123 secs"); + assert_eq!(pretty[5], "-45 days -14 hours -2 mins -34.123 secs"); let array = DurationSecondArray::from(vec![ 1, @@ -1092,8 +1094,8 @@ mod tests { assert_eq!(iso[3], "-PT1000S"); assert_eq!(pretty[3], "0 days 0 hours -16 mins -40 secs"); assert_eq!(iso[4], "P45DT50554S"); - assert_eq!(pretty[4], "45 days 1094 hours 2 mins 34 secs"); + assert_eq!(pretty[4], "45 days 14 hours 2 mins 34 secs"); assert_eq!(iso[5], "-P45DT50554S"); - assert_eq!(pretty[5], "-45 days -1094 hours -2 mins -34 secs"); + assert_eq!(pretty[5], "-45 days -14 hours -2 mins -34 secs"); } } From 16744e5ac08d9ead6c51ff6e08d8b91e87460c52 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 30 Jul 2023 17:28:55 +0100 Subject: [PATCH 1100/1411] Prepare arrow 45 (#4590) --- CHANGELOG-old.md | 43 +++++++++++++++++++++ CHANGELOG.md | 66 +++++++++++++++++--------------- Cargo.toml | 32 ++++++++-------- dev/release/update_change_log.sh | 4 +- 4 files changed, 97 insertions(+), 48 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 1d732ce6c022..4d04f9515c44 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,49 @@ # Historical Changelog +## [44.0.0](https://github.com/apache/arrow-rs/tree/44.0.0) (2023-07-14) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/43.0.0...44.0.0) + +**Breaking changes:** + +- Use Parser for cast kernel \(\#4512\) [\#4513](https://github.com/apache/arrow-rs/pull/4513) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add Datum based arithmetic kernels \(\#3999\) [\#4465](https://github.com/apache/arrow-rs/pull/4465) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- 
eq\_dyn\_binary\_scalar should support FixedSizeBinary types [\#4491](https://github.com/apache/arrow-rs/issues/4491) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Port Tests from Deprecated Arithmetic Kernels [\#4480](https://github.com/apache/arrow-rs/issues/4480) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement RecordBatchReader for Boxed trait object [\#4474](https://github.com/apache/arrow-rs/issues/4474) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `Date` - `Date` kernel [\#4383](https://github.com/apache/arrow-rs/issues/4383) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Default FlightSqlService Implementations [\#4372](https://github.com/apache/arrow-rs/issues/4372) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Fixed bugs:** + +- Parquet: `AsyncArrowWriter` to a file corrupts the footer for large columns [\#4526](https://github.com/apache/arrow-rs/issues/4526) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- \[object\_store\] Failure to send bytes to azure [\#4522](https://github.com/apache/arrow-rs/issues/4522) +- Cannot cast string '2021-01-02' to value of Date64 type [\#4512](https://github.com/apache/arrow-rs/issues/4512) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Incorrect Interval Subtraction [\#4489](https://github.com/apache/arrow-rs/issues/4489) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Interval Negation Incorrect [\#4488](https://github.com/apache/arrow-rs/issues/4488) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet: AsyncArrowWriter inner buffer is not correctly limited and causes OOM [\#4477](https://github.com/apache/arrow-rs/issues/4477) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Merged pull requests:** + +- Fix AsyncArrowWriter flush for large buffer sizes \(\#4526\) [\#4527](https://github.com/apache/arrow-rs/pull/4527) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Cleanup cast\_primitive\_to\_list [\#4511](https://github.com/apache/arrow-rs/pull/4511) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Bump actions/upload-pages-artifact from 1 to 2 [\#4508](https://github.com/apache/arrow-rs/pull/4508) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support Date - Date \(\#4383\) [\#4504](https://github.com/apache/arrow-rs/pull/4504) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Bump actions/labeler from 4.2.0 to 4.3.0 [\#4501](https://github.com/apache/arrow-rs/pull/4501) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Update proc-macro2 requirement from =1.0.63 to =1.0.64 [\#4500](https://github.com/apache/arrow-rs/pull/4500) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add negate kernels \(\#4488\) [\#4494](https://github.com/apache/arrow-rs/pull/4494) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add Datum Arithmetic tests, Fix Interval Substraction \(\#4480\) [\#4493](https://github.com/apache/arrow-rs/pull/4493) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) +- support FixedSizeBinary types in eq\_dyn\_binary\_scalar/neq\_dyn\_binary\_scalar [\#4492](https://github.com/apache/arrow-rs/pull/4492) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([maxburke](https://github.com/maxburke)) +- Add default implementations to the FlightSqlService trait [\#4485](https://github.com/apache/arrow-rs/pull/4485) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([rossjones](https://github.com/rossjones)) +- add num-complex requirement [\#4482](https://github.com/apache/arrow-rs/pull/4482) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mwlon](https://github.com/mwlon)) +- fix incorrect buffer size limiting in parquet async writer [\#4478](https://github.com/apache/arrow-rs/pull/4478) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([richox](https://github.com/richox)) +- feat: support RecordBatchReader on boxed trait objects [\#4475](https://github.com/apache/arrow-rs/pull/4475) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Improve in-place primitive sorts by 13-67% [\#4473](https://github.com/apache/arrow-rs/pull/4473) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Add Scalar/Datum abstraction \(\#1047\) [\#4393](https://github.com/apache/arrow-rs/pull/4393) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) ## [43.0.0](https://github.com/apache/arrow-rs/tree/43.0.0) (2023-06-30) [Full Changelog](https://github.com/apache/arrow-rs/compare/42.0.0...43.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index bef7a7c5cf43..6c52c5843459 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,49 +19,55 @@ # Changelog -## [44.0.0](https://github.com/apache/arrow-rs/tree/44.0.0) (2023-07-14) +## [45.0.0](https://github.com/apache/arrow-rs/tree/45.0.0) (2023-07-30) -[Full Changelog](https://github.com/apache/arrow-rs/compare/43.0.0...44.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/44.0.0...45.0.0) **Breaking changes:** -- Use Parser for cast kernel \(\#4512\) [\#4513](https://github.com/apache/arrow-rs/pull/4513) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add Datum based arithmetic kernels \(\#3999\) [\#4465](https://github.com/apache/arrow-rs/pull/4465) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix timezoned timestamp arithmetic [\#4546](https://github.com/apache/arrow-rs/pull/4546) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) **Implemented enhancements:** -- eq\_dyn\_binary\_scalar should support FixedSizeBinary types [\#4491](https://github.com/apache/arrow-rs/issues/4491) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Port Tests from Deprecated Arithmetic Kernels [\#4480](https://github.com/apache/arrow-rs/issues/4480) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement RecordBatchReader for Boxed trait object [\#4474](https://github.com/apache/arrow-rs/issues/4474) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `Date` - `Date` kernel [\#4383](https://github.com/apache/arrow-rs/issues/4383) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Default FlightSqlService Implementations 
[\#4372](https://github.com/apache/arrow-rs/issues/4372) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Use FormatOptions in Const Contexts [\#4580](https://github.com/apache/arrow-rs/issues/4580) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Human Readable Duration Display [\#4554](https://github.com/apache/arrow-rs/issues/4554) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `BooleanBuilder`: Add `validity_slice` method for accessing validity bits [\#4535](https://github.com/apache/arrow-rs/issues/4535) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `FixedSizedListArray` for `length` kernel [\#4517](https://github.com/apache/arrow-rs/issues/4517) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `RowCoverter::convert` that targets an existing `Rows` [\#4479](https://github.com/apache/arrow-rs/issues/4479) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- Parquet: `AsyncArrowWriter` to a file corrupts the footer for large columns [\#4526](https://github.com/apache/arrow-rs/issues/4526) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- \[object\_store\] Failure to send bytes to azure [\#4522](https://github.com/apache/arrow-rs/issues/4522) -- Cannot cast string '2021-01-02' to value of Date64 type [\#4512](https://github.com/apache/arrow-rs/issues/4512) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Incorrect Interval Subtraction [\#4489](https://github.com/apache/arrow-rs/issues/4489) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Interval Negation Incorrect [\#4488](https://github.com/apache/arrow-rs/issues/4488) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet: AsyncArrowWriter inner buffer is not correctly limited and causes OOM [\#4477](https://github.com/apache/arrow-rs/issues/4477) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Panic `assertion failed: idx < self.len` when casting DictionaryArrays with nulls [\#4576](https://github.com/apache/arrow-rs/issues/4576) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow-arith is\_null is buggy with NullArray [\#4565](https://github.com/apache/arrow-rs/issues/4565) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Incorrect Interval to Duration Casting [\#4553](https://github.com/apache/arrow-rs/issues/4553) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Too large validity buffer pre-allocation in `FixedSizeListBuilder::new` [\#4549](https://github.com/apache/arrow-rs/issues/4549) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Like with wildcards fail to match fields with new lines. 
[\#4547](https://github.com/apache/arrow-rs/issues/4547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Timestamp Interval Arithmetic Ignores Timezone [\#4457](https://github.com/apache/arrow-rs/issues/4457) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Fix AsyncArrowWriter flush for large buffer sizes \(\#4526\) [\#4527](https://github.com/apache/arrow-rs/pull/4527) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Cleanup cast\_primitive\_to\_list [\#4511](https://github.com/apache/arrow-rs/pull/4511) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Bump actions/upload-pages-artifact from 1 to 2 [\#4508](https://github.com/apache/arrow-rs/pull/4508) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Support Date - Date \(\#4383\) [\#4504](https://github.com/apache/arrow-rs/pull/4504) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Bump actions/labeler from 4.2.0 to 4.3.0 [\#4501](https://github.com/apache/arrow-rs/pull/4501) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Update proc-macro2 requirement from =1.0.63 to =1.0.64 [\#4500](https://github.com/apache/arrow-rs/pull/4500) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add negate kernels \(\#4488\) [\#4494](https://github.com/apache/arrow-rs/pull/4494) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add Datum Arithmetic tests, Fix Interval Substraction \(\#4480\) [\#4493](https://github.com/apache/arrow-rs/pull/4493) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- support FixedSizeBinary types in eq\_dyn\_binary\_scalar/neq\_dyn\_binary\_scalar [\#4492](https://github.com/apache/arrow-rs/pull/4492) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([maxburke](https://github.com/maxburke)) -- Add default implementations to the FlightSqlService trait [\#4485](https://github.com/apache/arrow-rs/pull/4485) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([rossjones](https://github.com/rossjones)) -- add num-complex requirement [\#4482](https://github.com/apache/arrow-rs/pull/4482) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mwlon](https://github.com/mwlon)) -- fix incorrect buffer size limiting in parquet async writer [\#4478](https://github.com/apache/arrow-rs/pull/4478) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([richox](https://github.com/richox)) -- feat: support RecordBatchReader on boxed trait objects [\#4475](https://github.com/apache/arrow-rs/pull/4475) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) -- Improve in-place primitive sorts by 13-67% [\#4473](https://github.com/apache/arrow-rs/pull/4473) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Add Scalar/Datum abstraction \(\#1047\) [\#4393](https://github.com/apache/arrow-rs/pull/4393) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- refactor: simplify hour\_dyn\(\) with time\_fraction\_dyn\(\) 
[\#4588](https://github.com/apache/arrow-rs/pull/4588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Move from\_iter\_values to GenericByteArray [\#4586](https://github.com/apache/arrow-rs/pull/4586) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Mark GenericByteArray::new\_unchecked unsafe [\#4584](https://github.com/apache/arrow-rs/pull/4584) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Configurable Duration Display [\#4581](https://github.com/apache/arrow-rs/pull/4581) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix take\_bytes Null and Overflow Handling \(\#4576\) [\#4579](https://github.com/apache/arrow-rs/pull/4579) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move chrono-tz arithmetic tests to integration [\#4571](https://github.com/apache/arrow-rs/pull/4571) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Write Page Offset Index For All-Nan Pages [\#4567](https://github.com/apache/arrow-rs/pull/4567) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([MachaelLee](https://github.com/MachaelLee)) +- support NullArray un arith/boolean kernel [\#4566](https://github.com/apache/arrow-rs/pull/4566) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([smiklos](https://github.com/smiklos)) +- Remove Sync from arrow-flight example [\#4564](https://github.com/apache/arrow-rs/pull/4564) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Fix interval to duration casting \(\#4553\) [\#4562](https://github.com/apache/arrow-rs/pull/4562) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- docs: fix wrong parameter name [\#4559](https://github.com/apache/arrow-rs/pull/4559) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([SteveLauC](https://github.com/SteveLauC)) +- Fix FixedSizeListBuilder capacity \(\#4549\) [\#4552](https://github.com/apache/arrow-rs/pull/4552) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- docs: fix wrong inline code snippet in parquet document [\#4550](https://github.com/apache/arrow-rs/pull/4550) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([SteveLauC](https://github.com/SteveLauC)) +- fix multiline wildcard likes \(fixes \#4547\) [\#4548](https://github.com/apache/arrow-rs/pull/4548) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([nl5887](https://github.com/nl5887)) +- Provide default `is_empty` impl for `arrow::array::ArrayBuilder` [\#4543](https://github.com/apache/arrow-rs/pull/4543) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- Add RowConverter::append \(\#4479\) [\#4541](https://github.com/apache/arrow-rs/pull/4541) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Clarify GenericColumnReader::read\_records [\#4540](https://github.com/apache/arrow-rs/pull/4540) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Initial loongarch port 
[\#4538](https://github.com/apache/arrow-rs/pull/4538) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xiangzhai](https://github.com/xiangzhai)) +- Update proc-macro2 requirement from =1.0.64 to =1.0.66 [\#4537](https://github.com/apache/arrow-rs/pull/4537) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- add a validity slice access for boolean array builders [\#4536](https://github.com/apache/arrow-rs/pull/4536) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ChristianBeilschmidt](https://github.com/ChristianBeilschmidt)) +- use new num version instead of explicit num-complex dependency [\#4532](https://github.com/apache/arrow-rs/pull/4532) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mwlon](https://github.com/mwlon)) +- feat: Support `FixedSizedListArray` for `length` kernel [\#4520](https://github.com/apache/arrow-rs/pull/4520) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) diff --git a/Cargo.toml b/Cargo.toml index ea21b97c7058..ea64c1250747 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ exclude = [ ] [workspace.package] -version = "44.0.0" +version = "45.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -76,18 +76,18 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "44.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "44.0.0", path = "./arrow-arith" } -arrow-array = { version = "44.0.0", path = "./arrow-array" } -arrow-buffer = { version = "44.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "44.0.0", path = "./arrow-cast" } -arrow-csv = { version = "44.0.0", path = "./arrow-csv" } -arrow-data = { version = "44.0.0", path = "./arrow-data" } -arrow-ipc = { version = "44.0.0", path = "./arrow-ipc" } -arrow-json = { version = "44.0.0", path = "./arrow-json" } -arrow-ord = { version = "44.0.0", path = "./arrow-ord" } -arrow-row = { version = "44.0.0", path = "./arrow-row" } -arrow-schema = { version = "44.0.0", path = "./arrow-schema" } -arrow-select = { version = "44.0.0", path = "./arrow-select" } -arrow-string = { version = "44.0.0", path = "./arrow-string" } -parquet = { version = "44.0.0", path = "./parquet", default-features = false } +arrow = { version = "45.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "45.0.0", path = "./arrow-arith" } +arrow-array = { version = "45.0.0", path = "./arrow-array" } +arrow-buffer = { version = "45.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "45.0.0", path = "./arrow-cast" } +arrow-csv = { version = "45.0.0", path = "./arrow-csv" } +arrow-data = { version = "45.0.0", path = "./arrow-data" } +arrow-ipc = { version = "45.0.0", path = "./arrow-ipc" } +arrow-json = { version = "45.0.0", path = "./arrow-json" } +arrow-ord = { version = "45.0.0", path = "./arrow-ord" } +arrow-row = { version = "45.0.0", path = "./arrow-row" } +arrow-schema = { version = "45.0.0", path = "./arrow-schema" } +arrow-select = { version = "45.0.0", path = "./arrow-select" } +arrow-string = { version = "45.0.0", path = "./arrow-string" } +parquet = { version = "45.0.0", path = "./parquet", default-features = false } diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 6a0fee19b1ef..89ef6ebc111f 
100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="43.0.0" -FUTURE_RELEASE="44.0.0" +SINCE_TAG="44.0.0" +FUTURE_RELEASE="45.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 95683439fa4108c036e48b334f8bed898b87a9b9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 31 Jul 2023 09:15:01 +0100 Subject: [PATCH 1101/1411] Remove deprecated limit kernel (#4597) --- .../compute/{kernels/mod.rs => kernels.rs} | 2 - arrow/src/compute/kernels/limit.rs | 208 ------------------ arrow/src/compute/mod.rs | 1 - arrow/src/lib.rs | 2 +- 4 files changed, 1 insertion(+), 212 deletions(-) rename arrow/src/compute/{kernels/mod.rs => kernels.rs} (98%) delete mode 100644 arrow/src/compute/kernels/limit.rs diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels.rs similarity index 98% rename from arrow/src/compute/kernels/mod.rs rename to arrow/src/compute/kernels.rs index 49eae6d3ade5..1a79aef547d3 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels.rs @@ -17,8 +17,6 @@ //! Computation kernels on Arrow Arrays -pub mod limit; - pub use arrow_arith::{ aggregate, arithmetic, arity, bitwise, boolean, numeric, temporal, }; diff --git a/arrow/src/compute/kernels/limit.rs b/arrow/src/compute/kernels/limit.rs deleted file mode 100644 index 097b8e949443..000000000000 --- a/arrow/src/compute/kernels/limit.rs +++ /dev/null @@ -1,208 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines miscellaneous array kernels. 
- -use crate::array::ArrayRef; - -/// Returns the array, taking only the number of elements specified -/// -/// Limit performs a zero-copy slice of the array, and is a convenience method on slice -/// where: -/// * it performs a bounds-check on the array -/// * it slices from offset 0 -#[deprecated(note = "Use Array::slice")] -pub fn limit(array: &ArrayRef, num_elements: usize) -> ArrayRef { - let lim = num_elements.min(array.len()); - array.slice(0, lim) -} - -#[cfg(test)] -#[allow(deprecated)] -mod tests { - use super::*; - use crate::array::*; - use crate::buffer::Buffer; - use crate::datatypes::{DataType, Field}; - use crate::util::bit_util; - - use std::sync::Arc; - - #[test] - fn test_limit_array() { - let a: ArrayRef = Arc::new(Int32Array::from(vec![5, 6, 7, 8, 9])); - let b = limit(&a, 3); - let c = b.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(3, c.len()); - assert_eq!(5, c.value(0)); - assert_eq!(6, c.value(1)); - assert_eq!(7, c.value(2)); - } - - #[test] - fn test_limit_string_array() { - let a: ArrayRef = Arc::new(StringArray::from(vec!["hello", " ", "world", "!"])); - let b = limit(&a, 2); - let c = b.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(2, c.len()); - assert_eq!("hello", c.value(0)); - assert_eq!(" ", c.value(1)); - } - - #[test] - fn test_limit_array_with_null() { - let a: ArrayRef = Arc::new(Int32Array::from(vec![None, Some(5)])); - let b = limit(&a, 1); - let c = b.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(1, c.len()); - assert!(c.is_null(0)); - } - - #[test] - fn test_limit_array_with_limit_too_large() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let a_ref: ArrayRef = Arc::new(a); - let b = limit(&a_ref, 6); - let c = b.as_ref().as_any().downcast_ref::().unwrap(); - - assert_eq!(5, c.len()); - assert_eq!(5, c.value(0)); - assert_eq!(6, c.value(1)); - assert_eq!(7, c.value(2)); - assert_eq!(8, c.value(3)); - assert_eq!(9, c.value(4)); - } - - #[test] - fn test_list_array_limit() { - // adapted from crate::array::test::test_list_array_slice - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(10) - .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build() - .unwrap(); - - // Construct a buffer for value offsets, for the nested array: - // [[0, 1], null, [2, 3], null, [4, 5], null, [6, 7, 8], null, [9]] - let value_offsets = Buffer::from_slice_ref([0, 2, 2, 4, 4, 6, 6, 9, 9, 10]); - // 01010101 00000001 - let mut null_bits: [u8; 2] = [0; 2]; - bit_util::set_bit(&mut null_bits, 0); - bit_util::set_bit(&mut null_bits, 2); - bit_util::set_bit(&mut null_bits, 4); - bit_util::set_bit(&mut null_bits, 6); - bit_util::set_bit(&mut null_bits, 8); - - // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .len(9) - .add_buffer(value_offsets) - .add_child_data(value_data) - .null_bit_buffer(Some(Buffer::from(null_bits))) - .build() - .unwrap(); - let list_array: ArrayRef = Arc::new(ListArray::from(list_data)); - - let limit_array = limit(&list_array, 6); - assert_eq!(6, limit_array.len()); - assert_eq!(0, limit_array.offset()); - assert_eq!(3, limit_array.null_count()); - - // Check offset and length for each non-null value. 
- let limit_array: &ListArray = - limit_array.as_any().downcast_ref::().unwrap(); - - for i in 0..limit_array.len() { - let offset = limit_array.value_offsets()[i]; - let length = limit_array.value_length(i); - if i % 2 == 0 { - assert_eq!(2, length); - assert_eq!(i as i32, offset); - } else { - assert_eq!(0, length); - } - } - } - - #[test] - fn test_struct_array_limit() { - // adapted from crate::array::test::test_struct_array_slice - let boolean_data = ArrayData::builder(DataType::Boolean) - .len(5) - .add_buffer(Buffer::from([0b00010000])) - .null_bit_buffer(Some(Buffer::from([0b00010001]))) - .build() - .unwrap(); - let int_data = ArrayData::builder(DataType::Int32) - .len(5) - .add_buffer(Buffer::from_slice_ref([0, 28, 42, 0, 0])) - .null_bit_buffer(Some(Buffer::from([0b00000110]))) - .build() - .unwrap(); - - let field_types = vec![ - Field::new("a", DataType::Boolean, true), - Field::new("b", DataType::Int32, true), - ]; - let struct_array_data = ArrayData::builder(DataType::Struct(field_types.into())) - .len(5) - .add_child_data(boolean_data.clone()) - .add_child_data(int_data.clone()) - .null_bit_buffer(Some(Buffer::from([0b00010111]))) - .build() - .unwrap(); - let struct_array = StructArray::from(struct_array_data); - - assert_eq!(5, struct_array.len()); - assert_eq!(1, struct_array.null_count()); - assert_eq!(boolean_data, struct_array.column(0).to_data()); - assert_eq!(int_data, struct_array.column(1).to_data()); - - let array: ArrayRef = Arc::new(struct_array); - - let sliced_array = limit(&array, 3); - let sliced_array = sliced_array.as_any().downcast_ref::().unwrap(); - assert_eq!(3, sliced_array.len()); - assert_eq!(0, sliced_array.offset()); - assert_eq!(0, sliced_array.null_count()); - assert!(sliced_array.is_valid(0)); - assert!(sliced_array.is_valid(1)); - assert!(sliced_array.is_valid(2)); - - let sliced_c0 = sliced_array.column(0); - let sliced_c0 = sliced_c0.as_any().downcast_ref::().unwrap(); - assert_eq!(3, sliced_c0.len()); - assert_eq!(0, sliced_c0.offset()); - assert_eq!(2, sliced_c0.null_count()); - assert!(sliced_c0.is_valid(0)); - assert!(sliced_c0.is_null(1)); - assert!(sliced_c0.is_null(2)); - assert!(!sliced_c0.value(0)); - - let sliced_c1 = sliced_array.column(1); - let sliced_c1 = sliced_c1.as_any().downcast_ref::().unwrap(); - assert_eq!(3, sliced_c1.len()); - assert_eq!(0, sliced_c1.offset()); - assert_eq!(1, sliced_c1.null_count()); - assert!(sliced_c1.is_null(0)); - assert_eq!(28, sliced_c1.value(1)); - assert_eq!(42, sliced_c1.value(2)); - } -} diff --git a/arrow/src/compute/mod.rs b/arrow/src/compute/mod.rs index c9fd525e85a4..7cfe787b08cf 100644 --- a/arrow/src/compute/mod.rs +++ b/arrow/src/compute/mod.rs @@ -28,7 +28,6 @@ pub use self::kernels::comparison::*; pub use self::kernels::concat::*; pub use self::kernels::filter::*; pub use self::kernels::interleave::*; -pub use self::kernels::limit::*; pub use self::kernels::nullif::*; pub use self::kernels::partition::*; pub use self::kernels::regexp::*; diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index bf39bae530b9..96cc98177a9a 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -188,7 +188,7 @@ //! * All boolean binary operators such as [`equality`](compute::kernels::comparison::eq) //! * [`cast`](compute::kernels::cast::cast) //! * [`filter`](compute::kernels::filter::filter) -//! * [`take`](compute::kernels::take::take) and [`limit`](compute::kernels::limit::limit) +//! * [`take`](compute::kernels::take::take) //! * [`sort`](compute::kernels::sort::sort) //! 
* some string operators such as [`substring`](compute::kernels::substring::substring) and [`length`](compute::kernels::length::length) //! From d1fb2c6a0a07d3e28fb41287c0b5e1dc8fc032e5 Mon Sep 17 00:00:00 2001 From: Tomoaki Kawada Date: Mon, 31 Jul 2023 22:53:14 +0900 Subject: [PATCH 1102/1411] fix(data): create child arrays of correct length when building a sparse union null array (#4601) * test: validate built arrays in `test_null_union` * fix(data): create child arrays of correct length when building a sparse union null array --- arrow-array/src/array/mod.rs | 2 ++ arrow-data/src/data/mod.rs | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 9312770644a3..0157279dfe49 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -841,6 +841,8 @@ mod tests { assert_eq!(a.null_count(), 1); assert!(a.is_null(0)) } + + array.to_data().validate_full().unwrap(); } } diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs index 32aae1e92a51..50643b90e881 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data/mod.rs @@ -634,9 +634,12 @@ impl ArrayData { let children = f .iter() .enumerate() - .map(|(idx, (_, f))| match idx { - 0 => Self::new_null(f.data_type(), len), - _ => Self::new_empty(f.data_type()), + .map(|(idx, (_, f))| { + if idx == 0 || *mode == UnionMode::Sparse { + Self::new_null(f.data_type(), len) + } else { + Self::new_empty(f.data_type()) + } }) .collect(); From b597a206a59dd10deeff9fa53097b6d14cf169e2 Mon Sep 17 00:00:00 2001 From: Yuyi Wang Date: Mon, 31 Jul 2023 21:53:28 +0800 Subject: [PATCH 1103/1411] Use u32 metadata_len when parsing footer of parquet. (#4599) * Use u32 metadata_len. * Remove a useless test. * Fix footer metadata_len type. --- parquet/src/file/footer.rs | 20 +++----------------- parquet/src/file/writer.rs | 2 +- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index fcd6a300c5fb..f4fb2534c220 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -103,13 +103,9 @@ pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result { } // get the metadata length from the footer - let metadata_len = i32::from_le_bytes(slice[..4].try_into().unwrap()); - metadata_len.try_into().map_err(|_| { - general_err!( - "Invalid Parquet file. Metadata length is less than zero ({})", - metadata_len - ) - }) + let metadata_len = u32::from_le_bytes(slice[..4].try_into().unwrap()); + // u32 won't be larger than usize in most cases + Ok(metadata_len as usize) } /// Parses column orders from Thrift definition. @@ -175,16 +171,6 @@ mod tests { ); } - #[test] - fn test_parse_metadata_invalid_length() { - let test_file = Bytes::from(vec![0, 0, 0, 255, b'P', b'A', b'R', b'1']); - let reader_result = parse_metadata(&test_file); - assert_eq!( - reader_result.unwrap_err().to_string(), - "Parquet error: Invalid Parquet file. 
Metadata length is less than zero (-16777216)" - ); - } - #[test] fn test_parse_metadata_invalid_start() { let test_file = Bytes::from(vec![255, 0, 0, 0, b'P', b'A', b'R', b'1']); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index bde350a1ea42..12da085ed2b7 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -347,7 +347,7 @@ impl SerializedFileWriter { let end_pos = self.buf.bytes_written(); // Write footer - let metadata_len = (end_pos - start_pos) as i32; + let metadata_len = (end_pos - start_pos) as u32; self.buf.write_all(&metadata_len.to_le_bytes())?; self.buf.write_all(&PARQUET_MAGIC)?; From c663d88327dfd6958b102ec7b1ca310cc20b40c4 Mon Sep 17 00:00:00 2001 From: Tomoaki Kawada Date: Mon, 31 Jul 2023 22:53:41 +0900 Subject: [PATCH 1104/1411] fix(data): map type ID to child index before indexing a union child array (#4598) * test: add a test for `MutableArrayData` and dense union * fix(data): map type ID to child index before indexing union child array --- arrow-data/src/transform/union.rs | 13 +++++-- arrow/tests/array_transform.rs | 59 ++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/arrow-data/src/transform/union.rs b/arrow-data/src/transform/union.rs index 8d1ea34c314d..d7083588d782 100644 --- a/arrow-data/src/transform/union.rs +++ b/arrow-data/src/transform/union.rs @@ -39,6 +39,9 @@ pub(super) fn build_extend_sparse(array: &ArrayData) -> Extend { pub(super) fn build_extend_dense(array: &ArrayData) -> Extend { let type_ids = array.buffer::(0); let offsets = array.buffer::(1); + let arrow_schema::DataType::Union(src_fields, _) = array.data_type() else { + unreachable!(); + }; Box::new( move |mutable: &mut _MutableArrayData, index: usize, start: usize, len: usize| { @@ -48,14 +51,18 @@ pub(super) fn build_extend_dense(array: &ArrayData) -> Extend { .extend_from_slice(&type_ids[start..start + len]); (start..start + len).for_each(|i| { - let type_id = type_ids[i] as usize; + let type_id = type_ids[i]; + let child_index = src_fields + .iter() + .position(|(r, _)| r == type_id) + .expect("invalid union type ID"); let src_offset = offsets[i] as usize; - let child_data = &mut mutable.child_data[type_id]; + let child_data = &mut mutable.child_data[child_index]; let dst_offset = child_data.len(); // Extend offsets mutable.buffer2.push(dst_offset as i32); - mutable.child_data[type_id].extend(index, src_offset, src_offset + 1) + mutable.child_data[child_index].extend(index, src_offset, src_offset + 1) }) }, ) diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index ebbadc00aecd..15141eb208e4 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -19,7 +19,7 @@ use arrow::array::{ Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, FixedSizeBinaryArray, Int16Array, Int32Array, Int64Array, Int64Builder, ListArray, ListBuilder, MapBuilder, NullArray, StringArray, StringBuilder, - StringDictionaryBuilder, StructArray, UInt8Array, + StringDictionaryBuilder, StructArray, UInt8Array, UnionArray, }; use arrow::datatypes::Int16Type; use arrow_buffer::Buffer; @@ -488,6 +488,63 @@ fn test_struct_many() { assert_eq!(array, expected) } +#[test] +fn test_union_dense() { + // Input data + let strings: ArrayRef = Arc::new(StringArray::from(vec![ + Some("joe"), + Some("mark"), + Some("doe"), + ])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + ])); + let offsets = 
Buffer::from_slice_ref([0, 0, 1, 1, 2, 2, 3, 4i32]); + let type_ids = Buffer::from_slice_ref([42, 84, 42, 84, 84, 42, 84, 84i8]); + + let array = UnionArray::try_new( + &[84, 42], + type_ids, + Some(offsets), + vec![ + (Field::new("int", DataType::Int32, false), ints), + (Field::new("string", DataType::Utf8, false), strings), + ], + ) + .unwrap() + .into_data(); + let arrays = vec![&array]; + let mut mutable = MutableArrayData::new(arrays, false, 0); + + // Slice it by `MutableArrayData` + mutable.extend(0, 4, 7); + let data = mutable.freeze(); + let array = UnionArray::from(data); + + // Expected data + let strings: ArrayRef = Arc::new(StringArray::from(vec![Some("doe")])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(3), Some(4)])); + let offsets = Buffer::from_slice_ref([0, 0, 1i32]); + let type_ids = Buffer::from_slice_ref([84, 42, 84i8]); + + let expected = UnionArray::try_new( + &[84, 42], + type_ids, + Some(offsets), + vec![ + (Field::new("int", DataType::Int32, false), ints), + (Field::new("string", DataType::Utf8, false), strings), + ], + ) + .unwrap(); + + assert_eq!(array.to_data(), expected.to_data()); +} + #[test] fn test_binary_fixed_sized_offsets() { let array = FixedSizeBinaryArray::try_from_iter( From a5519d6ac273e4cc4fdcd85a9e424676130795b7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 1 Aug 2023 08:54:18 +0100 Subject: [PATCH 1105/1411] Consolidate sort benchmarks (#4604) --- arrow/Cargo.toml | 5 - arrow/benches/sort_kernel.rs | 209 ++++++++++++++---------- arrow/benches/sort_kernel_primitives.rs | 59 ------- 3 files changed, 121 insertions(+), 152 deletions(-) delete mode 100644 arrow/benches/sort_kernel_primitives.rs diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 32f11af541fa..bcf6a84311d5 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -185,11 +185,6 @@ name = "sort_kernel" harness = false required-features = ["test_utils"] -[[bench]] -name = "sort_kernel_primitives" -harness = false -required-features = ["test_utils"] - [[bench]] name = "partition_kernels" harness = false diff --git a/arrow/benches/sort_kernel.rs b/arrow/benches/sort_kernel.rs index 43a9a84d9a74..8762d9eb2f5f 100644 --- a/arrow/benches/sort_kernel.rs +++ b/arrow/benches/sort_kernel.rs @@ -23,8 +23,7 @@ use std::sync::Arc; extern crate arrow; -use arrow::compute::kernels::sort::{lexsort, SortColumn}; -use arrow::compute::{sort_limit, sort_to_indices}; +use arrow::compute::{lexsort, sort, sort_to_indices, SortColumn}; use arrow::datatypes::{Int16Type, Int32Type}; use arrow::util::bench_util::*; use arrow::{array::*, datatypes::Float32Type}; @@ -42,7 +41,11 @@ fn create_bool_array(size: usize, with_nulls: bool) -> ArrayRef { Arc::new(array) } -fn bench_sort(array_a: &ArrayRef, array_b: &ArrayRef, limit: Option) { +fn bench_sort(array: &dyn Array) { + criterion::black_box(sort(array, None).unwrap()); +} + +fn bench_lexsort(array_a: &ArrayRef, array_b: &ArrayRef, limit: Option) { let columns = vec![ SortColumn { values: array_a.clone(), @@ -57,115 +60,145 @@ fn bench_sort(array_a: &ArrayRef, array_b: &ArrayRef, limit: Option) { criterion::black_box(lexsort(&columns, limit).unwrap()); } -fn bench_sort_to_indices(array: &ArrayRef, limit: Option) { +fn bench_sort_to_indices(array: &dyn Array, limit: Option) { criterion::black_box(sort_to_indices(array, None, limit).unwrap()); } -fn bench_sort_run(array: &ArrayRef, limit: Option) { - criterion::black_box(sort_limit(array, None, limit).unwrap()); -} - fn 
add_benchmark(c: &mut Criterion) { - let arr_a = create_f32_array(2u64.pow(10) as usize, false); - let arr_b = create_f32_array(2u64.pow(10) as usize, false); - - c.bench_function("sort 2^10", |b| b.iter(|| bench_sort(&arr_a, &arr_b, None))); + let arr = create_primitive_array::(2usize.pow(10), 0.0); + c.bench_function("sort i64 2^10", |b| b.iter(|| bench_sort(&arr))); - let arr_a = create_f32_array(2u64.pow(12) as usize, false); - let arr_b = create_f32_array(2u64.pow(12) as usize, false); + let arr = create_primitive_array::(2usize.pow(12), 0.5); + c.bench_function("sort i64 2^12", |b| b.iter(|| bench_sort(&arr))); - c.bench_function("sort 2^12", |b| b.iter(|| bench_sort(&arr_a, &arr_b, None))); + let arr = create_primitive_array::(2usize.pow(12), 0.0); + c.bench_function("sort i64 nulls 2^10", |b| b.iter(|| bench_sort(&arr))); - let arr_a = create_f32_array(2u64.pow(10) as usize, true); - let arr_b = create_f32_array(2u64.pow(10) as usize, true); + let arr = create_primitive_array::(2usize.pow(12), 0.5); + c.bench_function("sort i64 nulls 2^12", |b| b.iter(|| bench_sort(&arr))); - c.bench_function("sort nulls 2^10", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, None)) + let arr = create_f32_array(2_usize.pow(12), false); + c.bench_function("sort f32 to indices 2^12", |b| { + b.iter(|| bench_sort_to_indices(&arr, None)) }); - let arr_a = create_f32_array(2u64.pow(12) as usize, true); - let arr_b = create_f32_array(2u64.pow(12) as usize, true); + let arr = create_f32_array(2usize.pow(12), true); + c.bench_function("sort f32 nulls to indices 2^12", |b| { + b.iter(|| bench_sort_to_indices(&arr, None)) + }); - c.bench_function("sort nulls 2^12", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, None)) + let arr = create_string_array_with_len::(2usize.pow(12), 0.0, 10); + c.bench_function("sort string[10] to indices 2^12", |b| { + b.iter(|| bench_sort_to_indices(&arr, None)) }); - let arr_a = create_bool_array(2u64.pow(12) as usize, false); - let arr_b = create_bool_array(2u64.pow(12) as usize, false); - c.bench_function("bool sort 2^12", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, None)) + let arr = create_string_array_with_len::(2usize.pow(12), 0.5, 10); + c.bench_function("sort string[10] nulls to indices 2^12", |b| { + b.iter(|| bench_sort_to_indices(&arr, None)) }); - let arr_a = create_bool_array(2u64.pow(12) as usize, true); - let arr_b = create_bool_array(2u64.pow(12) as usize, true); - c.bench_function("bool sort nulls 2^12", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, None)) + let arr = create_string_dict_array::(2usize.pow(12), 0.0, 10); + c.bench_function("sort string[10] dict to indices 2^12", |b| { + b.iter(|| bench_sort_to_indices(&arr, None)) }); - let dict_arr = Arc::new(create_string_dict_array::( - 2u64.pow(12) as usize, - 0.0, - 1, - )) as ArrayRef; - c.bench_function("dict string 2^12", |b| { - b.iter(|| bench_sort_to_indices(&dict_arr, None)) + let arr = create_string_dict_array::(2usize.pow(12), 0.5, 10); + c.bench_function("sort string[10] dict nulls to indices 2^12", |b| { + b.iter(|| bench_sort_to_indices(&arr, None)) }); - let run_encoded_array = Arc::new(create_primitive_run_array::( - 2u64.pow(12) as usize, - 2u64.pow(10) as usize, - )) as ArrayRef; + let run_encoded_array = create_primitive_run_array::( + 2usize.pow(12), + 2usize.pow(10), + ); + + c.bench_function("sort primitive run 2^12", |b| { + b.iter(|| bench_sort(&run_encoded_array)) + }); c.bench_function("sort primitive run to indices 2^12", |b| { b.iter(|| bench_sort_to_indices(&run_encoded_array, None)) }); 
- c.bench_function("sort primitive run to run 2^12", |b| { - b.iter(|| bench_sort_run(&run_encoded_array, None)) - }); - - // with limit - { - let arr_a = create_f32_array(2u64.pow(12) as usize, false); - let arr_b = create_f32_array(2u64.pow(12) as usize, false); - c.bench_function("sort 2^12 limit 10", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(10))) - }); - - let arr_a = create_f32_array(2u64.pow(12) as usize, false); - let arr_b = create_f32_array(2u64.pow(12) as usize, false); - c.bench_function("sort 2^12 limit 100", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(100))) - }); - - let arr_a = create_f32_array(2u64.pow(12) as usize, false); - let arr_b = create_f32_array(2u64.pow(12) as usize, false); - c.bench_function("sort 2^12 limit 1000", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(1000))) - }); - - let arr_a = create_f32_array(2u64.pow(12) as usize, false); - let arr_b = create_f32_array(2u64.pow(12) as usize, false); - c.bench_function("sort 2^12 limit 2^12", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(2u64.pow(12) as usize))) - }); - - let arr_a = create_f32_array(2u64.pow(12) as usize, true); - let arr_b = create_f32_array(2u64.pow(12) as usize, true); - - c.bench_function("sort nulls 2^12 limit 10", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(10))) - }); - c.bench_function("sort nulls 2^12 limit 100", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(100))) - }); - c.bench_function("sort nulls 2^12 limit 1000", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(1000))) - }); - c.bench_function("sort nulls 2^12 limit 2^12", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(2u64.pow(12) as usize))) - }); - } + let arr_a = create_f32_array(2usize.pow(10), false); + let arr_b = create_f32_array(2usize.pow(10), false); + + c.bench_function("lexsort (f32, f32) 2^10", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, None)) + }); + + let arr_a = create_f32_array(2usize.pow(12), false); + let arr_b = create_f32_array(2usize.pow(12), false); + + c.bench_function("lexsort (f32, f32) 2^12", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, None)) + }); + + let arr_a = create_f32_array(2usize.pow(10), true); + let arr_b = create_f32_array(2usize.pow(10), true); + + c.bench_function("lexsort (f32, f32) nulls 2^10", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, None)) + }); + + let arr_a = create_f32_array(2usize.pow(12), true); + let arr_b = create_f32_array(2usize.pow(12), true); + + c.bench_function("lexsort (f32, f32) nulls 2^12", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, None)) + }); + + let arr_a = create_bool_array(2usize.pow(12), false); + let arr_b = create_bool_array(2usize.pow(12), false); + c.bench_function("lexsort (bool, bool) 2^12", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, None)) + }); + + let arr_a = create_bool_array(2usize.pow(12), true); + let arr_b = create_bool_array(2usize.pow(12), true); + c.bench_function("lexsort (bool, bool) nulls 2^12", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, None)) + }); + + let arr_a = create_f32_array(2usize.pow(12), false); + let arr_b = create_f32_array(2usize.pow(12), false); + c.bench_function("lexsort (f32, f32) 2^12 limit 10", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, Some(10))) + }); + + let arr_a = create_f32_array(2usize.pow(12), false); + let arr_b = create_f32_array(2usize.pow(12), false); + c.bench_function("lexsort (f32, f32) 2^12 limit 100", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, Some(100))) + }); + + let arr_a = create_f32_array(2usize.pow(12), false); + let arr_b = 
create_f32_array(2usize.pow(12), false); + c.bench_function("lexsort (f32, f32) 2^12 limit 1000", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, Some(1000))) + }); + + let arr_a = create_f32_array(2usize.pow(12), false); + let arr_b = create_f32_array(2usize.pow(12), false); + c.bench_function("lexsort (f32, f32) 2^12 limit 2^12", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, Some(2usize.pow(12)))) + }); + + let arr_a = create_f32_array(2usize.pow(12), true); + let arr_b = create_f32_array(2usize.pow(12), true); + + c.bench_function("lexsort (f32, f32) nulls 2^12 limit 10", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, Some(10))) + }); + c.bench_function("lexsort (f32, f32) nulls 2^12 limit 100", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, Some(100))) + }); + c.bench_function("lexsort (f32, f32) nulls 2^12 limit 1000", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, Some(1000))) + }); + c.bench_function("lexsort (f32, f32) nulls 2^12 limit 2^12", |b| { + b.iter(|| bench_lexsort(&arr_a, &arr_b, Some(2usize.pow(12)))) + }); } criterion_group!(benches, add_benchmark); diff --git a/arrow/benches/sort_kernel_primitives.rs b/arrow/benches/sort_kernel_primitives.rs deleted file mode 100644 index ca9183580bd2..000000000000 --- a/arrow/benches/sort_kernel_primitives.rs +++ /dev/null @@ -1,59 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#[macro_use] -extern crate criterion; -use arrow_ord::sort::sort; -use criterion::Criterion; - -use std::sync::Arc; - -extern crate arrow; - -use arrow::util::bench_util::*; -use arrow::{array::*, datatypes::Int64Type}; - -fn create_i64_array(size: usize, with_nulls: bool) -> ArrayRef { - let null_density = if with_nulls { 0.5 } else { 0.0 }; - let array = create_primitive_array::(size, null_density); - Arc::new(array) -} - -fn bench_sort(array: &ArrayRef) { - criterion::black_box(sort(criterion::black_box(array), None).unwrap()); -} - -fn add_benchmark(c: &mut Criterion) { - let arr_a = create_i64_array(2u64.pow(10) as usize, false); - - c.bench_function("sort 2^10", |b| b.iter(|| bench_sort(&arr_a))); - - let arr_a = create_i64_array(2u64.pow(12) as usize, false); - - c.bench_function("sort 2^12", |b| b.iter(|| bench_sort(&arr_a))); - - let arr_a = create_i64_array(2u64.pow(10) as usize, true); - - c.bench_function("sort nulls 2^10", |b| b.iter(|| bench_sort(&arr_a))); - - let arr_a = create_i64_array(2u64.pow(12) as usize, true); - - c.bench_function("sort nulls 2^12", |b| b.iter(|| bench_sort(&arr_a))); -} - -criterion_group!(benches, add_benchmark); -criterion_main!(benches); From caa37fed52478c19dfd7ac49b651fb1c891e27a9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 1 Aug 2023 09:00:55 +0100 Subject: [PATCH 1106/1411] Simplify dictionary sort (#4605) --- arrow-ord/src/sort.rs | 73 ++++++++++--------------------------------- 1 file changed, 16 insertions(+), 57 deletions(-) diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 147af1e301d6..c623475c0b3f 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -390,28 +390,15 @@ pub fn sort_to_indices( descending: false, nulls_first: value_null_first, }); - downcast_dictionary_array!( - values => match values.values().data_type() { - dt if DataType::is_primitive(dt) => { - let dict_values = values.values(); - let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; - let value_indices_map = sorted_rank(&sorted_value_indices); - sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp) - }, - DataType::Utf8 => { - let dict_values = values.values(); - let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; - let value_indices_map = sorted_rank(&sorted_value_indices); - sort_string_dictionary::<_>(values, &value_indices_map, v, n, &options, limit) - }, - t => return Err(ArrowError::ComputeError(format!( - "Unsupported dictionary value type {t}" - ))), - }, - t => return Err(ArrowError::ComputeError(format!( - "Unsupported datatype {t}" - ))), - ) + downcast_dictionary_array! 
{ + values => { + let dict_values = values.values(); + let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; + let rank = sorted_rank(&sorted_value_indices); + sort_dictionary(values, &rank, v, n, options, limit) + } + _ => unreachable!(), + } } DataType::Binary | DataType::FixedSizeBinary(_) => { sort_binary::(values, v, n, &options, limit) @@ -563,28 +550,23 @@ fn sorted_rank(sorted_value_indices: &UInt32Array) -> Vec { out } -/// Sort dictionary encoded primitive values -fn sort_primitive_dictionary( - values: &DictionaryArray, - value_indices_map: &[u32], +/// Sort dictionary given the sorted rank of each key +fn sort_dictionary( + dict: &DictionaryArray, + rank: &[u32], value_indices: Vec, null_indices: Vec, options: SortOptions, limit: Option, - cmp: F, -) -> UInt32Array -where - K: ArrowDictionaryKeyType, - F: Fn(u32, u32) -> Ordering, -{ - let keys: &PrimitiveArray = values.keys(); +) -> UInt32Array { + let keys: &PrimitiveArray = dict.keys(); // create tuples that are used for sorting let valids = value_indices .into_iter() .map(|index| { let key: K::Native = keys.value(index as usize); - (index, value_indices_map[key.as_usize()]) + (index, rank[key.as_usize()]) }) .collect::>(); @@ -877,29 +859,6 @@ fn sort_string( ) } -/// Sort dictionary encoded strings -fn sort_string_dictionary( - values: &DictionaryArray, - value_indices_map: &[u32], - value_indices: Vec, - null_indices: Vec, - options: &SortOptions, - limit: Option, -) -> UInt32Array { - let keys: &PrimitiveArray = values.keys(); - - // create tuples that are used for sorting - let valids = value_indices - .into_iter() - .map(|index| { - let key: T::Native = keys.value(index as usize); - (index, value_indices_map[key.as_usize()]) - }) - .collect::>(); - - sort_primitive_inner::<_, _>(keys.len(), null_indices, cmp, options, limit, valids) -} - /// shared implementation between dictionary encoded and plain string arrays #[inline] fn sort_string_helper<'a, A: Array, F>( From 149a3f8c9f05a31eef717291cef566ac8baf8c56 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 1 Aug 2023 09:10:39 +0100 Subject: [PATCH 1107/1411] Remove deprecated arithmetic kernels (#4481) (#4594) * Remove deprecated arithmetic kernels (#4481) * Fix tests * Fix FFI * Update pyarrow-integration-test --- arrow-arith/src/arithmetic.rs | 3160 +----------------- arrow-arith/src/arity.rs | 71 - arrow-pyarrow-integration-testing/src/lib.rs | 2 +- arrow/src/ffi.rs | 11 +- arrow/src/lib.rs | 2 +- 5 files changed, 77 insertions(+), 3169 deletions(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index f8c855af0183..8635ce0ddd80 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -31,552 +31,6 @@ use arrow_schema::*; use std::cmp::min; use std::sync::Arc; -/// Helper function to perform math lambda function on values from two arrays. If either -/// left or right value is null then the output value is also null, so `1 + null` is -/// `null`. -/// -/// # Errors -/// -/// This function errors if the arrays have different lengths -#[deprecated(note = "Use arrow_arith::arity::binary")] -pub fn math_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - op: F, -) -> Result, ArrowError> -where - LT: ArrowNumericType, - RT: ArrowNumericType, - F: Fn(LT::Native, RT::Native) -> LT::Native, -{ - binary(left, right, op) -} - -/// Calculates the modulus operation `left % right` on two SIMD inputs. 
-/// The lower-most bits of `valid_mask` specify which vector lanes are considered as valid. -/// -/// # Errors -/// -/// This function returns a [`ArrowError::DivideByZero`] if a valid element in `right` is `0` -#[cfg(feature = "simd")] -#[inline] -fn simd_checked_modulus( - valid_mask: Option, - left: T::Simd, - right: T::Simd, -) -> Result { - let zero = T::init(T::Native::ZERO); - let one = T::init(T::Native::ONE); - - let right_no_invalid_zeros = match valid_mask { - Some(mask) => { - let simd_mask = T::mask_from_u64(mask); - // select `1` for invalid lanes, which will be a no-op during division later - T::mask_select(simd_mask, right, one) - } - None => right, - }; - - let zero_mask = T::eq(right_no_invalid_zeros, zero); - - if T::mask_any(zero_mask) { - Err(ArrowError::DivideByZero) - } else { - Ok(T::bin_op(left, right_no_invalid_zeros, |a, b| a % b)) - } -} - -/// Calculates the division operation `left / right` on two SIMD inputs. -/// The lower-most bits of `valid_mask` specify which vector lanes are considered as valid. -/// -/// # Errors -/// -/// This function returns a [`ArrowError::DivideByZero`] if a valid element in `right` is `0` -#[cfg(feature = "simd")] -#[inline] -fn simd_checked_divide( - valid_mask: Option, - left: T::Simd, - right: T::Simd, -) -> Result { - let zero = T::init(T::Native::ZERO); - let one = T::init(T::Native::ONE); - - let right_no_invalid_zeros = match valid_mask { - Some(mask) => { - let simd_mask = T::mask_from_u64(mask); - // select `1` for invalid lanes, which will be a no-op during division later - T::mask_select(simd_mask, right, one) - } - None => right, - }; - - let zero_mask = T::eq(right_no_invalid_zeros, zero); - - if T::mask_any(zero_mask) { - Err(ArrowError::DivideByZero) - } else { - Ok(T::bin_op(left, right_no_invalid_zeros, |a, b| a / b)) - } -} - -/// Applies `op` on the remainder elements of two input chunks and writes the result into -/// the remainder elements of `result_chunks`. -/// The lower-most bits of `valid_mask` specify which elements are considered as valid. -/// -/// # Errors -/// -/// This function returns a [`ArrowError::DivideByZero`] if a valid element in `right` is `0` -#[cfg(feature = "simd")] -#[inline] -fn simd_checked_divide_op_remainder( - valid_mask: Option, - left_chunks: std::slice::ChunksExact, - right_chunks: std::slice::ChunksExact, - result_chunks: std::slice::ChunksExactMut, - op: F, -) -> Result<(), ArrowError> -where - T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> T::Native, -{ - let result_remainder = result_chunks.into_remainder(); - let left_remainder = left_chunks.remainder(); - let right_remainder = right_chunks.remainder(); - - result_remainder - .iter_mut() - .zip(left_remainder.iter().zip(right_remainder.iter())) - .enumerate() - .try_for_each(|(i, (result_scalar, (left_scalar, right_scalar)))| { - if valid_mask.map(|mask| mask & (1 << i) != 0).unwrap_or(true) { - if right_scalar.is_zero() { - return Err(ArrowError::DivideByZero); - } - *result_scalar = op(*left_scalar, *right_scalar); - } else { - *result_scalar = T::default_value(); - } - Ok(()) - })?; - - Ok(()) -} - -/// Creates a new PrimitiveArray by applying `simd_op` to the `left` and `right` input array. -/// If the length of the arrays is not multiple of the number of vector lanes -/// then the remainder of the array will be calculated using `scalar_op`. -/// Any operation on a `NULL` value will result in a `NULL` value in the output. 
-/// -/// # Errors -/// -/// This function errors if: -/// * the arrays have different lengths -/// * there is an element where both left and right values are valid and the right value is `0` -#[cfg(feature = "simd")] -fn simd_checked_divide_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - simd_op: SI, - scalar_op: SC, -) -> Result, ArrowError> -where - T: ArrowNumericType, - SI: Fn(Option, T::Simd, T::Simd) -> Result, - SC: Fn(T::Native, T::Native) -> T::Native, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform math operation on arrays of different length".to_string(), - )); - } - - // Create the combined `Bitmap` - let nulls = arrow_buffer::NullBuffer::union(left.nulls(), right.nulls()); - - let lanes = T::lanes(); - let buffer_size = left.len() * std::mem::size_of::(); - let mut result = - arrow_buffer::MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); - - match &nulls { - Some(b) => { - let valid_chunks = b.inner().bit_chunks(); - - // process data in chunks of 64 elements since we also get 64 bits of validity information at a time - - let mut result_chunks = result.typed_data_mut().chunks_exact_mut(64); - let mut left_chunks = left.values().chunks_exact(64); - let mut right_chunks = right.values().chunks_exact(64); - - valid_chunks - .iter() - .zip((&mut result_chunks).zip((&mut left_chunks).zip(&mut right_chunks))) - .try_for_each( - |(mut mask, (result_slice, (left_slice, right_slice)))| { - // split chunks further into slices corresponding to the vector length - // the compiler is able to unroll this inner loop and remove bounds checks - // since the outer chunk size (64) is always a multiple of the number of lanes - result_slice - .chunks_exact_mut(lanes) - .zip(left_slice.chunks_exact(lanes).zip(right_slice.chunks_exact(lanes))) - .try_for_each(|(result_slice, (left_slice, right_slice))| -> Result<(), ArrowError> { - let simd_left = T::load(left_slice); - let simd_right = T::load(right_slice); - - let simd_result = simd_op(Some(mask), simd_left, simd_right)?; - - T::write(simd_result, result_slice); - - // skip the shift and avoid overflow for u8 type, which uses 64 lanes. - mask >>= T::lanes() % 64; - - Ok(()) - }) - }, - )?; - - let valid_remainder = valid_chunks.remainder_bits(); - - simd_checked_divide_op_remainder::( - Some(valid_remainder), - left_chunks, - right_chunks, - result_chunks, - scalar_op, - )?; - } - None => { - let mut result_chunks = result.typed_data_mut().chunks_exact_mut(lanes); - let mut left_chunks = left.values().chunks_exact(lanes); - let mut right_chunks = right.values().chunks_exact(lanes); - - (&mut result_chunks) - .zip((&mut left_chunks).zip(&mut right_chunks)) - .try_for_each( - |(result_slice, (left_slice, right_slice))| -> Result<(), ArrowError> { - let simd_left = T::load(left_slice); - let simd_right = T::load(right_slice); - - let simd_result = simd_op(None, simd_left, simd_right)?; - - T::write(simd_result, result_slice); - - Ok(()) - }, - )?; - - simd_checked_divide_op_remainder::( - None, - left_chunks, - right_chunks, - result_chunks, - scalar_op, - )?; - } - } - - Ok(PrimitiveArray::new(result.into(), nulls)) -} - -fn math_safe_divide_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - op: F, -) -> Result -where - LT: ArrowNumericType, - RT: ArrowNumericType, - F: Fn(LT::Native, RT::Native) -> Option, -{ - let array: PrimitiveArray = binary_opt::<_, _, _, LT>(left, right, op)?; - Ok(Arc::new(array) as ArrayRef) -} - -/// Perform `left + right` operation on two arrays. 
If either left or right value is null -/// then the result is also null. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `add_checked` instead. -#[deprecated(note = "Use arrow_arith::numeric::add_wrapping")] -pub fn add( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result, ArrowError> { - binary(left, right, |a, b| a.add_wrapping(b)) -} - -/// Perform `left + right` operation on two arrays. If either left or right value is null -/// then the result is also null. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `add` instead. -#[deprecated(note = "Use arrow_arith::numeric::add")] -pub fn add_checked( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result, ArrowError> { - try_binary(left, right, |a, b| a.add_checked(b)) -} - -/// Perform `left + right` operation on two arrays. If either left or right value is null -/// then the result is also null. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `add_dyn_checked` instead. -#[deprecated(note = "Use arrow_arith::numeric::add_wrapping")] -pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result { - crate::numeric::add_wrapping(&left, &right) -} - -/// Perform `left + right` operation on two arrays. If either left or right value is null -/// then the result is also null. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `add_dyn` instead. -#[deprecated(note = "Use arrow_arith::numeric::add")] -pub fn add_dyn_checked( - left: &dyn Array, - right: &dyn Array, -) -> Result { - crate::numeric::add(&left, &right) -} - -/// Add every value in an array by a scalar. If any value in the array is null then the -/// result is also null. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `add_scalar_checked` instead. -#[deprecated(note = "Use arrow_arith::numeric::add_wrapping")] -pub fn add_scalar( - array: &PrimitiveArray, - scalar: T::Native, -) -> Result, ArrowError> { - Ok(unary(array, |value| value.add_wrapping(scalar))) -} - -/// Add every value in an array by a scalar. If any value in the array is null then the -/// result is also null. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `add_scalar` instead. -#[deprecated(note = "Use arrow_arith::numeric::add")] -pub fn add_scalar_checked( - array: &PrimitiveArray, - scalar: T::Native, -) -> Result, ArrowError> { - try_unary(array, |value| value.add_checked(scalar)) -} - -/// Add every value in an array by a scalar. If any value in the array is null then the -/// result is also null. The given array must be a `PrimitiveArray` of the type same as -/// the scalar, or a `DictionaryArray` of the value type same as the scalar. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `add_scalar_checked_dyn` instead. -/// -/// This returns an `Err` when the input array is not supported for adding operation. -#[deprecated(note = "Use arrow_arith::numeric::add_wrapping")] -pub fn add_scalar_dyn( - array: &dyn Array, - scalar: T::Native, -) -> Result { - unary_dyn::<_, T>(array, |value| value.add_wrapping(scalar)) -} - -/// Add every value in an array by a scalar. 
If any value in the array is null then the -/// result is also null. The given array must be a `PrimitiveArray` of the type same as -/// the scalar, or a `DictionaryArray` of the value type same as the scalar. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `add_scalar_dyn` instead. -/// -/// As this kernel has the branching costs and also prevents LLVM from vectorising it correctly, -/// it is usually much slower than non-checking variant. -#[deprecated(note = "Use arrow_arith::numeric::add")] -pub fn add_scalar_checked_dyn( - array: &dyn Array, - scalar: T::Native, -) -> Result { - try_unary_dyn::<_, T>(array, |value| value.add_checked(scalar)) - .map(|a| Arc::new(a) as ArrayRef) -} - -/// Perform `left - right` operation on two arrays. If either left or right value is null -/// then the result is also null. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `subtract_checked` instead. -#[deprecated(note = "Use arrow_arith::numeric::sub_wrapping")] -pub fn subtract( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result, ArrowError> { - binary(left, right, |a, b| a.sub_wrapping(b)) -} - -/// Perform `left - right` operation on two arrays. If either left or right value is null -/// then the result is also null. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `subtract` instead. -#[deprecated(note = "Use arrow_arith::numeric::sub")] -pub fn subtract_checked( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result, ArrowError> { - try_binary(left, right, |a, b| a.sub_checked(b)) -} - -/// Perform `left - right` operation on two arrays. If either left or right value is null -/// then the result is also null. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `subtract_dyn_checked` instead. -#[deprecated(note = "Use arrow_arith::numeric::sub_wrapping")] -pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result { - crate::numeric::sub_wrapping(&left, &right) -} - -/// Perform `left - right` operation on two arrays. If either left or right value is null -/// then the result is also null. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `subtract_dyn` instead. -#[deprecated(note = "Use arrow_arith::numeric::sub")] -pub fn subtract_dyn_checked( - left: &dyn Array, - right: &dyn Array, -) -> Result { - crate::numeric::sub(&left, &right) -} - -/// Subtract every value in an array by a scalar. If any value in the array is null then the -/// result is also null. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `subtract_scalar_checked` instead. -#[deprecated(note = "Use arrow_arith::numeric::sub_wrapping")] -pub fn subtract_scalar( - array: &PrimitiveArray, - scalar: T::Native, -) -> Result, ArrowError> { - Ok(unary(array, |value| value.sub_wrapping(scalar))) -} - -/// Subtract every value in an array by a scalar. If any value in the array is null then the -/// result is also null. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `subtract_scalar` instead. 
-#[deprecated(note = "Use arrow_arith::numeric::sub")] -pub fn subtract_scalar_checked( - array: &PrimitiveArray, - scalar: T::Native, -) -> Result, ArrowError> { - try_unary(array, |value| value.sub_checked(scalar)) -} - -/// Subtract every value in an array by a scalar. If any value in the array is null then the -/// result is also null. The given array must be a `PrimitiveArray` of the type same as -/// the scalar, or a `DictionaryArray` of the value type same as the scalar. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `subtract_scalar_checked_dyn` instead. -#[deprecated(note = "Use arrow_arith::numeric::sub_wrapping")] -pub fn subtract_scalar_dyn( - array: &dyn Array, - scalar: T::Native, -) -> Result { - unary_dyn::<_, T>(array, |value| value.sub_wrapping(scalar)) -} - -/// Subtract every value in an array by a scalar. If any value in the array is null then the -/// result is also null. The given array must be a `PrimitiveArray` of the type same as -/// the scalar, or a `DictionaryArray` of the value type same as the scalar. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `subtract_scalar_dyn` instead. -#[deprecated(note = "Use arrow_arith::numeric::sub")] -pub fn subtract_scalar_checked_dyn( - array: &dyn Array, - scalar: T::Native, -) -> Result { - try_unary_dyn::<_, T>(array, |value| value.sub_checked(scalar)) - .map(|a| Arc::new(a) as ArrayRef) -} - -/// Perform `-` operation on an array. If value is null then the result is also null. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `negate_checked` instead. -#[deprecated(note = "Use arrow_arith::numeric::neg_wrapping")] -pub fn negate( - array: &PrimitiveArray, -) -> Result, ArrowError> { - Ok(unary(array, |x| x.neg_wrapping())) -} - -/// Perform `-` operation on an array. If value is null then the result is also null. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `negate` instead. -#[deprecated(note = "Use arrow_arith::numeric::neg")] -pub fn negate_checked( - array: &PrimitiveArray, -) -> Result, ArrowError> { - try_unary(array, |value| value.neg_checked()) -} - -/// Perform `left * right` operation on two arrays. If either left or right value is null -/// then the result is also null. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `multiply_check` instead. -#[deprecated(note = "Use arrow_arith::numeric::mul_wrapping")] -pub fn multiply( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result, ArrowError> { - binary(left, right, |a, b| a.mul_wrapping(b)) -} - -/// Perform `left * right` operation on two arrays. If either left or right value is null -/// then the result is also null. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `multiply` instead. -#[deprecated(note = "Use arrow_arith::numeric::mul")] -pub fn multiply_checked( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result, ArrowError> { - try_binary(left, right, |a, b| a.mul_checked(b)) -} - -/// Perform `left * right` operation on two arrays. If either left or right value is null -/// then the result is also null. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. 
-/// For an overflow-checking variant, use `multiply_dyn_checked` instead. -#[deprecated(note = "Use arrow_arith::numeric::mul_wrapping")] -pub fn multiply_dyn(left: &dyn Array, right: &dyn Array) -> Result { - crate::numeric::mul_wrapping(&left, &right) -} - -/// Perform `left * right` operation on two arrays. If either left or right value is null -/// then the result is also null. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `multiply_dyn` instead. -#[deprecated(note = "Use arrow_arith::numeric::mul")] -pub fn multiply_dyn_checked( - left: &dyn Array, - right: &dyn Array, -) -> Result { - crate::numeric::mul(&left, &right) -} - /// Returns the precision and scale of the result of a multiplication of two decimal types, /// and the divisor for fixed point multiplication. fn get_fixed_point_info( @@ -740,1861 +194,81 @@ where } } -/// Multiply every value in an array by a scalar. If any value in the array is null then the -/// result is also null. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `multiply_scalar_checked` instead. -#[deprecated(note = "Use arrow_arith::numeric::mul_wrapping")] -pub fn multiply_scalar( - array: &PrimitiveArray, - scalar: T::Native, -) -> Result, ArrowError> { - Ok(unary(array, |value| value.mul_wrapping(scalar))) -} +#[cfg(test)] +mod tests { + use super::*; + use crate::numeric::mul; -/// Multiply every value in an array by a scalar. If any value in the array is null then the -/// result is also null. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `multiply_scalar` instead. -#[deprecated(note = "Use arrow_arith::numeric::mul")] -pub fn multiply_scalar_checked( - array: &PrimitiveArray, - scalar: T::Native, -) -> Result, ArrowError> { - try_unary(array, |value| value.mul_checked(scalar)) -} + #[test] + fn test_decimal_multiply_allow_precision_loss() { + // Overflow happening as i128 cannot hold multiplying result. + // [123456789] + let a = Decimal128Array::from(vec![123456789000000000000000000]) + .with_precision_and_scale(38, 18) + .unwrap(); -/// Multiply every value in an array by a scalar. If any value in the array is null then the -/// result is also null. The given array must be a `PrimitiveArray` of the type same as -/// the scalar, or a `DictionaryArray` of the value type same as the scalar. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `multiply_scalar_checked_dyn` instead. -#[deprecated(note = "Use arrow_arith::numeric::mul_wrapping")] -pub fn multiply_scalar_dyn( - array: &dyn Array, - scalar: T::Native, -) -> Result { - unary_dyn::<_, T>(array, |value| value.mul_wrapping(scalar)) -} + // [10] + let b = Decimal128Array::from(vec![10000000000000000000]) + .with_precision_and_scale(38, 18) + .unwrap(); -/// Subtract every value in an array by a scalar. If any value in the array is null then the -/// result is also null. The given array must be a `PrimitiveArray` of the type same as -/// the scalar, or a `DictionaryArray` of the value type same as the scalar. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `multiply_scalar_dyn` instead. 
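Multiplication follows the same convention, and the rewritten test in this hunk already switches to `crate::numeric::mul`. A hedged sketch of the wrapping versus checked behaviour using the replacement kernels named in the deprecation notes:

use arrow_arith::numeric::{mul, mul_wrapping};
use arrow_array::cast::AsArray;
use arrow_array::types::Int32Type;
use arrow_array::Int32Array;

fn multiplication_overflow_sketch() {
    let a = Int32Array::from(vec![i32::MAX]);
    let b = Int32Array::from(vec![2]);

    // Wrapping multiplication, like the deprecated `multiply`/`multiply_dyn`.
    let wrapped = mul_wrapping(&a, &b).unwrap();
    assert_eq!(wrapped.as_primitive::<Int32Type>().value(0), -2);

    // Checked multiplication returns an error on overflow, like `multiply_checked`.
    assert!(mul(&a, &b).is_err());
}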
-#[deprecated(note = "Use arrow_arith::numeric::mul")] -pub fn multiply_scalar_checked_dyn( - array: &dyn Array, - scalar: T::Native, -) -> Result { - try_unary_dyn::<_, T>(array, |value| value.mul_checked(scalar)) - .map(|a| Arc::new(a) as ArrayRef) -} + let err = mul(&a, &b).unwrap_err(); + assert!(err.to_string().contains( + "Overflow happened on: 123456789000000000000000000 * 10000000000000000000" + )); -/// Perform `left % right` operation on two arrays. If either left or right value is null -/// then the result is also null. If any right hand value is zero then the result of this -/// operation will be `Err(ArrowError::DivideByZero)`. -#[deprecated(note = "Use arrow_arith::numeric::rem")] -pub fn modulus( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result, ArrowError> { - #[cfg(feature = "simd")] - return simd_checked_divide_op(&left, &right, simd_checked_modulus::, |a, b| { - a.mod_wrapping(b) - }); - #[cfg(not(feature = "simd"))] - return try_binary(left, right, |a, b| { - if b.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(a.mod_wrapping(b)) - } - }); -} + // Allow precision loss. + let result = multiply_fixed_point_checked(&a, &b, 28).unwrap(); + // [1234567890] + let expected = + Decimal128Array::from(vec![12345678900000000000000000000000000000]) + .with_precision_and_scale(38, 28) + .unwrap(); -/// Perform `left % right` operation on two arrays. If either left or right value is null -/// then the result is also null. If any right hand value is zero then the result of this -/// operation will be `Err(ArrowError::DivideByZero)`. -#[deprecated(note = "Use arrow_arith::numeric::rem")] -pub fn modulus_dyn(left: &dyn Array, right: &dyn Array) -> Result { - crate::numeric::rem(&left, &right) -} + assert_eq!(&expected, &result); + assert_eq!( + result.value_as_string(0), + "1234567890.0000000000000000000000000000" + ); -/// Perform `left / right` operation on two arrays. If either left or right value is null -/// then the result is also null. If any right hand value is zero then the result of this -/// operation will be `Err(ArrowError::DivideByZero)`. -/// -/// When `simd` feature is not enabled. This detects overflow and returns an `Err` for that. -/// For an non-overflow-checking variant, use `divide` instead. -#[deprecated(note = "Use arrow_arith::numeric::div")] -pub fn divide_checked( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result, ArrowError> { - #[cfg(feature = "simd")] - return simd_checked_divide_op(&left, &right, simd_checked_divide::, |a, b| { - a.div_wrapping(b) - }); - #[cfg(not(feature = "simd"))] - return try_binary(left, right, |a, b| a.div_checked(b)); -} + // Rounding case + // [0.000000000000000001, 123456789.555555555555555555, 1.555555555555555555] + let a = Decimal128Array::from(vec![ + 1, + 123456789555555555555555555, + 1555555555555555555, + ]) + .with_precision_and_scale(38, 18) + .unwrap(); -/// Perform `left / right` operation on two arrays. If either left or right value is null -/// then the result is also null. -/// -/// If any right hand value is zero, the operation value will be replaced with null in the -/// result. -/// -/// Unlike [`divide`] or [`divide_checked`], division by zero will yield a null value in the -/// result instead of returning an `Err`. -/// -/// For floating point types overflow will saturate at INF or -INF -/// preserving the expected sign value. -/// -/// For integer types overflow will wrap around. 
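Remainder and division keep the error-on-zero-divisor behaviour described in these comments. A brief sketch, assuming the replacement `rem` and `div` kernels reject a zero divisor for integer types just as the deprecated `modulus` and `divide_checked` do:

use arrow_arith::numeric::{div, rem};
use arrow_array::cast::AsArray;
use arrow_array::types::Int32Type;
use arrow_array::Int32Array;

fn integer_division_sketch() {
    let a = Int32Array::from(vec![15, 8]);
    let b = Int32Array::from(vec![4, 3]);

    let remainder = rem(&a, &b).unwrap();
    let remainder = remainder.as_primitive::<Int32Type>();
    assert_eq!(remainder.value(0), 3); // 15 % 4
    assert_eq!(remainder.value(1), 2); // 8 % 3

    // A zero divisor is reported as an error rather than wrapping or panicking.
    let zeros = Int32Array::from(vec![0, 0]);
    assert!(rem(&a, &zeros).is_err());
    assert!(div(&a, &zeros).is_err());
}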
-/// -#[deprecated(note = "Use arrow_arith::numeric::div")] -pub fn divide_opt( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result, ArrowError> { - binary_opt(left, right, |a, b| { - if b.is_zero() { - None - } else { - Some(a.div_wrapping(b)) - } - }) -} + // [1.555555555555555555, 11.222222222222222222, 0.000000000000000001] + let b = Decimal128Array::from(vec![1555555555555555555, 11222222222222222222, 1]) + .with_precision_and_scale(38, 18) + .unwrap(); -/// Perform `left / right` operation on two arrays. If either left or right value is null -/// then the result is also null. If any right hand value is zero then the result of this -/// operation will be `Err(ArrowError::DivideByZero)`. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `divide_dyn_checked` instead. -#[deprecated(note = "Use arrow_arith::numeric::div")] -pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result { - fn divide_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - ) -> Result, ArrowError> { - try_binary(left, right, |a, b| { - if b.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(a.div_wrapping(b)) - } - }) - } + let result = multiply_fixed_point_checked(&a, &b, 28).unwrap(); + // [ + // 0.0000000000000000015555555556, + // 1385459527.2345679012071330528765432099, + // 0.0000000000000000015555555556 + // ] + let expected = Decimal128Array::from(vec![ + 15555555556, + 13854595272345679012071330528765432099, + 15555555556, + ]) + .with_precision_and_scale(38, 28) + .unwrap(); - downcast_primitive_array!( - (left, right) => divide_op(left, right).map(|a| Arc::new(a) as ArrayRef), - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) -} + assert_eq!(&expected, &result); -/// Perform `left / right` operation on two arrays. If either left or right value is null -/// then the result is also null. If any right hand value is zero then the result of this -/// operation will be `Err(ArrowError::DivideByZero)`. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `divide_dyn` instead. -#[deprecated(note = "Use arrow_arith::numeric::div")] -pub fn divide_dyn_checked( - left: &dyn Array, - right: &dyn Array, -) -> Result { - crate::numeric::div(&left, &right) -} - -/// Perform `left / right` operation on two arrays. If either left or right value is null -/// then the result is also null. -/// -/// If any right hand value is zero, the operation value will be replaced with null in the -/// result. -/// -/// Unlike `divide_dyn` or `divide_dyn_checked`, division by zero will get a null value instead -/// returning an `Err`, this also doesn't check overflowing, overflowing will just wrap -/// the result around. -#[deprecated(note = "Use arrow_arith::numeric::div")] -pub fn divide_dyn_opt( - left: &dyn Array, - right: &dyn Array, -) -> Result { - downcast_primitive_array!( - (left, right) => { - math_safe_divide_op(left, right, |a, b| { - if b.is_zero() { - None - } else { - Some(a.div_wrapping(b)) - } - }) - } - _ => Err(ArrowError::CastError(format!( - "Unsupported data type {}, {}", - left.data_type(), right.data_type() - ))) - ) -} - -/// Perform `left / right` operation on two arrays without checking for -/// division by zero or overflow. 
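A minimal illustration of the `_opt` semantics documented here, using the `divide_opt` defined in this file (the values mirror the existing overflow test further down): division by zero becomes a null slot instead of an error, and overflow wraps.

#[allow(deprecated)]
fn divide_opt_sketch() {
    use arrow_array::Int32Array;

    let a = Int32Array::from(vec![10, i32::MIN]);
    let b = Int32Array::from(vec![0, -1]);

    let c = divide_opt(&a, &b).unwrap();
    // Zero divisor: null instead of Err(ArrowError::DivideByZero).
    assert!(c.is_null(0));
    // Overflow: i32::MIN / -1 wraps back to i32::MIN.
    assert_eq!(c.value(1), i32::MIN);
}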
-/// -/// For floating point types, overflow and division by zero follows normal floating point rules -/// -/// For integer types overflow will wrap around. Division by zero will currently panic, although -/// this may be subject to change see -/// -/// If either left or right value is null then the result is also null. -/// -/// For an overflow-checking variant, use `divide_checked` instead. -#[deprecated(note = "Use arrow_arith::numeric::div")] -pub fn divide( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result, ArrowError> { - // TODO: This is incorrect as div_wrapping has side-effects for integer types - // and so may panic on null values (#2647) - binary(left, right, |a, b| a.div_wrapping(b)) -} - -/// Modulus every value in an array by a scalar. If any value in the array is null then the -/// result is also null. If the scalar is zero then the result of this operation will be -/// `Err(ArrowError::DivideByZero)`. -#[deprecated(note = "Use arrow_arith::numeric::rem")] -pub fn modulus_scalar( - array: &PrimitiveArray, - modulo: T::Native, -) -> Result, ArrowError> { - if modulo.is_zero() { - return Err(ArrowError::DivideByZero); - } - - Ok(unary(array, |a| a.mod_wrapping(modulo))) -} - -/// Modulus every value in an array by a scalar. If any value in the array is null then the -/// result is also null. If the scalar is zero then the result of this operation will be -/// `Err(ArrowError::DivideByZero)`. -#[deprecated(note = "Use arrow_arith::numeric::rem")] -pub fn modulus_scalar_dyn( - array: &dyn Array, - modulo: T::Native, -) -> Result { - if modulo.is_zero() { - return Err(ArrowError::DivideByZero); - } - unary_dyn::<_, T>(array, |value| value.mod_wrapping(modulo)) -} - -/// Divide every value in an array by a scalar. If any value in the array is null then the -/// result is also null. If the scalar is zero then the result of this operation will be -/// `Err(ArrowError::DivideByZero)`. -#[deprecated(note = "Use arrow_arith::numeric::div")] -pub fn divide_scalar( - array: &PrimitiveArray, - divisor: T::Native, -) -> Result, ArrowError> { - if divisor.is_zero() { - return Err(ArrowError::DivideByZero); - } - Ok(unary(array, |a| a.div_wrapping(divisor))) -} - -/// Divide every value in an array by a scalar. If any value in the array is null then the -/// result is also null. If the scalar is zero then the result of this operation will be -/// `Err(ArrowError::DivideByZero)`. The given array must be a `PrimitiveArray` of the type -/// same as the scalar, or a `DictionaryArray` of the value type same as the scalar. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `divide_scalar_checked_dyn` instead. -#[deprecated(note = "Use arrow_arith::numeric::div")] -pub fn divide_scalar_dyn( - array: &dyn Array, - divisor: T::Native, -) -> Result { - if divisor.is_zero() { - return Err(ArrowError::DivideByZero); - } - unary_dyn::<_, T>(array, |value| value.div_wrapping(divisor)) -} - -/// Divide every value in an array by a scalar. If any value in the array is null then the -/// result is also null. If the scalar is zero then the result of this operation will be -/// `Err(ArrowError::DivideByZero)`. The given array must be a `PrimitiveArray` of the type -/// same as the scalar, or a `DictionaryArray` of the value type same as the scalar. -/// -/// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, -/// use `divide_scalar_dyn` instead. 
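The deprecated scalar kernels check the divisor eagerly and return `Err(ArrowError::DivideByZero)` before touching the array. With the Datum-based replacement there is no separate scalar entry point; a scalar operand would be wrapped in a `Scalar` instead. The `new_scalar` constructor used below is an assumption of this sketch, not something introduced by this patch:

use arrow_arith::numeric::div;
use arrow_array::cast::AsArray;
use arrow_array::types::Int32Type;
use arrow_array::Int32Array;

fn divide_by_scalar_sketch() {
    let array = Int32Array::from(vec![15, 14, 9]);

    // A length-1 Scalar Datum is broadcast across the array.
    let result = div(&array, &Int32Array::new_scalar(3)).unwrap();
    let result = result.as_primitive::<Int32Type>();
    assert_eq!(result.value(0), 5);
    assert_eq!(result.value(1), 4);
    assert_eq!(result.value(2), 3);

    // Dividing by a zero scalar is still rejected with an error.
    assert!(div(&array, &Int32Array::new_scalar(0)).is_err());
}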
-#[deprecated(note = "Use arrow_arith::numeric::div")] -pub fn divide_scalar_checked_dyn( - array: &dyn Array, - divisor: T::Native, -) -> Result { - if divisor.is_zero() { - return Err(ArrowError::DivideByZero); - } - - try_unary_dyn::<_, T>(array, |value| value.div_checked(divisor)) - .map(|a| Arc::new(a) as ArrayRef) -} - -/// Divide every value in an array by a scalar. If any value in the array is null then the -/// result is also null. The given array must be a `PrimitiveArray` of the type -/// same as the scalar, or a `DictionaryArray` of the value type same as the scalar. -/// -/// If any right hand value is zero, the operation value will be replaced with null in the -/// result. -/// -/// Unlike `divide_scalar_dyn` or `divide_scalar_checked_dyn`, division by zero will get a -/// null value instead returning an `Err`, this also doesn't check overflowing, overflowing -/// will just wrap the result around. -#[deprecated(note = "Use arrow_arith::numeric::div")] -pub fn divide_scalar_opt_dyn( - array: &dyn Array, - divisor: T::Native, -) -> Result { - if divisor.is_zero() { - match array.data_type() { - DataType::Dictionary(_, value_type) => { - return Ok(new_null_array(value_type.as_ref(), array.len())) - } - _ => return Ok(new_null_array(array.data_type(), array.len())), - } - } - - unary_dyn::<_, T>(array, |value| value.div_wrapping(divisor)) -} - -#[cfg(test)] -#[allow(deprecated)] -mod tests { - use super::*; - use arrow_array::builder::{ - BooleanBufferBuilder, BufferBuilder, PrimitiveDictionaryBuilder, - }; - use arrow_array::cast::AsArray; - use arrow_array::temporal_conversions::SECONDS_IN_DAY; - use arrow_buffer::buffer::NullBuffer; - use arrow_buffer::i256; - use arrow_data::ArrayDataBuilder; - use chrono::NaiveDate; - use half::f16; - - #[test] - fn test_primitive_array_add() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let b = Int32Array::from(vec![6, 7, 8, 9, 8]); - let c = add(&a, &b).unwrap(); - assert_eq!(11, c.value(0)); - assert_eq!(13, c.value(1)); - assert_eq!(15, c.value(2)); - assert_eq!(17, c.value(3)); - assert_eq!(17, c.value(4)); - } - - #[test] - fn test_date32_month_add() { - let a = Date32Array::from(vec![Date32Type::from_naive_date( - NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(), - )]); - let b = - IntervalYearMonthArray::from(vec![IntervalYearMonthType::make_value(1, 2)]); - let c = add_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!( - c.value(0), - Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2001, 3, 1).unwrap()) - ); - - let c = add_dyn(&b, &a).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!( - c.value(0), - Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2001, 3, 1).unwrap()) - ); - } - - #[test] - fn test_date32_day_time_add() { - let a = Date32Array::from(vec![Date32Type::from_naive_date( - NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(), - )]); - let b = IntervalDayTimeArray::from(vec![IntervalDayTimeType::make_value(1, 2)]); - let c = add_dyn(&a, &b).unwrap(); - assert_eq!( - c.as_primitive::().value(0), - Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) - ); - - let c = add_dyn(&b, &a).unwrap(); - assert_eq!( - c.as_primitive::().value(0), - Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) - ); - } - - #[test] - fn test_date32_month_day_nano_add() { - let a = Date32Array::from(vec![Date32Type::from_naive_date( - NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(), - )]); - let b = - 
IntervalMonthDayNanoArray::from(vec![IntervalMonthDayNanoType::make_value( - 1, 2, 3, - )]); - let c = add_dyn(&a, &b).unwrap(); - assert_eq!( - c.as_primitive::().value(0), - Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) - ); - - let c = add_dyn(&b, &a).unwrap(); - assert_eq!( - c.as_primitive::().value(0), - Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) - ); - } - - #[test] - fn test_date64_month_add() { - let a = Date64Array::from(vec![Date64Type::from_naive_date( - NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(), - )]); - let b = - IntervalYearMonthArray::from(vec![IntervalYearMonthType::make_value(1, 2)]); - let c = add_dyn(&a, &b).unwrap(); - assert_eq!( - c.as_primitive::().value(0), - Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2001, 3, 1).unwrap()) - ); - - let c = add_dyn(&b, &a).unwrap(); - assert_eq!( - c.as_primitive::().value(0), - Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2001, 3, 1).unwrap()) - ); - } - - #[test] - fn test_date64_day_time_add() { - let a = Date64Array::from(vec![Date64Type::from_naive_date( - NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(), - )]); - let b = IntervalDayTimeArray::from(vec![IntervalDayTimeType::make_value(1, 2)]); - let c = add_dyn(&a, &b).unwrap(); - assert_eq!( - c.as_primitive::().value(0), - Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) - ); - - let c = add_dyn(&b, &a).unwrap(); - assert_eq!( - c.as_primitive::().value(0), - Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 1, 2).unwrap()) - ); - } - - #[test] - fn test_date64_month_day_nano_add() { - let a = Date64Array::from(vec![Date64Type::from_naive_date( - NaiveDate::from_ymd_opt(2000, 1, 1).unwrap(), - )]); - let b = - IntervalMonthDayNanoArray::from(vec![IntervalMonthDayNanoType::make_value( - 1, 2, 3, - )]); - let c = add_dyn(&a, &b).unwrap(); - assert_eq!( - c.as_primitive::().value(0), - Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) - ); - - let c = add_dyn(&b, &a).unwrap(); - assert_eq!( - c.as_primitive::().value(0), - Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2000, 2, 3).unwrap()) - ); - } - - #[test] - fn test_primitive_array_add_dyn() { - let a = Int32Array::from(vec![Some(5), Some(6), Some(7), Some(8), Some(9)]); - let b = Int32Array::from(vec![Some(6), Some(7), Some(8), None, Some(8)]); - let c = add_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!(11, c.value(0)); - assert_eq!(13, c.value(1)); - assert_eq!(15, c.value(2)); - assert!(c.is_null(3)); - assert_eq!(17, c.value(4)); - } - - #[test] - fn test_primitive_array_add_scalar_dyn() { - let a = Int32Array::from(vec![Some(5), Some(6), Some(7), None, Some(9)]); - let b = 1_i32; - let c = add_scalar_dyn::(&a, b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!(6, c.value(0)); - assert_eq!(7, c.value(1)); - assert_eq!(8, c.value(2)); - assert!(c.is_null(3)); - assert_eq!(10, c.value(4)); - - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(5).unwrap(); - builder.append_null(); - builder.append(7).unwrap(); - builder.append(8).unwrap(); - builder.append(9).unwrap(); - let a = builder.finish(); - let b = -1_i32; - - let c = add_scalar_dyn::(&a, b).unwrap(); - let c = c - .as_any() - .downcast_ref::>() - .unwrap(); - let values = c - .values() - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!(4, values.value(c.key(0).unwrap())); - assert!(c.is_null(1)); - assert_eq!(6, 
values.value(c.key(2).unwrap())); - assert_eq!(7, values.value(c.key(3).unwrap())); - assert_eq!(8, values.value(c.key(4).unwrap())); - } - - #[test] - fn test_primitive_array_subtract_dyn() { - let a = Int32Array::from(vec![Some(51), Some(6), Some(15), Some(8), Some(9)]); - let b = Int32Array::from(vec![Some(6), Some(7), Some(8), None, Some(8)]); - let c = subtract_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!(45, c.value(0)); - assert_eq!(-1, c.value(1)); - assert_eq!(7, c.value(2)); - assert!(c.is_null(3)); - assert_eq!(1, c.value(4)); - } - - #[test] - fn test_date32_month_subtract() { - let a = Date32Array::from(vec![Date32Type::from_naive_date( - NaiveDate::from_ymd_opt(2000, 7, 1).unwrap(), - )]); - let b = - IntervalYearMonthArray::from(vec![IntervalYearMonthType::make_value(6, 3)]); - let c = subtract_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!( - c.value(0), - Date32Type::from_naive_date(NaiveDate::from_ymd_opt(1994, 4, 1).unwrap()) - ); - } - - #[test] - fn test_date32_day_time_subtract() { - let a = Date32Array::from(vec![Date32Type::from_naive_date( - NaiveDate::from_ymd_opt(2023, 3, 29).unwrap(), - )]); - let b = - IntervalDayTimeArray::from(vec![IntervalDayTimeType::make_value(1, 86500)]); - let c = subtract_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!( - c.value(0), - Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2023, 3, 28).unwrap()) - ); - } - - #[test] - fn test_date32_month_day_nano_subtract() { - let a = Date32Array::from(vec![Date32Type::from_naive_date( - NaiveDate::from_ymd_opt(2023, 3, 15).unwrap(), - )]); - let b = - IntervalMonthDayNanoArray::from(vec![IntervalMonthDayNanoType::make_value( - 1, 2, 0, - )]); - let c = subtract_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!( - c.value(0), - Date32Type::from_naive_date(NaiveDate::from_ymd_opt(2023, 2, 13).unwrap()) - ); - } - - #[test] - fn test_date64_month_subtract() { - let a = Date64Array::from(vec![Date64Type::from_naive_date( - NaiveDate::from_ymd_opt(2000, 7, 1).unwrap(), - )]); - let b = - IntervalYearMonthArray::from(vec![IntervalYearMonthType::make_value(6, 3)]); - let c = subtract_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!( - c.value(0), - Date64Type::from_naive_date(NaiveDate::from_ymd_opt(1994, 4, 1).unwrap()) - ); - } - - #[test] - fn test_date64_day_time_subtract() { - let a = Date64Array::from(vec![Date64Type::from_naive_date( - NaiveDate::from_ymd_opt(2023, 3, 29).unwrap(), - )]); - let b = - IntervalDayTimeArray::from(vec![IntervalDayTimeType::make_value(1, 86500)]); - let c = subtract_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!( - c.value(0), - Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2023, 3, 28).unwrap()) - ); - } - - #[test] - fn test_date64_month_day_nano_subtract() { - let a = Date64Array::from(vec![Date64Type::from_naive_date( - NaiveDate::from_ymd_opt(2023, 3, 15).unwrap(), - )]); - let b = - IntervalMonthDayNanoArray::from(vec![IntervalMonthDayNanoType::make_value( - 1, 2, 0, - )]); - let c = subtract_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!( - c.value(0), - Date64Type::from_naive_date(NaiveDate::from_ymd_opt(2023, 2, 13).unwrap()) - ); - } - - #[test] - fn test_primitive_array_subtract_scalar_dyn() { - let a = Int32Array::from(vec![Some(5), Some(6), Some(7), None, Some(9)]); - let b = 1_i32; - let c = 
subtract_scalar_dyn::(&a, b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!(4, c.value(0)); - assert_eq!(5, c.value(1)); - assert_eq!(6, c.value(2)); - assert!(c.is_null(3)); - assert_eq!(8, c.value(4)); - - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(5).unwrap(); - builder.append_null(); - builder.append(7).unwrap(); - builder.append(8).unwrap(); - builder.append(9).unwrap(); - let a = builder.finish(); - let b = -1_i32; - - let c = subtract_scalar_dyn::(&a, b).unwrap(); - let c = c - .as_any() - .downcast_ref::>() - .unwrap(); - let values = c - .values() - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!(6, values.value(c.key(0).unwrap())); - assert!(c.is_null(1)); - assert_eq!(8, values.value(c.key(2).unwrap())); - assert_eq!(9, values.value(c.key(3).unwrap())); - assert_eq!(10, values.value(c.key(4).unwrap())); - } - - #[test] - fn test_primitive_array_multiply_dyn() { - let a = Int32Array::from(vec![Some(5), Some(6), Some(7), Some(8), Some(9)]); - let b = Int32Array::from(vec![Some(6), Some(7), Some(8), None, Some(8)]); - let c = multiply_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!(30, c.value(0)); - assert_eq!(42, c.value(1)); - assert_eq!(56, c.value(2)); - assert!(c.is_null(3)); - assert_eq!(72, c.value(4)); - } - - #[test] - fn test_primitive_array_divide_dyn() { - let a = Int32Array::from(vec![Some(15), Some(6), Some(1), Some(8), Some(9)]); - let b = Int32Array::from(vec![Some(5), Some(3), Some(1), None, Some(3)]); - let c = divide_dyn(&a, &b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!(3, c.value(0)); - assert_eq!(2, c.value(1)); - assert_eq!(1, c.value(2)); - assert!(c.is_null(3)); - assert_eq!(3, c.value(4)); - } - - #[test] - fn test_primitive_array_multiply_scalar_dyn() { - let a = Int32Array::from(vec![Some(5), Some(6), Some(7), None, Some(9)]); - let b = 2_i32; - let c = multiply_scalar_dyn::(&a, b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!(10, c.value(0)); - assert_eq!(12, c.value(1)); - assert_eq!(14, c.value(2)); - assert!(c.is_null(3)); - assert_eq!(18, c.value(4)); - - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(5).unwrap(); - builder.append_null(); - builder.append(7).unwrap(); - builder.append(8).unwrap(); - builder.append(9).unwrap(); - let a = builder.finish(); - let b = -1_i32; - - let c = multiply_scalar_dyn::(&a, b).unwrap(); - let c = c - .as_any() - .downcast_ref::>() - .unwrap(); - let values = c - .values() - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!(-5, values.value(c.key(0).unwrap())); - assert!(c.is_null(1)); - assert_eq!(-7, values.value(c.key(2).unwrap())); - assert_eq!(-8, values.value(c.key(3).unwrap())); - assert_eq!(-9, values.value(c.key(4).unwrap())); - } - - #[test] - fn test_primitive_array_add_sliced() { - let a = Int32Array::from(vec![0, 0, 0, 5, 6, 7, 8, 9, 0]); - let b = Int32Array::from(vec![0, 0, 0, 6, 7, 8, 9, 8, 0]); - let a = a.slice(3, 5); - let b = b.slice(3, 5); - let a = a.as_any().downcast_ref::().unwrap(); - let b = b.as_any().downcast_ref::().unwrap(); - - assert_eq!(5, a.value(0)); - assert_eq!(6, b.value(0)); - - let c = add(a, b).unwrap(); - assert_eq!(5, c.len()); - assert_eq!(11, c.value(0)); - assert_eq!(13, c.value(1)); - assert_eq!(15, c.value(2)); - assert_eq!(17, c.value(3)); - assert_eq!(17, c.value(4)); - } - - #[test] - fn test_primitive_array_add_mismatched_length() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - 
let b = Int32Array::from(vec![6, 7, 8]); - let e = add(&a, &b).expect_err("should have failed due to different lengths"); - assert_eq!( - "ComputeError(\"Cannot perform binary operation on arrays of different length\")", - format!("{e:?}") - ); - } - - #[test] - fn test_primitive_array_add_scalar() { - let a = Int32Array::from(vec![15, 14, 9, 8, 1]); - let b = 3; - let c = add_scalar(&a, b).unwrap(); - let expected = Int32Array::from(vec![18, 17, 12, 11, 4]); - assert_eq!(c, expected); - } - - #[test] - fn test_primitive_array_add_scalar_sliced() { - let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); - let a = a.slice(1, 4); - let actual = add_scalar(&a, 3).unwrap(); - let expected = Int32Array::from(vec![None, Some(12), Some(11), None]); - assert_eq!(actual, expected); - } - - #[test] - fn test_primitive_array_subtract() { - let a = Int32Array::from(vec![1, 2, 3, 4, 5]); - let b = Int32Array::from(vec![5, 4, 3, 2, 1]); - let c = subtract(&a, &b).unwrap(); - assert_eq!(-4, c.value(0)); - assert_eq!(-2, c.value(1)); - assert_eq!(0, c.value(2)); - assert_eq!(2, c.value(3)); - assert_eq!(4, c.value(4)); - } - - #[test] - fn test_primitive_array_subtract_scalar() { - let a = Int32Array::from(vec![15, 14, 9, 8, 1]); - let b = 3; - let c = subtract_scalar(&a, b).unwrap(); - let expected = Int32Array::from(vec![12, 11, 6, 5, -2]); - assert_eq!(c, expected); - } - - #[test] - fn test_primitive_array_subtract_scalar_sliced() { - let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); - let a = a.slice(1, 4); - let actual = subtract_scalar(&a, 3).unwrap(); - let expected = Int32Array::from(vec![None, Some(6), Some(5), None]); - assert_eq!(actual, expected); - } - - #[test] - fn test_primitive_array_multiply() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let b = Int32Array::from(vec![6, 7, 8, 9, 8]); - let c = multiply(&a, &b).unwrap(); - assert_eq!(30, c.value(0)); - assert_eq!(42, c.value(1)); - assert_eq!(56, c.value(2)); - assert_eq!(72, c.value(3)); - assert_eq!(72, c.value(4)); - } - - #[test] - fn test_primitive_array_multiply_scalar() { - let a = Int32Array::from(vec![15, 14, 9, 8, 1]); - let b = 3; - let c = multiply_scalar(&a, b).unwrap(); - let expected = Int32Array::from(vec![45, 42, 27, 24, 3]); - assert_eq!(c, expected); - } - - #[test] - fn test_primitive_array_multiply_scalar_sliced() { - let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); - let a = a.slice(1, 4); - let actual = multiply_scalar(&a, 3).unwrap(); - let expected = Int32Array::from(vec![None, Some(27), Some(24), None]); - assert_eq!(actual, expected); - } - - #[test] - fn test_primitive_array_divide() { - let a = Int32Array::from(vec![15, 15, 8, 1, 9]); - let b = Int32Array::from(vec![5, 6, 8, 9, 1]); - let c = divide(&a, &b).unwrap(); - assert_eq!(3, c.value(0)); - assert_eq!(2, c.value(1)); - assert_eq!(1, c.value(2)); - assert_eq!(0, c.value(3)); - assert_eq!(9, c.value(4)); - } - - #[test] - fn test_int_array_modulus() { - let a = Int32Array::from(vec![15, 15, 8, 1, 9]); - let b = Int32Array::from(vec![5, 6, 8, 9, 1]); - let c = modulus(&a, &b).unwrap(); - assert_eq!(0, c.value(0)); - assert_eq!(3, c.value(1)); - assert_eq!(0, c.value(2)); - assert_eq!(1, c.value(3)); - assert_eq!(0, c.value(4)); - - let c = modulus_dyn(&a, &b).unwrap(); - let c = c.as_primitive::(); - assert_eq!(0, c.value(0)); - assert_eq!(3, c.value(1)); - assert_eq!(0, c.value(2)); - assert_eq!(1, c.value(3)); - assert_eq!(0, c.value(4)); - } - - #[test] - #[should_panic( - expected 
= "called `Result::unwrap()` on an `Err` value: DivideByZero" - )] - fn test_int_array_modulus_divide_by_zero() { - let a = Int32Array::from(vec![1]); - let b = Int32Array::from(vec![0]); - modulus(&a, &b).unwrap(); - } - - #[test] - #[should_panic( - expected = "called `Result::unwrap()` on an `Err` value: DivideByZero" - )] - fn test_int_array_modulus_dyn_divide_by_zero() { - let a = Int32Array::from(vec![1]); - let b = Int32Array::from(vec![0]); - modulus_dyn(&a, &b).unwrap(); - } - - #[test] - fn test_int_array_modulus_overflow_wrapping() { - let a = Int32Array::from(vec![i32::MIN]); - let b = Int32Array::from(vec![-1]); - let result = modulus(&a, &b).unwrap(); - assert_eq!(0, result.value(0)) - } - - #[test] - fn test_primitive_array_divide_scalar() { - let a = Int32Array::from(vec![15, 14, 9, 8, 1]); - let b = 3; - let c = divide_scalar(&a, b).unwrap(); - let expected = Int32Array::from(vec![5, 4, 3, 2, 0]); - assert_eq!(c, expected); - } - - #[test] - fn test_primitive_array_divide_scalar_dyn() { - let a = Int32Array::from(vec![Some(5), Some(6), Some(7), None, Some(9)]); - let b = 2_i32; - let c = divide_scalar_dyn::(&a, b).unwrap(); - let c = c.as_any().downcast_ref::().unwrap(); - assert_eq!(2, c.value(0)); - assert_eq!(3, c.value(1)); - assert_eq!(3, c.value(2)); - assert!(c.is_null(3)); - assert_eq!(4, c.value(4)); - - let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.append(5).unwrap(); - builder.append_null(); - builder.append(7).unwrap(); - builder.append(8).unwrap(); - builder.append(9).unwrap(); - let a = builder.finish(); - let b = -2_i32; - - let c = divide_scalar_dyn::(&a, b).unwrap(); - let c = c - .as_any() - .downcast_ref::>() - .unwrap(); - let values = c - .values() - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!(-2, values.value(c.key(0).unwrap())); - assert!(c.is_null(1)); - assert_eq!(-3, values.value(c.key(2).unwrap())); - assert_eq!(-4, values.value(c.key(3).unwrap())); - assert_eq!(-4, values.value(c.key(4).unwrap())); - - let e = divide_scalar_dyn::(&a, 0_i32) - .expect_err("should have failed due to divide by zero"); - assert_eq!("DivideByZero", format!("{e:?}")); - } - - #[test] - fn test_primitive_array_divide_scalar_sliced() { - let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); - let a = a.slice(1, 4); - let actual = divide_scalar(&a, 3).unwrap(); - let expected = Int32Array::from(vec![None, Some(3), Some(2), None]); - assert_eq!(actual, expected); - } - - #[test] - fn test_int_array_modulus_scalar() { - let a = Int32Array::from(vec![15, 14, 9, 8, 1]); - let b = 3; - let c = modulus_scalar(&a, b).unwrap(); - let expected = Int32Array::from(vec![0, 2, 0, 2, 1]); - assert_eq!(c, expected); - - let c = modulus_scalar_dyn::(&a, b).unwrap(); - let c = c.as_primitive::(); - let expected = Int32Array::from(vec![0, 2, 0, 2, 1]); - assert_eq!(c, &expected); - } - - #[test] - fn test_int_array_modulus_scalar_sliced() { - let a = Int32Array::from(vec![Some(15), None, Some(9), Some(8), None]); - let a = a.slice(1, 4); - let actual = modulus_scalar(&a, 3).unwrap(); - let expected = Int32Array::from(vec![None, Some(0), Some(2), None]); - assert_eq!(actual, expected); - - let actual = modulus_scalar_dyn::(&a, 3).unwrap(); - let actual = actual.as_primitive::(); - let expected = Int32Array::from(vec![None, Some(0), Some(2), None]); - assert_eq!(actual, &expected); - } - - #[test] - #[should_panic( - expected = "called `Result::unwrap()` on an `Err` value: DivideByZero" - )] - fn 
test_int_array_modulus_scalar_divide_by_zero() { - let a = Int32Array::from(vec![1]); - modulus_scalar(&a, 0).unwrap(); - } - - #[test] - fn test_int_array_modulus_scalar_overflow_wrapping() { - let a = Int32Array::from(vec![i32::MIN]); - let result = modulus_scalar(&a, -1).unwrap(); - assert_eq!(0, result.value(0)); - - let result = modulus_scalar_dyn::(&a, -1).unwrap(); - let result = result.as_primitive::(); - assert_eq!(0, result.value(0)); - } - - #[test] - fn test_primitive_array_divide_sliced() { - let a = Int32Array::from(vec![0, 0, 0, 15, 15, 8, 1, 9, 0]); - let b = Int32Array::from(vec![0, 0, 0, 5, 6, 8, 9, 1, 0]); - let a = a.slice(3, 5); - let b = b.slice(3, 5); - let a = a.as_any().downcast_ref::().unwrap(); - let b = b.as_any().downcast_ref::().unwrap(); - - let c = divide(a, b).unwrap(); - assert_eq!(5, c.len()); - assert_eq!(3, c.value(0)); - assert_eq!(2, c.value(1)); - assert_eq!(1, c.value(2)); - assert_eq!(0, c.value(3)); - assert_eq!(9, c.value(4)); - } - - #[test] - fn test_primitive_array_modulus_sliced() { - let a = Int32Array::from(vec![0, 0, 0, 15, 15, 8, 1, 9, 0]); - let b = Int32Array::from(vec![0, 0, 0, 5, 6, 8, 9, 1, 0]); - let a = a.slice(3, 5); - let b = b.slice(3, 5); - let a = a.as_any().downcast_ref::().unwrap(); - let b = b.as_any().downcast_ref::().unwrap(); - - let c = modulus(a, b).unwrap(); - assert_eq!(5, c.len()); - assert_eq!(0, c.value(0)); - assert_eq!(3, c.value(1)); - assert_eq!(0, c.value(2)); - assert_eq!(1, c.value(3)); - assert_eq!(0, c.value(4)); - } - - #[test] - fn test_primitive_array_divide_with_nulls() { - let a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9), None]); - let b = Int32Array::from(vec![Some(5), Some(6), Some(8), Some(9), None, None]); - let c = divide_checked(&a, &b).unwrap(); - assert_eq!(3, c.value(0)); - assert!(c.is_null(1)); - assert_eq!(1, c.value(2)); - assert_eq!(0, c.value(3)); - assert!(c.is_null(4)); - assert!(c.is_null(5)); - } - - #[test] - fn test_primitive_array_modulus_with_nulls() { - let a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9), None]); - let b = Int32Array::from(vec![Some(5), Some(6), Some(8), Some(9), None, None]); - let c = modulus(&a, &b).unwrap(); - assert_eq!(0, c.value(0)); - assert!(c.is_null(1)); - assert_eq!(0, c.value(2)); - assert_eq!(1, c.value(3)); - assert!(c.is_null(4)); - assert!(c.is_null(5)); - } - - #[test] - fn test_primitive_array_divide_scalar_with_nulls() { - let a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9), None]); - let b = 3; - let c = divide_scalar(&a, b).unwrap(); - let expected = - Int32Array::from(vec![Some(5), None, Some(2), Some(0), Some(3), None]); - assert_eq!(c, expected); - } - - #[test] - fn test_primitive_array_modulus_scalar_with_nulls() { - let a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9), None]); - let b = 3; - let c = modulus_scalar(&a, b).unwrap(); - let expected = - Int32Array::from(vec![Some(0), None, Some(2), Some(1), Some(0), None]); - assert_eq!(c, expected); - } - - #[test] - fn test_primitive_array_divide_with_nulls_sliced() { - let a = Int32Array::from(vec![ - None, - None, - None, - None, - None, - None, - None, - None, - Some(15), - None, - Some(8), - Some(1), - Some(9), - None, - None, - ]); - let b = Int32Array::from(vec![ - None, - None, - None, - None, - None, - None, - None, - None, - Some(5), - Some(6), - Some(8), - Some(9), - None, - None, - None, - ]); - - let a = a.slice(8, 6); - let a = a.as_any().downcast_ref::().unwrap(); - - let b = b.slice(8, 6); 
- let b = b.as_any().downcast_ref::().unwrap(); - - let c = divide_checked(a, b).unwrap(); - assert_eq!(6, c.len()); - assert_eq!(3, c.value(0)); - assert!(c.is_null(1)); - assert_eq!(1, c.value(2)); - assert_eq!(0, c.value(3)); - assert!(c.is_null(4)); - assert!(c.is_null(5)); - } - - #[test] - fn test_primitive_array_modulus_with_nulls_sliced() { - let a = Int32Array::from(vec![ - None, - None, - None, - None, - None, - None, - None, - None, - Some(15), - None, - Some(8), - Some(1), - Some(9), - None, - None, - ]); - let b = Int32Array::from(vec![ - None, - None, - None, - None, - None, - None, - None, - None, - Some(5), - Some(6), - Some(8), - Some(9), - None, - None, - None, - ]); - - let a = a.slice(8, 6); - let a = a.as_any().downcast_ref::().unwrap(); - - let b = b.slice(8, 6); - let b = b.as_any().downcast_ref::().unwrap(); - - let c = modulus(a, b).unwrap(); - assert_eq!(6, c.len()); - assert_eq!(0, c.value(0)); - assert!(c.is_null(1)); - assert_eq!(0, c.value(2)); - assert_eq!(1, c.value(3)); - assert!(c.is_null(4)); - assert!(c.is_null(5)); - } - - #[test] - #[should_panic(expected = "DivideByZero")] - fn test_int_array_divide_by_zero_with_checked() { - let a = Int32Array::from(vec![15]); - let b = Int32Array::from(vec![0]); - divide_checked(&a, &b).unwrap(); - } - - #[test] - #[should_panic(expected = "DivideByZero")] - fn test_f32_array_divide_by_zero_with_checked() { - let a = Float32Array::from(vec![15.0]); - let b = Float32Array::from(vec![0.0]); - divide_checked(&a, &b).unwrap(); - } - - #[test] - #[should_panic(expected = "attempt to divide by zero")] - fn test_int_array_divide_by_zero() { - let a = Int32Array::from(vec![15]); - let b = Int32Array::from(vec![0]); - divide(&a, &b).unwrap(); - } - - #[test] - fn test_f32_array_divide_by_zero() { - let a = Float32Array::from(vec![1.5, 0.0, -1.5]); - let b = Float32Array::from(vec![0.0, 0.0, 0.0]); - let result = divide(&a, &b).unwrap(); - assert_eq!(result.value(0), f32::INFINITY); - assert!(result.value(1).is_nan()); - assert_eq!(result.value(2), f32::NEG_INFINITY); - } - - #[test] - #[should_panic(expected = "DivideByZero")] - fn test_int_array_divide_dyn_by_zero() { - let a = Int32Array::from(vec![15]); - let b = Int32Array::from(vec![0]); - divide_dyn(&a, &b).unwrap(); - } - - #[test] - #[should_panic(expected = "DivideByZero")] - fn test_f32_array_divide_dyn_by_zero() { - let a = Float32Array::from(vec![1.5]); - let b = Float32Array::from(vec![0.0]); - divide_dyn(&a, &b).unwrap(); - } - - #[test] - #[should_panic(expected = "DivideByZero")] - fn test_i32_array_modulus_by_zero() { - let a = Int32Array::from(vec![15]); - let b = Int32Array::from(vec![0]); - modulus(&a, &b).unwrap(); - } - - #[test] - #[should_panic(expected = "DivideByZero")] - fn test_i32_array_modulus_dyn_by_zero() { - let a = Int32Array::from(vec![15]); - let b = Int32Array::from(vec![0]); - modulus_dyn(&a, &b).unwrap(); - } - - #[test] - #[should_panic(expected = "DivideByZero")] - fn test_f32_array_modulus_by_zero() { - let a = Float32Array::from(vec![1.5]); - let b = Float32Array::from(vec![0.0]); - modulus(&a, &b).unwrap(); - } - - #[test] - fn test_f32_array_modulus_dyn_by_zero() { - let a = Float32Array::from(vec![1.5]); - let b = Float32Array::from(vec![0.0]); - let result = modulus_dyn(&a, &b).unwrap(); - assert!(result.as_primitive::().value(0).is_nan()); - } - - #[test] - fn test_f64_array_divide() { - let a = Float64Array::from(vec![15.0, 15.0, 8.0]); - let b = Float64Array::from(vec![5.0, 6.0, 8.0]); - let c = divide(&a, &b).unwrap(); - 
assert_eq!(3.0, c.value(0)); - assert_eq!(2.5, c.value(1)); - assert_eq!(1.0, c.value(2)); - } - - #[test] - fn test_primitive_array_add_with_nulls() { - let a = Int32Array::from(vec![Some(5), None, Some(7), None]); - let b = Int32Array::from(vec![None, None, Some(6), Some(7)]); - let c = add(&a, &b).unwrap(); - assert!(c.is_null(0)); - assert!(c.is_null(1)); - assert!(!c.is_null(2)); - assert!(c.is_null(3)); - assert_eq!(13, c.value(2)); - } - - #[test] - fn test_primitive_array_negate() { - let a: Int64Array = (0..100).map(Some).collect(); - let actual = negate(&a).unwrap(); - let expected: Int64Array = (0..100).map(|i| Some(-i)).collect(); - assert_eq!(expected, actual); - } - - #[test] - fn test_primitive_array_negate_checked_overflow() { - let a = Int32Array::from(vec![i32::MIN]); - let actual = negate(&a).unwrap(); - let expected = Int32Array::from(vec![i32::MIN]); - assert_eq!(expected, actual); - - let err = negate_checked(&a); - err.expect_err("negate_checked should detect overflow"); - } - - #[test] - fn test_arithmetic_kernel_should_not_rely_on_padding() { - let a: UInt8Array = (0..128_u8).map(Some).collect(); - let a = a.slice(63, 65); - let a = a.as_any().downcast_ref::().unwrap(); - - let b: UInt8Array = (0..128_u8).map(Some).collect(); - let b = b.slice(63, 65); - let b = b.as_any().downcast_ref::().unwrap(); - - let actual = add(a, b).unwrap(); - let actual: Vec> = actual.iter().collect(); - let expected: Vec> = - (63..63_u8 + 65_u8).map(|i| Some(i + i)).collect(); - assert_eq!(expected, actual); - } - - #[test] - fn test_primitive_add_wrapping_overflow() { - let a = Int32Array::from(vec![i32::MAX, i32::MIN]); - let b = Int32Array::from(vec![1, 1]); - - let wrapped = add(&a, &b); - let expected = Int32Array::from(vec![-2147483648, -2147483647]); - assert_eq!(expected, wrapped.unwrap()); - - let overflow = add_checked(&a, &b); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_subtract_wrapping_overflow() { - let a = Int32Array::from(vec![-2]); - let b = Int32Array::from(vec![i32::MAX]); - - let wrapped = subtract(&a, &b); - let expected = Int32Array::from(vec![i32::MAX]); - assert_eq!(expected, wrapped.unwrap()); - - let overflow = subtract_checked(&a, &b); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_mul_wrapping_overflow() { - let a = Int32Array::from(vec![10]); - let b = Int32Array::from(vec![i32::MAX]); - - let wrapped = multiply(&a, &b); - let expected = Int32Array::from(vec![-10]); - assert_eq!(expected, wrapped.unwrap()); - - let overflow = multiply_checked(&a, &b); - overflow.expect_err("overflow should be detected"); - } - - #[test] - #[cfg(not(feature = "simd"))] - fn test_primitive_div_wrapping_overflow() { - let a = Int32Array::from(vec![i32::MIN]); - let b = Int32Array::from(vec![-1]); - - let wrapped = divide(&a, &b); - let expected = Int32Array::from(vec![-2147483648]); - assert_eq!(expected, wrapped.unwrap()); - - let overflow = divide_checked(&a, &b); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_add_scalar_wrapping_overflow() { - let a = Int32Array::from(vec![i32::MAX, i32::MIN]); - - let wrapped = add_scalar(&a, 1); - let expected = Int32Array::from(vec![-2147483648, -2147483647]); - assert_eq!(expected, wrapped.unwrap()); - - let overflow = add_scalar_checked(&a, 1); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_subtract_scalar_wrapping_overflow() { - let a = 
Int32Array::from(vec![-2]); - - let wrapped = subtract_scalar(&a, i32::MAX); - let expected = Int32Array::from(vec![i32::MAX]); - assert_eq!(expected, wrapped.unwrap()); - - let overflow = subtract_scalar_checked(&a, i32::MAX); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_mul_scalar_wrapping_overflow() { - let a = Int32Array::from(vec![10]); - - let wrapped = multiply_scalar(&a, i32::MAX); - let expected = Int32Array::from(vec![-10]); - assert_eq!(expected, wrapped.unwrap()); - - let overflow = multiply_scalar_checked(&a, i32::MAX); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_add_scalar_dyn_wrapping_overflow() { - let a = Int32Array::from(vec![i32::MAX, i32::MIN]); - - let wrapped = add_scalar_dyn::(&a, 1).unwrap(); - let expected = - Arc::new(Int32Array::from(vec![-2147483648, -2147483647])) as ArrayRef; - assert_eq!(&expected, &wrapped); - - let overflow = add_scalar_checked_dyn::(&a, 1); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_subtract_scalar_dyn_wrapping_overflow() { - let a = Int32Array::from(vec![-2]); - - let wrapped = subtract_scalar_dyn::(&a, i32::MAX).unwrap(); - let expected = Arc::new(Int32Array::from(vec![i32::MAX])) as ArrayRef; - assert_eq!(&expected, &wrapped); - - let overflow = subtract_scalar_checked_dyn::(&a, i32::MAX); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_mul_scalar_dyn_wrapping_overflow() { - let a = Int32Array::from(vec![10]); - - let wrapped = multiply_scalar_dyn::(&a, i32::MAX).unwrap(); - let expected = Arc::new(Int32Array::from(vec![-10])) as ArrayRef; - assert_eq!(&expected, &wrapped); - - let overflow = multiply_scalar_checked_dyn::(&a, i32::MAX); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_div_scalar_dyn_wrapping_overflow() { - let a = Int32Array::from(vec![i32::MIN]); - - let wrapped = divide_scalar_dyn::(&a, -1).unwrap(); - let expected = Arc::new(Int32Array::from(vec![-2147483648])) as ArrayRef; - assert_eq!(&expected, &wrapped); - - let overflow = divide_scalar_checked_dyn::(&a, -1); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_div_opt_overflow_division_by_zero() { - let a = Int32Array::from(vec![i32::MIN]); - let b = Int32Array::from(vec![-1]); - - let wrapped = divide(&a, &b); - let expected = Int32Array::from(vec![-2147483648]); - assert_eq!(expected, wrapped.unwrap()); - - let overflow = divide_opt(&a, &b); - let expected = Int32Array::from(vec![-2147483648]); - assert_eq!(expected, overflow.unwrap()); - - let b = Int32Array::from(vec![0]); - let overflow = divide_opt(&a, &b); - let expected = Int32Array::from(vec![None]); - assert_eq!(expected, overflow.unwrap()); - } - - #[test] - fn test_primitive_add_dyn_wrapping_overflow() { - let a = Int32Array::from(vec![i32::MAX, i32::MIN]); - let b = Int32Array::from(vec![1, 1]); - - let wrapped = add_dyn(&a, &b).unwrap(); - let expected = - Arc::new(Int32Array::from(vec![-2147483648, -2147483647])) as ArrayRef; - assert_eq!(&expected, &wrapped); - - let overflow = add_dyn_checked(&a, &b); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_subtract_dyn_wrapping_overflow() { - let a = Int32Array::from(vec![-2]); - let b = Int32Array::from(vec![i32::MAX]); - - let wrapped = subtract_dyn(&a, &b).unwrap(); - let expected = Arc::new(Int32Array::from(vec![i32::MAX])) as ArrayRef; - 
assert_eq!(&expected, &wrapped); - - let overflow = subtract_dyn_checked(&a, &b); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_mul_dyn_wrapping_overflow() { - let a = Int32Array::from(vec![10]); - let b = Int32Array::from(vec![i32::MAX]); - - let wrapped = multiply_dyn(&a, &b).unwrap(); - let expected = Arc::new(Int32Array::from(vec![-10])) as ArrayRef; - assert_eq!(&expected, &wrapped); - - let overflow = multiply_dyn_checked(&a, &b); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_div_dyn_wrapping_overflow() { - let a = Int32Array::from(vec![i32::MIN]); - let b = Int32Array::from(vec![-1]); - - let wrapped = divide_dyn(&a, &b).unwrap(); - let expected = Arc::new(Int32Array::from(vec![-2147483648])) as ArrayRef; - assert_eq!(&expected, &wrapped); - - let overflow = divide_dyn_checked(&a, &b); - overflow.expect_err("overflow should be detected"); - } - - #[test] - fn test_decimal128() { - let a = Decimal128Array::from_iter_values([1, 2, 4, 5]); - let b = Decimal128Array::from_iter_values([7, -3, 6, 3]); - let e = Decimal128Array::from_iter_values([8, -1, 10, 8]); - let r = add(&a, &b).unwrap(); - assert_eq!(e, r); - - let e = Decimal128Array::from_iter_values([-6, 5, -2, 2]); - let r = subtract(&a, &b).unwrap(); - assert_eq!(e, r); - - let e = Decimal128Array::from_iter_values([7, -6, 24, 15]); - let r = multiply(&a, &b).unwrap(); - assert_eq!(e, r); - - let a = Decimal128Array::from_iter_values([23, 56, 32, 55]); - let b = Decimal128Array::from_iter_values([1, -2, 4, 5]); - let e = Decimal128Array::from_iter_values([23, -28, 8, 11]); - let r = divide(&a, &b).unwrap(); - assert_eq!(e, r); - } - - #[test] - fn test_decimal256() { - let a = Decimal256Array::from_iter_values( - [1, 2, 4, 5].into_iter().map(i256::from_i128), - ); - let b = Decimal256Array::from_iter_values( - [7, -3, 6, 3].into_iter().map(i256::from_i128), - ); - let e = Decimal256Array::from_iter_values( - [8, -1, 10, 8].into_iter().map(i256::from_i128), - ); - let r = add(&a, &b).unwrap(); - assert_eq!(e, r); - - let e = Decimal256Array::from_iter_values( - [-6, 5, -2, 2].into_iter().map(i256::from_i128), - ); - let r = subtract(&a, &b).unwrap(); - assert_eq!(e, r); - - let e = Decimal256Array::from_iter_values( - [7, -6, 24, 15].into_iter().map(i256::from_i128), - ); - let r = multiply(&a, &b).unwrap(); - assert_eq!(e, r); - - let a = Decimal256Array::from_iter_values( - [23, 56, 32, 55].into_iter().map(i256::from_i128), - ); - let b = Decimal256Array::from_iter_values( - [1, -2, 4, 5].into_iter().map(i256::from_i128), - ); - let e = Decimal256Array::from_iter_values( - [23, -28, 8, 11].into_iter().map(i256::from_i128), - ); - let r = divide(&a, &b).unwrap(); - assert_eq!(e, r); - } - - #[test] - fn test_div_scalar_dyn_opt_overflow_division_by_zero() { - let a = Int32Array::from(vec![i32::MIN]); - - let division_by_zero = divide_scalar_opt_dyn::(&a, 0); - let expected = Arc::new(Int32Array::from(vec![None])) as ArrayRef; - assert_eq!(&expected, &division_by_zero.unwrap()); - - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(1, 1); - builder.append(i32::MIN).unwrap(); - let a = builder.finish(); - - let division_by_zero = divide_scalar_opt_dyn::(&a, 0); - assert_eq!(&expected, &division_by_zero.unwrap()); - } - - #[test] - fn test_sum_f16() { - let a = Float16Array::from_iter_values([ - f16::from_f32(0.1), - f16::from_f32(0.2), - f16::from_f32(1.5), - f16::from_f32(-0.1), - ]); - let b = Float16Array::from_iter_values([ - 
f16::from_f32(5.1), - f16::from_f32(6.2), - f16::from_f32(-1.), - f16::from_f32(-2.1), - ]); - let expected = Float16Array::from_iter_values( - a.values().iter().zip(b.values()).map(|(a, b)| a + b), - ); - - let c = add(&a, &b).unwrap(); - assert_eq!(c, expected); - } - - #[test] - fn test_resize_builder() { - let mut null_buffer_builder = BooleanBufferBuilder::new(16); - null_buffer_builder.append_slice(&[ - false, false, false, false, false, false, false, false, false, false, false, - false, false, true, true, true, - ]); - // `resize` resizes the buffer length to the ceil of byte numbers. - // So the underlying buffer is not changed. - null_buffer_builder.resize(13); - assert_eq!(null_buffer_builder.len(), 13); - - let nulls = null_buffer_builder.finish(); - assert_eq!(nulls.count_set_bits(), 0); - let nulls = NullBuffer::new(nulls); - assert_eq!(nulls.null_count(), 13); - - let mut data_buffer_builder = BufferBuilder::::new(13); - data_buffer_builder.append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); - let data_buffer = data_buffer_builder.finish(); - - let arg1: Int32Array = ArrayDataBuilder::new(DataType::Int32) - .len(13) - .nulls(Some(nulls)) - .buffers(vec![data_buffer]) - .build() - .unwrap() - .into(); - - assert_eq!(arg1.null_count(), 13); - - let mut data_buffer_builder = BufferBuilder::::new(13); - data_buffer_builder.append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); - let data_buffer = data_buffer_builder.finish(); - - let arg2: Int32Array = ArrayDataBuilder::new(DataType::Int32) - .len(13) - .buffers(vec![data_buffer]) - .build() - .unwrap() - .into(); - - assert_eq!(arg2.null_count(), 0); - - let result_dyn = add_dyn(&arg1, &arg2).unwrap(); - let result = result_dyn.as_any().downcast_ref::().unwrap(); - - assert_eq!(result.len(), 13); - assert_eq!(result.null_count(), 13); - } - - #[test] - fn test_primitive_array_add_mut_by_binary_mut() { - let a = Int32Array::from(vec![15, 14, 9, 8, 1]); - let b = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]); - - let c = binary_mut(a, &b, |a, b| a.add_wrapping(b)) - .unwrap() - .unwrap(); - let expected = Int32Array::from(vec![Some(16), None, Some(12), None, Some(6)]); - assert_eq!(c, expected); - } - - #[test] - fn test_primitive_add_mut_wrapping_overflow_by_try_binary_mut() { - let a = Int32Array::from(vec![i32::MAX, i32::MIN]); - let b = Int32Array::from(vec![1, 1]); - - let wrapped = binary_mut(a, &b, |a, b| a.add_wrapping(b)) - .unwrap() - .unwrap(); - let expected = Int32Array::from(vec![-2147483648, -2147483647]); - assert_eq!(expected, wrapped); - - let a = Int32Array::from(vec![i32::MAX, i32::MIN]); - let b = Int32Array::from(vec![1, 1]); - let overflow = try_binary_mut(a, &b, |a, b| a.add_checked(b)); - let _ = overflow.unwrap().expect_err("overflow should be detected"); - } - - #[test] - fn test_primitive_add_scalar_by_unary_mut() { - let a = Int32Array::from(vec![15, 14, 9, 8, 1]); - let b = 3; - let c = unary_mut(a, |value| value.add_wrapping(b)).unwrap(); - let expected = Int32Array::from(vec![18, 17, 12, 11, 4]); - assert_eq!(c, expected); - } - - #[test] - fn test_primitive_add_scalar_overflow_by_try_unary_mut() { - let a = Int32Array::from(vec![i32::MAX, i32::MIN]); - - let wrapped = unary_mut(a, |value| value.add_wrapping(1)).unwrap(); - let expected = Int32Array::from(vec![-2147483648, -2147483647]); - assert_eq!(expected, wrapped); - - let a = Int32Array::from(vec![i32::MAX, i32::MIN]); - let overflow = try_unary_mut(a, |value| value.add_checked(1)); - let _ = 
overflow.unwrap().expect_err("overflow should be detected"); - } - - #[test] - fn test_decimal_add_scalar_dyn() { - let a = Decimal128Array::from(vec![100, 210, 320]) - .with_precision_and_scale(38, 2) - .unwrap(); - - let result = add_scalar_dyn::(&a, 1).unwrap(); - let result = result - .as_primitive::() - .clone() - .with_precision_and_scale(38, 2) - .unwrap(); - let expected = Decimal128Array::from(vec![101, 211, 321]) - .with_precision_and_scale(38, 2) - .unwrap(); - - assert_eq!(&expected, &result); - } - - #[test] - fn test_decimal_multiply_allow_precision_loss() { - // Overflow happening as i128 cannot hold multiplying result. - // [123456789] - let a = Decimal128Array::from(vec![123456789000000000000000000]) - .with_precision_and_scale(38, 18) - .unwrap(); - - // [10] - let b = Decimal128Array::from(vec![10000000000000000000]) - .with_precision_and_scale(38, 18) - .unwrap(); - - let err = multiply_dyn_checked(&a, &b).unwrap_err(); - assert!(err.to_string().contains( - "Overflow happened on: 123456789000000000000000000 * 10000000000000000000" - )); - - // Allow precision loss. - let result = multiply_fixed_point_checked(&a, &b, 28).unwrap(); - // [1234567890] - let expected = - Decimal128Array::from(vec![12345678900000000000000000000000000000]) - .with_precision_and_scale(38, 28) - .unwrap(); - - assert_eq!(&expected, &result); - assert_eq!( - result.value_as_string(0), - "1234567890.0000000000000000000000000000" - ); - - // Rounding case - // [0.000000000000000001, 123456789.555555555555555555, 1.555555555555555555] - let a = Decimal128Array::from(vec![ - 1, - 123456789555555555555555555, - 1555555555555555555, - ]) - .with_precision_and_scale(38, 18) - .unwrap(); - - // [1.555555555555555555, 11.222222222222222222, 0.000000000000000001] - let b = Decimal128Array::from(vec![1555555555555555555, 11222222222222222222, 1]) - .with_precision_and_scale(38, 18) - .unwrap(); - - let result = multiply_fixed_point_checked(&a, &b, 28).unwrap(); - // [ - // 0.0000000000000000015555555556, - // 1385459527.2345679012071330528765432099, - // 0.0000000000000000015555555556 - // ] - let expected = Decimal128Array::from(vec![ - 15555555556, - 13854595272345679012071330528765432099, - 15555555556, - ]) - .with_precision_and_scale(38, 28) - .unwrap(); - - assert_eq!(&expected, &result); - - // Rounded the value "1385459527.234567901207133052876543209876543210". - assert_eq!( - result.value_as_string(1), - "1385459527.2345679012071330528765432099" - ); - assert_eq!(result.value_as_string(0), "0.0000000000000000015555555556"); - assert_eq!(result.value_as_string(2), "0.0000000000000000015555555556"); + // Rounded the value "1385459527.234567901207133052876543209876543210". + assert_eq!( + result.value_as_string(1), + "1385459527.2345679012071330528765432099" + ); + assert_eq!(result.value_as_string(0), "0.0000000000000000015555555556"); + assert_eq!(result.value_as_string(2), "0.0000000000000000015555555556"); let a = Decimal128Array::from(vec![1230]) .with_precision_and_scale(4, 2) @@ -2609,11 +283,8 @@ mod tests { assert_eq!(result.precision(), 9); assert_eq!(result.scale(), 4); - let expected = multiply_checked(&a, &b) - .unwrap() - .with_precision_and_scale(9, 4) - .unwrap(); - assert_eq!(&expected, &result); + let expected = mul(&a, &b).unwrap(); + assert_eq!(expected.as_ref(), &result); // Required scale cannot be larger than the product of the input scales. 
let result = multiply_fixed_point_checked(&a, &b, 5).unwrap_err(); @@ -2661,12 +332,8 @@ mod tests { .unwrap(); // `multiply` overflows on this case. - let result = multiply(&a, &b).unwrap(); - let expected = - Decimal128Array::from(vec![-16672482290199102048610367863168958464]) - .with_precision_and_scale(38, 10) - .unwrap(); - assert_eq!(&expected, &result); + let err = mul(&a, &b).unwrap_err(); + assert_eq!(err.to_string(), "Compute error: Overflow happened on: 123456789000000000000000000 * 10000000000000000000"); // Avoid overflow by reducing the scale. let result = multiply_fixed_point(&a, &b, 28).unwrap(); @@ -2682,693 +349,4 @@ mod tests { "1234567890.0000000000000000000000000000" ); } - - #[test] - fn test_timestamp_second_add_interval() { - // timestamp second + interval year month - let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalYearMonthArray::from(vec![ - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - ]); - - let result = add_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampSecondArray::from(vec![ - 1 + SECONDS_IN_DAY * (365 + 31 + 28), - 2 + SECONDS_IN_DAY * (365 + 31 + 28), - 3 + SECONDS_IN_DAY * (365 + 31 + 28), - 4 + SECONDS_IN_DAY * (365 + 31 + 28), - 5 + SECONDS_IN_DAY * (365 + 31 + 28), - ]); - assert_eq!(result, &expected); - - let result = add_dyn(&b, &a).unwrap(); - let result = result.as_primitive::(); - assert_eq!(result, &expected); - - // timestamp second + interval day time - let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalDayTimeArray::from(vec![ - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - ]); - let result = add_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampSecondArray::from(vec![ - 1 + SECONDS_IN_DAY, - 2 + SECONDS_IN_DAY, - 3 + SECONDS_IN_DAY, - 4 + SECONDS_IN_DAY, - 5 + SECONDS_IN_DAY, - ]); - assert_eq!(&expected, result); - let result = add_dyn(&b, &a).unwrap(); - let result = result.as_primitive::(); - assert_eq!(result, &expected); - - // timestamp second + interval month day nanosecond - let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalMonthDayNanoArray::from(vec![ - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - ]); - let result = add_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampSecondArray::from(vec![ - 1 + SECONDS_IN_DAY, - 2 + SECONDS_IN_DAY, - 3 + SECONDS_IN_DAY, - 4 + SECONDS_IN_DAY, - 5 + SECONDS_IN_DAY, - ]); - assert_eq!(&expected, result); - let result = add_dyn(&b, &a).unwrap(); - let result = result.as_primitive::(); - assert_eq!(result, &expected); - } - - #[test] - fn test_timestamp_second_subtract_interval() { - // timestamp second + interval year month - let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalYearMonthArray::from(vec![ - Some(IntervalYearMonthType::make_value(1, 2)), - 
Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - ]); - - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampSecondArray::from(vec![ - 1 - SECONDS_IN_DAY * (31 + 30 + 365), - 2 - SECONDS_IN_DAY * (31 + 30 + 365), - 3 - SECONDS_IN_DAY * (31 + 30 + 365), - 4 - SECONDS_IN_DAY * (31 + 30 + 365), - 5 - SECONDS_IN_DAY * (31 + 30 + 365), - ]); - assert_eq!(&expected, result); - - // timestamp second + interval day time - let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalDayTimeArray::from(vec![ - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - ]); - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampSecondArray::from(vec![ - 1 - SECONDS_IN_DAY, - 2 - SECONDS_IN_DAY, - 3 - SECONDS_IN_DAY, - 4 - SECONDS_IN_DAY, - 5 - SECONDS_IN_DAY, - ]); - assert_eq!(&expected, result); - - // timestamp second + interval month day nanosecond - let a = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalMonthDayNanoArray::from(vec![ - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - ]); - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampSecondArray::from(vec![ - 1 - SECONDS_IN_DAY, - 2 - SECONDS_IN_DAY, - 3 - SECONDS_IN_DAY, - 4 - SECONDS_IN_DAY, - 5 - SECONDS_IN_DAY, - ]); - assert_eq!(&expected, result); - } - - #[test] - fn test_timestamp_millisecond_add_interval() { - // timestamp millisecond + interval year month - let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalYearMonthArray::from(vec![ - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - ]); - - let result = add_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampMillisecondArray::from(vec![ - 1 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000, - 2 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000, - 3 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000, - 4 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000, - 5 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000, - ]); - assert_eq!(result, &expected); - let result = add_dyn(&b, &a).unwrap(); - let result = result.as_primitive::(); - assert_eq!(result, &expected); - - // timestamp millisecond + interval day time - let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalDayTimeArray::from(vec![ - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - ]); - let result = add_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampMillisecondArray::from(vec![ - 1 + SECONDS_IN_DAY * 
1_000, - 2 + SECONDS_IN_DAY * 1_000, - 3 + SECONDS_IN_DAY * 1_000, - 4 + SECONDS_IN_DAY * 1_000, - 5 + SECONDS_IN_DAY * 1_000, - ]); - assert_eq!(&expected, result); - let result = add_dyn(&b, &a).unwrap(); - let result = result.as_primitive::(); - assert_eq!(result, &expected); - - // timestamp millisecond + interval month day nanosecond - let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalMonthDayNanoArray::from(vec![ - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - ]); - let result = add_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampMillisecondArray::from(vec![ - 1 + SECONDS_IN_DAY * 1_000, - 2 + SECONDS_IN_DAY * 1_000, - 3 + SECONDS_IN_DAY * 1_000, - 4 + SECONDS_IN_DAY * 1_000, - 5 + SECONDS_IN_DAY * 1_000, - ]); - assert_eq!(&expected, result); - let result = add_dyn(&b, &a).unwrap(); - let result = result.as_primitive::(); - assert_eq!(result, &expected); - } - - #[test] - fn test_timestamp_millisecond_subtract_interval() { - // timestamp millisecond + interval year month - let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalYearMonthArray::from(vec![ - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - ]); - - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampMillisecondArray::from(vec![ - 1 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000, - 2 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000, - 3 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000, - 4 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000, - 5 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000, - ]); - assert_eq!(&expected, result); - - // timestamp millisecond + interval day time - let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalDayTimeArray::from(vec![ - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - ]); - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampMillisecondArray::from(vec![ - 1 - SECONDS_IN_DAY * 1_000, - 2 - SECONDS_IN_DAY * 1_000, - 3 - SECONDS_IN_DAY * 1_000, - 4 - SECONDS_IN_DAY * 1_000, - 5 - SECONDS_IN_DAY * 1_000, - ]); - assert_eq!(&expected, result); - - // timestamp millisecond + interval month day nanosecond - let a = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalMonthDayNanoArray::from(vec![ - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - ]); - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampMillisecondArray::from(vec![ - 1 - SECONDS_IN_DAY * 1_000, - 2 - SECONDS_IN_DAY * 1_000, - 3 - SECONDS_IN_DAY * 1_000, - 4 - SECONDS_IN_DAY * 1_000, - 5 - SECONDS_IN_DAY * 1_000, 
- ]); - assert_eq!(&expected, result); - } - - #[test] - fn test_timestamp_microsecond_add_interval() { - // timestamp microsecond + interval year month - let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalYearMonthArray::from(vec![ - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - ]); - - let result = add_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampMicrosecondArray::from(vec![ - 1 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000, - 2 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000, - 3 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000, - 4 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000, - 5 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000, - ]); - assert_eq!(result, &expected); - let result = add_dyn(&b, &a).unwrap(); - let result = result.as_primitive::(); - assert_eq!(result, &expected); - - // timestamp microsecond + interval day time - let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalDayTimeArray::from(vec![ - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - ]); - let result = add_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampMicrosecondArray::from(vec![ - 1 + SECONDS_IN_DAY * 1_000_000, - 2 + SECONDS_IN_DAY * 1_000_000, - 3 + SECONDS_IN_DAY * 1_000_000, - 4 + SECONDS_IN_DAY * 1_000_000, - 5 + SECONDS_IN_DAY * 1_000_000, - ]); - assert_eq!(&expected, result); - let result = add_dyn(&b, &a).unwrap(); - let result = result.as_primitive::(); - assert_eq!(result, &expected); - - // timestamp microsecond + interval month day nanosecond - let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalMonthDayNanoArray::from(vec![ - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - ]); - let result = add_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampMicrosecondArray::from(vec![ - 1 + SECONDS_IN_DAY * 1_000_000, - 2 + SECONDS_IN_DAY * 1_000_000, - 3 + SECONDS_IN_DAY * 1_000_000, - 4 + SECONDS_IN_DAY * 1_000_000, - 5 + SECONDS_IN_DAY * 1_000_000, - ]); - assert_eq!(&expected, result); - let result = add_dyn(&b, &a).unwrap(); - let result = result.as_primitive::(); - assert_eq!(result, &expected); - } - - #[test] - fn test_timestamp_microsecond_subtract_interval() { - // timestamp microsecond + interval year month - let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalYearMonthArray::from(vec![ - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - ]); - - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampMicrosecondArray::from(vec![ - 1 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000, - 2 - SECONDS_IN_DAY * (31 + 30 + 365) 
* 1_000_000, - 3 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000, - 4 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000, - 5 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000, - ]); - assert_eq!(&expected, result); - - // timestamp microsecond + interval day time - let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalDayTimeArray::from(vec![ - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - ]); - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampMicrosecondArray::from(vec![ - 1 - SECONDS_IN_DAY * 1_000_000, - 2 - SECONDS_IN_DAY * 1_000_000, - 3 - SECONDS_IN_DAY * 1_000_000, - 4 - SECONDS_IN_DAY * 1_000_000, - 5 - SECONDS_IN_DAY * 1_000_000, - ]); - assert_eq!(&expected, result); - - // timestamp microsecond + interval month day nanosecond - let a = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalMonthDayNanoArray::from(vec![ - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - ]); - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampMicrosecondArray::from(vec![ - 1 - SECONDS_IN_DAY * 1_000_000, - 2 - SECONDS_IN_DAY * 1_000_000, - 3 - SECONDS_IN_DAY * 1_000_000, - 4 - SECONDS_IN_DAY * 1_000_000, - 5 - SECONDS_IN_DAY * 1_000_000, - ]); - assert_eq!(&expected, result); - } - - #[test] - fn test_timestamp_nanosecond_add_interval() { - // timestamp nanosecond + interval year month - let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalYearMonthArray::from(vec![ - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - ]); - - let result = add_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampNanosecondArray::from(vec![ - 1 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000_000, - 2 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000_000, - 3 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000_000, - 4 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000_000, - 5 + SECONDS_IN_DAY * (31 + 28 + 365) * 1_000_000_000, - ]); - assert_eq!(result, &expected); - let result = add_dyn(&b, &a).unwrap(); - let result = result.as_primitive::(); - assert_eq!(result, &expected); - - // timestamp nanosecond + interval day time - let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalDayTimeArray::from(vec![ - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - ]); - let result = add_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampNanosecondArray::from(vec![ - 1 + SECONDS_IN_DAY * 1_000_000_000, - 2 + SECONDS_IN_DAY * 1_000_000_000, - 3 + SECONDS_IN_DAY * 1_000_000_000, - 4 + SECONDS_IN_DAY * 1_000_000_000, - 5 + SECONDS_IN_DAY * 1_000_000_000, - ]); - 
assert_eq!(&expected, result); - let result = add_dyn(&b, &a).unwrap(); - let result = result.as_primitive::(); - assert_eq!(result, &expected); - - // timestamp nanosecond + interval month day nanosecond - let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalMonthDayNanoArray::from(vec![ - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - ]); - let result = add_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampNanosecondArray::from(vec![ - 1 + SECONDS_IN_DAY * 1_000_000_000, - 2 + SECONDS_IN_DAY * 1_000_000_000, - 3 + SECONDS_IN_DAY * 1_000_000_000, - 4 + SECONDS_IN_DAY * 1_000_000_000, - 5 + SECONDS_IN_DAY * 1_000_000_000, - ]); - assert_eq!(&expected, result); - let result = add_dyn(&b, &a).unwrap(); - let result = result.as_primitive::(); - assert_eq!(result, &expected); - } - - #[test] - fn test_timestamp_nanosecond_subtract_interval() { - // timestamp nanosecond + interval year month - let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalYearMonthArray::from(vec![ - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - Some(IntervalYearMonthType::make_value(1, 2)), - ]); - - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampNanosecondArray::from(vec![ - 1 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000_000, - 2 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000_000, - 3 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000_000, - 4 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000_000, - 5 - SECONDS_IN_DAY * (31 + 30 + 365) * 1_000_000_000, - ]); - assert_eq!(&expected, result); - - // timestamp nanosecond + interval day time - let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalDayTimeArray::from(vec![ - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - Some(IntervalDayTimeType::make_value(1, 0)), - ]); - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampNanosecondArray::from(vec![ - 1 - SECONDS_IN_DAY * 1_000_000_000, - 2 - SECONDS_IN_DAY * 1_000_000_000, - 3 - SECONDS_IN_DAY * 1_000_000_000, - 4 - SECONDS_IN_DAY * 1_000_000_000, - 5 - SECONDS_IN_DAY * 1_000_000_000, - ]); - assert_eq!(&expected, result); - - // timestamp nanosecond + interval month day nanosecond - let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); - let b = IntervalMonthDayNanoArray::from(vec![ - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - Some(IntervalMonthDayNanoType::make_value(0, 1, 0)), - ]); - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - - let expected = TimestampNanosecondArray::from(vec![ - 1 - SECONDS_IN_DAY * 1_000_000_000, - 2 - SECONDS_IN_DAY * 1_000_000_000, - 3 - SECONDS_IN_DAY * 1_000_000_000, - 4 - SECONDS_IN_DAY * 1_000_000_000, - 5 - SECONDS_IN_DAY * 
1_000_000_000, - ]); - assert_eq!(&expected, result); - } - - #[test] - fn test_timestamp_second_subtract_timestamp() { - let a = TimestampSecondArray::from(vec![0, 2, 4, 6, 8]); - let b = TimestampSecondArray::from(vec![1, 2, 3, 4, 5]); - let expected = DurationSecondArray::from(vec![-1, 0, 1, 2, 3]); - - // unchecked - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - assert_eq!(&expected, result); - - // checked - let result = subtract_dyn_checked(&a, &b).unwrap(); - let result = result.as_primitive::(); - assert_eq!(&expected, result); - } - - #[test] - fn test_timestamp_second_subtract_timestamp_overflow() { - let a = TimestampSecondArray::from(vec![ - ::Native::MAX, - ]); - let b = TimestampSecondArray::from(vec![ - ::Native::MIN, - ]); - - // checked - let result = subtract_dyn_checked(&a, &b); - assert!(&result.is_err()); - } - - #[test] - fn test_timestamp_microsecond_subtract_timestamp() { - let a = TimestampMicrosecondArray::from(vec![0, 2, 4, 6, 8]); - let b = TimestampMicrosecondArray::from(vec![1, 2, 3, 4, 5]); - let expected = DurationMicrosecondArray::from(vec![-1, 0, 1, 2, 3]); - - // unchecked - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - assert_eq!(&expected, result); - - // checked - let result = subtract_dyn_checked(&a, &b).unwrap(); - let result = result.as_primitive::(); - assert_eq!(&expected, result); - } - - #[test] - fn test_timestamp_microsecond_subtract_timestamp_overflow() { - let a = TimestampMicrosecondArray::from(vec![i64::MAX]); - let b = TimestampMicrosecondArray::from(vec![i64::MIN]); - - // checked - let result = subtract_dyn_checked(&a, &b); - assert!(&result.is_err()); - } - - #[test] - fn test_timestamp_millisecond_subtract_timestamp() { - let a = TimestampMillisecondArray::from(vec![0, 2, 4, 6, 8]); - let b = TimestampMillisecondArray::from(vec![1, 2, 3, 4, 5]); - let expected = DurationMillisecondArray::from(vec![-1, 0, 1, 2, 3]); - - // unchecked - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - assert_eq!(&expected, result); - - // checked - let result = subtract_dyn_checked(&a, &b).unwrap(); - let result = result.as_primitive::(); - assert_eq!(&expected, result); - } - - #[test] - fn test_timestamp_millisecond_subtract_timestamp_overflow() { - let a = TimestampMillisecondArray::from(vec![i64::MAX]); - let b = TimestampMillisecondArray::from(vec![i64::MIN]); - - // checked - let result = subtract_dyn_checked(&a, &b); - assert!(&result.is_err()); - } - - #[test] - fn test_timestamp_nanosecond_subtract_timestamp() { - let a = TimestampNanosecondArray::from(vec![0, 2, 4, 6, 8]); - let b = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]); - let expected = DurationNanosecondArray::from(vec![-1, 0, 1, 2, 3]); - - // unchecked - let result = subtract_dyn(&a, &b).unwrap(); - let result = result.as_primitive::(); - assert_eq!(&expected, result); - - // checked - let result = subtract_dyn_checked(&a, &b).unwrap(); - let result = result.as_primitive::(); - assert_eq!(&expected, result); - } - - #[test] - fn test_timestamp_nanosecond_subtract_timestamp_overflow() { - let a = TimestampNanosecondArray::from(vec![ - ::Native::MAX, - ]); - let b = TimestampNanosecondArray::from(vec![ - ::Native::MIN, - ]); - - // checked - let result = subtract_dyn_checked(&a, &b); - assert!(&result.is_err()); - } } diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index ce766aff66f7..2dac33a4f28b 100644 --- a/arrow-arith/src/arity.rs +++ 
b/arrow-arith/src/arity.rs @@ -18,7 +18,6 @@ //! Defines kernels suitable to perform operations to primitive arrays. use arrow_array::builder::BufferBuilder; -use arrow_array::iterator::ArrayIter; use arrow_array::types::ArrowDictionaryKeyType; use arrow_array::*; use arrow_buffer::buffer::NullBuffer; @@ -425,76 +424,6 @@ where Ok(Ok(builder.finish())) } -#[inline(never)] -fn try_binary_opt_no_nulls( - len: usize, - a: A, - b: B, - op: F, -) -> Result, ArrowError> -where - O: ArrowPrimitiveType, - F: Fn(A::Item, B::Item) -> Option, -{ - let mut buffer = Vec::with_capacity(10); - for idx in 0..len { - unsafe { - buffer.push(op(a.value_unchecked(idx), b.value_unchecked(idx))); - }; - } - Ok(buffer.iter().collect()) -} - -/// Applies the provided binary operation across `a` and `b`, collecting the optional results -/// into a [`PrimitiveArray`]. If any index is null in either `a` or `b`, the corresponding -/// index in the result will also be null. The binary operation could return `None` which -/// results in a new null in the collected [`PrimitiveArray`]. -/// -/// The function is only evaluated for non-null indices -/// -/// # Error -/// -/// This function gives error if the arrays have different lengths -pub(crate) fn binary_opt( - a: A, - b: B, - op: F, -) -> Result, ArrowError> -where - O: ArrowPrimitiveType, - F: Fn(A::Item, B::Item) -> Option, -{ - if a.len() != b.len() { - return Err(ArrowError::ComputeError( - "Cannot perform binary operation on arrays of different length".to_string(), - )); - } - - if a.is_empty() { - return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); - } - - if a.null_count() == 0 && b.null_count() == 0 { - return try_binary_opt_no_nulls(a.len(), a, b, op); - } - - let iter_a = ArrayIter::new(a); - let iter_b = ArrayIter::new(b); - - let values = iter_a - .into_iter() - .zip(iter_b.into_iter()) - .map(|(item_a, item_b)| { - if let (Some(a), Some(b)) = (item_a, item_b) { - op(a, b) - } else { - None - } - }); - - Ok(values.collect()) -} - #[cfg(test)] mod tests { use super::*; diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index 730409b3777e..89395bd2ed08 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -49,7 +49,7 @@ fn double(array: &PyAny, py: Python) -> PyResult { .ok_or_else(|| ArrowError::ParseError("Expects an int64".to_string())) .map_err(to_py_err)?; - let array = kernels::arithmetic::add(array, array).map_err(to_py_err)?; + let array = kernels::numeric::add(array, array).map_err(to_py_err)?; // export array.to_data().to_pyarrow(py) diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index a392d1deec86..7fbbaa7a3907 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -31,10 +31,11 @@ //! # use std::sync::Arc; //! # use arrow::array::{Int32Array, Array, ArrayData, make_array}; //! # use arrow::error::Result; -//! # use arrow::compute::kernels::arithmetic; +//! # use arrow_arith::numeric::add; //! # use arrow::ffi::{to_ffi, from_ffi}; //! # fn main() -> Result<()> { //! // create an array natively +//! //! let array = Int32Array::from(vec![Some(1), None, Some(3)]); //! let data = array.into_data(); //! @@ -46,10 +47,10 @@ //! let array = Int32Array::from(data); //! //! // perform some operation -//! let array = arithmetic::add(&array, &array)?; +//! let array = add(&array, &array)?; //! //! // verify -//! assert_eq!(array, Int32Array::from(vec![Some(2), None, Some(6)])); +//! 
assert_eq!(array.as_ref(), &Int32Array::from(vec![Some(2), None, Some(6)])); //! # //! # Ok(()) //! # } @@ -948,10 +949,10 @@ mod tests { // perform some operation let array = array.as_any().downcast_ref::().unwrap(); - let array = kernels::arithmetic::add(array, array).unwrap(); + let array = kernels::numeric::add(array, array).unwrap(); // verify - assert_eq!(array, Int32Array::from(vec![2, 4, 6])); + assert_eq!(array.as_ref(), &Int32Array::from(vec![2, 4, 6])); Ok(()) } diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 96cc98177a9a..e347f99ee429 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -184,7 +184,7 @@ //! //! This module also implements many common vertical operations: //! -//! * All mathematical binary operators, such as [`subtract`](compute::kernels::arithmetic::subtract) +//! * All mathematical binary operators, such as [`sub`](compute::kernels::numeric::sub) //! * All boolean binary operators such as [`equality`](compute::kernels::comparison::eq) //! * [`cast`](compute::kernels::cast::cast) //! * [`filter`](compute::kernels::filter::filter) From 86c3fdba211762de44dbff0a9578cb6c5f694af6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 1 Aug 2023 10:02:35 +0100 Subject: [PATCH 1108/1411] Add rank function (#4606) (#4609) * Add rank function (#4606) * Add benchmarks * Add inline attribute --- arrow-ord/src/lib.rs | 1 + arrow-ord/src/rank.rs | 195 +++++++++++++++++++++++++++++++++++ arrow/benches/sort_kernel.rs | 29 +++++- arrow/src/compute/kernels.rs | 2 +- arrow/src/compute/mod.rs | 1 + 5 files changed, 223 insertions(+), 5 deletions(-) create mode 100644 arrow-ord/src/rank.rs diff --git a/arrow-ord/src/lib.rs b/arrow-ord/src/lib.rs index 62338c022384..8b43cdb0bffb 100644 --- a/arrow-ord/src/lib.rs +++ b/arrow-ord/src/lib.rs @@ -46,4 +46,5 @@ pub mod comparison; pub mod ord; pub mod partition; +pub mod rank; pub mod sort; diff --git a/arrow-ord/src/rank.rs b/arrow-ord/src/rank.rs new file mode 100644 index 000000000000..1e79156a71a3 --- /dev/null +++ b/arrow-ord/src/rank.rs @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
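A minimal usage sketch of the `rank` kernel introduced by this patch, assuming only the exports visible in the diff (`arrow_ord::rank::rank` and `SortOptions` from `arrow_schema`); equal values share the highest of their ranks, which leaves gaps in the sequence:

    use arrow_array::Int32Array;
    use arrow_ord::rank::rank;
    use arrow_schema::SortOptions;

    let a = Int32Array::from(vec![Some(1), Some(1), None, Some(3)]);
    // Default options: ascending, nulls first.
    assert_eq!(rank(&a, None).unwrap(), &[3, 3, 1, 4]);
    // Descending with nulls last.
    let options = SortOptions { descending: true, nulls_first: false };
    assert_eq!(rank(&a, Some(options)).unwrap(), &[3, 3, 4, 1]);
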
+ +use arrow_array::cast::AsArray; +use arrow_array::types::*; +use arrow_array::{downcast_primitive_array, Array, ArrowNativeTypeOp, GenericByteArray}; +use arrow_buffer::NullBuffer; +use arrow_schema::{ArrowError, DataType, SortOptions}; +use std::cmp::Ordering; + +/// Assigns a rank to each value in `array` based on its position in the sorted order +/// +/// Where values are equal, they will be assigned the highest of their ranks, +/// leaving gaps in the overall rank assignment +/// +/// ``` +/// # use arrow_array::StringArray; +/// # use arrow_ord::rank::rank; +/// let array = StringArray::from(vec![Some("foo"), None, Some("foo"), None, Some("bar")]); +/// let ranks = rank(&array, None).unwrap(); +/// assert_eq!(ranks, &[5, 2, 5, 2, 3]); +/// ``` +pub fn rank( + array: &dyn Array, + options: Option, +) -> Result, ArrowError> { + let options = options.unwrap_or_default(); + let ranks = downcast_primitive_array! { + array => primitive_rank(array.values(), array.nulls(), options), + DataType::Utf8 => bytes_rank(array.as_bytes::(), options), + DataType::LargeUtf8 => bytes_rank(array.as_bytes::(), options), + DataType::Binary => bytes_rank(array.as_bytes::(), options), + DataType::LargeBinary => bytes_rank(array.as_bytes::(), options), + d => return Err(ArrowError::ComputeError(format!("{d:?} not supported in rank"))) + }; + Ok(ranks) +} + +#[inline(never)] +fn primitive_rank( + values: &[T], + nulls: Option<&NullBuffer>, + options: SortOptions, +) -> Vec { + let len: u32 = values.len().try_into().unwrap(); + let to_sort = match nulls.filter(|n| n.null_count() > 0) { + Some(n) => n + .valid_indices() + .map(|idx| (values[idx], idx as u32)) + .collect(), + None => values.iter().copied().zip(0..len).collect(), + }; + rank_impl(values.len(), to_sort, options, T::compare, T::is_eq) +} + +#[inline(never)] +fn bytes_rank( + array: &GenericByteArray, + options: SortOptions, +) -> Vec { + let to_sort: Vec<(&[u8], u32)> = match array.nulls().filter(|n| n.null_count() > 0) { + Some(n) => n + .valid_indices() + .map(|idx| (array.value(idx).as_ref(), idx as u32)) + .collect(), + None => (0..array.len()) + .map(|idx| (array.value(idx).as_ref(), idx as u32)) + .collect(), + }; + rank_impl(array.len(), to_sort, options, Ord::cmp, PartialEq::eq) +} + +fn rank_impl( + len: usize, + mut valid: Vec<(T, u32)>, + options: SortOptions, + compare: C, + eq: E, +) -> Vec +where + T: Copy, + C: Fn(T, T) -> Ordering, + E: Fn(T, T) -> bool, +{ + // We can use an unstable sort as we combine equal values later + valid.sort_unstable_by(|a, b| compare(a.0, b.0)); + if options.descending { + valid.reverse(); + } + + let (mut valid_rank, null_rank) = match options.nulls_first { + true => (len as u32, (len - valid.len()) as u32), + false => (valid.len() as u32, len as u32), + }; + + let mut out: Vec<_> = vec![null_rank; len]; + if let Some(v) = valid.last() { + out[v.1 as usize] = valid_rank; + } + + let mut count = 1; // Number of values in rank + for w in valid.windows(2).rev() { + match eq(w[0].0, w[1].0) { + true => { + count += 1; + out[w[0].1 as usize] = valid_rank; + } + false => { + valid_rank -= count; + count = 1; + out[w[0].1 as usize] = valid_rank + } + } + } + + out +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::*; + + #[test] + fn test_primitive() { + let descending = SortOptions { + descending: true, + nulls_first: true, + }; + + let nulls_last = SortOptions { + descending: false, + nulls_first: false, + }; + + let nulls_last_descending = SortOptions { + descending: true, + nulls_first: 
false, + }; + + let a = Int32Array::from(vec![Some(1), Some(1), None, Some(3), Some(3), Some(4)]); + let res = rank(&a, None).unwrap(); + assert_eq!(res, &[3, 3, 1, 5, 5, 6]); + + let res = rank(&a, Some(descending)).unwrap(); + assert_eq!(res, &[6, 6, 1, 4, 4, 2]); + + let res = rank(&a, Some(nulls_last)).unwrap(); + assert_eq!(res, &[2, 2, 6, 4, 4, 5]); + + let res = rank(&a, Some(nulls_last_descending)).unwrap(); + assert_eq!(res, &[5, 5, 6, 3, 3, 1]); + + // Test with non-zero null values + let nulls = NullBuffer::from(vec![true, true, false, true, false, false]); + let a = Int32Array::new(vec![1, 4, 3, 4, 5, 5].into(), Some(nulls)); + let res = rank(&a, None).unwrap(); + assert_eq!(res, &[4, 6, 3, 6, 3, 3]); + } + + #[test] + fn test_bytes() { + let v = vec!["foo", "fo", "bar", "bar"]; + let values = StringArray::from(v.clone()); + let res = rank(&values, None).unwrap(); + assert_eq!(res, &[4, 3, 2, 2]); + + let values = LargeStringArray::from(v.clone()); + let res = rank(&values, None).unwrap(); + assert_eq!(res, &[4, 3, 2, 2]); + + let v: Vec<&[u8]> = vec![&[1, 2], &[0], &[1, 2, 3], &[1, 2]]; + let values = LargeBinaryArray::from(v.clone()); + let res = rank(&values, None).unwrap(); + assert_eq!(res, &[3, 1, 4, 3]); + + let values = BinaryArray::from(v); + let res = rank(&values, None).unwrap(); + assert_eq!(res, &[3, 1, 4, 3]); + } +} diff --git a/arrow/benches/sort_kernel.rs b/arrow/benches/sort_kernel.rs index 8762d9eb2f5f..3a3ce4462dff 100644 --- a/arrow/benches/sort_kernel.rs +++ b/arrow/benches/sort_kernel.rs @@ -17,7 +17,7 @@ #[macro_use] extern crate criterion; -use criterion::Criterion; +use criterion::{black_box, Criterion}; use std::sync::Arc; @@ -27,6 +27,7 @@ use arrow::compute::{lexsort, sort, sort_to_indices, SortColumn}; use arrow::datatypes::{Int16Type, Int32Type}; use arrow::util::bench_util::*; use arrow::{array::*, datatypes::Float32Type}; +use arrow_ord::rank::rank; fn create_f32_array(size: usize, with_nulls: bool) -> ArrayRef { let null_density = if with_nulls { 0.5 } else { 0.0 }; @@ -42,7 +43,7 @@ fn create_bool_array(size: usize, with_nulls: bool) -> ArrayRef { } fn bench_sort(array: &dyn Array) { - criterion::black_box(sort(array, None).unwrap()); + black_box(sort(array, None).unwrap()); } fn bench_lexsort(array_a: &ArrayRef, array_b: &ArrayRef, limit: Option) { @@ -57,11 +58,11 @@ fn bench_lexsort(array_a: &ArrayRef, array_b: &ArrayRef, limit: Option) { }, ]; - criterion::black_box(lexsort(&columns, limit).unwrap()); + black_box(lexsort(&columns, limit).unwrap()); } fn bench_sort_to_indices(array: &dyn Array, limit: Option) { - criterion::black_box(sort_to_indices(array, None, limit).unwrap()); + black_box(sort_to_indices(array, None, limit).unwrap()); } fn add_benchmark(c: &mut Criterion) { @@ -199,6 +200,26 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function("lexsort (f32, f32) nulls 2^12 limit 2^12", |b| { b.iter(|| bench_lexsort(&arr_a, &arr_b, Some(2usize.pow(12)))) }); + + let arr = create_f32_array(2usize.pow(12), false); + c.bench_function("rank f32 2^12", |b| { + b.iter(|| black_box(rank(&arr, None).unwrap())) + }); + + let arr = create_f32_array(2usize.pow(12), true); + c.bench_function("rank f32 nulls 2^12", |b| { + b.iter(|| black_box(rank(&arr, None).unwrap())) + }); + + let arr = create_string_array_with_len::(2usize.pow(12), 0.0, 10); + c.bench_function("rank string[10] 2^12", |b| { + b.iter(|| black_box(rank(&arr, None).unwrap())) + }); + + let arr = create_string_array_with_len::(2usize.pow(12), 0.5, 10); + c.bench_function("rank 
string[10] nulls 2^12", |b| { + b.iter(|| black_box(rank(&arr, None).unwrap())) + }); } criterion_group!(benches, add_benchmark); diff --git a/arrow/src/compute/kernels.rs b/arrow/src/compute/kernels.rs index 1a79aef547d3..faff1b8a0ddf 100644 --- a/arrow/src/compute/kernels.rs +++ b/arrow/src/compute/kernels.rs @@ -22,7 +22,7 @@ pub use arrow_arith::{ }; pub use arrow_cast::cast; pub use arrow_cast::parse as cast_utils; -pub use arrow_ord::{partition, sort}; +pub use arrow_ord::{partition, rank, sort}; pub use arrow_select::{concat, filter, interleave, nullif, take, window, zip}; pub use arrow_string::{concat_elements, length, regexp, substring}; diff --git a/arrow/src/compute/mod.rs b/arrow/src/compute/mod.rs index 7cfe787b08cf..47a9d149aadb 100644 --- a/arrow/src/compute/mod.rs +++ b/arrow/src/compute/mod.rs @@ -30,6 +30,7 @@ pub use self::kernels::filter::*; pub use self::kernels::interleave::*; pub use self::kernels::nullif::*; pub use self::kernels::partition::*; +pub use self::kernels::rank::*; pub use self::kernels::regexp::*; pub use self::kernels::sort::*; pub use self::kernels::take::*; From a2e2fa762e0af4bfffe92aa650266efa279293f9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 1 Aug 2023 10:26:28 +0100 Subject: [PATCH 1109/1411] Don't Reorder Nulls in sort_to_indices (#4545) (#4603) --- arrow-ord/src/sort.rs | 65 ++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 35 deletions(-) diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index c623475c0b3f..c3e9e26ec05e 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -455,7 +455,7 @@ fn sort_boolean( .map(|index| (index, values.value(index as usize))) .collect::>(); - sort_valids(descending, &mut valids, &mut null_indices, len, cmp); + sort_valids(descending, &mut valids, len, cmp); valids } else { // when limit is not present, we have a better way than sorting: we can just partition @@ -576,7 +576,7 @@ fn sort_dictionary( // sort is instantiated a lot so we only compile this inner version for each native type fn sort_primitive_inner( value_len: usize, - null_indices: Vec, + nulls: Vec, cmp: F, options: &SortOptions, limit: Option, @@ -587,8 +587,6 @@ where T: PartialOrd, F: Fn(T, T) -> Ordering, { - let mut nulls = null_indices; - let valids_len = valids.len(); let nulls_len = nulls.len(); let mut len = value_len; @@ -597,7 +595,7 @@ where len = limit.min(len); } - sort_valids(options.descending, &mut valids, &mut nulls, len, cmp); + sort_valids(options.descending, &mut valids, len, cmp); // collect results directly into a buffer instead of a vec to avoid another aligned allocation let result_capacity = len * std::mem::size_of::(); @@ -884,7 +882,7 @@ where len = limit.min(len); } - sort_valids(descending, &mut valids, &mut nulls, len, cmp); + sort_valids(descending, &mut valids, len, cmp); // collect the order of valid tuplies let mut valid_indices: Vec = valids.iter().map(|tuple| tuple.0).collect(); @@ -1002,7 +1000,7 @@ where len = limit.min(len); } - sort_valids(descending, &mut valids, &mut null_indices, len, cmp); + sort_valids(descending, &mut valids, len, cmp); let mut valid_indices: Vec = valids.iter().map(|tuple| tuple.0).collect(); if options.nulls_first { @@ -1230,10 +1228,9 @@ impl LexicographicalComparator<'_> { } } -fn sort_valids( +fn sort_valids( descending: bool, valids: &mut [(u32, T)], - nulls: &mut [U], len: usize, mut cmp: impl FnMut(T, T) -> Ordering, ) where @@ -1244,8 +1241,6 @@ fn sort_valids( 
sort_unstable_by(valids, len.min(valids_len), |a, b| cmp(a.1, b.1)); } else { sort_unstable_by(valids, len.min(valids_len), |a, b| cmp(a.1, b.1).reverse()); - // reverse to keep a stable ordering - nulls.reverse(); } } @@ -1756,7 +1751,7 @@ mod tests { nulls_first: false, }), None, - vec![2, 1, 4, 3, 5, 0], // [2, 4, 1, 3, 5, 0] + vec![2, 1, 4, 3, 0, 5], ); test_sort_to_indices_primitive_arrays::( @@ -1766,7 +1761,7 @@ mod tests { nulls_first: false, }), None, - vec![2, 1, 4, 3, 5, 0], + vec![2, 1, 4, 3, 0, 5], ); test_sort_to_indices_primitive_arrays::( @@ -1776,7 +1771,7 @@ mod tests { nulls_first: false, }), None, - vec![2, 1, 4, 3, 5, 0], + vec![2, 1, 4, 3, 0, 5], ); test_sort_to_indices_primitive_arrays::( @@ -1786,7 +1781,7 @@ mod tests { nulls_first: false, }), None, - vec![2, 1, 4, 3, 5, 0], + vec![2, 1, 4, 3, 0, 5], ); test_sort_to_indices_primitive_arrays::( @@ -1803,7 +1798,7 @@ mod tests { nulls_first: false, }), None, - vec![2, 1, 4, 3, 5, 0], + vec![2, 1, 4, 3, 0, 5], ); test_sort_to_indices_primitive_arrays::( @@ -1820,7 +1815,7 @@ mod tests { nulls_first: false, }), None, - vec![2, 1, 4, 3, 5, 0], + vec![2, 1, 4, 3, 0, 5], ); test_sort_to_indices_primitive_arrays::( @@ -1830,7 +1825,7 @@ mod tests { nulls_first: false, }), None, - vec![2, 1, 4, 3, 5, 0], + vec![2, 1, 4, 3, 0, 5], ); // descending, nulls first @@ -1841,7 +1836,7 @@ mod tests { nulls_first: true, }), None, - vec![5, 0, 2, 1, 4, 3], // [5, 0, 2, 4, 1, 3] + vec![0, 5, 2, 1, 4, 3], // [5, 0, 2, 4, 1, 3] ); test_sort_to_indices_primitive_arrays::( @@ -1851,7 +1846,7 @@ mod tests { nulls_first: true, }), None, - vec![5, 0, 2, 1, 4, 3], // [5, 0, 2, 4, 1, 3] + vec![0, 5, 2, 1, 4, 3], // [5, 0, 2, 4, 1, 3] ); test_sort_to_indices_primitive_arrays::( @@ -1861,7 +1856,7 @@ mod tests { nulls_first: true, }), None, - vec![5, 0, 2, 1, 4, 3], + vec![0, 5, 2, 1, 4, 3], ); test_sort_to_indices_primitive_arrays::( @@ -1871,7 +1866,7 @@ mod tests { nulls_first: true, }), None, - vec![5, 0, 2, 1, 4, 3], + vec![0, 5, 2, 1, 4, 3], ); test_sort_to_indices_primitive_arrays::( @@ -1888,7 +1883,7 @@ mod tests { nulls_first: true, }), None, - vec![5, 0, 2, 1, 4, 3], + vec![0, 5, 2, 1, 4, 3], ); test_sort_to_indices_primitive_arrays::( @@ -1898,7 +1893,7 @@ mod tests { nulls_first: true, }), None, - vec![5, 0, 2, 1, 4, 3], + vec![0, 5, 2, 1, 4, 3], ); test_sort_to_indices_primitive_arrays::( @@ -1908,7 +1903,7 @@ mod tests { nulls_first: true, }), None, - vec![5, 0, 2, 1, 4, 3], + vec![0, 5, 2, 1, 4, 3], ); // valid values less than limit with extra nulls @@ -2007,7 +2002,7 @@ mod tests { nulls_first: true, }), Some(3), - vec![5, 0, 2], + vec![0, 5, 2], ); // valid values less than limit with extra nulls @@ -2070,7 +2065,7 @@ mod tests { nulls_first: false, }), None, - vec![1, 5, 3, 2, 4, 6, 0], + vec![1, 5, 3, 2, 4, 0, 6], ); // decimal null_first and descending test_sort_to_indices_decimal128_array( @@ -2080,7 +2075,7 @@ mod tests { nulls_first: true, }), None, - vec![6, 0, 1, 5, 3, 2, 4], + vec![0, 6, 1, 5, 3, 2, 4], ); // decimal null_first test_sort_to_indices_decimal128_array( @@ -2117,7 +2112,7 @@ mod tests { nulls_first: true, }), Some(3), - vec![6, 0, 1], + vec![0, 6, 1], ); // limit null_first test_sort_to_indices_decimal128_array( @@ -2154,7 +2149,7 @@ mod tests { nulls_first: false, }), None, - vec![1, 5, 3, 2, 4, 6, 0], + vec![1, 5, 3, 2, 4, 0, 6], ); // decimal null_first and descending test_sort_to_indices_decimal256_array( @@ -2167,7 +2162,7 @@ mod tests { nulls_first: true, }), None, - vec![6, 0, 1, 5, 3, 2, 4], + 
vec![0, 6, 1, 5, 3, 2, 4], ); // decimal null_first test_sort_to_indices_decimal256_array( @@ -2216,7 +2211,7 @@ mod tests { nulls_first: true, }), Some(3), - vec![6, 0, 1], + vec![0, 6, 1], ); // limit null_first test_sort_to_indices_decimal256_array( @@ -2938,7 +2933,7 @@ mod tests { nulls_first: false, }), None, - vec![2, 4, 1, 5, 3, 0], + vec![2, 4, 1, 5, 0, 3], ); test_sort_to_indices_string_arrays( @@ -2972,7 +2967,7 @@ mod tests { nulls_first: true, }), None, - vec![3, 0, 2, 4, 1, 5], + vec![0, 3, 2, 4, 1, 5], ); test_sort_to_indices_string_arrays( @@ -2989,7 +2984,7 @@ mod tests { nulls_first: true, }), Some(3), - vec![3, 0, 2], + vec![0, 3, 2], ); // valid values less than limit with extra nulls From a5d9118c7b420bb413b642cfc024770e38594aab Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 1 Aug 2023 10:26:46 +0100 Subject: [PATCH 1110/1411] Cleanup ArrayData::buffers (#4583) * Cleanup ArrayData::buffers * Hide from docs * Review feedback --- arrow-data/src/{data/mod.rs => data.rs} | 13 ++-- arrow-data/src/data/buffers.rs | 96 ------------------------- 2 files changed, 7 insertions(+), 102 deletions(-) rename arrow-data/src/{data/mod.rs => data.rs} (99%) delete mode 100644 arrow-data/src/data/buffers.rs diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data.rs similarity index 99% rename from arrow-data/src/data/mod.rs rename to arrow-data/src/data.rs index 50643b90e881..6ff8a824b2ff 100644 --- a/arrow-data/src/data/mod.rs +++ b/arrow-data/src/data.rs @@ -29,8 +29,10 @@ use std::sync::Arc; use crate::equal; -mod buffers; -pub use buffers::*; +/// A collection of [`Buffer`] +#[doc(hidden)] +#[deprecated(note = "Use [Buffer]")] +pub type Buffers<'a> = &'a [Buffer]; #[inline] pub(crate) fn contains_nulls( @@ -345,10 +347,9 @@ impl ArrayData { &self.data_type } - /// Returns the [`Buffers`] storing data for this [`ArrayData`] - pub fn buffers(&self) -> Buffers<'_> { - // In future ArrayData won't store data contiguously as `Vec` (#1799) - Buffers::from_slice(&self.buffers) + /// Returns the [`Buffer`] storing data for this [`ArrayData`] + pub fn buffers(&self) -> &[Buffer] { + &self.buffers } /// Returns a slice of children [`ArrayData`]. This will be non diff --git a/arrow-data/src/data/buffers.rs b/arrow-data/src/data/buffers.rs deleted file mode 100644 index 883e92e36d82..000000000000 --- a/arrow-data/src/data/buffers.rs +++ /dev/null @@ -1,96 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
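With the wrapper type removed, `ArrayData::buffers` now hands back the underlying `&[Buffer]` slice directly. A small sketch of a call site after this change, assuming an `Int32Array` (whose values live in a single buffer, with validity tracked separately via `nulls()`):

    use arrow_array::{Array, Int32Array};

    let data = Int32Array::from(vec![1, 2, 3]).into_data();
    let buffers = data.buffers(); // plain &[Buffer], ordinary slice APIs apply
    assert_eq!(buffers.len(), 1);
    assert_eq!(buffers[0].len(), 12); // 3 values * 4 bytes each
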
- -use arrow_buffer::Buffer; -use std::iter::Chain; -use std::ops::Index; - -/// A collection of [`Buffer`] -#[derive(Debug, Default, Copy, Clone, Eq, PartialEq)] -pub struct Buffers<'a>([Option<&'a Buffer>; 2]); - -impl<'a> Buffers<'a> { - /// Temporary will be removed once ArrayData does not store `Vec` directly (#3769) - pub(crate) fn from_slice(a: &'a [Buffer]) -> Self { - match a.len() { - 0 => Self([None, None]), - 1 => Self([Some(&a[0]), None]), - _ => Self([Some(&a[0]), Some(&a[1])]), - } - } - - /// Returns the number of [`Buffer`] in this collection - #[inline] - pub fn len(&self) -> usize { - self.0[0].is_some() as usize + self.0[1].is_some() as usize - } - - /// Returns `true` if this collection is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.0[0].is_none() && self.0[1].is_none() - } - - #[inline] - pub fn iter(&self) -> IntoIter<'a> { - self.into_iter() - } - - /// Converts this [`Buffers`] to a `Vec` - #[inline] - pub fn to_vec(&self) -> Vec { - self.iter().cloned().collect() - } -} - -impl<'a> Index for Buffers<'a> { - type Output = &'a Buffer; - - #[inline] - fn index(&self, index: usize) -> &Self::Output { - self.0[index].as_ref().unwrap() - } -} - -impl<'a> IntoIterator for Buffers<'a> { - type Item = &'a Buffer; - type IntoIter = IntoIter<'a>; - - #[inline] - fn into_iter(self) -> Self::IntoIter { - IntoIter(self.0[0].into_iter().chain(self.0[1].into_iter())) - } -} - -type OptionIter<'a> = std::option::IntoIter<&'a Buffer>; - -/// [`Iterator`] for [`Buffers`] -pub struct IntoIter<'a>(Chain, OptionIter<'a>>); - -impl<'a> Iterator for IntoIter<'a> { - type Item = &'a Buffer; - - #[inline] - fn next(&mut self) -> Option { - self.0.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.0.size_hint() - } -} From ae0d82ccb8fa679e67c1340e055e7c4cef8c605e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 1 Aug 2023 12:06:25 +0100 Subject: [PATCH 1111/1411] Test Disabled Page Statistics (#4587) (#4589) * Test disabling page index statistics (#4587) * Apply suggestions from code review Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- parquet/src/file/writer.rs | 60 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 12da085ed2b7..3b2dd8289455 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -742,6 +742,8 @@ mod tests { use crate::column::reader::get_typed_column_reader; use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; use crate::data_type::{BoolType, Int32Type}; + use crate::file::page_index::index::Index; + use crate::file::properties::EnabledStatistics; use crate::file::reader::ChunkReader; use crate::file::serialized_reader::ReadOptionsBuilder; use crate::file::{ @@ -1648,4 +1650,62 @@ mod tests { let reader = SerializedFileReader::new_with_options(file, options).unwrap(); test_read(reader); } + + #[test] + fn test_disabled_statistics() { + let message_type = " + message test_schema { + REQUIRED INT32 a; + REQUIRED INT32 b; + } + "; + let schema = Arc::new(parse_message_type(message_type).unwrap()); + let props = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::None) + .set_column_statistics_enabled("a".into(), EnabledStatistics::Page) + .build(); + let mut file = Vec::with_capacity(1024); + let mut file_writer = + SerializedFileWriter::new(&mut file, schema, Arc::new(props)).unwrap(); + + let mut 
row_group_writer = file_writer.next_row_group().unwrap(); + let mut a_writer = row_group_writer.next_column().unwrap().unwrap(); + let col_writer = a_writer.typed::(); + col_writer.write_batch(&[1, 2, 3], None, None).unwrap(); + a_writer.close().unwrap(); + + let mut b_writer = row_group_writer.next_column().unwrap().unwrap(); + let col_writer = b_writer.typed::(); + col_writer.write_batch(&[4, 5, 6], None, None).unwrap(); + b_writer.close().unwrap(); + row_group_writer.close().unwrap(); + + let metadata = file_writer.close().unwrap(); + assert_eq!(metadata.row_groups.len(), 1); + let row_group = &metadata.row_groups[0]; + assert_eq!(row_group.columns.len(), 2); + // Column "a" has both offset and column index, as requested + assert!(row_group.columns[0].offset_index_offset.is_some()); + assert!(row_group.columns[0].column_index_offset.is_some()); + // Column "b" should only have offset index + assert!(row_group.columns[1].offset_index_offset.is_some()); + assert!(row_group.columns[1].column_index_offset.is_none()); + + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = + SerializedFileReader::new_with_options(Bytes::from(file), options).unwrap(); + + let offset_index = reader.metadata().offset_index().unwrap(); + assert_eq!(offset_index.len(), 1); // 1 row group + assert_eq!(offset_index[0].len(), 2); // 2 columns + + let column_index = reader.metadata().column_index().unwrap(); + assert_eq!(column_index.len(), 1); // 1 row group + assert_eq!(column_index[0].len(), 2); // 2 column + + let a_idx = &column_index[0][0]; + assert!(matches!(a_idx, Index::INT32(_)), "{a_idx:?}"); + let b_idx = &column_index[0][1]; + assert!(matches!(b_idx, Index::NONE), "{b_idx:?}"); + } } From 5724cf21c23aa9d5a3ef06b6381cf267903746ee Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 1 Aug 2023 12:06:40 +0100 Subject: [PATCH 1112/1411] Use contains_nulls in ArrayData equality of byte arrays (#4582) --- arrow-data/src/equal/variable_size.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arrow-data/src/equal/variable_size.rs b/arrow-data/src/equal/variable_size.rs index ae880437450b..92f00818b4a0 100644 --- a/arrow-data/src/equal/variable_size.rs +++ b/arrow-data/src/equal/variable_size.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::data::{count_nulls, ArrayData}; +use crate::data::{contains_nulls, ArrayData}; use arrow_buffer::ArrowNativeType; use num::Integer; @@ -59,14 +59,9 @@ pub(super) fn variable_sized_equal( let lhs_values = lhs.buffers()[1].as_slice(); let rhs_values = rhs.buffers()[1].as_slice(); - let lhs_null_count = count_nulls(lhs.nulls(), lhs_start, len); - let rhs_null_count = count_nulls(rhs.nulls(), rhs_start, len); - - if lhs_null_count == 0 - && rhs_null_count == 0 - && !lhs_values.is_empty() - && !rhs_values.is_empty() - { + // Only checking one null mask here because by the time the control flow reaches + // this point, the equality of the two masks would have already been verified. 
+ if !contains_nulls(lhs.nulls(), lhs_start, len) { offset_value_equal( lhs_values, rhs_values, From d5b713ada2823443293b5616789e3c6c75bf48bb Mon Sep 17 00:00:00 2001 From: Tomoaki Kawada Date: Wed, 2 Aug 2023 17:46:05 +0900 Subject: [PATCH 1113/1411] fix(buffer): panic on end index overflow in `MutableBuffer::set_null_bits` (#4621) --- arrow-buffer/src/buffer/mutable.rs | 36 +++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 3e66e7f23fa2..0d2d2ed75146 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -168,7 +168,14 @@ impl MutableBuffer { /// `len` of the buffer and so can be used to initialize the memory region from /// `len` to `capacity`. pub fn set_null_bits(&mut self, start: usize, count: usize) { - assert!(start + count <= self.layout.size()); + assert!( + start.saturating_add(count) <= self.layout.size(), + "range start index {start} and count {count} out of bounds for \ + buffer of length {}", + self.layout.size(), + ); + + // Safety: `self.data[start..][..count]` is in-bounds and well-aligned for `u8` unsafe { std::ptr::write_bytes(self.data.as_ptr().add(start), 0, count); } @@ -932,4 +939,31 @@ mod tests { buffer.shrink_to_fit(); assert!(buffer.capacity() >= 64 && buffer.capacity() < 128); } + + #[test] + fn test_mutable_set_null_bits() { + let mut buffer = MutableBuffer::new(8).with_bitset(8, true); + + for i in 0..=buffer.capacity() { + buffer.set_null_bits(i, 0); + assert_eq!(buffer[..8], [255; 8][..]); + } + + buffer.set_null_bits(1, 4); + assert_eq!(buffer[..8], [255, 0, 0, 0, 0, 255, 255, 255][..]); + } + + #[test] + #[should_panic = "out of bounds for buffer of length"] + fn test_mutable_set_null_bits_oob() { + let mut buffer = MutableBuffer::new(64); + buffer.set_null_bits(1, buffer.capacity()); + } + + #[test] + #[should_panic = "out of bounds for buffer of length"] + fn test_mutable_set_null_bits_oob_by_overflow() { + let mut buffer = MutableBuffer::new(0); + buffer.set_null_bits(1, usize::MAX); + } } From a016654cfb90c6d7fa5c595cfeffd8fcbeca7f9d Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Wed, 2 Aug 2023 10:46:35 +0200 Subject: [PATCH 1114/1411] impl `Default` for `arrow_buffer::buffer::MutableBuffer` (#4619) * impl `Default` for `MutableBuffer` * Add a test --- arrow-buffer/src/buffer/mutable.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 0d2d2ed75146..8655bdb89c55 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -650,6 +650,12 @@ impl MutableBuffer { } } +impl Default for MutableBuffer { + fn default() -> Self { + Self::with_capacity(0) + } +} + impl std::ops::Deref for MutableBuffer { type Target = [u8]; @@ -777,6 +783,19 @@ mod tests { assert!(buf.is_empty()); } + #[test] + fn test_mutable_default() { + let buf = MutableBuffer::default(); + assert_eq!(0, buf.capacity()); + assert_eq!(0, buf.len()); + assert!(buf.is_empty()); + + let mut buf = MutableBuffer::default(); + buf.extend_from_slice(b"hello"); + assert_eq!(5, buf.len()); + assert_eq!(b"hello", buf.as_slice()); + } + #[test] fn test_mutable_extend_from_slice() { let mut buf = MutableBuffer::new(100); From de5aa483a533ef523f738fe15f67b8763bffcace Mon Sep 17 00:00:00 2001 From: fan <75058860+fansehep@users.noreply.github.com> Date: Wed, 2 Aug 2023 18:16:22 +0800 Subject: [PATCH 1115/1411] chore: 
add datatype new_list (#4561) * chore: add datatype new_list Signed-off-by: fansehep * fix cargo fmt and clippy check Signed-off-by: fansehep --------- Signed-off-by: fansehep --- arrow-schema/src/datatype.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index edd1dd09620e..4f8c8a18bd17 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -18,7 +18,7 @@ use std::fmt; use std::sync::Arc; -use crate::{FieldRef, Fields, UnionFields}; +use crate::{Field, FieldRef, Fields, UnionFields}; /// The set of datatypes that are supported by this implementation of Apache Arrow. /// @@ -576,6 +576,11 @@ impl DataType { _ => self == other, } } + + /// Create a List DataType default name is "item" + pub fn new_list(data_type: DataType, nullable: bool) -> Self { + DataType::List(Arc::new(Field::new("item", data_type, nullable))) + } } /// The maximum precision for [DataType::Decimal128] values From 74889255f143fc091d8ea4baebb122125e7b64f7 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Wed, 2 Aug 2023 13:39:45 +0200 Subject: [PATCH 1116/1411] impl `FromIterator` for `MutableBuffer` (#4624) --- arrow-buffer/src/buffer/mutable.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 8655bdb89c55..0177582b0b97 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -771,6 +771,14 @@ impl std::iter::FromIterator for MutableBuffer { } } +impl std::iter::FromIterator for MutableBuffer { + fn from_iter>(iter: I) -> Self { + let mut buffer = Self::default(); + buffer.extend_from_iter(iter.into_iter()); + buffer + } +} + #[cfg(test)] mod tests { use super::*; @@ -985,4 +993,11 @@ mod tests { let mut buffer = MutableBuffer::new(0); buffer.set_null_bits(1, usize::MAX); } + + #[test] + fn from_iter() { + let buffer = [1u16, 2, 3, 4].into_iter().collect::(); + assert_eq!(buffer.len(), 4 * mem::size_of::()); + assert_eq!(buffer.as_slice(), &[1, 0, 2, 0, 3, 0, 4, 0]); + } } From 399a3d1bfd0d3177b1112e6be84dc46fa75c81a4 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Wed, 2 Aug 2023 15:51:39 +0200 Subject: [PATCH 1117/1411] Move `BufferBuilder` to `arrow-buffer` (#4630) --- arrow-array/src/builder/buffer_builder.rs | 345 +--------------------- arrow-buffer/src/builder/mod.rs | 342 +++++++++++++++++++++ 2 files changed, 344 insertions(+), 343 deletions(-) diff --git a/arrow-array/src/builder/buffer_builder.rs b/arrow-array/src/builder/buffer_builder.rs index f88a6392083e..01e4c1d4e217 100644 --- a/arrow-array/src/builder/buffer_builder.rs +++ b/arrow-array/src/builder/buffer_builder.rs @@ -16,9 +16,8 @@ // under the License. use crate::array::ArrowPrimitiveType; -use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; +pub use arrow_buffer::BufferBuilder; use half::f16; -use std::marker::PhantomData; use crate::types::*; @@ -73,7 +72,7 @@ pub type Date64BufferBuilder = BufferBuilder<: /// Buffer builder for 32-bit elaspsed time since midnight of second unit. pub type Time32SecondBufferBuilder = BufferBuilder<::Native>; -/// Buffer builder for 32-bit elaspsed time since midnight of millisecond unit. +/// Buffer builder for 32-bit elaspsed time since midnight of millisecond unit. pub type Time32MillisecondBufferBuilder = BufferBuilder<::Native>; /// Buffer builder for 64-bit elaspsed time since midnight of microsecond unit. 
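// A small illustration of the saturating_add bounds check introduced in
// PATCH 1113 above (sketch only, not part of the recorded diff): the new
// assertion uses `start.saturating_add(count)` because a plain `start + count`
// can wrap around when overflow checks are disabled, letting an out-of-bounds
// range slip past the length comparison. The arithmetic, in isolation:
fn saturating_bounds_check_demo() {
    let (start, count, buffer_len) = (1usize, usize::MAX, 64usize);
    // Wrapping addition yields 0, so `start + count <= buffer_len` would
    // erroneously hold once the addition wraps.
    assert_eq!(start.wrapping_add(count), 0);
    // Saturating addition clamps at usize::MAX, so the new assert fires instead.
    assert!(start.saturating_add(count) > buffer_len);
}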
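// A usage sketch for the `DataType::new_list` helper added in PATCH 1115 above
// (illustrative only, not part of the recorded diff): the helper wraps the
// element type in a `Field` with the default name "item".
fn new_list_demo() {
    use std::sync::Arc;
    use arrow_schema::{DataType, Field};

    let via_helper = DataType::new_list(DataType::Int32, true);
    let explicit = DataType::List(Arc::new(Field::new("item", DataType::Int32, true)));
    assert_eq!(via_helper, explicit);
}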
@@ -106,346 +105,6 @@ pub type DurationMicrosecondBufferBuilder = pub type DurationNanosecondBufferBuilder = BufferBuilder<::Native>; -/// Builder for creating a [`Buffer`](arrow_buffer::Buffer) object. -/// -/// A [`Buffer`](arrow_buffer::Buffer) is the underlying data -/// structure of Arrow's [`Arrays`](crate::Array). -/// -/// For all supported types, there are type definitions for the -/// generic version of `BufferBuilder`, e.g. `UInt8BufferBuilder`. -/// -/// # Example: -/// -/// ``` -/// # use arrow_array::builder::UInt8BufferBuilder; -/// -/// let mut builder = UInt8BufferBuilder::new(100); -/// builder.append_slice(&[42, 43, 44]); -/// builder.append(45); -/// let buffer = builder.finish(); -/// -/// assert_eq!(unsafe { buffer.typed_data::() }, &[42, 43, 44, 45]); -/// ``` -#[derive(Debug)] -pub struct BufferBuilder { - buffer: MutableBuffer, - len: usize, - _marker: PhantomData, -} - -impl BufferBuilder { - /// Creates a new builder with initial capacity for _at least_ `capacity` - /// elements of type `T`. - /// - /// The capacity can later be manually adjusted with the - /// [`reserve()`](BufferBuilder::reserve) method. - /// Also the - /// [`append()`](BufferBuilder::append), - /// [`append_slice()`](BufferBuilder::append_slice) and - /// [`advance()`](BufferBuilder::advance) - /// methods automatically increase the capacity if needed. - /// - /// # Example: - /// - /// ``` - /// # use arrow_array::builder::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// - /// assert!(builder.capacity() >= 10); - /// ``` - #[inline] - pub fn new(capacity: usize) -> Self { - let buffer = MutableBuffer::new(capacity * std::mem::size_of::()); - - Self { - buffer, - len: 0, - _marker: PhantomData, - } - } - - /// Creates a new builder from a [`MutableBuffer`] - pub fn new_from_buffer(buffer: MutableBuffer) -> Self { - let buffer_len = buffer.len(); - Self { - buffer, - len: buffer_len / std::mem::size_of::(), - _marker: PhantomData, - } - } - - /// Returns the current number of array elements in the internal buffer. - /// - /// # Example: - /// - /// ``` - /// # use arrow_array::builder::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append(42); - /// - /// assert_eq!(builder.len(), 1); - /// ``` - pub fn len(&self) -> usize { - self.len - } - - /// Returns whether the internal buffer is empty. - /// - /// # Example: - /// - /// ``` - /// # use arrow_array::builder::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append(42); - /// - /// assert_eq!(builder.is_empty(), false); - /// ``` - pub fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Returns the actual capacity (number of elements) of the internal buffer. - /// - /// Note: the internal capacity returned by this method might be larger than - /// what you'd expect after setting the capacity in the `new()` or `reserve()` - /// functions. - pub fn capacity(&self) -> usize { - let byte_capacity = self.buffer.capacity(); - byte_capacity / std::mem::size_of::() - } - - /// Increases the number of elements in the internal buffer by `n` - /// and resizes the buffer as needed. - /// - /// The values of the newly added elements are 0. - /// This method is usually used when appending `NULL` values to the buffer - /// as they still require physical memory space. 
- /// - /// # Example: - /// - /// ``` - /// # use arrow_array::builder::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.advance(2); - /// - /// assert_eq!(builder.len(), 2); - /// ``` - #[inline] - pub fn advance(&mut self, i: usize) { - self.buffer.extend_zeros(i * std::mem::size_of::()); - self.len += i; - } - - /// Reserves memory for _at least_ `n` more elements of type `T`. - /// - /// # Example: - /// - /// ``` - /// # use arrow_array::builder::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.reserve(10); - /// - /// assert!(builder.capacity() >= 20); - /// ``` - #[inline] - pub fn reserve(&mut self, n: usize) { - self.buffer.reserve(n * std::mem::size_of::()); - } - - /// Appends a value of type `T` into the builder, - /// growing the internal buffer as needed. - /// - /// # Example: - /// - /// ``` - /// # use arrow_array::builder::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append(42); - /// - /// assert_eq!(builder.len(), 1); - /// ``` - #[inline] - pub fn append(&mut self, v: T) { - self.reserve(1); - self.buffer.push(v); - self.len += 1; - } - - /// Appends a value of type `T` into the builder N times, - /// growing the internal buffer as needed. - /// - /// # Example: - /// - /// ``` - /// # use arrow_array::builder::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append_n(10, 42); - /// - /// assert_eq!(builder.len(), 10); - /// ``` - #[inline] - pub fn append_n(&mut self, n: usize, v: T) { - self.reserve(n); - for _ in 0..n { - self.buffer.push(v); - } - self.len += n; - } - - /// Appends `n`, zero-initialized values - /// - /// # Example: - /// - /// ``` - /// # use arrow_array::builder::UInt32BufferBuilder; - /// - /// let mut builder = UInt32BufferBuilder::new(10); - /// builder.append_n_zeroed(3); - /// - /// assert_eq!(builder.len(), 3); - /// assert_eq!(builder.as_slice(), &[0, 0, 0]) - #[inline] - pub fn append_n_zeroed(&mut self, n: usize) { - self.buffer.extend_zeros(n * std::mem::size_of::()); - self.len += n; - } - - /// Appends a slice of type `T`, growing the internal buffer as needed. 
- /// - /// # Example: - /// - /// ``` - /// # use arrow_array::builder::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append_slice(&[42, 44, 46]); - /// - /// assert_eq!(builder.len(), 3); - /// ``` - #[inline] - pub fn append_slice(&mut self, slice: &[T]) { - self.buffer.extend_from_slice(slice); - self.len += slice.len(); - } - - /// View the contents of this buffer as a slice - /// - /// ``` - /// # use arrow_array::builder::Float64BufferBuilder; - /// - /// let mut builder = Float64BufferBuilder::new(10); - /// builder.append(1.3); - /// builder.append_n(2, 2.3); - /// - /// assert_eq!(builder.as_slice(), &[1.3, 2.3, 2.3]); - /// ``` - #[inline] - pub fn as_slice(&self) -> &[T] { - // SAFETY - // - // - MutableBuffer is aligned and initialized for len elements of T - // - MutableBuffer corresponds to a single allocation - // - MutableBuffer does not support modification whilst active immutable borrows - unsafe { std::slice::from_raw_parts(self.buffer.as_ptr() as _, self.len) } - } - - /// View the contents of this buffer as a mutable slice - /// - /// # Example: - /// - /// ``` - /// # use arrow_array::builder::Float32BufferBuilder; - /// - /// let mut builder = Float32BufferBuilder::new(10); - /// - /// builder.append_slice(&[1., 2., 3.4]); - /// assert_eq!(builder.as_slice(), &[1., 2., 3.4]); - /// - /// builder.as_slice_mut()[1] = 4.2; - /// assert_eq!(builder.as_slice(), &[1., 4.2, 3.4]); - /// ``` - #[inline] - pub fn as_slice_mut(&mut self) -> &mut [T] { - // SAFETY - // - // - MutableBuffer is aligned and initialized for len elements of T - // - MutableBuffer corresponds to a single allocation - // - MutableBuffer does not support modification whilst active immutable borrows - unsafe { std::slice::from_raw_parts_mut(self.buffer.as_mut_ptr() as _, self.len) } - } - - /// Shorten this BufferBuilder to `len` items - /// - /// If `len` is greater than the builder's current length, this has no effect - /// - /// # Example: - /// - /// ``` - /// # use arrow_array::builder::UInt16BufferBuilder; - /// - /// let mut builder = UInt16BufferBuilder::new(10); - /// - /// builder.append_slice(&[42, 44, 46]); - /// assert_eq!(builder.as_slice(), &[42, 44, 46]); - /// - /// builder.truncate(2); - /// assert_eq!(builder.as_slice(), &[42, 44]); - /// - /// builder.append(12); - /// assert_eq!(builder.as_slice(), &[42, 44, 12]); - /// ``` - #[inline] - pub fn truncate(&mut self, len: usize) { - self.buffer.truncate(len * std::mem::size_of::()); - self.len = len; - } - - /// # Safety - /// This requires the iterator be a trusted length. This could instead require - /// the iterator implement `TrustedLen` once that is stabilized. - #[inline] - pub unsafe fn append_trusted_len_iter(&mut self, iter: impl IntoIterator) { - let iter = iter.into_iter(); - let len = iter - .size_hint() - .1 - .expect("append_trusted_len_iter expects upper bound"); - self.reserve(len); - for v in iter { - self.buffer.push(v) - } - self.len += len; - } - - /// Resets this builder and returns an immutable [`Buffer`](arrow_buffer::Buffer). 
- /// - /// # Example: - /// - /// ``` - /// # use arrow_array::builder::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append_slice(&[42, 44, 46]); - /// - /// let buffer = builder.finish(); - /// - /// assert_eq!(unsafe { buffer.typed_data::() }, &[42, 44, 46]); - /// ``` - #[inline] - pub fn finish(&mut self) -> Buffer { - let buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); - self.len = 0; - buf.into() - } -} - #[cfg(test)] mod tests { use crate::builder::{ diff --git a/arrow-buffer/src/builder/mod.rs b/arrow-buffer/src/builder/mod.rs index f9d2d0935300..1d45bf40d2a2 100644 --- a/arrow-buffer/src/builder/mod.rs +++ b/arrow-buffer/src/builder/mod.rs @@ -21,3 +21,345 @@ mod boolean; pub use boolean::*; mod null; pub use null::*; + +use crate::{ArrowNativeType, Buffer, MutableBuffer}; +use std::marker::PhantomData; + +/// Builder for creating a [Buffer] object. +/// +/// A [Buffer] is the underlying data structure of Arrow's Arrays. +/// +/// For all supported types, there are type definitions for the +/// generic version of `BufferBuilder`, e.g. `BufferBuilder`. +/// +/// # Example: +/// +/// ``` +/// # use arrow_buffer::builder::BufferBuilder; +/// +/// let mut builder = BufferBuilder::::new(100); +/// builder.append_slice(&[42, 43, 44]); +/// builder.append(45); +/// let buffer = builder.finish(); +/// +/// assert_eq!(unsafe { buffer.typed_data::() }, &[42, 43, 44, 45]); +/// ``` +#[derive(Debug)] +pub struct BufferBuilder { + buffer: MutableBuffer, + len: usize, + _marker: PhantomData, +} + +impl BufferBuilder { + /// Creates a new builder with initial capacity for _at least_ `capacity` + /// elements of type `T`. + /// + /// The capacity can later be manually adjusted with the + /// [`reserve()`](BufferBuilder::reserve) method. + /// Also the + /// [`append()`](BufferBuilder::append), + /// [`append_slice()`](BufferBuilder::append_slice) and + /// [`advance()`](BufferBuilder::advance) + /// methods automatically increase the capacity if needed. + /// + /// # Example: + /// + /// ``` + /// # use arrow_buffer::builder::BufferBuilder; + /// + /// let mut builder = BufferBuilder::::new(10); + /// + /// assert!(builder.capacity() >= 10); + /// ``` + #[inline] + pub fn new(capacity: usize) -> Self { + let buffer = MutableBuffer::new(capacity * std::mem::size_of::()); + + Self { + buffer, + len: 0, + _marker: PhantomData, + } + } + + /// Creates a new builder from a [`MutableBuffer`] + pub fn new_from_buffer(buffer: MutableBuffer) -> Self { + let buffer_len = buffer.len(); + Self { + buffer, + len: buffer_len / std::mem::size_of::(), + _marker: PhantomData, + } + } + + /// Returns the current number of array elements in the internal buffer. + /// + /// # Example: + /// + /// ``` + /// # use arrow_buffer::builder::BufferBuilder; + /// + /// let mut builder = BufferBuilder::::new(10); + /// builder.append(42); + /// + /// assert_eq!(builder.len(), 1); + /// ``` + pub fn len(&self) -> usize { + self.len + } + + /// Returns whether the internal buffer is empty. + /// + /// # Example: + /// + /// ``` + /// # use arrow_buffer::builder::BufferBuilder; + /// + /// let mut builder = BufferBuilder::::new(10); + /// builder.append(42); + /// + /// assert_eq!(builder.is_empty(), false); + /// ``` + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the actual capacity (number of elements) of the internal buffer. 
+ /// + /// Note: the internal capacity returned by this method might be larger than + /// what you'd expect after setting the capacity in the `new()` or `reserve()` + /// functions. + pub fn capacity(&self) -> usize { + let byte_capacity = self.buffer.capacity(); + byte_capacity / std::mem::size_of::() + } + + /// Increases the number of elements in the internal buffer by `n` + /// and resizes the buffer as needed. + /// + /// The values of the newly added elements are 0. + /// This method is usually used when appending `NULL` values to the buffer + /// as they still require physical memory space. + /// + /// # Example: + /// + /// ``` + /// # use arrow_buffer::builder::BufferBuilder; + /// + /// let mut builder = BufferBuilder::::new(10); + /// builder.advance(2); + /// + /// assert_eq!(builder.len(), 2); + /// ``` + #[inline] + pub fn advance(&mut self, i: usize) { + self.buffer.extend_zeros(i * std::mem::size_of::()); + self.len += i; + } + + /// Reserves memory for _at least_ `n` more elements of type `T`. + /// + /// # Example: + /// + /// ``` + /// # use arrow_buffer::builder::BufferBuilder; + /// + /// let mut builder = BufferBuilder::::new(10); + /// builder.reserve(10); + /// + /// assert!(builder.capacity() >= 20); + /// ``` + #[inline] + pub fn reserve(&mut self, n: usize) { + self.buffer.reserve(n * std::mem::size_of::()); + } + + /// Appends a value of type `T` into the builder, + /// growing the internal buffer as needed. + /// + /// # Example: + /// + /// ``` + /// # use arrow_buffer::builder::BufferBuilder; + /// + /// let mut builder = BufferBuilder::::new(10); + /// builder.append(42); + /// + /// assert_eq!(builder.len(), 1); + /// ``` + #[inline] + pub fn append(&mut self, v: T) { + self.reserve(1); + self.buffer.push(v); + self.len += 1; + } + + /// Appends a value of type `T` into the builder N times, + /// growing the internal buffer as needed. + /// + /// # Example: + /// + /// ``` + /// # use arrow_buffer::builder::BufferBuilder; + /// + /// let mut builder = BufferBuilder::::new(10); + /// builder.append_n(10, 42); + /// + /// assert_eq!(builder.len(), 10); + /// ``` + #[inline] + pub fn append_n(&mut self, n: usize, v: T) { + self.reserve(n); + for _ in 0..n { + self.buffer.push(v); + } + self.len += n; + } + + /// Appends `n`, zero-initialized values + /// + /// # Example: + /// + /// ``` + /// # use arrow_buffer::builder::BufferBuilder; + /// + /// let mut builder = BufferBuilder::::new(10); + /// builder.append_n_zeroed(3); + /// + /// assert_eq!(builder.len(), 3); + /// assert_eq!(builder.as_slice(), &[0, 0, 0]) + #[inline] + pub fn append_n_zeroed(&mut self, n: usize) { + self.buffer.extend_zeros(n * std::mem::size_of::()); + self.len += n; + } + + /// Appends a slice of type `T`, growing the internal buffer as needed. 
+ /// + /// # Example: + /// + /// ``` + /// # use arrow_buffer::builder::BufferBuilder; + /// + /// let mut builder = BufferBuilder::::new(10); + /// builder.append_slice(&[42, 44, 46]); + /// + /// assert_eq!(builder.len(), 3); + /// ``` + #[inline] + pub fn append_slice(&mut self, slice: &[T]) { + self.buffer.extend_from_slice(slice); + self.len += slice.len(); + } + + /// View the contents of this buffer as a slice + /// + /// ``` + /// # use arrow_buffer::builder::BufferBuilder; + /// + /// let mut builder = BufferBuilder::::new(10); + /// builder.append(1.3); + /// builder.append_n(2, 2.3); + /// + /// assert_eq!(builder.as_slice(), &[1.3, 2.3, 2.3]); + /// ``` + #[inline] + pub fn as_slice(&self) -> &[T] { + // SAFETY + // + // - MutableBuffer is aligned and initialized for len elements of T + // - MutableBuffer corresponds to a single allocation + // - MutableBuffer does not support modification whilst active immutable borrows + unsafe { std::slice::from_raw_parts(self.buffer.as_ptr() as _, self.len) } + } + + /// View the contents of this buffer as a mutable slice + /// + /// # Example: + /// + /// ``` + /// # use arrow_buffer::builder::BufferBuilder; + /// + /// let mut builder = BufferBuilder::::new(10); + /// + /// builder.append_slice(&[1., 2., 3.4]); + /// assert_eq!(builder.as_slice(), &[1., 2., 3.4]); + /// + /// builder.as_slice_mut()[1] = 4.2; + /// assert_eq!(builder.as_slice(), &[1., 4.2, 3.4]); + /// ``` + #[inline] + pub fn as_slice_mut(&mut self) -> &mut [T] { + // SAFETY + // + // - MutableBuffer is aligned and initialized for len elements of T + // - MutableBuffer corresponds to a single allocation + // - MutableBuffer does not support modification whilst active immutable borrows + unsafe { std::slice::from_raw_parts_mut(self.buffer.as_mut_ptr() as _, self.len) } + } + + /// Shorten this BufferBuilder to `len` items + /// + /// If `len` is greater than the builder's current length, this has no effect + /// + /// # Example: + /// + /// ``` + /// # use arrow_buffer::builder::BufferBuilder; + /// + /// let mut builder = BufferBuilder::::new(10); + /// + /// builder.append_slice(&[42, 44, 46]); + /// assert_eq!(builder.as_slice(), &[42, 44, 46]); + /// + /// builder.truncate(2); + /// assert_eq!(builder.as_slice(), &[42, 44]); + /// + /// builder.append(12); + /// assert_eq!(builder.as_slice(), &[42, 44, 12]); + /// ``` + #[inline] + pub fn truncate(&mut self, len: usize) { + self.buffer.truncate(len * std::mem::size_of::()); + self.len = len; + } + + /// # Safety + /// This requires the iterator be a trusted length. This could instead require + /// the iterator implement `TrustedLen` once that is stabilized. + #[inline] + pub unsafe fn append_trusted_len_iter(&mut self, iter: impl IntoIterator) { + let iter = iter.into_iter(); + let len = iter + .size_hint() + .1 + .expect("append_trusted_len_iter expects upper bound"); + self.reserve(len); + for v in iter { + self.buffer.push(v) + } + self.len += len; + } + + /// Resets this builder and returns an immutable [Buffer]. 
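// A small sketch of what PATCH 1117 above preserves for downstream users
// (illustrative only, not part of the recorded diff): `BufferBuilder` now lives
// in arrow-buffer and is re-exported from arrow-array, so both import paths
// name the same type and existing code keeps compiling.
fn buffer_builder_paths_demo() {
    let from_buffer: arrow_buffer::BufferBuilder<u32> = arrow_buffer::BufferBuilder::new(4);
    let mut from_array: arrow_array::builder::BufferBuilder<u32> = from_buffer;
    from_array.append_slice(&[1, 2, 3, 4]);
    assert_eq!(from_array.as_slice(), &[1, 2, 3, 4]);
}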
+ /// + /// # Example: + /// + /// ``` + /// # use arrow_buffer::builder::BufferBuilder; + /// + /// let mut builder = BufferBuilder::::new(10); + /// builder.append_slice(&[42, 44, 46]); + /// + /// let buffer = builder.finish(); + /// + /// assert_eq!(unsafe { buffer.typed_data::() }, &[42, 44, 46]); + /// ``` + #[inline] + pub fn finish(&mut self) -> Buffer { + let buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); + self.len = 0; + buf.into() + } +} From 70094a5eb3762664c5e58272974397b904dc5f9f Mon Sep 17 00:00:00 2001 From: Miklos Szots Date: Wed, 2 Aug 2023 15:55:25 +0200 Subject: [PATCH 1118/1411] expand docs for FixedSizeListArray (#4622) * expand docs for FixedSizeListArray * mark nulls as ???? --- .../src/array/fixed_size_list_array.rs | 58 ++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 6c1598ce90df..e6ae1acf0aad 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -26,7 +26,60 @@ use arrow_schema::{ArrowError, DataType, FieldRef}; use std::any::Any; use std::sync::Arc; -/// An array of [fixed size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-list-layout) +/// An array of [fixed length lists], similar to JSON arrays +/// (e.g. `["A", "B"]`). +/// +/// Lists are represented using a `values` child +/// array where each list has a fixed size of `value_length`. +/// +/// Use [`FixedSizeListBuilder`](crate::builder::FixedSizeListBuilder) to +/// construct a [`FixedSizeListArray`]. +/// +/// # Representation +/// +/// A [`FixedSizeListArray`] can represent a list of values of any other +/// supported Arrow type. Each element of the `FixedSizeListArray` itself is +/// a list which may contain NULL and non-null values, +/// or may itself be NULL. +/// +/// For example, this `FixedSizeListArray` stores lists of strings: +/// +/// ```text +/// ┌─────────────┐ +/// │ [A,B] │ +/// ├─────────────┤ +/// │ NULL │ +/// ├─────────────┤ +/// │ [C,NULL] │ +/// └─────────────┘ +/// ``` +/// +/// The `values` of this `FixedSizeListArray`s are stored in a child +/// [`StringArray`] where logical null values take up `values_length` slots in the array +/// as shown in the following diagram. The logical values +/// are shown on the left, and the actual `FixedSizeListArray` encoding on the right +/// +/// ```text +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─┐ +/// ┌─────────────┐ │ ┌───┐ ┌───┐ ┌──────┐ │ +/// │ [A,B] │ │ 1 │ │ │ 1 │ │ A │ │ 0 +/// ├─────────────┤ │ ├───┤ ├───┤ ├──────┤ │ +/// │ NULL │ │ 0 │ │ │ 1 │ │ B │ │ 1 +/// ├─────────────┤ │ ├───┤ ├───┤ ├──────┤ │ +/// │ [C,NULL] │ │ 1 │ │ │ 0 │ │ ???? │ │ 2 +/// └─────────────┘ │ └───┘ ├───┤ ├──────┤ │ +/// | │ 0 │ │ ???? │ │ 3 +/// Logical Values │ Validity ├───┤ ├──────┤ │ +/// (nulls) │ │ 1 │ │ C │ │ 4 +/// │ ├───┤ ├──────┤ │ +/// │ │ 0 │ │ ???? 
│ │ 5 +/// │ └───┘ └──────┘ │ +/// │ Values │ +/// │ FixedSizeListArray (Array) │ +/// └ ─ ─ ─ ─ ─ ─ ─ ─┘ +/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ +/// ``` /// /// # Example /// @@ -60,6 +113,9 @@ use std::sync::Arc; /// assert_eq!( &[3, 4, 5], list1.as_any().downcast_ref::().unwrap().values()); /// assert_eq!( &[6, 7, 8], list2.as_any().downcast_ref::().unwrap().values()); /// ``` +/// +/// [`StringArray`]: crate::array::StringArray +/// [fixed size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-list-layout) #[derive(Clone)] pub struct FixedSizeListArray { data_type: DataType, // Must be DataType::FixedSizeList(value_length) From 8a9dd842fccafb29fadd07fa1c2bf988a6049a52 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 2 Aug 2023 15:58:12 +0100 Subject: [PATCH 1119/1411] Use Config System for Object Store Integration Tests (#4628) --- .github/workflows/object_store.yml | 21 ++-- object_store/Cargo.toml | 1 - object_store/src/aws/mod.rs | 171 +++-------------------------- object_store/src/azure/mod.rs | 101 +---------------- object_store/src/gcp/mod.rs | 78 +++---------- object_store/src/http/mod.rs | 7 +- object_store/src/lib.rs | 10 ++ 7 files changed, 57 insertions(+), 332 deletions(-) diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 5ae9d2d9c83f..7858da1e2d2d 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -82,12 +82,19 @@ jobs: # Run integration tests TEST_INTEGRATION: 1 EC2_METADATA_ENDPOINT: http://localhost:1338 - AZURE_USE_EMULATOR: "1" + AZURE_CONTAINER_NAME: test-bucket + AZURE_STORAGE_USE_EMULATOR: "1" AZURITE_BLOB_STORAGE_URL: "http://localhost:10000" AZURITE_QUEUE_STORAGE_URL: "http://localhost:10001" + AWS_BUCKET: test-bucket + AWS_DEFAULT_REGION: "us-east-1" + AWS_ACCESS_KEY_ID: test + AWS_SECRET_ACCESS_KEY: test + AWS_ENDPOINT: http://localhost:4566 + AWS_ALLOW_HTTP: true HTTP_URL: "http://localhost:8080" + GOOGLE_BUCKET: test-bucket GOOGLE_SERVICE_ACCOUNT: "/tmp/gcs.json" - OBJECT_STORE_BUCKET: test-bucket steps: - uses: actions/checkout@v3 @@ -105,11 +112,6 @@ jobs: run: docker run -d -p 8080:80 rclone/rclone serve webdav /data --addr :80 - name: Setup LocalStack (AWS emulation) - env: - AWS_DEFAULT_REGION: "us-east-1" - AWS_ACCESS_KEY_ID: test - AWS_SECRET_ACCESS_KEY: test - AWS_ENDPOINT: http://localhost:4566 run: | docker run -d -p 4566:4566 localstack/localstack:2.0 docker run -d -p 1338:1338 amazon/amazon-ec2-metadata-mock:v1.9.2 --imdsv2 @@ -128,11 +130,6 @@ jobs: rustup default stable - name: Run object_store tests - env: - OBJECT_STORE_AWS_DEFAULT_REGION: "us-east-1" - OBJECT_STORE_AWS_ACCESS_KEY_ID: test - OBJECT_STORE_AWS_SECRET_ACCESS_KEY: test - OBJECT_STORE_AWS_ENDPOINT: http://localhost:4566 run: cargo test --features=aws,azure,gcp,http # test the object_store crate builds against wasm32 in stable rust diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 255b972e32d8..eca5a5ce84ed 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -71,7 +71,6 @@ aws = ["cloud"] http = ["cloud"] [dev-dependencies] # In alphabetical order -dotenv = "0.15.0" tempfile = "3.1.0" futures-test = "0.3" rand = "0.8" diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 5a29bd0fc6c7..f6066d45a72c 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -1067,153 +1067,9 @@ mod tests { }; use bytes::Bytes; use 
std::collections::HashMap; - use std::env; const NON_EXISTENT_NAME: &str = "nonexistentname"; - // Helper macro to skip tests if TEST_INTEGRATION and the AWS - // environment variables are not set. Returns a configured - // AmazonS3Builder - macro_rules! maybe_skip_integration { - () => {{ - dotenv::dotenv().ok(); - - let required_vars = [ - "OBJECT_STORE_AWS_DEFAULT_REGION", - "OBJECT_STORE_BUCKET", - "OBJECT_STORE_AWS_ACCESS_KEY_ID", - "OBJECT_STORE_AWS_SECRET_ACCESS_KEY", - ]; - let unset_vars: Vec<_> = required_vars - .iter() - .filter_map(|&name| match env::var(name) { - Ok(_) => None, - Err(_) => Some(name), - }) - .collect(); - let unset_var_names = unset_vars.join(", "); - - let force = env::var("TEST_INTEGRATION"); - - if force.is_ok() && !unset_var_names.is_empty() { - panic!( - "TEST_INTEGRATION is set, \ - but variable(s) {} need to be set", - unset_var_names - ); - } else if force.is_err() { - eprintln!( - "skipping AWS integration test - set {}TEST_INTEGRATION to run", - if unset_var_names.is_empty() { - String::new() - } else { - format!("{} and ", unset_var_names) - } - ); - return; - } else { - let config = AmazonS3Builder::new() - .with_access_key_id( - env::var("OBJECT_STORE_AWS_ACCESS_KEY_ID") - .expect("already checked OBJECT_STORE_AWS_ACCESS_KEY_ID"), - ) - .with_secret_access_key( - env::var("OBJECT_STORE_AWS_SECRET_ACCESS_KEY") - .expect("already checked OBJECT_STORE_AWS_SECRET_ACCESS_KEY"), - ) - .with_region( - env::var("OBJECT_STORE_AWS_DEFAULT_REGION") - .expect("already checked OBJECT_STORE_AWS_DEFAULT_REGION"), - ) - .with_bucket_name( - env::var("OBJECT_STORE_BUCKET") - .expect("already checked OBJECT_STORE_BUCKET"), - ) - .with_allow_http(true); - - let config = if let Ok(endpoint) = env::var("OBJECT_STORE_AWS_ENDPOINT") { - config.with_endpoint(endpoint) - } else { - config - }; - - let config = if let Ok(token) = env::var("OBJECT_STORE_AWS_SESSION_TOKEN") - { - config.with_token(token) - } else { - config - }; - - let config = if let Ok(virtual_hosted_style_request) = - env::var("OBJECT_STORE_VIRTUAL_HOSTED_STYLE_REQUEST") - { - config.with_virtual_hosted_style_request( - virtual_hosted_style_request.trim().parse().unwrap(), - ) - } else { - config - }; - - config - } - }}; - } - - #[test] - fn s3_test_config_from_env() { - let aws_access_key_id = env::var("AWS_ACCESS_KEY_ID") - .unwrap_or_else(|_| "object_store:fake_access_key_id".into()); - let aws_secret_access_key = env::var("AWS_SECRET_ACCESS_KEY") - .unwrap_or_else(|_| "object_store:fake_secret_key".into()); - - let aws_default_region = env::var("AWS_DEFAULT_REGION") - .unwrap_or_else(|_| "object_store:fake_default_region".into()); - - let aws_endpoint = env::var("AWS_ENDPOINT") - .unwrap_or_else(|_| "object_store:fake_endpoint".into()); - let aws_session_token = env::var("AWS_SESSION_TOKEN") - .unwrap_or_else(|_| "object_store:fake_session_token".into()); - - let container_creds_relative_uri = - env::var("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") - .unwrap_or_else(|_| "/object_store/fake_credentials_uri".into()); - - // required - env::set_var("AWS_ACCESS_KEY_ID", &aws_access_key_id); - env::set_var("AWS_SECRET_ACCESS_KEY", &aws_secret_access_key); - env::set_var("AWS_DEFAULT_REGION", &aws_default_region); - - // optional - env::set_var("AWS_ENDPOINT", &aws_endpoint); - env::set_var("AWS_SESSION_TOKEN", &aws_session_token); - env::set_var( - "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI", - &container_creds_relative_uri, - ); - env::set_var("AWS_UNSIGNED_PAYLOAD", "true"); - 
env::set_var("AWS_CHECKSUM_ALGORITHM", "sha256"); - - let builder = AmazonS3Builder::from_env(); - assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); - assert_eq!( - builder.secret_access_key.unwrap(), - aws_secret_access_key.as_str() - ); - assert_eq!(builder.region.unwrap(), aws_default_region); - - assert_eq!(builder.endpoint.unwrap(), aws_endpoint); - assert_eq!(builder.token.unwrap(), aws_session_token); - assert_eq!( - builder.container_credentials_relative_uri.unwrap(), - container_creds_relative_uri - ); - assert_eq!( - builder.checksum_algorithm.unwrap().get().unwrap(), - Checksum::SHA256 - ); - assert!(builder.unsigned_payload.get().unwrap()); - } - #[test] fn s3_test_config_from_map() { let aws_access_key_id = "object_store:fake_access_key_id".to_string(); @@ -1304,7 +1160,9 @@ mod tests { #[tokio::test] async fn s3_test() { - let config = maybe_skip_integration!(); + crate::test_util::maybe_skip_integration!(); + let config = AmazonS3Builder::from_env(); + let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); let integration = config.build().unwrap(); @@ -1317,13 +1175,14 @@ mod tests { stream_get(&integration).await; // run integration test with unsigned payload enabled - let config = maybe_skip_integration!().with_unsigned_payload(true); + let config = AmazonS3Builder::from_env().with_unsigned_payload(true); let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); let integration = config.build().unwrap(); put_get_delete_list_opts(&integration, is_local).await; // run integration test with checksum set to sha256 - let config = maybe_skip_integration!().with_checksum_algorithm(Checksum::SHA256); + let config = + AmazonS3Builder::from_env().with_checksum_algorithm(Checksum::SHA256); let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); let integration = config.build().unwrap(); put_get_delete_list_opts(&integration, is_local).await; @@ -1331,8 +1190,8 @@ mod tests { #[tokio::test] async fn s3_test_get_nonexistent_location() { - let config = maybe_skip_integration!(); - let integration = config.build().unwrap(); + crate::test_util::maybe_skip_integration!(); + let integration = AmazonS3Builder::from_env().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1344,7 +1203,8 @@ mod tests { #[tokio::test] async fn s3_test_get_nonexistent_bucket() { - let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); + crate::test_util::maybe_skip_integration!(); + let config = AmazonS3Builder::from_env().with_bucket_name(NON_EXISTENT_NAME); let integration = config.build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1355,8 +1215,8 @@ mod tests { #[tokio::test] async fn s3_test_put_nonexistent_bucket() { - let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); - + crate::test_util::maybe_skip_integration!(); + let config = AmazonS3Builder::from_env().with_bucket_name(NON_EXISTENT_NAME); let integration = config.build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1368,8 +1228,8 @@ mod tests { #[tokio::test] async fn s3_test_delete_nonexistent_location() { - let config = maybe_skip_integration!(); - let integration = config.build().unwrap(); + crate::test_util::maybe_skip_integration!(); + let integration = AmazonS3Builder::from_env().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1378,7 +1238,8 @@ mod tests { #[tokio::test] async fn 
s3_test_delete_nonexistent_bucket() { - let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME); + crate::test_util::maybe_skip_integration!(); + let config = AmazonS3Builder::from_env().with_bucket_name(NON_EXISTENT_NAME); let integration = config.build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 8619319a5b25..019cde581354 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -1021,107 +1021,18 @@ mod tests { use super::*; use crate::tests::{ copy_if_not_exists, get_opts, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list, put_get_delete_list_opts, - rename_and_copy, stream_get, + list_with_delimiter, put_get_delete_list_opts, rename_and_copy, stream_get, }; use std::collections::HashMap; - use std::env; - - // Helper macro to skip tests if TEST_INTEGRATION and the Azure environment - // variables are not set. - macro_rules! maybe_skip_integration { - () => {{ - dotenv::dotenv().ok(); - - let use_emulator = std::env::var("AZURE_USE_EMULATOR").is_ok(); - - let mut required_vars = vec!["OBJECT_STORE_BUCKET"]; - if !use_emulator { - required_vars.push("AZURE_STORAGE_ACCOUNT"); - required_vars.push("AZURE_STORAGE_ACCESS_KEY"); - } - let unset_vars: Vec<_> = required_vars - .iter() - .filter_map(|&name| match env::var(name) { - Ok(_) => None, - Err(_) => Some(name), - }) - .collect(); - let unset_var_names = unset_vars.join(", "); - - let force = std::env::var("TEST_INTEGRATION"); - - if force.is_ok() && !unset_var_names.is_empty() { - panic!( - "TEST_INTEGRATION is set, \ - but variable(s) {} need to be set", - unset_var_names - ) - } else if force.is_err() { - eprintln!( - "skipping Azure integration test - set {}TEST_INTEGRATION to run", - if unset_var_names.is_empty() { - String::new() - } else { - format!("{} and ", unset_var_names) - } - ); - return; - } else { - let builder = MicrosoftAzureBuilder::new() - .with_container_name( - env::var("OBJECT_STORE_BUCKET") - .expect("already checked OBJECT_STORE_BUCKET"), - ) - .with_use_emulator(use_emulator); - if !use_emulator { - builder - .with_account( - env::var("AZURE_STORAGE_ACCOUNT").unwrap_or_default(), - ) - .with_access_key( - env::var("AZURE_STORAGE_ACCESS_KEY").unwrap_or_default(), - ) - } else { - builder - } - } - }}; - } #[tokio::test] async fn azure_blob_test() { - let integration = maybe_skip_integration!().build().unwrap(); - put_get_delete_list_opts(&integration, false).await; - get_opts(&integration).await; - list_uses_directories_correctly(&integration).await; - list_with_delimiter(&integration).await; - rename_and_copy(&integration).await; - copy_if_not_exists(&integration).await; - stream_get(&integration).await; - } + crate::test_util::maybe_skip_integration!(); + let container_name = std::env::var("AZURE_CONTAINER_NAME").unwrap(); // (#4629) + let config = MicrosoftAzureBuilder::from_env(); + let integration = config.with_container_name(container_name).build().unwrap(); - // test for running integration test against actual blob service with service principal - // credentials. 
To run make sure all environment variables are set and remove the ignore - #[tokio::test] - #[ignore] - async fn azure_blob_test_sp() { - dotenv::dotenv().ok(); - let builder = MicrosoftAzureBuilder::new() - .with_account( - env::var("AZURE_STORAGE_ACCOUNT") - .expect("must be set AZURE_STORAGE_ACCOUNT"), - ) - .with_container_name( - env::var("OBJECT_STORE_BUCKET").expect("must be set OBJECT_STORE_BUCKET"), - ) - .with_access_key( - env::var("AZURE_STORAGE_ACCESS_KEY") - .expect("must be set AZURE_STORAGE_CLIENT_ID"), - ); - let integration = builder.build().unwrap(); - - put_get_delete_list(&integration).await; + put_get_delete_list_opts(&integration, false).await; get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index d98e6b068d4f..58a5d19f3cb7 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -1090,7 +1090,6 @@ impl GoogleCloudStorageBuilder { mod test { use bytes::Bytes; use std::collections::HashMap; - use std::env; use std::io::Write; use tempfile::NamedTempFile; @@ -1101,56 +1100,10 @@ mod test { const FAKE_KEY: &str = r#"{"private_key": "private_key", "client_email":"client_email", "disable_oauth":true}"#; const NON_EXISTENT_NAME: &str = "nonexistentname"; - // Helper macro to skip tests if TEST_INTEGRATION and the GCP environment variables are not set. - macro_rules! maybe_skip_integration { - () => {{ - dotenv::dotenv().ok(); - - let required_vars = ["OBJECT_STORE_BUCKET", "GOOGLE_SERVICE_ACCOUNT"]; - let unset_vars: Vec<_> = required_vars - .iter() - .filter_map(|&name| match env::var(name) { - Ok(_) => None, - Err(_) => Some(name), - }) - .collect(); - let unset_var_names = unset_vars.join(", "); - - let force = std::env::var("TEST_INTEGRATION"); - - if force.is_ok() && !unset_var_names.is_empty() { - panic!( - "TEST_INTEGRATION is set, \ - but variable(s) {} need to be set", - unset_var_names - ) - } else if force.is_err() { - eprintln!( - "skipping Google Cloud integration test - set {}TEST_INTEGRATION to run", - if unset_var_names.is_empty() { - String::new() - } else { - format!("{} and ", unset_var_names) - } - ); - return; - } else { - GoogleCloudStorageBuilder::new() - .with_bucket_name( - env::var("OBJECT_STORE_BUCKET") - .expect("already checked OBJECT_STORE_BUCKET") - ) - .with_service_account_path( - env::var("GOOGLE_SERVICE_ACCOUNT") - .expect("already checked GOOGLE_SERVICE_ACCOUNT") - ) - } - }}; - } - #[tokio::test] async fn gcs_test() { - let integration = maybe_skip_integration!().build().unwrap(); + crate::test_util::maybe_skip_integration!(); + let integration = GoogleCloudStorageBuilder::from_env().build().unwrap(); put_get_delete_list(&integration).await; list_uses_directories_correctly(&integration).await; @@ -1170,7 +1123,8 @@ mod test { #[tokio::test] async fn gcs_test_get_nonexistent_location() { - let integration = maybe_skip_integration!().build().unwrap(); + crate::test_util::maybe_skip_integration!(); + let integration = GoogleCloudStorageBuilder::from_env().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1184,10 +1138,9 @@ mod test { #[tokio::test] async fn gcs_test_get_nonexistent_bucket() { - let integration = maybe_skip_integration!() - .with_bucket_name(NON_EXISTENT_NAME) - .build() - .unwrap(); + crate::test_util::maybe_skip_integration!(); + let config = GoogleCloudStorageBuilder::from_env(); + let integration = 
config.with_bucket_name(NON_EXISTENT_NAME).build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1203,7 +1156,8 @@ mod test { #[tokio::test] async fn gcs_test_delete_nonexistent_location() { - let integration = maybe_skip_integration!().build().unwrap(); + crate::test_util::maybe_skip_integration!(); + let integration = GoogleCloudStorageBuilder::from_env().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1216,10 +1170,9 @@ mod test { #[tokio::test] async fn gcs_test_delete_nonexistent_bucket() { - let integration = maybe_skip_integration!() - .with_bucket_name(NON_EXISTENT_NAME) - .build() - .unwrap(); + crate::test_util::maybe_skip_integration!(); + let config = GoogleCloudStorageBuilder::from_env(); + let integration = config.with_bucket_name(NON_EXISTENT_NAME).build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1232,10 +1185,9 @@ mod test { #[tokio::test] async fn gcs_test_put_nonexistent_bucket() { - let integration = maybe_skip_integration!() - .with_bucket_name(NON_EXISTENT_NAME) - .build() - .unwrap(); + crate::test_util::maybe_skip_integration!(); + let config = GoogleCloudStorageBuilder::from_env(); + let integration = config.with_bucket_name(NON_EXISTENT_NAME).build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); let data = Bytes::from("arbitrary data"); diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index bc01c174f339..6927f1b883be 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -262,12 +262,7 @@ mod tests { #[tokio::test] async fn http_test() { - dotenv::dotenv().ok(); - let force = std::env::var("TEST_INTEGRATION"); - if force.is_err() { - eprintln!("skipping HTTP integration test - set TEST_INTEGRATION to run"); - return; - } + crate::test_util::maybe_skip_integration!(); let url = std::env::var("HTTP_URL").expect("HTTP_URL must be set"); let options = ClientOptions::new().with_allow_http(true); let integration = HttpBuilder::new() diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 082dca293571..6c70326d2b7b 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -910,6 +910,16 @@ mod test_util { use super::*; use futures::TryStreamExt; + macro_rules! maybe_skip_integration { + () => { + if std::env::var("TEST_INTEGRATION").is_err() { + eprintln!("Skipping integration test - set TEST_INTEGRATION"); + return; + } + }; + } + pub(crate) use maybe_skip_integration; + pub async fn flatten_list_stream( storage: &DynObjectStore, prefix: Option<&Path>, From 30a5b8d83f56c10ab947ec98ec7c50611663fd0e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 2 Aug 2023 10:24:26 -0500 Subject: [PATCH 1120/1411] Minor: improve docs and add example for lexicographical_partition_ranges (#4615) * Minor: improve docs and add example for lexicographical_partition_ranges * improve wording --- arrow-ord/src/partition.rs | 78 ++++++++++++++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 7 deletions(-) diff --git a/arrow-ord/src/partition.rs b/arrow-ord/src/partition.rs index 26a030beb35e..4411a0f0ab31 100644 --- a/arrow-ord/src/partition.rs +++ b/arrow-ord/src/partition.rs @@ -22,15 +22,79 @@ use arrow_schema::ArrowError; use std::cmp::Ordering; use std::ops::Range; -/// Given a list of already sorted columns, find partition ranges that would partition -/// lexicographically equal values across columns. 
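// A condensed sketch of the test pattern introduced by PATCH 1119 above
// (illustrative only, not part of the recorded diff): integration tests are
// gated on a single TEST_INTEGRATION variable and the store is configured
// entirely from the environment (the AWS_*, AZURE_* and GOOGLE_* variables set
// in the workflow file).
fn s3_from_env_or_skip() -> Option<object_store::aws::AmazonS3> {
    // Opt-in gate, mirroring the maybe_skip_integration! macro added above.
    if std::env::var("TEST_INTEGRATION").is_err() {
        eprintln!("Skipping integration test - set TEST_INTEGRATION");
        return None;
    }
    // Bucket, region, credentials and endpoint are all read from AWS_* variables.
    Some(object_store::aws::AmazonS3Builder::from_env().build().unwrap())
}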
+/// Given a list of already sorted columns, returns [`Range`]es that +/// partition the input such that each partition has equal values +/// across sort columns. /// -/// Here LexicographicalComparator is used in conjunction with binary -/// search so the columns *MUST* be pre-sorted already. +/// Returns an error if no columns are specified or all columns do not +/// have the same number of rows. /// -/// The returned vec would be of size k where k is cardinality of the sorted values; Consecutive -/// values will be connected: (a, b) and (b, c), where start = 0 and end = n for the first and last -/// range. +/// Returns an iterator with `k` items where `k` is cardinality of the +/// sort values: Consecutive values will be connected: `(a, b)` and `(b, +/// c)`, where `start = 0` and `end = n` for the first and last range. +/// +/// # Example: +/// +/// For example, given columns `x`, `y` and `z`, calling +/// `lexicographical_partition_ranges(values, (x, y))` will divide the +/// rows into ranges where the values of `(x, y)` are equal: +/// +/// ```text +/// ┌ ─ ┬───┬ ─ ─┌───┐─ ─ ┬───┬ ─ ─ ┐ +/// │ 1 │ │ 1 │ │ A │ Range: 0..1 (x=1, y=1) +/// ├ ─ ┼───┼ ─ ─├───┤─ ─ ┼───┼ ─ ─ ┤ +/// │ 1 │ │ 2 │ │ B │ +/// │ ├───┤ ├───┤ ├───┤ │ +/// │ 1 │ │ 2 │ │ C │ Range: 1..4 (x=1, y=2) +/// │ ├───┤ ├───┤ ├───┤ │ +/// │ 1 │ │ 2 │ │ D │ +/// ├ ─ ┼───┼ ─ ─├───┤─ ─ ┼───┼ ─ ─ ┤ +/// │ 2 │ │ 1 │ │ E │ Range: 4..5 (x=2, y=1) +/// ├ ─ ┼───┼ ─ ─├───┤─ ─ ┼───┼ ─ ─ ┤ +/// │ 3 │ │ 1 │ │ F │ Range: 5..6 (x=3, y=1) +/// └ ─ ┴───┴ ─ ─└───┘─ ─ ┴───┴ ─ ─ ┘ +/// +/// x y z lexicographical_partition_ranges +/// by (x,y) +/// ``` +/// +/// # Example Code +/// +/// ``` +/// # use std::{sync::Arc, ops::Range}; +/// # use arrow_array::{RecordBatch, Int64Array, StringArray, ArrayRef}; +/// # use arrow_ord::sort::{SortColumn, SortOptions}; +/// # use arrow_ord::partition::lexicographical_partition_ranges; +/// let batch = RecordBatch::try_from_iter(vec![ +/// ("x", Arc::new(Int64Array::from(vec![1, 1, 1, 1, 2, 3])) as ArrayRef), +/// ("y", Arc::new(Int64Array::from(vec![1, 2, 2, 2, 1, 1])) as ArrayRef), +/// ("z", Arc::new(StringArray::from(vec!["A", "B", "C", "D", "E", "F"])) as ArrayRef), +/// ]).unwrap(); +/// +/// // Lexographically partition on (x, y) +/// let sort_columns = vec![ +/// SortColumn { +/// values: batch.column(0).clone(), +/// options: Some(SortOptions::default()), +/// }, +/// SortColumn { +/// values: batch.column(1).clone(), +/// options: Some(SortOptions::default()), +/// }, +/// ]; +/// let ranges:Vec> = lexicographical_partition_ranges(&sort_columns) +/// .unwrap() +/// .collect(); +/// +/// let expected = vec![ +/// (0..1), +/// (1..4), +/// (4..5), +/// (5..6), +/// ]; +/// +/// assert_eq!(ranges, expected); +/// ``` pub fn lexicographical_partition_ranges( columns: &[SortColumn], ) -> Result> + '_, ArrowError> { From 0eb9049ff7df9e2b008c57246fba6ff59997a88a Mon Sep 17 00:00:00 2001 From: Miklos Szots Date: Wed, 2 Aug 2023 19:26:46 +0200 Subject: [PATCH 1121/1411] allow zero sized empty fixed (#4626) --- arrow-array/src/array/fixed_size_list_array.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index e6ae1acf0aad..6c3abb556ad6 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -161,7 +161,7 @@ impl FixedSizeListArray { )) })?; - let len = values.len() / s; + let len = values.len() / s.max(1); if let Some(n) = 
nulls.as_ref() { if n.len() != len { return Err(ArrowError::InvalidArgumentError(format!( @@ -676,6 +676,9 @@ mod tests { "Invalid argument error: Size cannot be negative, got -1" ); + let list = FixedSizeListArray::new(field.clone(), 0, values.clone(), None); + assert_eq!(list.len(), 6); + let nulls = NullBuffer::new_null(2); let err = FixedSizeListArray::try_new(field, 2, values.clone(), Some(nulls)) .unwrap_err(); From 08e4692b2e7e9a34b69f5c25613d7628f3c7852d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 2 Aug 2023 12:31:51 -0500 Subject: [PATCH 1122/1411] Add more docs and examples for ListArray and OffsetsBuffer (#4607) * Add more docs and examples for ListArray and OffsetsBuffer * fix docs * Update arrow-array/src/array/list_array.rs * Update arrow-array/src/array/list_array.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-array/src/array/list_array.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-array/src/array/list_array.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-array/src/array/list_array.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * ascii repair * Apply suggestions from code review Co-authored-by: Liang-Chi Hsieh * Update arrow-array/src/array/list_array.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Clarify empty list [] documenation and typos * Move example to GenericLisBuilder * Update arrow-array/src/array/list_array.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-array/src/builder/generic_list_builder.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Liang-Chi Hsieh --- arrow-array/src/array/list_array.rs | 72 ++++++++++++++++++- .../src/builder/generic_list_builder.rs | 54 ++++++++++++++ arrow-buffer/src/buffer/offset.rs | 38 +++++++++- 3 files changed, 160 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 0c1fea6f4161..05628084c844 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -54,11 +54,77 @@ impl OffsetSizeTrait for i64 { const PREFIX: &'static str = "Large"; } -/// An array of [variable length arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout) +/// An array of [variable length lists], similar to JSON arrays +/// (e.g. `["A", "B", "C"]`). /// -/// See [`ListArray`] and [`LargeListArray`]` +/// Lists are represented using `offsets` into a `values` child +/// array. Offsets are stored in two adjacent entries of an +/// [`OffsetBuffer`]. /// -/// See [`GenericListBuilder`](crate::builder::GenericListBuilder) for how to construct a [`GenericListArray`] +/// Arrow defines [`ListArray`] with `i32` offsets and +/// [`LargeListArray`] with `i64` offsets. +/// +/// Use [`GenericListBuilder`](crate::builder::GenericListBuilder) to +/// construct a [`GenericListArray`]. +/// +/// # Representation +/// +/// A [`ListArray`] can represent a list of values of any other +/// supported Arrow type. Each element of the `ListArray` itself is +/// a list which may be empty, may contain NULL and non-null values, +/// or may itself be NULL. 
+/// +/// For example, the `ListArray` shown in the following diagram stores +/// lists of strings. Note that `[]` represents an empty (length +/// 0), but non NULL list. +/// +/// ```text +/// ┌─────────────┐ +/// │ [A,B,C] │ +/// ├─────────────┤ +/// │ [] │ +/// ├─────────────┤ +/// │ NULL │ +/// ├─────────────┤ +/// │ [D] │ +/// ├─────────────┤ +/// │ [NULL, F] │ +/// └─────────────┘ +/// ``` +/// +/// The `values` are stored in a child [`StringArray`] and the offsets +/// are stored in an [`OffsetBuffer`] as shown in the following +/// diagram. The logical values and offsets are shown on the left, and +/// the actual `ListArray` encoding on the right. +/// +/// ```text +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ +/// ┌ ─ ─ ─ ─ ─ ─ ┐ │ +/// ┌─────────────┐ ┌───────┐ │ ┌───┐ ┌───┐ ┌───┐ ┌───┐ +/// │ [A,B,C] │ │ (0,3) │ │ 1 │ │ 0 │ │ │ 1 │ │ A │ │ 0 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ [] │ │ (3,3) │ │ 1 │ │ 3 │ │ │ 1 │ │ B │ │ 1 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ NULL │ │ (3,4) │ │ 0 │ │ 3 │ │ │ 1 │ │ C │ │ 2 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ [D] │ │ (4,5) │ │ 1 │ │ 4 │ │ │ ? │ │ ? │ │ 3 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ [NULL, F] │ │ (5,7) │ │ 1 │ │ 5 │ │ │ 1 │ │ D │ │ 4 │ +/// └─────────────┘ └───────┘ │ └───┘ ├───┤ ├───┤ ├───┤ +/// │ 7 │ │ │ 0 │ │ ? │ │ 5 │ +/// │ Validity └───┘ ├───┤ ├───┤ +/// Logical Logical (nulls) Offsets │ │ 1 │ │ F │ │ 6 │ +/// Values Offsets │ └───┘ └───┘ +/// │ Values │ │ +/// (offsets[i], │ ListArray (Array) +/// offsets[i+1]) └ ─ ─ ─ ─ ─ ─ ┘ │ +/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ +/// +/// +/// ``` +/// +/// [`StringArray`]: crate::array::StringArray +/// [variable length lists]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout pub struct GenericListArray { data_type: DataType, nulls: Option, diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index b31814615fc9..5cc7f7b04e0a 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -28,6 +28,60 @@ use std::sync::Arc; /// /// Use [`ListBuilder`] to build [`ListArray`]s and [`LargeListBuilder`] to build [`LargeListArray`]s. 
/// +/// # Example +/// +/// Here is code that constructs a ListArray with the contents: +/// `[[A,B,C], [], NULL, [D], [NULL, F]]` +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{builder::ListBuilder, builder::StringBuilder, ArrayRef, StringArray, Array}; +/// # +/// let values_builder = StringBuilder::new(); +/// let mut builder = ListBuilder::new(values_builder); +/// +/// // [A, B, C] +/// builder.values().append_value("A"); +/// builder.values().append_value("B"); +/// builder.values().append_value("C"); +/// builder.append(true); +/// +/// // [ ] (empty list) +/// builder.append(true); +/// +/// // Null +/// builder.values().append_value("?"); // irrelevant +/// builder.append(false); +/// +/// // [D] +/// builder.values().append_value("D"); +/// builder.append(true); +/// +/// // [NULL, F] +/// builder.values().append_null(); +/// builder.values().append_value("F"); +/// builder.append(true); +/// +/// // Build the array +/// let array = builder.finish(); +/// +/// // Values is a string array +/// // "A", "B" "C", "?", "D", NULL, "F" +/// assert_eq!( +/// array.values().as_ref(), +/// &StringArray::from(vec![ +/// Some("A"), Some("B"), Some("C"), +/// Some("?"), Some("D"), None, +/// Some("F") +/// ]) +/// ); +/// +/// // Offsets are indexes into the values array +/// assert_eq!( +/// array.value_offsets(), +/// &[0, 3, 3, 4, 5, 7] +/// ); +/// ``` /// /// [`ListBuilder`]: crate::builder::ListBuilder /// [`ListArray`]: crate::array::ListArray diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index 0111d12fbab1..fede32c57924 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -19,7 +19,43 @@ use crate::buffer::ScalarBuffer; use crate::{ArrowNativeType, MutableBuffer}; use std::ops::Deref; -/// A non-empty buffer of monotonically increasing, positive integers +/// A non-empty buffer of monotonically increasing, positive integers. +/// +/// [`OffsetBuffer`] are used to represent ranges of offsets. An +/// `OffsetBuffer` of `N+1` items contains `N` such ranges. The start +/// offset for element `i` is `offsets[i]` and the end offset is +/// `offsets[i+1]`. Equal offsets represent an empty range. +/// +/// # Example +/// +/// This example shows how 5 distinct ranges, are represented using a +/// 6 entry `OffsetBuffer`. The first entry `(0, 3)` represents the +/// three offsets `0, 1, 2`. The entry `(3,3)` represent no offsets +/// (e.g. an empty list). 
+/// +/// ```text +/// ┌───────┐ ┌───┐ +/// │ (0,3) │ │ 0 │ +/// ├───────┤ ├───┤ +/// │ (3,3) │ │ 3 │ +/// ├───────┤ ├───┤ +/// │ (3,4) │ │ 3 │ +/// ├───────┤ ├───┤ +/// │ (4,5) │ │ 4 │ +/// ├───────┤ ├───┤ +/// │ (5,7) │ │ 5 │ +/// └───────┘ ├───┤ +/// │ 7 │ +/// └───┘ +/// +/// Offsets Buffer +/// Logical +/// Offsets +/// +/// (offsets[i], +/// offsets[i+1]) +/// ``` + #[derive(Debug, Clone)] pub struct OffsetBuffer(ScalarBuffer); From 00be18dda519599536678069d02ce1bd8dc246ec Mon Sep 17 00:00:00 2001 From: jakevin Date: Thu, 3 Aug 2023 18:15:13 +0800 Subject: [PATCH 1123/1411] fix: compute_dictionary_mapping use wrong offsetSize (#4625) * fix: compute_dictionary_mapping use wrong offsetSize * add a unit test to check type --- arrow-row/src/dictionary.rs | 2 +- arrow-row/src/lib.rs | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/arrow-row/src/dictionary.rs b/arrow-row/src/dictionary.rs index 6c3ee9e18ced..740b2e205c04 100644 --- a/arrow-row/src/dictionary.rs +++ b/arrow-row/src/dictionary.rs @@ -37,7 +37,7 @@ pub fn compute_dictionary_mapping( values => interner .intern(values.iter().map(|x| x.map(|x| x.encode()))), DataType::Binary => { - let iter = as_generic_binary_array::(values).iter(); + let iter = as_generic_binary_array::(values).iter(); interner.intern(iter) } DataType::LargeBinary => { diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 83ed812df551..396f09380af7 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -2442,4 +2442,32 @@ mod tests { assert_eq!(&back[0], &array); } } + + #[test] + fn test_append_codec_dictionary_binary() { + use DataType::*; + // Dictionary RowConverter + let mut converter = RowConverter::new(vec![SortField::new(Dictionary( + Box::new(Int32), + Box::new(Binary), + ))]) + .unwrap(); + let mut rows = converter.empty_rows(4, 128); + + let keys = Int32Array::from_iter_values([0, 1, 2, 3]); + let values = BinaryArray::from(vec![ + Some("a".as_bytes()), + Some(b"b"), + Some(b"c"), + Some(b"d"), + ]); + let dict_array = DictionaryArray::new(keys, Arc::new(values)); + + rows.clear(); + let array = Arc::new(dict_array) as ArrayRef; + converter.append(&mut rows, &[array.clone()]).unwrap(); + let back = converter.convert_rows(&rows).unwrap(); + + assert_eq!(&back[0], &array); + } } From c2b0aa5fae68eedfadd9285a0b4d76e48dc29a6d Mon Sep 17 00:00:00 2001 From: Tomoaki Kawada Date: Thu, 3 Aug 2023 19:15:44 +0900 Subject: [PATCH 1124/1411] fix(select): handle `NullArray` in `nullif` (#4635) * test(select): add a test case passing `NullArray` to `nullif` * fix(select): handle `NullArray` in `nullif` * test(select): remove `as_null_array` calls `dyn Array + '_` implements `PartialEq`, so the unwrapped result of `nullif` can be compared against `NullArray` without downcasting it first. 
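As a rough illustration of the behaviour this covers (the length and values below
are made up for the example and are not taken from the new test):

```rust
use arrow_array::{Array, BooleanArray, NullArray};
use arrow_schema::DataType;
use arrow_select::nullif::nullif;

fn main() {
    let input = NullArray::new(3); // every slot is already null
    let mask = BooleanArray::from(vec![true, false, true]);

    // With this change the bufferless NullArray is handled explicitly and the
    // result is simply a NullArray of the same length.
    let out = nullif(&input, &mask).unwrap();
    assert_eq!(out.data_type(), &DataType::Null);
    assert_eq!(out.len(), 3);
}
```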
Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-select/src/nullif.rs | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index ab68e8c2f097..f0bcb73cccb9 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -18,7 +18,7 @@ use arrow_array::{make_array, Array, ArrayRef, BooleanArray}; use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_unary_op_helper}; use arrow_buffer::{BooleanBuffer, NullBuffer}; -use arrow_schema::ArrowError; +use arrow_schema::{ArrowError, DataType}; /// Copies original array, setting validity bit to false if a secondary comparison /// boolean array is set to true @@ -35,7 +35,7 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result Date: Thu, 3 Aug 2023 21:17:18 +0100 Subject: [PATCH 1125/1411] Vectorized lexicographical_partition_ranges (~80% faster) (#4575) * Faster lexicographical_partition_ranges * Add comments * Add tests and cleanup API * Update benchmarks * Fix bench --- arrow-ord/src/partition.rs | 512 +++++++++++------------------ arrow/benches/partition_kernels.rs | 43 +-- 2 files changed, 215 insertions(+), 340 deletions(-) diff --git a/arrow-ord/src/partition.rs b/arrow-ord/src/partition.rs index 4411a0f0ab31..4a0a6730d882 100644 --- a/arrow-ord/src/partition.rs +++ b/arrow-ord/src/partition.rs @@ -17,22 +17,64 @@ //! Defines partition kernel for `ArrayRef` -use crate::sort::{LexicographicalComparator, SortColumn}; -use arrow_schema::ArrowError; -use std::cmp::Ordering; use std::ops::Range; -/// Given a list of already sorted columns, returns [`Range`]es that -/// partition the input such that each partition has equal values -/// across sort columns. +use arrow_array::{Array, ArrayRef}; +use arrow_buffer::BooleanBuffer; +use arrow_schema::ArrowError; + +use crate::comparison::neq_dyn; +use crate::sort::SortColumn; + +/// A computed set of partitions, see [`partition`] +#[derive(Debug, Clone)] +pub struct Partitions(Option); + +impl Partitions { + /// Returns the range of each partition + /// + /// Consecutive ranges will be contiguous: i.e [`(a, b)` and `(b, c)`], and + /// `start = 0` and `end = self.len()` for the first and last range respectively + pub fn ranges(&self) -> Vec> { + let boundaries = match &self.0 { + Some(boundaries) => boundaries, + None => return vec![], + }; + + let mut out = vec![]; + let mut current = 0; + for idx in boundaries.set_indices() { + let t = current; + current = idx + 1; + out.push(t..current) + } + let last = boundaries.len() + 1; + if current != last { + out.push(current..last) + } + out + } + + /// Returns the number of partitions + pub fn len(&self) -> usize { + match &self.0 { + Some(b) => b.count_set_bits() + 1, + None => 0, + } + } + + /// Returns true if this contains no partitions + pub fn is_empty(&self) -> bool { + self.0.is_none() + } +} + +/// Given a list of lexicographically sorted columns, computes the [`Partitions`], +/// where a partition consists of the set of consecutive rows with equal values /// /// Returns an error if no columns are specified or all columns do not /// have the same number of rows. /// -/// Returns an iterator with `k` items where `k` is cardinality of the -/// sort values: Consecutive values will be connected: `(a, b)` and `(b, -/// c)`, where `start = 0` and `end = n` for the first and last range. 
-/// /// # Example: /// /// For example, given columns `x`, `y` and `z`, calling @@ -54,8 +96,7 @@ use std::ops::Range; /// │ 3 │ │ 1 │ │ F │ Range: 5..6 (x=3, y=1) /// └ ─ ┴───┴ ─ ─└───┘─ ─ ┴───┴ ─ ─ ┘ /// -/// x y z lexicographical_partition_ranges -/// by (x,y) +/// x y z partition(&[x, y]) /// ``` /// /// # Example Code @@ -64,27 +105,15 @@ use std::ops::Range; /// # use std::{sync::Arc, ops::Range}; /// # use arrow_array::{RecordBatch, Int64Array, StringArray, ArrayRef}; /// # use arrow_ord::sort::{SortColumn, SortOptions}; -/// # use arrow_ord::partition::lexicographical_partition_ranges; +/// # use arrow_ord::partition::partition; /// let batch = RecordBatch::try_from_iter(vec![ /// ("x", Arc::new(Int64Array::from(vec![1, 1, 1, 1, 2, 3])) as ArrayRef), /// ("y", Arc::new(Int64Array::from(vec![1, 2, 2, 2, 1, 1])) as ArrayRef), /// ("z", Arc::new(StringArray::from(vec!["A", "B", "C", "D", "E", "F"])) as ArrayRef), /// ]).unwrap(); /// -/// // Lexographically partition on (x, y) -/// let sort_columns = vec![ -/// SortColumn { -/// values: batch.column(0).clone(), -/// options: Some(SortOptions::default()), -/// }, -/// SortColumn { -/// values: batch.column(1).clone(), -/// options: Some(SortOptions::default()), -/// }, -/// ]; -/// let ranges:Vec> = lexicographical_partition_ranges(&sort_columns) -/// .unwrap() -/// .collect(); +/// // Partition on first two columns +/// let ranges = partition(&batch.columns()[..2]).unwrap().ranges(); /// /// let expected = vec![ /// (0..1), @@ -95,348 +124,209 @@ use std::ops::Range; /// /// assert_eq!(ranges, expected); /// ``` -pub fn lexicographical_partition_ranges( - columns: &[SortColumn], -) -> Result> + '_, ArrowError> { - LexicographicalPartitionIterator::try_new(columns) -} +pub fn partition(columns: &[ArrayRef]) -> Result { + if columns.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Partition requires at least one column".to_string(), + )); + } + let num_rows = columns[0].len(); + if columns.iter().any(|item| item.len() != num_rows) { + return Err(ArrowError::InvalidArgumentError( + "Partition columns have different row counts".to_string(), + )); + }; -struct LexicographicalPartitionIterator<'a> { - comparator: LexicographicalComparator<'a>, - num_rows: usize, - previous_partition_point: usize, - partition_point: usize, -} + match num_rows { + 0 => return Ok(Partitions(None)), + 1 => return Ok(Partitions(Some(BooleanBuffer::new_unset(0)))), + _ => {} + } -impl<'a> LexicographicalPartitionIterator<'a> { - fn try_new( - columns: &'a [SortColumn], - ) -> Result { - if columns.is_empty() { - return Err(ArrowError::InvalidArgumentError( - "Sort requires at least one column".to_string(), - )); - } - let num_rows = columns[0].values.len(); - if columns.iter().any(|item| item.values.len() != num_rows) { - return Err(ArrowError::ComputeError( - "Lexical sort columns have different row counts".to_string(), - )); - }; + let acc = find_boundaries(&columns[0])?; + let acc = columns + .iter() + .skip(1) + .try_fold(acc, |acc, c| find_boundaries(c.as_ref()).map(|b| &acc | &b))?; - let comparator = LexicographicalComparator::try_new(columns)?; - Ok(LexicographicalPartitionIterator { - comparator, - num_rows, - previous_partition_point: 0, - partition_point: 0, - }) - } + Ok(Partitions(Some(acc))) } -/// Returns the next partition point of the range `start..end` according to the given comparator. -/// The return value is the index of the first element of the second partition, -/// and is guaranteed to be between `start..=end` (inclusive). 
-/// -/// The values corresponding to those indices are assumed to be partitioned according to the given comparator. -/// -/// Exponential search is to remedy for the case when array size and cardinality are both large. -/// In these cases the partition point would be near the beginning of the range and -/// plain binary search would be doing some unnecessary iterations on each call. -/// -/// see -#[inline] -fn exponential_search_next_partition_point( - start: usize, - end: usize, - comparator: &LexicographicalComparator<'_>, -) -> usize { - let target = start; - let mut bound = 1; - while bound + start < end - && comparator.compare(bound + start, target) != Ordering::Greater - { - bound *= 2; - } +/// Returns a mask with bits set whenever the value or nullability changes +fn find_boundaries(v: &dyn Array) -> Result { + let slice_len = v.len() - 1; + let v1 = v.slice(0, slice_len); + let v2 = v.slice(1, slice_len); - // invariant after while loop: - // (start + bound / 2) <= target < min(end, start + bound + 1) - // where <= and < are defined by the comparator; - // note here we have right = min(end, start + bound + 1) because (start + bound) might - // actually be considered and must be included. - partition_point(start + bound / 2, end.min(start + bound + 1), |idx| { - comparator.compare(idx, target) != Ordering::Greater - }) -} + let array_ne = neq_dyn(v1.as_ref(), v2.as_ref())?; + // Set if values have different non-NULL values + let values_ne = match array_ne.nulls().filter(|n| n.null_count() > 0) { + Some(n) => n.inner() & array_ne.values(), + None => array_ne.values().clone(), + }; -/// Returns the partition point of the range `start..end` according to the given predicate. -/// The return value is the index of the first element of the second partition, -/// and is guaranteed to be between `start..=end` (inclusive). -/// -/// The algorithm is similar to a binary search. -/// -/// The values corresponding to those indices are assumed to be partitioned according to the given predicate. 
-/// -/// See [`slice::partition_point`] -#[inline] -fn partition_point bool>(start: usize, end: usize, pred: P) -> usize { - let mut left = start; - let mut right = end; - let mut size = right - left; - while left < right { - let mid = left + size / 2; - - let less = pred(mid); - - if less { - left = mid + 1; - } else { - right = mid; + Ok(match v.nulls().filter(|x| x.null_count() > 0) { + Some(n) => { + let n1 = n.inner().slice(0, slice_len); + let n2 = n.inner().slice(1, slice_len); + // Set if values_ne or the nullability has changed + &(&n1 ^ &n2) | &values_ne } - - size = right - left; - } - left + None => values_ne, + }) } -impl<'a> Iterator for LexicographicalPartitionIterator<'a> { - type Item = Range; - - fn next(&mut self) -> Option { - if self.partition_point < self.num_rows { - // invariant: - // in the range [0..previous_partition_point] all values are <= the value at [previous_partition_point] - // so in order to save time we can do binary search on the range [previous_partition_point..num_rows] - // and find the index where any value is greater than the value at [previous_partition_point] - self.partition_point = exponential_search_next_partition_point( - self.partition_point, - self.num_rows, - &self.comparator, - ); - let start = self.previous_partition_point; - let end = self.partition_point; - self.previous_partition_point = self.partition_point; - Some(Range { start, end }) - } else { - None - } - } +/// Given a list of already sorted columns, find partition ranges that would partition +/// lexicographically equal values across columns. +/// +/// The returned vec would be of size k where k is cardinality of the sorted values; Consecutive +/// values will be connected: (a, b) and (b, c), where start = 0 and end = n for the first and last +/// range. 
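+///
+/// As a migration sketch (this shim simply drops each column's `SortOptions` and
+/// forwards the `values` of every `SortColumn` to [`partition`]):
+///
+/// ```ignore
+/// // before
+/// let ranges: Vec<_> = lexicographical_partition_ranges(&sort_columns)?.collect();
+/// // after
+/// let arrays: Vec<_> = sort_columns.iter().map(|c| c.values.clone()).collect();
+/// let ranges = partition(&arrays)?.ranges();
+/// ```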
+#[deprecated(note = "Use partition")] +pub fn lexicographical_partition_ranges( + columns: &[SortColumn], +) -> Result> + '_, ArrowError> { + let cols: Vec<_> = columns.iter().map(|x| x.values.clone()).collect(); + Ok(partition(&cols)?.ranges().into_iter()) } #[cfg(test)] mod tests { - use super::*; - use crate::sort::SortOptions; + use std::sync::Arc; + use arrow_array::*; use arrow_schema::DataType; - use std::sync::Arc; - #[test] - fn test_partition_point() { - let input = &[1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4]; - { - let median = input[input.len() / 2]; - assert_eq!( - 9, - partition_point(0, input.len(), |i: usize| input[i].cmp(&median) - != Ordering::Greater) - ); - } - { - let search = input[9]; - assert_eq!( - 12, - partition_point(9, input.len(), |i: usize| input[i].cmp(&search) - != Ordering::Greater) - ); - } - { - let search = input[0]; - assert_eq!( - 3, - partition_point(0, 9, |i: usize| input[i].cmp(&search) - != Ordering::Greater) - ); - } - let input = &[1, 2, 2, 2, 2, 2, 2, 2, 9]; - { - let search = input[5]; - assert_eq!( - 8, - partition_point(5, 9, |i: usize| input[i].cmp(&search) - != Ordering::Greater) - ); - } - } + use super::*; #[test] - fn test_lexicographical_partition_ranges_empty() { - let input = vec![]; - assert!( - lexicographical_partition_ranges(&input).is_err(), - "lexicographical_partition_ranges should reject columns with empty rows" + fn test_partition_empty() { + let err = partition(&[]).unwrap_err(); + assert_eq!( + err.to_string(), + "Invalid argument error: Partition requires at least one column" ); } #[test] - fn test_lexicographical_partition_ranges_unaligned_rows() { + fn test_partition_unaligned_rows() { let input = vec![ - SortColumn { - values: Arc::new(Int64Array::from(vec![None, Some(-1)])) as ArrayRef, - options: None, - }, - SortColumn { - values: Arc::new(StringArray::from(vec![Some("foo")])) as ArrayRef, - options: None, - }, + Arc::new(Int64Array::from(vec![None, Some(-1)])) as _, + Arc::new(StringArray::from(vec![Some("foo")])) as _, ]; - assert!( - lexicographical_partition_ranges(&input).is_err(), - "lexicographical_partition_ranges should reject columns with different row counts" - ); + let err = partition(&input).unwrap_err(); + assert_eq!( + err.to_string(), + "Invalid argument error: Partition columns have different row counts" + ) } #[test] - fn test_lexicographical_partition_single_column() { - let input = vec![SortColumn { - values: Arc::new(Int64Array::from(vec![1, 2, 2, 2, 2, 2, 2, 2, 9])) - as ArrayRef, - options: Some(SortOptions { - descending: false, - nulls_first: true, - }), - }]; - let results = lexicographical_partition_ranges(&input).unwrap(); + fn test_partition_small() { + let results = partition(&[ + Arc::new(Int32Array::new(vec![].into(), None)) as _, + Arc::new(Int32Array::new(vec![].into(), None)) as _, + Arc::new(Int32Array::new(vec![].into(), None)) as _, + ]) + .unwrap(); + assert_eq!(results.len(), 0); + assert!(results.is_empty()); + + let results = partition(&[ + Arc::new(Int32Array::from(vec![1])) as _, + Arc::new(Int32Array::from(vec![1])) as _, + ]) + .unwrap(); + assert_eq!(results.ranges(), &[0..1]); + } + + #[test] + fn test_partition_single_column() { + let a = Int64Array::from(vec![1, 2, 2, 2, 2, 2, 2, 2, 9]); + let input = vec![Arc::new(a) as _]; assert_eq!( - vec![(0_usize..1_usize), (1_usize..8_usize), (8_usize..9_usize)], - results.collect::>() + partition(&input).unwrap().ranges(), + vec![(0..1), (1..8), (8..9)], ); } #[test] - fn test_lexicographical_partition_all_equal_values() { - 
let input = vec![SortColumn { - values: Arc::new(Int64Array::from_value(1, 1000)) as ArrayRef, - options: Some(SortOptions { - descending: false, - nulls_first: true, - }), - }]; - - let results = lexicographical_partition_ranges(&input).unwrap(); - assert_eq!(vec![(0_usize..1000_usize)], results.collect::>()); + fn test_partition_all_equal_values() { + let a = Int64Array::from_value(1, 1000); + let input = vec![Arc::new(a) as _]; + assert_eq!(partition(&input).unwrap().ranges(), vec![(0..1000)]); + } + + #[test] + fn test_partition_all_null_values() { + let input = vec![ + new_null_array(&DataType::Int8, 1000), + new_null_array(&DataType::UInt16, 1000), + ]; + assert_eq!(partition(&input).unwrap().ranges(), vec![(0..1000)]); } #[test] - fn test_lexicographical_partition_all_null_values() { + fn test_partition_unique_column_1() { let input = vec![ - SortColumn { - values: new_null_array(&DataType::Int8, 1000), - options: Some(SortOptions { - descending: false, - nulls_first: true, - }), - }, - SortColumn { - values: new_null_array(&DataType::UInt16, 1000), - options: Some(SortOptions { - descending: false, - nulls_first: false, - }), - }, + Arc::new(Int64Array::from(vec![None, Some(-1)])) as _, + Arc::new(StringArray::from(vec![Some("foo"), Some("bar")])) as _, ]; - let results = lexicographical_partition_ranges(&input).unwrap(); - assert_eq!(vec![(0_usize..1000_usize)], results.collect::>()); + assert_eq!(partition(&input).unwrap().ranges(), vec![(0..1), (1..2)],); } #[test] - fn test_lexicographical_partition_unique_column_1() { + fn test_partition_unique_column_2() { let input = vec![ - SortColumn { - values: Arc::new(Int64Array::from(vec![None, Some(-1)])) as ArrayRef, - options: Some(SortOptions { - descending: false, - nulls_first: true, - }), - }, - SortColumn { - values: Arc::new(StringArray::from(vec![Some("foo"), Some("bar")])) - as ArrayRef, - options: Some(SortOptions { - descending: true, - nulls_first: true, - }), - }, + Arc::new(Int64Array::from(vec![None, Some(-1), Some(-1)])) as _, + Arc::new(StringArray::from(vec![ + Some("foo"), + Some("bar"), + Some("apple"), + ])) as _, ]; - let results = lexicographical_partition_ranges(&input).unwrap(); assert_eq!( - vec![(0_usize..1_usize), (1_usize..2_usize)], - results.collect::>() + partition(&input).unwrap().ranges(), + vec![(0..1), (1..2), (2..3),], ); } #[test] - fn test_lexicographical_partition_unique_column_2() { + fn test_partition_non_unique_column_1() { let input = vec![ - SortColumn { - values: Arc::new(Int64Array::from(vec![None, Some(-1), Some(-1)])) - as ArrayRef, - options: Some(SortOptions { - descending: false, - nulls_first: true, - }), - }, - SortColumn { - values: Arc::new(StringArray::from(vec![ - Some("foo"), - Some("bar"), - Some("apple"), - ])) as ArrayRef, - options: Some(SortOptions { - descending: true, - nulls_first: true, - }), - }, + Arc::new(Int64Array::from(vec![None, Some(-1), Some(-1), Some(1)])) as _, + Arc::new(StringArray::from(vec![ + Some("foo"), + Some("bar"), + Some("bar"), + Some("bar"), + ])) as _, ]; - let results = lexicographical_partition_ranges(&input).unwrap(); assert_eq!( - vec![(0_usize..1_usize), (1_usize..2_usize), (2_usize..3_usize),], - results.collect::>() + partition(&input).unwrap().ranges(), + vec![(0..1), (1..3), (3..4),], ); } #[test] - fn test_lexicographical_partition_non_unique_column_1() { + fn test_partition_masked_nulls() { let input = vec![ - SortColumn { - values: Arc::new(Int64Array::from(vec![ - None, - Some(-1), - Some(-1), - Some(1), - ])) as ArrayRef, - 
options: Some(SortOptions { - descending: false, - nulls_first: true, - }), - }, - SortColumn { - values: Arc::new(StringArray::from(vec![ - Some("foo"), - Some("bar"), - Some("bar"), - Some("bar"), - ])) as ArrayRef, - options: Some(SortOptions { - descending: true, - nulls_first: true, - }), - }, + Arc::new(Int64Array::new(vec![1; 9].into(), None)) as _, + Arc::new(Int64Array::new( + vec![1, 1, 2, 2, 2, 3, 3, 3, 3].into(), + Some( + vec![false, true, true, true, true, false, false, true, false].into(), + ), + )) as _, + Arc::new(Int64Array::new( + vec![1, 1, 2, 2, 2, 2, 2, 3, 7].into(), + Some(vec![true, true, true, true, false, true, true, true, false].into()), + )) as _, ]; - let results = lexicographical_partition_ranges(&input).unwrap(); + assert_eq!( - vec![(0_usize..1_usize), (1_usize..3_usize), (3_usize..4_usize),], - results.collect::>() + partition(&input).unwrap().ranges(), + vec![(0..1), (1..2), (2..4), (4..5), (5..7), (7..8), (8..9)], ); } } diff --git a/arrow/benches/partition_kernels.rs b/arrow/benches/partition_kernels.rs index ae55fbdad22c..85cafbe47a11 100644 --- a/arrow/benches/partition_kernels.rs +++ b/arrow/benches/partition_kernels.rs @@ -20,13 +20,13 @@ extern crate criterion; use criterion::Criterion; use std::sync::Arc; extern crate arrow; -use arrow::compute::kernels::partition::lexicographical_partition_ranges; use arrow::compute::kernels::sort::{lexsort, SortColumn}; use arrow::util::bench_util::*; use arrow::{ array::*, datatypes::{ArrowPrimitiveType, Float64Type, UInt8Type}, }; +use arrow_ord::partition::partition; use rand::distributions::{Distribution, Standard}; use std::iter; @@ -40,19 +40,7 @@ where } fn bench_partition(sorted_columns: &[ArrayRef]) { - let columns = sorted_columns - .iter() - .map(|arr| SortColumn { - values: arr.clone(), - options: None, - }) - .collect::>(); - - criterion::black_box( - lexicographical_partition_ranges(&columns) - .unwrap() - .collect::>(), - ); + criterion::black_box(partition(sorted_columns).unwrap().ranges()); } fn create_sorted_low_cardinality_data(length: usize) -> Vec { @@ -109,37 +97,34 @@ fn create_sorted_data(pow: u32, with_nulls: bool) -> Vec { fn add_benchmark(c: &mut Criterion) { let sorted_columns = create_sorted_data(10, false); - c.bench_function("lexicographical_partition_ranges(u8) 2^10", |b| { + c.bench_function("partition(u8) 2^10", |b| { b.iter(|| bench_partition(&sorted_columns)) }); let sorted_columns = create_sorted_data(12, false); - c.bench_function("lexicographical_partition_ranges(u8) 2^12", |b| { + c.bench_function("partition(u8) 2^12", |b| { b.iter(|| bench_partition(&sorted_columns)) }); let sorted_columns = create_sorted_data(10, true); - c.bench_function( - "lexicographical_partition_ranges(u8) 2^10 with nulls", - |b| b.iter(|| bench_partition(&sorted_columns)), - ); + c.bench_function("partition(u8) 2^10 with nulls", |b| { + b.iter(|| bench_partition(&sorted_columns)) + }); let sorted_columns = create_sorted_data(12, true); - c.bench_function( - "lexicographical_partition_ranges(u8) 2^12 with nulls", - |b| b.iter(|| bench_partition(&sorted_columns)), - ); + c.bench_function("partition(u8) 2^12 with nulls", |b| { + b.iter(|| bench_partition(&sorted_columns)) + }); let sorted_columns = create_sorted_float_data(10, false); - c.bench_function("lexicographical_partition_ranges(f64) 2^10", |b| { + c.bench_function("partition(f64) 2^10", |b| { b.iter(|| bench_partition(&sorted_columns)) }); let sorted_columns = create_sorted_low_cardinality_data(1024); - c.bench_function( - 
"lexicographical_partition_ranges(low cardinality) 1024", - |b| b.iter(|| bench_partition(&sorted_columns)), - ); + c.bench_function("partition(low cardinality) 1024", |b| { + b.iter(|| bench_partition(&sorted_columns)) + }); } criterion_group!(benches, add_benchmark); From a81da6c89c68507cfb0b37a057dcecd7ba582d9b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 4 Aug 2023 15:28:18 +0100 Subject: [PATCH 1126/1411] Cleanup sort (#4613) * Cleanup sort * Add inline * Further cleanup * Further sort benchmark fixes --- arrow-array/src/cast.rs | 17 + arrow-ord/src/sort.rs | 785 +++++++---------------------------- arrow/benches/sort_kernel.rs | 26 +- 3 files changed, 196 insertions(+), 632 deletions(-) diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index bee8823d1f59..66b40d5b8eb3 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -799,6 +799,15 @@ pub trait AsArray: private::Sealed { self.as_list_opt().expect("list array") } + /// Downcast this to a [`FixedSizeBinaryArray`] returning `None` if not possible + fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray>; + + /// Downcast this to a [`FixedSizeBinaryArray`] panicking if not possible + fn as_fixed_size_binary(&self) -> &FixedSizeBinaryArray { + self.as_fixed_size_binary_opt() + .expect("fixed size binary array") + } + /// Downcast this to a [`FixedSizeListArray`] returning `None` if not possible fn as_fixed_size_list_opt(&self) -> Option<&FixedSizeListArray>; @@ -848,6 +857,10 @@ impl AsArray for dyn Array + '_ { self.as_any().downcast_ref() } + fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray> { + self.as_any().downcast_ref() + } + fn as_fixed_size_list_opt(&self) -> Option<&FixedSizeListArray> { self.as_any().downcast_ref() } @@ -885,6 +898,10 @@ impl AsArray for ArrayRef { self.as_ref().as_list_opt() } + fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray> { + self.as_ref().as_fixed_size_binary_opt() + } + fn as_fixed_size_list_opt(&self) -> Option<&FixedSizeListArray> { self.as_ref().as_fixed_size_list_opt() } diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index c3e9e26ec05e..648a7d7afcca 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -23,10 +23,9 @@ use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::BooleanBufferBuilder; -use arrow_buffer::{ArrowNativeType, MutableBuffer, NullBuffer}; -use arrow_data::ArrayData; +use arrow_buffer::{ArrowNativeType, NullBuffer}; use arrow_data::ArrayDataBuilder; -use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; +use arrow_schema::{ArrowError, DataType}; use arrow_select::take::take; use std::cmp::Ordering; use std::sync::Arc; @@ -181,13 +180,6 @@ where } } -fn cmp(l: T, r: T) -> Ordering -where - T: Ord, -{ - l.cmp(&r) -} - // partition indices into valid and null indices fn partition_validity(array: &dyn Array) -> (Vec, Vec) { match array.null_count() { @@ -204,210 +196,33 @@ fn partition_validity(array: &dyn Array) -> (Vec, Vec) { /// For floating point arrays any NaN values are considered to be greater than any other non-null value. /// `limit` is an option for [partial_sort]. 
pub fn sort_to_indices( - values: &dyn Array, + array: &dyn Array, options: Option, limit: Option, ) -> Result { let options = options.unwrap_or_default(); - let (v, n) = partition_validity(values); - - Ok(match values.data_type() { - DataType::Decimal128(_, _) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Decimal256(_, _) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Boolean => sort_boolean(values, v, n, &options, limit), - DataType::Int8 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Int16 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Int32 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Int64 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::UInt8 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::UInt16 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::UInt32 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::UInt64 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Float16 => sort_primitive::( - values, - v, - n, - |x, y| x.total_cmp(&y), - &options, - limit, - ), - DataType::Float32 => sort_primitive::( - values, - v, - n, - |x, y| x.total_cmp(&y), - &options, - limit, - ), - DataType::Float64 => sort_primitive::( - values, - v, - n, - |x, y| x.total_cmp(&y), - &options, - limit, - ), - DataType::Date32 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Date64 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Time32(TimeUnit::Second) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Time32(TimeUnit::Millisecond) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Time64(TimeUnit::Microsecond) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Time64(TimeUnit::Nanosecond) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Timestamp(TimeUnit::Second, _) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - sort_primitive::( - values, v, n, cmp, &options, limit, - ) - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - sort_primitive::( - values, v, n, cmp, &options, limit, - ) - } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - sort_primitive::( - values, v, n, cmp, &options, limit, - ) - } - DataType::Interval(IntervalUnit::YearMonth) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Interval(IntervalUnit::DayTime) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - sort_primitive::( - values, v, n, cmp, &options, limit, - ) - } - DataType::Duration(TimeUnit::Second) => { - sort_primitive::(values, v, n, cmp, &options, limit) + let (v, n) = partition_validity(array); + + Ok(downcast_primitive_array! 
{ + array => sort_primitive(array, v, n, options, limit), + DataType::Boolean => sort_boolean(array.as_boolean(), v, n, options, limit), + DataType::Utf8 => sort_bytes(array.as_string::(), v, n, options, limit), + DataType::LargeUtf8 => sort_bytes(array.as_string::(), v, n, options, limit), + DataType::Binary => sort_bytes(array.as_binary::(), v, n, options, limit), + DataType::LargeBinary => sort_bytes(array.as_binary::(), v, n, options, limit), + DataType::FixedSizeBinary(_) => sort_fixed_size_binary(array.as_fixed_size_binary(), v, n, options, limit), + DataType::List(_) => sort_list(array.as_list::(), v, n, options, limit)?, + DataType::LargeList(_) => sort_list(array.as_list::(), v, n, options, limit)?, + DataType::FixedSizeList(_, _) => sort_fixed_size_list(array.as_fixed_size_list(), v, n, options, limit)?, + DataType::Dictionary(_, _) => downcast_dictionary_array!{ + array => sort_dictionary(array, v, n, options, limit)?, + _ => unreachable!() } - DataType::Duration(TimeUnit::Millisecond) => { - sort_primitive::( - values, v, n, cmp, &options, limit, - ) - } - DataType::Duration(TimeUnit::Microsecond) => { - sort_primitive::( - values, v, n, cmp, &options, limit, - ) - } - DataType::Duration(TimeUnit::Nanosecond) => { - sort_primitive::( - values, v, n, cmp, &options, limit, - ) - } - DataType::Utf8 => sort_string::(values, v, n, &options, limit), - DataType::LargeUtf8 => sort_string::(values, v, n, &options, limit), - DataType::List(field) | DataType::FixedSizeList(field, _) => { - match field.data_type() { - DataType::Int8 => sort_list::(values, v, n, &options, limit), - DataType::Int16 => sort_list::(values, v, n, &options, limit), - DataType::Int32 => sort_list::(values, v, n, &options, limit), - DataType::Int64 => sort_list::(values, v, n, &options, limit), - DataType::UInt8 => sort_list::(values, v, n, &options, limit), - DataType::UInt16 => sort_list::(values, v, n, &options, limit), - DataType::UInt32 => sort_list::(values, v, n, &options, limit), - DataType::UInt64 => sort_list::(values, v, n, &options, limit), - DataType::Float16 => sort_list::(values, v, n, &options, limit), - DataType::Float32 => sort_list::(values, v, n, &options, limit), - DataType::Float64 => sort_list::(values, v, n, &options, limit), - t => { - return Err(ArrowError::ComputeError(format!( - "Sort not supported for list type {t:?}" - ))); - } - } - } - DataType::LargeList(field) => match field.data_type() { - DataType::Int8 => sort_list::(values, v, n, &options, limit), - DataType::Int16 => sort_list::(values, v, n, &options, limit), - DataType::Int32 => sort_list::(values, v, n, &options, limit), - DataType::Int64 => sort_list::(values, v, n, &options, limit), - DataType::UInt8 => sort_list::(values, v, n, &options, limit), - DataType::UInt16 => sort_list::(values, v, n, &options, limit), - DataType::UInt32 => sort_list::(values, v, n, &options, limit), - DataType::UInt64 => sort_list::(values, v, n, &options, limit), - DataType::Float16 => sort_list::(values, v, n, &options, limit), - DataType::Float32 => sort_list::(values, v, n, &options, limit), - DataType::Float64 => sort_list::(values, v, n, &options, limit), - t => { - return Err(ArrowError::ComputeError(format!( - "Sort not supported for list type {t:?}" - ))); - } - }, - DataType::Dictionary(_, _) => { - let value_null_first = if options.descending { - // When sorting dictionary in descending order, we take inverse of of null ordering - // when sorting the values. 
Because if `nulls_first` is true, null must be in front - // of non-null value. As we take the sorted order of value array to sort dictionary - // keys, these null values will be treated as smallest ones and be sorted to the end - // of sorted result. So we set `nulls_first` to false when sorting dictionary value - // array to make them as largest ones, then null values will be put at the beginning - // of sorted dictionary result. - !options.nulls_first - } else { - options.nulls_first - }; - let value_options = Some(SortOptions { - descending: false, - nulls_first: value_null_first, - }); - downcast_dictionary_array! { - values => { - let dict_values = values.values(); - let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; - let rank = sorted_rank(&sorted_value_indices); - sort_dictionary(values, &rank, v, n, options, limit) - } - _ => unreachable!(), - } - } - DataType::Binary | DataType::FixedSizeBinary(_) => { - sort_binary::(values, v, n, &options, limit) - } - DataType::LargeBinary => sort_binary::(values, v, n, &options, limit), DataType::RunEndEncoded(run_ends_field, _) => match run_ends_field.data_type() { - DataType::Int16 => sort_run_to_indices::(values, &options, limit), - DataType::Int32 => sort_run_to_indices::(values, &options, limit), - DataType::Int64 => sort_run_to_indices::(values, &options, limit), + DataType::Int16 => sort_run_to_indices::(array, options, limit), + DataType::Int32 => sort_run_to_indices::(array, options, limit), + DataType::Int64 => sort_run_to_indices::(array, options, limit), dt => { return Err(ArrowError::ComputeError(format!( "Invalid run end data type: {dt}" @@ -422,147 +237,76 @@ pub fn sort_to_indices( }) } -/// Sort boolean values -/// -/// when a limit is present, the sort is pair-comparison based as k-select might be more efficient, -/// when the limit is absent, binary partition is used to speed up (which is linear). -/// -/// TODO maybe partition_validity call can be eliminated in this case -/// and [tri-color sort](https://en.wikipedia.org/wiki/Dutch_national_flag_problem) -/// can be used instead. fn sort_boolean( - values: &dyn Array, + values: &BooleanArray, value_indices: Vec, - mut null_indices: Vec, - options: &SortOptions, + null_indices: Vec, + options: SortOptions, limit: Option, ) -> UInt32Array { - let values = values - .as_any() - .downcast_ref::() - .expect("Unable to downcast to boolean array"); - let descending = options.descending; - - let valids_len = value_indices.len(); - let nulls_len = null_indices.len(); - - let mut len = values.len(); - let valids = if let Some(limit) = limit { - len = limit.min(len); - // create tuples that are used for sorting - let mut valids = value_indices - .into_iter() - .map(|index| (index, values.value(index as usize))) - .collect::>(); - - sort_valids(descending, &mut valids, len, cmp); - valids - } else { - // when limit is not present, we have a better way than sorting: we can just partition - // the vec into [false..., true...] or [true..., false...] 
when descending - // TODO when https://github.com/rust-lang/rust/issues/62543 is merged we can use partition_in_place - let (mut a, b): (Vec<_>, Vec<_>) = value_indices - .into_iter() - .map(|index| (index, values.value(index as usize))) - .partition(|(_, value)| *value == descending); - a.extend(b); - if descending { - null_indices.reverse(); - } - a - }; - - let nulls = null_indices; - - // collect results directly into a buffer instead of a vec to avoid another aligned allocation - let result_capacity = len * std::mem::size_of::(); - let mut result = MutableBuffer::new(result_capacity); - // sets len to capacity so we can access the whole buffer as a typed slice - result.resize(result_capacity, 0); - let result_slice: &mut [u32] = result.typed_data_mut(); - - if options.nulls_first { - let size = nulls_len.min(len); - result_slice[0..size].copy_from_slice(&nulls[0..size]); - if nulls_len < len { - insert_valid_values(result_slice, nulls_len, &valids[0..len - size]); - } - } else { - // nulls last - let size = valids.len().min(len); - insert_valid_values(result_slice, 0, &valids[0..size]); - if len > size { - result_slice[valids_len..].copy_from_slice(&nulls[0..(len - valids_len)]); - } - } - - let result_data = unsafe { - ArrayData::new_unchecked( - DataType::UInt32, - len, - Some(0), - None, - 0, - vec![result.into()], - vec![], - ) - }; + let mut valids = value_indices + .into_iter() + .map(|index| (index, values.value(index as usize))) + .collect::>(); + sort_impl(options, &mut valids, &null_indices, limit, |a, b| a.cmp(&b)).into() +} - UInt32Array::from(result_data) +fn sort_primitive( + values: &PrimitiveArray, + value_indices: Vec, + nulls: Vec, + options: SortOptions, + limit: Option, +) -> UInt32Array { + let mut valids = value_indices + .into_iter() + .map(|index| (index, values.value(index as usize))) + .collect::>(); + sort_impl(options, &mut valids, &nulls, limit, T::Native::compare).into() } -/// Sort primitive values -fn sort_primitive( - values: &dyn Array, +fn sort_bytes( + values: &GenericByteArray, value_indices: Vec, - null_indices: Vec, - cmp: F, - options: &SortOptions, + nulls: Vec, + options: SortOptions, limit: Option, -) -> UInt32Array -where - T: ArrowPrimitiveType, - T::Native: PartialOrd, - F: Fn(T::Native, T::Native) -> Ordering, -{ - // create tuples that are used for sorting - let valids = { - let values = values.as_primitive::(); - value_indices - .into_iter() - .map(|index| (index, values.value(index as usize))) - .collect::>() - }; - sort_primitive_inner(values.len(), null_indices, cmp, options, limit, valids) +) -> UInt32Array { + let mut valids = value_indices + .into_iter() + .map(|index| (index, values.value(index as usize).as_ref())) + .collect::>(); + + sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into() } -/// Given a list of indices that yield a sorted order, returns the ordered -/// rank of each index -/// -/// e.g. 
[2, 4, 3, 1, 0] -> [4, 3, 0, 2, 1] -fn sorted_rank(sorted_value_indices: &UInt32Array) -> Vec { - assert_eq!(sorted_value_indices.null_count(), 0); - let sorted_indices = sorted_value_indices.values(); - let mut out: Vec<_> = vec![0_u32; sorted_indices.len()]; - for (ix, val) in sorted_indices.iter().enumerate() { - out[*val as usize] = ix as u32; - } - out +fn sort_fixed_size_binary( + values: &FixedSizeBinaryArray, + value_indices: Vec, + nulls: Vec, + options: SortOptions, + limit: Option, +) -> UInt32Array { + let mut valids = value_indices + .iter() + .copied() + .map(|index| (index, values.value(index as usize))) + .collect::>(); + sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into() } -/// Sort dictionary given the sorted rank of each key fn sort_dictionary( dict: &DictionaryArray, - rank: &[u32], value_indices: Vec, null_indices: Vec, options: SortOptions, limit: Option, -) -> UInt32Array { +) -> Result { let keys: &PrimitiveArray = dict.keys(); + let rank = child_rank(dict.values().as_ref(), options)?; // create tuples that are used for sorting - let valids = value_indices + let mut valids = value_indices .into_iter() .map(|index| { let key: K::Native = keys.value(index as usize); @@ -570,83 +314,100 @@ fn sort_dictionary( }) .collect::>(); - sort_primitive_inner::<_, _>(keys.len(), null_indices, cmp, &options, limit, valids) + Ok(sort_impl(options, &mut valids, &null_indices, limit, |a, b| a.cmp(&b)).into()) } -// sort is instantiated a lot so we only compile this inner version for each native type -fn sort_primitive_inner( - value_len: usize, - nulls: Vec, - cmp: F, - options: &SortOptions, +fn sort_list( + array: &GenericListArray, + value_indices: Vec, + null_indices: Vec, + options: SortOptions, limit: Option, - mut valids: Vec<(u32, T)>, -) -> UInt32Array -where - T: ArrowNativeType, - T: PartialOrd, - F: Fn(T, T) -> Ordering, -{ - let valids_len = valids.len(); - let nulls_len = nulls.len(); - let mut len = value_len; +) -> Result { + let rank = child_rank(array.values().as_ref(), options)?; + let offsets = array.value_offsets(); + let mut valids = value_indices + .into_iter() + .map(|index| { + let end = offsets[index as usize + 1].as_usize(); + let start = offsets[index as usize].as_usize(); + (index, &rank[start..end]) + }) + .collect::>(); + Ok(sort_impl(options, &mut valids, &null_indices, limit, Ord::cmp).into()) +} - if let Some(limit) = limit { - len = limit.min(len); - } +fn sort_fixed_size_list( + array: &FixedSizeListArray, + value_indices: Vec, + null_indices: Vec, + options: SortOptions, + limit: Option, +) -> Result { + let rank = child_rank(array.values().as_ref(), options)?; + let size = array.value_length() as usize; + let mut valids = value_indices + .into_iter() + .map(|index| { + let start = index as usize * size; + (index, &rank[start..start + size]) + }) + .collect::>(); + Ok(sort_impl(options, &mut valids, &null_indices, limit, Ord::cmp).into()) +} - sort_valids(options.descending, &mut valids, len, cmp); +#[inline(never)] +fn sort_impl( + options: SortOptions, + valids: &mut [(u32, T)], + nulls: &[u32], + limit: Option, + mut cmp: impl FnMut(T, T) -> Ordering, +) -> Vec { + let v_limit = match (limit, options.nulls_first) { + (Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()), + _ => valids.len(), + }; - // collect results directly into a buffer instead of a vec to avoid another aligned allocation - let result_capacity = len * std::mem::size_of::(); - let mut result = MutableBuffer::new(result_capacity); - // sets len to 
capacity so we can access the whole buffer as a typed slice - result.resize(result_capacity, 0); - let result_slice: &mut [u32] = result.typed_data_mut(); + match options.descending { + false => sort_unstable_by(valids, v_limit, |a, b| cmp(a.1, b.1)), + true => sort_unstable_by(valids, v_limit, |a, b| cmp(a.1, b.1).reverse()), + } - if options.nulls_first { - let size = nulls_len.min(len); - result_slice[0..size].copy_from_slice(&nulls[0..size]); - if nulls_len < len { - insert_valid_values(result_slice, nulls_len, &valids[0..len - size]); + let len = valids.len() + nulls.len(); + let limit = limit.unwrap_or(len).min(len); + let mut out = Vec::with_capacity(len); + match options.nulls_first { + true => { + out.extend_from_slice(&nulls[..nulls.len().min(limit)]); + let remaining = limit - out.len(); + out.extend(valids.iter().map(|x| x.0).take(remaining)); } - } else { - // nulls last - let size = valids.len().min(len); - insert_valid_values(result_slice, 0, &valids[0..size]); - if len > size { - result_slice[valids_len..].copy_from_slice(&nulls[0..(len - valids_len)]); + false => { + out.extend(valids.iter().map(|x| x.0).take(limit)); + let remaining = limit - out.len(); + out.extend_from_slice(&nulls[..remaining]) } } - - let result_data = unsafe { - ArrayData::new_unchecked( - DataType::UInt32, - len, - Some(0), - None, - 0, - vec![result.into()], - vec![], - ) - }; - - UInt32Array::from(result_data) + out } -// insert valid and nan values in the correct order depending on the descending flag -fn insert_valid_values(result_slice: &mut [u32], offset: usize, valids: &[(u32, T)]) { - let valids_len = valids.len(); - // helper to append the index part of the valid tuples - let append_valids = move |dst_slice: &mut [u32]| { - debug_assert_eq!(dst_slice.len(), valids_len); - dst_slice - .iter_mut() - .zip(valids.iter()) - .for_each(|(dst, src)| *dst = src.0) - }; +/// Computes the rank for a set of child values +fn child_rank(values: &dyn Array, options: SortOptions) -> Result, ArrowError> { + // If parent sort order is descending we need to invert the value of nulls_first so that + // when the parent is sorted based on the produced ranks, nulls are still ordered correctly + let value_options = Some(SortOptions { + descending: false, + nulls_first: options.nulls_first != options.descending, + }); - append_valids(&mut result_slice[offset..offset + valids.len()]); + let sorted_value_indices = sort_to_indices(values, value_options, None)?; + let sorted_indices = sorted_value_indices.values(); + let mut out: Vec<_> = vec![0_u32; sorted_indices.len()]; + for (ix, val) in sorted_indices.iter().enumerate() { + out[*val as usize] = ix as u32; + } + Ok(out) } // Sort run array and return sorted run array. @@ -737,7 +498,7 @@ fn sort_run_downcasted( // encoded back to run array. 
fn sort_run_to_indices( values: &dyn Array, - options: &SortOptions, + options: SortOptions, limit: Option, ) -> UInt32Array { let run_array = values.as_any().downcast_ref::>().unwrap(); @@ -752,7 +513,7 @@ fn sort_run_to_indices( let consume_runs = |run_length, logical_start| { result.extend(logical_start as u32..(logical_start + run_length) as u32); }; - sort_run_inner(run_array, Some(*options), output_len, consume_runs); + sort_run_inner(run_array, Some(options), output_len, consume_runs); UInt32Array::from(result) } @@ -834,200 +595,6 @@ where (values_indices, run_values) } -/// Sort strings -fn sort_string( - values: &dyn Array, - value_indices: Vec, - null_indices: Vec, - options: &SortOptions, - limit: Option, -) -> UInt32Array { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - - sort_string_helper( - values, - value_indices, - null_indices, - options, - limit, - |array, idx| array.value(idx as usize), - ) -} - -/// shared implementation between dictionary encoded and plain string arrays -#[inline] -fn sort_string_helper<'a, A: Array, F>( - values: &'a A, - value_indices: Vec, - null_indices: Vec, - options: &SortOptions, - limit: Option, - value_fn: F, -) -> UInt32Array -where - F: Fn(&'a A, u32) -> &str, -{ - let mut valids = value_indices - .into_iter() - .map(|index| (index, value_fn(values, index))) - .collect::>(); - let mut nulls = null_indices; - let descending = options.descending; - let mut len = values.len(); - - if let Some(limit) = limit { - len = limit.min(len); - } - - sort_valids(descending, &mut valids, len, cmp); - // collect the order of valid tuplies - let mut valid_indices: Vec = valids.iter().map(|tuple| tuple.0).collect(); - - if options.nulls_first { - nulls.append(&mut valid_indices); - nulls.truncate(len); - UInt32Array::from(nulls) - } else { - // no need to sort nulls as they are in the correct order already - valid_indices.append(&mut nulls); - valid_indices.truncate(len); - UInt32Array::from(valid_indices) - } -} - -fn sort_list( - values: &dyn Array, - value_indices: Vec, - null_indices: Vec, - options: &SortOptions, - limit: Option, -) -> UInt32Array -where - S: OffsetSizeTrait, -{ - sort_list_inner::(values, value_indices, null_indices, options, limit) -} - -fn sort_list_inner( - values: &dyn Array, - value_indices: Vec, - mut null_indices: Vec, - options: &SortOptions, - limit: Option, -) -> UInt32Array -where - S: OffsetSizeTrait, -{ - let mut valids: Vec<(u32, ArrayRef)> = values - .as_any() - .downcast_ref::() - .map_or_else( - || { - let values = as_generic_list_array::(values); - value_indices - .iter() - .copied() - .map(|index| (index, values.value(index as usize))) - .collect() - }, - |values| { - value_indices - .iter() - .copied() - .map(|index| (index, values.value(index as usize))) - .collect() - }, - ); - - let mut len = values.len(); - let descending = options.descending; - - if let Some(limit) = limit { - len = limit.min(len); - } - sort_valids_array(descending, &mut valids, &mut null_indices, len); - - let mut valid_indices: Vec = valids.iter().map(|tuple| tuple.0).collect(); - if options.nulls_first { - null_indices.append(&mut valid_indices); - null_indices.truncate(len); - UInt32Array::from(null_indices) - } else { - valid_indices.append(&mut null_indices); - valid_indices.truncate(len); - UInt32Array::from(valid_indices) - } -} - -fn sort_binary( - values: &dyn Array, - value_indices: Vec, - mut null_indices: Vec, - options: &SortOptions, - limit: Option, -) -> UInt32Array -where - S: OffsetSizeTrait, -{ - 
let mut valids: Vec<(u32, &[u8])> = values - .as_any() - .downcast_ref::() - .map_or_else( - || { - let values = as_generic_binary_array::(values); - value_indices - .iter() - .copied() - .map(|index| (index, values.value(index as usize))) - .collect() - }, - |values| { - value_indices - .iter() - .copied() - .map(|index| (index, values.value(index as usize))) - .collect() - }, - ); - - let mut len = values.len(); - let descending = options.descending; - - if let Some(limit) = limit { - len = limit.min(len); - } - - sort_valids(descending, &mut valids, len, cmp); - - let mut valid_indices: Vec = valids.iter().map(|tuple| tuple.0).collect(); - if options.nulls_first { - null_indices.append(&mut valid_indices); - null_indices.truncate(len); - UInt32Array::from(null_indices) - } else { - valid_indices.append(&mut null_indices); - valid_indices.truncate(len); - UInt32Array::from(valid_indices) - } -} - -/// Compare two `Array`s based on the ordering defined in [build_compare] -fn cmp_array(a: &dyn Array, b: &dyn Array) -> Ordering { - let cmp_op = build_compare(a, b).unwrap(); - let length = a.len().max(b.len()); - - for i in 0..length { - let result = cmp_op(i, i); - if result != Ordering::Equal { - return result; - } - } - Ordering::Equal -} - /// One column to be used in lexicographical sort #[derive(Clone, Debug)] pub struct SortColumn { @@ -1146,8 +713,10 @@ pub fn partial_sort(v: &mut [T], limit: usize, mut is_less: F) where F: FnMut(&T, &T) -> Ordering, { - let (before, _mid, _after) = v.select_nth_unstable_by(limit, &mut is_less); - before.sort_unstable_by(is_less); + if let Some(n) = limit.checked_sub(1) { + let (before, _mid, _after) = v.select_nth_unstable_by(n, &mut is_less); + before.sort_unstable_by(is_less); + } } type LexicographicalCompareItem<'a> = ( @@ -1228,42 +797,6 @@ impl LexicographicalComparator<'_> { } } -fn sort_valids( - descending: bool, - valids: &mut [(u32, T)], - len: usize, - mut cmp: impl FnMut(T, T) -> Ordering, -) where - T: ?Sized + Copy, -{ - let valids_len = valids.len(); - if !descending { - sort_unstable_by(valids, len.min(valids_len), |a, b| cmp(a.1, b.1)); - } else { - sort_unstable_by(valids, len.min(valids_len), |a, b| cmp(a.1, b.1).reverse()); - } -} - -fn sort_valids_array( - descending: bool, - valids: &mut [(u32, ArrayRef)], - nulls: &mut [T], - len: usize, -) { - let valids_len = valids.len(); - if !descending { - sort_unstable_by(valids, len.min(valids_len), |a, b| { - cmp_array(a.1.as_ref(), b.1.as_ref()) - }); - } else { - sort_unstable_by(valids, len.min(valids_len), |a, b| { - cmp_array(a.1.as_ref(), b.1.as_ref()).reverse() - }); - // reverse to keep a stable ordering - nulls.reverse(); - } -} - #[cfg(test)] mod tests { use super::*; @@ -1980,7 +1513,7 @@ mod tests { nulls_first: false, }), None, - vec![2, 3, 1, 4, 5, 0], + vec![2, 3, 1, 4, 0, 5], ); // boolean, descending, nulls first @@ -1991,7 +1524,7 @@ mod tests { nulls_first: true, }), None, - vec![5, 0, 2, 3, 1, 4], + vec![0, 5, 2, 3, 1, 4], ); // boolean, descending, nulls first, limit diff --git a/arrow/benches/sort_kernel.rs b/arrow/benches/sort_kernel.rs index 3a3ce4462dff..63e10e0528ba 100644 --- a/arrow/benches/sort_kernel.rs +++ b/arrow/benches/sort_kernel.rs @@ -67,23 +67,37 @@ fn bench_sort_to_indices(array: &dyn Array, limit: Option) { fn add_benchmark(c: &mut Criterion) { let arr = create_primitive_array::(2usize.pow(10), 0.0); - c.bench_function("sort i64 2^10", |b| b.iter(|| bench_sort(&arr))); - - let arr = create_primitive_array::(2usize.pow(12), 0.5); - 
c.bench_function("sort i64 2^12", |b| b.iter(|| bench_sort(&arr))); + c.bench_function("sort i32 2^10", |b| b.iter(|| bench_sort(&arr))); + c.bench_function("sort i32 to indices 2^10", |b| { + b.iter(|| bench_sort_to_indices(&arr, None)) + }); let arr = create_primitive_array::(2usize.pow(12), 0.0); - c.bench_function("sort i64 nulls 2^10", |b| b.iter(|| bench_sort(&arr))); + c.bench_function("sort i32 2^12", |b| b.iter(|| bench_sort(&arr))); + c.bench_function("sort i32 to indices 2^12", |b| { + b.iter(|| bench_sort_to_indices(&arr, None)) + }); + + let arr = create_primitive_array::(2usize.pow(10), 0.5); + c.bench_function("sort i32 nulls 2^10", |b| b.iter(|| bench_sort(&arr))); + c.bench_function("sort i32 nulls to indices 2^10", |b| { + b.iter(|| bench_sort_to_indices(&arr, None)) + }); let arr = create_primitive_array::(2usize.pow(12), 0.5); - c.bench_function("sort i64 nulls 2^12", |b| b.iter(|| bench_sort(&arr))); + c.bench_function("sort i32 nulls 2^12", |b| b.iter(|| bench_sort(&arr))); + c.bench_function("sort i32 nulls to indices 2^12", |b| { + b.iter(|| bench_sort_to_indices(&arr, None)) + }); let arr = create_f32_array(2_usize.pow(12), false); + c.bench_function("sort f32 2^12", |b| b.iter(|| bench_sort(&arr))); c.bench_function("sort f32 to indices 2^12", |b| { b.iter(|| bench_sort_to_indices(&arr, None)) }); let arr = create_f32_array(2usize.pow(12), true); + c.bench_function("sort f32 nulls 2^12", |b| b.iter(|| bench_sort(&arr))); c.bench_function("sort f32 nulls to indices 2^12", |b| { b.iter(|| bench_sort_to_indices(&arr, None)) }); From b15838cfc146b77127660c9ea68eec2816e93a74 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Fri, 4 Aug 2023 17:32:37 +0200 Subject: [PATCH 1127/1411] Implement `Default`,`Extend` and `FromIterator` for `BufferBuilder` (#4638) * Implement `Default`,`Extend` and `FromIterator` for `BufferBuilder` * Revert `BufferBuilder::append` to use `MutableBuffer::push` --- arrow-buffer/src/builder/mod.rs | 67 ++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/arrow-buffer/src/builder/mod.rs b/arrow-buffer/src/builder/mod.rs index 1d45bf40d2a2..464f9a202c03 100644 --- a/arrow-buffer/src/builder/mod.rs +++ b/arrow-buffer/src/builder/mod.rs @@ -23,7 +23,7 @@ mod null; pub use null::*; use crate::{ArrowNativeType, Buffer, MutableBuffer}; -use std::marker::PhantomData; +use std::{iter, marker::PhantomData}; /// Builder for creating a [Buffer] object. /// @@ -211,10 +211,7 @@ impl BufferBuilder { #[inline] pub fn append_n(&mut self, n: usize, v: T) { self.reserve(n); - for _ in 0..n { - self.buffer.push(v); - } - self.len += n; + self.extend(iter::repeat(v).take(n)) } /// Appends `n`, zero-initialized values @@ -336,10 +333,7 @@ impl BufferBuilder { .1 .expect("append_trusted_len_iter expects upper bound"); self.reserve(len); - for v in iter { - self.buffer.push(v) - } - self.len += len; + self.extend(iter); } /// Resets this builder and returns an immutable [Buffer]. 
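As a rough sketch of how the `Default`, `Extend` and `FromIterator` implementations
added in the next hunk can be used (the `arrow_buffer::builder` module path is assumed
here):

```rust
use arrow_buffer::builder::BufferBuilder;

fn main() {
    // Collect an iterator straight into a builder (FromIterator),
    // then keep appending to it through Extend before finishing.
    let mut builder: BufferBuilder<u32> = (0u32..4).collect();
    builder.extend([4u32, 5, 6]);

    let buffer = builder.finish();
    // Buffer::len is in bytes: 7 values * 4 bytes each
    assert_eq!(buffer.len(), 28);
}
```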
@@ -358,8 +352,61 @@ impl BufferBuilder { /// ``` #[inline] pub fn finish(&mut self) -> Buffer { - let buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); + let buf = std::mem::take(&mut self.buffer); self.len = 0; buf.into() } } + +impl Default for BufferBuilder { + fn default() -> Self { + Self::new(0) + } +} + +impl Extend for BufferBuilder { + fn extend>(&mut self, iter: I) { + self.buffer.extend(iter.into_iter().inspect(|_| { + self.len += 1; + })) + } +} + +impl FromIterator for BufferBuilder { + fn from_iter>(iter: I) -> Self { + let mut builder = Self::default(); + builder.extend(iter); + builder + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::mem; + + #[test] + fn default() { + let builder = BufferBuilder::::default(); + assert!(builder.is_empty()); + assert!(builder.buffer.is_empty()); + assert_eq!(builder.buffer.capacity(), 0); + } + + #[test] + fn from_iter() { + let input = [1u16, 2, 3, 4]; + let builder = input.into_iter().collect::>(); + assert_eq!(builder.len(), 4); + assert_eq!(builder.buffer.len(), 4 * mem::size_of::()); + } + + #[test] + fn extend() { + let input = [1, 2]; + let mut builder = input.into_iter().collect::>(); + assert_eq!(builder.len(), 2); + builder.extend([3, 4]); + assert_eq!(builder.len(), 4); + } +} From 273dcc18a8d2389e8429aa25e8530b0e9ac27b2a Mon Sep 17 00:00:00 2001 From: jakevin Date: Sat, 5 Aug 2023 03:09:13 +0800 Subject: [PATCH 1128/1411] refactor: from_thrift avoid panic (#4642) --- parquet/src/file/metadata.rs | 2 +- parquet/src/file/serialized_reader.rs | 4 ++-- parquet/src/file/statistics.rs | 28 +++++++++++++++------------ parquet/src/file/writer.rs | 6 ++++-- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 4cb2e9ab2a6a..b1a812c0ecd5 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -616,7 +616,7 @@ impl ColumnChunkMetaData { let data_page_offset = col_metadata.data_page_offset; let index_page_offset = col_metadata.index_page_offset; let dictionary_page_offset = col_metadata.dictionary_page_offset; - let statistics = statistics::from_thrift(column_type, col_metadata.statistics); + let statistics = statistics::from_thrift(column_type, col_metadata.statistics)?; let encoding_stats = col_metadata .encoding_stats .as_ref() diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index f685f14bd92f..629606e587d4 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -461,7 +461,7 @@ pub(crate) fn decode_page( encoding: Encoding::try_from(header.encoding)?, def_level_encoding: Encoding::try_from(header.definition_level_encoding)?, rep_level_encoding: Encoding::try_from(header.repetition_level_encoding)?, - statistics: statistics::from_thrift(physical_type, header.statistics), + statistics: statistics::from_thrift(physical_type, header.statistics)?, } } PageType::DATA_PAGE_V2 => { @@ -477,7 +477,7 @@ pub(crate) fn decode_page( def_levels_byte_len: header.definition_levels_byte_length as u32, rep_levels_byte_len: header.repetition_levels_byte_length as u32, is_compressed, - statistics: statistics::from_thrift(physical_type, header.statistics), + statistics: statistics::from_thrift(physical_type, header.statistics)?, } } _ => { diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 939ce037f968..b36e37a80c97 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -44,6 +44,7 @@ 
use crate::format::Statistics as TStatistics; use crate::basic::Type; use crate::data_type::private::ParquetValueType; use crate::data_type::*; +use crate::errors::{ParquetError, Result}; use crate::util::bit_util::from_le_slice; pub(crate) mod private { @@ -119,15 +120,18 @@ macro_rules! statistics_enum_func { pub fn from_thrift( physical_type: Type, thrift_stats: Option, -) -> Option { - match thrift_stats { +) -> Result> { + Ok(match thrift_stats { Some(stats) => { // Number of nulls recorded, when it is not available, we just mark it as 0. let null_count = stats.null_count.unwrap_or(0); - assert!( - null_count >= 0, - "Statistics null count is negative ({null_count})" - ); + + if null_count < 0 { + return Err(ParquetError::General(format!( + "Statistics null count is negative {}", + null_count + ))); + } // Generic null count. let null_count = null_count as u64; @@ -221,7 +225,7 @@ pub fn from_thrift( Some(res) } None => None, - } + }) } // Convert Statistics into Thrift definition. @@ -594,7 +598,7 @@ mod tests { } #[test] - #[should_panic(expected = "Statistics null count is negative (-10)")] + #[should_panic(expected = "General(\"Statistics null count is negative -10\")")] fn test_statistics_negative_null_count() { let thrift_stats = TStatistics { max: None, @@ -605,13 +609,13 @@ mod tests { min_value: None, }; - from_thrift(Type::INT32, Some(thrift_stats)); + from_thrift(Type::INT32, Some(thrift_stats)).unwrap(); } #[test] fn test_statistics_thrift_none() { - assert_eq!(from_thrift(Type::INT32, None), None); - assert_eq!(from_thrift(Type::BYTE_ARRAY, None), None); + assert_eq!(from_thrift(Type::INT32, None).unwrap(), None); + assert_eq!(from_thrift(Type::BYTE_ARRAY, None).unwrap(), None); } #[test] @@ -715,7 +719,7 @@ mod tests { fn check_stats(stats: Statistics) { let tpe = stats.physical_type(); let thrift_stats = to_thrift(Some(&stats)); - assert_eq!(from_thrift(tpe, thrift_stats), Some(stats)); + assert_eq!(from_thrift(tpe, thrift_stats).unwrap(), Some(stats)); } check_stats(Statistics::boolean(Some(false), Some(true), None, 7, true)); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 3b2dd8289455..7e1034ae7b63 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1135,7 +1135,8 @@ mod tests { statistics: from_thrift( physical_type, to_thrift(statistics.as_ref()), - ), + ) + .unwrap(), } } Page::DataPageV2 { @@ -1168,7 +1169,8 @@ mod tests { statistics: from_thrift( physical_type, to_thrift(statistics.as_ref()), - ), + ) + .unwrap(), } } Page::DictionaryPage { From 0aa49fc08bfc123d70cb16f78505be951c0449a9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 5 Aug 2023 14:45:13 +0100 Subject: [PATCH 1129/1411] Pin latest nightly (#4652) --- .github/workflows/arrow.yml | 6 ++++-- .github/workflows/docs.yml | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 279e276a7912..4872b66077f6 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -142,7 +142,8 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: - rust-version: nightly + # Pinned nightly (#4651) + rust-version: nightly-2023-08-03 - name: Test arrow-array with SIMD run: cargo test -p arrow-array --features simd - name: Test arrow-ord with SIMD @@ -168,7 +169,8 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: - rust-version: nightly + # Pinned nightly (#4651) 
+ rust-version: nightly-2023-08-03 target: wasm32-unknown-unknown,wasm32-wasi - name: Build wasm32-unknown-unknown run: cargo build -p arrow --no-default-features --features=json,csv,ipc,simd,ffi --target wasm32-unknown-unknown diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 4ca71f464591..2bdfeea2471e 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -37,7 +37,8 @@ jobs: strategy: matrix: arch: [ amd64 ] - rust: [ nightly ] + # Pinned nightly (#4651) + rust: [ nightly-2023-08-03 ] container: image: ${{ matrix.arch }}/rust env: From 6fd50329b1680bccd61b4de8de43ff67aa8c1a5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 5 Aug 2023 15:49:35 +0200 Subject: [PATCH 1130/1411] Filter record batch with 0 columns (#4648) * Filter empty record batch: * Fix title * Add doc comment --- arrow-select/src/filter.rs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 94afd2df376b..f2da79e243c8 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -187,8 +187,8 @@ pub fn filter_record_batch( .iter() .map(|a| filter_array(a, &filter)) .collect::, _>>()?; - - RecordBatch::try_new(record_batch.schema(), filtered_arrays) + let options = RecordBatchOptions::default().with_row_count(Some(filter.count())); + RecordBatch::try_new_with_options(record_batch.schema(), filtered_arrays, &options) } /// A builder to construct [`FilterPredicate`] @@ -301,6 +301,11 @@ impl FilterPredicate { pub fn filter(&self, values: &dyn Array) -> Result { filter_array(values, self) } + + /// Number of rows being selected based on this [`FilterPredicate`] + pub fn count(&self) -> usize { + self.count + } } fn filter_array( @@ -977,6 +982,21 @@ mod tests { assert_eq!(out.as_ref(), &a.slice(0, 2)); } + #[test] + fn test_filter_record_batch_no_columns() { + let pred = BooleanArray::from(vec![Some(true), Some(true), None]); + let options = RecordBatchOptions::default().with_row_count(Some(100)); + let record_batch = RecordBatch::try_new_with_options( + Arc::new(Schema::empty()), + vec![], + &options, + ) + .unwrap(); + let out = filter_record_batch(&record_batch, &pred).unwrap(); + + assert_eq!(out.num_rows(), 2); + } + #[test] fn test_fast_path() { let a: PrimitiveArray = From bcc04a40fdc8d24d201f82a53f79a7b80b3a74fb Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Sun, 6 Aug 2023 14:58:57 +0200 Subject: [PATCH 1131/1411] impl `From>` for `BufferBuilder` and `MutableBuffer` (#4650) * impl `From>` for `BufferBuilder` and `MutableBuffer` * Deprecate `MutableBuffer::from_vec` --- arrow-buffer/src/buffer/immutable.rs | 2 +- arrow-buffer/src/buffer/mutable.rs | 27 +++++++++++++++++---------- arrow-buffer/src/builder/mod.rs | 6 ++++++ 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 2ecd3b41913a..8296d3fbcc31 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -74,7 +74,7 @@ impl Buffer { /// Create a [`Buffer`] from the provided [`Vec`] without copying #[inline] pub fn from_vec(vec: Vec) -> Self { - MutableBuffer::from_vec(vec).into() + MutableBuffer::from(vec).into() } /// Initializes a [Buffer] from a slice of items. 
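The remaining hunks of this commit add the `From<Vec<T>>` impls themselves; a small sketch of the intended call sites follows. It is not part of the diff, and the byte lengths shown assume a 4-byte element type.

use arrow_buffer::{Buffer, MutableBuffer};

fn from_vec_without_copying() {
    // Illustrative only; not part of the surrounding patch.
    let v: Vec<u32> = vec![1, 2, 3, 4];
    // Replaces the now-deprecated MutableBuffer::from_vec
    let mutable = MutableBuffer::from(v.clone());
    // MutableBuffer lengths are in bytes
    assert_eq!(mutable.len(), 4 * std::mem::size_of::<u32>());
    // Buffer::from_vec is now implemented in terms of the same conversion
    let buffer = Buffer::from_vec(v);
    assert_eq!(buffer.len(), 16);
}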
diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 0177582b0b97..2c56f9a5b270 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -112,17 +112,9 @@ impl MutableBuffer { /// Create a [`MutableBuffer`] from the provided [`Vec`] without copying #[inline] + #[deprecated(note = "Use From>")] pub fn from_vec(vec: Vec) -> Self { - // Safety - // Vec::as_ptr guaranteed to not be null and ArrowNativeType are trivially transmutable - let data = unsafe { NonNull::new_unchecked(vec.as_ptr() as _) }; - let len = vec.len() * mem::size_of::(); - // Safety - // Vec guaranteed to have a valid layout matching that of `Layout::array` - // This is based on `RawVec::current_memory` - let layout = unsafe { Layout::array::(vec.capacity()).unwrap_unchecked() }; - mem::forget(vec); - Self { data, len, layout } + Self::from(vec) } /// Allocates a new [MutableBuffer] from given `Bytes`. @@ -502,6 +494,21 @@ impl Extend for MutableBuffer { } } +impl From> for MutableBuffer { + fn from(value: Vec) -> Self { + // Safety + // Vec::as_ptr guaranteed to not be null and ArrowNativeType are trivially transmutable + let data = unsafe { NonNull::new_unchecked(value.as_ptr() as _) }; + let len = value.len() * mem::size_of::(); + // Safety + // Vec guaranteed to have a valid layout matching that of `Layout::array` + // This is based on `RawVec::current_memory` + let layout = unsafe { Layout::array::(value.capacity()).unwrap_unchecked() }; + mem::forget(value); + Self { data, len, layout } + } +} + impl MutableBuffer { #[inline] pub(super) fn extend_from_iter>( diff --git a/arrow-buffer/src/builder/mod.rs b/arrow-buffer/src/builder/mod.rs index 464f9a202c03..d5d5a7d3f18d 100644 --- a/arrow-buffer/src/builder/mod.rs +++ b/arrow-buffer/src/builder/mod.rs @@ -372,6 +372,12 @@ impl Extend for BufferBuilder { } } +impl From> for BufferBuilder { + fn from(value: Vec) -> Self { + Self::new_from_buffer(MutableBuffer::from(value)) + } +} + impl FromIterator for BufferBuilder { fn from_iter>(iter: I) -> Self { let mut builder = Self::default(); From 3b24ca1e624e3f76969532fbc6ca898f7d6fd8f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sun, 6 Aug 2023 15:10:09 +0200 Subject: [PATCH 1132/1411] Update packed_simd and run miri tests on simd code (#4654) * Update packed_simd and run miri tests on simd code * Unpin nightly version --- .github/workflows/arrow.yml | 6 ++---- .github/workflows/docs.yml | 3 +-- .github/workflows/miri.sh | 8 +++----- arrow-array/Cargo.toml | 2 +- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 4872b66077f6..279e276a7912 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -142,8 +142,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: - # Pinned nightly (#4651) - rust-version: nightly-2023-08-03 + rust-version: nightly - name: Test arrow-array with SIMD run: cargo test -p arrow-array --features simd - name: Test arrow-ord with SIMD @@ -169,8 +168,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: - # Pinned nightly (#4651) - rust-version: nightly-2023-08-03 + rust-version: nightly target: wasm32-unknown-unknown,wasm32-wasi - name: Build wasm32-unknown-unknown run: cargo build -p arrow --no-default-features --features=json,csv,ipc,simd,ffi --target wasm32-unknown-unknown diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 
2bdfeea2471e..4ca71f464591 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -37,8 +37,7 @@ jobs: strategy: matrix: arch: [ amd64 ] - # Pinned nightly (#4651) - rust: [ nightly-2023-08-03 ] + rust: [ nightly ] container: image: ${{ matrix.arch }}/rust env: diff --git a/.github/workflows/miri.sh b/.github/workflows/miri.sh index 3323bd0996bf..faf9f028d281 100755 --- a/.github/workflows/miri.sh +++ b/.github/workflows/miri.sh @@ -5,11 +5,7 @@ # Must be run with nightly rust for example # rustup default nightly - -# stacked borrows checking uses too much memory to run successfully in github actions -# re-enable if the CI is migrated to something more powerful (https://github.com/apache/arrow-rs/issues/1833) -# see also https://github.com/rust-lang/miri/issues/1367 -export MIRIFLAGS="-Zmiri-disable-isolation -Zmiri-disable-stacked-borrows" +export MIRIFLAGS="-Zmiri-disable-isolation" cargo miri setup cargo clean @@ -18,3 +14,5 @@ cargo miri test -p arrow-buffer cargo miri test -p arrow-data --features ffi cargo miri test -p arrow-schema --features ffi cargo miri test -p arrow-array +cargo miri test -p arrow-arith --features simd +cargo miri test -p arrow-ord --features simd diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 4236da6d656b..80a6eb3f541e 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -49,7 +49,7 @@ chrono-tz = { version = "0.8", optional = true } num = { version = "0.4.1", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", default-features = false } -packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } +packed_simd = { version = "0.3.9", default-features = false, optional = true } [features] simd = ["packed_simd"] From 1f466dc62c9ad2fbea206b2bfdec40ca783a9c33 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 7 Aug 2023 15:44:40 +0100 Subject: [PATCH 1133/1411] Support copy_if_not_exists for Cloudflare R2 (#4190) (#4239) * Support copy_if_not_exists for Cloudflare R2 (#4190) * Add tests --- object_store/src/aws/client.rs | 48 +++++++++++++++++++++---- object_store/src/aws/copy.rs | 66 ++++++++++++++++++++++++++++++++++ object_store/src/aws/mod.rs | 44 +++++++++++++++++++---- 3 files changed, 144 insertions(+), 14 deletions(-) create mode 100644 object_store/src/aws/copy.rs diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 188897620b91..1c35586f8bc9 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -17,7 +17,9 @@ use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt}; -use crate::aws::{AwsCredentialProvider, STORE, STRICT_PATH_ENCODE_SET}; +use crate::aws::{ + AwsCredentialProvider, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET, +}; use crate::client::get::GetClient; use crate::client::list::ListClient; use crate::client::list_response::ListResponse; @@ -37,7 +39,7 @@ use percent_encoding::{utf8_percent_encode, PercentEncode}; use quick_xml::events::{self as xml_events}; use reqwest::{ header::{CONTENT_LENGTH, CONTENT_TYPE}, - Client as ReqwestClient, Method, Response, + Client as ReqwestClient, Method, Response, StatusCode, }; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; @@ -206,6 +208,7 @@ pub struct S3Config { pub client_options: ClientOptions, pub sign_payload: bool, pub 
checksum: Option, + pub copy_if_not_exists: Option, } impl S3Config { @@ -424,14 +427,37 @@ impl S3Client { } /// Make an S3 Copy request - pub async fn copy_request(&self, from: &Path, to: &Path) -> Result<()> { + pub async fn copy_request( + &self, + from: &Path, + to: &Path, + overwrite: bool, + ) -> Result<()> { let credential = self.get_credential().await?; let url = self.config.path_url(to); let source = format!("{}/{}", self.config.bucket, encode_path(from)); - self.client + let mut builder = self + .client .request(Method::PUT, url) - .header("x-amz-copy-source", source) + .header("x-amz-copy-source", source); + + if !overwrite { + match &self.config.copy_if_not_exists { + Some(S3CopyIfNotExists::Header(k, v)) => { + builder = builder.header(k, v); + } + None => { + return Err(crate::Error::NotSupported { + source: "S3 does not support copy-if-not-exists" + .to_string() + .into(), + }) + } + } + } + + builder .with_aws_sigv4( credential.as_ref(), &self.config.region, @@ -441,8 +467,16 @@ impl S3Client { ) .send_retry(&self.config.retry_config) .await - .context(CopyRequestSnafu { - path: from.as_ref(), + .map_err(|source| match source.status() { + Some(StatusCode::PRECONDITION_FAILED) => crate::Error::AlreadyExists { + source: Box::new(source), + path: to.to_string(), + }, + _ => Error::CopyRequest { + source, + path: from.to_string(), + } + .into(), })?; Ok(()) diff --git a/object_store/src/aws/copy.rs b/object_store/src/aws/copy.rs new file mode 100644 index 000000000000..6b96f992cec5 --- /dev/null +++ b/object_store/src/aws/copy.rs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::config::Parse; + +/// Configure how to provide [`ObjectStore::copy_if_not_exists`] for [`AmazonS3`] +#[derive(Debug, Clone)] +#[non_exhaustive] +pub enum S3CopyIfNotExists { + /// Some S3-compatible stores, such as Cloudflare R2, support copy if not exists + /// semantics through custom headers. 
+ /// + /// If set, [`ObjectStore::copy_if_not_exists`] will perform a normal copy operation + /// with the provided header pair, and expect the store to fail with `412 Precondition Failed` + /// if the destination file already exists + /// + /// Encoded as `header::` ignoring whitespace + /// + /// For example `header: cf-copy-destination-if-none-match: *`, would set + /// the header `cf-copy-destination-if-none-match` to `*` + Header(String, String), +} + +impl std::fmt::Display for S3CopyIfNotExists { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Header(k, v) => write!(f, "header: {}: {}", k, v), + } + } +} + +impl S3CopyIfNotExists { + fn from_str(s: &str) -> Option { + let (variant, value) = s.split_once(':')?; + match variant.trim() { + "header" => { + let (k, v) = value.split_once(':')?; + Some(Self::Header(k.trim().to_string(), v.trim().to_string())) + } + _ => None, + } + } +} + +impl Parse for S3CopyIfNotExists { + fn parse(v: &str) -> crate::Result { + Self::from_str(v).ok_or_else(|| crate::Error::Generic { + store: "Config", + source: format!("Failed to parse \"{v}\" as S3CopyIfNotExists").into(), + }) + } +} diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index f6066d45a72c..7e16b5a1baf6 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -44,7 +44,6 @@ use tokio::io::AsyncWrite; use tracing::info; use url::Url; -pub use crate::aws::checksum::Checksum; use crate::aws::client::{S3Client, S3Config}; use crate::aws::credential::{ InstanceCredentialProvider, TaskCredentialProvider, WebIdentityProvider, @@ -64,8 +63,12 @@ use crate::{ mod checksum; mod client; +mod copy; mod credential; +pub use checksum::Checksum; +pub use copy::S3CopyIfNotExists; + // http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html // // Do not URI-encode any of the unreserved characters that RFC 3986 defines: @@ -292,12 +295,11 @@ impl ObjectStore for AmazonS3 { } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - self.client.copy_request(from, to).await + self.client.copy_request(from, to, true).await } - async fn copy_if_not_exists(&self, _source: &Path, _dest: &Path) -> Result<()> { - // Will need dynamodb_lock - Err(crate::Error::NotImplemented) + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.client.copy_request(from, to, false).await } } @@ -390,6 +392,8 @@ pub struct AmazonS3Builder { client_options: ClientOptions, /// Credentials credentials: Option, + /// Copy if not exists + copy_if_not_exists: Option>, } /// Configuration keys for [`AmazonS3Builder`] @@ -521,6 +525,11 @@ pub enum AmazonS3ConfigKey { /// ContainerCredentialsRelativeUri, + /// Configure how to provide [`ObjectStore::copy_if_not_exists`] + /// + /// See [`S3CopyIfNotExists`] + CopyIfNotExists, + /// Client options Client(ClientConfigKey), } @@ -543,6 +552,7 @@ impl AsRef for AmazonS3ConfigKey { Self::ContainerCredentialsRelativeUri => { "aws_container_credentials_relative_uri" } + Self::CopyIfNotExists => "copy_if_not_exists", Self::Client(opt) => opt.as_ref(), } } @@ -576,6 +586,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_container_credentials_relative_uri" => { Ok(Self::ContainerCredentialsRelativeUri) } + "copy_if_not_exists" => Ok(Self::CopyIfNotExists), // Backwards compatibility "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), _ => match s.parse() { @@ -686,6 +697,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::Client(key) => { 
self.client_options = self.client_options.with_config(key, value) } + AmazonS3ConfigKey::CopyIfNotExists => { + self.copy_if_not_exists = Some(ConfigValue::Deferred(value.into())) + } }; self } @@ -753,6 +767,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { self.container_credentials_relative_uri.clone() } + AmazonS3ConfigKey::CopyIfNotExists => { + self.copy_if_not_exists.as_ref().map(ToString::to_string) + } } } @@ -935,6 +952,12 @@ impl AmazonS3Builder { self } + /// Configure how to provide [`ObjectStore::copy_if_not_exists`] + pub fn with_copy_if_not_exists(mut self, config: S3CopyIfNotExists) -> Self { + self.copy_if_not_exists = Some(config.into()); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. pub fn build(mut self) -> Result { @@ -945,6 +968,7 @@ impl AmazonS3Builder { let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; let region = self.region.context(MissingRegionSnafu)?; let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; + let copy_if_not_exists = self.copy_if_not_exists.map(|x| x.get()).transpose()?; let credentials = if let Some(credentials) = self.credentials { credentials @@ -1050,6 +1074,7 @@ impl AmazonS3Builder { client_options: self.client_options, sign_payload: !self.unsigned_payload.get()?, checksum, + copy_if_not_exists, }; let client = Arc::new(S3Client::new(config)?); @@ -1062,8 +1087,9 @@ impl AmazonS3Builder { mod tests { use super::*; use crate::tests::{ - get_nonexistent_object, get_opts, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list_opts, rename_and_copy, stream_get, + copy_if_not_exists, get_nonexistent_object, get_opts, + list_uses_directories_correctly, list_with_delimiter, put_get_delete_list_opts, + rename_and_copy, stream_get, }; use bytes::Bytes; use std::collections::HashMap; @@ -1164,6 +1190,7 @@ mod tests { let config = AmazonS3Builder::from_env(); let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); + let test_not_exists = config.copy_if_not_exists.is_some(); let integration = config.build().unwrap(); // Localstack doesn't support listing with spaces https://github.com/localstack/localstack/issues/6328 @@ -1173,6 +1200,9 @@ mod tests { list_with_delimiter(&integration).await; rename_and_copy(&integration).await; stream_get(&integration).await; + if test_not_exists { + copy_if_not_exists(&integration).await; + } // run integration test with unsigned payload enabled let config = AmazonS3Builder::from_env().with_unsigned_payload(true); From ab8d918cd52ae3df6afa2dc5fef8bd86e764a593 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Mon, 7 Aug 2023 09:22:20 -0700 Subject: [PATCH 1134/1411] Fix illustration for dict encoding (#4657) --- arrow-row/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 396f09380af7..3cd082c51165 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -305,7 +305,7 @@ mod variable; /// /// ```text /// ┌─────┬─────┬─────┬─────┐ -/// "Fabulous" │ 01 │ 03 │ 05 │ 00 │ +/// "Fabulous" │ 01 │ 01 │ 02 │ 00 │ /// └─────┴─────┴─────┴─────┘ /// /// ┌─────┬─────┬─────┐ From f16ceedf981c7230167ff858eabb9a8cdc87ffda Mon Sep 17 00:00:00 2001 From: jakevin Date: Tue, 8 Aug 2023 00:59:09 +0800 Subject: [PATCH 1135/1411] minor: move comment to the correct location (#4655) --- arrow-array/src/record_batch.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 3134c9ecbd14..80c0e4b96741 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -158,7 +158,6 @@ impl RecordBatch { ))); } - // check that all columns have the same row count let row_count = options .row_count .or_else(|| columns.first().map(|col| col.len())) @@ -177,6 +176,7 @@ impl RecordBatch { } } + // check that all columns have the same row count if columns.iter().any(|c| c.len() != row_count) { let err = match options.row_count { Some(_) => { From 50f161eafb4062ffa13e3399c49d8f98e8dbfb6d Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 7 Aug 2023 15:16:23 -0700 Subject: [PATCH 1136/1411] fix ownership of c stream error (#4660) * fix ownership of c stream error * add pyarrow integration test --- arrow-pyarrow-integration-testing/src/lib.rs | 13 ++++ .../tests/test_sql.py | 12 +++ arrow/src/ffi_stream.rs | 75 ++++++++++++++----- 3 files changed, 81 insertions(+), 19 deletions(-) diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index 89395bd2ed08..adcec769f247 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -21,6 +21,7 @@ use std::sync::Arc; use arrow::array::new_empty_array; +use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::wrap_pyfunction; @@ -140,6 +141,17 @@ fn round_trip_record_batch_reader( Ok(obj) } +#[pyfunction] +fn reader_return_errors(obj: PyArrowType) -> PyResult<()> { + // This makes sure we can correctly consume a RBR and return the error, + // ensuring the error can live beyond the lifetime of the RBR. + let batches = obj.0.collect::, ArrowError>>(); + match batches { + Ok(_) => Ok(()), + Err(err) => Err(PyValueError::new_err(err.to_string())), + } +} + #[pymodule] fn arrow_pyarrow_integration_testing(_py: Python, m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(double))?; @@ -153,5 +165,6 @@ fn arrow_pyarrow_integration_testing(_py: Python, m: &PyModule) -> PyResult<()> m.add_wrapped(wrap_pyfunction!(round_trip_array))?; m.add_wrapped(wrap_pyfunction!(round_trip_record_batch))?; m.add_wrapped(wrap_pyfunction!(round_trip_record_batch_reader))?; + m.add_wrapped(wrap_pyfunction!(reader_return_errors))?; Ok(()) } diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index a7c6b34a4474..92782b9ed473 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -409,6 +409,18 @@ def test_record_batch_reader(): got_batches = list(b) assert got_batches == batches +def test_record_batch_reader_error(): + schema = pa.schema([('ints', pa.list_(pa.int32()))]) + + def iter_batches(): + yield pa.record_batch([[[1], [2, 42]]], schema) + raise ValueError("test error") + + reader = pa.RecordBatchReader.from_batches(schema, iter_batches()) + + with pytest.raises(ValueError, match="test error"): + rust.reader_return_errors(reader) + def test_reject_other_classes(): # Arbitrary type that is not a PyArrow type not_pyarrow = ["hello"] diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index 7d6689a89058..a9d2e8ab6bf2 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -54,6 +54,7 @@ //! } //! 
``` +use std::ffi::CStr; use std::ptr::addr_of; use std::{ convert::TryFrom, @@ -120,7 +121,7 @@ unsafe extern "C" fn release_stream(stream: *mut FFI_ArrowArrayStream) { struct StreamPrivateData { batch_reader: Box, - last_error: String, + last_error: Option, } // The callback used to get array schema @@ -142,8 +143,12 @@ unsafe extern "C" fn get_next( // The callback used to get the error from last operation on the `FFI_ArrowArrayStream` unsafe extern "C" fn get_last_error(stream: *mut FFI_ArrowArrayStream) -> *const c_char { let mut ffi_stream = ExportedArrayStream { stream }; - let last_error = ffi_stream.get_last_error(); - CString::new(last_error.as_str()).unwrap().into_raw() + // The consumer should not take ownership of this string, we should return + // a const pointer to it. + match ffi_stream.get_last_error() { + Some(err_string) => err_string.as_ptr(), + None => std::ptr::null(), + } } impl Drop for FFI_ArrowArrayStream { @@ -160,7 +165,7 @@ impl FFI_ArrowArrayStream { pub fn new(batch_reader: Box) -> Self { let private_data = Box::new(StreamPrivateData { batch_reader, - last_error: String::new(), + last_error: None, }); Self { @@ -206,7 +211,10 @@ impl ExportedArrayStream { 0 } Err(ref err) => { - private_data.last_error = err.to_string(); + private_data.last_error = Some( + CString::new(err.to_string()) + .expect("Error string has a null byte in it."), + ); get_error_code(err) } } @@ -231,15 +239,18 @@ impl ExportedArrayStream { 0 } else { let err = &next_batch.unwrap_err(); - private_data.last_error = err.to_string(); + private_data.last_error = Some( + CString::new(err.to_string()) + .expect("Error string has a null byte in it."), + ); get_error_code(err) } } } } - pub fn get_last_error(&mut self) -> &String { - &self.get_private_data().last_error + pub fn get_last_error(&mut self) -> Option<&CString> { + self.get_private_data().last_error.as_ref() } } @@ -312,19 +323,15 @@ impl ArrowArrayStreamReader { /// Get the last error from `ArrowArrayStreamReader` fn get_stream_last_error(&mut self) -> Option { - self.stream.get_last_error?; - - let error_str = unsafe { - let c_str = - self.stream.get_last_error.unwrap()(&mut self.stream) as *mut c_char; - CString::from_raw(c_str).into_string() - }; + let get_last_error = self.stream.get_last_error?; - if let Err(err) = error_str { - Some(err.to_string()) - } else { - Some(error_str.unwrap()) + let error_str = unsafe { get_last_error(&mut self.stream) }; + if error_str.is_null() { + return None; } + + let error_str = unsafe { CStr::from_ptr(error_str) }; + Some(error_str.to_string_lossy().to_string()) } } @@ -381,6 +388,8 @@ pub unsafe fn export_reader_into_raw( #[cfg(test)] mod tests { + use arrow_schema::DataType; + use super::*; use crate::array::Int32Array; @@ -503,4 +512,32 @@ mod tests { _test_round_trip_import(vec![array.clone(), array.clone(), array]) } + + #[test] + fn test_error_import() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)])); + + let iter = + Box::new(vec![Err(ArrowError::MemoryError("".to_string()))].into_iter()); + + let reader = TestRecordBatchReader::new(schema.clone(), iter); + + // Import through `FFI_ArrowArrayStream` as `ArrowArrayStreamReader` + let stream = FFI_ArrowArrayStream::new(reader); + let stream_reader = ArrowArrayStreamReader::try_new(stream).unwrap(); + + let imported_schema = stream_reader.schema(); + assert_eq!(imported_schema, schema); + + let mut produced_batches = vec![]; + for batch in stream_reader { + produced_batches.push(batch); + } 
+ + // The results should outlive the lifetime of the stream itself. + assert_eq!(produced_batches.len(), 1); + assert!(produced_batches[0].is_err()); + + Ok(()) + } } From 696cbdbb72e821613f44a09e2b547d2f85c06089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 8 Aug 2023 08:01:52 +0200 Subject: [PATCH 1137/1411] Support `concat_batches` for 0 columns (#4662) * Support `concat_batches` for 0 columns * Add doc comment --- arrow-select/src/concat.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 0bf4c97ff827..31846ee1fdc3 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -97,6 +97,14 @@ pub fn concat_batches<'a>( schema: &SchemaRef, input_batches: impl IntoIterator, ) -> Result { + // When schema is empty, sum the number of the rows of all batches + if schema.fields().is_empty() { + let num_rows: usize = input_batches.into_iter().map(RecordBatch::num_rows).sum(); + let mut options = RecordBatchOptions::default(); + options.row_count = Some(num_rows); + return RecordBatch::try_new_with_options(schema.clone(), vec![], &options); + } + let batches: Vec<&RecordBatch> = input_batches.into_iter().collect(); if batches.is_empty() { return Ok(RecordBatch::new_empty(schema.clone())); @@ -142,6 +150,21 @@ mod tests { assert!(re.is_err()); } + #[test] + fn test_concat_batches_no_columns() { + // Test concat using empty schema / batches without columns + let schema = Arc::new(Schema::empty()); + + let mut options = RecordBatchOptions::default(); + options.row_count = Some(100); + let batch = + RecordBatch::try_new_with_options(schema.clone(), vec![], &options).unwrap(); + // put in 2 batches of 100 rows each + let re = concat_batches(&schema, &[batch.clone(), batch]).unwrap(); + + assert_eq!(re.num_rows(), 200); + } + #[test] fn test_concat_one_element_vec() { let arr = Arc::new(PrimitiveArray::::from(vec![ From eb8edc41220ca3db06079a7e44b5aab198b0df16 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Tue, 8 Aug 2023 18:17:43 +0800 Subject: [PATCH 1138/1411] =?UTF-8?q?bug:=20Add=20some=20missing=20field?= =?UTF-8?q?=20in=20row=20group=20metadata:=20ordinal,=20total=20co?= =?UTF-8?q?=E2=80=A6=20(#4636)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Add some missing field in row group metadata: ordinal, total compressed size, file_offset * make formatter happy * Add some test * fix bug * fix comments * fix comment * fix clippy --- parquet/src/file/metadata.rs | 37 +++++++++++++++++++++++++++++++++--- parquet/src/file/writer.rs | 16 ++++++++++++++++ 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index b1a812c0ecd5..b2a2b3eee531 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -279,6 +279,9 @@ pub struct RowGroupMetaData { sorting_columns: Option>, total_byte_size: i64, schema_descr: SchemaDescPtr, + // We can't infer from file offset of first column since there may empty columns in row group. + file_offset: Option, + ordinal: Option, } impl RowGroupMetaData { @@ -332,6 +335,18 @@ impl RowGroupMetaData { self.schema_descr.clone() } + /// Returns ordinal of this row group in file + #[inline(always)] + pub fn ordinal(&self) -> Option { + self.ordinal + } + + /// Returns file offset of this row group in file. 
+ #[inline(always)] + pub fn file_offset(&self) -> Option { + self.file_offset + } + /// Method to convert from Thrift. pub fn from_thrift( schema_descr: SchemaDescPtr, @@ -352,6 +367,8 @@ impl RowGroupMetaData { sorting_columns, total_byte_size, schema_descr, + file_offset: rg.file_offset, + ordinal: rg.ordinal, }) } @@ -362,9 +379,9 @@ impl RowGroupMetaData { total_byte_size: self.total_byte_size, num_rows: self.num_rows, sorting_columns: self.sorting_columns().cloned(), - file_offset: None, - total_compressed_size: None, - ordinal: None, + file_offset: self.file_offset(), + total_compressed_size: Some(self.compressed_size()), + ordinal: self.ordinal, } } @@ -383,9 +400,11 @@ impl RowGroupMetaDataBuilder { Self(RowGroupMetaData { columns: Vec::with_capacity(schema_descr.num_columns()), schema_descr, + file_offset: None, num_rows: 0, sorting_columns: None, total_byte_size: 0, + ordinal: None, }) } @@ -413,6 +432,17 @@ impl RowGroupMetaDataBuilder { self } + /// Sets ordinal for this row group. + pub fn set_ordinal(mut self, value: i16) -> Self { + self.0.ordinal = Some(value); + self + } + + pub fn set_file_offset(mut self, value: i64) -> Self { + self.0.file_offset = Some(value); + self + } + /// Builds row group metadata. pub fn build(self) -> Result { if self.0.schema_descr.num_columns() != self.0.columns.len() { @@ -968,6 +998,7 @@ mod tests { .set_num_rows(1000) .set_total_byte_size(2000) .set_column_metadata(columns) + .set_ordinal(1) .build() .unwrap(); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 7e1034ae7b63..c31b9dc47426 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -183,6 +183,8 @@ impl SerializedFileWriter { /// previous row group must be finalised and closed using `RowGroupWriter::close` method. pub fn next_row_group(&mut self) -> Result> { self.assert_previous_writer_closed()?; + let ordinal = self.row_group_index; + self.row_group_index += 1; let row_groups = &mut self.row_groups; @@ -204,6 +206,7 @@ impl SerializedFileWriter { self.descr.clone(), self.props.clone(), &mut self.buf, + ordinal as i16, Some(Box::new(on_close)), ); Ok(row_group_writer) @@ -409,6 +412,8 @@ pub struct SerializedRowGroupWriter<'a, W: Write> { bloom_filters: Vec>, column_indexes: Vec>, offset_indexes: Vec>, + row_group_index: i16, + file_offset: i64, on_close: Option>, } @@ -418,16 +423,22 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { /// - `schema_descr` - the schema to write /// - `properties` - writer properties /// - `buf` - the buffer to write data to + /// - `row_group_index` - row group index in this parquet file. + /// - `file_offset` - file offset of this row group in this parquet file. 
/// - `on_close` - an optional callback that will invoked on [`Self::close`] pub fn new( schema_descr: SchemaDescPtr, properties: WriterPropertiesPtr, buf: &'a mut TrackedWrite, + row_group_index: i16, on_close: Option>, ) -> Self { let num_columns = schema_descr.num_columns(); + let file_offset = buf.bytes_written() as i64; Self { buf, + row_group_index, + file_offset, on_close, total_rows_written: None, descr: schema_descr, @@ -603,6 +614,8 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { .set_total_byte_size(self.total_uncompressed_bytes) .set_num_rows(self.total_rows_written.unwrap_or(0) as i64) .set_sorting_columns(self.props.sorting_columns().cloned()) + .set_ordinal(self.row_group_index) + .set_file_offset(self.file_offset) .build()?; let metadata = Arc::new(row_group_metadata); @@ -1316,6 +1329,7 @@ mod tests { let mut rows: i64 = 0; for (idx, subset) in data.iter().enumerate() { + let row_group_file_offset = file_writer.buf.bytes_written(); let mut row_group_writer = file_writer.next_row_group().unwrap(); if let Some(mut writer) = row_group_writer.next_column().unwrap() { rows += writer @@ -1327,6 +1341,8 @@ mod tests { let last_group = row_group_writer.close().unwrap(); let flushed = file_writer.flushed_row_groups(); assert_eq!(flushed.len(), idx + 1); + assert_eq!(Some(idx as i16), last_group.ordinal()); + assert_eq!(Some(row_group_file_offset as i64), last_group.file_offset()); assert_eq!(flushed[idx].as_ref(), last_group.as_ref()); } let file_metadata = file_writer.close().unwrap(); From 97eba43b01b1e4363995ae0a6f9874b1482fb2fc Mon Sep 17 00:00:00 2001 From: jakevin Date: Wed, 9 Aug 2023 03:16:58 +0800 Subject: [PATCH 1139/1411] enhancement: batches_to_flight_data require a schema ref as param. (#4665) --- arrow-flight/examples/flight_sql_server.rs | 4 ++-- arrow-flight/src/utils.rs | 4 ++-- arrow-flight/tests/flight_sql_client_cli.rs | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index f717d9b621b2..08a36bc49ea8 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -196,9 +196,9 @@ impl FlightSqlService for FlightSqlServiceImpl { self.check_token(&request)?; let batch = Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; - let schema = (*batch.schema()).clone(); + let schema = batch.schema(); let batches = vec![batch]; - let flight_data = batches_to_flight_data(schema, batches) + let flight_data = batches_to_flight_data(schema.as_ref(), batches) .map_err(|e| status!("Could not convert batches", e))? 
.into_iter() .map(Ok); diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index ccf1e73866e1..8baf5ed7232a 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -147,11 +147,11 @@ pub fn ipc_message_from_arrow_schema( /// Convert `RecordBatch`es to wire protocol `FlightData`s pub fn batches_to_flight_data( - schema: Schema, + schema: &Schema, batches: Vec, ) -> Result, ArrowError> { let options = IpcWriteOptions::default(); - let schema_flight_data: FlightData = SchemaAsIpc::new(&schema, &options).into(); + let schema_flight_data: FlightData = SchemaAsIpc::new(schema, &options).into(); let mut dictionaries = vec![]; let mut flight_data = vec![]; diff --git a/arrow-flight/tests/flight_sql_client_cli.rs b/arrow-flight/tests/flight_sql_client_cli.rs index c4ae9280c898..912bcc75a9df 100644 --- a/arrow-flight/tests/flight_sql_client_cli.rs +++ b/arrow-flight/tests/flight_sql_client_cli.rs @@ -144,9 +144,9 @@ impl FlightSqlService for FlightSqlServiceImpl { "part_2" => batch.slice(2, 1), ticket => panic!("Invalid ticket: {ticket:?}"), }; - let schema = (*batch.schema()).clone(); + let schema = batch.schema(); let batches = vec![batch]; - let flight_data = batches_to_flight_data(schema, batches) + let flight_data = batches_to_flight_data(schema.as_ref(), batches) .unwrap() .into_iter() .map(Ok); From 0ded0ce1be928ac8a74ce8e791febda95e01c05e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 8 Aug 2023 15:35:19 -0500 Subject: [PATCH 1140/1411] Account for child `Bucket` size in OrderPreservingInterner (#4646) * Account for child buckets in OrderPreservingInterner * Add a test * Tweak * clipy --- arrow-row/src/interner.rs | 93 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/arrow-row/src/interner.rs b/arrow-row/src/interner.rs index 1c71b6a55217..fde9251952c0 100644 --- a/arrow-row/src/interner.rs +++ b/arrow-row/src/interner.rs @@ -343,8 +343,19 @@ impl Bucket { fn size(&self) -> usize { std::mem::size_of::() + self.slots.capacity() * std::mem::size_of::() + // and account for the size of any embedded buckets in the slots + + self.slot_child_bucket_size() + self.next.as_ref().map(|x| x.size()).unwrap_or_default() } + + /// returns the total size of any recursively allocated `Bucket`s + /// in self.slots. 
This does not include the size of the child Slot itself + fn slot_child_bucket_size(&self) -> usize { + self.slots + .iter() + .map(|slot| slot.child.as_ref().map(|x| x.size()).unwrap_or_default()) + .sum() + } } #[cfg(test)] @@ -427,4 +438,86 @@ mod tests { interner.normalized_key(interned[3]) < interner.normalized_key(interned[2]) ); } + + #[test] + fn test_intern_sizes() { + let mut interner = OrderPreservingInterner::default(); + + // Intern a 1K values each 8 bytes large + let num_items = 1000; + let mut values: Vec = (0..num_items).collect(); + values.reverse(); + + // intern these values 1 at a time (otherwise the interner + // will sort them first); + for v in values { + interner.intern([Some(v.to_be_bytes())]); + } + + let reported = interner.size(); + + // Figure out the expected size (this is a second + // implementation of size()) as a double check + let min_expected = BucketWalker::new() + .visit_bucket(interner.bucket.as_ref()) + .memory_estimate() + // hash table size + + interner.lookup.capacity() * std::mem::size_of::() + // key/value storage + + interner.keys.buffer_size() + + interner.values.buffer_size(); + + assert!( + reported > min_expected, + "reported size {reported} not larger than min expected size: {min_expected}" + ) + } + + // Walks over the buckets / slots counting counting them all + struct BucketWalker { + num_buckets: usize, + num_slots: usize, + } + + impl BucketWalker { + fn new() -> Self { + Self { + num_buckets: 0, + num_slots: 0, + } + } + + // recursively visit the bucket and any slots/buckets contained + fn visit_bucket(mut self, bucket: &Bucket) -> Self { + self.num_buckets += 1; + let acc = bucket + .slots + .iter() + .fold(self, |acc, slot| acc.visit_slot(slot)); + + if let Some(next) = bucket.next.as_ref() { + acc.visit_bucket(next.as_ref()) + } else { + acc + } + } + + // recursively visit slot and any slots/buckets + fn visit_slot(mut self, slot: &Slot) -> Self { + self.num_slots += 1; + if let Some(child) = slot.child.as_ref() { + self.visit_bucket(child.as_ref()) + } else { + self + } + } + + // estimate how much memory is used just for Buckets / Slots + // (an underestimate of the total memory used for the + // interner as it doesn't contain any actual values) + fn memory_estimate(self) -> usize { + self.num_buckets * std::mem::size_of::() + + self.num_slots * std::mem::size_of::() + } + } } From 946e17d792b2f67f4b7358fb8bd029e82b1074cf Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 9 Aug 2023 09:23:38 -0500 Subject: [PATCH 1141/1411] Minor: Improve API docs for FlightSQL metadata builders (#4667) --- arrow-flight/src/sql/metadata/mod.rs | 4 ++++ arrow-flight/src/sql/metadata/sql_info.rs | 4 ++-- arrow-flight/src/sql/metadata/xdbc_info.rs | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arrow-flight/src/sql/metadata/mod.rs b/arrow-flight/src/sql/metadata/mod.rs index 72c882f385d3..71551f1849ae 100644 --- a/arrow-flight/src/sql/metadata/mod.rs +++ b/arrow-flight/src/sql/metadata/mod.rs @@ -21,10 +21,14 @@ //! - [`GetCatalogsBuilder`] for building responses to [`CommandGetCatalogs`] queries. //! - [`GetDbSchemasBuilder`] for building responses to [`CommandGetDbSchemas`] queries. //! - [`GetTablesBuilder`]for building responses to [`CommandGetTables`] queries. +//! - [`SqlInfoDataBuilder`]for building responses to [`CommandGetSqlInfo`] queries. +//! - [`XdbcTypeInfoDataBuilder`]for building responses to [`CommandGetXdbcTypeInfo`] queries. //! //! [`CommandGetCatalogs`]: crate::sql::CommandGetCatalogs //! 
[`CommandGetDbSchemas`]: crate::sql::CommandGetDbSchemas //! [`CommandGetTables`]: crate::sql::CommandGetTables +//! [`CommandGetSqlInfo`]: crate::sql::CommandGetSqlInfo +//! [`CommandGetXdbcTypeInfo`]: crate::sql::CommandGetXdbcTypeInfo mod catalogs; mod db_schemas; diff --git a/arrow-flight/src/sql/metadata/sql_info.rs b/arrow-flight/src/sql/metadata/sql_info.rs index d0c9cedbcf7c..b37ac85308f4 100644 --- a/arrow-flight/src/sql/metadata/sql_info.rs +++ b/arrow-flight/src/sql/metadata/sql_info.rs @@ -334,8 +334,8 @@ impl SqlInfoUnionBuilder { /// [`CommandGetSqlInfo`] are metadata requests used by a Flight SQL /// server to communicate supported capabilities to Flight SQL clients. /// -/// Servers constuct - usually static - [`SqlInfoData`] via the [SqlInfoDataBuilder`], -/// and build responses by passing the [`GetSqlInfoBuilder`]. +/// Servers constuct - usually static - [`SqlInfoData`] via the [`SqlInfoDataBuilder`], +/// and build responses using [`CommandGetSqlInfo::into_builder`] #[derive(Debug, Clone, PartialEq)] pub struct SqlInfoDataBuilder { /// Use BTreeMap to ensure the values are sorted by value as diff --git a/arrow-flight/src/sql/metadata/xdbc_info.rs b/arrow-flight/src/sql/metadata/xdbc_info.rs index cecef1b49e8b..b70a3ce3cb3e 100644 --- a/arrow-flight/src/sql/metadata/xdbc_info.rs +++ b/arrow-flight/src/sql/metadata/xdbc_info.rs @@ -70,8 +70,8 @@ pub struct XdbcTypeInfo { /// [`CommandGetXdbcTypeInfo`] are metadata requests used by a Flight SQL /// server to communicate supported capabilities to Flight SQL clients. /// -/// Servers constuct - usually static - [`XdbcTypeInfoData`] via the [XdbcTypeInfoDataBuilder`], -/// and build responses by passing the [`GetXdbcTypeInfoBuilder`]. +/// Servers constuct - usually static - [`XdbcTypeInfoData`] via the [`XdbcTypeInfoDataBuilder`], +/// and build responses using [`CommandGetXdbcTypeInfo::into_builder`]. 
pub struct XdbcTypeInfoData { batch: RecordBatch, } From 92d8ee682fa8a8d0afc053a871565e463646811d Mon Sep 17 00:00:00 2001 From: Faiaz Sanaulla <105630300+fsdvh@users.noreply.github.com> Date: Wed, 9 Aug 2023 17:10:02 +0200 Subject: [PATCH 1142/1411] More intuitive bool-to-string casting (#4666) * use more intuitive bool to string casting * tests --- arrow-cast/src/cast.rs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index e7ca2d0ed4ca..c730452a8da5 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -1220,7 +1220,7 @@ pub fn cast_with_options( Ok(Arc::new( array .iter() - .map(|value| value.map(|value| if value { "1" } else { "0" })) + .map(|value| value.map(|value| if value { "true" } else { "false" })) .collect::(), )) } @@ -1229,7 +1229,7 @@ pub fn cast_with_options( Ok(Arc::new( array .iter() - .map(|value| value.map(|value| if value { "1" } else { "0" })) + .map(|value| value.map(|value| if value { "true" } else { "false" })) .collect::(), )) } @@ -4763,6 +4763,26 @@ mod tests { assert!(!c.is_valid(2)); } + #[test] + fn test_cast_bool_to_utf8() { + let array = BooleanArray::from(vec![Some(true), Some(false), None]); + let b = cast(&array, &DataType::Utf8).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!("true", c.value(0)); + assert_eq!("false", c.value(1)); + assert!(!c.is_valid(2)); + } + + #[test] + fn test_cast_bool_to_large_utf8() { + let array = BooleanArray::from(vec![Some(true), Some(false), None]); + let b = cast(&array, &DataType::LargeUtf8).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!("true", c.value(0)); + assert_eq!("false", c.value(1)); + assert!(!c.is_valid(2)); + } + #[test] fn test_cast_bool_to_f64() { let array = BooleanArray::from(vec![Some(true), Some(false), None]); From 5023ea8438e3143bf711a89a3a2ffb8838a18e9e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 9 Aug 2023 17:53:29 +0100 Subject: [PATCH 1143/1411] Fix MSRV CI (#4671) --- .github/workflows/rust.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 6b316fd6bc43..c1c7f4d90fd4 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -121,6 +121,10 @@ jobs: - name: Check arrow-flight working-directory: arrow-flight run: cargo msrv verify + - name: Downgrade object_store dependencies + working-directory: object_store + # Necessary because 1.30.0 updates MSRV to 1.63 + run: cargo update -p tokio --precise 1.29.1 - name: Check object_store working-directory: object_store run: cargo msrv verify From cefb8c1bbb2807fbb420e62f108676eeb80ec198 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 9 Aug 2023 22:30:12 +0100 Subject: [PATCH 1144/1411] Use ArrayFormatter in cast kernel (#4668) * Use ArrayFormatter in cast kernel * Add test * Clippy --- arrow-cast/src/cast.rs | 86 +++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 55 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index c730452a8da5..c7fd082de2e6 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -41,7 +41,7 @@ use chrono::{NaiveTime, Offset, TimeZone, Utc}; use std::cmp::Ordering; use std::sync::Arc; -use crate::display::{array_value_to_string, ArrayFormatter, FormatOptions}; +use crate::display::{ArrayFormatter, FormatOptions}; 
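The hunks that follow route the cast kernel's string conversion through `ArrayFormatter`. As a rough sketch (not part of the diff), the formatter is used from the public `arrow_cast` API roughly like this; the array contents are illustrative assumptions.

use arrow_array::{Array, Int32Array};
use arrow_cast::display::{ArrayFormatter, FormatOptions};

fn format_values() {
    // Illustrative only; not part of the surrounding patch.
    let arr = Int32Array::from(vec![Some(1), None, Some(3)]);
    let options = FormatOptions::new().with_null("null");
    let formatter = ArrayFormatter::try_new(&arr, &options).unwrap();
    let rendered: Vec<String> =
        (0..arr.len()).map(|i| formatter.value(i).to_string()).collect();
    assert_eq!(rendered, ["1", "null", "3"]);
}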
use crate::parse::{ parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month, string_to_datetime, Parser, @@ -622,21 +622,6 @@ where Ok(Arc::new(array)) } -// cast the List array to Utf8 array -macro_rules! cast_list_to_string { - ($ARRAY:expr, $SIZE:ident) => {{ - let mut value_builder: GenericStringBuilder<$SIZE> = GenericStringBuilder::new(); - for i in 0..$ARRAY.len() { - if $ARRAY.is_null(i) { - value_builder.append_null(); - } else { - value_builder.append_value(array_value_to_string($ARRAY, i)?); - } - } - Ok(Arc::new(value_builder.finish())) - }}; -} - fn make_timestamp_array( array: &PrimitiveArray, unit: TimeUnit, @@ -800,8 +785,8 @@ pub fn cast_with_options( } } (List(_) | LargeList(_), _) => match to_type { - Utf8 => cast_list_to_string!(array, i32), - LargeUtf8 => cast_list_to_string!(array, i64), + Utf8 => value_to_string::(array, cast_options), + LargeUtf8 => value_to_string::(array, cast_options), _ => Err(ArrowError::CastError( "Cannot cast list to non-list data types".to_string(), )), @@ -924,8 +909,8 @@ pub fn cast_with_options( x as f64 / 10_f64.powi(*scale as i32) }) } - Utf8 => value_to_string::(array, Some(&cast_options.format_options)), - LargeUtf8 => value_to_string::(array, Some(&cast_options.format_options)), + Utf8 => value_to_string::(array, cast_options), + LargeUtf8 => value_to_string::(array, cast_options), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported" @@ -993,8 +978,8 @@ pub fn cast_with_options( x.to_f64().unwrap() / 10_f64.powi(*scale as i32) }) } - Utf8 => value_to_string::(array, Some(&cast_options.format_options)), - LargeUtf8 => value_to_string::(array, Some(&cast_options.format_options)), + Utf8 => value_to_string::(array, cast_options), + LargeUtf8 => value_to_string::(array, cast_options), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported" @@ -1215,24 +1200,8 @@ pub fn cast_with_options( Float16 => cast_bool_to_numeric::(array, cast_options), Float32 => cast_bool_to_numeric::(array, cast_options), Float64 => cast_bool_to_numeric::(array, cast_options), - Utf8 => { - let array = array.as_any().downcast_ref::().unwrap(); - Ok(Arc::new( - array - .iter() - .map(|value| value.map(|value| if value { "true" } else { "false" })) - .collect::(), - )) - } - LargeUtf8 => { - let array = array.as_any().downcast_ref::().unwrap(); - Ok(Arc::new( - array - .iter() - .map(|value| value.map(|value| if value { "true" } else { "false" })) - .collect::(), - )) - } + Utf8 => value_to_string::(array, cast_options), + LargeUtf8 => value_to_string::(array, cast_options), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), @@ -1374,8 +1343,8 @@ pub fn cast_with_options( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, - (from_type, LargeUtf8) if from_type.is_primitive() => value_to_string::(array, Some(&cast_options.format_options)), - (from_type, Utf8) if from_type.is_primitive() => value_to_string::(array, Some(&cast_options.format_options)), + (from_type, LargeUtf8) if from_type.is_primitive() => value_to_string::(array, cast_options), + (from_type, Utf8) if from_type.is_primitive() => value_to_string::(array, cast_options), // start numeric casts (UInt8, UInt16) => { cast_numeric_arrays::(array, cast_options) @@ -2461,14 +2430,10 @@ where fn value_to_string( array: &dyn Array, - 
options: Option<&FormatOptions>, + options: &CastOptions, ) -> Result { let mut builder = GenericStringBuilder::::new(); - let mut fmt_options = &FormatOptions::default(); - if let Some(fmt_opts) = options { - fmt_options = fmt_opts; - }; - let formatter = ArrayFormatter::try_new(array, fmt_options)?; + let formatter = ArrayFormatter::try_new(array, &options.format_options)?; let nulls = array.nulls(); for i in 0..array.len() { match nulls.map(|x| x.is_null(i)).unwrap_or_default() { @@ -7369,14 +7334,10 @@ mod tests { /// Print the `DictionaryArray` `array` as a vector of strings fn array_to_strings(array: &ArrayRef) -> Vec { + let options = FormatOptions::new().with_null("null"); + let formatter = ArrayFormatter::try_new(array.as_ref(), &options).unwrap(); (0..array.len()) - .map(|i| { - if array.is_null(i) { - "null".to_string() - } else { - array_value_to_string(array, i).expect("Convert array to String") - } - }) + .map(|i| formatter.value(i).to_string()) .collect() } @@ -8989,4 +8950,19 @@ mod tests { fn test_const_options() { assert!(CAST_OPTIONS.safe) } + + #[test] + fn test_list_format_options() { + let options = CastOptions { + safe: false, + format_options: FormatOptions::default().with_null("null"), + }; + let array = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(0), Some(1), Some(2)]), + Some(vec![Some(0), None, Some(2)]), + ]); + let a = cast_with_options(&array, &DataType::Utf8, &options).unwrap(); + let r: Vec<_> = a.as_string::().iter().map(|x| x.unwrap()).collect(); + assert_eq!(r, &["[0, 1, 2]", "[0, null, 2]"]); + } } From 2139fa5317cc60fb91781238938a199a3a0f1a01 Mon Sep 17 00:00:00 2001 From: Ze'ev Maor Date: Thu, 10 Aug 2023 10:11:29 +0300 Subject: [PATCH 1145/1411] cleanup some assert() with error propagation (#4673) * cleanup some assert() with error propagation * cleanup some assert() with error propagation --------- Co-authored-by: Ze'ev Maor --- parquet/src/file/serialized_reader.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 629606e587d4..f8716359e045 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -442,8 +442,10 @@ pub(crate) fn decode_page( let result = match page_header.type_ { PageType::DICTIONARY_PAGE => { - assert!(page_header.dictionary_page_header.is_some()); - let dict_header = page_header.dictionary_page_header.as_ref().unwrap(); + let dict_header = + page_header.dictionary_page_header.as_ref().ok_or_else(|| { + ParquetError::General("Missing dictionary page header".to_string()) + })?; let is_sorted = dict_header.is_sorted.unwrap_or(false); Page::DictionaryPage { buf: buffer, @@ -453,8 +455,9 @@ pub(crate) fn decode_page( } } PageType::DATA_PAGE => { - assert!(page_header.data_page_header.is_some()); - let header = page_header.data_page_header.unwrap(); + let header = page_header.data_page_header.ok_or_else(|| { + ParquetError::General("Missing V1 data page header".to_string()) + })?; Page::DataPage { buf: buffer, num_values: header.num_values as u32, @@ -465,8 +468,9 @@ pub(crate) fn decode_page( } } PageType::DATA_PAGE_V2 => { - assert!(page_header.data_page_header_v2.is_some()); - let header = page_header.data_page_header_v2.unwrap(); + let header = page_header.data_page_header_v2.ok_or_else(|| { + ParquetError::General("Missing V2 data page header".to_string()) + })?; let is_compressed = header.is_compressed.unwrap_or(true); Page::DataPageV2 { buf: buffer, From 
a41248ad8d00e2a5b06fb83b089b39d6fb9ff305 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 10 Aug 2023 14:11:18 +0100 Subject: [PATCH 1146/1411] Fix equality of nested nullable FixedSizeBinary (#4637) (#4670) --- arrow-data/src/equal/fixed_binary.rs | 2 +- arrow/tests/array_equal.rs | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/arrow-data/src/equal/fixed_binary.rs b/arrow-data/src/equal/fixed_binary.rs index 9e0e77ff7eca..40dacdddd3a0 100644 --- a/arrow-data/src/equal/fixed_binary.rs +++ b/arrow-data/src/equal/fixed_binary.rs @@ -80,7 +80,7 @@ pub(super) fn fixed_binary_equal( lhs_start + lhs_nulls.offset(), len, ); - let rhs_nulls = lhs.nulls().unwrap(); + let rhs_nulls = rhs.nulls().unwrap(); let rhs_slices_iter = BitSliceIterator::new( rhs_nulls.validity(), rhs_start + rhs_nulls.offset(), diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index 83a280db67b8..4abe31a36cf5 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -1295,3 +1295,25 @@ fn test_struct_equal_slice() { test_equal(&a, &b, true); } + +#[test] +fn test_list_excess_children_equal() { + let mut a = ListBuilder::new(FixedSizeBinaryBuilder::new(5)); + a.values().append_value(b"11111").unwrap(); // Masked value + a.append_null(); + a.values().append_value(b"22222").unwrap(); + a.values().append_null(); + a.append(true); + let a = a.finish(); + + let mut b = ListBuilder::new(FixedSizeBinaryBuilder::new(5)); + b.append_null(); + b.values().append_value(b"22222").unwrap(); + b.values().append_null(); + b.append(true); + let b = b.finish(); + + assert_eq!(a.value_offsets(), &[0, 1, 3]); + assert_eq!(b.value_offsets(), &[0, 0, 2]); + assert_eq!(a, b); +} From ea19ce86d6c12f837eecc901cc66f7dd96883d48 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 10 Aug 2023 17:40:49 +0100 Subject: [PATCH 1147/1411] Separate metadata fetch from `ArrowReaderBuilder` construction (#4674) (#4676) * Separate metadata fetch from builder construction (#4674) * Clippy * Docs tweaks * Wrap ParquetField in Arc * Move load to ArrowReaderMetadata --- parquet/src/arrow/array_reader/mod.rs | 43 ----- parquet/src/arrow/arrow_reader/mod.rs | 262 +++++++++++++++++++++----- parquet/src/arrow/async_reader/mod.rs | 67 ++++--- parquet/src/file/metadata.rs | 4 +- parquet/src/file/serialized_reader.rs | 5 - 5 files changed, 265 insertions(+), 116 deletions(-) diff --git a/parquet/src/arrow/array_reader/mod.rs b/parquet/src/arrow/array_reader/mod.rs index 1e781fb73ce5..625ac034ef47 100644 --- a/parquet/src/arrow/array_reader/mod.rs +++ b/parquet/src/arrow/array_reader/mod.rs @@ -118,49 +118,6 @@ impl RowGroups for Arc { } } -pub(crate) struct FileReaderRowGroups { - /// The underling file reader - reader: Arc, - /// Optional list of row group indices to scan - row_groups: Option>, -} - -impl FileReaderRowGroups { - /// Creates a new [`RowGroups`] from a `FileReader` and an optional - /// list of row group indexes to scan - pub fn new(reader: Arc, row_groups: Option>) -> Self { - Self { reader, row_groups } - } -} - -impl RowGroups for FileReaderRowGroups { - fn num_rows(&self) -> usize { - match &self.row_groups { - None => self.reader.metadata().file_metadata().num_rows() as usize, - Some(row_groups) => { - let meta = self.reader.metadata().row_groups(); - row_groups - .iter() - .map(|x| meta[*x].num_rows() as usize) - .sum() - } - } - } - - fn column_chunks(&self, i: 
usize) -> Result> { - let iterator = match &self.row_groups { - Some(row_groups) => FilePageIterator::with_row_groups( - i, - Box::new(row_groups.clone().into_iter()), - Arc::clone(&self.reader), - )?, - None => FilePageIterator::new(i, Arc::clone(&self.reader))?, - }; - - Ok(Box::new(iterator)) - } -} - /// Uses `record_reader` to read up to `batch_size` records from `pages` /// /// Returns the number of records read, which can be less than `batch_size` if diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 988738dac6ac..7e4423b86423 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -26,19 +26,21 @@ use arrow_array::{RecordBatch, RecordBatchReader}; use arrow_schema::{ArrowError, DataType as ArrowType, Schema, SchemaRef}; use arrow_select::filter::prep_null_mask_filter; -use crate::arrow::array_reader::{build_array_reader, ArrayReader, FileReaderRowGroups}; +use crate::arrow::array_reader::{build_array_reader, ArrayReader}; use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField}; use crate::arrow::{FieldLevels, ProjectionMask}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ParquetMetaData; -use crate::file::reader::{ChunkReader, SerializedFileReader}; -use crate::file::serialized_reader::ReadOptionsBuilder; +use crate::file::reader::{ChunkReader, SerializedPageReader}; use crate::schema::types::SchemaDescriptor; mod filter; mod selection; pub use crate::arrow::array_reader::RowGroups; +use crate::column::page::{PageIterator, PageReader}; +use crate::file::footer; +use crate::file::page_index::index_reader; pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter}; pub use selection::{RowSelection, RowSelector}; @@ -57,7 +59,7 @@ pub struct ArrowReaderBuilder { pub(crate) schema: SchemaRef, - pub(crate) fields: Option, + pub(crate) fields: Option>, pub(crate) batch_size: usize, @@ -75,27 +77,12 @@ pub struct ArrowReaderBuilder { } impl ArrowReaderBuilder { - pub(crate) fn new_builder( - input: T, - metadata: Arc, - options: ArrowReaderOptions, - ) -> Result { - let kv_metadata = match options.skip_arrow_metadata { - true => None, - false => metadata.file_metadata().key_value_metadata(), - }; - - let (schema, fields) = parquet_to_arrow_schema_and_fields( - metadata.file_metadata().schema_descr(), - ProjectionMask::all(), - kv_metadata, - )?; - - Ok(Self { + pub(crate) fn new_builder(input: T, metadata: ArrowReaderMetadata) -> Self { + Self { input, - metadata, - schema: Arc::new(schema), - fields, + metadata: metadata.metadata, + schema: metadata.schema, + fields: metadata.fields, batch_size: 1024, row_groups: None, projection: ProjectionMask::all(), @@ -103,7 +90,7 @@ impl ArrowReaderBuilder { selection: None, limit: None, offset: None, - }) + } } /// Returns a reference to the [`ParquetMetaData`] for this parquet file @@ -234,48 +221,184 @@ impl ArrowReaderOptions { } } +/// The cheaply clone-able metadata necessary to construct a [`ArrowReaderBuilder`] +/// +/// This allows loading the metadata for a file once and then using this to construct +/// multiple separate readers, for example, to distribute readers across multiple threads +#[derive(Debug, Clone)] +pub struct ArrowReaderMetadata { + pub(crate) metadata: Arc, + + pub(crate) schema: SchemaRef, + + pub(crate) fields: Option>, +} + +impl ArrowReaderMetadata { + /// Loads [`ArrowReaderMetadata`] from the provided [`ChunkReader`] + /// + /// See 
[`ParquetRecordBatchReaderBuilder::new_with_metadata`] for how this can be used + pub fn load(reader: &T, options: ArrowReaderOptions) -> Result { + let mut metadata = footer::parse_metadata(reader)?; + if options.page_index { + let column_index = metadata + .row_groups() + .iter() + .map(|rg| index_reader::read_columns_indexes(reader, rg.columns())) + .collect::>>()?; + metadata.set_column_index(Some(column_index)); + + let offset_index = metadata + .row_groups() + .iter() + .map(|rg| index_reader::read_pages_locations(reader, rg.columns())) + .collect::>>()?; + + metadata.set_offset_index(Some(offset_index)) + } + Self::try_new(Arc::new(metadata), options) + } + + pub(crate) fn try_new( + metadata: Arc, + options: ArrowReaderOptions, + ) -> Result { + let kv_metadata = match options.skip_arrow_metadata { + true => None, + false => metadata.file_metadata().key_value_metadata(), + }; + + let (schema, fields) = parquet_to_arrow_schema_and_fields( + metadata.file_metadata().schema_descr(), + ProjectionMask::all(), + kv_metadata, + )?; + + Ok(Self { + metadata, + schema: Arc::new(schema), + fields: fields.map(Arc::new), + }) + } + + /// Returns a reference to the [`ParquetMetaData`] for this parquet file + pub fn metadata(&self) -> &Arc { + &self.metadata + } + + /// Returns the parquet [`SchemaDescriptor`] for this parquet file + pub fn parquet_schema(&self) -> &SchemaDescriptor { + self.metadata.file_metadata().schema_descr() + } + + /// Returns the arrow [`SchemaRef`] for this parquet file + pub fn schema(&self) -> &SchemaRef { + &self.schema + } +} + #[doc(hidden)] /// A newtype used within [`ReaderOptionsBuilder`] to distinguish sync readers from async -pub struct SyncReader(SerializedFileReader); +pub struct SyncReader(T); /// A synchronous builder used to construct [`ParquetRecordBatchReader`] for a file /// /// For an async API see [`crate::arrow::async_reader::ParquetRecordBatchStreamBuilder`] pub type ParquetRecordBatchReaderBuilder = ArrowReaderBuilder>; -impl ArrowReaderBuilder> { +impl ParquetRecordBatchReaderBuilder { /// Create a new [`ParquetRecordBatchReaderBuilder`] + /// + /// ``` + /// # use std::sync::Arc; + /// # use bytes::Bytes; + /// # use arrow_array::{Int32Array, RecordBatch}; + /// # use arrow_schema::{DataType, Field, Schema}; + /// # use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder}; + /// # use parquet::arrow::ArrowWriter; + /// # let mut file: Vec = Vec::with_capacity(1024); + /// # let schema = Arc::new(Schema::new(vec![Field::new("i32", DataType::Int32, false)])); + /// # let mut writer = ArrowWriter::try_new(&mut file, schema.clone(), None).unwrap(); + /// # let batch = RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1, 2, 3]))]).unwrap(); + /// # writer.write(&batch).unwrap(); + /// # writer.close().unwrap(); + /// # let file = Bytes::from(file); + /// # + /// let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + /// + /// // Inspect metadata + /// assert_eq!(builder.metadata().num_row_groups(), 1); + /// + /// // Construct reader + /// let mut reader: ParquetRecordBatchReader = builder.with_row_groups(vec![0]).build().unwrap(); + /// + /// // Read data + /// let _batch = reader.next().unwrap().unwrap(); + /// ``` pub fn try_new(reader: T) -> Result { Self::try_new_with_options(reader, Default::default()) } /// Create a new [`ParquetRecordBatchReaderBuilder`] with [`ArrowReaderOptions`] pub fn try_new_with_options(reader: T, options: ArrowReaderOptions) -> Result { - let 
reader = match options.page_index { - true => { - let read_options = ReadOptionsBuilder::new().with_page_index().build(); - SerializedFileReader::new_with_options(reader, read_options)? - } - false => SerializedFileReader::new(reader)?, - }; + let metadata = ArrowReaderMetadata::load(&reader, options)?; + Ok(Self::new_with_metadata(reader, metadata)) + } - let metadata = Arc::clone(reader.metadata_ref()); - Self::new_builder(SyncReader(reader), metadata, options) + /// Create a [`ParquetRecordBatchReaderBuilder`] from the provided [`ArrowReaderMetadata`] + /// + /// This allows loading metadata once and using it to create multiple builders with + /// potentially different settings + /// + /// ``` + /// # use std::fs::metadata; + /// # use std::sync::Arc; + /// # use bytes::Bytes; + /// # use arrow_array::{Int32Array, RecordBatch}; + /// # use arrow_schema::{DataType, Field, Schema}; + /// # use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder}; + /// # use parquet::arrow::ArrowWriter; + /// # let mut file: Vec = Vec::with_capacity(1024); + /// # let schema = Arc::new(Schema::new(vec![Field::new("i32", DataType::Int32, false)])); + /// # let mut writer = ArrowWriter::try_new(&mut file, schema.clone(), None).unwrap(); + /// # let batch = RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1, 2, 3]))]).unwrap(); + /// # writer.write(&batch).unwrap(); + /// # writer.close().unwrap(); + /// # let file = Bytes::from(file); + /// # + /// let metadata = ArrowReaderMetadata::load(&file, Default::default()).unwrap(); + /// let mut a = ParquetRecordBatchReaderBuilder::new_with_metadata(file.clone(), metadata.clone()).build().unwrap(); + /// let mut b = ParquetRecordBatchReaderBuilder::new_with_metadata(file, metadata).build().unwrap(); + /// + /// // Should be able to read from both in parallel + /// assert_eq!(a.next().unwrap().unwrap(), b.next().unwrap().unwrap()); + /// ``` + pub fn new_with_metadata(input: T, metadata: ArrowReaderMetadata) -> Self { + Self::new_builder(SyncReader(input), metadata) } /// Build a [`ParquetRecordBatchReader`] /// /// Note: this will eagerly evaluate any `RowFilter` before returning pub fn build(self) -> Result { - let reader = FileReaderRowGroups::new(Arc::new(self.input.0), self.row_groups); - - let mut filter = self.filter; - let mut selection = self.selection; - // Try to avoid allocate large buffer let batch_size = self .batch_size .min(self.metadata.file_metadata().num_rows() as usize); + + let row_groups = self + .row_groups + .unwrap_or_else(|| (0..self.metadata.num_row_groups()).collect()); + + let reader = ReaderRowGroups { + reader: Arc::new(self.input.0), + metadata: self.metadata, + row_groups, + }; + + let mut filter = self.filter; + let mut selection = self.selection; + if let Some(filter) = filter.as_mut() { for predicate in filter.predicates.iter_mut() { if !selects_any(selection.as_ref()) { @@ -283,7 +406,7 @@ impl ArrowReaderBuilder> { } let array_reader = build_array_reader( - self.fields.as_ref(), + self.fields.as_deref(), predicate.projection(), &reader, )?; @@ -298,7 +421,7 @@ impl ArrowReaderBuilder> { } let array_reader = - build_array_reader(self.fields.as_ref(), &self.projection, &reader)?; + build_array_reader(self.fields.as_deref(), &self.projection, &reader)?; // If selection is empty, truncate if !selects_any(selection.as_ref()) { @@ -313,6 +436,59 @@ impl ArrowReaderBuilder> { } } +struct ReaderRowGroups { + reader: Arc, + + metadata: Arc, + /// Optional list of 
row group indices to scan + row_groups: Vec, +} + +impl RowGroups for ReaderRowGroups { + fn num_rows(&self) -> usize { + let meta = self.metadata.row_groups(); + self.row_groups + .iter() + .map(|x| meta[*x].num_rows() as usize) + .sum() + } + + fn column_chunks(&self, i: usize) -> Result> { + Ok(Box::new(ReaderPageIterator { + column_idx: i, + reader: self.reader.clone(), + metadata: self.metadata.clone(), + row_groups: self.row_groups.clone().into_iter(), + })) + } +} + +struct ReaderPageIterator { + reader: Arc, + column_idx: usize, + row_groups: std::vec::IntoIter, + metadata: Arc, +} + +impl Iterator for ReaderPageIterator { + type Item = Result>; + + fn next(&mut self) -> Option { + let rg_idx = self.row_groups.next()?; + let rg = self.metadata.row_group(rg_idx); + let meta = rg.column(self.column_idx); + let offset_index = self.metadata.offset_index(); + let page_locations = offset_index.map(|i| i[rg_idx][self.column_idx].clone()); + let total_rows = rg.num_rows() as usize; + let reader = self.reader.clone(); + + let ret = SerializedPageReader::new(reader, meta, total_rows, page_locations); + Some(ret.map(|x| Box::new(x) as _)) + } +} + +impl PageIterator for ReaderPageIterator {} + /// An `Iterator>` that yields [`RecordBatch`] /// read from a parquet data source pub struct ParquetRecordBatchReader { diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index f17fb0751d52..c7e0f64783f1 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -96,8 +96,9 @@ use arrow_schema::SchemaRef; use crate::arrow::array_reader::{build_array_reader, RowGroups}; use crate::arrow::arrow_reader::{ - apply_range, evaluate_predicate, selects_any, ArrowReaderBuilder, ArrowReaderOptions, - ParquetRecordBatchReader, RowFilter, RowSelection, + apply_range, evaluate_predicate, selects_any, ArrowReaderBuilder, + ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReader, RowFilter, + RowSelection, }; use crate::arrow::ProjectionMask; @@ -205,6 +206,29 @@ impl AsyncFileReader for T { } } +impl ArrowReaderMetadata { + /// Returns a new [`ArrowReaderMetadata`] for this builder + /// + /// See [`ParquetRecordBatchStreamBuilder::new_with_metadata`] for how this can be used + pub async fn load_async( + input: &mut T, + options: ArrowReaderOptions, + ) -> Result { + let mut metadata = input.get_metadata().await?; + + if options.page_index + && metadata.column_index().is_none() + && metadata.offset_index().is_none() + { + let m = Arc::try_unwrap(metadata).unwrap_or_else(|e| e.as_ref().clone()); + let mut loader = MetadataLoader::new(input, m); + loader.load_page_index(true, true).await?; + metadata = Arc::new(loader.finish()) + } + Self::try_new(metadata, options) + } +} + #[doc(hidden)] /// A newtype used within [`ReaderOptionsBuilder`] to distinguish sync readers from async /// @@ -217,33 +241,30 @@ pub struct AsyncReader(T); /// In particular, this handles reading the parquet file metadata, allowing consumers /// to use this information to select what specific columns, row groups, etc... 
/// they wish to be read by the resulting stream -/// pub type ParquetRecordBatchStreamBuilder = ArrowReaderBuilder>; -impl ArrowReaderBuilder> { +impl ParquetRecordBatchStreamBuilder { /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided parquet file - pub async fn new(mut input: T) -> Result { - let metadata = input.get_metadata().await?; - Self::new_builder(AsyncReader(input), metadata, Default::default()) + pub async fn new(input: T) -> Result { + Self::new_with_options(input, Default::default()).await } + /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided parquet file + /// and [`ArrowReaderOptions`] pub async fn new_with_options( mut input: T, options: ArrowReaderOptions, ) -> Result { - let mut metadata = input.get_metadata().await?; - - if options.page_index - && metadata.column_index().is_none() - && metadata.offset_index().is_none() - { - let m = Arc::try_unwrap(metadata).unwrap_or_else(|e| e.as_ref().clone()); - let mut loader = MetadataLoader::new(&mut input, m); - loader.load_page_index(true, true).await?; - metadata = Arc::new(loader.finish()) - } + let metadata = ArrowReaderMetadata::load_async(&mut input, options).await?; + Ok(Self::new_with_metadata(input, metadata)) + } - Self::new_builder(AsyncReader(input), metadata, options) + /// Create a [`ParquetRecordBatchStreamBuilder`] from the provided [`ArrowReaderMetadata`] + /// + /// This allows loading metadata once and using it to create multiple builders with + /// potentially different settings + pub fn new_with_metadata(input: T, metadata: ArrowReaderMetadata) -> Self { + Self::new_builder(AsyncReader(input), metadata) } /// Build a new [`ParquetRecordBatchStream`] @@ -297,7 +318,7 @@ type ReadResult = Result<(ReaderFactory, Option) struct ReaderFactory { metadata: Arc, - fields: Option, + fields: Option>, input: T, @@ -350,7 +371,7 @@ where .await?; let array_reader = build_array_reader( - self.fields.as_ref(), + self.fields.as_deref(), predicate_projection, &row_group, )?; @@ -403,7 +424,7 @@ where let reader = ParquetRecordBatchReader::new( batch_size, - build_array_reader(self.fields.as_ref(), &projection, &row_group)?, + build_array_reader(self.fields.as_deref(), &projection, &row_group)?, selection, ); @@ -1409,7 +1430,7 @@ mod tests { let reader_factory = ReaderFactory { metadata, - fields, + fields: fields.map(Arc::new), input: async_reader, filter: None, limit: None, diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index b2a2b3eee531..a5e2de6b0667 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -155,13 +155,13 @@ impl ParquetMetaData { } /// Override the column index - #[allow(dead_code)] + #[cfg(feature = "arrow")] pub(crate) fn set_column_index(&mut self, index: Option) { self.column_index = index; } /// Override the offset index - #[allow(dead_code)] + #[cfg(feature = "arrow")] pub(crate) fn set_offset_index(&mut self, index: Option) { self.offset_index = index; } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index f8716359e045..3dac8ee55886 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -242,11 +242,6 @@ impl SerializedFileReader { }) } } - - #[cfg(feature = "arrow")] - pub(crate) fn metadata_ref(&self) -> &Arc { - &self.metadata - } } /// Get midpoint offset for a row group From c6184389241a0c85823aa494e8b5d93343771666 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: 
Thu, 10 Aug 2023 18:58:03 +0100 Subject: [PATCH 1148/1411] Faster i256 Division (2-100x) (#4663) (#4672) * Faster i256 Division (2-100x) (#4663) * Clippy * Use inline assembly * Fix non-x64 * Add repr(C) * More docs * Format --- arrow-buffer/benches/i256.rs | 53 +-- arrow-buffer/src/bigint/div.rs | 312 ++++++++++++++++++ arrow-buffer/src/{bigint.rs => bigint/mod.rs} | 80 ++--- 3 files changed, 375 insertions(+), 70 deletions(-) create mode 100644 arrow-buffer/src/bigint/div.rs rename arrow-buffer/src/{bigint.rs => bigint/mod.rs} (94%) diff --git a/arrow-buffer/benches/i256.rs b/arrow-buffer/benches/i256.rs index 2c43e0e91070..ebb45e793bd0 100644 --- a/arrow-buffer/benches/i256.rs +++ b/arrow-buffer/benches/i256.rs @@ -21,18 +21,7 @@ use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use std::str::FromStr; -/// Returns fixed seedable RNG -fn seedable_rng() -> StdRng { - StdRng::seed_from_u64(42) -} - -fn create_i256_vec(size: usize) -> Vec { - let mut rng = seedable_rng(); - - (0..size) - .map(|_| i256::from_i128(rng.gen::())) - .collect() -} +const SIZE: usize = 1024; fn criterion_benchmark(c: &mut Criterion) { let numbers = vec![ @@ -54,24 +43,40 @@ fn criterion_benchmark(c: &mut Criterion) { }); } - c.bench_function("i256_div", |b| { + let mut rng = StdRng::seed_from_u64(42); + + let numerators: Vec<_> = (0..SIZE) + .map(|_| { + let high = rng.gen_range(1000..i128::MAX); + let low = rng.gen(); + i256::from_parts(low, high) + }) + .collect(); + + let divisors: Vec<_> = numerators + .iter() + .map(|n| { + let quotient = rng.gen_range(1..100_i32); + n.wrapping_div(i256::from(quotient)) + }) + .collect(); + + c.bench_function("i256_div_rem small quotient", |b| { b.iter(|| { - for number_a in create_i256_vec(10) { - for number_b in create_i256_vec(5) { - number_a.checked_div(number_b); - number_a.wrapping_div(number_b); - } + for (n, d) in numerators.iter().zip(&divisors) { + black_box(n.wrapping_div(*d)); } }); }); - c.bench_function("i256_rem", |b| { + let divisors: Vec<_> = (0..SIZE) + .map(|_| i256::from(rng.gen_range(1..100_i32))) + .collect(); + + c.bench_function("i256_div_rem small divisor", |b| { b.iter(|| { - for number_a in create_i256_vec(10) { - for number_b in create_i256_vec(5) { - number_a.checked_rem(number_b); - number_a.wrapping_rem(number_b); - } + for (n, d) in numerators.iter().zip(&divisors) { + black_box(n.wrapping_div(*d)); } }); }); diff --git a/arrow-buffer/src/bigint/div.rs b/arrow-buffer/src/bigint/div.rs new file mode 100644 index 000000000000..ba530ffcc6c8 --- /dev/null +++ b/arrow-buffer/src/bigint/div.rs @@ -0,0 +1,312 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! N-digit division +//! +//! Implementation heavily inspired by [uint] +//! +//! 
[uint]: https://github.com/paritytech/parity-common/blob/d3a9327124a66e52ca1114bb8640c02c18c134b8/uint/src/uint.rs#L844 + +/// Unsigned, little-endian, n-digit division with remainder +/// +/// # Panics +/// +/// Panics if divisor is zero +pub fn div_rem( + numerator: &[u64; N], + divisor: &[u64; N], +) -> ([u64; N], [u64; N]) { + let numerator_bits = bits(numerator); + let divisor_bits = bits(divisor); + assert_ne!(divisor_bits, 0, "division by zero"); + + if numerator_bits < divisor_bits { + return ([0; N], *numerator); + } + + if divisor_bits <= 64 { + return div_rem_small(numerator, divisor[0]); + } + + let numerator_words = (numerator_bits + 63) / 64; + let divisor_words = (divisor_bits + 63) / 64; + let n = divisor_words; + let m = numerator_words - divisor_words; + + div_rem_knuth(numerator, divisor, n, m) +} + +/// Return the least number of bits needed to represent the number +fn bits(arr: &[u64]) -> usize { + for (idx, v) in arr.iter().enumerate().rev() { + if *v > 0 { + return 64 - v.leading_zeros() as usize + 64 * idx; + } + } + 0 +} + +/// Division of numerator by a u64 divisor +fn div_rem_small( + numerator: &[u64; N], + divisor: u64, +) -> ([u64; N], [u64; N]) { + let mut rem = 0u64; + let mut numerator = *numerator; + numerator.iter_mut().rev().for_each(|d| { + let (q, r) = div_rem_word(rem, *d, divisor); + *d = q; + rem = r; + }); + + let mut rem_padded = [0; N]; + rem_padded[0] = rem; + (numerator, rem_padded) +} + +/// Use Knuth Algorithm D to compute `numerator / divisor` returning the +/// quotient and remainder +/// +/// `n` is the number of non-zero 64-bit words in `divisor` +/// `m` is the number of non-zero 64-bit words present in `numerator` beyond `divisor`, and +/// therefore the number of words in the quotient +/// +/// A good explanation of the algorithm can be found [here](https://ridiculousfish.com/blog/posts/labor-of-division-episode-iv.html) +fn div_rem_knuth( + numerator: &[u64; N], + divisor: &[u64; N], + n: usize, + m: usize, +) -> ([u64; N], [u64; N]) { + assert!(n + m <= N); + + // The algorithm works by incrementally generating guesses `q_hat`, for the next digit + // of the quotient, starting from the most significant digit. 
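    // As a concrete base-10 analogy for this guess-and-correct scheme: dividing 7392
    // by 86, the leading estimate 73 / 8 = 9 overshoots (9 * 86 = 774 > 739) and is
    // corrected to 8; the next estimate 51 / 8 = 6 overshoots (6 * 86 = 516 > 512)
    // and is corrected to 5, giving quotient 85 and remainder 82. The loop below does
    // the same correction in base 2^64, using the two most significant divisor digits
    // to refine `q_hat` before committing to it.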
+ // + // This relies on the property that for any `q_hat` where + // + // (q_hat << (j * 64)) * divisor <= numerator` + // + // We can set + // + // q += q_hat << (j * 64) + // numerator -= (q_hat << (j * 64)) * divisor + // + // And then iterate until `numerator < divisor` + + // We normalize the divisor so that the highest bit in the highest digit of the + // divisor is set, this ensures our initial guess of `q_hat` is at most 2 off from + // the correct value for q[j] + let shift = divisor[n - 1].leading_zeros(); + // As the shift is computed based on leading zeros, don't need to perform full_shl + let divisor = shl_word(divisor, shift); + // numerator may have fewer leading zeros than divisor, so must add another digit + let mut numerator = full_shl(numerator, shift); + + // The two most significant digits of the divisor + let b0 = divisor[n - 1]; + let b1 = divisor[n - 2]; + + let mut q = [0; N]; + + for j in (0..=m).rev() { + let a0 = numerator[j + n]; + let a1 = numerator[j + n - 1]; + + let mut q_hat = if a0 < b0 { + // The first estimate is [a1, a0] / b0, it may be too large by at most 2 + let (mut q_hat, mut r_hat) = div_rem_word(a0, a1, b0); + + // r_hat = [a1, a0] - q_hat * b0 + // + // Now we want to compute a more precise estimate [a2,a1,a0] / [b1,b0] + // which can only be less or equal to the current q_hat + // + // q_hat is too large if: + // [a2,a1,a0] < q_hat * [b1,b0] + // [a2,r_hat] < q_hat * b1 + let a2 = numerator[j + n - 2]; + loop { + let r = u128::from(q_hat) * u128::from(b1); + let (lo, hi) = (r as u64, (r >> 64) as u64); + if (hi, lo) <= (r_hat, a2) { + break; + } + + q_hat -= 1; + let (new_r_hat, overflow) = r_hat.overflowing_add(b0); + r_hat = new_r_hat; + + if overflow { + break; + } + } + q_hat + } else { + u64::MAX + }; + + // q_hat is now either the correct quotient digit, or in rare cases 1 too large + + // Compute numerator -= (q_hat * divisor) << (j * 64) + let q_hat_v = full_mul_u64(&divisor, q_hat); + let c = sub_assign(&mut numerator[j..], &q_hat_v[..n + 1]); + + // If underflow, q_hat was too large by 1 + if c { + // Reduce q_hat by 1 + q_hat -= 1; + + // Add back one multiple of divisor + let c = add_assign(&mut numerator[j..], &divisor[..n]); + numerator[j + n] = numerator[j + n].wrapping_add(u64::from(c)); + } + + // q_hat is the correct value for q[j] + q[j] = q_hat; + } + + // The remainder is what is left in numerator, with the initial normalization shl reversed + let remainder = full_shr(&numerator, shift); + (q, remainder) +} + +/// Perform narrowing division of a u128 by a u64 divisor, returning the quotient and remainder +/// +/// This method may trap or panic if hi >= divisor, i.e. 
the quotient would not fit +/// into a 64-bit integer +fn div_rem_word(hi: u64, lo: u64, divisor: u64) -> (u64, u64) { + debug_assert!(hi < divisor); + debug_assert_ne!(divisor, 0); + + // LLVM fails to use the div instruction as it is not able to prove + // that hi < divisor, and therefore the result will fit into 64-bits + #[cfg(target_arch = "x86_64")] + unsafe { + let mut quot = lo; + let mut rem = hi; + std::arch::asm!( + "div {divisor}", + divisor = in(reg) divisor, + inout("rax") quot, + inout("rdx") rem, + options(pure, nomem, nostack) + ); + (quot, rem) + } + #[cfg(not(target_arch = "x86_64"))] + { + let x = (u128::from(hi) << 64) + u128::from(lo); + let y = u128::from(divisor); + ((x / y) as u64, (x % y) as u64) + } +} + +/// Perform `a += b` +fn add_assign(a: &mut [u64], b: &[u64]) -> bool { + binop_slice(a, b, u64::overflowing_add) +} + +/// Perform `a -= b` +fn sub_assign(a: &mut [u64], b: &[u64]) -> bool { + binop_slice(a, b, u64::overflowing_sub) +} + +/// Converts an overflowing binary operation on scalars to one on slices +fn binop_slice( + a: &mut [u64], + b: &[u64], + binop: impl Fn(u64, u64) -> (u64, bool) + Copy, +) -> bool { + let mut c = false; + a.iter_mut().zip(b.iter()).for_each(|(x, y)| { + let (res1, overflow1) = y.overflowing_add(u64::from(c)); + let (res2, overflow2) = binop(*x, res1); + *x = res2; + c = overflow1 || overflow2; + }); + c +} + +/// Widening multiplication of an N-digit array with a u64 +fn full_mul_u64(a: &[u64; N], b: u64) -> ArrayPlusOne { + let mut carry = 0; + let mut out = [0; N]; + out.iter_mut().zip(a).for_each(|(o, v)| { + let r = *v as u128 * b as u128 + carry as u128; + *o = r as u64; + carry = (r >> 64) as u64; + }); + ArrayPlusOne(out, carry) +} + +/// Left shift of an N-digit array by at most 63 bits +fn shl_word(v: &[u64; N], shift: u32) -> [u64; N] { + full_shl(v, shift).0 +} + +/// Widening left shift of an N-digit array by at most 63 bits +fn full_shl(v: &[u64; N], shift: u32) -> ArrayPlusOne { + debug_assert!(shift < 64); + if shift == 0 { + return ArrayPlusOne(*v, 0); + } + let mut out = [0u64; N]; + out[0] = v[0] << shift; + for i in 1..N { + out[i] = v[i - 1] >> (64 - shift) | v[i] << shift + } + let carry = v[N - 1] >> (64 - shift); + ArrayPlusOne(out, carry) +} + +/// Narrowing right shift of an (N+1)-digit array by at most 63 bits +fn full_shr(a: &ArrayPlusOne, shift: u32) -> [u64; N] { + debug_assert!(shift < 64); + if shift == 0 { + return a.0; + } + let mut out = [0; N]; + for i in 0..N - 1 { + out[i] = a[i] >> shift | a[i + 1] << (64 - shift) + } + out[N - 1] = a[N - 1] >> shift; + out +} + +/// An array of N + 1 elements +/// +/// This is a hack around lack of support for const arithmetic +#[repr(C)] +struct ArrayPlusOne([T; N], T); + +impl std::ops::Deref for ArrayPlusOne { + type Target = [T]; + + #[inline] + fn deref(&self) -> &Self::Target { + let x = self as *const Self; + unsafe { std::slice::from_raw_parts(x as *const T, N + 1) } + } +} + +impl std::ops::DerefMut for ArrayPlusOne { + fn deref_mut(&mut self) -> &mut Self::Target { + let x = self as *mut Self; + unsafe { std::slice::from_raw_parts_mut(x as *mut T, N + 1) } + } +} diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint/mod.rs similarity index 94% rename from arrow-buffer/src/bigint.rs rename to arrow-buffer/src/bigint/mod.rs index 86150e67fd91..fe0774539989 100644 --- a/arrow-buffer/src/bigint.rs +++ b/arrow-buffer/src/bigint/mod.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
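// A sketch of how the signed division below reduces to the unsigned `div_rem` added
// in div.rs, using an assumed small example with truncated (Rust-style) semantics:
//
//   (-7) / 2: the magnitudes 7 and 2 are split into little-endian u64 digits,
//   `div_rem` returns quotient 3 and remainder 1, and the signs are reattached
//   afterwards: the quotient becomes -3 (operand signs differ) and the remainder
//   becomes -1 (it takes the sign of the dividend).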
+use crate::bigint::div::div_rem; use num::cast::AsPrimitive; use num::{BigInt, FromPrimitive, ToPrimitive}; use std::cmp::Ordering; @@ -22,6 +23,8 @@ use std::num::ParseIntError; use std::ops::{BitAnd, BitOr, BitXor, Neg, Shl, Shr}; use std::str::FromStr; +mod div; + /// An opaque error similar to [`std::num::ParseIntError`] #[derive(Debug)] pub struct ParseI256Error {} @@ -428,25 +431,6 @@ impl i256 { .then_some(Self { low, high }) } - /// Return the least number of bits needed to represent the number - #[inline] - fn bits_required(&self) -> usize { - let le_bytes = self.to_le_bytes(); - let arr: [u128; 2] = [ - u128::from_le_bytes(le_bytes[0..16].try_into().unwrap()), - u128::from_le_bytes(le_bytes[16..32].try_into().unwrap()), - ]; - - let iter = arr.iter().rev().take(2 - 1); - if self.is_negative() { - let ctr = iter.take_while(|&&b| b == ::core::u128::MAX).count(); - (128 * (2 - ctr)) + 1 - (!arr[2 - ctr - 1]).leading_zeros() as usize - } else { - let ctr = iter.take_while(|&&b| b == ::core::u128::MIN).count(); - (128 * (2 - ctr)) + 1 - arr[2 - ctr - 1].leading_zeros() as usize - } - } - /// Division operation, returns (quotient, remainder). /// This basically implements [Long division]: `` #[inline] @@ -458,41 +442,45 @@ impl i256 { return Err(DivRemError::DivideOverflow); } - if self == Self::MIN || other == Self::MIN { - let l = BigInt::from_signed_bytes_le(&self.to_le_bytes()); - let r = BigInt::from_signed_bytes_le(&other.to_le_bytes()); - let d = i256::from_bigint_with_overflow(&l / &r).0; - let r = i256::from_bigint_with_overflow(&l % &r).0; - return Ok((d, r)); - } - - let mut me = self.checked_abs().unwrap(); - let mut you = other.checked_abs().unwrap(); - let mut ret = [0u128; 2]; - if me < you { - return Ok((Self::from_parts(ret[0], ret[1] as i128), self)); - } + let a = self.wrapping_abs(); + let b = other.wrapping_abs(); - let shift = me.bits_required() - you.bits_required(); - you = you.shl(shift as u8); - for i in (0..=shift).rev() { - if me >= you { - ret[i / 128] |= 1 << (i % 128); - me = me.checked_sub(you).unwrap(); - } - you = you.shr(1); - } + let (div, rem) = div_rem(&a.as_digits(), &b.as_digits()); + let div = Self::from_digits(div); + let rem = Self::from_digits(rem); Ok(( if self.is_negative() == other.is_negative() { - Self::from_parts(ret[0], ret[1] as i128) + div + } else { + div.wrapping_neg() + }, + if self.is_negative() { + rem.wrapping_neg() } else { - -Self::from_parts(ret[0], ret[1] as i128) + rem }, - if self.is_negative() { -me } else { me }, )) } + /// Interpret this [`i256`] as 4 `u64` digits, least significant first + fn as_digits(self) -> [u64; 4] { + [ + self.low as u64, + (self.low >> 64) as u64, + self.high as u64, + (self.high as u128 >> 64) as u64, + ] + } + + /// Interpret 4 `u64` digits, least significant first, as a [`i256`] + fn from_digits(digits: [u64; 4]) -> Self { + Self::from_parts( + digits[0] as u128 | (digits[1] as u128) << 64, + digits[2] as i128 | (digits[3] as i128) << 64, + ) + } + /// Performs wrapping division #[inline] pub fn wrapping_div(self, other: Self) -> Self { @@ -969,7 +957,7 @@ mod tests { let expected = bl.clone() % br.clone(); let checked = il.checked_rem(ir); - assert_eq!(actual.to_string(), expected.to_string()); + assert_eq!(actual.to_string(), expected.to_string(), "{il} % {ir}"); if ir == i256::MINUS_ONE && il == i256::MIN { assert!(checked.is_none()); From 4200bedd618321e80db26bbe767efc69e491a68d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 11 Aug 2023 08:03:34 -0400 Subject: [PATCH 1149/1411] 
Minor: improve object_store docs.rs library landing page (#4682) --- object_store/src/lib.rs | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 6c70326d2b7b..bb4ba5c7d9a6 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -28,10 +28,32 @@ //! # object_store //! -//! This crate provides a uniform API for interacting with object storage services and -//! local files via the the [`ObjectStore`] trait. +//! This crate provides a uniform API for interacting with object +//! storage services and local files via the [`ObjectStore`] +//! trait. //! -//! # Create an [`ObjectStore`] implementation: +//! Using this crate, the same binary and code can run in multiple +//! clouds and local test environments, via a simple runtime +//! configuration change. +//! +//! # Features: +//! +//! 1. A focused, easy to use, idiomatic, well documented, high +//! performance, `async` API. +//! +//! 2. Production quality, leading this crate to be used in large +//! scale production systems, such as [crates.io] and [InfluxDB IOx]. +//! +//! 3. Stable and predictable governance via the [Apache Arrow] project. +//! +//! Originally developed for [InfluxDB IOx] and subsequently donated +//! to [Apache Arrow]. +//! +//! [Apache Arrow]: https://arrow.apache.org/ +//! [InfluxDB IOx]: https://github.com/influxdata/influxdb_iox/ +//! [crates.io]: https://github.com/rust-lang/crates.io +//! +//! # Example: Create an [`ObjectStore`] implementation: //! #![cfg_attr( feature = "gcp", From f9f6eeae55029dee8a7976317e56fb80211ef349 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 11 Aug 2023 13:35:10 +0100 Subject: [PATCH 1150/1411] Check object_store format in CI (#4679) * Check object_store format in CI * Format --- .github/workflows/rust.yml | 5 ++++- object_store/src/local.rs | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index c1c7f4d90fd4..f198f48dfec5 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -98,7 +98,10 @@ jobs: uses: ./.github/actions/setup-builder - name: Setup rustfmt run: rustup component add rustfmt - - name: Run + - name: Format arrow + run: cargo fmt --all -- --check + - name: Format object_store + working-directory: object_store run: cargo fmt --all -- --check msrv: diff --git a/object_store/src/local.rs b/object_store/src/local.rs index ffff6a5739d5..a0933cc6177d 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -863,7 +863,11 @@ impl AsyncWrite for LocalUpload { } } -pub(crate) fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { +pub(crate) fn read_range( + file: &mut File, + path: &PathBuf, + range: Range, +) -> Result { let to_read = range.end - range.start; file.seek(SeekFrom::Start(range.start as u64)) .context(SeekSnafu { path })?; From df28eafb08da436286d08f6170c1a24db3b11274 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 11 Aug 2023 10:53:11 -0400 Subject: [PATCH 1151/1411] Fix object_store docs and Add CI job (#4684) * Add CI job for object_store_docs * fix job * fix again * fix * Fix doc links * Add comment about why a different workflow is needed * Fix AmazonS3 link --- .github/workflows/object_store.yml | 20 +++++++++++++++++++- object_store/src/aws/copy.rs | 8 +++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 7858da1e2d2d..01e14022e122 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -65,6 +65,24 @@ jobs: - name: Run clippy with all features and all targets run: cargo clippy --all-features --all-targets -- -D warnings + # test doc links still work + # + # Note that since object_store is not part of the main workspace, + # this needs a separate docs job as it is not covered by + # `cargo doc --workspace` + docs: + name: Rustdocs + runs-on: ubuntu-latest + defaults: + run: + working-directory: object_store + env: + RUSTDOCFLAGS: "-Dwarnings" + steps: + - uses: actions/checkout@v3 + - name: Run cargo doc + run: cargo doc --document-private-items --no-deps --workspace --all-features + # test the crate # This runs outside a container to workaround lack of support for passing arguments # to service containers - https://github.com/orgs/community/discussions/26688 @@ -152,4 +170,4 @@ jobs: - name: Build wasm32-unknown-unknown run: cargo build --target wasm32-unknown-unknown - name: Build wasm32-wasi - run: cargo build --target wasm32-wasi \ No newline at end of file + run: cargo build --target wasm32-wasi diff --git a/object_store/src/aws/copy.rs b/object_store/src/aws/copy.rs index 6b96f992cec5..da4e2809be1a 100644 --- a/object_store/src/aws/copy.rs +++ b/object_store/src/aws/copy.rs @@ -17,7 +17,11 @@ use crate::config::Parse; -/// Configure how to provide [`ObjectStore::copy_if_not_exists`] for [`AmazonS3`] +/// Configure how to provide [`ObjectStore::copy_if_not_exists`] for +/// [`AmazonS3`]. +/// +/// [`ObjectStore::copy_if_not_exists`]: crate::ObjectStore::copy_if_not_exists +/// [`AmazonS3`]: super::AmazonS3 #[derive(Debug, Clone)] #[non_exhaustive] pub enum S3CopyIfNotExists { @@ -32,6 +36,8 @@ pub enum S3CopyIfNotExists { /// /// For example `header: cf-copy-destination-if-none-match: *`, would set /// the header `cf-copy-destination-if-none-match` to `*` + /// + /// [`ObjectStore::copy_if_not_exists`]: crate::ObjectStore::copy_if_not_exists Header(String, String), } From 65c24d6cee94b1e08647e096a53fbf5237f91002 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 11 Aug 2023 16:19:14 +0100 Subject: [PATCH 1152/1411] Faster stream_get test (#4685) --- object_store/src/lib.rs | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index bb4ba5c7d9a6..cf7e47998aa2 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -959,8 +959,7 @@ mod test_util { mod tests { use super::*; use crate::test_util::flatten_list_stream; - use bytes::{BufMut, BytesMut}; - use itertools::Itertools; + use rand::{thread_rng, Rng}; use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { @@ -1380,27 +1379,27 @@ mod tests { } } - fn get_random_bytes(len: usize) -> Bytes { - use rand::Rng; - let mut rng = rand::thread_rng(); - let mut bytes = BytesMut::with_capacity(len); - for _ in 0..len { - bytes.put_u8(rng.gen()); + /// Returns a chunk of length `chunk_length` + fn get_chunk(chunk_length: usize) -> Bytes { + let mut data = vec![0_u8; chunk_length]; + let mut rng = thread_rng(); + // Set a random selection of bytes + for _ in 0..1000 { + data[rng.gen_range(0..chunk_length)] = rng.gen(); } - bytes.freeze() + data.into() } - fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec { - 
std::iter::repeat(get_random_bytes(chunk_length)) - .take(num_chunks) - .collect() + /// Returns `num_chunks` of length `chunks` + fn get_chunks(chunk_length: usize, num_chunks: usize) -> Vec { + (0..num_chunks).map(|_| get_chunk(chunk_length)).collect() } pub(crate) async fn stream_get(storage: &DynObjectStore) { let location = Path::from("test_dir/test_upload_file.txt"); // Can write to storage - let data = get_vec_of_bytes(5_000, 10); + let data = get_chunks(5_000, 10); let bytes_expected = data.concat(); let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); for chunk in &data { @@ -1427,7 +1426,7 @@ mod tests { // Can overwrite some storage // Sizes chosen to ensure we write three parts - let data = (0..7).map(|_| get_random_bytes(3_200_000)).collect_vec(); + let data = get_chunks(3_200_000, 7); let bytes_expected = data.concat(); let (_, mut writer) = storage.put_multipart(&location).await.unwrap(); for chunk in &data { From 230612eedc77a8b707206767cb1b36ed7ad82f76 Mon Sep 17 00:00:00 2001 From: vmuddassir-msft <140655500+vmuddassir-msft@users.noreply.github.com> Date: Fri, 11 Aug 2023 20:50:16 +0530 Subject: [PATCH 1153/1411] Add Support for Microsoft Fabric / OneLake (#4573) * Changes required for onelake-fix * Fix Unit tests * Add Unit Tests * Add onelake read/write test * Add with_use_fabric , for fabric url check * Final tweaks * Further tweaks * Automatically set use_fabric_endpoint --------- Co-authored-by: Raphael Taylor-Davies --- object_store/src/azure/mod.rs | 101 +++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 019cde581354..6bb4cdad1bb0 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -341,6 +341,10 @@ pub struct MicrosoftAzureBuilder { client_options: ClientOptions, /// Credentials credentials: Option, + /// When set to true, fabric url scheme will be used + /// + /// i.e. 
https://{account_name}.dfs.fabric.microsoft.com + use_fabric_endpoint: ConfigValue, } /// Configuration keys for [`MicrosoftAzureBuilder`] @@ -430,6 +434,13 @@ pub enum AzureConfigKey { /// - `use_emulator` UseEmulator, + /// Use object store with url scheme account.dfs.fabric.microsoft.com + /// + /// Supported keys: + /// - `azure_use_fabric_endpoint` + /// - `use_fabric_endpoint` + UseFabricEndpoint, + /// Endpoint to request a imds managed identity token /// /// Supported keys: @@ -482,6 +493,7 @@ impl AsRef for AzureConfigKey { Self::SasKey => "azure_storage_sas_key", Self::Token => "azure_storage_token", Self::UseEmulator => "azure_storage_use_emulator", + Self::UseFabricEndpoint => "azure_use_fabric_endpoint", Self::MsiEndpoint => "azure_msi_endpoint", Self::ObjectId => "azure_object_id", Self::MsiResourceId => "azure_msi_resource_id", @@ -531,6 +543,9 @@ impl FromStr for AzureConfigKey { "azure_federated_token_file" | "federated_token_file" => { Ok(Self::FederatedTokenFile) } + "azure_use_fabric_endpoint" | "use_fabric_endpoint" => { + Ok(Self::UseFabricEndpoint) + } "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), // Backwards compatibility "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), @@ -600,11 +615,16 @@ impl MicrosoftAzureBuilder { /// /// - `abfs[s]:///` (according to [fsspec](https://github.com/fsspec/adlfs)) /// - `abfs[s]://@.dfs.core.windows.net/` + /// - `abfs[s]://@.dfs.fabric.microsoft.com/` /// - `az:///` (according to [fsspec](https://github.com/fsspec/adlfs)) /// - `adl:///` (according to [fsspec](https://github.com/fsspec/adlfs)) /// - `azure:///` (custom) /// - `https://.dfs.core.windows.net` /// - `https://.blob.core.windows.net` + /// - `https://.dfs.fabric.microsoft.com` + /// - `https://.dfs.fabric.microsoft.com/` + /// - `https://.blob.fabric.microsoft.com` + /// - `https://.blob.fabric.microsoft.com/` /// /// Note: Settings derived from the URL will override any others set on this builder /// @@ -639,6 +659,7 @@ impl MicrosoftAzureBuilder { } AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), AzureConfigKey::UseEmulator => self.use_emulator.parse(value), + AzureConfigKey::UseFabricEndpoint => self.use_fabric_endpoint.parse(value), AzureConfigKey::Client(key) => { self.client_options = self.client_options.with_config(key, value) } @@ -692,6 +713,9 @@ impl MicrosoftAzureBuilder { AzureConfigKey::SasKey => self.sas_key.clone(), AzureConfigKey::Token => self.bearer_token.clone(), AzureConfigKey::UseEmulator => Some(self.use_emulator.to_string()), + AzureConfigKey::UseFabricEndpoint => { + Some(self.use_fabric_endpoint.to_string()) + } AzureConfigKey::MsiEndpoint => self.msi_endpoint.clone(), AzureConfigKey::ObjectId => self.object_id.clone(), AzureConfigKey::MsiResourceId => self.msi_resource_id.clone(), @@ -724,6 +748,10 @@ impl MicrosoftAzureBuilder { } else if let Some(a) = host.strip_suffix(".dfs.core.windows.net") { self.container_name = Some(validate(parsed.username())?); self.account_name = Some(validate(a)?); + } else if let Some(a) = host.strip_suffix(".dfs.fabric.microsoft.com") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + self.use_fabric_endpoint = true.into(); } else { return Err(UrlNotRecognisedSnafu { url }.build().into()); } @@ -733,6 +761,21 @@ impl MicrosoftAzureBuilder { | Some((a, "blob.core.windows.net")) => { self.account_name = Some(validate(a)?); } + Some((a, "dfs.fabric.microsoft.com")) + | Some((a, 
"blob.fabric.microsoft.com")) => { + self.account_name = Some(validate(a)?); + // Attempt to infer the container name from the URL + // - https://onelake.dfs.fabric.microsoft.com///Files/test.csv + // - https://onelake.dfs.fabric.microsoft.com//.// + // + // See + if let Some(workspace) = parsed.path_segments().unwrap().next() { + if !workspace.is_empty() { + self.container_name = Some(workspace.to_string()) + } + } + self.use_fabric_endpoint = true.into(); + } _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), }, scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), @@ -819,6 +862,14 @@ impl MicrosoftAzureBuilder { self } + /// Set if Microsoft Fabric url scheme should be used (defaults to false) + /// When disabled the url scheme used is `https://{account}.blob.core.windows.net` + /// When enabled the url scheme used is `https://{account}.dfs.fabric.microsoft.com` + pub fn with_use_fabric_endpoint(mut self, use_fabric_endpoint: bool) -> Self { + self.use_fabric_endpoint = use_fabric_endpoint.into(); + self + } + /// Sets what protocol is allowed. If `allow_http` is : /// * false (default): Only HTTPS are allowed /// * true: HTTP and HTTPS are allowed @@ -885,6 +936,7 @@ impl MicrosoftAzureBuilder { } let container = self.container_name.ok_or(Error::MissingContainerName {})?; + let static_creds = |credential: AzureCredential| -> AzureCredentialProvider { Arc::new(StaticCredentialProvider::new(credential)) }; @@ -906,7 +958,11 @@ impl MicrosoftAzureBuilder { (true, url, credential, account_name) } else { let account_name = self.account_name.ok_or(Error::MissingAccount {})?; - let account_url = format!("https://{}.blob.core.windows.net", &account_name); + let account_url = match self.use_fabric_endpoint.get()? { + true => format!("https://{}.blob.fabric.microsoft.com", &account_name), + false => format!("https://{}.blob.core.windows.net", &account_name), + }; + let url = Url::parse(&account_url) .context(UnableToParseUrlSnafu { url: account_url })?; @@ -1049,6 +1105,15 @@ mod tests { .unwrap(); assert_eq!(builder.account_name, Some("account".to_string())); assert_eq!(builder.container_name, Some("file_system".to_string())); + assert!(!builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("abfss://file_system@account.dfs.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, Some("file_system".to_string())); + assert!(builder.use_fabric_endpoint.get().unwrap()); let mut builder = MicrosoftAzureBuilder::new(); builder.parse_url("abfs://container/path").unwrap(); @@ -1067,12 +1132,46 @@ mod tests { .parse_url("https://account.dfs.core.windows.net/") .unwrap(); assert_eq!(builder.account_name, Some("account".to_string())); + assert!(!builder.use_fabric_endpoint.get().unwrap()); let mut builder = MicrosoftAzureBuilder::new(); builder .parse_url("https://account.blob.core.windows.net/") .unwrap(); assert_eq!(builder.account_name, Some("account".to_string())); + assert!(!builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.dfs.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, None); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + 
.parse_url("https://account.dfs.fabric.microsoft.com/container") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("container")); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.blob.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, None); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.blob.fabric.microsoft.com/container") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("container")); + assert!(builder.use_fabric_endpoint.get().unwrap()); let err_cases = [ "mailto://account.blob.core.windows.net/", From ec273e76db12106db0a886529d9018763c11dc9f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 13 Aug 2023 21:38:49 +0100 Subject: [PATCH 1154/1411] Cleanup DynComparator (#2654) (#4687) --- arrow-ord/src/ord.rs | 392 +++++++++++++++---------------------------- 1 file changed, 134 insertions(+), 258 deletions(-) diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index a33ead8ab041..4d6e3bde9152 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -21,114 +21,59 @@ use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::ArrowNativeType; -use arrow_schema::{ArrowError, DataType}; +use arrow_schema::ArrowError; use std::cmp::Ordering; /// Compare the values at two arbitrary indices in two arrays. pub type DynComparator = Box Ordering + Send + Sync>; -fn compare_primitives( +fn compare_primitive( left: &dyn Array, right: &dyn Array, ) -> DynComparator where T::Native: ArrowNativeTypeOp, { - let left: PrimitiveArray = PrimitiveArray::from(left.to_data()); - let right: PrimitiveArray = PrimitiveArray::from(right.to_data()); + let left = left.as_primitive::().clone(); + let right = right.as_primitive::().clone(); Box::new(move |i, j| left.value(i).compare(right.value(j))) } fn compare_boolean(left: &dyn Array, right: &dyn Array) -> DynComparator { - let left: BooleanArray = BooleanArray::from(left.to_data()); - let right: BooleanArray = BooleanArray::from(right.to_data()); + let left: BooleanArray = left.as_boolean().clone(); + let right: BooleanArray = right.as_boolean().clone(); Box::new(move |i, j| left.value(i).cmp(&right.value(j))) } -fn compare_string(left: &dyn Array, right: &dyn Array) -> DynComparator { - let left: StringArray = StringArray::from(left.to_data()); - let right: StringArray = StringArray::from(right.to_data()); +fn compare_bytes(left: &dyn Array, right: &dyn Array) -> DynComparator { + let left = left.as_bytes::().clone(); + let right = right.as_bytes::().clone(); - Box::new(move |i, j| left.value(i).cmp(right.value(j))) -} - -fn compare_dict_primitive(left: &dyn Array, right: &dyn Array) -> DynComparator -where - K: ArrowDictionaryKeyType, - V: ArrowPrimitiveType, - V::Native: ArrowNativeTypeOp, -{ - let left = left.as_dictionary::(); - let right = right.as_dictionary::(); - - let left_keys: PrimitiveArray = PrimitiveArray::from(left.keys().to_data()); - let right_keys: PrimitiveArray = PrimitiveArray::from(right.keys().to_data()); - let left_values: PrimitiveArray = left.values().to_data().into(); - let right_values: 
PrimitiveArray = right.values().to_data().into(); - - Box::new(move |i: usize, j: usize| { - let key_left = left_keys.value(i).as_usize(); - let key_right = right_keys.value(j).as_usize(); - let left = left_values.value(key_left); - let right = right_values.value(key_right); - left.compare(right) - }) -} - -fn compare_dict_string(left: &dyn Array, right: &dyn Array) -> DynComparator -where - T: ArrowDictionaryKeyType, -{ - let left = left.as_dictionary::(); - let right = right.as_dictionary::(); - - let left_keys: PrimitiveArray = PrimitiveArray::from(left.keys().to_data()); - let right_keys: PrimitiveArray = PrimitiveArray::from(right.keys().to_data()); - let left_values = StringArray::from(left.values().to_data()); - let right_values = StringArray::from(right.values().to_data()); - - Box::new(move |i: usize, j: usize| { - let key_left = left_keys.value(i).as_usize(); - let key_right = right_keys.value(j).as_usize(); - let left = left_values.value(key_left); - let right = right_values.value(key_right); - left.cmp(right) + Box::new(move |i, j| { + let l: &[u8] = left.value(i).as_ref(); + let r: &[u8] = right.value(j).as_ref(); + l.cmp(r) }) } -fn cmp_dict_primitive( - key_type: &DataType, +fn compare_dict( left: &dyn Array, right: &dyn Array, -) -> Result -where - VT: ArrowPrimitiveType, - VT::Native: ArrowNativeTypeOp, -{ - use DataType::*; - - Ok(match key_type { - UInt8 => compare_dict_primitive::(left, right), - UInt16 => compare_dict_primitive::(left, right), - UInt32 => compare_dict_primitive::(left, right), - UInt64 => compare_dict_primitive::(left, right), - Int8 => compare_dict_primitive::(left, right), - Int16 => compare_dict_primitive::(left, right), - Int32 => compare_dict_primitive::(left, right), - Int64 => compare_dict_primitive::(left, right), - t => { - return Err(ArrowError::InvalidArgumentError(format!( - "Dictionaries do not support keys of type {t:?}" - ))); - } - }) -} +) -> Result { + let left = left.as_dictionary::(); + let right = right.as_dictionary::(); + + let cmp = build_compare(left.values().as_ref(), right.values().as_ref())?; + let left_keys = left.keys().clone(); + let right_keys = right.keys().clone(); -macro_rules! cmp_dict_primitive_helper { - ($t:ty, $key_type_lhs:expr, $left:expr, $right:expr) => { - cmp_dict_primitive::<$t>($key_type_lhs, $left, $right)? - }; + // TODO: Handle value nulls (#2687) + Ok(Box::new(move |i, j| { + let l = left_keys.value(i).as_usize(); + let r = right_keys.value(j).as_usize(); + cmp(l, r) + })) } /// returns a comparison function that compares two values at two different positions @@ -145,7 +90,7 @@ macro_rules! cmp_dict_primitive_helper { /// let cmp = build_compare(&array1, &array2).unwrap(); /// /// // 1 (index 0 of array1) is smaller than 4 (index 1 of array2) -/// assert_eq!(std::cmp::Ordering::Less, (cmp)(0, 1)); +/// assert_eq!(std::cmp::Ordering::Less, cmp(0, 1)); /// ``` // This is a factory of comparisons. // The lifetime 'a enforces that we cannot use the closure beyond any of the array's lifetime. 
@@ -153,134 +98,47 @@ pub fn build_compare( left: &dyn Array, right: &dyn Array, ) -> Result { - use arrow_schema::{DataType::*, IntervalUnit::*, TimeUnit::*}; - Ok(match (left.data_type(), right.data_type()) { - (a, b) if a != b => { - return Err(ArrowError::InvalidArgumentError( - "Can't compare arrays of different types".to_string(), - )); - } - (Boolean, Boolean) => compare_boolean(left, right), - (UInt8, UInt8) => compare_primitives::(left, right), - (UInt16, UInt16) => compare_primitives::(left, right), - (UInt32, UInt32) => compare_primitives::(left, right), - (UInt64, UInt64) => compare_primitives::(left, right), - (Int8, Int8) => compare_primitives::(left, right), - (Int16, Int16) => compare_primitives::(left, right), - (Int32, Int32) => compare_primitives::(left, right), - (Int64, Int64) => compare_primitives::(left, right), - (Float16, Float16) => compare_primitives::(left, right), - (Float32, Float32) => compare_primitives::(left, right), - (Float64, Float64) => compare_primitives::(left, right), - (Decimal128(_, _), Decimal128(_, _)) => { - compare_primitives::(left, right) - } - (Decimal256(_, _), Decimal256(_, _)) => { - compare_primitives::(left, right) - } - (Date32, Date32) => compare_primitives::(left, right), - (Date64, Date64) => compare_primitives::(left, right), - (Time32(Second), Time32(Second)) => { - compare_primitives::(left, right) - } - (Time32(Millisecond), Time32(Millisecond)) => { - compare_primitives::(left, right) - } - (Time64(Microsecond), Time64(Microsecond)) => { - compare_primitives::(left, right) - } - (Time64(Nanosecond), Time64(Nanosecond)) => { - compare_primitives::(left, right) - } - (Timestamp(Second, _), Timestamp(Second, _)) => { - compare_primitives::(left, right) - } - (Timestamp(Millisecond, _), Timestamp(Millisecond, _)) => { - compare_primitives::(left, right) - } - (Timestamp(Microsecond, _), Timestamp(Microsecond, _)) => { - compare_primitives::(left, right) - } - (Timestamp(Nanosecond, _), Timestamp(Nanosecond, _)) => { - compare_primitives::(left, right) - } - (Interval(YearMonth), Interval(YearMonth)) => { - compare_primitives::(left, right) - } - (Interval(DayTime), Interval(DayTime)) => { - compare_primitives::(left, right) - } - (Interval(MonthDayNano), Interval(MonthDayNano)) => { - compare_primitives::(left, right) - } - (Duration(Second), Duration(Second)) => { - compare_primitives::(left, right) - } - (Duration(Millisecond), Duration(Millisecond)) => { - compare_primitives::(left, right) - } - (Duration(Microsecond), Duration(Microsecond)) => { - compare_primitives::(left, right) - } - (Duration(Nanosecond), Duration(Nanosecond)) => { - compare_primitives::(left, right) - } - (Utf8, Utf8) => compare_string(left, right), - (LargeUtf8, LargeUtf8) => compare_string(left, right), - ( - Dictionary(key_type_lhs, value_type_lhs), - Dictionary(key_type_rhs, value_type_rhs), - ) => { - if key_type_lhs != key_type_rhs || value_type_lhs != value_type_rhs { - return Err(ArrowError::InvalidArgumentError( - "Can't compare arrays of different types".to_string(), - )); - } - - let key_type_lhs = key_type_lhs.as_ref(); - downcast_primitive! 
{ - value_type_lhs.as_ref() => (cmp_dict_primitive_helper, key_type_lhs, left, right), - Utf8 => match key_type_lhs { - UInt8 => compare_dict_string::(left, right), - UInt16 => compare_dict_string::(left, right), - UInt32 => compare_dict_string::(left, right), - UInt64 => compare_dict_string::(left, right), - Int8 => compare_dict_string::(left, right), - Int16 => compare_dict_string::(left, right), - Int32 => compare_dict_string::(left, right), - Int64 => compare_dict_string::(left, right), - lhs => { - return Err(ArrowError::InvalidArgumentError(format!( - "Dictionaries do not support keys of type {lhs:?}" - ))); - } - }, - t => { - return Err(ArrowError::InvalidArgumentError(format!( - "Dictionaries of value data type {t:?} are not supported" - ))); - } - } - } + use arrow_schema::DataType::*; + macro_rules! primitive_helper { + ($t:ty, $left:expr, $right:expr) => { + Ok(compare_primitive::<$t>($left, $right)) + }; + } + downcast_primitive! { + left.data_type(), right.data_type() => (primitive_helper, left, right), + (Boolean, Boolean) => Ok(compare_boolean(left, right)), + (Utf8, Utf8) => Ok(compare_bytes::(left, right)), + (LargeUtf8, LargeUtf8) => Ok(compare_bytes::(left, right)), + (Binary, Binary) => Ok(compare_bytes::(left, right)), + (LargeBinary, LargeBinary) => Ok(compare_bytes::(left, right)), (FixedSizeBinary(_), FixedSizeBinary(_)) => { - let left: FixedSizeBinaryArray = left.to_data().into(); - let right: FixedSizeBinaryArray = right.to_data().into(); - - Box::new(move |i, j| left.value(i).cmp(right.value(j))) - } - (lhs, _) => { - return Err(ArrowError::InvalidArgumentError(format!( - "The data type type {lhs:?} has no natural order" - ))); - } - }) + let left = left.as_fixed_size_binary().clone(); + let right = right.as_fixed_size_binary().clone(); + Ok(Box::new(move |i, j| left.value(i).cmp(right.value(j)))) + }, + (Dictionary(l_key, _), Dictionary(r_key, _)) => { + macro_rules! dict_helper { + ($t:ty, $left:expr, $right:expr) => { + compare_dict::<$t>($left, $right) + }; + } + downcast_integer! 
{ + l_key.as_ref(), r_key.as_ref() => (dict_helper, left, right), + _ => unreachable!() + } + }, + (lhs, rhs) => Err(ArrowError::InvalidArgumentError(match lhs == rhs { + true => format!("The data type type {lhs:?} has no natural order"), + false => "Can't compare arrays of different types".to_string(), + })) + } } #[cfg(test)] pub mod tests { use super::*; use arrow_array::{FixedSizeBinaryArray, Float64Array, Int32Array}; - use arrow_buffer::i256; + use arrow_buffer::{i256, OffsetBuffer}; use half::f16; use std::cmp::Ordering; use std::sync::Arc; @@ -292,7 +150,7 @@ pub mod tests { let cmp = build_compare(&array, &array).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 1)); + assert_eq!(Ordering::Less, cmp(0, 1)); } #[test] @@ -304,7 +162,7 @@ pub mod tests { let cmp = build_compare(&array1, &array2).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 0)); + assert_eq!(Ordering::Less, cmp(0, 0)); } #[test] @@ -323,7 +181,7 @@ pub mod tests { let cmp = build_compare(&array1, &array2).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 0)); + assert_eq!(Ordering::Less, cmp(0, 0)); } #[test] @@ -332,7 +190,7 @@ pub mod tests { let cmp = build_compare(&array, &array).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 1)); + assert_eq!(Ordering::Less, cmp(0, 1)); } #[test] @@ -341,7 +199,7 @@ pub mod tests { let cmp = build_compare(&array, &array).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 1)); + assert_eq!(Ordering::Less, cmp(0, 1)); } #[test] @@ -350,8 +208,8 @@ pub mod tests { let cmp = build_compare(&array, &array).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 1)); - assert_eq!(Ordering::Equal, (cmp)(1, 1)); + assert_eq!(Ordering::Less, cmp(0, 1)); + assert_eq!(Ordering::Equal, cmp(1, 1)); } #[test] @@ -360,8 +218,8 @@ pub mod tests { let cmp = build_compare(&array, &array).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 1)); - assert_eq!(Ordering::Greater, (cmp)(1, 0)); + assert_eq!(Ordering::Less, cmp(0, 1)); + assert_eq!(Ordering::Greater, cmp(1, 0)); } #[test] @@ -373,8 +231,8 @@ pub mod tests { .unwrap(); let cmp = build_compare(&array, &array).unwrap(); - assert_eq!(Ordering::Less, (cmp)(1, 0)); - assert_eq!(Ordering::Greater, (cmp)(0, 2)); + assert_eq!(Ordering::Less, cmp(1, 0)); + assert_eq!(Ordering::Greater, cmp(0, 2)); } #[test] @@ -390,8 +248,8 @@ pub mod tests { .unwrap(); let cmp = build_compare(&array, &array).unwrap(); - assert_eq!(Ordering::Less, (cmp)(1, 0)); - assert_eq!(Ordering::Greater, (cmp)(0, 2)); + assert_eq!(Ordering::Less, cmp(1, 0)); + assert_eq!(Ordering::Greater, cmp(0, 2)); } #[test] @@ -401,9 +259,9 @@ pub mod tests { let cmp = build_compare(&array, &array).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 1)); - assert_eq!(Ordering::Equal, (cmp)(3, 4)); - assert_eq!(Ordering::Greater, (cmp)(2, 3)); + assert_eq!(Ordering::Less, cmp(0, 1)); + assert_eq!(Ordering::Equal, cmp(3, 4)); + assert_eq!(Ordering::Greater, cmp(2, 3)); } #[test] @@ -415,9 +273,9 @@ pub mod tests { let cmp = build_compare(&a1, &a2).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 0)); - assert_eq!(Ordering::Equal, (cmp)(0, 3)); - assert_eq!(Ordering::Greater, (cmp)(1, 3)); + assert_eq!(Ordering::Less, cmp(0, 0)); + assert_eq!(Ordering::Equal, cmp(0, 3)); + assert_eq!(Ordering::Greater, cmp(1, 3)); } #[test] @@ -432,11 +290,11 @@ pub mod tests { let cmp = build_compare(&array1, &array2).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 0)); - assert_eq!(Ordering::Less, (cmp)(0, 3)); - assert_eq!(Ordering::Equal, (cmp)(3, 3)); - assert_eq!(Ordering::Greater, (cmp)(3, 1)); - 
assert_eq!(Ordering::Greater, (cmp)(3, 2)); + assert_eq!(Ordering::Less, cmp(0, 0)); + assert_eq!(Ordering::Less, cmp(0, 3)); + assert_eq!(Ordering::Equal, cmp(3, 3)); + assert_eq!(Ordering::Greater, cmp(3, 1)); + assert_eq!(Ordering::Greater, cmp(3, 2)); } #[test] @@ -451,11 +309,11 @@ pub mod tests { let cmp = build_compare(&array1, &array2).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 0)); - assert_eq!(Ordering::Less, (cmp)(0, 3)); - assert_eq!(Ordering::Equal, (cmp)(3, 3)); - assert_eq!(Ordering::Greater, (cmp)(3, 1)); - assert_eq!(Ordering::Greater, (cmp)(3, 2)); + assert_eq!(Ordering::Less, cmp(0, 0)); + assert_eq!(Ordering::Less, cmp(0, 3)); + assert_eq!(Ordering::Equal, cmp(3, 3)); + assert_eq!(Ordering::Greater, cmp(3, 1)); + assert_eq!(Ordering::Greater, cmp(3, 2)); } #[test] @@ -470,11 +328,11 @@ pub mod tests { let cmp = build_compare(&array1, &array2).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 0)); - assert_eq!(Ordering::Less, (cmp)(0, 3)); - assert_eq!(Ordering::Equal, (cmp)(3, 3)); - assert_eq!(Ordering::Greater, (cmp)(3, 1)); - assert_eq!(Ordering::Greater, (cmp)(3, 2)); + assert_eq!(Ordering::Less, cmp(0, 0)); + assert_eq!(Ordering::Less, cmp(0, 3)); + assert_eq!(Ordering::Equal, cmp(3, 3)); + assert_eq!(Ordering::Greater, cmp(3, 1)); + assert_eq!(Ordering::Greater, cmp(3, 2)); } #[test] @@ -489,11 +347,11 @@ pub mod tests { let cmp = build_compare(&array1, &array2).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 0)); - assert_eq!(Ordering::Less, (cmp)(0, 3)); - assert_eq!(Ordering::Equal, (cmp)(3, 3)); - assert_eq!(Ordering::Greater, (cmp)(3, 1)); - assert_eq!(Ordering::Greater, (cmp)(3, 2)); + assert_eq!(Ordering::Less, cmp(0, 0)); + assert_eq!(Ordering::Less, cmp(0, 3)); + assert_eq!(Ordering::Equal, cmp(3, 3)); + assert_eq!(Ordering::Greater, cmp(3, 1)); + assert_eq!(Ordering::Greater, cmp(3, 2)); } #[test] @@ -508,11 +366,11 @@ pub mod tests { let cmp = build_compare(&array1, &array2).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 0)); - assert_eq!(Ordering::Less, (cmp)(0, 3)); - assert_eq!(Ordering::Equal, (cmp)(3, 3)); - assert_eq!(Ordering::Greater, (cmp)(3, 1)); - assert_eq!(Ordering::Greater, (cmp)(3, 2)); + assert_eq!(Ordering::Less, cmp(0, 0)); + assert_eq!(Ordering::Less, cmp(0, 3)); + assert_eq!(Ordering::Equal, cmp(3, 3)); + assert_eq!(Ordering::Greater, cmp(3, 1)); + assert_eq!(Ordering::Greater, cmp(3, 2)); } #[test] @@ -527,11 +385,11 @@ pub mod tests { let cmp = build_compare(&array1, &array2).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 0)); - assert_eq!(Ordering::Less, (cmp)(0, 3)); - assert_eq!(Ordering::Equal, (cmp)(3, 3)); - assert_eq!(Ordering::Greater, (cmp)(3, 1)); - assert_eq!(Ordering::Greater, (cmp)(3, 2)); + assert_eq!(Ordering::Less, cmp(0, 0)); + assert_eq!(Ordering::Less, cmp(0, 3)); + assert_eq!(Ordering::Equal, cmp(3, 3)); + assert_eq!(Ordering::Greater, cmp(3, 1)); + assert_eq!(Ordering::Greater, cmp(3, 2)); } #[test] @@ -556,10 +414,28 @@ pub mod tests { let cmp = build_compare(&array1, &array2).unwrap(); - assert_eq!(Ordering::Less, (cmp)(0, 0)); - assert_eq!(Ordering::Less, (cmp)(0, 3)); - assert_eq!(Ordering::Equal, (cmp)(3, 3)); - assert_eq!(Ordering::Greater, (cmp)(3, 1)); - assert_eq!(Ordering::Greater, (cmp)(3, 2)); + assert_eq!(Ordering::Less, cmp(0, 0)); + assert_eq!(Ordering::Less, cmp(0, 3)); + assert_eq!(Ordering::Equal, cmp(3, 3)); + assert_eq!(Ordering::Greater, cmp(3, 1)); + assert_eq!(Ordering::Greater, cmp(3, 2)); + } + + fn test_bytes_impl() { + let offsets = OffsetBuffer::from_lengths([3, 3, 1]); + let a 
= GenericByteArray::::new(offsets, b"abcdefa".into(), None); + let cmp = build_compare(&a, &a).unwrap(); + + assert_eq!(Ordering::Less, cmp(0, 1)); + assert_eq!(Ordering::Greater, cmp(0, 2)); + assert_eq!(Ordering::Equal, cmp(1, 1)); + } + + #[test] + fn test_bytes() { + test_bytes_impl::(); + test_bytes_impl::(); + test_bytes_impl::(); + test_bytes_impl::(); } } From cd0e5513c99e0d9af2a7c3307ad3d3189175df59 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 14 Aug 2023 10:09:21 +0100 Subject: [PATCH 1155/1411] Add AzureConfigKey::ContainerName (#4629) (#4686) --- object_store/src/azure/mod.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 6bb4cdad1bb0..27bbbfb64d3f 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -478,6 +478,13 @@ pub enum AzureConfigKey { /// - `use_azure_cli` UseAzureCli, + /// Container name + /// + /// Supported keys: + /// - `azure_container_name` + /// - `container_name` + ContainerName, + /// Client options Client(ClientConfigKey), } @@ -499,6 +506,7 @@ impl AsRef for AzureConfigKey { Self::MsiResourceId => "azure_msi_resource_id", Self::FederatedTokenFile => "azure_federated_token_file", Self::UseAzureCli => "azure_use_azure_cli", + Self::ContainerName => "azure_container_name", Self::Client(key) => key.as_ref(), } } @@ -547,6 +555,7 @@ impl FromStr for AzureConfigKey { Ok(Self::UseFabricEndpoint) } "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), + "azure_container_name" | "container_name" => Ok(Self::ContainerName), // Backwards compatibility "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), _ => match s.parse() { @@ -663,6 +672,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::Client(key) => { self.client_options = self.client_options.with_config(key, value) } + AzureConfigKey::ContainerName => self.container_name = Some(value.into()), }; self } @@ -722,6 +732,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::FederatedTokenFile => self.federated_token_file.clone(), AzureConfigKey::UseAzureCli => Some(self.use_azure_cli.to_string()), AzureConfigKey::Client(key) => self.client_options.get_config_value(key), + AzureConfigKey::ContainerName => self.container_name.clone(), } } @@ -1084,9 +1095,7 @@ mod tests { #[tokio::test] async fn azure_blob_test() { crate::test_util::maybe_skip_integration!(); - let container_name = std::env::var("AZURE_CONTAINER_NAME").unwrap(); // (#4629) - let config = MicrosoftAzureBuilder::from_env(); - let integration = config.with_container_name(container_name).build().unwrap(); + let integration = MicrosoftAzureBuilder::from_env().build().unwrap(); put_get_delete_list_opts(&integration, false).await; get_opts(&integration).await; From 820e40a27863f3eb8b1e95856107c2c0e4d81722 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 14 Aug 2023 12:11:43 +0100 Subject: [PATCH 1156/1411] Add range and ObjectMeta to GetResult (#4352) (#4495) (#4677) * Add range and ObjectMeta to GetResult (#4352) (#4495) * Review feedback * Fix docs --- object_store/src/chunked.rs | 126 ++++++++++++++------------------- object_store/src/client/get.rs | 15 +++- object_store/src/http/mod.rs | 19 ++++- object_store/src/lib.rs | 103 ++++++++++----------------- object_store/src/limit.rs | 30 ++++---- object_store/src/local.rs | 80 ++++++++++++++++----- object_store/src/memory.rs | 50 
++++++++----- object_store/src/throttle.rs | 27 ++++--- 8 files changed, 252 insertions(+), 198 deletions(-) diff --git a/object_store/src/chunked.rs b/object_store/src/chunked.rs index c639d7e89812..008dec679413 100644 --- a/object_store/src/chunked.rs +++ b/object_store/src/chunked.rs @@ -18,7 +18,6 @@ //! A [`ChunkedStore`] that can be used to test streaming behaviour use std::fmt::{Debug, Display, Formatter}; -use std::io::{BufReader, Read}; use std::ops::Range; use std::sync::Arc; @@ -29,8 +28,9 @@ use futures::StreamExt; use tokio::io::AsyncWrite; use crate::path::Path; -use crate::util::maybe_spawn_blocking; -use crate::{GetOptions, GetResult, ListResult, ObjectMeta, ObjectStore}; +use crate::{ + GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, +}; use crate::{MultipartId, Result}; /// Wraps a [`ObjectStore`] and makes its get response return chunks @@ -82,77 +82,57 @@ impl ObjectStore for ChunkedStore { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - match self.inner.get_opts(location, options).await? { - GetResult::File(std_file, ..) => { - let reader = BufReader::new(std_file); - let chunk_size = self.chunk_size; - Ok(GetResult::Stream( - futures::stream::try_unfold(reader, move |mut reader| async move { - let (r, out, reader) = maybe_spawn_blocking(move || { - let mut out = Vec::with_capacity(chunk_size); - let r = (&mut reader) - .take(chunk_size as u64) - .read_to_end(&mut out) - .map_err(|err| crate::Error::Generic { - store: "ChunkedStore", - source: Box::new(err), - })?; - Ok((r, out, reader)) - }) - .await?; - - match r { - 0 => Ok(None), - _ => Ok(Some((out.into(), reader))), - } - }) - .boxed(), - )) + let r = self.inner.get_opts(location, options).await?; + let stream = match r.payload { + GetResultPayload::File(file, path) => { + crate::local::chunked_stream(file, path, r.range.clone(), self.chunk_size) } - GetResult::Stream(stream) => { + GetResultPayload::Stream(stream) => { let buffer = BytesMut::new(); - Ok(GetResult::Stream( - futures::stream::unfold( - (stream, buffer, false, self.chunk_size), - |(mut stream, mut buffer, mut exhausted, chunk_size)| async move { - // Keep accumulating bytes until we reach capacity as long as - // the stream can provide them: - if exhausted { - return None; - } - while buffer.len() < chunk_size { - match stream.next().await { - None => { - exhausted = true; - let slice = buffer.split_off(0).freeze(); - return Some(( - Ok(slice), - (stream, buffer, exhausted, chunk_size), - )); - } - Some(Ok(bytes)) => { - buffer.put(bytes); - } - Some(Err(e)) => { - return Some(( - Err(crate::Error::Generic { - store: "ChunkedStore", - source: Box::new(e), - }), - (stream, buffer, exhausted, chunk_size), - )) - } - }; - } - // Return the chunked values as the next value in the stream - let slice = buffer.split_to(chunk_size).freeze(); - Some((Ok(slice), (stream, buffer, exhausted, chunk_size))) - }, - ) - .boxed(), - )) + futures::stream::unfold( + (stream, buffer, false, self.chunk_size), + |(mut stream, mut buffer, mut exhausted, chunk_size)| async move { + // Keep accumulating bytes until we reach capacity as long as + // the stream can provide them: + if exhausted { + return None; + } + while buffer.len() < chunk_size { + match stream.next().await { + None => { + exhausted = true; + let slice = buffer.split_off(0).freeze(); + return Some(( + Ok(slice), + (stream, buffer, exhausted, chunk_size), + )); + } + Some(Ok(bytes)) => { + buffer.put(bytes); + } + Some(Err(e)) => { + return 
Some(( + Err(crate::Error::Generic { + store: "ChunkedStore", + source: Box::new(e), + }), + (stream, buffer, exhausted, chunk_size), + )) + } + }; + } + // Return the chunked values as the next value in the stream + let slice = buffer.split_to(chunk_size).freeze(); + Some((Ok(slice), (stream, buffer, exhausted, chunk_size))) + }, + ) + .boxed() } - } + }; + Ok(GetResult { + payload: GetResultPayload::Stream(stream), + ..r + }) } async fn get_range(&self, location: &Path, range: Range) -> Result { @@ -217,8 +197,8 @@ mod tests { for chunk_size in [10, 20, 31] { let store = ChunkedStore::new(Arc::clone(&store), chunk_size); - let mut s = match store.get(&location).await.unwrap() { - GetResult::Stream(s) => s, + let mut s = match store.get(&location).await.unwrap().payload { + GetResultPayload::Stream(s) => s, _ => unreachable!(), }; diff --git a/object_store/src/client/get.rs b/object_store/src/client/get.rs index 3c66a72d82ed..6b2d60ae565f 100644 --- a/object_store/src/client/get.rs +++ b/object_store/src/client/get.rs @@ -17,8 +17,8 @@ use crate::client::header::header_meta; use crate::path::Path; -use crate::Result; use crate::{Error, GetOptions, GetResult, ObjectMeta}; +use crate::{GetResultPayload, Result}; use async_trait::async_trait; use futures::{StreamExt, TryStreamExt}; use reqwest::Response; @@ -47,7 +47,14 @@ pub trait GetClientExt { #[async_trait] impl GetClientExt for T { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let range = options.range.clone(); let response = self.get_request(location, options, false).await?; + let meta = + header_meta(location, response.headers()).map_err(|e| Error::Generic { + store: T::STORE, + source: Box::new(e), + })?; + let stream = response .bytes_stream() .map_err(|source| Error::Generic { @@ -56,7 +63,11 @@ impl GetClientExt for T { }) .boxed(); - Ok(GetResult::Stream(stream)) + Ok(GetResult { + range: range.unwrap_or(0..meta.size), + payload: GetResultPayload::Stream(stream), + meta, + }) } async fn head(&self, location: &Path) -> Result { diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index 6927f1b883be..e8e7b459e12f 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -40,11 +40,12 @@ use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; +use crate::client::header::header_meta; use crate::http::client::Client; use crate::path::Path; use crate::{ - ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, - ObjectMeta, ObjectStore, Result, RetryConfig, + ClientConfigKey, ClientOptions, GetOptions, GetResult, GetResultPayload, ListResult, + MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, }; mod client; @@ -60,6 +61,11 @@ enum Error { url: String, }, + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, + #[snafu(display("Request error: {}", source))] Reqwest { source: reqwest::Error }, } @@ -109,13 +115,20 @@ impl ObjectStore for HttpStore { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let range = options.range.clone(); let response = self.client.get(location, options).await?; + let meta = header_meta(location, response.headers()).context(MetadataSnafu)?; + let stream = response .bytes_stream() .map_err(|source| Error::Reqwest { source }.into()) .boxed(); - Ok(GetResult::Stream(stream)) + Ok(GetResult { + payload: GetResultPayload::Stream(stream), + range: range.unwrap_or(0..meta.size), 
+ meta, + }) } async fn head(&self, location: &Path) -> Result { diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index cf7e47998aa2..7496b589cd8a 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -374,8 +374,6 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { } /// Perform a get request with options - /// - /// Note: options.range will be ignored if [`GetResult::File`] async fn get_opts(&self, location: &Path, options: GetOptions) -> Result; /// Return the bytes that are stored at the specified location @@ -385,17 +383,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { range: Some(range.clone()), ..Default::default() }; - // Temporary until GetResult::File supports range (#4352) - match self.get_opts(location, options).await? { - GetResult::Stream(s) => collect_bytes(s, None).await, - #[cfg(not(target_arch = "wasm32"))] - GetResult::File(mut file, path) => { - maybe_spawn_blocking(move || local::read_range(&mut file, &path, range)) - .await - } - #[cfg(target_arch = "wasm32")] - _ => unimplemented!("File IO not implemented on wasm32."), - } + self.get_opts(location, options).await?.bytes().await } /// Return the bytes that are stored at the specified location @@ -751,21 +739,32 @@ impl GetOptions { } /// Result for a get request +#[derive(Debug)] +pub struct GetResult { + /// The [`GetResultPayload`] + pub payload: GetResultPayload, + /// The [`ObjectMeta`] for this object + pub meta: ObjectMeta, + /// The range of bytes returned by this request + pub range: Range, +} + +/// The kind of a [`GetResult`] /// /// This special cases the case of a local file, as some systems may /// be able to optimise the case of a file already present on local disk -pub enum GetResult { - /// A file and its path on the local filesystem +pub enum GetResultPayload { + /// The file, path File(std::fs::File, std::path::PathBuf), - /// An asynchronous stream + /// An opaque stream of bytes Stream(BoxStream<'static, Result>), } -impl Debug for GetResult { +impl Debug for GetResultPayload { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { - Self::File(_, _) => write!(f, "GetResult(File)"), - Self::Stream(_) => write!(f, "GetResult(Stream)"), + Self::File(_, _) => write!(f, "GetResultPayload(File)"), + Self::Stream(_) => write!(f, "GetResultPayload(Stream)"), } } } @@ -773,32 +772,31 @@ impl Debug for GetResult { impl GetResult { /// Collects the data into a [`Bytes`] pub async fn bytes(self) -> Result { - match self { + let len = self.range.end - self.range.start; + match self.payload { #[cfg(not(target_arch = "wasm32"))] - Self::File(mut file, path) => { + GetResultPayload::File(mut file, path) => { maybe_spawn_blocking(move || { - let len = file.seek(SeekFrom::End(0)).map_err(|source| { - local::Error::Seek { + file.seek(SeekFrom::Start(self.range.start as _)).map_err( + |source| local::Error::Seek { source, path: path.clone(), - } - })?; - - file.rewind().map_err(|source| local::Error::Seek { - source, - path: path.clone(), - })?; + }, + )?; - let mut buffer = Vec::with_capacity(len as usize); - file.read_to_end(&mut buffer).map_err(|source| { - local::Error::UnableToReadBytes { source, path } - })?; + let mut buffer = Vec::with_capacity(len); + file.take(len as _) + .read_to_end(&mut buffer) + .map_err(|source| local::Error::UnableToReadBytes { + source, + path, + })?; Ok(buffer.into()) }) .await } - Self::Stream(s) => collect_bytes(s, None).await, + GetResultPayload::Stream(s) => collect_bytes(s, 
Some(len)).await, #[cfg(target_arch = "wasm32")] _ => unimplemented!("File IO not implemented on wasm32."), } @@ -806,8 +804,8 @@ impl GetResult { /// Converts this into a byte stream /// - /// If the result is [`Self::File`] will perform chunked reads of the file, otherwise - /// will return the [`Self::Stream`]. + /// If the `self.kind` is [`GetResultPayload::File`] will perform chunked reads of the file, + /// otherwise will return the [`GetResultPayload::Stream`]. /// /// # Tokio Compatibility /// @@ -819,36 +817,13 @@ impl GetResult { /// If not called from a tokio context, this will perform IO on the current thread with /// no additional complexity or overheads pub fn into_stream(self) -> BoxStream<'static, Result> { - match self { + match self.payload { #[cfg(not(target_arch = "wasm32"))] - Self::File(file, path) => { + GetResultPayload::File(file, path) => { const CHUNK_SIZE: usize = 8 * 1024; - - futures::stream::try_unfold( - (file, path, false), - |(mut file, path, finished)| { - maybe_spawn_blocking(move || { - if finished { - return Ok(None); - } - - let mut buffer = Vec::with_capacity(CHUNK_SIZE); - let read = file - .by_ref() - .take(CHUNK_SIZE as u64) - .read_to_end(&mut buffer) - .map_err(|e| local::Error::UnableToReadBytes { - source: e, - path: path.clone(), - })?; - - Ok(Some((buffer.into(), (file, path, read != CHUNK_SIZE)))) - }) - }, - ) - .boxed() + local::chunked_stream(file, path, self.range, CHUNK_SIZE) } - Self::Stream(s) => s, + GetResultPayload::Stream(s) => s, #[cfg(target_arch = "wasm32")] _ => unimplemented!("File IO not implemented on wasm32."), } diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index 630fd145b72c..a9b8c4b05020 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -18,8 +18,8 @@ //! An object store that limits the maximum concurrency of the wrapped implementation use crate::{ - BoxStream, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, - Path, Result, StreamExt, + BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, + ObjectMeta, ObjectStore, Path, Result, StreamExt, }; use async_trait::async_trait; use bytes::Bytes; @@ -106,22 +106,14 @@ impl ObjectStore for LimitStore { async fn get(&self, location: &Path) -> Result { let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); - match self.inner.get(location).await? { - r @ GetResult::File(_, _) => Ok(r), - GetResult::Stream(s) => { - Ok(GetResult::Stream(PermitWrapper::new(s, permit).boxed())) - } - } + let r = self.inner.get(location).await?; + Ok(permit_get_result(r, permit)) } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); - match self.inner.get_opts(location, options).await? 
{ - r @ GetResult::File(_, _) => Ok(r), - GetResult::Stream(s) => { - Ok(GetResult::Stream(PermitWrapper::new(s, permit).boxed())) - } - } + let r = self.inner.get_opts(location, options).await?; + Ok(permit_get_result(r, permit)) } async fn get_range(&self, location: &Path, range: Range) -> Result { @@ -200,6 +192,16 @@ impl ObjectStore for LimitStore { } } +fn permit_get_result(r: GetResult, permit: OwnedSemaphorePermit) -> GetResult { + let payload = match r.payload { + v @ GetResultPayload::File(_, _) => v, + GetResultPayload::Stream(s) => { + GetResultPayload::Stream(PermitWrapper::new(s, permit).boxed()) + } + }; + GetResult { payload, ..r } +} + /// Combines an [`OwnedSemaphorePermit`] with some other type struct PermitWrapper { inner: T, diff --git a/object_store/src/local.rs b/object_store/src/local.rs index a0933cc6177d..4d57ef1b79e1 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -19,16 +19,17 @@ use crate::{ maybe_spawn_blocking, path::{absolute_path_to_url, Path}, - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, + ObjectStore, Result, }; use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::future::BoxFuture; -use futures::FutureExt; use futures::{stream::BoxStream, StreamExt}; +use futures::{FutureExt, TryStreamExt}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; -use std::fs::{metadata, symlink_metadata, File, OpenOptions}; +use std::fs::{metadata, symlink_metadata, File, Metadata, OpenOptions}; use std::io::{ErrorKind, Read, Seek, SeekFrom, Write}; use std::ops::Range; use std::pin::Pin; @@ -370,18 +371,20 @@ impl ObjectStore for LocalFileSystem { let location = location.clone(); let path = self.config.path_to_filesystem(&location)?; maybe_spawn_blocking(move || { - let file = open_file(&path)?; + let (file, metadata) = open_file(&path)?; if options.if_unmodified_since.is_some() || options.if_modified_since.is_some() { - let metadata = file.metadata().map_err(|e| Error::Metadata { - source: e.into(), - path: location.to_string(), - })?; options.check_modified(&location, last_modified(&metadata))?; } - Ok(GetResult::File(file, path)) + let meta = convert_metadata(metadata, location)?; + + Ok(GetResult { + payload: GetResultPayload::File(file, path), + range: options.range.unwrap_or(0..meta.size), + meta, + }) }) .await } @@ -389,7 +392,7 @@ impl ObjectStore for LocalFileSystem { async fn get_range(&self, location: &Path, range: Range) -> Result { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { - let mut file = open_file(&path)?; + let (mut file, _) = open_file(&path)?; read_range(&mut file, &path, range) }) .await @@ -404,7 +407,7 @@ impl ObjectStore for LocalFileSystem { let ranges = ranges.to_vec(); maybe_spawn_blocking(move || { // Vectored IO might be faster - let mut file = open_file(&path)?; + let (mut file, _) = open_file(&path)?; ranges .into_iter() .map(|r| read_range(&mut file, &path, r)) @@ -863,6 +866,51 @@ impl AsyncWrite for LocalUpload { } } +pub(crate) fn chunked_stream( + mut file: File, + path: PathBuf, + range: Range, + chunk_size: usize, +) -> BoxStream<'static, Result> { + futures::stream::once(async move { + let (file, path) = maybe_spawn_blocking(move || { + file.seek(SeekFrom::Start(range.start as _)) + .map_err(|source| Error::Seek { + source, + path: path.clone(), + })?; + Ok((file, path)) + }) + .await?; + + let stream = 
futures::stream::try_unfold( + (file, path, range.end - range.start), + move |(mut file, path, remaining)| { + maybe_spawn_blocking(move || { + if remaining == 0 { + return Ok(None); + } + + let to_read = remaining.min(chunk_size); + let mut buffer = Vec::with_capacity(to_read); + let read = (&mut file) + .take(to_read as u64) + .read_to_end(&mut buffer) + .map_err(|e| Error::UnableToReadBytes { + source: e, + path: path.clone(), + })?; + + Ok(Some((buffer.into(), (file, path, remaining - read)))) + }) + }, + ); + Ok::<_, super::Error>(stream) + }) + .try_flatten() + .boxed() +} + pub(crate) fn read_range( file: &mut File, path: &PathBuf, @@ -889,8 +937,8 @@ pub(crate) fn read_range( Ok(buf.into()) } -fn open_file(path: &PathBuf) -> Result { - let file = match File::open(path).and_then(|f| Ok((f.metadata()?, f))) { +fn open_file(path: &PathBuf) -> Result<(File, Metadata)> { + let ret = match File::open(path).and_then(|f| Ok((f.metadata()?, f))) { Err(e) => Err(match e.kind() { ErrorKind::NotFound => Error::NotFound { path: path.clone(), @@ -902,14 +950,14 @@ fn open_file(path: &PathBuf) -> Result { }, }), Ok((metadata, file)) => match !metadata.is_dir() { - true => Ok(file), + true => Ok((file, metadata)), false => Err(Error::NotFound { path: path.clone(), source: io::Error::new(ErrorKind::NotFound, "is directory"), }), }, }?; - Ok(file) + Ok(ret) } fn convert_entry(entry: DirEntry, location: Path) -> Result { @@ -927,7 +975,7 @@ fn last_modified(metadata: &std::fs::Metadata) -> DateTime { .into() } -fn convert_metadata(metadata: std::fs::Metadata, location: Path) -> Result { +fn convert_metadata(metadata: Metadata, location: Path) -> Result { let last_modified = last_modified(&metadata); let size = usize::try_from(metadata.len()).context(FileSizeOverflowedUsizeSnafu { path: location.as_ref(), diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index cfc2ac823036..1e8e3c1fd005 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -16,7 +16,9 @@ // under the License. //! An in-memory object store implementation -use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; +use crate::{ + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, Result, +}; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; use bytes::Bytes; @@ -43,11 +45,13 @@ enum Error { #[snafu(display("No data in memory found. 
Location: {path}"))] NoDataInMemory { path: String }, - #[snafu(display("Out of range"))] - OutOfRange, + #[snafu(display( + "Requested range {}..{} is out of bounds for object with length {}", range.start, range.end, len + ))] + OutOfRange { range: Range, len: usize }, - #[snafu(display("Bad range"))] - BadRange, + #[snafu(display("Invalid range: {}..{}", range.start, range.end))] + BadRange { range: Range }, #[snafu(display("Object already exists at that location: {path}"))] AlreadyExists { path: String }, @@ -136,17 +140,29 @@ impl ObjectStore for InMemory { } let (data, last_modified) = self.entry(location).await?; options.check_modified(location, last_modified)?; + let meta = ObjectMeta { + location: location.clone(), + last_modified, + size: data.len(), + e_tag: None, + }; + let (range, data) = match options.range { + Some(range) => { + let len = data.len(); + ensure!(range.end <= len, OutOfRangeSnafu { range, len }); + ensure!(range.start <= range.end, BadRangeSnafu { range }); + (range.clone(), data.slice(range)) + } + None => (0..data.len(), data), + }; let stream = futures::stream::once(futures::future::ready(Ok(data))); - Ok(GetResult::Stream(stream.boxed())) - } - - async fn get_range(&self, location: &Path, range: Range) -> Result { - let data = self.entry(location).await?; - ensure!(range.end <= data.0.len(), OutOfRangeSnafu); - ensure!(range.start <= range.end, BadRangeSnafu); - Ok(data.0.slice(range)) + Ok(GetResult { + payload: GetResultPayload::Stream(stream.boxed()), + meta, + range, + }) } async fn get_ranges( @@ -158,9 +174,11 @@ impl ObjectStore for InMemory { ranges .iter() .map(|range| { - ensure!(range.end <= data.0.len(), OutOfRangeSnafu); - ensure!(range.start <= range.end, BadRangeSnafu); - Ok(data.0.slice(range.clone())) + let range = range.clone(); + let len = data.0.len(); + ensure!(range.end <= data.0.len(), OutOfRangeSnafu { range, len }); + ensure!(range.start <= range.end, BadRangeSnafu { range }); + Ok(data.0.slice(range)) }) .collect() } diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index fb90afcec9fb..58c476ab4530 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -20,7 +20,9 @@ use parking_lot::Mutex; use std::ops::Range; use std::{convert::TryInto, sync::Arc}; -use crate::{path::Path, GetResult, ListResult, ObjectMeta, ObjectStore, Result}; +use crate::{ + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, Result, +}; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; use bytes::Bytes; @@ -301,15 +303,20 @@ fn usize_to_u32_saturate(x: usize) -> u32 { } fn throttle_get(result: GetResult, wait_get_per_byte: Duration) -> GetResult { - let s = match result { - GetResult::Stream(s) => s, - GetResult::File(_, _) => unimplemented!(), + let s = match result.payload { + GetResultPayload::Stream(s) => s, + GetResultPayload::File(_, _) => unimplemented!(), }; - GetResult::Stream(throttle_stream(s, move |bytes| { + let stream = throttle_stream(s, move |bytes| { let bytes_len: u32 = usize_to_u32_saturate(bytes.len()); wait_get_per_byte * bytes_len - })) + }); + + GetResult { + payload: GetResultPayload::Stream(stream), + ..result + } } fn throttle_stream( @@ -330,7 +337,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::{memory::InMemory, tests::*}; + use crate::{memory::InMemory, tests::*, GetResultPayload}; use bytes::Bytes; use futures::TryStreamExt; use tokio::time::Duration; @@ -550,9 +557,9 @@ mod tests { let res = store.get(&path).await; if 
n_bytes.is_some() { // need to consume bytes to provoke sleep times - let s = match res.unwrap() { - GetResult::Stream(s) => s, - GetResult::File(_, _) => unimplemented!(), + let s = match res.unwrap().payload { + GetResultPayload::Stream(s) => s, + GetResultPayload::File(_, _) => unimplemented!(), }; s.map_ok(|b| bytes::BytesMut::from(&b[..])) From 979a070dc82eeb26b38a8651cac879b2c276c0ed Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 14 Aug 2023 20:02:45 +0100 Subject: [PATCH 1157/1411] Logical Nullability (#4691) --- arrow-arith/src/arity.rs | 13 ++-- arrow-arith/src/boolean.rs | 14 +--- arrow-array/src/array/boolean_array.rs | 7 +- arrow-array/src/array/dictionary_array.rs | 49 +++++++++++++ .../src/array/fixed_size_list_array.rs | 6 +- arrow-array/src/array/list_array.rs | 4 +- arrow-array/src/array/mod.rs | 60 +++++++++++++++- arrow-array/src/array/null_array.rs | 33 ++++----- arrow-array/src/array/run_array.rs | 69 ++++++++++++++++++- arrow-array/src/array/struct_array.rs | 19 +++-- arrow-array/src/builder/null_builder.rs | 9 +-- arrow-array/src/iterator.rs | 16 ++++- arrow-buffer/src/builder/boolean.rs | 6 ++ arrow-cast/src/cast.rs | 5 +- arrow-ord/src/comparison.rs | 14 ++++ arrow-ord/src/sort.rs | 44 +++++++++--- arrow-string/src/like.rs | 5 +- parquet/src/arrow/arrow_writer/levels.rs | 34 ++++++++- parquet/src/arrow/arrow_writer/mod.rs | 2 +- 19 files changed, 333 insertions(+), 76 deletions(-) diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index 2dac33a4f28b..fdfb26f7f72a 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -198,7 +198,7 @@ where return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE))); } - let nulls = NullBuffer::union(a.nulls(), b.nulls()); + let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r)); // JUSTIFICATION @@ -248,7 +248,7 @@ where )))); } - let nulls = NullBuffer::union(a.nulls(), b.nulls()); + let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); let mut builder = a.into_builder()?; @@ -296,7 +296,9 @@ where if a.null_count() == 0 && b.null_count() == 0 { try_binary_no_nulls(len, a, b, op) } else { - let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap(); + let nulls = + NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()) + .unwrap(); let mut buffer = BufferBuilder::::new(len); buffer.append_n_zeroed(len); @@ -355,7 +357,10 @@ where if a.null_count() == 0 && b.null_count() == 0 { try_binary_no_nulls_mut(len, a, b, op) } else { - let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap(); + let nulls = + NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()) + .unwrap(); + let mut builder = a.into_builder()?; let slice = builder.values_slice_mut(); diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index 61e591d51634..46e5998208f1 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -25,7 +25,7 @@ use arrow_array::*; use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper}; use arrow_buffer::{BooleanBuffer, NullBuffer}; -use arrow_schema::{ArrowError, DataType}; +use arrow_schema::ArrowError; /// Logical 'and' boolean values with Kleene logic /// @@ -311,11 +311,7 @@ pub fn not(left: &BooleanArray) -> Result { /// assert_eq!(a_is_null, BooleanArray::from(vec![false, false, true])); /// ``` pub fn 
is_null(input: &dyn Array) -> Result { - let values = match input.nulls() { - // NullArray has no nulls buffer yet all values are null - None if input.data_type() == &DataType::Null => { - BooleanBuffer::new_set(input.len()) - } + let values = match input.logical_nulls() { None => BooleanBuffer::new_unset(input.len()), Some(nulls) => !nulls.inner(), }; @@ -335,11 +331,7 @@ pub fn is_null(input: &dyn Array) -> Result { /// assert_eq!(a_is_not_null, BooleanArray::from(vec![true, true, false])); /// ``` pub fn is_not_null(input: &dyn Array) -> Result { - let values = match input.nulls() { - // NullArray has no nulls buffer yet all values are null - None if input.data_type() == &DataType::Null => { - BooleanBuffer::new_unset(input.len()) - } + let values = match input.logical_nulls() { None => BooleanBuffer::new_set(input.len()), Some(n) => n.inner().clone(), }; diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 14fa87e138eb..995bb7d510d9 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -205,7 +205,7 @@ impl BooleanArray { where F: FnMut(T::Item) -> bool, { - let nulls = left.nulls().cloned(); + let nulls = left.logical_nulls(); let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe { // SAFETY: i in range 0..len op(left.value_unchecked(i)) @@ -239,7 +239,10 @@ impl BooleanArray { { assert_eq!(left.len(), right.len()); - let nulls = NullBuffer::union(left.nulls(), right.nulls()); + let nulls = NullBuffer::union( + left.logical_nulls().as_ref(), + right.logical_nulls().as_ref(), + ); let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe { // SAFETY: i in range 0..len op(left.value_unchecked(i), right.value_unchecked(i)) diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 5a2f439a8e0f..2d80c75f073a 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -729,6 +729,31 @@ impl Array for DictionaryArray { self.keys.nulls() } + fn logical_nulls(&self) -> Option { + match self.values.nulls() { + None => self.nulls().cloned(), + Some(value_nulls) => { + let mut builder = BooleanBufferBuilder::new(self.len()); + match self.keys.nulls() { + Some(n) => builder.append_buffer(n.inner()), + None => builder.append_n(self.len(), true), + } + for (idx, k) in self.keys.values().iter().enumerate() { + let k = k.as_usize(); + // Check range to allow for nulls + if k < value_nulls.len() && value_nulls.is_null(k) { + builder.set_bit(idx, false); + } + } + Some(builder.finish().into()) + } + } + } + + fn is_nullable(&self) -> bool { + !self.is_empty() && (self.nulls().is_some() || self.values.is_nullable()) + } + fn get_buffer_memory_size(&self) -> usize { self.keys.get_buffer_memory_size() + self.values.get_buffer_memory_size() } @@ -843,6 +868,14 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a, self.dictionary.nulls() } + fn logical_nulls(&self) -> Option { + self.dictionary.logical_nulls() + } + + fn is_nullable(&self) -> bool { + self.dictionary.is_nullable() + } + fn get_buffer_memory_size(&self) -> usize { self.dictionary.get_buffer_memory_size() } @@ -1253,4 +1286,20 @@ mod tests { assert_eq!(v, expected, "{idx}"); } } + + #[test] + fn test_iterator_nulls() { + let keys = Int32Array::new( + vec![0, 700, 1, 2].into(), + Some(NullBuffer::from(vec![true, false, true, true])), + ); + let values = Int32Array::from(vec![Some(50), None, Some(2)]); + let dict = 
DictionaryArray::new(keys, Arc::new(values)); + let values: Vec<_> = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect(); + assert_eq!(values, &[Some(50), None, None, Some(2)]) + } } diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 6c3abb556ad6..8996fc8da408 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -147,7 +147,7 @@ impl FixedSizeListArray { /// * `size < 0` /// * `values.len() / size != nulls.len()` /// * `values.data_type() != field.data_type()` - /// * `!field.is_nullable() && !nulls.expand(size).contains(values.nulls())` + /// * `!field.is_nullable() && !nulls.expand(size).contains(values.logical_nulls())` pub fn try_new( field: FieldRef, size: i32, @@ -181,11 +181,11 @@ impl FixedSizeListArray { ))); } - if let Some(a) = values.nulls() { + if let Some(a) = values.logical_nulls() { let nulls_valid = field.is_nullable() || nulls .as_ref() - .map(|n| n.expand(size as _).contains(a)) + .map(|n| n.expand(size as _).contains(&a)) .unwrap_or_default(); if !nulls_valid { diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 05628084c844..f5b7ae77c3f9 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -161,7 +161,7 @@ impl GenericListArray { /// /// * `offsets.len() - 1 != nulls.len()` /// * `offsets.last() > values.len()` - /// * `!field.is_nullable() && values.null_count() != 0` + /// * `!field.is_nullable() && values.is_nullable()` /// * `field.data_type() != values.data_type()` pub fn try_new( field: FieldRef, @@ -189,7 +189,7 @@ impl GenericListArray { ))); } } - if !field.is_nullable() && values.null_count() != 0 { + if !field.is_nullable() && values.is_nullable() { return Err(ArrowError::InvalidArgumentError(format!( "Non-nullable field of {}ListArray {:?} cannot contain nulls", OffsetSize::PREFIX, diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 0157279dfe49..79240d105a44 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -173,12 +173,33 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// ``` fn offset(&self) -> usize; - /// Returns the null buffers of this array if any + /// Returns the null buffer of this array if any + /// + /// Note: some arrays can encode their nullability in their children, for example, + /// [`DictionaryArray::values`] values or [`RunArray::values`], or without a null buffer, + /// such as [`NullArray`]. Use [`Array::logical_nulls`] to obtain a computed mask encoding this fn nulls(&self) -> Option<&NullBuffer>; + /// Returns the logical null buffer of this array if any + /// + /// In most cases this will be the same as [`Array::nulls`], except for: + /// + /// * DictionaryArray where [`DictionaryArray::values`] contains nulls + /// * RunArray where [`RunArray::values`] contains nulls + /// * NullArray where all indices are nulls + /// + /// In these cases a logical [`NullBuffer`] will be computed, encoding the logical nullability + /// of these arrays, beyond what is encoded in [`Array::nulls`] + fn logical_nulls(&self) -> Option { + self.nulls().cloned() + } + /// Returns whether the element at `index` is null. /// When using this function on a slice, the index is relative to the slice. /// + /// Note: this method returns the physical nullability, i.e. 
that encoded in [`Array::nulls`] + /// see [`Array::logical_nulls`] for logical nullability + /// /// # Example: /// /// ``` @@ -196,6 +217,9 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Returns whether the element at `index` is not null. /// When using this function on a slice, the index is relative to the slice. /// + /// Note: this method returns the physical nullability, i.e. that encoded in [`Array::nulls`] + /// see [`Array::logical_nulls`] for logical nullability + /// /// # Example: /// /// ``` @@ -210,7 +234,10 @@ pub trait Array: std::fmt::Debug + Send + Sync { !self.is_null(index) } - /// Returns the total number of null values in this array. + /// Returns the total number of physical null values in this array. + /// + /// Note: this method returns the physical null count, i.e. that encoded in [`Array::nulls`], + /// see [`Array::logical_nulls`] for logical nullability /// /// # Example: /// @@ -226,6 +253,19 @@ pub trait Array: std::fmt::Debug + Send + Sync { self.nulls().map(|n| n.null_count()).unwrap_or_default() } + /// Returns `false` if the array is guaranteed to not contain any logical nulls + /// + /// In general this will be equivalent to `Array::null_count() != 0` but may differ in the + /// presence of logical nullability, see [`Array::logical_nulls`]. + /// + /// Implementations will return `true` unless they can cheaply prove no logical nulls + /// are present. For example a [`DictionaryArray`] with nullable values will still return true, + /// even if the nulls present in [`DictionaryArray::values`] are not referenced by any key, + /// and therefore would not appear in [`Array::logical_nulls`]. + fn is_nullable(&self) -> bool { + self.null_count() != 0 + } + /// Returns the total number of bytes of memory pointed to by this array. /// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map. fn get_buffer_memory_size(&self) -> usize; @@ -277,6 +317,10 @@ impl Array for ArrayRef { self.as_ref().nulls() } + fn logical_nulls(&self) -> Option { + self.as_ref().logical_nulls() + } + fn is_null(&self, index: usize) -> bool { self.as_ref().is_null(index) } @@ -289,6 +333,10 @@ impl Array for ArrayRef { self.as_ref().null_count() } + fn is_nullable(&self) -> bool { + self.as_ref().is_nullable() + } + fn get_buffer_memory_size(&self) -> usize { self.as_ref().get_buffer_memory_size() } @@ -335,6 +383,10 @@ impl<'a, T: Array> Array for &'a T { T::nulls(self) } + fn logical_nulls(&self) -> Option { + T::logical_nulls(self) + } + fn is_null(&self, index: usize) -> bool { T::is_null(self, index) } @@ -347,6 +399,10 @@ impl<'a, T: Array> Array for &'a T { T::null_count(self) } + fn is_nullable(&self) -> bool { + T::is_nullable(self) + } + fn get_buffer_memory_size(&self) -> usize { T::get_buffer_memory_size(self) } diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index c054c890431b..af3ec0b57d27 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -36,8 +36,10 @@ use std::sync::Arc; /// /// let array = NullArray::new(10); /// +/// assert!(array.is_nullable()); /// assert_eq!(array.len(), 10); -/// assert_eq!(array.null_count(), 10); +/// assert_eq!(array.null_count(), 0); +/// assert_eq!(array.logical_nulls().unwrap().null_count(), 10); /// ``` #[derive(Clone)] pub struct NullArray { @@ -107,22 +109,12 @@ impl Array for NullArray { None } - /// Returns whether the element at `index` is null. - /// All elements of a `NullArray` are always null. 
- fn is_null(&self, _index: usize) -> bool { - true + fn logical_nulls(&self) -> Option { + (self.len != 0).then(|| NullBuffer::new_null(self.len)) } - /// Returns whether the element at `index` is valid. - /// All elements of a `NullArray` are always invalid. - fn is_valid(&self, _index: usize) -> bool { - false - } - - /// Returns the total number of null values in this array. - /// The null count of a `NullArray` always equals its length. - fn null_count(&self) -> usize { - self.len() + fn is_nullable(&self) -> bool { + !self.is_empty() } fn get_buffer_memory_size(&self) -> usize { @@ -176,8 +168,10 @@ mod tests { let null_arr = NullArray::new(32); assert_eq!(null_arr.len(), 32); - assert_eq!(null_arr.null_count(), 32); - assert!(!null_arr.is_valid(0)); + assert_eq!(null_arr.null_count(), 0); + assert_eq!(null_arr.logical_nulls().unwrap().null_count(), 32); + assert!(null_arr.is_valid(0)); + assert!(null_arr.is_nullable()); } #[test] @@ -186,7 +180,10 @@ mod tests { let array2 = array1.slice(8, 16); assert_eq!(array2.len(), 16); - assert_eq!(array2.null_count(), 16); + assert_eq!(array2.null_count(), 0); + assert_eq!(array2.logical_nulls().unwrap().null_count(), 16); + assert!(array2.is_valid(0)); + assert!(array2.is_nullable()); } #[test] diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 820d5c9ebfc1..30cefaeb4d46 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -18,7 +18,7 @@ use std::any::Any; use std::sync::Arc; -use arrow_buffer::{ArrowNativeType, NullBuffer, RunEndBuffer}; +use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, RunEndBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; @@ -349,6 +349,43 @@ impl Array for RunArray { None } + fn logical_nulls(&self) -> Option { + let len = self.len(); + let nulls = self.values.logical_nulls()?; + let mut out = BooleanBufferBuilder::new(len); + let offset = self.run_ends.offset(); + let mut valid_start = 0; + let mut last_end = 0; + for (idx, end) in self.run_ends.values().iter().enumerate() { + let end = end.as_usize(); + if end < offset { + continue; + } + let end = (end - offset).min(len); + if nulls.is_null(idx) { + if valid_start < last_end { + out.append_n(last_end - valid_start, true); + } + out.append_n(end - last_end, false); + valid_start = end; + } + last_end = end; + if end == len { + break; + } + } + if valid_start < len { + out.append_n(len - valid_start, true) + } + // Sanity check + assert_eq!(out.len(), len); + Some(out.finish().into()) + } + + fn is_nullable(&self) -> bool { + !self.is_empty() && self.values.is_nullable() + } + fn get_buffer_memory_size(&self) -> usize { self.run_ends.inner().inner().capacity() + self.values.get_buffer_memory_size() } @@ -569,6 +606,14 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> { self.run_array.nulls() } + fn logical_nulls(&self) -> Option { + self.run_array.logical_nulls() + } + + fn is_nullable(&self) -> bool { + self.run_array.is_nullable() + } + fn get_buffer_memory_size(&self) -> usize { self.run_array.get_buffer_memory_size() } @@ -1041,4 +1086,26 @@ mod tests { ); } } + + #[test] + fn test_logical_nulls() { + let run = Int32Array::from(vec![3, 6, 9, 12]); + let values = Int32Array::from(vec![Some(0), None, Some(1), None]); + let array = RunArray::try_new(&run, &values).unwrap(); + + let expected = vec![ + true, true, true, false, false, false, true, true, true, false, false, false, + ]; + + let n 
= array.logical_nulls().unwrap(); + assert_eq!(n.null_count(), 6); + + let slices = [(0, 12), (0, 2), (2, 5), (3, 0), (3, 3), (3, 4), (4, 8)]; + for (offset, length) in slices { + let a = array.slice(offset, length); + let n = a.logical_nulls().unwrap(); + let n = n.into_iter().collect::>(); + assert_eq!(&n, &expected[offset..offset + length], "{offset} {length}"); + } + } } diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 1a79ebd95f37..284c3b26a946 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -143,15 +143,14 @@ impl StructArray { ))); } - if let Some(a) = a.nulls() { - let nulls_valid = f.is_nullable() - || nulls.as_ref().map(|n| n.contains(a)).unwrap_or_default(); - - if !nulls_valid { - return Err(ArrowError::InvalidArgumentError(format!( - "Found unmasked nulls for non-nullable StructArray field {:?}", - f.name() - ))); + if !f.is_nullable() { + if let Some(a) = a.logical_nulls() { + if !nulls.as_ref().map(|n| n.contains(&a)).unwrap_or_default() { + return Err(ArrowError::InvalidArgumentError(format!( + "Found unmasked nulls for non-nullable StructArray field {:?}", + f.name() + ))); + } } } } @@ -314,7 +313,7 @@ impl TryFrom> for StructArray { .into_iter() .map(|(name, array)| { ( - Field::new(name, array.data_type().clone(), array.nulls().is_some()), + Field::new(name, array.data_type().clone(), array.is_nullable()), array, ) }) diff --git a/arrow-array/src/builder/null_builder.rs b/arrow-array/src/builder/null_builder.rs index 94cb7f5cc281..53a6b103d541 100644 --- a/arrow-array/src/builder/null_builder.rs +++ b/arrow-array/src/builder/null_builder.rs @@ -40,7 +40,7 @@ use std::sync::Arc; /// let arr = b.finish(); /// /// assert_eq!(8, arr.len()); -/// assert_eq!(8, arr.null_count()); +/// assert_eq!(0, arr.null_count()); /// ``` #[derive(Debug)] pub struct NullBuilder { @@ -160,7 +160,8 @@ mod tests { let arr = builder.finish(); assert_eq!(20, arr.len()); assert_eq!(0, arr.offset()); - assert_eq!(20, arr.null_count()); + assert_eq!(0, arr.null_count()); + assert!(arr.is_nullable()); } #[test] @@ -170,10 +171,10 @@ mod tests { builder.append_empty_value(); builder.append_empty_values(3); let mut array = builder.finish_cloned(); - assert_eq!(21, array.null_count()); + assert_eq!(21, array.len()); builder.append_empty_values(5); array = builder.finish(); - assert_eq!(26, array.null_count()); + assert_eq!(26, array.len()); } } diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index 86f5d991288a..a198332ca5b5 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -22,6 +22,7 @@ use crate::array::{ GenericListArray, GenericStringArray, PrimitiveArray, }; use crate::{FixedSizeListArray, MapArray}; +use arrow_buffer::NullBuffer; /// An iterator that returns Some(T) or None, that can be used on any [`ArrayAccessor`] /// @@ -46,6 +47,7 @@ use crate::{FixedSizeListArray, MapArray}; #[derive(Debug)] pub struct ArrayIter { array: T, + logical_nulls: Option, current: usize, current_end: usize, } @@ -54,12 +56,22 @@ impl ArrayIter { /// create a new iterator pub fn new(array: T) -> Self { let len = array.len(); + let logical_nulls = array.logical_nulls(); ArrayIter { array, + logical_nulls, current: 0, current_end: len, } } + + #[inline] + fn is_null(&self, idx: usize) -> bool { + self.logical_nulls + .as_ref() + .map(|x| x.is_null(idx)) + .unwrap_or_default() + } } impl Iterator for ArrayIter { @@ -69,7 +81,7 @@ impl Iterator for ArrayIter { fn next(&mut 
self) -> Option { if self.current == self.current_end { None - } else if self.array.is_null(self.current) { + } else if self.is_null(self.current) { self.current += 1; Some(None) } else { @@ -98,7 +110,7 @@ impl DoubleEndedIterator for ArrayIter { None } else { self.current_end -= 1; - Some(if self.array.is_null(self.current_end) { + Some(if self.is_null(self.current_end) { None } else { // Safety: diff --git a/arrow-buffer/src/builder/boolean.rs b/arrow-buffer/src/builder/boolean.rs index f84cfa79c2dc..f0e7f0f13670 100644 --- a/arrow-buffer/src/builder/boolean.rs +++ b/arrow-buffer/src/builder/boolean.rs @@ -203,6 +203,12 @@ impl BooleanBufferBuilder { ); } + /// Append [`BooleanBuffer`] to this [`BooleanBufferBuilder`] + pub fn append_buffer(&mut self, buffer: &BooleanBuffer) { + let range = buffer.offset()..buffer.offset() + buffer.len(); + self.append_packed_range(range, buffer.values()) + } + /// Returns the packed bits pub fn as_slice(&self) -> &[u8] { self.buffer.as_slice() diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index c7fd082de2e6..a08a7a4fd413 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -7233,9 +7233,8 @@ mod tests { assert_eq!(array.data_type(), &data_type); let cast_array = cast(&array, &DataType::Null).expect("cast failed"); assert_eq!(cast_array.data_type(), &DataType::Null); - for i in 0..4 { - assert!(cast_array.is_null(i)); - } + assert_eq!(cast_array.len(), 4); + assert_eq!(cast_array.logical_nulls().unwrap().null_count(), 4); } #[test] diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index d18b0e36e930..21583fac08ff 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -6352,4 +6352,18 @@ mod tests { .to_string() .contains("Could not convert ToType with to_i128")); } + + #[test] + #[cfg(feature = "dyn_cmp_dict")] + fn test_dictionary_nested_nulls() { + let keys = Int32Array::from(vec![0, 1, 2]); + let v1 = Arc::new(Int32Array::from(vec![Some(0), None, Some(2)])); + let a = DictionaryArray::new(keys.clone(), v1); + let v2 = Arc::new(Int32Array::from(vec![None, Some(0), Some(2)])); + let b = DictionaryArray::new(keys, v2); + + let r = eq_dyn(&a, &b).unwrap(); + assert_eq!(r.null_count(), 2); + assert!(r.is_valid(2)); + } } diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 648a7d7afcca..87858630599f 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -719,19 +719,19 @@ where } } -type LexicographicalCompareItem<'a> = ( - Option<&'a NullBuffer>, // nulls - DynComparator, // comparator - SortOptions, // sort_option +type LexicographicalCompareItem = ( + Option, // nulls + DynComparator, // comparator + SortOptions, // sort_option ); /// A lexicographical comparator that wraps given array data (columns) and can lexicographically compare data /// at given two indices. The lifetime is the same at the data wrapped. -pub struct LexicographicalComparator<'a> { - compare_items: Vec>, +pub struct LexicographicalComparator { + compare_items: Vec, } -impl LexicographicalComparator<'_> { +impl LexicographicalComparator { /// lexicographically compare values at the wrapped columns with given indices. pub fn compare(&self, a_idx: usize, b_idx: usize) -> Ordering { for (nulls, comparator, sort_option) in &self.compare_items { @@ -780,14 +780,14 @@ impl LexicographicalComparator<'_> { /// results with two indices. 
pub fn try_new( columns: &[SortColumn], - ) -> Result, ArrowError> { + ) -> Result { let compare_items = columns .iter() .map(|column| { // flatten and convert build comparators let values = column.values.as_ref(); Ok(( - values.nulls(), + values.logical_nulls(), build_compare(values, values)?, column.options.unwrap_or_default(), )) @@ -4016,4 +4016,30 @@ mod tests { vec![None, None, None, Some(5.1), Some(5.1), Some(3.0), Some(1.2)], ); } + + #[test] + fn test_lexicographic_comparator_null_dict_values() { + let values = Int32Array::new( + vec![1, 2, 3, 4].into(), + Some(NullBuffer::from(vec![true, false, false, true])), + ); + let keys = Int32Array::new( + vec![0, 1, 53, 3].into(), + Some(NullBuffer::from(vec![true, true, false, true])), + ); + // [1, NULL, NULL, 4] + let dict = DictionaryArray::new(keys, Arc::new(values)); + + let comparator = LexicographicalComparator::try_new(&[SortColumn { + values: Arc::new(dict), + options: None, + }]) + .unwrap(); + // 1.cmp(NULL) + assert_eq!(comparator.compare(0, 1), Ordering::Greater); + // NULL.cmp(NULL) + assert_eq!(comparator.compare(2, 1), Ordering::Equal); + // NULL.cmp(4) + assert_eq!(comparator.compare(2, 3), Ordering::Less); + } } diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 1223280e3769..9d3abea66fb1 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -581,7 +581,10 @@ where )); } - let nulls = NullBuffer::union(left.nulls(), right.nulls()); + let nulls = NullBuffer::union( + left.logical_nulls().as_ref(), + right.logical_nulls().as_ref(), + ); let mut result = BooleanBufferBuilder::new(left.len()); for i in 0..left.len() { diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 47b01890301e..48615dc3d599 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -494,7 +494,7 @@ impl LevelInfoBuilder { def_levels.reserve(len); info.non_null_indices.reserve(len); - match array.nulls() { + match array.logical_nulls() { Some(nulls) => { // TODO: Faster bitmask iteration (#1757) for i in range { @@ -1751,7 +1751,6 @@ mod tests { builder.write(&a, 0..4); let levels = builder.finish(); - let list_level = levels.get(0).unwrap(); let expected_level = LevelInfo { def_levels: Some(vec![5, 4, 5, 2, 5, 3, 5, 5, 4, 4, 0]), rep_levels: Some(vec![0, 2, 2, 1, 0, 1, 0, 2, 1, 2, 0]), @@ -1760,6 +1759,35 @@ mod tests { max_rep_level: 2, }; - assert_eq!(list_level, &expected_level); + assert_eq!(levels[0], expected_level); + } + + #[test] + fn test_null_dictionary_values() { + let values = Int32Array::new( + vec![1, 2, 3, 4].into(), + Some(NullBuffer::from(vec![true, false, true, true])), + ); + let keys = Int32Array::new( + vec![1, 54, 2, 0].into(), + Some(NullBuffer::from(vec![true, false, true, true])), + ); + // [NULL, NULL, 3, 0] + let dict = DictionaryArray::new(keys, Arc::new(values)); + + let item_field = Field::new("item", dict.data_type().clone(), true); + + let mut builder = + LevelInfoBuilder::try_new(&item_field, Default::default()).unwrap(); + builder.write(&dict, 0..4); + let levels = builder.finish(); + let expected_level = LevelInfo { + def_levels: Some(vec![0, 0, 1, 1]), + rep_levels: None, + non_null_indices: vec![2, 3], + max_def_level: 1, + max_rep_level: 0, + }; + assert_eq!(levels[0], expected_level); } } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index d3d4e2626fe3..c4d174b6adc1 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ 
b/parquet/src/arrow/arrow_writer/mod.rs @@ -1965,7 +1965,7 @@ mod tests { assert_eq!(a.value(0).len(), 0); assert_eq!(a.value(2).len(), 2); - assert_eq!(a.value(2).null_count(), 2); + assert_eq!(a.value(2).logical_nulls().unwrap().null_count(), 2); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap(); roundtrip(batch, None); From abd80ae014e1927fa60f51a159b17ac3d7500fac Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 15 Aug 2023 00:02:52 -0700 Subject: [PATCH 1158/1411] Support references in i256 arithmetic ops (#4692) * Support references in i256 arithmetic ops * Fix clippy * For review --- arrow-buffer/src/bigint/mod.rs | 77 ++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/arrow-buffer/src/bigint/mod.rs b/arrow-buffer/src/bigint/mod.rs index fe0774539989..d064663bf63a 100644 --- a/arrow-buffer/src/bigint/mod.rs +++ b/arrow-buffer/src/bigint/mod.rs @@ -659,6 +659,30 @@ macro_rules! derive_op { self.$wrapping(rhs) } } + + impl<'a> std::ops::$t for &'a i256 { + type Output = i256; + + fn $op(self, rhs: i256) -> Self::Output { + (*self).$op(rhs) + } + } + + impl<'a> std::ops::$t<&'a i256> for i256 { + type Output = i256; + + fn $op(self, rhs: &'a i256) -> Self::Output { + self.$op(*rhs) + } + } + + impl<'a, 'b> std::ops::$t<&'b i256> for &'a i256 { + type Output = i256; + + fn $op(self, rhs: &'b i256) -> Self::Output { + (*self).$op(*rhs) + } + } }; } @@ -1194,4 +1218,57 @@ mod tests { assert_eq!(i256::from_string(case), expected) } } + + #[allow(clippy::op_ref)] + fn test_reference_op(il: i256, ir: i256) { + let r1 = il + ir; + let r2 = &il + ir; + let r3 = il + &ir; + let r4 = &il + &ir; + assert_eq!(r1, r2); + assert_eq!(r1, r3); + assert_eq!(r1, r4); + + let r1 = il - ir; + let r2 = &il - ir; + let r3 = il - &ir; + let r4 = &il - &ir; + assert_eq!(r1, r2); + assert_eq!(r1, r3); + assert_eq!(r1, r4); + + let r1 = il * ir; + let r2 = &il * ir; + let r3 = il * &ir; + let r4 = &il * &ir; + assert_eq!(r1, r2); + assert_eq!(r1, r3); + assert_eq!(r1, r4); + + let r1 = il / ir; + let r2 = &il / ir; + let r3 = il / &ir; + let r4 = &il / &ir; + assert_eq!(r1, r2); + assert_eq!(r1, r3); + assert_eq!(r1, r4); + } + + #[test] + fn test_i256_reference_op() { + let candidates = [ + i256::ONE, + i256::MINUS_ONE, + i256::from_i128(2), + i256::from_i128(-2), + i256::from_i128(3), + i256::from_i128(-3), + ]; + + for il in candidates { + for ir in candidates { + test_reference_op(il, ir) + } + } + } } From 77fe72ddd40c1d39068ad580b975504d57032060 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 15 Aug 2023 10:51:36 +0100 Subject: [PATCH 1159/1411] Prepare object_store 0.7.0 (#4699) --- object_store/CHANGELOG.md | 45 ++++++++++++++----- object_store/Cargo.toml | 2 +- object_store/dev/release/update_change_log.sh | 4 +- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index fe25e23fb768..125063943726 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,30 +19,51 @@ # Changelog -## [object_store_0.6.1](https://github.com/apache/arrow-rs/tree/object_store_0.6.1) (2023-06-02) +## [object_store_0.7.0](https://github.com/apache/arrow-rs/tree/object_store_0.7.0) (2023-08-15) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.6.0...object_store_0.6.1) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.6.1...object_store_0.7.0) + +**Breaking 
changes:** + +- Add range and ObjectMeta to GetResult \(\#4352\) \(\#4495\) [\#4677](https://github.com/apache/arrow-rs/pull/4677) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Support multipart upload in R2 [\#4304](https://github.com/apache/arrow-rs/issues/4304) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add AzureConfigKey::ContainerName [\#4629](https://github.com/apache/arrow-rs/issues/4629) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: multipart ranges for HTTP [\#4612](https://github.com/apache/arrow-rs/issues/4612) +- Make object\_store::multipart public [\#4569](https://github.com/apache/arrow-rs/issues/4569) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Export `ClientConfigKey` and make the `HttpBuilder` more consistent with other builders [\#4515](https://github.com/apache/arrow-rs/issues/4515) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store/InMemory: Make `clone()` non-async [\#4496](https://github.com/apache/arrow-rs/issues/4496) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add Range to GetResult::File [\#4352](https://github.com/apache/arrow-rs/issues/4352) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support copy\_if\_not\_exists for Cloudflare R2 \(S3 API\) [\#4190](https://github.com/apache/arrow-rs/issues/4190) **Fixed bugs:** -- Default ObjectStore::get\_range Doesn't Apply Range to GetResult::File [\#4350](https://github.com/apache/arrow-rs/issues/4350) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store documentation is broken [\#4683](https://github.com/apache/arrow-rs/issues/4683) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Exports are not sufficient for configuring some object stores, for example minio running locally [\#4530](https://github.com/apache/arrow-rs/issues/4530) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Uploading empty file to S3 results in "411 Length Required" [\#4514](https://github.com/apache/arrow-rs/issues/4514) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- GCP doesn't fetch public objects [\#4417](https://github.com/apache/arrow-rs/issues/4417) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Closed issues:** -- \[object\_store - AmazonS3Builder\] incorrect metadata\_endpoint set in `from_env` in an ECS environment [\#4283](https://github.com/apache/arrow-rs/issues/4283) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] when Create a AmazonS3 instance work with MinIO without set endpoint got error MissingRegion [\#4617](https://github.com/apache/arrow-rs/issues/4617) +- AWS Profile credentials no longer working in object\_store 0.6.1 [\#4556](https://github.com/apache/arrow-rs/issues/4556) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Fix ObjectStore::get\_range for GetResult::File \(\#4350\) [\#4351](https://github.com/apache/arrow-rs/pull/4351) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Don't exclude FIFO files from LocalFileSystem 
[\#4345](https://github.com/apache/arrow-rs/pull/4345) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix support for ECS IAM credentials [\#4310](https://github.com/apache/arrow-rs/pull/4310) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- feat: use exactly equal parts in multipart upload [\#4305](https://github.com/apache/arrow-rs/pull/4305) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) -- Set ECS specific metadata endpoint [\#4288](https://github.com/apache/arrow-rs/pull/4288) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jfuechsl](https://github.com/jfuechsl)) -- Prepare 40.0.0 release [\#4245](https://github.com/apache/arrow-rs/pull/4245) ([tustvold](https://github.com/tustvold)) -- feat: support bulk deletes in object\_store [\#4060](https://github.com/apache/arrow-rs/pull/4060) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([wjones127](https://github.com/wjones127)) +- Add AzureConfigKey::ContainerName \(\#4629\) [\#4686](https://github.com/apache/arrow-rs/pull/4686) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix MSRV CI [\#4671](https://github.com/apache/arrow-rs/pull/4671) ([tustvold](https://github.com/tustvold)) +- Use Config System for Object Store Integration Tests [\#4628](https://github.com/apache/arrow-rs/pull/4628) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Prepare arrow 45 [\#4590](https://github.com/apache/arrow-rs/pull/4590) ([tustvold](https://github.com/tustvold)) +- Add Support for Microsoft Fabric / OneLake [\#4573](https://github.com/apache/arrow-rs/pull/4573) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([vmuddassir-msft](https://github.com/vmuddassir-msft)) +- Cleanup multipart upload trait [\#4572](https://github.com/apache/arrow-rs/pull/4572) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make object\_store::multipart public [\#4570](https://github.com/apache/arrow-rs/pull/4570) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([yjshen](https://github.com/yjshen)) +- Handle empty S3 payloads \(\#4514\) [\#4518](https://github.com/apache/arrow-rs/pull/4518) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: Export `ClientConfigKey` and add `HttpBuilder::with_config` [\#4516](https://github.com/apache/arrow-rs/pull/4516) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([thehabbos007](https://github.com/thehabbos007)) +- object\_store: Implement `ObjectStore` for `Arc` [\#4502](https://github.com/apache/arrow-rs/pull/4502) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- object\_store/InMemory: Add `fork()` fn and deprecate `clone()` fn [\#4499](https://github.com/apache/arrow-rs/pull/4499) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- Bump actions/deploy-pages from 1 to 2 [\#4449](https://github.com/apache/arrow-rs/pull/4449) ([dependabot[bot]](https://github.com/apps/dependabot)) +- gcp: 
Exclude authorization header when bearer empty [\#4418](https://github.com/apache/arrow-rs/pull/4418) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([vrongmeal](https://github.com/vrongmeal)) +- Support copy\_if\_not\_exists for Cloudflare R2 \(\#4190\) [\#4239](https://github.com/apache/arrow-rs/pull/4239) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index eca5a5ce84ed..7ef395acd4c9 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.6.1" +version = "0.7.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index 3e9f8bdba859..48835c715552 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.6.0" -FUTURE_RELEASE="object_store_0.6.1" +SINCE_TAG="object_store_0.6.1" +FUTURE_RELEASE="object_store_0.7.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 197c425285219706f0a8393468c55e2ccd82e6e8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 16 Aug 2023 11:53:46 +0100 Subject: [PATCH 1160/1411] Validate ArrayData Buffer Alignment and Automatically Align IPC buffers (#4255) (#4681) * Automatically align misaligned IPC buffers (#4255) * Update test * Further test fix * Format * Review feedback * More docs --- arrow-array/src/array/list_array.rs | 12 +- arrow-array/src/types.rs | 4 +- arrow-data/src/data.rs | 230 +++++++++++++++++++--------- arrow-ipc/src/reader.rs | 120 ++++++--------- arrow-ipc/src/writer.rs | 2 +- arrow/tests/array_validation.rs | 4 +- 6 files changed, 223 insertions(+), 149 deletions(-) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index f5b7ae77c3f9..3508e4f1c469 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -1037,13 +1037,17 @@ mod tests { #[should_panic( expected = "Memory pointer is not aligned with the specified scalar type" )] + // Different error messages, so skip for now + // https://github.com/apache/arrow-rs/issues/1545 + #[cfg(not(feature = "force_validate"))] fn test_primitive_array_alignment() { let buf = Buffer::from_slice_ref([0_u64]); let buf2 = buf.slice(1); - let array_data = ArrayData::builder(DataType::Int32) - .add_buffer(buf2) - .build() - .unwrap(); + let array_data = unsafe { + ArrayData::builder(DataType::Int32) + .add_buffer(buf2) + .build_unchecked() + }; drop(Int32Array::from(array_data)); } diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 769dbf974b93..d79b32a991ed 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -1494,7 +1494,6 @@ pub type LargeBinaryType = GenericBinaryType; mod tests { use super::*; use arrow_data::{layout, BufferSpec}; - use std::mem::size_of; #[test] fn month_day_nano_should_roundtrip() { @@ -1541,7 +1540,8 @@ mod tests { assert_eq!( spec, &BufferSpec::FixedWidth { - byte_width: size_of::() + byte_width: std::mem::size_of::(), + alignment: 
std::mem::align_of::(), } ); } diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 6ff8a824b2ff..0417e1d357c7 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -20,7 +20,7 @@ use crate::bit_iterator::BitSliceIterator; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; -use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; +use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; use arrow_schema::{ArrowError, DataType, UnionMode}; use std::convert::TryInto; use std::mem; @@ -451,7 +451,7 @@ impl ArrayData { for spec in layout.buffers.iter() { match spec { - BufferSpec::FixedWidth { byte_width } => { + BufferSpec::FixedWidth { byte_width, .. } => { let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| { ArrowError::ComputeError( @@ -699,6 +699,23 @@ impl ArrayData { Self::new_null(data_type, 0) } + /// Verifies that the buffers meet the minimum alignment requirements for the data type + /// + /// Buffers that are not adequately aligned will be copied to a new aligned allocation + /// + /// This can be useful for when interacting with data sent over IPC or FFI, that may + /// not meet the minimum alignment requirements + fn align_buffers(&mut self) { + let layout = layout(&self.data_type); + for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) { + if let BufferSpec::FixedWidth { alignment, .. } = spec { + if buffer.as_ptr().align_offset(*alignment) != 0 { + *buffer = Buffer::from_slice_ref(buffer.as_ref()) + } + } + } + } + /// "cheap" validation of an `ArrayData`. Ensures buffers are /// sufficiently sized to store `len` + `offset` total elements of /// `data_type` and performs other inexpensive consistency checks. @@ -736,10 +753,11 @@ impl ArrayData { self.buffers.iter().zip(layout.buffers.iter()).enumerate() { match spec { - BufferSpec::FixedWidth { byte_width } => { - let min_buffer_size = len_plus_offset - .checked_mul(*byte_width) - .expect("integer overflow computing min buffer size"); + BufferSpec::FixedWidth { + byte_width, + alignment, + } => { + let min_buffer_size = len_plus_offset.saturating_mul(*byte_width); if buffer.len() < min_buffer_size { return Err(ArrowError::InvalidArgumentError(format!( @@ -747,6 +765,14 @@ impl ArrayData { min_buffer_size, i, self.data_type, buffer.len() ))); } + + let align_offset = buffer.as_ptr().align_offset(*alignment); + if align_offset != 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}", + self.data_type, align_offset.min(alignment - align_offset) + ))); + } } BufferSpec::VariableWidth => { // not cheap to validate (need to look at the @@ -1493,7 +1519,8 @@ impl ArrayData { pub fn layout(data_type: &DataType) -> DataTypeLayout { // based on C/C++ implementation in // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc) - use std::mem::size_of; + use arrow_schema::IntervalUnit::*; + match data_type { DataType::Null => DataTypeLayout { buffers: vec![], @@ -1503,44 +1530,52 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { buffers: vec![BufferSpec::BitMap], can_contain_null_mask: true, }, - DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 - | DataType::Timestamp(_, _) - | DataType::Date32 - | 
DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Interval(_) => { - DataTypeLayout::new_fixed_width(data_type.primitive_width().unwrap()) - } - DataType::Duration(_) => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Binary => DataTypeLayout::new_binary(size_of::()), - DataType::FixedSizeBinary(bytes_per_value) => { - let bytes_per_value: usize = (*bytes_per_value) - .try_into() - .expect("negative size for fixed size binary"); - DataTypeLayout::new_fixed_width(bytes_per_value) + DataType::Int8 => DataTypeLayout::new_fixed_width::(), + DataType::Int16 => DataTypeLayout::new_fixed_width::(), + DataType::Int32 => DataTypeLayout::new_fixed_width::(), + DataType::Int64 => DataTypeLayout::new_fixed_width::(), + DataType::UInt8 => DataTypeLayout::new_fixed_width::(), + DataType::UInt16 => DataTypeLayout::new_fixed_width::(), + DataType::UInt32 => DataTypeLayout::new_fixed_width::(), + DataType::UInt64 => DataTypeLayout::new_fixed_width::(), + DataType::Float16 => DataTypeLayout::new_fixed_width::(), + DataType::Float32 => DataTypeLayout::new_fixed_width::(), + DataType::Float64 => DataTypeLayout::new_fixed_width::(), + DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::(), + DataType::Date32 => DataTypeLayout::new_fixed_width::(), + DataType::Date64 => DataTypeLayout::new_fixed_width::(), + DataType::Time32(_) => DataTypeLayout::new_fixed_width::(), + DataType::Time64(_) => DataTypeLayout::new_fixed_width::(), + DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::(), + DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::(), + DataType::Interval(MonthDayNano) => DataTypeLayout::new_fixed_width::(), + DataType::Duration(_) => DataTypeLayout::new_fixed_width::(), + DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::(), + DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::(), + DataType::FixedSizeBinary(size) => { + let spec = BufferSpec::FixedWidth { + byte_width: (*size).try_into().unwrap(), + alignment: mem::align_of::(), + }; + DataTypeLayout { + buffers: vec![spec], + can_contain_null_mask: true, + } } - DataType::LargeBinary => DataTypeLayout::new_binary(size_of::()), - DataType::Utf8 => DataTypeLayout::new_binary(size_of::()), - DataType::LargeUtf8 => DataTypeLayout::new_binary(size_of::()), - DataType::List(_) => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Binary => DataTypeLayout::new_binary::(), + DataType::LargeBinary => DataTypeLayout::new_binary::(), + DataType::Utf8 => DataTypeLayout::new_binary::(), + DataType::LargeUtf8 => DataTypeLayout::new_binary::(), DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data - DataType::LargeList(_) => DataTypeLayout::new_fixed_width(size_of::()), + DataType::List(_) => DataTypeLayout::new_fixed_width::(), + DataType::LargeList(_) => DataTypeLayout::new_fixed_width::(), + DataType::Map(_, _) => DataTypeLayout::new_fixed_width::(), DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data, DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data, DataType::Union(_, mode) => { let type_ids = BufferSpec::FixedWidth { - byte_width: size_of::(), + byte_width: mem::size_of::(), + alignment: mem::align_of::(), }; DataTypeLayout { @@ -1552,7 +1587,8 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { vec![ type_ids, BufferSpec::FixedWidth { - byte_width: size_of::(), + byte_width: mem::size_of::(), + alignment: mem::align_of::(), }, ] } @@ -1561,19 +1597,6 @@ pub fn 
layout(data_type: &DataType) -> DataTypeLayout { } } DataType::Dictionary(key_type, _value_type) => layout(key_type), - DataType::Decimal128(_, _) => { - // Decimals are always some fixed width; The rust implementation - // always uses 16 bytes / size of i128 - DataTypeLayout::new_fixed_width(size_of::()) - } - DataType::Decimal256(_, _) => { - // Decimals are always some fixed width. - DataTypeLayout::new_fixed_width(32) - } - DataType::Map(_, _) => { - // same as ListType - DataTypeLayout::new_fixed_width(size_of::()) - } } } @@ -1589,10 +1612,13 @@ pub struct DataTypeLayout { } impl DataTypeLayout { - /// Describes a basic numeric array where each element has a fixed width - pub fn new_fixed_width(byte_width: usize) -> Self { + /// Describes a basic numeric array where each element has type `T` + pub fn new_fixed_width() -> Self { Self { - buffers: vec![BufferSpec::FixedWidth { byte_width }], + buffers: vec![BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }], can_contain_null_mask: true, } } @@ -1608,14 +1634,15 @@ impl DataTypeLayout { } /// Describes a basic numeric array where each element has a fixed - /// with offset buffer of `offset_byte_width` bytes, followed by a + /// with offset buffer of type `T`, followed by a /// variable width data buffer - pub fn new_binary(offset_byte_width: usize) -> Self { + pub fn new_binary() -> Self { Self { buffers: vec![ // offsets BufferSpec::FixedWidth { - byte_width: offset_byte_width, + byte_width: mem::size_of::(), + alignment: mem::align_of::(), }, // values BufferSpec::VariableWidth, @@ -1628,8 +1655,18 @@ impl DataTypeLayout { /// Layout specification for a single data type buffer #[derive(Debug, PartialEq, Eq)] pub enum BufferSpec { - /// each element has a fixed width - FixedWidth { byte_width: usize }, + /// Each element is a fixed width primitive, with the given `byte_width` and `alignment` + /// + /// `alignment` is the alignment required by Rust for an array of the corresponding primitive, + /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`]. + /// + /// Arrow-rs requires that all buffers are have at least this alignment, to allow for + /// [slice](std::slice) based APIs. We do not require alignment in excess of this to allow + /// for array slicing, and interoperability with `Vec` which in the absence of support + /// for custom allocators, cannot be over-aligned. + /// + /// Note that these alignment requirements will vary between architectures + FixedWidth { byte_width: usize, alignment: usize }, /// Variable width, such as string data for utf8 data VariableWidth, /// Buffer holds a bitmap. @@ -1741,6 +1778,15 @@ impl ArrayDataBuilder { /// apply. 
#[allow(clippy::let_and_return)] pub unsafe fn build_unchecked(self) -> ArrayData { + let data = self.build_impl(); + // Provide a force_validate mode + #[cfg(feature = "force_validate")] + data.validate_data().unwrap(); + data + } + + /// Same as [`Self::build_unchecked`] but ignoring `force_validate` feature flag + unsafe fn build_impl(self) -> ArrayData { let nulls = self.nulls.or_else(|| { let buffer = self.null_bit_buffer?; let buffer = BooleanBuffer::new(buffer, self.offset, self.len); @@ -1750,26 +1796,41 @@ impl ArrayDataBuilder { }) }); - let data = ArrayData { + ArrayData { data_type: self.data_type, len: self.len, offset: self.offset, buffers: self.buffers, child_data: self.child_data, nulls: nulls.filter(|b| b.null_count() != 0), - }; - - // Provide a force_validate mode - #[cfg(feature = "force_validate")] - data.validate_data().unwrap(); - data + } } /// Creates an array data, validating all inputs - #[allow(clippy::let_and_return)] pub fn build(self) -> Result { - let data = unsafe { self.build_unchecked() }; - #[cfg(not(feature = "force_validate"))] + let data = unsafe { self.build_impl() }; + data.validate_data()?; + Ok(data) + } + + /// Creates an array data, validating all inputs, and aligning any buffers + /// + /// Rust requires that arrays are aligned to their corresponding primitive, + /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`]. + /// + /// [`ArrayData`] therefore requires that all buffers are have at least this alignment, + /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`]. + /// + /// As this alignment is architecture specific, and not guaranteed by all arrow implementations, + /// this method is provided to automatically copy buffers to a new correctly aligned allocation + /// when necessary, making it useful when interacting with buffers produced by other systems, + /// e.g. IPC or FFI. + /// + /// This is unlike `[Self::build`] which will instead return an error on encountering + /// insufficiently aligned buffers. + pub fn build_aligned(self) -> Result { + let mut data = unsafe { self.build_impl() }; + data.align_buffers(); data.validate_data()?; Ok(data) } @@ -2057,4 +2118,31 @@ mod tests { assert_eq!(buffers.len(), layout.buffers.len()); } } + + #[test] + fn test_alignment() { + let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]); + let sliced = buffer.slice(1); + + let mut data = ArrayData { + data_type: DataType::Int32, + len: 0, + offset: 0, + buffers: vec![buffer], + child_data: vec![], + nulls: None, + }; + data.validate_full().unwrap(); + + data.buffers[0] = sliced; + let err = data.validate().unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1" + ); + + data.align_buffers(); + data.validate_full().unwrap(); + } } diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 0908d580d59a..b7d328977d1c 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -20,7 +20,6 @@ //! The `FileReader` and `StreamReader` have similar interfaces, //! 
however the `FileReader` expects a reader that supports `Seek`ing -use arrow_buffer::i256; use flatbuffers::VectorIter; use std::collections::HashMap; use std::fmt; @@ -129,7 +128,7 @@ fn create_array(reader: &mut ArrayReader, field: &Field) -> Result Result { + _ if data_type.is_primitive() + || matches!(data_type, Boolean | FixedSizeBinary(_)) => + { // read 2 buffers: null buffer (optional) and data buffer ArrayData::builder(data_type.clone()) .len(length) .add_buffer(buffers[1].clone()) .null_bit_buffer(null_buffer) - .build()? - } - Interval(IntervalUnit::MonthDayNano) | Decimal128(_, _) => { - let buffer = get_aligned_buffer::(&buffers[1], length); - - // read 2 buffers: null buffer (optional) and data buffer - ArrayData::builder(data_type.clone()) - .len(length) - .add_buffer(buffer) - .null_bit_buffer(null_buffer) - .build()? - } - Decimal256(_, _) => { - let buffer = get_aligned_buffer::(&buffers[1], length); - - // read 2 buffers: null buffer (optional) and data buffer - ArrayData::builder(data_type.clone()) - .len(length) - .add_buffer(buffer) - .null_bit_buffer(null_buffer) - .build()? + .build_aligned()? } t => unreachable!("Data type {:?} either unsupported or not primitive", t), }; @@ -286,28 +248,10 @@ fn create_primitive_array( Ok(make_array(array_data)) } -/// Checks if given `Buffer` is properly aligned with `T`. -/// If not, copying the data and padded it for alignment. -fn get_aligned_buffer(buffer: &Buffer, length: usize) -> Buffer { - let ptr = buffer.as_ptr(); - let align_req = std::mem::align_of::(); - let align_offset = ptr.align_offset(align_req); - // The buffer is not aligned properly. The writer might use a smaller alignment - // e.g. 8 bytes, but on some platform (e.g. ARM) i128 requires 16 bytes alignment. - // We need to copy the buffer as fallback. 
- if align_offset != 0 { - let len_in_bytes = (length * std::mem::size_of::()).min(buffer.len()); - let slice = &buffer.as_slice()[0..len_in_bytes]; - Buffer::from_slice_ref(slice) - } else { - buffer.clone() - } -} - /// Reads the correct number of buffers based on list type and null_count, and creates a /// list array ref fn create_list_array( - field_node: &crate::FieldNode, + field_node: &FieldNode, data_type: &DataType, buffers: &[Buffer], child_array: ArrayRef, @@ -329,13 +273,13 @@ fn create_list_array( _ => unreachable!("Cannot create list or map array from {:?}", data_type), }; - Ok(make_array(builder.build()?)) + Ok(make_array(builder.build_aligned()?)) } /// Reads the correct number of buffers based on list type and null_count, and creates a /// list array ref fn create_dictionary_array( - field_node: &crate::FieldNode, + field_node: &FieldNode, data_type: &DataType, buffers: &[Buffer], value_array: ArrayRef, @@ -348,7 +292,7 @@ fn create_dictionary_array( .add_child_data(value_array.into_data()) .null_bit_buffer(null_buffer); - Ok(make_array(builder.build()?)) + Ok(make_array(builder.build_aligned()?)) } else { unreachable!("Cannot create dictionary array from {:?}", data_type) } @@ -1097,10 +1041,11 @@ impl RecordBatchReader for StreamReader { #[cfg(test)] mod tests { - use crate::writer::unslice_run_array; + use crate::writer::{unslice_run_array, DictionaryTracker, IpcDataGenerator}; use super::*; + use crate::root_as_message; use arrow_array::builder::{PrimitiveRunBuilder, UnionBuilder}; use arrow_array::types::*; use arrow_buffer::ArrowNativeType; @@ -1357,8 +1302,7 @@ mod tests { writer.finish().unwrap(); drop(writer); - let mut reader = - crate::reader::FileReader::try_new(std::io::Cursor::new(buf), None).unwrap(); + let mut reader = FileReader::try_new(std::io::Cursor::new(buf), None).unwrap(); reader.next().unwrap().unwrap() } @@ -1704,4 +1648,40 @@ mod tests { let output_batch = roundtrip_ipc_stream(&input_batch); assert_eq!(input_batch, output_batch); } + + #[test] + fn test_unaligned() { + let batch = RecordBatch::try_from_iter(vec![( + "i32", + Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as _, + )]) + .unwrap(); + + let gen = IpcDataGenerator {}; + let mut dict_tracker = DictionaryTracker::new(false); + let (_, encoded) = gen + .encoded_batch(&batch, &mut dict_tracker, &Default::default()) + .unwrap(); + + let message = root_as_message(&encoded.ipc_message).unwrap(); + + // Construct an unaligned buffer + let mut buffer = MutableBuffer::with_capacity(encoded.arrow_data.len() + 1); + buffer.push(0_u8); + buffer.extend_from_slice(&encoded.arrow_data); + let b = Buffer::from(buffer).slice(1); + assert_ne!(b.as_ptr().align_offset(8), 0); + + let ipc_batch = message.header_as_record_batch().unwrap(); + let roundtrip = read_record_batch( + &b, + ipc_batch, + batch.schema(), + &Default::default(), + None, + &message.version(), + ) + .unwrap(); + assert_eq!(batch, roundtrip); + } } diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 59657bc4be09..1c56613d8f24 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1146,7 +1146,7 @@ fn buffer_need_truncate( #[inline] fn get_buffer_element_width(spec: &BufferSpec) -> usize { match spec { - BufferSpec::FixedWidth { byte_width } => *byte_width, + BufferSpec::FixedWidth { byte_width, .. 
} => *byte_width, _ => 0, } } diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 0d3652a0473a..fa80db1860cd 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -56,7 +56,9 @@ fn test_bad_number_of_buffers() { } #[test] -#[should_panic(expected = "integer overflow computing min buffer size")] +#[should_panic( + expected = "Need at least 18446744073709551615 bytes in buffers[0] in array of type Int64, but got 8" +)] fn test_fixed_width_overflow() { let buffer = Buffer::from_slice_ref([0i32, 2i32]); ArrayData::try_new(DataType::Int64, usize::MAX, None, 0, vec![buffer], vec![]) From cbff4d818ded578b0e28be5cd89c32085ed07ff4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 16 Aug 2023 13:09:51 +0100 Subject: [PATCH 1161/1411] Update object_store Dependencies and Configure Dependabot (#4700) * Update itertools and quick-xml * Add dependabot --- .github/dependabot.yml | 11 +++++++++-- object_store/Cargo.toml | 4 ++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 9c4cda5d034d..ffde5378da93 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -6,10 +6,17 @@ updates: interval: daily open-pull-requests-limit: 10 target-branch: master - labels: [auto-dependencies] + labels: [ auto-dependencies, arrow ] + - package-ecosystem: cargo + directory: "/object_store" + schedule: + interval: daily + open-pull-requests-limit: 10 + target-branch: master + labels: [ auto-dependencies, object_store ] - package-ecosystem: "github-actions" directory: "/" schedule: interval: "daily" open-pull-requests-limit: 10 - labels: [auto-dependencies] + labels: [ auto-dependencies ] diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 7ef395acd4c9..3c10f4a9c849 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -35,7 +35,7 @@ bytes = "1.0" chrono = { version = "0.4.23", default-features = false, features = ["clock"] } futures = "0.3" humantime = "2.1" -itertools = "0.10.1" +itertools = "0.11.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" snafu = "0.7" @@ -46,7 +46,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.21", default-features = false, features = ["std"], optional = true } hyper = { version = "0.14", default-features = false, optional = true } -quick-xml = { version = "0.28.0", features = ["serialize", "overlapped-lists"], optional = true } +quick-xml = { version = "0.30.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From b581ef51b07f416374955f4b3ebbcd1ff8b1fc48 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 16 Aug 2023 13:10:48 +0100 Subject: [PATCH 1162/1411] Add safe zero-copy converion from bytes::Bytes (#4254) (#4260) --- arrow-buffer/Cargo.toml | 1 + arrow-buffer/src/bytes.rs | 28 ++++++++++++++++++++++++++++ arrow-flight/src/decode.rs | 3 ++- arrow-flight/src/sql/client.rs | 2 +- 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index 1db388db8398..746045cc8dde 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ 
-34,6 +34,7 @@ path = "src/lib.rs" bench = false [dependencies] +bytes = { version = "1.4" } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false } diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs index b3105ed5a3b4..8f5019d5a4cc 100644 --- a/arrow-buffer/src/bytes.rs +++ b/arrow-buffer/src/bytes.rs @@ -148,3 +148,31 @@ impl Debug for Bytes { write!(f, " }}") } } + +impl From for Bytes { + fn from(value: bytes::Bytes) -> Self { + Self { + len: value.len(), + ptr: NonNull::new(value.as_ptr() as _).unwrap(), + deallocation: Deallocation::Custom(std::sync::Arc::new(value)), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_from_bytes() { + let bytes = bytes::Bytes::from(vec![1, 2, 3, 4]); + let arrow_bytes: Bytes = bytes.clone().into(); + + assert_eq!(bytes.as_ptr(), arrow_bytes.as_ptr()); + + drop(bytes); + drop(arrow_bytes); + + let _ = Bytes::from(bytes::Bytes::new()); + } +} diff --git a/arrow-flight/src/decode.rs b/arrow-flight/src/decode.rs index fe132e3e8448..df74923332e3 100644 --- a/arrow-flight/src/decode.rs +++ b/arrow-flight/src/decode.rs @@ -17,6 +17,7 @@ use crate::{utils::flight_data_to_arrow_batch, FlightData}; use arrow_array::{ArrayRef, RecordBatch}; +use arrow_buffer::Buffer; use arrow_schema::{Schema, SchemaRef}; use bytes::Bytes; use futures::{ready, stream::BoxStream, Stream, StreamExt}; @@ -258,7 +259,7 @@ impl FlightDataDecoder { )); }; - let buffer: arrow_buffer::Buffer = data.data_body.into(); + let buffer = Buffer::from_bytes(data.data_body.into()); let dictionary_batch = message.header_as_dictionary_batch().ok_or_else(|| { FlightError::protocol( diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index c9adc2b98b12..d661c9640908 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -538,7 +538,7 @@ pub fn arrow_data_from_flight_data( let dictionaries_by_field = HashMap::new(); let record_batch = read_record_batch( - &Buffer::from(&flight_data.data_body), + &Buffer::from_bytes(flight_data.data_body.into()), ipc_record_batch, arrow_schema_ref.clone(), &dictionaries_by_field, From fc6f528705e0f819ce333e1a2de75202a1965f05 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 16 Aug 2023 17:24:13 +0100 Subject: [PATCH 1163/1411] Remove rank kernels (#4703) --- arrow-ord/src/lib.rs | 1 - arrow-ord/src/rank.rs | 195 ----------------------------------- arrow/benches/sort_kernel.rs | 21 ---- arrow/src/compute/kernels.rs | 2 +- arrow/src/compute/mod.rs | 1 - 5 files changed, 1 insertion(+), 219 deletions(-) delete mode 100644 arrow-ord/src/rank.rs diff --git a/arrow-ord/src/lib.rs b/arrow-ord/src/lib.rs index 8b43cdb0bffb..62338c022384 100644 --- a/arrow-ord/src/lib.rs +++ b/arrow-ord/src/lib.rs @@ -46,5 +46,4 @@ pub mod comparison; pub mod ord; pub mod partition; -pub mod rank; pub mod sort; diff --git a/arrow-ord/src/rank.rs b/arrow-ord/src/rank.rs deleted file mode 100644 index 1e79156a71a3..000000000000 --- a/arrow-ord/src/rank.rs +++ /dev/null @@ -1,195 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow_array::cast::AsArray; -use arrow_array::types::*; -use arrow_array::{downcast_primitive_array, Array, ArrowNativeTypeOp, GenericByteArray}; -use arrow_buffer::NullBuffer; -use arrow_schema::{ArrowError, DataType, SortOptions}; -use std::cmp::Ordering; - -/// Assigns a rank to each value in `array` based on its position in the sorted order -/// -/// Where values are equal, they will be assigned the highest of their ranks, -/// leaving gaps in the overall rank assignment -/// -/// ``` -/// # use arrow_array::StringArray; -/// # use arrow_ord::rank::rank; -/// let array = StringArray::from(vec![Some("foo"), None, Some("foo"), None, Some("bar")]); -/// let ranks = rank(&array, None).unwrap(); -/// assert_eq!(ranks, &[5, 2, 5, 2, 3]); -/// ``` -pub fn rank( - array: &dyn Array, - options: Option, -) -> Result, ArrowError> { - let options = options.unwrap_or_default(); - let ranks = downcast_primitive_array! { - array => primitive_rank(array.values(), array.nulls(), options), - DataType::Utf8 => bytes_rank(array.as_bytes::(), options), - DataType::LargeUtf8 => bytes_rank(array.as_bytes::(), options), - DataType::Binary => bytes_rank(array.as_bytes::(), options), - DataType::LargeBinary => bytes_rank(array.as_bytes::(), options), - d => return Err(ArrowError::ComputeError(format!("{d:?} not supported in rank"))) - }; - Ok(ranks) -} - -#[inline(never)] -fn primitive_rank( - values: &[T], - nulls: Option<&NullBuffer>, - options: SortOptions, -) -> Vec { - let len: u32 = values.len().try_into().unwrap(); - let to_sort = match nulls.filter(|n| n.null_count() > 0) { - Some(n) => n - .valid_indices() - .map(|idx| (values[idx], idx as u32)) - .collect(), - None => values.iter().copied().zip(0..len).collect(), - }; - rank_impl(values.len(), to_sort, options, T::compare, T::is_eq) -} - -#[inline(never)] -fn bytes_rank( - array: &GenericByteArray, - options: SortOptions, -) -> Vec { - let to_sort: Vec<(&[u8], u32)> = match array.nulls().filter(|n| n.null_count() > 0) { - Some(n) => n - .valid_indices() - .map(|idx| (array.value(idx).as_ref(), idx as u32)) - .collect(), - None => (0..array.len()) - .map(|idx| (array.value(idx).as_ref(), idx as u32)) - .collect(), - }; - rank_impl(array.len(), to_sort, options, Ord::cmp, PartialEq::eq) -} - -fn rank_impl( - len: usize, - mut valid: Vec<(T, u32)>, - options: SortOptions, - compare: C, - eq: E, -) -> Vec -where - T: Copy, - C: Fn(T, T) -> Ordering, - E: Fn(T, T) -> bool, -{ - // We can use an unstable sort as we combine equal values later - valid.sort_unstable_by(|a, b| compare(a.0, b.0)); - if options.descending { - valid.reverse(); - } - - let (mut valid_rank, null_rank) = match options.nulls_first { - true => (len as u32, (len - valid.len()) as u32), - false => (valid.len() as u32, len as u32), - }; - - let mut out: Vec<_> = vec![null_rank; len]; - if let Some(v) = valid.last() { - out[v.1 as usize] = valid_rank; - } - - let mut count = 1; // Number of values in rank - for w in valid.windows(2).rev() { - match eq(w[0].0, w[1].0) { - true => { - count += 1; - out[w[0].1 as usize] = valid_rank; - } - false => { - valid_rank -= 
count; - count = 1; - out[w[0].1 as usize] = valid_rank - } - } - } - - out -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::*; - - #[test] - fn test_primitive() { - let descending = SortOptions { - descending: true, - nulls_first: true, - }; - - let nulls_last = SortOptions { - descending: false, - nulls_first: false, - }; - - let nulls_last_descending = SortOptions { - descending: true, - nulls_first: false, - }; - - let a = Int32Array::from(vec![Some(1), Some(1), None, Some(3), Some(3), Some(4)]); - let res = rank(&a, None).unwrap(); - assert_eq!(res, &[3, 3, 1, 5, 5, 6]); - - let res = rank(&a, Some(descending)).unwrap(); - assert_eq!(res, &[6, 6, 1, 4, 4, 2]); - - let res = rank(&a, Some(nulls_last)).unwrap(); - assert_eq!(res, &[2, 2, 6, 4, 4, 5]); - - let res = rank(&a, Some(nulls_last_descending)).unwrap(); - assert_eq!(res, &[5, 5, 6, 3, 3, 1]); - - // Test with non-zero null values - let nulls = NullBuffer::from(vec![true, true, false, true, false, false]); - let a = Int32Array::new(vec![1, 4, 3, 4, 5, 5].into(), Some(nulls)); - let res = rank(&a, None).unwrap(); - assert_eq!(res, &[4, 6, 3, 6, 3, 3]); - } - - #[test] - fn test_bytes() { - let v = vec!["foo", "fo", "bar", "bar"]; - let values = StringArray::from(v.clone()); - let res = rank(&values, None).unwrap(); - assert_eq!(res, &[4, 3, 2, 2]); - - let values = LargeStringArray::from(v.clone()); - let res = rank(&values, None).unwrap(); - assert_eq!(res, &[4, 3, 2, 2]); - - let v: Vec<&[u8]> = vec![&[1, 2], &[0], &[1, 2, 3], &[1, 2]]; - let values = LargeBinaryArray::from(v.clone()); - let res = rank(&values, None).unwrap(); - assert_eq!(res, &[3, 1, 4, 3]); - - let values = BinaryArray::from(v); - let res = rank(&values, None).unwrap(); - assert_eq!(res, &[3, 1, 4, 3]); - } -} diff --git a/arrow/benches/sort_kernel.rs b/arrow/benches/sort_kernel.rs index 63e10e0528ba..dd55076647a5 100644 --- a/arrow/benches/sort_kernel.rs +++ b/arrow/benches/sort_kernel.rs @@ -27,7 +27,6 @@ use arrow::compute::{lexsort, sort, sort_to_indices, SortColumn}; use arrow::datatypes::{Int16Type, Int32Type}; use arrow::util::bench_util::*; use arrow::{array::*, datatypes::Float32Type}; -use arrow_ord::rank::rank; fn create_f32_array(size: usize, with_nulls: bool) -> ArrayRef { let null_density = if with_nulls { 0.5 } else { 0.0 }; @@ -214,26 +213,6 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function("lexsort (f32, f32) nulls 2^12 limit 2^12", |b| { b.iter(|| bench_lexsort(&arr_a, &arr_b, Some(2usize.pow(12)))) }); - - let arr = create_f32_array(2usize.pow(12), false); - c.bench_function("rank f32 2^12", |b| { - b.iter(|| black_box(rank(&arr, None).unwrap())) - }); - - let arr = create_f32_array(2usize.pow(12), true); - c.bench_function("rank f32 nulls 2^12", |b| { - b.iter(|| black_box(rank(&arr, None).unwrap())) - }); - - let arr = create_string_array_with_len::(2usize.pow(12), 0.0, 10); - c.bench_function("rank string[10] 2^12", |b| { - b.iter(|| black_box(rank(&arr, None).unwrap())) - }); - - let arr = create_string_array_with_len::(2usize.pow(12), 0.5, 10); - c.bench_function("rank string[10] nulls 2^12", |b| { - b.iter(|| black_box(rank(&arr, None).unwrap())) - }); } criterion_group!(benches, add_benchmark); diff --git a/arrow/src/compute/kernels.rs b/arrow/src/compute/kernels.rs index faff1b8a0ddf..1a79aef547d3 100644 --- a/arrow/src/compute/kernels.rs +++ b/arrow/src/compute/kernels.rs @@ -22,7 +22,7 @@ pub use arrow_arith::{ }; pub use arrow_cast::cast; pub use arrow_cast::parse as cast_utils; -pub use 
arrow_ord::{partition, rank, sort}; +pub use arrow_ord::{partition, sort}; pub use arrow_select::{concat, filter, interleave, nullif, take, window, zip}; pub use arrow_string::{concat_elements, length, regexp, substring}; diff --git a/arrow/src/compute/mod.rs b/arrow/src/compute/mod.rs index 47a9d149aadb..7cfe787b08cf 100644 --- a/arrow/src/compute/mod.rs +++ b/arrow/src/compute/mod.rs @@ -30,7 +30,6 @@ pub use self::kernels::filter::*; pub use self::kernels::interleave::*; pub use self::kernels::nullif::*; pub use self::kernels::partition::*; -pub use self::kernels::rank::*; pub use self::kernels::regexp::*; pub use self::kernels::sort::*; pub use self::kernels::take::*; From d8381943ccf74256e36e522fe2c7ae9357888117 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 17 Aug 2023 09:28:30 +0100 Subject: [PATCH 1164/1411] Improve ergonomics of Scalar (#4704) * Improve ergonomics of Scalar * Add BooleanArray::new_scalar --- arrow-array/src/array/boolean_array.rs | 11 ++++++++++- arrow-array/src/array/byte_array.rs | 7 ++++++- arrow-array/src/array/primitive_array.rs | 11 ++++++++++- arrow-array/src/scalar.rs | 15 ++++++++------- 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 995bb7d510d9..0d9a1044be8e 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -18,7 +18,7 @@ use crate::array::print_long_array; use crate::builder::BooleanBuilder; use crate::iterator::BooleanIter; -use crate::{Array, ArrayAccessor, ArrayRef}; +use crate::{Array, ArrayAccessor, ArrayRef, Scalar}; use arrow_buffer::{bit_util, BooleanBuffer, MutableBuffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; @@ -101,6 +101,15 @@ impl BooleanArray { } } + /// Create a new [`Scalar`] from `value` + pub fn new_scalar(value: bool) -> Scalar { + let values = match value { + true => BooleanBuffer::new_set(1), + false => BooleanBuffer::new_unset(1), + }; + Scalar::new(Self::new(values, None)) + } + /// Returns the length of this array. 
pub fn len(&self) -> usize { self.values.len() diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index f694aa32e507..37d8de931e99 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -20,7 +20,7 @@ use crate::builder::GenericByteBuilder; use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; use crate::types::ByteArrayType; -use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait}; +use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait, Scalar}; use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; use arrow_buffer::{NullBuffer, OffsetBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; @@ -182,6 +182,11 @@ impl GenericByteArray { } } + /// Create a new [`Scalar`] from `v` + pub fn new_scalar(value: impl AsRef) -> Scalar { + Scalar::new(Self::from_iter_values(std::iter::once(value))) + } + /// Creates a [`GenericByteArray`] based on an iterator of values without nulls pub fn from_iter_values(iter: I) -> Self where diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 8337326370dd..0c32279640b2 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -24,7 +24,7 @@ use crate::temporal_conversions::{ use crate::timezone::Tz; use crate::trusted_len::trusted_len_unzip; use crate::types::*; -use crate::{Array, ArrayAccessor, ArrayRef}; +use crate::{Array, ArrayAccessor, ArrayRef, Scalar}; use arrow_buffer::{i256, ArrowNativeType, Buffer, NullBuffer, ScalarBuffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_data::{ArrayData, ArrayDataBuilder}; @@ -553,6 +553,15 @@ impl PrimitiveArray { }) } + /// Create a new [`Scalar`] from `value` + pub fn new_scalar(value: T::Native) -> Scalar { + Scalar::new(Self { + data_type: T::DATA_TYPE, + values: vec![value].into(), + nulls: None, + }) + } + /// Deconstruct this array into its constituent parts pub fn into_parts(self) -> (DataType, ScalarBuffer, Option) { (self.data_type, self.values, self.nulls) diff --git a/arrow-array/src/scalar.rs b/arrow-array/src/scalar.rs index c142107c5cf3..7dfdbddd964a 100644 --- a/arrow-array/src/scalar.rs +++ b/arrow-array/src/scalar.rs @@ -71,8 +71,8 @@ use crate::Array; /// /// // Comparison of an array and a scalar /// let a = Int32Array::from(vec![1, 2, 3, 4, 5]); -/// let b = Int32Array::from(vec![1]); -/// let r = eq(&a, &Scalar::new(&b)).unwrap(); +/// let b = Int32Array::new_scalar(1); +/// let r = eq(&a, &b).unwrap(); /// let values: Vec<_> = r.values().iter().collect(); /// assert_eq!(values, &[true, false, false, false, false]); pub trait Datum { @@ -101,22 +101,23 @@ impl Datum for &dyn Array { /// A wrapper around a single value [`Array`] indicating kernels should treat it as a scalar value /// /// See [`Datum`] for more information -pub struct Scalar<'a>(&'a dyn Array); +#[derive(Debug, Copy, Clone)] +pub struct Scalar(T); -impl<'a> Scalar<'a> { +impl Scalar { /// Create a new [`Scalar`] from an [`Array`] /// /// # Panics /// /// Panics if `array.len() != 1` - pub fn new(array: &'a dyn Array) -> Self { + pub fn new(array: T) -> Self { assert_eq!(array.len(), 1); Self(array) } } -impl<'a> Datum for Scalar<'a> { +impl Datum for Scalar { fn get(&self) -> (&dyn Array, bool) { - (self.0, true) + (&self.0, true) } } From 44b644d47ccb2172de54f4dc729caae487f7851d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 17 Aug 2023 
10:22:23 +0100 Subject: [PATCH 1165/1411] Cleanup parquet type builders (#4706) * Cleanup parquet type builders * Update parquet-derive --- parquet/src/arrow/arrow_reader/mod.rs | 8 +- parquet/src/arrow/schema/mod.rs | 24 +++--- parquet/src/file/footer.rs | 4 +- parquet/src/file/metadata.rs | 2 +- parquet/src/file/writer.rs | 16 ++-- parquet/src/record/reader.rs | 2 +- parquet/src/schema/mod.rs | 2 +- parquet/src/schema/parser.rs | 40 +++++----- parquet/src/schema/printer.rs | 36 ++++----- parquet/src/schema/types.rs | 105 ++++++++++++-------------- parquet_derive/src/lib.rs | 2 +- 11 files changed, 114 insertions(+), 127 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 7e4423b86423..f7cecabb01d8 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -1669,7 +1669,7 @@ mod tests { _ => -1, }; - let mut fields = vec![Arc::new( + let fields = vec![Arc::new( Type::primitive_type_builder("leaf", T::get_physical_type()) .with_repetition(repetition) .with_converted_type(converted_type) @@ -1680,7 +1680,7 @@ mod tests { let schema = Arc::new( Type::group_type_builder("test_schema") - .with_fields(&mut fields) + .with_fields(fields) .build() .unwrap(), ); @@ -2026,7 +2026,7 @@ mod tests { #[test] fn test_dictionary_preservation() { - let mut fields = vec![Arc::new( + let fields = vec![Arc::new( Type::primitive_type_builder("leaf", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::OPTIONAL) .with_converted_type(ConvertedType::UTF8) @@ -2036,7 +2036,7 @@ mod tests { let schema = Arc::new( Type::group_type_builder("test_schema") - .with_fields(&mut fields) + .with_fields(fields) .build() .unwrap(), ); diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index cd6e8046cc63..bcfc2f884cac 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -37,7 +37,7 @@ use crate::basic::{ }; use crate::errors::{ParquetError, Result}; use crate::file::{metadata::KeyValue, properties::WriterProperties}; -use crate::schema::types::{ColumnDescriptor, SchemaDescriptor, Type, TypePtr}; +use crate::schema::types::{ColumnDescriptor, SchemaDescriptor, Type}; mod complex; mod primitive; @@ -230,13 +230,13 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata( /// Convert arrow schema to parquet schema pub fn arrow_to_parquet_schema(schema: &Schema) -> Result { - let fields: Result> = schema + let fields = schema .fields() .iter() .map(|field| arrow_to_parquet_type(field).map(Arc::new)) - .collect(); + .collect::>()?; let group = Type::group_type_builder("arrow_schema") - .with_fields(&mut fields?) + .with_fields(fields) .build()?; Ok(SchemaDescriptor::new(Arc::new(group))) } @@ -476,9 +476,9 @@ fn arrow_to_parquet_type(field: &Field) -> Result { } DataType::List(f) | DataType::FixedSizeList(f, _) | DataType::LargeList(f) => { Type::group_type_builder(name) - .with_fields(&mut vec![Arc::new( + .with_fields(vec![Arc::new( Type::group_type_builder("list") - .with_fields(&mut vec![Arc::new(arrow_to_parquet_type(f)?)]) + .with_fields(vec![Arc::new(arrow_to_parquet_type(f)?)]) .with_repetition(Repetition::REPEATED) .build()?, )]) @@ -493,21 +493,21 @@ fn arrow_to_parquet_type(field: &Field) -> Result { ); } // recursively convert children to types/nodes - let fields: Result> = fields + let fields = fields .iter() .map(|f| arrow_to_parquet_type(f).map(Arc::new)) - .collect(); + .collect::>()?; Type::group_type_builder(name) - .with_fields(&mut fields?) 
+ .with_fields(fields) .with_repetition(repetition) .build() } DataType::Map(field, _) => { if let DataType::Struct(struct_fields) = field.data_type() { Type::group_type_builder(name) - .with_fields(&mut vec![Arc::new( + .with_fields(vec![Arc::new( Type::group_type_builder(field.name()) - .with_fields(&mut vec![ + .with_fields(vec![ Arc::new(arrow_to_parquet_type(&Field::new( struct_fields[0].name(), struct_fields[0].data_type().clone(), @@ -534,7 +534,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { DataType::Union(_, _) => unimplemented!("See ARROW-8817."), DataType::Dictionary(_, ref value) => { // Dictionary encoding not handled at the schema level - let dict_field = Field::new(name, *value.clone(), field.is_nullable()); + let dict_field = field.clone().with_data_type(value.as_ref().clone()); arrow_to_parquet_type(&dict_field) } DataType::RunEndEncoded(_, _) => Err(arrow_err!( diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index f4fb2534c220..21de63e0c234 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -184,7 +184,7 @@ mod tests { #[test] fn test_metadata_column_orders_parse() { // Define simple schema, we do not need to provide logical types. - let mut fields = vec![ + let fields = vec![ Arc::new( SchemaType::primitive_type_builder("col1", Type::INT32) .build() @@ -197,7 +197,7 @@ mod tests { ), ]; let schema = SchemaType::group_type_builder("schema") - .with_fields(&mut fields) + .with_fields(fields) .build() .unwrap(); let schema_descr = SchemaDescriptor::new(Arc::new(schema)); diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index a5e2de6b0667..aaa3d28e206a 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -1112,7 +1112,7 @@ mod tests { /// Returns sample schema descriptor so we can create column metadata. 
fn get_test_schema_descr() -> SchemaDescPtr { let schema = SchemaType::group_type_builder("schema") - .with_fields(&mut vec![ + .with_fields(vec![ Arc::new( SchemaType::primitive_type_builder("a", Type::INT32) .build() diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index c31b9dc47426..cafb1761352d 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -775,7 +775,7 @@ mod tests { let file = tempfile::tempfile().unwrap(); let schema = Arc::new( types::Type::group_type_builder("schema") - .with_fields(&mut vec![Arc::new( + .with_fields(vec![Arc::new( types::Type::primitive_type_builder("col1", Type::INT32) .build() .unwrap(), @@ -801,7 +801,7 @@ mod tests { let file = tempfile::tempfile().unwrap(); let schema = Arc::new( types::Type::group_type_builder("schema") - .with_fields(&mut vec![ + .with_fields(vec![ Arc::new( types::Type::primitive_type_builder("col1", Type::INT32) .with_repetition(Repetition::REQUIRED) @@ -848,7 +848,7 @@ mod tests { let schema = Arc::new( types::Type::group_type_builder("schema") - .with_fields(&mut vec![Arc::new( + .with_fields(vec![Arc::new( types::Type::primitive_type_builder("col1", Type::INT32) .build() .unwrap(), @@ -871,7 +871,7 @@ mod tests { let schema = Arc::new( types::Type::group_type_builder("schema") - .with_fields(&mut vec![Arc::new( + .with_fields(vec![Arc::new( types::Type::primitive_type_builder("col1", Type::INT32) .build() .unwrap(), @@ -920,7 +920,7 @@ mod tests { ); let schema = Arc::new( types::Type::group_type_builder("schema") - .with_fields(&mut vec![field.clone()]) + .with_fields(vec![field.clone()]) .build() .unwrap(), ); @@ -963,7 +963,7 @@ mod tests { let schema = Arc::new( types::Type::group_type_builder("schema") - .with_fields(&mut vec![ + .with_fields(vec![ Arc::new( types::Type::primitive_type_builder("col1", Type::INT32) .build() @@ -1310,7 +1310,7 @@ mod tests { { let schema = Arc::new( types::Type::group_type_builder("schema") - .with_fields(&mut vec![Arc::new( + .with_fields(vec![Arc::new( types::Type::primitive_type_builder("col1", D::get_physical_type()) .with_repetition(Repetition::REQUIRED) .build() @@ -1468,7 +1468,7 @@ mod tests { ) { let schema = Arc::new( types::Type::group_type_builder("schema") - .with_fields(&mut vec![Arc::new( + .with_fields(vec![Arc::new( types::Type::primitive_type_builder("col1", Type::INT32) .with_repetition(Repetition::REQUIRED) .build() diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index 780e9822488d..3416386c9797 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -274,7 +274,7 @@ impl TreeBuilder { let required_field = Type::group_type_builder(field.name()) .with_repetition(Repetition::REQUIRED) .with_converted_type(field.get_basic_info().converted_type()) - .with_fields(&mut Vec::from(field.get_fields())) + .with_fields(field.get_fields().to_vec()) .build()?; path.pop(); diff --git a/parquet/src/schema/mod.rs b/parquet/src/schema/mod.rs index 1ebee2e06e83..ead7f1d2c0f8 100644 --- a/parquet/src/schema/mod.rs +++ b/parquet/src/schema/mod.rs @@ -45,7 +45,7 @@ //! .unwrap(); //! //! let schema = Type::group_type_builder("schema") -//! .with_fields(&mut vec![Arc::new(field_a), Arc::new(field_b)]) +//! .with_fields(vec![Arc::new(field_a), Arc::new(field_b)]) //! .build() //! .unwrap(); //! 
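After this change the field vector is handed to the builder by value, so downstream construction code no longer threads a `&mut Vec` around. A rough usage sketch, assuming the crate's public `parquet::schema::types::Type` builder and `parquet::basic` enums:

    use std::sync::Arc;
    use parquet::basic::{Repetition, Type as PhysicalType};
    use parquet::schema::types::Type;

    // The Vec is moved into the builder; it no longer needs to be mutable
    // or to outlive the call.
    let fields = vec![Arc::new(
        Type::primitive_type_builder("c1", PhysicalType::INT32)
            .with_repetition(Repetition::REQUIRED)
            .build()
            .unwrap(),
    )];
    let schema = Type::group_type_builder("schema")
        .with_fields(fields)
        .build()
        .unwrap();
    assert_eq!(schema.get_fields().len(), 1);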
diff --git a/parquet/src/schema/parser.rs b/parquet/src/schema/parser.rs index c09f13603d29..9af0f328a74a 100644 --- a/parquet/src/schema/parser.rs +++ b/parquet/src/schema/parser.rs @@ -205,9 +205,8 @@ impl<'a> Parser<'a> { .tokenizer .next() .ok_or_else(|| general_err!("Expected name, found None"))?; - let mut fields = self.parse_child_types()?; Type::group_type_builder(name) - .with_fields(&mut fields) + .with_fields(self.parse_child_types()?) .build() } _ => Err(general_err!("Message type does not start with 'message'")), @@ -290,17 +289,14 @@ impl<'a> Parser<'a> { None }; - let mut fields = self.parse_child_types()?; let mut builder = Type::group_type_builder(name) .with_logical_type(logical_type) .with_converted_type(converted_type) - .with_fields(&mut fields); + .with_fields(self.parse_child_types()?) + .with_id(id); if let Some(rep) = repetition { builder = builder.with_repetition(rep); } - if let Some(id) = id { - builder = builder.with_id(id); - } builder.build() } @@ -516,17 +512,15 @@ impl<'a> Parser<'a> { }; assert_token(self.tokenizer.next(), ";")?; - let mut builder = Type::primitive_type_builder(name, physical_type) + Type::primitive_type_builder(name, physical_type) .with_repetition(repetition) .with_logical_type(logical_type) .with_converted_type(converted_type) .with_length(length) .with_precision(precision) - .with_scale(scale); - if let Some(id) = id { - builder = builder.with_id(id); - } - builder.build() + .with_scale(scale) + .with_id(id) + .build() } } @@ -845,7 +839,7 @@ mod tests { let message = parse(schema).unwrap(); let expected = Type::group_type_builder("root") - .with_fields(&mut vec![ + .with_fields(vec![ Arc::new( Type::primitive_type_builder( "f1", @@ -906,16 +900,16 @@ mod tests { let message = parse(schema).unwrap(); let expected = Type::group_type_builder("root") - .with_fields(&mut vec![Arc::new( + .with_fields(vec![Arc::new( Type::group_type_builder("a0") .with_repetition(Repetition::REQUIRED) - .with_fields(&mut vec![ + .with_fields(vec![ Arc::new( Type::group_type_builder("a1") .with_repetition(Repetition::OPTIONAL) .with_logical_type(Some(LogicalType::List)) .with_converted_type(ConvertedType::LIST) - .with_fields(&mut vec![Arc::new( + .with_fields(vec![Arc::new( Type::primitive_type_builder( "a2", PhysicalType::BYTE_ARRAY, @@ -933,10 +927,10 @@ mod tests { .with_repetition(Repetition::OPTIONAL) .with_logical_type(Some(LogicalType::List)) .with_converted_type(ConvertedType::LIST) - .with_fields(&mut vec![Arc::new( + .with_fields(vec![Arc::new( Type::group_type_builder("b2") .with_repetition(Repetition::REPEATED) - .with_fields(&mut vec![ + .with_fields(vec![ Arc::new( Type::primitive_type_builder( "b3", @@ -984,7 +978,7 @@ mod tests { "; let message = parse(schema).unwrap(); - let mut fields = vec![ + let fields = vec![ Arc::new( Type::primitive_type_builder("_1", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) @@ -1027,7 +1021,7 @@ mod tests { ]; let expected = Type::group_type_builder("root") - .with_fields(&mut fields) + .with_fields(fields) .build() .unwrap(); assert_eq!(message, expected); @@ -1051,7 +1045,7 @@ mod tests { "; let message = parse(schema).unwrap(); - let mut fields = vec![ + let fields = vec![ Arc::new( Type::primitive_type_builder("_1", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) @@ -1135,7 +1129,7 @@ mod tests { ]; let expected = Type::group_type_builder("root") - .with_fields(&mut fields) + .with_fields(fields) .build() .unwrap(); assert_eq!(message, expected); diff --git 
a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index ad4acb0cb8b1..12624513ac6a 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -695,40 +695,40 @@ mod tests { let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) .with_converted_type(ConvertedType::INT_32) - .with_id(0) + .with_id(Some(0)) .build(); let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) .with_converted_type(ConvertedType::UTF8) - .with_id(1) + .with_id(Some(1)) .build(); let f3 = Type::primitive_type_builder("f3", PhysicalType::BYTE_ARRAY) .with_logical_type(Some(LogicalType::String)) - .with_id(1) + .with_id(Some(1)) .build(); let f4 = Type::primitive_type_builder("f4", PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(Repetition::REPEATED) .with_converted_type(ConvertedType::INTERVAL) .with_length(12) - .with_id(2) + .with_id(Some(2)) .build(); - let mut struct_fields = vec![ + let struct_fields = vec![ Arc::new(f1.unwrap()), Arc::new(f2.unwrap()), Arc::new(f3.unwrap()), ]; let field = Type::group_type_builder("field") .with_repetition(Repetition::OPTIONAL) - .with_fields(&mut struct_fields) - .with_id(1) + .with_fields(struct_fields) + .with_id(Some(1)) .build() .unwrap(); - let mut fields = vec![Arc::new(field), Arc::new(f4.unwrap())]; + let fields = vec![Arc::new(field), Arc::new(f4.unwrap())]; let message = Type::group_type_builder("schema") - .with_fields(&mut fields) - .with_id(2) + .with_fields(fields) + .with_id(Some(2)) .build() .unwrap(); p.print(&message); @@ -756,7 +756,7 @@ mod tests { .with_repetition(Repetition::OPTIONAL) .with_logical_type(Some(LogicalType::List)) .with_converted_type(ConvertedType::LIST) - .with_fields(&mut vec![Arc::new(a2)]) + .with_fields(vec![Arc::new(a2)]) .build() .unwrap(); @@ -773,7 +773,7 @@ mod tests { let b2 = Type::group_type_builder("b2") .with_repetition(Repetition::REPEATED) .with_converted_type(ConvertedType::NONE) - .with_fields(&mut vec![Arc::new(b3), Arc::new(b4)]) + .with_fields(vec![Arc::new(b3), Arc::new(b4)]) .build() .unwrap(); @@ -781,18 +781,18 @@ mod tests { .with_repetition(Repetition::OPTIONAL) .with_logical_type(Some(LogicalType::List)) .with_converted_type(ConvertedType::LIST) - .with_fields(&mut vec![Arc::new(b2)]) + .with_fields(vec![Arc::new(b2)]) .build() .unwrap(); let a0 = Type::group_type_builder("a0") .with_repetition(Repetition::REQUIRED) - .with_fields(&mut vec![Arc::new(a1), Arc::new(b1)]) + .with_fields(vec![Arc::new(a1), Arc::new(b1)]) .build() .unwrap(); let message = Type::group_type_builder("root") - .with_fields(&mut vec![Arc::new(a0)]) + .with_fields(vec![Arc::new(a0)]) .build() .unwrap(); @@ -815,7 +815,7 @@ mod tests { let field = Type::group_type_builder("field") .with_repetition(Repetition::OPTIONAL) - .with_fields(&mut vec![Arc::new(f1), Arc::new(f2)]) + .with_fields(vec![Arc::new(f1), Arc::new(f2)]) .build() .unwrap(); @@ -827,7 +827,7 @@ mod tests { .unwrap(); let message = Type::group_type_builder("schema") - .with_fields(&mut vec![Arc::new(field), Arc::new(f3)]) + .with_fields(vec![Arc::new(field), Arc::new(f3)]) .build() .unwrap(); @@ -861,7 +861,7 @@ mod tests { .unwrap(); let message = Type::group_type_builder("schema") - .with_fields(&mut vec![Arc::new(f1), Arc::new(f2)]) + .with_fields(vec![Arc::new(f1), Arc::new(f2)]) .build() .unwrap(); diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index fd22cedeacaa..bed85268ff93 100644 --- a/parquet/src/schema/types.rs +++ 
b/parquet/src/schema/types.rs @@ -220,52 +220,51 @@ impl<'a> PrimitiveTypeBuilder<'a> { } /// Sets [`Repetition`](crate::basic::Repetition) for this field and returns itself. - pub fn with_repetition(mut self, repetition: Repetition) -> Self { - self.repetition = repetition; - self + pub fn with_repetition(self, repetition: Repetition) -> Self { + Self { repetition, ..self } } /// Sets [`ConvertedType`](crate::basic::ConvertedType) for this field and returns itself. - pub fn with_converted_type(mut self, converted_type: ConvertedType) -> Self { - self.converted_type = converted_type; - self + pub fn with_converted_type(self, converted_type: ConvertedType) -> Self { + Self { + converted_type, + ..self + } } /// Sets [`LogicalType`](crate::basic::LogicalType) for this field and returns itself. /// If only the logical type is populated for a primitive type, the converted type /// will be automatically populated, and can thus be omitted. - pub fn with_logical_type(mut self, logical_type: Option) -> Self { - self.logical_type = logical_type; - self + pub fn with_logical_type(self, logical_type: Option) -> Self { + Self { + logical_type, + ..self + } } /// Sets type length and returns itself. /// This is only applied to FIXED_LEN_BYTE_ARRAY and INT96 (INTERVAL) types, because /// they maintain fixed size underlying byte array. /// By default, value is `0`. - pub fn with_length(mut self, length: i32) -> Self { - self.length = length; - self + pub fn with_length(self, length: i32) -> Self { + Self { length, ..self } } /// Sets precision for Parquet DECIMAL physical type and returns itself. /// By default, it equals to `0` and used only for decimal context. - pub fn with_precision(mut self, precision: i32) -> Self { - self.precision = precision; - self + pub fn with_precision(self, precision: i32) -> Self { + Self { precision, ..self } } /// Sets scale for Parquet DECIMAL physical type and returns itself. /// By default, it equals to `0` and used only for decimal context. - pub fn with_scale(mut self, scale: i32) -> Self { - self.scale = scale; - self + pub fn with_scale(self, scale: i32) -> Self { + Self { scale, ..self } } /// Sets optional field id and returns itself. - pub fn with_id(mut self, id: i32) -> Self { - self.id = Some(id); - self + pub fn with_id(self, id: Option) -> Self { + Self { id, ..self } } /// Creates a new `PrimitiveType` instance from the collected attributes. @@ -560,28 +559,30 @@ impl<'a> GroupTypeBuilder<'a> { } /// Sets [`ConvertedType`](crate::basic::ConvertedType) for this field and returns itself. - pub fn with_converted_type(mut self, converted_type: ConvertedType) -> Self { - self.converted_type = converted_type; - self + pub fn with_converted_type(self, converted_type: ConvertedType) -> Self { + Self { + converted_type, + ..self + } } /// Sets [`LogicalType`](crate::basic::LogicalType) for this field and returns itself. - pub fn with_logical_type(mut self, logical_type: Option) -> Self { - self.logical_type = logical_type; - self + pub fn with_logical_type(self, logical_type: Option) -> Self { + Self { + logical_type, + ..self + } } /// Sets a list of fields that should be child nodes of this field. /// Returns updated self. - pub fn with_fields(mut self, fields: &mut Vec) -> Self { - self.fields.append(fields); - self + pub fn with_fields(self, fields: Vec) -> Self { + Self { fields, ..self } } /// Sets optional field id and returns itself. 
- pub fn with_id(mut self, id: i32) -> Self { - self.id = Some(id); - self + pub fn with_id(self, id: Option) -> Self { + Self { id, ..self } } /// Creates a new `GroupType` instance from the gathered attributes. @@ -1093,16 +1094,14 @@ fn from_thrift_helper( let scale = elements[index].scale.unwrap_or(-1); let precision = elements[index].precision.unwrap_or(-1); let name = &elements[index].name; - let mut builder = Type::primitive_type_builder(name, physical_type) + let builder = Type::primitive_type_builder(name, physical_type) .with_repetition(repetition) .with_converted_type(converted_type) .with_logical_type(logical_type) .with_length(length) .with_precision(precision) - .with_scale(scale); - if let Some(id) = field_id { - builder = builder.with_id(id); - } + .with_scale(scale) + .with_id(field_id); Ok((index + 1, Arc::new(builder.build()?))) } Some(n) => { @@ -1122,7 +1121,8 @@ fn from_thrift_helper( let mut builder = Type::group_type_builder(&elements[index].name) .with_converted_type(converted_type) .with_logical_type(logical_type) - .with_fields(&mut fields); + .with_fields(fields) + .with_id(field_id); if let Some(rep) = repetition { // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or // REPEATED for root node. @@ -1135,9 +1135,6 @@ fn from_thrift_helper( builder = builder.with_repetition(rep); } } - if let Some(id) = field_id { - builder = builder.with_id(id); - } Ok((next_index, Arc::new(builder.build().unwrap()))) } } @@ -1243,7 +1240,7 @@ mod tests { bit_width: 32, is_signed: true, })) - .with_id(0) + .with_id(Some(0)) .build(); assert!(result.is_ok()); @@ -1525,22 +1522,22 @@ mod tests { fn test_group_type() { let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) .with_converted_type(ConvertedType::INT_32) - .with_id(0) + .with_id(Some(0)) .build(); assert!(f1.is_ok()); let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) .with_converted_type(ConvertedType::UTF8) - .with_id(1) + .with_id(Some(1)) .build(); assert!(f2.is_ok()); - let mut fields = vec![Arc::new(f1.unwrap()), Arc::new(f2.unwrap())]; + let fields = vec![Arc::new(f1.unwrap()), Arc::new(f2.unwrap())]; let result = Type::group_type_builder("foo") .with_repetition(Repetition::REPEATED) .with_logical_type(Some(LogicalType::List)) - .with_fields(&mut fields) - .with_id(1) + .with_fields(fields) + .with_id(Some(1)) .build(); assert!(result.is_ok()); @@ -1630,17 +1627,17 @@ mod tests { let list = Type::group_type_builder("records") .with_repetition(Repetition::REPEATED) .with_converted_type(ConvertedType::LIST) - .with_fields(&mut vec![Arc::new(item1), Arc::new(item2), Arc::new(item3)]) + .with_fields(vec![Arc::new(item1), Arc::new(item2), Arc::new(item3)]) .build()?; let bag = Type::group_type_builder("bag") .with_repetition(Repetition::OPTIONAL) - .with_fields(&mut vec![Arc::new(list)]) + .with_fields(vec![Arc::new(list)]) .build()?; fields.push(Arc::new(bag)); let schema = Type::group_type_builder("schema") .with_repetition(Repetition::REPEATED) - .with_fields(&mut fields) + .with_fields(fields) .build()?; let descr = SchemaDescriptor::new(Arc::new(schema)); @@ -1789,13 +1786,9 @@ mod tests { // function to create a new group type for testing fn test_new_group_type(name: &str, repetition: Repetition, types: Vec) -> Type { - let mut fields = Vec::new(); - for tpe in types { - fields.push(Arc::new(tpe)) - } Type::group_type_builder(name) .with_repetition(repetition) - .with_fields(&mut fields) + .with_fields(types.into_iter().map(Arc::new).collect()) .build() .unwrap() } 
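Since `with_id` now accepts an `Option<i32>`, call sites that previously re-assigned the builder inside an `if let Some(id)` block can chain unconditionally. A hedged sketch of the resulting pattern, under the same assumptions as the sketch above:

    use parquet::basic::{Repetition, Type as PhysicalType};
    use parquet::schema::types::Type;

    // The field id may or may not be known; the Option is threaded straight
    // through the builder instead of branching at the call site.
    let field_id: Option<i32> = Some(7);
    let field = Type::primitive_type_builder("id", PhysicalType::INT64)
        .with_repetition(Repetition::REQUIRED)
        .with_id(field_id)
        .build()
        .unwrap();
    assert!(field.get_basic_info().has_id());
    assert_eq!(field.get_basic_info().id(), 7);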
diff --git a/parquet_derive/src/lib.rs b/parquet_derive/src/lib.rs index 0f875401f0e9..c6641cd8091d 100644 --- a/parquet_derive/src/lib.rs +++ b/parquet_derive/src/lib.rs @@ -130,7 +130,7 @@ pub fn parquet_record_writer(input: proc_macro::TokenStream) -> proc_macro::Toke #field_types );*; let group = ParquetType::group_type_builder("rust_schema") - .with_fields(&mut fields) + .with_fields(fields) .build()?; Ok(group.into()) } From 810291179f65d63a5c49ed6b7881bc5788d85a9e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 17 Aug 2023 10:48:33 +0100 Subject: [PATCH 1166/1411] Take kernel dyn Array (#4705) --- arrow-cast/src/cast.rs | 16 +--- arrow-select/src/take.rs | 153 +++++++++++++++++++++++++++++---------- 2 files changed, 116 insertions(+), 53 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index a08a7a4fd413..23b7a4b5a05d 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -49,7 +49,7 @@ use crate::parse::{ use arrow_array::{ builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *, }; -use arrow_buffer::{i256, ArrowNativeType, Buffer, OffsetBuffer, ScalarBuffer}; +use arrow_buffer::{i256, ArrowNativeType, Buffer, OffsetBuffer}; use arrow_data::ArrayData; use arrow_schema::*; use arrow_select::take::take; @@ -3027,19 +3027,7 @@ where { let dict_array = array.as_dictionary::(); let cast_dict_values = cast_with_options(dict_array.values(), to_type, cast_options)?; - let keys = dict_array.keys(); - match K::DATA_TYPE { - DataType::Int32 => { - // Dictionary guarantees all non-null keys >= 0 - let buffer = ScalarBuffer::new(keys.values().inner().clone(), 0, keys.len()); - let indices = PrimitiveArray::new(buffer, keys.nulls().cloned()); - take::(cast_dict_values.as_ref(), &indices, None) - } - _ => { - let indices = cast_with_options(keys, &DataType::UInt32, cast_options)?; - take::(cast_dict_values.as_ref(), indices.as_primitive(), None) - } - } + take(cast_dict_values.as_ref(), dict_array.keys(), None) } /// Attempts to encode an array into an `ArrayDictionary` with index diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index cee9cbaf84df..70b80e5878dd 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -73,49 +73,65 @@ use num::{One, Zero}; /// /// assert_eq!(*taken, StringArray::from(vec!["two", "one"])); /// ``` -pub fn take( +pub fn take( values: &dyn Array, - indices: &PrimitiveArray, + indices: &dyn Array, options: Option, ) -> Result { - take_impl(values, indices, options) + let options = options.unwrap_or_default(); + macro_rules! helper { + ($t:ty, $values:expr, $indices:expr, $options:expr) => {{ + let indices = indices.as_primitive::<$t>(); + if $options.check_bounds { + check_bounds($values.len(), indices)?; + } + let indices = indices.to_indices(); + take_impl($values, &indices) + }}; + } + downcast_integer! 
{ + indices.data_type() => (helper, values, indices, options), + d => Err(ArrowError::InvalidArgumentError(format!("Take only supported for integers, got {d:?}"))) + } +} + +/// Verifies that the non-null values of `indices` are all `< len` +fn check_bounds( + len: usize, + indices: &PrimitiveArray, +) -> Result<(), ArrowError> { + if indices.null_count() > 0 { + indices.iter().flatten().try_for_each(|index| { + let ix = index.to_usize().ok_or_else(|| { + ArrowError::ComputeError("Cast to usize failed".to_string()) + })?; + if ix >= len { + return Err(ArrowError::ComputeError( + format!("Array index out of bounds, cannot get item at index {ix} from {len} entries")) + ); + } + Ok(()) + }) + } else { + indices.values().iter().try_for_each(|index| { + let ix = index.to_usize().ok_or_else(|| { + ArrowError::ComputeError("Cast to usize failed".to_string()) + })?; + if ix >= len { + return Err(ArrowError::ComputeError( + format!("Array index out of bounds, cannot get item at index {ix} from {len} entries")) + ); + } + Ok(()) + }) + } } +#[inline(never)] fn take_impl( values: &dyn Array, indices: &PrimitiveArray, - options: Option, ) -> Result { - let options = options.unwrap_or_default(); - if options.check_bounds { - let len = values.len(); - if indices.null_count() > 0 { - indices.iter().flatten().try_for_each(|index| { - let ix = index.to_usize().ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - if ix >= len { - return Err(ArrowError::ComputeError( - format!("Array index out of bounds, cannot get item at index {ix} from {len} entries")) - ); - } - Ok(()) - })?; - } else { - indices.values().iter().try_for_each(|index| { - let ix = index.to_usize().ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - if ix >= len { - return Err(ArrowError::ComputeError( - format!("Array index out of bounds, cannot get item at index {ix} from {len} entries")) - ); - } - Ok(()) - })? - } - } - downcast_primitive_array! 
{ values => Ok(Arc::new(take_primitive(values, indices)?)), DataType::Boolean => { @@ -156,7 +172,7 @@ fn take_impl( let arrays = array .columns() .iter() - .map(|a| take_impl(a.as_ref(), indices, Some(options.clone()))) + .map(|a| take_impl(a.as_ref(), indices)) .collect::, _>>()?; let fields: Vec<(FieldRef, ArrayRef)> = fields.iter().cloned().zip(arrays).collect(); @@ -423,7 +439,7 @@ where let (list_indices, offsets, null_buf) = take_value_indices_from_list::(values, indices)?; - let taken = take_impl::(values.values().as_ref(), &list_indices, None)?; + let taken = take_impl::(values.values().as_ref(), &list_indices)?; let value_offsets = Buffer::from_vec(offsets); // create a new list with taken data and computed null information let list_data = ArrayDataBuilder::new(values.data_type().clone()) @@ -449,7 +465,7 @@ fn take_fixed_size_list( length: ::Native, ) -> Result { let list_indices = take_value_indices_from_fixed_size_list(values, indices, length)?; - let taken = take_impl::(values.values().as_ref(), &list_indices, None)?; + let taken = take_impl::(values.values().as_ref(), &list_indices)?; // determine null count and null buffer, which are a function of `values` and `indices` let num_bytes = bit_util::ceil(indices.len(), 8); @@ -676,6 +692,65 @@ where Ok(PrimitiveArray::::from(values)) } +/// To avoid generating take implementations for every index type, instead we +/// only generate for UInt32 and UInt64 and coerce inputs to these types +trait ToIndices { + type T: ArrowPrimitiveType; + + fn to_indices(&self) -> PrimitiveArray; +} + +macro_rules! to_indices_reinterpret { + ($t:ty, $o:ty) => { + impl ToIndices for PrimitiveArray<$t> { + type T = $o; + + fn to_indices(&self) -> PrimitiveArray<$o> { + let cast = + ScalarBuffer::new(self.values().inner().clone(), 0, self.len()); + PrimitiveArray::new(cast, self.nulls().cloned()) + } + } + }; +} + +macro_rules! to_indices_identity { + ($t:ty) => { + impl ToIndices for PrimitiveArray<$t> { + type T = $t; + + fn to_indices(&self) -> PrimitiveArray<$t> { + self.clone() + } + } + }; +} + +macro_rules! 
to_indices_widening { + ($t:ty, $o:ty) => { + impl ToIndices for PrimitiveArray<$t> { + type T = UInt32Type; + + fn to_indices(&self) -> PrimitiveArray<$o> { + let cast = self.values().iter().copied().map(|x| x as _).collect(); + PrimitiveArray::new(cast, self.nulls().cloned()) + } + } + }; +} + +to_indices_widening!(UInt8Type, UInt32Type); +to_indices_widening!(Int8Type, UInt32Type); + +to_indices_widening!(UInt16Type, UInt32Type); +to_indices_widening!(Int16Type, UInt32Type); + +to_indices_identity!(UInt32Type); +to_indices_reinterpret!(Int32Type, UInt32Type); + +to_indices_identity!(UInt64Type); +to_indices_reinterpret!(Int64Type, UInt64Type); + #[cfg(test)] mod tests { use super::*; @@ -767,7 +842,7 @@ mod tests { { let output = PrimitiveArray::::from(data); let expected = PrimitiveArray::::from(expected_data); - let output = take_impl(&output, index, options).unwrap(); + let output = take(&output, index, options).unwrap(); let output = output.as_any().downcast_ref::>().unwrap(); assert_eq!(output, &expected) } @@ -1078,7 +1153,7 @@ mod tests { 1_639_715_368_000_000_000, ]) .with_timezone("UTC".to_string()); - let result = take_impl(&input, &index, None).unwrap(); + let result = take(&input, &index, None).unwrap(); match result.data_type() { DataType::Timestamp(TimeUnit::Nanosecond, tz) => { assert_eq!(tz.clone(), Some("UTC".into())) From 31c81c5fd99921e9aaffa5cc930ab1110e67962b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 17 Aug 2023 10:53:44 +0100 Subject: [PATCH 1167/1411] Fix nightly tests (#4709) --- arrow-array/src/array/binary_array.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 67be3768cc80..75880bec30ce 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -547,9 +547,7 @@ mod tests { #[test] #[should_panic( - expected = "assertion failed: `(left == right)`\n left: `UInt32`,\n \ - right: `UInt8`: BinaryArray can only be created from List arrays, \ - mismatched data types." + expected = "BinaryArray can only be created from List arrays, mismatched data types." )] fn test_binary_array_from_incorrect_list_array() { let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; From f0200dbec164c9593d80405b928a9684b598bf77 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 17 Aug 2023 10:54:24 +0100 Subject: [PATCH 1168/1411] Add AnyDictionary Abstraction and Take ArrayRef in DictionaryArray::with_values (#4707) * Add AnyDictionary Abstraction * Review feedback * Move to AsArray --- arrow-arith/src/arity.rs | 8 +- arrow-arith/src/temporal.rs | 2 +- arrow-array/src/array/dictionary_array.rs | 116 +++++++++++++++++++--- arrow-array/src/cast.rs | 20 ++++ arrow-row/src/lib.rs | 2 +- 5 files changed, 129 insertions(+), 19 deletions(-) diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index fdfb26f7f72a..f3118d104536 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -82,7 +82,7 @@ where { let dict_values = array.values().as_any().downcast_ref().unwrap(); let values = unary::(dict_values, op); - Ok(Arc::new(array.with_values(&values))) + Ok(Arc::new(array.with_values(Arc::new(values)))) } /// A helper function that applies a fallible unary function to a dictionary array with primitive value type. 
@@ -105,10 +105,11 @@ where let dict_values = array.values().as_any().downcast_ref().unwrap(); let values = try_unary::(dict_values, op)?; - Ok(Arc::new(array.with_values(&values))) + Ok(Arc::new(array.with_values(Arc::new(values)))) } /// Applies an infallible unary function to an array with primitive values. +#[deprecated(note = "Use arrow_array::AnyDictionaryArray")] pub fn unary_dyn(array: &dyn Array, op: F) -> Result where T: ArrowPrimitiveType, @@ -134,6 +135,7 @@ where } /// Applies a fallible unary function to an array with primitive values. +#[deprecated(note = "Use arrow_array::AnyDictionaryArray")] pub fn try_unary_dyn(array: &dyn Array, op: F) -> Result where T: ArrowPrimitiveType, @@ -436,6 +438,7 @@ mod tests { use arrow_array::types::*; #[test] + #[allow(deprecated)] fn test_unary_f64_slice() { let input = Float64Array::from(vec![Some(5.1f64), None, Some(6.8), None, Some(7.2)]); @@ -455,6 +458,7 @@ mod tests { } #[test] + #[allow(deprecated)] fn test_unary_dict_and_unary_dyn() { let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(5).unwrap(); diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index ef551ceeddb7..7855b6fc6e46 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -462,7 +462,7 @@ where downcast_dictionary_array!( array => { let values = time_fraction_dyn(array.values(), name, op)?; - Ok(Arc::new(array.with_values(&values))) + Ok(Arc::new(array.with_values(values))) } dt => return_compute_error_with!(format!("{name} does not support"), dt), ) diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 2d80c75f073a..ed043754da4b 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -434,6 +434,7 @@ impl DictionaryArray { /// Panics if `values` has a length less than the current values /// /// ``` + /// # use std::sync::Arc; /// # use arrow_array::builder::PrimitiveDictionaryBuilder; /// # use arrow_array::{Int8Array, Int64Array, ArrayAccessor}; /// # use arrow_array::types::{Int32Type, Int8Type}; @@ -451,7 +452,7 @@ impl DictionaryArray { /// let values: Int64Array = typed_dictionary.values().unary(|x| x as i64); /// /// // Create a Dict(Int32, - /// let new = dictionary.with_values(&values); + /// let new = dictionary.with_values(Arc::new(values)); /// /// // Verify values are as expected /// let new_typed = new.downcast_dict::().unwrap(); @@ -460,21 +461,18 @@ impl DictionaryArray { /// } /// ``` /// - pub fn with_values(&self, values: &dyn Array) -> Self { + pub fn with_values(&self, values: ArrayRef) -> Self { assert!(values.len() >= self.values.len()); - - let builder = self - .to_data() - .into_builder() - .data_type(DataType::Dictionary( - Box::new(K::DATA_TYPE), - Box::new(values.data_type().clone()), - )) - .child_data(vec![values.to_data()]); - - // SAFETY: - // Offsets were valid before and verified length is greater than or equal - Self::from(unsafe { builder.build_unchecked() }) + let data_type = DataType::Dictionary( + Box::new(K::DATA_TYPE), + Box::new(values.data_type().clone()), + ); + Self { + data_type, + keys: self.keys.clone(), + values, + is_ordered: false, + } } /// Returns `PrimitiveDictionaryBuilder` of this dictionary array for mutating @@ -930,6 +928,94 @@ where } } +/// A [`DictionaryArray`] with the key type erased +/// +/// This can be used to efficiently implement kernels for all possible dictionary +/// keys without needing to create specialized implementations for each key 
type +/// +/// For example +/// +/// ``` +/// # use arrow_array::*; +/// # use arrow_array::cast::AsArray; +/// # use arrow_array::builder::PrimitiveDictionaryBuilder; +/// # use arrow_array::types::*; +/// # use arrow_schema::ArrowError; +/// # use std::sync::Arc; +/// +/// fn to_string(a: &dyn Array) -> Result { +/// if let Some(d) = a.as_any_dictionary_opt() { +/// // Recursively handle dictionary input +/// let r = to_string(d.values().as_ref())?; +/// return Ok(d.with_values(r)); +/// } +/// downcast_primitive_array! { +/// a => Ok(Arc::new(a.iter().map(|x| x.map(|x| x.to_string())).collect::())), +/// d => Err(ArrowError::InvalidArgumentError(format!("{d:?} not supported"))) +/// } +/// } +/// +/// let result = to_string(&Int32Array::from(vec![1, 2, 3])).unwrap(); +/// let actual = result.as_string::().iter().map(Option::unwrap).collect::>(); +/// assert_eq!(actual, &["1", "2", "3"]); +/// +/// let mut dict = PrimitiveDictionaryBuilder::::new(); +/// dict.extend([Some(1), Some(1), Some(2), Some(3), Some(2)]); +/// let dict = dict.finish(); +/// +/// let r = to_string(&dict).unwrap(); +/// let r = r.as_dictionary::().downcast_dict::().unwrap(); +/// assert_eq!(r.keys(), dict.keys()); // Keys are the same +/// +/// let actual = r.into_iter().map(Option::unwrap).collect::>(); +/// assert_eq!(actual, &["1", "1", "2", "3", "2"]); +/// ``` +/// +/// See [`AsArray::as_any_dictionary_opt`] and [`AsArray::as_any_dictionary`] +pub trait AnyDictionaryArray: Array { + /// Returns the primitive keys of this dictionary as an [`Array`] + fn keys(&self) -> &dyn Array; + + /// Returns the values of this dictionary + fn values(&self) -> &ArrayRef; + + /// Returns the keys of this dictionary as usize + /// + /// The values for nulls will be arbitrary, but are guaranteed + /// to be in the range `0..self.values.len()` + /// + /// # Panic + /// + /// Panics if `values.len() == 0` + fn normalized_keys(&self) -> Vec; + + /// Create a new [`DictionaryArray`] replacing `values` with the new values + /// + /// See [`DictionaryArray::with_values`] + fn with_values(&self, values: ArrayRef) -> ArrayRef; +} + +impl AnyDictionaryArray for DictionaryArray { + fn keys(&self) -> &dyn Array { + &self.keys + } + + fn values(&self) -> &ArrayRef { + self.values() + } + + fn normalized_keys(&self) -> Vec { + let v_len = self.values().len(); + assert_ne!(v_len, 0); + let iter = self.keys().values().iter(); + iter.map(|x| x.as_usize().min(v_len)).collect() + } + + fn with_values(&self, values: ArrayRef) -> ArrayRef { + Arc::new(self.with_values(values)) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 66b40d5b8eb3..b6cda44e8973 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -833,6 +833,14 @@ pub trait AsArray: private::Sealed { fn as_dictionary(&self) -> &DictionaryArray { self.as_dictionary_opt().expect("dictionary array") } + + /// Downcasts this to a [`AnyDictionaryArray`] returning `None` if not possible + fn as_any_dictionary_opt(&self) -> Option<&dyn AnyDictionaryArray>; + + /// Downcasts this to a [`AnyDictionaryArray`] panicking if not possible + fn as_any_dictionary(&self) -> &dyn AnyDictionaryArray { + self.as_any_dictionary_opt().expect("any dictionary array") + } } impl private::Sealed for dyn Array + '_ {} @@ -874,6 +882,14 @@ impl AsArray for dyn Array + '_ { ) -> Option<&DictionaryArray> { self.as_any().downcast_ref() } + + fn as_any_dictionary_opt(&self) -> Option<&dyn AnyDictionaryArray> { + let array = self; + 
downcast_dictionary_array! { + array => Some(array), + _ => None + } + } } impl private::Sealed for ArrayRef {} @@ -915,6 +931,10 @@ impl AsArray for ArrayRef { ) -> Option<&DictionaryArray> { self.as_ref().as_dictionary_opt() } + + fn as_any_dictionary_opt(&self) -> Option<&dyn AnyDictionaryArray> { + self.as_ref().as_any_dictionary_opt() + } } #[cfg(test)] diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 3cd082c51165..18b5890d4a3a 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1642,7 +1642,7 @@ mod tests { // Construct dictionary with a timezone let dict = a.finish(); let values = TimestampNanosecondArray::from(dict.values().to_data()); - let dict_with_tz = dict.with_values(&values.with_timezone("+02:00")); + let dict_with_tz = dict.with_values(Arc::new(values.with_timezone("+02:00"))); let d = DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Timestamp( From a19ff2bacd469741b42eab88ccd28eda15f6ea44 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 17 Aug 2023 14:10:21 +0100 Subject: [PATCH 1169/1411] Tweak docs (#4711) --- arrow-data/src/data.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 0417e1d357c7..7e07194012bf 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -1660,10 +1660,9 @@ pub enum BufferSpec { /// `alignment` is the alignment required by Rust for an array of the corresponding primitive, /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`]. /// - /// Arrow-rs requires that all buffers are have at least this alignment, to allow for - /// [slice](std::slice) based APIs. We do not require alignment in excess of this to allow - /// for array slicing, and interoperability with `Vec` which in the absence of support - /// for custom allocators, cannot be over-aligned. + /// Arrow-rs requires that all buffers have at least this alignment, to allow for + /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow + /// for array slicing and interoperability with `Vec`, which cannot be over-aligned. /// /// Note that these alignment requirements will vary between architectures FixedWidth { byte_width: usize, alignment: usize }, @@ -1818,7 +1817,7 @@ impl ArrayDataBuilder { /// Rust requires that arrays are aligned to their corresponding primitive, /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`]. /// - /// [`ArrayData`] therefore requires that all buffers are have at least this alignment, + /// [`ArrayData`] therefore requires that all buffers have at least this alignment, /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`]. 
/// /// As this alignment is architecture specific, and not guaranteed by all arrow implementations, From b810e8f207bbc70294b01acba4be32153c18a6ab Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 17 Aug 2023 15:05:42 +0100 Subject: [PATCH 1170/1411] Support Field ID in ArrowWriter (#4702) (#4710) --- parquet/src/arrow/mod.rs | 7 ++ parquet/src/arrow/schema/mod.rs | 200 ++++++++++++++++++++++++-------- 2 files changed, 156 insertions(+), 51 deletions(-) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index aad4925c7c70..8cca79b40e93 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -130,6 +130,13 @@ pub use self::schema::{ /// Schema metadata key used to store serialized Arrow IPC schema pub const ARROW_SCHEMA_META_KEY: &str = "ARROW:schema"; +/// The value of this metadata key, if present on [`Field::metadata`], will be used +/// to populate [`BasicTypeInfo::id`] +/// +/// [`Field::metadata`]: arrow_schema::Field::metadata +/// [`BasicTypeInfo::id`]: crate::schema::types::BasicTypeInfo::id +pub const PARQUET_FIELD_ID_META_KEY: &str = "PARQUET:field_id"; + /// A [`ProjectionMask`] identifies a set of columns within a potentially nested schema to project /// /// In particular, a [`ProjectionMask`] can be constructed from a list of leaf column indices diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index bcfc2f884cac..3f1994d10829 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -295,14 +295,17 @@ fn arrow_to_parquet_type(field: &Field) -> Result { } else { Repetition::REQUIRED }; + let id = field_id(field); // create type from field match field.data_type() { DataType::Null => Type::primitive_type_builder(name, PhysicalType::INT32) .with_logical_type(Some(LogicalType::Unknown)) .with_repetition(repetition) + .with_id(id) .build(), DataType::Boolean => Type::primitive_type_builder(name, PhysicalType::BOOLEAN) .with_repetition(repetition) + .with_id(id) .build(), DataType::Int8 => Type::primitive_type_builder(name, PhysicalType::INT32) .with_logical_type(Some(LogicalType::Integer { @@ -310,6 +313,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { is_signed: true, })) .with_repetition(repetition) + .with_id(id) .build(), DataType::Int16 => Type::primitive_type_builder(name, PhysicalType::INT32) .with_logical_type(Some(LogicalType::Integer { @@ -317,12 +321,15 @@ fn arrow_to_parquet_type(field: &Field) -> Result { is_signed: true, })) .with_repetition(repetition) + .with_id(id) .build(), DataType::Int32 => Type::primitive_type_builder(name, PhysicalType::INT32) .with_repetition(repetition) + .with_id(id) .build(), DataType::Int64 => Type::primitive_type_builder(name, PhysicalType::INT64) .with_repetition(repetition) + .with_id(id) .build(), DataType::UInt8 => Type::primitive_type_builder(name, PhysicalType::INT32) .with_logical_type(Some(LogicalType::Integer { @@ -330,6 +337,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { is_signed: false, })) .with_repetition(repetition) + .with_id(id) .build(), DataType::UInt16 => Type::primitive_type_builder(name, PhysicalType::INT32) .with_logical_type(Some(LogicalType::Integer { @@ -337,6 +345,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { is_signed: false, })) .with_repetition(repetition) + .with_id(id) .build(), DataType::UInt32 => Type::primitive_type_builder(name, PhysicalType::INT32) .with_logical_type(Some(LogicalType::Integer { @@ -344,6 +353,7 @@ fn 
arrow_to_parquet_type(field: &Field) -> Result { is_signed: false, })) .with_repetition(repetition) + .with_id(id) .build(), DataType::UInt64 => Type::primitive_type_builder(name, PhysicalType::INT64) .with_logical_type(Some(LogicalType::Integer { @@ -351,18 +361,22 @@ fn arrow_to_parquet_type(field: &Field) -> Result { is_signed: false, })) .with_repetition(repetition) + .with_id(id) .build(), DataType::Float16 => Err(arrow_err!("Float16 arrays not supported")), DataType::Float32 => Type::primitive_type_builder(name, PhysicalType::FLOAT) .with_repetition(repetition) + .with_id(id) .build(), DataType::Float64 => Type::primitive_type_builder(name, PhysicalType::DOUBLE) .with_repetition(repetition) + .with_id(id) .build(), DataType::Timestamp(TimeUnit::Second, _) => { // Cannot represent seconds in LogicalType Type::primitive_type_builder(name, PhysicalType::INT64) .with_repetition(repetition) + .with_id(id) .build() } DataType::Timestamp(time_unit, tz) => { @@ -384,21 +398,25 @@ fn arrow_to_parquet_type(field: &Field) -> Result { }, })) .with_repetition(repetition) + .with_id(id) .build() } DataType::Date32 => Type::primitive_type_builder(name, PhysicalType::INT32) .with_logical_type(Some(LogicalType::Date)) .with_repetition(repetition) + .with_id(id) .build(), // date64 is cast to date32 (#1666) DataType::Date64 => Type::primitive_type_builder(name, PhysicalType::INT32) .with_logical_type(Some(LogicalType::Date)) .with_repetition(repetition) + .with_id(id) .build(), DataType::Time32(TimeUnit::Second) => { // Cannot represent seconds in LogicalType Type::primitive_type_builder(name, PhysicalType::INT32) .with_repetition(repetition) + .with_id(id) .build() } DataType::Time32(unit) => Type::primitive_type_builder(name, PhysicalType::INT32) @@ -410,6 +428,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { }, })) .with_repetition(repetition) + .with_id(id) .build(), DataType::Time64(unit) => Type::primitive_type_builder(name, PhysicalType::INT64) .with_logical_type(Some(LogicalType::Time { @@ -421,6 +440,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { }, })) .with_repetition(repetition) + .with_id(id) .build(), DataType::Duration(_) => { Err(arrow_err!("Converting Duration to parquet not supported",)) @@ -429,17 +449,20 @@ fn arrow_to_parquet_type(field: &Field) -> Result { Type::primitive_type_builder(name, PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_converted_type(ConvertedType::INTERVAL) .with_repetition(repetition) + .with_id(id) .with_length(12) .build() } DataType::Binary | DataType::LargeBinary => { Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) .with_repetition(repetition) + .with_id(id) .build() } DataType::FixedSizeBinary(length) => { Type::primitive_type_builder(name, PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(repetition) + .with_id(id) .with_length(*length) .build() } @@ -459,6 +482,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { }; Type::primitive_type_builder(name, physical_type) .with_repetition(repetition) + .with_id(id) .with_length(length) .with_logical_type(Some(LogicalType::Decimal { scale: *scale as i32, @@ -472,6 +496,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) .with_logical_type(Some(LogicalType::String)) .with_repetition(repetition) + .with_id(id) .build() } DataType::List(f) | DataType::FixedSizeList(f, _) | DataType::LargeList(f) => { @@ -484,6 +509,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { )]) 
.with_logical_type(Some(LogicalType::List)) .with_repetition(repetition) + .with_id(id) .build() } DataType::Struct(fields) => { @@ -500,6 +526,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { Type::group_type_builder(name) .with_fields(fields) .with_repetition(repetition) + .with_id(id) .build() } DataType::Map(field, _) => { @@ -508,22 +535,15 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_fields(vec![Arc::new( Type::group_type_builder(field.name()) .with_fields(vec![ - Arc::new(arrow_to_parquet_type(&Field::new( - struct_fields[0].name(), - struct_fields[0].data_type().clone(), - false, - ))?), - Arc::new(arrow_to_parquet_type(&Field::new( - struct_fields[1].name(), - struct_fields[1].data_type().clone(), - struct_fields[1].is_nullable(), - ))?), + Arc::new(arrow_to_parquet_type(&struct_fields[0])?), + Arc::new(arrow_to_parquet_type(&struct_fields[1])?), ]) .with_repetition(Repetition::REPEATED) .build()?, )]) .with_logical_type(Some(LogicalType::Map)) .with_repetition(repetition) + .with_id(id) .build() } else { Err(arrow_err!( @@ -543,6 +563,11 @@ fn arrow_to_parquet_type(field: &Field) -> Result { } } +fn field_id(field: &Field) -> Option { + let value = field.metadata().get(super::PARQUET_FIELD_ID_META_KEY)?; + value.parse().ok() // Fail quietly if not a valid integer +} + #[cfg(test)] mod tests { use super::*; @@ -551,6 +576,7 @@ mod tests { use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; + use crate::arrow::PARQUET_FIELD_ID_META_KEY; use crate::file::metadata::KeyValue; use crate::{ arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter}, @@ -1555,17 +1581,18 @@ mod tests { #[test] fn test_arrow_schema_roundtrip() -> Result<()> { - // This tests the roundtrip of an Arrow schema - // Fields that are commented out fail roundtrip tests or are unsupported by the writer - let metadata: HashMap = - [("Key".to_string(), "Value".to_string())] - .iter() - .cloned() - .collect(); + let meta = |a: &[(&str, &str)]| -> HashMap { + a.iter() + .map(|(a, b)| (a.to_string(), b.to_string())) + .collect() + }; let schema = Schema::new_with_metadata( vec![ - Field::new("c1", DataType::Utf8, false), + Field::new("c1", DataType::Utf8, false).with_metadata(meta(&[ + ("Key", "Foo"), + (PARQUET_FIELD_ID_META_KEY, "2"), + ])), Field::new("c2", DataType::Binary, false), Field::new("c3", DataType::FixedSizeBinary(3), false), Field::new("c4", DataType::Boolean, false), @@ -1598,24 +1625,40 @@ mod tests { Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), Field::new_list( "c21", - Field::new("list", DataType::Boolean, true), + Field::new("item", DataType::Boolean, true).with_metadata(meta(&[ + ("Key", "Bar"), + (PARQUET_FIELD_ID_META_KEY, "5"), + ])), + false, + ) + .with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "4")])), + Field::new( + "c22", + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Boolean, true)), + 5, + ), + false, + ), + Field::new_list( + "c23", + Field::new_large_list( + "inner", + Field::new( + "item", + DataType::Struct( + vec![ + Field::new("a", DataType::Int16, true), + Field::new("b", DataType::Float64, false), + ] + .into(), + ), + false, + ), + true, + ), false, ), - // Field::new( - // "c22", - // DataType::FixedSizeList(Box::new(DataType::Boolean), 5), - // false, - // ), - // Field::new( - // "c23", - // DataType::List(Box::new(DataType::LargeList(Box::new( - // DataType::Struct(vec![ - // Field::new("a", DataType::Int16, true), - // Field::new("b", DataType::Float64, false), - // ]), - // )))), 
- // true, - // ), Field::new( "c24", DataType::Struct(Fields::from(vec![ @@ -1626,6 +1669,7 @@ mod tests { ), Field::new("c25", DataType::Interval(IntervalUnit::YearMonth), true), Field::new("c26", DataType::Interval(IntervalUnit::DayTime), true), + // Duration types not supported // Field::new("c27", DataType::Duration(TimeUnit::Second), false), // Field::new("c28", DataType::Duration(TimeUnit::Millisecond), false), // Field::new("c29", DataType::Duration(TimeUnit::Microsecond), false), @@ -1639,19 +1683,29 @@ mod tests { true, 123, true, - ), + ) + .with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "6")])), Field::new("c32", DataType::LargeBinary, true), Field::new("c33", DataType::LargeUtf8, true), - // Field::new( - // "c34", - // DataType::LargeList(Box::new(DataType::List(Box::new( - // DataType::Struct(vec![ - // Field::new("a", DataType::Int16, true), - // Field::new("b", DataType::Float64, true), - // ]), - // )))), - // true, - // ), + Field::new_large_list( + "c34", + Field::new_list( + "inner", + Field::new( + "item", + DataType::Struct( + vec![ + Field::new("a", DataType::Int16, true), + Field::new("b", DataType::Float64, true), + ] + .into(), + ), + true, + ), + true, + ), + true, + ), Field::new("c35", DataType::Null, true), Field::new("c36", DataType::Decimal128(2, 1), false), Field::new("c37", DataType::Decimal256(50, 20), false), @@ -1671,29 +1725,34 @@ mod tests { Field::new_map( "c40", "my_entries", - Field::new("my_key", DataType::Utf8, false), + Field::new("my_key", DataType::Utf8, false) + .with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "8")])), Field::new_list( "my_value", - Field::new("item", DataType::Utf8, true), + Field::new("item", DataType::Utf8, true) + .with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "10")])), true, - ), + ) + .with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "9")])), false, // fails to roundtrip keys_sorted true, - ), + ) + .with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "7")])), Field::new_map( "c41", "my_entries", Field::new("my_key", DataType::Utf8, false), Field::new_list( "my_value", - Field::new("item", DataType::Utf8, true), + Field::new("item", DataType::Utf8, true) + .with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "11")])), true, ), false, // fails to roundtrip keys_sorted false, ), ], - metadata, + meta(&[("Key", "Value")]), ); // write to an empty parquet file so that schema is serialized @@ -1707,9 +1766,48 @@ mod tests { // read file back let arrow_reader = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + + // Check arrow schema let read_schema = arrow_reader.schema(); assert_eq!(&schema, read_schema.as_ref()); + // Walk schema finding field IDs + let mut stack = Vec::with_capacity(10); + let mut out = Vec::with_capacity(10); + + let root = arrow_reader.parquet_schema().root_schema_ptr(); + stack.push((root.name().to_string(), root)); + + while let Some((p, t)) = stack.pop() { + if t.is_group() { + for f in t.get_fields() { + stack.push((format!("{p}.{}", f.name()), f.clone())) + } + } + + let info = t.get_basic_info(); + if info.has_id() { + out.push(format!("{p} -> {}", info.id())) + } + } + out.sort_unstable(); + let out: Vec<_> = out.iter().map(|x| x.as_str()).collect(); + + assert_eq!( + &out, + &[ + "arrow_schema.c1 -> 2", + "arrow_schema.c21 -> 4", + "arrow_schema.c21.list.item -> 5", + "arrow_schema.c31 -> 6", + "arrow_schema.c40 -> 7", + "arrow_schema.c40.my_entries.my_key -> 8", + "arrow_schema.c40.my_entries.my_value -> 9", + "arrow_schema.c40.my_entries.my_value.list.item -> 10", + 
"arrow_schema.c41.my_entries.my_value.list.item -> 11", + ] + ); + Ok(()) } From 8bbb5c18776a968b0968627ed1285ef7c620d7ab Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 18 Aug 2023 07:43:49 +0100 Subject: [PATCH 1171/1411] Datum based comparison kernels (#4596) (#4701) * Datum based comparison kernels (#4596) * Clippy * More clippy * Even more clippy * Further clippy * Format * Use take kernel for scalar evaluation * Clippy * Review feedback * Use AnyDictionaryArray --- .github/workflows/arrow.yml | 10 +- .github/workflows/miri.sh | 2 +- arrow-flight/src/sql/metadata/db_schemas.rs | 7 +- arrow-flight/src/sql/metadata/sql_info.rs | 14 +- arrow-flight/src/sql/metadata/tables.rs | 12 +- arrow-flight/src/sql/metadata/xdbc_info.rs | 9 +- arrow-ord/Cargo.toml | 7 - arrow-ord/src/cmp.rs | 489 ++++ arrow-ord/src/comparison.rs | 2459 ++++--------------- arrow-ord/src/lib.rs | 2 + arrow-ord/src/partition.rs | 4 +- arrow/Cargo.toml | 4 +- arrow/benches/comparison_kernels.rs | 175 +- arrow/benches/equal.rs | 10 - arrow/src/compute/kernels.rs | 2 +- parquet/examples/async_read_parquet.rs | 5 +- parquet/src/arrow/async_reader/mod.rs | 14 +- 17 files changed, 1095 insertions(+), 2130 deletions(-) create mode 100644 arrow-ord/src/cmp.rs diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 279e276a7912..8203c15afc6c 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -80,8 +80,8 @@ jobs: run: cargo test -p arrow-json --all-features - name: Test arrow-string with all features run: cargo test -p arrow-string --all-features - - name: Test arrow-ord with all features except SIMD - run: cargo test -p arrow-ord --features dyn_cmp_dict + - name: Test arrow-ord with all features + run: cargo test -p arrow-ord --all-features - name: Test arrow-arith with all features except SIMD run: cargo test -p arrow-arith - name: Test arrow-row with all features @@ -145,8 +145,6 @@ jobs: rust-version: nightly - name: Test arrow-array with SIMD run: cargo test -p arrow-array --features simd - - name: Test arrow-ord with SIMD - run: cargo test -p arrow-ord --features simd - name: Test arrow-arith with SIMD run: cargo test -p arrow-arith --features simd - name: Test arrow with SIMD @@ -206,8 +204,8 @@ jobs: run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings - name: Clippy arrow-string with all features run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings - - name: Clippy arrow-ord with all features except SIMD - run: cargo clippy -p arrow-ord --all-targets --features dyn_cmp_dict -- -D warnings + - name: Clippy arrow-ord with all features + run: cargo clippy -p arrow-ord --all-targets --all-features -- -D warnings - name: Clippy arrow-arith with all features except SIMD run: cargo clippy -p arrow-arith --all-targets -- -D warnings - name: Clippy arrow-row with all features diff --git a/.github/workflows/miri.sh b/.github/workflows/miri.sh index faf9f028d281..ec8712660c74 100755 --- a/.github/workflows/miri.sh +++ b/.github/workflows/miri.sh @@ -15,4 +15,4 @@ cargo miri test -p arrow-data --features ffi cargo miri test -p arrow-schema --features ffi cargo miri test -p arrow-array cargo miri test -p arrow-arith --features simd -cargo miri test -p arrow-ord --features simd +cargo miri test -p arrow-ord diff --git a/arrow-flight/src/sql/metadata/db_schemas.rs b/arrow-flight/src/sql/metadata/db_schemas.rs index 7b10e1c14299..20780a116032 100644 --- 
a/arrow-flight/src/sql/metadata/db_schemas.rs +++ b/arrow-flight/src/sql/metadata/db_schemas.rs @@ -22,8 +22,8 @@ use std::sync::Arc; use arrow_arith::boolean::and; -use arrow_array::{builder::StringBuilder, ArrayRef, RecordBatch}; -use arrow_ord::comparison::eq_utf8_scalar; +use arrow_array::{builder::StringBuilder, ArrayRef, RecordBatch, Scalar, StringArray}; +use arrow_ord::cmp::eq; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use arrow_select::{filter::filter_record_batch, take::take}; use arrow_string::like::like_utf8_scalar; @@ -129,7 +129,8 @@ impl GetDbSchemasBuilder { } if let Some(catalog_filter_name) = catalog_filter { - filters.push(eq_utf8_scalar(&catalog_name, &catalog_filter_name)?); + let scalar = StringArray::from_iter_values([catalog_filter_name]); + filters.push(eq(&catalog_name, &Scalar::new(&scalar))?); } // `AND` any filters together diff --git a/arrow-flight/src/sql/metadata/sql_info.rs b/arrow-flight/src/sql/metadata/sql_info.rs index b37ac85308f4..88c97227814d 100644 --- a/arrow-flight/src/sql/metadata/sql_info.rs +++ b/arrow-flight/src/sql/metadata/sql_info.rs @@ -33,10 +33,9 @@ use arrow_array::builder::{ ArrayBuilder, BooleanBuilder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, MapBuilder, StringBuilder, UInt32Builder, }; -use arrow_array::cast::downcast_array; -use arrow_array::RecordBatch; +use arrow_array::{RecordBatch, Scalar}; use arrow_data::ArrayData; -use arrow_ord::comparison::eq_scalar; +use arrow_ord::cmp::eq; use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef, UnionFields, UnionMode}; use arrow_select::filter::filter_record_batch; use once_cell::sync::Lazy; @@ -425,13 +424,16 @@ impl SqlInfoData { &self, info: impl IntoIterator, ) -> Result { - let arr: UInt32Array = downcast_array(self.batch.column(0).as_ref()); + let arr = self.batch.column(0); let type_filter = info .into_iter() - .map(|tt| eq_scalar(&arr, tt)) + .map(|tt| { + let s = UInt32Array::from(vec![tt]); + eq(arr, &Scalar::new(&s)) + }) .collect::, _>>()? .into_iter() - // We know the arrays are of same length as they are produced fromn the same root array + // We know the arrays are of same length as they are produced from the same root array .reduce(|filter, arr| or(&filter, &arr).unwrap()); if let Some(filter) = type_filter { Ok(filter_record_batch(&self.batch, &filter)?) 
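The hunks above and below all follow the same migration: the deprecated typed
helpers such as `eq_utf8_scalar` and `eq_scalar` are replaced by the new
`arrow_ord::cmp` kernels, with the scalar wrapped in a one-element array and
marked with `Scalar` so it broadcasts against the column. A minimal sketch of
that pattern, assuming the crates as updated by this change (the function name
`filter_catalog` is illustrative):

use arrow_array::{BooleanArray, Scalar, StringArray};
use arrow_ord::cmp::eq;
use arrow_schema::ArrowError;

fn filter_catalog(catalog_name: &StringArray, wanted: &str) -> Result<BooleanArray, ArrowError> {
    // Wrap the scalar in a one-element array; `Scalar::new` marks it as a
    // broadcast value rather than an array that must match in length.
    let scalar = StringArray::from_iter_values([wanted]);
    eq(catalog_name, &Scalar::new(&scalar))
}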
diff --git a/arrow-flight/src/sql/metadata/tables.rs b/arrow-flight/src/sql/metadata/tables.rs index 67193969d46d..de55f0624f2f 100644 --- a/arrow-flight/src/sql/metadata/tables.rs +++ b/arrow-flight/src/sql/metadata/tables.rs @@ -23,8 +23,8 @@ use std::sync::Arc; use arrow_arith::boolean::{and, or}; use arrow_array::builder::{BinaryBuilder, StringBuilder}; -use arrow_array::{ArrayRef, RecordBatch}; -use arrow_ord::comparison::eq_utf8_scalar; +use arrow_array::{ArrayRef, RecordBatch, Scalar, StringArray}; +use arrow_ord::cmp::eq; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use arrow_select::{filter::filter_record_batch, take::take}; use arrow_string::like::like_utf8_scalar; @@ -184,12 +184,16 @@ impl GetTablesBuilder { let mut filters = vec![]; if let Some(catalog_filter_name) = catalog_filter { - filters.push(eq_utf8_scalar(&catalog_name, &catalog_filter_name)?); + let scalar = StringArray::from_iter_values([catalog_filter_name]); + filters.push(eq(&catalog_name, &Scalar::new(&scalar))?); } let tt_filter = table_types_filter .into_iter() - .map(|tt| eq_utf8_scalar(&table_type, &tt)) + .map(|tt| { + let scalar = StringArray::from_iter_values([tt]); + eq(&table_type, &Scalar::new(&scalar)) + }) .collect::, _>>()? .into_iter() // We know the arrays are of same length as they are produced fromn the same root array diff --git a/arrow-flight/src/sql/metadata/xdbc_info.rs b/arrow-flight/src/sql/metadata/xdbc_info.rs index b70a3ce3cb3e..8212c847a4fa 100644 --- a/arrow-flight/src/sql/metadata/xdbc_info.rs +++ b/arrow-flight/src/sql/metadata/xdbc_info.rs @@ -27,9 +27,8 @@ use std::sync::Arc; use arrow_array::builder::{BooleanBuilder, Int32Builder, ListBuilder, StringBuilder}; -use arrow_array::cast::downcast_array; -use arrow_array::{ArrayRef, Int32Array, ListArray, RecordBatch}; -use arrow_ord::comparison::eq_scalar; +use arrow_array::{ArrayRef, Int32Array, ListArray, RecordBatch, Scalar}; +use arrow_ord::cmp::eq; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use arrow_select::filter::filter_record_batch; use arrow_select::take::take; @@ -81,8 +80,8 @@ impl XdbcTypeInfoData { /// from [`CommandGetXdbcTypeInfo`] pub fn record_batch(&self, data_type: impl Into>) -> Result { if let Some(dt) = data_type.into() { - let arr: Int32Array = downcast_array(self.batch.column(1).as_ref()); - let filter = eq_scalar(&arr, dt)?; + let scalar = Int32Array::from(vec![dt]); + let filter = eq(self.batch.column(1), &Scalar::new(&scalar))?; Ok(filter_record_batch(&self.batch, &filter)?) } else { Ok(self.batch.clone()) diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index fb061b9b5499..c9c30074fe6e 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -44,10 +44,3 @@ half = { version = "2.1", default-features = false, features = ["num-traits"] } [dev-dependencies] rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } - -[package.metadata.docs.rs] -features = ["dyn_cmp_dict"] - -[features] -dyn_cmp_dict = [] -simd = ["arrow-array/simd"] diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs new file mode 100644 index 000000000000..aad61fa8f062 --- /dev/null +++ b/arrow-ord/src/cmp.rs @@ -0,0 +1,489 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Comparison kernels for `Array`s. +//! +//! These kernels can leverage SIMD if available on your system. Currently no runtime +//! detection is provided, you should enable the specific SIMD intrinsics using +//! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation +//! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. +//! + +use arrow_array::cast::AsArray; +use arrow_array::types::ByteArrayType; +use arrow_array::{ + downcast_primitive_array, AnyDictionaryArray, Array, ArrowNativeTypeOp, BooleanArray, + Datum, FixedSizeBinaryArray, GenericByteArray, +}; +use arrow_buffer::bit_util::ceil; +use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer}; +use arrow_schema::ArrowError; +use arrow_select::take::take; + +#[derive(Debug, Copy, Clone)] +enum Op { + Equal, + NotEqual, + Less, + LessEqual, + Greater, + GreaterEqual, +} + +impl std::fmt::Display for Op { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Op::Equal => write!(f, "=="), + Op::NotEqual => write!(f, "!="), + Op::Less => write!(f, "<"), + Op::LessEqual => write!(f, "<="), + Op::Greater => write!(f, ">"), + Op::GreaterEqual => write!(f, ">="), + } + } +} + +/// Perform `left == right` operation on two [`Datum`] +/// +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros as different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. +/// +/// Please refer to [`f32::total_cmp`] and [`f64::total_cmp`] +pub fn eq(lhs: &dyn Datum, rhs: &dyn Datum) -> Result { + compare_op(Op::Equal, lhs, rhs) +} + +/// Perform `left != right` operation on two [`Datum`] +/// +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros as different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. +/// +/// Please refer to [`f32::total_cmp`] and [`f64::total_cmp`] +pub fn neq(lhs: &dyn Datum, rhs: &dyn Datum) -> Result { + compare_op(Op::NotEqual, lhs, rhs) +} + +/// Perform `left < right` operation on two [`Datum`] +/// +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros as different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. 
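// [Illustrative aside, not part of the patch] A small usage sketch of the Datum
// kernels introduced in this module; variable names are made up for the example:

use arrow_array::{BooleanArray, Int32Array, Scalar};
use arrow_ord::cmp::{eq, lt};
use arrow_schema::ArrowError;

fn datum_cmp_example() -> Result<(), ArrowError> {
    let a = Int32Array::from(vec![1, 2, 3]);
    let b = Int32Array::from(vec![3, 2, 1]);
    // Array-to-array comparison: both sides must have the same length.
    assert_eq!(eq(&a, &b)?, BooleanArray::from(vec![false, true, false]));
    // Array-to-scalar comparison: wrap a one-element array in `Scalar` to broadcast it.
    let two = Int32Array::from(vec![2]);
    assert_eq!(lt(&a, &Scalar::new(&two))?, BooleanArray::from(vec![true, false, false]));
    Ok(())
}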
+/// +/// Please refer to [`f32::total_cmp`] and [`f64::total_cmp`] +pub fn lt(lhs: &dyn Datum, rhs: &dyn Datum) -> Result { + compare_op(Op::Less, lhs, rhs) +} + +/// Perform `left <= right` operation on two [`Datum`] +/// +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros as different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. +/// +/// Please refer to [`f32::total_cmp`] and [`f64::total_cmp`] +pub fn lt_eq(lhs: &dyn Datum, rhs: &dyn Datum) -> Result { + compare_op(Op::LessEqual, lhs, rhs) +} + +/// Perform `left > right` operation on two [`Datum`] +/// +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros as different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. +/// +/// Please refer to [`f32::total_cmp`] and [`f64::total_cmp`] +pub fn gt(lhs: &dyn Datum, rhs: &dyn Datum) -> Result { + compare_op(Op::Greater, lhs, rhs) +} + +/// Perform `left >= right` operation on two [`Datum`] +/// +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros as different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. +/// +/// Please refer to [`f32::total_cmp`] and [`f64::total_cmp`] +pub fn gt_eq(lhs: &dyn Datum, rhs: &dyn Datum) -> Result { + compare_op(Op::GreaterEqual, lhs, rhs) +} + +/// Perform `op` on the provided `Datum` +fn compare_op( + op: Op, + lhs: &dyn Datum, + rhs: &dyn Datum, +) -> Result { + use arrow_schema::DataType::*; + let (l, l_s) = lhs.get(); + let (r, r_s) = rhs.get(); + + let l_len = l.len(); + let r_len = r.len(); + let l_nulls = l.logical_nulls(); + let r_nulls = r.logical_nulls(); + + let (len, nulls) = match (l_s, r_s) { + (true, true) | (false, false) => { + if l_len != r_len { + return Err(ArrowError::InvalidArgumentError(format!( + "Cannot compare arrays of different lengths, got {l_len} vs {r_len}" + ))); + } + (l_len, NullBuffer::union(l_nulls.as_ref(), r_nulls.as_ref())) + } + (true, false) => match l_nulls.map(|x| x.null_count() != 0).unwrap_or_default() { + true => (r_len, Some(NullBuffer::new_null(r_len))), + false => (r_len, r_nulls), // Left is scalar and not null + }, + (false, true) => match r_nulls.map(|x| x.null_count() != 0).unwrap_or_default() { + true => (l_len, Some(NullBuffer::new_null(l_len))), + false => (l_len, l_nulls), // Right is scalar and not null + }, + }; + + let l_v = l.as_any_dictionary_opt(); + let l = l_v.map(|x| x.values().as_ref()).unwrap_or(l); + + let r_v = r.as_any_dictionary_opt(); + let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r); + + let values = downcast_primitive_array! 
{ + (l, r) => apply(op, l.values().as_ref(), l_s, l_v, r.values().as_ref(), r_s, r_v), + (Boolean, Boolean) => apply(op, l.as_boolean(), l_s, l_v, r.as_boolean(), r_s, r_v), + (Utf8, Utf8) => apply(op, l.as_string::(), l_s, l_v, r.as_string::(), r_s, r_v), + (LargeUtf8, LargeUtf8) => apply(op, l.as_string::(), l_s, l_v, r.as_string::(), r_s, r_v), + (Binary, Binary) => apply(op, l.as_binary::(), l_s, l_v, r.as_binary::(), r_s, r_v), + (LargeBinary, LargeBinary) => apply(op, l.as_binary::(), l_s, l_v, r.as_binary::(), r_s, r_v), + (FixedSizeBinary(_), FixedSizeBinary(_)) => apply(op, l.as_fixed_size_binary(), l_s, l_v, r.as_fixed_size_binary(), r_s, r_v), + (l_t, r_t) => return Err(ArrowError::InvalidArgumentError(format!("Invalid comparison operation: {l_t} {op} {r_t}"))), + }.unwrap_or_else(|| { + let count = nulls.as_ref().map(|x| x.null_count()).unwrap_or_default(); + assert_eq!(count, len); // Sanity check + BooleanBuffer::new_unset(len) + }); + + assert_eq!(values.len(), len); // Sanity check + Ok(BooleanArray::new(values, nulls)) +} + +/// Perform a potentially vectored `op` on the provided `ArrayOrd` +fn apply( + op: Op, + l: T, + l_s: bool, + l_v: Option<&dyn AnyDictionaryArray>, + r: T, + r_s: bool, + r_v: Option<&dyn AnyDictionaryArray>, +) -> Option { + if l.len() == 0 || r.len() == 0 { + return None; // Handle empty dictionaries + } + + if !l_s && !r_s && (l_v.is_some() || r_v.is_some()) { + // Not scalar and at least one side has a dictionary, need to perform vectored comparison + let l_v = l_v + .map(|x| x.normalized_keys()) + .unwrap_or_else(|| (0..l.len()).collect()); + + let r_v = r_v + .map(|x| x.normalized_keys()) + .unwrap_or_else(|| (0..r.len()).collect()); + + assert_eq!(l_v.len(), r_v.len()); // Sanity check + + Some(match op { + Op::Equal => apply_op_vectored(l, &l_v, r, &r_v, false, T::is_eq), + Op::NotEqual => apply_op_vectored(l, &l_v, r, &r_v, true, T::is_eq), + Op::Less => apply_op_vectored(l, &l_v, r, &r_v, false, T::is_lt), + Op::LessEqual => apply_op_vectored(r, &r_v, l, &l_v, true, T::is_lt), + Op::Greater => apply_op_vectored(r, &r_v, l, &l_v, false, T::is_lt), + Op::GreaterEqual => apply_op_vectored(l, &l_v, r, &r_v, true, T::is_lt), + }) + } else { + let l_s = l_s.then(|| l_v.map(|x| x.normalized_keys()[0]).unwrap_or_default()); + let r_s = r_s.then(|| r_v.map(|x| x.normalized_keys()[0]).unwrap_or_default()); + + let buffer = match op { + Op::Equal => apply_op(l, l_s, r, r_s, false, T::is_eq), + Op::NotEqual => apply_op(l, l_s, r, r_s, true, T::is_eq), + Op::Less => apply_op(l, l_s, r, r_s, false, T::is_lt), + Op::LessEqual => apply_op(r, r_s, l, l_s, true, T::is_lt), + Op::Greater => apply_op(r, r_s, l, l_s, false, T::is_lt), + Op::GreaterEqual => apply_op(l, l_s, r, r_s, true, T::is_lt), + }; + + // If a side had a dictionary, and was not scalar, we need to materialize this + Some(match (l_v, r_v) { + (Some(l_v), _) if l_s.is_none() => take_bits(l_v, buffer), + (_, Some(r_v)) if r_s.is_none() => take_bits(r_v, buffer), + _ => buffer, + }) + } +} + +/// Perform a take operation on `buffer` with the given dictionary +fn take_bits(v: &dyn AnyDictionaryArray, buffer: BooleanBuffer) -> BooleanBuffer { + let array = take(&BooleanArray::new(buffer, None), v.keys(), None).unwrap(); + array.as_boolean().values().clone() +} + +/// Invokes `f` with values `0..len` collecting the boolean results into a new `BooleanBuffer` +/// +/// This is similar to [`MutableBuffer::collect_bool`] but with +/// the option to efficiently negate the result +fn collect_bool(len: 
usize, neg: bool, f: impl Fn(usize) -> bool) -> BooleanBuffer { + let mut buffer = MutableBuffer::new(ceil(len, 64) * 8); + + let chunks = len / 64; + let remainder = len % 64; + for chunk in 0..chunks { + let mut packed = 0; + for bit_idx in 0..64 { + let i = bit_idx + chunk * 64; + packed |= (f(i) as u64) << bit_idx; + } + if neg { + packed = !packed + } + + // SAFETY: Already allocated sufficient capacity + unsafe { buffer.push_unchecked(packed) } + } + + if remainder != 0 { + let mut packed = 0; + for bit_idx in 0..remainder { + let i = bit_idx + chunks * 64; + packed |= (f(i) as u64) << bit_idx; + } + if neg { + packed = !packed + } + + // SAFETY: Already allocated sufficient capacity + unsafe { buffer.push_unchecked(packed) } + } + BooleanBuffer::new(buffer.into(), 0, len) +} + +/// Applies `op` to possibly scalar `ArrayOrd` +/// +/// If l is scalar `l_s` will be `Some(idx)` where `idx` is the index of the scalar value in `l` +/// If r is scalar `r_s` will be `Some(idx)` where `idx` is the index of the scalar value in `r` +fn apply_op( + l: T, + l_s: Option, + r: T, + r_s: Option, + neg: bool, + op: impl Fn(T::Item, T::Item) -> bool, +) -> BooleanBuffer { + match (l_s, r_s) { + (None, None) => { + assert_eq!(l.len(), r.len()); + collect_bool(l.len(), neg, |idx| unsafe { + op(l.value_unchecked(idx), r.value_unchecked(idx)) + }) + } + (Some(l_s), Some(r_s)) => { + let a = l.value(l_s); + let b = r.value(r_s); + std::iter::once(op(a, b)).collect() + } + (Some(l_s), None) => { + let v = l.value(l_s); + collect_bool(r.len(), neg, |idx| op(v, unsafe { r.value_unchecked(idx) })) + } + (None, Some(r_s)) => { + let v = r.value(r_s); + collect_bool(l.len(), neg, |idx| op(unsafe { l.value_unchecked(idx) }, v)) + } + } +} + +/// Applies `op` to possibly scalar `ArrayOrd` with the given indices +fn apply_op_vectored( + l: T, + l_v: &[usize], + r: T, + r_v: &[usize], + neg: bool, + op: impl Fn(T::Item, T::Item) -> bool, +) -> BooleanBuffer { + assert_eq!(l_v.len(), r_v.len()); + collect_bool(l_v.len(), neg, |idx| unsafe { + let l_idx = *l_v.get_unchecked(idx); + let r_idx = *r_v.get_unchecked(idx); + op(l.value_unchecked(l_idx), r.value_unchecked(r_idx)) + }) +} + +trait ArrayOrd { + type Item: Copy + Default; + + fn len(&self) -> usize; + + fn value(&self, idx: usize) -> Self::Item { + assert!(idx < self.len()); + unsafe { self.value_unchecked(idx) } + } + + /// # Safety + /// + /// Safe if `idx < self.len()` + unsafe fn value_unchecked(&self, idx: usize) -> Self::Item; + + fn is_eq(l: Self::Item, r: Self::Item) -> bool; + + fn is_lt(l: Self::Item, r: Self::Item) -> bool; +} + +impl<'a> ArrayOrd for &'a BooleanArray { + type Item = bool; + + fn len(&self) -> usize { + Array::len(self) + } + + unsafe fn value_unchecked(&self, idx: usize) -> Self::Item { + BooleanArray::value_unchecked(self, idx) + } + + fn is_eq(l: Self::Item, r: Self::Item) -> bool { + l == r + } + + fn is_lt(l: Self::Item, r: Self::Item) -> bool { + !l & r + } +} + +impl ArrayOrd for &[T] { + type Item = T; + + fn len(&self) -> usize { + (*self).len() + } + + unsafe fn value_unchecked(&self, idx: usize) -> Self::Item { + *self.get_unchecked(idx) + } + + fn is_eq(l: Self::Item, r: Self::Item) -> bool { + l.is_eq(r) + } + + fn is_lt(l: Self::Item, r: Self::Item) -> bool { + l.is_lt(r) + } +} + +impl<'a, T: ByteArrayType> ArrayOrd for &'a GenericByteArray { + type Item = &'a [u8]; + + fn len(&self) -> usize { + Array::len(self) + } + + unsafe fn value_unchecked(&self, idx: usize) -> Self::Item { + 
GenericByteArray::value_unchecked(self, idx).as_ref() + } + + fn is_eq(l: Self::Item, r: Self::Item) -> bool { + l == r + } + + fn is_lt(l: Self::Item, r: Self::Item) -> bool { + l < r + } +} + +impl<'a> ArrayOrd for &'a FixedSizeBinaryArray { + type Item = &'a [u8]; + + fn len(&self) -> usize { + Array::len(self) + } + + unsafe fn value_unchecked(&self, idx: usize) -> Self::Item { + FixedSizeBinaryArray::value_unchecked(self, idx) + } + + fn is_eq(l: Self::Item, r: Self::Item) -> bool { + l == r + } + + fn is_lt(l: Self::Item, r: Self::Item) -> bool { + l < r + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_array::{DictionaryArray, Int32Array, Scalar}; + + use super::*; + + #[test] + fn test_null_dict() { + let a = DictionaryArray::new( + Int32Array::new_null(10), + Arc::new(Int32Array::new_null(0)), + ); + let r = eq(&a, &a).unwrap(); + assert_eq!(r.null_count(), 10); + + let a = DictionaryArray::new( + Int32Array::from(vec![1, 2, 3, 4, 5, 6]), + Arc::new(Int32Array::new_null(10)), + ); + let r = eq(&a, &a).unwrap(); + assert_eq!(r.null_count(), 6); + + let scalar = DictionaryArray::new( + Int32Array::new_null(1), + Arc::new(Int32Array::new_null(0)), + ); + let r = eq(&a, &Scalar::new(&scalar)).unwrap(); + assert_eq!(r.null_count(), 6); + + let scalar = DictionaryArray::new( + Int32Array::new_null(1), + Arc::new(Int32Array::new_null(0)), + ); + let r = eq(&Scalar::new(&scalar), &Scalar::new(&scalar)).unwrap(); + assert_eq!(r.null_count(), 1); + + let a = DictionaryArray::new( + Int32Array::from(vec![0, 1, 2]), + Arc::new(Int32Array::from(vec![3, 2, 1])), + ); + let r = eq(&a, &Scalar::new(&scalar)).unwrap(); + assert_eq!(r.null_count(), 3); + } +} diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 21583fac08ff..1a6e564283d7 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -23,15 +23,229 @@ //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. //! +use half::f16; +use std::sync::Arc; + use arrow_array::cast::*; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::i256; -use arrow_buffer::{bit_util, BooleanBuffer, Buffer, MutableBuffer, NullBuffer}; -use arrow_data::ArrayData; +use arrow_buffer::{bit_util, BooleanBuffer, MutableBuffer, NullBuffer}; use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit}; -use arrow_select::take::take; -use half::f16; + +/// Calls $RIGHT.$TY() (e.g. `right.to_i128()`) with a nice error message. +/// Type of expression is `Result<.., ArrowError>` +macro_rules! 
try_to_type { + ($RIGHT: expr, $TY: ident) => { + try_to_type_result($RIGHT.$TY(), &format!("{:?}", $RIGHT), stringify!($TY)) + }; +} + +// Avoids creating a closure for each combination of `$RIGHT` and `$TY` +fn try_to_type_result( + value: Option, + right: &str, + ty: &str, +) -> Result { + value.ok_or_else(|| { + ArrowError::ComputeError(format!("Could not convert {right} with {ty}",)) + }) +} + +fn make_primitive_scalar( + d: &DataType, + scalar: T, +) -> Result { + match d { + DataType::Int8 => { + let right = try_to_type!(scalar, to_i8)?; + Ok(Arc::new(PrimitiveArray::::from(vec![right]))) + } + DataType::Int16 => { + let right = try_to_type!(scalar, to_i16)?; + Ok(Arc::new(PrimitiveArray::::from(vec![right]))) + } + DataType::Int32 => { + let right = try_to_type!(scalar, to_i32)?; + Ok(Arc::new(PrimitiveArray::::from(vec![right]))) + } + DataType::Int64 => { + let right = try_to_type!(scalar, to_i64)?; + Ok(Arc::new(PrimitiveArray::::from(vec![right]))) + } + DataType::UInt8 => { + let right = try_to_type!(scalar, to_u8)?; + Ok(Arc::new(PrimitiveArray::::from(vec![right]))) + } + DataType::UInt16 => { + let right = try_to_type!(scalar, to_u16)?; + Ok(Arc::new(PrimitiveArray::::from(vec![right]))) + } + DataType::UInt32 => { + let right = try_to_type!(scalar, to_u32)?; + Ok(Arc::new(PrimitiveArray::::from(vec![right]))) + } + DataType::UInt64 => { + let right = try_to_type!(scalar, to_u64)?; + Ok(Arc::new(PrimitiveArray::::from(vec![right]))) + } + DataType::Float16 => { + let right = try_to_type!(scalar, to_f32)?; + Ok(Arc::new(PrimitiveArray::::from(vec![ + f16::from_f32(right), + ]))) + } + DataType::Float32 => { + let right = try_to_type!(scalar, to_f32)?; + Ok(Arc::new(PrimitiveArray::::from(vec![right]))) + } + DataType::Float64 => { + let right = try_to_type!(scalar, to_f64)?; + Ok(Arc::new(PrimitiveArray::::from(vec![right]))) + } + DataType::Decimal128(_, _) => { + let right = try_to_type!(scalar, to_i128)?; + Ok(Arc::new(PrimitiveArray::::from(vec![ + right, + ]))) + } + DataType::Decimal256(_, _) => { + let right = try_to_type!(scalar, to_i128)?; + Ok(Arc::new(PrimitiveArray::::from(vec![ + i256::from_i128(right), + ]))) + } + DataType::Date32 => { + let right = try_to_type!(scalar, to_i32)?; + Ok(Arc::new(PrimitiveArray::::from(vec![right]))) + } + DataType::Date64 => { + let right = try_to_type!(scalar, to_i64)?; + Ok(Arc::new(PrimitiveArray::::from(vec![right]))) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + let right = try_to_type!(scalar, to_i64)?; + Ok(Arc::new(PrimitiveArray::::from( + vec![right], + ))) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + let right = try_to_type!(scalar, to_i64)?; + Ok(Arc::new(PrimitiveArray::::from( + vec![right], + ))) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + let right = try_to_type!(scalar, to_i64)?; + Ok(Arc::new(PrimitiveArray::::from( + vec![right], + ))) + } + DataType::Timestamp(TimeUnit::Second, _) => { + let right = try_to_type!(scalar, to_i64)?; + Ok(Arc::new(PrimitiveArray::::from(vec![ + right, + ]))) + } + DataType::Time32(TimeUnit::Second) => { + let right = try_to_type!(scalar, to_i32)?; + Ok(Arc::new(PrimitiveArray::::from(vec![ + right, + ]))) + } + DataType::Time32(TimeUnit::Millisecond) => { + let right = try_to_type!(scalar, to_i32)?; + Ok(Arc::new(PrimitiveArray::::from( + vec![right], + ))) + } + DataType::Time64(TimeUnit::Microsecond) => { + let right = try_to_type!(scalar, to_i64)?; + Ok(Arc::new(PrimitiveArray::::from( + vec![right], + ))) + } + 
DataType::Time64(TimeUnit::Nanosecond) => { + let right = try_to_type!(scalar, to_i64)?; + Ok(Arc::new(PrimitiveArray::::from( + vec![right], + ))) + } + DataType::Interval(IntervalUnit::YearMonth) => { + let right = try_to_type!(scalar, to_i32)?; + Ok(Arc::new(PrimitiveArray::::from( + vec![right], + ))) + } + DataType::Interval(IntervalUnit::DayTime) => { + let right = try_to_type!(scalar, to_i64)?; + Ok(Arc::new(PrimitiveArray::::from(vec![ + right, + ]))) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let right = try_to_type!(scalar, to_i128)?; + Ok(Arc::new(PrimitiveArray::::from( + vec![right], + ))) + } + DataType::Duration(TimeUnit::Second) => { + let right = try_to_type!(scalar, to_i64)?; + Ok(Arc::new(PrimitiveArray::::from(vec![ + right, + ]))) + } + DataType::Duration(TimeUnit::Millisecond) => { + let right = try_to_type!(scalar, to_i64)?; + Ok(Arc::new(PrimitiveArray::::from( + vec![right], + ))) + } + DataType::Duration(TimeUnit::Microsecond) => { + let right = try_to_type!(scalar, to_i64)?; + Ok(Arc::new(PrimitiveArray::::from( + vec![right], + ))) + } + DataType::Duration(TimeUnit::Nanosecond) => { + let right = try_to_type!(scalar, to_i64)?; + Ok(Arc::new(PrimitiveArray::::from( + vec![right], + ))) + } + DataType::Dictionary(_, v) => make_primitive_scalar(v.as_ref(), scalar), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Unsupported primitive scalar data type {d:?}", + ))), + } +} + +fn make_binary_scalar(d: &DataType, scalar: &[u8]) -> Result { + match d { + DataType::Binary => Ok(Arc::new(BinaryArray::from_iter_values([scalar]))), + DataType::FixedSizeBinary(_) => Ok(Arc::new( + FixedSizeBinaryArray::try_from_iter([scalar].into_iter())?, + )), + DataType::LargeBinary => { + Ok(Arc::new(LargeBinaryArray::from_iter_values([scalar]))) + } + DataType::Dictionary(_, v) => make_binary_scalar(v.as_ref(), scalar), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Unsupported binary scalar data type {d:?}", + ))), + } +} + +fn make_utf8_scalar(d: &DataType, scalar: &str) -> Result { + match d { + DataType::Utf8 => Ok(Arc::new(StringArray::from_iter_values([scalar]))), + DataType::LargeUtf8 => Ok(Arc::new(LargeStringArray::from_iter_values([scalar]))), + DataType::Dictionary(_, v) => make_utf8_scalar(v.as_ref(), scalar), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Unsupported utf8 scalar data type {d:?}", + ))), + } +} /// Helper function to perform boolean lambda function on values from two array accessors, this /// version does not attempt to use SIMD. @@ -67,6 +281,7 @@ where /// Evaluate `op(left, right)` for [`PrimitiveArray`]s using a specified /// comparison function. +#[deprecated(note = "Use BooleanArray::from_binary")] pub fn no_simd_compare_op( left: &PrimitiveArray, right: &PrimitiveArray, @@ -81,6 +296,7 @@ where /// Evaluate `op(left, right)` for [`PrimitiveArray`] and scalar using /// a specified comparison function. +#[deprecated(note = "Use BooleanArray::from_unary")] pub fn no_simd_compare_op_scalar( left: &PrimitiveArray, right: T::Native, @@ -94,617 +310,345 @@ where } /// Perform `left == right` operation on [`StringArray`] / [`LargeStringArray`]. 
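// [Illustrative aside, not part of the patch] The deprecated kernels rewritten
// below all follow the shape established by `make_primitive_scalar`,
// `make_binary_scalar` and `make_utf8_scalar` above: the scalar is wrapped in a
// one-element array of the column's own type (recursing into the value type for
// dictionaries) and then handed to the new Datum kernels, roughly:
//
//     let rhs = make_primitive_scalar(left.data_type(), right)?;
//     crate::cmp::eq(&left, &Scalar::new(&rhs))
//
// so every `*_scalar` and `*_dyn_scalar` entry point becomes a thin shim over
// `arrow_ord::cmp`.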
+#[deprecated(note = "Use arrow_ord::cmp::eq")] pub fn eq_utf8( left: &GenericStringArray, right: &GenericStringArray, ) -> Result { - compare_op(left, right, |a, b| a == b) -} - -fn utf8_empty( - left: &GenericStringArray, -) -> Result { - let null_bit_buffer = left.nulls().map(|b| b.inner().sliced()); - - let buffer = unsafe { - MutableBuffer::from_trusted_len_iter_bool(left.value_offsets().windows(2).map( - |offset| { - if EQ { - offset[1].as_usize() == offset[0].as_usize() - } else { - offset[1].as_usize() > offset[0].as_usize() - } - }, - )) - }; - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![Buffer::from(buffer)], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + crate::cmp::eq(left, right) } /// Perform `left == right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. +#[deprecated(note = "Use arrow_ord::cmp::eq")] pub fn eq_utf8_scalar( left: &GenericStringArray, right: &str, ) -> Result { - if right.is_empty() { - return utf8_empty::<_, true>(left); - } - compare_op_scalar(left, |a| a == right) + let right = GenericStringArray::::from(vec![right]); + crate::cmp::eq(&left, &Scalar::new(&right)) } /// Perform `left == right` operation on [`BooleanArray`] +#[deprecated(note = "Use arrow_ord::cmp::eq")] pub fn eq_bool( left: &BooleanArray, right: &BooleanArray, ) -> Result { - compare_op(left, right, |a, b| !(a ^ b)) + crate::cmp::eq(&left, &right) } /// Perform `left != right` operation on [`BooleanArray`] +#[deprecated(note = "Use arrow_ord::cmp::neq")] pub fn neq_bool( left: &BooleanArray, right: &BooleanArray, ) -> Result { - compare_op(left, right, |a, b| (a ^ b)) + crate::cmp::neq(&left, &right) } /// Perform `left < right` operation on [`BooleanArray`] +#[deprecated(note = "Use arrow_ord::cmp::lt")] pub fn lt_bool( left: &BooleanArray, right: &BooleanArray, ) -> Result { - compare_op(left, right, |a, b| ((!a) & b)) + crate::cmp::lt(&left, &right) } /// Perform `left <= right` operation on [`BooleanArray`] +#[deprecated(note = "Use arrow_ord::cmp::lt_eq")] pub fn lt_eq_bool( left: &BooleanArray, right: &BooleanArray, ) -> Result { - compare_op(left, right, |a, b| !(a & (!b))) + crate::cmp::lt_eq(&left, &right) } /// Perform `left > right` operation on [`BooleanArray`] +#[deprecated(note = "Use arrow_ord::cmp::gt")] pub fn gt_bool( left: &BooleanArray, right: &BooleanArray, ) -> Result { - compare_op(left, right, |a, b| (a & (!b))) + crate::cmp::gt(&left, &right) } /// Perform `left >= right` operation on [`BooleanArray`] +#[deprecated(note = "Use arrow_ord::cmp::gt_eq")] pub fn gt_eq_bool( left: &BooleanArray, right: &BooleanArray, ) -> Result { - compare_op(left, right, |a, b| !((!a) & b)) + crate::cmp::gt_eq(&left, &right) } /// Perform `left == right` operation on [`BooleanArray`] and a scalar +#[deprecated(note = "Use arrow_ord::cmp::eq")] pub fn eq_bool_scalar( left: &BooleanArray, right: bool, ) -> Result { - let values = match right { - true => left.values().clone(), - false => !left.values(), - }; - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - values.len(), - None, - left.nulls().map(|b| b.inner().sliced()), - values.offset(), - vec![values.into_inner()], - vec![], - ) - }; - - Ok(BooleanArray::from(data)) + let right = BooleanArray::from(vec![right]); + crate::cmp::eq(&left, &Scalar::new(&right)) } /// Perform `left < right` operation on [`BooleanArray`] and a scalar +#[deprecated(note = "Use arrow_ord::cmp::lt")] pub fn lt_bool_scalar( left: 
&BooleanArray, right: bool, ) -> Result { - compare_op_scalar(left, |a: bool| !a & right) + let right = BooleanArray::from(vec![right]); + crate::cmp::lt(&left, &Scalar::new(&right)) } /// Perform `left <= right` operation on [`BooleanArray`] and a scalar +#[deprecated(note = "Use arrow_ord::cmp::lt_eq")] pub fn lt_eq_bool_scalar( left: &BooleanArray, right: bool, ) -> Result { - compare_op_scalar(left, |a| a <= right) + let right = BooleanArray::from(vec![right]); + crate::cmp::lt_eq(&left, &Scalar::new(&right)) } /// Perform `left > right` operation on [`BooleanArray`] and a scalar +#[deprecated(note = "Use arrow_ord::cmp::gt")] pub fn gt_bool_scalar( left: &BooleanArray, right: bool, ) -> Result { - compare_op_scalar(left, |a: bool| a & !right) + let right = BooleanArray::from(vec![right]); + crate::cmp::gt(&left, &Scalar::new(&right)) } /// Perform `left >= right` operation on [`BooleanArray`] and a scalar +#[deprecated(note = "Use arrow_ord::cmp::gt_eq")] pub fn gt_eq_bool_scalar( left: &BooleanArray, right: bool, ) -> Result { - compare_op_scalar(left, |a| a >= right) + let right = BooleanArray::from(vec![right]); + crate::cmp::gt_eq(&left, &Scalar::new(&right)) } /// Perform `left != right` operation on [`BooleanArray`] and a scalar +#[deprecated(note = "Use arrow_ord::cmp::neq")] pub fn neq_bool_scalar( left: &BooleanArray, right: bool, ) -> Result { - eq_bool_scalar(left, !right) + let right = BooleanArray::from(vec![right]); + crate::cmp::neq(&left, &Scalar::new(&right)) } /// Perform `left == right` operation on [`BinaryArray`] / [`LargeBinaryArray`]. +#[deprecated(note = "Use arrow_ord::cmp::eq")] pub fn eq_binary( left: &GenericBinaryArray, right: &GenericBinaryArray, ) -> Result { - compare_op(left, right, |a, b| a == b) + crate::cmp::eq(left, right) } /// Perform `left == right` operation on [`BinaryArray`] / [`LargeBinaryArray`] and a scalar +#[deprecated(note = "Use arrow_ord::cmp::eq")] pub fn eq_binary_scalar( left: &GenericBinaryArray, right: &[u8], ) -> Result { - compare_op_scalar(left, |a| a == right) + let right = GenericBinaryArray::::from_iter_values([right]); + crate::cmp::eq(left, &Scalar::new(&right)) } /// Perform `left != right` operation on [`BinaryArray`] / [`LargeBinaryArray`]. +#[deprecated(note = "Use arrow_ord::cmp::neq")] pub fn neq_binary( left: &GenericBinaryArray, right: &GenericBinaryArray, ) -> Result { - compare_op(left, right, |a, b| a != b) + crate::cmp::neq(left, right) } /// Perform `left != right` operation on [`BinaryArray`] / [`LargeBinaryArray`] and a scalar. +#[deprecated(note = "Use arrow_ord::cmp::neq")] pub fn neq_binary_scalar( left: &GenericBinaryArray, right: &[u8], ) -> Result { - compare_op_scalar(left, |a| a != right) + let right = GenericBinaryArray::::from_iter_values([right]); + crate::cmp::neq(left, &Scalar::new(&right)) } /// Perform `left < right` operation on [`BinaryArray`] / [`LargeBinaryArray`]. +#[deprecated(note = "Use arrow_ord::cmp::lt")] pub fn lt_binary( left: &GenericBinaryArray, right: &GenericBinaryArray, ) -> Result { - compare_op(left, right, |a, b| a < b) + crate::cmp::lt(left, right) } /// Perform `left < right` operation on [`BinaryArray`] / [`LargeBinaryArray`] and a scalar. 
+#[deprecated(note = "Use arrow_ord::cmp::lt")] pub fn lt_binary_scalar( left: &GenericBinaryArray, right: &[u8], ) -> Result { - compare_op_scalar(left, |a| a < right) + let right = GenericBinaryArray::::from_iter_values([right]); + crate::cmp::lt(left, &Scalar::new(&right)) } /// Perform `left <= right` operation on [`BinaryArray`] / [`LargeBinaryArray`]. +#[deprecated(note = "Use arrow_ord::cmp::lt_eq")] pub fn lt_eq_binary( left: &GenericBinaryArray, right: &GenericBinaryArray, ) -> Result { - compare_op(left, right, |a, b| a <= b) + crate::cmp::lt_eq(left, right) } /// Perform `left <= right` operation on [`BinaryArray`] / [`LargeBinaryArray`] and a scalar. +#[deprecated(note = "Use arrow_ord::cmp::lt_eq")] pub fn lt_eq_binary_scalar( left: &GenericBinaryArray, right: &[u8], ) -> Result { - compare_op_scalar(left, |a| a <= right) + let right = GenericBinaryArray::::from_iter_values([right]); + crate::cmp::lt_eq(left, &Scalar::new(&right)) } /// Perform `left > right` operation on [`BinaryArray`] / [`LargeBinaryArray`]. +#[deprecated(note = "Use arrow_ord::cmp::gt")] pub fn gt_binary( left: &GenericBinaryArray, right: &GenericBinaryArray, ) -> Result { - compare_op(left, right, |a, b| a > b) + crate::cmp::gt(left, right) } /// Perform `left > right` operation on [`BinaryArray`] / [`LargeBinaryArray`] and a scalar. +#[deprecated(note = "Use arrow_ord::cmp::gt")] pub fn gt_binary_scalar( left: &GenericBinaryArray, right: &[u8], ) -> Result { - compare_op_scalar(left, |a| a > right) + let right = GenericBinaryArray::::from_iter_values([right]); + crate::cmp::gt(left, &Scalar::new(&right)) } /// Perform `left >= right` operation on [`BinaryArray`] / [`LargeBinaryArray`]. +#[deprecated(note = "Use arrow_ord::cmp::gt_eq")] pub fn gt_eq_binary( left: &GenericBinaryArray, right: &GenericBinaryArray, ) -> Result { - compare_op(left, right, |a, b| a >= b) + crate::cmp::gt_eq(left, right) } /// Perform `left >= right` operation on [`BinaryArray`] / [`LargeBinaryArray`] and a scalar. +#[deprecated(note = "Use arrow_ord::cmp::gt_eq")] pub fn gt_eq_binary_scalar( left: &GenericBinaryArray, right: &[u8], ) -> Result { - compare_op_scalar(left, |a| a >= right) + let right = GenericBinaryArray::::from_iter_values([right]); + crate::cmp::gt_eq(left, &Scalar::new(&right)) } /// Perform `left != right` operation on [`StringArray`] / [`LargeStringArray`]. +#[deprecated(note = "Use arrow_ord::cmp::neq")] pub fn neq_utf8( left: &GenericStringArray, right: &GenericStringArray, ) -> Result { - compare_op(left, right, |a, b| a != b) + crate::cmp::neq(left, right) } /// Perform `left != right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. +#[deprecated(note = "Use arrow_ord::cmp::neq")] pub fn neq_utf8_scalar( left: &GenericStringArray, right: &str, ) -> Result { - if right.is_empty() { - return utf8_empty::<_, false>(left); - } - compare_op_scalar(left, |a| a != right) + let right = GenericStringArray::::from_iter_values([right]); + crate::cmp::neq(left, &Scalar::new(&right)) } /// Perform `left < right` operation on [`StringArray`] / [`LargeStringArray`]. +#[deprecated(note = "Use arrow_ord::cmp::lt")] pub fn lt_utf8( left: &GenericStringArray, right: &GenericStringArray, ) -> Result { - compare_op(left, right, |a, b| a < b) + crate::cmp::lt(left, right) } /// Perform `left < right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. 
+#[deprecated(note = "Use arrow_ord::cmp::lt")] pub fn lt_utf8_scalar( left: &GenericStringArray, right: &str, ) -> Result { - compare_op_scalar(left, |a| a < right) + let right = GenericStringArray::::from_iter_values([right]); + crate::cmp::lt(left, &Scalar::new(&right)) } /// Perform `left <= right` operation on [`StringArray`] / [`LargeStringArray`]. +#[deprecated(note = "Use arrow_ord::cmp::lt_eq")] pub fn lt_eq_utf8( left: &GenericStringArray, right: &GenericStringArray, ) -> Result { - compare_op(left, right, |a, b| a <= b) + crate::cmp::lt_eq(left, right) } /// Perform `left <= right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. +#[deprecated(note = "Use arrow_ord::cmp::lt_eq")] pub fn lt_eq_utf8_scalar( left: &GenericStringArray, right: &str, ) -> Result { - compare_op_scalar(left, |a| a <= right) + let right = GenericStringArray::::from_iter_values([right]); + crate::cmp::lt_eq(left, &Scalar::new(&right)) } /// Perform `left > right` operation on [`StringArray`] / [`LargeStringArray`]. +#[deprecated(note = "Use arrow_ord::cmp::gt")] pub fn gt_utf8( left: &GenericStringArray, right: &GenericStringArray, ) -> Result { - compare_op(left, right, |a, b| a > b) + crate::cmp::gt(left, right) } /// Perform `left > right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. +#[deprecated(note = "Use arrow_ord::cmp::gt")] pub fn gt_utf8_scalar( left: &GenericStringArray, right: &str, ) -> Result { - compare_op_scalar(left, |a| a > right) + let right = GenericStringArray::::from_iter_values([right]); + crate::cmp::gt(left, &Scalar::new(&right)) } /// Perform `left >= right` operation on [`StringArray`] / [`LargeStringArray`]. +#[deprecated(note = "Use arrow_ord::cmp::gt_eq")] pub fn gt_eq_utf8( left: &GenericStringArray, right: &GenericStringArray, ) -> Result { - compare_op(left, right, |a, b| a >= b) + crate::cmp::gt_eq(left, right) } /// Perform `left >= right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. +#[deprecated(note = "Use arrow_ord::cmp::gt_eq")] pub fn gt_eq_utf8_scalar( left: &GenericStringArray, right: &str, ) -> Result { - compare_op_scalar(left, |a| a >= right) -} - -// Avoids creating a closure for each combination of `$RIGHT` and `$TY` -fn try_to_type_result( - value: Option, - right: &str, - ty: &str, -) -> Result { - value.ok_or_else(|| { - ArrowError::ComputeError(format!("Could not convert {right} with {ty}",)) - }) -} - -/// Calls $RIGHT.$TY() (e.g. `right.to_i128()`) with a nice error message. -/// Type of expression is `Result<.., ArrowError>` -macro_rules! try_to_type { - ($RIGHT: expr, $TY: ident) => { - try_to_type_result($RIGHT.$TY(), &format!("{:?}", $RIGHT), stringify!($TY)) - }; -} - -macro_rules! 
dyn_compare_scalar { - // Applies `LEFT OP RIGHT` when `LEFT` is a `PrimitiveArray` - ($LEFT: expr, $RIGHT: expr, $OP: ident) => {{ - match $LEFT.data_type() { - DataType::Int8 => { - let right = try_to_type!($RIGHT, to_i8)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Int16 => { - let right = try_to_type!($RIGHT, to_i16)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Int32 => { - let right = try_to_type!($RIGHT, to_i32)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Int64 => { - let right = try_to_type!($RIGHT, to_i64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::UInt8 => { - let right = try_to_type!($RIGHT, to_u8)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::UInt16 => { - let right = try_to_type!($RIGHT, to_u16)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::UInt32 => { - let right = try_to_type!($RIGHT, to_u32)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::UInt64 => { - let right = try_to_type!($RIGHT, to_u64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Float16 => { - let right = try_to_type!($RIGHT, to_f32)?; - let left = as_primitive_array::($LEFT); - $OP::(left, f16::from_f32(right)) - } - DataType::Float32 => { - let right = try_to_type!($RIGHT, to_f32)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Float64 => { - let right = try_to_type!($RIGHT, to_f64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Decimal128(_, _) => { - let right = try_to_type!($RIGHT, to_i128)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Decimal256(_, _) => { - let right = try_to_type!($RIGHT, to_i128)?; - let left = as_primitive_array::($LEFT); - $OP::(left, i256::from_i128(right)) - } - DataType::Date32 => { - let right = try_to_type!($RIGHT, to_i32)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Date64 => { - let right = try_to_type!($RIGHT, to_i64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - let right = try_to_type!($RIGHT, to_i64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - let right = try_to_type!($RIGHT, to_i64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - let right = try_to_type!($RIGHT, to_i64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Timestamp(TimeUnit::Second, _) => { - let right = try_to_type!($RIGHT, to_i64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Time32(TimeUnit::Second) => { - let right = try_to_type!($RIGHT, to_i32)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Time32(TimeUnit::Millisecond) => { - let right = try_to_type!($RIGHT, to_i32)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Time64(TimeUnit::Microsecond) => { - let right = try_to_type!($RIGHT, to_i64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Time64(TimeUnit::Nanosecond) => { - let right = try_to_type!($RIGHT, to_i64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - 
DataType::Interval(IntervalUnit::YearMonth) => { - let right = try_to_type!($RIGHT, to_i32)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Interval(IntervalUnit::DayTime) => { - let right = try_to_type!($RIGHT, to_i64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let right = try_to_type!($RIGHT, to_i128)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Duration(TimeUnit::Second) => { - let right = try_to_type!($RIGHT, to_i64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Duration(TimeUnit::Millisecond) => { - let right = try_to_type!($RIGHT, to_i64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Duration(TimeUnit::Microsecond) => { - let right = try_to_type!($RIGHT, to_i64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - DataType::Duration(TimeUnit::Nanosecond) => { - let right = try_to_type!($RIGHT, to_i64)?; - let left = as_primitive_array::($LEFT); - $OP::(left, right) - } - _ => Err(ArrowError::ComputeError(format!( - "Unsupported data type {:?} for comparison {} with {:?}", - $LEFT.data_type(), - stringify!($OP), - $RIGHT - ))), - } - }}; - // Applies `LEFT OP RIGHT` when `LEFT` is a `DictionaryArray` with keys of type `KT` - ($LEFT: expr, $RIGHT: expr, $KT: ident, $OP: ident) => {{ - match $KT.as_ref() { - DataType::UInt8 => { - let left = as_dictionary_array::($LEFT); - unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) - } - DataType::UInt16 => { - let left = as_dictionary_array::($LEFT); - unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) - } - DataType::UInt32 => { - let left = as_dictionary_array::($LEFT); - unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) - } - DataType::UInt64 => { - let left = as_dictionary_array::($LEFT); - unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) - } - DataType::Int8 => { - let left = as_dictionary_array::($LEFT); - unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) - } - DataType::Int16 => { - let left = as_dictionary_array::($LEFT); - unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) - } - DataType::Int32 => { - let left = as_dictionary_array::($LEFT); - unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) - } - DataType::Int64 => { - let left = as_dictionary_array::($LEFT); - unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) - } - _ => Err(ArrowError::ComputeError(format!( - "Unsupported dictionary key type {:?}", - $KT.as_ref() - ))), - } - }}; -} - -macro_rules! dyn_compare_utf8_scalar { - ($LEFT: expr, $RIGHT: expr, $KT: ident, $OP: ident) => {{ - match $KT.as_ref() { - DataType::UInt8 => { - let left = as_dictionary_array::($LEFT); - let values = as_string_array(left.values()); - unpack_dict_comparison(left, $OP(values, $RIGHT)?) - } - DataType::UInt16 => { - let left = as_dictionary_array::($LEFT); - let values = as_string_array(left.values()); - unpack_dict_comparison(left, $OP(values, $RIGHT)?) - } - DataType::UInt32 => { - let left = as_dictionary_array::($LEFT); - let values = as_string_array(left.values()); - unpack_dict_comparison(left, $OP(values, $RIGHT)?) - } - DataType::UInt64 => { - let left = as_dictionary_array::($LEFT); - let values = as_string_array(left.values()); - unpack_dict_comparison(left, $OP(values, $RIGHT)?) 
- } - DataType::Int8 => { - let left = as_dictionary_array::($LEFT); - let values = as_string_array(left.values()); - unpack_dict_comparison(left, $OP(values, $RIGHT)?) - } - DataType::Int16 => { - let left = as_dictionary_array::($LEFT); - let values = as_string_array(left.values()); - unpack_dict_comparison(left, $OP(values, $RIGHT)?) - } - DataType::Int32 => { - let left = as_dictionary_array::($LEFT); - let values = as_string_array(left.values()); - unpack_dict_comparison(left, $OP(values, $RIGHT)?) - } - DataType::Int64 => { - let left = as_dictionary_array::($LEFT); - let values = as_string_array(left.values()); - unpack_dict_comparison(left, $OP(values, $RIGHT)?) - } - _ => Err(ArrowError::ComputeError(String::from("Unknown key type"))), - } - }}; + let right = GenericStringArray::::from_iter_values([right]); + crate::cmp::gt_eq(left, &Scalar::new(&right)) } /// Perform `left == right` operation on an array and a numeric scalar @@ -716,16 +660,13 @@ macro_rules! dyn_compare_utf8_scalar { /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::eq")] pub fn eq_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, { - match left.data_type() { - DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, eq_dyn_scalar) - } - _ => dyn_compare_scalar!(left, right, eq_scalar), - } + let right = make_primitive_scalar(left.data_type(), right)?; + crate::cmp::eq(&left, &Scalar::new(&right)) } /// Perform `left < right` operation on an array and a numeric scalar @@ -737,16 +678,13 @@ where /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::lt")] pub fn lt_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, { - match left.data_type() { - DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, lt_dyn_scalar) - } - _ => dyn_compare_scalar!(left, right, lt_scalar), - } + let right = make_primitive_scalar(left.data_type(), right)?; + crate::cmp::lt(&left, &Scalar::new(&right)) } /// Perform `left <= right` operation on an array and a numeric scalar @@ -758,16 +696,13 @@ where /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::lt_eq")] pub fn lt_eq_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, { - match left.data_type() { - DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, lt_eq_dyn_scalar) - } - _ => dyn_compare_scalar!(left, right, lt_eq_scalar), - } + let right = make_primitive_scalar(left.data_type(), right)?; + crate::cmp::lt_eq(&left, &Scalar::new(&right)) } /// Perform `left > right` operation on an array and a numeric scalar @@ -779,16 +714,13 @@ where /// Note that totalOrder treats positive and negative zeros are different. 
If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::gt")] pub fn gt_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, { - match left.data_type() { - DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, gt_dyn_scalar) - } - _ => dyn_compare_scalar!(left, right, gt_scalar), - } + let right = make_primitive_scalar(left.data_type(), right)?; + crate::cmp::gt(&left, &Scalar::new(&right)) } /// Perform `left >= right` operation on an array and a numeric scalar @@ -800,16 +732,13 @@ where /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::gt_eq")] pub fn gt_eq_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, { - match left.data_type() { - DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, gt_eq_dyn_scalar) - } - _ => dyn_compare_scalar!(left, right, gt_eq_scalar), - } + let right = make_primitive_scalar(left.data_type(), right)?; + crate::cmp::gt_eq(&left, &Scalar::new(&right)) } /// Perform `left != right` operation on an array and a numeric scalar @@ -821,1325 +750,211 @@ where /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::neq")] pub fn neq_dyn_scalar(left: &dyn Array, right: T) -> Result where T: num::ToPrimitive + std::fmt::Debug, { - match left.data_type() { - DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, neq_dyn_scalar) - } - _ => dyn_compare_scalar!(left, right, neq_scalar), - } + let right = make_primitive_scalar(left.data_type(), right)?; + crate::cmp::neq(&left, &Scalar::new(&right)) } /// Perform `left == right` operation on an array and a numeric scalar /// value. Supports BinaryArray and LargeBinaryArray +#[deprecated(note = "Use arrow_ord::cmp::eq")] pub fn eq_dyn_binary_scalar( left: &dyn Array, right: &[u8], ) -> Result { - match left.data_type() { - DataType::Binary => eq_binary_scalar(left.as_binary::(), right), - DataType::FixedSizeBinary(_) => { - let left = left.as_any().downcast_ref::().unwrap(); - compare_op_scalar(left, |a| a == right) - } - DataType::LargeBinary => eq_binary_scalar(left.as_binary::(), right), - _ => Err(ArrowError::ComputeError( - "eq_dyn_binary_scalar only supports Binary / FixedSizeBinary / LargeBinary arrays".to_string(), - )), - } + let right = make_binary_scalar(left.data_type(), right)?; + crate::cmp::eq(&left, &Scalar::new(&right)) } /// Perform `left != right` operation on an array and a numeric scalar /// value. 
Supports BinaryArray and LargeBinaryArray +#[deprecated(note = "Use arrow_ord::cmp::neq")] pub fn neq_dyn_binary_scalar( left: &dyn Array, right: &[u8], ) -> Result { - match left.data_type() { - DataType::Binary => neq_binary_scalar(left.as_binary::(), right), - DataType::LargeBinary => neq_binary_scalar(left.as_binary::(), right), - DataType::FixedSizeBinary(_) => { - let left = left.as_any().downcast_ref::().unwrap(); - compare_op_scalar(left, |a| a != right) - } - _ => Err(ArrowError::ComputeError( - "neq_dyn_binary_scalar only supports Binary / FixedSizeBinary / LargeBinary arrays" - .to_string(), - )), - } + let right = make_binary_scalar(left.data_type(), right)?; + crate::cmp::neq(&left, &Scalar::new(&right)) } /// Perform `left < right` operation on an array and a numeric scalar /// value. Supports BinaryArray and LargeBinaryArray +#[deprecated(note = "Use arrow_ord::cmp::lt")] pub fn lt_dyn_binary_scalar( left: &dyn Array, right: &[u8], ) -> Result { - match left.data_type() { - DataType::Binary => lt_binary_scalar(left.as_binary::(), right), - DataType::LargeBinary => lt_binary_scalar(left.as_binary::(), right), - _ => Err(ArrowError::ComputeError( - "lt_dyn_binary_scalar only supports Binary or LargeBinary arrays".to_string(), - )), - } + let right = make_binary_scalar(left.data_type(), right)?; + crate::cmp::lt(&left, &Scalar::new(&right)) } /// Perform `left <= right` operation on an array and a numeric scalar /// value. Supports BinaryArray and LargeBinaryArray +#[deprecated(note = "Use arrow_ord::cmp::lt_eq")] pub fn lt_eq_dyn_binary_scalar( left: &dyn Array, right: &[u8], ) -> Result { - match left.data_type() { - DataType::Binary => lt_eq_binary_scalar(left.as_binary::(), right), - DataType::LargeBinary => lt_eq_binary_scalar(left.as_binary::(), right), - _ => Err(ArrowError::ComputeError( - "lt_eq_dyn_binary_scalar only supports Binary or LargeBinary arrays" - .to_string(), - )), - } + let right = make_binary_scalar(left.data_type(), right)?; + crate::cmp::lt_eq(&left, &Scalar::new(&right)) } /// Perform `left > right` operation on an array and a numeric scalar /// value. Supports BinaryArray and LargeBinaryArray +#[deprecated(note = "Use arrow_ord::cmp::gt")] pub fn gt_dyn_binary_scalar( left: &dyn Array, right: &[u8], ) -> Result { - match left.data_type() { - DataType::Binary => gt_binary_scalar(left.as_binary::(), right), - DataType::LargeBinary => gt_binary_scalar(left.as_binary::(), right), - _ => Err(ArrowError::ComputeError( - "gt_dyn_binary_scalar only supports Binary or LargeBinary arrays".to_string(), - )), - } + let right = make_binary_scalar(left.data_type(), right)?; + crate::cmp::gt(&left, &Scalar::new(&right)) } /// Perform `left >= right` operation on an array and a numeric scalar /// value. Supports BinaryArray and LargeBinaryArray +#[deprecated(note = "Use arrow_ord::cmp::gt_eq")] pub fn gt_eq_dyn_binary_scalar( - left: &dyn Array, - right: &[u8], -) -> Result { - match left.data_type() { - DataType::Binary => gt_eq_binary_scalar(left.as_binary::(), right), - DataType::LargeBinary => gt_eq_binary_scalar(left.as_binary::(), right), - _ => Err(ArrowError::ComputeError( - "gt_eq_dyn_binary_scalar only supports Binary or LargeBinary arrays" - .to_string(), - )), - } -} - -/// Perform `left == right` operation on an array and a numeric scalar -/// value. 
Supports StringArrays, and DictionaryArrays that have string values -pub fn eq_dyn_utf8_scalar( - left: &dyn Array, - right: &str, -) -> Result { - let result = match left.data_type() { - DataType::Dictionary(key_type, value_type) => match value_type.as_ref() { - DataType::Utf8 | DataType::LargeUtf8 => { - dyn_compare_utf8_scalar!(left, right, key_type, eq_utf8_scalar) - } - _ => Err(ArrowError::ComputeError( - "eq_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )), - }, - DataType::Utf8 => { - eq_utf8_scalar(left.as_string::(), right) - } - DataType::LargeUtf8 => { - eq_utf8_scalar(left.as_string::(), right) - } - _ => Err(ArrowError::ComputeError( - "eq_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays".to_string(), - )), - }; - result -} - -/// Perform `left < right` operation on an array and a numeric scalar -/// value. Supports StringArrays, and DictionaryArrays that have string values -pub fn lt_dyn_utf8_scalar( - left: &dyn Array, - right: &str, -) -> Result { - let result = match left.data_type() { - DataType::Dictionary(key_type, value_type) => match value_type.as_ref() { - DataType::Utf8 | DataType::LargeUtf8 => { - dyn_compare_utf8_scalar!(left, right, key_type, lt_utf8_scalar) - } - _ => Err(ArrowError::ComputeError( - "lt_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )), - }, - DataType::Utf8 => { - lt_utf8_scalar(left.as_string::(), right) - } - DataType::LargeUtf8 => { - lt_utf8_scalar(left.as_string::(), right) - } - _ => Err(ArrowError::ComputeError( - "lt_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays".to_string(), - )), - }; - result -} - -/// Perform `left >= right` operation on an array and a numeric scalar -/// value. Supports StringArrays, and DictionaryArrays that have string values -pub fn gt_eq_dyn_utf8_scalar( - left: &dyn Array, - right: &str, -) -> Result { - let result = match left.data_type() { - DataType::Dictionary(key_type, value_type) => match value_type.as_ref() { - DataType::Utf8 | DataType::LargeUtf8 => { - dyn_compare_utf8_scalar!(left, right, key_type, gt_eq_utf8_scalar) - } - _ => Err(ArrowError::ComputeError( - "gt_eq_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )), - }, - DataType::Utf8 => { - gt_eq_utf8_scalar(left.as_string::(), right) - } - DataType::LargeUtf8 => { - gt_eq_utf8_scalar(left.as_string::(), right) - } - _ => Err(ArrowError::ComputeError( - "gt_eq_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays".to_string(), - )), - }; - result -} - -/// Perform `left <= right` operation on an array and a numeric scalar -/// value. 
Supports StringArrays, and DictionaryArrays that have string values -pub fn lt_eq_dyn_utf8_scalar( - left: &dyn Array, - right: &str, -) -> Result { - let result = match left.data_type() { - DataType::Dictionary(key_type, value_type) => match value_type.as_ref() { - DataType::Utf8 | DataType::LargeUtf8 => { - dyn_compare_utf8_scalar!(left, right, key_type, lt_eq_utf8_scalar) - } - _ => Err(ArrowError::ComputeError( - "lt_eq_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )), - }, - DataType::Utf8 => { - lt_eq_utf8_scalar(left.as_string::(), right) - } - DataType::LargeUtf8 => { - lt_eq_utf8_scalar(left.as_string::(), right) - } - _ => Err(ArrowError::ComputeError( - "lt_eq_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays".to_string(), - )), - }; - result -} - -/// Perform `left > right` operation on an array and a numeric scalar -/// value. Supports StringArrays, and DictionaryArrays that have string values -pub fn gt_dyn_utf8_scalar( - left: &dyn Array, - right: &str, -) -> Result { - let result = match left.data_type() { - DataType::Dictionary(key_type, value_type) => match value_type.as_ref() { - DataType::Utf8 | DataType::LargeUtf8 => { - dyn_compare_utf8_scalar!(left, right, key_type, gt_utf8_scalar) - } - _ => Err(ArrowError::ComputeError( - "gt_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )), - }, - DataType::Utf8 => { - gt_utf8_scalar(left.as_string::(), right) - } - DataType::LargeUtf8 => { - gt_utf8_scalar(left.as_string::(), right) - } - _ => Err(ArrowError::ComputeError( - "gt_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays".to_string(), - )), - }; - result -} - -/// Perform `left != right` operation on an array and a numeric scalar -/// value. Supports StringArrays, and DictionaryArrays that have string values -pub fn neq_dyn_utf8_scalar( - left: &dyn Array, - right: &str, -) -> Result { - let result = match left.data_type() { - DataType::Dictionary(key_type, value_type) => match value_type.as_ref() { - DataType::Utf8 | DataType::LargeUtf8 => { - dyn_compare_utf8_scalar!(left, right, key_type, neq_utf8_scalar) - } - _ => Err(ArrowError::ComputeError( - "neq_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), - )), - }, - DataType::Utf8 => { - neq_utf8_scalar(left.as_string::(), right) - } - DataType::LargeUtf8 => { - neq_utf8_scalar(left.as_string::(), right) - } - _ => Err(ArrowError::ComputeError( - "neq_dyn_utf8_scalar only supports Utf8 or LargeUtf8 arrays".to_string(), - )), - }; - result -} - -/// Perform `left == right` operation on an array and a numeric scalar -/// value. -pub fn eq_dyn_bool_scalar( - left: &dyn Array, - right: bool, -) -> Result { - let result = match left.data_type() { - DataType::Boolean => eq_bool_scalar(left.as_boolean(), right), - _ => Err(ArrowError::ComputeError( - "eq_dyn_bool_scalar only supports BooleanArray".to_string(), - )), - }; - result -} - -/// Perform `left < right` operation on an array and a numeric scalar -/// value. Supports BooleanArrays. 
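The removed `*_dyn_utf8_scalar` kernels dispatched by hand over Utf8, LargeUtf8 and dictionary-encoded string arrays. With the Datum-based kernels that the deprecation notes point at, the same comparison is written by wrapping the scalar in a one-element array. A minimal sketch, assuming `arrow_ord::cmp` and `arrow_array::Scalar` as used elsewhere in this patch, with a hypothetical `names` column:

use arrow_array::{BooleanArray, Scalar, StringArray};
use arrow_ord::cmp;
use arrow_schema::ArrowError;

// Compare every row of a string column against a single scalar value.
// The scalar side is a one-element array wrapped in `Scalar`, which the
// Datum-based kernel broadcasts across the column.
fn names_equal_to(names: &StringArray, target: &str) -> Result<BooleanArray, ArrowError> {
    let scalar = StringArray::from(vec![target]);
    cmp::eq(names, &Scalar::new(&scalar))
}

The same pattern applies to `neq`, `lt`, `lt_eq`, `gt` and `gt_eq`, matching the `make_utf8_scalar` helper used in the new function bodies.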
-pub fn lt_dyn_bool_scalar( - left: &dyn Array, - right: bool, -) -> Result { - let result = match left.data_type() { - DataType::Boolean => lt_bool_scalar(left.as_boolean(), right), - _ => Err(ArrowError::ComputeError( - "lt_dyn_bool_scalar only supports BooleanArray".to_string(), - )), - }; - result -} - -/// Perform `left > right` operation on an array and a numeric scalar -/// value. Supports BooleanArrays. -pub fn gt_dyn_bool_scalar( - left: &dyn Array, - right: bool, -) -> Result { - let result = match left.data_type() { - DataType::Boolean => gt_bool_scalar(left.as_boolean(), right), - _ => Err(ArrowError::ComputeError( - "gt_dyn_bool_scalar only supports BooleanArray".to_string(), - )), - }; - result -} - -/// Perform `left <= right` operation on an array and a numeric scalar -/// value. Supports BooleanArrays. -pub fn lt_eq_dyn_bool_scalar( - left: &dyn Array, - right: bool, -) -> Result { - let result = match left.data_type() { - DataType::Boolean => lt_eq_bool_scalar(left.as_boolean(), right), - _ => Err(ArrowError::ComputeError( - "lt_eq_dyn_bool_scalar only supports BooleanArray".to_string(), - )), - }; - result -} - -/// Perform `left >= right` operation on an array and a numeric scalar -/// value. Supports BooleanArrays. -pub fn gt_eq_dyn_bool_scalar( - left: &dyn Array, - right: bool, -) -> Result { - let result = match left.data_type() { - DataType::Boolean => gt_eq_bool_scalar(left.as_boolean(), right), - _ => Err(ArrowError::ComputeError( - "gt_eq_dyn_bool_scalar only supports BooleanArray".to_string(), - )), - }; - result -} - -/// Perform `left != right` operation on an array and a numeric scalar -/// value. Supports BooleanArrays. -pub fn neq_dyn_bool_scalar( - left: &dyn Array, - right: bool, -) -> Result { - let result = match left.data_type() { - DataType::Boolean => neq_bool_scalar(left.as_boolean(), right), - _ => Err(ArrowError::ComputeError( - "neq_dyn_bool_scalar only supports BooleanArray".to_string(), - )), - }; - result -} - -/// unpacks the results of comparing left.values (as a boolean) -/// -/// TODO add example -/// -fn unpack_dict_comparison( - dict: &DictionaryArray, - dict_comparison: BooleanArray, -) -> Result -where - K: ArrowDictionaryKeyType, - K::Native: num::ToPrimitive, -{ - let array = take(&dict_comparison, dict.keys(), None)? - .as_boolean() - .clone(); - Ok(array) -} - -/// Helper function to perform boolean lambda function on values from two arrays using -/// SIMD. 
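Before this change, dictionary inputs were handled by evaluating the predicate once per dictionary value and then expanding the result out through the keys, which is what the removed `unpack_dict_comparison` did via `take`. A small sketch of that expansion step, assuming the `take` kernel from `arrow_select` and a hypothetical per-value result:

use arrow_array::cast::AsArray;
use arrow_array::types::Int8Type;
use arrow_array::{BooleanArray, DictionaryArray};
use arrow_schema::ArrowError;
use arrow_select::take::take;

// `values_cmp` holds one boolean per dictionary *value*; gathering it with
// the dictionary keys yields one boolean per logical row, which is what the
// removed helper returned.
fn expand_per_value_result(
    dict: &DictionaryArray<Int8Type>,
    values_cmp: &BooleanArray,
) -> Result<BooleanArray, ArrowError> {
    let expanded = take(values_cmp, dict.keys(), None)?;
    Ok(expanded.as_boolean().clone())
}

The consolidated kernels make this per-call bookkeeping unnecessary.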
-#[cfg(feature = "simd")] -fn simd_compare_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - simd_op: SI, - scalar_op: SC, -) -> Result -where - T: ArrowNumericType, - SI: Fn(T::Simd, T::Simd) -> T::SimdMask, - SC: Fn(T::Native, T::Native) -> bool, -{ - use std::borrow::BorrowMut; - - let len = left.len(); - if len != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let nulls = NullBuffer::union(left.nulls(), right.nulls()); - - // we process the data in chunks so that each iteration results in one u64 of comparison result bits - const CHUNK_SIZE: usize = 64; - let lanes = T::lanes(); - - // this is currently the case for all our datatypes and allows us to always append full bytes - assert!( - lanes <= CHUNK_SIZE, - "Number of vector lanes must be at most 64" - ); - - let buffer_size = bit_util::ceil(len, 8); - let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); - - let mut left_chunks = left.values().chunks_exact(CHUNK_SIZE); - let mut right_chunks = right.values().chunks_exact(CHUNK_SIZE); - - let result_chunks = result.typed_data_mut(); - let result_remainder = left_chunks - .borrow_mut() - .zip(right_chunks.borrow_mut()) - .fold(result_chunks, |result_slice, (left_slice, right_slice)| { - let mut i = 0; - let mut bitmask = 0_u64; - while i < CHUNK_SIZE { - let simd_left = T::load(&left_slice[i..]); - let simd_right = T::load(&right_slice[i..]); - let simd_result = simd_op(simd_left, simd_right); - - let m = T::mask_to_u64(&simd_result); - bitmask |= m << i; - - i += lanes; - } - let bytes = bitmask.to_le_bytes(); - result_slice[0..8].copy_from_slice(&bytes); - - &mut result_slice[8..] - }); - - let left_remainder = left_chunks.remainder(); - let right_remainder = right_chunks.remainder(); - - assert_eq!(left_remainder.len(), right_remainder.len()); - - if !left_remainder.is_empty() { - let remainder_bitmask = left_remainder - .iter() - .zip(right_remainder.iter()) - .enumerate() - .fold(0_u64, |mut mask, (i, (scalar_left, scalar_right))| { - let bit = scalar_op(*scalar_left, *scalar_right) as u64; - mask |= bit << i; - mask - }); - let remainder_mask_as_bytes = - &remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)]; - result_remainder.copy_from_slice(remainder_mask_as_bytes); - } - - let values = BooleanBuffer::new(result.into(), 0, len); - Ok(BooleanArray::new(values, nulls)) -} - -/// Helper function to perform boolean lambda function on values from an array and a scalar value using -/// SIMD. 
-#[cfg(feature = "simd")] -fn simd_compare_op_scalar( - left: &PrimitiveArray, - right: T::Native, - simd_op: SI, - scalar_op: SC, -) -> Result -where - T: ArrowNumericType, - SI: Fn(T::Simd, T::Simd) -> T::SimdMask, - SC: Fn(T::Native, T::Native) -> bool, -{ - use std::borrow::BorrowMut; - - let len = left.len(); - - // we process the data in chunks so that each iteration results in one u64 of comparison result bits - const CHUNK_SIZE: usize = 64; - let lanes = T::lanes(); - - // this is currently the case for all our datatypes and allows us to always append full bytes - assert!( - lanes <= CHUNK_SIZE, - "Number of vector lanes must be at most 64" - ); - - let buffer_size = bit_util::ceil(len, 8); - let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); - - let mut left_chunks = left.values().chunks_exact(CHUNK_SIZE); - let simd_right = T::init(right); - - let result_chunks = result.typed_data_mut(); - let result_remainder = - left_chunks - .borrow_mut() - .fold(result_chunks, |result_slice, left_slice| { - let mut i = 0; - let mut bitmask = 0_u64; - while i < CHUNK_SIZE { - let simd_left = T::load(&left_slice[i..]); - let simd_result = simd_op(simd_left, simd_right); - - let m = T::mask_to_u64(&simd_result); - bitmask |= m << i; - - i += lanes; - } - let bytes = bitmask.to_le_bytes(); - result_slice[0..8].copy_from_slice(&bytes); - - &mut result_slice[8..] - }); - - let left_remainder = left_chunks.remainder(); - - if !left_remainder.is_empty() { - let remainder_bitmask = left_remainder.iter().enumerate().fold( - 0_u64, - |mut mask, (i, scalar_left)| { - let bit = scalar_op(*scalar_left, right) as u64; - mask |= bit << i; - mask - }, - ); - let remainder_mask_as_bytes = - &remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)]; - result_remainder.copy_from_slice(remainder_mask_as_bytes); - } - - let null_bit_buffer = left.nulls().map(|b| b.inner().sliced()); - - // null count is the same as in the input since the right side of the scalar comparison cannot be null - let null_count = left.null_count(); - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - len, - Some(null_count), - null_bit_buffer, - 0, - vec![result.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) -} - -fn cmp_primitive_array( - left: &dyn Array, - right: &dyn Array, - op: F, -) -> Result -where - F: Fn(T::Native, T::Native) -> bool, -{ - let left_array = left.as_primitive::(); - let right_array = right.as_primitive::(); - compare_op(left_array, right_array, op) -} - -#[cfg(feature = "dyn_cmp_dict")] -macro_rules! 
typed_dict_non_dict_cmp { - ($LEFT: expr, $RIGHT: expr, $LEFT_KEY_TYPE: expr, $RIGHT_TYPE: tt, $OP_BOOL: expr, $OP: expr) => {{ - match $LEFT_KEY_TYPE { - DataType::Int8 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_primitive::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::Int16 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_primitive::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::Int32 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_primitive::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::Int64 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_primitive::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::UInt8 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_primitive::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::UInt16 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_primitive::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::UInt32 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_primitive::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::UInt64 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_primitive::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - t => Err(ArrowError::NotYetImplemented(format!( - "Cannot compare dictionary array of key type {}", - t - ))), - } - }}; -} - -#[cfg(feature = "dyn_cmp_dict")] -macro_rules! typed_dict_string_array_cmp { - ($LEFT: expr, $RIGHT: expr, $LEFT_KEY_TYPE: expr, $RIGHT_TYPE: tt, $OP: expr) => {{ - match $LEFT_KEY_TYPE { - DataType::Int8 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_string_array::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::Int16 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_string_array::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::Int32 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_string_array::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::Int64 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_string_array::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::UInt8 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_string_array::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::UInt16 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_string_array::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::UInt32 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_string_array::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - DataType::UInt64 => { - let left = as_dictionary_array::($LEFT); - cmp_dict_string_array::<_, $RIGHT_TYPE, _>(left, $RIGHT, $OP) - } - t => Err(ArrowError::NotYetImplemented(format!( - "Cannot compare dictionary array of key type {}", - t - ))), - } - }}; -} - -#[cfg(feature = "dyn_cmp_dict")] -macro_rules! 
typed_cmp_dict_non_dict { - ($LEFT: expr, $RIGHT: expr, $OP_BOOL: expr, $OP: expr, $OP_FLOAT: expr) => {{ - match ($LEFT.data_type(), $RIGHT.data_type()) { - (DataType::Dictionary(left_key_type, left_value_type), right_type) => { - match (left_value_type.as_ref(), right_type) { - (DataType::Boolean, DataType::Boolean) => { - let left = $LEFT; - downcast_dictionary_array!( - left => { - cmp_dict_boolean_array::<_, _>(left, $RIGHT, $OP) - } - _ => Err(ArrowError::NotYetImplemented(format!( - "Cannot compare dictionary array of key type {}", - left_key_type.as_ref() - ))), - ) - } - (DataType::Int8, DataType::Int8) => { - typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Int8Type, $OP_BOOL, $OP) - } - (DataType::Int16, DataType::Int16) => { - typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Int16Type, $OP_BOOL, $OP) - } - (DataType::Int32, DataType::Int32) => { - typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Int32Type, $OP_BOOL, $OP) - } - (DataType::Int64, DataType::Int64) => { - typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Int64Type, $OP_BOOL, $OP) - } - (DataType::UInt8, DataType::UInt8) => { - typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), UInt8Type, $OP_BOOL, $OP) - } - (DataType::UInt16, DataType::UInt16) => { - typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), UInt16Type, $OP_BOOL, $OP) - } - (DataType::UInt32, DataType::UInt32) => { - typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), UInt32Type, $OP_BOOL, $OP) - } - (DataType::UInt64, DataType::UInt64) => { - typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), UInt64Type, $OP_BOOL, $OP) - } - (DataType::Float16, DataType::Float16) => { - typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Float16Type, $OP_BOOL, $OP_FLOAT) - } - (DataType::Float32, DataType::Float32) => { - typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Float32Type, $OP_BOOL, $OP_FLOAT) - } - (DataType::Float64, DataType::Float64) => { - typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Float64Type, $OP_BOOL, $OP_FLOAT) - } - (DataType::Decimal128(_, s1), DataType::Decimal128(_, s2)) if s1 == s2 => { - typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Decimal128Type, $OP_BOOL, $OP) - } - (DataType::Decimal256(_, s1), DataType::Decimal256(_, s2)) if s1 == s2 => { - typed_dict_non_dict_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), Decimal256Type, $OP_BOOL, $OP) - } - (DataType::Utf8, DataType::Utf8) => { - typed_dict_string_array_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), i32, $OP) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - typed_dict_string_array_cmp!($LEFT, $RIGHT, left_key_type.as_ref(), i64, $OP) - } - (DataType::Binary, DataType::Binary) => { - let left = $LEFT; - downcast_dictionary_array!( - left => { - cmp_dict_binary_array::<_, i32, _>(left, $RIGHT, $OP) - } - _ => Err(ArrowError::NotYetImplemented(format!( - "Cannot compare dictionary array of key type {}", - left_key_type.as_ref() - ))), - ) - } - (DataType::LargeBinary, DataType::LargeBinary) => { - let left = $LEFT; - downcast_dictionary_array!( - left => { - cmp_dict_binary_array::<_, i64, _>(left, $RIGHT, $OP) - } - _ => Err(ArrowError::NotYetImplemented(format!( - "Cannot compare dictionary array of key type {}", - left_key_type.as_ref() - ))), - ) - } - (t1, t2) if t1 == t2 => Err(ArrowError::NotYetImplemented(format!( - "Comparing dictionary array of type {} with array of type {} is not yet implemented", - t1, t2 - 
))), - (t1, t2) => Err(ArrowError::CastError(format!( - "Cannot compare dictionary array with array of different value types ({} and {})", - t1, t2 - ))), - } - } - _ => unreachable!("Should not reach this branch"), - } - }}; -} - -#[cfg(not(feature = "dyn_cmp_dict"))] -macro_rules! typed_cmp_dict_non_dict { - ($LEFT: expr, $RIGHT: expr, $OP_BOOL: expr, $OP: expr, $OP_FLOAT: expr) => {{ - Err(ArrowError::CastError(format!( - "Comparing dictionary array of type {} with array of type {} requires \"dyn_cmp_dict\" feature", - $LEFT.data_type(), $RIGHT.data_type() - ))) - }} + left: &dyn Array, + right: &[u8], +) -> Result { + let right = make_binary_scalar(left.data_type(), right)?; + crate::cmp::gt_eq(&left, &Scalar::new(&right)) } -macro_rules! typed_compares { - ($LEFT: expr, $RIGHT: expr, $OP_BOOL: expr, $OP: expr, $OP_FLOAT: expr) => {{ - match ($LEFT.data_type(), $RIGHT.data_type()) { - (DataType::Boolean, DataType::Boolean) => { - compare_op(as_boolean_array($LEFT), as_boolean_array($RIGHT), $OP_BOOL) - } - (DataType::Int8, DataType::Int8) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP) - } - (DataType::Int16, DataType::Int16) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP) - } - (DataType::Int32, DataType::Int32) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP) - } - (DataType::Int64, DataType::Int64) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP) - } - (DataType::UInt8, DataType::UInt8) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP) - } - (DataType::UInt16, DataType::UInt16) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP) - } - (DataType::UInt32, DataType::UInt32) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP) - } - (DataType::UInt64, DataType::UInt64) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP) - } - (DataType::Float16, DataType::Float16) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP_FLOAT) - } - (DataType::Float32, DataType::Float32) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP_FLOAT) - } - (DataType::Float64, DataType::Float64) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP_FLOAT) - } - (DataType::Decimal128(_, s1), DataType::Decimal128(_, s2)) if s1 == s2 => { - cmp_primitive_array::($LEFT, $RIGHT, $OP) - } - (DataType::Decimal256(_, s1), DataType::Decimal256(_, s2)) if s1 == s2 => { - cmp_primitive_array::($LEFT, $RIGHT, $OP) - } - (DataType::Utf8, DataType::Utf8) => { - compare_op(as_string_array($LEFT), as_string_array($RIGHT), $OP) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => compare_op( - as_largestring_array($LEFT), - as_largestring_array($RIGHT), - $OP, - ), - (DataType::FixedSizeBinary(_), DataType::FixedSizeBinary(_)) => { - let lhs = $LEFT - .as_any() - .downcast_ref::() - .unwrap(); - let rhs = $RIGHT - .as_any() - .downcast_ref::() - .unwrap(); - - compare_op(lhs, rhs, $OP) - } - (DataType::Binary, DataType::Binary) => compare_op( - as_generic_binary_array::($LEFT), - as_generic_binary_array::($RIGHT), - $OP, - ), - (DataType::LargeBinary, DataType::LargeBinary) => compare_op( - as_generic_binary_array::($LEFT), - as_generic_binary_array::($RIGHT), - $OP, - ), - ( - DataType::Timestamp(TimeUnit::Nanosecond, _), - DataType::Timestamp(TimeUnit::Nanosecond, _), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - ( - DataType::Timestamp(TimeUnit::Microsecond, _), - DataType::Timestamp(TimeUnit::Microsecond, _), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - ( - DataType::Timestamp(TimeUnit::Millisecond, _), - DataType::Timestamp(TimeUnit::Millisecond, _), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - ( - 
DataType::Timestamp(TimeUnit::Second, _), - DataType::Timestamp(TimeUnit::Second, _), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - (DataType::Date32, DataType::Date32) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP) - } - (DataType::Date64, DataType::Date64) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP) - } - (DataType::Time32(TimeUnit::Second), DataType::Time32(TimeUnit::Second)) => { - cmp_primitive_array::($LEFT, $RIGHT, $OP) - } - ( - DataType::Time32(TimeUnit::Millisecond), - DataType::Time32(TimeUnit::Millisecond), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - ( - DataType::Time64(TimeUnit::Microsecond), - DataType::Time64(TimeUnit::Microsecond), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - ( - DataType::Time64(TimeUnit::Nanosecond), - DataType::Time64(TimeUnit::Nanosecond), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - ( - DataType::Interval(IntervalUnit::YearMonth), - DataType::Interval(IntervalUnit::YearMonth), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - ( - DataType::Interval(IntervalUnit::DayTime), - DataType::Interval(IntervalUnit::DayTime), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - ( - DataType::Interval(IntervalUnit::MonthDayNano), - DataType::Interval(IntervalUnit::MonthDayNano), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - ( - DataType::Duration(TimeUnit::Second), - DataType::Duration(TimeUnit::Second), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - ( - DataType::Duration(TimeUnit::Millisecond), - DataType::Duration(TimeUnit::Millisecond), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - ( - DataType::Duration(TimeUnit::Microsecond), - DataType::Duration(TimeUnit::Microsecond), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - ( - DataType::Duration(TimeUnit::Nanosecond), - DataType::Duration(TimeUnit::Nanosecond), - ) => cmp_primitive_array::($LEFT, $RIGHT, $OP), - (t1, t2) if t1 == t2 => Err(ArrowError::NotYetImplemented(format!( - "Comparing arrays of type {} is not yet implemented", - t1 - ))), - (t1, t2) => Err(ArrowError::CastError(format!( - "Cannot compare two arrays of different types ({} and {})", - t1, t2 - ))), - } - }}; +/// Perform `left == right` operation on an array and a numeric scalar +/// value. Supports StringArrays, and DictionaryArrays that have string values +#[deprecated(note = "Use arrow_ord::cmp::eq")] +pub fn eq_dyn_utf8_scalar( + left: &dyn Array, + right: &str, +) -> Result { + let right = make_utf8_scalar(left.data_type(), right)?; + crate::cmp::eq(&left, &Scalar::new(&right)) } -/// Applies $OP to $LEFT and $RIGHT which are two dictionaries which have (the same) key type $KT -#[cfg(feature = "dyn_cmp_dict")] -macro_rules! 
typed_dict_cmp { - ($LEFT: expr, $RIGHT: expr, $OP: expr, $OP_FLOAT: expr, $OP_BOOL: expr, $KT: tt) => {{ - match ($LEFT.value_type(), $RIGHT.value_type()) { - (DataType::Boolean, DataType::Boolean) => { - cmp_dict_bool::<$KT, _>($LEFT, $RIGHT, $OP_BOOL) - } - (DataType::Int8, DataType::Int8) => { - cmp_dict::<$KT, Int8Type, _>($LEFT, $RIGHT, $OP) - } - (DataType::Int16, DataType::Int16) => { - cmp_dict::<$KT, Int16Type, _>($LEFT, $RIGHT, $OP) - } - (DataType::Int32, DataType::Int32) => { - cmp_dict::<$KT, Int32Type, _>($LEFT, $RIGHT, $OP) - } - (DataType::Int64, DataType::Int64) => { - cmp_dict::<$KT, Int64Type, _>($LEFT, $RIGHT, $OP) - } - (DataType::UInt8, DataType::UInt8) => { - cmp_dict::<$KT, UInt8Type, _>($LEFT, $RIGHT, $OP) - } - (DataType::UInt16, DataType::UInt16) => { - cmp_dict::<$KT, UInt16Type, _>($LEFT, $RIGHT, $OP) - } - (DataType::UInt32, DataType::UInt32) => { - cmp_dict::<$KT, UInt32Type, _>($LEFT, $RIGHT, $OP) - } - (DataType::UInt64, DataType::UInt64) => { - cmp_dict::<$KT, UInt64Type, _>($LEFT, $RIGHT, $OP) - } - (DataType::Float16, DataType::Float16) => { - cmp_dict::<$KT, Float16Type, _>($LEFT, $RIGHT, $OP_FLOAT) - } - (DataType::Float32, DataType::Float32) => { - cmp_dict::<$KT, Float32Type, _>($LEFT, $RIGHT, $OP_FLOAT) - } - (DataType::Float64, DataType::Float64) => { - cmp_dict::<$KT, Float64Type, _>($LEFT, $RIGHT, $OP_FLOAT) - } - (DataType::Decimal128(_, s1), DataType::Decimal128(_, s2)) if s1 == s2 => { - cmp_dict::<$KT, Decimal128Type, _>($LEFT, $RIGHT, $OP) - } - (DataType::Decimal256(_, s1), DataType::Decimal256(_, s2)) if s1 == s2 => { - cmp_dict::<$KT, Decimal256Type, _>($LEFT, $RIGHT, $OP) - } - (DataType::Utf8, DataType::Utf8) => { - cmp_dict_utf8::<$KT, i32, _>($LEFT, $RIGHT, $OP) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - cmp_dict_utf8::<$KT, i64, _>($LEFT, $RIGHT, $OP) - } - (DataType::Binary, DataType::Binary) => { - cmp_dict_binary::<$KT, i32, _>($LEFT, $RIGHT, $OP) - } - (DataType::LargeBinary, DataType::LargeBinary) => { - cmp_dict_binary::<$KT, i64, _>($LEFT, $RIGHT, $OP) - } - ( - DataType::Timestamp(TimeUnit::Nanosecond, _), - DataType::Timestamp(TimeUnit::Nanosecond, _), - ) => { - cmp_dict::<$KT, TimestampNanosecondType, _>($LEFT, $RIGHT, $OP) - } - ( - DataType::Timestamp(TimeUnit::Microsecond, _), - DataType::Timestamp(TimeUnit::Microsecond, _), - ) => { - cmp_dict::<$KT, TimestampMicrosecondType, _>($LEFT, $RIGHT, $OP) - } - ( - DataType::Timestamp(TimeUnit::Millisecond, _), - DataType::Timestamp(TimeUnit::Millisecond, _), - ) => { - cmp_dict::<$KT, TimestampMillisecondType, _>($LEFT, $RIGHT, $OP) - } - ( - DataType::Timestamp(TimeUnit::Second, _), - DataType::Timestamp(TimeUnit::Second, _), - ) => { - cmp_dict::<$KT, TimestampSecondType, _>($LEFT, $RIGHT, $OP) - } - (DataType::Date32, DataType::Date32) => { - cmp_dict::<$KT, Date32Type, _>($LEFT, $RIGHT, $OP) - } - (DataType::Date64, DataType::Date64) => { - cmp_dict::<$KT, Date64Type, _>($LEFT, $RIGHT, $OP) - } - ( - DataType::Time32(TimeUnit::Second), - DataType::Time32(TimeUnit::Second), - ) => { - cmp_dict::<$KT, Time32SecondType, _>($LEFT, $RIGHT, $OP) - } - ( - DataType::Time32(TimeUnit::Millisecond), - DataType::Time32(TimeUnit::Millisecond), - ) => { - cmp_dict::<$KT, Time32MillisecondType, _>($LEFT, $RIGHT, $OP) - } - ( - DataType::Time64(TimeUnit::Microsecond), - DataType::Time64(TimeUnit::Microsecond), - ) => { - cmp_dict::<$KT, Time64MicrosecondType, _>($LEFT, $RIGHT, $OP) - } - ( - DataType::Time64(TimeUnit::Nanosecond), - DataType::Time64(TimeUnit::Nanosecond), 
- ) => { - cmp_dict::<$KT, Time64NanosecondType, _>($LEFT, $RIGHT, $OP) - } - ( - DataType::Interval(IntervalUnit::YearMonth), - DataType::Interval(IntervalUnit::YearMonth), - ) => { - cmp_dict::<$KT, IntervalYearMonthType, _>($LEFT, $RIGHT, $OP) - } - ( - DataType::Interval(IntervalUnit::DayTime), - DataType::Interval(IntervalUnit::DayTime), - ) => { - cmp_dict::<$KT, IntervalDayTimeType, _>($LEFT, $RIGHT, $OP) - } - ( - DataType::Interval(IntervalUnit::MonthDayNano), - DataType::Interval(IntervalUnit::MonthDayNano), - ) => { - cmp_dict::<$KT, IntervalMonthDayNanoType, _>($LEFT, $RIGHT, $OP) - } - (t1, t2) if t1 == t2 => Err(ArrowError::NotYetImplemented(format!( - "Comparing dictionary arrays of value type {} is not yet implemented", - t1 - ))), - (t1, t2) => Err(ArrowError::CastError(format!( - "Cannot compare two dictionary arrays of different value types ({} and {})", - t1, t2 - ))), - } - }}; +/// Perform `left < right` operation on an array and a numeric scalar +/// value. Supports StringArrays, and DictionaryArrays that have string values +#[deprecated(note = "Use arrow_ord::cmp::lt")] +pub fn lt_dyn_utf8_scalar( + left: &dyn Array, + right: &str, +) -> Result { + let right = make_utf8_scalar(left.data_type(), right)?; + crate::cmp::lt(&left, &Scalar::new(&right)) } -#[cfg(feature = "dyn_cmp_dict")] -macro_rules! typed_dict_compares { - // Applies `LEFT OP RIGHT` when `LEFT` and `RIGHT` both are `DictionaryArray` - ($LEFT: expr, $RIGHT: expr, $OP: expr, $OP_FLOAT: expr, $OP_BOOL: expr) => {{ - match ($LEFT.data_type(), $RIGHT.data_type()) { - (DataType::Dictionary(left_key_type, _), DataType::Dictionary(right_key_type, _))=> { - match (left_key_type.as_ref(), right_key_type.as_ref()) { - (DataType::Int8, DataType::Int8) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_cmp!(left, right, $OP, $OP_FLOAT, $OP_BOOL, Int8Type) - } - (DataType::Int16, DataType::Int16) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_cmp!(left, right, $OP, $OP_FLOAT, $OP_BOOL, Int16Type) - } - (DataType::Int32, DataType::Int32) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_cmp!(left, right, $OP, $OP_FLOAT, $OP_BOOL, Int32Type) - } - (DataType::Int64, DataType::Int64) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_cmp!(left, right, $OP, $OP_FLOAT, $OP_BOOL, Int64Type) - } - (DataType::UInt8, DataType::UInt8) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_cmp!(left, right, $OP, $OP_FLOAT, $OP_BOOL, UInt8Type) - } - (DataType::UInt16, DataType::UInt16) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_cmp!(left, right, $OP, $OP_FLOAT, $OP_BOOL, UInt16Type) - } - (DataType::UInt32, DataType::UInt32) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_cmp!(left, right, $OP, $OP_FLOAT, $OP_BOOL, UInt32Type) - } - (DataType::UInt64, DataType::UInt64) => { - let left = as_dictionary_array::($LEFT); - let right = as_dictionary_array::($RIGHT); - typed_dict_cmp!(left, right, $OP, $OP_FLOAT, $OP_BOOL, UInt64Type) - } - (t1, t2) if t1 == t2 => Err(ArrowError::NotYetImplemented(format!( - "Comparing dictionary arrays of type {} is not yet implemented", - t1 - ))), - (t1, t2) => Err(ArrowError::CastError(format!( - "Cannot 
compare two dictionary arrays of different key types ({} and {})", - t1, t2 - ))), - } - } - (t1, t2) => Err(ArrowError::CastError(format!( - "Cannot compare dictionary array with non-dictionary array ({} and {})", - t1, t2 - ))), - } - }}; +/// Perform `left >= right` operation on an array and a numeric scalar +/// value. Supports StringArrays, and DictionaryArrays that have string values +#[deprecated(note = "Use arrow_ord::cmp::gt_eq")] +pub fn gt_eq_dyn_utf8_scalar( + left: &dyn Array, + right: &str, +) -> Result { + let right = make_utf8_scalar(left.data_type(), right)?; + crate::cmp::gt_eq(&left, &Scalar::new(&right)) } -#[cfg(not(feature = "dyn_cmp_dict"))] -macro_rules! typed_dict_compares { - ($LEFT: expr, $RIGHT: expr, $OP: expr, $OP_FLOAT: expr, $OP_BOOL: expr) => {{ - Err(ArrowError::CastError(format!( - "Comparing array of type {} with array of type {} requires \"dyn_cmp_dict\" feature", - $LEFT.data_type(), $RIGHT.data_type() - ))) - }} +/// Perform `left <= right` operation on an array and a numeric scalar +/// value. Supports StringArrays, and DictionaryArrays that have string values +#[deprecated(note = "Use arrow_ord::cmp::lt_eq")] +pub fn lt_eq_dyn_utf8_scalar( + left: &dyn Array, + right: &str, +) -> Result { + let right = make_utf8_scalar(left.data_type(), right)?; + crate::cmp::lt_eq(&left, &Scalar::new(&right)) } -/// Perform given operation on `DictionaryArray` and `PrimitiveArray`. The value -/// type of `DictionaryArray` is same as `PrimitiveArray`'s type. -#[cfg(feature = "dyn_cmp_dict")] -fn cmp_dict_primitive( - left: &DictionaryArray, - right: &dyn Array, - op: F, -) -> Result -where - K: ArrowDictionaryKeyType, - T: ArrowPrimitiveType + Sync + Send, - F: Fn(T::Native, T::Native) -> bool, -{ - compare_op( - left.downcast_dict::>().unwrap(), - right.as_primitive::(), - op, - ) +/// Perform `left > right` operation on an array and a numeric scalar +/// value. Supports StringArrays, and DictionaryArrays that have string values +#[deprecated(note = "Use arrow_ord::cmp::gt")] +pub fn gt_dyn_utf8_scalar( + left: &dyn Array, + right: &str, +) -> Result { + let right = make_utf8_scalar(left.data_type(), right)?; + crate::cmp::gt(&left, &Scalar::new(&right)) } -/// Perform given operation on `DictionaryArray` and `GenericStringArray`. The value -/// type of `DictionaryArray` is same as `GenericStringArray`'s type. -#[cfg(feature = "dyn_cmp_dict")] -fn cmp_dict_string_array( - left: &DictionaryArray, - right: &dyn Array, - op: F, -) -> Result -where - K: ArrowDictionaryKeyType, - F: Fn(&str, &str) -> bool, -{ - compare_op( - left.downcast_dict::>() - .unwrap(), - right - .as_any() - .downcast_ref::>() - .unwrap(), - op, - ) +/// Perform `left != right` operation on an array and a numeric scalar +/// value. Supports StringArrays, and DictionaryArrays that have string values +#[deprecated(note = "Use arrow_ord::cmp::neq")] +pub fn neq_dyn_utf8_scalar( + left: &dyn Array, + right: &str, +) -> Result { + let right = make_utf8_scalar(left.data_type(), right)?; + crate::cmp::neq(&left, &Scalar::new(&right)) } -/// Perform given operation on `DictionaryArray` and `BooleanArray`. The value -/// type of `DictionaryArray` is same as `BooleanArray`'s type. 
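All of the `cmp_dict*` helpers and the `dyn_cmp_dict` macro plumbing disappear because the consolidated kernels are expected to accept dictionary-encoded inputs directly, including mixed dictionary/plain comparisons. A sketch of what that looks like for a caller, assuming `arrow_ord::cmp` handles a Dictionary(Int32, Utf8) versus Utf8 pairing, with hypothetical data:

use arrow_array::types::Int32Type;
use arrow_array::{BooleanArray, DictionaryArray, StringArray};
use arrow_ord::cmp;
use arrow_schema::ArrowError;

fn dict_vs_plain() -> Result<BooleanArray, ArrowError> {
    // Dictionary-encoded strings on the left, plain strings on the right.
    let left: DictionaryArray<Int32Type> = vec!["a", "b", "a", "c"].into_iter().collect();
    let right = StringArray::from(vec!["a", "a", "a", "c"]);
    cmp::eq(&left, &right)
}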
-#[cfg(feature = "dyn_cmp_dict")] -fn cmp_dict_boolean_array( - left: &DictionaryArray, - right: &dyn Array, - op: F, -) -> Result -where - K: ArrowDictionaryKeyType, - F: Fn(bool, bool) -> bool, -{ - compare_op( - left.downcast_dict::().unwrap(), - right.as_any().downcast_ref::().unwrap(), - op, - ) +/// Perform `left == right` operation on an array and a numeric scalar +/// value. +#[deprecated(note = "Use arrow_ord::cmp::eq")] +pub fn eq_dyn_bool_scalar( + left: &dyn Array, + right: bool, +) -> Result { + let right = BooleanArray::from(vec![right]); + crate::cmp::eq(&left, &Scalar::new(&right)) } -/// Perform given operation on `DictionaryArray` and `GenericBinaryArray`. The value -/// type of `DictionaryArray` is same as `GenericBinaryArray`'s type. -#[cfg(feature = "dyn_cmp_dict")] -fn cmp_dict_binary_array( - left: &DictionaryArray, - right: &dyn Array, - op: F, -) -> Result -where - K: ArrowDictionaryKeyType, - F: Fn(&[u8], &[u8]) -> bool, -{ - compare_op( - left.downcast_dict::>() - .unwrap(), - right - .as_any() - .downcast_ref::>() - .unwrap(), - op, - ) +/// Perform `left < right` operation on an array and a numeric scalar +/// value. Supports BooleanArrays. +#[deprecated(note = "Use arrow_ord::cmp::lt")] +pub fn lt_dyn_bool_scalar( + left: &dyn Array, + right: bool, +) -> Result { + let right = BooleanArray::from(vec![right]); + crate::cmp::lt(&left, &Scalar::new(&right)) } -/// Perform given operation on two `DictionaryArray`s which value type is -/// primitive type. Returns an error if the two arrays have different value -/// type -#[cfg(feature = "dyn_cmp_dict")] -pub fn cmp_dict( - left: &DictionaryArray, - right: &DictionaryArray, - op: F, -) -> Result -where - K: ArrowDictionaryKeyType, - T: ArrowPrimitiveType + Sync + Send, - F: Fn(T::Native, T::Native) -> bool, -{ - compare_op( - left.downcast_dict::>().unwrap(), - right.downcast_dict::>().unwrap(), - op, - ) +/// Perform `left > right` operation on an array and a numeric scalar +/// value. Supports BooleanArrays. +#[deprecated(note = "Use arrow_ord::cmp::gt")] +pub fn gt_dyn_bool_scalar( + left: &dyn Array, + right: bool, +) -> Result { + let right = BooleanArray::from(vec![right]); + crate::cmp::gt(&left, &Scalar::new(&right)) } -/// Perform the given operation on two `DictionaryArray`s which value type is -/// `DataType::Boolean`. -#[cfg(feature = "dyn_cmp_dict")] -pub fn cmp_dict_bool( - left: &DictionaryArray, - right: &DictionaryArray, - op: F, -) -> Result -where - K: ArrowDictionaryKeyType, - F: Fn(bool, bool) -> bool, -{ - compare_op( - left.downcast_dict::().unwrap(), - right.downcast_dict::().unwrap(), - op, - ) +/// Perform `left <= right` operation on an array and a numeric scalar +/// value. Supports BooleanArrays. +#[deprecated(note = "Use arrow_ord::cmp::lt_eq")] +pub fn lt_eq_dyn_bool_scalar( + left: &dyn Array, + right: bool, +) -> Result { + let right = BooleanArray::from(vec![right]); + crate::cmp::lt_eq(&left, &Scalar::new(&right)) } -/// Perform the given operation on two `DictionaryArray`s which value type is -/// `DataType::Utf8` or `DataType::LargeUtf8`. -#[cfg(feature = "dyn_cmp_dict")] -pub fn cmp_dict_utf8( - left: &DictionaryArray, - right: &DictionaryArray, - op: F, -) -> Result -where - K: ArrowDictionaryKeyType, - F: Fn(&str, &str) -> bool, -{ - compare_op( - left.downcast_dict::>() - .unwrap(), - right - .downcast_dict::>() - .unwrap(), - op, - ) +/// Perform `left >= right` operation on an array and a numeric scalar +/// value. Supports BooleanArrays. 
+#[deprecated(note = "Use arrow_ord::cmp::gt_eq")] +pub fn gt_eq_dyn_bool_scalar( + left: &dyn Array, + right: bool, +) -> Result { + let right = BooleanArray::from(vec![right]); + crate::cmp::gt_eq(&left, &Scalar::new(&right)) } -/// Perform the given operation on two `DictionaryArray`s which value type is -/// `DataType::Binary` or `DataType::LargeBinary`. -#[cfg(feature = "dyn_cmp_dict")] -pub fn cmp_dict_binary( - left: &DictionaryArray, - right: &DictionaryArray, - op: F, -) -> Result -where - K: ArrowDictionaryKeyType, - F: Fn(&[u8], &[u8]) -> bool, -{ - compare_op( - left.downcast_dict::>() - .unwrap(), - right - .downcast_dict::>() - .unwrap(), - op, - ) +/// Perform `left != right` operation on an array and a numeric scalar +/// value. Supports BooleanArrays. +#[deprecated(note = "Use arrow_ord::cmp::neq")] +pub fn neq_dyn_bool_scalar( + left: &dyn Array, + right: bool, +) -> Result { + let right = BooleanArray::from(vec![right]); + crate::cmp::neq(&left, &Scalar::new(&right)) } /// Perform `left == right` operation on two (dynamic) [`Array`]s. @@ -2162,29 +977,9 @@ where /// let result = eq_dyn(&array1, &array2).unwrap(); /// assert_eq!(BooleanArray::from(vec![Some(true), None, Some(false)]), result); /// ``` +#[deprecated(note = "Use arrow_ord::cmp::eq")] pub fn eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match left.data_type() { - DataType::Dictionary(_, _) - if matches!(right.data_type(), DataType::Dictionary(_, _)) => - { - typed_dict_compares!(left, right, |a, b| a == b, |a, b| a.is_eq(b), |a, b| a - == b) - } - DataType::Dictionary(_, _) - if !matches!(right.data_type(), DataType::Dictionary(_, _)) => - { - typed_cmp_dict_non_dict!(left, right, |a, b| a == b, |a, b| a == b, |a, b| a - .is_eq(b)) - } - _ if matches!(right.data_type(), DataType::Dictionary(_, _)) => { - typed_cmp_dict_non_dict!(right, left, |a, b| a == b, |a, b| a == b, |a, b| b - .is_eq(a)) - } - _ => { - typed_compares!(left, right, |a, b| !(a ^ b), |a, b| a == b, |a, b| a - .is_eq(b)) - } - } + crate::cmp::eq(&left, &right) } /// Perform `left != right` operation on two (dynamic) [`Array`]s. @@ -2209,29 +1004,9 @@ pub fn eq_dyn(left: &dyn Array, right: &dyn Array) -> Result Result { - match left.data_type() { - DataType::Dictionary(_, _) - if matches!(right.data_type(), DataType::Dictionary(_, _)) => - { - typed_dict_compares!(left, right, |a, b| a != b, |a, b| a.is_ne(b), |a, b| a - != b) - } - DataType::Dictionary(_, _) - if !matches!(right.data_type(), DataType::Dictionary(_, _)) => - { - typed_cmp_dict_non_dict!(left, right, |a, b| a != b, |a, b| a != b, |a, b| a - .is_ne(b)) - } - _ if matches!(right.data_type(), DataType::Dictionary(_, _)) => { - typed_cmp_dict_non_dict!(right, left, |a, b| a != b, |a, b| a != b, |a, b| b - .is_ne(a)) - } - _ => { - typed_compares!(left, right, |a, b| (a ^ b), |a, b| a != b, |a, b| a - .is_ne(b)) - } - } + crate::cmp::neq(&left, &right) } /// Perform `left < right` operation on two (dynamic) [`Array`]s. 
@@ -2255,30 +1030,9 @@ pub fn neq_dyn(left: &dyn Array, right: &dyn Array) -> Result Result { - match left.data_type() { - DataType::Dictionary(_, _) - if matches!(right.data_type(), DataType::Dictionary(_, _)) => - { - typed_dict_compares!(left, right, |a, b| a < b, |a, b| a.is_lt(b), |a, b| a - < b) - } - DataType::Dictionary(_, _) - if !matches!(right.data_type(), DataType::Dictionary(_, _)) => - { - typed_cmp_dict_non_dict!(left, right, |a, b| a < b, |a, b| a < b, |a, b| a - .is_lt(b)) - } - _ if matches!(right.data_type(), DataType::Dictionary(_, _)) => { - typed_cmp_dict_non_dict!(right, left, |a, b| a > b, |a, b| a > b, |a, b| b - .is_lt(a)) - } - _ => { - typed_compares!(left, right, |a, b| ((!a) & b), |a, b| a < b, |a, b| a - .is_lt(b)) - } - } + crate::cmp::lt(&left, &right) } /// Perform `left <= right` operation on two (dynamic) [`Array`]s. @@ -2302,32 +1056,12 @@ pub fn lt_dyn(left: &dyn Array, right: &dyn Array) -> Result Result { - match left.data_type() { - DataType::Dictionary(_, _) - if matches!(right.data_type(), DataType::Dictionary(_, _)) => - { - typed_dict_compares!(left, right, |a, b| a <= b, |a, b| a.is_le(b), |a, b| a - <= b) - } - DataType::Dictionary(_, _) - if !matches!(right.data_type(), DataType::Dictionary(_, _)) => - { - typed_cmp_dict_non_dict!(left, right, |a, b| a <= b, |a, b| a <= b, |a, b| a - .is_le(b)) - } - _ if matches!(right.data_type(), DataType::Dictionary(_, _)) => { - typed_cmp_dict_non_dict!(right, left, |a, b| a >= b, |a, b| a >= b, |a, b| b - .is_le(a)) - } - _ => { - typed_compares!(left, right, |a, b| !(a & (!b)), |a, b| a <= b, |a, b| a - .is_le(b)) - } - } + crate::cmp::lt_eq(&left, &right) } /// Perform `left > right` operation on two (dynamic) [`Array`]s. @@ -2350,30 +1084,9 @@ pub fn lt_eq_dyn( /// let result = gt_dyn(&array1, &array2).unwrap(); /// assert_eq!(BooleanArray::from(vec![Some(true), Some(false), None]), result); /// ``` -#[allow(clippy::bool_comparison)] +#[deprecated(note = "Use arrow_ord::cmp::gt")] pub fn gt_dyn(left: &dyn Array, right: &dyn Array) -> Result { - match left.data_type() { - DataType::Dictionary(_, _) - if matches!(right.data_type(), DataType::Dictionary(_, _)) => - { - typed_dict_compares!(left, right, |a, b| a > b, |a, b| a.is_gt(b), |a, b| a - > b) - } - DataType::Dictionary(_, _) - if !matches!(right.data_type(), DataType::Dictionary(_, _)) => - { - typed_cmp_dict_non_dict!(left, right, |a, b| a > b, |a, b| a > b, |a, b| a - .is_gt(b)) - } - _ if matches!(right.data_type(), DataType::Dictionary(_, _)) => { - typed_cmp_dict_non_dict!(right, left, |a, b| a < b, |a, b| a < b, |a, b| b - .is_gt(a)) - } - _ => { - typed_compares!(left, right, |a, b| (a & (!b)), |a, b| a > b, |a, b| a - .is_gt(b)) - } - } + crate::cmp::gt(&left, &right) } /// Perform `left >= right` operation on two (dynamic) [`Array`]s. 
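The ordering kernels follow the same pattern; the deleted code also had to flip the operator when only the right-hand side was dictionary-encoded, which the forwarding versions no longer need. For plain arrays the call is direct (hypothetical values):

use arrow_array::{BooleanArray, StringArray};
use arrow_ord::cmp;

fn string_ordering() {
    let a = StringArray::from(vec![Some("apple"), Some("pear"), None]);
    let b = StringArray::from(vec![Some("banana"), Some("pear"), Some("fig")]);

    // Byte-wise lexicographic ordering; nulls propagate to the result.
    let ge = cmp::gt_eq(&a, &b).unwrap();
    assert_eq!(ge, BooleanArray::from(vec![Some(false), Some(true), None]));
}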
@@ -2396,32 +1109,12 @@ pub fn gt_dyn(left: &dyn Array, right: &dyn Array) -> Result Result { - match left.data_type() { - DataType::Dictionary(_, _) - if matches!(right.data_type(), DataType::Dictionary(_, _)) => - { - typed_dict_compares!(left, right, |a, b| a >= b, |a, b| a.is_ge(b), |a, b| a - >= b) - } - DataType::Dictionary(_, _) - if !matches!(right.data_type(), DataType::Dictionary(_, _)) => - { - typed_cmp_dict_non_dict!(left, right, |a, b| a >= b, |a, b| a >= b, |a, b| a - .is_ge(b)) - } - _ if matches!(right.data_type(), DataType::Dictionary(_, _)) => { - typed_cmp_dict_non_dict!(right, left, |a, b| a <= b, |a, b| a <= b, |a, b| b - .is_ge(a)) - } - _ => { - typed_compares!(left, right, |a, b| !((!a) & b), |a, b| a >= b, |a, b| a - .is_ge(b)) - } - } + crate::cmp::gt_eq(&left, &right) } /// Perform `left == right` operation on two [`PrimitiveArray`]s. @@ -2432,6 +1125,7 @@ pub fn gt_eq_dyn( /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::eq")] pub fn eq( left: &PrimitiveArray, right: &PrimitiveArray, @@ -2440,20 +1134,17 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - #[cfg(feature = "simd")] - return simd_compare_op(left, right, T::eq, |a, b| a == b); - #[cfg(not(feature = "simd"))] - return compare_op(left, right, |a, b| a.is_eq(b)); + crate::cmp::eq(&left, &right) } /// Perform `left == right` operation on a [`PrimitiveArray`] and a scalar value. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::eq")] pub fn eq_scalar( left: &PrimitiveArray, right: T::Native, @@ -2462,10 +1153,8 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - #[cfg(feature = "simd")] - return simd_compare_op_scalar(left, right, T::eq, |a, b| a == b); - #[cfg(not(feature = "simd"))] - return compare_op_scalar(left, |a| a.is_eq(right)); + let right = PrimitiveArray::::new(vec![right].into(), None); + crate::cmp::eq(&left, &Scalar::new(&right)) } /// Applies an unary and infallible comparison function to a primitive array. @@ -2488,6 +1177,7 @@ where /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::neq")] pub fn neq( left: &PrimitiveArray, right: &PrimitiveArray, @@ -2496,10 +1186,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - #[cfg(feature = "simd")] - return simd_compare_op(left, right, T::ne, |a, b| a != b); - #[cfg(not(feature = "simd"))] - return compare_op(left, right, |a, b| a.is_ne(b)); + crate::cmp::neq(&left, &right) } /// Perform `left != right` operation on a [`PrimitiveArray`] and a scalar value. @@ -2510,6 +1197,7 @@ where /// Note that totalOrder treats positive and negative zeros are different. 
If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::neq")] pub fn neq_scalar( left: &PrimitiveArray, right: T::Native, @@ -2518,10 +1206,8 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - #[cfg(feature = "simd")] - return simd_compare_op_scalar(left, right, T::ne, |a, b| a != b); - #[cfg(not(feature = "simd"))] - return compare_op_scalar(left, |a| a.is_ne(right)); + let right = PrimitiveArray::::new(vec![right].into(), None); + crate::cmp::neq(&left, &Scalar::new(&right)) } /// Perform `left < right` operation on two [`PrimitiveArray`]s. Null values are less than non-null @@ -2533,6 +1219,7 @@ where /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::lt")] pub fn lt( left: &PrimitiveArray, right: &PrimitiveArray, @@ -2541,10 +1228,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - #[cfg(feature = "simd")] - return simd_compare_op(left, right, T::lt, |a, b| a < b); - #[cfg(not(feature = "simd"))] - return compare_op(left, right, |a, b| a.is_lt(b)); + crate::cmp::lt(&left, &right) } /// Perform `left < right` operation on a [`PrimitiveArray`] and a scalar value. @@ -2556,6 +1240,7 @@ where /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::lt")] pub fn lt_scalar( left: &PrimitiveArray, right: T::Native, @@ -2564,10 +1249,8 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - #[cfg(feature = "simd")] - return simd_compare_op_scalar(left, right, T::lt, |a, b| a < b); - #[cfg(not(feature = "simd"))] - return compare_op_scalar(left, |a| a.is_lt(right)); + let right = PrimitiveArray::::new(vec![right].into(), None); + crate::cmp::lt(&left, &Scalar::new(&right)) } /// Perform `left <= right` operation on two [`PrimitiveArray`]s. Null values are less than non-null @@ -2579,6 +1262,7 @@ where /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::lt_eq")] pub fn lt_eq( left: &PrimitiveArray, right: &PrimitiveArray, @@ -2587,10 +1271,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - #[cfg(feature = "simd")] - return simd_compare_op(left, right, T::le, |a, b| a <= b); - #[cfg(not(feature = "simd"))] - return compare_op(left, right, |a, b| a.is_le(b)); + crate::cmp::lt_eq(&left, &right) } /// Perform `left <= right` operation on a [`PrimitiveArray`] and a scalar value. @@ -2602,6 +1283,7 @@ where /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. 
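The deprecated scalar variants now just wrap the value in a one-element `PrimitiveArray` and a `Scalar`, so callers can do the same and skip the extra function. A sketch with a hypothetical Int32 column:

use arrow_array::{BooleanArray, Int32Array, Scalar};
use arrow_ord::cmp;

fn less_than_five() {
    let values = Int32Array::from(vec![1, 5, 7]);

    // Before: let mask = lt_scalar(&values, 5).unwrap();
    let five = Int32Array::from(vec![5]);
    let mask = cmp::lt(&values, &Scalar::new(&five)).unwrap();
    assert_eq!(mask, BooleanArray::from(vec![true, false, false]));
}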
+#[deprecated(note = "Use arrow_ord::cmp::lt_eq")] pub fn lt_eq_scalar( left: &PrimitiveArray, right: T::Native, @@ -2610,10 +1292,8 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - #[cfg(feature = "simd")] - return simd_compare_op_scalar(left, right, T::le, |a, b| a <= b); - #[cfg(not(feature = "simd"))] - return compare_op_scalar(left, |a| a.is_le(right)); + let right = PrimitiveArray::::new(vec![right].into(), None); + crate::cmp::lt_eq(&left, &Scalar::new(&right)) } /// Perform `left > right` operation on two [`PrimitiveArray`]s. Non-null values are greater than null @@ -2625,6 +1305,7 @@ where /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::gt")] pub fn gt( left: &PrimitiveArray, right: &PrimitiveArray, @@ -2633,10 +1314,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - #[cfg(feature = "simd")] - return simd_compare_op(left, right, T::gt, |a, b| a > b); - #[cfg(not(feature = "simd"))] - return compare_op(left, right, |a, b| a.is_gt(b)); + crate::cmp::gt(&left, &right) } /// Perform `left > right` operation on a [`PrimitiveArray`] and a scalar value. @@ -2648,6 +1326,7 @@ where /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::gt")] pub fn gt_scalar( left: &PrimitiveArray, right: T::Native, @@ -2656,10 +1335,8 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - #[cfg(feature = "simd")] - return simd_compare_op_scalar(left, right, T::gt, |a, b| a > b); - #[cfg(not(feature = "simd"))] - return compare_op_scalar(left, |a| a.is_gt(right)); + let right = PrimitiveArray::::new(vec![right].into(), None); + crate::cmp::gt(&left, &Scalar::new(&right)) } /// Perform `left >= right` operation on two [`PrimitiveArray`]s. Non-null values are greater than null @@ -2671,6 +1348,7 @@ where /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. +#[deprecated(note = "Use arrow_ord::cmp::gt_eq")] pub fn gt_eq( left: &PrimitiveArray, right: &PrimitiveArray, @@ -2679,10 +1357,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - #[cfg(feature = "simd")] - return simd_compare_op(left, right, T::ge, |a, b| a >= b); - #[cfg(not(feature = "simd"))] - return compare_op(left, right, |a, b| a.is_ge(b)); + crate::cmp::gt_eq(&left, &right) } /// Perform `left >= right` operation on a [`PrimitiveArray`] and a scalar value. @@ -2694,6 +1369,7 @@ where /// Note that totalOrder treats positive and negative zeros are different. If it is necessary /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. 
+#[deprecated(note = "Use arrow_ord::cmp::gt_eq")] pub fn gt_eq_scalar( left: &PrimitiveArray, right: T::Native, @@ -2702,10 +1378,8 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - #[cfg(feature = "simd")] - return simd_compare_op_scalar(left, right, T::ge, |a, b| a >= b); - #[cfg(not(feature = "simd"))] - return compare_op_scalar(left, |a| a.is_ge(right)); + let right = PrimitiveArray::::new(vec![right].into(), None); + crate::cmp::gt_eq(&left, &Scalar::new(&right)) } /// Checks if a [`GenericListArray`] contains a value in the [`PrimitiveArray`] @@ -2793,14 +1467,18 @@ where // disable wrapping inside literal vectors used for test data and assertions #[rustfmt::skip::macros(vec)] #[cfg(test)] +#[allow(deprecated)] mod tests { - use super::*; + use std::sync::Arc; + use arrow_array::builder::{ ListBuilder, PrimitiveDictionaryBuilder, StringBuilder, StringDictionaryBuilder, }; - use arrow_buffer::i256; + use arrow_buffer::{i256, Buffer}; + use arrow_data::ArrayData; use arrow_schema::Field; - use std::sync::Arc; + + use super::*; /// Evaluate `KERNEL` with two vectors as inputs and assert against the expected output. /// `A_VEC` and `B_VEC` can be of type `Vec` or `Vec>` where `T` is the native @@ -4645,7 +3323,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_i8_array() { // Construct a value array let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); @@ -4667,7 +3344,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_u64_array() { let values = UInt64Array::from_iter_values([10_u64, 11, 12, 13, 14, 15, 16, 17]); let values = Arc::new(values) as ArrayRef; @@ -4688,7 +3364,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_utf8_array() { let test1 = vec!["a", "a", "b", "c"]; let test2 = vec!["a", "b", "b", "c"]; @@ -4716,7 +3391,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_binary_array() { let values: BinaryArray = ["hello", "", "parquet"] .into_iter() @@ -4740,7 +3414,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_interval_array() { let values = IntervalDayTimeArray::from(vec![1, 6, 10, 2, 3, 5]); let values = Arc::new(values) as ArrayRef; @@ -4761,7 +3434,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_date_array() { let values = Date32Array::from(vec![1, 6, 10, 2, 3, 5]); let values = Arc::new(values) as ArrayRef; @@ -4782,7 +3454,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_bool_array() { let values = BooleanArray::from(vec![true, false]); let values = Arc::new(values) as ArrayRef; @@ -4803,7 +3474,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_lt_dyn_gt_dyn_dictionary_i8_array() { // Construct a value array let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); @@ -4834,7 +3504,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_lt_dyn_gt_dyn_dictionary_bool_array() { let values = BooleanArray::from(vec![true, false]); let values = Arc::new(values) as ArrayRef; @@ -4876,7 +3545,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_i8_i8_array() { let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); let keys = Int8Array::from_iter_values([2_i8, 3, 4]); @@ -4911,7 +3579,6 @@ mod tests { } 
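// ---------------------------------------------------------------------------
// Editor's note (not part of the patch): the hunks above deprecate the typed
// comparison kernels (`eq`, `neq`, `lt`, `lt_eq`, `gt`, `gt_eq` and their
// `*_scalar` variants) in favour of the `Datum`-based kernels in
// `arrow_ord::cmp`, which a later hunk re-exports as
// `arrow::compute::kernels::cmp`. The sketch below simply mirrors the calls
// used in the migrated benchmarks and tests; the array contents and the
// `main` wrapper are illustrative only, not part of this change.
use arrow_array::{BooleanArray, Float32Array, Int32Array, Scalar};
use arrow_ord::cmp::{eq, lt};

fn main() {
    // Array-to-array comparison: both sides are ordinary arrays.
    let a = Int32Array::from(vec![1, 2, 3]);
    let b = Int32Array::from(vec![1, 0, 3]);
    assert_eq!(eq(&a, &b).unwrap(), BooleanArray::from(vec![true, false, true]));

    // Array-to-scalar comparison: wrap a one-element array in `Scalar`,
    // replacing the removed `*_scalar` kernels.
    let threshold = Int32Array::from(vec![2]);
    assert_eq!(
        lt(&a, &Scalar::new(&threshold)).unwrap(),
        BooleanArray::from(vec![true, false, false])
    );

    // Floats compare with IEEE 754 totalOrder semantics, as exercised by the
    // NaN tests below: a (positive) NaN equals NaN and sorts above finite values.
    let f = Float32Array::from(vec![f32::NAN, 7.0]);
    let nan = Float32Array::from(vec![f32::NAN]);
    assert_eq!(
        eq(&f, &Scalar::new(&nan)).unwrap(),
        BooleanArray::from(vec![true, false])
    );
}
// ---------------------------------------------------------------------------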
#[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_lt_dyn_lt_eq_dyn_gt_dyn_gt_eq_dyn_dictionary_i8_i8_array() { let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); let keys = Int8Array::from_iter_values([2_i8, 3, 4]); @@ -4984,7 +3651,6 @@ mod tests { ); assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(eq(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( @@ -4992,7 +3658,6 @@ mod tests { ); assert_eq!(neq_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(neq(&array1, &array2).unwrap(), expected); let array1: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 10.0] @@ -5008,7 +3673,6 @@ mod tests { ); assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(eq(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( @@ -5016,7 +3680,6 @@ mod tests { ); assert_eq!(neq_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(neq(&array1, &array2).unwrap(), expected); let array1: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 10.0] @@ -5033,7 +3696,6 @@ mod tests { ); assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(eq(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( @@ -5041,7 +3703,6 @@ mod tests { ); assert_eq!(neq_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(neq(&array1, &array2).unwrap(), expected); } @@ -5061,7 +3722,6 @@ mod tests { ); assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(lt(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( @@ -5069,7 +3729,6 @@ mod tests { ); assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(lt_eq(&array1, &array2).unwrap(), expected); let array1: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 11.0, f32::NAN] @@ -5086,7 +3745,6 @@ mod tests { ); assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(lt(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( @@ -5094,7 +3752,6 @@ mod tests { ); assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(lt_eq(&array1, &array2).unwrap(), expected); let array1: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 11.0, f64::NAN] @@ -5111,7 +3768,6 @@ mod tests { ); assert_eq!(lt_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(lt(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( @@ -5119,7 +3775,6 @@ mod tests { ); assert_eq!(lt_eq_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(lt_eq(&array1, &array2).unwrap(), expected); } @@ -5139,7 +3794,6 @@ mod tests { ); assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(gt(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( @@ -5147,7 +3801,6 @@ mod tests { ); assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(gt_eq(&array1, &array2).unwrap(), expected); let array1: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 11.0, f32::NAN] @@ -5164,7 +3817,6 @@ mod tests { ); assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(gt(&array1, &array2).unwrap(), expected); 
let expected = BooleanArray::from( @@ -5172,7 +3824,6 @@ mod tests { ); assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(gt_eq(&array1, &array2).unwrap(), expected); let array1: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 11.0, f64::NAN] @@ -5189,7 +3840,6 @@ mod tests { ); assert_eq!(gt_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(gt(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( @@ -5197,7 +3847,6 @@ mod tests { ); assert_eq!(gt_eq_dyn(&array1, &array2).unwrap(), expected); - #[cfg(not(feature = "simd"))] assert_eq!(gt_eq(&array1, &array2).unwrap(), expected); } @@ -5207,21 +3856,12 @@ mod tests { .into_iter() .map(Some) .collect(); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(false)], - ); - #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( vec![Some(true), Some(false), Some(false), Some(false), Some(false)], ); assert_eq!(eq_dyn_scalar(&array, f32::NAN).unwrap(), expected); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(true)], - ); - #[cfg(not(feature = "simd"))] let expected = BooleanArray::from( vec![Some(false), Some(true), Some(true), Some(true), Some(true)], ); @@ -5231,21 +3871,12 @@ mod tests { .into_iter() .map(Some) .collect(); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(false)], - ); - #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( vec![Some(true), Some(false), Some(false), Some(false), Some(false)], ); assert_eq!(eq_dyn_scalar(&array, f32::NAN).unwrap(), expected); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(true)], - ); - #[cfg(not(feature = "simd"))] let expected = BooleanArray::from( vec![Some(false), Some(true), Some(true), Some(true), Some(true)], ); @@ -5255,21 +3886,12 @@ mod tests { .into_iter() .map(Some) .collect(); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(false)], - ); - #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( vec![Some(true), Some(false), Some(false), Some(false), Some(false)], ); assert_eq!(eq_dyn_scalar(&array, f64::NAN).unwrap(), expected); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(true)], - ); - #[cfg(not(feature = "simd"))] let expected = BooleanArray::from( vec![Some(false), Some(true), Some(true), Some(true), Some(true)], ); @@ -5282,21 +3904,12 @@ mod tests { .into_iter() .map(Some) .collect(); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(false)], - ); - #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( vec![Some(false), Some(true), Some(true), Some(true), Some(true)], ); assert_eq!(lt_dyn_scalar(&array, f16::NAN).unwrap(), expected); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(false)], - ); - #[cfg(not(feature = "simd"))] let expected = BooleanArray::from( vec![Some(true), Some(true), Some(true), Some(true), Some(true)], ); @@ -5306,21 +3919,12 @@ mod tests { .into_iter() .map(Some) .collect(); - 
#[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(false)], - ); - #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( vec![Some(false), Some(true), Some(true), Some(true), Some(true)], ); assert_eq!(lt_dyn_scalar(&array, f32::NAN).unwrap(), expected); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(false)], - ); - #[cfg(not(feature = "simd"))] let expected = BooleanArray::from( vec![Some(true), Some(true), Some(true), Some(true), Some(true)], ); @@ -5330,21 +3934,12 @@ mod tests { .into_iter() .map(Some) .collect(); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(false)], - ); - #[cfg(not(feature = "simd"))] + let expected = BooleanArray::from( vec![Some(false), Some(true), Some(true), Some(true), Some(true)], ); assert_eq!(lt_dyn_scalar(&array, f64::NAN).unwrap(), expected); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(false)], - ); - #[cfg(not(feature = "simd"))] let expected = BooleanArray::from( vec![Some(true), Some(true), Some(true), Some(true), Some(true)], ); @@ -5362,11 +3957,6 @@ mod tests { ); assert_eq!(gt_dyn_scalar(&array, f16::NAN).unwrap(), expected); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(false)], - ); - #[cfg(not(feature = "simd"))] let expected = BooleanArray::from( vec![Some(true), Some(false), Some(false), Some(false), Some(false)], ); @@ -5381,11 +3971,6 @@ mod tests { ); assert_eq!(gt_dyn_scalar(&array, f32::NAN).unwrap(), expected); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(false)], - ); - #[cfg(not(feature = "simd"))] let expected = BooleanArray::from( vec![Some(true), Some(false), Some(false), Some(false), Some(false)], ); @@ -5400,11 +3985,6 @@ mod tests { ); assert_eq!(gt_dyn_scalar(&array, f64::NAN).unwrap(), expected); - #[cfg(feature = "simd")] - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(false), Some(false)], - ); - #[cfg(not(feature = "simd"))] let expected = BooleanArray::from( vec![Some(true), Some(false), Some(false), Some(false), Some(false)], ); @@ -5412,7 +3992,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_to_utf8_array() { let test1 = vec!["a", "a", "b", "c"]; let test2 = vec!["a", "b", "b", "d"]; @@ -5453,7 +4032,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_lt_dyn_lt_eq_dyn_gt_dyn_gt_eq_dyn_dictionary_to_utf8_array() { let test1 = vec!["abc", "abc", "b", "cde"]; let test2 = vec!["abc", "b", "b", "def"]; @@ -5518,7 +4096,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_to_binary_array() { let values: BinaryArray = ["hello", "", "parquet"] .into_iter() @@ -5559,7 +4136,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_lt_dyn_lt_eq_dyn_gt_dyn_gt_eq_dyn_dictionary_to_binary_array() { let values: BinaryArray = ["hello", "", "parquet"] .into_iter() @@ -5624,7 +4200,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dict_non_dict_float_nan() { let array1: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), 
f16::from_f32(8.0), f16::from_f32(10.0)] .into_iter() @@ -5683,7 +4258,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_lt_dyn_lt_eq_dyn_dict_non_dict_float_nan() { let array1: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(11.0), f16::NAN] .into_iter() @@ -5741,7 +4315,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_gt_dyn_gt_eq_dyn_dict_non_dict_float_nan() { let array1: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(11.0), f16::NAN] .into_iter() @@ -5799,7 +4372,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_eq_dyn_neq_dyn_dictionary_to_boolean_array() { let test1 = vec![Some(true), None, Some(false)]; let test2 = vec![Some(true), None, None, Some(true)]; @@ -5836,7 +4408,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_lt_dyn_lt_eq_dyn_gt_dyn_gt_eq_dyn_dictionary_to_boolean_array() { let test1 = vec![Some(true), None, Some(false)]; let test2 = vec![Some(true), None, None, Some(true)]; @@ -5897,7 +4468,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_cmp_dict_decimal128() { let values = Decimal128Array::from_iter_values([0, 1, 2, 3, 4, 5]); let keys = Int8Array::from_iter_values([1_i8, 2, 5, 4, 3, 0]); @@ -5934,7 +4504,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_cmp_dict_non_dict_decimal128() { let array1: Decimal128Array = Decimal128Array::from_iter_values([1, 2, 5, 4, 3, 0]); @@ -5970,7 +4539,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_cmp_dict_decimal256() { let values = Decimal256Array::from_iter_values( [0, 1, 2, 3, 4, 5].into_iter().map(i256::from_i128), @@ -6011,7 +4579,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_cmp_dict_non_dict_decimal256() { let array1: Decimal256Array = Decimal256Array::from_iter_values( [1, 2, 5, 4, 3, 0].into_iter().map(i256::from_i128), @@ -6317,7 +4884,6 @@ mod tests { } #[test] - #[cfg(not(feature = "simd"))] fn test_floating_zeros() { let a = Float32Array::from(vec![0.0_f32, -0.0]); let b = Float32Array::from(vec![-0.0_f32, 0.0]); @@ -6354,7 +4920,6 @@ mod tests { } #[test] - #[cfg(feature = "dyn_cmp_dict")] fn test_dictionary_nested_nulls() { let keys = Int32Array::from(vec![0, 1, 2]); let v1 = Arc::new(Int32Array::from(vec![Some(0), None, Some(2)])); diff --git a/arrow-ord/src/lib.rs b/arrow-ord/src/lib.rs index 62338c022384..19ad8229417f 100644 --- a/arrow-ord/src/lib.rs +++ b/arrow-ord/src/lib.rs @@ -43,6 +43,8 @@ //! ``` //! 
+pub mod cmp; +#[doc(hidden)] pub mod comparison; pub mod ord; pub mod partition; diff --git a/arrow-ord/src/partition.rs b/arrow-ord/src/partition.rs index 4a0a6730d882..52aa5ee8d0f1 100644 --- a/arrow-ord/src/partition.rs +++ b/arrow-ord/src/partition.rs @@ -23,7 +23,7 @@ use arrow_array::{Array, ArrayRef}; use arrow_buffer::BooleanBuffer; use arrow_schema::ArrowError; -use crate::comparison::neq_dyn; +use crate::cmp::neq; use crate::sort::SortColumn; /// A computed set of partitions, see [`partition`] @@ -158,7 +158,7 @@ fn find_boundaries(v: &dyn Array) -> Result { let v1 = v.slice(0, slice_len); let v2 = v.slice(1, slice_len); - let array_ne = neq_dyn(v1.as_ref(), v2.as_ref())?; + let array_ne = neq(&v1, &v2)?; // Set if values have different non-NULL values let values_ne = match array_ne.nulls().filter(|n| n.null_count() > 0) { Some(n) => n.inner() & array_ne.values(), diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index bcf6a84311d5..9456dd4b012c 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -71,7 +71,7 @@ ipc_compression = ["ipc", "arrow-ipc/lz4", "arrow-ipc/zstd"] csv = ["arrow-csv"] ipc = ["arrow-ipc"] json = ["arrow-json"] -simd = ["arrow-array/simd", "arrow-ord/simd", "arrow-arith/simd"] +simd = ["arrow-array/simd", "arrow-arith/simd"] prettyprint = ["arrow-cast/prettyprint"] # The test utils feature enables code used in benchmarks and tests but # not the core arrow code itself. Be aware that `rand` must be kept as @@ -87,7 +87,7 @@ force_validate = ["arrow-data/force_validate"] ffi = ["arrow-schema/ffi", "arrow-data/ffi"] # Enable dyn-comparison of dictionary arrays with other arrays # Note: this does not impact comparison against scalars -dyn_cmp_dict = ["arrow-string/dyn_cmp_dict", "arrow-ord/dyn_cmp_dict"] +dyn_cmp_dict = ["arrow-string/dyn_cmp_dict"] chrono-tz = ["arrow-array/chrono-tz"] [dev-dependencies] diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index 73db3ffed368..b9fb6c8e3300 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -21,61 +21,16 @@ use criterion::Criterion; extern crate arrow; -use arrow::compute::*; -use arrow::datatypes::{ArrowNativeTypeOp, ArrowNumericType, IntervalMonthDayNanoType}; +use arrow::compute::kernels::cmp::*; +use arrow::datatypes::IntervalMonthDayNanoType; use arrow::util::bench_util::*; use arrow::{array::*, datatypes::Float32Type, datatypes::Int32Type}; +use arrow_array::Scalar; +use arrow_string::like::*; +use arrow_string::regexp::regexp_is_match_utf8_scalar; const SIZE: usize = 65536; -fn bench_eq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) -where - T: ArrowNumericType, - ::Native: ArrowNativeTypeOp, -{ - eq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); -} - -fn bench_neq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) -where - T: ArrowNumericType, - ::Native: ArrowNativeTypeOp, -{ - neq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); -} - -fn bench_lt(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) -where - T: ArrowNumericType, - ::Native: ArrowNativeTypeOp, -{ - lt(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); -} - -fn bench_lt_eq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) -where - T: ArrowNumericType, - ::Native: ArrowNativeTypeOp, -{ - lt_eq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); -} - -fn bench_gt(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) -where - T: ArrowNumericType, - ::Native: ArrowNativeTypeOp, -{ - 
gt(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); -} - -fn bench_gt_eq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) -where - T: ArrowNumericType, - ::Native: ArrowNativeTypeOp, -{ - gt_eq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); -} - fn bench_like_utf8_scalar(arr_a: &StringArray, value_b: &str) { like_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); } @@ -104,27 +59,6 @@ fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) { .unwrap(); } -#[cfg(not(feature = "dyn_cmp_dict"))] -fn dyn_cmp_dict_benchmarks(_c: &mut Criterion) {} - -#[cfg(feature = "dyn_cmp_dict")] -fn dyn_cmp_dict_benchmarks(c: &mut Criterion) { - let strings = create_string_array::(20, 0.); - let dict_arr_a = create_dict_from_values::(SIZE, 0., &strings); - let dict_arr_b = create_dict_from_values::(SIZE, 0., &strings); - - c.bench_function("eq dictionary[10] string[4])", |b| { - b.iter(|| { - cmp_dict_utf8::<_, i32, _>( - criterion::black_box(&dict_arr_a), - criterion::black_box(&dict_arr_b), - |a, b| a == b, - ) - .unwrap() - }) - }); -} - fn add_benchmark(c: &mut Criterion) { let arr_a = create_primitive_array_with_seed::(SIZE, 0.0, 42); let arr_b = create_primitive_array_with_seed::(SIZE, 0.0, 43); @@ -135,105 +69,79 @@ fn add_benchmark(c: &mut Criterion) { create_primitive_array_with_seed::(SIZE, 0.0, 43); let arr_string = create_string_array::(SIZE, 0.0); + let scalar = Float32Array::from(vec![1.0]); - c.bench_function("eq Float32", |b| b.iter(|| bench_eq(&arr_a, &arr_b))); + c.bench_function("eq Float32", |b| b.iter(|| eq(&arr_a, &arr_b))); c.bench_function("eq scalar Float32", |b| { - b.iter(|| { - eq_scalar(criterion::black_box(&arr_a), criterion::black_box(1.0)).unwrap() - }) + b.iter(|| eq(&arr_a, &Scalar::new(&scalar)).unwrap()) }); - c.bench_function("neq Float32", |b| b.iter(|| bench_neq(&arr_a, &arr_b))); + c.bench_function("neq Float32", |b| b.iter(|| neq(&arr_a, &arr_b))); c.bench_function("neq scalar Float32", |b| { - b.iter(|| { - neq_scalar(criterion::black_box(&arr_a), criterion::black_box(1.0)).unwrap() - }) + b.iter(|| neq(&arr_a, &Scalar::new(&scalar)).unwrap()) }); - c.bench_function("lt Float32", |b| b.iter(|| bench_lt(&arr_a, &arr_b))); + c.bench_function("lt Float32", |b| b.iter(|| lt(&arr_a, &arr_b))); c.bench_function("lt scalar Float32", |b| { - b.iter(|| { - lt_scalar(criterion::black_box(&arr_a), criterion::black_box(1.0)).unwrap() - }) + b.iter(|| lt(&arr_a, &Scalar::new(&scalar)).unwrap()) }); - c.bench_function("lt_eq Float32", |b| b.iter(|| bench_lt_eq(&arr_a, &arr_b))); + c.bench_function("lt_eq Float32", |b| b.iter(|| lt_eq(&arr_a, &arr_b))); c.bench_function("lt_eq scalar Float32", |b| { - b.iter(|| { - lt_eq_scalar(criterion::black_box(&arr_a), criterion::black_box(1.0)).unwrap() - }) + b.iter(|| lt_eq(&arr_a, &Scalar::new(&scalar)).unwrap()) }); - c.bench_function("gt Float32", |b| b.iter(|| bench_gt(&arr_a, &arr_b))); + c.bench_function("gt Float32", |b| b.iter(|| gt(&arr_a, &arr_b))); c.bench_function("gt scalar Float32", |b| { - b.iter(|| { - gt_scalar(criterion::black_box(&arr_a), criterion::black_box(1.0)).unwrap() - }) + b.iter(|| gt(&arr_a, &Scalar::new(&scalar)).unwrap()) }); - c.bench_function("gt_eq Float32", |b| b.iter(|| bench_gt_eq(&arr_a, &arr_b))); + c.bench_function("gt_eq Float32", |b| b.iter(|| gt_eq(&arr_a, &arr_b))); c.bench_function("gt_eq scalar Float32", |b| { - b.iter(|| { - gt_eq_scalar(criterion::black_box(&arr_a), criterion::black_box(1.0)).unwrap() - }) 
+ b.iter(|| gt_eq(&arr_a, &Scalar::new(&scalar)).unwrap()) }); let arr_a = create_primitive_array_with_seed::(SIZE, 0.0, 42); let arr_b = create_primitive_array_with_seed::(SIZE, 0.0, 43); + let scalar = Int32Array::from(vec![1]); - c.bench_function("eq Int32", |b| b.iter(|| bench_eq(&arr_a, &arr_b))); + c.bench_function("eq Int32", |b| b.iter(|| eq(&arr_a, &arr_b))); c.bench_function("eq scalar Int32", |b| { - b.iter(|| { - eq_scalar(criterion::black_box(&arr_a), criterion::black_box(1)).unwrap() - }) + b.iter(|| eq(&arr_a, &Scalar::new(&scalar)).unwrap()) }); - c.bench_function("neq Int32", |b| b.iter(|| bench_neq(&arr_a, &arr_b))); + c.bench_function("neq Int32", |b| b.iter(|| neq(&arr_a, &arr_b))); c.bench_function("neq scalar Int32", |b| { - b.iter(|| { - neq_scalar(criterion::black_box(&arr_a), criterion::black_box(1)).unwrap() - }) + b.iter(|| neq(&arr_a, &Scalar::new(&scalar)).unwrap()) }); - c.bench_function("lt Int32", |b| b.iter(|| bench_lt(&arr_a, &arr_b))); + c.bench_function("lt Int32", |b| b.iter(|| lt(&arr_a, &arr_b))); c.bench_function("lt scalar Int32", |b| { - b.iter(|| { - lt_scalar(criterion::black_box(&arr_a), criterion::black_box(1)).unwrap() - }) + b.iter(|| lt(&arr_a, &Scalar::new(&scalar)).unwrap()) }); - c.bench_function("lt_eq Int32", |b| b.iter(|| bench_lt_eq(&arr_a, &arr_b))); + c.bench_function("lt_eq Int32", |b| b.iter(|| lt_eq(&arr_a, &arr_b))); c.bench_function("lt_eq scalar Int32", |b| { - b.iter(|| { - lt_eq_scalar(criterion::black_box(&arr_a), criterion::black_box(1)).unwrap() - }) + b.iter(|| lt_eq(&arr_a, &Scalar::new(&scalar)).unwrap()) }); - c.bench_function("gt Int32", |b| b.iter(|| bench_gt(&arr_a, &arr_b))); + c.bench_function("gt Int32", |b| b.iter(|| gt(&arr_a, &arr_b))); c.bench_function("gt scalar Int32", |b| { - b.iter(|| { - gt_scalar(criterion::black_box(&arr_a), criterion::black_box(1)).unwrap() - }) + b.iter(|| gt(&arr_a, &Scalar::new(&scalar)).unwrap()) }); - c.bench_function("gt_eq Int32", |b| b.iter(|| bench_gt_eq(&arr_a, &arr_b))); + c.bench_function("gt_eq Int32", |b| b.iter(|| gt_eq(&arr_a, &arr_b))); c.bench_function("gt_eq scalar Int32", |b| { - b.iter(|| { - gt_eq_scalar(criterion::black_box(&arr_a), criterion::black_box(1)).unwrap() - }) + b.iter(|| gt_eq(&arr_a, &Scalar::new(&scalar)).unwrap()) }); c.bench_function("eq MonthDayNano", |b| { - b.iter(|| bench_eq(&arr_month_day_nano_a, &arr_month_day_nano_b)) + b.iter(|| eq(&arr_month_day_nano_a, &arr_month_day_nano_b)) }); + let scalar = IntervalMonthDayNanoArray::from(vec![123]); + c.bench_function("eq scalar MonthDayNano", |b| { - b.iter(|| { - eq_scalar( - criterion::black_box(&arr_month_day_nano_a), - criterion::black_box(123), - ) - .unwrap() - }) + b.iter(|| eq(&arr_month_day_nano_b, &Scalar::new(&scalar)).unwrap()) }); c.bench_function("like_utf8 scalar equals", |b| { @@ -326,14 +234,15 @@ fn add_benchmark(c: &mut Criterion) { let strings = create_string_array::(20, 0.); let dict_arr_a = create_dict_from_values::(SIZE, 0., &strings); + let scalar = StringArray::from(vec!["test"]); c.bench_function("eq_dyn_utf8_scalar dictionary[10] string[4])", |b| { - b.iter(|| eq_dyn_utf8_scalar(&dict_arr_a, "test")) + b.iter(|| eq(&dict_arr_a, &Scalar::new(&scalar))) }); c.bench_function( "gt_eq_dyn_utf8_scalar scalar dictionary[10] string[4])", - |b| b.iter(|| gt_eq_dyn_utf8_scalar(&dict_arr_a, "test")), + |b| b.iter(|| gt_eq(&dict_arr_a, &Scalar::new(&scalar))), ); c.bench_function("like_utf8_scalar_dyn dictionary[10] string[4])", |b| { @@ -344,7 +253,13 @@ fn add_benchmark(c: &mut 
Criterion) { b.iter(|| ilike_utf8_scalar_dyn(&dict_arr_a, "test")) }); - dyn_cmp_dict_benchmarks(c); + let strings = create_string_array::(20, 0.); + let dict_arr_a = create_dict_from_values::(SIZE, 0., &strings); + let dict_arr_b = create_dict_from_values::(SIZE, 0., &strings); + + c.bench_function("eq dictionary[10] string[4])", |b| { + b.iter(|| eq(&dict_arr_a, &dict_arr_b).unwrap()) + }); } criterion_group!(benches, add_benchmark); diff --git a/arrow/benches/equal.rs b/arrow/benches/equal.rs index 2f4e2fada9e9..4e99bf3071c9 100644 --- a/arrow/benches/equal.rs +++ b/arrow/benches/equal.rs @@ -20,7 +20,6 @@ #[macro_use] extern crate criterion; -use arrow::compute::eq_utf8_scalar; use criterion::Criterion; extern crate arrow; @@ -32,10 +31,6 @@ fn bench_equal>(arr_a: &A) { criterion::black_box(arr_a == arr_a); } -fn bench_equal_utf8_scalar(arr_a: &GenericStringArray, right: &str) { - criterion::black_box(eq_utf8_scalar(arr_a, right).unwrap()); -} - fn add_benchmark(c: &mut Criterion) { let arr_a = create_primitive_array::(512, 0.0); c.bench_function("equal_512", |b| b.iter(|| bench_equal(&arr_a))); @@ -49,11 +44,6 @@ fn add_benchmark(c: &mut Criterion) { let arr_a = create_string_array::(512, 0.0); c.bench_function("equal_string_512", |b| b.iter(|| bench_equal(&arr_a))); - let arr_a = create_string_array::(512, 0.0); - c.bench_function("equal_string_scalar_empty_512", |b| { - b.iter(|| bench_equal_utf8_scalar(&arr_a, "")) - }); - let arr_a_nulls = create_string_array::(512, 0.5); c.bench_function("equal_string_nulls_512", |b| { b.iter(|| bench_equal(&arr_a_nulls)) diff --git a/arrow/src/compute/kernels.rs b/arrow/src/compute/kernels.rs index 1a79aef547d3..dba41625020b 100644 --- a/arrow/src/compute/kernels.rs +++ b/arrow/src/compute/kernels.rs @@ -22,7 +22,7 @@ pub use arrow_arith::{ }; pub use arrow_cast::cast; pub use arrow_cast::parse as cast_utils; -pub use arrow_ord::{partition, sort}; +pub use arrow_ord::{cmp, partition, sort}; pub use arrow_select::{concat, filter, interleave, nullif, take, window, zip}; pub use arrow_string::{concat_elements, length, regexp, substring}; diff --git a/parquet/examples/async_read_parquet.rs b/parquet/examples/async_read_parquet.rs index f600cd0d11e3..e59cad8055cb 100644 --- a/parquet/examples/async_read_parquet.rs +++ b/parquet/examples/async_read_parquet.rs @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. +use arrow::compute::kernels::cmp::eq; use arrow::util::pretty::print_batches; +use arrow_array::{Int32Array, Scalar}; use futures::TryStreamExt; use parquet::arrow::arrow_reader::{ArrowPredicateFn, RowFilter}; use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask}; @@ -44,9 +46,10 @@ async fn main() -> Result<()> { // Highlight: set `RowFilter`, it'll push down filter predicates to skip IO and decode. // For more specific usage: please refer to https://github.com/apache/arrow-datafusion/blob/master/datafusion/core/src/physical_plan/file_format/parquet/row_filter.rs. 
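// Editor's note (not part of the patch): the rewritten predicate below relies on the
// `Datum`-based `eq` kernel. Because the scalar side must be an array wrapped in
// `Scalar`, the one-element `Int32Array` is built outside the closure and captured
// with `move`, replacing the removed `arrow::compute::eq_dyn_scalar(column, 1)` call.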
+ let scalar = Int32Array::from(vec![1]); let filter = ArrowPredicateFn::new( ProjectionMask::roots(file_metadata.schema_descr(), [0]), - |record_batch| arrow::compute::eq_dyn_scalar(record_batch.column(0), 1), + move |record_batch| eq(record_batch.column(0), &Scalar::new(&scalar)), ); let row_filter = RowFilter::new(vec![Box::new(filter)]); builder = builder.with_row_filter(row_filter); diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index c7e0f64783f1..54793c47fea1 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -776,10 +776,11 @@ mod tests { use crate::file::footer::parse_metadata; use crate::file::page_index::index_reader; use crate::file::properties::WriterProperties; + use arrow::compute::kernels::cmp::eq; use arrow::error::Result as ArrowResult; use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; - use arrow_array::{Array, ArrayRef, Int32Array, StringArray}; + use arrow_array::{Array, ArrayRef, Int32Array, Int8Array, Scalar, StringArray}; use futures::TryStreamExt; use rand::{thread_rng, Rng}; use std::sync::Mutex; @@ -1188,14 +1189,16 @@ mod tests { }; let requests = test.requests.clone(); + let a_scalar = StringArray::from_iter_values(["b"]); let a_filter = ArrowPredicateFn::new( ProjectionMask::leaves(&parquet_schema, vec![0]), - |batch| arrow::compute::eq_dyn_utf8_scalar(batch.column(0), "b"), + move |batch| eq(batch.column(0), &Scalar::new(&a_scalar)), ); + let b_scalar = StringArray::from_iter_values(["4"]); let b_filter = ArrowPredicateFn::new( ProjectionMask::leaves(&parquet_schema, vec![1]), - |batch| arrow::compute::eq_dyn_utf8_scalar(batch.column(0), "4"), + move |batch| eq(batch.column(0), &Scalar::new(&b_scalar)), ); let filter = RowFilter::new(vec![Box::new(a_filter), Box::new(b_filter)]); @@ -1353,12 +1356,13 @@ mod tests { let a_filter = ArrowPredicateFn::new( ProjectionMask::leaves(&parquet_schema, vec![1]), - |batch| arrow::compute::eq_dyn_bool_scalar(batch.column(0), true), + |batch| Ok(batch.column(0).as_boolean().clone()), ); + let b_scalar = Int8Array::from(vec![2]); let b_filter = ArrowPredicateFn::new( ProjectionMask::leaves(&parquet_schema, vec![2]), - |batch| arrow::compute::eq_dyn_scalar(batch.column(0), 2_i32), + move |batch| eq(batch.column(0), &Scalar::new(&b_scalar)), ); let filter = RowFilter::new(vec![Box::new(a_filter), Box::new(b_filter)]); From 1afc7c32bf50f6a57e3e362fe1cf9c1e3a5ab899 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 18 Aug 2023 12:10:50 +0100 Subject: [PATCH 1172/1411] Update parquet object_store 0.7 (#4715) --- parquet/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index a570e5f64b04..c4f3696b43c9 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -44,7 +44,7 @@ arrow-schema = { workspace = true, optional = true } arrow-select = { workspace = true, optional = true } arrow-ipc = { workspace = true, optional = true } # Intentionally not a path dependency as object_store is released separately -object_store = { version = "0.6", default-features = false, optional = true } +object_store = { version = "0.7", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } thrift = { version = "0.17", default-features = false } From 2c487d0eba33569086887d434d971129a77db4eb Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies 
<1781103+tustvold@users.noreply.github.com> Date: Fri, 18 Aug 2023 12:37:12 +0100 Subject: [PATCH 1173/1411] Parquet doc tweaks (#4680) * Parquet doc tweaks * Update parquet/src/arrow/mod.rs --- parquet/src/arrow/arrow_reader/mod.rs | 2 ++ parquet/src/arrow/async_reader/mod.rs | 51 ++++++++++++++++++++++----- parquet/src/arrow/mod.rs | 41 +++++++++------------ 3 files changed, 62 insertions(+), 32 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index f7cecabb01d8..5f95a8664b4b 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -304,6 +304,8 @@ pub struct SyncReader(T); /// A synchronous builder used to construct [`ParquetRecordBatchReader`] for a file /// /// For an async API see [`crate::arrow::async_reader::ParquetRecordBatchStreamBuilder`] +/// +/// See [`ArrowReaderBuilder`] for additional member functions pub type ParquetRecordBatchReaderBuilder = ArrowReaderBuilder>; impl ParquetRecordBatchReaderBuilder { diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 54793c47fea1..7d30580ece93 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -22,13 +22,13 @@ //! # #[tokio::main(flavor="current_thread")] //! # async fn main() { //! # -//! use arrow_array::RecordBatch; -//! use arrow::util::pretty::pretty_format_batches; -//! use futures::TryStreamExt; -//! use tokio::fs::File; -//! -//! use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask}; -//! +//! # use arrow_array::RecordBatch; +//! # use arrow::util::pretty::pretty_format_batches; +//! # use futures::TryStreamExt; +//! # use tokio::fs::File; +//! # +//! # use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask}; +//! # //! # fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) { //! # let formatted = pretty_format_batches(batches).unwrap().to_string(); //! # let actual_lines: Vec<_> = formatted.trim().lines().collect(); @@ -38,7 +38,7 @@ //! # expected_lines, actual_lines //! # ); //! # } -//! +//! # //! let testdata = arrow::util::test_util::parquet_test_data(); //! let path = format!("{}/alltypes_plain.parquet", testdata); //! let file = File::open(path).await.unwrap(); @@ -241,6 +241,8 @@ pub struct AsyncReader(T); /// In particular, this handles reading the parquet file metadata, allowing consumers /// to use this information to select what specific columns, row groups, etc... 
/// they wish to be read by the resulting stream +/// +/// See [`ArrowReaderBuilder`] for additional member functions pub type ParquetRecordBatchStreamBuilder = ArrowReaderBuilder>; impl ParquetRecordBatchStreamBuilder { @@ -263,6 +265,39 @@ impl ParquetRecordBatchStreamBuilder { /// /// This allows loading metadata once and using it to create multiple builders with /// potentially different settings + /// + /// ``` + /// # use std::fs::metadata; + /// # use std::sync::Arc; + /// # use bytes::Bytes; + /// # use arrow_array::{Int32Array, RecordBatch}; + /// # use arrow_schema::{DataType, Field, Schema}; + /// # use parquet::arrow::arrow_reader::ArrowReaderMetadata; + /// # use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder}; + /// # use tempfile::tempfile; + /// # use futures::StreamExt; + /// # #[tokio::main(flavor="current_thread")] + /// # async fn main() { + /// # + /// let mut file = tempfile().unwrap(); + /// # let schema = Arc::new(Schema::new(vec![Field::new("i32", DataType::Int32, false)])); + /// # let mut writer = ArrowWriter::try_new(&mut file, schema.clone(), None).unwrap(); + /// # let batch = RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1, 2, 3]))]).unwrap(); + /// # writer.write(&batch).unwrap(); + /// # writer.close().unwrap(); + /// # + /// let mut file = tokio::fs::File::from_std(file); + /// let meta = ArrowReaderMetadata::load_async(&mut file, Default::default()).await.unwrap(); + /// let mut a = ParquetRecordBatchStreamBuilder::new_with_metadata( + /// file.try_clone().await.unwrap(), + /// meta.clone() + /// ).build().unwrap(); + /// let mut b = ParquetRecordBatchStreamBuilder::new_with_metadata(file, meta).build().unwrap(); + /// + /// // Should be able to read from both in parallel + /// assert_eq!(a.next().await.unwrap().unwrap(), b.next().await.unwrap().unwrap()); + /// # } + /// ``` pub fn new_with_metadata(input: T, metadata: ArrowReaderMetadata) -> Self { Self::new_builder(AsyncReader(input), metadata) } diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 8cca79b40e93..0174db6b517f 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -25,12 +25,13 @@ //!# Example of writing Arrow record batch to Parquet file //! //!```rust -//! use arrow_array::{Int32Array, ArrayRef}; -//! use arrow_array::RecordBatch; -//! use parquet::arrow::arrow_writer::ArrowWriter; -//! use parquet::file::properties::WriterProperties; -//! use std::fs::File; -//! use std::sync::Arc; +//! # use arrow_array::{Int32Array, ArrayRef}; +//! # use arrow_array::RecordBatch; +//! # use parquet::arrow::arrow_writer::ArrowWriter; +//! # use parquet::file::properties::WriterProperties; +//! # use tempfile::tempfile; +//! # use std::sync::Arc; +//! # use parquet::basic::Compression; //! let ids = Int32Array::from(vec![1, 2, 3, 4]); //! let vals = Int32Array::from(vec![5, 6, 7, 8]); //! let batch = RecordBatch::try_from_iter(vec![ @@ -38,9 +39,14 @@ //! ("val", Arc::new(vals) as ArrayRef), //! ]).unwrap(); //! -//! let file = File::create("data.parquet").unwrap(); +//! let file = tempfile().unwrap(); //! -//! let mut writer = ArrowWriter::try_new(file, batch.schema(), None).unwrap(); +//! // WriterProperties can be used to set Parquet file options +//! let props = WriterProperties::builder() +//! .set_compression(Compression::SNAPPY) +//! .build(); +//! +//! let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(props)).unwrap(); //! //! writer.write(&batch).expect("Writing batch"); //! @@ -48,24 +54,11 @@ //! 
writer.close().unwrap(); //! ``` //! -//! `WriterProperties` can be used to set Parquet file options -//! ```rust -//! use parquet::file::properties::WriterProperties; -//! use parquet::basic::{ Compression, Encoding }; -//! use parquet::file::properties::WriterVersion; -//! -//! // File compression -//! let props = WriterProperties::builder() -//! .set_compression(Compression::SNAPPY) -//! .build(); -//! ``` -//! //! # Example of reading parquet file into arrow record batch //! //! ```rust -//! use std::fs::File; -//! use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; -//! +//! # use std::fs::File; +//! # use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; //! # use std::sync::Arc; //! # use arrow_array::Int32Array; //! # use arrow::datatypes::{DataType, Field, Schema}; @@ -88,7 +81,7 @@ //! # writer.write(&batch).expect("Writing batch"); //! # } //! # writer.close().unwrap(); -//! +//! # //! let file = File::open("data.parquet").unwrap(); //! //! let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); From 23db567d05bc21df56f5f7d08288f209de9fd785 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 21 Aug 2023 14:07:32 +0100 Subject: [PATCH 1174/1411] Cleanup redundant link targets (#4719) --- .../src/array/fixed_size_list_array.rs | 3 +- arrow-array/src/array/list_array.rs | 3 +- arrow-array/src/array/mod.rs | 4 +-- arrow-array/src/record_batch.rs | 2 +- arrow-flight/src/client.rs | 12 +++---- arrow-flight/src/encode.rs | 4 +-- arrow/src/datatypes/mod.rs | 6 ++-- arrow/src/lib.rs | 2 +- parquet/src/column/page.rs | 6 ++-- parquet/src/file/serialized_reader.rs | 2 +- parquet/src/record/reader.rs | 19 +++++------ parquet/src/schema/parser.rs | 4 +-- parquet/src/schema/printer.rs | 8 ++--- parquet/src/schema/types.rs | 33 +++++++++---------- 14 files changed, 51 insertions(+), 57 deletions(-) diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 8996fc8da408..db3ccbe0617b 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -32,8 +32,7 @@ use std::sync::Arc; /// Lists are represented using a `values` child /// array where each list has a fixed size of `value_length`. /// -/// Use [`FixedSizeListBuilder`](crate::builder::FixedSizeListBuilder) to -/// construct a [`FixedSizeListArray`]. +/// Use [`FixedSizeListBuilder`] to construct a [`FixedSizeListArray`]. /// /// # Representation /// diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 3508e4f1c469..e36d0ac4434f 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -64,8 +64,7 @@ impl OffsetSizeTrait for i64 { /// Arrow defines [`ListArray`] with `i32` offsets and /// [`LargeListArray`] with `i64` offsets. /// -/// Use [`GenericListBuilder`](crate::builder::GenericListBuilder) to -/// construct a [`GenericListArray`]. +/// Use [`GenericListBuilder`] to construct a [`GenericListArray`]. 
/// /// # Representation /// diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 79240d105a44..905ec1e5431b 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -69,7 +69,7 @@ pub use run_array::*; /// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) pub trait Array: std::fmt::Debug + Send + Sync { - /// Returns the array as [`Any`](std::any::Any) so that it can be + /// Returns the array as [`Any`] so that it can be /// downcasted to a specific implementation. /// /// # Example: @@ -101,7 +101,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// Unlike [`Array::to_data`] this consumes self, allowing it avoid unnecessary clones fn into_data(self) -> ArrayData; - /// Returns a reference to the [`DataType`](arrow_schema::DataType) of this array. + /// Returns a reference to the [`DataType`] of this array. /// /// # Example: /// diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 80c0e4b96741..886d00e0c2a9 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -238,7 +238,7 @@ impl RecordBatch { }) } - /// Returns the [`Schema`](arrow_schema::Schema) of the record batch. + /// Returns the [`Schema`] of the record batch. pub fn schema(&self) -> SchemaRef { self.schema.clone() } diff --git a/arrow-flight/src/client.rs b/arrow-flight/src/client.rs index f843bbf7cd0c..2c952fb3bfbf 100644 --- a/arrow-flight/src/client.rs +++ b/arrow-flight/src/client.rs @@ -74,7 +74,7 @@ pub struct FlightClient { } impl FlightClient { - /// Creates a client client with the provided [`Channel`](tonic::transport::Channel) + /// Creates a client client with the provided [`Channel`] pub fn new(channel: Channel) -> Self { Self::new_from_inner(FlightServiceClient::new(channel)) } @@ -262,7 +262,7 @@ impl FlightClient { } /// Make a `DoPut` call to the server with the provided - /// [`Stream`](futures::Stream) of [`FlightData`] and returning a + /// [`Stream`] of [`FlightData`] and returning a /// stream of [`PutResult`]. /// /// # Note @@ -340,7 +340,7 @@ impl FlightClient { } /// Make a `DoExchange` call to the server with the provided - /// [`Stream`](futures::Stream) of [`FlightData`] and returning a + /// [`Stream`] of [`FlightData`] and returning a /// stream of [`FlightData`]. /// /// # Example: @@ -391,7 +391,7 @@ impl FlightClient { } /// Make a `ListFlights` call to the server with the provided - /// criteria and returning a [`Stream`](futures::Stream) of [`FlightInfo`]. + /// criteria and returning a [`Stream`] of [`FlightInfo`]. /// /// # Example: /// ```no_run @@ -469,7 +469,7 @@ impl FlightClient { } /// Make a `ListActions` call to the server and returning a - /// [`Stream`](futures::Stream) of [`ActionType`]. + /// [`Stream`] of [`ActionType`]. /// /// # Example: /// ```no_run @@ -506,7 +506,7 @@ impl FlightClient { } /// Make a `DoAction` call to the server and returning a - /// [`Stream`](futures::Stream) of opaque [`Bytes`]. + /// [`Stream`] of opaque [`Bytes`]. 
/// /// # Example: /// ```no_run diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 9650031d8b5f..cd2ee7c02b68 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -24,7 +24,7 @@ use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef}; use bytes::Bytes; use futures::{ready, stream::BoxStream, Stream, StreamExt}; -/// Creates a [`Stream`](futures::Stream) of [`FlightData`]s from a +/// Creates a [`Stream`] of [`FlightData`]s from a /// `Stream` of [`Result`]<[`RecordBatch`], [`FlightError`]>. /// /// This can be used to implement [`FlightService::do_get`] in an @@ -146,7 +146,7 @@ impl FlightDataEncoderBuilder { self } - /// Return a [`Stream`](futures::Stream) of [`FlightData`], + /// Return a [`Stream`] of [`FlightData`], /// consuming self. More details on [`FlightDataEncoder`] pub fn build(self, input: S) -> FlightDataEncoder where diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 840e98ab0ded..bc5b7d500b18 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -18,9 +18,9 @@ //! Defines the logical data types of Arrow arrays. //! //! The most important things you might be looking for are: -//! * [`Schema`](crate::datatypes::Schema) to describe a schema. -//! * [`Field`](crate::datatypes::Field) to describe one field within a schema. -//! * [`DataType`](crate::datatypes::DataType) to describe the type of a field. +//! * [`Schema`] to describe a schema. +//! * [`Field`] to describe one field within a schema. +//! * [`DataType`] to describe the type of a field. pub use arrow_array::types::*; pub use arrow_array::{ArrowNativeTypeOp, ArrowNumericType, ArrowPrimitiveType}; diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index e347f99ee429..fb904c1908e6 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -160,7 +160,7 @@ //! //! # Compute Kernels //! -//! The [`compute`](compute) module provides optimised implementations of many common operations, +//! The [`compute`] module provides optimised implementations of many common operations, //! for example the `parse_strings` operation above could also be implemented as follows: //! //! ``` diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index 654cd0816039..ec9af2aa271a 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -58,7 +58,7 @@ pub enum Page { } impl Page { - /// Returns [`PageType`](crate::basic::PageType) for this page. + /// Returns [`PageType`] for this page. pub fn page_type(&self) -> PageType { match self { Page::DataPage { .. } => PageType::DATA_PAGE, @@ -85,7 +85,7 @@ impl Page { } } - /// Returns this page [`Encoding`](crate::basic::Encoding). + /// Returns this page [`Encoding`]. pub fn encoding(&self) -> Encoding { match self { Page::DataPage { encoding, .. } => *encoding, @@ -94,7 +94,7 @@ impl Page { } } - /// Returns optional [`Statistics`](crate::file::statistics::Statistics). + /// Returns optional [`Statistics`]. pub fn statistics(&self) -> Option<&Statistics> { match self { Page::DataPage { ref statistics, .. } => statistics.as_ref(), diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 3dac8ee55886..8eccf3408a55 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -76,7 +76,7 @@ impl<'a> TryFrom<&'a str> for SerializedFileReader { } } -/// Conversion into a [`RowIter`](crate::record::reader::RowIter) +/// Conversion into a [`RowIter`] /// using the full file schema over all row groups. 
impl IntoIterator for SerializedFileReader { type Item = Result; diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index 3416386c9797..5a1d8406575c 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -16,7 +16,7 @@ // under the License. //! Contains implementation of record assembly and converting Parquet types into -//! [`Row`](crate::record::Row)s. +//! [`Row`]s. use std::{collections::HashMap, fmt, sync::Arc}; @@ -618,7 +618,7 @@ impl fmt::Display for Reader { // Row iterators /// The enum Either with variants That represents a reference and a box of -/// [`FileReader`](crate::file::reader::FileReader). +/// [`FileReader`]. enum Either<'a> { Left(&'a dyn FileReader), Right(Box), @@ -633,7 +633,7 @@ impl<'a> Either<'a> { } } -/// Iterator of [`Row`](crate::record::Row)s. +/// Iterator of [`Row`]s. /// It is used either for a single row group to iterate over data in that row group, or /// an entire file with auto buffering of all row groups. pub struct RowIter<'a> { @@ -646,7 +646,7 @@ pub struct RowIter<'a> { } impl<'a> RowIter<'a> { - /// Creates a new iterator of [`Row`](crate::record::Row)s. + /// Creates a new iterator of [`Row`]s. fn new( file_reader: Option>, row_iter: Option, @@ -668,7 +668,7 @@ impl<'a> RowIter<'a> { } } - /// Creates iterator of [`Row`](crate::record::Row)s for all row groups in a + /// Creates iterator of [`Row`]s for all row groups in a /// file. pub fn from_file(proj: Option, reader: &'a dyn FileReader) -> Result { let either = Either::Left(reader); @@ -680,7 +680,7 @@ impl<'a> RowIter<'a> { Ok(Self::new(Some(either), None, descr)) } - /// Creates iterator of [`Row`](crate::record::Row)s for a specific row group. + /// Creates iterator of [`Row`]s for a specific row group. pub fn from_row_group( proj: Option, reader: &'a dyn RowGroupReader, @@ -694,8 +694,7 @@ impl<'a> RowIter<'a> { Ok(Self::new(None, Some(row_iter), descr)) } - /// Creates a iterator of [`Row`](crate::record::Row)s from a - /// [`FileReader`](crate::file::reader::FileReader) using the full file schema. + /// Creates a iterator of [`Row`]s from a [`FileReader`] using the full file schema. pub fn from_file_into(reader: Box) -> Self { let either = Either::Right(reader); let descr = either @@ -707,7 +706,7 @@ impl<'a> RowIter<'a> { Self::new(Some(either), None, descr) } - /// Tries to create a iterator of [`Row`](crate::record::Row)s using projections. + /// Tries to create a iterator of [`Row`]s using projections. /// Returns a error if a file reader is not the source of this iterator. /// /// The Projected schema can be a subset of or equal to the file schema, @@ -793,7 +792,7 @@ impl<'a> Iterator for RowIter<'a> { } } -/// Internal iterator of [`Row`](crate::record::Row)s for a reader. +/// Internal iterator of [`Row`]s for a reader. pub struct ReaderIter { root_reader: Reader, records_left: usize, diff --git a/parquet/src/schema/parser.rs b/parquet/src/schema/parser.rs index 9af0f328a74a..d589f8c1100a 100644 --- a/parquet/src/schema/parser.rs +++ b/parquet/src/schema/parser.rs @@ -17,7 +17,7 @@ //! Parquet schema parser. //! Provides methods to parse and validate string message type into Parquet -//! [`Type`](crate::schema::types::Type). +//! [`Type`]. //! //! # Example //! 
@@ -50,7 +50,7 @@ use crate::basic::{ use crate::errors::{ParquetError, Result}; use crate::schema::types::{Type, TypePtr}; -/// Parses message type as string into a Parquet [`Type`](crate::schema::types::Type) +/// Parses message type as string into a Parquet [`Type`] /// which, for example, could be used to extract individual columns. Returns Parquet /// general error when parsing or validation fails. pub fn parse_message_type(message_type: &str) -> Result { diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index 12624513ac6a..0c90c5405a2b 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -51,8 +51,7 @@ use crate::file::metadata::{ }; use crate::schema::types::Type; -/// Prints Parquet metadata [`ParquetMetaData`](crate::file::metadata::ParquetMetaData) -/// information. +/// Prints Parquet metadata [`ParquetMetaData`] information. #[allow(unused_must_use)] pub fn print_parquet_metadata(out: &mut dyn io::Write, metadata: &ParquetMetaData) { print_file_metadata(out, metadata.file_metadata()); @@ -68,8 +67,7 @@ pub fn print_parquet_metadata(out: &mut dyn io::Write, metadata: &ParquetMetaDat } } -/// Prints file metadata [`FileMetaData`](crate::file::metadata::FileMetaData) -/// information. +/// Prints file metadata [`FileMetaData`] information. #[allow(unused_must_use)] pub fn print_file_metadata(out: &mut dyn io::Write, file_metadata: &FileMetaData) { writeln!(out, "version: {}", file_metadata.version()); @@ -92,7 +90,7 @@ pub fn print_file_metadata(out: &mut dyn io::Write, file_metadata: &FileMetaData print_schema(out, schema); } -/// Prints Parquet [`Type`](crate::schema::types::Type) information. +/// Prints Parquet [`Type`] information. #[allow(unused_must_use)] pub fn print_schema(out: &mut dyn io::Write, tp: &Type) { // TODO: better if we can pass fmt::Write to Printer. diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index bed85268ff93..f0e1a7f27a8f 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -219,12 +219,12 @@ impl<'a> PrimitiveTypeBuilder<'a> { } } - /// Sets [`Repetition`](crate::basic::Repetition) for this field and returns itself. + /// Sets [`Repetition`] for this field and returns itself. pub fn with_repetition(self, repetition: Repetition) -> Self { Self { repetition, ..self } } - /// Sets [`ConvertedType`](crate::basic::ConvertedType) for this field and returns itself. + /// Sets [`ConvertedType`] for this field and returns itself. pub fn with_converted_type(self, converted_type: ConvertedType) -> Self { Self { converted_type, @@ -232,7 +232,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { } } - /// Sets [`LogicalType`](crate::basic::LogicalType) for this field and returns itself. + /// Sets [`LogicalType`] for this field and returns itself. /// If only the logical type is populated for a primitive type, the converted type /// will be automatically populated, and can thus be omitted. pub fn with_logical_type(self, logical_type: Option) -> Self { @@ -552,13 +552,13 @@ impl<'a> GroupTypeBuilder<'a> { } } - /// Sets [`Repetition`](crate::basic::Repetition) for this field and returns itself. + /// Sets [`Repetition`] for this field and returns itself. pub fn with_repetition(mut self, repetition: Repetition) -> Self { self.repetition = Some(repetition); self } - /// Sets [`ConvertedType`](crate::basic::ConvertedType) for this field and returns itself. + /// Sets [`ConvertedType`] for this field and returns itself. 
pub fn with_converted_type(self, converted_type: ConvertedType) -> Self { Self { converted_type, @@ -566,7 +566,7 @@ impl<'a> GroupTypeBuilder<'a> { } } - /// Sets [`LogicalType`](crate::basic::LogicalType) for this field and returns itself. + /// Sets [`LogicalType`] for this field and returns itself. pub fn with_logical_type(self, logical_type: Option) -> Self { Self { logical_type, @@ -629,18 +629,18 @@ impl BasicTypeInfo { self.repetition.is_some() } - /// Returns [`Repetition`](crate::basic::Repetition) value for the type. + /// Returns [`Repetition`] value for the type. pub fn repetition(&self) -> Repetition { assert!(self.repetition.is_some()); self.repetition.unwrap() } - /// Returns [`ConvertedType`](crate::basic::ConvertedType) value for the type. + /// Returns [`ConvertedType`] value for the type. pub fn converted_type(&self) -> ConvertedType { self.converted_type } - /// Returns [`LogicalType`](crate::basic::LogicalType) value for the type. + /// Returns [`LogicalType`] value for the type. pub fn logical_type(&self) -> Option { // Unlike ConvertedType, LogicalType cannot implement Copy, thus we clone it self.logical_type.clone() @@ -787,12 +787,12 @@ impl ColumnDescriptor { &self.path } - /// Returns self type [`Type`](crate::schema::types::Type) for this leaf column. + /// Returns self type [`Type`] for this leaf column. pub fn self_type(&self) -> &Type { self.primitive_type.as_ref() } - /// Returns self type [`TypePtr`](crate::schema::types::TypePtr) for this leaf + /// Returns self type [`TypePtr`] for this leaf /// column. pub fn self_type_ptr(&self) -> TypePtr { self.primitive_type.clone() @@ -803,12 +803,12 @@ impl ColumnDescriptor { self.primitive_type.name() } - /// Returns [`ConvertedType`](crate::basic::ConvertedType) for this column. + /// Returns [`ConvertedType`] for this column. pub fn converted_type(&self) -> ConvertedType { self.primitive_type.get_basic_info().converted_type() } - /// Returns [`LogicalType`](crate::basic::LogicalType) for this column. + /// Returns [`LogicalType`] for this column. pub fn logical_type(&self) -> Option { self.primitive_type.get_basic_info().logical_type() } @@ -928,14 +928,13 @@ impl SchemaDescriptor { self.leaves.len() } - /// Returns column root [`Type`](crate::schema::types::Type) for a leaf position. + /// Returns column root [`Type`] for a leaf position. pub fn get_column_root(&self, i: usize) -> &Type { let result = self.column_root_of(i); result.as_ref() } - /// Returns column root [`Type`](crate::schema::types::Type) pointer for a leaf - /// position. + /// Returns column root [`Type`] pointer for a leaf position. pub fn get_column_root_ptr(&self, i: usize) -> TypePtr { let result = self.column_root_of(i); result.clone() @@ -960,7 +959,7 @@ impl SchemaDescriptor { &self.schema.get_fields()[self.get_column_root_idx(i)] } - /// Returns schema as [`Type`](crate::schema::types::Type). + /// Returns schema as [`Type`]. 
pub fn root_schema(&self) -> &Type { self.schema.as_ref() } From bce0b418b69dcf0dab1fca3edbe5db1f5ca122a2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 21 Aug 2023 15:25:26 +0100 Subject: [PATCH 1175/1411] Add distinct kernels (#960) (#4438) (#4716) * Add distinct kernels (#960) (#4438) * Fixes * Add tests * Handle NullArray * Fix comparisons between scalar and empty array * Clippy * Review feedback --- arrow-array/src/array/boolean_array.rs | 9 + arrow-ord/src/cmp.rs | 304 +++++++++++++++++++++---- arrow-ord/src/partition.rs | 20 +- 3 files changed, 274 insertions(+), 59 deletions(-) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 0d9a1044be8e..4d19babe3e4b 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -437,6 +437,15 @@ impl>> FromIterator for BooleanArray } } +impl From for BooleanArray { + fn from(values: BooleanBuffer) -> Self { + Self { + values, + nulls: None, + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index aad61fa8f062..96f5aafd8697 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -33,6 +33,7 @@ use arrow_buffer::bit_util::ceil; use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer}; use arrow_schema::ArrowError; use arrow_select::take::take; +use std::ops::Not; #[derive(Debug, Copy, Clone)] enum Op { @@ -42,6 +43,8 @@ enum Op { LessEqual, Greater, GreaterEqual, + Distinct, + NotDistinct, } impl std::fmt::Display for Op { @@ -53,6 +56,8 @@ impl std::fmt::Display for Op { Op::LessEqual => write!(f, "<="), Op::Greater => write!(f, ">"), Op::GreaterEqual => write!(f, ">="), + Op::Distinct => write!(f, "IS DISTINCT FROM"), + Op::NotDistinct => write!(f, "IS NOT DISTINCT FROM"), } } } @@ -129,7 +134,43 @@ pub fn gt_eq(lhs: &dyn Datum, rhs: &dyn Datum) -> Result Result { + compare_op(Op::Distinct, lhs, rhs) +} + +/// Perform `left IS NOT DISTINCT FROM right` operation on two [`Datum`] +/// +/// [`not_distinct`] is similar to [`eq`], only differing in null handling. In particular, two +/// operands are considered `NOT DISTINCT` if they have the same value or if both of them +/// is NULL. The result of [`not_distinct`] is never NULL. +/// +/// For floating values like f32 and f64, this comparison produces an ordering in accordance to +/// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. +/// Note that totalOrder treats positive and negative zeros as different. If it is necessary +/// to treat them as equal, please normalize zeros before calling this kernel. 
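As a usage sketch of the null semantics documented above (an illustration, not part of the patch): the kernel names `distinct` and `not_distinct` and the array types come from this diff's own tests; the crate paths `arrow_array` and `arrow_ord::cmp` are assumed to match this release.

use arrow_array::{BooleanArray, Int32Array};
use arrow_ord::cmp::{distinct, not_distinct};

fn main() {
    // left  = [1, NULL, 3]
    // right = [1, NULL, 4]
    let left = Int32Array::from(vec![Some(1), None, Some(3)]);
    let right = Int32Array::from(vec![Some(1), None, Some(4)]);

    // IS DISTINCT FROM: two NULLs are not distinct from each other,
    // and the result is never NULL.
    let d = distinct(&left, &right).unwrap();
    assert_eq!(d, BooleanArray::from(vec![false, false, true]));

    // IS NOT DISTINCT FROM is the null-aware complement.
    let nd = not_distinct(&left, &right).unwrap();
    assert_eq!(nd, BooleanArray::from(vec![true, true, false]));
}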
+/// +/// Please refer to [`f32::total_cmp`] and [`f64::total_cmp`] +pub fn not_distinct( + lhs: &dyn Datum, + rhs: &dyn Datum, +) -> Result { + compare_op(Op::NotDistinct, lhs, rhs) +} + /// Perform `op` on the provided `Datum` +#[inline(never)] fn compare_op( op: Op, lhs: &dyn Datum, @@ -141,51 +182,114 @@ fn compare_op( let l_len = l.len(); let r_len = r.len(); - let l_nulls = l.logical_nulls(); - let r_nulls = r.logical_nulls(); - let (len, nulls) = match (l_s, r_s) { - (true, true) | (false, false) => { - if l_len != r_len { - return Err(ArrowError::InvalidArgumentError(format!( - "Cannot compare arrays of different lengths, got {l_len} vs {r_len}" - ))); - } - (l_len, NullBuffer::union(l_nulls.as_ref(), r_nulls.as_ref())) - } - (true, false) => match l_nulls.map(|x| x.null_count() != 0).unwrap_or_default() { - true => (r_len, Some(NullBuffer::new_null(r_len))), - false => (r_len, r_nulls), // Left is scalar and not null - }, - (false, true) => match r_nulls.map(|x| x.null_count() != 0).unwrap_or_default() { - true => (l_len, Some(NullBuffer::new_null(l_len))), - false => (l_len, l_nulls), // Right is scalar and not null - }, + if l_len != r_len && !l_s && !r_s { + return Err(ArrowError::InvalidArgumentError(format!( + "Cannot compare arrays of different lengths, got {l_len} vs {r_len}" + ))); + } + + let len = match l_s { + true => r_len, + false => l_len, }; + let l_nulls = l.logical_nulls(); + let r_nulls = r.logical_nulls(); + let l_v = l.as_any_dictionary_opt(); let l = l_v.map(|x| x.values().as_ref()).unwrap_or(l); + let l_t = l.data_type(); let r_v = r.as_any_dictionary_opt(); let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r); + let r_t = r.data_type(); + + if l_t != r_t || l_t.is_nested() { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid comparison operation: {l_t} {op} {r_t}" + ))); + } + + // Defer computation as may not be necessary + let values = || -> BooleanBuffer { + let d = downcast_primitive_array! { + (l, r) => apply(op, l.values().as_ref(), l_s, l_v, r.values().as_ref(), r_s, r_v), + (Boolean, Boolean) => apply(op, l.as_boolean(), l_s, l_v, r.as_boolean(), r_s, r_v), + (Utf8, Utf8) => apply(op, l.as_string::(), l_s, l_v, r.as_string::(), r_s, r_v), + (LargeUtf8, LargeUtf8) => apply(op, l.as_string::(), l_s, l_v, r.as_string::(), r_s, r_v), + (Binary, Binary) => apply(op, l.as_binary::(), l_s, l_v, r.as_binary::(), r_s, r_v), + (LargeBinary, LargeBinary) => apply(op, l.as_binary::(), l_s, l_v, r.as_binary::(), r_s, r_v), + (FixedSizeBinary(_), FixedSizeBinary(_)) => apply(op, l.as_fixed_size_binary(), l_s, l_v, r.as_fixed_size_binary(), r_s, r_v), + (Null, Null) => None, + _ => unreachable!(), + }; + d.unwrap_or_else(|| BooleanBuffer::new_unset(len)) + }; - let values = downcast_primitive_array! 
{ - (l, r) => apply(op, l.values().as_ref(), l_s, l_v, r.values().as_ref(), r_s, r_v), - (Boolean, Boolean) => apply(op, l.as_boolean(), l_s, l_v, r.as_boolean(), r_s, r_v), - (Utf8, Utf8) => apply(op, l.as_string::(), l_s, l_v, r.as_string::(), r_s, r_v), - (LargeUtf8, LargeUtf8) => apply(op, l.as_string::(), l_s, l_v, r.as_string::(), r_s, r_v), - (Binary, Binary) => apply(op, l.as_binary::(), l_s, l_v, r.as_binary::(), r_s, r_v), - (LargeBinary, LargeBinary) => apply(op, l.as_binary::(), l_s, l_v, r.as_binary::(), r_s, r_v), - (FixedSizeBinary(_), FixedSizeBinary(_)) => apply(op, l.as_fixed_size_binary(), l_s, l_v, r.as_fixed_size_binary(), r_s, r_v), - (l_t, r_t) => return Err(ArrowError::InvalidArgumentError(format!("Invalid comparison operation: {l_t} {op} {r_t}"))), - }.unwrap_or_else(|| { - let count = nulls.as_ref().map(|x| x.null_count()).unwrap_or_default(); - assert_eq!(count, len); // Sanity check - BooleanBuffer::new_unset(len) - }); - - assert_eq!(values.len(), len); // Sanity check - Ok(BooleanArray::new(values, nulls)) + let l_nulls = l_nulls.filter(|n| n.null_count() > 0); + let r_nulls = r_nulls.filter(|n| n.null_count() > 0); + Ok(match (l_nulls, l_s, r_nulls, r_s) { + (Some(l), true, Some(r), true) | (Some(l), false, Some(r), false) => { + // Either both sides are scalar or neither side is scalar + match op { + Op::Distinct => { + let values = values(); + let l = l.inner().bit_chunks().iter_padded(); + let r = r.inner().bit_chunks().iter_padded(); + let ne = values.bit_chunks().iter_padded(); + + let c = |((l, r), n)| ((l ^ r) | (l & r & n)); + let buffer = l.zip(r).zip(ne).map(c).collect(); + BooleanBuffer::new(buffer, 0, len).into() + } + Op::NotDistinct => { + let values = values(); + let l = l.inner().bit_chunks().iter_padded(); + let r = r.inner().bit_chunks().iter_padded(); + let e = values.bit_chunks().iter_padded(); + + let c = |((l, r), e)| u64::not(l | r) | (l & r & e); + let buffer = l.zip(r).zip(e).map(c).collect(); + BooleanBuffer::new(buffer, 0, len).into() + } + _ => BooleanArray::new(values(), NullBuffer::union(Some(&l), Some(&r))), + } + } + (Some(_), true, Some(a), false) | (Some(a), false, Some(_), true) => { + // Scalar is null, other side is non-scalar and nullable + match op { + Op::Distinct => a.into_inner().into(), + Op::NotDistinct => a.into_inner().not().into(), + _ => BooleanArray::new_null(len), + } + } + (Some(nulls), is_scalar, None, _) | (None, _, Some(nulls), is_scalar) => { + // Only one side is nullable + match is_scalar { + true => match op { + // Scalar is null, other side is not nullable + Op::Distinct => BooleanBuffer::new_set(len).into(), + Op::NotDistinct => BooleanBuffer::new_unset(len).into(), + _ => BooleanArray::new_null(len), + }, + false => match op { + Op::Distinct => { + let values = values(); + let l = nulls.inner().bit_chunks().iter_padded(); + let ne = values.bit_chunks().iter_padded(); + let c = |(l, n)| u64::not(l) | n; + let buffer = l.zip(ne).map(c).collect(); + BooleanBuffer::new(buffer, 0, len).into() + } + Op::NotDistinct => (nulls.inner() & &values()).into(), + _ => BooleanArray::new(values(), Some(nulls)), + }, + } + } + // Neither side is nullable + (None, _, None, _) => BooleanArray::new(values(), None), + }) } /// Perform a potentially vectored `op` on the provided `ArrayOrd` @@ -215,8 +319,12 @@ fn apply( assert_eq!(l_v.len(), r_v.len()); // Sanity check Some(match op { - Op::Equal => apply_op_vectored(l, &l_v, r, &r_v, false, T::is_eq), - Op::NotEqual => apply_op_vectored(l, &l_v, r, &r_v, true, 
T::is_eq), + Op::Equal | Op::NotDistinct => { + apply_op_vectored(l, &l_v, r, &r_v, false, T::is_eq) + } + Op::NotEqual | Op::Distinct => { + apply_op_vectored(l, &l_v, r, &r_v, true, T::is_eq) + } Op::Less => apply_op_vectored(l, &l_v, r, &r_v, false, T::is_lt), Op::LessEqual => apply_op_vectored(r, &r_v, l, &l_v, true, T::is_lt), Op::Greater => apply_op_vectored(r, &r_v, l, &l_v, false, T::is_lt), @@ -227,8 +335,8 @@ fn apply( let r_s = r_s.then(|| r_v.map(|x| x.normalized_keys()[0]).unwrap_or_default()); let buffer = match op { - Op::Equal => apply_op(l, l_s, r, r_s, false, T::is_eq), - Op::NotEqual => apply_op(l, l_s, r, r_s, true, T::is_eq), + Op::Equal | Op::NotDistinct => apply_op(l, l_s, r, r_s, false, T::is_eq), + Op::NotEqual | Op::Distinct => apply_op(l, l_s, r, r_s, true, T::is_eq), Op::Less => apply_op(l, l_s, r, r_s, false, T::is_lt), Op::LessEqual => apply_op(r, r_s, l, l_s, true, T::is_lt), Op::Greater => apply_op(r, r_s, l, l_s, false, T::is_lt), @@ -293,6 +401,8 @@ fn collect_bool(len: usize, neg: bool, f: impl Fn(usize) -> bool) -> BooleanBuff /// /// If l is scalar `l_s` will be `Some(idx)` where `idx` is the index of the scalar value in `l` /// If r is scalar `r_s` will be `Some(idx)` where `idx` is the index of the scalar value in `r` +/// +/// If `neg` is true the result of `op` will be negated fn apply_op( l: T, l_s: Option, @@ -311,7 +421,7 @@ fn apply_op( (Some(l_s), Some(r_s)) => { let a = l.value(l_s); let b = r.value(r_s); - std::iter::once(op(a, b)).collect() + std::iter::once(op(a, b) ^ neg).collect() } (Some(l_s), None) => { let v = l.value(l_s); @@ -486,4 +596,116 @@ mod tests { let r = eq(&a, &Scalar::new(&scalar)).unwrap(); assert_eq!(r.null_count(), 3); } + + #[test] + fn is_distinct_from_non_nulls() { + let left_int_array = Int32Array::from(vec![0, 1, 2, 3, 4]); + let right_int_array = Int32Array::from(vec![4, 3, 2, 1, 0]); + + assert_eq!( + BooleanArray::from(vec![true, true, false, true, true,]), + distinct(&left_int_array, &right_int_array).unwrap() + ); + assert_eq!( + BooleanArray::from(vec![false, false, true, false, false,]), + not_distinct(&left_int_array, &right_int_array).unwrap() + ); + } + + #[test] + fn is_distinct_from_nulls() { + // [0, 0, NULL, 0, 0, 0] + let left_int_array = Int32Array::new( + vec![0, 0, 1, 3, 0, 0].into(), + Some(NullBuffer::from(vec![true, true, false, true, true, true])), + ); + // [0, NULL, NULL, NULL, 0, NULL] + let right_int_array = Int32Array::new( + vec![0; 6].into(), + Some(NullBuffer::from(vec![ + true, false, false, false, true, false, + ])), + ); + + assert_eq!( + BooleanArray::from(vec![false, true, false, true, false, true,]), + distinct(&left_int_array, &right_int_array).unwrap() + ); + + assert_eq!( + BooleanArray::from(vec![true, false, true, false, true, false,]), + not_distinct(&left_int_array, &right_int_array).unwrap() + ); + } + + #[test] + fn test_distinct_scalar() { + let a = Int32Array::new_scalar(12); + let b = Int32Array::new_scalar(12); + assert!(!distinct(&a, &b).unwrap().value(0)); + assert!(not_distinct(&a, &b).unwrap().value(0)); + + let a = Int32Array::new_scalar(12); + let b = Int32Array::new_null(1); + assert!(distinct(&a, &b).unwrap().value(0)); + assert!(!not_distinct(&a, &b).unwrap().value(0)); + assert!(distinct(&b, &a).unwrap().value(0)); + assert!(!not_distinct(&b, &a).unwrap().value(0)); + + let b = Scalar::new(b); + assert!(distinct(&a, &b).unwrap().value(0)); + assert!(!not_distinct(&a, &b).unwrap().value(0)); + + assert!(!distinct(&b, &b).unwrap().value(0)); + 
assert!(not_distinct(&b, &b).unwrap().value(0)); + + let a = Int32Array::new( + vec![0, 1, 2, 3].into(), + Some(vec![false, false, true, true].into()), + ); + let expected = BooleanArray::from(vec![false, false, true, true]); + assert_eq!(distinct(&a, &b).unwrap(), expected); + assert_eq!(distinct(&b, &a).unwrap(), expected); + + let expected = BooleanArray::from(vec![true, true, false, false]); + assert_eq!(not_distinct(&a, &b).unwrap(), expected); + assert_eq!(not_distinct(&b, &a).unwrap(), expected); + + let b = Int32Array::new_scalar(1); + let expected = BooleanArray::from(vec![true; 4]); + assert_eq!(distinct(&a, &b).unwrap(), expected); + assert_eq!(distinct(&b, &a).unwrap(), expected); + let expected = BooleanArray::from(vec![false; 4]); + assert_eq!(not_distinct(&a, &b).unwrap(), expected); + assert_eq!(not_distinct(&b, &a).unwrap(), expected); + + let b = Int32Array::new_scalar(3); + let expected = BooleanArray::from(vec![true, true, true, false]); + assert_eq!(distinct(&a, &b).unwrap(), expected); + assert_eq!(distinct(&b, &a).unwrap(), expected); + let expected = BooleanArray::from(vec![false, false, false, true]); + assert_eq!(not_distinct(&a, &b).unwrap(), expected); + assert_eq!(not_distinct(&b, &a).unwrap(), expected); + } + + #[test] + fn test_scalar_negation() { + let a = Int32Array::new_scalar(54); + let b = Int32Array::new_scalar(54); + let r = eq(&a, &b).unwrap(); + assert!(r.value(0)); + + let r = neq(&a, &b).unwrap(); + assert!(!r.value(0)) + } + + #[test] + fn test_scalar_empty() { + let a = Int32Array::new_null(0); + let b = Int32Array::new_scalar(23); + let r = eq(&a, &b).unwrap(); + assert_eq!(r.len(), 0); + let r = eq(&b, &a).unwrap(); + assert_eq!(r.len(), 0); + } } diff --git a/arrow-ord/src/partition.rs b/arrow-ord/src/partition.rs index 52aa5ee8d0f1..0b8447989b8d 100644 --- a/arrow-ord/src/partition.rs +++ b/arrow-ord/src/partition.rs @@ -23,7 +23,7 @@ use arrow_array::{Array, ArrayRef}; use arrow_buffer::BooleanBuffer; use arrow_schema::ArrowError; -use crate::cmp::neq; +use crate::cmp::distinct; use crate::sort::SortColumn; /// A computed set of partitions, see [`partition`] @@ -157,23 +157,7 @@ fn find_boundaries(v: &dyn Array) -> Result { let slice_len = v.len() - 1; let v1 = v.slice(0, slice_len); let v2 = v.slice(1, slice_len); - - let array_ne = neq(&v1, &v2)?; - // Set if values have different non-NULL values - let values_ne = match array_ne.nulls().filter(|n| n.null_count() > 0) { - Some(n) => n.inner() & array_ne.values(), - None => array_ne.values().clone(), - }; - - Ok(match v.nulls().filter(|x| x.null_count() > 0) { - Some(n) => { - let n1 = n.inner().slice(0, slice_len); - let n2 = n.inner().slice(1, slice_len); - // Set if values_ne or the nullability has changed - &(&n1 ^ &n2) | &values_ne - } - None => values_ne, - }) + Ok(distinct(&v1, &v2)?.values().clone()) } /// Given a list of already sorted columns, find partition ranges that would partition From 90449ffb2ea6ceef43ce8fc97084b3373975f357 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 21 Aug 2023 16:31:53 +0100 Subject: [PATCH 1176/1411] Prepare Arrow 46.0.0 (#4720) --- CHANGELOG-old.md | 49 ++++++++++++ CHANGELOG.md | 133 ++++++++++++++++++++++--------- Cargo.toml | 32 ++++---- dev/release/update_change_log.sh | 4 +- 4 files changed, 164 insertions(+), 54 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 4d04f9515c44..c404133f564e 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,55 @@ # 
Historical Changelog +## [45.0.0](https://github.com/apache/arrow-rs/tree/45.0.0) (2023-07-30) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/44.0.0...45.0.0) + +**Breaking changes:** + +- Fix timezoned timestamp arithmetic [\#4546](https://github.com/apache/arrow-rs/pull/4546) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) + +**Implemented enhancements:** + +- Use FormatOptions in Const Contexts [\#4580](https://github.com/apache/arrow-rs/issues/4580) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Human Readable Duration Display [\#4554](https://github.com/apache/arrow-rs/issues/4554) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `BooleanBuilder`: Add `validity_slice` method for accessing validity bits [\#4535](https://github.com/apache/arrow-rs/issues/4535) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `FixedSizedListArray` for `length` kernel [\#4517](https://github.com/apache/arrow-rs/issues/4517) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `RowCoverter::convert` that targets an existing `Rows` [\#4479](https://github.com/apache/arrow-rs/issues/4479) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Panic `assertion failed: idx < self.len` when casting DictionaryArrays with nulls [\#4576](https://github.com/apache/arrow-rs/issues/4576) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- arrow-arith is\_null is buggy with NullArray [\#4565](https://github.com/apache/arrow-rs/issues/4565) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Incorrect Interval to Duration Casting [\#4553](https://github.com/apache/arrow-rs/issues/4553) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Too large validity buffer pre-allocation in `FixedSizeListBuilder::new` [\#4549](https://github.com/apache/arrow-rs/issues/4549) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Like with wildcards fail to match fields with new lines. 
[\#4547](https://github.com/apache/arrow-rs/issues/4547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Timestamp Interval Arithmetic Ignores Timezone [\#4457](https://github.com/apache/arrow-rs/issues/4457) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- refactor: simplify hour\_dyn\(\) with time\_fraction\_dyn\(\) [\#4588](https://github.com/apache/arrow-rs/pull/4588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Move from\_iter\_values to GenericByteArray [\#4586](https://github.com/apache/arrow-rs/pull/4586) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Mark GenericByteArray::new\_unchecked unsafe [\#4584](https://github.com/apache/arrow-rs/pull/4584) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Configurable Duration Display [\#4581](https://github.com/apache/arrow-rs/pull/4581) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix take\_bytes Null and Overflow Handling \(\#4576\) [\#4579](https://github.com/apache/arrow-rs/pull/4579) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move chrono-tz arithmetic tests to integration [\#4571](https://github.com/apache/arrow-rs/pull/4571) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Write Page Offset Index For All-Nan Pages [\#4567](https://github.com/apache/arrow-rs/pull/4567) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([MachaelLee](https://github.com/MachaelLee)) +- support NullArray un arith/boolean kernel [\#4566](https://github.com/apache/arrow-rs/pull/4566) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([smiklos](https://github.com/smiklos)) +- Remove Sync from arrow-flight example [\#4564](https://github.com/apache/arrow-rs/pull/4564) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Fix interval to duration casting \(\#4553\) [\#4562](https://github.com/apache/arrow-rs/pull/4562) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- docs: fix wrong parameter name [\#4559](https://github.com/apache/arrow-rs/pull/4559) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([SteveLauC](https://github.com/SteveLauC)) +- Fix FixedSizeListBuilder capacity \(\#4549\) [\#4552](https://github.com/apache/arrow-rs/pull/4552) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- docs: fix wrong inline code snippet in parquet document [\#4550](https://github.com/apache/arrow-rs/pull/4550) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([SteveLauC](https://github.com/SteveLauC)) +- fix multiline wildcard likes \(fixes \#4547\) [\#4548](https://github.com/apache/arrow-rs/pull/4548) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([nl5887](https://github.com/nl5887)) +- Provide default `is_empty` impl for `arrow::array::ArrayBuilder` [\#4543](https://github.com/apache/arrow-rs/pull/4543) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- Add RowConverter::append \(\#4479\) 
[\#4541](https://github.com/apache/arrow-rs/pull/4541) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Clarify GenericColumnReader::read\_records [\#4540](https://github.com/apache/arrow-rs/pull/4540) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Initial loongarch port [\#4538](https://github.com/apache/arrow-rs/pull/4538) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xiangzhai](https://github.com/xiangzhai)) +- Update proc-macro2 requirement from =1.0.64 to =1.0.66 [\#4537](https://github.com/apache/arrow-rs/pull/4537) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- add a validity slice access for boolean array builders [\#4536](https://github.com/apache/arrow-rs/pull/4536) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ChristianBeilschmidt](https://github.com/ChristianBeilschmidt)) +- use new num version instead of explicit num-complex dependency [\#4532](https://github.com/apache/arrow-rs/pull/4532) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mwlon](https://github.com/mwlon)) +- feat: Support `FixedSizedListArray` for `length` kernel [\#4520](https://github.com/apache/arrow-rs/pull/4520) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) ## [44.0.0](https://github.com/apache/arrow-rs/tree/44.0.0) (2023-07-14) [Full Changelog](https://github.com/apache/arrow-rs/compare/43.0.0...44.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c52c5843459..74f74bc3ef13 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,55 +19,116 @@ # Changelog -## [45.0.0](https://github.com/apache/arrow-rs/tree/45.0.0) (2023-07-30) +## [46.0.0](https://github.com/apache/arrow-rs/tree/46.0.0) (2023-08-21) -[Full Changelog](https://github.com/apache/arrow-rs/compare/44.0.0...45.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/45.0.0...46.0.0) **Breaking changes:** -- Fix timezoned timestamp arithmetic [\#4546](https://github.com/apache/arrow-rs/pull/4546) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alexandreyc](https://github.com/alexandreyc)) +- API improvement: `batches_to_flight_data` forces clone [\#4656](https://github.com/apache/arrow-rs/issues/4656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add AnyDictionary Abstraction and Take ArrayRef in DictionaryArray::with\_values [\#4707](https://github.com/apache/arrow-rs/pull/4707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup parquet type builders [\#4706](https://github.com/apache/arrow-rs/pull/4706) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Take kernel dyn Array [\#4705](https://github.com/apache/arrow-rs/pull/4705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve ergonomics of Scalar [\#4704](https://github.com/apache/arrow-rs/pull/4704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Datum based comparison kernels \(\#4596\) [\#4701](https://github.com/apache/arrow-rs/pull/4701) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
[[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Improve `Array` Logical Nullability [\#4691](https://github.com/apache/arrow-rs/pull/4691) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Validate ArrayData Buffer Alignment and Automatically Align IPC buffers \(\#4255\) [\#4681](https://github.com/apache/arrow-rs/pull/4681) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- More intuitive bool-to-string casting [\#4666](https://github.com/apache/arrow-rs/pull/4666) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fsdvh](https://github.com/fsdvh)) +- enhancement: batches\_to\_flight\_data use a schema ref as param. [\#4665](https://github.com/apache/arrow-rs/pull/4665) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([jackwener](https://github.com/jackwener)) +- fix: from\_thrift avoid panic when stats in invalid. [\#4642](https://github.com/apache/arrow-rs/pull/4642) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jackwener](https://github.com/jackwener)) +- bug: Add some missing field in row group metadata: ordinal, total co… [\#4636](https://github.com/apache/arrow-rs/pull/4636) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liurenjie1024](https://github.com/liurenjie1024)) +- Remove deprecated limit kernel [\#4597](https://github.com/apache/arrow-rs/pull/4597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Use FormatOptions in Const Contexts [\#4580](https://github.com/apache/arrow-rs/issues/4580) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Human Readable Duration Display [\#4554](https://github.com/apache/arrow-rs/issues/4554) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `BooleanBuilder`: Add `validity_slice` method for accessing validity bits [\#4535](https://github.com/apache/arrow-rs/issues/4535) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `FixedSizedListArray` for `length` kernel [\#4517](https://github.com/apache/arrow-rs/issues/4517) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `RowCoverter::convert` that targets an existing `Rows` [\#4479](https://github.com/apache/arrow-rs/issues/4479) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet: support setting the field\_id with an ArrowWriter [\#4702](https://github.com/apache/arrow-rs/issues/4702) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support references in i256 arithmetic ops [\#4694](https://github.com/apache/arrow-rs/issues/4694) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Precision-Loss Decimal Arithmetic [\#4664](https://github.com/apache/arrow-rs/issues/4664) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Faster i256 Division [\#4663](https://github.com/apache/arrow-rs/issues/4663) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `concat_batches` for 0 columns [\#4661](https://github.com/apache/arrow-rs/issues/4661) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `filter_record_batch` should support filtering record batch without columns 
[\#4647](https://github.com/apache/arrow-rs/issues/4647) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve speed of `lexicographical_partition_ranges` [\#4614](https://github.com/apache/arrow-rs/issues/4614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- object\_store: multipart ranges for HTTP [\#4612](https://github.com/apache/arrow-rs/issues/4612) +- Add Rank Function [\#4606](https://github.com/apache/arrow-rs/issues/4606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Datum Based Comparison Kernels [\#4596](https://github.com/apache/arrow-rs/issues/4596) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Convenience method to create `DataType::List` correctly [\#4544](https://github.com/apache/arrow-rs/issues/4544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Remove Deprecated Arithmetic Kernels [\#4481](https://github.com/apache/arrow-rs/issues/4481) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Equality kernel where null==null gives true [\#4438](https://github.com/apache/arrow-rs/issues/4438) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- Panic `assertion failed: idx < self.len` when casting DictionaryArrays with nulls [\#4576](https://github.com/apache/arrow-rs/issues/4576) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- arrow-arith is\_null is buggy with NullArray [\#4565](https://github.com/apache/arrow-rs/issues/4565) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Incorrect Interval to Duration Casting [\#4553](https://github.com/apache/arrow-rs/issues/4553) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Too large validity buffer pre-allocation in `FixedSizeListBuilder::new` [\#4549](https://github.com/apache/arrow-rs/issues/4549) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Like with wildcards fail to match fields with new lines. 
[\#4547](https://github.com/apache/arrow-rs/issues/4547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Timestamp Interval Arithmetic Ignores Timezone [\#4457](https://github.com/apache/arrow-rs/issues/4457) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet ArrowWriter Ignores Nulls in Dictionary Values [\#4690](https://github.com/apache/arrow-rs/issues/4690) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Schema Nullability Validation Fails to Account for Dictionary Nulls [\#4689](https://github.com/apache/arrow-rs/issues/4689) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Comparison Kernels Ignore Nulls in Dictionary Values [\#4688](https://github.com/apache/arrow-rs/issues/4688) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Casting List to String Ignores Format Options [\#4669](https://github.com/apache/arrow-rs/issues/4669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Double free in C Stream Interface [\#4659](https://github.com/apache/arrow-rs/issues/4659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- CI Failing On Packed SIMD [\#4651](https://github.com/apache/arrow-rs/issues/4651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `RowInterner::size()` much too low for high cardinality dictionary columns [\#4645](https://github.com/apache/arrow-rs/issues/4645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Decimal PrimitiveArray change datatype after try\_unary [\#4644](https://github.com/apache/arrow-rs/issues/4644) +- Better explanation in docs for Dictionary field encoding using RowConverter [\#4639](https://github.com/apache/arrow-rs/issues/4639) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `List(FixedSizeBinary)` array equality check may return wrong result [\#4637](https://github.com/apache/arrow-rs/issues/4637) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `arrow::compute::nullif` panics if `NullArray` is provided [\#4634](https://github.com/apache/arrow-rs/issues/4634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Empty lists in FixedSizeListArray::try\_new is not handled [\#4623](https://github.com/apache/arrow-rs/issues/4623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Bounds checking in `MutableBuffer::set_null_bits` can be bypassed [\#4620](https://github.com/apache/arrow-rs/issues/4620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- TypedDictionaryArray Misleading Null Behaviour [\#4616](https://github.com/apache/arrow-rs/issues/4616) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- bug: Parquet writer missing row group metadata fields such as `compressed_size`, `file offset`. [\#4610](https://github.com/apache/arrow-rs/issues/4610) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `new_null_array` generates an invalid union array [\#4600](https://github.com/apache/arrow-rs/issues/4600) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Footer parsing fails for very large parquet file. 
[\#4592](https://github.com/apache/arrow-rs/issues/4592) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- bug\(parquet\): Disabling global statistics but enabling for particular column breaks reading [\#4587](https://github.com/apache/arrow-rs/issues/4587) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `arrow::compute::concat` panics for dense union arrays with non-trivial type IDs [\#4578](https://github.com/apache/arrow-rs/issues/4578) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- \[object\_store\] when Create a AmazonS3 instance work with MinIO without set endpoint got error MissingRegion [\#4617](https://github.com/apache/arrow-rs/issues/4617) **Merged pull requests:** -- refactor: simplify hour\_dyn\(\) with time\_fraction\_dyn\(\) [\#4588](https://github.com/apache/arrow-rs/pull/4588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- Move from\_iter\_values to GenericByteArray [\#4586](https://github.com/apache/arrow-rs/pull/4586) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Mark GenericByteArray::new\_unchecked unsafe [\#4584](https://github.com/apache/arrow-rs/pull/4584) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Configurable Duration Display [\#4581](https://github.com/apache/arrow-rs/pull/4581) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix take\_bytes Null and Overflow Handling \(\#4576\) [\#4579](https://github.com/apache/arrow-rs/pull/4579) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Move chrono-tz arithmetic tests to integration [\#4571](https://github.com/apache/arrow-rs/pull/4571) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Write Page Offset Index For All-Nan Pages [\#4567](https://github.com/apache/arrow-rs/pull/4567) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([MachaelLee](https://github.com/MachaelLee)) -- support NullArray un arith/boolean kernel [\#4566](https://github.com/apache/arrow-rs/pull/4566) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([smiklos](https://github.com/smiklos)) -- Remove Sync from arrow-flight example [\#4564](https://github.com/apache/arrow-rs/pull/4564) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Fix interval to duration casting \(\#4553\) [\#4562](https://github.com/apache/arrow-rs/pull/4562) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- docs: fix wrong parameter name [\#4559](https://github.com/apache/arrow-rs/pull/4559) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([SteveLauC](https://github.com/SteveLauC)) -- Fix FixedSizeListBuilder capacity \(\#4549\) [\#4552](https://github.com/apache/arrow-rs/pull/4552) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- docs: fix wrong inline code snippet in parquet document [\#4550](https://github.com/apache/arrow-rs/pull/4550) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([SteveLauC](https://github.com/SteveLauC)) -- fix multiline wildcard likes \(fixes \#4547\) 
[\#4548](https://github.com/apache/arrow-rs/pull/4548) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([nl5887](https://github.com/nl5887)) -- Provide default `is_empty` impl for `arrow::array::ArrayBuilder` [\#4543](https://github.com/apache/arrow-rs/pull/4543) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- Add RowConverter::append \(\#4479\) [\#4541](https://github.com/apache/arrow-rs/pull/4541) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Clarify GenericColumnReader::read\_records [\#4540](https://github.com/apache/arrow-rs/pull/4540) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Initial loongarch port [\#4538](https://github.com/apache/arrow-rs/pull/4538) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xiangzhai](https://github.com/xiangzhai)) -- Update proc-macro2 requirement from =1.0.64 to =1.0.66 [\#4537](https://github.com/apache/arrow-rs/pull/4537) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- add a validity slice access for boolean array builders [\#4536](https://github.com/apache/arrow-rs/pull/4536) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ChristianBeilschmidt](https://github.com/ChristianBeilschmidt)) -- use new num version instead of explicit num-complex dependency [\#4532](https://github.com/apache/arrow-rs/pull/4532) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mwlon](https://github.com/mwlon)) -- feat: Support `FixedSizedListArray` for `length` kernel [\#4520](https://github.com/apache/arrow-rs/pull/4520) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Weijun-H](https://github.com/Weijun-H)) +- Add distinct kernels \(\#960\) \(\#4438\) [\#4716](https://github.com/apache/arrow-rs/pull/4716) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update parquet object\_store 0.7 [\#4715](https://github.com/apache/arrow-rs/pull/4715) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Support Field ID in ArrowWriter \(\#4702\) [\#4710](https://github.com/apache/arrow-rs/pull/4710) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Remove rank kernels [\#4703](https://github.com/apache/arrow-rs/pull/4703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support references in i256 arithmetic ops [\#4692](https://github.com/apache/arrow-rs/pull/4692) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Cleanup DynComparator \(\#2654\) [\#4687](https://github.com/apache/arrow-rs/pull/4687) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Separate metadata fetch from `ArrowReaderBuilder` construction \(\#4674\) [\#4676](https://github.com/apache/arrow-rs/pull/4676) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- cleanup some assert\(\) with error propagation [\#4673](https://github.com/apache/arrow-rs/pull/4673) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) +- Faster 
i256 Division \(2-100x\) \(\#4663\) [\#4672](https://github.com/apache/arrow-rs/pull/4672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix MSRV CI [\#4671](https://github.com/apache/arrow-rs/pull/4671) ([tustvold](https://github.com/tustvold)) +- Fix equality of nested nullable FixedSizeBinary \(\#4637\) [\#4670](https://github.com/apache/arrow-rs/pull/4670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use ArrayFormatter in cast kernel [\#4668](https://github.com/apache/arrow-rs/pull/4668) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: Improve API docs for FlightSQL metadata builders [\#4667](https://github.com/apache/arrow-rs/pull/4667) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Support `concat_batches` for 0 columns [\#4662](https://github.com/apache/arrow-rs/pull/4662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- fix ownership of c stream error [\#4660](https://github.com/apache/arrow-rs/pull/4660) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Minor: Fix illustration for dict encoding [\#4657](https://github.com/apache/arrow-rs/pull/4657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JayjeetAtGithub](https://github.com/JayjeetAtGithub)) +- minor: move comment to the correct location [\#4655](https://github.com/apache/arrow-rs/pull/4655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Update packed\_simd and run miri tests on simd code [\#4654](https://github.com/apache/arrow-rs/pull/4654) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- impl `From>` for `BufferBuilder` and `MutableBuffer` [\#4650](https://github.com/apache/arrow-rs/pull/4650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- Filter record batch with 0 columns [\#4648](https://github.com/apache/arrow-rs/pull/4648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Account for child `Bucket` size in OrderPreservingInterner [\#4646](https://github.com/apache/arrow-rs/pull/4646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Implement `Default`,`Extend` and `FromIterator` for `BufferBuilder` [\#4638](https://github.com/apache/arrow-rs/pull/4638) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- fix\(select\): handle `NullArray` in `nullif` [\#4635](https://github.com/apache/arrow-rs/pull/4635) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) +- Move `BufferBuilder` to `arrow-buffer` [\#4630](https://github.com/apache/arrow-rs/pull/4630) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- allow zero sized empty fixed [\#4626](https://github.com/apache/arrow-rs/pull/4626) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([smiklos](https://github.com/smiklos)) +- fix: compute\_dictionary\_mapping use wrong offsetSize 
[\#4625](https://github.com/apache/arrow-rs/pull/4625) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- impl `FromIterator` for `MutableBuffer` [\#4624](https://github.com/apache/arrow-rs/pull/4624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- expand docs for FixedSizeListArray [\#4622](https://github.com/apache/arrow-rs/pull/4622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([smiklos](https://github.com/smiklos)) +- fix\(buffer\): panic on end index overflow in `MutableBuffer::set_null_bits` [\#4621](https://github.com/apache/arrow-rs/pull/4621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) +- impl `Default` for `arrow_buffer::buffer::MutableBuffer` [\#4619](https://github.com/apache/arrow-rs/pull/4619) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- Minor: improve docs and add example for lexicographical\_partition\_ranges [\#4615](https://github.com/apache/arrow-rs/pull/4615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Cleanup sort [\#4613](https://github.com/apache/arrow-rs/pull/4613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add rank function \(\#4606\) [\#4609](https://github.com/apache/arrow-rs/pull/4609) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add more docs and examples for ListArray and OffsetsBuffer [\#4607](https://github.com/apache/arrow-rs/pull/4607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Simplify dictionary sort [\#4605](https://github.com/apache/arrow-rs/pull/4605) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Consolidate sort benchmarks [\#4604](https://github.com/apache/arrow-rs/pull/4604) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Don't Reorder Nulls in sort\_to\_indices \(\#4545\) [\#4603](https://github.com/apache/arrow-rs/pull/4603) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- fix\(data\): create child arrays of correct length when building a sparse union null array [\#4601](https://github.com/apache/arrow-rs/pull/4601) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) +- Use u32 metadata\_len when parsing footer of parquet. 
[\#4599](https://github.com/apache/arrow-rs/pull/4599) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Berrysoft](https://github.com/Berrysoft)) +- fix\(data\): map type ID to child index before indexing a union child array [\#4598](https://github.com/apache/arrow-rs/pull/4598) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) +- Remove deprecated arithmetic kernels \(\#4481\) [\#4594](https://github.com/apache/arrow-rs/pull/4594) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Test Disabled Page Statistics \(\#4587\) [\#4589](https://github.com/apache/arrow-rs/pull/4589) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Cleanup ArrayData::buffers [\#4583](https://github.com/apache/arrow-rs/pull/4583) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use contains\_nulls in ArrayData equality of byte arrays [\#4582](https://github.com/apache/arrow-rs/pull/4582) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Vectorized lexicographical\_partition\_ranges \(~80% faster\) [\#4575](https://github.com/apache/arrow-rs/pull/4575) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- chore: add datatype new\_list [\#4561](https://github.com/apache/arrow-rs/pull/4561) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fansehep](https://github.com/fansehep)) diff --git a/Cargo.toml b/Cargo.toml index ea64c1250747..b118c937ca36 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ exclude = [ ] [workspace.package] -version = "45.0.0" +version = "46.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -76,18 +76,18 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "45.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "45.0.0", path = "./arrow-arith" } -arrow-array = { version = "45.0.0", path = "./arrow-array" } -arrow-buffer = { version = "45.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "45.0.0", path = "./arrow-cast" } -arrow-csv = { version = "45.0.0", path = "./arrow-csv" } -arrow-data = { version = "45.0.0", path = "./arrow-data" } -arrow-ipc = { version = "45.0.0", path = "./arrow-ipc" } -arrow-json = { version = "45.0.0", path = "./arrow-json" } -arrow-ord = { version = "45.0.0", path = "./arrow-ord" } -arrow-row = { version = "45.0.0", path = "./arrow-row" } -arrow-schema = { version = "45.0.0", path = "./arrow-schema" } -arrow-select = { version = "45.0.0", path = "./arrow-select" } -arrow-string = { version = "45.0.0", path = "./arrow-string" } -parquet = { version = "45.0.0", path = "./parquet", default-features = false } +arrow = { version = "46.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "46.0.0", path = "./arrow-arith" } +arrow-array = { version = "46.0.0", path = "./arrow-array" } +arrow-buffer = { version = "46.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "46.0.0", path = "./arrow-cast" } +arrow-csv = { version = "46.0.0", path = "./arrow-csv" } +arrow-data = { version = "46.0.0", path = "./arrow-data" } +arrow-ipc = { version = "46.0.0", path = "./arrow-ipc" } +arrow-json = { version = "46.0.0", path = "./arrow-json" } +arrow-ord = 
{ version = "46.0.0", path = "./arrow-ord" } +arrow-row = { version = "46.0.0", path = "./arrow-row" } +arrow-schema = { version = "46.0.0", path = "./arrow-schema" } +arrow-select = { version = "46.0.0", path = "./arrow-select" } +arrow-string = { version = "46.0.0", path = "./arrow-string" } +parquet = { version = "46.0.0", path = "./parquet", default-features = false } diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 89ef6ebc111f..0b62e97383c2 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="44.0.0" -FUTURE_RELEASE="45.0.0" +SINCE_TAG="45.0.0" +FUTURE_RELEASE="46.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From d9381c66c25f52bb8d6fbd9503947c804a89a37a Mon Sep 17 00:00:00 2001 From: Alexandre Crayssac Date: Thu, 24 Aug 2023 14:06:38 +0200 Subject: [PATCH 1177/1411] Add `IpcError` variant to replace some uses of `IoError`that don't have underlying `std::io::Error` (#4726) --- arrow-flight/examples/flight_sql_server.rs | 2 +- arrow-flight/src/bin/flight_sql_client.rs | 10 ++-- arrow-flight/src/sql/client.rs | 16 ++++--- arrow-flight/tests/encode_decode.rs | 2 +- arrow-ipc/src/convert.rs | 4 +- arrow-ipc/src/reader.rs | 54 +++++++++++----------- arrow-ipc/src/writer.rs | 8 ++-- arrow-schema/src/error.rs | 10 ++-- arrow/src/ffi_stream.rs | 2 +- 9 files changed, 58 insertions(+), 50 deletions(-) diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 08a36bc49ea8..1e99957390d8 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -802,7 +802,7 @@ mod tests { fn endpoint(uri: String) -> Result { let endpoint = Endpoint::new(uri) - .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? + .map_err(|_| ArrowError::IpcError("Cannot create endpoint".to_string()))? .connect_timeout(Duration::from_secs(20)) .timeout(Duration::from_secs(20)) .tcp_nodelay(true) // Disable Nagle's Algorithm since we don't want packets to wait diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs index e5aacc2e779a..20c8062f899e 100644 --- a/arrow-flight/src/bin/flight_sql_client.rs +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -151,7 +151,7 @@ async fn setup_client( let protocol = if args.tls { "https" } else { "http" }; let mut endpoint = Endpoint::new(format!("{}://{}:{}", protocol, args.host, port)) - .map_err(|_| ArrowError::IoError("Cannot create endpoint".to_string()))? + .map_err(|_| ArrowError::IpcError("Cannot create endpoint".to_string()))? 
.connect_timeout(Duration::from_secs(20)) .timeout(Duration::from_secs(20)) .tcp_nodelay(true) // Disable Nagle's Algorithm since we don't want packets to wait @@ -162,15 +162,15 @@ async fn setup_client( if args.tls { let tls_config = ClientTlsConfig::new(); - endpoint = endpoint - .tls_config(tls_config) - .map_err(|_| ArrowError::IoError("Cannot create TLS endpoint".to_string()))?; + endpoint = endpoint.tls_config(tls_config).map_err(|_| { + ArrowError::IpcError("Cannot create TLS endpoint".to_string()) + })?; } let channel = endpoint .connect() .await - .map_err(|e| ArrowError::IoError(format!("Cannot connect to endpoint: {e}")))?; + .map_err(|e| ArrowError::IpcError(format!("Cannot connect to endpoint: {e}")))?; let mut client = FlightSqlServiceClient::new(channel); info!("connected"); diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index d661c9640908..4b1f38ebcbb7 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -150,7 +150,7 @@ impl FlightSqlServiceClient { .flight_client .handshake(req) .await - .map_err(|e| ArrowError::IoError(format!("Can't handshake {e}")))?; + .map_err(|e| ArrowError::IpcError(format!("Can't handshake {e}")))?; if let Some(auth) = resp.metadata().get("authorization") { let auth = auth.to_str().map_err(|_| { ArrowError::ParseError("Can't read auth header".to_string()) @@ -390,16 +390,20 @@ impl FlightSqlServiceClient { ) -> Result, ArrowError> { for (k, v) in &self.headers { let k = AsciiMetadataKey::from_str(k.as_str()).map_err(|e| { - ArrowError::IoError(format!("Cannot convert header key \"{k}\": {e}")) + ArrowError::ParseError(format!("Cannot convert header key \"{k}\": {e}")) })?; let v = v.parse().map_err(|e| { - ArrowError::IoError(format!("Cannot convert header value \"{v}\": {e}")) + ArrowError::ParseError(format!( + "Cannot convert header value \"{v}\": {e}" + )) })?; req.metadata_mut().insert(k, v); } if let Some(token) = &self.token { let val = format!("Bearer {token}").parse().map_err(|e| { - ArrowError::IoError(format!("Cannot convert token to header value: {e}")) + ArrowError::ParseError(format!( + "Cannot convert token to header value: {e}" + )) })?; req.metadata_mut().insert("authorization", val); } @@ -504,11 +508,11 @@ impl PreparedStatement { } fn decode_error_to_arrow_error(err: prost::DecodeError) -> ArrowError { - ArrowError::IoError(err.to_string()) + ArrowError::IpcError(err.to_string()) } fn status_to_arrow_error(status: tonic::Status) -> ArrowError { - ArrowError::IoError(format!("{status:?}")) + ArrowError::IpcError(format!("{status:?}")) } // A polymorphic structure to natively represent different types of data contained in `FlightData` diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index 4f1a8e667ffc..71bcf4e0521a 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -386,7 +386,7 @@ async fn test_mismatched_schema_message() { do_test( make_primitive_batch(5), make_dictionary_batch(3), - "Error decoding ipc RecordBatch: Io error: Invalid data for schema", + "Error decoding ipc RecordBatch: Schema error: Invalid data for schema", ) .await; diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 07f716dea843..3569562af228 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -150,12 +150,12 @@ pub fn try_schema_from_flatbuffer_bytes(bytes: &[u8]) -> Result Result Result ArrayReader<'a> { fn next_node(&mut self, field: &Field) -> Result<&'a FieldNode, 
ArrowError> { self.nodes.next().ok_or_else(|| { - ArrowError::IoError(format!( + ArrowError::SchemaError(format!( "Invalid data for schema. {} refers to node not found in schema", field )) @@ -402,10 +402,10 @@ pub fn read_record_batch( metadata: &MetadataVersion, ) -> Result { let buffers = batch.buffers().ok_or_else(|| { - ArrowError::IoError("Unable to get buffers from IPC RecordBatch".to_string()) + ArrowError::IpcError("Unable to get buffers from IPC RecordBatch".to_string()) })?; let field_nodes = batch.nodes().ok_or_else(|| { - ArrowError::IoError("Unable to get field nodes from IPC RecordBatch".to_string()) + ArrowError::IpcError("Unable to get field nodes from IPC RecordBatch".to_string()) })?; let batch_compression = batch.compression(); let compression = batch_compression @@ -462,7 +462,7 @@ pub fn read_dictionary( metadata: &crate::MetadataVersion, ) -> Result<(), ArrowError> { if batch.isDelta() { - return Err(ArrowError::IoError( + return Err(ArrowError::InvalidArgumentError( "delta dictionary batches not supported".to_string(), )); } @@ -569,14 +569,14 @@ impl FileReader { let mut magic_buffer: [u8; 6] = [0; 6]; reader.read_exact(&mut magic_buffer)?; if magic_buffer != super::ARROW_MAGIC { - return Err(ArrowError::IoError( + return Err(ArrowError::ParseError( "Arrow file does not contain correct header".to_string(), )); } reader.seek(SeekFrom::End(-6))?; reader.read_exact(&mut magic_buffer)?; if magic_buffer != super::ARROW_MAGIC { - return Err(ArrowError::IoError( + return Err(ArrowError::ParseError( "Arrow file does not contain correct footer".to_string(), )); } @@ -592,11 +592,11 @@ impl FileReader { reader.read_exact(&mut footer_data)?; let footer = crate::root_as_footer(&footer_data[..]).map_err(|err| { - ArrowError::IoError(format!("Unable to get root as footer: {err:?}")) + ArrowError::ParseError(format!("Unable to get root as footer: {err:?}")) })?; let blocks = footer.recordBatches().ok_or_else(|| { - ArrowError::IoError( + ArrowError::ParseError( "Unable to get record batches from IPC Footer".to_string(), ) })?; @@ -633,7 +633,9 @@ impl FileReader { reader.read_exact(&mut block_data)?; let message = crate::root_as_message(&block_data[..]).map_err(|err| { - ArrowError::IoError(format!("Unable to get root as message: {err:?}")) + ArrowError::ParseError(format!( + "Unable to get root as message: {err:?}" + )) })?; match message.header_type() { @@ -657,7 +659,7 @@ impl FileReader { )?; } t => { - return Err(ArrowError::IoError(format!( + return Err(ArrowError::ParseError(format!( "Expecting DictionaryBatch in dictionary blocks, found {t:?}." 
))); } @@ -705,7 +707,7 @@ impl FileReader { /// Sets the current block to the index, allowing random reads pub fn set_index(&mut self, index: usize) -> Result<(), ArrowError> { if index >= self.total_blocks { - Err(ArrowError::IoError(format!( + Err(ArrowError::InvalidArgumentError(format!( "Cannot set batch to index {} from {} total batches", index, self.total_blocks ))) @@ -732,25 +734,25 @@ impl FileReader { let mut block_data = vec![0; meta_len as usize]; self.reader.read_exact(&mut block_data)?; let message = crate::root_as_message(&block_data[..]).map_err(|err| { - ArrowError::IoError(format!("Unable to get root as footer: {err:?}")) + ArrowError::ParseError(format!("Unable to get root as footer: {err:?}")) })?; // some old test data's footer metadata is not set, so we account for that if self.metadata_version != crate::MetadataVersion::V1 && message.version() != self.metadata_version { - return Err(ArrowError::IoError( + return Err(ArrowError::IpcError( "Could not read IPC message as metadata versions mismatch".to_string(), )); } match message.header_type() { - crate::MessageHeader::Schema => Err(ArrowError::IoError( + crate::MessageHeader::Schema => Err(ArrowError::IpcError( "Not expecting a schema when messages are read".to_string(), )), crate::MessageHeader::RecordBatch => { let batch = message.header_as_record_batch().ok_or_else(|| { - ArrowError::IoError( + ArrowError::IpcError( "Unable to read IPC message as record batch".to_string(), ) })?; @@ -774,7 +776,7 @@ impl FileReader { crate::MessageHeader::NONE => { Ok(None) } - t => Err(ArrowError::IoError(format!( + t => Err(ArrowError::InvalidArgumentError(format!( "Reading types other than record batches not yet supported, unable to read {t:?}" ))), } @@ -886,11 +888,11 @@ impl StreamReader { reader.read_exact(&mut meta_buffer)?; let message = crate::root_as_message(meta_buffer.as_slice()).map_err(|err| { - ArrowError::IoError(format!("Unable to get root as message: {err:?}")) + ArrowError::ParseError(format!("Unable to get root as message: {err:?}")) })?; // message header is a Schema, so read it let ipc_schema: crate::Schema = message.header_as_schema().ok_or_else(|| { - ArrowError::IoError("Unable to read IPC message as schema".to_string()) + ArrowError::ParseError("Unable to read IPC message as schema".to_string()) })?; let schema = crate::convert::fb_to_schema(ipc_schema); @@ -965,16 +967,16 @@ impl StreamReader { let vecs = &meta_buffer.to_vec(); let message = crate::root_as_message(vecs).map_err(|err| { - ArrowError::IoError(format!("Unable to get root as message: {err:?}")) + ArrowError::ParseError(format!("Unable to get root as message: {err:?}")) })?; match message.header_type() { - crate::MessageHeader::Schema => Err(ArrowError::IoError( + crate::MessageHeader::Schema => Err(ArrowError::IpcError( "Not expecting a schema when messages are read".to_string(), )), crate::MessageHeader::RecordBatch => { let batch = message.header_as_record_batch().ok_or_else(|| { - ArrowError::IoError( + ArrowError::IpcError( "Unable to read IPC message as record batch".to_string(), ) })?; @@ -986,7 +988,7 @@ impl StreamReader { } crate::MessageHeader::DictionaryBatch => { let batch = message.header_as_dictionary_batch().ok_or_else(|| { - ArrowError::IoError( + ArrowError::IpcError( "Unable to read IPC message as dictionary batch".to_string(), ) })?; @@ -1004,7 +1006,7 @@ impl StreamReader { crate::MessageHeader::NONE => { Ok(None) } - t => Err(ArrowError::IoError( + t => Err(ArrowError::InvalidArgumentError( format!("Reading types other 
than record batches not yet supported, unable to read {t:?} ") )), } diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 1c56613d8f24..9c418d76e485 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -757,7 +757,7 @@ impl FileWriter { /// Write a record batch to the file pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { if self.finished { - return Err(ArrowError::IoError( + return Err(ArrowError::IpcError( "Cannot write record batch to file writer as it is closed".to_string(), )); } @@ -794,7 +794,7 @@ impl FileWriter { /// Write footer and closing tag, then mark the writer as done pub fn finish(&mut self) -> Result<(), ArrowError> { if self.finished { - return Err(ArrowError::IoError( + return Err(ArrowError::IpcError( "Cannot write footer to file writer as it is closed".to_string(), )); } @@ -909,7 +909,7 @@ impl StreamWriter { /// Write a record batch to the stream pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { if self.finished { - return Err(ArrowError::IoError( + return Err(ArrowError::IpcError( "Cannot write record batch to stream writer as it is closed".to_string(), )); } @@ -930,7 +930,7 @@ impl StreamWriter { /// Write continuation bytes, and mark the stream as done pub fn finish(&mut self) -> Result<(), ArrowError> { if self.finished { - return Err(ArrowError::IoError( + return Err(ArrowError::IpcError( "Cannot write footer to stream writer as it is closed".to_string(), )); } diff --git a/arrow-schema/src/error.rs b/arrow-schema/src/error.rs index cd236c0871a6..8ea533db89af 100644 --- a/arrow-schema/src/error.rs +++ b/arrow-schema/src/error.rs @@ -35,7 +35,8 @@ pub enum ArrowError { DivideByZero, CsvError(String), JsonError(String), - IoError(String), + IoError(String, std::io::Error), + IpcError(String), InvalidArgumentError(String), ParquetError(String), /// Error during import or export to/from the C Data Interface @@ -53,7 +54,7 @@ impl ArrowError { impl From for ArrowError { fn from(error: std::io::Error) -> Self { - ArrowError::IoError(error.to_string()) + ArrowError::IoError(error.to_string(), error) } } @@ -65,7 +66,7 @@ impl From for ArrowError { impl From> for ArrowError { fn from(error: std::io::IntoInnerError) -> Self { - ArrowError::IoError(error.to_string()) + ArrowError::IoError(error.to_string(), error.into()) } } @@ -84,7 +85,8 @@ impl Display for ArrowError { ArrowError::DivideByZero => write!(f, "Divide by zero error"), ArrowError::CsvError(desc) => write!(f, "Csv error: {desc}"), ArrowError::JsonError(desc) => write!(f, "Json error: {desc}"), - ArrowError::IoError(desc) => write!(f, "Io error: {desc}"), + ArrowError::IoError(desc, _) => write!(f, "Io error: {desc}"), + ArrowError::IpcError(desc) => write!(f, "Ipc error: {desc}"), ArrowError::InvalidArgumentError(desc) => { write!(f, "Invalid argument error: {desc}") } diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index a9d2e8ab6bf2..7005cadc623c 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -258,7 +258,7 @@ fn get_error_code(err: &ArrowError) -> i32 { match err { ArrowError::NotYetImplemented(_) => ENOSYS, ArrowError::MemoryError(_) => ENOMEM, - ArrowError::IoError(_) => EIO, + ArrowError::IoError(_, _) => EIO, _ => EINVAL, } } From dfb1ea2469c9edf399e944821c6f00130cdf8016 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 24 Aug 2023 19:54:13 +0100 Subject: [PATCH 1178/1411] Fix new clippy lints (#4734) * Fix new clippy lints * 
More clippy * Even more clippy * Clippy --- arrow-array/src/array/dictionary_array.rs | 5 +- arrow-array/src/array/primitive_array.rs | 9 +- arrow-array/src/array/run_array.rs | 7 +- arrow-array/src/array/string_array.rs | 4 +- arrow-array/src/record_batch.rs | 2 +- arrow-array/src/run_iterator.rs | 4 +- arrow-array/src/trusted_len.rs | 2 +- arrow-buffer/src/native.rs | 2 +- arrow-buffer/src/util/bit_chunk_iterator.rs | 2 +- arrow-cast/src/cast.rs | 47 +--- arrow-cast/src/parse.rs | 14 +- arrow-cast/src/pretty.rs | 2 +- arrow-flight/src/utils.rs | 4 +- arrow-integration-testing/tests/ipc_reader.rs | 8 +- arrow-integration-testing/tests/ipc_writer.rs | 28 ++- arrow-ipc/src/convert.rs | 2 +- arrow-ipc/src/reader.rs | 4 +- arrow-ipc/src/writer.rs | 2 +- arrow-json/src/writer.rs | 10 +- arrow-ord/src/comparison.rs | 222 +++++------------- arrow-ord/src/partition.rs | 6 +- arrow-ord/src/sort.rs | 159 +++++-------- arrow-row/src/interner.rs | 2 +- arrow-select/src/concat.rs | 5 +- arrow-select/src/filter.rs | 2 +- arrow-string/src/length.rs | 1 + arrow-string/src/like.rs | 50 +--- arrow-string/src/regexp.rs | 10 +- arrow/tests/array_equal.rs | 6 +- object_store/src/memory.rs | 2 +- object_store/src/util.rs | 2 +- parquet/src/arrow/array_reader/byte_array.rs | 4 +- .../array_reader/byte_array_dictionary.rs | 2 +- parquet/src/arrow/arrow_writer/mod.rs | 6 +- parquet/src/arrow/buffer/bit_util.rs | 2 +- parquet/src/arrow/buffer/offset_buffer.rs | 4 +- parquet/src/column/writer/mod.rs | 13 +- parquet/src/encodings/decoding.rs | 50 ++-- parquet/src/file/serialized_reader.rs | 37 +-- parquet/src/record/reader.rs | 39 ++- parquet/src/schema/types.rs | 4 +- 41 files changed, 271 insertions(+), 515 deletions(-) diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index ed043754da4b..5896cf02dfaa 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -800,10 +800,7 @@ pub struct TypedDictionaryArray<'a, K: ArrowDictionaryKeyType, V> { // Manually implement `Clone` to avoid `V: Clone` type constraint impl<'a, K: ArrowDictionaryKeyType, V> Clone for TypedDictionaryArray<'a, K, V> { fn clone(&self) -> Self { - Self { - dictionary: self.dictionary, - values: self.values, - } + *self } } diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 0c32279640b2..4c07e81468aa 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1580,7 +1580,7 @@ mod tests { assert_eq!(3, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); - let formatted = vec!["00:00:00.001", "10:30:00.005", "23:59:59.210"]; + let formatted = ["00:00:00.001", "10:30:00.005", "23:59:59.210"]; for (i, formatted) in formatted.iter().enumerate().take(3) { // check that we can't create dates or datetimes from time instances assert_eq!(None, arr.value_as_datetime(i)); @@ -1604,7 +1604,7 @@ mod tests { assert_eq!(3, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); - let formatted = vec!["00:00:00.001", "10:30:00.005", "23:59:59.210"]; + let formatted = ["00:00:00.001", "10:30:00.005", "23:59:59.210"]; for (i, item) in formatted.iter().enumerate().take(3) { // check that we can't create dates or datetimes from time instances assert_eq!(None, arr.value_as_datetime(i)); @@ -2219,7 +2219,7 @@ mod tests { #[test] fn test_decimal_from_iter_values() { - let array = Decimal128Array::from_iter_values(vec![-100, 0, 
101].into_iter()); + let array = Decimal128Array::from_iter_values(vec![-100, 0, 101]); assert_eq!(array.len(), 3); assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); assert_eq!(-100_i128, array.value(0)); @@ -2419,8 +2419,7 @@ mod tests { expected = "Trying to access an element at index 4 from a PrimitiveArray of length 3" )] fn test_fixed_size_binary_array_get_value_index_out_of_bound() { - let array = Decimal128Array::from_iter_values(vec![-100, 0, 101].into_iter()); - + let array = Decimal128Array::from(vec![-100, 0, 101]); array.value(4); } diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 30cefaeb4d46..ba6986c28463 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -537,10 +537,7 @@ pub struct TypedRunArray<'a, R: RunEndIndexType, V> { // Manually implement `Clone` to avoid `V: Clone` type constraint impl<'a, R: RunEndIndexType, V> Clone for TypedRunArray<'a, R, V> { fn clone(&self) -> Self { - Self { - run_array: self.run_array, - values: self.values, - } + *self } } @@ -1093,7 +1090,7 @@ mod tests { let values = Int32Array::from(vec![Some(0), None, Some(1), None]); let array = RunArray::try_new(&run, &values).unwrap(); - let expected = vec![ + let expected = [ true, true, true, false, false, false, true, true, true, false, false, false, ]; diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 9694cd2d4eec..cac4651f4496 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -324,14 +324,14 @@ mod tests { #[test] fn test_string_array_from_iter_values() { - let data = vec!["hello", "hello2"]; + let data = ["hello", "hello2"]; let array1 = StringArray::from_iter_values(data.iter()); assert_eq!(array1.value(0), "hello"); assert_eq!(array1.value(1), "hello2"); // Also works with String types. 
- let data2: Vec = vec!["goodbye".into(), "goodbye2".into()]; + let data2 = ["goodbye".to_string(), "goodbye2".to_string()]; let array2 = StringArray::from_iter_values(data2.iter()); assert_eq!(array2.value(0), "goodbye"); diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 886d00e0c2a9..27804447fba6 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -757,7 +757,7 @@ mod tests { )))) .add_child_data(a2_child.into_data()) .len(2) - .add_buffer(Buffer::from(vec![0i32, 3, 4].to_byte_slice())) + .add_buffer(Buffer::from([0i32, 3, 4].to_byte_slice())) .build() .unwrap(); let a2: ArrayRef = Arc::new(ListArray::from(a2)); diff --git a/arrow-array/src/run_iterator.rs b/arrow-array/src/run_iterator.rs index 60022113c3dd..489aabf4756a 100644 --- a/arrow-array/src/run_iterator.rs +++ b/arrow-array/src/run_iterator.rs @@ -237,7 +237,7 @@ mod tests { Some(72), ]; let mut builder = PrimitiveRunBuilder::::new(); - builder.extend(input_vec.clone().into_iter()); + builder.extend(input_vec.iter().copied()); let ree_array = builder.finish(); let ree_array = ree_array.downcast::().unwrap(); @@ -261,7 +261,7 @@ mod tests { Some(72), ]; let mut builder = PrimitiveRunBuilder::::new(); - builder.extend(input_vec.into_iter()); + builder.extend(input_vec); let ree_array = builder.finish(); let ree_array = ree_array.downcast::().unwrap(); diff --git a/arrow-array/src/trusted_len.rs b/arrow-array/src/trusted_len.rs index fdec18b78781..781cad38f7e9 100644 --- a/arrow-array/src/trusted_len.rs +++ b/arrow-array/src/trusted_len.rs @@ -63,7 +63,7 @@ mod tests { #[test] fn trusted_len_unzip_good() { - let vec = vec![Some(1u32), None]; + let vec = [Some(1u32), None]; let (null, buffer) = unsafe { trusted_len_unzip(vec.iter()) }; assert_eq!(null.as_slice(), &[0b00000001]); assert_eq!(buffer.as_slice(), &[1u8, 0, 0, 0, 0, 0, 0, 0]); diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs index 8fe6cf2b7894..38074a8dc26c 100644 --- a/arrow-buffer/src/native.rs +++ b/arrow-buffer/src/native.rs @@ -222,7 +222,7 @@ pub trait ToByteSlice { impl ToByteSlice for [T] { #[inline] fn to_byte_slice(&self) -> &[u8] { - let raw_ptr = self.as_ptr() as *const T as *const u8; + let raw_ptr = self.as_ptr() as *const u8; unsafe { std::slice::from_raw_parts(raw_ptr, std::mem::size_of_val(self)) } } } diff --git a/arrow-buffer/src/util/bit_chunk_iterator.rs b/arrow-buffer/src/util/bit_chunk_iterator.rs index 3d9632e73229..6830acae94a1 100644 --- a/arrow-buffer/src/util/bit_chunk_iterator.rs +++ b/arrow-buffer/src/util/bit_chunk_iterator.rs @@ -157,7 +157,7 @@ impl<'a> UnalignedBitChunk<'a> { self.prefix .into_iter() .chain(self.chunks.iter().cloned()) - .chain(self.suffix.into_iter()) + .chain(self.suffix) } /// Counts the number of ones diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 23b7a4b5a05d..7b8e6144bb49 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -3428,50 +3428,24 @@ mod tests { macro_rules! 
generate_cast_test_case { ($INPUT_ARRAY: expr, $OUTPUT_TYPE_ARRAY: ident, $OUTPUT_TYPE: expr, $OUTPUT_VALUES: expr) => { + let output = $OUTPUT_TYPE_ARRAY::from($OUTPUT_VALUES) + .with_data_type($OUTPUT_TYPE.clone()); + // assert cast type let input_array_type = $INPUT_ARRAY.data_type(); assert!(can_cast_types(input_array_type, $OUTPUT_TYPE)); - let casted_array = cast($INPUT_ARRAY, $OUTPUT_TYPE).unwrap(); - let result_array = casted_array - .as_any() - .downcast_ref::<$OUTPUT_TYPE_ARRAY>() - .unwrap(); - assert_eq!($OUTPUT_TYPE, result_array.data_type()); - assert_eq!(result_array.len(), $OUTPUT_VALUES.len()); - for (i, x) in $OUTPUT_VALUES.iter().enumerate() { - match x { - Some(x) => { - assert!(!result_array.is_null(i)); - assert_eq!(result_array.value(i), *x); - } - None => { - assert!(result_array.is_null(i)); - } - } - } + let result = cast($INPUT_ARRAY, $OUTPUT_TYPE).unwrap(); + assert_eq!($OUTPUT_TYPE, result.data_type()); + assert_eq!(result.as_ref(), &output); let cast_option = CastOptions { safe: false, format_options: FormatOptions::default(), }; - let casted_array_with_option = + let result = cast_with_options($INPUT_ARRAY, $OUTPUT_TYPE, &cast_option).unwrap(); - let result_array = casted_array_with_option - .as_any() - .downcast_ref::<$OUTPUT_TYPE_ARRAY>() - .unwrap(); - assert_eq!($OUTPUT_TYPE, result_array.data_type()); - assert_eq!(result_array.len(), $OUTPUT_VALUES.len()); - for (i, x) in $OUTPUT_VALUES.iter().enumerate() { - match x { - Some(x) => { - assert_eq!(result_array.value(i), *x); - } - None => { - assert!(result_array.is_null(i)); - } - } - } + assert_eq!($OUTPUT_TYPE, result.data_type()); + assert_eq!(result.as_ref(), &output); }; } @@ -5997,7 +5971,7 @@ mod tests { #[test] fn test_str_to_str_casts() { - for data in vec![ + for data in [ vec![Some("foo"), Some("bar"), Some("ham")], vec![Some("foo"), None, Some("bar")], ] { @@ -8934,6 +8908,7 @@ mod tests { }; #[test] + #[allow(clippy::assertions_on_constants)] fn test_const_options() { assert!(CAST_OPTIONS.safe) } diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 50bfca0f84bd..8483c44f9782 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -1003,15 +1003,11 @@ impl Interval { fn parse(value: &str, config: &IntervalParseConfig) -> Result { let components = parse_interval_components(value, config)?; - let result = components.into_iter().fold( - Ok(Self::default()), - |result, (amount, unit)| match result { - Ok(result) => result.add(amount, unit), - Err(e) => Err(e), - }, - )?; - - Ok(result) + components + .into_iter() + .try_fold(Self::default(), |result, (amount, unit)| { + result.add(amount, unit) + }) } /// Interval addition following Postgres behavior. Fractional units will be spilled into smaller units. 
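Note on the `Interval::parse` hunk above: the clippy fix replaces a manual fold over `Result` with `Iterator::try_fold`, which short-circuits on the first error. Below is a minimal standalone sketch of the same pattern; the `Acc` type and `sum_components` helper are hypothetical illustrations, not part of this patch.

#[derive(Debug, Default)]
struct Acc {
    total: i64,
}

impl Acc {
    // Consumes self and fails on negative input, mirroring how a fallible
    // accumulator (such as Interval::add in the hunk above) can reject a component.
    fn add(self, amount: i64) -> Result<Self, String> {
        if amount < 0 {
            return Err(format!("negative amount: {amount}"));
        }
        Ok(Acc { total: self.total + amount })
    }
}

fn sum_components(components: Vec<i64>) -> Result<Acc, String> {
    // try_fold threads the accumulator through each fallible step and returns
    // the first Err it encounters, so no match-on-Result boilerplate is needed
    // inside the closure.
    components
        .into_iter()
        .try_fold(Acc::default(), |acc, amount| acc.add(amount))
}

fn main() {
    assert_eq!(sum_components(vec![1, 2, 3]).unwrap().total, 6);
    assert!(sum_components(vec![1, -2]).is_err());
}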
diff --git a/arrow-cast/src/pretty.rs b/arrow-cast/src/pretty.rs index 13d1df6a118d..59a9f9d605e2 100644 --- a/arrow-cast/src/pretty.rs +++ b/arrow-cast/src/pretty.rs @@ -848,7 +848,7 @@ mod tests { let mut buf = String::new(); write!(&mut buf, "{}", pretty_format_batches(&[batch]).unwrap()).unwrap(); - let s = vec![ + let s = [ "+---+-----+", "| a | b |", "+---+-----+", diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 8baf5ed7232a..145626b6608f 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -166,8 +166,8 @@ pub fn batches_to_flight_data( flight_data.push(encoded_batch.into()); } let mut stream = vec![schema_flight_data]; - stream.extend(dictionaries.into_iter()); - stream.extend(flight_data.into_iter()); + stream.extend(dictionaries); + stream.extend(flight_data); let flight_data: Vec<_> = stream.into_iter().collect(); Ok(flight_data) } diff --git a/arrow-integration-testing/tests/ipc_reader.rs b/arrow-integration-testing/tests/ipc_reader.rs index 9205f4318393..696ab6e6053a 100644 --- a/arrow-integration-testing/tests/ipc_reader.rs +++ b/arrow-integration-testing/tests/ipc_reader.rs @@ -27,7 +27,7 @@ use std::fs::File; fn read_0_1_4() { let testdata = arrow_test_data(); let version = "0.14.1"; - let paths = vec![ + let paths = [ "generated_interval", "generated_datetime", "generated_dictionary", @@ -48,7 +48,7 @@ fn read_0_1_4() { fn read_0_1_7() { let testdata = arrow_test_data(); let version = "0.17.1"; - let paths = vec!["generated_union"]; + let paths = ["generated_union"]; paths.iter().for_each(|path| { verify_arrow_file(&testdata, version, path); verify_arrow_stream(&testdata, version, path); @@ -76,7 +76,7 @@ fn read_1_0_0_bigendian_dictionary_should_panic() { #[test] fn read_1_0_0_bigendian() { let testdata = arrow_test_data(); - let paths = vec![ + let paths = [ "generated_interval", "generated_datetime", "generated_map", @@ -145,7 +145,7 @@ fn read_2_0_0_compression() { let version = "2.0.0-compression"; // the test is repetitive, thus we can read all supported files at once - let paths = vec!["generated_lz4", "generated_zstd"]; + let paths = ["generated_lz4", "generated_zstd"]; paths.iter().for_each(|path| { verify_arrow_file(&testdata, version, path); verify_arrow_stream(&testdata, version, path); diff --git a/arrow-integration-testing/tests/ipc_writer.rs b/arrow-integration-testing/tests/ipc_writer.rs index 40f356b1d442..11707d935540 100644 --- a/arrow-integration-testing/tests/ipc_writer.rs +++ b/arrow-integration-testing/tests/ipc_writer.rs @@ -27,7 +27,7 @@ use std::io::Seek; fn write_0_1_4() { let testdata = arrow_test_data(); let version = "0.14.1"; - let paths = vec![ + let paths = [ "generated_interval", "generated_datetime", "generated_dictionary", @@ -48,7 +48,7 @@ fn write_0_1_4() { fn write_0_1_7() { let testdata = arrow_test_data(); let version = "0.17.1"; - let paths = vec!["generated_union"]; + let paths = ["generated_union"]; paths.iter().for_each(|path| { roundtrip_arrow_file(&testdata, version, path); roundtrip_arrow_stream(&testdata, version, path); @@ -59,7 +59,7 @@ fn write_0_1_7() { fn write_1_0_0_littleendian() { let testdata = arrow_test_data(); let version = "1.0.0-littleendian"; - let paths = vec![ + let paths = [ "generated_datetime", "generated_custom_metadata", "generated_decimal", @@ -94,10 +94,10 @@ fn write_1_0_0_littleendian() { fn write_2_0_0_compression() { let testdata = arrow_test_data(); let version = "2.0.0-compression"; - let paths = vec!["generated_lz4", "generated_zstd"]; + let paths = 
["generated_lz4", "generated_zstd"]; // writer options for each compression type - let all_options = vec![ + let all_options = [ IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5) .unwrap() .try_with_compression(Some(ipc::CompressionType::LZ4_FRAME)) @@ -187,11 +187,12 @@ fn roundtrip_arrow_file_with_options( let rewrite_reader = FileReader::try_new(&tempfile, None).unwrap(); // Compare to original reader - reader.into_iter().zip(rewrite_reader.into_iter()).for_each( - |(batch1, batch2)| { + reader + .into_iter() + .zip(rewrite_reader) + .for_each(|(batch1, batch2)| { assert_eq!(batch1.unwrap(), batch2.unwrap()); - }, - ); + }); } } @@ -264,10 +265,11 @@ fn roundtrip_arrow_stream_with_options( let rewrite_reader = StreamReader::try_new(&tempfile, None).unwrap(); // Compare to original reader - reader.into_iter().zip(rewrite_reader.into_iter()).for_each( - |(batch1, batch2)| { + reader + .into_iter() + .zip(rewrite_reader) + .for_each(|(batch1, batch2)| { assert_eq!(batch1.unwrap(), batch2.unwrap()); - }, - ); + }); } } diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 3569562af228..a78ccde6e169 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -717,7 +717,7 @@ pub(crate) fn get_fb_field_type<'a>( RunEndEncoded(run_ends, values) => { let run_ends_field = build_field(fbb, run_ends); let values_field = build_field(fbb, values); - let children = vec![run_ends_field, values_field]; + let children = [run_ends_field, values_field]; FBFieldType { type_type: crate::Type::RunEndEncoded, type_: crate::RunEndEncodedBuilder::new(fbb) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 962b17c39d7d..96cb4393ba58 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -1156,7 +1156,7 @@ mod tests { let array10_input = vec![Some(1_i32), None, None]; let mut array10_builder = PrimitiveRunBuilder::::new(); - array10_builder.extend(array10_input.into_iter()); + array10_builder.extend(array10_input); let array10 = array10_builder.finish(); let array11 = BooleanArray::from(vec![false, false, true]); @@ -1411,7 +1411,7 @@ mod tests { let run_array_2_inupt = vec![Some(1_i32), None, None, Some(2), Some(2)]; let mut run_array_2_builder = PrimitiveRunBuilder::::new(); - run_array_2_builder.extend(run_array_2_inupt.into_iter()); + run_array_2_builder.extend(run_array_2_inupt); let run_array_2 = run_array_2_builder.finish(); let schema = Arc::new(Schema::new(vec![ diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 9c418d76e485..0e01e51231d6 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -2138,7 +2138,7 @@ mod tests { let u32 = UInt32Builder::new(); let mut ls = ListBuilder::new(u32); - for list in vec![vec![1u32, 2, 3], vec![4, 5, 6], vec![7, 8, 9, 10]] { + for list in [vec![1u32, 2, 3], vec![4, 5, 6], vec![7, 8, 9, 10]] { for value in list { ls.values().append_value(value); } diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 571e95a1a4ec..a918f44b54ff 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -320,11 +320,9 @@ fn set_column_for_json_rows( } DataType::Struct(_) => { let inner_objs = struct_array_to_jsonmap_array(array.as_struct())?; - rows.iter_mut() - .zip(inner_objs.into_iter()) - .for_each(|(row, obj)| { - row.insert(col_name.to_string(), Value::Object(obj)); - }); + rows.iter_mut().zip(inner_objs).for_each(|(row, obj)| { + row.insert(col_name.to_string(), Value::Object(obj)); + }); } DataType::List(_) => { let listarr = 
as_list_array(array); @@ -374,7 +372,7 @@ fn set_column_for_json_rows( let keys = keys.as_string::(); let values = array_to_json_array(values)?; - let mut kv = keys.iter().zip(values.into_iter()); + let mut kv = keys.iter().zip(values); for (i, row) in rows.iter_mut().enumerate() { if maparr.is_null(i) { diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 1a6e564283d7..4e475d8fd572 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -2187,25 +2187,17 @@ mod tests { ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { #[test] fn $test_name() { + let expected = BooleanArray::from($expected); + let left = BinaryArray::from_vec($left); let right = BinaryArray::from_vec($right); let res = $op(&left, &right).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!(v, expected[i]); - } + assert_eq!(res, expected); let left = LargeBinaryArray::from_vec($left); let right = LargeBinaryArray::from_vec($right); let res = $op(&left, &right).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!(v, expected[i]); - } + assert_eq!(res, expected); } }; } @@ -2228,37 +2220,15 @@ mod tests { ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { #[test] fn $test_name() { + let expected = BooleanArray::from($expected); + let left = BinaryArray::from_vec($left); let res = $op(&left, $right).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!( - v, - expected[i], - "unexpected result when comparing {:?} at position {} to {:?} ", - left.value(i), - i, - $right - ); - } + assert_eq!(res, expected); let left = LargeBinaryArray::from_vec($left); let res = $op(&left, $right).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!( - v, - expected[i], - "unexpected result when comparing {:?} at position {} to {:?} ", - left.value(i), - i, - $right - ); - } + assert_eq!(res, expected); } }; } @@ -2492,14 +2462,14 @@ mod tests { vec!["arrow", "arrow", "arrow", "arrow"], vec!["arrow", "parquet", "datafusion", "flight"], eq_utf8, - vec![true, false, false, false] + [true, false, false, false] ); test_utf8_scalar!( test_utf8_array_eq_scalar, vec!["arrow", "parquet", "datafusion", "flight"], "arrow", eq_utf8_scalar, - vec![true, false, false, false] + [true, false, false, false] ); test_utf8!( @@ -2507,14 +2477,14 @@ mod tests { vec!["arrow", "arrow", "arrow", "arrow"], vec!["arrow", "parquet", "datafusion", "flight"], neq_utf8, - vec![false, true, true, true] + [false, true, true, true] ); test_utf8_scalar!( test_utf8_array_neq_scalar, vec!["arrow", "parquet", "datafusion", "flight"], "arrow", neq_utf8_scalar, - vec![false, true, true, true] + [false, true, true, true] ); test_utf8!( @@ -2522,14 +2492,14 @@ mod tests { vec!["arrow", "datafusion", "flight", "parquet"], vec!["flight", "flight", "flight", "flight"], lt_utf8, - vec![true, true, false, false] + [true, true, false, false] ); test_utf8_scalar!( test_utf8_array_lt_scalar, vec!["arrow", "datafusion", "flight", "parquet"], "flight", lt_utf8_scalar, - vec![true, true, false, false] + [true, true, false, false] ); test_utf8!( @@ -2537,14 +2507,14 @@ mod tests { vec!["arrow", "datafusion", "flight", "parquet"], vec!["flight", "flight", 
"flight", "flight"], lt_eq_utf8, - vec![true, true, true, false] + [true, true, true, false] ); test_utf8_scalar!( test_utf8_array_lt_eq_scalar, vec!["arrow", "datafusion", "flight", "parquet"], "flight", lt_eq_utf8_scalar, - vec![true, true, true, false] + [true, true, true, false] ); test_utf8!( @@ -2552,14 +2522,14 @@ mod tests { vec!["arrow", "datafusion", "flight", "parquet"], vec!["flight", "flight", "flight", "flight"], gt_utf8, - vec![false, false, false, true] + [false, false, false, true] ); test_utf8_scalar!( test_utf8_array_gt_scalar, vec!["arrow", "datafusion", "flight", "parquet"], "flight", gt_utf8_scalar, - vec![false, false, false, true] + [false, false, false, true] ); test_utf8!( @@ -2567,14 +2537,14 @@ mod tests { vec!["arrow", "datafusion", "flight", "parquet"], vec!["flight", "flight", "flight", "flight"], gt_eq_utf8, - vec![false, false, true, true] + [false, false, true, true] ); test_utf8_scalar!( test_utf8_array_gt_eq_scalar, vec!["arrow", "datafusion", "flight", "parquet"], "flight", gt_eq_utf8_scalar, - vec![false, false, true, true] + [false, false, true, true] ); #[test] @@ -3365,8 +3335,8 @@ mod tests { #[test] fn test_eq_dyn_neq_dyn_dictionary_utf8_array() { - let test1 = vec!["a", "a", "b", "c"]; - let test2 = vec!["a", "b", "b", "c"]; + let test1 = ["a", "a", "b", "c"]; + let test2 = ["a", "b", "b", "c"]; let dict_array1: DictionaryArray = test1 .iter() @@ -3535,7 +3505,7 @@ mod tests { #[test] fn test_unary_cmp() { let a = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); - let values = vec![1_i32, 3]; + let values = [1_i32, 3]; let a_eq = unary_cmp(&a, |a| values.contains(&a)).unwrap(); assert_eq!( @@ -3638,14 +3608,8 @@ mod tests { #[test] fn test_eq_dyn_neq_dyn_float_nan() { - let array1: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)] - .into_iter() - .map(Some) - .collect(); - let array2: Float16Array = vec![f16::NAN, f16::NAN, f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)] - .into_iter() - .map(Some) - .collect(); + let array1 = Float16Array::from(vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)]); + let array2 = Float16Array::from(vec![f16::NAN, f16::NAN, f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)]); let expected = BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(true), Some(true)], ); @@ -3660,14 +3624,8 @@ mod tests { assert_eq!(neq(&array1, &array2).unwrap(), expected); - let array1: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 10.0] - .into_iter() - .map(Some) - .collect(); - let array2: Float32Array = vec![f32::NAN, f32::NAN, 8.0, 8.0, 10.0] - .into_iter() - .map(Some) - .collect(); + let array1 = Float32Array::from(vec![f32::NAN, 7.0, 8.0, 8.0, 10.0]); + let array2 = Float32Array::from(vec![f32::NAN, f32::NAN, 8.0, 8.0, 10.0]); let expected = BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(true), Some(true)], ); @@ -3682,14 +3640,8 @@ mod tests { assert_eq!(neq(&array1, &array2).unwrap(), expected); - let array1: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 10.0] - .into_iter() - .map(Some) - .collect(); - let array2: Float64Array = vec![f64::NAN, f64::NAN, 8.0, 8.0, 10.0] - .into_iter() - .map(Some) - .collect(); + let array1 = Float64Array::from(vec![f64::NAN, 7.0, 8.0, 8.0, 10.0]); + let array2 = Float64Array::from(vec![f64::NAN, f64::NAN, 8.0, 8.0, 10.0]); let expected = BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(true), Some(true)], 
@@ -3708,14 +3660,8 @@ mod tests { #[test] fn test_lt_dyn_lt_eq_dyn_float_nan() { - let array1: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(11.0), f16::NAN] - .into_iter() - .map(Some) - .collect(); - let array2: Float16Array = vec![f16::NAN, f16::NAN, f16::from_f32(8.0), f16::from_f32(9.0), f16::from_f32(10.0), f16::from_f32(1.0)] - .into_iter() - .map(Some) - .collect(); + let array1 = Float16Array::from(vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(11.0), f16::NAN]); + let array2 = Float16Array::from(vec![f16::NAN, f16::NAN, f16::from_f32(8.0), f16::from_f32(9.0), f16::from_f32(10.0), f16::from_f32(1.0)]); let expected = BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], @@ -3731,14 +3677,8 @@ mod tests { assert_eq!(lt_eq(&array1, &array2).unwrap(), expected); - let array1: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 11.0, f32::NAN] - .into_iter() - .map(Some) - .collect(); - let array2: Float32Array = vec![f32::NAN, f32::NAN, 8.0, 9.0, 10.0, 1.0] - .into_iter() - .map(Some) - .collect(); + let array1 = Float32Array::from(vec![f32::NAN, 7.0, 8.0, 8.0, 11.0, f32::NAN]); + let array2 = Float32Array::from(vec![f32::NAN, f32::NAN, 8.0, 9.0, 10.0, 1.0]); let expected = BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(true), Some(false), Some(false)], @@ -3780,14 +3720,8 @@ mod tests { #[test] fn test_gt_dyn_gt_eq_dyn_float_nan() { - let array1: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(11.0), f16::NAN] - .into_iter() - .map(Some) - .collect(); - let array2: Float16Array = vec![f16::NAN, f16::NAN, f16::from_f32(8.0), f16::from_f32(9.0), f16::from_f32(10.0), f16::from_f32(1.0)] - .into_iter() - .map(Some) - .collect(); + let array1 = Float16Array::from(vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(11.0), f16::NAN]); + let array2 = Float16Array::from(vec![f16::NAN, f16::NAN, f16::from_f32(8.0), f16::from_f32(9.0), f16::from_f32(10.0), f16::from_f32(1.0)]); let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], @@ -3803,14 +3737,8 @@ mod tests { assert_eq!(gt_eq(&array1, &array2).unwrap(), expected); - let array1: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 11.0, f32::NAN] - .into_iter() - .map(Some) - .collect(); - let array2: Float32Array = vec![f32::NAN, f32::NAN, 8.0, 9.0, 10.0, 1.0] - .into_iter() - .map(Some) - .collect(); + let array1 = Float32Array::from(vec![f32::NAN, 7.0, 8.0, 8.0, 11.0, f32::NAN]); + let array2 = Float32Array::from(vec![f32::NAN, f32::NAN, 8.0, 9.0, 10.0, 1.0]); let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], @@ -3826,14 +3754,8 @@ mod tests { assert_eq!(gt_eq(&array1, &array2).unwrap(), expected); - let array1: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 11.0, f64::NAN] - .into_iter() - .map(Some) - .collect(); - let array2: Float64Array = vec![f64::NAN, f64::NAN, 8.0, 9.0, 10.0, 1.0] - .into_iter() - .map(Some) - .collect(); + let array1 = Float64Array::from(vec![f64::NAN, 7.0, 8.0, 8.0, 11.0, f64::NAN]); + let array2 = Float64Array::from(vec![f64::NAN, f64::NAN, 8.0, 9.0, 10.0, 1.0]); let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(true), Some(true)], @@ -3852,10 +3774,7 @@ mod tests { #[test] fn 
test_eq_dyn_scalar_neq_dyn_scalar_float_nan() { - let array: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)] - .into_iter() - .map(Some) - .collect(); + let array = Float16Array::from(vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)]); let expected = BooleanArray::from( vec![Some(true), Some(false), Some(false), Some(false), Some(false)], @@ -3867,11 +3786,7 @@ mod tests { ); assert_eq!(neq_dyn_scalar(&array, f32::NAN).unwrap(), expected); - let array: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 10.0] - .into_iter() - .map(Some) - .collect(); - + let array = Float32Array::from(vec![f32::NAN, 7.0, 8.0, 8.0, 10.0]); let expected = BooleanArray::from( vec![Some(true), Some(false), Some(false), Some(false), Some(false)], ); @@ -3882,11 +3797,7 @@ mod tests { ); assert_eq!(neq_dyn_scalar(&array, f32::NAN).unwrap(), expected); - let array: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 10.0] - .into_iter() - .map(Some) - .collect(); - + let array = Float64Array::from(vec![f64::NAN, 7.0, 8.0, 8.0, 10.0]); let expected = BooleanArray::from( vec![Some(true), Some(false), Some(false), Some(false), Some(false)], ); @@ -3900,10 +3811,7 @@ mod tests { #[test] fn test_lt_dyn_scalar_lt_eq_dyn_scalar_float_nan() { - let array: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)] - .into_iter() - .map(Some) - .collect(); + let array = Float16Array::from(vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)]); let expected = BooleanArray::from( vec![Some(false), Some(true), Some(true), Some(true), Some(true)], @@ -3915,10 +3823,7 @@ mod tests { ); assert_eq!(lt_eq_dyn_scalar(&array, f16::NAN).unwrap(), expected); - let array: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 10.0] - .into_iter() - .map(Some) - .collect(); + let array = Float32Array::from(vec![f32::NAN, 7.0, 8.0, 8.0, 10.0]); let expected = BooleanArray::from( vec![Some(false), Some(true), Some(true), Some(true), Some(true)], @@ -3930,11 +3835,7 @@ mod tests { ); assert_eq!(lt_eq_dyn_scalar(&array, f32::NAN).unwrap(), expected); - let array: Float64Array = vec![f64::NAN, 7.0, 8.0, 8.0, 10.0] - .into_iter() - .map(Some) - .collect(); - + let array = Float64Array::from(vec![f64::NAN, 7.0, 8.0, 8.0, 10.0]); let expected = BooleanArray::from( vec![Some(false), Some(true), Some(true), Some(true), Some(true)], ); @@ -3948,10 +3849,13 @@ mod tests { #[test] fn test_gt_dyn_scalar_gt_eq_dyn_scalar_float_nan() { - let array: Float16Array = vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)] - .into_iter() - .map(Some) - .collect(); + let array = Float16Array::from(vec![ + f16::NAN, + f16::from_f32(7.0), + f16::from_f32(8.0), + f16::from_f32(8.0), + f16::from_f32(10.0), + ]); let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(false)], ); @@ -3962,10 +3866,7 @@ mod tests { ); assert_eq!(gt_eq_dyn_scalar(&array, f16::NAN).unwrap(), expected); - let array: Float32Array = vec![f32::NAN, 7.0, 8.0, 8.0, 10.0] - .into_iter() - .map(Some) - .collect(); + let array = Float32Array::from(vec![f32::NAN, 7.0, 8.0, 8.0, 10.0]); let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(false)], ); @@ -3976,10 +3877,7 @@ mod tests { ); assert_eq!(gt_eq_dyn_scalar(&array, f32::NAN).unwrap(), expected); - let array: Float64Array = vec![f64::NAN, 
7.0, 8.0, 8.0, 10.0] - .into_iter() - .map(Some) - .collect(); + let array = Float64Array::from(vec![f64::NAN, 7.0, 8.0, 8.0, 10.0]); let expected = BooleanArray::from( vec![Some(false), Some(false), Some(false), Some(false), Some(false)], ); @@ -3993,8 +3891,8 @@ mod tests { #[test] fn test_eq_dyn_neq_dyn_dictionary_to_utf8_array() { - let test1 = vec!["a", "a", "b", "c"]; - let test2 = vec!["a", "b", "b", "d"]; + let test1 = ["a", "a", "b", "c"]; + let test2 = ["a", "b", "b", "d"]; let dict_array: DictionaryArray = test1 .iter() @@ -4033,8 +3931,8 @@ mod tests { #[test] fn test_lt_dyn_lt_eq_dyn_gt_dyn_gt_eq_dyn_dictionary_to_utf8_array() { - let test1 = vec!["abc", "abc", "b", "cde"]; - let test2 = vec!["abc", "b", "b", "def"]; + let test1 = ["abc", "abc", "b", "cde"]; + let test2 = ["abc", "b", "b", "def"]; let dict_array: DictionaryArray = test1 .iter() @@ -4380,7 +4278,7 @@ mod tests { let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2]); let dict_array = DictionaryArray::new(keys, Arc::new(values)); - let array: BooleanArray = test2.iter().collect(); + let array = BooleanArray::from(test2); let result = eq_dyn(&dict_array, &array); assert_eq!( @@ -4416,7 +4314,7 @@ mod tests { let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2]); let dict_array = DictionaryArray::new(keys, Arc::new(values)); - let array: BooleanArray = test2.iter().collect(); + let array = BooleanArray::from(test2); let result = lt_dyn(&dict_array, &array); assert_eq!( diff --git a/arrow-ord/src/partition.rs b/arrow-ord/src/partition.rs index 0b8447989b8d..80b25ee2afba 100644 --- a/arrow-ord/src/partition.rs +++ b/arrow-ord/src/partition.rs @@ -220,8 +220,10 @@ mod tests { Arc::new(Int32Array::from(vec![1])) as _, Arc::new(Int32Array::from(vec![1])) as _, ]) - .unwrap(); - assert_eq!(results.ranges(), &[0..1]); + .unwrap() + .ranges(); + assert_eq!(results.len(), 1); + assert_eq!(results[0], 0..1); } #[test] diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 87858630599f..6c8c3b8facef 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -1661,22 +1661,26 @@ mod tests { #[test] fn test_sort_indices_decimal256() { + let data = vec![ + None, + Some(i256::from_i128(5)), + Some(i256::from_i128(2)), + Some(i256::from_i128(3)), + Some(i256::from_i128(1)), + Some(i256::from_i128(4)), + None, + ]; + // decimal default test_sort_to_indices_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data.clone(), None, None, vec![0, 6, 4, 2, 3, 5, 1], ); // decimal descending test_sort_to_indices_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data.clone(), Some(SortOptions { descending: true, nulls_first: false, @@ -1686,10 +1690,7 @@ mod tests { ); // decimal null_first and descending test_sort_to_indices_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data.clone(), Some(SortOptions { descending: true, nulls_first: true, @@ -1699,10 +1700,7 @@ mod tests { ); // decimal null_first test_sort_to_indices_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data.clone(), Some(SortOptions { descending: false, nulls_first: true, @@ -1711,21 +1709,10 @@ mod tests { vec![0, 6, 4, 2, 3, 5, 1], ); // limit - test_sort_to_indices_decimal256_array( - 
vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), - None, - Some(3), - vec![0, 6, 4], - ); + test_sort_to_indices_decimal256_array(data.clone(), None, Some(3), vec![0, 6, 4]); // limit descending test_sort_to_indices_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data.clone(), Some(SortOptions { descending: true, nulls_first: false, @@ -1735,10 +1722,7 @@ mod tests { ); // limit descending null_first test_sort_to_indices_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data.clone(), Some(SortOptions { descending: true, nulls_first: true, @@ -1748,10 +1732,7 @@ mod tests { ); // limit null_first test_sort_to_indices_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data, Some(SortOptions { descending: false, nulls_first: true, @@ -1763,14 +1744,15 @@ mod tests { #[test] fn test_sort_indices_decimal256_max_min() { + let data = vec![ + None, + Some(i256::MIN), + Some(i256::from_i128(1)), + Some(i256::MAX), + Some(i256::from_i128(-1)), + ]; test_sort_to_indices_decimal256_array( - vec![ - None, - Some(i256::MIN), - Some(i256::from_i128(1)), - Some(i256::MAX), - Some(i256::from_i128(-1)), - ], + data.clone(), Some(SortOptions { descending: false, nulls_first: true, @@ -1780,13 +1762,7 @@ mod tests { ); test_sort_to_indices_decimal256_array( - vec![ - None, - Some(i256::MIN), - Some(i256::from_i128(1)), - Some(i256::MAX), - Some(i256::from_i128(-1)), - ], + data.clone(), Some(SortOptions { descending: true, nulls_first: true, @@ -1796,13 +1772,7 @@ mod tests { ); test_sort_to_indices_decimal256_array( - vec![ - None, - Some(i256::MIN), - Some(i256::from_i128(1)), - Some(i256::MAX), - Some(i256::from_i128(-1)), - ], + data.clone(), Some(SortOptions { descending: false, nulls_first: true, @@ -1812,13 +1782,7 @@ mod tests { ); test_sort_to_indices_decimal256_array( - vec![ - None, - Some(i256::MIN), - Some(i256::from_i128(1)), - Some(i256::MAX), - Some(i256::from_i128(-1)), - ], + data.clone(), Some(SortOptions { descending: true, nulls_first: true, @@ -1908,124 +1872,109 @@ mod tests { #[test] fn test_sort_decimal256() { + let data = vec![ + None, + Some(i256::from_i128(5)), + Some(i256::from_i128(2)), + Some(i256::from_i128(3)), + Some(i256::from_i128(1)), + Some(i256::from_i128(4)), + None, + ]; // decimal default test_sort_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data.clone(), None, None, - vec![None, None, Some(1), Some(2), Some(3), Some(4), Some(5)] + [None, None, Some(1), Some(2), Some(3), Some(4), Some(5)] .iter() .map(|v| v.map(i256::from_i128)) .collect(), ); // decimal descending test_sort_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data.clone(), Some(SortOptions { descending: true, nulls_first: false, }), None, - vec![Some(5), Some(4), Some(3), Some(2), Some(1), None, None] + [Some(5), Some(4), Some(3), Some(2), Some(1), None, None] .iter() .map(|v| v.map(i256::from_i128)) .collect(), ); // decimal null_first and descending test_sort_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| 
v.map(i256::from_i128)) - .collect(), + data.clone(), Some(SortOptions { descending: true, nulls_first: true, }), None, - vec![None, None, Some(5), Some(4), Some(3), Some(2), Some(1)] + [None, None, Some(5), Some(4), Some(3), Some(2), Some(1)] .iter() .map(|v| v.map(i256::from_i128)) .collect(), ); // decimal null_first test_sort_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data.clone(), Some(SortOptions { descending: false, nulls_first: true, }), None, - vec![None, None, Some(1), Some(2), Some(3), Some(4), Some(5)] + [None, None, Some(1), Some(2), Some(3), Some(4), Some(5)] .iter() .map(|v| v.map(i256::from_i128)) .collect(), ); // limit test_sort_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data.clone(), None, Some(3), - vec![None, None, Some(1)] + [None, None, Some(1)] .iter() .map(|v| v.map(i256::from_i128)) .collect(), ); // limit descending test_sort_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data.clone(), Some(SortOptions { descending: true, nulls_first: false, }), Some(3), - vec![Some(5), Some(4), Some(3)] + [Some(5), Some(4), Some(3)] .iter() .map(|v| v.map(i256::from_i128)) .collect(), ); // limit descending null_first test_sort_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data.clone(), Some(SortOptions { descending: true, nulls_first: true, }), Some(3), - vec![None, None, Some(5)] + [None, None, Some(5)] .iter() .map(|v| v.map(i256::from_i128)) .collect(), ); // limit null_first test_sort_decimal256_array( - vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None] - .iter() - .map(|v| v.map(i256::from_i128)) - .collect(), + data, Some(SortOptions { descending: false, nulls_first: true, }), Some(3), - vec![None, None, Some(1)] + [None, None, Some(1)] .iter() .map(|v| v.map(i256::from_i128)) .collect(), diff --git a/arrow-row/src/interner.rs b/arrow-row/src/interner.rs index fde9251952c0..9f5f0b3d33d2 100644 --- a/arrow-row/src/interner.rs +++ b/arrow-row/src/interner.rs @@ -417,7 +417,7 @@ mod tests { #[test] fn test_intern_duplicates() { // Unsorted with duplicates - let values = vec![0_u8, 1, 8, 4, 1, 0]; + let values = [0_u8, 1, 8, 4, 1, 0]; let mut interner = OrderPreservingInterner::default(); let interned = interner.intern(values.iter().map(std::slice::from_ref).map(Some)); diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 31846ee1fdc3..eed20699c239 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -338,10 +338,7 @@ mod tests { let array_result = concat(&[&list1_array, &list2_array, &list3_array]).unwrap(); - let expected = list1 - .into_iter() - .chain(list2.into_iter()) - .chain(list3.into_iter()); + let expected = list1.into_iter().chain(list2).chain(list3); let array_expected = ListArray::from_iter_primitive::(expected); assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index f2da79e243c8..1afb8197bab6 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -868,7 +868,7 @@ mod tests { #[test] fn test_filter_dictionary_array() { - let values = vec![Some("hello"), None, Some("world"), Some("!")]; + let values = [Some("hello"), 
None, Some("world"), Some("!")]; let a: Int8DictionaryArray = values.iter().copied().collect(); let b = BooleanArray::from(vec![false, true, true, false]); let c = filter(&a, &b).unwrap(); diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index 25d6414ec8e6..fb47f70af342 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -16,6 +16,7 @@ // under the License. //! Defines kernel for length of string arrays and binary arrays +#![allow(clippy::redundant_closure_call)] use arrow_array::*; use arrow_array::{cast::AsArray, types::*}; diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 9d3abea66fb1..57cc22f2c549 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -736,15 +736,11 @@ mod tests { ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { #[test] fn $test_name() { + let expected = BooleanArray::from($expected); let left = StringArray::from($left); let right = StringArray::from($right); let res = $op(&left, &right).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!(v, expected[i]); - } + assert_eq!(res, expected); } }; } @@ -754,15 +750,11 @@ mod tests { #[test] #[cfg(feature = "dyn_cmp_dict")] fn $test_name() { + let expected = BooleanArray::from($expected); let left: DictionaryArray = $left.into_iter().collect(); let right: DictionaryArray = $right.into_iter().collect(); let res = $op(&left, &right).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!(v, expected[i]); - } + assert_eq!(res, expected); } }; } @@ -771,37 +763,15 @@ mod tests { ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { #[test] fn $test_name() { + let expected = BooleanArray::from($expected); + let left = StringArray::from($left); let res = $op(&left, $right).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!( - v, - expected[i], - "unexpected result when comparing {} at position {} to {} ", - left.value(i), - i, - $right - ); - } + assert_eq!(res, expected); let left = LargeStringArray::from($left); let res = $op(&left, $right).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!( - v, - expected[i], - "unexpected result when comparing {} at position {} to {} ", - left.value(i), - i, - $right - ); - } + assert_eq!(res, expected); } }; ($test_name:ident, $test_name_dyn:ident, $left:expr, $right:expr, $op:expr, $op_dyn:expr, $expected:expr) => { @@ -953,7 +923,7 @@ mod tests { test_utf8!( test_utf8_scalar_ilike_regex, vec!["%%%"], - vec![r#"\%_\%"#], + vec![r"\%_\%"], ilike_utf8, vec![true] ); @@ -961,7 +931,7 @@ mod tests { test_dict_utf8!( test_utf8_scalar_ilike_regex_dict, vec!["%%%"], - vec![r#"\%_\%"#], + vec![r"\%_\%"], ilike_dyn, vec![true] ); diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index e28564bdae95..af4d66f97fd0 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -398,7 +398,7 @@ mod tests { vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"], vec!["^ar", "^AR", "ow$", "OW$", "foo", ""], regexp_is_match_utf8, - vec![true, false, true, false, false, true] + [true, false, true, false, false, true] ); test_flag_utf8!( test_utf8_array_regexp_is_match_insensitive, @@ -406,7 
+406,7 @@ mod tests { vec!["^ar", "^AR", "ow$", "OW$", "foo", ""], vec!["i"; 6], regexp_is_match_utf8, - vec![true, true, true, true, false, true] + [true, true, true, true, false, true] ); test_flag_utf8_scalar!( @@ -414,14 +414,14 @@ mod tests { vec!["arrow", "ARROW", "parquet", "PARQUET"], "^ar", regexp_is_match_utf8_scalar, - vec![true, false, false, false] + [true, false, false, false] ); test_flag_utf8_scalar!( test_utf8_array_regexp_is_match_empty_scalar, vec!["arrow", "ARROW", "parquet", "PARQUET"], "", regexp_is_match_utf8_scalar, - vec![true, true, true, true] + [true, true, true, true] ); test_flag_utf8_scalar!( test_utf8_array_regexp_is_match_insensitive_scalar, @@ -429,6 +429,6 @@ mod tests { "^ar", "i", regexp_is_match_utf8_scalar, - vec![true, true, false, false] + [true, true, false, false] ); } diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index 4abe31a36cf5..317287c102f2 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -399,7 +399,7 @@ fn test_empty_offsets_list_equal() { true, )))) .len(0) - .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) + .add_buffer(Buffer::from([0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) .add_child_data(Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]).into_data()) .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) .build() @@ -437,7 +437,7 @@ fn test_list_null() { true, )))) .len(6) - .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) + .add_buffer(Buffer::from([0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) .add_child_data(c_values.into_data()) .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) .build() @@ -460,7 +460,7 @@ fn test_list_null() { true, )))) .len(6) - .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) + .add_buffer(Buffer::from([0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) .add_child_data(d_values.into_data()) .null_bit_buffer(Some(Buffer::from(vec![0b00001001]))) .build() diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 1e8e3c1fd005..0e229885b006 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -393,7 +393,7 @@ impl AsyncWrite for InMemoryAppend { if let Some((bytes, _)) = writer.remove(&self.location) { let buf = std::mem::take(&mut self.data); - let concat = Bytes::from_iter(bytes.into_iter().chain(buf.into_iter())); + let concat = Bytes::from_iter(bytes.into_iter().chain(buf)); writer.insert(self.location.clone(), (concat, Utc::now())); } else { writer.insert( diff --git a/object_store/src/util.rs b/object_store/src/util.rs index 79ca4bb7a834..07d3ed44ca16 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -207,7 +207,7 @@ mod tests { let fetches = do_fetch(vec![], 0).await; assert!(fetches.is_empty()); - let fetches = do_fetch(vec![0..3], 0).await; + let fetches = do_fetch(vec![0..3; 1], 0).await; assert_eq!(fetches, vec![0..3]); let fetches = do_fetch(vec![0..2, 3..5], 0).await; diff --git a/parquet/src/arrow/array_reader/byte_array.rs b/parquet/src/arrow/array_reader/byte_array.rs index 43db658d9324..4612f816146a 100644 --- a/parquet/src/arrow/array_reader/byte_array.rs +++ b/parquet/src/arrow/array_reader/byte_array.rs @@ -636,7 +636,7 @@ mod tests { assert_eq!(decoder.read(&mut output, 4..8).unwrap(), 0); - let valid = vec![false, false, true, true, false, true, true, false, false]; + let valid = [false, false, true, true, false, true, true, false, false]; let valid_buffer = Buffer::from_iter(valid.iter().cloned()); output.pad_nulls(0, 4, 
valid.len(), valid_buffer.as_slice()); @@ -690,7 +690,7 @@ mod tests { assert_eq!(decoder.read(&mut output, 4..8).unwrap(), 0); - let valid = vec![false, false, true, true, false, false]; + let valid = [false, false, true, true, false, false]; let valid_buffer = Buffer::from_iter(valid.iter().cloned()); output.pad_nulls(0, 2, valid.len(), valid_buffer.as_slice()); diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs index 763a6ccee2c3..841f5a95fd4e 100644 --- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs +++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs @@ -510,7 +510,7 @@ mod tests { assert_eq!(decoder.read(&mut output, 4..5).unwrap(), 1); assert_eq!(decoder.skip_values(4).unwrap(), 0); - let valid = vec![true, true, true, true, true]; + let valid = [true, true, true, true, true]; let valid_buffer = Buffer::from_iter(valid.iter().cloned()); output.pad_nulls(0, 5, 5, valid_buffer.as_slice()); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index c4d174b6adc1..5417ebe894a3 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -859,7 +859,7 @@ mod tests { let expected_batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(a), Arc::new(b)]).unwrap(); - for buffer in vec![ + for buffer in [ get_bytes_after_close(schema.clone(), &expected_batch), get_bytes_by_into_inner(schema, &expected_batch), ] { @@ -2158,7 +2158,7 @@ mod tests { #[test] fn u32_min_max() { // check values roundtrip through parquet - let src = vec![ + let src = [ u32::MIN, u32::MIN + 1, (i32::MAX as u32) - 1, @@ -2199,7 +2199,7 @@ mod tests { #[test] fn u64_min_max() { // check values roundtrip through parquet - let src = vec![ + let src = [ u64::MIN, u64::MIN + 1, (i64::MAX as u64) - 1, diff --git a/parquet/src/arrow/buffer/bit_util.rs b/parquet/src/arrow/buffer/bit_util.rs index 2781190331c5..d01556d24e30 100644 --- a/parquet/src/arrow/buffer/bit_util.rs +++ b/parquet/src/arrow/buffer/bit_util.rs @@ -35,7 +35,7 @@ pub fn iter_set_bits_rev(bytes: &[u8]) -> impl Iterator + '_ { .prefix() .into_iter() .chain(unaligned.chunks().iter().cloned()) - .chain(unaligned.suffix().into_iter()); + .chain(unaligned.suffix()); iter.rev().flat_map(move |mut chunk| { let chunk_idx = chunk_end_idx - 64; diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs index c8732bc4ed13..07d78e8a3282 100644 --- a/parquet/src/arrow/buffer/offset_buffer.rs +++ b/parquet/src/arrow/buffer/offset_buffer.rs @@ -281,10 +281,10 @@ mod tests { buffer.try_push(v.as_bytes(), false).unwrap() } - let valid = vec![ + let valid = [ true, false, false, true, false, true, false, true, true, false, false, ]; - let valid_mask = Buffer::from_iter(valid.iter().cloned()); + let valid_mask = Buffer::from_iter(valid.iter().copied()); // Both trailing and leading nulls buffer.pad_nulls(1, values.len() - 1, valid.len() - 1, valid_mask.as_slice()); diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 3d8ce283ae64..8c1c55409988 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -2108,10 +2108,10 @@ mod tests { #[test] fn test_byte_array_statistics() { - let input = vec!["aawaa", "zz", "aaw", "m", "qrs"] + let input = ["aawaa", "zz", "aaw", "m", "qrs"] .iter() .map(|&s| s.into()) - .collect::>(); + .collect::>(); let stats = statistics_roundtrip::(&input); 
assert!(!stats.is_min_max_backwards_compatible()); @@ -2126,13 +2126,10 @@ mod tests { #[test] fn test_fixed_len_byte_array_statistics() { - let input = vec!["aawaa", "zz ", "aaw ", "m ", "qrs "] + let input = ["aawaa", "zz ", "aaw ", "m ", "qrs "] .iter() - .map(|&s| { - let b: ByteArray = s.into(); - b.into() - }) - .collect::>(); + .map(|&s| ByteArray::from(s).into()) + .collect::>(); let stats = statistics_roundtrip::(&input); assert!(stats.has_min_max_set()); diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 8058335875c9..7aed6df419ee 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -1128,9 +1128,9 @@ mod tests { #[test] fn test_plain_decode_int32() { - let data = vec![42, 18, 52]; + let data = [42, 18, 52]; let data_bytes = Int32Type::to_byte_array(&data[..]); - let mut buffer = vec![0; 3]; + let mut buffer = [0; 3]; test_plain_decode::( ByteBufferPtr::new(data_bytes), 3, @@ -1142,7 +1142,7 @@ mod tests { #[test] fn test_plain_skip_int32() { - let data = vec![42, 18, 52]; + let data = [42, 18, 52]; let data_bytes = Int32Type::to_byte_array(&data[..]); test_plain_skip::( ByteBufferPtr::new(data_bytes), @@ -1155,7 +1155,7 @@ mod tests { #[test] fn test_plain_skip_all_int32() { - let data = vec![42, 18, 52]; + let data = [42, 18, 52]; let data_bytes = Int32Type::to_byte_array(&data[..]); test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 5, -1, &[]); } @@ -1165,7 +1165,7 @@ mod tests { let data = [42, 18, 52]; let expected_data = [0, 42, 0, 18, 0, 0, 52, 0]; let data_bytes = Int32Type::to_byte_array(&data[..]); - let mut buffer = vec![0; 8]; + let mut buffer = [0; 8]; let num_nulls = 5; let valid_bits = [0b01001010]; test_plain_decode_spaced::( @@ -1181,9 +1181,9 @@ mod tests { #[test] fn test_plain_decode_int64() { - let data = vec![42, 18, 52]; + let data = [42, 18, 52]; let data_bytes = Int64Type::to_byte_array(&data[..]); - let mut buffer = vec![0; 3]; + let mut buffer = [0; 3]; test_plain_decode::( ByteBufferPtr::new(data_bytes), 3, @@ -1195,7 +1195,7 @@ mod tests { #[test] fn test_plain_skip_int64() { - let data = vec![42, 18, 52]; + let data = [42, 18, 52]; let data_bytes = Int64Type::to_byte_array(&data[..]); test_plain_skip::( ByteBufferPtr::new(data_bytes), @@ -1208,16 +1208,16 @@ mod tests { #[test] fn test_plain_skip_all_int64() { - let data = vec![42, 18, 52]; + let data = [42, 18, 52]; let data_bytes = Int64Type::to_byte_array(&data[..]); test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 3, -1, &[]); } #[test] fn test_plain_decode_float() { - let data = vec![PI_f32, 2.414, 12.51]; + let data = [PI_f32, 2.414, 12.51]; let data_bytes = FloatType::to_byte_array(&data[..]); - let mut buffer = vec![0.0; 3]; + let mut buffer = [0.0; 3]; test_plain_decode::( ByteBufferPtr::new(data_bytes), 3, @@ -1229,7 +1229,7 @@ mod tests { #[test] fn test_plain_skip_float() { - let data = vec![PI_f32, 2.414, 12.51]; + let data = [PI_f32, 2.414, 12.51]; let data_bytes = FloatType::to_byte_array(&data[..]); test_plain_skip::( ByteBufferPtr::new(data_bytes), @@ -1242,14 +1242,14 @@ mod tests { #[test] fn test_plain_skip_all_float() { - let data = vec![PI_f32, 2.414, 12.51]; + let data = [PI_f32, 2.414, 12.51]; let data_bytes = FloatType::to_byte_array(&data[..]); test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 4, -1, &[]); } #[test] fn test_plain_skip_double() { - let data = vec![PI_f64, 2.414f64, 12.51f64]; + let data = [PI_f64, 2.414f64, 12.51f64]; let data_bytes = DoubleType::to_byte_array(&data[..]); 
test_plain_skip::( ByteBufferPtr::new(data_bytes), @@ -1262,16 +1262,16 @@ mod tests { #[test] fn test_plain_skip_all_double() { - let data = vec![PI_f64, 2.414f64, 12.51f64]; + let data = [PI_f64, 2.414f64, 12.51f64]; let data_bytes = DoubleType::to_byte_array(&data[..]); test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 5, -1, &[]); } #[test] fn test_plain_decode_double() { - let data = vec![PI_f64, 2.414f64, 12.51f64]; + let data = [PI_f64, 2.414f64, 12.51f64]; let data_bytes = DoubleType::to_byte_array(&data[..]); - let mut buffer = vec![0.0f64; 3]; + let mut buffer = [0.0f64; 3]; test_plain_decode::( ByteBufferPtr::new(data_bytes), 3, @@ -1283,13 +1283,13 @@ mod tests { #[test] fn test_plain_decode_int96() { - let mut data = vec![Int96::new(); 4]; + let mut data = [Int96::new(); 4]; data[0].set_data(11, 22, 33); data[1].set_data(44, 55, 66); data[2].set_data(10, 20, 30); data[3].set_data(40, 50, 60); let data_bytes = Int96Type::to_byte_array(&data[..]); - let mut buffer = vec![Int96::new(); 4]; + let mut buffer = [Int96::new(); 4]; test_plain_decode::( ByteBufferPtr::new(data_bytes), 4, @@ -1301,7 +1301,7 @@ mod tests { #[test] fn test_plain_skip_int96() { - let mut data = vec![Int96::new(); 4]; + let mut data = [Int96::new(); 4]; data[0].set_data(11, 22, 33); data[1].set_data(44, 55, 66); data[2].set_data(10, 20, 30); @@ -1318,7 +1318,7 @@ mod tests { #[test] fn test_plain_skip_all_int96() { - let mut data = vec![Int96::new(); 4]; + let mut data = [Int96::new(); 4]; data[0].set_data(11, 22, 33); data[1].set_data(44, 55, 66); data[2].set_data(10, 20, 30); @@ -1329,11 +1329,11 @@ mod tests { #[test] fn test_plain_decode_bool() { - let data = vec![ + let data = [ false, true, false, false, true, false, true, true, false, true, ]; let data_bytes = BoolType::to_byte_array(&data[..]); - let mut buffer = vec![false; 10]; + let mut buffer = [false; 10]; test_plain_decode::( ByteBufferPtr::new(data_bytes), 10, @@ -1345,7 +1345,7 @@ mod tests { #[test] fn test_plain_skip_bool() { - let data = vec![ + let data = [ false, true, false, false, true, false, true, true, false, true, ]; let data_bytes = BoolType::to_byte_array(&data[..]); @@ -1360,7 +1360,7 @@ mod tests { #[test] fn test_plain_skip_all_bool() { - let data = vec![ + let data = [ false, true, false, false, true, false, true, true, false, true, ]; let data_bytes = BoolType::to_byte_array(&data[..]); diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 8eccf3408a55..4924dcc6f35a 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -851,38 +851,23 @@ mod tests { #[test] fn test_file_reader_into_iter() { let path = get_test_path("alltypes_plain.parquet"); - let vec = vec![path.clone(), path] - .iter() - .map(|p| SerializedFileReader::try_from(p.as_path()).unwrap()) - .flat_map(|r| r.into_iter()) - .flat_map(|r| r.unwrap().get_int(0)) - .collect::>(); - - // rows in the parquet file are not sorted by "id" - // each file contains [id:4, id:5, id:6, id:7, id:2, id:3, id:0, id:1] - assert_eq!(vec, vec![4, 5, 6, 7, 2, 3, 0, 1, 4, 5, 6, 7, 2, 3, 0, 1]); + let reader = SerializedFileReader::try_from(path.as_path()).unwrap(); + let iter = reader.into_iter(); + let values: Vec<_> = iter.flat_map(|x| x.unwrap().get_int(0)).collect(); + + assert_eq!(values, &[4, 5, 6, 7, 2, 3, 0, 1]); } #[test] fn test_file_reader_into_iter_project() { let path = get_test_path("alltypes_plain.parquet"); - let result = vec![path] - .iter() - .map(|p| 
SerializedFileReader::try_from(p.as_path()).unwrap()) - .flat_map(|r| { - let schema = "message schema { OPTIONAL INT32 id; }"; - let proj = parse_message_type(schema).ok(); - - r.into_iter().project(proj).unwrap() - }) - .map(|r| format!("{}", r.unwrap())) - .collect::>() - .join(","); + let reader = SerializedFileReader::try_from(path.as_path()).unwrap(); + let schema = "message schema { OPTIONAL INT32 id; }"; + let proj = parse_message_type(schema).ok(); + let iter = reader.into_iter().project(proj).unwrap(); + let values: Vec<_> = iter.flat_map(|x| x.unwrap().get_int(0)).collect(); - assert_eq!( - result, - "{id: 4},{id: 5},{id: 6},{id: 7},{id: 2},{id: 3},{id: 0},{id: 1}" - ); + assert_eq!(values, &[4, 5, 6, 7, 2, 3, 0, 1]); } #[test] diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index 5a1d8406575c..1069eab15f23 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -828,7 +828,7 @@ mod tests { use crate::errors::Result; use crate::file::reader::{FileReader, SerializedFileReader}; - use crate::record::api::{Field, Row, RowAccessor, RowFormatter}; + use crate::record::api::{Field, Row, RowAccessor}; use crate::schema::parser::parse_message_type; use crate::util::test_common::file_util::{get_test_file, get_test_path}; use std::convert::TryFrom; @@ -1500,33 +1500,26 @@ mod tests { #[test] fn test_file_reader_iter() { let path = get_test_path("alltypes_plain.parquet"); - let vec = vec![path] - .iter() - .map(|p| SerializedFileReader::try_from(p.as_path()).unwrap()) - .flat_map(|r| RowIter::from_file_into(Box::new(r))) - .flat_map(|r| r.unwrap().get_int(0)) - .collect::>(); - - assert_eq!(vec, vec![4, 5, 6, 7, 2, 3, 0, 1]); + let reader = SerializedFileReader::try_from(path.as_path()).unwrap(); + let iter = RowIter::from_file_into(Box::new(reader)); + + let values: Vec<_> = iter.flat_map(|r| r.unwrap().get_int(0)).collect(); + assert_eq!(values, &[4, 5, 6, 7, 2, 3, 0, 1]); } #[test] fn test_file_reader_iter_projection() { let path = get_test_path("alltypes_plain.parquet"); - let values = vec![path] - .iter() - .map(|p| SerializedFileReader::try_from(p.as_path()).unwrap()) - .flat_map(|r| { - let schema = "message schema { OPTIONAL INT32 id; }"; - let proj = parse_message_type(schema).ok(); - - RowIter::from_file_into(Box::new(r)).project(proj).unwrap() - }) - .map(|r| format!("id:{}", r.unwrap().fmt(0))) - .collect::>() - .join(", "); - - assert_eq!(values, "id:4, id:5, id:6, id:7, id:2, id:3, id:0, id:1"); + let reader = SerializedFileReader::try_from(path.as_path()).unwrap(); + let schema = "message schema { OPTIONAL INT32 id; }"; + let proj = parse_message_type(schema).ok(); + + let iter = RowIter::from_file_into(Box::new(reader)) + .project(proj) + .unwrap(); + let values: Vec<_> = iter.flat_map(|r| r.unwrap().get_int(0)).collect(); + + assert_eq!(values, &[4, 5, 6, 7, 2, 3, 0, 1]); } #[test] diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index f0e1a7f27a8f..f4cb3a9956d6 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -1652,8 +1652,8 @@ mod tests { // required int64 item1 2 1 // optional boolean item2 3 1 // repeated int32 item3 3 2 - let ex_max_def_levels = vec![0, 1, 1, 2, 3, 3]; - let ex_max_rep_levels = vec![0, 0, 1, 1, 1, 2]; + let ex_max_def_levels = [0, 1, 1, 2, 3, 3]; + let ex_max_rep_levels = [0, 0, 1, 1, 1, 2]; for i in 0..nleaves { let col = descr.column(i); From f2ba18b89ab19a8e410f37de390c8c6bd5b0a86f Mon Sep 17 00:00:00 2001 From: Gordon Wang 
<36049150+gordonwang0@users.noreply.github.com> Date: Fri, 25 Aug 2023 01:07:26 -0700 Subject: [PATCH 1179/1411] Add `with_proxy_ca_certificate` and `with_proxy_excludes` (#4714) * Add proxy_auth and proxy_exclude * Add proxy_ca_certificate * Add public fns * rename fn * clippy fix --- object_store/src/aws/mod.rs | 17 ++++++++++++ object_store/src/azure/mod.rs | 19 ++++++++++++- object_store/src/client/mod.rs | 50 ++++++++++++++++++++++++++++++++-- object_store/src/gcp/mod.rs | 17 ++++++++++++ 4 files changed, 99 insertions(+), 4 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 7e16b5a1baf6..db3e1b9a4bbe 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -946,6 +946,23 @@ impl AmazonS3Builder { self } + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.client_options = self + .client_options + .with_proxy_ca_certificate(proxy_ca_certificate); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); + self + } + /// Sets the client options, overriding any already set pub fn with_client_options(mut self, options: ClientOptions) -> Self { self.client_options = options; diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 27bbbfb64d3f..2a07710d09d6 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -436,7 +436,7 @@ pub enum AzureConfigKey { /// Use object store with url scheme account.dfs.fabric.microsoft.com /// - /// Supported keys: + /// Supported keys: /// - `azure_use_fabric_endpoint` /// - `use_fabric_endpoint` UseFabricEndpoint, @@ -909,6 +909,23 @@ impl MicrosoftAzureBuilder { self } + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.client_options = self + .client_options + .with_proxy_ca_certificate(proxy_ca_certificate); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); + self + } + /// Sets the client options, overriding any already set pub fn with_client_options(mut self, options: ClientOptions) -> Self { self.client_options = options; diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 5f3a042be46a..d4995a5b143f 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -48,7 +48,7 @@ use std::sync::Arc; use std::time::Duration; use reqwest::header::{HeaderMap, HeaderValue}; -use reqwest::{Client, ClientBuilder, Proxy, RequestBuilder}; +use reqwest::{Client, ClientBuilder, NoProxy, Proxy, RequestBuilder}; use serde::{Deserialize, Serialize}; use crate::config::{fmt_duration, ConfigValue}; @@ -103,6 +103,10 @@ pub enum ClientConfigKey { PoolMaxIdlePerHost, /// HTTP proxy to use for requests ProxyUrl, + /// PEM-formatted CA certificate for proxy connections + ProxyCaCertificate, + /// List of hosts that bypass proxy + ProxyExcludes, /// Request timeout /// /// The timeout is applied from when the request starts connecting until the @@ -127,6 +131,8 @@ impl AsRef for ClientConfigKey { Self::PoolIdleTimeout => "pool_idle_timeout", Self::PoolMaxIdlePerHost => 
"pool_max_idle_per_host", Self::ProxyUrl => "proxy_url", + Self::ProxyCaCertificate => "proxy_ca_certificate", + Self::ProxyExcludes => "proxy_excludes", Self::Timeout => "timeout", Self::UserAgent => "user_agent", } @@ -168,6 +174,8 @@ pub struct ClientOptions { default_content_type: Option, default_headers: Option, proxy_url: Option, + proxy_ca_certificate: Option, + proxy_excludes: Option, allow_http: ConfigValue, allow_insecure: ConfigValue, timeout: Option>, @@ -216,6 +224,10 @@ impl ClientOptions { self.pool_max_idle_per_host = Some(ConfigValue::Deferred(value.into())) } ClientConfigKey::ProxyUrl => self.proxy_url = Some(value.into()), + ClientConfigKey::ProxyCaCertificate => { + self.proxy_ca_certificate = Some(value.into()) + } + ClientConfigKey::ProxyExcludes => self.proxy_excludes = Some(value.into()), ClientConfigKey::Timeout => { self.timeout = Some(ConfigValue::Deferred(value.into())) } @@ -255,6 +267,8 @@ impl ClientOptions { self.pool_max_idle_per_host.as_ref().map(|v| v.to_string()) } ClientConfigKey::ProxyUrl => self.proxy_url.clone(), + ClientConfigKey::ProxyCaCertificate => self.proxy_ca_certificate.clone(), + ClientConfigKey::ProxyExcludes => self.proxy_excludes.clone(), ClientConfigKey::Timeout => self.timeout.as_ref().map(fmt_duration), ClientConfigKey::UserAgent => self .user_agent @@ -329,12 +343,27 @@ impl ClientOptions { self } - /// Set an HTTP proxy to use for requests + /// Set a proxy URL to use for requests pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { self.proxy_url = Some(proxy_url.into()); self } + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.proxy_ca_certificate = Some(proxy_ca_certificate.into()); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.proxy_excludes = Some(proxy_excludes.into()); + self + } + /// Set a request timeout /// /// The timeout is applied from when the request starts connecting until the @@ -429,7 +458,22 @@ impl ClientOptions { } if let Some(proxy) = &self.proxy_url { - let proxy = Proxy::all(proxy).map_err(map_client_error)?; + let mut proxy = Proxy::all(proxy).map_err(map_client_error)?; + + if let Some(certificate) = &self.proxy_ca_certificate { + let certificate = + reqwest::tls::Certificate::from_pem(certificate.as_bytes()) + .map_err(map_client_error)?; + + builder = builder.add_root_certificate(certificate); + } + + if let Some(proxy_excludes) = &self.proxy_excludes { + let no_proxy = NoProxy::from_string(proxy_excludes); + + proxy = proxy.no_proxy(no_proxy); + } + builder = builder.proxy(proxy); } diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 58a5d19f3cb7..3f5bf629d180 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -992,6 +992,23 @@ impl GoogleCloudStorageBuilder { self } + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.client_options = self + .client_options + .with_proxy_ca_certificate(proxy_ca_certificate); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); + self + } + /// Sets the client options, overriding any already set pub fn with_client_options(mut self, options: 
ClientOptions) -> Self { self.client_options = options; From f332e23b3df9bd7502e4c881b24ba9f9fccb90d9 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Fri, 25 Aug 2023 10:56:34 +0200 Subject: [PATCH 1180/1411] chore: fix libpython in CI (#4738) --- .github/workflows/docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 4ca71f464591..d3f8e9046510 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -49,7 +49,7 @@ jobs: - name: Install python dev run: | apt update - apt install -y libpython3.9-dev + apt install -y libpython3.11-dev - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: From 2fe9ef11e5b3277d29a0ea79414a3a074c1c6667 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 25 Aug 2023 11:38:05 +0100 Subject: [PATCH 1181/1411] Cleanup length and bit_length kernels (#4718) * Cleanup length and bit_length kernels * Clippy * Review feedback --- arrow-string/src/length.rs | 286 +++++++++++++------------------------ 1 file changed, 100 insertions(+), 186 deletions(-) diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index fb47f70af342..ab5fbb0c6425 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -16,139 +16,32 @@ // under the License. //! Defines kernel for length of string arrays and binary arrays -#![allow(clippy::redundant_closure_call)] use arrow_array::*; use arrow_array::{cast::AsArray, types::*}; -use arrow_buffer::Buffer; -use arrow_data::ArrayData; +use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer}; use arrow_schema::{ArrowError, DataType}; use std::sync::Arc; -macro_rules! unary_offsets { - ($array: expr, $data_type: expr, $op: expr) => {{ - let slice = $array.value_offsets(); - - let lengths = slice.windows(2).map(|offset| $op(offset[1] - offset[0])); - - // JUSTIFICATION - // Benefit - // ~60% speedup - // Soundness - // `values` come from a slice iterator with a known size. - let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) }; - - let null_bit_buffer = $array.nulls().map(|b| b.inner().sliced()); - - let data = unsafe { - ArrayData::new_unchecked( - $data_type, - $array.len(), - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ) - }; - make_array(data) - }}; +fn length_impl( + offsets: &OffsetBuffer, + nulls: Option<&NullBuffer>, +) -> ArrayRef { + let v: Vec<_> = offsets + .windows(2) + .map(|w| w[1].sub_wrapping(w[0])) + .collect(); + Arc::new(PrimitiveArray::
<P>
::new(v.into(), nulls.cloned())) } -macro_rules! kernel_dict { - ($array: ident, $kernel: expr, $kt: ident, $($t: ident: $gt: ident), *) => { - match $kt.as_ref() { - $(&DataType::$t => { - let dict = $array - .as_any() - .downcast_ref::>() - .unwrap_or_else(|| { - panic!("Expect 'DictionaryArray<{}>' but got array of data type {:?}", - stringify!($gt), $array.data_type()) - }); - let values = $kernel(dict.values())?; - let result = DictionaryArray::try_new(dict.keys().clone(), values)?; - Ok(Arc::new(result)) - }, - )* - t => panic!("Unsupported dictionary key type: {}", t) - } - } -} - -fn length_list(array: &dyn Array) -> ArrayRef -where - O: OffsetSizeTrait, - T: ArrowPrimitiveType, - T::Native: OffsetSizeTrait, -{ - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - unary_offsets!(array, T::DATA_TYPE, |x| x) -} - -fn length_list_fixed_size(array: &dyn Array, length: i32) -> ArrayRef { - let array = array.as_fixed_size_list(); - let length_list = array.len(); - let buffer = Buffer::from_vec(vec![length; length_list]); - let data = Int32Array::new(buffer.into(), array.nulls().cloned()); - Arc::new(data) -} - -fn length_binary(array: &dyn Array) -> ArrayRef -where - O: OffsetSizeTrait, - T: ArrowPrimitiveType, - T::Native: OffsetSizeTrait, -{ - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - unary_offsets!(array, T::DATA_TYPE, |x| x) -} - -fn length_string(array: &dyn Array) -> ArrayRef -where - O: OffsetSizeTrait, - T: ArrowPrimitiveType, - T::Native: OffsetSizeTrait, -{ - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - unary_offsets!(array, T::DATA_TYPE, |x| x) -} - -fn bit_length_binary(array: &dyn Array) -> ArrayRef -where - O: OffsetSizeTrait, - T: ArrowPrimitiveType, - T::Native: OffsetSizeTrait, -{ - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - let bits_in_bytes = O::from_usize(8).unwrap(); - unary_offsets!(array, T::DATA_TYPE, |x| x * bits_in_bytes) -} - -fn bit_length_string(array: &dyn Array) -> ArrayRef -where - O: OffsetSizeTrait, - T: ArrowPrimitiveType, - T::Native: OffsetSizeTrait, -{ - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - let bits_in_bytes = O::from_usize(8).unwrap(); - unary_offsets!(array, T::DATA_TYPE, |x| x * bits_in_bytes) +fn bit_length_impl( + offsets: &OffsetBuffer, + nulls: Option<&NullBuffer>, +) -> ArrayRef { + let bits = P::Native::usize_as(8); + let c = |w: &[P::Native]| w[1].sub_wrapping(w[0]).mul_wrapping(bits); + let v: Vec<_> = offsets.windows(2).map(c).collect(); + Arc::new(PrimitiveArray::
<P>
::new(v.into(), nulls.cloned())) } /// Returns an array of Int32/Int64 denoting the length of each value in the array. @@ -159,29 +52,39 @@ where /// or DictionaryArray with above Arrays as values /// * length of null is null. pub fn length(array: &dyn Array) -> Result { + if let Some(d) = array.as_any_dictionary_opt() { + let lengths = length(d.values().as_ref())?; + return Ok(d.with_values(lengths)); + } + match array.data_type() { - DataType::Dictionary(kt, _) => { - kernel_dict!( - array, - |a| { length(a) }, - kt, - Int8: Int8Type, - Int16: Int16Type, - Int32: Int32Type, - Int64: Int64Type, - UInt8: UInt8Type, - UInt16: UInt16Type, - UInt32: UInt32Type, - UInt64: UInt64Type - ) + DataType::List(_) => { + let list = array.as_list::(); + Ok(length_impl::(list.offsets(), list.nulls())) + } + DataType::LargeList(_) => { + let list = array.as_list::(); + Ok(length_impl::(list.offsets(), list.nulls())) + } + DataType::Utf8 => { + let list = array.as_string::(); + Ok(length_impl::(list.offsets(), list.nulls())) + } + DataType::LargeUtf8 => { + let list = array.as_string::(); + Ok(length_impl::(list.offsets(), list.nulls())) } - DataType::List(_) => Ok(length_list::(array)), - DataType::LargeList(_) => Ok(length_list::(array)), - DataType::Utf8 => Ok(length_string::(array)), - DataType::LargeUtf8 => Ok(length_string::(array)), - DataType::Binary => Ok(length_binary::(array)), - DataType::LargeBinary => Ok(length_binary::(array)), - DataType::FixedSizeList(_, len) => Ok(length_list_fixed_size(array, *len)), + DataType::Binary => { + let list = array.as_binary::(); + Ok(length_impl::(list.offsets(), list.nulls())) + } + DataType::LargeBinary => { + let list = array.as_binary::(); + Ok(length_impl::(list.offsets(), list.nulls())) + } + DataType::FixedSizeBinary(len) | DataType::FixedSizeList(_, len) => Ok(Arc::new( + Int32Array::new(vec![*len; array.len()].into(), array.nulls().cloned()), + )), other => Err(ArrowError::ComputeError(format!( "length not supported for {other:?}" ))), @@ -195,26 +98,40 @@ pub fn length(array: &dyn Array) -> Result { /// * bit_length of null is null. 
/// * bit_length is in number of bits pub fn bit_length(array: &dyn Array) -> Result { + if let Some(d) = array.as_any_dictionary_opt() { + let lengths = bit_length(d.values().as_ref())?; + return Ok(d.with_values(lengths)); + } + match array.data_type() { - DataType::Dictionary(kt, _) => { - kernel_dict!( - array, - |a| { bit_length(a) }, - kt, - Int8: Int8Type, - Int16: Int16Type, - Int32: Int32Type, - Int64: Int64Type, - UInt8: UInt8Type, - UInt16: UInt16Type, - UInt32: UInt32Type, - UInt64: UInt64Type - ) + DataType::List(_) => { + let list = array.as_list::(); + Ok(bit_length_impl::(list.offsets(), list.nulls())) + } + DataType::LargeList(_) => { + let list = array.as_list::(); + Ok(bit_length_impl::(list.offsets(), list.nulls())) + } + DataType::Utf8 => { + let list = array.as_string::(); + Ok(bit_length_impl::(list.offsets(), list.nulls())) + } + DataType::LargeUtf8 => { + let list = array.as_string::(); + Ok(bit_length_impl::(list.offsets(), list.nulls())) + } + DataType::Binary => { + let list = array.as_binary::(); + Ok(bit_length_impl::(list.offsets(), list.nulls())) + } + DataType::LargeBinary => { + let list = array.as_binary::(); + Ok(bit_length_impl::(list.offsets(), list.nulls())) } - DataType::Utf8 => Ok(bit_length_string::(array)), - DataType::LargeUtf8 => Ok(bit_length_string::(array)), - DataType::Binary => Ok(bit_length_binary::(array)), - DataType::LargeBinary => Ok(bit_length_binary::(array)), + DataType::FixedSizeBinary(len) => Ok(Arc::new(Int32Array::new( + vec![*len * 8; array.len()].into(), + array.nulls().cloned(), + ))), other => Err(ArrowError::ComputeError(format!( "bit_length not supported for {other:?}" ))), @@ -225,21 +142,15 @@ pub fn bit_length(array: &dyn Array) -> Result { mod tests { use super::*; use arrow_array::cast::AsArray; - use arrow_buffer::NullBuffer; + use arrow_buffer::{Buffer, NullBuffer}; + use arrow_data::ArrayData; use arrow_schema::Field; - fn double_vec(v: Vec) -> Vec { - [&v[..], &v[..]].concat() - } - fn length_cases_string() -> Vec<(Vec<&'static str>, usize, Vec)> { // a large array - let mut values = vec!["one", "on", "o", ""]; - let mut expected = vec![3, 2, 1, 0]; - for _ in 0..10 { - values = double_vec(values); - expected = double_vec(expected); - } + let values = ["one", "on", "o", ""]; + let values = values.into_iter().cycle().take(4096).collect(); + let expected = [3, 2, 1, 0].into_iter().cycle().take(4096).collect(); vec![ (vec!["hello", " ", "world"], 3, vec![5, 1, 5]), @@ -273,7 +184,6 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore)] // running forever fn length_test_string() { length_cases_string() .into_iter() @@ -289,7 +199,6 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore)] // running forever fn length_test_large_string() { length_cases_string() .into_iter() @@ -460,12 +369,9 @@ mod tests { fn bit_length_cases() -> Vec<(Vec<&'static str>, usize, Vec)> { // a large array - let mut values = vec!["one", "on", "o", ""]; - let mut expected = vec![24, 16, 8, 0]; - for _ in 0..10 { - values = double_vec(values); - expected = double_vec(expected); - } + let values = ["one", "on", "o", ""]; + let values = values.into_iter().cycle().take(4096).collect(); + let expected = [24, 16, 8, 0].into_iter().cycle().take(4096).collect(); vec![ (vec!["hello", " ", "world", "!"], 4, vec![40, 8, 40, 8]), @@ -476,7 +382,6 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore)] // error: this test uses too much memory to run on CI fn bit_length_test_string() { bit_length_cases() .into_iter() @@ -492,7 +397,6 @@ mod tests { } #[test] - 
#[cfg_attr(miri, ignore)] // error: this test uses too much memory to run on CI fn bit_length_test_large_string() { bit_length_cases() .into_iter() @@ -731,11 +635,21 @@ mod tests { let list_array = FixedSizeListArray::from(list_data); let lengths = length(&list_array).unwrap(); - let lengths = lengths.as_any().downcast_ref::().unwrap(); + let lengths = lengths.as_primitive::(); assert_eq!(lengths.len(), 3); assert_eq!(lengths.value(0), 3); assert!(lengths.is_null(1)); assert_eq!(lengths.value(2), 3); } + + #[test] + fn test_fixed_size_binary() { + let array = FixedSizeBinaryArray::new(4, [0; 16].into(), None); + let result = length(&array).unwrap(); + assert_eq!(result.as_ref(), &Int32Array::from(vec![4; 4])); + + let result = bit_length(&array).unwrap(); + assert_eq!(result.as_ref(), &Int32Array::from(vec![32; 4])); + } } From 4533271b4b221a5e28fa3215cb3cbddaafafdd84 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Fri, 25 Aug 2023 12:45:20 +0200 Subject: [PATCH 1182/1411] feat: expose DoGet response headers & trailers (#4727) * feat: expose DoGet response headers & trailers * docs: improve Co-authored-by: Andrew Lamb * refactor: address review comments --------- Co-authored-by: Andrew Lamb --- arrow-flight/Cargo.toml | 3 + arrow-flight/src/client.rs | 20 ++- arrow-flight/src/decode.rs | 44 ++++++- arrow-flight/src/lib.rs | 3 + arrow-flight/src/trailers.rs | 97 ++++++++++++++ arrow-flight/tests/client.rs | 34 ++++- arrow-flight/tests/common/server.rs | 6 +- arrow-flight/tests/common/trailers_layer.rs | 138 ++++++++++++++++++++ 8 files changed, 327 insertions(+), 18 deletions(-) create mode 100644 arrow-flight/src/trailers.rs create mode 100644 arrow-flight/tests/common/trailers_layer.rs diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 3ed426a21fab..1a53dbddb13d 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -67,6 +67,9 @@ cli = ["arrow-cast/prettyprint", "clap", "tracing-log", "tracing-subscriber", "t [dev-dependencies] arrow-cast = { workspace = true, features = ["prettyprint"] } assert_cmd = "2.0.8" +http = "0.2.9" +http-body = "0.4.5" +pin-project-lite = "0.2" tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } tower = "0.4.13" diff --git a/arrow-flight/src/client.rs b/arrow-flight/src/client.rs index 2c952fb3bfbf..8793f7834bfb 100644 --- a/arrow-flight/src/client.rs +++ b/arrow-flight/src/client.rs @@ -18,9 +18,9 @@ use std::task::Poll; use crate::{ - decode::FlightRecordBatchStream, flight_service_client::FlightServiceClient, Action, - ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, - HandshakeRequest, PutResult, Ticket, + decode::FlightRecordBatchStream, flight_service_client::FlightServiceClient, + trailers::extract_lazy_trailers, Action, ActionType, Criteria, Empty, FlightData, + FlightDescriptor, FlightInfo, HandshakeRequest, PutResult, Ticket, }; use arrow_schema::Schema; use bytes::Bytes; @@ -204,16 +204,14 @@ impl FlightClient { pub async fn do_get(&mut self, ticket: Ticket) -> Result { let request = self.make_request(ticket); - let response_stream = self - .inner - .do_get(request) - .await? 
- .into_inner() - .map_err(FlightError::Tonic); + let (md, response_stream, _ext) = self.inner.do_get(request).await?.into_parts(); + let (response_stream, trailers) = extract_lazy_trailers(response_stream); Ok(FlightRecordBatchStream::new_from_flight_data( - response_stream, - )) + response_stream.map_err(FlightError::Tonic), + ) + .with_headers(md) + .with_trailers(trailers)) } /// Make a `GetFlightInfo` call to the server with the provided diff --git a/arrow-flight/src/decode.rs b/arrow-flight/src/decode.rs index df74923332e3..dfcdd260602c 100644 --- a/arrow-flight/src/decode.rs +++ b/arrow-flight/src/decode.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::{utils::flight_data_to_arrow_batch, FlightData}; +use crate::{trailers::LazyTrailers, utils::flight_data_to_arrow_batch, FlightData}; use arrow_array::{ArrayRef, RecordBatch}; use arrow_buffer::Buffer; use arrow_schema::{Schema, SchemaRef}; @@ -24,6 +24,7 @@ use futures::{ready, stream::BoxStream, Stream, StreamExt}; use std::{ collections::HashMap, convert::TryFrom, fmt::Debug, pin::Pin, sync::Arc, task::Poll, }; +use tonic::metadata::MetadataMap; use crate::error::{FlightError, Result}; @@ -82,13 +83,23 @@ use crate::error::{FlightError, Result}; /// ``` #[derive(Debug)] pub struct FlightRecordBatchStream { + /// Optional grpc header metadata. + headers: MetadataMap, + + /// Optional grpc trailer metadata. + trailers: Option, + inner: FlightDataDecoder, } impl FlightRecordBatchStream { /// Create a new [`FlightRecordBatchStream`] from a decoded stream pub fn new(inner: FlightDataDecoder) -> Self { - Self { inner } + Self { + inner, + headers: MetadataMap::default(), + trailers: None, + } } /// Create a new [`FlightRecordBatchStream`] from a stream of [`FlightData`] @@ -98,9 +109,37 @@ impl FlightRecordBatchStream { { Self { inner: FlightDataDecoder::new(inner), + headers: MetadataMap::default(), + trailers: None, + } + } + + /// Record response headers. + pub fn with_headers(self, headers: MetadataMap) -> Self { + Self { headers, ..self } + } + + /// Record response trailers. + pub fn with_trailers(self, trailers: LazyTrailers) -> Self { + Self { + trailers: Some(trailers), + ..self } } + /// Headers attached to this stream. + pub fn headers(&self) -> &MetadataMap { + &self.headers + } + + /// Trailers attached to this stream. + /// + /// Note that this will return `None` until the entire stream is consumed. + /// Only after calling `next()` returns `None`, might any available trailers be returned. + pub fn trailers(&self) -> Option { + self.trailers.as_ref().and_then(|trailers| trailers.get()) + } + /// Has a message defining the schema been received yet? #[deprecated = "use schema().is_some() instead"] pub fn got_schema(&self) -> bool { @@ -117,6 +156,7 @@ impl FlightRecordBatchStream { self.inner } } + impl futures::Stream for FlightRecordBatchStream { type Item = Result; diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 4163f2ceaa27..04edf266389c 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -111,6 +111,9 @@ pub use gen::Result; pub use gen::SchemaResult; pub use gen::Ticket; +/// Helper to extract HTTP/gRPC trailers from a tonic stream. 
+mod trailers; + pub mod utils; #[cfg(feature = "flight-sql-experimental")] diff --git a/arrow-flight/src/trailers.rs b/arrow-flight/src/trailers.rs new file mode 100644 index 000000000000..d652542da779 --- /dev/null +++ b/arrow-flight/src/trailers.rs @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + pin::Pin, + sync::{Arc, Mutex}, + task::{Context, Poll}, +}; + +use futures::{ready, FutureExt, Stream, StreamExt}; +use tonic::{metadata::MetadataMap, Status, Streaming}; + +/// Extract [`LazyTrailers`] from [`Streaming`] [tonic] response. +/// +/// Note that [`LazyTrailers`] has inner mutability and will only hold actual data after [`ExtractTrailersStream`] is +/// fully consumed (dropping it is not required though). +pub fn extract_lazy_trailers( + s: Streaming, +) -> (ExtractTrailersStream, LazyTrailers) { + let trailers: SharedTrailers = Default::default(); + let stream = ExtractTrailersStream { + inner: s, + trailers: Arc::clone(&trailers), + }; + let lazy_trailers = LazyTrailers { trailers }; + (stream, lazy_trailers) +} + +type SharedTrailers = Arc>>; + +/// [Stream] that stores the gRPC trailers into [`LazyTrailers`]. +/// +/// See [`extract_lazy_trailers`] for construction. +#[derive(Debug)] +pub struct ExtractTrailersStream { + inner: Streaming, + trailers: SharedTrailers, +} + +impl Stream for ExtractTrailersStream { + type Item = Result; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + let res = ready!(self.inner.poll_next_unpin(cx)); + + if res.is_none() { + // stream exhausted => trailers should available + if let Some(trailers) = self + .inner + .trailers() + .now_or_never() + .and_then(|res| res.ok()) + .flatten() + { + *self.trailers.lock().expect("poisoned") = Some(trailers); + } + } + + Poll::Ready(res) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +/// gRPC trailers that are extracted by [`ExtractTrailersStream`]. +/// +/// See [`extract_lazy_trailers`] for construction. +#[derive(Debug)] +pub struct LazyTrailers { + trailers: SharedTrailers, +} + +impl LazyTrailers { + /// gRPC trailers that are known at the end of a stream. 
+ pub fn get(&self) -> Option { + self.trailers.lock().expect("poisoned").clone() + } +} diff --git a/arrow-flight/tests/client.rs b/arrow-flight/tests/client.rs index 8ea542879a27..1b9891e121fa 100644 --- a/arrow-flight/tests/client.rs +++ b/arrow-flight/tests/client.rs @@ -19,6 +19,7 @@ mod common { pub mod server; + pub mod trailers_layer; } use arrow_array::{RecordBatch, UInt64Array}; use arrow_flight::{ @@ -28,7 +29,7 @@ use arrow_flight::{ }; use arrow_schema::{DataType, Field, Schema}; use bytes::Bytes; -use common::server::TestFlightServer; +use common::{server::TestFlightServer, trailers_layer::TrailersLayer}; use futures::{Future, StreamExt, TryStreamExt}; use tokio::{net::TcpListener, task::JoinHandle}; use tonic::{ @@ -158,18 +159,42 @@ async fn test_do_get() { let response = vec![Ok(batch.clone())]; test_server.set_do_get_response(response); - let response_stream = client + let mut response_stream = client .do_get(ticket.clone()) .await .expect("error making request"); + assert_eq!( + response_stream + .headers() + .get("test-resp-header") + .expect("header exists") + .to_str() + .unwrap(), + "some_val", + ); + + // trailers are not available before stream exhaustion + assert!(response_stream.trailers().is_none()); + let expected_response = vec![batch]; - let response: Vec<_> = response_stream + let response: Vec<_> = (&mut response_stream) .try_collect() .await .expect("Error streaming data"); - assert_eq!(response, expected_response); + + assert_eq!( + response_stream + .trailers() + .expect("stream exhausted") + .get("test-trailer") + .expect("trailer exists") + .to_str() + .unwrap(), + "trailer_val", + ); + assert_eq!(test_server.take_do_get_request(), Some(ticket)); ensure_metadata(&client, &test_server); }) @@ -932,6 +957,7 @@ impl TestFixture { let serve_future = tonic::transport::Server::builder() .timeout(server_timeout) + .layer(TrailersLayer) .add_service(test_server.service()) .serve_with_incoming_shutdown( tokio_stream::wrappers::TcpListenerStream::new(listener), diff --git a/arrow-flight/tests/common/server.rs b/arrow-flight/tests/common/server.rs index b87019d632c4..c575d12bbf52 100644 --- a/arrow-flight/tests/common/server.rs +++ b/arrow-flight/tests/common/server.rs @@ -359,7 +359,11 @@ impl FlightService for TestFlightServer { .build(batch_stream) .map_err(Into::into); - Ok(Response::new(stream.boxed())) + let mut resp = Response::new(stream.boxed()); + resp.metadata_mut() + .insert("test-resp-header", "some_val".parse().unwrap()); + + Ok(resp) } async fn do_put( diff --git a/arrow-flight/tests/common/trailers_layer.rs b/arrow-flight/tests/common/trailers_layer.rs new file mode 100644 index 000000000000..9e6be0dcf0da --- /dev/null +++ b/arrow-flight/tests/common/trailers_layer.rs @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::future::Future; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use futures::ready; +use http::{HeaderValue, Request, Response}; +use http_body::SizeHint; +use pin_project_lite::pin_project; +use tower::{Layer, Service}; + +#[derive(Debug, Copy, Clone, Default)] +pub struct TrailersLayer; + +impl Layer for TrailersLayer { + type Service = TrailersService; + + fn layer(&self, service: S) -> Self::Service { + TrailersService { service } + } +} + +#[derive(Debug, Clone)] +pub struct TrailersService { + service: S, +} + +impl Service> for TrailersService +where + S: Service, Response = Response>, + ResBody: http_body::Body, +{ + type Response = Response>; + type Error = S::Error; + type Future = WrappedFuture; + + fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { + self.service.poll_ready(cx) + } + + fn call(&mut self, request: Request) -> Self::Future { + WrappedFuture { + inner: self.service.call(request), + } + } +} + +pin_project! { + #[derive(Debug)] + pub struct WrappedFuture { + #[pin] + inner: F, + } +} + +impl Future for WrappedFuture +where + F: Future, Error>>, + ResBody: http_body::Body, +{ + type Output = Result>, Error>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let result: Result, Error> = + ready!(self.as_mut().project().inner.poll(cx)); + + match result { + Ok(response) => { + Poll::Ready(Ok(response.map(|body| WrappedBody { inner: body }))) + } + Err(e) => Poll::Ready(Err(e)), + } + } +} + +pin_project! { + #[derive(Debug)] + pub struct WrappedBody { + #[pin] + inner: B, + } +} + +impl http_body::Body for WrappedBody { + type Data = B::Data; + type Error = B::Error; + + fn poll_data( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + self.as_mut().project().inner.poll_data(cx) + } + + fn poll_trailers( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll, Self::Error>> { + let result: Result, Self::Error> = + ready!(self.as_mut().project().inner.poll_trailers(cx)); + + let mut trailers = http::header::HeaderMap::new(); + trailers.insert("test-trailer", HeaderValue::from_static("trailer_val")); + + match result { + Ok(Some(mut existing)) => { + existing.extend(trailers.iter().map(|(k, v)| (k.clone(), v.clone()))); + Poll::Ready(Ok(Some(existing))) + } + Ok(None) => Poll::Ready(Ok(Some(trailers))), + Err(e) => Poll::Ready(Err(e)), + } + } + + fn is_end_stream(&self) -> bool { + self.inner.is_end_stream() + } + + fn size_hint(&self) -> SizeHint { + self.inner.size_hint() + } +} From 221f5d2fe910afe15d7f7d35a87a803914451d29 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 25 Aug 2023 20:24:14 +0100 Subject: [PATCH 1183/1411] Datum based like kernels (#4595) (#4732) * Datum based like kernels (#4595) * Clippy * More Clippy * Review feedback --- arrow-flight/src/sql/metadata/db_schemas.rs | 14 +- arrow-flight/src/sql/metadata/tables.rs | 22 +- arrow-string/src/lib.rs | 1 + arrow-string/src/like.rs | 961 ++++++-------------- arrow-string/src/predicate.rs | 229 +++++ arrow/benches/comparison_kernels.rs | 33 +- 6 files changed, 555 insertions(+), 705 deletions(-) create mode 100644 arrow-string/src/predicate.rs diff --git a/arrow-flight/src/sql/metadata/db_schemas.rs b/arrow-flight/src/sql/metadata/db_schemas.rs index 20780a116032..642802b058d5 100644 --- a/arrow-flight/src/sql/metadata/db_schemas.rs +++ 
b/arrow-flight/src/sql/metadata/db_schemas.rs @@ -22,11 +22,11 @@ use std::sync::Arc; use arrow_arith::boolean::and; -use arrow_array::{builder::StringBuilder, ArrayRef, RecordBatch, Scalar, StringArray}; +use arrow_array::{builder::StringBuilder, ArrayRef, RecordBatch, StringArray}; use arrow_ord::cmp::eq; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use arrow_select::{filter::filter_record_batch, take::take}; -use arrow_string::like::like_utf8_scalar; +use arrow_string::like::like; use once_cell::sync::Lazy; use super::lexsort_to_indices; @@ -122,15 +122,13 @@ impl GetDbSchemasBuilder { if let Some(db_schema_filter_pattern) = db_schema_filter_pattern { // use like kernel to get wildcard matching - filters.push(like_utf8_scalar( - &db_schema_name, - &db_schema_filter_pattern, - )?) + let scalar = StringArray::new_scalar(db_schema_filter_pattern); + filters.push(like(&db_schema_name, &scalar)?) } if let Some(catalog_filter_name) = catalog_filter { - let scalar = StringArray::from_iter_values([catalog_filter_name]); - filters.push(eq(&catalog_name, &Scalar::new(&scalar))?); + let scalar = StringArray::new_scalar(catalog_filter_name); + filters.push(eq(&catalog_name, &scalar)?); } // `AND` any filters together diff --git a/arrow-flight/src/sql/metadata/tables.rs b/arrow-flight/src/sql/metadata/tables.rs index de55f0624f2f..00502a76db53 100644 --- a/arrow-flight/src/sql/metadata/tables.rs +++ b/arrow-flight/src/sql/metadata/tables.rs @@ -23,11 +23,11 @@ use std::sync::Arc; use arrow_arith::boolean::{and, or}; use arrow_array::builder::{BinaryBuilder, StringBuilder}; -use arrow_array::{ArrayRef, RecordBatch, Scalar, StringArray}; +use arrow_array::{ArrayRef, RecordBatch, StringArray}; use arrow_ord::cmp::eq; use arrow_schema::{DataType, Field, Schema, SchemaRef}; use arrow_select::{filter::filter_record_batch, take::take}; -use arrow_string::like::like_utf8_scalar; +use arrow_string::like::like; use once_cell::sync::Lazy; use super::lexsort_to_indices; @@ -184,16 +184,13 @@ impl GetTablesBuilder { let mut filters = vec![]; if let Some(catalog_filter_name) = catalog_filter { - let scalar = StringArray::from_iter_values([catalog_filter_name]); - filters.push(eq(&catalog_name, &Scalar::new(&scalar))?); + let scalar = StringArray::new_scalar(catalog_filter_name); + filters.push(eq(&catalog_name, &scalar)?); } let tt_filter = table_types_filter .into_iter() - .map(|tt| { - let scalar = StringArray::from_iter_values([tt]); - eq(&table_type, &Scalar::new(&scalar)) - }) + .map(|tt| eq(&table_type, &StringArray::new_scalar(tt))) .collect::, _>>()? .into_iter() // We know the arrays are of same length as they are produced fromn the same root array @@ -204,15 +201,14 @@ impl GetTablesBuilder { if let Some(db_schema_filter_pattern) = db_schema_filter_pattern { // use like kernel to get wildcard matching - filters.push(like_utf8_scalar( - &db_schema_name, - &db_schema_filter_pattern, - )?) + let scalar = StringArray::new_scalar(db_schema_filter_pattern); + filters.push(like(&db_schema_name, &scalar)?) } if let Some(table_name_filter_pattern) = table_name_filter_pattern { // use like kernel to get wildcard matching - filters.push(like_utf8_scalar(&table_name, &table_name_filter_pattern)?) + let scalar = StringArray::new_scalar(table_name_filter_pattern); + filters.push(like(&table_name, &scalar)?) 
} let batch = if let Some(table_schema) = table_schema { diff --git a/arrow-string/src/lib.rs b/arrow-string/src/lib.rs index 4bd4d282656c..4444b37a7742 100644 --- a/arrow-string/src/lib.rs +++ b/arrow-string/src/lib.rs @@ -20,5 +20,6 @@ pub mod concat_elements; pub mod length; pub mod like; +mod predicate; pub mod regexp; pub mod substring; diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 57cc22f2c549..412f1e6cc89a 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -15,227 +15,37 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::builder::BooleanBufferBuilder; -use arrow_array::cast::*; +use crate::predicate::Predicate; +use arrow_array::cast::AsArray; use arrow_array::*; -use arrow_buffer::NullBuffer; -use arrow_data::ArrayDataBuilder; use arrow_schema::*; use arrow_select::take::take; -use regex::Regex; -use std::collections::HashMap; - -/// Helper function to perform boolean lambda function on values from two array accessors, this -/// version does not attempt to use SIMD. -/// -/// Duplicated from `arrow_ord::comparison` -fn compare_op( - left: T, - right: S, - op: F, -) -> Result -where - F: Fn(T::Item, S::Item) -> bool, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - Ok(BooleanArray::from_binary(left, right, op)) -} - -/// Helper function to perform boolean lambda function on values from array accessor, this -/// version does not attempt to use SIMD. -/// -/// Duplicated from `arrow_ord::comparison` -fn compare_op_scalar( - left: T, - op: F, -) -> Result -where - F: Fn(T::Item) -> bool, -{ - Ok(BooleanArray::from_unary(left, op)) -} - -macro_rules! dyn_function { - ($sql:tt, $fn_name:tt, $fn_utf8:tt, $fn_dict:tt) => { -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")] -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn $fn_name(left: &dyn Array, right: &dyn Array) -> Result { - match (left.data_type(), right.data_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.as_string::(); - let right = right.as_string::(); - $fn_utf8(left, right) - } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.as_string::(); - let right = right.as_string::(); - $fn_utf8(left, right) - } - #[cfg(feature = "dyn_cmp_dict")] - (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => { - downcast_dictionary_array!( - left => { - let right = as_dictionary_array(right); - $fn_dict(left, right) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError(format!( - "{} only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))) - } - } -} - - } -} -dyn_function!("left LIKE right", like_dyn, like_utf8, like_dict); -dyn_function!("left NOT LIKE right", nlike_dyn, nlike_utf8, nlike_dict); -dyn_function!("left ILIKE right", ilike_dyn, ilike_utf8, ilike_dict); -dyn_function!("left NOT ILIKE right", nilike_dyn, nilike_utf8, nilike_dict); -dyn_function!( - "STARTSWITH(left, right)", - starts_with_dyn, - starts_with_utf8, - starts_with_dict -); -dyn_function!( - "ENDSWITH(left, right)", - ends_with_dyn, - ends_with_utf8, - ends_with_dict -); -dyn_function!( - "CONTAINS(left, right)", - contains_dyn, - contains_utf8, - contains_dict -); - -macro_rules! scalar_dyn_function { - ($sql:tt, $fn_name:tt, $fn_scalar:tt) => { -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")] -/// [`LargeStringArray`], or [`DictionaryArray`] with values -/// [`StringArray`]/[`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn $fn_name( - left: &dyn Array, - right: &str, -) -> Result { - match left.data_type() { - DataType::Utf8 => { - let left = left.as_string::(); - $fn_scalar(left, right) - } - DataType::LargeUtf8 => { - let left = left.as_string::(); - $fn_scalar(left, right) - } - DataType::Dictionary(_, _) => { - downcast_dictionary_array!( - left => { - let dict_comparison = $fn_name(left.values().as_ref(), right)?; - // TODO: Use take_boolean (#2967) - let array = take(&dict_comparison, left.keys(), None)?; - Ok(BooleanArray::from(array.to_data())) - } - t => Err(ArrowError::ComputeError(format!( - "Should be DictionaryArray but got: {}", t - ))) - ) - } - _ => { - Err(ArrowError::ComputeError(format!( - "{} only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))) - } - } -} - } +use std::sync::Arc; + +#[derive(Debug)] +enum Op { + Like(bool), + ILike(bool), + Contains, + StartsWith, + EndsWith, } -scalar_dyn_function!("left LIKE right", like_utf8_scalar_dyn, like_scalar); -scalar_dyn_function!("left NOT LIKE right", nlike_utf8_scalar_dyn, nlike_scalar); -scalar_dyn_function!("left ILIKE right", ilike_utf8_scalar_dyn, ilike_scalar); -scalar_dyn_function!( - "left NOT ILIKE right", - nilike_utf8_scalar_dyn, - nilike_scalar -); -scalar_dyn_function!( - "STARTSWITH(left, right)", - starts_with_utf8_scalar_dyn, - starts_with_scalar -); -scalar_dyn_function!( - "ENDSWITH(left, right)", - ends_with_utf8_scalar_dyn, - ends_with_scalar -); -scalar_dyn_function!( - "CONTAINS(left, right)", - contains_utf8_scalar_dyn, - contains_scalar -); - -macro_rules! 
dict_function { - ($sql:tt, $fn_name:tt, $fn_impl:tt) => { - -#[doc = concat!("Perform SQL `", $sql ,"` operation on [`DictionaryArray`] with values")] -/// [`StringArray`]/[`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -#[cfg(feature = "dyn_cmp_dict")] -fn $fn_name( - left: &DictionaryArray, - right: &DictionaryArray, -) -> Result { - match (left.value_type(), right.value_type()) { - (DataType::Utf8, DataType::Utf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - $fn_impl(left, right) +impl std::fmt::Display for Op { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Op::Like(false) => write!(f, "LIKE"), + Op::Like(true) => write!(f, "NLIKE"), + Op::ILike(false) => write!(f, "ILIKE"), + Op::ILike(true) => write!(f, "NILIKE"), + Op::Contains => write!(f, "CONTAINS"), + Op::StartsWith => write!(f, "STARTS_WITH"), + Op::EndsWith => write!(f, "ENDS_WITH"), } - (DataType::LargeUtf8, DataType::LargeUtf8) => { - let left = left.downcast_dict::>().unwrap(); - let right = right.downcast_dict::>().unwrap(); - - $fn_impl(left, right) - } - _ => Err(ArrowError::ComputeError(format!( - "{} only supports DictionaryArray with Utf8 or LargeUtf8 values", - stringify!($fn_name) - ))), } } - } -} - -dict_function!("left LIKE right", like_dict, like); -dict_function!("left NOT LIKE right", nlike_dict, nlike); -dict_function!("left ILIKE right", ilike_dict, ilike); -dict_function!("left NOT ILIKE right", nilike_dict, nilike); -dict_function!("STARTSWITH(left, right)", starts_with_dict, starts_with); -dict_function!("ENDSWITH(left, right)", ends_with_dict, ends_with); -dict_function!("CONTAINS(left, right)", contains_dict, contains); -/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. 
+/// Perform SQL `left LIKE right` /// /// There are two wildcards supported with the LIKE operator: /// @@ -244,490 +54,337 @@ dict_function!("CONTAINS(left, right)", contains_dict, contains); /// /// For example: /// ``` -/// use arrow_array::{StringArray, BooleanArray}; -/// use arrow_string::like::like_utf8; -/// +/// # use arrow_array::{StringArray, BooleanArray}; +/// # use arrow_string::like::like; +/// # /// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); /// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]); /// -/// let result = like_utf8(&strings, &patterns).unwrap(); +/// let result = like(&strings, &patterns).unwrap(); /// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); /// ``` -pub fn like_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - like(left, right) -} - -#[inline] -fn like<'a, S: ArrayAccessor>( - left: S, - right: S, -) -> Result { - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - }) - }) -} - -#[inline] -fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor>( - left: L, - right: &str, - op: F, -) -> Result { - if !right.contains(is_like_pattern) { - // fast path, can use equals - Ok(BooleanArray::from_unary(left, |item| op(item == right))) - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let starts_with = &right[..right.len() - 1]; - - Ok(BooleanArray::from_unary(left, |item| { - op(item.starts_with(starts_with)) - })) - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_with = &right[1..]; - - Ok(BooleanArray::from_unary(left, |item| { - op(item.ends_with(ends_with)) - })) - } else if right.starts_with('%') - && right.ends_with('%') - && !right.ends_with("\\%") - && !right[1..right.len() - 1].contains(is_like_pattern) - { - let contains = &right[1..right.len() - 1]; - - Ok(BooleanArray::from_unary(left, |item| { - op(item.contains(contains)) - })) - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - })?; - - Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item)))) - } -} - -#[inline] -fn like_scalar<'a, L: ArrayAccessor>( - left: L, - right: &str, -) -> Result { - like_scalar_op(left, right, |x| x) -} - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn like_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - like_scalar(left, right) -} - -/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: -/// -/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.` -/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.` -/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. 
For example: `\\%` => `%` -fn replace_like_wildcards(pattern: &str) -> Result { - let mut result = String::new(); - let pattern = String::from(pattern); - let mut chars_iter = pattern.chars().peekable(); - while let Some(c) = chars_iter.next() { - if c == '\\' { - let next = chars_iter.peek(); - match next { - Some(next) if is_like_pattern(*next) => { - result.push(*next); - // Skipping the next char as it is already appended - chars_iter.next(); - } - _ => { - result.push('\\'); - result.push('\\'); - } - } - } else if regex_syntax::is_meta_character(c) { - result.push('\\'); - result.push(c); - } else if c == '%' { - result.push_str(".*"); - } else if c == '_' { - result.push('.'); - } else { - result.push(c); - } - } - Ok(result) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - nlike(left, right) -} - -#[inline] -fn nlike<'a, S: ArrayAccessor>( - left: S, - right: S, -) -> Result { - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?s)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {e}" - )) - }) - }) +pub fn like(left: &dyn Datum, right: &dyn Datum) -> Result { + like_op(Op::Like(false), left, right) } -#[inline] -fn nlike_scalar<'a, L: ArrayAccessor>( - left: L, - right: &str, -) -> Result { - like_scalar_op(left, right, |x| !x) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. +/// Perform SQL `left ILIKE right` /// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - nlike_scalar(left, right) -} - -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// Case insensitive version of [`like_utf8`] +/// This is a case-insensitive version of [`like`] /// /// Note: this only implements loose matching as defined by the Unicode standard. 
For example, /// the `ff` ligature is not equivalent to `FF` and `ß` is not equivalent to `SS` -pub fn ilike_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - ilike(left, right) +pub fn ilike(left: &dyn Datum, right: &dyn Datum) -> Result { + like_op(Op::ILike(false), left, right) } -#[inline] -fn ilike<'a, S: ArrayAccessor>( - left: S, - right: S, -) -> Result { - regex_like(left, right, false, |re_pattern| { - Regex::new(&format!("(?is)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {e}" - )) - }) - }) -} - -#[inline] -fn ilike_scalar_op bool>( - left: &GenericStringArray, - right: &str, - op: F, -) -> Result { - // If not ASCII faster to use case insensitive regex than using to_uppercase - if right.is_ascii() && left.is_ascii() { - if !right.contains(is_like_pattern) { - return Ok(BooleanArray::from_unary(left, |item| { - op(item.eq_ignore_ascii_case(right)) - })); - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let start_str = &right[..right.len() - 1]; - return Ok(BooleanArray::from_unary(left, |item| { - let end = item.len().min(start_str.len()); - let result = item.is_char_boundary(end) - && start_str.eq_ignore_ascii_case(&item[..end]); - op(result) - })); - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_str = &right[1..]; - return Ok(BooleanArray::from_unary(left, |item| { - let start = item.len().saturating_sub(ends_str.len()); - let result = item.is_char_boundary(start) - && ends_str.eq_ignore_ascii_case(&item[start..]); - op(result) - })); - } - } - - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("(?is)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!("Unable to build regex from ILIKE pattern: {e}")) - })?; - - Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item)))) -} - -#[inline] -fn ilike_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - ilike_scalar_op(left, right, |x| x) -} - -/// Perform SQL `left ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. +/// Perform SQL `left NOT LIKE right` /// -/// See the documentation on [`ilike_utf8`] for more details. -pub fn ilike_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - ilike_scalar(left, right) +/// See the documentation on [`like`] for more details +pub fn nlike(left: &dyn Datum, right: &dyn Datum) -> Result { + like_op(Op::Like(true), left, right) } -/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. +/// Perform SQL `left NOT ILIKE right` /// -/// See the documentation on [`ilike_utf8`] for more details. 
-pub fn nilike_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - nilike(left, right) +/// See the documentation on [`ilike`] for more details +pub fn nilike(left: &dyn Datum, right: &dyn Datum) -> Result { + like_op(Op::ILike(true), left, right) } -#[inline] -fn nilike<'a, S: ArrayAccessor>( - left: S, - right: S, +/// Perform SQL `STARTSWITH(left, right)` +pub fn starts_with( + left: &dyn Datum, + right: &dyn Datum, ) -> Result { - regex_like(left, right, true, |re_pattern| { - Regex::new(&format!("(?is)^{re_pattern}$")).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from ILIKE pattern: {e}" - )) - }) - }) + like_op(Op::StartsWith, left, right) } -#[inline] -fn nilike_scalar( - left: &GenericStringArray, - right: &str, +/// Perform SQL `ENDSWITH(left, right)` +pub fn ends_with( + left: &dyn Datum, + right: &dyn Datum, ) -> Result { - ilike_scalar_op(left, right, |x| !x) + like_op(Op::EndsWith, left, right) } -/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`ilike_utf8`] for more details. -pub fn nilike_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - nilike_scalar(left, right) +/// Perform SQL `CONTAINS(left, right)` +pub fn contains(left: &dyn Datum, right: &dyn Datum) -> Result { + like_op(Op::Contains, left, right) } -fn is_like_pattern(c: char) -> bool { - c == '%' || c == '_' -} - -/// Evaluate regex `op(left)` matching `right` on [`StringArray`] / [`LargeStringArray`] -/// -/// If `negate_regex` is true, the regex expression will be negated. (for example, with `not like`) -fn regex_like<'a, S: ArrayAccessor, F>( - left: S, - right: S, - negate_regex: bool, - op: F, -) -> Result -where - F: Fn(&str) -> Result, -{ - let mut map = HashMap::new(); - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); +fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result { + use arrow_schema::DataType::*; + let (l, l_s) = lhs.get(); + let (r, r_s) = rhs.get(); + + if l.len() != r.len() && !l_s && !r_s { + return Err(ArrowError::InvalidArgumentError(format!( + "Cannot compare arrays of different lengths, got {} vs {}", + l.len(), + r.len() + ))); } - let nulls = NullBuffer::union( - left.logical_nulls().as_ref(), - right.logical_nulls().as_ref(), - ); + let l_v = l.as_any_dictionary_opt(); + let l = l_v.map(|x| x.values().as_ref()).unwrap_or(l); - let mut result = BooleanBufferBuilder::new(left.len()); - for i in 0..left.len() { - let haystack = left.value(i); - let pat = right.value(i); - let re = if let Some(ref regex) = map.get(pat) { - regex - } else { - let re_pattern = replace_like_wildcards(pat)?; - let re = op(&re_pattern)?; - map.insert(pat, re); - map.get(pat).unwrap() - }; + let r_v = r.as_any_dictionary_opt(); + let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r); - result.append(if negate_regex { - !re.is_match(haystack) - } else { - re.is_match(haystack) - }); + match (l.data_type(), r.data_type()) { + (Utf8, Utf8) => { + apply::(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) + } + (LargeUtf8, LargeUtf8) => { + apply::(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) + } + (l_t, r_t) => Err(ArrowError::InvalidArgumentError(format!( + "Invalid string operation: {l_t} {op} {r_t}" + ))), } - - let data = unsafe { - ArrayDataBuilder::new(DataType::Boolean) - .len(left.len()) - 
.nulls(nulls) - .buffers(vec![result.into()]) - .build_unchecked() - }; - Ok(BooleanArray::from(data)) } -/// Perform SQL `STARTSWITH(left, right)` operation on [`StringArray`] / [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn starts_with_utf8( - left: &GenericStringArray, - right: &GenericStringArray, +fn apply( + op: Op, + l: &GenericStringArray, + l_s: bool, + l_v: Option<&dyn AnyDictionaryArray>, + r: &GenericStringArray, + r_s: bool, + r_v: Option<&dyn AnyDictionaryArray>, ) -> Result { - starts_with(left, right) + let l_len = l_v.map(|l| l.len()).unwrap_or(l.len()); + if r_s { + let scalar = match r_v { + Some(dict) => match dict.nulls().filter(|n| n.null_count() != 0) { + Some(_) => return Ok(BooleanArray::new_null(l_len)), + None => { + let idx = dict.normalized_keys()[0]; + if r.is_null(idx) { + return Ok(BooleanArray::new_null(l_len)); + } + r.value(idx) + } + }, + None => r.value(0), + }; + op_scalar(op, l, l_v, scalar) + } else { + match (l_s, l_v, r_v) { + (true, None, None) => { + let v = l.is_valid(0).then(|| l.value(0)); + op_binary(op, std::iter::repeat(v), r.iter()) + } + (true, Some(l_v), None) => { + let idx = l_v.is_valid(0).then(|| l_v.normalized_keys()[0]); + let v = idx.and_then(|idx| l.is_valid(idx).then(|| l.value(idx))); + op_binary(op, std::iter::repeat(v), r.iter()) + } + (true, None, Some(r_v)) => { + let v = l.is_valid(0).then(|| l.value(0)); + op_binary(op, std::iter::repeat(v), vectored_iter(r, r_v)) + } + (true, Some(l_v), Some(r_v)) => { + let idx = l_v.is_valid(0).then(|| l_v.normalized_keys()[0]); + let v = idx.and_then(|idx| l.is_valid(idx).then(|| l.value(idx))); + op_binary(op, std::iter::repeat(v), vectored_iter(r, r_v)) + } + (false, None, None) => op_binary(op, l.iter(), r.iter()), + (false, Some(l_v), None) => op_binary(op, vectored_iter(l, l_v), r.iter()), + (false, None, Some(r_v)) => op_binary(op, l.iter(), vectored_iter(r, r_v)), + (false, Some(l_v), Some(r_v)) => { + op_binary(op, vectored_iter(l, l_v), vectored_iter(r, r_v)) + } + } + } } -#[inline] -fn starts_with<'a, S: ArrayAccessor>( - left: S, - right: S, +#[inline(never)] +fn op_scalar( + op: Op, + l: &GenericStringArray, + l_v: Option<&dyn AnyDictionaryArray>, + r: &str, ) -> Result { - compare_op(left, right, |l, r| l.starts_with(r)) -} + let r = match op { + Op::Like(neg) => Predicate::like(r)?.evaluate_array(l, neg), + Op::ILike(neg) => Predicate::ilike(r, l.is_ascii())?.evaluate_array(l, neg), + Op::Contains => Predicate::Contains(r).evaluate_array(l, false), + Op::StartsWith => Predicate::StartsWith(r).evaluate_array(l, false), + Op::EndsWith => Predicate::EndsWith(r).evaluate_array(l, false), + }; -#[inline] -fn starts_with_scalar<'a, L: ArrayAccessor>( - left: L, - right: &str, -) -> Result { - compare_op_scalar(left, |item| item.starts_with(right)) + Ok(match l_v { + Some(v) => take(&r, v.keys(), None)?.as_boolean().clone(), + None => r, + }) } -/// Perform SQL `STARTSWITH(left, right)` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. 
-pub fn starts_with_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - starts_with_scalar(left, right) +fn vectored_iter<'a, O: OffsetSizeTrait>( + a: &'a GenericStringArray, + a_v: &'a dyn AnyDictionaryArray, +) -> impl Iterator> + 'a { + let nulls = a_v.nulls(); + let keys = a_v.normalized_keys(); + keys.into_iter().enumerate().map(move |(idx, key)| { + if nulls.map(|n| n.is_null(idx)).unwrap_or_default() || a.is_null(key) { + return None; + } + Some(a.value(key)) + }) } -/// Perform SQL `ENDSWITH(left, right)` operation on [`StringArray`] / [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn ends_with_utf8( - left: &GenericStringArray, - right: &GenericStringArray, +#[inline(never)] +fn op_binary<'a>( + op: Op, + l: impl Iterator>, + r: impl Iterator>, ) -> Result { - ends_with(left, right) + match op { + Op::Like(neg) => binary_predicate(l, r, neg, Predicate::like), + Op::ILike(neg) => binary_predicate(l, r, neg, |s| Predicate::ilike(s, false)), + Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(l?.contains(r?))).collect()), + Op::StartsWith => Ok(l.zip(r).map(|(l, r)| Some(l?.starts_with(r?))).collect()), + Op::EndsWith => Ok(l.zip(r).map(|(l, r)| Some(l?.ends_with(r?))).collect()), + } } -#[inline] -fn ends_with<'a, S: ArrayAccessor>( - left: S, - right: S, +fn binary_predicate<'a>( + l: impl Iterator>, + r: impl Iterator>, + neg: bool, + f: impl Fn(&'a str) -> Result, ArrowError>, ) -> Result { - compare_op(left, right, |l, r| l.ends_with(r)) + let mut previous = None; + l.zip(r) + .map(|(l, r)| match (l, r) { + (Some(l), Some(r)) => { + let p: &Predicate = match previous { + Some((expr, ref predicate)) if expr == r => predicate, + _ => &previous.insert((r, f(r)?)).1, + }; + Ok(Some(p.evaluate(l) != neg)) + } + _ => Ok(None), + }) + .collect() } -#[inline] -fn ends_with_scalar<'a, L: ArrayAccessor>( - left: L, - right: &str, -) -> Result { - compare_op_scalar(left, |item| item.ends_with(right)) -} +// Deprecated kernels -/// Perform SQL `ENDSWITH(left, right)` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn ends_with_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - ends_with_scalar(left, right) +fn make_scalar(data_type: &DataType, scalar: &str) -> Result { + match data_type { + DataType::Utf8 => Ok(Arc::new(StringArray::from_iter_values([scalar]))), + DataType::LargeUtf8 => Ok(Arc::new(LargeStringArray::from_iter_values([scalar]))), + DataType::Dictionary(_, v) => make_scalar(v.as_ref(), scalar), + d => Err(ArrowError::InvalidArgumentError(format!( + "Unsupported string scalar data type {d:?}", + ))), + } } -/// Perform SQL `CONTAINS(left, right)` operation on [`StringArray`] / [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn contains_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - contains(left, right) -} +macro_rules! 
legacy_kernels { + ($fn_datum:ident, $fn_array:ident, $fn_scalar:ident, $fn_array_dyn:ident, $fn_scalar_dyn:ident, $deprecation:expr) => { + #[doc(hidden)] + #[deprecated(note = $deprecation)] + pub fn $fn_array( + left: &GenericStringArray, + right: &GenericStringArray, + ) -> Result { + $fn_datum(left, right) + } -#[inline] -fn contains<'a, S: ArrayAccessor>( - left: S, - right: S, -) -> Result { - compare_op(left, right, |l, r| l.contains(r)) -} + #[doc(hidden)] + #[deprecated(note = $deprecation)] + pub fn $fn_scalar( + left: &GenericStringArray, + right: &str, + ) -> Result { + let scalar = GenericStringArray::::from_iter_values([right]); + $fn_datum(left, &Scalar::new(&scalar)) + } -#[inline] -fn contains_scalar<'a, L: ArrayAccessor>( - left: L, - right: &str, -) -> Result { - compare_op_scalar(left, |item| item.contains(right)) -} + #[doc(hidden)] + #[deprecated(note = $deprecation)] + pub fn $fn_array_dyn( + left: &dyn Array, + right: &dyn Array, + ) -> Result { + $fn_datum(&left, &right) + } -/// Perform SQL `CONTAINS(left, right)` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn contains_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - contains_scalar(left, right) + #[doc(hidden)] + #[deprecated(note = $deprecation)] + pub fn $fn_scalar_dyn( + left: &dyn Array, + right: &str, + ) -> Result { + let scalar = make_scalar(left.data_type(), right)?; + $fn_datum(&left, &Scalar::new(&scalar)) + } + }; } +legacy_kernels!( + like, + like_utf8, + like_utf8_scalar, + like_dyn, + like_utf8_scalar_dyn, + "Use arrow_string::like::like" +); +legacy_kernels!( + ilike, + ilike_utf8, + ilike_utf8_scalar, + ilike_dyn, + ilike_utf8_scalar_dyn, + "Use arrow_string::like::ilike" +); +legacy_kernels!( + nlike, + nlike_utf8, + nlike_utf8_scalar, + nlike_dyn, + nlike_utf8_scalar_dyn, + "Use arrow_string::like::nlike" +); +legacy_kernels!( + nilike, + nilike_utf8, + nilike_utf8_scalar, + nilike_dyn, + nilike_utf8_scalar_dyn, + "Use arrow_string::like::nilike" +); +legacy_kernels!( + contains, + contains_utf8, + contains_utf8_scalar, + contains_dyn, + contains_utf8_scalar_dyn, + "Use arrow_string::like::contains" +); +legacy_kernels!( + starts_with, + starts_with_utf8, + starts_with_utf8_scalar, + starts_with_dyn, + starts_with_utf8_scalar_dyn, + "Use arrow_string::like::starts_with" +); + +legacy_kernels!( + ends_with, + ends_with_utf8, + ends_with_utf8_scalar, + ends_with_dyn, + ends_with_utf8_scalar_dyn, + "Use arrow_string::like::ends_with" +); + #[cfg(test)] +#[allow(deprecated)] mod tests { use super::*; use arrow_array::types::Int8Type; @@ -936,34 +593,6 @@ mod tests { vec![true] ); - #[test] - fn test_replace_like_wildcards() { - let a_eq = "_%"; - let expected = "..*"; - assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); - } - - #[test] - fn test_replace_like_wildcards_leave_like_meta_chars() { - let a_eq = "\\%\\_"; - let expected = "%_"; - assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); - } - - #[test] - fn test_replace_like_wildcards_with_multiple_escape_chars() { - let a_eq = "\\\\%"; - let expected = "\\\\%"; - assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); - } - - #[test] - fn test_replace_like_wildcards_escape_regex_meta_char() { - let a_eq = "."; - let expected = "\\."; - assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); - } - test_utf8!( test_utf8_array_nlike, vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", 
"arrow"], diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs new file mode 100644 index 000000000000..162e3c75027d --- /dev/null +++ b/arrow-string/src/predicate.rs @@ -0,0 +1,229 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{BooleanArray, GenericStringArray, OffsetSizeTrait}; +use arrow_schema::ArrowError; +use regex::{Regex, RegexBuilder}; + +/// A string based predicate +pub enum Predicate<'a> { + Eq(&'a str), + Contains(&'a str), + StartsWith(&'a str), + EndsWith(&'a str), + + /// Equality ignoring ASCII case + IEqAscii(&'a str), + /// Starts with ignoring ASCII case + IStartsWithAscii(&'a str), + /// Ends with ignoring ASCII case + IEndsWithAscii(&'a str), + + Regex(Regex), +} + +impl<'a> Predicate<'a> { + /// Create a predicate for the given like pattern + pub fn like(pattern: &'a str) -> Result { + if !pattern.contains(is_like_pattern) { + Ok(Self::Eq(pattern)) + } else if pattern.ends_with('%') + && !pattern.ends_with("\\%") + && !pattern[..pattern.len() - 1].contains(is_like_pattern) + { + Ok(Self::StartsWith(&pattern[..pattern.len() - 1])) + } else if pattern.starts_with('%') && !pattern[1..].contains(is_like_pattern) { + Ok(Self::EndsWith(&pattern[1..])) + } else if pattern.starts_with('%') + && pattern.ends_with('%') + && !pattern.ends_with("\\%") + && !pattern[1..pattern.len() - 1].contains(is_like_pattern) + { + Ok(Self::Contains(&pattern[1..pattern.len() - 1])) + } else { + Ok(Self::Regex(regex_like(pattern, false)?)) + } + } + + /// Create a predicate for the given ilike pattern + pub fn ilike(pattern: &'a str, is_ascii: bool) -> Result { + if is_ascii && pattern.is_ascii() { + if !pattern.contains(is_like_pattern) { + return Ok(Self::IEqAscii(pattern)); + } else if pattern.ends_with('%') + && !pattern.ends_with("\\%") + && !pattern[..pattern.len() - 1].contains(is_like_pattern) + { + return Ok(Self::IStartsWithAscii(&pattern[..pattern.len() - 1])); + } else if pattern.starts_with('%') && !pattern[1..].contains(is_like_pattern) + { + return Ok(Self::IEndsWithAscii(&pattern[1..])); + } + } + Ok(Self::Regex(regex_like(pattern, true)?)) + } + + /// Evaluate this predicate against the given haystack + pub fn evaluate(&self, haystack: &str) -> bool { + match self { + Predicate::Eq(v) => *v == haystack, + Predicate::IEqAscii(v) => haystack.eq_ignore_ascii_case(v), + Predicate::Contains(v) => haystack.contains(v), + Predicate::StartsWith(v) => haystack.starts_with(v), + Predicate::IStartsWithAscii(v) => starts_with_ignore_ascii_case(haystack, v), + Predicate::EndsWith(v) => haystack.ends_with(v), + Predicate::IEndsWithAscii(v) => ends_with_ignore_ascii_case(haystack, v), + Predicate::Regex(v) => v.is_match(haystack), + } + } + + /// Evaluate this predicate against the elements of 
`array` + /// + /// If `negate` is true the result of the predicate will be negated + #[inline(never)] + pub fn evaluate_array( + &self, + array: &GenericStringArray, + negate: bool, + ) -> BooleanArray { + match self { + Predicate::Eq(v) => BooleanArray::from_unary(array, |haystack| { + (haystack.len() == v.len() && haystack == *v) != negate + }), + Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { + haystack.eq_ignore_ascii_case(v) != negate + }), + Predicate::Contains(v) => { + BooleanArray::from_unary(array, |haystack| haystack.contains(v) != negate) + } + Predicate::StartsWith(v) => BooleanArray::from_unary(array, |haystack| { + haystack.starts_with(v) != negate + }), + Predicate::IStartsWithAscii(v) => { + BooleanArray::from_unary(array, |haystack| { + starts_with_ignore_ascii_case(haystack, v) != negate + }) + } + Predicate::EndsWith(v) => BooleanArray::from_unary(array, |haystack| { + haystack.ends_with(v) != negate + }), + Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, |haystack| { + ends_with_ignore_ascii_case(haystack, v) != negate + }), + Predicate::Regex(v) => { + BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate) + } + } + } +} + +fn starts_with_ignore_ascii_case(haystack: &str, needle: &str) -> bool { + let end = haystack.len().min(needle.len()); + haystack.is_char_boundary(end) && needle.eq_ignore_ascii_case(&haystack[..end]) +} + +fn ends_with_ignore_ascii_case(haystack: &str, needle: &str) -> bool { + let start = haystack.len().saturating_sub(needle.len()); + haystack.is_char_boundary(start) && needle.eq_ignore_ascii_case(&haystack[start..]) +} + +/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: +/// +/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.` +/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.` +/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. 
For example: `\\%` => `%` +fn regex_like(pattern: &str, case_insensitive: bool) -> Result { + let mut result = String::with_capacity(pattern.len() * 2); + result.push('^'); + let mut chars_iter = pattern.chars().peekable(); + while let Some(c) = chars_iter.next() { + if c == '\\' { + let next = chars_iter.peek(); + match next { + Some(next) if is_like_pattern(*next) => { + result.push(*next); + // Skipping the next char as it is already appended + chars_iter.next(); + } + _ => { + result.push('\\'); + result.push('\\'); + } + } + } else if regex_syntax::is_meta_character(c) { + result.push('\\'); + result.push(c); + } else if c == '%' { + result.push_str(".*"); + } else if c == '_' { + result.push('.'); + } else { + result.push(c); + } + } + result.push('$'); + RegexBuilder::new(&result) + .case_insensitive(case_insensitive) + .dot_matches_new_line(true) + .build() + .map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Unable to build regex from LIKE pattern: {e}" + )) + }) +} + +fn is_like_pattern(c: char) -> bool { + c == '%' || c == '_' +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_replace_like_wildcards() { + let a_eq = "_%"; + let expected = "^..*$"; + let r = regex_like(a_eq, false).unwrap(); + assert_eq!(r.to_string(), expected); + } + + #[test] + fn test_replace_like_wildcards_leave_like_meta_chars() { + let a_eq = "\\%\\_"; + let expected = "^%_$"; + let r = regex_like(a_eq, false).unwrap(); + assert_eq!(r.to_string(), expected); + } + + #[test] + fn test_replace_like_wildcards_with_multiple_escape_chars() { + let a_eq = "\\\\%"; + let expected = "^\\\\%$"; + let r = regex_like(a_eq, false).unwrap(); + assert_eq!(r.to_string(), expected); + } + + #[test] + fn test_replace_like_wildcards_escape_regex_meta_char() { + let a_eq = "."; + let expected = "^\\.$"; + let r = regex_like(a_eq, false).unwrap(); + assert_eq!(r.to_string(), expected); + } +} diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index b9fb6c8e3300..02de70c5d79d 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -32,22 +32,19 @@ use arrow_string::regexp::regexp_is_match_utf8_scalar; const SIZE: usize = 65536; fn bench_like_utf8_scalar(arr_a: &StringArray, value_b: &str) { - like_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); + like(arr_a, &StringArray::new_scalar(value_b)).unwrap(); } fn bench_nlike_utf8_scalar(arr_a: &StringArray, value_b: &str) { - nlike_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)) - .unwrap(); + nlike(arr_a, &StringArray::new_scalar(value_b)).unwrap(); } fn bench_ilike_utf8_scalar(arr_a: &StringArray, value_b: &str) { - ilike_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)) - .unwrap(); + ilike(arr_a, &StringArray::new_scalar(value_b)).unwrap(); } fn bench_nilike_utf8_scalar(arr_a: &StringArray, value_b: &str) { - nilike_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)) - .unwrap(); + nilike(arr_a, &StringArray::new_scalar(value_b)).unwrap(); } fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) { @@ -103,45 +100,45 @@ fn add_benchmark(c: &mut Criterion) { let arr_a = create_primitive_array_with_seed::(SIZE, 0.0, 42); let arr_b = create_primitive_array_with_seed::(SIZE, 0.0, 43); - let scalar = Int32Array::from(vec![1]); + let scalar = Int32Array::new_scalar(1); c.bench_function("eq Int32", |b| b.iter(|| eq(&arr_a, &arr_b))); c.bench_function("eq scalar Int32", 
|b| { - b.iter(|| eq(&arr_a, &Scalar::new(&scalar)).unwrap()) + b.iter(|| eq(&arr_a, &scalar).unwrap()) }); c.bench_function("neq Int32", |b| b.iter(|| neq(&arr_a, &arr_b))); c.bench_function("neq scalar Int32", |b| { - b.iter(|| neq(&arr_a, &Scalar::new(&scalar)).unwrap()) + b.iter(|| neq(&arr_a, &scalar).unwrap()) }); c.bench_function("lt Int32", |b| b.iter(|| lt(&arr_a, &arr_b))); c.bench_function("lt scalar Int32", |b| { - b.iter(|| lt(&arr_a, &Scalar::new(&scalar)).unwrap()) + b.iter(|| lt(&arr_a, &scalar).unwrap()) }); c.bench_function("lt_eq Int32", |b| b.iter(|| lt_eq(&arr_a, &arr_b))); c.bench_function("lt_eq scalar Int32", |b| { - b.iter(|| lt_eq(&arr_a, &Scalar::new(&scalar)).unwrap()) + b.iter(|| lt_eq(&arr_a, &scalar).unwrap()) }); c.bench_function("gt Int32", |b| b.iter(|| gt(&arr_a, &arr_b))); c.bench_function("gt scalar Int32", |b| { - b.iter(|| gt(&arr_a, &Scalar::new(&scalar)).unwrap()) + b.iter(|| gt(&arr_a, &scalar).unwrap()) }); c.bench_function("gt_eq Int32", |b| b.iter(|| gt_eq(&arr_a, &arr_b))); c.bench_function("gt_eq scalar Int32", |b| { - b.iter(|| gt_eq(&arr_a, &Scalar::new(&scalar)).unwrap()) + b.iter(|| gt_eq(&arr_a, &scalar).unwrap()) }); c.bench_function("eq MonthDayNano", |b| { b.iter(|| eq(&arr_month_day_nano_a, &arr_month_day_nano_b)) }); - let scalar = IntervalMonthDayNanoArray::from(vec![123]); + let scalar = IntervalMonthDayNanoArray::new_scalar(123); c.bench_function("eq scalar MonthDayNano", |b| { - b.iter(|| eq(&arr_month_day_nano_b, &Scalar::new(&scalar)).unwrap()) + b.iter(|| eq(&arr_month_day_nano_b, &scalar).unwrap()) }); c.bench_function("like_utf8 scalar equals", |b| { @@ -246,11 +243,11 @@ fn add_benchmark(c: &mut Criterion) { ); c.bench_function("like_utf8_scalar_dyn dictionary[10] string[4])", |b| { - b.iter(|| like_utf8_scalar_dyn(&dict_arr_a, "test")) + b.iter(|| like(&dict_arr_a, &StringArray::new_scalar("test"))) }); c.bench_function("ilike_utf8_scalar_dyn dictionary[10] string[4])", |b| { - b.iter(|| ilike_utf8_scalar_dyn(&dict_arr_a, "test")) + b.iter(|| ilike(&dict_arr_a, &StringArray::new_scalar("test"))) }); let strings = create_string_array::(20, 0.); From 4888dbf1d9441edb5fc0a8702fad05cfa4de5296 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Sun, 27 Aug 2023 07:40:33 -0700 Subject: [PATCH 1184/1411] Clear row buffer before reuse (#4742) * Clear row buffer before reuse * clear for Vec buffer --- arrow-row/src/lib.rs | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 18b5890d4a3a..b59d84061a8a 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -981,6 +981,7 @@ impl Rows { /// Sets the length of this [`Rows`] to 0 pub fn clear(&mut self) { self.offsets.truncate(1); + self.buffer.clear(); } /// Returns the number of [`Row`] in this [`Rows`] @@ -2429,17 +2430,26 @@ mod tests { RowConverter::new(vec![SortField::new(DataType::Int32)]).unwrap(); let mut rows = converter.empty_rows(3, 128); - let arrays = [ - Int32Array::from(vec![None, Some(2), Some(4)]), - Int32Array::from(vec![Some(2), None, Some(4)]), - ]; + let first = Int32Array::from(vec![None, Some(2), Some(4)]); + let second = Int32Array::from(vec![Some(2), None, Some(4)]); + let arrays = vec![Arc::new(first) as ArrayRef, Arc::new(second) as ArrayRef]; - for array in arrays { + for array in arrays.iter() { rows.clear(); - let array = Arc::new(array) as ArrayRef; converter.append(&mut rows, &[array.clone()]).unwrap(); let back = converter.convert_rows(&rows).unwrap(); - 
assert_eq!(&back[0], &array); + assert_eq!(&back[0], array); + } + + let mut rows_expected = converter.empty_rows(3, 128); + converter.append(&mut rows_expected, &arrays[1..]).unwrap(); + + for (i, (actual, expected)) in rows.iter().zip(rows_expected.iter()).enumerate() { + assert_eq!( + actual, expected, + "For row {}: expected {:?}, actual: {:?}", + i, expected, actual + ); } } From cb793a5d54e155b121bc5c33534a37bf94651c84 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 28 Aug 2023 13:31:01 -0700 Subject: [PATCH 1185/1411] Update nix requirement from 0.26.1 to 0.27.1 in /object_store (#4744) --- object_store/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 3c10f4a9c849..b8d4391321fd 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -61,7 +61,7 @@ tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-ut tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } [target.'cfg(target_family="unix")'.dev-dependencies] -nix = "0.26.1" +nix = { version = "0.27.1", features = ["fs"] } [features] cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] From 32e973d7fd90a6f94799177a6d3735c6e3201689 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 29 Aug 2023 18:05:11 +0100 Subject: [PATCH 1186/1411] Chrono deprecations (#4748) --- arrow-cast/src/parse.rs | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 8483c44f9782..ac3b89e0ba02 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -189,13 +189,11 @@ pub fn string_to_datetime( let parser = TimestampParser::new(bytes); let date = parser.date().ok_or_else(|| err("error parsing date"))?; if bytes.len() == 10 { - let offset = timezone.offset_from_local_date(&date); - let offset = offset + let datetime = date.and_time(NaiveTime::from_hms_opt(0, 0, 0).unwrap()); + return timezone + .from_local_datetime(&datetime) .single() - .ok_or_else(|| err("error computing timezone offset"))?; - - let time = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); - return Ok(DateTime::from_local(date.and_time(time), offset)); + .ok_or_else(|| err("error computing timezone offset")); } if !parser.test(10, b'T') && !parser.test(10, b't') && !parser.test(10, b' ') { @@ -213,28 +211,24 @@ pub fn string_to_datetime( } if bytes.len() <= tz_offset { - let offset = timezone.offset_from_local_datetime(&datetime); - let offset = offset + return timezone + .from_local_datetime(&datetime) .single() - .ok_or_else(|| err("error computing timezone offset"))?; - return Ok(DateTime::from_local(datetime, offset)); + .ok_or_else(|| err("error computing timezone offset")); } if bytes[tz_offset] == b'z' || bytes[tz_offset] == b'Z' { - let offset = timezone.offset_from_local_datetime(&datetime); - let offset = offset - .single() - .ok_or_else(|| err("error computing timezone offset"))?; - return Ok(DateTime::from_utc(datetime, offset)); + return Ok(timezone.from_utc_datetime(&datetime)); } // Parse remainder of string as timezone let parsed_tz: Tz = s[tz_offset..].trim_start().parse()?; - let offset = parsed_tz.offset_from_local_datetime(&datetime); - let offset = offset + let parsed = parsed_tz + .from_local_datetime(&datetime) .single() .ok_or_else(|| err("error computing 
timezone offset"))?; - Ok(DateTime::::from_local(datetime, offset).with_timezone(timezone)) + + Ok(parsed.with_timezone(timezone)) } /// Accepts a string in RFC3339 / ISO8601 standard format and some From 735f48d1f2ebd7e6edb4df167f4f3e91d33f84a4 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 30 Aug 2023 03:32:52 -0700 Subject: [PATCH 1187/1411] return error (#4752) --- arrow-pyarrow-integration-testing/tests/test_sql.py | 8 ++++++++ arrow/src/ffi_stream.rs | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 92782b9ed473..e2e8d66c0f29 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -421,6 +421,14 @@ def iter_batches(): with pytest.raises(ValueError, match="test error"): rust.reader_return_errors(reader) + # Due to a long-standing oversight, PyArrow allows binary values in schema + # metadata that are not valid UTF-8. This is not allowed in Rust, but we + # make sure we error and not panic here. + schema = schema.with_metadata({"key": b"\xff"}) + reader = pa.RecordBatchReader.from_batches(schema, iter_batches()) + with pytest.raises(ValueError, match="invalid utf-8"): + rust.round_trip_record_batch_reader(reader) + def test_reject_other_classes(): # Arbitrary type that is not a PyArrow type not_pyarrow = ["hello"] diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index 7005cadc623c..865a8d0e0a29 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -281,7 +281,7 @@ fn get_stream_schema(stream_ptr: *mut FFI_ArrowArrayStream) -> Result let ret_code = unsafe { (*stream_ptr).get_schema.unwrap()(stream_ptr, &mut schema) }; if ret_code == 0 { - let schema = Schema::try_from(&schema).unwrap(); + let schema = Schema::try_from(&schema)?; Ok(Arc::new(schema)) } else { Err(ArrowError::CDataInterface(format!( From eeba0a3792a2774dee1d10a25340b2741cf95c9e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 31 Aug 2023 17:19:33 +0100 Subject: [PATCH 1188/1411] Relax constraints on PyArrowType (#4757) --- arrow/src/pyarrow.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 54a247d53e6d..0e9669c5e9fa 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -295,15 +295,15 @@ impl IntoPyArrow for ArrowArrayStreamReader { /// A newtype wrapper around a `T: PyArrowConvert` that implements /// [`FromPyObject`] and [`IntoPy`] allowing usage with pyo3 macros #[derive(Debug)] -pub struct PyArrowType(pub T); +pub struct PyArrowType(pub T); -impl<'source, T: FromPyArrow + IntoPyArrow> FromPyObject<'source> for PyArrowType { +impl<'source, T: FromPyArrow> FromPyObject<'source> for PyArrowType { fn extract(value: &'source PyAny) -> PyResult { Ok(Self(T::from_pyarrow(value)?)) } } -impl IntoPy for PyArrowType { +impl IntoPy for PyArrowType { fn into_py(self, py: Python) -> PyObject { match self.0.into_pyarrow(py) { Ok(obj) => obj, @@ -312,7 +312,7 @@ impl IntoPy for PyArrowType { } } -impl From for PyArrowType { +impl From for PyArrowType { fn from(s: T) -> Self { Self(s) } From 4927c1ef1c22373d30c82203577db0bac2ee8eb9 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 1 Sep 2023 04:11:01 -0700 Subject: [PATCH 1189/1411] Implement PyArrowType for `Box` (#4751) * implement for boxed rbr * add docs --- arrow-pyarrow-integration-testing/src/lib.rs 
| 16 +++++ .../tests/test_sql.py | 7 +++ arrow/src/lib.rs | 3 +- arrow/src/pyarrow.rs | 59 +++++++++++++++++-- 4 files changed, 79 insertions(+), 6 deletions(-) diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs index adcec769f247..a53447b53c31 100644 --- a/arrow-pyarrow-integration-testing/src/lib.rs +++ b/arrow-pyarrow-integration-testing/src/lib.rs @@ -21,6 +21,7 @@ use std::sync::Arc; use arrow::array::new_empty_array; +use arrow::record_batch::{RecordBatchIterator, RecordBatchReader}; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::wrap_pyfunction; @@ -152,6 +153,20 @@ fn reader_return_errors(obj: PyArrowType) -> PyResult<() } } +#[pyfunction] +fn boxed_reader_roundtrip( + obj: PyArrowType, +) -> PyArrowType> { + let schema = obj.0.schema(); + let batches = obj + .0 + .collect::, ArrowError>>() + .unwrap(); + let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); + let reader: Box = Box::new(reader); + PyArrowType(reader) +} + #[pymodule] fn arrow_pyarrow_integration_testing(_py: Python, m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(double))?; @@ -166,5 +181,6 @@ fn arrow_pyarrow_integration_testing(_py: Python, m: &PyModule) -> PyResult<()> m.add_wrapped(wrap_pyfunction!(round_trip_record_batch))?; m.add_wrapped(wrap_pyfunction!(round_trip_record_batch_reader))?; m.add_wrapped(wrap_pyfunction!(reader_return_errors))?; + m.add_wrapped(wrap_pyfunction!(boxed_reader_roundtrip))?; Ok(()) } diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index e2e8d66c0f29..3be5b9ec52fe 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -409,6 +409,13 @@ def test_record_batch_reader(): got_batches = list(b) assert got_batches == batches + # Also try the boxed reader variant + a = pa.RecordBatchReader.from_batches(schema, batches) + b = rust.boxed_reader_roundtrip(a) + assert b.schema == schema + got_batches = list(b) + assert got_batches == batches + def test_record_batch_reader_error(): schema = pa.schema([('ints', pa.list_(pa.int32()))]) diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index fb904c1908e6..f4d0585fa6b5 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -375,7 +375,8 @@ pub mod pyarrow; pub mod record_batch { pub use arrow_array::{ - RecordBatch, RecordBatchOptions, RecordBatchReader, RecordBatchWriter, + RecordBatch, RecordBatchIterator, RecordBatchOptions, RecordBatchReader, + RecordBatchWriter, }; } pub use arrow_array::temporal_conversions; diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 0e9669c5e9fa..6063ae763228 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -15,15 +15,51 @@ // specific language governing permissions and limitations // under the License. -//! Pass Arrow objects from and to Python, using Arrow's +//! Pass Arrow objects from and to PyArrow, using Arrow's //! [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) //! and [pyo3](https://docs.rs/pyo3/latest/pyo3/). //! For underlying implementation, see the [ffi] module. +//! +//! One can use these to write Python functions that take and return PyArrow +//! objects, with automatic conversion to corresponding arrow-rs types. +//! +//! ```ignore +//! #[pyfunction] +//! fn double_array(array: PyArrowType) -> PyResult> { +//! let array = array.0; // Extract from PyArrowType wrapper +//! 
let array: Arc = make_array(array); // Convert ArrayData to ArrayRef +//! let array: &Int32Array = array.as_any().downcast_ref() +//! .ok_or_else(|| PyValueError::new_err("expected int32 array"))?; +//! let array: Int32Array = array.iter().map(|x| x.map(|x| x * 2)).collect(); +//! Ok(PyArrowType(array.into_data())) +//! } +//! ``` +//! +//! | pyarrow type | arrow-rs type | +//! |-----------------------------|--------------------------------------------------------------------| +//! | `pyarrow.DataType` | [DataType] | +//! | `pyarrow.Field` | [Field] | +//! | `pyarrow.Schema` | [Schema] | +//! | `pyarrow.Array` | [ArrayData] | +//! | `pyarrow.RecordBatch` | [RecordBatch] | +//! | `pyarrow.RecordBatchReader` | [ArrowArrayStreamReader] / `Box` (1) | +//! +//! (1) `pyarrow.RecordBatchReader` can be imported as [ArrowArrayStreamReader]. Either +//! [ArrowArrayStreamReader] or `Box` can be exported +//! as `pyarrow.RecordBatchReader`. (`Box` is typically +//! easier to create.) +//! +//! PyArrow has the notion of chunked arrays and tables, but arrow-rs doesn't +//! have these same concepts. A chunked table is instead represented with +//! `Vec`. A `pyarrow.Table` can be imported to Rust by calling +//! [pyarrow.Table.to_reader()](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_reader) +//! and then importing the reader as a [ArrowArrayStreamReader]. use std::convert::{From, TryFrom}; use std::ptr::{addr_of, addr_of_mut}; use std::sync::Arc; +use arrow_array::RecordBatchReader; use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::ffi::Py_uintptr_t; use pyo3::import_exception; @@ -256,6 +292,7 @@ impl ToPyArrow for RecordBatch { } } +/// Supports conversion from `pyarrow.RecordBatchReader` to [ArrowArrayStreamReader]. impl FromPyArrow for ArrowArrayStreamReader { fn from_pyarrow(value: &PyAny) -> PyResult { validate_class("RecordBatchReader", value)?; @@ -277,10 +314,13 @@ impl FromPyArrow for ArrowArrayStreamReader { } } -impl IntoPyArrow for ArrowArrayStreamReader { +/// Convert a [`RecordBatchReader`] into a `pyarrow.RecordBatchReader`. +impl IntoPyArrow for Box { + // We can't implement `ToPyArrow` for `T: RecordBatchReader + Send` because + // there is already a blanket implementation for `T: ToPyArrow`. fn into_pyarrow(self, py: Python) -> PyResult { let mut stream = FFI_ArrowArrayStream::empty(); - unsafe { export_reader_into_raw(Box::new(self), &mut stream) }; + unsafe { export_reader_into_raw(self, &mut stream) }; let stream_ptr = (&mut stream) as *mut FFI_ArrowArrayStream; let module = py.import("pyarrow")?; @@ -292,8 +332,17 @@ impl IntoPyArrow for ArrowArrayStreamReader { } } -/// A newtype wrapper around a `T: PyArrowConvert` that implements -/// [`FromPyObject`] and [`IntoPy`] allowing usage with pyo3 macros +/// Convert a [`ArrowArrayStreamReader`] into a `pyarrow.RecordBatchReader`. +impl IntoPyArrow for ArrowArrayStreamReader { + fn into_pyarrow(self, py: Python) -> PyResult { + let boxed: Box = Box::new(self); + boxed.into_pyarrow(py) + } +} + +/// A newtype wrapper. When wrapped around a type `T: FromPyArrow`, it +/// implements `FromPyObject` for the PyArrow objects. When wrapped around a +/// `T: IntoPyArrow`, it implements `IntoPy` for the wrapped type. 
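As an illustration of the conversions this doc comment describes, here is a minimal, hypothetical sketch of a pyo3 function that accepts a `pyarrow.RecordBatchReader` and hands it back as a boxed reader (the function name is invented; it mirrors the `boxed_reader_roundtrip` test added earlier in this patch, and assumes `ArrowArrayStreamReader` implements `RecordBatchReader + Send` as the impl above relies on):

    // Hypothetical sketch only: import an ArrowArrayStreamReader from Python and
    // re-export it as a Box<dyn RecordBatchReader + Send>, exercising both the
    // FromPyArrow and IntoPyArrow paths through PyArrowType.
    #[pyfunction]
    fn passthrough_reader(
        reader: PyArrowType<ArrowArrayStreamReader>,
    ) -> PyResult<PyArrowType<Box<dyn RecordBatchReader + Send>>> {
        let boxed: Box<dyn RecordBatchReader + Send> = Box::new(reader.0);
        Ok(PyArrowType(boxed))
    }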
#[derive(Debug)] pub struct PyArrowType(pub T); From 6e28c03ff6d61bfda4eb5d70bfb413cf91c49ae7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 2 Sep 2023 10:02:20 +0100 Subject: [PATCH 1190/1411] Make ObjectStore::copy Atomic and Automatically Create Parent Directories (#4758) (#4760) (#4759) * Make LocalFileSystem::copy atomic (#4758) * Create sub-directories for copy (#4760) * Fix HttpStore * Clippy * Tweak error propagation * Add doc --- object_store/src/http/client.rs | 50 +++++++------- object_store/src/lib.rs | 20 +++++- object_store/src/local.rs | 112 ++++++++++++++++++++------------ 3 files changed, 115 insertions(+), 67 deletions(-) diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index 1d3df34db9d1..93cd4ee0ea09 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -256,31 +256,37 @@ impl Client { } pub async fn copy(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { - let from = self.path_url(from); - let to = self.path_url(to); - let method = Method::from_bytes(b"COPY").unwrap(); - - let mut builder = self - .client - .request(method, from) - .header("Destination", to.as_str()); + let mut retry = false; + loop { + let method = Method::from_bytes(b"COPY").unwrap(); - if !overwrite { - builder = builder.header("Overwrite", "F"); - } + let mut builder = self + .client + .request(method, self.path_url(from)) + .header("Destination", self.path_url(to).as_str()); - match builder.send_retry(&self.retry_config).await { - Ok(_) => Ok(()), - Err(e) - if !overwrite - && matches!(e.status(), Some(StatusCode::PRECONDITION_FAILED)) => - { - Err(crate::Error::AlreadyExists { - path: to.to_string(), - source: Box::new(e), - }) + if !overwrite { + builder = builder.header("Overwrite", "F"); } - Err(source) => Err(Error::Request { source }.into()), + + return match builder.send_retry(&self.retry_config).await { + Ok(_) => Ok(()), + Err(source) => Err(match source.status() { + Some(StatusCode::PRECONDITION_FAILED) if !overwrite => { + crate::Error::AlreadyExists { + path: to.to_string(), + source: Box::new(source), + } + } + // Some implementations return 404 instead of 409 + Some(StatusCode::CONFLICT | StatusCode::NOT_FOUND) if !retry => { + retry = true; + self.create_parent_directories(to).await?; + continue; + } + _ => Error::Request { source }.into(), + }), + }; } } } diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 7496b589cd8a..d1ee83b64d7b 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -1105,8 +1105,24 @@ mod tests { files.sort_unstable(); assert_eq!(files, vec![emoji_file.clone(), dst.clone()]); + let dst2 = Path::from("new/nested/foo.parquet"); + storage.copy(&emoji_file, &dst2).await.unwrap(); + let mut files = flatten_list_stream(storage, None).await.unwrap(); + files.sort_unstable(); + assert_eq!(files, vec![emoji_file.clone(), dst.clone(), dst2.clone()]); + + let dst3 = Path::from("new/nested2/bar.parquet"); + storage.rename(&dst, &dst3).await.unwrap(); + let mut files = flatten_list_stream(storage, None).await.unwrap(); + files.sort_unstable(); + assert_eq!(files, vec![emoji_file.clone(), dst2.clone(), dst3.clone()]); + + let err = storage.head(&dst).await.unwrap_err(); + assert!(matches!(err, Error::NotFound { .. 
})); + storage.delete(&emoji_file).await.unwrap(); - storage.delete(&dst).await.unwrap(); + storage.delete(&dst3).await.unwrap(); + storage.delete(&dst2).await.unwrap(); let files = flatten_list_stream(storage, Some(&emoji_prefix)) .await .unwrap(); @@ -1605,7 +1621,7 @@ mod tests { pub(crate) async fn copy_if_not_exists(storage: &DynObjectStore) { // Create two objects let path1 = Path::from("test1"); - let path2 = Path::from("test2"); + let path2 = Path::from("not_exists_nested/test2"); let contents1 = Bytes::from("cats"); let contents2 = Bytes::from("dogs"); diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 4d57ef1b79e1..495bb4f9c4aa 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -28,7 +28,7 @@ use chrono::{DateTime, Utc}; use futures::future::BoxFuture; use futures::{stream::BoxStream, StreamExt}; use futures::{FutureExt, TryStreamExt}; -use snafu::{ensure, OptionExt, ResultExt, Snafu}; +use snafu::{ensure, ResultExt, Snafu}; use std::fs::{metadata, symlink_metadata, File, Metadata, OpenOptions}; use std::io::{ErrorKind, Read, Seek, SeekFrom, Write}; use std::ops::Range; @@ -78,10 +78,10 @@ pub(crate) enum Error { path: PathBuf, }, - #[snafu(display("Unable to create file {}: {}", path.display(), err))] + #[snafu(display("Unable to create file {}: {}", path.display(), source))] UnableToCreateFile { + source: io::Error, path: PathBuf, - err: io::Error, }, #[snafu(display("Unable to delete file {}: {}", path.display(), source))] @@ -336,12 +336,13 @@ impl ObjectStore for LocalFileSystem { // If the file was successfully opened, return it wrapped in a boxed `AsyncWrite` trait object. Ok(file) => return Ok(Box::new(file)), // If the error is that the file was not found, attempt to create the file and any necessary parent directories. - Err(err) if err.kind() == ErrorKind::NotFound => { + Err(source) if source.kind() == ErrorKind::NotFound => { // Get the path to the parent directory of the file. - let parent = path - .parent() - // If the parent directory does not exist, return a `UnableToCreateFileSnafu` error. - .context(UnableToCreateFileSnafu { path: &path, err })?; + let parent = + path.parent().ok_or_else(|| Error::UnableToCreateFile { + path: path.to_path_buf(), + source, + })?; // Create the parent directory and any necessary ancestors. 
tokio::fs::create_dir_all(parent) @@ -584,10 +585,27 @@ impl ObjectStore for LocalFileSystem { async fn copy(&self, from: &Path, to: &Path) -> Result<()> { let from = self.config.path_to_filesystem(from)?; let to = self.config.path_to_filesystem(to)?; - - maybe_spawn_blocking(move || { - std::fs::copy(&from, &to).context(UnableToCopyFileSnafu { from, to })?; - Ok(()) + let mut id = 0; + // In order to make this atomic we: + // + // - hard link to a hidden temporary file + // - atomically rename this temporary file into place + // + // This is necessary because hard_link returns an error if the destination already exists + maybe_spawn_blocking(move || loop { + let staged = staged_upload_path(&to, &id.to_string()); + match std::fs::hard_link(&from, &staged) { + Ok(_) => { + return std::fs::rename(&staged, &to).map_err(|source| { + Error::UnableToCopyFile { from, to, source }.into() + }) + } + Err(source) => match source.kind() { + ErrorKind::AlreadyExists => id += 1, + ErrorKind::NotFound => create_parent_dirs(&to, source)?, + _ => return Err(Error::UnableToCopyFile { from, to, source }.into()), + }, + } }) .await } @@ -595,9 +613,14 @@ impl ObjectStore for LocalFileSystem { async fn rename(&self, from: &Path, to: &Path) -> Result<()> { let from = self.config.path_to_filesystem(from)?; let to = self.config.path_to_filesystem(to)?; - maybe_spawn_blocking(move || { - std::fs::rename(&from, &to).context(UnableToCopyFileSnafu { from, to })?; - Ok(()) + maybe_spawn_blocking(move || loop { + match std::fs::rename(&from, &to) { + Ok(_) => return Ok(()), + Err(source) => match source.kind() { + ErrorKind::NotFound => create_parent_dirs(&to, source)?, + _ => return Err(Error::UnableToCopyFile { from, to, source }.into()), + }, + } }) .await } @@ -606,25 +629,37 @@ impl ObjectStore for LocalFileSystem { let from = self.config.path_to_filesystem(from)?; let to = self.config.path_to_filesystem(to)?; - maybe_spawn_blocking(move || { - std::fs::hard_link(&from, &to).map_err(|err| match err.kind() { - io::ErrorKind::AlreadyExists => Error::AlreadyExists { - path: to.to_str().unwrap().to_string(), - source: err, - } - .into(), - _ => Error::UnableToCopyFile { - from, - to, - source: err, - } - .into(), - }) + maybe_spawn_blocking(move || loop { + match std::fs::hard_link(&from, &to) { + Ok(_) => return Ok(()), + Err(source) => match source.kind() { + ErrorKind::AlreadyExists => { + return Err(Error::AlreadyExists { + path: to.to_str().unwrap().to_string(), + source, + } + .into()) + } + ErrorKind::NotFound => create_parent_dirs(&to, source)?, + _ => return Err(Error::UnableToCopyFile { from, to, source }.into()), + }, + } }) .await } } +/// Creates the parent directories of `path` or returns an error based on `source` if no parent +fn create_parent_dirs(path: &std::path::Path, source: io::Error) -> Result<()> { + let parent = path.parent().ok_or_else(|| Error::UnableToCreateFile { + path: path.to_path_buf(), + source, + })?; + + std::fs::create_dir_all(parent).context(UnableToCreateDirSnafu { path: parent })?; + Ok(()) +} + /// Generates a unique file path `{base}#{suffix}`, returning the opened `File` and `suffix` /// /// Creates any directories if necessary @@ -636,20 +671,11 @@ fn new_staged_upload(base: &std::path::Path) -> Result<(File, String)> { let mut options = OpenOptions::new(); match options.read(true).write(true).create_new(true).open(&path) { Ok(f) => return Ok((f, suffix)), - Err(e) if e.kind() == ErrorKind::AlreadyExists => { - multipart_id += 1; - } - Err(err) if err.kind() == 
ErrorKind::NotFound => { - let parent = path - .parent() - .context(UnableToCreateFileSnafu { path: &path, err })?; - - std::fs::create_dir_all(parent) - .context(UnableToCreateDirSnafu { path: parent })?; - - continue; - } - Err(source) => return Err(Error::UnableToOpenFile { source, path }.into()), + Err(source) => match source.kind() { + ErrorKind::AlreadyExists => multipart_id += 1, + ErrorKind::NotFound => create_parent_dirs(&path, source)?, + _ => return Err(Error::UnableToOpenFile { source, path }.into()), + }, } } } From 611b129e3927708983e2f2dfdeb7d5282a74bde5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 3 Sep 2023 23:09:42 +0100 Subject: [PATCH 1191/1411] Remove unused dyn_cmp_dict feature (#4766) --- .github/workflows/arrow.yml | 4 ++-- arrow-string/Cargo.toml | 6 ------ arrow-string/src/like.rs | 1 - arrow/Cargo.toml | 5 +---- arrow/README.md | 1 - 5 files changed, 3 insertions(+), 14 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 8203c15afc6c..41a2722beff9 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -91,7 +91,7 @@ jobs: - name: Test arrow with default features run: cargo test -p arrow - name: Test arrow with all features apart from simd - run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,chrono-tz + run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,chrono-tz - name: Run examples run: | # Test arrow examples @@ -211,7 +211,7 @@ jobs: - name: Clippy arrow-row with all features run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings - name: Clippy arrow with all features except SIMD - run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,chrono-tz --all-targets -- -D warnings + run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,chrono-tz --all-targets -- -D warnings - name: Clippy arrow-integration-test with all features run: cargo clippy -p arrow-integration-test --all-targets --all-features -- -D warnings - name: Clippy arrow-integration-testing with all features diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index 0f88ffbac923..e1163dc03eab 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -42,9 +42,3 @@ arrow-select = { workspace = true } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.7.1", default-features = false, features = ["unicode"] } num = { version = "0.4", default-features = false, features = ["std"] } - -[package.metadata.docs.rs] -all-features = true - -[features] -dyn_cmp_dict = [] diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 412f1e6cc89a..279a4782009d 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -405,7 +405,6 @@ mod tests { macro_rules! 
test_dict_utf8 { ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { #[test] - #[cfg(feature = "dyn_cmp_dict")] fn $test_name() { let expected = BooleanArray::from($expected); let left: DictionaryArray = $left.into_iter().collect(); diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 9456dd4b012c..bc75207c2230 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -63,7 +63,7 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" pyo3 = { version = "0.19", default-features = false, optional = true } [package.metadata.docs.rs] -features = ["prettyprint", "ipc_compression", "dyn_cmp_dict", "ffi", "pyarrow"] +features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"] [features] default = ["csv", "ipc", "json"] @@ -85,9 +85,6 @@ pyarrow = ["pyo3", "ffi"] force_validate = ["arrow-data/force_validate"] # Enable ffi support ffi = ["arrow-schema/ffi", "arrow-data/ffi"] -# Enable dyn-comparison of dictionary arrays with other arrays -# Note: this does not impact comparison against scalars -dyn_cmp_dict = ["arrow-string/dyn_cmp_dict"] chrono-tz = ["arrow-array/chrono-tz"] [dev-dependencies] diff --git a/arrow/README.md b/arrow/README.md index fb2119e3bc15..6a91bc951cc1 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -54,7 +54,6 @@ The `arrow` crate provides the following features which may be enabled in your ` - `chrono-tz` - support of parsing timezone using [chrono-tz](https://docs.rs/chrono-tz/0.6.0/chrono_tz/) - `ffi` - bindings for the Arrow C [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) - `pyarrow` - bindings for pyo3 to call arrow-rs from python -- `dyn_cmp_dict` - enables comparison of dictionary arrays within dyn comparison kernels ## Arrow Feature Status From 587250c8e0f9707cc102bd04573395c153249ced Mon Sep 17 00:00:00 2001 From: RinChanNOW Date: Mon, 4 Sep 2023 19:26:15 +0800 Subject: [PATCH 1192/1411] fix: avoid panic if offset index not exists. (#4761) * fix: avoid panic if offset index not exists. * Add unit tests and comments. --- parquet/src/arrow/arrow_reader/mod.rs | 43 ++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 5f95a8664b4b..2acc0faf130f 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -480,7 +480,11 @@ impl Iterator for ReaderPageIterator { let rg = self.metadata.row_group(rg_idx); let meta = rg.column(self.column_idx); let offset_index = self.metadata.offset_index(); - let page_locations = offset_index.map(|i| i[rg_idx][self.column_idx].clone()); + // `offset_index` may not exist and `i[rg_idx]` will be empty. + // To avoid `i[rg_idx][self.column_idx]` panic, we need to filter out empty `i[rg_idx]`.
+ let page_locations = offset_index + .filter(|i| !i[rg_idx].is_empty()) + .map(|i| i[rg_idx][self.column_idx].clone()); let total_rows = rg.num_rows() as usize; let reader = self.reader.clone(); @@ -2481,6 +2485,43 @@ mod tests { assert_eq!(reader.batch_size, num_rows as usize); } + #[test] + fn test_read_with_page_index_enabled() { + let testdata = arrow::util::test_util::parquet_test_data(); + + { + // `alltypes_tiny_pages.parquet` has page index + let path = format!("{testdata}/alltypes_tiny_pages.parquet"); + let test_file = File::open(path).unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new_with_options( + test_file, + ArrowReaderOptions::new().with_page_index(true), + ) + .unwrap(); + assert!(!builder.metadata().offset_index().unwrap()[0].is_empty()); + let reader = builder.build().unwrap(); + let batches = reader.collect::, _>>().unwrap(); + assert_eq!(batches.len(), 8); + } + + { + // `alltypes_plain.parquet` doesn't have page index + let path = format!("{testdata}/alltypes_plain.parquet"); + let test_file = File::open(path).unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new_with_options( + test_file, + ArrowReaderOptions::new().with_page_index(true), + ) + .unwrap(); + // Although `Vec>` of each row group is empty, + // we should read the file successfully. + assert!(builder.metadata().offset_index().unwrap()[0].is_empty()); + let reader = builder.build().unwrap(); + let batches = reader.collect::, _>>().unwrap(); + assert_eq!(batches.len(), 1); + } + } + #[test] fn test_raw_repetition() { const MESSAGE_TYPE: &str = " From b66c57c4b441140f61af846a3371a957e82a5ff7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 4 Sep 2023 17:54:12 +0100 Subject: [PATCH 1193/1411] Bump actions/checkout from 3 to 4 (#4767) Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/arrow.yml | 10 +++++----- .github/workflows/arrow_flight.yml | 6 +++--- .github/workflows/coverage.yml | 2 +- .github/workflows/dev.yml | 4 ++-- .github/workflows/dev_pr.yml | 2 +- .github/workflows/docs.yml | 4 ++-- .github/workflows/integration.yml | 6 +++--- .github/workflows/miri.yaml | 2 +- .github/workflows/object_store.yml | 8 ++++---- .github/workflows/parquet.yml | 10 +++++----- .github/workflows/parquet_derive.yml | 4 ++-- .github/workflows/rust.yml | 8 ++++---- 12 files changed, 33 insertions(+), 33 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 41a2722beff9..cde931c3c6b8 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -55,7 +55,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -109,7 +109,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -136,7 +136,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -160,7 +160,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -179,7 +179,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Setup Clippy diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index 5301a3f8563f..242e0f2a3b0d 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -47,7 +47,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -68,7 +68,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Run gen @@ -82,7 +82,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Setup Clippy diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 3fa254142dbe..64b2ca437067 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -36,7 +36,7 @@ jobs: # otherwise we get this error: # Failed to run tests: ASLR disable failed: EPERM: Operation not permitted steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 0eb2d024f352..9871f8b7d295 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -38,7 +38,7 @@ jobs: name: Release Audit Tool (RAT) runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v4 with: @@ -50,7 +50,7 @@ jobs: name: Markdown format runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-node@v3 with: node-version: "14" diff --git 
a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index bb88e9dcd3f5..5f3d9e54c8db 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -37,7 +37,7 @@ jobs: contents: read pull-requests: write steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Assign GitHub labels if: | diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index d3f8e9046510..721260892402 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -43,7 +43,7 @@ jobs: env: RUSTDOCFLAGS: "-Dwarnings --enable-index-page -Zunstable-options" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Install python dev @@ -77,7 +77,7 @@ jobs: contents: write runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download crate docs uses: actions/download-artifact@v3 with: diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 9b2e7797d5ff..3ff6aedb0122 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -76,13 +76,13 @@ jobs: - name: Check cmake run: which cmake - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: apache/arrow submodules: true fetch-depth: 0 - name: Checkout Arrow Rust - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: path: rust fetch-depth: 0 @@ -127,7 +127,7 @@ jobs: matrix: rust: [ stable ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index 0c1f8069cd40..e3704d036aca 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -46,7 +46,7 @@ jobs: name: MIRI runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 01e14022e122..3b9b1e31d5c3 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -43,7 +43,7 @@ jobs: run: working-directory: object_store steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Setup Clippy @@ -79,7 +79,7 @@ jobs: env: RUSTDOCFLAGS: "-Dwarnings" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Run cargo doc run: cargo doc --document-private-items --no-deps --workspace --all-features @@ -115,7 +115,7 @@ jobs: GOOGLE_SERVICE_ACCOUNT: "/tmp/gcs.json" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Configure Fake GCS Server (GCP emulation) # Custom image - see fsouza/fake-gcs-server#1164 @@ -160,7 +160,7 @@ jobs: run: working-directory: object_store steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 55599b776c32..c309a3fa6473 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -51,7 +51,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -74,7 +74,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust 
toolchain @@ -116,7 +116,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -135,7 +135,7 @@ jobs: matrix: rust: [ stable ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v4 with: @@ -168,7 +168,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Setup Clippy diff --git a/.github/workflows/parquet_derive.yml b/.github/workflows/parquet_derive.yml index 72b90ecfd81a..d8b02f73a8aa 100644 --- a/.github/workflows/parquet_derive.yml +++ b/.github/workflows/parquet_derive.yml @@ -43,7 +43,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -57,7 +57,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Setup Clippy diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f198f48dfec5..9c4b28b691b7 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -37,7 +37,7 @@ jobs: name: Test on Mac runs-on: macos-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Install protoc with brew @@ -60,7 +60,7 @@ jobs: name: Test on Windows runs-on: windows-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Install protobuf compiler in /d/protoc @@ -93,7 +93,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Setup rustfmt @@ -110,7 +110,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv From db5314c5c2680861683b7dcb8f69cc27aa7ac8ed Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 5 Sep 2023 16:01:09 +0100 Subject: [PATCH 1194/1411] Fix List Sorting, Revert Removal of Rank Kernels (#4747) * Revert "Remove rank kernels" This reverts commit c06786faaf750de7c899dd7750111c2d684e307b. * Fix child_rank --- arrow-ord/src/lib.rs | 1 + arrow-ord/src/rank.rs | 195 +++++++++++++++++++++++++++++++++++ arrow-ord/src/sort.rs | 41 ++++++-- arrow/benches/sort_kernel.rs | 21 ++++ arrow/src/compute/kernels.rs | 2 +- arrow/src/compute/mod.rs | 1 + 6 files changed, 251 insertions(+), 10 deletions(-) create mode 100644 arrow-ord/src/rank.rs diff --git a/arrow-ord/src/lib.rs b/arrow-ord/src/lib.rs index 19ad8229417f..8fe4ecbc05aa 100644 --- a/arrow-ord/src/lib.rs +++ b/arrow-ord/src/lib.rs @@ -48,4 +48,5 @@ pub mod cmp; pub mod comparison; pub mod ord; pub mod partition; +pub mod rank; pub mod sort; diff --git a/arrow-ord/src/rank.rs b/arrow-ord/src/rank.rs new file mode 100644 index 000000000000..1e79156a71a3 --- /dev/null +++ b/arrow-ord/src/rank.rs @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::cast::AsArray; +use arrow_array::types::*; +use arrow_array::{downcast_primitive_array, Array, ArrowNativeTypeOp, GenericByteArray}; +use arrow_buffer::NullBuffer; +use arrow_schema::{ArrowError, DataType, SortOptions}; +use std::cmp::Ordering; + +/// Assigns a rank to each value in `array` based on its position in the sorted order +/// +/// Where values are equal, they will be assigned the highest of their ranks, +/// leaving gaps in the overall rank assignment +/// +/// ``` +/// # use arrow_array::StringArray; +/// # use arrow_ord::rank::rank; +/// let array = StringArray::from(vec![Some("foo"), None, Some("foo"), None, Some("bar")]); +/// let ranks = rank(&array, None).unwrap(); +/// assert_eq!(ranks, &[5, 2, 5, 2, 3]); +/// ``` +pub fn rank( + array: &dyn Array, + options: Option, +) -> Result, ArrowError> { + let options = options.unwrap_or_default(); + let ranks = downcast_primitive_array! { + array => primitive_rank(array.values(), array.nulls(), options), + DataType::Utf8 => bytes_rank(array.as_bytes::(), options), + DataType::LargeUtf8 => bytes_rank(array.as_bytes::(), options), + DataType::Binary => bytes_rank(array.as_bytes::(), options), + DataType::LargeBinary => bytes_rank(array.as_bytes::(), options), + d => return Err(ArrowError::ComputeError(format!("{d:?} not supported in rank"))) + }; + Ok(ranks) +} + +#[inline(never)] +fn primitive_rank( + values: &[T], + nulls: Option<&NullBuffer>, + options: SortOptions, +) -> Vec { + let len: u32 = values.len().try_into().unwrap(); + let to_sort = match nulls.filter(|n| n.null_count() > 0) { + Some(n) => n + .valid_indices() + .map(|idx| (values[idx], idx as u32)) + .collect(), + None => values.iter().copied().zip(0..len).collect(), + }; + rank_impl(values.len(), to_sort, options, T::compare, T::is_eq) +} + +#[inline(never)] +fn bytes_rank( + array: &GenericByteArray, + options: SortOptions, +) -> Vec { + let to_sort: Vec<(&[u8], u32)> = match array.nulls().filter(|n| n.null_count() > 0) { + Some(n) => n + .valid_indices() + .map(|idx| (array.value(idx).as_ref(), idx as u32)) + .collect(), + None => (0..array.len()) + .map(|idx| (array.value(idx).as_ref(), idx as u32)) + .collect(), + }; + rank_impl(array.len(), to_sort, options, Ord::cmp, PartialEq::eq) +} + +fn rank_impl( + len: usize, + mut valid: Vec<(T, u32)>, + options: SortOptions, + compare: C, + eq: E, +) -> Vec +where + T: Copy, + C: Fn(T, T) -> Ordering, + E: Fn(T, T) -> bool, +{ + // We can use an unstable sort as we combine equal values later + valid.sort_unstable_by(|a, b| compare(a.0, b.0)); + if options.descending { + valid.reverse(); + } + + let (mut valid_rank, null_rank) = match options.nulls_first { + true => (len as u32, (len - valid.len()) as u32), + false => (valid.len() as u32, len as u32), + }; + + let mut out: Vec<_> = vec![null_rank; len]; + if let Some(v) = valid.last() { + out[v.1 as usize] = valid_rank; + } + + let mut count = 1; // Number of values in 
rank + for w in valid.windows(2).rev() { + match eq(w[0].0, w[1].0) { + true => { + count += 1; + out[w[0].1 as usize] = valid_rank; + } + false => { + valid_rank -= count; + count = 1; + out[w[0].1 as usize] = valid_rank + } + } + } + + out +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::*; + + #[test] + fn test_primitive() { + let descending = SortOptions { + descending: true, + nulls_first: true, + }; + + let nulls_last = SortOptions { + descending: false, + nulls_first: false, + }; + + let nulls_last_descending = SortOptions { + descending: true, + nulls_first: false, + }; + + let a = Int32Array::from(vec![Some(1), Some(1), None, Some(3), Some(3), Some(4)]); + let res = rank(&a, None).unwrap(); + assert_eq!(res, &[3, 3, 1, 5, 5, 6]); + + let res = rank(&a, Some(descending)).unwrap(); + assert_eq!(res, &[6, 6, 1, 4, 4, 2]); + + let res = rank(&a, Some(nulls_last)).unwrap(); + assert_eq!(res, &[2, 2, 6, 4, 4, 5]); + + let res = rank(&a, Some(nulls_last_descending)).unwrap(); + assert_eq!(res, &[5, 5, 6, 3, 3, 1]); + + // Test with non-zero null values + let nulls = NullBuffer::from(vec![true, true, false, true, false, false]); + let a = Int32Array::new(vec![1, 4, 3, 4, 5, 5].into(), Some(nulls)); + let res = rank(&a, None).unwrap(); + assert_eq!(res, &[4, 6, 3, 6, 3, 3]); + } + + #[test] + fn test_bytes() { + let v = vec!["foo", "fo", "bar", "bar"]; + let values = StringArray::from(v.clone()); + let res = rank(&values, None).unwrap(); + assert_eq!(res, &[4, 3, 2, 2]); + + let values = LargeStringArray::from(v.clone()); + let res = rank(&values, None).unwrap(); + assert_eq!(res, &[4, 3, 2, 2]); + + let v: Vec<&[u8]> = vec![&[1, 2], &[0], &[1, 2, 3], &[1, 2]]; + let values = LargeBinaryArray::from(v.clone()); + let res = rank(&values, None).unwrap(); + assert_eq!(res, &[3, 1, 4, 3]); + + let values = BinaryArray::from(v); + let res = rank(&values, None).unwrap(); + assert_eq!(res, &[3, 1, 4, 3]); + } +} diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 6c8c3b8facef..a477d6c261b3 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -30,6 +30,7 @@ use arrow_select::take::take; use std::cmp::Ordering; use std::sync::Arc; +use crate::rank::rank; pub use arrow_schema::SortOptions; /// Sort the `ArrayRef` using `SortOptions`. @@ -400,14 +401,7 @@ fn child_rank(values: &dyn Array, options: SortOptions) -> Result, Arro descending: false, nulls_first: options.nulls_first != options.descending, }); - - let sorted_value_indices = sort_to_indices(values, value_options, None)?; - let sorted_indices = sorted_value_indices.values(); - let mut out: Vec<_> = vec![0_u32; sorted_indices.len()]; - for (ix, val) in sorted_indices.iter().enumerate() { - out[*val as usize] = ix as u32; - } - Ok(out) + rank(values, value_options) } // Sort run array and return sorted run array. 
@@ -800,7 +794,9 @@ impl LexicographicalComparator { #[cfg(test)] mod tests { use super::*; - use arrow_array::builder::PrimitiveRunBuilder; + use arrow_array::builder::{ + FixedSizeListBuilder, Int64Builder, ListBuilder, PrimitiveRunBuilder, + }; use arrow_buffer::i256; use half::f16; use rand::rngs::StdRng; @@ -3991,4 +3987,31 @@ mod tests { // NULL.cmp(4) assert_eq!(comparator.compare(2, 3), Ordering::Less); } + + #[test] + fn sort_list_equal() { + let a = { + let mut builder = FixedSizeListBuilder::new(Int64Builder::new(), 2); + for value in [[1, 5], [0, 3], [1, 3]] { + builder.values().append_slice(&value); + builder.append(true); + } + builder.finish() + }; + + let sort_indices = sort_to_indices(&a, None, None).unwrap(); + assert_eq!(sort_indices.values(), &[1, 2, 0]); + + let a = { + let mut builder = ListBuilder::new(Int64Builder::new()); + for value in [[1, 5], [0, 3], [1, 3]] { + builder.values().append_slice(&value); + builder.append(true); + } + builder.finish() + }; + + let sort_indices = sort_to_indices(&a, None, None).unwrap(); + assert_eq!(sort_indices.values(), &[1, 2, 0]); + } } diff --git a/arrow/benches/sort_kernel.rs b/arrow/benches/sort_kernel.rs index dd55076647a5..63e10e0528ba 100644 --- a/arrow/benches/sort_kernel.rs +++ b/arrow/benches/sort_kernel.rs @@ -27,6 +27,7 @@ use arrow::compute::{lexsort, sort, sort_to_indices, SortColumn}; use arrow::datatypes::{Int16Type, Int32Type}; use arrow::util::bench_util::*; use arrow::{array::*, datatypes::Float32Type}; +use arrow_ord::rank::rank; fn create_f32_array(size: usize, with_nulls: bool) -> ArrayRef { let null_density = if with_nulls { 0.5 } else { 0.0 }; @@ -213,6 +214,26 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function("lexsort (f32, f32) nulls 2^12 limit 2^12", |b| { b.iter(|| bench_lexsort(&arr_a, &arr_b, Some(2usize.pow(12)))) }); + + let arr = create_f32_array(2usize.pow(12), false); + c.bench_function("rank f32 2^12", |b| { + b.iter(|| black_box(rank(&arr, None).unwrap())) + }); + + let arr = create_f32_array(2usize.pow(12), true); + c.bench_function("rank f32 nulls 2^12", |b| { + b.iter(|| black_box(rank(&arr, None).unwrap())) + }); + + let arr = create_string_array_with_len::(2usize.pow(12), 0.0, 10); + c.bench_function("rank string[10] 2^12", |b| { + b.iter(|| black_box(rank(&arr, None).unwrap())) + }); + + let arr = create_string_array_with_len::(2usize.pow(12), 0.5, 10); + c.bench_function("rank string[10] nulls 2^12", |b| { + b.iter(|| black_box(rank(&arr, None).unwrap())) + }); } criterion_group!(benches, add_benchmark); diff --git a/arrow/src/compute/kernels.rs b/arrow/src/compute/kernels.rs index dba41625020b..35ad80e009cc 100644 --- a/arrow/src/compute/kernels.rs +++ b/arrow/src/compute/kernels.rs @@ -22,7 +22,7 @@ pub use arrow_arith::{ }; pub use arrow_cast::cast; pub use arrow_cast::parse as cast_utils; -pub use arrow_ord::{cmp, partition, sort}; +pub use arrow_ord::{cmp, partition, rank, sort}; pub use arrow_select::{concat, filter, interleave, nullif, take, window, zip}; pub use arrow_string::{concat_elements, length, regexp, substring}; diff --git a/arrow/src/compute/mod.rs b/arrow/src/compute/mod.rs index 7cfe787b08cf..47a9d149aadb 100644 --- a/arrow/src/compute/mod.rs +++ b/arrow/src/compute/mod.rs @@ -30,6 +30,7 @@ pub use self::kernels::filter::*; pub use self::kernels::interleave::*; pub use self::kernels::nullif::*; pub use self::kernels::partition::*; +pub use self::kernels::rank::*; pub use self::kernels::regexp::*; pub use self::kernels::sort::*; pub use self::kernels::take::*; 
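A minimal usage sketch of the restored rank kernel (not part of the patch above; the input values and expected ranks are taken from its test_primitive case, and the arrow_ord / arrow_array / arrow_schema paths are assumed from this repository's crate split):

use arrow_array::Int32Array;
use arrow_ord::rank::rank;
use arrow_schema::SortOptions;

fn main() {
    // Ties share the highest rank and leave gaps; nulls rank lowest by default.
    let a = Int32Array::from(vec![Some(1), Some(1), None, Some(3), Some(3), Some(4)]);
    assert_eq!(rank(&a, None).unwrap(), &[3, 3, 1, 5, 5, 6]);

    // Descending order with nulls first, matching the patch's own test expectations.
    let descending = SortOptions { descending: true, nulls_first: true };
    assert_eq!(rank(&a, Some(descending)).unwrap(), &[6, 6, 1, 4, 4, 2]);
}

Because equal values share a single rank, child_rank in sort.rs can compare list children by rank alone, which is what fixes the list-sorting case covered by the new sort_list_equal test.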
From 65edbb1702e25420aacebe656dcd789690f72c82 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 5 Sep 2023 16:02:46 +0100 Subject: [PATCH 1195/1411] Re-encode dictionaries in selection kernels (#3558) * Re-encode dictionaries in selection kernels * More benchmarks * Best-effort hashing * More benchmarks * Add fallback to concatenating dictionaries * Fix nulls * Format * Cleanup * RAT * Clippy * Split out heuristic * Add support to interleave kernel * Clippy * More clippy * Clippy * Cleanup * Optimize concat * Review feedback * Clippy * Improved null handling * Further tests * Faster ptr_eq --- arrow-buffer/src/buffer/immutable.rs | 8 + arrow-buffer/src/buffer/offset.rs | 8 + arrow-buffer/src/buffer/scalar.rs | 8 + arrow-select/Cargo.toml | 1 + arrow-select/src/concat.rs | 186 +++++++++++---- arrow-select/src/dictionary.rs | 333 +++++++++++++++++++++++++++ arrow-select/src/interleave.rs | 165 ++++++++++--- arrow-select/src/lib.rs | 1 + arrow/benches/concatenate_kernel.rs | 22 ++ arrow/benches/interleave_kernels.rs | 32 ++- arrow/src/util/bench_util.rs | 25 +- 11 files changed, 698 insertions(+), 91 deletions(-) create mode 100644 arrow-select/src/dictionary.rs diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 8296d3fbcc31..bda6dfc5cdee 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -323,6 +323,14 @@ impl Buffer { length, }) } + + /// Returns true if this [`Buffer`] is equal to `other`, using pointer comparisons + /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may + /// return false when the arrays are logically equal + #[inline] + pub fn ptr_eq(&self, other: &Self) -> bool { + self.ptr == other.ptr && self.length == other.length + } } /// Creating a `Buffer` instance by copying the memory from a `AsRef<[u8]>` into a newly diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index fede32c57924..a6f2f7f6cfae 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -148,6 +148,14 @@ impl OffsetBuffer { pub fn slice(&self, offset: usize, len: usize) -> Self { Self(self.0.slice(offset, len.saturating_add(1))) } + + /// Returns true if this [`OffsetBuffer`] is equal to `other`, using pointer comparisons + /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may + /// return false when the arrays are logically equal + #[inline] + pub fn ptr_eq(&self, other: &Self) -> bool { + self.0.ptr_eq(&other.0) + } } impl Deref for OffsetBuffer { diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 70c86f11866d..276e635e825c 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -86,6 +86,14 @@ impl ScalarBuffer { pub fn into_inner(self) -> Buffer { self.buffer } + + /// Returns true if this [`ScalarBuffer`] is equal to `other`, using pointer comparisons + /// to determine buffer equality. 
This is cheaper than `PartialEq::eq` but may + /// return false when the arrays are logically equal + #[inline] + pub fn ptr_eq(&self, other: &Self) -> bool { + self.buffer.ptr_eq(&other.buffer) + } } impl Deref for ScalarBuffer { diff --git a/arrow-select/Cargo.toml b/arrow-select/Cargo.toml index ff8a212c7b52..023788799c94 100644 --- a/arrow-select/Cargo.toml +++ b/arrow-select/Cargo.toml @@ -39,6 +39,7 @@ arrow-data = { workspace = true } arrow-schema = { workspace = true } arrow-array = { workspace = true } num = { version = "0.4", default-features = false, features = ["std"] } +ahash = { version = "0.8", default-features = false} [features] default = [] diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index eed20699c239..c34c3d3d0ccf 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -30,20 +30,20 @@ //! assert_eq!(arr.len(), 3); //! ``` +use crate::dictionary::{merge_dictionary_values, should_merge_dictionary_values}; +use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::ArrowNativeType; +use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer}; use arrow_data::transform::{Capacities, MutableArrayData}; use arrow_schema::{ArrowError, DataType, SchemaRef}; +use std::sync::Arc; fn binary_capacity(arrays: &[&dyn Array]) -> Capacities { let mut item_capacity = 0; let mut bytes_capacity = 0; for array in arrays { - let a = array - .as_any() - .downcast_ref::>() - .unwrap(); + let a = array.as_bytes::(); // Guaranteed to always have at least one element let offsets = a.value_offsets(); @@ -54,6 +54,59 @@ fn binary_capacity(arrays: &[&dyn Array]) -> Capacities { Capacities::Binary(item_capacity, Some(bytes_capacity)) } +fn concat_dictionaries( + arrays: &[&dyn Array], +) -> Result { + let mut output_len = 0; + let dictionaries: Vec<_> = arrays + .iter() + .map(|x| x.as_dictionary::()) + .inspect(|d| output_len += d.len()) + .collect(); + + if !should_merge_dictionary_values::(&dictionaries, output_len) { + return concat_fallback(arrays, Capacities::Array(output_len)); + } + + let merged = merge_dictionary_values(&dictionaries, None)?; + + // Recompute keys + let mut key_values = Vec::with_capacity(output_len); + + let mut has_nulls = false; + for (d, mapping) in dictionaries.iter().zip(merged.key_mappings) { + has_nulls |= d.null_count() != 0; + for key in d.keys().values() { + // Use get to safely handle nulls + key_values.push(mapping.get(key.as_usize()).copied().unwrap_or_default()) + } + } + + let nulls = has_nulls.then(|| { + let mut nulls = BooleanBufferBuilder::new(output_len); + for d in &dictionaries { + match d.nulls() { + Some(n) => nulls.append_buffer(n.inner()), + None => nulls.append_n(d.len(), true), + } + } + NullBuffer::new(nulls.finish()) + }); + + let keys = PrimitiveArray::::new(key_values.into(), nulls); + // Sanity check + assert_eq!(keys.len(), output_len); + + let array = unsafe { DictionaryArray::new_unchecked(keys, merged.values) }; + Ok(Arc::new(array)) +} + +macro_rules! dict_helper { + ($t:ty, $arrays:expr) => { + return Ok(Arc::new(concat_dictionaries::<$t>($arrays)?) as _) + }; +} + /// Concatenate multiple [Array] of the same type into a single [ArrayRef]. 
pub fn concat(arrays: &[&dyn Array]) -> Result { if arrays.is_empty() { @@ -78,9 +131,23 @@ pub fn concat(arrays: &[&dyn Array]) -> Result { DataType::LargeUtf8 => binary_capacity::(arrays), DataType::Binary => binary_capacity::(arrays), DataType::LargeBinary => binary_capacity::(arrays), + DataType::Dictionary(k, _) => downcast_integer! { + k.as_ref() => (dict_helper, arrays), + _ => unreachable!("illegal dictionary key type {k}") + }, _ => Capacities::Array(arrays.iter().map(|a| a.len()).sum()), }; + concat_fallback(arrays, capacity) +} + +/// Concatenates arrays using MutableArrayData +/// +/// This will naively concatenate dictionaries +fn concat_fallback( + arrays: &[&dyn Array], + capacity: Capacities, +) -> Result { let array_data: Vec<_> = arrays.iter().map(|a| a.to_data()).collect::>(); let array_data = array_data.iter().collect(); let mut mutable = MutableArrayData::with_capacities(array_data, false, capacity); @@ -140,6 +207,7 @@ pub fn concat_batches<'a>( #[cfg(test)] mod tests { use super::*; + use arrow_array::builder::StringDictionaryBuilder; use arrow_array::cast::AsArray; use arrow_schema::{Field, Schema}; use std::sync::Arc; @@ -468,29 +536,10 @@ mod tests { } fn collect_string_dictionary( - dictionary: &DictionaryArray, - ) -> Vec> { - let values = dictionary.values(); - let values = values.as_any().downcast_ref::().unwrap(); - - dictionary - .keys() - .iter() - .map(|key| key.map(|key| values.value(key as _).to_string())) - .collect() - } - - fn concat_dictionary( - input_1: DictionaryArray, - input_2: DictionaryArray, - ) -> Vec> { - let concat = concat(&[&input_1 as _, &input_2 as _]).unwrap(); - let concat = concat - .as_any() - .downcast_ref::>() - .unwrap(); - - collect_string_dictionary(concat) + array: &DictionaryArray, + ) -> Vec> { + let concrete = array.downcast_dict::().unwrap(); + concrete.into_iter().collect() } #[test] @@ -509,11 +558,19 @@ mod tests { "E", ] .into_iter() - .map(|x| Some(x.to_string())) + .map(Some) .collect(); - let concat = concat_dictionary(input_1, input_2); - assert_eq!(concat, expected); + let concat = concat(&[&input_1 as _, &input_2 as _]).unwrap(); + let dictionary = concat.as_dictionary::(); + let actual = collect_string_dictionary(dictionary); + assert_eq!(actual, expected); + + // Should have concatenated inputs together + assert_eq!( + dictionary.values().len(), + input_1.values().len() + input_2.values().len(), + ) } #[test] @@ -523,16 +580,45 @@ mod tests { .into_iter() .collect(); let input_2: DictionaryArray = vec![None].into_iter().collect(); - let expected = vec![ - Some("foo".to_string()), - Some("bar".to_string()), - None, - Some("fiz".to_string()), - None, - ]; + let expected = vec![Some("foo"), Some("bar"), None, Some("fiz"), None]; - let concat = concat_dictionary(input_1, input_2); - assert_eq!(concat, expected); + let concat = concat(&[&input_1 as _, &input_2 as _]).unwrap(); + let dictionary = concat.as_dictionary::(); + let actual = collect_string_dictionary(dictionary); + assert_eq!(actual, expected); + + // Should have concatenated inputs together + assert_eq!( + dictionary.values().len(), + input_1.values().len() + input_2.values().len(), + ) + } + + #[test] + fn test_string_dictionary_merge() { + let mut builder = StringDictionaryBuilder::::new(); + for i in 0..20 { + builder.append(&i.to_string()).unwrap(); + } + let input_1 = builder.finish(); + + let mut builder = StringDictionaryBuilder::::new(); + for i in 0..30 { + builder.append(&i.to_string()).unwrap(); + } + let input_2 = builder.finish(); + + let 
expected: Vec<_> = (0..20).chain(0..30).map(|x| x.to_string()).collect(); + let expected: Vec<_> = expected.iter().map(|x| Some(x.as_str())).collect(); + + let concat = concat(&[&input_1 as _, &input_2 as _]).unwrap(); + let dictionary = concat.as_dictionary::(); + let actual = collect_string_dictionary(dictionary); + assert_eq!(actual, expected); + + // Should have merged inputs together + // Not 30 as this is done on a best-effort basis + assert_eq!(dictionary.values().len(), 33) } #[test] @@ -556,7 +642,7 @@ mod tests { fn test_dictionary_concat_reuse() { let array: DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); - let copy: DictionaryArray = array.to_data().into(); + let copy: DictionaryArray = array.clone(); // dictionary is "a", "b", "c" assert_eq!( @@ -567,11 +653,7 @@ mod tests { // concatenate it with itself let combined = concat(&[© as _, &array as _]).unwrap(); - - let combined = combined - .as_any() - .downcast_ref::>() - .unwrap(); + let combined = combined.as_dictionary::(); assert_eq!( combined.values(), @@ -738,4 +820,16 @@ mod tests { assert_eq!(data.buffers()[1].len(), 200); assert_eq!(data.buffers()[1].capacity(), 256); // Nearest multiple of 64 } + + #[test] + fn concat_sparse_nulls() { + let values = StringArray::from_iter_values((0..100).map(|x| x.to_string())); + let keys = Int32Array::from(vec![1; 10]); + let dict_a = DictionaryArray::new(keys, Arc::new(values)); + let values = StringArray::new_null(0); + let keys = Int32Array::new_null(10); + let dict_b = DictionaryArray::new(keys, Arc::new(values)); + let array = concat(&[&dict_a, &dict_b]).unwrap(); + assert_eq!(array.null_count(), 10); + } } diff --git a/arrow-select/src/dictionary.rs b/arrow-select/src/dictionary.rs new file mode 100644 index 000000000000..8630b332f068 --- /dev/null +++ b/arrow-select/src/dictionary.rs @@ -0,0 +1,333 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::interleave::interleave; +use ahash::RandomState; +use arrow_array::builder::BooleanBufferBuilder; +use arrow_array::cast::AsArray; +use arrow_array::types::{ + ArrowDictionaryKeyType, BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, + Utf8Type, +}; +use arrow_array::{Array, ArrayRef, DictionaryArray, GenericByteArray}; +use arrow_buffer::{ArrowNativeType, BooleanBuffer, ScalarBuffer}; +use arrow_schema::{ArrowError, DataType}; + +/// A best effort interner that maintains a fixed number of buckets +/// and interns keys based on their hash value +/// +/// Hash collisions will result in replacement +struct Interner<'a, V> { + state: RandomState, + buckets: Vec>, + shift: u32, +} + +impl<'a, V> Interner<'a, V> { + /// Capacity controls the number of unique buckets allocated within the Interner + /// + /// A larger capacity reduces the probability of hash collisions, and should be set + /// based on an approximation of the upper bound of unique values + fn new(capacity: usize) -> Self { + // Add additional buckets to help reduce collisions + let shift = (capacity as u64 + 128).leading_zeros(); + let num_buckets = (u64::MAX >> shift) as usize; + let buckets = (0..num_buckets.saturating_add(1)).map(|_| None).collect(); + Self { + // A fixed seed to ensure deterministic behaviour + state: RandomState::with_seeds(0, 0, 0, 0), + buckets, + shift, + } + } + + fn intern Result, E>( + &mut self, + new: &'a [u8], + f: F, + ) -> Result<&V, E> { + let hash = self.state.hash_one(new); + let bucket_idx = hash >> self.shift; + Ok(match &mut self.buckets[bucket_idx as usize] { + Some((current, v)) => { + if *current != new { + *v = f()?; + *current = new; + } + v + } + slot => &slot.insert((new, f()?)).1, + }) + } +} + +pub struct MergedDictionaries { + /// Provides `key_mappings[`array_idx`][`old_key`] -> new_key` + pub key_mappings: Vec>, + /// The new values + pub values: ArrayRef, +} + +/// Performs a cheap, pointer-based comparison of two byte array +/// +/// See [`ScalarBuffer::ptr_eq`] +fn bytes_ptr_eq(a: &dyn Array, b: &dyn Array) -> bool { + match (a.as_bytes_opt::(), b.as_bytes_opt::()) { + (Some(a), Some(b)) => { + let values_eq = + a.values().ptr_eq(b.values()) && a.offsets().ptr_eq(b.offsets()); + match (a.nulls(), b.nulls()) { + (Some(a), Some(b)) => values_eq && a.inner().ptr_eq(b.inner()), + (None, None) => values_eq, + _ => false, + } + } + _ => false, + } +} + +/// A type-erased function that compares two array for pointer equality +type PtrEq = dyn Fn(&dyn Array, &dyn Array) -> bool; + +/// A weak heuristic of whether to merge dictionary values that aims to only +/// perform the expensive merge computation when it is likely to yield at least +/// some return over the naive approach used by MutableArrayData +/// +/// `len` is the total length of the merged output +pub fn should_merge_dictionary_values( + dictionaries: &[&DictionaryArray], + len: usize, +) -> bool { + use DataType::*; + let first_values = dictionaries[0].values().as_ref(); + let ptr_eq: Box = match first_values.data_type() { + Utf8 => Box::new(bytes_ptr_eq::), + LargeUtf8 => Box::new(bytes_ptr_eq::), + Binary => Box::new(bytes_ptr_eq::), + LargeBinary => Box::new(bytes_ptr_eq::), + _ => return false, + }; + + let mut single_dictionary = true; + let mut total_values = first_values.len(); + for dict in dictionaries.iter().skip(1) { + let values = dict.values().as_ref(); + total_values += values.len(); + if single_dictionary { + single_dictionary = ptr_eq(first_values, values) + } + } + + let overflow = 
K::Native::from_usize(total_values).is_none(); + let values_exceed_length = total_values >= len; + + !single_dictionary && (overflow || values_exceed_length) +} + +/// Given an array of dictionaries and an optional key mask compute a values array +/// containing referenced values, along with mappings from the [`DictionaryArray`] +/// keys to the new keys within this values array. Best-effort will be made to ensure +/// that the dictionary values are unique +/// +/// This method is meant to be very fast and the output dictionary values +/// may not be unique, unlike `GenericByteDictionaryBuilder` which is slower +/// but produces unique values +pub fn merge_dictionary_values( + dictionaries: &[&DictionaryArray], + masks: Option<&[BooleanBuffer]>, +) -> Result, ArrowError> { + let mut num_values = 0; + + let mut values = Vec::with_capacity(dictionaries.len()); + let mut value_slices = Vec::with_capacity(dictionaries.len()); + + for (idx, dictionary) in dictionaries.iter().enumerate() { + let mask = masks.and_then(|m| m.get(idx)); + let key_mask = match (dictionary.logical_nulls(), mask) { + (Some(n), None) => Some(n.into_inner()), + (None, Some(n)) => Some(n.clone()), + (Some(n), Some(m)) => Some(n.inner() & m), + (None, None) => None, + }; + let keys = dictionary.keys().values(); + let values_mask = compute_values_mask(keys, key_mask.as_ref()); + let v = dictionary.values().as_ref(); + num_values += v.len(); + value_slices.push(get_masked_values(v, &values_mask)); + values.push(v) + } + + // Map from value to new index + let mut interner = Interner::new(num_values); + // Interleave indices for new values array + let mut indices = Vec::with_capacity(num_values); + + // Compute the mapping for each dictionary + let key_mappings = dictionaries + .iter() + .enumerate() + .zip(value_slices) + .map(|((dictionary_idx, dictionary), values)| { + let zero = K::Native::from_usize(0).unwrap(); + let mut mapping = vec![zero; dictionary.values().len()]; + + for (value_idx, value) in values { + mapping[value_idx] = *interner.intern(value, || { + match K::Native::from_usize(indices.len()) { + Some(idx) => { + indices.push((dictionary_idx, value_idx)); + Ok(idx) + } + None => Err(ArrowError::DictionaryKeyOverflowError), + } + })?; + } + Ok(mapping) + }) + .collect::, ArrowError>>()?; + + Ok(MergedDictionaries { + key_mappings, + values: interleave(&values, &indices)?, + }) +} + +/// Return a mask identifying the values that are referenced by keys in `dictionary` +/// at the positions indicated by `selection` +fn compute_values_mask( + keys: &ScalarBuffer, + mask: Option<&BooleanBuffer>, +) -> BooleanBuffer { + let mut builder = BooleanBufferBuilder::new(keys.len()); + builder.advance(keys.len()); + + match mask { + Some(n) => n + .set_indices() + .for_each(|idx| builder.set_bit(keys[idx].as_usize(), true)), + None => keys + .iter() + .for_each(|k| builder.set_bit(k.as_usize(), true)), + } + builder.finish() +} + +/// Return a Vec containing for each set index in `mask`, the index and byte value of that index +fn get_masked_values<'a>( + array: &'a dyn Array, + mask: &BooleanBuffer, +) -> Vec<(usize, &'a [u8])> { + match array.data_type() { + DataType::Utf8 => masked_bytes(array.as_string::(), mask), + DataType::LargeUtf8 => masked_bytes(array.as_string::(), mask), + DataType::Binary => masked_bytes(array.as_binary::(), mask), + DataType::LargeBinary => masked_bytes(array.as_binary::(), mask), + _ => unimplemented!(), + } +} + +/// Compute [`get_masked_values`] for a [`GenericByteArray`] +/// +/// Note: this 
does not check the null mask and will return values contained in null slots +fn masked_bytes<'a, T: ByteArrayType>( + array: &'a GenericByteArray, + mask: &BooleanBuffer, +) -> Vec<(usize, &'a [u8])> { + let mut out = Vec::with_capacity(mask.count_set_bits()); + for idx in mask.set_indices() { + out.push((idx, array.value(idx).as_ref())) + } + out +} + +#[cfg(test)] +mod tests { + use crate::dictionary::merge_dictionary_values; + use arrow_array::cast::as_string_array; + use arrow_array::types::Int32Type; + use arrow_array::{DictionaryArray, Int32Array, StringArray}; + use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, OffsetBuffer}; + use std::sync::Arc; + + #[test] + fn test_merge_strings() { + let a = + DictionaryArray::::from_iter(["a", "b", "a", "b", "d", "c", "e"]); + let b = DictionaryArray::::from_iter(["c", "f", "c", "d", "a", "d"]); + let merged = merge_dictionary_values(&[&a, &b], None).unwrap(); + + let values = as_string_array(merged.values.as_ref()); + let actual: Vec<_> = values.iter().map(Option::unwrap).collect(); + assert_eq!(&actual, &["a", "b", "d", "c", "e", "f"]); + + assert_eq!(merged.key_mappings.len(), 2); + assert_eq!(&merged.key_mappings[0], &[0, 1, 2, 3, 4]); + assert_eq!(&merged.key_mappings[1], &[3, 5, 2, 0]); + + let a_slice = a.slice(1, 4); + let merged = merge_dictionary_values(&[&a_slice, &b], None).unwrap(); + + let values = as_string_array(merged.values.as_ref()); + let actual: Vec<_> = values.iter().map(Option::unwrap).collect(); + assert_eq!(&actual, &["a", "b", "d", "c", "f"]); + + assert_eq!(merged.key_mappings.len(), 2); + assert_eq!(&merged.key_mappings[0], &[0, 1, 2, 0, 0]); + assert_eq!(&merged.key_mappings[1], &[3, 4, 2, 0]); + + // Mask out only ["b", "b", "d"] from a + let a_mask = + BooleanBuffer::from_iter([false, true, false, true, true, false, false]); + let b_mask = BooleanBuffer::new_set(b.len()); + let merged = merge_dictionary_values(&[&a, &b], Some(&[a_mask, b_mask])).unwrap(); + + let values = as_string_array(merged.values.as_ref()); + let actual: Vec<_> = values.iter().map(Option::unwrap).collect(); + assert_eq!(&actual, &["b", "d", "c", "f", "a"]); + + assert_eq!(merged.key_mappings.len(), 2); + assert_eq!(&merged.key_mappings[0], &[0, 0, 1, 0, 0]); + assert_eq!(&merged.key_mappings[1], &[2, 3, 1, 4]); + } + + #[test] + fn test_merge_nulls() { + let buffer = Buffer::from("helloworldbingohelloworld"); + let offsets = OffsetBuffer::from_lengths([5, 5, 5, 5, 5]); + let nulls = NullBuffer::from(vec![true, false, true, true, true]); + let values = StringArray::new(offsets, buffer, Some(nulls)); + + let key_values = vec![1, 2, 3, 1, 8, 2, 3]; + let key_nulls = + NullBuffer::from(vec![true, true, false, true, false, true, true]); + let keys = Int32Array::new(key_values.into(), Some(key_nulls)); + let a = DictionaryArray::new(keys, Arc::new(values)); + // [NULL, "bingo", NULL, NULL, NULL, "bingo", "hello"] + + let b = DictionaryArray::new( + Int32Array::new_null(10), + Arc::new(StringArray::new_null(0)), + ); + + let merged = merge_dictionary_values(&[&a, &b], None).unwrap(); + let expected = StringArray::from(vec!["bingo", "hello"]); + assert_eq!(merged.values.as_ref(), &expected); + assert_eq!(merged.key_mappings.len(), 2); + assert_eq!(&merged.key_mappings[0], &[0, 0, 0, 1, 0]); + assert_eq!(&merged.key_mappings[1], &[]); + } +} diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs index c0d2026808af..a0f41666513b 100644 --- a/arrow-select/src/interleave.rs +++ b/arrow-select/src/interleave.rs @@ -15,12 +15,15 
@@ // specific language governing permissions and limitations // under the License. -use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; +use crate::dictionary::{merge_dictionary_values, should_merge_dictionary_values}; +use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder, PrimitiveBuilder}; +use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; +use arrow_buffer::{ + ArrowNativeType, MutableBuffer, NullBuffer, NullBufferBuilder, OffsetBuffer, +}; use arrow_data::transform::MutableArrayData; -use arrow_data::ArrayDataBuilder; use arrow_schema::{ArrowError, DataType}; use std::sync::Arc; @@ -30,6 +33,12 @@ macro_rules! primitive_helper { }; } +macro_rules! dict_helper { + ($t:ty, $values:expr, $indices:expr) => { + Ok(Arc::new(interleave_dictionaries::<$t>($values, $indices)?) as _) + }; +} + /// /// Takes elements by index from a list of [`Array`], creating a new [`Array`] from those values. /// @@ -87,6 +96,10 @@ pub fn interleave( DataType::LargeUtf8 => interleave_bytes::(values, indices), DataType::Binary => interleave_bytes::(values, indices), DataType::LargeBinary => interleave_bytes::(values, indices), + DataType::Dictionary(k, _) => downcast_integer! { + k.as_ref() => (dict_helper, values, indices), + _ => unreachable!("illegal dictionary key type {k}") + }, _ => interleave_fallback(values, indices) } } @@ -97,10 +110,8 @@ pub fn interleave( struct Interleave<'a, T> { /// The input arrays downcast to T arrays: Vec<&'a T>, - /// The number of nulls in the interleaved output - null_count: usize, /// The null buffer of the interleaved output - nulls: Option, + nulls: Option, } impl<'a, T: Array + 'static> Interleave<'a, T> { @@ -114,22 +125,19 @@ impl<'a, T: Array + 'static> Interleave<'a, T> { }) .collect(); - let mut null_count = 0; - let nulls = has_nulls.then(|| { - let mut builder = BooleanBufferBuilder::new(indices.len()); - for (a, b) in indices { - let v = arrays[*a].is_valid(*b); - null_count += !v as usize; - builder.append(v) + let nulls = match has_nulls { + true => { + let mut builder = NullBufferBuilder::new(indices.len()); + for (a, b) in indices { + let v = arrays[*a].is_valid(*b); + builder.append(v) + } + builder.finish() } - builder.into() - }); + false => None, + }; - Self { - arrays, - null_count, - nulls, - } + Self { arrays, nulls } } } @@ -140,20 +148,14 @@ fn interleave_primitive( ) -> Result { let interleaved = Interleave::<'_, PrimitiveArray>::new(values, indices); - let mut values = BufferBuilder::::new(indices.len()); + let mut values = Vec::with_capacity(indices.len()); for (a, b) in indices { let v = interleaved.arrays[*a].value(*b); - values.append(v) + values.push(v) } - let builder = ArrayDataBuilder::new(data_type.clone()) - .len(indices.len()) - .add_buffer(values.finish()) - .null_bit_buffer(interleaved.nulls) - .null_count(interleaved.null_count); - - let data = unsafe { builder.build_unchecked() }; - Ok(Arc::new(PrimitiveArray::::from(data))) + let array = PrimitiveArray::::new(values.into(), interleaved.nulls); + Ok(Arc::new(array.with_data_type(data_type.clone()))) } fn interleave_bytes( @@ -177,15 +179,55 @@ fn interleave_bytes( values.extend_from_slice(interleaved.arrays[*a].value(*b).as_ref()); } - let builder = ArrayDataBuilder::new(T::DATA_TYPE) - .len(indices.len()) - .add_buffer(offsets.finish()) - .add_buffer(values.into()) - .null_bit_buffer(interleaved.nulls) - .null_count(interleaved.null_count); + // Safety: safe by 
construction + let array = unsafe { + let offsets = OffsetBuffer::new_unchecked(offsets.finish().into()); + GenericByteArray::::new_unchecked(offsets, values.into(), interleaved.nulls) + }; + Ok(Arc::new(array)) +} + +fn interleave_dictionaries( + arrays: &[&dyn Array], + indices: &[(usize, usize)], +) -> Result { + let dictionaries: Vec<_> = arrays.iter().map(|x| x.as_dictionary::()).collect(); + if !should_merge_dictionary_values::(&dictionaries, indices.len()) { + return interleave_fallback(arrays, indices); + } + + let masks: Vec<_> = dictionaries + .iter() + .enumerate() + .map(|(a_idx, dictionary)| { + let mut key_mask = BooleanBufferBuilder::new_from_buffer( + MutableBuffer::new_null(dictionary.len()), + dictionary.len(), + ); + + for (_, key_idx) in indices.iter().filter(|(a, _)| *a == a_idx) { + key_mask.set_bit(*key_idx, true); + } + key_mask.finish() + }) + .collect(); + + let merged = merge_dictionary_values(&dictionaries, Some(&masks))?; - let data = unsafe { builder.build_unchecked() }; - Ok(Arc::new(GenericByteArray::::from(data))) + // Recompute keys + let mut keys = PrimitiveBuilder::::with_capacity(indices.len()); + for (a, b) in indices { + let old_keys: &PrimitiveArray = dictionaries[*a].keys(); + match old_keys.is_valid(*b) { + true => { + let old_key = old_keys.values()[*b]; + keys.append_value(merged.key_mappings[*a][old_key.as_usize()]) + } + false => keys.append_null(), + } + } + let array = unsafe { DictionaryArray::new_unchecked(keys.finish(), merged.values) }; + Ok(Arc::new(array)) } /// Fallback implementation of interleave using [`MutableArrayData`] @@ -280,6 +322,32 @@ mod tests { ) } + #[test] + fn test_interleave_dictionary() { + let a = DictionaryArray::::from_iter(["a", "b", "c", "a", "b"]); + let b = DictionaryArray::::from_iter(["a", "c", "a", "c", "a"]); + + // Should not recompute dictionary + let values = + interleave(&[&a, &b], &[(0, 2), (0, 2), (0, 2), (1, 0), (1, 1), (0, 1)]) + .unwrap(); + let v = values.as_dictionary::(); + assert_eq!(v.values().len(), 5); + + let vc = v.downcast_dict::().unwrap(); + let collected: Vec<_> = vc.into_iter().map(Option::unwrap).collect(); + assert_eq!(&collected, &["c", "c", "c", "a", "c", "b"]); + + // Should recompute dictionary + let values = interleave(&[&a, &b], &[(0, 2), (0, 2), (1, 1)]).unwrap(); + let v = values.as_dictionary::(); + assert_eq!(v.values().len(), 1); + + let vc = v.downcast_dict::().unwrap(); + let collected: Vec<_> = vc.into_iter().map(Option::unwrap).collect(); + assert_eq!(&collected, &["c", "c", "c"]); + } + #[test] fn test_lists() { // [[1, 2], null, [3]] @@ -323,4 +391,25 @@ mod tests { assert_eq!(v, &expected); } + + #[test] + fn interleave_sparse_nulls() { + let values = StringArray::from_iter_values((0..100).map(|x| x.to_string())); + let keys = Int32Array::from_iter_values(0..10); + let dict_a = DictionaryArray::new(keys, Arc::new(values)); + let values = StringArray::new_null(0); + let keys = Int32Array::new_null(10); + let dict_b = DictionaryArray::new(keys, Arc::new(values)); + + let indices = &[(0, 0), (0, 1), (0, 2), (1, 0)]; + let array = interleave(&[&dict_a, &dict_b], indices).unwrap(); + + let expected = DictionaryArray::::from_iter(vec![ + Some("0"), + Some("1"), + Some("2"), + None, + ]); + assert_eq!(array.as_ref(), &expected) + } } diff --git a/arrow-select/src/lib.rs b/arrow-select/src/lib.rs index c468e20a511e..82f57a6af42b 100644 --- a/arrow-select/src/lib.rs +++ b/arrow-select/src/lib.rs @@ -18,6 +18,7 @@ //! 
Arrow selection kernels pub mod concat; +mod dictionary; pub mod filter; pub mod interleave; pub mod nullif; diff --git a/arrow/benches/concatenate_kernel.rs b/arrow/benches/concatenate_kernel.rs index 3fff2abd179c..2f5b654394e4 100644 --- a/arrow/benches/concatenate_kernel.rs +++ b/arrow/benches/concatenate_kernel.rs @@ -60,6 +60,28 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function("concat str nulls 1024", |b| { b.iter(|| bench_concat(&v1, &v2)) }); + + let v1 = create_string_array_with_len::(10, 0.0, 20); + let v1 = create_dict_from_values::(1024, 0.0, &v1); + let v2 = create_string_array_with_len::(10, 0.0, 20); + let v2 = create_dict_from_values::(1024, 0.0, &v2); + c.bench_function("concat str_dict 1024", |b| { + b.iter(|| bench_concat(&v1, &v2)) + }); + + let v1 = create_string_array_with_len::(1024, 0.0, 20); + let v1 = create_sparse_dict_from_values::(1024, 0.0, &v1, 10..20); + let v2 = create_string_array_with_len::(1024, 0.0, 20); + let v2 = create_sparse_dict_from_values::(1024, 0.0, &v2, 30..40); + c.bench_function("concat str_dict_sparse 1024", |b| { + b.iter(|| bench_concat(&v1, &v2)) + }); + + let v1 = create_string_array::(1024, 0.5); + let v2 = create_string_array::(1024, 0.5); + c.bench_function("concat str nulls 1024", |b| { + b.iter(|| bench_concat(&v1, &v2)) + }); } criterion_group!(benches, add_benchmark); diff --git a/arrow/benches/interleave_kernels.rs b/arrow/benches/interleave_kernels.rs index 2bb430e40b0f..454d9140809c 100644 --- a/arrow/benches/interleave_kernels.rs +++ b/arrow/benches/interleave_kernels.rs @@ -37,14 +37,21 @@ fn do_bench( base: &dyn Array, slices: &[Range], ) { - let mut rng = seedable_rng(); - let arrays: Vec<_> = slices .iter() .map(|r| base.slice(r.start, r.end - r.start)) .collect(); let values: Vec<_> = arrays.iter().map(|x| x.as_ref()).collect(); + bench_values( + c, + &format!("interleave {prefix} {len} {slices:?}"), + len, + &values, + ); +} +fn bench_values(c: &mut Criterion, name: &str, len: usize, values: &[&dyn Array]) { + let mut rng = seedable_rng(); let indices: Vec<_> = (0..len) .map(|_| { let array_idx = rng.gen_range(0..values.len()); @@ -53,8 +60,8 @@ fn do_bench( }) .collect(); - c.bench_function(&format!("interleave {prefix} {len} {slices:?}"), |b| { - b.iter(|| criterion::black_box(interleave(&values, &indices).unwrap())) + c.bench_function(name, |b| { + b.iter(|| criterion::black_box(interleave(values, &indices).unwrap())) }); } @@ -63,12 +70,20 @@ fn add_benchmark(c: &mut Criterion) { let i32_opt = create_primitive_array::(1024, 0.5); let string = create_string_array_with_len::(1024, 0., 20); let string_opt = create_string_array_with_len::(1024, 0.5, 20); + let values = create_string_array_with_len::(10, 0.0, 20); + let dict = create_dict_from_values::(1024, 0.0, &values); + + let values = create_string_array_with_len::(1024, 0.0, 20); + let sparse_dict = + create_sparse_dict_from_values::(1024, 0.0, &values, 10..20); let cases: &[(&str, &dyn Array)] = &[ ("i32(0.0)", &i32), ("i32(0.5)", &i32_opt), ("str(20, 0.0)", &string), ("str(20, 0.5)", &string_opt), + ("dict(20, 0.0)", &dict), + ("dict_sparse(20, 0.0)", &sparse_dict), ]; for (prefix, base) in cases { @@ -83,6 +98,15 @@ fn add_benchmark(c: &mut Criterion) { do_bench(c, prefix, *len, *base, slice); } } + + for len in [100, 1024, 2048] { + bench_values( + c, + &format!("interleave dict_distinct {len}"), + 100, + &[&dict, &sparse_dict], + ); + } } criterion_group!(benches, add_benchmark); diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs 
index 9bdc24783736..5e5f4c6ee118 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -29,6 +29,7 @@ use rand::{ distributions::{Alphanumeric, Distribution, Standard}, prelude::StdRng, }; +use std::ops::Range; /// Creates an random (but fixed-seeded) array of a given size and null density pub fn create_primitive_array(size: usize, null_density: f32) -> PrimitiveArray @@ -268,6 +269,24 @@ pub fn create_dict_from_values( null_density: f32, values: &dyn Array, ) -> DictionaryArray +where + K: ArrowDictionaryKeyType, + Standard: Distribution, + K::Native: SampleUniform, +{ + let min_key = K::Native::from_usize(0).unwrap(); + let max_key = K::Native::from_usize(values.len()).unwrap(); + create_sparse_dict_from_values(size, null_density, values, min_key..max_key) +} + +/// Creates a random (but fixed-seeded) dictionary array of a given size and null density +/// with the provided values array and key range +pub fn create_sparse_dict_from_values( + size: usize, + null_density: f32, + values: &dyn Array, + key_range: Range, +) -> DictionaryArray where K: ArrowDictionaryKeyType, Standard: Distribution, @@ -279,9 +298,9 @@ where Box::new(values.data_type().clone()), ); - let min_key = K::Native::from_usize(0).unwrap(); - let max_key = K::Native::from_usize(values.len()).unwrap(); - let keys: Buffer = (0..size).map(|_| rng.gen_range(min_key..max_key)).collect(); + let keys: Buffer = (0..size) + .map(|_| rng.gen_range(key_range.clone())) + .collect(); let nulls: Option = (null_density != 0.) .then(|| (0..size).map(|_| rng.gen_bool(null_density as _)).collect()); From 0847c9aa8b93e134e5b8123cd11a3129594bb82a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 7 Sep 2023 12:19:03 +0100 Subject: [PATCH 1196/1411] Update object_store chrono deprecations (#4786) --- object_store/src/util.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/object_store/src/util.rs b/object_store/src/util.rs index 07d3ed44ca16..25b0fc343d31 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -32,8 +32,9 @@ where D: serde::Deserializer<'de>, { let s: String = serde::Deserialize::deserialize(deserializer)?; - chrono::TimeZone::datetime_from_str(&chrono::Utc, &s, RFC1123_FMT) - .map_err(serde::de::Error::custom) + let naive = chrono::NaiveDateTime::parse_from_str(&s, RFC1123_FMT) + .map_err(serde::de::Error::custom)?; + Ok(chrono::TimeZone::from_utc_datetime(&chrono::Utc, &naive)) } #[cfg(any(feature = "aws", feature = "azure"))] From 6fdbc263b08fea0858b662e8d792f88268d9e741 Mon Sep 17 00:00:00 2001 From: Valery Meleshkin Date: Thu, 7 Sep 2023 13:59:59 +0200 Subject: [PATCH 1197/1411] Make coalesce_ranges and collect_bytes available to the users (#4784) of the object_store crate. 
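Editorial illustration (not part of this patch): with these re-exports a downstream crate can drive coalesced range reads itself. The helper below is a hypothetical sketch against the `coalesce_ranges` signature shown in this series; the blob-backed fetch closure stands in for a real storage request.

    use bytes::Bytes;
    use object_store::{coalesce_ranges, OBJECT_STORE_COALESCE_DEFAULT};

    /// Hypothetical helper: serve several byte ranges of an in-memory blob,
    /// letting `coalesce_ranges` merge nearby ranges into fewer fetches.
    async fn fetch_ranges(
        blob: Bytes,
        ranges: &[std::ops::Range<usize>],
    ) -> object_store::Result<Vec<Bytes>> {
        coalesce_ranges(
            ranges,
            |range| {
                let blob = blob.clone();
                // A real implementation would issue a storage request here
                async move { Ok(blob.slice(range)) }
            },
            OBJECT_STORE_COALESCE_DEFAULT,
        )
        .await
    }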
--- object_store/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index d1ee83b64d7b..413b40039553 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -282,7 +282,7 @@ pub use parse::{parse_url, parse_url_opts}; use crate::path::Path; #[cfg(not(target_arch = "wasm32"))] use crate::util::maybe_spawn_blocking; -use crate::util::{coalesce_ranges, collect_bytes, OBJECT_STORE_COALESCE_DEFAULT}; +pub use crate::util::{coalesce_ranges, collect_bytes, OBJECT_STORE_COALESCE_DEFAULT}; use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; From 83390ed13afe03d03a94b6e97e701a6971811204 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 7 Sep 2023 14:45:33 +0100 Subject: [PATCH 1198/1411] Fix DictionaryArray::normalized_keys (#4788) (#4789) --- arrow-array/src/array/dictionary_array.rs | 11 ++++++++++- arrow-ord/src/cmp.rs | 14 +++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 5896cf02dfaa..0cb00878929c 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -1005,7 +1005,7 @@ impl AnyDictionaryArray for DictionaryArray { let v_len = self.values().len(); assert_ne!(v_len, 0); let iter = self.keys().values().iter(); - iter.map(|x| x.as_usize().min(v_len)).collect() + iter.map(|x| x.as_usize().min(v_len - 1)).collect() } fn with_values(&self, values: ArrayRef) -> ArrayRef { @@ -1385,4 +1385,13 @@ mod tests { .collect(); assert_eq!(values, &[Some(50), None, None, Some(2)]) } + + #[test] + fn test_normalized_keys() { + let values = vec![132, 0, 1].into(); + let nulls = NullBuffer::from(vec![false, true, true]); + let keys = Int32Array::new(values, Some(nulls)); + let dictionary = DictionaryArray::new(keys, Arc::new(Int32Array::new_null(2))); + assert_eq!(&dictionary.normalized_keys(), &[1, 0, 1]) + } } diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index 96f5aafd8697..feb168335568 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -555,7 +555,7 @@ impl<'a> ArrayOrd for &'a FixedSizeBinaryArray { mod tests { use std::sync::Arc; - use arrow_array::{DictionaryArray, Int32Array, Scalar}; + use arrow_array::{DictionaryArray, Int32Array, Scalar, StringArray}; use super::*; @@ -708,4 +708,16 @@ mod tests { let r = eq(&b, &a).unwrap(); assert_eq!(r.len(), 0); } + + #[test] + fn test_dictionary_nulls() { + let values = StringArray::from(vec![Some("us-west"), Some("us-east")]); + let nulls = NullBuffer::from(vec![false, true, true]); + + let key_values = vec![100i32, 1i32, 0i32].into(); + let keys = Int32Array::new(key_values, Some(nulls)); + let col = DictionaryArray::try_new(keys, Arc::new(values)).unwrap(); + + neq(&col.slice(0, col.len() - 1), &col.slice(1, col.len() - 1)).unwrap(); + } } From 2fe71ca52da53586282f6a24d9290ffca776025f Mon Sep 17 00:00:00 2001 From: Yuri Kotov Date: Thu, 7 Sep 2023 20:47:11 +0700 Subject: [PATCH 1199/1411] Allow custom tree builder for parquet::record::RowIter (#4783) * Allow custom tree builder for parquet::record::RowIter It will allow to read parquet with custom batch_size. 
Currently the only possible batch_size for parquet::record::RowIter is 1024 * Change with_tree_builder to with_batch_size to be more future-proof --------- Co-authored-by: Yuri Kotov --- parquet/src/record/reader.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index 1069eab15f23..2a9b6dbb0bed 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -747,6 +747,12 @@ impl<'a> RowIter<'a> { } } + /// Sets batch size for this row iter. + pub fn with_batch_size(mut self, batch_size: usize) -> Self { + self.tree_builder = self.tree_builder.with_batch_size(batch_size); + self + } + /// Returns common tree builder, so the same settings are applied to both iterators /// from file reader and row group. #[inline] From dd0c4ab980a1dee1099ddb44b6757439faaf6b26 Mon Sep 17 00:00:00 2001 From: Valery Meleshkin Date: Thu, 7 Sep 2023 16:19:06 +0200 Subject: [PATCH 1200/1411] Relaxing type bounds on coalesce_ranges and collect_bytes (#4787) to allow using them with a wider range of Error types. --- object_store/src/util.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/object_store/src/util.rs b/object_store/src/util.rs index 25b0fc343d31..764582a67f95 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -47,9 +47,13 @@ pub(crate) fn hmac_sha256( } /// Collect a stream into [`Bytes`] avoiding copying in the event of a single chunk -pub async fn collect_bytes(mut stream: S, size_hint: Option) -> Result +pub async fn collect_bytes( + mut stream: S, + size_hint: Option, +) -> Result where - S: Stream> + Send + Unpin, + E: Send, + S: Stream> + Send + Unpin, { let first = stream.next().await.transpose()?.unwrap_or_default(); @@ -99,14 +103,15 @@ pub const OBJECT_STORE_COALESCE_PARALLEL: usize = 10; /// * Combine ranges less than `coalesce` bytes apart into a single call to `fetch` /// * Make multiple `fetch` requests in parallel (up to maximum of 10) /// -pub async fn coalesce_ranges( +pub async fn coalesce_ranges( ranges: &[std::ops::Range], fetch: F, coalesce: usize, -) -> Result> +) -> Result, E> where F: Send + FnMut(std::ops::Range) -> Fut, - Fut: std::future::Future> + Send, + E: Send, + Fut: std::future::Future> + Send, { let fetch_ranges = merge_ranges(ranges, coalesce); @@ -173,6 +178,8 @@ fn merge_ranges( #[cfg(test)] mod tests { + use crate::Error; + use super::*; use rand::{thread_rng, Rng}; use std::ops::Range; @@ -185,7 +192,7 @@ mod tests { let src: Vec<_> = (0..max).map(|x| x as u8).collect(); let mut fetches = vec![]; - let coalesced = coalesce_ranges( + let coalesced = coalesce_ranges::<_, Error, _>( &ranges, |range| { fetches.push(range.clone()); From 15dde87d4ecefadedb22a1df1aeebc0a09cfab0e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 7 Sep 2023 17:00:39 +0100 Subject: [PATCH 1201/1411] Re-export array crate root (#4780) (#4779) (#4791) * Re-export array crate root (#4780) (#4779) * Clippy --- arrow/examples/dynamic_types.rs | 1 - arrow/src/array/mod.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arrow/examples/dynamic_types.rs b/arrow/examples/dynamic_types.rs index 8ec473c76d56..21edb235aaa7 100644 --- a/arrow/examples/dynamic_types.rs +++ b/arrow/examples/dynamic_types.rs @@ -23,7 +23,6 @@ extern crate arrow; use arrow::array::*; use arrow::datatypes::*; use arrow::error::Result; -use arrow::record_batch::*; #[cfg(feature = "prettyprint")] use 
arrow::util::pretty::print_batches; diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index ff3a170c698a..fa01f4c4c15b 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -23,10 +23,10 @@ mod ffi; // --------------------- Array & ArrayData --------------------- -pub use arrow_array::array::*; pub use arrow_array::builder::*; pub use arrow_array::cast::*; pub use arrow_array::iterator::*; +pub use arrow_array::*; pub use arrow_data::{ layout, ArrayData, ArrayDataBuilder, ArrayDataRef, BufferSpec, DataTypeLayout, }; From 1e46f8f08a9aa735fe581602aff66f5cc0c40b05 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 8 Sep 2023 08:43:38 +0100 Subject: [PATCH 1202/1411] Best effort cleanup of staged upload files (#4778) (#4792) * Best effort cleanup of staged upload files (#4778) * Clippy * Fix MSRV --- object_store/src/local.rs | 142 ++++++++++++++++++++++---------------- 1 file changed, 81 insertions(+), 61 deletions(-) diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 495bb4f9c4aa..20eb3c63ccbd 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -26,6 +26,7 @@ use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::future::BoxFuture; +use futures::ready; use futures::{stream::BoxStream, StreamExt}; use futures::{FutureExt, TryStreamExt}; use snafu::{ensure, ResultExt, Snafu}; @@ -274,13 +275,15 @@ impl ObjectStore for LocalFileSystem { maybe_spawn_blocking(move || { let (mut file, suffix) = new_staged_upload(&path)?; let staging_path = staged_upload_path(&path, &suffix); - file.write_all(&bytes) - .context(UnableToCopyDataToFileSnafu)?; - - std::fs::rename(staging_path, path).context(UnableToRenameFileSnafu)?; - - Ok(()) + .context(UnableToCopyDataToFileSnafu) + .and_then(|_| { + std::fs::rename(&staging_path, &path).context(UnableToRenameFileSnafu) + }) + .map_err(|e| { + let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup + e.into() + }) }) .await } @@ -304,12 +307,14 @@ impl ObjectStore for LocalFileSystem { multipart_id: &MultipartId, ) -> Result<()> { let dest = self.config.path_to_filesystem(location)?; - let staging_path: PathBuf = staged_upload_path(&dest, multipart_id); + let path: PathBuf = staged_upload_path(&dest, multipart_id); - maybe_spawn_blocking(move || { - std::fs::remove_file(&staging_path) - .context(UnableToDeleteFileSnafu { path: staging_path })?; - Ok(()) + maybe_spawn_blocking(move || match std::fs::remove_file(&path) { + Ok(_) => Ok(()), + Err(source) => match source.kind() { + ErrorKind::NotFound => Ok(()), // Already deleted + _ => Err(Error::UnableToDeleteFile { path, source }.into()), + }, }) .await } @@ -318,7 +323,6 @@ impl ObjectStore for LocalFileSystem { &self, location: &Path, ) -> Result> { - #[cfg(not(target_arch = "wasm32"))] // Get the path to the file from the configuration. 
let path = self.config.path_to_filesystem(location)?; loop { @@ -358,8 +362,6 @@ impl ObjectStore for LocalFileSystem { } } } - #[cfg(target_arch = "wasm32")] - Err(super::Error::NotImplemented) } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { @@ -597,8 +599,9 @@ impl ObjectStore for LocalFileSystem { match std::fs::hard_link(&from, &staged) { Ok(_) => { return std::fs::rename(&staged, &to).map_err(|source| { + let _ = std::fs::remove_file(&staged); // Attempt to clean up Error::UnableToCopyFile { from, to, source }.into() - }) + }); } Err(source) => match source.kind() { ErrorKind::AlreadyExists => id += 1, @@ -690,12 +693,9 @@ fn staged_upload_path(dest: &std::path::Path, suffix: &str) -> PathBuf { enum LocalUploadState { /// Upload is ready to send new data - Idle(Arc), + Idle(Arc), /// In the middle of a write - Writing( - Arc, - BoxFuture<'static, Result>, - ), + Writing(Arc, BoxFuture<'static, Result>), /// In the middle of syncing data and closing file. /// /// Future will contain last reference to file, so it will call drop on completion. @@ -713,11 +713,7 @@ struct LocalUpload { } impl LocalUpload { - pub fn new( - dest: PathBuf, - multipart_id: MultipartId, - file: Arc, - ) -> Self { + pub fn new(dest: PathBuf, multipart_id: MultipartId, file: Arc) -> Self { Self { inner_state: LocalUploadState::Idle(file), dest, @@ -731,14 +727,13 @@ impl AsyncWrite for LocalUpload { mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, buf: &[u8], - ) -> std::task::Poll> { - let invalid_state = - |condition: &str| -> std::task::Poll> { - Poll::Ready(Err(io::Error::new( - io::ErrorKind::InvalidInput, - format!("Tried to write to file {condition}."), - ))) - }; + ) -> Poll> { + let invalid_state = |condition: &str| -> Poll> { + Poll::Ready(Err(io::Error::new( + ErrorKind::InvalidInput, + format!("Tried to write to file {condition}."), + ))) + }; if let Ok(runtime) = tokio::runtime::Handle::try_current() { let mut data: Vec = buf.to_vec(); @@ -757,7 +752,7 @@ impl AsyncWrite for LocalUpload { .spawn_blocking(move || (&*file2).write_all(&data)) .map(move |res| match res { Err(err) => { - Err(io::Error::new(io::ErrorKind::Other, err)) + Err(io::Error::new(ErrorKind::Other, err)) } Ok(res) => res.map(move |_| data_len), }), @@ -765,16 +760,9 @@ impl AsyncWrite for LocalUpload { ); } LocalUploadState::Writing(file, inner_write) => { - match inner_write.poll_unpin(cx) { - Poll::Ready(res) => { - self.inner_state = - LocalUploadState::Idle(Arc::clone(file)); - return Poll::Ready(res); - } - Poll::Pending => { - return Poll::Pending; - } - } + let res = ready!(inner_write.poll_unpin(cx)); + self.inner_state = LocalUploadState::Idle(Arc::clone(file)); + return Poll::Ready(res); } LocalUploadState::ShuttingDown(_) => { return invalid_state("when writer is shutting down"); @@ -800,14 +788,14 @@ impl AsyncWrite for LocalUpload { fn poll_flush( self: Pin<&mut Self>, _cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { Poll::Ready(Ok(())) } fn poll_shutdown( mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { if let Ok(runtime) = tokio::runtime::Handle::try_current() { loop { match &mut self.inner_state { @@ -854,13 +842,11 @@ impl AsyncWrite for LocalUpload { "Tried to commit a file where a write is in progress.", ))); } - LocalUploadState::Committing(fut) => match fut.poll_unpin(cx) { - Poll::Ready(res) => { - self.inner_state = LocalUploadState::Complete; - return Poll::Ready(res); - } - Poll::Pending => 
return Poll::Pending, - }, + LocalUploadState::Committing(fut) => { + let res = ready!(fut.poll_unpin(cx)); + self.inner_state = LocalUploadState::Complete; + return Poll::Ready(res); + } LocalUploadState::Complete => { return Poll::Ready(Err(io::Error::new( io::ErrorKind::Other, @@ -876,22 +862,36 @@ impl AsyncWrite for LocalUpload { let file = Arc::clone(file); self.inner_state = LocalUploadState::Complete; file.sync_all()?; - std::mem::drop(file); + drop(file); std::fs::rename(staging_path, &self.dest)?; Poll::Ready(Ok(())) } _ => { // If we are running on this thread, then only possible states are Idle and Complete. - Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "Already complete", - ))) + Poll::Ready(Err(io::Error::new(ErrorKind::Other, "Already complete"))) } } } } } +impl Drop for LocalUpload { + fn drop(&mut self) { + match self.inner_state { + LocalUploadState::Complete => (), + _ => { + self.inner_state = LocalUploadState::Complete; + let path = staged_upload_path(&self.dest, &self.multipart_id); + // Try to cleanup intermediate file ignoring any error + match tokio::runtime::Handle::try_current() { + Ok(r) => drop(r.spawn_blocking(move || std::fs::remove_file(path))), + Err(_) => drop(std::fs::remove_file(path)), + }; + } + } + } +} + pub(crate) fn chunked_stream( mut file: File, path: PathBuf, @@ -1018,8 +1018,8 @@ fn convert_metadata(metadata: Metadata, location: Path) -> Result { /// Convert walkdir results and converts not-found errors into `None`. /// Convert broken symlinks to `None`. fn convert_walkdir_result( - res: std::result::Result, -) -> Result> { + res: std::result::Result, +) -> Result> { match res { Ok(entry) => { // To check for broken symlink: call symlink_metadata() - it does not traverse symlinks); @@ -1048,7 +1048,7 @@ fn convert_walkdir_result( Err(walkdir_err) => match walkdir_err.io_error() { Some(io_err) => match io_err.kind() { - io::ErrorKind::NotFound => Ok(None), + ErrorKind::NotFound => Ok(None), _ => Err(Error::UnableToWalkDir { source: walkdir_err, } @@ -1476,6 +1476,7 @@ mod not_wasm_tests { use crate::local::LocalFileSystem; use crate::{ObjectStore, Path}; use bytes::Bytes; + use std::time::Duration; use tempfile::TempDir; use tokio::io::AsyncWriteExt; @@ -1560,6 +1561,25 @@ mod not_wasm_tests { let expected_data = Bytes::from("arbitrarydatagnzarbitrarydatagnz"); assert_eq!(&*read_data, expected_data); } + + #[tokio::test] + async fn test_cleanup_intermediate_files() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let location = Path::from("some_file"); + let (_, mut writer) = integration.put_multipart(&location).await.unwrap(); + writer.write_all(b"hello").await.unwrap(); + + let file_count = std::fs::read_dir(root.path()).unwrap().count(); + assert_eq!(file_count, 1); + drop(writer); + + tokio::time::sleep(Duration::from_millis(1)).await; + + let file_count = std::fs::read_dir(root.path()).unwrap().count(); + assert_eq!(file_count, 0); + } } #[cfg(target_family = "unix")] From 878217b9e330b4f1ed13e798a214ea11fbeb2bbb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 8 Sep 2023 06:10:08 -0400 Subject: [PATCH 1203/1411] Add docstring and example to `Scalar` (#4793) --- arrow-array/src/scalar.rs | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/scalar.rs b/arrow-array/src/scalar.rs index 7dfdbddd964a..f2a696a8f329 100644 --- a/arrow-array/src/scalar.rs +++ b/arrow-array/src/scalar.rs @@ -98,9 +98,32 
@@ impl Datum for &dyn Array { } } -/// A wrapper around a single value [`Array`] indicating kernels should treat it as a scalar value +/// A wrapper around a single value [`Array`] that implements +/// [`Datum`] and indicates [compute] kernels should treat this array +/// as a scalar value (a single value). /// -/// See [`Datum`] for more information +/// Using a [`Scalar`] is often much more efficient than creating an +/// [`Array`] with the same (repeated) value. +/// +/// See [`Datum`] for more information. +/// +/// # Example +/// +/// ```rust +/// # use arrow_array::{Scalar, Int32Array, ArrayRef}; +/// # fn get_array() -> ArrayRef { std::sync::Arc::new(Int32Array::from(vec![42])) } +/// // Create a (typed) scalar for Int32Array for the value 42 +/// let scalar = Scalar::new(Int32Array::from(vec![42])); +/// +/// // Create a scalar using PrimtiveArray::scalar +/// let scalar = Int32Array::new_scalar(42); +/// +/// // create a scalar from an ArrayRef (for dynamic typed Arrays) +/// let array: ArrayRef = get_array(); +/// let scalar = Scalar::new(array); +/// ``` +/// +/// [compute]: https://docs.rs/arrow/latest/arrow/compute/index.html #[derive(Debug, Copy, Clone)] pub struct Scalar(T); From b4997bc35cdd41e98b22fcaa2793b796ab88ceb5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 10 Sep 2023 12:44:19 +0100 Subject: [PATCH 1204/1411] Improved csv_reader benchmarks with smaller integers (#4803) --- arrow/benches/csv_reader.rs | 50 +++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/arrow/benches/csv_reader.rs b/arrow/benches/csv_reader.rs index c2491a5a0b04..4c3f663bf741 100644 --- a/arrow/benches/csv_reader.rs +++ b/arrow/benches/csv_reader.rs @@ -18,15 +18,18 @@ extern crate arrow; extern crate criterion; +use std::io::Cursor; +use std::sync::Arc; + use criterion::*; +use rand::Rng; use arrow::array::*; use arrow::csv; use arrow::datatypes::*; use arrow::record_batch::RecordBatch; use arrow::util::bench_util::{create_primitive_array, create_string_array_with_len}; -use std::io::Cursor; -use std::sync::Arc; +use arrow::util::test_util::seedable_rng; fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { let batch = RecordBatch::try_from_iter(cols.into_iter().map(|a| ("col", a))).unwrap(); @@ -55,18 +58,49 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { } fn criterion_benchmark(c: &mut Criterion) { - let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; + let mut rng = seedable_rng(); + + let values = Int32Array::from_iter_values((0..4096).map(|_| rng.gen_range(0..1024))); + let cols = vec![Arc::new(values) as ArrayRef]; + do_bench(c, "4096 i32_small(0)", cols); + + let values = Int32Array::from_iter_values((0..4096).map(|_| rng.gen())); + let cols = vec![Arc::new(values) as ArrayRef]; + do_bench(c, "4096 i32(0)", cols); + + let values = UInt64Array::from_iter_values((0..4096).map(|_| rng.gen_range(0..1024))); + let cols = vec![Arc::new(values) as ArrayRef]; + do_bench(c, "4096 u64_small(0)", cols); + + let values = UInt64Array::from_iter_values((0..4096).map(|_| rng.gen())); + let cols = vec![Arc::new(values) as ArrayRef]; do_bench(c, "4096 u64(0)", cols); - let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; + let values = + Int64Array::from_iter_values((0..4096).map(|_| rng.gen_range(0..1024) - 512)); + let cols = vec![Arc::new(values) as ArrayRef]; + do_bench(c, "4096 i64_small(0)", cols); + + let values = 
Int64Array::from_iter_values((0..4096).map(|_| rng.gen())); + let cols = vec![Arc::new(values) as ArrayRef]; do_bench(c, "4096 i64(0)", cols); - let cols = - vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; + let cols = vec![Arc::new(Float32Array::from_iter_values( + (0..4096).map(|_| rng.gen_range(0..1024000) as f32 / 1000.), + )) as _]; + do_bench(c, "4096 f32_small(0)", cols); + + let values = Float32Array::from_iter_values((0..4096).map(|_| rng.gen())); + let cols = vec![Arc::new(values) as ArrayRef]; do_bench(c, "4096 f32(0)", cols); - let cols = - vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; + let cols = vec![Arc::new(Float64Array::from_iter_values( + (0..4096).map(|_| rng.gen_range(0..1024000) as f64 / 1000.), + )) as _]; + do_bench(c, "4096 f64_small(0)", cols); + + let values = Float64Array::from_iter_values((0..4096).map(|_| rng.gen())); + let cols = vec![Arc::new(values) as ArrayRef]; do_bench(c, "4096 f64(0)", cols); let cols = From 77455d48cd6609045a4728ba908123de9d0b62fd Mon Sep 17 00:00:00 2001 From: Will Jones Date: Sun, 10 Sep 2023 04:45:14 -0700 Subject: [PATCH 1205/1411] fix: entries field is non-nullable (#4808) --- arrow-array/src/array/map_array.rs | 8 ++++---- .../src/bin/arrow-json-integration-test.rs | 5 ++--- arrow-ipc/src/reader.rs | 4 ++-- arrow-json/src/writer.rs | 2 +- arrow-schema/src/ffi.rs | 2 +- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index fca49cd7836f..77a7b9d4d547 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -330,7 +330,7 @@ impl MapArray { Arc::new(Field::new( "entries", entry_struct.data_type().clone(), - true, + false, )), false, ); @@ -477,7 +477,7 @@ mod tests { Arc::new(Field::new( "entries", entry_struct.data_type().clone(), - true, + false, )), false, ); @@ -523,7 +523,7 @@ mod tests { Arc::new(Field::new( "entries", entry_struct.data_type().clone(), - true, + false, )), false, ); @@ -645,7 +645,7 @@ mod tests { Arc::new(Field::new( "entries", entry_struct.data_type().clone(), - true, + false, )), false, ); diff --git a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs index 2c36e8d9b8ae..db5df8b58a6f 100644 --- a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs +++ b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -124,7 +124,7 @@ fn canonicalize_schema(schema: &Schema) -> Schema { let key_field = Arc::new(Field::new( "key", first_field.data_type().clone(), - first_field.is_nullable(), + false, )); let second_field = fields.get(1).unwrap(); let value_field = Arc::new(Field::new( @@ -135,8 +135,7 @@ fn canonicalize_schema(schema: &Schema) -> Schema { let fields = Fields::from([key_field, value_field]); let struct_type = DataType::Struct(fields); - let child_field = - Field::new("entries", struct_type, child_field.is_nullable()); + let child_field = Field::new("entries", struct_type, false); Arc::new(Field::new( field.name().as_str(), diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 96cb4393ba58..75c91be21dde 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -1487,7 +1487,7 @@ mod tests { let keys_field = Arc::new(Field::new_dict( "keys", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), - true, + true, // It is technically not legal for this field to be null. 
1, false, )); @@ -1506,7 +1506,7 @@ mod tests { Arc::new(Field::new( "entries", entry_struct.data_type().clone(), - true, + false, )), false, ); diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index a918f44b54ff..a5b5a78190b3 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -1385,7 +1385,7 @@ mod tests { Arc::new(Field::new( "entries", entry_struct.data_type().clone(), - true, + false, )), false, ); diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index cd3c207a56c5..a17dbe769f2e 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -833,7 +833,7 @@ mod tests { // Construct a map array from the above two let map_data_type = - DataType::Map(Arc::new(Field::new("entries", entry_struct, true)), true); + DataType::Map(Arc::new(Field::new("entries", entry_struct, false)), true); let arrow_schema = FFI_ArrowSchema::try_from(map_data_type).unwrap(); assert!(arrow_schema.map_keys_sorted()); From 2075cd125dc0c132be5cb9dbf65748abf52243f1 Mon Sep 17 00:00:00 2001 From: Vaibhav Rabber Date: Wed, 13 Sep 2023 16:27:39 +0530 Subject: [PATCH 1206/1411] csv: Add option to specify custom null values (#4795) * csv: Add option to specify custom null regex Can specify custom strings as `NULL` values for CSVs as a regular expression. This allows reading a CSV files which have placeholders for NULL values instead of empty strings. Fixes #4794 Signed-off-by: Vaibhav * Apply suggestions from code review --------- Signed-off-by: Vaibhav Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-csv/src/reader/mod.rs | 203 +++++++++++++++++++---- arrow-csv/test/data/custom_null_test.csv | 6 + 2 files changed, 180 insertions(+), 29 deletions(-) create mode 100644 arrow-csv/test/data/custom_null_test.csv diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 328c2cd41f3b..695e3d47965d 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -133,8 +133,8 @@ use arrow_schema::*; use chrono::{TimeZone, Utc}; use csv::StringRecord; use lazy_static::lazy_static; -use regex::RegexSet; -use std::fmt; +use regex::{Regex, RegexSet}; +use std::fmt::{self, Debug}; use std::fs::File; use std::io::{BufRead, BufReader as StdBufReader, Read, Seek, SeekFrom}; use std::sync::Arc; @@ -157,6 +157,22 @@ lazy_static! { ]).unwrap(); } +/// A wrapper over `Option` to check if the value is `NULL`. +#[derive(Debug, Clone, Default)] +struct NullRegex(Option); + +impl NullRegex { + /// Returns true if the value should be considered as `NULL` according to + /// the provided regular expression. 
+ #[inline] + fn is_null(&self, s: &str) -> bool { + match &self.0 { + Some(r) => r.is_match(s), + None => s.is_empty(), + } + } +} + #[derive(Default, Copy, Clone)] struct InferredDataType { /// Packed booleans indicating type @@ -213,6 +229,7 @@ pub struct Format { escape: Option, quote: Option, terminator: Option, + null_regex: NullRegex, } impl Format { @@ -241,6 +258,12 @@ impl Format { self } + /// Provide a regex to match null values, defaults to `^$` + pub fn with_null_regex(mut self, null_regex: Regex) -> Self { + self.null_regex = NullRegex(Some(null_regex)); + self + } + /// Infer schema of CSV records from the provided `reader` /// /// If `max_records` is `None`, all records will be read, otherwise up to `max_records` @@ -287,7 +310,7 @@ impl Format { column_types.iter_mut().enumerate().take(header_length) { if let Some(string) = record.get(i) { - if !string.is_empty() { + if !self.null_regex.is_null(string) { column_type.update(string) } } @@ -557,6 +580,9 @@ pub struct Decoder { /// A decoder for [`StringRecords`] record_decoder: RecordDecoder, + + /// Check if the string matches this pattern for `NULL`. + null_regex: NullRegex, } impl Decoder { @@ -603,6 +629,7 @@ impl Decoder { Some(self.schema.metadata.clone()), self.projection.as_ref(), self.line_number, + &self.null_regex, )?; self.line_number += rows.len(); Ok(Some(batch)) @@ -621,6 +648,7 @@ fn parse( metadata: Option>, projection: Option<&Vec>, line_number: usize, + null_regex: &NullRegex, ) -> Result { let projection: Vec = match projection { Some(v) => v.clone(), @@ -633,7 +661,9 @@ fn parse( let i = *i; let field = &fields[i]; match field.data_type() { - DataType::Boolean => build_boolean_array(line_number, rows, i), + DataType::Boolean => { + build_boolean_array(line_number, rows, i, null_regex) + } DataType::Decimal128(precision, scale) => { build_decimal_array::( line_number, @@ -641,6 +671,7 @@ fn parse( i, *precision, *scale, + null_regex, ) } DataType::Decimal256(precision, scale) => { @@ -650,53 +681,73 @@ fn parse( i, *precision, *scale, + null_regex, ) } - DataType::Int8 => build_primitive_array::(line_number, rows, i), + DataType::Int8 => { + build_primitive_array::(line_number, rows, i, null_regex) + } DataType::Int16 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Int32 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Int64 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::UInt8 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::UInt16 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::UInt32 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::UInt64 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Float32 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Float64 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Date32 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } 
DataType::Date64 => { - build_primitive_array::(line_number, rows, i) - } - DataType::Time32(TimeUnit::Second) => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } + DataType::Time32(TimeUnit::Second) => build_primitive_array::< + Time32SecondType, + >( + line_number, rows, i, null_regex + ), DataType::Time32(TimeUnit::Millisecond) => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::( + line_number, + rows, + i, + null_regex, + ) } DataType::Time64(TimeUnit::Microsecond) => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::( + line_number, + rows, + i, + null_regex, + ) } DataType::Time64(TimeUnit::Nanosecond) => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::( + line_number, + rows, + i, + null_regex, + ) } DataType::Timestamp(TimeUnit::Second, tz) => { build_timestamp_array::( @@ -704,6 +755,7 @@ fn parse( rows, i, tz.as_deref(), + null_regex, ) } DataType::Timestamp(TimeUnit::Millisecond, tz) => { @@ -712,6 +764,7 @@ fn parse( rows, i, tz.as_deref(), + null_regex, ) } DataType::Timestamp(TimeUnit::Microsecond, tz) => { @@ -720,6 +773,7 @@ fn parse( rows, i, tz.as_deref(), + null_regex, ) } DataType::Timestamp(TimeUnit::Nanosecond, tz) => { @@ -728,6 +782,7 @@ fn parse( rows, i, tz.as_deref(), + null_regex, ) } DataType::Utf8 => Ok(Arc::new( @@ -827,11 +882,12 @@ fn build_decimal_array( col_idx: usize, precision: u8, scale: i8, + null_regex: &NullRegex, ) -> Result { let mut decimal_builder = PrimitiveBuilder::::with_capacity(rows.len()); for row in rows.iter() { let s = row.get(col_idx); - if s.is_empty() { + if null_regex.is_null(s) { // append null decimal_builder.append_null(); } else { @@ -859,12 +915,13 @@ fn build_primitive_array( line_number: usize, rows: &StringRecords<'_>, col_idx: usize, + null_regex: &NullRegex, ) -> Result { rows.iter() .enumerate() .map(|(row_index, row)| { let s = row.get(col_idx); - if s.is_empty() { + if null_regex.is_null(s) { return Ok(None); } @@ -888,14 +945,27 @@ fn build_timestamp_array( rows: &StringRecords<'_>, col_idx: usize, timezone: Option<&str>, + null_regex: &NullRegex, ) -> Result { Ok(Arc::new(match timezone { Some(timezone) => { let tz: Tz = timezone.parse()?; - build_timestamp_array_impl::(line_number, rows, col_idx, &tz)? - .with_timezone(timezone) + build_timestamp_array_impl::( + line_number, + rows, + col_idx, + &tz, + null_regex, + )? 
+ .with_timezone(timezone) } - None => build_timestamp_array_impl::(line_number, rows, col_idx, &Utc)?, + None => build_timestamp_array_impl::( + line_number, + rows, + col_idx, + &Utc, + null_regex, + )?, })) } @@ -904,12 +974,13 @@ fn build_timestamp_array_impl( rows: &StringRecords<'_>, col_idx: usize, timezone: &Tz, + null_regex: &NullRegex, ) -> Result, ArrowError> { rows.iter() .enumerate() .map(|(row_index, row)| { let s = row.get(col_idx); - if s.is_empty() { + if null_regex.is_null(s) { return Ok(None); } @@ -936,12 +1007,13 @@ fn build_boolean_array( line_number: usize, rows: &StringRecords<'_>, col_idx: usize, + null_regex: &NullRegex, ) -> Result { rows.iter() .enumerate() .map(|(row_index, row)| { let s = row.get(col_idx); - if s.is_empty() { + if null_regex.is_null(s) { return Ok(None); } let parsed = parse_bool(s); @@ -1042,6 +1114,12 @@ impl ReaderBuilder { self } + /// Provide a regex to match null values, defaults to `^$` + pub fn with_null_regex(mut self, null_regex: Regex) -> Self { + self.format.null_regex = NullRegex(Some(null_regex)); + self + } + /// Set the batch size (number of records to load at one time) pub fn with_batch_size(mut self, batch_size: usize) -> Self { self.batch_size = batch_size; @@ -1100,6 +1178,7 @@ impl ReaderBuilder { end, projection: self.projection, batch_size: self.batch_size, + null_regex: self.format.null_regex, } } } @@ -1426,6 +1505,36 @@ mod tests { assert!(!batch.column(1).is_null(4)); } + #[test] + fn test_custom_nulls() { + let schema = Arc::new(Schema::new(vec![ + Field::new("c_int", DataType::UInt64, true), + Field::new("c_float", DataType::Float32, true), + Field::new("c_string", DataType::Utf8, true), + Field::new("c_bool", DataType::Boolean, true), + ])); + + let file = File::open("test/data/custom_null_test.csv").unwrap(); + + let null_regex = Regex::new("^nil$").unwrap(); + + let mut csv = ReaderBuilder::new(schema) + .has_header(true) + .with_null_regex(null_regex) + .build(file) + .unwrap(); + + let batch = csv.next().unwrap().unwrap(); + + // "nil"s should be NULL + assert!(batch.column(0).is_null(1)); + assert!(batch.column(1).is_null(2)); + assert!(batch.column(3).is_null(4)); + // String won't be empty + assert!(!batch.column(2).is_null(3)); + assert!(!batch.column(2).is_null(4)); + } + #[test] fn test_nulls_with_inference() { let mut file = File::open("test/data/various_types.csv").unwrap(); @@ -1485,6 +1594,42 @@ mod tests { assert!(!batch.column(1).is_null(4)); } + #[test] + fn test_custom_nulls_with_inference() { + let mut file = File::open("test/data/custom_null_test.csv").unwrap(); + + let null_regex = Regex::new("^nil$").unwrap(); + + let format = Format::default() + .with_header(true) + .with_null_regex(null_regex); + + let (schema, _) = format.infer_schema(&mut file, None).unwrap(); + file.rewind().unwrap(); + + let expected_schema = Schema::new(vec![ + Field::new("c_int", DataType::Int64, true), + Field::new("c_float", DataType::Float64, true), + Field::new("c_string", DataType::Utf8, true), + Field::new("c_bool", DataType::Boolean, true), + ]); + + assert_eq!(schema, expected_schema); + + let builder = ReaderBuilder::new(Arc::new(schema)) + .with_format(format) + .with_batch_size(512) + .with_projection(vec![0, 1, 2, 3]); + + let mut csv = builder.build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + + assert_eq!(5, batch.num_rows()); + assert_eq!(4, batch.num_columns()); + + assert_eq!(batch.schema().as_ref(), &expected_schema); + } + #[test] fn test_parse_invalid_csv() { let file = 
File::open("test/data/various_types_invalid.csv").unwrap(); diff --git a/arrow-csv/test/data/custom_null_test.csv b/arrow-csv/test/data/custom_null_test.csv new file mode 100644 index 000000000000..39f9fc4b3eff --- /dev/null +++ b/arrow-csv/test/data/custom_null_test.csv @@ -0,0 +1,6 @@ +c_int,c_float,c_string,c_bool +1,1.1,"1.11",True +nil,2.2,"2.22",TRUE +3,nil,"3.33",true +4,4.4,nil,False +5,6.6,"",nil From 229bf8b6148e5a134df85b41ca4e6d10583cf1db Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 14 Sep 2023 10:21:50 +0100 Subject: [PATCH 1207/1411] ObjectStore Wasm32 Fixes (#4775) (#4776) (#4796) --- .github/workflows/object_store.yml | 2 ++ object_store/README.md | 2 +- object_store/src/client/mod.rs | 3 ++- object_store/src/lib.rs | 18 +++++++++--------- object_store/src/parse.rs | 2 +- 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 3b9b1e31d5c3..c28f8037a307 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -60,6 +60,8 @@ jobs: run: cargo clippy --features gcp -- -D warnings - name: Run clippy with azure feature run: cargo clippy --features azure -- -D warnings + - name: Run clippy with http feature + run: cargo clippy --features http -- -D warnings - name: Run clippy with all features run: cargo clippy --all-features -- -D warnings - name: Run clippy with all features and all targets diff --git a/object_store/README.md b/object_store/README.md index 5b47a65c124f..fd09ec7205af 100644 --- a/object_store/README.md +++ b/object_store/README.md @@ -39,7 +39,7 @@ See [docs.rs](https://docs.rs/object_store) for usage instructions ## Support for `wasm32-unknown-unknown` target -It's possible to build `object_store` for the `wasm32-unknown-unknown` target, however the cloud storage features `aws`, `azure`, and `gcp` are not supported. +It's possible to build `object_store` for the `wasm32-unknown-unknown` target, however the cloud storage features `aws`, `azure`, `gcp`, and `http` are not supported. ``` cargo build -p object_store --target wasm32-unknown-unknown diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index d4995a5b143f..77b14a7587d7 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -18,6 +18,7 @@ //! 
Generic utilities reqwest based ObjectStore implementations pub mod backoff; + #[cfg(test)] pub mod mock_server; @@ -35,7 +36,6 @@ pub mod list; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod token; -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod header; #[cfg(any(feature = "aws", feature = "gcp"))] @@ -575,6 +575,7 @@ pub struct StaticCredentialProvider { } impl StaticCredentialProvider { + /// A [`CredentialProvider`] for a static credential of type `T` pub fn new(credential: T) -> Self { Self { credential: Arc::new(credential), diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 413b40039553..8d96ccf1dfc3 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -240,9 +240,9 @@ #[cfg(all( target_arch = "wasm32", - any(feature = "gcp", feature = "aws", feature = "azure",) + any(feature = "gcp", feature = "aws", feature = "azure", feature = "http") ))] -compile_error!("Features 'gcp', 'aws', 'azure' are not supported on wasm."); +compile_error!("Features 'gcp', 'aws', 'azure', 'http' are not supported on wasm."); #[cfg(feature = "aws")] pub mod aws; @@ -263,13 +263,16 @@ pub mod path; pub mod prefix; pub mod throttle; -#[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] +#[cfg(feature = "cloud")] mod client; -#[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] -pub use client::{backoff::BackoffConfig, retry::RetryConfig, CredentialProvider}; +#[cfg(feature = "cloud")] +pub use client::{ + backoff::BackoffConfig, retry::RetryConfig, ClientConfigKey, ClientOptions, + CredentialProvider, StaticCredentialProvider, +}; -#[cfg(any(feature = "gcp", feature = "aws", feature = "azure", feature = "http"))] +#[cfg(feature = "cloud")] mod config; #[cfg(feature = "cloud")] @@ -295,9 +298,6 @@ use std::ops::Range; use std::sync::Arc; use tokio::io::AsyncWrite; -#[cfg(any(feature = "azure", feature = "aws", feature = "gcp", feature = "http"))] -pub use client::{ClientConfigKey, ClientOptions}; - /// An alias for a dynamically dispatched object store implementation. pub type DynObjectStore = dyn ObjectStore; diff --git a/object_store/src/parse.rs b/object_store/src/parse.rs index 7b89e58e10e7..1159e9a1af17 100644 --- a/object_store/src/parse.rs +++ b/object_store/src/parse.rs @@ -104,7 +104,7 @@ impl ObjectStoreScheme { } } -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure", feature = "http"))] +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] macro_rules! builder_opts { ($builder:ty, $url:expr, $options:expr) => {{ let builder = $options.into_iter().fold( From 7355e8392d6e82c28cc54236d79b4dc56d7226d4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 14 Sep 2023 21:03:59 +0100 Subject: [PATCH 1208/1411] Update proc-macro2 requirement from =1.0.66 to =1.0.67 (#4816) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.66...1.0.67) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 8f889c0a7cb9..13a93cce853f 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.66", default-features = false } +proc-macro2 = { version = "=1.0.67", default-features = false } prost-build = { version = "=0.11.9", default-features = false } tonic-build = { version = "=0.9.2", default-features = false, features = ["transport", "prost"] } From d2be733f447c252e0f655201f4cb0fc8015448e5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 15 Sep 2023 18:14:14 +0100 Subject: [PATCH 1209/1411] More chrono deprecations (#4822) --- arrow-cast/src/parse.rs | 29 +++++++++----------- arrow-csv/src/reader/mod.rs | 34 ++++++++++++++---------- arrow-json/src/reader/timestamp_array.rs | 9 ++++++- arrow-json/src/writer.rs | 6 +++-- 4 files changed, 44 insertions(+), 34 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index ac3b89e0ba02..3806f0adc5d6 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -277,16 +277,11 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { to_timestamp_nanos(string_to_datetime(&Utc, s)?.naive_utc()) } -/// Defensive check to prevent chrono-rs panics when nanosecond conversion happens on non-supported dates +/// Fallible conversion of [`NaiveDateTime`] to `i64` nanoseconds #[inline] fn to_timestamp_nanos(dt: NaiveDateTime) -> Result { - if dt.timestamp().checked_mul(1_000_000_000).is_none() { - return Err(ArrowError::ParseError( - ERR_NANOSECONDS_NOT_SUPPORTED.to_string(), - )); - } - - Ok(dt.timestamp_nanos()) + dt.timestamp_nanos_opt() + .ok_or_else(|| ArrowError::ParseError(ERR_NANOSECONDS_NOT_SUPPORTED.to_string())) } /// Accepts a string in ISO8601 standard format and some @@ -1313,12 +1308,12 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( - naive_datetime.timestamp_nanos(), + naive_datetime.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08T13:42:29.190855").unwrap() ); assert_eq!( - naive_datetime.timestamp_nanos(), + naive_datetime.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08 13:42:29.190855").unwrap() ); @@ -1331,12 +1326,12 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( - naive_datetime_whole_secs.timestamp_nanos(), + naive_datetime_whole_secs.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08T13:42:29").unwrap() ); assert_eq!( - naive_datetime_whole_secs.timestamp_nanos(), + naive_datetime_whole_secs.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08 13:42:29").unwrap() ); @@ -1349,7 +1344,7 @@ mod tests { ); assert_eq!( - naive_datetime_no_time.timestamp_nanos(), + naive_datetime_no_time.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08").unwrap() ) } @@ -1463,12 +1458,12 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( - naive_datetime.timestamp_nanos(), + naive_datetime.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08T13:42:29.190855").unwrap() ); assert_eq!( - naive_datetime.timestamp_nanos(), + 
naive_datetime.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08 13:42:29.190855").unwrap() ); @@ -1479,12 +1474,12 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( - naive_datetime.timestamp_nanos(), + naive_datetime.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08T13:42:29").unwrap() ); assert_eq!( - naive_datetime.timestamp_nanos(), + naive_datetime.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08 13:42:29").unwrap() ); diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 695e3d47965d..17db7a34e06f 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -984,20 +984,26 @@ fn build_timestamp_array_impl( return Ok(None); } - let date = string_to_datetime(timezone, s).map_err(|e| { - ArrowError::ParseError(format!( - "Error parsing column {col_idx} at line {}: {}", - line_number + row_index, - e - )) - })?; - - Ok(Some(match T::UNIT { - TimeUnit::Second => date.timestamp(), - TimeUnit::Millisecond => date.timestamp_millis(), - TimeUnit::Microsecond => date.timestamp_micros(), - TimeUnit::Nanosecond => date.timestamp_nanos(), - })) + let date = string_to_datetime(timezone, s) + .and_then(|date| match T::UNIT { + TimeUnit::Second => Ok(date.timestamp()), + TimeUnit::Millisecond => Ok(date.timestamp_millis()), + TimeUnit::Microsecond => Ok(date.timestamp_micros()), + TimeUnit::Nanosecond => date.timestamp_nanos_opt().ok_or_else(|| { + ArrowError::ParseError(format!( + "{} would overflow 64-bit signed nanoseconds", + date.to_rfc3339(), + )) + }), + }) + .map_err(|e| { + ArrowError::ParseError(format!( + "Error parsing column {col_idx} at line {}: {}", + line_number + row_index, + e + )) + })?; + Ok(Some(date)) }) .collect() } diff --git a/arrow-json/src/reader/timestamp_array.rs b/arrow-json/src/reader/timestamp_array.rs index ef69deabce2d..b80915f6a56a 100644 --- a/arrow-json/src/reader/timestamp_array.rs +++ b/arrow-json/src/reader/timestamp_array.rs @@ -71,7 +71,14 @@ where TimeUnit::Second => date.timestamp(), TimeUnit::Millisecond => date.timestamp_millis(), TimeUnit::Microsecond => date.timestamp_micros(), - TimeUnit::Nanosecond => date.timestamp_nanos(), + TimeUnit::Nanosecond => { + date.timestamp_nanos_opt().ok_or_else(|| { + ArrowError::ParseError(format!( + "{} would overflow 64-bit signed nanoseconds", + date.to_rfc3339(), + )) + })? 
+ } }; builder.append_value(value) } diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index a5b5a78190b3..db371b59080a 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -757,7 +757,8 @@ mod tests { let ts_nanos = ts_string .parse::() .unwrap() - .timestamp_nanos(); + .timestamp_nanos_opt() + .unwrap(); let ts_micros = ts_nanos / 1000; let ts_millis = ts_micros / 1000; let ts_secs = ts_millis / 1000; @@ -809,7 +810,8 @@ mod tests { let ts_nanos = ts_string .parse::() .unwrap() - .timestamp_nanos(); + .timestamp_nanos_opt() + .unwrap(); let ts_micros = ts_nanos / 1000; let ts_millis = ts_micros / 1000; let ts_secs = ts_millis / 1000; From d960379f3dd3b4aee6935bd0a5bdc3c4b69cfed9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 16 Sep 2023 07:59:28 -0400 Subject: [PATCH 1210/1411] Do not check schema for equality in `concat_batches` (#4815) --- arrow-select/src/concat.rs | 72 +++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index c34c3d3d0ccf..a6dcca24eace 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -159,7 +159,12 @@ fn concat_fallback( Ok(make_array(mutable.freeze())) } -/// Concatenates `batches` together into a single record batch. +/// Concatenates `batches` together into a single [`RecordBatch`]. +/// +/// The output batch has the specified `schemas`; The schema of the +/// input are ignored. +/// +/// Returns an error if the types of underlying arrays are different. pub fn concat_batches<'a>( schema: &SchemaRef, input_batches: impl IntoIterator, @@ -176,20 +181,6 @@ pub fn concat_batches<'a>( if batches.is_empty() { return Ok(RecordBatch::new_empty(schema.clone())); } - if let Some((i, _)) = batches - .iter() - .enumerate() - .find(|&(_, batch)| batch.schema() != *schema) - { - return Err(ArrowError::InvalidArgumentError(format!( - "batches[{i}] schema is different with argument schema. 
- batches[{i}] schema: {:?}, - argument schema: {:?} - ", - batches[i].schema(), - *schema - ))); - } let field_num = schema.fields().len(); let mut arrays = Vec::with_capacity(field_num); for i in 0..field_num { @@ -727,36 +718,45 @@ mod tests { } #[test] - fn concat_record_batches_of_different_schemas() { - let schema1 = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Utf8, false), - ])); - let schema2 = Arc::new(Schema::new(vec![ - Field::new("c", DataType::Int32, false), - Field::new("d", DataType::Utf8, false), - ])); + fn concat_record_batches_of_different_schemas_but_compatible_data() { + let schema1 = + Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + // column names differ + let schema2 = + Arc::new(Schema::new(vec![Field::new("c", DataType::Int32, false)])); let batch1 = RecordBatch::try_new( schema1.clone(), - vec![ - Arc::new(Int32Array::from(vec![1, 2])), - Arc::new(StringArray::from(vec!["a", "b"])), - ], + vec![Arc::new(Int32Array::from(vec![1, 2]))], + ) + .unwrap(); + let batch2 = + RecordBatch::try_new(schema2, vec![Arc::new(Int32Array::from(vec![3, 4]))]) + .unwrap(); + // concat_batches simply uses the schema provided + let batch = concat_batches(&schema1, [&batch1, &batch2]).unwrap(); + assert_eq!(batch.schema().as_ref(), schema1.as_ref()); + assert_eq!(4, batch.num_rows()); + } + + #[test] + fn concat_record_batches_of_different_schemas_incompatible_data() { + let schema1 = + Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + // column names differ + let schema2 = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)])); + let batch1 = RecordBatch::try_new( + schema1.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2]))], ) .unwrap(); let batch2 = RecordBatch::try_new( schema2, - vec![ - Arc::new(Int32Array::from(vec![3, 4])), - Arc::new(StringArray::from(vec!["c", "d"])), - ], + vec![Arc::new(StringArray::from(vec!["foo", "bar"]))], ) .unwrap(); + let error = concat_batches(&schema1, [&batch1, &batch2]).unwrap_err(); - assert_eq!( - error.to_string(), - "Invalid argument error: batches[1] schema is different with argument schema.\n batches[1] schema: Schema { fields: [Field { name: \"c\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"d\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} },\n argument schema: Schema { fields: [Field { name: \"a\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"b\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }\n " - ); + assert_eq!(error.to_string(), "Invalid argument error: It is not possible to concatenate arrays of different data types."); } #[test] From 80b0888fa172090c91588142a5f964cc158cb5c8 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Sun, 17 Sep 2023 06:35:14 -0400 Subject: [PATCH 1211/1411] fix: export record batch through stream (#4806) * fix: export record batch through stream * Update arrow/src/pyarrow.rs --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- .../tests/test_sql.py | 17 ++++++++++ arrow/src/pyarrow.rs | 31 ++++++------------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 3be5b9ec52fe..1748fd3ffb6b 100644 --- 
a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -393,6 +393,23 @@ def test_sparse_union_python(): del a del b +def test_tensor_array(): + tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3]) + inner = pa.array([float(x) for x in range(1, 7)] + [None] * 12, pa.float32()) + storage = pa.FixedSizeListArray.from_arrays(inner, 6) + f32_array = pa.ExtensionArray.from_storage(tensor_type, storage) + + # Round-tripping as an array gives back storage type, because arrow-rs has + # no notion of extension types. + b = rust.round_trip_array(f32_array) + assert b == f32_array.storage + + batch = pa.record_batch([f32_array], ["tensor"]) + b = rust.round_trip_record_batch(batch) + assert b == batch + + del b + def test_record_batch_reader(): """ Python -> Rust -> Python diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 6063ae763228..ab0ea8ef8d74 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -59,14 +59,14 @@ use std::convert::{From, TryFrom}; use std::ptr::{addr_of, addr_of_mut}; use std::sync::Arc; -use arrow_array::RecordBatchReader; +use arrow_array::{RecordBatchIterator, RecordBatchReader}; use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::ffi::Py_uintptr_t; use pyo3::import_exception; use pyo3::prelude::*; -use pyo3::types::{PyDict, PyList, PyTuple}; +use pyo3::types::{PyList, PyTuple}; -use crate::array::{make_array, Array, ArrayData}; +use crate::array::{make_array, ArrayData}; use crate::datatypes::{DataType, Field, Schema}; use crate::error::ArrowError; use crate::ffi; @@ -270,25 +270,12 @@ impl FromPyArrow for RecordBatch { impl ToPyArrow for RecordBatch { fn to_pyarrow(&self, py: Python) -> PyResult { - let mut py_arrays = vec![]; - - let schema = self.schema(); - let columns = self.columns().iter(); - - for array in columns { - py_arrays.push(array.to_data().to_pyarrow(py)?); - } - - let py_schema = schema.to_pyarrow(py)?; - - let module = py.import("pyarrow")?; - let class = module.getattr("RecordBatch")?; - let args = (py_arrays,); - let kwargs = PyDict::new(py); - kwargs.set_item("schema", py_schema)?; - let record = class.call_method("from_arrays", args, Some(kwargs))?; - - Ok(PyObject::from(record)) + // Workaround apache/arrow#37669 by returning RecordBatchIterator + let reader = + RecordBatchIterator::new(vec![Ok(self.clone())], self.schema().clone()); + let reader: Box = Box::new(reader); + let py_reader = reader.into_pyarrow(py)?; + py_reader.call_method0(py, "read_next_batch") } } From b64e362f04084d1ea9e9bd4d55f8b994968aa9d2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 17 Sep 2023 11:35:37 +0100 Subject: [PATCH 1212/1411] Adaptive Row Block Size (#4812) (#4818) * Adaptive Row Block Size (#4812) * Perf improvements * Further tweaks * Review feedback --- arrow-array/src/types.rs | 2 + arrow-row/src/lib.rs | 16 +++-- arrow-row/src/variable.rs | 145 ++++++++++++++++++++++---------------- 3 files changed, 98 insertions(+), 65 deletions(-) diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index d79b32a991ed..7988fe9f6690 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -1368,12 +1368,14 @@ pub(crate) mod bytes { } impl ByteArrayNativeType for [u8] { + #[inline] unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { b } } impl ByteArrayNativeType for str { + #[inline] unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { std::str::from_utf8_unchecked(b) } diff --git 
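The Rust side of the fix below stops rebuilding the batch column by column in Python and instead exports it through the Arrow C stream interface. A rough sketch of the wrapping step it relies on (a hypothetical `as_reader` helper shown without any pyo3 code): a `RecordBatchIterator` over a single batch already implements `RecordBatchReader`, so it can be handed to pyarrow as a stream and read back with `read_next_batch`.

```rust
use arrow_array::{RecordBatch, RecordBatchIterator, RecordBatchReader};

/// Hypothetical helper: expose one batch as a stream reader, the same shape
/// `to_pyarrow` builds before handing it to pyarrow and calling
/// `read_next_batch` on the Python side.
fn as_reader(batch: RecordBatch) -> Box<dyn RecordBatchReader> {
    let schema = batch.schema();
    Box::new(RecordBatchIterator::new(vec![Ok(batch)], schema))
}
```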
a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index b59d84061a8a..bd1dd7256240 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -232,13 +232,13 @@ mod variable; /// A non-null, non-empty byte array is encoded as `2_u8` followed by the byte array /// encoded using a block based scheme described below. /// -/// The byte array is broken up into 32-byte blocks, each block is written in turn +/// The byte array is broken up into fixed-width blocks, each block is written in turn /// to the output, followed by `0xFF_u8`. The final block is padded to 32-bytes /// with `0_u8` and written to the output, followed by the un-padded length in bytes -/// of this final block as a `u8`. +/// of this final block as a `u8`. The first 4 blocks have a length of 8, with subsequent +/// blocks using a length of 32, this is to reduce space amplification for small strings. /// -/// Note the following example encodings use a block size of 4 bytes, -/// as opposed to 32 bytes for brevity: +/// Note the following example encodings use a block size of 4 bytes for brevity: /// /// ```text /// ┌───┬───┬───┬───┬───┬───┐ @@ -1698,12 +1698,18 @@ mod tests { None, Some(vec![0_u8; 0]), Some(vec![0_u8; 6]), + Some(vec![0_u8; variable::MINI_BLOCK_SIZE]), + Some(vec![0_u8; variable::MINI_BLOCK_SIZE + 1]), Some(vec![0_u8; variable::BLOCK_SIZE]), Some(vec![0_u8; variable::BLOCK_SIZE + 1]), Some(vec![1_u8; 6]), + Some(vec![1_u8; variable::MINI_BLOCK_SIZE]), + Some(vec![1_u8; variable::MINI_BLOCK_SIZE + 1]), Some(vec![1_u8; variable::BLOCK_SIZE]), Some(vec![1_u8; variable::BLOCK_SIZE + 1]), Some(vec![0xFF_u8; 6]), + Some(vec![0xFF_u8; variable::MINI_BLOCK_SIZE]), + Some(vec![0xFF_u8; variable::MINI_BLOCK_SIZE + 1]), Some(vec![0xFF_u8; variable::BLOCK_SIZE]), Some(vec![0xFF_u8; variable::BLOCK_SIZE + 1]), ])) as ArrayRef; @@ -2221,7 +2227,7 @@ mod tests { } for r in r2.iter() { - assert_eq!(r.data.len(), 34); + assert_eq!(r.data.len(), 10); } } diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index e9f6160bf43c..6c9c4c43bca3 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -26,6 +26,14 @@ use arrow_schema::{DataType, SortOptions}; /// The block size of the variable length encoding pub const BLOCK_SIZE: usize = 32; +/// The first block is split into `MINI_BLOCK_COUNT` mini-blocks +/// +/// This helps to reduce the space amplification for small strings +pub const MINI_BLOCK_COUNT: usize = 4; + +/// The mini block size +pub const MINI_BLOCK_SIZE: usize = BLOCK_SIZE / MINI_BLOCK_COUNT; + /// The continuation token pub const BLOCK_CONTINUATION: u8 = 0xFF; @@ -45,7 +53,12 @@ pub fn encoded_len(a: Option<&[u8]>) -> usize { #[inline] pub fn padded_length(a: Option) -> usize { match a { - Some(a) => 1 + ceil(a, BLOCK_SIZE) * (BLOCK_SIZE + 1), + Some(a) if a <= BLOCK_SIZE => { + 1 + ceil(a, MINI_BLOCK_SIZE) * (MINI_BLOCK_SIZE + 1) + } + // Each miniblock ends with a 1 byte continuation, therefore add + // `(MINI_BLOCK_COUNT - 1)` additional bytes over non-miniblock size + Some(a) => MINI_BLOCK_COUNT + ceil(a, BLOCK_SIZE) * (BLOCK_SIZE + 1), None => 1, } } @@ -82,44 +95,23 @@ pub fn encode_one(out: &mut [u8], val: Option<&[u8]>, opts: SortOptions) -> usiz 1 } Some(val) => { - let block_count = ceil(val.len(), BLOCK_SIZE); - let end_offset = 1 + block_count * (BLOCK_SIZE + 1); - let to_write = &mut out[..end_offset]; - // Write `2_u8` to demarcate as non-empty, non-null string - to_write[0] = NON_EMPTY_SENTINEL; - - let chunks = val.chunks_exact(BLOCK_SIZE); - let remainder = 
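A worked sketch of the adaptive layout introduced here (editorial illustration, not part of the patch): the constants and `ceil` helper mirror the ones in `arrow-row/src/variable.rs`, and the assertions match the updated doc comment and test expectation, where a 6-byte value shrinks from 34 encoded bytes to 10.

```rust
/// Block size constants mirroring arrow-row/src/variable.rs
const BLOCK_SIZE: usize = 32;
const MINI_BLOCK_COUNT: usize = 4;
const MINI_BLOCK_SIZE: usize = BLOCK_SIZE / MINI_BLOCK_COUNT; // 8 bytes

fn ceil(a: usize, b: usize) -> usize {
    (a + b - 1) / b
}

/// Encoded size of a non-null, non-empty value of `len` bytes under the new scheme
fn padded_length(len: usize) -> usize {
    if len <= BLOCK_SIZE {
        // sentinel byte + up to four (mini-block + length/continuation byte) pairs
        1 + ceil(len, MINI_BLOCK_SIZE) * (MINI_BLOCK_SIZE + 1)
    } else {
        // sentinel + 3 extra mini-block continuations + full 32-byte blocks
        MINI_BLOCK_COUNT + ceil(len, BLOCK_SIZE) * (BLOCK_SIZE + 1)
    }
}

fn main() {
    // A 6 byte value now needs 10 encoded bytes instead of the previous 34
    assert_eq!(padded_length(6), 10);
    // 33 bytes: 1 sentinel + 4 mini-blocks of 9 bytes + one full block of 33 bytes
    assert_eq!(padded_length(33), 70);
}
```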
chunks.remainder(); - for (input, output) in chunks - .clone() - .zip(to_write[1..].chunks_exact_mut(BLOCK_SIZE + 1)) - { - let input: &[u8; BLOCK_SIZE] = input.try_into().unwrap(); - let out_block: &mut [u8; BLOCK_SIZE] = - (&mut output[..BLOCK_SIZE]).try_into().unwrap(); - - *out_block = *input; - - // Indicate that there are further blocks to follow - output[BLOCK_SIZE] = BLOCK_CONTINUATION; - } + out[0] = NON_EMPTY_SENTINEL; - if !remainder.is_empty() { - let start_offset = 1 + (block_count - 1) * (BLOCK_SIZE + 1); - to_write[start_offset..start_offset + remainder.len()] - .copy_from_slice(remainder); - *to_write.last_mut().unwrap() = remainder.len() as u8; + let len = if val.len() <= BLOCK_SIZE { + 1 + encode_blocks::(&mut out[1..], val) } else { - // We must overwrite the continuation marker written by the loop above - *to_write.last_mut().unwrap() = BLOCK_SIZE as u8; - } + let (initial, rem) = val.split_at(BLOCK_SIZE); + let offset = encode_blocks::(&mut out[1..], initial); + out[offset] = BLOCK_CONTINUATION; + 1 + offset + encode_blocks::(&mut out[1 + offset..], rem) + }; if opts.descending { // Invert bits - to_write.iter_mut().for_each(|v| *v = !*v) + out[..len].iter_mut().for_each(|v| *v = !*v) } - end_offset + len } None => { out[0] = null_sentinel(opts); @@ -128,8 +120,37 @@ pub fn encode_one(out: &mut [u8], val: Option<&[u8]>, opts: SortOptions) -> usiz } } -/// Returns the number of bytes of encoded data -fn decoded_len(row: &[u8], options: SortOptions) -> usize { +/// Writes `val` in `SIZE` blocks with the appropriate continuation tokens +#[inline] +fn encode_blocks(out: &mut [u8], val: &[u8]) -> usize { + let block_count = ceil(val.len(), SIZE); + let end_offset = block_count * (SIZE + 1); + let to_write = &mut out[..end_offset]; + + let chunks = val.chunks_exact(SIZE); + let remainder = chunks.remainder(); + for (input, output) in chunks.clone().zip(to_write.chunks_exact_mut(SIZE + 1)) { + let input: &[u8; SIZE] = input.try_into().unwrap(); + let out_block: &mut [u8; SIZE] = (&mut output[..SIZE]).try_into().unwrap(); + + *out_block = *input; + + // Indicate that there are further blocks to follow + output[SIZE] = BLOCK_CONTINUATION; + } + + if !remainder.is_empty() { + let start_offset = (block_count - 1) * (SIZE + 1); + to_write[start_offset..start_offset + remainder.len()].copy_from_slice(remainder); + *to_write.last_mut().unwrap() = remainder.len() as u8; + } else { + // We must overwrite the continuation marker written by the loop above + *to_write.last_mut().unwrap() = SIZE as u8; + } + end_offset +} + +fn decode_blocks(row: &[u8], options: SortOptions, mut f: impl FnMut(&[u8])) -> usize { let (non_empty_sentinel, continuation) = match options.descending { true => (!NON_EMPTY_SENTINEL, !BLOCK_CONTINUATION), false => (NON_EMPTY_SENTINEL, BLOCK_CONTINUATION), @@ -137,26 +158,44 @@ fn decoded_len(row: &[u8], options: SortOptions) -> usize { if row[0] != non_empty_sentinel { // Empty or null string - return 0; + return 1; } - let mut str_len = 0; + // Extracts the block length from the sentinel + let block_len = |sentinel: u8| match options.descending { + true => !sentinel as usize, + false => sentinel as usize, + }; + let mut idx = 1; + for _ in 0..MINI_BLOCK_COUNT { + let sentinel = row[idx + MINI_BLOCK_SIZE]; + if sentinel != continuation { + f(&row[idx..idx + block_len(sentinel)]); + return idx + MINI_BLOCK_SIZE + 1; + } + f(&row[idx..idx + MINI_BLOCK_SIZE]); + idx += MINI_BLOCK_SIZE + 1; + } + loop { let sentinel = row[idx + BLOCK_SIZE]; - if sentinel == 
continuation { - idx += BLOCK_SIZE + 1; - str_len += BLOCK_SIZE; - continue; + if sentinel != continuation { + f(&row[idx..idx + block_len(sentinel)]); + return idx + BLOCK_SIZE + 1; } - let block_len = match options.descending { - true => !sentinel, - false => sentinel, - }; - return str_len + block_len as usize; + f(&row[idx..idx + BLOCK_SIZE]); + idx += BLOCK_SIZE + 1; } } +/// Returns the number of bytes of encoded data +fn decoded_len(row: &[u8], options: SortOptions) -> usize { + let mut len = 0; + decode_blocks(row, options, |block| len += block.len()); + len +} + /// Decodes a binary array from `rows` with the provided `options` pub fn decode_binary( rows: &mut [&[u8]], @@ -176,22 +215,8 @@ pub fn decode_binary( let mut values = MutableBuffer::new(values_capacity); for row in rows { - let str_length = decoded_len(row, options); - let mut to_read = str_length; - let mut offset = 1; - while to_read >= BLOCK_SIZE { - to_read -= BLOCK_SIZE; - - values.extend_from_slice(&row[offset..offset + BLOCK_SIZE]); - offset += BLOCK_SIZE + 1; - } - - if to_read != 0 { - values.extend_from_slice(&row[offset..offset + to_read]); - offset += BLOCK_SIZE + 1; - } + let offset = decode_blocks(row, options, |b| values.extend_from_slice(b)); *row = &row[offset..]; - offsets.append(I::from_usize(values.len()).expect("offset overflow")) } From 9cb4a75f183c6a8ae729de51e7480ee1ece9a662 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 17 Sep 2023 11:47:39 +0100 Subject: [PATCH 1213/1411] Stateless Row Encoding / Don't Preserve Dictionaries in `RowConverter` (#4811) (#4819) * Stateless Row Encoding / Don't Preserve Dictionaries (#4811) * Add low cardinality benchmarks --- arrow-flight/src/sql/metadata/mod.rs | 2 +- arrow-row/src/dictionary.rs | 296 --------------- arrow-row/src/interner.rs | 523 --------------------------- arrow-row/src/lib.rs | 420 +++++++-------------- arrow/benches/lexsort.rs | 2 +- arrow/benches/row_format.rs | 61 ++-- 6 files changed, 155 insertions(+), 1149 deletions(-) delete mode 100644 arrow-row/src/dictionary.rs delete mode 100644 arrow-row/src/interner.rs diff --git a/arrow-flight/src/sql/metadata/mod.rs b/arrow-flight/src/sql/metadata/mod.rs index 71551f1849ae..1e9881ffa70e 100644 --- a/arrow-flight/src/sql/metadata/mod.rs +++ b/arrow-flight/src/sql/metadata/mod.rs @@ -53,7 +53,7 @@ fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array { .iter() .map(|a| SortField::new(a.data_type().clone())) .collect(); - let mut converter = RowConverter::new(fields).unwrap(); + let converter = RowConverter::new(fields).unwrap(); let rows = converter.convert_columns(arrays).unwrap(); let mut sort: Vec<_> = rows.iter().enumerate().collect(); sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); diff --git a/arrow-row/src/dictionary.rs b/arrow-row/src/dictionary.rs deleted file mode 100644 index 740b2e205c04..000000000000 --- a/arrow-row/src/dictionary.rs +++ /dev/null @@ -1,296 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::fixed::{FixedLengthEncoding, FromSlice}; -use crate::interner::{Interned, OrderPreservingInterner}; -use crate::{null_sentinel, Row, Rows}; -use arrow_array::builder::*; -use arrow_array::cast::*; -use arrow_array::types::*; -use arrow_array::*; -use arrow_buffer::{ArrowNativeType, MutableBuffer, ToByteSlice}; -use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{ArrowError, DataType, SortOptions}; -use std::collections::hash_map::Entry; -use std::collections::HashMap; - -/// Computes the dictionary mapping for the given dictionary values -pub fn compute_dictionary_mapping( - interner: &mut OrderPreservingInterner, - values: &ArrayRef, -) -> Vec> { - downcast_primitive_array! { - values => interner - .intern(values.iter().map(|x| x.map(|x| x.encode()))), - DataType::Binary => { - let iter = as_generic_binary_array::(values).iter(); - interner.intern(iter) - } - DataType::LargeBinary => { - let iter = as_generic_binary_array::(values).iter(); - interner.intern(iter) - } - DataType::Utf8 => { - let iter = values.as_string::().iter().map(|x| x.map(|x| x.as_bytes())); - interner.intern(iter) - } - DataType::LargeUtf8 => { - let iter = values.as_string::().iter().map(|x| x.map(|x| x.as_bytes())); - interner.intern(iter) - } - _ => unreachable!(), - } -} - -/// Encode dictionary values not preserving the dictionary encoding -pub fn encode_dictionary_values( - data: &mut [u8], - offsets: &mut [usize], - column: &DictionaryArray, - values: &Rows, - null: &Row<'_>, -) { - for (offset, k) in offsets.iter_mut().skip(1).zip(column.keys()) { - let row = match k { - Some(k) => values.row(k.as_usize()).data, - None => null.data, - }; - let end_offset = *offset + row.len(); - data[*offset..end_offset].copy_from_slice(row); - *offset = end_offset; - } -} - -/// Dictionary types are encoded as -/// -/// - single `0_u8` if null -/// - the bytes of the corresponding normalized key including the null terminator -pub fn encode_dictionary( - data: &mut [u8], - offsets: &mut [usize], - column: &DictionaryArray, - normalized_keys: &[Option<&[u8]>], - opts: SortOptions, -) { - for (offset, k) in offsets.iter_mut().skip(1).zip(column.keys()) { - match k.and_then(|k| normalized_keys[k.as_usize()]) { - Some(normalized_key) => { - let end_offset = *offset + 1 + normalized_key.len(); - data[*offset] = 1; - data[*offset + 1..end_offset].copy_from_slice(normalized_key); - // Negate if descending - if opts.descending { - data[*offset..end_offset].iter_mut().for_each(|v| *v = !*v) - } - *offset = end_offset; - } - None => { - data[*offset] = null_sentinel(opts); - *offset += 1; - } - } - } -} - -macro_rules! 
decode_primitive_helper { - ($t:ty, $values: ident, $data_type:ident) => { - decode_primitive::<$t>(&$values, $data_type.clone()) - }; -} - -/// Decodes a string array from `rows` with the provided `options` -/// -/// # Safety -/// -/// `interner` must contain valid data for the provided `value_type` -pub unsafe fn decode_dictionary( - interner: &OrderPreservingInterner, - value_type: &DataType, - options: SortOptions, - rows: &mut [&[u8]], -) -> Result, ArrowError> { - let len = rows.len(); - let mut dictionary: HashMap = HashMap::with_capacity(len); - - let null_sentinel = null_sentinel(options); - - // If descending, the null terminator will have been negated - let null_terminator = match options.descending { - true => 0xFF, - false => 0_u8, - }; - - let mut null_builder = BooleanBufferBuilder::new(len); - let mut keys = BufferBuilder::::new(len); - let mut values = Vec::with_capacity(len); - let mut null_count = 0; - let mut key_scratch = Vec::new(); - - for row in rows { - if row[0] == null_sentinel { - null_builder.append(false); - null_count += 1; - *row = &row[1..]; - keys.append(K::Native::default()); - continue; - } - - let key_offset = row - .iter() - .skip(1) - .position(|x| *x == null_terminator) - .unwrap(); - - // Extract the normalized key including the null terminator - let key = &row[1..key_offset + 2]; - *row = &row[key_offset + 2..]; - - let interned = match options.descending { - true => { - // If options.descending the normalized key will have been - // negated we must first reverse this - key_scratch.clear(); - key_scratch.extend_from_slice(key); - key_scratch.iter_mut().for_each(|o| *o = !*o); - interner.lookup(&key_scratch).unwrap() - } - false => interner.lookup(key).unwrap(), - }; - - let k = match dictionary.entry(interned) { - Entry::Vacant(v) => { - let k = values.len(); - values.push(interner.value(interned)); - let key = K::Native::from_usize(k) - .ok_or(ArrowError::DictionaryKeyOverflowError)?; - *v.insert(key) - } - Entry::Occupied(o) => *o.get(), - }; - - keys.append(k); - null_builder.append(true); - } - - let child = downcast_primitive! 
{ - value_type => (decode_primitive_helper, values, value_type), - DataType::Null => NullArray::new(values.len()).into_data(), - DataType::Boolean => decode_bool(&values), - DataType::Utf8 => decode_string::(&values), - DataType::LargeUtf8 => decode_string::(&values), - DataType::Binary => decode_binary::(&values), - DataType::LargeBinary => decode_binary::(&values), - _ => unreachable!(), - }; - - let data_type = - DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(value_type.clone())); - - let builder = ArrayDataBuilder::new(data_type) - .len(len) - .null_bit_buffer(Some(null_builder.into())) - .null_count(null_count) - .add_buffer(keys.finish()) - .add_child_data(child); - - Ok(DictionaryArray::from(builder.build_unchecked())) -} - -/// Decodes a binary array from dictionary values -/// -/// # Safety -/// -/// Values must be valid UTF-8 -fn decode_binary(values: &[&[u8]]) -> ArrayData { - let capacity = values.iter().map(|x| x.len()).sum(); - let mut builder = GenericBinaryBuilder::::with_capacity(values.len(), capacity); - for v in values { - builder.append_value(v) - } - builder.finish().into_data() -} - -/// Decodes a string array from dictionary values -/// -/// # Safety -/// -/// Values must be valid UTF-8 -unsafe fn decode_string(values: &[&[u8]]) -> ArrayData { - let d = match O::IS_LARGE { - true => DataType::LargeUtf8, - false => DataType::Utf8, - }; - - decode_binary::(values) - .into_builder() - .data_type(d) - .build_unchecked() -} - -/// Decodes a boolean array from dictionary values -fn decode_bool(values: &[&[u8]]) -> ArrayData { - let mut builder = BooleanBufferBuilder::new(values.len()); - for value in values { - builder.append(bool::decode([value[0]])) - } - - let builder = ArrayDataBuilder::new(DataType::Boolean) - .len(values.len()) - .add_buffer(builder.into()); - - // SAFETY: Buffers correct length - unsafe { builder.build_unchecked() } -} - -/// Decodes a fixed length type array from dictionary values -/// -/// # Safety -/// -/// `data_type` must be appropriate native type for `T` -unsafe fn decode_fixed( - values: &[&[u8]], - data_type: DataType, -) -> ArrayData { - let mut buffer = MutableBuffer::new(std::mem::size_of::() * values.len()); - - for value in values { - let value = T::Encoded::from_slice(value, false); - buffer.push(T::decode(value)) - } - - let builder = ArrayDataBuilder::new(data_type) - .len(values.len()) - .add_buffer(buffer.into()); - - // SAFETY: Buffers correct length - builder.build_unchecked() -} - -/// Decodes a `PrimitiveArray` from dictionary values -fn decode_primitive( - values: &[&[u8]], - data_type: DataType, -) -> ArrayData -where - T::Native: FixedLengthEncoding, -{ - assert!(PrimitiveArray::::is_compatible(&data_type)); - - // SAFETY: - // Validated data type above - unsafe { decode_fixed::(values, data_type) } -} diff --git a/arrow-row/src/interner.rs b/arrow-row/src/interner.rs deleted file mode 100644 index 9f5f0b3d33d2..000000000000 --- a/arrow-row/src/interner.rs +++ /dev/null @@ -1,523 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use hashbrown::hash_map::RawEntryMut; -use hashbrown::HashMap; -use std::num::NonZeroU32; -use std::ops::Index; - -/// An interned value -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub struct Interned(NonZeroU32); // We use NonZeroU32 so that `Option` is 32 bits - -/// A byte array interner that generates normalized keys that are sorted with respect -/// to the interned values, e.g. `inter(a) < intern(b) => a < b` -#[derive(Debug, Default)] -pub struct OrderPreservingInterner { - /// Provides a lookup from [`Interned`] to the normalized key - keys: InternBuffer, - /// Provides a lookup from [`Interned`] to the normalized value - values: InternBuffer, - /// Key allocation data structure - bucket: Box, - - // A hash table used to perform faster re-keying, and detect duplicates - hasher: ahash::RandomState, - lookup: HashMap, -} - -impl OrderPreservingInterner { - /// Interns an iterator of values returning a list of [`Interned`] which can be - /// used with [`Self::normalized_key`] to retrieve the normalized keys with a - /// lifetime not tied to the mutable borrow passed to this method - pub fn intern(&mut self, input: I) -> Vec> - where - I: IntoIterator>, - V: AsRef<[u8]>, - { - let iter = input.into_iter(); - let capacity = iter.size_hint().0; - let mut out = Vec::with_capacity(capacity); - - // (index in output, hash value, value) - let mut to_intern: Vec<(usize, u64, V)> = Vec::with_capacity(capacity); - let mut to_intern_len = 0; - - for (idx, item) in iter.enumerate() { - let value: V = match item { - Some(value) => value, - None => { - out.push(None); - continue; - } - }; - - let v = value.as_ref(); - let hash = self.hasher.hash_one(v); - let entry = self - .lookup - .raw_entry_mut() - .from_hash(hash, |a| &self.values[*a] == v); - - match entry { - RawEntryMut::Occupied(o) => out.push(Some(*o.key())), - RawEntryMut::Vacant(_) => { - // Push placeholder - out.push(None); - to_intern_len += v.len(); - to_intern.push((idx, hash, value)); - } - }; - } - - to_intern.sort_unstable_by(|(_, _, a), (_, _, b)| a.as_ref().cmp(b.as_ref())); - - self.keys.offsets.reserve(to_intern.len()); - self.keys.values.reserve(to_intern.len()); // Approximation - self.values.offsets.reserve(to_intern.len()); - self.values.values.reserve(to_intern_len); - - for (idx, hash, value) in to_intern { - let val = value.as_ref(); - - let entry = self - .lookup - .raw_entry_mut() - .from_hash(hash, |a| &self.values[*a] == val); - - match entry { - RawEntryMut::Occupied(o) => { - out[idx] = Some(*o.key()); - } - RawEntryMut::Vacant(v) => { - let val = value.as_ref(); - self.bucket - .insert(&mut self.values, val, &mut self.keys.values); - self.keys.values.push(0); - let interned = self.keys.append(); - - let hasher = &mut self.hasher; - let values = &self.values; - v.insert_with_hasher(hash, interned, (), |key| { - hasher.hash_one(&values[*key]) - }); - out[idx] = Some(interned); - } - } - } - - out - } - - /// Returns a null-terminated byte array that can be compared against other normalized_key - /// returned by this instance, to establish ordering of the interned values - pub fn 
normalized_key(&self, key: Interned) -> &[u8] { - &self.keys[key] - } - - /// Converts a normalized key returned by [`Self::normalized_key`] to [`Interned`] - /// returning `None` if it cannot be found - pub fn lookup(&self, normalized_key: &[u8]) -> Option { - let len = normalized_key.len(); - if len <= 1 { - return None; - } - - let mut bucket = self.bucket.as_ref(); - if len > 2 { - for v in normalized_key.iter().take(len - 2) { - if *v == 255 { - bucket = bucket.next.as_ref()?; - } else { - let bucket_idx = v.checked_sub(1)?; - bucket = bucket.slots.get(bucket_idx as usize)?.child.as_ref()?; - } - } - } - - let slot_idx = normalized_key[len - 2].checked_sub(2)?; - Some(bucket.slots.get(slot_idx as usize)?.value) - } - - /// Returns the interned value for a given [`Interned`] - pub fn value(&self, key: Interned) -> &[u8] { - self.values.index(key) - } - - /// Returns the size of this instance in bytes including self - pub fn size(&self) -> usize { - std::mem::size_of::() - + self.keys.buffer_size() - + self.values.buffer_size() - + self.bucket.size() - + self.lookup.capacity() * std::mem::size_of::() - } -} - -/// A buffer of `[u8]` indexed by `[Interned]` -#[derive(Debug)] -struct InternBuffer { - /// Raw values - values: Vec, - /// The ith value is `&values[offsets[i]..offsets[i+1]]` - offsets: Vec, -} - -impl Default for InternBuffer { - fn default() -> Self { - Self { - values: Default::default(), - offsets: vec![0], - } - } -} - -impl InternBuffer { - /// Insert `data` returning the corresponding [`Interned`] - fn insert(&mut self, data: &[u8]) -> Interned { - self.values.extend_from_slice(data); - self.append() - } - - /// Appends the next value based on data written to `self.values` - /// returning the corresponding [`Interned`] - fn append(&mut self) -> Interned { - let idx: u32 = self.offsets.len().try_into().unwrap(); - let key = Interned(NonZeroU32::new(idx).unwrap()); - self.offsets.push(self.values.len()); - key - } - - /// Returns the byte size of the associated buffers - fn buffer_size(&self) -> usize { - self.values.capacity() + self.offsets.capacity() * std::mem::size_of::() - } -} - -impl Index for InternBuffer { - type Output = [u8]; - - fn index(&self, key: Interned) -> &Self::Output { - let index = key.0.get() as usize; - let end = self.offsets[index]; - let start = self.offsets[index - 1]; - // SAFETY: - // self.values is never reduced in size and values appended - // to self.offsets are always less than self.values at the time - unsafe { self.values.get_unchecked(start..end) } - } -} - -/// A slot corresponds to a single byte-value in the generated normalized key -/// -/// It may contain a value, if not the first slot, and may contain a child [`Bucket`] representing -/// the next byte in the generated normalized key -#[derive(Debug, Clone)] -struct Slot { - value: Interned, - /// Child values less than `self.value` if any - child: Option>, -} - -/// Bucket is the root of the data-structure used to allocate normalized keys -/// -/// In particular it needs to generate keys that -/// -/// * Contain no `0` bytes other than the null terminator -/// * Compare lexicographically in the same manner as the encoded `data` -/// -/// The data structure consists of 254 slots, each of which can store a value. -/// Additionally each slot may contain a child bucket, containing values smaller -/// than the value within the slot. 
-/// -/// Each bucket also may contain a child bucket, containing values greater than -/// all values in the current bucket -/// -/// # Allocation Strategy -/// -/// The contiguous slice of slots containing values is searched to find the insertion -/// point for the new value, according to the sort order. -/// -/// If the insertion position exceeds 254, the number of slots, the value is inserted -/// into the child bucket of the current bucket. -/// -/// If the insertion position already contains a value, the value is inserted into the -/// child bucket of that slot. -/// -/// If the slot is not occupied, the value is inserted into that slot. -/// -/// The final key consists of the slot indexes visited incremented by 1, -/// with the final value incremented by 2, followed by a null terminator. -/// -/// Consider the case of the integers `[8, 6, 5, 7]` inserted in that order -/// -/// ```ignore -/// 8: &[2, 0] -/// 6: &[1, 2, 0] -/// 5: &[1, 1, 2, 0] -/// 7: &[1, 3, 0] -/// ``` -/// -/// Note: this allocation strategy is optimised for interning values in sorted order -/// -#[derive(Debug, Clone)] -struct Bucket { - slots: Vec, - /// Bucket containing values larger than all of `slots` - next: Option>, -} - -impl Default for Bucket { - fn default() -> Self { - Self { - slots: Vec::with_capacity(254), - next: None, - } - } -} - -impl Bucket { - /// Insert `data` into this bucket or one of its children, appending the - /// normalized key to `out` as it is constructed - /// - /// # Panics - /// - /// Panics if the value already exists - fn insert(&mut self, values_buf: &mut InternBuffer, data: &[u8], out: &mut Vec) { - let slots_len = self.slots.len() as u8; - // We optimise the case of inserting a value directly after those already inserted - // as [`OrderPreservingInterner::intern`] sorts values prior to interning them - match self.slots.last() { - Some(slot) => { - if &values_buf[slot.value] < data { - if slots_len == 254 { - out.push(255); - self.next - .get_or_insert_with(Default::default) - .insert(values_buf, data, out) - } else { - out.push(slots_len + 2); - let value = values_buf.insert(data); - self.slots.push(Slot { value, child: None }); - } - } else { - // Find insertion point - match self - .slots - .binary_search_by(|slot| values_buf[slot.value].cmp(data)) - { - Ok(_) => unreachable!("value already exists"), - Err(idx) => { - out.push(idx as u8 + 1); - self.slots[idx] - .child - .get_or_insert_with(Default::default) - .insert(values_buf, data, out) - } - } - } - } - None => { - out.push(2); - let value = values_buf.insert(data); - self.slots.push(Slot { value, child: None }) - } - } - } - - /// Returns the size of this instance in bytes - fn size(&self) -> usize { - std::mem::size_of::() - + self.slots.capacity() * std::mem::size_of::() - // and account for the size of any embedded buckets in the slots - + self.slot_child_bucket_size() - + self.next.as_ref().map(|x| x.size()).unwrap_or_default() - } - - /// returns the total size of any recursively allocated `Bucket`s - /// in self.slots. 
This does not include the size of the child Slot itself - fn slot_child_bucket_size(&self) -> usize { - self.slots - .iter() - .map(|slot| slot.child.as_ref().map(|x| x.size()).unwrap_or_default()) - .sum() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use rand::prelude::*; - - // Clippy isn't smart enough to understand dropping mutability - #[allow(clippy::needless_collect)] - fn test_intern_values(values: &[u64]) { - let mut interner = OrderPreservingInterner::default(); - - // Intern a single value at a time to check ordering - let interned: Vec<_> = values - .iter() - .flat_map(|v| interner.intern([Some(&v.to_be_bytes())])) - .map(Option::unwrap) - .collect(); - - for (value, interned) in values.iter().zip(&interned) { - assert_eq!(interner.value(*interned), &value.to_be_bytes()); - } - - let normalized_keys: Vec<_> = interned - .iter() - .map(|x| interner.normalized_key(*x)) - .collect(); - - for (interned, normalized) in interned.iter().zip(&normalized_keys) { - assert_eq!(*interned, interner.lookup(normalized).unwrap()); - } - - for (i, a) in normalized_keys.iter().enumerate() { - for (j, b) in normalized_keys.iter().enumerate() { - let interned_cmp = a.cmp(b); - let values_cmp = values[i].cmp(&values[j]); - assert_eq!( - interned_cmp, values_cmp, - "({:?} vs {:?}) vs ({} vs {})", - a, b, values[i], values[j] - ) - } - } - } - - #[test] - #[cfg_attr(miri, ignore)] - fn test_interner() { - test_intern_values(&[8, 6, 5, 7]); - - let mut values: Vec<_> = (0_u64..2000).collect(); - test_intern_values(&values); - - let mut rng = thread_rng(); - values.shuffle(&mut rng); - test_intern_values(&values); - } - - #[test] - fn test_intern_duplicates() { - // Unsorted with duplicates - let values = [0_u8, 1, 8, 4, 1, 0]; - let mut interner = OrderPreservingInterner::default(); - - let interned = interner.intern(values.iter().map(std::slice::from_ref).map(Some)); - let interned: Vec<_> = interned.into_iter().map(Option::unwrap).collect(); - - assert_eq!(interned[0], interned[5]); - assert_eq!(interned[1], interned[4]); - assert!( - interner.normalized_key(interned[0]) < interner.normalized_key(interned[1]) - ); - assert!( - interner.normalized_key(interned[1]) < interner.normalized_key(interned[2]) - ); - assert!( - interner.normalized_key(interned[1]) < interner.normalized_key(interned[3]) - ); - assert!( - interner.normalized_key(interned[3]) < interner.normalized_key(interned[2]) - ); - } - - #[test] - fn test_intern_sizes() { - let mut interner = OrderPreservingInterner::default(); - - // Intern a 1K values each 8 bytes large - let num_items = 1000; - let mut values: Vec = (0..num_items).collect(); - values.reverse(); - - // intern these values 1 at a time (otherwise the interner - // will sort them first); - for v in values { - interner.intern([Some(v.to_be_bytes())]); - } - - let reported = interner.size(); - - // Figure out the expected size (this is a second - // implementation of size()) as a double check - let min_expected = BucketWalker::new() - .visit_bucket(interner.bucket.as_ref()) - .memory_estimate() - // hash table size - + interner.lookup.capacity() * std::mem::size_of::() - // key/value storage - + interner.keys.buffer_size() - + interner.values.buffer_size(); - - assert!( - reported > min_expected, - "reported size {reported} not larger than min expected size: {min_expected}" - ) - } - - // Walks over the buckets / slots counting counting them all - struct BucketWalker { - num_buckets: usize, - num_slots: usize, - } - - impl BucketWalker { - fn new() -> Self { - Self 
{ - num_buckets: 0, - num_slots: 0, - } - } - - // recursively visit the bucket and any slots/buckets contained - fn visit_bucket(mut self, bucket: &Bucket) -> Self { - self.num_buckets += 1; - let acc = bucket - .slots - .iter() - .fold(self, |acc, slot| acc.visit_slot(slot)); - - if let Some(next) = bucket.next.as_ref() { - acc.visit_bucket(next.as_ref()) - } else { - acc - } - } - - // recursively visit slot and any slots/buckets - fn visit_slot(mut self, slot: &Slot) -> Self { - self.num_slots += 1; - if let Some(child) = slot.child.as_ref() { - self.visit_bucket(child.as_ref()) - } else { - self - } - } - - // estimate how much memory is used just for Buckets / Slots - // (an underestimate of the total memory used for the - // interner as it doesn't contain any actual values) - fn memory_estimate(self) -> usize { - self.num_buckets * std::mem::size_of::() - + self.num_slots * std::mem::size_of::() - } - } -} diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index bd1dd7256240..58dc42a4cacb 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -61,7 +61,7 @@ //! let arrays = vec![a1, a2]; //! //! // Convert arrays to rows -//! let mut converter = RowConverter::new(vec![ +//! let converter = RowConverter::new(vec![ //! SortField::new(DataType::Int32), //! SortField::new(DataType::Utf8), //! ]).unwrap(); @@ -109,7 +109,7 @@ //! .iter() //! .map(|a| SortField::new(a.data_type().clone())) //! .collect(); -//! let mut converter = RowConverter::new(fields).unwrap(); +//! let converter = RowConverter::new(fields).unwrap(); //! let rows = converter.convert_columns(arrays).unwrap(); //! let mut sort: Vec<_> = rows.iter().enumerate().collect(); //! sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); @@ -130,22 +130,16 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use arrow_array::cast::*; +use arrow_array::types::ArrowDictionaryKeyType; use arrow_array::*; use arrow_buffer::ArrowNativeType; use arrow_data::ArrayDataBuilder; use arrow_schema::*; -use crate::dictionary::{ - compute_dictionary_mapping, decode_dictionary, encode_dictionary, - encode_dictionary_values, -}; use crate::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive}; -use crate::interner::OrderPreservingInterner; use crate::variable::{decode_binary, decode_string}; -mod dictionary; mod fixed; -mod interner; mod list; mod variable; @@ -271,53 +265,7 @@ mod variable; /// /// ## Dictionary Encoding /// -/// [`RowConverter`] needs to support converting dictionary encoded arrays with unsorted, and -/// potentially distinct dictionaries. One simple mechanism to avoid this would be to reverse -/// the dictionary encoding, and encode the array values directly, however, this would lose -/// the benefits of dictionary encoding to reduce memory and CPU consumption. -/// -/// As such the [`RowConverter`] creates an order-preserving mapping -/// for each dictionary encoded column, which allows new dictionary -/// values to be added whilst preserving the sort order. -/// -/// A null dictionary value is encoded as `0_u8`. 
-/// -/// A non-null dictionary value is encoded as `1_u8` followed by a null-terminated byte array -/// key determined by the order-preserving dictionary encoding -/// -/// ```text -/// ┌──────────┐ ┌─────┐ -/// │ "Bar" │ ───────────────▶│ 01 │ -/// └──────────┘ └─────┘ -/// ┌──────────┐ ┌─────┬─────┐ -/// │"Fabulous"│ ───────────────▶│ 01 │ 02 │ -/// └──────────┘ └─────┴─────┘ -/// ┌──────────┐ ┌─────┐ -/// │ "Soup" │ ───────────────▶│ 05 │ -/// └──────────┘ └─────┘ -/// ┌──────────┐ ┌─────┐ -/// │ "ZZ" │ ───────────────▶│ 07 │ -/// └──────────┘ └─────┘ -/// -/// Example Order Preserving Mapping -/// ``` -/// Using the map above, the corresponding row format will be -/// -/// ```text -/// ┌─────┬─────┬─────┬─────┐ -/// "Fabulous" │ 01 │ 01 │ 02 │ 00 │ -/// └─────┴─────┴─────┴─────┘ -/// -/// ┌─────┬─────┬─────┐ -/// "ZZ" │ 01 │ 07 │ 00 │ -/// └─────┴─────┴─────┘ -/// -/// ┌─────┐ -/// NULL │ 00 │ -/// └─────┘ -/// -/// Input Row Format -/// ``` +/// Dictionaries are hydrated to their underlying values /// /// ## Struct Encoding /// @@ -426,15 +374,9 @@ pub struct RowConverter { enum Codec { /// No additional codec state is necessary Stateless, - /// The interner used to encode dictionary values - /// - /// Used when preserving the dictionary encoding - Dictionary(OrderPreservingInterner), /// A row converter for the dictionary values /// and the encoding of a row containing only nulls - /// - /// Used when not preserving dictionary encoding - DictionaryValues(RowConverter, OwnedRow), + Dictionary(RowConverter, OwnedRow), /// A row converter for the child fields /// and the encoding of a row containing only nulls Struct(RowConverter, OwnedRow), @@ -445,25 +387,22 @@ enum Codec { impl Codec { fn new(sort_field: &SortField) -> Result { match &sort_field.data_type { - DataType::Dictionary(_, values) => match sort_field.preserve_dictionaries { - true => Ok(Self::Dictionary(Default::default())), - false => { - let sort_field = SortField::new_with_options( - values.as_ref().clone(), - sort_field.options, - ); + DataType::Dictionary(_, values) => { + let sort_field = SortField::new_with_options( + values.as_ref().clone(), + sort_field.options, + ); - let mut converter = RowConverter::new(vec![sort_field])?; - let null_array = new_null_array(values.as_ref(), 1); - let nulls = converter.convert_columns(&[null_array])?; + let converter = RowConverter::new(vec![sort_field])?; + let null_array = new_null_array(values.as_ref(), 1); + let nulls = converter.convert_columns(&[null_array])?; - let owned = OwnedRow { - data: nulls.buffer.into(), - config: nulls.config, - }; - Ok(Self::DictionaryValues(converter, owned)) - } - }, + let owned = OwnedRow { + data: nulls.buffer.into(), + config: nulls.config, + }; + Ok(Self::Dictionary(converter, owned)) + } d if !d.is_nested() => Ok(Self::Stateless), DataType::List(f) | DataType::LargeList(f) => { // The encoded contents will be inverted if descending is set to true @@ -490,7 +429,7 @@ impl Codec { }) .collect(); - let mut converter = RowConverter::new(sort_fields)?; + let converter = RowConverter::new(sort_fields)?; let nulls: Vec<_> = f.iter().map(|x| new_null_array(x.data_type(), 1)).collect(); @@ -509,32 +448,13 @@ impl Codec { } } - fn encoder(&mut self, array: &dyn Array) -> Result, ArrowError> { + fn encoder(&self, array: &dyn Array) -> Result, ArrowError> { match self { Codec::Stateless => Ok(Encoder::Stateless), - Codec::Dictionary(interner) => { - let values = downcast_dictionary_array! 
{ - array => array.values(), - _ => unreachable!() - }; - - let mapping = compute_dictionary_mapping(interner, values) - .into_iter() - .map(|maybe_interned| { - maybe_interned.map(|interned| interner.normalized_key(interned)) - }) - .collect(); - - Ok(Encoder::Dictionary(mapping)) - } - Codec::DictionaryValues(converter, nulls) => { - let values = downcast_dictionary_array! { - array => array.values(), - _ => unreachable!() - }; - - let rows = converter.convert_columns(&[values.clone()])?; - Ok(Encoder::DictionaryValues(rows, nulls.row())) + Codec::Dictionary(converter, nulls) => { + let values = array.as_any_dictionary().values().clone(); + let rows = converter.convert_columns(&[values])?; + Ok(Encoder::Dictionary(rows, nulls.row())) } Codec::Struct(converter, null) => { let v = as_struct_array(array); @@ -556,10 +476,7 @@ impl Codec { fn size(&self) -> usize { match self { Codec::Stateless => 0, - Codec::Dictionary(interner) => interner.size(), - Codec::DictionaryValues(converter, nulls) => { - converter.size() + nulls.data.len() - } + Codec::Dictionary(converter, nulls) => converter.size() + nulls.data.len(), Codec::Struct(converter, nulls) => converter.size() + nulls.data.len(), Codec::List(converter) => converter.size(), } @@ -570,10 +487,8 @@ impl Codec { enum Encoder<'a> { /// No additional encoder state is necessary Stateless, - /// The mapping from dictionary keys to normalized keys - Dictionary(Vec>), /// The encoding of the child array and the encoding of a null row - DictionaryValues(Rows, Row<'a>), + Dictionary(Rows, Row<'a>), /// The row encoding of the child arrays and the encoding of a null row /// /// It is necessary to encode to a temporary [`Rows`] to avoid serializing @@ -591,8 +506,6 @@ pub struct SortField { options: SortOptions, /// Data type data_type: DataType, - /// Preserve dictionaries - preserve_dictionaries: bool, } impl SortField { @@ -603,30 +516,7 @@ impl SortField { /// Create a new column with the given data type and [`SortOptions`] pub fn new_with_options(data_type: DataType, options: SortOptions) -> Self { - Self { - options, - data_type, - preserve_dictionaries: true, - } - } - - /// By default dictionaries are preserved as described on [`RowConverter`] - /// - /// However, this process requires maintaining and incrementally updating - /// an order-preserving mapping of dictionary values. This is relatively expensive - /// computationally but reduces the size of the encoded rows, minimising memory - /// usage and potentially yielding faster comparisons. - /// - /// Some applications may wish to instead trade-off space efficiency, for improved - /// encoding performance, by instead encoding dictionary values directly - /// - /// When `preserve_dictionaries` is true, fields will instead be encoded as their - /// underlying value, reversing any dictionary encoding - pub fn preserve_dictionaries(self, preserve_dictionaries: bool) -> Self { - Self { - preserve_dictionaries, - ..self - } + Self { options, data_type } } /// Return size of this instance in bytes. 
@@ -679,7 +569,7 @@ impl RowConverter { /// # Panics /// /// Panics if the schema of `columns` does not match that provided to [`RowConverter::new`] - pub fn convert_columns(&mut self, columns: &[ArrayRef]) -> Result { + pub fn convert_columns(&self, columns: &[ArrayRef]) -> Result { let num_rows = columns.first().map(|x| x.len()).unwrap_or(0); let mut rows = self.empty_rows(num_rows, 0); self.append(&mut rows, columns)?; @@ -704,7 +594,7 @@ impl RowConverter { /// # use arrow_row::{Row, RowConverter, SortField}; /// # use arrow_schema::DataType; /// # - /// let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap(); + /// let converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap(); /// let a1 = StringArray::from(vec!["hello", "world"]); /// let a2 = StringArray::from(vec!["a", "a", "hello"]); /// @@ -717,7 +607,7 @@ impl RowConverter { /// assert_eq!(&values, &["hello", "world", "a", "a", "hello"]); /// ``` pub fn append( - &mut self, + &self, rows: &mut Rows, columns: &[ArrayRef], ) -> Result<(), ArrowError> { @@ -736,7 +626,7 @@ impl RowConverter { let encoders = columns .iter() - .zip(&mut self.codecs) + .zip(&self.codecs) .zip(self.fields.iter()) .map(|((column, codec), field)| { if !column.data_type().equals_datatype(&field.data_type) { @@ -844,7 +734,7 @@ impl RowConverter { /// # use arrow_row::{Row, RowConverter, SortField}; /// # use arrow_schema::DataType; /// # - /// let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap(); + /// let converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap(); /// let array = StringArray::from(vec!["hello", "world", "a", "a", "hello"]); /// /// // Convert to row format and deduplicate @@ -1234,20 +1124,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> Vec { _ => unreachable!(), } } - Encoder::Dictionary(dict) => { - downcast_dictionary_array! { - array => { - for (v, length) in array.keys().iter().zip(lengths.iter_mut()) { - match v.and_then(|v| dict[v as usize]) { - Some(k) => *length += k.len() + 1, - None => *length += 1, - } - } - } - _ => unreachable!(), - } - } - Encoder::DictionaryValues(values, null) => { + Encoder::Dictionary(values, null) => { downcast_dictionary_array! { array => { for (v, length) in array.keys().iter().zip(lengths.iter_mut()) { @@ -1323,13 +1200,7 @@ fn encode_column( _ => unreachable!(), } } - Encoder::Dictionary(dict) => { - downcast_dictionary_array! { - column => encode_dictionary(data, offsets, column, dict, opts), - _ => unreachable!() - } - } - Encoder::DictionaryValues(values, nulls) => { + Encoder::Dictionary(values, nulls) => { downcast_dictionary_array! { column => encode_dictionary_values(data, offsets, column, values, nulls), _ => unreachable!() @@ -1365,18 +1236,31 @@ fn encode_column( } } +/// Encode dictionary values not preserving the dictionary encoding +pub fn encode_dictionary_values( + data: &mut [u8], + offsets: &mut [usize], + column: &DictionaryArray, + values: &Rows, + null: &Row<'_>, +) { + for (offset, k) in offsets.iter_mut().skip(1).zip(column.keys()) { + let row = match k { + Some(k) => values.row(k.as_usize()).data, + None => null.data, + }; + let end_offset = *offset + row.len(); + data[*offset..end_offset].copy_from_slice(row); + *offset = end_offset; + } +} + macro_rules! decode_primitive_helper { ($t:ty, $rows:ident, $data_type:ident, $options:ident) => { Arc::new(decode_primitive::<$t>($rows, $data_type, $options)) }; } -macro_rules! 
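Taken together, the changes above make the converter stateless: `convert_columns` now takes `&self`, and dictionary columns are encoded by value, so decoding returns the hydrated value type rather than a dictionary. A small sketch of the resulting API (editorial illustration, not part of the patch):

```rust
use std::sync::Arc;

use arrow_array::types::Int32Type;
use arrow_array::{ArrayRef, DictionaryArray};
use arrow_row::{RowConverter, SortField};
use arrow_schema::DataType;

fn main() {
    let dict = DictionaryArray::<Int32Type>::from_iter([
        Some("foo"),
        None,
        Some("bar"),
        Some("foo"),
    ]);
    let col = Arc::new(dict) as ArrayRef;

    // `convert_columns` takes `&self`: there is no per-column interner state to update
    let converter =
        RowConverter::new(vec![SortField::new(col.data_type().clone())]).unwrap();
    let rows = converter.convert_columns(&[col]).unwrap();

    // Equal dictionary values encode to equal rows
    assert_eq!(rows.row(0), rows.row(3));

    // Converting back yields the hydrated value type, not a dictionary
    let back = converter.convert_rows(&rows).unwrap();
    assert_eq!(back[0].data_type(), &DataType::Utf8);
}
```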
decode_dictionary_helper { - ($t:ty, $interner:ident, $v:ident, $options:ident, $rows:ident) => { - Arc::new(decode_dictionary::<$t>($interner, $v, $options, $rows)?) - }; -} - /// Decodes a the provided `field` from `rows` /// /// # Safety @@ -1402,20 +1286,11 @@ unsafe fn decode_column( DataType::FixedSizeBinary(size) => Arc::new(decode_fixed_size_binary(rows, size, options)), DataType::Utf8 => Arc::new(decode_string::(rows, options, validate_utf8)), DataType::LargeUtf8 => Arc::new(decode_string::(rows, options, validate_utf8)), + DataType::Dictionary(_, _) => todo!(), _ => unreachable!() } } - Codec::Dictionary(interner) => { - let (k, v) = match &field.data_type { - DataType::Dictionary(k, v) => (k.as_ref(), v.as_ref()), - _ => unreachable!(), - }; - downcast_integer! { - k => (decode_dictionary_helper, interner, v, options, rows), - _ => unreachable!() - } - } - Codec::DictionaryValues(converter, _) => { + Codec::Dictionary(converter, _) => { let cols = converter.convert_raw(rows, validate_utf8)?; cols.into_iter().next().unwrap() } @@ -1487,7 +1362,7 @@ mod tests { ])) as ArrayRef, ]; - let mut converter = RowConverter::new(vec![ + let converter = RowConverter::new(vec![ SortField::new(DataType::Int16), SortField::new(DataType::Float32), ]) @@ -1529,9 +1404,10 @@ mod tests { #[test] fn test_decimal128() { - let mut converter = RowConverter::new(vec![SortField::new( - DataType::Decimal128(DECIMAL128_MAX_PRECISION, 7), - )]) + let converter = RowConverter::new(vec![SortField::new(DataType::Decimal128( + DECIMAL128_MAX_PRECISION, + 7, + ))]) .unwrap(); let col = Arc::new( Decimal128Array::from_iter([ @@ -1558,9 +1434,10 @@ mod tests { #[test] fn test_decimal256() { - let mut converter = RowConverter::new(vec![SortField::new( - DataType::Decimal256(DECIMAL256_MAX_PRECISION, 7), - )]) + let converter = RowConverter::new(vec![SortField::new(DataType::Decimal256( + DECIMAL256_MAX_PRECISION, + 7, + ))]) .unwrap(); let col = Arc::new( Decimal256Array::from_iter([ @@ -1589,7 +1466,7 @@ mod tests { #[test] fn test_bool() { - let mut converter = + let converter = RowConverter::new(vec![SortField::new(DataType::Boolean)]).unwrap(); let col = Arc::new(BooleanArray::from_iter([None, Some(false), Some(true)])) @@ -1603,7 +1480,7 @@ mod tests { let cols = converter.convert_rows(&rows).unwrap(); assert_eq!(&cols[0], &col); - let mut converter = RowConverter::new(vec![SortField::new_with_options( + let converter = RowConverter::new(vec![SortField::new_with_options( DataType::Boolean, SortOptions { descending: true, @@ -1626,7 +1503,7 @@ mod tests { .with_timezone("+01:00".to_string()); let d = a.data_type().clone(); - let mut converter = + let converter = RowConverter::new(vec![SortField::new(a.data_type().clone())]).unwrap(); let rows = converter.convert_columns(&[Arc::new(a) as _]).unwrap(); let back = converter.convert_rows(&rows).unwrap(); @@ -1644,29 +1521,23 @@ mod tests { let dict = a.finish(); let values = TimestampNanosecondArray::from(dict.values().to_data()); let dict_with_tz = dict.with_values(Arc::new(values.with_timezone("+02:00"))); - let d = DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Timestamp( - TimeUnit::Nanosecond, - Some("+02:00".into()), - )), - ); + let v = DataType::Timestamp(TimeUnit::Nanosecond, Some("+02:00".into())); + let d = DataType::Dictionary(Box::new(DataType::Int32), Box::new(v.clone())); assert_eq!(dict_with_tz.data_type(), &d); - let mut converter = RowConverter::new(vec![SortField::new(d.clone())]).unwrap(); + let converter = 
RowConverter::new(vec![SortField::new(d.clone())]).unwrap(); let rows = converter .convert_columns(&[Arc::new(dict_with_tz) as _]) .unwrap(); let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); - assert_eq!(back[0].data_type(), &d); + assert_eq!(back[0].data_type(), &v); } #[test] fn test_null_encoding() { let col = Arc::new(NullArray::new(10)); - let mut converter = - RowConverter::new(vec![SortField::new(DataType::Null)]).unwrap(); + let converter = RowConverter::new(vec![SortField::new(DataType::Null)]).unwrap(); let rows = converter.convert_columns(&[col]).unwrap(); assert_eq!(rows.num_rows(), 10); assert_eq!(rows.row(1).data.len(), 0); @@ -1682,8 +1553,7 @@ mod tests { Some(""), ])) as ArrayRef; - let mut converter = - RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap(); + let converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap(); let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); assert!(rows.row(1) < rows.row(0)); @@ -1714,7 +1584,7 @@ mod tests { Some(vec![0xFF_u8; variable::BLOCK_SIZE + 1]), ])) as ArrayRef; - let mut converter = + let converter = RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap(); let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); @@ -1734,7 +1604,7 @@ mod tests { let cols = converter.convert_rows(&rows).unwrap(); assert_eq!(&cols[0], &col); - let mut converter = RowConverter::new(vec![SortField::new_with_options( + let converter = RowConverter::new(vec![SortField::new_with_options( DataType::Binary, SortOptions { descending: true, @@ -1762,9 +1632,9 @@ mod tests { } /// If `exact` is false performs a logical comparison between a and dictionary-encoded b - fn dictionary_eq(exact: bool, a: &dyn Array, b: &dyn Array) { + fn dictionary_eq(a: &dyn Array, b: &dyn Array) { match b.data_type() { - DataType::Dictionary(_, v) if !exact => { + DataType::Dictionary(_, v) => { assert_eq!(a.data_type(), v.as_ref()); let b = arrow_cast::cast(b, v).unwrap(); assert_eq!(a, b.as_ref()) @@ -1775,11 +1645,6 @@ mod tests { #[test] fn test_string_dictionary() { - test_string_dictionary_impl(false); - test_string_dictionary_impl(true); - } - - fn test_string_dictionary_impl(preserve: bool) { let a = Arc::new(DictionaryArray::::from_iter([ Some("foo"), Some("hello"), @@ -1791,8 +1656,8 @@ mod tests { Some("hello"), ])) as ArrayRef; - let field = SortField::new(a.data_type().clone()).preserve_dictionaries(preserve); - let mut converter = RowConverter::new(vec![field]).unwrap(); + let field = SortField::new(a.data_type().clone()); + let converter = RowConverter::new(vec![field]).unwrap(); let rows_a = converter.convert_columns(&[Arc::clone(&a)]).unwrap(); assert!(rows_a.row(3) < rows_a.row(5)); @@ -1805,7 +1670,7 @@ mod tests { assert_eq!(rows_a.row(1), rows_a.row(7)); let cols = converter.convert_rows(&rows_a).unwrap(); - dictionary_eq(preserve, &cols[0], &a); + dictionary_eq(&cols[0], &a); let b = Arc::new(DictionaryArray::::from_iter([ Some("hello"), @@ -1819,16 +1684,15 @@ mod tests { assert!(rows_b.row(2) < rows_a.row(0)); let cols = converter.convert_rows(&rows_b).unwrap(); - dictionary_eq(preserve, &cols[0], &b); + dictionary_eq(&cols[0], &b); - let mut converter = RowConverter::new(vec![SortField::new_with_options( + let converter = RowConverter::new(vec![SortField::new_with_options( a.data_type().clone(), SortOptions { descending: true, nulls_first: false, }, - ) - .preserve_dictionaries(preserve)]) + )]) .unwrap(); let rows_c = 
converter.convert_columns(&[Arc::clone(&a)]).unwrap(); @@ -1838,16 +1702,15 @@ mod tests { assert!(rows_c.row(3) > rows_c.row(0)); let cols = converter.convert_rows(&rows_c).unwrap(); - dictionary_eq(preserve, &cols[0], &a); + dictionary_eq(&cols[0], &a); - let mut converter = RowConverter::new(vec![SortField::new_with_options( + let converter = RowConverter::new(vec![SortField::new_with_options( a.data_type().clone(), SortOptions { descending: true, nulls_first: true, }, - ) - .preserve_dictionaries(preserve)]) + )]) .unwrap(); let rows_c = converter.convert_columns(&[Arc::clone(&a)]).unwrap(); @@ -1857,7 +1720,7 @@ mod tests { assert!(rows_c.row(3) < rows_c.row(0)); let cols = converter.convert_rows(&rows_c).unwrap(); - dictionary_eq(preserve, &cols[0], &a); + dictionary_eq(&cols[0], &a); } #[test] @@ -1870,7 +1733,7 @@ mod tests { let s1 = Arc::new(StructArray::from(vec![(a_f, a), (u_f, u)])) as ArrayRef; let sort_fields = vec![SortField::new(s1.data_type().clone())]; - let mut converter = RowConverter::new(sort_fields).unwrap(); + let converter = RowConverter::new(sort_fields).unwrap(); let r1 = converter.convert_columns(&[Arc::clone(&s1)]).unwrap(); for (a, b) in r1.iter().zip(r1.iter().skip(1)) { @@ -1919,16 +1782,14 @@ mod tests { let data_type = a.data_type().clone(); let columns = [Arc::new(a) as ArrayRef]; - for preserve in [true, false] { - let field = SortField::new(data_type.clone()).preserve_dictionaries(preserve); - let mut converter = RowConverter::new(vec![field]).unwrap(); - let rows = converter.convert_columns(&columns).unwrap(); - assert!(rows.row(0) < rows.row(1)); - assert!(rows.row(2) < rows.row(0)); - assert!(rows.row(3) < rows.row(2)); - assert!(rows.row(6) < rows.row(2)); - assert!(rows.row(3) < rows.row(6)); - } + let field = SortField::new(data_type.clone()); + let converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&columns).unwrap(); + assert!(rows.row(0) < rows.row(1)); + assert!(rows.row(2) < rows.row(0)); + assert!(rows.row(3) < rows.row(2)); + assert!(rows.row(6) < rows.row(2)); + assert!(rows.row(3) < rows.row(6)); } #[test] @@ -1949,22 +1810,20 @@ mod tests { .unwrap(); let columns = [Arc::new(DictionaryArray::::from(data)) as ArrayRef]; - for preserve in [true, false] { - let field = SortField::new(data_type.clone()).preserve_dictionaries(preserve); - let mut converter = RowConverter::new(vec![field]).unwrap(); - let rows = converter.convert_columns(&columns).unwrap(); - - assert_eq!(rows.row(0), rows.row(1)); - assert_eq!(rows.row(3), rows.row(4)); - assert_eq!(rows.row(4), rows.row(5)); - assert!(rows.row(3) < rows.row(0)); - } + let field = SortField::new(data_type.clone()); + let converter = RowConverter::new(vec![field]).unwrap(); + let rows = converter.convert_columns(&columns).unwrap(); + + assert_eq!(rows.row(0), rows.row(1)); + assert_eq!(rows.row(3), rows.row(4)); + assert_eq!(rows.row(4), rows.row(5)); + assert!(rows.row(3) < rows.row(0)); } #[test] #[should_panic(expected = "Encountered non UTF-8 data")] fn test_invalid_utf8() { - let mut converter = + let converter = RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap(); let array = Arc::new(BinaryArray::from_iter_values([&[0xFF]])) as _; let rows = converter.convert_columns(&[array]).unwrap(); @@ -1981,8 +1840,7 @@ mod tests { #[should_panic(expected = "rows were not produced by this RowConverter")] fn test_different_converter() { let values = Arc::new(Int32Array::from_iter([Some(1), Some(-1)])); - let mut converter = - 
RowConverter::new(vec![SortField::new(DataType::Int32)]).unwrap(); + let converter = RowConverter::new(vec![SortField::new(DataType::Int32)]).unwrap(); let rows = converter.convert_columns(&[values]).unwrap(); let converter = RowConverter::new(vec![SortField::new(DataType::Int32)]).unwrap(); @@ -2013,7 +1871,7 @@ mod tests { let list = Arc::new(builder.finish()) as ArrayRef; let d = list.data_type().clone(); - let mut converter = RowConverter::new(vec![SortField::new(d.clone())]).unwrap(); + let converter = RowConverter::new(vec![SortField::new(d.clone())]).unwrap(); let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12] @@ -2033,7 +1891,7 @@ mod tests { nulls_first: false, }; let field = SortField::new_with_options(d.clone(), options); - let mut converter = RowConverter::new(vec![field]).unwrap(); + let converter = RowConverter::new(vec![field]).unwrap(); let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12] @@ -2053,7 +1911,7 @@ mod tests { nulls_first: false, }; let field = SortField::new_with_options(d.clone(), options); - let mut converter = RowConverter::new(vec![field]).unwrap(); + let converter = RowConverter::new(vec![field]).unwrap(); let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); assert!(rows.row(0) < rows.row(1)); // [32, 52, 32] < [32, 52, 12] @@ -2073,7 +1931,7 @@ mod tests { nulls_first: true, }; let field = SortField::new_with_options(d, options); - let mut converter = RowConverter::new(vec![field]).unwrap(); + let converter = RowConverter::new(vec![field]).unwrap(); let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); assert!(rows.row(0) < rows.row(1)); // [32, 52, 32] < [32, 52, 12] @@ -2137,7 +1995,7 @@ mod tests { nulls_first: true, }; let field = SortField::new_with_options(d.clone(), options); - let mut converter = RowConverter::new(vec![field]).unwrap(); + let converter = RowConverter::new(vec![field]).unwrap(); let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); assert!(rows.row(0) > rows.row(1)); @@ -2156,7 +2014,7 @@ mod tests { nulls_first: true, }; let field = SortField::new_with_options(d.clone(), options); - let mut converter = RowConverter::new(vec![field]).unwrap(); + let converter = RowConverter::new(vec![field]).unwrap(); let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); assert!(rows.row(0) > rows.row(1)); @@ -2175,7 +2033,7 @@ mod tests { nulls_first: false, }; let field = SortField::new_with_options(d, options); - let mut converter = RowConverter::new(vec![field]).unwrap(); + let converter = RowConverter::new(vec![field]).unwrap(); let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); assert!(rows.row(0) < rows.row(1)); @@ -2202,35 +2060,6 @@ mod tests { test_nested_list::(); } - #[test] - fn test_dictionary_preserving() { - let mut dict = StringDictionaryBuilder::::new(); - dict.append_value("foo"); - dict.append_value("foo"); - dict.append_value("bar"); - dict.append_value("bar"); - dict.append_value("bar"); - dict.append_value("bar"); - - let array = Arc::new(dict.finish()) as ArrayRef; - let preserve = SortField::new(array.data_type().clone()); - let non_preserve = preserve.clone().preserve_dictionaries(false); - - let mut c1 = RowConverter::new(vec![preserve]).unwrap(); - let r1 = c1.convert_columns(&[array.clone()]).unwrap(); - - let mut c2 = RowConverter::new(vec![non_preserve]).unwrap(); - let r2 = 
c2.convert_columns(&[array.clone()]).unwrap(); - - for r in r1.iter() { - assert_eq!(r.data.len(), 3); - } - - for r in r2.iter() { - assert_eq!(r.data.len(), 10); - } - } - fn generate_primitive_array(len: usize, valid_percent: f64) -> PrimitiveArray where K: ArrowPrimitiveType, @@ -2386,21 +2215,15 @@ mod tests { }) .collect(); - let preserve: Vec<_> = (0..num_columns).map(|_| rng.gen_bool(0.5)).collect(); - let comparator = LexicographicalComparator::try_new(&sort_columns).unwrap(); let columns = options .into_iter() .zip(&arrays) - .zip(&preserve) - .map(|((o, a), p)| { - SortField::new_with_options(a.data_type().clone(), o) - .preserve_dictionaries(*p) - }) + .map(|(o, a)| SortField::new_with_options(a.data_type().clone(), o)) .collect(); - let mut converter = RowConverter::new(columns).unwrap(); + let converter = RowConverter::new(columns).unwrap(); let rows = converter.convert_columns(&arrays).unwrap(); for i in 0..len { @@ -2423,17 +2246,16 @@ mod tests { } let back = converter.convert_rows(&rows).unwrap(); - for ((actual, expected), preserve) in back.iter().zip(&arrays).zip(preserve) { + for (actual, expected) in back.iter().zip(&arrays) { actual.to_data().validate_full().unwrap(); - dictionary_eq(preserve, actual, expected) + dictionary_eq(actual, expected) } } } #[test] fn test_clear() { - let mut converter = - RowConverter::new(vec![SortField::new(DataType::Int32)]).unwrap(); + let converter = RowConverter::new(vec![SortField::new(DataType::Int32)]).unwrap(); let mut rows = converter.empty_rows(3, 128); let first = Int32Array::from(vec![None, Some(2), Some(4)]); @@ -2463,7 +2285,7 @@ mod tests { fn test_append_codec_dictionary_binary() { use DataType::*; // Dictionary RowConverter - let mut converter = RowConverter::new(vec![SortField::new(Dictionary( + let converter = RowConverter::new(vec![SortField::new(Dictionary( Box::new(Int32), Box::new(Binary), ))]) @@ -2484,6 +2306,6 @@ mod tests { converter.append(&mut rows, &[array.clone()]).unwrap(); let back = converter.convert_rows(&rows).unwrap(); - assert_eq!(&back[0], &array); + dictionary_eq(&back[0], &array); } } diff --git a/arrow/benches/lexsort.rs b/arrow/benches/lexsort.rs index 30dab9a74667..25b2279be8d6 100644 --- a/arrow/benches/lexsort.rs +++ b/arrow/benches/lexsort.rs @@ -100,7 +100,7 @@ fn do_bench(c: &mut Criterion, columns: &[Column], len: usize) { .iter() .map(|a| SortField::new(a.data_type().clone())) .collect(); - let mut converter = RowConverter::new(fields).unwrap(); + let converter = RowConverter::new(fields).unwrap(); let rows = converter.convert_columns(&arrays).unwrap(); let mut sort: Vec<_> = rows.iter().enumerate().collect(); sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs index 12ce71764f7e..bde117e3ec3e 100644 --- a/arrow/benches/row_format.rs +++ b/arrow/benches/row_format.rs @@ -23,35 +23,28 @@ use arrow::array::ArrayRef; use arrow::datatypes::{Int64Type, UInt64Type}; use arrow::row::{RowConverter, SortField}; use arrow::util::bench_util::{ - create_primitive_array, create_string_array_with_len, create_string_dict_array, + create_dict_from_values, create_primitive_array, create_string_array_with_len, + create_string_dict_array, }; use arrow_array::types::Int32Type; use arrow_array::Array; use criterion::{black_box, Criterion}; use std::sync::Arc; -fn do_bench( - c: &mut Criterion, - name: &str, - cols: Vec, - preserve_dictionaries: bool, -) { +fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { let fields: Vec<_> = cols 
.iter() - .map(|x| { - SortField::new(x.data_type().clone()) - .preserve_dictionaries(preserve_dictionaries) - }) + .map(|x| SortField::new(x.data_type().clone())) .collect(); c.bench_function(&format!("convert_columns {name}"), |b| { b.iter(|| { - let mut converter = RowConverter::new(fields.clone()).unwrap(); + let converter = RowConverter::new(fields.clone()).unwrap(); black_box(converter.convert_columns(&cols).unwrap()) }); }); - let mut converter = RowConverter::new(fields).unwrap(); + let converter = RowConverter::new(fields).unwrap(); let rows = converter.convert_columns(&cols).unwrap(); // using a pre-prepared row converter should be faster than the first time c.bench_function(&format!("convert_columns_prepared {name}"), |b| { @@ -65,46 +58,57 @@ fn do_bench( fn row_bench(c: &mut Criterion) { let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; - do_bench(c, "4096 u64(0)", cols, true); + do_bench(c, "4096 u64(0)", cols); let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; - do_bench(c, "4096 i64(0)", cols, true); + do_bench(c, "4096 i64(0)", cols); let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 10)) as ArrayRef]; - do_bench(c, "4096 string(10, 0)", cols, true); + do_bench(c, "4096 string(10, 0)", cols); let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef]; - do_bench(c, "4096 string(30, 0)", cols, true); + do_bench(c, "4096 string(30, 0)", cols); let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 100)) as ArrayRef]; - do_bench(c, "4096 string(100, 0)", cols, true); + do_bench(c, "4096 string(100, 0)", cols); let cols = vec![Arc::new(create_string_array_with_len::(4096, 0.5, 100)) as ArrayRef]; - do_bench(c, "4096 string(100, 0.5)", cols, true); + do_bench(c, "4096 string(100, 0.5)", cols); let cols = vec![Arc::new(create_string_dict_array::(4096, 0., 10)) as ArrayRef]; - do_bench(c, "4096 string_dictionary(10, 0)", cols, true); + do_bench(c, "4096 string_dictionary(10, 0)", cols); let cols = vec![Arc::new(create_string_dict_array::(4096, 0., 30)) as ArrayRef]; - do_bench(c, "4096 string_dictionary(30, 0)", cols, true); + do_bench(c, "4096 string_dictionary(30, 0)", cols); let cols = vec![Arc::new(create_string_dict_array::(4096, 0., 100)) as ArrayRef]; - do_bench(c, "4096 string_dictionary(100, 0)", cols.clone(), true); - let name = "4096 string_dictionary_non_preserving(100, 0)"; - do_bench(c, name, cols, false); + do_bench(c, "4096 string_dictionary(100, 0)", cols.clone()); let cols = vec![Arc::new(create_string_dict_array::(4096, 0.5, 100)) as ArrayRef]; - do_bench(c, "4096 string_dictionary(100, 0.5)", cols.clone(), true); - let name = "4096 string_dictionary_non_preserving(100, 0.5)"; - do_bench(c, name, cols, false); + do_bench(c, "4096 string_dictionary(100, 0.5)", cols.clone()); + + let values = create_string_array_with_len::(10, 0., 10); + let dict = create_dict_from_values::(4096, 0., &values); + let cols = vec![Arc::new(dict) as ArrayRef]; + do_bench(c, "4096 string_dictionary_low_cardinality(10, 0)", cols); + + let values = create_string_array_with_len::(10, 0., 30); + let dict = create_dict_from_values::(4096, 0., &values); + let cols = vec![Arc::new(dict) as ArrayRef]; + do_bench(c, "4096 string_dictionary_low_cardinality(30, 0)", cols); + + let values = create_string_array_with_len::(10, 0., 100); + let dict = create_dict_from_values::(4096, 0., &values); + let cols = vec![Arc::new(dict) as ArrayRef]; + do_bench(c, "4096 string_dictionary_low_cardinality(100, 
0)", cols); let cols = vec![ Arc::new(create_string_array_with_len::(4096, 0.5, 20)) as ArrayRef, @@ -116,7 +120,6 @@ fn row_bench(c: &mut Criterion) { c, "4096 string(20, 0.5), string(30, 0), string(100, 0), i64(0)", cols, - false, ); let cols = vec![ @@ -125,7 +128,7 @@ fn row_bench(c: &mut Criterion) { Arc::new(create_string_dict_array::(4096, 0., 100)) as ArrayRef, Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef, ]; - do_bench(c, "4096 4096 string_dictionary(20, 0.5), string_dictionary(30, 0), string_dictionary(100, 0), i64(0)", cols, false); + do_bench(c, "4096 4096 string_dictionary(20, 0.5), string_dictionary(30, 0), string_dictionary(100, 0), i64(0)", cols); } criterion_group!(benches, row_bench); From 7b785310c055a0e3884a6aec51d3587057f9db31 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 17 Sep 2023 14:21:35 +0100 Subject: [PATCH 1214/1411] Update chrono pin (#4824) --- Cargo.toml | 2 ++ arrow-arith/Cargo.toml | 2 +- arrow-array/Cargo.toml | 2 +- arrow-cast/Cargo.toml | 2 +- arrow-csv/Cargo.toml | 2 +- arrow-json/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- object_store/Cargo.toml | 2 +- parquet/Cargo.toml | 2 +- parquet_derive_test/Cargo.toml | 2 +- 10 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b118c937ca36..804fdf5807ee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -91,3 +91,5 @@ arrow-schema = { version = "46.0.0", path = "./arrow-schema" } arrow-select = { version = "46.0.0", path = "./arrow-select" } arrow-string = { version = "46.0.0", path = "./arrow-string" } parquet = { version = "46.0.0", path = "./parquet", default-features = false } + +chrono = { version = "0.4.31", default-features = false, features = ["clock"] } diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index b5ea2e3c4354..57dc033e9645 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -38,7 +38,7 @@ arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } -chrono = { version = "0.4.23", default-features = false } +chrono = { workspace = true } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 80a6eb3f541e..4f7ab24f9708 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -44,7 +44,7 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] arrow-buffer = { workspace = true } arrow-schema = { workspace = true } arrow-data = { workspace = true } -chrono = { version = "0.4.24", default-features = false, features = ["clock"] } +chrono = { workspace = true } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4.1", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 2758a4817814..2e0a9fdd4ebd 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -45,7 +45,7 @@ arrow-buffer = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } -chrono = { version = "0.4.23", default-features = false, features = ["clock"] } +chrono = { workspace = true } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = 
{ version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 1f1a762d5065..66a6d7dbcaa5 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -39,7 +39,7 @@ arrow-buffer = { workspace = true } arrow-cast = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } -chrono = { version = "0.4.23", default-features = false, features = ["clock"] } +chrono = { workspace = true } csv = { version = "1.1", default-features = false } csv-core = { version = "0.1" } lazy_static = { version = "1.4", default-features = false } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 137d53557790..977ed4390c99 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -44,7 +44,7 @@ indexmap = { version = "2.0", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } serde = { version = "1.0", default-features = false } serde_json = { version = "1.0", default-features = false, features = ["std"] } -chrono = { version = "0.4.23", default-features = false, features = ["clock"] } +chrono = { workspace = true } lexical-core = { version = "0.8", default-features = false } [dev-dependencies] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index bc75207c2230..8abb4f73a384 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -88,7 +88,7 @@ ffi = ["arrow-schema/ffi", "arrow-data/ffi"] chrono-tz = ["arrow-array/chrono-tz"] [dev-dependencies] -chrono = { version = "0.4.23", default-features = false, features = ["clock"] } +chrono = { workspace = true } criterion = { version = "0.5", default-features = false } half = { version = "2.1", default-features = false } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index b8d4391321fd..72722df5483a 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -32,7 +32,7 @@ all-features = true [dependencies] # In alphabetical order async-trait = "0.1.53" bytes = "1.0" -chrono = { version = "0.4.23", default-features = false, features = ["clock"] } +chrono = { version = "0.4.31", default-features = false, features = ["clock"] } futures = "0.3" humantime = "2.1" itertools = "0.11.0" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index c4f3696b43c9..7c346248acbb 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -53,7 +53,7 @@ brotli = { version = "3.3", default-features = false, features = ["std"], option flate2 = { version = "1.0", default-features = false, features = ["rust_backend"], optional = true } lz4 = { version = "1.23", default-features = false, optional = true } zstd = { version = "0.12.0", optional = true, default-features = false } -chrono = { version = "0.4.23", default-features = false, features = ["alloc"] } +chrono = { workspace = true } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } base64 = { version = "0.21", default-features = false, features = ["std", ], optional = true } diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index be24db85a109..a5d2e76d4503 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -31,4 +31,4 @@ rust-version = { workspace = true } [dependencies] parquet = { workspace = true } parquet_derive = { path = "../parquet_derive", default-features 
= false } -chrono = { version="0.4.23", default-features = false, features = [ "clock" ] } +chrono = { workspace = true } From a6dffca082386eb63d2e3c25635b4ddb22479274 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 17 Sep 2023 14:59:42 +0100 Subject: [PATCH 1215/1411] Update prost (#4825) --- arrow-flight/Cargo.toml | 4 +- arrow-flight/gen/Cargo.toml | 4 +- arrow-flight/src/arrow.flight.protocol.rs | 44 +++-- arrow-flight/src/lib.rs | 10 -- .../src/sql/arrow.flight.protocol.sql.rs | 150 +++++++++--------- arrow-integration-testing/Cargo.toml | 4 +- 6 files changed, 109 insertions(+), 107 deletions(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 1a53dbddb13d..29a8109d8889 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -44,9 +44,9 @@ bytes = { version = "1", default-features = false } futures = { version = "0.3", default-features = false, features = ["alloc"] } once_cell = { version = "1", optional = true } paste = { version = "1.0" } -prost = { version = "0.11", default-features = false, features = ["prost-derive"] } +prost = { version = "0.12.1", default-features = false, features = ["prost-derive"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } -tonic = { version = "0.9", default-features = false, features = ["transport", "codegen", "prost"] } +tonic = { version = "0.10.0", default-features = false, features = ["transport", "codegen", "prost"] } # CLI-related dependencies clap = { version = "4.1", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 13a93cce853f..c342170c5b72 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -33,5 +33,5 @@ publish = false # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.67", default-features = false } -prost-build = { version = "=0.11.9", default-features = false } -tonic-build = { version = "=0.9.2", default-features = false, features = ["transport", "prost"] } +prost-build = { version = "=0.12.1", default-features = false } +tonic-build = { version = "=0.10.0", default-features = false, features = ["transport", "prost"] } diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs index 10dc7ace0356..e76013bd7c5f 100644 --- a/arrow-flight/src/arrow.flight.protocol.rs +++ b/arrow-flight/src/arrow.flight.protocol.rs @@ -685,7 +685,7 @@ pub mod flight_service_server { #[async_trait] pub trait FlightService: Send + Sync + 'static { /// Server streaming response type for the Handshake method. - type HandshakeStream: futures_core::Stream< + type HandshakeStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -700,7 +700,7 @@ pub mod flight_service_server { request: tonic::Request>, ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the ListFlights method. - type ListFlightsStream: futures_core::Stream< + type ListFlightsStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -744,7 +744,7 @@ pub mod flight_service_server { request: tonic::Request, ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the DoGet method. 
- type DoGetStream: futures_core::Stream< + type DoGetStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -759,7 +759,7 @@ pub mod flight_service_server { request: tonic::Request, ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the DoPut method. - type DoPutStream: futures_core::Stream< + type DoPutStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -776,7 +776,7 @@ pub mod flight_service_server { request: tonic::Request>, ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the DoExchange method. - type DoExchangeStream: futures_core::Stream< + type DoExchangeStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -792,7 +792,7 @@ pub mod flight_service_server { request: tonic::Request>, ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the DoAction method. - type DoActionStream: futures_core::Stream< + type DoActionStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -809,7 +809,7 @@ pub mod flight_service_server { request: tonic::Request, ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the ListActions method. - type ListActionsStream: futures_core::Stream< + type ListActionsStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -930,7 +930,9 @@ pub mod flight_service_server { >, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = async move { (*inner).handshake(request).await }; + let fut = async move { + ::handshake(&inner, request).await + }; Box::pin(fut) } } @@ -976,7 +978,7 @@ pub mod flight_service_server { ) -> Self::Future { let inner = Arc::clone(&self.0); let fut = async move { - (*inner).list_flights(request).await + ::list_flights(&inner, request).await }; Box::pin(fut) } @@ -1022,7 +1024,7 @@ pub mod flight_service_server { ) -> Self::Future { let inner = Arc::clone(&self.0); let fut = async move { - (*inner).get_flight_info(request).await + ::get_flight_info(&inner, request).await }; Box::pin(fut) } @@ -1067,7 +1069,9 @@ pub mod flight_service_server { request: tonic::Request, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = async move { (*inner).get_schema(request).await }; + let fut = async move { + ::get_schema(&inner, request).await + }; Box::pin(fut) } } @@ -1112,7 +1116,9 @@ pub mod flight_service_server { request: tonic::Request, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = async move { (*inner).do_get(request).await }; + let fut = async move { + ::do_get(&inner, request).await + }; Box::pin(fut) } } @@ -1157,7 +1163,9 @@ pub mod flight_service_server { request: tonic::Request>, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = async move { (*inner).do_put(request).await }; + let fut = async move { + ::do_put(&inner, request).await + }; Box::pin(fut) } } @@ -1202,7 +1210,9 @@ pub mod flight_service_server { request: tonic::Request>, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = async move { (*inner).do_exchange(request).await }; + let fut = async move { + ::do_exchange(&inner, request).await + }; Box::pin(fut) } } @@ -1247,7 +1257,9 @@ pub mod flight_service_server { request: tonic::Request, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = async move { (*inner).do_action(request).await }; + let fut = async move { + ::do_action(&inner, request).await + }; Box::pin(fut) } } @@ -1293,7 
+1305,7 @@ pub mod flight_service_server { ) -> Self::Future { let inner = Arc::clone(&self.0); let fut = async move { - (*inner).list_actions(request).await + ::list_actions(&inner, request).await }; Box::pin(fut) } diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 04edf266389c..3035f109c685 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -316,16 +316,6 @@ impl TryFrom> for SchemaResult { } } -// TryFrom... - -impl TryFrom for DescriptorType { - type Error = ArrowError; - - fn try_from(value: i32) -> ArrowResult { - value.try_into() - } -} - impl TryFrom> for IpcMessage { type Error = ArrowError; diff --git a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs index b2137d8543d3..c7c23311e61e 100644 --- a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs +++ b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs @@ -1077,10 +1077,10 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported commands. /// /// For instance: - /// - return 0 (\b0) => [] (GROUP BY is unsupported); + /// - return 0 (\b0) => \[\] (GROUP BY is unsupported); /// - return 1 (\b1) => \[SQL_GROUP_BY_UNRELATED\]; /// - return 2 (\b10) => \[SQL_GROUP_BY_BEYOND_SELECT\]; - /// - return 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]. + /// - return 3 (\b11) => \[SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT\]. /// Valid GROUP BY types are described under `arrow.flight.protocol.sql.SqlSupportedGroupBy`. SqlSupportedGroupBy = 522, /// @@ -1104,14 +1104,14 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported grammar levels. /// /// For instance: - /// - return 0 (\b0) => [] (SQL grammar is unsupported); + /// - return 0 (\b0) => \[\] (SQL grammar is unsupported); /// - return 1 (\b1) => \[SQL_MINIMUM_GRAMMAR\]; /// - return 2 (\b10) => \[SQL_CORE_GRAMMAR\]; - /// - return 3 (\b11) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR]; + /// - return 3 (\b11) => \[SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR\]; /// - return 4 (\b100) => \[SQL_EXTENDED_GRAMMAR\]; - /// - return 5 (\b101) => [SQL_MINIMUM_GRAMMAR, SQL_EXTENDED_GRAMMAR]; - /// - return 6 (\b110) => [SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]; - /// - return 7 (\b111) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]. + /// - return 5 (\b101) => \[SQL_MINIMUM_GRAMMAR, SQL_EXTENDED_GRAMMAR\]; + /// - return 6 (\b110) => \[SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR\]; + /// - return 7 (\b111) => \[SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR\]. /// Valid SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedSqlGrammar`. SqlSupportedGrammar = 525, /// @@ -1121,14 +1121,14 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported commands. /// /// For instance: - /// - return 0 (\b0) => [] (ANSI92 SQL grammar is unsupported); + /// - return 0 (\b0) => \[\] (ANSI92 SQL grammar is unsupported); /// - return 1 (\b1) => \[ANSI92_ENTRY_SQL\]; /// - return 2 (\b10) => \[ANSI92_INTERMEDIATE_SQL\]; - /// - return 3 (\b11) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL]; + /// - return 3 (\b11) => \[ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL\]; /// - return 4 (\b100) => \[ANSI92_FULL_SQL\]; - /// - return 5 (\b101) => [ANSI92_ENTRY_SQL, ANSI92_FULL_SQL]; - /// - return 6 (\b110) => [ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]; - /// - return 7 (\b111) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]. 
+ /// - return 5 (\b101) => \[ANSI92_ENTRY_SQL, ANSI92_FULL_SQL\]; + /// - return 6 (\b110) => \[ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL\]; + /// - return 7 (\b111) => \[ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL\]. /// Valid ANSI92 SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedAnsi92SqlGrammarLevel`. SqlAnsi92SupportedLevel = 526, /// @@ -1165,14 +1165,14 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported actions for a SQL schema. /// /// For instance: - /// - return 0 (\b0) => [] (no supported actions for SQL schema); + /// - return 0 (\b0) => \[\] (no supported actions for SQL schema); /// - return 1 (\b1) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS\]; /// - return 2 (\b10) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; - /// - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + /// - return 3 (\b11) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; /// - return 4 (\b100) => \[SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; - /// - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. + /// - return 5 (\b101) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; + /// - return 6 (\b110) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; + /// - return 7 (\b111) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]. /// Valid actions for a SQL schema described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. SqlSchemasSupportedActions = 533, /// @@ -1182,14 +1182,14 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported actions for a SQL catalog. /// /// For instance: - /// - return 0 (\b0) => [] (no supported actions for SQL catalog); + /// - return 0 (\b0) => \[\] (no supported actions for SQL catalog); /// - return 1 (\b1) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS\]; /// - return 2 (\b10) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; - /// - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + /// - return 3 (\b11) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; /// - return 4 (\b100) => \[SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; - /// - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. + /// - return 5 (\b101) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; + /// - return 6 (\b110) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; + /// - return 7 (\b111) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]. /// Valid actions for a SQL catalog are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. 
SqlCatalogsSupportedActions = 534, /// @@ -1199,10 +1199,10 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported SQL positioned commands. /// /// For instance: - /// - return 0 (\b0) => [] (no supported SQL positioned commands); + /// - return 0 (\b0) => \[\] (no supported SQL positioned commands); /// - return 1 (\b1) => \[SQL_POSITIONED_DELETE\]; /// - return 2 (\b10) => \[SQL_POSITIONED_UPDATE\]; - /// - return 3 (\b11) => [SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE]. + /// - return 3 (\b11) => \[SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE\]. /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedPositionedCommands`. SqlSupportedPositionedCommands = 535, /// @@ -1227,22 +1227,22 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported SQL subqueries. /// /// For instance: - /// - return 0 (\b0) => [] (no supported SQL subqueries); + /// - return 0 (\b0) => \[\] (no supported SQL subqueries); /// - return 1 (\b1) => \[SQL_SUBQUERIES_IN_COMPARISONS\]; /// - return 2 (\b10) => \[SQL_SUBQUERIES_IN_EXISTS\]; - /// - return 3 (\b11) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS]; + /// - return 3 (\b11) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS\]; /// - return 4 (\b100) => \[SQL_SUBQUERIES_IN_INS\]; - /// - return 5 (\b101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS]; - /// - return 6 (\b110) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_EXISTS]; - /// - return 7 (\b111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS]; + /// - return 5 (\b101) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS\]; + /// - return 6 (\b110) => \[SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_EXISTS\]; + /// - return 7 (\b111) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS\]; /// - return 8 (\b1000) => \[SQL_SUBQUERIES_IN_QUANTIFIEDS\]; - /// - return 9 (\b1001) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 10 (\b1010) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 11 (\b1011) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 12 (\b1100) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 13 (\b1101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 14 (\b1110) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 15 (\b1111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 9 (\b1001) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 10 (\b1010) => \[SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 11 (\b1011) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 12 (\b1100) => \[SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 13 (\b1101) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 14 (\b1110) => \[SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 15 (\b1111) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; /// - ... 
/// Valid SQL subqueries are described under `arrow.flight.protocol.sql.SqlSupportedSubqueries`. SqlSupportedSubqueries = 538, @@ -1260,10 +1260,10 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported SQL UNIONs. /// /// For instance: - /// - return 0 (\b0) => [] (no supported SQL positioned commands); + /// - return 0 (\b0) => \[\] (no supported SQL positioned commands); /// - return 1 (\b1) => \[SQL_UNION\]; /// - return 2 (\b10) => \[SQL_UNION_ALL\]; - /// - return 3 (\b11) => [SQL_UNION, SQL_UNION_ALL]. + /// - return 3 (\b11) => \[SQL_UNION, SQL_UNION_ALL\]. /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedUnions`. SqlSupportedUnions = 540, /// Retrieves a int64 value representing the maximum number of hex characters allowed in an inline binary literal. @@ -1341,22 +1341,22 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported transactions isolation levels. /// /// For instance: - /// - return 0 (\b0) => [] (no supported SQL transactions isolation levels); + /// - return 0 (\b0) => \[\] (no supported SQL transactions isolation levels); /// - return 1 (\b1) => \[SQL_TRANSACTION_NONE\]; /// - return 2 (\b10) => \[SQL_TRANSACTION_READ_UNCOMMITTED\]; - /// - return 3 (\b11) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED]; + /// - return 3 (\b11) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED\]; /// - return 4 (\b100) => \[SQL_TRANSACTION_REPEATABLE_READ\]; - /// - return 5 (\b101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 6 (\b110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 7 (\b111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 5 (\b101) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 6 (\b110) => \[SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 7 (\b111) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ\]; /// - return 8 (\b1000) => \[SQL_TRANSACTION_REPEATABLE_READ\]; - /// - return 9 (\b1001) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 10 (\b1010) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 11 (\b1011) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 12 (\b1100) => [SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 13 (\b1101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 14 (\b1110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 15 (\b1111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 9 (\b1001) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 10 (\b1010) => \[SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 11 (\b1011) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 12 (\b1100) => \[SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 13 (\b1101) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ, 
SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 14 (\b1110) => \[SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 15 (\b1111) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ\]; /// - return 16 (\b10000) => \[SQL_TRANSACTION_SERIALIZABLE\]; /// - ... /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. @@ -1381,14 +1381,14 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported result set types. /// /// For instance: - /// - return 0 (\b0) => [] (no supported result set types); + /// - return 0 (\b0) => \[\] (no supported result set types); /// - return 1 (\b1) => \[SQL_RESULT_SET_TYPE_UNSPECIFIED\]; /// - return 2 (\b10) => \[SQL_RESULT_SET_TYPE_FORWARD_ONLY\]; - /// - return 3 (\b11) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY]; + /// - return 3 (\b11) => \[SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY\]; /// - return 4 (\b100) => \[SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE\]; - /// - return 5 (\b101) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - /// - return 6 (\b110) => [SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - /// - return 7 (\b111) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + /// - return 5 (\b101) => \[SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE\]; + /// - return 6 (\b110) => \[SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE\]; + /// - return 7 (\b111) => \[SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE\]; /// - return 8 (\b1000) => \[SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE\]; /// - ... /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetType`. @@ -1398,14 +1398,14 @@ pub enum SqlInfo { /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_UNSPECIFIED`. 
/// /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 0 (\b0) => \[\] (no supported concurrency types for this result set type) /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 3 (\b11) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 5 (\b101) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 6 (\b110) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 7 (\b111) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetUnspecified = 568, /// @@ -1413,14 +1413,14 @@ pub enum SqlInfo { /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_FORWARD_ONLY`. /// /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 0 (\b0) => \[\] (no supported concurrency types for this result set type) /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 3 (\b11) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 5 (\b101) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 6 (\b110) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 7 (\b111) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetForwardOnly = 569, /// @@ -1428,14 +1428,14 @@ pub enum SqlInfo { /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE`. 
/// /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 0 (\b0) => \[\] (no supported concurrency types for this result set type) /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 3 (\b11) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 5 (\b101) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 6 (\b110) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 7 (\b111) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetScrollSensitive = 570, /// @@ -1443,14 +1443,14 @@ pub enum SqlInfo { /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE`. /// /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 0 (\b0) => \[\] (no supported concurrency types for this result set type) /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 3 (\b11) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 5 (\b101) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 6 (\b110) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 7 (\b111) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. 
SqlSupportedConcurrenciesForResultSetScrollInsensitive = 571, /// diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 7f78cf50a9d7..86c2cb27d297 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -39,11 +39,11 @@ async-trait = { version = "0.1.41", default-features = false } clap = { version = "4", default-features = false, features = ["std", "derive", "help", "error-context", "usage"] } futures = { version = "0.3", default-features = false } hex = { version = "0.4", default-features = false, features = ["std"] } -prost = { version = "0.11", default-features = false } +prost = { version = "0.12", default-features = false } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } tokio = { version = "1.0", default-features = false } -tonic = { version = "0.9", default-features = false } +tonic = { version = "0.10", default-features = false } tracing-subscriber = { version = "0.3.1", default-features = false, features = ["fmt"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } flate2 = { version = "1", default-features = false, features = ["rust_backend"] } From 95ee5d3c5736ad6a08fa4d70287ad0d56a74a8f7 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 18 Sep 2023 01:36:34 -0700 Subject: [PATCH 1216/1411] feat: FixedSizeBinaryArray::value_data return reference (#4821) * feat: FixedSizeBinaryArray::value_data return reference * pr feedback --- arrow-array/src/array/fixed_size_binary_array.rs | 15 ++++++++++++--- arrow-string/src/substring.rs | 3 +-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 74a7c4c7a84a..f0b04c203ceb 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -179,9 +179,18 @@ impl FixedSizeBinaryArray { self.value_length } - /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.value_data.clone() + /// Returns the values of this array. + /// + /// Unlike [`Self::value_data`] this returns the [`Buffer`] + /// allowing for zero-copy cloning. + #[inline] + pub fn values(&self) -> &Buffer { + &self.value_data + } + + /// Returns the raw value data. + pub fn value_data(&self) -> &[u8] { + self.value_data.as_slice() } /// Returns a zero-copy slice of this array with the indicated offset and length. 
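For illustration only (not part of this patch): a minimal sketch of the reworked accessors, assuming the `arrow_array` and `arrow_buffer` crates from this repository after the change above. `values()` borrows the backing `Buffer`, whose clone is a cheap reference-count bump, while `value_data()` now borrows the raw bytes instead of cloning.

use arrow_array::FixedSizeBinaryArray;
use arrow_buffer::Buffer;

fn fixed_size_binary_accessors() {
    // Two values of width 2: [1, 2] and [3, 4].
    let array =
        FixedSizeBinaryArray::try_from_iter(vec![[1u8, 2], [3, 4]].into_iter()).unwrap();

    // Zero-copy clone of the underlying value buffer.
    let buffer: Buffer = array.values().clone();
    assert_eq!(buffer.len(), 4);

    // Borrow the raw value bytes directly.
    let bytes: &[u8] = array.value_data();
    assert_eq!(bytes, &[1, 2, 3, 4]);
}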
diff --git a/arrow-string/src/substring.rs b/arrow-string/src/substring.rs index 1075d106911e..dc0dfdcbb4ad 100644 --- a/arrow-string/src/substring.rs +++ b/arrow-string/src/substring.rs @@ -347,8 +347,7 @@ fn fixed_size_binary_substring( // build value buffer let num_of_elements = array.len(); - let values = array.value_data(); - let data = values.as_slice(); + let data = array.value_data(); let mut new_values = MutableBuffer::new(num_of_elements * (new_len as usize)); (0..num_of_elements) .map(|idx| { From 175c7765939c0738defc736426c0b0a93b00bfa8 Mon Sep 17 00:00:00 2001 From: Harry Scholes Date: Mon, 18 Sep 2023 12:31:01 +0300 Subject: [PATCH 1217/1411] Fix typo in docstring (#4826) --- arrow-data/src/data.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 7e07194012bf..2073b932c994 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -174,7 +174,7 @@ pub(crate) fn into_buffers( } } -/// An generic representation of Arrow array data which encapsulates common attributes and +/// A generic representation of Arrow array data which encapsulates common attributes and /// operations for Arrow array. Specific operations for different arrays types (e.g., /// primitive, list, struct) are implemented in `Array`. /// From 33b881dc184affa87137f2270803d30d05eed6d8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 18 Sep 2023 21:35:35 +0100 Subject: [PATCH 1218/1411] Fix like scalar null (#4832) * Fix like scalar null * Review feedback --- arrow-string/src/like.rs | 54 ++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 279a4782009d..4478c4e4f7ef 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -155,20 +155,17 @@ fn apply( ) -> Result { let l_len = l_v.map(|l| l.len()).unwrap_or(l.len()); if r_s { - let scalar = match r_v { - Some(dict) => match dict.nulls().filter(|n| n.null_count() != 0) { - Some(_) => return Ok(BooleanArray::new_null(l_len)), - None => { - let idx = dict.normalized_keys()[0]; - if r.is_null(idx) { - return Ok(BooleanArray::new_null(l_len)); - } - r.value(idx) - } - }, - None => r.value(0), + let idx = match r_v { + Some(dict) if dict.null_count() != 0 => { + return Ok(BooleanArray::new_null(l_len)) + } + Some(dict) => dict.normalized_keys()[0], + None => 0, }; - op_scalar(op, l, l_v, scalar) + if r.is_null(idx) { + return Ok(BooleanArray::new_null(l_len)); + } + op_scalar(op, l, l_v, r.value(idx)) } else { match (l_s, l_v, r_v) { (true, None, None) => { @@ -1539,4 +1536,35 @@ mod tests { ]), ); } + + #[test] + fn like_scalar_null() { + let a = StringArray::new_scalar("a"); + let b = Scalar::new(StringArray::new_null(1)); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = StringArray::from_iter_values(["a"]); + let b = Scalar::new(StringArray::new_null(1)); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = StringArray::from_iter_values(["a"]); + let b = StringArray::new_null(1); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + + let a = StringArray::new_scalar("a"); + let b = StringArray::new_null(1); + let r = like(&a, &b).unwrap(); + assert_eq!(r.len(), 1); + 
assert_eq!(r.null_count(), 1); + assert!(r.is_null(0)); + } } From 47e8a8ddb31c39b2b4ef30217e219fed729909ee Mon Sep 17 00:00:00 2001 From: Matthew Cramerus <8771538+suremarc@users.noreply.github.com> Date: Mon, 18 Sep 2023 18:27:53 -0400 Subject: [PATCH 1219/1411] Fix flight sql do put handling, add bind parameter support to FlightSQL cli client (#4797) * change Streaming to Peekable> * add explanatory comment * working test * trigger pre-commit hooks? * Update arrow-flight/src/sql/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * remove unnecessary multi-thread annotation * rework api --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-flight/examples/flight_sql_server.rs | 9 +- arrow-flight/src/bin/flight_sql_client.rs | 104 +++++++++-- arrow-flight/src/sql/client.rs | 50 ++++- arrow-flight/src/sql/server.rs | 110 ++++++++++- arrow-flight/tests/flight_sql_client_cli.rs | 194 +++++++++++++++----- 5 files changed, 392 insertions(+), 75 deletions(-) diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 1e99957390d8..d1aeae6f0a6c 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use arrow_flight::sql::server::PeekableFlightDataStream; use base64::prelude::BASE64_STANDARD; use base64::Engine; use futures::{stream, Stream, TryStreamExt}; @@ -602,7 +603,7 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_statement_update( &self, _ticket: CommandStatementUpdate, - _request: Request>, + _request: Request, ) -> Result { Ok(FAKE_UPDATE_RESULT) } @@ -610,7 +611,7 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_substrait_plan( &self, _ticket: CommandStatementSubstraitPlan, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_substrait_plan not implemented", @@ -620,7 +621,7 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_prepared_statement_query( &self, _query: CommandPreparedStatementQuery, - _request: Request>, + _request: Request, ) -> Result::DoPutStream>, Status> { Err(Status::unimplemented( "do_put_prepared_statement_query not implemented", @@ -630,7 +631,7 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_prepared_statement_update( &self, _query: CommandPreparedStatementUpdate, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_prepared_statement_update not implemented", diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs index 20c8062f899e..d7b02414c5cc 100644 --- a/arrow-flight/src/bin/flight_sql_client.rs +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -15,15 +15,16 @@ // specific language governing permissions and limitations // under the License. 
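// Example invocation (hypothetical, not part of this patch) of the new
// `prepared-statement-query` subcommand added below; the binary name and the
// host/port flags from `ClientArgs` are assumptions, `-p` supplies one named
// parameter per KEY=value pair (parsed by `parse_key_val`), and parameter
// names must match fields of the server-reported parameter schema:
//
//   flight_sql_client --host localhost --port 50051 \
//       prepared-statement-query "SELECT * FROM t WHERE id = $id" -p id=42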
-use std::{sync::Arc, time::Duration}; +use std::{error::Error, sync::Arc, time::Duration}; -use arrow_array::RecordBatch; -use arrow_cast::pretty::pretty_format_batches; +use arrow_array::{ArrayRef, Datum, RecordBatch, StringArray}; +use arrow_cast::{cast_with_options, pretty::pretty_format_batches, CastOptions}; use arrow_flight::{ sql::client::FlightSqlServiceClient, utils::flight_data_to_batches, FlightData, + FlightInfo, }; use arrow_schema::{ArrowError, Schema}; -use clap::Parser; +use clap::{Parser, Subcommand}; use futures::TryStreamExt; use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; use tracing_log::log::info; @@ -98,8 +99,20 @@ struct Args { #[clap(flatten)] client_args: ClientArgs, - /// SQL query. - query: String, + #[clap(subcommand)] + cmd: Command, +} + +#[derive(Debug, Subcommand)] +enum Command { + StatementQuery { + query: String, + }, + PreparedStatementQuery { + query: String, + #[clap(short, value_parser = parse_key_val)] + params: Vec<(String, String)>, + }, } #[tokio::main] @@ -108,12 +121,50 @@ async fn main() { setup_logging(); let mut client = setup_client(args.client_args).await.expect("setup client"); - let info = client - .execute(args.query, None) + let flight_info = match args.cmd { + Command::StatementQuery { query } => client + .execute(query, None) + .await + .expect("execute statement"), + Command::PreparedStatementQuery { query, params } => { + let mut prepared_stmt = client + .prepare(query, None) + .await + .expect("prepare statement"); + + if !params.is_empty() { + prepared_stmt + .set_parameters( + construct_record_batch_from_params( + ¶ms, + prepared_stmt + .parameter_schema() + .expect("get parameter schema"), + ) + .expect("construct parameters"), + ) + .expect("bind parameters") + } + + prepared_stmt + .execute() + .await + .expect("execute prepared statement") + } + }; + + let batches = execute_flight(&mut client, flight_info) .await - .expect("prepare statement"); - info!("got flight info"); + .expect("read flight data"); + let res = pretty_format_batches(batches.as_slice()).expect("format results"); + println!("{res}"); +} + +async fn execute_flight( + client: &mut FlightSqlServiceClient, + info: FlightInfo, +) -> Result, ArrowError> { let schema = Arc::new(Schema::try_from(info.clone()).expect("valid schema")); let mut batches = Vec::with_capacity(info.endpoint.len() + 1); batches.push(RecordBatch::new_empty(schema)); @@ -134,8 +185,27 @@ async fn main() { } info!("received data"); - let res = pretty_format_batches(batches.as_slice()).expect("format results"); - println!("{res}"); + Ok(batches) +} + +fn construct_record_batch_from_params( + params: &[(String, String)], + parameter_schema: &Schema, +) -> Result { + let mut items = Vec::<(&String, ArrayRef)>::new(); + + for (name, value) in params { + let field = parameter_schema.field_with_name(name)?; + let value_as_array = StringArray::new_scalar(value); + let casted = cast_with_options( + value_as_array.get().0, + field.data_type(), + &CastOptions::default(), + )?; + items.push((name, casted)) + } + + RecordBatch::try_from_iter(items) } fn setup_logging() { @@ -203,3 +273,13 @@ async fn setup_client( Ok(client) } + +/// Parse a single key-value pair +fn parse_key_val( + s: &str, +) -> Result<(String, String), Box> { + let pos = s + .find('=') + .ok_or_else(|| format!("invalid KEY=value: no `=` found in `{s}`"))?; + Ok((s[..pos].parse()?, s[pos + 1..].parse()?)) +} diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index 4b1f38ebcbb7..2d382cf2ca20 
100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -24,6 +24,8 @@ use std::collections::HashMap; use std::str::FromStr; use tonic::metadata::AsciiMetadataKey; +use crate::encode::FlightDataEncoderBuilder; +use crate::error::FlightError; use crate::flight_service_client::FlightServiceClient; use crate::sql::server::{CLOSE_PREPARED_STATEMENT, CREATE_PREPARED_STATEMENT}; use crate::sql::{ @@ -32,8 +34,8 @@ use crate::sql::{ CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, - CommandPreparedStatementQuery, CommandStatementQuery, CommandStatementUpdate, - DoPutUpdateResult, ProstMessageExt, SqlInfo, + CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, + CommandStatementUpdate, DoPutUpdateResult, ProstMessageExt, SqlInfo, }; use crate::{ Action, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, @@ -439,9 +441,12 @@ impl PreparedStatement { /// Executes the prepared statement query on the server. pub async fn execute(&mut self) -> Result { + self.write_bind_params().await?; + let cmd = CommandPreparedStatementQuery { prepared_statement_handle: self.handle.clone(), }; + let result = self .flight_sql_client .get_flight_info_for_command(cmd) @@ -451,7 +456,9 @@ impl PreparedStatement { /// Executes the prepared statement update query on the server. pub async fn execute_update(&mut self) -> Result { - let cmd = CommandPreparedStatementQuery { + self.write_bind_params().await?; + + let cmd = CommandPreparedStatementUpdate { prepared_statement_handle: self.handle.clone(), }; let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); @@ -492,6 +499,36 @@ impl PreparedStatement { Ok(()) } + /// Submit parameters to the server, if any have been set on this prepared statement instance + async fn write_bind_params(&mut self) -> Result<(), ArrowError> { + if let Some(ref params_batch) = self.parameter_binding { + let cmd = CommandPreparedStatementQuery { + prepared_statement_handle: self.handle.clone(), + }; + + let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); + let flight_stream_builder = FlightDataEncoderBuilder::new() + .with_flight_descriptor(Some(descriptor)) + .with_schema(params_batch.schema()); + let flight_data = flight_stream_builder + .build(futures::stream::iter( + self.parameter_binding.clone().map(Ok), + )) + .try_collect::>() + .await + .map_err(flight_error_to_arrow_error)?; + + self.flight_sql_client + .do_put(stream::iter(flight_data)) + .await? + .try_collect::>() + .await + .map_err(status_to_arrow_error)?; + } + + Ok(()) + } + /// Close the prepared statement, so that this PreparedStatement can not used /// anymore and server can free up any resources. 
pub async fn close(mut self) -> Result<(), ArrowError> { @@ -515,6 +552,13 @@ fn status_to_arrow_error(status: tonic::Status) -> ArrowError { ArrowError::IpcError(format!("{status:?}")) } +fn flight_error_to_arrow_error(err: FlightError) -> ArrowError { + match err { + FlightError::Arrow(e) => e, + e => ArrowError::ExternalError(Box::new(e)), + } +} + // A polymorphic structure to natively represent different types of data contained in `FlightData` pub enum ArrowFlightData { RecordBatch(RecordBatch), diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index 102d97105a2e..a158ed77f54d 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -19,7 +19,7 @@ use std::pin::Pin; -use futures::Stream; +use futures::{stream::Peekable, Stream, StreamExt}; use prost::Message; use tonic::{Request, Response, Status, Streaming}; @@ -366,7 +366,7 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { /// Implementors may override to handle additional calls to do_put() async fn do_put_fallback( &self, - _request: Request>, + _request: Request, message: Any, ) -> Result::DoPutStream>, Status> { Err(Status::unimplemented(format!( @@ -379,7 +379,7 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { async fn do_put_statement_update( &self, _ticket: CommandStatementUpdate, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_statement_update has no default implementation", @@ -390,7 +390,7 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { async fn do_put_prepared_statement_query( &self, _query: CommandPreparedStatementQuery, - _request: Request>, + _request: Request, ) -> Result::DoPutStream>, Status> { Err(Status::unimplemented( "do_put_prepared_statement_query has no default implementation", @@ -401,7 +401,7 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { async fn do_put_prepared_statement_update( &self, _query: CommandPreparedStatementUpdate, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_prepared_statement_update has no default implementation", @@ -412,7 +412,7 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { async fn do_put_substrait_plan( &self, _query: CommandStatementSubstraitPlan, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_substrait_plan has no default implementation", @@ -688,9 +688,17 @@ where async fn do_put( &self, - mut request: Request>, + request: Request>, ) -> Result, Status> { - let cmd = request.get_mut().message().await?.unwrap(); + // See issue #4658: https://github.com/apache/arrow-rs/issues/4658 + // To dispatch to the correct `do_put` method, we cannot discard the first message, + // as it may contain the Arrow schema, which the `do_put` handler may need. + // To allow the first message to be reused by the `do_put` handler, + // we wrap this stream in a `Peekable` one, which allows us to peek at + // the first message without discarding it. + let mut request = request.map(PeekableFlightDataStream::new); + let cmd = Pin::new(request.get_mut()).peek().await.unwrap().clone()?; + let message = Any::decode(&*cmd.flight_descriptor.unwrap().cmd) .map_err(decode_error_to_status)?; match Command::try_from(message).map_err(arrow_error_to_status)? 
{ @@ -957,3 +965,89 @@ fn decode_error_to_status(err: prost::DecodeError) -> Status { fn arrow_error_to_status(err: arrow_schema::ArrowError) -> Status { Status::internal(format!("{err:?}")) } + +/// A wrapper around [`Streaming`] that allows "peeking" at the +/// message at the front of the stream without consuming it. +/// This is needed because sometimes the first message in the stream will contain +/// a [`FlightDescriptor`] in addition to potentially any data, and the dispatch logic +/// must inspect this information. +/// +/// # Example +/// +/// [`PeekableFlightDataStream::peek`] can be used to peek at the first message without +/// discarding it; otherwise, `PeekableFlightDataStream` can be used as a regular stream. +/// See the following example: +/// +/// ```no_run +/// use arrow_array::RecordBatch; +/// use arrow_flight::decode::FlightRecordBatchStream; +/// use arrow_flight::FlightDescriptor; +/// use arrow_flight::error::FlightError; +/// use arrow_flight::sql::server::PeekableFlightDataStream; +/// use tonic::{Request, Status}; +/// use futures::TryStreamExt; +/// +/// #[tokio::main] +/// async fn main() -> Result<(), Status> { +/// let request: Request = todo!(); +/// let stream: PeekableFlightDataStream = request.into_inner(); +/// +/// // The first message contains the flight descriptor and the schema. +/// // Read the flight descriptor without discarding the schema: +/// let flight_descriptor: FlightDescriptor = stream +/// .peek() +/// .await +/// .cloned() +/// .transpose()? +/// .and_then(|data| data.flight_descriptor) +/// .expect("first message should contain flight descriptor"); +/// +/// // Pass the stream through a decoder +/// let batches: Vec = FlightRecordBatchStream::new_from_flight_data( +/// request.into_inner().map_err(|e| e.into()), +/// ) +/// .try_collect() +/// .await?; +/// } +/// ``` +pub struct PeekableFlightDataStream { + inner: Peekable>, +} + +impl PeekableFlightDataStream { + fn new(stream: Streaming) -> Self { + Self { + inner: stream.peekable(), + } + } + + /// Convert this stream into a `Streaming`. + /// Any messages observed through [`Self::peek`] will be lost + /// after the conversion. + pub fn into_inner(self) -> Streaming { + self.inner.into_inner() + } + + /// Convert this stream into a `Peekable>`. + /// Preserves the state of the stream, so that calls to [`Self::peek`] + /// and [`Self::poll_next`] are the same. + pub fn into_peekable(self) -> Peekable> { + self.inner + } + + /// Peek at the head of this stream without advancing it. 
+ pub async fn peek(&mut self) -> Option<&Result> { + Pin::new(&mut self.inner).peek().await + } +} + +impl Stream for PeekableFlightDataStream { + type Item = Result; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + self.inner.poll_next_unpin(cx) + } +} diff --git a/arrow-flight/tests/flight_sql_client_cli.rs b/arrow-flight/tests/flight_sql_client_cli.rs index 912bcc75a9df..221e776218c3 100644 --- a/arrow-flight/tests/flight_sql_client_cli.rs +++ b/arrow-flight/tests/flight_sql_client_cli.rs @@ -19,11 +19,13 @@ use std::{net::SocketAddr, pin::Pin, sync::Arc, time::Duration}; use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray}; use arrow_flight::{ + decode::FlightRecordBatchStream, flight_service_server::{FlightService, FlightServiceServer}, sql::{ - server::FlightSqlService, ActionBeginSavepointRequest, - ActionBeginSavepointResult, ActionBeginTransactionRequest, - ActionBeginTransactionResult, ActionCancelQueryRequest, ActionCancelQueryResult, + server::{FlightSqlService, PeekableFlightDataStream}, + ActionBeginSavepointRequest, ActionBeginSavepointResult, + ActionBeginTransactionRequest, ActionBeginTransactionResult, + ActionCancelQueryRequest, ActionCancelQueryResult, ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, ActionCreatePreparedSubstraitPlanRequest, ActionEndSavepointRequest, ActionEndTransactionRequest, Any, CommandGetCatalogs, @@ -36,18 +38,20 @@ use arrow_flight::{ }, utils::batches_to_flight_data, Action, FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, - HandshakeResponse, Ticket, + HandshakeResponse, IpcMessage, PutResult, SchemaAsIpc, Ticket, }; +use arrow_ipc::writer::IpcWriteOptions; use arrow_schema::{ArrowError, DataType, Field, Schema}; use assert_cmd::Command; -use futures::Stream; +use bytes::Bytes; +use futures::{Stream, StreamExt, TryStreamExt}; use prost::Message; use tokio::{net::TcpListener, task::JoinHandle}; use tonic::{Request, Response, Status, Streaming}; const QUERY: &str = "SELECT * FROM table;"; -#[tokio::test(flavor = "multi_thread", worker_threads = 1)] +#[tokio::test] async fn test_simple() { let test_server = FlightSqlServiceImpl {}; let fixture = TestFixture::new(&test_server).await; @@ -63,6 +67,7 @@ async fn test_simple() { .arg(addr.ip().to_string()) .arg("--port") .arg(addr.port().to_string()) + .arg("statement-query") .arg(QUERY) .assert() .success() @@ -87,10 +92,56 @@ async fn test_simple() { ); } +const PREPARED_QUERY: &str = "SELECT * FROM table WHERE field = $1"; +const PREPARED_STATEMENT_HANDLE: &str = "prepared_statement_handle"; + +#[tokio::test] +async fn test_do_put_prepared_statement() { + let test_server = FlightSqlServiceImpl {}; + let fixture = TestFixture::new(&test_server).await; + let addr = fixture.addr; + + let stdout = tokio::task::spawn_blocking(move || { + Command::cargo_bin("flight_sql_client") + .unwrap() + .env_clear() + .env("RUST_BACKTRACE", "1") + .env("RUST_LOG", "warn") + .arg("--host") + .arg(addr.ip().to_string()) + .arg("--port") + .arg(addr.port().to_string()) + .arg("prepared-statement-query") + .arg(PREPARED_QUERY) + .args(["-p", "$1=string"]) + .args(["-p", "$2=64"]) + .assert() + .success() + .get_output() + .stdout + .clone() + }) + .await + .unwrap(); + + fixture.shutdown_and_wait().await; + + assert_eq!( + std::str::from_utf8(&stdout).unwrap().trim(), + "+--------------+-----------+\ + \n| field_string | field_int |\ + 
\n+--------------+-----------+\ + \n| Hello | 42 |\ + \n| lovely | |\ + \n| FlightSQL! | 1337 |\ + \n+--------------+-----------+", + ); +} + /// All tests must complete within this many seconds or else the test server is shutdown const DEFAULT_TIMEOUT_SECONDS: u64 = 30; -#[derive(Clone)] +#[derive(Clone, Default)] pub struct FlightSqlServiceImpl {} impl FlightSqlServiceImpl { @@ -116,6 +167,59 @@ impl FlightSqlServiceImpl { ]; RecordBatch::try_new(Arc::new(schema), cols) } + + fn create_fake_prepared_stmt( + ) -> Result { + let handle = PREPARED_STATEMENT_HANDLE.to_string(); + let schema = Schema::new(vec![ + Field::new("field_string", DataType::Utf8, false), + Field::new("field_int", DataType::Int64, true), + ]); + + let parameter_schema = Schema::new(vec![ + Field::new("$1", DataType::Utf8, false), + Field::new("$2", DataType::Int64, true), + ]); + + Ok(ActionCreatePreparedStatementResult { + prepared_statement_handle: handle.into(), + dataset_schema: serialize_schema(&schema)?, + parameter_schema: serialize_schema(¶meter_schema)?, + }) + } + + fn fake_flight_info(&self) -> Result { + let batch = Self::fake_result()?; + + Ok(FlightInfo::new() + .try_with_schema(&batch.schema()) + .expect("encoding schema") + .with_endpoint( + FlightEndpoint::new().with_ticket(Ticket::new( + FetchResults { + handle: String::from("part_1"), + } + .as_any() + .encode_to_vec(), + )), + ) + .with_endpoint( + FlightEndpoint::new().with_ticket(Ticket::new( + FetchResults { + handle: String::from("part_2"), + } + .as_any() + .encode_to_vec(), + )), + ) + .with_total_records(batch.num_rows() as i64) + .with_total_bytes(batch.get_array_memory_size() as i64) + .with_ordered(false)) + } +} + +fn serialize_schema(schema: &Schema) -> Result { + Ok(IpcMessage::try_from(SchemaAsIpc::new(schema, &IpcWriteOptions::default()))?.0) } #[tonic::async_trait] @@ -164,45 +268,21 @@ impl FlightSqlService for FlightSqlServiceImpl { ) -> Result, Status> { assert_eq!(query.query, QUERY); - let batch = Self::fake_result().unwrap(); - - let info = FlightInfo::new() - .try_with_schema(&batch.schema()) - .expect("encoding schema") - .with_endpoint( - FlightEndpoint::new().with_ticket(Ticket::new( - FetchResults { - handle: String::from("part_1"), - } - .as_any() - .encode_to_vec(), - )), - ) - .with_endpoint( - FlightEndpoint::new().with_ticket(Ticket::new( - FetchResults { - handle: String::from("part_2"), - } - .as_any() - .encode_to_vec(), - )), - ) - .with_total_records(batch.num_rows() as i64) - .with_total_bytes(batch.get_array_memory_size() as i64) - .with_ordered(false); - - let resp = Response::new(info); + let resp = Response::new(self.fake_flight_info().unwrap()); Ok(resp) } async fn get_flight_info_prepared_statement( &self, - _cmd: CommandPreparedStatementQuery, + cmd: CommandPreparedStatementQuery, _request: Request, ) -> Result, Status> { - Err(Status::unimplemented( - "get_flight_info_prepared_statement not implemented", - )) + assert_eq!( + cmd.prepared_statement_handle, + PREPARED_STATEMENT_HANDLE.as_bytes() + ); + let resp = Response::new(self.fake_flight_info().unwrap()); + Ok(resp) } async fn get_flight_info_substrait_plan( @@ -426,7 +506,7 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_statement_update( &self, _ticket: CommandStatementUpdate, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_statement_update not implemented", @@ -436,7 +516,7 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_substrait_plan( &self, _ticket: 
CommandStatementSubstraitPlan, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_substrait_plan not implemented", @@ -446,17 +526,36 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_prepared_statement_query( &self, _query: CommandPreparedStatementQuery, - _request: Request>, + request: Request, ) -> Result::DoPutStream>, Status> { - Err(Status::unimplemented( - "do_put_prepared_statement_query not implemented", + // just make sure decoding the parameters works + let parameters = FlightRecordBatchStream::new_from_flight_data( + request.into_inner().map_err(|e| e.into()), + ) + .try_collect::>() + .await?; + + for (left, right) in parameters[0].schema().all_fields().iter().zip(vec![ + Field::new("$1", DataType::Utf8, false), + Field::new("$2", DataType::Int64, true), + ]) { + if left.name() != right.name() || left.data_type() != right.data_type() { + return Err(Status::invalid_argument(format!( + "Parameters did not match parameter schema\ngot {}", + parameters[0].schema(), + ))); + } + } + + Ok(Response::new( + futures::stream::once(async { Ok(PutResult::default()) }).boxed(), )) } async fn do_put_prepared_statement_update( &self, _query: CommandPreparedStatementUpdate, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_prepared_statement_update not implemented", @@ -468,9 +567,8 @@ impl FlightSqlService for FlightSqlServiceImpl { _query: ActionCreatePreparedStatementRequest, _request: Request, ) -> Result { - Err(Status::unimplemented( - "do_action_create_prepared_statement not implemented", - )) + Self::create_fake_prepared_stmt() + .map_err(|e| Status::internal(format!("Unable to serialize schema: {e}"))) } async fn do_action_close_prepared_statement( From f7464bc056662a091f29438e01069ad330b56161 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 19 Sep 2023 08:59:53 +0100 Subject: [PATCH 1220/1411] Fix merge_dictionary_values in selection kernels (#4833) --- arrow-select/src/dictionary.rs | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/arrow-select/src/dictionary.rs b/arrow-select/src/dictionary.rs index 8630b332f068..330196ae33f4 100644 --- a/arrow-select/src/dictionary.rs +++ b/arrow-select/src/dictionary.rs @@ -152,7 +152,7 @@ pub fn merge_dictionary_values( ) -> Result, ArrowError> { let mut num_values = 0; - let mut values = Vec::with_capacity(dictionaries.len()); + let mut values_arrays = Vec::with_capacity(dictionaries.len()); let mut value_slices = Vec::with_capacity(dictionaries.len()); for (idx, dictionary) in dictionaries.iter().enumerate() { @@ -164,11 +164,13 @@ pub fn merge_dictionary_values( (None, None) => None, }; let keys = dictionary.keys().values(); - let values_mask = compute_values_mask(keys, key_mask.as_ref()); - let v = dictionary.values().as_ref(); - num_values += v.len(); - value_slices.push(get_masked_values(v, &values_mask)); - values.push(v) + let values = dictionary.values().as_ref(); + let values_mask = compute_values_mask(keys, key_mask.as_ref(), values.len()); + + let masked_values = get_masked_values(values, &values_mask); + num_values += masked_values.len(); + value_slices.push(masked_values); + values_arrays.push(values) } // Map from value to new index @@ -202,7 +204,7 @@ pub fn merge_dictionary_values( Ok(MergedDictionaries { key_mappings, - values: interleave(&values, &indices)?, + values: interleave(&values_arrays, &indices)?, }) } @@ 
-211,9 +213,10 @@ pub fn merge_dictionary_values( fn compute_values_mask( keys: &ScalarBuffer, mask: Option<&BooleanBuffer>, + max_key: usize, ) -> BooleanBuffer { - let mut builder = BooleanBufferBuilder::new(keys.len()); - builder.advance(keys.len()); + let mut builder = BooleanBufferBuilder::new(max_key); + builder.advance(max_key); match mask { Some(n) => n @@ -330,4 +333,15 @@ mod tests { assert_eq!(&merged.key_mappings[0], &[0, 0, 0, 1, 0]); assert_eq!(&merged.key_mappings[1], &[]); } + + #[test] + fn test_merge_keys_smaller() { + let values = StringArray::from_iter_values(["a", "b"]); + let keys = Int32Array::from_iter_values([1]); + let a = DictionaryArray::new(keys, Arc::new(values)); + + let merged = merge_dictionary_values(&[&a], None).unwrap(); + let expected = StringArray::from(vec!["b"]); + assert_eq!(merged.values.as_ref(), &expected); + } } From e214d6b6129f2b66283c5f2ed65323d57a64630d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 19 Sep 2023 11:57:15 +0100 Subject: [PATCH 1221/1411] Respect FormatOption::nulls for NullArray (#4836) * Respect FormatOption::nulls for NullArray * Clippy --- arrow-cast/src/display.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index d15d57cf3c05..246135e114bc 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -399,8 +399,15 @@ impl<'a> DisplayIndex for &'a BooleanArray { } } -impl<'a> DisplayIndex for &'a NullArray { - fn write(&self, _idx: usize, _f: &mut dyn Write) -> FormatResult { +impl<'a> DisplayIndexState<'a> for &'a NullArray { + type State = &'a str; + + fn prepare(&self, options: &FormatOptions<'a>) -> Result { + Ok(options.null) + } + + fn write(&self, state: &Self::State, _idx: usize, f: &mut dyn Write) -> FormatResult { + f.write_str(state)?; Ok(()) } } @@ -1098,4 +1105,12 @@ mod tests { assert_eq!(iso[5], "-P45DT50554S"); assert_eq!(pretty[5], "-45 days -14 hours -2 mins -34 secs"); } + + #[test] + fn test_null() { + let array = NullArray::new(2); + let options = FormatOptions::new().with_null("NULL"); + let formatted = format_array(&array, &options); + assert_eq!(formatted, &["NULL".to_string(), "NULL".to_string()]) + } } From 1d6feeacebb8d0d659d493b783ba381940973745 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 19 Sep 2023 12:58:06 +0100 Subject: [PATCH 1222/1411] Prepare arrow 47.0.0 (#4827) * Prepare arrow 47.0.0 * Update --- CHANGELOG-old.md | 110 +++++++++++++++++++++++ CHANGELOG.md | 147 +++++++++++-------------------- Cargo.toml | 32 +++---- dev/release/update_change_log.sh | 4 +- 4 files changed, 181 insertions(+), 112 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index c404133f564e..bac7847bdac5 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,116 @@ # Historical Changelog +## [46.0.0](https://github.com/apache/arrow-rs/tree/46.0.0) (2023-08-21) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/45.0.0...46.0.0) + +**Breaking changes:** + +- API improvement: `batches_to_flight_data` forces clone [\#4656](https://github.com/apache/arrow-rs/issues/4656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add AnyDictionary Abstraction and Take ArrayRef in DictionaryArray::with\_values [\#4707](https://github.com/apache/arrow-rs/pull/4707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([tustvold](https://github.com/tustvold)) +- Cleanup parquet type builders [\#4706](https://github.com/apache/arrow-rs/pull/4706) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Take kernel dyn Array [\#4705](https://github.com/apache/arrow-rs/pull/4705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve ergonomics of Scalar [\#4704](https://github.com/apache/arrow-rs/pull/4704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Datum based comparison kernels \(\#4596\) [\#4701](https://github.com/apache/arrow-rs/pull/4701) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Improve `Array` Logical Nullability [\#4691](https://github.com/apache/arrow-rs/pull/4691) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Validate ArrayData Buffer Alignment and Automatically Align IPC buffers \(\#4255\) [\#4681](https://github.com/apache/arrow-rs/pull/4681) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- More intuitive bool-to-string casting [\#4666](https://github.com/apache/arrow-rs/pull/4666) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fsdvh](https://github.com/fsdvh)) +- enhancement: batches\_to\_flight\_data use a schema ref as param. [\#4665](https://github.com/apache/arrow-rs/pull/4665) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([jackwener](https://github.com/jackwener)) +- fix: from\_thrift avoid panic when stats in invalid. 
[\#4642](https://github.com/apache/arrow-rs/pull/4642) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jackwener](https://github.com/jackwener)) +- bug: Add some missing field in row group metadata: ordinal, total co… [\#4636](https://github.com/apache/arrow-rs/pull/4636) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liurenjie1024](https://github.com/liurenjie1024)) +- Remove deprecated limit kernel [\#4597](https://github.com/apache/arrow-rs/pull/4597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- parquet: support setting the field\_id with an ArrowWriter [\#4702](https://github.com/apache/arrow-rs/issues/4702) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support references in i256 arithmetic ops [\#4694](https://github.com/apache/arrow-rs/issues/4694) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Precision-Loss Decimal Arithmetic [\#4664](https://github.com/apache/arrow-rs/issues/4664) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Faster i256 Division [\#4663](https://github.com/apache/arrow-rs/issues/4663) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `concat_batches` for 0 columns [\#4661](https://github.com/apache/arrow-rs/issues/4661) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `filter_record_batch` should support filtering record batch without columns [\#4647](https://github.com/apache/arrow-rs/issues/4647) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve speed of `lexicographical_partition_ranges` [\#4614](https://github.com/apache/arrow-rs/issues/4614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- object\_store: multipart ranges for HTTP [\#4612](https://github.com/apache/arrow-rs/issues/4612) +- Add Rank Function [\#4606](https://github.com/apache/arrow-rs/issues/4606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Datum Based Comparison Kernels [\#4596](https://github.com/apache/arrow-rs/issues/4596) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Convenience method to create `DataType::List` correctly [\#4544](https://github.com/apache/arrow-rs/issues/4544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Remove Deprecated Arithmetic Kernels [\#4481](https://github.com/apache/arrow-rs/issues/4481) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Equality kernel where null==null gives true [\#4438](https://github.com/apache/arrow-rs/issues/4438) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Parquet ArrowWriter Ignores Nulls in Dictionary Values [\#4690](https://github.com/apache/arrow-rs/issues/4690) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Schema Nullability Validation Fails to Account for Dictionary Nulls [\#4689](https://github.com/apache/arrow-rs/issues/4689) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Comparison Kernels Ignore Nulls in Dictionary Values [\#4688](https://github.com/apache/arrow-rs/issues/4688) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- 
Casting List to String Ignores Format Options [\#4669](https://github.com/apache/arrow-rs/issues/4669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Double free in C Stream Interface [\#4659](https://github.com/apache/arrow-rs/issues/4659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- CI Failing On Packed SIMD [\#4651](https://github.com/apache/arrow-rs/issues/4651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `RowInterner::size()` much too low for high cardinality dictionary columns [\#4645](https://github.com/apache/arrow-rs/issues/4645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Decimal PrimitiveArray change datatype after try\_unary [\#4644](https://github.com/apache/arrow-rs/issues/4644) +- Better explanation in docs for Dictionary field encoding using RowConverter [\#4639](https://github.com/apache/arrow-rs/issues/4639) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `List(FixedSizeBinary)` array equality check may return wrong result [\#4637](https://github.com/apache/arrow-rs/issues/4637) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `arrow::compute::nullif` panics if `NullArray` is provided [\#4634](https://github.com/apache/arrow-rs/issues/4634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Empty lists in FixedSizeListArray::try\_new is not handled [\#4623](https://github.com/apache/arrow-rs/issues/4623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Bounds checking in `MutableBuffer::set_null_bits` can be bypassed [\#4620](https://github.com/apache/arrow-rs/issues/4620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- TypedDictionaryArray Misleading Null Behaviour [\#4616](https://github.com/apache/arrow-rs/issues/4616) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- bug: Parquet writer missing row group metadata fields such as `compressed_size`, `file offset`. [\#4610](https://github.com/apache/arrow-rs/issues/4610) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `new_null_array` generates an invalid union array [\#4600](https://github.com/apache/arrow-rs/issues/4600) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Footer parsing fails for very large parquet file. 
[\#4592](https://github.com/apache/arrow-rs/issues/4592) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- bug\(parquet\): Disabling global statistics but enabling for particular column breaks reading [\#4587](https://github.com/apache/arrow-rs/issues/4587) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `arrow::compute::concat` panics for dense union arrays with non-trivial type IDs [\#4578](https://github.com/apache/arrow-rs/issues/4578) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- \[object\_store\] when Create a AmazonS3 instance work with MinIO without set endpoint got error MissingRegion [\#4617](https://github.com/apache/arrow-rs/issues/4617) + +**Merged pull requests:** + +- Add distinct kernels \(\#960\) \(\#4438\) [\#4716](https://github.com/apache/arrow-rs/pull/4716) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update parquet object\_store 0.7 [\#4715](https://github.com/apache/arrow-rs/pull/4715) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Support Field ID in ArrowWriter \(\#4702\) [\#4710](https://github.com/apache/arrow-rs/pull/4710) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Remove rank kernels [\#4703](https://github.com/apache/arrow-rs/pull/4703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support references in i256 arithmetic ops [\#4692](https://github.com/apache/arrow-rs/pull/4692) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Cleanup DynComparator \(\#2654\) [\#4687](https://github.com/apache/arrow-rs/pull/4687) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Separate metadata fetch from `ArrowReaderBuilder` construction \(\#4674\) [\#4676](https://github.com/apache/arrow-rs/pull/4676) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- cleanup some assert\(\) with error propagation [\#4673](https://github.com/apache/arrow-rs/pull/4673) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) +- Faster i256 Division \(2-100x\) \(\#4663\) [\#4672](https://github.com/apache/arrow-rs/pull/4672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix MSRV CI [\#4671](https://github.com/apache/arrow-rs/pull/4671) ([tustvold](https://github.com/tustvold)) +- Fix equality of nested nullable FixedSizeBinary \(\#4637\) [\#4670](https://github.com/apache/arrow-rs/pull/4670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use ArrayFormatter in cast kernel [\#4668](https://github.com/apache/arrow-rs/pull/4668) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: Improve API docs for FlightSQL metadata builders [\#4667](https://github.com/apache/arrow-rs/pull/4667) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Support `concat_batches` for 0 columns [\#4662](https://github.com/apache/arrow-rs/pull/4662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([Dandandan](https://github.com/Dandandan)) +- fix ownership of c stream error [\#4660](https://github.com/apache/arrow-rs/pull/4660) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Minor: Fix illustration for dict encoding [\#4657](https://github.com/apache/arrow-rs/pull/4657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JayjeetAtGithub](https://github.com/JayjeetAtGithub)) +- minor: move comment to the correct location [\#4655](https://github.com/apache/arrow-rs/pull/4655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Update packed\_simd and run miri tests on simd code [\#4654](https://github.com/apache/arrow-rs/pull/4654) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- impl `From>` for `BufferBuilder` and `MutableBuffer` [\#4650](https://github.com/apache/arrow-rs/pull/4650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- Filter record batch with 0 columns [\#4648](https://github.com/apache/arrow-rs/pull/4648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Account for child `Bucket` size in OrderPreservingInterner [\#4646](https://github.com/apache/arrow-rs/pull/4646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Implement `Default`,`Extend` and `FromIterator` for `BufferBuilder` [\#4638](https://github.com/apache/arrow-rs/pull/4638) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- fix\(select\): handle `NullArray` in `nullif` [\#4635](https://github.com/apache/arrow-rs/pull/4635) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) +- Move `BufferBuilder` to `arrow-buffer` [\#4630](https://github.com/apache/arrow-rs/pull/4630) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- allow zero sized empty fixed [\#4626](https://github.com/apache/arrow-rs/pull/4626) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([smiklos](https://github.com/smiklos)) +- fix: compute\_dictionary\_mapping use wrong offsetSize [\#4625](https://github.com/apache/arrow-rs/pull/4625) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- impl `FromIterator` for `MutableBuffer` [\#4624](https://github.com/apache/arrow-rs/pull/4624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- expand docs for FixedSizeListArray [\#4622](https://github.com/apache/arrow-rs/pull/4622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([smiklos](https://github.com/smiklos)) +- fix\(buffer\): panic on end index overflow in `MutableBuffer::set_null_bits` [\#4621](https://github.com/apache/arrow-rs/pull/4621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) +- impl `Default` for `arrow_buffer::buffer::MutableBuffer` [\#4619](https://github.com/apache/arrow-rs/pull/4619) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- Minor: improve docs and add example for lexicographical\_partition\_ranges [\#4615](https://github.com/apache/arrow-rs/pull/4615) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Cleanup sort [\#4613](https://github.com/apache/arrow-rs/pull/4613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add rank function \(\#4606\) [\#4609](https://github.com/apache/arrow-rs/pull/4609) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add more docs and examples for ListArray and OffsetsBuffer [\#4607](https://github.com/apache/arrow-rs/pull/4607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Simplify dictionary sort [\#4605](https://github.com/apache/arrow-rs/pull/4605) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Consolidate sort benchmarks [\#4604](https://github.com/apache/arrow-rs/pull/4604) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Don't Reorder Nulls in sort\_to\_indices \(\#4545\) [\#4603](https://github.com/apache/arrow-rs/pull/4603) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- fix\(data\): create child arrays of correct length when building a sparse union null array [\#4601](https://github.com/apache/arrow-rs/pull/4601) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) +- Use u32 metadata\_len when parsing footer of parquet. [\#4599](https://github.com/apache/arrow-rs/pull/4599) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Berrysoft](https://github.com/Berrysoft)) +- fix\(data\): map type ID to child index before indexing a union child array [\#4598](https://github.com/apache/arrow-rs/pull/4598) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) +- Remove deprecated arithmetic kernels \(\#4481\) [\#4594](https://github.com/apache/arrow-rs/pull/4594) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Test Disabled Page Statistics \(\#4587\) [\#4589](https://github.com/apache/arrow-rs/pull/4589) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Cleanup ArrayData::buffers [\#4583](https://github.com/apache/arrow-rs/pull/4583) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use contains\_nulls in ArrayData equality of byte arrays [\#4582](https://github.com/apache/arrow-rs/pull/4582) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Vectorized lexicographical\_partition\_ranges \(~80% faster\) [\#4575](https://github.com/apache/arrow-rs/pull/4575) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- chore: add datatype new\_list [\#4561](https://github.com/apache/arrow-rs/pull/4561) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fansehep](https://github.com/fansehep)) ## [45.0.0](https://github.com/apache/arrow-rs/tree/45.0.0) (2023-07-30) [Full Changelog](https://github.com/apache/arrow-rs/compare/44.0.0...45.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74f74bc3ef13..1f97055a9c0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,116 +19,75 @@ # Changelog -## [46.0.0](https://github.com/apache/arrow-rs/tree/46.0.0) (2023-08-21) 
+## [47.0.0](https://github.com/apache/arrow-rs/tree/47.0.0) (2023-09-19) -[Full Changelog](https://github.com/apache/arrow-rs/compare/45.0.0...46.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/46.0.0...47.0.0) **Breaking changes:** -- API improvement: `batches_to_flight_data` forces clone [\#4656](https://github.com/apache/arrow-rs/issues/4656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add AnyDictionary Abstraction and Take ArrayRef in DictionaryArray::with\_values [\#4707](https://github.com/apache/arrow-rs/pull/4707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Cleanup parquet type builders [\#4706](https://github.com/apache/arrow-rs/pull/4706) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Take kernel dyn Array [\#4705](https://github.com/apache/arrow-rs/pull/4705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Improve ergonomics of Scalar [\#4704](https://github.com/apache/arrow-rs/pull/4704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Datum based comparison kernels \(\#4596\) [\#4701](https://github.com/apache/arrow-rs/pull/4701) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Improve `Array` Logical Nullability [\#4691](https://github.com/apache/arrow-rs/pull/4691) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Validate ArrayData Buffer Alignment and Automatically Align IPC buffers \(\#4255\) [\#4681](https://github.com/apache/arrow-rs/pull/4681) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- More intuitive bool-to-string casting [\#4666](https://github.com/apache/arrow-rs/pull/4666) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fsdvh](https://github.com/fsdvh)) -- enhancement: batches\_to\_flight\_data use a schema ref as param. [\#4665](https://github.com/apache/arrow-rs/pull/4665) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([jackwener](https://github.com/jackwener)) -- fix: from\_thrift avoid panic when stats in invalid. 
[\#4642](https://github.com/apache/arrow-rs/pull/4642) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jackwener](https://github.com/jackwener)) -- bug: Add some missing field in row group metadata: ordinal, total co… [\#4636](https://github.com/apache/arrow-rs/pull/4636) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liurenjie1024](https://github.com/liurenjie1024)) -- Remove deprecated limit kernel [\#4597](https://github.com/apache/arrow-rs/pull/4597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Make FixedSizeBinaryArray value\_data return a reference [\#4820](https://github.com/apache/arrow-rs/issues/4820) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Update prost to v0.12.1 [\#4825](https://github.com/apache/arrow-rs/pull/4825) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- feat: FixedSizeBinaryArray::value\_data return reference [\#4821](https://github.com/apache/arrow-rs/pull/4821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Stateless Row Encoding / Don't Preserve Dictionaries in `RowConverter` \(\#4811\) [\#4819](https://github.com/apache/arrow-rs/pull/4819) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- fix: entries field is non-nullable [\#4808](https://github.com/apache/arrow-rs/pull/4808) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Fix flight sql do put handling, add bind parameter support to FlightSQL cli client [\#4797](https://github.com/apache/arrow-rs/pull/4797) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([suremarc](https://github.com/suremarc)) +- Remove unused dyn\_cmp\_dict feature [\#4766](https://github.com/apache/arrow-rs/pull/4766) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add underlying `std::io::Error` to `IoError` and add `IpcError` variant [\#4726](https://github.com/apache/arrow-rs/pull/4726) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alexandreyc](https://github.com/alexandreyc)) **Implemented enhancements:** -- parquet: support setting the field\_id with an ArrowWriter [\#4702](https://github.com/apache/arrow-rs/issues/4702) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Support references in i256 arithmetic ops [\#4694](https://github.com/apache/arrow-rs/issues/4694) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Precision-Loss Decimal Arithmetic [\#4664](https://github.com/apache/arrow-rs/issues/4664) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Faster i256 Division [\#4663](https://github.com/apache/arrow-rs/issues/4663) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `concat_batches` for 0 columns [\#4661](https://github.com/apache/arrow-rs/issues/4661) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `filter_record_batch` should support filtering record batch without columns [\#4647](https://github.com/apache/arrow-rs/issues/4647) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Improve speed of `lexicographical_partition_ranges` [\#4614](https://github.com/apache/arrow-rs/issues/4614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- object\_store: multipart ranges for HTTP [\#4612](https://github.com/apache/arrow-rs/issues/4612) -- Add Rank Function [\#4606](https://github.com/apache/arrow-rs/issues/4606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Datum Based Comparison Kernels [\#4596](https://github.com/apache/arrow-rs/issues/4596) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Convenience method to create `DataType::List` correctly [\#4544](https://github.com/apache/arrow-rs/issues/4544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Remove Deprecated Arithmetic Kernels [\#4481](https://github.com/apache/arrow-rs/issues/4481) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Equality kernel where null==null gives true [\#4438](https://github.com/apache/arrow-rs/issues/4438) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Row Format Adapative Block Size [\#4812](https://github.com/apache/arrow-rs/issues/4812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Stateless Row Conversion [\#4811](https://github.com/apache/arrow-rs/issues/4811) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Add option to specify custom null values for CSV reader [\#4794](https://github.com/apache/arrow-rs/issues/4794) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet::record::RowIter cannot be customized with batch\_size and defaults to 1024 [\#4782](https://github.com/apache/arrow-rs/issues/4782) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `DynScalar` abstraction \(something that makes it easy to create scalar `Datum`s\) [\#4781](https://github.com/apache/arrow-rs/issues/4781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `Datum` is not exported as part of `arrow` \(it is only exported in `arrow_array`\) [\#4780](https://github.com/apache/arrow-rs/issues/4780) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `Scalar` is not exported as part of `arrow` \(it is only exported in `arrow_array`\) [\#4779](https://github.com/apache/arrow-rs/issues/4779) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support IntoPyArrow for impl RecordBatchReader [\#4730](https://github.com/apache/arrow-rs/issues/4730) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Datum Based String Kernels [\#4595](https://github.com/apache/arrow-rs/issues/4595) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Fixed bugs:** -- Parquet ArrowWriter Ignores Nulls in Dictionary Values [\#4690](https://github.com/apache/arrow-rs/issues/4690) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Schema Nullability Validation Fails to Account for Dictionary Nulls [\#4689](https://github.com/apache/arrow-rs/issues/4689) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Comparison Kernels Ignore Nulls in Dictionary Values 
[\#4688](https://github.com/apache/arrow-rs/issues/4688) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Casting List to String Ignores Format Options [\#4669](https://github.com/apache/arrow-rs/issues/4669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Double free in C Stream Interface [\#4659](https://github.com/apache/arrow-rs/issues/4659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- CI Failing On Packed SIMD [\#4651](https://github.com/apache/arrow-rs/issues/4651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `RowInterner::size()` much too low for high cardinality dictionary columns [\#4645](https://github.com/apache/arrow-rs/issues/4645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Decimal PrimitiveArray change datatype after try\_unary [\#4644](https://github.com/apache/arrow-rs/issues/4644) -- Better explanation in docs for Dictionary field encoding using RowConverter [\#4639](https://github.com/apache/arrow-rs/issues/4639) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `List(FixedSizeBinary)` array equality check may return wrong result [\#4637](https://github.com/apache/arrow-rs/issues/4637) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `arrow::compute::nullif` panics if `NullArray` is provided [\#4634](https://github.com/apache/arrow-rs/issues/4634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Empty lists in FixedSizeListArray::try\_new is not handled [\#4623](https://github.com/apache/arrow-rs/issues/4623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Bounds checking in `MutableBuffer::set_null_bits` can be bypassed [\#4620](https://github.com/apache/arrow-rs/issues/4620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- TypedDictionaryArray Misleading Null Behaviour [\#4616](https://github.com/apache/arrow-rs/issues/4616) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- bug: Parquet writer missing row group metadata fields such as `compressed_size`, `file offset`. [\#4610](https://github.com/apache/arrow-rs/issues/4610) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `new_null_array` generates an invalid union array [\#4600](https://github.com/apache/arrow-rs/issues/4600) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Footer parsing fails for very large parquet file. 
[\#4592](https://github.com/apache/arrow-rs/issues/4592) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- bug\(parquet\): Disabling global statistics but enabling for particular column breaks reading [\#4587](https://github.com/apache/arrow-rs/issues/4587) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `arrow::compute::concat` panics for dense union arrays with non-trivial type IDs [\#4578](https://github.com/apache/arrow-rs/issues/4578) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- MapArray::new\_from\_strings creates nullable entries field [\#4807](https://github.com/apache/arrow-rs/issues/4807) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- pyarrow module can't roundtrip tensor arrays [\#4805](https://github.com/apache/arrow-rs/issues/4805) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `concat_batches` errors with "schema mismatch" error when only metadata differs [\#4799](https://github.com/apache/arrow-rs/issues/4799) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- panic in `cmp` kernels with DictionaryArrays: `Option::unwrap()` on a `None` value' [\#4788](https://github.com/apache/arrow-rs/issues/4788) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- stream ffi panics if schema metadata values aren't valid utf8 [\#4750](https://github.com/apache/arrow-rs/issues/4750) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Regression: Incorrect Sorting of `*ListArray` in 46.0.0 [\#4746](https://github.com/apache/arrow-rs/issues/4746) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Row is no longer comparable after reuse [\#4741](https://github.com/apache/arrow-rs/issues/4741) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- DoPut FlightSQL handler inadvertently consumes schema at start of Request\\> [\#4658](https://github.com/apache/arrow-rs/issues/4658) +- Return error when converting schema [\#4752](https://github.com/apache/arrow-rs/pull/4752) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Implement PyArrowType for `Box` [\#4751](https://github.com/apache/arrow-rs/pull/4751) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) **Closed issues:** -- \[object\_store\] when Create a AmazonS3 instance work with MinIO without set endpoint got error MissingRegion [\#4617](https://github.com/apache/arrow-rs/issues/4617) +- Building arrow-rust for target wasm32-wasi falied to compile packed\_simd\_2 [\#4717](https://github.com/apache/arrow-rs/issues/4717) **Merged pull requests:** -- Add distinct kernels \(\#960\) \(\#4438\) [\#4716](https://github.com/apache/arrow-rs/pull/4716) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update parquet object\_store 0.7 [\#4715](https://github.com/apache/arrow-rs/pull/4715) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Support Field ID in ArrowWriter \(\#4702\) [\#4710](https://github.com/apache/arrow-rs/pull/4710) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Remove rank kernels [\#4703](https://github.com/apache/arrow-rs/pull/4703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support references in i256 arithmetic ops 
[\#4692](https://github.com/apache/arrow-rs/pull/4692) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Cleanup DynComparator \(\#2654\) [\#4687](https://github.com/apache/arrow-rs/pull/4687) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Separate metadata fetch from `ArrowReaderBuilder` construction \(\#4674\) [\#4676](https://github.com/apache/arrow-rs/pull/4676) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- cleanup some assert\(\) with error propagation [\#4673](https://github.com/apache/arrow-rs/pull/4673) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) -- Faster i256 Division \(2-100x\) \(\#4663\) [\#4672](https://github.com/apache/arrow-rs/pull/4672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix MSRV CI [\#4671](https://github.com/apache/arrow-rs/pull/4671) ([tustvold](https://github.com/tustvold)) -- Fix equality of nested nullable FixedSizeBinary \(\#4637\) [\#4670](https://github.com/apache/arrow-rs/pull/4670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use ArrayFormatter in cast kernel [\#4668](https://github.com/apache/arrow-rs/pull/4668) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Minor: Improve API docs for FlightSQL metadata builders [\#4667](https://github.com/apache/arrow-rs/pull/4667) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Support `concat_batches` for 0 columns [\#4662](https://github.com/apache/arrow-rs/pull/4662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- fix ownership of c stream error [\#4660](https://github.com/apache/arrow-rs/pull/4660) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) -- Minor: Fix illustration for dict encoding [\#4657](https://github.com/apache/arrow-rs/pull/4657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JayjeetAtGithub](https://github.com/JayjeetAtGithub)) -- minor: move comment to the correct location [\#4655](https://github.com/apache/arrow-rs/pull/4655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- Update packed\_simd and run miri tests on simd code [\#4654](https://github.com/apache/arrow-rs/pull/4654) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- impl `From>` for `BufferBuilder` and `MutableBuffer` [\#4650](https://github.com/apache/arrow-rs/pull/4650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- Filter record batch with 0 columns [\#4648](https://github.com/apache/arrow-rs/pull/4648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Account for child `Bucket` size in OrderPreservingInterner [\#4646](https://github.com/apache/arrow-rs/pull/4646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Implement `Default`,`Extend` and `FromIterator` for `BufferBuilder` 
[\#4638](https://github.com/apache/arrow-rs/pull/4638) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- fix\(select\): handle `NullArray` in `nullif` [\#4635](https://github.com/apache/arrow-rs/pull/4635) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) -- Move `BufferBuilder` to `arrow-buffer` [\#4630](https://github.com/apache/arrow-rs/pull/4630) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- allow zero sized empty fixed [\#4626](https://github.com/apache/arrow-rs/pull/4626) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([smiklos](https://github.com/smiklos)) -- fix: compute\_dictionary\_mapping use wrong offsetSize [\#4625](https://github.com/apache/arrow-rs/pull/4625) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- impl `FromIterator` for `MutableBuffer` [\#4624](https://github.com/apache/arrow-rs/pull/4624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- expand docs for FixedSizeListArray [\#4622](https://github.com/apache/arrow-rs/pull/4622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([smiklos](https://github.com/smiklos)) -- fix\(buffer\): panic on end index overflow in `MutableBuffer::set_null_bits` [\#4621](https://github.com/apache/arrow-rs/pull/4621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) -- impl `Default` for `arrow_buffer::buffer::MutableBuffer` [\#4619](https://github.com/apache/arrow-rs/pull/4619) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- Minor: improve docs and add example for lexicographical\_partition\_ranges [\#4615](https://github.com/apache/arrow-rs/pull/4615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Cleanup sort [\#4613](https://github.com/apache/arrow-rs/pull/4613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add rank function \(\#4606\) [\#4609](https://github.com/apache/arrow-rs/pull/4609) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add more docs and examples for ListArray and OffsetsBuffer [\#4607](https://github.com/apache/arrow-rs/pull/4607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Simplify dictionary sort [\#4605](https://github.com/apache/arrow-rs/pull/4605) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Consolidate sort benchmarks [\#4604](https://github.com/apache/arrow-rs/pull/4604) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Don't Reorder Nulls in sort\_to\_indices \(\#4545\) [\#4603](https://github.com/apache/arrow-rs/pull/4603) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- fix\(data\): create child arrays of correct length when building a sparse union null array [\#4601](https://github.com/apache/arrow-rs/pull/4601) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) -- Use u32 metadata\_len when parsing footer of parquet. 
[\#4599](https://github.com/apache/arrow-rs/pull/4599) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Berrysoft](https://github.com/Berrysoft)) -- fix\(data\): map type ID to child index before indexing a union child array [\#4598](https://github.com/apache/arrow-rs/pull/4598) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) -- Remove deprecated arithmetic kernels \(\#4481\) [\#4594](https://github.com/apache/arrow-rs/pull/4594) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Test Disabled Page Statistics \(\#4587\) [\#4589](https://github.com/apache/arrow-rs/pull/4589) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Cleanup ArrayData::buffers [\#4583](https://github.com/apache/arrow-rs/pull/4583) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use contains\_nulls in ArrayData equality of byte arrays [\#4582](https://github.com/apache/arrow-rs/pull/4582) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Vectorized lexicographical\_partition\_ranges \(~80% faster\) [\#4575](https://github.com/apache/arrow-rs/pull/4575) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- chore: add datatype new\_list [\#4561](https://github.com/apache/arrow-rs/pull/4561) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fansehep](https://github.com/fansehep)) +- Respect FormatOption::nulls for NullArray [\#4836](https://github.com/apache/arrow-rs/pull/4836) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix merge\_dictionary\_values in selection kernels [\#4833](https://github.com/apache/arrow-rs/pull/4833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix like scalar null [\#4832](https://github.com/apache/arrow-rs/pull/4832) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- More chrono deprecations [\#4822](https://github.com/apache/arrow-rs/pull/4822) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Adaptive Row Block Size \(\#4812\) [\#4818](https://github.com/apache/arrow-rs/pull/4818) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.66 to =1.0.67 [\#4816](https://github.com/apache/arrow-rs/pull/4816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Do not check schema for equality in concat\_batches [\#4815](https://github.com/apache/arrow-rs/pull/4815) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- fix: export record batch through stream [\#4806](https://github.com/apache/arrow-rs/pull/4806) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Improve CSV Reader Benchmark Coverage of Small Primitives [\#4803](https://github.com/apache/arrow-rs/pull/4803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- csv: Add option to specify custom null 
values [\#4795](https://github.com/apache/arrow-rs/pull/4795) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vrongmeal](https://github.com/vrongmeal)) +- Expand docstring and add example to `Scalar` [\#4793](https://github.com/apache/arrow-rs/pull/4793) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Re-export array crate root \(\#4780\) \(\#4779\) [\#4791](https://github.com/apache/arrow-rs/pull/4791) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix DictionaryArray::normalized\_keys \(\#4788\) [\#4789](https://github.com/apache/arrow-rs/pull/4789) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Allow custom tree builder for parquet::record::RowIter [\#4783](https://github.com/apache/arrow-rs/pull/4783) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([YuraKotov](https://github.com/YuraKotov)) +- Bump actions/checkout from 3 to 4 [\#4767](https://github.com/apache/arrow-rs/pull/4767) ([dependabot[bot]](https://github.com/apps/dependabot)) +- fix: avoid panic if offset index not exists. [\#4761](https://github.com/apache/arrow-rs/pull/4761) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([RinChanNOWWW](https://github.com/RinChanNOWWW)) +- Relax constraints on PyArrowType [\#4757](https://github.com/apache/arrow-rs/pull/4757) ([tustvold](https://github.com/tustvold)) +- Chrono deprecations [\#4748](https://github.com/apache/arrow-rs/pull/4748) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix List Sorting, Revert Removal of Rank Kernels [\#4747](https://github.com/apache/arrow-rs/pull/4747) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Clear row buffer before reuse [\#4742](https://github.com/apache/arrow-rs/pull/4742) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) +- Datum based like kernels \(\#4595\) [\#4732](https://github.com/apache/arrow-rs/pull/4732) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- feat: expose DoGet response headers & trailers [\#4727](https://github.com/apache/arrow-rs/pull/4727) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Cleanup length and bit\_length kernels [\#4718](https://github.com/apache/arrow-rs/pull/4718) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) diff --git a/Cargo.toml b/Cargo.toml index 804fdf5807ee..936935ec7e3d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,7 +61,7 @@ exclude = [ ] [workspace.package] -version = "46.0.0" +version = "47.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -76,20 +76,20 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "46.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "46.0.0", path = "./arrow-arith" } -arrow-array = { version = "46.0.0", path = "./arrow-array" } -arrow-buffer = { version = "46.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "46.0.0", path = "./arrow-cast" } 
-arrow-csv = { version = "46.0.0", path = "./arrow-csv" } -arrow-data = { version = "46.0.0", path = "./arrow-data" } -arrow-ipc = { version = "46.0.0", path = "./arrow-ipc" } -arrow-json = { version = "46.0.0", path = "./arrow-json" } -arrow-ord = { version = "46.0.0", path = "./arrow-ord" } -arrow-row = { version = "46.0.0", path = "./arrow-row" } -arrow-schema = { version = "46.0.0", path = "./arrow-schema" } -arrow-select = { version = "46.0.0", path = "./arrow-select" } -arrow-string = { version = "46.0.0", path = "./arrow-string" } -parquet = { version = "46.0.0", path = "./parquet", default-features = false } +arrow = { version = "47.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "47.0.0", path = "./arrow-arith" } +arrow-array = { version = "47.0.0", path = "./arrow-array" } +arrow-buffer = { version = "47.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "47.0.0", path = "./arrow-cast" } +arrow-csv = { version = "47.0.0", path = "./arrow-csv" } +arrow-data = { version = "47.0.0", path = "./arrow-data" } +arrow-ipc = { version = "47.0.0", path = "./arrow-ipc" } +arrow-json = { version = "47.0.0", path = "./arrow-json" } +arrow-ord = { version = "47.0.0", path = "./arrow-ord" } +arrow-row = { version = "47.0.0", path = "./arrow-row" } +arrow-schema = { version = "47.0.0", path = "./arrow-schema" } +arrow-select = { version = "47.0.0", path = "./arrow-select" } +arrow-string = { version = "47.0.0", path = "./arrow-string" } +parquet = { version = "47.0.0", path = "./parquet", default-features = false } chrono = { version = "0.4.31", default-features = false, features = ["clock"] } diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 0b62e97383c2..74bbb4ac1e8d 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="45.0.0" -FUTURE_RELEASE="46.0.0" +SINCE_TAG="46.0.0" +FUTURE_RELEASE="47.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From a03ce564f1c95e10c78e6a065996cb036ca13cef Mon Sep 17 00:00:00 2001 From: Cory Grinstead Date: Tue, 19 Sep 2023 08:59:21 -0500 Subject: [PATCH 1223/1411] fix: object store http header last modified (#4834) * fix: object store http header last modified * refactor: make headermeta configurable on required fields * Update object_store/src/client/header.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update object_store/src/client/header.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update object_store/src/client/header.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/client/get.rs | 12 +++--- object_store/src/client/header.rs | 62 ++++++++++++++++++++++++------- object_store/src/http/mod.rs | 9 ++++- 3 files changed, 62 insertions(+), 21 deletions(-) diff --git a/object_store/src/client/get.rs b/object_store/src/client/get.rs index 6b2d60ae565f..8b84a079c7d5 100644 --- a/object_store/src/client/get.rs +++ b/object_store/src/client/get.rs @@ -49,8 +49,8 @@ impl GetClientExt for T { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let range = options.range.clone(); let response = self.get_request(location, options, false).await?; - let meta = - header_meta(location, response.headers()).map_err(|e| 
Error::Generic { + let meta = header_meta(location, response.headers(), Default::default()) + .map_err(|e| Error::Generic { store: T::STORE, source: Box::new(e), })?; @@ -73,9 +73,11 @@ impl GetClientExt for T { async fn head(&self, location: &Path) -> Result { let options = GetOptions::default(); let response = self.get_request(location, options, true).await?; - header_meta(location, response.headers()).map_err(|e| Error::Generic { - store: T::STORE, - source: Box::new(e), + header_meta(location, response.headers(), Default::default()).map_err(|e| { + Error::Generic { + store: T::STORE, + source: Box::new(e), + } }) } } diff --git a/object_store/src/client/header.rs b/object_store/src/client/header.rs index cc4f16eaa599..b55494cdb812 100644 --- a/object_store/src/client/header.rs +++ b/object_store/src/client/header.rs @@ -19,11 +19,33 @@ use crate::path::Path; use crate::ObjectMeta; -use chrono::{DateTime, Utc}; +use chrono::{DateTime, TimeZone, Utc}; use hyper::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; use hyper::HeaderMap; use snafu::{OptionExt, ResultExt, Snafu}; +#[derive(Debug)] +/// Configuration for header extraction +pub struct HeaderConfig { + /// Whether to require an ETag header when extracting [`ObjectMeta`] from headers. + /// + /// Defaults to `true` + pub etag_required: bool, + /// Whether to require a Last-Modified header when extracting [`ObjectMeta`] from headers. + /// + /// Defaults to `true` + pub last_modified_required: bool, +} + +impl Default for HeaderConfig { + fn default() -> Self { + Self { + etag_required: true, + last_modified_required: true, + } + } +} + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("ETag Header missing from response"))] @@ -52,32 +74,44 @@ pub enum Error { } /// Extracts [`ObjectMeta`] from the provided [`HeaderMap`] -pub fn header_meta(location: &Path, headers: &HeaderMap) -> Result { - let last_modified = headers - .get(LAST_MODIFIED) - .context(MissingLastModifiedSnafu)?; +pub fn header_meta( + location: &Path, + headers: &HeaderMap, + cfg: HeaderConfig, +) -> Result { + let last_modified = match headers.get(LAST_MODIFIED) { + Some(last_modified) => { + let last_modified = last_modified.to_str().context(BadHeaderSnafu)?; + DateTime::parse_from_rfc2822(last_modified) + .context(InvalidLastModifiedSnafu { last_modified })? + .with_timezone(&Utc) + } + None if cfg.last_modified_required => return Err(Error::MissingLastModified), + None => Utc.timestamp_nanos(0), + }; + + let e_tag = match headers.get(ETAG) { + Some(e_tag) => { + let e_tag = e_tag.to_str().context(BadHeaderSnafu)?; + Some(e_tag.to_string()) + } + None if cfg.etag_required => return Err(Error::MissingEtag), + None => None, + }; let content_length = headers .get(CONTENT_LENGTH) .context(MissingContentLengthSnafu)?; - let last_modified = last_modified.to_str().context(BadHeaderSnafu)?; - let last_modified = DateTime::parse_from_rfc2822(last_modified) - .context(InvalidLastModifiedSnafu { last_modified })? 
- .with_timezone(&Utc); - let content_length = content_length.to_str().context(BadHeaderSnafu)?; let content_length = content_length .parse() .context(InvalidContentLengthSnafu { content_length })?; - let e_tag = headers.get(ETAG).context(MissingEtagSnafu)?; - let e_tag = e_tag.to_str().context(BadHeaderSnafu)?; - Ok(ObjectMeta { location: location.clone(), last_modified, size: content_length, - e_tag: Some(e_tag.to_string()), + e_tag, }) } diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index e8e7b459e12f..614381975625 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -40,7 +40,7 @@ use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; -use crate::client::header::header_meta; +use crate::client::header::{header_meta, HeaderConfig}; use crate::http::client::Client; use crate::path::Path; use crate::{ @@ -117,7 +117,12 @@ impl ObjectStore for HttpStore { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let range = options.range.clone(); let response = self.client.get(location, options).await?; - let meta = header_meta(location, response.headers()).context(MetadataSnafu)?; + let cfg = HeaderConfig { + last_modified_required: false, + etag_required: false, + }; + let meta = + header_meta(location, response.headers(), cfg).context(MetadataSnafu)?; let stream = response .bytes_stream() From 407e575f41365b73a84fb2a2150f918e6dab2bbe Mon Sep 17 00:00:00 2001 From: Cory Grinstead Date: Wed, 20 Sep 2023 09:22:44 -0500 Subject: [PATCH 1224/1411] Error if Remote Ignores HTTP Range Header (#4841) * fix: abort http:get on !206 when issuing a range request * add some comments * pr feedback * Update object_store/src/http/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/http/client.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index 93cd4ee0ea09..67a41291743d 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -37,6 +37,9 @@ enum Error { #[snafu(display("Request error: {}", source))] Reqwest { source: reqwest::Error }, + #[snafu(display("Range request not supported by {}", href))] + RangeNotSupported { href: String }, + #[snafu(display("Error decoding PROPFIND response: {}", source))] InvalidPropFind { source: quick_xml::de::DeError }, @@ -238,8 +241,9 @@ impl Client { pub async fn get(&self, location: &Path, options: GetOptions) -> Result { let url = self.path_url(location); let builder = self.client.get(url); + let has_range = options.range.is_some(); - builder + let res = builder .with_get_options(options) .send_retry(&self.retry_config) .await @@ -252,7 +256,19 @@ impl Client { } } _ => Error::Request { source }.into(), - }) + })?; + + // We expect a 206 Partial Content response if a range was requested + // a 200 OK response would indicate the server did not fulfill the request + if has_range && res.status() != StatusCode::PARTIAL_CONTENT { + return Err(crate::Error::NotSupported { + source: Box::new(Error::RangeNotSupported { + href: location.to_string(), + }), + }); + } + + Ok(res) } pub async fn copy(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { From f9cd26f7bc76fb36ffce9ac59036b8c7a0cbd34d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 20 Sep 2023 17:04:13 -0400 
Subject: [PATCH 1225/1411] Refine documentation to `Array::is_null` (#4838) * Add documentation and Array::is_logical_null * Remove code change, refine comments * fix docs * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Fix link formatting --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-array/src/array/mod.rs | 40 ++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 905ec1e5431b..9b66826f7584 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -173,20 +173,22 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// ``` fn offset(&self) -> usize; - /// Returns the null buffer of this array if any + /// Returns the null buffer of this array if any. /// - /// Note: some arrays can encode their nullability in their children, for example, + /// The null buffer encodes the "physical" nulls of an array. + /// However, some arrays can also encode nullability in their children, for example, /// [`DictionaryArray::values`] values or [`RunArray::values`], or without a null buffer, - /// such as [`NullArray`]. Use [`Array::logical_nulls`] to obtain a computed mask encoding this + /// such as [`NullArray`]. To determine if each element of such an array is logically null, + /// you can use the slower [`Array::logical_nulls`] to obtain a computed mask . fn nulls(&self) -> Option<&NullBuffer>; - /// Returns the logical null buffer of this array if any + /// Returns a potentially computed [`NullBuffer`] that represent the logical null values of this array, if any. /// /// In most cases this will be the same as [`Array::nulls`], except for: /// - /// * DictionaryArray where [`DictionaryArray::values`] contains nulls - /// * RunArray where [`RunArray::values`] contains nulls - /// * NullArray where all indices are nulls + /// * [`DictionaryArray`] where [`DictionaryArray::values`] contains nulls + /// * [`RunArray`] where [`RunArray::values`] contains nulls + /// * [`NullArray`] where all indices are nulls /// /// In these cases a logical [`NullBuffer`] will be computed, encoding the logical nullability /// of these arrays, beyond what is encoded in [`Array::nulls`] @@ -194,31 +196,33 @@ pub trait Array: std::fmt::Debug + Send + Sync { self.nulls().cloned() } - /// Returns whether the element at `index` is null. - /// When using this function on a slice, the index is relative to the slice. + /// Returns whether the element at `index` is null according to [`Array::nulls`] /// - /// Note: this method returns the physical nullability, i.e. that encoded in [`Array::nulls`] - /// see [`Array::logical_nulls`] for logical nullability + /// Note: For performance reasons, this method returns nullability solely as determined by the + /// null buffer. This difference can lead to surprising results, for example, [`NullArray::is_null`] always + /// returns `false` as the array lacks a null buffer. Similarly [`DictionaryArray`] and [`RunArray`] may + /// encode nullability in their children. See [`Self::logical_nulls`] for more information. 
/// /// # Example: /// /// ``` - /// use arrow_array::{Array, Int32Array}; + /// use arrow_array::{Array, Int32Array, NullArray}; /// /// let array = Int32Array::from(vec![Some(1), None]); - /// /// assert_eq!(array.is_null(0), false); /// assert_eq!(array.is_null(1), true); + /// + /// // NullArrays do not have a null buffer, and therefore always + /// // return false for is_null. + /// let array = NullArray::new(1); + /// assert_eq!(array.is_null(0), false); /// ``` fn is_null(&self, index: usize) -> bool { self.nulls().map(|n| n.is_null(index)).unwrap_or_default() } - /// Returns whether the element at `index` is not null. - /// When using this function on a slice, the index is relative to the slice. - /// - /// Note: this method returns the physical nullability, i.e. that encoded in [`Array::nulls`] - /// see [`Array::logical_nulls`] for logical nullability + /// Returns whether the element at `index` is *not* null, the + /// opposite of [`Self::is_null`]. /// /// # Example: /// From 8465ed4729cf4de8a5aa31d811170c0968c1bc59 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 22 Sep 2023 17:36:59 +0100 Subject: [PATCH 1226/1411] Update tonic-build requirement from =0.10.0 to =0.10.1 (#4846) Updates the requirements on [tonic-build](https://github.com/hyperium/tonic) to permit the latest version. - [Changelog](https://github.com/hyperium/tonic/blob/master/CHANGELOG.md) - [Commits](https://github.com/hyperium/tonic/compare/v0.10.0...v0.10.1) --- updated-dependencies: - dependency-name: tonic-build dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index c342170c5b72..50305579d833 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -34,4 +34,4 @@ publish = false # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.67", default-features = false } prost-build = { version = "=0.12.1", default-features = false } -tonic-build = { version = "=0.10.0", default-features = false, features = ["transport", "prost"] } +tonic-build = { version = "=0.10.1", default-features = false, features = ["transport", "prost"] } From 1de21d29ade3dbc48a89e150c00fd548f0b83aab Mon Sep 17 00:00:00 2001 From: JasonLi Date: Sun, 24 Sep 2023 00:21:41 +0800 Subject: [PATCH 1227/1411] fix: make_primitive_scalar bug (#4852) Co-authored-by: jasonnnli --- arrow-ord/src/comparison.rs | 42 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 4e475d8fd572..ffd35a6070b8 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -105,15 +105,17 @@ fn make_primitive_scalar( } DataType::Decimal128(_, _) => { let right = try_to_type!(scalar, to_i128)?; - Ok(Arc::new(PrimitiveArray::::from(vec![ - right, - ]))) + Ok(Arc::new( + PrimitiveArray::::from(vec![right]) + .with_data_type(d.clone()), + )) } DataType::Decimal256(_, _) => { let right = try_to_type!(scalar, to_i128)?; - Ok(Arc::new(PrimitiveArray::::from(vec![ - i256::from_i128(right), - ]))) + Ok(Arc::new( + PrimitiveArray::::from(vec![i256::from_i128(right)]) + .with_data_type(d.clone()), + )) } DataType::Date32 => { let right = try_to_type!(scalar, to_i32)?; @@ -125,27 
+127,31 @@ fn make_primitive_scalar( } DataType::Timestamp(TimeUnit::Nanosecond, _) => { let right = try_to_type!(scalar, to_i64)?; - Ok(Arc::new(PrimitiveArray::::from( - vec![right], - ))) + Ok(Arc::new( + PrimitiveArray::::from(vec![right]) + .with_data_type(d.clone()), + )) } DataType::Timestamp(TimeUnit::Microsecond, _) => { let right = try_to_type!(scalar, to_i64)?; - Ok(Arc::new(PrimitiveArray::::from( - vec![right], - ))) + Ok(Arc::new( + PrimitiveArray::::from(vec![right]) + .with_data_type(d.clone()), + )) } DataType::Timestamp(TimeUnit::Millisecond, _) => { let right = try_to_type!(scalar, to_i64)?; - Ok(Arc::new(PrimitiveArray::::from( - vec![right], - ))) + Ok(Arc::new( + PrimitiveArray::::from(vec![right]) + .with_data_type(d.clone()), + )) } DataType::Timestamp(TimeUnit::Second, _) => { let right = try_to_type!(scalar, to_i64)?; - Ok(Arc::new(PrimitiveArray::::from(vec![ - right, - ]))) + Ok(Arc::new( + PrimitiveArray::::from(vec![right]) + .with_data_type(d.clone()), + )) } DataType::Time32(TimeUnit::Second) => { let right = try_to_type!(scalar, to_i32)?; From 431be3facb0645528397aa800166089e4a21a834 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sat, 23 Sep 2023 18:18:45 +0100 Subject: [PATCH 1228/1411] Perform HEAD request for HttpStore::head (#4837) * Perform HEAD request for HttpStore::head * Logical merge conflicts * Review feedback --- object_store/src/client/get.rs | 20 ++++--- object_store/src/client/header.rs | 11 +--- object_store/src/client/mod.rs | 1 - object_store/src/http/client.rs | 90 +++++++++++++++++++------------ object_store/src/http/mod.rs | 47 +++------------- 5 files changed, 78 insertions(+), 91 deletions(-) diff --git a/object_store/src/client/get.rs b/object_store/src/client/get.rs index 8b84a079c7d5..333f6fe58475 100644 --- a/object_store/src/client/get.rs +++ b/object_store/src/client/get.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::client::header::header_meta; +use crate::client::header::{header_meta, HeaderConfig}; use crate::path::Path; use crate::{Error, GetOptions, GetResult, ObjectMeta}; use crate::{GetResultPayload, Result}; @@ -28,6 +28,12 @@ use reqwest::Response; pub trait GetClient: Send + Sync + 'static { const STORE: &'static str; + /// Configure the [`HeaderConfig`] for this client + const HEADER_CONFIG: HeaderConfig = HeaderConfig { + etag_required: true, + last_modified_required: true, + }; + async fn get_request( &self, path: &Path, @@ -49,10 +55,12 @@ impl GetClientExt for T { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let range = options.range.clone(); let response = self.get_request(location, options, false).await?; - let meta = header_meta(location, response.headers(), Default::default()) - .map_err(|e| Error::Generic { - store: T::STORE, - source: Box::new(e), + let meta = + header_meta(location, response.headers(), T::HEADER_CONFIG).map_err(|e| { + Error::Generic { + store: T::STORE, + source: Box::new(e), + } })?; let stream = response @@ -73,7 +81,7 @@ impl GetClientExt for T { async fn head(&self, location: &Path) -> Result { let options = GetOptions::default(); let response = self.get_request(location, options, true).await?; - header_meta(location, response.headers(), Default::default()).map_err(|e| { + header_meta(location, response.headers(), T::HEADER_CONFIG).map_err(|e| { Error::Generic { store: T::STORE, source: Box::new(e), diff --git a/object_store/src/client/header.rs b/object_store/src/client/header.rs index b55494cdb812..6499eff5aebe 100644 --- a/object_store/src/client/header.rs +++ b/object_store/src/client/header.rs @@ -24,7 +24,7 @@ use hyper::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; use hyper::HeaderMap; use snafu::{OptionExt, ResultExt, Snafu}; -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] /// Configuration for header extraction pub struct HeaderConfig { /// Whether to require an ETag header when extracting [`ObjectMeta`] from headers. @@ -37,15 +37,6 @@ pub struct HeaderConfig { pub last_modified_required: bool, } -impl Default for HeaderConfig { - fn default() -> Self { - Self { - etag_required: true, - last_modified_required: true, - } - } -} - #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("ETag Header missing from response"))] diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 77b14a7587d7..ee9d62a44f0c 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -27,7 +27,6 @@ pub mod retry; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod pagination; -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub mod get; #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index 67a41291743d..0bd2e5639cb5 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -15,11 +15,14 @@ // specific language governing permissions and limitations // under the License. 
+use crate::client::get::GetClient; +use crate::client::header::HeaderConfig; use crate::client::retry::{self, RetryConfig, RetryExt}; use crate::client::GetOptionsExt; use crate::path::{Path, DELIMITER}; use crate::util::deserialize_rfc1123; use crate::{ClientOptions, GetOptions, ObjectMeta, Result}; +use async_trait::async_trait; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use percent_encoding::percent_decode_str; @@ -238,39 +241,6 @@ impl Client { Ok(()) } - pub async fn get(&self, location: &Path, options: GetOptions) -> Result { - let url = self.path_url(location); - let builder = self.client.get(url); - let has_range = options.range.is_some(); - - let res = builder - .with_get_options(options) - .send_retry(&self.retry_config) - .await - .map_err(|source| match source.status() { - // Some stores return METHOD_NOT_ALLOWED for get on directories - Some(StatusCode::NOT_FOUND | StatusCode::METHOD_NOT_ALLOWED) => { - crate::Error::NotFound { - source: Box::new(source), - path: location.to_string(), - } - } - _ => Error::Request { source }.into(), - })?; - - // We expect a 206 Partial Content response if a range was requested - // a 200 OK response would indicate the server did not fulfill the request - if has_range && res.status() != StatusCode::PARTIAL_CONTENT { - return Err(crate::Error::NotSupported { - source: Box::new(Error::RangeNotSupported { - href: location.to_string(), - }), - }); - } - - Ok(res) - } - pub async fn copy(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { let mut retry = false; loop { @@ -307,6 +277,60 @@ impl Client { } } +#[async_trait] +impl GetClient for Client { + const STORE: &'static str = "HTTP"; + + /// Override the [`HeaderConfig`] to be less strict to support a + /// broader range of HTTP servers (#4831) + const HEADER_CONFIG: HeaderConfig = HeaderConfig { + etag_required: false, + last_modified_required: false, + }; + + async fn get_request( + &self, + location: &Path, + options: GetOptions, + head: bool, + ) -> Result { + let url = self.path_url(location); + let method = match head { + true => Method::HEAD, + false => Method::GET, + }; + let has_range = options.range.is_some(); + let builder = self.client.request(method, url); + + let res = builder + .with_get_options(options) + .send_retry(&self.retry_config) + .await + .map_err(|source| match source.status() { + // Some stores return METHOD_NOT_ALLOWED for get on directories + Some(StatusCode::NOT_FOUND | StatusCode::METHOD_NOT_ALLOWED) => { + crate::Error::NotFound { + source: Box::new(source), + path: location.to_string(), + } + } + _ => Error::Request { source }.into(), + })?; + + // We expect a 206 Partial Content response if a range was requested + // a 200 OK response would indicate the server did not fulfill the request + if has_range && res.status() != StatusCode::PARTIAL_CONTENT { + return Err(crate::Error::NotSupported { + source: Box::new(Error::RangeNotSupported { + href: location.to_string(), + }), + }); + } + + Ok(res) + } +} + /// The response returned by a PROPFIND request, i.e. 
list #[derive(Deserialize, Default)] pub struct MultiStatus { diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index 614381975625..afbc0ce4374a 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -34,18 +34,18 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; -use futures::{StreamExt, TryStreamExt}; +use futures::StreamExt; use itertools::Itertools; use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; use url::Url; -use crate::client::header::{header_meta, HeaderConfig}; +use crate::client::get::GetClientExt; use crate::http::client::Client; use crate::path::Path; use crate::{ - ClientConfigKey, ClientOptions, GetOptions, GetResult, GetResultPayload, ListResult, - MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig, + ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, + ObjectMeta, ObjectStore, Result, RetryConfig, }; mod client; @@ -115,46 +115,11 @@ impl ObjectStore for HttpStore { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - let range = options.range.clone(); - let response = self.client.get(location, options).await?; - let cfg = HeaderConfig { - last_modified_required: false, - etag_required: false, - }; - let meta = - header_meta(location, response.headers(), cfg).context(MetadataSnafu)?; - - let stream = response - .bytes_stream() - .map_err(|source| Error::Reqwest { source }.into()) - .boxed(); - - Ok(GetResult { - payload: GetResultPayload::Stream(stream), - range: range.unwrap_or(0..meta.size), - meta, - }) + self.client.get_opts(location, options).await } async fn head(&self, location: &Path) -> Result { - let status = self.client.list(Some(location), "0").await?; - match status.response.len() { - 1 => { - let response = status.response.into_iter().next().unwrap(); - response.check_ok()?; - match response.is_dir() { - true => Err(crate::Error::NotFound { - path: location.to_string(), - source: "Is directory".to_string().into(), - }), - false => response.object_meta(self.client.base_url()), - } - } - x => Err(crate::Error::NotFound { - path: location.to_string(), - source: format!("Expected 1 result, got {x}").into(), - }), - } + self.client.head(location).await } async fn delete(&self, location: &Path) -> Result<()> { From 6d5d7e36eff05be054221986f0e162a23cfe6a7e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 23 Sep 2023 13:18:56 -0400 Subject: [PATCH 1229/1411] Minor: Improve object_store docs.rs landing page (#4849) * Improve object_store docs.rs landing page * Apply suggestions from code review --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/http/mod.rs | 2 +- object_store/src/lib.rs | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index afbc0ce4374a..e9ed5902d8f5 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -17,7 +17,7 @@ //! An object store implementation for generic HTTP servers //! -//! This follows [rfc2518] commonly known called [WebDAV] +//! This follows [rfc2518] commonly known as [WebDAV] //! //! Basic get support will work out of the box with most HTTP servers, //! even those that don't explicitly support [rfc2518] diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 8d96ccf1dfc3..cef10f1dd418 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -36,7 +36,7 @@ //! 
clouds and local test environments, via a simple runtime //! configuration change. //! -//! # Features: +//! # Highlights //! //! 1. A focused, easy to use, idiomatic, well documented, high //! performance, `async` API. @@ -53,26 +53,31 @@ //! [InfluxDB IOx]: https://github.com/influxdata/influxdb_iox/ //! [crates.io]: https://github.com/rust-lang/crates.io //! -//! # Example: Create an [`ObjectStore`] implementation: +//! # Available [`ObjectStore`] Implementations +//! +//! By default, this crate provides the following implementations: +//! +//! * Memory: [`InMemory`](memory::InMemory) +//! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem) +//! +//! Feature flags are used to enable support for other implementations: //! #![cfg_attr( feature = "gcp", - doc = "* [Google Cloud Storage](https://cloud.google.com/storage/): [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)" + doc = "* `gcp`: [Google Cloud Storage](https://cloud.google.com/storage/) support. See [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)" )] #![cfg_attr( feature = "aws", - doc = "* [Amazon S3](https://aws.amazon.com/s3/): [`AmazonS3Builder`](aws::AmazonS3Builder)" + doc = "* `aws`: [Amazon S3](https://aws.amazon.com/s3/). See [`AmazonS3Builder`](aws::AmazonS3Builder)" )] #![cfg_attr( feature = "azure", - doc = "* [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/): [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)" + doc = "* `azure`: [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/). See [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)" )] #![cfg_attr( feature = "http", - doc = "* [HTTP Storage](https://datatracker.ietf.org/doc/html/rfc2518): [`HttpBuilder`](http::HttpBuilder)" + doc = "* `http`: [HTTP/WebDAV Storage](https://datatracker.ietf.org/doc/html/rfc2518). See [`HttpBuilder`](http::HttpBuilder)" )] -//! * In Memory: [`InMemory`](memory::InMemory) -//! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem) //! //! # Adapters //! 
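The landing-page documentation revised in the patch above centres on a single `ObjectStore` trait shared by the in-memory, local-filesystem, and cloud backends. As a minimal illustrative sketch only, the snippet below shows that trait-based usage against the bundled `InMemory` backend; it assumes the object_store 0.7-era signatures (`put` taking `Bytes`, `head` returning an `ObjectMeta` with a `size` field), a `tokio` runtime, and the `bytes` crate, and the path used is purely hypothetical, so exact types should be checked against the release actually in use.

use bytes::Bytes;
use object_store::{memory::InMemory, path::Path, ObjectStore};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // The same trait methods are available on the S3, GCS, Azure, HTTP and
    // local-filesystem implementations selected via feature flags.
    let store = InMemory::new();
    let path = Path::from("data/example.txt");

    // Write an object, inspect its metadata, then read it back.
    store.put(&path, Bytes::from("hello world")).await?;
    let meta = store.head(&path).await?;
    assert_eq!(meta.size, 11);

    let bytes = store.get(&path).await?.bytes().await?;
    assert_eq!(bytes.as_ref(), b"hello world");
    Ok(())
}

Swapping `InMemory::new()` for one of the builder-constructed cloud stores leaves the rest of the code unchanged, which is the portability point the landing page makes.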
From 72a2dab54a95d2d02151844cae63ee664d50d819 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 25 Sep 2023 10:45:01 +0100 Subject: [PATCH 1230/1411] Allow Constructing Non-Empty StructArray with no Fields (#4842) (#4845) --- arrow-array/src/array/struct_array.rs | 17 ++++++++++++++ arrow-array/src/builder/struct_builder.rs | 28 +++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 284c3b26a946..0e586ed1ef96 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -197,6 +197,23 @@ impl StructArray { } } + /// Create a new [`StructArray`] containing no fields + /// + /// # Panics + /// + /// If `len != nulls.len()` + pub fn new_empty_fields(len: usize, nulls: Option) -> Self { + if let Some(n) = &nulls { + assert_eq!(len, n.len()) + } + Self { + len, + data_type: DataType::Struct(Fields::empty()), + fields: vec![], + nulls, + } + } + /// Deconstruct this array into its constituent parts pub fn into_parts(self) -> (Fields, Vec, Option) { let f = match self.data_type { diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 0c878e621056..7aa91dacaa8c 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -233,6 +233,12 @@ impl StructBuilder { /// Builds the `StructArray` and reset this builder. pub fn finish(&mut self) -> StructArray { self.validate_content(); + if self.fields.is_empty() { + return StructArray::new_empty_fields( + self.len(), + self.null_buffer_builder.finish(), + ); + } let arrays = self.field_builders.iter_mut().map(|f| f.finish()).collect(); let nulls = self.null_buffer_builder.finish(); @@ -243,6 +249,13 @@ impl StructBuilder { pub fn finish_cloned(&self) -> StructArray { self.validate_content(); + if self.fields.is_empty() { + return StructArray::new_empty_fields( + self.len(), + self.null_buffer_builder.finish_cloned(), + ); + } + let arrays = self .field_builders .iter() @@ -591,4 +604,19 @@ mod tests { let mut sa = StructBuilder::new(fields, field_builders); sa.finish(); } + + #[test] + fn test_empty() { + let mut builder = StructBuilder::new(Fields::empty(), vec![]); + builder.append(true); + builder.append(false); + + let a1 = builder.finish_cloned(); + let a2 = builder.finish(); + assert_eq!(a1, a2); + assert_eq!(a1.len(), 2); + assert_eq!(a1.null_count(), 1); + assert!(a1.is_valid(0)); + assert!(a1.is_null(1)); + } } From b35511d293d72d3491330608325e38a6f5ca569a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 25 Sep 2023 10:45:16 +0100 Subject: [PATCH 1231/1411] Allow overriding azure endpoint (#4853) (#4854) --- object_store/src/azure/mod.rs | 37 ++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 2a07710d09d6..b210d486d9bf 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -325,6 +325,8 @@ pub struct MicrosoftAzureBuilder { url: Option, /// When set to true, azurite storage emulator has to be used use_emulator: ConfigValue, + /// Storage endpoint + endpoint: Option, /// Msi endpoint for acquiring managed identity token msi_endpoint: Option, /// Object id for use with managed identity authentication @@ -434,6 +436,14 @@ pub enum AzureConfigKey { /// - `use_emulator` 
UseEmulator, + /// Override the endpoint used to communicate with blob storage + /// + /// Supported keys: + /// - `azure_storage_endpoint` + /// - `azure_endpoint` + /// - `endpoint` + Endpoint, + /// Use object store with url scheme account.dfs.fabric.microsoft.com /// /// Supported keys: @@ -501,6 +511,7 @@ impl AsRef for AzureConfigKey { Self::Token => "azure_storage_token", Self::UseEmulator => "azure_storage_use_emulator", Self::UseFabricEndpoint => "azure_use_fabric_endpoint", + Self::Endpoint => "azure_storage_endpoint", Self::MsiEndpoint => "azure_msi_endpoint", Self::ObjectId => "azure_object_id", Self::MsiResourceId => "azure_msi_resource_id", @@ -542,6 +553,9 @@ impl FromStr for AzureConfigKey { | "sas_token" => Ok(Self::SasKey), "azure_storage_token" | "bearer_token" | "token" => Ok(Self::Token), "azure_storage_use_emulator" | "use_emulator" => Ok(Self::UseEmulator), + "azure_storage_endpoint" | "azure_endpoint" | "endpoint" => { + Ok(Self::Endpoint) + } "azure_msi_endpoint" | "azure_identity_endpoint" | "identity_endpoint" @@ -668,6 +682,7 @@ impl MicrosoftAzureBuilder { } AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), AzureConfigKey::UseEmulator => self.use_emulator.parse(value), + AzureConfigKey::Endpoint => self.endpoint = Some(value.into()), AzureConfigKey::UseFabricEndpoint => self.use_fabric_endpoint.parse(value), AzureConfigKey::Client(key) => { self.client_options = self.client_options.with_config(key, value) @@ -726,6 +741,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::UseFabricEndpoint => { Some(self.use_fabric_endpoint.to_string()) } + AzureConfigKey::Endpoint => self.endpoint.clone(), AzureConfigKey::MsiEndpoint => self.msi_endpoint.clone(), AzureConfigKey::ObjectId => self.object_id.clone(), AzureConfigKey::MsiResourceId => self.msi_resource_id.clone(), @@ -873,9 +889,19 @@ impl MicrosoftAzureBuilder { self } + /// Override the endpoint used to communicate with blob storage + /// + /// Defaults to `https://{account}.blob.core.windows.net` + pub fn with_endpoint(mut self, endpoint: String) -> Self { + self.endpoint = Some(endpoint); + self + } + /// Set if Microsoft Fabric url scheme should be used (defaults to false) /// When disabled the url scheme used is `https://{account}.blob.core.windows.net` /// When enabled the url scheme used is `https://{account}.dfs.fabric.microsoft.com` + /// + /// Note: [`Self::with_endpoint`] will take precedence over this option pub fn with_use_fabric_endpoint(mut self, use_fabric_endpoint: bool) -> Self { self.use_fabric_endpoint = use_fabric_endpoint.into(); self @@ -986,9 +1012,14 @@ impl MicrosoftAzureBuilder { (true, url, credential, account_name) } else { let account_name = self.account_name.ok_or(Error::MissingAccount {})?; - let account_url = match self.use_fabric_endpoint.get()? { - true => format!("https://{}.blob.fabric.microsoft.com", &account_name), - false => format!("https://{}.blob.core.windows.net", &account_name), + let account_url = match self.endpoint { + Some(account_url) => account_url, + None => match self.use_fabric_endpoint.get()? 
{ + true => { + format!("https://{}.blob.fabric.microsoft.com", &account_name) + } + false => format!("https://{}.blob.core.windows.net", &account_name), + }, }; let url = Url::parse(&account_url) From 7e7ac153c69a0b227ae11e0caf0f00b04b85cd23 Mon Sep 17 00:00:00 2001 From: Jonah Gao Date: Mon, 25 Sep 2023 17:45:55 +0800 Subject: [PATCH 1232/1411] fix: add missing precision overflow checking for `cast_string_to_decimal` (#4830) * fix: add missing precision overflow checking for `cast_string_to_decimal` * Add test_cast_string_to_decimal256_precision_overflow --- arrow-cast/src/cast.rs | 75 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 7 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 7b8e6144bb49..e7727565c981 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -2801,6 +2801,11 @@ where if cast_options.safe { let iter = from.iter().map(|v| { v.and_then(|v| parse_string_to_decimal_native::(v, scale as usize).ok()) + .and_then(|v| { + T::validate_decimal_precision(v, precision) + .is_ok() + .then_some(v) + }) }); // Benefit: // 20% performance improvement @@ -2815,13 +2820,17 @@ where .iter() .map(|v| { v.map(|v| { - parse_string_to_decimal_native::(v, scale as usize).map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - T::DATA_TYPE, - )) - }) + parse_string_to_decimal_native::(v, scale as usize) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast string '{}' to value of {:?} type", + v, + T::DATA_TYPE, + )) + }) + .and_then(|v| { + T::validate_decimal_precision(v, precision).map(|_| v) + }) }) .transpose() }) @@ -8152,6 +8161,32 @@ mod tests { ); } + #[test] + fn test_cast_string_to_decimal128_precision_overflow() { + let array = StringArray::from(vec!["1000".to_string()]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal128(10, 8), + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, + ); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + + let err = cast_with_options( + &array, + &DataType::Decimal128(10, 8), + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); + assert_eq!("Invalid argument error: 100000000000 is too large to store in a Decimal128 of precision 10. Max is 9999999999", err.unwrap_err().to_string()); + } + #[test] fn test_cast_utf8_to_decimal128_overflow() { let overflow_str_array = StringArray::from(vec![ @@ -8209,6 +8244,32 @@ mod tests { assert!(decimal_arr.is_null(6)); } + #[test] + fn test_cast_string_to_decimal256_precision_overflow() { + let array = StringArray::from(vec!["1000".to_string()]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal256(10, 8), + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, + ); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + + let err = cast_with_options( + &array, + &DataType::Decimal256(10, 8), + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); + assert_eq!("Invalid argument error: 100000000000 is too large to store in a Decimal256 of precision 10. 
Max is 9999999999", err.unwrap_err().to_string()); + } + #[test] fn test_cast_utf8_to_decimal256_overflow() { let overflow_str_array = StringArray::from(vec![ From 74e2c5cd23070d6803ce1e0dbfb78693d463d1c2 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Mon, 25 Sep 2023 07:31:00 -0400 Subject: [PATCH 1233/1411] Make ArrowRowGroupWriter Public and SerializedRowGroupWriter Send (#4850) * changes in supported of async parallel parquet writer * rename ChainReader * cargo fmt --- parquet/src/arrow/arrow_writer/mod.rs | 20 +++++++++++--------- parquet/src/file/writer.rs | 3 ++- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 5417ebe894a3..2e170738f1a8 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -248,7 +248,7 @@ impl RecordBatchWriter for ArrowWriter { /// A list of [`Bytes`] comprising a single column chunk #[derive(Default)] -struct ArrowColumnChunk { +pub struct ArrowColumnChunk { length: usize, data: Vec, } @@ -260,11 +260,13 @@ impl Length for ArrowColumnChunk { } impl ChunkReader for ArrowColumnChunk { - type T = ChainReader; + type T = ArrowColumnChunkReader; fn get_read(&self, start: u64) -> Result { assert_eq!(start, 0); // Assume append_column writes all data in one-shot - Ok(ChainReader(self.data.clone().into_iter().peekable())) + Ok(ArrowColumnChunkReader( + self.data.clone().into_iter().peekable(), + )) } fn get_bytes(&self, _start: u64, _length: usize) -> Result { @@ -273,9 +275,9 @@ impl ChunkReader for ArrowColumnChunk { } /// A [`Read`] for an iterator of [`Bytes`] -struct ChainReader(Peekable>); +pub struct ArrowColumnChunkReader(Peekable>); -impl Read for ChainReader { +impl Read for ArrowColumnChunkReader { fn read(&mut self, out: &mut [u8]) -> std::io::Result { let buffer = loop { match self.0.peek_mut() { @@ -362,14 +364,14 @@ impl ArrowColumnWriter { } /// Encodes [`RecordBatch`] to a parquet row group -struct ArrowRowGroupWriter { +pub struct ArrowRowGroupWriter { writers: Vec<(SharedColumnChunk, ArrowColumnWriter)>, schema: SchemaRef, buffered_rows: usize, } impl ArrowRowGroupWriter { - fn new( + pub fn new( parquet: &SchemaDescriptor, props: &WriterPropertiesPtr, arrow: &SchemaRef, @@ -386,7 +388,7 @@ impl ArrowRowGroupWriter { }) } - fn write(&mut self, batch: &RecordBatch) -> Result<()> { + pub fn write(&mut self, batch: &RecordBatch) -> Result<()> { self.buffered_rows += batch.num_rows(); let mut writers = self.writers.iter_mut().map(|(_, x)| x); for (array, field) in batch.columns().iter().zip(&self.schema.fields) { @@ -396,7 +398,7 @@ impl ArrowRowGroupWriter { Ok(()) } - fn close(self) -> Result> { + pub fn close(self) -> Result> { self.writers .into_iter() .map(|(chunk, writer)| { diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index cafb1761352d..859a0aa1f902 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -115,7 +115,8 @@ pub type OnCloseRowGroup<'a> = Box< Vec>, Vec>, ) -> Result<()> - + 'a, + + 'a + + Send, >; // ---------------------------------------------------------------------- From 2c9e2e9a95b9defd59d4ad59970b87a6fb7fa58c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 25 Sep 2023 18:24:15 +0100 Subject: [PATCH 1234/1411] Add ObjectStore BufReader (#4762) (#4857) * Add ObjectStore BufReader (#4762) * Clippy * More Clippy * Fix MSRV * Fix doc --- object_store/src/buffered.rs | 293 
+++++++++++++++++++++++++++++++++++ object_store/src/lib.rs | 1 + 2 files changed, 294 insertions(+) create mode 100644 object_store/src/buffered.rs diff --git a/object_store/src/buffered.rs b/object_store/src/buffered.rs new file mode 100644 index 000000000000..bdc3f4c772b9 --- /dev/null +++ b/object_store/src/buffered.rs @@ -0,0 +1,293 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utilities for performing tokio-style buffered IO + +use crate::path::Path; +use crate::{ObjectMeta, ObjectStore}; +use bytes::Bytes; +use futures::future::{BoxFuture, FutureExt}; +use futures::ready; +use std::cmp::Ordering; +use std::io::{Error, ErrorKind, SeekFrom}; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use tokio::io::{AsyncBufRead, AsyncRead, AsyncSeek, ReadBuf}; + +/// The default buffer size used by [`BufReader`] +pub const DEFAULT_BUFFER_SIZE: usize = 1024 * 1024; + +/// An async-buffered reader compatible with the tokio IO traits +/// +/// Internally this maintains a buffer of the requested size, and uses [`ObjectStore::get_range`] +/// to populate its internal buffer once depleted. This buffer is cleared on seek. +/// +/// Whilst simple, this interface will typically be outperformed by the native [`ObjectStore`] +/// methods that better map to the network APIs. This is because most object stores have +/// very [high first-byte latencies], on the order of 100-200ms, and so avoiding unnecessary +/// round-trips is critical to throughput. +/// +/// Systems looking to sequentially scan a file should instead consider using [`ObjectStore::get`], +/// or [`ObjectStore::get_opts`], or [`ObjectStore::get_range`] to read a particular range. +/// +/// Systems looking to read multiple ranges of a file should instead consider using +/// [`ObjectStore::get_ranges`], which will optimise the vectored IO. 
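// A minimal usage sketch for the `BufReader` defined below (not taken verbatim from this
// diff), assuming a `store: Arc<dyn ObjectStore>` that already holds an object at `path`;
// it mirrors the test at the end of this file:
//
//     use tokio::io::AsyncReadExt;
//
//     let meta = store.head(&path).await.unwrap();
//     let mut reader = BufReader::new(Arc::clone(&store), &meta);
//     let mut out = Vec::new();
//     reader.read_to_end(&mut out).await.unwrap();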
+/// +/// [high first-byte latencies]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html +pub struct BufReader { + /// The object store to fetch data from + store: Arc, + /// The size of the object + size: u64, + /// The path to the object + path: Path, + /// The current position in the object + cursor: u64, + /// The number of bytes to read in a single request + capacity: usize, + /// The buffered data if any + buffer: Buffer, +} + +impl std::fmt::Debug for BufReader { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("BufReader") + .field("path", &self.path) + .field("size", &self.size) + .field("capacity", &self.capacity) + .finish() + } +} + +enum Buffer { + Empty, + Pending(BoxFuture<'static, std::io::Result>), + Ready(Bytes), +} + +impl BufReader { + /// Create a new [`BufReader`] from the provided [`ObjectMeta`] and [`ObjectStore`] + pub fn new(store: Arc, meta: &ObjectMeta) -> Self { + Self::with_capacity(store, meta, DEFAULT_BUFFER_SIZE) + } + + /// Create a new [`BufReader`] from the provided [`ObjectMeta`], [`ObjectStore`], and `capacity` + pub fn with_capacity( + store: Arc, + meta: &ObjectMeta, + capacity: usize, + ) -> Self { + Self { + path: meta.location.clone(), + size: meta.size as _, + store, + capacity, + cursor: 0, + buffer: Buffer::Empty, + } + } + + fn poll_fill_buf_impl( + &mut self, + cx: &mut Context<'_>, + amnt: usize, + ) -> Poll> { + let buf = &mut self.buffer; + loop { + match buf { + Buffer::Empty => { + let store = Arc::clone(&self.store); + let path = self.path.clone(); + let start = self.cursor.min(self.size) as _; + let end = self.cursor.saturating_add(amnt as u64).min(self.size) as _; + + if start == end { + return Poll::Ready(Ok(&[])); + } + + *buf = Buffer::Pending(Box::pin(async move { + Ok(store.get_range(&path, start..end).await?) + })) + } + Buffer::Pending(fut) => match ready!(fut.poll_unpin(cx)) { + Ok(b) => *buf = Buffer::Ready(b), + Err(e) => return Poll::Ready(Err(e)), + }, + Buffer::Ready(r) => return Poll::Ready(Ok(r)), + } + } + } +} + +impl AsyncSeek for BufReader { + fn start_seek(mut self: Pin<&mut Self>, position: SeekFrom) -> std::io::Result<()> { + self.cursor = match position { + SeekFrom::Start(offset) => offset, + SeekFrom::End(offset) => { + checked_add_signed(self.size,offset).ok_or_else(|| Error::new(ErrorKind::InvalidInput, format!("Seeking {offset} from end of {} byte file would result in overflow", self.size)))? + } + SeekFrom::Current(offset) => { + checked_add_signed(self.cursor, offset).ok_or_else(|| Error::new(ErrorKind::InvalidInput, format!("Seeking {offset} from current offset of {} would result in overflow", self.cursor)))? 
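// The `End` and `Current` arms both funnel through `checked_add_signed` (defined further
// down in this file) because, per the comment on that helper, the standard-library
// equivalent needs Rust 1.66; an overflowing seek therefore surfaces as an
// `ErrorKind::InvalidInput` error rather than wrapping around.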
+ } + }; + self.buffer = Buffer::Empty; + Ok(()) + } + + fn poll_complete( + self: Pin<&mut Self>, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(Ok(self.cursor)) + } +} + +impl AsyncRead for BufReader { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + out: &mut ReadBuf<'_>, + ) -> Poll> { + // Read the maximum of the internal buffer and `out` + let to_read = out.remaining().max(self.capacity); + let r = match ready!(self.poll_fill_buf_impl(cx, to_read)) { + Ok(buf) => { + let to_consume = out.remaining().min(buf.len()); + out.put_slice(&buf[..to_consume]); + self.consume(to_consume); + Ok(()) + } + Err(e) => Err(e), + }; + Poll::Ready(r) + } +} + +impl AsyncBufRead for BufReader { + fn poll_fill_buf( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + let capacity = self.capacity; + self.get_mut().poll_fill_buf_impl(cx, capacity) + } + + fn consume(mut self: Pin<&mut Self>, amt: usize) { + match &mut self.buffer { + Buffer::Empty => assert_eq!(amt, 0, "cannot consume from empty buffer"), + Buffer::Ready(b) => match b.len().cmp(&amt) { + Ordering::Less => panic!("{amt} exceeds buffer sized of {}", b.len()), + Ordering::Greater => *b = b.slice(amt..), + Ordering::Equal => self.buffer = Buffer::Empty, + }, + Buffer::Pending(_) => panic!("cannot consume from pending buffer"), + } + self.cursor += amt as u64; + } +} + +/// Port of standardised function as requires Rust 1.66 +/// +/// +#[inline] +fn checked_add_signed(a: u64, rhs: i64) -> Option { + let (res, overflowed) = a.overflowing_add(rhs as _); + let overflow = overflowed ^ (rhs < 0); + (!overflow).then_some(res) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::memory::InMemory; + use crate::path::Path; + use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncSeekExt}; + + #[tokio::test] + async fn test_buf_reader() { + let store = Arc::new(InMemory::new()) as Arc; + + let existent = Path::from("exists.txt"); + const BYTES: usize = 4096; + + let data: Bytes = b"12345678".iter().cycle().copied().take(BYTES).collect(); + store.put(&existent, data.clone()).await.unwrap(); + + let meta = store.head(&existent).await.unwrap(); + + let mut reader = BufReader::new(Arc::clone(&store), &meta); + let mut out = Vec::with_capacity(BYTES); + let read = reader.read_to_end(&mut out).await.unwrap(); + + assert_eq!(read, BYTES); + assert_eq!(&out, &data); + + let err = reader.seek(SeekFrom::Current(i64::MIN)).await.unwrap_err(); + assert_eq!(err.to_string(), "Seeking -9223372036854775808 from current offset of 4096 would result in overflow"); + + reader.rewind().await.unwrap(); + + let err = reader.seek(SeekFrom::Current(-1)).await.unwrap_err(); + assert_eq!( + err.to_string(), + "Seeking -1 from current offset of 0 would result in overflow" + ); + + // Seeking beyond the bounds of the file is permitted but should return no data + reader.seek(SeekFrom::Start(u64::MAX)).await.unwrap(); + let buf = reader.fill_buf().await.unwrap(); + assert!(buf.is_empty()); + + let err = reader.seek(SeekFrom::Current(1)).await.unwrap_err(); + assert_eq!(err.to_string(), "Seeking 1 from current offset of 18446744073709551615 would result in overflow"); + + for capacity in [200, 1024, 4096, DEFAULT_BUFFER_SIZE] { + let store = Arc::clone(&store); + let mut reader = BufReader::with_capacity(store, &meta, capacity); + + let mut bytes_read = 0; + loop { + let buf = reader.fill_buf().await.unwrap(); + if buf.is_empty() { + assert_eq!(bytes_read, BYTES); + break; + } + assert!(buf.starts_with(b"12345678")); + bytes_read += 8; + 
reader.consume(8); + } + + let mut buf = Vec::with_capacity(76); + reader.seek(SeekFrom::Current(-76)).await.unwrap(); + reader.read_to_end(&mut buf).await.unwrap(); + assert_eq!(&buf, &data[BYTES - 76..]); + + reader.rewind().await.unwrap(); + let buffer = reader.fill_buf().await.unwrap(); + assert_eq!(buffer, &data[..capacity.min(BYTES)]); + + reader.seek(SeekFrom::Start(325)).await.unwrap(); + let buffer = reader.fill_buf().await.unwrap(); + assert_eq!(buffer, &data[325..(325 + capacity).min(BYTES)]); + + reader.seek(SeekFrom::End(0)).await.unwrap(); + let buffer = reader.fill_buf().await.unwrap(); + assert!(buffer.is_empty()); + } + } +} diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index cef10f1dd418..3fd363fd4f06 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -253,6 +253,7 @@ compile_error!("Features 'gcp', 'aws', 'azure', 'http' are not supported on wasm pub mod aws; #[cfg(feature = "azure")] pub mod azure; +pub mod buffered; #[cfg(not(target_arch = "wasm32"))] pub mod chunked; pub mod delimited; From fbd9008d31e51018494c48eff032a77b93fab56a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 26 Sep 2023 16:55:50 +0100 Subject: [PATCH 1235/1411] Faster Serde Integration (~80% faster) (#4861) * Store decoded numerics in JSON tape * Add arrow-json serde benchmarks * Fix timestamp serialize * Clippy --- arrow-json/Cargo.toml | 7 +++ arrow-json/benches/serde.rs | 62 ++++++++++++++++++++++++ arrow-json/src/reader/primitive_array.rs | 46 ++++++++++++++---- arrow-json/src/reader/serializer.rs | 56 +++++++++++---------- arrow-json/src/reader/tape.rs | 51 ++++++++++++++++++- arrow-json/src/reader/timestamp_array.rs | 7 +++ 6 files changed, 192 insertions(+), 37 deletions(-) create mode 100644 arrow-json/benches/serde.rs diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 977ed4390c99..df38a52811c2 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -54,3 +54,10 @@ serde = { version = "1.0", default-features = false, features = ["derive"] } futures = "0.3" tokio = { version = "1.27", default-features = false, features = ["io-util"] } bytes = "1.4" +criterion = { version = "0.5", default-features = false } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } + +[[bench]] +name = "serde" +harness = false + diff --git a/arrow-json/benches/serde.rs b/arrow-json/benches/serde.rs new file mode 100644 index 000000000000..7636b9c9dff9 --- /dev/null +++ b/arrow-json/benches/serde.rs @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
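// With the `[[bench]]` entry added to arrow-json/Cargo.toml above (criterion,
// harness = false), this benchmark would normally be run with something like
// `cargo bench -p arrow-json --bench serde`.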
+ +use arrow_json::ReaderBuilder; +use arrow_schema::{DataType, Field, Schema}; +use criterion::*; +use rand::{thread_rng, Rng}; +use serde::Serialize; +use std::sync::Arc; + +#[allow(deprecated)] +fn do_bench(c: &mut Criterion, name: &str, rows: &[R], schema: &Schema) { + let schema = Arc::new(schema.clone()); + c.bench_function(name, |b| { + b.iter(|| { + let builder = ReaderBuilder::new(schema.clone()).with_batch_size(64); + let mut decoder = builder.build_decoder().unwrap(); + decoder.serialize(rows) + }) + }); +} + +fn criterion_benchmark(c: &mut Criterion) { + let mut rng = thread_rng(); + let schema = Schema::new(vec![Field::new("i32", DataType::Int32, false)]); + let v: Vec = (0..2048).map(|_| rng.gen_range(0..10000)).collect(); + + do_bench(c, "small_i32", &v, &schema); + let v: Vec = (0..2048).map(|_| rng.gen()).collect(); + do_bench(c, "large_i32", &v, &schema); + + let schema = Schema::new(vec![Field::new("i64", DataType::Int64, false)]); + let v: Vec = (0..2048).map(|_| rng.gen_range(0..10000)).collect(); + do_bench(c, "small_i64", &v, &schema); + let v: Vec = (0..2048).map(|_| rng.gen_range(0..i32::MAX as _)).collect(); + do_bench(c, "medium_i64", &v, &schema); + let v: Vec = (0..2048).map(|_| rng.gen()).collect(); + do_bench(c, "large_i64", &v, &schema); + + let schema = Schema::new(vec![Field::new("f32", DataType::Float32, false)]); + let v: Vec = (0..2048).map(|_| rng.gen_range(0.0..10000.)).collect(); + do_bench(c, "small_f32", &v, &schema); + let v: Vec = (0..2048).map(|_| rng.gen_range(0.0..f32::MAX)).collect(); + do_bench(c, "large_f32", &v, &schema); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-json/src/reader/primitive_array.rs b/arrow-json/src/reader/primitive_array.rs index c78e4d914060..6cf0bac86737 100644 --- a/arrow-json/src/reader/primitive_array.rs +++ b/arrow-json/src/reader/primitive_array.rs @@ -91,11 +91,12 @@ impl PrimitiveArrayDecoder

<P> { impl<P> ArrayDecoder for PrimitiveArrayDecoder<P> where P: ArrowPrimitiveType + Parser, - P::Native: ParseJsonNumber, + P::Native: ParseJsonNumber + NumCast, { fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData> { let mut builder = PrimitiveBuilder::<P>
::with_capacity(pos.len()) .with_data_type(self.data_type.clone()); + let d = &self.data_type; for p in pos { match tape.get(*p) { @@ -103,10 +104,7 @@ where TapeElement::String(idx) => { let s = tape.get_string(idx); let value = P::parse(s).ok_or_else(|| { - ArrowError::JsonError(format!( - "failed to parse \"{s}\" as {}", - self.data_type - )) + ArrowError::JsonError(format!("failed to parse \"{s}\" as {d}",)) })?; builder.append_value(value) @@ -115,14 +113,44 @@ where let s = tape.get_string(idx); let value = ParseJsonNumber::parse(s.as_bytes()).ok_or_else(|| { - ArrowError::JsonError(format!( - "failed to parse {s} as {}", - self.data_type - )) + ArrowError::JsonError(format!("failed to parse {s} as {d}",)) })?; builder.append_value(value) } + TapeElement::F32(v) => { + let v = f32::from_bits(v); + let value = NumCast::from(v).ok_or_else(|| { + ArrowError::JsonError(format!("failed to parse {v} as {d}",)) + })?; + builder.append_value(value) + } + TapeElement::I32(v) => { + let value = NumCast::from(v).ok_or_else(|| { + ArrowError::JsonError(format!("failed to parse {v} as {d}",)) + })?; + builder.append_value(value) + } + TapeElement::F64(high) => match tape.get(p + 1) { + TapeElement::F32(low) => { + let v = f64::from_bits((high as u64) << 32 | low as u64); + let value = NumCast::from(v).ok_or_else(|| { + ArrowError::JsonError(format!("failed to parse {v} as {d}",)) + })?; + builder.append_value(value) + } + _ => unreachable!(), + }, + TapeElement::I64(high) => match tape.get(p + 1) { + TapeElement::I32(low) => { + let v = (high as i64) << 32 | low as i64; + let value = NumCast::from(v).ok_or_else(|| { + ArrowError::JsonError(format!("failed to parse {v} as {d}",)) + })?; + builder.append_value(value) + } + _ => unreachable!(), + }, _ => return Err(tape.error(*p, "primitive")), } } diff --git a/arrow-json/src/reader/serializer.rs b/arrow-json/src/reader/serializer.rs index 2aa72de943f7..2fd250bdfcc3 100644 --- a/arrow-json/src/reader/serializer.rs +++ b/arrow-json/src/reader/serializer.rs @@ -77,22 +77,6 @@ impl<'a> TapeSerializer<'a> { } } -/// The tape stores all values as strings, and so must serialize numeric types -/// -/// Formatting to a string only to parse it back again is rather wasteful, -/// it may be possible to tweak the tape representation to avoid this -/// -/// Need to use macro as const generic expressions are unstable -/// -macro_rules! 
serialize_numeric { - ($s:ident, $t:ty, $v:ident) => {{ - let mut buffer = [0_u8; <$t>::FORMATTED_SIZE]; - let s = lexical_core::write($v, &mut buffer); - $s.serialize_number(s); - Ok(()) - }}; -} - impl<'a, 'b> Serializer for &'a mut TapeSerializer<'b> { type Ok = (); @@ -115,43 +99,63 @@ impl<'a, 'b> Serializer for &'a mut TapeSerializer<'b> { } fn serialize_i8(self, v: i8) -> Result<(), SerializerError> { - serialize_numeric!(self, i8, v) + self.serialize_i32(v as _) } fn serialize_i16(self, v: i16) -> Result<(), SerializerError> { - serialize_numeric!(self, i16, v) + self.serialize_i32(v as _) } fn serialize_i32(self, v: i32) -> Result<(), SerializerError> { - serialize_numeric!(self, i32, v) + self.elements.push(TapeElement::I32(v)); + Ok(()) } fn serialize_i64(self, v: i64) -> Result<(), SerializerError> { - serialize_numeric!(self, i64, v) + let low = v as i32; + let high = (v >> 32) as i32; + self.elements.push(TapeElement::I64(high)); + self.elements.push(TapeElement::I32(low)); + Ok(()) } fn serialize_u8(self, v: u8) -> Result<(), SerializerError> { - serialize_numeric!(self, u8, v) + self.serialize_i32(v as _) } fn serialize_u16(self, v: u16) -> Result<(), SerializerError> { - serialize_numeric!(self, u16, v) + self.serialize_i32(v as _) } fn serialize_u32(self, v: u32) -> Result<(), SerializerError> { - serialize_numeric!(self, u32, v) + match i32::try_from(v) { + Ok(v) => self.serialize_i32(v), + Err(_) => self.serialize_i64(v as _), + } } fn serialize_u64(self, v: u64) -> Result<(), SerializerError> { - serialize_numeric!(self, u64, v) + match i64::try_from(v) { + Ok(v) => self.serialize_i64(v), + Err(_) => { + let mut buffer = [0_u8; u64::FORMATTED_SIZE]; + let s = lexical_core::write(v, &mut buffer); + self.serialize_number(s); + Ok(()) + } + } } fn serialize_f32(self, v: f32) -> Result<(), SerializerError> { - serialize_numeric!(self, f32, v) + self.elements.push(TapeElement::F32(v.to_bits())); + Ok(()) } fn serialize_f64(self, v: f64) -> Result<(), SerializerError> { - serialize_numeric!(self, f64, v) + let bits = v.to_bits(); + self.elements.push(TapeElement::F64((bits >> 32) as u32)); + self.elements.push(TapeElement::F32(bits as u32)); + Ok(()) } fn serialize_char(self, v: char) -> Result<(), SerializerError> { diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index 5eca7b43dcc7..801e8f29d525 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -18,6 +18,7 @@ use crate::reader::serializer::TapeSerializer; use arrow_schema::ArrowError; use serde::Serialize; +use std::fmt::Write; /// We decode JSON to a flattened tape representation, /// allowing for efficient traversal of the JSON data @@ -54,6 +55,25 @@ pub enum TapeElement { /// /// Contains the offset into the [`Tape`] string data Number(u32), + + /// The high bits of a i64 + /// + /// Followed by [`Self::I32`] containing the low bits + I64(i32), + + /// A 32-bit signed integer + /// + /// May be preceded by [`Self::I64`] containing high bits + I32(i32), + + /// The high bits of a 64-bit float + /// + /// Followed by [`Self::F32`] containing the low bits + F64(u32), + + /// A 32-bit float or the low-bits of a 64-bit float if preceded by [`Self::F64`] + F32(u32), + /// A true literal True, /// A false literal @@ -104,10 +124,15 @@ impl<'a> Tape<'a> { | TapeElement::Number(_) | TapeElement::True | TapeElement::False - | TapeElement::Null => Ok(cur_idx + 1), + | TapeElement::Null + | TapeElement::I32(_) + | TapeElement::F32(_) => Ok(cur_idx + 1), + 
TapeElement::I64(_) | TapeElement::F64(_) => Ok(cur_idx + 2), TapeElement::StartList(end_idx) => Ok(end_idx + 1), TapeElement::StartObject(end_idx) => Ok(end_idx + 1), - _ => Err(self.error(cur_idx, expected)), + TapeElement::EndObject(_) | TapeElement::EndList(_) => { + Err(self.error(cur_idx, expected)) + } } } @@ -153,6 +178,28 @@ impl<'a> Tape<'a> { TapeElement::True => out.push_str("true"), TapeElement::False => out.push_str("false"), TapeElement::Null => out.push_str("null"), + TapeElement::I64(high) => match self.get(idx + 1) { + TapeElement::I32(low) => { + let val = (high as i64) << 32 | low as i64; + let _ = write!(out, "{val}"); + return idx + 2; + } + _ => unreachable!(), + }, + TapeElement::I32(val) => { + let _ = write!(out, "{val}"); + } + TapeElement::F64(high) => match self.get(idx + 1) { + TapeElement::F32(low) => { + let val = f64::from_bits((high as u64) << 32 | low as u64); + let _ = write!(out, "{val}"); + return idx + 2; + } + _ => unreachable!(), + }, + TapeElement::F32(val) => { + let _ = write!(out, "{}", f32::from_bits(val)); + } } idx + 1 } diff --git a/arrow-json/src/reader/timestamp_array.rs b/arrow-json/src/reader/timestamp_array.rs index b80915f6a56a..09672614107c 100644 --- a/arrow-json/src/reader/timestamp_array.rs +++ b/arrow-json/src/reader/timestamp_array.rs @@ -96,6 +96,13 @@ where builder.append_value(value) } + TapeElement::I32(v) => builder.append_value(v as i64), + TapeElement::I64(high) => match tape.get(p + 1) { + TapeElement::I32(low) => { + builder.append_value((high as i64) << 32 | low as i64) + } + _ => unreachable!(), + }, _ => return Err(tape.error(*p, "primitive")), } } From 4ef7917bd57b701e30def8511b5fd8a7961f2fcf Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 26 Sep 2023 16:56:27 +0100 Subject: [PATCH 1236/1411] Prepare object_store 0.7.1 (#4860) --- object_store/CHANGELOG-old.md | 47 ++++++++++++++ object_store/CHANGELOG.md | 64 +++++++++---------- object_store/Cargo.toml | 2 +- object_store/dev/release/update_change_log.sh | 4 +- 4 files changed, 81 insertions(+), 36 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index 3880205bc05e..a0ced7c8d21e 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -19,6 +19,53 @@ # Historical Changelog +## [object_store_0.7.0](https://github.com/apache/arrow-rs/tree/object_store_0.7.0) (2023-08-15) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.6.1...object_store_0.7.0) + +**Breaking changes:** + +- Add range and ObjectMeta to GetResult \(\#4352\) \(\#4495\) [\#4677](https://github.com/apache/arrow-rs/pull/4677) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- Add AzureConfigKey::ContainerName [\#4629](https://github.com/apache/arrow-rs/issues/4629) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: multipart ranges for HTTP [\#4612](https://github.com/apache/arrow-rs/issues/4612) +- Make object\_store::multipart public [\#4569](https://github.com/apache/arrow-rs/issues/4569) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Export `ClientConfigKey` and make the `HttpBuilder` more consistent with other builders [\#4515](https://github.com/apache/arrow-rs/issues/4515) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- 
object\_store/InMemory: Make `clone()` non-async [\#4496](https://github.com/apache/arrow-rs/issues/4496) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add Range to GetResult::File [\#4352](https://github.com/apache/arrow-rs/issues/4352) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support copy\_if\_not\_exists for Cloudflare R2 \(S3 API\) [\#4190](https://github.com/apache/arrow-rs/issues/4190) + +**Fixed bugs:** + +- object\_store documentation is broken [\#4683](https://github.com/apache/arrow-rs/issues/4683) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Exports are not sufficient for configuring some object stores, for example minio running locally [\#4530](https://github.com/apache/arrow-rs/issues/4530) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Uploading empty file to S3 results in "411 Length Required" [\#4514](https://github.com/apache/arrow-rs/issues/4514) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- GCP doesn't fetch public objects [\#4417](https://github.com/apache/arrow-rs/issues/4417) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Closed issues:** + +- \[object\_store\] when Create a AmazonS3 instance work with MinIO without set endpoint got error MissingRegion [\#4617](https://github.com/apache/arrow-rs/issues/4617) +- AWS Profile credentials no longer working in object\_store 0.6.1 [\#4556](https://github.com/apache/arrow-rs/issues/4556) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- Add AzureConfigKey::ContainerName \(\#4629\) [\#4686](https://github.com/apache/arrow-rs/pull/4686) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix MSRV CI [\#4671](https://github.com/apache/arrow-rs/pull/4671) ([tustvold](https://github.com/tustvold)) +- Use Config System for Object Store Integration Tests [\#4628](https://github.com/apache/arrow-rs/pull/4628) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Prepare arrow 45 [\#4590](https://github.com/apache/arrow-rs/pull/4590) ([tustvold](https://github.com/tustvold)) +- Add Support for Microsoft Fabric / OneLake [\#4573](https://github.com/apache/arrow-rs/pull/4573) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([vmuddassir-msft](https://github.com/vmuddassir-msft)) +- Cleanup multipart upload trait [\#4572](https://github.com/apache/arrow-rs/pull/4572) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make object\_store::multipart public [\#4570](https://github.com/apache/arrow-rs/pull/4570) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([yjshen](https://github.com/yjshen)) +- Handle empty S3 payloads \(\#4514\) [\#4518](https://github.com/apache/arrow-rs/pull/4518) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- object\_store: Export `ClientConfigKey` and add `HttpBuilder::with_config` [\#4516](https://github.com/apache/arrow-rs/pull/4516) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([thehabbos007](https://github.com/thehabbos007)) +- object\_store: Implement `ObjectStore` for `Arc` 
[\#4502](https://github.com/apache/arrow-rs/pull/4502) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- object\_store/InMemory: Add `fork()` fn and deprecate `clone()` fn [\#4499](https://github.com/apache/arrow-rs/pull/4499) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- Bump actions/deploy-pages from 1 to 2 [\#4449](https://github.com/apache/arrow-rs/pull/4449) ([dependabot[bot]](https://github.com/apps/dependabot)) +- gcp: Exclude authorization header when bearer empty [\#4418](https://github.com/apache/arrow-rs/pull/4418) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([vrongmeal](https://github.com/vrongmeal)) +- Support copy\_if\_not\_exists for Cloudflare R2 \(\#4190\) [\#4239](https://github.com/apache/arrow-rs/pull/4239) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + ## [object_store_0.6.0](https://github.com/apache/arrow-rs/tree/object_store_0.6.0) (2023-05-18) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.5.6...object_store_0.6.0) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index 125063943726..1f069ce41eac 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,51 +19,49 @@ # Changelog -## [object_store_0.7.0](https://github.com/apache/arrow-rs/tree/object_store_0.7.0) (2023-08-15) +## [object_store_0.7.1](https://github.com/apache/arrow-rs/tree/object_store_0.7.1) (2023-09-26) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.6.1...object_store_0.7.0) - -**Breaking changes:** - -- Add range and ObjectMeta to GetResult \(\#4352\) \(\#4495\) [\#4677](https://github.com/apache/arrow-rs/pull/4677) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.7.0...object_store_0.7.1) **Implemented enhancements:** -- Add AzureConfigKey::ContainerName [\#4629](https://github.com/apache/arrow-rs/issues/4629) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: multipart ranges for HTTP [\#4612](https://github.com/apache/arrow-rs/issues/4612) -- Make object\_store::multipart public [\#4569](https://github.com/apache/arrow-rs/issues/4569) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Export `ClientConfigKey` and make the `HttpBuilder` more consistent with other builders [\#4515](https://github.com/apache/arrow-rs/issues/4515) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store/InMemory: Make `clone()` non-async [\#4496](https://github.com/apache/arrow-rs/issues/4496) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add Range to GetResult::File [\#4352](https://github.com/apache/arrow-rs/issues/4352) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support copy\_if\_not\_exists for Cloudflare R2 \(S3 API\) [\#4190](https://github.com/apache/arrow-rs/issues/4190) +- Automatically Cleanup LocalFileSystem Temporary Files [\#4778](https://github.com/apache/arrow-rs/issues/4778) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store: Expose an async reader API for object store [\#4762](https://github.com/apache/arrow-rs/issues/4762) +- 
Improve proxy support by using reqwest::Proxy as configuration [\#4713](https://github.com/apache/arrow-rs/issues/4713) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- object\_store documentation is broken [\#4683](https://github.com/apache/arrow-rs/issues/4683) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Exports are not sufficient for configuring some object stores, for example minio running locally [\#4530](https://github.com/apache/arrow-rs/issues/4530) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object\_store: Uploading empty file to S3 results in "411 Length Required" [\#4514](https://github.com/apache/arrow-rs/issues/4514) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- GCP doesn't fetch public objects [\#4417](https://github.com/apache/arrow-rs/issues/4417) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store: http shouldn't perform range requests unless `accept-ranges: bytes` header is present [\#4839](https://github.com/apache/arrow-rs/issues/4839) +- object-store: http-store fails when url doesn't have last-modified header on 0.7.0 [\#4831](https://github.com/apache/arrow-rs/issues/4831) +- object-store fails to compile for `wasm32-unknown-unknown` with `http` feature [\#4776](https://github.com/apache/arrow-rs/issues/4776) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store: could not find `header` in `client` for `http` feature [\#4775](https://github.com/apache/arrow-rs/issues/4775) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- LocalFileSystem Copy and Rename Don't Create Intermediate Directories [\#4760](https://github.com/apache/arrow-rs/issues/4760) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- LocalFileSystem Copy is not Atomic [\#4758](https://github.com/apache/arrow-rs/issues/4758) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Closed issues:** -- \[object\_store\] when Create a AmazonS3 instance work with MinIO without set endpoint got error MissingRegion [\#4617](https://github.com/apache/arrow-rs/issues/4617) -- AWS Profile credentials no longer working in object\_store 0.6.1 [\#4556](https://github.com/apache/arrow-rs/issues/4556) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store Azure Government Cloud functionality? 
[\#4853](https://github.com/apache/arrow-rs/issues/4853) **Merged pull requests:** -- Add AzureConfigKey::ContainerName \(\#4629\) [\#4686](https://github.com/apache/arrow-rs/pull/4686) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Fix MSRV CI [\#4671](https://github.com/apache/arrow-rs/pull/4671) ([tustvold](https://github.com/tustvold)) -- Use Config System for Object Store Integration Tests [\#4628](https://github.com/apache/arrow-rs/pull/4628) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Prepare arrow 45 [\#4590](https://github.com/apache/arrow-rs/pull/4590) ([tustvold](https://github.com/tustvold)) -- Add Support for Microsoft Fabric / OneLake [\#4573](https://github.com/apache/arrow-rs/pull/4573) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([vmuddassir-msft](https://github.com/vmuddassir-msft)) -- Cleanup multipart upload trait [\#4572](https://github.com/apache/arrow-rs/pull/4572) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Make object\_store::multipart public [\#4570](https://github.com/apache/arrow-rs/pull/4570) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([yjshen](https://github.com/yjshen)) -- Handle empty S3 payloads \(\#4514\) [\#4518](https://github.com/apache/arrow-rs/pull/4518) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- object\_store: Export `ClientConfigKey` and add `HttpBuilder::with_config` [\#4516](https://github.com/apache/arrow-rs/pull/4516) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([thehabbos007](https://github.com/thehabbos007)) -- object\_store: Implement `ObjectStore` for `Arc` [\#4502](https://github.com/apache/arrow-rs/pull/4502) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) -- object\_store/InMemory: Add `fork()` fn and deprecate `clone()` fn [\#4499](https://github.com/apache/arrow-rs/pull/4499) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) -- Bump actions/deploy-pages from 1 to 2 [\#4449](https://github.com/apache/arrow-rs/pull/4449) ([dependabot[bot]](https://github.com/apps/dependabot)) -- gcp: Exclude authorization header when bearer empty [\#4418](https://github.com/apache/arrow-rs/pull/4418) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([vrongmeal](https://github.com/vrongmeal)) -- Support copy\_if\_not\_exists for Cloudflare R2 \(\#4190\) [\#4239](https://github.com/apache/arrow-rs/pull/4239) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore BufReader \(\#4762\) [\#4857](https://github.com/apache/arrow-rs/pull/4857) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Allow overriding azure endpoint [\#4854](https://github.com/apache/arrow-rs/pull/4854) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Minor: Improve object\_store docs.rs landing page [\#4849](https://github.com/apache/arrow-rs/pull/4849) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([alamb](https://github.com/alamb)) +- Error if Remote Ignores HTTP Range Header [\#4841](https://github.com/apache/arrow-rs/pull/4841) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([universalmind303](https://github.com/universalmind303)) +- Perform HEAD request for HttpStore::head [\#4837](https://github.com/apache/arrow-rs/pull/4837) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- fix: object store http header last modified [\#4834](https://github.com/apache/arrow-rs/pull/4834) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([universalmind303](https://github.com/universalmind303)) +- Prepare arrow 47.0.0 [\#4827](https://github.com/apache/arrow-rs/pull/4827) ([tustvold](https://github.com/tustvold)) +- ObjectStore Wasm32 Fixes \(\#4775\) \(\#4776\) [\#4796](https://github.com/apache/arrow-rs/pull/4796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Best effort cleanup of staged upload files \(\#4778\) [\#4792](https://github.com/apache/arrow-rs/pull/4792) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Relaxing type bounds on coalesce\_ranges and collect\_bytes [\#4787](https://github.com/apache/arrow-rs/pull/4787) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sumerman](https://github.com/sumerman)) +- Update object\_store chrono deprecations [\#4786](https://github.com/apache/arrow-rs/pull/4786) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make coalesce\_ranges and collect\_bytes available for crate users [\#4784](https://github.com/apache/arrow-rs/pull/4784) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sumerman](https://github.com/sumerman)) +- Bump actions/checkout from 3 to 4 [\#4767](https://github.com/apache/arrow-rs/pull/4767) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Make ObjectStore::copy Atomic and Automatically Create Parent Directories \(\#4758\) \(\#4760\) [\#4759](https://github.com/apache/arrow-rs/pull/4759) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update nix requirement from 0.26.1 to 0.27.1 in /object\_store [\#4744](https://github.com/apache/arrow-rs/pull/4744) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) +- Add `with_proxy_ca_certificate` and `with_proxy_excludes` [\#4714](https://github.com/apache/arrow-rs/pull/4714) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([gordonwang0](https://github.com/gordonwang0)) +- Update object\_store Dependencies and Configure Dependabot [\#4700](https://github.com/apache/arrow-rs/pull/4700) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + + \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 72722df5483a..ff8047c60ca9 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.7.0" +version = "0.7.1" edition = "2021" license = "MIT/Apache-2.0" readme = 
"README.md" diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index 48835c715552..aeec3caf4f57 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.6.1" -FUTURE_RELEASE="object_store_0.7.0" +SINCE_TAG="object_store_0.7.0" +FUTURE_RELEASE="object_store_0.7.1" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 8a07be3a5111aaab8dda5288f5ebcd962f00ad66 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 27 Sep 2023 02:28:44 -0700 Subject: [PATCH 1237/1411] Make align_buffers as public API (#4863) --- arrow-data/src/data.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 2073b932c994..5f87dddd4217 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -705,7 +705,7 @@ impl ArrayData { /// /// This can be useful for when interacting with data sent over IPC or FFI, that may /// not meet the minimum alignment requirements - fn align_buffers(&mut self) { + pub fn align_buffers(&mut self) { let layout = layout(&self.data_type); for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) { if let BufferSpec::FixedWidth { alignment, .. } = spec { From b600e202f4bd8a9ec91b58db3686ec84213a01f0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 27 Sep 2023 10:29:39 +0100 Subject: [PATCH 1238/1411] Enable new integration tests (#4828) (#4862) * Enable new integration tests (#4828) * Enable ARROW_GO_INTEGRATION --- .github/workflows/integration.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 3ff6aedb0122..aaf39d22bbce 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -56,6 +56,7 @@ jobs: env: ARROW_USE_CCACHE: OFF ARROW_CPP_EXE_PATH: /build/cpp/debug + ARROW_GO_INTEGRATION: 1 BUILD_DOCS_CPP: OFF # These are necessary because the github runner overrides $HOME # https://github.com/actions/runner/issues/863 @@ -106,6 +107,8 @@ jobs: run: | conda run --no-capture-output archery integration \ --run-flight \ + --run-c-data \ + --run-ipc \ --with-cpp=1 \ --with-csharp=1 \ --with-java=1 \ From 786760a8f26cb3a6ebe0f552dee7d337036913c2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 27 Sep 2023 17:52:25 +0100 Subject: [PATCH 1239/1411] Flush in multiple_append test (#4868) (#4869) --- object_store/src/local.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 20eb3c63ccbd..c625c59fbc27 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -1545,11 +1545,13 @@ mod not_wasm_tests { for d in &data { writer.write_all(d).await.unwrap(); } + writer.flush().await.unwrap(); let mut writer = integration.append(&location).await.unwrap(); for d in &data { writer.write_all(d).await.unwrap(); } + writer.flush().await.unwrap(); let read_data = integration .get(&location) From 62bb64cf9f034a75e4485719653253077eb8efa6 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 27 Sep 2023 12:17:30 -0700 Subject: [PATCH 1240/1411] Check precision overflow for casting floating to decimal (#4866) * Check precision overflow for casting floating to decimal * For review --- arrow-cast/src/cast.rs 
| 122 ++++++++++++++++++++++++++++++++++------- 1 file changed, 102 insertions(+), 20 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index e7727565c981..54c500f1ac41 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -364,21 +364,32 @@ where if cast_options.safe { array - .unary_opt::<_, Decimal128Type>(|v| (mul * v.as_()).round().to_i128()) + .unary_opt::<_, Decimal128Type>(|v| { + (mul * v.as_()).round().to_i128().filter(|v| { + Decimal128Type::validate_decimal_precision(*v, precision).is_ok() + }) + }) .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } else { array .try_unary::<_, Decimal128Type, _>(|v| { - (mul * v.as_()).round().to_i128().ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {}({}, {}). Overflowing on {:?}", - Decimal128Type::PREFIX, - precision, - scale, - v - )) - }) + (mul * v.as_()) + .round() + .to_i128() + .ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {}({}, {}). Overflowing on {:?}", + Decimal128Type::PREFIX, + precision, + scale, + v + )) + }) + .and_then(|v| { + Decimal128Type::validate_decimal_precision(v, precision) + .map(|_| v) + }) })? .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) @@ -398,21 +409,30 @@ where if cast_options.safe { array - .unary_opt::<_, Decimal256Type>(|v| i256::from_f64((v.as_() * mul).round())) + .unary_opt::<_, Decimal256Type>(|v| { + i256::from_f64((v.as_() * mul).round()).filter(|v| { + Decimal256Type::validate_decimal_precision(*v, precision).is_ok() + }) + }) .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } else { array .try_unary::<_, Decimal256Type, _>(|v| { - i256::from_f64((v.as_() * mul).round()).ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {}({}, {}). Overflowing on {:?}", - Decimal256Type::PREFIX, - precision, - scale, - v - )) - }) + i256::from_f64((v.as_() * mul).round()) + .ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {}({}, {}). Overflowing on {:?}", + Decimal256Type::PREFIX, + precision, + scale, + v + )) + }) + .and_then(|v| { + Decimal256Type::validate_decimal_precision(v, precision) + .map(|_| v) + }) })? .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) @@ -7748,6 +7768,68 @@ mod tests { assert!(casted_array.is_err()); } + #[test] + fn test_cast_floating_point_to_decimal128_precision_overflow() { + let array = Float64Array::from(vec![1.1]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal128(2, 2), + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, + ); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + + let casted_array = cast_with_options( + &array, + &DataType::Decimal128(2, 2), + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); + let err = casted_array.unwrap_err().to_string(); + let expected_error = "Invalid argument error: 110 is too large to store in a Decimal128 of precision 2. 
Max is 99"; + assert!( + err.contains(expected_error), + "did not find expected error '{expected_error}' in actual error '{err}'" + ); + } + + #[test] + fn test_cast_floating_point_to_decimal256_precision_overflow() { + let array = Float64Array::from(vec![1.1]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal256(2, 2), + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, + ); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + + let casted_array = cast_with_options( + &array, + &DataType::Decimal256(2, 2), + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); + let err = casted_array.unwrap_err().to_string(); + let expected_error = "Invalid argument error: 110 is too large to store in a Decimal256 of precision 2. Max is 99"; + assert!( + err.contains(expected_error), + "did not find expected error '{expected_error}' in actual error '{err}'" + ); + } + #[test] fn test_cast_floating_point_to_decimal128_overflow() { let array = Float64Array::from(vec![f64::MAX]); From 284556e55ae073b88cd24cbc0749e19f394d21fd Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Thu, 28 Sep 2023 13:51:15 +0200 Subject: [PATCH 1241/1411] feat: improve flight CLI error handling (#4873) **Before:** ```text thread 'main' panicked at 'collect data stream: Status { code: Internal, message: "h2 protocol error: error reading a body from connection: stream error received: unexpected internal error encountered", source: Some(hyper::Error(Body, Error { kind: Reset(S treamId(3), INTERNAL_ERROR, Remote) })) }', arrow-flight/src/bin/flight_sql_client.rs:130:14 note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace ``` **After:** ```text Error: read flight data Caused by: 0: collect data stream 1: status: Internal, message: "h2 protocol error: error reading a body from connection: stream error received: unexpected internal error encountered", details: [], metadata: MetadataMap { headers: {} } 2: error reading a body from connection: stream error received: unexpected internal error encountered 3: stream error received: unexpected internal error encountered ``` --- arrow-flight/Cargo.toml | 3 +- arrow-flight/src/bin/flight_sql_client.rs | 67 ++++++++++++----------- 2 files changed, 36 insertions(+), 34 deletions(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 29a8109d8889..54c5cdf5e2c7 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -49,6 +49,7 @@ tokio = { version = "1.0", default-features = false, features = ["macros", "rt", tonic = { version = "0.10.0", default-features = false, features = ["transport", "codegen", "prost"] } # CLI-related dependencies +anyhow = { version = "1.0", optional = true } clap = { version = "4.1", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } tracing-log = { version = "0.1", optional = true } tracing-subscriber = { version = "0.3.1", default-features = false, features = ["ansi", "fmt"], optional = true } @@ -62,7 +63,7 @@ flight-sql-experimental = ["arrow-arith", "arrow-data", "arrow-ord", "arrow-row" tls = ["tonic/tls"] # Enable CLI tools -cli = ["arrow-cast/prettyprint", "clap", "tracing-log", "tracing-subscriber", "tonic/tls-webpki-roots"] +cli = ["anyhow", "arrow-cast/prettyprint", "clap", "tracing-log", "tracing-subscriber", "tonic/tls-webpki-roots"] [dev-dependencies] arrow-cast = { workspace = true, features 
= ["prettyprint"] } diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs index d7b02414c5cc..c6aaccf376eb 100644 --- a/arrow-flight/src/bin/flight_sql_client.rs +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -17,13 +17,14 @@ use std::{error::Error, sync::Arc, time::Duration}; +use anyhow::{Context, Result}; use arrow_array::{ArrayRef, Datum, RecordBatch, StringArray}; use arrow_cast::{cast_with_options, pretty::pretty_format_batches, CastOptions}; use arrow_flight::{ sql::client::FlightSqlServiceClient, utils::flight_data_to_batches, FlightData, FlightInfo, }; -use arrow_schema::{ArrowError, Schema}; +use arrow_schema::Schema; use clap::{Parser, Subcommand}; use futures::TryStreamExt; use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; @@ -116,21 +117,23 @@ enum Command { } #[tokio::main] -async fn main() { +async fn main() -> Result<()> { let args = Args::parse(); - setup_logging(); - let mut client = setup_client(args.client_args).await.expect("setup client"); + setup_logging()?; + let mut client = setup_client(args.client_args) + .await + .context("setup client")?; let flight_info = match args.cmd { Command::StatementQuery { query } => client .execute(query, None) .await - .expect("execute statement"), + .context("execute statement")?, Command::PreparedStatementQuery { query, params } => { let mut prepared_stmt = client .prepare(query, None) .await - .expect("prepare statement"); + .context("prepare statement")?; if !params.is_empty() { prepared_stmt @@ -139,33 +142,35 @@ async fn main() { ¶ms, prepared_stmt .parameter_schema() - .expect("get parameter schema"), + .context("get parameter schema")?, ) - .expect("construct parameters"), + .context("construct parameters")?, ) - .expect("bind parameters") + .context("bind parameters")?; } prepared_stmt .execute() .await - .expect("execute prepared statement") + .context("execute prepared statement")? 
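// `.context(...)` is provided by the `anyhow::Context` import added at the top of this
// file; it layers a step description over the underlying tonic `Status`, producing the
// error chain shown in the commit message instead of the previous `expect` panic.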
} }; let batches = execute_flight(&mut client, flight_info) .await - .expect("read flight data"); + .context("read flight data")?; - let res = pretty_format_batches(batches.as_slice()).expect("format results"); + let res = pretty_format_batches(batches.as_slice()).context("format results")?; println!("{res}"); + + Ok(()) } async fn execute_flight( client: &mut FlightSqlServiceClient, info: FlightInfo, -) -> Result, ArrowError> { - let schema = Arc::new(Schema::try_from(info.clone()).expect("valid schema")); +) -> Result> { + let schema = Arc::new(Schema::try_from(info.clone()).context("valid schema")?); let mut batches = Vec::with_capacity(info.endpoint.len() + 1); batches.push(RecordBatch::new_empty(schema)); info!("decoded schema"); @@ -174,13 +179,13 @@ async fn execute_flight( let Some(ticket) = &endpoint.ticket else { panic!("did not get ticket"); }; - let flight_data = client.do_get(ticket.clone()).await.expect("do get"); + let flight_data = client.do_get(ticket.clone()).await.context("do get")?; let flight_data: Vec = flight_data .try_collect() .await - .expect("collect data stream"); + .context("collect data stream")?; let mut endpoint_batches = flight_data_to_batches(&flight_data) - .expect("convert flight data to record batches"); + .context("convert flight data to record batches")?; batches.append(&mut endpoint_batches); } info!("received data"); @@ -191,7 +196,7 @@ async fn execute_flight( fn construct_record_batch_from_params( params: &[(String, String)], parameter_schema: &Schema, -) -> Result { +) -> Result { let mut items = Vec::<(&String, ArrayRef)>::new(); for (name, value) in params { @@ -205,23 +210,22 @@ fn construct_record_batch_from_params( items.push((name, casted)) } - RecordBatch::try_from_iter(items) + Ok(RecordBatch::try_from_iter(items)?) } -fn setup_logging() { - tracing_log::LogTracer::init().expect("tracing log init"); +fn setup_logging() -> Result<()> { + tracing_log::LogTracer::init().context("tracing log init")?; tracing_subscriber::fmt::init(); + Ok(()) } -async fn setup_client( - args: ClientArgs, -) -> Result, ArrowError> { +async fn setup_client(args: ClientArgs) -> Result> { let port = args.port.unwrap_or(if args.tls { 443 } else { 80 }); let protocol = if args.tls { "https" } else { "http" }; let mut endpoint = Endpoint::new(format!("{}://{}:{}", protocol, args.host, port)) - .map_err(|_| ArrowError::IpcError("Cannot create endpoint".to_string()))? + .context("create endpoint")? 
.connect_timeout(Duration::from_secs(20)) .timeout(Duration::from_secs(20)) .tcp_nodelay(true) // Disable Nagle's Algorithm since we don't want packets to wait @@ -232,15 +236,12 @@ async fn setup_client( if args.tls { let tls_config = ClientTlsConfig::new(); - endpoint = endpoint.tls_config(tls_config).map_err(|_| { - ArrowError::IpcError("Cannot create TLS endpoint".to_string()) - })?; + endpoint = endpoint + .tls_config(tls_config) + .context("create TLS endpoint")?; } - let channel = endpoint - .connect() - .await - .map_err(|e| ArrowError::IpcError(format!("Cannot connect to endpoint: {e}")))?; + let channel = endpoint.connect().await.context("connect to endpoint")?; let mut client = FlightSqlServiceClient::new(channel); info!("connected"); @@ -260,7 +261,7 @@ async fn setup_client( client .handshake(&username, &password) .await - .expect("handshake"); + .context("handshake")?; info!("performed handshake"); } (Some(_), None) => { From 471f6dd2911d8328ca56efe2f685e08c0a3fb8c8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 28 Sep 2023 17:46:49 +0100 Subject: [PATCH 1242/1411] Flush in creates_dir_if_not_present_append (#4872) (#4874) --- object_store/src/local.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/object_store/src/local.rs b/object_store/src/local.rs index c625c59fbc27..69da170b0872 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -1494,6 +1494,8 @@ mod not_wasm_tests { writer.write_all(data.as_ref()).await.unwrap(); + writer.flush().await.unwrap(); + let read_data = integration .get(&location) .await From 0e04757d90c22186983af12f01385f00f4e85f1e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 29 Sep 2023 07:58:11 -0700 Subject: [PATCH 1243/1411] Update tonic-build requirement from =0.10.1 to =0.10.2 (#4881) Updates the requirements on [tonic-build](https://github.com/hyperium/tonic) to permit the latest version. - [Changelog](https://github.com/hyperium/tonic/blob/master/CHANGELOG.md) - [Commits](https://github.com/hyperium/tonic/compare/v0.10.1...v0.10.2) --- updated-dependencies: - dependency-name: tonic-build dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 50305579d833..5be24e780ae9 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -34,4 +34,4 @@ publish = false # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.67", default-features = false } prost-build = { version = "=0.12.1", default-features = false } -tonic-build = { version = "=0.10.1", default-features = false, features = ["transport", "prost"] } +tonic-build = { version = "=0.10.2", default-features = false, features = ["transport", "prost"] } From 3ac0053772660f09483d14649996f73be6d45269 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 1 Oct 2023 10:53:56 +0100 Subject: [PATCH 1244/1411] Support Encoding Parquet Columns in Parallel (#4871) * Facilitate parallel parquet writing * Revert OnCloseRowGroup Send * Add example * Review feedback * Fix doc * Further review feedback * More docs --- parquet/src/arrow/arrow_writer/levels.rs | 434 +++++++++++++---------- parquet/src/arrow/arrow_writer/mod.rs | 306 +++++++++++----- parquet/src/file/writer.rs | 3 +- 3 files changed, 461 insertions(+), 282 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 48615dc3d599..4a0bd551e1f9 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -42,19 +42,20 @@ use crate::errors::{ParquetError, Result}; use arrow_array::cast::AsArray; -use arrow_array::{Array, ArrayRef, FixedSizeListArray, OffsetSizeTrait, StructArray}; -use arrow_buffer::NullBuffer; +use arrow_array::{Array, ArrayRef, OffsetSizeTrait}; +use arrow_buffer::{NullBuffer, OffsetBuffer}; use arrow_schema::{DataType, Field}; use std::ops::Range; +use std::sync::Arc; -/// Performs a depth-first scan of the children of `array`, constructing [`LevelInfo`] +/// Performs a depth-first scan of the children of `array`, constructing [`ArrayLevels`] /// for each leaf column encountered pub(crate) fn calculate_array_levels( array: &ArrayRef, field: &Field, -) -> Result> { - let mut builder = LevelInfoBuilder::try_new(field, Default::default())?; - builder.write(array, 0..array.len()); +) -> Result> { + let mut builder = LevelInfoBuilder::try_new(field, Default::default(), array)?; + builder.write(0..array.len()); Ok(builder.finish()) } @@ -102,31 +103,57 @@ struct LevelContext { def_level: i16, } -/// A helper to construct [`LevelInfo`] from a potentially nested [`Field`] +/// A helper to construct [`ArrayLevels`] from a potentially nested [`Field`] enum LevelInfoBuilder { /// A primitive, leaf array - Primitive(LevelInfo), - /// A list array, contains the [`LevelInfoBuilder`] of the child and - /// the [`LevelContext`] of this list - List(Box, LevelContext), - /// A list array, contains the [`LevelInfoBuilder`] of its children and - /// the [`LevelContext`] of this struct array - Struct(Vec, LevelContext), + Primitive(ArrayLevels), + /// A list array + List( + Box, // Child Values + LevelContext, // Context + OffsetBuffer, // Offsets + Option, // Nulls + ), + /// A large list array + LargeList( + Box, // Child Values + LevelContext, // Context + OffsetBuffer, // Offsets + Option, // Nulls + ), + /// A fixed size list array + FixedSizeList( + Box, 
// Values + LevelContext, // Context + usize, // List Size + Option, // Nulls + ), + /// A struct array + Struct(Vec, LevelContext, Option), } impl LevelInfoBuilder { /// Create a new [`LevelInfoBuilder`] for the given [`Field`] and parent [`LevelContext`] - fn try_new(field: &Field, parent_ctx: LevelContext) -> Result { - match field.data_type() { - d if is_leaf(d) => Ok(Self::Primitive(LevelInfo::new( - parent_ctx, - field.is_nullable(), - ))), - DataType::Dictionary(_, v) if is_leaf(v.as_ref()) => Ok(Self::Primitive( - LevelInfo::new(parent_ctx, field.is_nullable()), - )), + fn try_new( + field: &Field, + parent_ctx: LevelContext, + array: &ArrayRef, + ) -> Result { + assert_eq!(field.data_type(), array.data_type()); + let is_nullable = field.is_nullable(); + + match array.data_type() { + d if is_leaf(d) => { + let levels = ArrayLevels::new(parent_ctx, is_nullable, array.clone()); + Ok(Self::Primitive(levels)) + } + DataType::Dictionary(_, v) if is_leaf(v.as_ref()) => { + let levels = ArrayLevels::new(parent_ctx, is_nullable, array.clone()); + Ok(Self::Primitive(levels)) + } DataType::Struct(children) => { - let def_level = match field.is_nullable() { + let array = array.as_struct(); + let def_level = match is_nullable { true => parent_ctx.def_level + 1, false => parent_ctx.def_level, }; @@ -138,16 +165,17 @@ impl LevelInfoBuilder { let children = children .iter() - .map(|f| Self::try_new(f, ctx)) + .zip(array.columns()) + .map(|(f, a)| Self::try_new(f, ctx, a)) .collect::>()?; - Ok(Self::Struct(children, ctx)) + Ok(Self::Struct(children, ctx, array.nulls().cloned())) } DataType::List(child) | DataType::LargeList(child) | DataType::Map(child, _) | DataType::FixedSizeList(child, _) => { - let def_level = match field.is_nullable() { + let def_level = match is_nullable { true => parent_ctx.def_level + 2, false => parent_ctx.def_level + 1, }; @@ -157,79 +185,70 @@ impl LevelInfoBuilder { def_level, }; - let child = Self::try_new(child.as_ref(), ctx)?; - Ok(Self::List(Box::new(child), ctx)) + Ok(match field.data_type() { + DataType::List(_) => { + let list = array.as_list(); + let child = Self::try_new(child.as_ref(), ctx, list.values())?; + let offsets = list.offsets().clone(); + Self::List(Box::new(child), ctx, offsets, list.nulls().cloned()) + } + DataType::LargeList(_) => { + let list = array.as_list(); + let child = Self::try_new(child.as_ref(), ctx, list.values())?; + let offsets = list.offsets().clone(); + let nulls = list.nulls().cloned(); + Self::LargeList(Box::new(child), ctx, offsets, nulls) + } + DataType::Map(_, _) => { + let map = array.as_map(); + let entries = Arc::new(map.entries().clone()) as ArrayRef; + let child = Self::try_new(child.as_ref(), ctx, &entries)?; + let offsets = map.offsets().clone(); + Self::List(Box::new(child), ctx, offsets, map.nulls().cloned()) + } + DataType::FixedSizeList(_, size) => { + let list = array.as_fixed_size_list(); + let child = Self::try_new(child.as_ref(), ctx, list.values())?; + let nulls = list.nulls().cloned(); + Self::FixedSizeList(Box::new(child), ctx, *size as _, nulls) + } + _ => unreachable!(), + }) } d => Err(nyi_err!("Datatype {} is not yet supported", d)), } } - /// Finish this [`LevelInfoBuilder`] returning the [`LevelInfo`] for the leaf columns + /// Finish this [`LevelInfoBuilder`] returning the [`ArrayLevels`] for the leaf columns /// as enumerated by a depth-first search - fn finish(self) -> Vec { + fn finish(self) -> Vec { match self { LevelInfoBuilder::Primitive(v) => vec![v], - LevelInfoBuilder::List(v, _) => 
v.finish(), - LevelInfoBuilder::Struct(v, _) => { + LevelInfoBuilder::List(v, _, _, _) + | LevelInfoBuilder::LargeList(v, _, _, _) + | LevelInfoBuilder::FixedSizeList(v, _, _, _) => v.finish(), + LevelInfoBuilder::Struct(v, _, _) => { v.into_iter().flat_map(|l| l.finish()).collect() } } } /// Given an `array`, write the level data for the elements in `range` - fn write(&mut self, array: &dyn Array, range: Range) { - match array.data_type() { - d if is_leaf(d) => self.write_leaf(array, range), - DataType::Dictionary(_, v) if is_leaf(v.as_ref()) => { - self.write_leaf(array, range) - } - DataType::Struct(_) => { - let array = array.as_struct(); - self.write_struct(array, range) - } - DataType::List(_) => { - let array = array.as_list::(); - self.write_list( - array.value_offsets(), - array.nulls(), - array.values(), - range, - ) + fn write(&mut self, range: Range) { + match self { + LevelInfoBuilder::Primitive(info) => Self::write_leaf(info, range), + LevelInfoBuilder::List(child, ctx, offsets, nulls) => { + Self::write_list(child, ctx, offsets, nulls.as_ref(), range) } - DataType::LargeList(_) => { - let array = array.as_list::(); - self.write_list( - array.value_offsets(), - array.nulls(), - array.values(), - range, - ) + LevelInfoBuilder::LargeList(child, ctx, offsets, nulls) => { + Self::write_list(child, ctx, offsets, nulls.as_ref(), range) } - DataType::Map(_, _) => { - let array = array.as_map(); - // A Map is just as ListArray with a StructArray child, we therefore - // treat it as such to avoid code duplication - self.write_list( - array.value_offsets(), - array.nulls(), - array.entries(), - range, - ) + LevelInfoBuilder::FixedSizeList(child, ctx, size, nulls) => { + Self::write_fixed_size_list(child, ctx, *size, nulls.as_ref(), range) } - &DataType::FixedSizeList(_, size) => { - let array = array - .as_any() - .downcast_ref::() - .expect("unable to get fixed-size list array"); - - self.write_fixed_size_list( - size as usize, - array.nulls(), - array.values(), - range, - ) + LevelInfoBuilder::Struct(children, ctx, nulls) => { + Self::write_struct(children, ctx, nulls.as_ref(), range) } - _ => unreachable!(), } } @@ -237,22 +256,17 @@ impl LevelInfoBuilder { /// /// Note: MapArrays are `ListArray` under the hood and so are dispatched to this method fn write_list( - &mut self, + child: &mut LevelInfoBuilder, + ctx: &LevelContext, offsets: &[O], nulls: Option<&NullBuffer>, - values: &dyn Array, range: Range, ) { - let (child, ctx) = match self { - Self::List(child, ctx) => (child, ctx), - _ => unreachable!(), - }; - let offsets = &offsets[range.start..range.end + 1]; let write_non_null_slice = |child: &mut LevelInfoBuilder, start_idx: usize, end_idx: usize| { - child.write(values, start_idx..end_idx); + child.write(start_idx..end_idx); child.visit_leaves(|leaf| { let rep_levels = leaf.rep_levels.as_mut().unwrap(); let mut rev = rep_levels.iter_mut().rev(); @@ -324,12 +338,12 @@ impl LevelInfoBuilder { } /// Write `range` elements from StructArray `array` - fn write_struct(&mut self, array: &StructArray, range: Range) { - let (children, ctx) = match self { - Self::Struct(children, ctx) => (children, ctx), - _ => unreachable!(), - }; - + fn write_struct( + children: &mut [LevelInfoBuilder], + ctx: &LevelContext, + nulls: Option<&NullBuffer>, + range: Range, + ) { let write_null = |children: &mut [LevelInfoBuilder], range: Range| { for child in children { child.visit_leaves(|info| { @@ -346,12 +360,12 @@ impl LevelInfoBuilder { }; let write_non_null = |children: &mut [LevelInfoBuilder], 
range: Range| { - for (child_array, child) in array.columns().iter().zip(children) { - child.write(child_array, range.clone()) + for child in children { + child.write(range.clone()) } }; - match array.nulls() { + match nulls { Some(validity) => { let mut last_non_null_idx = None; let mut last_null_idx = None; @@ -388,22 +402,17 @@ impl LevelInfoBuilder { /// Write `range` elements from FixedSizeListArray with child data `values` and null bitmap `nulls`. fn write_fixed_size_list( - &mut self, + child: &mut LevelInfoBuilder, + ctx: &LevelContext, fixed_size: usize, nulls: Option<&NullBuffer>, - values: &dyn Array, range: Range, ) { - let (child, ctx) = match self { - Self::List(child, ctx) => (child, ctx), - _ => unreachable!(), - }; - let write_non_null = |child: &mut LevelInfoBuilder, start_idx: usize, end_idx: usize| { let values_start = start_idx * fixed_size; let values_end = end_idx * fixed_size; - child.write(values, values_start..values_end); + child.write(values_start..values_end); child.visit_leaves(|leaf| { let rep_levels = leaf.rep_levels.as_mut().unwrap(); @@ -481,12 +490,7 @@ impl LevelInfoBuilder { } /// Write a primitive array, as defined by [`is_leaf`] - fn write_leaf(&mut self, array: &dyn Array, range: Range) { - let info = match self { - Self::Primitive(info) => info, - _ => unreachable!(), - }; - + fn write_leaf(info: &mut ArrayLevels, range: Range) { let len = range.end - range.start; match &mut info.def_levels { @@ -494,7 +498,7 @@ impl LevelInfoBuilder { def_levels.reserve(len); info.non_null_indices.reserve(len); - match array.logical_nulls() { + match info.array.logical_nulls() { Some(nulls) => { // TODO: Faster bitmask iteration (#1757) for i in range { @@ -523,11 +527,13 @@ impl LevelInfoBuilder { } /// Visits all children of this node in depth first order - fn visit_leaves(&mut self, visit: impl Fn(&mut LevelInfo) + Copy) { + fn visit_leaves(&mut self, visit: impl Fn(&mut ArrayLevels) + Copy) { match self { LevelInfoBuilder::Primitive(info) => visit(info), - LevelInfoBuilder::List(c, _) => c.visit_leaves(visit), - LevelInfoBuilder::Struct(children, _) => { + LevelInfoBuilder::List(c, _, _, _) + | LevelInfoBuilder::LargeList(c, _, _, _) + | LevelInfoBuilder::FixedSizeList(c, _, _, _) => c.visit_leaves(visit), + LevelInfoBuilder::Struct(children, _, _) => { for c in children { c.visit_leaves(visit) } @@ -537,8 +543,8 @@ impl LevelInfoBuilder { } /// The data necessary to write a primitive Arrow array to parquet, taking into account /// any non-primitive parents it may have in the arrow representation -#[derive(Debug, Eq, PartialEq, Clone)] -pub(crate) struct LevelInfo { +#[derive(Debug, Clone)] +pub(crate) struct ArrayLevels { /// Array's definition levels /// /// Present if `max_def_level != 0` @@ -558,10 +564,25 @@ pub(crate) struct LevelInfo { /// The maximum repetition for this leaf column max_rep_level: i16, + + /// The arrow array + array: ArrayRef, } -impl LevelInfo { - fn new(ctx: LevelContext, is_nullable: bool) -> Self { +impl PartialEq for ArrayLevels { + fn eq(&self, other: &Self) -> bool { + self.def_levels == other.def_levels + && self.rep_levels == other.rep_levels + && self.non_null_indices == other.non_null_indices + && self.max_def_level == other.max_def_level + && self.max_rep_level == other.max_rep_level + && self.array.as_ref() == other.array.as_ref() + } +} +impl Eq for ArrayLevels {} + +impl ArrayLevels { + fn new(ctx: LevelContext, is_nullable: bool, array: ArrayRef) -> Self { let max_rep_level = ctx.rep_level; let max_def_level = match 
is_nullable { true => ctx.def_level + 1, @@ -574,9 +595,14 @@ impl LevelInfo { non_null_indices: vec![], max_def_level, max_rep_level, + array, } } + pub fn array(&self) -> &ArrayRef { + &self.array + } + pub fn def_levels(&self) -> Option<&[i16]> { self.def_levels.as_deref() } @@ -597,6 +623,7 @@ mod tests { use std::sync::Arc; use arrow_array::builder::*; + use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; use arrow_array::*; use arrow_buffer::{Buffer, ToByteSlice}; @@ -622,7 +649,7 @@ mod tests { let inner_list = ArrayDataBuilder::new(inner_type) .len(4) .add_buffer(offsets) - .add_child_data(primitives.into_data()) + .add_child_data(primitives.to_data()) .build() .unwrap(); @@ -638,12 +665,13 @@ mod tests { let levels = calculate_array_levels(&outer_list, &outer_field).unwrap(); assert_eq!(levels.len(), 1); - let expected = LevelInfo { + let expected = ArrayLevels { def_levels: Some(vec![2; 10]), rep_levels: Some(vec![0, 2, 2, 1, 2, 2, 2, 0, 1, 2]), non_null_indices: vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9], max_def_level: 2, max_rep_level: 2, + array: Arc::new(primitives), }; assert_eq!(&levels[0], &expected); } @@ -657,12 +685,13 @@ mod tests { let levels = calculate_array_levels(&array, &field).unwrap(); assert_eq!(levels.len(), 1); - let expected_levels = LevelInfo { + let expected_levels = ArrayLevels { def_levels: None, rep_levels: None, non_null_indices: (0..10).collect(), max_def_level: 0, max_rep_level: 0, + array, }; assert_eq!(&levels[0], &expected_levels); } @@ -682,12 +711,13 @@ mod tests { let levels = calculate_array_levels(&array, &field).unwrap(); assert_eq!(levels.len(), 1); - let expected_levels = LevelInfo { + let expected_levels = ArrayLevels { def_levels: Some(vec![1, 0, 1, 1, 0]), rep_levels: None, non_null_indices: vec![0, 2, 3], max_def_level: 1, max_rep_level: 0, + array, }; assert_eq!(&levels[0], &expected_levels); } @@ -706,7 +736,7 @@ mod tests { let list = ArrayDataBuilder::new(list_type.clone()) .len(5) .add_buffer(offsets) - .add_child_data(leaf_array.into_data()) + .add_child_data(leaf_array.to_data()) .build() .unwrap(); let list = make_array(list); @@ -715,12 +745,13 @@ mod tests { let levels = calculate_array_levels(&list, &list_field).unwrap(); assert_eq!(levels.len(), 1); - let expected_levels = LevelInfo { + let expected_levels = ArrayLevels { def_levels: Some(vec![1; 5]), rep_levels: Some(vec![0; 5]), non_null_indices: (0..5).collect(), max_def_level: 1, max_rep_level: 1, + array: Arc::new(leaf_array), }; assert_eq!(&levels[0], &expected_levels); @@ -737,7 +768,7 @@ mod tests { let list = ArrayDataBuilder::new(list_type.clone()) .len(5) .add_buffer(offsets) - .add_child_data(leaf_array.into_data()) + .add_child_data(leaf_array.to_data()) .null_bit_buffer(Some(Buffer::from([0b00011101]))) .build() .unwrap(); @@ -747,12 +778,13 @@ mod tests { let levels = calculate_array_levels(&list, &list_field).unwrap(); assert_eq!(levels.len(), 1); - let expected_levels = LevelInfo { + let expected_levels = ArrayLevels { def_levels: Some(vec![2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2]), rep_levels: Some(vec![0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1]), non_null_indices: (0..11).collect(), max_def_level: 2, max_rep_level: 1, + array: Arc::new(leaf_array), }; assert_eq!(&levels[0], &expected_levels); } @@ -778,7 +810,7 @@ mod tests { let list_type = DataType::List(Arc::new(leaf_field)); let list = ArrayData::builder(list_type.clone()) .len(5) - .add_child_data(leaf.into_data()) + .add_child_data(leaf.to_data()) .add_buffer(Buffer::from_iter([0_i32, 2, 2, 4, 8, 11])) 
.build() .unwrap(); @@ -795,12 +827,13 @@ mod tests { let levels = calculate_array_levels(&array, &struct_field).unwrap(); assert_eq!(levels.len(), 1); - let expected_levels = LevelInfo { + let expected_levels = ArrayLevels { def_levels: Some(vec![0, 2, 0, 3, 3, 3, 3, 3, 3, 3]), rep_levels: Some(vec![0, 0, 0, 0, 1, 1, 1, 0, 1, 1]), non_null_indices: (4..11).collect(), max_def_level: 3, max_rep_level: 1, + array: Arc::new(leaf), }; assert_eq!(&levels[0], &expected_levels); @@ -820,7 +853,7 @@ mod tests { let offsets = Buffer::from_iter([0_i32, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22]); let l1 = ArrayData::builder(l1_type.clone()) .len(11) - .add_child_data(leaf.into_data()) + .add_child_data(leaf.to_data()) .add_buffer(offsets) .build() .unwrap(); @@ -840,7 +873,7 @@ mod tests { let levels = calculate_array_levels(&l2, &l2_field).unwrap(); assert_eq!(levels.len(), 1); - let expected_levels = LevelInfo { + let expected_levels = ArrayLevels { def_levels: Some(vec![ 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ]), @@ -850,6 +883,7 @@ mod tests { non_null_indices: (0..22).collect(), max_def_level: 5, max_rep_level: 2, + array: Arc::new(leaf), }; assert_eq!(&levels[0], &expected_levels); @@ -871,7 +905,7 @@ mod tests { let list = ArrayData::builder(list_type.clone()) .len(4) .add_buffer(Buffer::from_iter(0_i32..5)) - .add_child_data(leaf.into_data()) + .add_child_data(leaf.to_data()) .build() .unwrap(); let list = make_array(list); @@ -880,12 +914,13 @@ mod tests { let levels = calculate_array_levels(&list, &list_field).unwrap(); assert_eq!(levels.len(), 1); - let expected_levels = LevelInfo { + let expected_levels = ArrayLevels { def_levels: Some(vec![1; 4]), rep_levels: Some(vec![0; 4]), non_null_indices: (0..4).collect(), max_def_level: 1, max_rep_level: 1, + array: Arc::new(leaf), }; assert_eq!(&levels[0], &expected_levels); @@ -898,7 +933,7 @@ mod tests { .len(4) .add_buffer(Buffer::from_iter([0_i32, 0, 3, 5, 7])) .null_bit_buffer(Some(Buffer::from([0b00001110]))) - .add_child_data(leaf.into_data()) + .add_child_data(leaf.to_data()) .build() .unwrap(); let list = make_array(list); @@ -911,12 +946,13 @@ mod tests { let levels = calculate_array_levels(&array, &struct_field).unwrap(); assert_eq!(levels.len(), 1); - let expected_levels = LevelInfo { + let expected_levels = ArrayLevels { def_levels: Some(vec![1, 3, 3, 3, 3, 3, 3, 3]), rep_levels: Some(vec![0, 0, 1, 1, 0, 1, 0, 1]), non_null_indices: (0..7).collect(), max_def_level: 3, max_rep_level: 1, + array: Arc::new(leaf), }; assert_eq!(&levels[0], &expected_levels); @@ -933,7 +969,7 @@ mod tests { let list_1 = ArrayData::builder(list_1_type.clone()) .len(7) .add_buffer(Buffer::from_iter([0_i32, 1, 3, 3, 6, 10, 10, 15])) - .add_child_data(leaf.into_data()) + .add_child_data(leaf.to_data()) .build() .unwrap(); @@ -958,12 +994,13 @@ mod tests { let levels = calculate_array_levels(&array, &struct_field).unwrap(); assert_eq!(levels.len(), 1); - let expected_levels = LevelInfo { + let expected_levels = ArrayLevels { def_levels: Some(vec![1, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5]), rep_levels: Some(vec![0, 0, 1, 2, 1, 0, 2, 2, 1, 2, 2, 2, 0, 1, 2, 2, 2, 2]), non_null_indices: (0..15).collect(), max_def_level: 5, max_rep_level: 2, + array: Arc::new(leaf), }; assert_eq!(&levels[0], &expected_levels); } @@ -980,9 +1017,10 @@ mod tests { // - {a: {b: {c: 6}}} let c = Int32Array::from_iter([Some(1), None, Some(3), None, Some(5), Some(6)]); + let leaf = Arc::new(c) as ArrayRef; let c_field = Arc::new(Field::new("c", 
DataType::Int32, true)); let b = StructArray::from(( - (vec![(c_field, Arc::new(c) as ArrayRef)]), + (vec![(c_field, leaf.clone())]), Buffer::from([0b00110111]), )); @@ -998,12 +1036,13 @@ mod tests { let levels = calculate_array_levels(&a_array, &a_field).unwrap(); assert_eq!(levels.len(), 1); - let expected_levels = LevelInfo { + let expected_levels = ArrayLevels { def_levels: Some(vec![3, 2, 3, 1, 0, 3]), rep_levels: None, non_null_indices: vec![0, 2, 5], max_def_level: 3, max_rep_level: 0, + array: leaf, }; assert_eq!(&levels[0], &expected_levels); } @@ -1020,7 +1059,7 @@ mod tests { .len(5) .add_buffer(a_value_offsets) .null_bit_buffer(Some(Buffer::from(vec![0b00011011]))) - .add_child_data(a_values.into_data()) + .add_child_data(a_values.to_data()) .build() .unwrap(); @@ -1029,21 +1068,21 @@ mod tests { let a = ListArray::from(a_list_data); let item_field = Field::new("item", a_list_type, true); - let mut builder = - LevelInfoBuilder::try_new(&item_field, Default::default()).unwrap(); - builder.write(&a, 2..4); + let mut builder = levels(&item_field, a); + builder.write(2..4); let levels = builder.finish(); assert_eq!(levels.len(), 1); let list_level = levels.get(0).unwrap(); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![0, 3, 3, 3]), rep_levels: Some(vec![0, 0, 1, 1]), non_null_indices: vec![3, 4, 5], max_def_level: 3, max_rep_level: 1, + array: Arc::new(a_values), }; assert_eq!(list_level, &expected_level); } @@ -1100,19 +1139,19 @@ mod tests { let g = ListArray::from(g_list_data); let e = StructArray::from(vec![ - (struct_field_f, Arc::new(f) as ArrayRef), + (struct_field_f, Arc::new(f.clone()) as ArrayRef), (struct_field_g, Arc::new(g) as ArrayRef), ]); let c = StructArray::from(vec![ - (struct_field_d, Arc::new(d) as ArrayRef), + (struct_field_d, Arc::new(d.clone()) as ArrayRef), (struct_field_e, Arc::new(e) as ArrayRef), ]); // build a record batch let batch = RecordBatch::try_new( Arc::new(schema), - vec![Arc::new(a), Arc::new(b), Arc::new(c)], + vec![Arc::new(a.clone()), Arc::new(b.clone()), Arc::new(c)], ) .unwrap(); @@ -1132,48 +1171,52 @@ mod tests { // test "a" levels let list_level = levels.get(0).unwrap(); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: None, rep_levels: None, non_null_indices: vec![0, 1, 2, 3, 4], max_def_level: 0, max_rep_level: 0, + array: Arc::new(a), }; assert_eq!(list_level, &expected_level); // test "b" levels let list_level = levels.get(1).unwrap(); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![1, 0, 0, 1, 1]), rep_levels: None, non_null_indices: vec![0, 3, 4], max_def_level: 1, max_rep_level: 0, + array: Arc::new(b), }; assert_eq!(list_level, &expected_level); // test "d" levels let list_level = levels.get(2).unwrap(); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![1, 1, 1, 2, 1]), rep_levels: None, non_null_indices: vec![3], max_def_level: 2, max_rep_level: 0, + array: Arc::new(d), }; assert_eq!(list_level, &expected_level); // test "f" levels let list_level = levels.get(3).unwrap(); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![3, 2, 3, 2, 3]), rep_levels: None, non_null_indices: vec![0, 2, 4], max_def_level: 3, max_rep_level: 0, + array: Arc::new(f), }; assert_eq!(list_level, &expected_level); } @@ -1270,27 +1313,31 @@ mod tests { }); assert_eq!(levels.len(), 2); + let map = batch.column(0).as_map(); + // test key 
levels let list_level = levels.get(0).unwrap(); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![1; 7]), rep_levels: Some(vec![0, 1, 0, 1, 0, 1, 1]), non_null_indices: vec![0, 1, 2, 3, 4, 5, 6], max_def_level: 1, max_rep_level: 1, + array: map.keys().clone(), }; assert_eq!(list_level, &expected_level); // test values levels let list_level = levels.get(1).unwrap(); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![2, 2, 2, 1, 2, 1, 2]), rep_levels: Some(vec![0, 1, 0, 1, 0, 1, 1]), non_null_indices: vec![0, 1, 2, 4, 6], max_def_level: 2, max_rep_level: 1, + array: map.values().clone(), }; assert_eq!(list_level, &expected_level); } @@ -1358,7 +1405,8 @@ mod tests { let array = Arc::new(list_builder.finish()); - let values_len = array.values().len(); + let values = array.values().as_struct().column(0).clone(); + let values_len = values.len(); assert_eq!(values_len, 5); let schema = Arc::new(Schema::new(vec![list_field])); @@ -1368,12 +1416,13 @@ mod tests { let levels = calculate_array_levels(rb.column(0), rb.schema().field(0)).unwrap(); let list_level = &levels[0]; - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![4, 1, 0, 2, 2, 3, 4]), rep_levels: Some(vec![0, 0, 0, 0, 1, 0, 0]), non_null_indices: vec![0, 4], max_def_level: 4, max_rep_level: 1, + array: values, }; assert_eq!(list_level, &expected_level); @@ -1391,6 +1440,7 @@ mod tests { None, // Masked by struct array None, ]); + let values = inner.values().clone(); // This test assumes that nulls don't take up space assert_eq!(inner.values().len(), 7); @@ -1406,12 +1456,13 @@ mod tests { assert_eq!(levels.len(), 1); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![4, 4, 3, 2, 0, 4, 4, 0, 1]), rep_levels: Some(vec![0, 1, 0, 0, 0, 0, 1, 0, 0]), non_null_indices: vec![0, 1, 5, 6], max_def_level: 4, max_rep_level: 1, + array: values, }; assert_eq!(&levels[0], &expected_level); @@ -1422,14 +1473,16 @@ mod tests { // Test the null mask of a struct array and the null mask of a list array // masking out non-null elements of their children - let a1 = Arc::new(ListArray::from_iter_primitive::(vec![ + let a1 = ListArray::from_iter_primitive::(vec![ Some(vec![None]), // Masked by list array Some(vec![]), // Masked by list array Some(vec![Some(3), None]), Some(vec![Some(4), Some(5), None, Some(6)]), // Masked by struct array None, None, - ])) as ArrayRef; + ]); + let a1_values = a1.values().clone(); + let a1 = Arc::new(a1) as ArrayRef; let a2 = Arc::new(Int32Array::from_iter(vec![ Some(1), // Masked by list array @@ -1439,6 +1492,7 @@ mod tests { Some(5), None, ])) as ArrayRef; + let a2_values = a2.clone(); let field_a1 = Arc::new(Field::new("list", a1.data_type().clone(), true)); let field_a2 = Arc::new(Field::new("integers", a2.data_type().clone(), true)); @@ -1486,22 +1540,24 @@ mod tests { assert_eq!(levels.len(), 2); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![0, 0, 1, 6, 5, 2, 3, 1]), rep_levels: Some(vec![0, 0, 0, 0, 2, 0, 1, 0]), non_null_indices: vec![1], max_def_level: 6, max_rep_level: 2, + array: a1_values, }; assert_eq!(&levels[0], &expected_level); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![0, 0, 1, 3, 2, 4, 1]), rep_levels: Some(vec![0, 0, 0, 0, 0, 1, 0]), non_null_indices: vec![4], max_def_level: 4, max_rep_level: 1, + array: a2_values, }; assert_eq!(&levels[1], 
&expected_level); @@ -1522,23 +1578,24 @@ mod tests { builder.values().append_slice(&[9, 10]); builder.append(false); let a = builder.finish(); + let values = a.values().clone(); let item_field = Field::new("item", a.data_type().clone(), true); - let mut builder = - LevelInfoBuilder::try_new(&item_field, Default::default()).unwrap(); - builder.write(&a, 1..4); + let mut builder = levels(&item_field, a); + builder.write(1..4); let levels = builder.finish(); assert_eq!(levels.len(), 1); let list_level = levels.get(0).unwrap(); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![0, 0, 3, 3]), rep_levels: Some(vec![0, 0, 0, 1]), non_null_indices: vec![6, 7], max_def_level: 3, max_rep_level: 1, + array: values, }; assert_eq!(list_level, &expected_level); } @@ -1670,6 +1727,10 @@ mod tests { assert_eq!(array.values().len(), 8); assert_eq!(array.len(), 4); + let struct_values = array.values().as_struct(); + let values_a = struct_values.column(0).clone(); + let values_b = struct_values.column(1).clone(); + let schema = Arc::new(Schema::new(vec![list_field])); let rb = RecordBatch::try_new(schema, vec![array]).unwrap(); @@ -1678,20 +1739,22 @@ mod tests { let b_levels = &levels[1]; // [[{a: 1}, null], null, [null, null], [{a: null}, {a: 2}]] - let expected_a = LevelInfo { + let expected_a = ArrayLevels { def_levels: Some(vec![4, 2, 0, 2, 2, 3, 4]), rep_levels: Some(vec![0, 1, 0, 0, 1, 0, 1]), non_null_indices: vec![0, 7], max_def_level: 4, max_rep_level: 1, + array: values_a, }; // [[{b: 2}, null], null, [null, null], [{b: 3}, {b: 4}]] - let expected_b = LevelInfo { + let expected_b = ArrayLevels { def_levels: Some(vec![3, 2, 0, 2, 2, 3, 3]), rep_levels: Some(vec![0, 1, 0, 0, 1, 0, 1]), non_null_indices: vec![0, 6, 7], max_def_level: 3, max_rep_level: 1, + array: values_b, }; assert_eq!(a_levels, &expected_a); @@ -1704,24 +1767,25 @@ mod tests { builder.append(true); builder.append(false); builder.append(true); - let a = builder.finish(); + let array = builder.finish(); + let values = array.values().clone(); - let item_field = Field::new("item", a.data_type().clone(), true); - let mut builder = - LevelInfoBuilder::try_new(&item_field, Default::default()).unwrap(); - builder.write(&a, 0..3); + let item_field = Field::new("item", array.data_type().clone(), true); + let mut builder = levels(&item_field, array); + builder.write(0..3); let levels = builder.finish(); assert_eq!(levels.len(), 1); let list_level = levels.get(0).unwrap(); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![1, 0, 1]), rep_levels: Some(vec![0, 0, 0]), non_null_indices: vec![], max_def_level: 3, max_rep_level: 1, + array: values, }; assert_eq!(list_level, &expected_level); } @@ -1744,19 +1808,20 @@ mod tests { builder.values().append_null(); builder.append(false); let a = builder.finish(); + let values = a.values().as_list::().values().clone(); let item_field = Field::new("item", a.data_type().clone(), true); - let mut builder = - LevelInfoBuilder::try_new(&item_field, Default::default()).unwrap(); - builder.write(&a, 0..4); + let mut builder = levels(&item_field, a); + builder.write(0..4); let levels = builder.finish(); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![5, 4, 5, 2, 5, 3, 5, 5, 4, 4, 0]), rep_levels: Some(vec![0, 2, 2, 1, 0, 1, 0, 2, 1, 2, 0]), non_null_indices: vec![0, 2, 3, 4, 5], max_def_level: 5, max_rep_level: 2, + array: values, }; assert_eq!(levels[0], expected_level); @@ 
-1777,17 +1842,22 @@ mod tests { let item_field = Field::new("item", dict.data_type().clone(), true); - let mut builder = - LevelInfoBuilder::try_new(&item_field, Default::default()).unwrap(); - builder.write(&dict, 0..4); + let mut builder = levels(&item_field, dict.clone()); + builder.write(0..4); let levels = builder.finish(); - let expected_level = LevelInfo { + let expected_level = ArrayLevels { def_levels: Some(vec![0, 0, 1, 1]), rep_levels: None, non_null_indices: vec![2, 3], max_def_level: 1, max_rep_level: 0, + array: Arc::new(dict), }; assert_eq!(levels[0], expected_level); } + + fn levels(field: &Field, array: T) -> LevelInfoBuilder { + let v = Arc::new(array) as ArrayRef; + LevelInfoBuilder::try_new(field, Default::default(), &v).unwrap() + } } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 2e170738f1a8..5dae81d4711c 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -18,7 +18,6 @@ //! Contains writer which writes arrow data into parquet data. use bytes::Bytes; -use std::fmt::Debug; use std::io::{Read, Write}; use std::iter::Peekable; use std::slice::Iter; @@ -28,8 +27,10 @@ use thrift::protocol::{TCompactOutputProtocol, TSerializable}; use arrow_array::cast::AsArray; use arrow_array::types::*; -use arrow_array::{Array, FixedSizeListArray, RecordBatch, RecordBatchWriter}; -use arrow_schema::{ArrowError, DataType as ArrowDataType, IntervalUnit, SchemaRef}; +use arrow_array::{ArrayRef, RecordBatch, RecordBatchWriter}; +use arrow_schema::{ + ArrowError, DataType as ArrowDataType, Field, IntervalUnit, SchemaRef, +}; use super::schema::{ add_encoded_arrow_schema_to_metadata, arrow_to_parquet_schema, @@ -47,14 +48,14 @@ use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ColumnChunkMetaData, KeyValue, RowGroupMetaDataPtr}; use crate::file::properties::{WriterProperties, WriterPropertiesPtr}; use crate::file::reader::{ChunkReader, Length}; -use crate::file::writer::SerializedFileWriter; +use crate::file::writer::{SerializedFileWriter, SerializedRowGroupWriter}; use crate::schema::types::{ColumnDescPtr, SchemaDescriptor}; -use levels::{calculate_array_levels, LevelInfo}; +use levels::{calculate_array_levels, ArrayLevels}; mod byte_array; mod levels; -/// Arrow writer +/// Encodes [`RecordBatch`] to parquet /// /// Writes Arrow `RecordBatch`es to a Parquet writer. Multiple [`RecordBatch`] will be encoded /// to the same row group, up to `max_row_group_size` rows. Any remaining rows will be @@ -97,7 +98,7 @@ pub struct ArrowWriter { max_row_group_size: usize, } -impl Debug for ArrowWriter { +impl std::fmt::Debug for ArrowWriter { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let buffered_memory = self.in_progress_size(); f.debug_struct("ArrowWriter") @@ -150,7 +151,7 @@ impl ArrowWriter { Some(in_progress) => in_progress .writers .iter() - .map(|(_, x)| x.get_estimated_total_bytes() as usize) + .map(|x| x.get_estimated_total_bytes()) .sum(), None => 0, } @@ -208,8 +209,8 @@ impl ArrowWriter { }; let mut row_group_writer = self.writer.next_row_group()?; - for (chunk, close) in in_progress.close()? { - row_group_writer.append_column(&chunk, close)?; + for chunk in in_progress.close()? 
{ + chunk.append_to_row_group(&mut row_group_writer)?; } row_group_writer.close()?; Ok(()) @@ -246,20 +247,20 @@ impl RecordBatchWriter for ArrowWriter { } } -/// A list of [`Bytes`] comprising a single column chunk +/// A single column chunk produced by [`ArrowColumnWriter`] #[derive(Default)] -pub struct ArrowColumnChunk { +struct ArrowColumnChunkData { length: usize, data: Vec, } -impl Length for ArrowColumnChunk { +impl Length for ArrowColumnChunkData { fn len(&self) -> u64 { self.length as _ } } -impl ChunkReader for ArrowColumnChunk { +impl ChunkReader for ArrowColumnChunkData { type T = ArrowColumnChunkReader; fn get_read(&self, start: u64) -> Result { @@ -274,8 +275,8 @@ impl ChunkReader for ArrowColumnChunk { } } -/// A [`Read`] for an iterator of [`Bytes`] -pub struct ArrowColumnChunkReader(Peekable>); +/// A [`Read`] for [`ArrowColumnChunkData`] +struct ArrowColumnChunkReader(Peekable>); impl Read for ArrowColumnChunkReader { fn read(&mut self, out: &mut [u8]) -> std::io::Result { @@ -297,11 +298,11 @@ impl Read for ArrowColumnChunkReader { } } -/// A shared [`ArrowColumnChunk`] +/// A shared [`ArrowColumnChunkData`] /// /// This allows it to be owned by [`ArrowPageWriter`] whilst allowing access via /// [`ArrowRowGroupWriter`] on flush, without requiring self-referential borrows -type SharedColumnChunk = Arc>; +type SharedColumnChunk = Arc>; #[derive(Default)] struct ArrowPageWriter { @@ -347,40 +348,180 @@ impl PageWriter for ArrowPageWriter { } } -/// Encodes a leaf column to [`ArrowPageWriter`] -enum ArrowColumnWriter { +/// A leaf column that can be encoded by [`ArrowColumnWriter`] +#[derive(Debug)] +pub struct ArrowLeafColumn(ArrayLevels); + +/// Computes the [`ArrowLeafColumn`] for a potentially nested [`ArrayRef`] +pub fn compute_leaves(field: &Field, array: &ArrayRef) -> Result> { + let levels = calculate_array_levels(array, field)?; + Ok(levels.into_iter().map(ArrowLeafColumn).collect()) +} + +/// The data for a single column chunk, see [`ArrowColumnWriter`] +pub struct ArrowColumnChunk { + data: ArrowColumnChunkData, + close: ColumnCloseResult, +} + +impl std::fmt::Debug for ArrowColumnChunk { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ArrowColumnChunk") + .field("length", &self.data.length) + .finish_non_exhaustive() + } +} + +impl ArrowColumnChunk { + /// Calls [`SerializedRowGroupWriter::append_column`] with this column's data + pub fn append_to_row_group( + self, + writer: &mut SerializedRowGroupWriter<'_, W>, + ) -> Result<()> { + writer.append_column(&self.data, self.close) + } +} + +/// Encodes [`ArrowLeafColumn`] to [`ArrowColumnChunk`] +/// +/// Note: This is a low-level interface for applications that require fine-grained control +/// of encoding, see [`ArrowWriter`] for a higher-level interface +/// +/// ``` +/// // The arrow schema +/// # use std::sync::Arc; +/// # use arrow_array::*; +/// # use arrow_schema::*; +/// # use parquet::arrow::arrow_to_parquet_schema; +/// # use parquet::arrow::arrow_writer::{ArrowLeafColumn, compute_leaves, get_column_writers}; +/// # use parquet::file::properties::WriterProperties; +/// # use parquet::file::writer::SerializedFileWriter; +/// # +/// let schema = Arc::new(Schema::new(vec![ +/// Field::new("i32", DataType::Int32, false), +/// Field::new("f32", DataType::Float32, false), +/// ])); +/// +/// // Compute the parquet schema +/// let parquet_schema = arrow_to_parquet_schema(schema.as_ref()).unwrap(); +/// let props = Arc::new(WriterProperties::default()); +/// +/// // 
Create writers for each of the leaf columns +/// let col_writers = get_column_writers(&parquet_schema, &props, &schema).unwrap(); +/// +/// // Spawn a worker thread for each column +/// // This is for demonstration purposes, a thread-pool e.g. rayon or tokio, would be better +/// let mut workers: Vec<_> = col_writers +/// .into_iter() +/// .map(|mut col_writer| { +/// let (send, recv) = std::sync::mpsc::channel::(); +/// let handle = std::thread::spawn(move || { +/// for col in recv { +/// col_writer.write(&col)?; +/// } +/// col_writer.close() +/// }); +/// (handle, send) +/// }) +/// .collect(); +/// +/// // Create parquet writer +/// let root_schema = parquet_schema.root_schema_ptr(); +/// let mut out = Vec::with_capacity(1024); // This could be a File +/// let mut writer = SerializedFileWriter::new(&mut out, root_schema, props.clone()).unwrap(); +/// +/// // Start row group +/// let mut row_group = writer.next_row_group().unwrap(); +/// +/// // Columns to encode +/// let to_write = vec![ +/// Arc::new(Int32Array::from_iter_values([1, 2, 3])) as _, +/// Arc::new(Float32Array::from_iter_values([1., 45., -1.])) as _, +/// ]; +/// +/// // Spawn work to encode columns +/// let mut worker_iter = workers.iter_mut(); +/// for (arr, field) in to_write.iter().zip(&schema.fields) { +/// for leaves in compute_leaves(field, arr).unwrap() { +/// worker_iter.next().unwrap().1.send(leaves).unwrap(); +/// } +/// } +/// +/// // Finish up parallel column encoding +/// for (handle, send) in workers { +/// drop(send); // Drop send side to signal termination +/// let chunk = handle.join().unwrap().unwrap(); +/// chunk.append_to_row_group(&mut row_group).unwrap(); +/// } +/// row_group.close().unwrap(); +/// +/// let metadata = writer.close().unwrap(); +/// assert_eq!(metadata.num_rows, 3); +/// ``` +pub struct ArrowColumnWriter { + writer: ArrowColumnWriterImpl, + chunk: SharedColumnChunk, +} + +impl std::fmt::Debug for ArrowColumnWriter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ArrowColumnWriter").finish_non_exhaustive() + } +} + +enum ArrowColumnWriterImpl { ByteArray(GenericColumnWriter<'static, ByteArrayEncoder>), Column(ColumnWriter<'static>), } impl ArrowColumnWriter { + /// Write an [`ArrowLeafColumn`] + pub fn write(&mut self, col: &ArrowLeafColumn) -> Result<()> { + match &mut self.writer { + ArrowColumnWriterImpl::Column(c) => { + write_leaf(c, &col.0)?; + } + ArrowColumnWriterImpl::ByteArray(c) => { + write_primitive(c, col.0.array().as_ref(), &col.0)?; + } + } + Ok(()) + } + + /// Close this column returning the written [`ArrowColumnChunk`] + pub fn close(self) -> Result { + let close = match self.writer { + ArrowColumnWriterImpl::ByteArray(c) => c.close()?, + ArrowColumnWriterImpl::Column(c) => c.close()?, + }; + let chunk = Arc::try_unwrap(self.chunk).ok().unwrap(); + let data = chunk.into_inner().unwrap(); + Ok(ArrowColumnChunk { data, close }) + } + /// Returns the estimated total bytes for this column writer - fn get_estimated_total_bytes(&self) -> u64 { - match self { - ArrowColumnWriter::ByteArray(c) => c.get_estimated_total_bytes(), - ArrowColumnWriter::Column(c) => c.get_estimated_total_bytes(), + pub fn get_estimated_total_bytes(&self) -> usize { + match &self.writer { + ArrowColumnWriterImpl::ByteArray(c) => c.get_estimated_total_bytes() as _, + ArrowColumnWriterImpl::Column(c) => c.get_estimated_total_bytes() as _, } } } /// Encodes [`RecordBatch`] to a parquet row group -pub struct ArrowRowGroupWriter { - writers: 
Vec<(SharedColumnChunk, ArrowColumnWriter)>, +struct ArrowRowGroupWriter { + writers: Vec, schema: SchemaRef, buffered_rows: usize, } impl ArrowRowGroupWriter { - pub fn new( + fn new( parquet: &SchemaDescriptor, props: &WriterPropertiesPtr, arrow: &SchemaRef, ) -> Result { - let mut writers = Vec::with_capacity(arrow.fields.len()); - let mut leaves = parquet.columns().iter(); - for field in &arrow.fields { - get_arrow_column_writer(field.data_type(), props, &mut leaves, &mut writers)?; - } + let writers = get_column_writers(parquet, props, arrow)?; Ok(Self { writers, schema: arrow.clone(), @@ -388,51 +529,64 @@ impl ArrowRowGroupWriter { }) } - pub fn write(&mut self, batch: &RecordBatch) -> Result<()> { + fn write(&mut self, batch: &RecordBatch) -> Result<()> { self.buffered_rows += batch.num_rows(); - let mut writers = self.writers.iter_mut().map(|(_, x)| x); - for (array, field) in batch.columns().iter().zip(&self.schema.fields) { - let mut levels = calculate_array_levels(array, field)?.into_iter(); - write_leaves(&mut writers, &mut levels, array.as_ref())?; + let mut writers = self.writers.iter_mut(); + for (field, column) in self.schema.fields().iter().zip(batch.columns()) { + for leaf in compute_leaves(field.as_ref(), column)? { + writers.next().unwrap().write(&leaf)? + } } Ok(()) } - pub fn close(self) -> Result> { + fn close(self) -> Result> { self.writers .into_iter() - .map(|(chunk, writer)| { - let close_result = match writer { - ArrowColumnWriter::ByteArray(c) => c.close()?, - ArrowColumnWriter::Column(c) => c.close()?, - }; - - let chunk = Arc::try_unwrap(chunk).ok().unwrap().into_inner().unwrap(); - Ok((chunk, close_result)) - }) + .map(|writer| writer.close()) .collect() } } -/// Get an [`ArrowColumnWriter`] along with a reference to its [`SharedColumnChunk`] +/// Returns the [`ArrowColumnWriter`] for a given schema +pub fn get_column_writers( + parquet: &SchemaDescriptor, + props: &WriterPropertiesPtr, + arrow: &SchemaRef, +) -> Result> { + let mut writers = Vec::with_capacity(arrow.fields.len()); + let mut leaves = parquet.columns().iter(); + for field in &arrow.fields { + get_arrow_column_writer(field.data_type(), props, &mut leaves, &mut writers)?; + } + Ok(writers) +} + +/// Gets the [`ArrowColumnWriter`] for the given `data_type` fn get_arrow_column_writer( data_type: &ArrowDataType, props: &WriterPropertiesPtr, leaves: &mut Iter<'_, ColumnDescPtr>, - out: &mut Vec<(SharedColumnChunk, ArrowColumnWriter)>, + out: &mut Vec, ) -> Result<()> { let col = |desc: &ColumnDescPtr| { let page_writer = Box::::default(); let chunk = page_writer.buffer.clone(); let writer = get_column_writer(desc.clone(), props.clone(), page_writer); - (chunk, ArrowColumnWriter::Column(writer)) + ArrowColumnWriter { + chunk, + writer: ArrowColumnWriterImpl::Column(writer), + } }; let bytes = |desc: &ColumnDescPtr| { let page_writer = Box::::default(); let chunk = page_writer.buffer.clone(); let writer = GenericColumnWriter::new(desc.clone(), props.clone(), page_writer); - (chunk, ArrowColumnWriter::ByteArray(writer)) + ArrowColumnWriter { + chunk, + writer: ArrowColumnWriterImpl::ByteArray(writer), + } }; match data_type { @@ -478,52 +632,8 @@ fn get_arrow_column_writer( Ok(()) } -/// Write the leaves of `array` in depth-first order to `writers` with `levels` -fn write_leaves<'a, W>( - writers: &mut W, - levels: &mut IntoIter, - array: &(dyn Array + 'static), -) -> Result<()> -where - W: Iterator, -{ - match array.data_type() { - ArrowDataType::List(_) => { - write_leaves(writers, levels, 
array.as_list::().values().as_ref())? - } - ArrowDataType::LargeList(_) => { - write_leaves(writers, levels, array.as_list::().values().as_ref())? - } - ArrowDataType::FixedSizeList(_, _) => { - let array = array.as_any().downcast_ref::().unwrap(); - write_leaves(writers, levels, array.values().as_ref())? - } - ArrowDataType::Struct(_) => { - for column in array.as_struct().columns() { - write_leaves(writers, levels, column.as_ref())? - } - } - ArrowDataType::Map(_, _) => { - let map = array.as_map(); - write_leaves(writers, levels, map.keys().as_ref())?; - write_leaves(writers, levels, map.values().as_ref())? - } - _ => { - let levels = levels.next().unwrap(); - match writers.next().unwrap() { - ArrowColumnWriter::Column(c) => write_leaf(c, array, levels)?, - ArrowColumnWriter::ByteArray(c) => write_primitive(c, array, levels)?, - }; - } - } - Ok(()) -} - -fn write_leaf( - writer: &mut ColumnWriter<'_>, - column: &dyn Array, - levels: LevelInfo, -) -> Result { +fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result { + let column = levels.array().as_ref(); let indices = levels.non_null_indices(); match writer { ColumnWriter::Int32ColumnWriter(ref mut typed) => { @@ -678,7 +788,7 @@ fn write_leaf( fn write_primitive( writer: &mut GenericColumnWriter, values: &E::Values, - levels: LevelInfo, + levels: &ArrayLevels, ) -> Result { writer.write_batch_internal( values, diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 859a0aa1f902..cafb1761352d 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -115,8 +115,7 @@ pub type OnCloseRowGroup<'a> = Box< Vec>, Vec>, ) -> Result<()> - + 'a - + Send, + + 'a, >; // ---------------------------------------------------------------------- From 8c495b60021df1e32e1ff0616dec2979fd66b467 Mon Sep 17 00:00:00 2001 From: Samrose Date: Sun, 1 Oct 2023 03:09:18 -0700 Subject: [PATCH 1245/1411] parquet: Read field IDs from Parquet Schema (#4878) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, field ids are only read from the serialized arrow schema and not the actual parquet file. This PR adds reading the field ids from a Parquet file that doesnt contain the serialized arrow schema. 
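As a usage sketch (not part of this patch): the helper below is hypothetical, but it only relies on the public `parquet_to_arrow_schema` entry point and the `PARQUET_FIELD_ID_META_KEY` constant touched by this change, and shows how the ids surface as Arrow field metadata even when the file carries no serialized Arrow schema.

```
use std::collections::HashMap;
use std::fs::File;

use parquet::arrow::{parquet_to_arrow_schema, PARQUET_FIELD_ID_META_KEY};
use parquet::errors::Result;
use parquet::file::reader::{FileReader, SerializedFileReader};

/// Collect (column name, field id) pairs from a Parquet file
/// (hypothetical helper, shown only to illustrate the new behaviour)
fn field_ids(file: File) -> Result<HashMap<String, String>> {
    let reader = SerializedFileReader::new(file)?;
    let descr = reader.metadata().file_metadata().schema_descr_ptr();
    // Pass `None` for key-value metadata, so any ids must come from the
    // Parquet schema itself rather than an embedded Arrow schema
    let schema = parquet_to_arrow_schema(&descr, None)?;
    Ok(schema
        .fields()
        .iter()
        .filter_map(|f| {
            f.metadata()
                .get(PARQUET_FIELD_ID_META_KEY)
                .map(|id| (f.name().to_string(), id.to_string()))
        })
        .collect())
}
```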
Signed-off-by: 🐼 Samrose Ahmed 🐼 --- parquet/src/arrow/schema/complex.rs | 13 +++++- parquet/src/arrow/schema/mod.rs | 63 +++++++++++++++++++++++++++-- 2 files changed, 71 insertions(+), 5 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 0d19875d97de..9f85b2c284c6 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -19,7 +19,7 @@ use std::collections::HashMap; use std::sync::Arc; use crate::arrow::schema::primitive::convert_primitive; -use crate::arrow::ProjectionMask; +use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY}; use crate::basic::{ConvertedType, Repetition}; use crate::errors::ParquetError; use crate::errors::Result; @@ -550,7 +550,16 @@ fn convert_field( field.with_metadata(hint.metadata().clone()) } - None => Field::new(name, data_type, nullable), + None => { + let mut ret = Field::new(name, data_type, nullable); + let basic_info = parquet_type.get_basic_info(); + if basic_info.has_id() { + let mut meta = HashMap::with_capacity(1); + meta.insert(PARQUET_FIELD_ID_META_KEY.to_string(), basic_info.id().to_string()); + ret.set_metadata(meta); + } + ret + }, } } diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 3f1994d10829..d56cc42d4313 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -45,6 +45,8 @@ mod primitive; use crate::arrow::ProjectionMask; pub(crate) use complex::{ParquetField, ParquetFieldType}; +use super::PARQUET_FIELD_ID_META_KEY; + /// Convert Parquet schema to Arrow schema including optional metadata /// /// Attempts to decode any existing Arrow schema metadata, falling back @@ -268,12 +270,20 @@ fn parse_key_value_metadata( /// Convert parquet column schema to arrow field. 
pub fn parquet_to_arrow_field(parquet_column: &ColumnDescriptor) -> Result { let field = complex::convert_type(&parquet_column.self_type_ptr())?; - - Ok(Field::new( + let mut ret = Field::new( parquet_column.name(), field.arrow_type, field.nullable, - )) + ); + + let basic_info = parquet_column.self_type().get_basic_info(); + if basic_info.has_id() { + let mut meta = HashMap::with_capacity(1); + meta.insert(PARQUET_FIELD_ID_META_KEY.to_string(), basic_info.id().to_string()); + ret.set_metadata(meta); + } + + Ok(ret) } pub fn decimal_length_from_precision(precision: u8) -> usize { @@ -578,6 +588,7 @@ mod tests { use crate::arrow::PARQUET_FIELD_ID_META_KEY; use crate::file::metadata::KeyValue; + use crate::file::reader::FileReader; use crate::{ arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ArrowWriter}, schema::{parser::parse_message_type, types::SchemaDescriptor}, @@ -1811,6 +1822,52 @@ mod tests { Ok(()) } + #[test] + fn test_read_parquet_field_ids_raw() -> Result<()> { + let meta = |a: &[(&str, &str)]| -> HashMap { + a.iter() + .map(|(a, b)| (a.to_string(), b.to_string())) + .collect() + }; + let schema = Schema::new_with_metadata( + vec![ + Field::new("c1", DataType::Utf8, true).with_metadata(meta(&[ + (PARQUET_FIELD_ID_META_KEY, "1"), + ])), + Field::new("c2", DataType::Utf8, true).with_metadata(meta(&[ + (PARQUET_FIELD_ID_META_KEY, "2"), + ])), + ], + HashMap::new(), + ); + + let writer = ArrowWriter::try_new( + vec![], + Arc::new(schema.clone()), + None, + )?; + let parquet_bytes = writer.into_inner()?; + + let reader = crate::file::reader::SerializedFileReader::new( + bytes::Bytes::from(parquet_bytes), + )?; + let schema_descriptor = reader.metadata().file_metadata().schema_descr_ptr(); + + // don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema + let arrow_schema = crate::arrow::parquet_to_arrow_schema( + &schema_descriptor, + None, + )?; + + let parq_schema_descr = crate::arrow::arrow_to_parquet_schema(&arrow_schema)?; + let parq_fields = parq_schema_descr.root_schema().get_fields(); + assert_eq!(parq_fields.len(), 2); + assert_eq!(parq_fields[0].get_basic_info().id(), 1); + assert_eq!(parq_fields[1].get_basic_info().id(), 2); + + Ok(()) + } + #[test] fn test_arrow_schema_roundtrip_lists() -> Result<()> { let metadata: HashMap = From 3b0ede4fbb112b0d45d0ae3f03d8fc42c3ead631 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 2 Oct 2023 11:33:25 +0100 Subject: [PATCH 1246/1411] Replace lz4 with lz4_flex Allowing Compilation for WASM (#4884) * Use lz4_flex * Fix features * Install clang for zlib * Update arrow-ipc * Fix CI * Use LZ4F * Support LZ4F fallback * Restore support for LZ4F compressed CSV * Clippy * Fix features * Add benchmark * Additional system dependencies --- .github/workflows/parquet.yml | 6 +- arrow-integration-test/src/lib.rs | 3 +- arrow-ipc/Cargo.toml | 6 +- arrow-ipc/src/compression.rs | 78 ++++++++++++++-------- parquet/Cargo.toml | 11 +++- parquet/benches/compression.rs | 101 +++++++++++++++++++++++++++++ parquet/src/bin/parquet-fromcsv.rs | 20 ++---- parquet/src/compression.rs | 36 +++++----- 8 files changed, 191 insertions(+), 70 deletions(-) create mode 100644 parquet/benches/compression.rs diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index c309a3fa6473..7a649e16b1ec 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -123,10 +123,12 @@ jobs: uses: ./.github/actions/setup-builder with: target: 
wasm32-unknown-unknown,wasm32-wasi + - name: Install clang # Needed for zlib compilation + run: apt-get update && apt-get install -y clang gcc-multilib - name: Build wasm32-unknown-unknown - run: cargo build -p parquet --no-default-features --features cli,snap,flate2,brotli --target wasm32-unknown-unknown + run: cargo build -p parquet --target wasm32-unknown-unknown - name: Build wasm32-wasi - run: cargo build -p parquet --no-default-features --features cli,snap,flate2,brotli --target wasm32-wasi + run: cargo build -p parquet --target wasm32-wasi pyspark-integration-test: name: PySpark Integration Test diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index 04bbcf3f6f23..07b69bffd07d 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -183,7 +183,8 @@ impl ArrowJson { return Ok(false); } } - _ => return Ok(false), + Some(Err(e)) => return Err(e), + None => return Ok(false), } } diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index a03f53d6641c..b5f66294a7c7 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -40,8 +40,12 @@ arrow-cast = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } flatbuffers = { version = "23.1.21", default-features = false } -lz4 = { version = "1.23", default-features = false, optional = true } +lz4_flex = { version = "0.11", default-features = false, features = ["std", "frame"], optional = true } zstd = { version = "0.12.0", default-features = false, optional = true } +[features] +default = [] +lz4 = ["lz4_flex"] + [dev-dependencies] tempfile = "3.3" diff --git a/arrow-ipc/src/compression.rs b/arrow-ipc/src/compression.rs index db05e9a6a6c6..fafc2c5c9b6d 100644 --- a/arrow-ipc/src/compression.rs +++ b/arrow-ipc/src/compression.rs @@ -103,13 +103,15 @@ impl CompressionCodec { } else if decompressed_length == LENGTH_NO_COMPRESSED_DATA { // no compression input.slice(LENGTH_OF_PREFIX_DATA as usize) - } else { + } else if let Ok(decompressed_length) = usize::try_from(decompressed_length) { // decompress data using the codec - let mut uncompressed_buffer = - Vec::with_capacity(decompressed_length as usize); let input_data = &input[(LENGTH_OF_PREFIX_DATA as usize)..]; - self.decompress(input_data, &mut uncompressed_buffer)?; - Buffer::from(uncompressed_buffer) + self.decompress(input_data, decompressed_length as _)? 
+ .into() + } else { + return Err(ArrowError::IpcError(format!( + "Invalid uncompressed length: {decompressed_length}" + ))); }; Ok(buffer) } @@ -128,21 +130,30 @@ impl CompressionCodec { fn decompress( &self, input: &[u8], - output: &mut Vec, - ) -> Result { - match self { - CompressionCodec::Lz4Frame => decompress_lz4(input, output), - CompressionCodec::Zstd => decompress_zstd(input, output), + decompressed_size: usize, + ) -> Result, ArrowError> { + let ret = match self { + CompressionCodec::Lz4Frame => decompress_lz4(input, decompressed_size)?, + CompressionCodec::Zstd => decompress_zstd(input, decompressed_size)?, + }; + if ret.len() != decompressed_size { + return Err(ArrowError::IpcError(format!( + "Expected compressed length of {decompressed_size} got {}", + ret.len() + ))); } + Ok(ret) } } #[cfg(feature = "lz4")] fn compress_lz4(input: &[u8], output: &mut Vec) -> Result<(), ArrowError> { use std::io::Write; - let mut encoder = lz4::EncoderBuilder::new().build(output)?; + let mut encoder = lz4_flex::frame::FrameEncoder::new(output); encoder.write_all(input)?; - encoder.finish().1?; + encoder + .finish() + .map_err(|e| ArrowError::ExternalError(Box::new(e)))?; Ok(()) } @@ -155,14 +166,19 @@ fn compress_lz4(_input: &[u8], _output: &mut Vec) -> Result<(), ArrowError> } #[cfg(feature = "lz4")] -fn decompress_lz4(input: &[u8], output: &mut Vec) -> Result { +fn decompress_lz4(input: &[u8], decompressed_size: usize) -> Result, ArrowError> { use std::io::Read; - Ok(lz4::Decoder::new(input)?.read_to_end(output)?) + let mut output = Vec::with_capacity(decompressed_size); + lz4_flex::frame::FrameDecoder::new(input).read_to_end(&mut output)?; + Ok(output) } #[cfg(not(feature = "lz4"))] #[allow(clippy::ptr_arg)] -fn decompress_lz4(_input: &[u8], _output: &mut Vec) -> Result { +fn decompress_lz4( + _input: &[u8], + _decompressed_size: usize, +) -> Result, ArrowError> { Err(ArrowError::InvalidArgumentError( "lz4 IPC decompression requires the lz4 feature".to_string(), )) @@ -186,14 +202,22 @@ fn compress_zstd(_input: &[u8], _output: &mut Vec) -> Result<(), ArrowError> } #[cfg(feature = "zstd")] -fn decompress_zstd(input: &[u8], output: &mut Vec) -> Result { +fn decompress_zstd( + input: &[u8], + decompressed_size: usize, +) -> Result, ArrowError> { use std::io::Read; - Ok(zstd::Decoder::new(input)?.read_to_end(output)?) 
+ let mut output = Vec::with_capacity(decompressed_size); + zstd::Decoder::with_buffer(input)?.read_to_end(&mut output)?; + Ok(output) } #[cfg(not(feature = "zstd"))] #[allow(clippy::ptr_arg)] -fn decompress_zstd(_input: &[u8], _output: &mut Vec) -> Result { +fn decompress_zstd( + _input: &[u8], + _decompressed_size: usize, +) -> Result, ArrowError> { Err(ArrowError::InvalidArgumentError( "zstd IPC decompression requires the zstd feature".to_string(), )) @@ -216,28 +240,26 @@ mod tests { #[test] #[cfg(feature = "lz4")] fn test_lz4_compression() { - let input_bytes = "hello lz4".as_bytes(); + let input_bytes = b"hello lz4"; let codec = super::CompressionCodec::Lz4Frame; let mut output_bytes: Vec = Vec::new(); codec.compress(input_bytes, &mut output_bytes).unwrap(); - let mut result_output_bytes: Vec = Vec::new(); - codec - .decompress(output_bytes.as_slice(), &mut result_output_bytes) + let result = codec + .decompress(output_bytes.as_slice(), input_bytes.len()) .unwrap(); - assert_eq!(input_bytes, result_output_bytes.as_slice()); + assert_eq!(input_bytes, result.as_slice()); } #[test] #[cfg(feature = "zstd")] fn test_zstd_compression() { - let input_bytes = "hello zstd".as_bytes(); + let input_bytes = b"hello zstd"; let codec = super::CompressionCodec::Zstd; let mut output_bytes: Vec = Vec::new(); codec.compress(input_bytes, &mut output_bytes).unwrap(); - let mut result_output_bytes: Vec = Vec::new(); - codec - .decompress(output_bytes.as_slice(), &mut result_output_bytes) + let result = codec + .decompress(output_bytes.as_slice(), input_bytes.len()) .unwrap(); - assert_eq!(input_bytes, result_output_bytes.as_slice()); + assert_eq!(input_bytes, result.as_slice()); } } diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 7c346248acbb..c710c83213b9 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -51,7 +51,7 @@ thrift = { version = "0.17", default-features = false } snap = { version = "1.0", default-features = false, optional = true } brotli = { version = "3.3", default-features = false, features = ["std"], optional = true } flate2 = { version = "1.0", default-features = false, features = ["rust_backend"], optional = true } -lz4 = { version = "1.23", default-features = false, optional = true } +lz4_flex = { version = "0.11", default-features = false, features = ["std", "frame"], optional = true } zstd = { version = "0.12.0", optional = true, default-features = false } chrono = { workspace = true } num = { version = "0.4", default-features = false } @@ -74,7 +74,7 @@ snap = { version = "1.0", default-features = false } tempfile = { version = "3.0", default-features = false } brotli = { version = "3.3", default-features = false, features = ["std"] } flate2 = { version = "1.0", default-features = false, features = ["rust_backend"] } -lz4 = { version = "1.23", default-features = false } +lz4_flex = { version = "0.11", default-features = false, features = ["std", "frame"] } zstd = { version = "0.12", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } arrow = { workspace = true, features = ["ipc", "test_utils", "prettyprint", "json"] } @@ -86,6 +86,8 @@ all-features = true [features] default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"] +# Enable lz4 +lz4 = ["lz4_flex"] # Enable arrow reader/writer APIs arrow = ["base64", "arrow-array", "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", "arrow-select", "arrow-ipc"] # Enable CLI tools @@ -166,5 +168,10 @@ name = "arrow_reader" required-features = 
["arrow", "test_common", "experimental"] harness = false +[[bench]] +name = "compression" +required-features = ["experimental", "default"] +harness = false + [lib] bench = false diff --git a/parquet/benches/compression.rs b/parquet/benches/compression.rs new file mode 100644 index 000000000000..ce4f9aead751 --- /dev/null +++ b/parquet/benches/compression.rs @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use criterion::*; +use parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; +use parquet::compression::create_codec; +use rand::distributions::Alphanumeric; +use rand::prelude::*; + +fn do_bench(c: &mut Criterion, name: &str, uncompressed: &[u8]) { + let codecs = [ + Compression::BROTLI(BrotliLevel::default()), + Compression::GZIP(GzipLevel::default()), + Compression::LZ4, + Compression::LZ4_RAW, + Compression::SNAPPY, + Compression::GZIP(GzipLevel::default()), + Compression::ZSTD(ZstdLevel::default()), + ]; + + for compression in codecs { + let mut codec = create_codec(compression, &Default::default()) + .unwrap() + .unwrap(); + + c.bench_function(&format!("compress {compression} - {name}"), |b| { + b.iter(|| { + let mut out = Vec::new(); + codec.compress(uncompressed, &mut out).unwrap(); + out + }); + }); + + let mut compressed = Vec::new(); + codec.compress(uncompressed, &mut compressed).unwrap(); + println!( + "{compression} compressed {} bytes of {name} to {} bytes", + uncompressed.len(), + compressed.len() + ); + + c.bench_function(&format!("decompress {compression} - {name}"), |b| { + b.iter(|| { + let mut out = Vec::new(); + codec + .decompress( + black_box(&compressed), + &mut out, + Some(uncompressed.len()), + ) + .unwrap(); + out + }); + }); + } +} + +fn criterion_benchmark(c: &mut Criterion) { + let mut rng = StdRng::seed_from_u64(42); + let rng = &mut rng; + const DATA_SIZE: usize = 1024 * 1024; + + let uncompressed: Vec<_> = rng.sample_iter(&Alphanumeric).take(DATA_SIZE).collect(); + do_bench(c, "alphanumeric", &uncompressed); + + // Create a collection of 64 words + let words: Vec> = (0..64) + .map(|_| { + let len = rng.gen_range(1..12); + rng.sample_iter(&Alphanumeric).take(len).collect() + }) + .collect(); + + // Build data by concatenating these words randomly together + let mut uncompressed = Vec::with_capacity(DATA_SIZE); + while uncompressed.len() < DATA_SIZE { + let word = &words[rng.gen_range(0..words.len())]; + uncompressed + .extend_from_slice(&word[..word.len().min(DATA_SIZE - uncompressed.len())]) + } + assert_eq!(uncompressed.len(), DATA_SIZE); + + do_bench(c, "words", &uncompressed); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index 1ff6fecf5a81..548bbdbfb8f1 
100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -386,9 +386,9 @@ fn convert_csv_to_parquet(args: &Args) -> Result<(), ParquetFromCsvError> { Compression::BROTLI(_) => { Box::new(brotli::Decompressor::new(input_file, 0)) as Box } - Compression::LZ4 => Box::new(lz4::Decoder::new(input_file).map_err(|e| { - ParquetFromCsvError::with_context(e, "Failed to create lz4::Decoder") - })?) as Box, + Compression::LZ4 => { + Box::new(lz4_flex::frame::FrameDecoder::new(input_file)) as Box + } Compression::ZSTD(_) => Box::new(zstd::Decoder::new(input_file).map_err(|e| { ParquetFromCsvError::with_context(e, "Failed to create zstd::Decoder") })?) as Box, @@ -692,19 +692,9 @@ mod tests { encoder.into_inner() } Compression::LZ4 => { - let mut encoder = lz4::EncoderBuilder::new() - .build(input_file) - .map_err(|e| { - ParquetFromCsvError::with_context( - e, - "Failed to create lz4::Encoder", - ) - }) - .unwrap(); + let mut encoder = lz4_flex::frame::FrameEncoder::new(input_file); write_tmp_file(&mut encoder); - let (inner, err) = encoder.finish(); - err.unwrap(); - inner + encoder.finish().unwrap() } Compression::ZSTD(level) => { diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index f1831ed48444..9e0eee0e3e04 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -388,7 +388,7 @@ mod lz4_codec { use std::io::{Read, Write}; use crate::compression::Codec; - use crate::errors::Result; + use crate::errors::{ParquetError, Result}; const LZ4_BUFFER_SIZE: usize = 4096; @@ -409,7 +409,7 @@ mod lz4_codec { output_buf: &mut Vec, _uncompress_size: Option, ) -> Result { - let mut decoder = lz4::Decoder::new(input_buf)?; + let mut decoder = lz4_flex::frame::FrameDecoder::new(input_buf); let mut buffer: [u8; LZ4_BUFFER_SIZE] = [0; LZ4_BUFFER_SIZE]; let mut total_len = 0; loop { @@ -424,7 +424,7 @@ mod lz4_codec { } fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { - let mut encoder = lz4::EncoderBuilder::new().build(output_buf)?; + let mut encoder = lz4_flex::frame::FrameEncoder::new(output_buf); let mut from = 0; loop { let to = std::cmp::min(from + LZ4_BUFFER_SIZE, input_buf.len()); @@ -434,7 +434,10 @@ mod lz4_codec { break; } } - encoder.finish().1.map_err(|e| e.into()) + match encoder.finish() { + Ok(_) => Ok(()), + Err(e) => Err(ParquetError::External(Box::new(e))), + } } } } @@ -551,11 +554,7 @@ mod lz4_raw_codec { } }; output_buf.resize(offset + required_len, 0); - match lz4::block::decompress_to_buffer( - input_buf, - Some(required_len.try_into().unwrap()), - &mut output_buf[offset..], - ) { + match lz4_flex::block::decompress_into(input_buf, &mut output_buf[offset..]) { Ok(n) => { if n != required_len { return Err(ParquetError::General( @@ -564,25 +563,20 @@ mod lz4_raw_codec { } Ok(n) } - Err(e) => Err(e.into()), + Err(e) => Err(ParquetError::External(Box::new(e))), } } fn compress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result<()> { let offset = output_buf.len(); - let required_len = lz4::block::compress_bound(input_buf.len())?; + let required_len = lz4_flex::block::get_maximum_output_size(input_buf.len()); output_buf.resize(offset + required_len, 0); - match lz4::block::compress_to_buffer( - input_buf, - None, - false, - &mut output_buf[offset..], - ) { + match lz4_flex::block::compress_into(input_buf, &mut output_buf[offset..]) { Ok(n) => { output_buf.truncate(offset + n); Ok(()) } - Err(e) => Err(e.into()), + Err(e) => Err(ParquetError::External(Box::new(e))), } } } @@ -666,11 
+660,11 @@ mod lz4_hadoop_codec { "Not enough bytes to hold advertised output", )); } - let decompressed_size = lz4::block::decompress_to_buffer( + let decompressed_size = lz4_flex::decompress_into( &input[..expected_compressed_size as usize], - Some(output_len as i32), output, - )?; + ) + .map_err(|e| ParquetError::External(Box::new(e)))?; if decompressed_size != expected_decompressed_size as usize { return Err(io::Error::new( io::ErrorKind::Other, From d941ff1c3741ba4e18022d8be8edfbbca8b0af17 Mon Sep 17 00:00:00 2001 From: Letian Jiang Date: Mon, 2 Oct 2023 22:14:23 +0800 Subject: [PATCH 1247/1411] Support parquet bloom filter length (#4885) * Support parquet bloom filter length Signed-off-by: Letian Jiang * update Signed-off-by: Letian Jiang --------- Signed-off-by: Letian Jiang --- parquet/src/bloom_filter/mod.rs | 30 +- parquet/src/file/metadata.rs | 17 ++ parquet/src/file/writer.rs | 9 +- parquet/src/format.rs | 505 ++++++++++---------------------- parquet/src/schema/printer.rs | 5 + 5 files changed, 195 insertions(+), 371 deletions(-) diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index 4d2040b7f258..c893d492b52a 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -135,13 +135,12 @@ pub struct Sbbf(Vec); const SBBF_HEADER_SIZE_ESTIMATE: usize = 20; -/// given an initial offset, and a [ChunkReader], try to read out a bloom filter header and return +/// given an initial offset, and a byte buffer, try to read out a bloom filter header and return /// both the header and the offset after it (for bitset). -fn chunk_read_bloom_filter_header_and_offset( +fn chunk_read_bloom_filter_header_and_offset( offset: u64, - reader: Arc, + buffer: Bytes, ) -> Result<(BloomFilterHeader, u64), ParquetError> { - let buffer = reader.get_bytes(offset, SBBF_HEADER_SIZE_ESTIMATE)?; let (header, length) = read_bloom_filter_header_and_length(buffer)?; Ok((header, offset + length)) } @@ -271,8 +270,13 @@ impl Sbbf { return Ok(None); }; + let buffer = match column_metadata.bloom_filter_length() { + Some(length) => reader.get_bytes(offset, length as usize), + None => reader.get_bytes(offset, SBBF_HEADER_SIZE_ESTIMATE), + }?; + let (header, bitset_offset) = - chunk_read_bloom_filter_header_and_offset(offset, reader.clone())?; + chunk_read_bloom_filter_header_and_offset(offset, buffer.clone())?; match header.algorithm { BloomFilterAlgorithm::BLOCK(_) => { @@ -289,11 +293,17 @@ impl Sbbf { // this match exists to future proof the singleton hash enum } } - // length in bytes - let length: usize = header.num_bytes.try_into().map_err(|_| { - ParquetError::General("Bloom filter length is invalid".to_string()) - })?; - let bitset = reader.get_bytes(bitset_offset, length)?; + + let bitset = match column_metadata.bloom_filter_length() { + Some(_) => buffer.slice((bitset_offset - offset) as usize..), + None => { + let bitset_length: usize = header.num_bytes.try_into().map_err(|_| { + ParquetError::General("Bloom filter length is invalid".to_string()) + })?; + reader.get_bytes(bitset_offset, bitset_length)? 
+ } + }; + Ok(Some(Self::new(&bitset))) } diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index aaa3d28e206a..1f46c8105ebc 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -474,6 +474,7 @@ pub struct ColumnChunkMetaData { statistics: Option, encoding_stats: Option>, bloom_filter_offset: Option, + bloom_filter_length: Option, offset_index_offset: Option, offset_index_length: Option, column_index_offset: Option, @@ -591,6 +592,11 @@ impl ColumnChunkMetaData { self.bloom_filter_offset } + /// Returns the length in bytes of the bloom filter, if present. + pub fn bloom_filter_length(&self) -> Option { + self.bloom_filter_length + } + /// Returns the offset for the column index. pub fn column_index_offset(&self) -> Option { self.column_index_offset @@ -657,6 +663,7 @@ impl ColumnChunkMetaData { }) .transpose()?; let bloom_filter_offset = col_metadata.bloom_filter_offset; + let bloom_filter_length = col_metadata.bloom_filter_length; let offset_index_offset = cc.offset_index_offset; let offset_index_length = cc.offset_index_length; let column_index_offset = cc.column_index_offset; @@ -677,6 +684,7 @@ impl ColumnChunkMetaData { statistics, encoding_stats, bloom_filter_offset, + bloom_filter_length, offset_index_offset, offset_index_length, column_index_offset, @@ -722,6 +730,7 @@ impl ColumnChunkMetaData { .as_ref() .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()), bloom_filter_offset: self.bloom_filter_offset, + bloom_filter_length: self.bloom_filter_length, } } @@ -752,6 +761,7 @@ impl ColumnChunkMetaDataBuilder { statistics: None, encoding_stats: None, bloom_filter_offset: None, + bloom_filter_length: None, offset_index_offset: None, offset_index_length: None, column_index_offset: None, @@ -837,6 +847,12 @@ impl ColumnChunkMetaDataBuilder { self } + /// Sets optional bloom filter length in bytes. + pub fn set_bloom_filter_length(mut self, value: Option) -> Self { + self.0.bloom_filter_length = value; + self + } + /// Sets optional offset index offset in bytes.
pub fn set_offset_index_offset(mut self, value: Option) -> Self { self.0.offset_index_offset = value; @@ -1053,6 +1069,7 @@ mod tests { }, ]) .set_bloom_filter_offset(Some(6000)) + .set_bloom_filter_length(Some(25)) .set_offset_index_offset(Some(7000)) .set_offset_index_length(Some(25)) .set_column_index_offset(Some(8000)) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index cafb1761352d..af25cc9689c1 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -267,12 +267,15 @@ impl SerializedFileWriter { Some(bloom_filter) => { let start_offset = self.buf.bytes_written(); bloom_filter.write(&mut self.buf)?; + let end_offset = self.buf.bytes_written(); // set offset and index for bloom filter - column_chunk + let column_chunk_meta = column_chunk .meta_data .as_mut() - .expect("can't have bloom filter without column metadata") - .bloom_filter_offset = Some(start_offset as i64); + .expect("can't have bloom filter without column metadata"); + column_chunk_meta.bloom_filter_offset = Some(start_offset as i64); + column_chunk_meta.bloom_filter_length = + Some((end_offset - start_offset) as i32); } None => {} } diff --git a/parquet/src/format.rs b/parquet/src/format.rs index 0851b2287fba..12c572c23cf5 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -1,9 +1,10 @@ -// Autogenerated by Thrift Compiler (0.17.0) +// Autogenerated by Thrift Compiler (0.19.0) // DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING +#![allow(dead_code)] #![allow(unused_imports)] #![allow(unused_extern_crates)] -#![allow(clippy::too_many_arguments, clippy::type_complexity, clippy::vec_box)] +#![allow(clippy::too_many_arguments, clippy::type_complexity, clippy::vec_box, clippy::wrong_self_convention)] #![cfg_attr(rustfmt, rustfmt_skip)] use std::cell::RefCell; @@ -99,7 +100,7 @@ impl From<&Type> for i32 { /// DEPRECATED: Common types used by frameworks(e.g. hive, pig) using parquet. /// ConvertedType is superseded by LogicalType. This enum should not be extended. -/// +/// /// See LogicalTypes.md for conversion between ConvertedType and LogicalType. #[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct ConvertedType(pub i32); @@ -117,12 +118,12 @@ impl ConvertedType { /// an enum is converted into a binary field pub const ENUM: ConvertedType = ConvertedType(4); /// A decimal value. - /// + /// /// This may be used to annotate binary or fixed primitive types. The /// underlying byte array stores the unscaled value encoded as two's /// complement using big-endian byte order (the most significant byte is the /// zeroth element). The value of the decimal is the value * 10^{-scale}. - /// + /// /// This must be accompanied by a (maximum) precision and a scale in the /// SchemaElement. The precision specifies the number of digits in the decimal /// and the scale stores the location of the decimal point. For example 1.23 @@ -130,62 +131,62 @@ impl ConvertedType { /// 2 digits over). pub const DECIMAL: ConvertedType = ConvertedType(5); /// A Date - /// + /// /// Stored as days since Unix epoch, encoded as the INT32 physical type. - /// + /// pub const DATE: ConvertedType = ConvertedType(6); /// A time - /// + /// /// The total number of milliseconds since midnight. The value is stored /// as an INT32 physical type. pub const TIME_MILLIS: ConvertedType = ConvertedType(7); /// A time. - /// + /// /// The total number of microseconds since midnight. The value is stored as /// an INT64 physical type. 
pub const TIME_MICROS: ConvertedType = ConvertedType(8); /// A date/time combination - /// + /// /// Date and time recorded as milliseconds since the Unix epoch. Recorded as /// a physical type of INT64. pub const TIMESTAMP_MILLIS: ConvertedType = ConvertedType(9); /// A date/time combination - /// + /// /// Date and time recorded as microseconds since the Unix epoch. The value is /// stored as an INT64 physical type. pub const TIMESTAMP_MICROS: ConvertedType = ConvertedType(10); /// An unsigned integer value. - /// + /// /// The number describes the maximum number of meaningful data bits in /// the stored value. 8, 16 and 32 bit values are stored using the /// INT32 physical type. 64 bit values are stored using the INT64 /// physical type. - /// + /// pub const UINT_8: ConvertedType = ConvertedType(11); pub const UINT_16: ConvertedType = ConvertedType(12); pub const UINT_32: ConvertedType = ConvertedType(13); pub const UINT_64: ConvertedType = ConvertedType(14); /// A signed integer value. - /// + /// /// The number describes the maximum number of meaningful data bits in /// the stored value. 8, 16 and 32 bit values are stored using the /// INT32 physical type. 64 bit values are stored using the INT64 /// physical type. - /// + /// pub const INT_8: ConvertedType = ConvertedType(15); pub const INT_16: ConvertedType = ConvertedType(16); pub const INT_32: ConvertedType = ConvertedType(17); pub const INT_64: ConvertedType = ConvertedType(18); /// An embedded JSON document - /// + /// /// A JSON document embedded within a single UTF8 column. pub const JSON: ConvertedType = ConvertedType(19); /// An embedded BSON document - /// + /// /// A BSON document embedded within a single BINARY column. pub const BSON: ConvertedType = ConvertedType(20); /// An interval of time - /// + /// /// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12 /// This data is composed of three separate little endian unsigned /// integers. Each stores a component of a duration of time. The first @@ -443,11 +444,11 @@ impl From<&Encoding> for i32 { } /// Supported compression algorithms. -/// +/// /// Codecs added in format version X.Y can be read by readers based on X.Y and later. /// Codec support may vary between readers based on the format version and /// libraries available at runtime. -/// +/// /// See Compression.md for a detailed specification of these algorithms. #[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct CompressionCodec(pub i32); @@ -637,17 +638,17 @@ impl From<&BoundaryOrder> for i32 { /// Statistics per row group and per page /// All fields are optional. -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct Statistics { /// DEPRECATED: min and max value of the column. Use min_value and max_value. - /// + /// /// Values are encoded using PLAIN encoding, except that variable-length byte /// arrays do not include a length prefix. - /// + /// /// These fields encode min and max values determined by signed comparison /// only. New files should use the correct order for a column's logical type /// and store the values in the min_value and max_value fields. - /// + /// /// To support older readers, these may be set when the column order is /// signed. pub max: Option>, @@ -657,7 +658,7 @@ pub struct Statistics { /// count of distinct values occurring pub distinct_count: Option, /// Min and max values for the column, determined by its ColumnOrder. 
- /// + /// /// Values are encoded using PLAIN encoding, except that variable-length byte /// arrays do not include a length prefix. pub max_value: Option>, @@ -772,25 +773,12 @@ impl TSerializable for Statistics { } } -impl Default for Statistics { - fn default() -> Self { - Statistics{ - max: Some(Vec::new()), - min: Some(Vec::new()), - null_count: Some(0), - distinct_count: Some(0), - max_value: Some(Vec::new()), - min_value: Some(Vec::new()), - } - } -} - // // StringType // /// Empty structs to use as logical type annotations -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct StringType { } @@ -808,12 +796,7 @@ impl TSerializable for StringType { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -828,17 +811,11 @@ impl TSerializable for StringType { } } -impl Default for StringType { - fn default() -> Self { - StringType{} - } -} - // // UUIDType // -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct UUIDType { } @@ -856,12 +833,7 @@ impl TSerializable for UUIDType { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -876,17 +848,11 @@ impl TSerializable for UUIDType { } } -impl Default for UUIDType { - fn default() -> Self { - UUIDType{} - } -} - // // MapType // -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct MapType { } @@ -904,12 +870,7 @@ impl TSerializable for MapType { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -924,17 +885,11 @@ impl TSerializable for MapType { } } -impl Default for MapType { - fn default() -> Self { - MapType{} - } -} - // // ListType // -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct ListType { } @@ -952,12 +907,7 @@ impl TSerializable for ListType { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -972,17 +922,11 @@ impl TSerializable for ListType { } } -impl Default for ListType { - fn default() -> Self { - ListType{} - } -} - // // EnumType // -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct EnumType { } @@ -1000,12 +944,7 @@ impl TSerializable for EnumType { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } 
i_prot.read_struct_end()?; @@ -1020,17 +959,11 @@ impl TSerializable for EnumType { } } -impl Default for EnumType { - fn default() -> Self { - EnumType{} - } -} - // // DateType // -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DateType { } @@ -1048,12 +981,7 @@ impl TSerializable for DateType { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -1068,22 +996,16 @@ impl TSerializable for DateType { } } -impl Default for DateType { - fn default() -> Self { - DateType{} - } -} - // // NullType // /// Logical type to annotate a column that is always null. -/// +/// /// Sometimes when discovering the schema of existing data, values are always /// null and the physical type can't be determined. This annotation signals /// the case where the physical type was guessed from all null values. -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct NullType { } @@ -1101,12 +1023,7 @@ impl TSerializable for NullType { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -1121,21 +1038,18 @@ impl TSerializable for NullType { } } -impl Default for NullType { - fn default() -> Self { - NullType{} - } -} - // // DecimalType // /// Decimal logical type annotation -/// +/// +/// Scale must be zero or a positive integer less than or equal to the precision. +/// Precision must be a non-zero positive integer. +/// /// To maintain forward-compatibility in v1, implementations using this logical /// type must also set scale and precision on the annotated SchemaElement. 
-/// +/// /// Allowed for physical types: INT32, INT64, FIXED, and BINARY #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DecimalType { @@ -1206,7 +1120,7 @@ impl TSerializable for DecimalType { // /// Time units for logical types -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct MilliSeconds { } @@ -1224,12 +1138,7 @@ impl TSerializable for MilliSeconds { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -1244,17 +1153,11 @@ impl TSerializable for MilliSeconds { } } -impl Default for MilliSeconds { - fn default() -> Self { - MilliSeconds{} - } -} - // // MicroSeconds // -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct MicroSeconds { } @@ -1272,12 +1175,7 @@ impl TSerializable for MicroSeconds { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -1292,17 +1190,11 @@ impl TSerializable for MicroSeconds { } } -impl Default for MicroSeconds { - fn default() -> Self { - MicroSeconds{} - } -} - // // NanoSeconds // -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct NanoSeconds { } @@ -1320,12 +1212,7 @@ impl TSerializable for NanoSeconds { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -1340,12 +1227,6 @@ impl TSerializable for NanoSeconds { } } -impl Default for NanoSeconds { - fn default() -> Self { - NanoSeconds{} - } -} - // // TimeUnit // @@ -1450,7 +1331,7 @@ impl TSerializable for TimeUnit { // /// Timestamp logical type annotation -/// +/// /// Allowed for physical types: INT64 #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct TimestampType { @@ -1521,7 +1402,7 @@ impl TSerializable for TimestampType { // /// Time logical type annotation -/// +/// /// Allowed for physical types: INT32 (millis), INT64 (micros, nanos) #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct TimeType { @@ -1592,9 +1473,9 @@ impl TSerializable for TimeType { // /// Integer logical type annotation -/// +/// /// bitWidth must be 8, 16, 32, or 64. 
-/// +/// /// Allowed for physical types: INT32, INT64 #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct IntType { @@ -1665,9 +1546,9 @@ impl TSerializable for IntType { // /// Embedded JSON logical type annotation -/// +/// /// Allowed for physical types: BINARY -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct JsonType { } @@ -1685,12 +1566,7 @@ impl TSerializable for JsonType { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -1705,20 +1581,14 @@ impl TSerializable for JsonType { } } -impl Default for JsonType { - fn default() -> Self { - JsonType{} - } -} - // // BsonType // /// Embedded BSON logical type annotation -/// +/// /// Allowed for physical types: BINARY -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct BsonType { } @@ -1736,12 +1606,7 @@ impl TSerializable for BsonType { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -1756,12 +1621,6 @@ impl TSerializable for BsonType { } } -impl Default for BsonType { - fn default() -> Self { - BsonType{} - } -} - // // LogicalType // @@ -2003,7 +1862,7 @@ impl TSerializable for LogicalType { pub struct SchemaElement { /// Data type for this field. Not set if the current element is a non-leaf node pub type_: Option, - /// If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the vales. + /// If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values. /// Otherwise, if specified, this is the maximum bit length to store any of the values. /// (e.g. a low cardinality INT col could have this set to 3). Note that this is /// in the schema, and therefore fixed for the entire file. @@ -2020,12 +1879,12 @@ pub struct SchemaElement { pub num_children: Option, /// DEPRECATED: When the schema is the result of a conversion from another model. /// Used to record the original type to help with cross conversion. - /// + /// /// This is superseded by logicalType. pub converted_type: Option, /// DEPRECATED: Used when this column contains decimal data. /// See the DECIMAL converted type for more details. - /// + /// /// This is superseded by using the DecimalType annotation in logicalType. pub scale: Option, pub precision: Option, @@ -2033,7 +1892,7 @@ pub struct SchemaElement { /// original field id in the parquet schema pub field_id: Option, /// The logical type of this SchemaElement - /// + /// /// LogicalType replaces ConvertedType, but ConvertedType is still required /// for some logical types to ensure forward-compatibility in format v1. 
pub logical_type: Option, @@ -2309,7 +2168,7 @@ impl TSerializable for DataPageHeader { // IndexPageHeader // -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct IndexPageHeader { } @@ -2327,12 +2186,7 @@ impl TSerializable for IndexPageHeader { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -2347,16 +2201,14 @@ impl TSerializable for IndexPageHeader { } } -impl Default for IndexPageHeader { - fn default() -> Self { - IndexPageHeader{} - } -} - // // DictionaryPageHeader // +/// The dictionary page must be placed at the first position of the column chunk +/// if it is partly or completely dictionary encoded. At most one dictionary page +/// can be placed in a column chunk. +/// #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DictionaryPageHeader { /// Number of values in the dictionary * @@ -2444,7 +2296,7 @@ impl TSerializable for DictionaryPageHeader { /// New page format allowing reading levels without decompressing the data /// Repetition and definition levels are uncompressed /// The remaining section containing the data is compressed if is_compressed is true -/// +/// #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DataPageHeaderV2 { /// Number of values, including NULLs, in this data page. * @@ -2601,7 +2453,7 @@ impl TSerializable for DataPageHeaderV2 { // /// Block-based algorithm type annotation. * -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct SplitBlockAlgorithm { } @@ -2619,12 +2471,7 @@ impl TSerializable for SplitBlockAlgorithm { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -2639,12 +2486,6 @@ impl TSerializable for SplitBlockAlgorithm { } } -impl Default for SplitBlockAlgorithm { - fn default() -> Self { - SplitBlockAlgorithm{} - } -} - // // BloomFilterAlgorithm // @@ -2724,8 +2565,8 @@ impl TSerializable for BloomFilterAlgorithm { /// Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash /// algorithm. It uses 64 bits version of xxHash. -/// -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +/// +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct XxHash { } @@ -2743,12 +2584,7 @@ impl TSerializable for XxHash { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -2763,12 +2599,6 @@ impl TSerializable for XxHash { } } -impl Default for XxHash { - fn default() -> Self { - XxHash{} - } -} - // // BloomFilterHash // @@ -2847,8 +2677,8 @@ impl TSerializable for BloomFilterHash { // /// The compression used in the Bloom filter. 
-/// -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +/// +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct Uncompressed { } @@ -2866,12 +2696,7 @@ impl TSerializable for Uncompressed { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -2886,12 +2711,6 @@ impl TSerializable for Uncompressed { } } -impl Default for Uncompressed { - fn default() -> Self { - Uncompressed{} - } -} - // // BloomFilterCompression // @@ -2971,7 +2790,7 @@ impl TSerializable for BloomFilterCompression { /// Bloom filter header is stored at beginning of Bloom filter data of each column /// and followed by its bitset. -/// +/// #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct BloomFilterHeader { /// The size of bitset in bytes * @@ -3076,32 +2895,22 @@ pub struct PageHeader { pub uncompressed_page_size: i32, /// Compressed (and potentially encrypted) page size in bytes, not including this header * pub compressed_page_size: i32, - /// The 32bit CRC for the page, to be be calculated as follows: - /// - Using the standard CRC32 algorithm - /// - On the data only, i.e. this header should not be included. 'Data' - /// hereby refers to the concatenation of the repetition levels, the - /// definition levels and the column value, in this exact order. - /// - On the encoded versions of the repetition levels, definition levels and - /// column values - /// - On the compressed versions of the repetition levels, definition levels - /// and column values where possible; - /// - For v1 data pages, the repetition levels, definition levels and column - /// values are always compressed together. If a compression scheme is - /// specified, the CRC shall be calculated on the compressed version of - /// this concatenation. If no compression scheme is specified, the CRC - /// shall be calculated on the uncompressed version of this concatenation. - /// - For v2 data pages, the repetition levels and definition levels are - /// handled separately from the data and are never compressed (only - /// encoded). If a compression scheme is specified, the CRC shall be - /// calculated on the concatenation of the uncompressed repetition levels, - /// uncompressed definition levels and the compressed column values. - /// If no compression scheme is specified, the CRC shall be calculated on - /// the uncompressed concatenation. - /// - In encrypted columns, CRC is calculated after page encryption; the - /// encryption itself is performed after page compression (if compressed) + /// The 32-bit CRC checksum for the page, to be be calculated as follows: + /// + /// - The standard CRC32 algorithm is used (with polynomial 0x04C11DB7, + /// the same as in e.g. GZip). + /// - All page types can have a CRC (v1 and v2 data pages, dictionary pages, + /// etc.). + /// - The CRC is computed on the serialization binary representation of the page + /// (as written to disk), excluding the page header. For example, for v1 + /// data pages, the CRC is computed on the concatenation of repetition levels, + /// definition levels and column values (optionally compressed, optionally + /// encrypted). + /// - The CRC computation therefore takes place after any compression + /// and encryption steps, if any. 
+ /// /// If enabled, this allows for disabling checksumming in HDFS if only a few /// pages need to be read. - /// pub crc: Option, pub data_page_header: Option, pub index_page_header: Option, @@ -3516,10 +3325,16 @@ pub struct ColumnMetaData { pub encoding_stats: Option>, /// Byte offset from beginning of file to Bloom filter data. * pub bloom_filter_offset: Option, + /// Size of Bloom filter data including the serialized header, in bytes. + /// Added in 2.10 so readers may not read this field from old files and + /// it can be obtained after the BloomFilterHeader has been deserialized. + /// Writers should write this field so readers can read the bloom filter + /// in a single I/O. + pub bloom_filter_length: Option, } impl ColumnMetaData { - pub fn new(type_: Type, encodings: Vec, path_in_schema: Vec, codec: CompressionCodec, num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64, key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10, dictionary_page_offset: F11, statistics: F12, encoding_stats: F13, bloom_filter_offset: F14) -> ColumnMetaData where F8: Into>>, F10: Into>, F11: Into>, F12: Into>, F13: Into>>, F14: Into> { + pub fn new(type_: Type, encodings: Vec, path_in_schema: Vec, codec: CompressionCodec, num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64, key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10, dictionary_page_offset: F11, statistics: F12, encoding_stats: F13, bloom_filter_offset: F14, bloom_filter_length: F15) -> ColumnMetaData where F8: Into>>, F10: Into>, F11: Into>, F12: Into>, F13: Into>>, F14: Into>, F15: Into> { ColumnMetaData { type_, encodings, @@ -3535,6 +3350,7 @@ impl ColumnMetaData { statistics: statistics.into(), encoding_stats: encoding_stats.into(), bloom_filter_offset: bloom_filter_offset.into(), + bloom_filter_length: bloom_filter_length.into(), } } } @@ -3556,6 +3372,7 @@ impl TSerializable for ColumnMetaData { let mut f_12: Option = None; let mut f_13: Option> = None; let mut f_14: Option = None; + let mut f_15: Option = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -3643,6 +3460,10 @@ impl TSerializable for ColumnMetaData { let val = i_prot.read_i64()?; f_14 = Some(val); }, + 15 => { + let val = i_prot.read_i32()?; + f_15 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -3673,6 +3494,7 @@ impl TSerializable for ColumnMetaData { statistics: f_12, encoding_stats: f_13, bloom_filter_offset: f_14, + bloom_filter_length: f_15, }; Ok(ret) } @@ -3749,6 +3571,11 @@ impl TSerializable for ColumnMetaData { o_prot.write_i64(fld_var)?; o_prot.write_field_end()? } + if let Some(fld_var) = self.bloom_filter_length { + o_prot.write_field_begin(&TFieldIdentifier::new("bloom_filter_length", TType::I32, 15))?; + o_prot.write_i32(fld_var)?; + o_prot.write_field_end()? 
+ } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -3758,7 +3585,7 @@ impl TSerializable for ColumnMetaData { // EncryptionWithFooterKey // -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct EncryptionWithFooterKey { } @@ -3776,12 +3603,7 @@ impl TSerializable for EncryptionWithFooterKey { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -3796,12 +3618,6 @@ impl TSerializable for EncryptionWithFooterKey { } } -impl Default for EncryptionWithFooterKey { - fn default() -> Self { - EncryptionWithFooterKey{} - } -} - // // EncryptionWithColumnKey // @@ -3977,14 +3793,14 @@ impl TSerializable for ColumnCryptoMetaData { pub struct ColumnChunk { /// File where column data is stored. If not set, assumed to be same file as /// metadata. This path is relative to the current file. - /// + /// pub file_path: Option, /// Byte offset in file_path to the ColumnMetaData * pub file_offset: i64, /// Column metadata for this chunk. This is the same content as what is at /// file_path/file_offset. Having it here has it replicated in the file /// metadata. - /// + /// pub meta_data: Option, /// File offset of ColumnChunk's OffsetIndex * pub offset_index_offset: Option, @@ -4151,7 +3967,7 @@ impl TSerializable for ColumnChunk { pub struct RowGroup { /// Metadata for each column chunk in this row group. /// This list must have the same order as the SchemaElement list in FileMetaData. - /// + /// pub columns: Vec, /// Total byte size of all the uncompressed column data in this row group * pub total_byte_size: i64, @@ -4312,7 +4128,7 @@ impl TSerializable for RowGroup { // /// Empty struct to signal the order defined by the physical or logical type -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct TypeDefinedOrder { } @@ -4330,12 +4146,7 @@ impl TSerializable for TypeDefinedOrder { if field_ident.field_type == TType::Stop { break; } - let field_id = field_id(&field_ident)?; - match field_id { - _ => { - i_prot.skip(field_ident.field_type)?; - }, - }; + i_prot.skip(field_ident.field_type)?; i_prot.read_field_end()?; } i_prot.read_struct_end()?; @@ -4350,12 +4161,6 @@ impl TSerializable for TypeDefinedOrder { } } -impl Default for TypeDefinedOrder { - fn default() -> Self { - TypeDefinedOrder{} - } -} - // // ColumnOrder // @@ -4596,13 +4401,14 @@ pub struct ColumnIndex { /// byte\[0\], so that all lists have the same length. If false, the /// corresponding entries in min_values and max_values must be valid. pub null_pages: Vec, - /// Two lists containing lower and upper bounds for the values of each page. - /// These may be the actual minimum and maximum values found on a page, but - /// can also be (more compact) values that do not exist on a page. For - /// example, instead of storing ""Blart Versenwald III", a writer may set - /// min_values\[i\]="B", max_values\[i\]="C". Such more compact values must still - /// be valid values within the column's logical type. Readers must make sure - /// that list entries are populated before using them by inspecting null_pages. + /// Two lists containing lower and upper bounds for the values of each page + /// determined by the ColumnOrder of the column. 
These may be the actual + /// minimum and maximum values found on a page, but can also be (more compact) + /// values that do not exist on a page. For example, instead of storing ""Blart + /// Versenwald III", a writer may set min_values\[i\]="B", max_values\[i\]="C". + /// Such more compact values must still be valid values within the column's + /// logical type. Readers must make sure that list entries are populated before + /// using them by inspecting null_pages. pub min_values: Vec>, pub max_values: Vec>, /// Stores whether both min_values and max_values are ordered and if so, in @@ -4750,7 +4556,7 @@ impl TSerializable for ColumnIndex { // AesGcmV1 // -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct AesGcmV1 { /// AAD prefix * pub aad_prefix: Option>, @@ -4833,21 +4639,11 @@ impl TSerializable for AesGcmV1 { } } -impl Default for AesGcmV1 { - fn default() -> Self { - AesGcmV1{ - aad_prefix: Some(Vec::new()), - aad_file_unique: Some(Vec::new()), - supply_aad_prefix: Some(false), - } - } -} - // // AesGcmCtrV1 // -#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct AesGcmCtrV1 { /// AAD prefix * pub aad_prefix: Option>, @@ -4930,16 +4726,6 @@ impl TSerializable for AesGcmCtrV1 { } } -impl Default for AesGcmCtrV1 { - fn default() -> Self { - AesGcmCtrV1{ - aad_prefix: Some(Vec::new()), - aad_file_unique: Some(Vec::new()), - supply_aad_prefix: Some(false), - } - } -} - // // EncryptionAlgorithm // @@ -5051,19 +4837,22 @@ pub struct FileMetaData { /// String for application that wrote this file. This should be in the format /// `` version `` (build ``). /// e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) - /// + /// pub created_by: Option, - /// Sort order used for the min_value and max_value fields of each column in - /// this file. Sort orders are listed in the order matching the columns in the - /// schema. The indexes are not necessary the same though, because only leaf - /// nodes of the schema are represented in the list of sort orders. - /// - /// Without column_orders, the meaning of the min_value and max_value fields is - /// undefined. To ensure well-defined behaviour, if min_value and max_value are - /// written to a Parquet file, column_orders must be written as well. - /// - /// The obsolete min and max fields are always sorted by signed comparison - /// regardless of column_orders. + /// Sort order used for the min_value and max_value fields in the Statistics + /// objects and the min_values and max_values fields in the ColumnIndex + /// objects of each column in this file. Sort orders are listed in the order + /// matching the columns in the schema. The indexes are not necessary the same + /// though, because only leaf nodes of the schema are represented in the list + /// of sort orders. + /// + /// Without column_orders, the meaning of the min_value and max_value fields + /// in the Statistics object and the ColumnIndex object is undefined. To ensure + /// well-defined behaviour, if these fields are written to a Parquet file, + /// column_orders must be written as well. + /// + /// The obsolete min and max fields in the Statistics object are always sorted + /// by signed comparison regardless of column_orders. pub column_orders: Option>, /// Encryption algorithm. This field is set only in encrypted files /// with plaintext footer. 
Files with encrypted footer store algorithm id diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index 0c90c5405a2b..fe63e758b251 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -167,6 +167,11 @@ fn print_column_chunk_metadata( Some(bfo) => bfo.to_string(), }; writeln!(out, "bloom filter offset: {bloom_filter_offset_str}"); + let bloom_filter_length_str = match cc_metadata.bloom_filter_length() { + None => "N/A".to_owned(), + Some(bfo) => bfo.to_string(), + }; + writeln!(out, "bloom filter length: {bloom_filter_length_str}"); let offset_index_offset_str = match cc_metadata.offset_index_offset() { None => "N/A".to_owned(), Some(oio) => oio.to_string(), From 39e4d94364d3df5e3cc8662b5c6305463f089658 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Oct 2023 15:14:34 +0100 Subject: [PATCH 1248/1411] Update ring requirement from 0.16 to 0.17 in /object_store (#4887) * Update ring requirement from 0.16 to 0.17 in /object_store Updates the requirements on [ring](https://github.com/briansmith/ring) to permit the latest version. - [Commits](https://github.com/briansmith/ring/commits) --- updated-dependencies: - dependency-name: ring dependency-type: direct:production ... Signed-off-by: dependabot[bot] * Clippy --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies --- object_store/Cargo.toml | 2 +- object_store/src/gcp/credential.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index ff8047c60ca9..7928648d170f 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -51,7 +51,7 @@ serde = { version = "1.0", default-features = false, features = ["derive"], opti serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } -ring = { version = "0.16", default-features = false, features = ["std"], optional = true } +ring = { version = "0.17", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } [target.'cfg(not(target_arch = "wasm32"))'.dependencies] diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index 205b805947cc..ad21c33b8b9d 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -203,7 +203,7 @@ impl TokenProvider for OAuthProvider { let claim_str = b64_encode_obj(&claims)?; let message = [self.jwt_header.as_ref(), claim_str.as_ref()].join("."); - let mut sig_bytes = vec![0; self.key_pair.public_modulus_len()]; + let mut sig_bytes = vec![0; self.key_pair.public().modulus_len()]; self.key_pair .sign( &ring::signature::RSA_PKCS1_SHA256, From 4320a753beaee0a1a6870c59ef46b59e88c9c323 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Mon, 2 Oct 2023 09:21:51 -0600 Subject: [PATCH 1249/1411] Implement Take for UnionArray (#4883) Implement Take for UnionArray (#4883) --- arrow-select/src/take.rs | 54 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 70b80e5878dd..a546949f86e6 100644 --- 
a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -28,7 +28,7 @@ use arrow_buffer::{ ScalarBuffer, }; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{ArrowError, DataType, FieldRef}; +use arrow_schema::{ArrowError, DataType, FieldRef, UnionMode}; use num::{One, Zero}; @@ -223,6 +223,21 @@ fn take_impl( Ok(new_null_array(&DataType::Null, indices.len())) } } + DataType::Union(fields, UnionMode::Sparse) => { + let mut field_type_ids = Vec::with_capacity(fields.len()); + let mut children = Vec::with_capacity(fields.len()); + let values = values.as_any().downcast_ref::().unwrap(); + let type_ids = take_native(values.type_ids(), indices).into_inner(); + for (type_id, field) in fields.iter() { + let values = values.child(type_id); + let values = take_impl(values, indices)?; + let field = (**field).clone(); + children.push((field, values)); + field_type_ids.push(type_id); + } + let array = UnionArray::try_new(field_type_ids.as_slice(), type_ids, None, children)?; + Ok(Arc::new(array)) + } t => unimplemented!("Take not supported for data type {:?}", t) } } @@ -2013,4 +2028,41 @@ mod tests { let values = r.as_string::().iter().collect::>(); assert_eq!(&values, &[Some("foo"), None, None, None]) } + + #[test] + fn test_take_union() { + let structs = create_test_struct(vec![ + Some((Some(true), Some(42))), + Some((Some(false), Some(28))), + Some((Some(false), Some(19))), + Some((Some(true), Some(31))), + None, + ]); + let strings = + StringArray::from(vec![Some("a"), None, Some("c"), None, Some("d")]); + let type_ids = Buffer::from_slice_ref(vec![1i8; 5]); + + let children: Vec<(Field, Arc)> = vec![ + ( + Field::new("f1", structs.data_type().clone(), true), + Arc::new(structs), + ), + ( + Field::new("f2", strings.data_type().clone(), true), + Arc::new(strings), + ), + ]; + let array = UnionArray::try_new(&[0, 1], type_ids, None, children).unwrap(); + + let indices = vec![0, 3, 1, 0, 2, 4]; + let index = UInt32Array::from(indices.clone()); + let actual = take(&array, &index, None).unwrap(); + let actual = actual.as_any().downcast_ref::().unwrap(); + let strings = actual.child(1); + let strings = strings.as_any().downcast_ref::().unwrap(); + + let actual = strings.iter().collect::>(); + let expected = vec![Some("a"), None, None, Some("a"), Some("c"), Some("d")]; + assert_eq!(expected, actual); + } } From f0455d12ddcb174f1f8d2bbfd5874f7b708c9a74 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 4 Oct 2023 12:42:49 +0100 Subject: [PATCH 1250/1411] Support Parsing Avro File Headers (#4888) * Add arrow-avro * Add HeaderDecoder * Add schema parsing * Add BlockDecoder * Further docs * Apply suggestions from code review Co-authored-by: Andrew Lamb * Review feedback --------- Co-authored-by: Andrew Lamb --- .github/workflows/arrow.yml | 5 + .github/workflows/dev_pr/labeler.yml | 1 + .github/workflows/integration.yml | 1 + .github/workflows/miri.yaml | 1 + .github/workflows/parquet.yml | 1 + Cargo.toml | 1 + arrow-avro/Cargo.toml | 46 +++ arrow-avro/src/compression.rs | 32 ++ arrow-avro/src/lib.rs | 28 ++ arrow-avro/src/reader/block.rs | 141 ++++++++ arrow-avro/src/reader/header.rs | 289 ++++++++++++++++ arrow-avro/src/reader/mod.rs | 92 +++++ arrow-avro/src/reader/vlq.rs | 46 +++ arrow-avro/src/schema.rs | 484 +++++++++++++++++++++++++++ dev/release/README.md | 1 + 15 files changed, 1169 insertions(+) create mode 100644 arrow-avro/Cargo.toml create mode 100644 arrow-avro/src/compression.rs create mode 100644 
arrow-avro/src/lib.rs create mode 100644 arrow-avro/src/reader/block.rs create mode 100644 arrow-avro/src/reader/header.rs create mode 100644 arrow-avro/src/reader/mod.rs create mode 100644 arrow-avro/src/reader/vlq.rs create mode 100644 arrow-avro/src/schema.rs diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index cde931c3c6b8..da56c23b5cd9 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -39,6 +39,7 @@ on: - arrow-integration-test/** - arrow-ipc/** - arrow-json/** + - arrow-avro/** - arrow-ord/** - arrow-row/** - arrow-schema/** @@ -78,6 +79,8 @@ jobs: run: cargo test -p arrow-csv --all-features - name: Test arrow-json with all features run: cargo test -p arrow-json --all-features + - name: Test arrow-avro with all features + run: cargo test -p arrow-avro --all-features - name: Test arrow-string with all features run: cargo test -p arrow-string --all-features - name: Test arrow-ord with all features @@ -202,6 +205,8 @@ jobs: run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings - name: Clippy arrow-json with all features run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings + - name: Clippy arrow-avro with all features + run: cargo clippy -p arrow-avro --all-targets --all-features -- -D warnings - name: Clippy arrow-string with all features run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings - name: Clippy arrow-ord with all features diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index e5b86e8bcdf0..ea5873081f18 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -27,6 +27,7 @@ arrow: - arrow-integration-testing/**/* - arrow-ipc/**/* - arrow-json/**/* + - arrow-avro/**/* - arrow-ord/**/* - arrow-row/**/* - arrow-schema/**/* diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index aaf39d22bbce..eca51a80c164 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -38,6 +38,7 @@ on: - arrow-integration-testing/** - arrow-ipc/** - arrow-json/** + - arrow-avro/** - arrow-ord/** - arrow-pyarrow-integration-testing/** - arrow-schema/** diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index e3704d036aca..19b432121b6f 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -36,6 +36,7 @@ on: - arrow-data/** - arrow-ipc/** - arrow-json/** + - arrow-avro/** - arrow-schema/** - arrow-select/** - arrow-string/** diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 7a649e16b1ec..d664a0dc0730 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -40,6 +40,7 @@ on: - arrow-ipc/** - arrow-csv/** - arrow-json/** + - arrow-avro/** - parquet/** - .github/** diff --git a/Cargo.toml b/Cargo.toml index 936935ec7e3d..d874e335eeae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ members = [ "arrow", "arrow-arith", "arrow-array", + "arrow-avro", "arrow-buffer", "arrow-cast", "arrow-csv", diff --git a/arrow-avro/Cargo.toml b/arrow-avro/Cargo.toml new file mode 100644 index 000000000000..9575874c41d2 --- /dev/null +++ b/arrow-avro/Cargo.toml @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "arrow-avro" +version = { workspace = true } +description = "Support for parsing Avro format into the Arrow format" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } + +[lib] +name = "arrow_avro" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-cast = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } +serde_json = { version = "1.0", default-features = false, features = ["std"] } +serde = { version = "1.0.188", features = ["derive"] } + +[dev-dependencies] + diff --git a/arrow-avro/src/compression.rs b/arrow-avro/src/compression.rs new file mode 100644 index 000000000000..a1a44fc22b68 --- /dev/null +++ b/arrow-avro/src/compression.rs @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use serde::{Deserialize, Serialize}; + +/// The metadata key used for storing the JSON encoded [`CompressionCodec`] +pub const CODEC_METADATA_KEY: &str = "avro.codec"; + +#[derive(Debug, Copy, Clone, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum CompressionCodec { + Null, + Deflate, + BZip2, + Snappy, + XZ, + ZStandard, +} diff --git a/arrow-avro/src/lib.rs b/arrow-avro/src/lib.rs new file mode 100644 index 000000000000..e134d9d798f2 --- /dev/null +++ b/arrow-avro/src/lib.rs @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Convert data to / from the [Apache Arrow] memory format and [Apache Avro] +//! +//! [Apache Arrow]: https://arrow.apache.org +//! [Apache Avro]: https://avro.apache.org/ + +#![allow(unused)] // Temporary + +pub mod reader; +mod schema; + +mod compression; diff --git a/arrow-avro/src/reader/block.rs b/arrow-avro/src/reader/block.rs new file mode 100644 index 000000000000..479f0ef90909 --- /dev/null +++ b/arrow-avro/src/reader/block.rs @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Decoder for [`Block`] + +use crate::reader::vlq::VLQDecoder; +use arrow_schema::ArrowError; + +/// A file data block +/// +/// +#[derive(Debug, Default)] +pub struct Block { + /// The number of objects in this block + pub count: usize, + /// The serialized objects within this block + pub data: Vec, + /// The sync marker + pub sync: [u8; 16], +} + +/// A decoder for [`Block`] +#[derive(Debug)] +pub struct BlockDecoder { + state: BlockDecoderState, + in_progress: Block, + vlq_decoder: VLQDecoder, + bytes_remaining: usize, +} + +#[derive(Debug)] +enum BlockDecoderState { + Count, + Size, + Data, + Sync, + Finished, +} + +impl Default for BlockDecoder { + fn default() -> Self { + Self { + state: BlockDecoderState::Count, + in_progress: Default::default(), + vlq_decoder: Default::default(), + bytes_remaining: 0, + } + } +} + +impl BlockDecoder { + /// Parse [`Block`] from `buf`, returning the number of bytes read + /// + /// This method can be called multiple times with consecutive chunks of data, allowing + /// integration with chunked IO systems like [`BufRead::fill_buf`] + /// + /// All errors should be considered fatal, and decoding aborted + /// + /// Once an entire [`Block`] has been decoded this method will not read any further + /// input bytes, until [`Self::flush`] is called. 
Afterwards [`Self::decode`] + /// can then be used again to read the next block, if any + /// + /// [`BufRead::fill_buf`]: std::io::BufRead::fill_buf + pub fn decode(&mut self, mut buf: &[u8]) -> Result { + let max_read = buf.len(); + while !buf.is_empty() { + match self.state { + BlockDecoderState::Count => { + if let Some(c) = self.vlq_decoder.long(&mut buf) { + self.in_progress.count = c.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Block count cannot be negative, got {c}" + )) + })?; + + self.state = BlockDecoderState::Size; + } + } + BlockDecoderState::Size => { + if let Some(c) = self.vlq_decoder.long(&mut buf) { + self.bytes_remaining = c.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Block size cannot be negative, got {c}" + )) + })?; + + self.in_progress.data.reserve(self.bytes_remaining); + self.state = BlockDecoderState::Data; + } + } + BlockDecoderState::Data => { + let to_read = self.bytes_remaining.min(buf.len()); + self.in_progress.data.extend_from_slice(&buf[..to_read]); + buf = &buf[to_read..]; + self.bytes_remaining -= to_read; + if self.bytes_remaining == 0 { + self.bytes_remaining = 16; + self.state = BlockDecoderState::Sync; + } + } + BlockDecoderState::Sync => { + let to_decode = buf.len().min(self.bytes_remaining); + let write = &mut self.in_progress.sync[16 - to_decode..]; + write[..to_decode].copy_from_slice(&buf[..to_decode]); + self.bytes_remaining -= to_decode; + buf = &buf[to_decode..]; + if self.bytes_remaining == 0 { + self.state = BlockDecoderState::Finished; + } + } + BlockDecoderState::Finished => return Ok(max_read - buf.len()), + } + } + Ok(max_read) + } + + /// Flush this decoder returning the parsed [`Block`] if any + pub fn flush(&mut self) -> Option { + match self.state { + BlockDecoderState::Finished => { + self.state = BlockDecoderState::Count; + Some(std::mem::take(&mut self.in_progress)) + } + _ => None, + } + } +} diff --git a/arrow-avro/src/reader/header.rs b/arrow-avro/src/reader/header.rs new file mode 100644 index 000000000000..92db8b1dc76d --- /dev/null +++ b/arrow-avro/src/reader/header.rs @@ -0,0 +1,289 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Decoder for [`Header`] + +use crate::reader::vlq::VLQDecoder; +use crate::schema::Schema; +use arrow_schema::ArrowError; + +#[derive(Debug)] +enum HeaderDecoderState { + /// Decoding the [`MAGIC`] prefix + Magic, + /// Decoding a block count + BlockCount, + /// Decoding a block byte length + BlockLen, + /// Decoding a key length + KeyLen, + /// Decoding a key string + Key, + /// Decoding a value length + ValueLen, + /// Decoding a value payload + Value, + /// Decoding sync marker + Sync, + /// Finished decoding + Finished, +} + +/// A decoded header for an [Object Container File](https://avro.apache.org/docs/1.11.1/specification/#object-container-files) +#[derive(Debug, Clone)] +pub struct Header { + meta_offsets: Vec, + meta_buf: Vec, + sync: [u8; 16], +} + +impl Header { + /// Returns an iterator over the meta keys in this header + pub fn metadata(&self) -> impl Iterator { + let mut last = 0; + self.meta_offsets.windows(2).map(move |w| { + let start = last; + last = w[1]; + (&self.meta_buf[start..w[0]], &self.meta_buf[w[0]..w[1]]) + }) + } + + /// Returns the value for a given metadata key if present + pub fn get(&self, key: impl AsRef<[u8]>) -> Option<&[u8]> { + self.metadata() + .find_map(|(k, v)| (k == key.as_ref()).then_some(v)) + } + + /// Returns the sync token for this file + pub fn sync(&self) -> [u8; 16] { + self.sync + } +} + +/// A decoder for [`Header`] +/// +/// The avro file format does not encode the length of the header, and so it +/// is necessary to provide a push-based decoder that can be used with streams +#[derive(Debug)] +pub struct HeaderDecoder { + state: HeaderDecoderState, + vlq_decoder: VLQDecoder, + + /// The end offsets of strings in `meta_buf` + meta_offsets: Vec, + /// The raw binary data of the metadata map + meta_buf: Vec, + + /// The decoded sync marker + sync_marker: [u8; 16], + + /// The number of remaining tuples in the current block + tuples_remaining: usize, + /// The number of bytes remaining in the current string/bytes payload + bytes_remaining: usize, +} + +impl Default for HeaderDecoder { + fn default() -> Self { + Self { + state: HeaderDecoderState::Magic, + meta_offsets: vec![], + meta_buf: vec![], + sync_marker: [0; 16], + vlq_decoder: Default::default(), + tuples_remaining: 0, + bytes_remaining: MAGIC.len(), + } + } +} + +const MAGIC: &[u8; 4] = b"Obj\x01"; + +impl HeaderDecoder { + /// Parse [`Header`] from `buf`, returning the number of bytes read + /// + /// This method can be called multiple times with consecutive chunks of data, allowing + /// integration with chunked IO systems like [`BufRead::fill_buf`] + /// + /// All errors should be considered fatal, and decoding aborted + /// + /// Once the entire [`Header`] has been decoded this method will not read any further + /// input bytes, and the header can be obtained with [`Self::flush`] + /// + /// [`BufRead::fill_buf`]: std::io::BufRead::fill_buf + pub fn decode(&mut self, mut buf: &[u8]) -> Result { + let max_read = buf.len(); + while !buf.is_empty() { + match self.state { + HeaderDecoderState::Magic => { + let remaining = &MAGIC[MAGIC.len() - self.bytes_remaining..]; + let to_decode = buf.len().min(remaining.len()); + if !buf.starts_with(&remaining[..to_decode]) { + return Err(ArrowError::ParseError( + "Incorrect avro magic".to_string(), + )); + } + self.bytes_remaining -= to_decode; + buf = &buf[to_decode..]; + if self.bytes_remaining == 0 { + self.state = HeaderDecoderState::BlockCount; + } + } + HeaderDecoderState::BlockCount => { + if let Some(block_count) = 
self.vlq_decoder.long(&mut buf) { + match block_count.try_into() { + Ok(0) => { + self.state = HeaderDecoderState::Sync; + self.bytes_remaining = 16; + } + Ok(remaining) => { + self.tuples_remaining = remaining; + self.state = HeaderDecoderState::KeyLen; + } + Err(_) => { + self.tuples_remaining = block_count.unsigned_abs() as _; + self.state = HeaderDecoderState::BlockLen; + } + } + } + } + HeaderDecoderState::BlockLen => { + if self.vlq_decoder.long(&mut buf).is_some() { + self.state = HeaderDecoderState::KeyLen + } + } + HeaderDecoderState::Key => { + let to_read = self.bytes_remaining.min(buf.len()); + self.meta_buf.extend_from_slice(&buf[..to_read]); + self.bytes_remaining -= to_read; + buf = &buf[to_read..]; + if self.bytes_remaining == 0 { + self.meta_offsets.push(self.meta_buf.len()); + self.state = HeaderDecoderState::ValueLen; + } + } + HeaderDecoderState::Value => { + let to_read = self.bytes_remaining.min(buf.len()); + self.meta_buf.extend_from_slice(&buf[..to_read]); + self.bytes_remaining -= to_read; + buf = &buf[to_read..]; + if self.bytes_remaining == 0 { + self.meta_offsets.push(self.meta_buf.len()); + + self.tuples_remaining -= 1; + match self.tuples_remaining { + 0 => self.state = HeaderDecoderState::BlockCount, + _ => self.state = HeaderDecoderState::KeyLen, + } + } + } + HeaderDecoderState::KeyLen => { + if let Some(len) = self.vlq_decoder.long(&mut buf) { + self.bytes_remaining = len as _; + self.state = HeaderDecoderState::Key; + } + } + HeaderDecoderState::ValueLen => { + if let Some(len) = self.vlq_decoder.long(&mut buf) { + self.bytes_remaining = len as _; + self.state = HeaderDecoderState::Value; + } + } + HeaderDecoderState::Sync => { + let to_decode = buf.len().min(self.bytes_remaining); + let write = &mut self.sync_marker[16 - to_decode..]; + write[..to_decode].copy_from_slice(&buf[..to_decode]); + self.bytes_remaining -= to_decode; + buf = &buf[to_decode..]; + if self.bytes_remaining == 0 { + self.state = HeaderDecoderState::Finished; + } + } + HeaderDecoderState::Finished => return Ok(max_read - buf.len()), + } + } + Ok(max_read) + } + + /// Flush this decoder returning the parsed [`Header`] if any + pub fn flush(&mut self) -> Option
<Header>
{ + match self.state { + HeaderDecoderState::Finished => { + self.state = HeaderDecoderState::Magic; + Some(Header { + meta_offsets: std::mem::take(&mut self.meta_offsets), + meta_buf: std::mem::take(&mut self.meta_buf), + sync: self.sync_marker, + }) + } + _ => None, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::reader::read_header; + use crate::schema::SCHEMA_METADATA_KEY; + use std::fs::File; + use std::io::{BufRead, BufReader}; + + #[test] + fn test_header_decode() { + let mut decoder = HeaderDecoder::default(); + for m in MAGIC { + decoder.decode(std::slice::from_ref(m)).unwrap(); + } + + let mut decoder = HeaderDecoder::default(); + assert_eq!(decoder.decode(MAGIC).unwrap(), 4); + + let mut decoder = HeaderDecoder::default(); + decoder.decode(b"Ob").unwrap(); + let err = decoder.decode(b"s").unwrap_err().to_string(); + assert_eq!(err, "Parser error: Incorrect avro magic"); + } + + fn decode_file(file: &str) -> Header { + let file = File::open(file).unwrap(); + read_header(BufReader::with_capacity(100, file)).unwrap() + } + + #[test] + fn test_header() { + let header = decode_file("../testing/data/avro/alltypes_plain.avro"); + let schema_json = header.get(SCHEMA_METADATA_KEY).unwrap(); + let expected = br#"{"type":"record","name":"topLevelRecord","fields":[{"name":"id","type":["int","null"]},{"name":"bool_col","type":["boolean","null"]},{"name":"tinyint_col","type":["int","null"]},{"name":"smallint_col","type":["int","null"]},{"name":"int_col","type":["int","null"]},{"name":"bigint_col","type":["long","null"]},{"name":"float_col","type":["float","null"]},{"name":"double_col","type":["double","null"]},{"name":"date_string_col","type":["bytes","null"]},{"name":"string_col","type":["bytes","null"]},{"name":"timestamp_col","type":[{"type":"long","logicalType":"timestamp-micros"},"null"]}]}"#; + assert_eq!(schema_json, expected); + let _schema: Schema<'_> = serde_json::from_slice(schema_json).unwrap(); + assert_eq!( + u128::from_le_bytes(header.sync()), + 226966037233754408753420635932530907102 + ); + + let header = decode_file("../testing/data/avro/fixed_length_decimal.avro"); + let schema_json = header.get(SCHEMA_METADATA_KEY).unwrap(); + let expected = br#"{"type":"record","name":"topLevelRecord","fields":[{"name":"value","type":[{"type":"fixed","name":"fixed","namespace":"topLevelRecord.value","size":11,"logicalType":"decimal","precision":25,"scale":2},"null"]}]}"#; + assert_eq!(schema_json, expected); + let _schema: Schema<'_> = serde_json::from_slice(schema_json).unwrap(); + assert_eq!( + u128::from_le_bytes(header.sync()), + 325166208089902833952788552656412487328 + ); + } +} diff --git a/arrow-avro/src/reader/mod.rs b/arrow-avro/src/reader/mod.rs new file mode 100644 index 000000000000..a42011e3b2ad --- /dev/null +++ b/arrow-avro/src/reader/mod.rs @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Read Avro data to Arrow + +use crate::reader::block::{Block, BlockDecoder}; +use crate::reader::header::{Header, HeaderDecoder}; +use arrow_schema::ArrowError; +use std::io::BufRead; + +mod header; + +mod block; + +mod vlq; + +/// Read a [`Header`] from the provided [`BufRead`] +fn read_header(mut reader: R) -> Result { + let mut decoder = HeaderDecoder::default(); + loop { + let buf = reader.fill_buf()?; + if buf.is_empty() { + break; + } + let read = buf.len(); + let decoded = decoder.decode(buf)?; + reader.consume(decoded); + if decoded != read { + break; + } + } + + decoder + .flush() + .ok_or_else(|| ArrowError::ParseError("Unexpected EOF".to_string())) +} + +/// Return an iterator of [`Block`] from the provided [`BufRead`] +fn read_blocks( + mut reader: R, +) -> impl Iterator> { + let mut decoder = BlockDecoder::default(); + + let mut try_next = move || { + loop { + let buf = reader.fill_buf()?; + if buf.is_empty() { + break; + } + let read = buf.len(); + let decoded = decoder.decode(buf)?; + reader.consume(decoded); + if decoded != read { + break; + } + } + Ok(decoder.flush()) + }; + std::iter::from_fn(move || try_next().transpose()) +} + +#[cfg(test)] +mod test { + use crate::reader::{read_blocks, read_header}; + use std::fs::File; + use std::io::BufReader; + + #[test] + fn test_mux() { + let file = File::open("../testing/data/avro/alltypes_plain.avro").unwrap(); + let mut reader = BufReader::new(file); + let header = read_header(&mut reader).unwrap(); + for result in read_blocks(reader) { + let block = result.unwrap(); + assert_eq!(block.sync, header.sync()); + } + } +} diff --git a/arrow-avro/src/reader/vlq.rs b/arrow-avro/src/reader/vlq.rs new file mode 100644 index 000000000000..80f1c60eec7d --- /dev/null +++ b/arrow-avro/src/reader/vlq.rs @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
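
Aside on the arrow-avro reader above: both decoders are push-based, so they can be fed whatever `BufRead::fill_buf` happens to return and flushed once a complete header or block has been seen (block counts and sizes are zig-zag varints handled by `VLQDecoder`). The sketch below mirrors the tests; note that `read_header` and `read_blocks` are still crate-private at this point, so it only works inside arrow-avro, and the file path handling is illustrative.

use std::fs::File;
use std::io::BufReader;

use crate::reader::{read_blocks, read_header};
use crate::schema::{Schema, SCHEMA_METADATA_KEY};

/// Print the block layout of an Avro object container file.
fn dump_avro(path: &str) {
    let mut reader = BufReader::new(File::open(path).unwrap());

    // Decode the magic, the metadata map and the sync marker.
    let header = read_header(&mut reader).unwrap();
    let json = header.get(SCHEMA_METADATA_KEY).unwrap();
    let _schema: Schema<'_> = serde_json::from_slice(json).unwrap();

    // Everything after the header is a sequence of data blocks, each ending
    // with the 16-byte sync marker declared in the header.
    for block in read_blocks(reader) {
        let block = block.unwrap();
        assert_eq!(block.sync, header.sync());
        println!("block: {} objects, {} bytes", block.count, block.data.len());
    }
}
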
+ +/// Decoder for zig-zag encoded variable length (VLW) integers +/// +/// See also: +/// +/// +#[derive(Debug, Default)] +pub struct VLQDecoder { + /// Scratch space for decoding VLQ integers + in_progress: u64, + shift: u32, +} + +impl VLQDecoder { + /// Decode a signed long from `buf` + pub fn long(&mut self, buf: &mut &[u8]) -> Option { + while let Some(byte) = buf.first().copied() { + *buf = &buf[1..]; + self.in_progress |= ((byte & 0x7F) as u64) << self.shift; + self.shift += 7; + if byte & 0x80 == 0 { + let val = self.in_progress; + self.in_progress = 0; + self.shift = 0; + return Some((val >> 1) as i64 ^ -((val & 1) as i64)); + } + } + None + } +} diff --git a/arrow-avro/src/schema.rs b/arrow-avro/src/schema.rs new file mode 100644 index 000000000000..839ba65bd5fc --- /dev/null +++ b/arrow-avro/src/schema.rs @@ -0,0 +1,484 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +/// The metadata key used for storing the JSON encoded [`Schema`] +pub const SCHEMA_METADATA_KEY: &str = "avro.schema"; + +/// Either a [`PrimitiveType`] or a reference to a previously defined named type +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(untagged)] +pub enum TypeName<'a> { + Primitive(PrimitiveType), + Ref(&'a str), +} + +/// A primitive type +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum PrimitiveType { + Null, + Boolean, + Int, + Long, + Float, + Double, + Bytes, + String, +} + +/// Additional attributes within a [`Schema`] +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Default, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct Attributes<'a> { + /// A logical type name + /// + /// + #[serde(default)] + pub logical_type: Option<&'a str>, + + /// Additional JSON attributes + #[serde(flatten)] + pub additional: HashMap<&'a str, serde_json::Value>, +} + +/// A type definition that is not a variant of [`ComplexType`] +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct Type<'a> { + #[serde(borrow)] + pub r#type: TypeName<'a>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// An Avro schema +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(untagged)] +pub enum Schema<'a> { + #[serde(borrow)] + TypeName(TypeName<'a>), + #[serde(borrow)] + Union(Vec>), + #[serde(borrow)] + Complex(ComplexType<'a>), + #[serde(borrow)] + Type(Type<'a>), +} + +/// A complex type +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "camelCase")] +pub enum ComplexType<'a> { + #[serde(borrow)] + Union(Vec>), + 
#[serde(borrow)] + Record(Record<'a>), + #[serde(borrow)] + Enum(Enum<'a>), + #[serde(borrow)] + Array(Array<'a>), + #[serde(borrow)] + Map(Map<'a>), + #[serde(borrow)] + Fixed(Fixed<'a>), +} + +/// A record +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Record<'a> { + #[serde(borrow)] + pub name: &'a str, + #[serde(borrow, default)] + pub namespace: Option<&'a str>, + #[serde(borrow, default)] + pub doc: Option<&'a str>, + #[serde(borrow, default)] + pub aliases: Vec<&'a str>, + #[serde(borrow)] + pub fields: Vec>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// A field within a [`Record`] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Field<'a> { + #[serde(borrow)] + pub name: &'a str, + #[serde(borrow, default)] + pub doc: Option<&'a str>, + #[serde(borrow)] + pub r#type: Schema<'a>, + #[serde(borrow, default)] + pub default: Option<&'a str>, +} + +/// An enumeration +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Enum<'a> { + #[serde(borrow)] + pub name: &'a str, + #[serde(borrow, default)] + pub namespace: Option<&'a str>, + #[serde(borrow, default)] + pub doc: Option<&'a str>, + #[serde(borrow, default)] + pub aliases: Vec<&'a str>, + #[serde(borrow)] + pub symbols: Vec<&'a str>, + #[serde(borrow, default)] + pub default: Option<&'a str>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// An array +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Array<'a> { + #[serde(borrow)] + pub items: Box>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// A map +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Map<'a> { + #[serde(borrow)] + pub values: Box>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// A fixed length binary array +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Fixed<'a> { + #[serde(borrow)] + pub name: &'a str, + #[serde(borrow, default)] + pub namespace: Option<&'a str>, + #[serde(borrow, default)] + pub aliases: Vec<&'a str>, + pub size: usize, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + #[test] + fn test_deserialize() { + let t: Schema = serde_json::from_str("\"string\"").unwrap(); + assert_eq!( + t, + Schema::TypeName(TypeName::Primitive(PrimitiveType::String)) + ); + + let t: Schema = serde_json::from_str("[\"int\", \"null\"]").unwrap(); + assert_eq!( + t, + Schema::Union(vec![ + Schema::TypeName(TypeName::Primitive(PrimitiveType::Int)), + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + ]) + ); + + let t: Type = serde_json::from_str( + r#"{ + "type":"long", + "logicalType":"timestamp-micros" + }"#, + ) + .unwrap(); + + let timestamp = Type { + r#type: TypeName::Primitive(PrimitiveType::Long), + attributes: Attributes { + logical_type: Some("timestamp-micros"), + additional: Default::default(), + }, + }; + + assert_eq!(t, timestamp); + + let t: ComplexType = serde_json::from_str( + r#"{ + "type":"fixed", + "name":"fixed", + "namespace":"topLevelRecord.value", + "size":11, + "logicalType":"decimal", + "precision":25, + "scale":2 + }"#, + ) + .unwrap(); + + let decimal = ComplexType::Fixed(Fixed { + name: "fixed", + namespace: Some("topLevelRecord.value"), + aliases: vec![], + size: 11, + attributes: Attributes { + logical_type: Some("decimal"), + additional: vec![("precision", json!(25)), ("scale", 
json!(2))] + .into_iter() + .collect(), + }, + }); + + assert_eq!(t, decimal); + + let schema: Schema = serde_json::from_str( + r#"{ + "type":"record", + "name":"topLevelRecord", + "fields":[ + { + "name":"value", + "type":[ + { + "type":"fixed", + "name":"fixed", + "namespace":"topLevelRecord.value", + "size":11, + "logicalType":"decimal", + "precision":25, + "scale":2 + }, + "null" + ] + } + ] + }"#, + ) + .unwrap(); + + assert_eq!( + schema, + Schema::Complex(ComplexType::Record(Record { + name: "topLevelRecord", + namespace: None, + doc: None, + aliases: vec![], + fields: vec![Field { + name: "value", + doc: None, + r#type: Schema::Union(vec![ + Schema::Complex(decimal), + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + ]), + default: None, + },], + attributes: Default::default(), + })) + ); + + let schema: Schema = serde_json::from_str( + r#"{ + "type": "record", + "name": "LongList", + "aliases": ["LinkedLongs"], + "fields" : [ + {"name": "value", "type": "long"}, + {"name": "next", "type": ["null", "LongList"]} + ] + }"#, + ) + .unwrap(); + + assert_eq!( + schema, + Schema::Complex(ComplexType::Record(Record { + name: "LongList", + namespace: None, + doc: None, + aliases: vec!["LinkedLongs"], + fields: vec![ + Field { + name: "value", + doc: None, + r#type: Schema::TypeName(TypeName::Primitive( + PrimitiveType::Long + )), + default: None, + }, + Field { + name: "next", + doc: None, + r#type: Schema::Union(vec![ + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + Schema::TypeName(TypeName::Ref("LongList")), + ]), + default: None, + } + ], + attributes: Attributes::default(), + })) + ); + + let schema: Schema = serde_json::from_str( + r#"{ + "type":"record", + "name":"topLevelRecord", + "fields":[ + { + "name":"id", + "type":[ + "int", + "null" + ] + }, + { + "name":"timestamp_col", + "type":[ + { + "type":"long", + "logicalType":"timestamp-micros" + }, + "null" + ] + } + ] + }"#, + ) + .unwrap(); + + assert_eq!( + schema, + Schema::Complex(ComplexType::Record(Record { + name: "topLevelRecord", + namespace: None, + doc: None, + aliases: vec![], + fields: vec![ + Field { + name: "id", + doc: None, + r#type: Schema::Union(vec![ + Schema::TypeName(TypeName::Primitive(PrimitiveType::Int)), + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + ]), + default: None, + }, + Field { + name: "timestamp_col", + doc: None, + r#type: Schema::Union(vec![ + Schema::Type(timestamp), + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + ]), + default: None, + } + ], + attributes: Default::default(), + })) + ); + + let schema: Schema = serde_json::from_str( + r#"{ + "type": "record", + "name": "HandshakeRequest", "namespace":"org.apache.avro.ipc", + "fields": [ + {"name": "clientHash", + "type": {"type": "fixed", "name": "MD5", "size": 16}}, + {"name": "clientProtocol", "type": ["null", "string"]}, + {"name": "serverHash", "type": "MD5"}, + {"name": "meta", "type": ["null", {"type": "map", "values": "bytes"}]} + ] + }"#, + ) + .unwrap(); + + assert_eq!( + schema, + Schema::Complex(ComplexType::Record(Record { + name: "HandshakeRequest", + namespace: Some("org.apache.avro.ipc"), + doc: None, + aliases: vec![], + fields: vec![ + Field { + name: "clientHash", + doc: None, + r#type: Schema::Complex(ComplexType::Fixed(Fixed { + name: "MD5", + namespace: None, + aliases: vec![], + size: 16, + attributes: Default::default(), + })), + default: None, + }, + Field { + name: "clientProtocol", + doc: None, + r#type: Schema::Union(vec![ + 
Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + Schema::TypeName(TypeName::Primitive(PrimitiveType::String)), + ]), + default: None, + }, + Field { + name: "serverHash", + doc: None, + r#type: Schema::TypeName(TypeName::Ref("MD5")), + default: None, + }, + Field { + name: "meta", + doc: None, + r#type: Schema::Union(vec![ + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + Schema::Complex(ComplexType::Map(Map { + values: Box::new(Schema::TypeName(TypeName::Primitive( + PrimitiveType::Bytes + ))), + attributes: Default::default(), + })), + ]), + default: None, + } + ], + attributes: Default::default(), + })) + ); + } +} diff --git a/dev/release/README.md b/dev/release/README.md index 30b3a4a8a569..177f33bcbb4d 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -258,6 +258,7 @@ Rust Arrow Crates: (cd arrow-ipc && cargo publish) (cd arrow-csv && cargo publish) (cd arrow-json && cargo publish) +(cd arrow-avro && cargo publish) (cd arrow-ord && cargo publish) (cd arrow-arith && cargo publish) (cd arrow-string && cargo publish) From 97a82c01d4e6d2e2ccada3452f4790fb4c688472 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 4 Oct 2023 21:21:20 +0100 Subject: [PATCH 1251/1411] Fix integration tests (#4889) --- .github/workflows/integration.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index eca51a80c164..62d2d2cb1a06 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -88,6 +88,10 @@ jobs: with: path: rust fetch-depth: 0 + - name: Install pythonnet + run: conda run --no-capture-output pip install pythonnet + - name: Install archery + run: conda run --no-capture-output pip install -e dev/archery[integration] - name: Make build directory run: mkdir /build - name: Build Rust @@ -102,8 +106,6 @@ jobs: run: conda run --no-capture-output ci/scripts/java_build.sh $PWD /build - name: Build JS run: conda run --no-capture-output ci/scripts/js_build.sh $PWD /build - - name: Install archery - run: conda run --no-capture-output pip install -e dev/archery - name: Run integration tests run: | conda run --no-capture-output archery integration \ From 208da03979b2903c3182c20ef382b2895756380a Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Thu, 5 Oct 2023 06:20:17 -0400 Subject: [PATCH 1252/1411] mark OnCloseRowGroup Send (#4893) --- parquet/src/file/writer.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index af25cc9689c1..d723158de9f4 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -115,7 +115,8 @@ pub type OnCloseRowGroup<'a> = Box< Vec>, Vec>, ) -> Result<()> - + 'a, + + 'a + + Send, >; // ---------------------------------------------------------------------- From 2214fda3a918bb31e7d1eed3ab2799d660bb138a Mon Sep 17 00:00:00 2001 From: Kamil Skalski Date: Thu, 5 Oct 2023 17:44:09 +0200 Subject: [PATCH 1253/1411] fix(arrow-json)!: include null fields in schema inference with a type of Null (#4894) * Treat json null as new field of InferredType::Any * Fix clippy. 
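
To make the effect of the schema-inference change below concrete: a JSON key that only ever holds `null` is now kept in the inferred schema with `DataType::Null` (and stays nullable) instead of being silently dropped. A minimal sketch, assuming the public `infer_json_schema_from_seekable` entry point and made-up sample rows:

use std::io::Cursor;

use arrow_json::reader::infer_json_schema_from_seekable;
use arrow_schema::DataType;

fn main() {
    // "note" never has a non-null value in the sampled records.
    let data = r#"
    {"id": 1, "note": null}
    {"id": 2, "note": null}
    "#;
    let schema = infer_json_schema_from_seekable(Cursor::new(data), None).unwrap();

    let note = schema.field_with_name("note").unwrap();
    assert_eq!(note.data_type(), &DataType::Null);
    assert!(note.is_nullable());
}
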
--- arrow-json/src/reader/schema.rs | 137 +++++++++++++++----------------- 1 file changed, 62 insertions(+), 75 deletions(-) diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs index c8250ac37716..126a85df3931 100644 --- a/arrow-json/src/reader/schema.rs +++ b/arrow-json/src/reader/schema.rs @@ -72,6 +72,15 @@ impl InferredType { Ok(()) } + + fn is_none_or_any(ty: Option<&Self>) -> bool { + matches!(ty, Some(Self::Any) | None) + } +} + +/// Shorthand for building list data type of `ty` +fn list_type_of(ty: DataType) -> DataType { + DataType::List(Arc::new(Field::new("item", ty, true))) } /// Coerce data type during inference @@ -84,23 +93,18 @@ fn coerce_data_type(dt: Vec<&DataType>) -> DataType { let dt_init = dt_iter.next().unwrap_or(DataType::Utf8); dt_iter.fold(dt_init, |l, r| match (l, r) { + (DataType::Null, o) | (o, DataType::Null) => o, (DataType::Boolean, DataType::Boolean) => DataType::Boolean, (DataType::Int64, DataType::Int64) => DataType::Int64, (DataType::Float64, DataType::Float64) | (DataType::Float64, DataType::Int64) | (DataType::Int64, DataType::Float64) => DataType::Float64, - (DataType::List(l), DataType::List(r)) => DataType::List(Arc::new(Field::new( - "item", - coerce_data_type(vec![l.data_type(), r.data_type()]), - true, - ))), + (DataType::List(l), DataType::List(r)) => { + list_type_of(coerce_data_type(vec![l.data_type(), r.data_type()])) + } // coerce scalar and scalar array into scalar array (DataType::List(e), not_list) | (not_list, DataType::List(e)) => { - DataType::List(Arc::new(Field::new( - "item", - coerce_data_type(vec![e.data_type(), ¬_list]), - true, - ))) + list_type_of(coerce_data_type(vec![e.data_type(), ¬_list])) } _ => DataType::Utf8, }) @@ -110,11 +114,7 @@ fn generate_datatype(t: &InferredType) -> Result { Ok(match t { InferredType::Scalar(hs) => coerce_data_type(hs.iter().collect()), InferredType::Object(spec) => DataType::Struct(generate_fields(spec)?), - InferredType::Array(ele_type) => DataType::List(Arc::new(Field::new( - "item", - generate_datatype(ele_type)?, - true, - ))), + InferredType::Array(ele_type) => list_type_of(generate_datatype(ele_type)?), InferredType::Any => DataType::Null, }) } @@ -277,7 +277,7 @@ fn set_object_scalar_field_type( key: &str, ftype: DataType, ) -> Result<(), ArrowError> { - if !field_types.contains_key(key) { + if InferredType::is_none_or_any(field_types.get(key)) { field_types.insert(key.to_string(), InferredType::Scalar(HashSet::new())); } @@ -388,7 +388,7 @@ fn collect_field_types_from_object( Value::Array(array) => { let ele_type = infer_array_element_type(array)?; - if !field_types.contains_key(k) { + if InferredType::is_none_or_any(field_types.get(k)) { match ele_type { InferredType::Scalar(_) => { field_types.insert( @@ -438,8 +438,11 @@ fn collect_field_types_from_object( set_object_scalar_field_type(field_types, k, DataType::Boolean)?; } Value::Null => { - // do nothing, we treat json as nullable by default when - // inferring + // we treat json as nullable by default when inferring, so just + // mark existence of a field if it wasn't known before + if !field_types.contains_key(k) { + field_types.insert(k.to_string(), InferredType::Any); + } } Value::Number(n) => { if n.is_i64() { @@ -520,21 +523,9 @@ mod tests { fn test_json_infer_schema() { let schema = Schema::new(vec![ Field::new("a", DataType::Int64, true), - Field::new( - "b", - DataType::List(Arc::new(Field::new("item", DataType::Float64, true))), - true, - ), - Field::new( - "c", - 
DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))), - true, - ), - Field::new( - "d", - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), - true, - ), + Field::new("b", list_type_of(DataType::Float64), true), + Field::new("c", list_type_of(DataType::Boolean), true), + Field::new("d", list_type_of(DataType::Utf8), true), ]); let mut reader = @@ -589,22 +580,18 @@ mod tests { let schema = Schema::new(vec![ Field::new( "c1", - DataType::List(Arc::new(Field::new( - "item", - DataType::Struct(Fields::from(vec![ - Field::new("a", DataType::Utf8, true), - Field::new("b", DataType::Int64, true), - Field::new("c", DataType::Boolean, true), - ])), - true, - ))), + list_type_of(DataType::Struct(Fields::from(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Int64, true), + Field::new("c", DataType::Boolean, true), + ]))), true, ), Field::new("c2", DataType::Float64, true), Field::new( "c3", // empty json array's inner types are inferred as null - DataType::List(Arc::new(Field::new("item", DataType::Null, true))), + list_type_of(DataType::Null), true, ), ]); @@ -629,15 +616,7 @@ mod tests { #[test] fn test_json_infer_schema_nested_list() { let schema = Schema::new(vec![ - Field::new( - "c1", - DataType::List(Arc::new(Field::new( - "item", - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), - true, - ))), - true, - ), + Field::new("c1", list_type_of(list_type_of(DataType::Utf8)), true), Field::new("c2", DataType::Float64, true), ]); @@ -682,36 +661,22 @@ mod tests { #[test] fn test_coercion_scalar_and_list() { - use arrow_schema::DataType::*; - assert_eq!( - List(Arc::new(Field::new("item", Float64, true))), - coerce_data_type(vec![ - &Float64, - &List(Arc::new(Field::new("item", Float64, true))) - ]) + list_type_of(DataType::Float64), + coerce_data_type(vec![&DataType::Float64, &list_type_of(DataType::Float64)]) ); assert_eq!( - List(Arc::new(Field::new("item", Float64, true))), - coerce_data_type(vec![ - &Float64, - &List(Arc::new(Field::new("item", Int64, true))) - ]) + list_type_of(DataType::Float64), + coerce_data_type(vec![&DataType::Float64, &list_type_of(DataType::Int64)]) ); assert_eq!( - List(Arc::new(Field::new("item", Int64, true))), - coerce_data_type(vec![ - &Int64, - &List(Arc::new(Field::new("item", Int64, true))) - ]) + list_type_of(DataType::Int64), + coerce_data_type(vec![&DataType::Int64, &list_type_of(DataType::Int64)]) ); // boolean and number are incompatible, return utf8 assert_eq!( - List(Arc::new(Field::new("item", Utf8, true))), - coerce_data_type(vec![ - &Boolean, - &List(Arc::new(Field::new("item", Float64, true))) - ]) + list_type_of(DataType::Utf8), + coerce_data_type(vec![&DataType::Boolean, &list_type_of(DataType::Float64)]) ); } @@ -723,4 +688,26 @@ mod tests { "Json error: Not valid JSON: expected value at line 1 column 1", ); } + + #[test] + fn test_null_field_inferred_as_null() { + let data = r#" + {"in":1, "ni":null, "ns":null, "sn":"4", "n":null, "an":[], "na": null, "nas":null} + {"in":null, "ni":2, "ns":"3", "sn":null, "n":null, "an":null, "na": [], "nas":["8"]} + {"in":1, "ni":null, "ns":null, "sn":"4", "n":null, "an":[], "na": null, "nas":[]} + "#; + let inferred_schema = + infer_json_schema_from_seekable(Cursor::new(data), None).expect("infer"); + let schema = Schema::new(vec![ + Field::new("an", list_type_of(DataType::Null), true), + Field::new("in", DataType::Int64, true), + Field::new("n", DataType::Null, true), + Field::new("na", list_type_of(DataType::Null), true), + 
Field::new("nas", list_type_of(DataType::Utf8), true), + Field::new("ni", DataType::Int64, true), + Field::new("ns", DataType::Utf8, true), + Field::new("sn", DataType::Utf8, true), + ]); + assert_eq!(inferred_schema, schema); + } } From 2c4bc5449fc4432ecb2f9963994ac8997b64b52e Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" <193874+carols10cents@users.noreply.github.com> Date: Fri, 6 Oct 2023 09:55:38 -0400 Subject: [PATCH 1254/1411] Upgrade to Rust 1.73.0 (#4899) * fix: Call Ord's implementation from PartialOrd so they stay in sync As recommended by Clippy in Rust 1.73.0 * fix: Use or_default methods instead of or_else(default) As recommended by Clippy in Rust 1.73.0 * fix: Use filter then map with bools instead of filter_map then As recommended by Clippy in Rust 1.73.0 * fix: Change a match guard to a pattern As recommended by Clippy in Rust 1.73.0 * fix: Change to a different kind of filter_map Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-row/src/lib.rs | 4 +-- object_store/src/azure/client.rs | 2 +- object_store/src/azure/credential.rs | 8 ++--- parquet/src/arrow/async_reader/mod.rs | 45 +++++++++++++-------------- parquet/src/arrow/buffer/bit_util.rs | 2 +- parquet/src/arrow/schema/primitive.rs | 4 +-- parquet/src/file/properties.rs | 4 +-- 7 files changed, 32 insertions(+), 37 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 58dc42a4cacb..1fb4e1de7ac2 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -989,7 +989,7 @@ impl<'a> Eq for Row<'a> {} impl<'a> PartialOrd for Row<'a> { #[inline] fn partial_cmp(&self, other: &Self) -> Option { - self.data.partial_cmp(other.data) + Some(self.cmp(other)) } } @@ -1049,7 +1049,7 @@ impl Eq for OwnedRow {} impl PartialOrd for OwnedRow { #[inline] fn partial_cmp(&self, other: &Self) -> Option { - self.row().partial_cmp(&other.row()) + Some(self.cmp(other)) } } diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index e18135c2c77c..cd1a3a10fcc7 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -372,7 +372,7 @@ struct ListResultInternal { } fn to_list_result(value: ListResultInternal, prefix: Option<&str>) -> Result { - let prefix = prefix.map(Path::from).unwrap_or_else(Path::default); + let prefix = prefix.map(Path::from).unwrap_or_default(); let common_prefixes = value .blobs .blob_prefix diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index fd75389249b0..8dc61365fa6e 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -234,11 +234,9 @@ fn string_to_sign(h: &HeaderMap, u: &Url, method: &Method, account: &str) -> Str fn canonicalize_header(headers: &HeaderMap) -> String { let mut names = headers .iter() - .filter_map(|(k, _)| { - (k.as_str().starts_with("x-ms")) - // TODO remove unwraps - .then(|| (k.as_str(), headers.get(k).unwrap().to_str().unwrap())) - }) + .filter(|&(k, _)| (k.as_str().starts_with("x-ms"))) + // TODO remove unwraps + .map(|(k, _)| (k.as_str(), headers.get(k).unwrap().to_str().unwrap())) .collect::>(); names.sort_unstable(); diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 7d30580ece93..c749d4deeb16 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -625,27 +625,27 @@ impl<'a> InMemoryRowGroup<'a> 
{ .iter() .zip(self.metadata.columns()) .enumerate() - .filter_map(|(idx, (chunk, chunk_meta))| { - (chunk.is_none() && projection.leaf_included(idx)).then(|| { - // If the first page does not start at the beginning of the column, - // then we need to also fetch a dictionary page. - let mut ranges = vec![]; - let (start, _len) = chunk_meta.byte_range(); - match page_locations[idx].first() { - Some(first) if first.offset as u64 != start => { - ranges.push(start as usize..first.offset as usize); - } - _ => (), + .filter(|&(idx, (chunk, _chunk_meta))| { + chunk.is_none() && projection.leaf_included(idx) + }) + .flat_map(|(idx, (_chunk, chunk_meta))| { + // If the first page does not start at the beginning of the column, + // then we need to also fetch a dictionary page. + let mut ranges = vec![]; + let (start, _len) = chunk_meta.byte_range(); + match page_locations[idx].first() { + Some(first) if first.offset as u64 != start => { + ranges.push(start as usize..first.offset as usize); } + _ => (), + } - ranges.extend(selection.scan_ranges(&page_locations[idx])); - page_start_offsets - .push(ranges.iter().map(|range| range.start).collect()); + ranges.extend(selection.scan_ranges(&page_locations[idx])); + page_start_offsets + .push(ranges.iter().map(|range| range.start).collect()); - ranges - }) + ranges }) - .flatten() .collect(); let mut chunk_data = input.get_byte_ranges(fetch_ranges).await?.into_iter(); @@ -673,12 +673,11 @@ impl<'a> InMemoryRowGroup<'a> { .column_chunks .iter() .enumerate() - .filter_map(|(idx, chunk)| { - (chunk.is_none() && projection.leaf_included(idx)).then(|| { - let column = self.metadata.column(idx); - let (start, length) = column.byte_range(); - start as usize..(start + length) as usize - }) + .filter(|&(idx, chunk)| chunk.is_none() && projection.leaf_included(idx)) + .map(|(idx, _chunk)| { + let column = self.metadata.column(idx); + let (start, length) = column.byte_range(); + start as usize..(start + length) as usize }) .collect(); diff --git a/parquet/src/arrow/buffer/bit_util.rs b/parquet/src/arrow/buffer/bit_util.rs index d01556d24e30..b8e2e2f539d3 100644 --- a/parquet/src/arrow/buffer/bit_util.rs +++ b/parquet/src/arrow/buffer/bit_util.rs @@ -84,7 +84,7 @@ mod tests { .iter() .enumerate() .rev() - .filter_map(|(x, y)| y.then(|| x)) + .filter_map(|(x, y)| y.then_some(x)) .collect(); assert_eq!(actual, expected); diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index 83d84b77ec06..7d8b6a04ee81 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -193,11 +193,11 @@ fn from_int64(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result Ok(DataType::Int64), ( Some(LogicalType::Integer { - bit_width, + bit_width: 64, is_signed, }), _, - ) if bit_width == 64 => match is_signed { + ) => match is_signed { true => Ok(DataType::Int64), false => Ok(DataType::UInt64), }, diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 3d6390c036ae..c83fea3f9b92 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -550,9 +550,7 @@ impl WriterPropertiesBuilder { /// Helper method to get existing or new mutable reference of column properties. #[inline] fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties { - self.column_properties - .entry(col) - .or_insert_with(Default::default) + self.column_properties.entry(col).or_default() } /// Sets encoding for a column. 
From 431df704d3c64ae2ac6660433eb0cd28b4d7a22e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Oct 2023 13:00:17 +0100 Subject: [PATCH 1255/1411] Update proc-macro2 requirement from =1.0.67 to =1.0.68 (#4900) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.67...1.0.68) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 5be24e780ae9..be895acc39c4 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.67", default-features = false } +proc-macro2 = { version = "=1.0.68", default-features = false } prost-build = { version = "=0.12.1", default-features = false } tonic-build = { version = "=0.10.2", default-features = false, features = ["transport", "prost"] } From c7911286cc2f9682fa9e08c9269394da93cff3ed Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Oct 2023 16:00:05 +0100 Subject: [PATCH 1256/1411] Update proc-macro2 requirement from =1.0.68 to =1.0.69 (#4907) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.68...1.0.69) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index be895acc39c4..036281528c19 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.68", default-features = false } +proc-macro2 = { version = "=1.0.69", default-features = false } prost-build = { version = "=0.12.1", default-features = false } tonic-build = { version = "=0.10.2", default-features = false, features = ["transport", "prost"] } From ed58e767d7607e954085842a57a680b6807794e0 Mon Sep 17 00:00:00 2001 From: fan <75058860+fansehep@users.noreply.github.com> Date: Mon, 9 Oct 2023 23:03:10 +0800 Subject: [PATCH 1257/1411] chore: add csv example (#4904) * chore: add csv example Signed-off-by: fan * follow reviews Signed-off-by: fan --------- Signed-off-by: fan --- arrow-csv/examples/README.md | 21 ++++++++++ arrow-csv/examples/csv_calculation.rs | 56 +++++++++++++++++++++++++++ arrow-csv/test/data/example.csv | 4 ++ 3 files changed, 81 insertions(+) create mode 100644 arrow-csv/examples/README.md create mode 100644 arrow-csv/examples/csv_calculation.rs create mode 100644 arrow-csv/test/data/example.csv diff --git a/arrow-csv/examples/README.md b/arrow-csv/examples/README.md new file mode 100644 index 000000000000..340413e76d94 --- /dev/null +++ b/arrow-csv/examples/README.md @@ -0,0 +1,21 @@ + + +# Examples +- [`csv_calculation.rs`](csv_calculation.rs): performs a simple calculation using the CSV reader \ No newline at end of file diff --git a/arrow-csv/examples/csv_calculation.rs b/arrow-csv/examples/csv_calculation.rs new file mode 100644 index 000000000000..12aaadde4415 --- /dev/null +++ b/arrow-csv/examples/csv_calculation.rs @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow_array::cast::AsArray; +use arrow_array::types::Int16Type; +use arrow_csv::ReaderBuilder; + +use arrow_schema::{DataType, Field, Schema}; +use std::fs::File; +use std::sync::Arc; + +fn main() { + // read csv from file + let file = File::open("arrow-csv/test/data/example.csv").unwrap(); + let csv_schema = Schema::new(vec![ + Field::new("c1", DataType::Int16, true), + Field::new("c2", DataType::Float32, true), + Field::new("c3", DataType::Utf8, true), + Field::new("c4", DataType::Boolean, true), + ]); + let mut reader = ReaderBuilder::new(Arc::new(csv_schema)) + .has_header(true) + .build(file) + .unwrap(); + + match reader.next() { + Some(r) => match r { + Ok(r) => { + // get the column(0) max value + let col = r.column(0).as_primitive::(); + let max = col.iter().max().flatten(); + println!("max value column(0): {max:?}") + } + Err(e) => { + println!("{e:?}"); + } + }, + None => { + println!("csv is empty"); + } + } +} diff --git a/arrow-csv/test/data/example.csv b/arrow-csv/test/data/example.csv new file mode 100644 index 000000000000..0c03cee84528 --- /dev/null +++ b/arrow-csv/test/data/example.csv @@ -0,0 +1,4 @@ +c1,c2,c3,c4 +1,1.1,"hong kong",true +3,323.12,"XiAn",false +10,131323.12,"cheng du",false \ No newline at end of file From 2af51631e492e0ea0ac71da67b3dba6a846dafd5 Mon Sep 17 00:00:00 2001 From: Kamil Skalski Date: Mon, 9 Oct 2023 18:19:32 +0200 Subject: [PATCH 1258/1411] Allow merge of Null to any datatype. (#4902) --- arrow-schema/src/field.rs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 00deecf06283..b50778c785fb 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -461,7 +461,10 @@ impl Field { )); } }, - DataType::Null + DataType::Null => { + self.nullable = true; + self.data_type = from.data_type.clone(); + } | DataType::Boolean | DataType::Int8 | DataType::Int16 @@ -494,7 +497,9 @@ impl Field { | DataType::LargeUtf8 | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { - if self.data_type != from.data_type { + if from.data_type == DataType::Null { + self.nullable = true; + } else if self.data_type != from.data_type { return Err(ArrowError::SchemaError( format!("Fail to merge schema field '{}' because the from data_type = {} does not equal {}", self.name, from.data_type, self.data_type) @@ -580,6 +585,21 @@ mod test { assert_eq!("Schema error: Fail to merge schema field 'c1' because the from data_type = Float32 does not equal Int64", result); } + #[test] + fn test_merge_with_null() { + let mut field1 = Field::new("c1", DataType::Null, true); + field1 + .try_merge(&Field::new("c1", DataType::Float32, false)) + .expect("should widen type to nullable float"); + assert_eq!(Field::new("c1", DataType::Float32, true), field1); + + let mut field2 = Field::new("c2", DataType::Utf8, false); + field2 + .try_merge(&Field::new("c2", DataType::Null, true)) + .expect("should widen type to nullable utf8"); + assert_eq!(Field::new("c2", DataType::Utf8, true), field2); + } + #[test] fn test_fields_with_dict_id() { let dict1 = Field::new_dict( From 16f59056a4920e3f7cdfbde5c7faf0f05139c1d4 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Tue, 10 Oct 2023 09:51:44 +0200 Subject: [PATCH 1259/1411] feat: log headers/trailers in flight CLI (+ minor fixes) (#4898) * feat: improve CLI logging setup * refactor: flight SQL DoGet should be a high-level interface * feat: log headers/trailers in SQL CLI * fix: replace explicit panics in CLI --- 
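With this change `FlightSqlServiceClient::do_get` returns a `FlightRecordBatchStream` instead of a raw stream of `FlightData`, so callers receive decoded `RecordBatch`es and can inspect the gRPC response headers and, once the stream is drained, the trailers. A minimal consumption sketch, assuming an already-connected client and a ticket taken from a `FlightInfo` endpoint (the helper name `fetch_batches` is illustrative and not part of this patch):

use arrow_array::RecordBatch;
use arrow_flight::{sql::client::FlightSqlServiceClient, Ticket};
use futures::TryStreamExt;
use tonic::transport::Channel;

// Sketch only: collect all batches for one ticket and print gRPC metadata.
async fn fetch_batches(
    client: &mut FlightSqlServiceClient<Channel>,
    ticket: Ticket,
) -> Result<Vec<RecordBatch>, Box<dyn std::error::Error>> {
    let mut stream = client.do_get(ticket).await?;
    // Response headers are available as soon as the call returns.
    println!("headers: {:?}", stream.headers());
    // Draining the stream yields decoded RecordBatches directly.
    let batches: Vec<RecordBatch> = (&mut stream).try_collect().await?;
    // Trailers only become available after the stream has been consumed.
    if let Some(trailers) = stream.trailers() {
        println!("trailers: {trailers:?}");
    }
    Ok(batches)
}

This mirrors the rewritten CLI code below, which no longer converts `FlightData` to batches by hand.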
arrow-flight/Cargo.toml | 2 +- arrow-flight/examples/flight_sql_server.rs | 4 +- arrow-flight/src/bin/flight_sql_client.rs | 94 ++++++++++++++++++---- arrow-flight/src/sql/client.rs | 16 +++- 4 files changed, 93 insertions(+), 23 deletions(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 54c5cdf5e2c7..edaa7129dc9a 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -52,7 +52,7 @@ tonic = { version = "0.10.0", default-features = false, features = ["transport", anyhow = { version = "1.0", optional = true } clap = { version = "4.1", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } tracing-log = { version = "0.1", optional = true } -tracing-subscriber = { version = "0.3.1", default-features = false, features = ["ansi", "fmt"], optional = true } +tracing-subscriber = { version = "0.3.1", default-features = false, features = ["ansi", "env-filter", "fmt"], optional = true } [package.metadata.docs.rs] all-features = true diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index d1aeae6f0a6c..013f7e7788f8 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -789,7 +789,6 @@ mod tests { use arrow_cast::pretty::pretty_format_batches; use arrow_flight::sql::client::FlightSqlServiceClient; - use arrow_flight::utils::flight_data_to_batches; use tonic::transport::server::TcpIncoming; use tonic::transport::{Certificate, Endpoint}; use tower::service_fn; @@ -955,8 +954,7 @@ mod tests { let ticket = flight_info.endpoint[0].ticket.as_ref().unwrap().clone(); let flight_data = client.do_get(ticket).await.unwrap(); - let flight_data: Vec = flight_data.try_collect().await.unwrap(); - let batches = flight_data_to_batches(&flight_data).unwrap(); + let batches: Vec<_> = flight_data.try_collect().await.unwrap(); let res = pretty_format_batches(batches.as_slice()).unwrap(); let expected = r#" diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs index c6aaccf376eb..df51530b3c8f 100644 --- a/arrow-flight/src/bin/flight_sql_client.rs +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -17,17 +17,17 @@ use std::{error::Error, sync::Arc, time::Duration}; -use anyhow::{Context, Result}; +use anyhow::{bail, Context, Result}; use arrow_array::{ArrayRef, Datum, RecordBatch, StringArray}; use arrow_cast::{cast_with_options, pretty::pretty_format_batches, CastOptions}; -use arrow_flight::{ - sql::client::FlightSqlServiceClient, utils::flight_data_to_batches, FlightData, - FlightInfo, -}; +use arrow_flight::{sql::client::FlightSqlServiceClient, FlightInfo}; use arrow_schema::Schema; use clap::{Parser, Subcommand}; use futures::TryStreamExt; -use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; +use tonic::{ + metadata::MetadataMap, + transport::{Channel, ClientTlsConfig, Endpoint}, +}; use tracing_log::log::info; /// A ':' separated key value pair @@ -61,6 +61,22 @@ where } } +/// Logging CLI config. +#[derive(Debug, Parser)] +pub struct LoggingArgs { + /// Log verbosity. + /// + /// Use `-v for warn, `-vv for info, -vvv for debug, -vvvv for trace. + /// + /// Note you can also set logging level using `RUST_LOG` environment variable: `RUST_LOG=debug` + #[clap( + short = 'v', + long = "verbose", + action = clap::ArgAction::Count, + )] + log_verbose_count: u8, +} + #[derive(Debug, Parser)] struct ClientArgs { /// Additional headers. 
@@ -96,6 +112,10 @@ struct ClientArgs { #[derive(Debug, Parser)] struct Args { + /// Logging args. + #[clap(flatten)] + logging_args: LoggingArgs, + /// Client args. #[clap(flatten)] client_args: ClientArgs, @@ -119,7 +139,7 @@ enum Command { #[tokio::main] async fn main() -> Result<()> { let args = Args::parse(); - setup_logging()?; + setup_logging(args.logging_args)?; let mut client = setup_client(args.client_args) .await .context("setup client")?; @@ -177,16 +197,21 @@ async fn execute_flight( for endpoint in info.endpoint { let Some(ticket) = &endpoint.ticket else { - panic!("did not get ticket"); + bail!("did not get ticket"); }; - let flight_data = client.do_get(ticket.clone()).await.context("do get")?; - let flight_data: Vec = flight_data + + let mut flight_data = client.do_get(ticket.clone()).await.context("do get")?; + log_metadata(flight_data.headers(), "header"); + + let mut endpoint_batches: Vec<_> = (&mut flight_data) .try_collect() .await .context("collect data stream")?; - let mut endpoint_batches = flight_data_to_batches(&flight_data) - .context("convert flight data to record batches")?; batches.append(&mut endpoint_batches); + + if let Some(trailers) = flight_data.trailers() { + log_metadata(&trailers, "trailer"); + } } info!("received data"); @@ -213,9 +238,22 @@ fn construct_record_batch_from_params( Ok(RecordBatch::try_from_iter(items)?) } -fn setup_logging() -> Result<()> { +fn setup_logging(args: LoggingArgs) -> Result<()> { + use tracing_subscriber::{util::SubscriberInitExt, EnvFilter, FmtSubscriber}; + tracing_log::LogTracer::init().context("tracing log init")?; - tracing_subscriber::fmt::init(); + + let filter = match args.log_verbose_count { + 0 => "warn", + 1 => "info", + 2 => "debug", + _ => "trace", + }; + let filter = EnvFilter::try_new(filter).context("set up log env filter")?; + + let subscriber = FmtSubscriber::builder().with_env_filter(filter).finish(); + subscriber.try_init().context("init logging subscriber")?; + Ok(()) } @@ -265,10 +303,10 @@ async fn setup_client(args: ClientArgs) -> Result { - panic!("when username is set, you also need to set a password") + bail!("when username is set, you also need to set a password") } (None, Some(_)) => { - panic!("when password is set, you also need to set a username") + bail!("when password is set, you also need to set a username") } } @@ -284,3 +322,27 @@ fn parse_key_val( .ok_or_else(|| format!("invalid KEY=value: no `=` found in `{s}`"))?; Ok((s[..pos].parse()?, s[pos + 1..].parse()?)) } + +/// Log headers/trailers. 
+fn log_metadata(map: &MetadataMap, what: &'static str) { + for k_v in map.iter() { + match k_v { + tonic::metadata::KeyAndValueRef::Ascii(k, v) => { + info!( + "{}: {}={}", + what, + k.as_str(), + v.to_str().unwrap_or(""), + ); + } + tonic::metadata::KeyAndValueRef::Binary(k, v) => { + info!( + "{}: {}={}", + what, + k.as_str(), + String::from_utf8_lossy(v.as_ref()), + ); + } + } + } +} diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index 2d382cf2ca20..7685813ff844 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -24,6 +24,7 @@ use std::collections::HashMap; use std::str::FromStr; use tonic::metadata::AsciiMetadataKey; +use crate::decode::FlightRecordBatchStream; use crate::encode::FlightDataEncoderBuilder; use crate::error::FlightError; use crate::flight_service_client::FlightServiceClient; @@ -37,6 +38,7 @@ use crate::sql::{ CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, CommandStatementUpdate, DoPutUpdateResult, ProstMessageExt, SqlInfo, }; +use crate::trailers::extract_lazy_trailers; use crate::{ Action, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, IpcMessage, PutResult, Ticket, @@ -231,14 +233,22 @@ impl FlightSqlServiceClient { pub async fn do_get( &mut self, ticket: impl IntoRequest, - ) -> Result, ArrowError> { + ) -> Result { let req = self.set_request_headers(ticket.into_request())?; - Ok(self + + let (md, response_stream, _ext) = self .flight_client .do_get(req) .await .map_err(status_to_arrow_error)? - .into_inner()) + .into_parts(); + let (response_stream, trailers) = extract_lazy_trailers(response_stream); + + Ok(FlightRecordBatchStream::new_from_flight_data( + response_stream.map_err(FlightError::Tonic), + ) + .with_headers(md) + .with_trailers(trailers)) } /// Push a stream to the flight service associated with a particular flight stream. From c6387c1ffc27cbf9180253648c4ba461d92d586d Mon Sep 17 00:00:00 2001 From: Kamil Skalski Date: Tue, 10 Oct 2023 11:13:34 +0200 Subject: [PATCH 1260/1411] fix(csv)!: infer null for empty column. (#4910) * Infer null for empty column. * Add test file. 
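The behavioural change: a CSV column that never contains a parseable value now infers as `DataType::Null` instead of `Utf8`, and the reader materialises such a column as a `NullArray`. A small sketch of the new inference behaviour, using an in-memory CSV rather than the test file added below (the column names and contents here are illustrative only):

use std::io::Cursor;

use arrow_csv::reader::Format;
use arrow_schema::DataType;

fn main() {
    // Column "a" holds only integers; column "b" never has a value.
    let csv = "a,b\n1,\n2,\n";
    let format = Format::default().with_header(true);

    let mut cursor = Cursor::new(csv.as_bytes());
    let (schema, _) = format.infer_schema(&mut cursor, None).unwrap();

    assert_eq!(schema.field(0).data_type(), &DataType::Int64);
    // Previously this inferred as Utf8; it is now Null.
    assert_eq!(schema.field(1).data_type(), &DataType::Null);
}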
--- arrow-csv/src/reader/mod.rs | 62 +++++++++++++++++++++++++- arrow-csv/test/data/init_null_test.csv | 6 +++ 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 arrow-csv/test/data/init_null_test.csv diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 17db7a34e06f..2ba49cadc73f 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -193,6 +193,7 @@ impl InferredDataType { /// Returns the inferred data type fn get(&self) -> DataType { match self.packed { + 0 => DataType::Null, 1 => DataType::Boolean, 2 => DataType::Int64, 4 | 6 => DataType::Float64, // Promote Int64 to Float64 @@ -785,6 +786,9 @@ fn parse( null_regex, ) } + DataType::Null => { + Ok(Arc::new(NullArray::builder(rows.len()).finish()) as ArrayRef) + } DataType::Utf8 => Ok(Arc::new( rows.iter() .map(|row| Some(row.get(i))) @@ -1511,6 +1515,62 @@ mod tests { assert!(!batch.column(1).is_null(4)); } + #[test] + fn test_init_nulls() { + let schema = Arc::new(Schema::new(vec![ + Field::new("c_int", DataType::UInt64, true), + Field::new("c_float", DataType::Float32, true), + Field::new("c_string", DataType::Utf8, true), + Field::new("c_bool", DataType::Boolean, true), + Field::new("c_null", DataType::Null, true), + ])); + let file = File::open("test/data/init_null_test.csv").unwrap(); + + let mut csv = ReaderBuilder::new(schema) + .has_header(true) + .build(file) + .unwrap(); + + let batch = csv.next().unwrap().unwrap(); + + assert!(batch.column(1).is_null(0)); + assert!(!batch.column(1).is_null(1)); + assert!(batch.column(1).is_null(2)); + assert!(!batch.column(1).is_null(3)); + assert!(!batch.column(1).is_null(4)); + } + + #[test] + fn test_init_nulls_with_inference() { + let format = Format::default().with_header(true).with_delimiter(b','); + + let mut file = File::open("test/data/init_null_test.csv").unwrap(); + let (schema, _) = format.infer_schema(&mut file, None).unwrap(); + file.rewind().unwrap(); + + let expected_schema = Schema::new(vec![ + Field::new("c_int", DataType::Int64, true), + Field::new("c_float", DataType::Float64, true), + Field::new("c_string", DataType::Utf8, true), + Field::new("c_bool", DataType::Boolean, true), + Field::new("c_null", DataType::Null, true), + ]); + assert_eq!(schema, expected_schema); + + let mut csv = ReaderBuilder::new(Arc::new(schema)) + .with_format(format) + .build(file) + .unwrap(); + + let batch = csv.next().unwrap().unwrap(); + + assert!(batch.column(1).is_null(0)); + assert!(!batch.column(1).is_null(1)); + assert!(batch.column(1).is_null(2)); + assert!(!batch.column(1).is_null(3)); + assert!(!batch.column(1).is_null(4)); + } + #[test] fn test_custom_nulls() { let schema = Arc::new(Schema::new(vec![ @@ -2283,7 +2343,7 @@ mod tests { #[test] fn test_inference() { let cases: &[(&[&str], DataType)] = &[ - (&[], DataType::Utf8), + (&[], DataType::Null), (&["false", "12"], DataType::Utf8), (&["12", "cupcakes"], DataType::Utf8), (&["12", "12.4"], DataType::Float64), diff --git a/arrow-csv/test/data/init_null_test.csv b/arrow-csv/test/data/init_null_test.csv new file mode 100644 index 000000000000..f7d8a299645d --- /dev/null +++ b/arrow-csv/test/data/init_null_test.csv @@ -0,0 +1,6 @@ +c_int,c_float,c_string,c_bool,c_null +,,,, +2,2.2,"a",TRUE, +3,,"b",true, +4,4.4,,False, +5,6.6,"",FALSE, \ No newline at end of file From 538a7bfed55fb4f9305e224574bf48956019c471 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 10 Oct 2023 15:13:20 +0100 Subject: [PATCH 1261/1411] 
Specialize Thrift Decoding (~40% Faster) (#4891) (#4892) * Specialize thrift (#4891) * Review feedback --- parquet/CONTRIBUTING.md | 6 +- parquet/Cargo.toml | 5 + parquet/benches/metadata.rs | 42 +++ parquet/regen.sh | 35 ++ parquet/src/arrow/arrow_writer/mod.rs | 3 +- parquet/src/arrow/async_reader/metadata.rs | 11 +- parquet/src/bin/parquet-layout.rs | 3 +- parquet/src/bloom_filter/mod.rs | 15 +- parquet/src/file/footer.rs | 13 +- parquet/src/file/page_index/index_reader.rs | 8 +- parquet/src/file/serialized_reader.rs | 12 +- parquet/src/file/writer.rs | 3 +- parquet/src/format.rs | 348 ++++++++++---------- parquet/src/lib.rs | 2 + parquet/src/thrift.rs | 284 ++++++++++++++++ 15 files changed, 571 insertions(+), 219 deletions(-) create mode 100644 parquet/benches/metadata.rs create mode 100755 parquet/regen.sh create mode 100644 parquet/src/thrift.rs diff --git a/parquet/CONTRIBUTING.md b/parquet/CONTRIBUTING.md index 903126d9f4f8..5670eef08101 100644 --- a/parquet/CONTRIBUTING.md +++ b/parquet/CONTRIBUTING.md @@ -62,10 +62,6 @@ To compile and view in the browser, run `cargo doc --no-deps --open`. ## Update Parquet Format -To generate the parquet format (thrift definitions) code run from the repository root run - -``` -$ docker run -v $(pwd):/thrift/src -it archlinux pacman -Sy --noconfirm thrift && wget https://raw.githubusercontent.com/apache/parquet-format/apache-parquet-format-2.9.0/src/main/thrift/parquet.thrift -O /tmp/parquet.thrift && thrift --gen rs /tmp/parquet.thrift && sed -i '/use thrift::server::TProcessor;/d' parquet.rs && mv parquet.rs parquet/src/format.rs -``` +To generate the parquet format (thrift definitions) code run [`./regen.sh`](./regen.sh). You may need to manually patch up doc comments that contain unescaped `[]` diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index c710c83213b9..eaafb5130fcb 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -173,5 +173,10 @@ name = "compression" required-features = ["experimental", "default"] harness = false + +[[bench]] +name = "metadata" +harness = false + [lib] bench = false diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs new file mode 100644 index 000000000000..c817385f6ba9 --- /dev/null +++ b/parquet/benches/metadata.rs @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use bytes::Bytes; +use criterion::*; +use parquet::file::reader::SerializedFileReader; +use parquet::file::serialized_reader::ReadOptionsBuilder; + +fn criterion_benchmark(c: &mut Criterion) { + // Read file into memory to isolate filesystem performance + let file = "../parquet-testing/data/alltypes_tiny_pages.parquet"; + let data = std::fs::read(file).unwrap(); + let data = Bytes::from(data); + + c.bench_function("open(default)", |b| { + b.iter(|| SerializedFileReader::new(data.clone()).unwrap()) + }); + + c.bench_function("open(page index)", |b| { + b.iter(|| { + let options = ReadOptionsBuilder::new().with_page_index().build(); + SerializedFileReader::new_with_options(data.clone(), options).unwrap() + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/parquet/regen.sh b/parquet/regen.sh new file mode 100755 index 000000000000..b8c3549e2324 --- /dev/null +++ b/parquet/regen.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +REVISION=aeae80660c1d0c97314e9da837de1abdebd49c37 + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" + +docker run -v $SOURCE_DIR:/thrift/src -it archlinux pacman -Sy --noconfirm thrift && \ + wget https://raw.githubusercontent.com/apache/parquet-format/$REVISION/src/main/thrift/parquet.thrift -O /tmp/parquet.thrift && \ + thrift --gen rs /tmp/parquet.thrift && \ + echo "Removing TProcessor" && \ + sed -i '/use thrift::server::TProcessor;/d' parquet.rs && \ + echo "Replacing TSerializable" && \ + sed -i 's/impl TSerializable for/impl crate::thrift::TSerializable for/g' parquet.rs && \ + echo "Rewriting write_to_out_protocol" && \ + sed -i 's/fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol)/fn write_to_out_protocol(\&self, o_prot: \&mut T)/g' parquet.rs && \ + echo "Rewriting read_from_in_protocol" && \ + sed -i 's/fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol)/fn read_from_in_protocol(i_prot: \&mut T)/g' parquet.rs && \ + mv parquet.rs src/format.rs diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 5dae81d4711c..752eff86c5e9 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -23,7 +23,7 @@ use std::iter::Peekable; use std::slice::Iter; use std::sync::{Arc, Mutex}; use std::vec::IntoIter; -use thrift::protocol::{TCompactOutputProtocol, TSerializable}; +use thrift::protocol::TCompactOutputProtocol; use arrow_array::cast::AsArray; use arrow_array::types::*; @@ -50,6 +50,7 @@ use crate::file::properties::{WriterProperties, WriterPropertiesPtr}; use crate::file::reader::{ChunkReader, Length}; use crate::file::writer::{SerializedFileWriter, SerializedRowGroupWriter}; use crate::schema::types::{ColumnDescPtr, SchemaDescriptor}; +use crate::thrift::TSerializable; use levels::{calculate_array_levels, ArrayLevels}; mod byte_array; diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 076ae5c54052..fe7b4427647c 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -17,7 +17,7 @@ use crate::arrow::async_reader::AsyncFileReader; use crate::errors::{ParquetError, Result}; -use crate::file::footer::{decode_footer, read_metadata}; +use crate::file::footer::{decode_footer, decode_metadata}; use crate::file::metadata::ParquetMetaData; use crate::file::page_index::index::Index; use crate::file::page_index::index_reader::{ @@ -27,7 +27,6 @@ use bytes::Bytes; use futures::future::BoxFuture; use futures::FutureExt; use std::future::Future; -use std::io::Read; use std::ops::Range; /// A data source that can be used with [`MetadataLoader`] to load [`ParquetMetaData`] @@ -95,16 +94,14 @@ impl MetadataLoader { // Did not fetch the entire file metadata in the initial read, need to make a second request let (metadata, remainder) = if length > suffix_len - 8 { let metadata_start = file_size - length - 8; - let remaining_metadata = fetch.fetch(metadata_start..footer_start).await?; - - let reader = remaining_metadata.as_ref().chain(&suffix[..suffix_len - 8]); - (read_metadata(reader)?, None) + let meta = fetch.fetch(metadata_start..file_size - 8).await?; + (decode_metadata(&meta)?, None) } else { let metadata_start = file_size - length - 8 - footer_start; let slice = &suffix[metadata_start..suffix_len - 8]; ( - read_metadata(slice)?, + decode_metadata(slice)?, Some((footer_start, suffix.slice(..metadata_start))), ) }; diff --git a/parquet/src/bin/parquet-layout.rs 
b/parquet/src/bin/parquet-layout.rs index d749bb8a4ba7..901ac9ea2309 100644 --- a/parquet/src/bin/parquet-layout.rs +++ b/parquet/src/bin/parquet-layout.rs @@ -38,12 +38,13 @@ use std::io::Read; use clap::Parser; use serde::Serialize; -use thrift::protocol::{TCompactInputProtocol, TSerializable}; +use thrift::protocol::TCompactInputProtocol; use parquet::basic::{Compression, Encoding}; use parquet::errors::Result; use parquet::file::reader::ChunkReader; use parquet::format::PageHeader; +use parquet::thrift::TSerializable; #[derive(Serialize, Debug)] struct ParquetFile { diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index c893d492b52a..a3807eb37011 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -26,13 +26,12 @@ use crate::format::{ BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash, BloomFilterHeader, SplitBlockAlgorithm, Uncompressed, XxHash, }; -use bytes::{Buf, Bytes}; +use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; +use bytes::Bytes; use std::hash::Hasher; use std::io::Write; use std::sync::Arc; -use thrift::protocol::{ - TCompactInputProtocol, TCompactOutputProtocol, TOutputProtocol, TSerializable, -}; +use thrift::protocol::{TCompactOutputProtocol, TOutputProtocol}; use twox_hash::XxHash64; /// Salt as defined in the [spec](https://github.com/apache/parquet-format/blob/master/BloomFilter.md#technical-approach). @@ -152,15 +151,11 @@ fn read_bloom_filter_header_and_length( buffer: Bytes, ) -> Result<(BloomFilterHeader, u64), ParquetError> { let total_length = buffer.len(); - let mut buf_reader = buffer.reader(); - let mut prot = TCompactInputProtocol::new(&mut buf_reader); + let mut prot = TCompactSliceInputProtocol::new(buffer.as_ref()); let header = BloomFilterHeader::read_from_in_protocol(&mut prot).map_err(|e| { ParquetError::General(format!("Could not read bloom filter header: {e}")) })?; - Ok(( - header, - (total_length - buf_reader.into_inner().remaining()) as u64, - )) + Ok((header, (total_length - prot.as_slice().len()) as u64)) } pub(crate) const BITSET_MIN_LENGTH: usize = 32; diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 21de63e0c234..53496a66b572 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -18,7 +18,7 @@ use std::{io::Read, sync::Arc}; use crate::format::{ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData}; -use thrift::protocol::{TCompactInputProtocol, TSerializable}; +use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; use crate::basic::ColumnOrder; @@ -62,18 +62,13 @@ pub fn parse_metadata(chunk_reader: &R) -> Result Result { - read_metadata(metadata_read) -} - -/// Decodes [`ParquetMetaData`] from the provided [`Read`] -pub(crate) fn read_metadata(read: R) -> Result { +pub fn decode_metadata(buf: &[u8]) -> Result { // TODO: row group filtering - let mut prot = TCompactInputProtocol::new(read); + let mut prot = TCompactSliceInputProtocol::new(buf); let t_file_metadata: TFileMetaData = TFileMetaData::read_from_in_protocol(&mut prot) .map_err(|e| ParquetError::General(format!("Could not parse metadata: {e}")))?; let schema = types::from_thrift(&t_file_metadata.schema)?; diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index c36708a59aeb..ae3bf3699c1c 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -24,9 +24,8 @@ use crate::file::metadata::ColumnChunkMetaData; use 
crate::file::page_index::index::{Index, NativeIndex}; use crate::file::reader::ChunkReader; use crate::format::{ColumnIndex, OffsetIndex, PageLocation}; -use std::io::Cursor; +use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; use std::ops::Range; -use thrift::protocol::{TCompactInputProtocol, TSerializable}; /// Computes the covering range of two optional ranges /// @@ -116,7 +115,7 @@ pub fn read_pages_locations( pub(crate) fn decode_offset_index( data: &[u8], ) -> Result, ParquetError> { - let mut prot = TCompactInputProtocol::new(data); + let mut prot = TCompactSliceInputProtocol::new(data); let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; Ok(offset.page_locations) } @@ -125,8 +124,7 @@ pub(crate) fn decode_column_index( data: &[u8], column_type: Type, ) -> Result { - let mut d = Cursor::new(data); - let mut prot = TCompactInputProtocol::new(&mut d); + let mut prot = TCompactSliceInputProtocol::new(data); let index = ColumnIndex::read_from_in_protocol(&mut prot)?; diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 4924dcc6f35a..4bc484144a81 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -19,7 +19,6 @@ //! Also contains implementations of the ChunkReader for files (with buffering) and byte arrays (RAM) use std::collections::VecDeque; -use std::io::Cursor; use std::iter; use std::{convert::TryFrom, fs::File, io::Read, path::Path, sync::Arc}; @@ -40,8 +39,9 @@ use crate::format::{PageHeader, PageLocation, PageType}; use crate::record::reader::RowIter; use crate::record::Row; use crate::schema::types::Type as SchemaType; +use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; use crate::util::memory::ByteBufferPtr; -use thrift::protocol::{TCompactInputProtocol, TSerializable}; +use thrift::protocol::TCompactInputProtocol; impl TryFrom for SerializedFileReader { type Error = ParquetError; @@ -661,11 +661,11 @@ impl PageReader for SerializedPageReader { let buffer = self.reader.get_bytes(front.offset as u64, page_len)?; - let mut cursor = Cursor::new(buffer.as_ref()); - let header = read_page_header(&mut cursor)?; - let offset = cursor.position(); + let mut prot = TCompactSliceInputProtocol::new(buffer.as_ref()); + let header = PageHeader::read_from_in_protocol(&mut prot)?; + let offset = buffer.len() - prot.as_slice().len(); - let bytes = buffer.slice(offset as usize..); + let bytes = buffer.slice(offset..); decode_page( header, bytes.into(), diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index d723158de9f4..7796be6013df 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -21,10 +21,11 @@ use crate::bloom_filter::Sbbf; use crate::format as parquet; use crate::format::{ColumnIndex, OffsetIndex, RowGroup}; +use crate::thrift::TSerializable; use std::fmt::Debug; use std::io::{BufWriter, IoSlice, Read}; use std::{io::Write, sync::Arc}; -use thrift::protocol::{TCompactOutputProtocol, TSerializable}; +use thrift::protocol::TCompactOutputProtocol; use crate::column::writer::{ get_typed_column_writer_mut, ColumnCloseResult, ColumnWriterImpl, diff --git a/parquet/src/format.rs b/parquet/src/format.rs index 12c572c23cf5..46adc39e6406 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -53,12 +53,12 @@ impl Type { ]; } -impl TSerializable for Type { +impl crate::thrift::TSerializable for Type { #[allow(clippy::trivially_copy_pass_by_ref)] - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> 
thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { o_prot.write_i32(self.0) } - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(Type::from(enum_value)) } @@ -222,12 +222,12 @@ impl ConvertedType { ]; } -impl TSerializable for ConvertedType { +impl crate::thrift::TSerializable for ConvertedType { #[allow(clippy::trivially_copy_pass_by_ref)] - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { o_prot.write_i32(self.0) } - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(ConvertedType::from(enum_value)) } @@ -299,12 +299,12 @@ impl FieldRepetitionType { ]; } -impl TSerializable for FieldRepetitionType { +impl crate::thrift::TSerializable for FieldRepetitionType { #[allow(clippy::trivially_copy_pass_by_ref)] - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { o_prot.write_i32(self.0) } - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(FieldRepetitionType::from(enum_value)) } @@ -397,12 +397,12 @@ impl Encoding { ]; } -impl TSerializable for Encoding { +impl crate::thrift::TSerializable for Encoding { #[allow(clippy::trivially_copy_pass_by_ref)] - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { o_prot.write_i32(self.0) } - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(Encoding::from(enum_value)) } @@ -474,12 +474,12 @@ impl CompressionCodec { ]; } -impl TSerializable for CompressionCodec { +impl crate::thrift::TSerializable for CompressionCodec { #[allow(clippy::trivially_copy_pass_by_ref)] - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { o_prot.write_i32(self.0) } - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(CompressionCodec::from(enum_value)) } @@ -535,12 +535,12 @@ impl PageType { ]; } -impl TSerializable for PageType { +impl crate::thrift::TSerializable for PageType { #[allow(clippy::trivially_copy_pass_by_ref)] - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { o_prot.write_i32(self.0) } - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(PageType::from(enum_value)) } @@ -592,12 +592,12 @@ impl BoundaryOrder { ]; } -impl TSerializable for BoundaryOrder { +impl crate::thrift::TSerializable for BoundaryOrder { #[allow(clippy::trivially_copy_pass_by_ref)] - fn write_to_out_protocol(&self, o_prot: 
&mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { o_prot.write_i32(self.0) } - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let enum_value = i_prot.read_i32()?; Ok(BoundaryOrder::from(enum_value)) } @@ -678,8 +678,8 @@ impl Statistics { } } -impl TSerializable for Statistics { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for Statistics { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; let mut f_2: Option> = None; @@ -735,7 +735,7 @@ impl TSerializable for Statistics { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("Statistics"); o_prot.write_struct_begin(&struct_ident)?; if let Some(ref fld_var) = self.max { @@ -788,8 +788,8 @@ impl StringType { } } -impl TSerializable for StringType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for StringType { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -803,7 +803,7 @@ impl TSerializable for StringType { let ret = StringType {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("StringType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -825,8 +825,8 @@ impl UUIDType { } } -impl TSerializable for UUIDType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for UUIDType { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -840,7 +840,7 @@ impl TSerializable for UUIDType { let ret = UUIDType {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("UUIDType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -862,8 +862,8 @@ impl MapType { } } -impl TSerializable for MapType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for MapType { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -877,7 +877,7 @@ impl TSerializable for MapType { let ret = MapType {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("MapType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -899,8 +899,8 @@ impl ListType { } } -impl TSerializable for ListType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for ListType { + fn read_from_in_protocol(i_prot: &mut T) -> 
thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -914,7 +914,7 @@ impl TSerializable for ListType { let ret = ListType {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("ListType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -936,8 +936,8 @@ impl EnumType { } } -impl TSerializable for EnumType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for EnumType { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -951,7 +951,7 @@ impl TSerializable for EnumType { let ret = EnumType {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("EnumType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -973,8 +973,8 @@ impl DateType { } } -impl TSerializable for DateType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for DateType { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -988,7 +988,7 @@ impl TSerializable for DateType { let ret = DateType {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("DateType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1015,8 +1015,8 @@ impl NullType { } } -impl TSerializable for NullType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for NullType { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -1030,7 +1030,7 @@ impl TSerializable for NullType { let ret = NullType {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("NullType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1066,8 +1066,8 @@ impl DecimalType { } } -impl TSerializable for DecimalType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for DecimalType { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -1101,7 +1101,7 @@ impl TSerializable for DecimalType { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("DecimalType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("scale", TType::I32, 1))?; @@ -1130,8 +1130,8 @@ impl MilliSeconds { } } -impl TSerializable for MilliSeconds { - fn 
read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for MilliSeconds { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -1145,7 +1145,7 @@ impl TSerializable for MilliSeconds { let ret = MilliSeconds {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("MilliSeconds"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1167,8 +1167,8 @@ impl MicroSeconds { } } -impl TSerializable for MicroSeconds { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for MicroSeconds { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -1182,7 +1182,7 @@ impl TSerializable for MicroSeconds { let ret = MicroSeconds {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("MicroSeconds"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1204,8 +1204,8 @@ impl NanoSeconds { } } -impl TSerializable for NanoSeconds { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for NanoSeconds { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -1219,7 +1219,7 @@ impl TSerializable for NanoSeconds { let ret = NanoSeconds {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("NanoSeconds"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1238,8 +1238,8 @@ pub enum TimeUnit { NANOS(NanoSeconds), } -impl TSerializable for TimeUnit { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for TimeUnit { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -1301,7 +1301,7 @@ impl TSerializable for TimeUnit { Ok(ret.expect("return value should have been constructed")) } } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("TimeUnit"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -1348,8 +1348,8 @@ impl TimestampType { } } -impl TSerializable for TimestampType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for TimestampType { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -1383,7 +1383,7 @@ impl TSerializable for TimestampType { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> 
thrift::Result<()> { let struct_ident = TStructIdentifier::new("TimestampType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("isAdjustedToUTC", TType::Bool, 1))?; @@ -1419,8 +1419,8 @@ impl TimeType { } } -impl TSerializable for TimeType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for TimeType { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -1454,7 +1454,7 @@ impl TSerializable for TimeType { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("TimeType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("isAdjustedToUTC", TType::Bool, 1))?; @@ -1492,8 +1492,8 @@ impl IntType { } } -impl TSerializable for IntType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for IntType { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -1527,7 +1527,7 @@ impl TSerializable for IntType { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("IntType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("bitWidth", TType::I08, 1))?; @@ -1558,8 +1558,8 @@ impl JsonType { } } -impl TSerializable for JsonType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for JsonType { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -1573,7 +1573,7 @@ impl TSerializable for JsonType { let ret = JsonType {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("JsonType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1598,8 +1598,8 @@ impl BsonType { } } -impl TSerializable for BsonType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for BsonType { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -1613,7 +1613,7 @@ impl TSerializable for BsonType { let ret = BsonType {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("BsonType"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -1642,8 +1642,8 @@ pub enum LogicalType { UUID(UUIDType), } -impl TSerializable for LogicalType { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for LogicalType { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let mut ret: Option = None; let mut 
received_field_count = 0; i_prot.read_struct_begin()?; @@ -1775,7 +1775,7 @@ impl TSerializable for LogicalType { Ok(ret.expect("return value should have been constructed")) } } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("LogicalType"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -1915,8 +1915,8 @@ impl SchemaElement { } } -impl TSerializable for SchemaElement { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for SchemaElement { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -1997,7 +1997,7 @@ impl TSerializable for SchemaElement { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("SchemaElement"); o_prot.write_struct_begin(&struct_ident)?; if let Some(ref fld_var) = self.type_ { @@ -2084,8 +2084,8 @@ impl DataPageHeader { } } -impl TSerializable for DataPageHeader { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for DataPageHeader { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -2139,7 +2139,7 @@ impl TSerializable for DataPageHeader { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("DataPageHeader"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("num_values", TType::I32, 1))?; @@ -2178,8 +2178,8 @@ impl IndexPageHeader { } } -impl TSerializable for IndexPageHeader { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for IndexPageHeader { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -2193,7 +2193,7 @@ impl TSerializable for IndexPageHeader { let ret = IndexPageHeader {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("IndexPageHeader"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -2229,8 +2229,8 @@ impl DictionaryPageHeader { } } -impl TSerializable for DictionaryPageHeader { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for DictionaryPageHeader { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -2270,7 +2270,7 @@ impl TSerializable for DictionaryPageHeader { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("DictionaryPageHeader"); o_prot.write_struct_begin(&struct_ident)?; 
o_prot.write_field_begin(&TFieldIdentifier::new("num_values", TType::I32, 1))?; @@ -2337,8 +2337,8 @@ impl DataPageHeaderV2 { } } -impl TSerializable for DataPageHeaderV2 { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for DataPageHeaderV2 { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -2412,7 +2412,7 @@ impl TSerializable for DataPageHeaderV2 { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("DataPageHeaderV2"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("num_values", TType::I32, 1))?; @@ -2463,8 +2463,8 @@ impl SplitBlockAlgorithm { } } -impl TSerializable for SplitBlockAlgorithm { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for SplitBlockAlgorithm { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -2478,7 +2478,7 @@ impl TSerializable for SplitBlockAlgorithm { let ret = SplitBlockAlgorithm {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("SplitBlockAlgorithm"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -2495,8 +2495,8 @@ pub enum BloomFilterAlgorithm { BLOCK(SplitBlockAlgorithm), } -impl TSerializable for BloomFilterAlgorithm { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for BloomFilterAlgorithm { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -2544,7 +2544,7 @@ impl TSerializable for BloomFilterAlgorithm { Ok(ret.expect("return value should have been constructed")) } } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("BloomFilterAlgorithm"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -2576,8 +2576,8 @@ impl XxHash { } } -impl TSerializable for XxHash { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for XxHash { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -2591,7 +2591,7 @@ impl TSerializable for XxHash { let ret = XxHash {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("XxHash"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -2608,8 +2608,8 @@ pub enum BloomFilterHash { XXHASH(XxHash), } -impl TSerializable for BloomFilterHash { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for BloomFilterHash { + fn read_from_in_protocol(i_prot: &mut T) -> 
thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -2657,7 +2657,7 @@ impl TSerializable for BloomFilterHash { Ok(ret.expect("return value should have been constructed")) } } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("BloomFilterHash"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -2688,8 +2688,8 @@ impl Uncompressed { } } -impl TSerializable for Uncompressed { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for Uncompressed { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -2703,7 +2703,7 @@ impl TSerializable for Uncompressed { let ret = Uncompressed {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("Uncompressed"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -2720,8 +2720,8 @@ pub enum BloomFilterCompression { UNCOMPRESSED(Uncompressed), } -impl TSerializable for BloomFilterCompression { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for BloomFilterCompression { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -2769,7 +2769,7 @@ impl TSerializable for BloomFilterCompression { Ok(ret.expect("return value should have been constructed")) } } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("BloomFilterCompression"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -2814,8 +2814,8 @@ impl BloomFilterHeader { } } -impl TSerializable for BloomFilterHeader { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for BloomFilterHeader { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -2863,7 +2863,7 @@ impl TSerializable for BloomFilterHeader { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("BloomFilterHeader"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("numBytes", TType::I32, 1))?; @@ -2933,8 +2933,8 @@ impl PageHeader { } } -impl TSerializable for PageHeader { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for PageHeader { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -3005,7 +3005,7 @@ impl TSerializable for PageHeader { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let 
struct_ident = TStructIdentifier::new("PageHeader"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("type", TType::I32, 1))?; @@ -3067,8 +3067,8 @@ impl KeyValue { } } -impl TSerializable for KeyValue { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for KeyValue { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -3101,7 +3101,7 @@ impl TSerializable for KeyValue { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("KeyValue"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("key", TType::String, 1))?; @@ -3143,8 +3143,8 @@ impl SortingColumn { } } -impl TSerializable for SortingColumn { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for SortingColumn { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -3185,7 +3185,7 @@ impl TSerializable for SortingColumn { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("SortingColumn"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("column_idx", TType::I32, 1))?; @@ -3227,8 +3227,8 @@ impl PageEncodingStats { } } -impl TSerializable for PageEncodingStats { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for PageEncodingStats { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -3269,7 +3269,7 @@ impl TSerializable for PageEncodingStats { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("PageEncodingStats"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("page_type", TType::I32, 1))?; @@ -3355,8 +3355,8 @@ impl ColumnMetaData { } } -impl TSerializable for ColumnMetaData { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for ColumnMetaData { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option> = None; @@ -3498,7 +3498,7 @@ impl TSerializable for ColumnMetaData { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("ColumnMetaData"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("type", TType::I32, 1))?; @@ -3595,8 +3595,8 @@ impl EncryptionWithFooterKey { } } -impl TSerializable for EncryptionWithFooterKey { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable 
for EncryptionWithFooterKey { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -3610,7 +3610,7 @@ impl TSerializable for EncryptionWithFooterKey { let ret = EncryptionWithFooterKey {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("EncryptionWithFooterKey"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -3639,8 +3639,8 @@ impl EncryptionWithColumnKey { } } -impl TSerializable for EncryptionWithColumnKey { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for EncryptionWithColumnKey { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; let mut f_2: Option> = None; @@ -3679,7 +3679,7 @@ impl TSerializable for EncryptionWithColumnKey { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("EncryptionWithColumnKey"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("path_in_schema", TType::List, 1))?; @@ -3709,8 +3709,8 @@ pub enum ColumnCryptoMetaData { ENCRYPTIONWITHCOLUMNKEY(EncryptionWithColumnKey), } -impl TSerializable for ColumnCryptoMetaData { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for ColumnCryptoMetaData { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -3765,7 +3765,7 @@ impl TSerializable for ColumnCryptoMetaData { Ok(ret.expect("return value should have been constructed")) } } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("ColumnCryptoMetaData"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -3832,8 +3832,8 @@ impl ColumnChunk { } } -impl TSerializable for ColumnChunk { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for ColumnChunk { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -3908,7 +3908,7 @@ impl TSerializable for ColumnChunk { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("ColumnChunk"); o_prot.write_struct_begin(&struct_ident)?; if let Some(ref fld_var) = self.file_path { @@ -4000,8 +4000,8 @@ impl RowGroup { } } -impl TSerializable for RowGroup { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for RowGroup { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; let mut f_2: Option = None; @@ -4078,7 +4078,7 @@ impl TSerializable for RowGroup { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn 
TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("RowGroup"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("columns", TType::List, 1))?; @@ -4138,8 +4138,8 @@ impl TypeDefinedOrder { } } -impl TSerializable for TypeDefinedOrder { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for TypeDefinedOrder { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; loop { let field_ident = i_prot.read_field_begin()?; @@ -4153,7 +4153,7 @@ impl TSerializable for TypeDefinedOrder { let ret = TypeDefinedOrder {}; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("TypeDefinedOrder"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_stop()?; @@ -4170,8 +4170,8 @@ pub enum ColumnOrder { TYPEORDER(TypeDefinedOrder), } -impl TSerializable for ColumnOrder { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for ColumnOrder { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -4219,7 +4219,7 @@ impl TSerializable for ColumnOrder { Ok(ret.expect("return value should have been constructed")) } } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("ColumnOrder"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -4260,8 +4260,8 @@ impl PageLocation { } } -impl TSerializable for PageLocation { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for PageLocation { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option = None; @@ -4302,7 +4302,7 @@ impl TSerializable for PageLocation { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("PageLocation"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("offset", TType::I64, 1))?; @@ -4338,8 +4338,8 @@ impl OffsetIndex { } } -impl TSerializable for OffsetIndex { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for OffsetIndex { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; loop { @@ -4372,7 +4372,7 @@ impl TSerializable for OffsetIndex { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("OffsetIndex"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("page_locations", TType::List, 1))?; @@ -4432,8 +4432,8 @@ impl ColumnIndex { } } -impl TSerializable for ColumnIndex { - fn read_from_in_protocol(i_prot: &mut 
dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for ColumnIndex { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; let mut f_2: Option>> = None; @@ -4511,7 +4511,7 @@ impl TSerializable for ColumnIndex { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("ColumnIndex"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("null_pages", TType::List, 1))?; @@ -4577,8 +4577,8 @@ impl AesGcmV1 { } } -impl TSerializable for AesGcmV1 { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for AesGcmV1 { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; let mut f_2: Option> = None; @@ -4616,7 +4616,7 @@ impl TSerializable for AesGcmV1 { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("AesGcmV1"); o_prot.write_struct_begin(&struct_ident)?; if let Some(ref fld_var) = self.aad_prefix { @@ -4664,8 +4664,8 @@ impl AesGcmCtrV1 { } } -impl TSerializable for AesGcmCtrV1 { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for AesGcmCtrV1 { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; let mut f_2: Option> = None; @@ -4703,7 +4703,7 @@ impl TSerializable for AesGcmCtrV1 { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("AesGcmCtrV1"); o_prot.write_struct_begin(&struct_ident)?; if let Some(ref fld_var) = self.aad_prefix { @@ -4736,8 +4736,8 @@ pub enum EncryptionAlgorithm { AESGCMCTRV1(AesGcmCtrV1), } -impl TSerializable for EncryptionAlgorithm { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for EncryptionAlgorithm { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { let mut ret: Option = None; let mut received_field_count = 0; i_prot.read_struct_begin()?; @@ -4792,7 +4792,7 @@ impl TSerializable for EncryptionAlgorithm { Ok(ret.expect("return value should have been constructed")) } } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("EncryptionAlgorithm"); o_prot.write_struct_begin(&struct_ident)?; match *self { @@ -4879,8 +4879,8 @@ impl FileMetaData { } } -impl TSerializable for FileMetaData { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for FileMetaData { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option> = None; @@ -4982,7 +4982,7 @@ impl TSerializable for FileMetaData { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn 
write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("FileMetaData"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("version", TType::I32, 1))?; @@ -5068,8 +5068,8 @@ impl FileCryptoMetaData { } } -impl TSerializable for FileCryptoMetaData { - fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol) -> thrift::Result { +impl crate::thrift::TSerializable for FileCryptoMetaData { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option = None; let mut f_2: Option> = None; @@ -5102,7 +5102,7 @@ impl TSerializable for FileCryptoMetaData { }; Ok(ret) } - fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol) -> thrift::Result<()> { + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { let struct_ident = TStructIdentifier::new("FileCryptoMetaData"); o_prot.write_struct_begin(&struct_ident)?; o_prot.write_field_begin(&TFieldIdentifier::new("encryption_algorithm", TType::Struct, 1))?; diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 2371f8837bb0..f1612c90cc2a 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -88,3 +88,5 @@ pub mod bloom_filter; pub mod file; pub mod record; pub mod schema; + +pub mod thrift; diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs new file mode 100644 index 000000000000..57f52edc6ef0 --- /dev/null +++ b/parquet/src/thrift.rs @@ -0,0 +1,284 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Custom thrift definitions + +use thrift::protocol::{ + TFieldIdentifier, TInputProtocol, TListIdentifier, TMapIdentifier, + TMessageIdentifier, TOutputProtocol, TSetIdentifier, TStructIdentifier, TType, +}; + +/// Reads and writes the struct to Thrift protocols. +/// +/// Unlike [`thrift::protocol::TSerializable`] this uses generics instead of trait objects +pub trait TSerializable: Sized { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result; + fn write_to_out_protocol( + &self, + o_prot: &mut T, + ) -> thrift::Result<()>; +} + +/// A more performant implementation of [`TCompactInputProtocol`] that reads a slice +/// +/// [`TCompactInputProtocol`]: thrift::protocol::TCompactInputProtocol +pub(crate) struct TCompactSliceInputProtocol<'a> { + buf: &'a [u8], + // Identifier of the last field deserialized for a struct. + last_read_field_id: i16, + // Stack of the last read field ids (a new entry is added each time a nested struct is read). + read_field_id_stack: Vec, + // Boolean value for a field. + // Saved because boolean fields and their value are encoded in a single byte, + // and reading the field only occurs after the field id is read. 
+ pending_read_bool_value: Option, +} + +impl<'a> TCompactSliceInputProtocol<'a> { + pub fn new(buf: &'a [u8]) -> Self { + Self { + buf, + last_read_field_id: 0, + read_field_id_stack: Vec::with_capacity(16), + pending_read_bool_value: None, + } + } + + pub fn as_slice(&self) -> &'a [u8] { + self.buf + } + + fn read_vlq(&mut self) -> thrift::Result { + let mut in_progress = 0; + let mut shift = 0; + loop { + let byte = self.read_byte()?; + in_progress |= ((byte & 0x7F) as u64) << shift; + shift += 7; + if byte & 0x80 == 0 { + return Ok(in_progress); + } + } + } + + fn read_zig_zag(&mut self) -> thrift::Result { + let val = self.read_vlq()?; + Ok((val >> 1) as i64 ^ -((val & 1) as i64)) + } + + fn read_list_set_begin(&mut self) -> thrift::Result<(TType, i32)> { + let header = self.read_byte()?; + let element_type = collection_u8_to_type(header & 0x0F)?; + + let possible_element_count = (header & 0xF0) >> 4; + let element_count = if possible_element_count != 15 { + // high bits set high if count and type encoded separately + possible_element_count as i32 + } else { + self.read_vlq()? as _ + }; + + Ok((element_type, element_count)) + } +} + +impl<'a> TInputProtocol for TCompactSliceInputProtocol<'a> { + fn read_message_begin(&mut self) -> thrift::Result { + unimplemented!() + } + + fn read_message_end(&mut self) -> thrift::Result<()> { + unimplemented!() + } + + fn read_struct_begin(&mut self) -> thrift::Result> { + self.read_field_id_stack.push(self.last_read_field_id); + self.last_read_field_id = 0; + Ok(None) + } + + fn read_struct_end(&mut self) -> thrift::Result<()> { + self.last_read_field_id = self + .read_field_id_stack + .pop() + .expect("should have previous field ids"); + Ok(()) + } + + fn read_field_begin(&mut self) -> thrift::Result { + // we can read at least one byte, which is: + // - the type + // - the field delta and the type + let field_type = self.read_byte()?; + let field_delta = (field_type & 0xF0) >> 4; + let field_type = match field_type & 0x0F { + 0x01 => { + self.pending_read_bool_value = Some(true); + Ok(TType::Bool) + } + 0x02 => { + self.pending_read_bool_value = Some(false); + Ok(TType::Bool) + } + ttu8 => u8_to_type(ttu8), + }?; + + match field_type { + TType::Stop => Ok( + TFieldIdentifier::new::, String, Option>( + None, + TType::Stop, + None, + ), + ), + _ => { + if field_delta != 0 { + self.last_read_field_id += field_delta as i16; + } else { + self.last_read_field_id = self.read_i16()?; + }; + + Ok(TFieldIdentifier { + name: None, + field_type, + id: Some(self.last_read_field_id), + }) + } + } + } + + fn read_field_end(&mut self) -> thrift::Result<()> { + Ok(()) + } + + fn read_bool(&mut self) -> thrift::Result { + match self.pending_read_bool_value.take() { + Some(b) => Ok(b), + None => { + let b = self.read_byte()?; + match b { + 0x01 => Ok(true), + 0x02 => Ok(false), + unkn => Err(thrift::Error::Protocol(thrift::ProtocolError { + kind: thrift::ProtocolErrorKind::InvalidData, + message: format!("cannot convert {} into bool", unkn), + })), + } + } + } + } + + fn read_bytes(&mut self) -> thrift::Result> { + let len = self.read_vlq()? as usize; + let ret = self.buf.get(..len).ok_or_else(eof_error)?.to_vec(); + self.buf = &self.buf[len..]; + Ok(ret) + } + + fn read_i8(&mut self) -> thrift::Result { + Ok(self.read_byte()? as _) + } + + fn read_i16(&mut self) -> thrift::Result { + Ok(self.read_zig_zag()? as _) + } + + fn read_i32(&mut self) -> thrift::Result { + Ok(self.read_zig_zag()? 
as _) + } + + fn read_i64(&mut self) -> thrift::Result { + self.read_zig_zag() + } + + fn read_double(&mut self) -> thrift::Result { + let slice = (self.buf[..8]).try_into().unwrap(); + self.buf = &self.buf[8..]; + Ok(f64::from_le_bytes(slice)) + } + + fn read_string(&mut self) -> thrift::Result { + let bytes = self.read_bytes()?; + String::from_utf8(bytes).map_err(From::from) + } + + fn read_list_begin(&mut self) -> thrift::Result { + let (element_type, element_count) = self.read_list_set_begin()?; + Ok(TListIdentifier::new(element_type, element_count)) + } + + fn read_list_end(&mut self) -> thrift::Result<()> { + Ok(()) + } + + fn read_set_begin(&mut self) -> thrift::Result { + unimplemented!() + } + + fn read_set_end(&mut self) -> thrift::Result<()> { + unimplemented!() + } + + fn read_map_begin(&mut self) -> thrift::Result { + unimplemented!() + } + + fn read_map_end(&mut self) -> thrift::Result<()> { + Ok(()) + } + + #[inline] + fn read_byte(&mut self) -> thrift::Result { + let ret = *self.buf.first().ok_or_else(eof_error)?; + self.buf = &self.buf[1..]; + Ok(ret) + } +} + +fn collection_u8_to_type(b: u8) -> thrift::Result { + match b { + 0x01 => Ok(TType::Bool), + o => u8_to_type(o), + } +} + +fn u8_to_type(b: u8) -> thrift::Result { + match b { + 0x00 => Ok(TType::Stop), + 0x03 => Ok(TType::I08), // equivalent to TType::Byte + 0x04 => Ok(TType::I16), + 0x05 => Ok(TType::I32), + 0x06 => Ok(TType::I64), + 0x07 => Ok(TType::Double), + 0x08 => Ok(TType::String), + 0x09 => Ok(TType::List), + 0x0A => Ok(TType::Set), + 0x0B => Ok(TType::Map), + 0x0C => Ok(TType::Struct), + unkn => Err(thrift::Error::Protocol(thrift::ProtocolError { + kind: thrift::ProtocolErrorKind::InvalidData, + message: format!("cannot convert {} into TType", unkn), + })), + } +} + +fn eof_error() -> thrift::Error { + thrift::Error::Transport(thrift::TransportError { + kind: thrift::TransportErrorKind::EndOfFile, + message: "Unexpected EOF".to_string(), + }) +} From d3e1302a69518bd6ac85c364ae5adaebc89c123a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Oct 2023 15:16:37 +0100 Subject: [PATCH 1262/1411] Update regex-syntax requirement from 0.7.1 to 0.8.0 (#4914) Updates the requirements on [regex-syntax](https://github.com/rust-lang/regex) to permit the latest version. - [Release notes](https://github.com/rust-lang/regex/releases) - [Changelog](https://github.com/rust-lang/regex/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-lang/regex/commits) --- updated-dependencies: - dependency-name: regex-syntax dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-string/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index e1163dc03eab..1ae7af8bdf41 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -40,5 +40,5 @@ arrow-schema = { workspace = true } arrow-array = { workspace = true } arrow-select = { workspace = true } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } -regex-syntax = { version = "0.7.1", default-features = false, features = ["unicode"] } +regex-syntax = { version = "0.8.0", default-features = false, features = ["unicode"] } num = { version = "0.4", default-features = false, features = ["std"] } From 4aabd2c5ded46d529cce0714a776f0a6336a9a89 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Tue, 10 Oct 2023 20:14:14 +0200 Subject: [PATCH 1263/1411] feat: document & streamline flight SQL CLI (#4912) - add docs to README - add a few more clap features - document arguments - unify key-value parsing for headers and parameters --- arrow-flight/Cargo.toml | 2 +- arrow-flight/README.md | 32 +++++++- arrow-flight/src/bin/flight_sql_client.rs | 97 ++++++++++++----------- 3 files changed, 81 insertions(+), 50 deletions(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index edaa7129dc9a..70227eedea0e 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -50,7 +50,7 @@ tonic = { version = "0.10.0", default-features = false, features = ["transport", # CLI-related dependencies anyhow = { version = "1.0", optional = true } -clap = { version = "4.1", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } +clap = { version = "4.4.6", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage", "wrap_help", "color", "suggestions"], optional = true } tracing-log = { version = "0.1", optional = true } tracing-subscriber = { version = "0.3.1", default-features = false, features = ["ansi", "env-filter", "fmt"], optional = true } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 9194b209fe72..b80772ac927e 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -44,5 +44,33 @@ that demonstrate how to build a Flight server implemented with [tonic](https://d ## Feature Flags - `flight-sql-experimental`: Enables experimental support for - [Apache Arrow FlightSQL](https://arrow.apache.org/docs/format/FlightSql.html), - a protocol for interacting with SQL databases. + [Apache Arrow FlightSQL], a protocol for interacting with SQL databases. + +## CLI + +This crates offers a basic [Apache Arrow FlightSQL] command line interface. + +The client can be installed from the repository: + +```console +$ cargo install --features=cli,flight-sql-experimental,tls --bin=flight_sql_client --path=. 
--locked +``` + +The client comes with extensive help text: + +```console +$ flight_sql_client help +``` + +A query can be executed using: + +```console +$ flight_sql_client --host example.com statement-query "SELECT 1;" ++----------+ +| Int64(1) | ++----------+ +| 1 | ++----------+ +``` + +[apache arrow flightsql]: https://arrow.apache.org/docs/format/FlightSql.html diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs index df51530b3c8f..296efc1c308e 100644 --- a/arrow-flight/src/bin/flight_sql_client.rs +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::{error::Error, sync::Arc, time::Duration}; +use std::{sync::Arc, time::Duration}; use anyhow::{bail, Context, Result}; use arrow_array::{ArrayRef, Datum, RecordBatch, StringArray}; @@ -30,45 +30,17 @@ use tonic::{ }; use tracing_log::log::info; -/// A ':' separated key value pair -#[derive(Debug, Clone)] -struct KeyValue { - pub key: K, - pub value: V, -} - -impl std::str::FromStr for KeyValue -where - K: std::str::FromStr, - V: std::str::FromStr, - K::Err: std::fmt::Display, - V::Err: std::fmt::Display, -{ - type Err = String; - - fn from_str(s: &str) -> std::result::Result { - let parts = s.splitn(2, ':').collect::>(); - match parts.as_slice() { - [key, value] => { - let key = K::from_str(key).map_err(|e| e.to_string())?; - let value = V::from_str(value.trim()).map_err(|e| e.to_string())?; - Ok(Self { key, value }) - } - _ => Err(format!( - "Invalid key value pair - expected 'KEY:VALUE' got '{s}'" - )), - } - } -} - /// Logging CLI config. #[derive(Debug, Parser)] pub struct LoggingArgs { /// Log verbosity. /// - /// Use `-v for warn, `-vv for info, -vvv for debug, -vvvv for trace. + /// Defaults to "warn". /// - /// Note you can also set logging level using `RUST_LOG` environment variable: `RUST_LOG=debug` + /// Use `-v` for "info", `-vv` for "debug", `-vvv` for "trace". + /// + /// Note you can also set logging level using `RUST_LOG` environment variable: + /// `RUST_LOG=debug`. #[clap( short = 'v', long = "verbose", @@ -81,16 +53,22 @@ pub struct LoggingArgs { struct ClientArgs { /// Additional headers. /// - /// Values should be key value pairs separated by ':' - #[clap(long, value_delimiter = ',')] - headers: Vec>, + /// Can be given multiple times. Headers and values are separated by '='. + /// + /// Example: `-H foo=bar -H baz=42` + #[clap(long = "header", short = 'H', value_parser = parse_key_val)] + headers: Vec<(String, String)>, - /// Username - #[clap(long)] + /// Username. + /// + /// Optional. If given, `password` must also be set. + #[clap(long, requires = "password")] username: Option, - /// Password - #[clap(long)] + /// Password. + /// + /// Optional. If given, `username` must also be set. + #[clap(long, requires = "username")] password: Option, /// Auth token. @@ -98,14 +76,20 @@ struct ClientArgs { token: Option, /// Use TLS. + /// + /// If not provided, use cleartext connection. #[clap(long)] tls: bool, /// Server host. + /// + /// Required. #[clap(long)] host: String, /// Server port. + /// + /// Defaults to `443` if `tls` is set, otherwise defaults to `80`. #[clap(long)] port: Option, } @@ -124,13 +108,34 @@ struct Args { cmd: Command, } +/// Different available commands. #[derive(Debug, Subcommand)] enum Command { + /// Execute given statement. StatementQuery { + /// SQL query. + /// + /// Required. 
query: String, }, + + /// Prepare given statement and then execute it. PreparedStatementQuery { + /// SQL query. + /// + /// Required. + /// + /// Can contains placeholders like `$1`. + /// + /// Example: `SELECT * FROM t WHERE x = $1` query: String, + + /// Additional parameters. + /// + /// Can be given multiple times. Names and values are separated by '='. Values will be + /// converted to the type that the server reported for the prepared statement. + /// + /// Example: `-p $1=42` #[clap(short, value_parser = parse_key_val)] params: Vec<(String, String)>, }, @@ -284,8 +289,8 @@ async fn setup_client(args: ClientArgs) -> Result Result Result<(String, String), Box> { +fn parse_key_val(s: &str) -> Result<(String, String), String> { let pos = s .find('=') .ok_or_else(|| format!("invalid KEY=value: no `=` found in `{s}`"))?; - Ok((s[..pos].parse()?, s[pos + 1..].parse()?)) + Ok((s[..pos].to_owned(), s[pos + 1..].to_owned())) } /// Log headers/trailers. From 181cb3d66e33c689be31292646ae63879cf0c134 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 11 Oct 2023 02:57:45 -0400 Subject: [PATCH 1264/1411] Minor: Clarify rationale for FlightDataEncoder API, add examples (#4916) --- arrow-flight/src/encode.rs | 48 ++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index cd2ee7c02b68..28c181c0d5fd 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -30,6 +30,11 @@ use futures::{ready, stream::BoxStream, Stream, StreamExt}; /// This can be used to implement [`FlightService::do_get`] in an /// Arrow Flight implementation; /// +/// This structure encodes a stream of `Result`s rather than `RecordBatch`es to +/// propagate errors from streaming execution, where the generation of the +/// `RecordBatch`es is incremental, and an error may occur even after +/// several have already been successfully produced. +/// /// # Caveats /// 1. [`DictionaryArray`](arrow_array::array::DictionaryArray)s /// are converted to their underlying types prior to transport, due to @@ -41,14 +46,14 @@ use futures::{ready, stream::BoxStream, Stream, StreamExt}; /// # use arrow_array::{ArrayRef, RecordBatch, UInt32Array}; /// # async fn f() { /// # let c1 = UInt32Array::from(vec![1, 2, 3, 4, 5, 6]); -/// # let record_batch = RecordBatch::try_from_iter(vec![ +/// # let batch = RecordBatch::try_from_iter(vec![ /// # ("a", Arc::new(c1) as ArrayRef) /// # ]) /// # .expect("cannot create record batch"); /// use arrow_flight::encode::FlightDataEncoderBuilder; /// /// // Get an input stream of Result -/// let input_stream = futures::stream::iter(vec![Ok(record_batch)]); +/// let input_stream = futures::stream::iter(vec![Ok(batch)]); /// /// // Build a stream of `Result` (e.g. 
to return for do_get) /// let flight_data_stream = FlightDataEncoderBuilder::new() @@ -59,6 +64,39 @@ use futures::{ready, stream::BoxStream, Stream, StreamExt}; /// # } /// ``` /// +/// # Example: Sending `Vec` +/// +/// You can create a [`Stream`] to pass to [`Self::build`] from an existing +/// `Vec` of `RecordBatch`es like this: +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, RecordBatch, UInt32Array}; +/// # async fn f() { +/// # fn make_batches() -> Vec { +/// # let c1 = UInt32Array::from(vec![1, 2, 3, 4, 5, 6]); +/// # let batch = RecordBatch::try_from_iter(vec![ +/// # ("a", Arc::new(c1) as ArrayRef) +/// # ]) +/// # .expect("cannot create record batch"); +/// # vec![batch.clone(), batch.clone()] +/// # } +/// use arrow_flight::encode::FlightDataEncoderBuilder; +/// +/// // Get batches that you want to send via Flight +/// let batches: Vec = make_batches(); +/// +/// // Create an input stream of Result +/// let input_stream = futures::stream::iter( +/// batches.into_iter().map(Ok) +/// ); +/// +/// // Build a stream of `Result` (e.g. to return for do_get) +/// let flight_data_stream = FlightDataEncoderBuilder::new() +/// .build(input_stream); +/// # } +/// ``` +/// /// [`FlightService::do_get`]: crate::flight_service_server::FlightService::do_get /// [`FlightError`]: crate::error::FlightError #[derive(Debug)] @@ -146,8 +184,10 @@ impl FlightDataEncoderBuilder { self } - /// Return a [`Stream`] of [`FlightData`], - /// consuming self. More details on [`FlightDataEncoder`] + /// Takes a [`Stream`] of [`Result`] and returns a [`Stream`] + /// of [`FlightData`], consuming self. + /// + /// See example on [`Self`] and [`FlightDataEncoder`] for more details pub fn build(self, input: S) -> FlightDataEncoder where S: Stream> + Send + 'static, From d83008bc035d6bc724b79fcf363b96d1a5e11ce5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 11 Oct 2023 07:57:59 +0100 Subject: [PATCH 1265/1411] Cleanup `object_store::retry` client error handling (#4915) * Cleanup client error handling * Clippy * Format * Update test * Review feedback --- object_store/src/client/retry.rs | 180 ++++++++++++++++--------------- object_store/src/gcp/mod.rs | 2 +- 2 files changed, 96 insertions(+), 86 deletions(-) diff --git a/object_store/src/client/retry.rs b/object_store/src/client/retry.rs index 39a913142e09..e4d246c87a2a 100644 --- a/object_store/src/client/retry.rs +++ b/object_store/src/client/retry.rs @@ -23,46 +23,50 @@ use futures::FutureExt; use reqwest::header::LOCATION; use reqwest::{Response, StatusCode}; use snafu::Error as SnafuError; +use snafu::Snafu; use std::time::{Duration, Instant}; use tracing::info; /// Retry request error -#[derive(Debug)] -pub struct Error { - retries: usize, - message: String, - source: Option, - status: Option, -} - -impl std::fmt::Display for Error { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "response error \"{}\", after {} retries", - self.message, self.retries - )?; - if let Some(source) = &self.source { - write!(f, ": {source}")?; - } - Ok(()) - } -} - -impl std::error::Error for Error { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - self.source.as_ref().map(|e| e as _) - } +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Received redirect without LOCATION, this normally indicates an incorrectly configured region"))] + BareRedirect, + + #[snafu(display("Client error with status {status}: {}", 
body.as_deref().unwrap_or("No Body")))] + Client { + status: StatusCode, + body: Option, + }, + + #[snafu(display("Error after {retries} retries: {source}"))] + Reqwest { + retries: usize, + source: reqwest::Error, + }, } impl Error { /// Returns the status code associated with this error if any pub fn status(&self) -> Option { - self.status + match self { + Self::BareRedirect => None, + Self::Client { status, .. } => Some(*status), + Self::Reqwest { source, .. } => source.status(), + } + } + + /// Returns the error body if any + pub fn body(&self) -> Option<&str> { + match self { + Self::Client { body, .. } => body.as_deref(), + Self::BareRedirect => None, + Self::Reqwest { .. } => None, + } } pub fn error(self, store: &'static str, path: String) -> crate::Error { - match self.status { + match self.status() { Some(StatusCode::NOT_FOUND) => crate::Error::NotFound { path, source: Box::new(self), @@ -86,16 +90,19 @@ impl Error { impl From for std::io::Error { fn from(err: Error) -> Self { use std::io::ErrorKind; - match (&err.source, err.status()) { - (Some(source), _) if source.is_builder() || source.is_request() => { - Self::new(ErrorKind::InvalidInput, err) - } - (_, Some(StatusCode::NOT_FOUND)) => Self::new(ErrorKind::NotFound, err), - (_, Some(StatusCode::BAD_REQUEST)) => Self::new(ErrorKind::InvalidInput, err), - (Some(source), None) if source.is_timeout() => { + match &err { + Error::Client { + status: StatusCode::NOT_FOUND, + .. + } => Self::new(ErrorKind::NotFound, err), + Error::Client { + status: StatusCode::BAD_REQUEST, + .. + } => Self::new(ErrorKind::InvalidInput, err), + Error::Reqwest { source, .. } if source.is_timeout() => { Self::new(ErrorKind::TimedOut, err) } - (Some(source), None) if source.is_connect() => { + Error::Reqwest { source, .. 
} if source.is_connect() => { Self::new(ErrorKind::NotConnected, err) } _ => Self::new(ErrorKind::Other, err), @@ -169,27 +176,21 @@ impl RetryExt for reqwest::RequestBuilder { Ok(r) => match r.error_for_status_ref() { Ok(_) if r.status().is_success() => return Ok(r), Ok(r) if r.status() == StatusCode::NOT_MODIFIED => { - return Err(Error{ - message: "not modified".to_string(), - retries, - status: Some(r.status()), - source: None, + return Err(Error::Client { + body: None, + status: StatusCode::NOT_MODIFIED, }) } Ok(r) => { let is_bare_redirect = r.status().is_redirection() && !r.headers().contains_key(LOCATION); - let message = match is_bare_redirect { - true => "Received redirect without LOCATION, this normally indicates an incorrectly configured region".to_string(), + return match is_bare_redirect { + true => Err(Error::BareRedirect), // Not actually sure if this is reachable, but here for completeness - false => format!("request unsuccessful: {}", r.status()), - }; - - return Err(Error{ - message, - retries, - status: Some(r.status()), - source: None, - }) + false => Err(Error::Client { + body: None, + status: r.status(), + }) + } } Err(e) => { let status = r.status(); @@ -198,23 +199,26 @@ impl RetryExt for reqwest::RequestBuilder { || now.elapsed() > retry_timeout || !status.is_server_error() { - // Get the response message if returned a client error - let message = match status.is_client_error() { + return Err(match status.is_client_error() { true => match r.text().await { - Ok(message) if !message.is_empty() => message, - Ok(_) => "No Body".to_string(), - Err(e) => format!("error getting response body: {e}") + Ok(body) => { + Error::Client { + body: Some(body).filter(|b| !b.is_empty()), + status, + } + } + Err(e) => { + Error::Reqwest { + retries, + source: e, + } + } } - false => status.to_string(), - }; - - return Err(Error{ - message, - retries, - status: Some(status), - source: Some(e), - }) - + false => Error::Reqwest { + retries, + source: e, + } + }); } let sleep = backoff.next(); @@ -238,16 +242,14 @@ impl RetryExt for reqwest::RequestBuilder { || now.elapsed() > retry_timeout || !do_retry { - return Err(Error{ + return Err(Error::Reqwest { retries, - message: "request error".to_string(), - status: e.status(), - source: Some(e), + source: e, }) } let sleep = backoff.next(); retries += 1; - info!("Encountered request error ({}) backing off for {} seconds, retry {} of {}", e, sleep.as_secs_f32(), retries, max_retries); + info!("Encountered transport error ({}) backing off for {} seconds, retry {} of {}", e, sleep.as_secs_f32(), retries, max_retries); tokio::time::sleep(sleep).await; } } @@ -260,7 +262,7 @@ impl RetryExt for reqwest::RequestBuilder { #[cfg(test)] mod tests { use crate::client::mock_server::MockServer; - use crate::client::retry::RetryExt; + use crate::client::retry::{Error, RetryExt}; use crate::RetryConfig; use hyper::header::LOCATION; use hyper::{Body, Response}; @@ -294,8 +296,11 @@ mod tests { let e = do_request().await.unwrap_err(); assert_eq!(e.status().unwrap(), StatusCode::BAD_REQUEST); - assert_eq!(e.retries, 0); - assert_eq!(&e.message, "cupcakes"); + assert_eq!(e.body(), Some("cupcakes")); + assert_eq!( + e.to_string(), + "Client error with status 400 Bad Request: cupcakes" + ); // Handles client errors with no payload mock.push( @@ -307,8 +312,11 @@ mod tests { let e = do_request().await.unwrap_err(); assert_eq!(e.status().unwrap(), StatusCode::BAD_REQUEST); - assert_eq!(e.retries, 0); - assert_eq!(&e.message, "No Body"); + assert_eq!(e.body(), 
None); + assert_eq!( + e.to_string(), + "Client error with status 400 Bad Request: No Body" + ); // Should retry server error request mock.push( @@ -381,7 +389,8 @@ mod tests { ); let e = do_request().await.unwrap_err(); - assert_eq!(e.message, "Received redirect without LOCATION, this normally indicates an incorrectly configured region"); + assert!(matches!(e, Error::BareRedirect)); + assert_eq!(e.to_string(), "Received redirect without LOCATION, this normally indicates an incorrectly configured region"); // Gives up after the retrying the specified number of times for _ in 0..=retry.max_retries { @@ -393,22 +402,23 @@ mod tests { ); } - let e = do_request().await.unwrap_err(); - assert_eq!(e.retries, retry.max_retries); - assert_eq!(e.message, "502 Bad Gateway"); + let e = do_request().await.unwrap_err().to_string(); + assert!(e.starts_with("Error after 2 retries: HTTP status server error (502 Bad Gateway) for url"), "{e}"); // Panic results in an incomplete message error in the client mock.push_fn(|_| panic!()); let r = do_request().await.unwrap(); assert_eq!(r.status(), StatusCode::OK); - // Gives up after retrying mulitiple panics + // Gives up after retrying multiple panics for _ in 0..=retry.max_retries { mock.push_fn(|_| panic!()); } - let e = do_request().await.unwrap_err(); - assert_eq!(e.retries, retry.max_retries); - assert_eq!(e.message, "request error"); + let e = do_request().await.unwrap_err().to_string(); + assert!( + e.starts_with("Error after 2 retries: error sending request for url"), + "{e}" + ); // Shutdown mock.shutdown().await diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 3f5bf629d180..a0a60f27a6aa 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -1215,7 +1215,7 @@ mod test { .unwrap_err() .to_string(); assert!( - err.contains("HTTP status client error (404 Not Found)"), + err.contains("Client error with status 404 Not Found"), "{}", err ) From 556c5ff8193665bc4cbba80505d517f4d5b8b601 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 11 Oct 2023 07:58:20 +0100 Subject: [PATCH 1266/1411] Cleanup CSV WriterBuilder, Default to AutoSI Second Precision (#4735) (#4909) * Cleanup CSV WriterBuilder (#4735) * Update test * Review feedback * Clippy --- arrow-csv/src/writer.rs | 134 +++++++++++++++++++++++----------------- arrow/tests/csv.rs | 42 ------------- 2 files changed, 79 insertions(+), 97 deletions(-) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 840e8e8a93cc..1ca956e2c73f 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -70,11 +70,6 @@ use csv::ByteRecord; use std::io::Write; use crate::map_csv_error; - -const DEFAULT_DATE_FORMAT: &str = "%F"; -const DEFAULT_TIME_FORMAT: &str = "%T"; -const DEFAULT_TIMESTAMP_FORMAT: &str = "%FT%H:%M:%S.%9f"; -const DEFAULT_TIMESTAMP_TZ_FORMAT: &str = "%FT%H:%M:%S.%9f%:z"; const DEFAULT_NULL_VALUE: &str = ""; /// A CSV writer @@ -82,41 +77,29 @@ const DEFAULT_NULL_VALUE: &str = ""; pub struct Writer { /// The object to write to writer: csv::Writer, - /// Whether file should be written with headers. 
Defaults to `true` + /// Whether file should be written with headers, defaults to `true` has_headers: bool, - /// The date format for date arrays + /// The date format for date arrays, defaults to RFC3339 date_format: Option, - /// The datetime format for datetime arrays + /// The datetime format for datetime arrays, defaults to RFC3339 datetime_format: Option, - /// The timestamp format for timestamp arrays + /// The timestamp format for timestamp arrays, defaults to RFC3339 timestamp_format: Option, - /// The timestamp format for timestamp (with timezone) arrays + /// The timestamp format for timestamp (with timezone) arrays, defaults to RFC3339 timestamp_tz_format: Option, - /// The time format for time arrays + /// The time format for time arrays, defaults to RFC3339 time_format: Option, /// Is the beginning-of-writer beginning: bool, - /// The value to represent null entries - null_value: String, + /// The value to represent null entries, defaults to [`DEFAULT_NULL_VALUE`] + null_value: Option, } impl Writer { /// Create a new CsvWriter from a writable object, with default options pub fn new(writer: W) -> Self { let delimiter = b','; - let mut builder = csv::WriterBuilder::new(); - let writer = builder.delimiter(delimiter).from_writer(writer); - Writer { - writer, - has_headers: true, - date_format: Some(DEFAULT_DATE_FORMAT.to_string()), - datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), - time_format: Some(DEFAULT_TIME_FORMAT.to_string()), - timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), - timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), - beginning: true, - null_value: DEFAULT_NULL_VALUE.to_string(), - } + WriterBuilder::new().with_delimiter(delimiter).build(writer) } /// Write a vector of record batches to a writable object @@ -138,7 +121,7 @@ impl Writer { } let options = FormatOptions::default() - .with_null(&self.null_value) + .with_null(self.null_value.as_deref().unwrap_or(DEFAULT_NULL_VALUE)) .with_date_format(self.date_format.as_deref()) .with_datetime_format(self.datetime_format.as_deref()) .with_timestamp_format(self.timestamp_format.as_deref()) @@ -207,9 +190,9 @@ impl RecordBatchWriter for Writer { #[derive(Clone, Debug)] pub struct WriterBuilder { /// Optional column delimiter. Defaults to `b','` - delimiter: Option, + delimiter: u8, /// Whether to write column names as file headers. 
Defaults to `true` - has_headers: bool, + has_header: bool, /// Optional date format for date arrays date_format: Option, /// Optional datetime format for datetime arrays @@ -227,14 +210,14 @@ pub struct WriterBuilder { impl Default for WriterBuilder { fn default() -> Self { Self { - has_headers: true, - delimiter: None, - date_format: Some(DEFAULT_DATE_FORMAT.to_string()), - datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), - time_format: Some(DEFAULT_TIME_FORMAT.to_string()), - timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), - timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), - null_value: Some(DEFAULT_NULL_VALUE.to_string()), + has_header: true, + delimiter: b',', + date_format: None, + datetime_format: None, + time_format: None, + timestamp_format: None, + timestamp_tz_format: None, + null_value: None, } } } @@ -254,7 +237,7 @@ impl WriterBuilder { /// let file = File::create("target/out.csv").unwrap(); /// /// // create a builder that doesn't write headers - /// let builder = WriterBuilder::new().has_headers(false); + /// let builder = WriterBuilder::new().with_header(false); /// let writer = builder.build(file); /// /// writer @@ -265,48 +248,92 @@ impl WriterBuilder { } /// Set whether to write headers + #[deprecated(note = "Use Self::with_header")] + #[doc(hidden)] pub fn has_headers(mut self, has_headers: bool) -> Self { - self.has_headers = has_headers; + self.has_header = has_headers; + self + } + + /// Set whether to write the CSV file with a header + pub fn with_header(mut self, header: bool) -> Self { + self.has_header = header; self } + /// Returns `true` if this writer is configured to write a header + pub fn header(&self) -> bool { + self.has_header + } + /// Set the CSV file's column delimiter as a byte character pub fn with_delimiter(mut self, delimiter: u8) -> Self { - self.delimiter = Some(delimiter); + self.delimiter = delimiter; self } + /// Get the CSV file's column delimiter as a byte character + pub fn delimiter(&self) -> u8 { + self.delimiter + } + /// Set the CSV file's date format pub fn with_date_format(mut self, format: String) -> Self { self.date_format = Some(format); self } + /// Get the CSV file's date format if set, defaults to RFC3339 + pub fn date_format(&self) -> Option<&str> { + self.date_format.as_deref() + } + /// Set the CSV file's datetime format pub fn with_datetime_format(mut self, format: String) -> Self { self.datetime_format = Some(format); self } + /// Get the CSV file's datetime format if set, defaults to RFC3339 + pub fn datetime_format(&self) -> Option<&str> { + self.datetime_format.as_deref() + } + /// Set the CSV file's time format pub fn with_time_format(mut self, format: String) -> Self { self.time_format = Some(format); self } + /// Get the CSV file's datetime time if set, defaults to RFC3339 + pub fn time_format(&self) -> Option<&str> { + self.time_format.as_deref() + } + /// Set the CSV file's timestamp format pub fn with_timestamp_format(mut self, format: String) -> Self { self.timestamp_format = Some(format); self } + /// Get the CSV file's timestamp format if set, defaults to RFC3339 + pub fn timestamp_format(&self) -> Option<&str> { + self.timestamp_format.as_deref() + } + /// Set the value to represent null in output pub fn with_null(mut self, null_value: String) -> Self { self.null_value = Some(null_value); self } - /// Use RFC3339 format for date/time/timestamps + /// Get the value to represent null in output + pub fn null(&self) -> &str { + 
self.null_value.as_deref().unwrap_or(DEFAULT_NULL_VALUE) + } + + /// Use RFC3339 format for date/time/timestamps (default) + #[deprecated(note = "Use WriterBuilder::default()")] pub fn with_rfc3339(mut self) -> Self { self.date_format = None; self.datetime_format = None; @@ -318,21 +345,18 @@ impl WriterBuilder { /// Create a new `Writer` pub fn build(self, writer: W) -> Writer { - let delimiter = self.delimiter.unwrap_or(b','); let mut builder = csv::WriterBuilder::new(); - let writer = builder.delimiter(delimiter).from_writer(writer); + let writer = builder.delimiter(self.delimiter).from_writer(writer); Writer { writer, - has_headers: self.has_headers, + beginning: true, + has_headers: self.has_header, date_format: self.date_format, datetime_format: self.datetime_format, time_format: self.time_format, timestamp_format: self.timestamp_format, timestamp_tz_format: self.timestamp_tz_format, - beginning: true, - null_value: self - .null_value - .unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()), + null_value: self.null_value, } } } @@ -411,11 +435,11 @@ mod tests { let expected = r#"c1,c2,c3,c4,c5,c6,c7 Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo +consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes +sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo +consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes +sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo "#; assert_eq!(expected.to_string(), String::from_utf8(buffer).unwrap()); } @@ -512,7 +536,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo let mut file = tempfile::tempfile().unwrap(); let builder = WriterBuilder::new() - .has_headers(false) + .with_header(false) .with_delimiter(b'|') .with_null("NULL".to_string()) .with_time_format("%r".to_string()); @@ -560,7 +584,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo ) .unwrap(); - let builder = WriterBuilder::new().has_headers(false); + let builder = WriterBuilder::new().with_header(false); let mut buf: Cursor> = Default::default(); // drop the writer early to release the borrow. 
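Taken together, the reworked builder leaves every temporal format unset by default and falls back to RFC3339, only emitting a custom format when one is explicitly configured. A short sketch of typical usage after this change (illustrative only, not code from the patch; the `write_csv` helper is hypothetical and the record batch is assumed to come from elsewhere):

```rust
// Sketch (not part of the patch): the renamed/added WriterBuilder options.
// With no explicit date/time formats configured, output now uses RFC3339.
use arrow_array::RecordBatch;
use arrow_csv::WriterBuilder;
use arrow_schema::ArrowError;

fn write_csv(batch: &RecordBatch) -> Result<Vec<u8>, ArrowError> {
    let mut buf = Vec::new();
    let mut writer = WriterBuilder::new()
        .with_header(true)            // replaces the deprecated has_headers(true)
        .with_delimiter(b';')
        .with_null("NULL".to_string())
        .build(&mut buf);
    writer.write(batch)?;
    drop(writer);                     // release the mutable borrow of `buf`
    Ok(buf)
}
```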
@@ -652,7 +676,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo let mut file = tempfile::tempfile().unwrap(); - let builder = WriterBuilder::new().with_rfc3339(); + let builder = WriterBuilder::new(); let mut writer = builder.build(&mut file); let batches = vec![&batch]; for batch in batches { diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs index 3ee319101757..a79b6b44c2d3 100644 --- a/arrow/tests/csv.rs +++ b/arrow/tests/csv.rs @@ -53,48 +53,6 @@ fn test_export_csv_timestamps() { } drop(writer); - let left = "c1,c2 -2019-04-18T20:54:47.378000000+10:00,2019-04-18T10:54:47.378000000 -2021-10-30T17:59:07.000000000+11:00,2021-10-30T06:59:07.000000000\n"; - let right = String::from_utf8(sw).unwrap(); - assert_eq!(left, right); -} - -#[test] -fn test_export_csv_timestamps_using_rfc3339() { - let schema = Schema::new(vec![ - Field::new( - "c1", - DataType::Timestamp(TimeUnit::Millisecond, Some("Australia/Sydney".into())), - true, - ), - Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None), true), - ]); - - let c1 = TimestampMillisecondArray::from( - // 1555584887 converts to 2019-04-18, 20:54:47 in time zone Australia/Sydney (AEST). - // The offset (difference to UTC) is +10:00. - // 1635577147 converts to 2021-10-30 17:59:07 in time zone Australia/Sydney (AEDT) - // The offset (difference to UTC) is +11:00. Note that daylight savings is in effect on 2021-10-30. - // - vec![Some(1555584887378), Some(1635577147000)], - ) - .with_timezone("Australia/Sydney"); - let c2 = - TimestampMillisecondArray::from(vec![Some(1555584887378), Some(1635577147000)]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); - - let mut sw = Vec::new(); - let mut writer = arrow_csv::WriterBuilder::new() - .with_rfc3339() - .build(&mut sw); - let batches = vec![&batch]; - for batch in batches { - writer.write(batch).unwrap(); - } - drop(writer); - let left = "c1,c2 2019-04-18T20:54:47.378+10:00,2019-04-18T10:54:47.378 2021-10-30T17:59:07+11:00,2021-10-30T06:59:07\n"; From d5a655d21fe14e6c08e72ba3233909e414dfb6b6 Mon Sep 17 00:00:00 2001 From: Alex Wilcoxson Date: Thu, 12 Oct 2023 05:47:55 -0500 Subject: [PATCH 1267/1411] Add option to `FlightDataEncoder` to always resend batch dictionaries (#4896) * Add option to FlightDataEncoder to always resend batch dictionaries * Replace send_dictionaries on FlightDataEncoder with DictionaryHandling Enum * Improve docs --------- Co-authored-by: Andrew Lamb --- arrow-flight/src/encode.rs | 197 ++++++++++++++++++++++++++++++------- 1 file changed, 163 insertions(+), 34 deletions(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 28c181c0d5fd..9ae7f1637982 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -36,9 +36,11 @@ use futures::{ready, stream::BoxStream, Stream, StreamExt}; /// several have already been successfully produced. /// /// # Caveats -/// 1. [`DictionaryArray`](arrow_array::array::DictionaryArray)s -/// are converted to their underlying types prior to transport, due to -/// . +/// 1. When [`DictionaryHandling`] is [`DictionaryHandling::Hydrate`], [`DictionaryArray`](arrow_array::array::DictionaryArray)s +/// are converted to their underlying types prior to transport. +/// When [`DictionaryHandling`] is [`DictionaryHandling::Resend`], Dictionary [`FlightData`] is sent with every +/// [`RecordBatch`] that contains a [`DictionaryArray`](arrow_array::array::DictionaryArray). +/// See . 
/// /// # Example /// ```no_run @@ -112,6 +114,9 @@ pub struct FlightDataEncoderBuilder { schema: Option, /// Optional flight descriptor, if known before data. descriptor: Option, + /// Deterimines how `DictionaryArray`s are encoded for transport. + /// See [`DictionaryHandling`] for more information. + dictionary_handling: DictionaryHandling, } /// Default target size for encoded [`FlightData`]. @@ -128,6 +133,7 @@ impl Default for FlightDataEncoderBuilder { app_metadata: Bytes::new(), schema: None, descriptor: None, + dictionary_handling: DictionaryHandling::Hydrate, } } } @@ -152,6 +158,15 @@ impl FlightDataEncoderBuilder { self } + /// Set [`DictionaryHandling`] for encoder + pub fn with_dictionary_handling( + mut self, + dictionary_handling: DictionaryHandling, + ) -> Self { + self.dictionary_handling = dictionary_handling; + self + } + /// Specify application specific metadata included in the /// [`FlightData::app_metadata`] field of the the first Schema /// message @@ -198,6 +213,7 @@ impl FlightDataEncoderBuilder { app_metadata, schema, descriptor, + dictionary_handling, } = self; FlightDataEncoder::new( @@ -207,6 +223,7 @@ impl FlightDataEncoderBuilder { options, app_metadata, descriptor, + dictionary_handling, ) } } @@ -232,6 +249,9 @@ pub struct FlightDataEncoder { done: bool, /// cleared after the first FlightData message is sent descriptor: Option, + /// Deterimines how `DictionaryArray`s are encoded for transport. + /// See [`DictionaryHandling`] for more information. + dictionary_handling: DictionaryHandling, } impl FlightDataEncoder { @@ -242,16 +262,21 @@ impl FlightDataEncoder { options: IpcWriteOptions, app_metadata: Bytes, descriptor: Option, + dictionary_handling: DictionaryHandling, ) -> Self { let mut encoder = Self { inner, schema: None, max_flight_data_size, - encoder: FlightIpcEncoder::new(options), + encoder: FlightIpcEncoder::new( + options, + dictionary_handling != DictionaryHandling::Resend, + ), app_metadata: Some(app_metadata), queue: VecDeque::new(), done: false, descriptor, + dictionary_handling, }; // If schema is known up front, enqueue it immediately @@ -282,7 +307,8 @@ impl FlightDataEncoder { fn encode_schema(&mut self, schema: &SchemaRef) -> SchemaRef { // The first message is the schema message, and all // batches have the same schema - let schema = Arc::new(prepare_schema_for_flight(schema)); + let send_dictionaries = self.dictionary_handling == DictionaryHandling::Resend; + let schema = Arc::new(prepare_schema_for_flight(schema, send_dictionaries)); let mut schema_flight_data = self.encoder.encode_schema(&schema); // attach any metadata requested @@ -304,7 +330,8 @@ impl FlightDataEncoder { }; // encode the batch - let batch = prepare_batch_for_flight(&batch, schema)?; + let send_dictionaries = self.dictionary_handling == DictionaryHandling::Resend; + let batch = prepare_batch_for_flight(&batch, schema, send_dictionaries)?; for batch in split_batch_for_grpc_response(batch, self.max_flight_data_size) { let (flight_dictionaries, flight_batch) = @@ -365,17 +392,46 @@ impl Stream for FlightDataEncoder { } } +/// Defines how a [`FlightDataEncoder`] encodes [`DictionaryArray`]s +/// +/// [`DictionaryArray`]: arrow_array::DictionaryArray +#[derive(Debug, PartialEq)] +pub enum DictionaryHandling { + /// Expands to the underlying type (default). 
This likely sends more data + /// over the network but requires less memory (dictionaries are not tracked) + /// and is more compatible with other arrow flight client implementations + /// that may not support `DictionaryEncoding` + /// + /// An IPC response, streaming or otherwise, defines its schema up front + /// which defines the mapping from dictionary IDs. It then sends these + /// dictionaries over the wire. + /// + /// This requires identifying the different dictionaries in use, assigning + /// them IDs, and sending new dictionaries, delta or otherwise, when needed + /// + /// See also: + /// * + Hydrate, + /// Send dictionary FlightData with every RecordBatch that contains a + /// [`DictionaryArray`]. See [`Self::Hydrate`] for more tradeoffs. No + /// attempt is made to skip sending the same (logical) dictionary values + /// twice. + /// + /// [`DictionaryArray`]: arrow_array::DictionaryArray + Resend, +} + /// Prepare an arrow Schema for transport over the Arrow Flight protocol /// /// Convert dictionary types to underlying types /// /// See hydrate_dictionary for more information -fn prepare_schema_for_flight(schema: &Schema) -> Schema { +fn prepare_schema_for_flight(schema: &Schema, send_dictionaries: bool) -> Schema { let fields: Fields = schema .fields() .iter() .map(|field| match field.data_type() { - DataType::Dictionary(_, value_type) => Field::new( + DataType::Dictionary(_, value_type) if !send_dictionaries => Field::new( field.name(), value_type.as_ref().clone(), field.is_nullable(), @@ -434,8 +490,7 @@ struct FlightIpcEncoder { } impl FlightIpcEncoder { - fn new(options: IpcWriteOptions) -> Self { - let error_on_replacement = true; + fn new(options: IpcWriteOptions, error_on_replacement: bool) -> Self { Self { options, data_gen: IpcDataGenerator::default(), @@ -478,12 +533,14 @@ impl FlightIpcEncoder { fn prepare_batch_for_flight( batch: &RecordBatch, schema: SchemaRef, + send_dictionaries: bool, ) -> Result { let columns = batch .columns() .iter() - .map(hydrate_dictionary) + .map(|c| hydrate_dictionary(c, send_dictionaries)) .collect::>>()?; + let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows())); Ok(RecordBatch::try_new_with_options( @@ -491,35 +548,28 @@ fn prepare_batch_for_flight( )?) } -/// Hydrates a dictionary to its underlying type -/// -/// An IPC response, streaming or otherwise, defines its schema up front -/// which defines the mapping from dictionary IDs. It then sends these -/// dictionaries over the wire. -/// -/// This requires identifying the different dictionaries in use, assigning -/// them IDs, and sending new dictionaries, delta or otherwise, when needed -/// -/// See also: -/// * -/// -/// For now we just hydrate the dictionaries to their underlying type -fn hydrate_dictionary(array: &ArrayRef) -> Result { - let arr = if let DataType::Dictionary(_, value) = array.data_type() { - arrow_cast::cast(array, value)? - } else { - Arc::clone(array) +/// Hydrates a dictionary to its underlying type if send_dictionaries is false. If send_dictionaries +/// is true, dictionaries are sent with every batch which is not as optimal as described in [DictionaryHandling::Hydrate] above, +/// but does enable sending DictionaryArray's via Flight. +fn hydrate_dictionary(array: &ArrayRef, send_dictionaries: bool) -> Result { + let arr = match array.data_type() { + DataType::Dictionary(_, value) if !send_dictionaries => { + arrow_cast::cast(array, value)? 
+ } + _ => Arc::clone(array), }; Ok(arr) } #[cfg(test)] mod tests { - use arrow_array::types::*; use arrow_array::*; + use arrow_array::{cast::downcast_array, types::*}; use arrow_cast::pretty::pretty_format_batches; use std::collections::HashMap; + use crate::decode::{DecodedPayload, FlightDataDecoder}; + use super::*; #[test] @@ -537,7 +587,7 @@ mod tests { let big_batch = batch.slice(0, batch.num_rows() - 1); let optimized_big_batch = - prepare_batch_for_flight(&big_batch, Arc::clone(&schema)) + prepare_batch_for_flight(&big_batch, Arc::clone(&schema), false) .expect("failed to optimize"); let (_, optimized_big_flight_batch) = make_flight_data(&optimized_big_batch, &options); @@ -549,7 +599,7 @@ mod tests { let small_batch = batch.slice(0, 1); let optimized_small_batch = - prepare_batch_for_flight(&small_batch, Arc::clone(&schema)) + prepare_batch_for_flight(&small_batch, Arc::clone(&schema), false) .expect("failed to optimize"); let (_, optimized_small_flight_batch) = make_flight_data(&optimized_small_batch, &options); @@ -560,6 +610,84 @@ mod tests { ); } + #[tokio::test] + async fn test_dictionary_hydration() { + let arr: DictionaryArray = vec!["a", "a", "b"].into_iter().collect(); + let schema = Arc::new(Schema::new(vec![Field::new_dictionary( + "dict", + DataType::UInt16, + DataType::Utf8, + false, + )])); + let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).unwrap(); + let encoder = FlightDataEncoderBuilder::default() + .build(futures::stream::once(async { Ok(batch) })); + let mut decoder = FlightDataDecoder::new(encoder); + let expected_schema = + Schema::new(vec![Field::new("dict", DataType::Utf8, false)]); + let expected_schema = Arc::new(expected_schema); + while let Some(decoded) = decoder.next().await { + let decoded = decoded.unwrap(); + match decoded.payload { + DecodedPayload::None => {} + DecodedPayload::Schema(s) => assert_eq!(s, expected_schema), + DecodedPayload::RecordBatch(b) => { + assert_eq!(b.schema(), expected_schema); + let expected_array = StringArray::from(vec!["a", "a", "b"]); + let actual_array = b.column_by_name("dict").unwrap(); + let actual_array = downcast_array::(actual_array); + + assert_eq!(actual_array, expected_array); + } + } + } + } + + #[tokio::test] + async fn test_send_dictionaries() { + let schema = Arc::new(Schema::new(vec![Field::new_dictionary( + "dict", + DataType::UInt16, + DataType::Utf8, + false, + )])); + + let arr_one: Arc> = + Arc::new(vec!["a", "a", "b"].into_iter().collect()); + let arr_two: Arc> = + Arc::new(vec!["b", "a", "c"].into_iter().collect()); + let batch_one = + RecordBatch::try_new(schema.clone(), vec![arr_one.clone()]).unwrap(); + let batch_two = + RecordBatch::try_new(schema.clone(), vec![arr_two.clone()]).unwrap(); + + let encoder = FlightDataEncoderBuilder::default() + .with_dictionary_handling(DictionaryHandling::Resend) + .build(futures::stream::iter(vec![Ok(batch_one), Ok(batch_two)])); + + let mut decoder = FlightDataDecoder::new(encoder); + let mut expected_array = arr_one; + while let Some(decoded) = decoder.next().await { + let decoded = decoded.unwrap(); + match decoded.payload { + DecodedPayload::None => {} + DecodedPayload::Schema(s) => assert_eq!(s, schema), + DecodedPayload::RecordBatch(b) => { + assert_eq!(b.schema(), schema); + + let actual_array = + Arc::new(downcast_array::>( + b.column_by_name("dict").unwrap(), + )); + + assert_eq!(actual_array, expected_array); + + expected_array = arr_two.clone(); + } + } + } + } + #[test] fn test_schema_metadata_encoded() { let schema = @@ -567,7 
+695,7 @@ mod tests { HashMap::from([("some_key".to_owned(), "some_value".to_owned())]), ); - let got = prepare_schema_for_flight(&schema); + let got = prepare_schema_for_flight(&schema, false); assert!(got.metadata().contains_key("some_key")); } @@ -580,7 +708,8 @@ mod tests { ) .expect("cannot create record batch"); - prepare_batch_for_flight(&batch, batch.schema()).expect("failed to optimize"); + prepare_batch_for_flight(&batch, batch.schema(), false) + .expect("failed to optimize"); } pub fn make_flight_data( From 11205a891c637694165ce40f75e9093729d80342 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" <193874+carols10cents@users.noreply.github.com> Date: Thu, 12 Oct 2023 10:27:22 -0400 Subject: [PATCH 1268/1411] Add AWS presigned URL support (#4876) * refactor: Extract AWS algorithm string into a const * refactor: Extract a string_to_sign function and encapsulate non-reused values * refactor: Extract a scope function * refactor: Move hashing of canonical request into string_to_sign * refactor: Move canonical_request into string_to_sign * refactor: Move canonical URI construction into string_to_sign * refactor: Move canonical query construction into string_to_sign * feat: Implement sign method * feat: Publicly expose AWS S3 path_url for convenience constructing signed URLs * docs: Add an example of signing an upload URL * feat: Add a more convenient API on AmazonS3 for creating signed URLs * fix: Add credential token to the X-Amz-Security-Token query param if specified * fix: Change path_url to be pub crate instead of pub * feat: Define a public Signer trait for the signing interface * fix: Hide some doc test code Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * fix: Use Method through reqwest which re-exports http anyway --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/aws/client.rs | 2 +- object_store/src/aws/credential.rs | 181 ++++++++++++++++++++++++----- object_store/src/aws/mod.rs | 64 +++++++++- object_store/src/lib.rs | 2 + object_store/src/signer.rs | 40 +++++++ 5 files changed, 255 insertions(+), 34 deletions(-) create mode 100644 object_store/src/signer.rs diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 1c35586f8bc9..e3ac60eca060 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -212,7 +212,7 @@ pub struct S3Config { } impl S3Config { - fn path_url(&self, path: &Path) -> String { + pub(crate) fn path_url(&self, path: &Path) -> String { format!("{}/{}", self.bucket_endpoint, encode_path(path)) } } diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index be0ffa578d13..e27b71f7c411 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -30,7 +30,7 @@ use reqwest::{Client, Method, Request, RequestBuilder, StatusCode}; use serde::Deserialize; use std::collections::BTreeMap; use std::sync::Arc; -use std::time::Instant; +use std::time::{Duration, Instant}; use tracing::warn; use url::Url; @@ -89,6 +89,7 @@ const DATE_HEADER: &str = "x-amz-date"; const HASH_HEADER: &str = "x-amz-content-sha256"; const TOKEN_HEADER: &str = "x-amz-security-token"; const AUTH_HEADER: &str = "authorization"; +const ALGORITHM: &str = "AWS4-HMAC-SHA256"; impl<'a> AwsAuthorizer<'a> { /// Create a new [`AwsAuthorizer`] @@ -154,21 +155,110 @@ impl<'a> AwsAuthorizer<'a> { let header_digest = HeaderValue::from_str(&digest).unwrap(); 
request.headers_mut().insert(HASH_HEADER, header_digest); - // Each path segment must be URI-encoded twice (except for Amazon S3 which only gets URI-encoded once). + let (signed_headers, canonical_headers) = canonicalize_headers(request.headers()); + + let scope = self.scope(date); + + let string_to_sign = self.string_to_sign( + date, + &scope, + request.method(), + request.url(), + &canonical_headers, + &signed_headers, + &digest, + ); + + // sign the string + let signature = + self.credential + .sign(&string_to_sign, date, self.region, self.service); + + // build the actual auth header + let authorisation = format!( + "{} Credential={}/{}, SignedHeaders={}, Signature={}", + ALGORITHM, self.credential.key_id, scope, signed_headers, signature + ); + + let authorization_val = HeaderValue::from_str(&authorisation).unwrap(); + request.headers_mut().insert(AUTH_HEADER, authorization_val); + } + + pub(crate) fn sign(&self, method: Method, url: &mut Url, expires_in: Duration) { + let date = self.date.unwrap_or_else(Utc::now); + let scope = self.scope(date); + + // https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-query-string-auth.html + url.query_pairs_mut() + .append_pair("X-Amz-Algorithm", ALGORITHM) + .append_pair( + "X-Amz-Credential", + &format!("{}/{}", self.credential.key_id, scope), + ) + .append_pair("X-Amz-Date", &date.format("%Y%m%dT%H%M%SZ").to_string()) + .append_pair("X-Amz-Expires", &expires_in.as_secs().to_string()) + .append_pair("X-Amz-SignedHeaders", "host"); + + // For S3, you must include the X-Amz-Security-Token query parameter in the URL if + // using credentials sourced from the STS service. + if let Some(ref token) = self.credential.token { + url.query_pairs_mut() + .append_pair("X-Amz-Security-Token", token); + } + + // We don't have a payload; the user is going to send the payload directly themselves. + let digest = UNSIGNED_PAYLOAD; + + let host = &url[url::Position::BeforeHost..url::Position::AfterPort].to_string(); + let mut headers = HeaderMap::new(); + let host_val = HeaderValue::from_str(host).unwrap(); + headers.insert("host", host_val); + + let (signed_headers, canonical_headers) = canonicalize_headers(&headers); + + let string_to_sign = self.string_to_sign( + date, + &scope, + &method, + url, + &canonical_headers, + &signed_headers, + digest, + ); + + let signature = + self.credential + .sign(&string_to_sign, date, self.region, self.service); + + url.query_pairs_mut() + .append_pair("X-Amz-Signature", &signature); + } + + #[allow(clippy::too_many_arguments)] + fn string_to_sign( + &self, + date: DateTime, + scope: &str, + request_method: &Method, + url: &Url, + canonical_headers: &str, + signed_headers: &str, + digest: &str, + ) -> String { + // Each path segment must be URI-encoded twice (except for Amazon S3 which only gets + // URI-encoded once). 
// see https://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html let canonical_uri = match self.service { - "s3" => request.url().path().to_string(), - _ => utf8_percent_encode(request.url().path(), &STRICT_PATH_ENCODE_SET) - .to_string(), + "s3" => url.path().to_string(), + _ => utf8_percent_encode(url.path(), &STRICT_PATH_ENCODE_SET).to_string(), }; - let (signed_headers, canonical_headers) = canonicalize_headers(request.headers()); - let canonical_query = canonicalize_query(request.url()); + let canonical_query = canonicalize_query(url); // https://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html let canonical_request = format!( "{}\n{}\n{}\n{}\n{}\n{}", - request.method().as_str(), + request_method.as_str(), canonical_uri, canonical_query, canonical_headers, @@ -177,33 +267,23 @@ impl<'a> AwsAuthorizer<'a> { ); let hashed_canonical_request = hex_digest(canonical_request.as_bytes()); - let scope = format!( - "{}/{}/{}/aws4_request", - date.format("%Y%m%d"), - self.region, - self.service - ); - let string_to_sign = format!( - "AWS4-HMAC-SHA256\n{}\n{}\n{}", + format!( + "{}\n{}\n{}\n{}", + ALGORITHM, date.format("%Y%m%dT%H%M%SZ"), scope, hashed_canonical_request - ); - - // sign the string - let signature = - self.credential - .sign(&string_to_sign, date, self.region, self.service); - - // build the actual auth header - let authorisation = format!( - "AWS4-HMAC-SHA256 Credential={}/{}, SignedHeaders={}, Signature={}", - self.credential.key_id, scope, signed_headers, signature - ); + ) + } - let authorization_val = HeaderValue::from_str(&authorisation).unwrap(); - request.headers_mut().insert(AUTH_HEADER, authorization_val); + fn scope(&self, date: DateTime) -> String { + format!( + "{}/{}/{}/aws4_request", + date.format("%Y%m%d"), + self.region, + self.service + ) } } @@ -667,7 +747,46 @@ mod tests { }; authorizer.authorize(&mut request, None); - assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=653c3d8ea261fd826207df58bc2bb69fbb5003e9eb3c0ef06e4a51f2a81d8699") + assert_eq!(request.headers().get(AUTH_HEADER).unwrap(), "AWS4-HMAC-SHA256 Credential=AKIAIOSFODNN7EXAMPLE/20220806/us-east-1/ec2/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=653c3d8ea261fd826207df58bc2bb69fbb5003e9eb3c0ef06e4a51f2a81d8699"); + } + + #[test] + fn signed_get_url() { + // Values from https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-query-string-auth.html + let credential = AwsCredential { + key_id: "AKIAIOSFODNN7EXAMPLE".to_string(), + secret_key: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY".to_string(), + token: None, + }; + + let date = DateTime::parse_from_rfc3339("2013-05-24T00:00:00Z") + .unwrap() + .with_timezone(&Utc); + + let authorizer = AwsAuthorizer { + date: Some(date), + credential: &credential, + service: "s3", + region: "us-east-1", + sign_payload: false, + }; + + let mut url = + Url::parse("https://examplebucket.s3.amazonaws.com/test.txt").unwrap(); + authorizer.sign(Method::GET, &mut url, Duration::from_secs(86400)); + + assert_eq!( + url, + Url::parse( + "https://examplebucket.s3.amazonaws.com/test.txt?\ + X-Amz-Algorithm=AWS4-HMAC-SHA256&\ + X-Amz-Credential=AKIAIOSFODNN7EXAMPLE%2F20130524%2Fus-east-1%2Fs3%2Faws4_request&\ + X-Amz-Date=20130524T000000Z&\ + X-Amz-Expires=86400&\ + X-Amz-SignedHeaders=host&\ + 
X-Amz-Signature=aeeed9bbccd4d02ee5c0109b86d86835f995330da4c265957d157751f604d404" + ).unwrap() + ); } #[test] diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index db3e1b9a4bbe..0028be99fa2e 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -36,10 +36,10 @@ use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; +use reqwest::Method; use serde::{Deserialize, Serialize}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; -use std::str::FromStr; -use std::sync::Arc; +use std::{str::FromStr, sync::Arc, time::Duration}; use tokio::io::AsyncWrite; use tracing::info; use url::Url; @@ -56,6 +56,7 @@ use crate::client::{ }; use crate::config::ConfigValue; use crate::multipart::{PartId, PutPart, WriteMultiPart}; +use crate::signer::Signer; use crate::{ ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, RetryConfig, @@ -209,6 +210,65 @@ impl AmazonS3 { pub fn credentials(&self) -> &AwsCredentialProvider { &self.client.config().credentials } + + /// Create a full URL to the resource specified by `path` with this instance's configuration. + fn path_url(&self, path: &Path) -> String { + self.client.config().path_url(path) + } +} + +#[async_trait] +impl Signer for AmazonS3 { + /// Create a URL containing the relevant [AWS SigV4] query parameters that authorize a request + /// via `method` to the resource at `path` valid for the duration specified in `expires_in`. + /// + /// [AWS SigV4]: https://docs.aws.amazon.com/IAM/latest/UserGuide/create-signed-request.html + /// + /// # Example + /// + /// This example returns a URL that will enable a user to upload a file to + /// "some-folder/some-file.txt" in the next hour. + /// + /// ``` + /// # async fn example() -> Result<(), Box> { + /// # use object_store::{aws::AmazonS3Builder, path::Path, signer::Signer}; + /// # use reqwest::Method; + /// # use std::time::Duration; + /// # + /// let region = "us-east-1"; + /// let s3 = AmazonS3Builder::new() + /// .with_region(region) + /// .with_bucket_name("my-bucket") + /// .with_access_key_id("my-access-key-id") + /// .with_secret_access_key("my-secret-access-key") + /// .build()?; + /// + /// let url = s3.signed_url( + /// Method::PUT, + /// &Path::from("some-folder/some-file.txt"), + /// Duration::from_secs(60 * 60) + /// ).await?; + /// # Ok(()) + /// # } + /// ``` + async fn signed_url( + &self, + method: Method, + path: &Path, + expires_in: Duration, + ) -> Result { + let credential = self.credentials().get_credential().await?; + let authorizer = + AwsAuthorizer::new(&credential, "s3", &self.client.config().region); + + let path_url = self.path_url(path); + let mut url = + Url::parse(&path_url).context(UnableToParseUrlSnafu { url: path_url })?; + + authorizer.sign(method, &mut url, expires_in); + + Ok(url) + } } #[async_trait] diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 3fd363fd4f06..68e785b3a31e 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -267,6 +267,8 @@ pub mod local; pub mod memory; pub mod path; pub mod prefix; +#[cfg(feature = "cloud")] +pub mod signer; pub mod throttle; #[cfg(feature = "cloud")] diff --git a/object_store/src/signer.rs b/object_store/src/signer.rs new file mode 100644 index 000000000000..f1f35debe053 --- /dev/null +++ b/object_store/src/signer.rs @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Abstraction of signed URL generation for those object store implementations that support it + +use crate::{path::Path, Result}; +use async_trait::async_trait; +use reqwest::Method; +use std::{fmt, time::Duration}; +use url::Url; + +/// Universal API to presigned URLs generated from multiple object store services. Not supported by +/// all object store services. +#[async_trait] +pub trait Signer: Send + Sync + fmt::Debug + 'static { + /// Given the intended [`Method`] and [`Path`] to use and the desired length of time for which + /// the URL should be valid, return a signed [`Url`] created with the object store + /// implementation's credentials such that the URL can be handed to something that doesn't have + /// access to the object store's credentials, to allow limited access to the object store. + async fn signed_url( + &self, + method: Method, + path: &Path, + expires_in: Duration, + ) -> Result; +} From 0503d65b36b2c4267114ca5882967264929b8fc4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 12 Oct 2023 15:27:45 +0100 Subject: [PATCH 1269/1411] Update zstd requirement from 0.12.0 to 0.13.0 (#4923) Updates the requirements on [zstd](https://github.com/gyscos/zstd-rs) to permit the latest version. - [Release notes](https://github.com/gyscos/zstd-rs/releases) - [Commits](https://github.com/gyscos/zstd-rs/compare/v0.12.0...v0.13.0) --- updated-dependencies: - dependency-name: zstd dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-ipc/Cargo.toml | 2 +- parquet/Cargo.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index b5f66294a7c7..83ad044d25e7 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -41,7 +41,7 @@ arrow-data = { workspace = true } arrow-schema = { workspace = true } flatbuffers = { version = "23.1.21", default-features = false } lz4_flex = { version = "0.11", default-features = false, features = ["std", "frame"], optional = true } -zstd = { version = "0.12.0", default-features = false, optional = true } +zstd = { version = "0.13.0", default-features = false, optional = true } [features] default = [] diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index eaafb5130fcb..659e2c0ee3a7 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -52,7 +52,7 @@ snap = { version = "1.0", default-features = false, optional = true } brotli = { version = "3.3", default-features = false, features = ["std"], optional = true } flate2 = { version = "1.0", default-features = false, features = ["rust_backend"], optional = true } lz4_flex = { version = "0.11", default-features = false, features = ["std", "frame"], optional = true } -zstd = { version = "0.12.0", optional = true, default-features = false } +zstd = { version = "0.13.0", optional = true, default-features = false } chrono = { workspace = true } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } @@ -75,7 +75,7 @@ tempfile = { version = "3.0", default-features = false } brotli = { version = "3.3", default-features = false, features = ["std"] } flate2 = { version = "1.0", default-features = false, features = ["rust_backend"] } lz4_flex = { version = "0.11", default-features = false, features = ["std", "frame"] } -zstd = { version = "0.12", default-features = false } +zstd = { version = "0.13", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } arrow = { workspace = true, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } From 6e49f31e6fc992cfd93e84bd6f2d3d6b563b62a4 Mon Sep 17 00:00:00 2001 From: Hengfei Yang Date: Thu, 12 Oct 2023 09:30:36 -0500 Subject: [PATCH 1270/1411] feat: add method for async read bloom filter (#4917) * feat: add method for async read bloomfilter * fix: compatible for bloom filter length * test: add unit tests for read bloom filter * fix: format code for unit test --- parquet/src/arrow/async_reader/mod.rs | 147 +++++++++++++++++++++++++- parquet/src/bloom_filter/mod.rs | 8 +- 2 files changed, 146 insertions(+), 9 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index c749d4deeb16..4b3eebf2e67e 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -77,7 +77,6 @@ use std::collections::VecDeque; use std::fmt::Formatter; - use std::io::SeekFrom; use std::ops::Range; use std::pin::Pin; @@ -88,7 +87,6 @@ use bytes::{Buf, Bytes}; use futures::future::{BoxFuture, FutureExt}; use futures::ready; use futures::stream::Stream; - use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; use arrow_array::RecordBatch; @@ -102,15 +100,18 @@ use crate::arrow::arrow_reader::{ }; use crate::arrow::ProjectionMask; +use 
crate::bloom_filter::{ + chunk_read_bloom_filter_header_and_offset, Sbbf, SBBF_HEADER_SIZE_ESTIMATE, +}; use crate::column::page::{PageIterator, PageReader}; - use crate::errors::{ParquetError, Result}; use crate::file::footer::{decode_footer, decode_metadata}; use crate::file::metadata::{ParquetMetaData, RowGroupMetaData}; use crate::file::reader::{ChunkReader, Length, SerializedPageReader}; -use crate::format::PageLocation; - use crate::file::FOOTER_SIZE; +use crate::format::{ + BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash, PageLocation, +}; mod metadata; pub use metadata::*; @@ -302,6 +303,71 @@ impl ParquetRecordBatchStreamBuilder { Self::new_builder(AsyncReader(input), metadata) } + /// Read bloom filter for a column in a row group + /// Returns `None` if the column does not have a bloom filter + /// + /// We should call this function after other forms pruning, such as projection and predicate pushdown. + pub async fn get_row_group_column_bloom_filter( + &mut self, + row_group_idx: usize, + column_idx: usize, + ) -> Result> { + let metadata = self.metadata.row_group(row_group_idx); + let column_metadata = metadata.column(column_idx); + + let offset: usize = if let Some(offset) = column_metadata.bloom_filter_offset() { + offset.try_into().map_err(|_| { + ParquetError::General("Bloom filter offset is invalid".to_string()) + })? + } else { + return Ok(None); + }; + + let buffer = match column_metadata.bloom_filter_length() { + Some(length) => self.input.0.get_bytes(offset..offset + length as usize), + None => self + .input + .0 + .get_bytes(offset..offset + SBBF_HEADER_SIZE_ESTIMATE), + } + .await?; + + let (header, bitset_offset) = + chunk_read_bloom_filter_header_and_offset(offset as u64, buffer.clone())?; + + match header.algorithm { + BloomFilterAlgorithm::BLOCK(_) => { + // this match exists to future proof the singleton algorithm enum + } + } + match header.compression { + BloomFilterCompression::UNCOMPRESSED(_) => { + // this match exists to future proof the singleton compression enum + } + } + match header.hash { + BloomFilterHash::XXHASH(_) => { + // this match exists to future proof the singleton hash enum + } + } + + let bitset = match column_metadata.bloom_filter_length() { + Some(_) => buffer.slice((bitset_offset as usize - offset)..), + None => { + let bitset_length: usize = header.num_bytes.try_into().map_err(|_| { + ParquetError::General("Bloom filter length is invalid".to_string()) + })?; + self.input + .0 + .get_bytes( + bitset_offset as usize..bitset_offset as usize + bitset_length, + ) + .await? 
+ } + }; + Ok(Some(Sbbf::new(&bitset))) + } + /// Build a new [`ParquetRecordBatchStream`] pub fn build(self) -> Result> { let num_row_groups = self.metadata.row_groups().len(); @@ -1540,4 +1606,75 @@ mod tests { assert_ne!(1024, file_rows); assert_eq!(stream.batch_size, file_rows); } + + #[tokio::test] + async fn test_get_row_group_column_bloom_filter_without_length() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/data_index_bloom_encoding_stats.parquet"); + let data = Bytes::from(std::fs::read(path).unwrap()); + test_get_row_group_column_bloom_filter(data, false).await; + } + + #[tokio::test] + async fn test_get_row_group_column_bloom_filter_with_length() { + // convert to new parquet file with bloom_filter_length + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/data_index_bloom_encoding_stats.parquet"); + let data = Bytes::from(std::fs::read(path).unwrap()); + let metadata = parse_metadata(&data).unwrap(); + let metadata = Arc::new(metadata); + let async_reader = TestReader { + data: data.clone(), + metadata: metadata.clone(), + requests: Default::default(), + }; + let builder = ParquetRecordBatchStreamBuilder::new(async_reader) + .await + .unwrap(); + let schema = builder.schema().clone(); + let stream = builder.build().unwrap(); + let batches = stream.try_collect::>().await.unwrap(); + + let mut parquet_data = Vec::new(); + let props = WriterProperties::builder() + .set_bloom_filter_enabled(true) + .build(); + let mut writer = + ArrowWriter::try_new(&mut parquet_data, schema, Some(props)).unwrap(); + for batch in batches { + writer.write(&batch).unwrap(); + } + writer.close().unwrap(); + + // test the new parquet file + test_get_row_group_column_bloom_filter(parquet_data.into(), true).await; + } + + async fn test_get_row_group_column_bloom_filter(data: Bytes, with_length: bool) { + let metadata = parse_metadata(&data).unwrap(); + let metadata = Arc::new(metadata); + + assert_eq!(metadata.num_row_groups(), 1); + let row_group = metadata.row_group(0); + let column = row_group.column(0); + assert_eq!(column.bloom_filter_length().is_some(), with_length); + + let async_reader = TestReader { + data: data.clone(), + metadata: metadata.clone(), + requests: Default::default(), + }; + + let mut builder = ParquetRecordBatchStreamBuilder::new(async_reader) + .await + .unwrap(); + + let sbbf = builder + .get_row_group_column_bloom_filter(0, 0) + .await + .unwrap() + .unwrap(); + assert!(sbbf.check(&"Hello")); + assert!(!sbbf.check(&"Hello_Not_Exists")); + } } diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index a3807eb37011..e98aee9fd213 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -132,11 +132,11 @@ impl std::ops::IndexMut for Block { #[derive(Debug, Clone)] pub struct Sbbf(Vec); -const SBBF_HEADER_SIZE_ESTIMATE: usize = 20; +pub(crate) const SBBF_HEADER_SIZE_ESTIMATE: usize = 20; /// given an initial offset, and a byte buffer, try to read out a bloom filter header and return /// both the header and the offset after it (for bitset). -fn chunk_read_bloom_filter_header_and_offset( +pub(crate) fn chunk_read_bloom_filter_header_and_offset( offset: u64, buffer: Bytes, ) -> Result<(BloomFilterHeader, u64), ParquetError> { @@ -147,7 +147,7 @@ fn chunk_read_bloom_filter_header_and_offset( /// given a [Bytes] buffer, try to read out a bloom filter header and return both the header and /// length of the header. 
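A hedged sketch of how the new asynchronous accessor added above might be used to probe a column's bloom filter before decoding row groups; the file name, row group and column indices, and the probed value are placeholders rather than anything from the patch:

use futures::TryStreamExt;
use parquet::arrow::ParquetRecordBatchStreamBuilder;
use tokio::fs::File;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("data.parquet").await?;
    let mut builder = ParquetRecordBatchStreamBuilder::new(file).await?;

    // Returns Ok(None) when the column chunk carries no bloom filter.
    if let Some(sbbf) = builder.get_row_group_column_bloom_filter(0, 0).await? {
        if !sbbf.check(&"some_value") {
            println!("row group 0 definitely does not contain \"some_value\"");
        }
    }

    // Continue with the normal read path.
    let _batches: Vec<_> = builder.build()?.try_collect().await?;
    Ok(())
}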
#[inline] -fn read_bloom_filter_header_and_length( +pub(crate) fn read_bloom_filter_header_and_length( buffer: Bytes, ) -> Result<(BloomFilterHeader, u64), ParquetError> { let total_length = buffer.len(); @@ -199,7 +199,7 @@ impl Sbbf { Self::new(&bitset) } - fn new(bitset: &[u8]) -> Self { + pub(crate) fn new(bitset: &[u8]) -> Self { let data = bitset .chunks_exact(4 * 8) .map(|chunk| { From 90bc5ec96b5ae5162f469f9784dde7b1a53a5bdd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 12 Oct 2023 20:39:46 +0100 Subject: [PATCH 1271/1411] Support Arbitrary JSON values in JSON Reader (#4905) (#4911) * Support Arbitrary JSON values in JSON Reader (#4905) * Review feedback * Clippy * Docs --- arrow-json/src/reader/mod.rs | 110 +++++++++++++++++++++++++++------- arrow-json/src/reader/tape.rs | 61 +++++++++---------- arrow-json/src/writer.rs | 6 +- 3 files changed, 116 insertions(+), 61 deletions(-) diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index 4e98e2fd873a..c1cef0ec81b4 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -17,9 +17,13 @@ //! JSON reader //! -//! This JSON reader allows JSON line-delimited files to be read into the Arrow memory -//! model. Records are loaded in batches and are then converted from row-based data to -//! columnar data. +//! This JSON reader allows JSON records to be read into the Arrow memory +//! model. Records are loaded in batches and are then converted from the record-oriented +//! representation to the columnar arrow data model. +//! +//! The reader ignores whitespace between JSON values, including `\n` and `\r`, allowing +//! parsing of sequences of one or more arbitrarily formatted JSON values, including +//! but not limited to newline-delimited JSON. //! //! # Basic Usage //! @@ -130,6 +134,7 @@ //! use std::io::BufRead; +use std::sync::Arc; use chrono::Utc; use serde::Serialize; @@ -137,9 +142,11 @@ use serde::Serialize; use arrow_array::timezone::Tz; use arrow_array::types::Float32Type; use arrow_array::types::*; -use arrow_array::{downcast_integer, RecordBatch, RecordBatchReader, StructArray}; +use arrow_array::{ + downcast_integer, make_array, RecordBatch, RecordBatchReader, StructArray, +}; use arrow_data::ArrayData; -use arrow_schema::{ArrowError, DataType, SchemaRef, TimeUnit}; +use arrow_schema::{ArrowError, DataType, FieldRef, Schema, SchemaRef, TimeUnit}; pub use schema::*; use crate::reader::boolean_array::BooleanArrayDecoder; @@ -150,7 +157,7 @@ use crate::reader::null_array::NullArrayDecoder; use crate::reader::primitive_array::PrimitiveArrayDecoder; use crate::reader::string_array::StringArrayDecoder; use crate::reader::struct_array::StructArrayDecoder; -use crate::reader::tape::{Tape, TapeDecoder, TapeElement}; +use crate::reader::tape::{Tape, TapeDecoder}; use crate::reader::timestamp_array::TimestampArrayDecoder; mod boolean_array; @@ -171,6 +178,7 @@ pub struct ReaderBuilder { batch_size: usize, coerce_primitive: bool, strict_mode: bool, + is_field: bool, schema: SchemaRef, } @@ -189,10 +197,51 @@ impl ReaderBuilder { batch_size: 1024, coerce_primitive: false, strict_mode: false, + is_field: false, schema, } } + /// Create a new [`ReaderBuilder`] that will parse JSON values of `field.data_type()` + /// + /// Unlike [`ReaderBuilder::new`] this does not require the root of the JSON data + /// to be an object, i.e. 
`{..}`, allowing for parsing of any valid JSON value(s) + /// + /// ``` + /// # use std::sync::Arc; + /// # use arrow_array::cast::AsArray; + /// # use arrow_array::types::Int32Type; + /// # use arrow_json::ReaderBuilder; + /// # use arrow_schema::{DataType, Field}; + /// // Root of JSON schema is a numeric type + /// let data = "1\n2\n3\n"; + /// let field = Arc::new(Field::new("int", DataType::Int32, true)); + /// let mut reader = ReaderBuilder::new_with_field(field.clone()).build(data.as_bytes()).unwrap(); + /// let b = reader.next().unwrap().unwrap(); + /// let values = b.column(0).as_primitive::().values(); + /// assert_eq!(values, &[1, 2, 3]); + /// + /// // Root of JSON schema is a list type + /// let data = "[1, 2, 3, 4, 5, 6, 7]\n[1, 2, 3]"; + /// let field = Field::new_list("int", field.clone(), true); + /// let mut reader = ReaderBuilder::new_with_field(field).build(data.as_bytes()).unwrap(); + /// let b = reader.next().unwrap().unwrap(); + /// let list = b.column(0).as_list::(); + /// + /// assert_eq!(list.offsets().as_ref(), &[0, 7, 10]); + /// let list_values = list.values().as_primitive::(); + /// assert_eq!(list_values.values(), &[1, 2, 3, 4, 5, 6, 7, 1, 2, 3]); + /// ``` + pub fn new_with_field(field: impl Into) -> Self { + Self { + batch_size: 1024, + coerce_primitive: false, + strict_mode: false, + is_field: true, + schema: Arc::new(Schema::new([field.into()])), + } + } + /// Sets the batch size in rows to read pub fn with_batch_size(self, batch_size: usize) -> Self { Self { batch_size, ..self } @@ -233,16 +282,22 @@ impl ReaderBuilder { /// Create a [`Decoder`] pub fn build_decoder(self) -> Result { - let decoder = make_decoder( - DataType::Struct(self.schema.fields.clone()), - self.coerce_primitive, - self.strict_mode, - false, - )?; + let (data_type, nullable) = match self.is_field { + false => (DataType::Struct(self.schema.fields.clone()), false), + true => { + let field = &self.schema.fields[0]; + (field.data_type().clone(), field.is_nullable()) + } + }; + + let decoder = + make_decoder(data_type, self.coerce_primitive, self.strict_mode, nullable)?; + let num_fields = self.schema.all_fields().len(); Ok(Decoder { decoder, + is_field: self.is_field, tape_decoder: TapeDecoder::new(self.batch_size, num_fields), batch_size: self.batch_size, schema: self.schema, @@ -344,6 +399,7 @@ pub struct Decoder { tape_decoder: TapeDecoder, decoder: Box, batch_size: usize, + is_field: bool, schema: SchemaRef, } @@ -563,24 +619,20 @@ impl Decoder { let mut next_object = 1; let pos: Vec<_> = (0..tape.num_rows()) .map(|_| { - let end = match tape.get(next_object) { - TapeElement::StartObject(end) => end, - _ => unreachable!("corrupt tape"), - }; - std::mem::replace(&mut next_object, end + 1) + let next = tape.next(next_object, "row").unwrap(); + std::mem::replace(&mut next_object, next) }) .collect(); let decoded = self.decoder.decode(&tape, &pos)?; self.tape_decoder.clear(); - // Sanity check - assert!(matches!(decoded.data_type(), DataType::Struct(_))); - assert_eq!(decoded.null_count(), 0); - assert_eq!(decoded.len(), pos.len()); + let batch = match self.is_field { + true => RecordBatch::try_new(self.schema.clone(), vec![make_array(decoded)])?, + false => RecordBatch::from(StructArray::from(decoded)) + .with_schema(self.schema.clone())?, + }; - let batch = RecordBatch::from(StructArray::from(decoded)) - .with_schema(self.schema.clone())?; Ok(Some(batch)) } } @@ -2175,4 +2227,16 @@ mod tests { let values = batch.column(0).as_primitive::(); assert_eq!(values.values(), &[1681319393, 
-7200]); } + + #[test] + fn test_serde_field() { + let field = Field::new("int", DataType::Int32, true); + let mut decoder = ReaderBuilder::new_with_field(field) + .build_decoder() + .unwrap(); + decoder.serialize(&[1_i32, 2, 3, 4]).unwrap(); + let b = decoder.flush().unwrap().unwrap(); + let values = b.column(0).as_primitive::().values(); + assert_eq!(values, &[1, 2, 3, 4]); + } } diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index 801e8f29d525..b39caede7047 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -297,7 +297,8 @@ macro_rules! next { pub struct TapeDecoder { elements: Vec, - num_rows: usize, + /// The number of rows decoded, including any in progress if `!stack.is_empty()` + cur_row: usize, /// Number of rows to read per batch batch_size: usize, @@ -330,36 +331,34 @@ impl TapeDecoder { offsets, elements, batch_size, - num_rows: 0, + cur_row: 0, bytes: Vec::with_capacity(num_fields * 2 * 8), stack: Vec::with_capacity(10), } } pub fn decode(&mut self, buf: &[u8]) -> Result { - if self.num_rows >= self.batch_size { - return Ok(0); - } - let mut iter = BufIter::new(buf); while !iter.is_empty() { - match self.stack.last_mut() { - // Start of row + let state = match self.stack.last_mut() { + Some(l) => l, None => { - // Skip over leading whitespace iter.skip_whitespace(); - match next!(iter) { - b'{' => { - let idx = self.elements.len() as u32; - self.stack.push(DecoderState::Object(idx)); - self.elements.push(TapeElement::StartObject(u32::MAX)); - } - b => return Err(err(b, "trimming leading whitespace")), + if iter.is_empty() || self.cur_row >= self.batch_size { + break; } + + // Start of row + self.cur_row += 1; + self.stack.push(DecoderState::Value); + self.stack.last_mut().unwrap() } + }; + + match state { // Decoding an object - Some(DecoderState::Object(start_idx)) => { + DecoderState::Object(start_idx) => { iter.advance_until(|b| !json_whitespace(b) && b != b','); match next!(iter) { b'"' => { @@ -374,16 +373,12 @@ impl TapeDecoder { TapeElement::StartObject(end_idx); self.elements.push(TapeElement::EndObject(start_idx)); self.stack.pop(); - self.num_rows += self.stack.is_empty() as usize; - if self.num_rows >= self.batch_size { - break; - } } b => return Err(err(b, "parsing object")), } } // Decoding a list - Some(DecoderState::List(start_idx)) => { + DecoderState::List(start_idx) => { iter.advance_until(|b| !json_whitespace(b) && b != b','); match iter.peek() { Some(b']') => { @@ -400,7 +395,7 @@ impl TapeDecoder { } } // Decoding a string - Some(DecoderState::String) => { + DecoderState::String => { let s = iter.advance_until(|b| matches!(b, b'\\' | b'"')); self.bytes.extend_from_slice(s); @@ -415,7 +410,7 @@ impl TapeDecoder { b => unreachable!("{}", b), } } - Some(state @ DecoderState::Value) => { + state @ DecoderState::Value => { iter.skip_whitespace(); *state = match next!(iter) { b'"' => DecoderState::String, @@ -439,7 +434,7 @@ impl TapeDecoder { b => return Err(err(b, "parsing value")), }; } - Some(DecoderState::Number) => { + DecoderState::Number => { let s = iter.advance_until(|b| { !matches!(b, b'0'..=b'9' | b'-' | b'+' | b'.' 
| b'e' | b'E') }); @@ -452,14 +447,14 @@ impl TapeDecoder { self.offsets.push(self.bytes.len()); } } - Some(DecoderState::Colon) => { + DecoderState::Colon => { iter.skip_whitespace(); match next!(iter) { b':' => self.stack.pop(), b => return Err(err(b, "parsing colon")), }; } - Some(DecoderState::Literal(literal, idx)) => { + DecoderState::Literal(literal, idx) => { let bytes = literal.bytes(); let expected = bytes.iter().skip(*idx as usize).copied(); for (expected, b) in expected.zip(&mut iter) { @@ -474,7 +469,7 @@ impl TapeDecoder { self.elements.push(element); } } - Some(DecoderState::Escape) => { + DecoderState::Escape => { let v = match next!(iter) { b'u' => { self.stack.pop(); @@ -496,7 +491,7 @@ impl TapeDecoder { self.bytes.push(v); } // Parse a unicode escape sequence - Some(DecoderState::Unicode(high, low, idx)) => loop { + DecoderState::Unicode(high, low, idx) => loop { match *idx { 0..=3 => *high = *high << 4 | parse_hex(next!(iter))? as u16, 4 => { @@ -547,7 +542,7 @@ impl TapeDecoder { .try_for_each(|row| row.serialize(&mut serializer)) .map_err(|e| ArrowError::JsonError(e.to_string()))?; - self.num_rows += rows.len(); + self.cur_row += rows.len(); Ok(()) } @@ -591,7 +586,7 @@ impl TapeDecoder { strings, elements: &self.elements, string_offsets: &self.offsets, - num_rows: self.num_rows, + num_rows: self.cur_row, }) } @@ -599,7 +594,7 @@ impl TapeDecoder { pub fn clear(&mut self) { assert!(self.stack.is_empty()); - self.num_rows = 0; + self.cur_row = 0; self.bytes.clear(); self.elements.clear(); self.elements.push(TapeElement::Null); @@ -837,7 +832,7 @@ mod tests { let err = decoder.decode(b"hello").unwrap_err().to_string(); assert_eq!( err, - "Json error: Encountered unexpected 'h' whilst trimming leading whitespace" + "Json error: Encountered unexpected 'h' whilst parsing value" ); let mut decoder = TapeDecoder::new(16, 2); diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index db371b59080a..8c4145bc95b4 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -1338,11 +1338,7 @@ mod tests { let batch = reader.next().unwrap().unwrap(); - let list_row = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); + let list_row = batch.column(0).as_list::(); let values = list_row.values(); assert_eq!(values.len(), 4); assert_eq!(values.null_count(), 1); From bb8e42f6392284f4a7a39d3eec74144a603b481c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 15 Oct 2023 11:04:14 +0100 Subject: [PATCH 1272/1411] Add GetOptions::head (#4931) --- object_store/src/aws/client.rs | 9 ++------ object_store/src/aws/mod.rs | 4 ---- object_store/src/azure/client.rs | 9 ++------ object_store/src/azure/mod.rs | 4 ---- object_store/src/client/get.rs | 24 +++------------------ object_store/src/gcp/mod.rs | 13 ++--------- object_store/src/http/client.rs | 15 +++++-------- object_store/src/http/mod.rs | 4 ---- object_store/src/lib.rs | 12 ++++++++++- object_store/src/local.rs | 37 ++++---------------------------- 10 files changed, 29 insertions(+), 102 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index e3ac60eca060..ac07f9ab9af3 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -554,15 +554,10 @@ impl GetClient for S3Client { const STORE: &'static str = STORE; /// Make an S3 GET request - async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result { + async fn get_request(&self, path: &Path, 
options: GetOptions) -> Result { let credential = self.get_credential().await?; let url = self.config.path_url(path); - let method = match head { + let method = match options.head { true => Method::HEAD, false => Method::GET, }; diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 0028be99fa2e..285ee2f59deb 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -307,10 +307,6 @@ impl ObjectStore for AmazonS3 { self.client.get_opts(location, options).await } - async fn head(&self, location: &Path) -> Result { - self.client.head(location).await - } - async fn delete(&self, location: &Path) -> Result<()> { self.client.delete_request(location, &()).await } diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index cd1a3a10fcc7..f65388b61a80 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -264,15 +264,10 @@ impl GetClient for AzureClient { /// Make an Azure GET request /// /// - async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result { + async fn get_request(&self, path: &Path, options: GetOptions) -> Result { let credential = self.get_credential().await?; let url = self.config.path_url(path); - let method = match head { + let method = match options.head { true => Method::HEAD, false => Method::GET, }; diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index b210d486d9bf..9017634c42da 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -202,10 +202,6 @@ impl ObjectStore for MicrosoftAzure { self.client.get_opts(location, options).await } - async fn head(&self, location: &Path) -> Result { - self.client.head(location).await - } - async fn delete(&self, location: &Path) -> Result<()> { self.client.delete_request(location, &()).await } diff --git a/object_store/src/client/get.rs b/object_store/src/client/get.rs index 333f6fe58475..7f68b6d1225f 100644 --- a/object_store/src/client/get.rs +++ b/object_store/src/client/get.rs @@ -17,7 +17,7 @@ use crate::client::header::{header_meta, HeaderConfig}; use crate::path::Path; -use crate::{Error, GetOptions, GetResult, ObjectMeta}; +use crate::{Error, GetOptions, GetResult}; use crate::{GetResultPayload, Result}; use async_trait::async_trait; use futures::{StreamExt, TryStreamExt}; @@ -34,27 +34,20 @@ pub trait GetClient: Send + Sync + 'static { last_modified_required: true, }; - async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result; + async fn get_request(&self, path: &Path, options: GetOptions) -> Result; } /// Extension trait for [`GetClient`] that adds common retrieval functionality #[async_trait] pub trait GetClientExt { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result; - - async fn head(&self, location: &Path) -> Result; } #[async_trait] impl GetClientExt for T { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let range = options.range.clone(); - let response = self.get_request(location, options, false).await?; + let response = self.get_request(location, options).await?; let meta = header_meta(location, response.headers(), T::HEADER_CONFIG).map_err(|e| { Error::Generic { @@ -77,15 +70,4 @@ impl GetClientExt for T { meta, }) } - - async fn head(&self, location: &Path) -> Result { - let options = GetOptions::default(); - let response = self.get_request(location, options, true).await?; - header_meta(location, response.headers(), 
T::HEADER_CONFIG).map_err(|e| { - Error::Generic { - store: T::STORE, - source: Box::new(e), - } - }) - } } diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index a0a60f27a6aa..f80704b91765 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -389,16 +389,11 @@ impl GetClient for GoogleCloudStorageClient { const STORE: &'static str = STORE; /// Perform a get request - async fn get_request( - &self, - path: &Path, - options: GetOptions, - head: bool, - ) -> Result { + async fn get_request(&self, path: &Path, options: GetOptions) -> Result { let credential = self.get_credential().await?; let url = self.object_url(path); - let method = match head { + let method = match options.head { true => Method::HEAD, false => Method::GET, }; @@ -604,10 +599,6 @@ impl ObjectStore for GoogleCloudStorage { self.client.get_opts(location, options).await } - async fn head(&self, location: &Path) -> Result { - self.client.head(location).await - } - async fn delete(&self, location: &Path) -> Result<()> { self.client.delete_request(location).await } diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index 0bd2e5639cb5..b2a6ac0aa34a 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -288,14 +288,9 @@ impl GetClient for Client { last_modified_required: false, }; - async fn get_request( - &self, - location: &Path, - options: GetOptions, - head: bool, - ) -> Result { - let url = self.path_url(location); - let method = match head { + async fn get_request(&self, path: &Path, options: GetOptions) -> Result { + let url = self.path_url(path); + let method = match options.head { true => Method::HEAD, false => Method::GET, }; @@ -311,7 +306,7 @@ impl GetClient for Client { Some(StatusCode::NOT_FOUND | StatusCode::METHOD_NOT_ALLOWED) => { crate::Error::NotFound { source: Box::new(source), - path: location.to_string(), + path: path.to_string(), } } _ => Error::Request { source }.into(), @@ -322,7 +317,7 @@ impl GetClient for Client { if has_range && res.status() != StatusCode::PARTIAL_CONTENT { return Err(crate::Error::NotSupported { source: Box::new(Error::RangeNotSupported { - href: location.to_string(), + href: path.to_string(), }), }); } diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index e9ed5902d8f5..6ffb62358941 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -118,10 +118,6 @@ impl ObjectStore for HttpStore { self.client.get_opts(location, options).await } - async fn head(&self, location: &Path) -> Result { - self.client.head(location).await - } - async fn delete(&self, location: &Path) -> Result<()> { self.client.delete(location).await } diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 68e785b3a31e..ff0a46533dda 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -410,7 +410,13 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { } /// Return the metadata for the specified location - async fn head(&self, location: &Path) -> Result; + async fn head(&self, location: &Path) -> Result { + let options = GetOptions { + head: true, + ..Default::default() + }; + Ok(self.get_opts(location, options).await?.meta) + } /// Delete the object at the specified location. 
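For illustration, a minimal sketch (using the in-memory store; values are arbitrary) of what the new flag means for callers: a metadata-only request can be expressed directly through get_opts, which is exactly what the default head implementation above does:

use object_store::{memory::InMemory, path::Path, GetOptions, ObjectStore};

#[tokio::main]
async fn main() -> object_store::Result<()> {
    let store = InMemory::new();
    let path = Path::from("dir/file.txt");
    store.put(&path, "hello".into()).await?;

    // head: true requests metadata only, without transferring the payload.
    let options = GetOptions {
        head: true,
        ..Default::default()
    };
    let meta = store.get_opts(&path, options).await?.meta;
    assert_eq!(meta.size, 5);
    assert_eq!(meta, store.head(&path).await?);
    Ok(())
}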
async fn delete(&self, location: &Path) -> Result<()>; @@ -716,6 +722,10 @@ pub struct GetOptions { /// /// pub range: Option>, + /// Request transfer of no content + /// + /// + pub head: bool, } impl GetOptions { diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 69da170b0872..3ed63a410815 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -419,35 +419,6 @@ impl ObjectStore for LocalFileSystem { .await } - async fn head(&self, location: &Path) -> Result { - let path = self.config.path_to_filesystem(location)?; - let location = location.clone(); - - maybe_spawn_blocking(move || { - let metadata = match metadata(&path) { - Err(e) => Err(match e.kind() { - ErrorKind::NotFound => Error::NotFound { - path: path.clone(), - source: e, - }, - _ => Error::Metadata { - source: e.into(), - path: location.to_string(), - }, - }), - Ok(m) => match !m.is_dir() { - true => Ok(m), - false => Err(Error::NotFound { - path, - source: io::Error::new(ErrorKind::NotFound, "is directory"), - }), - }, - }?; - convert_metadata(metadata, location) - }) - .await - } - async fn delete(&self, location: &Path) -> Result<()> { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || match std::fs::remove_file(&path) { @@ -1604,15 +1575,15 @@ mod unix_test { let path = root.path().join(filename); unistd::mkfifo(&path, stat::Mode::S_IRWXU).unwrap(); - let location = Path::from(filename); - integration.head(&location).await.unwrap(); - // Need to open read and write side in parallel let spawned = tokio::task::spawn_blocking(|| { - OpenOptions::new().write(true).open(path).unwrap(); + OpenOptions::new().write(true).open(path).unwrap() }); + let location = Path::from(filename); + integration.head(&location).await.unwrap(); integration.get(&location).await.unwrap(); + spawned.await.unwrap(); } } From 57cd0945db863059d30d31a890b692a6844038fd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 16 Oct 2023 10:56:10 +0100 Subject: [PATCH 1273/1411] Allow opting out of request signing (#4927) (#4929) --- object_store/src/aws/client.rs | 24 +++++++++------- object_store/src/aws/credential.rs | 21 ++++++++------ object_store/src/aws/mod.rs | 44 ++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 18 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index ac07f9ab9af3..8a45a9f3ac47 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -207,6 +207,7 @@ pub struct S3Config { pub retry_config: RetryConfig, pub client_options: ClientOptions, pub sign_payload: bool, + pub skip_signature: bool, pub checksum: Option, pub copy_if_not_exists: Option, } @@ -234,8 +235,11 @@ impl S3Client { &self.config } - async fn get_credential(&self) -> Result> { - self.config.credentials.get_credential().await + async fn get_credential(&self) -> Result>> { + Ok(match self.config.skip_signature { + false => Some(self.config.credentials.get_credential().await?), + true => None, + }) } /// Make an S3 PUT request @@ -271,7 +275,7 @@ impl S3Client { let response = builder .query(query) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -299,7 +303,7 @@ impl S3Client { .request(Method::DELETE, url) .query(query) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -390,7 +394,7 @@ impl S3Client { .header(CONTENT_TYPE, 
"application/xml") .body(body) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -459,7 +463,7 @@ impl S3Client { builder .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -490,7 +494,7 @@ impl S3Client { .client .request(Method::POST, url) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -535,7 +539,7 @@ impl S3Client { .query(&[("uploadId", upload_id)]) .body(body) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -567,7 +571,7 @@ impl GetClient for S3Client { let response = builder .with_get_options(options) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, @@ -621,7 +625,7 @@ impl ListClient for S3Client { .request(Method::GET, &url) .query(&query) .with_aws_sigv4( - credential.as_ref(), + credential.as_deref(), &self.config.region, "s3", self.config.sign_payload, diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index e27b71f7c411..e0c5de5fe784 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -291,7 +291,7 @@ pub trait CredentialExt { /// Sign a request fn with_aws_sigv4( self, - credential: &AwsCredential, + credential: Option<&AwsCredential>, region: &str, service: &str, sign_payload: bool, @@ -302,20 +302,25 @@ pub trait CredentialExt { impl CredentialExt for RequestBuilder { fn with_aws_sigv4( self, - credential: &AwsCredential, + credential: Option<&AwsCredential>, region: &str, service: &str, sign_payload: bool, payload_sha256: Option<&[u8]>, ) -> Self { - let (client, request) = self.build_split(); - let mut request = request.expect("request valid"); + match credential { + Some(credential) => { + let (client, request) = self.build_split(); + let mut request = request.expect("request valid"); - AwsAuthorizer::new(credential, service, region) - .with_sign_payload(sign_payload) - .authorize(&mut request, payload_sha256); + AwsAuthorizer::new(credential, service, region) + .with_sign_payload(sign_payload) + .authorize(&mut request, payload_sha256); - Self::from_parts(client, request) + Self::from_parts(client, request) + } + None => self, + } } } diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 285ee2f59deb..70170a3cf48a 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -448,6 +448,8 @@ pub struct AmazonS3Builder { client_options: ClientOptions, /// Credentials credentials: Option, + /// Skip signing requests + skip_signature: ConfigValue, /// Copy if not exists copy_if_not_exists: Option>, } @@ -586,6 +588,9 @@ pub enum AmazonS3ConfigKey { /// See [`S3CopyIfNotExists`] CopyIfNotExists, + /// Skip signing request + SkipSignature, + /// Client options Client(ClientConfigKey), } @@ -608,6 +613,7 @@ impl AsRef for AmazonS3ConfigKey { Self::ContainerCredentialsRelativeUri => { "aws_container_credentials_relative_uri" } + Self::SkipSignature => "aws_skip_signature", Self::CopyIfNotExists => "copy_if_not_exists", Self::Client(opt) => opt.as_ref(), } @@ -642,6 +648,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_container_credentials_relative_uri" => { Ok(Self::ContainerCredentialsRelativeUri) } + "aws_skip_signature" | "skip_signature" => Ok(Self::SkipSignature), "copy_if_not_exists" => 
Ok(Self::CopyIfNotExists), // Backwards compatibility "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), @@ -753,6 +760,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::Client(key) => { self.client_options = self.client_options.with_config(key, value) } + AmazonS3ConfigKey::SkipSignature => self.skip_signature.parse(value), AmazonS3ConfigKey::CopyIfNotExists => { self.copy_if_not_exists = Some(ConfigValue::Deferred(value.into())) } @@ -823,6 +831,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { self.container_credentials_relative_uri.clone() } + AmazonS3ConfigKey::SkipSignature => Some(self.skip_signature.to_string()), AmazonS3ConfigKey::CopyIfNotExists => { self.copy_if_not_exists.as_ref().map(ToString::to_string) } @@ -977,6 +986,14 @@ impl AmazonS3Builder { self } + /// If enabled, [`AmazonS3`] will not fetch credentials and will not sign requests + /// + /// This can be useful when interacting with public S3 buckets that deny authorized requests + pub fn with_skip_signature(mut self, skip_signature: bool) -> Self { + self.skip_signature = skip_signature.into(); + self + } + /// Sets the [checksum algorithm] which has to be used for object integrity check during upload. /// /// [checksum algorithm]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html @@ -1146,6 +1163,7 @@ impl AmazonS3Builder { retry_config: self.retry_config, client_options: self.client_options, sign_payload: !self.unsigned_payload.get()?, + skip_signature: self.skip_signature.get()?, checksum, copy_if_not_exists, }; @@ -1505,4 +1523,30 @@ mod s3_resolve_bucket_region_tests { assert!(result.is_err()); } + + #[tokio::test] + #[ignore = "Tests shouldn't call use remote services by default"] + async fn test_disable_creds() { + // https://registry.opendata.aws/daylight-osm/ + let v1 = AmazonS3Builder::new() + .with_bucket_name("daylight-map-distribution") + .with_region("us-west-1") + .with_access_key_id("local") + .with_secret_access_key("development") + .build() + .unwrap(); + + let prefix = Path::from("release"); + + v1.list_with_delimiter(Some(&prefix)).await.unwrap_err(); + + let v2 = AmazonS3Builder::new() + .with_bucket_name("daylight-map-distribution") + .with_region("us-west-1") + .with_skip_signature(true) + .build() + .unwrap(); + + v2.list_with_delimiter(Some(&prefix)).await.unwrap(); + } } From 31bc84c91e7d6c509443f6e73bda0df32a0a5cba Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 16 Oct 2023 10:56:25 +0100 Subject: [PATCH 1274/1411] Default connection and request timeouts of 5 seconds (#4928) * Default connection and request timeouts of 5 seconds * Clippy * Allow disabling timeouts --- object_store/src/aws/mod.rs | 3 +- object_store/src/azure/mod.rs | 2 +- object_store/src/client/mod.rs | 66 ++++++++++++++++++++++++++++++++-- object_store/src/gcp/mod.rs | 2 +- 4 files changed, 67 insertions(+), 6 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 70170a3cf48a..3ddce08002c4 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -1130,8 +1130,7 @@ impl AmazonS3Builder { Arc::new(TokenCredentialProvider::new( token, - // The instance metadata endpoint is access over HTTP - self.client_options.clone().with_allow_http(true).client()?, + self.client_options.metadata_client()?, self.retry_config.clone(), )) as _ }; diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 
9017634c42da..190b73bf9490 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -1070,7 +1070,7 @@ impl MicrosoftAzureBuilder { ); Arc::new(TokenCredentialProvider::new( msi_credential, - self.client_options.clone().with_allow_http(true).client()?, + self.client_options.metadata_client()?, self.retry_config.clone(), )) as _ }; diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index ee9d62a44f0c..137da2b37594 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -166,7 +166,7 @@ impl FromStr for ClientConfigKey { } /// HTTP client configuration for remote object stores -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone)] pub struct ClientOptions { user_agent: Option>, content_type_map: HashMap, @@ -188,6 +188,35 @@ pub struct ClientOptions { http2_only: ConfigValue, } +impl Default for ClientOptions { + fn default() -> Self { + // Defaults based on + // + // + // Which recommend a connection timeout of 3.1s and a request timeout of 2s + Self { + user_agent: None, + content_type_map: Default::default(), + default_content_type: None, + default_headers: None, + proxy_url: None, + proxy_ca_certificate: None, + proxy_excludes: None, + allow_http: Default::default(), + allow_insecure: Default::default(), + timeout: Some(Duration::from_secs(5).into()), + connect_timeout: Some(Duration::from_secs(5).into()), + pool_idle_timeout: None, + pool_max_idle_per_host: None, + http2_keep_alive_interval: None, + http2_keep_alive_timeout: None, + http2_keep_alive_while_idle: Default::default(), + http1_only: Default::default(), + http2_only: Default::default(), + } + } +} + impl ClientOptions { /// Create a new [`ClientOptions`] with default values pub fn new() -> Self { @@ -367,17 +396,37 @@ impl ClientOptions { /// /// The timeout is applied from when the request starts connecting until the /// response body has finished + /// + /// Default is 5 seconds pub fn with_timeout(mut self, timeout: Duration) -> Self { self.timeout = Some(ConfigValue::Parsed(timeout)); self } + /// Disables the request timeout + /// + /// See [`Self::with_timeout`] + pub fn with_timeout_disabled(mut self) -> Self { + self.timeout = None; + self + } + /// Set a timeout for only the connect phase of a Client + /// + /// Default is 5 seconds pub fn with_connect_timeout(mut self, timeout: Duration) -> Self { self.connect_timeout = Some(ConfigValue::Parsed(timeout)); self } + /// Disables the connection timeout + /// + /// See [`Self::with_connect_timeout`] + pub fn with_connect_timeout_disabled(mut self) -> Self { + self.timeout = None; + self + } + /// Set the pool max idle timeout /// /// This is the length of time an idle connection will be kept alive @@ -444,7 +493,20 @@ impl ClientOptions { } } - pub(crate) fn client(&self) -> super::Result { + /// Create a [`Client`] with overrides optimised for metadata endpoint access + /// + /// In particular: + /// * Allows HTTP as metadata endpoints do not use TLS + /// * Configures a low connection timeout to provide quick feedback if not present + #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] + pub(crate) fn metadata_client(&self) -> Result { + self.clone() + .with_allow_http(true) + .with_connect_timeout(Duration::from_secs(1)) + .client() + } + + pub(crate) fn client(&self) -> Result { let mut builder = ClientBuilder::new(); match &self.user_agent { diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index f80704b91765..f8a16310dd1e 100644 --- 
a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -1071,7 +1071,7 @@ impl GoogleCloudStorageBuilder { } else { Arc::new(TokenCredentialProvider::new( InstanceCredentialProvider::new(audience), - self.client_options.clone().with_allow_http(true).client()?, + self.client_options.metadata_client()?, self.retry_config.clone(), )) as _ }; From 4a23ab93336fbdbc96b9e9f29fe46c44e40b57d6 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Mon, 16 Oct 2023 15:16:45 +0200 Subject: [PATCH 1275/1411] Update pyo3 requirement from 0.19 to 0.20 (#4941) Updates the requirements on [pyo3](https://github.com/pyo3/pyo3) to permit the latest version. - [Release notes](https://github.com/pyo3/pyo3/releases) - [Changelog](https://github.com/PyO3/pyo3/blob/main/CHANGELOG.md) - [Commits](https://github.com/pyo3/pyo3/compare/v0.19.0...v0.20.0) --- updated-dependencies: - dependency-name: pyo3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-pyarrow-integration-testing/Cargo.toml | 2 +- arrow-pyarrow-integration-testing/pyproject.toml | 2 +- arrow/Cargo.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 50987b03ca9e..8c60c086c29a 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -34,4 +34,4 @@ crate-type = ["cdylib"] [dependencies] arrow = { path = "../arrow", features = ["pyarrow"] } -pyo3 = { version = "0.19", features = ["extension-module"] } +pyo3 = { version = "0.20", features = ["extension-module"] } diff --git a/arrow-pyarrow-integration-testing/pyproject.toml b/arrow-pyarrow-integration-testing/pyproject.toml index d75f8de1ac4c..d85db24c2e18 100644 --- a/arrow-pyarrow-integration-testing/pyproject.toml +++ b/arrow-pyarrow-integration-testing/pyproject.toml @@ -16,7 +16,7 @@ # under the License. 
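
Returning to the `ClientOptions` changes a few hunks above: connect and request timeouts now default to 5 seconds, and the builder methods added in that diff let callers raise them or opt out entirely. A hedged sketch using only the names taken from the diff (not verified against a released API):

use std::time::Duration;
use object_store::ClientOptions;

fn timeouts_example() -> (ClientOptions, ClientOptions) {
    // Raise the defaults for slow links...
    let relaxed = ClientOptions::new()
        .with_timeout(Duration::from_secs(30))
        .with_connect_timeout(Duration::from_secs(10));

    // ...or restore the previous unlimited behaviour
    let unlimited = ClientOptions::new()
        .with_timeout_disabled()
        .with_connect_timeout_disabled();

    (relaxed, unlimited)
}
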
[build-system] -requires = ["maturin"] +requires = ["maturin>=1.0,<2.0"] build-backend = "maturin" dependencies = ["pyarrow>=1"] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 8abb4f73a384..37f03a05b3fa 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -60,7 +60,7 @@ arrow-select = { workspace = true } arrow-string = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } -pyo3 = { version = "0.19", default-features = false, optional = true } +pyo3 = { version = "0.20", default-features = false, optional = true } [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"] From 69c937565f7404dc1576bc22d153ce79bf107cfb Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 16 Oct 2023 14:18:53 +0100 Subject: [PATCH 1276/1411] Support service_account in ApplicationDefaultCredentials and Use SelfSignedJwt (#4926) * Support service_account in ApplicationDefaultCredentials * Use SelfSignedJwt for Service Accounts * Update CI * Apply suggestions from code review Co-authored-by: Marco Neumann --------- Co-authored-by: Marco Neumann --- .github/workflows/object_store.yml | 2 +- object_store/src/gcp/credential.rs | 219 +++++++++++------------------ object_store/src/gcp/mod.rs | 45 +++--- 3 files changed, 108 insertions(+), 158 deletions(-) diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index c28f8037a307..1b991e33c097 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -126,7 +126,7 @@ jobs: # Give the container a moment to start up prior to configuring it sleep 1 curl -v -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" - echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > "$GOOGLE_SERVICE_ACCOUNT" + echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": "", "private_key_id": ""}' > "$GOOGLE_SERVICE_ACCOUNT" - name: Setup WebDav run: docker run -d -p 8080:80 rclone/rclone serve webdav /data --addr :80 diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index ad21c33b8b9d..87f8e244f21c 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -17,10 +17,8 @@ use crate::client::retry::RetryExt; use crate::client::token::TemporaryToken; -use crate::client::{TokenCredentialProvider, TokenProvider}; -use crate::gcp::credential::Error::UnsupportedCredentialsType; -use crate::gcp::{GcpCredentialProvider, STORE}; -use crate::ClientOptions; +use crate::client::TokenProvider; +use crate::gcp::STORE; use crate::RetryConfig; use async_trait::async_trait; use base64::prelude::BASE64_URL_SAFE_NO_PAD; @@ -28,6 +26,7 @@ use base64::Engine; use futures::TryFutureExt; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; +use serde::Deserialize; use snafu::{ResultExt, Snafu}; use std::env; use std::fs::File; @@ -37,6 +36,10 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use tracing::info; +pub const DEFAULT_SCOPE: &str = "https://www.googleapis.com/auth/devstorage.full_control"; + +pub const DEFAULT_GCS_BASE_URL: &str = "https://storage.googleapis.com"; + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("Unable to open service account file from {}: {}", path.display(), source))] @@ -68,9 +71,6 @@ pub enum 
Error { #[snafu(display("Error getting token response body: {}", source))] TokenResponseBody { source: reqwest::Error }, - - #[snafu(display("Unsupported ApplicationCredentials type: {}", type_))] - UnsupportedCredentialsType { type_: String }, } impl From for crate::Error { @@ -92,48 +92,48 @@ pub struct GcpCredential { pub type Result = std::result::Result; #[derive(Debug, Default, serde::Serialize)] -pub struct JwtHeader { +pub struct JwtHeader<'a> { /// The type of JWS: it can only be "JWT" here /// /// Defined in [RFC7515#4.1.9](https://tools.ietf.org/html/rfc7515#section-4.1.9). #[serde(skip_serializing_if = "Option::is_none")] - pub typ: Option, + pub typ: Option<&'a str>, /// The algorithm used /// /// Defined in [RFC7515#4.1.1](https://tools.ietf.org/html/rfc7515#section-4.1.1). - pub alg: String, + pub alg: &'a str, /// Content type /// /// Defined in [RFC7519#5.2](https://tools.ietf.org/html/rfc7519#section-5.2). #[serde(skip_serializing_if = "Option::is_none")] - pub cty: Option, + pub cty: Option<&'a str>, /// JSON Key URL /// /// Defined in [RFC7515#4.1.2](https://tools.ietf.org/html/rfc7515#section-4.1.2). #[serde(skip_serializing_if = "Option::is_none")] - pub jku: Option, + pub jku: Option<&'a str>, /// Key ID /// /// Defined in [RFC7515#4.1.4](https://tools.ietf.org/html/rfc7515#section-4.1.4). #[serde(skip_serializing_if = "Option::is_none")] - pub kid: Option, + pub kid: Option<&'a str>, /// X.509 URL /// /// Defined in [RFC7515#4.1.5](https://tools.ietf.org/html/rfc7515#section-4.1.5). #[serde(skip_serializing_if = "Option::is_none")] - pub x5u: Option, + pub x5u: Option<&'a str>, /// X.509 certificate thumbprint /// /// Defined in [RFC7515#4.1.7](https://tools.ietf.org/html/rfc7515#section-4.1.7). #[serde(skip_serializing_if = "Option::is_none")] - pub x5t: Option, + pub x5t: Option<&'a str>, } #[derive(serde::Serialize)] struct TokenClaims<'a> { iss: &'a str, + sub: &'a str, scope: &'a str, - aud: &'a str, exp: u64, iat: u64, } @@ -144,28 +144,32 @@ struct TokenResponse { expires_in: u64, } -/// Encapsulates the logic to perform an OAuth token challenge +/// Self-signed JWT (JSON Web Token). 
+/// +/// # References +/// - #[derive(Debug)] -pub struct OAuthProvider { +pub struct SelfSignedJwt { issuer: String, scope: String, - audience: String, key_pair: RsaKeyPair, jwt_header: String, random: ring::rand::SystemRandom, } -impl OAuthProvider { - /// Create a new [`OAuthProvider`] +impl SelfSignedJwt { + /// Create a new [`SelfSignedJwt`] pub fn new( + key_id: String, issuer: String, private_key_pem: String, scope: String, - audience: String, ) -> Result { let key_pair = decode_first_rsa_key(private_key_pem)?; let jwt_header = b64_encode_obj(&JwtHeader { - alg: "RS256".to_string(), + alg: "RS256", + typ: Some("JWT"), + kid: Some(&key_id), ..Default::default() })?; @@ -173,7 +177,6 @@ impl OAuthProvider { issuer, key_pair, scope, - audience, jwt_header, random: ring::rand::SystemRandom::new(), }) @@ -181,24 +184,24 @@ impl OAuthProvider { } #[async_trait] -impl TokenProvider for OAuthProvider { +impl TokenProvider for SelfSignedJwt { type Credential = GcpCredential; /// Fetch a fresh token async fn fetch_token( &self, - client: &Client, - retry: &RetryConfig, + _client: &Client, + _retry: &RetryConfig, ) -> crate::Result>> { let now = seconds_since_epoch(); let exp = now + 3600; let claims = TokenClaims { iss: &self.issuer, + sub: &self.issuer, scope: &self.scope, - aud: &self.audience, - exp, iat: now, + exp, }; let claim_str = b64_encode_obj(&claims)?; @@ -214,28 +217,11 @@ impl TokenProvider for OAuthProvider { .context(SignSnafu)?; let signature = BASE64_URL_SAFE_NO_PAD.encode(sig_bytes); - let jwt = [message, signature].join("."); - - let body = [ - ("grant_type", "urn:ietf:params:oauth:grant-type:jwt-bearer"), - ("assertion", &jwt), - ]; - - let response: TokenResponse = client - .request(Method::POST, &self.audience) - .form(&body) - .send_retry(retry) - .await - .context(TokenRequestSnafu)? - .json() - .await - .context(TokenResponseBodySnafu)?; + let bearer = [message, signature].join("."); Ok(TemporaryToken { - token: Arc::new(GcpCredential { - bearer: response.access_token, - }), - expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), + token: Arc::new(GcpCredential { bearer }), + expiry: Some(Instant::now() + Duration::from_secs(3600)), }) } } @@ -259,29 +245,24 @@ pub struct ServiceAccountCredentials { /// The private key in RSA format. pub private_key: String, + /// The private key ID + pub private_key_id: String, + /// The email address associated with the service account. pub client_email: String, /// Base URL for GCS - #[serde(default = "default_gcs_base_url")] - pub gcs_base_url: String, + #[serde(default)] + pub gcs_base_url: Option, /// Disable oauth and use empty tokens. - #[serde(default = "default_disable_oauth")] + #[serde(default)] pub disable_oauth: bool, } -pub fn default_gcs_base_url() -> String { - "https://storage.googleapis.com".to_owned() -} - -pub fn default_disable_oauth() -> bool { - false -} - impl ServiceAccountCredentials { /// Create a new [`ServiceAccountCredentials`] from a file. - pub fn from_file>(path: P) -> Result { + pub fn from_file>(path: P) -> Result { read_credentials_file(path) } @@ -290,17 +271,20 @@ impl ServiceAccountCredentials { serde_json::from_str(key).context(DecodeCredentialsSnafu) } - /// Create an [`OAuthProvider`] from this credentials struct. - pub fn oauth_provider( - self, - scope: &str, - audience: &str, - ) -> crate::Result { - Ok(OAuthProvider::new( + /// Create a [`SelfSignedJwt`] from this credentials struct. 
+ /// + /// We use a scope of [`DEFAULT_SCOPE`] as opposed to an audience + /// as GCS appears to not support audience + /// + /// # References + /// - + /// - + pub fn token_provider(self) -> crate::Result { + Ok(SelfSignedJwt::new( + self.private_key_id, self.client_email, self.private_key, - scope.to_string(), - audience.to_string(), + DEFAULT_SCOPE.to_string(), )?) } } @@ -337,25 +321,13 @@ fn b64_encode_obj(obj: &T) -> Result { /// /// #[derive(Debug, Default)] -pub struct InstanceCredentialProvider { - audience: String, -} - -impl InstanceCredentialProvider { - /// Create a new [`InstanceCredentialProvider`], we need to control the client in order to enable http access so save the options. - pub fn new>(audience: T) -> Self { - Self { - audience: audience.into(), - } - } -} +pub struct InstanceCredentialProvider {} /// Make a request to the metadata server to fetch a token, using a a given hostname. async fn make_metadata_request( client: &Client, hostname: &str, retry: &RetryConfig, - audience: &str, ) -> crate::Result { let url = format!( "http://{hostname}/computeMetadata/v1/instance/service-accounts/default/token" @@ -363,7 +335,7 @@ async fn make_metadata_request( let response: TokenResponse = client .request(Method::GET, url) .header("Metadata-Flavor", "Google") - .query(&[("audience", audience)]) + .query(&[("audience", "https://www.googleapis.com/oauth2/v4/token")]) .send_retry(retry) .await .context(TokenRequestSnafu)? @@ -388,12 +360,9 @@ impl TokenProvider for InstanceCredentialProvider { const METADATA_HOST: &str = "metadata"; info!("fetching token from metadata server"); - let response = - make_metadata_request(client, METADATA_HOST, retry, &self.audience) - .or_else(|_| { - make_metadata_request(client, METADATA_IP, retry, &self.audience) - }) - .await?; + let response = make_metadata_request(client, METADATA_HOST, retry) + .or_else(|_| make_metadata_request(client, METADATA_IP, retry)) + .await?; let token = TemporaryToken { token: Arc::new(GcpCredential { bearer: response.access_token, @@ -404,62 +373,36 @@ impl TokenProvider for InstanceCredentialProvider { } } -/// ApplicationDefaultCredentials -/// -pub fn application_default_credentials( - path: Option<&str>, - client: &ClientOptions, - retry: &RetryConfig, -) -> crate::Result> { - let file = match ApplicationDefaultCredentialsFile::read(path)? { - Some(x) => x, - None => return Ok(None), - }; - - match file.type_.as_str() { - // - "authorized_user" => { - let token = AuthorizedUserCredentials { - client_id: file.client_id, - client_secret: file.client_secret, - refresh_token: file.refresh_token, - }; - - Ok(Some(Arc::new(TokenCredentialProvider::new( - token, - client.client()?, - retry.clone(), - )))) - } - type_ => Err(UnsupportedCredentialsType { - type_: type_.to_string(), - } - .into()), - } -} - /// A deserialized `application_default_credentials.json`-file. -/// +/// +/// # References +/// - +/// - #[derive(serde::Deserialize)] -struct ApplicationDefaultCredentialsFile { - #[serde(default)] - client_id: String, - #[serde(default)] - client_secret: String, - #[serde(default)] - refresh_token: String, - #[serde(rename = "type")] - type_: String, +#[serde(tag = "type")] +pub enum ApplicationDefaultCredentials { + /// Service Account. + /// + /// # References + /// - + #[serde(rename = "service_account")] + ServiceAccount(ServiceAccountCredentials), + /// Authorized user via "gcloud CLI Integration". 
+ /// + /// # References + /// - + #[serde(rename = "authorized_user")] + AuthorizedUser(AuthorizedUserCredentials), } -impl ApplicationDefaultCredentialsFile { +impl ApplicationDefaultCredentials { const CREDENTIALS_PATH: &'static str = ".config/gcloud/application_default_credentials.json"; // Create a new application default credential in the following situations: // 1. a file is passed in and the type matches. // 2. without argument if the well-known configuration file is present. - fn read(path: Option<&str>) -> Result, Error> { + pub fn read(path: Option<&str>) -> Result, Error> { if let Some(path) = path { return read_credentials_file::(path).map(Some); } @@ -478,8 +421,8 @@ impl ApplicationDefaultCredentialsFile { const DEFAULT_TOKEN_GCP_URI: &str = "https://accounts.google.com/o/oauth2/token"; /// -#[derive(Debug)] -struct AuthorizedUserCredentials { +#[derive(Debug, Deserialize)] +pub struct AuthorizedUserCredentials { client_id: String, client_secret: String, refresh_token: String, diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index f8a16310dd1e..a75527fe7b9f 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -57,10 +57,7 @@ use crate::{ ObjectStore, Result, RetryConfig, }; -use credential::{ - application_default_credentials, default_gcs_base_url, InstanceCredentialProvider, - ServiceAccountCredentials, -}; +use credential::{InstanceCredentialProvider, ServiceAccountCredentials}; mod credential; @@ -68,6 +65,7 @@ const STORE: &str = "GCS"; /// [`CredentialProvider`] for [`GoogleCloudStorage`] pub type GcpCredentialProvider = Arc>; +use crate::gcp::credential::{ApplicationDefaultCredentials, DEFAULT_GCS_BASE_URL}; pub use credential::GcpCredential; #[derive(Debug, Snafu)] @@ -1034,10 +1032,8 @@ impl GoogleCloudStorageBuilder { }; // Then try to initialize from the application credentials file, or the environment. 
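
For reference, with the `ServiceAccountCredentials` struct and the tagged `ApplicationDefaultCredentials` enum shown above, a service account key file must now carry a `private_key_id` alongside the existing fields (the CI fixture earlier in this patch was updated for the same reason). A hypothetical minimal example, all values placeholders; `gcs_base_url` and `disable_oauth` remain optional:

// Hypothetical key file contents: the "type" tag is what ApplicationDefaultCredentials
// uses to select the service_account variant, the remaining fields mirror
// ServiceAccountCredentials (unknown fields are ignored when read directly).
const EXAMPLE_SERVICE_ACCOUNT_JSON: &str = r#"{
  "type": "service_account",
  "private_key_id": "some-key-id",
  "private_key": "-----BEGIN PRIVATE KEY-----\n...",
  "client_email": "service-account@example-project.iam.gserviceaccount.com"
}"#;
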
- let application_default_credentials = application_default_credentials( + let application_default_credentials = ApplicationDefaultCredentials::read( self.application_credentials_path.as_deref(), - &self.client_options, - &self.retry_config, )?; let disable_oauth = service_account_credentials @@ -1045,14 +1041,10 @@ impl GoogleCloudStorageBuilder { .map(|c| c.disable_oauth) .unwrap_or(false); - let gcs_base_url = service_account_credentials + let gcs_base_url: String = service_account_credentials .as_ref() - .map(|c| c.gcs_base_url.clone()) - .unwrap_or_else(default_gcs_base_url); - - // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes - let scope = "https://www.googleapis.com/auth/devstorage.full_control"; - let audience = "https://www.googleapis.com/oauth2/v4/token"; + .and_then(|c| c.gcs_base_url.clone()) + .unwrap_or_else(|| DEFAULT_GCS_BASE_URL.to_string()); let credentials = if let Some(credentials) = self.credentials { credentials @@ -1062,15 +1054,30 @@ impl GoogleCloudStorageBuilder { })) as _ } else if let Some(credentials) = service_account_credentials { Arc::new(TokenCredentialProvider::new( - credentials.oauth_provider(scope, audience)?, + credentials.token_provider()?, self.client_options.client()?, self.retry_config.clone(), )) as _ } else if let Some(credentials) = application_default_credentials { - credentials + match credentials { + ApplicationDefaultCredentials::AuthorizedUser(token) => { + Arc::new(TokenCredentialProvider::new( + token, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } + ApplicationDefaultCredentials::ServiceAccount(token) => { + Arc::new(TokenCredentialProvider::new( + token.token_provider()?, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } + } } else { Arc::new(TokenCredentialProvider::new( - InstanceCredentialProvider::new(audience), + InstanceCredentialProvider::default(), self.client_options.metadata_client()?, self.retry_config.clone(), )) as _ @@ -1105,7 +1112,7 @@ mod test { use super::*; - const FAKE_KEY: &str = r#"{"private_key": "private_key", "client_email":"client_email", "disable_oauth":true}"#; + const FAKE_KEY: &str = r#"{"private_key": "private_key", "private_key_id": "private_key_id", "client_email":"client_email", "disable_oauth":true}"#; const NON_EXISTENT_NAME: &str = "nonexistentname"; #[tokio::test] @@ -1117,7 +1124,7 @@ mod test { list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; - if integration.client.base_url == default_gcs_base_url() { + if integration.client.base_url == DEFAULT_GCS_BASE_URL { // Fake GCS server doesn't currently honor ifGenerationMatch // https://github.com/fsouza/fake-gcs-server/issues/994 copy_if_not_exists(&integration).await; From ce2a9580556c33261eba39a96d597db9600cc682 Mon Sep 17 00:00:00 2001 From: Haixuan Xavier Tao Date: Mon, 16 Oct 2023 23:04:55 +0800 Subject: [PATCH 1277/1411] Add `FileWriter` schema getter (#4940) * Add `FileWriter` schema getter * Make schema getter consistent with the parquet implementation --- arrow-ipc/src/writer.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 0e01e51231d6..567fa2e94171 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -23,6 +23,7 @@ use std::cmp::min; use std::collections::HashMap; use std::io::{BufWriter, Write}; +use std::sync::Arc; use flatbuffers::FlatBufferBuilder; @@ -696,7 +697,7 @@ pub 
struct FileWriter { /// IPC write options write_options: IpcWriteOptions, /// A reference to the schema, used in validating record batches - schema: Schema, + schema: SchemaRef, /// The number of bytes between each block of bytes, as an offset for random access block_offsets: usize, /// Dictionary blocks that will be written as part of the IPC footer @@ -739,7 +740,7 @@ impl FileWriter { Ok(Self { writer, write_options, - schema: schema.clone(), + schema: Arc::new(schema.clone()), block_offsets: meta + data + header_size, dictionary_blocks: vec![], record_blocks: vec![], @@ -832,6 +833,11 @@ impl FileWriter { Ok(()) } + /// Returns the arrow [`SchemaRef`] for this arrow file. + pub fn schema(&self) -> &SchemaRef { + &self.schema + } + /// Gets a reference to the underlying writer. pub fn get_ref(&self) -> &W { self.writer.get_ref() From 95b015cf7b5d57c7fe66a8feada4f48a987cb020 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Tue, 17 Oct 2023 01:52:27 +0800 Subject: [PATCH 1278/1411] Evaluate null_regex for string type in csv (now such values will be parsed as `Null` rather than `""`) (#4942) * fix: add null_regex for string type in csv * Update arrow-csv/src/reader/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-csv/src/reader/mod.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 2ba49cadc73f..1106b16bc46f 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -791,7 +791,10 @@ fn parse( } DataType::Utf8 => Ok(Arc::new( rows.iter() - .map(|row| Some(row.get(i))) + .map(|row| { + let s = row.get(i); + (!null_regex.is_null(s)).then_some(s) + }) .collect::(), ) as ArrayRef), DataType::Dictionary(key_type, value_type) @@ -1495,7 +1498,7 @@ mod tests { let schema = Arc::new(Schema::new(vec![ Field::new("c_int", DataType::UInt64, false), Field::new("c_float", DataType::Float32, true), - Field::new("c_string", DataType::Utf8, false), + Field::new("c_string", DataType::Utf8, true), Field::new("c_bool", DataType::Boolean, false), ])); @@ -1596,8 +1599,7 @@ mod tests { assert!(batch.column(0).is_null(1)); assert!(batch.column(1).is_null(2)); assert!(batch.column(3).is_null(4)); - // String won't be empty - assert!(!batch.column(2).is_null(3)); + assert!(batch.column(2).is_null(3)); assert!(!batch.column(2).is_null(4)); } @@ -2237,8 +2239,8 @@ mod tests { fn err_test(csv: &[u8], expected: &str) { let schema = Arc::new(Schema::new(vec![ - Field::new("text1", DataType::Utf8, false), - Field::new("text2", DataType::Utf8, false), + Field::new("text1", DataType::Utf8, true), + Field::new("text2", DataType::Utf8, true), ])); let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv)); let b = ReaderBuilder::new(schema) From ab87abdd69ab787fdf247cf36f04abc1fbfa6266 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 17 Oct 2023 12:39:34 +0100 Subject: [PATCH 1279/1411] Generate `ETag`s for `InMemory` and `LocalFileSystem` (#4879) (#4922) * Support ETag in InMemory (#4879) * Add LocalFileSystem Etag * Review feedback * Review feedback --- object_store/src/lib.rs | 206 ++++++++++++++++++++++++++++--------- object_store/src/local.rs | 37 ++++--- object_store/src/memory.rs | 149 ++++++++++++++++----------- 3 files changed, 268 insertions(+), 124 deletions(-) diff --git 
a/object_store/src/lib.rs b/object_store/src/lib.rs index ff0a46533dda..b79042e3cda8 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -698,12 +698,28 @@ pub struct GetOptions { /// Request will succeed if the `ObjectMeta::e_tag` matches /// otherwise returning [`Error::Precondition`] /// - /// + /// See + /// + /// Examples: + /// + /// ```text + /// If-Match: "xyzzy" + /// If-Match: "xyzzy", "r2d2xxxx", "c3piozzzz" + /// If-Match: * + /// ``` pub if_match: Option, /// Request will succeed if the `ObjectMeta::e_tag` does not match /// otherwise returning [`Error::NotModified`] /// - /// + /// See + /// + /// Examples: + /// + /// ```text + /// If-None-Match: "xyzzy" + /// If-None-Match: "xyzzy", "r2d2xxxx", "c3piozzzz" + /// If-None-Match: * + /// ``` pub if_none_match: Option, /// Request will succeed if the object has been modified since /// @@ -730,25 +746,41 @@ pub struct GetOptions { impl GetOptions { /// Returns an error if the modification conditions on this request are not satisfied - fn check_modified( - &self, - location: &Path, - last_modified: DateTime, - ) -> Result<()> { - if let Some(date) = self.if_modified_since { - if last_modified <= date { - return Err(Error::NotModified { - path: location.to_string(), - source: format!("{} >= {}", date, last_modified).into(), + /// + /// + fn check_preconditions(&self, meta: &ObjectMeta) -> Result<()> { + // The use of the invalid etag "*" means no ETag is equivalent to never matching + let etag = meta.e_tag.as_deref().unwrap_or("*"); + let last_modified = meta.last_modified; + + if let Some(m) = &self.if_match { + if m != "*" && m.split(',').map(str::trim).all(|x| x != etag) { + return Err(Error::Precondition { + path: meta.location.to_string(), + source: format!("{etag} does not match {m}").into(), }); } - } - - if let Some(date) = self.if_unmodified_since { + } else if let Some(date) = self.if_unmodified_since { if last_modified > date { return Err(Error::Precondition { - path: location.to_string(), - source: format!("{} < {}", date, last_modified).into(), + path: meta.location.to_string(), + source: format!("{date} < {last_modified}").into(), + }); + } + } + + if let Some(m) = &self.if_none_match { + if m == "*" || m.split(',').map(str::trim).any(|x| x == etag) { + return Err(Error::NotModified { + path: meta.location.to_string(), + source: format!("{etag} matches {m}").into(), + }); + } + } else if let Some(date) = self.if_modified_since { + if last_modified <= date { + return Err(Error::NotModified { + path: meta.location.to_string(), + source: format!("{date} >= {last_modified}").into(), }); } } @@ -952,6 +984,7 @@ mod test_util { mod tests { use super::*; use crate::test_util::flatten_list_stream; + use chrono::TimeZone; use rand::{thread_rng, Rng}; use tokio::io::AsyncWriteExt; @@ -1359,33 +1392,32 @@ mod tests { Err(e) => panic!("{e}"), } - if let Some(tag) = meta.e_tag { - let options = GetOptions { - if_match: Some(tag.clone()), - ..GetOptions::default() - }; - storage.get_opts(&path, options).await.unwrap(); - - let options = GetOptions { - if_match: Some("invalid".to_string()), - ..GetOptions::default() - }; - let err = storage.get_opts(&path, options).await.unwrap_err(); - assert!(matches!(err, Error::Precondition { .. }), "{err}"); - - let options = GetOptions { - if_none_match: Some(tag.clone()), - ..GetOptions::default() - }; - let err = storage.get_opts(&path, options).await.unwrap_err(); - assert!(matches!(err, Error::NotModified { .. 
}), "{err}"); - - let options = GetOptions { - if_none_match: Some("invalid".to_string()), - ..GetOptions::default() - }; - storage.get_opts(&path, options).await.unwrap(); - } + let tag = meta.e_tag.unwrap(); + let options = GetOptions { + if_match: Some(tag.clone()), + ..GetOptions::default() + }; + storage.get_opts(&path, options).await.unwrap(); + + let options = GetOptions { + if_match: Some("invalid".to_string()), + ..GetOptions::default() + }; + let err = storage.get_opts(&path, options).await.unwrap_err(); + assert!(matches!(err, Error::Precondition { .. }), "{err}"); + + let options = GetOptions { + if_none_match: Some(tag.clone()), + ..GetOptions::default() + }; + let err = storage.get_opts(&path, options).await.unwrap_err(); + assert!(matches!(err, Error::NotModified { .. }), "{err}"); + + let options = GetOptions { + if_none_match: Some("invalid".to_string()), + ..GetOptions::default() + }; + storage.get_opts(&path, options).await.unwrap(); } /// Returns a chunk of length `chunk_length` @@ -1697,8 +1729,86 @@ mod tests { assert!(stream.next().await.is_none()); } - // Tests TODO: - // GET nonexisting location (in_memory/file) - // DELETE nonexisting location - // PUT overwriting + #[test] + fn test_preconditions() { + let mut meta = ObjectMeta { + location: Path::from("test"), + last_modified: Utc.timestamp_nanos(100), + size: 100, + e_tag: Some("123".to_string()), + }; + + let mut options = GetOptions::default(); + options.check_preconditions(&meta).unwrap(); + + options.if_modified_since = Some(Utc.timestamp_nanos(50)); + options.check_preconditions(&meta).unwrap(); + + options.if_modified_since = Some(Utc.timestamp_nanos(100)); + options.check_preconditions(&meta).unwrap_err(); + + options.if_modified_since = Some(Utc.timestamp_nanos(101)); + options.check_preconditions(&meta).unwrap_err(); + + options = GetOptions::default(); + + options.if_unmodified_since = Some(Utc.timestamp_nanos(50)); + options.check_preconditions(&meta).unwrap_err(); + + options.if_unmodified_since = Some(Utc.timestamp_nanos(100)); + options.check_preconditions(&meta).unwrap(); + + options.if_unmodified_since = Some(Utc.timestamp_nanos(101)); + options.check_preconditions(&meta).unwrap(); + + options = GetOptions::default(); + + options.if_match = Some("123".to_string()); + options.check_preconditions(&meta).unwrap(); + + options.if_match = Some("123,354".to_string()); + options.check_preconditions(&meta).unwrap(); + + options.if_match = Some("354, 123,".to_string()); + options.check_preconditions(&meta).unwrap(); + + options.if_match = Some("354".to_string()); + options.check_preconditions(&meta).unwrap_err(); + + options.if_match = Some("*".to_string()); + options.check_preconditions(&meta).unwrap(); + + // If-Match takes precedence + options.if_unmodified_since = Some(Utc.timestamp_nanos(200)); + options.check_preconditions(&meta).unwrap(); + + options = GetOptions::default(); + + options.if_none_match = Some("123".to_string()); + options.check_preconditions(&meta).unwrap_err(); + + options.if_none_match = Some("*".to_string()); + options.check_preconditions(&meta).unwrap_err(); + + options.if_none_match = Some("1232".to_string()); + options.check_preconditions(&meta).unwrap(); + + options.if_none_match = Some("23, 123".to_string()); + options.check_preconditions(&meta).unwrap_err(); + + // If-None-Match takes precedence + options.if_modified_since = Some(Utc.timestamp_nanos(10)); + options.check_preconditions(&meta).unwrap_err(); + + // Check missing ETag + meta.e_tag = None; + options = 
GetOptions::default(); + + options.if_none_match = Some("*".to_string()); // Fails if any file exists + options.check_preconditions(&meta).unwrap_err(); + + options = GetOptions::default(); + options.if_match = Some("*".to_string()); // Passes if file exists + options.check_preconditions(&meta).unwrap(); + } } diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 3ed63a410815..3d4a02a1e9e9 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -365,23 +365,12 @@ impl ObjectStore for LocalFileSystem { } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - if options.if_match.is_some() || options.if_none_match.is_some() { - return Err(super::Error::NotSupported { - source: "ETags not supported by LocalFileSystem".to_string().into(), - }); - } - let location = location.clone(); let path = self.config.path_to_filesystem(&location)?; maybe_spawn_blocking(move || { let (file, metadata) = open_file(&path)?; - if options.if_unmodified_since.is_some() - || options.if_modified_since.is_some() - { - options.check_modified(&location, last_modified(&metadata))?; - } - let meta = convert_metadata(metadata, location)?; + options.check_preconditions(&meta)?; Ok(GetResult { payload: GetResultPayload::File(file, path), @@ -965,7 +954,7 @@ fn convert_entry(entry: DirEntry, location: Path) -> Result { convert_metadata(metadata, location) } -fn last_modified(metadata: &std::fs::Metadata) -> DateTime { +fn last_modified(metadata: &Metadata) -> DateTime { metadata .modified() .expect("Modified file time should be supported on this platform") @@ -977,15 +966,35 @@ fn convert_metadata(metadata: Metadata, location: Path) -> Result { let size = usize::try_from(metadata.len()).context(FileSizeOverflowedUsizeSnafu { path: location.as_ref(), })?; + let inode = get_inode(&metadata); + let mtime = last_modified.timestamp_micros(); + + // Use an ETag scheme based on that used by many popular HTTP servers + // + // + let etag = format!("{inode:x}-{mtime:x}-{size:x}"); Ok(ObjectMeta { location, last_modified, size, - e_tag: None, + e_tag: Some(etag), }) } +#[cfg(unix)] +/// We include the inode when available to yield an ETag more resistant to collisions +/// and as used by popular web servers such as [Apache](https://httpd.apache.org/docs/2.2/mod/core.html#fileetag) +fn get_inode(metadata: &Metadata) -> u64 { + std::os::unix::fs::MetadataExt::ino(metadata) +} + +#[cfg(not(unix))] +/// On platforms where an inode isn't available, fallback to just relying on size and mtime +fn get_inode(metadata: &Metadata) -> u64 { + 0 +} + /// Convert walkdir results and converts not-found errors into `None`. /// Convert broken symlinks to `None`. fn convert_walkdir_result( diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 0e229885b006..f638ed6d7a55 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -35,9 +35,6 @@ use std::sync::Arc; use std::task::Poll; use tokio::io::AsyncWrite; -type Entry = (Bytes, DateTime); -type StorageType = Arc>>; - /// A specialized `Error` for in-memory object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -80,7 +77,41 @@ impl From for super::Error { /// storage provider. 
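
Since `LocalFileSystem` and the in-memory store (below) now both report an `ETag`, the precondition handling above can be exercised uniformly. A condensed sketch along the lines of the integration test, using `InMemory` and only the options shown in this patch:

use bytes::Bytes;
use object_store::{memory::InMemory, path::Path, Error, GetOptions, ObjectStore};

async fn conditional_get() -> object_store::Result<()> {
    let store = InMemory::new();
    let path = Path::from("data/file.bin");
    store.put(&path, Bytes::from_static(b"hello")).await?;

    // Every object now carries an ETag
    let etag = store.head(&path).await?.e_tag.expect("etag should be populated");

    // If-None-Match with the current tag behaves like a cache hit
    let options = GetOptions {
        if_none_match: Some(etag.clone()),
        ..Default::default()
    };
    let err = store.get_opts(&path, options).await.unwrap_err();
    assert!(matches!(err, Error::NotModified { .. }));

    // If-Match with the current tag succeeds; a stale tag would yield Error::Precondition
    let options = GetOptions {
        if_match: Some(etag),
        ..Default::default()
    };
    store.get_opts(&path, options).await?;
    Ok(())
}
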
#[derive(Debug, Default)] pub struct InMemory { - storage: StorageType, + storage: SharedStorage, +} + +#[derive(Debug, Clone)] +struct Entry { + data: Bytes, + last_modified: DateTime, + e_tag: usize, +} + +impl Entry { + fn new(data: Bytes, last_modified: DateTime, e_tag: usize) -> Self { + Self { + data, + last_modified, + e_tag, + } + } +} + +#[derive(Debug, Default, Clone)] +struct Storage { + next_etag: usize, + map: BTreeMap, +} + +type SharedStorage = Arc>; + +impl Storage { + fn insert(&mut self, location: &Path, bytes: Bytes) { + let etag = self.next_etag; + self.next_etag += 1; + let entry = Entry::new(bytes, Utc::now(), etag); + self.map.insert(location.clone(), entry); + } } impl std::fmt::Display for InMemory { @@ -92,9 +123,7 @@ impl std::fmt::Display for InMemory { #[async_trait] impl ObjectStore for InMemory { async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.storage - .write() - .insert(location.clone(), (bytes, Utc::now())); + self.storage.write().insert(location, bytes); Ok(()) } @@ -128,33 +157,30 @@ impl ObjectStore for InMemory { Ok(Box::new(InMemoryAppend { location: location.clone(), data: Vec::::new(), - storage: StorageType::clone(&self.storage), + storage: SharedStorage::clone(&self.storage), })) } async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { - if options.if_match.is_some() || options.if_none_match.is_some() { - return Err(super::Error::NotSupported { - source: "ETags not supported by InMemory".to_string().into(), - }); - } - let (data, last_modified) = self.entry(location).await?; - options.check_modified(location, last_modified)?; + let entry = self.entry(location).await?; + let e_tag = entry.e_tag.to_string(); + let meta = ObjectMeta { location: location.clone(), - last_modified, - size: data.len(), - e_tag: None, + last_modified: entry.last_modified, + size: entry.data.len(), + e_tag: Some(e_tag), }; + options.check_preconditions(&meta)?; let (range, data) = match options.range { Some(range) => { - let len = data.len(); + let len = entry.data.len(); ensure!(range.end <= len, OutOfRangeSnafu { range, len }); ensure!(range.start <= range.end, BadRangeSnafu { range }); - (range.clone(), data.slice(range)) + (range.clone(), entry.data.slice(range)) } - None => (0..data.len(), data), + None => (0..entry.data.len(), entry.data), }; let stream = futures::stream::once(futures::future::ready(Ok(data))); @@ -170,15 +196,18 @@ impl ObjectStore for InMemory { location: &Path, ranges: &[Range], ) -> Result> { - let data = self.entry(location).await?; + let entry = self.entry(location).await?; ranges .iter() .map(|range| { let range = range.clone(); - let len = data.0.len(); - ensure!(range.end <= data.0.len(), OutOfRangeSnafu { range, len }); + let len = entry.data.len(); + ensure!( + range.end <= entry.data.len(), + OutOfRangeSnafu { range, len } + ); ensure!(range.start <= range.end, BadRangeSnafu { range }); - Ok(data.0.slice(range)) + Ok(entry.data.slice(range)) }) .collect() } @@ -188,14 +217,14 @@ impl ObjectStore for InMemory { Ok(ObjectMeta { location: location.clone(), - last_modified: entry.1, - size: entry.0.len(), - e_tag: None, + last_modified: entry.last_modified, + size: entry.data.len(), + e_tag: Some(entry.e_tag.to_string()), }) } async fn delete(&self, location: &Path) -> Result<()> { - self.storage.write().remove(location); + self.storage.write().map.remove(location); Ok(()) } @@ -208,6 +237,7 @@ impl ObjectStore for InMemory { let storage = self.storage.read(); let values: Vec<_> = storage + .map 
.range((prefix)..) .take_while(|(key, _)| key.as_ref().starts_with(prefix.as_ref())) .filter(|(key, _)| { @@ -219,9 +249,9 @@ impl ObjectStore for InMemory { .map(|(key, value)| { Ok(ObjectMeta { location: key.clone(), - last_modified: value.1, - size: value.0.len(), - e_tag: None, + last_modified: value.last_modified, + size: value.data.len(), + e_tag: Some(value.e_tag.to_string()), }) }) .collect(); @@ -241,7 +271,7 @@ impl ObjectStore for InMemory { // Only objects in this base level should be returned in the // response. Otherwise, we just collect the common prefixes. let mut objects = vec![]; - for (k, v) in self.storage.read().range((prefix)..) { + for (k, v) in self.storage.read().map.range((prefix)..) { if !k.as_ref().starts_with(prefix.as_ref()) { break; } @@ -263,9 +293,9 @@ impl ObjectStore for InMemory { } else { let object = ObjectMeta { location: k.clone(), - last_modified: v.1, - size: v.0.len(), - e_tag: None, + last_modified: v.last_modified, + size: v.data.len(), + e_tag: Some(v.e_tag.to_string()), }; objects.push(object); } @@ -278,23 +308,21 @@ impl ObjectStore for InMemory { } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - let data = self.entry(from).await?; - self.storage - .write() - .insert(to.clone(), (data.0, Utc::now())); + let entry = self.entry(from).await?; + self.storage.write().insert(to, entry.data); Ok(()) } async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - let data = self.entry(from).await?; + let entry = self.entry(from).await?; let mut storage = self.storage.write(); - if storage.contains_key(to) { + if storage.map.contains_key(to) { return Err(Error::AlreadyExists { path: to.to_string(), } .into()); } - storage.insert(to.clone(), (data.0, Utc::now())); + storage.insert(to, entry.data); Ok(()) } } @@ -319,9 +347,10 @@ impl InMemory { self.fork() } - async fn entry(&self, location: &Path) -> Result<(Bytes, DateTime)> { + async fn entry(&self, location: &Path) -> Result { let storage = self.storage.read(); let value = storage + .map .get(location) .cloned() .context(NoDataInMemorySnafu { @@ -335,7 +364,7 @@ impl InMemory { struct InMemoryUpload { location: Path, data: Vec, - storage: StorageType, + storage: Arc>, } impl AsyncWrite for InMemoryUpload { @@ -343,7 +372,7 @@ impl AsyncWrite for InMemoryUpload { mut self: Pin<&mut Self>, _cx: &mut std::task::Context<'_>, buf: &[u8], - ) -> std::task::Poll> { + ) -> Poll> { self.data.extend_from_slice(buf); Poll::Ready(Ok(buf.len())) } @@ -351,18 +380,16 @@ impl AsyncWrite for InMemoryUpload { fn poll_flush( self: Pin<&mut Self>, _cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { Poll::Ready(Ok(())) } fn poll_shutdown( mut self: Pin<&mut Self>, _cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { let data = Bytes::from(std::mem::take(&mut self.data)); - self.storage - .write() - .insert(self.location.clone(), (data, Utc::now())); + self.storage.write().insert(&self.location, data); Poll::Ready(Ok(())) } } @@ -370,7 +397,7 @@ impl AsyncWrite for InMemoryUpload { struct InMemoryAppend { location: Path, data: Vec, - storage: StorageType, + storage: Arc>, } impl AsyncWrite for InMemoryAppend { @@ -378,7 +405,7 @@ impl AsyncWrite for InMemoryAppend { mut self: Pin<&mut Self>, _cx: &mut std::task::Context<'_>, buf: &[u8], - ) -> std::task::Poll> { + ) -> Poll> { self.data.extend_from_slice(buf); Poll::Ready(Ok(buf.len())) } @@ -386,20 +413,18 @@ impl AsyncWrite for InMemoryAppend { fn poll_flush( mut self: Pin<&mut Self>, _cx: 
&mut std::task::Context<'_>, - ) -> std::task::Poll> { - let storage = StorageType::clone(&self.storage); + ) -> Poll> { + let storage = Arc::clone(&self.storage); let mut writer = storage.write(); - if let Some((bytes, _)) = writer.remove(&self.location) { + if let Some(entry) = writer.map.remove(&self.location) { let buf = std::mem::take(&mut self.data); - let concat = Bytes::from_iter(bytes.into_iter().chain(buf)); - writer.insert(self.location.clone(), (concat, Utc::now())); + let concat = Bytes::from_iter(entry.data.into_iter().chain(buf)); + writer.insert(&self.location, concat); } else { - writer.insert( - self.location.clone(), - (Bytes::from(std::mem::take(&mut self.data)), Utc::now()), - ); + let data = Bytes::from(std::mem::take(&mut self.data)); + writer.insert(&self.location, data); }; Poll::Ready(Ok(())) } From d4d11fe7a47b529429020848f2ac0f63659500d6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 17 Oct 2023 22:09:12 +0100 Subject: [PATCH 1280/1411] Assume Pages Delimit Records When Offset Index Loaded (#4921) (#4943) * Assume records not split across pages (#4921) * More test * Add PageReader::at_record_boundary * Fix flush partial --- parquet/src/arrow/array_reader/mod.rs | 2 +- parquet/src/arrow/async_reader/mod.rs | 96 ++++++++++++++++++++++++++- parquet/src/column/page.rs | 14 ++++ parquet/src/column/reader.rs | 8 +-- parquet/src/column/reader/decoder.rs | 7 ++ parquet/src/file/serialized_reader.rs | 9 +++ 6 files changed, 129 insertions(+), 7 deletions(-) diff --git a/parquet/src/arrow/array_reader/mod.rs b/parquet/src/arrow/array_reader/mod.rs index 625ac034ef47..a4ee5040590e 100644 --- a/parquet/src/arrow/array_reader/mod.rs +++ b/parquet/src/arrow/array_reader/mod.rs @@ -152,7 +152,7 @@ where Ok(records_read) } -/// Uses `record_reader` to skip up to `batch_size` records from`pages` +/// Uses `record_reader` to skip up to `batch_size` records from `pages` /// /// Returns the number of records skipped, which can be less than `batch_size` if /// pages is exhausted diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 4b3eebf2e67e..875fff4dac57 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -878,12 +878,17 @@ mod tests { use crate::file::properties::WriterProperties; use arrow::compute::kernels::cmp::eq; use arrow::error::Result as ArrowResult; + use arrow_array::builder::{ListBuilder, StringBuilder}; use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; - use arrow_array::{Array, ArrayRef, Int32Array, Int8Array, Scalar, StringArray}; - use futures::TryStreamExt; + use arrow_array::{ + Array, ArrayRef, Int32Array, Int8Array, Scalar, StringArray, UInt64Array, + }; + use arrow_schema::{DataType, Field, Schema}; + use futures::{StreamExt, TryStreamExt}; use rand::{thread_rng, Rng}; use std::sync::Mutex; + use tempfile::tempfile; #[derive(Clone)] struct TestReader { @@ -1677,4 +1682,91 @@ mod tests { assert!(sbbf.check(&"Hello")); assert!(!sbbf.check(&"Hello_Not_Exists")); } + + #[tokio::test] + async fn test_nested_skip() { + let schema = Arc::new(Schema::new(vec![ + Field::new("col_1", DataType::UInt64, false), + Field::new_list("col_2", Field::new("item", DataType::Utf8, true), true), + ])); + + // Default writer properties + let props = WriterProperties::builder() + .set_data_page_row_count_limit(256) + .set_write_batch_size(256) + .set_max_row_group_size(1024); + + // Write data + let mut file = 
tempfile().unwrap(); + let mut writer = + ArrowWriter::try_new(&mut file, schema.clone(), Some(props.build())).unwrap(); + + let mut builder = ListBuilder::new(StringBuilder::new()); + for id in 0..1024 { + match id % 3 { + 0 => builder + .append_value([Some("val_1".to_string()), Some(format!("id_{id}"))]), + 1 => builder.append_value([Some(format!("id_{id}"))]), + _ => builder.append_null(), + } + } + let refs = vec![ + Arc::new(UInt64Array::from_iter_values(0..1024)) as ArrayRef, + Arc::new(builder.finish()) as ArrayRef, + ]; + + let batch = RecordBatch::try_new(schema.clone(), refs).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + let selections = [ + RowSelection::from(vec![ + RowSelector::skip(313), + RowSelector::select(1), + RowSelector::skip(709), + RowSelector::select(1), + ]), + RowSelection::from(vec![ + RowSelector::skip(255), + RowSelector::select(1), + RowSelector::skip(767), + RowSelector::select(1), + ]), + RowSelection::from(vec![ + RowSelector::select(255), + RowSelector::skip(1), + RowSelector::select(767), + RowSelector::skip(1), + ]), + RowSelection::from(vec![ + RowSelector::skip(254), + RowSelector::select(1), + RowSelector::select(1), + RowSelector::skip(767), + RowSelector::select(1), + ]), + ]; + + for selection in selections { + let expected = selection.row_count(); + // Read data + let mut reader = ParquetRecordBatchStreamBuilder::new_with_options( + tokio::fs::File::from_std(file.try_clone().unwrap()), + ArrowReaderOptions::new().with_page_index(true), + ) + .await + .unwrap(); + + reader = reader.with_row_selection(selection); + + let mut stream = reader.build().unwrap(); + + let mut total_rows = 0; + while let Some(rb) = stream.next().await { + let rb = rb.unwrap(); + total_rows += rb.num_rows(); + } + assert_eq!(total_rows, expected); + } + } } diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index ec9af2aa271a..933e42386272 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -320,6 +320,20 @@ pub trait PageReader: Iterator> + Send { /// Skips reading the next page, returns an error if no /// column index information fn skip_next_page(&mut self) -> Result<()>; + + /// Returns `true` if the next page can be assumed to contain the start of a new record + /// + /// Prior to parquet V2 the specification was ambiguous as to whether a single record + /// could be split across multiple pages, and prior to [(#4327)] the Rust writer would do + /// this in certain situations. However, correctly interpreting the offset index relies on + /// this assumption holding [(#4943)], and so this mechanism is provided for a [`PageReader`] + /// to signal this to the calling context + /// + /// [(#4327)]: https://github.com/apache/arrow-rs/pull/4327 + /// [(#4943)]: https://github.com/apache/arrow-rs/pull/4943 + fn at_record_boundary(&mut self) -> Result { + Ok(self.peek_next_page()?.is_none()) + } } /// API for writing pages in a column chunk. 
diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 3ce00622e953..52ad4d644c95 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -269,7 +269,7 @@ where // Reached end of page, which implies records_read < remaining_records // as otherwise would have stopped reading before reaching the end assert!(records_read < remaining_records); // Sanity check - records_read += 1; + records_read += reader.flush_partial() as usize; } (records_read, levels_read) } @@ -380,7 +380,7 @@ where // Reached end of page, which implies records_read < remaining_records // as otherwise would have stopped reading before reaching the end assert!(records_read < remaining_records); // Sanity check - records_read += 1; + records_read += decoder.flush_partial() as usize; } (records_read, levels_read) @@ -491,7 +491,7 @@ where offset += bytes_read; self.has_record_delimiter = - self.page_reader.peek_next_page()?.is_none(); + self.page_reader.at_record_boundary()?; self.rep_level_decoder .as_mut() @@ -548,7 +548,7 @@ where // across multiple pages, however, the parquet writer // used to do this so we preserve backwards compatibility self.has_record_delimiter = - self.page_reader.peek_next_page()?.is_none(); + self.page_reader.at_record_boundary()?; self.rep_level_decoder.as_mut().unwrap().set_data( Encoding::RLE, diff --git a/parquet/src/column/reader/decoder.rs b/parquet/src/column/reader/decoder.rs index 369b335dc98f..27ffb7637e18 100644 --- a/parquet/src/column/reader/decoder.rs +++ b/parquet/src/column/reader/decoder.rs @@ -102,6 +102,9 @@ pub trait RepetitionLevelDecoder: ColumnLevelDecoder { num_records: usize, num_levels: usize, ) -> Result<(usize, usize)>; + + /// Flush any partially read or skipped record + fn flush_partial(&mut self) -> bool; } pub trait DefinitionLevelDecoder: ColumnLevelDecoder { @@ -519,6 +522,10 @@ impl RepetitionLevelDecoder for RepetitionLevelDecoderImpl { } Ok((total_records_read, total_levels_read)) } + + fn flush_partial(&mut self) -> bool { + std::mem::take(&mut self.has_partial) + } } #[cfg(test)] diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 4bc484144a81..b60d30ffea23 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -770,6 +770,15 @@ impl PageReader for SerializedPageReader { } } } + + fn at_record_boundary(&mut self) -> Result { + match &mut self.state { + SerializedPageReaderState::Values { .. } => { + Ok(self.peek_next_page()?.is_none()) + } + SerializedPageReaderState::Pages { .. 
} => Ok(true), + } + } } #[cfg(test)] From fa7a61a4b074ca4ec9bf429cc84b6c325057d96e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 17 Oct 2023 22:10:31 +0100 Subject: [PATCH 1281/1411] Remove Nested async and Fallibility from ObjectStore::list (#4930) * Remove nested async and fallibility from ObjectStore::list * Clippy * Update limit test * Update docs --- object_store/src/aws/mod.rs | 13 +- object_store/src/azure/mod.rs | 7 +- object_store/src/chunked.rs | 13 +- object_store/src/client/list.rs | 32 ++--- object_store/src/gcp/mod.rs | 7 +- object_store/src/http/mod.rs | 24 ++-- object_store/src/lib.rs | 178 +++++++++++---------------- object_store/src/limit.rs | 44 ++++--- object_store/src/local.rs | 82 +++++------- object_store/src/memory.rs | 7 +- object_store/src/prefix.rs | 17 ++- object_store/src/throttle.rs | 47 +++---- object_store/tests/get_range_file.rs | 5 +- 13 files changed, 197 insertions(+), 279 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 3ddce08002c4..d3c50861c122 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -331,19 +331,16 @@ impl ObjectStore for AmazonS3 { .boxed() } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - self.client.list(prefix).await + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.client.list(prefix) } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { - self.client.list_with_offset(prefix, offset).await + ) -> BoxStream<'_, Result> { + self.client.list_with_offset(prefix, offset) } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 190b73bf9490..2a08c6775807 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -206,11 +206,8 @@ impl ObjectStore for MicrosoftAzure { self.client.delete_request(location, &()).await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - self.client.list(prefix).await + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.client.list(prefix) } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/object_store/src/chunked.rs b/object_store/src/chunked.rs index 008dec679413..d3e02b412725 100644 --- a/object_store/src/chunked.rs +++ b/object_store/src/chunked.rs @@ -147,19 +147,16 @@ impl ObjectStore for ChunkedStore { self.inner.delete(location).await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - self.inner.list(prefix).await + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.inner.list(prefix) } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { - self.inner.list_with_offset(prefix, offset).await + ) -> BoxStream<'_, Result> { + self.inner.list_with_offset(prefix, offset) } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/object_store/src/client/list.rs b/object_store/src/client/list.rs index b2dbee27f14d..371894dfeb71 100644 --- a/object_store/src/client/list.rs +++ b/object_store/src/client/list.rs @@ -46,16 +46,13 @@ pub trait ListClientExt { offset: Option<&Path>, ) -> BoxStream<'_, Result>; - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>>; + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result>; - async fn 
list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>>; + ) -> BoxStream<'_, Result>; async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result; } @@ -90,31 +87,22 @@ impl ListClientExt for T { .boxed() } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - let stream = self - .list_paginated(prefix, false, None) + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.list_paginated(prefix, false, None) .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() - .boxed(); - - Ok(stream) + .boxed() } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { - let stream = self - .list_paginated(prefix, false, Some(offset)) + ) -> BoxStream<'_, Result> { + self.list_paginated(prefix, false, Some(offset)) .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() - .boxed(); - - Ok(stream) + .boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index a75527fe7b9f..513e396cbae6 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -601,11 +601,8 @@ impl ObjectStore for GoogleCloudStorage { self.client.delete_request(location).await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - self.client.list(prefix).await + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.client.list(prefix) } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index 6ffb62358941..2fd7850b6bbf 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -34,7 +34,7 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; -use futures::StreamExt; +use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; use snafu::{OptionExt, ResultExt, Snafu}; use tokio::io::AsyncWrite; @@ -122,14 +122,13 @@ impl ObjectStore for HttpStore { self.client.delete(location).await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { let prefix_len = prefix.map(|p| p.as_ref().len()).unwrap_or_default(); - let status = self.client.list(prefix, "infinity").await?; - Ok(futures::stream::iter( - status + let prefix = prefix.cloned(); + futures::stream::once(async move { + let status = self.client.list(prefix.as_ref(), "infinity").await?; + + let iter = status .response .into_iter() .filter(|r| !r.is_dir()) @@ -138,9 +137,12 @@ impl ObjectStore for HttpStore { response.object_meta(self.client.base_url()) }) // Filter out exact prefix matches - .filter_ok(move |r| r.location.as_ref().len() > prefix_len), - ) - .boxed()) + .filter_ok(move |r| r.location.as_ref().len() > prefix_len); + + Ok::<_, crate::Error>(futures::stream::iter(iter)) + }) + .try_flatten() + .boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index b79042e3cda8..9b396444fa0d 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -95,18 +95,18 @@ //! //! ``` //! # use object_store::local::LocalFileSystem; +//! # use std::sync::Arc; +//! # use object_store::{path::Path, ObjectStore}; +//! # use futures::stream::StreamExt; //! # // use LocalFileSystem for example -//! 
# fn get_object_store() -> LocalFileSystem { -//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # fn get_object_store() -> Arc { +//! # Arc::new(LocalFileSystem::new()) //! # } -//! +//! # //! # async fn example() { -//! use std::sync::Arc; -//! use object_store::{path::Path, ObjectStore}; -//! use futures::stream::StreamExt; -//! +//! # //! // create an ObjectStore -//! let object_store: Arc = Arc::new(get_object_store()); +//! let object_store: Arc = get_object_store(); //! //! // Recursively list all files below the 'data' path. //! // 1. On AWS S3 this would be the 'data/' prefix @@ -114,21 +114,12 @@ //! let prefix: Path = "data".try_into().unwrap(); //! //! // Get an `async` stream of Metadata objects: -//! let list_stream = object_store -//! .list(Some(&prefix)) -//! .await -//! .expect("Error listing files"); +//! let mut list_stream = object_store.list(Some(&prefix)); //! -//! // Print a line about each object based on its metadata -//! // using for_each from `StreamExt` trait. -//! list_stream -//! .for_each(move |meta| { -//! async { -//! let meta = meta.expect("Error listing"); -//! println!("Name: {}, size: {}", meta.location, meta.size); -//! } -//! }) -//! .await; +//! // Print a line about each object +//! while let Some(meta) = list_stream.next().await.transpose().unwrap() { +//! println!("Name: {}, size: {}", meta.location, meta.size); +//! } //! # } //! ``` //! @@ -147,19 +138,18 @@ //! from remote storage or files in the local filesystem as a stream. //! //! ``` +//! # use futures::TryStreamExt; //! # use object_store::local::LocalFileSystem; -//! # // use LocalFileSystem for example -//! # fn get_object_store() -> LocalFileSystem { -//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # use std::sync::Arc; +//! # use object_store::{path::Path, ObjectStore}; +//! # fn get_object_store() -> Arc { +//! # Arc::new(LocalFileSystem::new()) //! # } -//! +//! # //! # async fn example() { -//! use std::sync::Arc; -//! use object_store::{path::Path, ObjectStore}; -//! use futures::stream::StreamExt; -//! +//! # //! // create an ObjectStore -//! let object_store: Arc = Arc::new(get_object_store()); +//! let object_store: Arc = get_object_store(); //! //! // Retrieve a specific file //! let path: Path = "data/file01.parquet".try_into().unwrap(); @@ -171,16 +161,11 @@ //! .unwrap() //! .into_stream(); //! -//! // Count the '0's using `map` from `StreamExt` trait +//! // Count the '0's using `try_fold` from `TryStreamExt` trait //! let num_zeros = stream -//! .map(|bytes| { -//! let bytes = bytes.unwrap(); -//! bytes.iter().filter(|b| **b == 0).count() -//! }) -//! .collect::>() -//! .await -//! .into_iter() -//! .sum::(); +//! .try_fold(0, |acc, bytes| async move { +//! Ok(acc + bytes.iter().filter(|b| **b == 0).count()) +//! }).await.unwrap(); //! //! println!("Num zeros in {} is {}", path, num_zeros); //! # } @@ -196,22 +181,19 @@ //! //! ``` //! # use object_store::local::LocalFileSystem; -//! # fn get_object_store() -> LocalFileSystem { -//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # use object_store::ObjectStore; +//! # use std::sync::Arc; +//! # use bytes::Bytes; +//! # use object_store::path::Path; +//! # fn get_object_store() -> Arc { +//! # Arc::new(LocalFileSystem::new()) //! # } //! # async fn put() { -//! use object_store::ObjectStore; -//! use std::sync::Arc; -//! use bytes::Bytes; -//! use object_store::path::Path; -//! -//! let object_store: Arc = Arc::new(get_object_store()); +//! # +//! 
let object_store: Arc = get_object_store(); //! let path: Path = "data/file1".try_into().unwrap(); //! let bytes = Bytes::from_static(b"hello"); -//! object_store -//! .put(&path, bytes) -//! .await -//! .unwrap(); +//! object_store.put(&path, bytes).await.unwrap(); //! # } //! ``` //! @@ -220,22 +202,20 @@ //! //! ``` //! # use object_store::local::LocalFileSystem; -//! # fn get_object_store() -> LocalFileSystem { -//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # use object_store::ObjectStore; +//! # use std::sync::Arc; +//! # use bytes::Bytes; +//! # use tokio::io::AsyncWriteExt; +//! # use object_store::path::Path; +//! # fn get_object_store() -> Arc { +//! # Arc::new(LocalFileSystem::new()) //! # } //! # async fn multi_upload() { -//! use object_store::ObjectStore; -//! use std::sync::Arc; -//! use bytes::Bytes; -//! use tokio::io::AsyncWriteExt; -//! use object_store::path::Path; -//! -//! let object_store: Arc = Arc::new(get_object_store()); +//! # +//! let object_store: Arc = get_object_store(); //! let path: Path = "data/large_file".try_into().unwrap(); -//! let (_id, mut writer) = object_store -//! .put_multipart(&path) -//! .await -//! .unwrap(); +//! let (_id, mut writer) = object_store.put_multipart(&path).await.unwrap(); +//! //! let bytes = Bytes::from_static(b"hello"); //! writer.write_all(&bytes).await.unwrap(); //! writer.flush().await.unwrap(); @@ -439,23 +419,22 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// return Ok. If it is an error, it will be [`Error::NotFound`]. /// /// ``` + /// # use futures::{StreamExt, TryStreamExt}; /// # use object_store::local::LocalFileSystem; /// # async fn example() -> Result<(), Box> { /// # let root = tempfile::TempDir::new().unwrap(); /// # let store = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - /// use object_store::{ObjectStore, ObjectMeta}; - /// use object_store::path::Path; - /// use futures::{StreamExt, TryStreamExt}; - /// use bytes::Bytes; - /// + /// # use object_store::{ObjectStore, ObjectMeta}; + /// # use object_store::path::Path; + /// # use futures::{StreamExt, TryStreamExt}; + /// # use bytes::Bytes; + /// # /// // Create two objects /// store.put(&Path::from("foo"), Bytes::from("foo")).await?; /// store.put(&Path::from("bar"), Bytes::from("bar")).await?; /// /// // List object - /// let locations = store.list(None).await? - /// .map(|meta: Result| meta.map(|m| m.location)) - /// .boxed(); + /// let locations = store.list(None).map_ok(|m| m.location).boxed(); /// /// // Delete them /// store.delete_stream(locations).try_collect::>().await?; @@ -484,10 +463,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// `foo/bar_baz/x`. /// /// Note: the order of returned [`ObjectMeta`] is not guaranteed - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>>; + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result>; /// List all the objects with the given prefix and a location greater than `offset` /// @@ -495,18 +471,15 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// the number of network requests required /// /// Note: the order of returned [`ObjectMeta`] is not guaranteed - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { + ) -> BoxStream<'_, Result> { let offset = offset.clone(); - let stream = self - .list(prefix) - .await? 
+ self.list(prefix) .try_filter(move |f| futures::future::ready(f.location > offset)) - .boxed(); - Ok(stream) + .boxed() } /// List objects with the given prefix and an implementation specific @@ -624,19 +597,16 @@ macro_rules! as_ref_impl { self.as_ref().delete_stream(locations) } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - self.as_ref().list(prefix).await + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.as_ref().list(prefix) } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { - self.as_ref().list_with_offset(prefix, offset).await + ) -> BoxStream<'_, Result> { + self.as_ref().list_with_offset(prefix, offset) } async fn list_with_delimiter( @@ -973,7 +943,6 @@ mod test_util { ) -> Result> { storage .list(prefix) - .await? .map_ok(|meta| meta.location) .try_collect::>() .await @@ -1264,11 +1233,7 @@ mod tests { ]; for (prefix, offset) in cases { - let s = storage - .list_with_offset(prefix.as_ref(), &offset) - .await - .unwrap(); - + let s = storage.list_with_offset(prefix.as_ref(), &offset); let mut actual: Vec<_> = s.map_ok(|x| x.location).try_collect().await.unwrap(); @@ -1700,12 +1665,7 @@ mod tests { } async fn delete_fixtures(storage: &DynObjectStore) { - let paths = storage - .list(None) - .await - .unwrap() - .map_ok(|meta| meta.location) - .boxed(); + let paths = storage.list(None).map_ok(|meta| meta.location).boxed(); storage .delete_stream(paths) .try_collect::>() @@ -1714,18 +1674,18 @@ mod tests { } /// Test that the returned stream does not borrow the lifetime of Path - async fn list_store<'a, 'b>( + fn list_store<'a>( store: &'a dyn ObjectStore, - path_str: &'b str, - ) -> super::Result>> { + path_str: &str, + ) -> BoxStream<'a, Result> { let path = Path::from(path_str); - store.list(Some(&path)).await + store.list(Some(&path)) } #[tokio::test] async fn test_list_lifetimes() { let store = memory::InMemory::new(); - let mut stream = list_store(&store, "path").await.unwrap(); + let mut stream = list_store(&store, "path"); assert!(stream.next().await.is_none()); } diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index a9b8c4b05020..00cbce023c3d 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -23,7 +23,7 @@ use crate::{ }; use async_trait::async_trait; use bytes::Bytes; -use futures::Stream; +use futures::{FutureExt, Stream}; use std::io::{Error, IoSlice}; use std::ops::Range; use std::pin::Pin; @@ -147,23 +147,31 @@ impl ObjectStore for LimitStore { self.inner.delete_stream(locations) } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); - let s = self.inner.list(prefix).await?; - Ok(PermitWrapper::new(s, permit).boxed()) + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + let prefix = prefix.cloned(); + let fut = Arc::clone(&self.semaphore) + .acquire_owned() + .map(move |permit| { + let s = self.inner.list(prefix.as_ref()); + PermitWrapper::new(s, permit.unwrap()) + }); + fut.into_stream().flatten().boxed() } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { - let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); - let s = self.inner.list_with_offset(prefix, offset).await?; - Ok(PermitWrapper::new(s, permit).boxed()) + ) -> BoxStream<'_, Result> { + let prefix = prefix.cloned(); + let offset = offset.clone(); + let fut = 
Arc::clone(&self.semaphore) + .acquire_owned() + .map(move |permit| { + let s = self.inner.list_with_offset(prefix.as_ref(), &offset); + PermitWrapper::new(s, permit.unwrap()) + }); + fut.into_stream().flatten().boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { @@ -272,6 +280,8 @@ mod tests { use crate::memory::InMemory; use crate::tests::*; use crate::ObjectStore; + use futures::stream::StreamExt; + use std::pin::Pin; use std::time::Duration; use tokio::time::timeout; @@ -290,19 +300,21 @@ mod tests { let mut streams = Vec::with_capacity(max_requests); for _ in 0..max_requests { - let stream = integration.list(None).await.unwrap(); + let mut stream = integration.list(None).peekable(); + Pin::new(&mut stream).peek().await; // Ensure semaphore is acquired streams.push(stream); } let t = Duration::from_millis(20); // Expect to not be able to make another request - assert!(timeout(t, integration.list(None)).await.is_err()); + let fut = integration.list(None).collect::>(); + assert!(timeout(t, fut).await.is_err()); // Drop one of the streams streams.pop(); // Can now make another request - integration.list(None).await.unwrap(); + integration.list(None).collect::>().await; } } diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 3d4a02a1e9e9..38467c3a9e7c 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -420,14 +420,14 @@ impl ObjectStore for LocalFileSystem { .await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { let config = Arc::clone(&self.config); let root_path = match prefix { - Some(prefix) => config.path_to_filesystem(prefix)?, + Some(prefix) => match config.path_to_filesystem(prefix) { + Ok(path) => path, + Err(e) => return futures::future::ready(Err(e)).into_stream().boxed(), + }, None => self.config.root.to_file_path().unwrap(), }; @@ -457,36 +457,34 @@ impl ObjectStore for LocalFileSystem { // If no tokio context, return iterator directly as no // need to perform chunked spawn_blocking reads if tokio::runtime::Handle::try_current().is_err() { - return Ok(futures::stream::iter(s).boxed()); + return futures::stream::iter(s).boxed(); } // Otherwise list in batches of CHUNK_SIZE const CHUNK_SIZE: usize = 1024; let buffer = VecDeque::with_capacity(CHUNK_SIZE); - let stream = - futures::stream::try_unfold((s, buffer), |(mut s, mut buffer)| async move { - if buffer.is_empty() { - (s, buffer) = tokio::task::spawn_blocking(move || { - for _ in 0..CHUNK_SIZE { - match s.next() { - Some(r) => buffer.push_back(r), - None => break, - } + futures::stream::try_unfold((s, buffer), |(mut s, mut buffer)| async move { + if buffer.is_empty() { + (s, buffer) = tokio::task::spawn_blocking(move || { + for _ in 0..CHUNK_SIZE { + match s.next() { + Some(r) => buffer.push_back(r), + None => break, } - (s, buffer) - }) - .await?; - } - - match buffer.pop_front() { - Some(Err(e)) => Err(e), - Some(Ok(meta)) => Ok(Some((meta, (s, buffer)))), - None => Ok(None), - } - }); + } + (s, buffer) + }) + .await?; + } - Ok(stream.boxed()) + match buffer.pop_front() { + Some(Err(e)) => Err(e), + Some(Ok(meta)) => Ok(Some((meta, (s, buffer)))), + None => Ok(None), + } + }) + .boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { @@ -1138,21 +1136,14 @@ mod tests { let store = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - // `list` must fail - match store.list(None).await { - Err(_) => { - // ok, error found - } - Ok(mut 
stream) => { - let mut any_err = false; - while let Some(res) = stream.next().await { - if res.is_err() { - any_err = true; - } - } - assert!(any_err); + let mut stream = store.list(None); + let mut any_err = false; + while let Some(res) = stream.next().await { + if res.is_err() { + any_err = true; } } + assert!(any_err); // `list_with_delimiter assert!(store.list_with_delimiter(None).await.is_err()); @@ -1226,13 +1217,7 @@ mod tests { prefix: Option<&Path>, expected: &[&str], ) { - let result: Vec<_> = integration - .list(prefix) - .await - .unwrap() - .try_collect() - .await - .unwrap(); + let result: Vec<_> = integration.list(prefix).try_collect().await.unwrap(); let mut strings: Vec<_> = result.iter().map(|x| x.location.as_ref()).collect(); strings.sort_unstable(); @@ -1428,8 +1413,7 @@ mod tests { std::fs::write(temp_dir.path().join(filename), "foo").unwrap(); - let list_stream = integration.list(None).await.unwrap(); - let res: Vec<_> = list_stream.try_collect().await.unwrap(); + let res: Vec<_> = integration.list(None).try_collect().await.unwrap(); assert_eq!(res.len(), 1); assert_eq!(res[0].location.as_ref(), filename); diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index f638ed6d7a55..00b330b5eb94 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -228,10 +228,7 @@ impl ObjectStore for InMemory { Ok(()) } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { let root = Path::default(); let prefix = prefix.unwrap_or(&root); @@ -256,7 +253,7 @@ impl ObjectStore for InMemory { }) .collect(); - Ok(futures::stream::iter(values).boxed()) + futures::stream::iter(values).boxed() } /// The memory implementation returns all results, as opposed to the cloud diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index 39585f73b692..3776dec2e872 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -144,24 +144,21 @@ impl ObjectStore for PrefixStore { self.inner.delete(&full_path).await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { let prefix = self.full_path(prefix.unwrap_or(&Path::default())); - let s = self.inner.list(Some(&prefix)).await?; - Ok(s.map_ok(|meta| self.strip_meta(meta)).boxed()) + let s = self.inner.list(Some(&prefix)); + s.map_ok(|meta| self.strip_meta(meta)).boxed() } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { + ) -> BoxStream<'_, Result> { let offset = self.full_path(offset); let prefix = self.full_path(prefix.unwrap_or(&Path::default())); - let s = self.inner.list_with_offset(Some(&prefix), &offset).await?; - Ok(s.map_ok(|meta| self.strip_meta(meta)).boxed()) + let s = self.inner.list_with_offset(Some(&prefix), &offset); + s.map_ok(|meta| self.strip_meta(meta)).boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index 58c476ab4530..f716a11f8a05 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -233,29 +233,30 @@ impl ObjectStore for ThrottledStore { self.inner.delete(location).await } - async fn list( - &self, - prefix: Option<&Path>, - ) -> Result>> { - sleep(self.config().wait_list_per_call).await; - - // need to copy to avoid moving / referencing `self` - let wait_list_per_entry = self.config().wait_list_per_entry; - 
let stream = self.inner.list(prefix).await?; - Ok(throttle_stream(stream, move |_| wait_list_per_entry)) + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + let stream = self.inner.list(prefix); + futures::stream::once(async move { + let wait_list_per_entry = self.config().wait_list_per_entry; + sleep(self.config().wait_list_per_call).await; + throttle_stream(stream, move |_| wait_list_per_entry) + }) + .flatten() + .boxed() } - async fn list_with_offset( + fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> Result>> { - sleep(self.config().wait_list_per_call).await; - - // need to copy to avoid moving / referencing `self` - let wait_list_per_entry = self.config().wait_list_per_entry; - let stream = self.inner.list_with_offset(prefix, offset).await?; - Ok(throttle_stream(stream, move |_| wait_list_per_entry)) + ) -> BoxStream<'_, Result> { + let stream = self.inner.list_with_offset(prefix, offset); + futures::stream::once(async move { + let wait_list_per_entry = self.config().wait_list_per_entry; + sleep(self.config().wait_list_per_call).await; + throttle_stream(stream, move |_| wait_list_per_entry) + }) + .flatten() + .boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { @@ -511,13 +512,7 @@ mod tests { let prefix = Path::from("foo"); // clean up store - let entries: Vec<_> = store - .list(Some(&prefix)) - .await - .unwrap() - .try_collect() - .await - .unwrap(); + let entries: Vec<_> = store.list(Some(&prefix)).try_collect().await.unwrap(); for entry in entries { store.delete(&entry.location).await.unwrap(); @@ -583,8 +578,6 @@ mod tests { let t0 = Instant::now(); store .list(Some(&prefix)) - .await - .unwrap() .try_collect::>() .await .unwrap(); diff --git a/object_store/tests/get_range_file.rs b/object_store/tests/get_range_file.rs index f926e3b07f2a..25c469260675 100644 --- a/object_store/tests/get_range_file.rs +++ b/object_store/tests/get_range_file.rs @@ -75,10 +75,7 @@ impl ObjectStore for MyStore { todo!() } - async fn list( - &self, - _: Option<&Path>, - ) -> object_store::Result>> { + fn list(&self, _: Option<&Path>) -> BoxStream<'_, object_store::Result> { todo!() } From 511ac44cf94ffe3f35e4efd3d1e816a8657a5061 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 17 Oct 2023 22:27:16 +0100 Subject: [PATCH 1282/1411] Fix object_store docs (#4947) --- object_store/src/parse.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/object_store/src/parse.rs b/object_store/src/parse.rs index 1159e9a1af17..2e72a710ac75 100644 --- a/object_store/src/parse.rs +++ b/object_store/src/parse.rs @@ -47,12 +47,12 @@ impl From for super::Error { } } -/// Recognises various URL formats, identifying the relevant [`ObjectStore`](crate::ObjectStore) +/// Recognises various URL formats, identifying the relevant [`ObjectStore`] #[derive(Debug, Eq, PartialEq)] enum ObjectStoreScheme { - /// Url corresponding to [`LocalFileSystem`](crate::local::LocalFileSystem) + /// Url corresponding to [`LocalFileSystem`] Local, - /// Url corresponding to [`InMemory`](crate::memory::InMemory) + /// Url corresponding to [`InMemory`] Memory, /// Url corresponding to [`AmazonS3`](crate::aws::AmazonS3) AmazonS3, From 952cd2efcb787385c6368acc8c582ffc5a7dfd95 Mon Sep 17 00:00:00 2001 From: Andre Martins <38951957+amartins23@users.noreply.github.com> Date: Tue, 17 Oct 2023 22:31:23 +0100 Subject: [PATCH 1283/1411] Expose SubstraitPlan structure in arrow_flight::sql (#4932) (#4933) 
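
As a hedged illustration of what this re-export enables, a small sketch constructing the now-public type; the field names follow the SubstraitPlan message in FlightSql.proto, and the plan bytes and version string below are placeholders:

use arrow_flight::sql::SubstraitPlan;

fn main() {
    // Placeholder bytes standing in for a serialized substrait.Plan message
    let serialized_plan: Vec<u8> = Vec::new();

    let plan = SubstraitPlan {
        plan: serialized_plan.into(),  // serialized substrait.Plan
        version: "0.12.0".to_string(), // Substrait release the plan targets
    };

    println!(
        "plan is {} bytes, substrait version {}",
        plan.plan.len(),
        plan.version
    );
}
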
--- arrow-flight/src/sql/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index 4bb8ce8b36e5..4042ce8efc46 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -93,6 +93,7 @@ pub use gen::SqlSupportedTransactions; pub use gen::SqlSupportedUnions; pub use gen::SqlSupportsConvert; pub use gen::SqlTransactionIsolationLevel; +pub use gen::SubstraitPlan; pub use gen::SupportedSqlGrammar; pub use gen::TicketStatementQuery; pub use gen::UpdateDeleteRules; From a94ccff9deac04ca075f6f05f81a5755af81348e Mon Sep 17 00:00:00 2001 From: fan <75058860+fansehep@users.noreply.github.com> Date: Wed, 18 Oct 2023 17:36:43 +0800 Subject: [PATCH 1284/1411] feat: support parsing for parquet writer option (#4938) * feat: support parsing for parquet writer option Signed-off-by: fan * fix clippy warning Signed-off-by: fan * add tests Signed-off-by: fan * follow reviews Signed-off-by: fan * fix only support lower and uppercase Signed-off-by: fan --------- Signed-off-by: fan --- parquet/src/basic.rs | 185 +++++++++++++++++++++++++++++++++ parquet/src/file/properties.rs | 68 ++++++++++++ 2 files changed, 253 insertions(+) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index cc8d033f42a4..cdad3597ffef 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -18,6 +18,7 @@ //! Contains Rust mappings for Thrift definition. //! Refer to [`parquet.thrift`](https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift) file to see raw definitions. +use std::str::FromStr; use std::{fmt, str}; pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel}; @@ -278,6 +279,29 @@ pub enum Encoding { BYTE_STREAM_SPLIT, } +impl FromStr for Encoding { + type Err = ParquetError; + + fn from_str(s: &str) -> Result { + match s { + "PLAIN" | "plain" => Ok(Encoding::PLAIN), + "PLAIN_DICTIONARY" | "plain_dictionary" => Ok(Encoding::PLAIN_DICTIONARY), + "RLE" | "rle" => Ok(Encoding::RLE), + "BIT_PACKED" | "bit_packed" => Ok(Encoding::BIT_PACKED), + "DELTA_BINARY_PACKED" | "delta_binary_packed" => { + Ok(Encoding::DELTA_BINARY_PACKED) + } + "DELTA_LENGTH_BYTE_ARRAY" | "delta_length_byte_array" => { + Ok(Encoding::DELTA_LENGTH_BYTE_ARRAY) + } + "DELTA_BYTE_ARRAY" | "delta_byte_array" => Ok(Encoding::DELTA_BYTE_ARRAY), + "RLE_DICTIONARY" | "rle_dictionary" => Ok(Encoding::RLE_DICTIONARY), + "BYTE_STREAM_SPLIT" | "byte_stream_split" => Ok(Encoding::BYTE_STREAM_SPLIT), + _ => Err(general_err!("unknown encoding: {}", s)), + } + } +} + // ---------------------------------------------------------------------- // Mirrors `parquet::CompressionCodec` @@ -295,6 +319,90 @@ pub enum Compression { LZ4_RAW, } +fn split_compression_string( + str_setting: &str, +) -> Result<(&str, Option), ParquetError> { + let split_setting = str_setting.split_once('('); + + match split_setting { + Some((codec, level_str)) => { + let level = + &level_str[..level_str.len() - 1] + .parse::() + .map_err(|_| { + ParquetError::General(format!( + "invalid compression level: {}", + level_str + )) + })?; + Ok((codec, Some(*level))) + } + None => Ok((str_setting, None)), + } +} + +fn check_level_is_none(level: &Option) -> Result<(), ParquetError> { + if level.is_some() { + return Err(ParquetError::General("level is not support".to_string())); + } + + Ok(()) +} + +fn require_level(codec: &str, level: Option) -> Result { + level.ok_or(ParquetError::General(format!("{} require level", codec))) +} + +impl FromStr for Compression { + type Err = 
ParquetError; + + fn from_str(s: &str) -> std::result::Result { + let (codec, level) = split_compression_string(s)?; + + let c = match codec { + "UNCOMPRESSED" | "uncompressed" => { + check_level_is_none(&level)?; + Compression::UNCOMPRESSED + } + "SNAPPY" | "snappy" => { + check_level_is_none(&level)?; + Compression::SNAPPY + } + "GZIP" | "gzip" => { + let level = require_level(codec, level)?; + Compression::GZIP(GzipLevel::try_new(level)?) + } + "LZO" | "lzo" => { + check_level_is_none(&level)?; + Compression::LZO + } + "BROTLI" | "brotli" => { + let level = require_level(codec, level)?; + Compression::BROTLI(BrotliLevel::try_new(level)?) + } + "LZ4" | "lz4" => { + check_level_is_none(&level)?; + Compression::LZ4 + } + "ZSTD" | "zstd" => { + let level = require_level(codec, level)?; + Compression::ZSTD(ZstdLevel::try_new(level as i32)?) + } + "LZ4_RAW" | "lz4_raw" => { + check_level_is_none(&level)?; + Compression::LZ4_RAW + } + _ => { + return Err(ParquetError::General(format!( + "unsupport compression {codec}" + ))); + } + }; + + Ok(c) + } +} + // ---------------------------------------------------------------------- // Mirrors `parquet::PageType` @@ -2130,4 +2238,81 @@ mod tests { ); assert_eq!(ColumnOrder::UNDEFINED.sort_order(), SortOrder::SIGNED); } + + #[test] + fn test_parse_encoding() { + let mut encoding: Encoding = "PLAIN".parse().unwrap(); + assert_eq!(encoding, Encoding::PLAIN); + encoding = "PLAIN_DICTIONARY".parse().unwrap(); + assert_eq!(encoding, Encoding::PLAIN_DICTIONARY); + encoding = "RLE".parse().unwrap(); + assert_eq!(encoding, Encoding::RLE); + encoding = "BIT_PACKED".parse().unwrap(); + assert_eq!(encoding, Encoding::BIT_PACKED); + encoding = "DELTA_BINARY_PACKED".parse().unwrap(); + assert_eq!(encoding, Encoding::DELTA_BINARY_PACKED); + encoding = "DELTA_LENGTH_BYTE_ARRAY".parse().unwrap(); + assert_eq!(encoding, Encoding::DELTA_LENGTH_BYTE_ARRAY); + encoding = "DELTA_BYTE_ARRAY".parse().unwrap(); + assert_eq!(encoding, Encoding::DELTA_BYTE_ARRAY); + encoding = "RLE_DICTIONARY".parse().unwrap(); + assert_eq!(encoding, Encoding::RLE_DICTIONARY); + encoding = "BYTE_STREAM_SPLIT".parse().unwrap(); + assert_eq!(encoding, Encoding::BYTE_STREAM_SPLIT); + + // test lowercase + encoding = "byte_stream_split".parse().unwrap(); + assert_eq!(encoding, Encoding::BYTE_STREAM_SPLIT); + + // test unknown string + match "plain_xxx".parse::() { + Ok(e) => { + panic!("Should not be able to parse {:?}", e); + } + Err(e) => { + assert_eq!(e.to_string(), "Parquet error: unknown encoding: plain_xxx"); + } + } + } + + #[test] + fn test_parse_compression() { + let mut compress: Compression = "snappy".parse().unwrap(); + assert_eq!(compress, Compression::SNAPPY); + compress = "lzo".parse().unwrap(); + assert_eq!(compress, Compression::LZO); + compress = "zstd(3)".parse().unwrap(); + assert_eq!(compress, Compression::ZSTD(ZstdLevel::try_new(3).unwrap())); + compress = "LZ4_RAW".parse().unwrap(); + assert_eq!(compress, Compression::LZ4_RAW); + compress = "uncompressed".parse().unwrap(); + assert_eq!(compress, Compression::UNCOMPRESSED); + compress = "snappy".parse().unwrap(); + assert_eq!(compress, Compression::SNAPPY); + compress = "gzip(9)".parse().unwrap(); + assert_eq!(compress, Compression::GZIP(GzipLevel::try_new(9).unwrap())); + compress = "lzo".parse().unwrap(); + assert_eq!(compress, Compression::LZO); + compress = "brotli(3)".parse().unwrap(); + assert_eq!( + compress, + Compression::BROTLI(BrotliLevel::try_new(3).unwrap()) + ); + compress = "lz4".parse().unwrap(); + 
assert_eq!(compress, Compression::LZ4); + + // test unknown compression + let mut err = "plain_xxx".parse::().unwrap_err(); + assert_eq!( + err.to_string(), + "Parquet error: unknown encoding: plain_xxx" + ); + + // test invalid compress level + err = "gzip(-10)".parse::().unwrap_err(); + assert_eq!( + err.to_string(), + "Parquet error: unknown encoding: gzip(-10)" + ); + } } diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index c83fea3f9b92..93b034cf4f60 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -16,6 +16,7 @@ // under the License. //! Configuration via [`WriterProperties`] and [`ReaderProperties`] +use std::str::FromStr; use std::{collections::HashMap, sync::Arc}; use crate::basic::{Compression, Encoding}; @@ -72,6 +73,18 @@ impl WriterVersion { } } +impl FromStr for WriterVersion { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "PARQUET_1_0" | "parquet_1_0" => Ok(WriterVersion::PARQUET_1_0), + "PARQUET_2_0" | "parquet_2_0" => Ok(WriterVersion::PARQUET_2_0), + _ => Err(format!("Invalid writer version: {}", s)), + } + } +} + /// Reference counted writer properties. pub type WriterPropertiesPtr = Arc; @@ -655,6 +668,19 @@ pub enum EnabledStatistics { Page, } +impl FromStr for EnabledStatistics { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "NONE" | "none" => Ok(EnabledStatistics::None), + "CHUNK" | "chunk" => Ok(EnabledStatistics::Chunk), + "PAGE" | "page" => Ok(EnabledStatistics::Page), + _ => Err(format!("Invalid statistics arg: {}", s)), + } + } +} + impl Default for EnabledStatistics { fn default() -> Self { DEFAULT_STATISTICS_ENABLED @@ -1182,4 +1208,46 @@ mod tests { assert_eq!(props.codec_options(), &codec_options); } + + #[test] + fn test_parse_writerversion() { + let mut writer_version = "PARQUET_1_0".parse::().unwrap(); + assert_eq!(writer_version, WriterVersion::PARQUET_1_0); + writer_version = "PARQUET_2_0".parse::().unwrap(); + assert_eq!(writer_version, WriterVersion::PARQUET_2_0); + + // test lowercase + writer_version = "parquet_1_0".parse::().unwrap(); + assert_eq!(writer_version, WriterVersion::PARQUET_1_0); + + // test invalid version + match "PARQUET_-1_0".parse::() { + Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"), + Err(e) => { + assert_eq!(e, "Invalid writer version: PARQUET_-1_0"); + } + } + } + + #[test] + fn test_parse_enabledstatistics() { + let mut enabled_statistics = "NONE".parse::().unwrap(); + assert_eq!(enabled_statistics, EnabledStatistics::None); + enabled_statistics = "CHUNK".parse::().unwrap(); + assert_eq!(enabled_statistics, EnabledStatistics::Chunk); + enabled_statistics = "PAGE".parse::().unwrap(); + assert_eq!(enabled_statistics, EnabledStatistics::Page); + + // test lowercase + enabled_statistics = "none".parse::().unwrap(); + assert_eq!(enabled_statistics, EnabledStatistics::None); + + //test invalid statistics + match "ChunkAndPage".parse::() { + Ok(_) => panic!("Should not be able to parse ChunkAndPage"), + Err(e) => { + assert_eq!(e, "Invalid statistics arg: ChunkAndPage"); + } + } + } } From 4964d844313d5e62cf102616d26864dca6fe286e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 18 Oct 2023 14:18:52 +0100 Subject: [PATCH 1285/1411] Add `ReaderBuilder::with_header` for csv reader (#4949) * Add ReaderBuilder::with_header * Update test --- arrow-csv/examples/csv_calculation.rs | 2 +- arrow-csv/src/reader/mod.rs | 48 ++++++++++++++++----------- 
arrow/benches/csv_reader.rs | 2 +- parquet/src/bin/parquet-fromcsv.rs | 6 ++-- 4 files changed, 33 insertions(+), 25 deletions(-) diff --git a/arrow-csv/examples/csv_calculation.rs b/arrow-csv/examples/csv_calculation.rs index 12aaadde4415..6ce963e2b012 100644 --- a/arrow-csv/examples/csv_calculation.rs +++ b/arrow-csv/examples/csv_calculation.rs @@ -33,7 +33,7 @@ fn main() { Field::new("c4", DataType::Boolean, true), ]); let mut reader = ReaderBuilder::new(Arc::new(csv_schema)) - .has_header(true) + .with_header(true) .build(file) .unwrap(); diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 1106b16bc46f..a194b35ffa46 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -225,7 +225,7 @@ impl InferredDataType { /// The format specification for the CSV file #[derive(Debug, Clone, Default)] pub struct Format { - has_header: bool, + header: bool, delimiter: Option, escape: Option, quote: Option, @@ -235,7 +235,7 @@ pub struct Format { impl Format { pub fn with_header(mut self, has_header: bool) -> Self { - self.has_header = has_header; + self.header = has_header; self } @@ -280,7 +280,7 @@ impl Format { // get or create header names // when has_header is false, creates default column names with column_ prefix - let headers: Vec = if self.has_header { + let headers: Vec = if self.header { let headers = &csv_reader.headers().map_err(map_csv_error)?.clone(); headers.iter().map(|s| s.to_string()).collect() } else { @@ -331,7 +331,7 @@ impl Format { /// Build a [`csv::Reader`] for this [`Format`] fn build_reader(&self, reader: R) -> csv::Reader { let mut builder = csv::ReaderBuilder::new(); - builder.has_headers(self.has_header); + builder.has_headers(self.header); if let Some(c) = self.delimiter { builder.delimiter(c); @@ -403,7 +403,7 @@ pub fn infer_reader_schema( ) -> Result<(Schema, usize), ArrowError> { let format = Format { delimiter: Some(delimiter), - has_header, + header: has_header, ..Default::default() }; format.infer_schema(reader, max_read_records) @@ -425,7 +425,7 @@ pub fn infer_schema_from_files( let mut records_to_read = max_read_records.unwrap_or(usize::MAX); let format = Format { delimiter: Some(delimiter), - has_header, + header: has_header, ..Default::default() }; @@ -1095,8 +1095,16 @@ impl ReaderBuilder { } /// Set whether the CSV file has headers + #[deprecated(note = "Use with_header")] + #[doc(hidden)] pub fn has_header(mut self, has_header: bool) -> Self { - self.format.has_header = has_header; + self.format.header = has_header; + self + } + + /// Set whether the CSV file has a header + pub fn with_header(mut self, has_header: bool) -> Self { + self.format.header = has_header; self } @@ -1176,7 +1184,7 @@ impl ReaderBuilder { let delimiter = self.format.build_parser(); let record_decoder = RecordDecoder::new(delimiter, self.schema.fields().len()); - let header = self.format.has_header as usize; + let header = self.format.header as usize; let (start, end) = match self.bounds { Some((start, end)) => (start + header, end + header), @@ -1317,7 +1325,7 @@ mod tests { .chain(Cursor::new("\n".to_string())) .chain(file_without_headers); let mut csv = ReaderBuilder::new(Arc::new(schema)) - .has_header(true) + .with_header(true) .build(both_files) .unwrap(); let batch = csv.next().unwrap().unwrap(); @@ -1335,7 +1343,7 @@ mod tests { .unwrap(); file.rewind().unwrap(); - let builder = ReaderBuilder::new(Arc::new(schema)).has_header(true); + let builder = ReaderBuilder::new(Arc::new(schema)).with_header(true); let mut csv = 
builder.build(file).unwrap(); let expected_schema = Schema::new(vec![ @@ -1505,7 +1513,7 @@ mod tests { let file = File::open("test/data/null_test.csv").unwrap(); let mut csv = ReaderBuilder::new(schema) - .has_header(true) + .with_header(true) .build(file) .unwrap(); @@ -1530,7 +1538,7 @@ mod tests { let file = File::open("test/data/init_null_test.csv").unwrap(); let mut csv = ReaderBuilder::new(schema) - .has_header(true) + .with_header(true) .build(file) .unwrap(); @@ -1588,7 +1596,7 @@ mod tests { let null_regex = Regex::new("^nil$").unwrap(); let mut csv = ReaderBuilder::new(schema) - .has_header(true) + .with_header(true) .with_null_regex(null_regex) .build(file) .unwrap(); @@ -1710,7 +1718,7 @@ mod tests { ]); let builder = ReaderBuilder::new(Arc::new(schema)) - .has_header(true) + .with_header(true) .with_delimiter(b'|') .with_batch_size(512) .with_projection(vec![0, 1, 2, 3]); @@ -2037,7 +2045,7 @@ mod tests { Field::new("text2", DataType::Utf8, false), ]); let builder = ReaderBuilder::new(Arc::new(schema)) - .has_header(false) + .with_header(false) .with_quote(b'~'); // default is ", change to ~ let mut csv_text = Vec::new(); @@ -2069,7 +2077,7 @@ mod tests { Field::new("text2", DataType::Utf8, false), ]); let builder = ReaderBuilder::new(Arc::new(schema)) - .has_header(false) + .with_header(false) .with_escape(b'\\'); // default is None, change to \ let mut csv_text = Vec::new(); @@ -2101,7 +2109,7 @@ mod tests { Field::new("text2", DataType::Utf8, false), ]); let builder = ReaderBuilder::new(Arc::new(schema)) - .has_header(false) + .with_header(false) .with_terminator(b'\n'); // default is CRLF, change to LF let mut csv_text = Vec::new(); @@ -2143,7 +2151,7 @@ mod tests { ])); for (idx, (bounds, has_header, expected)) in tests.into_iter().enumerate() { - let mut reader = ReaderBuilder::new(schema.clone()).has_header(has_header); + let mut reader = ReaderBuilder::new(schema.clone()).with_header(has_header); if let Some((start, end)) = bounds { reader = reader.with_bounds(start, end); } @@ -2208,7 +2216,7 @@ mod tests { for capacity in [1, 3, 7, 100] { let reader = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) - .has_header(has_header) + .with_header(has_header) .build(File::open(path).unwrap()) .unwrap(); @@ -2226,7 +2234,7 @@ mod tests { let reader = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) - .has_header(has_header) + .with_header(has_header) .build_buffered(buffered) .unwrap(); diff --git a/arrow/benches/csv_reader.rs b/arrow/benches/csv_reader.rs index 4c3f663bf741..5a91dfe0a6ff 100644 --- a/arrow/benches/csv_reader.rs +++ b/arrow/benches/csv_reader.rs @@ -45,7 +45,7 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec) { let cursor = Cursor::new(buf.as_slice()); let reader = csv::ReaderBuilder::new(batch.schema()) .with_batch_size(batch_size) - .has_header(true) + .with_header(true) .build_buffered(cursor) .unwrap(); diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index 548bbdbfb8f1..1f5d0a62bbfa 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -321,7 +321,7 @@ fn configure_reader_builder(args: &Args, arrow_schema: Arc) -> ReaderBui let mut builder = ReaderBuilder::new(arrow_schema) .with_batch_size(args.batch_size) - .has_header(args.has_header) + .with_header(args.has_header) .with_delimiter(args.get_delimiter()); builder = configure_reader( @@ -606,7 +606,7 @@ mod tests { let reader_builder = configure_reader_builder(&args, arrow_schema); let 
builder_debug = format!("{reader_builder:?}"); - assert_debug_text(&builder_debug, "has_header", "false"); + assert_debug_text(&builder_debug, "header", "false"); assert_debug_text(&builder_debug, "delimiter", "Some(44)"); assert_debug_text(&builder_debug, "quote", "Some(34)"); assert_debug_text(&builder_debug, "terminator", "None"); @@ -641,7 +641,7 @@ mod tests { ])); let reader_builder = configure_reader_builder(&args, arrow_schema); let builder_debug = format!("{reader_builder:?}"); - assert_debug_text(&builder_debug, "has_header", "true"); + assert_debug_text(&builder_debug, "header", "true"); assert_debug_text(&builder_debug, "delimiter", "Some(9)"); assert_debug_text(&builder_debug, "quote", "None"); assert_debug_text(&builder_debug, "terminator", "Some(10)"); From 6e332b8f570d53bdc906159a97b2c5f95db670e5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 18 Oct 2023 14:20:57 +0100 Subject: [PATCH 1286/1411] Prepare arrow 48.0.0 (#4948) --- CHANGELOG-old.md | 69 ++++++++++++++++++ CHANGELOG.md | 117 +++++++++++++++++-------------- Cargo.toml | 32 ++++----- dev/release/update_change_log.sh | 4 +- 4 files changed, 151 insertions(+), 71 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index bac7847bdac5..cde9b8f3b521 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,75 @@ # Historical Changelog +## [47.0.0](https://github.com/apache/arrow-rs/tree/47.0.0) (2023-09-19) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/46.0.0...47.0.0) + +**Breaking changes:** + +- Make FixedSizeBinaryArray value\_data return a reference [\#4820](https://github.com/apache/arrow-rs/issues/4820) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Update prost to v0.12.1 [\#4825](https://github.com/apache/arrow-rs/pull/4825) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- feat: FixedSizeBinaryArray::value\_data return reference [\#4821](https://github.com/apache/arrow-rs/pull/4821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Stateless Row Encoding / Don't Preserve Dictionaries in `RowConverter` \(\#4811\) [\#4819](https://github.com/apache/arrow-rs/pull/4819) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- fix: entries field is non-nullable [\#4808](https://github.com/apache/arrow-rs/pull/4808) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Fix flight sql do put handling, add bind parameter support to FlightSQL cli client [\#4797](https://github.com/apache/arrow-rs/pull/4797) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([suremarc](https://github.com/suremarc)) +- Remove unused dyn\_cmp\_dict feature [\#4766](https://github.com/apache/arrow-rs/pull/4766) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add underlying `std::io::Error` to `IoError` and add `IpcError` variant [\#4726](https://github.com/apache/arrow-rs/pull/4726) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] 
([alexandreyc](https://github.com/alexandreyc)) + +**Implemented enhancements:** + +- Row Format Adapative Block Size [\#4812](https://github.com/apache/arrow-rs/issues/4812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Stateless Row Conversion [\#4811](https://github.com/apache/arrow-rs/issues/4811) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Add option to specify custom null values for CSV reader [\#4794](https://github.com/apache/arrow-rs/issues/4794) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet::record::RowIter cannot be customized with batch\_size and defaults to 1024 [\#4782](https://github.com/apache/arrow-rs/issues/4782) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `DynScalar` abstraction \(something that makes it easy to create scalar `Datum`s\) [\#4781](https://github.com/apache/arrow-rs/issues/4781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `Datum` is not exported as part of `arrow` \(it is only exported in `arrow_array`\) [\#4780](https://github.com/apache/arrow-rs/issues/4780) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `Scalar` is not exported as part of `arrow` \(it is only exported in `arrow_array`\) [\#4779](https://github.com/apache/arrow-rs/issues/4779) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support IntoPyArrow for impl RecordBatchReader [\#4730](https://github.com/apache/arrow-rs/issues/4730) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Datum Based String Kernels [\#4595](https://github.com/apache/arrow-rs/issues/4595) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Fixed bugs:** + +- MapArray::new\_from\_strings creates nullable entries field [\#4807](https://github.com/apache/arrow-rs/issues/4807) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- pyarrow module can't roundtrip tensor arrays [\#4805](https://github.com/apache/arrow-rs/issues/4805) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `concat_batches` errors with "schema mismatch" error when only metadata differs [\#4799](https://github.com/apache/arrow-rs/issues/4799) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- panic in `cmp` kernels with DictionaryArrays: `Option::unwrap()` on a `None` value' [\#4788](https://github.com/apache/arrow-rs/issues/4788) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- stream ffi panics if schema metadata values aren't valid utf8 [\#4750](https://github.com/apache/arrow-rs/issues/4750) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Regression: Incorrect Sorting of `*ListArray` in 46.0.0 [\#4746](https://github.com/apache/arrow-rs/issues/4746) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Row is no longer comparable after reuse [\#4741](https://github.com/apache/arrow-rs/issues/4741) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- DoPut FlightSQL handler inadvertently consumes schema at start of Request\\> [\#4658](https://github.com/apache/arrow-rs/issues/4658) +- Return error when converting schema [\#4752](https://github.com/apache/arrow-rs/pull/4752) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Implement PyArrowType for `Box` [\#4751](https://github.com/apache/arrow-rs/pull/4751) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) + +**Closed issues:** + +- Building arrow-rust for target wasm32-wasi falied to compile packed\_simd\_2 [\#4717](https://github.com/apache/arrow-rs/issues/4717) + +**Merged pull requests:** + +- Respect FormatOption::nulls for NullArray [\#4836](https://github.com/apache/arrow-rs/pull/4836) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix merge\_dictionary\_values in selection kernels [\#4833](https://github.com/apache/arrow-rs/pull/4833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix like scalar null [\#4832](https://github.com/apache/arrow-rs/pull/4832) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- More chrono deprecations [\#4822](https://github.com/apache/arrow-rs/pull/4822) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Adaptive Row Block Size \(\#4812\) [\#4818](https://github.com/apache/arrow-rs/pull/4818) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.66 to =1.0.67 [\#4816](https://github.com/apache/arrow-rs/pull/4816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Do not check schema for equality in concat\_batches [\#4815](https://github.com/apache/arrow-rs/pull/4815) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- fix: export record batch through stream [\#4806](https://github.com/apache/arrow-rs/pull/4806) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Improve CSV Reader Benchmark Coverage of Small Primitives [\#4803](https://github.com/apache/arrow-rs/pull/4803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- csv: Add option to specify custom null values [\#4795](https://github.com/apache/arrow-rs/pull/4795) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vrongmeal](https://github.com/vrongmeal)) +- Expand docstring and add example to `Scalar` [\#4793](https://github.com/apache/arrow-rs/pull/4793) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Re-export array crate root \(\#4780\) \(\#4779\) [\#4791](https://github.com/apache/arrow-rs/pull/4791) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix DictionaryArray::normalized\_keys \(\#4788\) [\#4789](https://github.com/apache/arrow-rs/pull/4789) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Allow custom tree builder for parquet::record::RowIter [\#4783](https://github.com/apache/arrow-rs/pull/4783) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([YuraKotov](https://github.com/YuraKotov)) +- Bump actions/checkout from 3 to 4 [\#4767](https://github.com/apache/arrow-rs/pull/4767) ([dependabot[bot]](https://github.com/apps/dependabot)) +- fix: avoid panic if offset index not exists. 
[\#4761](https://github.com/apache/arrow-rs/pull/4761) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([RinChanNOWWW](https://github.com/RinChanNOWWW)) +- Relax constraints on PyArrowType [\#4757](https://github.com/apache/arrow-rs/pull/4757) ([tustvold](https://github.com/tustvold)) +- Chrono deprecations [\#4748](https://github.com/apache/arrow-rs/pull/4748) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix List Sorting, Revert Removal of Rank Kernels [\#4747](https://github.com/apache/arrow-rs/pull/4747) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Clear row buffer before reuse [\#4742](https://github.com/apache/arrow-rs/pull/4742) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) +- Datum based like kernels \(\#4595\) [\#4732](https://github.com/apache/arrow-rs/pull/4732) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- feat: expose DoGet response headers & trailers [\#4727](https://github.com/apache/arrow-rs/pull/4727) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Cleanup length and bit\_length kernels [\#4718](https://github.com/apache/arrow-rs/pull/4718) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) ## [46.0.0](https://github.com/apache/arrow-rs/tree/46.0.0) (2023-08-21) [Full Changelog](https://github.com/apache/arrow-rs/compare/45.0.0...46.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f97055a9c0d..8c5351708c0b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,75 +19,86 @@ # Changelog -## [47.0.0](https://github.com/apache/arrow-rs/tree/47.0.0) (2023-09-19) +## [48.0.0](https://github.com/apache/arrow-rs/tree/48.0.0) (2023-10-18) -[Full Changelog](https://github.com/apache/arrow-rs/compare/46.0.0...47.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/47.0.0...48.0.0) **Breaking changes:** -- Make FixedSizeBinaryArray value\_data return a reference [\#4820](https://github.com/apache/arrow-rs/issues/4820) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Update prost to v0.12.1 [\#4825](https://github.com/apache/arrow-rs/pull/4825) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- feat: FixedSizeBinaryArray::value\_data return reference [\#4821](https://github.com/apache/arrow-rs/pull/4821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) -- Stateless Row Encoding / Don't Preserve Dictionaries in `RowConverter` \(\#4811\) [\#4819](https://github.com/apache/arrow-rs/pull/4819) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- fix: entries field is non-nullable [\#4808](https://github.com/apache/arrow-rs/pull/4808) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) -- Fix flight sql do put handling, add bind parameter support to FlightSQL cli client [\#4797](https://github.com/apache/arrow-rs/pull/4797) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([suremarc](https://github.com/suremarc)) -- Remove unused dyn\_cmp\_dict feature [\#4766](https://github.com/apache/arrow-rs/pull/4766) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add underlying `std::io::Error` to `IoError` and add `IpcError` variant [\#4726](https://github.com/apache/arrow-rs/pull/4726) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alexandreyc](https://github.com/alexandreyc)) +- Evaluate null\_regex for string type in csv \(now such values will be parsed as `Null` rather than `""`\) [\#4942](https://github.com/apache/arrow-rs/pull/4942) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([haohuaijin](https://github.com/haohuaijin)) +- fix\(csv\)!: infer null for empty column. [\#4910](https://github.com/apache/arrow-rs/pull/4910) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) +- feat: log headers/trailers in flight CLI \(+ minor fixes\) [\#4898](https://github.com/apache/arrow-rs/pull/4898) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- fix\(arrow-json\)!: include null fields in schema inference with a type of Null [\#4894](https://github.com/apache/arrow-rs/pull/4894) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) +- Mark OnCloseRowGroup Send [\#4893](https://github.com/apache/arrow-rs/pull/4893) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([devinjdangelo](https://github.com/devinjdangelo)) +- Specialize Thrift Decoding \(~40% Faster\) \(\#4891\) [\#4892](https://github.com/apache/arrow-rs/pull/4892) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Make ArrowRowGroupWriter Public and SerializedRowGroupWriter Send [\#4850](https://github.com/apache/arrow-rs/pull/4850) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([devinjdangelo](https://github.com/devinjdangelo)) **Implemented enhancements:** -- Row Format Adapative Block Size [\#4812](https://github.com/apache/arrow-rs/issues/4812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Stateless Row Conversion [\#4811](https://github.com/apache/arrow-rs/issues/4811) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Add option to specify custom null values for CSV reader [\#4794](https://github.com/apache/arrow-rs/issues/4794) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- parquet::record::RowIter cannot be customized with batch\_size and defaults to 1024 [\#4782](https://github.com/apache/arrow-rs/issues/4782) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `DynScalar` abstraction \(something that makes it easy to create scalar `Datum`s\) [\#4781](https://github.com/apache/arrow-rs/issues/4781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `Datum` is not exported as part of `arrow` \(it is only exported in `arrow_array`\) [\#4780](https://github.com/apache/arrow-rs/issues/4780) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `Scalar` is not exported as 
part of `arrow` \(it is only exported in `arrow_array`\) [\#4779](https://github.com/apache/arrow-rs/issues/4779) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support IntoPyArrow for impl RecordBatchReader [\#4730](https://github.com/apache/arrow-rs/issues/4730) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Datum Based String Kernels [\#4595](https://github.com/apache/arrow-rs/issues/4595) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Allow schema fields to merge with `Null` datatype [\#4901](https://github.com/apache/arrow-rs/issues/4901) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add option to FlightDataEncoder to always send dictionaries [\#4895](https://github.com/apache/arrow-rs/issues/4895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Rework Thrift Encoding / Decoding of Parquet Metadata [\#4891](https://github.com/apache/arrow-rs/issues/4891) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Plans for supporting Extension Array to support Fixed shape tensor Array [\#4890](https://github.com/apache/arrow-rs/issues/4890) +- Implement Take for UnionArray [\#4882](https://github.com/apache/arrow-rs/issues/4882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Check precision overflow for casting floating to decimal [\#4865](https://github.com/apache/arrow-rs/issues/4865) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Replace lexical [\#4774](https://github.com/apache/arrow-rs/issues/4774) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add read access to settings in `csv::WriterBuilder` [\#4735](https://github.com/apache/arrow-rs/issues/4735) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve the performance of "DictionaryValue" row encoding [\#4712](https://github.com/apache/arrow-rs/issues/4712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Fixed bugs:** -- MapArray::new\_from\_strings creates nullable entries field [\#4807](https://github.com/apache/arrow-rs/issues/4807) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- pyarrow module can't roundtrip tensor arrays [\#4805](https://github.com/apache/arrow-rs/issues/4805) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `concat_batches` errors with "schema mismatch" error when only metadata differs [\#4799](https://github.com/apache/arrow-rs/issues/4799) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- panic in `cmp` kernels with DictionaryArrays: `Option::unwrap()` on a `None` value' [\#4788](https://github.com/apache/arrow-rs/issues/4788) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- stream ffi panics if schema metadata values aren't valid utf8 [\#4750](https://github.com/apache/arrow-rs/issues/4750) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Regression: Incorrect Sorting of `*ListArray` in 46.0.0 [\#4746](https://github.com/apache/arrow-rs/issues/4746) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Row is no longer comparable after reuse [\#4741](https://github.com/apache/arrow-rs/issues/4741) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- DoPut FlightSQL handler inadvertently consumes schema at start of Request\\> 
[\#4658](https://github.com/apache/arrow-rs/issues/4658) -- Return error when converting schema [\#4752](https://github.com/apache/arrow-rs/pull/4752) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) -- Implement PyArrowType for `Box` [\#4751](https://github.com/apache/arrow-rs/pull/4751) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Should we make blank values and empty string to `None` in csv? [\#4939](https://github.com/apache/arrow-rs/issues/4939) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[FlightSQL\] SubstraitPlan structure is not exported [\#4932](https://github.com/apache/arrow-rs/issues/4932) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Loading page index breaks skipping of pages with nested types [\#4921](https://github.com/apache/arrow-rs/issues/4921) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- CSV schema inference assumes `Utf8` for empty columns [\#4903](https://github.com/apache/arrow-rs/issues/4903) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet: Field Ids are not read from a Parquet file without serialized arrow schema [\#4877](https://github.com/apache/arrow-rs/issues/4877) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- make\_primitive\_scalar function loses DataType Internal information [\#4851](https://github.com/apache/arrow-rs/issues/4851) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- StructBuilder doesn't handle nulls correctly for empty structs [\#4842](https://github.com/apache/arrow-rs/issues/4842) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `NullArray::is_null()` returns `false` incorrectly [\#4835](https://github.com/apache/arrow-rs/issues/4835) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- cast\_string\_to\_decimal should check precision overflow [\#4829](https://github.com/apache/arrow-rs/issues/4829) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Null fields are omitted by `infer_json_schema_from_seekable` [\#4814](https://github.com/apache/arrow-rs/issues/4814) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Closed issues:** -- Building arrow-rust for target wasm32-wasi falied to compile packed\_simd\_2 [\#4717](https://github.com/apache/arrow-rs/issues/4717) +- Support for reading JSON Array to Arrow [\#4905](https://github.com/apache/arrow-rs/issues/4905) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Respect FormatOption::nulls for NullArray [\#4836](https://github.com/apache/arrow-rs/pull/4836) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix merge\_dictionary\_values in selection kernels [\#4833](https://github.com/apache/arrow-rs/pull/4833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix like scalar null [\#4832](https://github.com/apache/arrow-rs/pull/4832) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- More chrono deprecations [\#4822](https://github.com/apache/arrow-rs/pull/4822) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Adaptive Row Block Size \(\#4812\) [\#4818](https://github.com/apache/arrow-rs/pull/4818) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update proc-macro2 requirement from =1.0.66 to =1.0.67 [\#4816](https://github.com/apache/arrow-rs/pull/4816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Do not check schema for equality in concat\_batches [\#4815](https://github.com/apache/arrow-rs/pull/4815) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- fix: export record batch through stream [\#4806](https://github.com/apache/arrow-rs/pull/4806) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) -- Improve CSV Reader Benchmark Coverage of Small Primitives [\#4803](https://github.com/apache/arrow-rs/pull/4803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- csv: Add option to specify custom null values [\#4795](https://github.com/apache/arrow-rs/pull/4795) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vrongmeal](https://github.com/vrongmeal)) -- Expand docstring and add example to `Scalar` [\#4793](https://github.com/apache/arrow-rs/pull/4793) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Re-export array crate root \(\#4780\) \(\#4779\) [\#4791](https://github.com/apache/arrow-rs/pull/4791) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix DictionaryArray::normalized\_keys \(\#4788\) [\#4789](https://github.com/apache/arrow-rs/pull/4789) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Allow custom tree builder for parquet::record::RowIter [\#4783](https://github.com/apache/arrow-rs/pull/4783) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([YuraKotov](https://github.com/YuraKotov)) -- Bump actions/checkout from 3 to 4 [\#4767](https://github.com/apache/arrow-rs/pull/4767) ([dependabot[bot]](https://github.com/apps/dependabot)) -- fix: avoid panic if offset index not exists. 
[\#4761](https://github.com/apache/arrow-rs/pull/4761) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([RinChanNOWWW](https://github.com/RinChanNOWWW)) -- Relax constraints on PyArrowType [\#4757](https://github.com/apache/arrow-rs/pull/4757) ([tustvold](https://github.com/tustvold)) -- Chrono deprecations [\#4748](https://github.com/apache/arrow-rs/pull/4748) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix List Sorting, Revert Removal of Rank Kernels [\#4747](https://github.com/apache/arrow-rs/pull/4747) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Clear row buffer before reuse [\#4742](https://github.com/apache/arrow-rs/pull/4742) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) -- Datum based like kernels \(\#4595\) [\#4732](https://github.com/apache/arrow-rs/pull/4732) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- feat: expose DoGet response headers & trailers [\#4727](https://github.com/apache/arrow-rs/pull/4727) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) -- Cleanup length and bit\_length kernels [\#4718](https://github.com/apache/arrow-rs/pull/4718) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Assume Pages Delimit Records When Offset Index Loaded \(\#4921\) [\#4943](https://github.com/apache/arrow-rs/pull/4943) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update pyo3 requirement from 0.19 to 0.20 [\#4941](https://github.com/apache/arrow-rs/pull/4941) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Add `FileWriter` schema getter [\#4940](https://github.com/apache/arrow-rs/pull/4940) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([haixuanTao](https://github.com/haixuanTao)) +- feat: support parsing for parquet writer option [\#4938](https://github.com/apache/arrow-rs/pull/4938) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([fansehep](https://github.com/fansehep)) +- Export `SubstraitPlan` structure in arrow\_flight::sql \(\#4932\) [\#4933](https://github.com/apache/arrow-rs/pull/4933) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([amartins23](https://github.com/amartins23)) +- Update zstd requirement from 0.12.0 to 0.13.0 [\#4923](https://github.com/apache/arrow-rs/pull/4923) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat: add method for async read bloom filter [\#4917](https://github.com/apache/arrow-rs/pull/4917) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hengfeiyang](https://github.com/hengfeiyang)) +- Minor: Clarify rationale for `FlightDataEncoder` API, add examples [\#4916](https://github.com/apache/arrow-rs/pull/4916) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- 
Update regex-syntax requirement from 0.7.1 to 0.8.0 [\#4914](https://github.com/apache/arrow-rs/pull/4914) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat: document & streamline flight SQL CLI [\#4912](https://github.com/apache/arrow-rs/pull/4912) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Support Arbitrary JSON values in JSON Reader \(\#4905\) [\#4911](https://github.com/apache/arrow-rs/pull/4911) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup CSV WriterBuilder, Default to AutoSI Second Precision \(\#4735\) [\#4909](https://github.com/apache/arrow-rs/pull/4909) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.68 to =1.0.69 [\#4907](https://github.com/apache/arrow-rs/pull/4907) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- chore: add csv example [\#4904](https://github.com/apache/arrow-rs/pull/4904) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fansehep](https://github.com/fansehep)) +- feat\(schema\): allow null fields to be merged with other datatypes [\#4902](https://github.com/apache/arrow-rs/pull/4902) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) +- Update proc-macro2 requirement from =1.0.67 to =1.0.68 [\#4900](https://github.com/apache/arrow-rs/pull/4900) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add option to `FlightDataEncoder` to always resend batch dictionaries [\#4896](https://github.com/apache/arrow-rs/pull/4896) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- Fix integration tests [\#4889](https://github.com/apache/arrow-rs/pull/4889) ([tustvold](https://github.com/tustvold)) +- Support Parsing Avro File Headers [\#4888](https://github.com/apache/arrow-rs/pull/4888) ([tustvold](https://github.com/tustvold)) +- Support parquet bloom filter length [\#4885](https://github.com/apache/arrow-rs/pull/4885) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([letian-jiang](https://github.com/letian-jiang)) +- Replace lz4 with lz4\_flex Allowing Compilation for WASM [\#4884](https://github.com/apache/arrow-rs/pull/4884) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement Take for UnionArray [\#4883](https://github.com/apache/arrow-rs/pull/4883) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Update tonic-build requirement from =0.10.1 to =0.10.2 [\#4881](https://github.com/apache/arrow-rs/pull/4881) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- parquet: Read field IDs from 
Parquet Schema [\#4878](https://github.com/apache/arrow-rs/pull/4878) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Samrose-Ahmed](https://github.com/Samrose-Ahmed)) +- feat: improve flight CLI error handling [\#4873](https://github.com/apache/arrow-rs/pull/4873) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Support Encoding Parquet Columns in Parallel [\#4871](https://github.com/apache/arrow-rs/pull/4871) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Check precision overflow for casting floating to decimal [\#4866](https://github.com/apache/arrow-rs/pull/4866) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Make align\_buffers as public API [\#4863](https://github.com/apache/arrow-rs/pull/4863) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Enable new integration tests \(\#4828\) [\#4862](https://github.com/apache/arrow-rs/pull/4862) ([tustvold](https://github.com/tustvold)) +- Faster Serde Integration \(~80% faster\) [\#4861](https://github.com/apache/arrow-rs/pull/4861) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- fix: make\_primitive\_scalar bug [\#4852](https://github.com/apache/arrow-rs/pull/4852) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JasonLi-cn](https://github.com/JasonLi-cn)) +- Update tonic-build requirement from =0.10.0 to =0.10.1 [\#4846](https://github.com/apache/arrow-rs/pull/4846) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Allow Constructing Non-Empty StructArray with no Fields \(\#4842\) [\#4845](https://github.com/apache/arrow-rs/pull/4845) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Refine documentation to `Array::is_null` [\#4838](https://github.com/apache/arrow-rs/pull/4838) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- fix: add missing precision overflow checking for `cast_string_to_decimal` [\#4830](https://github.com/apache/arrow-rs/pull/4830) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonahgao](https://github.com/jonahgao)) diff --git a/Cargo.toml b/Cargo.toml index d874e335eeae..d59a5af68a19 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,7 +62,7 @@ exclude = [ ] [workspace.package] -version = "47.0.0" +version = "48.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -77,20 +77,20 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "47.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "47.0.0", path = "./arrow-arith" } -arrow-array = { version = "47.0.0", path = "./arrow-array" } -arrow-buffer = { version = "47.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "47.0.0", path = "./arrow-cast" } -arrow-csv = { version = "47.0.0", path = "./arrow-csv" } -arrow-data = { version = "47.0.0", path = "./arrow-data" } -arrow-ipc = { version = "47.0.0", path = "./arrow-ipc" } -arrow-json = { version = "47.0.0", path = "./arrow-json" } -arrow-ord = { 
version = "47.0.0", path = "./arrow-ord" } -arrow-row = { version = "47.0.0", path = "./arrow-row" } -arrow-schema = { version = "47.0.0", path = "./arrow-schema" } -arrow-select = { version = "47.0.0", path = "./arrow-select" } -arrow-string = { version = "47.0.0", path = "./arrow-string" } -parquet = { version = "47.0.0", path = "./parquet", default-features = false } +arrow = { version = "48.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "48.0.0", path = "./arrow-arith" } +arrow-array = { version = "48.0.0", path = "./arrow-array" } +arrow-buffer = { version = "48.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "48.0.0", path = "./arrow-cast" } +arrow-csv = { version = "48.0.0", path = "./arrow-csv" } +arrow-data = { version = "48.0.0", path = "./arrow-data" } +arrow-ipc = { version = "48.0.0", path = "./arrow-ipc" } +arrow-json = { version = "48.0.0", path = "./arrow-json" } +arrow-ord = { version = "48.0.0", path = "./arrow-ord" } +arrow-row = { version = "48.0.0", path = "./arrow-row" } +arrow-schema = { version = "48.0.0", path = "./arrow-schema" } +arrow-select = { version = "48.0.0", path = "./arrow-select" } +arrow-string = { version = "48.0.0", path = "./arrow-string" } +parquet = { version = "48.0.0", path = "./parquet", default-features = false } chrono = { version = "0.4.31", default-features = false, features = ["clock"] } diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 74bbb4ac1e8d..c1627ebb8cf2 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="46.0.0" -FUTURE_RELEASE="47.0.0" +SINCE_TAG="47.0.0" +FUTURE_RELEASE="48.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 51ac6fec8755147cd6b1dfe7d76bfdcfacad0463 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 18 Oct 2023 14:52:36 +0100 Subject: [PATCH 1287/1411] Respect ARROW_TEST_DATA in apache-avro tests (#4950) --- arrow-avro/src/lib.rs | 10 ++++++++++ arrow-avro/src/reader/header.rs | 5 +++-- arrow-avro/src/reader/mod.rs | 3 ++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/arrow-avro/src/lib.rs b/arrow-avro/src/lib.rs index e134d9d798f2..c76ecb399a45 100644 --- a/arrow-avro/src/lib.rs +++ b/arrow-avro/src/lib.rs @@ -26,3 +26,13 @@ pub mod reader; mod schema; mod compression; + +#[cfg(test)] +mod test_util { + pub fn arrow_test_data(path: &str) -> String { + match std::env::var("ARROW_TEST_DATA") { + Ok(dir) => format!("{dir}/{path}"), + Err(_) => format!("../testing/data/{path}"), + } + } +} diff --git a/arrow-avro/src/reader/header.rs b/arrow-avro/src/reader/header.rs index 92db8b1dc76d..2d443175a7aa 100644 --- a/arrow-avro/src/reader/header.rs +++ b/arrow-avro/src/reader/header.rs @@ -240,6 +240,7 @@ mod test { use super::*; use crate::reader::read_header; use crate::schema::SCHEMA_METADATA_KEY; + use crate::test_util::arrow_test_data; use std::fs::File; use std::io::{BufRead, BufReader}; @@ -266,7 +267,7 @@ mod test { #[test] fn test_header() { - let header = decode_file("../testing/data/avro/alltypes_plain.avro"); + let header = decode_file(&arrow_test_data("avro/alltypes_plain.avro")); let schema_json = header.get(SCHEMA_METADATA_KEY).unwrap(); let expected = 
br#"{"type":"record","name":"topLevelRecord","fields":[{"name":"id","type":["int","null"]},{"name":"bool_col","type":["boolean","null"]},{"name":"tinyint_col","type":["int","null"]},{"name":"smallint_col","type":["int","null"]},{"name":"int_col","type":["int","null"]},{"name":"bigint_col","type":["long","null"]},{"name":"float_col","type":["float","null"]},{"name":"double_col","type":["double","null"]},{"name":"date_string_col","type":["bytes","null"]},{"name":"string_col","type":["bytes","null"]},{"name":"timestamp_col","type":[{"type":"long","logicalType":"timestamp-micros"},"null"]}]}"#; assert_eq!(schema_json, expected); @@ -276,7 +277,7 @@ mod test { 226966037233754408753420635932530907102 ); - let header = decode_file("../testing/data/avro/fixed_length_decimal.avro"); + let header = decode_file(&arrow_test_data("avro/fixed_length_decimal.avro")); let schema_json = header.get(SCHEMA_METADATA_KEY).unwrap(); let expected = br#"{"type":"record","name":"topLevelRecord","fields":[{"name":"value","type":[{"type":"fixed","name":"fixed","namespace":"topLevelRecord.value","size":11,"logicalType":"decimal","precision":25,"scale":2},"null"]}]}"#; assert_eq!(schema_json, expected); diff --git a/arrow-avro/src/reader/mod.rs b/arrow-avro/src/reader/mod.rs index a42011e3b2ad..91e2dbf9835b 100644 --- a/arrow-avro/src/reader/mod.rs +++ b/arrow-avro/src/reader/mod.rs @@ -76,12 +76,13 @@ fn read_blocks( #[cfg(test)] mod test { use crate::reader::{read_blocks, read_header}; + use crate::test_util::arrow_test_data; use std::fs::File; use std::io::BufReader; #[test] fn test_mux() { - let file = File::open("../testing/data/avro/alltypes_plain.avro").unwrap(); + let file = File::open(arrow_test_data("avro/alltypes_plain.avro")).unwrap(); let mut reader = BufReader::new(file); let header = read_header(&mut reader).unwrap(); for result in read_blocks(reader) { From 4cca0291441fe622f13db6724f8bc3efb1a31b5b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 19 Oct 2023 09:44:46 +0100 Subject: [PATCH 1288/1411] Return `PutResult` with an ETag from ObjectStore::put (#4934) (#4944) * Return ETag from ObjectStore::put (#4934) * Further tests * Clippy * Review feedback --- object_store/src/aws/client.rs | 12 +++- object_store/src/aws/mod.rs | 25 ++------ object_store/src/azure/mod.rs | 20 ++++--- object_store/src/chunked.rs | 3 +- object_store/src/client/header.rs | 17 +++--- object_store/src/gcp/mod.rs | 87 +++++++++++----------------- object_store/src/http/client.rs | 4 +- object_store/src/http/mod.rs | 13 ++++- object_store/src/lib.rs | 35 ++++++++++- object_store/src/limit.rs | 4 +- object_store/src/local.rs | 43 ++++++++++---- object_store/src/memory.rs | 14 +++-- object_store/src/prefix.rs | 5 +- object_store/src/throttle.rs | 5 +- object_store/tests/get_range_file.rs | 4 +- 15 files changed, 169 insertions(+), 122 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 8a45a9f3ac47..eb81e92fb932 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -21,6 +21,7 @@ use crate::aws::{ AwsCredentialProvider, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET, }; use crate::client::get::GetClient; +use crate::client::header::get_etag; use crate::client::list::ListClient; use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; @@ -122,6 +123,11 @@ pub(crate) enum Error { #[snafu(display("Got invalid multipart response: {}", source))] InvalidMultipartResponse { 
source: quick_xml::de::DeError }, + + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, } impl From for crate::Error { @@ -243,12 +249,14 @@ impl S3Client { } /// Make an S3 PUT request + /// + /// Returns the ETag pub async fn put_request( &self, path: &Path, bytes: Bytes, query: &T, - ) -> Result { + ) -> Result { let credential = self.get_credential().await?; let url = self.config.path_url(path); let mut builder = self.client.request(Method::PUT, url); @@ -287,7 +295,7 @@ impl S3Client { path: path.as_ref(), })?; - Ok(response) + Ok(get_etag(response.headers()).context(MetadataSnafu)?) } /// Make an S3 Delete request diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index d3c50861c122..6d5aecea2d17 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -59,7 +59,7 @@ use crate::multipart::{PartId, PutPart, WriteMultiPart}; use crate::signer::Signer; use crate::{ ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, Path, Result, RetryConfig, + ObjectStore, Path, PutResult, Result, RetryConfig, }; mod checksum; @@ -109,12 +109,6 @@ enum Error { #[snafu(display("Missing SecretAccessKey"))] MissingSecretAccessKey, - #[snafu(display("ETag Header missing from response"))] - MissingEtag, - - #[snafu(display("Received header containing non-ASCII data"))] - BadHeader { source: reqwest::header::ToStrError }, - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] UnableToParseUrl { source: url::ParseError, @@ -273,9 +267,9 @@ impl Signer for AmazonS3 { #[async_trait] impl ObjectStore for AmazonS3 { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.client.put_request(location, bytes, &()).await?; - Ok(()) + async fn put(&self, location: &Path, bytes: Bytes) -> Result { + let e_tag = self.client.put_request(location, bytes, &()).await?; + Ok(PutResult { e_tag: Some(e_tag) }) } async fn put_multipart( @@ -365,10 +359,9 @@ struct S3MultiPartUpload { #[async_trait] impl PutPart for S3MultiPartUpload { async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { - use reqwest::header::ETAG; let part = (part_idx + 1).to_string(); - let response = self + let content_id = self .client .put_request( &self.location, @@ -377,13 +370,7 @@ impl PutPart for S3MultiPartUpload { ) .await?; - let etag = response.headers().get(ETAG).context(MissingEtagSnafu)?; - - let etag = etag.to_str().context(BadHeaderSnafu)?; - - Ok(PartId { - content_id: etag.to_string(), - }) + Ok(PartId { content_id }) } async fn complete(&self, completed_parts: Vec) -> Result<()> { diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 2a08c6775807..0e638efc399f 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -31,7 +31,7 @@ use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, Result, RetryConfig, + ObjectStore, PutResult, Result, RetryConfig, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; @@ -62,6 +62,7 @@ mod credential; /// [`CredentialProvider`] for [`MicrosoftAzure`] pub type AzureCredentialProvider = Arc>; +use crate::client::header::get_etag; pub use credential::AzureCredential; const STORE: &str = "MicrosoftAzure"; @@ -81,9 +82,6 @@ const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; #[derive(Debug, Snafu)] #[allow(missing_docs)] 
enum Error { - #[snafu(display("Received header containing non-ASCII data"))] - BadHeader { source: reqwest::header::ToStrError }, - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] UnableToParseUrl { source: url::ParseError, @@ -126,8 +124,10 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, - #[snafu(display("ETag Header missing from response"))] - MissingEtag, + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, } impl From for super::Error { @@ -170,11 +170,13 @@ impl std::fmt::Display for MicrosoftAzure { #[async_trait] impl ObjectStore for MicrosoftAzure { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.client + async fn put(&self, location: &Path, bytes: Bytes) -> Result { + let response = self + .client .put_request(location, Some(bytes), false, &()) .await?; - Ok(()) + let e_tag = Some(get_etag(response.headers()).context(MetadataSnafu)?); + Ok(PutResult { e_tag }) } async fn put_multipart( diff --git a/object_store/src/chunked.rs b/object_store/src/chunked.rs index d3e02b412725..5694c55d787f 100644 --- a/object_store/src/chunked.rs +++ b/object_store/src/chunked.rs @@ -30,6 +30,7 @@ use tokio::io::AsyncWrite; use crate::path::Path; use crate::{ GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, + PutResult, }; use crate::{MultipartId, Result}; @@ -62,7 +63,7 @@ impl Display for ChunkedStore { #[async_trait] impl ObjectStore for ChunkedStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result { self.inner.put(location, bytes).await } diff --git a/object_store/src/client/header.rs b/object_store/src/client/header.rs index 6499eff5aebe..17f83a2ba8c8 100644 --- a/object_store/src/client/header.rs +++ b/object_store/src/client/header.rs @@ -64,6 +64,12 @@ pub enum Error { }, } +/// Extracts an etag from the provided [`HeaderMap`] +pub fn get_etag(headers: &HeaderMap) -> Result { + let e_tag = headers.get(ETAG).ok_or(Error::MissingEtag)?; + Ok(e_tag.to_str().context(BadHeaderSnafu)?.to_string()) +} + /// Extracts [`ObjectMeta`] from the provided [`HeaderMap`] pub fn header_meta( location: &Path, @@ -81,13 +87,10 @@ pub fn header_meta( None => Utc.timestamp_nanos(0), }; - let e_tag = match headers.get(ETAG) { - Some(e_tag) => { - let e_tag = e_tag.to_str().context(BadHeaderSnafu)?; - Some(e_tag.to_string()) - } - None if cfg.etag_required => return Err(Error::MissingEtag), - None => None, + let e_tag = match get_etag(headers) { + Ok(e_tag) => Some(e_tag), + Err(Error::MissingEtag) if !cfg.etag_required => None, + Err(e) => return Err(e), }; let content_length = headers diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 513e396cbae6..97755c07c671 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -54,7 +54,7 @@ use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::{Path, DELIMITER}, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, Result, RetryConfig, + ObjectStore, PutResult, Result, RetryConfig, }; use credential::{InstanceCredentialProvider, ServiceAccountCredentials}; @@ -65,6 +65,7 @@ const STORE: &str = "GCS"; /// [`CredentialProvider`] for [`GoogleCloudStorage`] pub type GcpCredentialProvider = Arc>; +use crate::client::header::get_etag; use 
crate::gcp::credential::{ApplicationDefaultCredentials, DEFAULT_GCS_BASE_URL}; pub use credential::GcpCredential; @@ -155,11 +156,10 @@ enum Error { #[snafu(display("Configuration key: '{}' is not known.", key))] UnknownConfigurationKey { key: String }, - #[snafu(display("ETag Header missing from response"))] - MissingEtag, - - #[snafu(display("Received header containing non-ASCII data"))] - BadHeader { source: header::ToStrError }, + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, } impl From for super::Error { @@ -247,7 +247,14 @@ impl GoogleCloudStorageClient { } /// Perform a put request - async fn put_request(&self, path: &Path, payload: Bytes) -> Result<()> { + /// + /// Returns the new ETag + async fn put_request( + &self, + path: &Path, + payload: Bytes, + query: &T, + ) -> Result { let credential = self.get_credential().await?; let url = self.object_url(path); @@ -256,8 +263,10 @@ impl GoogleCloudStorageClient { .get_content_type(path) .unwrap_or("application/octet-stream"); - self.client + let response = self + .client .request(Method::PUT, url) + .query(query) .bearer_auth(&credential.bearer) .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, payload.len()) @@ -268,7 +277,7 @@ impl GoogleCloudStorageClient { path: path.as_ref(), })?; - Ok(()) + Ok(get_etag(response.headers()).context(MetadataSnafu)?) } /// Initiate a multi-part upload @@ -469,7 +478,7 @@ impl ListClient for GoogleCloudStorageClient { struct GCSMultipartUpload { client: Arc, - encoded_path: String, + path: Path, multipart_id: MultipartId, } @@ -478,38 +487,17 @@ impl PutPart for GCSMultipartUpload { /// Upload an object part async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { let upload_id = self.multipart_id.clone(); - let url = format!( - "{}/{}/{}", - self.client.base_url, self.client.bucket_name_encoded, self.encoded_path - ); - - let credential = self.client.get_credential().await?; - - let response = self + let content_id = self .client - .client - .request(Method::PUT, &url) - .bearer_auth(&credential.bearer) - .query(&[ - ("partNumber", format!("{}", part_idx + 1)), - ("uploadId", upload_id), - ]) - .header(header::CONTENT_TYPE, "application/octet-stream") - .header(header::CONTENT_LENGTH, format!("{}", buf.len())) - .body(buf) - .send_retry(&self.client.retry_config) - .await - .context(PutRequestSnafu { - path: &self.encoded_path, - })?; - - let content_id = response - .headers() - .get("ETag") - .context(MissingEtagSnafu)? - .to_str() - .context(BadHeaderSnafu)? 
- .to_string(); + .put_request( + &self.path, + buf.into(), + &[ + ("partNumber", format!("{}", part_idx + 1)), + ("uploadId", upload_id), + ], + ) + .await?; Ok(PartId { content_id }) } @@ -517,10 +505,7 @@ impl PutPart for GCSMultipartUpload { /// Complete a multipart upload async fn complete(&self, completed_parts: Vec) -> Result<()> { let upload_id = self.multipart_id.clone(); - let url = format!( - "{}/{}/{}", - self.client.base_url, self.client.bucket_name_encoded, self.encoded_path - ); + let url = self.client.object_url(&self.path); let parts = completed_parts .into_iter() @@ -550,7 +535,7 @@ impl PutPart for GCSMultipartUpload { .send_retry(&self.client.retry_config) .await .context(PostRequestSnafu { - path: &self.encoded_path, + path: self.path.as_ref(), })?; Ok(()) @@ -559,8 +544,9 @@ impl PutPart for GCSMultipartUpload { #[async_trait] impl ObjectStore for GoogleCloudStorage { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.client.put_request(location, bytes).await + async fn put(&self, location: &Path, bytes: Bytes) -> Result { + let e_tag = self.client.put_request(location, bytes, &()).await?; + Ok(PutResult { e_tag: Some(e_tag) }) } async fn put_multipart( @@ -569,12 +555,9 @@ impl ObjectStore for GoogleCloudStorage { ) -> Result<(MultipartId, Box)> { let upload_id = self.client.multipart_initiate(location).await?; - let encoded_path = - percent_encode(location.to_string().as_bytes(), NON_ALPHANUMERIC).to_string(); - let inner = GCSMultipartUpload { client: Arc::clone(&self.client), - encoded_path, + path: location.clone(), multipart_id: upload_id.clone(), }; diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index b2a6ac0aa34a..4c2a7fcf8db3 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -160,7 +160,7 @@ impl Client { Ok(()) } - pub async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + pub async fn put(&self, location: &Path, bytes: Bytes) -> Result { let mut retry = false; loop { let url = self.path_url(location); @@ -170,7 +170,7 @@ impl Client { } match builder.send_retry(&self.retry_config).await { - Ok(_) => return Ok(()), + Ok(response) => return Ok(response), Err(source) => match source.status() { // Some implementations return 404 instead of 409 Some(StatusCode::CONFLICT | StatusCode::NOT_FOUND) if !retry => { diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index 2fd7850b6bbf..e41e4f990110 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -41,11 +41,12 @@ use tokio::io::AsyncWrite; use url::Url; use crate::client::get::GetClientExt; +use crate::client::header::get_etag; use crate::http::client::Client; use crate::path::Path; use crate::{ ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, - ObjectMeta, ObjectStore, Result, RetryConfig, + ObjectMeta, ObjectStore, PutResult, Result, RetryConfig, }; mod client; @@ -95,8 +96,14 @@ impl std::fmt::Display for HttpStore { #[async_trait] impl ObjectStore for HttpStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.client.put(location, bytes).await + async fn put(&self, location: &Path, bytes: Bytes) -> Result { + let response = self.client.put(location, bytes).await?; + let e_tag = match get_etag(response.headers()) { + Ok(e_tag) => Some(e_tag), + Err(crate::client::header::Error::MissingEtag) => None, + Err(source) => return Err(Error::Metadata { source }.into()), + }; + Ok(PutResult { e_tag }) } async 
fn put_multipart( diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 9b396444fa0d..018f0f5e8dec 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -300,7 +300,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// The operation is guaranteed to be atomic, it will either successfully /// write the entirety of `bytes` to `location`, or fail. No clients /// should be able to observe a partially written object - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()>; + async fn put(&self, location: &Path, bytes: Bytes) -> Result; /// Get a multi-part upload that allows writing data in chunks /// @@ -528,7 +528,7 @@ macro_rules! as_ref_impl { ($type:ty) => { #[async_trait] impl ObjectStore for $type { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result { self.as_ref().put(location, bytes).await } @@ -659,6 +659,8 @@ pub struct ObjectMeta { /// The size in bytes of the object pub size: usize, /// The unique identifier for the object + /// + /// pub e_tag: Option, } @@ -850,6 +852,15 @@ impl GetResult { } } +/// Result for a put request +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PutResult { + /// The unique identifier for the object + /// + /// + pub e_tag: Option, +} + /// A specialized `Result` for object store-related errors pub type Result = std::result::Result; @@ -1383,6 +1394,26 @@ mod tests { ..GetOptions::default() }; storage.get_opts(&path, options).await.unwrap(); + + let result = storage.put(&path, "test".into()).await.unwrap(); + let new_tag = result.e_tag.unwrap(); + assert_ne!(tag, new_tag); + + let meta = storage.head(&path).await.unwrap(); + assert_eq!(meta.e_tag.unwrap(), new_tag); + + let options = GetOptions { + if_match: Some(new_tag), + ..GetOptions::default() + }; + storage.get_opts(&path, options).await.unwrap(); + + let options = GetOptions { + if_match: Some(tag), + ..GetOptions::default() + }; + let err = storage.get_opts(&path, options).await.unwrap_err(); + assert!(matches!(err, Error::Precondition { .. 
}), "{err}"); } /// Returns a chunk of length `chunk_length` diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index 00cbce023c3d..8a453813c24e 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -19,7 +19,7 @@ use crate::{ BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, - ObjectMeta, ObjectStore, Path, Result, StreamExt, + ObjectMeta, ObjectStore, Path, PutResult, Result, StreamExt, }; use async_trait::async_trait; use bytes::Bytes; @@ -72,7 +72,7 @@ impl std::fmt::Display for LimitStore { #[async_trait] impl ObjectStore for LimitStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.put(location, bytes).await } diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 38467c3a9e7c..4b7c96346e4d 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -20,7 +20,7 @@ use crate::{ maybe_spawn_blocking, path::{absolute_path_to_url, Path}, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, - ObjectStore, Result, + ObjectStore, PutResult, Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -36,6 +36,7 @@ use std::ops::Range; use std::pin::Pin; use std::sync::Arc; use std::task::Poll; +use std::time::SystemTime; use std::{collections::BTreeSet, convert::TryFrom, io}; use std::{collections::VecDeque, path::PathBuf}; use tokio::io::AsyncWrite; @@ -270,7 +271,7 @@ impl Config { #[async_trait] impl ObjectStore for LocalFileSystem { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { let (mut file, suffix) = new_staged_upload(&path)?; @@ -282,8 +283,17 @@ impl ObjectStore for LocalFileSystem { }) .map_err(|e| { let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup - e.into() - }) + e + })?; + + let metadata = file.metadata().map_err(|e| Error::Metadata { + source: e.into(), + path: path.to_string_lossy().to_string(), + })?; + + Ok(PutResult { + e_tag: Some(get_etag(&metadata)), + }) }) .await } @@ -959,24 +969,33 @@ fn last_modified(metadata: &Metadata) -> DateTime { .into() } +fn get_etag(metadata: &Metadata) -> String { + let inode = get_inode(metadata); + let size = metadata.len(); + let mtime = metadata + .modified() + .ok() + .and_then(|mtime| mtime.duration_since(SystemTime::UNIX_EPOCH).ok()) + .unwrap_or_default() + .as_micros(); + + // Use an ETag scheme based on that used by many popular HTTP servers + // + // + format!("{inode:x}-{mtime:x}-{size:x}") +} + fn convert_metadata(metadata: Metadata, location: Path) -> Result { let last_modified = last_modified(&metadata); let size = usize::try_from(metadata.len()).context(FileSizeOverflowedUsizeSnafu { path: location.as_ref(), })?; - let inode = get_inode(&metadata); - let mtime = last_modified.timestamp_micros(); - - // Use an ETag scheme based on that used by many popular HTTP servers - // - // - let etag = format!("{inode:x}-{mtime:x}-{size:x}"); Ok(ObjectMeta { location, last_modified, size, - e_tag: Some(etag), + e_tag: Some(get_etag(&metadata)), }) } diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 00b330b5eb94..952b45739759 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -17,7 +17,8 @@ //! 
An in-memory object store implementation use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, Result, + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, + PutResult, Result, }; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; @@ -106,11 +107,12 @@ struct Storage { type SharedStorage = Arc>; impl Storage { - fn insert(&mut self, location: &Path, bytes: Bytes) { + fn insert(&mut self, location: &Path, bytes: Bytes) -> usize { let etag = self.next_etag; self.next_etag += 1; let entry = Entry::new(bytes, Utc::now(), etag); self.map.insert(location.clone(), entry); + etag } } @@ -122,9 +124,11 @@ impl std::fmt::Display for InMemory { #[async_trait] impl ObjectStore for InMemory { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { - self.storage.write().insert(location, bytes); - Ok(()) + async fn put(&self, location: &Path, bytes: Bytes) -> Result { + let etag = self.storage.write().insert(location, bytes); + Ok(PutResult { + e_tag: Some(etag.to_string()), + }) } async fn put_multipart( diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index 3776dec2e872..21f6c1d99dc9 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -23,7 +23,8 @@ use tokio::io::AsyncWrite; use crate::path::Path; use crate::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, + Result, }; #[doc(hidden)] @@ -79,7 +80,7 @@ impl PrefixStore { #[async_trait::async_trait] impl ObjectStore for PrefixStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result { let full_path = self.full_path(location); self.inner.put(&full_path, bytes).await } diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index f716a11f8a05..d6f191baf82e 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -21,7 +21,8 @@ use std::ops::Range; use std::{convert::TryInto, sync::Arc}; use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, Result, + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, + PutResult, Result, }; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; @@ -147,7 +148,7 @@ impl std::fmt::Display for ThrottledStore { #[async_trait] impl ObjectStore for ThrottledStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result { sleep(self.config().wait_put_per_call).await; self.inner.put(location, bytes).await diff --git a/object_store/tests/get_range_file.rs b/object_store/tests/get_range_file.rs index 25c469260675..5703d7f24844 100644 --- a/object_store/tests/get_range_file.rs +++ b/object_store/tests/get_range_file.rs @@ -23,7 +23,7 @@ use futures::stream::BoxStream; use object_store::local::LocalFileSystem; use object_store::path::Path; use object_store::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, }; use std::fmt::Formatter; use tempfile::tempdir; @@ -40,7 +40,7 @@ impl std::fmt::Display for MyStore { #[async_trait] impl ObjectStore for MyStore { - async fn put(&self, path: &Path, data: Bytes) -> object_store::Result<()> { + async fn put(&self, path: &Path, data: Bytes) -> 
object_store::Result { self.0.put(path, data).await } From 62ca5f37d143db172a73b3f0365f48f8bc3e2c72 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 19 Oct 2023 13:40:49 +0100 Subject: [PATCH 1289/1411] Split aws Module (#4953) * Split aws module * Clippy * Fix doc --- object_store/src/aws/builder.rs | 1098 +++++++++++++++++++++++++++++ object_store/src/aws/mod.rs | 1169 +------------------------------ object_store/src/aws/resolve.rs | 106 +++ 3 files changed, 1225 insertions(+), 1148 deletions(-) create mode 100644 object_store/src/aws/builder.rs create mode 100644 object_store/src/aws/resolve.rs diff --git a/object_store/src/aws/builder.rs b/object_store/src/aws/builder.rs new file mode 100644 index 000000000000..422ba15efa52 --- /dev/null +++ b/object_store/src/aws/builder.rs @@ -0,0 +1,1098 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::aws::client::{S3Client, S3Config}; +use crate::aws::credential::{ + InstanceCredentialProvider, TaskCredentialProvider, WebIdentityProvider, +}; +use crate::aws::{ + AmazonS3, AwsCredential, AwsCredentialProvider, Checksum, S3CopyIfNotExists, STORE, +}; +use crate::client::TokenCredentialProvider; +use crate::config::ConfigValue; +use crate::{ + ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider, +}; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::str::FromStr; +use std::sync::Arc; +use tracing::info; +use url::Url; + +/// Default metadata endpoint +static DEFAULT_METADATA_ENDPOINT: &str = "http://169.254.169.254"; + +/// A specialized `Error` for object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("Missing region"))] + MissingRegion, + + #[snafu(display("Missing bucket name"))] + MissingBucketName, + + #[snafu(display("Missing AccessKeyId"))] + MissingAccessKeyId, + + #[snafu(display("Missing SecretAccessKey"))] + MissingSecretAccessKey, + + #[snafu(display("Unable parse source url. 
Url: {}, Error: {}", url, source))] + UnableToParseUrl { + source: url::ParseError, + url: String, + }, + + #[snafu(display( + "Unknown url scheme cannot be parsed into storage location: {}", + scheme + ))] + UnknownUrlScheme { scheme: String }, + + #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + UrlNotRecognised { url: String }, + + #[snafu(display("Configuration key: '{}' is not known.", key))] + UnknownConfigurationKey { key: String }, + + #[snafu(display("Bucket '{}' not found", bucket))] + BucketNotFound { bucket: String }, + + #[snafu(display("Failed to resolve region for bucket '{}'", bucket))] + ResolveRegion { + bucket: String, + source: reqwest::Error, + }, + + #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] + RegionParse { bucket: String }, +} + +impl From for crate::Error { + fn from(source: Error) -> Self { + match source { + Error::UnknownConfigurationKey { key } => { + Self::UnknownConfigurationKey { store: STORE, key } + } + _ => Self::Generic { + store: STORE, + source: Box::new(source), + }, + } + } +} + +/// Configure a connection to Amazon S3 using the specified credentials in +/// the specified Amazon region and bucket. +/// +/// # Example +/// ``` +/// # let REGION = "foo"; +/// # let BUCKET_NAME = "foo"; +/// # let ACCESS_KEY_ID = "foo"; +/// # let SECRET_KEY = "foo"; +/// # use object_store::aws::AmazonS3Builder; +/// let s3 = AmazonS3Builder::new() +/// .with_region(REGION) +/// .with_bucket_name(BUCKET_NAME) +/// .with_access_key_id(ACCESS_KEY_ID) +/// .with_secret_access_key(SECRET_KEY) +/// .build(); +/// ``` +#[derive(Debug, Default, Clone)] +pub struct AmazonS3Builder { + /// Access key id + access_key_id: Option, + /// Secret access_key + secret_access_key: Option, + /// Region + region: Option, + /// Bucket name + bucket_name: Option, + /// Endpoint for communicating with AWS S3 + endpoint: Option, + /// Token to use for requests + token: Option, + /// Url + url: Option, + /// Retry config + retry_config: RetryConfig, + /// When set to true, fallback to IMDSv1 + imdsv1_fallback: ConfigValue, + /// When set to true, virtual hosted style request has to be used + virtual_hosted_style_request: ConfigValue, + /// When set to true, unsigned payload option has to be used + unsigned_payload: ConfigValue, + /// Checksum algorithm which has to be used for object integrity check during upload + checksum_algorithm: Option>, + /// Metadata endpoint, see + metadata_endpoint: Option, + /// Container credentials URL, see + container_credentials_relative_uri: Option, + /// Client options + client_options: ClientOptions, + /// Credentials + credentials: Option, + /// Skip signing requests + skip_signature: ConfigValue, + /// Copy if not exists + copy_if_not_exists: Option>, +} + +/// Configuration keys for [`AmazonS3Builder`] +/// +/// Configuration via keys can be done via [`AmazonS3Builder::with_config`] +/// +/// # Example +/// ``` +/// # use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; +/// let builder = AmazonS3Builder::new() +/// .with_config("aws_access_key_id".parse().unwrap(), "my-access-key-id") +/// .with_config(AmazonS3ConfigKey::DefaultRegion, "my-default-region"); +/// ``` +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] +#[non_exhaustive] +pub enum AmazonS3ConfigKey { + /// AWS Access Key + /// + /// See [`AmazonS3Builder::with_access_key_id`] for details. 
+ /// + /// Supported keys: + /// - `aws_access_key_id` + /// - `access_key_id` + AccessKeyId, + + /// Secret Access Key + /// + /// See [`AmazonS3Builder::with_secret_access_key`] for details. + /// + /// Supported keys: + /// - `aws_secret_access_key` + /// - `secret_access_key` + SecretAccessKey, + + /// Region + /// + /// See [`AmazonS3Builder::with_region`] for details. + /// + /// Supported keys: + /// - `aws_region` + /// - `region` + Region, + + /// Default region + /// + /// See [`AmazonS3Builder::with_region`] for details. + /// + /// Supported keys: + /// - `aws_default_region` + /// - `default_region` + DefaultRegion, + + /// Bucket name + /// + /// See [`AmazonS3Builder::with_bucket_name`] for details. + /// + /// Supported keys: + /// - `aws_bucket` + /// - `aws_bucket_name` + /// - `bucket` + /// - `bucket_name` + Bucket, + + /// Sets custom endpoint for communicating with AWS S3. + /// + /// See [`AmazonS3Builder::with_endpoint`] for details. + /// + /// Supported keys: + /// - `aws_endpoint` + /// - `aws_endpoint_url` + /// - `endpoint` + /// - `endpoint_url` + Endpoint, + + /// Token to use for requests (passed to underlying provider) + /// + /// See [`AmazonS3Builder::with_token`] for details. + /// + /// Supported keys: + /// - `aws_session_token` + /// - `aws_token` + /// - `session_token` + /// - `token` + Token, + + /// Fall back to ImdsV1 + /// + /// See [`AmazonS3Builder::with_imdsv1_fallback`] for details. + /// + /// Supported keys: + /// - `aws_imdsv1_fallback` + /// - `imdsv1_fallback` + ImdsV1Fallback, + + /// If virtual hosted style request has to be used + /// + /// See [`AmazonS3Builder::with_virtual_hosted_style_request`] for details. + /// + /// Supported keys: + /// - `aws_virtual_hosted_style_request` + /// - `virtual_hosted_style_request` + VirtualHostedStyleRequest, + + /// Avoid computing payload checksum when calculating signature. + /// + /// See [`AmazonS3Builder::with_unsigned_payload`] for details. + /// + /// Supported keys: + /// - `aws_unsigned_payload` + /// - `unsigned_payload` + UnsignedPayload, + + /// Set the checksum algorithm for this client + /// + /// See [`AmazonS3Builder::with_checksum_algorithm`] + Checksum, + + /// Set the instance metadata endpoint + /// + /// See [`AmazonS3Builder::with_metadata_endpoint`] for details. 
+ /// + /// Supported keys: + /// - `aws_metadata_endpoint` + /// - `metadata_endpoint` + MetadataEndpoint, + + /// Set the container credentials relative URI + /// + /// + ContainerCredentialsRelativeUri, + + /// Configure how to provide `copy_if_not_exists` + /// + /// See [`S3CopyIfNotExists`] + CopyIfNotExists, + + /// Skip signing request + SkipSignature, + + /// Client options + Client(ClientConfigKey), +} + +impl AsRef for AmazonS3ConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::AccessKeyId => "aws_access_key_id", + Self::SecretAccessKey => "aws_secret_access_key", + Self::Region => "aws_region", + Self::Bucket => "aws_bucket", + Self::Endpoint => "aws_endpoint", + Self::Token => "aws_session_token", + Self::ImdsV1Fallback => "aws_imdsv1_fallback", + Self::VirtualHostedStyleRequest => "aws_virtual_hosted_style_request", + Self::DefaultRegion => "aws_default_region", + Self::MetadataEndpoint => "aws_metadata_endpoint", + Self::UnsignedPayload => "aws_unsigned_payload", + Self::Checksum => "aws_checksum_algorithm", + Self::ContainerCredentialsRelativeUri => { + "aws_container_credentials_relative_uri" + } + Self::SkipSignature => "aws_skip_signature", + Self::CopyIfNotExists => "copy_if_not_exists", + Self::Client(opt) => opt.as_ref(), + } + } +} + +impl FromStr for AmazonS3ConfigKey { + type Err = crate::Error; + + fn from_str(s: &str) -> Result { + match s { + "aws_access_key_id" | "access_key_id" => Ok(Self::AccessKeyId), + "aws_secret_access_key" | "secret_access_key" => Ok(Self::SecretAccessKey), + "aws_default_region" | "default_region" => Ok(Self::DefaultRegion), + "aws_region" | "region" => Ok(Self::Region), + "aws_bucket" | "aws_bucket_name" | "bucket_name" | "bucket" => { + Ok(Self::Bucket) + } + "aws_endpoint_url" | "aws_endpoint" | "endpoint_url" | "endpoint" => { + Ok(Self::Endpoint) + } + "aws_session_token" | "aws_token" | "session_token" | "token" => { + Ok(Self::Token) + } + "aws_virtual_hosted_style_request" | "virtual_hosted_style_request" => { + Ok(Self::VirtualHostedStyleRequest) + } + "aws_imdsv1_fallback" | "imdsv1_fallback" => Ok(Self::ImdsV1Fallback), + "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), + "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), + "aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), + "aws_container_credentials_relative_uri" => { + Ok(Self::ContainerCredentialsRelativeUri) + } + "aws_skip_signature" | "skip_signature" => Ok(Self::SkipSignature), + "copy_if_not_exists" => Ok(Self::CopyIfNotExists), + // Backwards compatibility + "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), + _ => match s.parse() { + Ok(key) => Ok(Self::Client(key)), + Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + }, + } + } +} + +impl AmazonS3Builder { + /// Create a new [`AmazonS3Builder`] with default values. 
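
The config keys above, their string aliases accepted by `FromStr`, and `with_config` compose as in the following sketch. This example is illustrative only and not part of the patch; the access key, secret, region, and bucket values are placeholders.

```rust
use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey};

fn main() -> object_store::Result<()> {
    // The long and short string forms parse to the same strongly-typed key.
    let key: AmazonS3ConfigKey = "aws_access_key_id".parse()?;
    assert_eq!(key, AmazonS3ConfigKey::AccessKeyId);

    let builder = AmazonS3Builder::new()
        .with_config(key, "my-access-key-id")
        .with_config("secret_access_key".parse()?, "my-secret-key")
        .with_config(AmazonS3ConfigKey::Region, "us-east-1")
        .with_config(AmazonS3ConfigKey::Bucket, "my-bucket");

    // Anything set through `with_config` can be read back by key.
    assert_eq!(
        builder
            .get_config_value(&AmazonS3ConfigKey::Bucket)
            .as_deref(),
        Some("my-bucket")
    );
    Ok(())
}
```
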
+ pub fn new() -> Self { + Default::default() + } + + /// Fill the [`AmazonS3Builder`] with regular AWS environment variables + /// + /// Variables extracted from environment: + /// * `AWS_ACCESS_KEY_ID` -> access_key_id + /// * `AWS_SECRET_ACCESS_KEY` -> secret_access_key + /// * `AWS_DEFAULT_REGION` -> region + /// * `AWS_ENDPOINT` -> endpoint + /// * `AWS_SESSION_TOKEN` -> token + /// * `AWS_CONTAINER_CREDENTIALS_RELATIVE_URI` -> + /// * `AWS_ALLOW_HTTP` -> set to "true" to permit HTTP connections without TLS + /// # Example + /// ``` + /// use object_store::aws::AmazonS3Builder; + /// + /// let s3 = AmazonS3Builder::from_env() + /// .with_bucket_name("foo") + /// .build(); + /// ``` + pub fn from_env() -> Self { + let mut builder: Self = Default::default(); + + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if key.starts_with("AWS_") { + if let Ok(config_key) = key.to_ascii_lowercase().parse() { + builder = builder.with_config(config_key, value); + } + } + } + } + + builder + } + + /// Parse available connection info form a well-known storage URL. + /// + /// The supported url schemes are: + /// + /// - `s3:///` + /// - `s3a:///` + /// - `https://s3..amazonaws.com/` + /// - `https://.s3..amazonaws.com` + /// - `https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket` + /// + /// Note: Settings derived from the URL will override any others set on this builder + /// + /// # Example + /// ``` + /// use object_store::aws::AmazonS3Builder; + /// + /// let s3 = AmazonS3Builder::from_env() + /// .with_url("s3://bucket/path") + /// .build(); + /// ``` + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); + self + } + + /// Set an option on the builder via a key - value pair. + pub fn with_config( + mut self, + key: AmazonS3ConfigKey, + value: impl Into, + ) -> Self { + match key { + AmazonS3ConfigKey::AccessKeyId => self.access_key_id = Some(value.into()), + AmazonS3ConfigKey::SecretAccessKey => { + self.secret_access_key = Some(value.into()) + } + AmazonS3ConfigKey::Region => self.region = Some(value.into()), + AmazonS3ConfigKey::Bucket => self.bucket_name = Some(value.into()), + AmazonS3ConfigKey::Endpoint => self.endpoint = Some(value.into()), + AmazonS3ConfigKey::Token => self.token = Some(value.into()), + AmazonS3ConfigKey::ImdsV1Fallback => self.imdsv1_fallback.parse(value), + AmazonS3ConfigKey::VirtualHostedStyleRequest => { + self.virtual_hosted_style_request.parse(value) + } + AmazonS3ConfigKey::DefaultRegion => { + self.region = self.region.or_else(|| Some(value.into())) + } + AmazonS3ConfigKey::MetadataEndpoint => { + self.metadata_endpoint = Some(value.into()) + } + AmazonS3ConfigKey::UnsignedPayload => self.unsigned_payload.parse(value), + AmazonS3ConfigKey::Checksum => { + self.checksum_algorithm = Some(ConfigValue::Deferred(value.into())) + } + AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { + self.container_credentials_relative_uri = Some(value.into()) + } + AmazonS3ConfigKey::Client(key) => { + self.client_options = self.client_options.with_config(key, value) + } + AmazonS3ConfigKey::SkipSignature => self.skip_signature.parse(value), + AmazonS3ConfigKey::CopyIfNotExists => { + self.copy_if_not_exists = Some(ConfigValue::Deferred(value.into())) + } + }; + self + } + + /// Set an option on the builder via a key - value pair. + /// + /// This method will return an `UnknownConfigKey` error if key cannot be parsed into [`AmazonS3ConfigKey`]. 
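
With `try_with_option` and `try_with_options` deprecated, string key/value pairs can instead be hydrated by parsing each key and folding it in with `with_config`. A minimal sketch, not part of the patch; the region, bucket, and option values are placeholders.

```rust
use object_store::aws::AmazonS3Builder;
use std::collections::HashMap;

fn main() -> object_store::Result<()> {
    let options = HashMap::from([
        ("aws_default_region", "us-east-1"),
        ("aws_bucket", "my-bucket"),
        ("aws_unsigned_payload", "true"),
    ]);

    // Parse each key into an `AmazonS3ConfigKey`; unknown keys surface as
    // `UnknownConfigurationKey` errors instead of being silently ignored.
    let mut builder = AmazonS3Builder::from_env();
    for (key, value) in options {
        builder = builder.with_config(key.parse()?, value);
    }

    let _s3 = builder.build()?;
    Ok(())
}
```
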
+ #[deprecated(note = "Use with_config")] + pub fn try_with_option( + self, + key: impl AsRef, + value: impl Into, + ) -> Result { + Ok(self.with_config(key.as_ref().parse()?, value)) + } + + /// Hydrate builder from key value pairs + /// + /// This method will return an `UnknownConfigKey` error if any key cannot be parsed into [`AmazonS3ConfigKey`]. + #[deprecated(note = "Use with_config")] + #[allow(deprecated)] + pub fn try_with_options< + I: IntoIterator, impl Into)>, + >( + mut self, + options: I, + ) -> Result { + for (key, value) in options { + self = self.try_with_option(key, value)?; + } + Ok(self) + } + + /// Get config value via a [`AmazonS3ConfigKey`]. + /// + /// # Example + /// ``` + /// use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; + /// + /// let builder = AmazonS3Builder::from_env() + /// .with_bucket_name("foo"); + /// let bucket_name = builder.get_config_value(&AmazonS3ConfigKey::Bucket).unwrap_or_default(); + /// assert_eq!("foo", &bucket_name); + /// ``` + pub fn get_config_value(&self, key: &AmazonS3ConfigKey) -> Option { + match key { + AmazonS3ConfigKey::AccessKeyId => self.access_key_id.clone(), + AmazonS3ConfigKey::SecretAccessKey => self.secret_access_key.clone(), + AmazonS3ConfigKey::Region | AmazonS3ConfigKey::DefaultRegion => { + self.region.clone() + } + AmazonS3ConfigKey::Bucket => self.bucket_name.clone(), + AmazonS3ConfigKey::Endpoint => self.endpoint.clone(), + AmazonS3ConfigKey::Token => self.token.clone(), + AmazonS3ConfigKey::ImdsV1Fallback => Some(self.imdsv1_fallback.to_string()), + AmazonS3ConfigKey::VirtualHostedStyleRequest => { + Some(self.virtual_hosted_style_request.to_string()) + } + AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint.clone(), + AmazonS3ConfigKey::UnsignedPayload => Some(self.unsigned_payload.to_string()), + AmazonS3ConfigKey::Checksum => { + self.checksum_algorithm.as_ref().map(ToString::to_string) + } + AmazonS3ConfigKey::Client(key) => self.client_options.get_config_value(key), + AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { + self.container_credentials_relative_uri.clone() + } + AmazonS3ConfigKey::SkipSignature => Some(self.skip_signature.to_string()), + AmazonS3ConfigKey::CopyIfNotExists => { + self.copy_if_not_exists.as_ref().map(ToString::to_string) + } + } + } + + /// Sets properties on this builder based on a URL + /// + /// This is a separate member function to allow fallible computation to + /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] + fn parse_url(&mut self, url: &str) -> Result<()> { + let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; + let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + match parsed.scheme() { + "s3" | "s3a" => self.bucket_name = Some(host.to_string()), + "https" => match host.splitn(4, '.').collect_tuple() { + Some(("s3", region, "amazonaws", "com")) => { + self.region = Some(region.to_string()); + let bucket = parsed.path_segments().into_iter().flatten().next(); + if let Some(bucket) = bucket { + self.bucket_name = Some(bucket.into()); + } + } + Some((bucket, "s3", region, "amazonaws.com")) => { + self.bucket_name = Some(bucket.to_string()); + self.region = Some(region.to_string()); + self.virtual_hosted_style_request = true.into(); + } + Some((account, "r2", "cloudflarestorage", "com")) => { + self.region = Some("auto".to_string()); + let endpoint = format!("https://{account}.r2.cloudflarestorage.com"); + self.endpoint = Some(endpoint); + + let bucket = 
parsed.path_segments().into_iter().flatten().next(); + if let Some(bucket) = bucket { + self.bucket_name = Some(bucket.into()); + } + } + _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), + }, + scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + }; + Ok(()) + } + + /// Set the AWS Access Key (required) + pub fn with_access_key_id(mut self, access_key_id: impl Into) -> Self { + self.access_key_id = Some(access_key_id.into()); + self + } + + /// Set the AWS Secret Access Key (required) + pub fn with_secret_access_key( + mut self, + secret_access_key: impl Into, + ) -> Self { + self.secret_access_key = Some(secret_access_key.into()); + self + } + + /// Set the region (e.g. `us-east-1`) (required) + pub fn with_region(mut self, region: impl Into) -> Self { + self.region = Some(region.into()); + self + } + + /// Set the bucket_name (required) + pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { + self.bucket_name = Some(bucket_name.into()); + self + } + + /// Sets the endpoint for communicating with AWS S3. Default value + /// is based on region. The `endpoint` field should be consistent with + /// the field `virtual_hosted_style_request'. + /// + /// For example, this might be set to `"http://localhost:4566:` + /// for testing against a localstack instance. + /// If `virtual_hosted_style_request` is set to true then `endpoint` + /// should have bucket name included. + pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { + self.endpoint = Some(endpoint.into()); + self + } + + /// Set the token to use for requests (passed to underlying provider) + pub fn with_token(mut self, token: impl Into) -> Self { + self.token = Some(token.into()); + self + } + + /// Set the credential provider overriding any other options + pub fn with_credentials(mut self, credentials: AwsCredentialProvider) -> Self { + self.credentials = Some(credentials); + self + } + + /// Sets what protocol is allowed. If `allow_http` is : + /// * false (default): Only HTTPS are allowed + /// * true: HTTP and HTTPS are allowed + pub fn with_allow_http(mut self, allow_http: bool) -> Self { + self.client_options = self.client_options.with_allow_http(allow_http); + self + } + + /// Sets if virtual hosted style request has to be used. + /// If `virtual_hosted_style_request` is : + /// * false (default): Path style request is used + /// * true: Virtual hosted style request is used + /// + /// If the `endpoint` is provided then it should be + /// consistent with `virtual_hosted_style_request`. + /// i.e. if `virtual_hosted_style_request` is set to true + /// then `endpoint` should have bucket name included. + pub fn with_virtual_hosted_style_request( + mut self, + virtual_hosted_style_request: bool, + ) -> Self { + self.virtual_hosted_style_request = virtual_hosted_style_request.into(); + self + } + + /// Set the retry configuration + pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { + self.retry_config = retry_config; + self + } + + /// By default instance credentials will only be fetched over [IMDSv2], as AWS recommends + /// against having IMDSv1 enabled on EC2 instances as it is vulnerable to [SSRF attack] + /// + /// However, certain deployment environments, such as those running old versions of kube2iam, + /// may not support IMDSv2. This option will enable automatic fallback to using IMDSv1 + /// if the token endpoint returns a 403 error indicating that IMDSv2 is not supported. 
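
As a hedged illustration of the endpoint-related setters above (separate from the patch itself; the localhost endpoint, region, bucket, and `test` credentials are placeholders), a path-style configuration against a local S3-compatible service might look like this:

```rust
use object_store::aws::AmazonS3Builder;

fn main() -> object_store::Result<()> {
    let _s3 = AmazonS3Builder::new()
        .with_region("us-east-1")
        .with_bucket_name("my-bucket")
        .with_access_key_id("test")
        .with_secret_access_key("test")
        // Path-style requests (the default) against a plain-HTTP local
        // endpoint, so HTTP must be explicitly allowed.
        .with_endpoint("http://localhost:4566")
        .with_allow_http(true)
        .build()?;
    Ok(())
}
```
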
+ /// + /// This option has no effect if not using instance credentials + /// + /// [IMDSv2]: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html + /// [SSRF attack]: https://aws.amazon.com/blogs/security/defense-in-depth-open-firewalls-reverse-proxies-ssrf-vulnerabilities-ec2-instance-metadata-service/ + /// + pub fn with_imdsv1_fallback(mut self) -> Self { + self.imdsv1_fallback = true.into(); + self + } + + /// Sets if unsigned payload option has to be used. + /// See [unsigned payload option](https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-header-based-auth.html) + /// * false (default): Signed payload option is used, where the checksum for the request body is computed and included when constructing a canonical request. + /// * true: Unsigned payload option is used. `UNSIGNED-PAYLOAD` literal is included when constructing a canonical request, + pub fn with_unsigned_payload(mut self, unsigned_payload: bool) -> Self { + self.unsigned_payload = unsigned_payload.into(); + self + } + + /// If enabled, [`AmazonS3`] will not fetch credentials and will not sign requests + /// + /// This can be useful when interacting with public S3 buckets that deny authorized requests + pub fn with_skip_signature(mut self, skip_signature: bool) -> Self { + self.skip_signature = skip_signature.into(); + self + } + + /// Sets the [checksum algorithm] which has to be used for object integrity check during upload. + /// + /// [checksum algorithm]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html + pub fn with_checksum_algorithm(mut self, checksum_algorithm: Checksum) -> Self { + // Convert to String to enable deferred parsing of config + self.checksum_algorithm = Some(checksum_algorithm.into()); + self + } + + /// Set the [instance metadata endpoint](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html), + /// used primarily within AWS EC2. + /// + /// This defaults to the IPv4 endpoint: http://169.254.169.254. One can alternatively use the IPv6 + /// endpoint http://fd00:ec2::254. + pub fn with_metadata_endpoint(mut self, endpoint: impl Into) -> Self { + self.metadata_endpoint = Some(endpoint.into()); + self + } + + /// Set the proxy_url to be used by the underlying client + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_url(proxy_url); + self + } + + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.client_options = self + .client_options + .with_proxy_ca_certificate(proxy_ca_certificate); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; + self + } + + /// Configure how to provide `copy_if_not_exists` + pub fn with_copy_if_not_exists(mut self, config: S3CopyIfNotExists) -> Self { + self.copy_if_not_exists = Some(config.into()); + self + } + + /// Create a [`AmazonS3`] instance from the provided values, + /// consuming `self`. 
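
Putting several of these options together before calling `build`, a sketch of an instance-credentials configuration (illustrative only, not part of the patch; region and bucket are placeholders, and it assumes no credential environment variables are set):

```rust
use object_store::aws::{AmazonS3Builder, Checksum};

fn main() -> object_store::Result<()> {
    let _s3 = AmazonS3Builder::new()
        .with_region("us-east-1")
        .with_bucket_name("my-bucket")
        // With no static credentials (and, assumed here, no credential env
        // vars), `build` falls back to the instance metadata provider;
        // `with_imdsv1_fallback` permits IMDSv1 if the IMDSv2 token endpoint
        // returns a 403.
        .with_imdsv1_fallback()
        // Skip signing the payload but attach a SHA-256 checksum on upload.
        .with_unsigned_payload(true)
        .with_checksum_algorithm(Checksum::SHA256)
        .build()?;
    Ok(())
}
```
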
+ pub fn build(mut self) -> Result { + if let Some(url) = self.url.take() { + self.parse_url(&url)?; + } + + let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; + let region = self.region.context(MissingRegionSnafu)?; + let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; + let copy_if_not_exists = self.copy_if_not_exists.map(|x| x.get()).transpose()?; + + let credentials = if let Some(credentials) = self.credentials { + credentials + } else if self.access_key_id.is_some() || self.secret_access_key.is_some() { + match (self.access_key_id, self.secret_access_key, self.token) { + (Some(key_id), Some(secret_key), token) => { + info!("Using Static credential provider"); + let credential = AwsCredential { + key_id, + secret_key, + token, + }; + Arc::new(StaticCredentialProvider::new(credential)) as _ + } + (None, Some(_), _) => return Err(Error::MissingAccessKeyId.into()), + (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), + (None, None, _) => unreachable!(), + } + } else if let (Ok(token_path), Ok(role_arn)) = ( + std::env::var("AWS_WEB_IDENTITY_TOKEN_FILE"), + std::env::var("AWS_ROLE_ARN"), + ) { + // TODO: Replace with `AmazonS3Builder::credentials_from_env` + info!("Using WebIdentity credential provider"); + + let session_name = std::env::var("AWS_ROLE_SESSION_NAME") + .unwrap_or_else(|_| "WebIdentitySession".to_string()); + + let endpoint = format!("https://sts.{region}.amazonaws.com"); + + // Disallow non-HTTPs requests + let client = self + .client_options + .clone() + .with_allow_http(false) + .client()?; + + let token = WebIdentityProvider { + token_path, + session_name, + role_arn, + endpoint, + }; + + Arc::new(TokenCredentialProvider::new( + token, + client, + self.retry_config.clone(), + )) as _ + } else if let Some(uri) = self.container_credentials_relative_uri { + info!("Using Task credential provider"); + Arc::new(TaskCredentialProvider { + url: format!("http://169.254.170.2{uri}"), + retry: self.retry_config.clone(), + // The instance metadata endpoint is access over HTTP + client: self.client_options.clone().with_allow_http(true).client()?, + cache: Default::default(), + }) as _ + } else { + info!("Using Instance credential provider"); + + let token = InstanceCredentialProvider { + cache: Default::default(), + imdsv1_fallback: self.imdsv1_fallback.get()?, + metadata_endpoint: self + .metadata_endpoint + .unwrap_or_else(|| DEFAULT_METADATA_ENDPOINT.into()), + }; + + Arc::new(TokenCredentialProvider::new( + token, + self.client_options.metadata_client()?, + self.retry_config.clone(), + )) as _ + }; + + let endpoint: String; + let bucket_endpoint: String; + + // If `endpoint` is provided then its assumed to be consistent with + // `virtual_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then + // `endpoint` should have bucket name included. + if self.virtual_hosted_style_request.get()? 
{ + endpoint = self + .endpoint + .unwrap_or_else(|| format!("https://{bucket}.s3.{region}.amazonaws.com")); + bucket_endpoint = endpoint.clone(); + } else { + endpoint = self + .endpoint + .unwrap_or_else(|| format!("https://s3.{region}.amazonaws.com")); + bucket_endpoint = format!("{endpoint}/{bucket}"); + } + + let config = S3Config { + region, + endpoint, + bucket, + bucket_endpoint, + credentials, + retry_config: self.retry_config, + client_options: self.client_options, + sign_payload: !self.unsigned_payload.get()?, + skip_signature: self.skip_signature.get()?, + checksum, + copy_if_not_exists, + }; + + let client = Arc::new(S3Client::new(config)?); + + Ok(AmazonS3 { client }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + #[test] + fn s3_test_config_from_map() { + let aws_access_key_id = "object_store:fake_access_key_id".to_string(); + let aws_secret_access_key = "object_store:fake_secret_key".to_string(); + let aws_default_region = "object_store:fake_default_region".to_string(); + let aws_endpoint = "object_store:fake_endpoint".to_string(); + let aws_session_token = "object_store:fake_session_token".to_string(); + let options = HashMap::from([ + ("aws_access_key_id", aws_access_key_id.clone()), + ("aws_secret_access_key", aws_secret_access_key), + ("aws_default_region", aws_default_region.clone()), + ("aws_endpoint", aws_endpoint.clone()), + ("aws_session_token", aws_session_token.clone()), + ("aws_unsigned_payload", "true".to_string()), + ("aws_checksum_algorithm", "sha256".to_string()), + ]); + + let builder = options + .into_iter() + .fold(AmazonS3Builder::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }) + .with_config(AmazonS3ConfigKey::SecretAccessKey, "new-secret-key"); + + assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); + assert_eq!(builder.secret_access_key.unwrap(), "new-secret-key"); + assert_eq!(builder.region.unwrap(), aws_default_region); + assert_eq!(builder.endpoint.unwrap(), aws_endpoint); + assert_eq!(builder.token.unwrap(), aws_session_token); + assert_eq!( + builder.checksum_algorithm.unwrap().get().unwrap(), + Checksum::SHA256 + ); + assert!(builder.unsigned_payload.get().unwrap()); + } + + #[test] + fn s3_test_config_get_value() { + let aws_access_key_id = "object_store:fake_access_key_id".to_string(); + let aws_secret_access_key = "object_store:fake_secret_key".to_string(); + let aws_default_region = "object_store:fake_default_region".to_string(); + let aws_endpoint = "object_store:fake_endpoint".to_string(); + let aws_session_token = "object_store:fake_session_token".to_string(); + + let builder = AmazonS3Builder::new() + .with_config(AmazonS3ConfigKey::AccessKeyId, &aws_access_key_id) + .with_config(AmazonS3ConfigKey::SecretAccessKey, &aws_secret_access_key) + .with_config(AmazonS3ConfigKey::DefaultRegion, &aws_default_region) + .with_config(AmazonS3ConfigKey::Endpoint, &aws_endpoint) + .with_config(AmazonS3ConfigKey::Token, &aws_session_token) + .with_config(AmazonS3ConfigKey::UnsignedPayload, "true"); + + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::AccessKeyId) + .unwrap(), + aws_access_key_id + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::SecretAccessKey) + .unwrap(), + aws_secret_access_key + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::DefaultRegion) + .unwrap(), + aws_default_region + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::Endpoint) + .unwrap(), + aws_endpoint 
+ ); + assert_eq!( + builder.get_config_value(&AmazonS3ConfigKey::Token).unwrap(), + aws_session_token + ); + assert_eq!( + builder + .get_config_value(&AmazonS3ConfigKey::UnsignedPayload) + .unwrap(), + "true" + ); + } + + #[test] + fn s3_test_urls() { + let mut builder = AmazonS3Builder::new(); + builder.parse_url("s3://bucket/path").unwrap(); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("s3://buckets.can.have.dots/path") + .unwrap(); + assert_eq!( + builder.bucket_name, + Some("buckets.can.have.dots".to_string()) + ); + + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://s3.region.amazonaws.com") + .unwrap(); + assert_eq!(builder.region, Some("region".to_string())); + + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://s3.region.amazonaws.com/bucket") + .unwrap(); + assert_eq!(builder.region, Some("region".to_string())); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://s3.region.amazonaws.com/bucket.with.dot/path") + .unwrap(); + assert_eq!(builder.region, Some("region".to_string())); + assert_eq!(builder.bucket_name, Some("bucket.with.dot".to_string())); + + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://bucket.s3.region.amazonaws.com") + .unwrap(); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + assert_eq!(builder.region, Some("region".to_string())); + assert!(builder.virtual_hosted_style_request.get().unwrap()); + + let mut builder = AmazonS3Builder::new(); + builder + .parse_url("https://account123.r2.cloudflarestorage.com/bucket-123") + .unwrap(); + + assert_eq!(builder.bucket_name, Some("bucket-123".to_string())); + assert_eq!(builder.region, Some("auto".to_string())); + assert_eq!( + builder.endpoint, + Some("https://account123.r2.cloudflarestorage.com".to_string()) + ); + + let err_cases = [ + "mailto://bucket/path", + "https://s3.bucket.mydomain.com", + "https://s3.bucket.foo.amazonaws.com", + "https://bucket.mydomain.region.amazonaws.com", + "https://bucket.s3.region.bar.amazonaws.com", + "https://bucket.foo.s3.amazonaws.com", + ]; + let mut builder = AmazonS3Builder::new(); + for case in err_cases { + builder.parse_url(case).unwrap_err(); + } + } + + #[tokio::test] + async fn s3_test_proxy_url() { + let s3 = AmazonS3Builder::new() + .with_access_key_id("access_key_id") + .with_secret_access_key("secret_access_key") + .with_region("region") + .with_bucket_name("bucket_name") + .with_allow_http(true) + .with_proxy_url("https://example.com") + .build(); + + assert!(s3.is_ok()); + + let err = AmazonS3Builder::new() + .with_access_key_id("access_key_id") + .with_secret_access_key("secret_access_key") + .with_region("region") + .with_bucket_name("bucket_name") + .with_allow_http(true) + .with_proxy_url("asdf://example.com") + .build() + .unwrap_err() + .to_string(); + + assert_eq!( + "Generic HTTP client error: builder error: unknown proxy scheme", + err + ); + } + + #[test] + fn test_invalid_config() { + let err = AmazonS3Builder::new() + .with_config(AmazonS3ConfigKey::ImdsV1Fallback, "enabled") + .with_bucket_name("bucket") + .with_region("region") + .build() + .unwrap_err() + .to_string(); + + assert_eq!( + err, + "Generic Config error: failed to parse \"enabled\" as boolean" + ); + + let err = AmazonS3Builder::new() + .with_config(AmazonS3ConfigKey::Checksum, "md5") + .with_bucket_name("bucket") + 
.with_region("region") + .build() + .unwrap_err() + .to_string(); + + assert_eq!( + err, + "Generic Config error: \"md5\" is not a valid checksum algorithm" + ); + } +} diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 6d5aecea2d17..a4e39c3b88dd 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -35,40 +35,33 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; -use itertools::Itertools; use reqwest::Method; -use serde::{Deserialize, Serialize}; -use snafu::{ensure, OptionExt, ResultExt, Snafu}; -use std::{str::FromStr, sync::Arc, time::Duration}; +use std::{sync::Arc, time::Duration}; use tokio::io::AsyncWrite; -use tracing::info; use url::Url; -use crate::aws::client::{S3Client, S3Config}; -use crate::aws::credential::{ - InstanceCredentialProvider, TaskCredentialProvider, WebIdentityProvider, -}; +use crate::aws::client::S3Client; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; -use crate::client::{ - ClientConfigKey, CredentialProvider, StaticCredentialProvider, - TokenCredentialProvider, -}; -use crate::config::ConfigValue; +use crate::client::CredentialProvider; use crate::multipart::{PartId, PutPart, WriteMultiPart}; use crate::signer::Signer; use crate::{ - ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, Path, PutResult, Result, RetryConfig, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, + PutResult, Result, }; +mod builder; mod checksum; mod client; mod copy; mod credential; +mod resolve; +pub use builder::{AmazonS3Builder, AmazonS3ConfigKey}; pub use checksum::Checksum; pub use copy::S3CopyIfNotExists; +pub use resolve::resolve_bucket_region; // http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html // @@ -90,103 +83,6 @@ const STORE: &str = "S3"; pub type AwsCredentialProvider = Arc>; pub use credential::{AwsAuthorizer, AwsCredential}; -/// Default metadata endpoint -static DEFAULT_METADATA_ENDPOINT: &str = "http://169.254.169.254"; - -/// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] -#[allow(missing_docs)] -enum Error { - #[snafu(display("Missing region"))] - MissingRegion, - - #[snafu(display("Missing bucket name"))] - MissingBucketName, - - #[snafu(display("Missing AccessKeyId"))] - MissingAccessKeyId, - - #[snafu(display("Missing SecretAccessKey"))] - MissingSecretAccessKey, - - #[snafu(display("Unable parse source url. 
Url: {}, Error: {}", url, source))] - UnableToParseUrl { - source: url::ParseError, - url: String, - }, - - #[snafu(display( - "Unknown url scheme cannot be parsed into storage location: {}", - scheme - ))] - UnknownUrlScheme { scheme: String }, - - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] - UrlNotRecognised { url: String }, - - #[snafu(display("Configuration key: '{}' is not known.", key))] - UnknownConfigurationKey { key: String }, - - #[snafu(display("Bucket '{}' not found", bucket))] - BucketNotFound { bucket: String }, - - #[snafu(display("Failed to resolve region for bucket '{}'", bucket))] - ResolveRegion { - bucket: String, - source: reqwest::Error, - }, - - #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] - RegionParse { bucket: String }, -} - -impl From for super::Error { - fn from(source: Error) -> Self { - match source { - Error::UnknownConfigurationKey { key } => { - Self::UnknownConfigurationKey { store: STORE, key } - } - _ => Self::Generic { - store: STORE, - source: Box::new(source), - }, - } - } -} - -/// Get the bucket region using the [HeadBucket API]. This will fail if the bucket does not exist. -/// -/// [HeadBucket API]: https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadBucket.html -pub async fn resolve_bucket_region( - bucket: &str, - client_options: &ClientOptions, -) -> Result { - use reqwest::StatusCode; - - let endpoint = format!("https://{}.s3.amazonaws.com", bucket); - - let client = client_options.client()?; - - let response = client - .head(&endpoint) - .send() - .await - .context(ResolveRegionSnafu { bucket })?; - - ensure!( - response.status() != StatusCode::NOT_FOUND, - BucketNotFoundSnafu { bucket } - ); - - let region = response - .headers() - .get("x-amz-bucket-region") - .and_then(|x| x.to_str().ok()) - .context(RegionParseSnafu { bucket })?; - - Ok(region.to_string()) -} - /// Interface for [Amazon S3](https://aws.amazon.com/s3/). #[derive(Debug)] pub struct AmazonS3 { @@ -256,8 +152,10 @@ impl Signer for AmazonS3 { AwsAuthorizer::new(&credential, "s3", &self.client.config().region); let path_url = self.path_url(path); - let mut url = - Url::parse(&path_url).context(UnableToParseUrlSnafu { url: path_url })?; + let mut url = Url::parse(&path_url).map_err(|e| crate::Error::Generic { + store: STORE, + source: format!("Unable to parse url {path_url}: {e}").into(), + })?; authorizer.sign(method, &mut url, expires_in); @@ -381,891 +279,23 @@ impl PutPart for S3MultiPartUpload { } } -/// Configure a connection to Amazon S3 using the specified credentials in -/// the specified Amazon region and bucket. 
-/// -/// # Example -/// ``` -/// # let REGION = "foo"; -/// # let BUCKET_NAME = "foo"; -/// # let ACCESS_KEY_ID = "foo"; -/// # let SECRET_KEY = "foo"; -/// # use object_store::aws::AmazonS3Builder; -/// let s3 = AmazonS3Builder::new() -/// .with_region(REGION) -/// .with_bucket_name(BUCKET_NAME) -/// .with_access_key_id(ACCESS_KEY_ID) -/// .with_secret_access_key(SECRET_KEY) -/// .build(); -/// ``` -#[derive(Debug, Default, Clone)] -pub struct AmazonS3Builder { - /// Access key id - access_key_id: Option, - /// Secret access_key - secret_access_key: Option, - /// Region - region: Option, - /// Bucket name - bucket_name: Option, - /// Endpoint for communicating with AWS S3 - endpoint: Option, - /// Token to use for requests - token: Option, - /// Url - url: Option, - /// Retry config - retry_config: RetryConfig, - /// When set to true, fallback to IMDSv1 - imdsv1_fallback: ConfigValue, - /// When set to true, virtual hosted style request has to be used - virtual_hosted_style_request: ConfigValue, - /// When set to true, unsigned payload option has to be used - unsigned_payload: ConfigValue, - /// Checksum algorithm which has to be used for object integrity check during upload - checksum_algorithm: Option>, - /// Metadata endpoint, see - metadata_endpoint: Option, - /// Container credentials URL, see - container_credentials_relative_uri: Option, - /// Client options - client_options: ClientOptions, - /// Credentials - credentials: Option, - /// Skip signing requests - skip_signature: ConfigValue, - /// Copy if not exists - copy_if_not_exists: Option>, -} - -/// Configuration keys for [`AmazonS3Builder`] -/// -/// Configuration via keys can be done via [`AmazonS3Builder::with_config`] -/// -/// # Example -/// ``` -/// # use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; -/// let builder = AmazonS3Builder::new() -/// .with_config("aws_access_key_id".parse().unwrap(), "my-access-key-id") -/// .with_config(AmazonS3ConfigKey::DefaultRegion, "my-default-region"); -/// ``` -#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] -#[non_exhaustive] -pub enum AmazonS3ConfigKey { - /// AWS Access Key - /// - /// See [`AmazonS3Builder::with_access_key_id`] for details. - /// - /// Supported keys: - /// - `aws_access_key_id` - /// - `access_key_id` - AccessKeyId, - - /// Secret Access Key - /// - /// See [`AmazonS3Builder::with_secret_access_key`] for details. - /// - /// Supported keys: - /// - `aws_secret_access_key` - /// - `secret_access_key` - SecretAccessKey, - - /// Region - /// - /// See [`AmazonS3Builder::with_region`] for details. - /// - /// Supported keys: - /// - `aws_region` - /// - `region` - Region, - - /// Default region - /// - /// See [`AmazonS3Builder::with_region`] for details. - /// - /// Supported keys: - /// - `aws_default_region` - /// - `default_region` - DefaultRegion, - - /// Bucket name - /// - /// See [`AmazonS3Builder::with_bucket_name`] for details. - /// - /// Supported keys: - /// - `aws_bucket` - /// - `aws_bucket_name` - /// - `bucket` - /// - `bucket_name` - Bucket, - - /// Sets custom endpoint for communicating with AWS S3. - /// - /// See [`AmazonS3Builder::with_endpoint`] for details. - /// - /// Supported keys: - /// - `aws_endpoint` - /// - `aws_endpoint_url` - /// - `endpoint` - /// - `endpoint_url` - Endpoint, - - /// Token to use for requests (passed to underlying provider) - /// - /// See [`AmazonS3Builder::with_token`] for details. 
- /// - /// Supported keys: - /// - `aws_session_token` - /// - `aws_token` - /// - `session_token` - /// - `token` - Token, - - /// Fall back to ImdsV1 - /// - /// See [`AmazonS3Builder::with_imdsv1_fallback`] for details. - /// - /// Supported keys: - /// - `aws_imdsv1_fallback` - /// - `imdsv1_fallback` - ImdsV1Fallback, - - /// If virtual hosted style request has to be used - /// - /// See [`AmazonS3Builder::with_virtual_hosted_style_request`] for details. - /// - /// Supported keys: - /// - `aws_virtual_hosted_style_request` - /// - `virtual_hosted_style_request` - VirtualHostedStyleRequest, - - /// Avoid computing payload checksum when calculating signature. - /// - /// See [`AmazonS3Builder::with_unsigned_payload`] for details. - /// - /// Supported keys: - /// - `aws_unsigned_payload` - /// - `unsigned_payload` - UnsignedPayload, - - /// Set the checksum algorithm for this client - /// - /// See [`AmazonS3Builder::with_checksum_algorithm`] - Checksum, - - /// Set the instance metadata endpoint - /// - /// See [`AmazonS3Builder::with_metadata_endpoint`] for details. - /// - /// Supported keys: - /// - `aws_metadata_endpoint` - /// - `metadata_endpoint` - MetadataEndpoint, - - /// Set the container credentials relative URI - /// - /// - ContainerCredentialsRelativeUri, - - /// Configure how to provide [`ObjectStore::copy_if_not_exists`] - /// - /// See [`S3CopyIfNotExists`] - CopyIfNotExists, - - /// Skip signing request - SkipSignature, - - /// Client options - Client(ClientConfigKey), -} - -impl AsRef for AmazonS3ConfigKey { - fn as_ref(&self) -> &str { - match self { - Self::AccessKeyId => "aws_access_key_id", - Self::SecretAccessKey => "aws_secret_access_key", - Self::Region => "aws_region", - Self::Bucket => "aws_bucket", - Self::Endpoint => "aws_endpoint", - Self::Token => "aws_session_token", - Self::ImdsV1Fallback => "aws_imdsv1_fallback", - Self::VirtualHostedStyleRequest => "aws_virtual_hosted_style_request", - Self::DefaultRegion => "aws_default_region", - Self::MetadataEndpoint => "aws_metadata_endpoint", - Self::UnsignedPayload => "aws_unsigned_payload", - Self::Checksum => "aws_checksum_algorithm", - Self::ContainerCredentialsRelativeUri => { - "aws_container_credentials_relative_uri" - } - Self::SkipSignature => "aws_skip_signature", - Self::CopyIfNotExists => "copy_if_not_exists", - Self::Client(opt) => opt.as_ref(), - } - } -} - -impl FromStr for AmazonS3ConfigKey { - type Err = super::Error; - - fn from_str(s: &str) -> Result { - match s { - "aws_access_key_id" | "access_key_id" => Ok(Self::AccessKeyId), - "aws_secret_access_key" | "secret_access_key" => Ok(Self::SecretAccessKey), - "aws_default_region" | "default_region" => Ok(Self::DefaultRegion), - "aws_region" | "region" => Ok(Self::Region), - "aws_bucket" | "aws_bucket_name" | "bucket_name" | "bucket" => { - Ok(Self::Bucket) - } - "aws_endpoint_url" | "aws_endpoint" | "endpoint_url" | "endpoint" => { - Ok(Self::Endpoint) - } - "aws_session_token" | "aws_token" | "session_token" | "token" => { - Ok(Self::Token) - } - "aws_virtual_hosted_style_request" | "virtual_hosted_style_request" => { - Ok(Self::VirtualHostedStyleRequest) - } - "aws_imdsv1_fallback" | "imdsv1_fallback" => Ok(Self::ImdsV1Fallback), - "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), - "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), - "aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), - "aws_container_credentials_relative_uri" => { - 
Ok(Self::ContainerCredentialsRelativeUri) - } - "aws_skip_signature" | "skip_signature" => Ok(Self::SkipSignature), - "copy_if_not_exists" => Ok(Self::CopyIfNotExists), - // Backwards compatibility - "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), - _ => match s.parse() { - Ok(key) => Ok(Self::Client(key)), - Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), - }, - } - } -} - -impl AmazonS3Builder { - /// Create a new [`AmazonS3Builder`] with default values. - pub fn new() -> Self { - Default::default() - } - - /// Fill the [`AmazonS3Builder`] with regular AWS environment variables - /// - /// Variables extracted from environment: - /// * `AWS_ACCESS_KEY_ID` -> access_key_id - /// * `AWS_SECRET_ACCESS_KEY` -> secret_access_key - /// * `AWS_DEFAULT_REGION` -> region - /// * `AWS_ENDPOINT` -> endpoint - /// * `AWS_SESSION_TOKEN` -> token - /// * `AWS_CONTAINER_CREDENTIALS_RELATIVE_URI` -> - /// * `AWS_ALLOW_HTTP` -> set to "true" to permit HTTP connections without TLS - /// # Example - /// ``` - /// use object_store::aws::AmazonS3Builder; - /// - /// let s3 = AmazonS3Builder::from_env() - /// .with_bucket_name("foo") - /// .build(); - /// ``` - pub fn from_env() -> Self { - let mut builder: Self = Default::default(); - - for (os_key, os_value) in std::env::vars_os() { - if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { - if key.starts_with("AWS_") { - if let Ok(config_key) = key.to_ascii_lowercase().parse() { - builder = builder.with_config(config_key, value); - } - } - } - } - - builder - } - - /// Parse available connection info form a well-known storage URL. - /// - /// The supported url schemes are: - /// - /// - `s3:///` - /// - `s3a:///` - /// - `https://s3..amazonaws.com/` - /// - `https://.s3..amazonaws.com` - /// - `https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket` - /// - /// Note: Settings derived from the URL will override any others set on this builder - /// - /// # Example - /// ``` - /// use object_store::aws::AmazonS3Builder; - /// - /// let s3 = AmazonS3Builder::from_env() - /// .with_url("s3://bucket/path") - /// .build(); - /// ``` - pub fn with_url(mut self, url: impl Into) -> Self { - self.url = Some(url.into()); - self - } - - /// Set an option on the builder via a key - value pair. 
- pub fn with_config( - mut self, - key: AmazonS3ConfigKey, - value: impl Into, - ) -> Self { - match key { - AmazonS3ConfigKey::AccessKeyId => self.access_key_id = Some(value.into()), - AmazonS3ConfigKey::SecretAccessKey => { - self.secret_access_key = Some(value.into()) - } - AmazonS3ConfigKey::Region => self.region = Some(value.into()), - AmazonS3ConfigKey::Bucket => self.bucket_name = Some(value.into()), - AmazonS3ConfigKey::Endpoint => self.endpoint = Some(value.into()), - AmazonS3ConfigKey::Token => self.token = Some(value.into()), - AmazonS3ConfigKey::ImdsV1Fallback => self.imdsv1_fallback.parse(value), - AmazonS3ConfigKey::VirtualHostedStyleRequest => { - self.virtual_hosted_style_request.parse(value) - } - AmazonS3ConfigKey::DefaultRegion => { - self.region = self.region.or_else(|| Some(value.into())) - } - AmazonS3ConfigKey::MetadataEndpoint => { - self.metadata_endpoint = Some(value.into()) - } - AmazonS3ConfigKey::UnsignedPayload => self.unsigned_payload.parse(value), - AmazonS3ConfigKey::Checksum => { - self.checksum_algorithm = Some(ConfigValue::Deferred(value.into())) - } - AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { - self.container_credentials_relative_uri = Some(value.into()) - } - AmazonS3ConfigKey::Client(key) => { - self.client_options = self.client_options.with_config(key, value) - } - AmazonS3ConfigKey::SkipSignature => self.skip_signature.parse(value), - AmazonS3ConfigKey::CopyIfNotExists => { - self.copy_if_not_exists = Some(ConfigValue::Deferred(value.into())) - } - }; - self - } - - /// Set an option on the builder via a key - value pair. - /// - /// This method will return an `UnknownConfigKey` error if key cannot be parsed into [`AmazonS3ConfigKey`]. - #[deprecated(note = "Use with_config")] - pub fn try_with_option( - self, - key: impl AsRef, - value: impl Into, - ) -> Result { - Ok(self.with_config(key.as_ref().parse()?, value)) - } - - /// Hydrate builder from key value pairs - /// - /// This method will return an `UnknownConfigKey` error if any key cannot be parsed into [`AmazonS3ConfigKey`]. - #[deprecated(note = "Use with_config")] - #[allow(deprecated)] - pub fn try_with_options< - I: IntoIterator, impl Into)>, - >( - mut self, - options: I, - ) -> Result { - for (key, value) in options { - self = self.try_with_option(key, value)?; - } - Ok(self) - } - - /// Get config value via a [`AmazonS3ConfigKey`]. 
- /// - /// # Example - /// ``` - /// use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey}; - /// - /// let builder = AmazonS3Builder::from_env() - /// .with_bucket_name("foo"); - /// let bucket_name = builder.get_config_value(&AmazonS3ConfigKey::Bucket).unwrap_or_default(); - /// assert_eq!("foo", &bucket_name); - /// ``` - pub fn get_config_value(&self, key: &AmazonS3ConfigKey) -> Option { - match key { - AmazonS3ConfigKey::AccessKeyId => self.access_key_id.clone(), - AmazonS3ConfigKey::SecretAccessKey => self.secret_access_key.clone(), - AmazonS3ConfigKey::Region | AmazonS3ConfigKey::DefaultRegion => { - self.region.clone() - } - AmazonS3ConfigKey::Bucket => self.bucket_name.clone(), - AmazonS3ConfigKey::Endpoint => self.endpoint.clone(), - AmazonS3ConfigKey::Token => self.token.clone(), - AmazonS3ConfigKey::ImdsV1Fallback => Some(self.imdsv1_fallback.to_string()), - AmazonS3ConfigKey::VirtualHostedStyleRequest => { - Some(self.virtual_hosted_style_request.to_string()) - } - AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint.clone(), - AmazonS3ConfigKey::UnsignedPayload => Some(self.unsigned_payload.to_string()), - AmazonS3ConfigKey::Checksum => { - self.checksum_algorithm.as_ref().map(ToString::to_string) - } - AmazonS3ConfigKey::Client(key) => self.client_options.get_config_value(key), - AmazonS3ConfigKey::ContainerCredentialsRelativeUri => { - self.container_credentials_relative_uri.clone() - } - AmazonS3ConfigKey::SkipSignature => Some(self.skip_signature.to_string()), - AmazonS3ConfigKey::CopyIfNotExists => { - self.copy_if_not_exists.as_ref().map(ToString::to_string) - } - } - } - - /// Sets properties on this builder based on a URL - /// - /// This is a separate member function to allow fallible computation to - /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] - fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; - match parsed.scheme() { - "s3" | "s3a" => self.bucket_name = Some(host.to_string()), - "https" => match host.splitn(4, '.').collect_tuple() { - Some(("s3", region, "amazonaws", "com")) => { - self.region = Some(region.to_string()); - let bucket = parsed.path_segments().into_iter().flatten().next(); - if let Some(bucket) = bucket { - self.bucket_name = Some(bucket.into()); - } - } - Some((bucket, "s3", region, "amazonaws.com")) => { - self.bucket_name = Some(bucket.to_string()); - self.region = Some(region.to_string()); - self.virtual_hosted_style_request = true.into(); - } - Some((account, "r2", "cloudflarestorage", "com")) => { - self.region = Some("auto".to_string()); - let endpoint = format!("https://{account}.r2.cloudflarestorage.com"); - self.endpoint = Some(endpoint); - - let bucket = parsed.path_segments().into_iter().flatten().next(); - if let Some(bucket) = bucket { - self.bucket_name = Some(bucket.into()); - } - } - _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), - }, - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), - }; - Ok(()) - } - - /// Set the AWS Access Key (required) - pub fn with_access_key_id(mut self, access_key_id: impl Into) -> Self { - self.access_key_id = Some(access_key_id.into()); - self - } - - /// Set the AWS Secret Access Key (required) - pub fn with_secret_access_key( - mut self, - secret_access_key: impl Into, - ) -> Self { - self.secret_access_key = Some(secret_access_key.into()); - self - } - - /// Set the 
region (e.g. `us-east-1`) (required) - pub fn with_region(mut self, region: impl Into) -> Self { - self.region = Some(region.into()); - self - } - - /// Set the bucket_name (required) - pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { - self.bucket_name = Some(bucket_name.into()); - self - } - - /// Sets the endpoint for communicating with AWS S3. Default value - /// is based on region. The `endpoint` field should be consistent with - /// the field `virtual_hosted_style_request'. - /// - /// For example, this might be set to `"http://localhost:4566:` - /// for testing against a localstack instance. - /// If `virtual_hosted_style_request` is set to true then `endpoint` - /// should have bucket name included. - pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { - self.endpoint = Some(endpoint.into()); - self - } - - /// Set the token to use for requests (passed to underlying provider) - pub fn with_token(mut self, token: impl Into) -> Self { - self.token = Some(token.into()); - self - } - - /// Set the credential provider overriding any other options - pub fn with_credentials(mut self, credentials: AwsCredentialProvider) -> Self { - self.credentials = Some(credentials); - self - } - - /// Sets what protocol is allowed. If `allow_http` is : - /// * false (default): Only HTTPS are allowed - /// * true: HTTP and HTTPS are allowed - pub fn with_allow_http(mut self, allow_http: bool) -> Self { - self.client_options = self.client_options.with_allow_http(allow_http); - self - } - - /// Sets if virtual hosted style request has to be used. - /// If `virtual_hosted_style_request` is : - /// * false (default): Path style request is used - /// * true: Virtual hosted style request is used - /// - /// If the `endpoint` is provided then it should be - /// consistent with `virtual_hosted_style_request`. - /// i.e. if `virtual_hosted_style_request` is set to true - /// then `endpoint` should have bucket name included. - pub fn with_virtual_hosted_style_request( - mut self, - virtual_hosted_style_request: bool, - ) -> Self { - self.virtual_hosted_style_request = virtual_hosted_style_request.into(); - self - } - - /// Set the retry configuration - pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { - self.retry_config = retry_config; - self - } - - /// By default instance credentials will only be fetched over [IMDSv2], as AWS recommends - /// against having IMDSv1 enabled on EC2 instances as it is vulnerable to [SSRF attack] - /// - /// However, certain deployment environments, such as those running old versions of kube2iam, - /// may not support IMDSv2. This option will enable automatic fallback to using IMDSv1 - /// if the token endpoint returns a 403 error indicating that IMDSv2 is not supported. - /// - /// This option has no effect if not using instance credentials - /// - /// [IMDSv2]: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html - /// [SSRF attack]: https://aws.amazon.com/blogs/security/defense-in-depth-open-firewalls-reverse-proxies-ssrf-vulnerabilities-ec2-instance-metadata-service/ - /// - pub fn with_imdsv1_fallback(mut self) -> Self { - self.imdsv1_fallback = true.into(); - self - } - - /// Sets if unsigned payload option has to be used. - /// See [unsigned payload option](https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-header-based-auth.html) - /// * false (default): Signed payload option is used, where the checksum for the request body is computed and included when constructing a canonical request. 
- /// * true: Unsigned payload option is used. `UNSIGNED-PAYLOAD` literal is included when constructing a canonical request, - pub fn with_unsigned_payload(mut self, unsigned_payload: bool) -> Self { - self.unsigned_payload = unsigned_payload.into(); - self - } - - /// If enabled, [`AmazonS3`] will not fetch credentials and will not sign requests - /// - /// This can be useful when interacting with public S3 buckets that deny authorized requests - pub fn with_skip_signature(mut self, skip_signature: bool) -> Self { - self.skip_signature = skip_signature.into(); - self - } - - /// Sets the [checksum algorithm] which has to be used for object integrity check during upload. - /// - /// [checksum algorithm]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html - pub fn with_checksum_algorithm(mut self, checksum_algorithm: Checksum) -> Self { - // Convert to String to enable deferred parsing of config - self.checksum_algorithm = Some(checksum_algorithm.into()); - self - } - - /// Set the [instance metadata endpoint](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html), - /// used primarily within AWS EC2. - /// - /// This defaults to the IPv4 endpoint: http://169.254.169.254. One can alternatively use the IPv6 - /// endpoint http://fd00:ec2::254. - pub fn with_metadata_endpoint(mut self, endpoint: impl Into) -> Self { - self.metadata_endpoint = Some(endpoint.into()); - self - } - - /// Set the proxy_url to be used by the underlying client - pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { - self.client_options = self.client_options.with_proxy_url(proxy_url); - self - } - - /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { - self.client_options = self - .client_options - .with_proxy_ca_certificate(proxy_ca_certificate); - self - } - - /// Set a list of hosts to exclude from proxy connections - pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { - self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); - self - } - - /// Sets the client options, overriding any already set - pub fn with_client_options(mut self, options: ClientOptions) -> Self { - self.client_options = options; - self - } - - /// Configure how to provide [`ObjectStore::copy_if_not_exists`] - pub fn with_copy_if_not_exists(mut self, config: S3CopyIfNotExists) -> Self { - self.copy_if_not_exists = Some(config.into()); - self - } - - /// Create a [`AmazonS3`] instance from the provided values, - /// consuming `self`. 
- pub fn build(mut self) -> Result { - if let Some(url) = self.url.take() { - self.parse_url(&url)?; - } - - let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; - let region = self.region.context(MissingRegionSnafu)?; - let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; - let copy_if_not_exists = self.copy_if_not_exists.map(|x| x.get()).transpose()?; - - let credentials = if let Some(credentials) = self.credentials { - credentials - } else if self.access_key_id.is_some() || self.secret_access_key.is_some() { - match (self.access_key_id, self.secret_access_key, self.token) { - (Some(key_id), Some(secret_key), token) => { - info!("Using Static credential provider"); - let credential = AwsCredential { - key_id, - secret_key, - token, - }; - Arc::new(StaticCredentialProvider::new(credential)) as _ - } - (None, Some(_), _) => return Err(Error::MissingAccessKeyId.into()), - (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), - (None, None, _) => unreachable!(), - } - } else if let (Ok(token_path), Ok(role_arn)) = ( - std::env::var("AWS_WEB_IDENTITY_TOKEN_FILE"), - std::env::var("AWS_ROLE_ARN"), - ) { - // TODO: Replace with `AmazonS3Builder::credentials_from_env` - info!("Using WebIdentity credential provider"); - - let session_name = std::env::var("AWS_ROLE_SESSION_NAME") - .unwrap_or_else(|_| "WebIdentitySession".to_string()); - - let endpoint = format!("https://sts.{region}.amazonaws.com"); - - // Disallow non-HTTPs requests - let client = self - .client_options - .clone() - .with_allow_http(false) - .client()?; - - let token = WebIdentityProvider { - token_path, - session_name, - role_arn, - endpoint, - }; - - Arc::new(TokenCredentialProvider::new( - token, - client, - self.retry_config.clone(), - )) as _ - } else if let Some(uri) = self.container_credentials_relative_uri { - info!("Using Task credential provider"); - Arc::new(TaskCredentialProvider { - url: format!("http://169.254.170.2{uri}"), - retry: self.retry_config.clone(), - // The instance metadata endpoint is access over HTTP - client: self.client_options.clone().with_allow_http(true).client()?, - cache: Default::default(), - }) as _ - } else { - info!("Using Instance credential provider"); - - let token = InstanceCredentialProvider { - cache: Default::default(), - imdsv1_fallback: self.imdsv1_fallback.get()?, - metadata_endpoint: self - .metadata_endpoint - .unwrap_or_else(|| DEFAULT_METADATA_ENDPOINT.into()), - }; - - Arc::new(TokenCredentialProvider::new( - token, - self.client_options.metadata_client()?, - self.retry_config.clone(), - )) as _ - }; - - let endpoint: String; - let bucket_endpoint: String; - - // If `endpoint` is provided then its assumed to be consistent with - // `virtual_hosted_style_request`. i.e. if `virtual_hosted_style_request` is true then - // `endpoint` should have bucket name included. - if self.virtual_hosted_style_request.get()? 
{ - endpoint = self - .endpoint - .unwrap_or_else(|| format!("https://{bucket}.s3.{region}.amazonaws.com")); - bucket_endpoint = endpoint.clone(); - } else { - endpoint = self - .endpoint - .unwrap_or_else(|| format!("https://s3.{region}.amazonaws.com")); - bucket_endpoint = format!("{endpoint}/{bucket}"); - } - - let config = S3Config { - region, - endpoint, - bucket, - bucket_endpoint, - credentials, - retry_config: self.retry_config, - client_options: self.client_options, - sign_payload: !self.unsigned_payload.get()?, - skip_signature: self.skip_signature.get()?, - checksum, - copy_if_not_exists, - }; - - let client = Arc::new(S3Client::new(config)?); - - Ok(AmazonS3 { client }) - } -} - #[cfg(test)] mod tests { use super::*; - use crate::tests::{ - copy_if_not_exists, get_nonexistent_object, get_opts, - list_uses_directories_correctly, list_with_delimiter, put_get_delete_list_opts, - rename_and_copy, stream_get, - }; + use crate::tests::*; use bytes::Bytes; - use std::collections::HashMap; const NON_EXISTENT_NAME: &str = "nonexistentname"; - #[test] - fn s3_test_config_from_map() { - let aws_access_key_id = "object_store:fake_access_key_id".to_string(); - let aws_secret_access_key = "object_store:fake_secret_key".to_string(); - let aws_default_region = "object_store:fake_default_region".to_string(); - let aws_endpoint = "object_store:fake_endpoint".to_string(); - let aws_session_token = "object_store:fake_session_token".to_string(); - let options = HashMap::from([ - ("aws_access_key_id", aws_access_key_id.clone()), - ("aws_secret_access_key", aws_secret_access_key), - ("aws_default_region", aws_default_region.clone()), - ("aws_endpoint", aws_endpoint.clone()), - ("aws_session_token", aws_session_token.clone()), - ("aws_unsigned_payload", "true".to_string()), - ("aws_checksum_algorithm", "sha256".to_string()), - ]); - - let builder = options - .into_iter() - .fold(AmazonS3Builder::new(), |builder, (key, value)| { - builder.with_config(key.parse().unwrap(), value) - }) - .with_config(AmazonS3ConfigKey::SecretAccessKey, "new-secret-key"); - - assert_eq!(builder.access_key_id.unwrap(), aws_access_key_id.as_str()); - assert_eq!(builder.secret_access_key.unwrap(), "new-secret-key"); - assert_eq!(builder.region.unwrap(), aws_default_region); - assert_eq!(builder.endpoint.unwrap(), aws_endpoint); - assert_eq!(builder.token.unwrap(), aws_session_token); - assert_eq!( - builder.checksum_algorithm.unwrap().get().unwrap(), - Checksum::SHA256 - ); - assert!(builder.unsigned_payload.get().unwrap()); - } - - #[test] - fn s3_test_config_get_value() { - let aws_access_key_id = "object_store:fake_access_key_id".to_string(); - let aws_secret_access_key = "object_store:fake_secret_key".to_string(); - let aws_default_region = "object_store:fake_default_region".to_string(); - let aws_endpoint = "object_store:fake_endpoint".to_string(); - let aws_session_token = "object_store:fake_session_token".to_string(); - - let builder = AmazonS3Builder::new() - .with_config(AmazonS3ConfigKey::AccessKeyId, &aws_access_key_id) - .with_config(AmazonS3ConfigKey::SecretAccessKey, &aws_secret_access_key) - .with_config(AmazonS3ConfigKey::DefaultRegion, &aws_default_region) - .with_config(AmazonS3ConfigKey::Endpoint, &aws_endpoint) - .with_config(AmazonS3ConfigKey::Token, &aws_session_token) - .with_config(AmazonS3ConfigKey::UnsignedPayload, "true"); - - assert_eq!( - builder - .get_config_value(&AmazonS3ConfigKey::AccessKeyId) - .unwrap(), - aws_access_key_id - ); - assert_eq!( - builder - 
.get_config_value(&AmazonS3ConfigKey::SecretAccessKey) - .unwrap(), - aws_secret_access_key - ); - assert_eq!( - builder - .get_config_value(&AmazonS3ConfigKey::DefaultRegion) - .unwrap(), - aws_default_region - ); - assert_eq!( - builder - .get_config_value(&AmazonS3ConfigKey::Endpoint) - .unwrap(), - aws_endpoint - ); - assert_eq!( - builder.get_config_value(&AmazonS3ConfigKey::Token).unwrap(), - aws_session_token - ); - assert_eq!( - builder - .get_config_value(&AmazonS3ConfigKey::UnsignedPayload) - .unwrap(), - "true" - ); - } - #[tokio::test] async fn s3_test() { crate::test_util::maybe_skip_integration!(); let config = AmazonS3Builder::from_env(); - let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); - let test_not_exists = config.copy_if_not_exists.is_some(); let integration = config.build().unwrap(); + let config = integration.client.config(); + let is_local = config.endpoint.starts_with("http://"); + let test_not_exists = config.copy_if_not_exists.is_some(); // Localstack doesn't support listing with spaces https://github.com/localstack/localstack/issues/6328 put_get_delete_list_opts(&integration, is_local).await; @@ -1279,16 +309,14 @@ mod tests { } // run integration test with unsigned payload enabled - let config = AmazonS3Builder::from_env().with_unsigned_payload(true); - let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); - let integration = config.build().unwrap(); + let builder = AmazonS3Builder::from_env().with_unsigned_payload(true); + let integration = builder.build().unwrap(); put_get_delete_list_opts(&integration, is_local).await; // run integration test with checksum set to sha256 - let config = + let builder = AmazonS3Builder::from_env().with_checksum_algorithm(Checksum::SHA256); - let is_local = matches!(&config.endpoint, Some(e) if e.starts_with("http://")); - let integration = config.build().unwrap(); + let integration = builder.build().unwrap(); put_get_delete_list_opts(&integration, is_local).await; } @@ -1352,161 +380,6 @@ mod tests { assert!(matches!(err, crate::Error::NotFound { .. 
}), "{}", err); } - #[tokio::test] - async fn s3_test_proxy_url() { - let s3 = AmazonS3Builder::new() - .with_access_key_id("access_key_id") - .with_secret_access_key("secret_access_key") - .with_region("region") - .with_bucket_name("bucket_name") - .with_allow_http(true) - .with_proxy_url("https://example.com") - .build(); - - assert!(s3.is_ok()); - - let err = AmazonS3Builder::new() - .with_access_key_id("access_key_id") - .with_secret_access_key("secret_access_key") - .with_region("region") - .with_bucket_name("bucket_name") - .with_allow_http(true) - .with_proxy_url("asdf://example.com") - .build() - .unwrap_err() - .to_string(); - - assert_eq!( - "Generic HTTP client error: builder error: unknown proxy scheme", - err - ); - } - - #[test] - fn s3_test_urls() { - let mut builder = AmazonS3Builder::new(); - builder.parse_url("s3://bucket/path").unwrap(); - assert_eq!(builder.bucket_name, Some("bucket".to_string())); - - let mut builder = AmazonS3Builder::new(); - builder - .parse_url("s3://buckets.can.have.dots/path") - .unwrap(); - assert_eq!( - builder.bucket_name, - Some("buckets.can.have.dots".to_string()) - ); - - let mut builder = AmazonS3Builder::new(); - builder - .parse_url("https://s3.region.amazonaws.com") - .unwrap(); - assert_eq!(builder.region, Some("region".to_string())); - - let mut builder = AmazonS3Builder::new(); - builder - .parse_url("https://s3.region.amazonaws.com/bucket") - .unwrap(); - assert_eq!(builder.region, Some("region".to_string())); - assert_eq!(builder.bucket_name, Some("bucket".to_string())); - - let mut builder = AmazonS3Builder::new(); - builder - .parse_url("https://s3.region.amazonaws.com/bucket.with.dot/path") - .unwrap(); - assert_eq!(builder.region, Some("region".to_string())); - assert_eq!(builder.bucket_name, Some("bucket.with.dot".to_string())); - - let mut builder = AmazonS3Builder::new(); - builder - .parse_url("https://bucket.s3.region.amazonaws.com") - .unwrap(); - assert_eq!(builder.bucket_name, Some("bucket".to_string())); - assert_eq!(builder.region, Some("region".to_string())); - assert!(builder.virtual_hosted_style_request.get().unwrap()); - - let mut builder = AmazonS3Builder::new(); - builder - .parse_url("https://account123.r2.cloudflarestorage.com/bucket-123") - .unwrap(); - - assert_eq!(builder.bucket_name, Some("bucket-123".to_string())); - assert_eq!(builder.region, Some("auto".to_string())); - assert_eq!( - builder.endpoint, - Some("https://account123.r2.cloudflarestorage.com".to_string()) - ); - - let err_cases = [ - "mailto://bucket/path", - "https://s3.bucket.mydomain.com", - "https://s3.bucket.foo.amazonaws.com", - "https://bucket.mydomain.region.amazonaws.com", - "https://bucket.s3.region.bar.amazonaws.com", - "https://bucket.foo.s3.amazonaws.com", - ]; - let mut builder = AmazonS3Builder::new(); - for case in err_cases { - builder.parse_url(case).unwrap_err(); - } - } - - #[test] - fn test_invalid_config() { - let err = AmazonS3Builder::new() - .with_config(AmazonS3ConfigKey::ImdsV1Fallback, "enabled") - .with_bucket_name("bucket") - .with_region("region") - .build() - .unwrap_err() - .to_string(); - - assert_eq!( - err, - "Generic Config error: failed to parse \"enabled\" as boolean" - ); - - let err = AmazonS3Builder::new() - .with_config(AmazonS3ConfigKey::Checksum, "md5") - .with_bucket_name("bucket") - .with_region("region") - .build() - .unwrap_err() - .to_string(); - - assert_eq!( - err, - "Generic Config error: \"md5\" is not a valid checksum algorithm" - ); - } -} - -#[cfg(test)] -mod 
s3_resolve_bucket_region_tests { - use super::*; - - #[tokio::test] - async fn test_private_bucket() { - let bucket = "bloxbender"; - - let region = resolve_bucket_region(bucket, &ClientOptions::new()) - .await - .unwrap(); - - let expected = "us-west-2".to_string(); - - assert_eq!(region, expected); - } - - #[tokio::test] - async fn test_bucket_does_not_exist() { - let bucket = "please-dont-exist"; - - let result = resolve_bucket_region(bucket, &ClientOptions::new()).await; - - assert!(result.is_err()); - } - #[tokio::test] #[ignore = "Tests shouldn't call use remote services by default"] async fn test_disable_creds() { diff --git a/object_store/src/aws/resolve.rs b/object_store/src/aws/resolve.rs new file mode 100644 index 000000000000..2b21fabd34ab --- /dev/null +++ b/object_store/src/aws/resolve.rs @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::aws::STORE; +use crate::{ClientOptions, Result}; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; + +/// A specialized `Error` for object store-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("Bucket '{}' not found", bucket))] + BucketNotFound { bucket: String }, + + #[snafu(display("Failed to resolve region for bucket '{}'", bucket))] + ResolveRegion { + bucket: String, + source: reqwest::Error, + }, + + #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] + RegionParse { bucket: String }, +} + +impl From for crate::Error { + fn from(source: Error) -> Self { + Self::Generic { + store: STORE, + source: Box::new(source), + } + } +} + +/// Get the bucket region using the [HeadBucket API]. This will fail if the bucket does not exist. 
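+///
+/// A minimal usage sketch, assuming the function is re-exported from the `aws`
+/// module; the bucket name below is a placeholder:
+///
+/// ```no_run
+/// # async fn example() -> object_store::Result<()> {
+/// use object_store::aws::resolve_bucket_region;
+/// use object_store::ClientOptions;
+///
+/// // Sends an unauthenticated HEAD request to the bucket and reads the
+/// // `x-amz-bucket-region` header from the response.
+/// let region = resolve_bucket_region("my-public-bucket", &ClientOptions::new()).await?;
+/// println!("bucket region: {region}");
+/// # Ok(())
+/// # }
+/// ```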
+/// +/// [HeadBucket API]: https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadBucket.html +pub async fn resolve_bucket_region( + bucket: &str, + client_options: &ClientOptions, +) -> Result { + use reqwest::StatusCode; + + let endpoint = format!("https://{}.s3.amazonaws.com", bucket); + + let client = client_options.client()?; + + let response = client + .head(&endpoint) + .send() + .await + .context(ResolveRegionSnafu { bucket })?; + + ensure!( + response.status() != StatusCode::NOT_FOUND, + BucketNotFoundSnafu { bucket } + ); + + let region = response + .headers() + .get("x-amz-bucket-region") + .and_then(|x| x.to_str().ok()) + .context(RegionParseSnafu { bucket })?; + + Ok(region.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_private_bucket() { + let bucket = "bloxbender"; + + let region = resolve_bucket_region(bucket, &ClientOptions::new()) + .await + .unwrap(); + + let expected = "us-west-2".to_string(); + + assert_eq!(region, expected); + } + + #[tokio::test] + async fn test_bucket_does_not_exist() { + let bucket = "please-dont-exist"; + + let result = resolve_bucket_region(bucket, &ClientOptions::new()).await; + + assert!(result.is_err()); + } +} From a425e7e7faf82032abc85cf570c863974db5bb66 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 19 Oct 2023 13:41:03 +0100 Subject: [PATCH 1290/1411] Split azure Module (#4954) * Split azure module * Format * Docs --- object_store/src/azure/builder.rs | 1101 +++++++++++++++++++++++++++++ object_store/src/azure/mod.rs | 1081 +--------------------------- 2 files changed, 1112 insertions(+), 1070 deletions(-) create mode 100644 object_store/src/azure/builder.rs diff --git a/object_store/src/azure/builder.rs b/object_store/src/azure/builder.rs new file mode 100644 index 000000000000..eb2de147f3ad --- /dev/null +++ b/object_store/src/azure/builder.rs @@ -0,0 +1,1101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::azure::client::{AzureClient, AzureConfig}; +use crate::azure::credential::{ + AzureCliCredential, ClientSecretOAuthProvider, ImdsManagedIdentityProvider, + WorkloadIdentityOAuthProvider, +}; +use crate::azure::{AzureCredential, AzureCredentialProvider, MicrosoftAzure, STORE}; +use crate::client::TokenCredentialProvider; +use crate::config::ConfigValue; +use crate::{ + ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider, +}; +use percent_encoding::percent_decode_str; +use serde::{Deserialize, Serialize}; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::str::FromStr; +use std::sync::Arc; +use url::Url; + +/// The well-known account used by Azurite and the legacy Azure Storage Emulator. 
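+///
+/// When [`MicrosoftAzureBuilder::with_use_emulator`] is enabled and no account is
+/// configured explicitly, this account (and the matching well-known key below) is
+/// assumed; a minimal sketch against a local Azurite instance, using a placeholder
+/// container name:
+///
+/// ```no_run
+/// use object_store::azure::MicrosoftAzureBuilder;
+///
+/// let azurite = MicrosoftAzureBuilder::new()
+///     .with_use_emulator(true)
+///     .with_container_name("test-container")
+///     .build();
+/// ```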
+/// +/// +const EMULATOR_ACCOUNT: &str = "devstoreaccount1"; + +/// The well-known account key used by Azurite and the legacy Azure Storage Emulator. +/// +/// +const EMULATOR_ACCOUNT_KEY: &str = + "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="; + +const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; + +/// A specialized `Error` for Azure builder-related errors +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + UnableToParseUrl { + source: url::ParseError, + url: String, + }, + + #[snafu(display( + "Unable parse emulator url {}={}, Error: {}", + env_name, + env_value, + source + ))] + UnableToParseEmulatorUrl { + env_name: String, + env_value: String, + source: url::ParseError, + }, + + #[snafu(display("Account must be specified"))] + MissingAccount {}, + + #[snafu(display("Container name must be specified"))] + MissingContainerName {}, + + #[snafu(display( + "Unknown url scheme cannot be parsed into storage location: {}", + scheme + ))] + UnknownUrlScheme { scheme: String }, + + #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + UrlNotRecognised { url: String }, + + #[snafu(display("Failed parsing an SAS key"))] + DecodeSasKey { source: std::str::Utf8Error }, + + #[snafu(display("Missing component in SAS query pair"))] + MissingSasComponent {}, + + #[snafu(display("Configuration key: '{}' is not known.", key))] + UnknownConfigurationKey { key: String }, + + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, +} + +impl From for crate::Error { + fn from(source: Error) -> Self { + match source { + Error::UnknownConfigurationKey { key } => { + Self::UnknownConfigurationKey { store: STORE, key } + } + _ => Self::Generic { + store: STORE, + source: Box::new(source), + }, + } + } +} + +/// Configure a connection to Microsoft Azure Blob Storage container using +/// the specified credentials. 
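+///
+/// Besides the access key flow shown in the example below, the builder can also
+/// authorize with a service principal; a minimal sketch with placeholder
+/// credentials:
+///
+/// ```no_run
+/// use object_store::azure::MicrosoftAzureBuilder;
+///
+/// let azure = MicrosoftAzureBuilder::new()
+///     .with_account("account")
+///     .with_container_name("container")
+///     .with_client_secret_authorization("client-id", "client-secret", "tenant-id")
+///     .build();
+/// ```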
+/// +/// # Example +/// ``` +/// # let ACCOUNT = "foo"; +/// # let BUCKET_NAME = "foo"; +/// # let ACCESS_KEY = "foo"; +/// # use object_store::azure::MicrosoftAzureBuilder; +/// let azure = MicrosoftAzureBuilder::new() +/// .with_account(ACCOUNT) +/// .with_access_key(ACCESS_KEY) +/// .with_container_name(BUCKET_NAME) +/// .build(); +/// ``` +#[derive(Default, Clone)] +pub struct MicrosoftAzureBuilder { + /// Account name + account_name: Option, + /// Access key + access_key: Option, + /// Container name + container_name: Option, + /// Bearer token + bearer_token: Option, + /// Client id + client_id: Option, + /// Client secret + client_secret: Option, + /// Tenant id + tenant_id: Option, + /// Query pairs for shared access signature authorization + sas_query_pairs: Option>, + /// Shared access signature + sas_key: Option, + /// Authority host + authority_host: Option, + /// Url + url: Option, + /// When set to true, azurite storage emulator has to be used + use_emulator: ConfigValue, + /// Storage endpoint + endpoint: Option, + /// Msi endpoint for acquiring managed identity token + msi_endpoint: Option, + /// Object id for use with managed identity authentication + object_id: Option, + /// Msi resource id for use with managed identity authentication + msi_resource_id: Option, + /// File containing token for Azure AD workload identity federation + federated_token_file: Option, + /// When set to true, azure cli has to be used for acquiring access token + use_azure_cli: ConfigValue, + /// Retry config + retry_config: RetryConfig, + /// Client options + client_options: ClientOptions, + /// Credentials + credentials: Option, + /// When set to true, fabric url scheme will be used + /// + /// i.e. https://{account_name}.dfs.fabric.microsoft.com + use_fabric_endpoint: ConfigValue, +} + +/// Configuration keys for [`MicrosoftAzureBuilder`] +/// +/// Configuration via keys can be done via [`MicrosoftAzureBuilder::with_config`] +/// +/// # Example +/// ``` +/// # use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; +/// let builder = MicrosoftAzureBuilder::new() +/// .with_config("azure_client_id".parse().unwrap(), "my-client-id") +/// .with_config(AzureConfigKey::AuthorityId, "my-tenant-id"); +/// ``` +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Deserialize, Serialize)] +#[non_exhaustive] +pub enum AzureConfigKey { + /// The name of the azure storage account + /// + /// Supported keys: + /// - `azure_storage_account_name` + /// - `account_name` + AccountName, + + /// Master key for accessing storage account + /// + /// Supported keys: + /// - `azure_storage_account_key` + /// - `azure_storage_access_key` + /// - `azure_storage_master_key` + /// - `access_key` + /// - `account_key` + /// - `master_key` + AccessKey, + + /// Service principal client id for authorizing requests + /// + /// Supported keys: + /// - `azure_storage_client_id` + /// - `azure_client_id` + /// - `client_id` + ClientId, + + /// Service principal client secret for authorizing requests + /// + /// Supported keys: + /// - `azure_storage_client_secret` + /// - `azure_client_secret` + /// - `client_secret` + ClientSecret, + + /// Tenant id used in oauth flows + /// + /// Supported keys: + /// - `azure_storage_tenant_id` + /// - `azure_storage_authority_id` + /// - `azure_tenant_id` + /// - `azure_authority_id` + /// - `tenant_id` + /// - `authority_id` + AuthorityId, + + /// Shared access signature. 
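+    ///
+    /// For example, a SAS can be supplied via [`MicrosoftAzureBuilder::with_config`]
+    /// (the token below is a truncated placeholder):
+    /// `builder.with_config(AzureConfigKey::SasKey, "sv=2021-10-04&ss=b&sig=...")`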
+ /// + /// The signature is expected to be percent-encoded, much like they are provided + /// in the azure storage explorer or azure portal. + /// + /// Supported keys: + /// - `azure_storage_sas_key` + /// - `azure_storage_sas_token` + /// - `sas_key` + /// - `sas_token` + SasKey, + + /// Bearer token + /// + /// Supported keys: + /// - `azure_storage_token` + /// - `bearer_token` + /// - `token` + Token, + + /// Use object store with azurite storage emulator + /// + /// Supported keys: + /// - `azure_storage_use_emulator` + /// - `object_store_use_emulator` + /// - `use_emulator` + UseEmulator, + + /// Override the endpoint used to communicate with blob storage + /// + /// Supported keys: + /// - `azure_storage_endpoint` + /// - `azure_endpoint` + /// - `endpoint` + Endpoint, + + /// Use object store with url scheme account.dfs.fabric.microsoft.com + /// + /// Supported keys: + /// - `azure_use_fabric_endpoint` + /// - `use_fabric_endpoint` + UseFabricEndpoint, + + /// Endpoint to request a imds managed identity token + /// + /// Supported keys: + /// - `azure_msi_endpoint` + /// - `azure_identity_endpoint` + /// - `identity_endpoint` + /// - `msi_endpoint` + MsiEndpoint, + + /// Object id for use with managed identity authentication + /// + /// Supported keys: + /// - `azure_object_id` + /// - `object_id` + ObjectId, + + /// Msi resource id for use with managed identity authentication + /// + /// Supported keys: + /// - `azure_msi_resource_id` + /// - `msi_resource_id` + MsiResourceId, + + /// File containing token for Azure AD workload identity federation + /// + /// Supported keys: + /// - `azure_federated_token_file` + /// - `federated_token_file` + FederatedTokenFile, + + /// Use azure cli for acquiring access token + /// + /// Supported keys: + /// - `azure_use_azure_cli` + /// - `use_azure_cli` + UseAzureCli, + + /// Container name + /// + /// Supported keys: + /// - `azure_container_name` + /// - `container_name` + ContainerName, + + /// Client options + Client(ClientConfigKey), +} + +impl AsRef for AzureConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::AccountName => "azure_storage_account_name", + Self::AccessKey => "azure_storage_account_key", + Self::ClientId => "azure_storage_client_id", + Self::ClientSecret => "azure_storage_client_secret", + Self::AuthorityId => "azure_storage_tenant_id", + Self::SasKey => "azure_storage_sas_key", + Self::Token => "azure_storage_token", + Self::UseEmulator => "azure_storage_use_emulator", + Self::UseFabricEndpoint => "azure_use_fabric_endpoint", + Self::Endpoint => "azure_storage_endpoint", + Self::MsiEndpoint => "azure_msi_endpoint", + Self::ObjectId => "azure_object_id", + Self::MsiResourceId => "azure_msi_resource_id", + Self::FederatedTokenFile => "azure_federated_token_file", + Self::UseAzureCli => "azure_use_azure_cli", + Self::ContainerName => "azure_container_name", + Self::Client(key) => key.as_ref(), + } + } +} + +impl FromStr for AzureConfigKey { + type Err = crate::Error; + + fn from_str(s: &str) -> Result { + match s { + "azure_storage_account_key" + | "azure_storage_access_key" + | "azure_storage_master_key" + | "master_key" + | "account_key" + | "access_key" => Ok(Self::AccessKey), + "azure_storage_account_name" | "account_name" => Ok(Self::AccountName), + "azure_storage_client_id" | "azure_client_id" | "client_id" => { + Ok(Self::ClientId) + } + "azure_storage_client_secret" | "azure_client_secret" | "client_secret" => { + Ok(Self::ClientSecret) + } + "azure_storage_tenant_id" + | "azure_storage_authority_id" 
+ | "azure_tenant_id" + | "azure_authority_id" + | "tenant_id" + | "authority_id" => Ok(Self::AuthorityId), + "azure_storage_sas_key" + | "azure_storage_sas_token" + | "sas_key" + | "sas_token" => Ok(Self::SasKey), + "azure_storage_token" | "bearer_token" | "token" => Ok(Self::Token), + "azure_storage_use_emulator" | "use_emulator" => Ok(Self::UseEmulator), + "azure_storage_endpoint" | "azure_endpoint" | "endpoint" => { + Ok(Self::Endpoint) + } + "azure_msi_endpoint" + | "azure_identity_endpoint" + | "identity_endpoint" + | "msi_endpoint" => Ok(Self::MsiEndpoint), + "azure_object_id" | "object_id" => Ok(Self::ObjectId), + "azure_msi_resource_id" | "msi_resource_id" => Ok(Self::MsiResourceId), + "azure_federated_token_file" | "federated_token_file" => { + Ok(Self::FederatedTokenFile) + } + "azure_use_fabric_endpoint" | "use_fabric_endpoint" => { + Ok(Self::UseFabricEndpoint) + } + "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), + "azure_container_name" | "container_name" => Ok(Self::ContainerName), + // Backwards compatibility + "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), + _ => match s.parse() { + Ok(key) => Ok(Self::Client(key)), + Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + }, + } + } +} + +impl std::fmt::Debug for MicrosoftAzureBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "MicrosoftAzureBuilder {{ account: {:?}, container_name: {:?} }}", + self.account_name, self.container_name + ) + } +} + +impl MicrosoftAzureBuilder { + /// Create a new [`MicrosoftAzureBuilder`] with default values. + pub fn new() -> Self { + Default::default() + } + + /// Create an instance of [`MicrosoftAzureBuilder`] with values pre-populated from environment variables. + /// + /// Variables extracted from environment: + /// * AZURE_STORAGE_ACCOUNT_NAME: storage account name + /// * AZURE_STORAGE_ACCOUNT_KEY: storage account master key + /// * AZURE_STORAGE_ACCESS_KEY: alias for AZURE_STORAGE_ACCOUNT_KEY + /// * AZURE_STORAGE_CLIENT_ID -> client id for service principal authorization + /// * AZURE_STORAGE_CLIENT_SECRET -> client secret for service principal authorization + /// * AZURE_STORAGE_TENANT_ID -> tenant id used in oauth flows + /// # Example + /// ``` + /// use object_store::azure::MicrosoftAzureBuilder; + /// + /// let azure = MicrosoftAzureBuilder::from_env() + /// .with_container_name("foo") + /// .build(); + /// ``` + pub fn from_env() -> Self { + let mut builder = Self::default(); + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if key.starts_with("AZURE_") { + if let Ok(config_key) = key.to_ascii_lowercase().parse() { + builder = builder.with_config(config_key, value); + } + } + } + } + + if let Ok(text) = std::env::var(MSI_ENDPOINT_ENV_KEY) { + builder = builder.with_msi_endpoint(text); + } + + builder + } + + /// Parse available connection info form a well-known storage URL. 
+ /// + /// The supported url schemes are: + /// + /// - `abfs[s]:///` (according to [fsspec](https://github.com/fsspec/adlfs)) + /// - `abfs[s]://@.dfs.core.windows.net/` + /// - `abfs[s]://@.dfs.fabric.microsoft.com/` + /// - `az:///` (according to [fsspec](https://github.com/fsspec/adlfs)) + /// - `adl:///` (according to [fsspec](https://github.com/fsspec/adlfs)) + /// - `azure:///` (custom) + /// - `https://.dfs.core.windows.net` + /// - `https://.blob.core.windows.net` + /// - `https://.dfs.fabric.microsoft.com` + /// - `https://.dfs.fabric.microsoft.com/` + /// - `https://.blob.fabric.microsoft.com` + /// - `https://.blob.fabric.microsoft.com/` + /// + /// Note: Settings derived from the URL will override any others set on this builder + /// + /// # Example + /// ``` + /// use object_store::azure::MicrosoftAzureBuilder; + /// + /// let azure = MicrosoftAzureBuilder::from_env() + /// .with_url("abfss://file_system@account.dfs.core.windows.net/") + /// .build(); + /// ``` + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); + self + } + + /// Set an option on the builder via a key - value pair. + pub fn with_config(mut self, key: AzureConfigKey, value: impl Into) -> Self { + match key { + AzureConfigKey::AccessKey => self.access_key = Some(value.into()), + AzureConfigKey::AccountName => self.account_name = Some(value.into()), + AzureConfigKey::ClientId => self.client_id = Some(value.into()), + AzureConfigKey::ClientSecret => self.client_secret = Some(value.into()), + AzureConfigKey::AuthorityId => self.tenant_id = Some(value.into()), + AzureConfigKey::SasKey => self.sas_key = Some(value.into()), + AzureConfigKey::Token => self.bearer_token = Some(value.into()), + AzureConfigKey::MsiEndpoint => self.msi_endpoint = Some(value.into()), + AzureConfigKey::ObjectId => self.object_id = Some(value.into()), + AzureConfigKey::MsiResourceId => self.msi_resource_id = Some(value.into()), + AzureConfigKey::FederatedTokenFile => { + self.federated_token_file = Some(value.into()) + } + AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), + AzureConfigKey::UseEmulator => self.use_emulator.parse(value), + AzureConfigKey::Endpoint => self.endpoint = Some(value.into()), + AzureConfigKey::UseFabricEndpoint => self.use_fabric_endpoint.parse(value), + AzureConfigKey::Client(key) => { + self.client_options = self.client_options.with_config(key, value) + } + AzureConfigKey::ContainerName => self.container_name = Some(value.into()), + }; + self + } + + /// Set an option on the builder via a key - value pair. + #[deprecated(note = "Use with_config")] + pub fn try_with_option( + self, + key: impl AsRef, + value: impl Into, + ) -> Result { + Ok(self.with_config(key.as_ref().parse()?, value)) + } + + /// Hydrate builder from key value pairs + #[deprecated(note = "Use with_config")] + #[allow(deprecated)] + pub fn try_with_options< + I: IntoIterator, impl Into)>, + >( + mut self, + options: I, + ) -> Result { + for (key, value) in options { + self = self.try_with_option(key, value)?; + } + Ok(self) + } + + /// Get config value via a [`AzureConfigKey`]. 
+ /// + /// # Example + /// ``` + /// use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; + /// + /// let builder = MicrosoftAzureBuilder::from_env() + /// .with_account("foo"); + /// let account_name = builder.get_config_value(&AzureConfigKey::AccountName).unwrap_or_default(); + /// assert_eq!("foo", &account_name); + /// ``` + pub fn get_config_value(&self, key: &AzureConfigKey) -> Option { + match key { + AzureConfigKey::AccountName => self.account_name.clone(), + AzureConfigKey::AccessKey => self.access_key.clone(), + AzureConfigKey::ClientId => self.client_id.clone(), + AzureConfigKey::ClientSecret => self.client_secret.clone(), + AzureConfigKey::AuthorityId => self.tenant_id.clone(), + AzureConfigKey::SasKey => self.sas_key.clone(), + AzureConfigKey::Token => self.bearer_token.clone(), + AzureConfigKey::UseEmulator => Some(self.use_emulator.to_string()), + AzureConfigKey::UseFabricEndpoint => { + Some(self.use_fabric_endpoint.to_string()) + } + AzureConfigKey::Endpoint => self.endpoint.clone(), + AzureConfigKey::MsiEndpoint => self.msi_endpoint.clone(), + AzureConfigKey::ObjectId => self.object_id.clone(), + AzureConfigKey::MsiResourceId => self.msi_resource_id.clone(), + AzureConfigKey::FederatedTokenFile => self.federated_token_file.clone(), + AzureConfigKey::UseAzureCli => Some(self.use_azure_cli.to_string()), + AzureConfigKey::Client(key) => self.client_options.get_config_value(key), + AzureConfigKey::ContainerName => self.container_name.clone(), + } + } + + /// Sets properties on this builder based on a URL + /// + /// This is a separate member function to allow fallible computation to + /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] + fn parse_url(&mut self, url: &str) -> Result<()> { + let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; + let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + + let validate = |s: &str| match s.contains('.') { + true => Err(UrlNotRecognisedSnafu { url }.build()), + false => Ok(s.to_string()), + }; + + match parsed.scheme() { + "az" | "adl" | "azure" => self.container_name = Some(validate(host)?), + "abfs" | "abfss" => { + // abfs(s) might refer to the fsspec convention abfs:/// + // or the convention for the hadoop driver abfs[s]://@.dfs.core.windows.net/ + if parsed.username().is_empty() { + self.container_name = Some(validate(host)?); + } else if let Some(a) = host.strip_suffix(".dfs.core.windows.net") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + } else if let Some(a) = host.strip_suffix(".dfs.fabric.microsoft.com") { + self.container_name = Some(validate(parsed.username())?); + self.account_name = Some(validate(a)?); + self.use_fabric_endpoint = true.into(); + } else { + return Err(UrlNotRecognisedSnafu { url }.build().into()); + } + } + "https" => match host.split_once('.') { + Some((a, "dfs.core.windows.net")) + | Some((a, "blob.core.windows.net")) => { + self.account_name = Some(validate(a)?); + } + Some((a, "dfs.fabric.microsoft.com")) + | Some((a, "blob.fabric.microsoft.com")) => { + self.account_name = Some(validate(a)?); + // Attempt to infer the container name from the URL + // - https://onelake.dfs.fabric.microsoft.com///Files/test.csv + // - https://onelake.dfs.fabric.microsoft.com//.// + // + // See + if let Some(workspace) = parsed.path_segments().unwrap().next() { + if !workspace.is_empty() { + self.container_name = Some(workspace.to_string()) + } + } + self.use_fabric_endpoint = 
true.into(); + } + _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), + }, + scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + } + Ok(()) + } + + /// Set the Azure Account (required) + pub fn with_account(mut self, account: impl Into) -> Self { + self.account_name = Some(account.into()); + self + } + + /// Set the Azure Container Name (required) + pub fn with_container_name(mut self, container_name: impl Into) -> Self { + self.container_name = Some(container_name.into()); + self + } + + /// Set the Azure Access Key (required - one of access key, bearer token, or client credentials) + pub fn with_access_key(mut self, access_key: impl Into) -> Self { + self.access_key = Some(access_key.into()); + self + } + + /// Set a static bearer token to be used for authorizing requests + pub fn with_bearer_token_authorization( + mut self, + bearer_token: impl Into, + ) -> Self { + self.bearer_token = Some(bearer_token.into()); + self + } + + /// Set a client secret used for client secret authorization + pub fn with_client_secret_authorization( + mut self, + client_id: impl Into, + client_secret: impl Into, + tenant_id: impl Into, + ) -> Self { + self.client_id = Some(client_id.into()); + self.client_secret = Some(client_secret.into()); + self.tenant_id = Some(tenant_id.into()); + self + } + + /// Sets the client id for use in client secret or k8s federated credential flow + pub fn with_client_id(mut self, client_id: impl Into) -> Self { + self.client_id = Some(client_id.into()); + self + } + + /// Sets the client secret for use in client secret flow + pub fn with_client_secret(mut self, client_secret: impl Into) -> Self { + self.client_secret = Some(client_secret.into()); + self + } + + /// Sets the tenant id for use in client secret or k8s federated credential flow + pub fn with_tenant_id(mut self, tenant_id: impl Into) -> Self { + self.tenant_id = Some(tenant_id.into()); + self + } + + /// Set query pairs appended to the url for shared access signature authorization + pub fn with_sas_authorization( + mut self, + query_pairs: impl Into>, + ) -> Self { + self.sas_query_pairs = Some(query_pairs.into()); + self + } + + /// Set the credential provider overriding any other options + pub fn with_credentials(mut self, credentials: AzureCredentialProvider) -> Self { + self.credentials = Some(credentials); + self + } + + /// Set if the Azure emulator should be used (defaults to false) + pub fn with_use_emulator(mut self, use_emulator: bool) -> Self { + self.use_emulator = use_emulator.into(); + self + } + + /// Override the endpoint used to communicate with blob storage + /// + /// Defaults to `https://{account}.blob.core.windows.net` + pub fn with_endpoint(mut self, endpoint: String) -> Self { + self.endpoint = Some(endpoint); + self + } + + /// Set if Microsoft Fabric url scheme should be used (defaults to false) + /// + /// When disabled the url scheme used is `https://{account}.blob.core.windows.net` + /// When enabled the url scheme used is `https://{account}.dfs.fabric.microsoft.com` + /// + /// Note: [`Self::with_endpoint`] will take precedence over this option + pub fn with_use_fabric_endpoint(mut self, use_fabric_endpoint: bool) -> Self { + self.use_fabric_endpoint = use_fabric_endpoint.into(); + self + } + + /// Sets what protocol is allowed + /// + /// If `allow_http` is : + /// * false (default): Only HTTPS are allowed + /// * true: HTTP and HTTPS are allowed + pub fn with_allow_http(mut self, allow_http: bool) -> Self { + self.client_options = 
self.client_options.with_allow_http(allow_http); + self + } + + /// Sets an alternative authority host for OAuth based authorization + /// + /// Common hosts for azure clouds are defined in [authority_hosts](crate::azure::authority_hosts). + /// + /// Defaults to + pub fn with_authority_host(mut self, authority_host: impl Into) -> Self { + self.authority_host = Some(authority_host.into()); + self + } + + /// Set the retry configuration + pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { + self.retry_config = retry_config; + self + } + + /// Set the proxy_url to be used by the underlying client + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_url(proxy_url); + self + } + + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.client_options = self + .client_options + .with_proxy_ca_certificate(proxy_ca_certificate); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; + self + } + + /// Sets the endpoint for acquiring managed identity token + pub fn with_msi_endpoint(mut self, msi_endpoint: impl Into) -> Self { + self.msi_endpoint = Some(msi_endpoint.into()); + self + } + + /// Sets a file path for acquiring azure federated identity token in k8s + /// + /// requires `client_id` and `tenant_id` to be set + pub fn with_federated_token_file( + mut self, + federated_token_file: impl Into, + ) -> Self { + self.federated_token_file = Some(federated_token_file.into()); + self + } + + /// Set if the Azure Cli should be used for acquiring access token + /// + /// + pub fn with_use_azure_cli(mut self, use_azure_cli: bool) -> Self { + self.use_azure_cli = use_azure_cli.into(); + self + } + + /// Configure a connection to container with given name on Microsoft Azure Blob store. + pub fn build(mut self) -> Result { + if let Some(url) = self.url.take() { + self.parse_url(&url)?; + } + + let container = self.container_name.ok_or(Error::MissingContainerName {})?; + + let static_creds = |credential: AzureCredential| -> AzureCredentialProvider { + Arc::new(StaticCredentialProvider::new(credential)) + }; + + let (is_emulator, storage_url, auth, account) = if self.use_emulator.get()? { + let account_name = self + .account_name + .unwrap_or_else(|| EMULATOR_ACCOUNT.to_string()); + // Allow overriding defaults. Values taken from + // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 + let url = url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; + let account_key = self + .access_key + .unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); + + let credential = static_creds(AzureCredential::AccessKey(account_key)); + + self.client_options = self.client_options.with_allow_http(true); + (true, url, credential, account_name) + } else { + let account_name = self.account_name.ok_or(Error::MissingAccount {})?; + let account_url = match self.endpoint { + Some(account_url) => account_url, + None => match self.use_fabric_endpoint.get()? 
{ + true => { + format!("https://{}.blob.fabric.microsoft.com", &account_name) + } + false => format!("https://{}.blob.core.windows.net", &account_name), + }, + }; + + let url = Url::parse(&account_url) + .context(UnableToParseUrlSnafu { url: account_url })?; + + let credential = if let Some(credential) = self.credentials { + credential + } else if let Some(bearer_token) = self.bearer_token { + static_creds(AzureCredential::BearerToken(bearer_token)) + } else if let Some(access_key) = self.access_key { + static_creds(AzureCredential::AccessKey(access_key)) + } else if let (Some(client_id), Some(tenant_id), Some(federated_token_file)) = + (&self.client_id, &self.tenant_id, self.federated_token_file) + { + let client_credential = WorkloadIdentityOAuthProvider::new( + client_id, + federated_token_file, + tenant_id, + self.authority_host, + ); + Arc::new(TokenCredentialProvider::new( + client_credential, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = + (&self.client_id, self.client_secret, &self.tenant_id) + { + let client_credential = ClientSecretOAuthProvider::new( + client_id.clone(), + client_secret, + tenant_id, + self.authority_host, + ); + Arc::new(TokenCredentialProvider::new( + client_credential, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } else if let Some(query_pairs) = self.sas_query_pairs { + static_creds(AzureCredential::SASToken(query_pairs)) + } else if let Some(sas) = self.sas_key { + static_creds(AzureCredential::SASToken(split_sas(&sas)?)) + } else if self.use_azure_cli.get()? { + Arc::new(AzureCliCredential::new()) as _ + } else { + let msi_credential = ImdsManagedIdentityProvider::new( + self.client_id, + self.object_id, + self.msi_resource_id, + self.msi_endpoint, + ); + Arc::new(TokenCredentialProvider::new( + msi_credential, + self.client_options.metadata_client()?, + self.retry_config.clone(), + )) as _ + }; + (false, url, credential, account_name) + }; + + let config = AzureConfig { + account, + is_emulator, + container, + retry_config: self.retry_config, + client_options: self.client_options, + service: storage_url, + credentials: auth, + }; + + let client = Arc::new(AzureClient::new(config)?); + + Ok(MicrosoftAzure { client }) + } +} + +/// Parses the contents of the environment variable `env_name` as a URL +/// if present, otherwise falls back to default_url +fn url_from_env(env_name: &str, default_url: &str) -> Result { + let url = match std::env::var(env_name) { + Ok(env_value) => { + Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { + env_name, + env_value, + })? 
+ } + Err(_) => Url::parse(default_url).expect("Failed to parse default URL"), + }; + Ok(url) +} + +fn split_sas(sas: &str) -> Result, Error> { + let sas = percent_decode_str(sas) + .decode_utf8() + .context(DecodeSasKeySnafu {})?; + let kv_str_pairs = sas + .trim_start_matches('?') + .split('&') + .filter(|s| !s.chars().all(char::is_whitespace)); + let mut pairs = Vec::new(); + for kv_pair_str in kv_str_pairs { + let (k, v) = kv_pair_str + .trim() + .split_once('=') + .ok_or(Error::MissingSasComponent {})?; + pairs.push((k.into(), v.into())) + } + Ok(pairs) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + #[test] + fn azure_blob_test_urls() { + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("abfss://file_system@account.dfs.core.windows.net/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, Some("file_system".to_string())); + assert!(!builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("abfss://file_system@account.dfs.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, Some("file_system".to_string())); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder.parse_url("abfs://container/path").unwrap(); + assert_eq!(builder.container_name, Some("container".to_string())); + + let mut builder = MicrosoftAzureBuilder::new(); + builder.parse_url("az://container").unwrap(); + assert_eq!(builder.container_name, Some("container".to_string())); + + let mut builder = MicrosoftAzureBuilder::new(); + builder.parse_url("az://container/path").unwrap(); + assert_eq!(builder.container_name, Some("container".to_string())); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.dfs.core.windows.net/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert!(!builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.blob.core.windows.net/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert!(!builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.dfs.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, None); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.dfs.fabric.microsoft.com/container") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name.as_deref(), Some("container")); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.blob.fabric.microsoft.com/") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + assert_eq!(builder.container_name, None); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let mut builder = MicrosoftAzureBuilder::new(); + builder + .parse_url("https://account.blob.fabric.microsoft.com/container") + .unwrap(); + assert_eq!(builder.account_name, Some("account".to_string())); + 
assert_eq!(builder.container_name.as_deref(), Some("container")); + assert!(builder.use_fabric_endpoint.get().unwrap()); + + let err_cases = [ + "mailto://account.blob.core.windows.net/", + "az://blob.mydomain/", + "abfs://container.foo/path", + "abfss://file_system@account.foo.dfs.core.windows.net/", + "abfss://file_system.bar@account.dfs.core.windows.net/", + "https://blob.mydomain/", + "https://blob.foo.dfs.core.windows.net/", + ]; + let mut builder = MicrosoftAzureBuilder::new(); + for case in err_cases { + builder.parse_url(case).unwrap_err(); + } + } + + #[test] + fn azure_test_config_from_map() { + let azure_client_id = "object_store:fake_access_key_id"; + let azure_storage_account_name = "object_store:fake_secret_key"; + let azure_storage_token = "object_store:fake_default_region"; + let options = HashMap::from([ + ("azure_client_id", azure_client_id), + ("azure_storage_account_name", azure_storage_account_name), + ("azure_storage_token", azure_storage_token), + ]); + + let builder = options + .into_iter() + .fold(MicrosoftAzureBuilder::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }); + assert_eq!(builder.client_id.unwrap(), azure_client_id); + assert_eq!(builder.account_name.unwrap(), azure_storage_account_name); + assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); + } + + #[test] + fn azure_test_split_sas() { + let raw_sas = "?sv=2021-10-04&st=2023-01-04T17%3A48%3A57Z&se=2023-01-04T18%3A15%3A00Z&sr=c&sp=rcwl&sig=C7%2BZeEOWbrxPA3R0Cw%2Fw1EZz0%2B4KBvQexeKZKe%2BB6h0%3D"; + let expected = vec![ + ("sv".to_string(), "2021-10-04".to_string()), + ("st".to_string(), "2023-01-04T17:48:57Z".to_string()), + ("se".to_string(), "2023-01-04T18:15:00Z".to_string()), + ("sr".to_string(), "c".to_string()), + ("sp".to_string(), "rcwl".to_string()), + ( + "sig".to_string(), + "C7+ZeEOWbrxPA3R0Cw/w1EZz0+4KBvQexeKZKe+B6h0=".to_string(), + ), + ]; + let pairs = split_sas(raw_sas).unwrap(); + assert_eq!(expected, pairs); + } +} diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 0e638efc399f..7e1db5bc8c1c 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -30,32 +30,24 @@ use self::client::{BlockId, BlockList}; use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, - ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, PutResult, Result, RetryConfig, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, + Result, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::Bytes; use futures::stream::BoxStream; -use percent_encoding::percent_decode_str; -use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; -use std::fmt::{Debug, Formatter}; -use std::str::FromStr; +use std::fmt::Debug; use std::sync::Arc; use tokio::io::AsyncWrite; -use url::Url; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; -use crate::client::{ - ClientConfigKey, CredentialProvider, StaticCredentialProvider, - TokenCredentialProvider, -}; -use crate::config::ConfigValue; +use crate::client::CredentialProvider; pub use credential::authority_hosts; +mod builder; mod client; mod credential; @@ -63,87 +55,11 @@ mod credential; pub type AzureCredentialProvider = Arc>; use crate::client::header::get_etag; +pub use builder::{AzureConfigKey, MicrosoftAzureBuilder}; pub use credential::AzureCredential; const STORE: &str = "MicrosoftAzure"; -/// The 
well-known account used by Azurite and the legacy Azure Storage Emulator. -/// -const EMULATOR_ACCOUNT: &str = "devstoreaccount1"; - -/// The well-known account key used by Azurite and the legacy Azure Storage Emulator. -/// -const EMULATOR_ACCOUNT_KEY: &str = - "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="; - -const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; - -/// A specialized `Error` for Azure object store-related errors -#[derive(Debug, Snafu)] -#[allow(missing_docs)] -enum Error { - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] - UnableToParseUrl { - source: url::ParseError, - url: String, - }, - - #[snafu(display( - "Unable parse emulator url {}={}, Error: {}", - env_name, - env_value, - source - ))] - UnableToParseEmulatorUrl { - env_name: String, - env_value: String, - source: url::ParseError, - }, - - #[snafu(display("Account must be specified"))] - MissingAccount {}, - - #[snafu(display("Container name must be specified"))] - MissingContainerName {}, - - #[snafu(display( - "Unknown url scheme cannot be parsed into storage location: {}", - scheme - ))] - UnknownUrlScheme { scheme: String }, - - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] - UrlNotRecognised { url: String }, - - #[snafu(display("Failed parsing an SAS key"))] - DecodeSasKey { source: std::str::Utf8Error }, - - #[snafu(display("Missing component in SAS query pair"))] - MissingSasComponent {}, - - #[snafu(display("Configuration key: '{}' is not known.", key))] - UnknownConfigurationKey { key: String }, - - #[snafu(display("Unable to extract metadata from headers: {}", source))] - Metadata { - source: crate::client::header::Error, - }, -} - -impl From for super::Error { - fn from(source: Error) -> Self { - match source { - Error::UnknownConfigurationKey { key } => { - Self::UnknownConfigurationKey { store: STORE, key } - } - _ => Self::Generic { - store: STORE, - source: Box::new(source), - }, - } - } -} - /// Interface for [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/). #[derive(Debug)] pub struct MicrosoftAzure { @@ -175,8 +91,11 @@ impl ObjectStore for MicrosoftAzure { .client .put_request(location, Some(bytes), false, &()) .await?; - let e_tag = Some(get_etag(response.headers()).context(MetadataSnafu)?); - Ok(PutResult { e_tag }) + let e_tag = get_etag(response.headers()).map_err(|e| crate::Error::Generic { + store: STORE, + source: Box::new(e), + })?; + Ok(PutResult { e_tag: Some(e_tag) }) } async fn put_multipart( @@ -279,853 +198,6 @@ impl PutPart for AzureMultiPartUpload { } } -/// Configure a connection to Microsoft Azure Blob Storage container using -/// the specified credentials. 
-/// -/// # Example -/// ``` -/// # let ACCOUNT = "foo"; -/// # let BUCKET_NAME = "foo"; -/// # let ACCESS_KEY = "foo"; -/// # use object_store::azure::MicrosoftAzureBuilder; -/// let azure = MicrosoftAzureBuilder::new() -/// .with_account(ACCOUNT) -/// .with_access_key(ACCESS_KEY) -/// .with_container_name(BUCKET_NAME) -/// .build(); -/// ``` -#[derive(Default, Clone)] -pub struct MicrosoftAzureBuilder { - /// Account name - account_name: Option, - /// Access key - access_key: Option, - /// Container name - container_name: Option, - /// Bearer token - bearer_token: Option, - /// Client id - client_id: Option, - /// Client secret - client_secret: Option, - /// Tenant id - tenant_id: Option, - /// Query pairs for shared access signature authorization - sas_query_pairs: Option>, - /// Shared access signature - sas_key: Option, - /// Authority host - authority_host: Option, - /// Url - url: Option, - /// When set to true, azurite storage emulator has to be used - use_emulator: ConfigValue, - /// Storage endpoint - endpoint: Option, - /// Msi endpoint for acquiring managed identity token - msi_endpoint: Option, - /// Object id for use with managed identity authentication - object_id: Option, - /// Msi resource id for use with managed identity authentication - msi_resource_id: Option, - /// File containing token for Azure AD workload identity federation - federated_token_file: Option, - /// When set to true, azure cli has to be used for acquiring access token - use_azure_cli: ConfigValue, - /// Retry config - retry_config: RetryConfig, - /// Client options - client_options: ClientOptions, - /// Credentials - credentials: Option, - /// When set to true, fabric url scheme will be used - /// - /// i.e. https://{account_name}.dfs.fabric.microsoft.com - use_fabric_endpoint: ConfigValue, -} - -/// Configuration keys for [`MicrosoftAzureBuilder`] -/// -/// Configuration via keys can be done via [`MicrosoftAzureBuilder::with_config`] -/// -/// # Example -/// ``` -/// # use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; -/// let builder = MicrosoftAzureBuilder::new() -/// .with_config("azure_client_id".parse().unwrap(), "my-client-id") -/// .with_config(AzureConfigKey::AuthorityId, "my-tenant-id"); -/// ``` -#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Deserialize, Serialize)] -#[non_exhaustive] -pub enum AzureConfigKey { - /// The name of the azure storage account - /// - /// Supported keys: - /// - `azure_storage_account_name` - /// - `account_name` - AccountName, - - /// Master key for accessing storage account - /// - /// Supported keys: - /// - `azure_storage_account_key` - /// - `azure_storage_access_key` - /// - `azure_storage_master_key` - /// - `access_key` - /// - `account_key` - /// - `master_key` - AccessKey, - - /// Service principal client id for authorizing requests - /// - /// Supported keys: - /// - `azure_storage_client_id` - /// - `azure_client_id` - /// - `client_id` - ClientId, - - /// Service principal client secret for authorizing requests - /// - /// Supported keys: - /// - `azure_storage_client_secret` - /// - `azure_client_secret` - /// - `client_secret` - ClientSecret, - - /// Tenant id used in oauth flows - /// - /// Supported keys: - /// - `azure_storage_tenant_id` - /// - `azure_storage_authority_id` - /// - `azure_tenant_id` - /// - `azure_authority_id` - /// - `tenant_id` - /// - `authority_id` - AuthorityId, - - /// Shared access signature. 
- /// - /// The signature is expected to be percent-encoded, much like they are provided - /// in the azure storage explorer or azure portal. - /// - /// Supported keys: - /// - `azure_storage_sas_key` - /// - `azure_storage_sas_token` - /// - `sas_key` - /// - `sas_token` - SasKey, - - /// Bearer token - /// - /// Supported keys: - /// - `azure_storage_token` - /// - `bearer_token` - /// - `token` - Token, - - /// Use object store with azurite storage emulator - /// - /// Supported keys: - /// - `azure_storage_use_emulator` - /// - `object_store_use_emulator` - /// - `use_emulator` - UseEmulator, - - /// Override the endpoint used to communicate with blob storage - /// - /// Supported keys: - /// - `azure_storage_endpoint` - /// - `azure_endpoint` - /// - `endpoint` - Endpoint, - - /// Use object store with url scheme account.dfs.fabric.microsoft.com - /// - /// Supported keys: - /// - `azure_use_fabric_endpoint` - /// - `use_fabric_endpoint` - UseFabricEndpoint, - - /// Endpoint to request a imds managed identity token - /// - /// Supported keys: - /// - `azure_msi_endpoint` - /// - `azure_identity_endpoint` - /// - `identity_endpoint` - /// - `msi_endpoint` - MsiEndpoint, - - /// Object id for use with managed identity authentication - /// - /// Supported keys: - /// - `azure_object_id` - /// - `object_id` - ObjectId, - - /// Msi resource id for use with managed identity authentication - /// - /// Supported keys: - /// - `azure_msi_resource_id` - /// - `msi_resource_id` - MsiResourceId, - - /// File containing token for Azure AD workload identity federation - /// - /// Supported keys: - /// - `azure_federated_token_file` - /// - `federated_token_file` - FederatedTokenFile, - - /// Use azure cli for acquiring access token - /// - /// Supported keys: - /// - `azure_use_azure_cli` - /// - `use_azure_cli` - UseAzureCli, - - /// Container name - /// - /// Supported keys: - /// - `azure_container_name` - /// - `container_name` - ContainerName, - - /// Client options - Client(ClientConfigKey), -} - -impl AsRef for AzureConfigKey { - fn as_ref(&self) -> &str { - match self { - Self::AccountName => "azure_storage_account_name", - Self::AccessKey => "azure_storage_account_key", - Self::ClientId => "azure_storage_client_id", - Self::ClientSecret => "azure_storage_client_secret", - Self::AuthorityId => "azure_storage_tenant_id", - Self::SasKey => "azure_storage_sas_key", - Self::Token => "azure_storage_token", - Self::UseEmulator => "azure_storage_use_emulator", - Self::UseFabricEndpoint => "azure_use_fabric_endpoint", - Self::Endpoint => "azure_storage_endpoint", - Self::MsiEndpoint => "azure_msi_endpoint", - Self::ObjectId => "azure_object_id", - Self::MsiResourceId => "azure_msi_resource_id", - Self::FederatedTokenFile => "azure_federated_token_file", - Self::UseAzureCli => "azure_use_azure_cli", - Self::ContainerName => "azure_container_name", - Self::Client(key) => key.as_ref(), - } - } -} - -impl FromStr for AzureConfigKey { - type Err = super::Error; - - fn from_str(s: &str) -> Result { - match s { - "azure_storage_account_key" - | "azure_storage_access_key" - | "azure_storage_master_key" - | "master_key" - | "account_key" - | "access_key" => Ok(Self::AccessKey), - "azure_storage_account_name" | "account_name" => Ok(Self::AccountName), - "azure_storage_client_id" | "azure_client_id" | "client_id" => { - Ok(Self::ClientId) - } - "azure_storage_client_secret" | "azure_client_secret" | "client_secret" => { - Ok(Self::ClientSecret) - } - "azure_storage_tenant_id" - | "azure_storage_authority_id" 
- | "azure_tenant_id" - | "azure_authority_id" - | "tenant_id" - | "authority_id" => Ok(Self::AuthorityId), - "azure_storage_sas_key" - | "azure_storage_sas_token" - | "sas_key" - | "sas_token" => Ok(Self::SasKey), - "azure_storage_token" | "bearer_token" | "token" => Ok(Self::Token), - "azure_storage_use_emulator" | "use_emulator" => Ok(Self::UseEmulator), - "azure_storage_endpoint" | "azure_endpoint" | "endpoint" => { - Ok(Self::Endpoint) - } - "azure_msi_endpoint" - | "azure_identity_endpoint" - | "identity_endpoint" - | "msi_endpoint" => Ok(Self::MsiEndpoint), - "azure_object_id" | "object_id" => Ok(Self::ObjectId), - "azure_msi_resource_id" | "msi_resource_id" => Ok(Self::MsiResourceId), - "azure_federated_token_file" | "federated_token_file" => { - Ok(Self::FederatedTokenFile) - } - "azure_use_fabric_endpoint" | "use_fabric_endpoint" => { - Ok(Self::UseFabricEndpoint) - } - "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), - "azure_container_name" | "container_name" => Ok(Self::ContainerName), - // Backwards compatibility - "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), - _ => match s.parse() { - Ok(key) => Ok(Self::Client(key)), - Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), - }, - } - } -} - -impl Debug for MicrosoftAzureBuilder { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!( - f, - "MicrosoftAzureBuilder {{ account: {:?}, container_name: {:?} }}", - self.account_name, self.container_name - ) - } -} - -impl MicrosoftAzureBuilder { - /// Create a new [`MicrosoftAzureBuilder`] with default values. - pub fn new() -> Self { - Default::default() - } - - /// Create an instance of [`MicrosoftAzureBuilder`] with values pre-populated from environment variables. - /// - /// Variables extracted from environment: - /// * AZURE_STORAGE_ACCOUNT_NAME: storage account name - /// * AZURE_STORAGE_ACCOUNT_KEY: storage account master key - /// * AZURE_STORAGE_ACCESS_KEY: alias for AZURE_STORAGE_ACCOUNT_KEY - /// * AZURE_STORAGE_CLIENT_ID -> client id for service principal authorization - /// * AZURE_STORAGE_CLIENT_SECRET -> client secret for service principal authorization - /// * AZURE_STORAGE_TENANT_ID -> tenant id used in oauth flows - /// # Example - /// ``` - /// use object_store::azure::MicrosoftAzureBuilder; - /// - /// let azure = MicrosoftAzureBuilder::from_env() - /// .with_container_name("foo") - /// .build(); - /// ``` - pub fn from_env() -> Self { - let mut builder = Self::default(); - for (os_key, os_value) in std::env::vars_os() { - if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { - if key.starts_with("AZURE_") { - if let Ok(config_key) = key.to_ascii_lowercase().parse() { - builder = builder.with_config(config_key, value); - } - } - } - } - - if let Ok(text) = std::env::var(MSI_ENDPOINT_ENV_KEY) { - builder = builder.with_msi_endpoint(text); - } - - builder - } - - /// Parse available connection info form a well-known storage URL. 
- /// - /// The supported url schemes are: - /// - /// - `abfs[s]:///` (according to [fsspec](https://github.com/fsspec/adlfs)) - /// - `abfs[s]://@.dfs.core.windows.net/` - /// - `abfs[s]://@.dfs.fabric.microsoft.com/` - /// - `az:///` (according to [fsspec](https://github.com/fsspec/adlfs)) - /// - `adl:///` (according to [fsspec](https://github.com/fsspec/adlfs)) - /// - `azure:///` (custom) - /// - `https://.dfs.core.windows.net` - /// - `https://.blob.core.windows.net` - /// - `https://.dfs.fabric.microsoft.com` - /// - `https://.dfs.fabric.microsoft.com/` - /// - `https://.blob.fabric.microsoft.com` - /// - `https://.blob.fabric.microsoft.com/` - /// - /// Note: Settings derived from the URL will override any others set on this builder - /// - /// # Example - /// ``` - /// use object_store::azure::MicrosoftAzureBuilder; - /// - /// let azure = MicrosoftAzureBuilder::from_env() - /// .with_url("abfss://file_system@account.dfs.core.windows.net/") - /// .build(); - /// ``` - pub fn with_url(mut self, url: impl Into) -> Self { - self.url = Some(url.into()); - self - } - - /// Set an option on the builder via a key - value pair. - pub fn with_config(mut self, key: AzureConfigKey, value: impl Into) -> Self { - match key { - AzureConfigKey::AccessKey => self.access_key = Some(value.into()), - AzureConfigKey::AccountName => self.account_name = Some(value.into()), - AzureConfigKey::ClientId => self.client_id = Some(value.into()), - AzureConfigKey::ClientSecret => self.client_secret = Some(value.into()), - AzureConfigKey::AuthorityId => self.tenant_id = Some(value.into()), - AzureConfigKey::SasKey => self.sas_key = Some(value.into()), - AzureConfigKey::Token => self.bearer_token = Some(value.into()), - AzureConfigKey::MsiEndpoint => self.msi_endpoint = Some(value.into()), - AzureConfigKey::ObjectId => self.object_id = Some(value.into()), - AzureConfigKey::MsiResourceId => self.msi_resource_id = Some(value.into()), - AzureConfigKey::FederatedTokenFile => { - self.federated_token_file = Some(value.into()) - } - AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), - AzureConfigKey::UseEmulator => self.use_emulator.parse(value), - AzureConfigKey::Endpoint => self.endpoint = Some(value.into()), - AzureConfigKey::UseFabricEndpoint => self.use_fabric_endpoint.parse(value), - AzureConfigKey::Client(key) => { - self.client_options = self.client_options.with_config(key, value) - } - AzureConfigKey::ContainerName => self.container_name = Some(value.into()), - }; - self - } - - /// Set an option on the builder via a key - value pair. - #[deprecated(note = "Use with_config")] - pub fn try_with_option( - self, - key: impl AsRef, - value: impl Into, - ) -> Result { - Ok(self.with_config(key.as_ref().parse()?, value)) - } - - /// Hydrate builder from key value pairs - #[deprecated(note = "Use with_config")] - #[allow(deprecated)] - pub fn try_with_options< - I: IntoIterator, impl Into)>, - >( - mut self, - options: I, - ) -> Result { - for (key, value) in options { - self = self.try_with_option(key, value)?; - } - Ok(self) - } - - /// Get config value via a [`AzureConfigKey`]. 
- /// - /// # Example - /// ``` - /// use object_store::azure::{MicrosoftAzureBuilder, AzureConfigKey}; - /// - /// let builder = MicrosoftAzureBuilder::from_env() - /// .with_account("foo"); - /// let account_name = builder.get_config_value(&AzureConfigKey::AccountName).unwrap_or_default(); - /// assert_eq!("foo", &account_name); - /// ``` - pub fn get_config_value(&self, key: &AzureConfigKey) -> Option { - match key { - AzureConfigKey::AccountName => self.account_name.clone(), - AzureConfigKey::AccessKey => self.access_key.clone(), - AzureConfigKey::ClientId => self.client_id.clone(), - AzureConfigKey::ClientSecret => self.client_secret.clone(), - AzureConfigKey::AuthorityId => self.tenant_id.clone(), - AzureConfigKey::SasKey => self.sas_key.clone(), - AzureConfigKey::Token => self.bearer_token.clone(), - AzureConfigKey::UseEmulator => Some(self.use_emulator.to_string()), - AzureConfigKey::UseFabricEndpoint => { - Some(self.use_fabric_endpoint.to_string()) - } - AzureConfigKey::Endpoint => self.endpoint.clone(), - AzureConfigKey::MsiEndpoint => self.msi_endpoint.clone(), - AzureConfigKey::ObjectId => self.object_id.clone(), - AzureConfigKey::MsiResourceId => self.msi_resource_id.clone(), - AzureConfigKey::FederatedTokenFile => self.federated_token_file.clone(), - AzureConfigKey::UseAzureCli => Some(self.use_azure_cli.to_string()), - AzureConfigKey::Client(key) => self.client_options.get_config_value(key), - AzureConfigKey::ContainerName => self.container_name.clone(), - } - } - - /// Sets properties on this builder based on a URL - /// - /// This is a separate member function to allow fallible computation to - /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] - fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; - - let validate = |s: &str| match s.contains('.') { - true => Err(UrlNotRecognisedSnafu { url }.build()), - false => Ok(s.to_string()), - }; - - match parsed.scheme() { - "az" | "adl" | "azure" => self.container_name = Some(validate(host)?), - "abfs" | "abfss" => { - // abfs(s) might refer to the fsspec convention abfs:/// - // or the convention for the hadoop driver abfs[s]://@.dfs.core.windows.net/ - if parsed.username().is_empty() { - self.container_name = Some(validate(host)?); - } else if let Some(a) = host.strip_suffix(".dfs.core.windows.net") { - self.container_name = Some(validate(parsed.username())?); - self.account_name = Some(validate(a)?); - } else if let Some(a) = host.strip_suffix(".dfs.fabric.microsoft.com") { - self.container_name = Some(validate(parsed.username())?); - self.account_name = Some(validate(a)?); - self.use_fabric_endpoint = true.into(); - } else { - return Err(UrlNotRecognisedSnafu { url }.build().into()); - } - } - "https" => match host.split_once('.') { - Some((a, "dfs.core.windows.net")) - | Some((a, "blob.core.windows.net")) => { - self.account_name = Some(validate(a)?); - } - Some((a, "dfs.fabric.microsoft.com")) - | Some((a, "blob.fabric.microsoft.com")) => { - self.account_name = Some(validate(a)?); - // Attempt to infer the container name from the URL - // - https://onelake.dfs.fabric.microsoft.com///Files/test.csv - // - https://onelake.dfs.fabric.microsoft.com//.// - // - // See - if let Some(workspace) = parsed.path_segments().unwrap().next() { - if !workspace.is_empty() { - self.container_name = Some(workspace.to_string()) - } - } - self.use_fabric_endpoint = 
true.into(); - } - _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), - }, - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), - } - Ok(()) - } - - /// Set the Azure Account (required) - pub fn with_account(mut self, account: impl Into) -> Self { - self.account_name = Some(account.into()); - self - } - - /// Set the Azure Container Name (required) - pub fn with_container_name(mut self, container_name: impl Into) -> Self { - self.container_name = Some(container_name.into()); - self - } - - /// Set the Azure Access Key (required - one of access key, bearer token, or client credentials) - pub fn with_access_key(mut self, access_key: impl Into) -> Self { - self.access_key = Some(access_key.into()); - self - } - - /// Set a static bearer token to be used for authorizing requests - pub fn with_bearer_token_authorization( - mut self, - bearer_token: impl Into, - ) -> Self { - self.bearer_token = Some(bearer_token.into()); - self - } - - /// Set a client secret used for client secret authorization - pub fn with_client_secret_authorization( - mut self, - client_id: impl Into, - client_secret: impl Into, - tenant_id: impl Into, - ) -> Self { - self.client_id = Some(client_id.into()); - self.client_secret = Some(client_secret.into()); - self.tenant_id = Some(tenant_id.into()); - self - } - - /// Sets the client id for use in client secret or k8s federated credential flow - pub fn with_client_id(mut self, client_id: impl Into) -> Self { - self.client_id = Some(client_id.into()); - self - } - - /// Sets the client secret for use in client secret flow - pub fn with_client_secret(mut self, client_secret: impl Into) -> Self { - self.client_secret = Some(client_secret.into()); - self - } - - /// Sets the tenant id for use in client secret or k8s federated credential flow - pub fn with_tenant_id(mut self, tenant_id: impl Into) -> Self { - self.tenant_id = Some(tenant_id.into()); - self - } - - /// Set query pairs appended to the url for shared access signature authorization - pub fn with_sas_authorization( - mut self, - query_pairs: impl Into>, - ) -> Self { - self.sas_query_pairs = Some(query_pairs.into()); - self - } - - /// Set the credential provider overriding any other options - pub fn with_credentials(mut self, credentials: AzureCredentialProvider) -> Self { - self.credentials = Some(credentials); - self - } - - /// Set if the Azure emulator should be used (defaults to false) - pub fn with_use_emulator(mut self, use_emulator: bool) -> Self { - self.use_emulator = use_emulator.into(); - self - } - - /// Override the endpoint used to communicate with blob storage - /// - /// Defaults to `https://{account}.blob.core.windows.net` - pub fn with_endpoint(mut self, endpoint: String) -> Self { - self.endpoint = Some(endpoint); - self - } - - /// Set if Microsoft Fabric url scheme should be used (defaults to false) - /// When disabled the url scheme used is `https://{account}.blob.core.windows.net` - /// When enabled the url scheme used is `https://{account}.dfs.fabric.microsoft.com` - /// - /// Note: [`Self::with_endpoint`] will take precedence over this option - pub fn with_use_fabric_endpoint(mut self, use_fabric_endpoint: bool) -> Self { - self.use_fabric_endpoint = use_fabric_endpoint.into(); - self - } - - /// Sets what protocol is allowed. 
If `allow_http` is : - /// * false (default): Only HTTPS are allowed - /// * true: HTTP and HTTPS are allowed - pub fn with_allow_http(mut self, allow_http: bool) -> Self { - self.client_options = self.client_options.with_allow_http(allow_http); - self - } - - /// Sets an alternative authority host for OAuth based authorization - /// common hosts for azure clouds are defined in [authority_hosts]. - /// Defaults to - pub fn with_authority_host(mut self, authority_host: impl Into) -> Self { - self.authority_host = Some(authority_host.into()); - self - } - - /// Set the retry configuration - pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { - self.retry_config = retry_config; - self - } - - /// Set the proxy_url to be used by the underlying client - pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { - self.client_options = self.client_options.with_proxy_url(proxy_url); - self - } - - /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { - self.client_options = self - .client_options - .with_proxy_ca_certificate(proxy_ca_certificate); - self - } - - /// Set a list of hosts to exclude from proxy connections - pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { - self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); - self - } - - /// Sets the client options, overriding any already set - pub fn with_client_options(mut self, options: ClientOptions) -> Self { - self.client_options = options; - self - } - - /// Sets the endpoint for acquiring managed identity token - pub fn with_msi_endpoint(mut self, msi_endpoint: impl Into) -> Self { - self.msi_endpoint = Some(msi_endpoint.into()); - self - } - - /// Sets a file path for acquiring azure federated identity token in k8s - /// - /// requires `client_id` and `tenant_id` to be set - pub fn with_federated_token_file( - mut self, - federated_token_file: impl Into, - ) -> Self { - self.federated_token_file = Some(federated_token_file.into()); - self - } - - /// Set if the Azure Cli should be used for acquiring access token - /// - pub fn with_use_azure_cli(mut self, use_azure_cli: bool) -> Self { - self.use_azure_cli = use_azure_cli.into(); - self - } - - /// Configure a connection to container with given name on Microsoft Azure - /// Blob store. - pub fn build(mut self) -> Result { - if let Some(url) = self.url.take() { - self.parse_url(&url)?; - } - - let container = self.container_name.ok_or(Error::MissingContainerName {})?; - - let static_creds = |credential: AzureCredential| -> AzureCredentialProvider { - Arc::new(StaticCredentialProvider::new(credential)) - }; - - let (is_emulator, storage_url, auth, account) = if self.use_emulator.get()? { - let account_name = self - .account_name - .unwrap_or_else(|| EMULATOR_ACCOUNT.to_string()); - // Allow overriding defaults. 
Values taken from - // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 - let url = url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; - let account_key = self - .access_key - .unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); - - let credential = static_creds(AzureCredential::AccessKey(account_key)); - - self.client_options = self.client_options.with_allow_http(true); - (true, url, credential, account_name) - } else { - let account_name = self.account_name.ok_or(Error::MissingAccount {})?; - let account_url = match self.endpoint { - Some(account_url) => account_url, - None => match self.use_fabric_endpoint.get()? { - true => { - format!("https://{}.blob.fabric.microsoft.com", &account_name) - } - false => format!("https://{}.blob.core.windows.net", &account_name), - }, - }; - - let url = Url::parse(&account_url) - .context(UnableToParseUrlSnafu { url: account_url })?; - - let credential = if let Some(credential) = self.credentials { - credential - } else if let Some(bearer_token) = self.bearer_token { - static_creds(AzureCredential::BearerToken(bearer_token)) - } else if let Some(access_key) = self.access_key { - static_creds(AzureCredential::AccessKey(access_key)) - } else if let (Some(client_id), Some(tenant_id), Some(federated_token_file)) = - (&self.client_id, &self.tenant_id, self.federated_token_file) - { - let client_credential = credential::WorkloadIdentityOAuthProvider::new( - client_id, - federated_token_file, - tenant_id, - self.authority_host, - ); - Arc::new(TokenCredentialProvider::new( - client_credential, - self.client_options.client()?, - self.retry_config.clone(), - )) as _ - } else if let (Some(client_id), Some(client_secret), Some(tenant_id)) = - (&self.client_id, self.client_secret, &self.tenant_id) - { - let client_credential = credential::ClientSecretOAuthProvider::new( - client_id.clone(), - client_secret, - tenant_id, - self.authority_host, - ); - Arc::new(TokenCredentialProvider::new( - client_credential, - self.client_options.client()?, - self.retry_config.clone(), - )) as _ - } else if let Some(query_pairs) = self.sas_query_pairs { - static_creds(AzureCredential::SASToken(query_pairs)) - } else if let Some(sas) = self.sas_key { - static_creds(AzureCredential::SASToken(split_sas(&sas)?)) - } else if self.use_azure_cli.get()? { - Arc::new(credential::AzureCliCredential::new()) as _ - } else { - let msi_credential = credential::ImdsManagedIdentityProvider::new( - self.client_id, - self.object_id, - self.msi_resource_id, - self.msi_endpoint, - ); - Arc::new(TokenCredentialProvider::new( - msi_credential, - self.client_options.metadata_client()?, - self.retry_config.clone(), - )) as _ - }; - (false, url, credential, account_name) - }; - - let config = client::AzureConfig { - account, - is_emulator, - container, - retry_config: self.retry_config, - client_options: self.client_options, - service: storage_url, - credentials: auth, - }; - - let client = Arc::new(client::AzureClient::new(config)?); - - Ok(MicrosoftAzure { client }) - } -} - -/// Parses the contents of the environment variable `env_name` as a URL -/// if present, otherwise falls back to default_url -fn url_from_env(env_name: &str, default_url: &str) -> Result { - let url = match std::env::var(env_name) { - Ok(env_value) => { - Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { - env_name, - env_value, - })? 
- } - Err(_) => Url::parse(default_url).expect("Failed to parse default URL"), - }; - Ok(url) -} - -fn split_sas(sas: &str) -> Result, Error> { - let sas = percent_decode_str(sas) - .decode_utf8() - .context(DecodeSasKeySnafu {})?; - let kv_str_pairs = sas - .trim_start_matches('?') - .split('&') - .filter(|s| !s.chars().all(char::is_whitespace)); - let mut pairs = Vec::new(); - for kv_pair_str in kv_str_pairs { - let (k, v) = kv_pair_str - .trim() - .split_once('=') - .ok_or(Error::MissingSasComponent {})?; - pairs.push((k.into(), v.into())) - } - Ok(pairs) -} - #[cfg(test)] mod tests { use super::*; @@ -1133,7 +205,6 @@ mod tests { copy_if_not_exists, get_opts, list_uses_directories_correctly, list_with_delimiter, put_get_delete_list_opts, rename_and_copy, stream_get, }; - use std::collections::HashMap; #[tokio::test] async fn azure_blob_test() { @@ -1149,118 +220,6 @@ mod tests { stream_get(&integration).await; } - #[test] - fn azure_blob_test_urls() { - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("abfss://file_system@account.dfs.core.windows.net/") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name, Some("file_system".to_string())); - assert!(!builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("abfss://file_system@account.dfs.fabric.microsoft.com/") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name, Some("file_system".to_string())); - assert!(builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder.parse_url("abfs://container/path").unwrap(); - assert_eq!(builder.container_name, Some("container".to_string())); - - let mut builder = MicrosoftAzureBuilder::new(); - builder.parse_url("az://container").unwrap(); - assert_eq!(builder.container_name, Some("container".to_string())); - - let mut builder = MicrosoftAzureBuilder::new(); - builder.parse_url("az://container/path").unwrap(); - assert_eq!(builder.container_name, Some("container".to_string())); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("https://account.dfs.core.windows.net/") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert!(!builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("https://account.blob.core.windows.net/") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert!(!builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("https://account.dfs.fabric.microsoft.com/") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name, None); - assert!(builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("https://account.dfs.fabric.microsoft.com/container") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name.as_deref(), Some("container")); - assert!(builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("https://account.blob.fabric.microsoft.com/") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name, None); - 
assert!(builder.use_fabric_endpoint.get().unwrap()); - - let mut builder = MicrosoftAzureBuilder::new(); - builder - .parse_url("https://account.blob.fabric.microsoft.com/container") - .unwrap(); - assert_eq!(builder.account_name, Some("account".to_string())); - assert_eq!(builder.container_name.as_deref(), Some("container")); - assert!(builder.use_fabric_endpoint.get().unwrap()); - - let err_cases = [ - "mailto://account.blob.core.windows.net/", - "az://blob.mydomain/", - "abfs://container.foo/path", - "abfss://file_system@account.foo.dfs.core.windows.net/", - "abfss://file_system.bar@account.dfs.core.windows.net/", - "https://blob.mydomain/", - "https://blob.foo.dfs.core.windows.net/", - ]; - let mut builder = MicrosoftAzureBuilder::new(); - for case in err_cases { - builder.parse_url(case).unwrap_err(); - } - } - - #[test] - fn azure_test_config_from_map() { - let azure_client_id = "object_store:fake_access_key_id"; - let azure_storage_account_name = "object_store:fake_secret_key"; - let azure_storage_token = "object_store:fake_default_region"; - let options = HashMap::from([ - ("azure_client_id", azure_client_id), - ("azure_storage_account_name", azure_storage_account_name), - ("azure_storage_token", azure_storage_token), - ]); - - let builder = options - .into_iter() - .fold(MicrosoftAzureBuilder::new(), |builder, (key, value)| { - builder.with_config(key.parse().unwrap(), value) - }); - assert_eq!(builder.client_id.unwrap(), azure_client_id); - assert_eq!(builder.account_name.unwrap(), azure_storage_account_name); - assert_eq!(builder.bearer_token.unwrap(), azure_storage_token); - } - #[test] fn azure_test_config_get_value() { let azure_client_id = "object_store:fake_access_key_id".to_string(); @@ -1286,22 +245,4 @@ mod tests { azure_storage_token ); } - - #[test] - fn azure_test_split_sas() { - let raw_sas = "?sv=2021-10-04&st=2023-01-04T17%3A48%3A57Z&se=2023-01-04T18%3A15%3A00Z&sr=c&sp=rcwl&sig=C7%2BZeEOWbrxPA3R0Cw%2Fw1EZz0%2B4KBvQexeKZKe%2BB6h0%3D"; - let expected = vec![ - ("sv".to_string(), "2021-10-04".to_string()), - ("st".to_string(), "2023-01-04T17:48:57Z".to_string()), - ("se".to_string(), "2023-01-04T18:15:00Z".to_string()), - ("sr".to_string(), "c".to_string()), - ("sp".to_string(), "rcwl".to_string()), - ( - "sig".to_string(), - "C7+ZeEOWbrxPA3R0Cw/w1EZz0+4KBvQexeKZKe+B6h0=".to_string(), - ), - ]; - let pairs = split_sas(raw_sas).unwrap(); - assert_eq!(expected, pairs); - } } From efd4d1900a9d2cadd9393ab8c8b4eac77f6b88b5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 19 Oct 2023 13:41:17 +0100 Subject: [PATCH 1291/1411] Add module links in docs root (#4955) --- object_store/src/lib.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 018f0f5e8dec..86313616be1b 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -64,19 +64,19 @@ //! #![cfg_attr( feature = "gcp", - doc = "* `gcp`: [Google Cloud Storage](https://cloud.google.com/storage/) support. See [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)" + doc = "* [`gcp`]: [Google Cloud Storage](https://cloud.google.com/storage/) support. See [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)" )] #![cfg_attr( feature = "aws", - doc = "* `aws`: [Amazon S3](https://aws.amazon.com/s3/). See [`AmazonS3Builder`](aws::AmazonS3Builder)" + doc = "* [`aws`]: [Amazon S3](https://aws.amazon.com/s3/). 
See [`AmazonS3Builder`](aws::AmazonS3Builder)" )] #![cfg_attr( feature = "azure", - doc = "* `azure`: [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/). See [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)" + doc = "* [`azure`]: [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/). See [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)" )] #![cfg_attr( feature = "http", - doc = "* `http`: [HTTP/WebDAV Storage](https://datatracker.ietf.org/doc/html/rfc2518). See [`HttpBuilder`](http::HttpBuilder)" + doc = "* [`http`]: [HTTP/WebDAV Storage](https://datatracker.ietf.org/doc/html/rfc2518). See [`HttpBuilder`](http::HttpBuilder)" )] //! //! # Adapters From f597d3a6874264ebd9cf28a0d07a7fae52df440b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 19 Oct 2023 13:45:43 +0100 Subject: [PATCH 1292/1411] Split gcp Module (#4956) * Split out GCP client * Split out builder * RAT --- object_store/src/gcp/builder.rs | 705 ++++++++++++++++++++ object_store/src/gcp/client.rs | 446 +++++++++++++ object_store/src/gcp/mod.rs | 1097 +------------------------------ 3 files changed, 1177 insertions(+), 1071 deletions(-) create mode 100644 object_store/src/gcp/builder.rs create mode 100644 object_store/src/gcp/client.rs diff --git a/object_store/src/gcp/builder.rs b/object_store/src/gcp/builder.rs new file mode 100644 index 000000000000..920ab8b2a9b5 --- /dev/null +++ b/object_store/src/gcp/builder.rs @@ -0,0 +1,705 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::TokenCredentialProvider; +use crate::gcp::client::{GoogleCloudStorageClient, GoogleCloudStorageConfig}; +use crate::gcp::credential::{ + ApplicationDefaultCredentials, InstanceCredentialProvider, ServiceAccountCredentials, + DEFAULT_GCS_BASE_URL, +}; +use crate::gcp::{ + credential, GcpCredential, GcpCredentialProvider, GoogleCloudStorage, STORE, +}; +use crate::{ + ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider, +}; +use serde::{Deserialize, Serialize}; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::str::FromStr; +use std::sync::Arc; +use url::Url; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("Missing bucket name"))] + MissingBucketName {}, + + #[snafu(display( + "One of service account path or service account key may be provided." + ))] + ServiceAccountPathAndKeyProvided, + + #[snafu(display("Unable parse source url. 
Url: {}, Error: {}", url, source))] + UnableToParseUrl { + source: url::ParseError, + url: String, + }, + + #[snafu(display( + "Unknown url scheme cannot be parsed into storage location: {}", + scheme + ))] + UnknownUrlScheme { scheme: String }, + + #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + UrlNotRecognised { url: String }, + + #[snafu(display("Configuration key: '{}' is not known.", key))] + UnknownConfigurationKey { key: String }, + + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, + + #[snafu(display("GCP credential error: {}", source))] + Credential { source: credential::Error }, +} + +impl From for crate::Error { + fn from(err: Error) -> Self { + match err { + Error::UnknownConfigurationKey { key } => { + Self::UnknownConfigurationKey { store: STORE, key } + } + _ => Self::Generic { + store: STORE, + source: Box::new(err), + }, + } + } +} + +/// Configure a connection to Google Cloud Storage using the specified +/// credentials. +/// +/// # Example +/// ``` +/// # let BUCKET_NAME = "foo"; +/// # let SERVICE_ACCOUNT_PATH = "/tmp/foo.json"; +/// # use object_store::gcp::GoogleCloudStorageBuilder; +/// let gcs = GoogleCloudStorageBuilder::new() +/// .with_service_account_path(SERVICE_ACCOUNT_PATH) +/// .with_bucket_name(BUCKET_NAME) +/// .build(); +/// ``` +#[derive(Debug, Clone)] +pub struct GoogleCloudStorageBuilder { + /// Bucket name + bucket_name: Option, + /// Url + url: Option, + /// Path to the service account file + service_account_path: Option, + /// The serialized service account key + service_account_key: Option, + /// Path to the application credentials file. + application_credentials_path: Option, + /// Retry config + retry_config: RetryConfig, + /// Client options + client_options: ClientOptions, + /// Credentials + credentials: Option, +} + +/// Configuration keys for [`GoogleCloudStorageBuilder`] +/// +/// Configuration via keys can be done via [`GoogleCloudStorageBuilder::with_config`] +/// +/// # Example +/// ``` +/// # use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; +/// let builder = GoogleCloudStorageBuilder::new() +/// .with_config("google_service_account".parse().unwrap(), "my-service-account") +/// .with_config(GoogleConfigKey::Bucket, "my-bucket"); +/// ``` +#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] +#[non_exhaustive] +pub enum GoogleConfigKey { + /// Path to the service account file + /// + /// Supported keys: + /// - `google_service_account` + /// - `service_account` + /// - `google_service_account_path` + /// - `service_account_path` + ServiceAccount, + + /// The serialized service account key. + /// + /// Supported keys: + /// - `google_service_account_key` + /// - `service_account_key` + ServiceAccountKey, + + /// Bucket name + /// + /// See [`GoogleCloudStorageBuilder::with_bucket_name`] for details. + /// + /// Supported keys: + /// - `google_bucket` + /// - `google_bucket_name` + /// - `bucket` + /// - `bucket_name` + Bucket, + + /// Application credentials path + /// + /// See [`GoogleCloudStorageBuilder::with_application_credentials`]. 
+ ApplicationCredentials, + + /// Client options + Client(ClientConfigKey), +} + +impl AsRef for GoogleConfigKey { + fn as_ref(&self) -> &str { + match self { + Self::ServiceAccount => "google_service_account", + Self::ServiceAccountKey => "google_service_account_key", + Self::Bucket => "google_bucket", + Self::ApplicationCredentials => "google_application_credentials", + Self::Client(key) => key.as_ref(), + } + } +} + +impl FromStr for GoogleConfigKey { + type Err = crate::Error; + + fn from_str(s: &str) -> Result { + match s { + "google_service_account" + | "service_account" + | "google_service_account_path" + | "service_account_path" => Ok(Self::ServiceAccount), + "google_service_account_key" | "service_account_key" => { + Ok(Self::ServiceAccountKey) + } + "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => { + Ok(Self::Bucket) + } + "google_application_credentials" => Ok(Self::ApplicationCredentials), + _ => match s.parse() { + Ok(key) => Ok(Self::Client(key)), + Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), + }, + } + } +} + +impl Default for GoogleCloudStorageBuilder { + fn default() -> Self { + Self { + bucket_name: None, + service_account_path: None, + service_account_key: None, + application_credentials_path: None, + retry_config: Default::default(), + client_options: ClientOptions::new().with_allow_http(true), + url: None, + credentials: None, + } + } +} + +impl GoogleCloudStorageBuilder { + /// Create a new [`GoogleCloudStorageBuilder`] with default values. + pub fn new() -> Self { + Default::default() + } + + /// Create an instance of [`GoogleCloudStorageBuilder`] with values pre-populated from environment variables. + /// + /// Variables extracted from environment: + /// * GOOGLE_SERVICE_ACCOUNT: location of service account file + /// * GOOGLE_SERVICE_ACCOUNT_PATH: (alias) location of service account file + /// * SERVICE_ACCOUNT: (alias) location of service account file + /// * GOOGLE_SERVICE_ACCOUNT_KEY: JSON serialized service account key + /// * GOOGLE_BUCKET: bucket name + /// * GOOGLE_BUCKET_NAME: (alias) bucket name + /// + /// # Example + /// ``` + /// use object_store::gcp::GoogleCloudStorageBuilder; + /// + /// let gcs = GoogleCloudStorageBuilder::from_env() + /// .with_bucket_name("foo") + /// .build(); + /// ``` + pub fn from_env() -> Self { + let mut builder = Self::default(); + + if let Ok(service_account_path) = std::env::var("SERVICE_ACCOUNT") { + builder.service_account_path = Some(service_account_path); + } + + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if key.starts_with("GOOGLE_") { + if let Ok(config_key) = key.to_ascii_lowercase().parse() { + builder = builder.with_config(config_key, value); + } + } + } + } + + builder + } + + /// Parse available connection info form a well-known storage URL. + /// + /// The supported url schemes are: + /// + /// - `gs:///` + /// + /// Note: Settings derived from the URL will override any others set on this builder + /// + /// # Example + /// ``` + /// use object_store::gcp::GoogleCloudStorageBuilder; + /// + /// let gcs = GoogleCloudStorageBuilder::from_env() + /// .with_url("gs://bucket/path") + /// .build(); + /// ``` + pub fn with_url(mut self, url: impl Into) -> Self { + self.url = Some(url.into()); + self + } + + /// Set an option on the builder via a key - value pair. 
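+    /// + /// Keys of the [`GoogleConfigKey::Client`] variant are forwarded to the underlying + /// [`ClientOptions`]; all other keys set the corresponding builder field.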
+ pub fn with_config(mut self, key: GoogleConfigKey, value: impl Into) -> Self { + match key { + GoogleConfigKey::ServiceAccount => { + self.service_account_path = Some(value.into()) + } + GoogleConfigKey::ServiceAccountKey => { + self.service_account_key = Some(value.into()) + } + GoogleConfigKey::Bucket => self.bucket_name = Some(value.into()), + GoogleConfigKey::ApplicationCredentials => { + self.application_credentials_path = Some(value.into()) + } + GoogleConfigKey::Client(key) => { + self.client_options = self.client_options.with_config(key, value) + } + }; + self + } + + /// Set an option on the builder via a key - value pair. + #[deprecated(note = "Use with_config")] + pub fn try_with_option( + self, + key: impl AsRef, + value: impl Into, + ) -> Result { + Ok(self.with_config(key.as_ref().parse()?, value)) + } + + /// Hydrate builder from key value pairs + #[deprecated(note = "Use with_config")] + #[allow(deprecated)] + pub fn try_with_options< + I: IntoIterator, impl Into)>, + >( + mut self, + options: I, + ) -> Result { + for (key, value) in options { + self = self.try_with_option(key, value)?; + } + Ok(self) + } + + /// Get config value via a [`GoogleConfigKey`]. + /// + /// # Example + /// ``` + /// use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; + /// + /// let builder = GoogleCloudStorageBuilder::from_env() + /// .with_service_account_key("foo"); + /// let service_account_key = builder.get_config_value(&GoogleConfigKey::ServiceAccountKey).unwrap_or_default(); + /// assert_eq!("foo", &service_account_key); + /// ``` + pub fn get_config_value(&self, key: &GoogleConfigKey) -> Option { + match key { + GoogleConfigKey::ServiceAccount => self.service_account_path.clone(), + GoogleConfigKey::ServiceAccountKey => self.service_account_key.clone(), + GoogleConfigKey::Bucket => self.bucket_name.clone(), + GoogleConfigKey::ApplicationCredentials => { + self.application_credentials_path.clone() + } + GoogleConfigKey::Client(key) => self.client_options.get_config_value(key), + } + } + + /// Sets properties on this builder based on a URL + /// + /// This is a separate member function to allow fallible computation to + /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] + fn parse_url(&mut self, url: &str) -> Result<()> { + let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; + let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + + let validate = |s: &str| match s.contains('.') { + true => Err(UrlNotRecognisedSnafu { url }.build()), + false => Ok(s.to_string()), + }; + + match parsed.scheme() { + "gs" => self.bucket_name = Some(validate(host)?), + scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + } + Ok(()) + } + + /// Set the bucket name (required) + pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { + self.bucket_name = Some(bucket_name.into()); + self + } + + /// Set the path to the service account file. + /// + /// This or [`GoogleCloudStorageBuilder::with_service_account_key`] must be + /// set. + /// + /// Example `"/tmp/gcs.json"`. + /// + /// Example contents of `gcs.json`: + /// + /// ```json + /// { + /// "gcs_base_url": "https://localhost:4443", + /// "disable_oauth": true, + /// "client_email": "", + /// "private_key": "" + /// } + /// ``` + pub fn with_service_account_path( + mut self, + service_account_path: impl Into, + ) -> Self { + self.service_account_path = Some(service_account_path.into()); + self + } + + /// Set the service account key. 
The service account must be in the JSON + /// format. + /// + /// This or [`GoogleCloudStorageBuilder::with_service_account_path`] must be + /// set. + pub fn with_service_account_key( + mut self, + service_account: impl Into, + ) -> Self { + self.service_account_key = Some(service_account.into()); + self + } + + /// Set the path to the application credentials file. + /// + /// + pub fn with_application_credentials( + mut self, + application_credentials_path: impl Into, + ) -> Self { + self.application_credentials_path = Some(application_credentials_path.into()); + self + } + + /// Set the credential provider overriding any other options + pub fn with_credentials(mut self, credentials: GcpCredentialProvider) -> Self { + self.credentials = Some(credentials); + self + } + + /// Set the retry configuration + pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { + self.retry_config = retry_config; + self + } + + /// Set the proxy_url to be used by the underlying client + pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_url(proxy_url); + self + } + + /// Set a trusted proxy CA certificate + pub fn with_proxy_ca_certificate( + mut self, + proxy_ca_certificate: impl Into, + ) -> Self { + self.client_options = self + .client_options + .with_proxy_ca_certificate(proxy_ca_certificate); + self + } + + /// Set a list of hosts to exclude from proxy connections + pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { + self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); + self + } + + /// Sets the client options, overriding any already set + pub fn with_client_options(mut self, options: ClientOptions) -> Self { + self.client_options = options; + self + } + + /// Configure a connection to Google Cloud Storage, returning a + /// new [`GoogleCloudStorage`] and consuming `self` + pub fn build(mut self) -> Result { + if let Some(url) = self.url.take() { + self.parse_url(&url)?; + } + + let bucket_name = self.bucket_name.ok_or(Error::MissingBucketName {})?; + + // First try to initialize from the service account information. + let service_account_credentials = + match (self.service_account_path, self.service_account_key) { + (Some(path), None) => Some( + ServiceAccountCredentials::from_file(path) + .context(CredentialSnafu)?, + ), + (None, Some(key)) => Some( + ServiceAccountCredentials::from_key(&key).context(CredentialSnafu)?, + ), + (None, None) => None, + (Some(_), Some(_)) => { + return Err(Error::ServiceAccountPathAndKeyProvided.into()) + } + }; + + // Then try to initialize from the application credentials file, or the environment. 
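+        // Further down, the credential provider is chosen with the following precedence: + // an explicitly supplied provider, an empty token when the service account disables + // OAuth, the service account key itself, application default credentials, and + // finally the instance metadata provider.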
+ let application_default_credentials = ApplicationDefaultCredentials::read( + self.application_credentials_path.as_deref(), + )?; + + let disable_oauth = service_account_credentials + .as_ref() + .map(|c| c.disable_oauth) + .unwrap_or(false); + + let gcs_base_url: String = service_account_credentials + .as_ref() + .and_then(|c| c.gcs_base_url.clone()) + .unwrap_or_else(|| DEFAULT_GCS_BASE_URL.to_string()); + + let credentials = if let Some(credentials) = self.credentials { + credentials + } else if disable_oauth { + Arc::new(StaticCredentialProvider::new(GcpCredential { + bearer: "".to_string(), + })) as _ + } else if let Some(credentials) = service_account_credentials { + Arc::new(TokenCredentialProvider::new( + credentials.token_provider()?, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } else if let Some(credentials) = application_default_credentials { + match credentials { + ApplicationDefaultCredentials::AuthorizedUser(token) => { + Arc::new(TokenCredentialProvider::new( + token, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } + ApplicationDefaultCredentials::ServiceAccount(token) => { + Arc::new(TokenCredentialProvider::new( + token.token_provider()?, + self.client_options.client()?, + self.retry_config.clone(), + )) as _ + } + } + } else { + Arc::new(TokenCredentialProvider::new( + InstanceCredentialProvider::default(), + self.client_options.metadata_client()?, + self.retry_config.clone(), + )) as _ + }; + + let config = GoogleCloudStorageConfig { + base_url: gcs_base_url, + credentials, + bucket_name, + retry_config: self.retry_config, + client_options: self.client_options, + }; + + Ok(GoogleCloudStorage { + client: Arc::new(GoogleCloudStorageClient::new(config)?), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + use std::io::Write; + use tempfile::NamedTempFile; + + const FAKE_KEY: &str = r#"{"private_key": "private_key", "private_key_id": "private_key_id", "client_email":"client_email", "disable_oauth":true}"#; + + #[test] + fn gcs_test_service_account_key_and_path() { + let mut tfile = NamedTempFile::new().unwrap(); + write!(tfile, "{FAKE_KEY}").unwrap(); + let _ = GoogleCloudStorageBuilder::new() + .with_service_account_key(FAKE_KEY) + .with_service_account_path(tfile.path().to_str().unwrap()) + .with_bucket_name("foo") + .build() + .unwrap_err(); + } + + #[test] + fn gcs_test_config_from_map() { + let google_service_account = "object_store:fake_service_account".to_string(); + let google_bucket_name = "object_store:fake_bucket".to_string(); + let options = HashMap::from([ + ("google_service_account", google_service_account.clone()), + ("google_bucket_name", google_bucket_name.clone()), + ]); + + let builder = options + .iter() + .fold(GoogleCloudStorageBuilder::new(), |builder, (key, value)| { + builder.with_config(key.parse().unwrap(), value) + }); + + assert_eq!( + builder.service_account_path.unwrap(), + google_service_account.as_str() + ); + assert_eq!(builder.bucket_name.unwrap(), google_bucket_name.as_str()); + } + + #[test] + fn gcs_test_config_aliases() { + // Service account path + for alias in [ + "google_service_account", + "service_account", + "google_service_account_path", + "service_account_path", + ] { + let builder = GoogleCloudStorageBuilder::new() + .with_config(alias.parse().unwrap(), "/fake/path.json"); + assert_eq!("/fake/path.json", builder.service_account_path.unwrap()); + } + + // Service account key + for alias in ["google_service_account_key", 
"service_account_key"] { + let builder = GoogleCloudStorageBuilder::new() + .with_config(alias.parse().unwrap(), FAKE_KEY); + assert_eq!(FAKE_KEY, builder.service_account_key.unwrap()); + } + + // Bucket name + for alias in [ + "google_bucket", + "google_bucket_name", + "bucket", + "bucket_name", + ] { + let builder = GoogleCloudStorageBuilder::new() + .with_config(alias.parse().unwrap(), "fake_bucket"); + assert_eq!("fake_bucket", builder.bucket_name.unwrap()); + } + } + + #[tokio::test] + async fn gcs_test_proxy_url() { + let mut tfile = NamedTempFile::new().unwrap(); + write!(tfile, "{FAKE_KEY}").unwrap(); + let service_account_path = tfile.path(); + let gcs = GoogleCloudStorageBuilder::new() + .with_service_account_path(service_account_path.to_str().unwrap()) + .with_bucket_name("foo") + .with_proxy_url("https://example.com") + .build(); + assert!(dbg!(gcs).is_ok()); + + let err = GoogleCloudStorageBuilder::new() + .with_service_account_path(service_account_path.to_str().unwrap()) + .with_bucket_name("foo") + .with_proxy_url("asdf://example.com") + .build() + .unwrap_err() + .to_string(); + + assert_eq!( + "Generic HTTP client error: builder error: unknown proxy scheme", + err + ); + } + + #[test] + fn gcs_test_urls() { + let mut builder = GoogleCloudStorageBuilder::new(); + builder.parse_url("gs://bucket/path").unwrap(); + assert_eq!(builder.bucket_name, Some("bucket".to_string())); + + let err_cases = ["mailto://bucket/path", "gs://bucket.mydomain/path"]; + let mut builder = GoogleCloudStorageBuilder::new(); + for case in err_cases { + builder.parse_url(case).unwrap_err(); + } + } + + #[test] + fn gcs_test_service_account_key_only() { + let _ = GoogleCloudStorageBuilder::new() + .with_service_account_key(FAKE_KEY) + .with_bucket_name("foo") + .build() + .unwrap(); + } + + #[test] + fn gcs_test_config_get_value() { + let google_service_account = "object_store:fake_service_account".to_string(); + let google_bucket_name = "object_store:fake_bucket".to_string(); + let builder = GoogleCloudStorageBuilder::new() + .with_config(GoogleConfigKey::ServiceAccount, &google_service_account) + .with_config(GoogleConfigKey::Bucket, &google_bucket_name); + + assert_eq!( + builder + .get_config_value(&GoogleConfigKey::ServiceAccount) + .unwrap(), + google_service_account + ); + assert_eq!( + builder.get_config_value(&GoogleConfigKey::Bucket).unwrap(), + google_bucket_name + ); + } +} diff --git a/object_store/src/gcp/client.rs b/object_store/src/gcp/client.rs new file mode 100644 index 000000000000..9141a9da8c5b --- /dev/null +++ b/object_store/src/gcp/client.rs @@ -0,0 +1,446 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::get::GetClient; +use crate::client::header::get_etag; +use crate::client::list::ListClient; +use crate::client::list_response::ListResponse; +use crate::client::retry::RetryExt; +use crate::client::GetOptionsExt; +use crate::gcp::{GcpCredential, GcpCredentialProvider, STORE}; +use crate::multipart::PartId; +use crate::path::{Path, DELIMITER}; +use crate::{ClientOptions, GetOptions, ListResult, MultipartId, Result, RetryConfig}; +use async_trait::async_trait; +use bytes::{Buf, Bytes}; +use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; +use reqwest::{header, Client, Method, Response, StatusCode}; +use serde::Serialize; +use snafu::{ResultExt, Snafu}; +use std::sync::Arc; + +#[derive(Debug, Snafu)] +enum Error { + #[snafu(display("Error performing list request: {}", source))] + ListRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error getting list response body: {}", source))] + ListResponseBody { source: reqwest::Error }, + + #[snafu(display("Got invalid list response: {}", source))] + InvalidListResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Error performing get request {}: {}", path, source))] + GetRequest { + source: crate::client::retry::Error, + path: String, + }, + + #[snafu(display("Error performing delete request {}: {}", path, source))] + DeleteRequest { + source: crate::client::retry::Error, + path: String, + }, + + #[snafu(display("Error performing put request {}: {}", path, source))] + PutRequest { + source: crate::client::retry::Error, + path: String, + }, + + #[snafu(display("Error getting put response body: {}", source))] + PutResponseBody { source: reqwest::Error }, + + #[snafu(display("Got invalid put response: {}", source))] + InvalidPutResponse { source: quick_xml::de::DeError }, + + #[snafu(display("Error performing post request {}: {}", path, source))] + PostRequest { + source: crate::client::retry::Error, + path: String, + }, + + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, +} + +impl From for crate::Error { + fn from(err: Error) -> Self { + match err { + Error::GetRequest { source, path } + | Error::DeleteRequest { source, path } + | Error::PutRequest { source, path } => source.error(STORE, path), + _ => Self::Generic { + store: STORE, + source: Box::new(err), + }, + } + } +} + +#[derive(Debug)] +pub struct GoogleCloudStorageConfig { + pub base_url: String, + + pub credentials: GcpCredentialProvider, + + pub bucket_name: String, + + pub retry_config: RetryConfig, + + pub client_options: ClientOptions, +} + +#[derive(Debug)] +pub struct GoogleCloudStorageClient { + config: GoogleCloudStorageConfig, + + client: Client, + + bucket_name_encoded: String, + + // TODO: Hook this up in tests + max_list_results: Option, +} + +impl GoogleCloudStorageClient { + pub fn new(config: GoogleCloudStorageConfig) -> Result { + let client = config.client_options.client()?; + let bucket_name_encoded = + percent_encode(config.bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); + + Ok(Self { + config, + client, + bucket_name_encoded, + max_list_results: None, + }) + } + + pub fn config(&self) -> &GoogleCloudStorageConfig { + &self.config + } + + async fn get_credential(&self) -> Result> { + self.config.credentials.get_credential().await + } + + pub fn object_url(&self, path: &Path) -> String { + let encoded = utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); + format!( + "{}/{}/{}", + self.config.base_url, 
self.bucket_name_encoded, encoded + ) + } + + /// Perform a put request + /// + /// Returns the new ETag + pub async fn put_request( + &self, + path: &Path, + payload: Bytes, + query: &T, + ) -> Result { + let credential = self.get_credential().await?; + let url = self.object_url(path); + + let content_type = self + .config + .client_options + .get_content_type(path) + .unwrap_or("application/octet-stream"); + + let response = self + .client + .request(Method::PUT, url) + .query(query) + .bearer_auth(&credential.bearer) + .header(header::CONTENT_TYPE, content_type) + .header(header::CONTENT_LENGTH, payload.len()) + .body(payload) + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: path.as_ref(), + })?; + + Ok(get_etag(response.headers()).context(MetadataSnafu)?) + } + + /// Initiate a multi-part upload + pub async fn multipart_initiate(&self, path: &Path) -> Result { + let credential = self.get_credential().await?; + let url = self.object_url(path); + + let content_type = self + .config + .client_options + .get_content_type(path) + .unwrap_or("application/octet-stream"); + + let response = self + .client + .request(Method::POST, &url) + .bearer_auth(&credential.bearer) + .header(header::CONTENT_TYPE, content_type) + .header(header::CONTENT_LENGTH, "0") + .query(&[("uploads", "")]) + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: path.as_ref(), + })?; + + let data = response.bytes().await.context(PutResponseBodySnafu)?; + let result: InitiateMultipartUploadResult = + quick_xml::de::from_reader(data.as_ref().reader()) + .context(InvalidPutResponseSnafu)?; + + Ok(result.upload_id) + } + + /// Cleanup unused parts + pub async fn multipart_cleanup( + &self, + path: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + let credential = self.get_credential().await?; + let url = self.object_url(path); + + self.client + .request(Method::DELETE, &url) + .bearer_auth(&credential.bearer) + .header(header::CONTENT_TYPE, "application/octet-stream") + .header(header::CONTENT_LENGTH, "0") + .query(&[("uploadId", multipart_id)]) + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: path.as_ref(), + })?; + + Ok(()) + } + + pub async fn multipart_complete( + &self, + path: &Path, + multipart_id: &MultipartId, + completed_parts: Vec, + ) -> Result<()> { + let upload_id = multipart_id.clone(); + let url = self.object_url(path); + + let parts = completed_parts + .into_iter() + .enumerate() + .map(|(part_number, part)| MultipartPart { + e_tag: part.content_id, + part_number: part_number + 1, + }) + .collect(); + + let credential = self.get_credential().await?; + let upload_info = CompleteMultipartUpload { parts }; + + let data = quick_xml::se::to_string(&upload_info) + .context(InvalidPutResponseSnafu)? 
+            // We cannot disable the escaping that transforms "/" to "&quote;" :(
+            // https://github.com/tafia/quick-xml/issues/362
+            // https://github.com/tafia/quick-xml/issues/350
+            .replace("&quot;", "\"");
+
+        self.client
+            .request(Method::POST, &url)
+            .bearer_auth(&credential.bearer)
+            .query(&[("uploadId", upload_id)])
+            .body(data)
+            .send_retry(&self.config.retry_config)
+            .await
+            .context(PostRequestSnafu {
+                path: path.as_ref(),
+            })?;
+
+        Ok(())
+    }
+
+    /// Perform a delete request
+    pub async fn delete_request(&self, path: &Path) -> Result<()> {
+        let credential = self.get_credential().await?;
+        let url = self.object_url(path);
+
+        let builder = self.client.request(Method::DELETE, url);
+        builder
+            .bearer_auth(&credential.bearer)
+            .send_retry(&self.config.retry_config)
+            .await
+            .context(DeleteRequestSnafu {
+                path: path.as_ref(),
+            })?;
+
+        Ok(())
+    }
+
+    /// Perform a copy request
+    pub async fn copy_request(
+        &self,
+        from: &Path,
+        to: &Path,
+        if_not_exists: bool,
+    ) -> Result<()> {
+        let credential = self.get_credential().await?;
+        let url = self.object_url(to);
+
+        let from = utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC);
+        let source = format!("{}/{}", self.bucket_name_encoded, from);
+
+        let mut builder = self
+            .client
+            .request(Method::PUT, url)
+            .header("x-goog-copy-source", source);
+
+        if if_not_exists {
+            builder = builder.header("x-goog-if-generation-match", 0);
+        }
+
+        builder
+            .bearer_auth(&credential.bearer)
+            // Needed if reqwest is compiled with native-tls instead of rustls-tls
+            // See https://github.com/apache/arrow-rs/pull/3921
+            .header(header::CONTENT_LENGTH, 0)
+            .send_retry(&self.config.retry_config)
+            .await
+            .map_err(|err| match err.status() {
+                Some(StatusCode::PRECONDITION_FAILED) => crate::Error::AlreadyExists {
+                    source: Box::new(err),
+                    path: to.to_string(),
+                },
+                _ => err.error(STORE, from.to_string()),
+            })?;
+
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl GetClient for GoogleCloudStorageClient {
+    const STORE: &'static str = STORE;
+
+    /// Perform a get request
+    async fn get_request(&self, path: &Path, options: GetOptions) -> Result<Response> {
+        let credential = self.get_credential().await?;
+        let url = self.object_url(path);
+
+        let method = match options.head {
+            true => Method::HEAD,
+            false => Method::GET,
+        };
+
+        let mut request = self.client.request(method, url).with_get_options(options);
+
+        if !credential.bearer.is_empty() {
+            request = request.bearer_auth(&credential.bearer);
+        }
+
+        let response = request
+            .send_retry(&self.config.retry_config)
+            .await
+            .context(GetRequestSnafu {
+                path: path.as_ref(),
+            })?;
+
+        Ok(response)
+    }
+}
+
+#[async_trait]
+impl ListClient for GoogleCloudStorageClient {
+    /// Perform a list request
+    async fn list_request(
+        &self,
+        prefix: Option<&str>,
+        delimiter: bool,
+        page_token: Option<&str>,
+        offset: Option<&str>,
+    ) -> Result<(ListResult, Option<String>)> {
+        assert!(offset.is_none()); // Not yet supported
+
+        let credential = self.get_credential().await?;
+        let url = format!("{}/{}", self.config.base_url, self.bucket_name_encoded);
+
+        let mut query = Vec::with_capacity(5);
+        query.push(("list-type", "2"));
+        if delimiter {
+            query.push(("delimiter", DELIMITER))
+        }
+
+        if let Some(prefix) = &prefix {
+            query.push(("prefix", prefix))
+        }
+
+        if let Some(page_token) = page_token {
+            query.push(("continuation-token", page_token))
+        }
+
+        if let Some(max_results) = &self.max_list_results {
+            query.push(("max-keys", max_results))
+        }
+
+        let response = self
+            .client
+            .request(Method::GET, url)
+
.query(&query) + .bearer_auth(&credential.bearer) + .send_retry(&self.config.retry_config) + .await + .context(ListRequestSnafu)? + .bytes() + .await + .context(ListResponseBodySnafu)?; + + let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) + .context(InvalidListResponseSnafu)?; + + let token = response.next_continuation_token.take(); + Ok((response.try_into()?, token)) + } +} + +#[derive(serde::Deserialize, Debug)] +#[serde(rename_all = "PascalCase")] +struct InitiateMultipartUploadResult { + upload_id: String, +} + +#[derive(serde::Serialize, Debug)] +#[serde(rename_all = "PascalCase", rename(serialize = "Part"))] +struct MultipartPart { + #[serde(rename = "PartNumber")] + part_number: usize, + e_tag: String, +} + +#[derive(serde::Serialize, Debug)] +#[serde(rename_all = "PascalCase")] +struct CompleteMultipartUpload { + #[serde(rename = "Part", default)] + parts: Vec, +} diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 97755c07c671..7c69d288740c 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -29,176 +29,34 @@ //! to abort the upload and drop those unneeded parts. In addition, you may wish to //! consider implementing automatic clean up of unused parts that are older than one //! week. -use std::str::FromStr; use std::sync::Arc; -use async_trait::async_trait; -use bytes::{Buf, Bytes}; -use futures::stream::BoxStream; -use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; -use reqwest::{header, Client, Method, Response, StatusCode}; -use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; -use tokio::io::AsyncWrite; -use url::Url; - -use crate::client::get::{GetClient, GetClientExt}; -use crate::client::list::{ListClient, ListClientExt}; -use crate::client::list_response::ListResponse; -use crate::client::retry::RetryExt; -use crate::client::{ - ClientConfigKey, CredentialProvider, GetOptionsExt, StaticCredentialProvider, - TokenCredentialProvider, -}; +use crate::client::CredentialProvider; use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, - path::{Path, DELIMITER}, - ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, PutResult, Result, RetryConfig, + path::Path, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, + Result, }; +use async_trait::async_trait; +use bytes::Bytes; +use client::GoogleCloudStorageClient; +use futures::stream::BoxStream; +use tokio::io::AsyncWrite; -use credential::{InstanceCredentialProvider, ServiceAccountCredentials}; +use crate::client::get::GetClientExt; +use crate::client::list::ListClientExt; +pub use builder::{GoogleCloudStorageBuilder, GoogleConfigKey}; +pub use credential::GcpCredential; +mod builder; +mod client; mod credential; const STORE: &str = "GCS"; /// [`CredentialProvider`] for [`GoogleCloudStorage`] pub type GcpCredentialProvider = Arc>; -use crate::client::header::get_etag; -use crate::gcp::credential::{ApplicationDefaultCredentials, DEFAULT_GCS_BASE_URL}; -pub use credential::GcpCredential; - -#[derive(Debug, Snafu)] -enum Error { - #[snafu(display("Got invalid XML response for {} {}: {}", method, url, source))] - InvalidXMLResponse { - source: quick_xml::de::DeError, - method: String, - url: String, - data: Bytes, - }, - - #[snafu(display("Error performing list request: {}", source))] - ListRequest { source: crate::client::retry::Error }, - - #[snafu(display("Error getting list response body: {}", source))] - ListResponseBody 
{ source: reqwest::Error }, - - #[snafu(display("Got invalid list response: {}", source))] - InvalidListResponse { source: quick_xml::de::DeError }, - - #[snafu(display("Error performing get request {}: {}", path, source))] - GetRequest { - source: crate::client::retry::Error, - path: String, - }, - - #[snafu(display("Error getting get response body {}: {}", path, source))] - GetResponseBody { - source: reqwest::Error, - path: String, - }, - - #[snafu(display("Error performing delete request {}: {}", path, source))] - DeleteRequest { - source: crate::client::retry::Error, - path: String, - }, - - #[snafu(display("Error performing put request {}: {}", path, source))] - PutRequest { - source: crate::client::retry::Error, - path: String, - }, - - #[snafu(display("Error getting put response body: {}", source))] - PutResponseBody { source: reqwest::Error }, - - #[snafu(display("Got invalid put response: {}", source))] - InvalidPutResponse { source: quick_xml::de::DeError }, - - #[snafu(display("Error performing post request {}: {}", path, source))] - PostRequest { - source: crate::client::retry::Error, - path: String, - }, - - #[snafu(display("Error decoding object size: {}", source))] - InvalidSize { source: std::num::ParseIntError }, - - #[snafu(display("Missing bucket name"))] - MissingBucketName {}, - - #[snafu(display( - "One of service account path or service account key may be provided." - ))] - ServiceAccountPathAndKeyProvided, - - #[snafu(display("GCP credential error: {}", source))] - Credential { source: credential::Error }, - - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] - UnableToParseUrl { - source: url::ParseError, - url: String, - }, - - #[snafu(display( - "Unknown url scheme cannot be parsed into storage location: {}", - scheme - ))] - UnknownUrlScheme { scheme: String }, - - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] - UrlNotRecognised { url: String }, - - #[snafu(display("Configuration key: '{}' is not known.", key))] - UnknownConfigurationKey { key: String }, - - #[snafu(display("Unable to extract metadata from headers: {}", source))] - Metadata { - source: crate::client::header::Error, - }, -} - -impl From for super::Error { - fn from(err: Error) -> Self { - match err { - Error::GetRequest { source, path } - | Error::DeleteRequest { source, path } - | Error::PutRequest { source, path } => source.error(STORE, path), - Error::UnknownConfigurationKey { key } => { - Self::UnknownConfigurationKey { store: STORE, key } - } - _ => Self::Generic { - store: STORE, - source: Box::new(err), - }, - } - } -} - -#[derive(serde::Deserialize, Debug)] -#[serde(rename_all = "PascalCase")] -struct InitiateMultipartUploadResult { - upload_id: String, -} - -#[derive(serde::Serialize, Debug)] -#[serde(rename_all = "PascalCase", rename(serialize = "Part"))] -struct MultipartPart { - #[serde(rename = "PartNumber")] - part_number: usize, - e_tag: String, -} - -#[derive(serde::Serialize, Debug)] -#[serde(rename_all = "PascalCase")] -struct CompleteMultipartUpload { - #[serde(rename = "Part", default)] - parts: Vec, -} /// Interface for [Google Cloud Storage](https://cloud.google.com/storage/). 
#[derive(Debug)] @@ -208,271 +66,18 @@ pub struct GoogleCloudStorage { impl std::fmt::Display for GoogleCloudStorage { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "GoogleCloudStorage({})", self.client.bucket_name) + write!( + f, + "GoogleCloudStorage({})", + self.client.config().bucket_name + ) } } impl GoogleCloudStorage { /// Returns the [`GcpCredentialProvider`] used by [`GoogleCloudStorage`] pub fn credentials(&self) -> &GcpCredentialProvider { - &self.client.credentials - } -} - -#[derive(Debug)] -struct GoogleCloudStorageClient { - client: Client, - base_url: String, - - credentials: GcpCredentialProvider, - - bucket_name: String, - bucket_name_encoded: String, - - retry_config: RetryConfig, - client_options: ClientOptions, - - // TODO: Hook this up in tests - max_list_results: Option, -} - -impl GoogleCloudStorageClient { - async fn get_credential(&self) -> Result> { - self.credentials.get_credential().await - } - - fn object_url(&self, path: &Path) -> String { - let encoded = utf8_percent_encode(path.as_ref(), NON_ALPHANUMERIC); - format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, encoded) - } - - /// Perform a put request - /// - /// Returns the new ETag - async fn put_request( - &self, - path: &Path, - payload: Bytes, - query: &T, - ) -> Result { - let credential = self.get_credential().await?; - let url = self.object_url(path); - - let content_type = self - .client_options - .get_content_type(path) - .unwrap_or("application/octet-stream"); - - let response = self - .client - .request(Method::PUT, url) - .query(query) - .bearer_auth(&credential.bearer) - .header(header::CONTENT_TYPE, content_type) - .header(header::CONTENT_LENGTH, payload.len()) - .body(payload) - .send_retry(&self.retry_config) - .await - .context(PutRequestSnafu { - path: path.as_ref(), - })?; - - Ok(get_etag(response.headers()).context(MetadataSnafu)?) 
- } - - /// Initiate a multi-part upload - async fn multipart_initiate(&self, path: &Path) -> Result { - let credential = self.get_credential().await?; - let url = format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, path); - - let content_type = self - .client_options - .get_content_type(path) - .unwrap_or("application/octet-stream"); - - let response = self - .client - .request(Method::POST, &url) - .bearer_auth(&credential.bearer) - .header(header::CONTENT_TYPE, content_type) - .header(header::CONTENT_LENGTH, "0") - .query(&[("uploads", "")]) - .send_retry(&self.retry_config) - .await - .context(PutRequestSnafu { - path: path.as_ref(), - })?; - - let data = response.bytes().await.context(PutResponseBodySnafu)?; - let result: InitiateMultipartUploadResult = - quick_xml::de::from_reader(data.as_ref().reader()) - .context(InvalidPutResponseSnafu)?; - - Ok(result.upload_id) - } - - /// Cleanup unused parts - async fn multipart_cleanup( - &self, - path: &str, - multipart_id: &MultipartId, - ) -> Result<()> { - let credential = self.get_credential().await?; - let url = format!("{}/{}/{}", self.base_url, self.bucket_name_encoded, path); - - self.client - .request(Method::DELETE, &url) - .bearer_auth(&credential.bearer) - .header(header::CONTENT_TYPE, "application/octet-stream") - .header(header::CONTENT_LENGTH, "0") - .query(&[("uploadId", multipart_id)]) - .send_retry(&self.retry_config) - .await - .context(PutRequestSnafu { path })?; - - Ok(()) - } - - /// Perform a delete request - async fn delete_request(&self, path: &Path) -> Result<()> { - let credential = self.get_credential().await?; - let url = self.object_url(path); - - let builder = self.client.request(Method::DELETE, url); - builder - .bearer_auth(&credential.bearer) - .send_retry(&self.retry_config) - .await - .context(DeleteRequestSnafu { - path: path.as_ref(), - })?; - - Ok(()) - } - - /// Perform a copy request - async fn copy_request( - &self, - from: &Path, - to: &Path, - if_not_exists: bool, - ) -> Result<()> { - let credential = self.get_credential().await?; - let url = self.object_url(to); - - let from = utf8_percent_encode(from.as_ref(), NON_ALPHANUMERIC); - let source = format!("{}/{}", self.bucket_name_encoded, from); - - let mut builder = self - .client - .request(Method::PUT, url) - .header("x-goog-copy-source", source); - - if if_not_exists { - builder = builder.header("x-goog-if-generation-match", 0); - } - - builder - .bearer_auth(&credential.bearer) - // Needed if reqwest is compiled with native-tls instead of rustls-tls - // See https://github.com/apache/arrow-rs/pull/3921 - .header(header::CONTENT_LENGTH, 0) - .send_retry(&self.retry_config) - .await - .map_err(|err| match err.status() { - Some(StatusCode::PRECONDITION_FAILED) => crate::Error::AlreadyExists { - source: Box::new(err), - path: to.to_string(), - }, - _ => err.error(STORE, from.to_string()), - })?; - - Ok(()) - } -} - -#[async_trait] -impl GetClient for GoogleCloudStorageClient { - const STORE: &'static str = STORE; - - /// Perform a get request - async fn get_request(&self, path: &Path, options: GetOptions) -> Result { - let credential = self.get_credential().await?; - let url = self.object_url(path); - - let method = match options.head { - true => Method::HEAD, - false => Method::GET, - }; - - let mut request = self.client.request(method, url).with_get_options(options); - - if !credential.bearer.is_empty() { - request = request.bearer_auth(&credential.bearer); - } - - let response = - request - .send_retry(&self.retry_config) - .await - 
.context(GetRequestSnafu { - path: path.as_ref(), - })?; - - Ok(response) - } -} - -#[async_trait] -impl ListClient for GoogleCloudStorageClient { - /// Perform a list request - async fn list_request( - &self, - prefix: Option<&str>, - delimiter: bool, - page_token: Option<&str>, - offset: Option<&str>, - ) -> Result<(ListResult, Option)> { - assert!(offset.is_none()); // Not yet supported - - let credential = self.get_credential().await?; - let url = format!("{}/{}", self.base_url, self.bucket_name_encoded); - - let mut query = Vec::with_capacity(5); - query.push(("list-type", "2")); - if delimiter { - query.push(("delimiter", DELIMITER)) - } - - if let Some(prefix) = &prefix { - query.push(("prefix", prefix)) - } - - if let Some(page_token) = page_token { - query.push(("continuation-token", page_token)) - } - - if let Some(max_results) = &self.max_list_results { - query.push(("max-keys", max_results)) - } - - let response = self - .client - .request(Method::GET, url) - .query(&query) - .bearer_auth(&credential.bearer) - .send_retry(&self.retry_config) - .await - .context(ListRequestSnafu)? - .bytes() - .await - .context(ListResponseBodySnafu)?; - - let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) - .context(InvalidListResponseSnafu)?; - - let token = response.next_continuation_token.take(); - Ok((response.try_into()?, token)) + &self.client.config().credentials } } @@ -504,41 +109,9 @@ impl PutPart for GCSMultipartUpload { /// Complete a multipart upload async fn complete(&self, completed_parts: Vec) -> Result<()> { - let upload_id = self.multipart_id.clone(); - let url = self.client.object_url(&self.path); - - let parts = completed_parts - .into_iter() - .enumerate() - .map(|(part_number, part)| MultipartPart { - e_tag: part.content_id, - part_number: part_number + 1, - }) - .collect(); - - let credential = self.client.get_credential().await?; - let upload_info = CompleteMultipartUpload { parts }; - - let data = quick_xml::se::to_string(&upload_info) - .context(InvalidPutResponseSnafu)? - // We cannot disable the escaping that transforms "/" to ""e;" :( - // https://github.com/tafia/quick-xml/issues/362 - // https://github.com/tafia/quick-xml/issues/350 - .replace(""", "\""); - self.client - .client - .request(Method::POST, &url) - .bearer_auth(&credential.bearer) - .query(&[("uploadId", upload_id)]) - .body(data) - .send_retry(&self.client.retry_config) + .multipart_complete(&self.path, &self.multipart_id, completed_parts) .await - .context(PostRequestSnafu { - path: self.path.as_ref(), - })?; - - Ok(()) } } @@ -570,7 +143,7 @@ impl ObjectStore for GoogleCloudStorage { multipart_id: &MultipartId, ) -> Result<()> { self.client - .multipart_cleanup(location.as_ref(), multipart_id) + .multipart_cleanup(location, multipart_id) .await?; Ok(()) @@ -601,498 +174,16 @@ impl ObjectStore for GoogleCloudStorage { } } -/// Configure a connection to Google Cloud Storage using the specified -/// credentials. 
-/// -/// # Example -/// ``` -/// # let BUCKET_NAME = "foo"; -/// # let SERVICE_ACCOUNT_PATH = "/tmp/foo.json"; -/// # use object_store::gcp::GoogleCloudStorageBuilder; -/// let gcs = GoogleCloudStorageBuilder::new() -/// .with_service_account_path(SERVICE_ACCOUNT_PATH) -/// .with_bucket_name(BUCKET_NAME) -/// .build(); -/// ``` -#[derive(Debug, Clone)] -pub struct GoogleCloudStorageBuilder { - /// Bucket name - bucket_name: Option, - /// Url - url: Option, - /// Path to the service account file - service_account_path: Option, - /// The serialized service account key - service_account_key: Option, - /// Path to the application credentials file. - application_credentials_path: Option, - /// Retry config - retry_config: RetryConfig, - /// Client options - client_options: ClientOptions, - /// Credentials - credentials: Option, -} - -/// Configuration keys for [`GoogleCloudStorageBuilder`] -/// -/// Configuration via keys can be done via [`GoogleCloudStorageBuilder::with_config`] -/// -/// # Example -/// ``` -/// # use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; -/// let builder = GoogleCloudStorageBuilder::new() -/// .with_config("google_service_account".parse().unwrap(), "my-service-account") -/// .with_config(GoogleConfigKey::Bucket, "my-bucket"); -/// ``` -#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Serialize, Deserialize)] -#[non_exhaustive] -pub enum GoogleConfigKey { - /// Path to the service account file - /// - /// Supported keys: - /// - `google_service_account` - /// - `service_account` - /// - `google_service_account_path` - /// - `service_account_path` - ServiceAccount, - - /// The serialized service account key. - /// - /// Supported keys: - /// - `google_service_account_key` - /// - `service_account_key` - ServiceAccountKey, - - /// Bucket name - /// - /// See [`GoogleCloudStorageBuilder::with_bucket_name`] for details. - /// - /// Supported keys: - /// - `google_bucket` - /// - `google_bucket_name` - /// - `bucket` - /// - `bucket_name` - Bucket, - - /// Application credentials path - /// - /// See [`GoogleCloudStorageBuilder::with_application_credentials`]. 
- ApplicationCredentials, - - /// Client options - Client(ClientConfigKey), -} - -impl AsRef for GoogleConfigKey { - fn as_ref(&self) -> &str { - match self { - Self::ServiceAccount => "google_service_account", - Self::ServiceAccountKey => "google_service_account_key", - Self::Bucket => "google_bucket", - Self::ApplicationCredentials => "google_application_credentials", - Self::Client(key) => key.as_ref(), - } - } -} - -impl FromStr for GoogleConfigKey { - type Err = super::Error; - - fn from_str(s: &str) -> Result { - match s { - "google_service_account" - | "service_account" - | "google_service_account_path" - | "service_account_path" => Ok(Self::ServiceAccount), - "google_service_account_key" | "service_account_key" => { - Ok(Self::ServiceAccountKey) - } - "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => { - Ok(Self::Bucket) - } - "google_application_credentials" => Ok(Self::ApplicationCredentials), - _ => match s.parse() { - Ok(key) => Ok(Self::Client(key)), - Err(_) => Err(Error::UnknownConfigurationKey { key: s.into() }.into()), - }, - } - } -} - -impl Default for GoogleCloudStorageBuilder { - fn default() -> Self { - Self { - bucket_name: None, - service_account_path: None, - service_account_key: None, - application_credentials_path: None, - retry_config: Default::default(), - client_options: ClientOptions::new().with_allow_http(true), - url: None, - credentials: None, - } - } -} - -impl GoogleCloudStorageBuilder { - /// Create a new [`GoogleCloudStorageBuilder`] with default values. - pub fn new() -> Self { - Default::default() - } - - /// Create an instance of [`GoogleCloudStorageBuilder`] with values pre-populated from environment variables. - /// - /// Variables extracted from environment: - /// * GOOGLE_SERVICE_ACCOUNT: location of service account file - /// * GOOGLE_SERVICE_ACCOUNT_PATH: (alias) location of service account file - /// * SERVICE_ACCOUNT: (alias) location of service account file - /// * GOOGLE_SERVICE_ACCOUNT_KEY: JSON serialized service account key - /// * GOOGLE_BUCKET: bucket name - /// * GOOGLE_BUCKET_NAME: (alias) bucket name - /// - /// # Example - /// ``` - /// use object_store::gcp::GoogleCloudStorageBuilder; - /// - /// let gcs = GoogleCloudStorageBuilder::from_env() - /// .with_bucket_name("foo") - /// .build(); - /// ``` - pub fn from_env() -> Self { - let mut builder = Self::default(); - - if let Ok(service_account_path) = std::env::var("SERVICE_ACCOUNT") { - builder.service_account_path = Some(service_account_path); - } - - for (os_key, os_value) in std::env::vars_os() { - if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { - if key.starts_with("GOOGLE_") { - if let Ok(config_key) = key.to_ascii_lowercase().parse() { - builder = builder.with_config(config_key, value); - } - } - } - } - - builder - } - - /// Parse available connection info form a well-known storage URL. - /// - /// The supported url schemes are: - /// - /// - `gs:///` - /// - /// Note: Settings derived from the URL will override any others set on this builder - /// - /// # Example - /// ``` - /// use object_store::gcp::GoogleCloudStorageBuilder; - /// - /// let gcs = GoogleCloudStorageBuilder::from_env() - /// .with_url("gs://bucket/path") - /// .build(); - /// ``` - pub fn with_url(mut self, url: impl Into) -> Self { - self.url = Some(url.into()); - self - } - - /// Set an option on the builder via a key - value pair. 
- pub fn with_config(mut self, key: GoogleConfigKey, value: impl Into) -> Self { - match key { - GoogleConfigKey::ServiceAccount => { - self.service_account_path = Some(value.into()) - } - GoogleConfigKey::ServiceAccountKey => { - self.service_account_key = Some(value.into()) - } - GoogleConfigKey::Bucket => self.bucket_name = Some(value.into()), - GoogleConfigKey::ApplicationCredentials => { - self.application_credentials_path = Some(value.into()) - } - GoogleConfigKey::Client(key) => { - self.client_options = self.client_options.with_config(key, value) - } - }; - self - } - - /// Set an option on the builder via a key - value pair. - #[deprecated(note = "Use with_config")] - pub fn try_with_option( - self, - key: impl AsRef, - value: impl Into, - ) -> Result { - Ok(self.with_config(key.as_ref().parse()?, value)) - } - - /// Hydrate builder from key value pairs - #[deprecated(note = "Use with_config")] - #[allow(deprecated)] - pub fn try_with_options< - I: IntoIterator, impl Into)>, - >( - mut self, - options: I, - ) -> Result { - for (key, value) in options { - self = self.try_with_option(key, value)?; - } - Ok(self) - } - - /// Get config value via a [`GoogleConfigKey`]. - /// - /// # Example - /// ``` - /// use object_store::gcp::{GoogleCloudStorageBuilder, GoogleConfigKey}; - /// - /// let builder = GoogleCloudStorageBuilder::from_env() - /// .with_service_account_key("foo"); - /// let service_account_key = builder.get_config_value(&GoogleConfigKey::ServiceAccountKey).unwrap_or_default(); - /// assert_eq!("foo", &service_account_key); - /// ``` - pub fn get_config_value(&self, key: &GoogleConfigKey) -> Option { - match key { - GoogleConfigKey::ServiceAccount => self.service_account_path.clone(), - GoogleConfigKey::ServiceAccountKey => self.service_account_key.clone(), - GoogleConfigKey::Bucket => self.bucket_name.clone(), - GoogleConfigKey::ApplicationCredentials => { - self.application_credentials_path.clone() - } - GoogleConfigKey::Client(key) => self.client_options.get_config_value(key), - } - } - - /// Sets properties on this builder based on a URL - /// - /// This is a separate member function to allow fallible computation to - /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] - fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; - - let validate = |s: &str| match s.contains('.') { - true => Err(UrlNotRecognisedSnafu { url }.build()), - false => Ok(s.to_string()), - }; - - match parsed.scheme() { - "gs" => self.bucket_name = Some(validate(host)?), - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), - } - Ok(()) - } - - /// Set the bucket name (required) - pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { - self.bucket_name = Some(bucket_name.into()); - self - } - - /// Set the path to the service account file. - /// - /// This or [`GoogleCloudStorageBuilder::with_service_account_key`] must be - /// set. - /// - /// Example `"/tmp/gcs.json"`. - /// - /// Example contents of `gcs.json`: - /// - /// ```json - /// { - /// "gcs_base_url": "https://localhost:4443", - /// "disable_oauth": true, - /// "client_email": "", - /// "private_key": "" - /// } - /// ``` - pub fn with_service_account_path( - mut self, - service_account_path: impl Into, - ) -> Self { - self.service_account_path = Some(service_account_path.into()); - self - } - - /// Set the service account key. 
The service account must be in the JSON - /// format. - /// - /// This or [`GoogleCloudStorageBuilder::with_service_account_path`] must be - /// set. - pub fn with_service_account_key( - mut self, - service_account: impl Into, - ) -> Self { - self.service_account_key = Some(service_account.into()); - self - } - - /// Set the path to the application credentials file. - /// - /// - pub fn with_application_credentials( - mut self, - application_credentials_path: impl Into, - ) -> Self { - self.application_credentials_path = Some(application_credentials_path.into()); - self - } - - /// Set the credential provider overriding any other options - pub fn with_credentials(mut self, credentials: GcpCredentialProvider) -> Self { - self.credentials = Some(credentials); - self - } - - /// Set the retry configuration - pub fn with_retry(mut self, retry_config: RetryConfig) -> Self { - self.retry_config = retry_config; - self - } - - /// Set the proxy_url to be used by the underlying client - pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { - self.client_options = self.client_options.with_proxy_url(proxy_url); - self - } - - /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { - self.client_options = self - .client_options - .with_proxy_ca_certificate(proxy_ca_certificate); - self - } - - /// Set a list of hosts to exclude from proxy connections - pub fn with_proxy_excludes(mut self, proxy_excludes: impl Into) -> Self { - self.client_options = self.client_options.with_proxy_excludes(proxy_excludes); - self - } - - /// Sets the client options, overriding any already set - pub fn with_client_options(mut self, options: ClientOptions) -> Self { - self.client_options = options; - self - } - - /// Configure a connection to Google Cloud Storage, returning a - /// new [`GoogleCloudStorage`] and consuming `self` - pub fn build(mut self) -> Result { - if let Some(url) = self.url.take() { - self.parse_url(&url)?; - } - - let bucket_name = self.bucket_name.ok_or(Error::MissingBucketName {})?; - - let client = self.client_options.client()?; - - // First try to initialize from the service account information. - let service_account_credentials = - match (self.service_account_path, self.service_account_key) { - (Some(path), None) => Some( - ServiceAccountCredentials::from_file(path) - .context(CredentialSnafu)?, - ), - (None, Some(key)) => Some( - ServiceAccountCredentials::from_key(&key).context(CredentialSnafu)?, - ), - (None, None) => None, - (Some(_), Some(_)) => { - return Err(Error::ServiceAccountPathAndKeyProvided.into()) - } - }; - - // Then try to initialize from the application credentials file, or the environment. 
- let application_default_credentials = ApplicationDefaultCredentials::read( - self.application_credentials_path.as_deref(), - )?; - - let disable_oauth = service_account_credentials - .as_ref() - .map(|c| c.disable_oauth) - .unwrap_or(false); - - let gcs_base_url: String = service_account_credentials - .as_ref() - .and_then(|c| c.gcs_base_url.clone()) - .unwrap_or_else(|| DEFAULT_GCS_BASE_URL.to_string()); - - let credentials = if let Some(credentials) = self.credentials { - credentials - } else if disable_oauth { - Arc::new(StaticCredentialProvider::new(GcpCredential { - bearer: "".to_string(), - })) as _ - } else if let Some(credentials) = service_account_credentials { - Arc::new(TokenCredentialProvider::new( - credentials.token_provider()?, - self.client_options.client()?, - self.retry_config.clone(), - )) as _ - } else if let Some(credentials) = application_default_credentials { - match credentials { - ApplicationDefaultCredentials::AuthorizedUser(token) => { - Arc::new(TokenCredentialProvider::new( - token, - self.client_options.client()?, - self.retry_config.clone(), - )) as _ - } - ApplicationDefaultCredentials::ServiceAccount(token) => { - Arc::new(TokenCredentialProvider::new( - token.token_provider()?, - self.client_options.client()?, - self.retry_config.clone(), - )) as _ - } - } - } else { - Arc::new(TokenCredentialProvider::new( - InstanceCredentialProvider::default(), - self.client_options.metadata_client()?, - self.retry_config.clone(), - )) as _ - }; - - let encoded_bucket_name = - percent_encode(bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string(); - - Ok(GoogleCloudStorage { - client: Arc::new(GoogleCloudStorageClient { - client, - base_url: gcs_base_url, - credentials, - bucket_name, - bucket_name_encoded: encoded_bucket_name, - retry_config: self.retry_config, - client_options: self.client_options, - max_list_results: None, - }), - }) - } -} - #[cfg(test)] mod test { + use bytes::Bytes; - use std::collections::HashMap; - use std::io::Write; - use tempfile::NamedTempFile; + use credential::DEFAULT_GCS_BASE_URL; use crate::tests::*; use super::*; - const FAKE_KEY: &str = r#"{"private_key": "private_key", "private_key_id": "private_key_id", "client_email":"client_email", "disable_oauth":true}"#; const NON_EXISTENT_NAME: &str = "nonexistentname"; #[tokio::test] @@ -1104,7 +195,7 @@ mod test { list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; - if integration.client.base_url == DEFAULT_GCS_BASE_URL { + if integration.client.config().base_url == DEFAULT_GCS_BASE_URL { // Fake GCS server doesn't currently honor ifGenerationMatch // https://github.com/fsouza/fake-gcs-server/issues/994 copy_if_not_exists(&integration).await; @@ -1198,140 +289,4 @@ mod test { err ) } - - #[tokio::test] - async fn gcs_test_proxy_url() { - let mut tfile = NamedTempFile::new().unwrap(); - write!(tfile, "{FAKE_KEY}").unwrap(); - let service_account_path = tfile.path(); - let gcs = GoogleCloudStorageBuilder::new() - .with_service_account_path(service_account_path.to_str().unwrap()) - .with_bucket_name("foo") - .with_proxy_url("https://example.com") - .build(); - assert!(dbg!(gcs).is_ok()); - - let err = GoogleCloudStorageBuilder::new() - .with_service_account_path(service_account_path.to_str().unwrap()) - .with_bucket_name("foo") - .with_proxy_url("asdf://example.com") - .build() - .unwrap_err() - .to_string(); - - assert_eq!( - "Generic HTTP client error: builder error: unknown proxy scheme", - err - ); - } - - #[test] 
- fn gcs_test_urls() { - let mut builder = GoogleCloudStorageBuilder::new(); - builder.parse_url("gs://bucket/path").unwrap(); - assert_eq!(builder.bucket_name, Some("bucket".to_string())); - - let err_cases = ["mailto://bucket/path", "gs://bucket.mydomain/path"]; - let mut builder = GoogleCloudStorageBuilder::new(); - for case in err_cases { - builder.parse_url(case).unwrap_err(); - } - } - - #[test] - fn gcs_test_service_account_key_only() { - let _ = GoogleCloudStorageBuilder::new() - .with_service_account_key(FAKE_KEY) - .with_bucket_name("foo") - .build() - .unwrap(); - } - - #[test] - fn gcs_test_service_account_key_and_path() { - let mut tfile = NamedTempFile::new().unwrap(); - write!(tfile, "{FAKE_KEY}").unwrap(); - let _ = GoogleCloudStorageBuilder::new() - .with_service_account_key(FAKE_KEY) - .with_service_account_path(tfile.path().to_str().unwrap()) - .with_bucket_name("foo") - .build() - .unwrap_err(); - } - - #[test] - fn gcs_test_config_from_map() { - let google_service_account = "object_store:fake_service_account".to_string(); - let google_bucket_name = "object_store:fake_bucket".to_string(); - let options = HashMap::from([ - ("google_service_account", google_service_account.clone()), - ("google_bucket_name", google_bucket_name.clone()), - ]); - - let builder = options - .iter() - .fold(GoogleCloudStorageBuilder::new(), |builder, (key, value)| { - builder.with_config(key.parse().unwrap(), value) - }); - - assert_eq!( - builder.service_account_path.unwrap(), - google_service_account.as_str() - ); - assert_eq!(builder.bucket_name.unwrap(), google_bucket_name.as_str()); - } - - #[test] - fn gcs_test_config_get_value() { - let google_service_account = "object_store:fake_service_account".to_string(); - let google_bucket_name = "object_store:fake_bucket".to_string(); - let builder = GoogleCloudStorageBuilder::new() - .with_config(GoogleConfigKey::ServiceAccount, &google_service_account) - .with_config(GoogleConfigKey::Bucket, &google_bucket_name); - - assert_eq!( - builder - .get_config_value(&GoogleConfigKey::ServiceAccount) - .unwrap(), - google_service_account - ); - assert_eq!( - builder.get_config_value(&GoogleConfigKey::Bucket).unwrap(), - google_bucket_name - ); - } - - #[test] - fn gcs_test_config_aliases() { - // Service account path - for alias in [ - "google_service_account", - "service_account", - "google_service_account_path", - "service_account_path", - ] { - let builder = GoogleCloudStorageBuilder::new() - .with_config(alias.parse().unwrap(), "/fake/path.json"); - assert_eq!("/fake/path.json", builder.service_account_path.unwrap()); - } - - // Service account key - for alias in ["google_service_account_key", "service_account_key"] { - let builder = GoogleCloudStorageBuilder::new() - .with_config(alias.parse().unwrap(), FAKE_KEY); - assert_eq!(FAKE_KEY, builder.service_account_key.unwrap()); - } - - // Bucket name - for alias in [ - "google_bucket", - "google_bucket_name", - "bucket", - "bucket_name", - ] { - let builder = GoogleCloudStorageBuilder::new() - .with_config(alias.parse().unwrap(), "fake_bucket"); - assert_eq!("fake_bucket", builder.bucket_name.unwrap()); - } - } } From 7e134f4d277c0b62c27529fc15a4739de3ad0afd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 19 Oct 2023 17:19:40 +0100 Subject: [PATCH 1293/1411] Use rustfmt default line width (#4960) * Use rustfmt default line width * Further format --- arrow-arith/src/aggregate.rs | 61 +- arrow-arith/src/arithmetic.rs | 49 +- 
arrow-arith/src/arity.rs | 22 +- arrow-arith/src/bitwise.rs | 21 +- arrow-arith/src/boolean.rs | 39 +- arrow-arith/src/numeric.rs | 35 +- arrow-arith/src/temporal.rs | 108 +- arrow-array/src/arithmetic.rs | 5 +- arrow-array/src/array/binary_array.rs | 42 +- arrow-array/src/array/boolean_array.rs | 13 +- arrow-array/src/array/byte_array.rs | 18 +- arrow-array/src/array/dictionary_array.rs | 57 +- .../src/array/fixed_size_binary_array.rs | 36 +- .../src/array/fixed_size_list_array.rs | 54 +- arrow-array/src/array/list_array.rs | 80 +- arrow-array/src/array/map_array.rs | 42 +- arrow-array/src/array/mod.rs | 93 +- arrow-array/src/array/primitive_array.rs | 144 +- arrow-array/src/array/run_array.rs | 49 +- arrow-array/src/array/string_array.rs | 46 +- arrow-array/src/array/struct_array.rs | 30 +- arrow-array/src/array/union_array.rs | 16 +- arrow-array/src/builder/boolean_builder.rs | 9 +- arrow-array/src/builder/buffer_builder.rs | 10 +- .../src/builder/fixed_size_binary_builder.rs | 21 +- .../src/builder/generic_byte_run_builder.rs | 23 +- .../src/builder/generic_bytes_builder.rs | 11 +- .../generic_bytes_dictionary_builder.rs | 61 +- arrow-array/src/builder/map_builder.rs | 13 +- arrow-array/src/builder/primitive_builder.rs | 25 +- .../builder/primitive_dictionary_builder.rs | 18 +- arrow-array/src/builder/struct_builder.rs | 35 +- arrow-array/src/builder/union_builder.rs | 13 +- arrow-array/src/cast.rs | 23 +- arrow-array/src/delta.rs | 10 +- arrow-array/src/iterator.rs | 10 +- arrow-array/src/lib.rs | 3 +- arrow-array/src/numeric.rs | 56 +- arrow-array/src/record_batch.rs | 70 +- arrow-array/src/run_iterator.rs | 18 +- arrow-array/src/temporal_conversions.rs | 13 +- arrow-array/src/timezone.rs | 14 +- arrow-array/src/types.rs | 32 +- arrow-avro/src/reader/header.rs | 4 +- arrow-avro/src/reader/mod.rs | 4 +- arrow-avro/src/schema.rs | 4 +- arrow-buffer/src/bigint/div.rs | 16 +- arrow-buffer/src/bigint/mod.rs | 31 +- arrow-buffer/src/buffer/boolean.rs | 27 +- arrow-buffer/src/buffer/immutable.rs | 8 +- arrow-buffer/src/buffer/mutable.rs | 11 +- arrow-buffer/src/buffer/null.rs | 5 +- arrow-buffer/src/buffer/offset.rs | 3 +- arrow-buffer/src/buffer/ops.rs | 6 +- arrow-buffer/src/buffer/run.rs | 6 +- arrow-buffer/src/buffer/scalar.rs | 12 +- arrow-buffer/src/builder/boolean.rs | 6 +- arrow-buffer/src/bytes.rs | 6 +- arrow-buffer/src/util/bit_chunk_iterator.rs | 35 +- arrow-buffer/src/util/bit_iterator.rs | 4 +- arrow-buffer/src/util/bit_mask.rs | 38 +- arrow-cast/src/cast.rs | 1170 ++++++----------- arrow-cast/src/display.rs | 62 +- arrow-cast/src/parse.rs | 215 ++- arrow-cast/src/pretty.rs | 50 +- arrow-csv/src/reader/mod.rs | 143 +- arrow-csv/src/reader/records.rs | 19 +- arrow-csv/src/writer.rs | 45 +- arrow-data/src/data.rs | 147 +-- arrow-data/src/decimal.rs | 617 +++++---- arrow-data/src/equal/boolean.rs | 11 +- arrow-data/src/equal/fixed_binary.rs | 22 +- arrow-data/src/equal/mod.rs | 52 +- arrow-data/src/equal/primitive.rs | 22 +- arrow-data/src/equal/union.rs | 5 +- arrow-data/src/equal/utils.rs | 6 +- arrow-data/src/transform/list.rs | 15 +- arrow-data/src/transform/mod.rs | 60 +- arrow-data/src/transform/primitive.rs | 5 +- arrow-data/src/transform/utils.rs | 4 +- arrow-data/src/transform/variable_size.rs | 15 +- arrow-flight/examples/flight_sql_server.rs | 40 +- arrow-flight/examples/server.rs | 6 +- arrow-flight/src/client.rs | 19 +- arrow-flight/src/decode.rs | 33 +- arrow-flight/src/encode.rs | 128 +- arrow-flight/src/lib.rs | 5 +- arrow-flight/src/sql/client.rs | 77 +- 
arrow-flight/src/sql/metadata/db_schemas.rs | 6 +- arrow-flight/src/sql/metadata/sql_info.rs | 19 +- arrow-flight/src/sql/metadata/tables.rs | 8 +- arrow-flight/src/sql/metadata/xdbc_info.rs | 10 +- arrow-flight/src/sql/mod.rs | 5 +- arrow-flight/src/sql/server.rs | 117 +- arrow-flight/src/trailers.rs | 9 +- arrow-flight/src/utils.rs | 23 +- arrow-flight/tests/client.rs | 14 +- arrow-flight/tests/common/server.rs | 26 +- arrow-flight/tests/common/trailers_layer.rs | 4 +- arrow-flight/tests/encode_decode.rs | 41 +- arrow-flight/tests/flight_sql_client_cli.rs | 17 +- arrow-integration-test/src/datatype.rs | 20 +- arrow-integration-test/src/field.rs | 106 +- arrow-integration-test/src/lib.rs | 141 +- arrow-integration-test/src/schema.rs | 27 +- .../src/bin/arrow-json-integration-test.rs | 11 +- .../src/bin/flight-test-integration-client.rs | 3 +- .../auth_basic_proto.rs | 10 +- .../integration_test.rs | 38 +- .../src/flight_client_scenarios/middleware.rs | 3 +- .../auth_basic_proto.rs | 28 +- .../integration_test.rs | 34 +- .../src/flight_server_scenarios/middleware.rs | 9 +- arrow-integration-testing/src/lib.rs | 4 +- arrow-integration-testing/tests/ipc_reader.rs | 10 +- arrow-integration-testing/tests/ipc_writer.rs | 37 +- arrow-ipc/src/compression.rs | 26 +- arrow-ipc/src/convert.rs | 84 +- arrow-ipc/src/gen/File.rs | 47 +- arrow-ipc/src/gen/Message.rs | 148 ++- arrow-ipc/src/gen/Schema.rs | 339 ++--- arrow-ipc/src/gen/SparseTensor.rs | 244 ++-- arrow-ipc/src/gen/Tensor.rs | 188 ++- arrow-ipc/src/reader.rs | 151 +-- arrow-ipc/src/writer.rs | 222 ++-- arrow-json/src/reader/list_array.rs | 5 +- arrow-json/src/reader/map_array.rs | 5 +- arrow-json/src/reader/mod.rs | 28 +- arrow-json/src/reader/primitive_array.rs | 11 +- arrow-json/src/reader/schema.rs | 38 +- arrow-json/src/reader/serializer.rs | 18 +- arrow-json/src/reader/string_array.rs | 3 +- arrow-json/src/reader/struct_array.rs | 18 +- arrow-json/src/reader/tape.rs | 26 +- arrow-json/src/reader/timestamp_array.rs | 22 +- arrow-json/src/writer.rs | 96 +- arrow-ord/src/cmp.rs | 40 +- arrow-ord/src/comparison.rs | 621 +++------ arrow-ord/src/ord.rs | 10 +- arrow-ord/src/partition.rs | 4 +- arrow-ord/src/rank.rs | 10 +- arrow-ord/src/sort.rs | 138 +- arrow-row/src/lib.rs | 87 +- arrow-row/src/list.rs | 3 +- arrow-row/src/variable.rs | 4 +- arrow-schema/src/datatype.rs | 76 +- arrow-schema/src/ffi.rs | 28 +- arrow-schema/src/field.rs | 22 +- arrow-schema/src/schema.rs | 38 +- arrow-select/src/concat.rs | 172 +-- arrow-select/src/dictionary.rs | 38 +- arrow-select/src/filter.rs | 79 +- arrow-select/src/interleave.rs | 36 +- arrow-select/src/nullif.rs | 20 +- arrow-select/src/take.rs | 147 +-- arrow-string/src/concat_elements.rs | 18 +- arrow-string/src/length.rs | 16 +- arrow-string/src/like.rs | 27 +- arrow-string/src/predicate.rs | 19 +- arrow-string/src/regexp.rs | 47 +- arrow-string/src/substring.rs | 24 +- arrow/benches/array_data_validate.rs | 3 +- arrow/benches/array_from_vec.rs | 4 +- arrow/benches/bitwise_kernel.rs | 12 +- arrow/benches/buffer_bit_ops.rs | 12 +- arrow/benches/buffer_create.rs | 13 +- arrow/benches/builder.rs | 5 +- arrow/benches/csv_reader.rs | 15 +- arrow/benches/csv_writer.rs | 6 +- arrow/benches/decimal_validate.rs | 4 +- arrow/benches/filter_kernels.rs | 3 +- arrow/benches/interleave_kernels.rs | 3 +- arrow/benches/lexsort.rs | 8 +- arrow/benches/primitive_run_accessor.rs | 7 +- arrow/benches/primitive_run_take.rs | 4 +- arrow/benches/row_format.rs | 24 +- arrow/benches/sort_kernel.rs | 6 +- 
arrow/benches/string_run_builder.rs | 4 +- arrow/benches/string_run_iterator.rs | 4 +- arrow/benches/take_kernels.rs | 4 +- arrow/examples/builders.rs | 9 +- arrow/examples/dynamic_types.rs | 3 +- arrow/src/array/ffi.rs | 19 +- arrow/src/compute/kernels.rs | 4 +- arrow/src/datatypes/mod.rs | 4 +- arrow/src/ffi.rs | 60 +- arrow/src/ffi_stream.rs | 22 +- arrow/src/lib.rs | 3 +- arrow/src/pyarrow.rs | 16 +- arrow/src/tensor.rs | 30 +- arrow/src/util/bench_util.rs | 22 +- arrow/src/util/data_gen.rs | 39 +- arrow/tests/arithmetic.rs | 4 +- arrow/tests/array_cast.rs | 76 +- arrow/tests/array_equal.rs | 114 +- arrow/tests/array_transform.rs | 105 +- arrow/tests/array_validation.rs | 78 +- arrow/tests/csv.rs | 6 +- object_store/src/aws/builder.rs | 67 +- object_store/src/aws/client.rs | 56 +- object_store/src/aws/credential.rs | 39 +- object_store/src/aws/mod.rs | 34 +- object_store/src/aws/resolve.rs | 5 +- object_store/src/azure/builder.rs | 85 +- object_store/src/azure/client.rs | 32 +- object_store/src/azure/credential.rs | 60 +- object_store/src/azure/mod.rs | 16 +- object_store/src/buffered.rs | 48 +- object_store/src/chunked.rs | 9 +- object_store/src/client/backoff.rs | 12 +- object_store/src/client/get.rs | 13 +- object_store/src/client/mock_server.rs | 3 +- object_store/src/client/mod.rs | 37 +- object_store/src/client/retry.rs | 7 +- object_store/src/delimited.rs | 3 +- object_store/src/gcp/builder.rs | 87 +- object_store/src/gcp/client.rs | 20 +- object_store/src/gcp/credential.rs | 12 +- object_store/src/gcp/mod.rs | 9 +- object_store/src/http/client.rs | 15 +- object_store/src/http/mod.rs | 10 +- object_store/src/lib.rs | 70 +- object_store/src/limit.rs | 26 +- object_store/src/local.rs | 97 +- object_store/src/memory.rs | 20 +- object_store/src/parse.rs | 10 +- object_store/src/path/mod.rs | 30 +- object_store/src/prefix.rs | 20 +- object_store/src/signer.rs | 7 +- object_store/src/throttle.rs | 52 +- object_store/src/util.rs | 19 +- object_store/tests/get_range_file.rs | 11 +- parquet/benches/arrow_reader.rs | 157 +-- parquet/benches/arrow_writer.rs | 5 +- parquet/benches/compression.rs | 9 +- parquet/examples/read_with_rowgroup.rs | 17 +- parquet/src/arrow/arrow_reader/mod.rs | 276 ++-- parquet/src/arrow/arrow_reader/selection.rs | 26 +- parquet/src/arrow/arrow_writer/byte_array.rs | 37 +- parquet/src/arrow/arrow_writer/levels.rs | 130 +- parquet/src/arrow/arrow_writer/mod.rs | 152 +-- parquet/src/arrow/async_reader/metadata.rs | 20 +- parquet/src/arrow/async_reader/mod.rs | 144 +- parquet/src/arrow/async_reader/store.rs | 9 +- parquet/src/arrow/async_writer/mod.rs | 19 +- parquet/src/arrow/buffer/bit_util.rs | 3 +- parquet/src/arrow/buffer/dictionary_buffer.rs | 29 +- parquet/src/arrow/buffer/offset_buffer.rs | 10 +- parquet/src/arrow/decoder/delta_byte_array.rs | 10 +- parquet/src/arrow/decoder/dictionary_index.rs | 15 +- parquet/src/arrow/mod.rs | 10 +- parquet/src/arrow/record_reader/buffer.rs | 7 +- .../arrow/record_reader/definition_levels.rs | 35 +- parquet/src/arrow/record_reader/mod.rs | 17 +- parquet/src/basic.rs | 74 +- parquet/src/bin/parquet-fromcsv.rs | 44 +- parquet/src/bin/parquet-index.rs | 4 +- parquet/src/bin/parquet-layout.rs | 5 +- parquet/src/bin/parquet-read.rs | 3 +- parquet/src/bin/parquet-rewrite.rs | 46 +- parquet/src/bin/parquet-rowcount.rs | 3 +- parquet/src/bin/parquet-show-bloom-filter.rs | 4 +- parquet/src/bloom_filter/mod.rs | 25 +- parquet/src/column/reader.rs | 92 +- parquet/src/column/reader/decoder.rs | 45 +- 
parquet/src/column/writer/encoder.rs | 7 +- parquet/src/column/writer/mod.rs | 215 +-- parquet/src/data_type.rs | 84 +- parquet/src/file/footer.rs | 3 +- parquet/src/file/metadata.rs | 30 +- parquet/src/file/page_encoding_stats.rs | 4 +- parquet/src/file/page_index/index_reader.rs | 18 +- parquet/src/file/properties.rs | 29 +- parquet/src/file/reader.rs | 43 +- parquet/src/file/serialized_reader.rs | 69 +- parquet/src/file/writer.rs | 105 +- parquet/src/record/api.rs | 43 +- parquet/src/record/mod.rs | 3 +- parquet/src/record/reader.rs | 93 +- parquet/src/record/triplet.rs | 62 +- parquet/src/schema/parser.rs | 406 +++--- parquet/src/schema/printer.rs | 66 +- parquet/src/schema/types.rs | 38 +- parquet/src/schema/visitor.rs | 24 +- parquet/src/thrift.rs | 9 +- parquet/tests/arrow_writer_layout.rs | 13 +- parquet_derive/src/parquet_field.rs | 30 +- parquet_derive_test/src/lib.rs | 3 +- rustfmt.toml | 6 - 289 files changed, 4941 insertions(+), 8730 deletions(-) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 04417c666c85..0dabaa50f5f6 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -207,15 +207,15 @@ where } let iter = ArrayIter::new(array); - let sum = - iter.into_iter() - .try_fold(T::default_value(), |accumulator, value| { - if let Some(value) = value { - accumulator.add_checked(value) - } else { - Ok(accumulator) - } - })?; + let sum = iter + .into_iter() + .try_fold(T::default_value(), |accumulator, value| { + if let Some(value) = value { + accumulator.add_checked(value) + } else { + Ok(accumulator) + } + })?; Ok(Some(sum)) } @@ -230,11 +230,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeType, { - min_max_array_helper::( - array, - |a, b| (is_nan(*a) & !is_nan(*b)) || a > b, - min, - ) + min_max_array_helper::(array, |a, b| (is_nan(*a) & !is_nan(*b)) || a > b, min) } /// Returns the max of values in the array of `ArrowNumericType` type, or dictionary @@ -244,11 +240,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeType, { - min_max_array_helper::( - array, - |a, b| (!is_nan(*a) & is_nan(*b)) || a < b, - max, - ) + min_max_array_helper::(array, |a, b| (!is_nan(*a) & is_nan(*b)) || a < b, max) } fn min_max_array_helper, F, M>( @@ -501,10 +493,7 @@ mod simd { fn init_accumulator_chunk() -> Self::SimdAccumulator; /// Updates the accumulator with the values of one chunk - fn accumulate_chunk_non_null( - accumulator: &mut Self::SimdAccumulator, - chunk: T::Simd, - ); + fn accumulate_chunk_non_null(accumulator: &mut Self::SimdAccumulator, chunk: T::Simd); /// Updates the accumulator with the values of one chunk according to the given vector mask fn accumulate_chunk_nullable( @@ -602,10 +591,7 @@ mod simd { (T::init(T::default_value()), T::mask_init(false)) } - fn accumulate_chunk_non_null( - accumulator: &mut Self::SimdAccumulator, - chunk: T::Simd, - ) { + fn accumulate_chunk_non_null(accumulator: &mut Self::SimdAccumulator, chunk: T::Simd) { let acc_is_nan = !T::eq(accumulator.0, accumulator.0); let is_lt = acc_is_nan | T::lt(chunk, accumulator.0); let first_or_lt = !accumulator.1 | is_lt; @@ -627,10 +613,7 @@ mod simd { accumulator.1 |= vecmask; } - fn accumulate_scalar( - accumulator: &mut Self::ScalarAccumulator, - value: T::Native, - ) { + fn accumulate_scalar(accumulator: &mut Self::ScalarAccumulator, value: T::Native) { if !accumulator.1 { accumulator.0 = value; } else { @@ -690,10 +673,7 @@ mod simd { (T::init(T::default_value()), T::mask_init(false)) } - fn accumulate_chunk_non_null( - accumulator: &mut 
Self::SimdAccumulator, - chunk: T::Simd, - ) { + fn accumulate_chunk_non_null(accumulator: &mut Self::SimdAccumulator, chunk: T::Simd) { let chunk_is_nan = !T::eq(chunk, chunk); let is_gt = chunk_is_nan | T::gt(chunk, accumulator.0); let first_or_gt = !accumulator.1 | is_gt; @@ -715,10 +695,7 @@ mod simd { accumulator.1 |= vecmask; } - fn accumulate_scalar( - accumulator: &mut Self::ScalarAccumulator, - value: T::Native, - ) { + fn accumulate_scalar(accumulator: &mut Self::ScalarAccumulator, value: T::Native) { if !accumulator.1 { accumulator.0 = value; } else { @@ -1009,8 +986,7 @@ mod tests { #[test] fn test_primitive_array_bool_or_with_nulls() { - let a = - BooleanArray::from(vec![None, Some(false), Some(false), None, Some(false)]); + let a = BooleanArray::from(vec![None, Some(false), Some(false), None, Some(false)]); assert!(!bool_or(&a).unwrap()); } @@ -1297,8 +1273,7 @@ mod tests { assert_eq!(Some(false), min_boolean(&a)); assert_eq!(Some(true), max_boolean(&a)); - let a = - BooleanArray::from(vec![Some(false), Some(true), None, Some(false), None]); + let a = BooleanArray::from(vec![Some(false), Some(true), None, Some(false), None]); assert_eq!(Some(false), min_boolean(&a)); assert_eq!(Some(true), max_boolean(&a)); } diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 8635ce0ddd80..124614d77f97 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -48,8 +48,7 @@ fn get_fixed_point_info( ))); } - let divisor = - i256::from_i128(10).pow_wrapping((product_scale - required_scale) as u32); + let divisor = i256::from_i128(10).pow_wrapping((product_scale - required_scale) as u32); Ok((precision, product_scale, divisor)) } @@ -78,8 +77,7 @@ pub fn multiply_fixed_point_dyn( let left = left.as_any().downcast_ref::().unwrap(); let right = right.as_any().downcast_ref::().unwrap(); - multiply_fixed_point(left, right, required_scale) - .map(|a| Arc::new(a) as ArrayRef) + multiply_fixed_point(left, right, required_scale).map(|a| Arc::new(a) as ArrayRef) } (_, _) => Err(ArrowError::CastError(format!( "Unsupported data type {}, {}", @@ -113,10 +111,8 @@ pub fn multiply_fixed_point_checked( )?; if required_scale == product_scale { - return try_binary::<_, _, _, Decimal128Type>(left, right, |a, b| { - a.mul_checked(b) - })? - .with_precision_and_scale(precision, required_scale); + return try_binary::<_, _, _, Decimal128Type>(left, right, |a, b| a.mul_checked(b))? + .with_precision_and_scale(precision, required_scale); } try_binary::<_, _, _, Decimal128Type>(left, right, |a, b| { @@ -213,17 +209,16 @@ mod tests { .unwrap(); let err = mul(&a, &b).unwrap_err(); - assert!(err.to_string().contains( - "Overflow happened on: 123456789000000000000000000 * 10000000000000000000" - )); + assert!(err + .to_string() + .contains("Overflow happened on: 123456789000000000000000000 * 10000000000000000000")); // Allow precision loss. 
let result = multiply_fixed_point_checked(&a, &b, 28).unwrap(); // [1234567890] - let expected = - Decimal128Array::from(vec![12345678900000000000000000000000000000]) - .with_precision_and_scale(38, 28) - .unwrap(); + let expected = Decimal128Array::from(vec![12345678900000000000000000000000000000]) + .with_precision_and_scale(38, 28) + .unwrap(); assert_eq!(&expected, &result); assert_eq!( @@ -233,13 +228,9 @@ mod tests { // Rounding case // [0.000000000000000001, 123456789.555555555555555555, 1.555555555555555555] - let a = Decimal128Array::from(vec![ - 1, - 123456789555555555555555555, - 1555555555555555555, - ]) - .with_precision_and_scale(38, 18) - .unwrap(); + let a = Decimal128Array::from(vec![1, 123456789555555555555555555, 1555555555555555555]) + .with_precision_and_scale(38, 18) + .unwrap(); // [1.555555555555555555, 11.222222222222222222, 0.000000000000000001] let b = Decimal128Array::from(vec![1555555555555555555, 11222222222222222222, 1]) @@ -311,10 +302,9 @@ mod tests { )); let result = multiply_fixed_point(&a, &b, 28).unwrap(); - let expected = - Decimal128Array::from(vec![62946009661555981610246871926660136960]) - .with_precision_and_scale(38, 28) - .unwrap(); + let expected = Decimal128Array::from(vec![62946009661555981610246871926660136960]) + .with_precision_and_scale(38, 28) + .unwrap(); assert_eq!(&expected, &result); } @@ -338,10 +328,9 @@ mod tests { // Avoid overflow by reducing the scale. let result = multiply_fixed_point(&a, &b, 28).unwrap(); // [1234567890] - let expected = - Decimal128Array::from(vec![12345678900000000000000000000000000000]) - .with_precision_and_scale(38, 28) - .unwrap(); + let expected = Decimal128Array::from(vec![12345678900000000000000000000000000000]) + .with_precision_and_scale(38, 28) + .unwrap(); assert_eq!(&expected, &result); assert_eq!( diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index f3118d104536..ff8b82a5d943 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -49,10 +49,7 @@ where } /// See [`PrimitiveArray::try_unary`] -pub fn try_unary( - array: &PrimitiveArray, - op: F, -) -> Result, ArrowError> +pub fn try_unary(array: &PrimitiveArray, op: F) -> Result, ArrowError> where I: ArrowPrimitiveType, O: ArrowPrimitiveType, @@ -86,10 +83,7 @@ where } /// A helper function that applies a fallible unary function to a dictionary array with primitive value type. -fn try_unary_dict( - array: &DictionaryArray, - op: F, -) -> Result +fn try_unary_dict(array: &DictionaryArray, op: F) -> Result where K: ArrowDictionaryKeyType + ArrowNumericType, T: ArrowPrimitiveType, @@ -299,8 +293,7 @@ where try_binary_no_nulls(len, a, b, op) } else { let nulls = - NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()) - .unwrap(); + NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()).unwrap(); let mut buffer = BufferBuilder::::new(len); buffer.append_n_zeroed(len); @@ -308,8 +301,7 @@ where nulls.try_for_each_valid_idx(|idx| { unsafe { - *slice.get_unchecked_mut(idx) = - op(a.value_unchecked(idx), b.value_unchecked(idx))? + *slice.get_unchecked_mut(idx) = op(a.value_unchecked(idx), b.value_unchecked(idx))? 
}; Ok::<_, ArrowError>(()) })?; @@ -360,8 +352,7 @@ where try_binary_no_nulls_mut(len, a, b, op) } else { let nulls = - NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()) - .unwrap(); + NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()).unwrap(); let mut builder = a.into_builder()?; @@ -440,8 +431,7 @@ mod tests { #[test] #[allow(deprecated)] fn test_unary_f64_slice() { - let input = - Float64Array::from(vec![Some(5.1f64), None, Some(6.8), None, Some(7.2)]); + let input = Float64Array::from(vec![Some(5.1f64), None, Some(6.8), None, Some(7.2)]); let input_slice = input.slice(1, 4); let result = unary(&input_slice, |n| n.round()); assert_eq!( diff --git a/arrow-arith/src/bitwise.rs b/arrow-arith/src/bitwise.rs index a5dec4638703..c7885952f8ba 100644 --- a/arrow-arith/src/bitwise.rs +++ b/arrow-arith/src/bitwise.rs @@ -212,10 +212,8 @@ mod tests { #[test] fn test_bitwise_shift_left() { let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4), Some(8)]); - let right = - UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12), Some(u64::MAX)]); - let expected = - UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(0)]); + let right = UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12), Some(u64::MAX)]); + let expected = UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(0)]); let result = bitwise_shift_left(&left, &right).unwrap(); assert_eq!(expected, result); } @@ -224,18 +222,15 @@ mod tests { fn test_bitwise_shift_left_scalar() { let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4), Some(8)]); let scalar = 2; - let expected = - UInt64Array::from(vec![Some(4), Some(8), None, Some(16), Some(32)]); + let expected = UInt64Array::from(vec![Some(4), Some(8), None, Some(16), Some(32)]); let result = bitwise_shift_left_scalar(&left, scalar).unwrap(); assert_eq!(expected, result); } #[test] fn test_bitwise_shift_right() { - let left = - UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(3)]); - let right = - UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12), Some(65)]); + let left = UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(3)]); + let right = UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12), Some(65)]); let expected = UInt64Array::from(vec![Some(1), Some(2), None, Some(4), Some(1)]); let result = bitwise_shift_right(&left, &right).unwrap(); assert_eq!(expected, result); @@ -243,11 +238,9 @@ mod tests { #[test] fn test_bitwise_shift_right_scalar() { - let left = - UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(3)]); + let left = UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(3)]); let scalar = 2; - let expected = - UInt64Array::from(vec![Some(8), Some(512), None, Some(4096), Some(0)]); + let expected = UInt64Array::from(vec![Some(8), Some(512), None, Some(4096), Some(0)]); let result = bitwise_shift_right_scalar(&left, scalar).unwrap(); assert_eq!(expected, result); } diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index 46e5998208f1..269a36d66c2b 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -57,10 +57,7 @@ use arrow_schema::ArrowError; /// # Fails /// /// If the operands have different lengths -pub fn and_kleene( - left: &BooleanArray, - right: &BooleanArray, -) -> Result { +pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result { if left.len() != right.len() { return Err(ArrowError::ComputeError( "Cannot 
perform bitwise operation on arrays of different length".to_string(), @@ -155,10 +152,7 @@ pub fn and_kleene( /// # Fails /// /// If the operands have different lengths -pub fn or_kleene( - left: &BooleanArray, - right: &BooleanArray, -) -> Result { +pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result { if left.len() != right.len() { return Err(ArrowError::ComputeError( "Cannot perform bitwise operation on arrays of different length".to_string(), @@ -257,10 +251,7 @@ where /// let and_ab = and(&a, &b).unwrap(); /// assert_eq!(and_ab, BooleanArray::from(vec![Some(false), Some(true), None])); /// ``` -pub fn and( - left: &BooleanArray, - right: &BooleanArray, -) -> Result { +pub fn and(left: &BooleanArray, right: &BooleanArray) -> Result { binary_boolean_kernel(left, right, |a, b| a & b) } @@ -581,8 +572,7 @@ mod tests { let a = a.as_any().downcast_ref::().unwrap(); let c = not(a).unwrap(); - let expected = - BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]); + let expected = BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]); assert_eq!(c, expected); } @@ -631,12 +621,10 @@ mod tests { #[test] fn test_bool_array_and_sliced_same_offset() { let a = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, false, true, - true, + false, false, false, false, false, false, false, false, false, false, true, true, ]); let b = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, true, false, - true, + false, false, false, false, false, false, false, false, false, true, false, true, ]); let a = a.slice(8, 4); @@ -654,12 +642,10 @@ mod tests { #[test] fn test_bool_array_and_sliced_same_offset_mod8() { let a = BooleanArray::from(vec![ - false, false, true, true, false, false, false, false, false, false, false, - false, + false, false, true, true, false, false, false, false, false, false, false, false, ]); let b = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, true, false, - true, + false, false, false, false, false, false, false, false, false, true, false, true, ]); let a = a.slice(0, 4); @@ -677,8 +663,7 @@ mod tests { #[test] fn test_bool_array_and_sliced_offset1() { let a = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, false, true, - true, + false, false, false, false, false, false, false, false, false, false, true, true, ]); let b = BooleanArray::from(vec![false, true, false, true]); @@ -696,8 +681,7 @@ mod tests { fn test_bool_array_and_sliced_offset2() { let a = BooleanArray::from(vec![false, false, true, true]); let b = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, true, false, - true, + false, false, false, false, false, false, false, false, false, true, false, true, ]); let b = b.slice(8, 4); @@ -730,8 +714,7 @@ mod tests { let c = and(a, b).unwrap(); - let expected = - BooleanArray::from(vec![Some(false), Some(false), None, Some(true)]); + let expected = BooleanArray::from(vec![Some(false), Some(false), None, Some(true)]); assert_eq!(expected, c); } diff --git a/arrow-arith/src/numeric.rs b/arrow-arith/src/numeric.rs index c47731ed5125..b2c87bba5143 100644 --- a/arrow-arith/src/numeric.rs +++ b/arrow-arith/src/numeric.rs @@ -144,13 +144,13 @@ pub fn neg(array: &dyn Array) -> Result { let a = array .as_primitive::() .try_unary::<_, IntervalMonthDayNanoType, ArrowError>(|x| { - let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(x); 
- Ok(IntervalMonthDayNanoType::make_value( - months.neg_checked()?, - days.neg_checked()?, - nanos.neg_checked()?, - )) - })?; + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(x); + Ok(IntervalMonthDayNanoType::make_value( + months.neg_checked()?, + days.neg_checked()?, + nanos.neg_checked()?, + )) + })?; Ok(Arc::new(a)) } t => Err(ArrowError::InvalidArgumentError(format!( @@ -201,11 +201,7 @@ impl Op { } /// Dispatch the given `op` to the appropriate specialized kernel -fn arithmetic_op( - op: Op, - lhs: &dyn Datum, - rhs: &dyn Datum, -) -> Result { +fn arithmetic_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result { use DataType::*; use IntervalUnit::*; use TimeUnit::*; @@ -675,8 +671,7 @@ fn date_op( (Date64, Op::Sub | Op::SubWrapping, Date64) => { let l = l.as_primitive::(); let r = r.as_primitive::(); - let result = - try_op_ref!(DurationMillisecondType, l, l_s, r, r_s, l.sub_checked(r)); + let result = try_op_ref!(DurationMillisecondType, l, l_s, r, r_s, l.sub_checked(r)); return Ok(result); } _ => {} @@ -800,8 +795,7 @@ fn decimal_op( let mul_pow = result_scale - s1 + s2; // p1 - s1 + s2 + result_scale - let result_precision = - (mul_pow.saturating_add(*p1 as i8) as u8).min(T::MAX_PRECISION); + let result_precision = (mul_pow.saturating_add(*p1 as i8) as u8).min(T::MAX_PRECISION); let (l_mul, r_mul) = match mul_pow.cmp(&0) { Ordering::Greater => ( @@ -1158,7 +1152,10 @@ mod tests { .with_precision_and_scale(3, -1) .unwrap(); let err = add(&a, &b).unwrap_err().to_string(); - assert_eq!(err, "Compute error: Overflow happened on: 10 * 100000000000000000000000000000000000000"); + assert_eq!( + err, + "Compute error: Overflow happened on: 10 * 100000000000000000000000000000000000000" + ); let b = Decimal128Array::from(vec![0]) .with_precision_and_scale(1, 1) @@ -1199,9 +1196,7 @@ mod tests { "1960-01-30T04:23:20Z", ] .into_iter() - .map(|x| { - T::make_value(DateTime::parse_from_rfc3339(x).unwrap().naive_utc()).unwrap() - }) + .map(|x| T::make_value(DateTime::parse_from_rfc3339(x).unwrap().naive_utc()).unwrap()) .collect(); let a = PrimitiveArray::::new(values, None); diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index 7855b6fc6e46..a9c3de5401c1 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -23,9 +23,7 @@ use chrono::{DateTime, Datelike, NaiveDateTime, NaiveTime, Offset, Timelike}; use arrow_array::builder::*; use arrow_array::iterator::ArrayIter; -use arrow_array::temporal_conversions::{ - as_datetime, as_datetime_with_timezone, as_time, -}; +use arrow_array::temporal_conversions::{as_datetime, as_datetime_with_timezone, as_time}; use arrow_array::timezone::Tz; use arrow_array::types::*; use arrow_array::*; @@ -209,12 +207,9 @@ where } DataType::Timestamp(_, Some(tz)) => { let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<&PrimitiveArray, T, _>( - iter, - b, - tz, - |t| t.hour() as i32, - ) + extract_component_from_datetime_array::<&PrimitiveArray, T, _>(iter, b, tz, |t| { + t.hour() as i32 + }) } _ => return_compute_error_with!("hour does not support", array.data_type()), } @@ -289,9 +284,7 @@ pub fn num_days_from_monday_dyn(array: &dyn Array) -> Result( - array: &PrimitiveArray, -) -> Result +pub fn num_days_from_monday(array: &PrimitiveArray) -> Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -318,9 +311,7 @@ pub fn num_days_from_sunday_dyn(array: &dyn Array) -> Result( - array: &PrimitiveArray, -) -> Result +pub fn num_days_from_sunday(array: &PrimitiveArray) -> 
Result where T: ArrowTemporalType + ArrowNumericType, i64: From, @@ -449,11 +440,7 @@ pub fn millisecond_dyn(array: &dyn Array) -> Result { } /// Extracts the time fraction of a given temporal array as an array of integers -fn time_fraction_dyn( - array: &dyn Array, - name: &str, - op: F, -) -> Result +fn time_fraction_dyn(array: &dyn Array, name: &str, op: F) -> Result where F: Fn(NaiveDateTime) -> i32, { @@ -498,14 +485,9 @@ where } DataType::Timestamp(_, Some(tz)) => { let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { - op(t.naive_local()) - }) + extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| op(t.naive_local())) } - _ => return_compute_error_with!( - format!("{name} does not support"), - array.data_type() - ), + _ => return_compute_error_with!(format!("{name} does not support"), array.data_type()), } } @@ -559,8 +541,7 @@ mod tests { #[test] fn test_temporal_array_time64_micro_hour() { - let a: PrimitiveArray = - vec![37800000000, 86339000000].into(); + let a: PrimitiveArray = vec![37800000000, 86339000000].into(); let b = hour(&a).unwrap(); assert_eq!(10, b.value(0)); @@ -623,12 +604,10 @@ mod tests { #[test] fn test_temporal_array_timestamp_quarter_with_timezone() { // 24 * 60 * 60 = 86400 - let a = TimestampSecondArray::from(vec![86400 * 90]) - .with_timezone("+00:00".to_string()); + let a = TimestampSecondArray::from(vec![86400 * 90]).with_timezone("+00:00".to_string()); let b = quarter(&a).unwrap(); assert_eq!(2, b.value(0)); - let a = TimestampSecondArray::from(vec![86400 * 90]) - .with_timezone("-10:00".to_string()); + let a = TimestampSecondArray::from(vec![86400 * 90]).with_timezone("-10:00".to_string()); let b = quarter(&a).unwrap(); assert_eq!(1, b.value(0)); } @@ -659,12 +638,10 @@ mod tests { #[test] fn test_temporal_array_timestamp_month_with_timezone() { // 24 * 60 * 60 = 86400 - let a = TimestampSecondArray::from(vec![86400 * 31]) - .with_timezone("+00:00".to_string()); + let a = TimestampSecondArray::from(vec![86400 * 31]).with_timezone("+00:00".to_string()); let b = month(&a).unwrap(); assert_eq!(2, b.value(0)); - let a = TimestampSecondArray::from(vec![86400 * 31]) - .with_timezone("-10:00".to_string()); + let a = TimestampSecondArray::from(vec![86400 * 31]).with_timezone("-10:00".to_string()); let b = month(&a).unwrap(); assert_eq!(1, b.value(0)); } @@ -672,12 +649,10 @@ mod tests { #[test] fn test_temporal_array_timestamp_day_with_timezone() { // 24 * 60 * 60 = 86400 - let a = - TimestampSecondArray::from(vec![86400]).with_timezone("+00:00".to_string()); + let a = TimestampSecondArray::from(vec![86400]).with_timezone("+00:00".to_string()); let b = day(&a).unwrap(); assert_eq!(2, b.value(0)); - let a = - TimestampSecondArray::from(vec![86400]).with_timezone("-10:00".to_string()); + let a = TimestampSecondArray::from(vec![86400]).with_timezone("-10:00".to_string()); let b = day(&a).unwrap(); assert_eq!(1, b.value(0)); } @@ -857,8 +832,7 @@ mod tests { #[test] fn test_temporal_array_timestamp_second_with_timezone() { - let a = - TimestampSecondArray::from(vec![10, 20]).with_timezone("+00:00".to_string()); + let a = TimestampSecondArray::from(vec![10, 20]).with_timezone("+00:00".to_string()); let b = second(&a).unwrap(); assert_eq!(10, b.value(0)); assert_eq!(20, b.value(1)); @@ -866,8 +840,7 @@ mod tests { #[test] fn test_temporal_array_timestamp_minute_with_timezone() { - let a = - TimestampSecondArray::from(vec![0, 60]).with_timezone("+00:50".to_string()); + let a = 
TimestampSecondArray::from(vec![0, 60]).with_timezone("+00:50".to_string()); let b = minute(&a).unwrap(); assert_eq!(50, b.value(0)); assert_eq!(51, b.value(1)); @@ -875,48 +848,42 @@ mod tests { #[test] fn test_temporal_array_timestamp_minute_with_negative_timezone() { - let a = - TimestampSecondArray::from(vec![60 * 55]).with_timezone("-00:50".to_string()); + let a = TimestampSecondArray::from(vec![60 * 55]).with_timezone("-00:50".to_string()); let b = minute(&a).unwrap(); assert_eq!(5, b.value(0)); } #[test] fn test_temporal_array_timestamp_hour_with_timezone() { - let a = TimestampSecondArray::from(vec![60 * 60 * 10]) - .with_timezone("+01:00".to_string()); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]).with_timezone("+01:00".to_string()); let b = hour(&a).unwrap(); assert_eq!(11, b.value(0)); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_without_colon() { - let a = TimestampSecondArray::from(vec![60 * 60 * 10]) - .with_timezone("+0100".to_string()); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]).with_timezone("+0100".to_string()); let b = hour(&a).unwrap(); assert_eq!(11, b.value(0)); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_without_minutes() { - let a = TimestampSecondArray::from(vec![60 * 60 * 10]) - .with_timezone("+01".to_string()); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]).with_timezone("+01".to_string()); let b = hour(&a).unwrap(); assert_eq!(11, b.value(0)); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_without_initial_sign() { - let a = TimestampSecondArray::from(vec![60 * 60 * 10]) - .with_timezone("0100".to_string()); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]).with_timezone("0100".to_string()); let err = hour(&a).unwrap_err().to_string(); assert!(err.contains("Invalid timezone"), "{}", err); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_with_only_colon() { - let a = TimestampSecondArray::from(vec![60 * 60 * 10]) - .with_timezone("01:00".to_string()); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]).with_timezone("01:00".to_string()); let err = hour(&a).unwrap_err().to_string(); assert!(err.contains("Invalid timezone"), "{}", err); } @@ -960,10 +927,8 @@ mod tests { let b = hour_dyn(&dict).unwrap(); - let expected_dict = DictionaryArray::new( - keys.clone(), - Arc::new(Int32Array::from(vec![11, 21, 7])), - ); + let expected_dict = + DictionaryArray::new(keys.clone(), Arc::new(Int32Array::from(vec![11, 21, 7]))); let expected = Arc::new(expected_dict) as ArrayRef; assert_eq!(&expected, &b); @@ -987,8 +952,7 @@ mod tests { assert_eq!(&expected, &b); assert_eq!(&expected, &b_old); - let b = - time_fraction_dyn(&dict, "nanosecond", |t| t.nanosecond() as i32).unwrap(); + let b = time_fraction_dyn(&dict, "nanosecond", |t| t.nanosecond() as i32).unwrap(); let expected_dict = DictionaryArray::new(keys, Arc::new(Int32Array::from(vec![0, 0, 0, 0, 0]))); @@ -998,8 +962,7 @@ mod tests { #[test] fn test_year_dictionary_array() { - let a: PrimitiveArray = - vec![Some(1514764800000), Some(1550636625000)].into(); + let a: PrimitiveArray = vec![Some(1514764800000), Some(1550636625000)].into(); let keys = Int8Array::from_iter_values([0_i8, 1, 1, 0]); let dict = DictionaryArray::new(keys.clone(), Arc::new(a)); @@ -1018,24 +981,20 @@ mod tests { fn test_quarter_month_dictionary_array() { //1514764800000 -> 2018-01-01 //1566275025000 -> 2019-08-20 - let a: PrimitiveArray = - vec![Some(1514764800000), Some(1566275025000)].into(); + let a: PrimitiveArray = 
vec![Some(1514764800000), Some(1566275025000)].into(); let keys = Int8Array::from_iter_values([0_i8, 1, 1, 0]); let dict = DictionaryArray::new(keys.clone(), Arc::new(a)); let b = quarter_dyn(&dict).unwrap(); - let expected = DictionaryArray::new( - keys.clone(), - Arc::new(Int32Array::from(vec![1, 3, 3, 1])), - ); + let expected = + DictionaryArray::new(keys.clone(), Arc::new(Int32Array::from(vec![1, 3, 3, 1]))); assert_eq!(b.as_ref(), &expected); let b = month_dyn(&dict).unwrap(); - let expected = - DictionaryArray::new(keys, Arc::new(Int32Array::from(vec![1, 8, 8, 1]))); + let expected = DictionaryArray::new(keys, Arc::new(Int32Array::from(vec![1, 8, 8, 1]))); assert_eq!(b.as_ref(), &expected); } @@ -1043,8 +1002,7 @@ mod tests { fn test_num_days_from_monday_sunday_day_doy_week_dictionary_array() { //1514764800000 -> 2018-01-01 (Monday) //1550636625000 -> 2019-02-20 (Wednesday) - let a: PrimitiveArray = - vec![Some(1514764800000), Some(1550636625000)].into(); + let a: PrimitiveArray = vec![Some(1514764800000), Some(1550636625000)].into(); let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(1), Some(0), None]); let dict = DictionaryArray::new(keys.clone(), Arc::new(a)); diff --git a/arrow-array/src/arithmetic.rs b/arrow-array/src/arithmetic.rs index b0ecef70ee19..c9be39d44144 100644 --- a/arrow-array/src/arithmetic.rs +++ b/arrow-array/src/arithmetic.rs @@ -229,10 +229,7 @@ macro_rules! native_type_op { #[inline] fn pow_checked(self, exp: u32) -> Result { self.checked_pow(exp).ok_or_else(|| { - ArrowError::ComputeError(format!( - "Overflow happened on: {:?} ^ {exp:?}", - self - )) + ArrowError::ComputeError(format!("Overflow happened on: {:?} ^ {exp:?}", self)) }) } diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 75880bec30ce..6b18cbc2d9f7 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -16,9 +16,7 @@ // under the License. 
use crate::types::{ByteArrayType, GenericBinaryType}; -use crate::{ - Array, GenericByteArray, GenericListArray, GenericStringArray, OffsetSizeTrait, -}; +use crate::{Array, GenericByteArray, GenericListArray, GenericStringArray, OffsetSizeTrait}; use arrow_data::ArrayData; use arrow_schema::DataType; @@ -102,9 +100,7 @@ impl GenericBinaryArray { } } -impl From>> - for GenericBinaryArray -{ +impl From>> for GenericBinaryArray { fn from(v: Vec>) -> Self { Self::from_opt_vec(v) } @@ -376,9 +372,11 @@ mod tests { .unwrap(); let binary_array1 = GenericBinaryArray::::from(array_data1); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( - Field::new("item", DataType::UInt8, false), - )); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( + "item", + DataType::UInt8, + false, + ))); let array_data2 = ArrayData::builder(data_type) .len(3) @@ -423,9 +421,11 @@ mod tests { let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); let null_buffer = Buffer::from_slice_ref([0b101]); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( - Field::new("item", DataType::UInt8, false), - )); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( + "item", + DataType::UInt8, + false, + ))); // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) @@ -456,9 +456,7 @@ mod tests { _test_generic_binary_array_from_list_array_with_offset::(); } - fn _test_generic_binary_array_from_list_array_with_child_nulls_failed< - O: OffsetSizeTrait, - >() { + fn _test_generic_binary_array_from_list_array_with_child_nulls_failed() { let values = b"HelloArrow"; let child_data = ArrayData::builder(DataType::UInt8) .len(10) @@ -468,9 +466,11 @@ mod tests { .unwrap(); let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap()); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( - Field::new("item", DataType::UInt8, true), - )); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( + "item", + DataType::UInt8, + true, + ))); // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) @@ -558,8 +558,7 @@ mod tests { .unwrap(); let offsets: [i32; 4] = [0, 5, 5, 12]; - let data_type = - DataType::List(Arc::new(Field::new("item", DataType::UInt32, false))); + let data_type = DataType::List(Arc::new(Field::new("item", DataType::UInt32, false))); let array_data = ArrayData::builder(data_type) .len(3) .add_buffer(Buffer::from_slice_ref(offsets)) @@ -575,8 +574,7 @@ mod tests { expected = "Trying to access an element at index 4 from a BinaryArray of length 3" )] fn test_binary_array_get_value_index_out_of_bound() { - let values: [u8; 12] = - [104, 101, 108, 108, 111, 112, 97, 114, 113, 117, 101, 116]; + let values: [u8; 12] = [104, 101, 108, 108, 111, 112, 97, 114, 113, 117, 101, 116]; let offsets: [i32; 4] = [0, 5, 5, 12]; let array_data = ArrayData::builder(DataType::Binary) .len(3) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 4d19babe3e4b..a778dc92ea35 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -238,11 +238,7 @@ impl BooleanArray { /// /// This function panics if left and right are not the same length /// - pub fn from_binary( - left: T, - right: S, - mut op: F, - ) -> Self + pub fn from_binary(left: T, right: S, mut op: F) -> Self where F: FnMut(T::Item, S::Item) -> bool, { @@ -362,8 +358,7 @@ impl From for BooleanArray { 1, "BooleanArray data should 
contain a single buffer only (values buffer)" ); - let values = - BooleanBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); + let values = BooleanBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); Self { values, @@ -591,9 +586,7 @@ mod tests { } #[test] - #[should_panic( - expected = "BooleanArray expected ArrayData with type Boolean got Int32" - )] + #[should_panic(expected = "BooleanArray expected ArrayData with type Boolean got Int32")] fn test_from_array_data_validation() { let _ = BooleanArray::from(ArrayData::new_empty(&DataType::Int32)); } diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 37d8de931e99..db825bbea97d 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -197,8 +197,7 @@ impl GenericByteArray { let (_, data_len) = iter.size_hint(); let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. - let mut offsets = - MutableBuffer::new((data_len + 1) * std::mem::size_of::()); + let mut offsets = MutableBuffer::new((data_len + 1) * std::mem::size_of::()); offsets.push(T::Offset::usize_as(0)); let mut values = MutableBuffer::new(0); @@ -335,8 +334,7 @@ impl GenericByteArray { /// offset and data buffers are not shared by others. pub fn into_builder(self) -> Result, Self> { let len = self.len(); - let value_len = - T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]); + let value_len = T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]); let data = self.into_data(); let null_bit_buffer = data.nulls().map(|b| b.inner().sliced()); @@ -578,17 +576,14 @@ mod tests { let nulls = NullBuffer::new_null(3); let err = - StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone())) - .unwrap_err(); + StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone())).unwrap_err(); assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3"); - let err = - BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err(); + let err = BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err(); assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3"); let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld"); - let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None) - .unwrap_err(); + let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None).unwrap_err(); assert_eq!(err.to_string(), "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2"); BinaryArray::new(offsets, non_utf8_data, None); @@ -611,8 +606,7 @@ mod tests { BinaryArray::new(offsets, non_ascii_data.clone(), None); let offsets = OffsetBuffer::new(vec![0, 3, 10].into()); - let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None) - .unwrap_err(); + let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None).unwrap_err(); assert_eq!( err.to_string(), "Invalid argument error: Split UTF-8 codepoint at offset 3" diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 0cb00878929c..1f4d83b1c5d0 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -286,10 +286,7 @@ impl DictionaryArray { /// # Errors /// /// Returns an error if any `keys[i] >= values.len() || keys[i] < 
0` - pub fn try_new( - keys: PrimitiveArray, - values: ArrayRef, - ) -> Result { + pub fn try_new(keys: PrimitiveArray, values: ArrayRef) -> Result { let data_type = DataType::Dictionary( Box::new(keys.data_type().clone()), Box::new(values.data_type().clone()), @@ -298,9 +295,11 @@ impl DictionaryArray { let zero = K::Native::usize_as(0); let values_len = values.len(); - if let Some((idx, v)) = keys.values().iter().enumerate().find(|(idx, v)| { - (v.is_lt(zero) || v.as_usize() >= values_len) && keys.is_valid(*idx) - }) { + if let Some((idx, v)) = + keys.values().iter().enumerate().find(|(idx, v)| { + (v.is_lt(zero) || v.as_usize() >= values_len) && keys.is_valid(*idx) + }) + { return Err(ArrowError::InvalidArgumentError(format!( "Invalid dictionary key {v:?} at index {idx}, expected 0 <= key < {values_len}", ))); @@ -349,8 +348,7 @@ impl DictionaryArray { /// /// Panics if `values` is not a [`StringArray`]. pub fn lookup_key(&self, value: &str) -> Option { - let rd_buf: &StringArray = - self.values.as_any().downcast_ref::().unwrap(); + let rd_buf: &StringArray = self.values.as_any().downcast_ref::().unwrap(); (0..rd_buf.len()) .position(|i| rd_buf.value(i) == value) @@ -463,10 +461,8 @@ impl DictionaryArray { /// pub fn with_values(&self, values: ArrayRef) -> Self { assert!(values.len() >= self.values.len()); - let data_type = DataType::Dictionary( - Box::new(K::DATA_TYPE), - Box::new(values.data_type().clone()), - ); + let data_type = + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone())); Self { data_type, keys: self.keys.clone(), @@ -477,9 +473,7 @@ impl DictionaryArray { /// Returns `PrimitiveDictionaryBuilder` of this dictionary array for mutating /// its keys and values if the underlying data buffer is not shared by others. 
- pub fn into_primitive_dict_builder( - self, - ) -> Result, Self> + pub fn into_primitive_dict_builder(self) -> Result, Self> where V: ArrowPrimitiveType, { @@ -540,8 +534,7 @@ impl DictionaryArray { V: ArrowPrimitiveType, F: Fn(V::Native) -> V::Native, { - let mut builder: PrimitiveDictionaryBuilder = - self.into_primitive_dict_builder()?; + let mut builder: PrimitiveDictionaryBuilder = self.into_primitive_dict_builder()?; builder .values_slice_mut() .iter_mut() @@ -806,9 +799,7 @@ impl<'a, K: ArrowDictionaryKeyType, V> Clone for TypedDictionaryArray<'a, K, V> impl<'a, K: ArrowDictionaryKeyType, V> Copy for TypedDictionaryArray<'a, K, V> {} -impl<'a, K: ArrowDictionaryKeyType, V> std::fmt::Debug - for TypedDictionaryArray<'a, K, V> -{ +impl<'a, K: ArrowDictionaryKeyType, V> std::fmt::Debug for TypedDictionaryArray<'a, K, V> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { writeln!(f, "TypedDictionaryArray({:?})", self.dictionary) } @@ -1040,8 +1031,7 @@ mod tests { // Construct a dictionary array from the above two let key_type = DataType::Int16; let value_type = DataType::Int8; - let dict_data_type = - DataType::Dictionary(Box::new(key_type), Box::new(value_type)); + let dict_data_type = DataType::Dictionary(Box::new(key_type), Box::new(value_type)); let dict_data = ArrayData::builder(dict_data_type.clone()) .len(3) .add_buffer(keys.clone()) @@ -1079,8 +1069,7 @@ mod tests { #[test] fn test_dictionary_array_fmt_debug() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(3, 2); + let mut builder = PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(12345678).unwrap(); builder.append_null(); builder.append(22345678).unwrap(); @@ -1090,8 +1079,7 @@ mod tests { format!("{array:?}") ); - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(20, 2); + let mut builder = PrimitiveDictionaryBuilder::::with_capacity(20, 2); for _ in 0..20 { builder.append(1).unwrap(); } @@ -1267,9 +1255,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Invalid dictionary key 3 at index 1, expected 0 <= key < 2" - )] + #[should_panic(expected = "Invalid dictionary key 3 at index 1, expected 0 <= key < 2")] fn test_try_new_index_too_large() { let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); // dictionary only has 2 values, so offset 3 is out of bounds @@ -1278,9 +1264,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Invalid dictionary key -100 at index 0, expected 0 <= key < 2" - )] + #[should_panic(expected = "Invalid dictionary key -100 at index 0, expected 0 <= key < 2")] fn test_try_new_index_too_small() { let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); let keys: Int32Array = [Some(-100)].into_iter().collect(); @@ -1288,9 +1272,7 @@ mod tests { } #[test] - #[should_panic( - expected = "DictionaryArray's data type must match, expected Int64 got Int32" - )] + #[should_panic(expected = "DictionaryArray's data type must match, expected Int64 got Int32")] fn test_from_array_data_validation() { let a = DictionaryArray::::from_iter(["32"]); let _ = DictionaryArray::::from(a.into_data()); @@ -1335,8 +1317,7 @@ mod tests { let boxed: ArrayRef = Arc::new(dict_array); - let col: DictionaryArray = - DictionaryArray::::from(boxed.to_data()); + let col: DictionaryArray = DictionaryArray::::from(boxed.to_data()); let err = col.into_primitive_dict_builder::(); let returned = err.unwrap_err(); diff --git a/arrow-array/src/array/fixed_size_binary_array.rs 
b/arrow-array/src/array/fixed_size_binary_array.rs index f0b04c203ceb..d89bbd5ad084 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -81,10 +81,7 @@ impl FixedSizeBinaryArray { ) -> Result { let data_type = DataType::FixedSizeBinary(size); let s = size.to_usize().ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "Size cannot be negative, got {}", - size - )) + ArrowError::InvalidArgumentError(format!("Size cannot be negative, got {}", size)) })?; let len = values.len() / s; @@ -333,10 +330,7 @@ impl FixedSizeBinaryArray { /// # Errors /// /// Returns error if argument has length zero, or sizes of nested slices don't match. - pub fn try_from_sparse_iter_with_size( - mut iter: T, - size: i32, - ) -> Result + pub fn try_from_sparse_iter_with_size(mut iter: T, size: i32) -> Result where T: Iterator>, U: AsRef<[u8]>, @@ -812,8 +806,7 @@ mod tests { let none_option: Option<[u8; 32]> = None; let input_arg = vec![none_option, none_option, none_option]; #[allow(deprecated)] - let arr = - FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); + let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); assert_eq!(0, arr.value_length()); assert_eq!(3, arr.len()) } @@ -828,16 +821,12 @@ mod tests { Some(vec![13, 14]), ]; #[allow(deprecated)] - let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.iter().cloned()) - .unwrap(); + let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.iter().cloned()).unwrap(); assert_eq!(2, arr.value_length()); assert_eq!(5, arr.len()); - let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size( - input_arg.into_iter(), - 2, - ) - .unwrap(); + let arr = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(), 2).unwrap(); assert_eq!(2, arr.value_length()); assert_eq!(5, arr.len()); } @@ -846,11 +835,8 @@ mod tests { fn test_fixed_size_binary_array_from_sparse_iter_with_size_all_none() { let input_arg = vec![None, None, None, None, None] as Vec>>; - let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size( - input_arg.into_iter(), - 16, - ) - .unwrap(); + let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(), 16) + .unwrap(); assert_eq!(16, arr.value_length()); assert_eq!(5, arr.len()) } @@ -917,8 +903,7 @@ mod tests { fn fixed_size_binary_array_all_null() { let data = vec![None] as Vec>; let array = - FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.into_iter(), 0) - .unwrap(); + FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.into_iter(), 0).unwrap(); array .into_data() .validate_full() @@ -928,8 +913,7 @@ mod tests { #[test] // Test for https://github.com/apache/arrow-rs/issues/1390 fn fixed_size_binary_array_all_null_in_batch_with_schema() { - let schema = - Schema::new(vec![Field::new("a", DataType::FixedSizeBinary(2), true)]); + let schema = Schema::new(vec![Field::new("a", DataType::FixedSizeBinary(2), true)]); let none_option: Option<[u8; 2]> = None; let item = FixedSizeBinaryArray::try_from_sparse_iter_with_size( diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index db3ccbe0617b..f8f01516e3d4 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -130,12 +130,7 @@ impl FixedSizeListArray { /// # Panics /// /// Panics if [`Self::try_new`] returns an error - pub fn new( - field: FieldRef, - size: i32, - values: 
ArrayRef, - nulls: Option, - ) -> Self { + pub fn new(field: FieldRef, size: i32, values: ArrayRef, nulls: Option) -> Self { Self::try_new(field, size, values, nulls).unwrap() } @@ -154,10 +149,7 @@ impl FixedSizeListArray { nulls: Option, ) -> Result { let s = size.to_usize().ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "Size cannot be negative, got {}", - size - )) + ArrowError::InvalidArgumentError(format!("Size cannot be negative, got {}", size)) })?; let len = values.len() / s.max(1); @@ -350,9 +342,8 @@ impl From for FixedSizeListArray { }; let size = value_length as usize; - let values = make_array( - data.child_data()[0].slice(data.offset() * size, data.len() * size), - ); + let values = + make_array(data.child_data()[0].slice(data.offset() * size, data.len() * size)); Self { data_type: data.data_type().clone(), values, @@ -483,10 +474,8 @@ mod tests { .unwrap(); // Construct a list array from the above two - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, false)), - 3, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_child_data(value_data.clone()) @@ -538,10 +527,8 @@ mod tests { .unwrap(); // Construct a list array from the above two - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, false)), - 3, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -569,10 +556,8 @@ mod tests { bit_util::set_bit(&mut null_bits, 4); // Construct a fixed size list array from the above two - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, false)), - 2, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 2); let list_data = ArrayData::builder(list_data_type) .len(5) .add_child_data(value_data.clone()) @@ -611,9 +596,7 @@ mod tests { } #[test] - #[should_panic( - expected = "the offset of the new Buffer cannot exceed the existing length" - )] + #[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")] fn test_fixed_size_list_array_index_out_of_bound() { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) @@ -631,10 +614,8 @@ mod tests { bit_util::set_bit(&mut null_bits, 4); // Construct a fixed size list array from the above two - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, false)), - 2, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 2); let list_data = ArrayData::builder(list_data_type) .len(5) .add_child_data(value_data) @@ -668,8 +649,7 @@ mod tests { let list = FixedSizeListArray::new(field.clone(), 4, values.clone(), None); assert_eq!(list.len(), 1); - let err = FixedSizeListArray::try_new(field.clone(), -1, values.clone(), None) - .unwrap_err(); + let err = FixedSizeListArray::try_new(field.clone(), -1, values.clone(), None).unwrap_err(); assert_eq!( err.to_string(), "Invalid argument error: Size cannot be negative, got -1" @@ -679,13 +659,11 @@ mod tests { assert_eq!(list.len(), 6); let nulls = NullBuffer::new_null(2); - let err = FixedSizeListArray::try_new(field, 2, values.clone(), Some(nulls)) - .unwrap_err(); + let err = FixedSizeListArray::try_new(field, 
2, values.clone(), Some(nulls)).unwrap_err(); assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for FixedSizeListArray, expected 3 got 2"); let field = Arc::new(Field::new("item", DataType::Int32, false)); - let err = FixedSizeListArray::try_new(field.clone(), 2, values.clone(), None) - .unwrap_err(); + let err = FixedSizeListArray::try_new(field.clone(), 2, values.clone(), None).unwrap_err(); assert_eq!(err.to_string(), "Invalid argument error: Found unmasked nulls for non-nullable FixedSizeListArray field \"item\""); // Valid as nulls in child masked by parent diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index e36d0ac4434f..9758c112a1ef 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -372,9 +372,8 @@ impl GenericListArray { impl From for GenericListArray { fn from(data: ArrayData) -> Self { - Self::try_new_from_array_data(data).expect( - "Expected infallible creation of GenericListArray from ArrayDataRef failed", - ) + Self::try_new_from_array_data(data) + .expect("Expected infallible creation of GenericListArray from ArrayDataRef failed") } } @@ -391,17 +390,14 @@ impl From> for ArrayDa } } -impl From - for GenericListArray -{ +impl From for GenericListArray { fn from(value: FixedSizeListArray) -> Self { let (field, size) = match value.data_type() { DataType::FixedSizeList(f, size) => (f, *size as usize), _ => unreachable!(), }; - let offsets = - OffsetBuffer::from_lengths(std::iter::repeat(size).take(value.len())); + let offsets = OffsetBuffer::from_lengths(std::iter::repeat(size).take(value.len())); Self { data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()), @@ -415,9 +411,10 @@ impl From impl GenericListArray { fn try_new_from_array_data(data: ArrayData) -> Result { if data.buffers().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - format!("ListArray data should contain a single buffer only (value offsets), had {}", - data.buffers().len()))); + return Err(ArrowError::InvalidArgumentError(format!( + "ListArray data should contain a single buffer only (value offsets), had {}", + data.buffers().len() + ))); } if data.child_data().len() != 1 { @@ -593,8 +590,7 @@ mod tests { let value_offsets = Buffer::from([]); // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(0) .add_buffer(value_offsets) @@ -620,8 +616,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -807,8 +802,7 @@ mod tests { bit_util::set_bit(&mut null_bits, 8); // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -839,8 +833,7 @@ mod tests { } // Check offset and length for each non-null value. 
- let sliced_list_array = - sliced_array.as_any().downcast_ref::().unwrap(); + let sliced_list_array = sliced_array.as_any().downcast_ref::().unwrap(); assert_eq!(2, sliced_list_array.value_offsets()[2]); assert_eq!(2, sliced_list_array.value_length(2)); assert_eq!(4, sliced_list_array.value_offsets()[3]); @@ -951,9 +944,7 @@ mod tests { list_array.value(10); } #[test] - #[should_panic( - expected = "ListArray data should contain a single buffer only (value offsets)" - )] + #[should_panic(expected = "ListArray data should contain a single buffer only (value offsets)")] // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] @@ -964,8 +955,7 @@ mod tests { .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build_unchecked() }; - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -976,16 +966,13 @@ mod tests { } #[test] - #[should_panic( - expected = "ListArray should contain a single child array (values array)" - )] + #[should_panic(expected = "ListArray should contain a single child array (values array)")] // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] fn test_list_array_invalid_child_array_len() { let value_offsets = Buffer::from_slice_ref([0, 2, 5, 7]); - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -996,9 +983,7 @@ mod tests { } #[test] - #[should_panic( - expected = "[Large]ListArray's datatype must be [Large]ListArray(). It is List" - )] + #[should_panic(expected = "[Large]ListArray's datatype must be [Large]ListArray(). 
It is List")] fn test_from_array_data_validation() { let mut builder = ListBuilder::new(Int32Builder::new()); builder.values().append_value(1); @@ -1017,8 +1002,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([2, 2, 5, 7]); - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -1033,9 +1017,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Memory pointer is not aligned with the specified scalar type" - )] + #[should_panic(expected = "Memory pointer is not aligned with the specified scalar type")] // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] @@ -1051,9 +1033,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Memory pointer is not aligned with the specified scalar type" - )] + #[should_panic(expected = "Memory pointer is not aligned with the specified scalar type")] // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] @@ -1068,8 +1048,7 @@ mod tests { .build_unchecked() }; - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .add_buffer(buf2) @@ -1187,9 +1166,8 @@ mod tests { let nulls = NullBuffer::new_null(3); let offsets = OffsetBuffer::new(vec![0, 1, 2, 4, 5].into()); - let err = - LargeListArray::try_new(field, offsets.clone(), values.clone(), Some(nulls)) - .unwrap_err(); + let err = LargeListArray::try_new(field, offsets.clone(), values.clone(), Some(nulls)) + .unwrap_err(); assert_eq!( err.to_string(), @@ -1197,9 +1175,8 @@ mod tests { ); let field = Arc::new(Field::new("element", DataType::Int64, false)); - let err = - LargeListArray::try_new(field.clone(), offsets.clone(), values.clone(), None) - .unwrap_err(); + let err = LargeListArray::try_new(field.clone(), offsets.clone(), values.clone(), None) + .unwrap_err(); assert_eq!( err.to_string(), @@ -1210,8 +1187,8 @@ mod tests { let values = Int64Array::new(vec![0; 7].into(), Some(nulls)); let values = Arc::new(values); - let err = LargeListArray::try_new(field, offsets.clone(), values.clone(), None) - .unwrap_err(); + let err = + LargeListArray::try_new(field, offsets.clone(), values.clone(), None).unwrap_err(); assert_eq!( err.to_string(), @@ -1222,8 +1199,7 @@ mod tests { LargeListArray::new(field.clone(), offsets.clone(), values, None); let values = Int64Array::new(vec![0; 2].into(), None); - let err = - LargeListArray::try_new(field, offsets, Arc::new(values), None).unwrap_err(); + let err = LargeListArray::try_new(field, offsets, Arc::new(values), None).unwrap_err(); assert_eq!( err.to_string(), diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index 77a7b9d4d547..bde7fdd5a953 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -17,9 +17,7 @@ use crate::array::{get_offsets, print_long_array}; use crate::iterator::MapArrayIter; -use crate::{ - make_array, Array, ArrayAccessor, ArrayRef, ListArray, StringArray, StructArray, -}; +use crate::{make_array, Array, ArrayAccessor, ArrayRef, ListArray, StringArray, StructArray}; use 
arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field, FieldRef}; @@ -264,9 +262,10 @@ impl MapArray { } if data.buffers().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - format!("MapArray data should contain a single buffer only (value offsets), had {}", - data.len()))); + return Err(ArrowError::InvalidArgumentError(format!( + "MapArray data should contain a single buffer only (value offsets), had {}", + data.len() + ))); } if data.child_data().len() != 1 { @@ -281,9 +280,9 @@ impl MapArray { if let DataType::Struct(fields) = entries.data_type() { if fields.len() != 2 { return Err(ArrowError::InvalidArgumentError(format!( - "MapArray should contain a struct array with 2 fields, have {} fields", - fields.len() - ))); + "MapArray should contain a struct array with 2 fields, have {} fields", + fields.len() + ))); } } else { return Err(ArrowError::InvalidArgumentError(format!( @@ -576,8 +575,7 @@ mod tests { assert_eq!(2, map_array.value_length(1)); let key_array = Arc::new(Int32Array::from(vec![3, 4, 5])) as ArrayRef; - let value_array = - Arc::new(UInt32Array::from(vec![None, Some(40), None])) as ArrayRef; + let value_array = Arc::new(UInt32Array::from(vec![None, Some(40), None])) as ArrayRef; let struct_array = StructArray::from(vec![(keys_field, key_array), (values_field, value_array)]); assert_eq!( @@ -669,9 +667,7 @@ mod tests { } #[test] - #[should_panic( - expected = "MapArray expected ArrayData with DataType::Map got Dictionary" - )] + #[should_panic(expected = "MapArray expected ArrayData with DataType::Map got Dictionary")] fn test_from_array_data_validation() { // A DictionaryArray has similar buffer layout to a MapArray // but the meaning of the values differs @@ -692,12 +688,9 @@ mod tests { // [[a, b, c], [d, e, f], [g, h]] let entry_offsets = [0, 3, 6, 8]; - let map_array = MapArray::new_from_strings( - keys.clone().into_iter(), - &values_data, - &entry_offsets, - ) - .unwrap(); + let map_array = + MapArray::new_from_strings(keys.clone().into_iter(), &values_data, &entry_offsets) + .unwrap(); assert_eq!( &values_data, @@ -768,9 +761,8 @@ mod tests { "Invalid argument error: Incorrect length of null buffer for MapArray, expected 4 got 3" ); - let err = - MapArray::try_new(field, offsets.clone(), entries.slice(0, 2), None, false) - .unwrap_err(); + let err = MapArray::try_new(field, offsets.clone(), entries.slice(0, 2), None, false) + .unwrap_err(); assert_eq!( err.to_string(), @@ -783,9 +775,7 @@ mod tests { .to_string(); assert!( - err.starts_with( - "Invalid argument error: MapArray expected data type Int64 got Struct" - ), + err.starts_with("Invalid argument error: MapArray expected data type Int64 got Struct"), "{err}" ); diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 9b66826f7584..f19406c1610b 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -536,9 +536,7 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::Float64 => Arc::new(Float64Array::from(data)) as ArrayRef, DataType::Date32 => Arc::new(Date32Array::from(data)) as ArrayRef, DataType::Date64 => Arc::new(Date64Array::from(data)) as ArrayRef, - DataType::Time32(TimeUnit::Second) => { - Arc::new(Time32SecondArray::from(data)) as ArrayRef - } + DataType::Time32(TimeUnit::Second) => Arc::new(Time32SecondArray::from(data)) as ArrayRef, DataType::Time32(TimeUnit::Millisecond) => { 
Arc::new(Time32MillisecondArray::from(data)) as ArrayRef } @@ -583,9 +581,7 @@ pub fn make_array(data: ArrayData) -> ArrayRef { } DataType::Binary => Arc::new(BinaryArray::from(data)) as ArrayRef, DataType::LargeBinary => Arc::new(LargeBinaryArray::from(data)) as ArrayRef, - DataType::FixedSizeBinary(_) => { - Arc::new(FixedSizeBinaryArray::from(data)) as ArrayRef - } + DataType::FixedSizeBinary(_) => Arc::new(FixedSizeBinaryArray::from(data)) as ArrayRef, DataType::Utf8 => Arc::new(StringArray::from(data)) as ArrayRef, DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data)) as ArrayRef, DataType::List(_) => Arc::new(ListArray::from(data)) as ArrayRef, @@ -593,50 +589,24 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef, DataType::Map(_, _) => Arc::new(MapArray::from(data)) as ArrayRef, DataType::Union(_, _) => Arc::new(UnionArray::from(data)) as ArrayRef, - DataType::FixedSizeList(_, _) => { - Arc::new(FixedSizeListArray::from(data)) as ArrayRef - } + DataType::FixedSizeList(_, _) => Arc::new(FixedSizeListArray::from(data)) as ArrayRef, DataType::Dictionary(ref key_type, _) => match key_type.as_ref() { - DataType::Int8 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::Int16 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::Int32 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::Int64 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::UInt8 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::UInt16 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::UInt32 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::UInt64 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } + DataType::Int8 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::Int16 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::Int32 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::Int64 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::UInt8 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::UInt16 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::UInt32 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::UInt64 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, dt => panic!("Unexpected dictionary key type {dt:?}"), }, - DataType::RunEndEncoded(ref run_ends_type, _) => { - match run_ends_type.data_type() { - DataType::Int16 => { - Arc::new(RunArray::::from(data)) as ArrayRef - } - DataType::Int32 => { - Arc::new(RunArray::::from(data)) as ArrayRef - } - DataType::Int64 => { - Arc::new(RunArray::::from(data)) as ArrayRef - } - dt => panic!("Unexpected data type for run_ends array {dt:?}"), - } - } + DataType::RunEndEncoded(ref run_ends_type, _) => match run_ends_type.data_type() { + DataType::Int16 => Arc::new(RunArray::::from(data)) as ArrayRef, + DataType::Int32 => Arc::new(RunArray::::from(data)) as ArrayRef, + DataType::Int64 => Arc::new(RunArray::::from(data)) as ArrayRef, + dt => panic!("Unexpected data type for run_ends array {dt:?}"), + }, DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef, DataType::Decimal128(_, _) => Arc::new(Decimal128Array::from(data)) as ArrayRef, DataType::Decimal256(_, _) => Arc::new(Decimal256Array::from(data)) as ArrayRef, @@ -687,11 +657,8 @@ unsafe fn get_offsets(data: &ArrayData) -> OffsetBuffer { match 
data.is_empty() && data.buffers()[0].is_empty() { true => OffsetBuffer::new_empty(), false => { - let buffer = ScalarBuffer::new( - data.buffers()[0].clone(), - data.offset(), - data.len() + 1, - ); + let buffer = + ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len() + 1); // Safety: // ArrayData is valid unsafe { OffsetBuffer::new_unchecked(buffer) } @@ -700,11 +667,7 @@ unsafe fn get_offsets(data: &ArrayData) -> OffsetBuffer { } /// Helper function for printing potentially long arrays. -fn print_long_array( - array: &A, - f: &mut std::fmt::Formatter, - print_item: F, -) -> std::fmt::Result +fn print_long_array(array: &A, f: &mut std::fmt::Formatter, print_item: F) -> std::fmt::Result where A: Array, F: Fn(&A, usize, &mut std::fmt::Formatter) -> std::fmt::Result, @@ -767,8 +730,7 @@ mod tests { #[test] fn test_empty_list_primitive() { - let data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let array = new_empty_array(&data_type); let a = array.as_any().downcast_ref::().unwrap(); assert_eq!(a.len(), 0); @@ -799,8 +761,7 @@ mod tests { fn test_null_struct() { // It is possible to create a null struct containing a non-nullable child // see https://github.com/apache/arrow-rs/pull/3244 for details - let struct_type = - DataType::Struct(vec![Field::new("data", DataType::Int64, false)].into()); + let struct_type = DataType::Struct(vec![Field::new("data", DataType::Int64, false)].into()); let array = new_null_array(&struct_type, 9); let a = array.as_any().downcast_ref::().unwrap(); @@ -827,8 +788,7 @@ mod tests { #[test] fn test_null_list_primitive() { - let data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let array = new_null_array(&data_type, 9); let a = array.as_any().downcast_ref::().unwrap(); assert_eq!(a.len(), 9); @@ -862,8 +822,8 @@ mod tests { #[test] fn test_null_dictionary() { - let values = vec![None, None, None, None, None, None, None, None, None] - as Vec>; + let values = + vec![None, None, None, None, None, None, None, None, None] as Vec>; let array: DictionaryArray = values.into_iter().collect(); let array = Arc::new(array) as ArrayRef; @@ -965,8 +925,7 @@ mod tests { #[test] fn test_memory_size_primitive() { let arr = PrimitiveArray::::from_iter_values(0..128); - let empty = - PrimitiveArray::::from(ArrayData::new_empty(arr.data_type())); + let empty = PrimitiveArray::::from(ArrayData::new_empty(arr.data_type())); // subtract empty array to avoid magic numbers for the size of additional fields assert_eq!( diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 4c07e81468aa..1112acacfcd9 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -917,8 +917,8 @@ impl PrimitiveArray { let null_bit_buffer = data.nulls().map(|b| b.inner().sliced()); let element_len = std::mem::size_of::(); - let buffer = data.buffers()[0] - .slice_with_length(data.offset() * element_len, len * element_len); + let buffer = + data.buffers()[0].slice_with_length(data.offset() * element_len, len * element_len); drop(data); @@ -1116,10 +1116,9 @@ impl std::fmt::Debug for PrimitiveArray { }, // if the time zone is invalid, shows NaiveDateTime with an error message Err(_) => match as_datetime::(v) { - Some(datetime) => write!( - f, - "{datetime:?} (Unknown 
Time Zone '{tz_string}')" - ), + Some(datetime) => { + write!(f, "{datetime:?} (Unknown Time Zone '{tz_string}')") + } None => write!(f, "null"), }, } @@ -1191,25 +1190,19 @@ def_from_for_primitive!(Float64Type, f64); def_from_for_primitive!(Decimal128Type, i128); def_from_for_primitive!(Decimal256Type, i256); -impl From::Native>> - for NativeAdapter -{ +impl From::Native>> for NativeAdapter { fn from(value: Option<::Native>) -> Self { NativeAdapter { native: value } } } -impl From<&Option<::Native>> - for NativeAdapter -{ +impl From<&Option<::Native>> for NativeAdapter { fn from(value: &Option<::Native>) -> Self { NativeAdapter { native: *value } } } -impl>> FromIterator - for PrimitiveArray -{ +impl>> FromIterator for PrimitiveArray { fn from_iter>(iter: I) -> Self { let iter = iter.into_iter(); let (lower, _) = iter.size_hint(); @@ -1265,15 +1258,8 @@ impl PrimitiveArray { let (null, buffer) = trusted_len_unzip(iterator); - let data = ArrayData::new_unchecked( - T::DATA_TYPE, - len, - None, - Some(null), - 0, - vec![buffer], - vec![], - ); + let data = + ArrayData::new_unchecked(T::DATA_TYPE, len, None, Some(null), 0, vec![buffer], vec![]); PrimitiveArray::from(data) } } @@ -1294,9 +1280,7 @@ macro_rules! def_numeric_from_vec { } // Constructs a primitive array from a vector. Should only be used for testing. - impl From::Native>>> - for PrimitiveArray<$ty> - { + impl From::Native>>> for PrimitiveArray<$ty> { fn from(data: Vec::Native>>) -> Self { PrimitiveArray::from_iter(data.iter()) } @@ -1392,8 +1376,7 @@ impl From for PrimitiveArray { "PrimitiveArray data should contain a single buffer only (values buffer)" ); - let values = - ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); + let values = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); Self { data_type: data.data_type().clone(), values, @@ -1407,11 +1390,7 @@ impl PrimitiveArray { /// specified precision and scale. 
/// /// See [`validate_decimal_precision_and_scale`] - pub fn with_precision_and_scale( - self, - precision: u8, - scale: i8, - ) -> Result { + pub fn with_precision_and_scale(self, precision: u8, scale: i8) -> Result { validate_decimal_precision_and_scale::(precision, scale)?; Ok(Self { data_type: T::TYPE_CONSTRUCTOR(precision, scale), @@ -1575,8 +1554,7 @@ mod tests { // 1: 00:00:00.001 // 37800005: 10:30:00.005 // 86399210: 23:59:59.210 - let arr: PrimitiveArray = - vec![1, 37_800_005, 86_399_210].into(); + let arr: PrimitiveArray = vec![1, 37_800_005, 86_399_210].into(); assert_eq!(3, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); @@ -1858,11 +1836,7 @@ mod tests { #[test] fn test_timestamp_fmt_debug() { let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1546214400000, - 1546214400000, - -1546214400000, - ]); + TimestampMillisecondArray::from(vec![1546214400000, 1546214400000, -1546214400000]); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T00:00:00,\n 2018-12-31T00:00:00,\n 1921-01-02T00:00:00,\n]", format!("{arr:?}") @@ -1872,12 +1846,8 @@ mod tests { #[test] fn test_timestamp_utc_fmt_debug() { let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1546214400000, - 1546214400000, - -1546214400000, - ]) - .with_timezone_utc(); + TimestampMillisecondArray::from(vec![1546214400000, 1546214400000, -1546214400000]) + .with_timezone_utc(); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T00:00:00+00:00,\n 2018-12-31T00:00:00+00:00,\n 1921-01-02T00:00:00+00:00,\n]", format!("{arr:?}") @@ -1888,12 +1858,8 @@ mod tests { #[cfg(feature = "chrono-tz")] fn test_timestamp_with_named_tz_fmt_debug() { let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1546214400000, - 1546214400000, - -1546214400000, - ]) - .with_timezone("Asia/Taipei".to_string()); + TimestampMillisecondArray::from(vec![1546214400000, 1546214400000, -1546214400000]) + .with_timezone("Asia/Taipei".to_string()); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T08:00:00+08:00,\n 2018-12-31T08:00:00+08:00,\n 1921-01-02T08:00:00+08:00,\n]", format!("{:?}", arr) @@ -1904,12 +1870,8 @@ mod tests { #[cfg(not(feature = "chrono-tz"))] fn test_timestamp_with_named_tz_fmt_debug() { let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1546214400000, - 1546214400000, - -1546214400000, - ]) - .with_timezone("Asia/Taipei".to_string()); + TimestampMillisecondArray::from(vec![1546214400000, 1546214400000, -1546214400000]) + .with_timezone("Asia/Taipei".to_string()); println!("{arr:?}"); @@ -1922,12 +1884,8 @@ mod tests { #[test] fn test_timestamp_with_fixed_offset_tz_fmt_debug() { let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1546214400000, - 1546214400000, - -1546214400000, - ]) - .with_timezone("+08:00".to_string()); + TimestampMillisecondArray::from(vec![1546214400000, 1546214400000, -1546214400000]) + .with_timezone("+08:00".to_string()); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T08:00:00+08:00,\n 2018-12-31T08:00:00+08:00,\n 1921-01-02T08:00:00+08:00,\n]", format!("{arr:?}") @@ -1937,12 +1895,8 @@ mod tests { #[test] fn test_timestamp_with_incorrect_tz_fmt_debug() { let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1546214400000, - 1546214400000, - -1546214400000, - ]) - .with_timezone("xxx".to_string()); + TimestampMillisecondArray::from(vec![1546214400000, 1546214400000, -1546214400000]) + .with_timezone("xxx".to_string()); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T00:00:00 (Unknown Time Zone 'xxx'),\n 2018-12-31T00:00:00 
(Unknown Time Zone 'xxx'),\n 1921-01-02T00:00:00 (Unknown Time Zone 'xxx'),\n]", format!("{arr:?}") @@ -1952,14 +1906,13 @@ mod tests { #[test] #[cfg(feature = "chrono-tz")] fn test_timestamp_with_tz_with_daylight_saving_fmt_debug() { - let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1647161999000, - 1647162000000, - 1667717999000, - 1667718000000, - ]) - .with_timezone("America/Denver".to_string()); + let arr: PrimitiveArray = TimestampMillisecondArray::from(vec![ + 1647161999000, + 1647162000000, + 1667717999000, + 1667718000000, + ]) + .with_timezone("America/Denver".to_string()); assert_eq!( "PrimitiveArray\n[\n 2022-03-13T01:59:59-07:00,\n 2022-03-13T03:00:00-06:00,\n 2022-11-06T00:59:59-06:00,\n 2022-11-06T01:00:00-06:00,\n]", format!("{:?}", arr) @@ -1997,8 +1950,7 @@ mod tests { #[test] fn test_timestamp_micros_out_of_range() { // replicate the issue from https://github.com/apache/arrow-datafusion/issues/3832 - let arr: PrimitiveArray = - vec![9065525203050843594].into(); + let arr: PrimitiveArray = vec![9065525203050843594].into(); assert_eq!( "PrimitiveArray\n[\n null,\n]", format!("{arr:?}") @@ -2143,8 +2095,7 @@ mod tests { #[test] fn test_decimal256() { - let values: Vec<_> = - vec![i256::ZERO, i256::ONE, i256::MINUS_ONE, i256::MIN, i256::MAX]; + let values: Vec<_> = vec![i256::ZERO, i256::ONE, i256::MINUS_ONE, i256::MIN, i256::MAX]; let array: PrimitiveArray = PrimitiveArray::from_iter(values.iter().copied()); @@ -2166,8 +2117,8 @@ mod tests { // let val_8887: [u8; 16] = [192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; // let val_neg_8887: [u8; 16] = [64, 36, 75, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]; let values: [u8; 32] = [ - 192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, ]; let array_data = ArrayData::builder(DataType::Decimal128(38, 6)) .len(2) @@ -2232,8 +2183,7 @@ mod tests { #[test] fn test_decimal_from_iter() { - let array: Decimal128Array = - vec![Some(-100), None, Some(101)].into_iter().collect(); + let array: Decimal128Array = vec![Some(-100), None, Some(101)].into_iter().collect(); assert_eq!(array.len(), 3); assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); assert_eq!(-100_i128, array.value(0)); @@ -2343,8 +2293,7 @@ mod tests { #[test] fn test_decimal_array_set_null_if_overflow_with_precision() { - let array = - Decimal128Array::from(vec![Some(123456), Some(123), None, Some(123456)]); + let array = Decimal128Array::from(vec![Some(123456), Some(123), None, Some(123456)]); let result = array.null_if_overflow_precision(5); let expected = Decimal128Array::from(vec![None, Some(123), None, None]); assert_eq!(result, expected); @@ -2361,8 +2310,7 @@ mod tests { let decimal2 = i256::from_i128(56789); builder.append_value(decimal2); - let array: Decimal256Array = - builder.finish().with_precision_and_scale(76, 6).unwrap(); + let array: Decimal256Array = builder.finish().with_precision_and_scale(76, 6).unwrap(); let collected: Vec<_> = array.iter().collect(); assert_eq!(vec![Some(decimal1), None, Some(decimal2)], collected); @@ -2387,8 +2335,7 @@ mod tests { #[test] fn test_from_iter_decimal128array() { - let mut array: Decimal128Array = - vec![Some(-100), None, Some(101)].into_iter().collect(); + let mut array: Decimal128Array = vec![Some(-100), None, Some(101)].into_iter().collect(); 
array = array.with_precision_and_scale(38, 10).unwrap(); assert_eq!(array.len(), 3); assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); @@ -2404,13 +2351,11 @@ mod tests { let array = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7]); let r = array.unary_opt::<_, Int32Type>(|x| (x % 2 != 0).then_some(x)); - let expected = - Int32Array::from(vec![Some(1), None, Some(3), None, Some(5), None, Some(7)]); + let expected = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5), None, Some(7)]); assert_eq!(r, expected); let r = expected.unary_opt::<_, Int32Type>(|x| (x % 3 != 0).then_some(x)); - let expected = - Int32Array::from(vec![Some(1), None, None, None, Some(5), None, Some(7)]); + let expected = Int32Array::from(vec![Some(1), None, None, None, Some(5), None, Some(7)]); assert_eq!(r, expected); } @@ -2513,9 +2458,8 @@ mod tests { Int32Array::new(vec![1, 2, 3, 4].into(), None); Int32Array::new(vec![1, 2, 3, 4].into(), Some(NullBuffer::new_null(4))); - let err = - Int32Array::try_new(vec![1, 2, 3, 4].into(), Some(NullBuffer::new_null(3))) - .unwrap_err(); + let err = Int32Array::try_new(vec![1, 2, 3, 4].into(), Some(NullBuffer::new_null(3))) + .unwrap_err(); assert_eq!( err.to_string(), diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index ba6986c28463..4877f9f850a3 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -91,10 +91,7 @@ impl RunArray { /// Attempts to create RunArray using given run_ends (index where a run ends) /// and the values (value of the run). Returns an error if the given data is not compatible /// with RunEndEncoded specification. - pub fn try_new( - run_ends: &PrimitiveArray, - values: &dyn Array, - ) -> Result { + pub fn try_new(run_ends: &PrimitiveArray, values: &dyn Array) -> Result { let run_ends_type = run_ends.data_type().clone(); let values_type = values.data_type().clone(); let ree_array_type = DataType::RunEndEncoded( @@ -182,10 +179,7 @@ impl RunArray { /// scaled well for larger inputs. /// See for more details. #[inline] - pub fn get_physical_indices( - &self, - logical_indices: &[I], - ) -> Result, ArrowError> + pub fn get_physical_indices(&self, logical_indices: &[I]) -> Result, ArrowError> where I: ArrowNativeType, { @@ -211,8 +205,7 @@ impl RunArray { }); // Return early if all the logical indices cannot be converted to physical indices. - let largest_logical_index = - logical_indices[*ordered_indices.last().unwrap()].as_usize(); + let largest_logical_index = logical_indices[*ordered_indices.last().unwrap()].as_usize(); if largest_logical_index >= len { return Err(ArrowError::InvalidArgumentError(format!( "Cannot convert all logical indices to physical indices. The logical index cannot be converted is {largest_logical_index}.", @@ -225,8 +218,7 @@ impl RunArray { let mut physical_indices = vec![0; indices_len]; let mut ordered_index = 0_usize; - for (physical_index, run_end) in - self.run_ends.values().iter().enumerate().skip(skip_value) + for (physical_index, run_end) in self.run_ends.values().iter().enumerate().skip(skip_value) { // Get the run end index (relative to offset) of current physical index let run_end_value = run_end.as_usize() - offset; @@ -234,8 +226,7 @@ impl RunArray { // All the `logical_indices` that are less than current run end index // belongs to current physical index. 
while ordered_index < indices_len - && logical_indices[ordered_indices[ordered_index]].as_usize() - < run_end_value + && logical_indices[ordered_indices[ordered_index]].as_usize() < run_end_value { physical_indices[ordered_indices[ordered_index]] = physical_index; ordered_index += 1; @@ -245,8 +236,7 @@ impl RunArray { // If there are input values >= run_ends.last_value then we'll not be able to convert // all logical indices to physical indices. if ordered_index < logical_indices.len() { - let logical_index = - logical_indices[ordered_indices[ordered_index]].as_usize(); + let logical_index = logical_indices[ordered_indices[ordered_index]].as_usize(); return Err(ArrowError::InvalidArgumentError(format!( "Cannot convert all logical indices to physical indices. The logical index cannot be converted is {logical_index}.", ))); @@ -704,8 +694,7 @@ mod tests { seed.shuffle(&mut rng); } // repeat the items between 1 and 8 times. Cap the length for smaller sized arrays - let num = - max_run_length.min(rand::thread_rng().gen_range(1..=max_run_length)); + let num = max_run_length.min(rand::thread_rng().gen_range(1..=max_run_length)); for _ in 0..num { result.push(seed[ix]); } @@ -749,19 +738,16 @@ mod tests { #[test] fn test_run_array() { // Construct a value array - let value_data = PrimitiveArray::::from_iter_values([ - 10_i8, 11, 12, 13, 14, 15, 16, 17, - ]); + let value_data = + PrimitiveArray::::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); // Construct a run_ends array: let run_ends_values = [4_i16, 6, 7, 9, 13, 18, 20, 22]; - let run_ends_data = PrimitiveArray::::from_iter_values( - run_ends_values.iter().copied(), - ); + let run_ends_data = + PrimitiveArray::::from_iter_values(run_ends_values.iter().copied()); // Construct a run ends encoded array from the above two - let ree_array = - RunArray::::try_new(&run_ends_data, &value_data).unwrap(); + let ree_array = RunArray::::try_new(&run_ends_data, &value_data).unwrap(); assert_eq!(ree_array.len(), 22); assert_eq!(ree_array.null_count(), 0); @@ -872,8 +858,7 @@ mod tests { let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")] .into_iter() .collect(); - let run_ends: Int32Array = - [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); + let run_ends: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); let array = RunArray::::try_new(&run_ends, &values).unwrap(); assert_eq!(array.values().data_type(), &DataType::Utf8); @@ -924,7 +909,10 @@ mod tests { let run_ends: Int32Array = [Some(1), None, Some(3)].into_iter().collect(); let actual = RunArray::::try_new(&run_ends, &values); - let expected = ArrowError::InvalidArgumentError("Found null values in run_ends array. The run_ends array should not have null values.".to_string()); + let expected = ArrowError::InvalidArgumentError( + "Found null values in run_ends array. The run_ends array should not have null values." 
+ .to_string(), + ); assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); } @@ -1003,8 +991,7 @@ mod tests { let mut rng = thread_rng(); logical_indices.shuffle(&mut rng); - let physical_indices = - run_array.get_physical_indices(&logical_indices).unwrap(); + let physical_indices = run_array.get_physical_indices(&logical_indices).unwrap(); assert_eq!(logical_indices.len(), physical_indices.len()); diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index cac4651f4496..9d266e0ca4b8 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -59,9 +59,7 @@ impl GenericStringArray { /// Fallibly creates a [`GenericStringArray`] from a [`GenericBinaryArray`] returning /// an error if [`GenericBinaryArray`] contains invalid UTF-8 data - pub fn try_from_binary( - v: GenericBinaryArray, - ) -> Result { + pub fn try_from_binary(v: GenericBinaryArray) -> Result { let (offsets, values, nulls) = v.into_parts(); Self::try_new(offsets, values, nulls) } @@ -83,9 +81,7 @@ impl From> } } -impl From>> - for GenericStringArray -{ +impl From>> for GenericStringArray { fn from(v: Vec>) -> Self { v.into_iter().collect() } @@ -97,9 +93,7 @@ impl From> for GenericStringArray From>> - for GenericStringArray -{ +impl From>> for GenericStringArray { fn from(v: Vec>) -> Self { v.into_iter().collect() } @@ -438,13 +432,11 @@ mod tests { let expected: LargeStringArray = data.clone().into_iter().map(Some).collect(); // Iterator reports too many items - let arr = - LargeStringArray::from_iter_values(BadIterator::new(3, 10, data.clone())); + let arr = LargeStringArray::from_iter_values(BadIterator::new(3, 10, data.clone())); assert_eq!(expected, arr); // Iterator reports too few items - let arr = - LargeStringArray::from_iter_values(BadIterator::new(3, 1, data.clone())); + let arr = LargeStringArray::from_iter_values(BadIterator::new(3, 1, data.clone())); assert_eq!(expected, arr); } @@ -460,9 +452,11 @@ mod tests { let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); let null_buffer = Buffer::from_slice_ref([0b101]); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( - Field::new("item", DataType::UInt8, false), - )); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( + "item", + DataType::UInt8, + false, + ))); // [None, Some("Parquet")] let array_data = ArrayData::builder(data_type) @@ -493,9 +487,7 @@ mod tests { _test_generic_string_array_from_list_array::(); } - fn _test_generic_string_array_from_list_array_with_child_nulls_failed< - O: OffsetSizeTrait, - >() { + fn _test_generic_string_array_from_list_array_with_child_nulls_failed() { let values = b"HelloArrow"; let child_data = ArrayData::builder(DataType::UInt8) .len(10) @@ -508,9 +500,11 @@ mod tests { // It is possible to create a null struct containing a non-nullable child // see https://github.com/apache/arrow-rs/pull/3244 for details - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( - Field::new("item", DataType::UInt8, true), - )); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( + "item", + DataType::UInt8, + true, + ))); // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) @@ -544,9 +538,11 @@ mod tests { .unwrap(); let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap()); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( - Field::new("item", DataType::UInt16, false), - )); + let data_type = 
GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( + "item", + DataType::UInt16, + false, + ))); let array_data = ArrayData::builder(data_type) .len(2) diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 0e586ed1ef96..699da28cf7a3 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -462,9 +462,7 @@ impl Index<&str> for StructArray { mod tests { use super::*; - use crate::{ - BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, StringArray, - }; + use crate::{BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, StringArray}; use arrow_buffer::ToByteSlice; use std::sync::Arc; @@ -540,12 +538,10 @@ mod tests { None, Some("mark"), ])); - let ints: ArrayRef = - Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); let arr = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]).unwrap(); let struct_data = arr.into_data(); assert_eq!(4, struct_data.len()); @@ -578,13 +574,11 @@ mod tests { None, // 3 elements, not 4 ])); - let ints: ArrayRef = - Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); - let err = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap_err() - .to_string(); + let err = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap_err() + .to_string(); assert_eq!( err, @@ -599,8 +593,7 @@ mod tests { fn test_struct_array_from_mismatched_types_single() { drop(StructArray::from(vec![( Arc::new(Field::new("b", DataType::Int16, false)), - Arc::new(BooleanArray::from(vec![false, false, true, true])) - as Arc, + Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc, )])); } @@ -612,8 +605,7 @@ mod tests { drop(StructArray::from(vec![ ( Arc::new(Field::new("b", DataType::Int16, false)), - Arc::new(BooleanArray::from(vec![false, false, true, true])) - as Arc, + Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc, ), ( Arc::new(Field::new("c", DataType::Utf8, false)), @@ -733,9 +725,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Found unmasked nulls for non-nullable StructArray field \\\"c\\\"" - )] + #[should_panic(expected = "Found unmasked nulls for non-nullable StructArray field \\\"c\\\"")] fn test_struct_array_from_mismatched_nullability() { drop(StructArray::from(vec![( Arc::new(Field::new("c", DataType::Int32, false)), diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 74a5f1efa767..94ac0bc879e4 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -179,8 +179,7 @@ impl UnionArray { if let Some(b) = &value_offsets { if ((type_ids.len()) * 4) != b.len() { return Err(ArrowError::InvalidArgumentError( - "Type Ids and Offsets represent a different number of array slots." 
- .to_string(), + "Type Ids and Offsets represent a different number of array slots.".to_string(), )); } } @@ -216,9 +215,8 @@ impl UnionArray { // Unsafe Justification: arguments were validated above (and // re-revalidated as part of data().validate() below) - let new_self = unsafe { - Self::new_unchecked(field_type_ids, type_ids, value_offsets, child_arrays) - }; + let new_self = + unsafe { Self::new_unchecked(field_type_ids, type_ids, value_offsets, child_arrays) }; new_self.to_data().validate()?; Ok(new_self) @@ -1059,7 +1057,13 @@ mod tests { let mut builder = UnionBuilder::new_sparse(); builder.append::("a", 1.0).unwrap(); let err = builder.append::("a", 1).unwrap_err().to_string(); - assert!(err.contains("Attempt to write col \"a\" with type Int32 doesn't match existing type Float32"), "{}", err); + assert!( + err.contains( + "Attempt to write col \"a\" with type Int32 doesn't match existing type Float32" + ), + "{}", + err + ); } #[test] diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index 5f0013269677..7e59d940a50e 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -127,11 +127,7 @@ impl BooleanBuilder { /// /// Returns an error if the slices are of different lengths #[inline] - pub fn append_values( - &mut self, - values: &[bool], - is_valid: &[bool], - ) -> Result<(), ArrowError> { + pub fn append_values(&mut self, values: &[bool], is_valid: &[bool]) -> Result<(), ArrowError> { if values.len() != is_valid.len() { Err(ArrowError::InvalidArgumentError( "Value and validity lengths must be equal".to_string(), @@ -250,8 +246,7 @@ mod tests { #[test] fn test_boolean_array_builder_append_slice() { - let arr1 = - BooleanArray::from(vec![Some(true), Some(false), None, None, Some(false)]); + let arr1 = BooleanArray::from(vec![Some(true), Some(false), None, None, Some(false)]); let mut builder = BooleanArray::builder(0); builder.append_slice(&[true, false]); diff --git a/arrow-array/src/builder/buffer_builder.rs b/arrow-array/src/builder/buffer_builder.rs index 01e4c1d4e217..2b66a8187fa9 100644 --- a/arrow-array/src/builder/buffer_builder.rs +++ b/arrow-array/src/builder/buffer_builder.rs @@ -45,11 +45,9 @@ pub type Float32BufferBuilder = BufferBuilder; pub type Float64BufferBuilder = BufferBuilder; /// Buffer builder for 128-bit decimal type. -pub type Decimal128BufferBuilder = - BufferBuilder<::Native>; +pub type Decimal128BufferBuilder = BufferBuilder<::Native>; /// Buffer builder for 256-bit decimal type. -pub type Decimal256BufferBuilder = - BufferBuilder<::Native>; +pub type Decimal256BufferBuilder = BufferBuilder<::Native>; /// Buffer builder for timestamp type of second unit. 
pub type TimestampSecondBufferBuilder = @@ -107,9 +105,7 @@ pub type DurationNanosecondBufferBuilder = #[cfg(test)] mod tests { - use crate::builder::{ - ArrayBuilder, Int32BufferBuilder, Int8Builder, UInt8BufferBuilder, - }; + use crate::builder::{ArrayBuilder, Int32BufferBuilder, Int8Builder, UInt8BufferBuilder}; use crate::Array; #[test] diff --git a/arrow-array/src/builder/fixed_size_binary_builder.rs b/arrow-array/src/builder/fixed_size_binary_builder.rs index 180150e988f3..0a50eb8a50e9 100644 --- a/arrow-array/src/builder/fixed_size_binary_builder.rs +++ b/arrow-array/src/builder/fixed_size_binary_builder.rs @@ -75,7 +75,8 @@ impl FixedSizeBinaryBuilder { pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<(), ArrowError> { if self.value_length != value.as_ref().len() as i32 { Err(ArrowError::InvalidArgumentError( - "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths".to_string() + "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths" + .to_string(), )) } else { self.values_builder.append_slice(value.as_ref()); @@ -95,11 +96,10 @@ impl FixedSizeBinaryBuilder { /// Builds the [`FixedSizeBinaryArray`] and reset this builder. pub fn finish(&mut self) -> FixedSizeBinaryArray { let array_length = self.len(); - let array_data_builder = - ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) - .add_buffer(self.values_builder.finish()) - .nulls(self.null_buffer_builder.finish()) - .len(array_length); + let array_data_builder = ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) + .add_buffer(self.values_builder.finish()) + .nulls(self.null_buffer_builder.finish()) + .len(array_length); let array_data = unsafe { array_data_builder.build_unchecked() }; FixedSizeBinaryArray::from(array_data) } @@ -108,11 +108,10 @@ impl FixedSizeBinaryBuilder { pub fn finish_cloned(&self) -> FixedSizeBinaryArray { let array_length = self.len(); let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); - let array_data_builder = - ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) - .add_buffer(values_buffer) - .nulls(self.null_buffer_builder.finish_cloned()) - .len(array_length); + let array_data_builder = ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) + .add_buffer(values_buffer) + .nulls(self.null_buffer_builder.finish_cloned()) + .len(array_length); let array_data = unsafe { array_data_builder.build_unchecked() }; FixedSizeBinaryArray::from(array_data) } diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index 41165208de55..3cde76c4a039 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -19,10 +19,7 @@ use crate::types::bytes::ByteArrayNativeType; use std::{any::Any, sync::Arc}; use crate::{ - types::{ - BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, - Utf8Type, - }, + types::{BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, Utf8Type}, ArrayRef, ArrowPrimitiveType, RunArray, }; @@ -112,10 +109,7 @@ where pub fn with_capacity(capacity: usize, data_capacity: usize) -> Self { Self { run_ends_builder: PrimitiveBuilder::with_capacity(capacity), - values_builder: GenericByteBuilder::::with_capacity( - capacity, - data_capacity, - ), + values_builder: GenericByteBuilder::::with_capacity(capacity, data_capacity), current_value: Vec::new(), has_current_value: false, 
current_run_end_index: 0, @@ -282,12 +276,13 @@ where } fn run_end_index_as_native(&self) -> R::Native { - R::Native::from_usize(self.current_run_end_index) - .unwrap_or_else(|| panic!( + R::Native::from_usize(self.current_run_end_index).unwrap_or_else(|| { + panic!( "Cannot convert the value {} from `usize` to native form of arrow datatype {}", self.current_run_end_index, R::DATA_TYPE - )) + ) + }) } } @@ -413,8 +408,7 @@ mod tests { // Values are polymorphic and so require a downcast. let av = array.values(); - let ava: &GenericByteArray = - av.as_any().downcast_ref::>().unwrap(); + let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); assert_eq!(*ava.value(0), *values[0]); assert!(ava.is_null(1)); @@ -459,8 +453,7 @@ mod tests { // Values are polymorphic and so require a downcast. let av = array.values(); - let ava: &GenericByteArray = - av.as_any().downcast_ref::>().unwrap(); + let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); assert_eq!(ava.value(0), values[0]); assert!(ava.is_null(1)); diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index d84be8c2fca6..2c7ee7a3e448 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -68,12 +68,8 @@ impl GenericByteBuilder { let value_builder = BufferBuilder::::new_from_buffer(value_buffer); let null_buffer_builder = null_buffer - .map(|buffer| { - NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1) - }) - .unwrap_or_else(|| { - NullBufferBuilder::new_with_len(offsets_builder.len() - 1) - }); + .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1)) + .unwrap_or_else(|| NullBufferBuilder::new_with_len(offsets_builder.len() - 1)); Self { offsets_builder, @@ -84,8 +80,7 @@ impl GenericByteBuilder { #[inline] fn next_offset(&self) -> T::Offset { - T::Offset::from_usize(self.value_builder.len()) - .expect("byte array offset overflow") + T::Offset::from_usize(self.value_builder.len()).expect("byte array offset overflow") } /// Appends a value into the builder. diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 282f423fa6d1..b0c722ae7cda 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -16,9 +16,7 @@ // under the License. 
use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; -use crate::types::{ - ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType, -}; +use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType}; use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray}; use arrow_buffer::ArrowNativeType; use arrow_schema::{ArrowError, DataType}; @@ -91,10 +89,7 @@ where state: Default::default(), dedup: Default::default(), keys_builder: PrimitiveBuilder::with_capacity(keys_capacity), - values_builder: GenericByteBuilder::::with_capacity( - value_capacity, - data_capacity, - ), + values_builder: GenericByteBuilder::::with_capacity(value_capacity, data_capacity), } } @@ -131,8 +126,7 @@ where let mut dedup = HashMap::with_capacity_and_hasher(dict_len, ()); let values_len = dictionary_values.value_data().len(); - let mut values_builder = - GenericByteBuilder::::with_capacity(dict_len, values_len); + let mut values_builder = GenericByteBuilder::::with_capacity(dict_len, values_len); K::Native::from_usize(dictionary_values.len()) .ok_or(ArrowError::DictionaryKeyOverflowError)?; @@ -214,10 +208,7 @@ where /// value is appended to the values array. /// /// Returns an error if the new index would overflow the key type. - pub fn append( - &mut self, - value: impl AsRef, - ) -> Result { + pub fn append(&mut self, value: impl AsRef) -> Result { let value_native: &T::Native = value.as_ref(); let value_bytes: &[u8] = value_native.as_ref(); @@ -240,8 +231,7 @@ where state.hash_one(get_bytes(storage, *idx)) }); - K::Native::from_usize(idx) - .ok_or(ArrowError::DictionaryKeyOverflowError)? + K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)? } }; self.keys_builder.append_value(key); @@ -283,8 +273,7 @@ where let values = self.values_builder.finish(); let keys = self.keys_builder.finish(); - let data_type = - DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); + let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); let builder = keys .into_data() @@ -300,8 +289,7 @@ where let values = self.values_builder.finish_cloned(); let keys = self.keys_builder.finish_cloned(); - let data_type = - DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); + let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); let builder = keys .into_data() @@ -367,12 +355,10 @@ fn get_bytes(values: &GenericByteBuilder, idx: usize) -> &[ /// assert_eq!(ava.value(1), "def"); /// /// ``` -pub type StringDictionaryBuilder = - GenericByteDictionaryBuilder>; +pub type StringDictionaryBuilder = GenericByteDictionaryBuilder>; /// Builder for [`DictionaryArray`] of [`LargeStringArray`](crate::array::LargeStringArray) -pub type LargeStringDictionaryBuilder = - GenericByteDictionaryBuilder>; +pub type LargeStringDictionaryBuilder = GenericByteDictionaryBuilder>; /// Builder for [`DictionaryArray`] of [`BinaryArray`](crate::array::BinaryArray) /// @@ -407,12 +393,10 @@ pub type LargeStringDictionaryBuilder = /// assert_eq!(ava.value(1), b"def"); /// /// ``` -pub type BinaryDictionaryBuilder = - GenericByteDictionaryBuilder>; +pub type BinaryDictionaryBuilder = GenericByteDictionaryBuilder>; /// Builder for [`DictionaryArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray) -pub type LargeBinaryDictionaryBuilder = - GenericByteDictionaryBuilder>; +pub type LargeBinaryDictionaryBuilder = GenericByteDictionaryBuilder>; #[cfg(test)] mod tests { @@ -444,8 
+428,7 @@ mod tests { // Values are polymorphic and so require a downcast. let av = array.values(); - let ava: &GenericByteArray = - av.as_any().downcast_ref::>().unwrap(); + let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); assert_eq!(*ava.value(0), *values[0]); assert_eq!(*ava.value(1), *values[1]); @@ -483,8 +466,7 @@ mod tests { // Values are polymorphic and so require a downcast. let av = array.values(); - let ava: &GenericByteArray = - av.as_any().downcast_ref::>().unwrap(); + let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); assert_eq!(ava.value(0), values[0]); assert_eq!(ava.value(1), values[1]); @@ -542,11 +524,8 @@ mod tests { ::Native: AsRef<::Native>, { let mut builder = - GenericByteDictionaryBuilder::::new_with_dictionary( - 6, - &dictionary, - ) - .unwrap(); + GenericByteDictionaryBuilder::::new_with_dictionary(6, &dictionary) + .unwrap(); builder.append(values[0]).unwrap(); builder.append_null(); builder.append(values[1]).unwrap(); @@ -562,8 +541,7 @@ mod tests { // Values are polymorphic and so require a downcast. let av = array.values(); - let ava: &GenericByteArray = - av.as_any().downcast_ref::>().unwrap(); + let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); assert!(!ava.is_valid(0)); assert_eq!(ava.value(1), values[1]); @@ -597,11 +575,8 @@ mod tests { ::Native: AsRef<::Native>, { let mut builder = - GenericByteDictionaryBuilder::::new_with_dictionary( - 4, - &dictionary, - ) - .unwrap(); + GenericByteDictionaryBuilder::::new_with_dictionary(4, &dictionary) + .unwrap(); builder.append(values[0]).unwrap(); builder.append_null(); builder.append(values[1]).unwrap(); diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 4e3ec4a7944d..3a5244ed81a0 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -86,11 +86,7 @@ impl Default for MapFieldNames { impl MapBuilder { /// Creates a new `MapBuilder` - pub fn new( - field_names: Option, - key_builder: K, - value_builder: V, - ) -> Self { + pub fn new(field_names: Option, key_builder: K, value_builder: V) -> Self { let capacity = key_builder.len(); Self::with_capacity(field_names, key_builder, value_builder, capacity) } @@ -243,12 +239,9 @@ mod tests { use super::*; #[test] - #[should_panic( - expected = "Keys array must have no null values, found 1 null value(s)" - )] + #[should_panic(expected = "Keys array must have no null values, found 1 null value(s)")] fn test_map_builder_with_null_keys_panics() { - let mut builder = - MapBuilder::new(None, StringBuilder::new(), Int32Builder::new()); + let mut builder = MapBuilder::new(None, StringBuilder::new(), Int32Builder::new()); builder.keys().append_null(); builder.values().append_value(42); builder.append(true).unwrap(); diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index b23d6bba36c4..0aad2dbfce0e 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -161,9 +161,7 @@ impl PrimitiveBuilder { let values_builder = BufferBuilder::::new_from_buffer(values_buffer); let null_buffer_builder = null_buffer - .map(|buffer| { - NullBufferBuilder::new_from_buffer(buffer, values_builder.len()) - }) + .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, values_builder.len())) .unwrap_or_else(|| NullBufferBuilder::new_with_len(values_builder.len())); Self { @@ -256,10 +254,7 @@ impl PrimitiveBuilder { /// This requires the iterator 
be a trusted length. This could instead require /// the iterator implement `TrustedLen` once that is stabilized. #[inline] - pub unsafe fn append_trusted_len_iter( - &mut self, - iter: impl IntoIterator, - ) { + pub unsafe fn append_trusted_len_iter(&mut self, iter: impl IntoIterator) { let iter = iter.into_iter(); let len = iter .size_hint() @@ -328,11 +323,7 @@ impl PrimitiveBuilder { impl PrimitiveBuilder

{ /// Sets the precision and scale - pub fn with_precision_and_scale( - self, - precision: u8, - scale: i8, - ) -> Result<Self, ArrowError> { + pub fn with_precision_and_scale(self, precision: u8, scale: i8) -> Result<Self, ArrowError> { validate_decimal_precision_and_scale::<P>
(precision, scale)?; Ok(Self { data_type: P::TYPE_CONSTRUCTOR(precision, scale), @@ -592,25 +583,21 @@ mod tests { #[test] fn test_primitive_array_builder_with_data_type() { - let mut builder = - Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); + let mut builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); builder.append_value(1); let array = builder.finish(); assert_eq!(array.precision(), 1); assert_eq!(array.scale(), 2); let data_type = DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())); - let mut builder = - TimestampNanosecondBuilder::new().with_data_type(data_type.clone()); + let mut builder = TimestampNanosecondBuilder::new().with_data_type(data_type.clone()); builder.append_value(1); let array = builder.finish(); assert_eq!(array.data_type(), &data_type); } #[test] - #[should_panic( - expected = "incompatible data type for builder, expected Int32 got Int64" - )] + #[should_panic(expected = "incompatible data type for builder, expected Int32 got Int64")] fn test_invalid_with_data_type() { Int32Builder::new().with_data_type(DataType::Int64); } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 7323ee57627d..a47b2d30d4f3 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -221,8 +221,7 @@ where let key = self.values_builder.len(); self.values_builder.append_value(value); vacant.insert(key); - K::Native::from_usize(key) - .ok_or(ArrowError::DictionaryKeyOverflowError)? + K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)? } Entry::Occupied(o) => K::Native::usize_as(*o.get()), }; @@ -266,10 +265,8 @@ where let values = self.values_builder.finish(); let keys = self.keys_builder.finish(); - let data_type = DataType::Dictionary( - Box::new(K::DATA_TYPE), - Box::new(values.data_type().clone()), - ); + let data_type = + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone())); let builder = keys .into_data() @@ -285,8 +282,7 @@ where let values = self.values_builder.finish_cloned(); let keys = self.keys_builder.finish_cloned(); - let data_type = - DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE)); + let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE)); let builder = keys .into_data() @@ -331,8 +327,7 @@ mod tests { #[test] fn test_primitive_dictionary_builder() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(3, 2); + let mut builder = PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(12345678).unwrap(); builder.append_null(); builder.append(22345678).unwrap(); @@ -384,8 +379,7 @@ mod tests { #[test] fn test_primitive_dictionary_with_builders() { let keys_builder = PrimitiveBuilder::::new(); - let values_builder = - Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); + let values_builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); let mut builder = PrimitiveDictionaryBuilder::::new_from_empty_builders( keys_builder, diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 7aa91dacaa8c..0f40b8a487ae 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -106,24 +106,18 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box Box::new(Float32Builder::with_capacity(capacity)), 
DataType::Float64 => Box::new(Float64Builder::with_capacity(capacity)), DataType::Binary => Box::new(BinaryBuilder::with_capacity(capacity, 1024)), - DataType::LargeBinary => { - Box::new(LargeBinaryBuilder::with_capacity(capacity, 1024)) - } + DataType::LargeBinary => Box::new(LargeBinaryBuilder::with_capacity(capacity, 1024)), DataType::FixedSizeBinary(len) => { Box::new(FixedSizeBinaryBuilder::with_capacity(capacity, *len)) } DataType::Decimal128(p, s) => Box::new( - Decimal128Builder::with_capacity(capacity) - .with_data_type(DataType::Decimal128(*p, *s)), + Decimal128Builder::with_capacity(capacity).with_data_type(DataType::Decimal128(*p, *s)), ), DataType::Decimal256(p, s) => Box::new( - Decimal256Builder::with_capacity(capacity) - .with_data_type(DataType::Decimal256(*p, *s)), + Decimal256Builder::with_capacity(capacity).with_data_type(DataType::Decimal256(*p, *s)), ), DataType::Utf8 => Box::new(StringBuilder::with_capacity(capacity, 1024)), - DataType::LargeUtf8 => { - Box::new(LargeStringBuilder::with_capacity(capacity, 1024)) - } + DataType::LargeUtf8 => Box::new(LargeStringBuilder::with_capacity(capacity, 1024)), DataType::Date32 => Box::new(Date32Builder::with_capacity(capacity)), DataType::Date64 => Box::new(Date64Builder::with_capacity(capacity)), DataType::Time32(TimeUnit::Second) => { @@ -175,19 +169,14 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { Box::new(DurationNanosecondBuilder::with_capacity(capacity)) } - DataType::Struct(fields) => { - Box::new(StructBuilder::from_fields(fields.clone(), capacity)) - } + DataType::Struct(fields) => Box::new(StructBuilder::from_fields(fields.clone(), capacity)), t => panic!("Data type {t:?} is not currently supported"), } } impl StructBuilder { /// Creates a new `StructBuilder` - pub fn new( - fields: impl Into, - field_builders: Vec>, - ) -> Self { + pub fn new(fields: impl Into, field_builders: Vec>) -> Self { Self { field_builders, fields: fields.into(), @@ -234,10 +223,7 @@ impl StructBuilder { pub fn finish(&mut self) -> StructArray { self.validate_content(); if self.fields.is_empty() { - return StructArray::new_empty_fields( - self.len(), - self.null_buffer_builder.finish(), - ); + return StructArray::new_empty_fields(self.len(), self.null_buffer_builder.finish()); } let arrays = self.field_builders.iter_mut().map(|f| f.finish()).collect(); @@ -524,8 +510,7 @@ mod tests { expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) is not currently supported" )] fn test_struct_array_builder_from_schema_unsupported_type() { - let list_type = - DataType::List(Arc::new(Field::new("item", DataType::Int64, true))); + let list_type = DataType::List(Arc::new(Field::new("item", DataType::Int64, true))); let fields = vec![ Field::new("f1", DataType::Int16, false), Field::new("f2", list_type, false), @@ -571,9 +556,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Number of fields is not equal to the number of field_builders." 
- )] + #[should_panic(expected = "Number of fields is not equal to the number of field_builders.")] fn test_struct_array_builder_unequal_field_field_builders() { let int_builder = Int32Builder::with_capacity(10); diff --git a/arrow-array/src/builder/union_builder.rs b/arrow-array/src/builder/union_builder.rs index f74afb2aa9aa..4f88c9d41b9a 100644 --- a/arrow-array/src/builder/union_builder.rs +++ b/arrow-array/src/builder/union_builder.rs @@ -65,11 +65,7 @@ impl FieldDataValues for BufferBuilder { impl FieldData { /// Creates a new `FieldData`. - fn new( - type_id: i8, - data_type: DataType, - capacity: usize, - ) -> Self { + fn new(type_id: i8, data_type: DataType, capacity: usize) -> Self { Self { type_id, data_type, @@ -222,7 +218,12 @@ impl UnionBuilder { let mut field_data = match self.fields.remove(&type_name) { Some(data) => { if data.data_type != T::DATA_TYPE { - return Err(ArrowError::InvalidArgumentError(format!("Attempt to write col \"{}\" with type {} doesn't match existing type {}", type_name, T::DATA_TYPE, data.data_type))); + return Err(ArrowError::InvalidArgumentError(format!( + "Attempt to write col \"{}\" with type {} doesn't match existing type {}", + type_name, + T::DATA_TYPE, + data.data_type + ))); } data } diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index b6cda44e8973..2e21f3e7e640 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -578,9 +578,7 @@ macro_rules! downcast_run_array { /// Force downcast of an [`Array`], such as an [`ArrayRef`] to /// [`GenericListArray`], panicking on failure. -pub fn as_generic_list_array( - arr: &dyn Array, -) -> &GenericListArray { +pub fn as_generic_list_array(arr: &dyn Array) -> &GenericListArray { arr.as_any() .downcast_ref::>() .expect("Unable to downcast to list array") @@ -612,9 +610,7 @@ pub fn as_large_list_array(arr: &dyn Array) -> &LargeListArray { /// Force downcast of an [`Array`], such as an [`ArrayRef`] to /// [`GenericBinaryArray`], panicking on failure. 
#[inline] -pub fn as_generic_binary_array( - arr: &dyn Array, -) -> &GenericBinaryArray { +pub fn as_generic_binary_array(arr: &dyn Array) -> &GenericBinaryArray { arr.as_any() .downcast_ref::>() .expect("Unable to downcast to binary array") @@ -826,8 +822,7 @@ pub trait AsArray: private::Sealed { } /// Downcast this to a [`DictionaryArray`] returning `None` if not possible - fn as_dictionary_opt(&self) - -> Option<&DictionaryArray>; + fn as_dictionary_opt(&self) -> Option<&DictionaryArray>; /// Downcast this to a [`DictionaryArray`] panicking if not possible fn as_dictionary(&self) -> &DictionaryArray { @@ -877,9 +872,7 @@ impl AsArray for dyn Array + '_ { self.as_any().downcast_ref() } - fn as_dictionary_opt( - &self, - ) -> Option<&DictionaryArray> { + fn as_dictionary_opt(&self) -> Option<&DictionaryArray> { self.as_any().downcast_ref() } @@ -926,9 +919,7 @@ impl AsArray for ArrayRef { self.as_any().downcast_ref() } - fn as_dictionary_opt( - &self, - ) -> Option<&DictionaryArray> { + fn as_dictionary_opt(&self) -> Option<&DictionaryArray> { self.as_ref().as_dictionary_opt() } @@ -972,9 +963,7 @@ mod tests { #[test] fn test_decimal256array() { - let a = Decimal256Array::from_iter_values( - [1, 2, 4, 5].into_iter().map(i256::from_i128), - ); + let a = Decimal256Array::from_iter_values([1, 2, 4, 5].into_iter().map(i256::from_i128)); assert!(!as_primitive_array::(&a).is_empty()); } } diff --git a/arrow-array/src/delta.rs b/arrow-array/src/delta.rs index bf9ee5ca685f..d9aa4aa6de5d 100644 --- a/arrow-array/src/delta.rs +++ b/arrow-array/src/delta.rs @@ -55,10 +55,7 @@ pub(crate) fn add_months_datetime( /// Add the given number of days to the given datetime. /// /// Returns `None` when it will result in overflow. -pub(crate) fn add_days_datetime( - dt: DateTime, - days: i32, -) -> Option> { +pub(crate) fn add_days_datetime(dt: DateTime, days: i32) -> Option> { match days.cmp(&0) { Ordering::Equal => Some(dt), Ordering::Greater => dt.checked_add_days(Days::new(days as u64)), @@ -83,10 +80,7 @@ pub(crate) fn sub_months_datetime( /// Substract the given number of days to the given datetime. /// /// Returns `None` when it will result in overflow. -pub(crate) fn sub_days_datetime( - dt: DateTime, - days: i32, -) -> Option> { +pub(crate) fn sub_days_datetime(dt: DateTime, days: i32) -> Option> { match days.cmp(&0) { Ordering::Equal => Some(dt), Ordering::Greater => dt.checked_sub_days(Days::new(days as u64)), diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index a198332ca5b5..3f9cc0d525c1 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -18,8 +18,8 @@ //! 
Idiomatic iterators for [`Array`](crate::Array) use crate::array::{ - ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, - GenericListArray, GenericStringArray, PrimitiveArray, + ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray, + GenericStringArray, PrimitiveArray, }; use crate::{FixedSizeListArray, MapArray}; use arrow_buffer::NullBuffer; @@ -187,8 +187,7 @@ mod tests { #[test] fn test_string_array_iter_round_trip() { - let array = - StringArray::from(vec![Some("a"), None, Some("aaa"), None, Some("aaaaa")]); + let array = StringArray::from(vec![Some("a"), None, Some("aaa"), None, Some("aaaaa")]); let array = Arc::new(array) as ArrayRef; let array = array.as_any().downcast_ref::().unwrap(); @@ -211,8 +210,7 @@ mod tests { // check if DoubleEndedIterator is implemented let result: StringArray = array.iter().rev().collect(); - let rev_array = - StringArray::from(vec![Some("aaaaa"), None, Some("aaa"), None, Some("a")]); + let rev_array = StringArray::from(vec![Some("aaaaa"), None, Some("aaa"), None, Some("a")]); assert_eq!(result, rev_array); // check if ExactSizeIterator is implemented let _ = array.iter().rposition(|opt_b| opt_b == Some("a")); diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index afb7ec5e6e44..ef98c5efefb0 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -182,8 +182,7 @@ pub use array::*; mod record_batch; pub use record_batch::{ - RecordBatch, RecordBatchIterator, RecordBatchOptions, RecordBatchReader, - RecordBatchWriter, + RecordBatch, RecordBatchIterator, RecordBatchOptions, RecordBatchReader, RecordBatchWriter, }; mod arithmetic; diff --git a/arrow-array/src/numeric.rs b/arrow-array/src/numeric.rs index afc0e2c33010..ad7b3eca1dbc 100644 --- a/arrow-array/src/numeric.rs +++ b/arrow-array/src/numeric.rs @@ -179,8 +179,8 @@ macro_rules! make_numeric_type { 16 => { // same general logic as for 8 lanes, extended to 16 bits let vecidx = i32x16::new( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, - 8192, 16384, 32768, + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, + 32768, ); let vecmask = i32x16::splat((mask & 0xFFFF) as i32); @@ -194,21 +194,19 @@ macro_rules! make_numeric_type { let tmp = &mut [0_i16; 32]; let vecidx = i32x16::new( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, - 8192, 16384, 32768, + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, + 32768, ); let vecmask = i32x16::splat((mask & 0xFFFF) as i32); let vecmask = (vecidx & vecmask).eq(vecidx); - i16x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[0..16]); + i16x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[0..16]); let vecmask = i32x16::splat(((mask >> 16) & 0xFFFF) as i32); let vecmask = (vecidx & vecmask).eq(vecidx); - i16x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[16..32]); + i16x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[16..32]); unsafe { std::mem::transmute(i16x32::from_slice_unaligned(tmp)) } } @@ -218,33 +216,29 @@ macro_rules! 
make_numeric_type { let tmp = &mut [0_i8; 64]; let vecidx = i32x16::new( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, - 8192, 16384, 32768, + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, + 32768, ); let vecmask = i32x16::splat((mask & 0xFFFF) as i32); let vecmask = (vecidx & vecmask).eq(vecidx); - i8x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[0..16]); + i8x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[0..16]); let vecmask = i32x16::splat(((mask >> 16) & 0xFFFF) as i32); let vecmask = (vecidx & vecmask).eq(vecidx); - i8x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[16..32]); + i8x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[16..32]); let vecmask = i32x16::splat(((mask >> 32) & 0xFFFF) as i32); let vecmask = (vecidx & vecmask).eq(vecidx); - i8x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[32..48]); + i8x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[32..48]); let vecmask = i32x16::splat(((mask >> 48) & 0xFFFF) as i32); let vecmask = (vecidx & vecmask).eq(vecidx); - i8x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[48..64]); + i8x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[48..64]); unsafe { std::mem::transmute(i8x64::from_slice_unaligned(tmp)) } } @@ -269,11 +263,7 @@ macro_rules! make_numeric_type { /// Selects elements of `a` and `b` using `mask` #[inline] - fn mask_select( - mask: Self::SimdMask, - a: Self::Simd, - b: Self::Simd, - ) -> Self::Simd { + fn mask_select(mask: Self::SimdMask, a: Self::Simd, b: Self::Simd) -> Self::Simd { mask.select(a, b) } @@ -327,10 +317,7 @@ macro_rules! make_numeric_type { } #[inline] - fn unary_op Self::Simd>( - a: Self::Simd, - op: F, - ) -> Self::Simd { + fn unary_op Self::Simd>(a: Self::Simd, op: F) -> Self::Simd { op(a) } } @@ -581,8 +568,7 @@ mod tests { let mask = 0b1101; let actual = IntervalMonthDayNanoType::mask_from_u64(mask); let expected = expected_mask!(i128, mask); - let expected = - m128x4::from_cast(i128x4::from_slice_unaligned(expected.as_slice())); + let expected = m128x4::from_cast(i128x4::from_slice_unaligned(expected.as_slice())); assert_eq!(expected, actual); } @@ -612,8 +598,7 @@ mod tests { let mask = 0b10101010_10101010; let actual = Float32Type::mask_from_u64(mask); let expected = expected_mask!(i32, mask); - let expected = - m32x16::from_cast(i32x16::from_slice_unaligned(expected.as_slice())); + let expected = m32x16::from_cast(i32x16::from_slice_unaligned(expected.as_slice())); assert_eq!(expected, actual); } @@ -623,8 +608,7 @@ mod tests { let mask = 0b01010101_01010101; let actual = Int32Type::mask_from_u64(mask); let expected = expected_mask!(i32, mask); - let expected = - m32x16::from_cast(i32x16::from_slice_unaligned(expected.as_slice())); + let expected = m32x16::from_cast(i32x16::from_slice_unaligned(expected.as_slice())); assert_eq!(expected, actual); } @@ -635,16 +619,14 @@ mod tests { let actual = UInt16Type::mask_from_u64(mask); let expected = expected_mask!(i16, mask); dbg!(&expected); - let expected = - m16x32::from_cast(i16x32::from_slice_unaligned(expected.as_slice())); + let expected = m16x32::from_cast(i16x32::from_slice_unaligned(expected.as_slice())); assert_eq!(expected, actual); } #[test] fn test_mask_i8() { - let mask = - 0b01010101_01010101_10101010_10101010_01010101_01010101_10101010_10101010; + let mask = 0b01010101_01010101_10101010_10101010_01010101_01010101_10101010_10101010; let actual = Int8Type::mask_from_u64(mask); let expected = expected_mask!(i8, mask); 
let expected = m8x64::from_cast(i8x64::from_slice_unaligned(expected.as_slice())); diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 27804447fba6..1f3e1df847a8 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -107,10 +107,7 @@ impl RecordBatch { /// vec![Arc::new(id_array)] /// ).unwrap(); /// ``` - pub fn try_new( - schema: SchemaRef, - columns: Vec, - ) -> Result { + pub fn try_new(schema: SchemaRef, columns: Vec) -> Result { let options = RecordBatchOptions::new(); Self::try_new_impl(schema, columns, &options) } @@ -179,9 +176,7 @@ impl RecordBatch { // check that all columns have the same row count if columns.iter().any(|c| c.len() != row_count) { let err = match options.row_count { - Some(_) => { - "all columns in a record batch must have the specified row count" - } + Some(_) => "all columns in a record batch must have the specified row count", None => "all columns in a record batch must have the same length", }; return Err(ArrowError::InvalidArgumentError(err.to_string())); @@ -190,9 +185,7 @@ impl RecordBatch { // function for comparing column type and field type // return true if 2 types are not matched let type_not_match = if options.match_field_names { - |(_, (col_type, field_type)): &(usize, (&DataType, &DataType))| { - col_type != field_type - } + |(_, (col_type, field_type)): &(usize, (&DataType, &DataType))| col_type != field_type } else { |(_, (col_type, field_type)): &(usize, (&DataType, &DataType))| { !col_type.equals_datatype(field_type) @@ -484,7 +477,11 @@ impl From for RecordBatch { fn from(value: StructArray) -> Self { let row_count = value.len(); let (fields, columns, nulls) = value.into_parts(); - assert_eq!(nulls.map(|n| n.null_count()).unwrap_or_default(), 0, "Cannot convert nullable StructArray to RecordBatch, see StructArray documentation"); + assert_eq!( + nulls.map(|n| n.null_count()).unwrap_or_default(), + 0, + "Cannot convert nullable StructArray to RecordBatch, see StructArray documentation" + ); RecordBatch { schema: Arc::new(Schema::new(fields)), @@ -588,9 +585,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::{ - BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray, - }; + use crate::{BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::Fields; @@ -606,8 +601,7 @@ mod tests { let b = StringArray::from(vec!["a", "b", "c", "d", "e"]); let record_batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); check_batch(record_batch, 5) } @@ -622,8 +616,7 @@ mod tests { let b = StringArray::from(vec!["a", "b", "c", "d", "e"]); let record_batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); assert_eq!(record_batch.get_array_memory_size(), 364); } @@ -649,8 +642,7 @@ mod tests { let b = StringArray::from(vec!["a", "b", "c", "d", "e", "f", "h", "i"]); let record_batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); let offset = 2; let length = 5; @@ -699,8 +691,8 @@ mod tests { ])); let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c", "d", "e"])); - let record_batch 
= RecordBatch::try_from_iter(vec![("a", a), ("b", b)]) - .expect("valid conversion"); + let record_batch = + RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).expect("valid conversion"); let expected_schema = Schema::new(vec![ Field::new("a", DataType::Int32, true), @@ -716,11 +708,9 @@ mod tests { let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c", "d", "e"])); // Note there are no nulls in a or b, but we specify that b is nullable - let record_batch = RecordBatch::try_from_iter_with_nullable(vec![ - ("a", a, false), - ("b", b, true), - ]) - .expect("valid conversion"); + let record_batch = + RecordBatch::try_from_iter_with_nullable(vec![("a", a, false), ("b", b, true)]) + .expect("valid conversion"); let expected_schema = Schema::new(vec![ Field::new("a", DataType::Int32, false), @@ -792,8 +782,7 @@ mod tests { let a = Int32Array::from(vec![1, 2, 3, 4, 5]); let b = Int32Array::from(vec![1, 2, 3, 4, 5]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]); assert!(batch.is_err()); } @@ -863,11 +852,8 @@ mod tests { Field::new("id", DataType::Int32, false), Field::new("val", DataType::Int32, false), ]); - let record_batch = RecordBatch::try_new( - Arc::new(schema1), - vec![id_arr.clone(), val_arr.clone()], - ) - .unwrap(); + let record_batch = + RecordBatch::try_new(Arc::new(schema1), vec![id_arr.clone(), val_arr.clone()]).unwrap(); assert_eq!(record_batch["id"].as_ref(), id_arr.as_ref()); assert_eq!(record_batch["val"].as_ref(), val_arr.as_ref()); @@ -1005,15 +991,12 @@ mod tests { let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"])); let c: ArrayRef = Arc::new(StringArray::from(vec!["d", "e", "f"])); - let record_batch = RecordBatch::try_from_iter(vec![ - ("a", a.clone()), - ("b", b.clone()), - ("c", c.clone()), - ]) - .expect("valid conversion"); + let record_batch = + RecordBatch::try_from_iter(vec![("a", a.clone()), ("b", b.clone()), ("c", c.clone())]) + .expect("valid conversion"); - let expected = RecordBatch::try_from_iter(vec![("a", a), ("c", c)]) - .expect("valid conversion"); + let expected = + RecordBatch::try_from_iter(vec![("a", a), ("c", c)]).expect("valid conversion"); assert_eq!(expected, record_batch.project(&[0, 2]).unwrap()); } @@ -1049,8 +1032,7 @@ mod tests { let options = RecordBatchOptions::new().with_row_count(Some(10)); - let ok = - RecordBatch::try_new_with_options(schema.clone(), vec![], &options).unwrap(); + let ok = RecordBatch::try_new_with_options(schema.clone(), vec![], &options).unwrap(); assert_eq!(ok.num_rows(), 10); let a = ok.slice(2, 5); diff --git a/arrow-array/src/run_iterator.rs b/arrow-array/src/run_iterator.rs index 489aabf4756a..7a98fccb73b5 100644 --- a/arrow-array/src/run_iterator.rs +++ b/arrow-array/src/run_iterator.rs @@ -86,8 +86,7 @@ where // If current logical index is greater than current run end index then increment // the physical index. let run_ends = self.array.run_ends().values(); - if self.current_front_logical >= run_ends[self.current_front_physical].as_usize() - { + if self.current_front_logical >= run_ends[self.current_front_physical].as_usize() { // As the run_ends is expected to be strictly increasing, there // should be at least one logical entry in one physical entry. Because of this // reason the next value can be accessed by incrementing physical index once. 
@@ -136,8 +135,7 @@ where let run_ends = self.array.run_ends().values(); if self.current_back_physical > 0 - && self.current_back_logical - < run_ends[self.current_back_physical - 1].as_usize() + && self.current_back_logical < run_ends[self.current_back_physical - 1].as_usize() { // As the run_ends is expected to be strictly increasing, there // should be at least one logical entry in one physical entry. Because of this @@ -211,8 +209,7 @@ mod tests { seed.shuffle(&mut rng); } // repeat the items between 1 and 8 times. Cap the length for smaller sized arrays - let num = - max_run_length.min(rand::thread_rng().gen_range(1..=max_run_length)); + let num = max_run_length.min(rand::thread_rng().gen_range(1..=max_run_length)); for _ in 0..num { result.push(seed[ix]); } @@ -285,8 +282,7 @@ mod tests { for logical_len in logical_lengths { let input_array = build_input_array(logical_len); - let mut run_array_builder = - PrimitiveRunBuilder::::new(); + let mut run_array_builder = PrimitiveRunBuilder::::new(); run_array_builder.extend(input_array.iter().copied()); let run_array = run_array_builder.finish(); let typed_array = run_array.downcast::().unwrap(); @@ -327,8 +323,7 @@ mod tests { }) .collect(); - let result_asref: Vec> = - result.iter().map(|f| f.as_deref()).collect(); + let result_asref: Vec> = result.iter().map(|f| f.as_deref()).collect(); let expected_vec = vec![ Some("abb"), @@ -364,8 +359,7 @@ mod tests { // Iterate on sliced typed run array let actual: Vec> = sliced_typed_run_array.into_iter().collect(); - let expected: Vec> = - input_array.iter().take(slice_len).copied().collect(); + let expected: Vec> = input_array.iter().take(slice_len).copied().collect(); assert_eq!(expected, actual); // test for offset = total_len - slice_len, length = slice_len diff --git a/arrow-array/src/temporal_conversions.rs b/arrow-array/src/temporal_conversions.rs index f1f3f36d3c61..e0edcc9bc182 100644 --- a/arrow-array/src/temporal_conversions.rs +++ b/arrow-array/src/temporal_conversions.rs @@ -20,9 +20,7 @@ use crate::timezone::Tz; use crate::ArrowPrimitiveType; use arrow_schema::{DataType, TimeUnit}; -use chrono::{ - DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Timelike, Utc, -}; +use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Timelike, Utc}; /// Number of seconds in a day pub const SECONDS_IN_DAY: i64 = 86_400; @@ -221,10 +219,7 @@ pub fn as_datetime(v: i64) -> Option { } /// Converts an [`ArrowPrimitiveType`] to [`DateTime`] -pub fn as_datetime_with_timezone( - v: i64, - tz: Tz, -) -> Option> { +pub fn as_datetime_with_timezone(v: i64, tz: Tz) -> Option> { let naive = as_datetime::(v)?; Some(Utc.from_utc_datetime(&naive).with_timezone(&tz)) } @@ -274,8 +269,8 @@ pub fn as_duration(v: i64) -> Option { #[cfg(test)] mod tests { use crate::temporal_conversions::{ - date64_to_datetime, split_second, timestamp_ms_to_datetime, - timestamp_ns_to_datetime, timestamp_us_to_datetime, NANOSECONDS, + date64_to_datetime, split_second, timestamp_ms_to_datetime, timestamp_ns_to_datetime, + timestamp_us_to_datetime, NANOSECONDS, }; use chrono::NaiveDateTime; diff --git a/arrow-array/src/timezone.rs b/arrow-array/src/timezone.rs index f56189c46512..dc91886f34c5 100644 --- a/arrow-array/src/timezone.rs +++ b/arrow-array/src/timezone.rs @@ -38,8 +38,8 @@ fn parse_fixed_offset(tz: &str) -> Option { if values.iter().any(|x| *x > 9) { return None; } - let secs = (values[0] * 10 + values[1]) as i32 * 60 * 60 - + (values[2] * 10 + values[3]) as i32 * 60; + let secs = + 
(values[0] * 10 + values[1]) as i32 * 60 * 60 + (values[2] * 10 + values[3]) as i32 * 60; match bytes[0] { b'+' => FixedOffset::east_opt(secs), @@ -122,10 +122,7 @@ mod private { }) } - fn offset_from_local_datetime( - &self, - local: &NaiveDateTime, - ) -> LocalResult { + fn offset_from_local_datetime(&self, local: &NaiveDateTime) -> LocalResult { tz!(self, tz, { tz.offset_from_local_datetime(local).map(|x| TzOffset { tz: *self, @@ -285,10 +282,7 @@ mod private { self.0.offset_from_local_date(local).map(TzOffset) } - fn offset_from_local_datetime( - &self, - local: &NaiveDateTime, - ) -> LocalResult { + fn offset_from_local_datetime(&self, local: &NaiveDateTime) -> LocalResult { self.0.offset_from_local_datetime(local).map(TzOffset) } diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 7988fe9f6690..16d0e822d052 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -18,8 +18,7 @@ //! Zero-sized types used to parameterize generic array implementations use crate::delta::{ - add_days_datetime, add_months_datetime, shift_months, sub_days_datetime, - sub_months_datetime, + add_days_datetime, add_months_datetime, shift_months, sub_days_datetime, sub_months_datetime, }; use crate::temporal_conversions::as_datetime_with_timezone; use crate::timezone::Tz; @@ -27,9 +26,8 @@ use crate::{ArrowNativeTypeOp, OffsetSizeTrait}; use arrow_buffer::{i256, Buffer, OffsetBuffer}; use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision}; use arrow_schema::{ - ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, - DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, - DECIMAL_DEFAULT_SCALE, + ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, + DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, }; use chrono::{Duration, NaiveDate, NaiveDateTime}; use half::f16; @@ -875,9 +873,7 @@ impl IntervalDayTimeType { /// /// * `i` - The IntervalDayTimeType to convert #[inline] - pub fn to_parts( - i: ::Native, - ) -> (i32, i32) { + pub fn to_parts(i: ::Native) -> (i32, i32) { let days = (i >> 32) as i32; let ms = i as i32; (days, ms) @@ -1221,10 +1217,7 @@ pub trait DecimalType: fn format_decimal(value: Self::Native, precision: u8, scale: i8) -> String; /// Validates that `value` contains no more than `precision` decimal digits - fn validate_decimal_precision( - value: Self::Native, - precision: u8, - ) -> Result<(), ArrowError>; + fn validate_decimal_precision(value: Self::Native, precision: u8) -> Result<(), ArrowError>; } /// Validate that `precision` and `scale` are valid for `T` @@ -1400,10 +1393,7 @@ pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed { const DATA_TYPE: DataType; /// Verifies that every consecutive pair of `offsets` denotes a valid slice of `values` - fn validate( - offsets: &OffsetBuffer, - values: &Buffer, - ) -> Result<(), ArrowError>; + fn validate(offsets: &OffsetBuffer, values: &Buffer) -> Result<(), ArrowError>; } /// [`ByteArrayType`] for string arrays @@ -1422,10 +1412,7 @@ impl ByteArrayType for GenericStringType { DataType::Utf8 }; - fn validate( - offsets: &OffsetBuffer, - values: &Buffer, - ) -> Result<(), ArrowError> { + fn validate(offsets: &OffsetBuffer, values: &Buffer) -> Result<(), ArrowError> { // Verify that the slice as a whole is valid UTF-8 let validated = std::str::from_utf8(values).map_err(|e| { ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}")) @@ -1471,10 
+1458,7 @@ impl ByteArrayType for GenericBinaryType { DataType::Binary }; - fn validate( - offsets: &OffsetBuffer, - values: &Buffer, - ) -> Result<(), ArrowError> { + fn validate(offsets: &OffsetBuffer, values: &Buffer) -> Result<(), ArrowError> { // offsets are guaranteed to be monotonically increasing and non-empty let max_offset = offsets.last().unwrap().as_usize(); if values.len() < max_offset { diff --git a/arrow-avro/src/reader/header.rs b/arrow-avro/src/reader/header.rs index 2d443175a7aa..00e85b39be73 100644 --- a/arrow-avro/src/reader/header.rs +++ b/arrow-avro/src/reader/header.rs @@ -133,9 +133,7 @@ impl HeaderDecoder { let remaining = &MAGIC[MAGIC.len() - self.bytes_remaining..]; let to_decode = buf.len().min(remaining.len()); if !buf.starts_with(&remaining[..to_decode]) { - return Err(ArrowError::ParseError( - "Incorrect avro magic".to_string(), - )); + return Err(ArrowError::ParseError("Incorrect avro magic".to_string())); } self.bytes_remaining -= to_decode; buf = &buf[to_decode..]; diff --git a/arrow-avro/src/reader/mod.rs b/arrow-avro/src/reader/mod.rs index 91e2dbf9835b..7769bbbc4998 100644 --- a/arrow-avro/src/reader/mod.rs +++ b/arrow-avro/src/reader/mod.rs @@ -50,9 +50,7 @@ fn read_header(mut reader: R) -> Result { } /// Return an iterator of [`Block`] from the provided [`BufRead`] -fn read_blocks( - mut reader: R, -) -> impl Iterator> { +fn read_blocks(mut reader: R) -> impl Iterator> { let mut decoder = BlockDecoder::default(); let mut try_next = move || { diff --git a/arrow-avro/src/schema.rs b/arrow-avro/src/schema.rs index 839ba65bd5fc..17b82cf861b7 100644 --- a/arrow-avro/src/schema.rs +++ b/arrow-avro/src/schema.rs @@ -335,9 +335,7 @@ mod tests { Field { name: "value", doc: None, - r#type: Schema::TypeName(TypeName::Primitive( - PrimitiveType::Long - )), + r#type: Schema::TypeName(TypeName::Primitive(PrimitiveType::Long)), default: None, }, Field { diff --git a/arrow-buffer/src/bigint/div.rs b/arrow-buffer/src/bigint/div.rs index ba530ffcc6c8..e1b2ed4f8aa5 100644 --- a/arrow-buffer/src/bigint/div.rs +++ b/arrow-buffer/src/bigint/div.rs @@ -26,10 +26,7 @@ /// # Panics /// /// Panics if divisor is zero -pub fn div_rem( - numerator: &[u64; N], - divisor: &[u64; N], -) -> ([u64; N], [u64; N]) { +pub fn div_rem(numerator: &[u64; N], divisor: &[u64; N]) -> ([u64; N], [u64; N]) { let numerator_bits = bits(numerator); let divisor_bits = bits(divisor); assert_ne!(divisor_bits, 0, "division by zero"); @@ -61,10 +58,7 @@ fn bits(arr: &[u64]) -> usize { } /// Division of numerator by a u64 divisor -fn div_rem_small( - numerator: &[u64; N], - divisor: u64, -) -> ([u64; N], [u64; N]) { +fn div_rem_small(numerator: &[u64; N], divisor: u64) -> ([u64; N], [u64; N]) { let mut rem = 0u64; let mut numerator = *numerator; numerator.iter_mut().rev().for_each(|d| { @@ -227,11 +221,7 @@ fn sub_assign(a: &mut [u64], b: &[u64]) -> bool { } /// Converts an overflowing binary operation on scalars to one on slices -fn binop_slice( - a: &mut [u64], - b: &[u64], - binop: impl Fn(u64, u64) -> (u64, bool) + Copy, -) -> bool { +fn binop_slice(a: &mut [u64], b: &[u64], binop: impl Fn(u64, u64) -> (u64, bool) + Copy) -> bool { let mut c = false; a.iter_mut().zip(b.iter()).for_each(|(x, y)| { let (res1, overflow1) = y.overflowing_add(u64::from(c)); diff --git a/arrow-buffer/src/bigint/mod.rs b/arrow-buffer/src/bigint/mod.rs index d064663bf63a..afbb3a31df12 100644 --- a/arrow-buffer/src/bigint/mod.rs +++ b/arrow-buffer/src/bigint/mod.rs @@ -310,9 +310,7 @@ impl i256 { (Self::from_le_bytes(bytes), 
false) } Ordering::Equal => (Self::from_le_bytes(v_bytes.try_into().unwrap()), false), - Ordering::Greater => { - (Self::from_le_bytes(v_bytes[..32].try_into().unwrap()), true) - } + Ordering::Greater => (Self::from_le_bytes(v_bytes[..32].try_into().unwrap()), true), } } @@ -357,8 +355,7 @@ impl i256 { #[inline] pub fn checked_add(self, other: Self) -> Option { let r = self.wrapping_add(other); - ((other.is_negative() && r < self) || (!other.is_negative() && r >= self)) - .then_some(r) + ((other.is_negative() && r < self) || (!other.is_negative() && r >= self)).then_some(r) } /// Performs wrapping subtraction @@ -373,8 +370,7 @@ impl i256 { #[inline] pub fn checked_sub(self, other: Self) -> Option { let r = self.wrapping_sub(other); - ((other.is_negative() && r > self) || (!other.is_negative() && r <= self)) - .then_some(r) + ((other.is_negative() && r > self) || (!other.is_negative() && r <= self)).then_some(r) } /// Performs wrapping multiplication @@ -591,9 +587,7 @@ impl i256 { /// Temporary workaround due to lack of stable const array slicing /// See -const fn split_array( - vals: [u8; N], -) -> ([u8; M], [u8; M]) { +const fn split_array(vals: [u8; N]) -> ([u8; M], [u8; M]) { let mut a = [0; M]; let mut b = [0; M]; let mut i = 0; @@ -915,8 +909,7 @@ mod tests { // Addition let actual = il.wrapping_add(ir); - let (expected, overflow) = - i256::from_bigint_with_overflow(bl.clone() + br.clone()); + let (expected, overflow) = i256::from_bigint_with_overflow(bl.clone() + br.clone()); assert_eq!(actual, expected); let checked = il.checked_add(ir); @@ -927,8 +920,7 @@ mod tests { // Subtraction let actual = il.wrapping_sub(ir); - let (expected, overflow) = - i256::from_bigint_with_overflow(bl.clone() - br.clone()); + let (expected, overflow) = i256::from_bigint_with_overflow(bl.clone() - br.clone()); assert_eq!(actual.to_string(), expected.to_string()); let checked = il.checked_sub(ir); @@ -939,8 +931,7 @@ mod tests { // Multiplication let actual = il.wrapping_mul(ir); - let (expected, overflow) = - i256::from_bigint_with_overflow(bl.clone() * br.clone()); + let (expected, overflow) = i256::from_bigint_with_overflow(bl.clone() * br.clone()); assert_eq!(actual.to_string(), expected.to_string()); let checked = il.checked_mul(ir); @@ -996,8 +987,7 @@ mod tests { // Exponentiation for exp in vec![0, 1, 2, 3, 8, 100].into_iter() { let actual = il.wrapping_pow(exp); - let (expected, overflow) = - i256::from_bigint_with_overflow(bl.clone().pow(exp)); + let (expected, overflow) = i256::from_bigint_with_overflow(bl.clone().pow(exp)); assert_eq!(actual.to_string(), expected.to_string()); let checked = il.checked_pow(exp); @@ -1212,7 +1202,10 @@ mod tests { ("000000000000000000000000000000000000000", Some(i256::ZERO)), ("0000000000000000000000000000000000000000-11", None), ("11-1111111111111111111111111111111111111", None), - ("115792089237316195423570985008687907853269984665640564039457584007913129639936", None) + ( + "115792089237316195423570985008687907853269984665640564039457584007913129639936", + None, + ), ]; for (case, expected) in cases { assert_eq!(i256::from_string(case), expected) diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 577c716e4bea..c651edcad92e 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -223,13 +223,7 @@ impl BitAnd<&BooleanBuffer> for &BooleanBuffer { fn bitand(self, rhs: &BooleanBuffer) -> Self::Output { assert_eq!(self.len, rhs.len); BooleanBuffer { - buffer: buffer_bin_and( - 
&self.buffer, - self.offset, - &rhs.buffer, - rhs.offset, - self.len, - ), + buffer: buffer_bin_and(&self.buffer, self.offset, &rhs.buffer, rhs.offset, self.len), offset: 0, len: self.len, } @@ -242,13 +236,7 @@ impl BitOr<&BooleanBuffer> for &BooleanBuffer { fn bitor(self, rhs: &BooleanBuffer) -> Self::Output { assert_eq!(self.len, rhs.len); BooleanBuffer { - buffer: buffer_bin_or( - &self.buffer, - self.offset, - &rhs.buffer, - rhs.offset, - self.len, - ), + buffer: buffer_bin_or(&self.buffer, self.offset, &rhs.buffer, rhs.offset, self.len), offset: 0, len: self.len, } @@ -261,13 +249,7 @@ impl BitXor<&BooleanBuffer> for &BooleanBuffer { fn bitxor(self, rhs: &BooleanBuffer) -> Self::Output { assert_eq!(self.len, rhs.len); BooleanBuffer { - buffer: buffer_bin_xor( - &self.buffer, - self.offset, - &rhs.buffer, - rhs.offset, - self.len, - ), + buffer: buffer_bin_xor(&self.buffer, self.offset, &rhs.buffer, rhs.offset, self.len), offset: 0, len: self.len, } @@ -428,8 +410,7 @@ mod tests { let buf = Buffer::from(&[0, 1, 1, 0, 0]); let boolean_buf = &BooleanBuffer::new(buf, offset, len); - let expected = - BooleanBuffer::new(Buffer::from(&[255, 254, 254, 255, 255]), offset, len); + let expected = BooleanBuffer::new(Buffer::from(&[255, 254, 254, 255, 255]), offset, len); assert_eq!(!boolean_buf, expected); } } diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index bda6dfc5cdee..05530eed9b08 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -523,9 +523,7 @@ mod tests { } #[test] - #[should_panic( - expected = "the offset of the new Buffer cannot exceed the existing length" - )] + #[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")] fn test_slice_offset_out_of_bound() { let buf = Buffer::from(&[2, 4, 6, 8, 10]); buf.slice(6); @@ -688,9 +686,7 @@ mod tests { } #[test] - #[should_panic( - expected = "the offset of the new Buffer cannot exceed the existing length" - )] + #[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")] fn slice_overflow() { let buffer = Buffer::from(MutableBuffer::from_len_zeroed(12)); buffer.slice_with_length(2, usize::MAX); diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 2c56f9a5b270..69c986cc1056 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -334,9 +334,7 @@ impl MutableBuffer { #[inline] pub(super) fn into_buffer(self) -> Buffer { - let bytes = unsafe { - Bytes::new(self.data, self.len, Deallocation::Standard(self.layout)) - }; + let bytes = unsafe { Bytes::new(self.data, self.len, Deallocation::Standard(self.layout)) }; std::mem::forget(self); Buffer::from_bytes(bytes) } @@ -351,8 +349,7 @@ impl MutableBuffer { // SAFETY // ArrowNativeType is trivially transmutable, is sealed to prevent potentially incorrect // implementation outside this crate, and this method checks alignment - let (prefix, offsets, suffix) = - unsafe { self.as_slice_mut().align_to_mut::() }; + let (prefix, offsets, suffix) = unsafe { self.as_slice_mut().align_to_mut::() }; assert!(prefix.is_empty() && suffix.is_empty()); offsets } @@ -604,9 +601,7 @@ impl MutableBuffer { // we can't specialize `extend` for `TrustedLen` like `Vec` does. // 2. `from_trusted_len_iter_bool` is faster. 
#[inline] - pub unsafe fn from_trusted_len_iter_bool>( - mut iterator: I, - ) -> Self { + pub unsafe fn from_trusted_len_iter_bool>(mut iterator: I) -> Self { let (_, upper) = iterator.size_hint(); let len = upper.expect("from_trusted_len_iter requires an upper limit"); diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index e0c7d9ef8f49..c79aef398059 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -71,10 +71,7 @@ impl NullBuffer { /// This is commonly used by binary operations where the result is NULL if either /// of the input values is NULL. Handling the null mask separately in this way /// can yield significant performance improvements over an iterator approach - pub fn union( - lhs: Option<&NullBuffer>, - rhs: Option<&NullBuffer>, - ) -> Option { + pub fn union(lhs: Option<&NullBuffer>, rhs: Option<&NullBuffer>) -> Option { match (lhs, rhs) { (Some(lhs), Some(rhs)) => Some(Self::new(lhs.inner() & rhs.inner())), (Some(n), None) | (None, Some(n)) => Some(n.clone()), diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index a6f2f7f6cfae..652d30c3b0ab 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -219,8 +219,7 @@ mod tests { assert_eq!(buffer.as_ref(), &[0, 2, 8, 11, 18, 20]); let half_max = i32::MAX / 2; - let buffer = - OffsetBuffer::::from_lengths([half_max as usize, half_max as usize]); + let buffer = OffsetBuffer::::from_lengths([half_max as usize, half_max as usize]); assert_eq!(buffer.as_ref(), &[0, half_max, half_max * 2]); } diff --git a/arrow-buffer/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs index eccff6280dd8..ca00e41bea21 100644 --- a/arrow-buffer/src/buffer/ops.rs +++ b/arrow-buffer/src/buffer/ops.rs @@ -184,10 +184,6 @@ pub fn buffer_bin_xor( /// Apply a bitwise not to one input and return the result as a Buffer. /// The input is treated as a bitmap, meaning that offset and length are specified in number of bits. 
-pub fn buffer_unary_not( - left: &Buffer, - offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { +pub fn buffer_unary_not(left: &Buffer, offset_in_bits: usize, len_in_bits: usize) -> Buffer { bitwise_unary_op_helper(left, offset_in_bits, len_in_bits, |a| !a) } diff --git a/arrow-buffer/src/buffer/run.rs b/arrow-buffer/src/buffer/run.rs index 29c0f3dfd949..3dbbe344a025 100644 --- a/arrow-buffer/src/buffer/run.rs +++ b/arrow-buffer/src/buffer/run.rs @@ -110,11 +110,7 @@ where /// /// - `buffer` must contain strictly increasing values greater than zero /// - The last value of `buffer` must be greater than or equal to `offset + len` - pub unsafe fn new_unchecked( - run_ends: ScalarBuffer, - offset: usize, - len: usize, - ) -> Self { + pub unsafe fn new_unchecked(run_ends: ScalarBuffer, offset: usize, len: usize) -> Self { Self { run_ends, offset, diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 276e635e825c..3b75d5384046 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -221,9 +221,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Memory pointer is not aligned with the specified scalar type" - )] + #[should_panic(expected = "Memory pointer is not aligned with the specified scalar type")] fn test_unaligned() { let expected = [0_i32, 1, 2]; let buffer = Buffer::from_iter(expected.iter().cloned()); @@ -232,18 +230,14 @@ mod tests { } #[test] - #[should_panic( - expected = "the offset of the new Buffer cannot exceed the existing length" - )] + #[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")] fn test_length_out_of_bounds() { let buffer = Buffer::from_iter([0_i32, 1, 2]); ScalarBuffer::::new(buffer, 1, 3); } #[test] - #[should_panic( - expected = "the offset of the new Buffer cannot exceed the existing length" - )] + #[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")] fn test_offset_out_of_bounds() { let buffer = Buffer::from_iter([0_i32, 1, 2]); ScalarBuffer::::new(buffer, 4, 0); diff --git a/arrow-buffer/src/builder/boolean.rs b/arrow-buffer/src/builder/boolean.rs index f0e7f0f13670..ca178ae5ce4e 100644 --- a/arrow-buffer/src/builder/boolean.rs +++ b/arrow-buffer/src/builder/boolean.rs @@ -154,14 +154,12 @@ impl BooleanBufferBuilder { if cur_remainder != 0 { // Pad last byte with 1s - *self.buffer.as_slice_mut().last_mut().unwrap() |= - !((1 << cur_remainder) - 1) + *self.buffer.as_slice_mut().last_mut().unwrap() |= !((1 << cur_remainder) - 1) } self.buffer.resize(new_len_bytes, 0xFF); if new_remainder != 0 { // Clear remaining bits - *self.buffer.as_slice_mut().last_mut().unwrap() &= - (1 << new_remainder) - 1 + *self.buffer.as_slice_mut().last_mut().unwrap() &= (1 << new_remainder) - 1 } self.len = new_len; } diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs index 8f5019d5a4cc..81860b604868 100644 --- a/arrow-buffer/src/bytes.rs +++ b/arrow-buffer/src/bytes.rs @@ -60,11 +60,7 @@ impl Bytes { /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` /// bytes. If the `ptr` and `capacity` come from a `Buffer`, then this is guaranteed. 
#[inline] - pub(crate) unsafe fn new( - ptr: NonNull, - len: usize, - deallocation: Deallocation, - ) -> Bytes { + pub(crate) unsafe fn new(ptr: NonNull, len: usize, deallocation: Deallocation) -> Bytes { Bytes { ptr, len, diff --git a/arrow-buffer/src/util/bit_chunk_iterator.rs b/arrow-buffer/src/util/bit_chunk_iterator.rs index 6830acae94a1..9e4fb8268dff 100644 --- a/arrow-buffer/src/util/bit_chunk_iterator.rs +++ b/arrow-buffer/src/util/bit_chunk_iterator.rs @@ -60,8 +60,7 @@ impl<'a> UnalignedBitChunk<'a> { // If less than 8 bytes, read into prefix if buffer.len() <= 8 { - let (suffix_mask, trailing_padding) = - compute_suffix_mask(len, offset_padding); + let (suffix_mask, trailing_padding) = compute_suffix_mask(len, offset_padding); let prefix = read_u64(buffer) & suffix_mask & prefix_mask; return Self { @@ -75,8 +74,7 @@ impl<'a> UnalignedBitChunk<'a> { // If less than 16 bytes, read into prefix and suffix if buffer.len() <= 16 { - let (suffix_mask, trailing_padding) = - compute_suffix_mask(len, offset_padding); + let (suffix_mask, trailing_padding) = compute_suffix_mask(len, offset_padding); let prefix = read_u64(&buffer[..8]) & prefix_mask; let suffix = read_u64(&buffer[8..]) & suffix_mask; @@ -167,10 +165,7 @@ impl<'a> UnalignedBitChunk<'a> { } pub type UnalignedBitChunkIterator<'a> = std::iter::Chain< - std::iter::Chain< - std::option::IntoIter, - std::iter::Cloned>, - >, + std::iter::Chain, std::iter::Cloned>>, std::option::IntoIter, >; @@ -338,9 +333,8 @@ impl Iterator for BitChunkIterator<'_> { } else { // the constructor ensures that bit_offset is in 0..8 // that means we need to read at most one additional byte to fill in the high bits - let next = unsafe { - std::ptr::read_unaligned(raw_data.add(index + 1) as *const u8) as u64 - }; + let next = + unsafe { std::ptr::read_unaligned(raw_data.add(index + 1) as *const u8) as u64 }; (current >> bit_offset) | (next << (64 - bit_offset)) }; @@ -387,8 +381,8 @@ mod tests { #[test] fn test_iter_unaligned() { let input: &[u8] = &[ - 0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, - 0b00100000, 0b01000000, 0b11111111, + 0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, 0b00100000, + 0b01000000, 0b11111111, ]; let buffer: Buffer = Buffer::from(input); @@ -408,8 +402,8 @@ mod tests { #[test] fn test_iter_unaligned_remainder_1_byte() { let input: &[u8] = &[ - 0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, - 0b00100000, 0b01000000, 0b11111111, + 0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, 0b00100000, + 0b01000000, 0b11111111, ]; let buffer: Buffer = Buffer::from(input); @@ -442,8 +436,8 @@ mod tests { #[test] fn test_iter_unaligned_remainder_bits_large() { let input: &[u8] = &[ - 0b11111111, 0b00000000, 0b11111111, 0b00000000, 0b11111111, 0b00000000, - 0b11111111, 0b00000000, 0b11111111, + 0b11111111, 0b00000000, 0b11111111, 0b00000000, 0b11111111, 0b00000000, 0b11111111, + 0b00000000, 0b11111111, ]; let buffer: Buffer = Buffer::from(input); @@ -637,11 +631,8 @@ mod tests { let max_truncate = 128.min(mask_len - offset); let truncate = rng.gen::().checked_rem(max_truncate).unwrap_or(0); - let unaligned = UnalignedBitChunk::new( - buffer.as_slice(), - offset, - mask_len - offset - truncate, - ); + let unaligned = + UnalignedBitChunk::new(buffer.as_slice(), offset, mask_len - offset - truncate); let bool_slice = &bools[offset..mask_len - truncate]; diff --git a/arrow-buffer/src/util/bit_iterator.rs b/arrow-buffer/src/util/bit_iterator.rs index 
4e24ccdabec0..df40a8fbaccb 100644 --- a/arrow-buffer/src/util/bit_iterator.rs +++ b/arrow-buffer/src/util/bit_iterator.rs @@ -276,8 +276,8 @@ mod tests { assert_eq!( actual, &[ - false, true, false, false, true, false, true, false, false, false, false, - false, true, false + false, true, false, false, true, false, true, false, false, false, false, false, + true, false ] ); diff --git a/arrow-buffer/src/util/bit_mask.rs b/arrow-buffer/src/util/bit_mask.rs index 2af24b782632..8f81cb7d0469 100644 --- a/arrow-buffer/src/util/bit_mask.rs +++ b/arrow-buffer/src/util/bit_mask.rs @@ -42,8 +42,7 @@ pub fn set_bits( let chunks = BitChunks::new(data, offset_read + bits_to_align, len - bits_to_align); chunks.iter().for_each(|chunk| { null_count += chunk.count_zeros(); - write_data[write_byte_index..write_byte_index + 8] - .copy_from_slice(&chunk.to_le_bytes()); + write_data[write_byte_index..write_byte_index + 8].copy_from_slice(&chunk.to_le_bytes()); write_byte_index += 8; }); @@ -70,8 +69,8 @@ mod tests { fn test_set_bits_aligned() { let mut destination: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; let source: &[u8] = &[ - 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b10100101, + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, + 0b10100101, ]; let destination_offset = 8; @@ -80,8 +79,8 @@ mod tests { let len = 64; let expected_data: &[u8] = &[ - 0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b10100101, 0, + 0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, + 0b10100101, 0, ]; let expected_null_count = 24; let result = set_bits( @@ -100,8 +99,8 @@ mod tests { fn test_set_bits_unaligned_destination_start() { let mut destination: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; let source: &[u8] = &[ - 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b10100101, + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, + 0b10100101, ]; let destination_offset = 3; @@ -110,8 +109,8 @@ mod tests { let len = 64; let expected_data: &[u8] = &[ - 0b00111000, 0b00101111, 0b11001101, 0b11011100, 0b01011110, 0b00011111, - 0b00111110, 0b00101111, 0b00000101, 0b00000000, + 0b00111000, 0b00101111, 0b11001101, 0b11011100, 0b01011110, 0b00011111, 0b00111110, + 0b00101111, 0b00000101, 0b00000000, ]; let expected_null_count = 24; let result = set_bits( @@ -130,8 +129,8 @@ mod tests { fn test_set_bits_unaligned_destination_end() { let mut destination: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; let source: &[u8] = &[ - 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b10100101, + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, + 0b10100101, ]; let destination_offset = 8; @@ -140,8 +139,8 @@ mod tests { let len = 62; let expected_data: &[u8] = &[ - 0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b00100101, 0, + 0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, + 0b00100101, 0, ]; let expected_null_count = 23; let result = set_bits( @@ -160,9 +159,9 @@ mod tests { fn test_set_bits_unaligned() { let mut destination: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; let source: &[u8] = &[ - 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b10100101, 
0b10011001, 0b11011011, 0b11101011, 0b11000011, + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, + 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, 0b10100101, + 0b10011001, 0b11011011, 0b11101011, 0b11000011, ]; let destination_offset = 3; @@ -171,9 +170,8 @@ mod tests { let len = 95; let expected_data: &[u8] = &[ - 0b01111000, 0b01101001, 0b11100110, 0b11110110, 0b11111010, 0b11110000, - 0b01111001, 0b01101001, 0b11100110, 0b11110110, 0b11111010, 0b11110000, - 0b00000001, + 0b01111000, 0b01101001, 0b11100110, 0b11110110, 0b11111010, 0b11110000, 0b01111001, + 0b01101001, 0b11100110, 0b11110110, 0b11111010, 0b11110000, 0b00000001, ]; let expected_null_count = 35; let result = set_bits( diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 54c500f1ac41..97307f076f34 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -46,9 +46,7 @@ use crate::parse::{ parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month, string_to_datetime, Parser, }; -use arrow_array::{ - builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *, -}; +use arrow_array::{builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *}; use arrow_buffer::{i256, ArrowNativeType, Buffer, OffsetBuffer}; use arrow_data::ArrayData; use arrow_schema::*; @@ -365,9 +363,10 @@ where if cast_options.safe { array .unary_opt::<_, Decimal128Type>(|v| { - (mul * v.as_()).round().to_i128().filter(|v| { - Decimal128Type::validate_decimal_precision(*v, precision).is_ok() - }) + (mul * v.as_()) + .round() + .to_i128() + .filter(|v| Decimal128Type::validate_decimal_precision(*v, precision).is_ok()) }) .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) @@ -387,8 +386,7 @@ where )) }) .and_then(|v| { - Decimal128Type::validate_decimal_precision(v, precision) - .map(|_| v) + Decimal128Type::validate_decimal_precision(v, precision).map(|_| v) }) })? .with_precision_and_scale(precision, scale) @@ -410,9 +408,8 @@ where if cast_options.safe { array .unary_opt::<_, Decimal256Type>(|v| { - i256::from_f64((v.as_() * mul).round()).filter(|v| { - Decimal256Type::validate_decimal_precision(*v, precision).is_ok() - }) + i256::from_f64((v.as_() * mul).round()) + .filter(|v| Decimal256Type::validate_decimal_precision(*v, precision).is_ok()) }) .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) @@ -430,8 +427,7 @@ where )) }) .and_then(|v| { - Decimal256Type::validate_decimal_precision(v, precision) - .map(|_| v) + Decimal256Type::validate_decimal_precision(v, precision).map(|_| v) }) })? 
.with_precision_and_scale(precision, scale) @@ -493,7 +489,10 @@ fn cast_month_day_nano_to_duration>( .map(|v| { v.map(|v| match v >> 64 { 0 => Ok((v as i64) / scale), - _ => Err(ArrowError::ComputeError("Cannot convert interval containing non-zero months or days to duration".to_string())) + _ => Err(ArrowError::ComputeError( + "Cannot convert interval containing non-zero months or days to duration" + .to_string(), + )), }) .transpose() }) @@ -559,10 +558,7 @@ fn cast_duration_to_interval>( } /// Cast the primitive array using [`PrimitiveArray::reinterpret_cast`] -fn cast_reinterpret_arrays< - I: ArrowPrimitiveType, - O: ArrowPrimitiveType, ->( +fn cast_reinterpret_arrays>( array: &dyn Array, ) -> Result { Ok(Arc::new(array.as_primitive::().reinterpret_cast::())) @@ -613,14 +609,13 @@ where } else { let v = array.value(i).div_checked(div)?; - let value = - ::from::(v).ok_or_else(|| { - ArrowError::CastError(format!( - "value of {:?} is out of range {}", - v, - T::DATA_TYPE - )) - })?; + let value = ::from::(v).ok_or_else(|| { + ArrowError::CastError(format!( + "value of {:?} is out of range {}", + v, + T::DATA_TYPE + )) + })?; value_builder.append_value(value); } @@ -780,9 +775,7 @@ pub fn cast_with_options( "Casting from type {from_type:?} to dictionary type {to_type:?} not supported", ))), }, - (List(_), List(ref to)) => { - cast_list_inner::(array, to, to_type, cast_options) - } + (List(_), List(ref to)) => cast_list_inner::(array, to, to_type, cast_options), (LargeList(_), LargeList(ref to)) => { cast_list_inner::(array, to, to_type, cast_options) } @@ -919,16 +912,12 @@ pub fn cast_with_options( *scale, cast_options, ), - Float32 => { - cast_decimal_to_float::(array, |x| { - (x as f64 / 10_f64.powi(*scale as i32)) as f32 - }) - } - Float64 => { - cast_decimal_to_float::(array, |x| { - x as f64 / 10_f64.powi(*scale as i32) - }) - } + Float32 => cast_decimal_to_float::(array, |x| { + (x as f64 / 10_f64.powi(*scale as i32)) as f32 + }), + Float64 => cast_decimal_to_float::(array, |x| { + x as f64 / 10_f64.powi(*scale as i32) + }), Utf8 => value_to_string::(array, cast_options), LargeUtf8 => value_to_string::(array, cast_options), Null => Ok(new_null_array(to_type, array.len())), @@ -988,16 +977,12 @@ pub fn cast_with_options( *scale, cast_options, ), - Float32 => { - cast_decimal_to_float::(array, |x| { - (x.to_f64().unwrap() / 10_f64.powi(*scale as i32)) as f32 - }) - } - Float64 => { - cast_decimal_to_float::(array, |x| { - x.to_f64().unwrap() / 10_f64.powi(*scale as i32) - }) - } + Float32 => cast_decimal_to_float::(array, |x| { + (x.to_f64().unwrap() / 10_f64.powi(*scale as i32)) as f32 + }), + Float64 => cast_decimal_to_float::(array, |x| { + x.to_f64().unwrap() / 10_f64.powi(*scale as i32) + }), Utf8 => value_to_string::(array, cast_options), LargeUtf8 => value_to_string::(array, cast_options), Null => Ok(new_null_array(to_type, array.len())), @@ -1239,25 +1224,35 @@ pub fn cast_with_options( Float64 => parse_string::(array, cast_options), Date32 => parse_string::(array, cast_options), Date64 => parse_string::(array, cast_options), - Binary => Ok(Arc::new(BinaryArray::from(array.as_string::().clone()))), + Binary => Ok(Arc::new(BinaryArray::from( + array.as_string::().clone(), + ))), LargeBinary => { let binary = BinaryArray::from(array.as_string::().clone()); cast_byte_container::(&binary) } LargeUtf8 => cast_byte_container::(array), Time32(TimeUnit::Second) => parse_string::(array, cast_options), - Time32(TimeUnit::Millisecond) => parse_string::(array, cast_options), - 
Time64(TimeUnit::Microsecond) => parse_string::(array, cast_options), - Time64(TimeUnit::Nanosecond) => parse_string::(array, cast_options), - Timestamp(TimeUnit::Second, to_tz) => { - cast_string_to_timestamp::(array, to_tz, cast_options) + Time32(TimeUnit::Millisecond) => { + parse_string::(array, cast_options) } - Timestamp(TimeUnit::Millisecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz, cast_options) + Time64(TimeUnit::Microsecond) => { + parse_string::(array, cast_options) } - Timestamp(TimeUnit::Microsecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz, cast_options) + Time64(TimeUnit::Nanosecond) => { + parse_string::(array, cast_options) } + Timestamp(TimeUnit::Second, to_tz) => { + cast_string_to_timestamp::(array, to_tz, cast_options) + } + Timestamp(TimeUnit::Millisecond, to_tz) => cast_string_to_timestamp::< + i32, + TimestampMillisecondType, + >(array, to_tz, cast_options), + Timestamp(TimeUnit::Microsecond, to_tz) => cast_string_to_timestamp::< + i32, + TimestampMicrosecondType, + >(array, to_tz, cast_options), Timestamp(TimeUnit::Nanosecond, to_tz) => { cast_string_to_timestamp::(array, to_tz, cast_options) } @@ -1289,26 +1284,33 @@ pub fn cast_with_options( Date64 => parse_string::(array, cast_options), Utf8 => cast_byte_container::(array), Binary => { - let large_binary = - LargeBinaryArray::from(array.as_string::().clone()); + let large_binary = LargeBinaryArray::from(array.as_string::().clone()); cast_byte_container::(&large_binary) } LargeBinary => Ok(Arc::new(LargeBinaryArray::from( array.as_string::().clone(), ))), Time32(TimeUnit::Second) => parse_string::(array, cast_options), - Time32(TimeUnit::Millisecond) => parse_string::(array, cast_options), - Time64(TimeUnit::Microsecond) => parse_string::(array, cast_options), - Time64(TimeUnit::Nanosecond) => parse_string::(array, cast_options), - Timestamp(TimeUnit::Second, to_tz) => { - cast_string_to_timestamp::(array, to_tz, cast_options) + Time32(TimeUnit::Millisecond) => { + parse_string::(array, cast_options) } - Timestamp(TimeUnit::Millisecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz, cast_options) + Time64(TimeUnit::Microsecond) => { + parse_string::(array, cast_options) } - Timestamp(TimeUnit::Microsecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz, cast_options) + Time64(TimeUnit::Nanosecond) => { + parse_string::(array, cast_options) } + Timestamp(TimeUnit::Second, to_tz) => { + cast_string_to_timestamp::(array, to_tz, cast_options) + } + Timestamp(TimeUnit::Millisecond, to_tz) => cast_string_to_timestamp::< + i64, + TimestampMillisecondType, + >(array, to_tz, cast_options), + Timestamp(TimeUnit::Microsecond, to_tz) => cast_string_to_timestamp::< + i64, + TimestampMicrosecondType, + >(array, to_tz, cast_options), Timestamp(TimeUnit::Nanosecond, to_tz) => { cast_string_to_timestamp::(array, to_tz, cast_options) } @@ -1331,9 +1333,7 @@ pub fn cast_with_options( let array = cast_binary_to_string::(array, cast_options)?; cast_byte_container::(array.as_ref()) } - LargeBinary => { - cast_byte_container::(array) - } + LargeBinary => cast_byte_container::(array), FixedSizeBinary(size) => { cast_binary_to_fixed_size_binary::(array, *size, cast_options) } @@ -1357,278 +1357,117 @@ pub fn cast_with_options( }, (FixedSizeBinary(size), _) => match to_type { Binary => cast_fixed_size_binary_to_binary::(array, *size), - LargeBinary => - cast_fixed_size_binary_to_binary::(array, *size), + LargeBinary => cast_fixed_size_binary_to_binary::(array, *size), _ => 
Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, - (from_type, LargeUtf8) if from_type.is_primitive() => value_to_string::(array, cast_options), - (from_type, Utf8) if from_type.is_primitive() => value_to_string::(array, cast_options), - // start numeric casts - (UInt8, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt8, UInt32) => { - cast_numeric_arrays::(array, cast_options) + (from_type, LargeUtf8) if from_type.is_primitive() => { + value_to_string::(array, cast_options) } - (UInt8, UInt64) => { - cast_numeric_arrays::(array, cast_options) + (from_type, Utf8) if from_type.is_primitive() => { + value_to_string::(array, cast_options) } + // start numeric casts + (UInt8, UInt16) => cast_numeric_arrays::(array, cast_options), + (UInt8, UInt32) => cast_numeric_arrays::(array, cast_options), + (UInt8, UInt64) => cast_numeric_arrays::(array, cast_options), (UInt8, Int8) => cast_numeric_arrays::(array, cast_options), - (UInt8, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt8, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt8, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt8, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt8, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (UInt16, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, Int8) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (UInt32, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, Int8) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (UInt64, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, Int8) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, Float64) => { - cast_numeric_arrays::(array, cast_options) - } + (UInt8, Int16) => cast_numeric_arrays::(array, cast_options), + (UInt8, Int32) => cast_numeric_arrays::(array, cast_options), + (UInt8, Int64) => cast_numeric_arrays::(array, cast_options), + (UInt8, Float32) => 
cast_numeric_arrays::(array, cast_options), + (UInt8, Float64) => cast_numeric_arrays::(array, cast_options), + + (UInt16, UInt8) => cast_numeric_arrays::(array, cast_options), + (UInt16, UInt32) => cast_numeric_arrays::(array, cast_options), + (UInt16, UInt64) => cast_numeric_arrays::(array, cast_options), + (UInt16, Int8) => cast_numeric_arrays::(array, cast_options), + (UInt16, Int16) => cast_numeric_arrays::(array, cast_options), + (UInt16, Int32) => cast_numeric_arrays::(array, cast_options), + (UInt16, Int64) => cast_numeric_arrays::(array, cast_options), + (UInt16, Float32) => cast_numeric_arrays::(array, cast_options), + (UInt16, Float64) => cast_numeric_arrays::(array, cast_options), + + (UInt32, UInt8) => cast_numeric_arrays::(array, cast_options), + (UInt32, UInt16) => cast_numeric_arrays::(array, cast_options), + (UInt32, UInt64) => cast_numeric_arrays::(array, cast_options), + (UInt32, Int8) => cast_numeric_arrays::(array, cast_options), + (UInt32, Int16) => cast_numeric_arrays::(array, cast_options), + (UInt32, Int32) => cast_numeric_arrays::(array, cast_options), + (UInt32, Int64) => cast_numeric_arrays::(array, cast_options), + (UInt32, Float32) => cast_numeric_arrays::(array, cast_options), + (UInt32, Float64) => cast_numeric_arrays::(array, cast_options), + + (UInt64, UInt8) => cast_numeric_arrays::(array, cast_options), + (UInt64, UInt16) => cast_numeric_arrays::(array, cast_options), + (UInt64, UInt32) => cast_numeric_arrays::(array, cast_options), + (UInt64, Int8) => cast_numeric_arrays::(array, cast_options), + (UInt64, Int16) => cast_numeric_arrays::(array, cast_options), + (UInt64, Int32) => cast_numeric_arrays::(array, cast_options), + (UInt64, Int64) => cast_numeric_arrays::(array, cast_options), + (UInt64, Float32) => cast_numeric_arrays::(array, cast_options), + (UInt64, Float64) => cast_numeric_arrays::(array, cast_options), (Int8, UInt8) => cast_numeric_arrays::(array, cast_options), - (Int8, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (Int8, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int8, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } + (Int8, UInt16) => cast_numeric_arrays::(array, cast_options), + (Int8, UInt32) => cast_numeric_arrays::(array, cast_options), + (Int8, UInt64) => cast_numeric_arrays::(array, cast_options), (Int8, Int16) => cast_numeric_arrays::(array, cast_options), (Int8, Int32) => cast_numeric_arrays::(array, cast_options), (Int8, Int64) => cast_numeric_arrays::(array, cast_options), - (Int8, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int8, Float64) => { - cast_numeric_arrays::(array, cast_options) - } + (Int8, Float32) => cast_numeric_arrays::(array, cast_options), + (Int8, Float64) => cast_numeric_arrays::(array, cast_options), - (Int16, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (Int16, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (Int16, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int16, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } + (Int16, UInt8) => cast_numeric_arrays::(array, cast_options), + (Int16, UInt16) => cast_numeric_arrays::(array, cast_options), + (Int16, UInt32) => cast_numeric_arrays::(array, cast_options), + (Int16, UInt64) => cast_numeric_arrays::(array, cast_options), (Int16, Int8) => cast_numeric_arrays::(array, cast_options), - (Int16, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int16, Int64) => { - cast_numeric_arrays::(array, cast_options) 
- } - (Int16, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int16, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (Int32, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (Int32, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (Int32, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int32, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } + (Int16, Int32) => cast_numeric_arrays::(array, cast_options), + (Int16, Int64) => cast_numeric_arrays::(array, cast_options), + (Int16, Float32) => cast_numeric_arrays::(array, cast_options), + (Int16, Float64) => cast_numeric_arrays::(array, cast_options), + + (Int32, UInt8) => cast_numeric_arrays::(array, cast_options), + (Int32, UInt16) => cast_numeric_arrays::(array, cast_options), + (Int32, UInt32) => cast_numeric_arrays::(array, cast_options), + (Int32, UInt64) => cast_numeric_arrays::(array, cast_options), (Int32, Int8) => cast_numeric_arrays::(array, cast_options), - (Int32, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (Int32, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (Int32, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int32, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (Int64, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (Int64, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (Int64, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int64, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } + (Int32, Int16) => cast_numeric_arrays::(array, cast_options), + (Int32, Int64) => cast_numeric_arrays::(array, cast_options), + (Int32, Float32) => cast_numeric_arrays::(array, cast_options), + (Int32, Float64) => cast_numeric_arrays::(array, cast_options), + + (Int64, UInt8) => cast_numeric_arrays::(array, cast_options), + (Int64, UInt16) => cast_numeric_arrays::(array, cast_options), + (Int64, UInt32) => cast_numeric_arrays::(array, cast_options), + (Int64, UInt64) => cast_numeric_arrays::(array, cast_options), (Int64, Int8) => cast_numeric_arrays::(array, cast_options), - (Int64, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (Int64, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int64, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int64, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (Float32, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, Int8) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (Float64, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, Int8) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, Int16) => { - cast_numeric_arrays::(array, 
cast_options) - } - (Float64, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, Float32) => { - cast_numeric_arrays::(array, cast_options) - } + (Int64, Int16) => cast_numeric_arrays::(array, cast_options), + (Int64, Int32) => cast_numeric_arrays::(array, cast_options), + (Int64, Float32) => cast_numeric_arrays::(array, cast_options), + (Int64, Float64) => cast_numeric_arrays::(array, cast_options), + + (Float32, UInt8) => cast_numeric_arrays::(array, cast_options), + (Float32, UInt16) => cast_numeric_arrays::(array, cast_options), + (Float32, UInt32) => cast_numeric_arrays::(array, cast_options), + (Float32, UInt64) => cast_numeric_arrays::(array, cast_options), + (Float32, Int8) => cast_numeric_arrays::(array, cast_options), + (Float32, Int16) => cast_numeric_arrays::(array, cast_options), + (Float32, Int32) => cast_numeric_arrays::(array, cast_options), + (Float32, Int64) => cast_numeric_arrays::(array, cast_options), + (Float32, Float64) => cast_numeric_arrays::(array, cast_options), + + (Float64, UInt8) => cast_numeric_arrays::(array, cast_options), + (Float64, UInt16) => cast_numeric_arrays::(array, cast_options), + (Float64, UInt32) => cast_numeric_arrays::(array, cast_options), + (Float64, UInt64) => cast_numeric_arrays::(array, cast_options), + (Float64, Int8) => cast_numeric_arrays::(array, cast_options), + (Float64, Int16) => cast_numeric_arrays::(array, cast_options), + (Float64, Int32) => cast_numeric_arrays::(array, cast_options), + (Float64, Int64) => cast_numeric_arrays::(array, cast_options), + (Float64, Float32) => cast_numeric_arrays::(array, cast_options), // end numeric casts // temporal casts @@ -1684,71 +1523,77 @@ pub fn cast_with_options( cast_reinterpret_arrays::(array) } (Date32, Date64) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Date64Type>(|x| x as i64 * MILLISECONDS_IN_DAY), )), (Date64, Date32) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Date32Type>(|x| (x / MILLISECONDS_IN_DAY) as i32), )), (Time32(TimeUnit::Second), Time32(TimeUnit::Millisecond)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time32MillisecondType>(|x| x * MILLISECONDS as i32), )), (Time32(TimeUnit::Second), Time64(TimeUnit::Microsecond)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time64MicrosecondType>(|x| x as i64 * MICROSECONDS), )), (Time32(TimeUnit::Second), Time64(TimeUnit::Nanosecond)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time64NanosecondType>(|x| x as i64 * NANOSECONDS), )), (Time32(TimeUnit::Millisecond), Time32(TimeUnit::Second)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time32SecondType>(|x| x / MILLISECONDS as i32), )), (Time32(TimeUnit::Millisecond), Time64(TimeUnit::Microsecond)) => Ok(Arc::new( - array.as_primitive::() - .unary::<_, Time64MicrosecondType>(|x| { - x as i64 * (MICROSECONDS / MILLISECONDS) - }), + array + .as_primitive::() + .unary::<_, Time64MicrosecondType>(|x| x as i64 * (MICROSECONDS / MILLISECONDS)), )), (Time32(TimeUnit::Millisecond), Time64(TimeUnit::Nanosecond)) => Ok(Arc::new( - array.as_primitive::() - .unary::<_, Time64NanosecondType>(|x| { - x as i64 * (MICROSECONDS / NANOSECONDS) - }), + array + .as_primitive::() + .unary::<_, Time64NanosecondType>(|x| x as i64 * (MICROSECONDS / NANOSECONDS)), )), (Time64(TimeUnit::Microsecond), 
Time32(TimeUnit::Second)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time32SecondType>(|x| (x / MICROSECONDS) as i32), )), (Time64(TimeUnit::Microsecond), Time32(TimeUnit::Millisecond)) => Ok(Arc::new( - array.as_primitive::() - .unary::<_, Time32MillisecondType>(|x| { - (x / (MICROSECONDS / MILLISECONDS)) as i32 - }), + array + .as_primitive::() + .unary::<_, Time32MillisecondType>(|x| (x / (MICROSECONDS / MILLISECONDS)) as i32), )), (Time64(TimeUnit::Microsecond), Time64(TimeUnit::Nanosecond)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time64NanosecondType>(|x| x * (NANOSECONDS / MICROSECONDS)), )), (Time64(TimeUnit::Nanosecond), Time32(TimeUnit::Second)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time32SecondType>(|x| (x / NANOSECONDS) as i32), )), (Time64(TimeUnit::Nanosecond), Time32(TimeUnit::Millisecond)) => Ok(Arc::new( - array.as_primitive::() - .unary::<_, Time32MillisecondType>(|x| { - (x / (NANOSECONDS / MILLISECONDS)) as i32 - }), + array + .as_primitive::() + .unary::<_, Time32MillisecondType>(|x| (x / (NANOSECONDS / MILLISECONDS)) as i32), )), (Time64(TimeUnit::Nanosecond), Time64(TimeUnit::Microsecond)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time64MicrosecondType>(|x| x / (NANOSECONDS / MICROSECONDS)), )), @@ -1803,39 +1648,29 @@ pub fn cast_with_options( (None, Some(to_tz)) => { let to_tz: Tz = to_tz.parse()?; match to_unit { - TimeUnit::Second => { - adjust_timestamp_to_timezone::( - converted, - &to_tz, - cast_options, - )? - } - TimeUnit::Millisecond => { - adjust_timestamp_to_timezone::( - converted, - &to_tz, - cast_options, - )? - } - TimeUnit::Microsecond => { - adjust_timestamp_to_timezone::( - converted, - &to_tz, - cast_options, - )? - } - TimeUnit::Nanosecond => { - adjust_timestamp_to_timezone::( - converted, - &to_tz, - cast_options, - )? - } + TimeUnit::Second => adjust_timestamp_to_timezone::( + converted, + &to_tz, + cast_options, + )?, + TimeUnit::Millisecond => adjust_timestamp_to_timezone::< + TimestampMillisecondType, + >( + converted, &to_tz, cast_options + )?, + TimeUnit::Microsecond => adjust_timestamp_to_timezone::< + TimestampMicrosecondType, + >( + converted, &to_tz, cast_options + )?, + TimeUnit::Nanosecond => adjust_timestamp_to_timezone::< + TimestampNanosecondType, + >( + converted, &to_tz, cast_options + )?, } } - _ => { - converted - } + _ => converted, }; Ok(make_timestamp_array( &adjusted, @@ -1854,45 +1689,43 @@ pub fn cast_with_options( if time_array.is_null(i) { b.append_null(); } else { - b.append_value(num::integer::div_floor::(time_array.value(i), from_size) as i32); + b.append_value( + num::integer::div_floor::(time_array.value(i), from_size) as i32, + ); } } Ok(Arc::new(b.finish()) as ArrayRef) } - (Timestamp(TimeUnit::Second, _), Date64) => Ok(Arc::new( - match cast_options.safe { - true => { - // change error to None - array.as_primitive::() - .unary_opt::<_, Date64Type>(|x| { - x.checked_mul(MILLISECONDS) - }) - } - false => { - array.as_primitive::().try_unary::<_, Date64Type, _>( - |x| { - x.mul_checked(MILLISECONDS) - }, - )? 
- } - }, - )), + (Timestamp(TimeUnit::Second, _), Date64) => Ok(Arc::new(match cast_options.safe { + true => { + // change error to None + array + .as_primitive::() + .unary_opt::<_, Date64Type>(|x| x.checked_mul(MILLISECONDS)) + } + false => array + .as_primitive::() + .try_unary::<_, Date64Type, _>(|x| x.mul_checked(MILLISECONDS))?, + })), (Timestamp(TimeUnit::Millisecond, _), Date64) => { cast_reinterpret_arrays::(array) } (Timestamp(TimeUnit::Microsecond, _), Date64) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Date64Type>(|x| x / (MICROSECONDS / MILLISECONDS)), )), (Timestamp(TimeUnit::Nanosecond, _), Date64) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Date64Type>(|x| x / (NANOSECONDS / MILLISECONDS)), )), (Timestamp(TimeUnit::Second, tz), Time64(TimeUnit::Microsecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { Ok(time_to_time64us(as_time_res_with_timezone::< TimestampSecondType, @@ -1903,7 +1736,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Second, tz), Time64(TimeUnit::Nanosecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { Ok(time_to_time64ns(as_time_res_with_timezone::< TimestampSecondType, @@ -1914,7 +1748,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Millisecond, tz), Time64(TimeUnit::Microsecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { Ok(time_to_time64us(as_time_res_with_timezone::< TimestampMillisecondType, @@ -1925,7 +1760,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Millisecond, tz), Time64(TimeUnit::Nanosecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { Ok(time_to_time64ns(as_time_res_with_timezone::< TimestampMillisecondType, @@ -1936,7 +1772,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Microsecond, tz), Time64(TimeUnit::Microsecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { Ok(time_to_time64us(as_time_res_with_timezone::< TimestampMicrosecondType, @@ -1947,7 +1784,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Microsecond, tz), Time64(TimeUnit::Nanosecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { Ok(time_to_time64ns(as_time_res_with_timezone::< TimestampMicrosecondType, @@ -1958,7 +1796,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Nanosecond, tz), Time64(TimeUnit::Microsecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { Ok(time_to_time64us(as_time_res_with_timezone::< TimestampNanosecondType, @@ -1969,7 +1808,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Nanosecond, tz), Time64(TimeUnit::Nanosecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - 
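// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the Timestamp -> Time arms above
// extract the time of day in the array's timezone. Values and the helper name
// are hypothetical.
use arrow_array::{Array, TimestampMicrosecondArray};
use arrow_cast::cast;
use arrow_schema::{ArrowError, DataType, TimeUnit};

fn timestamp_to_time_sketch() -> Result<(), ArrowError> {
    // 1970-01-02T00:00:05 UTC in microseconds, tagged with the +01:00 offset
    let ts = TimestampMicrosecondArray::from(vec![Some(86_405_000_000), None])
        .with_timezone("+01:00");
    // The resulting Time64 holds the wall-clock time in +01:00, i.e. 01:00:05.
    let time = cast(&ts, &DataType::Time64(TimeUnit::Microsecond))?;
    assert!(time.is_null(1));
    Ok(())
}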
array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { Ok(time_to_time64ns(as_time_res_with_timezone::< TimestampNanosecondType, @@ -1980,7 +1820,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Second, tz), Time32(TimeUnit::Second)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32SecondType, ArrowError>(|x| { Ok(time_to_time32s(as_time_res_with_timezone::< TimestampSecondType, @@ -1991,7 +1832,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Second, tz), Time32(TimeUnit::Millisecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { Ok(time_to_time32ms(as_time_res_with_timezone::< TimestampSecondType, @@ -2002,7 +1844,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Millisecond, tz), Time32(TimeUnit::Second)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32SecondType, ArrowError>(|x| { Ok(time_to_time32s(as_time_res_with_timezone::< TimestampMillisecondType, @@ -2013,7 +1856,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Millisecond, tz), Time32(TimeUnit::Millisecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { Ok(time_to_time32ms(as_time_res_with_timezone::< TimestampMillisecondType, @@ -2024,7 +1868,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Microsecond, tz), Time32(TimeUnit::Second)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32SecondType, ArrowError>(|x| { Ok(time_to_time32s(as_time_res_with_timezone::< TimestampMicrosecondType, @@ -2035,7 +1880,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Microsecond, tz), Time32(TimeUnit::Millisecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { Ok(time_to_time32ms(as_time_res_with_timezone::< TimestampMicrosecondType, @@ -2046,7 +1892,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Nanosecond, tz), Time32(TimeUnit::Second)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32SecondType, ArrowError>(|x| { Ok(time_to_time32s(as_time_res_with_timezone::< TimestampNanosecondType, @@ -2057,7 +1904,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Nanosecond, tz), Time32(TimeUnit::Millisecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { Ok(time_to_time32ms(as_time_res_with_timezone::< TimestampNanosecondType, @@ -2067,38 +1915,41 @@ pub fn cast_with_options( } (Date64, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, TimestampSecondType>(|x| x / MILLISECONDS), )), (Date64, Timestamp(TimeUnit::Millisecond, None)) => { cast_reinterpret_arrays::(array) } (Date64, Timestamp(TimeUnit::Microsecond, None)) => Ok(Arc::new( - array.as_primitive::().unary::<_, 
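// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: per the arm above, casting
// Date32 to Timestamp(Second) multiplies the day count by the seconds in a
// day. Values and the function name are hypothetical.
use arrow_array::cast::AsArray;
use arrow_array::types::TimestampSecondType;
use arrow_array::{Array, Date32Array};
use arrow_cast::cast;
use arrow_schema::{ArrowError, DataType, TimeUnit};

fn date32_to_timestamp_sketch() -> Result<(), ArrowError> {
    // Day 1 after the UNIX epoch, i.e. 1970-01-02
    let days = Date32Array::from(vec![Some(1), None]);
    let ts = cast(&days, &DataType::Timestamp(TimeUnit::Second, None))?;
    let ts = ts.as_primitive::<TimestampSecondType>();
    assert_eq!(ts.value(0), 86_400);
    assert!(ts.is_null(1));
    Ok(())
}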
TimestampMicrosecondType>( - |x| x * (MICROSECONDS / MILLISECONDS), - ), + array + .as_primitive::() + .unary::<_, TimestampMicrosecondType>(|x| x * (MICROSECONDS / MILLISECONDS)), )), (Date64, Timestamp(TimeUnit::Nanosecond, None)) => Ok(Arc::new( - array.as_primitive::().unary::<_, TimestampNanosecondType>( - |x| x * (NANOSECONDS / MILLISECONDS), - ), + array + .as_primitive::() + .unary::<_, TimestampNanosecondType>(|x| x * (NANOSECONDS / MILLISECONDS)), )), (Date32, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, TimestampSecondType>(|x| (x as i64) * SECONDS_IN_DAY), )), (Date32, Timestamp(TimeUnit::Millisecond, None)) => Ok(Arc::new( - array.as_primitive::().unary::<_, TimestampMillisecondType>( - |x| (x as i64) * MILLISECONDS_IN_DAY, - ), + array + .as_primitive::() + .unary::<_, TimestampMillisecondType>(|x| (x as i64) * MILLISECONDS_IN_DAY), )), (Date32, Timestamp(TimeUnit::Microsecond, None)) => Ok(Arc::new( - array.as_primitive::().unary::<_, TimestampMicrosecondType>( - |x| (x as i64) * MICROSECONDS_IN_DAY, - ), + array + .as_primitive::() + .unary::<_, TimestampMicrosecondType>(|x| (x as i64) * MICROSECONDS_IN_DAY), )), (Date32, Timestamp(TimeUnit::Nanosecond, None)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, TimestampNanosecondType>(|x| (x as i64) * NANOSECONDS_IN_DAY), )), (Int64, Duration(TimeUnit::Second)) => { @@ -2416,9 +2267,7 @@ where // Natural cast between numeric types // If the value of T can't be casted to R, will throw error -fn try_numeric_cast( - from: &PrimitiveArray, -) -> Result, ArrowError> +fn try_numeric_cast(from: &PrimitiveArray) -> Result, ArrowError> where T: ArrowPrimitiveType, R: ArrowPrimitiveType, @@ -2519,11 +2368,7 @@ fn cast_string_to_timestamp( Ok(Arc::new(out.with_timezone_opt(to_tz.clone()))) } -fn cast_string_to_timestamp_impl< - O: OffsetSizeTrait, - T: ArrowTimestampType, - Tz: TimeZone, ->( +fn cast_string_to_timestamp_impl( array: &GenericStringArray, tz: &Tz, cast_options: &CastOptions, @@ -2680,9 +2525,7 @@ fn adjust_timestamp_to_timezone( } else { array.try_unary::<_, Int64Type, _>(|o| { adjust(o).ok_or_else(|| { - ArrowError::CastError( - "Cannot cast timezone to different timezone".to_string(), - ) + ArrowError::CastError("Cannot cast timezone to different timezone".to_string()) }) })? 
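// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the Utf8 -> Boolean path above
// accepts the truthy/falsy spellings listed in the match ("t", "true", "yes",
// "on", "1", ...) case-insensitively after trimming; anything else becomes
// null under the default safe cast. Inputs are hypothetical.
use arrow_array::cast::AsArray;
use arrow_array::{Array, StringArray};
use arrow_cast::cast;
use arrow_schema::{ArrowError, DataType};

fn utf8_to_bool_sketch() -> Result<(), ArrowError> {
    let strings = StringArray::from(vec![" Yes ", "off", "maybe"]);
    let bools = cast(&strings, &DataType::Boolean)?;
    let bools = bools.as_boolean();
    assert!(bools.value(0));
    assert!(!bools.value(1));
    assert!(bools.is_null(2)); // unrecognised token -> null when safe
    Ok(())
}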
}; @@ -2706,11 +2549,10 @@ where .iter() .map(|value| match value { Some(value) => match value.to_ascii_lowercase().trim() { - "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" => { - Ok(Some(true)) + "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" => Ok(Some(true)), + "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | "off" | "0" => { + Ok(Some(false)) } - "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | "off" - | "0" => Ok(Some(false)), invalid_value => match cast_options.safe { true => Ok(None), false => Err(ArrowError::CastError(format!( @@ -2748,13 +2590,10 @@ where // Adjust decimal based on scale let number_decimals = if decimals.len() > scale { let decimal_number = i256::from_string(decimals).ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "Cannot parse decimal format: {value_str}" - )) + ArrowError::InvalidArgumentError(format!("Cannot parse decimal format: {value_str}")) })?; - let div = - i256::from_i128(10_i128).pow_checked((decimals.len() - scale) as u32)?; + let div = i256::from_i128(10_i128).pow_checked((decimals.len() - scale) as u32)?; let half = div.div_wrapping(i256::from_i128(2)); let half_neg = half.neg_wrapping(); @@ -2776,9 +2615,7 @@ where "Cannot parse decimal format: {value_str}" )) }) - .map(|v| { - v.mul_wrapping(i256::from_i128(10_i128).pow_wrapping(scale as u32)) - })? + .map(|v| v.mul_wrapping(i256::from_i128(10_i128).pow_wrapping(scale as u32)))? } else { i256::ZERO }; @@ -2800,11 +2637,7 @@ where })?; T::Native::from_decimal(value).ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "Cannot convert {} to {}", - value_str, - T::PREFIX - )) + ArrowError::InvalidArgumentError(format!("Cannot convert {} to {}", value_str, T::PREFIX)) }) } @@ -2848,9 +2681,7 @@ where T::DATA_TYPE, )) }) - .and_then(|v| { - T::validate_decimal_precision(v, precision).map(|_| v) - }) + .and_then(|v| T::validate_decimal_precision(v, precision).map(|_| v)) }) .transpose() }) @@ -2907,8 +2738,7 @@ fn cast_numeric_to_bool(from: &dyn Array) -> Result where FROM: ArrowPrimitiveType, { - numeric_to_bool_cast::(from.as_primitive::()) - .map(|to| Arc::new(to) as ArrayRef) + numeric_to_bool_cast::(from.as_primitive::()).map(|to| Arc::new(to) as ArrayRef) } fn numeric_to_bool_cast(from: &PrimitiveArray) -> Result @@ -2947,10 +2777,7 @@ where ))) } -fn bool_to_numeric_cast( - from: &BooleanArray, - _cast_options: &CastOptions, -) -> PrimitiveArray +fn bool_to_numeric_cast(from: &BooleanArray, _cast_options: &CastOptions) -> PrimitiveArray where T: ArrowPrimitiveType, T::Native: num::NumCast, @@ -2998,8 +2825,7 @@ fn dictionary_cast( Arc::new(PrimitiveArray::::from(dict_array.keys().to_data())); let values_array = dict_array.values(); let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?; - let cast_values = - cast_with_options(values_array, to_value_type, cast_options)?; + let cast_values = cast_with_options(values_array, to_value_type, cast_options)?; // Failure to cast keys (because they don't fit in the // target type) results in NULL values; @@ -3071,66 +2897,24 @@ fn cast_to_dictionary( use DataType::*; match *dict_value_type { - Int8 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Int16 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Int32 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Int64 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - UInt8 => 
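// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: `cast_to_dictionary` above runs
// when the target type is a Dictionary; distinct values are packed once and
// keys reference them. Contents are hypothetical.
use arrow_array::cast::AsArray;
use arrow_array::types::Int32Type;
use arrow_array::{Array, StringArray};
use arrow_cast::cast;
use arrow_schema::{ArrowError, DataType};

fn utf8_to_dictionary_sketch() -> Result<(), ArrowError> {
    let strings = StringArray::from(vec![Some("one"), None, Some("one"), Some("three")]);
    let to_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
    let dict = cast(&strings, &to_type)?;
    let dict = dict.as_dictionary::<Int32Type>();
    // Only the two distinct non-null strings are stored as dictionary values.
    assert_eq!(dict.values().len(), 2);
    assert!(dict.is_null(1));
    Ok(())
}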
pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - UInt16 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - UInt32 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - UInt64 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Decimal128(_, _) => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Decimal256(_, _) => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Utf8 => pack_byte_to_dictionary::>(array, cast_options), - LargeUtf8 => { - pack_byte_to_dictionary::>(array, cast_options) - } - Binary => { - pack_byte_to_dictionary::>(array, cast_options) - } - LargeBinary => { - pack_byte_to_dictionary::>(array, cast_options) + Int8 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + Int16 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + Int32 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + Int64 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + UInt8 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + UInt16 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + UInt32 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + UInt64 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + Decimal128(_, _) => { + pack_numeric_to_dictionary::(array, dict_value_type, cast_options) + } + Decimal256(_, _) => { + pack_numeric_to_dictionary::(array, dict_value_type, cast_options) } + Utf8 => pack_byte_to_dictionary::>(array, cast_options), + LargeUtf8 => pack_byte_to_dictionary::>(array, cast_options), + Binary => pack_byte_to_dictionary::>(array, cast_options), + LargeBinary => pack_byte_to_dictionary::>(array, cast_options), _ => Err(ArrowError::CastError(format!( "Unsupported output type for dictionary packing: {dict_value_type:?}" ))), @@ -3152,8 +2936,7 @@ where let cast_values = cast_with_options(array, dict_value_type, cast_options)?; let values = cast_values.as_primitive::(); - let mut b = - PrimitiveDictionaryBuilder::::with_capacity(values.len(), values.len()); + let mut b = PrimitiveDictionaryBuilder::::with_capacity(values.len(), values.len()); // copy each element one at a time for i in 0..values.len() { @@ -3181,8 +2964,7 @@ where .as_any() .downcast_ref::>() .unwrap(); - let mut b = - GenericByteDictionaryBuilder::::with_capacity(values.len(), 1024, 1024); + let mut b = GenericByteDictionaryBuilder::::with_capacity(values.len(), 1024, 1024); // copy each element one at a time for i in 0..values.len() { @@ -3216,8 +2998,7 @@ fn cast_list_inner( ) -> Result { let data = array.to_data(); let underlying_array = make_array(data.child_data()[0].clone()); - let cast_array = - cast_with_options(underlying_array.as_ref(), to.data_type(), cast_options)?; + let cast_array = cast_with_options(underlying_array.as_ref(), to.data_type(), cast_options)?; let builder = data .into_builder() .data_type(to_type.clone()) @@ -3246,10 +3027,8 @@ fn cast_binary_to_string( Err(e) => match cast_options.safe { true => { // Fallback to slow method to convert invalid sequences to nulls - let mut builder = GenericStringBuilder::::with_capacity( - array.len(), - array.value_data().len(), - ); + let mut builder = + GenericStringBuilder::::with_capacity(array.len(), array.value_data().len()); let iter = array .iter() @@ -3344,8 +3123,8 @@ where offsets .iter() 
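// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: `cast_binary_to_string` above
// validates UTF-8 and, under the safe cast, nulls out invalid sequences via
// the slow fallback path instead of failing. Bytes are hypothetical.
use arrow_array::cast::AsArray;
use arrow_array::{Array, BinaryArray};
use arrow_cast::cast;
use arrow_schema::{ArrowError, DataType};

fn binary_to_utf8_sketch() -> Result<(), ArrowError> {
    let binary = BinaryArray::from(vec!["hello".as_bytes(), &[0xf0, 0x28][..]]);
    let strings = cast(&binary, &DataType::Utf8)?;
    let strings = strings.as_string::<i32>();
    assert_eq!(strings.value(0), "hello");
    assert!(strings.is_null(1)); // invalid UTF-8 -> null under the safe cast
    Ok(())
}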
.try_for_each::<_, Result<_, ArrowError>>(|offset| { - let offset = <::Offset as NumCast>::from(*offset) - .ok_or_else(|| { + let offset = + <::Offset as NumCast>::from(*offset).ok_or_else(|| { ArrowError::ComputeError(format!( "{}{} array too large to cast to {}{} array", FROM::Offset::PREFIX, @@ -3374,9 +3153,7 @@ where Ok(Arc::new(GenericByteArray::::from(array_data))) } -fn cast_fixed_size_list_to_list( - array: &dyn Array, -) -> Result +fn cast_fixed_size_list_to_list(array: &dyn Array) -> Result where OffsetSize: OffsetSizeTrait, { @@ -3457,8 +3234,8 @@ mod tests { macro_rules! generate_cast_test_case { ($INPUT_ARRAY: expr, $OUTPUT_TYPE_ARRAY: ident, $OUTPUT_TYPE: expr, $OUTPUT_VALUES: expr) => { - let output = $OUTPUT_TYPE_ARRAY::from($OUTPUT_VALUES) - .with_data_type($OUTPUT_TYPE.clone()); + let output = + $OUTPUT_TYPE_ARRAY::from($OUTPUT_VALUES).with_data_type($OUTPUT_TYPE.clone()); // assert cast type let input_array_type = $INPUT_ARRAY.data_type(); @@ -3471,8 +3248,7 @@ mod tests { safe: false, format_options: FormatOptions::default(), }; - let result = - cast_with_options($INPUT_ARRAY, $OUTPUT_TYPE, &cast_option).unwrap(); + let result = cast_with_options($INPUT_ARRAY, $OUTPUT_TYPE, &cast_option).unwrap(); assert_eq!($OUTPUT_TYPE, result.data_type()); assert_eq!(result.as_ref(), &output); }; @@ -3806,8 +3582,7 @@ mod tests { #[test] fn test_cast_decimal_to_numeric() { - let value_array: Vec> = - vec![Some(125), Some(225), Some(325), None, Some(525)]; + let value_array: Vec> = vec![Some(125), Some(225), Some(325), None, Some(525)]; let array = create_decimal_array(value_array, 38, 2).unwrap(); // u8 generate_cast_test_case!( @@ -4619,8 +4394,7 @@ mod tests { #[test] fn test_cast_i32_to_list_f64_nullable_sliced() { - let array = - Int32Array::from(vec![Some(5), None, Some(7), Some(8), None, Some(10)]); + let array = Int32Array::from(vec![Some(5), None, Some(7), Some(8), None, Some(10)]); let array = array.slice(2, 4); let b = cast( &array, @@ -4670,9 +4444,8 @@ mod tests { Ok(_) => panic!("expected error"), Err(e) => { assert!( - e.to_string().contains( - "Cast error: Cannot cast string 'seven' to value of Int32 type", - ), + e.to_string() + .contains("Cast error: Cannot cast string 'seven' to value of Int32 type",), "Error: {e}" ) } @@ -4683,8 +4456,7 @@ mod tests { fn test_cast_utf8_to_bool() { let strings = StringArray::from(vec!["true", "false", "invalid", " Y ", ""]); let casted = cast(&strings, &DataType::Boolean).unwrap(); - let expected = - BooleanArray::from(vec![Some(true), Some(false), None, Some(true), None]); + let expected = BooleanArray::from(vec![Some(true), Some(false), None, Some(true), None]); assert_eq!(*as_boolean_array(&casted), expected); } @@ -4702,9 +4474,9 @@ mod tests { match casted { Ok(_) => panic!("expected error"), Err(e) => { - assert!(e.to_string().contains( - "Cast error: Cannot cast value 'invalid' to value of Boolean type" - )) + assert!(e + .to_string() + .contains("Cast error: Cannot cast value 'invalid' to value of Boolean type")) } } } @@ -4750,9 +4522,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported" - )] + #[should_panic(expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported")] fn test_cast_int32_to_timestamp() { let array = Int32Array::from(vec![Some(2), Some(10), None]); cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); @@ -4760,15 +4530,13 @@ mod tests { #[test] fn test_cast_list_i32_to_list_u16() { - let value_data 
= - Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 100000000]).into_data(); + let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 100000000]).into_data(); let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two // [[0,0,0], [-1, -2, -1], [2, 100000000]] - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -4812,19 +4580,15 @@ mod tests { } #[test] - #[should_panic( - expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported" - )] + #[should_panic(expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported")] fn test_cast_list_i32_to_list_timestamp() { // Construct a value array - let value_data = - Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 8, 100000000]).into_data(); + let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 8, 100000000]).into_data(); let value_offsets = Buffer::from_slice_ref([0, 3, 6, 9]); // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -4969,7 +4733,10 @@ mod tests { format_options: FormatOptions::default(), }; let err = cast_with_options(array, &to_type, &options).unwrap_err(); - assert_eq!(err.to_string(), "Cast error: Cannot cast string 'Not a valid date' to value of Date32 type"); + assert_eq!( + err.to_string(), + "Cast error: Cannot cast string 'Not a valid date' to value of Date32 type" + ); } } @@ -5126,14 +4893,16 @@ mod tests { format_options: FormatOptions::default(), }; let err = cast_with_options(array, &to_type, &options).unwrap_err(); - assert_eq!(err.to_string(), "Cast error: Cannot cast string 'Not a valid date' to value of Date64 type"); + assert_eq!( + err.to_string(), + "Cast error: Cannot cast string 'Not a valid date' to value of Date64 type" + ); } } macro_rules! 
test_safe_string_to_interval { ($data_vec:expr, $interval_unit:expr, $array_ty:ty, $expect_vec:expr) => { - let source_string_array = - Arc::new(StringArray::from($data_vec.clone())) as ArrayRef; + let source_string_array = Arc::new(StringArray::from($data_vec.clone())) as ArrayRef; let options = CastOptions { safe: true, @@ -5427,12 +5196,9 @@ mod tests { #[test] fn test_cast_timestamp_to_date32() { - let array = TimestampMillisecondArray::from(vec![ - Some(864000000005), - Some(1545696000001), - None, - ]) - .with_timezone("UTC".to_string()); + let array = + TimestampMillisecondArray::from(vec![Some(864000000005), Some(1545696000001), None]) + .with_timezone("UTC".to_string()); let b = cast(&array, &DataType::Date32).unwrap(); let c = b.as_primitive::(); assert_eq!(10000, c.value(0)); @@ -5442,19 +5208,15 @@ mod tests { #[test] fn test_cast_timestamp_to_date64() { - let array = TimestampMillisecondArray::from(vec![ - Some(864000000005), - Some(1545696000001), - None, - ]); + let array = + TimestampMillisecondArray::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Date64).unwrap(); let c = b.as_primitive::(); assert_eq!(864000000005, c.value(0)); assert_eq!(1545696000001, c.value(1)); assert!(c.is_null(2)); - let array = - TimestampSecondArray::from(vec![Some(864000000005), Some(1545696000001)]); + let array = TimestampSecondArray::from(vec![Some(864000000005), Some(1545696000001)]); let b = cast(&array, &DataType::Date64).unwrap(); let c = b.as_primitive::(); assert_eq!(864000000005000, c.value(0)); @@ -5506,9 +5268,8 @@ mod tests { assert!(c.is_null(2)); // test timestamp microseconds - let a = - TimestampMicrosecondArray::from(vec![Some(86405000000), Some(1000000), None]) - .with_timezone("+01:00".to_string()); + let a = TimestampMicrosecondArray::from(vec![Some(86405000000), Some(1000000), None]) + .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)).unwrap(); let c = b.as_primitive::(); @@ -5522,12 +5283,8 @@ mod tests { assert!(c.is_null(2)); // test timestamp nanoseconds - let a = TimestampNanosecondArray::from(vec![ - Some(86405000000000), - Some(1000000000), - None, - ]) - .with_timezone("+01:00".to_string()); + let a = TimestampNanosecondArray::from(vec![Some(86405000000000), Some(1000000000), None]) + .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)).unwrap(); let c = b.as_primitive::(); @@ -5541,8 +5298,8 @@ mod tests { assert!(c.is_null(2)); // test overflow - let a = TimestampSecondArray::from(vec![Some(i64::MAX)]) - .with_timezone("+01:00".to_string()); + let a = + TimestampSecondArray::from(vec![Some(i64::MAX)]).with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)); assert!(b.is_err()); @@ -5585,9 +5342,8 @@ mod tests { assert!(c.is_null(2)); // test timestamp microseconds - let a = - TimestampMicrosecondArray::from(vec![Some(86405000000), Some(1000000), None]) - .with_timezone("+01:00".to_string()); + let a = TimestampMicrosecondArray::from(vec![Some(86405000000), Some(1000000), None]) + .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time32(TimeUnit::Second)).unwrap(); let c = b.as_primitive::(); @@ -5601,12 +5357,8 @@ mod tests { assert!(c.is_null(2)); // test timestamp nanoseconds - let a = 
TimestampNanosecondArray::from(vec![ - Some(86405000000000), - Some(1000000000), - None, - ]) - .with_timezone("+01:00".to_string()); + let a = TimestampNanosecondArray::from(vec![Some(86405000000000), Some(1000000000), None]) + .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time32(TimeUnit::Second)).unwrap(); let c = b.as_primitive::(); @@ -5620,8 +5372,8 @@ mod tests { assert!(c.is_null(2)); // test overflow - let a = TimestampSecondArray::from(vec![Some(i64::MAX)]) - .with_timezone("+01:00".to_string()); + let a = + TimestampSecondArray::from(vec![Some(i64::MAX)]).with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time32(TimeUnit::Second)); assert!(b.is_err()); @@ -5708,8 +5460,7 @@ mod tests { #[test] fn test_cast_date64_to_timestamp() { - let array = - Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); + let array = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); let c = b.as_primitive::(); assert_eq!(864000000, c.value(0)); @@ -5719,8 +5470,7 @@ mod tests { #[test] fn test_cast_date64_to_timestamp_ms() { - let array = - Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); + let array = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Millisecond, None)).unwrap(); let c = b .as_any() @@ -5733,8 +5483,7 @@ mod tests { #[test] fn test_cast_date64_to_timestamp_us() { - let array = - Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); + let array = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); let c = b .as_any() @@ -5747,8 +5496,7 @@ mod tests { #[test] fn test_cast_date64_to_timestamp_ns() { - let array = - Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); + let array = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap(); let c = b .as_any() @@ -5761,12 +5509,9 @@ mod tests { #[test] fn test_cast_timestamp_to_i64() { - let array = TimestampMillisecondArray::from(vec![ - Some(864000000005), - Some(1545696000001), - None, - ]) - .with_timezone("UTC".to_string()); + let array = + TimestampMillisecondArray::from(vec![Some(864000000005), Some(1545696000001), None]) + .with_timezone("UTC".to_string()); let b = cast(&array, &DataType::Int64).unwrap(); let c = b.as_primitive::(); assert_eq!(&DataType::Int64, c.data_type()); @@ -5798,11 +5543,8 @@ mod tests { #[test] fn test_cast_timestamp_to_strings() { // "2018-12-25T00:00:02.001", "1997-05-19T00:00:03.005", None - let array = TimestampMillisecondArray::from(vec![ - Some(864000003005), - Some(1545696002001), - None, - ]); + let array = + TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]); let out = cast(&array, &DataType::Utf8).unwrap(); let out = out .as_any() @@ -5846,13 +5588,9 @@ mod tests { .with_timestamp_tz_format(Some(ts_format)), }; // "2018-12-25T00:00:02.001", "1997-05-19T00:00:03.005", None - let array_without_tz = TimestampMillisecondArray::from(vec![ - Some(864000003005), - Some(1545696002001), - None, - ]); - let out = - cast_with_options(&array_without_tz, &DataType::Utf8, &cast_options).unwrap(); + let array_without_tz = + 
TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]); + let out = cast_with_options(&array_without_tz, &DataType::Utf8, &cast_options).unwrap(); let out = out .as_any() .downcast_ref::() @@ -5868,8 +5606,7 @@ mod tests { ] ); let out = - cast_with_options(&array_without_tz, &DataType::LargeUtf8, &cast_options) - .unwrap(); + cast_with_options(&array_without_tz, &DataType::LargeUtf8, &cast_options).unwrap(); let out = out .as_any() .downcast_ref::() @@ -5885,14 +5622,10 @@ mod tests { ] ); - let array_with_tz = TimestampMillisecondArray::from(vec![ - Some(864000003005), - Some(1545696002001), - None, - ]) - .with_timezone(tz.to_string()); - let out = - cast_with_options(&array_with_tz, &DataType::Utf8, &cast_options).unwrap(); + let array_with_tz = + TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]) + .with_timezone(tz.to_string()); + let out = cast_with_options(&array_with_tz, &DataType::Utf8, &cast_options).unwrap(); let out = out .as_any() .downcast_ref::() @@ -5907,8 +5640,7 @@ mod tests { None ] ); - let out = cast_with_options(&array_with_tz, &DataType::LargeUtf8, &cast_options) - .unwrap(); + let out = cast_with_options(&array_with_tz, &DataType::LargeUtf8, &cast_options).unwrap(); let out = out .as_any() .downcast_ref::() @@ -5927,11 +5659,8 @@ mod tests { #[test] fn test_cast_between_timestamps() { - let array = TimestampMillisecondArray::from(vec![ - Some(864000003005), - Some(1545696002001), - None, - ]); + let array = + TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); let c = b.as_primitive::(); assert_eq!(864000003, c.value(0)); @@ -6335,8 +6064,7 @@ mod tests { ]; let u64_array: ArrayRef = Arc::new(UInt64Array::from(u64_values)); - let f64_expected = - vec![0.0, 255.0, 65535.0, 4294967295.0, 18446744073709552000.0]; + let f64_expected = vec![0.0, 255.0, 65535.0, 4294967295.0, 18446744073709552000.0]; assert_eq!( f64_expected, get_cast_values::(&u64_array, &DataType::Float64) @@ -6345,8 +6073,7 @@ mod tests { .collect::>() ); - let f32_expected = - vec![0.0, 255.0, 65535.0, 4294967300.0, 18446744000000000000.0]; + let f32_expected = vec![0.0, 255.0, 65535.0, 4294967300.0, 18446744000000000000.0]; assert_eq!( f32_expected, get_cast_values::(&u64_array, &DataType::Float32) @@ -6379,8 +6106,7 @@ mod tests { get_cast_values::(&u64_array, &DataType::Int8) ); - let u64_expected = - vec!["0", "255", "65535", "4294967295", "18446744073709551615"]; + let u64_expected = vec!["0", "255", "65535", "4294967295", "18446744073709551615"]; assert_eq!( u64_expected, get_cast_values::(&u64_array, &DataType::UInt64) @@ -6811,15 +6537,13 @@ mod tests { get_cast_values::(&i32_array, &DataType::Int8) ); - let u64_expected = - vec!["null", "null", "null", "0", "127", "32767", "2147483647"]; + let u64_expected = vec!["null", "null", "null", "0", "127", "32767", "2147483647"]; assert_eq!( u64_expected, get_cast_values::(&i32_array, &DataType::UInt64) ); - let u32_expected = - vec!["null", "null", "null", "0", "127", "32767", "2147483647"]; + let u32_expected = vec!["null", "null", "null", "0", "127", "32767", "2147483647"]; assert_eq!( u32_expected, get_cast_values::(&i32_array, &DataType::UInt32) @@ -6855,8 +6579,7 @@ mod tests { #[test] fn test_cast_from_int16() { - let i16_values: Vec = - vec![i16::MIN, i8::MIN as i16, 0, i8::MAX as i16, i16::MAX]; + let i16_values: Vec = vec![i16::MIN, i8::MIN as i16, 0, i8::MAX as i16, 
i16::MAX]; let i16_array: ArrayRef = Arc::new(Int16Array::from(i16_values)); let f64_expected = vec!["-32768.0", "-128.0", "0.0", "127.0", "32767.0"]; @@ -7197,8 +6920,7 @@ mod tests { fn test_cast_string_array_to_dict() { use DataType::*; - let array = Arc::new(StringArray::from(vec![Some("one"), None, Some("three")])) - as ArrayRef; + let array = Arc::new(StringArray::from(vec![Some("one"), None, Some("three")])) as ArrayRef; let expected = vec!["one", "null", "three"]; @@ -7297,16 +7019,12 @@ mod tests { cast_from_null_to_other(&data_type); // Cast null from and to list - let data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); cast_from_null_to_other(&data_type); - let data_type = - DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); + let data_type = DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); cast_from_null_to_other(&data_type); - let data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, true)), - 4, - ); + let data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 4); cast_from_null_to_other(&data_type); // Cast null from and to dictionary @@ -7317,8 +7035,7 @@ mod tests { cast_from_null_to_other(&data_type); // Cast null from and to struct - let data_type = - DataType::Struct(vec![Field::new("data", DataType::Int64, false)].into()); + let data_type = DataType::Struct(vec![Field::new("data", DataType::Int64, false)].into()); cast_from_null_to_other(&data_type); } @@ -7511,8 +7228,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -7554,10 +7270,8 @@ mod tests { .build() .unwrap(); - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, true)), - 4, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 4); let list_data = ArrayData::builder(list_data_type) .len(2) .add_child_data(value_data) @@ -7574,10 +7288,8 @@ mod tests { .build() .unwrap(); - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int64, true)), - 4, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 4); let list_data = ArrayData::builder(list_data_type) .len(2) .add_child_data(value_data) @@ -7618,8 +7330,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); let value_data = str_array.into_data(); - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -7958,12 +7669,7 @@ mod tests { let array = vec![Some(123)]; let input_decimal_array = create_decimal_array(array, 10, -1).unwrap(); let array = Arc::new(input_decimal_array) as ArrayRef; - generate_cast_test_case!( - &array, - Decimal128Array, - &output_type, - vec![Some(12_i128),] - ); + generate_cast_test_case!(&array, Decimal128Array, &output_type, vec![Some(12_i128),]); let 
casted_array = cast(&array, &output_type).unwrap(); let decimal_arr = casted_array.as_primitive::(); @@ -7973,12 +7679,7 @@ mod tests { let array = vec![Some(125)]; let input_decimal_array = create_decimal_array(array, 10, -1).unwrap(); let array = Arc::new(input_decimal_array) as ArrayRef; - generate_cast_test_case!( - &array, - Decimal128Array, - &output_type, - vec![Some(13_i128),] - ); + generate_cast_test_case!(&array, Decimal128Array, &output_type, vec![Some(13_i128),]); let casted_array = cast(&array, &output_type).unwrap(); let decimal_arr = casted_array.as_primitive::(); @@ -8220,9 +7921,9 @@ mod tests { let str_array = StringArray::from(vec![". 0.123"]); let array = Arc::new(str_array) as ArrayRef; let casted_err = cast_with_options(&array, &output_type, &option).unwrap_err(); - assert!(casted_err.to_string().contains( - "Cannot cast string '. 0.123' to value of Decimal128(38, 10) type" - )); + assert!(casted_err + .to_string() + .contains("Cannot cast string '. 0.123' to value of Decimal128(38, 10) type")); } fn test_cast_string_to_decimal128_overflow(overflow_array: ArrayRef) { @@ -8499,9 +8200,8 @@ mod tests { let tz = tz.as_ref().parse().unwrap(); - let as_tz = |v: i64| { - as_datetime_with_timezone::(v, tz).unwrap() - }; + let as_tz = + |v: i64| as_datetime_with_timezone::(v, tz).unwrap(); let as_utc = |v: &i64| as_tz(*v).naive_utc().to_string(); let as_local = |v: &i64| as_tz(*v).naive_local().to_string(); @@ -8611,8 +8311,7 @@ mod tests { None, ]; - let array256: Vec> = - array128.iter().map(|v| v.map(i256::from_i128)).collect(); + let array256: Vec> = array128.iter().map(|v| v.map(i256::from_i128)).collect(); test_decimal_to_string::( DataType::Utf8, @@ -8701,11 +8400,9 @@ mod tests { fn test_cast_from_duration_to_interval() { // from duration second to interval month day nano let array = vec![1234567]; - let casted_array = cast_from_duration_to_interval::( - array, - &CastOptions::default(), - ) - .unwrap(); + let casted_array = + cast_from_duration_to_interval::(array, &CastOptions::default()) + .unwrap(); assert_eq!( casted_array.data_type(), &DataType::Interval(IntervalUnit::MonthDayNano) @@ -8824,10 +8521,7 @@ mod tests { .as_any() .downcast_ref::>() .ok_or_else(|| { - ArrowError::ComputeError(format!( - "Failed to downcast to {}", - T::DATA_TYPE - )) + ArrowError::ComputeError(format!("Failed to downcast to {}", T::DATA_TYPE)) }) .cloned() } @@ -8865,8 +8559,7 @@ mod tests { cast_from_interval_to_duration(&array, &nullable).unwrap(); assert!(!casted_array.is_valid(0)); - let res = - cast_from_interval_to_duration::(&array, &fallible); + let res = cast_from_interval_to_duration::(&array, &fallible); assert!(res.is_err()); // from interval month day nano to duration microsecond @@ -8877,8 +8570,7 @@ mod tests { let array = vec![i128::MAX].into(); let casted_array = - cast_from_interval_to_duration::(&array, &nullable) - .unwrap(); + cast_from_interval_to_duration::(&array, &nullable).unwrap(); assert!(!casted_array.is_valid(0)); let casted_array = @@ -8909,8 +8601,7 @@ mod tests { ] .into(); let casted_array = - cast_from_interval_to_duration::(&array, &nullable) - .unwrap(); + cast_from_interval_to_duration::(&array, &nullable).unwrap(); assert!(!casted_array.is_valid(0)); assert!(!casted_array.is_valid(1)); assert!(!casted_array.is_valid(2)); @@ -8979,11 +8670,9 @@ mod tests { fn test_cast_from_interval_day_time_to_interval_month_day_nano() { // from interval day time to interval month day nano let array = vec![123]; - let casted_array = 
cast_from_interval_day_time_to_interval_month_day_nano( - array, - &CastOptions::default(), - ) - .unwrap(); + let casted_array = + cast_from_interval_day_time_to_interval_month_day_nano(array, &CastOptions::default()) + .unwrap(); assert_eq!( casted_array.data_type(), &DataType::Interval(IntervalUnit::MonthDayNano) @@ -9017,8 +8706,7 @@ mod tests { .map(|ts| ts / 1_000_000) .collect::>(); - let array = - TimestampMillisecondArray::from(ts_array).with_timezone("UTC".to_string()); + let array = TimestampMillisecondArray::from(ts_array).with_timezone("UTC".to_string()); let casted_array = cast(&array, &DataType::Date32).unwrap(); let date_array = casted_array.as_primitive::(); let casted_array = cast(&date_array, &DataType::Utf8).unwrap(); diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 246135e114bc..28c29c94bbdb 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -129,10 +129,7 @@ impl<'a> FormatOptions<'a> { } /// Overrides the format used for [`DataType::Timestamp`] columns with a timezone - pub const fn with_timestamp_tz_format( - self, - timestamp_tz_format: Option<&'a str>, - ) -> Self { + pub const fn with_timestamp_tz_format(self, timestamp_tz_format: Option<&'a str>) -> Self { Self { timestamp_tz_format, ..self @@ -173,9 +170,7 @@ impl<'a> ValueFormatter<'a> { match self.formatter.format.write(self.idx, s) { Ok(_) => Ok(()), Err(FormatError::Arrow(e)) => Err(e), - Err(FormatError::Format(_)) => { - Err(ArrowError::CastError("Format error".to_string())) - } + Err(FormatError::Format(_)) => Err(ArrowError::CastError("Format error".to_string())), } } @@ -260,10 +255,7 @@ impl<'a> ArrayFormatter<'a> { /// Returns an [`ArrayFormatter`] that can be used to format `array` /// /// This returns an error if an array of the given data type cannot be formatted - pub fn try_new( - array: &'a dyn Array, - options: &FormatOptions<'a>, - ) -> Result { + pub fn try_new(array: &'a dyn Array, options: &FormatOptions<'a>) -> Result { Ok(Self { format: make_formatter(array, options)?, safe: options.safe, @@ -472,9 +464,7 @@ fn write_timestamp( let date = Utc.from_utc_datetime(&naive).with_timezone(&tz); match format { Some(s) => write!(f, "{}", date.format(s))?, - None => { - write!(f, "{}", date.to_rfc3339_opts(SecondsFormat::AutoSi, true))? - } + None => write!(f, "{}", date.to_rfc3339_opts(SecondsFormat::AutoSi, true))?, } } None => match format { @@ -526,19 +516,11 @@ macro_rules! temporal_display { impl<'a> DisplayIndexState<'a> for &'a PrimitiveArray<$t> { type State = TimeFormat<'a>; - fn prepare( - &self, - options: &FormatOptions<'a>, - ) -> Result { + fn prepare(&self, options: &FormatOptions<'a>) -> Result { Ok(options.$format) } - fn write( - &self, - fmt: &Self::State, - idx: usize, - f: &mut dyn Write, - ) -> FormatResult { + fn write(&self, fmt: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { let value = self.value(idx); let naive = $convert(value as _).ok_or_else(|| { ArrowError::CastError(format!( @@ -575,19 +557,11 @@ macro_rules! 
duration_display { impl<'a> DisplayIndexState<'a> for &'a PrimitiveArray<$t> { type State = DurationFormat; - fn prepare( - &self, - options: &FormatOptions<'a>, - ) -> Result { + fn prepare(&self, options: &FormatOptions<'a>) -> Result { Ok(options.duration_format) } - fn write( - &self, - fmt: &Self::State, - idx: usize, - f: &mut dyn Write, - ) -> FormatResult { + fn write(&self, fmt: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { let v = self.value(idx); match fmt { DurationFormat::ISO8601 => write!(f, "{}", $convert(v))?, @@ -704,8 +678,7 @@ impl<'a> DisplayIndex for &'a PrimitiveArray { fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { let value: u128 = self.value(idx) as u128; - let months_part: i32 = - ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; + let months_part: i32 = ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; let days_part: i32 = ((value & 0xFFFFFFFF0000000000000000) >> 64) as i32; let nanoseconds_part: i64 = (value & 0xFFFFFFFFFFFFFFFF) as i64; @@ -937,10 +910,7 @@ impl<'a> DisplayIndexState<'a> for &'a UnionArray { /// suitable for converting large arrays or record batches. /// /// Please see [`ArrayFormatter`] for a more performant interface -pub fn array_value_to_string( - column: &dyn Array, - row: usize, -) -> Result { +pub fn array_value_to_string(column: &dyn Array, row: usize) -> Result { let options = FormatOptions::default().with_display_error(true); let formatter = ArrayFormatter::try_new(column, &options)?; Ok(formatter.value(row).to_string()) @@ -986,12 +956,9 @@ mod tests { // [[a, b, c], [d, e, f], [g, h]] let entry_offsets = [0, 3, 6, 8]; - let map_array = MapArray::new_from_strings( - keys.clone().into_iter(), - &values_data, - &entry_offsets, - ) - .unwrap(); + let map_array = + MapArray::new_from_strings(keys.clone().into_iter(), &values_data, &entry_offsets) + .unwrap(); assert_eq!( "{d: 30, e: 40, f: 50}", array_value_to_string(&map_array, 1).unwrap() @@ -1006,8 +973,7 @@ mod tests { #[test] fn test_array_value_to_string_duration() { let iso_fmt = FormatOptions::new(); - let pretty_fmt = - FormatOptions::new().with_duration_format(DurationFormat::Pretty); + let pretty_fmt = FormatOptions::new().with_duration_format(DurationFormat::Pretty); let array = DurationNanosecondArray::from(vec![ 1, diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 3806f0adc5d6..f01b2b4c0d63 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -64,10 +64,7 @@ impl TimestampParser { /// Parses a date of the form `1997-01-31` fn date(&self) -> Option { - if self.mask & 0b1111111111 != 0b1101101111 - || !self.test(4, b'-') - || !self.test(7, b'-') - { + if self.mask & 0b1111111111 != 0b1101101111 || !self.test(4, b'-') || !self.test(7, b'-') { return None; } @@ -173,13 +170,9 @@ impl TimestampParser { /// * "2023-01-01 04:05:06.789 PST", /// /// [IANA timezones]: https://www.iana.org/time-zones -pub fn string_to_datetime( - timezone: &T, - s: &str, -) -> Result, ArrowError> { - let err = |ctx: &str| { - ArrowError::ParseError(format!("Error parsing timestamp from '{s}': {ctx}")) - }; +pub fn string_to_datetime(timezone: &T, s: &str) -> Result, ArrowError> { + let err = + |ctx: &str| ArrowError::ParseError(format!("Error parsing timestamp from '{s}': {ctx}")); let bytes = s.as_bytes(); if bytes.len() < 10 { @@ -300,9 +293,8 @@ fn to_timestamp_nanos(dt: NaiveDateTime) -> Result { /// This function does not support parsing strings with a timezone /// or offset specified, as it considers 
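// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: `string_to_datetime` above
// resolves strings without an explicit offset in the supplied timezone, while
// an embedded offset takes precedence. Sample inputs are hypothetical.
use arrow_cast::parse::string_to_datetime;
use arrow_schema::ArrowError;
use chrono::Utc;

fn string_to_datetime_sketch() -> Result<(), ArrowError> {
    // An explicit offset in the string wins over the supplied timezone...
    let with_offset = string_to_datetime(&Utc, "2020-09-08T13:42:29+08:00")?;
    // ...while a naive timestamp is interpreted in the supplied timezone (UTC here).
    let naive = string_to_datetime(&Utc, "2020-09-08 13:42:29")?;
    assert_eq!(with_offset.naive_utc().to_string(), "2020-09-08 05:42:29");
    assert_eq!(naive.naive_utc().to_string(), "2020-09-08 13:42:29");
    Ok(())
}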
only time since midnight. pub fn string_to_time_nanoseconds(s: &str) -> Result { - let nt = string_to_time(s).ok_or_else(|| { - ArrowError::ParseError(format!("Failed to parse \'{s}\' as time")) - })?; + let nt = string_to_time(s) + .ok_or_else(|| ArrowError::ParseError(format!("Failed to parse \'{s}\' as time")))?; Ok(nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64) } @@ -313,12 +305,8 @@ fn string_to_time(s: &str) -> Option { } let (am, bytes) = match bytes.get(bytes.len() - 3..) { - Some(b" AM" | b" am" | b" Am" | b" aM") => { - (Some(true), &bytes[..bytes.len() - 3]) - } - Some(b" PM" | b" pm" | b" pM" | b" Pm") => { - (Some(false), &bytes[..bytes.len() - 3]) - } + Some(b" AM" | b" am" | b" Am" | b" aM") => (Some(true), &bytes[..bytes.len() - 3]), + Some(b" PM" | b" pm" | b" pM" | b" Pm") => (Some(false), &bytes[..bytes.len() - 3]), _ => (None, bytes), }; @@ -501,10 +489,7 @@ impl Parser for Time64NanosecondType { fn parse_formatted(string: &str, format: &str) -> Option { let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i64 * 1_000_000_000 - + nt.nanosecond() as i64, - ) + Some(nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64) } } @@ -519,10 +504,7 @@ impl Parser for Time64MicrosecondType { fn parse_formatted(string: &str, format: &str) -> Option { let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i64 * 1_000_000 - + nt.nanosecond() as i64 / 1_000, - ) + Some(nt.num_seconds_from_midnight() as i64 * 1_000_000 + nt.nanosecond() as i64 / 1_000) } } @@ -537,10 +519,7 @@ impl Parser for Time32MillisecondType { fn parse_formatted(string: &str, format: &str) -> Option { let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i32 * 1_000 - + nt.nanosecond() as i32 / 1_000_000, - ) + Some(nt.num_seconds_from_midnight() as i32 * 1_000 + nt.nanosecond() as i32 / 1_000_000) } } @@ -555,10 +534,7 @@ impl Parser for Time32SecondType { fn parse_formatted(string: &str, format: &str) -> Option { let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i32 - + nt.nanosecond() as i32 / 1_000_000_000, - ) + Some(nt.num_seconds_from_midnight() as i32 + nt.nanosecond() as i32 / 1_000_000_000) } } @@ -615,10 +591,8 @@ fn parse_date(string: &str) -> Option { _ => return None, }; - let year = digits[0] as u16 * 1000 - + digits[1] as u16 * 100 - + digits[2] as u16 * 10 - + digits[3] as u16; + let year = + digits[0] as u16 * 1000 + digits[1] as u16 * 100 + digits[2] as u16 * 10 + digits[3] as u16; NaiveDate::from_ymd_opt(year as _, month as _, day as _) } @@ -728,8 +702,7 @@ pub fn parse_decimal( fractionals += 1; digits += 1; result = result.mul_wrapping(base); - result = - result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); + result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); } // Fail on "." @@ -771,9 +744,11 @@ pub fn parse_interval_year_month( let config = IntervalParseConfig::new(IntervalUnit::Year); let interval = Interval::parse(value, &config)?; - let months = interval.to_year_months().map_err(|_| ArrowError::CastError(format!( + let months = interval.to_year_months().map_err(|_| { + ArrowError::CastError(format!( "Cannot cast {value} to IntervalYearMonth. Only year and month fields are allowed." 
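// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the `Parser` impls above turn
// formatted strings into the native time representation, here microseconds
// since midnight. The value mirrors one asserted in the tests in this diff.
use arrow_array::types::Time64MicrosecondType;
use arrow_cast::parse::Parser;

fn parse_time_sketch() {
    // 02:10:01.1234 -> 7_801_123_400 microseconds since midnight
    let parsed = Time64MicrosecondType::parse_formatted("02:10:01.1234", "%H:%M:%S%.f");
    assert_eq!(parsed, Some(7_801_123_400));
}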
- )))?; + )) + })?; Ok(IntervalYearMonthType::make_value(0, months)) } @@ -888,21 +863,16 @@ impl FromStr for IntervalAmount { Ok(0) } else { integer.parse::().map_err(|_| { - ArrowError::ParseError(format!( - "Failed to parse {s} as interval amount" - )) + ArrowError::ParseError(format!("Failed to parse {s} as interval amount")) }) }?; let frac_unscaled = frac.parse::().map_err(|_| { - ArrowError::ParseError(format!( - "Failed to parse {s} as interval amount" - )) + ArrowError::ParseError(format!("Failed to parse {s} as interval amount")) })?; // scale fractional part by interval precision - let frac = - frac_unscaled * 10_i64.pow(INTERVAL_PRECISION - frac.len() as u32); + let frac = frac_unscaled * 10_i64.pow(INTERVAL_PRECISION - frac.len() as u32); // propagate the sign of the integer part to the fractional part let frac = if integer < 0 || explicit_neg { @@ -915,9 +885,9 @@ impl FromStr for IntervalAmount { Ok(result) } - Some((_, frac)) if frac.starts_with('-') => Err(ArrowError::ParseError( - format!("Failed to parse {s} as interval amount"), - )), + Some((_, frac)) if frac.starts_with('-') => Err(ArrowError::ParseError(format!( + "Failed to parse {s} as interval amount" + ))), Some((_, frac)) if frac.len() > INTERVAL_PRECISION as usize => { Err(ArrowError::ParseError(format!( "{s} exceeds the precision available for interval amount" @@ -925,9 +895,7 @@ impl FromStr for IntervalAmount { } Some(_) | None => { let integer = s.parse::().map_err(|_| { - ArrowError::ParseError(format!( - "Failed to parse {s} as interval amount" - )) + ArrowError::ParseError(format!("Failed to parse {s} as interval amount")) })?; let result = Self { integer, frac: 0 }; @@ -1005,25 +973,20 @@ impl Interval { /// e.g. INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days /// e.g. INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours /// [Postgres reference](https://www.postgresql.org/docs/15/datatype-datetime.html#DATATYPE-INTERVAL-INPUT:~:text=Field%20values%20can,fractional%20on%20output.) - fn add( - &self, - amount: IntervalAmount, - unit: IntervalUnit, - ) -> Result { + fn add(&self, amount: IntervalAmount, unit: IntervalUnit) -> Result { let result = match unit { IntervalUnit::Century => { let months_int = amount.integer.mul_checked(100)?.mul_checked(12)?; let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 2); - let months = - months_int - .add_checked(month_frac)? - .try_into() - .map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} centuries as months in a signed 32-bit integer", - &amount.integer - )) - })?; + let months = months_int + .add_checked(month_frac)? + .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} centuries as months in a signed 32-bit integer", + &amount.integer + )) + })?; Self::new(self.months.add_checked(months)?, self.days, self.nanos) } @@ -1031,32 +994,30 @@ impl Interval { let months_int = amount.integer.mul_checked(10)?.mul_checked(12)?; let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 1); - let months = - months_int - .add_checked(month_frac)? - .try_into() - .map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} decades as months in a signed 32-bit integer", - &amount.integer - )) - })?; + let months = months_int + .add_checked(month_frac)? 
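// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: `parse_interval_year_month`
// above reduces an interval expression to a month count and rejects fields
// finer than a month. The inputs are hypothetical.
use arrow_cast::parse::parse_interval_year_month;
use arrow_schema::ArrowError;

fn parse_interval_sketch() -> Result<(), ArrowError> {
    // 1 year + 2 months -> 14 months
    let months = parse_interval_year_month("1 year 2 months")?;
    assert_eq!(months, 14);
    // Day-level precision cannot be represented as a year-month interval
    assert!(parse_interval_year_month("1 day").is_err());
    Ok(())
}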
+ .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} decades as months in a signed 32-bit integer", + &amount.integer + )) + })?; Self::new(self.months.add_checked(months)?, self.days, self.nanos) } IntervalUnit::Year => { let months_int = amount.integer.mul_checked(12)?; let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION); - let months = - months_int - .add_checked(month_frac)? - .try_into() - .map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} years as months in a signed 32-bit integer", - &amount.integer - )) - })?; + let months = months_int + .add_checked(month_frac)? + .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} years as months in a signed 32-bit integer", + &amount.integer + )) + })?; Self::new(self.months.add_checked(months)?, self.days, self.nanos) } @@ -1090,8 +1051,7 @@ impl Interval { )) })?; - let nanos = - amount.frac * 7 * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + let nanos = amount.frac * 7 * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); Self::new( self.months, @@ -1107,8 +1067,7 @@ impl Interval { )) })?; - let nanos = - amount.frac * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + let nanos = amount.frac * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); Self::new( self.months, @@ -1118,8 +1077,7 @@ impl Interval { } IntervalUnit::Hour => { let nanos_int = amount.integer.mul_checked(NANOS_PER_HOUR)?; - let nanos_frac = - amount.frac * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + let nanos_frac = amount.frac * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); let nanos = nanos_int.add_checked(nanos_frac)?; Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) @@ -1398,8 +1356,7 @@ mod tests { "2030-12-04T17:11:10.123456", ]; for case in cases { - let chrono = - NaiveDateTime::parse_from_str(case, "%Y-%m-%dT%H:%M:%S%.f").unwrap(); + let chrono = NaiveDateTime::parse_from_str(case, "%Y-%m-%dT%H:%M:%S%.f").unwrap(); let custom = string_to_datetime(&Utc, case).unwrap(); assert_eq!(chrono, custom.naive_utc()) } @@ -1431,8 +1388,7 @@ mod tests { ]; for (s, ctx) in cases { - let expected = - format!("Parser error: Error parsing timestamp from '{s}': {ctx}"); + let expected = format!("Parser error: Error parsing timestamp from '{s}': {ctx}"); let actual = string_to_datetime(&Utc, s).unwrap_err().to_string(); assert_eq!(actual, expected) } @@ -1497,8 +1453,7 @@ mod tests { assert_eq!(local, "2020-09-08 15:42:29"); let dt = - NaiveDateTime::parse_from_str("2020-09-08T13:42:29Z", "%Y-%m-%dT%H:%M:%SZ") - .unwrap(); + NaiveDateTime::parse_from_str("2020-09-08T13:42:29Z", "%Y-%m-%dT%H:%M:%SZ").unwrap(); let local: Tz = "+08:00".parse().unwrap(); // Parsed as offset from UTC @@ -1629,10 +1584,7 @@ mod tests { // custom format assert_eq!( - Time64NanosecondType::parse_formatted( - "02 - 10 - 01 - .1234567", - "%H - %M - %S - %.f" - ), + Time64NanosecondType::parse_formatted("02 - 10 - 01 - .1234567", "%H - %M - %S - %.f"), Some(7_801_123_456_700) ); } @@ -1709,10 +1661,7 @@ mod tests { // custom format assert_eq!( - Time64MicrosecondType::parse_formatted( - "02 - 10 - 01 - .1234", - "%H - %M - %S - %.f" - ), + Time64MicrosecondType::parse_formatted("02 - 10 - 01 - .1234", "%H - %M - %S - %.f"), Some(7_801_123_400) ); } @@ -1759,10 +1708,7 @@ mod tests { // custom format assert_eq!( - Time32MillisecondType::parse_formatted( - "02 - 10 - 01 - .1", - "%H - %M - %S - %.f" - ), + Time32MillisecondType::parse_formatted("02 - 10 - 01 - .1", "%H - 
%M - %S - %.f"), Some(7_801_100) ); } @@ -2005,8 +1951,19 @@ mod tests { ); assert_eq!( - Interval::new(-13i32, -8i32, -NANOS_PER_HOUR - NANOS_PER_MINUTE - NANOS_PER_SECOND - (1.11_f64 * NANOS_PER_MILLIS as f64) as i64), - Interval::parse("-1 year -1 month -1 week -1 day -1 hour -1 minute -1 second -1.11 millisecond", &config).unwrap(), + Interval::new( + -13i32, + -8i32, + -NANOS_PER_HOUR + - NANOS_PER_MINUTE + - NANOS_PER_SECOND + - (1.11_f64 * NANOS_PER_MILLIS as f64) as i64 + ), + Interval::parse( + "-1 year -1 month -1 week -1 day -1 hour -1 minute -1 second -1.11 millisecond", + &config + ) + .unwrap(), ); } @@ -2280,22 +2237,34 @@ mod tests { let edge_tests_256 = [ ( "9999999999999999999999999999999999999999999999999999999999999999999999999999", -i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + i256::from_string( + "9999999999999999999999999999999999999999999999999999999999999999999999999999", + ) + .unwrap(), 0, ), ( "999999999999999999999999999999999999999999999999999999999999999999999999.9999", - i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + i256::from_string( + "9999999999999999999999999999999999999999999999999999999999999999999999999999", + ) + .unwrap(), 4, ), ( "99999999999999999999999999999999999999999999999999.99999999999999999999999999", - i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + i256::from_string( + "9999999999999999999999999999999999999999999999999999999999999999999999999999", + ) + .unwrap(), 26, ), ( "99999999999999999999999999999999999999999999999999", - i256::from_string("9999999999999999999999999999999999999999999999999900000000000000000000000000").unwrap(), + i256::from_string( + "9999999999999999999999999999999999999999999999999900000000000000000000000000", + ) + .unwrap(), 26, ), ]; diff --git a/arrow-cast/src/pretty.rs b/arrow-cast/src/pretty.rs index 59a9f9d605e2..550afa9f739d 100644 --- a/arrow-cast/src/pretty.rs +++ b/arrow-cast/src/pretty.rs @@ -25,9 +25,7 @@ use comfy_table::{Cell, Table}; use std::fmt::Display; /// Create a visual representation of record batches -pub fn pretty_format_batches( - results: &[RecordBatch], -) -> Result { +pub fn pretty_format_batches(results: &[RecordBatch]) -> Result { let options = FormatOptions::default().with_display_error(true); pretty_format_batches_with_options(results, &options) } @@ -70,10 +68,7 @@ pub fn print_columns(col_name: &str, results: &[ArrayRef]) -> Result<(), ArrowEr } /// Convert a series of record batches into a table -fn create_table( - results: &[RecordBatch], - options: &FormatOptions, -) -> Result { +fn create_table(results: &[RecordBatch], options: &FormatOptions) -> Result { let mut table = Table::new(); table.load_preset("||--+-++| ++++++"); @@ -209,8 +204,8 @@ mod tests { let table = pretty_format_columns("a", &columns).unwrap().to_string(); let expected = vec![ - "+---+", "| a |", "+---+", "| a |", "| b |", "| |", "| d |", "| e |", - "| |", "| g |", "+---+", + "+---+", "| a |", "+---+", "| a |", "| b |", "| |", "| d |", "| e |", "| |", + "| g |", "+---+", ]; let actual: Vec<&str> = table.lines().collect(); @@ -289,10 +284,8 @@ mod tests { #[test] fn test_pretty_format_fixed_size_list() { // define a schema. 
- let field_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, true)), - 3, - ); + let field_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 3); let schema = Arc::new(Schema::new(vec![Field::new("d1", field_type, true)])); let keys_builder = Int32Array::builder(3); @@ -383,10 +376,7 @@ mod tests { }; } - fn timestamp_batch( - timezone: &str, - value: T::Native, - ) -> RecordBatch { + fn timestamp_batch(timezone: &str, value: T::Native) -> RecordBatch { let mut builder = PrimitiveBuilder::::with_capacity(10); builder.append_value(value); builder.append_null(); @@ -621,8 +611,8 @@ mod tests { let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ - "+------+", "| f |", "+------+", "| 101 |", "| |", "| 200 |", - "| 3040 |", "+------+", + "+------+", "| f |", "+------+", "| 101 |", "| |", "| 200 |", "| 3040 |", + "+------+", ]; let actual: Vec<&str> = table.lines().collect(); @@ -660,16 +650,14 @@ mod tests { )), Arc::new(StructArray::from(vec![( Arc::new(Field::new("c121", DataType::Utf8, false)), - Arc::new(StringArray::from(vec![Some("e"), Some("f"), Some("g")])) - as ArrayRef, + Arc::new(StringArray::from(vec![Some("e"), Some("f"), Some("g")])) as ArrayRef, )])) as ArrayRef, ), ]); let c2 = StringArray::from(vec![Some("a"), Some("b"), Some("c")]); let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ @@ -705,8 +693,7 @@ mod tests { UnionMode::Dense, )]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(union)]).unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(union)]).unwrap(); let table = pretty_format_batches(&[batch]).unwrap().to_string(); let actual: Vec<&str> = table.lines().collect(); let expected = vec![ @@ -742,8 +729,7 @@ mod tests { UnionMode::Sparse, )]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(union)]).unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(union)]).unwrap(); let table = pretty_format_batches(&[batch]).unwrap().to_string(); let actual: Vec<&str> = table.lines().collect(); let expected = vec![ @@ -799,8 +785,7 @@ mod tests { UnionMode::Sparse, )]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(outer)]).unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(outer)]).unwrap(); let table = pretty_format_batches(&[batch]).unwrap().to_string(); let actual: Vec<&str> = table.lines().collect(); let expected = vec![ @@ -882,8 +867,7 @@ mod tests { let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ - "+------+", "| f16 |", "+------+", "| NaN |", "| 4 |", "| -inf |", - "+------+", + "+------+", "| f16 |", "+------+", "| NaN |", "| 4 |", "| -inf |", "+------+", ]; let actual: Vec<&str> = table.lines().collect(); @@ -986,9 +970,7 @@ mod tests { fn test_format_options() { let options = FormatOptions::default().with_null("null"); let array = Int32Array::from(vec![Some(1), Some(2), None, Some(3), Some(4)]); - let batch = - RecordBatch::try_from_iter([("my_column_name", Arc::new(array) as _)]) - .unwrap(); + let batch = RecordBatch::try_from_iter([("my_column_name", Arc::new(array) as _)]).unwrap(); let column = pretty_format_columns_with_options( "my_column_name", diff --git 
a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index a194b35ffa46..83c8965fdf8a 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -292,8 +292,7 @@ impl Format { let header_length = headers.len(); // keep track of inferred field types - let mut column_types: Vec = - vec![Default::default(); header_length]; + let mut column_types: Vec = vec![Default::default(); header_length]; let mut records_count = 0; @@ -307,9 +306,7 @@ impl Format { // Note since we may be looking at a sample of the data, we make the safe assumption that // they could be nullable - for (i, column_type) in - column_types.iter_mut().enumerate().take(header_length) - { + for (i, column_type) in column_types.iter_mut().enumerate().take(header_length) { if let Some(string) = record.get(i) { if !self.null_regex.is_null(string) { column_type.update(string) @@ -606,8 +603,7 @@ impl Decoder { return Ok(bytes); } - let to_read = - self.batch_size.min(self.end - self.line_number) - self.record_decoder.len(); + let to_read = self.batch_size.min(self.end - self.line_number) - self.record_decoder.len(); let (_, bytes) = self.record_decoder.decode(buf, to_read)?; Ok(bytes) } @@ -662,29 +658,23 @@ fn parse( let i = *i; let field = &fields[i]; match field.data_type() { - DataType::Boolean => { - build_boolean_array(line_number, rows, i, null_regex) - } - DataType::Decimal128(precision, scale) => { - build_decimal_array::( - line_number, - rows, - i, - *precision, - *scale, - null_regex, - ) - } - DataType::Decimal256(precision, scale) => { - build_decimal_array::( - line_number, - rows, - i, - *precision, - *scale, - null_regex, - ) - } + DataType::Boolean => build_boolean_array(line_number, rows, i, null_regex), + DataType::Decimal128(precision, scale) => build_decimal_array::( + line_number, + rows, + i, + *precision, + *scale, + null_regex, + ), + DataType::Decimal256(precision, scale) => build_decimal_array::( + line_number, + rows, + i, + *precision, + *scale, + null_regex, + ), DataType::Int8 => { build_primitive_array::(line_number, rows, i, null_regex) } @@ -721,34 +711,17 @@ fn parse( DataType::Date64 => { build_primitive_array::(line_number, rows, i, null_regex) } - DataType::Time32(TimeUnit::Second) => build_primitive_array::< - Time32SecondType, - >( - line_number, rows, i, null_regex - ), + DataType::Time32(TimeUnit::Second) => { + build_primitive_array::(line_number, rows, i, null_regex) + } DataType::Time32(TimeUnit::Millisecond) => { - build_primitive_array::( - line_number, - rows, - i, - null_regex, - ) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Time64(TimeUnit::Microsecond) => { - build_primitive_array::( - line_number, - rows, - i, - null_regex, - ) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Time64(TimeUnit::Nanosecond) => { - build_primitive_array::( - line_number, - rows, - i, - null_regex, - ) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Timestamp(TimeUnit::Second, tz) => { build_timestamp_array::( @@ -786,9 +759,7 @@ fn parse( null_regex, ) } - DataType::Null => { - Ok(Arc::new(NullArray::builder(rows.len()).finish()) as ArrayRef) - } + DataType::Null => Ok(Arc::new(NullArray::builder(rows.len()).finish()) as ArrayRef), DataType::Utf8 => Ok(Arc::new( rows.iter() .map(|row| { @@ -853,8 +824,7 @@ fn parse( }) .collect(); - let projected_fields: Fields = - projection.iter().map(|i| fields[*i].clone()).collect(); + let projected_fields: Fields = projection.iter().map(|i| 
fields[*i].clone()).collect(); let projected_schema = Arc::new(match metadata { None => Schema::new(projected_fields), @@ -898,8 +868,7 @@ fn build_decimal_array( // append null decimal_builder.append_null(); } else { - let decimal_value: Result = - parse_decimal::(s, precision, scale); + let decimal_value: Result = parse_decimal::(s, precision, scale); match decimal_value { Ok(v) => { decimal_builder.append_value(v); @@ -957,22 +926,10 @@ fn build_timestamp_array( Ok(Arc::new(match timezone { Some(timezone) => { let tz: Tz = timezone.parse()?; - build_timestamp_array_impl::( - line_number, - rows, - col_idx, - &tz, - null_regex, - )? - .with_timezone(timezone) + build_timestamp_array_impl::(line_number, rows, col_idx, &tz, null_regex)? + .with_timezone(timezone) } - None => build_timestamp_array_impl::( - line_number, - rows, - col_idx, - &Utc, - null_regex, - )?, + None => build_timestamp_array_impl::(line_number, rows, col_idx, &Utc, null_regex)?, })) } @@ -1169,10 +1126,7 @@ impl ReaderBuilder { } /// Create a new `BufReader` from a buffered reader - pub fn build_buffered( - self, - reader: R, - ) -> Result, ArrowError> { + pub fn build_buffered(self, reader: R) -> Result, ArrowError> { Ok(BufReader { reader, decoder: self.build_decoder(), @@ -1318,8 +1272,7 @@ mod tests { Field::new("lng", DataType::Float64, false), ]); - let file_with_headers = - File::open("test/data/uk_cities_with_headers.csv").unwrap(); + let file_with_headers = File::open("test/data/uk_cities_with_headers.csv").unwrap(); let file_without_headers = File::open("test/data/uk_cities.csv").unwrap(); let both_files = file_with_headers .chain(Cursor::new("\n".to_string())) @@ -1642,8 +1595,7 @@ mod tests { schema.field(5).data_type() ); - let names: Vec<&str> = - schema.fields().iter().map(|x| x.name().as_str()).collect(); + let names: Vec<&str> = schema.fields().iter().map(|x| x.name().as_str()).collect(); assert_eq!( names, vec![ @@ -1819,16 +1771,11 @@ mod tests { -2203932304000 ); assert_eq!( - Date64Type::parse_formatted("1900-02-28 12:34:56", "%Y-%m-%d %H:%M:%S") - .unwrap(), + Date64Type::parse_formatted("1900-02-28 12:34:56", "%Y-%m-%d %H:%M:%S").unwrap(), -2203932304000 ); assert_eq!( - Date64Type::parse_formatted( - "1900-02-28 12:34:56+0030", - "%Y-%m-%d %H:%M:%S%z" - ) - .unwrap(), + Date64Type::parse_formatted("1900-02-28 12:34:56+0030", "%Y-%m-%d %H:%M:%S%z").unwrap(), -2203932304000 - (30 * 60 * 1000) ); } @@ -1865,10 +1812,7 @@ mod tests { #[test] fn test_parse_timestamp() { - test_parse_timestamp_impl::( - None, - &[0, 0, -7_200_000_000_000], - ); + test_parse_timestamp_impl::(None, &[0, 0, -7_200_000_000_000]); test_parse_timestamp_impl::( Some("+00:00".into()), &[0, 0, -7_200_000_000_000], @@ -1885,10 +1829,7 @@ mod tests { Some("-03".into()), &[10_800_000, 0, -7_200_000], ); - test_parse_timestamp_impl::( - Some("-03".into()), - &[10_800, 0, -7_200], - ); + test_parse_timestamp_impl::(Some("-03".into()), &[10_800, 0, -7_200]); } #[test] @@ -2227,10 +2168,8 @@ mod tests { expected_rows ); - let buffered = std::io::BufReader::with_capacity( - capacity, - File::open(path).unwrap(), - ); + let buffered = + std::io::BufReader::with_capacity(capacity, File::open(path).unwrap()); let reader = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) diff --git a/arrow-csv/src/reader/records.rs b/arrow-csv/src/reader/records.rs index a59d02e0e2d8..877cfb3ee653 100644 --- a/arrow-csv/src/reader/records.rs +++ b/arrow-csv/src/reader/records.rs @@ -76,11 +76,7 @@ impl RecordDecoder { /// Decodes records 
from `input` returning the number of records and bytes read /// /// Note: this expects to be called with an empty `input` to signal EOF - pub fn decode( - &mut self, - input: &[u8], - to_read: usize, - ) -> Result<(usize, usize), ArrowError> { + pub fn decode(&mut self, input: &[u8], to_read: usize) -> Result<(usize, usize), ArrowError> { if to_read == 0 { return Ok((0, 0)); } @@ -124,11 +120,17 @@ impl RecordDecoder { // Need to allocate more capacity ReadRecordResult::OutputFull => break, ReadRecordResult::OutputEndsFull => { - return Err(ArrowError::CsvError(format!("incorrect number of fields for line {}, expected {} got more than {}", self.line_number, self.num_columns, self.current_field))); + return Err(ArrowError::CsvError(format!( + "incorrect number of fields for line {}, expected {} got more than {}", + self.line_number, self.num_columns, self.current_field + ))); } ReadRecordResult::Record => { if self.current_field != self.num_columns { - return Err(ArrowError::CsvError(format!("incorrect number of fields for line {}, expected {} got {}", self.line_number, self.num_columns, self.current_field))); + return Err(ArrowError::CsvError(format!( + "incorrect number of fields for line {}, expected {} got {}", + self.line_number, self.num_columns, self.current_field + ))); } read += 1; self.current_field = 0; @@ -334,8 +336,7 @@ mod tests { let mut decoder = RecordDecoder::new(Reader::new(), 2); let err = decoder.decode(csv.as_bytes(), 4).unwrap_err().to_string(); - let expected = - "Csv error: incorrect number of fields for line 3, expected 2 got 1"; + let expected = "Csv error: incorrect number of fields for line 3, expected 2 got 1"; assert_eq!(err, expected); diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 1ca956e2c73f..0bb76e536e67 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -389,18 +389,12 @@ mod tests { "consectetur adipiscing elit", "sed do eiusmod tempor", ]); - let c2 = PrimitiveArray::::from(vec![ - Some(123.564532), - None, - Some(-556132.25), - ]); + let c2 = + PrimitiveArray::::from(vec![Some(123.564532), None, Some(-556132.25)]); let c3 = PrimitiveArray::::from(vec![3, 2, 1]); let c4 = BooleanArray::from(vec![Some(true), Some(false), None]); - let c5 = TimestampMillisecondArray::from(vec![ - None, - Some(1555584887378), - Some(1555555555555), - ]); + let c5 = + TimestampMillisecondArray::from(vec![None, Some(1555584887378), Some(1555555555555)]); let c6 = Time32SecondArray::from(vec![1234, 24680, 85563]); let c7: DictionaryArray = vec!["cupcakes", "cupcakes", "foo"].into_iter().collect(); @@ -451,13 +445,11 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo Field::new("c2", DataType::Decimal256(76, 6), true), ]); - let mut c1_builder = - Decimal128Builder::new().with_data_type(DataType::Decimal128(38, 6)); + let mut c1_builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(38, 6)); c1_builder.extend(vec![Some(-3335724), Some(2179404), None, Some(290472)]); let c1 = c1_builder.finish(); - let mut c2_builder = - Decimal256Builder::new().with_data_type(DataType::Decimal256(76, 6)); + let mut c2_builder = Decimal256Builder::new().with_data_type(DataType::Decimal256(76, 6)); c2_builder.extend(vec![ Some(i256::from_i128(-3335724)), Some(i256::from_i128(2179404)), @@ -467,8 +459,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo let c2 = c2_builder.finish(); let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]) - .unwrap(); + 
RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); let mut file = tempfile::tempfile().unwrap(); @@ -512,11 +503,8 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo "consectetur adipiscing elit", "sed do eiusmod tempor", ]); - let c2 = PrimitiveArray::::from(vec![ - Some(123.564532), - None, - Some(-556132.25), - ]); + let c2 = + PrimitiveArray::::from(vec![Some(123.564532), None, Some(-556132.25)]); let c3 = PrimitiveArray::::from(vec![3, 2, 1]); let c4 = BooleanArray::from(vec![Some(true), Some(false), None]); let c6 = Time32SecondArray::from(vec![1234, 24680, 85563]); @@ -629,8 +617,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo let c0 = UInt32Array::from(vec![Some(123), Some(234)]); let c1 = Date64Array::from(vec![Some(1926632005177), Some(1926632005177685347)]); let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c0), Arc::new(c1)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c0), Arc::new(c1)]).unwrap(); let mut file = tempfile::tempfile().unwrap(); let mut writer = Writer::new(&mut file); @@ -656,15 +643,9 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo Field::new("c4", DataType::Time32(TimeUnit::Second), false), ]); - let c1 = TimestampMillisecondArray::from(vec![ - Some(1555584887378), - Some(1635577147000), - ]) - .with_timezone("+00:00".to_string()); - let c2 = TimestampMillisecondArray::from(vec![ - Some(1555584887378), - Some(1635577147000), - ]); + let c1 = TimestampMillisecondArray::from(vec![Some(1555584887378), Some(1635577147000)]) + .with_timezone("+00:00".to_string()); + let c2 = TimestampMillisecondArray::from(vec![Some(1555584887378), Some(1635577147000)]); let c3 = Date32Array::from(vec![3, 2]); let c4 = Time32SecondArray::from(vec![1234, 24680]); diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 5f87dddd4217..10c53c549e2b 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -42,9 +42,7 @@ pub(crate) fn contains_nulls( ) -> bool { match null_bit_buffer { Some(buffer) => { - match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len) - .next() - { + match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() { Some((start, end)) => start != 0 || end != len, None => len != 0, // No non-null values } @@ -130,9 +128,9 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff MutableBuffer::new(capacity * k.primitive_width().unwrap()), empty_buffer, ], - DataType::FixedSizeList(_, _) - | DataType::Struct(_) - | DataType::RunEndEncoded(_, _) => [empty_buffer, MutableBuffer::new(0)], + DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => { + [empty_buffer, MutableBuffer::new(0)] + } DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => [ MutableBuffer::new(capacity * mem::size_of::()), empty_buffer, @@ -159,10 +157,9 @@ pub(crate) fn into_buffers( ) -> Vec { match data_type { DataType::Null | DataType::Struct(_) | DataType::FixedSizeList(_, _) => vec![], - DataType::Utf8 - | DataType::Binary - | DataType::LargeUtf8 - | DataType::LargeBinary => vec![buffer1.into(), buffer2.into()], + DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => { + vec![buffer1.into(), buffer2.into()] + } DataType::Union(_, mode) => { match mode { // Based on Union's DataTypeLayout @@ -452,12 +449,11 @@ impl ArrayData { for spec in layout.buffers.iter() { match spec { 
BufferSpec::FixedWidth { byte_width, .. } => { - let buffer_size = - self.len.checked_mul(*byte_width).ok_or_else(|| { - ArrowError::ComputeError( - "Integer overflow computing buffer size".to_string(), - ) - })?; + let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| { + ArrowError::ComputeError( + "Integer overflow computing buffer size".to_string(), + ) + })?; result += buffer_size; } BufferSpec::VariableWidth => { @@ -590,9 +586,7 @@ impl ArrayData { DataType::LargeBinary | DataType::LargeUtf8 => { (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true) } - DataType::FixedSizeBinary(i) => { - (vec![zeroed(*i as usize * len)], vec![], true) - } + DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true), DataType::List(f) | DataType::Map(f, _) => ( vec![zeroed((len + 1) * 4)], vec![ArrayData::new_empty(f.data_type())], @@ -749,9 +743,7 @@ impl ArrayData { ))); } - for (i, (buffer, spec)) in - self.buffers.iter().zip(layout.buffers.iter()).enumerate() - { + for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() { match spec { BufferSpec::FixedWidth { byte_width, @@ -999,10 +991,8 @@ impl ArrayData { } DataType::RunEndEncoded(run_ends_field, values_field) => { self.validate_num_child_data(2)?; - let run_ends_data = - self.get_valid_child_data(0, run_ends_field.data_type())?; - let values_data = - self.get_valid_child_data(1, values_field.data_type())?; + let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?; + let values_data = self.get_valid_child_data(1, values_field.data_type())?; if run_ends_data.len != values_data.len { return Err(ArrowError::InvalidArgumentError(format!( "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}", @@ -1022,9 +1012,7 @@ impl ArrayData { for (i, (_, field)) in fields.iter().enumerate() { let field_data = self.get_valid_child_data(i, field.data_type())?; - if mode == &UnionMode::Sparse - && field_data.len < (self.len + self.offset) - { + if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) { return Err(ArrowError::InvalidArgumentError(format!( "Sparse union child array #{} has length smaller than expected for union array ({} < {})", i, field_data.len, self.len + self.offset @@ -1083,14 +1071,14 @@ impl ArrayData { i: usize, expected_type: &DataType, ) -> Result<&ArrayData, ArrowError> { - let values_data = self.child_data - .get(i) - .ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "{} did not have enough child arrays. Expected at least {} but had only {}", - self.data_type, i+1, self.child_data.len() - )) - })?; + let values_data = self.child_data.get(i).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "{} did not have enough child arrays. 
Expected at least {} but had only {}", + self.data_type, + i + 1, + self.child_data.len() + )) + })?; if expected_type != &values_data.data_type { return Err(ArrowError::InvalidArgumentError(format!( @@ -1160,7 +1148,8 @@ impl ArrayData { if actual != nulls.null_count() { return Err(ArrowError::InvalidArgumentError(format!( "null_count value ({}) doesn't match actual number of nulls in array ({})", - nulls.null_count(), actual + nulls.null_count(), + actual ))); } } @@ -1209,23 +1198,22 @@ impl ArrayData { ) -> Result<(), ArrowError> { let mask = match mask { Some(mask) => mask, - None => return match child.null_count() { - 0 => Ok(()), - _ => Err(ArrowError::InvalidArgumentError(format!( - "non-nullable child of type {} contains nulls not present in parent {}", - child.data_type, - self.data_type - ))), - }, + None => { + return match child.null_count() { + 0 => Ok(()), + _ => Err(ArrowError::InvalidArgumentError(format!( + "non-nullable child of type {} contains nulls not present in parent {}", + child.data_type, self.data_type + ))), + } + } }; match child.nulls() { - Some(nulls) if !mask.contains(nulls) => { - Err(ArrowError::InvalidArgumentError(format!( - "non-nullable child of type {} contains nulls not present in parent", - child.data_type - ))) - } + Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!( + "non-nullable child of type {} contains nulls not present in parent", + child.data_type + ))), _ => Ok(()), } } @@ -1240,9 +1228,7 @@ impl ArrayData { DataType::Utf8 => self.validate_utf8::(), DataType::LargeUtf8 => self.validate_utf8::(), DataType::Binary => self.validate_offsets_full::(self.buffers[1].len()), - DataType::LargeBinary => { - self.validate_offsets_full::(self.buffers[1].len()) - } + DataType::LargeBinary => self.validate_offsets_full::(self.buffers[1].len()), DataType::List(_) | DataType::Map(_, _) => { let child = &self.child_data[0]; self.validate_offsets_full::(child.len) @@ -1300,11 +1286,7 @@ impl ArrayData { /// /// For example, the offsets buffer contained `[1, 2, 4]`, this /// function would call `validate([1,2])`, and `validate([2,4])` - fn validate_each_offset( - &self, - offset_limit: usize, - validate: V, - ) -> Result<(), ArrowError> + fn validate_each_offset(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError> where T: ArrowNativeType + TryInto + num::Num + std::fmt::Display, V: Fn(usize, Range) -> Result<(), ArrowError>, @@ -1358,32 +1340,26 @@ impl ArrayData { let values_buffer = &self.buffers[1].as_slice(); if let Ok(values_str) = std::str::from_utf8(values_buffer) { // Validate Offsets are correct - self.validate_each_offset::( - values_buffer.len(), - |string_index, range| { - if !values_str.is_char_boundary(range.start) - || !values_str.is_char_boundary(range.end) - { - return Err(ArrowError::InvalidArgumentError(format!( - "incomplete utf-8 byte sequence from index {string_index}" - ))); - } - Ok(()) - }, - ) + self.validate_each_offset::(values_buffer.len(), |string_index, range| { + if !values_str.is_char_boundary(range.start) + || !values_str.is_char_boundary(range.end) + { + return Err(ArrowError::InvalidArgumentError(format!( + "incomplete utf-8 byte sequence from index {string_index}" + ))); + } + Ok(()) + }) } else { // find specific offset that failed utf8 validation - self.validate_each_offset::( - values_buffer.len(), - |string_index, range| { - std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| { - ArrowError::InvalidArgumentError(format!( - "Invalid UTF8 sequence at string 
index {string_index} ({range:?}): {e}" - )) - })?; - Ok(()) - }, - ) + self.validate_each_offset::(values_buffer.len(), |string_index, range| { + std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}" + )) + })?; + Ok(()) + }) } } @@ -1414,8 +1390,7 @@ impl ArrayData { assert!(buffer.len() / mem::size_of::() >= required_len); // Justification: buffer size was validated above - let indexes: &[T] = - &buffer.typed_data::()[self.offset..self.offset + self.len]; + let indexes: &[T] = &buffer.typed_data::()[self.offset..self.offset + self.len]; indexes.iter().enumerate().try_for_each(|(i, &dict_index)| { // Do not check the value is null (value can be arbitrary) diff --git a/arrow-data/src/decimal.rs b/arrow-data/src/decimal.rs index f74ab880d478..74279bfb9af1 100644 --- a/arrow-data/src/decimal.rs +++ b/arrow-data/src/decimal.rs @@ -19,8 +19,8 @@ use arrow_buffer::i256; use arrow_schema::ArrowError; pub use arrow_schema::{ - DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, - DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, + DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, + DECIMAL_DEFAULT_SCALE, }; // MAX decimal256 value of little-endian format for each precision. @@ -28,308 +28,308 @@ pub use arrow_schema::{ // is encoded to the 32-byte width format of little-endian. pub(crate) const MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [i256; 76] = [ i256::from_le_bytes([ - 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, ]), i256::from_le_bytes([ - 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, + 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, ]), i256::from_le_bytes([ - 231, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, + 231, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, ]), i256::from_le_bytes([ - 15, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, + 15, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, ]), i256::from_le_bytes([ - 159, 134, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, + 159, 134, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, ]), i256::from_le_bytes([ - 63, 66, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, + 63, 66, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, ]), i256::from_le_bytes([ - 127, 150, 152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, + 127, 150, 152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 224, 245, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, + 255, 224, 245, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 201, 154, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, + 255, 
201, 154, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 227, 11, 84, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, + 255, 227, 11, 84, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 231, 118, 72, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + 255, 231, 118, 72, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 15, 165, 212, 232, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + 255, 15, 165, 212, 232, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 159, 114, 78, 24, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + 255, 159, 114, 78, 24, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 63, 122, 16, 243, 90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + 255, 63, 122, 16, 243, 90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 127, 198, 164, 126, 141, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 127, 198, 164, 126, 141, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 192, 111, 242, 134, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 192, 111, 242, 134, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 137, 93, 120, 69, 99, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 137, 93, 120, 69, 99, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 99, 167, 179, 182, 224, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 99, 167, 179, 182, 224, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 231, 137, 4, 35, 199, 138, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 231, 137, 4, 35, 199, 138, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 15, 99, 45, 94, 199, 107, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 15, 99, 45, 94, 199, 107, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 159, 222, 197, 173, 201, 53, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 159, 222, 197, 173, 201, 53, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 63, 178, 186, 201, 224, 25, 30, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 63, 178, 186, 201, 224, 25, 30, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 127, 246, 74, 225, 199, 2, 45, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 127, 246, 74, 225, 199, 2, 45, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 160, 237, 204, 206, 27, 194, 211, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 160, 237, 204, 206, 27, 194, 211, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 73, 72, 1, 20, 22, 149, 69, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 73, 72, 1, 20, 22, 149, 69, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 227, 210, 12, 200, 220, 210, 183, 82, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 227, 210, 12, 200, 220, 210, 183, 82, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 231, 60, 128, 208, 159, 60, 46, 59, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 231, 60, 128, 208, 159, 60, 46, 59, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 15, 97, 2, 37, 62, 94, 206, 79, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 15, 97, 2, 37, 62, 94, 206, 79, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 159, 202, 23, 114, 109, 174, 15, 30, 67, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 159, 202, 23, 114, 109, 174, 15, 30, 67, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 63, 234, 237, 116, 70, 208, 156, 44, 159, 12, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 63, 234, 237, 116, 70, 208, 156, 44, 159, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 127, 38, 75, 145, 192, 34, 32, 190, 55, 126, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 127, 38, 75, 145, 192, 34, 32, 190, 55, 126, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 128, 239, 172, 133, 91, 65, 109, 45, 238, 4, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 128, 239, 172, 133, 91, 65, 109, 45, 238, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 9, 91, 193, 56, 147, 141, 68, 198, 77, 49, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 9, 91, 193, 56, 147, 141, 68, 198, 77, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 99, 142, 141, 55, 192, 135, 173, 190, 9, 237, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 99, 142, 141, 55, 192, 135, 173, 190, 9, 237, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 231, 143, 135, 43, 130, 77, 199, 114, 97, 66, 19, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 231, 143, 135, 43, 130, 77, 199, 114, 97, 66, 19, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 15, 159, 75, 179, 21, 7, 201, 123, 206, 151, 192, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 15, 159, 75, 179, 21, 7, 201, 123, 206, 151, 192, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 159, 
54, 244, 0, 217, 70, 218, 213, 16, 238, 133, 7, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 159, 54, 244, 0, 217, 70, 218, 213, 16, 238, 133, 7, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 63, 34, 138, 9, 122, 196, 134, 90, 168, 76, 59, 75, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 63, 34, 138, 9, 122, 196, 134, 90, 168, 76, 59, 75, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 127, 86, 101, 95, 196, 172, 67, 137, 147, 254, 80, 240, 2, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 127, 86, 101, 95, 196, 172, 67, 137, 147, 254, 80, 240, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 96, 245, 185, 171, 191, 164, 92, 195, 241, 41, 99, 29, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 96, 245, 185, 171, 191, 164, 92, 195, 241, 41, 99, 29, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 201, 149, 67, 181, 124, 111, 158, 161, 113, 163, 223, - 37, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 201, 149, 67, 181, 124, 111, 158, 161, 113, 163, 223, 37, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 227, 217, 163, 20, 223, 90, 48, 80, 112, 98, 188, 122, - 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 227, 217, 163, 20, 223, 90, 48, 80, 112, 98, 188, 122, 11, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 231, 130, 102, 206, 182, 140, 227, 33, 99, 216, 91, 203, - 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 231, 130, 102, 206, 182, 140, 227, 33, 99, 216, 91, 203, 114, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 15, 29, 1, 16, 36, 127, 227, 82, 223, 115, 150, 241, - 123, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 15, 29, 1, 16, 36, 127, 227, 82, 223, 115, 150, 241, 123, 4, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 159, 34, 11, 160, 104, 247, 226, 60, 185, 134, 224, 111, - 215, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 159, 34, 11, 160, 104, 247, 226, 60, 185, 134, 224, 111, 215, 44, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 63, 90, 111, 64, 22, 170, 221, 96, 60, 67, 197, 94, 106, - 192, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 63, 90, 111, 64, 22, 170, 221, 96, 60, 67, 197, 94, 106, 192, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 127, 134, 89, 132, 222, 164, 168, 200, 91, 160, 180, - 179, 39, 132, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 127, 134, 89, 132, 222, 164, 168, 200, 91, 160, 180, 179, 39, 132, + 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 64, 127, 43, 177, 112, 150, 214, 149, 67, 14, 5, - 141, 41, 175, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 64, 127, 43, 177, 112, 150, 214, 149, 67, 14, 5, 141, 41, + 175, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 137, 248, 178, 235, 102, 224, 97, 218, 163, 142, - 50, 130, 159, 215, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 
255, 255, 255, 255, 255, 255, 137, 248, 178, 235, 102, 224, 97, 218, 163, 142, 50, 130, + 159, 215, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 99, 181, 253, 52, 5, 196, 210, 135, 102, 146, 249, - 21, 59, 108, 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 99, 181, 253, 52, 5, 196, 210, 135, 102, 146, 249, 21, 59, + 108, 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 231, 21, 233, 17, 52, 168, 59, 78, 1, 184, 191, - 219, 78, 58, 172, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 231, 21, 233, 17, 52, 168, 59, 78, 1, 184, 191, 219, 78, 58, + 172, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 15, 219, 26, 179, 8, 146, 84, 14, 13, 48, 125, 149, - 20, 71, 186, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 15, 219, 26, 179, 8, 146, 84, 14, 13, 48, 125, 149, 20, 71, + 186, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 159, 142, 12, 255, 86, 180, 77, 143, 130, 224, 227, - 214, 205, 198, 70, 11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 159, 142, 12, 255, 86, 180, 77, 143, 130, 224, 227, 214, 205, + 198, 70, 11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 63, 146, 125, 246, 101, 11, 9, 153, 25, 197, 230, - 100, 10, 196, 195, 112, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 63, 146, 125, 246, 101, 11, 9, 153, 25, 197, 230, 100, 10, + 196, 195, 112, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 127, 182, 231, 160, 251, 113, 90, 250, 255, 178, 3, - 241, 103, 168, 165, 103, 104, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 127, 182, 231, 160, 251, 113, 90, 250, 255, 178, 3, 241, 103, + 168, 165, 103, 104, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 32, 13, 73, 212, 115, 136, 199, 255, 253, 36, - 106, 15, 148, 120, 12, 20, 4, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 32, 13, 73, 212, 115, 136, 199, 255, 253, 36, 106, 15, + 148, 120, 12, 20, 4, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 73, 131, 218, 74, 134, 84, 203, 253, 235, 113, - 37, 154, 200, 181, 124, 200, 40, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 73, 131, 218, 74, 134, 84, 203, 253, 235, 113, 37, 154, + 200, 181, 124, 200, 40, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 227, 32, 137, 236, 62, 77, 241, 233, 55, 115, - 118, 5, 214, 25, 223, 212, 151, 1, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 227, 32, 137, 236, 62, 77, 241, 233, 55, 115, 118, 5, + 214, 25, 223, 212, 151, 1, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 231, 72, 91, 61, 117, 4, 109, 35, 47, 128, - 160, 54, 92, 2, 183, 80, 238, 15, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 231, 72, 91, 61, 117, 4, 109, 35, 47, 128, 160, 54, 92, + 2, 183, 80, 238, 15, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 15, 217, 144, 101, 148, 44, 66, 98, 215, 1, - 69, 34, 154, 23, 38, 39, 79, 159, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 15, 217, 144, 101, 148, 44, 66, 98, 215, 1, 69, 34, 154, + 23, 38, 39, 79, 159, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 159, 122, 168, 247, 203, 189, 
149, 214, 105, - 18, 178, 86, 5, 236, 124, 135, 23, 57, 6, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 159, 122, 168, 247, 203, 189, 149, 214, 105, 18, 178, + 86, 5, 236, 124, 135, 23, 57, 6, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 63, 202, 148, 172, 247, 105, 217, 97, 34, 184, - 244, 98, 53, 56, 225, 74, 235, 58, 62, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 63, 202, 148, 172, 247, 105, 217, 97, 34, 184, 244, 98, + 53, 56, 225, 74, 235, 58, 62, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 127, 230, 207, 189, 172, 35, 126, 210, 87, 49, - 143, 221, 21, 50, 204, 236, 48, 77, 110, 2, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 127, 230, 207, 189, 172, 35, 126, 210, 87, 49, 143, 221, + 21, 50, 204, 236, 48, 77, 110, 2, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 0, 31, 106, 191, 100, 237, 56, 110, 237, - 151, 167, 218, 244, 249, 63, 233, 3, 79, 24, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 0, 31, 106, 191, 100, 237, 56, 110, 237, 151, 167, + 218, 244, 249, 63, 233, 3, 79, 24, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 9, 54, 37, 122, 239, 69, 57, 78, 70, 239, - 139, 138, 144, 195, 127, 28, 39, 22, 243, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 9, 54, 37, 122, 239, 69, 57, 78, 70, 239, 139, 138, + 144, 195, 127, 28, 39, 22, 243, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 99, 28, 116, 197, 90, 187, 60, 14, 191, - 88, 119, 105, 165, 163, 253, 28, 135, 221, 126, 9, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 99, 28, 116, 197, 90, 187, 60, 14, 191, 88, 119, + 105, 165, 163, 253, 28, 135, 221, 126, 9, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 231, 27, 137, 182, 139, 81, 95, 142, 118, - 119, 169, 30, 118, 100, 232, 33, 71, 167, 244, 94, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 231, 27, 137, 182, 139, 81, 95, 142, 118, 119, 169, + 30, 118, 100, 232, 33, 71, 167, 244, 94, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 15, 23, 91, 33, 117, 47, 185, 143, 161, - 170, 158, 50, 157, 236, 19, 83, 199, 136, 142, 181, 3, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 15, 23, 91, 33, 117, 47, 185, 143, 161, 170, 158, + 50, 157, 236, 19, 83, 199, 136, 142, 181, 3, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 159, 230, 142, 77, 147, 218, 59, 157, 79, - 170, 50, 250, 35, 62, 199, 62, 201, 87, 145, 23, 37, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 159, 230, 142, 77, 147, 218, 59, 157, 79, 170, 50, + 250, 35, 62, 199, 62, 201, 87, 145, 23, 37, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 63, 2, 149, 7, 193, 137, 86, 36, 28, 167, - 250, 197, 103, 109, 200, 115, 220, 109, 173, 235, 114, 1, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 63, 2, 149, 7, 193, 137, 86, 36, 28, 167, 250, 197, + 103, 109, 200, 115, 220, 109, 173, 235, 114, 1, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 127, 22, 210, 75, 138, 97, 97, 107, 25, - 135, 202, 187, 13, 70, 212, 133, 156, 74, 198, 52, 125, 14, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 127, 22, 210, 75, 138, 97, 97, 107, 25, 135, 202, + 187, 13, 70, 212, 133, 156, 74, 198, 52, 125, 14, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 224, 52, 246, 102, 207, 205, 49, - 254, 70, 233, 85, 137, 188, 
74, 58, 29, 234, 190, 15, 228, 144, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 224, 52, 246, 102, 207, 205, 49, 254, 70, 233, + 85, 137, 188, 74, 58, 29, 234, 190, 15, 228, 144, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 201, 16, 158, 5, 26, 10, 242, 237, - 197, 28, 91, 93, 93, 235, 70, 36, 37, 117, 157, 232, 168, 5, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 201, 16, 158, 5, 26, 10, 242, 237, 197, 28, + 91, 93, 93, 235, 70, 36, 37, 117, 157, 232, 168, 5, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 227, 167, 44, 56, 4, 101, 116, 75, - 187, 31, 143, 165, 165, 49, 197, 106, 115, 147, 38, 22, 153, 56, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 227, 167, 44, 56, 4, 101, 116, 75, 187, 31, + 143, 165, 165, 49, 197, 106, 115, 147, 38, 22, 153, 56, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 231, 142, 190, 49, 42, 242, 139, - 242, 80, 61, 151, 119, 120, 240, 179, 43, 130, 194, 129, 221, 250, 53, 2, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 231, 142, 190, 49, 42, 242, 139, 242, 80, 61, + 151, 119, 120, 240, 179, 43, 130, 194, 129, 221, 250, 53, 2, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 15, 149, 113, 241, 165, 117, 119, - 121, 41, 101, 232, 171, 180, 100, 7, 181, 21, 153, 17, 167, 204, 27, 22, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 15, 149, 113, 241, 165, 117, 119, 121, 41, + 101, 232, 171, 180, 100, 7, 181, 21, 153, 17, 167, 204, 27, 22, ]), ]; @@ -338,308 +338,308 @@ pub(crate) const MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [i256; 76] = [ // is encoded to the 76-byte width format of little-endian. pub(crate) const MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [i256; 76] = [ i256::from_le_bytes([ - 247, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 247, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 157, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 157, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 25, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 25, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 241, 216, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 241, 216, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 97, 121, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 97, 121, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ 
- 193, 189, 240, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 193, 189, 240, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 129, 105, 103, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 129, 105, 103, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 31, 10, 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 31, 10, 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 54, 101, 196, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 54, 101, 196, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 28, 244, 171, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 28, 244, 171, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 24, 137, 183, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 24, 137, 183, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 240, 90, 43, 23, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 240, 90, 43, 23, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 96, 141, 177, 231, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 96, 141, 177, 231, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 192, 133, 239, 12, 165, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 192, 133, 239, 12, 165, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 128, 57, 91, 129, 114, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 128, 57, 91, 129, 114, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 63, 144, 13, 121, 220, 255, 255, 255, 255, 255, 255, 255, 255, 
255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 63, 144, 13, 121, 220, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 118, 162, 135, 186, 156, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 118, 162, 135, 186, 156, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 156, 88, 76, 73, 31, 242, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 156, 88, 76, 73, 31, 242, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 24, 118, 251, 220, 56, 117, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 24, 118, 251, 220, 56, 117, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 240, 156, 210, 161, 56, 148, 250, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 240, 156, 210, 161, 56, 148, 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 96, 33, 58, 82, 54, 202, 201, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 96, 33, 58, 82, 54, 202, 201, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 192, 77, 69, 54, 31, 230, 225, 253, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 192, 77, 69, 54, 31, 230, 225, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 128, 9, 181, 30, 56, 253, 210, 234, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 128, 9, 181, 30, 56, 253, 210, 234, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 95, 18, 51, 49, 228, 61, 44, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 95, 18, 51, 49, 228, 61, 44, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 182, 183, 254, 235, 233, 106, 186, 247, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 182, 183, 254, 235, 233, 106, 186, 247, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 28, 45, 243, 55, 35, 45, 72, 173, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 28, 45, 243, 55, 35, 45, 72, 173, 255, 255, 255, 255, 255, 255, 
255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 24, 195, 127, 47, 96, 195, 209, 196, 252, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 24, 195, 127, 47, 96, 195, 209, 196, 252, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 240, 158, 253, 218, 193, 161, 49, 176, 223, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 240, 158, 253, 218, 193, 161, 49, 176, 223, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 96, 53, 232, 141, 146, 81, 240, 225, 188, 254, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 96, 53, 232, 141, 146, 81, 240, 225, 188, 254, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 192, 21, 18, 139, 185, 47, 99, 211, 96, 243, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 192, 21, 18, 139, 185, 47, 99, 211, 96, 243, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 128, 217, 180, 110, 63, 221, 223, 65, 200, 129, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 128, 217, 180, 110, 63, 221, 223, 65, 200, 129, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 127, 16, 83, 122, 164, 190, 146, 210, 17, 251, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 127, 16, 83, 122, 164, 190, 146, 210, 17, 251, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 246, 164, 62, 199, 108, 114, 187, 57, 178, 206, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 246, 164, 62, 199, 108, 114, 187, 57, 178, 206, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 156, 113, 114, 200, 63, 120, 82, 65, 246, 18, 254, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 156, 113, 114, 200, 63, 120, 82, 65, 246, 18, 254, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 24, 112, 120, 212, 125, 178, 56, 141, 158, 189, 236, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 24, 112, 120, 212, 125, 178, 56, 141, 158, 189, 236, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 240, 96, 180, 76, 234, 248, 54, 132, 49, 104, 63, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 240, 96, 180, 76, 234, 248, 54, 132, 49, 104, 63, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 96, 201, 11, 255, 38, 185, 37, 42, 239, 17, 122, 248, 255, 255, 255, - 255, 255, 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 96, 201, 11, 255, 38, 185, 37, 42, 239, 17, 122, 248, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 192, 221, 117, 246, 133, 59, 121, 165, 87, 179, 196, 180, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 192, 221, 117, 246, 133, 59, 121, 165, 87, 179, 196, 180, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 128, 169, 154, 160, 59, 83, 188, 118, 108, 1, 175, 15, 253, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 128, 169, 154, 160, 59, 83, 188, 118, 108, 1, 175, 15, 253, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 159, 10, 70, 84, 64, 91, 163, 60, 14, 214, 156, 226, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 159, 10, 70, 84, 64, 91, 163, 60, 14, 214, 156, 226, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 54, 106, 188, 74, 131, 144, 97, 94, 142, 92, 32, 218, 254, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 54, 106, 188, 74, 131, 144, 97, 94, 142, 92, 32, 218, 254, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 28, 38, 92, 235, 32, 165, 207, 175, 143, 157, 67, 133, 244, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 28, 38, 92, 235, 32, 165, 207, 175, 143, 157, 67, 133, 244, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 24, 125, 153, 49, 73, 115, 28, 222, 156, 39, 164, 52, 141, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 24, 125, 153, 49, 73, 115, 28, 222, 156, 39, 164, 52, 141, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 240, 226, 254, 239, 219, 128, 28, 173, 32, 140, 105, 14, 132, 251, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 240, 226, 254, 239, 219, 128, 28, 173, 32, 140, 105, 14, 132, 251, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 96, 221, 244, 95, 151, 8, 29, 195, 70, 121, 31, 144, 40, 211, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 96, 221, 244, 95, 151, 8, 29, 195, 70, 121, 31, 144, 40, 211, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 192, 165, 144, 191, 233, 85, 34, 159, 195, 188, 58, 161, 149, 63, - 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 192, 165, 144, 191, 233, 85, 34, 159, 195, 188, 58, 161, 149, 63, 254, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 128, 121, 166, 123, 33, 91, 87, 55, 164, 95, 75, 76, 216, 123, - 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 128, 121, 166, 123, 33, 91, 87, 55, 164, 95, 75, 76, 216, 123, 238, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 191, 128, 212, 78, 143, 105, 41, 106, 188, 241, 250, 114, 214, - 80, 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 191, 128, 212, 78, 143, 105, 41, 106, 188, 241, 250, 114, 214, 80, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 118, 7, 77, 20, 153, 31, 158, 37, 92, 113, 205, 125, 96, 40, - 249, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 118, 7, 77, 20, 153, 31, 158, 37, 92, 113, 205, 125, 96, 40, 249, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 156, 74, 2, 203, 250, 59, 45, 120, 153, 109, 6, 234, 196, 147, - 187, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 156, 74, 2, 203, 250, 59, 45, 120, 153, 109, 6, 234, 196, 147, 187, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 24, 234, 22, 238, 203, 87, 196, 177, 254, 71, 64, 36, 177, 197, - 83, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 24, 234, 22, 238, 203, 87, 196, 177, 254, 71, 64, 36, 177, 197, 83, 253, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 240, 36, 229, 76, 247, 109, 171, 241, 242, 207, 130, 106, 235, - 184, 69, 229, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 240, 36, 229, 76, 247, 109, 171, 241, 242, 207, 130, 106, 235, 184, 69, + 229, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 96, 113, 243, 0, 169, 75, 178, 112, 125, 31, 28, 41, 50, 57, - 185, 244, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 96, 113, 243, 0, 169, 75, 178, 112, 125, 31, 28, 41, 50, 57, 185, 244, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 192, 109, 130, 9, 154, 244, 246, 102, 230, 58, 25, 155, 245, - 59, 60, 143, 245, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 192, 109, 130, 9, 154, 244, 246, 102, 230, 58, 25, 155, 245, 59, 60, 143, + 245, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 128, 73, 24, 95, 4, 142, 165, 5, 0, 77, 252, 14, 152, 87, 90, - 152, 151, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 128, 73, 24, 95, 4, 142, 165, 5, 0, 77, 252, 14, 152, 87, 90, 152, 151, + 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 223, 242, 182, 43, 140, 119, 56, 0, 2, 219, 149, 240, 107, - 135, 243, 235, 251, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 223, 242, 182, 43, 140, 119, 56, 0, 2, 219, 149, 240, 107, 135, 243, + 235, 251, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 182, 124, 37, 181, 121, 171, 52, 2, 20, 142, 218, 101, 55, - 74, 131, 55, 215, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 182, 124, 37, 181, 121, 171, 52, 2, 20, 142, 218, 101, 55, 74, 131, + 55, 215, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 28, 223, 118, 19, 193, 178, 14, 22, 200, 140, 137, 250, 41, - 230, 32, 43, 104, 254, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 28, 223, 118, 19, 193, 178, 14, 22, 200, 140, 137, 250, 41, 230, 32, + 43, 104, 254, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 24, 183, 164, 194, 138, 251, 146, 220, 208, 127, 95, 201, - 163, 253, 72, 175, 17, 240, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 24, 183, 164, 
194, 138, 251, 146, 220, 208, 127, 95, 201, 163, 253, + 72, 175, 17, 240, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 240, 38, 111, 154, 107, 211, 189, 157, 40, 254, 186, 221, - 101, 232, 217, 216, 176, 96, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 240, 38, 111, 154, 107, 211, 189, 157, 40, 254, 186, 221, 101, 232, + 217, 216, 176, 96, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 96, 133, 87, 8, 52, 66, 106, 41, 150, 237, 77, 169, 250, 19, - 131, 120, 232, 198, 249, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 96, 133, 87, 8, 52, 66, 106, 41, 150, 237, 77, 169, 250, 19, 131, 120, + 232, 198, 249, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 192, 53, 107, 83, 8, 150, 38, 158, 221, 71, 11, 157, 202, - 199, 30, 181, 20, 197, 193, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 192, 53, 107, 83, 8, 150, 38, 158, 221, 71, 11, 157, 202, 199, 30, + 181, 20, 197, 193, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 128, 25, 48, 66, 83, 220, 129, 45, 168, 206, 112, 34, 234, - 205, 51, 19, 207, 178, 145, 253, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 128, 25, 48, 66, 83, 220, 129, 45, 168, 206, 112, 34, 234, 205, 51, + 19, 207, 178, 145, 253, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 255, 224, 149, 64, 155, 18, 199, 145, 18, 104, 88, 37, - 11, 6, 192, 22, 252, 176, 231, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 255, 224, 149, 64, 155, 18, 199, 145, 18, 104, 88, 37, 11, 6, 192, + 22, 252, 176, 231, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 246, 201, 218, 133, 16, 186, 198, 177, 185, 16, 116, 117, - 111, 60, 128, 227, 216, 233, 12, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 246, 201, 218, 133, 16, 186, 198, 177, 185, 16, 116, 117, 111, 60, + 128, 227, 216, 233, 12, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 156, 227, 139, 58, 165, 68, 195, 241, 64, 167, 136, 150, - 90, 92, 2, 227, 120, 34, 129, 246, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 156, 227, 139, 58, 165, 68, 195, 241, 64, 167, 136, 150, 90, 92, 2, + 227, 120, 34, 129, 246, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 24, 228, 118, 73, 116, 174, 160, 113, 137, 136, 86, 225, - 137, 155, 23, 222, 184, 88, 11, 161, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 24, 228, 118, 73, 116, 174, 160, 113, 137, 136, 86, 225, 137, 155, + 23, 222, 184, 88, 11, 161, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 240, 232, 164, 222, 138, 208, 70, 112, 94, 85, 97, 205, - 98, 19, 236, 172, 56, 119, 113, 74, 252, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 240, 232, 164, 222, 138, 208, 70, 112, 94, 85, 97, 205, 98, 19, + 236, 172, 56, 119, 113, 74, 252, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 96, 25, 113, 178, 108, 37, 196, 98, 176, 85, 205, 5, 220, - 193, 56, 193, 54, 168, 110, 232, 218, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 96, 25, 113, 178, 108, 37, 196, 98, 176, 85, 205, 5, 220, 193, 56, + 193, 54, 168, 110, 232, 218, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 192, 253, 106, 248, 62, 118, 169, 219, 227, 88, 5, 58, - 152, 146, 55, 140, 35, 146, 82, 20, 141, 254, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 192, 253, 106, 248, 62, 118, 169, 219, 227, 88, 5, 58, 152, 146, + 55, 140, 35, 146, 82, 20, 141, 254, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 
128, 233, 45, 180, 117, 158, 158, 148, 230, 120, 53, 68, - 242, 185, 43, 122, 99, 181, 57, 203, 130, 241, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 128, 233, 45, 180, 117, 158, 158, 148, 230, 120, 53, 68, 242, 185, + 43, 122, 99, 181, 57, 203, 130, 241, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 31, 203, 9, 153, 48, 50, 206, 1, 185, 22, 170, 118, - 67, 181, 197, 226, 21, 65, 240, 27, 111, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 31, 203, 9, 153, 48, 50, 206, 1, 185, 22, 170, 118, 67, 181, + 197, 226, 21, 65, 240, 27, 111, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 54, 239, 97, 250, 229, 245, 13, 18, 58, 227, 164, 162, - 162, 20, 185, 219, 218, 138, 98, 23, 87, 250, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 54, 239, 97, 250, 229, 245, 13, 18, 58, 227, 164, 162, 162, 20, + 185, 219, 218, 138, 98, 23, 87, 250, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 28, 88, 211, 199, 251, 154, 139, 180, 68, 224, 112, - 90, 90, 206, 58, 149, 140, 108, 217, 233, 102, 199, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 28, 88, 211, 199, 251, 154, 139, 180, 68, 224, 112, 90, 90, 206, + 58, 149, 140, 108, 217, 233, 102, 199, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 24, 113, 65, 206, 213, 13, 116, 13, 175, 194, 104, - 136, 135, 15, 76, 212, 125, 61, 126, 34, 5, 202, 253, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 24, 113, 65, 206, 213, 13, 116, 13, 175, 194, 104, 136, 135, 15, + 76, 212, 125, 61, 126, 34, 5, 202, 253, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 240, 106, 142, 14, 90, 138, 136, 134, 214, 154, 23, - 84, 75, 155, 248, 74, 234, 102, 238, 88, 51, 228, 233, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 240, 106, 142, 14, 90, 138, 136, 134, 214, 154, 23, 84, 75, 155, + 248, 74, 234, 102, 238, 88, 51, 228, 233, ]), ]; @@ -758,10 +758,7 @@ pub fn validate_decimal_precision(value: i128, precision: u8) -> Result<(), Arro /// Validates that the specified `i256` of value can be properly /// interpreted as a Decimal256 number with precision `precision` #[inline] -pub fn validate_decimal256_precision( - value: i256, - precision: u8, -) -> Result<(), ArrowError> { +pub fn validate_decimal256_precision(value: i256, precision: u8) -> Result<(), ArrowError> { if precision > DECIMAL256_MAX_PRECISION { return Err(ArrowError::InvalidArgumentError(format!( "Max precision of a Decimal256 is {DECIMAL256_MAX_PRECISION}, but got {precision}", diff --git a/arrow-data/src/equal/boolean.rs b/arrow-data/src/equal/boolean.rs index a20ca5ac0bd7..addae936f118 100644 --- a/arrow-data/src/equal/boolean.rs +++ b/arrow-data/src/equal/boolean.rs @@ -78,11 +78,10 @@ pub(super) fn boolean_equal( // get a ref of the null buffer bytes, to use in testing for nullness let lhs_nulls = lhs.nulls().unwrap(); - BitIndexIterator::new(lhs_nulls.validity(), lhs_start + lhs_nulls.offset(), len) - .all(|i| { - let lhs_pos = lhs_start + lhs.offset() + i; - let rhs_pos = rhs_start + rhs.offset() + i; - get_bit(lhs_values, lhs_pos) == get_bit(rhs_values, rhs_pos) - }) + BitIndexIterator::new(lhs_nulls.validity(), lhs_start + lhs_nulls.offset(), len).all(|i| { + let lhs_pos = lhs_start + lhs.offset() + i; + let rhs_pos = rhs_start + rhs.offset() + i; + get_bit(lhs_values, lhs_pos) == get_bit(rhs_values, rhs_pos) + }) } } diff --git a/arrow-data/src/equal/fixed_binary.rs b/arrow-data/src/equal/fixed_binary.rs index 40dacdddd3a0..0778d77e2fdd 100644 --- a/arrow-data/src/equal/fixed_binary.rs +++ b/arrow-data/src/equal/fixed_binary.rs @@ -75,20 +75,15 @@ pub(super) fn fixed_binary_equal( }) } else { let lhs_nulls = 
lhs.nulls().unwrap(); - let lhs_slices_iter = BitSliceIterator::new( - lhs_nulls.validity(), - lhs_start + lhs_nulls.offset(), - len, - ); + let lhs_slices_iter = + BitSliceIterator::new(lhs_nulls.validity(), lhs_start + lhs_nulls.offset(), len); let rhs_nulls = rhs.nulls().unwrap(); - let rhs_slices_iter = BitSliceIterator::new( - rhs_nulls.validity(), - rhs_start + rhs_nulls.offset(), - len, - ); + let rhs_slices_iter = + BitSliceIterator::new(rhs_nulls.validity(), rhs_start + rhs_nulls.offset(), len); - lhs_slices_iter.zip(rhs_slices_iter).all( - |((l_start, l_end), (r_start, r_end))| { + lhs_slices_iter + .zip(rhs_slices_iter) + .all(|((l_start, l_end), (r_start, r_end))| { l_start == r_start && l_end == r_end && equal_len( @@ -98,8 +93,7 @@ pub(super) fn fixed_binary_equal( (rhs_start + r_start) * size, (l_end - l_start) * size, ) - }, - ) + }) } } } diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index fbc868d3f5c4..b279546474a0 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -76,24 +76,16 @@ fn equal_values( DataType::Int64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Decimal128(_, _) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Decimal256(_, _) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { + DataType::Decimal128(_, _) => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Decimal256(_, _) => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) } DataType::Date64 | DataType::Interval(IntervalUnit::DayTime) | DataType::Time64(_) | DataType::Timestamp(_, _) - | DataType::Duration(_) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } + | DataType::Duration(_) => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Interval(IntervalUnit::MonthDayNano) => { primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) } @@ -103,39 +95,21 @@ fn equal_values( DataType::LargeUtf8 | DataType::LargeBinary => { variable_sized_equal::(lhs, rhs, lhs_start, rhs_start, len) } - DataType::FixedSizeBinary(_) => { - fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len) - } + DataType::FixedSizeBinary(_) => fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::FixedSizeList(_, _) => { - fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len) - } + DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Union(_, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Dictionary(data_type, _) => match data_type.as_ref() { DataType::Int8 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int16 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Int32 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Int64 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - 
DataType::UInt8 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt16 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt32 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt64 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } + DataType::Int16 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int32 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int64 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt8 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt16 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt32 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt64 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), _ => unreachable!(), }, DataType::Float16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), diff --git a/arrow-data/src/equal/primitive.rs b/arrow-data/src/equal/primitive.rs index 7b3cbc9eb949..e92fdd2ba23b 100644 --- a/arrow-data/src/equal/primitive.rs +++ b/arrow-data/src/equal/primitive.rs @@ -73,20 +73,15 @@ pub(super) fn primitive_equal( }) } else { let lhs_nulls = lhs.nulls().unwrap(); - let lhs_slices_iter = BitSliceIterator::new( - lhs_nulls.validity(), - lhs_start + lhs_nulls.offset(), - len, - ); + let lhs_slices_iter = + BitSliceIterator::new(lhs_nulls.validity(), lhs_start + lhs_nulls.offset(), len); let rhs_nulls = rhs.nulls().unwrap(); - let rhs_slices_iter = BitSliceIterator::new( - rhs_nulls.validity(), - rhs_start + rhs_nulls.offset(), - len, - ); + let rhs_slices_iter = + BitSliceIterator::new(rhs_nulls.validity(), rhs_start + rhs_nulls.offset(), len); - lhs_slices_iter.zip(rhs_slices_iter).all( - |((l_start, l_end), (r_start, r_end))| { + lhs_slices_iter + .zip(rhs_slices_iter) + .all(|((l_start, l_end), (r_start, r_end))| { l_start == r_start && l_end == r_end && equal_len( @@ -96,8 +91,7 @@ pub(super) fn primitive_equal( (rhs_start + r_start) * byte_width, (l_end - l_start) * byte_width, ) - }, - ) + }) } } } diff --git a/arrow-data/src/equal/union.rs b/arrow-data/src/equal/union.rs index 5869afc30dbe..62de276e507f 100644 --- a/arrow-data/src/equal/union.rs +++ b/arrow-data/src/equal/union.rs @@ -116,10 +116,7 @@ pub(super) fn union_equal( rhs_fields, ) } - ( - DataType::Union(_, UnionMode::Sparse), - DataType::Union(_, UnionMode::Sparse), - ) => { + (DataType::Union(_, UnionMode::Sparse), DataType::Union(_, UnionMode::Sparse)) => { lhs_type_id_range == rhs_type_id_range && equal_sparse(lhs, rhs, lhs_start, rhs_start, len) } diff --git a/arrow-data/src/equal/utils.rs b/arrow-data/src/equal/utils.rs index fa6211542550..cc81943756d2 100644 --- a/arrow-data/src/equal/utils.rs +++ b/arrow-data/src/equal/utils.rs @@ -73,11 +73,9 @@ pub(super) fn base_equal(lhs: &ArrayData, rhs: &ArrayData) -> bool { let r_value_field = r_fields.get(1).unwrap(); // We don't enforce the equality of field names - let data_type_equal = l_key_field.data_type() - == r_key_field.data_type() + let data_type_equal = l_key_field.data_type() == r_key_field.data_type() && l_value_field.data_type() == r_value_field.data_type(); - let nullability_equal = l_key_field.is_nullable() - == r_key_field.is_nullable() + let nullability_equal = l_key_field.is_nullable() == r_key_field.is_nullable() && l_value_field.is_nullable() == r_value_field.is_nullable(); let metadata_equal = l_key_field.metadata() == r_key_field.metadata() 
&& l_value_field.metadata() == r_value_field.metadata(); diff --git a/arrow-data/src/transform/list.rs b/arrow-data/src/transform/list.rs index 9d5d8330cb1e..d9a1c62a8e8e 100644 --- a/arrow-data/src/transform/list.rs +++ b/arrow-data/src/transform/list.rs @@ -23,9 +23,7 @@ use crate::ArrayData; use arrow_buffer::ArrowNativeType; use num::{CheckedAdd, Integer}; -pub(super) fn build_extend( - array: &ArrayData, -) -> Extend { +pub(super) fn build_extend(array: &ArrayData) -> Extend { let offsets = array.buffer::(0); Box::new( move |mutable: &mut _MutableArrayData, index: usize, start: usize, len: usize| { @@ -35,11 +33,7 @@ pub(super) fn build_extend( let last_offset: T = unsafe { get_last_offset(offset_buffer) }; // offsets - extend_offsets::( - offset_buffer, - last_offset, - &offsets[start..start + len + 1], - ); + extend_offsets::(offset_buffer, last_offset, &offsets[start..start + len + 1]); mutable.child_data[0].extend( index, @@ -50,10 +44,7 @@ pub(super) fn build_extend( ) } -pub(super) fn extend_nulls( - mutable: &mut _MutableArrayData, - len: usize, -) { +pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { let offset_buffer = &mut mutable.buffer1; // this is safe due to how offset is built. See details on `get_last_offset` diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index f4b2b46d1723..af25e9c7e3dc 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -173,11 +173,7 @@ impl<'a> std::fmt::Debug for MutableArrayData<'a> { /// Builds an extend that adds `offset` to the source primitive /// Additionally validates that `max` fits into the /// the underlying primitive returning None if not -fn build_extend_dictionary( - array: &ArrayData, - offset: usize, - max: usize, -) -> Option { +fn build_extend_dictionary(array: &ArrayData, offset: usize, max: usize) -> Option { macro_rules! 
validate_and_build { ($dt: ty) => {{ let _: $dt = max.try_into().ok()?; @@ -215,27 +211,19 @@ fn build_extend(array: &ArrayData) -> Extend { DataType::Int64 => primitive::build_extend::(array), DataType::Float32 => primitive::build_extend::(array), DataType::Float64 => primitive::build_extend::(array), - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { + DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { primitive::build_extend::(array) } DataType::Date64 | DataType::Time64(_) | DataType::Timestamp(_, _) | DataType::Duration(_) - | DataType::Interval(IntervalUnit::DayTime) => { - primitive::build_extend::(array) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - primitive::build_extend::(array) - } + | DataType::Interval(IntervalUnit::DayTime) => primitive::build_extend::(array), + DataType::Interval(IntervalUnit::MonthDayNano) => primitive::build_extend::(array), DataType::Decimal128(_, _) => primitive::build_extend::(array), DataType::Decimal256(_, _) => primitive::build_extend::(array), DataType::Utf8 | DataType::Binary => variable_size::build_extend::(array), - DataType::LargeUtf8 | DataType::LargeBinary => { - variable_size::build_extend::(array) - } + DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::(array), DataType::Map(_, _) | DataType::List(_) => list::build_extend::(array), DataType::LargeList(_) => list::build_extend::(array), DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"), @@ -265,9 +253,9 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { DataType::Int64 => primitive::extend_nulls::, DataType::Float32 => primitive::extend_nulls::, DataType::Float64 => primitive::extend_nulls::, - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => primitive::extend_nulls::, + DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { + primitive::extend_nulls:: + } DataType::Date64 | DataType::Time64(_) | DataType::Timestamp(_, _) @@ -380,10 +368,7 @@ impl<'a> MutableArrayData<'a> { array_capacity = *capacity; preallocate_offset_and_binary_buffer::(*capacity, *value_cap) } - ( - DataType::Utf8 | DataType::Binary, - Capacities::Binary(capacity, Some(value_cap)), - ) => { + (DataType::Utf8 | DataType::Binary, Capacities::Binary(capacity, Some(value_cap))) => { array_capacity = *capacity; preallocate_offset_and_binary_buffer::(*capacity, *value_cap) } @@ -391,10 +376,7 @@ impl<'a> MutableArrayData<'a> { array_capacity = *capacity; new_buffers(data_type, *capacity) } - ( - DataType::List(_) | DataType::LargeList(_), - Capacities::List(capacity, _), - ) => { + (DataType::List(_) | DataType::LargeList(_), Capacities::List(capacity, _)) => { array_capacity = *capacity; new_buffers(data_type, *capacity) } @@ -435,16 +417,15 @@ impl<'a> MutableArrayData<'a> { .map(|array| &array.child_data()[0]) .collect::>(); - let capacities = if let Capacities::List(capacity, ref child_capacities) = - capacities - { - child_capacities - .clone() - .map(|c| *c) - .unwrap_or(Capacities::Array(capacity)) - } else { - Capacities::Array(array_capacity) - }; + let capacities = + if let Capacities::List(capacity, ref child_capacities) = capacities { + child_capacities + .clone() + .map(|c| *c) + .unwrap_or(Capacities::Array(capacity)) + } else { + Capacities::Array(array_capacity) + }; vec![MutableArrayData::with_capacities( children, use_nulls, capacities, @@ -546,8 +527,7 @@ impl<'a> 
MutableArrayData<'a> { .collect(); let capacity = lengths.iter().sum(); - let mut mutable = - MutableArrayData::new(dictionaries, false, capacity); + let mut mutable = MutableArrayData::new(dictionaries, false, capacity); for (i, len) in lengths.iter().enumerate() { mutable.extend(i, 0, *len) diff --git a/arrow-data/src/transform/primitive.rs b/arrow-data/src/transform/primitive.rs index b5c826438bfc..627dc00de1df 100644 --- a/arrow-data/src/transform/primitive.rs +++ b/arrow-data/src/transform/primitive.rs @@ -47,9 +47,6 @@ where ) } -pub(super) fn extend_nulls( - mutable: &mut _MutableArrayData, - len: usize, -) { +pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { mutable.buffer1.extend_zeros(len * size_of::()); } diff --git a/arrow-data/src/transform/utils.rs b/arrow-data/src/transform/utils.rs index 17bb87e88a5c..5407f68e0d0c 100644 --- a/arrow-data/src/transform/utils.rs +++ b/arrow-data/src/transform/utils.rs @@ -45,9 +45,7 @@ pub(super) fn extend_offsets( } #[inline] -pub(super) unsafe fn get_last_offset( - offset_buffer: &MutableBuffer, -) -> T { +pub(super) unsafe fn get_last_offset(offset_buffer: &MutableBuffer) -> T { // JUSTIFICATION // Benefit // 20% performance improvement extend of variable sized arrays (see bench `mutable_array`) diff --git a/arrow-data/src/transform/variable_size.rs b/arrow-data/src/transform/variable_size.rs index 597a8b2b6645..fa1592d973ed 100644 --- a/arrow-data/src/transform/variable_size.rs +++ b/arrow-data/src/transform/variable_size.rs @@ -39,9 +39,7 @@ fn extend_offset_values>( buffer.extend_from_slice(new_values); } -pub(super) fn build_extend< - T: ArrowNativeType + Integer + CheckedAdd + AsPrimitive, ->( +pub(super) fn build_extend>( array: &ArrayData, ) -> Extend { let offsets = array.buffer::(0); @@ -54,21 +52,14 @@ pub(super) fn build_extend< // this is safe due to how offset is built. See details on `get_last_offset` let last_offset = unsafe { get_last_offset(offset_buffer) }; - extend_offsets::( - offset_buffer, - last_offset, - &offsets[start..start + len + 1], - ); + extend_offsets::(offset_buffer, last_offset, &offsets[start..start + len + 1]); // values extend_offset_values::(values_buffer, offsets, values, start, len); }, ) } -pub(super) fn extend_nulls( - mutable: &mut _MutableArrayData, - len: usize, -) { +pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { let offset_buffer = &mut mutable.buffer1; // this is safe due to how offset is built. 
See details on `get_last_offset` diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 013f7e7788f8..bd94d3c499ca 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -32,28 +32,26 @@ use arrow_array::builder::StringBuilder; use arrow_array::{ArrayRef, RecordBatch}; use arrow_flight::encode::FlightDataEncoderBuilder; use arrow_flight::sql::metadata::{ - SqlInfoData, SqlInfoDataBuilder, XdbcTypeInfo, XdbcTypeInfoData, - XdbcTypeInfoDataBuilder, + SqlInfoData, SqlInfoDataBuilder, XdbcTypeInfo, XdbcTypeInfoData, XdbcTypeInfoDataBuilder, }; use arrow_flight::sql::{ server::FlightSqlService, ActionBeginSavepointRequest, ActionBeginSavepointResult, - ActionBeginTransactionRequest, ActionBeginTransactionResult, - ActionCancelQueryRequest, ActionCancelQueryResult, - ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, - ActionCreatePreparedStatementResult, ActionCreatePreparedSubstraitPlanRequest, - ActionEndSavepointRequest, ActionEndTransactionRequest, Any, CommandGetCatalogs, - CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, - CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, - CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, + ActionBeginTransactionRequest, ActionBeginTransactionResult, ActionCancelQueryRequest, + ActionCancelQueryResult, ActionClosePreparedStatementRequest, + ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, + ActionCreatePreparedSubstraitPlanRequest, ActionEndSavepointRequest, + ActionEndTransactionRequest, Any, CommandGetCatalogs, CommandGetCrossReference, + CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, + CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, - CommandStatementSubstraitPlan, CommandStatementUpdate, Nullable, ProstMessageExt, - Searchable, SqlInfo, TicketStatementQuery, XdbcDataType, + CommandStatementSubstraitPlan, CommandStatementUpdate, Nullable, ProstMessageExt, Searchable, + SqlInfo, TicketStatementQuery, XdbcDataType, }; use arrow_flight::utils::batches_to_flight_data; use arrow_flight::{ - flight_service_server::FlightService, flight_service_server::FlightServiceServer, - Action, FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, - HandshakeResponse, IpcMessage, Location, SchemaAsIpc, Ticket, + flight_service_server::FlightService, flight_service_server::FlightServiceServer, Action, + FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, HandshakeResponse, + IpcMessage, Location, SchemaAsIpc, Ticket, }; use arrow_ipc::writer::IpcWriteOptions; use arrow_schema::{ArrowError, DataType, Field, Schema}; @@ -167,8 +165,7 @@ impl FlightSqlService for FlightSqlServiceImpl { let bytes = BASE64_STANDARD .decode(base64) .map_err(|e| status!("authorization not decodable", e))?; - let str = String::from_utf8(bytes) - .map_err(|e| status!("authorization not parsable", e))?; + let str = String::from_utf8(bytes).map_err(|e| status!("authorization not parsable", e))?; let parts: Vec<_> = str.split(':').collect(); let (user, pass) = match parts.as_slice() { [user, pass] => (user, pass), @@ -195,8 +192,7 @@ impl FlightSqlService for FlightSqlServiceImpl { _message: Any, ) -> Result::DoGetStream>, Status> { self.check_token(&request)?; - let batch = - 
Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; + let batch = Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; let schema = batch.schema(); let batches = vec![batch]; let flight_data = batches_to_flight_data(schema.as_ref(), batches) @@ -238,8 +234,7 @@ impl FlightSqlService for FlightSqlServiceImpl { self.check_token(&request)?; let handle = std::str::from_utf8(&cmd.prepared_statement_handle) .map_err(|e| status!("Unable to parse handle", e))?; - let batch = - Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; + let batch = Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; let schema = (*batch.schema()).clone(); let num_rows = batch.num_rows(); let num_bytes = batch.get_array_memory_size(); @@ -736,8 +731,7 @@ async fn main() -> Result<(), Box> { if std::env::var("USE_TLS").ok().is_some() { let cert = std::fs::read_to_string("arrow-flight/examples/data/server.pem")?; let key = std::fs::read_to_string("arrow-flight/examples/data/server.key")?; - let client_ca = - std::fs::read_to_string("arrow-flight/examples/data/client_ca.pem")?; + let client_ca = std::fs::read_to_string("arrow-flight/examples/data/client_ca.pem")?; let tls_config = ServerTlsConfig::new() .identity(Identity::from_pem(&cert, &key)) diff --git a/arrow-flight/examples/server.rs b/arrow-flight/examples/server.rs index 1ed21acef9b8..85ac4ca1384c 100644 --- a/arrow-flight/examples/server.rs +++ b/arrow-flight/examples/server.rs @@ -20,9 +20,9 @@ use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; use arrow_flight::{ - flight_service_server::FlightService, flight_service_server::FlightServiceServer, - Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, - HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, + flight_service_server::FlightService, flight_service_server::FlightServiceServer, Action, + ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, + HandshakeResponse, PutResult, SchemaResult, Ticket, }; #[derive(Clone)] diff --git a/arrow-flight/src/client.rs b/arrow-flight/src/client.rs index 8793f7834bfb..a264012c82ec 100644 --- a/arrow-flight/src/client.rs +++ b/arrow-flight/src/client.rs @@ -249,10 +249,7 @@ impl FlightClient { /// .expect("error fetching data"); /// # } /// ``` - pub async fn get_flight_info( - &mut self, - descriptor: FlightDescriptor, - ) -> Result { + pub async fn get_flight_info(&mut self, descriptor: FlightDescriptor) -> Result { let request = self.make_request(descriptor); let response = self.inner.get_flight_info(request).await?.into_inner(); @@ -452,10 +449,7 @@ impl FlightClient { /// .expect("error making request"); /// # } /// ``` - pub async fn get_schema( - &mut self, - flight_descriptor: FlightDescriptor, - ) -> Result { + pub async fn get_schema(&mut self, flight_descriptor: FlightDescriptor) -> Result { let request = self.make_request(flight_descriptor); let schema_result = self.inner.get_schema(request).await?.into_inner(); @@ -488,9 +482,7 @@ impl FlightClient { /// .expect("error gathering actions"); /// # } /// ``` - pub async fn list_actions( - &mut self, - ) -> Result>> { + pub async fn list_actions(&mut self) -> Result>> { let request = self.make_request(Empty {}); let action_stream = self @@ -528,10 +520,7 @@ impl FlightClient { /// .expect("error gathering action results"); /// # } /// ``` - pub async fn do_action( - &mut self, - action: Action, - ) -> Result>> { + pub async fn 
do_action(&mut self, action: Action) -> Result>> { let request = self.make_request(action); let result_stream = self diff --git a/arrow-flight/src/decode.rs b/arrow-flight/src/decode.rs index dfcdd260602c..95bbe2b46bb2 100644 --- a/arrow-flight/src/decode.rs +++ b/arrow-flight/src/decode.rs @@ -21,9 +21,7 @@ use arrow_buffer::Buffer; use arrow_schema::{Schema, SchemaRef}; use bytes::Bytes; use futures::{ready, stream::BoxStream, Stream, StreamExt}; -use std::{ - collections::HashMap, convert::TryFrom, fmt::Debug, pin::Pin, sync::Arc, task::Poll, -}; +use std::{collections::HashMap, convert::TryFrom, fmt::Debug, pin::Pin, sync::Arc, task::Poll}; use tonic::metadata::MetadataMap; use crate::error::{FlightError, Result}; @@ -270,16 +268,14 @@ impl FlightDataDecoder { /// state as necessary. fn extract_message(&mut self, data: FlightData) -> Result> { use arrow_ipc::MessageHeader; - let message = arrow_ipc::root_as_message(&data.data_header[..]).map_err(|e| { - FlightError::DecodeError(format!("Error decoding root message: {e}")) - })?; + let message = arrow_ipc::root_as_message(&data.data_header[..]) + .map_err(|e| FlightError::DecodeError(format!("Error decoding root message: {e}")))?; match message.header_type() { MessageHeader::NONE => Ok(Some(DecodedFlightData::new_none(data))), MessageHeader::Schema => { - let schema = Schema::try_from(&data).map_err(|e| { - FlightError::DecodeError(format!("Error decoding schema: {e}")) - })?; + let schema = Schema::try_from(&data) + .map_err(|e| FlightError::DecodeError(format!("Error decoding schema: {e}")))?; let schema = Arc::new(schema); let dictionaries_by_field = HashMap::new(); @@ -300,12 +296,11 @@ impl FlightDataDecoder { }; let buffer = Buffer::from_bytes(data.data_body.into()); - let dictionary_batch = - message.header_as_dictionary_batch().ok_or_else(|| { - FlightError::protocol( - "Could not get dictionary batch from DictionaryBatch message", - ) - })?; + let dictionary_batch = message.header_as_dictionary_batch().ok_or_else(|| { + FlightError::protocol( + "Could not get dictionary batch from DictionaryBatch message", + ) + })?; arrow_ipc::reader::read_dictionary( &buffer, @@ -315,9 +310,7 @@ impl FlightDataDecoder { &message.version(), ) .map_err(|e| { - FlightError::DecodeError(format!( - "Error decoding ipc dictionary: {e}" - )) + FlightError::DecodeError(format!("Error decoding ipc dictionary: {e}")) })?; // Updated internal state, but no decoded message @@ -338,9 +331,7 @@ impl FlightDataDecoder { &state.dictionaries_by_field, ) .map_err(|e| { - FlightError::DecodeError(format!( - "Error decoding ipc RecordBatch: {e}" - )) + FlightError::DecodeError(format!("Error decoding ipc RecordBatch: {e}")) })?; Ok(Some(DecodedFlightData::new_record_batch(data, batch))) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 9ae7f1637982..e6ef9994d487 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -159,10 +159,7 @@ impl FlightDataEncoderBuilder { } /// Set [`DictionaryHandling`] for encoder - pub fn with_dictionary_handling( - mut self, - dictionary_handling: DictionaryHandling, - ) -> Self { + pub fn with_dictionary_handling(mut self, dictionary_handling: DictionaryHandling) -> Self { self.dictionary_handling = dictionary_handling; self } @@ -191,10 +188,7 @@ impl FlightDataEncoderBuilder { } /// Specify a flight descriptor in the first FlightData message. 
- pub fn with_flight_descriptor( - mut self, - descriptor: Option, - ) -> Self { + pub fn with_flight_descriptor(mut self, descriptor: Option) -> Self { self.descriptor = descriptor; self } @@ -334,8 +328,7 @@ impl FlightDataEncoder { let batch = prepare_batch_for_flight(&batch, schema, send_dictionaries)?; for batch in split_batch_for_grpc_response(batch, self.max_flight_data_size) { - let (flight_dictionaries, flight_batch) = - self.encoder.encode_batch(&batch)?; + let (flight_dictionaries, flight_batch) = self.encoder.encode_batch(&batch)?; self.queue_messages(flight_dictionaries); self.queue_message(flight_batch); @@ -460,9 +453,8 @@ fn split_batch_for_grpc_response( .map(|col| col.get_buffer_memory_size()) .sum::(); - let n_batches = (size / max_flight_data_size - + usize::from(size % max_flight_data_size != 0)) - .max(1); + let n_batches = + (size / max_flight_data_size + usize::from(size % max_flight_data_size != 0)).max(1); let rows_per_batch = (batch.num_rows() / n_batches).max(1); let mut out = Vec::with_capacity(n_batches + 1); @@ -505,18 +497,12 @@ impl FlightIpcEncoder { /// Convert a `RecordBatch` to a Vec of `FlightData` representing /// dictionaries and a `FlightData` representing the batch - fn encode_batch( - &mut self, - batch: &RecordBatch, - ) -> Result<(Vec, FlightData)> { - let (encoded_dictionaries, encoded_batch) = self.data_gen.encoded_batch( - batch, - &mut self.dictionary_tracker, - &self.options, - )?; - - let flight_dictionaries = - encoded_dictionaries.into_iter().map(Into::into).collect(); + fn encode_batch(&mut self, batch: &RecordBatch) -> Result<(Vec, FlightData)> { + let (encoded_dictionaries, encoded_batch) = + self.data_gen + .encoded_batch(batch, &mut self.dictionary_tracker, &self.options)?; + + let flight_dictionaries = encoded_dictionaries.into_iter().map(Into::into).collect(); let flight_batch = encoded_batch.into(); Ok((flight_dictionaries, flight_batch)) @@ -553,9 +539,7 @@ fn prepare_batch_for_flight( /// but does enable sending DictionaryArray's via Flight. fn hydrate_dictionary(array: &ArrayRef, send_dictionaries: bool) -> Result { let arr = match array.data_type() { - DataType::Dictionary(_, value) if !send_dictionaries => { - arrow_cast::cast(array, value)? 
- } + DataType::Dictionary(_, value) if !send_dictionaries => arrow_cast::cast(array, value)?, _ => Arc::clone(array), }; Ok(arr) @@ -586,11 +570,9 @@ mod tests { let (_, baseline_flight_batch) = make_flight_data(&batch, &options); let big_batch = batch.slice(0, batch.num_rows() - 1); - let optimized_big_batch = - prepare_batch_for_flight(&big_batch, Arc::clone(&schema), false) - .expect("failed to optimize"); - let (_, optimized_big_flight_batch) = - make_flight_data(&optimized_big_batch, &options); + let optimized_big_batch = prepare_batch_for_flight(&big_batch, Arc::clone(&schema), false) + .expect("failed to optimize"); + let (_, optimized_big_flight_batch) = make_flight_data(&optimized_big_batch, &options); assert_eq!( baseline_flight_batch.data_body.len(), @@ -601,12 +583,10 @@ mod tests { let optimized_small_batch = prepare_batch_for_flight(&small_batch, Arc::clone(&schema), false) .expect("failed to optimize"); - let (_, optimized_small_flight_batch) = - make_flight_data(&optimized_small_batch, &options); + let (_, optimized_small_flight_batch) = make_flight_data(&optimized_small_batch, &options); assert!( - baseline_flight_batch.data_body.len() - > optimized_small_flight_batch.data_body.len() + baseline_flight_batch.data_body.len() > optimized_small_flight_batch.data_body.len() ); } @@ -620,11 +600,10 @@ mod tests { false, )])); let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).unwrap(); - let encoder = FlightDataEncoderBuilder::default() - .build(futures::stream::once(async { Ok(batch) })); + let encoder = + FlightDataEncoderBuilder::default().build(futures::stream::once(async { Ok(batch) })); let mut decoder = FlightDataDecoder::new(encoder); - let expected_schema = - Schema::new(vec![Field::new("dict", DataType::Utf8, false)]); + let expected_schema = Schema::new(vec![Field::new("dict", DataType::Utf8, false)]); let expected_schema = Arc::new(expected_schema); while let Some(decoded) = decoder.next().await { let decoded = decoded.unwrap(); @@ -656,10 +635,8 @@ mod tests { Arc::new(vec!["a", "a", "b"].into_iter().collect()); let arr_two: Arc> = Arc::new(vec!["b", "a", "c"].into_iter().collect()); - let batch_one = - RecordBatch::try_new(schema.clone(), vec![arr_one.clone()]).unwrap(); - let batch_two = - RecordBatch::try_new(schema.clone(), vec![arr_two.clone()]).unwrap(); + let batch_one = RecordBatch::try_new(schema.clone(), vec![arr_one.clone()]).unwrap(); + let batch_two = RecordBatch::try_new(schema.clone(), vec![arr_two.clone()]).unwrap(); let encoder = FlightDataEncoderBuilder::default() .with_dictionary_handling(DictionaryHandling::Resend) @@ -675,10 +652,9 @@ mod tests { DecodedPayload::RecordBatch(b) => { assert_eq!(b.schema(), schema); - let actual_array = - Arc::new(downcast_array::>( - b.column_by_name("dict").unwrap(), - )); + let actual_array = Arc::new(downcast_array::>( + b.column_by_name("dict").unwrap(), + )); assert_eq!(actual_array, expected_array); @@ -690,10 +666,9 @@ mod tests { #[test] fn test_schema_metadata_encoded() { - let schema = - Schema::new(vec![Field::new("data", DataType::Int32, false)]).with_metadata( - HashMap::from([("some_key".to_owned(), "some_value".to_owned())]), - ); + let schema = Schema::new(vec![Field::new("data", DataType::Int32, false)]).with_metadata( + HashMap::from([("some_key".to_owned(), "some_value".to_owned())]), + ); let got = prepare_schema_for_flight(&schema, false); assert!(got.metadata().contains_key("some_key")); @@ -708,8 +683,7 @@ mod tests { ) .expect("cannot create record batch"); - 
prepare_batch_for_flight(&batch, batch.schema(), false) - .expect("failed to optimize"); + prepare_batch_for_flight(&batch, batch.schema(), false).expect("failed to optimize"); } pub fn make_flight_data( @@ -723,8 +697,7 @@ mod tests { .encoded_batch(batch, &mut dictionary_tracker, options) .expect("DictionaryTracker configured above to not error on replacement"); - let flight_dictionaries = - encoded_dictionaries.into_iter().map(Into::into).collect(); + let flight_dictionaries = encoded_dictionaries.into_iter().map(Into::into).collect(); let flight_batch = encoded_batch.into(); (flight_dictionaries, flight_batch) @@ -745,8 +718,7 @@ mod tests { // split once let n_rows = max_flight_data_size + 1; assert!(n_rows % 2 == 1, "should be an odd number"); - let c = - UInt8Array::from((0..n_rows).map(|i| (i % 256) as u8).collect::>()); + let c = UInt8Array::from((0..n_rows).map(|i| (i % 256) as u8).collect::>()); let batch = RecordBatch::try_from_iter(vec![("a", Arc::new(c) as ArrayRef)]) .expect("cannot create record batch"); let split = split_batch_for_grpc_response(batch.clone(), max_flight_data_size); @@ -793,8 +765,7 @@ mod tests { let input_rows = batch.num_rows(); - let split = - split_batch_for_grpc_response(batch.clone(), max_flight_data_size_bytes); + let split = split_batch_for_grpc_response(batch.clone(), max_flight_data_size_bytes); let sizes: Vec<_> = split.iter().map(|batch| batch.num_rows()).collect(); let output_rows: usize = sizes.iter().sum(); @@ -807,8 +778,7 @@ mod tests { #[tokio::test] async fn flight_data_size_even() { - let s1 = - StringArray::from_iter_values(std::iter::repeat(".10 bytes.").take(1024)); + let s1 = StringArray::from_iter_values(std::iter::repeat(".10 bytes.").take(1024)); let i1 = Int16Array::from_iter_values(0..1024); let s2 = StringArray::from_iter_values(std::iter::repeat("6bytes").take(1024)); let i2 = Int64Array::from_iter_values(0..1024); @@ -828,8 +798,7 @@ mod tests { async fn flight_data_size_uneven_variable_lengths() { // each row has a longer string than the last with increasing lengths 0 --> 1024 let array = StringArray::from_iter_values((0..1024).map(|i| "*".repeat(i))); - let batch = - RecordBatch::try_from_iter(vec![("data", Arc::new(array) as _)]).unwrap(); + let batch = RecordBatch::try_from_iter(vec![("data", Arc::new(array) as _)]).unwrap(); // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 @@ -883,8 +852,7 @@ mod tests { }) .collect(); - let batch = - RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); + let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); verify_encoded_split(batch, 160).await; } @@ -894,11 +862,9 @@ mod tests { // large dictionary (all distinct values ==> 1024 entries in dictionary) let values: Vec<_> = (1..1024).map(|i| "**".repeat(i)).collect(); - let array: DictionaryArray = - values.iter().map(|s| Some(s.as_str())).collect(); + let array: DictionaryArray = values.iter().map(|s| Some(s.as_str())).collect(); - let batch = - RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); + let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 @@ -912,8 +878,7 @@ mod tests { let keys = Int32Array::from_iter_values((0..3000).map(|i| (3000 - i) % 1024)); let array = DictionaryArray::new(keys, Arc::new(values)); - let batch = - RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); + let 
batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 @@ -929,12 +894,9 @@ mod tests { // medium cardinality let values3: Vec<_> = (1..1024).map(|i| "**".repeat(i % 100)).collect(); - let array1: DictionaryArray = - values1.iter().map(|s| Some(s.as_str())).collect(); - let array2: DictionaryArray = - values2.iter().map(|s| Some(s.as_str())).collect(); - let array3: DictionaryArray = - values3.iter().map(|s| Some(s.as_str())).collect(); + let array1: DictionaryArray = values1.iter().map(|s| Some(s.as_str())).collect(); + let array2: DictionaryArray = values2.iter().map(|s| Some(s.as_str())).collect(); + let array3: DictionaryArray = values3.iter().map(|s| Some(s.as_str())).collect(); let batch = RecordBatch::try_from_iter(vec![ ("a1", Arc::new(array1) as _), @@ -954,17 +916,13 @@ mod tests { .flight_descriptor .as_ref() .map(|descriptor| { - let path_len: usize = - descriptor.path.iter().map(|p| p.as_bytes().len()).sum(); + let path_len: usize = descriptor.path.iter().map(|p| p.as_bytes().len()).sum(); std::mem::size_of_val(descriptor) + descriptor.cmd.len() + path_len }) .unwrap_or(0); - flight_descriptor_size - + d.app_metadata.len() - + d.data_body.len() - + d.data_header.len() + flight_descriptor_size + d.app_metadata.len() + d.data_body.len() + d.data_header.len() } /// Coverage for diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 3035f109c685..8d05f658703a 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -133,10 +133,7 @@ pub struct IpcMessage(pub Bytes); // Useful conversion functions -fn flight_schema_as_encoded_data( - arrow_schema: &Schema, - options: &IpcWriteOptions, -) -> EncodedData { +fn flight_schema_as_encoded_data(arrow_schema: &Schema, options: &IpcWriteOptions) -> EncodedData { let data_gen = writer::IpcDataGenerator::default(); data_gen.schema_to_bytes(arrow_schema, options) } diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index 7685813ff844..133df5b044cf 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -31,17 +31,16 @@ use crate::flight_service_client::FlightServiceClient; use crate::sql::server::{CLOSE_PREPARED_STATEMENT, CREATE_PREPARED_STATEMENT}; use crate::sql::{ ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, - ActionCreatePreparedStatementResult, Any, CommandGetCatalogs, - CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, - CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, - CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, + ActionCreatePreparedStatementResult, Any, CommandGetCatalogs, CommandGetCrossReference, + CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, + CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, CommandStatementUpdate, DoPutUpdateResult, ProstMessageExt, SqlInfo, }; use crate::trailers::extract_lazy_trailers; use crate::{ - Action, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, - HandshakeResponse, IpcMessage, PutResult, Ticket, + Action, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, + IpcMessage, PutResult, Ticket, }; use arrow_array::RecordBatch; use arrow_buffer::Buffer; @@ -134,11 +133,7 @@ impl FlightSqlServiceClient { /// Perform a 
`handshake` with the server, passing credentials and establishing a session /// Returns arbitrary auth/handshake info binary blob - pub async fn handshake( - &mut self, - username: &str, - password: &str, - ) -> Result { + pub async fn handshake(&mut self, username: &str, password: &str) -> Result { let cmd = HandshakeRequest { protocol_version: 0, payload: Default::default(), @@ -156,9 +151,9 @@ impl FlightSqlServiceClient { .await .map_err(|e| ArrowError::IpcError(format!("Can't handshake {e}")))?; if let Some(auth) = resp.metadata().get("authorization") { - let auth = auth.to_str().map_err(|_| { - ArrowError::ParseError("Can't read auth header".to_string()) - })?; + let auth = auth + .to_str() + .map_err(|_| ArrowError::ParseError("Can't read auth header".to_string()))?; let bearer = "Bearer "; if !auth.starts_with(bearer) { Err(ArrowError::ParseError("Invalid auth header!".to_string()))?; @@ -166,10 +161,11 @@ impl FlightSqlServiceClient { let auth = auth[bearer.len()..].to_string(); self.token = Some(auth); } - let responses: Vec = - resp.into_inner().try_collect().await.map_err(|_| { - ArrowError::ParseError("Can't collect responses".to_string()) - })?; + let responses: Vec = resp + .into_inner() + .try_collect() + .await + .map_err(|_| ArrowError::ParseError("Can't collect responses".to_string()))?; let resp = match responses.as_slice() { [resp] => resp.payload.clone(), [] => Bytes::new(), @@ -209,8 +205,7 @@ impl FlightSqlServiceClient { .await .map_err(status_to_arrow_error)? .unwrap(); - let any = - Any::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; + let any = Any::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; let result: DoPutUpdateResult = any.unpack()?.unwrap(); Ok(result.record_count) } @@ -405,17 +400,13 @@ impl FlightSqlServiceClient { ArrowError::ParseError(format!("Cannot convert header key \"{k}\": {e}")) })?; let v = v.parse().map_err(|e| { - ArrowError::ParseError(format!( - "Cannot convert header value \"{v}\": {e}" - )) + ArrowError::ParseError(format!("Cannot convert header value \"{v}\": {e}")) })?; req.metadata_mut().insert(k, v); } if let Some(token) = &self.token { let val = format!("Bearer {token}").parse().map_err(|e| { - ArrowError::ParseError(format!( - "Cannot convert token to header value: {e}" - )) + ArrowError::ParseError(format!("Cannot convert token to header value: {e}")) })?; req.metadata_mut().insert("authorization", val); } @@ -484,8 +475,7 @@ impl PreparedStatement { .await .map_err(status_to_arrow_error)? .unwrap(); - let any = - Any::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; + let any = Any::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; let result: DoPutUpdateResult = any.unpack()?.unwrap(); Ok(result.record_count) } @@ -501,10 +491,7 @@ impl PreparedStatement { } /// Set a RecordBatch that contains the parameters that will be bind. 
- pub fn set_parameters( - &mut self, - parameter_binding: RecordBatch, - ) -> Result<(), ArrowError> { + pub fn set_parameters(&mut self, parameter_binding: RecordBatch) -> Result<(), ArrowError> { self.parameter_binding = Some(parameter_binding); Ok(()) } @@ -580,19 +567,16 @@ pub fn arrow_data_from_flight_data( flight_data: FlightData, arrow_schema_ref: &SchemaRef, ) -> Result { - let ipc_message = root_as_message(&flight_data.data_header[..]).map_err(|err| { - ArrowError::ParseError(format!("Unable to get root as message: {err:?}")) - })?; + let ipc_message = root_as_message(&flight_data.data_header[..]) + .map_err(|err| ArrowError::ParseError(format!("Unable to get root as message: {err:?}")))?; match ipc_message.header_type() { MessageHeader::RecordBatch => { - let ipc_record_batch = - ipc_message.header_as_record_batch().ok_or_else(|| { - ArrowError::ComputeError( - "Unable to convert flight data header to a record batch" - .to_string(), - ) - })?; + let ipc_record_batch = ipc_message.header_as_record_batch().ok_or_else(|| { + ArrowError::ComputeError( + "Unable to convert flight data header to a record batch".to_string(), + ) + })?; let dictionaries_by_field = HashMap::new(); let record_batch = read_record_batch( @@ -618,13 +602,11 @@ pub fn arrow_data_from_flight_data( MessageHeader::DictionaryBatch => { let _ = ipc_message.header_as_dictionary_batch().ok_or_else(|| { ArrowError::ComputeError( - "Unable to convert flight data header to a dictionary batch" - .to_string(), + "Unable to convert flight data header to a dictionary batch".to_string(), ) })?; Err(ArrowError::NotYetImplemented( - "no idea on how to convert an ipc dictionary batch to an arrow type" - .to_string(), + "no idea on how to convert an ipc dictionary batch to an arrow type".to_string(), )) } MessageHeader::Tensor => { @@ -644,8 +626,7 @@ pub fn arrow_data_from_flight_data( ) })?; Err(ArrowError::NotYetImplemented( - "no idea on how to convert an ipc sparse tensor to an arrow type" - .to_string(), + "no idea on how to convert an ipc sparse tensor to an arrow type".to_string(), )) } _ => Err(ArrowError::ComputeError(format!( diff --git a/arrow-flight/src/sql/metadata/db_schemas.rs b/arrow-flight/src/sql/metadata/db_schemas.rs index 642802b058d5..303d11cd74ca 100644 --- a/arrow-flight/src/sql/metadata/db_schemas.rs +++ b/arrow-flight/src/sql/metadata/db_schemas.rs @@ -95,11 +95,7 @@ impl GetDbSchemasBuilder { /// Append a row /// /// In case the catalog should be considered as empty, pass in an empty string '""'. 
- pub fn append( - &mut self, - catalog_name: impl AsRef, - schema_name: impl AsRef, - ) { + pub fn append(&mut self, catalog_name: impl AsRef, schema_name: impl AsRef) { self.catalog_name.append_value(catalog_name); self.db_schema_name.append_value(schema_name); } diff --git a/arrow-flight/src/sql/metadata/sql_info.rs b/arrow-flight/src/sql/metadata/sql_info.rs index 88c97227814d..d4584f4a6827 100644 --- a/arrow-flight/src/sql/metadata/sql_info.rs +++ b/arrow-flight/src/sql/metadata/sql_info.rs @@ -30,8 +30,8 @@ use std::sync::Arc; use arrow_arith::boolean::or; use arrow_array::array::{Array, UInt32Array, UnionArray}; use arrow_array::builder::{ - ArrayBuilder, BooleanBuilder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, - MapBuilder, StringBuilder, UInt32Builder, + ArrayBuilder, BooleanBuilder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, MapBuilder, + StringBuilder, UInt32Builder, }; use arrow_array::{RecordBatch, Scalar}; use arrow_data::ArrayData; @@ -184,11 +184,7 @@ static UNION_TYPE: Lazy = Lazy::new(|| { Field::new("keys", DataType::Int32, false), Field::new( "values", - DataType::List(Arc::new(Field::new( - "item", - DataType::Int32, - true, - ))), + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), true, ), ])), @@ -420,10 +416,7 @@ pub struct SqlInfoData { impl SqlInfoData { /// Return a [`RecordBatch`] containing only the requested `u32`, if any /// from [`CommandGetSqlInfo`] - pub fn record_batch( - &self, - info: impl IntoIterator, - ) -> Result { + pub fn record_batch(&self, info: impl IntoIterator) -> Result { let arr = self.batch.column(0); let type_filter = info .into_iter() @@ -493,9 +486,7 @@ mod tests { use super::SqlInfoDataBuilder; use crate::sql::metadata::tests::assert_batches_eq; - use crate::sql::{ - SqlInfo, SqlNullOrdering, SqlSupportedTransaction, SqlSupportsConvert, - }; + use crate::sql::{SqlInfo, SqlNullOrdering, SqlSupportedTransaction, SqlSupportsConvert}; #[test] fn test_sql_infos() { diff --git a/arrow-flight/src/sql/metadata/tables.rs b/arrow-flight/src/sql/metadata/tables.rs index 00502a76db53..7ffb76fa1d5f 100644 --- a/arrow-flight/src/sql/metadata/tables.rs +++ b/arrow-flight/src/sql/metadata/tables.rs @@ -329,12 +329,12 @@ mod tests { "b_catalog", ])) as ArrayRef, Arc::new(StringArray::from(vec![ - "a_schema", "a_schema", "b_schema", "b_schema", "a_schema", - "a_schema", "b_schema", "b_schema", + "a_schema", "a_schema", "b_schema", "b_schema", "a_schema", "a_schema", + "b_schema", "b_schema", ])) as ArrayRef, Arc::new(StringArray::from(vec![ - "a_table", "b_table", "a_table", "b_table", "a_table", "a_table", - "b_table", "b_table", + "a_table", "b_table", "a_table", "b_table", "a_table", "a_table", "b_table", + "b_table", ])) as ArrayRef, Arc::new(StringArray::from(vec![ "TABLE", "TABLE", "TABLE", "TABLE", "TABLE", "VIEW", "TABLE", "VIEW", diff --git a/arrow-flight/src/sql/metadata/xdbc_info.rs b/arrow-flight/src/sql/metadata/xdbc_info.rs index 8212c847a4fa..2e635d3037bc 100644 --- a/arrow-flight/src/sql/metadata/xdbc_info.rs +++ b/arrow-flight/src/sql/metadata/xdbc_info.rs @@ -36,9 +36,7 @@ use once_cell::sync::Lazy; use super::lexsort_to_indices; use crate::error::*; -use crate::sql::{ - CommandGetXdbcTypeInfo, Nullable, Searchable, XdbcDataType, XdbcDatetimeSubcode, -}; +use crate::sql::{CommandGetXdbcTypeInfo, Nullable, Searchable, XdbcDataType, XdbcDatetimeSubcode}; /// Data structure representing type information for xdbc types. 
#[derive(Debug, Clone, Default)] @@ -201,8 +199,7 @@ impl XdbcTypeInfoDataBuilder { minimum_scale_builder.append_option(info.minimum_scale); maximum_scale_builder.append_option(info.maximum_scale); sql_data_type_builder.append_value(info.sql_data_type as i32); - datetime_subcode_builder - .append_option(info.datetime_subcode.map(|code| code as i32)); + datetime_subcode_builder.append_option(info.datetime_subcode.map(|code| code as i32)); num_prec_radix_builder.append_option(info.num_prec_radix); interval_precision_builder.append_option(info.interval_precision); }); @@ -215,8 +212,7 @@ impl XdbcTypeInfoDataBuilder { let (field, offsets, values, nulls) = create_params_builder.finish().into_parts(); // Re-defined the field to be non-nullable let new_field = Arc::new(field.as_ref().clone().with_nullable(false)); - let create_params = - Arc::new(ListArray::new(new_field, offsets, values, nulls)) as ArrayRef; + let create_params = Arc::new(ListArray::new(new_field, offsets, values, nulls)) as ArrayRef; let nullable = Arc::new(nullable_builder.finish()); let case_sensitive = Arc::new(case_sensitive_builder.finish()); let searchable = Arc::new(searchable_builder.finish()); diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index 4042ce8efc46..97645ae7840d 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -295,9 +295,8 @@ impl Any { if !self.is::() { return Ok(None); } - let m = Message::decode(&*self.value).map_err(|err| { - ArrowError::ParseError(format!("Unable to decode Any value: {err}")) - })?; + let m = Message::decode(&*self.value) + .map_err(|err| ArrowError::ParseError(format!("Unable to decode Any value: {err}")))?; Ok(Some(m)) } diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index a158ed77f54d..14ab7d81b4f3 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -24,23 +24,21 @@ use prost::Message; use tonic::{Request, Response, Status, Streaming}; use super::{ - ActionBeginSavepointRequest, ActionBeginSavepointResult, - ActionBeginTransactionRequest, ActionBeginTransactionResult, - ActionCancelQueryRequest, ActionCancelQueryResult, + ActionBeginSavepointRequest, ActionBeginSavepointResult, ActionBeginTransactionRequest, + ActionBeginTransactionResult, ActionCancelQueryRequest, ActionCancelQueryResult, ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, ActionCreatePreparedSubstraitPlanRequest, - ActionEndSavepointRequest, ActionEndTransactionRequest, Any, Command, - CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, - CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, - CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, - CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, - CommandStatementSubstraitPlan, CommandStatementUpdate, DoPutUpdateResult, - ProstMessageExt, SqlInfo, TicketStatementQuery, + ActionEndSavepointRequest, ActionEndTransactionRequest, Any, Command, CommandGetCatalogs, + CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, + CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, + CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, CommandPreparedStatementUpdate, + CommandStatementQuery, CommandStatementSubstraitPlan, CommandStatementUpdate, + DoPutUpdateResult, ProstMessageExt, SqlInfo, TicketStatementQuery, }; use crate::{ - 
flight_service_server::FlightService, Action, ActionType, Criteria, Empty, - FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, - PutResult, SchemaResult, Ticket, + flight_service_server::FlightService, Action, ActionType, Criteria, Empty, FlightData, + FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, + Ticket, }; pub(crate) static CREATE_PREPARED_STATEMENT: &str = "CreatePreparedStatement"; @@ -549,13 +547,10 @@ where Pin> + Send + 'static>>; type ListFlightsStream = Pin> + Send + 'static>>; - type DoGetStream = - Pin> + Send + 'static>>; - type DoPutStream = - Pin> + Send + 'static>>; - type DoActionStream = Pin< - Box> + Send + 'static>, - >; + type DoGetStream = Pin> + Send + 'static>>; + type DoPutStream = Pin> + Send + 'static>>; + type DoActionStream = + Pin> + Send + 'static>>; type ListActionsStream = Pin> + Send + 'static>>; type DoExchangeStream = @@ -580,8 +575,7 @@ where &self, request: Request, ) -> Result, Status> { - let message = - Any::decode(&*request.get_ref().cmd).map_err(decode_error_to_status)?; + let message = Any::decode(&*request.get_ref().cmd).map_err(decode_error_to_status)?; match Command::try_from(message).map_err(arrow_error_to_status)? { Command::CommandStatementQuery(token) => { @@ -600,9 +594,7 @@ where Command::CommandGetDbSchemas(token) => { return self.get_flight_info_schemas(token, request).await } - Command::CommandGetTables(token) => { - self.get_flight_info_tables(token, request).await - } + Command::CommandGetTables(token) => self.get_flight_info_tables(token, request).await, Command::CommandGetTableTypes(token) => { self.get_flight_info_table_types(token, request).await } @@ -642,31 +634,21 @@ where &self, request: Request, ) -> Result, Status> { - let msg: Any = Message::decode(&*request.get_ref().ticket) - .map_err(decode_error_to_status)?; + let msg: Any = + Message::decode(&*request.get_ref().ticket).map_err(decode_error_to_status)?; match Command::try_from(msg).map_err(arrow_error_to_status)? 
{ - Command::TicketStatementQuery(command) => { - self.do_get_statement(command, request).await - } + Command::TicketStatementQuery(command) => self.do_get_statement(command, request).await, Command::CommandPreparedStatementQuery(command) => { self.do_get_prepared_statement(command, request).await } - Command::CommandGetCatalogs(command) => { - self.do_get_catalogs(command, request).await - } - Command::CommandGetDbSchemas(command) => { - self.do_get_schemas(command, request).await - } - Command::CommandGetTables(command) => { - self.do_get_tables(command, request).await - } + Command::CommandGetCatalogs(command) => self.do_get_catalogs(command, request).await, + Command::CommandGetDbSchemas(command) => self.do_get_schemas(command, request).await, + Command::CommandGetTables(command) => self.do_get_tables(command, request).await, Command::CommandGetTableTypes(command) => { self.do_get_table_types(command, request).await } - Command::CommandGetSqlInfo(command) => { - self.do_get_sql_info(command, request).await - } + Command::CommandGetSqlInfo(command) => self.do_get_sql_info(command, request).await, Command::CommandGetPrimaryKeys(command) => { self.do_get_primary_keys(command, request).await } @@ -699,8 +681,8 @@ where let mut request = request.map(PeekableFlightDataStream::new); let cmd = Pin::new(request.get_mut()).peek().await.unwrap().clone()?; - let message = Any::decode(&*cmd.flight_descriptor.unwrap().cmd) - .map_err(decode_error_to_status)?; + let message = + Any::decode(&*cmd.flight_descriptor.unwrap().cmd).map_err(decode_error_to_status)?; match Command::try_from(message).map_err(arrow_error_to_status)? { Command::CommandStatementUpdate(command) => { let record_count = self.do_put_statement_update(command, request).await?; @@ -755,11 +737,10 @@ where }; let create_prepared_substrait_plan_action_type = ActionType { r#type: CREATE_PREPARED_SUBSTRAIT_PLAN.to_string(), - description: - "Creates a reusable prepared substrait plan resource on the server.\n + description: "Creates a reusable prepared substrait plan resource on the server.\n Request Message: ActionCreatePreparedSubstraitPlanRequest\n Response Message: ActionCreatePreparedStatementResult" - .into(), + .into(), }; let begin_transaction_action_type = ActionType { r#type: BEGIN_TRANSACTION.to_string(), @@ -820,8 +801,7 @@ where request: Request, ) -> Result, Status> { if request.get_ref().r#type == CREATE_PREPARED_STATEMENT { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionCreatePreparedStatementRequest = any .unpack() @@ -839,8 +819,7 @@ where })]); return Ok(Response::new(Box::pin(output))); } else if request.get_ref().r#type == CLOSE_PREPARED_STATEMENT { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionClosePreparedStatementRequest = any .unpack() @@ -854,8 +833,7 @@ where .await?; return Ok(Response::new(Box::pin(futures::stream::empty()))); } else if request.get_ref().r#type == CREATE_PREPARED_SUBSTRAIT_PLAN { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionCreatePreparedSubstraitPlanRequest = any .unpack() @@ -869,47 +847,38 @@ where .await?; return Ok(Response::new(Box::pin(futures::stream::empty()))); } else if 
request.get_ref().r#type == BEGIN_TRANSACTION { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionBeginTransactionRequest = any .unpack() .map_err(arrow_error_to_status)? .ok_or_else(|| { - Status::invalid_argument( - "Unable to unpack ActionBeginTransactionRequest.", - ) - })?; + Status::invalid_argument("Unable to unpack ActionBeginTransactionRequest.") + })?; let stmt = self.do_action_begin_transaction(cmd, request).await?; let output = futures::stream::iter(vec![Ok(super::super::gen::Result { body: stmt.as_any().encode_to_vec().into(), })]); return Ok(Response::new(Box::pin(output))); } else if request.get_ref().r#type == END_TRANSACTION { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionEndTransactionRequest = any .unpack() .map_err(arrow_error_to_status)? .ok_or_else(|| { - Status::invalid_argument( - "Unable to unpack ActionEndTransactionRequest.", - ) + Status::invalid_argument("Unable to unpack ActionEndTransactionRequest.") })?; self.do_action_end_transaction(cmd, request).await?; return Ok(Response::new(Box::pin(futures::stream::empty()))); } else if request.get_ref().r#type == BEGIN_SAVEPOINT { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionBeginSavepointRequest = any .unpack() .map_err(arrow_error_to_status)? .ok_or_else(|| { - Status::invalid_argument( - "Unable to unpack ActionBeginSavepointRequest.", - ) + Status::invalid_argument("Unable to unpack ActionBeginSavepointRequest.") })?; let stmt = self.do_action_begin_savepoint(cmd, request).await?; let output = futures::stream::iter(vec![Ok(super::super::gen::Result { @@ -917,22 +886,18 @@ where })]); return Ok(Response::new(Box::pin(output))); } else if request.get_ref().r#type == END_SAVEPOINT { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionEndSavepointRequest = any .unpack() .map_err(arrow_error_to_status)? .ok_or_else(|| { - Status::invalid_argument( - "Unable to unpack ActionEndSavepointRequest.", - ) + Status::invalid_argument("Unable to unpack ActionEndSavepointRequest.") })?; self.do_action_end_savepoint(cmd, request).await?; return Ok(Response::new(Box::pin(futures::stream::empty()))); } else if request.get_ref().r#type == CANCEL_QUERY { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionCancelQueryRequest = any .unpack() diff --git a/arrow-flight/src/trailers.rs b/arrow-flight/src/trailers.rs index d652542da779..73136379d69f 100644 --- a/arrow-flight/src/trailers.rs +++ b/arrow-flight/src/trailers.rs @@ -28,9 +28,7 @@ use tonic::{metadata::MetadataMap, Status, Streaming}; /// /// Note that [`LazyTrailers`] has inner mutability and will only hold actual data after [`ExtractTrailersStream`] is /// fully consumed (dropping it is not required though). 
-pub fn extract_lazy_trailers( - s: Streaming, -) -> (ExtractTrailersStream, LazyTrailers) { +pub fn extract_lazy_trailers(s: Streaming) -> (ExtractTrailersStream, LazyTrailers) { let trailers: SharedTrailers = Default::default(); let stream = ExtractTrailersStream { inner: s, @@ -54,10 +52,7 @@ pub struct ExtractTrailersStream { impl Stream for ExtractTrailersStream { type Item = Result; - fn poll_next( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let res = ready!(self.inner.poll_next_unpin(cx)); if res.is_none() { diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 145626b6608f..b75d61d200cb 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -52,26 +52,23 @@ pub fn flight_data_from_arrow_batch( } /// Convert a slice of wire protocol `FlightData`s into a vector of `RecordBatch`es -pub fn flight_data_to_batches( - flight_data: &[FlightData], -) -> Result, ArrowError> { +pub fn flight_data_to_batches(flight_data: &[FlightData]) -> Result, ArrowError> { let schema = flight_data.get(0).ok_or_else(|| { ArrowError::CastError("Need at least one FlightData for schema".to_string()) })?; let message = root_as_message(&schema.data_header[..]) .map_err(|_| ArrowError::CastError("Cannot get root as message".to_string()))?; - let ipc_schema: arrow_ipc::Schema = message.header_as_schema().ok_or_else(|| { - ArrowError::CastError("Cannot get header as Schema".to_string()) - })?; + let ipc_schema: arrow_ipc::Schema = message + .header_as_schema() + .ok_or_else(|| ArrowError::CastError("Cannot get header as Schema".to_string()))?; let schema = fb_to_schema(ipc_schema); let schema = Arc::new(schema); let mut batches = vec![]; let dictionaries_by_id = HashMap::new(); for datum in flight_data[1..].iter() { - let batch = - flight_data_to_arrow_batch(datum, schema.clone(), &dictionaries_by_id)?; + let batch = flight_data_to_arrow_batch(datum, schema.clone(), &dictionaries_by_id)?; batches.push(batch); } Ok(batches) @@ -84,9 +81,8 @@ pub fn flight_data_to_arrow_batch( dictionaries_by_id: &HashMap, ) -> Result { // check that the data_header is a record batch message - let message = arrow_ipc::root_as_message(&data.data_header[..]).map_err(|err| { - ArrowError::ParseError(format!("Unable to get root as message: {err:?}")) - })?; + let message = arrow_ipc::root_as_message(&data.data_header[..]) + .map_err(|err| ArrowError::ParseError(format!("Unable to get root as message: {err:?}")))?; message .header_as_record_batch() @@ -124,10 +120,7 @@ pub fn flight_schema_from_arrow_schema( since = "4.4.0", note = "Use From trait, e.g.: SchemaAsIpc::new(schema, options).into()" )] -pub fn flight_data_from_arrow_schema( - schema: &Schema, - options: &IpcWriteOptions, -) -> FlightData { +pub fn flight_data_from_arrow_schema(schema: &Schema, options: &IpcWriteOptions) -> FlightData { SchemaAsIpc::new(schema, options).into() } diff --git a/arrow-flight/tests/client.rs b/arrow-flight/tests/client.rs index 1b9891e121fa..3ad9ee7a45ca 100644 --- a/arrow-flight/tests/client.rs +++ b/arrow-flight/tests/client.rs @@ -23,9 +23,9 @@ mod common { } use arrow_array::{RecordBatch, UInt64Array}; use arrow_flight::{ - decode::FlightRecordBatchStream, encode::FlightDataEncoderBuilder, - error::FlightError, Action, ActionType, Criteria, Empty, FlightClient, FlightData, - FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, Ticket, + decode::FlightRecordBatchStream, 
encode::FlightDataEncoderBuilder, error::FlightError, Action, + ActionType, Criteria, Empty, FlightClient, FlightData, FlightDescriptor, FlightInfo, + HandshakeRequest, HandshakeResponse, PutResult, Ticket, }; use arrow_schema::{DataType, Field, Schema}; use bytes::Bytes; @@ -271,8 +271,7 @@ async fn test_do_put() { }, ]; - test_server - .set_do_put_response(expected_response.clone().into_iter().map(Ok).collect()); + test_server.set_do_put_response(expected_response.clone().into_iter().map(Ok).collect()); let input_stream = futures::stream::iter(input_flight_data.clone()).map(Ok); @@ -446,9 +445,8 @@ async fn test_do_exchange() { let input_flight_data = test_flight_data().await; let output_flight_data = test_flight_data2().await; - test_server.set_do_exchange_response( - output_flight_data.clone().into_iter().map(Ok).collect(), - ); + test_server + .set_do_exchange_response(output_flight_data.clone().into_iter().map(Ok).collect()); let response_stream = client .do_exchange(futures::stream::iter(input_flight_data.clone())) diff --git a/arrow-flight/tests/common/server.rs b/arrow-flight/tests/common/server.rs index c575d12bbf52..8b162d398c4b 100644 --- a/arrow-flight/tests/common/server.rs +++ b/arrow-flight/tests/common/server.rs @@ -174,10 +174,7 @@ impl TestFlightServer { } /// Specify the response returned from the next call to `do_action` - pub fn set_do_action_response( - &self, - response: Vec>, - ) { + pub fn set_do_action_response(&self, response: Vec>) { let mut state = self.state.lock().expect("mutex not poisoned"); state.do_action_response.replace(response); } @@ -278,9 +275,10 @@ impl FlightService for TestFlightServer { let mut state = self.state.lock().expect("mutex not poisoned"); state.handshake_request = Some(handshake_request); - let response = state.handshake_response.take().unwrap_or_else(|| { - Err(Status::internal("No handshake response configured")) - })?; + let response = state + .handshake_response + .take() + .unwrap_or_else(|| Err(Status::internal("No handshake response configured")))?; // turn into a streaming response let output = futures::stream::iter(std::iter::once(Ok(response))); @@ -313,9 +311,10 @@ impl FlightService for TestFlightServer { self.save_metadata(&request); let mut state = self.state.lock().expect("mutex not poisoned"); state.get_flight_info_request = Some(request.into_inner()); - let response = state.get_flight_info_response.take().unwrap_or_else(|| { - Err(Status::internal("No get_flight_info response configured")) - })?; + let response = state + .get_flight_info_response + .take() + .unwrap_or_else(|| Err(Status::internal("No get_flight_info response configured")))?; Ok(Response::new(response)) } @@ -326,9 +325,10 @@ impl FlightService for TestFlightServer { self.save_metadata(&request); let mut state = self.state.lock().expect("mutex not poisoned"); state.get_schema_request = Some(request.into_inner()); - let schema = state.get_schema_response.take().unwrap_or_else(|| { - Err(Status::internal("No get_schema response configured")) - })?; + let schema = state + .get_schema_response + .take() + .unwrap_or_else(|| Err(Status::internal("No get_schema response configured")))?; // encode the schema let options = arrow_ipc::writer::IpcWriteOptions::default(); diff --git a/arrow-flight/tests/common/trailers_layer.rs b/arrow-flight/tests/common/trailers_layer.rs index 9e6be0dcf0da..b2ab74f7d925 100644 --- a/arrow-flight/tests/common/trailers_layer.rs +++ b/arrow-flight/tests/common/trailers_layer.rs @@ -81,9 +81,7 @@ where 
ready!(self.as_mut().project().inner.poll(cx)); match result { - Ok(response) => { - Poll::Ready(Ok(response.map(|body| WrappedBody { inner: body }))) - } + Ok(response) => Poll::Ready(Ok(response.map(|body| WrappedBody { inner: body }))), Err(e) => Poll::Ready(Err(e)), } } diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index 71bcf4e0521a..f4741d743e57 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -195,8 +195,7 @@ async fn test_app_metadata() { let encode_stream = encoder.build(input_batch_stream); // use lower level stream to get access to app metadata - let decode_stream = - FlightRecordBatchStream::new_from_flight_data(encode_stream).into_inner(); + let decode_stream = FlightRecordBatchStream::new_from_flight_data(encode_stream).into_inner(); let mut messages: Vec<_> = decode_stream.try_collect().await.expect("encode fails"); @@ -225,8 +224,7 @@ async fn test_max_message_size() { let encode_stream = encoder.build(input_batch_stream); // use lower level stream to get access to app metadata - let decode_stream = - FlightRecordBatchStream::new_from_flight_data(encode_stream).into_inner(); + let decode_stream = FlightRecordBatchStream::new_from_flight_data(encode_stream).into_inner(); let messages: Vec<_> = decode_stream.try_collect().await.expect("encode fails"); @@ -254,8 +252,8 @@ async fn test_max_message_size_fuzz() { ]; for max_message_size_bytes in [10, 1024, 2048, 6400, 3211212] { - let encoder = FlightDataEncoderBuilder::default() - .with_max_flight_data_size(max_message_size_bytes); + let encoder = + FlightDataEncoderBuilder::default().with_max_flight_data_size(max_message_size_bytes); let input_batch_stream = futures::stream::iter(input.clone()).map(Ok); @@ -299,10 +297,10 @@ async fn test_chained_streams_batch_decoder() { let batch2 = make_dictionary_batch(3); // Model sending two flight streams back to back, with different schemas - let encode_stream1 = FlightDataEncoderBuilder::default() - .build(futures::stream::iter(vec![Ok(batch1.clone())])); - let encode_stream2 = FlightDataEncoderBuilder::default() - .build(futures::stream::iter(vec![Ok(batch2.clone())])); + let encode_stream1 = + FlightDataEncoderBuilder::default().build(futures::stream::iter(vec![Ok(batch1.clone())])); + let encode_stream2 = + FlightDataEncoderBuilder::default().build(futures::stream::iter(vec![Ok(batch2.clone())])); // append the two streams (so they will have two different schema messages) let encode_stream = encode_stream1.chain(encode_stream2); @@ -324,10 +322,10 @@ async fn test_chained_streams_data_decoder() { let batch2 = make_dictionary_batch(3); // Model sending two flight streams back to back, with different schemas - let encode_stream1 = FlightDataEncoderBuilder::default() - .build(futures::stream::iter(vec![Ok(batch1.clone())])); - let encode_stream2 = FlightDataEncoderBuilder::default() - .build(futures::stream::iter(vec![Ok(batch2.clone())])); + let encode_stream1 = + FlightDataEncoderBuilder::default().build(futures::stream::iter(vec![Ok(batch1.clone())])); + let encode_stream2 = + FlightDataEncoderBuilder::default().build(futures::stream::iter(vec![Ok(batch2.clone())])); // append the two streams (so they will have two different schema messages) let encode_stream = encode_stream1.chain(encode_stream2); @@ -335,8 +333,7 @@ async fn test_chained_streams_data_decoder() { // lower level decode stream can handle multiple schema messages let decode_stream = FlightDataDecoder::new(encode_stream); - let 
decoded_data: Vec<_> = - decode_stream.try_collect().await.expect("encode / decode"); + let decoded_data: Vec<_> = decode_stream.try_collect().await.expect("encode / decode"); println!("decoded data: {decoded_data:#?}"); @@ -425,8 +422,7 @@ fn make_primitive_batch(num_rows: usize) -> RecordBatch { }) .collect(); - RecordBatch::try_from_iter(vec![("i", Arc::new(i) as ArrayRef), ("f", Arc::new(f))]) - .unwrap() + RecordBatch::try_from_iter(vec![("i", Arc::new(i) as ArrayRef), ("f", Arc::new(f))]).unwrap() } /// Make a dictionary batch for testing @@ -459,8 +455,7 @@ fn make_dictionary_batch(num_rows: usize) -> RecordBatch { /// match the input. async fn roundtrip(input: Vec) { let expected_output = input.clone(); - roundtrip_with_encoder(FlightDataEncoderBuilder::default(), input, expected_output) - .await + roundtrip_with_encoder(FlightDataEncoderBuilder::default(), input, expected_output).await } /// Encodes input as a FlightData stream, and then decodes it using @@ -475,8 +470,7 @@ async fn roundtrip_dictionary(input: Vec) { .iter() .map(|batch| prepare_batch_for_flight(batch, schema.clone()).unwrap()) .collect(); - roundtrip_with_encoder(FlightDataEncoderBuilder::default(), input, expected_output) - .await + roundtrip_with_encoder(FlightDataEncoderBuilder::default(), input, expected_output).await } async fn roundtrip_with_encoder( @@ -491,8 +485,7 @@ async fn roundtrip_with_encoder( let encode_stream = encoder.build(input_batch_stream); let decode_stream = FlightRecordBatchStream::new_from_flight_data(encode_stream); - let output_batches: Vec<_> = - decode_stream.try_collect().await.expect("encode / decode"); + let output_batches: Vec<_> = decode_stream.try_collect().await.expect("encode / decode"); // remove any empty batches from input as they are not transmitted let expected_batches: Vec<_> = expected_batches diff --git a/arrow-flight/tests/flight_sql_client_cli.rs b/arrow-flight/tests/flight_sql_client_cli.rs index 221e776218c3..a28080450bc2 100644 --- a/arrow-flight/tests/flight_sql_client_cli.rs +++ b/arrow-flight/tests/flight_sql_client_cli.rs @@ -23,18 +23,16 @@ use arrow_flight::{ flight_service_server::{FlightService, FlightServiceServer}, sql::{ server::{FlightSqlService, PeekableFlightDataStream}, - ActionBeginSavepointRequest, ActionBeginSavepointResult, - ActionBeginTransactionRequest, ActionBeginTransactionResult, - ActionCancelQueryRequest, ActionCancelQueryResult, + ActionBeginSavepointRequest, ActionBeginSavepointResult, ActionBeginTransactionRequest, + ActionBeginTransactionResult, ActionCancelQueryRequest, ActionCancelQueryResult, ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, ActionCreatePreparedSubstraitPlanRequest, ActionEndSavepointRequest, ActionEndTransactionRequest, Any, CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, - CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, - CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, - CommandPreparedStatementQuery, CommandPreparedStatementUpdate, - CommandStatementQuery, CommandStatementSubstraitPlan, CommandStatementUpdate, - ProstMessageExt, SqlInfo, TicketStatementQuery, + CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, + CommandGetTables, CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, + CommandPreparedStatementUpdate, CommandStatementQuery, CommandStatementSubstraitPlan, + CommandStatementUpdate, ProstMessageExt, SqlInfo, 
TicketStatementQuery, }, utils::batches_to_flight_data, Action, FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, @@ -168,8 +166,7 @@ impl FlightSqlServiceImpl { RecordBatch::try_new(Arc::new(schema), cols) } - fn create_fake_prepared_stmt( - ) -> Result { + fn create_fake_prepared_stmt() -> Result { let handle = PREPARED_STATEMENT_HANDLE.to_string(); let schema = Schema::new(vec![ Field::new("field_string", DataType::Utf8, false), diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index 47bacc7cc74b..42ac71fbbd7e 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -124,26 +124,16 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { } Some(s) if s == "duration" => match map.get("unit") { Some(p) if p == "SECOND" => Ok(DataType::Duration(TimeUnit::Second)), - Some(p) if p == "MILLISECOND" => { - Ok(DataType::Duration(TimeUnit::Millisecond)) - } - Some(p) if p == "MICROSECOND" => { - Ok(DataType::Duration(TimeUnit::Microsecond)) - } - Some(p) if p == "NANOSECOND" => { - Ok(DataType::Duration(TimeUnit::Nanosecond)) - } + Some(p) if p == "MILLISECOND" => Ok(DataType::Duration(TimeUnit::Millisecond)), + Some(p) if p == "MICROSECOND" => Ok(DataType::Duration(TimeUnit::Microsecond)), + Some(p) if p == "NANOSECOND" => Ok(DataType::Duration(TimeUnit::Nanosecond)), _ => Err(ArrowError::ParseError( "time unit missing or invalid".to_string(), )), }, Some(s) if s == "interval" => match map.get("unit") { - Some(p) if p == "DAY_TIME" => { - Ok(DataType::Interval(IntervalUnit::DayTime)) - } - Some(p) if p == "YEAR_MONTH" => { - Ok(DataType::Interval(IntervalUnit::YearMonth)) - } + Some(p) if p == "DAY_TIME" => Ok(DataType::Interval(IntervalUnit::DayTime)), + Some(p) if p == "YEAR_MONTH" => Ok(DataType::Interval(IntervalUnit::YearMonth)), Some(p) if p == "MONTH_DAY_NANO" => { Ok(DataType::Interval(IntervalUnit::MonthDayNano)) } diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index f59314ca02db..32edc4165938 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -63,18 +63,17 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { "Field 'metadata' must have exact two entries for each key-value map".to_string(), )); } - if let (Some(k), Some(v)) = - (map.get("key"), map.get("value")) - { - if let (Some(k_str), Some(v_str)) = - (k.as_str(), v.as_str()) - { + if let (Some(k), Some(v)) = (map.get("key"), map.get("value")) { + if let (Some(k_str), Some(v_str)) = (k.as_str(), v.as_str()) { res.insert( k_str.to_string().clone(), v_str.to_string().clone(), ); } else { - return Err(ArrowError::ParseError("Field 'metadata' must have map value of string type".to_string())); + return Err(ArrowError::ParseError( + "Field 'metadata' must have map value of string type" + .to_string(), + )); } } else { return Err(ArrowError::ParseError("Field 'metadata' lacks map keys named \"key\" or \"value\"".to_string())); @@ -115,46 +114,47 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { // if data_type is a struct or list, get its children let data_type = match data_type { - DataType::List(_) - | DataType::LargeList(_) - | DataType::FixedSizeList(_, _) => match map.get("children") { - Some(Value::Array(values)) => { - if values.len() != 1 { + DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _) => { + match map.get("children") { + Some(Value::Array(values)) => { + if values.len() != 
1 { + return Err(ArrowError::ParseError( + "Field 'children' must have one element for a list data type" + .to_string(), + )); + } + match data_type { + DataType::List(_) => { + DataType::List(Arc::new(field_from_json(&values[0])?)) + } + DataType::LargeList(_) => { + DataType::LargeList(Arc::new(field_from_json(&values[0])?)) + } + DataType::FixedSizeList(_, int) => DataType::FixedSizeList( + Arc::new(field_from_json(&values[0])?), + int, + ), + _ => unreachable!( + "Data type should be a list, largelist or fixedsizelist" + ), + } + } + Some(_) => { return Err(ArrowError::ParseError( - "Field 'children' must have one element for a list data type".to_string(), - )); + "Field 'children' must be an array".to_string(), + )) } - match data_type { - DataType::List(_) => { - DataType::List(Arc::new(field_from_json(&values[0])?)) - } - DataType::LargeList(_) => DataType::LargeList(Arc::new( - field_from_json(&values[0])?, - )), - DataType::FixedSizeList(_, int) => DataType::FixedSizeList( - Arc::new(field_from_json(&values[0])?), - int, - ), - _ => unreachable!( - "Data type should be a list, largelist or fixedsizelist" - ), + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); } } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - }, + } DataType::Struct(_) => match map.get("children") { - Some(Value::Array(values)) => DataType::Struct( - values.iter().map(field_from_json).collect::>()?, - ), + Some(Value::Array(values)) => { + DataType::Struct(values.iter().map(field_from_json).collect::>()?) + } Some(_) => { return Err(ArrowError::ParseError( "Field 'children' must be an array".to_string(), @@ -175,17 +175,16 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { DataType::Struct(map_fields) if map_fields.len() == 2 => { DataType::Map(Arc::new(child), keys_sorted) } - t => { - return Err(ArrowError::ParseError( - format!("Map children should be a struct with 2 fields, found {t:?}") - )) + t => { + return Err(ArrowError::ParseError(format!( + "Map children should be a struct with 2 fields, found {t:?}" + ))) } } } Some(_) => { return Err(ArrowError::ParseError( - "Field 'children' must be an array with 1 element" - .to_string(), + "Field 'children' must be an array with 1 element".to_string(), )) } None => { @@ -200,9 +199,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { let fields = fields .iter() .zip(values) - .map(|((id, _), value)| { - Ok((id, Arc::new(field_from_json(value)?))) - }) + .map(|((id, _), value)| Ok((id, Arc::new(field_from_json(value)?)))) .collect::>()?; DataType::Union(fields, mode) @@ -255,8 +252,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { _ => data_type, }; - let mut field = - Field::new_dict(name, data_type, nullable, dict_id, dict_is_ordered); + let mut field = Field::new_dict(name, data_type, nullable, dict_id, dict_is_ordered); field.set_metadata(metadata); Ok(field) } @@ -269,9 +265,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { /// Generate a JSON representation of the `Field`. 
pub fn field_to_json(field: &Field) -> serde_json::Value { let children: Vec = match field.data_type() { - DataType::Struct(fields) => { - fields.iter().map(|x| field_to_json(x.as_ref())).collect() - } + DataType::Struct(fields) => fields.iter().map(|x| field_to_json(x.as_ref())).collect(), DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index 07b69bffd07d..7b797aa07061 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -261,9 +261,7 @@ impl ArrowJsonField { true } Err(e) => { - eprintln!( - "Encountered error while converting JSON field to Arrow field: {e:?}" - ); + eprintln!("Encountered error while converting JSON field to Arrow field: {e:?}"); false } } @@ -273,8 +271,8 @@ impl ArrowJsonField { /// TODO: convert to use an Into fn to_arrow_field(&self) -> Result { // a bit regressive, but we have to convert the field to JSON in order to convert it - let field = serde_json::to_value(self) - .map_err(|error| ArrowError::JsonError(error.to_string()))?; + let field = + serde_json::to_value(self).map_err(|error| ArrowError::JsonError(error.to_string()))?; field_from_json(&field) } } @@ -389,12 +387,9 @@ pub fn array_from_json( match is_valid { 1 => b.append_value(match value { Value::Number(n) => n.as_i64().unwrap(), - Value::String(s) => { - s.parse().expect("Unable to parse string as i64") - } + Value::String(s) => s.parse().expect("Unable to parse string as i64"), Value::Object(ref map) - if map.contains_key("days") - && map.contains_key("milliseconds") => + if map.contains_key("days") && map.contains_key("milliseconds") => { match field.data_type() { DataType::Interval(IntervalUnit::DayTime) => { @@ -404,23 +399,19 @@ pub fn array_from_json( match (days, milliseconds) { (Value::Number(d), Value::Number(m)) => { let mut bytes = [0_u8; 8]; - let m = (m.as_i64().unwrap() as i32) - .to_le_bytes(); - let d = (d.as_i64().unwrap() as i32) - .to_le_bytes(); + let m = (m.as_i64().unwrap() as i32).to_le_bytes(); + let d = (d.as_i64().unwrap() as i32).to_le_bytes(); let c = [d, m].concat(); bytes.copy_from_slice(c.as_slice()); i64::from_le_bytes(bytes) } - _ => panic!( - "Unable to parse {value:?} as interval daytime" - ), + _ => { + panic!("Unable to parse {value:?} as interval daytime") + } } } - _ => panic!( - "Unable to parse {value:?} as interval daytime" - ), + _ => panic!("Unable to parse {value:?} as interval daytime"), } } _ => panic!("Unable to parse {value:?} as number"), @@ -499,9 +490,7 @@ pub fn array_from_json( .expect("Unable to parse string as u64"), ) } else if value.is_number() { - b.append_value( - value.as_u64().expect("Unable to read number as u64"), - ) + b.append_value(value.as_u64().expect("Unable to read number as u64")) } else { panic!("Unable to parse value {value:?} as u64") } @@ -535,11 +524,10 @@ pub fn array_from_json( let months = months.as_i64().unwrap() as i32; let days = days.as_i64().unwrap() as i32; let nanoseconds = nanoseconds.as_i64().unwrap(); - let months_days_ns: i128 = ((nanoseconds as i128) - & 0xFFFFFFFFFFFFFFFF) - << 64 - | ((days as i128) & 0xFFFFFFFF) << 32 - | ((months as i128) & 0xFFFFFFFF); + let months_days_ns: i128 = + ((nanoseconds as i128) & 0xFFFFFFFFFFFFFFFF) << 64 + | ((days as i128) & 0xFFFFFFFF) << 32 + | ((months as i128) & 0xFFFFFFFF); months_days_ns } (_, _, _) => { @@ -678,11 +666,8 @@ pub fn array_from_json( DataType::List(child_field) => { let null_buf = 
create_null_buf(&json_col); let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; + let child_array = + array_from_json(child_field, children.get(0).unwrap().clone(), dictionaries)?; let offsets: Vec = json_col .offset .unwrap() @@ -702,11 +687,8 @@ pub fn array_from_json( DataType::LargeList(child_field) => { let null_buf = create_null_buf(&json_col); let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; + let child_array = + array_from_json(child_field, children.get(0).unwrap().clone(), dictionaries)?; let offsets: Vec = json_col .offset .unwrap() @@ -729,11 +711,8 @@ pub fn array_from_json( } DataType::FixedSizeList(child_field, _) => { let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; + let child_array = + array_from_json(child_field, children.get(0).unwrap().clone(), dictionaries)?; let null_buf = create_null_buf(&json_col); let list_data = ArrayData::builder(field.data_type().clone()) .len(json_col.count) @@ -760,9 +739,7 @@ pub fn array_from_json( } DataType::Dictionary(key_type, value_type) => { let dict_id = field.dict_id().ok_or_else(|| { - ArrowError::JsonError(format!( - "Unable to find dict_id for field {field:?}" - )) + ArrowError::JsonError(format!("Unable to find dict_id for field {field:?}")) })?; // find dictionary let dictionary = dictionaries @@ -823,8 +800,7 @@ pub fn array_from_json( } else { [255_u8; 32] }; - bytes[0..integer_bytes.len()] - .copy_from_slice(integer_bytes.as_slice()); + bytes[0..integer_bytes.len()].copy_from_slice(integer_bytes.as_slice()); b.append_value(i256::from_le_bytes(bytes)); } _ => b.append_null(), @@ -837,11 +813,8 @@ pub fn array_from_json( DataType::Map(child_field, _) => { let null_buf = create_null_buf(&json_col); let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; + let child_array = + array_from_json(child_field, children.get(0).unwrap().clone(), dictionaries)?; let offsets: Vec = json_col .offset .unwrap() @@ -946,9 +919,7 @@ pub fn dictionary_array_from_json( .unwrap(); let array = match dict_key { - DataType::Int8 => { - Arc::new(Int8DictionaryArray::from(dict_data)) as ArrayRef - } + DataType::Int8 => Arc::new(Int8DictionaryArray::from(dict_data)) as ArrayRef, DataType::Int16 => Arc::new(Int16DictionaryArray::from(dict_data)), DataType::Int32 => Arc::new(Int32DictionaryArray::from(dict_data)), DataType::Int64 => Arc::new(Int64DictionaryArray::from(dict_data)), @@ -1099,11 +1070,7 @@ mod tests { Field::new("c3", DataType::Utf8, true), Field::new( "c4", - DataType::List(Arc::new(Field::new( - "custom_item", - DataType::Int32, - false, - ))), + DataType::List(Arc::new(Field::new("custom_item", DataType::Int32, false))), true, ), ]); @@ -1199,10 +1166,8 @@ mod tests { ), ]); - let bools_with_metadata_map = - BooleanArray::from(vec![Some(true), None, Some(false)]); - let bools_with_metadata_vec = - BooleanArray::from(vec![Some(true), None, Some(false)]); + let bools_with_metadata_map = BooleanArray::from(vec![Some(true), None, Some(false)]); + let bools_with_metadata_vec = BooleanArray::from(vec![Some(true), None, Some(false)]); let bools = BooleanArray::from(vec![Some(true), None, Some(false)]); let int8s = 
Int8Array::from(vec![Some(1), None, Some(3)]); let int16s = Int16Array::from(vec![Some(1), None, Some(3)]); @@ -1220,39 +1185,24 @@ mod tests { Some(29923997007884), Some(30612271819236), ]); - let time_secs = - Time32SecondArray::from(vec![Some(27974), Some(78592), Some(43207)]); - let time_millis = Time32MillisecondArray::from(vec![ - Some(6613125), - Some(74667230), - Some(52260079), - ]); - let time_micros = - Time64MicrosecondArray::from(vec![Some(62522958593), None, None]); - let time_nanos = Time64NanosecondArray::from(vec![ - Some(73380123595985), - None, - Some(16584393546415), - ]); + let time_secs = Time32SecondArray::from(vec![Some(27974), Some(78592), Some(43207)]); + let time_millis = + Time32MillisecondArray::from(vec![Some(6613125), Some(74667230), Some(52260079)]); + let time_micros = Time64MicrosecondArray::from(vec![Some(62522958593), None, None]); + let time_nanos = + Time64NanosecondArray::from(vec![Some(73380123595985), None, Some(16584393546415)]); let ts_secs = TimestampSecondArray::from(vec![None, Some(193438817552), None]); - let ts_millis = TimestampMillisecondArray::from(vec![ - None, - Some(38606916383008), - Some(58113709376587), - ]); + let ts_millis = + TimestampMillisecondArray::from(vec![None, Some(38606916383008), Some(58113709376587)]); let ts_micros = TimestampMicrosecondArray::from(vec![None, None, None]); - let ts_nanos = - TimestampNanosecondArray::from(vec![None, None, Some(-6473623571954960143)]); + let ts_nanos = TimestampNanosecondArray::from(vec![None, None, Some(-6473623571954960143)]); let ts_secs_tz = TimestampSecondArray::from(vec![None, Some(193438817552), None]) .with_timezone_opt(secs_tz); - let ts_millis_tz = TimestampMillisecondArray::from(vec![ - None, - Some(38606916383008), - Some(58113709376587), - ]) - .with_timezone_opt(millis_tz); - let ts_micros_tz = TimestampMicrosecondArray::from(vec![None, None, None]) - .with_timezone_opt(micros_tz); + let ts_millis_tz = + TimestampMillisecondArray::from(vec![None, Some(38606916383008), Some(58113709376587)]) + .with_timezone_opt(millis_tz); + let ts_micros_tz = + TimestampMicrosecondArray::from(vec![None, None, None]).with_timezone_opt(micros_tz); let ts_nanos_tz = TimestampNanosecondArray::from(vec![None, None, Some(-6473623571954960143)]) .with_timezone_opt(nanos_tz); @@ -1260,8 +1210,7 @@ mod tests { let value_data = Int32Array::from(vec![None, Some(2), None, None]); let value_offsets = Buffer::from_slice_ref([0, 3, 4, 4]); - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/arrow-integration-test/src/schema.rs b/arrow-integration-test/src/schema.rs index 6e143c2838d9..b5f6c5e86b38 100644 --- a/arrow-integration-test/src/schema.rs +++ b/arrow-integration-test/src/schema.rs @@ -65,11 +65,9 @@ fn from_metadata(json: &serde_json::Value) -> Result> { match json { Value::Array(_) => { let mut hashmap = HashMap::new(); - let values: Vec = serde_json::from_value(json.clone()) - .map_err(|_| { - ArrowError::JsonError( - "Unable to parse object into key-value pair".to_string(), - ) + let values: Vec = + serde_json::from_value(json.clone()).map_err(|_| { + ArrowError::JsonError("Unable to parse object into key-value pair".to_string()) })?; for meta in values { hashmap.insert(meta.key.clone(), meta.value); @@ -110,11 +108,10 @@ mod tests { #[test] fn schema_json() { // 
Add some custom metadata - let metadata: HashMap = - [("Key".to_string(), "Value".to_string())] - .iter() - .cloned() - .collect(); + let metadata: HashMap = [("Key".to_string(), "Value".to_string())] + .iter() + .cloned() + .collect(); let schema = Schema::new_with_metadata( vec![ @@ -140,10 +137,7 @@ mod tests { ), Field::new( "c17", - DataType::Timestamp( - TimeUnit::Microsecond, - Some("Africa/Johannesburg".into()), - ), + DataType::Timestamp(TimeUnit::Microsecond, Some("Africa/Johannesburg".into())), false, ), Field::new( @@ -197,10 +191,7 @@ mod tests { Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), Field::new_dict( "c33", - DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Utf8), - ), + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), true, 123, true, diff --git a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs index db5df8b58a6f..187d987a5a0a 100644 --- a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs +++ b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -121,11 +121,8 @@ fn canonicalize_schema(schema: &Schema) -> Schema { DataType::Map(child_field, sorted) => match child_field.data_type() { DataType::Struct(fields) if fields.len() == 2 => { let first_field = fields.get(0).unwrap(); - let key_field = Arc::new(Field::new( - "key", - first_field.data_type().clone(), - false, - )); + let key_field = + Arc::new(Field::new("key", first_field.data_type().clone(), false)); let second_field = fields.get(1).unwrap(); let value_field = Arc::new(Field::new( "value", @@ -143,9 +140,7 @@ fn canonicalize_schema(schema: &Schema) -> Schema { field.is_nullable(), )) } - _ => panic!( - "The child field of Map type should be Struct type with 2 fields." 
- ), + _ => panic!("The child field of Map type should be Struct type with 2 fields."), }, _ => field.clone(), }) diff --git a/arrow-integration-testing/src/bin/flight-test-integration-client.rs b/arrow-integration-testing/src/bin/flight-test-integration-client.rs index d46b4fac759e..b8bbb952837b 100644 --- a/arrow-integration-testing/src/bin/flight-test-integration-client.rs +++ b/arrow-integration-testing/src/bin/flight-test-integration-client.rs @@ -62,8 +62,7 @@ async fn main() -> Result { } None => { let path = args.path.expect("No path is given"); - flight_client_scenarios::integration_test::run_scenario(&host, port, &path) - .await?; + flight_client_scenarios::integration_test::run_scenario(&host, port, &path).await?; } } diff --git a/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs b/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs index 9f66abf50106..376e31e15553 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs @@ -17,9 +17,7 @@ use crate::{AUTH_PASSWORD, AUTH_USERNAME}; -use arrow_flight::{ - flight_service_client::FlightServiceClient, BasicAuth, HandshakeRequest, -}; +use arrow_flight::{flight_service_client::FlightServiceClient, BasicAuth, HandshakeRequest}; use futures::{stream, StreamExt}; use prost::Message; use tonic::{metadata::MetadataValue, Request, Status}; @@ -78,11 +76,7 @@ pub async fn run_scenario(host: &str, port: u16) -> Result { Ok(()) } -async fn authenticate( - client: &mut Client, - username: &str, - password: &str, -) -> Result { +async fn authenticate(client: &mut Client, username: &str, password: &str) -> Result { let auth = BasicAuth { username: username.into(), password: password.into(), diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index a55c2dec0580..81cc4bbe8ed2 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -27,8 +27,7 @@ use arrow::{ }; use arrow_flight::{ flight_descriptor::DescriptorType, flight_service_client::FlightServiceClient, - utils::flight_data_to_arrow_batch, FlightData, FlightDescriptor, Location, - SchemaAsIpc, Ticket, + utils::flight_data_to_arrow_batch, FlightData, FlightDescriptor, Location, SchemaAsIpc, Ticket, }; use futures::{channel::mpsc, sink::SinkExt, stream, StreamExt}; use tonic::{Request, Streaming}; @@ -203,19 +202,16 @@ async fn consume_flight_location( let mut dictionaries_by_id = HashMap::new(); for (counter, expected_batch) in expected_data.iter().enumerate() { - let data = receive_batch_flight_data( - &mut resp, - actual_schema.clone(), - &mut dictionaries_by_id, - ) - .await - .unwrap_or_else(|| { - panic!( - "Got fewer batches than expected, received so far: {} expected: {}", - counter, - expected_data.len(), - ) - }); + let data = + receive_batch_flight_data(&mut resp, actual_schema.clone(), &mut dictionaries_by_id) + .await + .unwrap_or_else(|| { + panic!( + "Got fewer batches than expected, received so far: {} expected: {}", + counter, + expected_data.len(), + ) + }); let metadata = counter.to_string().into_bytes(); assert_eq!(metadata, data.app_metadata); @@ -250,8 +246,8 @@ async fn consume_flight_location( async fn receive_schema_flight_data(resp: &mut Streaming) -> Option { let data = resp.next().await?.ok()?; 
- let message = arrow::ipc::root_as_message(&data.data_header[..]) - .expect("Error parsing message"); + let message = + arrow::ipc::root_as_message(&data.data_header[..]).expect("Error parsing message"); // message header is a Schema, so read it let ipc_schema: ipc::Schema = message @@ -268,8 +264,8 @@ async fn receive_batch_flight_data( dictionaries_by_id: &mut HashMap, ) -> Option { let mut data = resp.next().await?.ok()?; - let mut message = arrow::ipc::root_as_message(&data.data_header[..]) - .expect("Error parsing first message"); + let mut message = + arrow::ipc::root_as_message(&data.data_header[..]).expect("Error parsing first message"); while message.header_type() == ipc::MessageHeader::DictionaryBatch { reader::read_dictionary( @@ -284,8 +280,8 @@ async fn receive_batch_flight_data( .expect("Error reading dictionary"); data = resp.next().await?.ok()?; - message = arrow::ipc::root_as_message(&data.data_header[..]) - .expect("Error parsing message"); + message = + arrow::ipc::root_as_message(&data.data_header[..]).expect("Error parsing message"); } Some(data) diff --git a/arrow-integration-testing/src/flight_client_scenarios/middleware.rs b/arrow-integration-testing/src/flight_client_scenarios/middleware.rs index 773919ff72af..3b71edf446a3 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/middleware.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/middleware.rs @@ -16,8 +16,7 @@ // under the License. use arrow_flight::{ - flight_descriptor::DescriptorType, flight_service_client::FlightServiceClient, - FlightDescriptor, + flight_descriptor::DescriptorType, flight_service_client::FlightServiceClient, FlightDescriptor, }; use prost::bytes::Bytes; use tonic::{Request, Status}; diff --git a/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs b/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs index 72d47b1391ee..ff4fc12f2523 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs @@ -19,15 +19,13 @@ use std::pin::Pin; use std::sync::Arc; use arrow_flight::{ - flight_service_server::FlightService, flight_service_server::FlightServiceServer, - Action, ActionType, BasicAuth, Criteria, Empty, FlightData, FlightDescriptor, - FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, + flight_service_server::FlightService, flight_service_server::FlightServiceServer, Action, + ActionType, BasicAuth, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, + HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, }; use futures::{channel::mpsc, sink::SinkExt, Stream, StreamExt}; use tokio::sync::Mutex; -use tonic::{ - metadata::MetadataMap, transport::Server, Request, Response, Status, Streaming, -}; +use tonic::{metadata::MetadataMap, transport::Server, Request, Response, Status, Streaming}; type TonicStream = Pin + Send + Sync + 'static>>; type Error = Box; @@ -63,10 +61,7 @@ pub struct AuthBasicProtoScenarioImpl { } impl AuthBasicProtoScenarioImpl { - async fn check_auth( - &self, - metadata: &MetadataMap, - ) -> Result { + async fn check_auth(&self, metadata: &MetadataMap) -> Result { let token = metadata .get_bin("auth-token-bin") .and_then(|v| v.to_bytes().ok()) @@ -74,10 +69,7 @@ impl AuthBasicProtoScenarioImpl { self.is_valid(token).await } - async fn is_valid( - &self, - token: Option, - ) -> Result { + async fn is_valid(&self, token: Option) -> Result { match 
token { Some(t) if t == *self.username => Ok(GrpcServerCallContext { peer_identity: self.username.to_string(), @@ -142,12 +134,10 @@ impl FlightService for AuthBasicProtoScenarioImpl { let req = req.expect("Error reading handshake request"); let HandshakeRequest { payload, .. } = req; - let auth = BasicAuth::decode(&*payload) - .expect("Error parsing handshake request"); + let auth = + BasicAuth::decode(&*payload).expect("Error parsing handshake request"); - let resp = if *auth.username == *username - && *auth.password == *password - { + let resp = if *auth.username == *username && *auth.password == *password { Ok(HandshakeResponse { payload: username.as_bytes().to_vec().into(), ..HandshakeResponse::default() diff --git a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs index e2c4cb5d88f3..2011031e921a 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs @@ -30,9 +30,9 @@ use arrow::{ }; use arrow_flight::{ flight_descriptor::DescriptorType, flight_service_server::FlightService, - flight_service_server::FlightServiceServer, Action, ActionType, Criteria, Empty, - FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, - HandshakeResponse, IpcMessage, PutResult, SchemaAsIpc, SchemaResult, Ticket, + flight_service_server::FlightServiceServer, Action, ActionType, Criteria, Empty, FlightData, + FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, HandshakeResponse, IpcMessage, + PutResult, SchemaAsIpc, SchemaResult, Ticket, }; use futures::{channel::mpsc, sink::SinkExt, Stream, StreamExt}; use std::convert::TryInto; @@ -113,8 +113,7 @@ impl FlightService for FlightServiceImpl { let options = arrow::ipc::writer::IpcWriteOptions::default(); - let schema = - std::iter::once(Ok(SchemaAsIpc::new(&flight.schema, &options).into())); + let schema = std::iter::once(Ok(SchemaAsIpc::new(&flight.schema, &options).into())); let batches = flight .chunks @@ -126,12 +125,9 @@ impl FlightService for FlightServiceImpl { let (encoded_dictionaries, encoded_batch) = data_gen .encoded_batch(batch, &mut dictionary_tracker, &options) - .expect( - "DictionaryTracker configured above to not error on replacement", - ); + .expect("DictionaryTracker configured above to not error on replacement"); - let dictionary_flight_data = - encoded_dictionaries.into_iter().map(Into::into); + let dictionary_flight_data = encoded_dictionaries.into_iter().map(Into::into); let mut batch_flight_data: FlightData = encoded_batch.into(); // Only the record batch's FlightData gets app_metadata @@ -182,8 +178,7 @@ impl FlightService for FlightServiceImpl { let endpoint = self.endpoint_from_path(&path[0]); - let total_records: usize = - flight.chunks.iter().map(|chunk| chunk.num_rows()).sum(); + let total_records: usize = flight.chunks.iter().map(|chunk| chunk.num_rows()).sum(); let options = arrow::ipc::writer::IpcWriteOptions::default(); let message = SchemaAsIpc::new(&flight.schema, &options) @@ -224,8 +219,7 @@ impl FlightService for FlightServiceImpl { .clone() .ok_or_else(|| Status::invalid_argument("Must have a descriptor"))?; - if descriptor.r#type != DescriptorType::Path as i32 || descriptor.path.is_empty() - { + if descriptor.r#type != DescriptorType::Path as i32 || descriptor.path.is_empty() { return Err(Status::invalid_argument("Must specify a path")); } @@ -297,9 +291,9 @@ async fn 
record_batch_from_message( schema_ref: SchemaRef, dictionaries_by_id: &HashMap, ) -> Result { - let ipc_batch = message.header_as_record_batch().ok_or_else(|| { - Status::internal("Could not parse message header as record batch") - })?; + let ipc_batch = message + .header_as_record_batch() + .ok_or_else(|| Status::internal("Could not parse message header as record batch"))?; let arrow_batch_result = reader::read_record_batch( data_body, @@ -320,9 +314,9 @@ async fn dictionary_from_message( schema_ref: SchemaRef, dictionaries_by_id: &mut HashMap, ) -> Result<(), Status> { - let ipc_batch = message.header_as_dictionary_batch().ok_or_else(|| { - Status::internal("Could not parse message header as dictionary batch") - })?; + let ipc_batch = message + .header_as_dictionary_batch() + .ok_or_else(|| Status::internal("Could not parse message header as dictionary batch"))?; let dictionary_batch_result = reader::read_dictionary( data_body, diff --git a/arrow-integration-testing/src/flight_server_scenarios/middleware.rs b/arrow-integration-testing/src/flight_server_scenarios/middleware.rs index 9b1c84b57119..68d871b528a6 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/middleware.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/middleware.rs @@ -19,9 +19,9 @@ use std::pin::Pin; use arrow_flight::{ flight_descriptor::DescriptorType, flight_service_server::FlightService, - flight_service_server::FlightServiceServer, Action, ActionType, Criteria, Empty, - FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, - PutResult, SchemaResult, Ticket, + flight_service_server::FlightServiceServer, Action, ActionType, Criteria, Empty, FlightData, + FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, + Ticket, }; use futures::Stream; use tonic::{transport::Server, Request, Response, Status, Streaming}; @@ -93,8 +93,7 @@ impl FlightService for MiddlewareScenarioImpl { let descriptor = request.into_inner(); - if descriptor.r#type == DescriptorType::Cmd as i32 - && descriptor.cmd.as_ref() == b"success" + if descriptor.r#type == DescriptorType::Cmd as i32 && descriptor.cmd.as_ref() == b"success" { // Return a fake location - the test doesn't read it let endpoint = super::endpoint("foo", "grpc+tcp://localhost:10010"); diff --git a/arrow-integration-testing/src/lib.rs b/arrow-integration-testing/src/lib.rs index fe0cc68a4205..2d76be3495c8 100644 --- a/arrow-integration-testing/src/lib.rs +++ b/arrow-integration-testing/src/lib.rs @@ -56,8 +56,8 @@ pub fn read_json_file(json_name: &str) -> Result { .as_array() .expect("Unable to get dictionaries as array") { - let json_dict: ArrowJsonDictionaryBatch = serde_json::from_value(d.clone()) - .expect("Unable to get dictionary from JSON"); + let json_dict: ArrowJsonDictionaryBatch = + serde_json::from_value(d.clone()).expect("Unable to get dictionary from JSON"); // TODO: convert to a concrete Arrow type dictionaries.insert(json_dict.id, json_dict); } diff --git a/arrow-integration-testing/tests/ipc_reader.rs b/arrow-integration-testing/tests/ipc_reader.rs index 696ab6e6053a..11b8fa84534e 100644 --- a/arrow-integration-testing/tests/ipc_reader.rs +++ b/arrow-integration-testing/tests/ipc_reader.rs @@ -63,9 +63,7 @@ fn read_1_0_0_bigendian_decimal_should_panic() { } #[test] -#[should_panic( - expected = "Last offset 687865856 of Utf8 is larger than values length 41" -)] +#[should_panic(expected = "Last offset 687865856 of Utf8 is larger than values length 41")] fn 
read_1_0_0_bigendian_dictionary_should_panic() { // The offsets are not translated for big-endian files // https://github.com/apache/arrow-rs/issues/859 @@ -160,8 +158,7 @@ fn read_2_0_0_compression() { /// Verification json file /// `arrow-ipc-stream/integration//.json.gz fn verify_arrow_file(testdata: &str, version: &str, path: &str) { - let filename = - format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.arrow_file"); + let filename = format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.arrow_file"); println!("Verifying {filename}"); // Compare contents to the expected output format in JSON @@ -197,8 +194,7 @@ fn verify_arrow_file(testdata: &str, version: &str, path: &str) { /// Verification json file /// `arrow-ipc-stream/integration//.json.gz fn verify_arrow_stream(testdata: &str, version: &str, path: &str) { - let filename = - format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.stream"); + let filename = format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.stream"); println!("Verifying {filename}"); // Compare contents to the expected output format in JSON diff --git a/arrow-integration-testing/tests/ipc_writer.rs b/arrow-integration-testing/tests/ipc_writer.rs index 11707d935540..d780eb2ee0b5 100644 --- a/arrow-integration-testing/tests/ipc_writer.rs +++ b/arrow-integration-testing/tests/ipc_writer.rs @@ -113,12 +113,7 @@ fn write_2_0_0_compression() { for options in &all_options { println!("Using options {options:?}"); roundtrip_arrow_file_with_options(&testdata, version, path, options.clone()); - roundtrip_arrow_stream_with_options( - &testdata, - version, - path, - options.clone(), - ); + roundtrip_arrow_stream_with_options(&testdata, version, path, options.clone()); } }); } @@ -143,8 +138,7 @@ fn roundtrip_arrow_file_with_options( path: &str, options: IpcWriteOptions, ) { - let filename = - format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.arrow_file"); + let filename = format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.arrow_file"); println!("Verifying {filename}"); let mut tempfile = tempfile::tempfile().unwrap(); @@ -156,12 +150,8 @@ fn roundtrip_arrow_file_with_options( // read and rewrite the file to a temp location { - let mut writer = FileWriter::try_new_with_options( - &mut tempfile, - &reader.schema(), - options, - ) - .unwrap(); + let mut writer = + FileWriter::try_new_with_options(&mut tempfile, &reader.schema(), options).unwrap(); while let Some(Ok(batch)) = reader.next() { writer.write(&batch).unwrap(); } @@ -207,12 +197,7 @@ fn roundtrip_arrow_file_with_options( /// Verification json file /// `arrow-ipc-stream/integration//.json.gz fn roundtrip_arrow_stream(testdata: &str, version: &str, path: &str) { - roundtrip_arrow_stream_with_options( - testdata, - version, - path, - IpcWriteOptions::default(), - ) + roundtrip_arrow_stream_with_options(testdata, version, path, IpcWriteOptions::default()) } fn roundtrip_arrow_stream_with_options( @@ -221,8 +206,7 @@ fn roundtrip_arrow_stream_with_options( path: &str, options: IpcWriteOptions, ) { - let filename = - format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.stream"); + let filename = format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.stream"); println!("Verifying {filename}"); let mut tempfile = tempfile::tempfile().unwrap(); @@ -234,12 +218,9 @@ fn roundtrip_arrow_stream_with_options( // read and rewrite the file to a temp location { - let mut writer = StreamWriter::try_new_with_options( - &mut tempfile, - 
&reader.schema(), - options, - ) - .unwrap(); + let mut writer = + StreamWriter::try_new_with_options(&mut tempfile, &reader.schema(), options) + .unwrap(); while let Some(Ok(batch)) = reader.next() { writer.write(&batch).unwrap(); } diff --git a/arrow-ipc/src/compression.rs b/arrow-ipc/src/compression.rs index fafc2c5c9b6d..0d8b7b4c1bd4 100644 --- a/arrow-ipc/src/compression.rs +++ b/arrow-ipc/src/compression.rs @@ -90,10 +90,7 @@ impl CompressionCodec { /// [8 bytes]: uncompressed length /// [remaining bytes]: compressed data stream /// ``` - pub(crate) fn decompress_to_buffer( - &self, - input: &Buffer, - ) -> Result { + pub(crate) fn decompress_to_buffer(&self, input: &Buffer) -> Result { // read the first 8 bytes to determine if the data is // compressed let decompressed_length = read_uncompressed_size(input); @@ -127,11 +124,7 @@ impl CompressionCodec { /// Decompress the data in input buffer and write to output buffer /// using the specified compression - fn decompress( - &self, - input: &[u8], - decompressed_size: usize, - ) -> Result, ArrowError> { + fn decompress(&self, input: &[u8], decompressed_size: usize) -> Result, ArrowError> { let ret = match self { CompressionCodec::Lz4Frame => decompress_lz4(input, decompressed_size)?, CompressionCodec::Zstd => decompress_zstd(input, decompressed_size)?, @@ -175,10 +168,7 @@ fn decompress_lz4(input: &[u8], decompressed_size: usize) -> Result, Arr #[cfg(not(feature = "lz4"))] #[allow(clippy::ptr_arg)] -fn decompress_lz4( - _input: &[u8], - _decompressed_size: usize, -) -> Result, ArrowError> { +fn decompress_lz4(_input: &[u8], _decompressed_size: usize) -> Result, ArrowError> { Err(ArrowError::InvalidArgumentError( "lz4 IPC decompression requires the lz4 feature".to_string(), )) @@ -202,10 +192,7 @@ fn compress_zstd(_input: &[u8], _output: &mut Vec) -> Result<(), ArrowError> } #[cfg(feature = "zstd")] -fn decompress_zstd( - input: &[u8], - decompressed_size: usize, -) -> Result, ArrowError> { +fn decompress_zstd(input: &[u8], decompressed_size: usize) -> Result, ArrowError> { use std::io::Read; let mut output = Vec::with_capacity(decompressed_size); zstd::Decoder::with_buffer(input)?.read_to_end(&mut output)?; @@ -214,10 +201,7 @@ fn decompress_zstd( #[cfg(not(feature = "zstd"))] #[allow(clippy::ptr_arg)] -fn decompress_zstd( - _input: &[u8], - _decompressed_size: usize, -) -> Result, ArrowError> { +fn decompress_zstd(_input: &[u8], _decompressed_size: usize) -> Result, ArrowError> { Err(ArrowError::InvalidArgumentError( "zstd IPC decompression requires the zstd feature".to_string(), )) diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index a78ccde6e169..b290a09acf5d 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -18,9 +18,7 @@ //! 
Utilities for converting between IPC types and native Arrow types use arrow_schema::*; -use flatbuffers::{ - FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset, -}; +use flatbuffers::{FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset}; use std::collections::HashMap; use std::sync::Arc; @@ -186,16 +184,11 @@ pub fn try_schema_from_ipc_buffer(buffer: &[u8]) -> Result { // buffer 0 }; - let msg = - size_prefixed_root_as_message(&buffer[begin_offset..]).map_err(|err| { - ArrowError::ParseError(format!( - "Unable to convert flight info to a message: {err}" - )) - })?; + let msg = size_prefixed_root_as_message(&buffer[begin_offset..]).map_err(|err| { + ArrowError::ParseError(format!("Unable to convert flight info to a message: {err}")) + })?; let ipc_schema = msg.header_as_schema().ok_or_else(|| { - ArrowError::ParseError( - "Unable to convert flight info to a schema".to_string(), - ) + ArrowError::ParseError("Unable to convert flight info to a schema".to_string()) })?; Ok(fb_to_schema(ipc_schema)) } else { @@ -277,15 +270,9 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat let time = field.type_as_time().unwrap(); match (time.bitWidth(), time.unit()) { (32, crate::TimeUnit::SECOND) => DataType::Time32(TimeUnit::Second), - (32, crate::TimeUnit::MILLISECOND) => { - DataType::Time32(TimeUnit::Millisecond) - } - (64, crate::TimeUnit::MICROSECOND) => { - DataType::Time64(TimeUnit::Microsecond) - } - (64, crate::TimeUnit::NANOSECOND) => { - DataType::Time64(TimeUnit::Nanosecond) - } + (32, crate::TimeUnit::MILLISECOND) => DataType::Time32(TimeUnit::Millisecond), + (64, crate::TimeUnit::MICROSECOND) => DataType::Time64(TimeUnit::Microsecond), + (64, crate::TimeUnit::NANOSECOND) => DataType::Time64(TimeUnit::Nanosecond), z => panic!( "Time type with bit width of {} and unit of {:?} not supported", z.0, z.1 @@ -296,30 +283,22 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat let timestamp = field.type_as_timestamp().unwrap(); let timezone: Option<_> = timestamp.timezone().map(|tz| tz.into()); match timestamp.unit() { - crate::TimeUnit::SECOND => { - DataType::Timestamp(TimeUnit::Second, timezone) - } + crate::TimeUnit::SECOND => DataType::Timestamp(TimeUnit::Second, timezone), crate::TimeUnit::MILLISECOND => { DataType::Timestamp(TimeUnit::Millisecond, timezone) } crate::TimeUnit::MICROSECOND => { DataType::Timestamp(TimeUnit::Microsecond, timezone) } - crate::TimeUnit::NANOSECOND => { - DataType::Timestamp(TimeUnit::Nanosecond, timezone) - } + crate::TimeUnit::NANOSECOND => DataType::Timestamp(TimeUnit::Nanosecond, timezone), z => panic!("Timestamp type with unit of {z:?} not supported"), } } crate::Type::Interval => { let interval = field.type_as_interval().unwrap(); match interval.unit() { - crate::IntervalUnit::YEAR_MONTH => { - DataType::Interval(IntervalUnit::YearMonth) - } - crate::IntervalUnit::DAY_TIME => { - DataType::Interval(IntervalUnit::DayTime) - } + crate::IntervalUnit::YEAR_MONTH => DataType::Interval(IntervalUnit::YearMonth), + crate::IntervalUnit::DAY_TIME => DataType::Interval(IntervalUnit::DayTime), crate::IntervalUnit::MONTH_DAY_NANO => { DataType::Interval(IntervalUnit::MonthDayNano) } @@ -775,8 +754,8 @@ pub(crate) fn get_fb_field_type<'a>( UnionMode::Dense => crate::UnionMode::Dense, }; - let fbb_type_ids = fbb - .create_vector(&fields.iter().map(|(t, _)| t as i32).collect::>()); + let fbb_type_ids = + fbb.create_vector(&fields.iter().map(|(t, _)| t as i32).collect::>()); 
let mut builder = crate::UnionBuilder::new(fbb); builder.add_mode(union_mode); builder.add_typeIds(fbb_type_ids); @@ -872,10 +851,7 @@ mod tests { ), Field::new( "timestamp[us]", - DataType::Timestamp( - TimeUnit::Microsecond, - Some("Africa/Johannesburg".into()), - ), + DataType::Timestamp(TimeUnit::Microsecond, Some("Africa/Johannesburg".into())), false, ), Field::new( @@ -900,11 +876,7 @@ mod tests { ), Field::new("utf8", DataType::Utf8, false), Field::new("binary", DataType::Binary, false), - Field::new_list( - "list[u8]", - Field::new("item", DataType::UInt8, false), - true, - ), + Field::new_list("list[u8]", Field::new("item", DataType::UInt8, false), true), Field::new_list( "list[struct]", Field::new_struct( @@ -1013,20 +985,14 @@ mod tests { ), Field::new_dict( "dictionary", - DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Utf8), - ), + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), true, 123, true, ), Field::new_dict( "dictionary", - DataType::Dictionary( - Box::new(DataType::UInt8), - Box::new(DataType::UInt32), - ), + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::UInt32)), true, 123, true, @@ -1056,20 +1022,18 @@ mod tests { // # stripping continuation & length prefix & suffix bytes to get only schema bytes // [x for x in sink.getvalue().to_pybytes()][8:-8] let bytes: Vec = vec![ - 16, 0, 0, 0, 0, 0, 10, 0, 12, 0, 6, 0, 5, 0, 8, 0, 10, 0, 0, 0, 0, 1, 4, 0, - 12, 0, 0, 0, 8, 0, 8, 0, 0, 0, 4, 0, 8, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 20, - 0, 0, 0, 16, 0, 20, 0, 8, 0, 0, 0, 7, 0, 12, 0, 0, 0, 16, 0, 16, 0, 0, 0, 0, - 0, 0, 2, 16, 0, 0, 0, 32, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 102, - 105, 101, 108, 100, 49, 0, 0, 0, 0, 6, 0, 8, 0, 4, 0, 6, 0, 0, 0, 32, 0, 0, - 0, + 16, 0, 0, 0, 0, 0, 10, 0, 12, 0, 6, 0, 5, 0, 8, 0, 10, 0, 0, 0, 0, 1, 4, 0, 12, 0, 0, + 0, 8, 0, 8, 0, 0, 0, 4, 0, 8, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 20, 0, 0, 0, 16, 0, 20, + 0, 8, 0, 0, 0, 7, 0, 12, 0, 0, 0, 16, 0, 16, 0, 0, 0, 0, 0, 0, 2, 16, 0, 0, 0, 32, 0, + 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 102, 105, 101, 108, 100, 49, 0, 0, 0, 0, 6, + 0, 8, 0, 4, 0, 6, 0, 0, 0, 32, 0, 0, 0, ]; let ipc = crate::root_as_message(&bytes).unwrap(); let schema = ipc.header_as_schema().unwrap(); // generate same message with Rust let data_gen = crate::writer::IpcDataGenerator::default(); - let arrow_schema = - Schema::new(vec![Field::new("field1", DataType::UInt32, false)]); + let arrow_schema = Schema::new(vec![Field::new("field1", DataType::UInt32, false)]); let bytes = data_gen .schema_to_bytes(&arrow_schema, &crate::writer::IpcWriteOptions::default()) .ipc_message; diff --git a/arrow-ipc/src/gen/File.rs b/arrow-ipc/src/gen/File.rs index 0e9427813788..c0c2fb183237 100644 --- a/arrow-ipc/src/gen/File.rs +++ b/arrow-ipc/src/gen/File.rs @@ -61,10 +61,7 @@ impl<'b> flatbuffers::Push for Block { type Output = Block; #[inline] unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { - let src = ::core::slice::from_raw_parts( - self as *const Block as *const u8, - Self::size(), - ); + let src = ::core::slice::from_raw_parts(self as *const Block as *const u8, Self::size()); dst.copy_from_slice(src); } } @@ -307,11 +304,7 @@ impl flatbuffers::Verifiable for Footer<'_> { use flatbuffers::Verifiable; v.visit_table(pos)? .visit_field::("version", Self::VT_VERSION, false)? - .visit_field::>( - "schema", - Self::VT_SCHEMA, - false, - )? + .visit_field::>("schema", Self::VT_SCHEMA, false)? 
.visit_field::>>( "dictionaries", Self::VT_DICTIONARIES, @@ -335,9 +328,7 @@ pub struct FooterArgs<'a> { pub dictionaries: Option>>, pub recordBatches: Option>>, pub custom_metadata: Option< - flatbuffers::WIPOffset< - flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>>, - >, + flatbuffers::WIPOffset>>>, >, } impl<'a> Default for FooterArgs<'a> { @@ -360,39 +351,29 @@ pub struct FooterBuilder<'a: 'b, 'b> { impl<'a: 'b, 'b> FooterBuilder<'a, 'b> { #[inline] pub fn add_version(&mut self, version: MetadataVersion) { - self.fbb_.push_slot::( - Footer::VT_VERSION, - version, - MetadataVersion::V1, - ); + self.fbb_ + .push_slot::(Footer::VT_VERSION, version, MetadataVersion::V1); } #[inline] pub fn add_schema(&mut self, schema: flatbuffers::WIPOffset>) { self.fbb_ - .push_slot_always::>( - Footer::VT_SCHEMA, - schema, - ); + .push_slot_always::>(Footer::VT_SCHEMA, schema); } #[inline] pub fn add_dictionaries( &mut self, dictionaries: flatbuffers::WIPOffset>, ) { - self.fbb_.push_slot_always::>( - Footer::VT_DICTIONARIES, - dictionaries, - ); + self.fbb_ + .push_slot_always::>(Footer::VT_DICTIONARIES, dictionaries); } #[inline] pub fn add_recordBatches( &mut self, recordBatches: flatbuffers::WIPOffset>, ) { - self.fbb_.push_slot_always::>( - Footer::VT_RECORDBATCHES, - recordBatches, - ); + self.fbb_ + .push_slot_always::>(Footer::VT_RECORDBATCHES, recordBatches); } #[inline] pub fn add_custom_metadata( @@ -407,9 +388,7 @@ impl<'a: 'b, 'b> FooterBuilder<'a, 'b> { ); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> FooterBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> FooterBuilder<'a, 'b> { let start = _fbb.start_table(); FooterBuilder { fbb_: _fbb, @@ -451,9 +430,7 @@ pub fn root_as_footer(buf: &[u8]) -> Result Result { +pub fn size_prefixed_root_as_footer(buf: &[u8]) -> Result { flatbuffers::size_prefixed_root::
<Footer>
(buf) } #[inline] diff --git a/arrow-ipc/src/gen/Message.rs b/arrow-ipc/src/gen/Message.rs index 2b9f79766e31..a546b54d9170 100644 --- a/arrow-ipc/src/gen/Message.rs +++ b/arrow-ipc/src/gen/Message.rs @@ -380,10 +380,8 @@ impl<'b> flatbuffers::Push for FieldNode { type Output = FieldNode; #[inline] unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { - let src = ::core::slice::from_raw_parts( - self as *const FieldNode as *const u8, - Self::size(), - ); + let src = + ::core::slice::from_raw_parts(self as *const FieldNode as *const u8, Self::size()); dst.copy_from_slice(src); } } @@ -520,10 +518,7 @@ impl<'a> BodyCompression<'a> { // which contains a valid value in this slot unsafe { self._tab - .get::( - BodyCompression::VT_CODEC, - Some(CompressionType::LZ4_FRAME), - ) + .get::(BodyCompression::VT_CODEC, Some(CompressionType::LZ4_FRAME)) .unwrap() } } @@ -594,9 +589,7 @@ impl<'a: 'b, 'b> BodyCompressionBuilder<'a, 'b> { ); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> BodyCompressionBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> BodyCompressionBuilder<'a, 'b> { let start = _fbb.start_table(); BodyCompressionBuilder { fbb_: _fbb, @@ -737,11 +730,23 @@ impl flatbuffers::Verifiable for RecordBatch<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::("length", Self::VT_LENGTH, false)? - .visit_field::>>("nodes", Self::VT_NODES, false)? - .visit_field::>>("buffers", Self::VT_BUFFERS, false)? - .visit_field::>("compression", Self::VT_COMPRESSION, false)? - .finish(); + .visit_field::("length", Self::VT_LENGTH, false)? + .visit_field::>>( + "nodes", + Self::VT_NODES, + false, + )? + .visit_field::>>( + "buffers", + Self::VT_BUFFERS, + false, + )? + .visit_field::>( + "compression", + Self::VT_COMPRESSION, + false, + )? 
+ .finish(); Ok(()) } } @@ -774,10 +779,7 @@ impl<'a: 'b, 'b> RecordBatchBuilder<'a, 'b> { .push_slot::(RecordBatch::VT_LENGTH, length, 0); } #[inline] - pub fn add_nodes( - &mut self, - nodes: flatbuffers::WIPOffset>, - ) { + pub fn add_nodes(&mut self, nodes: flatbuffers::WIPOffset>) { self.fbb_ .push_slot_always::>(RecordBatch::VT_NODES, nodes); } @@ -786,16 +788,11 @@ impl<'a: 'b, 'b> RecordBatchBuilder<'a, 'b> { &mut self, buffers: flatbuffers::WIPOffset>, ) { - self.fbb_.push_slot_always::>( - RecordBatch::VT_BUFFERS, - buffers, - ); + self.fbb_ + .push_slot_always::>(RecordBatch::VT_BUFFERS, buffers); } #[inline] - pub fn add_compression( - &mut self, - compression: flatbuffers::WIPOffset>, - ) { + pub fn add_compression(&mut self, compression: flatbuffers::WIPOffset>) { self.fbb_ .push_slot_always::>( RecordBatch::VT_COMPRESSION, @@ -803,9 +800,7 @@ impl<'a: 'b, 'b> RecordBatchBuilder<'a, 'b> { ); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> RecordBatchBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> RecordBatchBuilder<'a, 'b> { let start = _fbb.start_table(); RecordBatchBuilder { fbb_: _fbb, @@ -892,10 +887,8 @@ impl<'a> DictionaryBatch<'a> { // Created from valid Table for this object // which contains a valid value in this slot unsafe { - self._tab.get::>( - DictionaryBatch::VT_DATA, - None, - ) + self._tab + .get::>(DictionaryBatch::VT_DATA, None) } } /// If isDelta is true the values in the dictionary are to be appended to a @@ -923,11 +916,7 @@ impl flatbuffers::Verifiable for DictionaryBatch<'_> { use flatbuffers::Verifiable; v.visit_table(pos)? .visit_field::("id", Self::VT_ID, false)? - .visit_field::>( - "data", - Self::VT_DATA, - false, - )? + .visit_field::>("data", Self::VT_DATA, false)? .visit_field::("isDelta", Self::VT_ISDELTA, false)? .finish(); Ok(()) @@ -972,9 +961,7 @@ impl<'a: 'b, 'b> DictionaryBatchBuilder<'a, 'b> { .push_slot::(DictionaryBatch::VT_ISDELTA, isDelta, false); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> DictionaryBatchBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> DictionaryBatchBuilder<'a, 'b> { let start = _fbb.start_table(); DictionaryBatchBuilder { fbb_: _fbb, @@ -1186,20 +1173,47 @@ impl flatbuffers::Verifiable for Message<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::("version", Self::VT_VERSION, false)? - .visit_union::("header_type", Self::VT_HEADER_TYPE, "header", Self::VT_HEADER, false, |key, v, pos| { - match key { - MessageHeader::Schema => v.verify_union_variant::>("MessageHeader::Schema", pos), - MessageHeader::DictionaryBatch => v.verify_union_variant::>("MessageHeader::DictionaryBatch", pos), - MessageHeader::RecordBatch => v.verify_union_variant::>("MessageHeader::RecordBatch", pos), - MessageHeader::Tensor => v.verify_union_variant::>("MessageHeader::Tensor", pos), - MessageHeader::SparseTensor => v.verify_union_variant::>("MessageHeader::SparseTensor", pos), - _ => Ok(()), - } - })? - .visit_field::("bodyLength", Self::VT_BODYLENGTH, false)? - .visit_field::>>>("custom_metadata", Self::VT_CUSTOM_METADATA, false)? - .finish(); + .visit_field::("version", Self::VT_VERSION, false)? 
+ .visit_union::( + "header_type", + Self::VT_HEADER_TYPE, + "header", + Self::VT_HEADER, + false, + |key, v, pos| match key { + MessageHeader::Schema => v + .verify_union_variant::>( + "MessageHeader::Schema", + pos, + ), + MessageHeader::DictionaryBatch => v + .verify_union_variant::>( + "MessageHeader::DictionaryBatch", + pos, + ), + MessageHeader::RecordBatch => v + .verify_union_variant::>( + "MessageHeader::RecordBatch", + pos, + ), + MessageHeader::Tensor => v + .verify_union_variant::>( + "MessageHeader::Tensor", + pos, + ), + MessageHeader::SparseTensor => v + .verify_union_variant::>( + "MessageHeader::SparseTensor", + pos, + ), + _ => Ok(()), + }, + )? + .visit_field::("bodyLength", Self::VT_BODYLENGTH, false)? + .visit_field::>, + >>("custom_metadata", Self::VT_CUSTOM_METADATA, false)? + .finish(); Ok(()) } } @@ -1209,9 +1223,7 @@ pub struct MessageArgs<'a> { pub header: Option>, pub bodyLength: i64, pub custom_metadata: Option< - flatbuffers::WIPOffset< - flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>>, - >, + flatbuffers::WIPOffset>>>, >, } impl<'a> Default for MessageArgs<'a> { @@ -1234,11 +1246,8 @@ pub struct MessageBuilder<'a: 'b, 'b> { impl<'a: 'b, 'b> MessageBuilder<'a, 'b> { #[inline] pub fn add_version(&mut self, version: MetadataVersion) { - self.fbb_.push_slot::( - Message::VT_VERSION, - version, - MetadataVersion::V1, - ); + self.fbb_ + .push_slot::(Message::VT_VERSION, version, MetadataVersion::V1); } #[inline] pub fn add_header_type(&mut self, header_type: MessageHeader) { @@ -1249,10 +1258,7 @@ impl<'a: 'b, 'b> MessageBuilder<'a, 'b> { ); } #[inline] - pub fn add_header( - &mut self, - header: flatbuffers::WIPOffset, - ) { + pub fn add_header(&mut self, header: flatbuffers::WIPOffset) { self.fbb_ .push_slot_always::>(Message::VT_HEADER, header); } @@ -1274,9 +1280,7 @@ impl<'a: 'b, 'b> MessageBuilder<'a, 'b> { ); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> MessageBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> MessageBuilder<'a, 'b> { let start = _fbb.start_table(); MessageBuilder { fbb_: _fbb, diff --git a/arrow-ipc/src/gen/Schema.rs b/arrow-ipc/src/gen/Schema.rs index cf3ea0bd4abd..282b38b67195 100644 --- a/arrow-ipc/src/gen/Schema.rs +++ b/arrow-ipc/src/gen/Schema.rs @@ -69,8 +69,7 @@ impl MetadataVersion { pub const ENUM_MIN: i16 = 0; pub const ENUM_MAX: i16 = 4; - pub const ENUM_VALUES: &'static [Self] = - &[Self::V1, Self::V2, Self::V3, Self::V4, Self::V5]; + pub const ENUM_VALUES: &'static [Self] = &[Self::V1, Self::V2, Self::V3, Self::V4, Self::V5]; /// Returns the variant's name or "" if unknown. 
pub fn variant_name(self) -> Option<&'static str> { match self { @@ -1132,10 +1131,7 @@ impl<'b> flatbuffers::Push for Buffer { type Output = Buffer; #[inline] unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { - let src = ::core::slice::from_raw_parts( - self as *const Buffer as *const u8, - Self::size(), - ); + let src = ::core::slice::from_raw_parts(self as *const Buffer as *const u8, Self::size()); dst.copy_from_slice(src); } } @@ -1364,9 +1360,7 @@ pub struct Struct_Builder<'a: 'b, 'b> { } impl<'a: 'b, 'b> Struct_Builder<'a, 'b> { #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> Struct_Builder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> Struct_Builder<'a, 'b> { let start = _fbb.start_table(); Struct_Builder { fbb_: _fbb, @@ -1522,9 +1516,7 @@ pub struct LargeListBuilder<'a: 'b, 'b> { } impl<'a: 'b, 'b> LargeListBuilder<'a, 'b> { #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> LargeListBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> LargeListBuilder<'a, 'b> { let start = _fbb.start_table(); LargeListBuilder { fbb_: _fbb, @@ -1626,9 +1618,7 @@ impl<'a: 'b, 'b> FixedSizeListBuilder<'a, 'b> { .push_slot::(FixedSizeList::VT_LISTSIZE, listSize, 0); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> FixedSizeListBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> FixedSizeListBuilder<'a, 'b> { let start = _fbb.start_table(); FixedSizeListBuilder { fbb_: _fbb, @@ -1888,10 +1878,7 @@ impl<'a: 'b, 'b> UnionBuilder<'a, 'b> { .push_slot::(Union::VT_MODE, mode, UnionMode::Sparse); } #[inline] - pub fn add_typeIds( - &mut self, - typeIds: flatbuffers::WIPOffset>, - ) { + pub fn add_typeIds(&mut self, typeIds: flatbuffers::WIPOffset>) { self.fbb_ .push_slot_always::>(Union::VT_TYPEIDS, typeIds); } @@ -2118,16 +2105,11 @@ pub struct FloatingPointBuilder<'a: 'b, 'b> { impl<'a: 'b, 'b> FloatingPointBuilder<'a, 'b> { #[inline] pub fn add_precision(&mut self, precision: Precision) { - self.fbb_.push_slot::( - FloatingPoint::VT_PRECISION, - precision, - Precision::HALF, - ); + self.fbb_ + .push_slot::(FloatingPoint::VT_PRECISION, precision, Precision::HALF); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> FloatingPointBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> FloatingPointBuilder<'a, 'b> { let start = _fbb.start_table(); FloatingPointBuilder { fbb_: _fbb, @@ -2284,9 +2266,7 @@ pub struct BinaryBuilder<'a: 'b, 'b> { } impl<'a: 'b, 'b> BinaryBuilder<'a, 'b> { #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> BinaryBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> BinaryBuilder<'a, 'b> { let start = _fbb.start_table(); BinaryBuilder { fbb_: _fbb, @@ -2365,9 +2345,7 @@ pub struct LargeUtf8Builder<'a: 'b, 'b> { } impl<'a: 'b, 'b> LargeUtf8Builder<'a, 'b> { #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> LargeUtf8Builder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> LargeUtf8Builder<'a, 'b> { let start = _fbb.start_table(); LargeUtf8Builder { fbb_: _fbb, @@ -2446,9 +2424,7 @@ pub struct LargeBinaryBuilder<'a: 'b, 'b> { } impl<'a: 'b, 'b> LargeBinaryBuilder<'a, 'b> { #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> LargeBinaryBuilder<'a, 'b> { + pub fn new(_fbb: 
&'b mut flatbuffers::FlatBufferBuilder<'a>) -> LargeBinaryBuilder<'a, 'b> { let start = _fbb.start_table(); LargeBinaryBuilder { fbb_: _fbb, @@ -2550,9 +2526,7 @@ impl<'a: 'b, 'b> FixedSizeBinaryBuilder<'a, 'b> { .push_slot::(FixedSizeBinary::VT_BYTEWIDTH, byteWidth, 0); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> FixedSizeBinaryBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> FixedSizeBinaryBuilder<'a, 'b> { let start = _fbb.start_table(); FixedSizeBinaryBuilder { fbb_: _fbb, @@ -2712,9 +2686,7 @@ pub struct RunEndEncodedBuilder<'a: 'b, 'b> { } impl<'a: 'b, 'b> RunEndEncodedBuilder<'a, 'b> { #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> RunEndEncodedBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> RunEndEncodedBuilder<'a, 'b> { let start = _fbb.start_table(); RunEndEncodedBuilder { fbb_: _fbb, @@ -2862,9 +2834,7 @@ impl<'a: 'b, 'b> DecimalBuilder<'a, 'b> { .push_slot::(Decimal::VT_BITWIDTH, bitWidth, 128); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> DecimalBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> DecimalBuilder<'a, 'b> { let start = _fbb.start_table(); DecimalBuilder { fbb_: _fbb, @@ -3352,15 +3322,11 @@ impl<'a: 'b, 'b> TimestampBuilder<'a, 'b> { } #[inline] pub fn add_timezone(&mut self, timezone: flatbuffers::WIPOffset<&'b str>) { - self.fbb_.push_slot_always::>( - Timestamp::VT_TIMEZONE, - timezone, - ); + self.fbb_ + .push_slot_always::>(Timestamp::VT_TIMEZONE, timezone); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> TimestampBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> TimestampBuilder<'a, 'b> { let start = _fbb.start_table(); TimestampBuilder { fbb_: _fbb, @@ -3461,16 +3427,11 @@ pub struct IntervalBuilder<'a: 'b, 'b> { impl<'a: 'b, 'b> IntervalBuilder<'a, 'b> { #[inline] pub fn add_unit(&mut self, unit: IntervalUnit) { - self.fbb_.push_slot::( - Interval::VT_UNIT, - unit, - IntervalUnit::YEAR_MONTH, - ); + self.fbb_ + .push_slot::(Interval::VT_UNIT, unit, IntervalUnit::YEAR_MONTH); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> IntervalBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> IntervalBuilder<'a, 'b> { let start = _fbb.start_table(); IntervalBuilder { fbb_: _fbb, @@ -3574,9 +3535,7 @@ impl<'a: 'b, 'b> DurationBuilder<'a, 'b> { .push_slot::(Duration::VT_UNIT, unit, TimeUnit::MILLISECOND); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> DurationBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> DurationBuilder<'a, 'b> { let start = _fbb.start_table(); DurationBuilder { fbb_: _fbb, @@ -3670,16 +3629,8 @@ impl flatbuffers::Verifiable for KeyValue<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::>( - "key", - Self::VT_KEY, - false, - )? - .visit_field::>( - "value", - Self::VT_VALUE, - false, - )? + .visit_field::>("key", Self::VT_KEY, false)? + .visit_field::>("value", Self::VT_VALUE, false)? 
.finish(); Ok(()) } @@ -3714,9 +3665,7 @@ impl<'a: 'b, 'b> KeyValueBuilder<'a, 'b> { .push_slot_always::>(KeyValue::VT_VALUE, value); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> KeyValueBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> KeyValueBuilder<'a, 'b> { let start = _fbb.start_table(); KeyValueBuilder { fbb_: _fbb, @@ -3805,10 +3754,8 @@ impl<'a> DictionaryEncoding<'a> { // Created from valid Table for this object // which contains a valid value in this slot unsafe { - self._tab.get::>( - DictionaryEncoding::VT_INDEXTYPE, - None, - ) + self._tab + .get::>(DictionaryEncoding::VT_INDEXTYPE, None) } } /// By default, dictionaries are not ordered, or the order does not have @@ -3857,11 +3804,7 @@ impl flatbuffers::Verifiable for DictionaryEncoding<'_> { false, )? .visit_field::("isOrdered", Self::VT_ISORDERED, false)? - .visit_field::( - "dictionaryKind", - Self::VT_DICTIONARYKIND, - false, - )? + .visit_field::("dictionaryKind", Self::VT_DICTIONARYKIND, false)? .finish(); Ok(()) } @@ -4041,10 +3984,7 @@ impl<'a> Field<'a> { // which contains a valid value in this slot unsafe { self._tab - .get::>>( - Field::VT_TYPE_, - None, - ) + .get::>>(Field::VT_TYPE_, None) } } /// Present only if the field is dictionary encoded. @@ -4055,10 +3995,7 @@ impl<'a> Field<'a> { // which contains a valid value in this slot unsafe { self._tab - .get::>( - Field::VT_DICTIONARY, - None, - ) + .get::>(Field::VT_DICTIONARY, None) } } /// children apply only to nested data types like Struct, List and Union. For @@ -4429,39 +4366,130 @@ impl flatbuffers::Verifiable for Field<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::>("name", Self::VT_NAME, false)? - .visit_field::("nullable", Self::VT_NULLABLE, false)? - .visit_union::("type_type", Self::VT_TYPE_TYPE, "type_", Self::VT_TYPE_, false, |key, v, pos| { - match key { - Type::Null => v.verify_union_variant::>("Type::Null", pos), - Type::Int => v.verify_union_variant::>("Type::Int", pos), - Type::FloatingPoint => v.verify_union_variant::>("Type::FloatingPoint", pos), - Type::Binary => v.verify_union_variant::>("Type::Binary", pos), - Type::Utf8 => v.verify_union_variant::>("Type::Utf8", pos), - Type::Bool => v.verify_union_variant::>("Type::Bool", pos), - Type::Decimal => v.verify_union_variant::>("Type::Decimal", pos), - Type::Date => v.verify_union_variant::>("Type::Date", pos), - Type::Time => v.verify_union_variant::>("Type::Time", pos), - Type::Timestamp => v.verify_union_variant::>("Type::Timestamp", pos), - Type::Interval => v.verify_union_variant::>("Type::Interval", pos), - Type::List => v.verify_union_variant::>("Type::List", pos), - Type::Struct_ => v.verify_union_variant::>("Type::Struct_", pos), - Type::Union => v.verify_union_variant::>("Type::Union", pos), - Type::FixedSizeBinary => v.verify_union_variant::>("Type::FixedSizeBinary", pos), - Type::FixedSizeList => v.verify_union_variant::>("Type::FixedSizeList", pos), - Type::Map => v.verify_union_variant::>("Type::Map", pos), - Type::Duration => v.verify_union_variant::>("Type::Duration", pos), - Type::LargeBinary => v.verify_union_variant::>("Type::LargeBinary", pos), - Type::LargeUtf8 => v.verify_union_variant::>("Type::LargeUtf8", pos), - Type::LargeList => v.verify_union_variant::>("Type::LargeList", pos), - Type::RunEndEncoded => v.verify_union_variant::>("Type::RunEndEncoded", pos), - _ => Ok(()), - } - })? 
- .visit_field::>("dictionary", Self::VT_DICTIONARY, false)? - .visit_field::>>>("children", Self::VT_CHILDREN, false)? - .visit_field::>>>("custom_metadata", Self::VT_CUSTOM_METADATA, false)? - .finish(); + .visit_field::>("name", Self::VT_NAME, false)? + .visit_field::("nullable", Self::VT_NULLABLE, false)? + .visit_union::( + "type_type", + Self::VT_TYPE_TYPE, + "type_", + Self::VT_TYPE_, + false, + |key, v, pos| match key { + Type::Null => v.verify_union_variant::>( + "Type::Null", + pos, + ), + Type::Int => v.verify_union_variant::>( + "Type::Int", + pos, + ), + Type::FloatingPoint => v + .verify_union_variant::>( + "Type::FloatingPoint", + pos, + ), + Type::Binary => v.verify_union_variant::>( + "Type::Binary", + pos, + ), + Type::Utf8 => v.verify_union_variant::>( + "Type::Utf8", + pos, + ), + Type::Bool => v.verify_union_variant::>( + "Type::Bool", + pos, + ), + Type::Decimal => v + .verify_union_variant::>( + "Type::Decimal", + pos, + ), + Type::Date => v.verify_union_variant::>( + "Type::Date", + pos, + ), + Type::Time => v.verify_union_variant::>( + "Type::Time", + pos, + ), + Type::Timestamp => v + .verify_union_variant::>( + "Type::Timestamp", + pos, + ), + Type::Interval => v + .verify_union_variant::>( + "Type::Interval", + pos, + ), + Type::List => v.verify_union_variant::>( + "Type::List", + pos, + ), + Type::Struct_ => v + .verify_union_variant::>( + "Type::Struct_", + pos, + ), + Type::Union => v.verify_union_variant::>( + "Type::Union", + pos, + ), + Type::FixedSizeBinary => v + .verify_union_variant::>( + "Type::FixedSizeBinary", + pos, + ), + Type::FixedSizeList => v + .verify_union_variant::>( + "Type::FixedSizeList", + pos, + ), + Type::Map => v.verify_union_variant::>( + "Type::Map", + pos, + ), + Type::Duration => v + .verify_union_variant::>( + "Type::Duration", + pos, + ), + Type::LargeBinary => v + .verify_union_variant::>( + "Type::LargeBinary", + pos, + ), + Type::LargeUtf8 => v + .verify_union_variant::>( + "Type::LargeUtf8", + pos, + ), + Type::LargeList => v + .verify_union_variant::>( + "Type::LargeList", + pos, + ), + Type::RunEndEncoded => v + .verify_union_variant::>( + "Type::RunEndEncoded", + pos, + ), + _ => Ok(()), + }, + )? + .visit_field::>( + "dictionary", + Self::VT_DICTIONARY, + false, + )? + .visit_field::>, + >>("children", Self::VT_CHILDREN, false)? + .visit_field::>, + >>("custom_metadata", Self::VT_CUSTOM_METADATA, false)? 
+ .finish(); Ok(()) } } @@ -4472,14 +4500,10 @@ pub struct FieldArgs<'a> { pub type_: Option>, pub dictionary: Option>>, pub children: Option< - flatbuffers::WIPOffset< - flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>>, - >, + flatbuffers::WIPOffset>>>, >, pub custom_metadata: Option< - flatbuffers::WIPOffset< - flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>>, - >, + flatbuffers::WIPOffset>>>, >, } impl<'a> Default for FieldArgs<'a> { @@ -4518,18 +4542,12 @@ impl<'a: 'b, 'b> FieldBuilder<'a, 'b> { .push_slot::(Field::VT_TYPE_TYPE, type_type, Type::NONE); } #[inline] - pub fn add_type_( - &mut self, - type_: flatbuffers::WIPOffset, - ) { + pub fn add_type_(&mut self, type_: flatbuffers::WIPOffset) { self.fbb_ .push_slot_always::>(Field::VT_TYPE_, type_); } #[inline] - pub fn add_dictionary( - &mut self, - dictionary: flatbuffers::WIPOffset>, - ) { + pub fn add_dictionary(&mut self, dictionary: flatbuffers::WIPOffset>) { self.fbb_ .push_slot_always::>( Field::VT_DICTIONARY, @@ -4923,25 +4941,29 @@ impl flatbuffers::Verifiable for Schema<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_field::("endianness", Self::VT_ENDIANNESS, false)? - .visit_field::>>>("fields", Self::VT_FIELDS, false)? - .visit_field::>>>("custom_metadata", Self::VT_CUSTOM_METADATA, false)? - .visit_field::>>("features", Self::VT_FEATURES, false)? - .finish(); + .visit_field::("endianness", Self::VT_ENDIANNESS, false)? + .visit_field::>, + >>("fields", Self::VT_FIELDS, false)? + .visit_field::>, + >>("custom_metadata", Self::VT_CUSTOM_METADATA, false)? + .visit_field::>>( + "features", + Self::VT_FEATURES, + false, + )? + .finish(); Ok(()) } } pub struct SchemaArgs<'a> { pub endianness: Endianness, pub fields: Option< - flatbuffers::WIPOffset< - flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>>, - >, + flatbuffers::WIPOffset>>>, >, pub custom_metadata: Option< - flatbuffers::WIPOffset< - flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>>, - >, + flatbuffers::WIPOffset>>>, >, pub features: Option>>, } @@ -4964,11 +4986,8 @@ pub struct SchemaBuilder<'a: 'b, 'b> { impl<'a: 'b, 'b> SchemaBuilder<'a, 'b> { #[inline] pub fn add_endianness(&mut self, endianness: Endianness) { - self.fbb_.push_slot::( - Schema::VT_ENDIANNESS, - endianness, - Endianness::Little, - ); + self.fbb_ + .push_slot::(Schema::VT_ENDIANNESS, endianness, Endianness::Little); } #[inline] pub fn add_fields( @@ -5001,9 +5020,7 @@ impl<'a: 'b, 'b> SchemaBuilder<'a, 'b> { .push_slot_always::>(Schema::VT_FEATURES, features); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> SchemaBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> SchemaBuilder<'a, 'b> { let start = _fbb.start_table(); SchemaBuilder { fbb_: _fbb, @@ -5044,9 +5061,7 @@ pub fn root_as_schema(buf: &[u8]) -> Result Result { +pub fn size_prefixed_root_as_schema(buf: &[u8]) -> Result { flatbuffers::size_prefixed_root::(buf) } #[inline] diff --git a/arrow-ipc/src/gen/SparseTensor.rs b/arrow-ipc/src/gen/SparseTensor.rs index 83fed4873b62..e03510ec0c8d 100644 --- a/arrow-ipc/src/gen/SparseTensor.rs +++ b/arrow-ipc/src/gen/SparseTensor.rs @@ -425,18 +425,13 @@ impl<'a: 'b, 'b> SparseTensorIndexCOOBuilder<'a, 'b> { } #[inline] pub fn add_indicesBuffer(&mut self, indicesBuffer: &Buffer) { - self.fbb_.push_slot_always::<&Buffer>( - SparseTensorIndexCOO::VT_INDICESBUFFER, - indicesBuffer, - ); + self.fbb_ + 
.push_slot_always::<&Buffer>(SparseTensorIndexCOO::VT_INDICESBUFFER, indicesBuffer); } #[inline] pub fn add_isCanonical(&mut self, isCanonical: bool) { - self.fbb_.push_slot::( - SparseTensorIndexCOO::VT_ISCANONICAL, - isCanonical, - false, - ); + self.fbb_ + .push_slot::(SparseTensorIndexCOO::VT_ISCANONICAL, isCanonical, false); } #[inline] pub fn new( @@ -543,10 +538,7 @@ impl<'a> SparseMatrixIndexCSX<'a> { // which contains a valid value in this slot unsafe { self._tab - .get::>( - SparseMatrixIndexCSX::VT_INDPTRTYPE, - None, - ) + .get::>(SparseMatrixIndexCSX::VT_INDPTRTYPE, None) .unwrap() } } @@ -692,10 +684,8 @@ impl<'a: 'b, 'b> SparseMatrixIndexCSXBuilder<'a, 'b> { } #[inline] pub fn add_indptrBuffer(&mut self, indptrBuffer: &Buffer) { - self.fbb_.push_slot_always::<&Buffer>( - SparseMatrixIndexCSX::VT_INDPTRBUFFER, - indptrBuffer, - ); + self.fbb_ + .push_slot_always::<&Buffer>(SparseMatrixIndexCSX::VT_INDPTRBUFFER, indptrBuffer); } #[inline] pub fn add_indicesType(&mut self, indicesType: flatbuffers::WIPOffset>) { @@ -706,10 +696,8 @@ impl<'a: 'b, 'b> SparseMatrixIndexCSXBuilder<'a, 'b> { } #[inline] pub fn add_indicesBuffer(&mut self, indicesBuffer: &Buffer) { - self.fbb_.push_slot_always::<&Buffer>( - SparseMatrixIndexCSX::VT_INDICESBUFFER, - indicesBuffer, - ); + self.fbb_ + .push_slot_always::<&Buffer>(SparseMatrixIndexCSX::VT_INDICESBUFFER, indicesBuffer); } #[inline] pub fn new( @@ -838,10 +826,7 @@ impl<'a> SparseTensorIndexCSF<'a> { // which contains a valid value in this slot unsafe { self._tab - .get::>( - SparseTensorIndexCSF::VT_INDPTRTYPE, - None, - ) + .get::>(SparseTensorIndexCSF::VT_INDPTRTYPE, None) .unwrap() } } @@ -1163,9 +1148,7 @@ impl<'a> SparseTensor<'a> { } /// The dimensions of the tensor, optionally named. #[inline] - pub fn shape( - &self, - ) -> flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>> { + pub fn shape(&self) -> flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>> { // Safety: // Created from valid Table for this object // which contains a valid value in this slot @@ -1540,9 +1523,7 @@ impl<'a> SparseTensor<'a> { #[inline] #[allow(non_snake_case)] - pub fn sparseIndex_as_sparse_tensor_index_coo( - &self, - ) -> Option> { + pub fn sparseIndex_as_sparse_tensor_index_coo(&self) -> Option> { if self.sparseIndex_type() == SparseTensorIndex::SparseTensorIndexCOO { let u = self.sparseIndex(); // Safety: @@ -1556,9 +1537,7 @@ impl<'a> SparseTensor<'a> { #[inline] #[allow(non_snake_case)] - pub fn sparseIndex_as_sparse_matrix_index_csx( - &self, - ) -> Option> { + pub fn sparseIndex_as_sparse_matrix_index_csx(&self) -> Option> { if self.sparseIndex_type() == SparseTensorIndex::SparseMatrixIndexCSX { let u = self.sparseIndex(); // Safety: @@ -1572,9 +1551,7 @@ impl<'a> SparseTensor<'a> { #[inline] #[allow(non_snake_case)] - pub fn sparseIndex_as_sparse_tensor_index_csf( - &self, - ) -> Option> { + pub fn sparseIndex_as_sparse_tensor_index_csf(&self) -> Option> { if self.sparseIndex_type() == SparseTensorIndex::SparseTensorIndexCSF { let u = self.sparseIndex(); // Safety: @@ -1595,45 +1572,147 @@ impl flatbuffers::Verifiable for SparseTensor<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? 
- .visit_union::("type_type", Self::VT_TYPE_TYPE, "type_", Self::VT_TYPE_, true, |key, v, pos| { - match key { - Type::Null => v.verify_union_variant::>("Type::Null", pos), - Type::Int => v.verify_union_variant::>("Type::Int", pos), - Type::FloatingPoint => v.verify_union_variant::>("Type::FloatingPoint", pos), - Type::Binary => v.verify_union_variant::>("Type::Binary", pos), - Type::Utf8 => v.verify_union_variant::>("Type::Utf8", pos), - Type::Bool => v.verify_union_variant::>("Type::Bool", pos), - Type::Decimal => v.verify_union_variant::>("Type::Decimal", pos), - Type::Date => v.verify_union_variant::>("Type::Date", pos), - Type::Time => v.verify_union_variant::>("Type::Time", pos), - Type::Timestamp => v.verify_union_variant::>("Type::Timestamp", pos), - Type::Interval => v.verify_union_variant::>("Type::Interval", pos), - Type::List => v.verify_union_variant::>("Type::List", pos), - Type::Struct_ => v.verify_union_variant::>("Type::Struct_", pos), - Type::Union => v.verify_union_variant::>("Type::Union", pos), - Type::FixedSizeBinary => v.verify_union_variant::>("Type::FixedSizeBinary", pos), - Type::FixedSizeList => v.verify_union_variant::>("Type::FixedSizeList", pos), - Type::Map => v.verify_union_variant::>("Type::Map", pos), - Type::Duration => v.verify_union_variant::>("Type::Duration", pos), - Type::LargeBinary => v.verify_union_variant::>("Type::LargeBinary", pos), - Type::LargeUtf8 => v.verify_union_variant::>("Type::LargeUtf8", pos), - Type::LargeList => v.verify_union_variant::>("Type::LargeList", pos), - Type::RunEndEncoded => v.verify_union_variant::>("Type::RunEndEncoded", pos), - _ => Ok(()), - } - })? - .visit_field::>>>("shape", Self::VT_SHAPE, true)? - .visit_field::("non_zero_length", Self::VT_NON_ZERO_LENGTH, false)? - .visit_union::("sparseIndex_type", Self::VT_SPARSEINDEX_TYPE, "sparseIndex", Self::VT_SPARSEINDEX, true, |key, v, pos| { - match key { - SparseTensorIndex::SparseTensorIndexCOO => v.verify_union_variant::>("SparseTensorIndex::SparseTensorIndexCOO", pos), - SparseTensorIndex::SparseMatrixIndexCSX => v.verify_union_variant::>("SparseTensorIndex::SparseMatrixIndexCSX", pos), - SparseTensorIndex::SparseTensorIndexCSF => v.verify_union_variant::>("SparseTensorIndex::SparseTensorIndexCSF", pos), - _ => Ok(()), - } - })? - .visit_field::("data", Self::VT_DATA, true)? 
- .finish(); + .visit_union::( + "type_type", + Self::VT_TYPE_TYPE, + "type_", + Self::VT_TYPE_, + true, + |key, v, pos| match key { + Type::Null => v.verify_union_variant::>( + "Type::Null", + pos, + ), + Type::Int => v.verify_union_variant::>( + "Type::Int", + pos, + ), + Type::FloatingPoint => v + .verify_union_variant::>( + "Type::FloatingPoint", + pos, + ), + Type::Binary => v.verify_union_variant::>( + "Type::Binary", + pos, + ), + Type::Utf8 => v.verify_union_variant::>( + "Type::Utf8", + pos, + ), + Type::Bool => v.verify_union_variant::>( + "Type::Bool", + pos, + ), + Type::Decimal => v + .verify_union_variant::>( + "Type::Decimal", + pos, + ), + Type::Date => v.verify_union_variant::>( + "Type::Date", + pos, + ), + Type::Time => v.verify_union_variant::>( + "Type::Time", + pos, + ), + Type::Timestamp => v + .verify_union_variant::>( + "Type::Timestamp", + pos, + ), + Type::Interval => v + .verify_union_variant::>( + "Type::Interval", + pos, + ), + Type::List => v.verify_union_variant::>( + "Type::List", + pos, + ), + Type::Struct_ => v + .verify_union_variant::>( + "Type::Struct_", + pos, + ), + Type::Union => v.verify_union_variant::>( + "Type::Union", + pos, + ), + Type::FixedSizeBinary => v + .verify_union_variant::>( + "Type::FixedSizeBinary", + pos, + ), + Type::FixedSizeList => v + .verify_union_variant::>( + "Type::FixedSizeList", + pos, + ), + Type::Map => v.verify_union_variant::>( + "Type::Map", + pos, + ), + Type::Duration => v + .verify_union_variant::>( + "Type::Duration", + pos, + ), + Type::LargeBinary => v + .verify_union_variant::>( + "Type::LargeBinary", + pos, + ), + Type::LargeUtf8 => v + .verify_union_variant::>( + "Type::LargeUtf8", + pos, + ), + Type::LargeList => v + .verify_union_variant::>( + "Type::LargeList", + pos, + ), + Type::RunEndEncoded => v + .verify_union_variant::>( + "Type::RunEndEncoded", + pos, + ), + _ => Ok(()), + }, + )? + .visit_field::>, + >>("shape", Self::VT_SHAPE, true)? + .visit_field::("non_zero_length", Self::VT_NON_ZERO_LENGTH, false)? + .visit_union::( + "sparseIndex_type", + Self::VT_SPARSEINDEX_TYPE, + "sparseIndex", + Self::VT_SPARSEINDEX, + true, + |key, v, pos| match key { + SparseTensorIndex::SparseTensorIndexCOO => v + .verify_union_variant::>( + "SparseTensorIndex::SparseTensorIndexCOO", + pos, + ), + SparseTensorIndex::SparseMatrixIndexCSX => v + .verify_union_variant::>( + "SparseTensorIndex::SparseMatrixIndexCSX", + pos, + ), + SparseTensorIndex::SparseTensorIndexCSF => v + .verify_union_variant::>( + "SparseTensorIndex::SparseTensorIndexCSF", + pos, + ), + _ => Ok(()), + }, + )? + .visit_field::("data", Self::VT_DATA, true)? + .finish(); Ok(()) } } @@ -1676,10 +1755,7 @@ impl<'a: 'b, 'b> SparseTensorBuilder<'a, 'b> { .push_slot::(SparseTensor::VT_TYPE_TYPE, type_type, Type::NONE); } #[inline] - pub fn add_type_( - &mut self, - type_: flatbuffers::WIPOffset, - ) { + pub fn add_type_(&mut self, type_: flatbuffers::WIPOffset) { self.fbb_ .push_slot_always::>(SparseTensor::VT_TYPE_, type_); } @@ -1722,9 +1798,7 @@ impl<'a: 'b, 'b> SparseTensorBuilder<'a, 'b> { .push_slot_always::<&Buffer>(SparseTensor::VT_DATA, data); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> SparseTensorBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> SparseTensorBuilder<'a, 'b> { let start = _fbb.start_table(); SparseTensorBuilder { fbb_: _fbb, @@ -2023,9 +2097,7 @@ impl core::fmt::Debug for SparseTensor<'_> { /// catch every error, or be maximally performant. 
For the /// previous, unchecked, behavior use /// `root_as_sparse_tensor_unchecked`. -pub fn root_as_sparse_tensor( - buf: &[u8], -) -> Result { +pub fn root_as_sparse_tensor(buf: &[u8]) -> Result { flatbuffers::root::(buf) } #[inline] diff --git a/arrow-ipc/src/gen/Tensor.rs b/arrow-ipc/src/gen/Tensor.rs index 43133fec036d..1766d95144c2 100644 --- a/arrow-ipc/src/gen/Tensor.rs +++ b/arrow-ipc/src/gen/Tensor.rs @@ -94,11 +94,7 @@ impl flatbuffers::Verifiable for TensorDim<'_> { use flatbuffers::Verifiable; v.visit_table(pos)? .visit_field::("size_", Self::VT_SIZE_, false)? - .visit_field::>( - "name", - Self::VT_NAME, - false, - )? + .visit_field::>("name", Self::VT_NAME, false)? .finish(); Ok(()) } @@ -132,9 +128,7 @@ impl<'a: 'b, 'b> TensorDimBuilder<'a, 'b> { .push_slot_always::>(TensorDim::VT_NAME, name); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> TensorDimBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> TensorDimBuilder<'a, 'b> { let start = _fbb.start_table(); TensorDimBuilder { fbb_: _fbb, @@ -226,18 +220,13 @@ impl<'a> Tensor<'a> { // which contains a valid value in this slot unsafe { self._tab - .get::>>( - Tensor::VT_TYPE_, - None, - ) + .get::>>(Tensor::VT_TYPE_, None) .unwrap() } } /// The dimensions of the tensor, optionally named #[inline] - pub fn shape( - &self, - ) -> flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>> { + pub fn shape(&self) -> flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>> { // Safety: // Created from valid Table for this object // which contains a valid value in this slot @@ -589,37 +578,126 @@ impl flatbuffers::Verifiable for Tensor<'_> { ) -> Result<(), flatbuffers::InvalidFlatbuffer> { use flatbuffers::Verifiable; v.visit_table(pos)? - .visit_union::("type_type", Self::VT_TYPE_TYPE, "type_", Self::VT_TYPE_, true, |key, v, pos| { - match key { - Type::Null => v.verify_union_variant::>("Type::Null", pos), - Type::Int => v.verify_union_variant::>("Type::Int", pos), - Type::FloatingPoint => v.verify_union_variant::>("Type::FloatingPoint", pos), - Type::Binary => v.verify_union_variant::>("Type::Binary", pos), - Type::Utf8 => v.verify_union_variant::>("Type::Utf8", pos), - Type::Bool => v.verify_union_variant::>("Type::Bool", pos), - Type::Decimal => v.verify_union_variant::>("Type::Decimal", pos), - Type::Date => v.verify_union_variant::>("Type::Date", pos), - Type::Time => v.verify_union_variant::>("Type::Time", pos), - Type::Timestamp => v.verify_union_variant::>("Type::Timestamp", pos), - Type::Interval => v.verify_union_variant::>("Type::Interval", pos), - Type::List => v.verify_union_variant::>("Type::List", pos), - Type::Struct_ => v.verify_union_variant::>("Type::Struct_", pos), - Type::Union => v.verify_union_variant::>("Type::Union", pos), - Type::FixedSizeBinary => v.verify_union_variant::>("Type::FixedSizeBinary", pos), - Type::FixedSizeList => v.verify_union_variant::>("Type::FixedSizeList", pos), - Type::Map => v.verify_union_variant::>("Type::Map", pos), - Type::Duration => v.verify_union_variant::>("Type::Duration", pos), - Type::LargeBinary => v.verify_union_variant::>("Type::LargeBinary", pos), - Type::LargeUtf8 => v.verify_union_variant::>("Type::LargeUtf8", pos), - Type::LargeList => v.verify_union_variant::>("Type::LargeList", pos), - Type::RunEndEncoded => v.verify_union_variant::>("Type::RunEndEncoded", pos), - _ => Ok(()), - } - })? - .visit_field::>>>("shape", Self::VT_SHAPE, true)? - .visit_field::>>("strides", Self::VT_STRIDES, false)? 
- .visit_field::("data", Self::VT_DATA, true)? - .finish(); + .visit_union::( + "type_type", + Self::VT_TYPE_TYPE, + "type_", + Self::VT_TYPE_, + true, + |key, v, pos| match key { + Type::Null => v.verify_union_variant::>( + "Type::Null", + pos, + ), + Type::Int => v.verify_union_variant::>( + "Type::Int", + pos, + ), + Type::FloatingPoint => v + .verify_union_variant::>( + "Type::FloatingPoint", + pos, + ), + Type::Binary => v.verify_union_variant::>( + "Type::Binary", + pos, + ), + Type::Utf8 => v.verify_union_variant::>( + "Type::Utf8", + pos, + ), + Type::Bool => v.verify_union_variant::>( + "Type::Bool", + pos, + ), + Type::Decimal => v + .verify_union_variant::>( + "Type::Decimal", + pos, + ), + Type::Date => v.verify_union_variant::>( + "Type::Date", + pos, + ), + Type::Time => v.verify_union_variant::>( + "Type::Time", + pos, + ), + Type::Timestamp => v + .verify_union_variant::>( + "Type::Timestamp", + pos, + ), + Type::Interval => v + .verify_union_variant::>( + "Type::Interval", + pos, + ), + Type::List => v.verify_union_variant::>( + "Type::List", + pos, + ), + Type::Struct_ => v + .verify_union_variant::>( + "Type::Struct_", + pos, + ), + Type::Union => v.verify_union_variant::>( + "Type::Union", + pos, + ), + Type::FixedSizeBinary => v + .verify_union_variant::>( + "Type::FixedSizeBinary", + pos, + ), + Type::FixedSizeList => v + .verify_union_variant::>( + "Type::FixedSizeList", + pos, + ), + Type::Map => v.verify_union_variant::>( + "Type::Map", + pos, + ), + Type::Duration => v + .verify_union_variant::>( + "Type::Duration", + pos, + ), + Type::LargeBinary => v + .verify_union_variant::>( + "Type::LargeBinary", + pos, + ), + Type::LargeUtf8 => v + .verify_union_variant::>( + "Type::LargeUtf8", + pos, + ), + Type::LargeList => v + .verify_union_variant::>( + "Type::LargeList", + pos, + ), + Type::RunEndEncoded => v + .verify_union_variant::>( + "Type::RunEndEncoded", + pos, + ), + _ => Ok(()), + }, + )? + .visit_field::>, + >>("shape", Self::VT_SHAPE, true)? + .visit_field::>>( + "strides", + Self::VT_STRIDES, + false, + )? + .visit_field::("data", Self::VT_DATA, true)? 
+ .finish(); Ok(()) } } @@ -658,10 +736,7 @@ impl<'a: 'b, 'b> TensorBuilder<'a, 'b> { .push_slot::(Tensor::VT_TYPE_TYPE, type_type, Type::NONE); } #[inline] - pub fn add_type_( - &mut self, - type_: flatbuffers::WIPOffset, - ) { + pub fn add_type_(&mut self, type_: flatbuffers::WIPOffset) { self.fbb_ .push_slot_always::>(Tensor::VT_TYPE_, type_); } @@ -676,10 +751,7 @@ impl<'a: 'b, 'b> TensorBuilder<'a, 'b> { .push_slot_always::>(Tensor::VT_SHAPE, shape); } #[inline] - pub fn add_strides( - &mut self, - strides: flatbuffers::WIPOffset>, - ) { + pub fn add_strides(&mut self, strides: flatbuffers::WIPOffset>) { self.fbb_ .push_slot_always::>(Tensor::VT_STRIDES, strides); } @@ -688,9 +760,7 @@ impl<'a: 'b, 'b> TensorBuilder<'a, 'b> { self.fbb_.push_slot_always::<&Buffer>(Tensor::VT_DATA, data); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> TensorBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> TensorBuilder<'a, 'b> { let start = _fbb.start_table(); TensorBuilder { fbb_: _fbb, @@ -960,9 +1030,7 @@ pub fn root_as_tensor(buf: &[u8]) -> Result Result { +pub fn size_prefixed_root_as_tensor(buf: &[u8]) -> Result { flatbuffers::size_prefixed_root::(buf) } #[inline] diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 75c91be21dde..6f2cb30a1629 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -141,19 +141,13 @@ fn create_array(reader: &mut ArrayReader, field: &Field) -> Result { let union_node = reader.next_node(field)?; @@ -232,9 +226,7 @@ fn create_primitive_array( .null_bit_buffer(null_buffer) .build_aligned()? } - _ if data_type.is_primitive() - || matches!(data_type, Boolean | FixedSizeBinary(_)) => - { + _ if data_type.is_primitive() || matches!(data_type, Boolean | FixedSizeBinary(_)) => { // read 2 buffers: null buffer (optional) and data buffer ArrayData::builder(data_type.clone()) .len(length) @@ -560,10 +552,7 @@ impl FileReader { /// /// Returns errors if the file does not meet the Arrow Format header and footer /// requirements - pub fn try_new( - reader: R, - projection: Option>, - ) -> Result { + pub fn try_new(reader: R, projection: Option>) -> Result { let mut reader = BufReader::new(reader); // check if header and footer contain correct magic bytes let mut magic_buffer: [u8; 6] = [0; 6]; @@ -596,9 +585,7 @@ impl FileReader { })?; let blocks = footer.recordBatches().ok_or_else(|| { - ArrowError::ParseError( - "Unable to get record batches from IPC Footer".to_string(), - ) + ArrowError::ParseError("Unable to get record batches from IPC Footer".to_string()) })?; let total_blocks = blocks.len(); @@ -633,9 +620,7 @@ impl FileReader { reader.read_exact(&mut block_data)?; let message = crate::root_as_message(&block_data[..]).map_err(|err| { - ArrowError::ParseError(format!( - "Unable to get root as message: {err:?}" - )) + ArrowError::ParseError(format!("Unable to get root as message: {err:?}")) })?; match message.header_type() { @@ -643,8 +628,7 @@ impl FileReader { let batch = message.header_as_dictionary_batch().unwrap(); // read the block that makes up the dictionary batch into a buffer - let mut buf = - MutableBuffer::from_len_zeroed(message.bodyLength() as usize); + let mut buf = MutableBuffer::from_len_zeroed(message.bodyLength() as usize); reader.seek(SeekFrom::Start( block.offset() as u64 + block.metaDataLength() as u64, ))?; @@ -752,9 +736,7 @@ impl FileReader { )), crate::MessageHeader::RecordBatch => { let batch = message.header_as_record_batch().ok_or_else(|| { - 
ArrowError::IpcError( - "Unable to read IPC message as record batch".to_string(), - ) + ArrowError::IpcError("Unable to read IPC message as record batch".to_string()) })?; // read the block that makes up the record batch into a buffer let mut buf = MutableBuffer::from_len_zeroed(message.bodyLength() as usize); @@ -769,13 +751,11 @@ impl FileReader { self.schema(), &self.dictionaries_by_id, self.projection.as_ref().map(|x| x.0.as_ref()), - &message.version() - - ).map(Some) - } - crate::MessageHeader::NONE => { - Ok(None) + &message.version(), + ) + .map(Some) } + crate::MessageHeader::NONE => Ok(None), t => Err(ArrowError::InvalidArgumentError(format!( "Reading types other than record batches not yet supported, unable to read {t:?}" ))), @@ -856,10 +836,7 @@ impl StreamReader> { /// The first message in the stream is the schema, the reader will fail if it does not /// encounter a schema. /// To check if the reader is done, use `is_finished(self)` - pub fn try_new( - reader: R, - projection: Option>, - ) -> Result { + pub fn try_new(reader: R, projection: Option>) -> Result { Self::try_new_unbuffered(BufReader::new(reader), projection) } } @@ -976,15 +953,21 @@ impl StreamReader { )), crate::MessageHeader::RecordBatch => { let batch = message.header_as_record_batch().ok_or_else(|| { - ArrowError::IpcError( - "Unable to read IPC message as record batch".to_string(), - ) + ArrowError::IpcError("Unable to read IPC message as record batch".to_string()) })?; // read the block that makes up the record batch into a buffer let mut buf = MutableBuffer::from_len_zeroed(message.bodyLength() as usize); self.reader.read_exact(&mut buf)?; - read_record_batch(&buf.into(), batch, self.schema(), &self.dictionaries_by_id, self.projection.as_ref().map(|x| x.0.as_ref()), &message.version()).map(Some) + read_record_batch( + &buf.into(), + batch, + self.schema(), + &self.dictionaries_by_id, + self.projection.as_ref().map(|x| x.0.as_ref()), + &message.version(), + ) + .map(Some) } crate::MessageHeader::DictionaryBatch => { let batch = message.header_as_dictionary_batch().ok_or_else(|| { @@ -997,18 +980,20 @@ impl StreamReader { self.reader.read_exact(&mut buf)?; read_dictionary( - &buf.into(), batch, &self.schema, &mut self.dictionaries_by_id, &message.version() + &buf.into(), + batch, + &self.schema, + &mut self.dictionaries_by_id, + &message.version(), )?; // read the next message until we encounter a RecordBatch self.maybe_next() } - crate::MessageHeader::NONE => { - Ok(None) - } - t => Err(ArrowError::InvalidArgumentError( - format!("Reading types other than record batches not yet supported, unable to read {t:?} ") - )), + crate::MessageHeader::NONE => Ok(None), + t => Err(ArrowError::InvalidArgumentError(format!( + "Reading types other than record batches not yet supported, unable to read {t:?} " + ))), } } @@ -1055,13 +1040,10 @@ mod tests { fn create_test_projection_schema() -> Schema { // define field types - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); - let fixed_size_list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, false)), - 3, - ); + let fixed_size_list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3); let union_fields = UnionFields::new( vec![0, 1], @@ -1124,8 +1106,7 @@ mod tests { ]; let array6 = ListArray::from_iter_primitive::(array6_values); let array7_values = 
vec![vec![11, 12, 13], vec![22, 23, 24], vec![33, 34, 35]]; - let array7 = - FixedSizeBinaryArray::try_from_iter(array7_values.into_iter()).unwrap(); + let array7 = FixedSizeBinaryArray::try_from_iter(array7_values.into_iter()).unwrap(); let array8_values = ArrayData::builder(DataType::Int32) .len(9) @@ -1201,8 +1182,7 @@ mod tests { // write record batch in IPC format let mut buf = Vec::new(); { - let mut writer = - crate::writer::FileWriter::try_new(&mut buf, &schema).unwrap(); + let mut writer = crate::writer::FileWriter::try_new(&mut buf, &schema).unwrap(); writer.write(&batch).unwrap(); writer.finish().unwrap(); } @@ -1210,8 +1190,7 @@ mod tests { // read record batch with projection for index in 0..12 { let projection = vec![index]; - let reader = - FileReader::try_new(std::io::Cursor::new(buf.clone()), Some(projection)); + let reader = FileReader::try_new(std::io::Cursor::new(buf.clone()), Some(projection)); let read_batch = reader.unwrap().next().unwrap().unwrap(); let projected_column = read_batch.column(0); let expected_column = batch.column(index); @@ -1222,10 +1201,8 @@ mod tests { { // read record batch with reversed projection - let reader = FileReader::try_new( - std::io::Cursor::new(buf.clone()), - Some(vec![3, 2, 1]), - ); + let reader = + FileReader::try_new(std::io::Cursor::new(buf.clone()), Some(vec![3, 2, 1])); let read_batch = reader.unwrap().next().unwrap().unwrap(); let expected_batch = batch.project(&[3, 2, 1]).unwrap(); assert_eq!(read_batch, expected_batch); @@ -1249,8 +1226,7 @@ mod tests { let batch = RecordBatch::try_new(Arc::new(schema.clone()), arrays).unwrap(); // create stream writer let mut file = tempfile::tempfile().unwrap(); - let mut stream_writer = - crate::writer::StreamWriter::try_new(&mut file, &schema).unwrap(); + let mut stream_writer = crate::writer::StreamWriter::try_new(&mut file, &schema).unwrap(); stream_writer.write(&batch).unwrap(); stream_writer.finish().unwrap(); @@ -1298,8 +1274,7 @@ mod tests { fn roundtrip_ipc(rb: &RecordBatch) -> RecordBatch { let mut buf = Vec::new(); - let mut writer = - crate::writer::FileWriter::try_new(&mut buf, &rb.schema()).unwrap(); + let mut writer = crate::writer::FileWriter::try_new(&mut buf, &rb.schema()).unwrap(); writer.write(rb).unwrap(); writer.finish().unwrap(); drop(writer); @@ -1310,15 +1285,13 @@ mod tests { fn roundtrip_ipc_stream(rb: &RecordBatch) -> RecordBatch { let mut buf = Vec::new(); - let mut writer = - crate::writer::StreamWriter::try_new(&mut buf, &rb.schema()).unwrap(); + let mut writer = crate::writer::StreamWriter::try_new(&mut buf, &rb.schema()).unwrap(); writer.write(rb).unwrap(); writer.finish().unwrap(); drop(writer); let mut reader = - crate::reader::StreamReader::try_new(std::io::Cursor::new(buf), None) - .unwrap(); + crate::reader::StreamReader::try_new(std::io::Cursor::new(buf), None).unwrap(); reader.next().unwrap().unwrap() } @@ -1336,8 +1309,7 @@ mod tests { writer.finish().unwrap(); drop(writer); - let reader = - crate::reader::FileReader::try_new(std::io::Cursor::new(buf), None).unwrap(); + let reader = crate::reader::FileReader::try_new(std::io::Cursor::new(buf), None).unwrap(); assert_eq!(reader.custom_metadata(), &test_metadata); } @@ -1434,8 +1406,7 @@ mod tests { // can be compared as such. 
assert_eq!(input_batch.column(1), output_batch.column(1)); - let run_array_1_unsliced = - unslice_run_array(run_array_1_sliced.into_data()).unwrap(); + let run_array_1_unsliced = unslice_run_array(run_array_1_sliced.into_data()).unwrap(); assert_eq!(run_array_1_unsliced, output_batch.column(0).into_data()); } @@ -1528,8 +1499,7 @@ mod tests { dict_dict_array.data_type().clone(), false, )])); - let input_batch = - RecordBatch::try_new(schema, vec![Arc::new(dict_dict_array)]).unwrap(); + let input_batch = RecordBatch::try_new(schema, vec![Arc::new(dict_dict_array)]).unwrap(); let output_batch = roundtrip_ipc_stream(&input_batch); assert_eq!(input_batch, output_batch); } @@ -1564,8 +1534,7 @@ mod tests { dict_dict_array.data_type().clone(), false, )])); - let input_batch = - RecordBatch::try_new(schema, vec![Arc::new(dict_dict_array)]).unwrap(); + let input_batch = RecordBatch::try_new(schema, vec![Arc::new(dict_dict_array)]).unwrap(); let output_batch = roundtrip_ipc_stream(&input_batch); assert_eq!(input_batch, output_batch); } @@ -1581,10 +1550,7 @@ mod tests { false, ))); let offsets: &[i32; 5] = &[0, 2, 4, 4, 6]; - test_roundtrip_stream_dict_of_list_of_dict_impl::( - list_data_type, - offsets, - ); + test_roundtrip_stream_dict_of_list_of_dict_impl::(list_data_type, offsets); // large list let list_data_type = DataType::LargeList(Arc::new(Field::new_dict( @@ -1595,10 +1561,7 @@ mod tests { false, ))); let offsets: &[i64; 5] = &[0, 2, 4, 4, 7]; - test_roundtrip_stream_dict_of_list_of_dict_impl::( - list_data_type, - offsets, - ); + test_roundtrip_stream_dict_of_list_of_dict_impl::(list_data_type, offsets); } #[test] @@ -1633,8 +1596,7 @@ mod tests { dict_dict_array.data_type().clone(), false, )])); - let input_batch = - RecordBatch::try_new(schema, vec![Arc::new(dict_dict_array)]).unwrap(); + let input_batch = RecordBatch::try_new(schema, vec![Arc::new(dict_dict_array)]).unwrap(); let output_batch = roundtrip_ipc_stream(&input_batch); assert_eq!(input_batch, output_batch); } @@ -1645,8 +1607,7 @@ mod tests { let options = RecordBatchOptions::new() .with_match_field_names(true) .with_row_count(Some(10)); - let input_batch = - RecordBatch::try_new_with_options(schema, vec![], &options).unwrap(); + let input_batch = RecordBatch::try_new_with_options(schema, vec![], &options).unwrap(); let output_batch = roundtrip_ipc_stream(&input_batch); assert_eq!(input_batch, output_batch); } diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 567fa2e94171..a58cbfc51428 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -107,8 +107,7 @@ impl IpcWriteOptions { crate::MetadataVersion::V5 => { if write_legacy_ipc_format { Err(ArrowError::InvalidArgumentError( - "Legacy IPC format only supported on metadata version 4" - .to_string(), + "Legacy IPC format only supported on metadata version 4".to_string(), )) } else { Ok(Self { @@ -173,11 +172,7 @@ impl Default for IpcWriteOptions { pub struct IpcDataGenerator {} impl IpcDataGenerator { - pub fn schema_to_bytes( - &self, - schema: &Schema, - write_options: &IpcWriteOptions, - ) -> EncodedData { + pub fn schema_to_bytes(&self, schema: &Schema, write_options: &IpcWriteOptions) -> EncodedData { let mut fbb = FlatBufferBuilder::new(); let schema = { let fb = crate::convert::schema_to_fb_offset(&mut fbb, schema); @@ -276,9 +271,7 @@ impl IpcDataGenerator { let map_array = as_map_array(column); let (keys, values) = match field.data_type() { - DataType::Struct(fields) if fields.len() == 2 => { - (&fields[0], &fields[1]) - } + 
DataType::Struct(fields) if fields.len() == 2 => (&fields[0], &fields[1]), _ => panic!("Incorrect field data type {:?}", field.data_type()), }; @@ -557,18 +550,15 @@ impl IpcDataGenerator { pub(crate) fn unslice_run_array(arr: ArrayData) -> Result { match arr.data_type() { DataType::RunEndEncoded(k, _) => match k.data_type() { - DataType::Int16 => Ok(into_zero_offset_run_array( - RunArray::::from(arr), - )? - .into_data()), - DataType::Int32 => Ok(into_zero_offset_run_array( - RunArray::::from(arr), - )? - .into_data()), - DataType::Int64 => Ok(into_zero_offset_run_array( - RunArray::::from(arr), - )? - .into_data()), + DataType::Int16 => { + Ok(into_zero_offset_run_array(RunArray::::from(arr))?.into_data()) + } + DataType::Int32 => { + Ok(into_zero_offset_run_array(RunArray::::from(arr))?.into_data()) + } + DataType::Int64 => { + Ok(into_zero_offset_run_array(RunArray::::from(arr))?.into_data()) + } d => unreachable!("Unexpected data type {d}"), }, d => Err(ArrowError::InvalidArgumentError(format!( @@ -657,11 +647,7 @@ impl DictionaryTracker { /// * If the tracker has not been configured to error on replacement or this dictionary /// has never been seen before, return `Ok(true)` to indicate that the dictionary was just /// inserted. - pub fn insert( - &mut self, - dict_id: i64, - column: &ArrayRef, - ) -> Result { + pub fn insert(&mut self, dict_id: i64, column: &ArrayRef) -> Result { let dict_data = column.to_data(); let dict_values = &dict_data.child_data()[0]; @@ -773,14 +759,12 @@ impl FileWriter { let (meta, data) = write_message(&mut self.writer, encoded_dictionary, &self.write_options)?; - let block = - crate::Block::new(self.block_offsets as i64, meta as i32, data as i64); + let block = crate::Block::new(self.block_offsets as i64, meta as i32, data as i64); self.dictionary_blocks.push(block); self.block_offsets += meta + data; } - let (meta, data) = - write_message(&mut self.writer, encoded_message, &self.write_options)?; + let (meta, data) = write_message(&mut self.writer, encoded_message, &self.write_options)?; // add a record block for the footer let block = crate::Block::new( self.block_offsets as i64, @@ -1097,9 +1081,7 @@ fn write_continuation( // the version of the writer determines whether continuation markers should be added match write_options.metadata_version { - crate::MetadataVersion::V1 - | crate::MetadataVersion::V2 - | crate::MetadataVersion::V3 => { + crate::MetadataVersion::V1 | crate::MetadataVersion::V2 | crate::MetadataVersion::V3 => { unreachable!("Options with the metadata version cannot be created") } crate::MetadataVersion::V4 => { @@ -1271,15 +1253,8 @@ fn write_array_data( if buffer_need_truncate(array_data.offset(), buffer, spec, min_length) { let byte_offset = array_data.offset() * byte_width; let buffer_length = min(min_length, buffer.len() - byte_offset); - let buffer_slice = - &buffer.as_slice()[byte_offset..(byte_offset + buffer_length)]; - offset = write_buffer( - buffer_slice, - buffers, - arrow_data, - offset, - compression_codec, - )?; + let buffer_slice = &buffer.as_slice()[byte_offset..(byte_offset + buffer_length)]; + offset = write_buffer(buffer_slice, buffers, arrow_data, offset, compression_codec)?; } else { offset = write_buffer( buffer.as_slice(), @@ -1299,8 +1274,7 @@ fn write_array_data( offset = write_buffer(&buffer, buffers, arrow_data, offset, compression_codec)?; } else { for buffer in array_data.buffers() { - offset = - write_buffer(buffer, buffers, arrow_data, offset, compression_codec)?; + offset = write_buffer(buffer, buffers, 
arrow_data, offset, compression_codec)?; } } @@ -1374,9 +1348,7 @@ fn write_buffer( } .try_into() .map_err(|e| { - ArrowError::InvalidArgumentError(format!( - "Could not convert compressed size to i64: {e}" - )) + ArrowError::InvalidArgumentError(format!("Could not convert compressed size to i64: {e}")) })?; // make new index entry @@ -1417,21 +1389,18 @@ mod tests { let values: Vec> = vec![]; let array = Int32Array::from(values); let record_batch = - RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array)]).unwrap(); let mut file = tempfile::tempfile().unwrap(); { - let write_option = - IpcWriteOptions::try_new(8, false, crate::MetadataVersion::V5) - .unwrap() - .try_with_compression(Some(crate::CompressionType::LZ4_FRAME)) - .unwrap(); + let write_option = IpcWriteOptions::try_new(8, false, crate::MetadataVersion::V5) + .unwrap() + .try_with_compression(Some(crate::CompressionType::LZ4_FRAME)) + .unwrap(); let mut writer = - FileWriter::try_new_with_options(&mut file, &schema, write_option) - .unwrap(); + FileWriter::try_new_with_options(&mut file, &schema, write_option).unwrap(); writer.write(&record_batch).unwrap(); writer.finish().unwrap(); } @@ -1470,20 +1439,17 @@ mod tests { let values: Vec> = vec![Some(12), Some(1)]; let array = Int32Array::from(values); let record_batch = - RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array)]).unwrap(); let mut file = tempfile::tempfile().unwrap(); { - let write_option = - IpcWriteOptions::try_new(8, false, crate::MetadataVersion::V5) - .unwrap() - .try_with_compression(Some(crate::CompressionType::LZ4_FRAME)) - .unwrap(); + let write_option = IpcWriteOptions::try_new(8, false, crate::MetadataVersion::V5) + .unwrap() + .try_with_compression(Some(crate::CompressionType::LZ4_FRAME)) + .unwrap(); let mut writer = - FileWriter::try_new_with_options(&mut file, &schema, write_option) - .unwrap(); + FileWriter::try_new_with_options(&mut file, &schema, write_option).unwrap(); writer.write(&record_batch).unwrap(); writer.finish().unwrap(); } @@ -1522,19 +1488,16 @@ mod tests { let values: Vec> = vec![Some(12), Some(1)]; let array = Int32Array::from(values); let record_batch = - RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array)]).unwrap(); let mut file = tempfile::tempfile().unwrap(); { - let write_option = - IpcWriteOptions::try_new(8, false, crate::MetadataVersion::V5) - .unwrap() - .try_with_compression(Some(crate::CompressionType::ZSTD)) - .unwrap(); + let write_option = IpcWriteOptions::try_new(8, false, crate::MetadataVersion::V5) + .unwrap() + .try_with_compression(Some(crate::CompressionType::ZSTD)) + .unwrap(); let mut writer = - FileWriter::try_new_with_options(&mut file, &schema, write_option) - .unwrap(); + FileWriter::try_new_with_options(&mut file, &schema, write_option).unwrap(); writer.write(&record_batch).unwrap(); writer.finish().unwrap(); } @@ -1581,11 +1544,9 @@ mod tests { None, ]; let array1 = UInt32Array::from(values); - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![Arc::new(array1) as ArrayRef], - ) - .unwrap(); + let batch = + RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array1) as ArrayRef]) + .unwrap(); let mut file = tempfile::tempfile().unwrap(); { let mut writer = FileWriter::try_new(&mut 
file, &schema).unwrap(); @@ -1634,8 +1595,7 @@ mod tests { .unwrap(); let mut file = tempfile::tempfile().unwrap(); { - let mut writer = - FileWriter::try_new_with_options(&mut file, &schema, options).unwrap(); + let mut writer = FileWriter::try_new_with_options(&mut file, &schema, options).unwrap(); writer.write(&batch).unwrap(); writer.finish().unwrap(); @@ -1663,18 +1623,14 @@ mod tests { fn test_write_null_file_v4() { write_null_file(IpcWriteOptions::try_new(8, false, MetadataVersion::V4).unwrap()); write_null_file(IpcWriteOptions::try_new(8, true, MetadataVersion::V4).unwrap()); - write_null_file( - IpcWriteOptions::try_new(64, false, MetadataVersion::V4).unwrap(), - ); + write_null_file(IpcWriteOptions::try_new(64, false, MetadataVersion::V4).unwrap()); write_null_file(IpcWriteOptions::try_new(64, true, MetadataVersion::V4).unwrap()); } #[test] fn test_write_null_file_v5() { write_null_file(IpcWriteOptions::try_new(8, false, MetadataVersion::V5).unwrap()); - write_null_file( - IpcWriteOptions::try_new(64, false, MetadataVersion::V5).unwrap(), - ); + write_null_file(IpcWriteOptions::try_new(64, false, MetadataVersion::V5).unwrap()); } #[test] @@ -1684,15 +1640,13 @@ mod tests { let array = Arc::new(inner) as ArrayRef; // Dict field with id 2 - let dctfield = - Field::new_dict("dict", array.data_type().clone(), false, 2, false); + let dctfield = Field::new_dict("dict", array.data_type().clone(), false, 2, false); let types = Buffer::from_slice_ref([0_i8, 0, 0]); let offsets = Buffer::from_slice_ref([0_i32, 1, 2]); let union = - UnionArray::try_new(&[0], types, Some(offsets), vec![(dctfield, array)]) - .unwrap(); + UnionArray::try_new(&[0], types, Some(offsets), vec![(dctfield, array)]).unwrap(); let schema = Arc::new(Schema::new(vec![Field::new( "union", @@ -1764,16 +1718,13 @@ mod tests { builder.append::("a", 4).unwrap(); let union = builder.build().unwrap(); - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![Arc::new(union) as ArrayRef], - ) - .unwrap(); + let batch = + RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(union) as ArrayRef]) + .unwrap(); let mut file = tempfile::tempfile().unwrap(); { - let mut writer = - FileWriter::try_new_with_options(&mut file, &schema, options).unwrap(); + let mut writer = FileWriter::try_new_with_options(&mut file, &schema, options).unwrap(); writer.write(&batch).unwrap(); writer.finish().unwrap(); @@ -1799,12 +1750,8 @@ mod tests { #[test] fn test_write_union_file_v4_v5() { - write_union_file( - IpcWriteOptions::try_new(8, false, MetadataVersion::V4).unwrap(), - ); - write_union_file( - IpcWriteOptions::try_new(8, false, MetadataVersion::V5).unwrap(), - ); + write_union_file(IpcWriteOptions::try_new(8, false, MetadataVersion::V4).unwrap()); + write_union_file(IpcWriteOptions::try_new(8, false, MetadataVersion::V5).unwrap()); } fn serialize(record: &RecordBatch) -> Vec { @@ -1817,8 +1764,7 @@ mod tests { fn deserialize(bytes: Vec) -> RecordBatch { let mut stream_reader = - crate::reader::StreamReader::try_new(std::io::Cursor::new(bytes), None) - .unwrap(); + crate::reader::StreamReader::try_new(std::io::Cursor::new(bytes), None).unwrap(); stream_reader.next().unwrap().unwrap() } @@ -1833,8 +1779,7 @@ mod tests { let a = Int32Array::from_iter_values(0..rows as i32); let b = StringArray::from_iter_values((0..rows).map(|i| i.to_string())); - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) - .unwrap() + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap() } let 
big_record_batch = create_batch(65536); @@ -1844,9 +1789,7 @@ mod tests { let offset = 2; let record_batch_slice = big_record_batch.slice(offset, length); - assert!( - serialize(&big_record_batch).len() > serialize(&small_record_batch).len() - ); + assert!(serialize(&big_record_batch).len() > serialize(&small_record_batch).len()); assert_eq!( serialize(&small_record_batch).len(), serialize(&record_batch_slice).len() @@ -1869,8 +1812,7 @@ mod tests { let a = Int32Array::from(vec![Some(1), None, Some(1), None, Some(1)]); let b = StringArray::from(vec![None, Some("a"), Some("a"), None, Some("a")]); - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) - .unwrap() + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap() } let record_batch = create_batch(); @@ -1893,13 +1835,11 @@ mod tests { let values: StringArray = [Some("foo"), Some("bar"), Some("baz")] .into_iter() .collect(); - let keys: Int32Array = - [Some(0), Some(2), None, Some(1)].into_iter().collect(); + let keys: Int32Array = [Some(0), Some(2), None, Some(1)].into_iter().collect(); let array = DictionaryArray::new(keys, Arc::new(values)); - let schema = - Schema::new(vec![Field::new("dict", array.data_type().clone(), true)]); + let schema = Schema::new(vec![Field::new("dict", array.data_type().clone(), true)]); RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap() } @@ -1922,8 +1862,7 @@ mod tests { let strings: StringArray = [Some("foo"), None, Some("bar"), Some("baz")] .into_iter() .collect(); - let ints: Int32Array = - [Some(0), Some(2), None, Some(1)].into_iter().collect(); + let ints: Int32Array = [Some(0), Some(2), None, Some(1)].into_iter().collect(); let struct_array = StructArray::from(vec![ ( @@ -1968,8 +1907,7 @@ mod tests { fn truncate_ipc_string_array_with_all_empty_string() { fn create_batch() -> RecordBatch { let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); - let a = - StringArray::from(vec![Some(""), Some(""), Some(""), Some(""), Some("")]); + let a = StringArray::from(vec![Some(""), Some(""), Some(""), Some(""), Some("")]); RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap() } @@ -1998,8 +1936,7 @@ mod tests { ) .expect("new batch"); - let mut writer = - StreamWriter::try_new(vec![], &batch.schema()).expect("new writer"); + let mut writer = StreamWriter::try_new(vec![], &batch.schema()).expect("new writer"); writer.write(&batch).expect("write"); let outbuf = writer.into_inner().expect("inner"); @@ -2021,9 +1958,9 @@ mod tests { // slice somewhere in the middle assert_bool_roundtrip( [ - true, false, true, true, false, false, true, true, true, false, false, - false, true, true, true, true, false, false, false, false, true, true, - true, true, true, false, false, false, false, false, + true, false, true, true, false, false, true, true, true, false, false, false, true, + true, true, true, false, false, false, false, true, true, true, true, true, false, + false, false, false, false, ], 13, 17, @@ -2032,8 +1969,7 @@ mod tests { // start at byte boundary, end in the middle assert_bool_roundtrip( [ - true, false, true, true, false, false, true, true, true, false, false, - false, + true, false, true, true, false, false, true, true, true, false, false, false, ], 8, 2, @@ -2042,27 +1978,22 @@ mod tests { // start and stop and byte boundary assert_bool_roundtrip( [ - true, false, true, true, false, false, true, true, true, false, false, - false, true, true, true, true, true, false, false, false, false, false, + true, false, true, 
true, false, false, true, true, true, false, false, false, true, + true, true, true, true, false, false, false, false, false, ], 8, 8, ); } - fn assert_bool_roundtrip( - bools: [bool; N], - offset: usize, - length: usize, - ) { + fn assert_bool_roundtrip(bools: [bool; N], offset: usize, length: usize) { let val_bool_field = Field::new("val", DataType::Boolean, false); let schema = Arc::new(Schema::new(vec![val_bool_field])); let bools = BooleanArray::from(bools.to_vec()); - let batch = - RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(bools)]).unwrap(); + let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(bools)]).unwrap(); let batch = batch.slice(offset, length); let mut writer = StreamWriter::try_new(Vec::::new(), &schema).unwrap(); @@ -2078,8 +2009,7 @@ mod tests { #[test] fn test_run_array_unslice() { let total_len = 80; - let vals: Vec> = - vec![Some(1), None, Some(2), Some(3), Some(4), None, Some(5)]; + let vals: Vec> = vec![Some(1), None, Some(2), Some(3), Some(4), None, Some(5)]; let repeats: Vec = vec![3, 4, 1, 2]; let mut input_array: Vec> = Vec::with_capacity(total_len); for ix in 0_usize..32 { @@ -2101,13 +2031,11 @@ mod tests { run_array.slice(0, slice_len).into_data().into(); // Create unsliced run array. - let unsliced_run_array = - into_zero_offset_run_array(sliced_run_array).unwrap(); + let unsliced_run_array = into_zero_offset_run_array(sliced_run_array).unwrap(); let typed = unsliced_run_array .downcast::>() .unwrap(); - let expected: Vec> = - input_array.iter().take(slice_len).copied().collect(); + let expected: Vec> = input_array.iter().take(slice_len).copied().collect(); let actual: Vec> = typed.into_iter().collect(); assert_eq!(expected, actual); @@ -2118,8 +2046,7 @@ mod tests { .into(); // Create unsliced run array. 
- let unsliced_run_array = - into_zero_offset_run_array(sliced_run_array).unwrap(); + let unsliced_run_array = into_zero_offset_run_array(sliced_run_array).unwrap(); let typed = unsliced_run_array .downcast::>() .unwrap(); @@ -2154,8 +2081,7 @@ mod tests { ls.finish() }; - let batch = - RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(values)]).unwrap(); + let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(values)]).unwrap(); let batch = batch.slice(1, 1); let mut writer = FileWriter::try_new(Vec::::new(), &schema).unwrap(); diff --git a/arrow-json/src/reader/list_array.rs b/arrow-json/src/reader/list_array.rs index d6f7670f2dc9..b6f8c18ea9c3 100644 --- a/arrow-json/src/reader/list_array.rs +++ b/arrow-json/src/reader/list_array.rs @@ -92,10 +92,7 @@ impl ArrayDecoder for ListArrayDecoder { } let offset = O::from_usize(child_pos.len()).ok_or_else(|| { - ArrowError::JsonError(format!( - "offset overflow decoding {}", - self.data_type - )) + ArrowError::JsonError(format!("offset overflow decoding {}", self.data_type)) })?; offsets.append(offset) } diff --git a/arrow-json/src/reader/map_array.rs b/arrow-json/src/reader/map_array.rs index a1f7e5ace66e..cd1ca5f71fa9 100644 --- a/arrow-json/src/reader/map_array.rs +++ b/arrow-json/src/reader/map_array.rs @@ -121,10 +121,7 @@ impl ArrayDecoder for MapArrayDecoder { } let offset = i32::from_usize(key_pos.len()).ok_or_else(|| { - ArrowError::JsonError(format!( - "offset overflow decoding {}", - self.data_type - )) + ArrowError::JsonError(format!("offset overflow decoding {}", self.data_type)) })?; offsets.append(offset) } diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index c1cef0ec81b4..1225e51b3af7 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -142,9 +142,7 @@ use serde::Serialize; use arrow_array::timezone::Tz; use arrow_array::types::Float32Type; use arrow_array::types::*; -use arrow_array::{ - downcast_integer, make_array, RecordBatch, RecordBatchReader, StructArray, -}; +use arrow_array::{downcast_integer, make_array, RecordBatch, RecordBatchReader, StructArray}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, FieldRef, Schema, SchemaRef, TimeUnit}; pub use schema::*; @@ -290,8 +288,7 @@ impl ReaderBuilder { } }; - let decoder = - make_decoder(data_type, self.coerce_primitive, self.strict_mode, nullable)?; + let decoder = make_decoder(data_type, self.coerce_primitive, self.strict_mode, nullable)?; let num_fields = self.schema.all_fields().len(); @@ -629,8 +626,9 @@ impl Decoder { let batch = match self.is_field { true => RecordBatch::try_new(self.schema.clone(), vec![make_array(decoded)])?, - false => RecordBatch::from(StructArray::from(decoded)) - .with_schema(self.schema.clone())?, + false => { + RecordBatch::from(StructArray::from(decoded)).with_schema(self.schema.clone())? 
+ } }; Ok(Some(batch)) @@ -719,9 +717,7 @@ mod tests { use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; - use arrow_array::{ - make_array, Array, BooleanArray, ListArray, StringArray, StructArray, - }; + use arrow_array::{make_array, Array, BooleanArray, ListArray, StringArray, StructArray}; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_cast::display::{ArrayFormatter, FormatOptions}; use arrow_data::ArrayDataBuilder; @@ -1545,11 +1541,7 @@ mod tests { let schema = Arc::new(Schema::new(vec![ Field::new("a", DataType::Int16, false), Field::new("b", DataType::Utf8, false), - Field::new_struct( - "c", - vec![Field::new("a", DataType::Boolean, false)], - false, - ), + Field::new_struct("c", vec![Field::new("a", DataType::Boolean, false)], false), ])); let err = ReaderBuilder::new(schema) @@ -1832,15 +1824,13 @@ mod tests { #[test] fn test_nested_list_json_arrays() { - let c_field = - Field::new_struct("c", vec![Field::new("d", DataType::Utf8, true)], true); + let c_field = Field::new_struct("c", vec![Field::new("d", DataType::Utf8, true)], true); let a_struct_field = Field::new_struct( "a", vec![Field::new("b", DataType::Boolean, true), c_field.clone()], true, ); - let a_field = - Field::new("a", DataType::List(Arc::new(a_struct_field.clone())), true); + let a_field = Field::new("a", DataType::List(Arc::new(a_struct_field.clone())), true); let schema = Arc::new(Schema::new(vec![a_field.clone()])); let builder = ReaderBuilder::new(schema).with_batch_size(64); let json_content = r#" diff --git a/arrow-json/src/reader/primitive_array.rs b/arrow-json/src/reader/primitive_array.rs index 6cf0bac86737..a03a41e96dcb 100644 --- a/arrow-json/src/reader/primitive_array.rs +++ b/arrow-json/src/reader/primitive_array.rs @@ -94,8 +94,8 @@ where P::Native: ParseJsonNumber + NumCast, { fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { - let mut builder = PrimitiveBuilder::

<P>::with_capacity(pos.len()) - .with_data_type(self.data_type.clone()); + let mut builder = + PrimitiveBuilder::<P>
::with_capacity(pos.len()).with_data_type(self.data_type.clone()); let d = &self.data_type; for p in pos { @@ -111,10 +111,9 @@ where } TapeElement::Number(idx) => { let s = tape.get_string(idx); - let value = - ParseJsonNumber::parse(s.as_bytes()).ok_or_else(|| { - ArrowError::JsonError(format!("failed to parse {s} as {d}",)) - })?; + let value = ParseJsonNumber::parse(s.as_bytes()).ok_or_else(|| { + ArrowError::JsonError(format!("failed to parse {s} as {d}",)) + })?; builder.append_value(value) } diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs index 126a85df3931..58aa08014daa 100644 --- a/arrow-json/src/reader/schema.rs +++ b/arrow-json/src/reader/schema.rs @@ -52,10 +52,7 @@ impl InferredType { } (_, InferredType::Any) => {} // convert a scalar type to a single-item scalar array type. - ( - InferredType::Array(self_inner_type), - other_scalar @ InferredType::Scalar(_), - ) => { + (InferredType::Array(self_inner_type), other_scalar @ InferredType::Scalar(_)) => { self_inner_type.merge(other_scalar)?; } (s @ InferredType::Scalar(_), InferredType::Array(mut other_inner_type)) => { @@ -197,9 +194,10 @@ impl Iterator for ValueIter { } self.record_count += 1; - return Some(serde_json::from_str(trimmed_s).map_err(|e| { - ArrowError::JsonError(format!("Not valid JSON: {e}")) - })); + return Some( + serde_json::from_str(trimmed_s) + .map_err(|e| ArrowError::JsonError(format!("Not valid JSON: {e}"))), + ); } } } @@ -393,17 +391,13 @@ fn collect_field_types_from_object( InferredType::Scalar(_) => { field_types.insert( k.to_string(), - InferredType::Array(Box::new(InferredType::Scalar( - HashSet::new(), - ))), + InferredType::Array(Box::new(InferredType::Scalar(HashSet::new()))), ); } InferredType::Object(_) => { field_types.insert( k.to_string(), - InferredType::Array(Box::new(InferredType::Object( - HashMap::new(), - ))), + InferredType::Array(Box::new(InferredType::Object(HashMap::new()))), ); } InferredType::Any | InferredType::Array(_) => { @@ -456,8 +450,7 @@ fn collect_field_types_from_object( } Value::Object(inner_map) => { if !field_types.contains_key(k) { - field_types - .insert(k.to_string(), InferredType::Object(HashMap::new())); + field_types.insert(k.to_string(), InferredType::Object(HashMap::new())); } match field_types.get_mut(k).unwrap() { InferredType::Object(inner_field_types) => { @@ -528,8 +521,7 @@ mod tests { Field::new("d", list_type_of(DataType::Utf8), true), ]); - let mut reader = - BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); + let mut reader = BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); assert_eq!(inferred_schema, schema); @@ -550,9 +542,7 @@ mod tests { Field::new("a", DataType::Boolean, true), Field::new( "b", - DataType::Struct( - vec![Field::new("c", DataType::Utf8, true)].into(), - ), + DataType::Struct(vec![Field::new("c", DataType::Utf8, true)].into()), true, ), ])), @@ -568,9 +558,9 @@ mod tests { Ok(serde_json::json!({"c1": {"a": false, "b": null}, "c2": 0})), Ok(serde_json::json!({"c1": {"a": true, "b": {"c": "text"}}, "c3": "ok"})), ] - .into_iter(), + .into_iter(), ) - .unwrap(); + .unwrap(); assert_eq!(inferred_schema, schema); } @@ -606,9 +596,9 @@ mod tests { })), Ok(serde_json::json!({"c1": [], "c2": 0.5, "c3": []})), ] - .into_iter(), + .into_iter(), ) - .unwrap(); + .unwrap(); assert_eq!(inferred_schema, schema); } diff --git a/arrow-json/src/reader/serializer.rs 
b/arrow-json/src/reader/serializer.rs index 2fd250bdfcc3..378d77bd9155 100644 --- a/arrow-json/src/reader/serializer.rs +++ b/arrow-json/src/reader/serializer.rs @@ -18,8 +18,7 @@ use crate::reader::tape::TapeElement; use lexical_core::FormattedSize; use serde::ser::{ - Impossible, SerializeMap, SerializeSeq, SerializeStruct, SerializeTuple, - SerializeTupleStruct, + Impossible, SerializeMap, SerializeSeq, SerializeStruct, SerializeTuple, SerializeTupleStruct, }; use serde::{Serialize, Serializer}; @@ -231,17 +230,11 @@ impl<'a, 'b> Serializer for &'a mut TapeSerializer<'b> { Ok(()) } - fn serialize_seq( - self, - _len: Option, - ) -> Result { + fn serialize_seq(self, _len: Option) -> Result { Ok(ListSerializer::new(self)) } - fn serialize_tuple( - self, - len: usize, - ) -> Result { + fn serialize_tuple(self, len: usize) -> Result { self.serialize_seq(Some(len)) } @@ -266,10 +259,7 @@ impl<'a, 'b> Serializer for &'a mut TapeSerializer<'b> { } // Maps are represented in JSON as `{ K: V, K: V, ... }`. - fn serialize_map( - self, - _len: Option, - ) -> Result { + fn serialize_map(self, _len: Option) -> Result { Ok(ObjectSerializer::new(self)) } diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs index ea9a7157423f..63a9bcedb7d1 100644 --- a/arrow-json/src/reader/string_array.rs +++ b/arrow-json/src/reader/string_array.rs @@ -72,8 +72,7 @@ impl ArrayDecoder for StringArrayDecoder { ))); } - let mut builder = - GenericStringBuilder::::with_capacity(pos.len(), data_capacity); + let mut builder = GenericStringBuilder::::with_capacity(pos.len(), data_capacity); for p in pos { match tape.get(*p) { diff --git a/arrow-json/src/reader/struct_array.rs b/arrow-json/src/reader/struct_array.rs index 77d7e170d07c..6c805591d390 100644 --- a/arrow-json/src/reader/struct_array.rs +++ b/arrow-json/src/reader/struct_array.rs @@ -64,8 +64,7 @@ impl StructArrayDecoder { impl ArrayDecoder for StructArrayDecoder { fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { let fields = struct_fields(&self.data_type); - let mut child_pos: Vec<_> = - (0..fields.len()).map(|_| vec![0; pos.len()]).collect(); + let mut child_pos: Vec<_> = (0..fields.len()).map(|_| vec![0; pos.len()]).collect(); let mut nulls = self .is_nullable @@ -118,10 +117,9 @@ impl ArrayDecoder for StructArrayDecoder { .zip(fields) .map(|((d, pos), f)| { d.decode(tape, &pos).map_err(|e| match e { - ArrowError::JsonError(s) => ArrowError::JsonError(format!( - "whilst decoding field '{}': {s}", - f.name() - )), + ArrowError::JsonError(s) => { + ArrowError::JsonError(format!("whilst decoding field '{}': {s}", f.name())) + } e => e, }) }) @@ -133,11 +131,13 @@ impl ArrayDecoder for StructArrayDecoder { // Sanity check assert_eq!(c.len(), pos.len()); if let Some(a) = c.nulls() { - let nulls_valid = f.is_nullable() - || nulls.as_ref().map(|n| n.contains(a)).unwrap_or_default(); + let nulls_valid = + f.is_nullable() || nulls.as_ref().map(|n| n.contains(a)).unwrap_or_default(); if !nulls_valid { - return Err(ArrowError::JsonError(format!("Encountered unmasked nulls in non-nullable StructArray child: {f}"))); + return Err(ArrowError::JsonError(format!( + "Encountered unmasked nulls in non-nullable StructArray child: {f}" + ))); } } } diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index b39caede7047..4822ad0bf43d 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -369,8 +369,7 @@ impl TapeDecoder { b'}' => { let start_idx = *start_idx; let end_idx = 
self.elements.len() as u32; - self.elements[start_idx as usize] = - TapeElement::StartObject(end_idx); + self.elements[start_idx as usize] = TapeElement::StartObject(end_idx); self.elements.push(TapeElement::EndObject(start_idx)); self.stack.pop(); } @@ -385,8 +384,7 @@ impl TapeDecoder { iter.next(); let start_idx = *start_idx; let end_idx = self.elements.len() as u32; - self.elements[start_idx as usize] = - TapeElement::StartList(end_idx); + self.elements[start_idx as usize] = TapeElement::StartList(end_idx); self.elements.push(TapeElement::EndList(start_idx)); self.stack.pop(); } @@ -561,7 +559,10 @@ impl TapeDecoder { } if self.offsets.len() >= u32::MAX as usize { - return Err(ArrowError::JsonError(format!("Encountered more than {} JSON elements, consider using a smaller batch size", u32::MAX))); + return Err(ArrowError::JsonError(format!( + "Encountered more than {} JSON elements, consider using a smaller batch size", + u32::MAX + ))); } // Sanity check @@ -570,9 +571,8 @@ impl TapeDecoder { self.bytes.len() ); - let strings = std::str::from_utf8(&self.bytes).map_err(|_| { - ArrowError::JsonError("Encountered non-UTF-8 data".to_string()) - })?; + let strings = std::str::from_utf8(&self.bytes) + .map_err(|_| ArrowError::JsonError("Encountered non-UTF-8 data".to_string()))?; for offset in self.offsets.iter().copied() { if !strings.is_char_boundary(offset) { @@ -673,9 +673,8 @@ fn err(b: u8, ctx: &str) -> ArrowError { /// Creates a character from an UTF-16 surrogate pair fn char_from_surrogate_pair(low: u16, high: u16) -> Result { let n = (((high - 0xD800) as u32) << 10 | (low - 0xDC00) as u32) + 0x1_0000; - char::from_u32(n).ok_or_else(|| { - ArrowError::JsonError(format!("Invalid UTF-16 surrogate pair {n}")) - }) + char::from_u32(n) + .ok_or_else(|| ArrowError::JsonError(format!("Invalid UTF-16 surrogate pair {n}"))) } /// Writes `c` as UTF-8 to `out` @@ -818,9 +817,8 @@ mod tests { assert_eq!( &finished.string_offsets, &[ - 0, 5, 10, 13, 14, 17, 19, 22, 25, 28, 29, 30, 31, 32, 32, 32, 33, 34, 35, - 41, 47, 52, 55, 57, 58, 59, 62, 63, 63, 66, 69, 70, 71, 72, 73, 74, 75, - 76, 77 + 0, 5, 10, 13, 14, 17, 19, 22, 25, 28, 29, 30, 31, 32, 32, 32, 33, 34, 35, 41, 47, + 52, 55, 57, 58, 59, 62, 63, 63, 66, 69, 70, 71, 72, 73, 74, 75, 76, 77 ] ) } diff --git a/arrow-json/src/reader/timestamp_array.rs b/arrow-json/src/reader/timestamp_array.rs index 09672614107c..dda5a653d730 100644 --- a/arrow-json/src/reader/timestamp_array.rs +++ b/arrow-json/src/reader/timestamp_array.rs @@ -52,8 +52,8 @@ where Tz: TimeZone + Send, { fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { - let mut builder = PrimitiveBuilder::

<P>::with_capacity(pos.len()) - .with_data_type(self.data_type.clone()); + let mut builder = + PrimitiveBuilder::<P>
::with_capacity(pos.len()).with_data_type(self.data_type.clone()); for p in pos { match tape.get(*p) { @@ -71,14 +71,12 @@ where TimeUnit::Second => date.timestamp(), TimeUnit::Millisecond => date.timestamp_millis(), TimeUnit::Microsecond => date.timestamp_micros(), - TimeUnit::Nanosecond => { - date.timestamp_nanos_opt().ok_or_else(|| { - ArrowError::ParseError(format!( - "{} would overflow 64-bit signed nanoseconds", - date.to_rfc3339(), - )) - })? - } + TimeUnit::Nanosecond => date.timestamp_nanos_opt().ok_or_else(|| { + ArrowError::ParseError(format!( + "{} would overflow 64-bit signed nanoseconds", + date.to_rfc3339(), + )) + })?, }; builder.append_value(value) } @@ -98,9 +96,7 @@ where } TapeElement::I32(v) => builder.append_value(v as i64), TapeElement::I64(high) => match tape.get(p + 1) { - TapeElement::I32(low) => { - builder.append_value((high as i64) << 32 | low as i64) - } + TapeElement::I32(low) => builder.append_value((high as i64) << 32 | low as i64), _ => unreachable!(), }, _ => return Err(tape.error(*p, "primitive")), diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 8c4145bc95b4..97a8b38d4192 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -329,10 +329,7 @@ fn set_column_for_json_rows( rows.iter_mut().zip(listarr.iter()).try_for_each( |(row, maybe_value)| -> Result<(), ArrowError> { if let Some(v) = maybe_value { - row.insert( - col_name.to_string(), - Value::Array(array_to_json_array(&v)?), - ); + row.insert(col_name.to_string(), Value::Array(array_to_json_array(&v)?)); } Ok(()) }, @@ -384,10 +381,7 @@ fn set_column_for_json_rows( let mut obj = serde_json::Map::new(); for (_, (k, v)) in (0..len).zip(&mut kv) { - obj.insert( - k.expect("keys in a map should be non-null").to_string(), - v, - ); + obj.insert(k.expect("keys in a map should be non-null").to_string(), v); } row.insert(col_name.to_string(), serde_json::Value::Object(obj)); @@ -440,11 +434,7 @@ pub trait JsonFormat: Debug + Default { #[inline] /// write any bytes needed for the start of each row - fn start_row( - &self, - _writer: &mut W, - _is_first_row: bool, - ) -> Result<(), ArrowError> { + fn start_row(&self, _writer: &mut W, _is_first_row: bool) -> Result<(), ArrowError> { Ok(()) } @@ -491,11 +481,7 @@ impl JsonFormat for JsonArray { Ok(()) } - fn start_row( - &self, - writer: &mut W, - is_first_row: bool, - ) -> Result<(), ArrowError> { + fn start_row(&self, writer: &mut W, is_first_row: bool) -> Result<(), ArrowError> { if !is_first_row { writer.write_all(b",")?; } @@ -562,8 +548,7 @@ where self.format.start_row(&mut self.writer, is_first_row)?; self.writer.write_all( - &serde_json::to_vec(row) - .map_err(|error| ArrowError::JsonError(error.to_string()))?, + &serde_json::to_vec(row).map_err(|error| ArrowError::JsonError(error.to_string()))?, )?; self.format.end_row(&mut self.writer)?; Ok(()) @@ -657,9 +642,7 @@ mod tests { let a = Int32Array::from(vec![Some(1), Some(2), Some(3), None, Some(5)]); let b = StringArray::from(vec![Some("a"), Some("b"), Some("c"), Some("d"), None]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) - .unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); let mut buf = Vec::new(); { @@ -688,9 +671,7 @@ mod tests { let a = StringArray::from(vec![Some("a"), None, Some("c"), Some("d"), None]); let b = LargeStringArray::from(vec![Some("a"), Some("b"), None, Some("d"), None]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), 
Arc::new(b)]) - .unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); let mut buf = Vec::new(); { @@ -730,9 +711,7 @@ mod tests { .into_iter() .collect(); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) - .unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); let mut buf = Vec::new(); { @@ -1005,9 +984,7 @@ mod tests { Field::new("c11", DataType::Int32, true), Field::new( "c12", - DataType::Struct( - vec![Field::new("c121", DataType::Utf8, false)].into(), - ), + DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)].into()), false, ), ])), @@ -1024,23 +1001,19 @@ mod tests { ( Arc::new(Field::new( "c12", - DataType::Struct( - vec![Field::new("c121", DataType::Utf8, false)].into(), - ), + DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)].into()), false, )), Arc::new(StructArray::from(vec![( Arc::new(Field::new("c121", DataType::Utf8, false)), - Arc::new(StringArray::from(vec![Some("e"), Some("f"), Some("g")])) - as ArrayRef, + Arc::new(StringArray::from(vec![Some("e"), Some("f"), Some("g")])) as ArrayRef, )])) as ArrayRef, ), ]); let c2 = StringArray::from(vec![Some("a"), Some("b"), Some("c")]); let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); let mut buf = Vec::new(); { @@ -1081,9 +1054,7 @@ mod tests { let b = Int32Array::from(vec![1, 2, 3, 4, 5]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) - .unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); let mut buf = Vec::new(); { @@ -1142,8 +1113,7 @@ mod tests { let c2 = StringArray::from(vec![Some("foo"), Some("bar"), None]); let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); let mut buf = Vec::new(); { @@ -1170,9 +1140,7 @@ mod tests { Field::new("c11", DataType::Int32, true), Field::new( "c12", - DataType::Struct( - vec![Field::new("c121", DataType::Utf8, false)].into(), - ), + DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)].into()), false, ), ])), @@ -1191,15 +1159,12 @@ mod tests { ( Arc::new(Field::new( "c12", - DataType::Struct( - vec![Field::new("c121", DataType::Utf8, false)].into(), - ), + DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)].into()), false, )), Arc::new(StructArray::from(vec![( Arc::new(Field::new("c121", DataType::Utf8, false)), - Arc::new(StringArray::from(vec![Some("e"), Some("f"), Some("g")])) - as ArrayRef, + Arc::new(StringArray::from(vec![Some("e"), Some("f"), Some("g")])) as ArrayRef, )])) as ArrayRef, ), ]); @@ -1221,8 +1186,7 @@ mod tests { let c2 = Int32Array::from(vec![1, 2, 3]); let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); let mut buf = Vec::new(); { @@ -1261,9 +1225,8 @@ mod tests { let mut expected_json = serde_json::from_str::(e).unwrap(); // remove null value from object to make comparison consistent: if let Value::Object(obj) = expected_json { - expected_json = Value::Object( - obj.into_iter().filter(|(_, v)| *v != Value::Null).collect(), - ); + expected_json = + Value::Object(obj.into_iter().filter(|(_, v)| *v 
!= Value::Null).collect()); } assert_eq!(serde_json::from_str::(r).unwrap(), expected_json,); } @@ -1328,8 +1291,7 @@ mod tests { {"list": [{"ints": null}]} {"list": [null]} "#; - let ints_struct = - DataType::Struct(vec![Field::new("ints", DataType::Int32, true)].into()); + let ints_struct = DataType::Struct(vec![Field::new("ints", DataType::Int32, true)].into()); let list_type = DataType::List(Arc::new(Field::new("item", ints_struct, true))); let list_field = Field::new("list", list_type, true); let schema = Arc::new(Schema::new(vec![list_field])); @@ -1368,8 +1330,7 @@ mod tests { #[test] fn json_writer_map() { - let keys_array = - super::StringArray::from(vec!["foo", "bar", "baz", "qux", "quux"]); + let keys_array = super::StringArray::from(vec!["foo", "bar", "baz", "qux", "quux"]); let values_array = super::Int64Array::from(vec![10, 20, 30, 40, 50]); let keys = Arc::new(Field::new("keys", DataType::Utf8, false)); @@ -1449,9 +1410,8 @@ mod tests { let mut expected_json = serde_json::from_str::(e).unwrap(); // remove null value from object to make comparison consistent: if let Value::Object(obj) = expected_json { - expected_json = Value::Object( - obj.into_iter().filter(|(_, v)| *v != Value::Null).collect(), - ); + expected_json = + Value::Object(obj.into_iter().filter(|(_, v)| *v != Value::Null).collect()); } assert_eq!(serde_json::from_str::(r).unwrap(), expected_json,); } @@ -1494,9 +1454,8 @@ mod tests { let mut expected_json = serde_json::from_str::(e).unwrap(); // remove null value from object to make comparison consistent: if let Value::Object(obj) = expected_json { - expected_json = Value::Object( - obj.into_iter().filter(|(_, v)| *v != Value::Null).collect(), - ); + expected_json = + Value::Object(obj.into_iter().filter(|(_, v)| *v != Value::Null).collect()); } assert_eq!(serde_json::from_str::(r).unwrap(), expected_json,); } @@ -1518,8 +1477,7 @@ mod tests { Some(vec![Some(6), Some(7), Some(45)]), ]; - let list_array = - FixedSizeListArray::from_iter_primitive::(data, 3); + let list_array = FixedSizeListArray::from_iter_primitive::(data, 3); let list_array = Arc::new(list_array) as ArrayRef; assert_eq!(array_to_json_array(&list_array).unwrap(), expected_json); diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index feb168335568..bfb1f64e2eb8 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -26,8 +26,8 @@ use arrow_array::cast::AsArray; use arrow_array::types::ByteArrayType; use arrow_array::{ - downcast_primitive_array, AnyDictionaryArray, Array, ArrowNativeTypeOp, BooleanArray, - Datum, FixedSizeBinaryArray, GenericByteArray, + downcast_primitive_array, AnyDictionaryArray, Array, ArrowNativeTypeOp, BooleanArray, Datum, + FixedSizeBinaryArray, GenericByteArray, }; use arrow_buffer::bit_util::ceil; use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer}; @@ -162,20 +162,13 @@ pub fn distinct(lhs: &dyn Datum, rhs: &dyn Datum) -> Result Result { +pub fn not_distinct(lhs: &dyn Datum, rhs: &dyn Datum) -> Result { compare_op(Op::NotDistinct, lhs, rhs) } /// Perform `op` on the provided `Datum` #[inline(never)] -fn compare_op( - op: Op, - lhs: &dyn Datum, - rhs: &dyn Datum, -) -> Result { +fn compare_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result { use arrow_schema::DataType::*; let (l, l_s) = lhs.get(); let (r, r_s) = rhs.get(); @@ -319,12 +312,8 @@ fn apply( assert_eq!(l_v.len(), r_v.len()); // Sanity check Some(match op { - Op::Equal | Op::NotDistinct => { - apply_op_vectored(l, &l_v, r, &r_v, false, T::is_eq) - } - Op::NotEqual | Op::Distinct => 
{ - apply_op_vectored(l, &l_v, r, &r_v, true, T::is_eq) - } + Op::Equal | Op::NotDistinct => apply_op_vectored(l, &l_v, r, &r_v, false, T::is_eq), + Op::NotEqual | Op::Distinct => apply_op_vectored(l, &l_v, r, &r_v, true, T::is_eq), Op::Less => apply_op_vectored(l, &l_v, r, &r_v, false, T::is_lt), Op::LessEqual => apply_op_vectored(r, &r_v, l, &l_v, true, T::is_lt), Op::Greater => apply_op_vectored(r, &r_v, l, &l_v, false, T::is_lt), @@ -561,10 +550,7 @@ mod tests { #[test] fn test_null_dict() { - let a = DictionaryArray::new( - Int32Array::new_null(10), - Arc::new(Int32Array::new_null(0)), - ); + let a = DictionaryArray::new(Int32Array::new_null(10), Arc::new(Int32Array::new_null(0))); let r = eq(&a, &a).unwrap(); assert_eq!(r.null_count(), 10); @@ -575,17 +561,13 @@ mod tests { let r = eq(&a, &a).unwrap(); assert_eq!(r.null_count(), 6); - let scalar = DictionaryArray::new( - Int32Array::new_null(1), - Arc::new(Int32Array::new_null(0)), - ); + let scalar = + DictionaryArray::new(Int32Array::new_null(1), Arc::new(Int32Array::new_null(0))); let r = eq(&a, &Scalar::new(&scalar)).unwrap(); assert_eq!(r.null_count(), 6); - let scalar = DictionaryArray::new( - Int32Array::new_null(1), - Arc::new(Int32Array::new_null(0)), - ); + let scalar = + DictionaryArray::new(Int32Array::new_null(1), Arc::new(Int32Array::new_null(0))); let r = eq(&Scalar::new(&scalar), &Scalar::new(&scalar)).unwrap(); assert_eq!(r.null_count(), 1); diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index ffd35a6070b8..021ecdf0e658 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -42,14 +42,8 @@ macro_rules! try_to_type { } // Avoids creating a closure for each combination of `$RIGHT` and `$TY` -fn try_to_type_result( - value: Option, - right: &str, - ty: &str, -) -> Result { - value.ok_or_else(|| { - ArrowError::ComputeError(format!("Could not convert {right} with {ty}",)) - }) +fn try_to_type_result(value: Option, right: &str, ty: &str) -> Result { + value.ok_or_else(|| ArrowError::ComputeError(format!("Could not convert {right} with {ty}",))) } fn make_primitive_scalar( @@ -106,8 +100,7 @@ fn make_primitive_scalar( DataType::Decimal128(_, _) => { let right = try_to_type!(scalar, to_i128)?; Ok(Arc::new( - PrimitiveArray::::from(vec![right]) - .with_data_type(d.clone()), + PrimitiveArray::::from(vec![right]).with_data_type(d.clone()), )) } DataType::Decimal256(_, _) => { @@ -149,8 +142,7 @@ fn make_primitive_scalar( DataType::Timestamp(TimeUnit::Second, _) => { let right = try_to_type!(scalar, to_i64)?; Ok(Arc::new( - PrimitiveArray::::from(vec![right]) - .with_data_type(d.clone()), + PrimitiveArray::::from(vec![right]).with_data_type(d.clone()), )) } DataType::Time32(TimeUnit::Second) => { @@ -229,12 +221,10 @@ fn make_primitive_scalar( fn make_binary_scalar(d: &DataType, scalar: &[u8]) -> Result { match d { DataType::Binary => Ok(Arc::new(BinaryArray::from_iter_values([scalar]))), - DataType::FixedSizeBinary(_) => Ok(Arc::new( - FixedSizeBinaryArray::try_from_iter([scalar].into_iter())?, - )), - DataType::LargeBinary => { - Ok(Arc::new(LargeBinaryArray::from_iter_values([scalar]))) - } + DataType::FixedSizeBinary(_) => Ok(Arc::new(FixedSizeBinaryArray::try_from_iter( + [scalar].into_iter(), + )?)), + DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter_values([scalar]))), DataType::Dictionary(_, v) => make_binary_scalar(v.as_ref(), scalar), _ => Err(ArrowError::InvalidArgumentError(format!( "Unsupported binary scalar data type {d:?}", @@ -265,8 +255,7 @@ where { 
if left.len() != right.len() { return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), + "Cannot perform comparison operation on arrays of different length".to_string(), )); } @@ -275,10 +264,7 @@ where /// Helper function to perform boolean lambda function on values from array accessor, this /// version does not attempt to use SIMD. -fn compare_op_scalar( - left: T, - op: F, -) -> Result +fn compare_op_scalar(left: T, op: F) -> Result where F: Fn(T::Item) -> bool, { @@ -336,114 +322,78 @@ pub fn eq_utf8_scalar( /// Perform `left == right` operation on [`BooleanArray`] #[deprecated(note = "Use arrow_ord::cmp::eq")] -pub fn eq_bool( - left: &BooleanArray, - right: &BooleanArray, -) -> Result { +pub fn eq_bool(left: &BooleanArray, right: &BooleanArray) -> Result { crate::cmp::eq(&left, &right) } /// Perform `left != right` operation on [`BooleanArray`] #[deprecated(note = "Use arrow_ord::cmp::neq")] -pub fn neq_bool( - left: &BooleanArray, - right: &BooleanArray, -) -> Result { +pub fn neq_bool(left: &BooleanArray, right: &BooleanArray) -> Result { crate::cmp::neq(&left, &right) } /// Perform `left < right` operation on [`BooleanArray`] #[deprecated(note = "Use arrow_ord::cmp::lt")] -pub fn lt_bool( - left: &BooleanArray, - right: &BooleanArray, -) -> Result { +pub fn lt_bool(left: &BooleanArray, right: &BooleanArray) -> Result { crate::cmp::lt(&left, &right) } /// Perform `left <= right` operation on [`BooleanArray`] #[deprecated(note = "Use arrow_ord::cmp::lt_eq")] -pub fn lt_eq_bool( - left: &BooleanArray, - right: &BooleanArray, -) -> Result { +pub fn lt_eq_bool(left: &BooleanArray, right: &BooleanArray) -> Result { crate::cmp::lt_eq(&left, &right) } /// Perform `left > right` operation on [`BooleanArray`] #[deprecated(note = "Use arrow_ord::cmp::gt")] -pub fn gt_bool( - left: &BooleanArray, - right: &BooleanArray, -) -> Result { +pub fn gt_bool(left: &BooleanArray, right: &BooleanArray) -> Result { crate::cmp::gt(&left, &right) } /// Perform `left >= right` operation on [`BooleanArray`] #[deprecated(note = "Use arrow_ord::cmp::gt_eq")] -pub fn gt_eq_bool( - left: &BooleanArray, - right: &BooleanArray, -) -> Result { +pub fn gt_eq_bool(left: &BooleanArray, right: &BooleanArray) -> Result { crate::cmp::gt_eq(&left, &right) } /// Perform `left == right` operation on [`BooleanArray`] and a scalar #[deprecated(note = "Use arrow_ord::cmp::eq")] -pub fn eq_bool_scalar( - left: &BooleanArray, - right: bool, -) -> Result { +pub fn eq_bool_scalar(left: &BooleanArray, right: bool) -> Result { let right = BooleanArray::from(vec![right]); crate::cmp::eq(&left, &Scalar::new(&right)) } /// Perform `left < right` operation on [`BooleanArray`] and a scalar #[deprecated(note = "Use arrow_ord::cmp::lt")] -pub fn lt_bool_scalar( - left: &BooleanArray, - right: bool, -) -> Result { +pub fn lt_bool_scalar(left: &BooleanArray, right: bool) -> Result { let right = BooleanArray::from(vec![right]); crate::cmp::lt(&left, &Scalar::new(&right)) } /// Perform `left <= right` operation on [`BooleanArray`] and a scalar #[deprecated(note = "Use arrow_ord::cmp::lt_eq")] -pub fn lt_eq_bool_scalar( - left: &BooleanArray, - right: bool, -) -> Result { +pub fn lt_eq_bool_scalar(left: &BooleanArray, right: bool) -> Result { let right = BooleanArray::from(vec![right]); crate::cmp::lt_eq(&left, &Scalar::new(&right)) } /// Perform `left > right` operation on [`BooleanArray`] and a scalar #[deprecated(note = "Use arrow_ord::cmp::gt")] -pub fn gt_bool_scalar( - 
left: &BooleanArray, - right: bool, -) -> Result { +pub fn gt_bool_scalar(left: &BooleanArray, right: bool) -> Result { let right = BooleanArray::from(vec![right]); crate::cmp::gt(&left, &Scalar::new(&right)) } /// Perform `left >= right` operation on [`BooleanArray`] and a scalar #[deprecated(note = "Use arrow_ord::cmp::gt_eq")] -pub fn gt_eq_bool_scalar( - left: &BooleanArray, - right: bool, -) -> Result { +pub fn gt_eq_bool_scalar(left: &BooleanArray, right: bool) -> Result { let right = BooleanArray::from(vec![right]); crate::cmp::gt_eq(&left, &Scalar::new(&right)) } /// Perform `left != right` operation on [`BooleanArray`] and a scalar #[deprecated(note = "Use arrow_ord::cmp::neq")] -pub fn neq_bool_scalar( - left: &BooleanArray, - right: bool, -) -> Result { +pub fn neq_bool_scalar(left: &BooleanArray, right: bool) -> Result { let right = BooleanArray::from(vec![right]); crate::cmp::neq(&left, &Scalar::new(&right)) } @@ -768,10 +718,7 @@ where /// Perform `left == right` operation on an array and a numeric scalar /// value. Supports BinaryArray and LargeBinaryArray #[deprecated(note = "Use arrow_ord::cmp::eq")] -pub fn eq_dyn_binary_scalar( - left: &dyn Array, - right: &[u8], -) -> Result { +pub fn eq_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result { let right = make_binary_scalar(left.data_type(), right)?; crate::cmp::eq(&left, &Scalar::new(&right)) } @@ -779,10 +726,7 @@ pub fn eq_dyn_binary_scalar( /// Perform `left != right` operation on an array and a numeric scalar /// value. Supports BinaryArray and LargeBinaryArray #[deprecated(note = "Use arrow_ord::cmp::neq")] -pub fn neq_dyn_binary_scalar( - left: &dyn Array, - right: &[u8], -) -> Result { +pub fn neq_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result { let right = make_binary_scalar(left.data_type(), right)?; crate::cmp::neq(&left, &Scalar::new(&right)) } @@ -790,10 +734,7 @@ pub fn neq_dyn_binary_scalar( /// Perform `left < right` operation on an array and a numeric scalar /// value. Supports BinaryArray and LargeBinaryArray #[deprecated(note = "Use arrow_ord::cmp::lt")] -pub fn lt_dyn_binary_scalar( - left: &dyn Array, - right: &[u8], -) -> Result { +pub fn lt_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result { let right = make_binary_scalar(left.data_type(), right)?; crate::cmp::lt(&left, &Scalar::new(&right)) } @@ -801,10 +742,7 @@ pub fn lt_dyn_binary_scalar( /// Perform `left <= right` operation on an array and a numeric scalar /// value. Supports BinaryArray and LargeBinaryArray #[deprecated(note = "Use arrow_ord::cmp::lt_eq")] -pub fn lt_eq_dyn_binary_scalar( - left: &dyn Array, - right: &[u8], -) -> Result { +pub fn lt_eq_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result { let right = make_binary_scalar(left.data_type(), right)?; crate::cmp::lt_eq(&left, &Scalar::new(&right)) } @@ -812,10 +750,7 @@ pub fn lt_eq_dyn_binary_scalar( /// Perform `left > right` operation on an array and a numeric scalar /// value. Supports BinaryArray and LargeBinaryArray #[deprecated(note = "Use arrow_ord::cmp::gt")] -pub fn gt_dyn_binary_scalar( - left: &dyn Array, - right: &[u8], -) -> Result { +pub fn gt_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result { let right = make_binary_scalar(left.data_type(), right)?; crate::cmp::gt(&left, &Scalar::new(&right)) } @@ -823,10 +758,7 @@ pub fn gt_dyn_binary_scalar( /// Perform `left >= right` operation on an array and a numeric scalar /// value. 
Supports BinaryArray and LargeBinaryArray #[deprecated(note = "Use arrow_ord::cmp::gt_eq")] -pub fn gt_eq_dyn_binary_scalar( - left: &dyn Array, - right: &[u8], -) -> Result { +pub fn gt_eq_dyn_binary_scalar(left: &dyn Array, right: &[u8]) -> Result { let right = make_binary_scalar(left.data_type(), right)?; crate::cmp::gt_eq(&left, &Scalar::new(&right)) } @@ -834,10 +766,7 @@ pub fn gt_eq_dyn_binary_scalar( /// Perform `left == right` operation on an array and a numeric scalar /// value. Supports StringArrays, and DictionaryArrays that have string values #[deprecated(note = "Use arrow_ord::cmp::eq")] -pub fn eq_dyn_utf8_scalar( - left: &dyn Array, - right: &str, -) -> Result { +pub fn eq_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result { let right = make_utf8_scalar(left.data_type(), right)?; crate::cmp::eq(&left, &Scalar::new(&right)) } @@ -845,10 +774,7 @@ pub fn eq_dyn_utf8_scalar( /// Perform `left < right` operation on an array and a numeric scalar /// value. Supports StringArrays, and DictionaryArrays that have string values #[deprecated(note = "Use arrow_ord::cmp::lt")] -pub fn lt_dyn_utf8_scalar( - left: &dyn Array, - right: &str, -) -> Result { +pub fn lt_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result { let right = make_utf8_scalar(left.data_type(), right)?; crate::cmp::lt(&left, &Scalar::new(&right)) } @@ -856,10 +782,7 @@ pub fn lt_dyn_utf8_scalar( /// Perform `left >= right` operation on an array and a numeric scalar /// value. Supports StringArrays, and DictionaryArrays that have string values #[deprecated(note = "Use arrow_ord::cmp::gt_eq")] -pub fn gt_eq_dyn_utf8_scalar( - left: &dyn Array, - right: &str, -) -> Result { +pub fn gt_eq_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result { let right = make_utf8_scalar(left.data_type(), right)?; crate::cmp::gt_eq(&left, &Scalar::new(&right)) } @@ -867,10 +790,7 @@ pub fn gt_eq_dyn_utf8_scalar( /// Perform `left <= right` operation on an array and a numeric scalar /// value. Supports StringArrays, and DictionaryArrays that have string values #[deprecated(note = "Use arrow_ord::cmp::lt_eq")] -pub fn lt_eq_dyn_utf8_scalar( - left: &dyn Array, - right: &str, -) -> Result { +pub fn lt_eq_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result { let right = make_utf8_scalar(left.data_type(), right)?; crate::cmp::lt_eq(&left, &Scalar::new(&right)) } @@ -878,10 +798,7 @@ pub fn lt_eq_dyn_utf8_scalar( /// Perform `left > right` operation on an array and a numeric scalar /// value. Supports StringArrays, and DictionaryArrays that have string values #[deprecated(note = "Use arrow_ord::cmp::gt")] -pub fn gt_dyn_utf8_scalar( - left: &dyn Array, - right: &str, -) -> Result { +pub fn gt_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result { let right = make_utf8_scalar(left.data_type(), right)?; crate::cmp::gt(&left, &Scalar::new(&right)) } @@ -889,10 +806,7 @@ pub fn gt_dyn_utf8_scalar( /// Perform `left != right` operation on an array and a numeric scalar /// value. Supports StringArrays, and DictionaryArrays that have string values #[deprecated(note = "Use arrow_ord::cmp::neq")] -pub fn neq_dyn_utf8_scalar( - left: &dyn Array, - right: &str, -) -> Result { +pub fn neq_dyn_utf8_scalar(left: &dyn Array, right: &str) -> Result { let right = make_utf8_scalar(left.data_type(), right)?; crate::cmp::neq(&left, &Scalar::new(&right)) } @@ -900,10 +814,7 @@ pub fn neq_dyn_utf8_scalar( /// Perform `left == right` operation on an array and a numeric scalar /// value. 
#[deprecated(note = "Use arrow_ord::cmp::eq")] -pub fn eq_dyn_bool_scalar( - left: &dyn Array, - right: bool, -) -> Result { +pub fn eq_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result { let right = BooleanArray::from(vec![right]); crate::cmp::eq(&left, &Scalar::new(&right)) } @@ -911,10 +822,7 @@ pub fn eq_dyn_bool_scalar( /// Perform `left < right` operation on an array and a numeric scalar /// value. Supports BooleanArrays. #[deprecated(note = "Use arrow_ord::cmp::lt")] -pub fn lt_dyn_bool_scalar( - left: &dyn Array, - right: bool, -) -> Result { +pub fn lt_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result { let right = BooleanArray::from(vec![right]); crate::cmp::lt(&left, &Scalar::new(&right)) } @@ -922,10 +830,7 @@ pub fn lt_dyn_bool_scalar( /// Perform `left > right` operation on an array and a numeric scalar /// value. Supports BooleanArrays. #[deprecated(note = "Use arrow_ord::cmp::gt")] -pub fn gt_dyn_bool_scalar( - left: &dyn Array, - right: bool, -) -> Result { +pub fn gt_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result { let right = BooleanArray::from(vec![right]); crate::cmp::gt(&left, &Scalar::new(&right)) } @@ -933,10 +838,7 @@ pub fn gt_dyn_bool_scalar( /// Perform `left <= right` operation on an array and a numeric scalar /// value. Supports BooleanArrays. #[deprecated(note = "Use arrow_ord::cmp::lt_eq")] -pub fn lt_eq_dyn_bool_scalar( - left: &dyn Array, - right: bool, -) -> Result { +pub fn lt_eq_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result { let right = BooleanArray::from(vec![right]); crate::cmp::lt_eq(&left, &Scalar::new(&right)) } @@ -944,10 +846,7 @@ pub fn lt_eq_dyn_bool_scalar( /// Perform `left >= right` operation on an array and a numeric scalar /// value. Supports BooleanArrays. #[deprecated(note = "Use arrow_ord::cmp::gt_eq")] -pub fn gt_eq_dyn_bool_scalar( - left: &dyn Array, - right: bool, -) -> Result { +pub fn gt_eq_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result { let right = BooleanArray::from(vec![right]); crate::cmp::gt_eq(&left, &Scalar::new(&right)) } @@ -955,10 +854,7 @@ pub fn gt_eq_dyn_bool_scalar( /// Perform `left != right` operation on an array and a numeric scalar /// value. Supports BooleanArrays. #[deprecated(note = "Use arrow_ord::cmp::neq")] -pub fn neq_dyn_bool_scalar( - left: &dyn Array, - right: bool, -) -> Result { +pub fn neq_dyn_bool_scalar(left: &dyn Array, right: bool) -> Result { let right = BooleanArray::from(vec![right]); crate::cmp::neq(&left, &Scalar::new(&right)) } @@ -1063,10 +959,7 @@ pub fn lt_dyn(left: &dyn Array, right: &dyn Array) -> Result Result { +pub fn lt_eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { crate::cmp::lt_eq(&left, &right) } @@ -1116,10 +1009,7 @@ pub fn gt_dyn(left: &dyn Array, right: &dyn Array) -> Result Result { +pub fn gt_eq_dyn(left: &dyn Array, right: &dyn Array) -> Result { crate::cmp::gt_eq(&left, &right) } @@ -1151,10 +1041,7 @@ where /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. #[deprecated(note = "Use arrow_ord::cmp::eq")] -pub fn eq_scalar( - left: &PrimitiveArray, - right: T::Native, -) -> Result +pub fn eq_scalar(left: &PrimitiveArray, right: T::Native) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1164,10 +1051,7 @@ where } /// Applies an unary and infallible comparison function to a primitive array. 
-pub fn unary_cmp( - left: &PrimitiveArray, - op: F, -) -> Result +pub fn unary_cmp(left: &PrimitiveArray, op: F) -> Result where T: ArrowNumericType, F: Fn(T::Native) -> bool, @@ -1204,10 +1088,7 @@ where /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. #[deprecated(note = "Use arrow_ord::cmp::neq")] -pub fn neq_scalar( - left: &PrimitiveArray, - right: T::Native, -) -> Result +pub fn neq_scalar(left: &PrimitiveArray, right: T::Native) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1247,10 +1128,7 @@ where /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. #[deprecated(note = "Use arrow_ord::cmp::lt")] -pub fn lt_scalar( - left: &PrimitiveArray, - right: T::Native, -) -> Result +pub fn lt_scalar(left: &PrimitiveArray, right: T::Native) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1333,10 +1211,7 @@ where /// to treat them as equal, please normalize zeros before calling this kernel. /// Please refer to `f32::total_cmp` and `f64::total_cmp`. #[deprecated(note = "Use arrow_ord::cmp::gt")] -pub fn gt_scalar( - left: &PrimitiveArray, - right: T::Native, -) -> Result +pub fn gt_scalar(left: &PrimitiveArray, right: T::Native) -> Result where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, @@ -1400,8 +1275,7 @@ where let left_len = left.len(); if left_len != right.len() { return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), + "Cannot perform comparison operation on arrays of different length".to_string(), )); } @@ -1441,8 +1315,7 @@ where let left_len = left.len(); if left_len != right.len() { return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), + "Cannot perform comparison operation on arrays of different length".to_string(), )); } @@ -1678,11 +1551,9 @@ mod tests { #[test] fn test_boolean_array_eq() { let a: BooleanArray = - vec![Some(true), Some(false), Some(false), Some(true), Some(true), None] - .into(); + vec![Some(true), Some(false), Some(false), Some(true), Some(true), None].into(); let b: BooleanArray = - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - .into(); + vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)].into(); let res: Vec> = eq_bool(&a, &b).unwrap().iter().collect(); @@ -1695,11 +1566,9 @@ mod tests { #[test] fn test_boolean_array_neq() { let a: BooleanArray = - vec![Some(true), Some(false), Some(false), Some(true), Some(true), None] - .into(); + vec![Some(true), Some(false), Some(false), Some(true), Some(true), None].into(); let b: BooleanArray = - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - .into(); + vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)].into(); let res: Vec> = neq_bool(&a, &b).unwrap().iter().collect(); @@ -1712,11 +1581,9 @@ mod tests { #[test] fn test_boolean_array_lt() { let a: BooleanArray = - vec![Some(true), Some(false), Some(false), Some(true), Some(true), None] - .into(); + vec![Some(true), Some(false), Some(false), Some(true), Some(true), None].into(); let b: BooleanArray = - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - .into(); + vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)].into(); let res: Vec> = lt_bool(&a, 
&b).unwrap().iter().collect(); @@ -1729,11 +1596,9 @@ mod tests { #[test] fn test_boolean_array_lt_eq() { let a: BooleanArray = - vec![Some(true), Some(false), Some(false), Some(true), Some(true), None] - .into(); + vec![Some(true), Some(false), Some(false), Some(true), Some(true), None].into(); let b: BooleanArray = - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - .into(); + vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)].into(); let res: Vec> = lt_eq_bool(&a, &b).unwrap().iter().collect(); @@ -1746,11 +1611,9 @@ mod tests { #[test] fn test_boolean_array_gt() { let a: BooleanArray = - vec![Some(true), Some(false), Some(false), Some(true), Some(true), None] - .into(); + vec![Some(true), Some(false), Some(false), Some(true), Some(true), None].into(); let b: BooleanArray = - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - .into(); + vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)].into(); let res: Vec> = gt_bool(&a, &b).unwrap().iter().collect(); @@ -1763,11 +1626,9 @@ mod tests { #[test] fn test_boolean_array_gt_eq() { let a: BooleanArray = - vec![Some(true), Some(false), Some(false), Some(true), Some(true), None] - .into(); + vec![Some(true), Some(false), Some(false), Some(true), Some(true), None].into(); let b: BooleanArray = - vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] - .into(); + vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)].into(); let res: Vec> = gt_eq_bool(&a, &b).unwrap().iter().collect(); @@ -1794,8 +1655,7 @@ mod tests { fn test_boolean_array_neq_scalar() { let a: BooleanArray = vec![Some(true), Some(false), None].into(); - let res1: Vec> = - neq_bool_scalar(&a, false).unwrap().iter().collect(); + let res1: Vec> = neq_bool_scalar(&a, false).unwrap().iter().collect(); assert_eq!(res1, vec![Some(true), Some(false), None]); @@ -1821,13 +1681,11 @@ mod tests { fn test_boolean_array_lt_eq_scalar() { let a: BooleanArray = vec![Some(true), Some(false), None].into(); - let res1: Vec> = - lt_eq_bool_scalar(&a, false).unwrap().iter().collect(); + let res1: Vec> = lt_eq_bool_scalar(&a, false).unwrap().iter().collect(); assert_eq!(res1, vec![Some(false), Some(true), None]); - let res2: Vec> = - lt_eq_bool_scalar(&a, true).unwrap().iter().collect(); + let res2: Vec> = lt_eq_bool_scalar(&a, true).unwrap().iter().collect(); assert_eq!(res2, vec![Some(true), Some(true), None]); } @@ -1849,13 +1707,11 @@ mod tests { fn test_boolean_array_gt_eq_scalar() { let a: BooleanArray = vec![Some(true), Some(false), None].into(); - let res1: Vec> = - gt_eq_bool_scalar(&a, false).unwrap().iter().collect(); + let res1: Vec> = gt_eq_bool_scalar(&a, false).unwrap().iter().collect(); assert_eq!(res1, vec![Some(true), Some(true), None]); - let res2: Vec> = - gt_eq_bool_scalar(&a, true).unwrap().iter().collect(); + let res2: Vec> = gt_eq_bool_scalar(&a, true).unwrap().iter().collect(); assert_eq!(res2, vec![Some(true), Some(false), None]); } @@ -2140,25 +1996,19 @@ mod tests { #[test] fn test_interval_array() { - let a = IntervalDayTimeArray::from( - vec![Some(0), Some(6), Some(834), None, Some(3), None], - ); - let b = IntervalDayTimeArray::from( - vec![Some(70), Some(6), Some(833), Some(6), Some(3), None], - ); + let a = IntervalDayTimeArray::from(vec![Some(0), Some(6), Some(834), None, Some(3), None]); + let b = + IntervalDayTimeArray::from(vec![Some(70), Some(6), Some(833), Some(6), Some(3), None]); let res = eq(&a, &b).unwrap(); let res_dyn = 
eq_dyn(&a, &b).unwrap(); assert_eq!(res, res_dyn); assert_eq!( &res_dyn, - &BooleanArray::from( - vec![Some(false), Some(true), Some(false), None, Some(true), None] - ) + &BooleanArray::from(vec![Some(false), Some(true), Some(false), None, Some(true), None]) ); - let a = IntervalMonthDayNanoArray::from( - vec![Some(0), Some(6), Some(834), None, Some(3), None], - ); + let a = + IntervalMonthDayNanoArray::from(vec![Some(0), Some(6), Some(834), None, Some(3), None]); let b = IntervalMonthDayNanoArray::from( vec![Some(86), Some(5), Some(8), Some(6), Some(3), None], ); @@ -2172,9 +2022,8 @@ mod tests { ) ); - let a = IntervalYearMonthArray::from( - vec![Some(0), Some(623), Some(834), None, Some(3), None], - ); + let a = + IntervalYearMonthArray::from(vec![Some(0), Some(623), Some(834), None, Some(3), None]); let b = IntervalYearMonthArray::from( vec![Some(86), Some(5), Some(834), Some(6), Some(86), None], ); @@ -2183,9 +2032,7 @@ mod tests { assert_eq!(res, res_dyn); assert_eq!( &res_dyn, - &BooleanArray::from( - vec![Some(false), Some(true), Some(true), None, Some(false), None] - ) + &BooleanArray::from(vec![Some(false), Some(true), Some(true), None, Some(false), None]) ); } @@ -2210,9 +2057,7 @@ mod tests { #[test] fn test_binary_eq_scalar_on_slice() { - let a = BinaryArray::from_opt_vec( - vec![Some(b"hi"), None, Some(b"hello"), Some(b"world")], - ); + let a = BinaryArray::from_opt_vec(vec![Some(b"hi"), None, Some(b"hello"), Some(b"world")]); let a = a.slice(1, 3); let a = as_generic_binary_array::(&a); let a_eq = eq_binary_scalar(a, b"hello").unwrap(); @@ -2402,9 +2247,7 @@ mod tests { #[test] fn test_utf8_eq_scalar_on_slice() { - let a = StringArray::from( - vec![Some("hi"), None, Some("hello"), Some("world"), Some("")], - ); + let a = StringArray::from(vec![Some("hi"), None, Some("hello"), Some("world"), Some("")]); let a = a.slice(1, 4); let a_eq = eq_utf8_scalar(&a, "hello").unwrap(); assert_eq!( @@ -2559,16 +2402,13 @@ mod tests { let a_eq = eq_dyn_scalar(&array, 8).unwrap(); assert_eq!( a_eq, - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(true), Some(false)] - ) + BooleanArray::from(vec![Some(false), Some(false), Some(true), Some(true), Some(false)]) ); } #[test] fn test_eq_dyn_scalar_with_dict() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(3, 2); + let mut builder = PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(123).unwrap(); builder.append_null(); builder.append(23).unwrap(); @@ -2586,9 +2426,8 @@ mod tests { .into_iter() .map(Some) .collect(); - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(true), Some(false)], - ); + let expected = + BooleanArray::from(vec![Some(false), Some(false), Some(true), Some(true), Some(false)]); assert_eq!(eq_dyn_scalar(&array, 8).unwrap(), expected); let array = array.unary::<_, Float64Type>(|x| x as f64); @@ -2601,16 +2440,13 @@ mod tests { let a_eq = lt_dyn_scalar(&array, 8).unwrap(); assert_eq!( a_eq, - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(false), Some(false)] - ) + BooleanArray::from(vec![Some(true), Some(true), Some(false), Some(false), Some(false)]) ); } #[test] fn test_lt_dyn_scalar_with_dict() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(3, 2); + let mut builder = PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(123).unwrap(); builder.append_null(); builder.append(23).unwrap(); @@ -2628,9 +2464,8 @@ mod tests { .into_iter() .map(Some) .collect(); - let expected = 
BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(false), Some(false)], - ); + let expected = + BooleanArray::from(vec![Some(true), Some(true), Some(false), Some(false), Some(false)]); assert_eq!(lt_dyn_scalar(&array, 8).unwrap(), expected); let array = array.unary::<_, Float64Type>(|x| x as f64); @@ -2643,9 +2478,7 @@ mod tests { let a_eq = lt_eq_dyn_scalar(&array, 8).unwrap(); assert_eq!( a_eq, - BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(false)] - ) + BooleanArray::from(vec![Some(true), Some(true), Some(true), Some(true), Some(false)]) ); } @@ -2683,20 +2516,16 @@ mod tests { #[test] fn test_timestamp_dyn_scalar() { - let array = - TimestampSecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = TimestampSecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); - let array = - TimestampMicrosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = TimestampMicrosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); - let array = - TimestampMicrosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = TimestampMicrosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); - let array = - TimestampNanosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = TimestampNanosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); } @@ -2717,60 +2546,49 @@ mod tests { let array = Time32SecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); - let array = - Time32MillisecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = Time32MillisecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); } #[test] fn test_time64_dyn_scalar() { - let array = - Time64MicrosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = Time64MicrosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); - let array = - Time64NanosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = Time64NanosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); } #[test] fn test_interval_dyn_scalar() { - let array = - IntervalDayTimeArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = IntervalDayTimeArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); - let array = - IntervalMonthDayNanoArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = IntervalMonthDayNanoArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); - let array = - IntervalYearMonthArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = IntervalYearMonthArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); } #[test] fn test_duration_dyn_scalar() { - let array = - DurationSecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = DurationSecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); - let array = - DurationMicrosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = DurationMicrosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); 
test_primitive_dyn_scalar(array); - let array = - DurationMillisecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = DurationMillisecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); - let array = - DurationNanosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); + let array = DurationNanosecondArray::from(vec![Some(1), None, Some(8), None, Some(10)]); test_primitive_dyn_scalar(array); } #[test] fn test_lt_eq_dyn_scalar_with_dict() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(3, 2); + let mut builder = PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(123).unwrap(); builder.append_null(); builder.append(23).unwrap(); @@ -2788,9 +2606,8 @@ mod tests { .into_iter() .map(Some) .collect(); - let expected = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(false)], - ); + let expected = + BooleanArray::from(vec![Some(true), Some(true), Some(true), Some(true), Some(false)]); assert_eq!(lt_eq_dyn_scalar(&array, 8).unwrap(), expected); let array = array.unary::<_, Float64Type>(|x| x as f64); @@ -2811,8 +2628,7 @@ mod tests { #[test] fn test_gt_dyn_scalar_with_dict() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(3, 2); + let mut builder = PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(123).unwrap(); builder.append_null(); builder.append(23).unwrap(); @@ -2845,16 +2661,13 @@ mod tests { let a_eq = gt_eq_dyn_scalar(&array, 8).unwrap(); assert_eq!( a_eq, - BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(true), Some(true)] - ) + BooleanArray::from(vec![Some(false), Some(false), Some(true), Some(true), Some(true)]) ); } #[test] fn test_gt_eq_dyn_scalar_with_dict() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(3, 2); + let mut builder = PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(22).unwrap(); builder.append_null(); builder.append(23).unwrap(); @@ -2872,9 +2685,8 @@ mod tests { .into_iter() .map(Some) .collect(); - let expected = BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(false), Some(false), Some(true), Some(true), Some(true)]); assert_eq!(gt_eq_dyn_scalar(&array, 8).unwrap(), expected); let array = array.unary::<_, Float64Type>(|x| x as f64); @@ -2887,16 +2699,13 @@ mod tests { let a_eq = neq_dyn_scalar(&array, 8).unwrap(); assert_eq!( a_eq, - BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(false), Some(true)] - ) + BooleanArray::from(vec![Some(true), Some(true), Some(false), Some(false), Some(true)]) ); } #[test] fn test_neq_dyn_scalar_with_dict() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(3, 2); + let mut builder = PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(22).unwrap(); builder.append_null(); builder.append(23).unwrap(); @@ -2914,9 +2723,8 @@ mod tests { .into_iter() .map(Some) .collect(); - let expected = BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(false), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(true), Some(true), Some(false), Some(false), Some(true)]); assert_eq!(neq_dyn_scalar(&array, 8).unwrap(), expected); let array = array.unary::<_, Float64Type>(|x| x as f64); @@ -2944,8 +2752,7 @@ mod tests { ) .unwrap(); let scalar = &[1u8]; - let expected = - BooleanArray::from(vec![Some(false), Some(false), Some(false), Some(true)]); + 
let expected = BooleanArray::from(vec![Some(false), Some(false), Some(false), Some(true)]); assert_eq!(eq_dyn_binary_scalar(&fsb_array, scalar).unwrap(), expected); } @@ -2970,8 +2777,7 @@ mod tests { ) .unwrap(); let scalar = &[1u8]; - let expected = - BooleanArray::from(vec![Some(true), Some(true), Some(true), Some(false)]); + let expected = BooleanArray::from(vec![Some(true), Some(true), Some(true), Some(false)]); assert_eq!(neq_dyn_binary_scalar(&fsb_array, scalar).unwrap(), expected); } @@ -3065,9 +2871,7 @@ mod tests { let a_eq = eq_dyn_utf8_scalar(&array, "def").unwrap(); assert_eq!( a_eq, - BooleanArray::from( - vec![Some(false), None, Some(true), Some(true), Some(false)] - ) + BooleanArray::from(vec![Some(false), None, Some(true), Some(true), Some(false)]) ); } @@ -3093,9 +2897,7 @@ mod tests { let a_eq = lt_dyn_utf8_scalar(&array, "def").unwrap(); assert_eq!( a_eq, - BooleanArray::from( - vec![Some(true), None, Some(false), Some(false), Some(true)] - ) + BooleanArray::from(vec![Some(true), None, Some(false), Some(false), Some(true)]) ); } @@ -3121,9 +2923,7 @@ mod tests { let a_eq = lt_eq_dyn_utf8_scalar(&array, "def").unwrap(); assert_eq!( a_eq, - BooleanArray::from( - vec![Some(true), None, Some(true), Some(true), Some(false)] - ) + BooleanArray::from(vec![Some(true), None, Some(true), Some(true), Some(false)]) ); } @@ -3149,9 +2949,7 @@ mod tests { let a_eq = gt_eq_dyn_utf8_scalar(&array, "def").unwrap(); assert_eq!( a_eq, - BooleanArray::from( - vec![Some(false), None, Some(true), Some(true), Some(true)] - ) + BooleanArray::from(vec![Some(false), None, Some(true), Some(true), Some(true)]) ); } @@ -3177,9 +2975,7 @@ mod tests { let a_eq = gt_dyn_utf8_scalar(&array, "def").unwrap(); assert_eq!( a_eq, - BooleanArray::from( - vec![Some(false), None, Some(false), Some(false), Some(true)] - ) + BooleanArray::from(vec![Some(false), None, Some(false), Some(false), Some(true)]) ); } @@ -3205,9 +3001,7 @@ mod tests { let a_eq = neq_dyn_utf8_scalar(&array, "def").unwrap(); assert_eq!( a_eq, - BooleanArray::from( - vec![Some(true), None, Some(false), Some(false), Some(true)] - ) + BooleanArray::from(vec![Some(true), None, Some(false), Some(false), Some(true)]) ); } @@ -3273,17 +3067,13 @@ mod tests { #[test] fn test_eq_dyn_neq_dyn_fixed_size_binary() { - let values1: Vec> = - vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x01])]; - let values2: Vec> = - vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x00])]; + let values1: Vec> = vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x01])]; + let values2: Vec> = vec![Some(&[0xfc, 0xa9]), None, Some(&[0x36, 0x00])]; let array1 = - FixedSizeBinaryArray::try_from_sparse_iter_with_size(values1.into_iter(), 2) - .unwrap(); + FixedSizeBinaryArray::try_from_sparse_iter_with_size(values1.into_iter(), 2).unwrap(); let array2 = - FixedSizeBinaryArray::try_from_sparse_iter_with_size(values2.into_iter(), 2) - .unwrap(); + FixedSizeBinaryArray::try_from_sparse_iter_with_size(values2.into_iter(), 2).unwrap(); let result = eq_dyn(&array1, &array2).unwrap(); assert_eq!( @@ -3615,10 +3405,11 @@ mod tests { #[test] fn test_eq_dyn_neq_dyn_float_nan() { let array1 = Float16Array::from(vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)]); - let array2 = Float16Array::from(vec![f16::NAN, f16::NAN, f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)]); - let expected = BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(true), Some(true)], + let array2 = Float16Array::from( + vec![f16::NAN, f16::NAN, 
f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)], ); + let expected = + BooleanArray::from(vec![Some(true), Some(false), Some(true), Some(true), Some(true)]); assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); assert_eq!(eq(&array1, &array2).unwrap(), expected); @@ -3632,9 +3423,8 @@ mod tests { let array1 = Float32Array::from(vec![f32::NAN, 7.0, 8.0, 8.0, 10.0]); let array2 = Float32Array::from(vec![f32::NAN, f32::NAN, 8.0, 8.0, 10.0]); - let expected = BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(true), Some(false), Some(true), Some(true), Some(true)]); assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); assert_eq!(eq(&array1, &array2).unwrap(), expected); @@ -3649,9 +3439,8 @@ mod tests { let array1 = Float64Array::from(vec![f64::NAN, 7.0, 8.0, 8.0, 10.0]); let array2 = Float64Array::from(vec![f64::NAN, f64::NAN, 8.0, 8.0, 10.0]); - let expected = BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(true), Some(false), Some(true), Some(true), Some(true)]); assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); assert_eq!(eq(&array1, &array2).unwrap(), expected); @@ -3787,9 +3576,8 @@ mod tests { ); assert_eq!(eq_dyn_scalar(&array, f32::NAN).unwrap(), expected); - let expected = BooleanArray::from( - vec![Some(false), Some(true), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(false), Some(true), Some(true), Some(true), Some(true)]); assert_eq!(neq_dyn_scalar(&array, f32::NAN).unwrap(), expected); let array = Float32Array::from(vec![f32::NAN, 7.0, 8.0, 8.0, 10.0]); @@ -3798,9 +3586,8 @@ mod tests { ); assert_eq!(eq_dyn_scalar(&array, f32::NAN).unwrap(), expected); - let expected = BooleanArray::from( - vec![Some(false), Some(true), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(false), Some(true), Some(true), Some(true), Some(true)]); assert_eq!(neq_dyn_scalar(&array, f32::NAN).unwrap(), expected); let array = Float64Array::from(vec![f64::NAN, 7.0, 8.0, 8.0, 10.0]); @@ -3809,9 +3596,8 @@ mod tests { ); assert_eq!(eq_dyn_scalar(&array, f64::NAN).unwrap(), expected); - let expected = BooleanArray::from( - vec![Some(false), Some(true), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(false), Some(true), Some(true), Some(true), Some(true)]); assert_eq!(neq_dyn_scalar(&array, f64::NAN).unwrap(), expected); } @@ -3819,37 +3605,31 @@ mod tests { fn test_lt_dyn_scalar_lt_eq_dyn_scalar_float_nan() { let array = Float16Array::from(vec![f16::NAN, f16::from_f32(7.0), f16::from_f32(8.0), f16::from_f32(8.0), f16::from_f32(10.0)]); - let expected = BooleanArray::from( - vec![Some(false), Some(true), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(false), Some(true), Some(true), Some(true), Some(true)]); assert_eq!(lt_dyn_scalar(&array, f16::NAN).unwrap(), expected); - let expected = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(true), Some(true), Some(true), Some(true), Some(true)]); assert_eq!(lt_eq_dyn_scalar(&array, f16::NAN).unwrap(), expected); let array = Float32Array::from(vec![f32::NAN, 7.0, 8.0, 8.0, 10.0]); - let expected = BooleanArray::from( - vec![Some(false), Some(true), Some(true), Some(true), Some(true)], - ); + let 
expected = + BooleanArray::from(vec![Some(false), Some(true), Some(true), Some(true), Some(true)]); assert_eq!(lt_dyn_scalar(&array, f32::NAN).unwrap(), expected); - let expected = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(true), Some(true), Some(true), Some(true), Some(true)]); assert_eq!(lt_eq_dyn_scalar(&array, f32::NAN).unwrap(), expected); let array = Float64Array::from(vec![f64::NAN, 7.0, 8.0, 8.0, 10.0]); - let expected = BooleanArray::from( - vec![Some(false), Some(true), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(false), Some(true), Some(true), Some(true), Some(true)]); assert_eq!(lt_dyn_scalar(&array, f64::NAN).unwrap(), expected); - let expected = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(true), Some(true), Some(true), Some(true), Some(true)]); assert_eq!(lt_eq_dyn_scalar(&array, f64::NAN).unwrap(), expected); } @@ -4109,14 +3889,12 @@ mod tests { .into_iter() .map(Some) .collect(); - let values = - Float16Array::from(vec![f16::NAN, f16::from_f32(8.0), f16::from_f32(10.0)]); + let values = Float16Array::from(vec![f16::NAN, f16::from_f32(8.0), f16::from_f32(10.0)]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 1, 2]); let array2 = DictionaryArray::new(keys, Arc::new(values)); - let expected = BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(true), Some(false), Some(true), Some(true), Some(true)]); assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( @@ -4132,9 +3910,8 @@ mod tests { let keys = Int8Array::from_iter_values([0_i8, 0, 1, 1, 2]); let array2 = DictionaryArray::new(keys, Arc::new(values)); - let expected = BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(true), Some(false), Some(true), Some(true), Some(true)]); assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( @@ -4150,9 +3927,8 @@ mod tests { let keys = Int8Array::from_iter_values([0_i8, 0, 1, 1, 2]); let array2 = DictionaryArray::new(keys, Arc::new(values)); - let expected = BooleanArray::from( - vec![Some(true), Some(false), Some(true), Some(true), Some(true)], - ); + let expected = + BooleanArray::from(vec![Some(true), Some(false), Some(true), Some(true), Some(true)]); assert_eq!(eq_dyn(&array1, &array2).unwrap(), expected); let expected = BooleanArray::from( @@ -4409,8 +4185,7 @@ mod tests { #[test] fn test_cmp_dict_non_dict_decimal128() { - let array1: Decimal128Array = - Decimal128Array::from_iter_values([1, 2, 5, 4, 3, 0]); + let array1: Decimal128Array = Decimal128Array::from_iter_values([1, 2, 5, 4, 3, 0]); let values = Decimal128Array::from_iter_values([7, -3, 4, 3, 5]); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); @@ -4444,15 +4219,13 @@ mod tests { #[test] fn test_cmp_dict_decimal256() { - let values = Decimal256Array::from_iter_values( - [0, 1, 2, 3, 4, 5].into_iter().map(i256::from_i128), - ); + let values = + Decimal256Array::from_iter_values([0, 1, 2, 3, 4, 5].into_iter().map(i256::from_i128)); let keys = Int8Array::from_iter_values([1_i8, 2, 5, 4, 3, 0]); let array1 = DictionaryArray::new(keys, Arc::new(values)); - let values = Decimal256Array::from_iter_values( - [7, -3, 4, 
3, 5].into_iter().map(i256::from_i128), - ); + let values = + Decimal256Array::from_iter_values([7, -3, 4, 3, 5].into_iter().map(i256::from_i128)); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); let array2 = DictionaryArray::new(keys, Arc::new(values)); @@ -4484,13 +4257,11 @@ mod tests { #[test] fn test_cmp_dict_non_dict_decimal256() { - let array1: Decimal256Array = Decimal256Array::from_iter_values( - [1, 2, 5, 4, 3, 0].into_iter().map(i256::from_i128), - ); + let array1: Decimal256Array = + Decimal256Array::from_iter_values([1, 2, 5, 4, 3, 0].into_iter().map(i256::from_i128)); - let values = Decimal256Array::from_iter_values( - [7, -3, 4, 3, 5].into_iter().map(i256::from_i128), - ); + let values = + Decimal256Array::from_iter_values([7, -3, 4, 3, 5].into_iter().map(i256::from_i128)); let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]); let array2 = DictionaryArray::new(keys, Arc::new(values)); @@ -4562,9 +4333,7 @@ mod tests { #[test] fn test_decimal128_scalar() { - let a = Decimal128Array::from( - vec![Some(1), Some(2), Some(3), None, Some(4), Some(5)], - ); + let a = Decimal128Array::from(vec![Some(1), Some(2), Some(3), None, Some(4), Some(5)]); let b = 3_i128; // array eq scalar let e = BooleanArray::from( @@ -4623,12 +4392,8 @@ mod tests { #[test] fn test_decimal256() { - let a = Decimal256Array::from_iter_values( - [1, 2, 4, 5].into_iter().map(i256::from_i128), - ); - let b = Decimal256Array::from_iter_values( - [7, -3, 4, 3].into_iter().map(i256::from_i128), - ); + let a = Decimal256Array::from_iter_values([1, 2, 4, 5].into_iter().map(i256::from_i128)); + let b = Decimal256Array::from_iter_values([7, -3, 4, 3].into_iter().map(i256::from_i128)); let e = BooleanArray::from(vec![false, false, true, false]); let r = eq(&a, &b).unwrap(); assert_eq!(e, r); @@ -4667,9 +4432,7 @@ mod tests { #[test] fn test_decimal256_scalar_i128() { - let a = Decimal256Array::from_iter_values( - [1, 2, 3, 4, 5].into_iter().map(i256::from_i128), - ); + let a = Decimal256Array::from_iter_values([1, 2, 3, 4, 5].into_iter().map(i256::from_i128)); let b = i256::from_i128(3); // array eq scalar let e = BooleanArray::from( @@ -4681,45 +4444,40 @@ mod tests { assert_eq!(e, r); // array neq scalar - let e = BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(true), Some(true)], - ); + let e = + BooleanArray::from(vec![Some(true), Some(true), Some(false), Some(true), Some(true)]); let r = neq_scalar(&a, b).unwrap(); assert_eq!(e, r); let r = neq_dyn_scalar(&a, b).unwrap(); assert_eq!(e, r); // array lt scalar - let e = BooleanArray::from( - vec![Some(true), Some(true), Some(false), Some(false), Some(false)], - ); + let e = + BooleanArray::from(vec![Some(true), Some(true), Some(false), Some(false), Some(false)]); let r = lt_scalar(&a, b).unwrap(); assert_eq!(e, r); let r = lt_dyn_scalar(&a, b).unwrap(); assert_eq!(e, r); // array lt_eq scalar - let e = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(false), Some(false)], - ); + let e = + BooleanArray::from(vec![Some(true), Some(true), Some(true), Some(false), Some(false)]); let r = lt_eq_scalar(&a, b).unwrap(); assert_eq!(e, r); let r = lt_eq_dyn_scalar(&a, b).unwrap(); assert_eq!(e, r); // array gt scalar - let e = BooleanArray::from( - vec![Some(false), Some(false), Some(false), Some(true), Some(true)], - ); + let e = + BooleanArray::from(vec![Some(false), Some(false), Some(false), Some(true), Some(true)]); let r = gt_scalar(&a, b).unwrap(); assert_eq!(e, r); let r = gt_dyn_scalar(&a, b).unwrap(); 
assert_eq!(e, r); // array gt_eq scalar - let e = BooleanArray::from( - vec![Some(false), Some(false), Some(true), Some(true), Some(true)], - ); + let e = + BooleanArray::from(vec![Some(false), Some(false), Some(true), Some(true), Some(true)]); let r = gt_eq_scalar(&a, b).unwrap(); assert_eq!(e, r); let r = gt_eq_dyn_scalar(&a, b).unwrap(); @@ -4728,9 +4486,7 @@ mod tests { #[test] fn test_decimal256_scalar_i256() { - let a = Decimal256Array::from_iter_values( - [1, 2, 3, 4, 5].into_iter().map(i256::from_i128), - ); + let a = Decimal256Array::from_iter_values([1, 2, 3, 4, 5].into_iter().map(i256::from_i128)); let b = i256::MAX; // array eq scalar let e = BooleanArray::from( @@ -4742,27 +4498,24 @@ mod tests { assert!(r); // array neq scalar - let e = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(true)], - ); + let e = + BooleanArray::from(vec![Some(true), Some(true), Some(true), Some(true), Some(true)]); let r = neq_scalar(&a, b).unwrap(); assert_eq!(e, r); let r = neq_dyn_scalar(&a, b).is_err(); assert!(r); // array lt scalar - let e = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(true)], - ); + let e = + BooleanArray::from(vec![Some(true), Some(true), Some(true), Some(true), Some(true)]); let r = lt_scalar(&a, b).unwrap(); assert_eq!(e, r); let r = lt_dyn_scalar(&a, b).is_err(); assert!(r); // array lt_eq scalar - let e = BooleanArray::from( - vec![Some(true), Some(true), Some(true), Some(true), Some(true)], - ); + let e = + BooleanArray::from(vec![Some(true), Some(true), Some(true), Some(true), Some(true)]); let r = lt_eq_scalar(&a, b).unwrap(); assert_eq!(e, r); let r = lt_eq_dyn_scalar(&a, b).is_err(); diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index 4d6e3bde9152..28ca07cce260 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -27,10 +27,7 @@ use std::cmp::Ordering; /// Compare the values at two arbitrary indices in two arrays. pub type DynComparator = Box Ordering + Send + Sync>; -fn compare_primitive( - left: &dyn Array, - right: &dyn Array, -) -> DynComparator +fn compare_primitive(left: &dyn Array, right: &dyn Array) -> DynComparator where T::Native: ArrowNativeTypeOp, { @@ -94,10 +91,7 @@ fn compare_dict( /// ``` // This is a factory of comparisons. // The lifetime 'a enforces that we cannot use the closure beyond any of the array's lifetime. -pub fn build_compare( - left: &dyn Array, - right: &dyn Array, -) -> Result { +pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result { use arrow_schema::DataType::*; macro_rules! 
primitive_helper { ($t:ty, $left:expr, $right:expr) => { diff --git a/arrow-ord/src/partition.rs b/arrow-ord/src/partition.rs index 80b25ee2afba..12ab8dba04f7 100644 --- a/arrow-ord/src/partition.rs +++ b/arrow-ord/src/partition.rs @@ -300,9 +300,7 @@ mod tests { Arc::new(Int64Array::new(vec![1; 9].into(), None)) as _, Arc::new(Int64Array::new( vec![1, 1, 2, 2, 2, 3, 3, 3, 3].into(), - Some( - vec![false, true, true, true, true, false, false, true, false].into(), - ), + Some(vec![false, true, true, true, true, false, false, true, false].into()), )) as _, Arc::new(Int64Array::new( vec![1, 1, 2, 2, 2, 2, 2, 3, 7].into(), diff --git a/arrow-ord/src/rank.rs b/arrow-ord/src/rank.rs index 1e79156a71a3..51b0b5b91ba9 100644 --- a/arrow-ord/src/rank.rs +++ b/arrow-ord/src/rank.rs @@ -34,10 +34,7 @@ use std::cmp::Ordering; /// let ranks = rank(&array, None).unwrap(); /// assert_eq!(ranks, &[5, 2, 5, 2, 3]); /// ``` -pub fn rank( - array: &dyn Array, - options: Option, -) -> Result, ArrowError> { +pub fn rank(array: &dyn Array, options: Option) -> Result, ArrowError> { let options = options.unwrap_or_default(); let ranks = downcast_primitive_array! { array => primitive_rank(array.values(), array.nulls(), options), @@ -68,10 +65,7 @@ fn primitive_rank( } #[inline(never)] -fn bytes_rank( - array: &GenericByteArray, - options: SortOptions, -) -> Vec { +fn bytes_rank(array: &GenericByteArray, options: SortOptions) -> Vec { let to_sort: Vec<(&[u8], u32)> = match array.nulls().filter(|n| n.null_count() > 0) { Some(n) => n .valid_indices() diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index a477d6c261b3..92b20c4ad08c 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -54,10 +54,7 @@ pub use arrow_schema::SortOptions; /// let sorted_array = sort(&array, None).unwrap(); /// assert_eq!(sorted_array.as_ref(), &Int32Array::from(vec![1, 2, 3, 4, 5])); /// ``` -pub fn sort( - values: &dyn Array, - options: Option, -) -> Result { +pub fn sort(values: &dyn Array, options: Option) -> Result { downcast_primitive_array!( values => sort_native_type(values, options), DataType::RunEndEncoded(_, _) => sort_run(values, options, None), @@ -453,8 +450,7 @@ fn sort_run_downcasted( new_run_ends_builder.append(R::Native::from_usize(new_run_end).unwrap()); }; - let (values_indices, run_values) = - sort_run_inner(run_array, options, output_len, consume_runs); + let (values_indices, run_values) = sort_run_inner(run_array, options, output_len, consume_runs); let new_run_ends = unsafe { // Safety: @@ -556,8 +552,7 @@ where // and len, both of which are within bounds of run_array if physical_index == start_physical_index { ( - run_ends.get_unchecked(physical_index).as_usize() - - run_array.offset(), + run_ends.get_unchecked(physical_index).as_usize() - run_array.offset(), 0, ) } else if physical_index == end_physical_index { @@ -646,10 +641,7 @@ pub struct SortColumn { /// Note: for multi-column sorts without a limit, using the [row format](https://docs.rs/arrow-row/latest/arrow_row/) /// may be significantly faster /// -pub fn lexsort( - columns: &[SortColumn], - limit: Option, -) -> Result, ArrowError> { +pub fn lexsort(columns: &[SortColumn], limit: Option) -> Result, ArrowError> { let indices = lexsort_to_indices(columns, limit)?; columns .iter() @@ -772,9 +764,7 @@ impl LexicographicalComparator { /// Create a new lex comparator that will wrap the given sort columns and give comparison /// results with two indices. 
- pub fn try_new( - columns: &[SortColumn], - ) -> Result { + pub fn try_new(columns: &[SortColumn]) -> Result { let compare_items = columns .iter() .map(|column| { @@ -826,8 +816,7 @@ mod tests { ) { let output = create_decimal128_array(data); let expected = UInt32Array::from(expected_data); - let output = - sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); + let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); assert_eq!(output, expected) } @@ -839,8 +828,7 @@ mod tests { ) { let output = create_decimal256_array(data); let expected = UInt32Array::from(expected_data); - let output = - sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); + let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); assert_eq!(output, expected) } @@ -853,9 +841,7 @@ mod tests { let output = create_decimal128_array(data); let expected = Arc::new(create_decimal128_array(expected_data)) as ArrayRef; let output = match limit { - Some(_) => { - sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap() - } + Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), }; assert_eq!(&output, &expected) @@ -870,9 +856,7 @@ mod tests { let output = create_decimal256_array(data); let expected = Arc::new(create_decimal256_array(expected_data)) as ArrayRef; let output = match limit { - Some(_) => { - sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap() - } + Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), }; assert_eq!(&output, &expected) @@ -886,8 +870,7 @@ mod tests { ) { let output = BooleanArray::from(data); let expected = UInt32Array::from(expected_data); - let output = - sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); + let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); assert_eq!(output, expected) } @@ -902,8 +885,7 @@ mod tests { { let output = PrimitiveArray::::from(data); let expected = UInt32Array::from(expected_data); - let output = - sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); + let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); assert_eq!(output, expected) } @@ -919,9 +901,7 @@ mod tests { let output = PrimitiveArray::::from(data); let expected = Arc::new(PrimitiveArray::::from(expected_data)) as ArrayRef; let output = match limit { - Some(_) => { - sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap() - } + Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), }; assert_eq!(&output, &expected) @@ -935,8 +915,7 @@ mod tests { ) { let output = StringArray::from(data); let expected = UInt32Array::from(expected_data); - let output = - sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); + let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); assert_eq!(output, expected) } @@ -950,9 +929,7 @@ mod tests { let output = StringArray::from(data.clone()); let expected = Arc::new(StringArray::from(expected_data.clone())) as ArrayRef; let output = match limit { - Some(_) => { - sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap() - } + Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, 
limit).unwrap(), _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), }; assert_eq!(&output, &expected); @@ -960,9 +937,7 @@ mod tests { let output = LargeStringArray::from(data); let expected = Arc::new(LargeStringArray::from(expected_data)) as ArrayRef; let output = match limit { - Some(_) => { - sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap() - } + Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(), _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), }; assert_eq!(&output, &expected) @@ -982,9 +957,7 @@ mod tests { .expect("Unable to get dictionary values"); let sorted = match limit { - Some(_) => { - sort_limit(&(Arc::new(array) as ArrayRef), options, limit).unwrap() - } + Some(_) => sort_limit(&(Arc::new(array) as ArrayRef), options, limit).unwrap(), _ => sort(&(Arc::new(array) as ArrayRef), options).unwrap(), }; let sorted = sorted @@ -1012,8 +985,7 @@ mod tests { .collect::>>(), ) .expect("Unable to create string array from dictionary"); - let expected = - StringArray::try_from(expected_data).expect("Unable to create string array"); + let expected = StringArray::try_from(expected_data).expect("Unable to create string array"); assert_eq!(sorted_strings, expected) } @@ -1032,9 +1004,7 @@ mod tests { let dict = array_values.as_primitive::(); let sorted = match limit { - Some(_) => { - sort_limit(&(Arc::new(array) as ArrayRef), options, limit).unwrap() - } + Some(_) => sort_limit(&(Arc::new(array) as ArrayRef), options, limit).unwrap(), _ => sort(&(Arc::new(array) as ArrayRef), options).unwrap(), }; let sorted = sorted @@ -1062,8 +1032,7 @@ mod tests { }) .collect::>>(), ); - let expected: PrimitiveArray = - From::>>::from(expected_data); + let expected: PrimitiveArray = From::>>::from(expected_data); assert_eq!(sorted_values, expected) } @@ -1134,11 +1103,7 @@ mod tests { } /// slice all arrays in expected_output to offset/length - fn slice_arrays( - expected_output: Vec, - offset: usize, - length: usize, - ) -> Vec { + fn slice_arrays(expected_output: Vec, offset: usize, length: usize) -> Vec { expected_output .into_iter() .map(|array| array.slice(offset, length)) @@ -1155,11 +1120,8 @@ mod tests { // Fixed size binary array if let Some(length) = fixed_length { let input = Arc::new( - FixedSizeBinaryArray::try_from_sparse_iter_with_size( - data.iter().cloned(), - length, - ) - .unwrap(), + FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.iter().cloned(), length) + .unwrap(), ); let sorted = match limit { Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), @@ -1668,12 +1630,7 @@ mod tests { ]; // decimal default - test_sort_to_indices_decimal256_array( - data.clone(), - None, - None, - vec![0, 6, 4, 2, 3, 5, 1], - ); + test_sort_to_indices_decimal256_array(data.clone(), None, None, vec![0, 6, 4, 2, 3, 5, 1]); // decimal descending test_sort_to_indices_decimal256_array( data.clone(), @@ -2665,9 +2622,7 @@ mod tests { #[test] fn test_sort_run_to_run() { - test_sort_run_inner(|array, sort_options, limit| { - sort_run(array, sort_options, limit) - }); + test_sort_run_inner(|array, sort_options, limit| sort_run(array, sort_options, limit)); } #[test] @@ -2680,16 +2635,11 @@ mod tests { fn test_sort_run_inner(sort_fn: F) where - F: Fn( - &dyn Array, - Option, - Option, - ) -> Result, + F: Fn(&dyn Array, Option, Option) -> Result, { // Create an input array for testing let total_len = 80; - let vals: Vec> = - vec![Some(1), None, Some(2), Some(3), Some(4), None, Some(5)]; + let vals: Vec> = 
vec![Some(1), None, Some(2), Some(3), Some(4), None, Some(5)]; let repeats: Vec = vec![1, 3, 2, 4]; let mut input_array: Vec> = Vec::with_capacity(total_len); for ix in 0_usize..32 { @@ -2756,11 +2706,7 @@ mod tests { limit: Option, sort_fn: &F, ) where - F: Fn( - &dyn Array, - Option, - Option, - ) -> Result, + F: Fn(&dyn Array, Option, Option) -> Result, { // Run the sort and build actual result let sliced_array = run_array.slice(offset, length); @@ -3649,11 +3595,7 @@ mod tests { ])) as ArrayRef, ]; test_lex_sort_arrays(input.clone(), expected.clone(), None); - test_lex_sort_arrays( - input.clone(), - slice_arrays(expected.clone(), 0, 5), - Some(5), - ); + test_lex_sort_arrays(input.clone(), slice_arrays(expected.clone(), 0, 5), Some(5)); // Limiting by more rows than present is ok test_lex_sort_arrays(input, slice_arrays(expected, 0, 5), Some(10)); @@ -3688,8 +3630,7 @@ mod tests { #[test] fn test_sort_int8_dicts() { - let keys = - Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); let values = Int8Array::from(vec![1, 3, 5]); test_sort_primitive_dict_arrays::( keys, @@ -3699,8 +3640,7 @@ mod tests { vec![None, None, Some(1), Some(3), Some(5), Some(5)], ); - let keys = - Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); let values = Int8Array::from(vec![1, 3, 5]); test_sort_primitive_dict_arrays::( keys, @@ -3713,8 +3653,7 @@ mod tests { vec![Some(5), Some(5), Some(3), Some(1), None, None], ); - let keys = - Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); let values = Int8Array::from(vec![1, 3, 5]); test_sort_primitive_dict_arrays::( keys, @@ -3727,8 +3666,7 @@ mod tests { vec![Some(1), Some(3), Some(5), Some(5), None, None], ); - let keys = - Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); let values = Int8Array::from(vec![1, 3, 5]); test_sort_primitive_dict_arrays::( keys, @@ -3826,8 +3764,7 @@ mod tests { #[test] fn test_sort_f32_dicts() { - let keys = - Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); let values = Float32Array::from(vec![1.2, 3.0, 5.1]); test_sort_primitive_dict_arrays::( keys, @@ -3837,8 +3774,7 @@ mod tests { vec![None, None, Some(1.2), Some(3.0), Some(5.1), Some(5.1)], ); - let keys = - Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); let values = Float32Array::from(vec![1.2, 3.0, 5.1]); test_sort_primitive_dict_arrays::( keys, @@ -3851,8 +3787,7 @@ mod tests { vec![Some(5.1), Some(5.1), Some(3.0), Some(1.2), None, None], ); - let keys = - Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let keys = Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); let values = Float32Array::from(vec![1.2, 3.0, 5.1]); test_sort_primitive_dict_arrays::( keys, @@ -3865,8 +3800,7 @@ mod tests { vec![Some(1.2), Some(3.0), Some(5.1), Some(5.1), None, None], ); - let keys = - Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); + let keys = 
Int8Array::from(vec![Some(1_i8), None, Some(2), None, Some(2), Some(0)]); let values = Float32Array::from(vec![1.2, 3.0, 5.1]); test_sort_primitive_dict_arrays::( keys, diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 1fb4e1de7ac2..86a76c0a74f7 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -388,10 +388,8 @@ impl Codec { fn new(sort_field: &SortField) -> Result { match &sort_field.data_type { DataType::Dictionary(_, values) => { - let sort_field = SortField::new_with_options( - values.as_ref().clone(), - sort_field.options, - ); + let sort_field = + SortField::new_with_options(values.as_ref().clone(), sort_field.options); let converter = RowConverter::new(vec![sort_field])?; let null_array = new_null_array(values.as_ref(), 1); @@ -410,8 +408,7 @@ impl Codec { // it set to true let options = SortOptions { descending: false, - nulls_first: sort_field.options.nulls_first - != sort_field.options.descending, + nulls_first: sort_field.options.nulls_first != sort_field.options.descending, }; let field = SortField::new_with_options(f.data_type().clone(), options); @@ -421,17 +418,11 @@ impl Codec { DataType::Struct(f) => { let sort_fields = f .iter() - .map(|x| { - SortField::new_with_options( - x.data_type().clone(), - sort_field.options, - ) - }) + .map(|x| SortField::new_with_options(x.data_type().clone(), sort_field.options)) .collect(); let converter = RowConverter::new(sort_fields)?; - let nulls: Vec<_> = - f.iter().map(|x| new_null_array(x.data_type(), 1)).collect(); + let nulls: Vec<_> = f.iter().map(|x| new_null_array(x.data_type(), 1)).collect(); let nulls = converter.convert_columns(&nulls)?; let owned = OwnedRow { @@ -523,8 +514,7 @@ impl SortField { /// /// Includes the size of `Self`. pub fn size(&self) -> usize { - self.data_type.size() + std::mem::size_of::() - - std::mem::size_of::() + self.data_type.size() + std::mem::size_of::() - std::mem::size_of::() } } @@ -555,9 +545,7 @@ impl RowConverter { DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => { Self::supports_datatype(f.data_type()) } - DataType::Struct(f) => { - f.iter().all(|x| Self::supports_datatype(x.data_type())) - } + DataType::Struct(f) => f.iter().all(|x| Self::supports_datatype(x.data_type())), _ => false, } } @@ -606,11 +594,7 @@ impl RowConverter { /// let values: Vec<_> = back[0].as_string::().iter().map(Option::unwrap).collect(); /// assert_eq!(&values, &["hello", "world", "a", "a", "hello"]); /// ``` - pub fn append( - &self, - rows: &mut Rows, - columns: &[ArrayRef], - ) -> Result<(), ArrowError> { + pub fn append(&self, rows: &mut Rows, columns: &[ArrayRef]) -> Result<(), ArrowError> { assert!( Arc::ptr_eq(&rows.config.fields, &self.fields), "rows were not produced by this RowConverter" @@ -670,9 +654,7 @@ impl RowConverter { // encoders not assuming a zero-initialized buffer rows.buffer.resize(cur_offset, 0); - for ((column, field), encoder) in - columns.iter().zip(self.fields.iter()).zip(encoders) - { + for ((column, field), encoder) in columns.iter().zip(self.fields.iter()).zip(encoders) { // We encode a column at a time to minimise dispatch overheads encode_column( &mut rows.buffer, @@ -1225,9 +1207,7 @@ fn encode_column( }) } Encoder::List(rows) => match column.data_type() { - DataType::List(_) => { - list::encode(data, offsets, rows, opts, as_list_array(column)) - } + DataType::List(_) => list::encode(data, offsets, rows, opts, as_list_array(column)), DataType::LargeList(_) => { list::encode(data, offsets, rows, opts, as_large_list_array(column)) } @@ 
-1466,11 +1446,9 @@ mod tests { #[test] fn test_bool() { - let converter = - RowConverter::new(vec![SortField::new(DataType::Boolean)]).unwrap(); + let converter = RowConverter::new(vec![SortField::new(DataType::Boolean)]).unwrap(); - let col = Arc::new(BooleanArray::from_iter([None, Some(false), Some(true)])) - as ArrayRef; + let col = Arc::new(BooleanArray::from_iter([None, Some(false), Some(true)])) as ArrayRef; let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); assert!(rows.row(2) > rows.row(1)); @@ -1499,20 +1477,18 @@ mod tests { #[test] fn test_timezone() { - let a = TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]) - .with_timezone("+01:00".to_string()); + let a = + TimestampNanosecondArray::from(vec![1, 2, 3, 4, 5]).with_timezone("+01:00".to_string()); let d = a.data_type().clone(); - let converter = - RowConverter::new(vec![SortField::new(a.data_type().clone())]).unwrap(); + let converter = RowConverter::new(vec![SortField::new(a.data_type().clone())]).unwrap(); let rows = converter.convert_columns(&[Arc::new(a) as _]).unwrap(); let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); assert_eq!(back[0].data_type(), &d); // Test dictionary - let mut a = - PrimitiveDictionaryBuilder::::new(); + let mut a = PrimitiveDictionaryBuilder::::new(); a.append(34).unwrap(); a.append_null(); a.append(345).unwrap(); @@ -1584,8 +1560,7 @@ mod tests { Some(vec![0xFF_u8; variable::BLOCK_SIZE + 1]), ])) as ArrayRef; - let converter = - RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap(); + let converter = RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap(); let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap(); for i in 0..rows.num_rows() { @@ -1794,14 +1769,11 @@ mod tests { #[test] fn test_dictionary_nulls() { - let values = - Int32Array::from_iter([Some(1), Some(-1), None, Some(4), None]).into_data(); + let values = Int32Array::from_iter([Some(1), Some(-1), None, Some(4), None]).into_data(); let keys = - Int32Array::from_iter([Some(0), Some(0), Some(1), Some(2), Some(4), None]) - .into_data(); + Int32Array::from_iter([Some(0), Some(0), Some(1), Some(2), Some(4), None]).into_data(); - let data_type = - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Int32)); + let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Int32)); let data = keys .into_builder() .data_type(data_type.clone()) @@ -1823,8 +1795,7 @@ mod tests { #[test] #[should_panic(expected = "Encountered non UTF-8 data")] fn test_invalid_utf8() { - let converter = - RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap(); + let converter = RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap(); let array = Arc::new(BinaryArray::from_iter_values([&[0xFF]])) as _; let rows = converter.convert_columns(&[array]).unwrap(); let binary_row = rows.row(0); @@ -1948,9 +1919,8 @@ mod tests { } fn test_nested_list() { - let mut builder = GenericListBuilder::::new( - GenericListBuilder::::new(Int32Builder::new()), - ); + let mut builder = + GenericListBuilder::::new(GenericListBuilder::::new(Int32Builder::new())); builder.values().values().append_value(1); builder.values().values().append_value(2); @@ -2106,10 +2076,8 @@ mod tests { }) .collect(); - let data_type = DataType::Dictionary( - Box::new(K::DATA_TYPE), - Box::new(values.data_type().clone()), - ); + let data_type = + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone())); let data = keys .into_data() @@ 
-2122,10 +2090,7 @@ mod tests { DictionaryArray::from(data) } - fn generate_fixed_size_binary( - len: usize, - valid_percent: f64, - ) -> FixedSizeBinaryArray { + fn generate_fixed_size_binary(len: usize, valid_percent: f64) -> FixedSizeBinaryArray { let mut rng = thread_rng(); let width = rng.gen_range(0..20); let mut builder = FixedSizeBinaryBuilder::new(width); diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index 73c4b6fbfda5..511fb4ffb282 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -144,8 +144,7 @@ pub unsafe fn decode( let row = &canonical.value_data()[start..end]; let element_count_start = row.len() - 4; let element_count = - u32::from_be_bytes((&row[element_count_start..]).try_into().unwrap()) - as usize; + u32::from_be_bytes((&row[element_count_start..]).try_into().unwrap()) as usize; let lengths_start = element_count_start - (element_count * 4); let mut row_offset = 0; diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index 6c9c4c43bca3..4451c5287310 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -53,9 +53,7 @@ pub fn encoded_len(a: Option<&[u8]>) -> usize { #[inline] pub fn padded_length(a: Option) -> usize { match a { - Some(a) if a <= BLOCK_SIZE => { - 1 + ceil(a, MINI_BLOCK_SIZE) * (MINI_BLOCK_SIZE + 1) - } + Some(a) if a <= BLOCK_SIZE => 1 + ceil(a, MINI_BLOCK_SIZE) * (MINI_BLOCK_SIZE + 1), // Each miniblock ends with a 1 byte continuation, therefore add // `(MINI_BLOCK_COUNT - 1)` additional bytes over non-miniblock size Some(a) => MINI_BLOCK_COUNT + ceil(a, BLOCK_SIZE) * (BLOCK_SIZE + 1), diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 4f8c8a18bd17..b78c785ae279 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -345,13 +345,7 @@ impl DataType { use DataType::*; matches!( self, - Date32 - | Date64 - | Timestamp(_, _) - | Time32(_) - | Time64(_) - | Duration(_) - | Interval(_) + Date32 | Date64 | Timestamp(_, _) | Time32(_) | Time64(_) | Duration(_) | Interval(_) ) } @@ -397,12 +391,9 @@ impl DataType { use DataType::*; match self { Dictionary(_, v) => DataType::is_nested(v.as_ref()), - List(_) - | FixedSizeList(_, _) - | LargeList(_) - | Struct(_) - | Union(_, _) - | Map(_, _) => true, + List(_) | FixedSizeList(_, _) | LargeList(_) | Struct(_) | Union(_, _) | Map(_, _) => { + true + } _ => false, } } @@ -413,8 +404,7 @@ impl DataType { match (&self, other) { (DataType::List(a), DataType::List(b)) | (DataType::LargeList(a), DataType::LargeList(b)) => { - a.is_nullable() == b.is_nullable() - && a.data_type().equals_datatype(b.data_type()) + a.is_nullable() == b.is_nullable() && a.data_type().equals_datatype(b.data_type()) } (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => { a_size == b_size @@ -428,18 +418,14 @@ impl DataType { && a.data_type().equals_datatype(b.data_type()) }) } - ( - DataType::Map(a_field, a_is_sorted), - DataType::Map(b_field, b_is_sorted), - ) => { + (DataType::Map(a_field, a_is_sorted), DataType::Map(b_field, b_is_sorted)) => { a_field.is_nullable() == b_field.is_nullable() && a_field.data_type().equals_datatype(b_field.data_type()) && a_is_sorted == b_is_sorted } - ( - DataType::Dictionary(a_key, a_value), - DataType::Dictionary(b_key, b_value), - ) => a_key.equals_datatype(b_key) && a_value.equals_datatype(b_value), + (DataType::Dictionary(a_key, a_value), DataType::Dictionary(b_key, b_value)) => { + a_key.equals_datatype(b_key) && a_value.equals_datatype(b_value) + } ( 
DataType::RunEndEncoded(a_run_ends, a_values), DataType::RunEndEncoded(b_run_ends, b_values), @@ -534,9 +520,7 @@ impl DataType { | DataType::LargeUtf8 | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => 0, - DataType::Timestamp(_, s) => { - s.as_ref().map(|s| s.len()).unwrap_or_default() - } + DataType::Timestamp(_, s) => s.as_ref().map(|s| s.len()).unwrap_or_default(), DataType::List(field) | DataType::FixedSizeList(field, _) | DataType::LargeList(field) @@ -617,8 +601,8 @@ mod tests { Field::new("first_name", DataType::Utf8, false).with_metadata(field_metadata); // Empty map: should be omitted. - let last_name = Field::new("last_name", DataType::Utf8, false) - .with_metadata(HashMap::default()); + let last_name = + Field::new("last_name", DataType::Utf8, false).with_metadata(HashMap::default()); let person = DataType::Struct(Fields::from(vec![ first_name, @@ -666,14 +650,10 @@ mod tests { assert!(!list_b.equals_datatype(&list_c)); assert!(!list_a.equals_datatype(&list_d)); - let list_e = DataType::FixedSizeList( - Arc::new(Field::new("item", list_a.clone(), false)), - 3, - ); - let list_f = DataType::FixedSizeList( - Arc::new(Field::new("array", list_b.clone(), false)), - 3, - ); + let list_e = + DataType::FixedSizeList(Arc::new(Field::new("item", list_a.clone(), false)), 3); + let list_f = + DataType::FixedSizeList(Arc::new(Field::new("array", list_b.clone(), false)), 3); let list_g = DataType::FixedSizeList( Arc::new(Field::new("item", DataType::FixedSizeBinary(3), true)), 3, @@ -683,10 +663,8 @@ mod tests { assert!(!list_f.equals_datatype(&list_g)); let list_h = DataType::Struct(Fields::from(vec![Field::new("f1", list_e, true)])); - let list_i = - DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), true)])); - let list_j = - DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), false)])); + let list_i = DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), true)])); + let list_j = DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), false)])); let list_k = DataType::Struct(Fields::from(vec![ Field::new("f1", list_f.clone(), false), Field::new("f2", list_g.clone(), false), @@ -707,16 +685,11 @@ mod tests { assert!(!list_k.equals_datatype(&list_l)); assert!(list_k.equals_datatype(&list_m)); - let list_n = - DataType::Map(Arc::new(Field::new("f1", list_a.clone(), true)), true); - let list_o = - DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), true); - let list_p = - DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), false); - let list_q = - DataType::Map(Arc::new(Field::new("f2", list_c.clone(), true)), true); - let list_r = - DataType::Map(Arc::new(Field::new("f1", list_a.clone(), false)), true); + let list_n = DataType::Map(Arc::new(Field::new("f1", list_a.clone(), true)), true); + let list_o = DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), true); + let list_p = DataType::Map(Arc::new(Field::new("f2", list_b.clone(), true)), false); + let list_q = DataType::Map(Arc::new(Field::new("f2", list_c.clone(), true)), true); + let list_r = DataType::Map(Arc::new(Field::new("f1", list_a.clone(), false)), true); assert!(list_n.equals_datatype(&list_o)); assert!(!list_n.equals_datatype(&list_p)); @@ -724,8 +697,7 @@ mod tests { assert!(!list_n.equals_datatype(&list_r)); let list_s = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_a)); - let list_t = - DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_b.clone())); + let list_t = 
DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_b.clone())); let list_u = DataType::Dictionary(Box::new(DataType::Int8), Box::new(list_b)); let list_v = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(list_c)); diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index a17dbe769f2e..7e33a78fec27 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -34,9 +34,7 @@ //! assert_eq!(schema, back); //! ``` -use crate::{ - ArrowError, DataType, Field, FieldRef, Schema, TimeUnit, UnionFields, UnionMode, -}; +use crate::{ArrowError, DataType, Field, FieldRef, Schema, TimeUnit, UnionFields, UnionMode}; use std::sync::Arc; use std::{ collections::HashMap, @@ -213,8 +211,7 @@ impl FFI_ArrowSchema { }; unsafe { - let mut private_data = - Box::from_raw(self.private_data as *mut SchemaPrivateData); + let mut private_data = Box::from_raw(self.private_data as *mut SchemaPrivateData); private_data.metadata = new_metadata; self.private_data = Box::into_raw(private_data) as *mut c_void; } @@ -318,9 +315,8 @@ impl FFI_ArrowSchema { )); } - let mut metadata = HashMap::with_capacity( - num_entries.try_into().expect("Too many metadata entries"), - ); + let mut metadata = + HashMap::with_capacity(num_entries.try_into().expect("Too many metadata entries")); for _ in 0..num_entries { let key_length = i32::from_ne_bytes(next_four_bytes(buffer, &mut pos)); @@ -329,18 +325,15 @@ impl FFI_ArrowSchema { "Negative key length in metadata".to_string(), )); } - let key = String::from_utf8( - next_n_bytes(buffer, &mut pos, key_length).to_vec(), - )?; + let key = String::from_utf8(next_n_bytes(buffer, &mut pos, key_length).to_vec())?; let value_length = i32::from_ne_bytes(next_four_bytes(buffer, &mut pos)); if value_length < 0 { return Err(ArrowError::CDataInterface( "Negative value length in metadata".to_string(), )); } - let value = String::from_utf8( - next_n_bytes(buffer, &mut pos, value_length).to_vec(), - )?; + let value = + String::from_utf8(next_n_bytes(buffer, &mut pos, value_length).to_vec())?; metadata.insert(key, value); } @@ -639,9 +632,7 @@ fn get_format_string(dtype: &DataType) -> Result { DataType::FixedSizeBinary(num_bytes) => Ok(format!("w:{num_bytes}")), DataType::FixedSizeList(_, num_elems) => Ok(format!("+w:{num_elems}")), DataType::Decimal128(precision, scale) => Ok(format!("d:{precision},{scale}")), - DataType::Decimal256(precision, scale) => { - Ok(format!("d:{precision},{scale},256")) - } + DataType::Decimal256(precision, scale) => Ok(format!("d:{precision},{scale},256")), DataType::Date32 => Ok("tdD".to_string()), DataType::Date64 => Ok("tdm".to_string()), DataType::Time32(TimeUnit::Second) => Ok("tts".to_string()), @@ -715,8 +706,7 @@ impl TryFrom<&Schema> for FFI_ArrowSchema { fn try_from(schema: &Schema) -> Result { let dtype = DataType::Struct(schema.fields().clone()); - let c_schema = - FFI_ArrowSchema::try_from(&dtype)?.with_metadata(&schema.metadata)?; + let c_schema = FFI_ArrowSchema::try_from(&dtype)?.with_metadata(&schema.metadata)?; Ok(c_schema) } } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index b50778c785fb..574c024bb9b9 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -173,11 +173,7 @@ impl Field { /// - `name`: the name of the [`DataType::Struct`] field /// - `fields`: the description of each struct element /// - `nullable`: if the [`DataType::Struct`] array is nullable - pub fn new_struct( - name: impl Into, - fields: impl Into, - nullable: bool, - ) -> Self { + pub fn new_struct(name: 
impl Into, fields: impl Into, nullable: bool) -> Self { Self::new(name, DataType::Struct(fields.into()), nullable) } @@ -186,11 +182,7 @@ impl Field { /// - `name`: the name of the [`DataType::List`] field /// - `value`: the description of each list element /// - `nullable`: if the [`DataType::List`] array is nullable - pub fn new_list( - name: impl Into, - value: impl Into, - nullable: bool, - ) -> Self { + pub fn new_list(name: impl Into, value: impl Into, nullable: bool) -> Self { Self::new(name, DataType::List(value.into()), nullable) } @@ -344,9 +336,7 @@ impl Field { fn _fields(dt: &DataType) -> Vec<&Field> { match dt { DataType::Struct(fields) => fields.iter().flat_map(|f| f.fields()).collect(), - DataType::Union(fields, _) => { - fields.iter().flat_map(|(_, f)| f.fields()).collect() - } + DataType::Union(fields, _) => fields.iter().flat_map(|(_, f)| f.fields()).collect(), DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) @@ -363,8 +353,7 @@ impl Field { self.fields() .into_iter() .filter(|&field| { - matches!(field.data_type(), DataType::Dictionary(_, _)) - && field.dict_id == id + matches!(field.data_type(), DataType::Dictionary(_, _)) && field.dict_id == id }) .collect() } @@ -857,8 +846,7 @@ mod test { #[cfg(feature = "serde")] #[test] fn test_field_with_empty_metadata_serde() { - let field = - Field::new("name", DataType::Boolean, false).with_metadata(HashMap::new()); + let field = Field::new("name", DataType::Boolean, false).with_metadata(HashMap::new()); assert_binary_serde_round_trip(field) } diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index a00e8a588757..8424ae87d5fa 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -168,10 +168,7 @@ impl Schema { /// let schema = Schema::new_with_metadata(vec![field_a, field_b], metadata); /// ``` #[inline] - pub fn new_with_metadata( - fields: impl Into, - metadata: HashMap, - ) -> Self { + pub fn new_with_metadata(fields: impl Into, metadata: HashMap) -> Self { Self { fields: fields.into(), metadata, @@ -230,9 +227,7 @@ impl Schema { /// ]), /// ); /// ``` - pub fn try_merge( - schemas: impl IntoIterator, - ) -> Result { + pub fn try_merge(schemas: impl IntoIterator) -> Result { let mut out_meta = HashMap::new(); let mut out_fields = SchemaBuilder::new(); for schema in schemas { @@ -323,9 +318,10 @@ impl Schema { pub fn contains(&self, other: &Schema) -> bool { // make sure self.metadata is a superset of other.metadata self.fields.contains(&other.fields) - && other.metadata.iter().all(|(k, v1)| { - self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default() - }) + && other + .metadata + .iter() + .all(|(k, v1)| self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default()) } } @@ -381,8 +377,8 @@ mod tests { assert_eq!(schema, de_schema); // ser/de with non-empty metadata - let schema = schema - .with_metadata([("key".to_owned(), "val".to_owned())].into_iter().collect()); + let schema = + schema.with_metadata([("key".to_owned(), "val".to_owned())].into_iter().collect()); let json = serde_json::to_string(&schema).unwrap(); let de_schema = serde_json::from_str(&json).unwrap(); @@ -636,18 +632,14 @@ mod tests { .collect(); let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata2); - assert!( - Schema::try_merge(vec![Schema::new(vec![f1]), Schema::new(vec![f2])]) - .is_err() - ); + assert!(Schema::try_merge(vec![Schema::new(vec![f1]), Schema::new(vec![f2])]).is_err()); // 2. 
None + Some let mut f1 = Field::new("first_name", DataType::Utf8, false); - let metadata2: HashMap = - [("missing".to_string(), "value".to_string())] - .iter() - .cloned() - .collect(); + let metadata2: HashMap = [("missing".to_string(), "value".to_string())] + .iter() + .cloned() + .collect(); let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(metadata2); assert!(f1.try_merge(&f2).is_ok()); @@ -714,9 +706,7 @@ mod tests { Field::new("last_name", DataType::Utf8, false), Field::new( "address", - DataType::Struct( - vec![Field::new("zip", DataType::UInt16, false)].into(), - ), + DataType::Struct(vec![Field::new("zip", DataType::UInt16, false)].into()), false, ), ]), diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index a6dcca24eace..04e3ab2f7424 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -121,8 +121,7 @@ pub fn concat(arrays: &[&dyn Array]) -> Result { let d = arrays[0].data_type(); if arrays.iter().skip(1).any(|array| array.data_type() != d) { return Err(ArrowError::InvalidArgumentError( - "It is not possible to concatenate arrays of different data types." - .to_string(), + "It is not possible to concatenate arrays of different data types.".to_string(), )); } @@ -144,10 +143,7 @@ pub fn concat(arrays: &[&dyn Array]) -> Result { /// Concatenates arrays using MutableArrayData /// /// This will naively concatenate dictionaries -fn concat_fallback( - arrays: &[&dyn Array], - capacity: Capacities, -) -> Result { +fn concat_fallback(arrays: &[&dyn Array], capacity: Capacities) -> Result { let array_data: Vec<_> = arrays.iter().map(|a| a.to_data()).collect::>(); let array_data = array_data.iter().collect(); let mut mutable = MutableArrayData::with_capacities(array_data, false, capacity); @@ -216,8 +212,7 @@ mod tests { let mut options = RecordBatchOptions::default(); options.row_count = Some(100); - let batch = - RecordBatch::try_new_with_options(schema.clone(), vec![], &options).unwrap(); + let batch = RecordBatch::try_new_with_options(schema.clone(), vec![], &options).unwrap(); // put in 2 batches of 100 rows each let re = concat_batches(&schema, &[batch.clone(), batch]).unwrap(); @@ -274,19 +269,8 @@ mod tests { #[test] fn test_concat_primitive_arrays() { let arr = concat(&[ - &PrimitiveArray::::from(vec![ - Some(-1), - Some(-1), - Some(2), - None, - None, - ]), - &PrimitiveArray::::from(vec![ - Some(101), - Some(102), - Some(103), - None, - ]), + &PrimitiveArray::::from(vec![Some(-1), Some(-1), Some(2), None, None]), + &PrimitiveArray::::from(vec![Some(101), Some(102), Some(103), None]), &PrimitiveArray::::from(vec![Some(256), Some(512), Some(1024)]), ]) .unwrap(); @@ -311,22 +295,13 @@ mod tests { #[test] fn test_concat_primitive_array_slices() { - let input_1 = PrimitiveArray::::from(vec![ - Some(-1), - Some(-1), - Some(2), - None, - None, - ]) - .slice(1, 3); + let input_1 = + PrimitiveArray::::from(vec![Some(-1), Some(-1), Some(2), None, None]) + .slice(1, 3); - let input_2 = PrimitiveArray::::from(vec![ - Some(101), - Some(102), - Some(103), - None, - ]) - .slice(1, 3); + let input_2 = + PrimitiveArray::::from(vec![Some(101), Some(102), Some(103), None]) + .slice(1, 3); let arr = concat(&[&input_1, &input_2]).unwrap(); let expected_output = Arc::new(PrimitiveArray::::from(vec![ @@ -380,20 +355,17 @@ mod tests { None, Some(vec![Some(10)]), ]; - let list1_array = - ListArray::from_iter_primitive::(list1.clone()); + let list1_array = ListArray::from_iter_primitive::(list1.clone()); let list2 = vec![ None, 
Some(vec![Some(100), None, Some(101)]), Some(vec![Some(102)]), ]; - let list2_array = - ListArray::from_iter_primitive::(list2.clone()); + let list2_array = ListArray::from_iter_primitive::(list2.clone()); let list3 = vec![Some(vec![Some(1000), Some(1001)])]; - let list3_array = - ListArray::from_iter_primitive::(list3.clone()); + let list3_array = ListArray::from_iter_primitive::(list3.clone()); let array_result = concat(&[&list1_array, &list2_array, &list3_array]).unwrap(); @@ -406,31 +378,28 @@ mod tests { #[test] fn test_concat_struct_arrays() { let field = Arc::new(Field::new("field", DataType::Int64, true)); - let input_primitive_1: ArrayRef = - Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(-1), - Some(2), - None, - None, - ])); + let input_primitive_1: ArrayRef = Arc::new(PrimitiveArray::::from(vec![ + Some(-1), + Some(-1), + Some(2), + None, + None, + ])); let input_struct_1 = StructArray::from(vec![(field.clone(), input_primitive_1)]); - let input_primitive_2: ArrayRef = - Arc::new(PrimitiveArray::::from(vec![ - Some(101), - Some(102), - Some(103), - None, - ])); + let input_primitive_2: ArrayRef = Arc::new(PrimitiveArray::::from(vec![ + Some(101), + Some(102), + Some(103), + None, + ])); let input_struct_2 = StructArray::from(vec![(field.clone(), input_primitive_2)]); - let input_primitive_3: ArrayRef = - Arc::new(PrimitiveArray::::from(vec![ - Some(256), - Some(512), - Some(1024), - ])); + let input_primitive_3: ArrayRef = Arc::new(PrimitiveArray::::from(vec![ + Some(256), + Some(512), + Some(1024), + ])); let input_struct_3 = StructArray::from(vec![(field, input_primitive_3)]); let arr = concat(&[&input_struct_1, &input_struct_2, &input_struct_3]).unwrap(); @@ -461,27 +430,24 @@ mod tests { #[test] fn test_concat_struct_array_slices() { let field = Arc::new(Field::new("field", DataType::Int64, true)); - let input_primitive_1: ArrayRef = - Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(-1), - Some(2), - None, - None, - ])); + let input_primitive_1: ArrayRef = Arc::new(PrimitiveArray::::from(vec![ + Some(-1), + Some(-1), + Some(2), + None, + None, + ])); let input_struct_1 = StructArray::from(vec![(field.clone(), input_primitive_1)]); - let input_primitive_2: ArrayRef = - Arc::new(PrimitiveArray::::from(vec![ - Some(101), - Some(102), - Some(103), - None, - ])); + let input_primitive_2: ArrayRef = Arc::new(PrimitiveArray::::from(vec![ + Some(101), + Some(102), + Some(103), + None, + ])); let input_struct_2 = StructArray::from(vec![(field, input_primitive_2)]); - let arr = - concat(&[&input_struct_1.slice(1, 3), &input_struct_2.slice(1, 2)]).unwrap(); + let arr = concat(&[&input_struct_1.slice(1, 3), &input_struct_2.slice(1, 2)]).unwrap(); let expected_primitive_output = Arc::new(PrimitiveArray::::from(vec![ Some(-1), @@ -526,27 +492,22 @@ mod tests { assert_eq!(actual_output, &expected_output); } - fn collect_string_dictionary( - array: &DictionaryArray, - ) -> Vec> { + fn collect_string_dictionary(array: &DictionaryArray) -> Vec> { let concrete = array.downcast_dict::().unwrap(); concrete.into_iter().collect() } #[test] fn test_string_dictionary_array() { - let input_1: DictionaryArray = - vec!["hello", "A", "B", "hello", "hello", "C"] - .into_iter() - .collect(); - let input_2: DictionaryArray = - vec!["hello", "E", "E", "hello", "F", "E"] - .into_iter() - .collect(); + let input_1: DictionaryArray = vec!["hello", "A", "B", "hello", "hello", "C"] + .into_iter() + .collect(); + let input_2: DictionaryArray = vec!["hello", "E", "E", "hello", "F", "E"] + 
.into_iter() + .collect(); let expected: Vec<_> = vec![ - "hello", "A", "B", "hello", "hello", "C", "hello", "E", "E", "hello", "F", - "E", + "hello", "A", "B", "hello", "hello", "C", "hello", "E", "E", "hello", "F", "E", ] .into_iter() .map(Some) @@ -566,10 +527,9 @@ mod tests { #[test] fn test_string_dictionary_array_nulls() { - let input_1: DictionaryArray = - vec![Some("foo"), Some("bar"), None, Some("fiz")] - .into_iter() - .collect(); + let input_1: DictionaryArray = vec![Some("foo"), Some("bar"), None, Some("fiz")] + .into_iter() + .collect(); let input_2: DictionaryArray = vec![None].into_iter().collect(); let expected = vec![Some("foo"), Some("bar"), None, Some("fiz"), None]; @@ -631,8 +591,7 @@ mod tests { #[test] fn test_dictionary_concat_reuse() { - let array: DictionaryArray = - vec!["a", "a", "b", "c"].into_iter().collect(); + let array: DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); let copy: DictionaryArray = array.clone(); // dictionary is "a", "b", "c" @@ -719,19 +678,16 @@ mod tests { #[test] fn concat_record_batches_of_different_schemas_but_compatible_data() { - let schema1 = - Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + let schema1 = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); // column names differ - let schema2 = - Arc::new(Schema::new(vec![Field::new("c", DataType::Int32, false)])); + let schema2 = Arc::new(Schema::new(vec![Field::new("c", DataType::Int32, false)])); let batch1 = RecordBatch::try_new( schema1.clone(), vec![Arc::new(Int32Array::from(vec![1, 2]))], ) .unwrap(); let batch2 = - RecordBatch::try_new(schema2, vec![Arc::new(Int32Array::from(vec![3, 4]))]) - .unwrap(); + RecordBatch::try_new(schema2, vec![Arc::new(Int32Array::from(vec![3, 4]))]).unwrap(); // concat_batches simply uses the schema provided let batch = concat_batches(&schema1, [&batch1, &batch2]).unwrap(); assert_eq!(batch.schema().as_ref(), schema1.as_ref()); @@ -740,8 +696,7 @@ mod tests { #[test] fn concat_record_batches_of_different_schemas_incompatible_data() { - let schema1 = - Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + let schema1 = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); // column names differ let schema2 = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)])); let batch1 = RecordBatch::try_new( @@ -797,8 +752,7 @@ mod tests { assert_eq!(data.buffers()[1].capacity(), 192); // Nearest multiple of 64 let a = LargeBinaryArray::from_iter_values(std::iter::repeat(b"foo").take(100)); - let b = - LargeBinaryArray::from_iter_values(std::iter::repeat(b"cupcakes").take(10)); + let b = LargeBinaryArray::from_iter_values(std::iter::repeat(b"cupcakes").take(10)); let a = concat(&[&a, &b]).unwrap(); let data = a.to_data(); diff --git a/arrow-select/src/dictionary.rs b/arrow-select/src/dictionary.rs index 330196ae33f4..d0b6fcfc3ac9 100644 --- a/arrow-select/src/dictionary.rs +++ b/arrow-select/src/dictionary.rs @@ -20,8 +20,7 @@ use ahash::RandomState; use arrow_array::builder::BooleanBufferBuilder; use arrow_array::cast::AsArray; use arrow_array::types::{ - ArrowDictionaryKeyType, BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, - Utf8Type, + ArrowDictionaryKeyType, BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, Utf8Type, }; use arrow_array::{Array, ArrayRef, DictionaryArray, GenericByteArray}; use arrow_buffer::{ArrowNativeType, BooleanBuffer, ScalarBuffer}; @@ -55,11 +54,7 @@ impl<'a, V> Interner<'a, V> { } } - fn intern Result, E>( 
- &mut self, - new: &'a [u8], - f: F, - ) -> Result<&V, E> { + fn intern Result, E>(&mut self, new: &'a [u8], f: F) -> Result<&V, E> { let hash = self.state.hash_one(new); let bucket_idx = hash >> self.shift; Ok(match &mut self.buckets[bucket_idx as usize] { @@ -88,8 +83,7 @@ pub struct MergedDictionaries { fn bytes_ptr_eq(a: &dyn Array, b: &dyn Array) -> bool { match (a.as_bytes_opt::(), b.as_bytes_opt::()) { (Some(a), Some(b)) => { - let values_eq = - a.values().ptr_eq(b.values()) && a.offsets().ptr_eq(b.offsets()); + let values_eq = a.values().ptr_eq(b.values()) && a.offsets().ptr_eq(b.offsets()); match (a.nulls(), b.nulls()) { (Some(a), Some(b)) => values_eq && a.inner().ptr_eq(b.inner()), (None, None) => values_eq, @@ -188,15 +182,14 @@ pub fn merge_dictionary_values( let mut mapping = vec![zero; dictionary.values().len()]; for (value_idx, value) in values { - mapping[value_idx] = *interner.intern(value, || { - match K::Native::from_usize(indices.len()) { + mapping[value_idx] = + *interner.intern(value, || match K::Native::from_usize(indices.len()) { Some(idx) => { indices.push((dictionary_idx, value_idx)); Ok(idx) } None => Err(ArrowError::DictionaryKeyOverflowError), - } - })?; + })?; } Ok(mapping) }) @@ -230,10 +223,7 @@ fn compute_values_mask( } /// Return a Vec containing for each set index in `mask`, the index and byte value of that index -fn get_masked_values<'a>( - array: &'a dyn Array, - mask: &BooleanBuffer, -) -> Vec<(usize, &'a [u8])> { +fn get_masked_values<'a>(array: &'a dyn Array, mask: &BooleanBuffer) -> Vec<(usize, &'a [u8])> { match array.data_type() { DataType::Utf8 => masked_bytes(array.as_string::(), mask), DataType::LargeUtf8 => masked_bytes(array.as_string::(), mask), @@ -268,8 +258,7 @@ mod tests { #[test] fn test_merge_strings() { - let a = - DictionaryArray::::from_iter(["a", "b", "a", "b", "d", "c", "e"]); + let a = DictionaryArray::::from_iter(["a", "b", "a", "b", "d", "c", "e"]); let b = DictionaryArray::::from_iter(["c", "f", "c", "d", "a", "d"]); let merged = merge_dictionary_values(&[&a, &b], None).unwrap(); @@ -293,8 +282,7 @@ mod tests { assert_eq!(&merged.key_mappings[1], &[3, 4, 2, 0]); // Mask out only ["b", "b", "d"] from a - let a_mask = - BooleanBuffer::from_iter([false, true, false, true, true, false, false]); + let a_mask = BooleanBuffer::from_iter([false, true, false, true, true, false, false]); let b_mask = BooleanBuffer::new_set(b.len()); let merged = merge_dictionary_values(&[&a, &b], Some(&[a_mask, b_mask])).unwrap(); @@ -315,16 +303,12 @@ mod tests { let values = StringArray::new(offsets, buffer, Some(nulls)); let key_values = vec![1, 2, 3, 1, 8, 2, 3]; - let key_nulls = - NullBuffer::from(vec![true, true, false, true, false, true, true]); + let key_nulls = NullBuffer::from(vec![true, true, false, true, false, true, true]); let keys = Int32Array::new(key_values.into(), Some(key_nulls)); let a = DictionaryArray::new(keys, Arc::new(values)); // [NULL, "bingo", NULL, NULL, NULL, "bingo", "hello"] - let b = DictionaryArray::new( - Int32Array::new_null(10), - Arc::new(StringArray::new_null(0)), - ); + let b = DictionaryArray::new(Int32Array::new_null(10), Arc::new(StringArray::new_null(0))); let merged = merge_dictionary_values(&[&a, &b], None).unwrap(); let expected = StringArray::from(vec!["bingo", "hello"]); diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index 1afb8197bab6..ce51ecb58adb 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -162,10 +162,7 @@ pub fn prep_null_mask_filter(filter: 
&BooleanArray) -> BooleanArray { /// let c = c.as_any().downcast_ref::().unwrap(); /// assert_eq!(c, &Int32Array::from(vec![5, 8])); /// ``` -pub fn filter( - values: &dyn Array, - predicate: &BooleanArray, -) -> Result { +pub fn filter(values: &dyn Array, predicate: &BooleanArray) -> Result { let predicate = FilterBuilder::new(predicate).build(); filter_array(values, &predicate) } @@ -308,10 +305,7 @@ impl FilterPredicate { } } -fn filter_array( - values: &dyn Array, - predicate: &FilterPredicate, -) -> Result { +fn filter_array(values: &dyn Array, predicate: &FilterPredicate) -> Result { if predicate.filter.len() > values.len() { return Err(ArrowError::InvalidArgumentError(format!( "Filter predicate of length {} is larger than target array of length {}", @@ -423,16 +417,14 @@ fn filter_bits(buffer: &BooleanBuffer, predicate: &FilterPredicate) -> Buffer { unsafe { MutableBuffer::from_trusted_len_iter_bool(bits).into() } } IterationStrategy::SlicesIterator => { - let mut builder = - BooleanBufferBuilder::new(bit_util::ceil(predicate.count, 8)); + let mut builder = BooleanBufferBuilder::new(bit_util::ceil(predicate.count, 8)); for (start, end) in SlicesIterator::new(&predicate.filter) { builder.append_packed_range(start + offset..end + offset, src) } builder.into() } IterationStrategy::Slices(slices) => { - let mut builder = - BooleanBufferBuilder::new(bit_util::ceil(predicate.count, 8)); + let mut builder = BooleanBufferBuilder::new(bit_util::ceil(predicate.count, 8)); for (start, end) in slices { builder.append_packed_range(*start + offset..*end + offset, src) } @@ -459,10 +451,7 @@ fn filter_boolean(array: &BooleanArray, predicate: &FilterPredicate) -> BooleanA } /// `filter` implementation for primitive arrays -fn filter_primitive( - array: &PrimitiveArray, - predicate: &FilterPredicate, -) -> PrimitiveArray +fn filter_primitive(array: &PrimitiveArray, predicate: &FilterPredicate) -> PrimitiveArray where T: ArrowPrimitiveType, { @@ -471,24 +460,21 @@ where let buffer = match &predicate.strategy { IterationStrategy::SlicesIterator => { - let mut buffer = - MutableBuffer::with_capacity(predicate.count * T::get_byte_width()); + let mut buffer = MutableBuffer::with_capacity(predicate.count * T::get_byte_width()); for (start, end) in SlicesIterator::new(&predicate.filter) { buffer.extend_from_slice(&values[start..end]); } buffer } IterationStrategy::Slices(slices) => { - let mut buffer = - MutableBuffer::with_capacity(predicate.count * T::get_byte_width()); + let mut buffer = MutableBuffer::with_capacity(predicate.count * T::get_byte_width()); for (start, end) in slices { buffer.extend_from_slice(&values[*start..*end]); } buffer } IterationStrategy::IndexIterator => { - let iter = - IndexIterator::new(&predicate.filter, predicate.count).map(|x| values[x]); + let iter = IndexIterator::new(&predicate.filter, predicate.count).map(|x| values[x]); // SAFETY: IndexIterator is trusted length unsafe { MutableBuffer::from_trusted_len_iter(iter) } @@ -598,10 +584,7 @@ where /// /// Note: NULLs with a non-zero slot length in `array` will have the corresponding /// data copied across. 
This allows handling the null mask separately from the data -fn filter_bytes( - array: &GenericByteArray, - predicate: &FilterPredicate, -) -> GenericByteArray +fn filter_bytes(array: &GenericByteArray, predicate: &FilterPredicate) -> GenericByteArray where T: ByteArrayType, { @@ -633,10 +616,7 @@ where } /// `filter` implementation for dictionaries -fn filter_dict( - array: &DictionaryArray, - predicate: &FilterPredicate, -) -> DictionaryArray +fn filter_dict(array: &DictionaryArray, predicate: &FilterPredicate) -> DictionaryArray where T: ArrowDictionaryKeyType, T::Native: num::Num, @@ -765,8 +745,7 @@ mod tests { fn test_filter_array_low_density() { // this test exercises the all 0's branch of the filter algorithm let mut data_values = (1..=65).collect::>(); - let mut filter_values = - (1..=65).map(|i| matches!(i % 65, 0)).collect::>(); + let mut filter_values = (1..=65).map(|i| matches!(i % 65, 0)).collect::>(); // set up two more values after the batch data_values.extend_from_slice(&[66, 67]); filter_values.extend_from_slice(&[false, true]); @@ -852,8 +831,7 @@ mod tests { #[test] fn test_filter_array_slice_with_null() { - let a = - Int32Array::from(vec![Some(5), None, Some(7), Some(8), Some(9)]).slice(1, 4); + let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8), Some(9)]).slice(1, 4); let b = BooleanArray::from(vec![true, false, false, true]); // filtering with sliced filter array is not currently supported // let b_slice = BooleanArray::from(vec![true, false, false, true, false]).slice(1, 4); @@ -986,12 +964,8 @@ mod tests { fn test_filter_record_batch_no_columns() { let pred = BooleanArray::from(vec![Some(true), Some(true), None]); let options = RecordBatchOptions::default().with_row_count(Some(100)); - let record_batch = RecordBatch::try_new_with_options( - Arc::new(Schema::empty()), - vec![], - &options, - ) - .unwrap(); + let record_batch = + RecordBatch::try_new_with_options(Arc::new(Schema::empty()), vec![], &options).unwrap(); let out = filter_record_batch(&record_batch, &pred).unwrap(); assert_eq!(out.num_rows(), 2); @@ -999,8 +973,7 @@ mod tests { #[test] fn test_fast_path() { - let a: PrimitiveArray = - PrimitiveArray::from(vec![Some(1), Some(2), None]); + let a: PrimitiveArray = PrimitiveArray::from(vec![Some(1), Some(2), None]); // all true let mask = BooleanArray::from(vec![true, true, true]); @@ -1149,9 +1122,7 @@ mod tests { } /// Returns an iterator that calls `Option::as_deref` on each item - fn as_deref( - src: &[Option], - ) -> impl Iterator> { + fn as_deref(src: &[Option]) -> impl Iterator> { src.iter().map(|x| x.as_deref()) } @@ -1290,10 +1261,8 @@ mod tests { .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8])) .build() .unwrap(); - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, false)), - 3, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3); let list_data = ArrayData::builder(list_data_type) .len(3) .add_child_data(value_data) @@ -1349,10 +1318,8 @@ mod tests { bit_util::set_bit(&mut null_bits, 3); bit_util::set_bit(&mut null_bits, 4); - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, false)), - 2, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 2); let list_data = ArrayData::builder(list_data_type) .len(5) .add_child_data(value_data) @@ -1534,13 +1501,11 @@ mod tests { assert_eq!(value1, value2); } 1 => { - let slot1 = - 
slot1.as_any().downcast_ref::().unwrap(); + let slot1 = slot1.as_any().downcast_ref::().unwrap(); assert_eq!(slot1.len(), 1); let value1 = slot1.value(0); - let slot2 = - slot2.as_any().downcast_ref::().unwrap(); + let slot2 = slot2.as_any().downcast_ref::().unwrap(); assert_eq!(slot2.len(), 1); let value2 = slot2.value(0); assert_eq!(value1, value2); diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs index a0f41666513b..8229a8f3fe09 100644 --- a/arrow-select/src/interleave.rs +++ b/arrow-select/src/interleave.rs @@ -20,9 +20,7 @@ use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder, PrimitiveBuilder use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::{ - ArrowNativeType, MutableBuffer, NullBuffer, NullBufferBuilder, OffsetBuffer, -}; +use arrow_buffer::{ArrowNativeType, MutableBuffer, NullBuffer, NullBufferBuilder, OffsetBuffer}; use arrow_data::transform::MutableArrayData; use arrow_schema::{ArrowError, DataType}; use std::sync::Arc; @@ -79,10 +77,11 @@ pub fn interleave( for array in values.iter().skip(1) { if array.data_type() != data_type { - return Err(ArrowError::InvalidArgumentError( - format!("It is not possible to interleave arrays of different data types ({} and {})", - data_type, array.data_type()), - )); + return Err(ArrowError::InvalidArgumentError(format!( + "It is not possible to interleave arrays of different data types ({} and {})", + data_type, + array.data_type() + ))); } } @@ -278,8 +277,7 @@ mod tests { let a = Int32Array::from_iter_values([1, 2, 3, 4]); let b = Int32Array::from_iter_values([5, 6, 7]); let c = Int32Array::from_iter_values([8, 9, 10]); - let values = - interleave(&[&a, &b, &c], &[(0, 3), (0, 3), (2, 2), (2, 0), (1, 1)]).unwrap(); + let values = interleave(&[&a, &b, &c], &[(0, 3), (0, 3), (2, 2), (2, 0), (1, 1)]).unwrap(); let v = values.as_primitive::(); assert_eq!(v.values(), &[4, 4, 10, 8, 6]); } @@ -288,8 +286,7 @@ mod tests { fn test_primitive_nulls() { let a = Int32Array::from_iter_values([1, 2, 3, 4]); let b = Int32Array::from_iter([Some(1), Some(4), None]); - let values = - interleave(&[&a, &b], &[(0, 1), (1, 2), (1, 2), (0, 3), (0, 2)]).unwrap(); + let values = interleave(&[&a, &b], &[(0, 1), (1, 2), (1, 2), (0, 3), (0, 2)]).unwrap(); let v: Vec<_> = values.as_primitive::().into_iter().collect(); assert_eq!(&v, &[Some(2), None, None, Some(4), Some(3)]) } @@ -306,8 +303,7 @@ mod tests { fn test_strings() { let a = StringArray::from_iter_values(["a", "b", "c"]); let b = StringArray::from_iter_values(["hello", "world", "foo"]); - let values = - interleave(&[&a, &b], &[(0, 2), (0, 2), (1, 0), (1, 1), (0, 1)]).unwrap(); + let values = interleave(&[&a, &b], &[(0, 2), (0, 2), (1, 0), (1, 1), (0, 1)]).unwrap(); let v = values.as_string::(); let values: Vec<_> = v.into_iter().collect(); assert_eq!( @@ -329,8 +325,7 @@ mod tests { // Should not recompute dictionary let values = - interleave(&[&a, &b], &[(0, 2), (0, 2), (0, 2), (1, 0), (1, 1), (0, 1)]) - .unwrap(); + interleave(&[&a, &b], &[(0, 2), (0, 2), (0, 2), (1, 0), (1, 1), (0, 1)]).unwrap(); let v = values.as_dictionary::(); assert_eq!(v.values().len(), 5); @@ -371,8 +366,7 @@ mod tests { b.append(true); let b = b.finish(); - let values = - interleave(&[&a, &b], &[(0, 2), (0, 1), (1, 0), (1, 2), (1, 1)]).unwrap(); + let values = interleave(&[&a, &b], &[(0, 2), (0, 1), (1, 0), (1, 2), (1, 1)]).unwrap(); let v = values.as_any().downcast_ref::().unwrap(); // [[3], null, [4], [5, 6, null], null] @@ -404,12 +398,8 @@ mod 
tests { let indices = &[(0, 0), (0, 1), (0, 2), (1, 0)]; let array = interleave(&[&dict_a, &dict_b], indices).unwrap(); - let expected = DictionaryArray::::from_iter(vec![ - Some("0"), - Some("1"), - Some("2"), - None, - ]); + let expected = + DictionaryArray::::from_iter(vec![Some("0"), Some("1"), Some("2"), None]); assert_eq!(array.as_ref(), &expected) } } diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index f0bcb73cccb9..4025a5bacf80 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -29,8 +29,7 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result Result { let mut null_count = 0; - let buffer = - bitwise_unary_op_helper(right.inner(), right.offset(), len, |b| { - let t = !b; - null_count += t.count_zeros() as usize; - t - }); + let buffer = bitwise_unary_op_helper(right.inner(), right.offset(), len, |b| { + let t = !b; + null_count += t.count_zeros() as usize; + t + }); (buffer, null_count) } }; @@ -110,8 +108,7 @@ mod tests { #[test] fn test_nullif_int_array() { let a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9)]); - let comp = - BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); + let comp = BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); let res = nullif(&a, &comp).unwrap(); let expected = Int32Array::from(vec![ @@ -448,8 +445,7 @@ mod tests { #[test] fn test_nullif_no_nulls() { let a = Int32Array::from(vec![Some(15), Some(7), Some(8), Some(1), Some(9)]); - let comp = - BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); + let comp = BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); let res = nullif(&a, &comp).unwrap(); let res = res.as_primitive::(); diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index a546949f86e6..d47b884ae38d 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -24,8 +24,7 @@ use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::{ - bit_util, ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer, NullBuffer, - ScalarBuffer, + bit_util, ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer, NullBuffer, ScalarBuffer, }; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, FieldRef, UnionMode}; @@ -102,25 +101,25 @@ fn check_bounds( ) -> Result<(), ArrowError> { if indices.null_count() > 0 { indices.iter().flatten().try_for_each(|index| { - let ix = index.to_usize().ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; + let ix = index + .to_usize() + .ok_or_else(|| ArrowError::ComputeError("Cast to usize failed".to_string()))?; if ix >= len { - return Err(ArrowError::ComputeError( - format!("Array index out of bounds, cannot get item at index {ix} from {len} entries")) - ); + return Err(ArrowError::ComputeError(format!( + "Array index out of bounds, cannot get item at index {ix} from {len} entries" + ))); } Ok(()) }) } else { indices.values().iter().try_for_each(|index| { - let ix = index.to_usize().ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; + let ix = index + .to_usize() + .ok_or_else(|| ArrowError::ComputeError("Cast to usize failed".to_string()))?; if ix >= len { - return Err(ArrowError::ComputeError( - format!("Array index out of bounds, cannot get item at index {ix} from {len} entries")) - ); + return Err(ArrowError::ComputeError(format!( + "Array index out of bounds, cannot get item at index 
{ix} from {len} entries" + ))); } Ok(()) }) @@ -488,9 +487,10 @@ fn take_fixed_size_list( let null_slice = null_buf.as_slice_mut(); for i in 0..indices.len() { - let index = indices.value(i).to_usize().ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; + let index = indices + .value(i) + .to_usize() + .ok_or_else(|| ArrowError::ComputeError("Cast to usize failed".to_string()))?; if !indices.is_valid(i) || values.is_null(index) { bit_util::unset_bit(null_slice, i); } @@ -565,15 +565,13 @@ fn take_run( let mut new_physical_len = 1; for ix in 1..physical_indices.len() { if physical_indices[ix] != physical_indices[ix - 1] { - take_value_indices - .append(I::Native::from_usize(physical_indices[ix - 1]).unwrap()); + take_value_indices.append(I::Native::from_usize(physical_indices[ix - 1]).unwrap()); new_run_ends_builder.append(T::Native::from_usize(ix).unwrap()); new_physical_len += 1; } } - take_value_indices.append( - I::Native::from_usize(physical_indices[physical_indices.len() - 1]).unwrap(), - ); + take_value_indices + .append(I::Native::from_usize(physical_indices[physical_indices.len() - 1]).unwrap()); new_run_ends_builder.append(T::Native::from_usize(physical_indices.len()).unwrap()); let new_run_ends = unsafe { // Safety: @@ -650,9 +648,10 @@ where // compute the value indices, and set offsets accordingly for i in 0..indices.len() { if indices.is_valid(i) { - let ix = indices.value(i).to_usize().ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; + let ix = indices + .value(i) + .to_usize() + .ok_or_else(|| ArrowError::ComputeError("Cast to usize failed".to_string()))?; let start = offsets[ix]; let end = offsets[ix + 1]; current_offset += end - start; @@ -694,11 +693,11 @@ where for i in 0..indices.len() { if indices.is_valid(i) { - let index = indices.value(i).to_usize().ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - let start = - list.value_offset(index) as ::Native; + let index = indices + .value(i) + .to_usize() + .ok_or_else(|| ArrowError::ComputeError("Cast to usize failed".to_string()))?; + let start = list.value_offset(index) as ::Native; values.extend(start..start + length); } @@ -721,8 +720,7 @@ macro_rules! 
to_indices_reinterpret { type T = $o; fn to_indices(&self) -> PrimitiveArray<$o> { - let cast = - ScalarBuffer::new(self.values().inner().clone(), 0, self.len()); + let cast = ScalarBuffer::new(self.values().inner().clone(), 0, self.len()); PrimitiveArray::new(cast, self.nulls().cloned()) } } @@ -863,9 +861,7 @@ mod tests { } // create a simple struct for testing purposes - fn create_test_struct( - values: Vec, Option)>>, - ) -> StructArray { + fn create_test_struct(values: Vec, Option)>>) -> StructArray { let mut struct_builder = StructBuilder::new( Fields::from(vec![ Field::new("a", DataType::Boolean, true), @@ -961,8 +957,7 @@ mod tests { #[test] fn test_take_primitive_nullable_indices_non_null_values_with_offset() { - let index = - UInt32Array::from(vec![Some(0), Some(1), Some(2), Some(3), None, None]); + let index = UInt32Array::from(vec![Some(0), Some(1), Some(2), Some(3), None, None]); let index = index.slice(2, 4); let index = index.as_any().downcast_ref::().unwrap(); @@ -982,8 +977,7 @@ mod tests { #[test] fn test_take_primitive_nullable_indices_nullable_values_with_offset() { - let index = - UInt32Array::from(vec![Some(0), Some(1), Some(2), Some(3), None, None]); + let index = UInt32Array::from(vec![Some(0), Some(1), Some(2), Some(3), None, None]); let index = index.slice(2, 4); let index = index.as_any().downcast_ref::().unwrap(); @@ -1311,8 +1305,7 @@ mod tests { #[test] fn test_take_bool_with_offset() { - let index = - UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(2), None]); + let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(2), None]); let index = index.slice(2, 4); let index = index .as_any() @@ -1346,8 +1339,7 @@ mod tests { let actual = actual.as_any().downcast_ref::().unwrap(); - let expected = - K::from(vec![Some("four"), None, None, Some("four"), Some("five")]); + let expected = K::from(vec![Some("four"), None, None, Some("four"), Some("five")]); assert_eq!(actual, &expected); } @@ -1364,8 +1356,7 @@ mod tests { #[test] fn test_take_slice_string() { - let strings = - StringArray::from(vec![Some("hello"), None, Some("world"), None, Some("hi")]); + let strings = StringArray::from(vec![Some("hello"), None, Some("world"), None, Some("hi")]); let indices = Int32Array::from(vec![Some(0), Some(1), None, Some(0), Some(2)]); let indices_slice = indices.slice(1, 4); let expected = StringArray::from(vec![None, None, Some("hello"), Some("world")]); @@ -1376,17 +1367,13 @@ mod tests { macro_rules! 
test_take_list { ($offset_type:ty, $list_data_type:ident, $list_array_type:ident) => {{ // Construct a value array, [[0,0,0], [-1,-2,-1], [], [2,3]] - let value_data = - Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 3]).into_data(); + let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 3]).into_data(); // Construct offsets let value_offsets: [$offset_type; 5] = [0, 3, 6, 6, 8]; let value_offsets = Buffer::from_slice_ref(&value_offsets); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Arc::new(Field::new( - "item", - DataType::Int32, - false, - ))); + let list_data_type = + DataType::$list_data_type(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -1399,8 +1386,7 @@ mod tests { let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(2), Some(0)]); let a = take(&list_array, &index, None).unwrap(); - let a: &$list_array_type = - a.as_any().downcast_ref::<$list_array_type>().unwrap(); + let a: &$list_array_type = a.as_any().downcast_ref::<$list_array_type>().unwrap(); // construct a value array with expected results: // [[2,3], null, [-1,-2,-1], [], [0,0,0]] @@ -1452,11 +1438,8 @@ mod tests { let value_offsets: [$offset_type; 5] = [0, 3, 6, 7, 9]; let value_offsets = Buffer::from_slice_ref(&value_offsets); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Arc::new(Field::new( - "item", - DataType::Int32, - true, - ))); + let list_data_type = + DataType::$list_data_type(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -1470,8 +1453,7 @@ mod tests { let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(3), Some(0)]); let a = take(&list_array, &index, None).unwrap(); - let a: &$list_array_type = - a.as_any().downcast_ref::<$list_array_type>().unwrap(); + let a: &$list_array_type = a.as_any().downcast_ref::<$list_array_type>().unwrap(); // construct a value array with expected results: // [[null], null, [-1,-2,3], [5,null], [0,null,0]] @@ -1523,11 +1505,8 @@ mod tests { let value_offsets: [$offset_type; 5] = [0, 3, 6, 6, 8]; let value_offsets = Buffer::from_slice_ref(&value_offsets); // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Arc::new(Field::new( - "item", - DataType::Int32, - true, - ))); + let list_data_type = + DataType::$list_data_type(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type.clone()) .len(4) .add_buffer(value_offsets) @@ -1541,8 +1520,7 @@ mod tests { let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(3), Some(0)]); let a = take(&list_array, &index, None).unwrap(); - let a: &$list_array_type = - a.as_any().downcast_ref::<$list_array_type>().unwrap(); + let a: &$list_array_type = a.as_any().downcast_ref::<$list_array_type>().unwrap(); // construct a value array with expected results: // [null, null, [-1,-2,3], [5,null], [0,null,0]] @@ -1590,13 +1568,11 @@ mod tests { { let indices = UInt32Array::from(indices); - let input_array = - FixedSizeListArray::from_iter_primitive::(input_data, length); + let input_array = FixedSizeListArray::from_iter_primitive::(input_data, length); let output = take_fixed_size_list(&input_array, &indices, length as u32).unwrap(); - let expected = - FixedSizeListArray::from_iter_primitive::(expected_data, length); + let 
expected = FixedSizeListArray::from_iter_primitive::(expected_data, length); assert_eq!(&output, &expected) } @@ -1695,8 +1671,7 @@ mod tests { // Construct offsets let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -1715,12 +1690,9 @@ mod tests { #[test] fn test_take_map() { let values = Int32Array::from(vec![1, 2, 3, 4]); - let array = MapArray::new_from_strings( - vec!["a", "b", "c", "a"].into_iter(), - &values, - &[0, 3, 4], - ) - .unwrap(); + let array = + MapArray::new_from_strings(vec!["a", "b", "c", "a"].into_iter(), &values, &[0, 3, 4]) + .unwrap(); let index = UInt32Array::from(vec![0]); @@ -1774,8 +1746,7 @@ mod tests { None, ]); - let index = - UInt32Array::from(vec![None, Some(3), Some(1), None, Some(0), Some(4)]); + let index = UInt32Array::from(vec![None, Some(3), Some(1), None, Some(0), Some(4)]); let actual = take(&array, &index, None).unwrap(); let actual: &StructArray = actual.as_any().downcast_ref::().unwrap(); assert_eq!(index.len(), actual.len()); @@ -1927,8 +1898,7 @@ mod tests { ]); let indices = UInt32Array::from(vec![2, 0]); - let (indexed, offsets, null_buf) = - take_value_indices_from_list(&list, &indices).unwrap(); + let (indexed, offsets, null_buf) = take_value_indices_from_list(&list, &indices).unwrap(); assert_eq!(indexed, Int32Array::from(vec![5, 6, 7, 8, 9, 0, 1])); assert_eq!(offsets, vec![0, 5, 7]); @@ -1986,14 +1956,12 @@ mod tests { ); let indices = UInt32Array::from(vec![2, 1, 0]); - let indexed = - take_value_indices_from_fixed_size_list(&list, &indices, 3).unwrap(); + let indexed = take_value_indices_from_fixed_size_list(&list, &indices, 3).unwrap(); assert_eq!(indexed, UInt32Array::from(vec![6, 7, 8, 3, 4, 5, 0, 1, 2])); let indices = UInt32Array::from(vec![3, 2, 1, 2, 0]); - let indexed = - take_value_indices_from_fixed_size_list(&list, &indices, 3).unwrap(); + let indexed = take_value_indices_from_fixed_size_list(&list, &indices, 3).unwrap(); assert_eq!( indexed, @@ -2038,8 +2006,7 @@ mod tests { Some((Some(true), Some(31))), None, ]); - let strings = - StringArray::from(vec![Some("a"), None, Some("c"), None, Some("d")]); + let strings = StringArray::from(vec![Some("a"), None, Some("c"), None, Some("d")]); let type_ids = Buffer::from_slice_ref(vec![1i8; 5]); let children: Vec<(Field, Arc)> = vec![ diff --git a/arrow-string/src/concat_elements.rs b/arrow-string/src/concat_elements.rs index a6e02d04dd3f..66ecd34868a5 100644 --- a/arrow-string/src/concat_elements.rs +++ b/arrow-string/src/concat_elements.rs @@ -54,11 +54,8 @@ pub fn concat_elements_bytes( let mut output_offsets = BufferBuilder::::new(left_offsets.len()); output_offsets.append(T::Offset::usize_as(0)); for (left_idx, right_idx) in left_offsets.windows(2).zip(right_offsets.windows(2)) { - output_values - .append_slice(&left_values[left_idx[0].as_usize()..left_idx[1].as_usize()]); - output_values.append_slice( - &right_values[right_idx[0].as_usize()..right_idx[1].as_usize()], - ); + output_values.append_slice(&left_values[left_idx[0].as_usize()..left_idx[1].as_usize()]); + output_values.append_slice(&right_values[right_idx[0].as_usize()..right_idx[1].as_usize()]); output_offsets.append(T::Offset::from_usize(output_values.len()).unwrap()); } @@ -170,10 +167,7 @@ 
pub fn concat_elements_utf8_many( Ok(unsafe { builder.build_unchecked() }.into()) } -pub fn concat_elements_dyn( - left: &dyn Array, - right: &dyn Array, -) -> Result { +pub fn concat_elements_dyn(left: &dyn Array, right: &dyn Array) -> Result { if left.data_type() != right.data_type() { return Err(ArrowError::ComputeError(format!( "Cannot concat arrays of different types: {} != {}", @@ -392,8 +386,7 @@ mod tests { // test for LargeBinaryArray let left = LargeBinaryArray::from_opt_vec(vec![Some(b"foo"), Some(b"bar"), None]); - let right = - LargeBinaryArray::from_opt_vec(vec![None, Some(b"yyy"), Some(b"zzz")]); + let right = LargeBinaryArray::from_opt_vec(vec![None, Some(b"yyy"), Some(b"zzz")]); let output: LargeBinaryArray = concat_elements_dyn(&left, &right) .unwrap() .into_data() @@ -410,8 +403,7 @@ mod tests { let output = concat_elements_dyn(&left, &right); assert_eq!( output.unwrap_err().to_string(), - "Compute error: Cannot concat arrays of different types: Utf8 != LargeUtf8" - .to_string() + "Compute error: Cannot concat arrays of different types: Utf8 != LargeUtf8".to_string() ); } } diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index ab5fbb0c6425..1dd5933ce0e5 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -173,9 +173,7 @@ mod tests { macro_rules! length_list_helper { ($offset_ty: ty, $result_ty: ty, $element_ty: ty, $value: expr, $expected: expr) => {{ let array = - GenericListArray::<$offset_ty>::from_iter_primitive::<$element_ty, _, _>( - $value, - ); + GenericListArray::<$offset_ty>::from_iter_primitive::<$element_ty, _, _>($value); let result = length(&array).unwrap(); let result = result.as_any().downcast_ref::<$result_ty>().unwrap(); let expected: $result_ty = $expected.into(); @@ -356,8 +354,7 @@ mod tests { #[test] fn length_offsets_binary() { - let value: Vec> = - vec![Some(b"hello"), Some(b" "), Some(&[0xff, 0xf8]), None]; + let value: Vec> = vec![Some(b"hello"), Some(b" "), Some(&[0xff, 0xf8]), None]; let a = BinaryArray::from(value); let b = a.slice(1, 3); let result = length(&b).unwrap(); @@ -506,8 +503,7 @@ mod tests { #[test] fn bit_length_offsets_binary() { - let value: Vec> = - vec![Some(b"hello"), Some(&[]), Some(b"world"), None]; + let value: Vec> = vec![Some(b"hello"), Some(&[]), Some(b"world"), None]; let a = BinaryArray::from(value); let b = a.slice(1, 3); let result = bit_length(&b).unwrap(); @@ -621,10 +617,8 @@ mod tests { .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8])) .build() .unwrap(); - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, false)), - 3, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3); let nulls = NullBuffer::from(vec![true, false, true]); let list_data = ArrayData::builder(list_data_type) .len(3) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 4478c4e4f7ef..6f6dfe03133d 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -92,18 +92,12 @@ pub fn nilike(left: &dyn Datum, right: &dyn Datum) -> Result Result { +pub fn starts_with(left: &dyn Datum, right: &dyn Datum) -> Result { like_op(Op::StartsWith, left, right) } /// Perform SQL `ENDSWITH(left, right)` -pub fn ends_with( - left: &dyn Datum, - right: &dyn Datum, -) -> Result { +pub fn ends_with(left: &dyn Datum, right: &dyn Datum) -> Result { like_op(Op::EndsWith, left, right) } @@ -132,9 +126,7 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result { - 
apply::(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) - } + (Utf8, Utf8) => apply::(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v), (LargeUtf8, LargeUtf8) => { apply::(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) } @@ -156,9 +148,7 @@ fn apply( let l_len = l_v.map(|l| l.len()).unwrap_or(l.len()); if r_s { let idx = match r_v { - Some(dict) if dict.null_count() != 0 => { - return Ok(BooleanArray::new_null(l_len)) - } + Some(dict) if dict.null_count() != 0 => return Ok(BooleanArray::new_null(l_len)), Some(dict) => dict.normalized_keys()[0], None => 0, }; @@ -312,10 +302,7 @@ macro_rules! legacy_kernels { #[doc(hidden)] #[deprecated(note = $deprecation)] - pub fn $fn_scalar_dyn( - left: &dyn Array, - right: &str, - ) -> Result { + pub fn $fn_scalar_dyn(left: &dyn Array, right: &str) -> Result { let scalar = make_scalar(left.data_type(), right)?; $fn_datum(&left, &Scalar::new(&scalar)) } @@ -754,9 +741,7 @@ mod tests { test_utf8_scalar!( test_utf8_array_ilike_unicode, test_utf8_array_ilike_unicode_dyn, - vec![ - "FFkoß", "FFkoSS", "FFkoss", "FFkoS", "FFkos", "ffkoSS", "ffkoß", "FFKoSS" - ], + vec!["FFkoß", "FFkoSS", "FFkoss", "FFkoS", "FFkos", "ffkoSS", "ffkoß", "FFKoSS"], "FFkoSS", ilike_utf8_scalar, ilike_utf8_scalar_dyn, diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 162e3c75027d..fe288f9de808 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -69,8 +69,7 @@ impl<'a> Predicate<'a> { && !pattern[..pattern.len() - 1].contains(is_like_pattern) { return Ok(Self::IStartsWithAscii(&pattern[..pattern.len() - 1])); - } else if pattern.starts_with('%') && !pattern[1..].contains(is_like_pattern) - { + } else if pattern.starts_with('%') && !pattern[1..].contains(is_like_pattern) { return Ok(Self::IEndsWithAscii(&pattern[1..])); } } @@ -110,17 +109,15 @@ impl<'a> Predicate<'a> { Predicate::Contains(v) => { BooleanArray::from_unary(array, |haystack| haystack.contains(v) != negate) } - Predicate::StartsWith(v) => BooleanArray::from_unary(array, |haystack| { - haystack.starts_with(v) != negate - }), - Predicate::IStartsWithAscii(v) => { - BooleanArray::from_unary(array, |haystack| { - starts_with_ignore_ascii_case(haystack, v) != negate - }) + Predicate::StartsWith(v) => { + BooleanArray::from_unary(array, |haystack| haystack.starts_with(v) != negate) } - Predicate::EndsWith(v) => BooleanArray::from_unary(array, |haystack| { - haystack.ends_with(v) != negate + Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array, |haystack| { + starts_with_ignore_ascii_case(haystack, v) != negate }), + Predicate::EndsWith(v) => { + BooleanArray::from_unary(array, |haystack| haystack.ends_with(v) != negate) + } Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, |haystack| { ends_with_ignore_ascii_case(haystack, v) != negate }), diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs index af4d66f97fd0..34bb1b0b4c41 100644 --- a/arrow-string/src/regexp.rs +++ b/arrow-string/src/regexp.rs @@ -41,8 +41,7 @@ pub fn regexp_is_match_utf8( ) -> Result { if array.len() != regex_array.len() { return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), + "Cannot perform comparison operation on arrays of different length".to_string(), )); } let nulls = NullBuffer::union(array.nulls(), regex_array.nulls()); @@ -51,14 +50,17 @@ pub fn regexp_is_match_utf8( let mut result = BooleanBufferBuilder::new(array.len()); let complete_pattern = match 
flags_array { - Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( - |(pattern, flags)| { - pattern.map(|pattern| match flags { - Some(flag) => format!("(?{flag}){pattern}"), - None => pattern.to_string(), - }) - }, - )) as Box>>, + Some(flags) => Box::new( + regex_array + .iter() + .zip(flags.iter()) + .map(|(pattern, flags)| { + pattern.map(|pattern| match flags { + Some(flag) => format!("(?{flag}){pattern}"), + None => pattern.to_string(), + }) + }), + ) as Box>>, None => Box::new( regex_array .iter() @@ -178,19 +180,21 @@ pub fn regexp_match( flags_array: Option<&GenericStringArray>, ) -> Result { let mut patterns: HashMap = HashMap::new(); - let builder: GenericStringBuilder = - GenericStringBuilder::with_capacity(0, 0); + let builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); let mut list_builder = ListBuilder::new(builder); let complete_pattern = match flags_array { - Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( - |(pattern, flags)| { - pattern.map(|pattern| match flags { - Some(value) => format!("(?{value}){pattern}"), - None => pattern.to_string(), - }) - }, - )) as Box>>, + Some(flags) => Box::new( + regex_array + .iter() + .zip(flags.iter()) + .map(|(pattern, flags)| { + pattern.map(|pattern| match flags { + Some(value) => format!("(?{value}){pattern}"), + None => pattern.to_string(), + }) + }), + ) as Box>>, None => Box::new( regex_array .iter() @@ -290,8 +294,7 @@ mod tests { let pattern = StringArray::from(vec![r"x.*-(\d*)-.*"; 4]); let flags = StringArray::from(vec!["i"; 4]); let actual = regexp_match(&array, &pattern, Some(&flags)).unwrap(); - let elem_builder: GenericStringBuilder = - GenericStringBuilder::with_capacity(0, 0); + let elem_builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); let mut expected_builder = ListBuilder::new(elem_builder); expected_builder.append(false); expected_builder.values().append_value("7"); diff --git a/arrow-string/src/substring.rs b/arrow-string/src/substring.rs index dc0dfdcbb4ad..f5fe811032fb 100644 --- a/arrow-string/src/substring.rs +++ b/arrow-string/src/substring.rs @@ -225,11 +225,7 @@ pub fn substring_by_char( /// * `length` - the char length of the substring /// /// Return the `start` and `end` offset (by byte) of the substring -fn get_start_end_offset( - val: &str, - start: usize, - length: Option, -) -> (usize, usize) { +fn get_start_end_offset(val: &str, start: usize, length: Option) -> (usize, usize) { let len = val.len(); let mut offset_char_iter = val.char_indices(); let start_offset = offset_char_iter @@ -279,8 +275,7 @@ where }; // start and end offsets of all substrings - let mut new_starts_ends: Vec<(T::Offset, T::Offset)> = - Vec::with_capacity(array.len()); + let mut new_starts_ends: Vec<(T::Offset, T::Offset)> = Vec::with_capacity(array.len()); let mut new_offsets: Vec = Vec::with_capacity(array.len() + 1); let mut len_so_far = zero; new_offsets.push(zero); @@ -659,8 +654,7 @@ mod tests { fn with_nulls_generic_string() { let input = vec![Some("hello"), None, Some("word")]; // all-nulls array is always identical - let base_case = - gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None])); + let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None])); let cases = gen_test_cases!( input, // identity @@ -781,8 +775,7 @@ mod tests { fn with_nulls_generic_string_by_char() { let input = vec![Some("hello"), None, Some("Γ ⊢x:T")]; // all-nulls array is always identical - let base_case = - 
gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None])); + let base_case = gen_test_cases!(vec![None, None, None], (0, None, vec![None, None, None])); let cases = gen_test_cases!( input, // identity @@ -864,10 +857,8 @@ mod tests { let values = "S→T = Πx:S.T"; let offsets = &[ O::zero(), - O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap()) - .unwrap(), - O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap()) - .unwrap(), + O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap()).unwrap(), + O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap()).unwrap(), O::from_usize(values.len()).unwrap(), ]; // set the first and third element to be valid @@ -928,8 +919,7 @@ mod tests { let dict_array: DictionaryArray = data.clone().into_iter().collect(); - let expected: Vec> = - data.iter().map(|opt| opt.map(|s| &s[1..3])).collect(); + let expected: Vec> = data.iter().map(|opt| opt.map(|s| &s[1..3])).collect(); let res = substring(&dict_array, 1, Some(2)).unwrap(); let actual = res.as_any().downcast_ref::>().unwrap(); diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs index 529205e7e28f..531462f2d8b5 100644 --- a/arrow/benches/array_data_validate.rs +++ b/arrow/benches/array_data_validate.rs @@ -53,8 +53,7 @@ fn validate_benchmark(c: &mut Criterion) { b.iter(|| validate_utf8_array(&str_arr)) }); - let byte_array = - BinaryArray::from_iter_values(std::iter::repeat(b"test").take(20000)); + let byte_array = BinaryArray::from_iter_values(std::iter::repeat(b"test").take(20000)); c.bench_function("byte_array_to_string_array 20000", |b| { b.iter(|| StringArray::from(BinaryArray::from(byte_array.to_data()))) }); diff --git a/arrow/benches/array_from_vec.rs b/arrow/benches/array_from_vec.rs index 229ac0b87d41..5fce3f113e43 100644 --- a/arrow/benches/array_from_vec.rs +++ b/arrow/benches/array_from_vec.rs @@ -70,9 +70,7 @@ fn struct_array_from_vec( let strings: ArrayRef = Arc::new(StringArray::from(strings.to_owned())); let ints: ArrayRef = Arc::new(Int32Array::from(ints.to_owned())); - criterion::black_box( - StructArray::try_from(vec![(field1, strings), (field2, ints)]).unwrap(), - ); + criterion::black_box(StructArray::try_from(vec![(field1, strings), (field2, ints)]).unwrap()); } fn decimal128_array_from_vec(array: &[Option]) { diff --git a/arrow/benches/bitwise_kernel.rs b/arrow/benches/bitwise_kernel.rs index 741eb96125a2..8604ea97eb3c 100644 --- a/arrow/benches/bitwise_kernel.rs +++ b/arrow/benches/bitwise_kernel.rs @@ -19,8 +19,8 @@ extern crate criterion; use arrow::compute::kernels::bitwise::{ - bitwise_and, bitwise_and_scalar, bitwise_not, bitwise_or, bitwise_or_scalar, - bitwise_xor, bitwise_xor_scalar, + bitwise_and, bitwise_and_scalar, bitwise_not, bitwise_or, bitwise_or_scalar, bitwise_xor, + bitwise_xor_scalar, }; use arrow::datatypes::Int64Type; use criterion::{black_box, Criterion}; @@ -40,9 +40,7 @@ fn bitwise_array_benchmark(c: &mut Criterion) { // array and let mut group = c.benchmark_group("bench bitwise array: and"); group.bench_function("bitwise array and, no nulls", |b| { - b.iter(|| { - black_box(bitwise_and(&left_without_null, &right_without_null).unwrap()) - }) + b.iter(|| black_box(bitwise_and(&left_without_null, &right_without_null).unwrap())) }); group.bench_function("bitwise array and, 20% nulls", |b| { b.iter(|| black_box(bitwise_and(&left_with_null, &right_with_null).unwrap())) @@ -60,9 +58,7 @@ fn bitwise_array_benchmark(c: &mut Criterion) { // xor 
let mut group = c.benchmark_group("bench bitwise: xor"); group.bench_function("bitwise array xor, no nulls", |b| { - b.iter(|| { - black_box(bitwise_xor(&left_without_null, &right_without_null).unwrap()) - }) + b.iter(|| black_box(bitwise_xor(&left_without_null, &right_without_null).unwrap())) }); group.bench_function("bitwise array xor, 20% nulls", |b| { b.iter(|| black_box(bitwise_xor(&left_with_null, &right_with_null).unwrap())) diff --git a/arrow/benches/buffer_bit_ops.rs b/arrow/benches/buffer_bit_ops.rs index 68b22df4b134..ab122ac94165 100644 --- a/arrow/benches/buffer_bit_ops.rs +++ b/arrow/benches/buffer_bit_ops.rs @@ -22,9 +22,7 @@ use criterion::{Criterion, Throughput}; extern crate arrow; -use arrow::buffer::{ - buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer, -}; +use arrow::buffer::{buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer}; /// Helper function to create arrays fn create_buffer(size: usize) -> Buffer { @@ -82,14 +80,10 @@ fn bit_ops_benchmark(c: &mut Criterion) { .bench_function("and", |b| b.iter(|| bench_buffer_and(&left, &right))) .bench_function("or", |b| b.iter(|| bench_buffer_or(&left, &right))) .bench_function("and_with_offset", |b| { - b.iter(|| { - bench_buffer_and_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5) - }) + b.iter(|| bench_buffer_and_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5)) }) .bench_function("or_with_offset", |b| { - b.iter(|| { - bench_buffer_or_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5) - }) + b.iter(|| bench_buffer_or_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5)) }); c.benchmark_group("buffer_unary_ops") diff --git a/arrow/benches/buffer_create.rs b/arrow/benches/buffer_create.rs index d628e031ce6f..e7d24c2166d7 100644 --- a/arrow/benches/buffer_create.rs +++ b/arrow/benches/buffer_create.rs @@ -49,8 +49,8 @@ fn mutable_buffer_iter_bitset(data: &[Vec]) -> Vec { criterion::black_box({ data.iter() .map(|datum| { - let mut result = MutableBuffer::new((data.len() + 7) / 8) - .with_bitset(datum.len(), false); + let mut result = + MutableBuffer::new((data.len() + 7) / 8).with_bitset(datum.len(), false); for (i, value) in datum.iter().enumerate() { if *value { unsafe { @@ -148,10 +148,7 @@ fn benchmark(c: &mut Criterion) { c.bench_function("mutable iter extend_from_slice", |b| { b.iter(|| { - mutable_iter_extend_from_slice( - criterion::black_box(&data), - criterion::black_box(0), - ) + mutable_iter_extend_from_slice(criterion::black_box(&data), criterion::black_box(0)) }) }); c.bench_function("mutable", |b| { @@ -163,9 +160,7 @@ fn benchmark(c: &mut Criterion) { }); c.bench_function("mutable prepared", |b| { - b.iter(|| { - mutable_buffer(criterion::black_box(&data), criterion::black_box(byte_cap)) - }) + b.iter(|| mutable_buffer(criterion::black_box(&data), criterion::black_box(byte_cap))) }); c.bench_function("from_slice", |b| { diff --git a/arrow/benches/builder.rs b/arrow/benches/builder.rs index 8cb226e89056..87a02e7ad1fd 100644 --- a/arrow/benches/builder.rs +++ b/arrow/benches/builder.rs @@ -131,9 +131,8 @@ fn bench_decimal256(c: &mut Criterion) { let mut rng = rand::thread_rng(); let mut decimal_builder = Decimal256Builder::with_capacity(BATCH_SIZE); for _ in 0..BATCH_SIZE { - decimal_builder.append_value(i256::from_i128( - rng.gen_range::(0..99999999999), - )); + decimal_builder + .append_value(i256::from_i128(rng.gen_range::(0..99999999999))); } black_box( decimal_builder diff --git a/arrow/benches/csv_reader.rs b/arrow/benches/csv_reader.rs index 
5a91dfe0a6ff..b5afac1f6a46 100644 --- a/arrow/benches/csv_reader.rs +++ b/arrow/benches/csv_reader.rs @@ -76,8 +76,7 @@ fn criterion_benchmark(c: &mut Criterion) { let cols = vec![Arc::new(values) as ArrayRef]; do_bench(c, "4096 u64(0)", cols); - let values = - Int64Array::from_iter_values((0..4096).map(|_| rng.gen_range(0..1024) - 512)); + let values = Int64Array::from_iter_values((0..4096).map(|_| rng.gen_range(0..1024) - 512)); let cols = vec![Arc::new(values) as ArrayRef]; do_bench(c, "4096 i64_small(0)", cols); @@ -103,20 +102,16 @@ fn criterion_benchmark(c: &mut Criterion) { let cols = vec![Arc::new(values) as ArrayRef]; do_bench(c, "4096 f64(0)", cols); - let cols = - vec![Arc::new(create_string_array_with_len::(4096, 0., 10)) as ArrayRef]; + let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 10)) as ArrayRef]; do_bench(c, "4096 string(10, 0)", cols); - let cols = - vec![Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef]; + let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef]; do_bench(c, "4096 string(30, 0)", cols); - let cols = - vec![Arc::new(create_string_array_with_len::(4096, 0., 100)) as ArrayRef]; + let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 100)) as ArrayRef]; do_bench(c, "4096 string(100, 0)", cols); - let cols = - vec![Arc::new(create_string_array_with_len::(4096, 0.5, 100)) as ArrayRef]; + let cols = vec![Arc::new(create_string_array_with_len::(4096, 0.5, 100)) as ArrayRef]; do_bench(c, "4096 string(100, 0.5)", cols); let cols = vec![ diff --git a/arrow/benches/csv_writer.rs b/arrow/benches/csv_writer.rs index 05c6c226c464..0c13428c9160 100644 --- a/arrow/benches/csv_writer.rs +++ b/arrow/benches/csv_writer.rs @@ -41,11 +41,7 @@ fn criterion_benchmark(c: &mut Criterion) { "consectetur adipiscing elit", "sed do eiusmod tempor", ]); - let c2 = PrimitiveArray::::from(vec![ - Some(123.564532), - None, - Some(-556132.25), - ]); + let c2 = PrimitiveArray::::from(vec![Some(123.564532), None, Some(-556132.25)]); let c3 = PrimitiveArray::::from(vec![3, 2, 1]); let c4 = BooleanArray::from(vec![Some(true), Some(false), None]); diff --git a/arrow/benches/decimal_validate.rs b/arrow/benches/decimal_validate.rs index a70da1d2cfb7..be812a225ca2 100644 --- a/arrow/benches/decimal_validate.rs +++ b/arrow/benches/decimal_validate.rs @@ -18,9 +18,7 @@ #[macro_use] extern crate criterion; -use arrow::array::{ - Array, Decimal128Array, Decimal128Builder, Decimal256Array, Decimal256Builder, -}; +use arrow::array::{Array, Decimal128Array, Decimal128Builder, Decimal256Array, Decimal256Builder}; use criterion::Criterion; use rand::Rng; diff --git a/arrow/benches/filter_kernels.rs b/arrow/benches/filter_kernels.rs index 9dd3e7ebba09..65726a271009 100644 --- a/arrow/benches/filter_kernels.rs +++ b/arrow/benches/filter_kernels.rs @@ -210,8 +210,7 @@ fn add_benchmark(c: &mut Criterion) { let field = Field::new("c1", data_array.data_type().clone(), true); let schema = Schema::new(vec![field]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data_array)]).unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(data_array)]).unwrap(); c.bench_function("filter single record batch", |b| { b.iter(|| filter_record_batch(&batch, &filter_array)) diff --git a/arrow/benches/interleave_kernels.rs b/arrow/benches/interleave_kernels.rs index 454d9140809c..0941f1e3fd33 100644 --- a/arrow/benches/interleave_kernels.rs +++ b/arrow/benches/interleave_kernels.rs @@ -74,8 +74,7 @@ fn 
add_benchmark(c: &mut Criterion) { let dict = create_dict_from_values::(1024, 0.0, &values); let values = create_string_array_with_len::(1024, 0.0, 20); - let sparse_dict = - create_sparse_dict_from_values::(1024, 0.0, &values, 10..20); + let sparse_dict = create_sparse_dict_from_values::(1024, 0.0, &values, 10..20); let cases: &[(&str, &dyn Array)] = &[ ("i32(0.0)", &i32), diff --git a/arrow/benches/lexsort.rs b/arrow/benches/lexsort.rs index 25b2279be8d6..bd2db1e5022d 100644 --- a/arrow/benches/lexsort.rs +++ b/arrow/benches/lexsort.rs @@ -52,12 +52,8 @@ impl std::fmt::Debug for Column { impl Column { fn generate(self, size: usize) -> ArrayRef { match self { - Column::RequiredI32 => { - Arc::new(create_primitive_array::(size, 0.)) - } - Column::OptionalI32 => { - Arc::new(create_primitive_array::(size, 0.2)) - } + Column::RequiredI32 => Arc::new(create_primitive_array::(size, 0.)), + Column::OptionalI32 => Arc::new(create_primitive_array::(size, 0.2)), Column::Required16CharString => { Arc::new(create_string_array_with_len::(size, 0., 16)) } diff --git a/arrow/benches/primitive_run_accessor.rs b/arrow/benches/primitive_run_accessor.rs index 868c314f9716..10c1e9ff39a9 100644 --- a/arrow/benches/primitive_run_accessor.rs +++ b/arrow/benches/primitive_run_accessor.rs @@ -25,16 +25,13 @@ fn criterion_benchmark(c: &mut Criterion) { let mut do_bench = |physical_array_len: usize, logical_array_len: usize| { group.bench_function( - format!( - "(run_array_len:{logical_array_len}, physical_array_len:{physical_array_len})"), + format!("(run_array_len:{logical_array_len}, physical_array_len:{physical_array_len})"), |b| { let run_array = create_primitive_run_array::( logical_array_len, physical_array_len, ); - let typed = run_array - .downcast::>() - .unwrap(); + let typed = run_array.downcast::>().unwrap(); b.iter(|| { for i in 0..logical_array_len { let _ = unsafe { typed.value_unchecked(i) }; diff --git a/arrow/benches/primitive_run_take.rs b/arrow/benches/primitive_run_take.rs index 82ff35949e79..c10c16bfee3a 100644 --- a/arrow/benches/primitive_run_take.rs +++ b/arrow/benches/primitive_run_take.rs @@ -41,9 +41,7 @@ fn create_random_index(size: usize, null_density: f32, max_value: usize) -> UInt fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("primitive_run_take"); - let mut do_bench = |physical_array_len: usize, - logical_array_len: usize, - take_len: usize| { + let mut do_bench = |physical_array_len: usize, logical_array_len: usize, take_len: usize| { let run_array = create_primitive_run_array::( logical_array_len, physical_array_len, diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs index bde117e3ec3e..cb7455939e0b 100644 --- a/arrow/benches/row_format.rs +++ b/arrow/benches/row_format.rs @@ -63,36 +63,28 @@ fn row_bench(c: &mut Criterion) { let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; do_bench(c, "4096 i64(0)", cols); - let cols = - vec![Arc::new(create_string_array_with_len::(4096, 0., 10)) as ArrayRef]; + let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 10)) as ArrayRef]; do_bench(c, "4096 string(10, 0)", cols); - let cols = - vec![Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef]; + let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 30)) as ArrayRef]; do_bench(c, "4096 string(30, 0)", cols); - let cols = - vec![Arc::new(create_string_array_with_len::(4096, 0., 100)) as ArrayRef]; + let cols = vec![Arc::new(create_string_array_with_len::(4096, 0., 100)) as 
ArrayRef]; do_bench(c, "4096 string(100, 0)", cols); - let cols = - vec![Arc::new(create_string_array_with_len::(4096, 0.5, 100)) as ArrayRef]; + let cols = vec![Arc::new(create_string_array_with_len::(4096, 0.5, 100)) as ArrayRef]; do_bench(c, "4096 string(100, 0.5)", cols); - let cols = - vec![Arc::new(create_string_dict_array::(4096, 0., 10)) as ArrayRef]; + let cols = vec![Arc::new(create_string_dict_array::(4096, 0., 10)) as ArrayRef]; do_bench(c, "4096 string_dictionary(10, 0)", cols); - let cols = - vec![Arc::new(create_string_dict_array::(4096, 0., 30)) as ArrayRef]; + let cols = vec![Arc::new(create_string_dict_array::(4096, 0., 30)) as ArrayRef]; do_bench(c, "4096 string_dictionary(30, 0)", cols); - let cols = - vec![Arc::new(create_string_dict_array::(4096, 0., 100)) as ArrayRef]; + let cols = vec![Arc::new(create_string_dict_array::(4096, 0., 100)) as ArrayRef]; do_bench(c, "4096 string_dictionary(100, 0)", cols.clone()); - let cols = - vec![Arc::new(create_string_dict_array::(4096, 0.5, 100)) as ArrayRef]; + let cols = vec![Arc::new(create_string_dict_array::(4096, 0.5, 100)) as ArrayRef]; do_bench(c, "4096 string_dictionary(100, 0.5)", cols.clone()); let values = create_string_array_with_len::(10, 0., 10); diff --git a/arrow/benches/sort_kernel.rs b/arrow/benches/sort_kernel.rs index 63e10e0528ba..b6578d8d8026 100644 --- a/arrow/benches/sort_kernel.rs +++ b/arrow/benches/sort_kernel.rs @@ -122,10 +122,8 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_sort_to_indices(&arr, None)) }); - let run_encoded_array = create_primitive_run_array::( - 2usize.pow(12), - 2usize.pow(10), - ); + let run_encoded_array = + create_primitive_run_array::(2usize.pow(12), 2usize.pow(10)); c.bench_function("sort primitive run 2^12", |b| { b.iter(|| bench_sort(&run_encoded_array)) diff --git a/arrow/benches/string_run_builder.rs b/arrow/benches/string_run_builder.rs index dda0f35b801f..b4457b74dada 100644 --- a/arrow/benches/string_run_builder.rs +++ b/arrow/benches/string_run_builder.rs @@ -23,9 +23,7 @@ use criterion::{criterion_group, criterion_main, Criterion}; fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("string_run_builder"); - let mut do_bench = |physical_array_len: usize, - logical_array_len: usize, - string_len: usize| { + let mut do_bench = |physical_array_len: usize, logical_array_len: usize, string_len: usize| { group.bench_function( format!( "(run_array_len:{logical_array_len}, physical_array_len:{physical_array_len}, string_len: {string_len})", diff --git a/arrow/benches/string_run_iterator.rs b/arrow/benches/string_run_iterator.rs index cfa44e66e30a..ac5cf7838408 100644 --- a/arrow/benches/string_run_iterator.rs +++ b/arrow/benches/string_run_iterator.rs @@ -47,9 +47,7 @@ fn build_strings_runs( fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("string_run_iterator"); - let mut do_bench = |physical_array_len: usize, - logical_array_len: usize, - string_len: usize| { + let mut do_bench = |physical_array_len: usize, logical_array_len: usize, string_len: usize| { group.bench_function( format!( "(run_array_len:{logical_array_len}, physical_array_len:{physical_array_len}, string_len: {string_len})"), diff --git a/arrow/benches/take_kernels.rs b/arrow/benches/take_kernels.rs index 362b3f5cbf3c..9c3f1eb40909 100644 --- a/arrow/benches/take_kernels.rs +++ b/arrow/benches/take_kernels.rs @@ -47,9 +47,7 @@ fn bench_take(values: &dyn Array, indices: &UInt32Array) { } fn bench_take_bounds_check(values: &dyn Array, indices: 
&UInt32Array) { - criterion::black_box( - take(values, indices, Some(TakeOptions { check_bounds: true })).unwrap(), - ); + criterion::black_box(take(values, indices, Some(TakeOptions { check_bounds: true })).unwrap()); } fn add_benchmark(c: &mut Criterion) { diff --git a/arrow/examples/builders.rs b/arrow/examples/builders.rs index 250f5c39af10..ad6b879642ab 100644 --- a/arrow/examples/builders.rs +++ b/arrow/examples/builders.rs @@ -59,8 +59,7 @@ fn main() { vec![Some(1550902545147), None, Some(1550902545147)].into(); println!("{date_array:?}"); - let time_array: PrimitiveArray = - (0..100).collect::>().into(); + let time_array: PrimitiveArray = (0..100).collect::>().into(); println!("{time_array:?}"); // We can build arrays directly from the underlying buffers. @@ -98,8 +97,7 @@ fn main() { let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -119,8 +117,7 @@ fn main() { let struct_array = StructArray::from(vec![ ( Arc::new(Field::new("b", DataType::Boolean, false)), - Arc::new(BooleanArray::from(vec![false, false, true, true])) - as Arc, + Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc, ), ( Arc::new(Field::new("c", DataType::Int32, false)), diff --git a/arrow/examples/dynamic_types.rs b/arrow/examples/dynamic_types.rs index 21edb235aaa7..4c01f0ea8c72 100644 --- a/arrow/examples/dynamic_types.rs +++ b/arrow/examples/dynamic_types.rs @@ -62,8 +62,7 @@ fn main() -> Result<()> { ]); // build a record batch - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id), Arc::new(nested)])?; + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id), Arc::new(nested)])?; print_batches(&[batch.clone()]).unwrap(); diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index 639ff980ebc5..e05c256d0128 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -55,8 +55,8 @@ mod tests { use crate::util::bit_util; use crate::{ array::{ - Array, ArrayData, BooleanArray, FixedSizeBinaryArray, Int64Array, - StructArray, UInt32Array, UInt64Array, + Array, ArrayData, BooleanArray, FixedSizeBinaryArray, Int64Array, StructArray, + UInt32Array, UInt64Array, }, datatypes::{DataType, Field}, ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema}, @@ -102,8 +102,7 @@ mod tests { let inner = StructArray::from(vec![ ( Arc::new(Field::new("a1", DataType::Boolean, false)), - Arc::new(BooleanArray::from(vec![true, true, false, false])) - as Arc, + Arc::new(BooleanArray::from(vec![true, true, false, false])) as Arc, ), ( Arc::new(Field::new("a2", DataType::UInt32, false)), @@ -118,8 +117,7 @@ mod tests { ), ( Arc::new(Field::new("b", DataType::Boolean, false)), - Arc::new(BooleanArray::from(vec![false, false, true, true])) - as Arc, + Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc, ), ( Arc::new(Field::new("c", DataType::UInt32, false)), @@ -170,8 +168,7 @@ mod tests { Some(vec![30, 30, 30]), None, ]; - let array = - FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?; + let array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?; let data = array.into_data(); test_round_trip(&data) @@ -244,10 +241,8 @@ mod tests { let mut validity_bits: [u8; 1] = [0; 1]; 
bit_util::set_bit(&mut validity_bits, 2); - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("f", inner_list_data_type, false)), - 2, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("f", inner_list_data_type, false)), 2); let list_data = ArrayData::builder(list_data_type) .len(4) .null_bit_buffer(Some(Buffer::from(validity_bits))) diff --git a/arrow/src/compute/kernels.rs b/arrow/src/compute/kernels.rs index 35ad80e009cc..4eeb5892c97c 100644 --- a/arrow/src/compute/kernels.rs +++ b/arrow/src/compute/kernels.rs @@ -17,9 +17,7 @@ //! Computation kernels on Arrow Arrays -pub use arrow_arith::{ - aggregate, arithmetic, arity, bitwise, boolean, numeric, temporal, -}; +pub use arrow_arith::{aggregate, arithmetic, arity, bitwise, boolean, numeric, temporal}; pub use arrow_cast::cast; pub use arrow_cast::parse as cast_utils; pub use arrow_ord::{cmp, partition, rank, sort}; diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index bc5b7d500b18..894e046e621f 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -27,8 +27,8 @@ pub use arrow_array::{ArrowNativeTypeOp, ArrowNumericType, ArrowPrimitiveType}; pub use arrow_buffer::{i256, ArrowNativeType, ToByteSlice}; pub use arrow_data::decimal::*; pub use arrow_schema::{ - DataType, Field, FieldRef, Fields, IntervalUnit, Schema, SchemaBuilder, SchemaRef, - TimeUnit, UnionFields, UnionMode, + DataType, Field, FieldRef, Fields, IntervalUnit, Schema, SchemaBuilder, SchemaRef, TimeUnit, + UnionFields, UnionMode, }; #[cfg(feature = "ffi")] diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 7fbbaa7a3907..c13d4c6e5dff 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -303,8 +303,7 @@ impl<'a> ArrowArray<'a> { .map(|index| { let len = self.buffer_len(index, dt)?; - match unsafe { create_buffer(self.owner.clone(), self.array, index, len) } - { + match unsafe { create_buffer(self.owner.clone(), self.array, index, len) } { Some(buf) => Ok(buf), None if len == 0 => { // Null data buffer, which Rust doesn't allow. 
So create @@ -405,7 +404,9 @@ impl<'a> ArrowArray<'a> { owner: self.owner, }), (None, None) => None, - _ => panic!("Dictionary should both be set or not set in FFI_ArrowArray and FFI_ArrowSchema") + _ => panic!( + "Dictionary should both be set or not set in FFI_ArrowArray and FFI_ArrowSchema" + ), } } } @@ -424,9 +425,9 @@ mod tests { use crate::array::{ make_array, Array, ArrayData, BooleanArray, Decimal128Array, DictionaryArray, - DurationSecondArray, FixedSizeBinaryArray, FixedSizeListArray, - GenericBinaryArray, GenericListArray, GenericStringArray, Int32Array, MapArray, - OffsetSizeTrait, Time32MillisecondArray, TimestampMillisecondArray, UInt32Array, + DurationSecondArray, FixedSizeBinaryArray, FixedSizeListArray, GenericBinaryArray, + GenericListArray, GenericStringArray, Int32Array, MapArray, OffsetSizeTrait, + Time32MillisecondArray, TimestampMillisecondArray, UInt32Array, }; use crate::compute::kernels; use crate::datatypes::{Field, Int8Type}; @@ -468,9 +469,8 @@ mod tests { // We can read them back to memory // SAFETY: // Pointers are aligned and valid - let data = unsafe { - from_ffi(std::ptr::read(array_ptr), &std::ptr::read(schema_ptr)).unwrap() - }; + let data = + unsafe { from_ffi(std::ptr::read(array_ptr), &std::ptr::read(schema_ptr)).unwrap() }; let array = Int32Array::from(data); assert_eq!(array, Int32Array::from(vec![1, 2, 3])); @@ -533,8 +533,7 @@ mod tests { fn test_generic_string() -> Result<()> { // create an array natively - let array = - GenericStringArray::::from(vec![Some("a"), None, Some("aaa")]); + let array = GenericStringArray::::from(vec![Some("a"), None, Some("aaa")]); // export it let (array, schema) = to_ffi(&array.to_data())?; @@ -733,14 +732,7 @@ mod tests { // verify assert_eq!( array, - &Time32MillisecondArray::from(vec![ - None, - Some(1), - Some(2), - None, - Some(1), - Some(2) - ]) + &Time32MillisecondArray::from(vec![None, Some(1), Some(2), None, Some(1), Some(2)]) ); // (drop/release) @@ -769,14 +761,7 @@ mod tests { // verify assert_eq!( array, - &TimestampMillisecondArray::from(vec![ - None, - Some(1), - Some(2), - None, - Some(1), - Some(2) - ]) + &TimestampMillisecondArray::from(vec![None, Some(1), Some(2), None, Some(1), Some(2)]) ); // (drop/release) @@ -793,8 +778,7 @@ mod tests { Some(vec![30, 30, 30]), None, ]; - let array = - FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?; + let array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(values.into_iter(), 3)?; // export it let (array, schema) = to_ffi(&array.to_data())?; @@ -978,14 +962,7 @@ mod tests { // verify assert_eq!( array, - &DurationSecondArray::from(vec![ - None, - Some(1), - Some(2), - None, - Some(1), - Some(2) - ]) + &DurationSecondArray::from(vec![None, Some(1), Some(2), None, Some(1), Some(2)]) ); // (drop/release) @@ -1001,12 +978,9 @@ mod tests { // [[a, b, c], [d, e, f], [g, h]] let entry_offsets = [0, 3, 6, 8]; - let map_array = MapArray::new_from_strings( - keys.clone().into_iter(), - &values_data, - &entry_offsets, - ) - .unwrap(); + let map_array = + MapArray::new_from_strings(keys.clone().into_iter(), &values_data, &entry_offsets) + .unwrap(); // export it let (array, schema) = to_ffi(&map_array.to_data())?; diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index 865a8d0e0a29..73cf28d66dab 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -83,16 +83,10 @@ const ENOSYS: i32 = 78; #[derive(Debug)] pub struct FFI_ArrowArrayStream { pub get_schema: Option< - unsafe extern "C" fn( - arg1: *mut 
FFI_ArrowArrayStream, - out: *mut FFI_ArrowSchema, - ) -> c_int, + unsafe extern "C" fn(arg1: *mut FFI_ArrowArrayStream, out: *mut FFI_ArrowSchema) -> c_int, >, pub get_next: Option< - unsafe extern "C" fn( - arg1: *mut FFI_ArrowArrayStream, - out: *mut FFI_ArrowArray, - ) -> c_int, + unsafe extern "C" fn(arg1: *mut FFI_ArrowArrayStream, out: *mut FFI_ArrowArray) -> c_int, >, pub get_last_error: Option *const c_char>, @@ -212,8 +206,7 @@ impl ExportedArrayStream { } Err(ref err) => { private_data.last_error = Some( - CString::new(err.to_string()) - .expect("Error string has a null byte in it."), + CString::new(err.to_string()).expect("Error string has a null byte in it."), ); get_error_code(err) } @@ -240,8 +233,7 @@ impl ExportedArrayStream { } else { let err = &next_batch.unwrap_err(); private_data.last_error = Some( - CString::new(err.to_string()) - .expect("Error string has a null byte in it."), + CString::new(err.to_string()).expect("Error string has a null byte in it."), ); get_error_code(err) } @@ -341,8 +333,7 @@ impl Iterator for ArrowArrayStreamReader { fn next(&mut self) -> Option { let mut array = FFI_ArrowArray::empty(); - let ret_code = - unsafe { self.stream.get_next.unwrap()(&mut self.stream, &mut array) }; + let ret_code = unsafe { self.stream.get_next.unwrap()(&mut self.stream, &mut array) }; if ret_code == 0 { // The end of stream has been reached @@ -517,8 +508,7 @@ mod tests { fn test_error_import() -> Result<()> { let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)])); - let iter = - Box::new(vec![Err(ArrowError::MemoryError("".to_string()))].into_iter()); + let iter = Box::new(vec![Err(ArrowError::MemoryError("".to_string()))].into_iter()); let reader = TestRecordBatchReader::new(schema.clone(), iter); diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index f4d0585fa6b5..78e2363e4825 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -375,8 +375,7 @@ pub mod pyarrow; pub mod record_batch { pub use arrow_array::{ - RecordBatch, RecordBatchIterator, RecordBatchOptions, RecordBatchReader, - RecordBatchWriter, + RecordBatch, RecordBatchIterator, RecordBatchOptions, RecordBatchReader, RecordBatchWriter, }; } pub use arrow_array::temporal_conversions; diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index ab0ea8ef8d74..517c333addde 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -71,9 +71,7 @@ use crate::datatypes::{DataType, Field, Schema}; use crate::error::ArrowError; use crate::ffi; use crate::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; -use crate::ffi_stream::{ - export_reader_into_raw, ArrowArrayStreamReader, FFI_ArrowArrayStream, -}; +use crate::ffi_stream::{export_reader_into_raw, ArrowArrayStreamReader, FFI_ArrowArrayStream}; use crate::record_batch::RecordBatch; import_exception!(pyarrow, ArrowException); @@ -138,8 +136,7 @@ impl ToPyArrow for DataType { let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; let module = py.import("pyarrow")?; let class = module.getattr("DataType")?; - let dtype = - class.call_method1("_import_from_c", (c_schema_ptr as Py_uintptr_t,))?; + let dtype = class.call_method1("_import_from_c", (c_schema_ptr as Py_uintptr_t,))?; Ok(dtype.into()) } } @@ -162,8 +159,7 @@ impl ToPyArrow for Field { let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; let module = py.import("pyarrow")?; let class = module.getattr("Field")?; - let dtype = - class.call_method1("_import_from_c", (c_schema_ptr as Py_uintptr_t,))?; + let dtype = class.call_method1("_import_from_c", (c_schema_ptr as 
Py_uintptr_t,))?; Ok(dtype.into()) } } @@ -186,8 +182,7 @@ impl ToPyArrow for Schema { let c_schema_ptr = &c_schema as *const FFI_ArrowSchema; let module = py.import("pyarrow")?; let class = module.getattr("Schema")?; - let schema = - class.call_method1("_import_from_c", (c_schema_ptr as Py_uintptr_t,))?; + let schema = class.call_method1("_import_from_c", (c_schema_ptr as Py_uintptr_t,))?; Ok(schema.into()) } } @@ -271,8 +266,7 @@ impl FromPyArrow for RecordBatch { impl ToPyArrow for RecordBatch { fn to_pyarrow(&self, py: Python) -> PyResult { // Workaround apache/arrow#37669 by returning RecordBatchIterator - let reader = - RecordBatchIterator::new(vec![Ok(self.clone())], self.schema().clone()); + let reader = RecordBatchIterator::new(vec![Ok(self.clone())], self.schema().clone()); let reader: Box = Box::new(reader); let py_reader = reader.into_pyarrow(py)?; py_reader.call_method0(py, "read_next_batch") diff --git a/arrow/src/tensor.rs b/arrow/src/tensor.rs index 299c4f2b8403..c2a262b399de 100644 --- a/arrow/src/tensor.rs +++ b/arrow/src/tensor.rs @@ -27,9 +27,7 @@ use crate::datatypes::*; use crate::error::{ArrowError, Result}; /// Computes the strides required assuming a row major memory layout -fn compute_row_major_strides( - shape: &[usize], -) -> Result> { +fn compute_row_major_strides(shape: &[usize]) -> Result> { let mut remaining_bytes = mem::size_of::(); for i in shape { @@ -52,9 +50,7 @@ fn compute_row_major_strides( } /// Computes the strides required assuming a column major memory layout -fn compute_column_major_strides( - shape: &[usize], -) -> Result> { +fn compute_column_major_strides(shape: &[usize]) -> Result> { let mut remaining_bytes = mem::size_of::(); let mut strides = Vec::::new(); @@ -128,8 +124,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { None => { if buffer.len() != mem::size_of::() { return Err(ArrowError::InvalidArgumentError( - "underlying buffer should only contain a single tensor element" - .to_string(), + "underlying buffer should only contain a single tensor element".to_string(), )); } @@ -158,8 +153,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { if let Some(ref n) = names { if n.len() != s.len() { return Err(ArrowError::InvalidArgumentError( - "number of dimensions and number of dimension names differ" - .to_string(), + "number of dimensions and number of dimension names differ".to_string(), )); } } @@ -167,8 +161,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { let total_elements: usize = s.iter().product(); if total_elements != (buffer.len() / mem::size_of::()) { return Err(ArrowError::InvalidArgumentError( - "number of elements in buffer does not match dimensions" - .to_string(), + "number of elements in buffer does not match dimensions".to_string(), )); } } @@ -185,8 +178,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { Some(st) } else { return Err(ArrowError::InvalidArgumentError( - "the input stride does not match the selected shape" - .to_string(), + "the input stride does not match the selected shape".to_string(), )); } } else { @@ -306,9 +298,7 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { pub fn is_column_major(&self) -> Result { match self.shape { None => Ok(false), - Some(ref s) => { - Ok(Some(compute_column_major_strides::(s)?) == self.strides) - } + Some(ref s) => Ok(Some(compute_column_major_strides::(s)?) 
== self.strides), } } } @@ -434,8 +424,7 @@ mod tests { } let buf = builder.finish(); let names = vec!["Dim 1", "Dim 2"]; - let tensor = - Int64Tensor::new_column_major(buf, Some(vec![2, 4]), Some(names)).unwrap(); + let tensor = Int64Tensor::new_column_major(buf, Some(vec![2, 4]), Some(names)).unwrap(); assert_eq!(8, tensor.size()); assert_eq!(Some(vec![2_usize, 4]).as_ref(), tensor.shape()); assert_eq!(Some(vec![8_usize, 16]).as_ref(), tensor.strides()); @@ -455,8 +444,7 @@ mod tests { } let buf = builder.finish(); - let result = - Int32Tensor::try_new(buf, Some(vec![2, 8]), Some(vec![2, 8, 1]), None); + let result = Int32Tensor::try_new(buf, Some(vec![2, 8]), Some(vec![2, 8, 1]), None); if result.is_ok() { panic!("shape and stride dimensions are different") diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 5e5f4c6ee118..b3fb2d293a72 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -73,11 +73,7 @@ where } /// Creates an random (but fixed-seeded) array of a given size and null density -pub fn create_boolean_array( - size: usize, - null_density: f32, - true_density: f32, -) -> BooleanArray +pub fn create_boolean_array(size: usize, null_density: f32, true_density: f32) -> BooleanArray where Standard: Distribution, { @@ -238,11 +234,7 @@ pub fn create_binary_array( } /// Creates an random (but fixed-seeded) array of a given size and null density -pub fn create_fsb_array( - size: usize, - null_density: f32, - value_len: usize, -) -> FixedSizeBinaryArray { +pub fn create_fsb_array(size: usize, null_density: f32, value_len: usize) -> FixedSizeBinaryArray { let rng = &mut seedable_rng(); FixedSizeBinaryArray::try_from_sparse_iter_with_size( @@ -293,17 +285,15 @@ where K::Native: SampleUniform, { let mut rng = seedable_rng(); - let data_type = DataType::Dictionary( - Box::new(K::DATA_TYPE), - Box::new(values.data_type().clone()), - ); + let data_type = + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone())); let keys: Buffer = (0..size) .map(|_| rng.gen_range(key_range.clone())) .collect(); - let nulls: Option = (null_density != 0.) 
- .then(|| (0..size).map(|_| rng.gen_bool(null_density as _)).collect()); + let nulls: Option = + (null_density != 0.).then(|| (0..size).map(|_| rng.gen_bool(null_density as _)).collect()); let data = ArrayDataBuilder::new(data_type) .len(size) diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index c1094b127bba..5733fdf22add 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -85,8 +85,7 @@ pub fn create_random_array( Float64 => Arc::new(create_primitive_array::(size, null_density)), Timestamp(_, _) => { let int64_array = - Arc::new(create_primitive_array::(size, null_density)) - as ArrayRef; + Arc::new(create_primitive_array::(size, null_density)) as ArrayRef; return crate::compute::cast(&int64_array, field.data_type()); } Date32 => Arc::new(create_primitive_array::(size, null_density)), @@ -96,9 +95,10 @@ pub fn create_random_array( size, null_density, )) as ArrayRef, - TimeUnit::Millisecond => Arc::new(create_primitive_array::< - Time32MillisecondType, - >(size, null_density)), + TimeUnit::Millisecond => Arc::new(create_primitive_array::( + size, + null_density, + )), _ => { return Err(ArrowError::InvalidArgumentError(format!( "Unsupported unit {unit:?} for Time32" @@ -106,12 +106,14 @@ pub fn create_random_array( } }, Time64(unit) => match unit { - TimeUnit::Microsecond => Arc::new(create_primitive_array::< - Time64MicrosecondType, - >(size, null_density)) as ArrayRef, - TimeUnit::Nanosecond => Arc::new(create_primitive_array::< - Time64NanosecondType, - >(size, null_density)), + TimeUnit::Microsecond => Arc::new(create_primitive_array::( + size, + null_density, + )) as ArrayRef, + TimeUnit::Nanosecond => Arc::new(create_primitive_array::( + size, + null_density, + )), _ => { return Err(ArrowError::InvalidArgumentError(format!( "Unsupported unit {unit:?} for Time64" @@ -122,13 +124,9 @@ pub fn create_random_array( LargeUtf8 => Arc::new(create_string_array::(size, null_density)), Binary => Arc::new(create_binary_array::(size, null_density)), LargeBinary => Arc::new(create_binary_array::(size, null_density)), - FixedSizeBinary(len) => { - Arc::new(create_fsb_array(size, null_density, *len as usize)) - } + FixedSizeBinary(len) => Arc::new(create_fsb_array(size, null_density, *len as usize)), List(_) => create_random_list_array(field, size, null_density, true_density)?, - LargeList(_) => { - create_random_list_array(field, size, null_density, true_density)? 
- } + LargeList(_) => create_random_list_array(field, size, null_density, true_density)?, Struct(fields) => Arc::new(StructArray::try_from( fields .iter() @@ -138,9 +136,7 @@ pub fn create_random_array( }) .collect::>>()?, )?), - d @ Dictionary(_, value_type) - if crate::compute::can_cast_types(value_type, d) => - { + d @ Dictionary(_, value_type) if crate::compute::can_cast_types(value_type, d) => { let f = Field::new( field.name(), value_type.as_ref().clone(), @@ -189,8 +185,7 @@ fn create_random_list_array( }; // Create list's child data - let child_array = - create_random_array(list_field, child_len, null_density, true_density)?; + let child_array = create_random_array(list_field, child_len, null_density, true_density)?; let child_data = child_array.to_data(); // Create list's null buffers, if it is nullable let null_buffer = match field.is_nullable() { diff --git a/arrow/tests/arithmetic.rs b/arrow/tests/arithmetic.rs index 982420902cc3..81a19d4b5e20 100644 --- a/arrow/tests/arithmetic.rs +++ b/arrow/tests/arithmetic.rs @@ -26,8 +26,8 @@ use chrono::{DateTime, TimeZone}; #[test] fn test_temporal_array_timestamp_hour_with_timezone_using_chrono_tz() { - let a = TimestampSecondArray::from(vec![60 * 60 * 10]) - .with_timezone("Asia/Kolkata".to_string()); + let a = + TimestampSecondArray::from(vec![60 * 60 * 10]).with_timezone("Asia/Kolkata".to_string()); let b = hour(&a).unwrap(); assert_eq!(15, b.value(0)); } diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 43dc6dd0eb0a..bfe16db5cc4d 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -15,27 +15,22 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::builder::{ - PrimitiveDictionaryBuilder, StringDictionaryBuilder, UnionBuilder, -}; +use arrow_array::builder::{PrimitiveDictionaryBuilder, StringDictionaryBuilder, UnionBuilder}; use arrow_array::cast::AsArray; use arrow_array::types::{ - ArrowDictionaryKeyType, Decimal128Type, Decimal256Type, Int16Type, Int32Type, - Int64Type, Int8Type, TimestampMicrosecondType, UInt16Type, UInt32Type, UInt64Type, - UInt8Type, + ArrowDictionaryKeyType, Decimal128Type, Decimal256Type, Int16Type, Int32Type, Int64Type, + Int8Type, TimestampMicrosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; use arrow_array::{ - Array, ArrayRef, ArrowPrimitiveType, BinaryArray, BooleanArray, Date32Array, - Date64Array, Decimal128Array, DurationMicrosecondArray, DurationMillisecondArray, - DurationNanosecondArray, DurationSecondArray, FixedSizeBinaryArray, - FixedSizeListArray, Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, - Int64Array, Int8Array, IntervalDayTimeArray, IntervalMonthDayNanoArray, - IntervalYearMonthArray, LargeBinaryArray, LargeListArray, LargeStringArray, - ListArray, NullArray, PrimitiveArray, StringArray, StructArray, - Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, - Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, - UInt64Array, UInt8Array, UnionArray, + Array, ArrayRef, ArrowPrimitiveType, BinaryArray, BooleanArray, Date32Array, Date64Array, + Decimal128Array, DurationMicrosecondArray, DurationMillisecondArray, DurationNanosecondArray, + DurationSecondArray, FixedSizeBinaryArray, FixedSizeListArray, Float16Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray, + IntervalMonthDayNanoArray, 
IntervalYearMonthArray, LargeBinaryArray, LargeListArray, + LargeStringArray, ListArray, NullArray, PrimitiveArray, StringArray, StructArray, + Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, UnionArray, }; use arrow_buffer::{i256, Buffer}; use arrow_cast::pretty::pretty_format_columns; @@ -49,12 +44,8 @@ use std::sync::Arc; #[test] fn test_cast_timestamp_to_string() { - let a = TimestampMillisecondArray::from(vec![ - Some(864000000005), - Some(1545696000001), - None, - ]) - .with_timezone("UTC".to_string()); + let a = TimestampMillisecondArray::from(vec![Some(864000000005), Some(1545696000001), None]) + .with_timezone("UTC".to_string()); let array = Arc::new(a) as ArrayRef; dbg!(&array); let b = cast(&array, &DataType::Utf8).unwrap(); @@ -83,8 +74,7 @@ fn test_cast_timestamp_with_timezone_daylight_1() { let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None); let timestamp_array = cast(&string_array, &to_type).unwrap(); - let to_type = - DataType::Timestamp(TimeUnit::Microsecond, Some("America/New_York".into())); + let to_type = DataType::Timestamp(TimeUnit::Microsecond, Some("America/New_York".into())); let timestamp_array = cast(×tamp_array, &to_type).unwrap(); let string_array = cast(×tamp_array, &DataType::Utf8).unwrap(); @@ -102,8 +92,7 @@ fn test_cast_timestamp_with_timezone_daylight_2() { Some("2010-07-01T07:00:00.123456789"), None, ])); - let to_type = - DataType::Timestamp(TimeUnit::Millisecond, Some("America/New_York".into())); + let to_type = DataType::Timestamp(TimeUnit::Millisecond, Some("America/New_York".into())); let timestamp_array = cast(&string_array, &to_type).unwrap(); // Check intermediate representation is correct @@ -135,8 +124,7 @@ fn test_cast_timestamp_with_timezone_daylight_3() { Some("2010-07-01T00:00:00.123456789"), None, ])); - let to_type = - DataType::Timestamp(TimeUnit::Microsecond, Some("America/New_York".into())); + let to_type = DataType::Timestamp(TimeUnit::Microsecond, Some("America/New_York".into())); let timestamp_array = cast(&string_array, &to_type).unwrap(); // Check intermediate representation is correct @@ -220,8 +208,7 @@ fn get_arrays_of_all_types() -> Vec { Arc::new(StructArray::from(vec![ ( Arc::new(Field::new("a", DataType::Boolean, false)), - Arc::new(BooleanArray::from(vec![false, false, true, true])) - as Arc, + Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc, ), ( Arc::new(Field::new("b", DataType::Int32, false)), @@ -252,17 +239,9 @@ fn get_arrays_of_all_types() -> Vec { Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])), Arc::new(TimestampMicrosecondArray::from(vec![1000, 2000])), Arc::new(TimestampNanosecondArray::from(vec![1000, 2000])), - Arc::new( - TimestampSecondArray::from(vec![1000, 2000]).with_timezone(tz_name.clone()), - ), - Arc::new( - TimestampMillisecondArray::from(vec![1000, 2000]) - .with_timezone(tz_name.clone()), - ), - Arc::new( - TimestampMicrosecondArray::from(vec![1000, 2000]) - .with_timezone(tz_name.clone()), - ), + Arc::new(TimestampSecondArray::from(vec![1000, 2000]).with_timezone(tz_name.clone())), + Arc::new(TimestampMillisecondArray::from(vec![1000, 2000]).with_timezone(tz_name.clone())), + Arc::new(TimestampMicrosecondArray::from(vec![1000, 2000]).with_timezone(tz_name.clone())), Arc::new(TimestampNanosecondArray::from(vec![1000, 2000]).with_timezone(tz_name)), 
Arc::new(Date32Array::from(vec![1000, 2000])), Arc::new(Date64Array::from(vec![1000, 2000])), @@ -364,8 +343,7 @@ fn make_list_array() -> ListArray { let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -388,8 +366,7 @@ fn make_large_list_array() -> LargeListArray { let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 8]); // Construct a list array from the above two - let list_data_type = - DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -507,8 +484,7 @@ fn get_all_types() -> Vec { Decimal128(38, 0), ]; - let dictionary_key_types = - vec![Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64]; + let dictionary_key_types = vec![Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64]; let mut dictionary_types = dictionary_key_types .into_iter() @@ -564,9 +540,7 @@ fn test_timestamp_cast_utf8() { } fn format_timezone(tz: &str) -> Result { - let array = Arc::new( - TimestampSecondArray::from(vec![Some(11111111), None]).with_timezone(tz), - ); + let array = Arc::new(TimestampSecondArray::from(vec![Some(11111111), None]).with_timezone(tz)); Ok(pretty_format_columns("f", &[array])?.to_string()) } diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index 317287c102f2..9bd276428880 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -18,8 +18,8 @@ use arrow::array::{ make_array, Array, ArrayRef, BooleanArray, Decimal128Array, FixedSizeBinaryArray, FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, GenericStringArray, - Int32Array, Int32Builder, Int64Builder, ListArray, ListBuilder, NullArray, - OffsetSizeTrait, StringArray, StringDictionaryBuilder, StructArray, UnionBuilder, + Int32Array, Int32Builder, Int64Builder, ListArray, ListBuilder, NullArray, OffsetSizeTrait, + StringArray, StringDictionaryBuilder, StructArray, UnionBuilder, }; use arrow::datatypes::{Int16Type, Int32Type}; use arrow_array::builder::{StringBuilder, StructBuilder}; @@ -530,22 +530,10 @@ fn test_fixed_size_binary_null() { #[test] fn test_fixed_size_binary_offsets() { // Test the case where offset != 0 - let a = create_fixed_size_binary_array([ - Some(b"hello"), - None, - None, - Some(b"world"), - None, - None, - ]); - let b = create_fixed_size_binary_array([ - Some(b"hello"), - None, - None, - Some(b"arrow"), - None, - None, - ]); + let a = + create_fixed_size_binary_array([Some(b"hello"), None, None, Some(b"world"), None, None]); + let b = + create_fixed_size_binary_array([Some(b"hello"), None, None, Some(b"arrow"), None, None]); let a_slice = a.slice(0, 3); let b_slice = b.slice(0, 3); @@ -682,22 +670,10 @@ fn test_fixed_size_list_equal() { // Test the case where null_count > 0 #[test] fn test_fixed_list_null() { - let a = create_fixed_size_list_array([ - Some(&[1, 2, 3]), - None, - None, - Some(&[4, 5, 6]), - None, - None, - ]); - let b = create_fixed_size_list_array([ - Some(&[1, 2, 3]), - None, - None, - Some(&[4, 5, 6]), - None, - None, - ]); + let a = + create_fixed_size_list_array([Some(&[1, 2, 3]), None, None, Some(&[4, 5, 6]), 
None, None]); + let b = + create_fixed_size_list_array([Some(&[1, 2, 3]), None, None, Some(&[4, 5, 6]), None, None]); test_equal(&a, &b, true); let b = create_fixed_size_list_array([ @@ -710,14 +686,8 @@ fn test_fixed_list_null() { ]); test_equal(&a, &b, false); - let b = create_fixed_size_list_array([ - Some(&[1, 2, 3]), - None, - None, - Some(&[3, 6, 9]), - None, - None, - ]); + let b = + create_fixed_size_list_array([Some(&[1, 2, 3]), None, None, Some(&[3, 6, 9]), None, None]); test_equal(&a, &b, false); let b = create_fixed_size_list_array([None, Some(&[4, 5, 6]), None, None]); @@ -729,22 +699,10 @@ fn test_fixed_list_null() { #[test] fn test_fixed_list_offsets() { // Test the case where offset != 0 - let a = create_fixed_size_list_array([ - Some(&[1, 2, 3]), - None, - None, - Some(&[4, 5, 6]), - None, - None, - ]); - let b = create_fixed_size_list_array([ - Some(&[1, 2, 3]), - None, - None, - Some(&[3, 6, 9]), - None, - None, - ]); + let a = + create_fixed_size_list_array([Some(&[1, 2, 3]), None, None, Some(&[4, 5, 6]), None, None]); + let b = + create_fixed_size_list_array([Some(&[1, 2, 3]), None, None, Some(&[3, 6, 9]), None, None]); let a_slice = a.slice(0, 3); let b_slice = b.slice(0, 3); @@ -776,8 +734,7 @@ fn test_struct_equal() { Some(5), ])); - let a = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); + let a = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]).unwrap(); let b = StructArray::try_from(vec![("f1", strings), ("f2", ints)]).unwrap(); @@ -948,14 +905,10 @@ fn test_struct_equal_null_variable_size() { test_equal(&a, &c, false); } -fn create_dictionary_array( - values: &[&str], - keys: &[Option<&str>], -) -> DictionaryArray { +fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> DictionaryArray { let values = StringArray::from(values.to_vec()); let mut builder = - StringDictionaryBuilder::::new_with_dictionary(keys.len(), &values) - .unwrap(); + StringDictionaryBuilder::::new_with_dictionary(keys.len(), &values).unwrap(); for key in keys { if let Some(v) = key { builder.append(v).unwrap(); @@ -1002,40 +955,25 @@ fn test_dictionary_equal() { #[test] fn test_dictionary_equal_null() { // (a, b, c), (1, 2, 1, 3) => (a, b, a, c) - let a = create_dictionary_array( - &["a", "b", "c"], - &[Some("a"), None, Some("a"), Some("c")], - ); + let a = create_dictionary_array(&["a", "b", "c"], &[Some("a"), None, Some("a"), Some("c")]); // equal to self test_equal(&a, &a, true); // different representation (values and keys are swapped), same result - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), None, Some("a"), Some("c")], - ); + let b = create_dictionary_array(&["a", "c", "b"], &[Some("a"), None, Some("a"), Some("c")]); test_equal(&a, &b, true); // different null position - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), Some("b"), Some("a"), None], - ); + let b = create_dictionary_array(&["a", "c", "b"], &[Some("a"), Some("b"), Some("a"), None]); test_equal(&a, &b, false); // different key - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), None, Some("a"), Some("a")], - ); + let b = create_dictionary_array(&["a", "c", "b"], &[Some("a"), None, Some("a"), Some("a")]); test_equal(&a, &b, false); // different values, same keys - let b = create_dictionary_array( - &["a", "b", "d"], - &[Some("a"), None, Some("a"), Some("d")], - ); + let b = create_dictionary_array(&["a", "b", "d"], &[Some("a"), None, Some("a"), Some("d")]); test_equal(&a, &b, false); 
} @@ -1234,9 +1172,7 @@ fn test_list_different_offsets() { assert_eq!(&a_slice, &b_slice); } -fn make_struct( - elements: Vec, Option)>>, -) -> StructArray { +fn make_struct(elements: Vec, Option)>>) -> StructArray { let mut builder = StructBuilder::new( vec![ Field::new("f1", DataType::Utf8, true), diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 15141eb208e4..ccf66e1c30ad 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -16,10 +16,10 @@ // under the License. use arrow::array::{ - Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, - FixedSizeBinaryArray, Int16Array, Int32Array, Int64Array, Int64Builder, ListArray, - ListBuilder, MapBuilder, NullArray, StringArray, StringBuilder, - StringDictionaryBuilder, StructArray, UInt8Array, UnionArray, + Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, FixedSizeBinaryArray, + Int16Array, Int32Array, Int64Array, Int64Builder, ListArray, ListBuilder, MapBuilder, + NullArray, StringArray, StringBuilder, StringDictionaryBuilder, StructArray, UInt8Array, + UnionArray, }; use arrow::datatypes::Int16Type; use arrow_buffer::Buffer; @@ -28,11 +28,7 @@ use arrow_data::ArrayData; use arrow_schema::{DataType, Field, Fields}; use std::sync::Arc; -fn create_decimal_array( - array: Vec>, - precision: u8, - scale: i8, -) -> Decimal128Array { +fn create_decimal_array(array: Vec>, precision: u8, scale: i8) -> Decimal128Array { array .into_iter() .collect::() @@ -57,8 +53,7 @@ fn test_decimal() { #[test] #[cfg(not(feature = "force_validate"))] fn test_decimal_offset() { - let decimal_array = - create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); + let decimal_array = create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); let decimal_array = decimal_array.slice(1, 3).into_data(); // 2, null, 3 let arrays = vec![&decimal_array]; let mut a = MutableArrayData::new(arrays, true, 2); @@ -72,8 +67,7 @@ fn test_decimal_offset() { #[test] #[cfg(not(feature = "force_validate"))] fn test_decimal_null_offset_nulls() { - let decimal_array = - create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); + let decimal_array = create_decimal_array(vec![Some(1), Some(2), None, Some(3)], 10, 3); let decimal_array = decimal_array.slice(1, 3).into_data(); // 2, null, 3 let arrays = vec![&decimal_array]; let mut a = MutableArrayData::new(arrays, true, 2); @@ -174,8 +168,7 @@ fn test_list_null_offset() { /// tests extending from a variable-sized (strings and binary) array w/ offset with nulls #[test] fn test_variable_sized_nulls() { - let array = - StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]).into_data(); + let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]).into_data(); let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -267,8 +260,7 @@ fn test_string_null_offset_nulls() { #[test] fn test_bool() { - let array = - BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]).into_data(); + let array = BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]).into_data(); let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -303,8 +295,7 @@ fn test_null() { fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { let values = StringArray::from(values.to_vec()); let mut builder = - StringDictionaryBuilder::::new_with_dictionary(keys.len(), &values) - .unwrap(); + 
StringDictionaryBuilder::::new_with_dictionary(keys.len(), &values).unwrap(); for key in keys { if let Some(v) = key { builder.append(v).unwrap(); @@ -318,10 +309,7 @@ fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData #[test] fn test_dictionary() { // (a, b, c), (0, 1, 0, 2) => (a, b, a, c) - let array = create_dictionary_array( - &["a", "b", "c"], - &[Some("a"), Some("b"), None, Some("c")], - ); + let array = create_dictionary_array(&["a", "b", "c"], &[Some("a"), Some("b"), None, Some("c")]); let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -352,10 +340,9 @@ fn test_struct() { Some(5), ])); - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap() - .into_data(); + let array = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap() + .into_data(); let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -363,11 +350,8 @@ fn test_struct() { let data = mutable.freeze(); let array = StructArray::from(data); - let expected = StructArray::try_from(vec![ - ("f1", strings.slice(1, 2)), - ("f2", ints.slice(1, 2)), - ]) - .unwrap(); + let expected = + StructArray::try_from(vec![("f1", strings.slice(1, 2)), ("f2", ints.slice(1, 2))]).unwrap(); assert_eq!(array, expected) } @@ -388,11 +372,10 @@ fn test_struct_offset() { Some(5), ])); - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap() - .into_data() - .slice(1, 3); + let array = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap() + .into_data() + .slice(1, 3); let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -400,11 +383,9 @@ fn test_struct_offset() { let data = mutable.freeze(); let array = StructArray::from(data); - let expected_strings: ArrayRef = - Arc::new(StringArray::from(vec![None, Some("mark")])); + let expected_strings: ArrayRef = Arc::new(StringArray::from(vec![None, Some("mark")])); let expected = - StructArray::try_from(vec![("f1", expected_strings), ("f2", ints.slice(2, 2))]) - .unwrap(); + StructArray::try_from(vec![("f1", expected_strings), ("f2", ints.slice(2, 2))]).unwrap(); assert_eq!(array, expected); } @@ -426,10 +407,9 @@ fn test_struct_nulls() { Some(5), ])); - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap() - .into_data(); + let array = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap() + .into_data(); let arrays = vec![&array]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -443,8 +423,7 @@ fn test_struct_nulls() { let expected_int = Arc::new(Int32Array::from(vec![Some(2), None])) as ArrayRef; let expected = - StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]) - .unwrap(); + StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]).unwrap(); assert_eq!(array, expected) } @@ -465,10 +444,9 @@ fn test_struct_many() { Some(5), ])); - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap() - .into_data(); + let array = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap() + .into_data(); let arrays = vec![&array, &array]; let mut mutable = MutableArrayData::new(arrays, false, 0); @@ -483,8 +461,7 @@ fn test_struct_many() { Arc::new(Int32Array::from(vec![Some(2), None, Some(1), Some(2)])) as ArrayRef; let 
expected = - StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]) - .unwrap(); + StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]).unwrap(); assert_eq!(array, expected) } @@ -547,10 +524,9 @@ fn test_union_dense() { #[test] fn test_binary_fixed_sized_offsets() { - let array = FixedSizeBinaryArray::try_from_iter( - vec![vec![0, 0], vec![0, 1], vec![0, 2]].into_iter(), - ) - .expect("Failed to create FixedSizeBinaryArray from iterable"); + let array = + FixedSizeBinaryArray::try_from_iter(vec![vec![0, 0], vec![0, 1], vec![0, 2]].into_iter()) + .expect("Failed to create FixedSizeBinaryArray from iterable"); let array = array.slice(1, 2).into_data(); // = [[0, 1], [0, 2]] due to the offset = 1 @@ -564,9 +540,8 @@ fn test_binary_fixed_sized_offsets() { let result = mutable.freeze(); let result = FixedSizeBinaryArray::from(result); - let expected = - FixedSizeBinaryArray::try_from_iter(vec![vec![0, 2], vec![0, 1]].into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); + let expected = FixedSizeBinaryArray::try_from_iter(vec![vec![0, 2], vec![0, 1]].into_iter()) + .expect("Failed to create FixedSizeBinaryArray from iterable"); assert_eq!(result, expected); } @@ -830,8 +805,7 @@ fn test_map_nulls_append() { ), ]); - let map_offsets = - Buffer::from_slice_ref([0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); + let map_offsets = Buffer::from_slice_ref([0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); let expected_list_data = ArrayData::try_new( DataType::Map( @@ -972,10 +946,9 @@ fn test_fixed_size_binary_append() { Some(vec![9, 10]), // b[4..4] ]; - let expected = - FixedSizeBinaryArray::try_from_sparse_iter_with_size(expected.into_iter(), 2) - .expect("Failed to create FixedSizeBinaryArray from iterable") - .into_data(); + let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size(expected.into_iter(), 2) + .expect("Failed to create FixedSizeBinaryArray from iterable") + .into_data(); assert_eq!(result, expected); } diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index fa80db1860cd..f5298f82e0a4 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -16,8 +16,8 @@ // under the License. 
use arrow::array::{ - make_array, Array, BooleanBuilder, Decimal128Builder, Int32Array, Int32Builder, - Int64Array, StringArray, StructBuilder, UInt64Array, + make_array, Array, BooleanBuilder, Decimal128Builder, Int32Array, Int32Builder, Int64Array, + StringArray, StructBuilder, UInt64Array, }; use arrow_array::Decimal128Array; use arrow_buffer::{ArrowNativeType, Buffer}; @@ -27,9 +27,7 @@ use std::ptr::NonNull; use std::sync::Arc; #[test] -#[should_panic( - expected = "Need at least 80 bytes in buffers[0] in array of type Int64, but got 8" -)] +#[should_panic(expected = "Need at least 80 bytes in buffers[0] in array of type Int64, but got 8")] fn test_buffer_too_small() { let buffer = Buffer::from_slice_ref([0i32, 2i32]); // should fail as the declared size (10*8 = 80) is larger than the underlying bfufer (8) @@ -37,9 +35,7 @@ fn test_buffer_too_small() { } #[test] -#[should_panic( - expected = "Need at least 16 bytes in buffers[0] in array of type Int64, but got 8" -)] +#[should_panic(expected = "Need at least 16 bytes in buffers[0] in array of type Int64, but got 8")] fn test_buffer_too_small_offset() { let buffer = Buffer::from_slice_ref([0i32, 2i32]); // should fail -- size is ok, but also has offset @@ -51,8 +47,7 @@ fn test_buffer_too_small_offset() { fn test_bad_number_of_buffers() { let buffer1 = Buffer::from_slice_ref([0i32, 2i32]); let buffer2 = Buffer::from_slice_ref([0i32, 2i32]); - ArrayData::try_new(DataType::Int64, 1, None, 0, vec![buffer1, buffer2], vec![]) - .unwrap(); + ArrayData::try_new(DataType::Int64, 1, None, 0, vec![buffer1, buffer2], vec![]).unwrap(); } #[test] @@ -61,8 +56,7 @@ fn test_bad_number_of_buffers() { )] fn test_fixed_width_overflow() { let buffer = Buffer::from_slice_ref([0i32, 2i32]); - ArrayData::try_new(DataType::Int64, usize::MAX, None, 0, vec![buffer], vec![]) - .unwrap(); + ArrayData::try_new(DataType::Int64, usize::MAX, None, 0, vec![buffer], vec![]).unwrap(); } #[test] @@ -87,8 +81,7 @@ fn test_bitmap_too_small() { #[should_panic(expected = "Dictionary key type must be integer, but was Utf8")] fn test_non_int_dictionary() { let i32_buffer = Buffer::from_slice_ref([0i32, 2i32]); - let data_type = - DataType::Dictionary(Box::new(DataType::Utf8), Box::new(DataType::Int32)); + let data_type = DataType::Dictionary(Box::new(DataType::Utf8), Box::new(DataType::Int32)); let child_data = ArrayData::try_new( DataType::Int32, 1, @@ -116,11 +109,9 @@ fn test_mismatched_dictionary_types() { let string_array: StringArray = vec![Some("foo"), Some("bar")].into_iter().collect(); let i32_buffer = Buffer::from_slice_ref([0i32, 1i32]); // Dict says LargeUtf8 but array is Utf8 - let data_type = - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::LargeUtf8)); + let data_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::LargeUtf8)); let child_data = string_array.into_data(); - ArrayData::try_new(data_type, 1, None, 0, vec![i32_buffer], vec![child_data]) - .unwrap(); + ArrayData::try_new(data_type, 1, None, 0, vec![i32_buffer], vec![child_data]).unwrap(); } #[test] @@ -185,9 +176,7 @@ fn test_empty_utf8_array_with_non_zero_offset() { } #[test] -#[should_panic( - expected = "Buffer 0 of LargeUtf8 isn't large enough. Expected 8 bytes got 4" -)] +#[should_panic(expected = "Buffer 0 of LargeUtf8 isn't large enough. 
Expected 8 bytes got 4")] fn test_empty_large_utf8_array_with_wrong_type_offsets() { let data_buffer = Buffer::from(&[]); let offsets_buffer = Buffer::from_slice_ref([0i32]); @@ -219,9 +208,7 @@ fn test_validate_offsets_i32() { } #[test] -#[should_panic( - expected = "Buffer 0 of LargeUtf8 isn't large enough. Expected 24 bytes got 16" -)] +#[should_panic(expected = "Buffer 0 of LargeUtf8 isn't large enough. Expected 24 bytes got 16")] fn test_validate_offsets_i64() { let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes()); let offsets_buffer = Buffer::from_slice_ref([0i64, 2i64]); @@ -506,33 +493,25 @@ fn check_index_out_of_bounds_validation(data_type: DataType) } #[test] -#[should_panic( - expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" -)] +#[should_panic(expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4")] fn test_validate_utf8_out_of_bounds() { check_index_out_of_bounds_validation::(DataType::Utf8); } #[test] -#[should_panic( - expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" -)] +#[should_panic(expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4")] fn test_validate_large_utf8_out_of_bounds() { check_index_out_of_bounds_validation::(DataType::LargeUtf8); } #[test] -#[should_panic( - expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" -)] +#[should_panic(expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4")] fn test_validate_binary_out_of_bounds() { check_index_out_of_bounds_validation::(DataType::Binary); } #[test] -#[should_panic( - expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" -)] +#[should_panic(expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4")] fn test_validate_large_binary_out_of_bounds() { check_index_out_of_bounds_validation::(DataType::LargeBinary); } @@ -559,33 +538,25 @@ fn check_index_backwards_validation(data_type: DataType) { } #[test] -#[should_panic( - expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" -)] +#[should_panic(expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1")] fn test_validate_utf8_index_backwards() { check_index_backwards_validation::(DataType::Utf8); } #[test] -#[should_panic( - expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" -)] +#[should_panic(expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1")] fn test_validate_large_utf8_index_backwards() { check_index_backwards_validation::(DataType::LargeUtf8); } #[test] -#[should_panic( - expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" -)] +#[should_panic(expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1")] fn test_validate_binary_index_backwards() { check_index_backwards_validation::(DataType::Binary); } #[test] -#[should_panic( - expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" -)] +#[should_panic(expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1")] fn test_validate_large_binary_index_backwards() { check_index_backwards_validation::(DataType::LargeBinary); } @@ -712,18 +683,14 @@ fn check_list_offsets(data_type: DataType) { } #[test] -#[should_panic( - expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" -)] +#[should_panic(expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4")] fn 
test_validate_list_offsets() { let field_type = Field::new("f", DataType::Int32, true); check_list_offsets::(DataType::List(Arc::new(field_type))); } #[test] -#[should_panic( - expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" -)] +#[should_panic(expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4")] fn test_validate_large_list_offsets() { let field_type = Field::new("f", DataType::Int32, true); check_list_offsets::(DataType::LargeList(Arc::new(field_type))); @@ -994,8 +961,7 @@ fn test_string_data_from_foreign() { let array = make_array(data); let array = array.as_any().downcast_ref::().unwrap(); - let expected = - StringArray::from(vec![None, Some("foo"), Some("bar"), Some("foobar")]); + let expected = StringArray::from(vec![None, Some("foo"), Some("bar"), Some("foobar")]); assert_eq!(array, &expected); } diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs index a79b6b44c2d3..fd01f1663955 100644 --- a/arrow/tests/csv.rs +++ b/arrow/tests/csv.rs @@ -40,10 +40,8 @@ fn test_export_csv_timestamps() { vec![Some(1555584887378), Some(1635577147000)], ) .with_timezone("Australia/Sydney".to_string()); - let c2 = - TimestampMillisecondArray::from(vec![Some(1555584887378), Some(1635577147000)]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); + let c2 = TimestampMillisecondArray::from(vec![Some(1555584887378), Some(1635577147000)]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); let mut sw = Vec::new(); let mut writer = arrow_csv::Writer::new(&mut sw); diff --git a/object_store/src/aws/builder.rs b/object_store/src/aws/builder.rs index 422ba15efa52..75a5299a0859 100644 --- a/object_store/src/aws/builder.rs +++ b/object_store/src/aws/builder.rs @@ -24,9 +24,7 @@ use crate::aws::{ }; use crate::client::TokenCredentialProvider; use crate::config::ConfigValue; -use crate::{ - ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider, -}; +use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use itertools::Itertools; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; @@ -312,9 +310,7 @@ impl AsRef for AmazonS3ConfigKey { Self::MetadataEndpoint => "aws_metadata_endpoint", Self::UnsignedPayload => "aws_unsigned_payload", Self::Checksum => "aws_checksum_algorithm", - Self::ContainerCredentialsRelativeUri => { - "aws_container_credentials_relative_uri" - } + Self::ContainerCredentialsRelativeUri => "aws_container_credentials_relative_uri", Self::SkipSignature => "aws_skip_signature", Self::CopyIfNotExists => "copy_if_not_exists", Self::Client(opt) => opt.as_ref(), @@ -331,15 +327,9 @@ impl FromStr for AmazonS3ConfigKey { "aws_secret_access_key" | "secret_access_key" => Ok(Self::SecretAccessKey), "aws_default_region" | "default_region" => Ok(Self::DefaultRegion), "aws_region" | "region" => Ok(Self::Region), - "aws_bucket" | "aws_bucket_name" | "bucket_name" | "bucket" => { - Ok(Self::Bucket) - } - "aws_endpoint_url" | "aws_endpoint" | "endpoint_url" | "endpoint" => { - Ok(Self::Endpoint) - } - "aws_session_token" | "aws_token" | "session_token" | "token" => { - Ok(Self::Token) - } + "aws_bucket" | "aws_bucket_name" | "bucket_name" | "bucket" => Ok(Self::Bucket), + "aws_endpoint_url" | "aws_endpoint" | "endpoint_url" | "endpoint" => Ok(Self::Endpoint), + "aws_session_token" | "aws_token" | "session_token" | "token" => Ok(Self::Token), 
"aws_virtual_hosted_style_request" | "virtual_hosted_style_request" => { Ok(Self::VirtualHostedStyleRequest) } @@ -347,9 +337,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_metadata_endpoint" | "metadata_endpoint" => Ok(Self::MetadataEndpoint), "aws_unsigned_payload" | "unsigned_payload" => Ok(Self::UnsignedPayload), "aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), - "aws_container_credentials_relative_uri" => { - Ok(Self::ContainerCredentialsRelativeUri) - } + "aws_container_credentials_relative_uri" => Ok(Self::ContainerCredentialsRelativeUri), "aws_skip_signature" | "skip_signature" => Ok(Self::SkipSignature), "copy_if_not_exists" => Ok(Self::CopyIfNotExists), // Backwards compatibility @@ -428,16 +416,10 @@ impl AmazonS3Builder { } /// Set an option on the builder via a key - value pair. - pub fn with_config( - mut self, - key: AmazonS3ConfigKey, - value: impl Into, - ) -> Self { + pub fn with_config(mut self, key: AmazonS3ConfigKey, value: impl Into) -> Self { match key { AmazonS3ConfigKey::AccessKeyId => self.access_key_id = Some(value.into()), - AmazonS3ConfigKey::SecretAccessKey => { - self.secret_access_key = Some(value.into()) - } + AmazonS3ConfigKey::SecretAccessKey => self.secret_access_key = Some(value.into()), AmazonS3ConfigKey::Region => self.region = Some(value.into()), AmazonS3ConfigKey::Bucket => self.bucket_name = Some(value.into()), AmazonS3ConfigKey::Endpoint => self.endpoint = Some(value.into()), @@ -449,9 +431,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::DefaultRegion => { self.region = self.region.or_else(|| Some(value.into())) } - AmazonS3ConfigKey::MetadataEndpoint => { - self.metadata_endpoint = Some(value.into()) - } + AmazonS3ConfigKey::MetadataEndpoint => self.metadata_endpoint = Some(value.into()), AmazonS3ConfigKey::UnsignedPayload => self.unsigned_payload.parse(value), AmazonS3ConfigKey::Checksum => { self.checksum_algorithm = Some(ConfigValue::Deferred(value.into())) @@ -474,11 +454,7 @@ impl AmazonS3Builder { /// /// This method will return an `UnknownConfigKey` error if key cannot be parsed into [`AmazonS3ConfigKey`]. #[deprecated(note = "Use with_config")] - pub fn try_with_option( - self, - key: impl AsRef, - value: impl Into, - ) -> Result { + pub fn try_with_option(self, key: impl AsRef, value: impl Into) -> Result { Ok(self.with_config(key.as_ref().parse()?, value)) } @@ -487,9 +463,7 @@ impl AmazonS3Builder { /// This method will return an `UnknownConfigKey` error if any key cannot be parsed into [`AmazonS3ConfigKey`]. 
#[deprecated(note = "Use with_config")] #[allow(deprecated)] - pub fn try_with_options< - I: IntoIterator, impl Into)>, - >( + pub fn try_with_options, impl Into)>>( mut self, options: I, ) -> Result { @@ -514,9 +488,7 @@ impl AmazonS3Builder { match key { AmazonS3ConfigKey::AccessKeyId => self.access_key_id.clone(), AmazonS3ConfigKey::SecretAccessKey => self.secret_access_key.clone(), - AmazonS3ConfigKey::Region | AmazonS3ConfigKey::DefaultRegion => { - self.region.clone() - } + AmazonS3ConfigKey::Region | AmazonS3ConfigKey::DefaultRegion => self.region.clone(), AmazonS3ConfigKey::Bucket => self.bucket_name.clone(), AmazonS3ConfigKey::Endpoint => self.endpoint.clone(), AmazonS3ConfigKey::Token => self.token.clone(), @@ -586,10 +558,7 @@ impl AmazonS3Builder { } /// Set the AWS Secret Access Key (required) - pub fn with_secret_access_key( - mut self, - secret_access_key: impl Into, - ) -> Self { + pub fn with_secret_access_key(mut self, secret_access_key: impl Into) -> Self { self.secret_access_key = Some(secret_access_key.into()); self } @@ -648,10 +617,7 @@ impl AmazonS3Builder { /// consistent with `virtual_hosted_style_request`. /// i.e. if `virtual_hosted_style_request` is set to true /// then `endpoint` should have bucket name included. - pub fn with_virtual_hosted_style_request( - mut self, - virtual_hosted_style_request: bool, - ) -> Self { + pub fn with_virtual_hosted_style_request(mut self, virtual_hosted_style_request: bool) -> Self { self.virtual_hosted_style_request = virtual_hosted_style_request.into(); self } @@ -722,10 +688,7 @@ impl AmazonS3Builder { } /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { + pub fn with_proxy_ca_certificate(mut self, proxy_ca_certificate: impl Into) -> Self { self.client_options = self .client_options .with_proxy_ca_certificate(proxy_ca_certificate); diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index eb81e92fb932..6b34b181ab9d 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -17,9 +17,7 @@ use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt}; -use crate::aws::{ - AwsCredentialProvider, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET, -}; +use crate::aws::{AwsCredentialProvider, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET}; use crate::client::get::GetClient; use crate::client::header::get_etag; use crate::client::list::ListClient; @@ -28,9 +26,7 @@ use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; use crate::multipart::PartId; use crate::path::DELIMITER; -use crate::{ - ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, RetryConfig, -}; +use crate::{ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, RetryConfig}; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; @@ -264,8 +260,7 @@ impl S3Client { if let Some(checksum) = self.config().checksum { let digest = checksum.digest(&bytes); - builder = - builder.header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); + builder = builder.header(checksum.header_name(), BASE64_STANDARD.encode(&digest)); if checksum == Checksum::SHA256 { payload_sha256 = Some(digest); } @@ -333,10 +328,7 @@ impl S3Client { /// there was an error for a certain path, the error will be returned in the /// vector. If there was an issue with making the overall request, an error /// will be returned at the top level. 
- pub async fn bulk_delete_request( - &self, - paths: Vec, - ) -> Result>> { + pub async fn bulk_delete_request(&self, paths: Vec) -> Result>> { if paths.is_empty() { return Ok(Vec::new()); } @@ -348,10 +340,8 @@ impl S3Client { let mut writer = quick_xml::Writer::new(&mut buffer); writer .write_event(xml_events::Event::Start( - xml_events::BytesStart::new("Delete").with_attributes([( - "xmlns", - "http://s3.amazonaws.com/doc/2006-03-01/", - )]), + xml_events::BytesStart::new("Delete") + .with_attributes([("xmlns", "http://s3.amazonaws.com/doc/2006-03-01/")]), )) .unwrap(); for path in &paths { @@ -415,9 +405,11 @@ impl S3Client { .await .context(DeleteObjectsResponseSnafu {})?; - let response: BatchDeleteResponse = quick_xml::de::from_reader(response.reader()) - .map_err(|err| Error::InvalidDeleteObjectsResponse { - source: Box::new(err), + let response: BatchDeleteResponse = + quick_xml::de::from_reader(response.reader()).map_err(|err| { + Error::InvalidDeleteObjectsResponse { + source: Box::new(err), + } })?; // Assume all were ok, then fill in errors. This guarantees output order @@ -425,11 +417,10 @@ impl S3Client { let mut results: Vec> = paths.iter().cloned().map(Ok).collect(); for content in response.content.into_iter() { if let DeleteObjectResult::Error(error) = content { - let path = Path::parse(&error.key).map_err(|err| { - Error::InvalidDeleteObjectsResponse { + let path = + Path::parse(&error.key).map_err(|err| Error::InvalidDeleteObjectsResponse { source: Box::new(err), - } - })?; + })?; let i = paths.iter().find_position(|&p| p == &path).unwrap().0; results[i] = Err(Error::from(error).into()); } @@ -439,12 +430,7 @@ impl S3Client { } /// Make an S3 Copy request - pub async fn copy_request( - &self, - from: &Path, - to: &Path, - overwrite: bool, - ) -> Result<()> { + pub async fn copy_request(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { let credential = self.get_credential().await?; let url = self.config.path_url(to); let source = format!("{}/{}", self.config.bucket, encode_path(from)); @@ -461,9 +447,7 @@ impl S3Client { } None => { return Err(crate::Error::NotSupported { - source: "S3 does not support copy-if-not-exists" - .to_string() - .into(), + source: "S3 does not support copy-if-not-exists".to_string().into(), }) } } @@ -515,8 +499,8 @@ impl S3Client { .await .context(CreateMultipartResponseBodySnafu)?; - let response: InitiateMultipart = quick_xml::de::from_reader(response.reader()) - .context(InvalidMultipartResponseSnafu)?; + let response: InitiateMultipart = + quick_xml::de::from_reader(response.reader()).context(InvalidMultipartResponseSnafu)?; Ok(response.upload_id) } @@ -646,8 +630,8 @@ impl ListClient for S3Client { .await .context(ListResponseBodySnafu)?; - let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) - .context(InvalidListResponseSnafu)?; + let mut response: ListResponse = + quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; let token = response.next_continuation_token.take(); Ok((response.try_into()?, token)) diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index e0c5de5fe784..d290da838d78 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -37,8 +37,7 @@ use url::Url; type StdError = Box; /// SHA256 hash of empty string -static EMPTY_SHA256_HASH: &str = - "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"; +static EMPTY_SHA256_HASH: &str = 
"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"; static UNSIGNED_PAYLOAD: &str = "UNSIGNED-PAYLOAD"; static STREAMING_PAYLOAD: &str = "STREAMING-AWS4-HMAC-SHA256-PAYLOAD"; @@ -57,13 +56,7 @@ impl AwsCredential { /// Signs a string /// /// - fn sign( - &self, - to_sign: &str, - date: DateTime, - region: &str, - service: &str, - ) -> String { + fn sign(&self, to_sign: &str, date: DateTime, region: &str, service: &str) -> String { let date_string = date.format("%Y%m%d").to_string(); let date_hmac = hmac_sha256(format!("AWS4{}", self.secret_key), date_string); let region_hmac = hmac_sha256(date_hmac, region); @@ -170,9 +163,9 @@ impl<'a> AwsAuthorizer<'a> { ); // sign the string - let signature = - self.credential - .sign(&string_to_sign, date, self.region, self.service); + let signature = self + .credential + .sign(&string_to_sign, date, self.region, self.service); // build the actual auth header let authorisation = format!( @@ -226,9 +219,9 @@ impl<'a> AwsAuthorizer<'a> { digest, ); - let signature = - self.credential - .sign(&string_to_sign, date, self.region, self.service); + let signature = self + .credential + .sign(&string_to_sign, date, self.region, self.service); url.query_pairs_mut() .append_pair("X-Amz-Signature", &signature); @@ -521,9 +514,7 @@ async fn instance_creds( let token = match token_result { Ok(t) => Some(t.text().await?), - Err(e) - if imdsv1_fallback && matches!(e.status(), Some(StatusCode::FORBIDDEN)) => - { + Err(e) if imdsv1_fallback && matches!(e.status(), Some(StatusCode::FORBIDDEN)) => { warn!("received 403 from metadata endpoint, falling back to IMDSv1"); None } @@ -545,8 +536,7 @@ async fn instance_creds( creds_request = creds_request.header(AWS_EC2_METADATA_TOKEN_HEADER, token); } - let creds: InstanceCredentials = - creds_request.send_retry(retry_config).await?.json().await?; + let creds: InstanceCredentials = creds_request.send_retry(retry_config).await?.json().await?; let now = Utc::now(); let ttl = (creds.expiration - now).to_std().unwrap_or_default(); @@ -659,8 +649,7 @@ async fn task_credential( retry: &RetryConfig, url: &str, ) -> Result>, StdError> { - let creds: InstanceCredentials = - client.get(url).send_retry(retry).await?.json().await?; + let creds: InstanceCredentials = client.get(url).send_retry(retry).await?.json().await?; let now = Utc::now(); let ttl = (creds.expiration - now).to_std().unwrap_or_default(); @@ -776,8 +765,7 @@ mod tests { sign_payload: false, }; - let mut url = - Url::parse("https://examplebucket.s3.amazonaws.com/test.txt").unwrap(); + let mut url = Url::parse("https://examplebucket.s3.amazonaws.com/test.txt").unwrap(); authorizer.sign(Method::GET, &mut url, Duration::from_secs(86400)); assert_eq!( @@ -790,7 +778,8 @@ mod tests { X-Amz-Expires=86400&\ X-Amz-SignedHeaders=host&\ X-Amz-Signature=aeeed9bbccd4d02ee5c0109b86d86835f995330da4c265957d157751f604d404" - ).unwrap() + ) + .unwrap() ); } diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index a4e39c3b88dd..25894a1c3445 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -47,8 +47,8 @@ use crate::client::CredentialProvider; use crate::multipart::{PartId, PutPart, WriteMultiPart}; use crate::signer::Signer; use crate::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, - PutResult, Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, PutResult, + Result, }; mod builder; @@ -67,12 +67,11 @@ pub use resolve::resolve_bucket_region; // // Do not 
URI-encode any of the unreserved characters that RFC 3986 defines: // A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ). -pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet = - percent_encoding::NON_ALPHANUMERIC - .remove(b'-') - .remove(b'.') - .remove(b'_') - .remove(b'~'); +pub(crate) const STRICT_ENCODE_SET: percent_encoding::AsciiSet = percent_encoding::NON_ALPHANUMERIC + .remove(b'-') + .remove(b'.') + .remove(b'_') + .remove(b'~'); /// This struct is used to maintain the URI path encoding const STRICT_PATH_ENCODE_SET: percent_encoding::AsciiSet = STRICT_ENCODE_SET.remove(b'/'); @@ -141,15 +140,9 @@ impl Signer for AmazonS3 { /// # Ok(()) /// # } /// ``` - async fn signed_url( - &self, - method: Method, - path: &Path, - expires_in: Duration, - ) -> Result { + async fn signed_url(&self, method: Method, path: &Path, expires_in: Duration) -> Result { let credential = self.credentials().get_credential().await?; - let authorizer = - AwsAuthorizer::new(&credential, "s3", &self.client.config().region); + let authorizer = AwsAuthorizer::new(&credential, "s3", &self.client.config().region); let path_url = self.path_url(path); let mut url = Url::parse(&path_url).map_err(|e| crate::Error::Generic { @@ -185,11 +178,7 @@ impl ObjectStore for AmazonS3 { Ok((id, Box::new(WriteMultiPart::new(upload, 8)))) } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { self.client .delete_request(location, &[("uploadId", multipart_id)]) .await @@ -314,8 +303,7 @@ mod tests { put_get_delete_list_opts(&integration, is_local).await; // run integration test with checksum set to sha256 - let builder = - AmazonS3Builder::from_env().with_checksum_algorithm(Checksum::SHA256); + let builder = AmazonS3Builder::from_env().with_checksum_algorithm(Checksum::SHA256); let integration = builder.build().unwrap(); put_get_delete_list_opts(&integration, is_local).await; } diff --git a/object_store/src/aws/resolve.rs b/object_store/src/aws/resolve.rs index 2b21fabd34ab..12c9f26d220b 100644 --- a/object_store/src/aws/resolve.rs +++ b/object_store/src/aws/resolve.rs @@ -48,10 +48,7 @@ impl From for crate::Error { /// Get the bucket region using the [HeadBucket API]. This will fail if the bucket does not exist. 
/// /// [HeadBucket API]: https://docs.aws.amazon.com/AmazonS3/latest/API/API_HeadBucket.html -pub async fn resolve_bucket_region( - bucket: &str, - client_options: &ClientOptions, -) -> Result { +pub async fn resolve_bucket_region(bucket: &str, client_options: &ClientOptions) -> Result { use reqwest::StatusCode; let endpoint = format!("https://{}.s3.amazonaws.com", bucket); diff --git a/object_store/src/azure/builder.rs b/object_store/src/azure/builder.rs index eb2de147f3ad..915e4c59a871 100644 --- a/object_store/src/azure/builder.rs +++ b/object_store/src/azure/builder.rs @@ -23,9 +23,7 @@ use crate::azure::credential::{ use crate::azure::{AzureCredential, AzureCredentialProvider, MicrosoftAzure, STORE}; use crate::client::TokenCredentialProvider; use crate::config::ConfigValue; -use crate::{ - ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider, -}; +use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use percent_encoding::percent_decode_str; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; @@ -363,9 +361,7 @@ impl FromStr for AzureConfigKey { | "account_key" | "access_key" => Ok(Self::AccessKey), "azure_storage_account_name" | "account_name" => Ok(Self::AccountName), - "azure_storage_client_id" | "azure_client_id" | "client_id" => { - Ok(Self::ClientId) - } + "azure_storage_client_id" | "azure_client_id" | "client_id" => Ok(Self::ClientId), "azure_storage_client_secret" | "azure_client_secret" | "client_secret" => { Ok(Self::ClientSecret) } @@ -375,27 +371,20 @@ impl FromStr for AzureConfigKey { | "azure_authority_id" | "tenant_id" | "authority_id" => Ok(Self::AuthorityId), - "azure_storage_sas_key" - | "azure_storage_sas_token" - | "sas_key" - | "sas_token" => Ok(Self::SasKey), + "azure_storage_sas_key" | "azure_storage_sas_token" | "sas_key" | "sas_token" => { + Ok(Self::SasKey) + } "azure_storage_token" | "bearer_token" | "token" => Ok(Self::Token), "azure_storage_use_emulator" | "use_emulator" => Ok(Self::UseEmulator), - "azure_storage_endpoint" | "azure_endpoint" | "endpoint" => { - Ok(Self::Endpoint) - } + "azure_storage_endpoint" | "azure_endpoint" | "endpoint" => Ok(Self::Endpoint), "azure_msi_endpoint" | "azure_identity_endpoint" | "identity_endpoint" | "msi_endpoint" => Ok(Self::MsiEndpoint), "azure_object_id" | "object_id" => Ok(Self::ObjectId), "azure_msi_resource_id" | "msi_resource_id" => Ok(Self::MsiResourceId), - "azure_federated_token_file" | "federated_token_file" => { - Ok(Self::FederatedTokenFile) - } - "azure_use_fabric_endpoint" | "use_fabric_endpoint" => { - Ok(Self::UseFabricEndpoint) - } + "azure_federated_token_file" | "federated_token_file" => Ok(Self::FederatedTokenFile), + "azure_use_fabric_endpoint" | "use_fabric_endpoint" => Ok(Self::UseFabricEndpoint), "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), "azure_container_name" | "container_name" => Ok(Self::ContainerName), // Backwards compatibility @@ -505,9 +494,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::MsiEndpoint => self.msi_endpoint = Some(value.into()), AzureConfigKey::ObjectId => self.object_id = Some(value.into()), AzureConfigKey::MsiResourceId => self.msi_resource_id = Some(value.into()), - AzureConfigKey::FederatedTokenFile => { - self.federated_token_file = Some(value.into()) - } + AzureConfigKey::FederatedTokenFile => self.federated_token_file = Some(value.into()), AzureConfigKey::UseAzureCli => self.use_azure_cli.parse(value), AzureConfigKey::UseEmulator => 
self.use_emulator.parse(value), AzureConfigKey::Endpoint => self.endpoint = Some(value.into()), @@ -522,20 +509,14 @@ impl MicrosoftAzureBuilder { /// Set an option on the builder via a key - value pair. #[deprecated(note = "Use with_config")] - pub fn try_with_option( - self, - key: impl AsRef, - value: impl Into, - ) -> Result { + pub fn try_with_option(self, key: impl AsRef, value: impl Into) -> Result { Ok(self.with_config(key.as_ref().parse()?, value)) } /// Hydrate builder from key value pairs #[deprecated(note = "Use with_config")] #[allow(deprecated)] - pub fn try_with_options< - I: IntoIterator, impl Into)>, - >( + pub fn try_with_options, impl Into)>>( mut self, options: I, ) -> Result { @@ -566,9 +547,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::SasKey => self.sas_key.clone(), AzureConfigKey::Token => self.bearer_token.clone(), AzureConfigKey::UseEmulator => Some(self.use_emulator.to_string()), - AzureConfigKey::UseFabricEndpoint => { - Some(self.use_fabric_endpoint.to_string()) - } + AzureConfigKey::UseFabricEndpoint => Some(self.use_fabric_endpoint.to_string()), AzureConfigKey::Endpoint => self.endpoint.clone(), AzureConfigKey::MsiEndpoint => self.msi_endpoint.clone(), AzureConfigKey::ObjectId => self.object_id.clone(), @@ -612,12 +591,10 @@ impl MicrosoftAzureBuilder { } } "https" => match host.split_once('.') { - Some((a, "dfs.core.windows.net")) - | Some((a, "blob.core.windows.net")) => { + Some((a, "dfs.core.windows.net")) | Some((a, "blob.core.windows.net")) => { self.account_name = Some(validate(a)?); } - Some((a, "dfs.fabric.microsoft.com")) - | Some((a, "blob.fabric.microsoft.com")) => { + Some((a, "dfs.fabric.microsoft.com")) | Some((a, "blob.fabric.microsoft.com")) => { self.account_name = Some(validate(a)?); // Attempt to infer the container name from the URL // - https://onelake.dfs.fabric.microsoft.com///Files/test.csv @@ -657,10 +634,7 @@ impl MicrosoftAzureBuilder { } /// Set a static bearer token to be used for authorizing requests - pub fn with_bearer_token_authorization( - mut self, - bearer_token: impl Into, - ) -> Self { + pub fn with_bearer_token_authorization(mut self, bearer_token: impl Into) -> Self { self.bearer_token = Some(bearer_token.into()); self } @@ -697,10 +671,7 @@ impl MicrosoftAzureBuilder { } /// Set query pairs appended to the url for shared access signature authorization - pub fn with_sas_authorization( - mut self, - query_pairs: impl Into>, - ) -> Self { + pub fn with_sas_authorization(mut self, query_pairs: impl Into>) -> Self { self.sas_query_pairs = Some(query_pairs.into()); self } @@ -769,10 +740,7 @@ impl MicrosoftAzureBuilder { } /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { + pub fn with_proxy_ca_certificate(mut self, proxy_ca_certificate: impl Into) -> Self { self.client_options = self .client_options .with_proxy_ca_certificate(proxy_ca_certificate); @@ -800,10 +768,7 @@ impl MicrosoftAzureBuilder { /// Sets a file path for acquiring azure federated identity token in k8s /// /// requires `client_id` and `tenant_id` to be set - pub fn with_federated_token_file( - mut self, - federated_token_file: impl Into, - ) -> Self { + pub fn with_federated_token_file(mut self, federated_token_file: impl Into) -> Self { self.federated_token_file = Some(federated_token_file.into()); self } @@ -855,8 +820,8 @@ impl MicrosoftAzureBuilder { }, }; - let url = Url::parse(&account_url) - .context(UnableToParseUrlSnafu { url: account_url })?; + let url = + 
Url::parse(&account_url).context(UnableToParseUrlSnafu { url: account_url })?; let credential = if let Some(credential) = self.credentials { credential @@ -934,12 +899,10 @@ impl MicrosoftAzureBuilder { /// if present, otherwise falls back to default_url fn url_from_env(env_name: &str, default_url: &str) -> Result { let url = match std::env::var(env_name) { - Ok(env_value) => { - Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { - env_name, - env_value, - })? - } + Ok(env_value) => Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { + env_name, + env_value, + })?, Err(_) => Url::parse(default_url).expect("Failed to parse default URL"), }; Ok(url) diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index f65388b61a80..b5ef02191cd7 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -24,9 +24,7 @@ use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; use crate::path::DELIMITER; use crate::util::deserialize_rfc1123; -use crate::{ - ClientOptions, GetOptions, ListResult, ObjectMeta, Path, Result, RetryConfig, -}; +use crate::{ClientOptions, GetOptions, ListResult, ObjectMeta, Path, Result, RetryConfig}; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; @@ -215,12 +213,7 @@ impl AzureClient { } /// Make an Azure Copy request - pub async fn copy_request( - &self, - from: &Path, - to: &Path, - overwrite: bool, - ) -> Result<()> { + pub async fn copy_request(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { let credential = self.get_credential().await?; let url = self.config.path_url(to); let mut source = self.config.path_url(from); @@ -288,16 +281,14 @@ impl GetClient for AzureClient { })?; match response.headers().get("x-ms-resource-type") { - Some(resource) if resource.as_ref() != b"file" => { - Err(crate::Error::NotFound { - path: path.to_string(), - source: format!( - "Not a file, got x-ms-resource-type: {}", - String::from_utf8_lossy(resource.as_ref()) - ) - .into(), - }) - } + Some(resource) if resource.as_ref() != b"file" => Err(crate::Error::NotFound { + path: path.to_string(), + source: format!( + "Not a file, got x-ms-resource-type: {}", + String::from_utf8_lossy(resource.as_ref()) + ) + .into(), + }), _ => Ok(response), } } @@ -347,8 +338,7 @@ impl ListClient for AzureClient { .context(ListResponseBodySnafu)?; let mut response: ListResultInternal = - quick_xml::de::from_reader(response.reader()) - .context(InvalidListResponseSnafu)?; + quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; let token = response.next_marker.take(); Ok((to_list_result(response, prefix)?, token)) diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index 8dc61365fa6e..fc96ce4fc3ef 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -28,9 +28,9 @@ use chrono::{DateTime, Utc}; use reqwest::header::ACCEPT; use reqwest::{ header::{ - HeaderMap, HeaderName, HeaderValue, AUTHORIZATION, CONTENT_ENCODING, - CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_TYPE, DATE, IF_MATCH, - IF_MODIFIED_SINCE, IF_NONE_MATCH, IF_UNMODIFIED_SINCE, RANGE, + HeaderMap, HeaderName, HeaderValue, AUTHORIZATION, CONTENT_ENCODING, CONTENT_LANGUAGE, + CONTENT_LENGTH, CONTENT_TYPE, DATE, IF_MATCH, IF_MODIFIED_SINCE, IF_NONE_MATCH, + IF_UNMODIFIED_SINCE, RANGE, }, Client, Method, RequestBuilder, }; @@ -46,8 +46,7 @@ use url::Url; static AZURE_VERSION: HeaderValue = 
HeaderValue::from_static("2021-08-06"); static VERSION: HeaderName = HeaderName::from_static("x-ms-version"); pub(crate) static BLOB_TYPE: HeaderName = HeaderName::from_static("x-ms-blob-type"); -pub(crate) static DELETE_SNAPSHOTS: HeaderName = - HeaderName::from_static("x-ms-delete-snapshots"); +pub(crate) static DELETE_SNAPSHOTS: HeaderName = HeaderName::from_static("x-ms-delete-snapshots"); pub(crate) static COPY_SOURCE: HeaderName = HeaderName::from_static("x-ms-copy-source"); static CONTENT_MD5: HeaderName = HeaderName::from_static("content-md5"); pub(crate) const RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; @@ -126,19 +125,11 @@ pub mod authority_hosts { pub(crate) trait CredentialExt { /// Apply authorization to requests against azure storage accounts /// - fn with_azure_authorization( - self, - credential: &AzureCredential, - account: &str, - ) -> Self; + fn with_azure_authorization(self, credential: &AzureCredential, account: &str) -> Self; } impl CredentialExt for RequestBuilder { - fn with_azure_authorization( - mut self, - credential: &AzureCredential, - account: &str, - ) -> Self { + fn with_azure_authorization(mut self, credential: &AzureCredential, account: &str) -> Self { // rfc2822 string should never contain illegal characters let date = Utc::now(); let date_str = date.format(RFC1123_FMT).to_string(); @@ -324,8 +315,8 @@ impl ClientSecretOAuthProvider { tenant_id: impl AsRef, authority_host: Option, ) -> Self { - let authority_host = authority_host - .unwrap_or_else(|| authority_hosts::AZURE_PUBLIC_CLOUD.to_owned()); + let authority_host = + authority_host.unwrap_or_else(|| authority_hosts::AZURE_PUBLIC_CLOUD.to_owned()); Self { token_url: format!( @@ -409,9 +400,8 @@ impl ImdsManagedIdentityProvider { msi_res_id: Option, msi_endpoint: Option, ) -> Self { - let msi_endpoint = msi_endpoint.unwrap_or_else(|| { - "http://169.254.169.254/metadata/identity/oauth2/token".to_owned() - }); + let msi_endpoint = msi_endpoint + .unwrap_or_else(|| "http://169.254.169.254/metadata/identity/oauth2/token".to_owned()); Self { msi_endpoint, @@ -493,8 +483,8 @@ impl WorkloadIdentityOAuthProvider { tenant_id: impl AsRef, authority_host: Option, ) -> Self { - let authority_host = authority_host - .unwrap_or_else(|| authority_hosts::AZURE_PUBLIC_CLOUD.to_owned()); + let authority_host = + authority_host.unwrap_or_else(|| authority_hosts::AZURE_PUBLIC_CLOUD.to_owned()); Self { token_url: format!( @@ -553,9 +543,7 @@ mod az_cli_date_format { use chrono::{DateTime, TimeZone}; use serde::{self, Deserialize, Deserializer}; - pub fn deserialize<'de, D>( - deserializer: D, - ) -> Result, D::Error> + pub fn deserialize<'de, D>(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, { @@ -614,14 +602,12 @@ impl AzureCliCredential { match Command::new(program).args(args).output() { Ok(az_output) if az_output.status.success() => { - let output = - str::from_utf8(&az_output.stdout).map_err(|_| Error::AzureCli { - message: "az response is not a valid utf-8 string".to_string(), - })?; - - let token_response = - serde_json::from_str::(output) - .context(AzureCliResponseSnafu)?; + let output = str::from_utf8(&az_output.stdout).map_err(|_| Error::AzureCli { + message: "az response is not a valid utf-8 string".to_string(), + })?; + + let token_response = serde_json::from_str::(output) + .context(AzureCliResponseSnafu)?; if !token_response.token_type.eq_ignore_ascii_case("bearer") { return Err(Error::AzureCli { message: format!( @@ -630,12 +616,10 @@ impl AzureCliCredential { ), }); } - let duration = 
token_response.expires_on.naive_local() - - chrono::Local::now().naive_local(); + let duration = + token_response.expires_on.naive_local() - chrono::Local::now().naive_local(); Ok(TemporaryToken { - token: Arc::new(AzureCredential::BearerToken( - token_response.access_token, - )), + token: Arc::new(AzureCredential::BearerToken(token_response.access_token)), expiry: Some( Instant::now() + duration.to_std().map_err(|_| Error::AzureCli { diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 7e1db5bc8c1c..5f768756a629 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -30,8 +30,7 @@ use self::client::{BlockId, BlockList}; use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, - Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; @@ -52,8 +51,7 @@ mod client; mod credential; /// [`CredentialProvider`] for [`MicrosoftAzure`] -pub type AzureCredentialProvider = - Arc>; +pub type AzureCredentialProvider = Arc>; use crate::client::header::get_etag; pub use builder::{AzureConfigKey, MicrosoftAzureBuilder}; pub use credential::AzureCredential; @@ -109,11 +107,7 @@ impl ObjectStore for MicrosoftAzure { Ok((String::new(), Box::new(WriteMultiPart::new(inner, 8)))) } - async fn abort_multipart( - &self, - _location: &Path, - _multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { // There is no way to drop blocks that have been uploaded. Instead, they simply // expire in 7 days. Ok(()) @@ -202,8 +196,8 @@ impl PutPart for AzureMultiPartUpload { mod tests { use super::*; use crate::tests::{ - copy_if_not_exists, get_opts, list_uses_directories_correctly, - list_with_delimiter, put_get_delete_list_opts, rename_and_copy, stream_get, + copy_if_not_exists, get_opts, list_uses_directories_correctly, list_with_delimiter, + put_get_delete_list_opts, rename_and_copy, stream_get, }; #[tokio::test] diff --git a/object_store/src/buffered.rs b/object_store/src/buffered.rs index bdc3f4c772b9..3a1354f4f20a 100644 --- a/object_store/src/buffered.rs +++ b/object_store/src/buffered.rs @@ -87,11 +87,7 @@ impl BufReader { } /// Create a new [`BufReader`] from the provided [`ObjectMeta`], [`ObjectStore`], and `capacity` - pub fn with_capacity( - store: Arc, - meta: &ObjectMeta, - capacity: usize, - ) -> Self { + pub fn with_capacity(store: Arc, meta: &ObjectMeta, capacity: usize) -> Self { Self { path: meta.location.clone(), size: meta.size as _, @@ -138,21 +134,32 @@ impl AsyncSeek for BufReader { fn start_seek(mut self: Pin<&mut Self>, position: SeekFrom) -> std::io::Result<()> { self.cursor = match position { SeekFrom::Start(offset) => offset, - SeekFrom::End(offset) => { - checked_add_signed(self.size,offset).ok_or_else(|| Error::new(ErrorKind::InvalidInput, format!("Seeking {offset} from end of {} byte file would result in overflow", self.size)))? 
- } + SeekFrom::End(offset) => checked_add_signed(self.size, offset).ok_or_else(|| { + Error::new( + ErrorKind::InvalidInput, + format!( + "Seeking {offset} from end of {} byte file would result in overflow", + self.size + ), + ) + })?, SeekFrom::Current(offset) => { - checked_add_signed(self.cursor, offset).ok_or_else(|| Error::new(ErrorKind::InvalidInput, format!("Seeking {offset} from current offset of {} would result in overflow", self.cursor)))? + checked_add_signed(self.cursor, offset).ok_or_else(|| { + Error::new( + ErrorKind::InvalidInput, + format!( + "Seeking {offset} from current offset of {} would result in overflow", + self.cursor + ), + ) + })? } }; self.buffer = Buffer::Empty; Ok(()) } - fn poll_complete( - self: Pin<&mut Self>, - _cx: &mut Context<'_>, - ) -> Poll> { + fn poll_complete(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { Poll::Ready(Ok(self.cursor)) } } @@ -179,10 +186,7 @@ impl AsyncRead for BufReader { } impl AsyncBufRead for BufReader { - fn poll_fill_buf( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { + fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let capacity = self.capacity; self.get_mut().poll_fill_buf_impl(cx, capacity) } @@ -238,7 +242,10 @@ mod tests { assert_eq!(&out, &data); let err = reader.seek(SeekFrom::Current(i64::MIN)).await.unwrap_err(); - assert_eq!(err.to_string(), "Seeking -9223372036854775808 from current offset of 4096 would result in overflow"); + assert_eq!( + err.to_string(), + "Seeking -9223372036854775808 from current offset of 4096 would result in overflow" + ); reader.rewind().await.unwrap(); @@ -254,7 +261,10 @@ mod tests { assert!(buf.is_empty()); let err = reader.seek(SeekFrom::Current(1)).await.unwrap_err(); - assert_eq!(err.to_string(), "Seeking 1 from current offset of 18446744073709551615 would result in overflow"); + assert_eq!( + err.to_string(), + "Seeking 1 from current offset of 18446744073709551615 would result in overflow" + ); for capacity in [200, 1024, 4096, DEFAULT_BUFFER_SIZE] { let store = Arc::clone(&store); diff --git a/object_store/src/chunked.rs b/object_store/src/chunked.rs index 5694c55d787f..021f9f50156b 100644 --- a/object_store/src/chunked.rs +++ b/object_store/src/chunked.rs @@ -29,8 +29,7 @@ use tokio::io::AsyncWrite; use crate::path::Path; use crate::{ - GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, - PutResult, + GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutResult, }; use crate::{MultipartId, Result}; @@ -74,11 +73,7 @@ impl ObjectStore for ChunkedStore { self.inner.put_multipart(location).await } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { self.inner.abort_multipart(location, multipart_id).await } diff --git a/object_store/src/client/backoff.rs b/object_store/src/client/backoff.rs index a4ca9765e79e..e01589102eb1 100644 --- a/object_store/src/client/backoff.rs +++ b/object_store/src/client/backoff.rs @@ -98,10 +98,7 @@ impl Backoff { }; let next_backoff = self.max_backoff_secs.min(rand_backoff); - Duration::from_secs_f64(std::mem::replace( - &mut self.next_backoff_secs, - next_backoff, - )) + Duration::from_secs_f64(std::mem::replace(&mut self.next_backoff_secs, next_backoff)) } } @@ -122,8 +119,7 @@ mod tests { base, }; - let assert_fuzzy_eq = - |a: f64, b: f64| assert!((b - a).abs() < 0.0001, "{a} != {b}"); + let 
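// Aside: a minimal standalone sketch of the jittered exponential backoff implemented by
// `Backoff::next` in the client/backoff.rs hunk above; field names are simplified and
// `rand::thread_rng` stands in for the injectable RNG used by the real type.

use rand::Rng;
use std::time::Duration;

struct Backoff {
    init_secs: f64,
    next_secs: f64,
    max_secs: f64,
    base: f64,
}

impl Backoff {
    fn new(init_secs: f64, max_secs: f64, base: f64) -> Self {
        Self { init_secs, next_secs: init_secs, max_secs, base }
    }

    /// Draw the next delay uniformly between the initial backoff and `base` times the
    /// previous ceiling, cap it at `max_secs`, and return the previous value.
    fn next(&mut self) -> Duration {
        let ceiling = self.next_secs * self.base;
        let jittered = rand::thread_rng().gen_range(self.init_secs..ceiling);
        let next = self.max_secs.min(jittered);
        Duration::from_secs_f64(std::mem::replace(&mut self.next_secs, next))
    }
}

fn main() {
    let mut backoff = Backoff::new(0.1, 30.0, 2.0);
    for attempt in 1..=5 {
        println!("attempt {attempt}: sleeping {:?}", backoff.next());
    }
}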
assert_fuzzy_eq = |a: f64, b: f64| assert!((b - a).abs() < 0.0001, "{a} != {b}"); // Create a static rng that takes the minimum of the range let rng = Box::new(StepRng::new(0, 0)); @@ -149,8 +145,8 @@ mod tests { let mut value = init_backoff_secs; for _ in 0..20 { assert_fuzzy_eq(backoff.next().as_secs_f64(), value); - value = (init_backoff_secs + (value * base - init_backoff_secs) / 2.) - .min(max_backoff_secs); + value = + (init_backoff_secs + (value * base - init_backoff_secs) / 2.).min(max_backoff_secs); } } } diff --git a/object_store/src/client/get.rs b/object_store/src/client/get.rs index 7f68b6d1225f..ed1762ff8fe9 100644 --- a/object_store/src/client/get.rs +++ b/object_store/src/client/get.rs @@ -48,13 +48,12 @@ impl GetClientExt for T { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let range = options.range.clone(); let response = self.get_request(location, options).await?; - let meta = - header_meta(location, response.headers(), T::HEADER_CONFIG).map_err(|e| { - Error::Generic { - store: T::STORE, - source: Box::new(e), - } - })?; + let meta = header_meta(location, response.headers(), T::HEADER_CONFIG).map_err(|e| { + Error::Generic { + store: T::STORE, + source: Box::new(e), + } + })?; let stream = response .bytes_stream() diff --git a/object_store/src/client/mock_server.rs b/object_store/src/client/mock_server.rs index adb7e0fff779..36c6b650c038 100644 --- a/object_store/src/client/mock_server.rs +++ b/object_store/src/client/mock_server.rs @@ -57,8 +57,7 @@ impl MockServer { }); let (shutdown, rx) = oneshot::channel::<()>(); - let server = - Server::bind(&SocketAddr::from(([127, 0, 0, 1], 0))).serve(make_service); + let server = Server::bind(&SocketAddr::from(([127, 0, 0, 1], 0))).serve(make_service); let url = format!("http://{}", server.local_addr()); diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 137da2b37594..3c968f11be21 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -61,8 +61,7 @@ fn map_client_error(e: reqwest::Error) -> super::Error { } } -static DEFAULT_USER_AGENT: &str = - concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"),); +static DEFAULT_USER_AGENT: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"),); /// Configuration keys for [`ClientOptions`] #[derive(PartialEq, Eq, Hash, Clone, Debug, Copy, Deserialize, Serialize)] @@ -231,9 +230,7 @@ impl ClientOptions { ClientConfigKey::ConnectTimeout => { self.connect_timeout = Some(ConfigValue::Deferred(value.into())) } - ClientConfigKey::DefaultContentType => { - self.default_content_type = Some(value.into()) - } + ClientConfigKey::DefaultContentType => self.default_content_type = Some(value.into()), ClientConfigKey::Http1Only => self.http1_only.parse(value), ClientConfigKey::Http2Only => self.http2_only.parse(value), ClientConfigKey::Http2KeepAliveInterval => { @@ -252,13 +249,9 @@ impl ClientOptions { self.pool_max_idle_per_host = Some(ConfigValue::Deferred(value.into())) } ClientConfigKey::ProxyUrl => self.proxy_url = Some(value.into()), - ClientConfigKey::ProxyCaCertificate => { - self.proxy_ca_certificate = Some(value.into()) - } + ClientConfigKey::ProxyCaCertificate => self.proxy_ca_certificate = Some(value.into()), ClientConfigKey::ProxyExcludes => self.proxy_excludes = Some(value.into()), - ClientConfigKey::Timeout => { - self.timeout = Some(ConfigValue::Deferred(value.into())) - } + ClientConfigKey::Timeout => self.timeout = Some(ConfigValue::Deferred(value.into())), 
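// Aside: a condensed sketch of the string-keyed configuration pattern reformatted in the
// client/mod.rs hunk this sits in (and in the Azure/GCS builders earlier in the patch):
// a typed key enum implementing `FromStr`, an infallible `with_config`, and a fallible
// `try_with_option` that parses the key first. `ConfigKey` and `Builder` here are
// illustrative stand-ins, not the crate's real types.

use std::str::FromStr;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ConfigKey {
    Endpoint,
    Timeout,
}

impl FromStr for ConfigKey {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "endpoint" => Ok(Self::Endpoint),
            "timeout" => Ok(Self::Timeout),
            other => Err(format!("unknown configuration key: {other}")),
        }
    }
}

#[derive(Default)]
struct Builder {
    endpoint: Option<String>,
    // Stored as a raw string and parsed when the client is built, mirroring the
    // deferred-value approach used by ClientOptions.
    timeout: Option<String>,
}

impl Builder {
    fn with_config(mut self, key: ConfigKey, value: impl Into<String>) -> Self {
        match key {
            ConfigKey::Endpoint => self.endpoint = Some(value.into()),
            ConfigKey::Timeout => self.timeout = Some(value.into()),
        }
        self
    }

    fn try_with_option(self, key: impl AsRef<str>, value: impl Into<String>) -> Result<Self, String> {
        Ok(self.with_config(key.as_ref().parse()?, value))
    }
}

fn main() -> Result<(), String> {
    let builder = Builder::default()
        .with_config(ConfigKey::Endpoint, "http://localhost:9000")
        .try_with_option("timeout", "30s")?;
    assert_eq!(builder.timeout.as_deref(), Some("30s"));
    Ok(())
}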
ClientConfigKey::UserAgent => { self.user_agent = Some(ConfigValue::Deferred(value.into())) } @@ -270,12 +263,8 @@ impl ClientOptions { pub fn get_config_value(&self, key: &ClientConfigKey) -> Option { match key { ClientConfigKey::AllowHttp => Some(self.allow_http.to_string()), - ClientConfigKey::AllowInvalidCertificates => { - Some(self.allow_insecure.to_string()) - } - ClientConfigKey::ConnectTimeout => { - self.connect_timeout.as_ref().map(fmt_duration) - } + ClientConfigKey::AllowInvalidCertificates => Some(self.allow_insecure.to_string()), + ClientConfigKey::ConnectTimeout => self.connect_timeout.as_ref().map(fmt_duration), ClientConfigKey::DefaultContentType => self.default_content_type.clone(), ClientConfigKey::Http1Only => Some(self.http1_only.to_string()), ClientConfigKey::Http2KeepAliveInterval => { @@ -288,9 +277,7 @@ impl ClientOptions { Some(self.http2_keep_alive_while_idle.to_string()) } ClientConfigKey::Http2Only => Some(self.http2_only.to_string()), - ClientConfigKey::PoolIdleTimeout => { - self.pool_idle_timeout.as_ref().map(fmt_duration) - } + ClientConfigKey::PoolIdleTimeout => self.pool_idle_timeout.as_ref().map(fmt_duration), ClientConfigKey::PoolMaxIdlePerHost => { self.pool_max_idle_per_host.as_ref().map(|v| v.to_string()) } @@ -378,10 +365,7 @@ impl ClientOptions { } /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { + pub fn with_proxy_ca_certificate(mut self, proxy_ca_certificate: impl Into) -> Self { self.proxy_ca_certificate = Some(proxy_ca_certificate.into()); self } @@ -522,9 +506,8 @@ impl ClientOptions { let mut proxy = Proxy::all(proxy).map_err(map_client_error)?; if let Some(certificate) = &self.proxy_ca_certificate { - let certificate = - reqwest::tls::Certificate::from_pem(certificate.as_bytes()) - .map_err(map_client_error)?; + let certificate = reqwest::tls::Certificate::from_pem(certificate.as_bytes()) + .map_err(map_client_error)?; builder = builder.add_root_certificate(certificate); } diff --git a/object_store/src/client/retry.rs b/object_store/src/client/retry.rs index e4d246c87a2a..d70d6d88de32 100644 --- a/object_store/src/client/retry.rs +++ b/object_store/src/client/retry.rs @@ -403,7 +403,12 @@ mod tests { } let e = do_request().await.unwrap_err().to_string(); - assert!(e.starts_with("Error after 2 retries: HTTP status server error (502 Bad Gateway) for url"), "{e}"); + assert!( + e.starts_with( + "Error after 2 retries: HTTP status server error (502 Bad Gateway) for url" + ), + "{e}" + ); // Panic results in an incomplete message error in the client mock.push_fn(|_| panic!()); diff --git a/object_store/src/delimited.rs b/object_store/src/delimited.rs index 13214865117a..4f25c9d6d313 100644 --- a/object_store/src/delimited.rs +++ b/object_store/src/delimited.rs @@ -228,8 +228,7 @@ mod tests { #[tokio::test] async fn test_delimiter_stream() { let input = vec!["hello\nworld\nbin", "go\ncup", "cakes"]; - let input_stream = - futures::stream::iter(input.into_iter().map(|s| Ok(Bytes::from(s)))); + let input_stream = futures::stream::iter(input.into_iter().map(|s| Ok(Bytes::from(s)))); let stream = newline_delimited_stream(input_stream); let results: Vec<_> = stream.try_collect().await.unwrap(); diff --git a/object_store/src/gcp/builder.rs b/object_store/src/gcp/builder.rs index 920ab8b2a9b5..2039d2378392 100644 --- a/object_store/src/gcp/builder.rs +++ b/object_store/src/gcp/builder.rs @@ -21,12 +21,8 @@ use crate::gcp::credential::{ ApplicationDefaultCredentials, 
InstanceCredentialProvider, ServiceAccountCredentials, DEFAULT_GCS_BASE_URL, }; -use crate::gcp::{ - credential, GcpCredential, GcpCredentialProvider, GoogleCloudStorage, STORE, -}; -use crate::{ - ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider, -}; +use crate::gcp::{credential, GcpCredential, GcpCredentialProvider, GoogleCloudStorage, STORE}; +use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; @@ -38,9 +34,7 @@ enum Error { #[snafu(display("Missing bucket name"))] MissingBucketName {}, - #[snafu(display( - "One of service account path or service account key may be provided." - ))] + #[snafu(display("One of service account path or service account key may be provided."))] ServiceAccountPathAndKeyProvided, #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] @@ -188,12 +182,8 @@ impl FromStr for GoogleConfigKey { | "service_account" | "google_service_account_path" | "service_account_path" => Ok(Self::ServiceAccount), - "google_service_account_key" | "service_account_key" => { - Ok(Self::ServiceAccountKey) - } - "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => { - Ok(Self::Bucket) - } + "google_service_account_key" | "service_account_key" => Ok(Self::ServiceAccountKey), + "google_bucket" | "google_bucket_name" | "bucket" | "bucket_name" => Ok(Self::Bucket), "google_application_credentials" => Ok(Self::ApplicationCredentials), _ => match s.parse() { Ok(key) => Ok(Self::Client(key)), @@ -286,12 +276,8 @@ impl GoogleCloudStorageBuilder { /// Set an option on the builder via a key - value pair. pub fn with_config(mut self, key: GoogleConfigKey, value: impl Into) -> Self { match key { - GoogleConfigKey::ServiceAccount => { - self.service_account_path = Some(value.into()) - } - GoogleConfigKey::ServiceAccountKey => { - self.service_account_key = Some(value.into()) - } + GoogleConfigKey::ServiceAccount => self.service_account_path = Some(value.into()), + GoogleConfigKey::ServiceAccountKey => self.service_account_key = Some(value.into()), GoogleConfigKey::Bucket => self.bucket_name = Some(value.into()), GoogleConfigKey::ApplicationCredentials => { self.application_credentials_path = Some(value.into()) @@ -305,20 +291,14 @@ impl GoogleCloudStorageBuilder { /// Set an option on the builder via a key - value pair. 
#[deprecated(note = "Use with_config")] - pub fn try_with_option( - self, - key: impl AsRef, - value: impl Into, - ) -> Result { + pub fn try_with_option(self, key: impl AsRef, value: impl Into) -> Result { Ok(self.with_config(key.as_ref().parse()?, value)) } /// Hydrate builder from key value pairs #[deprecated(note = "Use with_config")] #[allow(deprecated)] - pub fn try_with_options< - I: IntoIterator, impl Into)>, - >( + pub fn try_with_options, impl Into)>>( mut self, options: I, ) -> Result { @@ -344,9 +324,7 @@ impl GoogleCloudStorageBuilder { GoogleConfigKey::ServiceAccount => self.service_account_path.clone(), GoogleConfigKey::ServiceAccountKey => self.service_account_key.clone(), GoogleConfigKey::Bucket => self.bucket_name.clone(), - GoogleConfigKey::ApplicationCredentials => { - self.application_credentials_path.clone() - } + GoogleConfigKey::ApplicationCredentials => self.application_credentials_path.clone(), GoogleConfigKey::Client(key) => self.client_options.get_config_value(key), } } @@ -394,10 +372,7 @@ impl GoogleCloudStorageBuilder { /// "private_key": "" /// } /// ``` - pub fn with_service_account_path( - mut self, - service_account_path: impl Into, - ) -> Self { + pub fn with_service_account_path(mut self, service_account_path: impl Into) -> Self { self.service_account_path = Some(service_account_path.into()); self } @@ -407,10 +382,7 @@ impl GoogleCloudStorageBuilder { /// /// This or [`GoogleCloudStorageBuilder::with_service_account_path`] must be /// set. - pub fn with_service_account_key( - mut self, - service_account: impl Into, - ) -> Self { + pub fn with_service_account_key(mut self, service_account: impl Into) -> Self { self.service_account_key = Some(service_account.into()); self } @@ -445,10 +417,7 @@ impl GoogleCloudStorageBuilder { } /// Set a trusted proxy CA certificate - pub fn with_proxy_ca_certificate( - mut self, - proxy_ca_certificate: impl Into, - ) -> Self { + pub fn with_proxy_ca_certificate(mut self, proxy_ca_certificate: impl Into) -> Self { self.client_options = self .client_options .with_proxy_ca_certificate(proxy_ca_certificate); @@ -479,23 +448,19 @@ impl GoogleCloudStorageBuilder { // First try to initialize from the service account information. let service_account_credentials = match (self.service_account_path, self.service_account_key) { - (Some(path), None) => Some( - ServiceAccountCredentials::from_file(path) - .context(CredentialSnafu)?, - ), - (None, Some(key)) => Some( - ServiceAccountCredentials::from_key(&key).context(CredentialSnafu)?, - ), - (None, None) => None, - (Some(_), Some(_)) => { - return Err(Error::ServiceAccountPathAndKeyProvided.into()) + (Some(path), None) => { + Some(ServiceAccountCredentials::from_file(path).context(CredentialSnafu)?) + } + (None, Some(key)) => { + Some(ServiceAccountCredentials::from_key(&key).context(CredentialSnafu)?) } + (None, None) => None, + (Some(_), Some(_)) => return Err(Error::ServiceAccountPathAndKeyProvided.into()), }; // Then try to initialize from the application credentials file, or the environment. 
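// Aside: the GCS builder's build() step shown just above treats its two service-account
// sources as mutually exclusive. A minimal sketch of that selection, with illustrative
// stand-in types rather than the crate's real ServiceAccountCredentials:

#[derive(Debug)]
enum Credentials {
    FromFile(String),
    FromKey(String),
    None,
}

fn resolve(path: Option<String>, key: Option<String>) -> Result<Credentials, String> {
    match (path, key) {
        (Some(path), None) => Ok(Credentials::FromFile(path)),
        (None, Some(key)) => Ok(Credentials::FromKey(key)),
        (None, None) => Ok(Credentials::None),
        (Some(_), Some(_)) => {
            Err("One of service account path or service account key may be provided.".to_string())
        }
    }
}

fn main() {
    assert!(resolve(Some("sa.json".into()), Some("{}".into())).is_err());
    assert!(matches!(resolve(None, None), Ok(Credentials::None)));
}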
- let application_default_credentials = ApplicationDefaultCredentials::read( - self.application_credentials_path.as_deref(), - )?; + let application_default_credentials = + ApplicationDefaultCredentials::read(self.application_credentials_path.as_deref())?; let disable_oauth = service_account_credentials .as_ref() @@ -617,8 +582,8 @@ mod tests { // Service account key for alias in ["google_service_account_key", "service_account_key"] { - let builder = GoogleCloudStorageBuilder::new() - .with_config(alias.parse().unwrap(), FAKE_KEY); + let builder = + GoogleCloudStorageBuilder::new().with_config(alias.parse().unwrap(), FAKE_KEY); assert_eq!(FAKE_KEY, builder.service_account_key.unwrap()); } @@ -629,8 +594,8 @@ mod tests { "bucket", "bucket_name", ] { - let builder = GoogleCloudStorageBuilder::new() - .with_config(alias.parse().unwrap(), "fake_bucket"); + let builder = + GoogleCloudStorageBuilder::new().with_config(alias.parse().unwrap(), "fake_bucket"); assert_eq!("fake_bucket", builder.bucket_name.unwrap()); } } diff --git a/object_store/src/gcp/client.rs b/object_store/src/gcp/client.rs index 9141a9da8c5b..4165d784fd7f 100644 --- a/object_store/src/gcp/client.rs +++ b/object_store/src/gcp/client.rs @@ -210,18 +210,13 @@ impl GoogleCloudStorageClient { let data = response.bytes().await.context(PutResponseBodySnafu)?; let result: InitiateMultipartUploadResult = - quick_xml::de::from_reader(data.as_ref().reader()) - .context(InvalidPutResponseSnafu)?; + quick_xml::de::from_reader(data.as_ref().reader()).context(InvalidPutResponseSnafu)?; Ok(result.upload_id) } /// Cleanup unused parts - pub async fn multipart_cleanup( - &self, - path: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + pub async fn multipart_cleanup(&self, path: &Path, multipart_id: &MultipartId) -> Result<()> { let credential = self.get_credential().await?; let url = self.object_url(path); @@ -300,12 +295,7 @@ impl GoogleCloudStorageClient { } /// Perform a copy request - pub async fn copy_request( - &self, - from: &Path, - to: &Path, - if_not_exists: bool, - ) -> Result<()> { + pub async fn copy_request(&self, from: &Path, to: &Path, if_not_exists: bool) -> Result<()> { let credential = self.get_credential().await?; let url = self.object_url(to); @@ -416,8 +406,8 @@ impl ListClient for GoogleCloudStorageClient { .await .context(ListResponseBodySnafu)?; - let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) - .context(InvalidListResponseSnafu)?; + let mut response: ListResponse = + quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; let token = response.next_continuation_token.take(); Ok((response.try_into()?, token)) diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index 87f8e244f21c..29c7b4563ad5 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -226,9 +226,7 @@ impl TokenProvider for SelfSignedJwt { } } -fn read_credentials_file( - service_account_path: impl AsRef, -) -> Result +fn read_credentials_file(service_account_path: impl AsRef) -> Result where T: serde::de::DeserializeOwned, { @@ -329,9 +327,8 @@ async fn make_metadata_request( hostname: &str, retry: &RetryConfig, ) -> crate::Result { - let url = format!( - "http://{hostname}/computeMetadata/v1/instance/service-accounts/default/token" - ); + let url = + format!("http://{hostname}/computeMetadata/v1/instance/service-accounts/default/token"); let response: TokenResponse = client .request(Method::GET, url) 
.header("Metadata-Flavor", "Google") @@ -396,8 +393,7 @@ pub enum ApplicationDefaultCredentials { } impl ApplicationDefaultCredentials { - const CREDENTIALS_PATH: &'static str = - ".config/gcloud/application_default_credentials.json"; + const CREDENTIALS_PATH: &'static str = ".config/gcloud/application_default_credentials.json"; // Create a new application default credential in the following situations: // 1. a file is passed in and the type matches. diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 7c69d288740c..6512a8b036c5 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -35,8 +35,7 @@ use crate::client::CredentialProvider; use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, - Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -137,11 +136,7 @@ impl ObjectStore for GoogleCloudStorage { Ok((upload_id, Box::new(WriteMultiPart::new(inner, 8)))) } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { self.client .multipart_cleanup(location, multipart_id) .await?; diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index 4c2a7fcf8db3..f7593be5a043 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -90,11 +90,7 @@ pub struct Client { } impl Client { - pub fn new( - url: Url, - client_options: ClientOptions, - retry_config: RetryConfig, - ) -> Result { + pub fn new(url: Url, client_options: ClientOptions, retry_config: RetryConfig) -> Result { let client = client_options.client()?; Ok(Self { url, @@ -183,11 +179,7 @@ impl Client { } } - pub async fn list( - &self, - location: Option<&Path>, - depth: &str, - ) -> Result { + pub async fn list(&self, location: Option<&Path>, depth: &str) -> Result { let url = location .map(|path| self.path_url(path)) .unwrap_or_else(|| self.url.clone()); @@ -220,8 +212,7 @@ impl Client { Err(source) => return Err(Error::Request { source }.into()), }; - let status = quick_xml::de::from_reader(response.reader()) - .context(InvalidPropFindSnafu)?; + let status = quick_xml::de::from_reader(response.reader()).context(InvalidPropFindSnafu)?; Ok(status) } diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index e41e4f990110..8f61011ccae1 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -45,8 +45,8 @@ use crate::client::header::get_etag; use crate::http::client::Client; use crate::path::Path; use crate::{ - ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, - ObjectMeta, ObjectStore, PutResult, Result, RetryConfig, + ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, + ObjectStore, PutResult, Result, RetryConfig, }; mod client; @@ -113,11 +113,7 @@ impl ObjectStore for HttpStore { Err(super::Error::NotImplemented) } - async fn abort_multipart( - &self, - _location: &Path, - _multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { Err(super::Error::NotImplemented) } diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 86313616be1b..375302e50d8b 100644 --- a/object_store/src/lib.rs 
+++ b/object_store/src/lib.rs @@ -256,8 +256,8 @@ mod client; #[cfg(feature = "cloud")] pub use client::{ - backoff::BackoffConfig, retry::RetryConfig, ClientConfigKey, ClientOptions, - CredentialProvider, StaticCredentialProvider, + backoff::BackoffConfig, retry::RetryConfig, ClientConfigKey, ClientOptions, CredentialProvider, + StaticCredentialProvider, }; #[cfg(feature = "cloud")] @@ -323,11 +323,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// /// See documentation for individual stores for exact behavior, as capabilities /// vary by object store. - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()>; + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()>; /// Returns an [`AsyncWrite`] that can be used to append to the object at `location` /// @@ -349,10 +345,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Additionally some stores, such as Azure, may only support appending to objects created /// with [`ObjectStore::append`], and not with [`ObjectStore::put`], [`ObjectStore::copy`], or /// [`ObjectStore::put_multipart`] - async fn append( - &self, - _location: &Path, - ) -> Result> { + async fn append(&self, _location: &Path) -> Result> { Err(Error::NotImplemented) } @@ -376,11 +369,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// Return the bytes that are stored at the specified location /// in the given byte ranges - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { coalesce_ranges( ranges, |range| self.get_range(location, range), @@ -547,10 +536,7 @@ macro_rules! as_ref_impl { self.as_ref().abort_multipart(location, multipart_id).await } - async fn append( - &self, - location: &Path, - ) -> Result> { + async fn append(&self, location: &Path) -> Result> { self.as_ref().append(location).await } @@ -558,19 +544,11 @@ macro_rules! as_ref_impl { self.as_ref().get(location).await } - async fn get_opts( - &self, - location: &Path, - options: GetOptions, - ) -> Result { + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { self.as_ref().get_opts(location, options).await } - async fn get_range( - &self, - location: &Path, - range: Range, - ) -> Result { + async fn get_range(&self, location: &Path, range: Range) -> Result { self.as_ref().get_range(location, range).await } @@ -609,10 +587,7 @@ macro_rules! 
as_ref_impl { self.as_ref().list_with_offset(prefix, offset) } - async fn list_with_delimiter( - &self, - prefix: Option<&Path>, - ) -> Result { + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { self.as_ref().list_with_delimiter(prefix).await } @@ -799,20 +774,16 @@ impl GetResult { #[cfg(not(target_arch = "wasm32"))] GetResultPayload::File(mut file, path) => { maybe_spawn_blocking(move || { - file.seek(SeekFrom::Start(self.range.start as _)).map_err( - |source| local::Error::Seek { + file.seek(SeekFrom::Start(self.range.start as _)) + .map_err(|source| local::Error::Seek { source, path: path.clone(), - }, - )?; + })?; let mut buffer = Vec::with_capacity(len); file.take(len as _) .read_to_end(&mut buffer) - .map_err(|source| local::Error::UnableToReadBytes { - source, - path, - })?; + .map_err(|source| local::Error::UnableToReadBytes { source, path })?; Ok(buffer.into()) }) @@ -915,11 +886,7 @@ pub enum Error { #[snafu(display("Operation not yet implemented."))] NotImplemented, - #[snafu(display( - "Configuration key: '{}' is not valid for store '{}'.", - key, - store - ))] + #[snafu(display("Configuration key: '{}' is not valid for store '{}'.", key, store))] UnknownConfigurationKey { store: &'static str, key: String }, } @@ -1245,8 +1212,7 @@ mod tests { for (prefix, offset) in cases { let s = storage.list_with_offset(prefix.as_ref(), &offset); - let mut actual: Vec<_> = - s.map_ok(|x| x.location).try_collect().await.unwrap(); + let mut actual: Vec<_> = s.map_ok(|x| x.location).try_collect().await.unwrap(); actual.sort_unstable(); @@ -1254,8 +1220,7 @@ mod tests { .iter() .cloned() .filter(|x| { - let prefix_match = - prefix.as_ref().map(|p| x.prefix_matches(p)).unwrap_or(true); + let prefix_match = prefix.as_ref().map(|p| x.prefix_matches(p)).unwrap_or(true); prefix_match && x > &offset }) .collect(); @@ -1627,8 +1592,7 @@ mod tests { storage: &DynObjectStore, location: Option, ) -> crate::Result { - let location = - location.unwrap_or_else(|| Path::from("this_file_should_not_exist")); + let location = location.unwrap_or_else(|| Path::from("this_file_should_not_exist")); let err = storage.head(&location).await.unwrap_err(); assert!(matches!(err, crate::Error::NotFound { .. })); diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index 8a453813c24e..cd01a964dc3e 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -18,8 +18,8 @@ //! 
An object store that limits the maximum concurrency of the wrapped implementation use crate::{ - BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, - ObjectMeta, ObjectStore, Path, PutResult, Result, StreamExt, + BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, + ObjectStore, Path, PutResult, Result, StreamExt, }; use async_trait::async_trait; use bytes::Bytes; @@ -86,19 +86,12 @@ impl ObjectStore for LimitStore { Ok((id, Box::new(PermitWrapper::new(write, permit)))) } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.abort_multipart(location, multipart_id).await } - async fn append( - &self, - location: &Path, - ) -> Result> { + async fn append(&self, location: &Path) -> Result> { let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); let write = self.inner.append(location).await?; Ok(Box::new(PermitWrapper::new(write, permit))) @@ -121,11 +114,7 @@ impl ObjectStore for LimitStore { self.inner.get_range(location, range).await } - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.get_ranges(location, ranges).await } @@ -226,10 +215,7 @@ impl PermitWrapper { impl Stream for PermitWrapper { type Item = T::Item; - fn poll_next( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { Pin::new(&mut self.inner).poll_next(cx) } diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 4b7c96346e4d..9be3ee923244 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -19,8 +19,8 @@ use crate::{ maybe_spawn_blocking, path::{absolute_path_to_url, Path}, - GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, - ObjectStore, PutResult, Result, + GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, ObjectStore, + PutResult, Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -311,11 +311,7 @@ impl ObjectStore for LocalFileSystem { )) } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { let dest = self.config.path_to_filesystem(location)?; let path: PathBuf = staged_upload_path(&dest, multipart_id); @@ -329,10 +325,7 @@ impl ObjectStore for LocalFileSystem { .await } - async fn append( - &self, - location: &Path, - ) -> Result> { + async fn append(&self, location: &Path) -> Result> { // Get the path to the file from the configuration. let path = self.config.path_to_filesystem(location)?; loop { @@ -352,11 +345,10 @@ impl ObjectStore for LocalFileSystem { // If the error is that the file was not found, attempt to create the file and any necessary parent directories. Err(source) if source.kind() == ErrorKind::NotFound => { // Get the path to the parent directory of the file. 
- let parent = - path.parent().ok_or_else(|| Error::UnableToCreateFile { - path: path.to_path_buf(), - source, - })?; + let parent = path.parent().ok_or_else(|| Error::UnableToCreateFile { + path: path.to_path_buf(), + source, + })?; // Create the parent directory and any necessary ancestors. tokio::fs::create_dir_all(parent) @@ -367,9 +359,7 @@ impl ObjectStore for LocalFileSystem { continue; } // If any other error occurs, return a `UnableToOpenFile` error. - Err(source) => { - return Err(Error::UnableToOpenFile { source, path }.into()) - } + Err(source) => return Err(Error::UnableToOpenFile { source, path }.into()), } } } @@ -400,11 +390,7 @@ impl ObjectStore for LocalFileSystem { .await } - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let path = self.config.path_to_filesystem(location)?; let ranges = ranges.to_vec(); maybe_spawn_blocking(move || { @@ -719,9 +705,7 @@ impl AsyncWrite for LocalUpload { runtime .spawn_blocking(move || (&*file2).write_all(&data)) .map(move |res| match res { - Err(err) => { - Err(io::Error::new(ErrorKind::Other, err)) - } + Err(err) => Err(io::Error::new(ErrorKind::Other, err)), Ok(res) => res.map(move |_| data_len), }), ), @@ -771,31 +755,24 @@ impl AsyncWrite for LocalUpload { // We are moving file into the future, and it will be dropped on it's completion, closing the file. let file = Arc::clone(file); self.inner_state = LocalUploadState::ShuttingDown(Box::pin( - runtime.spawn_blocking(move || (*file).sync_all()).map( - move |res| match res { - Err(err) => { - Err(io::Error::new(io::ErrorKind::Other, err)) - } + runtime + .spawn_blocking(move || (*file).sync_all()) + .map(move |res| match res { + Err(err) => Err(io::Error::new(io::ErrorKind::Other, err)), Ok(res) => res, - }, - ), + }), )); } LocalUploadState::ShuttingDown(fut) => match fut.poll_unpin(cx) { Poll::Ready(res) => { res?; - let staging_path = - staged_upload_path(&self.dest, &self.multipart_id); + let staging_path = staged_upload_path(&self.dest, &self.multipart_id); let dest = self.dest.clone(); self.inner_state = LocalUploadState::Committing(Box::pin( runtime - .spawn_blocking(move || { - std::fs::rename(&staging_path, &dest) - }) + .spawn_blocking(move || std::fs::rename(&staging_path, &dest)) .map(move |res| match res { - Err(err) => { - Err(io::Error::new(io::ErrorKind::Other, err)) - } + Err(err) => Err(io::Error::new(io::ErrorKind::Other, err)), Ok(res) => res, }), )); @@ -905,11 +882,7 @@ pub(crate) fn chunked_stream( .boxed() } -pub(crate) fn read_range( - file: &mut File, - path: &PathBuf, - range: Range, -) -> Result { +pub(crate) fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { let to_read = range.end - range.start; file.seek(SeekFrom::Start(range.start as u64)) .context(SeekSnafu { path })?; @@ -1231,11 +1204,7 @@ mod tests { fs.list_with_delimiter(None).await.unwrap(); } - async fn check_list( - integration: &LocalFileSystem, - prefix: Option<&Path>, - expected: &[&str], - ) { + async fn check_list(integration: &LocalFileSystem, prefix: Option<&Path>, expected: &[&str]) { let result: Vec<_> = integration.list(prefix).try_collect().await.unwrap(); let mut strings: Vec<_> = result.iter().map(|x| x.location.as_ref()).collect(); @@ -1262,8 +1231,7 @@ mod tests { // Follow out of tree symlink let other = NamedTempFile::new().unwrap(); - std::os::unix::fs::symlink(other.path(), root.path().join("test.parquet")) - .unwrap(); + 
std::os::unix::fs::symlink(other.path(), root.path().join("test.parquet")).unwrap(); // Should return test.parquet even though out of tree check_list(&integration, None, &["a/file.parquet", "test.parquet"]).await; @@ -1288,11 +1256,7 @@ mod tests { .unwrap(); // Ignore broken symlink - std::os::unix::fs::symlink( - root.path().join("foo.parquet"), - root.path().join("c"), - ) - .unwrap(); + std::os::unix::fs::symlink(root.path().join("foo.parquet"), root.path().join("c")).unwrap(); check_list( &integration, @@ -1388,7 +1352,9 @@ mod tests { .to_string(); assert!( - err.contains("Encountered illegal character sequence \"💀\" whilst parsing path segment \"💀\""), + err.contains( + "Encountered illegal character sequence \"💀\" whilst parsing path segment \"💀\"" + ), "{}", err ); @@ -1401,12 +1367,10 @@ mod tests { let location = Path::from("some_file"); let data = Bytes::from("arbitrary data"); - let (multipart_id, mut writer) = - integration.put_multipart(&location).await.unwrap(); + let (multipart_id, mut writer) = integration.put_multipart(&location).await.unwrap(); writer.write_all(&data).await.unwrap(); - let (multipart_id_2, mut writer_2) = - integration.put_multipart(&location).await.unwrap(); + let (multipart_id_2, mut writer_2) = integration.put_multipart(&location).await.unwrap(); assert_ne!(multipart_id, multipart_id_2); writer_2.write_all(&data).await.unwrap(); @@ -1588,9 +1552,8 @@ mod unix_test { unistd::mkfifo(&path, stat::Mode::S_IRWXU).unwrap(); // Need to open read and write side in parallel - let spawned = tokio::task::spawn_blocking(|| { - OpenOptions::new().write(true).open(path).unwrap() - }); + let spawned = + tokio::task::spawn_blocking(|| OpenOptions::new().write(true).open(path).unwrap()); let location = Path::from(filename); integration.head(&location).await.unwrap(); diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 952b45739759..da7b55d3a83f 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -17,8 +17,7 @@ //! 
An in-memory object store implementation use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, - PutResult, Result, + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutResult, Result, }; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; @@ -145,19 +144,12 @@ impl ObjectStore for InMemory { )) } - async fn abort_multipart( - &self, - _location: &Path, - _multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { // Nothing to clean up Ok(()) } - async fn append( - &self, - location: &Path, - ) -> Result> { + async fn append(&self, location: &Path) -> Result> { Ok(Box::new(InMemoryAppend { location: location.clone(), data: Vec::::new(), @@ -195,11 +187,7 @@ impl ObjectStore for InMemory { }) } - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let entry = self.entry(location).await?; ranges .iter() diff --git a/object_store/src/parse.rs b/object_store/src/parse.rs index 2e72a710ac75..170726f45290 100644 --- a/object_store/src/parse.rs +++ b/object_store/src/parse.rs @@ -81,8 +81,7 @@ impl ObjectStoreScheme { } ("http", Some(_)) => (Self::Http, url.path()), ("https", Some(host)) => { - if host.ends_with("dfs.core.windows.net") - || host.ends_with("blob.core.windows.net") + if host.ends_with("dfs.core.windows.net") || host.ends_with("blob.core.windows.net") { (Self::MicrosoftAzure, url.path()) } else if host.ends_with("amazonaws.com") { @@ -166,12 +165,7 @@ where let url = &url[..url::Position::BeforePath]; Box::new(crate::http::HttpBuilder::new().with_url(url).build()?) as _ } - #[cfg(not(all( - feature = "aws", - feature = "azure", - feature = "gcp", - feature = "http" - )))] + #[cfg(not(all(feature = "aws", feature = "azure", feature = "gcp", feature = "http")))] s => { return Err(super::Error::Generic { store: "parse_url", diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index ab30e0ed04cc..e065c31d3145 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -168,9 +168,7 @@ impl Path { /// as defined on the docstring for [`Path`] or does not exist /// /// Note: this will canonicalize the provided path, resolving any symlinks - pub fn from_filesystem_path( - path: impl AsRef, - ) -> Result { + pub fn from_filesystem_path(path: impl AsRef) -> Result { let absolute = std::fs::canonicalize(&path).context(CanonicalizeSnafu { path: path.as_ref(), })?; @@ -199,12 +197,14 @@ impl Path { ) -> Result { let url = absolute_path_to_url(path)?; let path = match base { - Some(prefix) => url.path().strip_prefix(prefix.path()).ok_or_else(|| { - Error::PrefixMismatch { - path: url.path().to_string(), - prefix: prefix.to_string(), - } - })?, + Some(prefix) => { + url.path() + .strip_prefix(prefix.path()) + .ok_or_else(|| Error::PrefixMismatch { + path: url.path().to_string(), + prefix: prefix.to_string(), + })? 
+ } None => url.path(), }; @@ -256,10 +256,7 @@ impl Path { /// Returns an iterator of the [`PathPart`] of this [`Path`] after `prefix` /// /// Returns `None` if the prefix does not match - pub fn prefix_match( - &self, - prefix: &Self, - ) -> Option> + '_> { + pub fn prefix_match(&self, prefix: &Self) -> Option> + '_> { let mut stripped = self.raw.strip_prefix(&prefix.raw)?; if !stripped.is_empty() && !prefix.raw.is_empty() { stripped = stripped.strip_prefix(DELIMITER)?; @@ -333,9 +330,7 @@ where #[cfg(not(target_arch = "wasm32"))] /// Given an absolute filesystem path convert it to a URL representation without canonicalization -pub(crate) fn absolute_path_to_url( - path: impl AsRef, -) -> Result { +pub(crate) fn absolute_path_to_url(path: impl AsRef) -> Result { Url::from_file_path(&path).map_err(|_| Error::InvalidPath { path: path.as_ref().into(), }) @@ -498,8 +493,7 @@ mod tests { #[test] fn prefix_matches_with_file_name() { - let haystack = - Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo.segment"]); + let haystack = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo.segment"]); // All directories match and file name is a prefix let needle = Path::from_iter(["foo/bar", "baz%2Ftest", "something", "foo"]); diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index 21f6c1d99dc9..c4cb77b66d01 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -23,8 +23,7 @@ use tokio::io::AsyncWrite; use crate::path::Path; use crate::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, - Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, }; #[doc(hidden)] @@ -93,19 +92,12 @@ impl ObjectStore for PrefixStore { self.inner.put_multipart(&full_path).await } - async fn abort_multipart( - &self, - location: &Path, - multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { let full_path = self.full_path(location); self.inner.abort_multipart(&full_path, multipart_id).await } - async fn append( - &self, - location: &Path, - ) -> Result> { + async fn append(&self, location: &Path) -> Result> { let full_path = self.full_path(location); self.inner.append(&full_path).await } @@ -125,11 +117,7 @@ impl ObjectStore for PrefixStore { self.inner.get_opts(&full_path, options).await } - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let full_path = self.full_path(location); self.inner.get_ranges(&full_path, ranges).await } diff --git a/object_store/src/signer.rs b/object_store/src/signer.rs index f1f35debe053..f792397a7894 100644 --- a/object_store/src/signer.rs +++ b/object_store/src/signer.rs @@ -31,10 +31,5 @@ pub trait Signer: Send + Sync + fmt::Debug + 'static { /// the URL should be valid, return a signed [`Url`] created with the object store /// implementation's credentials such that the URL can be handed to something that doesn't have /// access to the object store's credentials, to allow limited access to the object store. 
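// Aside: a standalone sketch of the delimiter-aware prefix stripping performed by
// `Path::prefix_match` in the path/mod.rs hunk above; plain &str stands in for the
// crate's Path/PathPart types.

fn prefix_match<'a>(path: &'a str, prefix: &str) -> Option<&'a str> {
    let mut stripped = path.strip_prefix(prefix)?;
    if !stripped.is_empty() && !prefix.is_empty() {
        // The prefix must end on a '/' boundary: "foo/bar" prefixes "foo/bar/baz"
        // but not "foo/barbaz".
        stripped = stripped.strip_prefix('/')?;
    }
    Some(stripped)
}

fn main() {
    assert_eq!(prefix_match("foo/bar/baz", "foo/bar"), Some("baz"));
    assert_eq!(prefix_match("foo/barbaz", "foo/bar"), None);
    assert_eq!(prefix_match("foo/bar", ""), Some("foo/bar"));
}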
- async fn signed_url( - &self, - method: Method, - path: &Path, - expires_in: Duration, - ) -> Result; + async fn signed_url(&self, method: Method, path: &Path, expires_in: Duration) -> Result; } diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index d6f191baf82e..c5521256b8a6 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -21,8 +21,7 @@ use std::ops::Range; use std::{convert::TryInto, sync::Arc}; use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, - PutResult, Result, + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutResult, Result, }; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; @@ -161,18 +160,11 @@ impl ObjectStore for ThrottledStore { Err(super::Error::NotImplemented) } - async fn abort_multipart( - &self, - _location: &Path, - _multipart_id: &MultipartId, - ) -> Result<()> { + async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { Err(super::Error::NotImplemented) } - async fn append( - &self, - _location: &Path, - ) -> Result> { + async fn append(&self, _location: &Path) -> Result> { Err(super::Error::NotImplemented) } @@ -199,19 +191,15 @@ impl ObjectStore for ThrottledStore { async fn get_range(&self, location: &Path, range: Range) -> Result { let config = self.config(); - let sleep_duration = config.wait_get_per_call - + config.wait_get_per_byte * (range.end - range.start) as u32; + let sleep_duration = + config.wait_get_per_call + config.wait_get_per_byte * (range.end - range.start) as u32; sleep(sleep_duration).await; self.inner.get_range(location, range).await } - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> Result> { + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let config = self.config(); let total_bytes: usize = ranges.iter().map(|range| range.end - range.start).sum(); @@ -266,8 +254,7 @@ impl ObjectStore for ThrottledStore { match self.inner.list_with_delimiter(prefix).await { Ok(list_result) => { let entries_len = usize_to_u32_saturate(list_result.objects.len()); - sleep(self.config().wait_list_with_delimiter_per_entry * entries_len) - .await; + sleep(self.config().wait_list_with_delimiter_per_entry * entries_len).await; Ok(list_result) } Err(err) => Err(err), @@ -487,10 +474,7 @@ mod tests { assert_bounds!(measure_put(&store, 0).await, 0); } - async fn place_test_object( - store: &ThrottledStore, - n_bytes: Option, - ) -> Path { + async fn place_test_object(store: &ThrottledStore, n_bytes: Option) -> Path { let path = Path::from("foo"); if let Some(n_bytes) = n_bytes { @@ -506,10 +490,7 @@ mod tests { } #[allow(dead_code)] - async fn place_test_objects( - store: &ThrottledStore, - n_entries: usize, - ) -> Path { + async fn place_test_objects(store: &ThrottledStore, n_entries: usize) -> Path { let prefix = Path::from("foo"); // clean up store @@ -530,10 +511,7 @@ mod tests { prefix } - async fn measure_delete( - store: &ThrottledStore, - n_bytes: Option, - ) -> Duration { + async fn measure_delete(store: &ThrottledStore, n_bytes: Option) -> Duration { let path = place_test_object(store, n_bytes).await; let t0 = Instant::now(); @@ -543,10 +521,7 @@ mod tests { } #[allow(dead_code)] - async fn measure_get( - store: &ThrottledStore, - n_bytes: Option, - ) -> Duration { + async fn measure_get(store: &ThrottledStore, n_bytes: Option) -> Duration { let path = place_test_object(store, n_bytes).await; let t0 = Instant::now(); 
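// Aside: a small sketch of the simulated-latency computation in ThrottledStore::get_range
// from the throttle.rs hunk above: a fixed per-call delay plus a per-byte delay scaled by
// the length of the requested range. Only the field names from the diff are reused.

use std::time::Duration;

struct ThrottleConfig {
    wait_get_per_call: Duration,
    wait_get_per_byte: Duration,
}

fn get_range_delay(config: &ThrottleConfig, range_len: usize) -> Duration {
    config.wait_get_per_call + config.wait_get_per_byte * range_len as u32
}

fn main() {
    let config = ThrottleConfig {
        wait_get_per_call: Duration::from_millis(10),
        wait_get_per_byte: Duration::from_nanos(100),
    };
    // A 1 MiB ranged GET is delayed roughly 10 ms + 1 MiB * 100 ns/byte, about 115 ms.
    println!("{:?}", get_range_delay(&config, 1024 * 1024));
}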
@@ -570,10 +545,7 @@ mod tests { } #[allow(dead_code)] - async fn measure_list( - store: &ThrottledStore, - n_entries: usize, - ) -> Duration { + async fn measure_list(store: &ThrottledStore, n_entries: usize) -> Duration { let prefix = place_test_objects(store, n_entries).await; let t0 = Instant::now(); diff --git a/object_store/src/util.rs b/object_store/src/util.rs index 764582a67f95..fd86ba7366b0 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -32,25 +32,19 @@ where D: serde::Deserializer<'de>, { let s: String = serde::Deserialize::deserialize(deserializer)?; - let naive = chrono::NaiveDateTime::parse_from_str(&s, RFC1123_FMT) - .map_err(serde::de::Error::custom)?; + let naive = + chrono::NaiveDateTime::parse_from_str(&s, RFC1123_FMT).map_err(serde::de::Error::custom)?; Ok(chrono::TimeZone::from_utc_datetime(&chrono::Utc, &naive)) } #[cfg(any(feature = "aws", feature = "azure"))] -pub(crate) fn hmac_sha256( - secret: impl AsRef<[u8]>, - bytes: impl AsRef<[u8]>, -) -> ring::hmac::Tag { +pub(crate) fn hmac_sha256(secret: impl AsRef<[u8]>, bytes: impl AsRef<[u8]>) -> ring::hmac::Tag { let key = ring::hmac::Key::new(ring::hmac::HMAC_SHA256, secret.as_ref()); ring::hmac::sign(&key, bytes.as_ref()) } /// Collect a stream into [`Bytes`] avoiding copying in the event of a single chunk -pub async fn collect_bytes( - mut stream: S, - size_hint: Option, -) -> Result +pub async fn collect_bytes(mut stream: S, size_hint: Option) -> Result where E: Send, S: Stream> + Send + Unpin, @@ -136,10 +130,7 @@ where } /// Returns a sorted list of ranges that cover `ranges` -fn merge_ranges( - ranges: &[std::ops::Range], - coalesce: usize, -) -> Vec> { +fn merge_ranges(ranges: &[std::ops::Range], coalesce: usize) -> Vec> { if ranges.is_empty() { return vec![]; } diff --git a/object_store/tests/get_range_file.rs b/object_store/tests/get_range_file.rs index 5703d7f24844..3fa1cc7104b3 100644 --- a/object_store/tests/get_range_file.rs +++ b/object_store/tests/get_range_file.rs @@ -51,11 +51,7 @@ impl ObjectStore for MyStore { todo!() } - async fn abort_multipart( - &self, - _: &Path, - _: &MultipartId, - ) -> object_store::Result<()> { + async fn abort_multipart(&self, _: &Path, _: &MultipartId) -> object_store::Result<()> { todo!() } @@ -79,10 +75,7 @@ impl ObjectStore for MyStore { todo!() } - async fn list_with_delimiter( - &self, - _: Option<&Path>, - ) -> object_store::Result { + async fn list_with_delimiter(&self, _: Option<&Path>) -> object_store::Result { todo!() } diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 825c7f00f905..7eed86d2826e 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -174,8 +174,7 @@ where max_def_level }; if def_level == max_def_level { - let value = - FromPrimitive::from_usize(rng.gen_range(min..max)).unwrap(); + let value = FromPrimitive::from_usize(rng.gen_range(min..max)).unwrap(); values.push(value); } def_levels.push(def_level); @@ -283,10 +282,8 @@ fn build_plain_encoded_string_page_iterator( max_def_level }; if def_level == max_def_level { - let string_value = - format!("Test value {k}, row group: {i}, page: {j}"); - values - .push(parquet::data_type::ByteArray::from(string_value.as_str())); + let string_value = format!("Test value {k}, row group: {i}, page: {j}"); + values.push(parquet::data_type::ByteArray::from(string_value.as_str())); } def_levels.push(def_level); } @@ -334,8 +331,7 @@ fn build_dictionary_encoded_string_page_iterator( }; if def_level == max_def_level { // 
select random value from list of unique values - let string_value = - unique_values[rng.gen_range(0..NUM_UNIQUE_VALUES)].as_str(); + let string_value = unique_values[rng.gen_range(0..NUM_UNIQUE_VALUES)].as_str(); values.push(parquet::data_type::ByteArray::from(string_value)); } def_levels.push(def_level); @@ -383,8 +379,7 @@ fn build_string_list_page_iterator( let mut column_chunk_pages = Vec::new(); for j in 0..PAGES_PER_GROUP { // generate page - let mut values: Vec = - Vec::with_capacity(VALUES_PER_PAGE * MAX_LIST_LEN); + let mut values: Vec = Vec::with_capacity(VALUES_PER_PAGE * MAX_LIST_LEN); let mut def_levels = Vec::with_capacity(VALUES_PER_PAGE * MAX_LIST_LEN); let mut rep_levels = Vec::with_capacity(VALUES_PER_PAGE * MAX_LIST_LEN); for k in 0..VALUES_PER_PAGE { @@ -409,8 +404,7 @@ fn build_string_list_page_iterator( def_levels.push(2); } else { def_levels.push(3); - let value = - format!("Test value {k}[{l}], row group: {i}, page: {j}"); + let value = format!("Test value {k}[{l}], row group: {i}, page: {j}"); values.push(value.as_str().into()); } } @@ -470,21 +464,15 @@ fn create_primitive_array_reader( use parquet::arrow::array_reader::PrimitiveArrayReader; match column_desc.physical_type() { Type::INT32 => { - let reader = PrimitiveArrayReader::::new( - Box::new(page_iterator), - column_desc, - None, - ) - .unwrap(); + let reader = + PrimitiveArrayReader::::new(Box::new(page_iterator), column_desc, None) + .unwrap(); Box::new(reader) } Type::INT64 => { - let reader = PrimitiveArrayReader::::new( - Box::new(page_iterator), - column_desc, - None, - ) - .unwrap(); + let reader = + PrimitiveArrayReader::::new(Box::new(page_iterator), column_desc, None) + .unwrap(); Box::new(reader) } _ => unreachable!(), @@ -501,8 +489,7 @@ fn create_decimal_by_bytes_reader( make_byte_array_reader(Box::new(page_iterator), column_desc, None).unwrap() } Type::FIXED_LEN_BYTE_ARRAY => { - make_fixed_len_byte_array_reader(Box::new(page_iterator), column_desc, None) - .unwrap() + make_fixed_len_byte_array_reader(Box::new(page_iterator), column_desc, None).unwrap() } _ => unimplemented!(), } @@ -520,15 +507,10 @@ fn create_string_byte_array_dictionary_reader( column_desc: ColumnDescPtr, ) -> Box { use parquet::arrow::array_reader::make_byte_array_dictionary_reader; - let arrow_type = - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); - - make_byte_array_dictionary_reader( - Box::new(page_iterator), - column_desc, - Some(arrow_type), - ) - .unwrap() + let arrow_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); + + make_byte_array_dictionary_reader(Box::new(page_iterator), column_desc, Some(arrow_type)) + .unwrap() } fn create_string_list_reader( @@ -564,10 +546,8 @@ fn bench_byte_decimal( ); group.bench_function("plain encoded, mandatory, no NULLs", |b| { b.iter(|| { - let array_reader = create_decimal_by_bytes_reader( - data.clone(), - mandatory_column_desc.clone(), - ); + let array_reader = + create_decimal_by_bytes_reader(data.clone(), mandatory_column_desc.clone()); count = bench_array_reader(array_reader); }); assert_eq!(count, EXPECTED_VALUE_COUNT); @@ -582,10 +562,8 @@ fn bench_byte_decimal( ); group.bench_function("plain encoded, optional, no NULLs", |b| { b.iter(|| { - let array_reader = create_decimal_by_bytes_reader( - data.clone(), - optional_column_desc.clone(), - ); + let array_reader = + create_decimal_by_bytes_reader(data.clone(), optional_column_desc.clone()); count = bench_array_reader(array_reader); }); assert_eq!(count, 
EXPECTED_VALUE_COUNT); @@ -601,10 +579,8 @@ fn bench_byte_decimal( ); group.bench_function("plain encoded, optional, half NULLs", |b| { b.iter(|| { - let array_reader = create_decimal_by_bytes_reader( - data.clone(), - optional_column_desc.clone(), - ); + let array_reader = + create_decimal_by_bytes_reader(data.clone(), optional_column_desc.clone()); count = bench_array_reader(array_reader); }); assert_eq!(count, EXPECTED_VALUE_COUNT); @@ -633,10 +609,8 @@ fn bench_primitive( ); group.bench_function("plain encoded, mandatory, no NULLs", |b| { b.iter(|| { - let array_reader = create_primitive_array_reader( - data.clone(), - mandatory_column_desc.clone(), - ); + let array_reader = + create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); count = bench_array_reader(array_reader); }); assert_eq!(count, EXPECTED_VALUE_COUNT); @@ -685,10 +659,8 @@ fn bench_primitive( ); group.bench_function("binary packed, mandatory, no NULLs", |b| { b.iter(|| { - let array_reader = create_primitive_array_reader( - data.clone(), - mandatory_column_desc.clone(), - ); + let array_reader = + create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); count = bench_array_reader(array_reader); }); assert_eq!(count, EXPECTED_VALUE_COUNT); @@ -720,10 +692,8 @@ fn bench_primitive( ); group.bench_function("binary packed skip, mandatory, no NULLs", |b| { b.iter(|| { - let array_reader = create_primitive_array_reader( - data.clone(), - mandatory_column_desc.clone(), - ); + let array_reader = + create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); count = bench_array_reader_skip(array_reader); }); assert_eq!(count, EXPECTED_VALUE_COUNT); @@ -763,25 +733,19 @@ fn bench_primitive( }); // dictionary encoded, no NULLs - let data = build_dictionary_encoded_primitive_page_iterator::( - mandatory_column_desc.clone(), - 0.0, - ); + let data = + build_dictionary_encoded_primitive_page_iterator::(mandatory_column_desc.clone(), 0.0); group.bench_function("dictionary encoded, mandatory, no NULLs", |b| { b.iter(|| { - let array_reader = create_primitive_array_reader( - data.clone(), - mandatory_column_desc.clone(), - ); + let array_reader = + create_primitive_array_reader(data.clone(), mandatory_column_desc.clone()); count = bench_array_reader(array_reader); }); assert_eq!(count, EXPECTED_VALUE_COUNT); }); - let data = build_dictionary_encoded_primitive_page_iterator::( - optional_column_desc.clone(), - 0.0, - ); + let data = + build_dictionary_encoded_primitive_page_iterator::(optional_column_desc.clone(), 0.0); group.bench_function("dictionary encoded, optional, no NULLs", |b| { b.iter(|| { let array_reader = @@ -792,10 +756,8 @@ fn bench_primitive( }); // dictionary encoded, half NULLs - let data = build_dictionary_encoded_primitive_page_iterator::( - optional_column_desc.clone(), - 0.5, - ); + let data = + build_dictionary_encoded_primitive_page_iterator::(optional_column_desc.clone(), 0.5); group.bench_function("dictionary encoded, optional, half NULLs", |b| { b.iter(|| { let array_reader = @@ -850,8 +812,7 @@ fn decimal_benches(c: &mut Criterion) { ); group.finish(); - let mut group = - c.benchmark_group("arrow_array_reader/FIXED_LENGTH_BYTE_ARRAY/Decimal128Array"); + let mut group = c.benchmark_group("arrow_array_reader/FIXED_LENGTH_BYTE_ARRAY/Decimal128Array"); let mandatory_decimal4_leaf_desc = schema.column(12); let optional_decimal4_leaf_desc = schema.column(13); bench_byte_decimal::( @@ -909,10 +870,8 @@ fn add_benches(c: &mut Criterion) { let mut group = 
c.benchmark_group("arrow_array_reader/StringArray"); // string, plain encoded, no NULLs - let plain_string_no_null_data = build_plain_encoded_string_page_iterator( - mandatory_string_column_desc.clone(), - 0.0, - ); + let plain_string_no_null_data = + build_plain_encoded_string_page_iterator(mandatory_string_column_desc.clone(), 0.0); group.bench_function("plain encoded, mandatory, no NULLs", |b| { b.iter(|| { let array_reader = create_string_byte_array_reader( @@ -924,10 +883,8 @@ fn add_benches(c: &mut Criterion) { assert_eq!(count, EXPECTED_VALUE_COUNT); }); - let plain_string_no_null_data = build_plain_encoded_string_page_iterator( - optional_string_column_desc.clone(), - 0.0, - ); + let plain_string_no_null_data = + build_plain_encoded_string_page_iterator(optional_string_column_desc.clone(), 0.0); group.bench_function("plain encoded, optional, no NULLs", |b| { b.iter(|| { let array_reader = create_string_byte_array_reader( @@ -940,10 +897,8 @@ fn add_benches(c: &mut Criterion) { }); // string, plain encoded, half NULLs - let plain_string_half_null_data = build_plain_encoded_string_page_iterator( - optional_string_column_desc.clone(), - 0.5, - ); + let plain_string_half_null_data = + build_plain_encoded_string_page_iterator(optional_string_column_desc.clone(), 0.5); group.bench_function("plain encoded, optional, half NULLs", |b| { b.iter(|| { let array_reader = create_string_byte_array_reader( @@ -956,10 +911,8 @@ fn add_benches(c: &mut Criterion) { }); // string, dictionary encoded, no NULLs - let dictionary_string_no_null_data = build_dictionary_encoded_string_page_iterator( - mandatory_string_column_desc.clone(), - 0.0, - ); + let dictionary_string_no_null_data = + build_dictionary_encoded_string_page_iterator(mandatory_string_column_desc.clone(), 0.0); group.bench_function("dictionary encoded, mandatory, no NULLs", |b| { b.iter(|| { let array_reader = create_string_byte_array_reader( @@ -971,10 +924,8 @@ fn add_benches(c: &mut Criterion) { assert_eq!(count, EXPECTED_VALUE_COUNT); }); - let dictionary_string_no_null_data = build_dictionary_encoded_string_page_iterator( - optional_string_column_desc.clone(), - 0.0, - ); + let dictionary_string_no_null_data = + build_dictionary_encoded_string_page_iterator(optional_string_column_desc.clone(), 0.0); group.bench_function("dictionary encoded, optional, no NULLs", |b| { b.iter(|| { let array_reader = create_string_byte_array_reader( @@ -987,10 +938,8 @@ fn add_benches(c: &mut Criterion) { }); // string, dictionary encoded, half NULLs - let dictionary_string_half_null_data = build_dictionary_encoded_string_page_iterator( - optional_string_column_desc.clone(), - 0.5, - ); + let dictionary_string_half_null_data = + build_dictionary_encoded_string_page_iterator(optional_string_column_desc.clone(), 0.5); group.bench_function("dictionary encoded, optional, half NULLs", |b| { b.iter(|| { let array_reader = create_string_byte_array_reader( @@ -1051,8 +1000,7 @@ fn add_benches(c: &mut Criterion) { let mut group = c.benchmark_group("arrow_array_reader/ListArray"); group.bench_function("plain encoded optional strings no NULLs", |b| { b.iter(|| { - let reader = - create_string_list_reader(list_data.clone(), string_list_desc.clone()); + let reader = create_string_list_reader(list_data.clone(), string_list_desc.clone()); count = bench_array_reader(reader); }); assert_eq!(count, EXPECTED_VALUE_COUNT); @@ -1060,8 +1008,7 @@ fn add_benches(c: &mut Criterion) { let list_data = build_string_list_page_iterator(string_list_desc.clone(), 0.5); 
group.bench_function("plain encoded optional strings half NULLs", |b| { b.iter(|| { - let reader = - create_string_list_reader(list_data.clone(), string_list_desc.clone()); + let reader = create_string_list_reader(list_data.clone(), string_list_desc.clone()); count = bench_array_reader(reader); }); assert_eq!(count, EXPECTED_VALUE_COUNT); diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index a494d9a97791..b84e897db2f3 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -311,10 +311,7 @@ fn write_batch_enable_bloom_filter(batch: &RecordBatch) -> Result<()> { } #[inline] -fn write_batch_with_option( - batch: &RecordBatch, - props: Option, -) -> Result<()> { +fn write_batch_with_option(batch: &RecordBatch, props: Option) -> Result<()> { let path = env::temp_dir().join("arrow_writer.temp"); let file = File::create(path).unwrap(); let mut writer = ArrowWriter::try_new(file, batch.schema(), props)?; diff --git a/parquet/benches/compression.rs b/parquet/benches/compression.rs index ce4f9aead751..2275a89405d9 100644 --- a/parquet/benches/compression.rs +++ b/parquet/benches/compression.rs @@ -57,11 +57,7 @@ fn do_bench(c: &mut Criterion, name: &str, uncompressed: &[u8]) { b.iter(|| { let mut out = Vec::new(); codec - .decompress( - black_box(&compressed), - &mut out, - Some(uncompressed.len()), - ) + .decompress(black_box(&compressed), &mut out, Some(uncompressed.len())) .unwrap(); out }); @@ -89,8 +85,7 @@ fn criterion_benchmark(c: &mut Criterion) { let mut uncompressed = Vec::with_capacity(DATA_SIZE); while uncompressed.len() < DATA_SIZE { let word = &words[rng.gen_range(0..words.len())]; - uncompressed - .extend_from_slice(&word[..word.len().min(DATA_SIZE - uncompressed.len())]) + uncompressed.extend_from_slice(&word[..word.len().min(DATA_SIZE - uncompressed.len())]) } assert_eq!(uncompressed.len(), DATA_SIZE); diff --git a/parquet/examples/read_with_rowgroup.rs b/parquet/examples/read_with_rowgroup.rs index b2d113d50529..8cccc7fe14ac 100644 --- a/parquet/examples/read_with_rowgroup.rs +++ b/parquet/examples/read_with_rowgroup.rs @@ -116,13 +116,12 @@ impl RowGroups for InMemoryRowGroup { "Invalid column index {i}, column was not fetched" ))), Some(data) => { - let page_reader: Box = - Box::new(SerializedPageReader::new( - data.clone(), - self.metadata.column(i), - self.num_rows(), - None, - )?); + let page_reader: Box = Box::new(SerializedPageReader::new( + data.clone(), + self.metadata.column(i), + self.num_rows(), + None, + )?); Ok(Box::new(ColumnChunkIterator { reader: Some(Ok(page_reader)), @@ -154,9 +153,7 @@ impl InMemoryRowGroup { None, )?; - ParquetRecordBatchReader::try_new_with_row_groups( - &levels, self, batch_size, selection, - ) + ParquetRecordBatchReader::try_new_with_row_groups(&levels, self, batch_size, selection) } /// fetch data from a reader in sync mode diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 2acc0faf130f..16cdf2934e6f 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -112,8 +112,7 @@ impl ArrowReaderBuilder { /// If the batch_size more than the file row count, use the file row count. 
pub fn with_batch_size(self, batch_size: usize) -> Self { // Try to avoid allocate large buffer - let batch_size = - batch_size.min(self.metadata.file_metadata().num_rows() as usize); + let batch_size = batch_size.min(self.metadata.file_metadata().num_rows() as usize); Self { batch_size, ..self } } @@ -407,11 +406,8 @@ impl ParquetRecordBatchReaderBuilder { break; } - let array_reader = build_array_reader( - self.fields.as_deref(), - predicate.projection(), - &reader, - )?; + let array_reader = + build_array_reader(self.fields.as_deref(), predicate.projection(), &reader)?; selection = Some(evaluate_predicate( batch_size, @@ -422,8 +418,7 @@ impl ParquetRecordBatchReaderBuilder { } } - let array_reader = - build_array_reader(self.fields.as_deref(), &self.projection, &reader)?; + let array_reader = build_array_reader(self.fields.as_deref(), &self.projection, &reader)?; // If selection is empty, truncate if !selects_any(selection.as_ref()) { @@ -514,11 +509,10 @@ impl Iterator for ParquetRecordBatchReader { while read_records < self.batch_size && !selection.is_empty() { let front = selection.pop_front().unwrap(); if front.skip { - let skipped = - match self.array_reader.skip_records(front.row_count) { - Ok(skipped) => skipped, - Err(e) => return Some(Err(e.into())), - }; + let skipped = match self.array_reader.skip_records(front.row_count) { + Ok(skipped) => skipped, + Err(e) => return Some(Err(e.into())), + }; if skipped != front.row_count { return Some(Err(general_err!( @@ -590,10 +584,7 @@ impl ParquetRecordBatchReader { /// Create a new [`ParquetRecordBatchReader`] from the provided chunk reader /// /// See [`ParquetRecordBatchReaderBuilder`] for more options - pub fn try_new( - reader: T, - batch_size: usize, - ) -> Result { + pub fn try_new(reader: T, batch_size: usize) -> Result { ParquetRecordBatchReaderBuilder::try_new(reader)? 
.with_batch_size(batch_size) .build() @@ -609,11 +600,8 @@ impl ParquetRecordBatchReader { batch_size: usize, selection: Option, ) -> Result { - let array_reader = build_array_reader( - levels.levels.as_ref(), - &ProjectionMask::all(), - row_groups, - )?; + let array_reader = + build_array_reader(levels.levels.as_ref(), &ProjectionMask::all(), row_groups)?; Ok(Self { batch_size, @@ -696,8 +684,7 @@ pub(crate) fn evaluate_predicate( input_selection: Option, predicate: &mut dyn ArrowPredicate, ) -> Result { - let reader = - ParquetRecordBatchReader::new(batch_size, array_reader, input_selection.clone()); + let reader = ParquetRecordBatchReader::new(batch_size, array_reader, input_selection.clone()); let mut filters = vec![]; for maybe_batch in reader { let filter = predicate.evaluate(maybe_batch?)?; @@ -748,8 +735,8 @@ mod tests { use crate::basic::{ConvertedType, Encoding, Repetition, Type as PhysicalType}; use crate::column::reader::decoder::REPETITION_LEVELS_BATCH_SIZE; use crate::data_type::{ - BoolType, ByteArray, ByteArrayType, DataType, FixedLenByteArray, - FixedLenByteArrayType, Int32Type, Int64Type, Int96Type, + BoolType, ByteArray, ByteArrayType, DataType, FixedLenByteArray, FixedLenByteArrayType, + Int32Type, Int64Type, Int96Type, }; use crate::errors::Result; use crate::file::properties::{EnabledStatistics, WriterProperties, WriterVersion}; @@ -921,8 +908,7 @@ mod tests { writer.write(&original).unwrap(); writer.close().unwrap(); - let mut reader = - ParquetRecordBatchReader::try_new(Bytes::from(buf), 1024).unwrap(); + let mut reader = ParquetRecordBatchReader::try_new(Bytes::from(buf), 1024).unwrap(); let ret = reader.next().unwrap().unwrap(); assert_eq!(ret, original); @@ -978,9 +964,8 @@ mod tests { Arc::new( vals.iter() .map(|x| { - x.as_ref().map(|b| { - i64::from_le_bytes(b.as_ref()[4..12].try_into().unwrap()) - }) + x.as_ref() + .map(|b| i64::from_le_bytes(b.as_ref()[4..12].try_into().unwrap())) }) .collect::(), ) @@ -1070,10 +1055,8 @@ mod tests { let mut opts = TestOptions::new(2, 20, 15).with_null_percent(50); opts.encoding = *encoding; - let data_type = ArrowDataType::Dictionary( - Box::new(key.clone()), - Box::new(ArrowDataType::Utf8), - ); + let data_type = + ArrowDataType::Dictionary(Box::new(key.clone()), Box::new(ArrowDataType::Utf8)); // Cannot run full test suite as keys overflow, run small test instead single_column_reader_test::( @@ -1099,10 +1082,8 @@ mod tests { ]; for key in &key_types { - let data_type = ArrowDataType::Dictionary( - Box::new(key.clone()), - Box::new(ArrowDataType::Utf8), - ); + let data_type = + ArrowDataType::Dictionary(Box::new(key.clone()), Box::new(ArrowDataType::Utf8)); run_single_column_reader_tests::( 2, @@ -1140,27 +1121,23 @@ mod tests { [1, 2, 3, 4, 5, 6, 7, 8].into_iter().map(i256::from_i128), ); - let data = - ArrayDataBuilder::new(ArrowDataType::Struct(Fields::from(vec![Field::new( - "decimals", - decimals.data_type().clone(), - false, - )]))) - .len(8) - .null_bit_buffer(Some(Buffer::from(&[0b11101111]))) - .child_data(vec![decimals.into_data()]) - .build() - .unwrap(); - - let written = RecordBatch::try_from_iter([( - "struct", - Arc::new(StructArray::from(data)) as ArrayRef, - )]) + let data = ArrayDataBuilder::new(ArrowDataType::Struct(Fields::from(vec![Field::new( + "decimals", + decimals.data_type().clone(), + false, + )]))) + .len(8) + .null_bit_buffer(Some(Buffer::from(&[0b11101111]))) + .child_data(vec![decimals.into_data()]) + .build() .unwrap(); + let written = + RecordBatch::try_from_iter([("struct", 
Arc::new(StructArray::from(data)) as ArrayRef)]) + .unwrap(); + let mut buffer = Vec::with_capacity(1024); - let mut writer = - ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); + let mut writer = ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); writer.write(&written).unwrap(); writer.close().unwrap(); @@ -1177,27 +1154,23 @@ mod tests { #[test] fn test_int32_nullable_struct() { let int32 = Int32Array::from_iter_values([1, 2, 3, 4, 5, 6, 7, 8]); - let data = - ArrayDataBuilder::new(ArrowDataType::Struct(Fields::from(vec![Field::new( - "int32", - int32.data_type().clone(), - false, - )]))) - .len(8) - .null_bit_buffer(Some(Buffer::from(&[0b11101111]))) - .child_data(vec![int32.into_data()]) - .build() - .unwrap(); - - let written = RecordBatch::try_from_iter([( - "struct", - Arc::new(StructArray::from(data)) as ArrayRef, - )]) + let data = ArrayDataBuilder::new(ArrowDataType::Struct(Fields::from(vec![Field::new( + "int32", + int32.data_type().clone(), + false, + )]))) + .len(8) + .null_bit_buffer(Some(Buffer::from(&[0b11101111]))) + .child_data(vec![int32.into_data()]) + .build() .unwrap(); + let written = + RecordBatch::try_from_iter([("struct", Arc::new(StructArray::from(data)) as ArrayRef)]) + .unwrap(); + let mut buffer = Vec::with_capacity(1024); - let mut writer = - ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); + let mut writer = ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); writer.write(&written).unwrap(); writer.close().unwrap(); @@ -1229,15 +1202,12 @@ mod tests { .build() .unwrap(); - let written = RecordBatch::try_from_iter([( - "list", - Arc::new(ListArray::from(data)) as ArrayRef, - )]) - .unwrap(); + let written = + RecordBatch::try_from_iter([("list", Arc::new(ListArray::from(data)) as ArrayRef)]) + .unwrap(); let mut buffer = Vec::with_capacity(1024); - let mut writer = - ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); + let mut writer = ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); writer.write(&written).unwrap(); writer.close().unwrap(); @@ -1408,11 +1378,8 @@ mod tests { let mut rng = thread_rng(); let step = rng.gen_range(self.record_batch_size..self.num_rows); - let row_selections = create_test_selection( - step, - self.num_row_groups * self.num_rows, - rng.gen::(), - ); + let row_selections = + create_test_selection(step, self.num_row_groups * self.num_rows, rng.gen::()); Self { row_selections: Some(row_selections), ..self @@ -1598,8 +1565,7 @@ mod tests { ]; all_options.into_iter().for_each(|opts| { - for writer_version in [WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] - { + for writer_version in [WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] { for encoding in encodings { let opts = TestOptions { writer_version, @@ -1715,8 +1681,7 @@ mod tests { let expected_data = match opts.row_selections { Some((selections, row_count)) => { - let mut without_skip_data = - gen_expected_data::(def_levels.as_ref(), &values); + let mut without_skip_data = gen_expected_data::(def_levels.as_ref(), &values); let mut skip_data: Vec> = vec![]; let dequeue: VecDeque = selections.clone().into(); @@ -1956,12 +1921,9 @@ mod tests { { // Write using low-level parquet API (#1167) - let mut writer = SerializedFileWriter::new( - file.try_clone().unwrap(), - schema, - Default::default(), - ) - .unwrap(); + let mut writer = + SerializedFileWriter::new(file.try_clone().unwrap(), schema, Default::default()) + .unwrap(); { let mut row_group_writer = 
writer.next_row_group().unwrap(); @@ -1986,9 +1948,7 @@ mod tests { let expected_schema = Schema::new(Fields::from(vec![Field::new( "group", - ArrowDataType::Struct( - vec![Field::new("leaf", ArrowDataType::Int32, false)].into(), - ), + ArrowDataType::Struct(vec![Field::new("leaf", ArrowDataType::Int32, false)].into()), true, )])); @@ -2002,24 +1962,22 @@ mod tests { fn test_invalid_utf8() { // a parquet file with 1 column with invalid utf8 let data = vec![ - 80, 65, 82, 49, 21, 6, 21, 22, 21, 22, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, - 21, 0, 18, 28, 54, 0, 40, 5, 104, 101, 255, 108, 111, 24, 5, 104, 101, 255, - 108, 111, 0, 0, 0, 3, 1, 5, 0, 0, 0, 104, 101, 255, 108, 111, 38, 110, 28, - 21, 12, 25, 37, 6, 0, 25, 24, 2, 99, 49, 21, 0, 22, 2, 22, 102, 22, 102, 38, - 8, 60, 54, 0, 40, 5, 104, 101, 255, 108, 111, 24, 5, 104, 101, 255, 108, 111, - 0, 0, 0, 21, 4, 25, 44, 72, 4, 114, 111, 111, 116, 21, 2, 0, 21, 12, 37, 2, - 24, 2, 99, 49, 37, 0, 76, 28, 0, 0, 0, 22, 2, 25, 28, 25, 28, 38, 110, 28, - 21, 12, 25, 37, 6, 0, 25, 24, 2, 99, 49, 21, 0, 22, 2, 22, 102, 22, 102, 38, - 8, 60, 54, 0, 40, 5, 104, 101, 255, 108, 111, 24, 5, 104, 101, 255, 108, 111, - 0, 0, 0, 22, 102, 22, 2, 0, 40, 44, 65, 114, 114, 111, 119, 50, 32, 45, 32, - 78, 97, 116, 105, 118, 101, 32, 82, 117, 115, 116, 32, 105, 109, 112, 108, - 101, 109, 101, 110, 116, 97, 116, 105, 111, 110, 32, 111, 102, 32, 65, 114, - 114, 111, 119, 0, 130, 0, 0, 0, 80, 65, 82, 49, + 80, 65, 82, 49, 21, 6, 21, 22, 21, 22, 92, 21, 2, 21, 0, 21, 2, 21, 0, 21, 4, 21, 0, + 18, 28, 54, 0, 40, 5, 104, 101, 255, 108, 111, 24, 5, 104, 101, 255, 108, 111, 0, 0, 0, + 3, 1, 5, 0, 0, 0, 104, 101, 255, 108, 111, 38, 110, 28, 21, 12, 25, 37, 6, 0, 25, 24, + 2, 99, 49, 21, 0, 22, 2, 22, 102, 22, 102, 38, 8, 60, 54, 0, 40, 5, 104, 101, 255, 108, + 111, 24, 5, 104, 101, 255, 108, 111, 0, 0, 0, 21, 4, 25, 44, 72, 4, 114, 111, 111, 116, + 21, 2, 0, 21, 12, 37, 2, 24, 2, 99, 49, 37, 0, 76, 28, 0, 0, 0, 22, 2, 25, 28, 25, 28, + 38, 110, 28, 21, 12, 25, 37, 6, 0, 25, 24, 2, 99, 49, 21, 0, 22, 2, 22, 102, 22, 102, + 38, 8, 60, 54, 0, 40, 5, 104, 101, 255, 108, 111, 24, 5, 104, 101, 255, 108, 111, 0, 0, + 0, 22, 102, 22, 2, 0, 40, 44, 65, 114, 114, 111, 119, 50, 32, 45, 32, 78, 97, 116, 105, + 118, 101, 32, 82, 117, 115, 116, 32, 105, 109, 112, 108, 101, 109, 101, 110, 116, 97, + 116, 105, 111, 110, 32, 111, 102, 32, 65, 114, 114, 111, 119, 0, 130, 0, 0, 0, 80, 65, + 82, 49, ]; let file = Bytes::from(data); - let mut record_batch_reader = - ParquetRecordBatchReader::try_new(file, 10).unwrap(); + let mut record_batch_reader = ParquetRecordBatchReader::try_new(file, 10).unwrap(); let error = record_batch_reader.next().unwrap().unwrap_err(); @@ -2111,8 +2069,7 @@ mod tests { vec![(3, 2), (3, 2), (3, 1), (3, 1), (3, 2), (2, 2)] ); - let get_dict = - |batch: &RecordBatch| batch.column(0).to_data().child_data()[0].clone(); + let get_dict = |batch: &RecordBatch| batch.column(0).to_data().child_data()[0].clone(); // First and second batch in same row group -> same dictionary assert_eq!(get_dict(&batches[0]), get_dict(&batches[1])); @@ -2129,8 +2086,7 @@ mod tests { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{testdata}/null_list.parquet"); let file = File::open(path).unwrap(); - let mut record_batch_reader = - ParquetRecordBatchReader::try_new(file, 60).unwrap(); + let mut record_batch_reader = ParquetRecordBatchReader::try_new(file, 60).unwrap(); let batch = record_batch_reader.next().unwrap().unwrap(); assert_eq!(batch.num_rows(), 1); 
@@ -2162,8 +2118,7 @@ mod tests { ); let options = ArrowReaderOptions::new().with_skip_arrow_metadata(true); - let builder = - ParquetRecordBatchReaderBuilder::try_new_with_options(file, options).unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options).unwrap(); let schema = builder.schema(); assert_eq!(schema.fields().len(), 1); assert_eq!(schema.field(0), &arrow_field); @@ -2180,14 +2135,12 @@ mod tests { .into_iter() .collect(); - let schema_with_metadata = - Arc::new(Schema::new(vec![field.with_metadata(metadata)])); + let schema_with_metadata = Arc::new(Schema::new(vec![field.with_metadata(metadata)])); assert_ne!(schema_with_metadata, schema_without_metadata); let batch = - RecordBatch::try_new(schema_with_metadata.clone(), vec![col as ArrayRef]) - .unwrap(); + RecordBatch::try_new(schema_with_metadata.clone(), vec![col as ArrayRef]).unwrap(); let file = |version: WriterVersion| { let props = WriterProperties::builder() @@ -2195,12 +2148,9 @@ mod tests { .build(); let file = tempfile().unwrap(); - let mut writer = ArrowWriter::try_new( - file.try_clone().unwrap(), - batch.schema(), - Some(props), - ) - .unwrap(); + let mut writer = + ArrowWriter::try_new(file.try_clone().unwrap(), batch.schema(), Some(props)) + .unwrap(); writer.write(&batch).unwrap(); writer.close().unwrap(); file @@ -2212,31 +2162,24 @@ mod tests { let v2_reader = file(WriterVersion::PARQUET_2_0); let arrow_reader = - ParquetRecordBatchReader::try_new(v1_reader.try_clone().unwrap(), 1024) - .unwrap(); + ParquetRecordBatchReader::try_new(v1_reader.try_clone().unwrap(), 1024).unwrap(); assert_eq!(arrow_reader.schema(), schema_with_metadata); - let reader = ParquetRecordBatchReaderBuilder::try_new_with_options( - v1_reader, - skip_options.clone(), - ) - .unwrap() - .build() - .unwrap(); + let reader = + ParquetRecordBatchReaderBuilder::try_new_with_options(v1_reader, skip_options.clone()) + .unwrap() + .build() + .unwrap(); assert_eq!(reader.schema(), schema_without_metadata); let arrow_reader = - ParquetRecordBatchReader::try_new(v2_reader.try_clone().unwrap(), 1024) - .unwrap(); + ParquetRecordBatchReader::try_new(v2_reader.try_clone().unwrap(), 1024).unwrap(); assert_eq!(arrow_reader.schema(), schema_with_metadata); - let reader = ParquetRecordBatchReaderBuilder::try_new_with_options( - v2_reader, - skip_options, - ) - .unwrap() - .build() - .unwrap(); + let reader = ParquetRecordBatchReaderBuilder::try_new_with_options(v2_reader, skip_options) + .unwrap() + .build() + .unwrap(); assert_eq!(reader.schema(), schema_without_metadata); } @@ -2288,16 +2231,12 @@ mod tests { ) .unwrap(); for _ in 0..2 { - let mut list_builder = - ListBuilder::new(Int32Builder::with_capacity(batch_size)); + let mut list_builder = ListBuilder::new(Int32Builder::with_capacity(batch_size)); for _ in 0..(batch_size) { list_builder.append(true); } - let batch = RecordBatch::try_new( - schema.clone(), - vec![Arc::new(list_builder.finish())], - ) - .unwrap(); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(list_builder.finish())]) + .unwrap(); writer.write(&batch).unwrap(); } writer.close().unwrap(); @@ -2359,8 +2298,7 @@ mod tests { match skip { true => { if let Some(last_start) = last_start.take() { - expected_batches - .push(column.slice(last_start, row_offset - last_start)) + expected_batches.push(column.slice(last_start, row_offset - last_start)) } row_offset += to_read } @@ -2424,8 +2362,7 @@ mod tests { let do_test = |batch_size: usize, selection_len: usize| { for skip_first in 
[false, true] { - let selections = - create_test_selection(batch_size, data.num_rows(), skip_first).0; + let selections = create_test_selection(batch_size, data.num_rows(), skip_first).0; let expected = get_expected_batches(&data, &selections, batch_size); let skip_reader = create_skip_reader(&test_file, batch_size, selections); @@ -2734,8 +2671,7 @@ mod tests { .unwrap(); let mut buffer = Vec::with_capacity(1024); - let mut writer = - ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); + let mut writer = ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); writer.write(&written).unwrap(); writer.close().unwrap(); @@ -2763,8 +2699,7 @@ mod tests { .build(); let mut buffer = Vec::with_capacity(1024); - let mut writer = - ArrowWriter::try_new(&mut buffer, batch.schema(), Some(props)).unwrap(); + let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), Some(props)).unwrap(); writer.write(&batch).unwrap(); writer.close().unwrap(); @@ -2809,8 +2744,7 @@ mod tests { writer.write(&batch).unwrap(); writer.close().unwrap(); - let builder = - ParquetRecordBatchReaderBuilder::try_new(Bytes::from(buffer)).unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(buffer)).unwrap(); let t1 = builder.parquet_schema().columns()[0].physical_type(); assert_eq!(t1, PhysicalType::INT32); let t2 = builder.parquet_schema().columns()[1].physical_type(); @@ -2850,11 +2784,9 @@ mod tests { list_a_builder.values().append_value(format!("{i} {j}")); list_a_builder.append(true); } - let batch = RecordBatch::try_new( - schema.clone(), - vec![Arc::new(list_a_builder.finish())], - ) - .unwrap(); + let batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(list_a_builder.finish())]) + .unwrap(); writer.write(&batch).unwrap(); } let _metadata = writer.close().unwrap(); diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index a558f893c43e..5063d24afd5f 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -103,8 +103,7 @@ impl RowSelection { let offset = next_offset; next_offset += filter.len(); assert_eq!(filter.null_count(), 0); - SlicesIterator::new(filter) - .map(move |(start, end)| start + offset..end + offset) + SlicesIterator::new(filter).map(move |(start, end)| start + offset..end + offset) }); Self::from_consecutive_ranges(iter, total_rows) @@ -180,10 +179,7 @@ impl RowSelection { /// Note: this method does not make any effort to combine consecutive ranges, nor coalesce /// ranges that are close together. This is instead delegated to the IO subsystem to optimise, /// e.g. 
[`ObjectStore::get_ranges`](object_store::ObjectStore::get_ranges) - pub fn scan_ranges( - &self, - page_locations: &[crate::format::PageLocation], - ) -> Vec> { + pub fn scan_ranges(&self, page_locations: &[crate::format::PageLocation]) -> Vec> { let mut ranges = vec![]; let mut row_offset = 0; @@ -204,8 +200,7 @@ impl RowSelection { if let Some(next_page) = pages.peek() { if row_offset + selector.row_count > next_page.first_row_index as usize { - let remaining_in_page = - next_page.first_row_index as usize - row_offset; + let remaining_in_page = next_page.first_row_index as usize - row_offset; selector.row_count -= remaining_in_page; row_offset += remaining_in_page; current_page = pages.next(); @@ -213,9 +208,7 @@ impl RowSelection { continue; } else { - if row_offset + selector.row_count - == next_page.first_row_index as usize - { + if row_offset + selector.row_count == next_page.first_row_index as usize { current_page = pages.next(); current_page_included = false; } @@ -472,10 +465,7 @@ impl From for VecDeque { /// other: NYNNNNNNY /// /// returned: NNNNNNNNYYNYN -fn intersect_row_selections( - left: &[RowSelector], - right: &[RowSelector], -) -> Vec { +fn intersect_row_selections(left: &[RowSelector], right: &[RowSelector]) -> Vec { let mut res = Vec::with_capacity(left.len()); let mut l_iter = left.iter().copied().peekable(); let mut r_iter = right.iter().copied().peekable(); @@ -942,8 +932,7 @@ mod tests { } } - let expected = - RowSelection::from_filters(&[BooleanArray::from(expected_bools)]); + let expected = RowSelection::from_filters(&[BooleanArray::from(expected_bools)]); let total_rows: usize = expected.selectors.iter().map(|s| s.row_count).sum(); assert_eq!(a_len, total_rows); @@ -972,8 +961,7 @@ mod tests { #[test] fn test_limit() { // Limit to existing limit should no-op - let selection = - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(90)]); + let selection = RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(90)]); let limited = selection.limit(10); assert_eq!(RowSelection::from(vec![RowSelector::select(10)]), limited); diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index 6dbc83dd05c4..3db2e4a6a063 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -17,9 +17,7 @@ use crate::basic::Encoding; use crate::bloom_filter::Sbbf; -use crate::column::writer::encoder::{ - ColumnValueEncoder, DataPageValues, DictionaryPage, -}; +use crate::column::writer::encoder::{ColumnValueEncoder, DataPageValues, DictionaryPage}; use crate::data_type::{AsBytes, ByteArray, Int32Type}; use crate::encodings::encoding::{DeltaBitPackEncoder, Encoder}; use crate::encodings::rle::RleEncoder; @@ -29,8 +27,8 @@ use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::num_required_bits; use crate::util::interner::{Interner, Storage}; use arrow_array::{ - Array, ArrayAccessor, BinaryArray, DictionaryArray, LargeBinaryArray, - LargeStringArray, StringArray, + Array, ArrayAccessor, BinaryArray, DictionaryArray, LargeBinaryArray, LargeStringArray, + StringArray, }; use arrow_schema::DataType; @@ -119,12 +117,13 @@ impl FallbackEncoder { /// Create the fallback encoder for the given [`ColumnDescPtr`] and [`WriterProperties`] fn new(descr: &ColumnDescPtr, props: &WriterProperties) -> Result { // Set either main encoder or fallback encoder. 
- let encoding = props.encoding(descr.path()).unwrap_or_else(|| { - match props.writer_version() { - WriterVersion::PARQUET_1_0 => Encoding::PLAIN, - WriterVersion::PARQUET_2_0 => Encoding::DELTA_BYTE_ARRAY, - } - }); + let encoding = + props + .encoding(descr.path()) + .unwrap_or_else(|| match props.writer_version() { + WriterVersion::PARQUET_1_0 => Encoding::PLAIN, + WriterVersion::PARQUET_2_0 => Encoding::DELTA_BYTE_ARRAY, + }); let encoder = match encoding { Encoding::PLAIN => FallbackEncoderImpl::Plain { buffer: vec![] }, @@ -232,9 +231,7 @@ impl FallbackEncoder { max_value: Option, ) -> Result> { let (buf, encoding) = match &mut self.encoder { - FallbackEncoderImpl::Plain { buffer } => { - (std::mem::take(buffer), Encoding::PLAIN) - } + FallbackEncoderImpl::Plain { buffer } => (std::mem::take(buffer), Encoding::PLAIN), FallbackEncoderImpl::DeltaLength { buffer, lengths } => { let lengths = lengths.flush_buffer()?; @@ -253,9 +250,8 @@ impl FallbackEncoder { let prefix_lengths = prefix_lengths.flush_buffer()?; let suffix_lengths = suffix_lengths.flush_buffer()?; - let mut out = Vec::with_capacity( - prefix_lengths.len() + suffix_lengths.len() + buffer.len(), - ); + let mut out = + Vec::with_capacity(prefix_lengths.len() + suffix_lengths.len() + buffer.len()); out.extend_from_slice(prefix_lengths.data()); out.extend_from_slice(suffix_lengths.data()); out.extend_from_slice(buffer); @@ -437,12 +433,7 @@ impl ColumnValueEncoder for ByteArrayEncoder { }) } - fn write( - &mut self, - _values: &Self::Values, - _offset: usize, - _len: usize, - ) -> Result<()> { + fn write(&mut self, _values: &Self::Values, _offset: usize, _len: usize) -> Result<()> { unreachable!("should call write_gather instead") } diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 4a0bd551e1f9..df37665ce1fc 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -50,10 +50,7 @@ use std::sync::Arc; /// Performs a depth-first scan of the children of `array`, constructing [`ArrayLevels`] /// for each leaf column encountered -pub(crate) fn calculate_array_levels( - array: &ArrayRef, - field: &Field, -) -> Result> { +pub(crate) fn calculate_array_levels(array: &ArrayRef, field: &Field) -> Result> { let mut builder = LevelInfoBuilder::try_new(field, Default::default(), array)?; builder.write(0..array.len()); Ok(builder.finish()) @@ -134,11 +131,7 @@ enum LevelInfoBuilder { impl LevelInfoBuilder { /// Create a new [`LevelInfoBuilder`] for the given [`Field`] and parent [`LevelContext`] - fn try_new( - field: &Field, - parent_ctx: LevelContext, - array: &ArrayRef, - ) -> Result { + fn try_new(field: &Field, parent_ctx: LevelContext, array: &ArrayRef) -> Result { assert_eq!(field.data_type(), array.data_type()); let is_nullable = field.is_nullable(); @@ -227,9 +220,7 @@ impl LevelInfoBuilder { LevelInfoBuilder::List(v, _, _, _) | LevelInfoBuilder::LargeList(v, _, _, _) | LevelInfoBuilder::FixedSizeList(v, _, _, _) => v.finish(), - LevelInfoBuilder::Struct(v, _, _) => { - v.into_iter().flat_map(|l| l.finish()).collect() - } + LevelInfoBuilder::Struct(v, _, _) => v.into_iter().flat_map(|l| l.finish()).collect(), } } @@ -408,55 +399,52 @@ impl LevelInfoBuilder { nulls: Option<&NullBuffer>, range: Range, ) { - let write_non_null = - |child: &mut LevelInfoBuilder, start_idx: usize, end_idx: usize| { - let values_start = start_idx * fixed_size; - let values_end = end_idx * fixed_size; - child.write(values_start..values_end); + let 
write_non_null = |child: &mut LevelInfoBuilder, start_idx: usize, end_idx: usize| { + let values_start = start_idx * fixed_size; + let values_end = end_idx * fixed_size; + child.write(values_start..values_end); - child.visit_leaves(|leaf| { - let rep_levels = leaf.rep_levels.as_mut().unwrap(); + child.visit_leaves(|leaf| { + let rep_levels = leaf.rep_levels.as_mut().unwrap(); - let row_indices = (0..fixed_size) - .rev() - .cycle() - .take(values_end - values_start); - - // Step backward over the child rep levels and mark the start of each list - rep_levels - .iter_mut() - .rev() - // Filter out reps from nested children - .filter(|&&mut r| r == ctx.rep_level) - .zip(row_indices) - .for_each(|(r, idx)| { - if idx == 0 { - *r = ctx.rep_level - 1; - } - }); - }) - }; + let row_indices = (0..fixed_size) + .rev() + .cycle() + .take(values_end - values_start); + + // Step backward over the child rep levels and mark the start of each list + rep_levels + .iter_mut() + .rev() + // Filter out reps from nested children + .filter(|&&mut r| r == ctx.rep_level) + .zip(row_indices) + .for_each(|(r, idx)| { + if idx == 0 { + *r = ctx.rep_level - 1; + } + }); + }) + }; // If list size is 0, ignore values and just write rep/def levels. - let write_empty = - |child: &mut LevelInfoBuilder, start_idx: usize, end_idx: usize| { - let len = end_idx - start_idx; - child.visit_leaves(|leaf| { - let rep_levels = leaf.rep_levels.as_mut().unwrap(); - rep_levels.extend(std::iter::repeat(ctx.rep_level - 1).take(len)); - let def_levels = leaf.def_levels.as_mut().unwrap(); - def_levels.extend(std::iter::repeat(ctx.def_level - 1).take(len)); - }) - }; + let write_empty = |child: &mut LevelInfoBuilder, start_idx: usize, end_idx: usize| { + let len = end_idx - start_idx; + child.visit_leaves(|leaf| { + let rep_levels = leaf.rep_levels.as_mut().unwrap(); + rep_levels.extend(std::iter::repeat(ctx.rep_level - 1).take(len)); + let def_levels = leaf.def_levels.as_mut().unwrap(); + def_levels.extend(std::iter::repeat(ctx.def_level - 1).take(len)); + }) + }; - let write_rows = - |child: &mut LevelInfoBuilder, start_idx: usize, end_idx: usize| { - if fixed_size > 0 { - write_non_null(child, start_idx, end_idx) - } else { - write_empty(child, start_idx, end_idx) - } - }; + let write_rows = |child: &mut LevelInfoBuilder, start_idx: usize, end_idx: usize| { + if fixed_size > 0 { + write_non_null(child, start_idx, end_idx) + } else { + write_empty(child, start_idx, end_idx) + } + }; match nulls { Some(nulls) => { @@ -1019,10 +1007,7 @@ mod tests { let c = Int32Array::from_iter([Some(1), None, Some(3), None, Some(5), Some(6)]); let leaf = Arc::new(c) as ArrayRef; let c_field = Arc::new(Field::new("c", DataType::Int32, true)); - let b = StructArray::from(( - (vec![(c_field, leaf.clone())]), - Buffer::from([0b00110111]), - )); + let b = StructArray::from(((vec![(c_field, leaf.clone())]), Buffer::from([0b00110111]))); let b_field = Arc::new(Field::new("b", b.data_type().clone(), true)); let a = StructArray::from(( @@ -1053,8 +1038,7 @@ mod tests { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); let a_value_offsets = arrow::buffer::Buffer::from_iter([0_i32, 1, 3, 3, 6, 10]); - let a_list_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let a_list_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let a_list_data = ArrayData::builder(a_list_type.clone()) .len(5) .add_buffer(a_value_offsets) @@ -1109,9 +1093,7 @@ mod tests { Field::new("b", DataType::Int32, true), 
Field::new( "c", - DataType::Struct( - vec![struct_field_d.clone(), struct_field_e.clone()].into(), - ), + DataType::Struct(vec![struct_field_d.clone(), struct_field_e.clone()].into()), true, // https://github.com/apache/arrow-rs/issues/245 ), ]); @@ -1126,8 +1108,7 @@ mod tests { // Construct a buffer for value offsets, for the nested array: // [[1], [2, 3], null, [4, 5, 6], [7, 8, 9, 10]] - let g_value_offsets = - arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); + let g_value_offsets = arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); // Construct a list array from the above two let g_list_data = ArrayData::builder(struct_field_g.data_type().clone()) @@ -1239,8 +1220,7 @@ mod tests { // build a record batch let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(some_nested_object)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(some_nested_object)]).unwrap(); let struct_null_level = calculate_array_levels(batch.column(0), batch.schema().field(0)).unwrap(); @@ -1262,8 +1242,7 @@ mod tests { // build a record batch let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(some_nested_object)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(some_nested_object)]).unwrap(); let struct_non_null_level = calculate_array_levels(batch.column(0), batch.schema().field(0)).unwrap(); @@ -1498,9 +1477,9 @@ mod tests { let field_a2 = Arc::new(Field::new("integers", a2.data_type().clone(), true)); let nulls = Buffer::from([0b00110111]); - let struct_a = Arc::new( - StructArray::try_from((vec![(field_a1, a1), (field_a2, a2)], nulls)).unwrap(), - ) as ArrayRef; + let struct_a = + Arc::new(StructArray::try_from((vec![(field_a1, a1), (field_a2, a2)], nulls)).unwrap()) + as ArrayRef; let offsets = Buffer::from_iter([0_i32, 0, 2, 2, 3, 5, 5]); let nulls = Buffer::from([0b00111100]); @@ -1793,8 +1772,7 @@ mod tests { #[test] fn test_fixed_size_list_of_var_lists() { // [[[1, null, 3], null], [[4], []], [[5, 6], [null, null]], null] - let mut builder = - FixedSizeListBuilder::new(ListBuilder::new(Int32Builder::new()), 2); + let mut builder = FixedSizeListBuilder::new(ListBuilder::new(Int32Builder::new()), 2); builder.values().append_value([Some(1), None, Some(3)]); builder.values().append_null(); builder.append(true); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 752eff86c5e9..a9cd1afb2479 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -28,13 +28,10 @@ use thrift::protocol::TCompactOutputProtocol; use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::{ArrayRef, RecordBatch, RecordBatchWriter}; -use arrow_schema::{ - ArrowError, DataType as ArrowDataType, Field, IntervalUnit, SchemaRef, -}; +use arrow_schema::{ArrowError, DataType as ArrowDataType, Field, IntervalUnit, SchemaRef}; use super::schema::{ - add_encoded_arrow_schema_to_metadata, arrow_to_parquet_schema, - decimal_length_from_precision, + add_encoded_arrow_schema_to_metadata, arrow_to_parquet_schema, decimal_length_from_precision, }; use crate::arrow::arrow_writer::byte_array::ByteArrayEncoder; @@ -776,8 +773,7 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result { return Err(ParquetError::NYI( - "Attempting to write an Arrow type that is not yet implemented" - .to_string(), + "Attempting to write an Arrow type that is not yet implemented".to_string(), )); } }; @@ -802,10 +798,7 @@ fn write_primitive( ) } 
-fn get_bool_array_slice( - array: &arrow_array::BooleanArray, - indices: &[usize], -) -> Vec { +fn get_bool_array_slice(array: &arrow_array::BooleanArray, indices: &[usize]) -> Vec { let mut values = Vec::with_capacity(indices.len()); for i in indices { values.push(array.value(*i)) @@ -894,9 +887,7 @@ mod tests { use std::fs::File; use std::sync::Arc; - use crate::arrow::arrow_reader::{ - ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder, - }; + use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder}; use arrow::datatypes::ToByteSlice; use arrow::datatypes::{DataType, Field, Schema, UInt32Type, UInt8Type}; use arrow::error::Result as ArrowResult; @@ -930,9 +921,7 @@ mod tests { let b = Int32Array::from(vec![Some(1), None, None, Some(4), Some(5)]); // build a record batch - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) - .unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); roundtrip(batch, Some(SMALL_SIZE / 2)); } @@ -947,10 +936,7 @@ mod tests { buffer } - fn get_bytes_by_into_inner( - schema: SchemaRef, - expected_batch: &RecordBatch, - ) -> Vec { + fn get_bytes_by_into_inner(schema: SchemaRef, expected_batch: &RecordBatch) -> Vec { let mut writer = ArrowWriter::try_new(Vec::new(), schema, None).unwrap(); writer.write(expected_batch).unwrap(); writer.into_inner().unwrap() @@ -977,8 +963,7 @@ mod tests { get_bytes_by_into_inner(schema, &expected_batch), ] { let cursor = Bytes::from(buffer); - let mut record_batch_reader = - ParquetRecordBatchReader::try_new(cursor, 1024).unwrap(); + let mut record_batch_reader = ParquetRecordBatchReader::try_new(cursor, 1024).unwrap(); let actual_batch = record_batch_reader .next() @@ -1025,8 +1010,7 @@ mod tests { // Construct a buffer for value offsets, for the nested array: // [[1], [2, 3], null, [4, 5, 6], [7, 8, 9, 10]] - let a_value_offsets = - arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); + let a_value_offsets = arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); // Construct a list array from the above two let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new( @@ -1066,8 +1050,7 @@ mod tests { // Construct a buffer for value offsets, for the nested array: // [[1], [2, 3], [], [4, 5, 6], [7, 8, 9, 10]] - let a_value_offsets = - arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); + let a_value_offsets = arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); // Construct a list array from the above two let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new( @@ -1122,8 +1105,7 @@ mod tests { } fn get_decimal_batch(precision: u8, scale: i8) -> RecordBatch { - let decimal_field = - Field::new("a", DataType::Decimal128(precision, scale), false); + let decimal_field = Field::new("a", DataType::Decimal128(precision, scale), false); let schema = Schema::new(vec![decimal_field]); let decimal_values = vec![10_000, 50_000, 0, -100] @@ -1193,8 +1175,7 @@ mod tests { // Construct a buffer for value offsets, for the nested array: // [[1], [2, 3], [], [4, 5, 6], [7, 8, 9, 10]] - let g_value_offsets = - arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); + let g_value_offsets = arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); // Construct a list array from the above two let g_list_data = ArrayData::builder(struct_field_g.data_type().clone()) @@ -1268,8 +1249,7 @@ mod tests { // build a record batch let 
batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(some_nested_object)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(some_nested_object)]).unwrap(); roundtrip(batch, Some(SMALL_SIZE / 2)); } @@ -1409,8 +1389,7 @@ mod tests { #[test] fn arrow_writer_page_size() { - let schema = - Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)])); + let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)])); let mut builder = StringBuilder::with_capacity(100, 329 * 10_000); @@ -1474,10 +1453,7 @@ mod tests { const SMALL_SIZE: usize = 7; const MEDIUM_SIZE: usize = 63; - fn roundtrip( - expected_batch: RecordBatch, - max_row_group_size: Option, - ) -> Vec { + fn roundtrip(expected_batch: RecordBatch, max_row_group_size: Option) -> Vec { let mut files = vec![]; for version in [WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] { let mut props = WriterProperties::builder().set_writer_version(version); @@ -1532,9 +1508,7 @@ mod tests { } fn roundtrip_opts(expected_batch: &RecordBatch, props: WriterProperties) -> File { - roundtrip_opts_with_array_validation(expected_batch, props, |a, b| { - assert_eq!(a, b) - }) + roundtrip_opts_with_array_validation(expected_batch, props, |a, b| assert_eq!(a, b)) } struct RoundTripOptions { @@ -1559,10 +1533,7 @@ mod tests { one_column_roundtrip_with_options(RoundTripOptions::new(values, nullable)) } - fn one_column_roundtrip_with_schema( - values: ArrayRef, - schema: SchemaRef, - ) -> Vec { + fn one_column_roundtrip_with_schema(values: ArrayRef, schema: SchemaRef) -> Vec { let mut options = RoundTripOptions::new(values, false); options.schema = schema; one_column_roundtrip_with_options(options) @@ -1576,14 +1547,13 @@ mod tests { } = options; let encodings = match values.data_type() { - DataType::Utf8 - | DataType::LargeUtf8 - | DataType::Binary - | DataType::LargeBinary => vec![ - Encoding::PLAIN, - Encoding::DELTA_BYTE_ARRAY, - Encoding::DELTA_LENGTH_BYTE_ARRAY, - ], + DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => { + vec![ + Encoding::PLAIN, + Encoding::DELTA_BYTE_ARRAY, + Encoding::DELTA_LENGTH_BYTE_ARRAY, + ] + } DataType::Int64 | DataType::Int32 | DataType::Int16 @@ -1686,9 +1656,7 @@ mod tests { let row_group_reader = file_reader .get_row_group(ri) .expect("Unable to read row group"); - if let Some(sbbf) = - row_group_reader.get_column_bloom_filter(column_index) - { + if let Some(sbbf) = row_group_reader.get_column_bloom_filter(column_index) { bloom_filters.push(sbbf.clone()); } else { panic!("No bloom filter for column named {file_column} found"); @@ -1747,18 +1715,13 @@ mod tests { .take(200_000) .collect::(), ); - let schema = - Schema::new(vec![Field::new("col", values.data_type().clone(), true)]); - let expected_batch = - RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap(); + let schema = Schema::new(vec![Field::new("col", values.data_type().clone(), true)]); + let expected_batch = RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap(); let file = tempfile::tempfile().unwrap(); - let mut writer = ArrowWriter::try_new( - file.try_clone().unwrap(), - expected_batch.schema(), - None, - ) - .expect("Unable to write file"); + let mut writer = + ArrowWriter::try_new(file.try_clone().unwrap(), expected_batch.schema(), None) + .expect("Unable to write file"); writer.write(&expected_batch).unwrap(); writer.close().unwrap(); } @@ -1770,8 +1733,8 @@ mod tests { let batch = RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap(); let 
mut out = Vec::with_capacity(1024); - let mut writer = ArrowWriter::try_new(&mut out, batch.schema(), None) - .expect("Unable to write file"); + let mut writer = + ArrowWriter::try_new(&mut out, batch.schema(), None).expect("Unable to write file"); writer.write(&batch).unwrap(); let file_meta_data = writer.close().unwrap(); for row_group in file_meta_data.row_groups { @@ -2050,8 +2013,7 @@ mod tests { #[test] fn null_list_single_column() { let null_field = Field::new("item", DataType::Null, true); - let list_field = - Field::new("emptylist", DataType::List(Arc::new(null_field)), true); + let list_field = Field::new("emptylist", DataType::List(Arc::new(null_field)), true); let schema = Schema::new(vec![list_field]); @@ -2087,8 +2049,7 @@ mod tests { #[test] fn list_single_column() { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); - let a_value_offsets = - arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); + let a_value_offsets = arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice()); let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new( "item", DataType::Int32, @@ -2112,8 +2073,7 @@ mod tests { #[test] fn large_list_single_column() { let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); - let a_value_offsets = - arrow::buffer::Buffer::from(&[0i64, 1, 3, 3, 6, 10].to_byte_slice()); + let a_value_offsets = arrow::buffer::Buffer::from(&[0i64, 1, 3, 3, 6, 10].to_byte_slice()); let a_list_data = ArrayData::builder(DataType::LargeList(Arc::new(Field::new( "large_item", DataType::Int32, @@ -2195,11 +2155,12 @@ mod tests { roundtrip_opts_with_array_validation(&expected_batch, props, |a, b| { let string_array_a = StringArray::from(a.clone()); let string_array_b = StringArray::from(b.clone()); - let vec_a: Vec<&str> = - string_array_a.iter().map(|v| v.unwrap()).collect(); - let vec_b: Vec<&str> = - string_array_b.iter().map(|v| v.unwrap()).collect(); - assert_eq!(vec_a, vec_b, "failed for encoder: {encoding:?} and row_group_size: {row_group_size:?}"); + let vec_a: Vec<&str> = string_array_a.iter().map(|v| v.unwrap()).collect(); + let vec_b: Vec<&str> = string_array_b.iter().map(|v| v.unwrap()).collect(); + assert_eq!( + vec_a, vec_b, + "failed for encoder: {encoding:?} and row_group_size: {row_group_size:?}" + ); }); } } @@ -2484,12 +2445,10 @@ mod tests { .build(); let mut writer = - ArrowWriter::try_new(file.try_clone().unwrap(), schema.clone(), Some(props)) - .unwrap(); + ArrowWriter::try_new(file.try_clone().unwrap(), schema.clone(), Some(props)).unwrap(); for array in arrays { - let batch = - RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); writer.write(&batch).unwrap(); } @@ -2525,8 +2484,7 @@ mod tests { }) .collect(); - let expected_values: Vec<_> = - [0..100, 0..50, 200..500].into_iter().flatten().collect(); + let expected_values: Vec<_> = [0..100, 0..50, 200..500].into_iter().flatten().collect(); assert_eq!(&values, &expected_values) } @@ -2575,11 +2533,9 @@ mod tests { let list_a_array = Arc::new(ListArray::from(list_data)) as ArrayRef; let struct_b_array = StructArray::from(vec![(list_a.clone(), list_a_array)]); - let batch1 = RecordBatch::try_from_iter(vec![( - "struct_b", - Arc::new(struct_b_array) as ArrayRef, - )]) - .unwrap(); + let batch1 = + RecordBatch::try_from_iter(vec![("struct_b", Arc::new(struct_b_array) as ArrayRef)]) + .unwrap(); let field_a_array = Int32Array::from(vec![6, 7, 8, 9, 10]); 
let field_b_array = Int32Array::from_iter(vec![None, None, None, Some(1), None]); @@ -2599,11 +2555,9 @@ mod tests { let list_a_array = Arc::new(ListArray::from(list_data)) as ArrayRef; let struct_b_array = StructArray::from(vec![(list_a, list_a_array)]); - let batch2 = RecordBatch::try_from_iter(vec![( - "struct_b", - Arc::new(struct_b_array) as ArrayRef, - )]) - .unwrap(); + let batch2 = + RecordBatch::try_from_iter(vec![("struct_b", Arc::new(struct_b_array) as ArrayRef)]) + .unwrap(); let batches = &[batch1, batch2]; @@ -2678,8 +2632,7 @@ mod tests { .unwrap(); let mut buf = Vec::with_capacity(1024); - let mut writer = - ArrowWriter::try_new(&mut buf, Arc::new(file_schema), None).unwrap(); + let mut writer = ArrowWriter::try_new(&mut buf, Arc::new(file_schema), None).unwrap(); writer.write(&batch).unwrap(); writer.close().unwrap(); } @@ -2697,8 +2650,7 @@ mod tests { .unwrap(); let mut buf = Vec::with_capacity(1024); - let mut writer = - ArrowWriter::try_new(&mut buf, file_schema.clone(), None).unwrap(); + let mut writer = ArrowWriter::try_new(&mut buf, file_schema.clone(), None).unwrap(); writer.write(&batch).unwrap(); writer.close().unwrap(); diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index fe7b4427647c..2ac4e0bc9674 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -20,9 +20,7 @@ use crate::errors::{ParquetError, Result}; use crate::file::footer::{decode_footer, decode_metadata}; use crate::file::metadata::ParquetMetaData; use crate::file::page_index::index::Index; -use crate::file::page_index::index_reader::{ - acc_range, decode_column_index, decode_offset_index, -}; +use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index}; use bytes::Bytes; use futures::future::BoxFuture; use futures::FutureExt; @@ -56,11 +54,7 @@ impl MetadataLoader { /// Create a new [`MetadataLoader`] by reading the footer information /// /// See [`fetch_parquet_metadata`] for the meaning of the individual parameters - pub async fn load( - mut fetch: F, - file_size: usize, - prefetch: Option, - ) -> Result { + pub async fn load(mut fetch: F, file_size: usize, prefetch: Option) -> Result { if file_size < 8 { return Err(ParquetError::EOF(format!( "file size of {file_size} is less than footer" @@ -126,11 +120,7 @@ impl MetadataLoader { /// /// * `column_index`: if true will load column index /// * `offset_index`: if true will load offset index - pub async fn load_page_index( - &mut self, - column_index: bool, - offset_index: bool, - ) -> Result<()> { + pub async fn load_page_index(&mut self, column_index: bool, offset_index: bool) -> Result<()> { if !column_index && !offset_index { return Ok(()); } @@ -189,9 +179,7 @@ impl MetadataLoader { x.columns() .iter() .map(|c| match c.offset_index_range() { - Some(r) => decode_offset_index( - &data[r.start - offset..r.end - offset], - ), + Some(r) => decode_offset_index(&data[r.start - offset..r.end - offset]), None => Err(general_err!("missing offset index")), }) .collect::>>() diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 875fff4dac57..04383bb51bda 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -94,9 +94,8 @@ use arrow_schema::SchemaRef; use crate::arrow::array_reader::{build_array_reader, RowGroups}; use crate::arrow::arrow_reader::{ - apply_range, evaluate_predicate, selects_any, ArrowReaderBuilder, - 
ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReader, RowFilter, - RowSelection, + apply_range, evaluate_predicate, selects_any, ArrowReaderBuilder, ArrowReaderMetadata, + ArrowReaderOptions, ParquetRecordBatchReader, RowFilter, RowSelection, }; use crate::arrow::ProjectionMask; @@ -109,9 +108,7 @@ use crate::file::footer::{decode_footer, decode_metadata}; use crate::file::metadata::{ParquetMetaData, RowGroupMetaData}; use crate::file::reader::{ChunkReader, Length, SerializedPageReader}; use crate::file::FOOTER_SIZE; -use crate::format::{ - BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash, PageLocation, -}; +use crate::format::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash, PageLocation}; mod metadata; pub use metadata::*; @@ -129,10 +126,7 @@ pub trait AsyncFileReader: Send { fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result>; /// Retrieve multiple byte ranges. The default implementation will call `get_bytes` sequentially - fn get_byte_ranges( - &mut self, - ranges: Vec>, - ) -> BoxFuture<'_, Result>> { + fn get_byte_ranges(&mut self, ranges: Vec>) -> BoxFuture<'_, Result>> { async move { let mut result = Vec::with_capacity(ranges.len()); @@ -157,10 +151,7 @@ impl AsyncFileReader for Box { self.as_mut().get_bytes(range) } - fn get_byte_ranges( - &mut self, - ranges: Vec>, - ) -> BoxFuture<'_, Result>> { + fn get_byte_ranges(&mut self, ranges: Vec>) -> BoxFuture<'_, Result>> { self.as_mut().get_byte_ranges(ranges) } @@ -254,10 +245,7 @@ impl ParquetRecordBatchStreamBuilder { /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided parquet file /// and [`ArrowReaderOptions`] - pub async fn new_with_options( - mut input: T, - options: ArrowReaderOptions, - ) -> Result { + pub async fn new_with_options(mut input: T, options: ArrowReaderOptions) -> Result { let metadata = ArrowReaderMetadata::load_async(&mut input, options).await?; Ok(Self::new_with_metadata(input, metadata)) } @@ -316,9 +304,9 @@ impl ParquetRecordBatchStreamBuilder { let column_metadata = metadata.column(column_idx); let offset: usize = if let Some(offset) = column_metadata.bloom_filter_offset() { - offset.try_into().map_err(|_| { - ParquetError::General("Bloom filter offset is invalid".to_string()) - })? + offset + .try_into() + .map_err(|_| ParquetError::General("Bloom filter offset is invalid".to_string()))? } else { return Ok(None); }; @@ -359,9 +347,7 @@ impl ParquetRecordBatchStreamBuilder { })?; self.input .0 - .get_bytes( - bitset_offset as usize..bitset_offset as usize + bitset_length, - ) + .get_bytes(bitset_offset as usize..bitset_offset as usize + bitset_length) .await? 
} }; @@ -471,11 +457,8 @@ where .fetch(&mut self.input, predicate_projection, selection.as_ref()) .await?; - let array_reader = build_array_reader( - self.fields.as_deref(), - predicate_projection, - &row_group, - )?; + let array_reader = + build_array_reader(self.fields.as_deref(), predicate_projection, &row_group)?; selection = Some(evaluate_predicate( batch_size, @@ -601,10 +584,7 @@ where { type Item = Result; - fn poll_next( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { loop { match &mut self.state { StreamState::Decoding(batch_reader) => match batch_reader.next() { @@ -613,9 +593,7 @@ where } Some(Err(e)) => { self.state = StreamState::Error; - return Poll::Ready(Some(Err(ParquetError::ArrowError( - e.to_string(), - )))); + return Poll::Ready(Some(Err(ParquetError::ArrowError(e.to_string())))); } None => self.state = StreamState::Init, }, @@ -627,11 +605,9 @@ where let reader = self.reader.take().expect("lost reader"); - let row_count = - self.metadata.row_group(row_group_idx).num_rows() as usize; + let row_count = self.metadata.row_group(row_group_idx).num_rows() as usize; - let selection = - self.selection.as_mut().map(|s| s.split_off(row_count)); + let selection = self.selection.as_mut().map(|s| s.split_off(row_count)); let fut = reader .read_row_group( @@ -707,8 +683,7 @@ impl<'a> InMemoryRowGroup<'a> { } ranges.extend(selection.scan_ranges(&page_locations[idx])); - page_start_offsets - .push(ranges.iter().map(|range| range.start).collect()); + page_start_offsets.push(ranges.iter().map(|range| range.start).collect()); ranges }) @@ -779,13 +754,12 @@ impl<'a> RowGroups for InMemoryRowGroup<'a> { ))), Some(data) => { let page_locations = self.page_locations.map(|index| index[i].clone()); - let page_reader: Box = - Box::new(SerializedPageReader::new( - data.clone(), - self.metadata.column(i), - self.row_count, - page_locations, - )?); + let page_reader: Box = Box::new(SerializedPageReader::new( + data.clone(), + self.metadata.column(i), + self.row_count, + page_locations, + )?); Ok(Box::new(ColumnChunkIterator { reader: Some(Ok(page_reader)), @@ -881,9 +855,7 @@ mod tests { use arrow_array::builder::{ListBuilder, StringBuilder}; use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; - use arrow_array::{ - Array, ArrayRef, Int32Array, Int8Array, Scalar, StringArray, UInt64Array, - }; + use arrow_array::{Array, ArrayRef, Int32Array, Int8Array, Scalar, StringArray, UInt64Array}; use arrow_schema::{DataType, Field, Schema}; use futures::{StreamExt, TryStreamExt}; use rand::{thread_rng, Rng}; @@ -981,10 +953,9 @@ mod tests { }; let options = ArrowReaderOptions::new().with_page_index(true); - let builder = - ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) - .await - .unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap(); // The builder should have page and offset indexes loaded now let metadata_with_index = builder.metadata(); @@ -1092,10 +1063,9 @@ mod tests { }; let options = ArrowReaderOptions::new().with_page_index(true); - let builder = - ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) - .await - .unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap(); let selection = RowSelection::from(vec![ RowSelector::skip(21), // Skip first page @@ -1174,10 +1144,9 @@ mod tests { }; let options = 
ArrowReaderOptions::new().with_page_index(true); - let builder = - ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) - .await - .unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap(); let col_idx: usize = rand.gen_range(0..13); let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![col_idx]); @@ -1190,8 +1159,7 @@ mod tests { let async_batches: Vec<_> = stream.try_collect().await.unwrap(); - let actual_rows: usize = - async_batches.into_iter().map(|b| b.num_rows()).sum(); + let actual_rows: usize = async_batches.into_iter().map(|b| b.num_rows()).sum(); assert_eq!(actual_rows, expected_rows); } @@ -1245,10 +1213,9 @@ mod tests { }; let options = ArrowReaderOptions::new().with_page_index(true); - let builder = - ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) - .await - .unwrap(); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap(); let col_idx: usize = rand.gen_range(0..13); let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![col_idx]); @@ -1353,8 +1320,7 @@ mod tests { let props = WriterProperties::builder() .set_max_row_group_size(3) .build(); - let mut writer = - ArrowWriter::try_new(&mut buf, data.schema(), Some(props)).unwrap(); + let mut writer = ArrowWriter::try_new(&mut buf, data.schema(), Some(props)).unwrap(); writer.write(&data).unwrap(); writer.close().unwrap(); @@ -1459,10 +1425,10 @@ mod tests { requests: Default::default(), }; - let a_filter = ArrowPredicateFn::new( - ProjectionMask::leaves(&parquet_schema, vec![1]), - |batch| Ok(batch.column(0).as_boolean().clone()), - ); + let a_filter = + ArrowPredicateFn::new(ProjectionMask::leaves(&parquet_schema, vec![1]), |batch| { + Ok(batch.column(0).as_boolean().clone()) + }); let b_scalar = Int8Array::from(vec![2]); let b_filter = ArrowPredicateFn::new( @@ -1475,15 +1441,14 @@ mod tests { let mask = ProjectionMask::leaves(&parquet_schema, vec![0, 2]); let options = ArrowReaderOptions::new().with_page_index(true); - let stream = - ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) - .await - .unwrap() - .with_projection(mask.clone()) - .with_batch_size(1024) - .with_row_filter(filter) - .build() - .unwrap(); + let stream = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options) + .await + .unwrap() + .with_projection(mask.clone()) + .with_batch_size(1024) + .with_row_filter(filter) + .build() + .unwrap(); let batches: Vec = stream.try_collect().await.unwrap(); @@ -1534,8 +1499,7 @@ mod tests { let _schema_desc = metadata.file_metadata().schema_descr(); - let projection = - ProjectionMask::leaves(metadata.file_metadata().schema_descr(), vec![0]); + let projection = ProjectionMask::leaves(metadata.file_metadata().schema_descr(), vec![0]); let reader_factory = ReaderFactory { metadata, @@ -1644,8 +1608,7 @@ mod tests { let props = WriterProperties::builder() .set_bloom_filter_enabled(true) .build(); - let mut writer = - ArrowWriter::try_new(&mut parquet_data, schema, Some(props)).unwrap(); + let mut writer = ArrowWriter::try_new(&mut parquet_data, schema, Some(props)).unwrap(); for batch in batches { writer.write(&batch).unwrap(); } @@ -1704,8 +1667,7 @@ mod tests { let mut builder = ListBuilder::new(StringBuilder::new()); for id in 0..1024 { match id % 3 { - 0 => builder - .append_value([Some("val_1".to_string()), Some(format!("id_{id}"))]), + 0 => builder.append_value([Some("val_1".to_string()), 
Some(format!("id_{id}"))]), 1 => builder.append_value([Some(format!("id_{id}"))]), _ => builder.append_null(), } diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 40d982cedf40..3e27a96124b0 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -82,16 +82,11 @@ impl AsyncFileReader for ParquetObjectReader { fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result> { self.store .get_range(&self.meta.location, range) - .map_err(|e| { - ParquetError::General(format!("AsyncChunkReader::get_bytes error: {e}")) - }) + .map_err(|e| ParquetError::General(format!("AsyncChunkReader::get_bytes error: {e}"))) .boxed() } - fn get_byte_ranges( - &mut self, - ranges: Vec>, - ) -> BoxFuture<'_, Result>> + fn get_byte_ranges(&mut self, ranges: Vec>) -> BoxFuture<'_, Result>> where Self: Send, { diff --git a/parquet/src/arrow/async_writer/mod.rs b/parquet/src/arrow/async_writer/mod.rs index 0957b58697d7..30080c579e8f 100644 --- a/parquet/src/arrow/async_writer/mod.rs +++ b/parquet/src/arrow/async_writer/mod.rs @@ -99,8 +99,7 @@ impl AsyncArrowWriter { props: Option, ) -> Result { let shared_buffer = SharedBuffer::new(buffer_size); - let sync_writer = - ArrowWriter::try_new(shared_buffer.clone(), arrow_schema, props)?; + let sync_writer = ArrowWriter::try_new(shared_buffer.clone(), arrow_schema, props)?; Ok(Self { sync_writer, @@ -211,9 +210,7 @@ mod tests { use bytes::Bytes; use tokio::pin; - use crate::arrow::arrow_reader::{ - ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder, - }; + use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder}; use super::*; @@ -270,8 +267,7 @@ mod tests { let mut sync_buffer = Vec::new(); let mut sync_writer = - ArrowWriter::try_new(&mut sync_buffer, reader.schema(), Some(write_props)) - .unwrap(); + ArrowWriter::try_new(&mut sync_buffer, reader.schema(), Some(write_props)).unwrap(); for record_batch in reader { let record_batch = record_batch.unwrap(); async_writer.write(&record_batch).await.unwrap(); @@ -349,8 +345,7 @@ mod tests { buffer.len() }; - let test_buffer_flush_thresholds = - vec![0, 1024, 40 * 1024, 50 * 1024, 100 * 1024]; + let test_buffer_flush_thresholds = vec![0, 1024, 40 * 1024, 50 * 1024, 100 * 1024]; for buffer_flush_threshold in test_buffer_flush_thresholds { let reader = get_test_reader(); @@ -383,14 +378,12 @@ mod tests { vec![0; 500000], vec![0; 500000], ])) as ArrayRef; - let to_write = - RecordBatch::try_from_iter([("col", col), ("col2", col2)]).unwrap(); + let to_write = RecordBatch::try_from_iter([("col", col), ("col2", col2)]).unwrap(); let temp = tempfile::tempfile().unwrap(); let file = tokio::fs::File::from_std(temp.try_clone().unwrap()); - let mut writer = - AsyncArrowWriter::try_new(file, to_write.schema(), 0, None).unwrap(); + let mut writer = AsyncArrowWriter::try_new(file, to_write.schema(), 0, None).unwrap(); writer.write(&to_write).await.unwrap(); writer.close().await.unwrap(); diff --git a/parquet/src/arrow/buffer/bit_util.rs b/parquet/src/arrow/buffer/bit_util.rs index b8e2e2f539d3..e7aea56a7f05 100644 --- a/parquet/src/arrow/buffer/bit_util.rs +++ b/parquet/src/arrow/buffer/bit_util.rs @@ -28,8 +28,7 @@ pub fn count_set_bits(bytes: &[u8], range: Range) -> usize { pub fn iter_set_bits_rev(bytes: &[u8]) -> impl Iterator + '_ { let bit_length = bytes.len() * 8; let unaligned = UnalignedBitChunk::new(bytes, 0, bit_length); - let mut chunk_end_idx = - bit_length + unaligned.lead_padding() + 
unaligned.trailing_padding(); + let mut chunk_end_idx = bit_length + unaligned.lead_padding() + unaligned.trailing_padding(); let iter = unaligned .prefix() diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs index a0a47e3b98f7..4208318122af 100644 --- a/parquet/src/arrow/buffer/dictionary_buffer.rs +++ b/parquet/src/arrow/buffer/dictionary_buffer.rs @@ -16,9 +16,7 @@ // under the License. use crate::arrow::buffer::offset_buffer::OffsetBuffer; -use crate::arrow::record_reader::buffer::{ - BufferQueue, ScalarBuffer, ScalarValue, ValuesBuffer, -}; +use crate::arrow::record_reader::buffer::{BufferQueue, ScalarBuffer, ScalarValue, ValuesBuffer}; use crate::column::reader::decoder::ValuesBufferSlice; use crate::errors::{ParquetError, Result}; use arrow_array::{make_array, Array, ArrayRef, OffsetSizeTrait}; @@ -121,11 +119,7 @@ impl // likely sub-optimal, as we would prefer zero length null "slots", but // spilling is already a degenerate case and so it is unclear if this is // worth optimising for, e.g. by keeping a null mask around - spilled.extend_from_dictionary( - keys.as_slice(), - dict_offsets, - dict_values, - )?; + spilled.extend_from_dictionary(keys.as_slice(), dict_offsets, dict_values)?; } *self = Self::Values { values: spilled }; @@ -188,11 +182,9 @@ impl }; // This will compute a new dictionary - let array = arrow_cast::cast( - &values.into_array(null_buffer, value_type), - data_type, - ) - .expect("cast should be infallible"); + let array = + arrow_cast::cast(&values.into_array(null_buffer, value_type), data_type) + .expect("cast should be infallible"); Ok(array) } @@ -206,9 +198,7 @@ impl ValuesBufferSlice for DictionaryBuffer ValuesBuffer - for DictionaryBuffer -{ +impl ValuesBuffer for DictionaryBuffer { fn pad_nulls( &mut self, read_offset: usize, @@ -228,9 +218,7 @@ impl ValuesBuffer } } -impl BufferQueue - for DictionaryBuffer -{ +impl BufferQueue for DictionaryBuffer { type Output = Self; type Slice = Self; @@ -269,8 +257,7 @@ mod tests { let dict_type = ArrowType::Dictionary(Box::new(ArrowType::Int32), Box::new(ArrowType::Utf8)); - let d1: ArrayRef = - Arc::new(StringArray::from(vec!["hello", "world", "", "a", "b"])); + let d1: ArrayRef = Arc::new(StringArray::from(vec!["hello", "world", "", "a", "b"])); let mut buffer = DictionaryBuffer::::default(); diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs index 07d78e8a3282..3f8f85494f02 100644 --- a/parquet/src/arrow/buffer/offset_buffer.rs +++ b/parquet/src/arrow/buffer/offset_buffer.rs @@ -16,9 +16,7 @@ // under the License. 
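// Illustrative sketch (not from the patch): the DictionaryBuffer code above ultimately
// defers to arrow_cast::cast to produce a dictionary-encoded array; the same call works
// on a plain StringArray. The sample values are assumptions made up for this example.
use std::sync::Arc;
use arrow_array::{Array, ArrayRef, StringArray};
use arrow_schema::DataType;

fn dictionary_cast_example() {
    let values: ArrayRef = Arc::new(StringArray::from(vec!["hello", "world", "hello"]));
    let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
    // cast() computes a new dictionary and re-encodes the values under it.
    let dict_array = arrow_cast::cast(&values, &dict_type).expect("cast should be infallible");
    assert_eq!(dict_array.data_type(), &dict_type);
}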
use crate::arrow::buffer::bit_util::iter_set_bits_rev; -use crate::arrow::record_reader::buffer::{ - BufferQueue, ScalarBuffer, ScalarValue, ValuesBuffer, -}; +use crate::arrow::record_reader::buffer::{BufferQueue, ScalarBuffer, ScalarValue, ValuesBuffer}; use crate::column::reader::decoder::ValuesBufferSlice; use crate::errors::{ParquetError, Result}; use arrow_array::{make_array, ArrayRef, OffsetSizeTrait}; @@ -127,11 +125,7 @@ impl OffsetBuffer { } /// Converts this into an [`ArrayRef`] with the provided `data_type` and `null_buffer` - pub fn into_array( - self, - null_buffer: Option, - data_type: ArrowType, - ) -> ArrayRef { + pub fn into_array(self, null_buffer: Option, data_type: ArrowType) -> ArrayRef { let array_data_builder = ArrayDataBuilder::new(data_type) .len(self.len()) .add_buffer(self.offsets.into()) diff --git a/parquet/src/arrow/decoder/delta_byte_array.rs b/parquet/src/arrow/decoder/delta_byte_array.rs index dd4a8fa87d27..c731cfea97e9 100644 --- a/parquet/src/arrow/decoder/delta_byte_array.rs +++ b/parquet/src/arrow/decoder/delta_byte_array.rs @@ -96,9 +96,8 @@ impl DeltaByteArrayDecoder { } self.last_value.truncate(prefix_length); - self.last_value.extend_from_slice( - &data[self.data_offset..self.data_offset + suffix_length], - ); + self.last_value + .extend_from_slice(&data[self.data_offset..self.data_offset + suffix_length]); f(&self.last_value)?; self.data_offset += suffix_length; @@ -128,9 +127,8 @@ impl DeltaByteArrayDecoder { } self.last_value.truncate(prefix_length); - self.last_value.extend_from_slice( - &data[self.data_offset..self.data_offset + suffix_length], - ); + self.last_value + .extend_from_slice(&data[self.data_offset..self.data_offset + suffix_length]); self.data_offset += suffix_length; } self.length_offset += to_skip; diff --git a/parquet/src/arrow/decoder/dictionary_index.rs b/parquet/src/arrow/decoder/dictionary_index.rs index 3d258309dd3b..32efd564dffb 100644 --- a/parquet/src/arrow/decoder/dictionary_index.rs +++ b/parquet/src/arrow/decoder/dictionary_index.rs @@ -41,11 +41,7 @@ pub struct DictIndexDecoder { impl DictIndexDecoder { /// Create a new [`DictIndexDecoder`] with the provided data page, the number of levels /// associated with this data page, and the number of non-null values (if known) - pub fn new( - data: ByteBufferPtr, - num_levels: usize, - num_values: Option, - ) -> Self { + pub fn new(data: ByteBufferPtr, num_levels: usize, num_values: Option) -> Self { let bit_width = data[0]; let mut decoder = RleDecoder::new(bit_width); decoder.set_data(data.start_from(1)); @@ -63,11 +59,7 @@ impl DictIndexDecoder { /// and calling `f` with each decoded dictionary index /// /// Will short-circuit and return on error - pub fn read Result<()>>( - &mut self, - len: usize, - mut f: F, - ) -> Result { + pub fn read Result<()>>(&mut self, len: usize, mut f: F) -> Result { let mut values_read = 0; while values_read != len && self.max_remaining_values != 0 { @@ -112,8 +104,7 @@ impl DictIndexDecoder { values_skip += skip; } else { // We still have indices buffered, so skip within the buffer - let skip = - (to_skip - values_skip).min(self.index_buf_len - self.index_offset); + let skip = (to_skip - values_skip).min(self.index_buf_len - self.index_offset); self.index_offset += skip; self.max_remaining_values -= skip; diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 0174db6b517f..63885643c0fd 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -175,10 +175,7 @@ impl ProjectionMask { /// Note: repeated or out of 
order indices will not impact the final mask /// /// i.e. `[0, 1, 2]` will construct the same mask as `[1, 0, 0, 2]` - pub fn leaves( - schema: &SchemaDescriptor, - indices: impl IntoIterator, - ) -> Self { + pub fn leaves(schema: &SchemaDescriptor, indices: impl IntoIterator) -> Self { let mut mask = vec![false; schema.num_columns()]; for leaf_idx in indices { mask[leaf_idx] = true; @@ -191,10 +188,7 @@ impl ProjectionMask { /// Note: repeated or out of order indices will not impact the final mask /// /// i.e. `[0, 1, 2]` will construct the same mask as `[1, 0, 0, 2]` - pub fn roots( - schema: &SchemaDescriptor, - indices: impl IntoIterator, - ) -> Self { + pub fn roots(schema: &SchemaDescriptor, indices: impl IntoIterator) -> Self { let num_root_columns = schema.root_schema().get_fields().len(); let mut root_mask = vec![false; num_root_columns]; for root_idx in indices { diff --git a/parquet/src/arrow/record_reader/buffer.rs b/parquet/src/arrow/record_reader/buffer.rs index 4a0fc2a2f2eb..35a322e6c723 100644 --- a/parquet/src/arrow/record_reader/buffer.rs +++ b/parquet/src/arrow/record_reader/buffer.rs @@ -136,8 +136,7 @@ impl ScalarBuffer { #[inline] pub fn as_slice_mut(&mut self) -> &mut [T] { - let (prefix, buf, suffix) = - unsafe { self.buffer.as_slice_mut().align_to_mut::() }; + let (prefix, buf, suffix) = unsafe { self.buffer.as_slice_mut().align_to_mut::() }; assert!(prefix.is_empty() && suffix.is_empty()); buf } @@ -225,9 +224,7 @@ impl ValuesBuffer for ScalarBuffer { assert!(slice.len() >= read_offset + levels_read); let values_range = read_offset..read_offset + values_read; - for (value_pos, level_pos) in - values_range.rev().zip(iter_set_bits_rev(valid_mask)) - { + for (value_pos, level_pos) in values_range.rev().zip(iter_set_bits_rev(valid_mask)) { debug_assert!(level_pos >= value_pos); if level_pos <= value_pos { break; diff --git a/parquet/src/arrow/record_reader/definition_levels.rs b/parquet/src/arrow/record_reader/definition_levels.rs index 5be0ac84dea2..20cda536ae1c 100644 --- a/parquet/src/arrow/record_reader/definition_levels.rs +++ b/parquet/src/arrow/record_reader/definition_levels.rs @@ -24,8 +24,7 @@ use arrow_buffer::Buffer; use crate::arrow::buffer::bit_util::count_set_bits; use crate::basic::Encoding; use crate::column::reader::decoder::{ - ColumnLevelDecoder, DefinitionLevelDecoder, DefinitionLevelDecoderImpl, - LevelsBufferSlice, + ColumnLevelDecoder, DefinitionLevelDecoder, DefinitionLevelDecoderImpl, LevelsBufferSlice, }; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; @@ -162,11 +161,7 @@ impl ColumnLevelDecoder for DefinitionLevelBufferDecoder { } impl DefinitionLevelDecoder for DefinitionLevelBufferDecoder { - fn read_def_levels( - &mut self, - writer: &mut Self::Slice, - range: Range, - ) -> Result { + fn read_def_levels(&mut self, writer: &mut Self::Slice, range: Range) -> Result { match (&mut writer.inner, &mut self.decoder) { ( BufferInner::Full { @@ -201,15 +196,9 @@ impl DefinitionLevelDecoder for DefinitionLevelBufferDecoder { } } - fn skip_def_levels( - &mut self, - num_levels: usize, - max_def_level: i16, - ) -> Result<(usize, usize)> { + fn skip_def_levels(&mut self, num_levels: usize, max_def_level: i16) -> Result<(usize, usize)> { match &mut self.decoder { - MaybePacked::Fallback(decoder) => { - decoder.skip_def_levels(num_levels, max_def_level) - } + MaybePacked::Fallback(decoder) => decoder.skip_def_levels(num_levels, max_def_level), MaybePacked::Packed(decoder) => decoder.skip(num_levels), } } @@ -249,8 
+238,7 @@ impl PackedDecoder { self.rle_left = (indicator_value >> 1) as usize; let byte = *self.data.as_ref().get(self.data_offset).ok_or_else(|| { ParquetError::EOF( - "unexpected end of file whilst decoding definition levels rle value" - .into(), + "unexpected end of file whilst decoding definition levels rle value".into(), ) })?; @@ -354,11 +342,10 @@ impl PackedDecoder { skipped_value += to_skip; } } else if self.packed_count != self.packed_offset { - let to_skip = (self.packed_count - self.packed_offset) - .min(level_num - skipped_level); + let to_skip = + (self.packed_count - self.packed_offset).min(level_num - skipped_level); let offset = self.data_offset * 8 + self.packed_offset; - let bit_chunk = - UnalignedBitChunk::new(self.data.as_ref(), offset, to_skip); + let bit_chunk = UnalignedBitChunk::new(self.data.as_ref(), offset, to_skip); skipped_value += bit_chunk.count_ones(); self.packed_offset += to_skip; skipped_level += to_skip; @@ -452,14 +439,12 @@ mod tests { } let to_read_or_skip_level = rng.gen_range(1..=remaining_levels); if rng.gen_bool(0.5) { - let (skip_val_num, skip_level_num) = - decoder.skip(to_read_or_skip_level).unwrap(); + let (skip_val_num, skip_level_num) = decoder.skip(to_read_or_skip_level).unwrap(); skip_value += skip_val_num; skip_level += skip_level_num } else { let mut decoded = BooleanBufferBuilder::new(to_read_or_skip_level); - let read_level_num = - decoder.read(&mut decoded, to_read_or_skip_level).unwrap(); + let read_level_num = decoder.read(&mut decoded, to_read_or_skip_level).unwrap(); read_level += read_level_num; for i in 0..read_level_num { assert!(!decoded.is_empty()); diff --git a/parquet/src/arrow/record_reader/mod.rs b/parquet/src/arrow/record_reader/mod.rs index 35933e6e15d9..ea982341994e 100644 --- a/parquet/src/arrow/record_reader/mod.rs +++ b/parquet/src/arrow/record_reader/mod.rs @@ -100,10 +100,7 @@ where let values_decoder = CV::new(descr); let def_level_decoder = (descr.max_def_level() != 0).then(|| { - DefinitionLevelBufferDecoder::new( - descr.max_def_level(), - packed_null_mask(descr), - ) + DefinitionLevelBufferDecoder::new(descr.max_def_level(), packed_null_mask(descr)) }); let rep_level_decoder = (descr.max_rep_level() != 0) @@ -134,9 +131,7 @@ where loop { let records_to_read = num_records - records_read; records_read += self.read_one_batch(records_to_read)?; - if records_read == num_records - || !self.column_reader.as_mut().unwrap().has_next()? - { + if records_read == num_records || !self.column_reader.as_mut().unwrap().has_next()? { break; } } @@ -226,9 +221,7 @@ where if values_read < levels_read { let def_levels = self.def_levels.as_ref().ok_or_else(|| { - general_err!( - "Definition levels should exist when data is less than levels!" 
- ) + general_err!("Definition levels should exist when data is less than levels!") })?; self.values.pad_nulls( @@ -256,9 +249,7 @@ where /// only possible if the max definition level is 1, and corresponds to nulls at the /// leaf level, as opposed to a nullable parent nested type fn packed_null_mask(descr: &ColumnDescPtr) -> bool { - descr.max_def_level() == 1 - && descr.max_rep_level() == 0 - && descr.self_type().is_optional() + descr.max_def_level() == 1 && descr.max_rep_level() == 0 && descr.self_type().is_optional() } #[cfg(test)] diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index cdad3597ffef..ab71aa44169b 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -28,8 +28,8 @@ use crate::errors::{ParquetError, Result}; // Re-export crate::format types used in this module pub use crate::format::{ - BsonType, DateType, DecimalType, EnumType, IntType, JsonType, ListType, MapType, - NullType, StringType, TimeType, TimeUnit, TimestampType, UUIDType, + BsonType, DateType, DecimalType, EnumType, IntType, JsonType, ListType, MapType, NullType, + StringType, TimeType, TimeUnit, TimestampType, UUIDType, }; // ---------------------------------------------------------------------- @@ -288,9 +288,7 @@ impl FromStr for Encoding { "PLAIN_DICTIONARY" | "plain_dictionary" => Ok(Encoding::PLAIN_DICTIONARY), "RLE" | "rle" => Ok(Encoding::RLE), "BIT_PACKED" | "bit_packed" => Ok(Encoding::BIT_PACKED), - "DELTA_BINARY_PACKED" | "delta_binary_packed" => { - Ok(Encoding::DELTA_BINARY_PACKED) - } + "DELTA_BINARY_PACKED" | "delta_binary_packed" => Ok(Encoding::DELTA_BINARY_PACKED), "DELTA_LENGTH_BYTE_ARRAY" | "delta_length_byte_array" => { Ok(Encoding::DELTA_LENGTH_BYTE_ARRAY) } @@ -319,22 +317,16 @@ pub enum Compression { LZ4_RAW, } -fn split_compression_string( - str_setting: &str, -) -> Result<(&str, Option), ParquetError> { +fn split_compression_string(str_setting: &str) -> Result<(&str, Option), ParquetError> { let split_setting = str_setting.split_once('('); match split_setting { Some((codec, level_str)) => { - let level = - &level_str[..level_str.len() - 1] - .parse::() - .map_err(|_| { - ParquetError::General(format!( - "invalid compression level: {}", - level_str - )) - })?; + let level = &level_str[..level_str.len() - 1] + .parse::() + .map_err(|_| { + ParquetError::General(format!("invalid compression level: {}", level_str)) + })?; Ok((codec, Some(*level))) } None => Ok((str_setting, None)), @@ -472,10 +464,9 @@ impl ColumnOrder { // TODO: Should this take converted and logical type, for compatibility? match logical_type { Some(logical) => match logical { - LogicalType::String - | LogicalType::Enum - | LogicalType::Json - | LogicalType::Bson => SortOrder::UNSIGNED, + LogicalType::String | LogicalType::Enum | LogicalType::Json | LogicalType::Bson => { + SortOrder::UNSIGNED + } LogicalType::Integer { is_signed, .. } => match is_signed { true => SortOrder::SIGNED, false => SortOrder::UNSIGNED, @@ -493,10 +484,7 @@ impl ColumnOrder { } } - fn get_converted_sort_order( - converted_type: ConvertedType, - physical_type: Type, - ) -> SortOrder { + fn get_converted_sort_order(converted_type: ConvertedType, physical_type: Type) -> SortOrder { match converted_type { // Unsigned byte-wise comparison. 
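// Illustrative sketch (not from the patch): the FromStr impl reformatted above accepts
// both upper- and lower-case names, so encoding settings can be parsed straight from
// configuration strings. The concrete strings here are arbitrary examples.
use std::str::FromStr;
use parquet::basic::Encoding;

fn parse_encoding_example() {
    assert_eq!(Encoding::from_str("RLE").unwrap(), Encoding::RLE);
    assert_eq!(
        "delta_binary_packed".parse::<Encoding>().unwrap(),
        Encoding::DELTA_BINARY_PACKED
    );
}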
ConvertedType::UTF8 @@ -666,12 +654,8 @@ impl TryFrom> for ConvertedType { parquet::ConvertedType::DATE => ConvertedType::DATE, parquet::ConvertedType::TIME_MILLIS => ConvertedType::TIME_MILLIS, parquet::ConvertedType::TIME_MICROS => ConvertedType::TIME_MICROS, - parquet::ConvertedType::TIMESTAMP_MILLIS => { - ConvertedType::TIMESTAMP_MILLIS - } - parquet::ConvertedType::TIMESTAMP_MICROS => { - ConvertedType::TIMESTAMP_MICROS - } + parquet::ConvertedType::TIMESTAMP_MILLIS => ConvertedType::TIMESTAMP_MILLIS, + parquet::ConvertedType::TIMESTAMP_MICROS => ConvertedType::TIMESTAMP_MICROS, parquet::ConvertedType::UINT_8 => ConvertedType::UINT_8, parquet::ConvertedType::UINT_16 => ConvertedType::UINT_16, parquet::ConvertedType::UINT_32 => ConvertedType::UINT_32, @@ -707,12 +691,8 @@ impl From for Option { ConvertedType::DATE => Some(parquet::ConvertedType::DATE), ConvertedType::TIME_MILLIS => Some(parquet::ConvertedType::TIME_MILLIS), ConvertedType::TIME_MICROS => Some(parquet::ConvertedType::TIME_MICROS), - ConvertedType::TIMESTAMP_MILLIS => { - Some(parquet::ConvertedType::TIMESTAMP_MILLIS) - } - ConvertedType::TIMESTAMP_MICROS => { - Some(parquet::ConvertedType::TIMESTAMP_MICROS) - } + ConvertedType::TIMESTAMP_MILLIS => Some(parquet::ConvertedType::TIMESTAMP_MILLIS), + ConvertedType::TIMESTAMP_MICROS => Some(parquet::ConvertedType::TIMESTAMP_MICROS), ConvertedType::UINT_8 => Some(parquet::ConvertedType::UINT_8), ConvertedType::UINT_16 => Some(parquet::ConvertedType::UINT_16), ConvertedType::UINT_32 => Some(parquet::ConvertedType::UINT_32), @@ -900,9 +880,7 @@ impl TryFrom for Encoding { parquet::Encoding::RLE => Encoding::RLE, parquet::Encoding::BIT_PACKED => Encoding::BIT_PACKED, parquet::Encoding::DELTA_BINARY_PACKED => Encoding::DELTA_BINARY_PACKED, - parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY => { - Encoding::DELTA_LENGTH_BYTE_ARRAY - } + parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY => Encoding::DELTA_LENGTH_BYTE_ARRAY, parquet::Encoding::DELTA_BYTE_ARRAY => Encoding::DELTA_BYTE_ARRAY, parquet::Encoding::RLE_DICTIONARY => Encoding::RLE_DICTIONARY, parquet::Encoding::BYTE_STREAM_SPLIT => Encoding::BYTE_STREAM_SPLIT, @@ -919,9 +897,7 @@ impl From for parquet::Encoding { Encoding::RLE => parquet::Encoding::RLE, Encoding::BIT_PACKED => parquet::Encoding::BIT_PACKED, Encoding::DELTA_BINARY_PACKED => parquet::Encoding::DELTA_BINARY_PACKED, - Encoding::DELTA_LENGTH_BYTE_ARRAY => { - parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY - } + Encoding::DELTA_LENGTH_BYTE_ARRAY => parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY, Encoding::DELTA_BYTE_ARRAY => parquet::Encoding::DELTA_BYTE_ARRAY, Encoding::RLE_DICTIONARY => parquet::Encoding::RLE_DICTIONARY, Encoding::BYTE_STREAM_SPLIT => parquet::Encoding::BYTE_STREAM_SPLIT, @@ -1278,13 +1254,11 @@ mod tests { ConvertedType::TIME_MICROS ); assert_eq!( - ConvertedType::try_from(Some(parquet::ConvertedType::TIMESTAMP_MILLIS)) - .unwrap(), + ConvertedType::try_from(Some(parquet::ConvertedType::TIMESTAMP_MILLIS)).unwrap(), ConvertedType::TIMESTAMP_MILLIS ); assert_eq!( - ConvertedType::try_from(Some(parquet::ConvertedType::TIMESTAMP_MICROS)) - .unwrap(), + ConvertedType::try_from(Some(parquet::ConvertedType::TIMESTAMP_MICROS)).unwrap(), ConvertedType::TIMESTAMP_MICROS ); assert_eq!( @@ -2039,11 +2013,7 @@ mod tests { fn check_sort_order(types: Vec, expected_order: SortOrder) { for tpe in types { assert_eq!( - ColumnOrder::get_sort_order( - Some(tpe), - ConvertedType::NONE, - Type::BYTE_ARRAY - ), + ColumnOrder::get_sort_order(Some(tpe), ConvertedType::NONE, 
Type::BYTE_ARRAY), expected_order ); } diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index 1f5d0a62bbfa..445409610a8f 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -296,12 +296,10 @@ fn configure_writer_properties(args: &Args) -> WriterProperties { properties_builder = properties_builder.set_writer_version(writer_version); } if let Some(max_row_group_size) = args.max_row_group_size { - properties_builder = - properties_builder.set_max_row_group_size(max_row_group_size); + properties_builder = properties_builder.set_max_row_group_size(max_row_group_size); } if let Some(enable_bloom_filter) = args.enable_bloom_filter { - properties_builder = - properties_builder.set_bloom_filter_enabled(enable_bloom_filter); + properties_builder = properties_builder.set_bloom_filter_enabled(enable_bloom_filter); } properties_builder.build() } @@ -362,9 +360,7 @@ fn convert_csv_to_parquet(args: &Args) -> Result<(), ParquetFromCsvError> { let writer_properties = Some(configure_writer_properties(args)); let mut arrow_writer = ArrowWriter::try_new(parquet_file, arrow_schema.clone(), writer_properties) - .map_err(|e| { - ParquetFromCsvError::with_context(e, "Failed to create ArrowWriter") - })?; + .map_err(|e| ParquetFromCsvError::with_context(e, "Failed to create ArrowWriter"))?; // open input file let input_file = File::open(&args.input_file).map_err(|e| { @@ -377,9 +373,7 @@ fn convert_csv_to_parquet(args: &Args) -> Result<(), ParquetFromCsvError> { // open input file decoder let input_file_decoder = match args.csv_compression { Compression::UNCOMPRESSED => Box::new(input_file) as Box, - Compression::SNAPPY => { - Box::new(snap::read::FrameDecoder::new(input_file)) as Box - } + Compression::SNAPPY => Box::new(snap::read::FrameDecoder::new(input_file)) as Box, Compression::GZIP(_) => { Box::new(flate2::read::MultiGzDecoder::new(input_file)) as Box } @@ -389,9 +383,11 @@ fn convert_csv_to_parquet(args: &Args) -> Result<(), ParquetFromCsvError> { Compression::LZ4 => { Box::new(lz4_flex::frame::FrameDecoder::new(input_file)) as Box } - Compression::ZSTD(_) => Box::new(zstd::Decoder::new(input_file).map_err(|e| { - ParquetFromCsvError::with_context(e, "Failed to create zstd::Decoder") - })?) as Box, + Compression::ZSTD(_) => { + Box::new(zstd::Decoder::new(input_file).map_err(|e| { + ParquetFromCsvError::with_context(e, "Failed to create zstd::Decoder") + })?) 
as Box + } d => unimplemented!("compression type {d}"), }; @@ -698,15 +694,11 @@ mod tests { } Compression::ZSTD(level) => { - let mut encoder = - zstd::Encoder::new(input_file, level.compression_level()) - .map_err(|e| { - ParquetFromCsvError::with_context( - e, - "Failed to create zstd::Encoder", - ) - }) - .unwrap(); + let mut encoder = zstd::Encoder::new(input_file, level.compression_level()) + .map_err(|e| { + ParquetFromCsvError::with_context(e, "Failed to create zstd::Encoder") + }) + .unwrap(); write_tmp_file(&mut encoder); encoder.finish().unwrap() } @@ -742,15 +734,11 @@ mod tests { fn test_convert_csv_to_parquet() { test_convert_compressed_csv_to_parquet(Compression::UNCOMPRESSED); test_convert_compressed_csv_to_parquet(Compression::SNAPPY); - test_convert_compressed_csv_to_parquet(Compression::GZIP( - GzipLevel::try_new(1).unwrap(), - )); + test_convert_compressed_csv_to_parquet(Compression::GZIP(GzipLevel::try_new(1).unwrap())); test_convert_compressed_csv_to_parquet(Compression::BROTLI( BrotliLevel::try_new(2).unwrap(), )); test_convert_compressed_csv_to_parquet(Compression::LZ4); - test_convert_compressed_csv_to_parquet(Compression::ZSTD( - ZstdLevel::try_new(1).unwrap(), - )); + test_convert_compressed_csv_to_parquet(Compression::ZSTD(ZstdLevel::try_new(1).unwrap())); } } diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs index 4b82c21967a0..86e08b6dafa3 100644 --- a/parquet/src/bin/parquet-index.rs +++ b/parquet/src/bin/parquet-index.rs @@ -102,9 +102,7 @@ impl Args { Index::INT96(v) => print_index(&v.indexes, offset_index, &row_counts)?, Index::FLOAT(v) => print_index(&v.indexes, offset_index, &row_counts)?, Index::DOUBLE(v) => print_index(&v.indexes, offset_index, &row_counts)?, - Index::BYTE_ARRAY(v) => { - print_index(&v.indexes, offset_index, &row_counts)? - } + Index::BYTE_ARRAY(v) => print_index(&v.indexes, offset_index, &row_counts)?, Index::FIXED_LEN_BYTE_ARRAY(v) => { print_index(&v.indexes, offset_index, &row_counts)? 
} diff --git a/parquet/src/bin/parquet-layout.rs b/parquet/src/bin/parquet-layout.rs index 901ac9ea2309..b6d655757b87 100644 --- a/parquet/src/bin/parquet-layout.rs +++ b/parquet/src/bin/parquet-layout.rs @@ -162,10 +162,7 @@ fn do_layout(reader: &C) -> Result { /// Reads the page header at `offset` from `reader`, returning /// both the `PageHeader` and its length in bytes -fn read_page_header( - reader: &C, - offset: u64, -) -> Result<(usize, PageHeader)> { +fn read_page_header(reader: &C, offset: u64) -> Result<(usize, PageHeader)> { struct TrackedRead(R, usize); impl Read for TrackedRead { diff --git a/parquet/src/bin/parquet-read.rs b/parquet/src/bin/parquet-read.rs index 392697e6c619..fe486e633624 100644 --- a/parquet/src/bin/parquet-read.rs +++ b/parquet/src/bin/parquet-read.rs @@ -71,8 +71,7 @@ fn main() { .read_to_end(&mut buf) .expect("Failed to read stdin into a buffer"); Box::new( - SerializedFileReader::new(bytes::Bytes::from(buf)) - .expect("Failed to create reader"), + SerializedFileReader::new(bytes::Bytes::from(buf)).expect("Failed to create reader"), ) } else { let path = Path::new(&filename); diff --git a/parquet/src/bin/parquet-rewrite.rs b/parquet/src/bin/parquet-rewrite.rs index e4a80e7af354..ad0f7ae0df7d 100644 --- a/parquet/src/bin/parquet-rewrite.rs +++ b/parquet/src/bin/parquet-rewrite.rs @@ -205,10 +205,9 @@ fn main() { let args = Args::parse(); // read key-value metadata - let parquet_reader = SerializedFileReader::new( - File::open(&args.input).expect("Unable to open input file"), - ) - .expect("Failed to create reader"); + let parquet_reader = + SerializedFileReader::new(File::open(&args.input).expect("Unable to open input file")) + .expect("Failed to create reader"); let kv_md = parquet_reader .metadata() .file_metadata() @@ -223,58 +222,45 @@ fn main() { .build() .expect("parquet open"); - let mut writer_properties_builder = - WriterProperties::builder().set_key_value_metadata(kv_md); + let mut writer_properties_builder = WriterProperties::builder().set_key_value_metadata(kv_md); if let Some(value) = args.compression { - writer_properties_builder = - writer_properties_builder.set_compression(value.into()); + writer_properties_builder = writer_properties_builder.set_compression(value.into()); } if let Some(value) = args.max_row_group_size { - writer_properties_builder = - writer_properties_builder.set_max_row_group_size(value); + writer_properties_builder = writer_properties_builder.set_max_row_group_size(value); } if let Some(value) = args.data_page_row_count_limit { - writer_properties_builder = - writer_properties_builder.set_data_page_row_count_limit(value); + writer_properties_builder = writer_properties_builder.set_data_page_row_count_limit(value); } if let Some(value) = args.data_page_size_limit { - writer_properties_builder = - writer_properties_builder.set_data_page_size_limit(value); + writer_properties_builder = writer_properties_builder.set_data_page_size_limit(value); } if let Some(value) = args.dictionary_page_size_limit { - writer_properties_builder = - writer_properties_builder.set_dictionary_page_size_limit(value); + writer_properties_builder = writer_properties_builder.set_dictionary_page_size_limit(value); } if let Some(value) = args.max_statistics_size { - writer_properties_builder = - writer_properties_builder.set_max_statistics_size(value); + writer_properties_builder = writer_properties_builder.set_max_statistics_size(value); } if let Some(value) = args.bloom_filter_enabled { - writer_properties_builder = - 
writer_properties_builder.set_bloom_filter_enabled(value); + writer_properties_builder = writer_properties_builder.set_bloom_filter_enabled(value); if value { if let Some(value) = args.bloom_filter_fpp { - writer_properties_builder = - writer_properties_builder.set_bloom_filter_fpp(value); + writer_properties_builder = writer_properties_builder.set_bloom_filter_fpp(value); } if let Some(value) = args.bloom_filter_ndv { - writer_properties_builder = - writer_properties_builder.set_bloom_filter_ndv(value); + writer_properties_builder = writer_properties_builder.set_bloom_filter_ndv(value); } } } if let Some(value) = args.dictionary_enabled { - writer_properties_builder = - writer_properties_builder.set_dictionary_enabled(value); + writer_properties_builder = writer_properties_builder.set_dictionary_enabled(value); } if let Some(value) = args.statistics_enabled { - writer_properties_builder = - writer_properties_builder.set_statistics_enabled(value.into()); + writer_properties_builder = writer_properties_builder.set_statistics_enabled(value.into()); } if let Some(value) = args.writer_version { - writer_properties_builder = - writer_properties_builder.set_writer_version(value.into()); + writer_properties_builder = writer_properties_builder.set_writer_version(value.into()); } let writer_properties = writer_properties_builder.build(); let mut parquet_writer = ArrowWriter::try_new( diff --git a/parquet/src/bin/parquet-rowcount.rs b/parquet/src/bin/parquet-rowcount.rs index 55c76c5f73e4..07e4bd1d14cc 100644 --- a/parquet/src/bin/parquet-rowcount.rs +++ b/parquet/src/bin/parquet-rowcount.rs @@ -56,8 +56,7 @@ fn main() { for filename in args.file_paths { let path = Path::new(&filename); let file = File::open(path).expect("Unable to open file"); - let parquet_reader = - SerializedFileReader::new(file).expect("Unable to read file"); + let parquet_reader = SerializedFileReader::new(file).expect("Unable to read file"); let row_group_metadata = parquet_reader.metadata().row_groups(); let mut total_num_rows = 0; diff --git a/parquet/src/bin/parquet-show-bloom-filter.rs b/parquet/src/bin/parquet-show-bloom-filter.rs index 80db51978433..b1b332590ad2 100644 --- a/parquet/src/bin/parquet-show-bloom-filter.rs +++ b/parquet/src/bin/parquet-show-bloom-filter.rs @@ -49,7 +49,9 @@ struct Args { #[clap(help("Check the bloom filter indexes for the given column"))] column: String, #[clap( - help("Check if the given values match bloom filter, the values will be evaluated as strings"), + help( + "Check if the given values match bloom filter, the values will be evaluated as strings" + ), required = true )] values: Vec, diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index e98aee9fd213..897cce7620aa 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -152,9 +152,8 @@ pub(crate) fn read_bloom_filter_header_and_length( ) -> Result<(BloomFilterHeader, u64), ParquetError> { let total_length = buffer.len(); let mut prot = TCompactSliceInputProtocol::new(buffer.as_ref()); - let header = BloomFilterHeader::read_from_in_protocol(&mut prot).map_err(|e| { - ParquetError::General(format!("Could not read bloom filter header: {e}")) - })?; + let header = BloomFilterHeader::read_from_in_protocol(&mut prot) + .map_err(|e| ParquetError::General(format!("Could not read bloom filter header: {e}")))?; Ok((header, (total_length - prot.as_slice().len()) as u64)) } @@ -233,9 +232,7 @@ impl Sbbf { writer .write_all(block.to_le_bytes().as_slice()) .map_err(|e| { - 
ParquetError::General(format!( - "Could not write bloom filter bit set: {e}" - )) + ParquetError::General(format!("Could not write bloom filter bit set: {e}")) })?; } Ok(()) @@ -258,9 +255,9 @@ impl Sbbf { reader: Arc, ) -> Result, ParquetError> { let offset: u64 = if let Some(offset) = column_metadata.bloom_filter_offset() { - offset.try_into().map_err(|_| { - ParquetError::General("Bloom filter offset is invalid".to_string()) - })? + offset + .try_into() + .map_err(|_| ParquetError::General("Bloom filter offset is invalid".to_string()))? } else { return Ok(None); }; @@ -348,8 +345,7 @@ fn hash_as_bytes(value: &A) -> u64 { mod tests { use super::*; use crate::format::{ - BloomFilterAlgorithm, BloomFilterCompression, SplitBlockAlgorithm, Uncompressed, - XxHash, + BloomFilterAlgorithm, BloomFilterCompression, SplitBlockAlgorithm, Uncompressed, XxHash, }; #[test] @@ -387,8 +383,8 @@ mod tests { fn test_with_fixture() { // bloom filter produced by parquet-mr/spark for a column of i64 f"a{i}" for i in 0..10 let bitset: &[u8] = &[ - 200, 1, 80, 20, 64, 68, 8, 109, 6, 37, 4, 67, 144, 80, 96, 32, 8, 132, 43, - 33, 0, 5, 99, 65, 2, 0, 224, 44, 64, 78, 96, 4, + 200, 1, 80, 20, 64, 68, 8, 109, 6, 37, 4, 67, 144, 80, 96, 32, 8, 132, 43, 33, 0, 5, + 99, 65, 2, 0, 224, 44, 64, 78, 96, 4, ]; let sbbf = Sbbf::new(bitset); for a in 0..10i64 { @@ -402,8 +398,7 @@ mod tests { /// so altogether it'll be 20 bytes at most. #[test] fn test_bloom_filter_header_size_assumption() { - let buffer: &[u8; 16] = - &[21, 64, 28, 28, 0, 0, 28, 28, 0, 0, 28, 28, 0, 0, 0, 99]; + let buffer: &[u8; 16] = &[21, 64, 28, 28, 0, 0, 28, 28, 0, 0, 28, 28, 0, 0, 0, 99]; let ( BloomFilterHeader { algorithm, diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 52ad4d644c95..854e5d994ee8 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -20,9 +20,8 @@ use super::page::{Page, PageReader}; use crate::basic::*; use crate::column::reader::decoder::{ - ColumnValueDecoder, ColumnValueDecoderImpl, DefinitionLevelDecoder, - DefinitionLevelDecoderImpl, LevelsBufferSlice, RepetitionLevelDecoder, - RepetitionLevelDecoderImpl, ValuesBufferSlice, + ColumnValueDecoder, ColumnValueDecoderImpl, DefinitionLevelDecoder, DefinitionLevelDecoderImpl, + LevelsBufferSlice, RepetitionLevelDecoder, RepetitionLevelDecoderImpl, ValuesBufferSlice, }; use crate::data_type::*; use crate::errors::{ParquetError, Result}; @@ -51,34 +50,27 @@ pub fn get_column_reader( col_page_reader: Box, ) -> ColumnReader { match col_descr.physical_type() { - Type::BOOLEAN => ColumnReader::BoolColumnReader(ColumnReaderImpl::new( - col_descr, - col_page_reader, - )), - Type::INT32 => ColumnReader::Int32ColumnReader(ColumnReaderImpl::new( - col_descr, - col_page_reader, - )), - Type::INT64 => ColumnReader::Int64ColumnReader(ColumnReaderImpl::new( - col_descr, - col_page_reader, - )), - Type::INT96 => ColumnReader::Int96ColumnReader(ColumnReaderImpl::new( - col_descr, - col_page_reader, - )), - Type::FLOAT => ColumnReader::FloatColumnReader(ColumnReaderImpl::new( - col_descr, - col_page_reader, - )), - Type::DOUBLE => ColumnReader::DoubleColumnReader(ColumnReaderImpl::new( - col_descr, - col_page_reader, - )), - Type::BYTE_ARRAY => ColumnReader::ByteArrayColumnReader(ColumnReaderImpl::new( - col_descr, - col_page_reader, - )), + Type::BOOLEAN => { + ColumnReader::BoolColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT32 => { + ColumnReader::Int32ColumnReader(ColumnReaderImpl::new(col_descr, 
col_page_reader)) + } + Type::INT64 => { + ColumnReader::Int64ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT96 => { + ColumnReader::Int96ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::FLOAT => { + ColumnReader::FloatColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::DOUBLE => { + ColumnReader::DoubleColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::BYTE_ARRAY => { + ColumnReader::ByteArrayColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } Type::FIXED_LEN_BYTE_ARRAY => ColumnReader::FixedLenByteArrayColumnReader( ColumnReaderImpl::new(col_descr, col_page_reader), ), @@ -89,9 +81,7 @@ pub fn get_column_reader( /// non-generic type to a generic column reader type `ColumnReaderImpl`. /// /// Panics if actual enum value for `col_reader` does not match the type `T`. -pub fn get_typed_column_reader( - col_reader: ColumnReader, -) -> ColumnReaderImpl { +pub fn get_typed_column_reader(col_reader: ColumnReader) -> ColumnReaderImpl { T::get_column_reader(col_reader).unwrap_or_else(|| { panic!( "Failed to convert column reader into a typed column reader for `{}` type", @@ -206,8 +196,7 @@ where rep_levels: Option<&mut R::Slice>, values: &mut V::Slice, ) -> Result<(usize, usize)> { - let (_, values, levels) = - self.read_records(batch_size, def_levels, rep_levels, values)?; + let (_, values, levels) = self.read_records(batch_size, def_levels, rep_levels, values)?; Ok((values, levels)) } @@ -285,10 +274,8 @@ where .as_mut() .ok_or_else(|| general_err!("must specify definition levels"))?; - let read = reader.read_def_levels( - out, - total_levels_read..total_levels_read + levels_read, - )?; + let read = reader + .read_def_levels(out, total_levels_read..total_levels_read + levels_read)?; if read != levels_read { return Err(general_err!("insufficient definition levels read from column - expected {rep_levels}, got {read}")); @@ -401,8 +388,9 @@ where } let (values_read, def_levels_read) = match self.def_level_decoder.as_mut() { - Some(decoder) => decoder - .skip_def_levels(rep_levels_read, self.descr.max_def_level())?, + Some(decoder) => { + decoder.skip_def_levels(rep_levels_read, self.descr.max_def_level())? + } None => (rep_levels_read, rep_levels_read), }; @@ -589,9 +577,7 @@ where /// (if it exists) into the buffer #[inline] pub(crate) fn has_next(&mut self) -> Result { - if self.num_buffered_values == 0 - || self.num_buffered_values == self.num_decoded_values - { + if self.num_buffered_values == 0 || self.num_buffered_values == self.num_decoded_values { // TODO: should we return false if read_new_page() = true and // num_buffered_values = 0? if !self.read_new_page()? 
{ @@ -1058,12 +1044,7 @@ mod tests { #[test] fn test_read_batch_values_def_rep_levels() { - test_read_batch_int32( - 128, - &mut [0; 128], - Some(&mut [0; 128]), - Some(&mut [0; 128]), - ); + test_read_batch_int32(128, &mut [0; 128], Some(&mut [0; 128]), Some(&mut [0; 128])); } #[test] @@ -1389,17 +1370,14 @@ mod tests { let max_def_level = desc.max_def_level(); let max_rep_level = desc.max_rep_level(); let page_reader = InMemoryPageReader::new(pages); - let column_reader: ColumnReader = - get_column_reader(desc, Box::new(page_reader)); + let column_reader: ColumnReader = get_column_reader(desc, Box::new(page_reader)); let mut typed_column_reader = get_typed_column_reader::(column_reader); let mut curr_values_read = 0; let mut curr_levels_read = 0; loop { - let actual_def_levels = - def_levels.as_mut().map(|vec| &mut vec[curr_levels_read..]); - let actual_rep_levels = - rep_levels.as_mut().map(|vec| &mut vec[curr_levels_read..]); + let actual_def_levels = def_levels.as_mut().map(|vec| &mut vec[curr_levels_read..]); + let actual_rep_levels = rep_levels.as_mut().map(|vec| &mut vec[curr_levels_read..]); let (_, values_read, levels_read) = typed_column_reader .read_records( diff --git a/parquet/src/column/reader/decoder.rs b/parquet/src/column/reader/decoder.rs index 27ffb7637e18..ec57c4032574 100644 --- a/parquet/src/column/reader/decoder.rs +++ b/parquet/src/column/reader/decoder.rs @@ -97,11 +97,7 @@ pub trait RepetitionLevelDecoder: ColumnLevelDecoder { /// /// A record only ends when the data contains a subsequent repetition level of 0, /// it is therefore left to the caller to delimit the final record in a column - fn skip_rep_levels( - &mut self, - num_records: usize, - num_levels: usize, - ) -> Result<(usize, usize)>; + fn skip_rep_levels(&mut self, num_records: usize, num_levels: usize) -> Result<(usize, usize)>; /// Flush any partially read or skipped record fn flush_partial(&mut self) -> bool; @@ -118,20 +114,12 @@ pub trait DefinitionLevelDecoder: ColumnLevelDecoder { /// Implementations may panic if `range` overlaps with already written data /// // TODO: Should this return the number of nulls - fn read_def_levels( - &mut self, - out: &mut Self::Slice, - range: Range, - ) -> Result; + fn read_def_levels(&mut self, out: &mut Self::Slice, range: Range) -> Result; /// Skips over `num_levels` definition levels /// /// Returns the number of values skipped, and the number of levels skipped - fn skip_def_levels( - &mut self, - num_levels: usize, - max_def_level: i16, - ) -> Result<(usize, usize)>; + fn skip_def_levels(&mut self, num_levels: usize, max_def_level: i16) -> Result<(usize, usize)>; } /// Decodes value data to a [`ValuesBufferSlice`] @@ -353,19 +341,11 @@ impl ColumnLevelDecoder for DefinitionLevelDecoderImpl { } impl DefinitionLevelDecoder for DefinitionLevelDecoderImpl { - fn read_def_levels( - &mut self, - out: &mut Self::Slice, - range: Range, - ) -> Result { + fn read_def_levels(&mut self, out: &mut Self::Slice, range: Range) -> Result { self.decoder.as_mut().unwrap().read(&mut out[range]) } - fn skip_def_levels( - &mut self, - num_levels: usize, - max_def_level: i16, - ) -> Result<(usize, usize)> { + fn skip_def_levels(&mut self, num_levels: usize, max_def_level: i16) -> Result<(usize, usize)> { let mut level_skip = 0; let mut value_skip = 0; let mut buf: Vec = vec![]; @@ -424,11 +404,7 @@ impl RepetitionLevelDecoderImpl { /// and returns the number of "complete" records along with the corresponding number of values /// /// A "complete" record is one where the buffer 
contains a subsequent repetition level of 0 - fn count_records( - &mut self, - records_to_read: usize, - num_levels: usize, - ) -> (bool, usize, usize) { + fn count_records(&mut self, records_to_read: usize, num_levels: usize) -> (bool, usize, usize) { let mut records_read = 0; let levels = num_levels.min(self.buffer_len - self.buffer_offset); @@ -494,11 +470,7 @@ impl RepetitionLevelDecoder for RepetitionLevelDecoderImpl { Ok((total_records_read, total_levels_read)) } - fn skip_rep_levels( - &mut self, - num_records: usize, - num_levels: usize, - ) -> Result<(usize, usize)> { + fn skip_rep_levels(&mut self, num_records: usize, num_levels: usize) -> Result<(usize, usize)> { let mut total_records_read = 0; let mut total_levels_read = 0; @@ -559,8 +531,7 @@ mod tests { for _ in 0..10 { let mut rng = thread_rng(); let total_len = 10000_usize; - let mut encoded: Vec = - (0..total_len).map(|_| rng.gen_range(0..5)).collect(); + let mut encoded: Vec = (0..total_len).map(|_| rng.gen_range(0..5)).collect(); encoded[0] = 0; let mut encoder = RleEncoder::new(3, 1024); for v in &encoded { diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index fb5889b785a8..7bd4db30c3a8 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -18,8 +18,7 @@ use crate::basic::Encoding; use crate::bloom_filter::Sbbf; use crate::column::writer::{ - compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, - update_min, + compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, update_min, }; use crate::data_type::private::ParquetValueType; use crate::data_type::DataType; @@ -168,9 +167,7 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { value_indices: Option<&[usize]>, ) -> Option<(Self::T, Self::T)> { match value_indices { - Some(indices) => { - get_min_max(&self.descr, indices.iter().map(|x| &values[*x])) - } + Some(indices) => get_min_max(&self.descr, indices.iter().map(|x| &values[*x])), None => get_min_max(&self.descr, values.iter()), } } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 8c1c55409988..84bf1911d89c 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -24,9 +24,7 @@ use std::str; use crate::basic::{Compression, ConvertedType, Encoding, LogicalType, PageType, Type}; use crate::column::page::{CompressedPage, Page, PageWriteSpec, PageWriter}; -use crate::column::writer::encoder::{ - ColumnValueEncoder, ColumnValueEncoderImpl, ColumnValues, -}; +use crate::column::writer::encoder::{ColumnValueEncoder, ColumnValueEncoderImpl, ColumnValues}; use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; use crate::data_type::private::ParquetValueType; use crate::data_type::*; @@ -96,41 +94,27 @@ pub fn get_column_writer<'a>( page_writer: Box, ) -> ColumnWriter<'a> { match descr.physical_type() { - Type::BOOLEAN => ColumnWriter::BoolColumnWriter(ColumnWriterImpl::new( - descr, - props, - page_writer, - )), - Type::INT32 => ColumnWriter::Int32ColumnWriter(ColumnWriterImpl::new( - descr, - props, - page_writer, - )), - Type::INT64 => ColumnWriter::Int64ColumnWriter(ColumnWriterImpl::new( - descr, - props, - page_writer, - )), - Type::INT96 => ColumnWriter::Int96ColumnWriter(ColumnWriterImpl::new( - descr, - props, - page_writer, - )), - Type::FLOAT => ColumnWriter::FloatColumnWriter(ColumnWriterImpl::new( - descr, - props, - page_writer, - )), - Type::DOUBLE => 
ColumnWriter::DoubleColumnWriter(ColumnWriterImpl::new( - descr, - props, - page_writer, - )), - Type::BYTE_ARRAY => ColumnWriter::ByteArrayColumnWriter(ColumnWriterImpl::new( - descr, - props, - page_writer, - )), + Type::BOOLEAN => { + ColumnWriter::BoolColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::INT32 => { + ColumnWriter::Int32ColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::INT64 => { + ColumnWriter::Int64ColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::INT96 => { + ColumnWriter::Int96ColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::FLOAT => { + ColumnWriter::FloatColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::DOUBLE => { + ColumnWriter::DoubleColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } + Type::BYTE_ARRAY => { + ColumnWriter::ByteArrayColumnWriter(ColumnWriterImpl::new(descr, props, page_writer)) + } Type::FIXED_LEN_BYTE_ARRAY => ColumnWriter::FixedLenByteArrayColumnWriter( ColumnWriterImpl::new(descr, props, page_writer), ), @@ -141,9 +125,7 @@ pub fn get_column_writer<'a>( /// non-generic type to a generic column writer type `ColumnWriterImpl`. /// /// Panics if actual enum value for `col_writer` does not match the type `T`. -pub fn get_typed_column_writer( - col_writer: ColumnWriter, -) -> ColumnWriterImpl { +pub fn get_typed_column_writer(col_writer: ColumnWriter) -> ColumnWriterImpl { T::get_column_writer(col_writer).unwrap_or_else(|| { panic!( "Failed to convert column writer into a typed column writer for `{}` type", @@ -341,33 +323,16 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { if self.statistics_enabled == EnabledStatistics::Chunk { match (min, max) { (Some(min), Some(max)) => { - update_min( - &self.descr, - min, - &mut self.column_metrics.min_column_value, - ); - update_max( - &self.descr, - max, - &mut self.column_metrics.max_column_value, - ); + update_min(&self.descr, min, &mut self.column_metrics.min_column_value); + update_max(&self.descr, max, &mut self.column_metrics.max_column_value); } (None, Some(_)) | (Some(_), None) => { panic!("min/max should be both set or both None") } (None, None) => { - if let Some((min, max)) = self.encoder.min_max(values, value_indices) - { - update_min( - &self.descr, - &min, - &mut self.column_metrics.min_column_value, - ); - update_max( - &self.descr, - &max, - &mut self.column_metrics.max_column_value, - ); + if let Some((min, max)) = self.encoder.min_max(values, value_indices) { + update_min(&self.descr, &min, &mut self.column_metrics.min_column_value); + update_max(&self.descr, &max, &mut self.column_metrics.max_column_value); } } }; @@ -626,10 +591,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { return false; } - self.page_metrics.num_buffered_rows as usize - >= self.props.data_page_row_count_limit() - || self.encoder.estimated_data_page_size() - >= self.props.data_page_size_limit() + self.page_metrics.num_buffered_rows as usize >= self.props.data_page_row_count_limit() + || self.encoder.estimated_data_page_size() >= self.props.data_page_size_limit() } /// Performs dictionary fallback. 
@@ -647,8 +610,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// Update the column index and offset index when adding the data page fn update_column_offset_index(&mut self, page_statistics: Option<&Statistics>) { // update the column index - let null_page = (self.page_metrics.num_buffered_rows as u64) - == self.page_metrics.num_page_nulls; + let null_page = + (self.page_metrics.num_buffered_rows as u64) == self.page_metrics.num_page_nulls; // a page contains only null values, // and writers have to set the corresponding entries in min_values and max_values to byte[0] if null_page && self.column_index_builder.valid() { @@ -794,15 +757,13 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { let mut buffer = vec![]; if max_rep_level > 0 { - let levels = - self.encode_levels_v2(&self.rep_levels_sink[..], max_rep_level); + let levels = self.encode_levels_v2(&self.rep_levels_sink[..], max_rep_level); rep_levels_byte_len = levels.len(); buffer.extend_from_slice(&levels[..]); } if max_def_level > 0 { - let levels = - self.encode_levels_v2(&self.def_levels_sink[..], max_def_level); + let levels = self.encode_levels_v2(&self.def_levels_sink[..], max_def_level); def_levels_byte_len = levels.len(); buffer.extend_from_slice(&levels[..]); } @@ -842,8 +803,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } // Update total number of rows. - self.column_metrics.total_rows_written += - self.page_metrics.num_buffered_rows as u64; + self.column_metrics.total_rows_written += self.page_metrics.num_buffered_rows as u64; // Reset state. self.rep_levels_sink.clear(); @@ -874,8 +834,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { let total_compressed_size = self.column_metrics.total_compressed_size as i64; let total_uncompressed_size = self.column_metrics.total_uncompressed_size as i64; let num_values = self.column_metrics.total_num_values as i64; - let dict_page_offset = - self.column_metrics.dictionary_page_offset.map(|v| v as i64); + let dict_page_offset = self.column_metrics.dictionary_page_offset.map(|v| v as i64); // If data page offset is not set, then no pages have been written let data_page_offset = self.column_metrics.data_page_offset.unwrap_or(0) as i64; @@ -920,12 +879,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// Encodes definition or repetition levels for Data Page v1. 
#[inline] - fn encode_levels_v1( - &self, - encoding: Encoding, - levels: &[i16], - max_level: i16, - ) -> Vec { + fn encode_levels_v1(&self, encoding: Encoding, levels: &[i16], max_level: i16) -> Vec { let mut encoder = LevelEncoder::v1(encoding, max_level, levels.len()); encoder.put(levels); encoder.consume() @@ -947,10 +901,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { let page_spec = self.page_writer.write_page(page)?; // update offset index // compressed_size = header_size + compressed_data_size - self.offset_index_builder.append_offset_and_size( - page_spec.offset as i64, - page_spec.compressed_size as i32, - ); + self.offset_index_builder + .append_offset_and_size(page_spec.offset as i64, page_spec.compressed_size as i32); self.update_metrics_for_page(page_spec); Ok(()) } @@ -1014,19 +966,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } } -fn update_min( - descr: &ColumnDescriptor, - val: &T, - min: &mut Option, -) { +fn update_min(descr: &ColumnDescriptor, val: &T, min: &mut Option) { update_stat::(val, min, |cur| compare_greater(descr, cur, val)) } -fn update_max( - descr: &ColumnDescriptor, - val: &T, - max: &mut Option, -) { +fn update_max(descr: &ColumnDescriptor, val: &T, max: &mut Option) { update_stat::(val, max, |cur| compare_greater(descr, val, cur)) } @@ -1117,9 +1061,7 @@ fn fallback_encoding(kind: Type, props: &WriterProperties) -> Encoding { (Type::INT32, WriterVersion::PARQUET_2_0) => Encoding::DELTA_BINARY_PACKED, (Type::INT64, WriterVersion::PARQUET_2_0) => Encoding::DELTA_BINARY_PACKED, (Type::BYTE_ARRAY, WriterVersion::PARQUET_2_0) => Encoding::DELTA_BYTE_ARRAY, - (Type::FIXED_LEN_BYTE_ARRAY, WriterVersion::PARQUET_2_0) => { - Encoding::DELTA_BYTE_ARRAY - } + (Type::FIXED_LEN_BYTE_ARRAY, WriterVersion::PARQUET_2_0) => Encoding::DELTA_BYTE_ARRAY, _ => Encoding::PLAIN, } } @@ -1152,9 +1094,7 @@ fn compare_greater_byte_array_decimals(a: &[u8], b: &[u8]) -> bool { // for equal length bytes arrays that have different first bytes. // The equality requirement is necessary for sign extension cases. // 0xFF10 should be equal to 0x10 (due to big endian sign extension). 
- if (0x80 & first_a) != (0x80 & first_b) - || (a_length == b_length && first_a != first_b) - { + if (0x80 & first_a) != (0x80 & first_b) || (a_length == b_length && first_a != first_b) { return (first_a as i8) > (first_b as i8); } @@ -1227,9 +1167,7 @@ fn increment_utf8(mut data: Vec) -> Option> { #[cfg(test)] mod tests { - use crate::{ - file::properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, format::BoundaryOrder, - }; + use crate::{file::properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, format::BoundaryOrder}; use bytes::Bytes; use rand::distributions::uniform::SampleUniform; use std::sync::Arc; @@ -1657,26 +1595,25 @@ mod tests { fn test_column_writer_check_byte_array_min_max() { let page_writer = get_test_page_writer(); let props = Default::default(); - let mut writer = - get_test_decimals_column_writer::(page_writer, 0, 0, props); + let mut writer = get_test_decimals_column_writer::(page_writer, 0, 0, props); writer .write_batch( &[ ByteArray::from(vec![ - 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 179u8, - 172u8, 19u8, 35u8, 231u8, 90u8, 0u8, 0u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 179u8, 172u8, 19u8, + 35u8, 231u8, 90u8, 0u8, 0u8, ]), ByteArray::from(vec![ - 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 228u8, - 62u8, 146u8, 152u8, 177u8, 56u8, 0u8, 0u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 228u8, 62u8, 146u8, + 152u8, 177u8, 56u8, 0u8, 0u8, ]), ByteArray::from(vec![ - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, ]), ByteArray::from(vec![ - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 41u8, 162u8, 36u8, 26u8, - 246u8, 44u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 41u8, 162u8, 36u8, 26u8, 246u8, + 44u8, 0u8, 0u8, ]), ], None, @@ -1690,15 +1627,15 @@ mod tests { assert_eq!( stats.min(), &ByteArray::from(vec![ - 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 179u8, - 172u8, 19u8, 35u8, 231u8, 90u8, 0u8, 0u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 179u8, 172u8, 19u8, + 35u8, 231u8, 90u8, 0u8, 0u8, ]) ); assert_eq!( stats.max(), &ByteArray::from(vec![ - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 41u8, 162u8, 36u8, 26u8, - 246u8, 44u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 41u8, 162u8, 36u8, 26u8, 246u8, + 44u8, 0u8, 0u8, ]) ); } else { @@ -1713,9 +1650,12 @@ mod tests { fn test_column_writer_uint32_converted_type_min_max() { let page_writer = get_test_page_writer(); let props = Default::default(); - let mut writer = get_test_unsigned_int_given_as_converted_column_writer::< - Int32Type, - >(page_writer, 0, 0, props); + let mut writer = get_test_unsigned_int_given_as_converted_column_writer::( + page_writer, + 0, + 0, + props, + ); writer.write_batch(&[0, 1, 2, 3, 4, 5], None, None).unwrap(); let metadata = writer.close().unwrap().metadata; if let Some(stats) = metadata.statistics() { @@ -1790,14 +1730,7 @@ mod tests { writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); writer - .write_batch_with_statistics( - &[5, 6, 7], - None, - None, - Some(&5), - Some(&7), - Some(3), - ) + .write_batch_with_statistics(&[5, 6, 7], None, None, Some(&5), Some(&7), Some(3)) .unwrap(); let r = writer.close().unwrap(); @@ -2297,8 +2230,7 @@ mod tests { // and check the offset index and column index let page_writer = get_test_page_writer(); let props = Default::default(); - let mut writer = - get_test_column_writer::(page_writer, 0, 0, props); + let mut writer = 
get_test_column_writer::(page_writer, 0, 0, props); let mut data = vec![FixedLenByteArray::default(); 3]; // This is the expected min value - "aaa..." @@ -2366,11 +2298,9 @@ mod tests { let page_writer = get_test_page_writer(); // Truncate values at 1 byte - let builder = - WriterProperties::builder().set_column_index_truncate_length(Some(1)); + let builder = WriterProperties::builder().set_column_index_truncate_length(Some(1)); let props = Arc::new(builder.build()); - let mut writer = - get_test_column_writer::(page_writer, 0, 0, props); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); let mut data = vec![FixedLenByteArray::default(); 1]; // This is the expected min value @@ -2577,12 +2507,8 @@ mod tests { max_batch_size = max_batch_size.max(levels.len()); } - let mut writer = get_test_column_writer::( - page_writer, - max_def_level, - max_rep_level, - Arc::new(props), - ); + let mut writer = + get_test_column_writer::(page_writer, max_def_level, max_rep_level, Arc::new(props)); let values_written = writer.write_batch(values, def_levels, rep_levels).unwrap(); assert_eq!(values_written, values.len()); @@ -2603,8 +2529,7 @@ mod tests { ) .unwrap(), ); - let reader = - get_test_column_reader::(page_reader, max_def_level, max_rep_level); + let reader = get_test_column_reader::(page_reader, max_def_level, max_rep_level); let mut actual_values = vec![T::T::default(); max_batch_size]; let mut actual_def_levels = def_levels.map(|_| vec![0i16; max_batch_size]); diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 67d0bad98202..7e64478ed940 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -539,9 +539,7 @@ impl AsBytes for bool { impl AsBytes for Int96 { fn as_bytes(&self) -> &[u8] { - unsafe { - std::slice::from_raw_parts(self.data() as *const [u32] as *const u8, 12) - } + unsafe { std::slice::from_raw_parts(self.data() as *const [u32] as *const u8, 12) } } } @@ -620,17 +618,10 @@ pub(crate) mod private { ) -> Result<()>; /// Establish the data that will be decoded in a buffer - fn set_data( - decoder: &mut PlainDecoderDetails, - data: ByteBufferPtr, - num_values: usize, - ); + fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, num_values: usize); /// Decode the value from a given buffer for a higher level decoder - fn decode( - buffer: &mut [Self], - decoder: &mut PlainDecoderDetails, - ) -> Result; + fn decode(buffer: &mut [Self], decoder: &mut PlainDecoderDetails) -> Result; fn skip(decoder: &mut PlainDecoderDetails, num_values: usize) -> Result; @@ -680,20 +671,13 @@ pub(crate) mod private { } #[inline] - fn set_data( - decoder: &mut PlainDecoderDetails, - data: ByteBufferPtr, - num_values: usize, - ) { + fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, num_values: usize) { decoder.bit_reader.replace(BitReader::new(data)); decoder.num_values = num_values; } #[inline] - fn decode( - buffer: &mut [Self], - decoder: &mut PlainDecoderDetails, - ) -> Result { + fn decode(buffer: &mut [Self], decoder: &mut PlainDecoderDetails) -> Result { let bit_reader = decoder.bit_reader.as_mut().unwrap(); let num_values = std::cmp::min(buffer.len(), decoder.num_values); let values_read = bit_reader.get_batch(&mut buffer[..num_values], 1); @@ -823,10 +807,7 @@ pub(crate) mod private { ) -> Result<()> { for value in values { let raw = unsafe { - std::slice::from_raw_parts( - value.data() as *const [u32] as *const u8, - 12, - ) + std::slice::from_raw_parts(value.data() as *const [u32] as *const u8, 12) }; 
writer.write_all(raw)?; } @@ -834,21 +815,14 @@ pub(crate) mod private { } #[inline] - fn set_data( - decoder: &mut PlainDecoderDetails, - data: ByteBufferPtr, - num_values: usize, - ) { + fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, num_values: usize) { decoder.data.replace(data); decoder.start = 0; decoder.num_values = num_values; } #[inline] - fn decode( - buffer: &mut [Self], - decoder: &mut PlainDecoderDetails, - ) -> Result { + fn decode(buffer: &mut [Self], decoder: &mut PlainDecoderDetails) -> Result { // TODO - Remove the duplication between this and the general slice method let data = decoder .data @@ -869,10 +843,8 @@ pub(crate) mod private { let mut pos = 0; // position in byte array for item in buffer.iter_mut().take(num_values) { let elem0 = u32::from_le_bytes(bytes[pos..pos + 4].try_into().unwrap()); - let elem1 = - u32::from_le_bytes(bytes[pos + 4..pos + 8].try_into().unwrap()); - let elem2 = - u32::from_le_bytes(bytes[pos + 8..pos + 12].try_into().unwrap()); + let elem1 = u32::from_le_bytes(bytes[pos + 4..pos + 8].try_into().unwrap()); + let elem2 = u32::from_le_bytes(bytes[pos + 8..pos + 12].try_into().unwrap()); item.set_data(elem0, elem1, elem2); pos += 12; @@ -930,21 +902,14 @@ pub(crate) mod private { } #[inline] - fn set_data( - decoder: &mut PlainDecoderDetails, - data: ByteBufferPtr, - num_values: usize, - ) { + fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, num_values: usize) { decoder.data.replace(data); decoder.start = 0; decoder.num_values = num_values; } #[inline] - fn decode( - buffer: &mut [Self], - decoder: &mut PlainDecoderDetails, - ) -> Result { + fn decode(buffer: &mut [Self], decoder: &mut PlainDecoderDetails) -> Result { let data = decoder .data .as_mut() @@ -952,8 +917,7 @@ pub(crate) mod private { let num_values = std::cmp::min(buffer.len(), decoder.num_values); for val_array in buffer.iter_mut().take(num_values) { let len: usize = - read_num_bytes::(4, data.start_from(decoder.start).as_ref()) - as usize; + read_num_bytes::(4, data.start_from(decoder.start).as_ref()) as usize; decoder.start += std::mem::size_of::(); if data.len() < decoder.start + len { @@ -979,8 +943,7 @@ pub(crate) mod private { for _ in 0..num_values { let len: usize = - read_num_bytes::(4, data.start_from(decoder.start).as_ref()) - as usize; + read_num_bytes::(4, data.start_from(decoder.start).as_ref()) as usize; decoder.start += std::mem::size_of::() + len; } decoder.num_values -= num_values; @@ -1021,21 +984,14 @@ pub(crate) mod private { } #[inline] - fn set_data( - decoder: &mut PlainDecoderDetails, - data: ByteBufferPtr, - num_values: usize, - ) { + fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, num_values: usize) { decoder.data.replace(data); decoder.start = 0; decoder.num_values = num_values; } #[inline] - fn decode( - buffer: &mut [Self], - decoder: &mut PlainDecoderDetails, - ) -> Result { + fn decode(buffer: &mut [Self], decoder: &mut PlainDecoderDetails) -> Result { assert!(decoder.type_length > 0); let data = decoder @@ -1115,9 +1071,7 @@ pub trait DataType: 'static + Send { where Self: Sized; - fn get_column_writer( - column_writer: ColumnWriter<'_>, - ) -> Option> + fn get_column_writer(column_writer: ColumnWriter<'_>) -> Option> where Self: Sized; @@ -1160,9 +1114,7 @@ macro_rules! 
make_type { $size } - fn get_column_reader( - column_reader: ColumnReader, - ) -> Option> { + fn get_column_reader(column_reader: ColumnReader) -> Option> { match column_reader { ColumnReader::$reader_ident(w) => Some(w), _ => None, diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs index 53496a66b572..9695dbeae6e1 100644 --- a/parquet/src/file/footer.rs +++ b/parquet/src/file/footer.rs @@ -220,8 +220,7 @@ mod tests { let schema = SchemaType::group_type_builder("schema").build().unwrap(); let schema_descr = SchemaDescriptor::new(Arc::new(schema)); - let t_column_orders = - Some(vec![TColumnOrder::TYPEORDER(TypeDefinedOrder::new())]); + let t_column_orders = Some(vec![TColumnOrder::TYPEORDER(TypeDefinedOrder::new())]); parse_column_orders(t_column_orders, &schema_descr); } diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 1f46c8105ebc..e57f666383d2 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -37,8 +37,8 @@ use std::ops::Range; use std::sync::Arc; use crate::format::{ - BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, - RowGroup, SortingColumn, + BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup, + SortingColumn, }; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; @@ -348,10 +348,7 @@ impl RowGroupMetaData { } /// Method to convert from Thrift. - pub fn from_thrift( - schema_descr: SchemaDescPtr, - mut rg: RowGroup, - ) -> Result { + pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result { assert_eq!(schema_descr.num_columns(), rg.columns.len()); let total_byte_size = rg.total_byte_size; let num_rows = rg.num_rows; @@ -988,9 +985,7 @@ impl OffsetIndexBuilder { .iter() .zip(self.compressed_page_size_array.iter()) .zip(self.first_row_index_array.iter()) - .map(|((offset, size), row_index)| { - PageLocation::new(*offset, *size, *row_index) - }) + .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index)) .collect::>(); OffsetIndex::new(locations) } @@ -1019,10 +1014,9 @@ mod tests { .unwrap(); let row_group_exp = row_group_meta.to_thrift(); - let row_group_res = - RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone()) - .unwrap() - .to_thrift(); + let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone()) + .unwrap() + .to_thrift(); assert_eq!(row_group_res, row_group_exp); } @@ -1078,8 +1072,7 @@ mod tests { .unwrap(); let col_chunk_res = - ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()) - .unwrap(); + ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()).unwrap(); assert_eq!(col_chunk_res, col_metadata); } @@ -1093,10 +1086,9 @@ mod tests { .unwrap(); let col_chunk_exp = col_metadata.to_thrift(); - let col_chunk_res = - ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone()) - .unwrap() - .to_thrift(); + let col_chunk_res = ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone()) + .unwrap() + .to_thrift(); assert_eq!(col_chunk_res, col_chunk_exp); } diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs index 95a73118042f..c941d401175c 100644 --- a/parquet/src/file/page_encoding_stats.rs +++ b/parquet/src/file/page_encoding_stats.rs @@ -35,9 +35,7 @@ pub struct PageEncodingStats { } /// Converts Thrift definition into `PageEncodingStats`. 
-pub fn try_from_thrift( - thrift_encoding_stats: &TPageEncodingStats, -) -> Result { +pub fn try_from_thrift(thrift_encoding_stats: &TPageEncodingStats) -> Result { let page_type = PageType::try_from(thrift_encoding_stats.page_type)?; let encoding = Encoding::try_from(thrift_encoding_stats.encoding)?; let count = thrift_encoding_stats.count; diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index ae3bf3699c1c..f298601f5d59 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -30,10 +30,7 @@ use std::ops::Range; /// Computes the covering range of two optional ranges /// /// For example `acc_range(Some(7..9), Some(1..3)) = Some(1..9)` -pub(crate) fn acc_range( - a: Option>, - b: Option>, -) -> Option> { +pub(crate) fn acc_range(a: Option>, b: Option>) -> Option> { match (a, b) { (Some(a), Some(b)) => Some(a.start.min(b.start)..a.end.max(b.end)), (None, x) | (x, None) => x, @@ -112,18 +109,13 @@ pub fn read_pages_locations( .collect() } -pub(crate) fn decode_offset_index( - data: &[u8], -) -> Result, ParquetError> { +pub(crate) fn decode_offset_index(data: &[u8]) -> Result, ParquetError> { let mut prot = TCompactSliceInputProtocol::new(data); let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; Ok(offset.page_locations) } -pub(crate) fn decode_column_index( - data: &[u8], - column_type: Type, -) -> Result { +pub(crate) fn decode_column_index(data: &[u8], column_type: Type) -> Result { let mut prot = TCompactSliceInputProtocol::new(data); let index = ColumnIndex::read_from_in_protocol(&mut prot)?; @@ -136,9 +128,7 @@ pub(crate) fn decode_column_index( Type::FLOAT => Index::FLOAT(NativeIndex::::try_new(index)?), Type::DOUBLE => Index::DOUBLE(NativeIndex::::try_new(index)?), Type::BYTE_ARRAY => Index::BYTE_ARRAY(NativeIndex::try_new(index)?), - Type::FIXED_LEN_BYTE_ARRAY => { - Index::FIXED_LEN_BYTE_ARRAY(NativeIndex::try_new(index)?) - } + Type::FIXED_LEN_BYTE_ARRAY => Index::FIXED_LEN_BYTE_ARRAY(NativeIndex::try_new(index)?), }; Ok(index) diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 93b034cf4f60..ea71763a0101 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -44,8 +44,7 @@ pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; /// Default value for [`WriterProperties::max_row_group_size`] pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; /// Default value for [`WriterProperties::created_by`] -pub const DEFAULT_CREATED_BY: &str = - concat!("parquet-rs version ", env!("CARGO_PKG_VERSION")); +pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION")); /// Default value for [`WriterProperties::column_index_truncate_length`] pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option = Some(64); /// Default value for [`BloomFilterProperties::fpp`] @@ -312,10 +311,7 @@ impl WriterProperties { /// Returns the [`BloomFilterProperties`] for the given column /// /// Returns `None` if bloom filter is disabled - pub fn bloom_filter_properties( - &self, - col: &ColumnPath, - ) -> Option<&BloomFilterProperties> { + pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> { self.column_properties .get(col) .and_then(|c| c.bloom_filter_properties()) @@ -608,11 +604,7 @@ impl WriterPropertiesBuilder { /// Sets max size for statistics for a column. /// Takes precedence over globally defined settings. 
- pub fn set_column_max_statistics_size( - mut self, - col: ColumnPath, - value: usize, - ) -> Self { + pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self { self.get_mut_props(col).set_max_statistics_size(value); self } @@ -620,11 +612,7 @@ impl WriterPropertiesBuilder { /// Sets whether a bloom filter should be created for a specific column. /// The behavior is similar to [`set_bloom_filter_enabled`](Self::set_bloom_filter_enabled). /// Takes precedence over globally defined settings. - pub fn set_column_bloom_filter_enabled( - mut self, - col: ColumnPath, - value: bool, - ) -> Self { + pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self { self.get_mut_props(col).set_bloom_filter_enabled(value); self } @@ -912,9 +900,7 @@ impl ReaderPropertiesBuilder { pub fn build(self) -> ReaderProperties { ReaderProperties { codec_options: self.codec_options_builder.build(), - read_bloom_filter: self - .read_bloom_filter - .unwrap_or(DEFAULT_READ_BLOOM_FILTER), + read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER), } } @@ -1066,10 +1052,7 @@ mod tests { .set_column_encoding(ColumnPath::from("col"), Encoding::RLE) .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY) .set_column_dictionary_enabled(ColumnPath::from("col"), true) - .set_column_statistics_enabled( - ColumnPath::from("col"), - EnabledStatistics::Chunk, - ) + .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk) .set_column_max_statistics_size(ColumnPath::from("col"), 123) .set_column_bloom_filter_enabled(ColumnPath::from("col"), true) .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64) diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs index 7d2d7ea153d8..921f9df290cc 100644 --- a/parquet/src/file/reader.rs +++ b/parquet/src/file/reader.rs @@ -161,33 +161,28 @@ pub trait RowGroupReader: Send + Sync { let col_descr = schema_descr.column(i); let col_page_reader = self.get_column_page_reader(i)?; let col_reader = match col_descr.physical_type() { - Type::BOOLEAN => ColumnReader::BoolColumnReader(ColumnReaderImpl::new( + Type::BOOLEAN => { + ColumnReader::BoolColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT32 => { + ColumnReader::Int32ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT64 => { + ColumnReader::Int64ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::INT96 => { + ColumnReader::Int96ColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::FLOAT => { + ColumnReader::FloatColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::DOUBLE => { + ColumnReader::DoubleColumnReader(ColumnReaderImpl::new(col_descr, col_page_reader)) + } + Type::BYTE_ARRAY => ColumnReader::ByteArrayColumnReader(ColumnReaderImpl::new( col_descr, col_page_reader, )), - Type::INT32 => ColumnReader::Int32ColumnReader(ColumnReaderImpl::new( - col_descr, - col_page_reader, - )), - Type::INT64 => ColumnReader::Int64ColumnReader(ColumnReaderImpl::new( - col_descr, - col_page_reader, - )), - Type::INT96 => ColumnReader::Int96ColumnReader(ColumnReaderImpl::new( - col_descr, - col_page_reader, - )), - Type::FLOAT => ColumnReader::FloatColumnReader(ColumnReaderImpl::new( - col_descr, - col_page_reader, - )), - Type::DOUBLE => ColumnReader::DoubleColumnReader(ColumnReaderImpl::new( - col_descr, - col_page_reader, - )), - Type::BYTE_ARRAY => ColumnReader::ByteArrayColumnReader( - 
ColumnReaderImpl::new(col_descr, col_page_reader), - ), Type::FIXED_LEN_BYTE_ARRAY => ColumnReader::FixedLenByteArrayColumnReader( ColumnReaderImpl::new(col_descr, col_page_reader), ), diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index b60d30ffea23..0d032c27aa06 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -213,10 +213,8 @@ impl SerializedFileReader { let mut offset_indexes = vec![]; for rg in &mut filtered_row_groups { - let column_index = - index_reader::read_columns_indexes(&chunk_reader, rg.columns())?; - let offset_index = - index_reader::read_pages_locations(&chunk_reader, rg.columns())?; + let column_index = index_reader::read_columns_indexes(&chunk_reader, rg.columns())?; + let offset_index = index_reader::read_pages_locations(&chunk_reader, rg.columns())?; columns_indexes.push(column_index); offset_indexes.push(offset_index); } @@ -402,8 +400,8 @@ pub(crate) fn decode_page( let mut can_decompress = true; if let Some(ref header_v2) = page_header.data_page_header_v2 { - offset = (header_v2.definition_levels_byte_length - + header_v2.repetition_levels_byte_length) as usize; + offset = (header_v2.definition_levels_byte_length + header_v2.repetition_levels_byte_length) + as usize; // When is_compressed flag is missing the page is considered compressed can_decompress = header_v2.is_compressed.unwrap_or(true); } @@ -437,10 +435,9 @@ pub(crate) fn decode_page( let result = match page_header.type_ { PageType::DICTIONARY_PAGE => { - let dict_header = - page_header.dictionary_page_header.as_ref().ok_or_else(|| { - ParquetError::General("Missing dictionary page header".to_string()) - })?; + let dict_header = page_header.dictionary_page_header.as_ref().ok_or_else(|| { + ParquetError::General("Missing dictionary page header".to_string()) + })?; let is_sorted = dict_header.is_sorted.unwrap_or(false); Page::DictionaryPage { buf: buffer, @@ -450,9 +447,9 @@ pub(crate) fn decode_page( } } PageType::DATA_PAGE => { - let header = page_header.data_page_header.ok_or_else(|| { - ParquetError::General("Missing V1 data page header".to_string()) - })?; + let header = page_header + .data_page_header + .ok_or_else(|| ParquetError::General("Missing V1 data page header".to_string()))?; Page::DataPage { buf: buffer, num_values: header.num_values as u32, @@ -463,9 +460,9 @@ pub(crate) fn decode_page( } } PageType::DATA_PAGE_V2 => { - let header = page_header.data_page_header_v2.ok_or_else(|| { - ParquetError::General("Missing V2 data page header".to_string()) - })?; + let header = page_header + .data_page_header_v2 + .ok_or_else(|| ParquetError::General("Missing V2 data page header".to_string()))?; let is_compressed = header.is_compressed.unwrap_or(true); Page::DataPageV2 { buf: buffer, @@ -532,13 +529,7 @@ impl SerializedPageReader { page_locations: Option>, ) -> Result { let props = Arc::new(ReaderProperties::builder().build()); - SerializedPageReader::new_with_properties( - reader, - meta, - total_rows, - page_locations, - props, - ) + SerializedPageReader::new_with_properties(reader, meta, total_rows, page_locations, props) } /// Creates a new serialized page with custom options. 
@@ -555,14 +546,11 @@ impl SerializedPageReader { let state = match page_locations { Some(locations) => { let dictionary_page = match locations.first() { - Some(dict_offset) if dict_offset.offset as u64 != start => { - Some(PageLocation { - offset: start as i64, - compressed_page_size: (dict_offset.offset as u64 - start) - as i32, - first_row_index: 0, - }) - } + Some(dict_offset) if dict_offset.offset as u64 != start => Some(PageLocation { + offset: start as i64, + compressed_page_size: (dict_offset.offset as u64 - start) as i32, + first_row_index: 0, + }), _ => None, }; @@ -773,9 +761,7 @@ impl PageReader for SerializedPageReader { fn at_record_boundary(&mut self) -> Result { match &mut self.state { - SerializedPageReaderState::Values { .. } => { - Ok(self.peek_next_page()?.is_none()) - } + SerializedPageReaderState::Values { .. } => Ok(self.peek_next_page()?.is_none()), SerializedPageReaderState::Pages { .. } => Ok(true), } } @@ -792,9 +778,7 @@ mod tests { use crate::data_type::private::ParquetValueType; use crate::data_type::{AsBytes, FixedLenByteArrayType}; use crate::file::page_index::index::{Index, NativeIndex}; - use crate::file::page_index::index_reader::{ - read_columns_indexes, read_pages_locations, - }; + use crate::file::page_index::index_reader::{read_columns_indexes, read_pages_locations}; use crate::file::writer::SerializedFileWriter; use crate::record::RowAccessor; use crate::schema::parser::parse_message_type; @@ -1156,8 +1140,7 @@ mod tests { assert_eq!(col0_metadata.bloom_filter_offset().unwrap(), 192); // test page encoding stats - let page_encoding_stats = - col0_metadata.page_encoding_stats().unwrap().get(0).unwrap(); + let page_encoding_stats = col0_metadata.page_encoding_stats().unwrap().get(0).unwrap(); assert_eq!(page_encoding_stats.page_type, basic::PageType::DATA_PAGE); assert_eq!(page_encoding_stats.encoding, Encoding::PLAIN); @@ -1548,10 +1531,7 @@ mod tests { }); } - fn get_row_group_min_max_bytes( - r: &RowGroupMetaData, - col_num: usize, - ) -> (&[u8], &[u8]) { + fn get_row_group_min_max_bytes(r: &RowGroupMetaData, col_num: usize) -> (&[u8], &[u8]) { let statistics = r.column(col_num).statistics().unwrap(); (statistics.min_bytes(), statistics.max_bytes()) } @@ -1713,8 +1693,7 @@ mod tests { let schema = parse_message_type(message_type).unwrap(); let mut out = Vec::with_capacity(1024); let mut writer = - SerializedFileWriter::new(&mut out, Arc::new(schema), Default::default()) - .unwrap(); + SerializedFileWriter::new(&mut out, Arc::new(schema), Default::default()).unwrap(); let mut r = writer.next_row_group().unwrap(); let mut c = r.next_column().unwrap().unwrap(); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 7796be6013df..dbbd8b4b99a2 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -27,9 +27,7 @@ use std::io::{BufWriter, IoSlice, Read}; use std::{io::Write, sync::Arc}; use thrift::protocol::TCompactOutputProtocol; -use crate::column::writer::{ - get_typed_column_writer_mut, ColumnCloseResult, ColumnWriterImpl, -}; +use crate::column::writer::{get_typed_column_writer_mut, ColumnCloseResult, ColumnWriterImpl}; use crate::column::{ page::{CompressedPage, PageWriteSpec, PageWriter}, writer::{get_column_writer, ColumnWriter}, @@ -38,9 +36,7 @@ use crate::data_type::DataType; use crate::errors::{ParquetError, Result}; use crate::file::reader::ChunkReader; use crate::file::{metadata::*, properties::WriterPropertiesPtr, PARQUET_MAGIC}; -use crate::schema::types::{ - self, ColumnDescPtr, SchemaDescPtr, 
SchemaDescriptor, TypePtr, -}; +use crate::schema::types::{self, ColumnDescPtr, SchemaDescPtr, SchemaDescriptor, TypePtr}; /// A wrapper around a [`Write`] that keeps track of the number /// of bytes that have been written. The given [`Write`] is wrapped @@ -68,10 +64,7 @@ impl TrackedWrite { /// Returns the underlying writer. pub fn into_inner(self) -> Result { self.inner.into_inner().map_err(|err| { - ParquetError::General(format!( - "fail to get inner writer: {:?}", - err.to_string() - )) + ParquetError::General(format!("fail to get inner writer: {:?}", err.to_string())) }) } } @@ -193,16 +186,14 @@ impl SerializedFileWriter { let row_bloom_filters = &mut self.bloom_filters; let row_column_indexes = &mut self.column_indexes; let row_offset_indexes = &mut self.offset_indexes; - let on_close = |metadata, - row_group_bloom_filter, - row_group_column_index, - row_group_offset_index| { - row_groups.push(metadata); - row_bloom_filters.push(row_group_bloom_filter); - row_column_indexes.push(row_group_column_index); - row_offset_indexes.push(row_group_offset_index); - Ok(()) - }; + let on_close = + |metadata, row_group_bloom_filter, row_group_column_index, row_group_offset_index| { + row_groups.push(metadata); + row_bloom_filters.push(row_group_bloom_filter); + row_column_indexes.push(row_group_column_index); + row_offset_indexes.push(row_group_offset_index); + Ok(()) + }; let row_group_writer = SerializedRowGroupWriter::new( self.descr.clone(), @@ -238,8 +229,7 @@ impl SerializedFileWriter { // iter each column // write offset index to the file for (row_group_idx, row_group) in row_groups.iter_mut().enumerate() { - for (column_idx, column_metadata) in row_group.columns.iter_mut().enumerate() - { + for (column_idx, column_metadata) in row_group.columns.iter_mut().enumerate() { match &self.offset_indexes[row_group_idx][column_idx] { Some(offset_index) => { let start_offset = self.buf.bytes_written(); @@ -292,8 +282,7 @@ impl SerializedFileWriter { // iter each column // write column index to the file for (row_group_idx, row_group) in row_groups.iter_mut().enumerate() { - for (column_idx, column_metadata) in row_group.columns.iter_mut().enumerate() - { + for (column_idx, column_metadata) in row_group.columns.iter_mut().enumerate() { match &self.column_indexes[row_group_idx][column_idx] { Some(column_index) => { let start_offset = self.buf.bytes_written(); @@ -504,10 +493,7 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { /// Returns the next column writer, if available, using the factory function; /// otherwise returns `None`. 
- pub(crate) fn next_column_with_factory<'b, F, C>( - &'b mut self, - factory: F, - ) -> Result> + pub(crate) fn next_column_with_factory<'b, F, C>(&'b mut self, factory: F) -> Result> where F: FnOnce( ColumnDescPtr, @@ -550,9 +536,9 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { mut close: ColumnCloseResult, ) -> Result<()> { self.assert_previous_writer_closed()?; - let desc = self.next_column_desc().ok_or_else(|| { - general_err!("exhausted columns in SerializedRowGroupWriter") - })?; + let desc = self + .next_column_desc() + .ok_or_else(|| general_err!("exhausted columns in SerializedRowGroupWriter"))?; let metadata = close.metadata; @@ -659,10 +645,7 @@ pub struct SerializedColumnWriter<'a> { impl<'a> SerializedColumnWriter<'a> { /// Create a new [`SerializedColumnWriter`] from a [`ColumnWriter`] and an /// optional callback to be invoked on [`Self::close`] - pub fn new( - inner: ColumnWriter<'a>, - on_close: Option>, - ) -> Self { + pub fn new(inner: ColumnWriter<'a>, on_close: Option>) -> Self { Self { inner, on_close } } @@ -862,8 +845,7 @@ mod tests { .unwrap(), ); let props = Default::default(); - let writer = - SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); + let writer = SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); writer.close().unwrap(); let reader = SerializedFileReader::new(file).unwrap(); @@ -892,8 +874,7 @@ mod tests { )])) .build(), ); - let writer = - SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); + let writer = SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); writer.close().unwrap(); let reader = SerializedFileReader::new(file).unwrap(); @@ -938,8 +919,7 @@ mod tests { .set_writer_version(WriterVersion::PARQUET_2_0) .build(), ); - let writer = - SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); + let writer = SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); writer.close().unwrap(); let reader = SerializedFileReader::new(file).unwrap(); @@ -1150,11 +1130,8 @@ mod tests { encoding, def_level_encoding, rep_level_encoding, - statistics: from_thrift( - physical_type, - to_thrift(statistics.as_ref()), - ) - .unwrap(), + statistics: from_thrift(physical_type, to_thrift(statistics.as_ref())) + .unwrap(), } } Page::DataPageV2 { @@ -1170,8 +1147,7 @@ mod tests { } => { total_num_values += num_values as i64; let offset = (def_levels_byte_len + rep_levels_byte_len) as usize; - let cmp_buf = - compress_helper(compressor.as_mut(), &buf.data()[offset..]); + let cmp_buf = compress_helper(compressor.as_mut(), &buf.data()[offset..]); let mut output_buf = Vec::from(&buf.data()[..offset]); output_buf.extend_from_slice(&cmp_buf[..]); @@ -1184,11 +1160,8 @@ mod tests { def_levels_byte_len, rep_levels_byte_len, is_compressed: compressor.is_some(), - statistics: from_thrift( - physical_type, - to_thrift(statistics.as_ref()), - ) - .unwrap(), + statistics: from_thrift(physical_type, to_thrift(statistics.as_ref())) + .unwrap(), } } Page::DictionaryPage { @@ -1291,12 +1264,7 @@ mod tests { W: Write + Send, R: ChunkReader + From + 'static, { - test_roundtrip::( - file, - data, - |r| r.get_int(0).unwrap(), - compression, - ) + test_roundtrip::(file, data, |r| r.get_int(0).unwrap(), compression) } /// Tests roundtrip of data of type `D` written using `W` and read using `R` @@ -1329,8 +1297,7 @@ mod tests { .set_compression(compression) .build(), ); - let mut file_writer = - SerializedFileWriter::new(&mut 
file, schema, props).unwrap(); + let mut file_writer = SerializedFileWriter::new(&mut file, schema, props).unwrap(); let mut rows: i64 = 0; for (idx, subset) in data.iter().enumerate() { @@ -1378,10 +1345,7 @@ mod tests { /// File write-read roundtrip. /// `data` consists of arrays of values for each row group. - fn test_file_roundtrip( - file: File, - data: Vec>, - ) -> crate::format::FileMetaData { + fn test_file_roundtrip(file: File, data: Vec>) -> crate::format::FileMetaData { test_roundtrip_i32::(file, data, Compression::UNCOMPRESSED) } @@ -1467,10 +1431,7 @@ mod tests { }); } - fn test_kv_metadata( - initial_kv: Option>, - final_kv: Option>, - ) { + fn test_kv_metadata(initial_kv: Option>, final_kv: Option>) { let schema = Arc::new( types::Type::group_type_builder("schema") .with_fields(vec![Arc::new( @@ -1599,8 +1560,7 @@ mod tests { let props = Arc::new(WriterProperties::builder().build()); let mut file = Vec::with_capacity(1024); - let mut file_writer = - SerializedFileWriter::new(&mut file, schema, props.clone()).unwrap(); + let mut file_writer = SerializedFileWriter::new(&mut file, schema, props.clone()).unwrap(); let columns = file_writer.descr.columns(); let mut column_state: Vec<(_, Option)> = columns @@ -1715,8 +1675,7 @@ mod tests { assert!(row_group.columns[1].column_index_offset.is_none()); let options = ReadOptionsBuilder::new().with_page_index().build(); - let reader = - SerializedFileReader::new_with_options(Bytes::from(file), options).unwrap(); + let reader = SerializedFileReader::new_with_options(Bytes::from(file), options).unwrap(); let offset_index = reader.metadata().offset_index().unwrap(); assert_eq!(offset_index.len(), 1); // 1 row group diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index ccff233c21db..c7a0b09c37ed 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -643,13 +643,13 @@ impl Field { let field = match descr.physical_type() { PhysicalType::BYTE_ARRAY => match descr.converted_type() { ConvertedType::UTF8 | ConvertedType::ENUM | ConvertedType::JSON => { - let value = - String::from_utf8(value.data().to_vec()).map_err(|e| { - general_err!( - "Error reading BYTE_ARRAY as String. Bytes: {:?} Error: {:?}", - value.data(), e - ) - })?; + let value = String::from_utf8(value.data().to_vec()).map_err(|e| { + general_err!( + "Error reading BYTE_ARRAY as String. 
Bytes: {:?} Error: {:?}", + value.data(), + e + ) + })?; Field::Str(value) } ConvertedType::BSON | ConvertedType::NONE => Field::Bytes(value), @@ -700,12 +700,8 @@ impl Field { Field::Str(s) => Value::String(s.to_owned()), Field::Bytes(b) => Value::String(BASE64_STANDARD.encode(b.data())), Field::Date(d) => Value::String(convert_date_to_string(*d)), - Field::TimestampMillis(ts) => { - Value::String(convert_timestamp_millis_to_string(*ts)) - } - Field::TimestampMicros(ts) => { - Value::String(convert_timestamp_micros_to_string(*ts)) - } + Field::TimestampMillis(ts) => Value::String(convert_timestamp_millis_to_string(*ts)), + Field::TimestampMicros(ts) => Value::String(convert_timestamp_micros_to_string(*ts)), Field::Group(row) => row.to_json_value(), Field::ListInternal(fields) => { Value::Array(fields.elements.iter().map(|f| f.to_json_value()).collect()) @@ -955,8 +951,7 @@ mod tests { let row = Field::convert_int32(&descr, 14611); assert_eq!(row, Field::Date(14611)); - let descr = - make_column_descr![PhysicalType::INT32, ConvertedType::DECIMAL, 0, 8, 2]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::DECIMAL, 0, 8, 2]; let row = Field::convert_int32(&descr, 444); assert_eq!(row, Field::Decimal(Decimal::from_i32(444, 8, 2))); } @@ -971,13 +966,11 @@ mod tests { let row = Field::convert_int64(&descr, 78239823); assert_eq!(row, Field::ULong(78239823)); - let descr = - make_column_descr![PhysicalType::INT64, ConvertedType::TIMESTAMP_MILLIS]; + let descr = make_column_descr![PhysicalType::INT64, ConvertedType::TIMESTAMP_MILLIS]; let row = Field::convert_int64(&descr, 1541186529153); assert_eq!(row, Field::TimestampMillis(1541186529153)); - let descr = - make_column_descr![PhysicalType::INT64, ConvertedType::TIMESTAMP_MICROS]; + let descr = make_column_descr![PhysicalType::INT64, ConvertedType::TIMESTAMP_MICROS]; let row = Field::convert_int64(&descr, 1541186529153123); assert_eq!(row, Field::TimestampMicros(1541186529153123)); @@ -985,8 +978,7 @@ mod tests { let row = Field::convert_int64(&descr, 2222); assert_eq!(row, Field::Long(2222)); - let descr = - make_column_descr![PhysicalType::INT64, ConvertedType::DECIMAL, 0, 8, 2]; + let descr = make_column_descr![PhysicalType::INT64, ConvertedType::DECIMAL, 0, 8, 2]; let row = Field::convert_int64(&descr, 3333); assert_eq!(row, Field::Decimal(Decimal::from_i64(3333, 8, 2))); } @@ -1054,8 +1046,7 @@ mod tests { assert_eq!(row.unwrap(), Field::Bytes(value)); // DECIMAL - let descr = - make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::DECIMAL, 0, 8, 2]; + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::DECIMAL, 0, 8, 2]; let value = ByteArray::from(vec![207, 200]); let row = Field::convert_byte_array(&descr, value.clone()); assert_eq!( @@ -1825,11 +1816,7 @@ mod tests { serde_json::json!({"X": 1, "Y": 2.2, "Z": "abc"}) ); - let row = Field::ListInternal(make_list(vec![ - Field::Int(1), - Field::Int(12), - Field::Null, - ])); + let row = Field::ListInternal(make_list(vec![Field::Int(1), Field::Int(12), Field::Null])); let array = vec![ Value::Number(serde_json::Number::from(1)), Value::Number(serde_json::Number::from(12)), diff --git a/parquet/src/record/mod.rs b/parquet/src/record/mod.rs index ce83cfa2b14a..771d8058c9c1 100644 --- a/parquet/src/record/mod.rs +++ b/parquet/src/record/mod.rs @@ -24,8 +24,7 @@ mod triplet; pub use self::{ api::{ - Field, List, ListAccessor, Map, MapAccessor, Row, RowAccessor, RowColumnIter, - RowFormatter, + Field, List, ListAccessor, Map, MapAccessor, Row, 
RowAccessor, RowColumnIter, RowFormatter, }, record_writer::RecordWriter, }; diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index 2a9b6dbb0bed..f98939725517 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -82,14 +82,8 @@ impl TreeBuilder { let mut path = Vec::new(); for field in descr.root_schema().get_fields() { - let reader = self.reader_tree( - field.clone(), - &mut path, - 0, - 0, - &paths, - row_group_reader, - )?; + let reader = + self.reader_tree(field.clone(), &mut path, 0, 0, &paths, row_group_reader)?; readers.push(reader); } @@ -152,11 +146,7 @@ impl TreeBuilder { match field.get_basic_info().converted_type() { // List types ConvertedType::LIST => { - assert_eq!( - field.get_fields().len(), - 1, - "Invalid list type {field:?}" - ); + assert_eq!(field.get_fields().len(), 1, "Invalid list type {field:?}"); let repeated_field = field.get_fields()[0].clone(); assert_eq!( @@ -208,11 +198,7 @@ impl TreeBuilder { } // Map types (key-value pairs) ConvertedType::MAP | ConvertedType::MAP_KEY_VALUE => { - assert_eq!( - field.get_fields().len(), - 1, - "Invalid map type: {field:?}" - ); + assert_eq!(field.get_fields().len(), 1, "Invalid map type: {field:?}"); assert!( !field.get_fields()[0].is_primitive(), "Invalid map type: {field:?}" @@ -404,8 +390,7 @@ impl Reader { Reader::GroupReader(_, _, ref mut readers) => { let mut fields = Vec::new(); for reader in readers { - fields - .push((String::from(reader.field_name()), reader.read_field()?)); + fields.push((String::from(reader.field_name()), reader.read_field()?)); } Ok(make_row(fields)) } @@ -436,10 +421,7 @@ impl Reader { if reader.repetition() != Repetition::OPTIONAL || reader.current_def_level() > def_level { - fields.push(( - String::from(reader.field_name()), - reader.read_field()?, - )); + fields.push((String::from(reader.field_name()), reader.read_field()?)); } else { reader.advance_columns(); fields.push((String::from(reader.field_name()), Field::Null)); @@ -471,13 +453,7 @@ impl Reader { } Field::ListInternal(make_list(elements)) } - Reader::KeyValueReader( - _, - def_level, - rep_level, - ref mut keys, - ref mut values, - ) => { + Reader::KeyValueReader(_, def_level, rep_level, ref mut keys, ref mut values) => { let mut pairs = Vec::new(); loop { if keys.current_def_level() > def_level { @@ -672,19 +648,14 @@ impl<'a> RowIter<'a> { /// file. pub fn from_file(proj: Option, reader: &'a dyn FileReader) -> Result { let either = Either::Left(reader); - let descr = Self::get_proj_descr( - proj, - reader.metadata().file_metadata().schema_descr_ptr(), - )?; + let descr = + Self::get_proj_descr(proj, reader.metadata().file_metadata().schema_descr_ptr())?; Ok(Self::new(Some(either), None, descr)) } /// Creates iterator of [`Row`]s for a specific row group. - pub fn from_row_group( - proj: Option, - reader: &'a dyn RowGroupReader, - ) -> Result { + pub fn from_row_group(proj: Option, reader: &'a dyn RowGroupReader) -> Result { let descr = Self::get_proj_descr(proj, reader.metadata().schema_descr_ptr())?; let tree_builder = Self::tree_builder(); let row_iter = tree_builder.as_iter(descr.clone(), reader)?; @@ -730,10 +701,7 @@ impl<'a> RowIter<'a> { /// Helper method to get schema descriptor for projected schema. /// If projection is None, then full schema is returned. 
#[inline] - fn get_proj_descr( - proj: Option, - root_descr: SchemaDescPtr, - ) -> Result { + fn get_proj_descr(proj: Option, root_descr: SchemaDescPtr) -> Result { match proj { Some(projection) => { // check if projection is part of file schema @@ -999,17 +967,11 @@ mod tests { list![ group![ ("E".to_string(), Field::Int(10)), - ( - "F".to_string(), - Field::Str("aaa".to_string()) - ) + ("F".to_string(), Field::Str("aaa".to_string())) ], group![ ("E".to_string(), Field::Int(-10)), - ( - "F".to_string(), - Field::Str("bbb".to_string()) - ) + ("F".to_string(), Field::Str("bbb".to_string())) ] ], list![group![ @@ -1089,10 +1051,7 @@ mod tests { ], group![ ("E".to_string(), Field::Int(10)), - ( - "F".to_string(), - Field::Str("aaa".to_string()) - ) + ("F".to_string(), Field::Str("aaa".to_string())) ], group![ ("E".to_string(), Field::Null), @@ -1100,10 +1059,7 @@ mod tests { ], group![ ("E".to_string(), Field::Int(-10)), - ( - "F".to_string(), - Field::Str("bbb".to_string()) - ) + ("F".to_string(), Field::Str("bbb".to_string())) ], group![ ("E".to_string(), Field::Null), @@ -1113,10 +1069,7 @@ mod tests { list![ group![ ("E".to_string(), Field::Int(11)), - ( - "F".to_string(), - Field::Str("c".to_string()) - ) + ("F".to_string(), Field::Str("c".to_string())) ], Field::Null ], @@ -1140,10 +1093,7 @@ mod tests { ), ( Field::Str("g2".to_string()), - group![( - "H".to_string(), - group![("i".to_string(), list![])] - )] + group![("H".to_string(), group![("i".to_string(), list![])])] ), (Field::Str("g3".to_string()), Field::Null), ( @@ -1277,8 +1227,7 @@ mod tests { } "; let schema = parse_message_type(schema).unwrap(); - let rows = - test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); + let rows = test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); let expected_rows = vec![ row![ ("c".to_string(), Field::Double(1.0)), @@ -1345,8 +1294,7 @@ mod tests { } "; let schema = parse_message_type(schema).unwrap(); - let rows = - test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); + let rows = test_file_reader_rows("nested_maps.snappy.parquet", Some(schema)).unwrap(); let expected_rows = vec![ row![( "a".to_string(), @@ -1412,8 +1360,7 @@ mod tests { } "; let schema = parse_message_type(schema).unwrap(); - let rows = - test_file_reader_rows("nested_lists.snappy.parquet", Some(schema)).unwrap(); + let rows = test_file_reader_rows("nested_lists.snappy.parquet", Some(schema)).unwrap(); let expected_rows = vec![ row![( "a".to_string(), diff --git a/parquet/src/record/triplet.rs b/parquet/src/record/triplet.rs index 1d3488bf2d63..7647b23e28d7 100644 --- a/parquet/src/record/triplet.rs +++ b/parquet/src/record/triplet.rs @@ -58,32 +58,30 @@ impl TripletIter { /// Creates new triplet for column reader pub fn new(descr: ColumnDescPtr, reader: ColumnReader, batch_size: usize) -> Self { match descr.physical_type() { - PhysicalType::BOOLEAN => TripletIter::BoolTripletIter(TypedTripletIter::new( - descr, batch_size, reader, - )), - PhysicalType::INT32 => TripletIter::Int32TripletIter(TypedTripletIter::new( - descr, batch_size, reader, - )), - PhysicalType::INT64 => TripletIter::Int64TripletIter(TypedTripletIter::new( - descr, batch_size, reader, - )), - PhysicalType::INT96 => TripletIter::Int96TripletIter(TypedTripletIter::new( - descr, batch_size, reader, - )), - PhysicalType::FLOAT => TripletIter::FloatTripletIter(TypedTripletIter::new( - descr, batch_size, reader, - )), - PhysicalType::DOUBLE => TripletIter::DoubleTripletIter( - 
TypedTripletIter::new(descr, batch_size, reader), - ), - PhysicalType::BYTE_ARRAY => TripletIter::ByteArrayTripletIter( + PhysicalType::BOOLEAN => { + TripletIter::BoolTripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::INT32 => { + TripletIter::Int32TripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::INT64 => { + TripletIter::Int64TripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::INT96 => { + TripletIter::Int96TripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::FLOAT => { + TripletIter::FloatTripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::DOUBLE => { + TripletIter::DoubleTripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::BYTE_ARRAY => { + TripletIter::ByteArrayTripletIter(TypedTripletIter::new(descr, batch_size, reader)) + } + PhysicalType::FIXED_LEN_BYTE_ARRAY => TripletIter::FixedLenByteArrayTripletIter( TypedTripletIter::new(descr, batch_size, reader), ), - PhysicalType::FIXED_LEN_BYTE_ARRAY => { - TripletIter::FixedLenByteArrayTripletIter(TypedTripletIter::new( - descr, batch_size, reader, - )) - } } } @@ -159,16 +157,13 @@ impl TripletIter { TripletIter::DoubleTripletIter(ref typed) => { Field::convert_double(typed.column_descr(), *typed.current_value()) } - TripletIter::ByteArrayTripletIter(ref typed) => Field::convert_byte_array( + TripletIter::ByteArrayTripletIter(ref typed) => { + Field::convert_byte_array(typed.column_descr(), typed.current_value().clone())? + } + TripletIter::FixedLenByteArrayTripletIter(ref typed) => Field::convert_byte_array( typed.column_descr(), - typed.current_value().clone(), + typed.current_value().clone().into(), )?, - TripletIter::FixedLenByteArrayTripletIter(ref typed) => { - Field::convert_byte_array( - typed.column_descr(), - typed.current_value().clone().into(), - )? - } }; Ok(field) } @@ -371,8 +366,7 @@ mod tests { #[test] #[should_panic(expected = "Expected positive batch size, found: 0")] fn test_triplet_zero_batch_size() { - let column_path = - ColumnPath::from(vec!["b_struct".to_string(), "b_c_int".to_string()]); + let column_path = ColumnPath::from(vec!["b_struct".to_string(), "b_c_int".to_string()]); test_column_in_file("nulls.snappy.parquet", 0, &column_path, &[], &[], &[]); } diff --git a/parquet/src/schema/parser.rs b/parquet/src/schema/parser.rs index d589f8c1100a..5e213e3bb9e5 100644 --- a/parquet/src/schema/parser.rs +++ b/parquet/src/schema/parser.rs @@ -44,9 +44,7 @@ use std::sync::Arc; -use crate::basic::{ - ConvertedType, LogicalType, Repetition, TimeUnit, Type as PhysicalType, -}; +use crate::basic::{ConvertedType, LogicalType, Repetition, TimeUnit, Type as PhysicalType}; use crate::errors::{ParquetError, Result}; use crate::schema::types::{Type, TypePtr}; @@ -153,11 +151,7 @@ fn assert_token(token: Option<&str>, expected: &str) -> Result<()> { // Utility function to parse i32 or return general error. #[inline] -fn parse_i32( - value: Option<&str>, - not_found_msg: &str, - parse_fail_msg: &str, -) -> Result { +fn parse_i32(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> Result { value .ok_or_else(|| general_err!(not_found_msg)) .and_then(|v| v.parse::().map_err(|_| general_err!(parse_fail_msg))) @@ -165,11 +159,7 @@ fn parse_i32( // Utility function to parse boolean or return general error. 
#[inline] -fn parse_bool( - value: Option<&str>, - not_found_msg: &str, - parse_fail_msg: &str, -) -> Result { +fn parse_bool(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> Result { value .ok_or_else(|| general_err!(not_found_msg)) .and_then(|v| { @@ -238,9 +228,7 @@ impl<'a> Parser<'a> { .and_then(|v| v.to_uppercase().parse::())?; match self.tokenizer.next() { - Some(group) if group.to_uppercase() == "GROUP" => { - self.add_group_type(Some(repetition)) - } + Some(group) if group.to_uppercase() == "GROUP" => self.add_group_type(Some(repetition)), Some(type_string) => { let physical_type = type_string.to_uppercase().parse::()?; self.add_primitive_type(repetition, physical_type) @@ -267,10 +255,9 @@ impl<'a> Parser<'a> { let upper = v.to_uppercase(); let logical = upper.parse::(); match logical { - Ok(logical) => Ok(( - Some(logical.clone()), - ConvertedType::from(Some(logical)), - )), + Ok(logical) => { + Ok((Some(logical.clone()), ConvertedType::from(Some(logical)))) + } Err(_) => Ok((None, upper.parse::()?)), } })?; @@ -324,184 +311,187 @@ impl<'a> Parser<'a> { .ok_or_else(|| general_err!("Expected name, found None"))?; // Parse converted type - let (logical_type, converted_type, precision, scale) = if let Some("(") = - self.tokenizer.next() - { - let (mut logical, mut converted) = self - .tokenizer - .next() - .ok_or_else(|| { - general_err!("Expected logical or converted type, found None") - }) - .and_then(|v| { - let upper = v.to_uppercase(); - let logical = upper.parse::(); - match logical { - Ok(logical) => Ok(( - Some(logical.clone()), - ConvertedType::from(Some(logical)), - )), - Err(_) => Ok((None, upper.parse::()?)), - } - })?; - - // Parse precision and scale for decimals - let mut precision: i32 = -1; - let mut scale: i32 = -1; - - // Parse the concrete logical type - if let Some(tpe) = &logical { - match tpe { - LogicalType::Decimal { .. } => { - if let Some("(") = self.tokenizer.next() { - precision = parse_i32( - self.tokenizer.next(), - "Expected precision, found None", - "Failed to parse precision for DECIMAL type", - )?; - if let Some(",") = self.tokenizer.next() { - scale = parse_i32( + let (logical_type, converted_type, precision, scale) = + if let Some("(") = self.tokenizer.next() { + let (mut logical, mut converted) = self + .tokenizer + .next() + .ok_or_else(|| general_err!("Expected logical or converted type, found None")) + .and_then(|v| { + let upper = v.to_uppercase(); + let logical = upper.parse::(); + match logical { + Ok(logical) => { + Ok((Some(logical.clone()), ConvertedType::from(Some(logical)))) + } + Err(_) => Ok((None, upper.parse::()?)), + } + })?; + + // Parse precision and scale for decimals + let mut precision: i32 = -1; + let mut scale: i32 = -1; + + // Parse the concrete logical type + if let Some(tpe) = &logical { + match tpe { + LogicalType::Decimal { .. 
} => { + if let Some("(") = self.tokenizer.next() { + precision = parse_i32( self.tokenizer.next(), - "Expected scale, found None", - "Failed to parse scale for DECIMAL type", + "Expected precision, found None", + "Failed to parse precision for DECIMAL type", )?; - assert_token(self.tokenizer.next(), ")")?; - } else { - scale = 0 + if let Some(",") = self.tokenizer.next() { + scale = parse_i32( + self.tokenizer.next(), + "Expected scale, found None", + "Failed to parse scale for DECIMAL type", + )?; + assert_token(self.tokenizer.next(), ")")?; + } else { + scale = 0 + } + logical = Some(LogicalType::Decimal { scale, precision }); + converted = ConvertedType::from(logical.clone()); } - logical = Some(LogicalType::Decimal { scale, precision }); - converted = ConvertedType::from(logical.clone()); } - } - LogicalType::Time { .. } => { - if let Some("(") = self.tokenizer.next() { - let unit = parse_timeunit( - self.tokenizer.next(), - "Invalid timeunit found", - "Failed to parse timeunit for TIME type", - )?; - if let Some(",") = self.tokenizer.next() { - let is_adjusted_to_u_t_c = parse_bool( + LogicalType::Time { .. } => { + if let Some("(") = self.tokenizer.next() { + let unit = parse_timeunit( self.tokenizer.next(), - "Invalid boolean found", - "Failed to parse timezone info for TIME type", + "Invalid timeunit found", + "Failed to parse timeunit for TIME type", )?; - assert_token(self.tokenizer.next(), ")")?; - logical = Some(LogicalType::Time { - is_adjusted_to_u_t_c, - unit, - }); - converted = ConvertedType::from(logical.clone()); - } else { - // Invalid token for unit - self.tokenizer.backtrack(); + if let Some(",") = self.tokenizer.next() { + let is_adjusted_to_u_t_c = parse_bool( + self.tokenizer.next(), + "Invalid boolean found", + "Failed to parse timezone info for TIME type", + )?; + assert_token(self.tokenizer.next(), ")")?; + logical = Some(LogicalType::Time { + is_adjusted_to_u_t_c, + unit, + }); + converted = ConvertedType::from(logical.clone()); + } else { + // Invalid token for unit + self.tokenizer.backtrack(); + } } } - } - LogicalType::Timestamp { .. } => { - if let Some("(") = self.tokenizer.next() { - let unit = parse_timeunit( - self.tokenizer.next(), - "Invalid timeunit found", - "Failed to parse timeunit for TIMESTAMP type", - )?; - if let Some(",") = self.tokenizer.next() { - let is_adjusted_to_u_t_c = parse_bool( + LogicalType::Timestamp { .. } => { + if let Some("(") = self.tokenizer.next() { + let unit = parse_timeunit( self.tokenizer.next(), - "Invalid boolean found", - "Failed to parse timezone info for TIMESTAMP type", + "Invalid timeunit found", + "Failed to parse timeunit for TIMESTAMP type", )?; - assert_token(self.tokenizer.next(), ")")?; - logical = Some(LogicalType::Timestamp { - is_adjusted_to_u_t_c, - unit, - }); - converted = ConvertedType::from(logical.clone()); - } else { - // Invalid token for unit - self.tokenizer.backtrack(); + if let Some(",") = self.tokenizer.next() { + let is_adjusted_to_u_t_c = parse_bool( + self.tokenizer.next(), + "Invalid boolean found", + "Failed to parse timezone info for TIMESTAMP type", + )?; + assert_token(self.tokenizer.next(), ")")?; + logical = Some(LogicalType::Timestamp { + is_adjusted_to_u_t_c, + unit, + }); + converted = ConvertedType::from(logical.clone()); + } else { + // Invalid token for unit + self.tokenizer.backtrack(); + } } } - } - LogicalType::Integer { .. 
} => { - if let Some("(") = self.tokenizer.next() { - let bit_width = parse_i32( - self.tokenizer.next(), - "Invalid bit_width found", - "Failed to parse bit_width for INTEGER type", - )? as i8; - match physical_type { - PhysicalType::INT32 => { - match bit_width { + LogicalType::Integer { .. } => { + if let Some("(") = self.tokenizer.next() { + let bit_width = parse_i32( + self.tokenizer.next(), + "Invalid bit_width found", + "Failed to parse bit_width for INTEGER type", + )? as i8; + match physical_type { + PhysicalType::INT32 => match bit_width { 8 | 16 | 32 => {} _ => { - return Err(general_err!("Incorrect bit width {} for INT32", bit_width)) + return Err(general_err!( + "Incorrect bit width {} for INT32", + bit_width + )) + } + }, + PhysicalType::INT64 => { + if bit_width != 64 { + return Err(general_err!( + "Incorrect bit width {} for INT64", + bit_width + )); } } - } - PhysicalType::INT64 => { - if bit_width != 64 { - return Err(general_err!("Incorrect bit width {} for INT64", bit_width)) + _ => { + return Err(general_err!( + "Logical type Integer cannot be used with physical type {}", + physical_type + )) } } - _ => { - return Err(general_err!("Logical type Integer cannot be used with physical type {}", physical_type)) + if let Some(",") = self.tokenizer.next() { + let is_signed = parse_bool( + self.tokenizer.next(), + "Invalid boolean found", + "Failed to parse is_signed for INTEGER type", + )?; + assert_token(self.tokenizer.next(), ")")?; + logical = Some(LogicalType::Integer { + bit_width, + is_signed, + }); + converted = ConvertedType::from(logical.clone()); + } else { + // Invalid token for unit + self.tokenizer.backtrack(); } } - if let Some(",") = self.tokenizer.next() { - let is_signed = parse_bool( - self.tokenizer.next(), - "Invalid boolean found", - "Failed to parse is_signed for INTEGER type", - )?; - assert_token(self.tokenizer.next(), ")")?; - logical = Some(LogicalType::Integer { - bit_width, - is_signed, - }); - converted = ConvertedType::from(logical.clone()); - } else { - // Invalid token for unit - self.tokenizer.backtrack(); - } } + _ => {} } - _ => {} - } - } else if converted == ConvertedType::DECIMAL { - if let Some("(") = self.tokenizer.next() { - // Parse precision - precision = parse_i32( - self.tokenizer.next(), - "Expected precision, found None", - "Failed to parse precision for DECIMAL type", - )?; - - // Parse scale - scale = if let Some(",") = self.tokenizer.next() { - parse_i32( + } else if converted == ConvertedType::DECIMAL { + if let Some("(") = self.tokenizer.next() { + // Parse precision + precision = parse_i32( self.tokenizer.next(), - "Expected scale, found None", - "Failed to parse scale for DECIMAL type", - )? + "Expected precision, found None", + "Failed to parse precision for DECIMAL type", + )?; + + // Parse scale + scale = if let Some(",") = self.tokenizer.next() { + parse_i32( + self.tokenizer.next(), + "Expected scale, found None", + "Failed to parse scale for DECIMAL type", + )? + } else { + // Scale is not provided, set it to 0. + self.tokenizer.backtrack(); + 0 + }; + + assert_token(self.tokenizer.next(), ")")?; } else { - // Scale is not provided, set it to 0. 
self.tokenizer.backtrack(); - 0 - }; - - assert_token(self.tokenizer.next(), ")")?; - } else { - self.tokenizer.backtrack(); + } } - } - assert_token(self.tokenizer.next(), ")")?; - (logical, converted, precision, scale) - } else { - self.tokenizer.backtrack(); - (None, ConvertedType::NONE, -1, -1) - }; + assert_token(self.tokenizer.next(), ")")?; + (logical, converted, precision, scale) + } else { + self.tokenizer.backtrack(); + (None, ConvertedType::NONE, -1, -1) + }; // Parse optional id let id = if let Some("=") = self.tokenizer.next() { @@ -605,12 +595,11 @@ mod tests { assert_eq!( res, vec![ - "message", "schema", "{", "required", "int32", "a", ";", "optional", - "binary", "c", "(", "UTF8", ")", ";", "required", "group", "d", "{", - "required", "int32", "a", ";", "optional", "binary", "c", "(", "UTF8", - ")", ";", "}", "required", "group", "e", "(", "LIST", ")", "{", - "repeated", "group", "list", "{", "required", "int32", "element", ";", - "}", "}", "}" + "message", "schema", "{", "required", "int32", "a", ";", "optional", "binary", "c", + "(", "UTF8", ")", ";", "required", "group", "d", "{", "required", "int32", "a", + ";", "optional", "binary", "c", "(", "UTF8", ")", ";", "}", "required", "group", + "e", "(", "LIST", ")", "{", "repeated", "group", "list", "{", "required", "int32", + "element", ";", "}", "}", "}" ] ); } @@ -841,36 +830,30 @@ mod tests { let expected = Type::group_type_builder("root") .with_fields(vec![ Arc::new( - Type::primitive_type_builder( - "f1", - PhysicalType::FIXED_LEN_BYTE_ARRAY, - ) - .with_logical_type(Some(LogicalType::Decimal { - precision: 9, - scale: 3, - })) - .with_converted_type(ConvertedType::DECIMAL) - .with_length(5) - .with_precision(9) - .with_scale(3) - .build() - .unwrap(), + Type::primitive_type_builder("f1", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_logical_type(Some(LogicalType::Decimal { + precision: 9, + scale: 3, + })) + .with_converted_type(ConvertedType::DECIMAL) + .with_length(5) + .with_precision(9) + .with_scale(3) + .build() + .unwrap(), ), Arc::new( - Type::primitive_type_builder( - "f2", - PhysicalType::FIXED_LEN_BYTE_ARRAY, - ) - .with_logical_type(Some(LogicalType::Decimal { - precision: 38, - scale: 18, - })) - .with_converted_type(ConvertedType::DECIMAL) - .with_length(16) - .with_precision(38) - .with_scale(18) - .build() - .unwrap(), + Type::primitive_type_builder("f2", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_logical_type(Some(LogicalType::Decimal { + precision: 38, + scale: 18, + })) + .with_converted_type(ConvertedType::DECIMAL) + .with_length(16) + .with_precision(38) + .with_scale(18) + .build() + .unwrap(), ), ]) .build() @@ -910,14 +893,11 @@ mod tests { .with_logical_type(Some(LogicalType::List)) .with_converted_type(ConvertedType::LIST) .with_fields(vec![Arc::new( - Type::primitive_type_builder( - "a2", - PhysicalType::BYTE_ARRAY, - ) - .with_repetition(Repetition::REPEATED) - .with_converted_type(ConvertedType::UTF8) - .build() - .unwrap(), + Type::primitive_type_builder("a2", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REPEATED) + .with_converted_type(ConvertedType::UTF8) + .build() + .unwrap(), )]) .build() .unwrap(), diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index fe63e758b251..fe4757d41aed 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -46,9 +46,7 @@ use std::{fmt, io}; use crate::basic::{ConvertedType, LogicalType, TimeUnit, Type as PhysicalType}; -use crate::file::metadata::{ - ColumnChunkMetaData, FileMetaData, 
ParquetMetaData, RowGroupMetaData, -}; +use crate::file::metadata::{ColumnChunkMetaData, FileMetaData, ParquetMetaData, RowGroupMetaData}; use crate::schema::types::Type; /// Prints Parquet metadata [`ParquetMetaData`] information. @@ -119,10 +117,7 @@ fn print_row_group_metadata(out: &mut dyn io::Write, rg_metadata: &RowGroupMetaD } #[allow(unused_must_use)] -fn print_column_chunk_metadata( - out: &mut dyn io::Write, - cc_metadata: &ColumnChunkMetaData, -) { +fn print_column_chunk_metadata(out: &mut dyn io::Write, cc_metadata: &ColumnChunkMetaData) { writeln!(out, "column type: {}", cc_metadata.column_type()); writeln!(out, "column path: {}", cc_metadata.column_path()); let encoding_strs: Vec<_> = cc_metadata @@ -648,34 +643,28 @@ mod tests { "REQUIRED FIXED_LEN_BYTE_ARRAY (16) field (UUID);", ), ( - Type::primitive_type_builder( - "decimal", - PhysicalType::FIXED_LEN_BYTE_ARRAY, - ) - .with_logical_type(Some(LogicalType::Decimal { - precision: 32, - scale: 20, - })) - .with_precision(32) - .with_scale(20) - .with_length(decimal_length_from_precision(32)) - .with_repetition(Repetition::REPEATED) - .build() - .unwrap(), + Type::primitive_type_builder("decimal", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_logical_type(Some(LogicalType::Decimal { + precision: 32, + scale: 20, + })) + .with_precision(32) + .with_scale(20) + .with_length(decimal_length_from_precision(32)) + .with_repetition(Repetition::REPEATED) + .build() + .unwrap(), "REPEATED FIXED_LEN_BYTE_ARRAY (14) decimal (DECIMAL(32,20));", ), ( - Type::primitive_type_builder( - "decimal", - PhysicalType::FIXED_LEN_BYTE_ARRAY, - ) - .with_converted_type(ConvertedType::DECIMAL) - .with_precision(19) - .with_scale(4) - .with_length(decimal_length_from_precision(19)) - .with_repetition(Repetition::OPTIONAL) - .build() - .unwrap(), + Type::primitive_type_builder("decimal", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_converted_type(ConvertedType::DECIMAL) + .with_precision(19) + .with_scale(4) + .with_length(decimal_length_from_precision(19)) + .with_repetition(Repetition::OPTIONAL) + .build() + .unwrap(), "OPTIONAL FIXED_LEN_BYTE_ARRAY (9) decimal (DECIMAL(19,4));", ), ]; @@ -708,13 +697,12 @@ mod tests { .with_logical_type(Some(LogicalType::String)) .with_id(Some(1)) .build(); - let f4 = - Type::primitive_type_builder("f4", PhysicalType::FIXED_LEN_BYTE_ARRAY) - .with_repetition(Repetition::REPEATED) - .with_converted_type(ConvertedType::INTERVAL) - .with_length(12) - .with_id(Some(2)) - .build(); + let f4 = Type::primitive_type_builder("f4", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REPEATED) + .with_converted_type(ConvertedType::INTERVAL) + .with_length(12) + .with_id(Some(2)) + .build(); let struct_fields = vec![ Arc::new(f1.unwrap()), diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index f4cb3a9956d6..11c735420957 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -22,8 +22,7 @@ use std::{collections::HashMap, convert::From, fmt, sync::Arc}; use crate::format::SchemaElement; use crate::basic::{ - ColumnOrder, ConvertedType, LogicalType, Repetition, SortOrder, TimeUnit, - Type as PhysicalType, + ColumnOrder, ConvertedType, LogicalType, Repetition, SortOrder, TimeUnit, Type as PhysicalType, }; use crate::errors::{ParquetError, Result}; @@ -58,10 +57,7 @@ pub enum Type { impl Type { /// Creates primitive type builder with provided field name and physical type. 
- pub fn primitive_type_builder( - name: &str, - physical_type: PhysicalType, - ) -> PrimitiveTypeBuilder { + pub fn primitive_type_builder(name: &str, physical_type: PhysicalType) -> PrimitiveTypeBuilder { PrimitiveTypeBuilder::new(name, physical_type) } @@ -128,8 +124,7 @@ impl Type { /// This method can be used to check if projected columns are part of the root schema. pub fn check_contains(&self, sub_type: &Type) -> bool { // Names match, and repetitions match or not set for both - let basic_match = self.get_basic_info().name() - == sub_type.get_basic_info().name() + let basic_match = self.get_basic_info().name() == sub_type.get_basic_info().name() && (self.is_schema() && sub_type.is_schema() || !self.is_schema() && !sub_type.is_schema() @@ -292,9 +287,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { // If a converted type is populated, check that it is consistent with // its logical type if self.converted_type != ConvertedType::NONE { - if ConvertedType::from(self.logical_type.clone()) - != self.converted_type - { + if ConvertedType::from(self.logical_type.clone()) != self.converted_type { return Err(general_err!( "Logical type {:?} is incompatible with converted type {} for field '{}'", logical_type, @@ -420,9 +413,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { } } ConvertedType::INTERVAL => { - if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY - || self.length != 12 - { + if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY || self.length != 12 { return Err(general_err!( "INTERVAL cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(12) field", self.name @@ -431,7 +422,10 @@ impl<'a> PrimitiveTypeBuilder<'a> { } ConvertedType::ENUM => { if self.physical_type != PhysicalType::BYTE_ARRAY { - return Err(general_err!("ENUM cannot annotate field '{}' because it is not a BYTE_ARRAY field", self.name)); + return Err(general_err!( + "ENUM cannot annotate field '{}' because it is not a BYTE_ARRAY field", + self.name + )); } } _ => { @@ -507,8 +501,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { } } PhysicalType::FIXED_LEN_BYTE_ARRAY => { - let max_precision = - (2f64.powi(8 * self.length - 1) - 1f64).log10().floor() as i32; + let max_precision = (2f64.powi(8 * self.length - 1) - 1f64).log10().floor() as i32; if self.precision > max_precision { return Err(general_err!( @@ -1049,10 +1042,7 @@ pub fn from_thrift(elements: &[SchemaElement]) -> Result { /// The first result is the starting index for the next Type after this one. If it is /// equal to `elements.len()`, then this Type is the last one. /// The second result is the result Type. -fn from_thrift_helper( - elements: &[SchemaElement], - index: usize, -) -> Result<(usize, TypePtr)> { +fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize, TypePtr)> { // Whether or not the current node is root (message type). // There is only one message type node in the schema tree. 
let is_root_node = index == 0; @@ -1086,8 +1076,7 @@ fn from_thrift_helper( "Repetition level must be defined for a primitive type" )); } - let repetition = - Repetition::try_from(elements[index].repetition_type.unwrap())?; + let repetition = Repetition::try_from(elements[index].repetition_type.unwrap())?; let physical_type = PhysicalType::try_from(elements[index].type_.unwrap())?; let length = elements[index].type_length.unwrap_or(-1); let scale = elements[index].scale.unwrap_or(-1); @@ -1617,8 +1606,7 @@ mod tests { .with_repetition(Repetition::REQUIRED) .with_converted_type(ConvertedType::INT_64) .build()?; - let item2 = - Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?; + let item2 = Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?; let item3 = Type::primitive_type_builder("item3", PhysicalType::INT32) .with_repetition(Repetition::REPEATED) .with_converted_type(ConvertedType::INT_32) diff --git a/parquet/src/schema/visitor.rs b/parquet/src/schema/visitor.rs index f83782c638f1..35fde11f1fbb 100644 --- a/parquet/src/schema/visitor.rs +++ b/parquet/src/schema/visitor.rs @@ -59,17 +59,11 @@ pub trait TypeVisitor { match list_item.as_ref() { Type::PrimitiveType { .. } => { - if list_item.get_basic_info().repetition() == Repetition::REPEATED - { - self.visit_list_with_item( - list_type.clone(), - list_item.clone(), - context, - ) + if list_item.get_basic_info().repetition() == Repetition::REPEATED { + self.visit_list_with_item(list_type.clone(), list_item.clone(), context) } else { Err(General( - "Primitive element type of list must be repeated." - .to_string(), + "Primitive element type of list must be repeated.".to_string(), )) } } @@ -87,11 +81,7 @@ pub trait TypeVisitor { context, ) } else { - self.visit_list_with_item( - list_type.clone(), - list_item.clone(), - context, - ) + self.visit_list_with_item(list_type.clone(), list_item.clone(), context) } } } @@ -176,11 +166,7 @@ mod tests { Ok(true) } - fn visit_map( - &mut self, - _map_type: TypePtr, - _context: TestVisitorContext, - ) -> Result { + fn visit_map(&mut self, _map_type: TypePtr, _context: TestVisitorContext) -> Result { unimplemented!() } diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs index 57f52edc6ef0..ad6c3f688002 100644 --- a/parquet/src/thrift.rs +++ b/parquet/src/thrift.rs @@ -18,8 +18,8 @@ //! Custom thrift definitions use thrift::protocol::{ - TFieldIdentifier, TInputProtocol, TListIdentifier, TMapIdentifier, - TMessageIdentifier, TOutputProtocol, TSetIdentifier, TStructIdentifier, TType, + TFieldIdentifier, TInputProtocol, TListIdentifier, TMapIdentifier, TMessageIdentifier, + TOutputProtocol, TSetIdentifier, TStructIdentifier, TType, }; /// Reads and writes the struct to Thrift protocols. 
@@ -27,10 +27,7 @@ use thrift::protocol::{ /// Unlike [`thrift::protocol::TSerializable`] this uses generics instead of trait objects pub trait TSerializable: Sized { fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result; - fn write_to_out_protocol( - &self, - o_prot: &mut T, - ) -> thrift::Result<()>; + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()>; } /// A more performant implementation of [`TCompactInputProtocol`] that reads a slice diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index 3142c8c52063..fab87f32f5c4 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -60,8 +60,7 @@ fn do_test(test: LayoutTest) { let mut buf = Vec::with_capacity(1024); let mut writer = - ArrowWriter::try_new(&mut buf, test.batches[0].schema(), Some(test.props)) - .unwrap(); + ArrowWriter::try_new(&mut buf, test.batches[0].schema(), Some(test.props)).unwrap(); for batch in test.batches { writer.write(&batch).unwrap(); } @@ -71,8 +70,7 @@ fn do_test(test: LayoutTest) { // Re-read file to decode column index let read_options = ArrowReaderOptions::new().with_page_index(true); let reader = - ParquetRecordBatchReaderBuilder::try_new_with_options(b.clone(), read_options) - .unwrap(); + ParquetRecordBatchReaderBuilder::try_new_with_options(b.clone(), read_options).unwrap(); assert_layout(&b, reader.metadata().as_ref(), &test.layout); } @@ -89,9 +87,7 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { // Check against offset index assert_eq!(offset_index.len(), row_group_layout.columns.len()); - for (column_index, column_layout) in - offset_index.iter().zip(&row_group_layout.columns) - { + for (column_index, column_layout) in offset_index.iter().zip(&row_group_layout.columns) { assert_eq!( column_index.len(), column_layout.pages.len(), @@ -147,8 +143,7 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { let pages = page_reader.collect::, _>>().unwrap(); assert_eq!( pages.len(), - column_layout.pages.len() - + column_layout.dictionary_page.is_some() as usize, + column_layout.pages.len() + column_layout.dictionary_page.is_some() as usize, "page {idx} count mismatch" ); diff --git a/parquet_derive/src/parquet_field.rs b/parquet_derive/src/parquet_field.rs index ea6878283a33..e629bfe757ab 100644 --- a/parquet_derive/src/parquet_field.rs +++ b/parquet_derive/src/parquet_field.rs @@ -165,9 +165,7 @@ impl Field { | Type::Slice(ref third_type) => match **third_type { Type::TypePath(_) => Some(self.optional_definition_levels()), Type::Reference(_, ref fourth_type) => match **fourth_type { - Type::TypePath(_) => { - Some(self.optional_definition_levels()) - } + Type::TypePath(_) => Some(self.optional_definition_levels()), _ => unimplemented!("Unsupported definition encountered"), }, _ => unimplemented!("Unsupported definition encountered"), @@ -175,9 +173,7 @@ impl Field { Type::Reference(_, ref third_type) => match **third_type { Type::TypePath(_) => Some(self.optional_definition_levels()), Type::Slice(ref fourth_type) => match **fourth_type { - Type::TypePath(_) => { - Some(self.optional_definition_levels()) - } + Type::TypePath(_) => Some(self.optional_definition_levels()), _ => unimplemented!("Unsupported definition encountered"), }, _ => unimplemented!("Unsupported definition encountered"), @@ -281,8 +277,7 @@ impl Field { fn option_into_vals(&self) -> proc_macro2::TokenStream { let field_name = &self.ident; let is_a_byte_buf = 
self.is_a_byte_buf; - let is_a_timestamp = - self.third_party_type == Some(ThirdPartyType::ChronoNaiveDateTime); + let is_a_timestamp = self.third_party_type == Some(ThirdPartyType::ChronoNaiveDateTime); let is_a_date = self.third_party_type == Some(ThirdPartyType::ChronoNaiveDate); let is_a_uuid = self.third_party_type == Some(ThirdPartyType::Uuid); let copy_to_vec = !matches!( @@ -327,8 +322,7 @@ impl Field { fn copied_direct_vals(&self) -> proc_macro2::TokenStream { let field_name = &self.ident; let is_a_byte_buf = self.is_a_byte_buf; - let is_a_timestamp = - self.third_party_type == Some(ThirdPartyType::ChronoNaiveDateTime); + let is_a_timestamp = self.third_party_type == Some(ThirdPartyType::ChronoNaiveDateTime); let is_a_date = self.third_party_type == Some(ThirdPartyType::ChronoNaiveDate); let is_a_uuid = self.third_party_type == Some(ThirdPartyType::Uuid); @@ -415,10 +409,7 @@ impl Type { Type::leaf_type_recursive_helper(self, None) } - fn leaf_type_recursive_helper<'a>( - ty: &'a Type, - parent_ty: Option<&'a Type>, - ) -> &'a Type { + fn leaf_type_recursive_helper<'a>(ty: &'a Type, parent_ty: Option<&'a Type>) -> &'a Type { match ty { Type::TypePath(_) => parent_ty.unwrap_or(ty), Type::Option(ref first_type) @@ -598,9 +589,7 @@ impl Type { let last_part = self.last_part(); match last_part.trim() { - "NaiveDateTime" => { - Some(quote! { ::parquet::basic::ConvertedType::TIMESTAMP_MILLIS }) - } + "NaiveDateTime" => Some(quote! { ::parquet::basic::ConvertedType::TIMESTAMP_MILLIS }), _ => None, } } @@ -636,10 +625,9 @@ impl Type { fn from_type_path(f: &syn::Field, p: &syn::TypePath) -> Self { let last_segment = p.path.segments.last().unwrap(); - let is_vec = - last_segment.ident == syn::Ident::new("Vec", proc_macro2::Span::call_site()); - let is_option = last_segment.ident - == syn::Ident::new("Option", proc_macro2::Span::call_site()); + let is_vec = last_segment.ident == syn::Ident::new("Vec", proc_macro2::Span::call_site()); + let is_option = + last_segment.ident == syn::Ident::new("Option", proc_macro2::Span::call_site()); if is_vec || is_option { let generic_type = match &last_segment.arguments { diff --git a/parquet_derive_test/src/lib.rs b/parquet_derive_test/src/lib.rs index f4f8be1e0d8c..d377fb0a62af 100644 --- a/parquet_derive_test/src/lib.rs +++ b/parquet_derive_test/src/lib.rs @@ -139,8 +139,7 @@ mod tests { assert_eq!(&schema, &generated_schema); let props = Default::default(); - let mut writer = - SerializedFileWriter::new(file, generated_schema, props).unwrap(); + let mut writer = SerializedFileWriter::new(file, generated_schema, props).unwrap(); let mut row_group = writer.next_row_group().unwrap(); drs.as_slice().write_to_row_group(&mut row_group).unwrap(); diff --git a/rustfmt.toml b/rustfmt.toml index 4522e520a469..585c1b612978 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -16,9 +16,3 @@ # under the License. 
edition = "2021" -max_width = 90 - -# ignore generated files -# ignore = [ -# "arrow/src/ipc/gen", -#] From 0b9105d70412d54ddb826122fd8a87eafe3b9413 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 20 Oct 2023 13:52:52 +0100 Subject: [PATCH 1294/1411] Temporarily Disable Java Integration Tests (#4957) * Add additional integration test dependencies * Temporarily disable Java * Remove jpype --- .github/workflows/integration.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 62d2d2cb1a06..00c6b8bb0a90 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -102,8 +102,9 @@ jobs: run: conda run --no-capture-output ci/scripts/csharp_build.sh $PWD /build - name: Build Go run: conda run --no-capture-output ci/scripts/go_build.sh $PWD - - name: Build Java - run: conda run --no-capture-output ci/scripts/java_build.sh $PWD /build +# Temporarily disabled - https://github.com/apache/arrow-rs/issues/4963 +# - name: Build Java +# run: conda run --no-capture-output ci/scripts/java_build.sh $PWD /build - name: Build JS run: conda run --no-capture-output ci/scripts/js_build.sh $PWD /build - name: Run integration tests @@ -114,7 +115,7 @@ jobs: --run-ipc \ --with-cpp=1 \ --with-csharp=1 \ - --with-java=1 \ + --with-java=0 \ --with-js=1 \ --with-go=1 \ --with-rust=1 \ From f4a2a88a658878db7d7b478880ebeecc4f27cb0f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 20 Oct 2023 14:41:10 +0100 Subject: [PATCH 1295/1411] Add ObjectMeta::version and GetOptions::version (#4925) (#4935) --- object_store/src/aws/client.rs | 13 ++++++++++++- object_store/src/azure/client.rs | 14 +++++++++++++- object_store/src/client/get.rs | 5 +---- object_store/src/client/header.rs | 13 +++++++++++-- object_store/src/client/list_response.rs | 1 + object_store/src/gcp/client.rs | 7 ++++++- object_store/src/http/client.rs | 2 ++ object_store/src/lib.rs | 23 +++++++++++++++++++++++ object_store/src/local.rs | 1 + object_store/src/memory.rs | 4 ++++ object_store/src/prefix.rs | 1 + 11 files changed, 75 insertions(+), 9 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 6b34b181ab9d..00d6ee446f2f 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -20,6 +20,7 @@ use crate::aws::credential::{AwsCredential, CredentialExt}; use crate::aws::{AwsCredentialProvider, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET}; use crate::client::get::GetClient; use crate::client::header::get_etag; +use crate::client::header::HeaderConfig; use crate::client::list::ListClient; use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; @@ -549,6 +550,12 @@ impl S3Client { impl GetClient for S3Client { const STORE: &'static str = STORE; + const HEADER_CONFIG: HeaderConfig = HeaderConfig { + etag_required: false, + last_modified_required: false, + version_header: Some("x-amz-version-id"), + }; + /// Make an S3 GET request async fn get_request(&self, path: &Path, options: GetOptions) -> Result { let credential = self.get_credential().await?; @@ -558,7 +565,11 @@ impl GetClient for S3Client { false => Method::GET, }; - let builder = self.client.request(method, url); + let mut builder = self.client.request(method, url); + + if let Some(v) = &options.version { + builder = builder.query(&[("versionId", v)]) + } let 
response = builder .with_get_options(options) diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index b5ef02191cd7..cd3df8c7b857 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -19,6 +19,7 @@ use super::credential::AzureCredential; use crate::azure::credential::*; use crate::azure::{AzureCredentialProvider, STORE}; use crate::client::get::GetClient; +use crate::client::header::HeaderConfig; use crate::client::list::ListClient; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; @@ -254,6 +255,12 @@ impl AzureClient { impl GetClient for AzureClient { const STORE: &'static str = STORE; + const HEADER_CONFIG: HeaderConfig = HeaderConfig { + etag_required: true, + last_modified_required: true, + version_header: Some("x-ms-version-id"), + }; + /// Make an Azure GET request /// /// @@ -265,12 +272,16 @@ impl GetClient for AzureClient { false => Method::GET, }; - let builder = self + let mut builder = self .client .request(method, url) .header(CONTENT_LENGTH, HeaderValue::from_static("0")) .body(Bytes::new()); + if let Some(v) = &options.version { + builder = builder.query(&[("versionid", v)]) + } + let response = builder .with_get_options(options) .with_azure_authorization(&credential, &self.config.account) @@ -427,6 +438,7 @@ impl TryFrom for ObjectMeta { last_modified: value.properties.last_modified, size: value.properties.content_length as usize, e_tag: value.properties.e_tag, + version: None, // For consistency with S3 and GCP which don't include this }) } } diff --git a/object_store/src/client/get.rs b/object_store/src/client/get.rs index ed1762ff8fe9..5f9cac9b424b 100644 --- a/object_store/src/client/get.rs +++ b/object_store/src/client/get.rs @@ -29,10 +29,7 @@ pub trait GetClient: Send + Sync + 'static { const STORE: &'static str; /// Configure the [`HeaderConfig`] for this client - const HEADER_CONFIG: HeaderConfig = HeaderConfig { - etag_required: true, - last_modified_required: true, - }; + const HEADER_CONFIG: HeaderConfig; async fn get_request(&self, path: &Path, options: GetOptions) -> Result; } diff --git a/object_store/src/client/header.rs b/object_store/src/client/header.rs index 17f83a2ba8c8..e67496833b99 100644 --- a/object_store/src/client/header.rs +++ b/object_store/src/client/header.rs @@ -35,6 +35,9 @@ pub struct HeaderConfig { /// /// Defaults to `true` pub last_modified_required: bool, + + /// The version header name if any + pub version_header: Option<&'static str>, } #[derive(Debug, Snafu)] @@ -98,14 +101,20 @@ pub fn header_meta( .context(MissingContentLengthSnafu)?; let content_length = content_length.to_str().context(BadHeaderSnafu)?; - let content_length = content_length + let size = content_length .parse() .context(InvalidContentLengthSnafu { content_length })?; + let version = match cfg.version_header.and_then(|h| headers.get(h)) { + Some(v) => Some(v.to_str().context(BadHeaderSnafu)?.to_string()), + None => None, + }; + Ok(ObjectMeta { location: location.clone(), last_modified, - size: content_length, + version, + size, e_tag, }) } diff --git a/object_store/src/client/list_response.rs b/object_store/src/client/list_response.rs index 6a3889e3be5b..7a170c584156 100644 --- a/object_store/src/client/list_response.rs +++ b/object_store/src/client/list_response.rs @@ -80,6 +80,7 @@ impl TryFrom for ObjectMeta { last_modified: value.last_modified, size: value.size, e_tag: value.e_tag, + version: None, }) } } diff --git a/object_store/src/gcp/client.rs 
b/object_store/src/gcp/client.rs index 4165d784fd7f..558a6f8d2a84 100644 --- a/object_store/src/gcp/client.rs +++ b/object_store/src/gcp/client.rs @@ -16,7 +16,7 @@ // under the License. use crate::client::get::GetClient; -use crate::client::header::get_etag; +use crate::client::header::{get_etag, HeaderConfig}; use crate::client::list::ListClient; use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; @@ -333,6 +333,11 @@ impl GoogleCloudStorageClient { #[async_trait] impl GetClient for GoogleCloudStorageClient { const STORE: &'static str = STORE; + const HEADER_CONFIG: HeaderConfig = HeaderConfig { + etag_required: true, + last_modified_required: true, + version_header: Some("x-goog-generation"), + }; /// Perform a get request async fn get_request(&self, path: &Path, options: GetOptions) -> Result { diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index f7593be5a043..a7dbdfcbe844 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -277,6 +277,7 @@ impl GetClient for Client { const HEADER_CONFIG: HeaderConfig = HeaderConfig { etag_required: false, last_modified_required: false, + version_header: None, }; async fn get_request(&self, path: &Path, options: GetOptions) -> Result { @@ -375,6 +376,7 @@ impl MultiStatusResponse { last_modified, size: self.size()?, e_tag: self.prop_stat.prop.e_tag.clone(), + version: None, }) } diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 375302e50d8b..656b30390a4d 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -637,6 +637,8 @@ pub struct ObjectMeta { /// /// pub e_tag: Option, + /// A version indicator for this object + pub version: Option, } /// Options for a get request, such as range @@ -685,6 +687,8 @@ pub struct GetOptions { /// /// pub range: Option>, + /// Request a particular object version + pub version: Option, /// Request transfer of no content /// /// @@ -1379,6 +1383,24 @@ mod tests { }; let err = storage.get_opts(&path, options).await.unwrap_err(); assert!(matches!(err, Error::Precondition { .. 
}), "{err}"); + + if let Some(version) = meta.version { + storage.put(&path, "bar".into()).await.unwrap(); + + let options = GetOptions { + version: Some(version), + ..GetOptions::default() + }; + + // Can retrieve previous version + let get_opts = storage.get_opts(&path, options).await.unwrap(); + let old = get_opts.bytes().await.unwrap(); + assert_eq!(old, b"foo".as_slice()); + + // Current version contains the updated data + let current = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(¤t, b"bar".as_slice()); + } } /// Returns a chunk of length `chunk_length` @@ -1691,6 +1713,7 @@ mod tests { last_modified: Utc.timestamp_nanos(100), size: 100, e_tag: Some("123".to_string()), + version: None, }; let mut options = GetOptions::default(); diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 9be3ee923244..ce9aa4683499 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -969,6 +969,7 @@ fn convert_metadata(metadata: Metadata, location: Path) -> Result { last_modified, size, e_tag: Some(get_etag(&metadata)), + version: None, }) } diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index da7b55d3a83f..8b9522e48de8 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -166,6 +166,7 @@ impl ObjectStore for InMemory { last_modified: entry.last_modified, size: entry.data.len(), e_tag: Some(e_tag), + version: None, }; options.check_preconditions(&meta)?; @@ -212,6 +213,7 @@ impl ObjectStore for InMemory { last_modified: entry.last_modified, size: entry.data.len(), e_tag: Some(entry.e_tag.to_string()), + version: None, }) } @@ -241,6 +243,7 @@ impl ObjectStore for InMemory { last_modified: value.last_modified, size: value.data.len(), e_tag: Some(value.e_tag.to_string()), + version: None, }) }) .collect(); @@ -285,6 +288,7 @@ impl ObjectStore for InMemory { last_modified: v.last_modified, size: v.data.len(), e_tag: Some(v.e_tag.to_string()), + version: None, }; objects.push(object); } diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index c4cb77b66d01..b5bff8b12dd7 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -73,6 +73,7 @@ impl PrefixStore { size: meta.size, location: self.strip_prefix(meta.location), e_tag: meta.e_tag, + version: None, } } } From 03d0505fc864c09e6dcd208d3cdddeecefb90345 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 20 Oct 2023 18:24:25 +0100 Subject: [PATCH 1296/1411] Add SchemaBuilder::remove (#4952) (#4964) --- arrow-schema/src/fields.rs | 17 ++++++++++++++++- arrow-schema/src/schema.rs | 9 +++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 07e9abeee56a..368ecabbf3ef 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -27,7 +27,7 @@ use std::sync::Arc; /// /// ``` /// # use std::sync::Arc; -/// # use arrow_schema::{DataType, Field, Fields}; +/// # use arrow_schema::{DataType, Field, Fields, SchemaBuilder}; /// // Can be constructed from Vec /// Fields::from(vec![Field::new("a", DataType::Boolean, false)]); /// // Can be constructed from Vec @@ -38,6 +38,21 @@ use std::sync::Arc; /// std::iter::once(Arc::new(Field::new("a", DataType::Boolean, false))).collect::(); /// ``` /// +/// See [`SchemaBuilder`] for mutating or updating [`Fields`] +/// +/// ``` +/// # use arrow_schema::{DataType, Field, SchemaBuilder}; +/// let mut builder = SchemaBuilder::new(); +/// 
builder.push(Field::new("a", DataType::Boolean, false)); +/// builder.push(Field::new("b", DataType::Boolean, false)); +/// let fields = builder.finish().fields; +/// +/// let mut builder = SchemaBuilder::from(&fields); +/// builder.remove(0); +/// let new = builder.finish().fields; +/// ``` +/// +/// [`SchemaBuilder`]: crate::SchemaBuilder #[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[cfg_attr(feature = "serde", serde(transparent))] diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 8424ae87d5fa..43bbffd06523 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -48,6 +48,15 @@ impl SchemaBuilder { self.fields.push(field.into()) } + /// Removes and returns the [`FieldRef`] as index `idx` + /// + /// # Panics + /// + /// Panics if index out of bounds + pub fn remove(&mut self, idx: usize) -> FieldRef { + self.fields.remove(idx) + } + /// Appends a [`FieldRef`] to this [`SchemaBuilder`] checking for collision /// /// If an existing field exists with the same name, calls [`Field::try_merge`] From 14d6c8df12c4b916075408c294257e1ab498138c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 23 Oct 2023 16:52:15 +0100 Subject: [PATCH 1297/1411] Add arrow_cast::base64 and document usage in arrow_json (#4975) --- arrow-cast/Cargo.toml | 2 + arrow-cast/src/base64.rs | 117 +++++++++++++++++++++++++++++++++++++++ arrow-cast/src/lib.rs | 3 +- arrow-json/Cargo.toml | 10 ++-- arrow-json/src/lib.rs | 48 +++++++++++++++- 5 files changed, 172 insertions(+), 8 deletions(-) create mode 100644 arrow-cast/src/base64.rs diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 2e0a9fdd4ebd..19b857297d14 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -50,10 +50,12 @@ half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } comfy-table = { version = "7.0", optional = true, default-features = false } +base64 = "0.21" [dev-dependencies] criterion = { version = "0.5", default-features = false } half = { version = "2.1", default-features = false } +rand = "0.8" [build-dependencies] diff --git a/arrow-cast/src/base64.rs b/arrow-cast/src/base64.rs new file mode 100644 index 000000000000..e109c8112480 --- /dev/null +++ b/arrow-cast/src/base64.rs @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Functions for Base64 encoding/decoding + +use arrow_array::{Array, GenericBinaryArray, GenericStringArray, OffsetSizeTrait}; +use arrow_buffer::OffsetBuffer; +use arrow_schema::ArrowError; +use base64::encoded_len; +use base64::engine::Config; + +pub use base64::prelude::*; + +/// Bas64 encode each element of `array` with the provided `engine` +pub fn b64_encode( + engine: &E, + array: &GenericBinaryArray, +) -> GenericStringArray { + let lengths = array.offsets().windows(2).map(|w| { + let len = w[1].as_usize() - w[0].as_usize(); + encoded_len(len, engine.config().encode_padding()).unwrap() + }); + let offsets = OffsetBuffer::::from_lengths(lengths); + let buffer_len = offsets.last().unwrap().as_usize(); + let mut buffer = vec![0_u8; buffer_len]; + let mut offset = 0; + + for i in 0..array.len() { + let len = engine + .encode_slice(array.value(i), &mut buffer[offset..]) + .unwrap(); + offset += len; + } + assert_eq!(offset, buffer_len); + + // Safety: Base64 is valid UTF-8 + unsafe { GenericStringArray::new_unchecked(offsets, buffer.into(), array.nulls().cloned()) } +} + +/// Base64 decode each element of `array` with the provided `engine` +pub fn b64_decode( + engine: &E, + array: &GenericBinaryArray, +) -> Result, ArrowError> { + let estimated_len = array.values().len(); // This is an overestimate + let mut buffer = vec![0; estimated_len]; + + let mut offsets = Vec::with_capacity(array.len() + 1); + offsets.push(O::usize_as(0)); + let mut offset = 0; + + for v in array.iter() { + if let Some(v) = v { + let len = engine.decode_slice(v, &mut buffer[offset..]).unwrap(); + // This cannot overflow as `len` is less than `v.len()` and `a` is valid + offset += len; + } + offsets.push(O::usize_as(offset)); + } + + // Safety: offsets monotonically increasing by construction + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + + Ok(GenericBinaryArray::new( + offsets, + buffer.into(), + array.nulls().cloned(), + )) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::BinaryArray; + use base64::prelude::{BASE64_STANDARD, BASE64_STANDARD_NO_PAD}; + use rand::{thread_rng, Rng}; + + fn test_engine(e: &E, a: &BinaryArray) { + let encoded = b64_encode(e, a); + encoded.to_data().validate_full().unwrap(); + + let to_decode = encoded.into(); + let decoded = b64_decode(e, &to_decode).unwrap(); + decoded.to_data().validate_full().unwrap(); + + assert_eq!(&decoded, a); + } + + #[test] + fn test_b64() { + let mut rng = thread_rng(); + let len = rng.gen_range(1024..1050); + let data: BinaryArray = (0..len) + .map(|_| { + let len = rng.gen_range(0..16); + Some((0..len).map(|_| rng.gen()).collect::>()) + }) + .collect(); + + test_engine(&BASE64_STANDARD, &data); + test_engine(&BASE64_STANDARD_NO_PAD, &data); + } +} diff --git a/arrow-cast/src/lib.rs b/arrow-cast/src/lib.rs index d2677a0e0a53..71ebe6c0ed8b 100644 --- a/arrow-cast/src/lib.rs +++ b/arrow-cast/src/lib.rs @@ -21,6 +21,7 @@ pub mod cast; pub use cast::*; pub mod display; pub mod parse; - #[cfg(feature = "prettyprint")] pub mod pretty; + +pub mod base64; diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index df38a52811c2..7e49a57fbd6c 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -34,11 +34,11 @@ path = "src/lib.rs" bench = false [dependencies] -arrow-array = { workspace = true } -arrow-buffer = { workspace = true } -arrow-cast = { workspace = true } -arrow-data = { workspace = true } -arrow-schema = { workspace = true } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true 
} +arrow-cast = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } half = { version = "2.1", default-features = false } indexmap = { version = "2.0", default-features = false, features = ["std"] } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs index 88415ff2ecac..e69eaaba3ef8 100644 --- a/arrow-json/src/lib.rs +++ b/arrow-json/src/lib.rs @@ -15,9 +15,53 @@ // specific language governing permissions and limitations // under the License. -//! Transfer data between the Arrow memory format and JSON -//! line-delimited records. See the module level documentation for the +//! Transfer data between the Arrow memory format and JSON line-delimited records. +//! +//! See the module level documentation for the //! [`reader`] and [`writer`] for usage examples. +//! +//! # Binary Data +//! +//! As per [RFC7159] JSON cannot encode arbitrary binary data. A common approach to workaround +//! this is to use a [binary-to-text encoding] scheme, such as base64, to encode the +//! input data and then decode it on output. +//! +//! ``` +//! # use std::io::Cursor; +//! # use std::sync::Arc; +//! # use arrow_array::{BinaryArray, RecordBatch, StringArray}; +//! # use arrow_array::cast::AsArray; +//! # use arrow_cast::base64::{b64_decode, b64_encode, BASE64_STANDARD}; +//! # use arrow_json::{LineDelimitedWriter, ReaderBuilder}; +//! # +//! // The data we want to write +//! let input = BinaryArray::from(vec![b"\xDE\x00\xFF".as_ref()]); +//! +//! // Base64 encode it to a string +//! let encoded: StringArray = b64_encode(&BASE64_STANDARD, &input); +//! +//! // Write the StringArray to JSON +//! let batch = RecordBatch::try_from_iter([("col", Arc::new(encoded) as _)]).unwrap(); +//! let mut buf = Vec::with_capacity(1024); +//! let mut writer = LineDelimitedWriter::new(&mut buf); +//! writer.write(&batch).unwrap(); +//! writer.finish().unwrap(); +//! +//! // Read the JSON data +//! let cursor = Cursor::new(buf); +//! let mut reader = ReaderBuilder::new(batch.schema()).build(cursor).unwrap(); +//! let batch = reader.next().unwrap().unwrap(); +//! +//! // Reverse the base64 encoding +//! let col: BinaryArray = batch.column(0).as_string::().clone().into(); +//! let output = b64_decode(&BASE64_STANDARD, &col).unwrap(); +//! +//! assert_eq!(input, output); +//! ``` +//! +//! [RFC7159]: https://datatracker.ietf.org/doc/html/rfc7159#section-8.1 +//! [binary-to-text encoding]: https://en.wikipedia.org/wiki/Binary-to-text_encoding +//! 
#![deny(rustdoc::broken_intra_doc_links)] #![warn(missing_docs)] From e79b3bcd703f5bb1c7794dda20d510f1c7ea5094 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 23 Oct 2023 22:27:19 +0100 Subject: [PATCH 1298/1411] Support ImdsManagedIdentityProvider in Azure Functions (#4976) (#4977) --- object_store/src/azure/credential.rs | 37 +++++++++++++++++----------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index fc96ce4fc3ef..283d7ff9d703 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -40,7 +40,7 @@ use std::borrow::Cow; use std::process::Command; use std::str; use std::sync::Arc; -use std::time::{Duration, Instant}; +use std::time::{Duration, Instant, SystemTime}; use url::Url; static AZURE_VERSION: HeaderValue = HeaderValue::from_static("2021-08-06"); @@ -293,13 +293,16 @@ fn lexy_sort<'a>( values } +/// #[derive(Deserialize, Debug)] -struct TokenResponse { +struct OAuthTokenResponse { access_token: String, expires_in: u64, } /// Encapsulates the logic to perform an OAuth token challenge +/// +/// #[derive(Debug)] pub struct ClientSecretOAuthProvider { token_url: String, @@ -340,7 +343,7 @@ impl TokenProvider for ClientSecretOAuthProvider { client: &Client, retry: &RetryConfig, ) -> crate::Result>> { - let response: TokenResponse = client + let response: OAuthTokenResponse = client .request(Method::POST, &self.token_url) .header(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)) .form(&[ @@ -363,21 +366,27 @@ impl TokenProvider for ClientSecretOAuthProvider { } } -fn expires_in_string<'de, D>(deserializer: D) -> std::result::Result +fn expires_on_string<'de, D>(deserializer: D) -> std::result::Result where D: serde::de::Deserializer<'de>, { let v = String::deserialize(deserializer)?; - v.parse::().map_err(serde::de::Error::custom) + let v = v.parse::().map_err(serde::de::Error::custom)?; + let now = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .map_err(serde::de::Error::custom)?; + + Ok(Instant::now() + Duration::from_secs(v.saturating_sub(now.as_secs()))) } -// NOTE: expires_on is a String version of unix epoch time, not an integer. -// +/// NOTE: expires_on is a String version of unix epoch time, not an integer. +/// +/// #[derive(Debug, Clone, Deserialize)] -struct MsiTokenResponse { +struct ImdsTokenResponse { pub access_token: String, - #[serde(deserialize_with = "expires_in_string")] - pub expires_in: u64, + #[serde(deserialize_with = "expires_on_string")] + pub expires_on: Instant, } /// Attempts authentication using a managed identity that has been assigned to the deployment environment. @@ -450,7 +459,7 @@ impl TokenProvider for ImdsManagedIdentityProvider { builder = builder.header("x-identity-header", val); }; - let response: MsiTokenResponse = builder + let response: ImdsTokenResponse = builder .send_retry(retry) .await .context(TokenRequestSnafu)? 
@@ -460,12 +469,12 @@ impl TokenProvider for ImdsManagedIdentityProvider { Ok(TemporaryToken { token: Arc::new(AzureCredential::BearerToken(response.access_token)), - expiry: Some(Instant::now() + Duration::from_secs(response.expires_in)), + expiry: Some(response.expires_on), }) } } -/// Credential for using workload identity dfederation +/// Credential for using workload identity federation /// /// #[derive(Debug)] @@ -512,7 +521,7 @@ impl TokenProvider for WorkloadIdentityOAuthProvider { .map_err(|_| Error::FederatedTokenFile)?; // https://learn.microsoft.com/en-us/azure/active-directory/develop/v2-oauth2-client-creds-grant-flow#third-case-access-token-request-with-a-federated-credential - let response: TokenResponse = client + let response: OAuthTokenResponse = client .request(Method::POST, &self.token_url) .header(ACCEPT, HeaderValue::from_static(CONTENT_TYPE_JSON)) .form(&[ From 2f3379559fa57f0e2664c173841c84fe91cd1c9f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 24 Oct 2023 14:50:18 +0100 Subject: [PATCH 1299/1411] Update quick-xml requirement from 0.30.0 to 0.31.0 in /object_store (#4983) Updates the requirements on [quick-xml](https://github.com/tafia/quick-xml) to permit the latest version. - [Release notes](https://github.com/tafia/quick-xml/releases) - [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md) - [Commits](https://github.com/tafia/quick-xml/compare/v0.30.0...v0.31.0) --- updated-dependencies: - dependency-name: quick-xml dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- object_store/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 7928648d170f..cb820b509ada 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -46,7 +46,7 @@ walkdir = "2" # Cloud storage support base64 = { version = "0.21", default-features = false, features = ["std"], optional = true } hyper = { version = "0.14", default-features = false, optional = true } -quick-xml = { version = "0.30.0", features = ["serialize", "overlapped-lists"], optional = true } +quick-xml = { version = "0.31.0", features = ["serialize", "overlapped-lists"], optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } From 4748b6187a0215b8a04fbd53184074c1e4e9ef32 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 24 Oct 2023 14:50:32 +0100 Subject: [PATCH 1300/1411] Bump actions/setup-node from 3 to 4 (#4982) Bumps [actions/setup-node](https://github.com/actions/setup-node) from 3 to 4. - [Release notes](https://github.com/actions/setup-node/releases) - [Commits](https://github.com/actions/setup-node/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/setup-node dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 9871f8b7d295..1447d72a53b1 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -51,7 +51,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: actions/setup-node@v3 + - uses: actions/setup-node@v4 with: node-version: "14" - name: Prettier check From c90aff3cc9c7f21dc7dd77000eeea8d2ceb0412d Mon Sep 17 00:00:00 2001 From: fan <75058860+fansehep@users.noreply.github.com> Date: Wed, 25 Oct 2023 15:18:52 +0800 Subject: [PATCH 1301/1411] feat: support schema change by idx and reverse (#4985) * feat: support schema change by idx and reverse Signed-off-by: fan * follow reviews Signed-off-by: fan --------- Signed-off-by: fan --- arrow-schema/src/schema.rs | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 43bbffd06523..b05cfbe3d950 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -57,6 +57,17 @@ impl SchemaBuilder { self.fields.remove(idx) } + /// Get mut FieldRef as index `idx` + /// if index out of bounds, will panic + pub fn field_mut(&mut self, idx: usize) -> &mut FieldRef { + &mut self.fields[idx] + } + + /// Reverse the fileds + pub fn reverse(&mut self) { + self.fields.reverse(); + } + /// Appends a [`FieldRef`] to this [`SchemaBuilder`] checking for collision /// /// If an existing field exists with the same name, calls [`Field::try_merge`] @@ -837,4 +848,34 @@ mod tests { "Could not find expected string '{expected}' in '{res}'" ); } + + #[test] + fn test_schemabuilder_change_field() { + let mut builder = SchemaBuilder::new(); + builder.push(Field::new("a", DataType::Int32, false)); + builder.push(Field::new("b", DataType::Utf8, false)); + *builder.field_mut(1) = Arc::new(Field::new("c", DataType::Int32, false)); + assert_eq!( + builder.fields, + vec![ + Arc::new(Field::new("a", DataType::Int32, false)), + Arc::new(Field::new("c", DataType::Int32, false)) + ] + ); + } + + #[test] + fn test_schemabuilder_reverse() { + let mut builder = SchemaBuilder::new(); + builder.push(Field::new("a", DataType::Int32, false)); + builder.push(Field::new("b", DataType::Utf8, true)); + builder.reverse(); + assert_eq!( + builder.fields, + vec![ + Arc::new(Field::new("b", DataType::Utf8, true)), + Arc::new(Field::new("a", DataType::Int32, false)) + ] + ); + } } From a33d42f59189e8f5d880f4e7a557531d2d55ddb3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 25 Oct 2023 11:20:46 +0100 Subject: [PATCH 1302/1411] Increase default timeout to 30 seconds (#4989) --- object_store/src/client/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 3c968f11be21..77eee7fc92f3 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -193,6 +193,9 @@ impl Default for ClientOptions { // // // Which recommend a connection timeout of 3.1s and a request timeout of 2s + // + // As object store requests may involve the transfer of non-trivial volumes of data + // we opt for a slightly higher default timeout of 30 seconds Self { user_agent: None, content_type_map: Default::default(), @@ -203,7 +206,7 @@ impl Default for 
ClientOptions { proxy_excludes: None, allow_http: Default::default(), allow_insecure: Default::default(), - timeout: Some(Duration::from_secs(5).into()), + timeout: Some(Duration::from_secs(30).into()), connect_timeout: Some(Duration::from_secs(5).into()), pool_idle_timeout: None, pool_max_idle_per_host: None, From a6a512f387c15f092f5986695094c0f69b7fa978 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Wed, 25 Oct 2023 23:14:55 +1100 Subject: [PATCH 1303/1411] Fix pre commit (#4990) * Fix pre-commit.sh script * Fix --- pre-commit.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pre-commit.sh b/pre-commit.sh index 5ce080793242..f82390e229a9 100755 --- a/pre-commit.sh +++ b/pre-commit.sh @@ -20,7 +20,7 @@ # This file is git pre-commit hook. # # Soft link it as git hook under top dir of apache arrow git repository: -# $ ln -s ../../rust/pre-commit.sh .git/hooks/pre-commit +# $ ln -s ../../pre-commit.sh .git/hooks/pre-commit # # This file be run directly: # $ ./pre-commit.sh @@ -37,14 +37,12 @@ function BYELLOW() { echo "\033[1;33m$@\033[0m" } -RUST_DIR="rust" - # env GIT_DIR is set by git when run a pre-commit hook. if [ -z "${GIT_DIR}" ]; then GIT_DIR=$(git rev-parse --show-toplevel) fi -cd ${GIT_DIR}/${RUST_DIR} +cd ${GIT_DIR} NUM_CHANGES=$(git diff --cached --name-only . | grep -e ".*/*.rs$" | From e78d1409c265ae5c216fc62e51a0f20aa55f6415 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 25 Oct 2023 17:59:37 +0100 Subject: [PATCH 1304/1411] Add MultiPartStore (#4961) (#4608) (#4971) * Add MultiPartStore (#4961) (#4608) * Parse CompleteMultipartUploadResult (#4965) * More docs * Add integration test * Fix azure * More docs * Don't gate multipart behind feature flag --- object_store/src/aws/client.rs | 51 ++++++++++++++++++++-- object_store/src/aws/mod.rs | 49 +++++++++++++++------ object_store/src/azure/client.rs | 49 ++++++++++++++++++++- object_store/src/azure/mod.rs | 73 +++++++++++++++----------------- object_store/src/gcp/client.rs | 34 +++++++++++++-- object_store/src/gcp/mod.rs | 52 ++++++++++++++++------- object_store/src/lib.rs | 42 ++++++++++++++++-- object_store/src/multipart.rs | 61 ++++++++++++++++++++++++-- object_store/src/signer.rs | 3 +- 9 files changed, 329 insertions(+), 85 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 00d6ee446f2f..4e98f259f8dd 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -27,7 +27,9 @@ use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; use crate::multipart::PartId; use crate::path::DELIMITER; -use crate::{ClientOptions, GetOptions, ListResult, MultipartId, Path, Result, RetryConfig}; +use crate::{ + ClientOptions, GetOptions, ListResult, MultipartId, Path, PutResult, Result, RetryConfig, +}; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; @@ -115,6 +117,9 @@ pub(crate) enum Error { #[snafu(display("Error performing complete multipart request: {}", source))] CompleteMultipartRequest { source: crate::client::retry::Error }, + #[snafu(display("Error getting complete multipart response body: {}", source))] + CompleteMultipartResponseBody { source: reqwest::Error }, + #[snafu(display("Got invalid list response: {}", source))] InvalidListResponse { source: quick_xml::de::DeError }, @@ -162,6 +167,13 @@ struct MultipartPart { part_number: usize, } +#[derive(Debug, Deserialize)] 
+#[serde(rename_all = "PascalCase", rename = "CompleteMultipartUploadResult")] +struct CompleteMultipartResult { + #[serde(rename = "ETag")] + e_tag: String, +} + #[derive(Deserialize)] #[serde(rename_all = "PascalCase", rename = "DeleteResult")] struct BatchDeleteResponse { @@ -506,12 +518,32 @@ impl S3Client { Ok(response.upload_id) } + pub async fn put_part( + &self, + path: &Path, + upload_id: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result { + let part = (part_idx + 1).to_string(); + + let content_id = self + .put_request( + path, + data, + &[("partNumber", &part), ("uploadId", upload_id)], + ) + .await?; + + Ok(PartId { content_id }) + } + pub async fn complete_multipart( &self, location: &Path, upload_id: &str, parts: Vec, - ) -> Result<()> { + ) -> Result { let parts = parts .into_iter() .enumerate() @@ -527,7 +559,8 @@ impl S3Client { let credential = self.get_credential().await?; let url = self.config.path_url(location); - self.client + let response = self + .client .request(Method::POST, url) .query(&[("uploadId", upload_id)]) .body(body) @@ -542,7 +575,17 @@ impl S3Client { .await .context(CompleteMultipartRequestSnafu)?; - Ok(()) + let data = response + .bytes() + .await + .context(CompleteMultipartResponseBodySnafu)?; + + let response: CompleteMultipartResult = + quick_xml::de::from_reader(data.reader()).context(InvalidMultipartResponseSnafu)?; + + Ok(PutResult { + e_tag: Some(response.e_tag), + }) } } diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 25894a1c3445..57254c7cf4e8 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -44,7 +44,7 @@ use crate::aws::client::S3Client; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; use crate::client::CredentialProvider; -use crate::multipart::{PartId, PutPart, WriteMultiPart}; +use crate::multipart::{MultiPartStore, PartId, PutPart, WriteMultiPart}; use crate::signer::Signer; use crate::{ GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, PutResult, @@ -246,18 +246,9 @@ struct S3MultiPartUpload { #[async_trait] impl PutPart for S3MultiPartUpload { async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { - let part = (part_idx + 1).to_string(); - - let content_id = self - .client - .put_request( - &self.location, - buf.into(), - &[("partNumber", &part), ("uploadId", &self.upload_id)], - ) - .await?; - - Ok(PartId { content_id }) + self.client + .put_part(&self.location, &self.upload_id, part_idx, buf.into()) + .await } async fn complete(&self, completed_parts: Vec) -> Result<()> { @@ -268,6 +259,36 @@ impl PutPart for S3MultiPartUpload { } } +#[async_trait] +impl MultiPartStore for AmazonS3 { + async fn create_multipart(&self, path: &Path) -> Result { + self.client.create_multipart(path).await + } + + async fn put_part( + &self, + path: &Path, + id: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result { + self.client.put_part(path, id, part_idx, data).await + } + + async fn complete_multipart( + &self, + path: &Path, + id: &MultipartId, + parts: Vec, + ) -> Result { + self.client.complete_multipart(path, id, parts).await + } + + async fn abort_multipart(&self, path: &Path, id: &MultipartId) -> Result<()> { + self.client.delete_request(path, &[("uploadId", id)]).await + } +} + #[cfg(test)] mod tests { use super::*; @@ -293,6 +314,8 @@ mod tests { list_with_delimiter(&integration).await; rename_and_copy(&integration).await; stream_get(&integration).await; + multipart(&integration, 
&integration).await; + if test_not_exists { copy_if_not_exists(&integration).await; } diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index cd3df8c7b857..9f47b9a8152b 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -19,13 +19,16 @@ use super::credential::AzureCredential; use crate::azure::credential::*; use crate::azure::{AzureCredentialProvider, STORE}; use crate::client::get::GetClient; -use crate::client::header::HeaderConfig; +use crate::client::header::{get_etag, HeaderConfig}; use crate::client::list::ListClient; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; +use crate::multipart::PartId; use crate::path::DELIMITER; use crate::util::deserialize_rfc1123; -use crate::{ClientOptions, GetOptions, ListResult, ObjectMeta, Path, Result, RetryConfig}; +use crate::{ + ClientOptions, GetOptions, ListResult, ObjectMeta, Path, PutResult, Result, RetryConfig, +}; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; @@ -84,6 +87,11 @@ pub(crate) enum Error { Authorization { source: crate::azure::credential::Error, }, + + #[snafu(display("Unable to extract metadata from headers: {}", source))] + Metadata { + source: crate::client::header::Error, + }, } impl From for crate::Error { @@ -190,6 +198,43 @@ impl AzureClient { Ok(response) } + /// PUT a block + pub async fn put_block(&self, path: &Path, part_idx: usize, data: Bytes) -> Result { + let content_id = format!("{part_idx:20}"); + let block_id: BlockId = content_id.clone().into(); + + self.put_request( + path, + Some(data), + true, + &[ + ("comp", "block"), + ("blockid", &BASE64_STANDARD.encode(block_id)), + ], + ) + .await?; + + Ok(PartId { content_id }) + } + + /// PUT a block list + pub async fn put_block_list(&self, path: &Path, parts: Vec) -> Result { + let blocks = parts + .into_iter() + .map(|part| BlockId::from(part.content_id)) + .collect(); + + let block_list = BlockList { blocks }; + let block_xml = block_list.to_xml(); + + let response = self + .put_request(path, Some(block_xml.into()), true, &[("comp", "blocklist")]) + .await?; + + let e_tag = get_etag(response.headers()).context(MetadataSnafu)?; + Ok(PutResult { e_tag: Some(e_tag) }) + } + /// Make an Azure Delete request pub async fn delete_request( &self, diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 5f768756a629..779ac2f71ff8 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -26,15 +26,12 @@ //! [ObjectStore::abort_multipart] is a no-op, since Azure Blob Store doesn't provide //! a way to drop old blocks. Instead unused blocks are automatically cleaned up //! after 7 days. 
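// Illustrative sketch (not part of the patch): end-to-end use of the new
// `MultiPartStore` trait that this commit implements for S3, Azure and GCS.
// `store` and `chunks` are assumed inputs, and the object path is a made-up
// example; apart from the final part, each chunk should be at least 5 MiB to
// satisfy most backends.

use bytes::Bytes;
use object_store::multipart::MultiPartStore;
use object_store::path::Path;
use object_store::Result;

async fn upload_in_parts(store: &dyn MultiPartStore, chunks: Vec<Bytes>) -> Result<()> {
    let path = Path::from("data/large.bin");
    // Start the upload and remember its opaque id
    let id = store.create_multipart(&path).await?;
    let mut parts = Vec::with_capacity(chunks.len());
    for (idx, chunk) in chunks.into_iter().enumerate() {
        // Parts are indexed from 0 and may also be uploaded concurrently
        parts.push(store.put_part(&path, &id, idx, chunk).await?);
    }
    // Atomically expose the object assembled from the uploaded parts
    store.complete_multipart(&path, &id, parts).await?;
    Ok(())
}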
-use self::client::{BlockId, BlockList}; use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, }; use async_trait::async_trait; -use base64::prelude::BASE64_STANDARD; -use base64::Engine; use bytes::Bytes; use futures::stream::BoxStream; use std::fmt::Debug; @@ -53,6 +50,7 @@ mod credential; /// [`CredentialProvider`] for [`MicrosoftAzure`] pub type AzureCredentialProvider = Arc>; use crate::client::header::get_etag; +use crate::multipart::MultiPartStore; pub use builder::{AzureConfigKey, MicrosoftAzureBuilder}; pub use credential::AzureCredential; @@ -151,43 +149,44 @@ struct AzureMultiPartUpload { #[async_trait] impl PutPart for AzureMultiPartUpload { - async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { - let content_id = format!("{part_idx:20}"); - let block_id: BlockId = content_id.clone().into(); - - self.client - .put_request( - &self.location, - Some(buf.into()), - true, - &[ - ("comp", "block"), - ("blockid", &BASE64_STANDARD.encode(block_id)), - ], - ) - .await?; + async fn put_part(&self, buf: Vec, idx: usize) -> Result { + self.client.put_block(&self.location, idx, buf.into()).await + } - Ok(PartId { content_id }) + async fn complete(&self, parts: Vec) -> Result<()> { + self.client.put_block_list(&self.location, parts).await?; + Ok(()) } +} - async fn complete(&self, completed_parts: Vec) -> Result<()> { - let blocks = completed_parts - .into_iter() - .map(|part| BlockId::from(part.content_id)) - .collect(); +#[async_trait] +impl MultiPartStore for MicrosoftAzure { + async fn create_multipart(&self, _: &Path) -> Result { + Ok(String::new()) + } - let block_list = BlockList { blocks }; - let block_xml = block_list.to_xml(); + async fn put_part( + &self, + path: &Path, + _: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result { + self.client.put_block(path, part_idx, data).await + } - self.client - .put_request( - &self.location, - Some(block_xml.into()), - true, - &[("comp", "blocklist")], - ) - .await?; + async fn complete_multipart( + &self, + path: &Path, + _: &MultipartId, + parts: Vec, + ) -> Result { + self.client.put_block_list(path, parts).await + } + async fn abort_multipart(&self, _: &Path, _: &MultipartId) -> Result<()> { + // There is no way to drop blocks that have been uploaded. Instead, they simply + // expire in 7 days. 
Ok(()) } } @@ -195,10 +194,7 @@ impl PutPart for AzureMultiPartUpload { #[cfg(test)] mod tests { use super::*; - use crate::tests::{ - copy_if_not_exists, get_opts, list_uses_directories_correctly, list_with_delimiter, - put_get_delete_list_opts, rename_and_copy, stream_get, - }; + use crate::tests::*; #[tokio::test] async fn azure_blob_test() { @@ -212,6 +208,7 @@ mod tests { rename_and_copy(&integration).await; copy_if_not_exists(&integration).await; stream_get(&integration).await; + multipart(&integration, &integration).await; } #[test] diff --git a/object_store/src/gcp/client.rs b/object_store/src/gcp/client.rs index 558a6f8d2a84..8c44f9016480 100644 --- a/object_store/src/gcp/client.rs +++ b/object_store/src/gcp/client.rs @@ -24,7 +24,7 @@ use crate::client::GetOptionsExt; use crate::gcp::{GcpCredential, GcpCredentialProvider, STORE}; use crate::multipart::PartId; use crate::path::{Path, DELIMITER}; -use crate::{ClientOptions, GetOptions, ListResult, MultipartId, Result, RetryConfig}; +use crate::{ClientOptions, GetOptions, ListResult, MultipartId, PutResult, Result, RetryConfig}; use async_trait::async_trait; use bytes::{Buf, Bytes}; use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; @@ -184,6 +184,30 @@ impl GoogleCloudStorageClient { Ok(get_etag(response.headers()).context(MetadataSnafu)?) } + /// Perform a put part request + /// + /// Returns the new [`PartId`] + pub async fn put_part( + &self, + path: &Path, + upload_id: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result { + let content_id = self + .put_request( + path, + data, + &[ + ("partNumber", &format!("{}", part_idx + 1)), + ("uploadId", upload_id), + ], + ) + .await?; + + Ok(PartId { content_id }) + } + /// Initiate a multi-part upload pub async fn multipart_initiate(&self, path: &Path) -> Result { let credential = self.get_credential().await?; @@ -240,7 +264,7 @@ impl GoogleCloudStorageClient { path: &Path, multipart_id: &MultipartId, completed_parts: Vec, - ) -> Result<()> { + ) -> Result { let upload_id = multipart_id.clone(); let url = self.object_url(path); @@ -263,7 +287,8 @@ impl GoogleCloudStorageClient { // https://github.com/tafia/quick-xml/issues/350 .replace(""", "\""); - self.client + let result = self + .client .request(Method::POST, &url) .bearer_auth(&credential.bearer) .query(&[("uploadId", upload_id)]) @@ -274,7 +299,8 @@ impl GoogleCloudStorageClient { path: path.as_ref(), })?; - Ok(()) + let etag = get_etag(result.headers()).context(MetadataSnafu)?; + Ok(PutResult { e_tag: Some(etag) }) } /// Perform a delete request diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 6512a8b036c5..0eb3e9c23c43 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -45,6 +45,7 @@ use tokio::io::AsyncWrite; use crate::client::get::GetClientExt; use crate::client::list::ListClientExt; +use crate::multipart::MultiPartStore; pub use builder::{GoogleCloudStorageBuilder, GoogleConfigKey}; pub use credential::GcpCredential; @@ -90,27 +91,17 @@ struct GCSMultipartUpload { impl PutPart for GCSMultipartUpload { /// Upload an object part async fn put_part(&self, buf: Vec, part_idx: usize) -> Result { - let upload_id = self.multipart_id.clone(); - let content_id = self - .client - .put_request( - &self.path, - buf.into(), - &[ - ("partNumber", format!("{}", part_idx + 1)), - ("uploadId", upload_id), - ], - ) - .await?; - - Ok(PartId { content_id }) + self.client + .put_part(&self.path, &self.multipart_id, part_idx, buf.into()) + .await } /// 
Complete a multipart upload async fn complete(&self, completed_parts: Vec) -> Result<()> { self.client .multipart_complete(&self.path, &self.multipart_id, completed_parts) - .await + .await?; + Ok(()) } } @@ -169,6 +160,36 @@ impl ObjectStore for GoogleCloudStorage { } } +#[async_trait] +impl MultiPartStore for GoogleCloudStorage { + async fn create_multipart(&self, path: &Path) -> Result { + self.client.multipart_initiate(path).await + } + + async fn put_part( + &self, + path: &Path, + id: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result { + self.client.put_part(path, id, part_idx, data).await + } + + async fn complete_multipart( + &self, + path: &Path, + id: &MultipartId, + parts: Vec, + ) -> Result { + self.client.multipart_complete(path, id, parts).await + } + + async fn abort_multipart(&self, path: &Path, id: &MultipartId) -> Result<()> { + self.client.multipart_cleanup(path, id).await + } +} + #[cfg(test)] mod test { @@ -197,6 +218,7 @@ mod test { // Fake GCS server does not yet implement XML Multipart uploads // https://github.com/fsouza/fake-gcs-server/issues/852 stream_get(&integration).await; + multipart(&integration, &integration).await; // Fake GCS server doesn't currently honor preconditions get_opts(&integration).await; } diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 656b30390a4d..9a0667229803 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -263,7 +263,6 @@ pub use client::{ #[cfg(feature = "cloud")] mod config; -#[cfg(feature = "cloud")] pub mod multipart; mod parse; mod util; @@ -302,18 +301,29 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// should be able to observe a partially written object async fn put(&self, location: &Path, bytes: Bytes) -> Result; - /// Get a multi-part upload that allows writing data in chunks + /// Get a multi-part upload that allows writing data in chunks. /// /// Most cloud-based uploads will buffer and upload parts in parallel. /// /// To complete the upload, [AsyncWrite::poll_shutdown] must be called /// to completion. This operation is guaranteed to be atomic, it will either /// make all the written data available at `location`, or fail. No clients - /// should be able to observe a partially written object + /// should be able to observe a partially written object. /// /// For some object stores (S3, GCS, and local in particular), if the /// writer fails or panics, you must call [ObjectStore::abort_multipart] /// to clean up partially written data. + /// + /// For applications requiring fine-grained control of multipart uploads + /// see [`MultiPartStore`], although note that this interface cannot be + /// supported by all [`ObjectStore`] backends. + /// + /// For applications looking to implement this interface for a custom + /// multipart API, see [`WriteMultiPart`] which handles the complexities + /// of performing parallel uploads of fixed size parts. 
+ /// + /// [`WriteMultiPart`]: multipart::WriteMultiPart + /// [`MultiPartStore`]: multipart::MultiPartStore async fn put_multipart( &self, location: &Path, @@ -934,6 +944,7 @@ mod test_util { #[cfg(test)] mod tests { use super::*; + use crate::multipart::MultiPartStore; use crate::test_util::flatten_list_stream; use chrono::TimeZone; use rand::{thread_rng, Rng}; @@ -1681,6 +1692,31 @@ mod tests { storage.delete(&path2).await.unwrap(); } + pub(crate) async fn multipart(storage: &dyn ObjectStore, multipart: &dyn MultiPartStore) { + let path = Path::from("test_multipart"); + let chunk_size = 5 * 1024 * 1024; + + let chunks = get_chunks(chunk_size, 2); + + let id = multipart.create_multipart(&path).await.unwrap(); + + let parts: Vec<_> = futures::stream::iter(chunks) + .enumerate() + .map(|(idx, b)| multipart.put_part(&path, &id, idx, b)) + .buffered(2) + .try_collect() + .await + .unwrap(); + + multipart + .complete_multipart(&path, &id, parts) + .await + .unwrap(); + + let meta = storage.head(&path).await.unwrap(); + assert_eq!(meta.size, chunk_size * 2); + } + async fn delete_fixtures(storage: &DynObjectStore) { let paths = storage.list(None).map_ok(|meta| meta.location).boxed(); storage diff --git a/object_store/src/multipart.rs b/object_store/src/multipart.rs index d4c911fceab4..1dcd5a6f4960 100644 --- a/object_store/src/multipart.rs +++ b/object_store/src/multipart.rs @@ -22,17 +22,18 @@ //! especially useful when dealing with large files or high-throughput systems. use async_trait::async_trait; +use bytes::Bytes; use futures::{stream::FuturesUnordered, Future, StreamExt}; use std::{io, pin::Pin, sync::Arc, task::Poll}; use tokio::io::AsyncWrite; -use crate::Result; +use crate::path::Path; +use crate::{MultipartId, PutResult, Result}; type BoxedTryFuture = Pin> + Send>>; -/// A trait that can be implemented by cloud-based object stores -/// and used in combination with [`WriteMultiPart`] to provide -/// multipart upload support +/// A trait used in combination with [`WriteMultiPart`] to implement +/// [`AsyncWrite`] on top of an API for multipart upload #[async_trait] pub trait PutPart: Send + Sync + 'static { /// Upload a single part @@ -52,6 +53,9 @@ pub struct PartId { } /// Wrapper around a [`PutPart`] that implements [`AsyncWrite`] +/// +/// Data will be uploaded in fixed size chunks of 10 MiB in parallel, +/// up to the configured maximum concurrency pub struct WriteMultiPart { inner: Arc, /// A list of completed parts, in sequential order. @@ -263,3 +267,52 @@ impl std::fmt::Debug for WriteMultiPart { .finish() } } + +/// A low-level interface for interacting with multipart upload APIs +/// +/// Most use-cases should prefer [`ObjectStore::put_multipart`] as this is supported by more +/// backends, including [`LocalFileSystem`], and automatically handles uploading fixed +/// size parts of sufficient size in parallel +/// +/// [`ObjectStore::put_multipart`]: crate::ObjectStore::put_multipart +/// [`LocalFileSystem`]: crate::local::LocalFileSystem +#[async_trait] +pub trait MultiPartStore: Send + Sync + 'static { + /// Creates a new multipart upload, returning the [`MultipartId`] + async fn create_multipart(&self, path: &Path) -> Result; + + /// Uploads a new part with index `part_idx` + /// + /// `part_idx` should be an integer in the range `0..N` where `N` is the number of + /// parts in the upload. Parts may be uploaded concurrently and in any order. 
+ /// + /// Most stores require that all parts excluding the last are at least 5 MiB, and some + /// further require that all parts excluding the last be the same size, e.g. [R2]. + /// [`WriteMultiPart`] performs writes in fixed size blocks of 10 MiB, and clients wanting + /// to maximise compatibility should look to do likewise. + /// + /// [R2]: https://developers.cloudflare.com/r2/objects/multipart-objects/#limitations + async fn put_part( + &self, + path: &Path, + id: &MultipartId, + part_idx: usize, + data: Bytes, + ) -> Result; + + /// Completes a multipart upload + /// + /// The `i`'th value of `parts` must be a [`PartId`] returned by a call to [`Self::put_part`] + /// with a `part_idx` of `i`, and the same `path` and `id` as provided to this method. Calling + /// this method with out of sequence or repeated [`PartId`], or [`PartId`] returned for other + /// values of `path` or `id`, will result in implementation-defined behaviour + async fn complete_multipart( + &self, + path: &Path, + id: &MultipartId, + parts: Vec, + ) -> Result; + + /// Aborts a multipart upload + async fn abort_multipart(&self, path: &Path, id: &MultipartId) -> Result<()>; +} diff --git a/object_store/src/signer.rs b/object_store/src/signer.rs index f792397a7894..ed92e28799e5 100644 --- a/object_store/src/signer.rs +++ b/object_store/src/signer.rs @@ -23,8 +23,7 @@ use reqwest::Method; use std::{fmt, time::Duration}; use url::Url; -/// Universal API to presigned URLs generated from multiple object store services. Not supported by -/// all object store services. +/// Universal API to generate presigned URLs from multiple object store services. #[async_trait] pub trait Signer: Send + Sync + fmt::Debug + 'static { /// Given the intended [`Method`] and [`Path`] to use and the desired length of time for which From 570c91eb06d792aa7bd912e5eff9ca0d5848a1bc Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 26 Oct 2023 09:16:48 +0100 Subject: [PATCH 1305/1411] Support bucket name with `.` when parsing GCS URL (#4991) (#4992) * Support bucket name with `.` when parsing GCS URL (#4991) * Update test --- object_store/src/gcp/builder.rs | 18 ++++++------------ object_store/src/parse.rs | 4 ++++ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/object_store/src/gcp/builder.rs b/object_store/src/gcp/builder.rs index 2039d2378392..5f718d63d94a 100644 --- a/object_store/src/gcp/builder.rs +++ b/object_store/src/gcp/builder.rs @@ -337,13 +337,8 @@ impl GoogleCloudStorageBuilder { let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; - let validate = |s: &str| match s.contains('.') { - true => Err(UrlNotRecognisedSnafu { url }.build()), - false => Ok(s.to_string()), - }; - match parsed.scheme() { - "gs" => self.bucket_name = Some(validate(host)?), + "gs" => self.bucket_name = Some(host.to_string()), scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), } Ok(()) @@ -630,13 +625,12 @@ mod tests { fn gcs_test_urls() { let mut builder = GoogleCloudStorageBuilder::new(); builder.parse_url("gs://bucket/path").unwrap(); - assert_eq!(builder.bucket_name, Some("bucket".to_string())); + assert_eq!(builder.bucket_name.as_deref(), Some("bucket")); - let err_cases = ["mailto://bucket/path", "gs://bucket.mydomain/path"]; - let mut builder = GoogleCloudStorageBuilder::new(); - for case in err_cases { - builder.parse_url(case).unwrap_err(); - } + 
builder.parse_url("gs://bucket.mydomain/path").unwrap(); + assert_eq!(builder.bucket_name.as_deref(), Some("bucket.mydomain")); + + builder.parse_url("mailto://bucket/path").unwrap_err(); } #[test] diff --git a/object_store/src/parse.rs b/object_store/src/parse.rs index 170726f45290..51993e245530 100644 --- a/object_store/src/parse.rs +++ b/object_store/src/parse.rs @@ -234,6 +234,10 @@ mod tests { "gs://bucket/path", (ObjectStoreScheme::GoogleCloudStorage, "path"), ), + ( + "gs://test.example.com/path", + (ObjectStoreScheme::GoogleCloudStorage, "path"), + ), ("http://mydomain/path", (ObjectStoreScheme::Http, "path")), ("https://mydomain/path", (ObjectStoreScheme::Http, "path")), ]; From 1708b0bae090c76bbf4b301bbde78e397699ff33 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 26 Oct 2023 09:16:59 +0100 Subject: [PATCH 1306/1411] Support metadata in SchemaBuilder (#4987) --- arrow-schema/src/schema.rs | 62 +++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index b05cfbe3d950..c0f58e077a6f 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -28,6 +28,7 @@ use crate::{FieldRef, Fields}; #[derive(Debug, Default)] pub struct SchemaBuilder { fields: Vec, + metadata: HashMap, } impl SchemaBuilder { @@ -40,6 +41,7 @@ impl SchemaBuilder { pub fn with_capacity(capacity: usize) -> Self { Self { fields: Vec::with_capacity(capacity), + metadata: Default::default(), } } @@ -57,12 +59,34 @@ impl SchemaBuilder { self.fields.remove(idx) } - /// Get mut FieldRef as index `idx` - /// if index out of bounds, will panic + /// Returns an immutable reference to the [`FieldRef`] at index `idx` + /// + /// # Panics + /// + /// Panics if index out of bounds + pub fn field(&mut self, idx: usize) -> &FieldRef { + &mut self.fields[idx] + } + + /// Returns a mutable reference to the [`FieldRef`] at index `idx` + /// + /// # Panics + /// + /// Panics if index out of bounds pub fn field_mut(&mut self, idx: usize) -> &mut FieldRef { &mut self.fields[idx] } + /// Returns an immutable reference to the Map of custom metadata key-value pairs. + pub fn metadata(&mut self) -> &HashMap { + &self.metadata + } + + /// Returns a mutable reference to the Map of custom metadata key-value pairs. 
+ pub fn metadata_mut(&mut self) -> &mut HashMap { + &mut self.metadata + } + /// Reverse the fileds pub fn reverse(&mut self) { self.fields.reverse(); @@ -91,7 +115,10 @@ impl SchemaBuilder { /// Consume this [`SchemaBuilder`] yielding the final [`Schema`] pub fn finish(self) -> Schema { - Schema::new(self.fields) + Schema { + fields: self.fields.into(), + metadata: self.metadata, + } } } @@ -99,6 +126,7 @@ impl From<&Fields> for SchemaBuilder { fn from(value: &Fields) -> Self { Self { fields: value.to_vec(), + metadata: Default::default(), } } } @@ -107,6 +135,16 @@ impl From for SchemaBuilder { fn from(value: Fields) -> Self { Self { fields: value.to_vec(), + metadata: Default::default(), + } + } +} + +impl From for SchemaBuilder { + fn from(value: Schema) -> Self { + Self { + fields: value.fields.to_vec(), + metadata: value.metadata, } } } @@ -850,7 +888,7 @@ mod tests { } #[test] - fn test_schemabuilder_change_field() { + fn test_schema_builder_change_field() { let mut builder = SchemaBuilder::new(); builder.push(Field::new("a", DataType::Int32, false)); builder.push(Field::new("b", DataType::Utf8, false)); @@ -865,7 +903,7 @@ mod tests { } #[test] - fn test_schemabuilder_reverse() { + fn test_schema_builder_reverse() { let mut builder = SchemaBuilder::new(); builder.push(Field::new("a", DataType::Int32, false)); builder.push(Field::new("b", DataType::Utf8, true)); @@ -878,4 +916,18 @@ mod tests { ] ); } + + #[test] + fn test_schema_builder_metadata() { + let mut metadata = HashMap::with_capacity(1); + metadata.insert("key".to_string(), "value".to_string()); + + let fields = vec![Field::new("test", DataType::Int8, true)]; + let mut builder: SchemaBuilder = Schema::new(fields).with_metadata(metadata).into(); + builder.metadata_mut().insert("k".into(), "v".into()); + let out = builder.finish(); + assert_eq!(out.metadata.len(), 2); + assert_eq!(out.metadata["k"], "v"); + assert_eq!(out.metadata["key"], "value"); + } } From 6b4fd2f2224f3eb08a3eb55cf54bde9b2f1d2793 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 26 Oct 2023 09:17:23 +0100 Subject: [PATCH 1307/1411] Don't panic on invalid Azure access key (#4972) (#4974) --- object_store/src/azure/builder.rs | 14 ++++++++------ object_store/src/azure/credential.rs | 23 +++++++++++++++++++---- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/object_store/src/azure/builder.rs b/object_store/src/azure/builder.rs index 915e4c59a871..02e0762b6de9 100644 --- a/object_store/src/azure/builder.rs +++ b/object_store/src/azure/builder.rs @@ -17,7 +17,7 @@ use crate::azure::client::{AzureClient, AzureConfig}; use crate::azure::credential::{ - AzureCliCredential, ClientSecretOAuthProvider, ImdsManagedIdentityProvider, + AzureAccessKey, AzureCliCredential, ClientSecretOAuthProvider, ImdsManagedIdentityProvider, WorkloadIdentityOAuthProvider, }; use crate::azure::{AzureCredential, AzureCredentialProvider, MicrosoftAzure, STORE}; @@ -800,11 +800,12 @@ impl MicrosoftAzureBuilder { // Allow overriding defaults. 
Values taken from // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141 let url = url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?; - let account_key = self - .access_key - .unwrap_or_else(|| EMULATOR_ACCOUNT_KEY.to_string()); + let key = match self.access_key { + Some(k) => AzureAccessKey::try_new(&k)?, + None => AzureAccessKey::try_new(EMULATOR_ACCOUNT_KEY)?, + }; - let credential = static_creds(AzureCredential::AccessKey(account_key)); + let credential = static_creds(AzureCredential::AccessKey(key)); self.client_options = self.client_options.with_allow_http(true); (true, url, credential, account_name) @@ -828,7 +829,8 @@ impl MicrosoftAzureBuilder { } else if let Some(bearer_token) = self.bearer_token { static_creds(AzureCredential::BearerToken(bearer_token)) } else if let Some(access_key) = self.access_key { - static_creds(AzureCredential::AccessKey(access_key)) + let key = AzureAccessKey::try_new(&access_key)?; + static_creds(AzureCredential::AccessKey(key)) } else if let (Some(client_id), Some(tenant_id), Some(federated_token_file)) = (&self.client_id, &self.tenant_id, self.federated_token_file) { diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index 283d7ff9d703..2b8788d333b2 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -75,6 +75,9 @@ pub enum Error { #[snafu(display("Error reading federated token file "))] FederatedTokenFile, + #[snafu(display("Invalid Access Key: {}", source))] + InvalidAccessKey { source: base64::DecodeError }, + #[snafu(display("'az account get-access-token' command failed: {message}"))] AzureCli { message: String }, @@ -93,13 +96,25 @@ impl From for crate::Error { } } +/// A shared Azure Storage Account Key +#[derive(Debug, Eq, PartialEq)] +pub struct AzureAccessKey(Vec); + +impl AzureAccessKey { + /// Create a new [`AzureAccessKey`], checking it for validity + pub fn try_new(key: &str) -> Result { + let key = BASE64_STANDARD.decode(key).context(InvalidAccessKeySnafu)?; + Ok(Self(key)) + } +} + /// An Azure storage credential #[derive(Debug, Eq, PartialEq)] pub enum AzureCredential { /// A shared access key /// /// - AccessKey(String), + AccessKey(AzureAccessKey), /// A shared access signature /// /// @@ -149,7 +164,7 @@ impl CredentialExt for RequestBuilder { request.url(), request.method(), account, - key.as_str(), + key, ); // "signature" is a base 64 encoded string so it should never @@ -174,10 +189,10 @@ fn generate_authorization( u: &Url, method: &Method, account: &str, - key: &str, + key: &AzureAccessKey, ) -> String { let str_to_sign = string_to_sign(h, u, method, account); - let auth = hmac_sha256(BASE64_STANDARD.decode(key).unwrap(), str_to_sign); + let auth = hmac_sha256(&key.0, str_to_sign); format!("SharedKey {}:{}", account, BASE64_STANDARD.encode(auth)) } From 3dbe45b658a023ab3ef1c3b9a17c9eab49212715 Mon Sep 17 00:00:00 2001 From: jokercurry <982458633@qq.com> Date: Thu, 26 Oct 2023 17:05:58 +0800 Subject: [PATCH 1308/1411] [MINOR] No need to jump to web pages (#4994) Co-authored-by: zhongjingxiong --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c3108917e87a..8cd3ec970b53 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,6 @@ There is more information in the [contributing] guide. 
[flight-readme]: arrow-flight/README.md [datafusion-readme]: https://github.com/apache/arrow-datafusion/blob/master/README.md [ballista-readme]: https://github.com/apache/arrow-ballista/blob/master/README.md -[objectstore-readme]: https://github.com/apache/arrow-rs/blob/master/object_store/README.md +[objectstore-readme]: object_store/README.md [issues]: https://github.com/apache/arrow-rs/issues [discussions]: https://github.com/apache/arrow-rs/discussions From b07dabea870404c4d2d42e16e33d12b152f364d5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 26 Oct 2023 10:42:29 +0100 Subject: [PATCH 1309/1411] Use new integration scripts (#4963) (#4988) --- .github/workflows/integration.yml | 49 +++++++------------------------ 1 file changed, 11 insertions(+), 38 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 00c6b8bb0a90..6e2b4420408a 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -57,8 +57,14 @@ jobs: env: ARROW_USE_CCACHE: OFF ARROW_CPP_EXE_PATH: /build/cpp/debug - ARROW_GO_INTEGRATION: 1 BUILD_DOCS_CPP: OFF + ARROW_INTEGRATION_CPP: ON + ARROW_INTEGRATION_CSHARP: ON + ARROW_INTEGRATION_GO: ON + ARROW_INTEGRATION_JAVA: ON + ARROW_INTEGRATION_JS: ON + # https://github.com/apache/arrow/pull/38403/files#r1371281630 + ARCHERY_INTEGRATION_WITH_RUST: ON # These are necessary because the github runner overrides $HOME # https://github.com/actions/runner/issues/863 RUSTUP_HOME: /root/.rustup @@ -88,43 +94,10 @@ jobs: with: path: rust fetch-depth: 0 - - name: Install pythonnet - run: conda run --no-capture-output pip install pythonnet - - name: Install archery - run: conda run --no-capture-output pip install -e dev/archery[integration] - - name: Make build directory - run: mkdir /build - - name: Build Rust - run: conda run --no-capture-output ci/scripts/rust_build.sh $PWD /build - - name: Build C++ - run: conda run --no-capture-output ci/scripts/cpp_build.sh $PWD /build - - name: Build C# - run: conda run --no-capture-output ci/scripts/csharp_build.sh $PWD /build - - name: Build Go - run: conda run --no-capture-output ci/scripts/go_build.sh $PWD -# Temporarily disabled - https://github.com/apache/arrow-rs/issues/4963 -# - name: Build Java -# run: conda run --no-capture-output ci/scripts/java_build.sh $PWD /build - - name: Build JS - run: conda run --no-capture-output ci/scripts/js_build.sh $PWD /build - - name: Run integration tests - run: | - conda run --no-capture-output archery integration \ - --run-flight \ - --run-c-data \ - --run-ipc \ - --with-cpp=1 \ - --with-csharp=1 \ - --with-java=0 \ - --with-js=1 \ - --with-go=1 \ - --with-rust=1 \ - --gold-dirs=testing/data/arrow-ipc-stream/integration/0.14.1 \ - --gold-dirs=testing/data/arrow-ipc-stream/integration/0.17.1 \ - --gold-dirs=testing/data/arrow-ipc-stream/integration/1.0.0-bigendian \ - --gold-dirs=testing/data/arrow-ipc-stream/integration/1.0.0-littleendian \ - --gold-dirs=testing/data/arrow-ipc-stream/integration/2.0.0-compression \ - --gold-dirs=testing/data/arrow-ipc-stream/integration/4.0.0-shareddict + - name: Build + run: conda run --no-capture-output ci/scripts/integration_arrow_build.sh $PWD /build + - name: Run + run: conda run --no-capture-output ci/scripts/integration_arrow.sh $PWD /build # test FFI against the C-Data interface exposed by pyarrow pyarrow-integration-test: From cd069ea28ba6c0e72487760290e3ffd53ec517b7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 26 Oct 2023 15:13:18 +0100 Subject: [PATCH 1310/1411] Update tracing-log requirement from 0.1 to 0.2 (#4998) Updates the requirements on [tracing-log](https://github.com/tokio-rs/tracing) to permit the latest version. - [Release notes](https://github.com/tokio-rs/tracing/releases) - [Commits](https://github.com/tokio-rs/tracing/compare/tracing-log-0.1.0...tracing-log-0.2.0) --- updated-dependencies: - dependency-name: tracing-log dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 70227eedea0e..1bea347c3037 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -51,7 +51,7 @@ tonic = { version = "0.10.0", default-features = false, features = ["transport", # CLI-related dependencies anyhow = { version = "1.0", optional = true } clap = { version = "4.4.6", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage", "wrap_help", "color", "suggestions"], optional = true } -tracing-log = { version = "0.1", optional = true } +tracing-log = { version = "0.2", optional = true } tracing-subscriber = { version = "0.3.1", default-features = false, features = ["ansi", "env-filter", "fmt"], optional = true } [package.metadata.docs.rs] From 5601b7a8c8fa7ebdd34a7ab0a90aff7958913143 Mon Sep 17 00:00:00 2001 From: Folyd Date: Thu, 26 Oct 2023 23:05:08 +0800 Subject: [PATCH 1311/1411] Add `Field::remove()`, `Schema::remove()`, and `RecordBatch::remove_column()` APIs (#4959) * Add `Field::remove()`, `Schema::remove_field()`, and `RecordBatch::remove_column()` APIs * Update arrow-schema/src/fields.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-schema/src/schema.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Fix docs testing * Use `SchemaBuilder` to build the new `Schema` * Recommend `SchemaBuilder` * Apply review suggestions * Update arrow-schema/src/schema.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-array/src/record_batch.rs | 34 +++++++++++++++++++++++++++++++++ arrow-schema/src/fields.rs | 27 +++++++++++++++++++++++++- arrow-schema/src/schema.rs | 24 +++++++++++++++++++++++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 1f3e1df847a8..4e859fdfe7ea 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -327,6 +327,40 @@ impl RecordBatch { &self.columns[..] } + /// Remove column by index and return it. + /// + /// Return the `ArrayRef` if the column is removed. + /// + /// # Panics + /// + /// Panics if `index`` out of bounds. 
+ /// + /// # Example + /// + /// ``` + /// use std::sync::Arc; + /// use arrow_array::{BooleanArray, Int32Array, RecordBatch}; + /// use arrow_schema::{DataType, Field, Schema}; + /// let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]); + /// let bool_array = BooleanArray::from(vec![true, false, false, true, true]); + /// let schema = Schema::new(vec![ + /// Field::new("id", DataType::Int32, false), + /// Field::new("bool", DataType::Boolean, false), + /// ]); + /// + /// let mut batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array), Arc::new(bool_array)]).unwrap(); + /// + /// let removed_column = batch.remove_column(0); + /// assert_eq!(removed_column.as_any().downcast_ref::().unwrap(), &Int32Array::from(vec![1, 2, 3, 4, 5])); + /// assert_eq!(batch.num_columns(), 1); + /// ``` + pub fn remove_column(&mut self, index: usize) -> ArrayRef { + let mut builder = SchemaBuilder::from(self.schema.fields()); + builder.remove(index); + self.schema = Arc::new(builder.finish()); + self.columns.remove(index) + } + /// Return a new RecordBatch where each column is sliced /// according to `offset` and `length` /// diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 368ecabbf3ef..70cb1968e9a4 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::{ArrowError, Field, FieldRef}; +use crate::{ArrowError, Field, FieldRef, SchemaBuilder}; use std::ops::Deref; use std::sync::Arc; @@ -98,6 +98,31 @@ impl Fields { .zip(other.iter()) .all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b)) } + + /// Remove a field by index and return it. + /// + /// # Panic + /// + /// Panics if `index` is out of bounds. + /// + /// # Example + /// ``` + /// use arrow_schema::{DataType, Field, Fields}; + /// let mut fields = Fields::from(vec![ + /// Field::new("a", DataType::Boolean, false), + /// Field::new("b", DataType::Int8, false), + /// Field::new("c", DataType::Utf8, false), + /// ]); + /// assert_eq!(fields.len(), 3); + /// assert_eq!(fields.remove(1), Field::new("b", DataType::Int8, false).into()); + /// assert_eq!(fields.len(), 2); + /// ``` + pub fn remove(&mut self, index: usize) -> FieldRef { + let mut builder = SchemaBuilder::from(Fields::from(&*self.0)); + let field = builder.remove(index); + *self = builder.finish().fields; + field + } } impl Default for Fields { diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index c0f58e077a6f..711e4cb3314d 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -381,6 +381,30 @@ impl Schema { .iter() .all(|(k, v1)| self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default()) } + + /// Remove field by index and return it. Recommend to use [`SchemaBuilder`] + /// if you are looking to remove multiple columns, as this will save allocations. + /// + /// # Panic + /// + /// Panics if `index` is out of bounds. 
+ /// + /// # Example + /// + /// ``` + /// use arrow_schema::{DataType, Field, Schema}; + /// let mut schema = Schema::new(vec![ + /// Field::new("a", DataType::Boolean, false), + /// Field::new("b", DataType::Int8, false), + /// Field::new("c", DataType::Utf8, false), + /// ]); + /// assert_eq!(schema.fields.len(), 3); + /// assert_eq!(schema.remove(1), Field::new("b", DataType::Int8, false).into()); + /// assert_eq!(schema.fields.len(), 2); + /// ``` + pub fn remove(&mut self, index: usize) -> FieldRef { + self.fields.remove(index) + } } impl fmt::Display for Schema { From e3cce569798d89b67b15d4d2579e592ea1c88b02 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 27 Oct 2023 11:21:03 +0100 Subject: [PATCH 1312/1411] Conditional Put (#4879) (#4984) * Add version to PutResult * Conditional Put (#4879) * Don't support HttpStore * Add R2 Support * Update Azure StatusCode * Fixes * Clippy * Clippy * PutRequestBuilder * Clippy * Add stress test * Clippy --- object_store/src/aws/builder.rs | 30 ++- object_store/src/aws/client.rs | 177 ++++++++-------- object_store/src/aws/mod.rs | 43 +++- .../src/aws/{copy.rs => precondition.rs} | 45 +++- object_store/src/azure/client.rs | 139 ++++++------ object_store/src/azure/mod.rs | 17 +- object_store/src/chunked.rs | 7 +- object_store/src/client/header.rs | 17 ++ object_store/src/client/mod.rs | 2 +- object_store/src/client/retry.rs | 4 + .../src/client/{list_response.rs => s3.rs} | 46 +++- object_store/src/gcp/client.rs | 197 +++++++++++------- object_store/src/gcp/mod.rs | 9 +- object_store/src/http/client.rs | 4 + object_store/src/http/mod.rs | 15 +- object_store/src/lib.rs | 171 ++++++++++++++- object_store/src/limit.rs | 6 +- object_store/src/local.rs | 48 ++++- object_store/src/memory.rs | 67 +++++- object_store/src/prefix.rs | 8 +- object_store/src/throttle.rs | 9 +- object_store/tests/get_range_file.rs | 32 +-- 22 files changed, 791 insertions(+), 302 deletions(-) rename object_store/src/aws/{copy.rs => precondition.rs} (68%) rename object_store/src/client/{list_response.rs => s3.rs} (68%) diff --git a/object_store/src/aws/builder.rs b/object_store/src/aws/builder.rs index 75a5299a0859..79ea75b5aba2 100644 --- a/object_store/src/aws/builder.rs +++ b/object_store/src/aws/builder.rs @@ -20,7 +20,8 @@ use crate::aws::credential::{ InstanceCredentialProvider, TaskCredentialProvider, WebIdentityProvider, }; use crate::aws::{ - AmazonS3, AwsCredential, AwsCredentialProvider, Checksum, S3CopyIfNotExists, STORE, + AmazonS3, AwsCredential, AwsCredentialProvider, Checksum, S3ConditionalPut, S3CopyIfNotExists, + STORE, }; use crate::client::TokenCredentialProvider; use crate::config::ConfigValue; @@ -152,6 +153,8 @@ pub struct AmazonS3Builder { skip_signature: ConfigValue, /// Copy if not exists copy_if_not_exists: Option>, + /// Put precondition + conditional_put: Option>, } /// Configuration keys for [`AmazonS3Builder`] @@ -288,6 +291,11 @@ pub enum AmazonS3ConfigKey { /// See [`S3CopyIfNotExists`] CopyIfNotExists, + /// Configure how to provide conditional put operations + /// + /// See [`S3ConditionalPut`] + ConditionalPut, + /// Skip signing request SkipSignature, @@ -312,7 +320,8 @@ impl AsRef for AmazonS3ConfigKey { Self::Checksum => "aws_checksum_algorithm", Self::ContainerCredentialsRelativeUri => "aws_container_credentials_relative_uri", Self::SkipSignature => "aws_skip_signature", - Self::CopyIfNotExists => "copy_if_not_exists", + Self::CopyIfNotExists => "aws_copy_if_not_exists", + 
Self::ConditionalPut => "aws_conditional_put", Self::Client(opt) => opt.as_ref(), } } @@ -339,7 +348,8 @@ impl FromStr for AmazonS3ConfigKey { "aws_checksum_algorithm" | "checksum_algorithm" => Ok(Self::Checksum), "aws_container_credentials_relative_uri" => Ok(Self::ContainerCredentialsRelativeUri), "aws_skip_signature" | "skip_signature" => Ok(Self::SkipSignature), - "copy_if_not_exists" => Ok(Self::CopyIfNotExists), + "aws_copy_if_not_exists" | "copy_if_not_exists" => Ok(Self::CopyIfNotExists), + "aws_conditional_put" | "conditional_put" => Ok(Self::ConditionalPut), // Backwards compatibility "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), _ => match s.parse() { @@ -446,6 +456,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::CopyIfNotExists => { self.copy_if_not_exists = Some(ConfigValue::Deferred(value.into())) } + AmazonS3ConfigKey::ConditionalPut => { + self.conditional_put = Some(ConfigValue::Deferred(value.into())) + } }; self } @@ -509,6 +522,9 @@ impl AmazonS3Builder { AmazonS3ConfigKey::CopyIfNotExists => { self.copy_if_not_exists.as_ref().map(ToString::to_string) } + AmazonS3ConfigKey::ConditionalPut => { + self.conditional_put.as_ref().map(ToString::to_string) + } } } @@ -713,6 +729,12 @@ impl AmazonS3Builder { self } + /// Configure how to provide conditional put operations + pub fn with_conditional_put(mut self, config: S3ConditionalPut) -> Self { + self.conditional_put = Some(config.into()); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. pub fn build(mut self) -> Result { @@ -724,6 +746,7 @@ impl AmazonS3Builder { let region = self.region.context(MissingRegionSnafu)?; let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; let copy_if_not_exists = self.copy_if_not_exists.map(|x| x.get()).transpose()?; + let put_precondition = self.conditional_put.map(|x| x.get()).transpose()?; let credentials = if let Some(credentials) = self.credentials { credentials @@ -830,6 +853,7 @@ impl AmazonS3Builder { skip_signature: self.skip_signature.get()?, checksum, copy_if_not_exists, + conditional_put: put_precondition, }; let client = Arc::new(S3Client::new(config)?); diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 4e98f259f8dd..20c2a96b57cd 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -17,13 +17,18 @@ use crate::aws::checksum::Checksum; use crate::aws::credential::{AwsCredential, CredentialExt}; -use crate::aws::{AwsCredentialProvider, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET}; +use crate::aws::{ + AwsCredentialProvider, S3ConditionalPut, S3CopyIfNotExists, STORE, STRICT_PATH_ENCODE_SET, +}; use crate::client::get::GetClient; -use crate::client::header::get_etag; use crate::client::header::HeaderConfig; +use crate::client::header::{get_put_result, get_version}; use crate::client::list::ListClient; -use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; +use crate::client::s3::{ + CompleteMultipartUpload, CompleteMultipartUploadResult, InitiateMultipartUploadResult, + ListResponse, +}; use crate::client::GetOptionsExt; use crate::multipart::PartId; use crate::path::DELIMITER; @@ -34,17 +39,20 @@ use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; +use hyper::http::HeaderName; use itertools::Itertools; use percent_encoding::{utf8_percent_encode, PercentEncode}; use quick_xml::events::{self as xml_events}; use reqwest::{ header::{CONTENT_LENGTH, 
CONTENT_TYPE}, - Client as ReqwestClient, Method, Response, StatusCode, + Client as ReqwestClient, Method, RequestBuilder, Response, StatusCode, }; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; use std::sync::Arc; +const VERSION_HEADER: &str = "x-amz-version-id"; + /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -147,33 +155,6 @@ impl From for crate::Error { } } -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase")] -struct InitiateMultipart { - upload_id: String, -} - -#[derive(Debug, Serialize)] -#[serde(rename_all = "PascalCase", rename = "CompleteMultipartUpload")] -struct CompleteMultipart { - part: Vec, -} - -#[derive(Debug, Serialize)] -struct MultipartPart { - #[serde(rename = "ETag")] - e_tag: String, - #[serde(rename = "PartNumber")] - part_number: usize, -} - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "PascalCase", rename = "CompleteMultipartUploadResult")] -struct CompleteMultipartResult { - #[serde(rename = "ETag")] - e_tag: String, -} - #[derive(Deserialize)] #[serde(rename_all = "PascalCase", rename = "DeleteResult")] struct BatchDeleteResponse { @@ -225,12 +206,61 @@ pub struct S3Config { pub skip_signature: bool, pub checksum: Option, pub copy_if_not_exists: Option, + pub conditional_put: Option, } impl S3Config { pub(crate) fn path_url(&self, path: &Path) -> String { format!("{}/{}", self.bucket_endpoint, encode_path(path)) } + + async fn get_credential(&self) -> Result>> { + Ok(match self.skip_signature { + false => Some(self.credentials.get_credential().await?), + true => None, + }) + } +} + +/// A builder for a put request allowing customisation of the headers and query string +pub(crate) struct PutRequest<'a> { + path: &'a Path, + config: &'a S3Config, + builder: RequestBuilder, + payload_sha256: Option>, +} + +impl<'a> PutRequest<'a> { + pub fn query(self, query: &T) -> Self { + let builder = self.builder.query(query); + Self { builder, ..self } + } + + pub fn header(self, k: &HeaderName, v: &str) -> Self { + let builder = self.builder.header(k, v); + Self { builder, ..self } + } + + pub async fn send(self) -> Result { + let credential = self.config.get_credential().await?; + + let response = self + .builder + .with_aws_sigv4( + credential.as_deref(), + &self.config.region, + "s3", + self.config.sign_payload, + self.payload_sha256.as_deref(), + ) + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: self.path.as_ref(), + })?; + + Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) 
+ } } #[derive(Debug)] @@ -250,23 +280,10 @@ impl S3Client { &self.config } - async fn get_credential(&self) -> Result>> { - Ok(match self.config.skip_signature { - false => Some(self.config.credentials.get_credential().await?), - true => None, - }) - } - /// Make an S3 PUT request /// /// Returns the ETag - pub async fn put_request( - &self, - path: &Path, - bytes: Bytes, - query: &T, - ) -> Result { - let credential = self.get_credential().await?; + pub fn put_request<'a>(&'a self, path: &'a Path, bytes: Bytes) -> PutRequest<'a> { let url = self.config.path_url(path); let mut builder = self.client.request(Method::PUT, url); let mut payload_sha256 = None; @@ -288,22 +305,12 @@ impl S3Client { builder = builder.header(CONTENT_TYPE, value); } - let response = builder - .query(query) - .with_aws_sigv4( - credential.as_deref(), - &self.config.region, - "s3", - self.config.sign_payload, - payload_sha256.as_deref(), - ) - .send_retry(&self.config.retry_config) - .await - .context(PutRequestSnafu { - path: path.as_ref(), - })?; - - Ok(get_etag(response.headers()).context(MetadataSnafu)?) + PutRequest { + path, + builder, + payload_sha256, + config: &self.config, + } } /// Make an S3 Delete request @@ -312,7 +319,7 @@ impl S3Client { path: &Path, query: &T, ) -> Result<()> { - let credential = self.get_credential().await?; + let credential = self.config.get_credential().await?; let url = self.config.path_url(path); self.client @@ -346,7 +353,7 @@ impl S3Client { return Ok(Vec::new()); } - let credential = self.get_credential().await?; + let credential = self.config.get_credential().await?; let url = format!("{}?delete", self.config.bucket_endpoint); let mut buffer = Vec::new(); @@ -444,7 +451,7 @@ impl S3Client { /// Make an S3 Copy request pub async fn copy_request(&self, from: &Path, to: &Path, overwrite: bool) -> Result<()> { - let credential = self.get_credential().await?; + let credential = self.config.get_credential().await?; let url = self.config.path_url(to); let source = format!("{}/{}", self.config.bucket, encode_path(from)); @@ -492,7 +499,7 @@ impl S3Client { } pub async fn create_multipart(&self, location: &Path) -> Result { - let credential = self.get_credential().await?; + let credential = self.config.get_credential().await?; let url = format!("{}?uploads=", self.config.path_url(location),); let response = self @@ -512,7 +519,7 @@ impl S3Client { .await .context(CreateMultipartResponseBodySnafu)?; - let response: InitiateMultipart = + let response: InitiateMultipartUploadResult = quick_xml::de::from_reader(response.reader()).context(InvalidMultipartResponseSnafu)?; Ok(response.upload_id) @@ -527,15 +534,15 @@ impl S3Client { ) -> Result { let part = (part_idx + 1).to_string(); - let content_id = self - .put_request( - path, - data, - &[("partNumber", &part), ("uploadId", upload_id)], - ) + let result = self + .put_request(path, data) + .query(&[("partNumber", &part), ("uploadId", upload_id)]) + .send() .await?; - Ok(PartId { content_id }) + Ok(PartId { + content_id: result.e_tag.unwrap(), + }) } pub async fn complete_multipart( @@ -544,19 +551,10 @@ impl S3Client { upload_id: &str, parts: Vec, ) -> Result { - let parts = parts - .into_iter() - .enumerate() - .map(|(part_idx, part)| MultipartPart { - e_tag: part.content_id, - part_number: part_idx + 1, - }) - .collect(); - - let request = CompleteMultipart { part: parts }; + let request = CompleteMultipartUpload::from(parts); let body = quick_xml::se::to_string(&request).unwrap(); - let credential = self.get_credential().await?; + 
let credential = self.config.get_credential().await?; let url = self.config.path_url(location); let response = self @@ -575,16 +573,19 @@ impl S3Client { .await .context(CompleteMultipartRequestSnafu)?; + let version = get_version(response.headers(), VERSION_HEADER).context(MetadataSnafu)?; + let data = response .bytes() .await .context(CompleteMultipartResponseBodySnafu)?; - let response: CompleteMultipartResult = + let response: CompleteMultipartUploadResult = quick_xml::de::from_reader(data.reader()).context(InvalidMultipartResponseSnafu)?; Ok(PutResult { e_tag: Some(response.e_tag), + version, }) } } @@ -596,12 +597,12 @@ impl GetClient for S3Client { const HEADER_CONFIG: HeaderConfig = HeaderConfig { etag_required: false, last_modified_required: false, - version_header: Some("x-amz-version-id"), + version_header: Some(VERSION_HEADER), }; /// Make an S3 GET request async fn get_request(&self, path: &Path, options: GetOptions) -> Result { - let credential = self.get_credential().await?; + let credential = self.config.get_credential().await?; let url = self.config.path_url(path); let method = match options.head { true => Method::HEAD, @@ -643,7 +644,7 @@ impl ListClient for S3Client { token: Option<&str>, offset: Option<&str>, ) -> Result<(ListResult, Option)> { - let credential = self.get_credential().await?; + let credential = self.config.get_credential().await?; let url = self.config.bucket_endpoint.clone(); let mut query = Vec::with_capacity(4); diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 57254c7cf4e8..99e637695059 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -35,6 +35,7 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; +use reqwest::header::{IF_MATCH, IF_NONE_MATCH}; use reqwest::Method; use std::{sync::Arc, time::Duration}; use tokio::io::AsyncWrite; @@ -47,20 +48,20 @@ use crate::client::CredentialProvider; use crate::multipart::{MultiPartStore, PartId, PutPart, WriteMultiPart}; use crate::signer::Signer; use crate::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, PutResult, - Result, + Error, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, PutMode, + PutOptions, PutResult, Result, }; mod builder; mod checksum; mod client; -mod copy; mod credential; +mod precondition; mod resolve; pub use builder::{AmazonS3Builder, AmazonS3ConfigKey}; pub use checksum::Checksum; -pub use copy::S3CopyIfNotExists; +pub use precondition::{S3ConditionalPut, S3CopyIfNotExists}; pub use resolve::resolve_bucket_region; // http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html @@ -158,9 +159,33 @@ impl Signer for AmazonS3 { #[async_trait] impl ObjectStore for AmazonS3 { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { - let e_tag = self.client.put_request(location, bytes, &()).await?; - Ok(PutResult { e_tag: Some(e_tag) }) + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + let request = self.client.put_request(location, bytes); + match (opts.mode, &self.client.config().conditional_put) { + (PutMode::Overwrite, _) => request.send().await, + (PutMode::Create | PutMode::Update(_), None) => Err(Error::NotImplemented), + (PutMode::Create, Some(S3ConditionalPut::ETagMatch)) => { + match request.header(&IF_NONE_MATCH, "*").send().await { + // Technically If-None-Match should return NotModified but some stores, + // such as R2, 
instead return PreconditionFailed + // https://developers.cloudflare.com/r2/api/s3/extensions/#conditional-operations-in-putobject + Err(e @ Error::NotModified { .. } | e @ Error::Precondition { .. }) => { + Err(Error::AlreadyExists { + path: location.to_string(), + source: Box::new(e), + }) + } + r => r, + } + } + (PutMode::Update(v), Some(S3ConditionalPut::ETagMatch)) => { + let etag = v.e_tag.ok_or_else(|| Error::Generic { + store: STORE, + source: "ETag required for conditional put".to_string().into(), + })?; + request.header(&IF_MATCH, etag.as_str()).send().await + } + } } async fn put_multipart( @@ -306,6 +331,7 @@ mod tests { let config = integration.client.config(); let is_local = config.endpoint.starts_with("http://"); let test_not_exists = config.copy_if_not_exists.is_some(); + let test_conditional_put = config.conditional_put.is_some(); // Localstack doesn't support listing with spaces https://github.com/localstack/localstack/issues/6328 put_get_delete_list_opts(&integration, is_local).await; @@ -319,6 +345,9 @@ mod tests { if test_not_exists { copy_if_not_exists(&integration).await; } + if test_conditional_put { + put_opts(&integration, true).await; + } // run integration test with unsigned payload enabled let builder = AmazonS3Builder::from_env().with_unsigned_payload(true); diff --git a/object_store/src/aws/copy.rs b/object_store/src/aws/precondition.rs similarity index 68% rename from object_store/src/aws/copy.rs rename to object_store/src/aws/precondition.rs index da4e2809be1a..a50b57fe23f7 100644 --- a/object_store/src/aws/copy.rs +++ b/object_store/src/aws/precondition.rs @@ -17,8 +17,7 @@ use crate::config::Parse; -/// Configure how to provide [`ObjectStore::copy_if_not_exists`] for -/// [`AmazonS3`]. +/// Configure how to provide [`ObjectStore::copy_if_not_exists`] for [`AmazonS3`]. /// /// [`ObjectStore::copy_if_not_exists`]: crate::ObjectStore::copy_if_not_exists /// [`AmazonS3`]: super::AmazonS3 @@ -70,3 +69,45 @@ impl Parse for S3CopyIfNotExists { }) } } + +/// Configure how to provide conditional put support for [`AmazonS3`]. 
+/// +/// [`AmazonS3`]: super::AmazonS3 +#[derive(Debug, Clone)] +#[allow(missing_copy_implementations)] +#[non_exhaustive] +pub enum S3ConditionalPut { + /// Some S3-compatible stores, such as Cloudflare R2 and minio support conditional + /// put using the standard [HTTP precondition] headers If-Match and If-None-Match + /// + /// Encoded as `etag` ignoring whitespace + /// + /// [HTTP precondition]: https://datatracker.ietf.org/doc/html/rfc9110#name-preconditions + ETagMatch, +} + +impl std::fmt::Display for S3ConditionalPut { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::ETagMatch => write!(f, "etag"), + } + } +} + +impl S3ConditionalPut { + fn from_str(s: &str) -> Option { + match s.trim() { + "etag" => Some(Self::ETagMatch), + _ => None, + } + } +} + +impl Parse for S3ConditionalPut { + fn parse(v: &str) -> crate::Result { + Self::from_str(v).ok_or_else(|| crate::Error::Generic { + store: "Config", + source: format!("Failed to parse \"{v}\" as S3PutConditional").into(), + }) + } +} diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 9f47b9a8152b..c7bd79149872 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -19,7 +19,7 @@ use super::credential::AzureCredential; use crate::azure::credential::*; use crate::azure::{AzureCredentialProvider, STORE}; use crate::client::get::GetClient; -use crate::client::header::{get_etag, HeaderConfig}; +use crate::client::header::{get_put_result, HeaderConfig}; use crate::client::list::ListClient; use crate::client::retry::RetryExt; use crate::client::GetOptionsExt; @@ -27,25 +27,29 @@ use crate::multipart::PartId; use crate::path::DELIMITER; use crate::util::deserialize_rfc1123; use crate::{ - ClientOptions, GetOptions, ListResult, ObjectMeta, Path, PutResult, Result, RetryConfig, + ClientOptions, GetOptions, ListResult, ObjectMeta, Path, PutMode, PutOptions, PutResult, + Result, RetryConfig, }; use async_trait::async_trait; use base64::prelude::BASE64_STANDARD; use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; +use hyper::http::HeaderName; use itertools::Itertools; use reqwest::header::CONTENT_TYPE; use reqwest::{ - header::{HeaderValue, CONTENT_LENGTH, IF_NONE_MATCH}, - Client as ReqwestClient, Method, Response, StatusCode, + header::{HeaderValue, CONTENT_LENGTH, IF_MATCH, IF_NONE_MATCH}, + Client as ReqwestClient, Method, RequestBuilder, Response, }; use serde::{Deserialize, Serialize}; -use snafu::{ResultExt, Snafu}; +use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::HashMap; use std::sync::Arc; use url::Url; +const VERSION_HEADER: &str = "x-ms-version-id"; + /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -92,6 +96,9 @@ pub(crate) enum Error { Metadata { source: crate::client::header::Error, }, + + #[snafu(display("ETag required for conditional update"))] + MissingETag, } impl From for crate::Error { @@ -134,6 +141,39 @@ impl AzureConfig { } } +/// A builder for a put request allowing customisation of the headers and query string +struct PutRequest<'a> { + path: &'a Path, + config: &'a AzureConfig, + builder: RequestBuilder, +} + +impl<'a> PutRequest<'a> { + fn header(self, k: &HeaderName, v: &str) -> Self { + let builder = self.builder.header(k, v); + Self { builder, ..self } + } + + fn query(self, query: &T) -> Self { + let builder = self.builder.query(query); + Self { builder, ..self } + } + + async fn send(self) -> Result { + let 
credential = self.config.credentials.get_credential().await?; + let response = self + .builder + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: self.path.as_ref(), + })?; + + Ok(response) + } +} + #[derive(Debug)] pub(crate) struct AzureClient { config: AzureConfig, @@ -156,63 +196,52 @@ impl AzureClient { self.config.credentials.get_credential().await } - /// Make an Azure PUT request - pub async fn put_request( - &self, - path: &Path, - bytes: Option, - is_block_op: bool, - query: &T, - ) -> Result { - let credential = self.get_credential().await?; + fn put_request<'a>(&'a self, path: &'a Path, bytes: Bytes) -> PutRequest<'a> { let url = self.config.path_url(path); let mut builder = self.client.request(Method::PUT, url); - if !is_block_op { - builder = builder.header(&BLOB_TYPE, "BlockBlob").query(query); - } else { - builder = builder.query(query); - } - if let Some(value) = self.config().client_options.get_content_type(path) { builder = builder.header(CONTENT_TYPE, value); } - if let Some(bytes) = bytes { - builder = builder - .header(CONTENT_LENGTH, HeaderValue::from(bytes.len())) - .body(bytes) - } else { - builder = builder.header(CONTENT_LENGTH, HeaderValue::from_static("0")); + builder = builder + .header(CONTENT_LENGTH, HeaderValue::from(bytes.len())) + .body(bytes); + + PutRequest { + path, + builder, + config: &self.config, } + } - let response = builder - .with_azure_authorization(&credential, &self.config.account) - .send_retry(&self.config.retry_config) - .await - .context(PutRequestSnafu { - path: path.as_ref(), - })?; + /// Make an Azure PUT request + pub async fn put_blob(&self, path: &Path, bytes: Bytes, opts: PutOptions) -> Result { + let builder = self.put_request(path, bytes); + + let builder = match &opts.mode { + PutMode::Overwrite => builder, + PutMode::Create => builder.header(&IF_NONE_MATCH, "*"), + PutMode::Update(v) => { + let etag = v.e_tag.as_ref().context(MissingETagSnafu)?; + builder.header(&IF_MATCH, etag) + } + }; - Ok(response) + let response = builder.header(&BLOB_TYPE, "BlockBlob").send().await?; + Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) } /// PUT a block pub async fn put_block(&self, path: &Path, part_idx: usize, data: Bytes) -> Result { let content_id = format!("{part_idx:20}"); - let block_id: BlockId = content_id.clone().into(); + let block_id = BASE64_STANDARD.encode(&content_id); - self.put_request( - path, - Some(data), - true, - &[ - ("comp", "block"), - ("blockid", &BASE64_STANDARD.encode(block_id)), - ], - ) - .await?; + self.put_request(path, data) + .query(&[("comp", "block"), ("blockid", &block_id)]) + .send() + .await?; Ok(PartId { content_id }) } @@ -224,15 +253,13 @@ impl AzureClient { .map(|part| BlockId::from(part.content_id)) .collect(); - let block_list = BlockList { blocks }; - let block_xml = block_list.to_xml(); - let response = self - .put_request(path, Some(block_xml.into()), true, &[("comp", "blocklist")]) + .put_request(path, BlockList { blocks }.to_xml().into()) + .query(&[("comp", "blocklist")]) + .send() .await?; - let e_tag = get_etag(response.headers()).context(MetadataSnafu)?; - Ok(PutResult { e_tag: Some(e_tag) }) + Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) 
} /// Make an Azure Delete request @@ -284,13 +311,7 @@ impl AzureClient { .with_azure_authorization(&credential, &self.config.account) .send_retry(&self.config.retry_config) .await - .map_err(|err| match err.status() { - Some(StatusCode::CONFLICT) => crate::Error::AlreadyExists { - source: Box::new(err), - path: to.to_string(), - }, - _ => err.error(STORE, from.to_string()), - })?; + .map_err(|err| err.error(STORE, from.to_string()))?; Ok(()) } @@ -303,7 +324,7 @@ impl GetClient for AzureClient { const HEADER_CONFIG: HeaderConfig = HeaderConfig { etag_required: true, last_modified_required: true, - version_header: Some("x-ms-version-id"), + version_header: Some(VERSION_HEADER), }; /// Make an Azure GET request diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 779ac2f71ff8..762a51dd9d60 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -29,7 +29,8 @@ use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, + Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -49,7 +50,6 @@ mod credential; /// [`CredentialProvider`] for [`MicrosoftAzure`] pub type AzureCredentialProvider = Arc>; -use crate::client::header::get_etag; use crate::multipart::MultiPartStore; pub use builder::{AzureConfigKey, MicrosoftAzureBuilder}; pub use credential::AzureCredential; @@ -82,16 +82,8 @@ impl std::fmt::Display for MicrosoftAzure { #[async_trait] impl ObjectStore for MicrosoftAzure { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { - let response = self - .client - .put_request(location, Some(bytes), false, &()) - .await?; - let e_tag = get_etag(response.headers()).map_err(|e| crate::Error::Generic { - store: STORE, - source: Box::new(e), - })?; - Ok(PutResult { e_tag: Some(e_tag) }) + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + self.client.put_blob(location, bytes, opts).await } async fn put_multipart( @@ -208,6 +200,7 @@ mod tests { rename_and_copy(&integration).await; copy_if_not_exists(&integration).await; stream_get(&integration).await; + put_opts(&integration, true).await; multipart(&integration, &integration).await; } diff --git a/object_store/src/chunked.rs b/object_store/src/chunked.rs index 021f9f50156b..d33556f4b12e 100644 --- a/object_store/src/chunked.rs +++ b/object_store/src/chunked.rs @@ -29,7 +29,8 @@ use tokio::io::AsyncWrite; use crate::path::Path; use crate::{ - GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutResult, + GetOptions, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutOptions, + PutResult, }; use crate::{MultipartId, Result}; @@ -62,8 +63,8 @@ impl Display for ChunkedStore { #[async_trait] impl ObjectStore for ChunkedStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { - self.inner.put(location, bytes).await + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + self.inner.put_opts(location, bytes, opts).await } async fn put_multipart( diff --git a/object_store/src/client/header.rs b/object_store/src/client/header.rs index e67496833b99..e85bf6ba52d0 100644 --- a/object_store/src/client/header.rs +++ b/object_store/src/client/header.rs @@ -67,6 +67,23 @@ pub enum Error { }, } +/// Extracts a PutResult from the provided [`HeaderMap`] 
+#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +pub fn get_put_result(headers: &HeaderMap, version: &str) -> Result { + let e_tag = Some(get_etag(headers)?); + let version = get_version(headers, version)?; + Ok(crate::PutResult { e_tag, version }) +} + +/// Extracts a optional version from the provided [`HeaderMap`] +#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +pub fn get_version(headers: &HeaderMap, version: &str) -> Result, Error> { + Ok(match headers.get(version) { + Some(x) => Some(x.to_str().context(BadHeaderSnafu)?.to_string()), + None => None, + }) +} + /// Extracts an etag from the provided [`HeaderMap`] pub fn get_etag(headers: &HeaderMap) -> Result { let e_tag = headers.get(ETAG).ok_or(Error::MissingEtag)?; diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 77eee7fc92f3..ae092edac095 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -38,7 +38,7 @@ pub mod token; pub mod header; #[cfg(any(feature = "aws", feature = "gcp"))] -pub mod list_response; +pub mod s3; use async_trait::async_trait; use std::collections::HashMap; diff --git a/object_store/src/client/retry.rs b/object_store/src/client/retry.rs index d70d6d88de32..789103c0f74f 100644 --- a/object_store/src/client/retry.rs +++ b/object_store/src/client/retry.rs @@ -79,6 +79,10 @@ impl Error { path, source: Box::new(self), }, + Some(StatusCode::CONFLICT) => crate::Error::AlreadyExists { + path, + source: Box::new(self), + }, _ => crate::Error::Generic { store, source: Box::new(self), diff --git a/object_store/src/client/list_response.rs b/object_store/src/client/s3.rs similarity index 68% rename from object_store/src/client/list_response.rs rename to object_store/src/client/s3.rs index 7a170c584156..61237dc4beab 100644 --- a/object_store/src/client/list_response.rs +++ b/object_store/src/client/s3.rs @@ -14,12 +14,13 @@ // specific language governing permissions and limitations // under the License. -//! The list response format used by GCP and AWS +//! The list and multipart API used by both GCS and S3 +use crate::multipart::PartId; use crate::path::Path; use crate::{ListResult, ObjectMeta, Result}; use chrono::{DateTime, Utc}; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; #[derive(Debug, Deserialize)] #[serde(rename_all = "PascalCase")] @@ -84,3 +85,44 @@ impl TryFrom for ObjectMeta { }) } } + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct InitiateMultipartUploadResult { + pub upload_id: String, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "PascalCase")] +pub struct CompleteMultipartUpload { + pub part: Vec, +} + +impl From> for CompleteMultipartUpload { + fn from(value: Vec) -> Self { + let part = value + .into_iter() + .enumerate() + .map(|(part_number, part)| MultipartPart { + e_tag: part.content_id, + part_number: part_number + 1, + }) + .collect(); + Self { part } + } +} + +#[derive(Debug, Serialize)] +pub struct MultipartPart { + #[serde(rename = "ETag")] + pub e_tag: String, + #[serde(rename = "PartNumber")] + pub part_number: usize, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "PascalCase")] +pub struct CompleteMultipartUploadResult { + #[serde(rename = "ETag")] + pub e_tag: String, +} diff --git a/object_store/src/gcp/client.rs b/object_store/src/gcp/client.rs index 8c44f9016480..78964077e2fe 100644 --- a/object_store/src/gcp/client.rs +++ b/object_store/src/gcp/client.rs @@ -16,23 +16,34 @@ // under the License. 
use crate::client::get::GetClient; -use crate::client::header::{get_etag, HeaderConfig}; +use crate::client::header::{get_put_result, get_version, HeaderConfig}; use crate::client::list::ListClient; -use crate::client::list_response::ListResponse; use crate::client::retry::RetryExt; +use crate::client::s3::{ + CompleteMultipartUpload, CompleteMultipartUploadResult, InitiateMultipartUploadResult, + ListResponse, +}; use crate::client::GetOptionsExt; use crate::gcp::{GcpCredential, GcpCredentialProvider, STORE}; use crate::multipart::PartId; use crate::path::{Path, DELIMITER}; -use crate::{ClientOptions, GetOptions, ListResult, MultipartId, PutResult, Result, RetryConfig}; +use crate::{ + ClientOptions, GetOptions, ListResult, MultipartId, PutMode, PutOptions, PutResult, Result, + RetryConfig, +}; use async_trait::async_trait; use bytes::{Buf, Bytes}; use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; -use reqwest::{header, Client, Method, Response, StatusCode}; +use reqwest::header::HeaderName; +use reqwest::{header, Client, Method, RequestBuilder, Response, StatusCode}; use serde::Serialize; -use snafu::{ResultExt, Snafu}; +use snafu::{OptionExt, ResultExt, Snafu}; use std::sync::Arc; +const VERSION_HEADER: &str = "x-goog-generation"; + +static VERSION_MATCH: HeaderName = HeaderName::from_static("x-goog-if-generation-match"); + #[derive(Debug, Snafu)] enum Error { #[snafu(display("Error performing list request: {}", source))] @@ -78,6 +89,18 @@ enum Error { Metadata { source: crate::client::header::Error, }, + + #[snafu(display("Version required for conditional update"))] + MissingVersion, + + #[snafu(display("Error performing complete multipart request: {}", source))] + CompleteMultipartRequest { source: crate::client::retry::Error }, + + #[snafu(display("Error getting complete multipart response body: {}", source))] + CompleteMultipartResponseBody { source: reqwest::Error }, + + #[snafu(display("Got invalid multipart response: {}", source))] + InvalidMultipartResponse { source: quick_xml::de::DeError }, } impl From for crate::Error { @@ -107,6 +130,39 @@ pub struct GoogleCloudStorageConfig { pub client_options: ClientOptions, } +/// A builder for a put request allowing customisation of the headers and query string +pub struct PutRequest<'a> { + path: &'a Path, + config: &'a GoogleCloudStorageConfig, + builder: RequestBuilder, +} + +impl<'a> PutRequest<'a> { + fn header(self, k: &HeaderName, v: &str) -> Self { + let builder = self.builder.header(k, v); + Self { builder, ..self } + } + + fn query(self, query: &T) -> Self { + let builder = self.builder.query(query); + Self { builder, ..self } + } + + async fn send(self) -> Result { + let credential = self.config.credentials.get_credential().await?; + let response = self + .builder + .bearer_auth(&credential.bearer) + .send_retry(&self.config.retry_config) + .await + .context(PutRequestSnafu { + path: self.path.as_ref(), + })?; + + Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) 
+ } +} + #[derive(Debug)] pub struct GoogleCloudStorageClient { config: GoogleCloudStorageConfig, @@ -152,13 +208,7 @@ impl GoogleCloudStorageClient { /// Perform a put request /// /// Returns the new ETag - pub async fn put_request( - &self, - path: &Path, - payload: Bytes, - query: &T, - ) -> Result { - let credential = self.get_credential().await?; + pub fn put_request<'a>(&'a self, path: &'a Path, payload: Bytes) -> PutRequest<'a> { let url = self.object_url(path); let content_type = self @@ -167,21 +217,38 @@ impl GoogleCloudStorageClient { .get_content_type(path) .unwrap_or("application/octet-stream"); - let response = self + let builder = self .client .request(Method::PUT, url) - .query(query) - .bearer_auth(&credential.bearer) .header(header::CONTENT_TYPE, content_type) .header(header::CONTENT_LENGTH, payload.len()) - .body(payload) - .send_retry(&self.config.retry_config) - .await - .context(PutRequestSnafu { - path: path.as_ref(), - })?; + .body(payload); - Ok(get_etag(response.headers()).context(MetadataSnafu)?) + PutRequest { + path, + builder, + config: &self.config, + } + } + + pub async fn put(&self, path: &Path, data: Bytes, opts: PutOptions) -> Result { + let builder = self.put_request(path, data); + + let builder = match &opts.mode { + PutMode::Overwrite => builder, + PutMode::Create => builder.header(&VERSION_MATCH, "0"), + PutMode::Update(v) => { + let etag = v.version.as_ref().context(MissingVersionSnafu)?; + builder.header(&VERSION_MATCH, etag) + } + }; + + match (opts.mode, builder.send().await) { + (PutMode::Create, Err(crate::Error::Precondition { path, source })) => { + Err(crate::Error::AlreadyExists { path, source }) + } + (_, r) => r, + } } /// Perform a put part request @@ -194,18 +261,15 @@ impl GoogleCloudStorageClient { part_idx: usize, data: Bytes, ) -> Result { - let content_id = self - .put_request( - path, - data, - &[ - ("partNumber", &format!("{}", part_idx + 1)), - ("uploadId", upload_id), - ], - ) - .await?; - - Ok(PartId { content_id }) + let query = &[ + ("partNumber", &format!("{}", part_idx + 1)), + ("uploadId", upload_id), + ]; + let result = self.put_request(path, data).query(query).send().await?; + + Ok(PartId { + content_id: result.e_tag.unwrap(), + }) } /// Initiate a multi-part upload @@ -268,17 +332,8 @@ impl GoogleCloudStorageClient { let upload_id = multipart_id.clone(); let url = self.object_url(path); - let parts = completed_parts - .into_iter() - .enumerate() - .map(|(part_number, part)| MultipartPart { - e_tag: part.content_id, - part_number: part_number + 1, - }) - .collect(); - + let upload_info = CompleteMultipartUpload::from(completed_parts); let credential = self.get_credential().await?; - let upload_info = CompleteMultipartUpload { parts }; let data = quick_xml::se::to_string(&upload_info) .context(InvalidPutResponseSnafu)? 
@@ -287,7 +342,7 @@ impl GoogleCloudStorageClient { // https://github.com/tafia/quick-xml/issues/350 .replace(""", "\""); - let result = self + let response = self .client .request(Method::POST, &url) .bearer_auth(&credential.bearer) @@ -295,12 +350,22 @@ impl GoogleCloudStorageClient { .body(data) .send_retry(&self.config.retry_config) .await - .context(PostRequestSnafu { - path: path.as_ref(), - })?; + .context(CompleteMultipartRequestSnafu)?; - let etag = get_etag(result.headers()).context(MetadataSnafu)?; - Ok(PutResult { e_tag: Some(etag) }) + let version = get_version(response.headers(), VERSION_HEADER).context(MetadataSnafu)?; + + let data = response + .bytes() + .await + .context(CompleteMultipartResponseBodySnafu)?; + + let response: CompleteMultipartUploadResult = + quick_xml::de::from_reader(data.reader()).context(InvalidMultipartResponseSnafu)?; + + Ok(PutResult { + e_tag: Some(response.e_tag), + version, + }) } /// Perform a delete request @@ -334,7 +399,7 @@ impl GoogleCloudStorageClient { .header("x-goog-copy-source", source); if if_not_exists { - builder = builder.header("x-goog-if-generation-match", 0); + builder = builder.header(&VERSION_MATCH, 0); } builder @@ -362,7 +427,7 @@ impl GetClient for GoogleCloudStorageClient { const HEADER_CONFIG: HeaderConfig = HeaderConfig { etag_required: true, last_modified_required: true, - version_header: Some("x-goog-generation"), + version_header: Some(VERSION_HEADER), }; /// Perform a get request @@ -375,13 +440,18 @@ impl GetClient for GoogleCloudStorageClient { false => Method::GET, }; - let mut request = self.client.request(method, url).with_get_options(options); + let mut request = self.client.request(method, url); + + if let Some(version) = &options.version { + request = request.query(&[("generation", version)]); + } if !credential.bearer.is_empty() { request = request.bearer_auth(&credential.bearer); } let response = request + .with_get_options(options) .send_retry(&self.config.retry_config) .await .context(GetRequestSnafu { @@ -444,24 +514,3 @@ impl ListClient for GoogleCloudStorageClient { Ok((response.try_into()?, token)) } } - -#[derive(serde::Deserialize, Debug)] -#[serde(rename_all = "PascalCase")] -struct InitiateMultipartUploadResult { - upload_id: String, -} - -#[derive(serde::Serialize, Debug)] -#[serde(rename_all = "PascalCase", rename(serialize = "Part"))] -struct MultipartPart { - #[serde(rename = "PartNumber")] - part_number: usize, - e_tag: String, -} - -#[derive(serde::Serialize, Debug)] -#[serde(rename_all = "PascalCase")] -struct CompleteMultipartUpload { - #[serde(rename = "Part", default)] - parts: Vec, -} diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 0eb3e9c23c43..7721b1278a80 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -35,7 +35,8 @@ use crate::client::CredentialProvider; use crate::{ multipart::{PartId, PutPart, WriteMultiPart}, path::Path, - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, + Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -107,9 +108,8 @@ impl PutPart for GCSMultipartUpload { #[async_trait] impl ObjectStore for GoogleCloudStorage { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { - let e_tag = self.client.put_request(location, bytes, &()).await?; - Ok(PutResult { e_tag: Some(e_tag) }) + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: 
PutOptions) -> Result { + self.client.put(location, bytes, opts).await } async fn put_multipart( @@ -221,6 +221,7 @@ mod test { multipart(&integration, &integration).await; // Fake GCS server doesn't currently honor preconditions get_opts(&integration).await; + put_opts(&integration, true).await; } } diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index a7dbdfcbe844..8700775fb243 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -243,6 +243,10 @@ impl Client { .header("Destination", self.path_url(to).as_str()); if !overwrite { + // While the Overwrite header appears to duplicate + // the functionality of the If-Match: * header of HTTP/1.1, If-Match + // applies only to the Request-URI, and not to the Destination of a COPY + // or MOVE. builder = builder.header("Overwrite", "F"); } diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index 8f61011ccae1..cfcde27fd781 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -46,7 +46,7 @@ use crate::http::client::Client; use crate::path::Path; use crate::{ ClientConfigKey, ClientOptions, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, - ObjectStore, PutResult, Result, RetryConfig, + ObjectStore, PutMode, PutOptions, PutResult, Result, RetryConfig, }; mod client; @@ -96,14 +96,23 @@ impl std::fmt::Display for HttpStore { #[async_trait] impl ObjectStore for HttpStore { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + if opts.mode != PutMode::Overwrite { + // TODO: Add support for If header - https://datatracker.ietf.org/doc/html/rfc2518#section-9.4 + return Err(crate::Error::NotImplemented); + } + let response = self.client.put(location, bytes).await?; let e_tag = match get_etag(response.headers()) { Ok(e_tag) => Some(e_tag), Err(crate::client::header::Error::MissingEtag) => None, Err(source) => return Err(Error::Metadata { source }.into()), }; - Ok(PutResult { e_tag }) + + Ok(PutResult { + e_tag, + version: None, + }) } async fn put_multipart( diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 9a0667229803..66964304e853 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -299,7 +299,12 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// The operation is guaranteed to be atomic, it will either successfully /// write the entirety of `bytes` to `location`, or fail. No clients /// should be able to observe a partially written object - async fn put(&self, location: &Path, bytes: Bytes) -> Result; + async fn put(&self, location: &Path, bytes: Bytes) -> Result { + self.put_opts(location, bytes, PutOptions::default()).await + } + + /// Save the provided bytes to the specified location with the given options + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result; /// Get a multi-part upload that allows writing data in chunks. /// @@ -531,6 +536,15 @@ macro_rules! 
as_ref_impl { self.as_ref().put(location, bytes).await } + async fn put_opts( + &self, + location: &Path, + bytes: Bytes, + opts: PutOptions, + ) -> Result { + self.as_ref().put_opts(location, bytes, opts).await + } + async fn put_multipart( &self, location: &Path, @@ -837,13 +851,65 @@ impl GetResult { } } +/// Configure preconditions for the put operation +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub enum PutMode { + /// Perform an atomic write operation, overwriting any object present at the provided path + #[default] + Overwrite, + /// Perform an atomic write operation, returning [`Error::AlreadyExists`] if an + /// object already exists at the provided path + Create, + /// Perform an atomic write operation if the current version of the object matches the + /// provided [`UpdateVersion`], returning [`Error::Precondition`] otherwise + Update(UpdateVersion), +} + +/// Uniquely identifies a version of an object to update +/// +/// Stores will use differing combinations of `e_tag` and `version` to provide conditional +/// updates, and it is therefore recommended applications preserve both +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct UpdateVersion { + /// The unique identifier for the newly created object + /// + /// + pub e_tag: Option, + /// A version indicator for the newly created object + pub version: Option, +} + +impl From for UpdateVersion { + fn from(value: PutResult) -> Self { + Self { + e_tag: value.e_tag, + version: value.version, + } + } +} + +/// Options for a put request +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct PutOptions { + /// Configure the [`PutMode`] for this operation + pub mode: PutMode, +} + +impl From for PutOptions { + fn from(mode: PutMode) -> Self { + Self { mode } + } +} + /// Result for a put request #[derive(Debug, Clone, PartialEq, Eq)] pub struct PutResult { - /// The unique identifier for the object + /// The unique identifier for the newly created object /// /// pub e_tag: Option, + /// A version indicator for the newly created object + pub version: Option, } /// A specialized `Result` for object store-related errors @@ -947,6 +1013,7 @@ mod tests { use crate::multipart::MultiPartStore; use crate::test_util::flatten_list_stream; use chrono::TimeZone; + use futures::stream::FuturesUnordered; use rand::{thread_rng, Rng}; use tokio::io::AsyncWriteExt; @@ -1406,7 +1473,7 @@ mod tests { // Can retrieve previous version let get_opts = storage.get_opts(&path, options).await.unwrap(); let old = get_opts.bytes().await.unwrap(); - assert_eq!(old, b"foo".as_slice()); + assert_eq!(old, b"test".as_slice()); // Current version contains the updated data let current = storage.get(&path).await.unwrap().bytes().await.unwrap(); @@ -1414,6 +1481,104 @@ mod tests { } } + pub(crate) async fn put_opts(storage: &dyn ObjectStore, supports_update: bool) { + delete_fixtures(storage).await; + let path = Path::from("put_opts"); + let v1 = storage + .put_opts(&path, "a".into(), PutMode::Create.into()) + .await + .unwrap(); + + let err = storage + .put_opts(&path, "b".into(), PutMode::Create.into()) + .await + .unwrap_err(); + assert!(matches!(err, Error::AlreadyExists { .. 
}), "{err}"); + + let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(b.as_ref(), b"a"); + + if !supports_update { + return; + } + + let v2 = storage + .put_opts(&path, "c".into(), PutMode::Update(v1.clone().into()).into()) + .await + .unwrap(); + + let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(b.as_ref(), b"c"); + + let err = storage + .put_opts(&path, "d".into(), PutMode::Update(v1.into()).into()) + .await + .unwrap_err(); + assert!(matches!(err, Error::Precondition { .. }), "{err}"); + + storage + .put_opts(&path, "e".into(), PutMode::Update(v2.clone().into()).into()) + .await + .unwrap(); + + let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); + assert_eq!(b.as_ref(), b"e"); + + // Update not exists + let path = Path::from("I don't exist"); + let err = storage + .put_opts(&path, "e".into(), PutMode::Update(v2.into()).into()) + .await + .unwrap_err(); + assert!(matches!(err, Error::Precondition { .. }), "{err}"); + + const NUM_WORKERS: usize = 5; + const NUM_INCREMENTS: usize = 10; + + let path = Path::from("RACE"); + let mut futures: FuturesUnordered<_> = (0..NUM_WORKERS) + .map(|_| async { + for _ in 0..NUM_INCREMENTS { + loop { + match storage.get(&path).await { + Ok(r) => { + let mode = PutMode::Update(UpdateVersion { + e_tag: r.meta.e_tag.clone(), + version: r.meta.version.clone(), + }); + + let b = r.bytes().await.unwrap(); + let v: usize = std::str::from_utf8(&b).unwrap().parse().unwrap(); + let new = (v + 1).to_string(); + + match storage.put_opts(&path, new.into(), mode.into()).await { + Ok(_) => break, + Err(Error::Precondition { .. }) => continue, + Err(e) => return Err(e), + } + } + Err(Error::NotFound { .. }) => { + let mode = PutMode::Create; + match storage.put_opts(&path, "1".into(), mode.into()).await { + Ok(_) => break, + Err(Error::AlreadyExists { .. 
}) => continue, + Err(e) => return Err(e), + } + } + Err(e) => return Err(e), + } + } + } + Ok(()) + }) + .collect(); + + while futures.next().await.transpose().unwrap().is_some() {} + let b = storage.get(&path).await.unwrap().bytes().await.unwrap(); + let v = std::str::from_utf8(&b).unwrap().parse::().unwrap(); + assert_eq!(v, NUM_WORKERS * NUM_INCREMENTS); + } + /// Returns a chunk of length `chunk_length` fn get_chunk(chunk_length: usize) -> Bytes { let mut data = vec![0_u8; chunk_length]; diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index cd01a964dc3e..39cc605c4768 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -19,7 +19,7 @@ use crate::{ BoxStream, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, - ObjectStore, Path, PutResult, Result, StreamExt, + ObjectStore, Path, PutOptions, PutResult, Result, StreamExt, }; use async_trait::async_trait; use bytes::Bytes; @@ -77,6 +77,10 @@ impl ObjectStore for LimitStore { self.inner.put(location, bytes).await } + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.put_opts(location, bytes, opts).await + } async fn put_multipart( &self, location: &Path, diff --git a/object_store/src/local.rs b/object_store/src/local.rs index ce9aa4683499..919baf71b0a8 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -20,7 +20,7 @@ use crate::{ maybe_spawn_blocking, path::{absolute_path_to_url, Path}, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, ObjectStore, - PutResult, Result, + PutMode, PutOptions, PutResult, Result, }; use async_trait::async_trait; use bytes::Bytes; @@ -271,20 +271,44 @@ impl Config { #[async_trait] impl ObjectStore for LocalFileSystem { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + if matches!(opts.mode, PutMode::Update(_)) { + return Err(crate::Error::NotImplemented); + } + let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { let (mut file, suffix) = new_staged_upload(&path)?; let staging_path = staged_upload_path(&path, &suffix); - file.write_all(&bytes) - .context(UnableToCopyDataToFileSnafu) - .and_then(|_| { - std::fs::rename(&staging_path, &path).context(UnableToRenameFileSnafu) - }) - .map_err(|e| { - let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup - e - })?; + + let err = match file.write_all(&bytes) { + Ok(_) => match opts.mode { + PutMode::Overwrite => match std::fs::rename(&staging_path, &path) { + Ok(_) => None, + Err(source) => Some(Error::UnableToRenameFile { source }), + }, + PutMode::Create => match std::fs::hard_link(&staging_path, &path) { + Ok(_) => { + let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup + None + } + Err(source) => match source.kind() { + ErrorKind::AlreadyExists => Some(Error::AlreadyExists { + path: path.to_str().unwrap().to_string(), + source, + }), + _ => Some(Error::UnableToRenameFile { source }), + }, + }, + PutMode::Update(_) => unreachable!(), + }, + Err(source) => Some(Error::UnableToCopyDataToFile { source }), + }; + + if let Some(err) = err { + let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup + return Err(err.into()); + } let metadata = file.metadata().map_err(|e| Error::Metadata { source: e.into(), @@ -293,6 +317,7 @@ impl ObjectStore for LocalFileSystem { Ok(PutResult { 
e_tag: Some(get_etag(&metadata)), + version: None, }) }) .await @@ -1054,6 +1079,7 @@ mod tests { rename_and_copy(&integration).await; copy_if_not_exists(&integration).await; stream_get(&integration).await; + put_opts(&integration, false).await; } #[test] diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 8b9522e48de8..9d79a798ad1f 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -17,7 +17,8 @@ //! An in-memory object store implementation use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutResult, Result, + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutMode, + PutOptions, PutResult, Result, UpdateVersion, }; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; @@ -52,6 +53,9 @@ enum Error { #[snafu(display("Object already exists at that location: {path}"))] AlreadyExists { path: String }, + + #[snafu(display("ETag required for conditional update"))] + MissingETag, } impl From for super::Error { @@ -110,9 +114,50 @@ impl Storage { let etag = self.next_etag; self.next_etag += 1; let entry = Entry::new(bytes, Utc::now(), etag); - self.map.insert(location.clone(), entry); + self.overwrite(location, entry); etag } + + fn overwrite(&mut self, location: &Path, entry: Entry) { + self.map.insert(location.clone(), entry); + } + + fn create(&mut self, location: &Path, entry: Entry) -> Result<()> { + use std::collections::btree_map; + match self.map.entry(location.clone()) { + btree_map::Entry::Occupied(_) => Err(Error::AlreadyExists { + path: location.to_string(), + } + .into()), + btree_map::Entry::Vacant(v) => { + v.insert(entry); + Ok(()) + } + } + } + + fn update(&mut self, location: &Path, v: UpdateVersion, entry: Entry) -> Result<()> { + match self.map.get_mut(location) { + // Return Precondition instead of NotFound for consistency with stores + None => Err(crate::Error::Precondition { + path: location.to_string(), + source: format!("Object at location {location} not found").into(), + }), + Some(e) => { + let existing = e.e_tag.to_string(); + let expected = v.e_tag.context(MissingETagSnafu)?; + if existing == expected { + *e = entry; + Ok(()) + } else { + Err(crate::Error::Precondition { + path: location.to_string(), + source: format!("{existing} does not match {expected}").into(), + }) + } + } + } + } } impl std::fmt::Display for InMemory { @@ -123,10 +168,21 @@ impl std::fmt::Display for InMemory { #[async_trait] impl ObjectStore for InMemory { - async fn put(&self, location: &Path, bytes: Bytes) -> Result { - let etag = self.storage.write().insert(location, bytes); + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + let mut storage = self.storage.write(); + let etag = storage.next_etag; + let entry = Entry::new(bytes, Utc::now(), etag); + + match opts.mode { + PutMode::Overwrite => storage.overwrite(location, entry), + PutMode::Create => storage.create(location, entry)?, + PutMode::Update(v) => storage.update(location, v, entry)?, + } + storage.next_etag += 1; + Ok(PutResult { e_tag: Some(etag.to_string()), + version: None, }) } @@ -425,7 +481,7 @@ impl AsyncWrite for InMemoryAppend { fn poll_shutdown( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { + ) -> Poll> { self.poll_flush(cx) } } @@ -449,6 +505,7 @@ mod tests { rename_and_copy(&integration).await; copy_if_not_exists(&integration).await; stream_get(&integration).await; + put_opts(&integration, true).await; } #[tokio::test] 
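The hunks above thread the new PutOptions / PutMode API through every backend, and the race test added to lib.rs shows the intended compare-and-swap usage. As a minimal standalone sketch (not part of the patch), assuming the in-memory store together with the conversions this commit introduces (From<PutMode> for PutOptions and From<PutResult> for UpdateVersion), a create-then-conditionally-update flow would look roughly like:

```
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{Error, ObjectStore, PutMode, UpdateVersion};

#[tokio::main]
async fn main() -> object_store::Result<()> {
    let store = InMemory::new();
    let path = Path::from("example/data");

    // PutMode::Create succeeds only if nothing exists at the path yet.
    let v1 = store
        .put_opts(&path, "a".into(), PutMode::Create.into())
        .await?;

    // A second create must fail with AlreadyExists.
    let err = store
        .put_opts(&path, "b".into(), PutMode::Create.into())
        .await
        .unwrap_err();
    assert!(matches!(err, Error::AlreadyExists { .. }));

    // PutMode::Update overwrites only while the object still matches v1;
    // if another writer got there first, Error::Precondition is returned.
    store
        .put_opts(&path, "c".into(), PutMode::Update(UpdateVersion::from(v1)).into())
        .await?;

    Ok(())
}
```

Against S3 the same calls additionally require opting in via AmazonS3Builder::with_conditional_put(S3ConditionalPut::ETagMatch), since without it Create and Update return Error::NotImplemented; Azure and GCS map the modes onto If-None-Match/If-Match and x-goog-if-generation-match respectively, as the client changes in this commit show.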
diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index b5bff8b12dd7..68101307fbdf 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -23,7 +23,8 @@ use tokio::io::AsyncWrite; use crate::path::Path; use crate::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, Result, + GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, + Result, }; #[doc(hidden)] @@ -85,6 +86,11 @@ impl ObjectStore for PrefixStore { self.inner.put(&full_path, bytes).await } + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + let full_path = self.full_path(location); + self.inner.put_opts(&full_path, bytes, opts).await + } + async fn put_multipart( &self, location: &Path, diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index c5521256b8a6..dcd2c04bcf05 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -21,7 +21,8 @@ use std::ops::Range; use std::{convert::TryInto, sync::Arc}; use crate::{ - path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutResult, Result, + path::Path, GetResult, GetResultPayload, ListResult, ObjectMeta, ObjectStore, PutOptions, + PutResult, Result, }; use crate::{GetOptions, MultipartId}; use async_trait::async_trait; @@ -149,10 +150,14 @@ impl std::fmt::Display for ThrottledStore { impl ObjectStore for ThrottledStore { async fn put(&self, location: &Path, bytes: Bytes) -> Result { sleep(self.config().wait_put_per_call).await; - self.inner.put(location, bytes).await } + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + sleep(self.config().wait_put_per_call).await; + self.inner.put_opts(location, bytes, opts).await + } + async fn put_multipart( &self, _location: &Path, diff --git a/object_store/tests/get_range_file.rs b/object_store/tests/get_range_file.rs index 3fa1cc7104b3..85231a5a5b9b 100644 --- a/object_store/tests/get_range_file.rs +++ b/object_store/tests/get_range_file.rs @@ -22,9 +22,7 @@ use bytes::Bytes; use futures::stream::BoxStream; use object_store::local::LocalFileSystem; use object_store::path::Path; -use object_store::{ - GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, PutResult, -}; +use object_store::*; use std::fmt::Formatter; use tempfile::tempdir; use tokio::io::AsyncWrite; @@ -40,50 +38,42 @@ impl std::fmt::Display for MyStore { #[async_trait] impl ObjectStore for MyStore { - async fn put(&self, path: &Path, data: Bytes) -> object_store::Result { - self.0.put(path, data).await + async fn put_opts(&self, path: &Path, data: Bytes, opts: PutOptions) -> Result { + self.0.put_opts(path, data, opts).await } async fn put_multipart( &self, _: &Path, - ) -> object_store::Result<(MultipartId, Box)> { + ) -> Result<(MultipartId, Box)> { todo!() } - async fn abort_multipart(&self, _: &Path, _: &MultipartId) -> object_store::Result<()> { + async fn abort_multipart(&self, _: &Path, _: &MultipartId) -> Result<()> { todo!() } - async fn get_opts( - &self, - location: &Path, - options: GetOptions, - ) -> object_store::Result { + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { self.0.get_opts(location, options).await } - async fn head(&self, _: &Path) -> object_store::Result { - todo!() - } - - async fn delete(&self, _: &Path) -> object_store::Result<()> { + async fn delete(&self, _: &Path) -> Result<()> { todo!() } - fn list(&self, _: Option<&Path>) -> BoxStream<'_, 
object_store::Result> { + fn list(&self, _: Option<&Path>) -> BoxStream<'_, Result> { todo!() } - async fn list_with_delimiter(&self, _: Option<&Path>) -> object_store::Result { + async fn list_with_delimiter(&self, _: Option<&Path>) -> Result { todo!() } - async fn copy(&self, _: &Path, _: &Path) -> object_store::Result<()> { + async fn copy(&self, _: &Path, _: &Path) -> Result<()> { todo!() } - async fn copy_if_not_exists(&self, _: &Path, _: &Path) -> object_store::Result<()> { + async fn copy_if_not_exists(&self, _: &Path, _: &Path) -> Result<()> { todo!() } } From e4bb1e9ec0b6d957da1358bed954b7ca19a76337 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 30 Oct 2023 11:09:33 +0000 Subject: [PATCH 1313/1411] Support list_with_offset for GCS (#4993) --- object_store/src/gcp/client.rs | 6 ++++-- object_store/src/gcp/mod.rs | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/object_store/src/gcp/client.rs b/object_store/src/gcp/client.rs index 78964077e2fe..e4b0f9af7d15 100644 --- a/object_store/src/gcp/client.rs +++ b/object_store/src/gcp/client.rs @@ -472,8 +472,6 @@ impl ListClient for GoogleCloudStorageClient { page_token: Option<&str>, offset: Option<&str>, ) -> Result<(ListResult, Option)> { - assert!(offset.is_none()); // Not yet supported - let credential = self.get_credential().await?; let url = format!("{}/{}", self.config.base_url, self.bucket_name_encoded); @@ -495,6 +493,10 @@ impl ListClient for GoogleCloudStorageClient { query.push(("max-keys", max_results)) } + if let Some(offset) = offset { + query.push(("start-after", offset)) + } + let response = self .client .request(Method::GET, url) diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 7721b1278a80..11fa68310a2e 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -147,6 +147,14 @@ impl ObjectStore for GoogleCloudStorage { self.client.list(prefix) } + fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> BoxStream<'_, Result> { + self.client.list_with_offset(prefix, offset) + } + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { self.client.list_with_delimiter(prefix).await } From 11b2f5fecc257d97005f2393ee17777ed5d38e7c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 30 Oct 2023 11:30:35 +0000 Subject: [PATCH 1314/1411] Object tagging (#4754) (#4999) * Object tagging (#4754) * Allow disabling tagging * Rename to disable_tagging --- object_store/src/aws/builder.rs | 22 ++++++++ object_store/src/aws/client.rs | 23 +++++++++ object_store/src/aws/mod.rs | 16 +++++- object_store/src/azure/builder.rs | 22 ++++++++ object_store/src/azure/client.rs | 27 +++++++++- object_store/src/azure/mod.rs | 7 +++ object_store/src/lib.rs | 85 ++++++++++++++++++++++++++++++- object_store/src/tags.rs | 60 ++++++++++++++++++++++ 8 files changed, 258 insertions(+), 4 deletions(-) create mode 100644 object_store/src/tags.rs diff --git a/object_store/src/aws/builder.rs b/object_store/src/aws/builder.rs index 79ea75b5aba2..cf9490d96eae 100644 --- a/object_store/src/aws/builder.rs +++ b/object_store/src/aws/builder.rs @@ -155,6 +155,8 @@ pub struct AmazonS3Builder { copy_if_not_exists: Option>, /// Put precondition conditional_put: Option>, + /// Ignore tags + disable_tagging: ConfigValue, } /// Configuration keys for [`AmazonS3Builder`] @@ -299,6 +301,15 @@ pub enum AmazonS3ConfigKey { /// Skip signing request 
SkipSignature, + /// Disable tagging objects + /// + /// This can be desirable if not supported by the backing store + /// + /// Supported keys: + /// - `aws_disable_tagging` + /// - `disable_tagging` + DisableTagging, + /// Client options Client(ClientConfigKey), } @@ -322,6 +333,7 @@ impl AsRef for AmazonS3ConfigKey { Self::SkipSignature => "aws_skip_signature", Self::CopyIfNotExists => "aws_copy_if_not_exists", Self::ConditionalPut => "aws_conditional_put", + Self::DisableTagging => "aws_disable_tagging", Self::Client(opt) => opt.as_ref(), } } @@ -350,6 +362,7 @@ impl FromStr for AmazonS3ConfigKey { "aws_skip_signature" | "skip_signature" => Ok(Self::SkipSignature), "aws_copy_if_not_exists" | "copy_if_not_exists" => Ok(Self::CopyIfNotExists), "aws_conditional_put" | "conditional_put" => Ok(Self::ConditionalPut), + "aws_disable_tagging" | "disable_tagging" => Ok(Self::DisableTagging), // Backwards compatibility "aws_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), _ => match s.parse() { @@ -453,6 +466,7 @@ impl AmazonS3Builder { self.client_options = self.client_options.with_config(key, value) } AmazonS3ConfigKey::SkipSignature => self.skip_signature.parse(value), + AmazonS3ConfigKey::DisableTagging => self.disable_tagging.parse(value), AmazonS3ConfigKey::CopyIfNotExists => { self.copy_if_not_exists = Some(ConfigValue::Deferred(value.into())) } @@ -525,6 +539,7 @@ impl AmazonS3Builder { AmazonS3ConfigKey::ConditionalPut => { self.conditional_put.as_ref().map(ToString::to_string) } + AmazonS3ConfigKey::DisableTagging => Some(self.disable_tagging.to_string()), } } @@ -735,6 +750,12 @@ impl AmazonS3Builder { self } + /// If set to `true` will ignore any tags provided to put_opts + pub fn with_disable_tagging(mut self, ignore: bool) -> Self { + self.disable_tagging = ignore.into(); + self + } + /// Create a [`AmazonS3`] instance from the provided values, /// consuming `self`. 
pub fn build(mut self) -> Result { @@ -851,6 +872,7 @@ impl AmazonS3Builder { client_options: self.client_options, sign_payload: !self.unsigned_payload.get()?, skip_signature: self.skip_signature.get()?, + disable_tagging: self.disable_tagging.get()?, checksum, copy_if_not_exists, conditional_put: put_precondition, diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 20c2a96b57cd..3e47abd4bcc5 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -204,6 +204,7 @@ pub struct S3Config { pub client_options: ClientOptions, pub sign_payload: bool, pub skip_signature: bool, + pub disable_tagging: bool, pub checksum: Option, pub copy_if_not_exists: Option, pub conditional_put: Option, @@ -588,6 +589,28 @@ impl S3Client { version, }) } + + #[cfg(test)] + pub async fn get_object_tagging(&self, path: &Path) -> Result { + let credential = self.config.get_credential().await?; + let url = format!("{}?tagging", self.config.path_url(path)); + let response = self + .client + .request(Method::GET, url) + .with_aws_sigv4( + credential.as_deref(), + &self.config.region, + "s3", + self.config.sign_payload, + None, + ) + .send_retry(&self.config.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + Ok(response) + } } #[async_trait] diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 99e637695059..cbb3cffdf494 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -35,7 +35,7 @@ use async_trait::async_trait; use bytes::Bytes; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; -use reqwest::header::{IF_MATCH, IF_NONE_MATCH}; +use reqwest::header::{HeaderName, IF_MATCH, IF_NONE_MATCH}; use reqwest::Method; use std::{sync::Arc, time::Duration}; use tokio::io::AsyncWrite; @@ -52,6 +52,8 @@ use crate::{ PutOptions, PutResult, Result, }; +static TAGS_HEADER: HeaderName = HeaderName::from_static("x-amz-tagging"); + mod builder; mod checksum; mod client; @@ -160,7 +162,12 @@ impl Signer for AmazonS3 { #[async_trait] impl ObjectStore for AmazonS3 { async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { - let request = self.client.put_request(location, bytes); + let mut request = self.client.put_request(location, bytes); + let tags = opts.tags.encoded(); + if !tags.is_empty() && !self.client.config().disable_tagging { + request = request.header(&TAGS_HEADER, tags); + } + match (opts.mode, &self.client.config().conditional_put) { (PutMode::Overwrite, _) => request.send().await, (PutMode::Create | PutMode::Update(_), None) => Err(Error::NotImplemented), @@ -342,6 +349,11 @@ mod tests { stream_get(&integration).await; multipart(&integration, &integration).await; + tagging(&integration, !config.disable_tagging, |p| { + let client = Arc::clone(&integration.client); + async move { client.get_object_tagging(&p).await } + }) + .await; if test_not_exists { copy_if_not_exists(&integration).await; } diff --git a/object_store/src/azure/builder.rs b/object_store/src/azure/builder.rs index 02e0762b6de9..6bd2b265b521 100644 --- a/object_store/src/azure/builder.rs +++ b/object_store/src/azure/builder.rs @@ -173,6 +173,8 @@ pub struct MicrosoftAzureBuilder { /// /// i.e. 
https://{account_name}.dfs.fabric.microsoft.com use_fabric_endpoint: ConfigValue, + /// When set to true, skips tagging objects + disable_tagging: ConfigValue, } /// Configuration keys for [`MicrosoftAzureBuilder`] @@ -321,6 +323,15 @@ pub enum AzureConfigKey { /// - `container_name` ContainerName, + /// Disables tagging objects + /// + /// This can be desirable if not supported by the backing store + /// + /// Supported keys: + /// - `azure_disable_tagging` + /// - `disable_tagging` + DisableTagging, + /// Client options Client(ClientConfigKey), } @@ -344,6 +355,7 @@ impl AsRef for AzureConfigKey { Self::FederatedTokenFile => "azure_federated_token_file", Self::UseAzureCli => "azure_use_azure_cli", Self::ContainerName => "azure_container_name", + Self::DisableTagging => "azure_disable_tagging", Self::Client(key) => key.as_ref(), } } @@ -387,6 +399,7 @@ impl FromStr for AzureConfigKey { "azure_use_fabric_endpoint" | "use_fabric_endpoint" => Ok(Self::UseFabricEndpoint), "azure_use_azure_cli" | "use_azure_cli" => Ok(Self::UseAzureCli), "azure_container_name" | "container_name" => Ok(Self::ContainerName), + "azure_disable_tagging" | "disable_tagging" => Ok(Self::DisableTagging), // Backwards compatibility "azure_allow_http" => Ok(Self::Client(ClientConfigKey::AllowHttp)), _ => match s.parse() { @@ -503,6 +516,7 @@ impl MicrosoftAzureBuilder { self.client_options = self.client_options.with_config(key, value) } AzureConfigKey::ContainerName => self.container_name = Some(value.into()), + AzureConfigKey::DisableTagging => self.disable_tagging.parse(value), }; self } @@ -556,6 +570,7 @@ impl MicrosoftAzureBuilder { AzureConfigKey::UseAzureCli => Some(self.use_azure_cli.to_string()), AzureConfigKey::Client(key) => self.client_options.get_config_value(key), AzureConfigKey::ContainerName => self.container_name.clone(), + AzureConfigKey::DisableTagging => Some(self.disable_tagging.to_string()), } } @@ -781,6 +796,12 @@ impl MicrosoftAzureBuilder { self } + /// If set to `true` will ignore any tags provided to put_opts + pub fn with_disable_tagging(mut self, ignore: bool) -> Self { + self.disable_tagging = ignore.into(); + self + } + /// Configure a connection to container with given name on Microsoft Azure Blob store. 
pub fn build(mut self) -> Result { if let Some(url) = self.url.take() { @@ -885,6 +906,7 @@ impl MicrosoftAzureBuilder { account, is_emulator, container, + disable_tagging: self.disable_tagging.get()?, retry_config: self.retry_config, client_options: self.client_options, service: storage_url, diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index c7bd79149872..3c71e69da00c 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -50,6 +50,8 @@ use url::Url; const VERSION_HEADER: &str = "x-ms-version-id"; +static TAGS_HEADER: HeaderName = HeaderName::from_static("x-ms-tags"); + /// A specialized `Error` for object store-related errors #[derive(Debug, Snafu)] #[allow(missing_docs)] @@ -124,11 +126,12 @@ pub(crate) struct AzureConfig { pub retry_config: RetryConfig, pub service: Url, pub is_emulator: bool, + pub disable_tagging: bool, pub client_options: ClientOptions, } impl AzureConfig { - fn path_url(&self, path: &Path) -> Url { + pub(crate) fn path_url(&self, path: &Path) -> Url { let mut url = self.service.clone(); { let mut path_mut = url.path_segments_mut().unwrap(); @@ -229,6 +232,11 @@ impl AzureClient { } }; + let builder = match (opts.tags.encoded(), self.config.disable_tagging) { + ("", _) | (_, true) => builder, + (tags, false) => builder.header(&TAGS_HEADER, tags), + }; + let response = builder.header(&BLOB_TYPE, "BlockBlob").send().await?; Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) } @@ -315,6 +323,23 @@ impl AzureClient { Ok(()) } + + #[cfg(test)] + pub async fn get_blob_tagging(&self, path: &Path) -> Result { + let credential = self.get_credential().await?; + let url = self.config.path_url(path); + let response = self + .client + .request(Method::GET, url) + .query(&[("comp", "tags")]) + .with_azure_authorization(&credential, &self.config.account) + .send_retry(&self.config.retry_config) + .await + .context(GetRequestSnafu { + path: path.as_ref(), + })?; + Ok(response) + } } #[async_trait] diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 762a51dd9d60..1d51cbdc02dc 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -202,6 +202,13 @@ mod tests { stream_get(&integration).await; put_opts(&integration, true).await; multipart(&integration, &integration).await; + + let validate = !integration.client.config().disable_tagging; + tagging(&integration, validate, |p| { + let client = Arc::clone(&integration.client); + async move { client.get_blob_tagging(&p).await } + }) + .await } #[test] diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 66964304e853..51203ca4a4b2 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -263,6 +263,10 @@ pub use client::{ #[cfg(feature = "cloud")] mod config; +mod tags; + +pub use tags::TagSet; + pub mod multipart; mod parse; mod util; @@ -893,11 +897,27 @@ impl From for UpdateVersion { pub struct PutOptions { /// Configure the [`PutMode`] for this operation pub mode: PutMode, + /// Provide a [`TagSet`] for this object + /// + /// Implementations that don't support object tagging should ignore this + pub tags: TagSet, } impl From for PutOptions { fn from(mode: PutMode) -> Self { - Self { mode } + Self { + mode, + ..Default::default() + } + } +} + +impl From for PutOptions { + fn from(tags: TagSet) -> Self { + Self { + tags, + ..Default::default() + } } } @@ -1015,6 +1035,7 @@ mod tests { use chrono::TimeZone; use futures::stream::FuturesUnordered; use 
rand::{thread_rng, Rng}; + use std::future::Future; use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { @@ -1882,6 +1903,68 @@ mod tests { assert_eq!(meta.size, chunk_size * 2); } + #[cfg(any(feature = "aws", feature = "azure"))] + pub(crate) async fn tagging(storage: &dyn ObjectStore, validate: bool, get_tags: F) + where + F: Fn(Path) -> Fut + Send + Sync, + Fut: Future> + Send, + { + use bytes::Buf; + use serde::Deserialize; + + #[derive(Deserialize)] + struct Tagging { + #[serde(rename = "TagSet")] + list: TagList, + } + + #[derive(Debug, Deserialize)] + struct TagList { + #[serde(rename = "Tag")] + tags: Vec, + } + + #[derive(Debug, Deserialize, Eq, PartialEq)] + #[serde(rename_all = "PascalCase")] + struct Tag { + key: String, + value: String, + } + + let tags = vec![ + Tag { + key: "foo.com=bar/s".to_string(), + value: "bananas/foo.com-_".to_string(), + }, + Tag { + key: "namespace/key.foo".to_string(), + value: "value with a space".to_string(), + }, + ]; + let mut tag_set = TagSet::default(); + for t in &tags { + tag_set.push(&t.key, &t.value) + } + + let path = Path::from("tag_test"); + storage + .put_opts(&path, "test".into(), tag_set.into()) + .await + .unwrap(); + + // Write should always succeed, but certain configurations may simply ignore tags + if !validate { + return; + } + + let resp = get_tags(path.clone()).await.unwrap(); + let body = resp.bytes().await.unwrap(); + + let mut resp: Tagging = quick_xml::de::from_reader(body.reader()).unwrap(); + resp.list.tags.sort_by(|a, b| a.key.cmp(&b.key)); + assert_eq!(resp.list.tags, tags); + } + async fn delete_fixtures(storage: &DynObjectStore) { let paths = storage.list(None).map_ok(|meta| meta.location).boxed(); storage diff --git a/object_store/src/tags.rs b/object_store/src/tags.rs new file mode 100644 index 000000000000..fa6e5913f4b1 --- /dev/null +++ b/object_store/src/tags.rs @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use url::form_urlencoded::Serializer; + +/// A collection of key value pairs used to annotate objects +/// +/// +/// +#[derive(Debug, Clone, Default, Eq, PartialEq)] +pub struct TagSet(String); + +impl TagSet { + /// Append a key value pair to this [`TagSet`] + /// + /// Stores have different restrictions on what characters are permitted, + /// for portability it is recommended applications use no more than 10 tags, + /// and stick to alphanumeric characters, and `+ - = . 
_ : /` + /// + /// + /// + pub fn push(&mut self, key: &str, value: &str) { + Serializer::new(&mut self.0).append_pair(key, value); + } + + /// Return this [`TagSet`] as a URL-encoded string + pub fn encoded(&self) -> &str { + &self.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tag_set() { + let mut set = TagSet::default(); + set.push("test/foo", "value sdlks"); + set.push("foo", " sdf _ /+./sd"); + assert_eq!( + set.encoded(), + "test%2Ffoo=value+sdlks&foo=+sdf+_+%2F%2B.%2Fsd" + ); + } +} From 890823b6bcb9e43c9b8eacd6f21f5f6165ef1376 Mon Sep 17 00:00:00 2001 From: Andre Martins <38951957+amartins23@users.noreply.github.com> Date: Mon, 30 Oct 2023 11:32:24 +0000 Subject: [PATCH 1315/1411] feat(flight-sql): Allow custom commands in get-flight-info (#4997) --- arrow-flight/src/sql/server.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index 14ab7d81b4f3..f1656aca882a 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -225,6 +225,18 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { )) } + /// Implementors may override to handle additional calls to get_flight_info() + async fn get_flight_info_fallback( + &self, + cmd: Command, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented(format!( + "get_flight_info: The defined request is invalid: {}", + cmd.type_url() + ))) + } + // do_get /// Get a FlightDataStream containing the query results. @@ -616,10 +628,7 @@ where Command::CommandGetXdbcTypeInfo(token) => { self.get_flight_info_xdbc_type_info(token, request).await } - cmd => Err(Status::unimplemented(format!( - "get_flight_info: The defined request is invalid: {}", - cmd.type_url() - ))), + cmd => self.get_flight_info_fallback(cmd, request).await, } } From d9aaa437ca4ebf5a3500c865272243612862c7d4 Mon Sep 17 00:00:00 2001 From: Joseph Rance <56409230+Joseph-Rance@users.noreply.github.com> Date: Mon, 30 Oct 2023 11:40:34 +0000 Subject: [PATCH 1316/1411] Add `RecordReader` trait and proc macro to implement it for a struct (#4773) * add and implement RecordReader trait for rust structs * Fix typo in comment * run cargo fmt * partially solve issues raised in review * remove references * change interface to use vectors * change interface to use vectors in as well * update comments * remove intitialisation requirement * prevent conflicts with existing default implementation * update documentation * run cargo fmt * change writer back to slice * change 'Handle' back to 'Derive' for RecordWriter macro in readme --------- Co-authored-by: joseph rance --- parquet/src/record/mod.rs | 2 + parquet/src/record/record_reader.rs | 30 +++ parquet/src/record/record_writer.rs | 4 + parquet_derive/README.md | 51 ++++- parquet_derive/src/lib.rs | 88 +++++++- parquet_derive/src/parquet_field.rs | 338 ++++++++++++++++++++++++++-- parquet_derive_test/src/lib.rs | 70 +++++- 7 files changed, 553 insertions(+), 30 deletions(-) create mode 100644 parquet/src/record/record_reader.rs diff --git a/parquet/src/record/mod.rs b/parquet/src/record/mod.rs index 771d8058c9c1..f40e91418da1 100644 --- a/parquet/src/record/mod.rs +++ b/parquet/src/record/mod.rs @@ -19,6 +19,7 @@ mod api; pub mod reader; +mod record_reader; mod record_writer; mod triplet; @@ -26,5 +27,6 @@ pub use self::{ api::{ Field, List, ListAccessor, Map, MapAccessor, Row, RowAccessor, RowColumnIter, RowFormatter, }, + record_reader::RecordReader, 
record_writer::RecordWriter, }; diff --git a/parquet/src/record/record_reader.rs b/parquet/src/record/record_reader.rs new file mode 100644 index 000000000000..bcfeb95dcdf4 --- /dev/null +++ b/parquet/src/record/record_reader.rs @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use super::super::errors::ParquetError; +use super::super::file::reader::RowGroupReader; + +/// read up to `max_records` records from `row_group_reader` into `self` +/// The type parameter `T` is used to work around the rust orphan rule +/// when implementing on types such as `Vec`. +pub trait RecordReader { + fn read_from_row_group( + &mut self, + row_group_reader: &mut dyn RowGroupReader, + num_records: usize, + ) -> Result<(), ParquetError>; +} diff --git a/parquet/src/record/record_writer.rs b/parquet/src/record/record_writer.rs index 62099051f513..0b2b95ef7dea 100644 --- a/parquet/src/record/record_writer.rs +++ b/parquet/src/record/record_writer.rs @@ -20,6 +20,10 @@ use crate::schema::types::TypePtr; use super::super::errors::ParquetError; use super::super::file::writer::SerializedRowGroupWriter; +/// `write_to_row_group` writes from `self` into `row_group_writer` +/// `schema` builds the schema used by `row_group_writer` +/// The type parameter `T` is used to work around the rust orphan rule +/// when implementing on types such as `&[T]`. pub trait RecordWriter { fn write_to_row_group( &self, diff --git a/parquet_derive/README.md b/parquet_derive/README.md index b20721079c2d..c267a92430e0 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -19,9 +19,9 @@ # Parquet Derive -A crate for deriving `RecordWriter` for arbitrary, _simple_ structs. This does not generate writers for arbitrarily nested -structures. It only works for primitives and a few generic structures and -various levels of reference. Please see features checklist for what is currently +A crate for deriving `RecordWriter` and `RecordReader` for arbitrary, _simple_ structs. This does not +generate readers or writers for arbitrarily nested structures. It only works for primitives and a few +generic structures and various levels of reference. Please see features checklist for what is currently supported. Derive also has some support for the chrono time library. You must must enable the `chrono` feature to get this support. 
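As an editorial aside on the chrono support mentioned above: the sketch below mirrors the `APartiallyCompleteRecord` struct exercised by `parquet_derive_test` later in this patch. The struct and field names here (`Event`, `id`, `occurred_at`, `day`, `payload`) are illustrative only, and the snippet assumes the `chrono` feature described above is enabled; it is not part of the committed diff.

```rust
use parquet_derive::{ParquetRecordReader, ParquetRecordWriter};

// Illustrative sketch (hypothetical names): chrono values round-trip through
// the derived writer/reader as plain parquet physical types.
#[derive(ParquetRecordWriter, ParquetRecordReader, Debug, PartialEq)]
struct Event {
    pub id: i32,
    // Written by the derived writer as an INT64 millisecond timestamp.
    pub occurred_at: chrono::NaiveDateTime,
    // Written as an INT32 count of days since 1970-01-01.
    pub day: chrono::NaiveDate,
    // Byte buffers are written as BYTE_ARRAY values.
    pub payload: Vec<u8>,
}
```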
@@ -77,16 +77,55 @@ writer.close_row_group(row_group).unwrap(); writer.close().unwrap(); ``` +Example usage of deriving a `RecordReader` for your struct: + +```rust +use parquet::file::{serialized_reader::SerializedFileReader, reader::FileReader}; +use parquet_derive::ParquetRecordReader; + +#[derive(ParquetRecordReader)] +struct ACompleteRecord { + pub a_bool: bool, + pub a_string: String, + pub i16: i16, + pub i32: i32, + pub u64: u64, + pub isize: isize, + pub float: f32, + pub double: f64, + pub now: chrono::NaiveDateTime, + pub byte_vec: Vec, +} + +// Initialize your parquet file +let reader = SerializedFileReader::new(file).unwrap(); +let mut row_group = reader.get_row_group(0).unwrap(); + +// create your records vector to read into +let mut chunks: Vec = Vec::new(); + +// The derived `RecordReader` takes over here +chunks.read_from_row_group(&mut *row_group, 1).unwrap(); +``` + ## Features - [x] Support writing `String`, `&str`, `bool`, `i32`, `f32`, `f64`, `Vec` - [ ] Support writing dictionaries - [x] Support writing logical types like timestamp -- [x] Derive definition_levels for `Option` -- [ ] Derive definition levels for nested structures +- [x] Derive definition_levels for `Option` for writing +- [ ] Derive definition levels for nested structures for writing - [ ] Derive writing tuple struct - [ ] Derive writing `tuple` container types +- [x] Support reading `String`, `&str`, `bool`, `i32`, `f32`, `f64`, `Vec` +- [ ] Support reading/writing dictionaries +- [x] Support reading/writing logical types like timestamp +- [ ] Handle definition_levels for `Option` for reading +- [ ] Handle definition levels for nested structures for reading +- [ ] Derive reading/writing tuple struct +- [ ] Derive reading/writing `tuple` container types + ## Requirements - Same as `parquet-rs` @@ -103,4 +142,4 @@ To compile and view in the browser, run `cargo doc --no-deps --open`. ## License -Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0. +Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0. \ No newline at end of file diff --git a/parquet_derive/src/lib.rs b/parquet_derive/src/lib.rs index c6641cd8091d..671a46db0f31 100644 --- a/parquet_derive/src/lib.rs +++ b/parquet_derive/src/lib.rs @@ -44,7 +44,7 @@ mod parquet_field; /// use parquet::file::writer::SerializedFileWriter; /// /// use std::sync::Arc; -// +/// /// #[derive(ParquetRecordWriter)] /// struct ACompleteRecord<'a> { /// pub a_bool: bool, @@ -137,3 +137,89 @@ pub fn parquet_record_writer(input: proc_macro::TokenStream) -> proc_macro::Toke } }).into() } + +/// Derive flat, simple RecordReader implementations. Works by parsing +/// a struct tagged with `#[derive(ParquetRecordReader)]` and emitting +/// the correct writing code for each field of the struct. Column readers +/// are generated in the order they are defined. +/// +/// It is up to the programmer to keep the order of the struct +/// fields lined up with the schema. 
+/// +/// Example: +/// +/// ```ignore +/// use parquet::file::{serialized_reader::SerializedFileReader, reader::FileReader}; +/// use parquet_derive::{ParquetRecordReader}; +/// +/// #[derive(ParquetRecordReader)] +/// struct ACompleteRecord { +/// pub a_bool: bool, +/// pub a_string: String, +/// } +/// +/// pub fn read_some_records() -> Vec { +/// let mut samples: Vec = Vec::new(); +/// +/// let reader = SerializedFileReader::new(file).unwrap(); +/// let mut row_group = reader.get_row_group(0).unwrap(); +/// samples.read_from_row_group(&mut *row_group, 1).unwrap(); +/// samples +/// } +/// ``` +/// +#[proc_macro_derive(ParquetRecordReader)] +pub fn parquet_record_reader(input: proc_macro::TokenStream) -> proc_macro::TokenStream { + let input: DeriveInput = parse_macro_input!(input as DeriveInput); + let fields = match input.data { + Data::Struct(DataStruct { fields, .. }) => fields, + Data::Enum(_) => unimplemented!("Enum currently is not supported"), + Data::Union(_) => unimplemented!("Union currently is not supported"), + }; + + let field_infos: Vec<_> = fields.iter().map(parquet_field::Field::from).collect(); + let field_names: Vec<_> = fields.iter().map(|f| f.ident.clone()).collect(); + let reader_snippets: Vec = + field_infos.iter().map(|x| x.reader_snippet()).collect(); + let i: Vec<_> = (0..reader_snippets.len()).collect(); + + let derived_for = input.ident; + let generics = input.generics; + + (quote! { + + impl #generics ::parquet::record::RecordReader<#derived_for #generics> for Vec<#derived_for #generics> { + fn read_from_row_group( + &mut self, + row_group_reader: &mut dyn ::parquet::file::reader::RowGroupReader, + num_records: usize, + ) -> Result<(), ::parquet::errors::ParquetError> { + use ::parquet::column::reader::ColumnReader; + + let mut row_group_reader = row_group_reader; + + for _ in 0..num_records { + self.push(#derived_for { + #( + #field_names: Default::default() + ),* + }) + } + + let records = self; // Used by all the reader snippets to be more clear + + #( + { + if let Ok(mut column_reader) = row_group_reader.get_column_reader(#i) { + #reader_snippets + } else { + return Err(::parquet::errors::ParquetError::General("Failed to get next column".into())) + } + } + );* + + Ok(()) + } + } + }).into() +} diff --git a/parquet_derive/src/parquet_field.rs b/parquet_derive/src/parquet_field.rs index e629bfe757ab..0ac95c2864e5 100644 --- a/parquet_derive/src/parquet_field.rs +++ b/parquet_derive/src/parquet_field.rs @@ -219,6 +219,72 @@ impl Field { } } + /// Takes the parsed field of the struct and emits a valid + /// column reader snippet. Should match exactly what you + /// would write by hand. + /// + /// Can only generate writers for basic structs, for example: + /// + /// struct Record { + /// a_bool: bool + /// } + /// + /// but not + /// + /// struct UnsupportedNestedRecord { + /// a_property: bool, + /// nested_record: Record + /// } + /// + /// because this parsing logic is not sophisticated enough for definition + /// levels beyond 2. + /// + /// `Option` types and references not supported + pub fn reader_snippet(&self) -> proc_macro2::TokenStream { + let ident = &self.ident; + let column_reader = self.ty.column_reader(); + let parquet_type = self.ty.physical_type_as_rust(); + + // generate the code to read the column into a vector `vals` + let write_batch_expr = quote! 
{ + let mut vals_vec = Vec::new(); + vals_vec.resize(num_records, Default::default()); + let mut vals: &mut [#parquet_type] = vals_vec.as_mut_slice(); + if let #column_reader(mut typed) = column_reader { + typed.read_records(num_records, None, None, vals)?; + } else { + panic!("Schema and struct disagree on type for {}", stringify!{#ident}); + } + }; + + // generate the code to convert each element of `vals` to the correct type and then write + // it to its field in the corresponding struct + let vals_writer = match &self.ty { + Type::TypePath(_) => self.copied_direct_fields(), + Type::Reference(_, ref first_type) => match **first_type { + Type::TypePath(_) => self.copied_direct_fields(), + Type::Slice(ref second_type) => match **second_type { + Type::TypePath(_) => self.copied_direct_fields(), + ref f => unimplemented!("Unsupported: {:#?}", f), + }, + ref f => unimplemented!("Unsupported: {:#?}", f), + }, + Type::Vec(ref first_type) => match **first_type { + Type::TypePath(_) => self.copied_direct_fields(), + ref f => unimplemented!("Unsupported: {:#?}", f), + }, + f => unimplemented!("Unsupported: {:#?}", f), + }; + + quote! { + { + #write_batch_expr + + #vals_writer + } + } + } + pub fn parquet_type(&self) -> proc_macro2::TokenStream { // TODO: Support group types // TODO: Add length if dealing with fixedlenbinary @@ -319,27 +385,31 @@ impl Field { } } + // generates code to read `field_name` from each record into a vector `vals` fn copied_direct_vals(&self) -> proc_macro2::TokenStream { let field_name = &self.ident; - let is_a_byte_buf = self.is_a_byte_buf; - let is_a_timestamp = self.third_party_type == Some(ThirdPartyType::ChronoNaiveDateTime); - let is_a_date = self.third_party_type == Some(ThirdPartyType::ChronoNaiveDate); - let is_a_uuid = self.third_party_type == Some(ThirdPartyType::Uuid); - let access = if is_a_timestamp { - quote! { rec.#field_name.timestamp_millis() } - } else if is_a_date { - quote! { rec.#field_name.signed_duration_since(::chrono::NaiveDate::from_ymd(1970, 1, 1)).num_days() as i32 } - } else if is_a_uuid { - quote! { (&rec.#field_name.to_string()[..]).into() } - } else if is_a_byte_buf { - quote! { (&rec.#field_name[..]).into() } - } else { - // Type might need converting to a physical type - match self.ty.physical_type() { - parquet::basic::Type::INT32 => quote! { rec.#field_name as i32 }, - parquet::basic::Type::INT64 => quote! { rec.#field_name as i64 }, - _ => quote! { rec.#field_name }, + let access = match self.third_party_type { + Some(ThirdPartyType::ChronoNaiveDateTime) => { + quote! { rec.#field_name.timestamp_millis() } + } + Some(ThirdPartyType::ChronoNaiveDate) => { + quote! { rec.#field_name.signed_duration_since(::chrono::NaiveDate::from_ymd(1970, 1, 1)).num_days() as i32 } + } + Some(ThirdPartyType::Uuid) => { + quote! { (&rec.#field_name.to_string()[..]).into() } + } + _ => { + if self.is_a_byte_buf { + quote! { (&rec.#field_name[..]).into() } + } else { + // Type might need converting to a physical type + match self.ty.physical_type() { + parquet::basic::Type::INT32 => quote! { rec.#field_name as i32 }, + parquet::basic::Type::INT64 => quote! { rec.#field_name as i64 }, + _ => quote! { rec.#field_name }, + } + } } }; @@ -348,6 +418,48 @@ impl Field { } } + // generates code to read a vector `records` into `field_name` for each record + fn copied_direct_fields(&self) -> proc_macro2::TokenStream { + let field_name = &self.ident; + + let value = match self.third_party_type { + Some(ThirdPartyType::ChronoNaiveDateTime) => { + quote! 
{ ::chrono::naive::NaiveDateTime::from_timestamp_millis(vals[i]).unwrap() } + } + Some(ThirdPartyType::ChronoNaiveDate) => { + quote! { + ::chrono::naive::NaiveDate::from_num_days_from_ce_opt(vals[i] + + ((::chrono::naive::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap() + .signed_duration_since( + ::chrono::naive::NaiveDate::from_ymd_opt(0, 12, 31).unwrap() + ) + ).num_days()) as i32).unwrap() + } + } + Some(ThirdPartyType::Uuid) => { + quote! { ::uuid::Uuid::parse_str(vals[i].data().convert()).unwrap() } + } + _ => match &self.ty { + Type::TypePath(_) => match self.ty.last_part().as_str() { + "String" => quote! { String::from(std::str::from_utf8(vals[i].data()) + .expect("invalid UTF-8 sequence")) }, + t => { + let s: proc_macro2::TokenStream = t.parse().unwrap(); + quote! { vals[i] as #s } + } + }, + Type::Vec(_) => quote! { vals[i].data().to_vec() }, + f => unimplemented!("Unsupported: {:#?}", f), + }, + }; + + quote! { + for (i, r) in &mut records[..num_records].iter_mut().enumerate() { + r.#field_name = #value; + } + } + } + fn optional_definition_levels(&self) -> proc_macro2::TokenStream { let field_name = &self.ident; @@ -396,6 +508,29 @@ impl Type { } } + /// Takes a rust type and returns the appropriate + /// parquet-rs column reader + fn column_reader(&self) -> syn::TypePath { + use parquet::basic::Type as BasicType; + + match self.physical_type() { + BasicType::BOOLEAN => { + syn::parse_quote!(ColumnReader::BoolColumnReader) + } + BasicType::INT32 => syn::parse_quote!(ColumnReader::Int32ColumnReader), + BasicType::INT64 => syn::parse_quote!(ColumnReader::Int64ColumnReader), + BasicType::INT96 => syn::parse_quote!(ColumnReader::Int96ColumnReader), + BasicType::FLOAT => syn::parse_quote!(ColumnReader::FloatColumnReader), + BasicType::DOUBLE => syn::parse_quote!(ColumnReader::DoubleColumnReader), + BasicType::BYTE_ARRAY => { + syn::parse_quote!(ColumnReader::ByteArrayColumnReader) + } + BasicType::FIXED_LEN_BYTE_ARRAY => { + syn::parse_quote!(ColumnReader::FixedLenByteArrayColumnReader) + } + } + } + /// Helper to simplify a nested field definition to its leaf type /// /// Ex: @@ -515,6 +650,23 @@ impl Type { } } + fn physical_type_as_rust(&self) -> proc_macro2::TokenStream { + use parquet::basic::Type as BasicType; + + match self.physical_type() { + BasicType::BOOLEAN => quote! { bool }, + BasicType::INT32 => quote! { i32 }, + BasicType::INT64 => quote! { i64 }, + BasicType::INT96 => unimplemented!("96-bit int currently is not supported"), + BasicType::FLOAT => quote! { f32 }, + BasicType::DOUBLE => quote! { f64 }, + BasicType::BYTE_ARRAY => quote! { ::parquet::data_type::ByteArray }, + BasicType::FIXED_LEN_BYTE_ARRAY => { + quote! { ::parquet::data_type::FixedLenByteArray } + } + } + } + fn logical_type(&self) -> proc_macro2::TokenStream { let last_part = self.last_part(); let leaf_type = self.leaf_type_recursive(); @@ -713,6 +865,39 @@ mod test { ) } + #[test] + fn test_generating_a_simple_reader_snippet() { + let snippet: proc_macro2::TokenStream = quote! { + struct ABoringStruct { + counter: usize, + } + }; + + let fields = extract_fields(snippet); + let counter = Field::from(&fields[0]); + + let snippet = counter.reader_snippet().to_string(); + assert_eq!( + snippet, + (quote! 
{ + { + let mut vals_vec = Vec::new(); + vals_vec.resize(num_records, Default::default()); + let mut vals: &mut[i64] = vals_vec.as_mut_slice(); + if let ColumnReader::Int64ColumnReader(mut typed) = column_reader { + typed.read_records(num_records, None, None, vals)?; + } else { + panic!("Schema and struct disagree on type for {}", stringify!{ counter }); + } + for (i, r) in &mut records[..num_records].iter_mut().enumerate() { + r.counter = vals[i] as usize; + } + } + }) + .to_string() + ) + } + #[test] fn test_optional_to_writer_snippet() { let struct_def: proc_macro2::TokenStream = quote! { @@ -822,6 +1007,32 @@ mod test { ); } + #[test] + fn test_converting_to_column_reader_type() { + let snippet: proc_macro2::TokenStream = quote! { + struct ABasicStruct { + yes_no: bool, + name: String, + } + }; + + let fields = extract_fields(snippet); + let processed: Vec<_> = fields.iter().map(Field::from).collect(); + + let column_readers: Vec<_> = processed + .iter() + .map(|field| field.ty.column_reader()) + .collect(); + + assert_eq!( + column_readers, + vec![ + syn::parse_quote!(ColumnReader::BoolColumnReader), + syn::parse_quote!(ColumnReader::ByteArrayColumnReader) + ] + ); + } + #[test] fn convert_basic_struct() { let snippet: proc_macro2::TokenStream = quote! { @@ -995,7 +1206,7 @@ mod test { } #[test] - fn test_chrono_timestamp_millis() { + fn test_chrono_timestamp_millis_write() { let snippet: proc_macro2::TokenStream = quote! { struct ATimestampStruct { henceforth: chrono::NaiveDateTime, @@ -1038,7 +1249,34 @@ mod test { } #[test] - fn test_chrono_date() { + fn test_chrono_timestamp_millis_read() { + let snippet: proc_macro2::TokenStream = quote! { + struct ATimestampStruct { + henceforth: chrono::NaiveDateTime, + } + }; + + let fields = extract_fields(snippet); + let when = Field::from(&fields[0]); + assert_eq!(when.reader_snippet().to_string(),(quote!{ + { + let mut vals_vec = Vec::new(); + vals_vec.resize(num_records, Default::default()); + let mut vals: &mut[i64] = vals_vec.as_mut_slice(); + if let ColumnReader::Int64ColumnReader(mut typed) = column_reader { + typed.read_records(num_records, None, None, vals)?; + } else { + panic!("Schema and struct disagree on type for {}", stringify!{ henceforth }); + } + for (i, r) in &mut records[..num_records].iter_mut().enumerate() { + r.henceforth = ::chrono::naive::NaiveDateTime::from_timestamp_millis(vals[i]).unwrap(); + } + } + }).to_string()); + } + + #[test] + fn test_chrono_date_write() { let snippet: proc_macro2::TokenStream = quote! { struct ATimestampStruct { henceforth: chrono::NaiveDate, @@ -1081,7 +1319,38 @@ mod test { } #[test] - fn test_uuid() { + fn test_chrono_date_read() { + let snippet: proc_macro2::TokenStream = quote! 
{ + struct ATimestampStruct { + henceforth: chrono::NaiveDate, + } + }; + + let fields = extract_fields(snippet); + let when = Field::from(&fields[0]); + assert_eq!(when.reader_snippet().to_string(),(quote!{ + { + let mut vals_vec = Vec::new(); + vals_vec.resize(num_records, Default::default()); + let mut vals: &mut [i32] = vals_vec.as_mut_slice(); + if let ColumnReader::Int32ColumnReader(mut typed) = column_reader { + typed.read_records(num_records, None, None, vals)?; + } else { + panic!("Schema and struct disagree on type for {}", stringify!{ henceforth }); + } + for (i, r) in &mut records[..num_records].iter_mut().enumerate() { + r.henceforth = ::chrono::naive::NaiveDate::from_num_days_from_ce_opt(vals[i] + + ((::chrono::naive::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap() + .signed_duration_since( + ::chrono::naive::NaiveDate::from_ymd_opt(0, 12, 31).unwrap() + )).num_days()) as i32).unwrap(); + } + } + }).to_string()); + } + + #[test] + fn test_uuid_write() { let snippet: proc_macro2::TokenStream = quote! { struct AUuidStruct { unique_id: uuid::Uuid, @@ -1123,6 +1392,33 @@ mod test { }).to_string()); } + #[test] + fn test_uuid_read() { + let snippet: proc_macro2::TokenStream = quote! { + struct AUuidStruct { + unique_id: uuid::Uuid, + } + }; + + let fields = extract_fields(snippet); + let when = Field::from(&fields[0]); + assert_eq!(when.reader_snippet().to_string(),(quote!{ + { + let mut vals_vec = Vec::new(); + vals_vec.resize(num_records, Default::default()); + let mut vals: &mut [::parquet::data_type::ByteArray] = vals_vec.as_mut_slice(); + if let ColumnReader::ByteArrayColumnReader(mut typed) = column_reader { + typed.read_records(num_records, None, None, vals)?; + } else { + panic!("Schema and struct disagree on type for {}", stringify!{ unique_id }); + } + for (i, r) in &mut records[..num_records].iter_mut().enumerate() { + r.unique_id = ::uuid::Uuid::parse_str(vals[i].data().convert()).unwrap(); + } + } + }).to_string()); + } + #[test] fn test_converted_type() { let snippet: proc_macro2::TokenStream = quote! 
{ diff --git a/parquet_derive_test/src/lib.rs b/parquet_derive_test/src/lib.rs index d377fb0a62af..a8b631ecc024 100644 --- a/parquet_derive_test/src/lib.rs +++ b/parquet_derive_test/src/lib.rs @@ -17,7 +17,7 @@ #![allow(clippy::approx_constant)] -use parquet_derive::ParquetRecordWriter; +use parquet_derive::{ParquetRecordReader, ParquetRecordWriter}; #[derive(ParquetRecordWriter)] struct ACompleteRecord<'a> { @@ -49,6 +49,21 @@ struct ACompleteRecord<'a> { pub borrowed_maybe_borrowed_byte_vec: &'a Option<&'a [u8]>, } +#[derive(PartialEq, ParquetRecordWriter, ParquetRecordReader, Debug)] +struct APartiallyCompleteRecord { + pub bool: bool, + pub string: String, + pub i16: i16, + pub i32: i32, + pub u64: u64, + pub isize: isize, + pub float: f32, + pub double: f64, + pub now: chrono::NaiveDateTime, + pub date: chrono::NaiveDate, + pub byte_vec: Vec, +} + #[cfg(test)] mod tests { use super::*; @@ -56,7 +71,8 @@ mod tests { use std::{env, fs, io::Write, sync::Arc}; use parquet::{ - file::writer::SerializedFileWriter, record::RecordWriter, + file::writer::SerializedFileWriter, + record::{RecordReader, RecordWriter}, schema::parser::parse_message_type, }; @@ -147,6 +163,56 @@ mod tests { writer.close().unwrap(); } + #[test] + fn test_parquet_derive_read_write_combined() { + let file = get_temp_file("test_parquet_derive_combined", &[]); + + let mut drs: Vec = vec![APartiallyCompleteRecord { + bool: true, + string: "a string".into(), + i16: -45, + i32: 456, + u64: 4563424, + isize: -365, + float: 3.5, + double: std::f64::NAN, + now: chrono::Utc::now().naive_local(), + date: chrono::naive::NaiveDate::from_ymd_opt(2015, 3, 14).unwrap(), + byte_vec: vec![0x65, 0x66, 0x67], + }]; + + let mut out: Vec = Vec::new(); + + use parquet::file::{reader::FileReader, serialized_reader::SerializedFileReader}; + + let generated_schema = drs.as_slice().schema().unwrap(); + + let props = Default::default(); + let mut writer = + SerializedFileWriter::new(file.try_clone().unwrap(), generated_schema, props).unwrap(); + + let mut row_group = writer.next_row_group().unwrap(); + drs.as_slice().write_to_row_group(&mut row_group).unwrap(); + row_group.close().unwrap(); + writer.close().unwrap(); + + let reader = SerializedFileReader::new(file).unwrap(); + + let mut row_group = reader.get_row_group(0).unwrap(); + out.read_from_row_group(&mut *row_group, 1).unwrap(); + + // correct for rounding error when writing milliseconds + drs[0].now = + chrono::naive::NaiveDateTime::from_timestamp_millis(drs[0].now.timestamp_millis()) + .unwrap(); + + assert!(out[0].double.is_nan()); // these three lines are necessary because NAN != NAN + out[0].double = 0.; + drs[0].double = 0.; + + assert_eq!(drs[0], out[0]); + } + /// Returns file handle for a temp file in 'target' directory with a provided content pub fn get_temp_file(file_name: &str, content: &[u8]) -> fs::File { // build tmp path to a file in "target/debug/testdata" From cc23cacd12703ffd604b6ca52715f52b409e0659 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 30 Oct 2023 12:02:17 +0000 Subject: [PATCH 1317/1411] Improve object_store docs (#4978) * Improve object_store docs * Document configuration system * Review feedback --- object_store/src/lib.rs | 285 +++++++++++++++++++++++++++++++++++----- 1 file changed, 250 insertions(+), 35 deletions(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 51203ca4a4b2..69db9d97bc2c 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -38,13 +38,18 
@@ //! //! # Highlights //! -//! 1. A focused, easy to use, idiomatic, well documented, high -//! performance, `async` API. +//! 1. A high-performance async API focused on providing a consistent interface +//! mirroring that of object stores such as [S3] //! //! 2. Production quality, leading this crate to be used in large -//! scale production systems, such as [crates.io] and [InfluxDB IOx]. +//! scale production systems, such as [crates.io] and [InfluxDB IOx] //! -//! 3. Stable and predictable governance via the [Apache Arrow] project. +//! 3. Support for advanced functionality, including atomic, conditional reads +//! and writes, vectored IO, bulk deletion, and more... +//! +//! 4. Stable and predictable governance via the [Apache Arrow] project +//! +//! 5. Small dependency footprint, depending on only a small number of common crates //! //! Originally developed for [InfluxDB IOx] and subsequently donated //! to [Apache Arrow]. @@ -52,6 +57,8 @@ //! [Apache Arrow]: https://arrow.apache.org/ //! [InfluxDB IOx]: https://github.com/influxdata/influxdb_iox/ //! [crates.io]: https://github.com/rust-lang/crates.io +//! [ACID]: https://en.wikipedia.org/wiki/ACID +//! [S3]: https://aws.amazon.com/s3/ //! //! # Available [`ObjectStore`] Implementations //! @@ -79,6 +86,23 @@ doc = "* [`http`]: [HTTP/WebDAV Storage](https://datatracker.ietf.org/doc/html/rfc2518). See [`HttpBuilder`](http::HttpBuilder)" )] //! +//! # Why not a Filesystem Interface? +//! +//! Whilst this crate does provide a [`BufReader`], the [`ObjectStore`] interface mirrors the APIs +//! of object stores and not filesystems, opting to provide stateless APIs instead of the cursor +//! based interfaces such as [`Read`] or [`Seek`] favoured by filesystems. +//! +//! This provides some compelling advantages: +//! +//! * Except where explicitly stated otherwise, operations are atomic, and readers +//! cannot observe partial and/or failed writes +//! * Methods map directly to object store APIs, providing both efficiency and predictability +//! * Abstracts away filesystem and operating system specific quirks, ensuring portability +//! * Allows for functionality not native to filesystems, such as operation preconditions +//! and atomic multipart uploads +//! +//! [`BufReader`]: buffered::BufReader +//! //! # Adapters //! //! [`ObjectStore`] instances can be composed with various adapters @@ -87,8 +111,43 @@ //! * Rate Throttling: [`ThrottleConfig`](throttle::ThrottleConfig) //! * Concurrent Request Limit: [`LimitStore`](limit::LimitStore) //! +//! # Configuration System +//! +//! This crate provides a configuration system inspired by the APIs exposed by [fsspec], +//! [PyArrow FileSystem], and [Hadoop FileSystem], allowing creating a [`DynObjectStore`] +//! from a URL and an optional list of key value pairs. This provides a flexible interface +//! to support a wide variety of user-defined store configurations, with minimal additional +//! application complexity. +//! +//! ```no_run +//! # use url::Url; +//! # use object_store::{parse_url, parse_url_opts}; +//! # use object_store::aws::{AmazonS3, AmazonS3Builder}; +//! # +//! # +//! // Can manually create a specific store variant using the appropriate builder +//! let store: AmazonS3 = AmazonS3Builder::from_env() +//! .with_bucket_name("my-bucket").build().unwrap(); //! -//! # List objects: +//! // Alternatively can create an ObjectStore from an S3 URL +//! let url = Url::parse("s3://bucket/path").unwrap(); +//! let (store, path) = parse_url(&url).unwrap(); +//! 
assert_eq!(path.as_ref(), "path"); +//! +//! // Potentially with additional options +//! let (store, path) = parse_url_opts(&url, vec![("aws_access_key_id", "...")]).unwrap(); +//! +//! // Or with URLs that encode the bucket name in the URL path +//! let url = Url::parse("https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket/path").unwrap(); +//! let (store, path) = parse_url(&url).unwrap(); +//! assert_eq!(path.as_ref(), "path"); +//! ``` +//! +//! [PyArrow FileSystem]: https://arrow.apache.org/docs/python/generated/pyarrow.fs.FileSystem.html#pyarrow.fs.FileSystem.from_uri +//! [fsspec]: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem +//! [Hadoop FileSystem]: https://hadoop.apache.org/docs/r3.0.0/api/org/apache/hadoop/fs/FileSystem.html#get-java.net.URI-org.apache.hadoop.conf.Configuration- +//! +//! # List objects //! //! Use the [`ObjectStore::list`] method to iterate over objects in //! remote storage or files in the local filesystem: @@ -111,7 +170,7 @@ //! // Recursively list all files below the 'data' path. //! // 1. On AWS S3 this would be the 'data/' prefix //! // 2. On a local filesystem, this would be the 'data' directory -//! let prefix: Path = "data".try_into().unwrap(); +//! let prefix = Path::from("data"); //! //! // Get an `async` stream of Metadata objects: //! let mut list_stream = object_store.list(Some(&prefix)); @@ -141,25 +200,34 @@ //! # use futures::TryStreamExt; //! # use object_store::local::LocalFileSystem; //! # use std::sync::Arc; -//! # use object_store::{path::Path, ObjectStore}; +//! # use bytes::Bytes; +//! # use object_store::{path::Path, ObjectStore, GetResult}; //! # fn get_object_store() -> Arc { //! # Arc::new(LocalFileSystem::new()) //! # } //! # //! # async fn example() { //! # -//! // create an ObjectStore +//! // Create an ObjectStore //! let object_store: Arc = get_object_store(); //! //! // Retrieve a specific file -//! let path: Path = "data/file01.parquet".try_into().unwrap(); +//! let path = Path::from("data/file01.parquet"); +//! +//! // Fetch just the file metadata +//! let meta = object_store.head(&path).await.unwrap(); +//! println!("{meta:?}"); +//! +//! // Fetch the object including metadata +//! let result: GetResult = object_store.get(&path).await.unwrap(); +//! assert_eq!(result.meta, meta); +//! +//! // Buffer the entire object in memory +//! let object: Bytes = result.bytes().await.unwrap(); +//! assert_eq!(object.len(), meta.size); //! -//! // fetch the bytes from object store -//! let stream = object_store -//! .get(&path) -//! .await -//! .unwrap() -//! .into_stream(); +//! // Alternatively stream the bytes from object storage +//! let stream = object_store.get(&path).await.unwrap().into_stream(); //! //! // Count the '0's using `try_fold` from `TryStreamExt` trait //! let num_zeros = stream @@ -171,13 +239,9 @@ //! # } //! ``` //! -//! Which will print out something like the following: +//! # Put Object //! -//! ```text -//! Num zeros in data/file01.parquet is 657 -//! ``` -//! # Put object -//! Use the [`ObjectStore::put`] method to save data in remote storage or local filesystem. +//! Use the [`ObjectStore::put`] method to atomically write data. //! //! ``` //! # use object_store::local::LocalFileSystem; @@ -190,15 +254,17 @@ //! # } //! # async fn put() { //! # -//! let object_store: Arc = get_object_store(); -//! let path: Path = "data/file1".try_into().unwrap(); -//! let bytes = Bytes::from_static(b"hello"); -//! object_store.put(&path, bytes).await.unwrap(); +//! 
let object_store: Arc = get_object_store(); +//! let path = Path::from("data/file1"); +//! let bytes = Bytes::from_static(b"hello"); +//! object_store.put(&path, bytes).await.unwrap(); //! # } //! ``` //! -//! # Multipart put object -//! Use the [`ObjectStore::put_multipart`] method to save large amount of data in chunks. +//! # Multipart Upload +//! +//! Use the [`ObjectStore::put_multipart`] method to atomically write a large amount of data, +//! with implementations automatically handling parallel, chunked upload where appropriate. //! //! ``` //! # use object_store::local::LocalFileSystem; @@ -212,16 +278,165 @@ //! # } //! # async fn multi_upload() { //! # -//! let object_store: Arc = get_object_store(); -//! let path: Path = "data/large_file".try_into().unwrap(); -//! let (_id, mut writer) = object_store.put_multipart(&path).await.unwrap(); -//! -//! let bytes = Bytes::from_static(b"hello"); -//! writer.write_all(&bytes).await.unwrap(); -//! writer.flush().await.unwrap(); -//! writer.shutdown().await.unwrap(); +//! let object_store: Arc = get_object_store(); +//! let path = Path::from("data/large_file"); +//! let (_id, mut writer) = object_store.put_multipart(&path).await.unwrap(); +//! +//! let bytes = Bytes::from_static(b"hello"); +//! writer.write_all(&bytes).await.unwrap(); +//! writer.flush().await.unwrap(); +//! writer.shutdown().await.unwrap(); //! # } //! ``` +//! +//! # Vectored Read +//! +//! A common pattern, especially when reading structured datasets, is to need to fetch +//! multiple, potentially non-contiguous, ranges of a particular object. +//! +//! [`ObjectStore::get_ranges`] provides an efficient way to perform such vectored IO, and will +//! automatically coalesce adjacent ranges into an appropriate number of parallel requests. +//! +//! ``` +//! # use object_store::local::LocalFileSystem; +//! # use object_store::ObjectStore; +//! # use std::sync::Arc; +//! # use bytes::Bytes; +//! # use tokio::io::AsyncWriteExt; +//! # use object_store::path::Path; +//! # fn get_object_store() -> Arc { +//! # Arc::new(LocalFileSystem::new()) +//! # } +//! # async fn multi_upload() { +//! # +//! let object_store: Arc = get_object_store(); +//! let path = Path::from("data/large_file"); +//! let ranges = object_store.get_ranges(&path, &[90..100, 400..600, 0..10]).await.unwrap(); +//! assert_eq!(ranges.len(), 3); +//! assert_eq!(ranges[0].len(), 10); +//! # } +//! ``` +//! +//! # Conditional Fetch +//! +//! More complex object retrieval can be supported by [`ObjectStore::get_opts`]. +//! +//! For example, efficiently refreshing a cache without re-fetching the entire object +//! data if the object hasn't been modified. +//! +//! ``` +//! # use std::collections::btree_map::Entry; +//! # use std::collections::HashMap; +//! # use object_store::{GetOptions, GetResult, ObjectStore, Result, Error}; +//! # use std::sync::Arc; +//! # use std::time::{Duration, Instant}; +//! # use bytes::Bytes; +//! # use tokio::io::AsyncWriteExt; +//! # use object_store::path::Path; +//! struct CacheEntry { +//! /// Data returned by last request +//! data: Bytes, +//! /// ETag identifying the object returned by the server +//! e_tag: String, +//! /// Instant of last refresh +//! refreshed_at: Instant, +//! } +//! +//! /// Example cache that checks entries after 10 seconds for a new version +//! struct Cache { +//! entries: HashMap, +//! store: Arc, +//! } +//! +//! impl Cache { +//! pub async fn get(&mut self, path: &Path) -> Result { +//! Ok(match self.entries.get_mut(path) { +//! 
Some(e) => match e.refreshed_at.elapsed() < Duration::from_secs(10) { +//! true => e.data.clone(), // Return cached data +//! false => { // Check if remote version has changed +//! let opts = GetOptions { +//! if_none_match: Some(e.e_tag.clone()), +//! ..GetOptions::default() +//! }; +//! match self.store.get_opts(&path, opts).await { +//! Ok(d) => e.data = d.bytes().await?, +//! Err(Error::NotModified { .. }) => {} // Data has not changed +//! Err(e) => return Err(e), +//! }; +//! e.refreshed_at = Instant::now(); +//! e.data.clone() +//! } +//! }, +//! None => { // Not cached, fetch data +//! let get = self.store.get(&path).await?; +//! let e_tag = get.meta.e_tag.clone(); +//! let data = get.bytes().await?; +//! if let Some(e_tag) = e_tag { +//! let entry = CacheEntry { +//! e_tag, +//! data: data.clone(), +//! refreshed_at: Instant::now(), +//! }; +//! self.entries.insert(path.clone(), entry); +//! } +//! data +//! } +//! }) +//! } +//! } +//! ``` +//! +//! # Conditional Put +//! +//! The default behaviour when writing data is to upsert any existing object at the given path, +//! overwriting any previous value. More complex behaviours can be achieved using [`PutMode`], and +//! can be used to build [Optimistic Concurrency Control] based transactions. This facilitates +//! building metadata catalogs, such as [Apache Iceberg] or [Delta Lake], directly on top of object +//! storage, without relying on a separate DBMS. +//! +//! ``` +//! # use object_store::{Error, ObjectStore, PutMode, UpdateVersion}; +//! # use std::sync::Arc; +//! # use bytes::Bytes; +//! # use tokio::io::AsyncWriteExt; +//! # use object_store::memory::InMemory; +//! # use object_store::path::Path; +//! # fn get_object_store() -> Arc { +//! # Arc::new(InMemory::new()) +//! # } +//! # fn do_update(b: Bytes) -> Bytes {b} +//! # async fn conditional_put() { +//! let store = get_object_store(); +//! let path = Path::from("test"); +//! +//! // Perform a conditional update on path +//! loop { +//! // Perform get request +//! let r = store.get(&path).await.unwrap(); +//! +//! // Save version information fetched +//! let version = UpdateVersion { +//! e_tag: r.meta.e_tag.clone(), +//! version: r.meta.version.clone(), +//! }; +//! +//! // Compute new version of object contents +//! let new = do_update(r.bytes().await.unwrap()); +//! +//! // Attempt to commit transaction +//! match store.put_opts(&path, new, PutMode::Update(version).into()).await { +//! Ok(_) => break, // Successfully committed +//! Err(Error::Precondition { .. }) => continue, // Object has changed, try again +//! Err(e) => panic!("{e}") +//! } +//! } +//! # } +//! ``` +//! +//! [Optimistic Concurrency Control]: https://en.wikipedia.org/wiki/Optimistic_concurrency_control +//! [Apache Iceberg]: https://iceberg.apache.org/ +//! [Delta Lake]: https://delta.io/ +//! #[cfg(all( target_arch = "wasm32", From be093cb44243a23b637754e06b13ccb4de2f6512 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 30 Oct 2023 20:27:51 +0000 Subject: [PATCH 1318/1411] Simplify datetime conversion (#5006) * Simplify datetime conversion * Update test * Review feedback --- parquet_derive/src/parquet_field.rs | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/parquet_derive/src/parquet_field.rs b/parquet_derive/src/parquet_field.rs index 0ac95c2864e5..bb33b3196855 100644 --- a/parquet_derive/src/parquet_field.rs +++ b/parquet_derive/src/parquet_field.rs @@ -427,13 +427,9 @@ impl Field { quote! 
{ ::chrono::naive::NaiveDateTime::from_timestamp_millis(vals[i]).unwrap() } } Some(ThirdPartyType::ChronoNaiveDate) => { + // NaiveDateTime::UNIX_EPOCH.num_days_from_ce() == 719163 quote! { - ::chrono::naive::NaiveDate::from_num_days_from_ce_opt(vals[i] - + ((::chrono::naive::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap() - .signed_duration_since( - ::chrono::naive::NaiveDate::from_ymd_opt(0, 12, 31).unwrap() - ) - ).num_days()) as i32).unwrap() + ::chrono::naive::NaiveDate::from_num_days_from_ce_opt(vals[i].saturating_add(719163)).unwrap() } } Some(ThirdPartyType::Uuid) => { @@ -1339,11 +1335,7 @@ mod test { panic!("Schema and struct disagree on type for {}", stringify!{ henceforth }); } for (i, r) in &mut records[..num_records].iter_mut().enumerate() { - r.henceforth = ::chrono::naive::NaiveDate::from_num_days_from_ce_opt(vals[i] - + ((::chrono::naive::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap() - .signed_duration_since( - ::chrono::naive::NaiveDate::from_ymd_opt(0, 12, 31).unwrap() - )).num_days()) as i32).unwrap(); + r.henceforth = ::chrono::naive::NaiveDate::from_num_days_from_ce_opt(vals[i].saturating_add(719163)).unwrap(); } } }).to_string()); From 65f7be856099d389b0d0eafa9be47fad25215ee6 Mon Sep 17 00:00:00 2001 From: Alex Sayers Date: Wed, 1 Nov 2023 05:12:08 +0900 Subject: [PATCH 1319/1411] Return row count when inferring schema from JSON (#5008) * Return row count when inferring schema from JSON * Add some unit tests for arrow-json's row-count --- arrow-json/src/reader/mod.rs | 6 +++--- arrow-json/src/reader/schema.rs | 33 ++++++++++++++++++++++++++------- arrow-json/src/writer.rs | 4 ++-- 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index 1225e51b3af7..28282c4d1541 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -1562,7 +1562,7 @@ mod tests { let file = File::open(path).unwrap(); let mut reader = BufReader::new(file); let schema = schema.unwrap_or_else(|| { - let schema = infer_json_schema(&mut reader, None).unwrap(); + let (schema, _) = infer_json_schema(&mut reader, None).unwrap(); reader.rewind().unwrap(); schema }); @@ -1939,7 +1939,7 @@ mod tests { fn test_with_multiple_batches() { let file = File::open("test/data/basic_nulls.json").unwrap(); let mut reader = BufReader::new(file); - let schema = infer_json_schema(&mut reader, None).unwrap(); + let (schema, _) = infer_json_schema(&mut reader, None).unwrap(); reader.rewind().unwrap(); let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(5); @@ -2079,7 +2079,7 @@ mod tests { fn test_json_iterator() { let file = File::open("test/data/basic.json").unwrap(); let mut reader = BufReader::new(file); - let schema = infer_json_schema(&mut reader, None).unwrap(); + let (schema, _) = infer_json_schema(&mut reader, None).unwrap(); reader.rewind().unwrap(); let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(5); diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs index 58aa08014daa..97f1a0f29594 100644 --- a/arrow-json/src/reader/schema.rs +++ b/arrow-json/src/reader/schema.rs @@ -209,6 +209,8 @@ impl Iterator for ValueIter { /// /// If `max_read_records` is not set, the whole file is read to infer its field types. /// +/// Returns inferred schema and number of records read. +/// /// Contrary to [`infer_json_schema`], this function will seek back to the start of the `reader`. /// That way, the `reader` can be used immediately afterwards to create a [`Reader`]. 
/// @@ -229,7 +231,7 @@ impl Iterator for ValueIter { pub fn infer_json_schema_from_seekable( mut reader: R, max_read_records: Option, -) -> Result { +) -> Result<(Schema, usize), ArrowError> { let schema = infer_json_schema(&mut reader, max_read_records); // return the reader seek back to the start reader.rewind()?; @@ -242,6 +244,8 @@ pub fn infer_json_schema_from_seekable( /// /// If `max_read_records` is not set, the whole file is read to infer its field types. /// +/// Returns inferred schema and number of records read. +/// /// This function will not seek back to the start of the `reader`. The user has to manage the /// original file's cursor. This function is useful when the `reader`'s cursor is not available /// (does not implement [`Seek`]), such is the case for compressed streams decoders. @@ -266,8 +270,10 @@ pub fn infer_json_schema_from_seekable( pub fn infer_json_schema( reader: R, max_read_records: Option, -) -> Result { - infer_json_schema_from_iterator(ValueIter::new(reader, max_read_records)) +) -> Result<(Schema, usize), ArrowError> { + let mut values = ValueIter::new(reader, max_read_records); + let schema = infer_json_schema_from_iterator(&mut values)?; + Ok((schema, values.record_count)) } fn set_object_scalar_field_type( @@ -522,15 +528,28 @@ mod tests { ]); let mut reader = BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); - let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); + let (inferred_schema, n_rows) = infer_json_schema_from_seekable(&mut reader, None).unwrap(); assert_eq!(inferred_schema, schema); + assert_eq!(n_rows, 4); let file = File::open("test/data/mixed_arrays.json.gz").unwrap(); let mut reader = BufReader::new(GzDecoder::new(&file)); - let inferred_schema = infer_json_schema(&mut reader, None).unwrap(); + let (inferred_schema, n_rows) = infer_json_schema(&mut reader, None).unwrap(); assert_eq!(inferred_schema, schema); + assert_eq!(n_rows, 4); + } + + #[test] + fn test_row_limit() { + let mut reader = BufReader::new(File::open("test/data/basic.json").unwrap()); + + let (_, n_rows) = infer_json_schema_from_seekable(&mut reader, None).unwrap(); + assert_eq!(n_rows, 12); + + let (_, n_rows) = infer_json_schema_from_seekable(&mut reader, Some(5)).unwrap(); + assert_eq!(n_rows, 5); } #[test] @@ -640,7 +659,7 @@ mod tests { bigger_than_i64_max, smaller_than_i64_min ); let mut buf_reader = BufReader::new(json.as_bytes()); - let inferred_schema = infer_json_schema(&mut buf_reader, Some(1)).unwrap(); + let (inferred_schema, _) = infer_json_schema(&mut buf_reader, Some(1)).unwrap(); let fields = inferred_schema.fields(); let (_, big_field) = fields.find("bigger_than_i64_max").unwrap(); @@ -686,7 +705,7 @@ mod tests { {"in":null, "ni":2, "ns":"3", "sn":null, "n":null, "an":null, "na": [], "nas":["8"]} {"in":1, "ni":null, "ns":null, "sn":"4", "n":null, "an":[], "na": null, "nas":[]} "#; - let inferred_schema = + let (inferred_schema, _) = infer_json_schema_from_seekable(Cursor::new(data), None).expect("infer"); let schema = Schema::new(vec![ Field::new("an", list_type_of(DataType::Null), true), diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 97a8b38d4192..5ecfc932364b 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -1206,7 +1206,7 @@ mod tests { fn test_write_for_file(test_file: &str) { let file = File::open(test_file).unwrap(); let mut reader = BufReader::new(file); - let schema = infer_json_schema(&mut reader, None).unwrap(); + let (schema, _) = 
infer_json_schema(&mut reader, None).unwrap(); reader.rewind().unwrap(); let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(1024); @@ -1391,7 +1391,7 @@ mod tests { let test_file = "test/data/basic.json"; let file = File::open(test_file).unwrap(); let mut reader = BufReader::new(file); - let schema = infer_json_schema(&mut reader, None).unwrap(); + let (schema, _) = infer_json_schema(&mut reader, None).unwrap(); reader.rewind().unwrap(); let builder = ReaderBuilder::new(Arc::new(schema)).with_batch_size(1024); From 94fe6bb4b0dde6f00d8853e6bebefd6b55e3f965 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 1 Nov 2023 14:33:37 +0000 Subject: [PATCH 1320/1411] Remove ObjectStore::append (#5016) --- object_store/Cargo.toml | 5 -- object_store/src/lib.rs | 31 +-------- object_store/src/limit.rs | 7 -- object_store/src/local.rs | 126 ----------------------------------- object_store/src/memory.rs | 99 --------------------------- object_store/src/prefix.rs | 6 -- object_store/src/throttle.rs | 4 -- 7 files changed, 1 insertion(+), 277 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index cb820b509ada..c8cf4e280236 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -53,11 +53,6 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } ring = { version = "0.17", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } - -[target.'cfg(not(target_arch = "wasm32"))'.dependencies] -tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util", "fs"] } - -[target.'cfg(target_arch = "wasm32")'.dependencies] tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } [target.'cfg(target_family="unix")'.dev-dependencies] diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 69db9d97bc2c..1b94f816b1af 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -94,8 +94,7 @@ //! //! This provides some compelling advantages: //! -//! * Except where explicitly stated otherwise, operations are atomic, and readers -//! cannot observe partial and/or failed writes +//! * All operations are atomic, and readers cannot observe partial and/or failed writes //! * Methods map directly to object store APIs, providing both efficiency and predictability //! * Abstracts away filesystem and operating system specific quirks, ensuring portability //! * Allows for functionality not native to filesystems, such as operation preconditions @@ -559,30 +558,6 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// vary by object store. async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()>; - /// Returns an [`AsyncWrite`] that can be used to append to the object at `location` - /// - /// A new object will be created if it doesn't already exist, otherwise it will be - /// opened, with subsequent writes appended to the end. 
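With `append` gone, the removed doc text below still points callers at `put` and `put_multipart` as the portable alternatives. A minimal sketch of streaming a write through `put_multipart`, assuming the object_store 0.8 multipart API (the `(MultipartId, AsyncWrite)` form) and an illustrative path; it would be driven from a Tokio runtime.

```
use object_store::{memory::InMemory, path::Path, ObjectStore};
use tokio::io::AsyncWriteExt;

async fn streaming_write() -> Result<(), Box<dyn std::error::Error>> {
    let store = InMemory::new();
    let path = Path::from("data/file.bin");

    // Start a multipart upload and stream chunks into it.
    let (_id, mut writer) = store.put_multipart(&path).await?;
    writer.write_all(b"first chunk").await?;
    writer.write_all(b"second chunk").await?;

    // Finish the upload; the object only becomes visible once this completes.
    writer.shutdown().await?;
    Ok(())
}
```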
- /// - /// This operation cannot be supported by all stores, most use-cases should prefer - /// [`ObjectStore::put`] and [`ObjectStore::put_multipart`] for better portability - /// and stronger guarantees - /// - /// This API is not guaranteed to be atomic, in particular - /// - /// * On error, `location` may contain partial data - /// * Concurrent calls to [`ObjectStore::list`] may return partially written objects - /// * Concurrent calls to [`ObjectStore::get`] may return partially written data - /// * Concurrent calls to [`ObjectStore::put`] may result in data loss / corruption - /// * Concurrent calls to [`ObjectStore::append`] may result in data loss / corruption - /// - /// Additionally some stores, such as Azure, may only support appending to objects created - /// with [`ObjectStore::append`], and not with [`ObjectStore::put`], [`ObjectStore::copy`], or - /// [`ObjectStore::put_multipart`] - async fn append(&self, _location: &Path) -> Result> { - Err(Error::NotImplemented) - } - /// Return the bytes that are stored at the specified location. async fn get(&self, location: &Path) -> Result { self.get_opts(location, GetOptions::default()).await @@ -779,10 +754,6 @@ macro_rules! as_ref_impl { self.as_ref().abort_multipart(location, multipart_id).await } - async fn append(&self, location: &Path) -> Result> { - self.as_ref().append(location).await - } - async fn get(&self, location: &Path) -> Result { self.as_ref().get(location).await } diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index 39cc605c4768..d1363d9a4d46 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -94,13 +94,6 @@ impl ObjectStore for LimitStore { let _permit = self.semaphore.acquire().await.unwrap(); self.inner.abort_multipart(location, multipart_id).await } - - async fn append(&self, location: &Path) -> Result> { - let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); - let write = self.inner.append(location).await?; - Ok(Box::new(PermitWrapper::new(write, permit))) - } - async fn get(&self, location: &Path) -> Result { let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); let r = self.inner.get(location).await?; diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 919baf71b0a8..1a87dc33c7a3 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -350,45 +350,6 @@ impl ObjectStore for LocalFileSystem { .await } - async fn append(&self, location: &Path) -> Result> { - // Get the path to the file from the configuration. - let path = self.config.path_to_filesystem(location)?; - loop { - // Create new `OpenOptions`. - let mut options = tokio::fs::OpenOptions::new(); - - // Attempt to open the file with the given options. - match options - .truncate(false) - .append(true) - .create(true) - .open(&path) - .await - { - // If the file was successfully opened, return it wrapped in a boxed `AsyncWrite` trait object. - Ok(file) => return Ok(Box::new(file)), - // If the error is that the file was not found, attempt to create the file and any necessary parent directories. - Err(source) if source.kind() == ErrorKind::NotFound => { - // Get the path to the parent directory of the file. - let parent = path.parent().ok_or_else(|| Error::UnableToCreateFile { - path: path.to_path_buf(), - source, - })?; - - // Create the parent directory and any necessary ancestors. - tokio::fs::create_dir_all(parent) - .await - // If creating the directory fails, return a `UnableToCreateDirSnafu` error. 
- .context(UnableToCreateDirSnafu { path: parent })?; - // Try again to open the file. - continue; - } - // If any other error occurs, return a `UnableToOpenFile` error. - Err(source) => return Err(Error::UnableToOpenFile { source, path }.into()), - } - } - } - async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let location = location.clone(); let path = self.config.path_to_filesystem(&location)?; @@ -1449,97 +1410,10 @@ mod tests { mod not_wasm_tests { use crate::local::LocalFileSystem; use crate::{ObjectStore, Path}; - use bytes::Bytes; use std::time::Duration; use tempfile::TempDir; use tokio::io::AsyncWriteExt; - #[tokio::test] - async fn creates_dir_if_not_present_append() { - let root = TempDir::new().unwrap(); - let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - - let location = Path::from("nested/file/test_file"); - - let data = Bytes::from("arbitrary data"); - let expected_data = data.clone(); - - let mut writer = integration.append(&location).await.unwrap(); - - writer.write_all(data.as_ref()).await.unwrap(); - - writer.flush().await.unwrap(); - - let read_data = integration - .get(&location) - .await - .unwrap() - .bytes() - .await - .unwrap(); - assert_eq!(&*read_data, expected_data); - } - - #[tokio::test] - async fn unknown_length_append() { - let root = TempDir::new().unwrap(); - let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - - let location = Path::from("some_file"); - - let data = Bytes::from("arbitrary data"); - let expected_data = data.clone(); - let mut writer = integration.append(&location).await.unwrap(); - - writer.write_all(data.as_ref()).await.unwrap(); - writer.flush().await.unwrap(); - - let read_data = integration - .get(&location) - .await - .unwrap() - .bytes() - .await - .unwrap(); - assert_eq!(&*read_data, expected_data); - } - - #[tokio::test] - async fn multiple_append() { - let root = TempDir::new().unwrap(); - let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - - let location = Path::from("some_file"); - - let data = vec![ - Bytes::from("arbitrary"), - Bytes::from("data"), - Bytes::from("gnz"), - ]; - - let mut writer = integration.append(&location).await.unwrap(); - for d in &data { - writer.write_all(d).await.unwrap(); - } - writer.flush().await.unwrap(); - - let mut writer = integration.append(&location).await.unwrap(); - for d in &data { - writer.write_all(d).await.unwrap(); - } - writer.flush().await.unwrap(); - - let read_data = integration - .get(&location) - .await - .unwrap() - .bytes() - .await - .unwrap(); - let expected_data = Bytes::from("arbitrarydatagnzarbitrarydatagnz"); - assert_eq!(&*read_data, expected_data); - } - #[tokio::test] async fn test_cleanup_intermediate_files() { let root = TempDir::new().unwrap(); diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 9d79a798ad1f..382300123846 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -205,14 +205,6 @@ impl ObjectStore for InMemory { Ok(()) } - async fn append(&self, location: &Path) -> Result> { - Ok(Box::new(InMemoryAppend { - location: location.clone(), - data: Vec::::new(), - storage: SharedStorage::clone(&self.storage), - })) - } - async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let entry = self.entry(location).await?; let e_tag = entry.e_tag.to_string(); @@ -443,53 +435,8 @@ impl AsyncWrite for InMemoryUpload { } } -struct InMemoryAppend { - location: Path, - data: Vec, - storage: Arc>, -} - -impl 
AsyncWrite for InMemoryAppend { - fn poll_write( - mut self: Pin<&mut Self>, - _cx: &mut std::task::Context<'_>, - buf: &[u8], - ) -> Poll> { - self.data.extend_from_slice(buf); - Poll::Ready(Ok(buf.len())) - } - - fn poll_flush( - mut self: Pin<&mut Self>, - _cx: &mut std::task::Context<'_>, - ) -> Poll> { - let storage = Arc::clone(&self.storage); - - let mut writer = storage.write(); - - if let Some(entry) = writer.map.remove(&self.location) { - let buf = std::mem::take(&mut self.data); - let concat = Bytes::from_iter(entry.data.into_iter().chain(buf)); - writer.insert(&self.location, concat); - } else { - let data = Bytes::from(std::mem::take(&mut self.data)); - writer.insert(&self.location, data); - }; - Poll::Ready(Ok(())) - } - - fn poll_shutdown( - self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - self.poll_flush(cx) - } -} - #[cfg(test)] mod tests { - use tokio::io::AsyncWriteExt; - use super::*; use crate::tests::*; @@ -577,50 +524,4 @@ mod tests { panic!("unexpected error type: {err:?}"); } } - - #[tokio::test] - async fn test_append_new() { - let in_memory = InMemory::new(); - let location = Path::from("some_file"); - let data = Bytes::from("arbitrary data"); - let expected_data = data.clone(); - - let mut writer = in_memory.append(&location).await.unwrap(); - writer.write_all(&data).await.unwrap(); - writer.flush().await.unwrap(); - - let read_data = in_memory - .get(&location) - .await - .unwrap() - .bytes() - .await - .unwrap(); - assert_eq!(&*read_data, expected_data); - } - - #[tokio::test] - async fn test_append_existing() { - let in_memory = InMemory::new(); - let location = Path::from("some_file"); - let data = Bytes::from("arbitrary"); - let data_appended = Bytes::from(" data"); - let expected_data = Bytes::from("arbitrary data"); - - let mut writer = in_memory.append(&location).await.unwrap(); - writer.write_all(&data).await.unwrap(); - writer.flush().await.unwrap(); - - writer.write_all(&data_appended).await.unwrap(); - writer.flush().await.unwrap(); - - let read_data = in_memory - .get(&location) - .await - .unwrap() - .bytes() - .await - .unwrap(); - assert_eq!(&*read_data, expected_data); - } } diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index 68101307fbdf..38f9b07bbd05 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -103,12 +103,6 @@ impl ObjectStore for PrefixStore { let full_path = self.full_path(location); self.inner.abort_multipart(&full_path, multipart_id).await } - - async fn append(&self, location: &Path) -> Result> { - let full_path = self.full_path(location); - self.inner.append(&full_path).await - } - async fn get(&self, location: &Path) -> Result { let full_path = self.full_path(location); self.inner.get(&full_path).await diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index dcd2c04bcf05..252256a4599e 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -169,10 +169,6 @@ impl ObjectStore for ThrottledStore { Err(super::Error::NotImplemented) } - async fn append(&self, _location: &Path) -> Result> { - Err(super::Error::NotImplemented) - } - async fn get(&self, location: &Path) -> Result { sleep(self.config().wait_get_per_call).await; From ec788e15a99835376430b27617b1bb766709e05c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 1 Nov 2023 14:33:47 +0000 Subject: [PATCH 1321/1411] Decode URL paths (#5017) (#5018) --- object_store/src/parse.rs | 23 +++++++++++++++++++++-- 
1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/object_store/src/parse.rs b/object_store/src/parse.rs index 51993e245530..0fbc33c935d8 100644 --- a/object_store/src/parse.rs +++ b/object_store/src/parse.rs @@ -98,8 +98,7 @@ impl ObjectStoreScheme { _ => return Err(Error::Unrecognised { url: url.clone() }), }; - let path = Path::parse(path)?; - Ok((scheme, path)) + Ok((scheme, Path::from_url_path(path)?)) } } @@ -240,6 +239,18 @@ mod tests { ), ("http://mydomain/path", (ObjectStoreScheme::Http, "path")), ("https://mydomain/path", (ObjectStoreScheme::Http, "path")), + ( + "s3://bucket/foo%20bar", + (ObjectStoreScheme::AmazonS3, "foo bar"), + ), + ( + "https://foo/bar%20baz", + (ObjectStoreScheme::Http, "bar baz"), + ), + ( + "file:///bar%252Efoo", + (ObjectStoreScheme::Local, "bar%2Efoo"), + ), ]; for (s, (expected_scheme, expected_path)) in cases { @@ -260,4 +271,12 @@ mod tests { assert!(ObjectStoreScheme::parse(&url).is_err()); } } + + #[test] + fn test_url_spaces() { + let url = Url::parse("file:///my file with spaces").unwrap(); + assert_eq!(url.path(), "/my%20file%20with%20spaces"); + let (_, path) = parse_url(&url).unwrap(); + assert_eq!(path.as_ref(), "my file with spaces"); + } } From 78735002d99eb0212166924948f95554c4ac2866 Mon Sep 17 00:00:00 2001 From: kamille <34352236+Rachelint@users.noreply.github.com> Date: Thu, 2 Nov 2023 00:01:39 +0800 Subject: [PATCH 1322/1411] ObjectStore: make error msg thrown from retry more detailed (#5012) * optimize error msg for better debugging. * fix unit test. * fix fmt. --- object_store/src/client/retry.rs | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/object_store/src/client/retry.rs b/object_store/src/client/retry.rs index 789103c0f74f..08b9a74e17c5 100644 --- a/object_store/src/client/retry.rs +++ b/object_store/src/client/retry.rs @@ -39,9 +39,12 @@ pub enum Error { body: Option, }, - #[snafu(display("Error after {retries} retries: {source}"))] + #[snafu(display("Error after {retries} retries in {elapsed:?}, max_retries:{max_retries}, retry_timeout:{retry_timeout:?}, source:{source}"))] Reqwest { retries: usize, + max_retries: usize, + elapsed: Duration, + retry_timeout: Duration, source: reqwest::Error, }, } @@ -198,7 +201,6 @@ impl RetryExt for reqwest::RequestBuilder { } Err(e) => { let status = r.status(); - if retries == max_retries || now.elapsed() > retry_timeout || !status.is_server_error() { @@ -214,12 +216,18 @@ impl RetryExt for reqwest::RequestBuilder { Err(e) => { Error::Reqwest { retries, + max_retries, + elapsed: now.elapsed(), + retry_timeout, source: e, } } } false => Error::Reqwest { retries, + max_retries, + elapsed: now.elapsed(), + retry_timeout, source: e, } }); @@ -248,6 +256,9 @@ impl RetryExt for reqwest::RequestBuilder { return Err(Error::Reqwest { retries, + max_retries, + elapsed: now.elapsed(), + retry_timeout, source: e, }) } @@ -408,9 +419,8 @@ mod tests { let e = do_request().await.unwrap_err().to_string(); assert!( - e.starts_with( - "Error after 2 retries: HTTP status server error (502 Bad Gateway) for url" - ), + e.contains("Error after 2 retries in") && + e.contains("max_retries:2, retry_timeout:1000s, source:HTTP status server error (502 Bad Gateway) for url"), "{e}" ); @@ -425,7 +435,10 @@ mod tests { } let e = do_request().await.unwrap_err().to_string(); assert!( - e.starts_with("Error after 2 retries: error sending request for url"), + e.contains("Error after 2 retries in") + && e.contains( + "max_retries:2, retry_timeout:1000s, source:error 
sending request for url" + ), "{e}" ); From 7281a0c167554d5fa69055a3f3ee46108254c9c9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Nov 2023 10:27:53 +0000 Subject: [PATCH 1323/1411] Relax path safety (#5019) (#5020) * Relax path safety (#5019) * Review feedback * WASM --- object_store/src/lib.rs | 17 ++++ object_store/src/local.rs | 174 +++++++++++++++++++++++++-------- object_store/src/path/mod.rs | 59 +++++------ object_store/src/path/parts.rs | 23 ++--- 4 files changed, 184 insertions(+), 89 deletions(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 1b94f816b1af..cdd572dd9b3a 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -1442,6 +1442,23 @@ mod tests { storage.delete(&path).await.unwrap(); + // Test handling of unicode paths + let path = Path::parse("🇦🇺/$shenanigans@@~.txt").unwrap(); + storage.put(&path, "test".into()).await.unwrap(); + + let r = storage.get(&path).await.unwrap(); + assert_eq!(r.bytes().await.unwrap(), "test"); + + let dir = Path::parse("🇦🇺").unwrap(); + let r = storage.list_with_delimiter(None).await.unwrap(); + assert!(r.common_prefixes.contains(&dir)); + + let r = storage.list_with_delimiter(Some(&dir)).await.unwrap(); + assert_eq!(r.objects.len(), 1); + assert_eq!(r.objects[0].location, path); + + storage.delete(&path).await.unwrap(); + // Can also write non-percent encoded sequences let path = Path::parse("%Q.parquet").unwrap(); storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 1a87dc33c7a3..e5c4e3204663 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -144,6 +144,11 @@ pub(crate) enum Error { path: PathBuf, source: io::Error, }, + + #[snafu(display("Filenames containing trailing '/#\\d+/' are not supported: {}", path))] + InvalidPath { + path: String, + }, } impl From for super::Error { @@ -176,6 +181,30 @@ impl From for super::Error { /// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme /// [RFC 3986]: https://www.rfc-editor.org/rfc/rfc3986 /// +/// # Path Semantics +/// +/// [`LocalFileSystem`] will expose the path semantics of the underlying filesystem, which may +/// have additional restrictions beyond those enforced by [`Path`]. +/// +/// For example: +/// +/// * Windows forbids certain filenames, e.g. `COM0`, +/// * Windows forbids folders with trailing `.` +/// * Windows forbids certain ASCII characters, e.g. `<` or `|` +/// * OS X forbids filenames containing `:` +/// * Leading `-` are discouraged on Unix systems where they may be interpreted as CLI flags +/// * Filesystems may have restrictions on the maximum path or path segment length +/// * Filesystem support for non-ASCII characters is inconsistent +/// +/// Additionally some filesystems, such as NTFS, are case-insensitive, whilst others like +/// FAT don't preserve case at all. Further some filesystems support non-unicode character +/// sequences, such as unpaired UTF-16 surrogates, and [`LocalFileSystem`] will error on +/// encountering such sequences. +/// +/// Finally, filenames matching the regex `/.*#\d+/`, e.g. `foo.parquet#123`, are not supported +/// by [`LocalFileSystem`] as they are used to provide atomic writes. Such files will be ignored +/// for listing operations, and attempting to address such a file will error. 
+/// /// # Tokio Compatibility /// /// Tokio discourages performing blocking IO on a tokio worker thread, however, @@ -196,6 +225,11 @@ impl From for super::Error { /// * Mutating a file through one or more symlinks will mutate the underlying file /// * Deleting a path that resolves to a symlink will only delete the symlink /// +/// # Cross-Filesystem Copy +/// +/// [`LocalFileSystem::copy`] is implemented using [`std::fs::hard_link`], and therefore +/// does not support copying across filesystem boundaries. +/// #[derive(Debug)] pub struct LocalFileSystem { config: Arc, @@ -246,8 +280,19 @@ impl LocalFileSystem { } impl Config { - /// Return an absolute filesystem path of the given location + /// Return an absolute filesystem path of the given file location fn path_to_filesystem(&self, location: &Path) -> Result { + ensure!( + is_valid_file_path(location), + InvalidPathSnafu { + path: location.as_ref() + } + ); + self.prefix_to_filesystem(location) + } + + /// Return an absolute filesystem path of the given location + fn prefix_to_filesystem(&self, location: &Path) -> Result { let mut url = self.root.clone(); url.path_segments_mut() .expect("url path") @@ -269,6 +314,19 @@ impl Config { } } +fn is_valid_file_path(path: &Path) -> bool { + match path.filename() { + Some(p) => match p.split_once('#') { + Some((_, suffix)) if !suffix.is_empty() => { + // Valid if contains non-digits + !suffix.as_bytes().iter().all(|x| x.is_ascii_digit()) + } + _ => true, + }, + None => false, + } +} + #[async_trait] impl ObjectStore for LocalFileSystem { async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { @@ -406,7 +464,7 @@ impl ObjectStore for LocalFileSystem { let config = Arc::clone(&self.config); let root_path = match prefix { - Some(prefix) => match config.path_to_filesystem(prefix) { + Some(prefix) => match config.prefix_to_filesystem(prefix) { Ok(path) => path, Err(e) => return futures::future::ready(Err(e)).into_stream().boxed(), }, @@ -419,20 +477,21 @@ impl ObjectStore for LocalFileSystem { .follow_links(true); let s = walkdir.into_iter().flat_map(move |result_dir_entry| { - match convert_walkdir_result(result_dir_entry) { + let entry = match convert_walkdir_result(result_dir_entry).transpose()? { + Ok(entry) => entry, + Err(e) => return Some(Err(e)), + }; + + if !entry.path().is_file() { + return None; + } + + match config.filesystem_to_path(entry.path()) { + Ok(path) => match is_valid_file_path(&path) { + true => Some(convert_entry(entry, path)), + false => None, + }, Err(e) => Some(Err(e)), - Ok(None) => None, - Ok(entry @ Some(_)) => entry - .filter(|dir_entry| { - dir_entry.file_type().is_file() - // Ignore file names with # in them, since they might be in-progress uploads. - // They would be rejected anyways by filesystem_to_path below. - && !dir_entry.file_name().to_string_lossy().contains('#') - }) - .map(|entry| { - let location = config.filesystem_to_path(entry.path())?; - convert_entry(entry, location) - }), } }); @@ -473,7 +532,7 @@ impl ObjectStore for LocalFileSystem { let config = Arc::clone(&self.config); let prefix = prefix.cloned().unwrap_or_default(); - let resolved_prefix = config.path_to_filesystem(&prefix)?; + let resolved_prefix = config.prefix_to_filesystem(&prefix)?; maybe_spawn_blocking(move || { let walkdir = WalkDir::new(&resolved_prefix) @@ -486,15 +545,11 @@ impl ObjectStore for LocalFileSystem { for entry_res in walkdir.into_iter().map(convert_walkdir_result) { if let Some(entry) = entry_res? 
{ - if entry.file_type().is_file() - // Ignore file names with # in them, since they might be in-progress uploads. - // They would be rejected anyways by filesystem_to_path below. - && entry.file_name().to_string_lossy().contains('#') - { - continue; - } let is_directory = entry.file_type().is_dir(); let entry_location = config.filesystem_to_path(entry.path())?; + if !is_directory && !is_valid_file_path(&entry_location) { + continue; + } let mut parts = match entry_location.prefix_match(&prefix) { Some(parts) => parts, @@ -1325,26 +1380,18 @@ mod tests { assert!(result.common_prefixes.is_empty()); assert_eq!(result.objects[0].location, object); - let illegal = root.join("💀"); - std::fs::write(illegal, "foo").unwrap(); - - // Can list directory that doesn't contain illegal path - flatten_list_stream(&integration, Some(&directory)) - .await - .unwrap(); + let emoji = root.join("💀"); + std::fs::write(emoji, "foo").unwrap(); - // Cannot list illegal file - let err = flatten_list_stream(&integration, None) - .await - .unwrap_err() - .to_string(); + // Can list illegal file + let paths = flatten_list_stream(&integration, None).await.unwrap(); - assert!( - err.contains( - "Encountered illegal character sequence \"💀\" whilst parsing path segment \"💀\"" - ), - "{}", - err + assert_eq!( + paths, + vec![ + Path::parse("💀").unwrap(), + Path::parse("directory/child.txt").unwrap() + ] ); } @@ -1403,6 +1450,51 @@ mod tests { let path = Path::from_filesystem_path(".").unwrap(); integration.list_with_delimiter(Some(&path)).await.unwrap(); } + + #[test] + fn test_valid_path() { + let cases = [ + ("foo#123/test.txt", true), + ("foo#123/test#23.txt", true), + ("foo#123/test#34", false), + ("foo😁/test#34", false), + ("foo/test#😁34", true), + ]; + + for (case, expected) in cases { + let path = Path::parse(case).unwrap(); + assert_eq!(is_valid_file_path(&path), expected); + } + } + + #[tokio::test] + async fn test_intermediate_files() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let a = Path::parse("foo#123/test.txt").unwrap(); + integration.put(&a, "test".into()).await.unwrap(); + + let list = flatten_list_stream(&integration, None).await.unwrap(); + assert_eq!(list, vec![a.clone()]); + + std::fs::write(root.path().join("bar#123"), "test").unwrap(); + + // Should ignore file + let list = flatten_list_stream(&integration, None).await.unwrap(); + assert_eq!(list, vec![a.clone()]); + + let b = Path::parse("bar#123").unwrap(); + let err = integration.get(&b).await.unwrap_err().to_string(); + assert_eq!(err, "Generic LocalFileSystem error: Filenames containing trailing '/#\\d+/' are not supported: bar#123"); + + let c = Path::parse("foo#123.txt").unwrap(); + integration.put(&c, "test".into()).await.unwrap(); + + let mut list = flatten_list_stream(&integration, None).await.unwrap(); + list.sort_unstable(); + assert_eq!(list, vec![c, a]); + } } #[cfg(not(target_arch = "wasm32"))] diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index e065c31d3145..f914862bc53d 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -65,10 +65,23 @@ pub enum Error { /// A parsed path representation that can be safely written to object storage /// -/// # Path Safety +/// A [`Path`] maintains the following invariants: +/// +/// * Paths are delimited by `/` +/// * Paths do not contain leading or trailing `/` +/// * Paths do not contain relative path segments, i.e. 
`.` or `..` +/// * Paths do not contain empty path segments +/// * Paths do not contain any ASCII control characters +/// +/// There are no enforced restrictions on path length, however, it should be noted that most +/// object stores do not permit paths longer than 1024 bytes, and many filesystems do not +/// support path segments longer than 255 bytes. +/// +/// # Encode /// /// In theory object stores support any UTF-8 character sequence, however, certain character -/// sequences cause compatibility problems with some applications and protocols. As such the +/// sequences cause compatibility problems with some applications and protocols. Additionally +/// some filesystems may impose character restrictions, see [`LocalFileSystem`]. As such the /// naming guidelines for [S3], [GCS] and [Azure Blob Storage] all recommend sticking to a /// limited character subset. /// @@ -76,34 +89,16 @@ pub enum Error { /// [GCS]: https://cloud.google.com/storage/docs/naming-objects /// [Azure Blob Storage]: https://docs.microsoft.com/en-us/rest/api/storageservices/Naming-and-Referencing-Containers--Blobs--and-Metadata#blob-names /// -/// This presents libraries with two options for consistent path handling: -/// -/// 1. Allow constructing unsafe paths, allowing for both reading and writing of data to paths -/// that may not be consistently understood or supported -/// 2. Disallow constructing unsafe paths, ensuring data written can be consistently handled by -/// all other systems, but preventing interaction with objects at unsafe paths -/// -/// This library takes the second approach, in particular: -/// -/// * Paths are delimited by `/` -/// * Paths do not start with a `/` -/// * Empty path segments are discarded (e.g. `//` is treated as though it were `/`) -/// * Relative path segments, i.e. `.` and `..` are percent encoded -/// * Unsafe characters are percent encoded, as described by [RFC 1738] -/// * All paths are relative to the root of the object store -/// -/// In order to provide these guarantees there are two ways to safely construct a [`Path`] -/// -/// # Encode -/// -/// A string containing potentially illegal path segments can be encoded to a [`Path`] -/// using [`Path::from`] or [`Path::from_iter`]. +/// A string containing potentially problematic path segments can therefore be encoded to a [`Path`] +/// using [`Path::from`] or [`Path::from_iter`]. This will percent encode any problematic +/// segments according to [RFC 1738]. /// /// ``` /// # use object_store::path::Path; /// assert_eq!(Path::from("foo/bar").as_ref(), "foo/bar"); /// assert_eq!(Path::from("foo//bar").as_ref(), "foo/bar"); /// assert_eq!(Path::from("foo/../bar").as_ref(), "foo/%2E%2E/bar"); +/// assert_eq!(Path::from("/").as_ref(), ""); /// assert_eq!(Path::from_iter(["foo", "foo/bar"]).as_ref(), "foo/foo%2Fbar"); /// ``` /// @@ -116,20 +111,20 @@ pub enum Error { /// /// # Parse /// -/// Alternatively a [`Path`] can be created from an existing string, returning an -/// error if it is invalid. Unlike the encoding methods, this will permit -/// valid percent encoded sequences. +/// Alternatively a [`Path`] can be parsed from an existing string, returning an +/// error if it is invalid. Unlike the encoding methods above, this will permit +/// arbitrary unicode, including percent encoded sequences. 
/// /// ``` /// # use object_store::path::Path; -/// /// assert_eq!(Path::parse("/foo/foo%2Fbar").unwrap().as_ref(), "foo/foo%2Fbar"); -/// Path::parse("..").unwrap_err(); -/// Path::parse("/foo//").unwrap_err(); -/// Path::parse("😀").unwrap_err(); +/// Path::parse("..").unwrap_err(); // Relative path segments are disallowed +/// Path::parse("/foo//").unwrap_err(); // Empty path segments are disallowed +/// Path::parse("\x00").unwrap_err(); // ASCII control characters are disallowed /// ``` /// /// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt +/// [`LocalFileSystem`]: crate::local::LocalFileSystem #[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Ord, PartialOrd)] pub struct Path { /// The raw path with no leading or trailing delimiters @@ -236,7 +231,7 @@ impl Path { pub fn filename(&self) -> Option<&str> { match self.raw.is_empty() { true => None, - false => self.raw.split(DELIMITER).last(), + false => self.raw.rsplit(DELIMITER).next(), } } diff --git a/object_store/src/path/parts.rs b/object_store/src/path/parts.rs index 9da4815712db..df7097cbe9db 100644 --- a/object_store/src/path/parts.rs +++ b/object_store/src/path/parts.rs @@ -37,8 +37,10 @@ pub struct InvalidPart { /// The PathPart type exists to validate the directory/file names that form part /// of a path. /// -/// A PathPart instance is guaranteed to to contain no illegal characters (e.g. `/`) -/// as it can only be constructed by going through the `from` impl. +/// A [`PathPart`] is guaranteed to: +/// +/// * Contain no ASCII control characters or `/` +/// * Not be a relative path segment, i.e. `.` or `..` #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)] pub struct PathPart<'a> { pub(super) raw: Cow<'a, str>, @@ -54,19 +56,12 @@ impl<'a> PathPart<'a> { }); } - for (idx, b) in segment.as_bytes().iter().cloned().enumerate() { - // A percent character is always valid, even if not - // followed by a valid 2-digit hex code - // https://url.spec.whatwg.org/#percent-encoded-bytes - if b == b'%' { - continue; - } - - if !b.is_ascii() || should_percent_encode(b) { + for c in segment.chars() { + if c.is_ascii_control() || c == '/' { return Err(InvalidPart { segment: segment.to_string(), // This is correct as only single byte characters up to this point - illegal: segment.chars().nth(idx).unwrap().to_string(), + illegal: c.to_string(), }); } } @@ -77,10 +72,6 @@ impl<'a> PathPart<'a> { } } -fn should_percent_encode(c: u8) -> bool { - percent_encode(&[c], INVALID).next().unwrap().len() != 1 -} - /// Characters we want to encode. 
const INVALID: &AsciiSet = &CONTROLS // The delimiter we are reserving for internal hierarchy From ab53d2dd5fb2af2bab69f95a6ddae2226c166500 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Nov 2023 10:28:07 +0000 Subject: [PATCH 1324/1411] Support onelake fabric paths in parse_url (#5000) (#5002) --- object_store/src/parse.rs | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/object_store/src/parse.rs b/object_store/src/parse.rs index 0fbc33c935d8..ddea034699f0 100644 --- a/object_store/src/parse.rs +++ b/object_store/src/parse.rs @@ -81,7 +81,10 @@ impl ObjectStoreScheme { } ("http", Some(_)) => (Self::Http, url.path()), ("https", Some(host)) => { - if host.ends_with("dfs.core.windows.net") || host.ends_with("blob.core.windows.net") + if host.ends_with("dfs.core.windows.net") + || host.ends_with("blob.core.windows.net") + || host.ends_with("dfs.fabric.microsoft.com") + || host.ends_with("blob.fabric.microsoft.com") { (Self::MicrosoftAzure, url.path()) } else if host.ends_with("amazonaws.com") { @@ -251,6 +254,30 @@ mod tests { "file:///bar%252Efoo", (ObjectStoreScheme::Local, "bar%2Efoo"), ), + ( + "abfss://file_system@account.dfs.fabric.microsoft.com/", + (ObjectStoreScheme::MicrosoftAzure, ""), + ), + ( + "abfss://file_system@account.dfs.fabric.microsoft.com/", + (ObjectStoreScheme::MicrosoftAzure, ""), + ), + ( + "https://account.dfs.fabric.microsoft.com/", + (ObjectStoreScheme::MicrosoftAzure, ""), + ), + ( + "https://account.dfs.fabric.microsoft.com/container", + (ObjectStoreScheme::MicrosoftAzure, "container"), + ), + ( + "https://account.blob.fabric.microsoft.com/", + (ObjectStoreScheme::MicrosoftAzure, ""), + ), + ( + "https://account.blob.fabric.microsoft.com/container", + (ObjectStoreScheme::MicrosoftAzure, "container"), + ), ]; for (s, (expected_scheme, expected_path)) in cases { From ad211fe324d259bf9fea1c43a3a82b3c833f6d7a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:30:09 +0000 Subject: [PATCH 1325/1411] Prepare object_store 0.8.0 (#5010) (#5023) --- object_store/CHANGELOG-old.md | 44 +++++++++ object_store/CHANGELOG.md | 91 +++++++++++++------ object_store/Cargo.toml | 2 +- object_store/dev/release/update_change_log.sh | 4 +- 4 files changed, 109 insertions(+), 32 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index a0ced7c8d21e..6780f7db4c4d 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -19,6 +19,50 @@ # Historical Changelog + +## [object_store_0.7.1](https://github.com/apache/arrow-rs/tree/object_store_0.7.1) (2023-09-26) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.7.0...object_store_0.7.1) + +**Implemented enhancements:** + +- Automatically Cleanup LocalFileSystem Temporary Files [\#4778](https://github.com/apache/arrow-rs/issues/4778) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store: Expose an async reader API for object store [\#4762](https://github.com/apache/arrow-rs/issues/4762) +- Improve proxy support by using reqwest::Proxy as configuration [\#4713](https://github.com/apache/arrow-rs/issues/4713) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- object-store: http shouldn't perform range requests unless `accept-ranges: bytes` header is present 
[\#4839](https://github.com/apache/arrow-rs/issues/4839) +- object-store: http-store fails when url doesn't have last-modified header on 0.7.0 [\#4831](https://github.com/apache/arrow-rs/issues/4831) +- object-store fails to compile for `wasm32-unknown-unknown` with `http` feature [\#4776](https://github.com/apache/arrow-rs/issues/4776) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store: could not find `header` in `client` for `http` feature [\#4775](https://github.com/apache/arrow-rs/issues/4775) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- LocalFileSystem Copy and Rename Don't Create Intermediate Directories [\#4760](https://github.com/apache/arrow-rs/issues/4760) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- LocalFileSystem Copy is not Atomic [\#4758](https://github.com/apache/arrow-rs/issues/4758) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Closed issues:** + +- object\_store Azure Government Cloud functionality? [\#4853](https://github.com/apache/arrow-rs/issues/4853) + +**Merged pull requests:** + +- Add ObjectStore BufReader \(\#4762\) [\#4857](https://github.com/apache/arrow-rs/pull/4857) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Allow overriding azure endpoint [\#4854](https://github.com/apache/arrow-rs/pull/4854) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Minor: Improve object\_store docs.rs landing page [\#4849](https://github.com/apache/arrow-rs/pull/4849) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Error if Remote Ignores HTTP Range Header [\#4841](https://github.com/apache/arrow-rs/pull/4841) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([universalmind303](https://github.com/universalmind303)) +- Perform HEAD request for HttpStore::head [\#4837](https://github.com/apache/arrow-rs/pull/4837) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- fix: object store http header last modified [\#4834](https://github.com/apache/arrow-rs/pull/4834) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([universalmind303](https://github.com/universalmind303)) +- Prepare arrow 47.0.0 [\#4827](https://github.com/apache/arrow-rs/pull/4827) ([tustvold](https://github.com/tustvold)) +- ObjectStore Wasm32 Fixes \(\#4775\) \(\#4776\) [\#4796](https://github.com/apache/arrow-rs/pull/4796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Best effort cleanup of staged upload files \(\#4778\) [\#4792](https://github.com/apache/arrow-rs/pull/4792) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Relaxing type bounds on coalesce\_ranges and collect\_bytes [\#4787](https://github.com/apache/arrow-rs/pull/4787) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sumerman](https://github.com/sumerman)) +- Update object\_store chrono deprecations [\#4786](https://github.com/apache/arrow-rs/pull/4786) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Make coalesce\_ranges and collect\_bytes available for crate users 
[\#4784](https://github.com/apache/arrow-rs/pull/4784) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sumerman](https://github.com/sumerman)) +- Bump actions/checkout from 3 to 4 [\#4767](https://github.com/apache/arrow-rs/pull/4767) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Make ObjectStore::copy Atomic and Automatically Create Parent Directories \(\#4758\) \(\#4760\) [\#4759](https://github.com/apache/arrow-rs/pull/4759) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update nix requirement from 0.26.1 to 0.27.1 in /object\_store [\#4744](https://github.com/apache/arrow-rs/pull/4744) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) +- Add `with_proxy_ca_certificate` and `with_proxy_excludes` [\#4714](https://github.com/apache/arrow-rs/pull/4714) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([gordonwang0](https://github.com/gordonwang0)) +- Update object\_store Dependencies and Configure Dependabot [\#4700](https://github.com/apache/arrow-rs/pull/4700) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) + ## [object_store_0.7.0](https://github.com/apache/arrow-rs/tree/object_store_0.7.0) (2023-08-15) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.6.1...object_store_0.7.0) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index 1f069ce41eac..c24cf54cc3be 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,48 +19,81 @@ # Changelog -## [object_store_0.7.1](https://github.com/apache/arrow-rs/tree/object_store_0.7.1) (2023-09-26) +## [object_store_0.8.0](https://github.com/apache/arrow-rs/tree/object_store_0.8.0) (2023-11-02) -[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.7.0...object_store_0.7.1) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.7.1...object_store_0.8.0) + +**Breaking changes:** + +- Remove ObjectStore::append [\#5016](https://github.com/apache/arrow-rs/pull/5016) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Don't panic on invalid Azure access key \(\#4972\) [\#4974](https://github.com/apache/arrow-rs/pull/4974) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Return `PutResult` with an ETag from ObjectStore::put \(\#4934\) [\#4944](https://github.com/apache/arrow-rs/pull/4944) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectMeta::version and GetOptions::version \(\#4925\) [\#4935](https://github.com/apache/arrow-rs/pull/4935) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add GetOptions::head [\#4931](https://github.com/apache/arrow-rs/pull/4931) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Remove Nested async and Fallibility from ObjectStore::list [\#4930](https://github.com/apache/arrow-rs/pull/4930) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Automatically Cleanup LocalFileSystem Temporary Files 
[\#4778](https://github.com/apache/arrow-rs/issues/4778) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object-store: Expose an async reader API for object store [\#4762](https://github.com/apache/arrow-rs/issues/4762) -- Improve proxy support by using reqwest::Proxy as configuration [\#4713](https://github.com/apache/arrow-rs/issues/4713) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Relax Path Safety on Parse [\#5019](https://github.com/apache/arrow-rs/issues/5019) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ObjectStore: hard to determine the cause of the error thrown from retry [\#5013](https://github.com/apache/arrow-rs/issues/5013) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- continue existing multi-part upload [\#4961](https://github.com/apache/arrow-rs/issues/4961) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Simplify ObjectStore::List [\#4946](https://github.com/apache/arrow-rs/issues/4946) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Return ETag and Version on Put [\#4934](https://github.com/apache/arrow-rs/issues/4934) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support Not Signing Requests in AmazonS3 [\#4927](https://github.com/apache/arrow-rs/issues/4927) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Get Object By Version [\#4925](https://github.com/apache/arrow-rs/issues/4925) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Plans for supporting Extension Array to support Fixed shape tensor Array [\#4890](https://github.com/apache/arrow-rs/issues/4890) +- Conditional Put Support [\#4879](https://github.com/apache/arrow-rs/issues/4879) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- creates\_dir\_if\_not\_present\_append Test is Flaky [\#4872](https://github.com/apache/arrow-rs/issues/4872) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Release object\_store `0.7.1` [\#4858](https://github.com/apache/arrow-rs/issues/4858) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support User-Defined Object Metadata [\#4754](https://github.com/apache/arrow-rs/issues/4754) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- APIs for directly managing multi-part uploads and saving potential parquet footers [\#4608](https://github.com/apache/arrow-rs/issues/4608) **Fixed bugs:** -- object-store: http shouldn't perform range requests unless `accept-ranges: bytes` header is present [\#4839](https://github.com/apache/arrow-rs/issues/4839) -- object-store: http-store fails when url doesn't have last-modified header on 0.7.0 [\#4831](https://github.com/apache/arrow-rs/issues/4831) -- object-store fails to compile for `wasm32-unknown-unknown` with `http` feature [\#4776](https://github.com/apache/arrow-rs/issues/4776) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- object-store: could not find `header` in `client` for `http` feature [\#4775](https://github.com/apache/arrow-rs/issues/4775) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- LocalFileSystem Copy and Rename Don't Create Intermediate Directories [\#4760](https://github.com/apache/arrow-rs/issues/4760) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- LocalFileSystem Copy is not 
Atomic [\#4758](https://github.com/apache/arrow-rs/issues/4758) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ObjectStore parse\_url Incorrectly Handles URLs with Spaces [\#5017](https://github.com/apache/arrow-rs/issues/5017) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[objects-store\]: periods/dots error in GCP bucket [\#4991](https://github.com/apache/arrow-rs/issues/4991) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Azure ImdsManagedIdentityProvider does not work in Azure functions [\#4976](https://github.com/apache/arrow-rs/issues/4976) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Panic when using an azure object store with an invalid access key [\#4972](https://github.com/apache/arrow-rs/issues/4972) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Handle Body Errors in AWS CompleteMultipartUpload [\#4965](https://github.com/apache/arrow-rs/issues/4965) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- ObjectStore multiple\_append Test is Flaky [\#4868](https://github.com/apache/arrow-rs/issues/4868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[objectstore\] Problem with special characters in file path [\#4454](https://github.com/apache/arrow-rs/issues/4454) **Closed issues:** -- object\_store Azure Government Cloud functionality? [\#4853](https://github.com/apache/arrow-rs/issues/4853) +- Include onelake fabric path for https [\#5000](https://github.com/apache/arrow-rs/issues/5000) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object\_store\] Support generating and using signed upload URLs [\#4763](https://github.com/apache/arrow-rs/issues/4763) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- Add ObjectStore BufReader \(\#4762\) [\#4857](https://github.com/apache/arrow-rs/pull/4857) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Allow overriding azure endpoint [\#4854](https://github.com/apache/arrow-rs/pull/4854) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Minor: Improve object\_store docs.rs landing page [\#4849](https://github.com/apache/arrow-rs/pull/4849) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) -- Error if Remote Ignores HTTP Range Header [\#4841](https://github.com/apache/arrow-rs/pull/4841) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([universalmind303](https://github.com/universalmind303)) -- Perform HEAD request for HttpStore::head [\#4837](https://github.com/apache/arrow-rs/pull/4837) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- fix: object store http header last modified [\#4834](https://github.com/apache/arrow-rs/pull/4834) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([universalmind303](https://github.com/universalmind303)) -- Prepare arrow 47.0.0 [\#4827](https://github.com/apache/arrow-rs/pull/4827) ([tustvold](https://github.com/tustvold)) -- ObjectStore Wasm32 Fixes \(\#4775\) \(\#4776\) [\#4796](https://github.com/apache/arrow-rs/pull/4796) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] 
([tustvold](https://github.com/tustvold)) -- Best effort cleanup of staged upload files \(\#4778\) [\#4792](https://github.com/apache/arrow-rs/pull/4792) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Relaxing type bounds on coalesce\_ranges and collect\_bytes [\#4787](https://github.com/apache/arrow-rs/pull/4787) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sumerman](https://github.com/sumerman)) -- Update object\_store chrono deprecations [\#4786](https://github.com/apache/arrow-rs/pull/4786) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Make coalesce\_ranges and collect\_bytes available for crate users [\#4784](https://github.com/apache/arrow-rs/pull/4784) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([sumerman](https://github.com/sumerman)) -- Bump actions/checkout from 3 to 4 [\#4767](https://github.com/apache/arrow-rs/pull/4767) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Make ObjectStore::copy Atomic and Automatically Create Parent Directories \(\#4758\) \(\#4760\) [\#4759](https://github.com/apache/arrow-rs/pull/4759) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) -- Update nix requirement from 0.26.1 to 0.27.1 in /object\_store [\#4744](https://github.com/apache/arrow-rs/pull/4744) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) -- Add `with_proxy_ca_certificate` and `with_proxy_excludes` [\#4714](https://github.com/apache/arrow-rs/pull/4714) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([gordonwang0](https://github.com/gordonwang0)) -- Update object\_store Dependencies and Configure Dependabot [\#4700](https://github.com/apache/arrow-rs/pull/4700) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Relax path safety \(\#5019\) [\#5020](https://github.com/apache/arrow-rs/pull/5020) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Decode URL paths \(\#5017\) [\#5018](https://github.com/apache/arrow-rs/pull/5018) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- ObjectStore: make error msg thrown from retry more detailed [\#5012](https://github.com/apache/arrow-rs/pull/5012) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Rachelint](https://github.com/Rachelint)) +- Support onelake fabric paths in parse\_url \(\#5000\) [\#5002](https://github.com/apache/arrow-rs/pull/5002) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Object tagging \(\#4754\) [\#4999](https://github.com/apache/arrow-rs/pull/4999) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- \[MINOR\] No need to jump to web pages [\#4994](https://github.com/apache/arrow-rs/pull/4994) ([smallzhongfeng](https://github.com/smallzhongfeng)) +- Pushdown list\_with\_offset for GCS [\#4993](https://github.com/apache/arrow-rs/pull/4993) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Support bucket name with `.` when parsing GCS URL 
\(\#4991\) [\#4992](https://github.com/apache/arrow-rs/pull/4992) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Increase default timeout to 30 seconds [\#4989](https://github.com/apache/arrow-rs/pull/4989) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Conditional Put \(\#4879\) [\#4984](https://github.com/apache/arrow-rs/pull/4984) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Update quick-xml requirement from 0.30.0 to 0.31.0 in /object\_store [\#4983](https://github.com/apache/arrow-rs/pull/4983) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bump actions/setup-node from 3 to 4 [\#4982](https://github.com/apache/arrow-rs/pull/4982) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support ImdsManagedIdentityProvider in Azure Functions \(\#4976\) [\#4977](https://github.com/apache/arrow-rs/pull/4977) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add MultiPartStore \(\#4961\) \(\#4608\) [\#4971](https://github.com/apache/arrow-rs/pull/4971) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Split gcp Module [\#4956](https://github.com/apache/arrow-rs/pull/4956) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add module links in docs root [\#4955](https://github.com/apache/arrow-rs/pull/4955) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Prepare arrow 48.0.0 [\#4948](https://github.com/apache/arrow-rs/pull/4948) ([tustvold](https://github.com/tustvold)) +- Allow opting out of request signing \(\#4927\) [\#4929](https://github.com/apache/arrow-rs/pull/4929) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Default connection and request timeouts of 5 seconds [\#4928](https://github.com/apache/arrow-rs/pull/4928) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Support service\_account in ApplicationDefaultCredentials and Use SelfSignedJwt [\#4926](https://github.com/apache/arrow-rs/pull/4926) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Generate `ETag`s for `InMemory` and `LocalFileSystem` \(\#4879\) [\#4922](https://github.com/apache/arrow-rs/pull/4922) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Cleanup `object_store::retry` client error handling [\#4915](https://github.com/apache/arrow-rs/pull/4915) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix integration tests [\#4889](https://github.com/apache/arrow-rs/pull/4889) ([tustvold](https://github.com/tustvold)) +- Support Parsing Avro File Headers [\#4888](https://github.com/apache/arrow-rs/pull/4888) ([tustvold](https://github.com/tustvold)) +- Update ring requirement from 0.16 to 0.17 in /object\_store [\#4887](https://github.com/apache/arrow-rs/pull/4887) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add AWS presigned URL support [\#4876](https://github.com/apache/arrow-rs/pull/4876) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([carols10cents](https://github.com/carols10cents)) +- Flush in creates\_dir\_if\_not\_present\_append \(\#4872\) [\#4874](https://github.com/apache/arrow-rs/pull/4874) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Flush in multiple\_append test \(\#4868\) [\#4869](https://github.com/apache/arrow-rs/pull/4869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Enable new integration tests \(\#4828\) [\#4862](https://github.com/apache/arrow-rs/pull/4862) ([tustvold](https://github.com/tustvold)) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index c8cf4e280236..7fcb6ce9e3f1 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.7.1" +version = "0.8.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index aeec3caf4f57..33eeb33860f6 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.7.0" -FUTURE_RELEASE="object_store_0.7.1" +SINCE_TAG="object_store_0.7.1" +FUTURE_RELEASE="object_store_0.8.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 829708d4911bc0798ca2a388cd7df20478a4cc08 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:37:19 +0000 Subject: [PATCH 1326/1411] Verify object_store with all features (#5024) --- object_store/dev/release/verify-release-candidate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/dev/release/verify-release-candidate.sh b/object_store/dev/release/verify-release-candidate.sh index 06a5d8bcb838..b24bd8fbb743 100755 --- a/object_store/dev/release/verify-release-candidate.sh +++ b/object_store/dev/release/verify-release-candidate.sh @@ -103,7 +103,7 @@ test_source_distribution() { # build and test rust cargo build - cargo test --all + cargo test --all --all-features # verify that the crate can be published to crates.io cargo publish --dry-run From e4689e6de7f5f995a47fe563b60f81dedc497681 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 2 Nov 2023 15:27:25 +0000 Subject: [PATCH 1327/1411] Fix invalid_path test (#5026) --- object_store/src/local.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/object_store/src/local.rs b/object_store/src/local.rs index e5c4e3204663..dd71d9ec1219 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -1384,13 +1384,14 @@ mod tests { std::fs::write(emoji, "foo").unwrap(); // Can list illegal file - let paths = flatten_list_stream(&integration, None).await.unwrap(); + let mut paths = flatten_list_stream(&integration, None).await.unwrap(); + paths.sort_unstable(); assert_eq!( paths, vec![ - Path::parse("💀").unwrap(), - Path::parse("directory/child.txt").unwrap() + Path::parse("directory/child.txt").unwrap(), + 
Path::parse("💀").unwrap() ] ); } From 1807abace4153cbfd287ee830f86ebaeeae50234 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 2 Nov 2023 11:28:47 -0400 Subject: [PATCH 1328/1411] chore: Update docs to refer to non deprecated function (`partition`) (#5027) * chore: Update docs to refer to non deprecated function * chore: Update docs to refer to non deprecated function --- arrow-ord/src/partition.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arrow-ord/src/partition.rs b/arrow-ord/src/partition.rs index 12ab8dba04f7..8c87eefadbf0 100644 --- a/arrow-ord/src/partition.rs +++ b/arrow-ord/src/partition.rs @@ -78,7 +78,7 @@ impl Partitions { /// # Example: /// /// For example, given columns `x`, `y` and `z`, calling -/// `lexicographical_partition_ranges(values, (x, y))` will divide the +/// [`partition`]`(values, (x, y))` will divide the /// rows into ranges where the values of `(x, y)` are equal: /// /// ```text @@ -160,8 +160,9 @@ fn find_boundaries(v: &dyn Array) -> Result { Ok(distinct(&v1, &v2)?.values().clone()) } -/// Given a list of already sorted columns, find partition ranges that would partition -/// lexicographically equal values across columns. +/// Use [`partition`] instead. Given a list of already sorted columns, find +/// partition ranges that would partition lexicographically equal values across +/// columns. /// /// The returned vec would be of size k where k is cardinality of the sorted values; Consecutive /// values will be connected: (a, b) and (b, c), where start = 0 and end = n for the first and last From a447bcf0ee5ca757a81cdf95e8e389af4673f0da Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 2 Nov 2023 13:56:41 -0700 Subject: [PATCH 1329/1411] doc: update comment on sort_to_indices to reflect that IEE 754 totalOrder is used (#5033) --- arrow-ord/src/sort.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 92b20c4ad08c..7d749da51327 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -191,8 +191,7 @@ fn partition_validity(array: &dyn Array) -> (Vec, Vec) { } /// Sort elements from `ArrayRef` into an unsigned integer (`UInt32Array`) of indices. -/// For floating point arrays any NaN values are considered to be greater than any other non-null value. -/// `limit` is an option for [partial_sort]. +/// Floats are sorted using IEEE 754 totalOrder. `limit` is an option for [partial_sort]. 
pub fn sort_to_indices( array: &dyn Array, options: Option, From 7705acad845e8b2a366a08640f7acb4033ed7049 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 3 Nov 2023 00:48:41 -0700 Subject: [PATCH 1330/1411] Support casting from integer to binary (#5015) * Support casting from integer to binary * Fix clippy * For review * Reuse array buffers --- arrow-cast/src/cast.rs | 75 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 97307f076f34..684e02b87e6c 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -203,6 +203,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), + (_, Binary | LargeBinary) => from_type.is_integer(), + // start numeric casts ( UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, @@ -1368,6 +1370,28 @@ pub fn cast_with_options( (from_type, Utf8) if from_type.is_primitive() => { value_to_string::(array, cast_options) } + (from_type, Binary) if from_type.is_integer() => match from_type { + UInt8 => cast_numeric_to_binary::(array), + UInt16 => cast_numeric_to_binary::(array), + UInt32 => cast_numeric_to_binary::(array), + UInt64 => cast_numeric_to_binary::(array), + Int8 => cast_numeric_to_binary::(array), + Int16 => cast_numeric_to_binary::(array), + Int32 => cast_numeric_to_binary::(array), + Int64 => cast_numeric_to_binary::(array), + _ => unreachable!(), + }, + (from_type, LargeBinary) if from_type.is_integer() => match from_type { + UInt8 => cast_numeric_to_binary::(array), + UInt16 => cast_numeric_to_binary::(array), + UInt32 => cast_numeric_to_binary::(array), + UInt64 => cast_numeric_to_binary::(array), + Int8 => cast_numeric_to_binary::(array), + Int16 => cast_numeric_to_binary::(array), + Int32 => cast_numeric_to_binary::(array), + Int64 => cast_numeric_to_binary::(array), + _ => unreachable!(), + }, // start numeric casts (UInt8, UInt16) => cast_numeric_arrays::(array, cast_options), (UInt8, UInt32) => cast_numeric_arrays::(array, cast_options), @@ -2317,6 +2341,19 @@ fn value_to_string( Ok(Arc::new(builder.finish())) } +fn cast_numeric_to_binary( + array: &dyn Array, +) -> Result { + let array = array.as_primitive::(); + let size = std::mem::size_of::(); + let offsets = OffsetBuffer::from_lengths(std::iter::repeat(size).take(array.len())); + Ok(Arc::new(GenericBinaryArray::::new( + offsets, + array.values().inner().clone(), + array.nulls().cloned(), + ))) +} + /// Parse UTF-8 fn parse_string( array: &dyn Array, @@ -5176,6 +5213,44 @@ mod tests { assert!(down_cast.is_null(2)); } + #[test] + fn test_numeric_to_binary() { + let a = Int16Array::from(vec![Some(1), Some(511), None]); + + let array_ref = cast(&a, &DataType::Binary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(&1_i16.to_le_bytes(), down_cast.value(0)); + assert_eq!(&511_i16.to_le_bytes(), down_cast.value(1)); + assert!(down_cast.is_null(2)); + + let a = Int64Array::from(vec![Some(-1), Some(123456789), None]); + + let array_ref = cast(&a, &DataType::Binary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(&(-1_i64).to_le_bytes(), down_cast.value(0)); + assert_eq!(&123456789_i64.to_le_bytes(), down_cast.value(1)); + assert!(down_cast.is_null(2)); + } + + #[test] + fn test_numeric_to_large_binary() { + let a = Int16Array::from(vec![Some(1), Some(511), None]); + + let array_ref = 
cast(&a, &DataType::LargeBinary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(&1_i16.to_le_bytes(), down_cast.value(0)); + assert_eq!(&511_i16.to_le_bytes(), down_cast.value(1)); + assert!(down_cast.is_null(2)); + + let a = Int64Array::from(vec![Some(-1), Some(123456789), None]); + + let array_ref = cast(&a, &DataType::LargeBinary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(&(-1_i64).to_le_bytes(), down_cast.value(0)); + assert_eq!(&123456789_i64.to_le_bytes(), down_cast.value(1)); + assert!(down_cast.is_null(2)); + } + #[test] fn test_cast_date32_to_int32() { let array = Date32Array::from(vec![10000, 17890]); From dcbe546529faeb9e282819d04ca1b4a3c668a747 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 6 Nov 2023 13:27:31 +0000 Subject: [PATCH 1331/1411] Update object_store 0.8.0 (#5043) --- parquet/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 659e2c0ee3a7..e5f5e1652b82 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -44,7 +44,7 @@ arrow-schema = { workspace = true, optional = true } arrow-select = { workspace = true, optional = true } arrow-ipc = { workspace = true, optional = true } # Intentionally not a path dependency as object_store is released separately -object_store = { version = "0.7", default-features = false, optional = true } +object_store = { version = "0.8", default-features = false, optional = true } bytes = { version = "1.1", default-features = false, features = ["std"] } thrift = { version = "0.17", default-features = false } From 91acfb07a9929a2d6721c5417e47c0c472372a86 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 6 Nov 2023 14:01:47 +0000 Subject: [PATCH 1332/1411] Fix serialization of large integers (#5038) (#5042) --- arrow-json/src/reader/mod.rs | 30 ++++++++++++++++++++++++ arrow-json/src/reader/primitive_array.rs | 2 +- arrow-json/src/reader/tape.rs | 2 +- arrow-json/src/reader/timestamp_array.rs | 4 +++- 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index 28282c4d1541..71a73df9fedb 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -2229,4 +2229,34 @@ mod tests { let values = b.column(0).as_primitive::().values(); assert_eq!(values, &[1, 2, 3, 4]); } + + #[test] + fn test_serde_large_numbers() { + let field = Field::new("int", DataType::Int64, true); + let mut decoder = ReaderBuilder::new_with_field(field) + .build_decoder() + .unwrap(); + + decoder.serialize(&[1699148028689_u64, 2, 3, 4]).unwrap(); + let b = decoder.flush().unwrap().unwrap(); + let values = b.column(0).as_primitive::().values(); + assert_eq!(values, &[1699148028689, 2, 3, 4]); + + let field = Field::new( + "int", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + ); + let mut decoder = ReaderBuilder::new_with_field(field) + .build_decoder() + .unwrap(); + + decoder.serialize(&[1699148028689_u64, 2, 3, 4]).unwrap(); + let b = decoder.flush().unwrap().unwrap(); + let values = b + .column(0) + .as_primitive::() + .values(); + assert_eq!(values, &[1699148028689, 2, 3, 4]); + } } diff --git a/arrow-json/src/reader/primitive_array.rs b/arrow-json/src/reader/primitive_array.rs index a03a41e96dcb..1bd1176131ae 100644 --- a/arrow-json/src/reader/primitive_array.rs +++ b/arrow-json/src/reader/primitive_array.rs @@ -142,7 +142,7 @@ where }, 
TapeElement::I64(high) => match tape.get(p + 1) { TapeElement::I32(low) => { - let v = (high as i64) << 32 | low as i64; + let v = (high as i64) << 32 | (low as u32) as i64; let value = NumCast::from(v).ok_or_else(|| { ArrowError::JsonError(format!("failed to parse {v} as {d}",)) })?; diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index 4822ad0bf43d..c783f6a51022 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -180,7 +180,7 @@ impl<'a> Tape<'a> { TapeElement::Null => out.push_str("null"), TapeElement::I64(high) => match self.get(idx + 1) { TapeElement::I32(low) => { - let val = (high as i64) << 32 | low as i64; + let val = (high as i64) << 32 | (low as u32) as i64; let _ = write!(out, "{val}"); return idx + 2; } diff --git a/arrow-json/src/reader/timestamp_array.rs b/arrow-json/src/reader/timestamp_array.rs index dda5a653d730..f68fc3dc3270 100644 --- a/arrow-json/src/reader/timestamp_array.rs +++ b/arrow-json/src/reader/timestamp_array.rs @@ -96,7 +96,9 @@ where } TapeElement::I32(v) => builder.append_value(v as i64), TapeElement::I64(high) => match tape.get(p + 1) { - TapeElement::I32(low) => builder.append_value((high as i64) << 32 | low as i64), + TapeElement::I32(low) => { + builder.append_value((high as i64) << 32 | (low as u32) as i64) + } _ => unreachable!(), }, _ => return Err(tape.error(*p, "primitive")), From 8c20c98c1eef9142ec7712ff762e20125a4e076d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 7 Nov 2023 02:56:38 -0800 Subject: [PATCH 1333/1411] Cast from integer/timestamp to timestamp/integer (#5040) * Cast from integer to timestamp * Fix * For review --- arrow-cast/src/cast.rs | 124 ++++++++++++++++++++++++++++++++++------- 1 file changed, 104 insertions(+), 20 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 684e02b87e6c..e44133f81b4a 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -227,8 +227,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Time64(_), Time32(to_unit)) => { matches!(to_unit, Second | Millisecond) } - (Timestamp(_, _), Int64) => true, - (Int64, Timestamp(_, _)) => true, + (Timestamp(_, _), _) if to_type.is_integer() => true, + (_, Timestamp(_, _)) if from_type.is_integer() => true, (Date64, Timestamp(_, None)) => true, (Date32, Timestamp(_, None)) => true, ( @@ -1621,24 +1621,31 @@ pub fn cast_with_options( .unary::<_, Time64MicrosecondType>(|x| x / (NANOSECONDS / MICROSECONDS)), )), - (Timestamp(TimeUnit::Second, _), Int64) => { - cast_reinterpret_arrays::(array) + (Timestamp(TimeUnit::Second, _), _) if to_type.is_integer() => { + let array = cast_reinterpret_arrays::(array)?; + cast_with_options(&array, to_type, cast_options) } - (Timestamp(TimeUnit::Millisecond, _), Int64) => { - cast_reinterpret_arrays::(array) + (Timestamp(TimeUnit::Millisecond, _), _) if to_type.is_integer() => { + let array = cast_reinterpret_arrays::(array)?; + cast_with_options(&array, to_type, cast_options) } - (Timestamp(TimeUnit::Microsecond, _), Int64) => { - cast_reinterpret_arrays::(array) + (Timestamp(TimeUnit::Microsecond, _), _) if to_type.is_integer() => { + let array = cast_reinterpret_arrays::(array)?; + cast_with_options(&array, to_type, cast_options) } - (Timestamp(TimeUnit::Nanosecond, _), Int64) => { - cast_reinterpret_arrays::(array) + (Timestamp(TimeUnit::Nanosecond, _), _) if to_type.is_integer() => { + let array = cast_reinterpret_arrays::(array)?; + cast_with_options(&array, to_type, cast_options) } - (Int64, 
Timestamp(unit, tz)) => Ok(make_timestamp_array( - array.as_primitive(), - unit.clone(), - tz.clone(), - )), + (_, Timestamp(unit, tz)) if from_type.is_integer() => { + let array = cast_with_options(array, &Int64, cast_options)?; + Ok(make_timestamp_array( + array.as_primitive(), + unit.clone(), + tz.clone(), + )) + } (Timestamp(from_unit, from_tz), Timestamp(to_unit, to_tz)) => { let array = cast_with_options(array, &Int64, cast_options)?; @@ -4559,10 +4566,72 @@ mod tests { } #[test] - #[should_panic(expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported")] - fn test_cast_int32_to_timestamp() { + fn test_cast_integer_to_timestamp() { + let array = Int64Array::from(vec![Some(2), Some(10), None]); + let expected = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + let array = Int8Array::from(vec![Some(2), Some(10), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = Int16Array::from(vec![Some(2), Some(10), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + let array = Int32Array::from(vec![Some(2), Some(10), None]); - cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = UInt8Array::from(vec![Some(2), Some(10), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = UInt16Array::from(vec![Some(2), Some(10), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = UInt32Array::from(vec![Some(2), Some(10), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = UInt64Array::from(vec![Some(2), Some(10), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + } + + #[test] + fn test_cast_timestamp_to_integer() { + let array = TimestampMillisecondArray::from(vec![Some(5), Some(1), None]) + .with_timezone("UTC".to_string()); + let expected = cast(&array, &DataType::Int64).unwrap(); + + let actual = cast(&cast(&array, &DataType::Int8).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::Int16).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::Int32).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::UInt8).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::UInt16).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::UInt32).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::UInt64).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); } #[test] @@ -4617,7 +4686,6 @@ mod tests { } #[test] - #[should_panic(expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported")] fn test_cast_list_i32_to_list_timestamp() { // Construct a value 
array let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 8, 100000000]).into_data(); @@ -4634,7 +4702,7 @@ mod tests { .unwrap(); let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; - cast( + let actual = cast( &list_array, &DataType::List(Arc::new(Field::new( "item", @@ -4643,6 +4711,22 @@ mod tests { ))), ) .unwrap(); + + let expected = cast( + &cast( + &list_array, + &DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + ) + .unwrap(), + &DataType::List(Arc::new(Field::new( + "item", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + ))), + ) + .unwrap(); + + assert_eq!(&actual, &expected); } #[test] From 20f10dcd2159199e36d128a2143eca48ae7438bb Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 7 Nov 2023 11:35:45 +0000 Subject: [PATCH 1334/1411] Fix RowSelection::intersection (#5036) (#5041) * Fix RowSelection::intersection (#5036) * Review feedback --- parquet/src/arrow/arrow_reader/selection.rs | 282 ++++++++++++-------- 1 file changed, 167 insertions(+), 115 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs index 5063d24afd5f..cebf3f9d38b6 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -64,25 +64,30 @@ impl RowSelector { /// use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; /// /// let selectors = vec![ -/// RowSelector { row_count: 5, skip: true }, -/// RowSelector { row_count: 5, skip: false }, -/// RowSelector { row_count: 5, skip: false }, -/// RowSelector { row_count: 5, skip: true }, +/// RowSelector::skip(5), +/// RowSelector::select(5), +/// RowSelector::select(5), +/// RowSelector::skip(5), /// ]; /// /// // Creating a selection will combine adjacent selectors /// let selection: RowSelection = selectors.into(); /// /// let expected = vec![ -/// RowSelector { row_count: 5, skip: true }, -/// RowSelector { row_count: 10, skip: false }, -/// RowSelector { row_count: 5, skip: true }, +/// RowSelector::skip(5), +/// RowSelector::select(10), +/// RowSelector::skip(5), /// ]; /// /// let actual: Vec = selection.into(); /// assert_eq!(actual, expected); /// ``` /// +/// A [`RowSelection`] maintains the following invariants: +/// +/// * It contains no [`RowSelector`] of 0 rows +/// * Consecutive [`RowSelector`]s alternate skipping or selecting rows +/// /// [`PageIndex`]: crate::file::page_index::index::PageIndex #[derive(Debug, Clone, Default, Eq, PartialEq)] pub struct RowSelection { @@ -118,10 +123,13 @@ impl RowSelection { let mut last_end = 0; for range in ranges { let len = range.end - range.start; + if len == 0 { + continue; + } match range.start.cmp(&last_end) { Ordering::Equal => match selectors.last_mut() { - Some(last) => last.row_count += len, + Some(last) => last.row_count = last.row_count.checked_add(len).unwrap(), None => selectors.push(RowSelector::select(len)), }, Ordering::Greater => { @@ -140,38 +148,6 @@ impl RowSelection { Self { selectors } } - /// Creates a [`RowSelection`] from a slice of uncombined `RowSelector`: - /// Like [skip(5),skip(5),read(10)]. - /// After combine will return [skip(10),read(10)] - /// # Note - /// [`RowSelection`] must be combined prior to use within offset_index or else the code will panic. 
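    // A hedged sketch of the replacement behaviour: combining now happens in the
    // `FromIterator<RowSelector>` impl added further below, which merges adjacent
    // selectors with the same `skip` flag and additionally drops zero-count
    // selectors, so e.g.
    // `RowSelection::from(vec![RowSelector::skip(0), RowSelector::select(2), RowSelector::skip(0), RowSelector::select(2)])`
    // is expected to collapse to `[RowSelector::select(4)]`.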
- fn from_selectors_and_combine(selectors: &[RowSelector]) -> Self { - if selectors.len() < 2 { - return Self { - selectors: Vec::from(selectors), - }; - } - let first = selectors.first().unwrap(); - let mut sum_rows = first.row_count; - let mut skip = first.skip; - let mut combined_result = vec![]; - - for s in selectors.iter().skip(1) { - if s.skip == skip { - sum_rows += s.row_count - } else { - add_selector(skip, sum_rows, &mut combined_result); - sum_rows = s.row_count; - skip = s.skip; - } - } - add_selector(skip, sum_rows, &mut combined_result); - - Self { - selectors: combined_result, - } - } - /// Given an offset index, return the byte ranges for all data pages selected by `self` /// /// This is useful for determining what byte ranges to fetch from underlying storage @@ -351,9 +327,7 @@ impl RowSelection { /// /// returned: NNNNNNNNYYNYN pub fn intersection(&self, other: &Self) -> Self { - Self { - selectors: intersect_row_selections(&self.selectors, &other.selectors), - } + intersect_row_selections(&self.selectors, &other.selectors) } /// Returns `true` if this [`RowSelection`] selects any rows @@ -443,7 +417,37 @@ impl RowSelection { impl From> for RowSelection { fn from(selectors: Vec) -> Self { - Self::from_selectors_and_combine(selectors.as_slice()) + selectors.into_iter().collect() + } +} + +impl FromIterator for RowSelection { + fn from_iter>(iter: T) -> Self { + let iter = iter.into_iter(); + + // Capacity before filter + let mut selectors = Vec::with_capacity(iter.size_hint().0); + + let mut filtered = iter.filter(|x| x.row_count != 0); + if let Some(x) = filtered.next() { + selectors.push(x); + } + + for s in filtered { + if s.row_count == 0 { + continue; + } + + // Combine consecutive selectors + let last = selectors.last_mut().unwrap(); + if last.skip == s.skip { + last.row_count = last.row_count.checked_add(s.row_count).unwrap(); + } else { + selectors.push(s) + } + } + + Self { selectors } } } @@ -465,64 +469,58 @@ impl From for VecDeque { /// other: NYNNNNNNY /// /// returned: NNNNNNNNYYNYN -fn intersect_row_selections(left: &[RowSelector], right: &[RowSelector]) -> Vec { - let mut res = Vec::with_capacity(left.len()); +fn intersect_row_selections(left: &[RowSelector], right: &[RowSelector]) -> RowSelection { let mut l_iter = left.iter().copied().peekable(); let mut r_iter = right.iter().copied().peekable(); - while let (Some(a), Some(b)) = (l_iter.peek_mut(), r_iter.peek_mut()) { - if a.row_count == 0 { - l_iter.next().unwrap(); - continue; - } - if b.row_count == 0 { - r_iter.next().unwrap(); - continue; - } - match (a.skip, b.skip) { - // Keep both ranges - (false, false) => { - if a.row_count < b.row_count { - res.push(RowSelector::select(a.row_count)); - b.row_count -= a.row_count; + let iter = std::iter::from_fn(move || { + loop { + let l = l_iter.peek_mut(); + let r = r_iter.peek_mut(); + + match (l, r) { + (Some(a), _) if a.row_count == 0 => { l_iter.next().unwrap(); - } else { - res.push(RowSelector::select(b.row_count)); - a.row_count -= b.row_count; - r_iter.next().unwrap(); } - } - // skip at least one - _ => { - if a.row_count < b.row_count { - res.push(RowSelector::skip(a.row_count)); - b.row_count -= a.row_count; - l_iter.next().unwrap(); - } else { - res.push(RowSelector::skip(b.row_count)); - a.row_count -= b.row_count; + (_, Some(b)) if b.row_count == 0 => { r_iter.next().unwrap(); } + (Some(l), Some(r)) => { + return match (l.skip, r.skip) { + // Keep both ranges + (false, false) => { + if l.row_count < r.row_count { + r.row_count -= l.row_count; 
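                            // `l` runs out first: keep the remainder of `r` and
                            // emit `l` (a `select` of l.row_count rows)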
+ l_iter.next() + } else { + l.row_count -= r.row_count; + r_iter.next() + } + } + // skip at least one + _ => { + if l.row_count < r.row_count { + let skip = l.row_count; + r.row_count -= l.row_count; + l_iter.next(); + Some(RowSelector::skip(skip)) + } else { + let skip = r.row_count; + l.row_count -= skip; + r_iter.next(); + Some(RowSelector::skip(skip)) + } + } + }; + } + (Some(_), None) => return l_iter.next(), + (None, Some(_)) => return r_iter.next(), + (None, None) => return None, } } - } - - if l_iter.peek().is_some() { - res.extend(l_iter); - } - if r_iter.peek().is_some() { - res.extend(r_iter); - } - res -} + }); -fn add_selector(skip: bool, sum_row: usize, combined_result: &mut Vec) { - let selector = if skip { - RowSelector::skip(sum_row) - } else { - RowSelector::select(sum_row) - }; - combined_result.push(selector); + iter.collect() } #[cfg(test)] @@ -770,40 +768,28 @@ mod tests { RowSelector::skip(4), ]); - assert_eq!(RowSelection::from_selectors_and_combine(&a), expected); - assert_eq!(RowSelection::from_selectors_and_combine(&b), expected); - assert_eq!(RowSelection::from_selectors_and_combine(&c), expected); + assert_eq!(RowSelection::from_iter(a), expected); + assert_eq!(RowSelection::from_iter(b), expected); + assert_eq!(RowSelection::from_iter(c), expected); } #[test] fn test_combine_2elements() { let a = vec![RowSelector::select(10), RowSelector::select(5)]; let a_expect = vec![RowSelector::select(15)]; - assert_eq!( - RowSelection::from_selectors_and_combine(&a).selectors, - a_expect - ); + assert_eq!(RowSelection::from_iter(a).selectors, a_expect); let b = vec![RowSelector::select(10), RowSelector::skip(5)]; let b_expect = vec![RowSelector::select(10), RowSelector::skip(5)]; - assert_eq!( - RowSelection::from_selectors_and_combine(&b).selectors, - b_expect - ); + assert_eq!(RowSelection::from_iter(b).selectors, b_expect); let c = vec![RowSelector::skip(10), RowSelector::select(5)]; let c_expect = vec![RowSelector::skip(10), RowSelector::select(5)]; - assert_eq!( - RowSelection::from_selectors_and_combine(&c).selectors, - c_expect - ); + assert_eq!(RowSelection::from_iter(c).selectors, c_expect); let d = vec![RowSelector::skip(10), RowSelector::skip(5)]; let d_expect = vec![RowSelector::skip(15)]; - assert_eq!( - RowSelection::from_selectors_and_combine(&d).selectors, - d_expect - ); + assert_eq!(RowSelection::from_iter(d).selectors, d_expect); } #[test] @@ -859,7 +845,7 @@ mod tests { let res = intersect_row_selections(&a, &b); assert_eq!( - RowSelection::from_selectors_and_combine(&res).selectors, + res.selectors, vec![ RowSelector::select(5), RowSelector::skip(4), @@ -877,7 +863,7 @@ mod tests { let b = vec![RowSelector::select(36), RowSelector::skip(36)]; let res = intersect_row_selections(&a, &b); assert_eq!( - RowSelection::from_selectors_and_combine(&res).selectors, + res.selectors, vec![RowSelector::select(3), RowSelector::skip(69)] ); @@ -892,7 +878,7 @@ mod tests { ]; let res = intersect_row_selections(&a, &b); assert_eq!( - RowSelection::from_selectors_and_combine(&res).selectors, + res.selectors, vec![RowSelector::select(2), RowSelector::skip(8)] ); @@ -906,7 +892,7 @@ mod tests { ]; let res = intersect_row_selections(&a, &b); assert_eq!( - RowSelection::from_selectors_and_combine(&res).selectors, + res.selectors, vec![RowSelector::select(2), RowSelector::skip(8)] ); } @@ -1142,4 +1128,70 @@ mod tests { // assert_eq!(mask, vec![false, true, true, false, true, true, true]); assert_eq!(ranges, vec![10..20, 20..30, 30..40]); } + + #[test] + fn 
test_empty_ranges() { + let ranges = [1..3, 4..6, 6..6, 8..8, 9..10]; + let selection = RowSelection::from_consecutive_ranges(ranges.into_iter(), 10); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(1), + RowSelector::select(2), + RowSelector::skip(1), + RowSelector::select(2), + RowSelector::skip(3), + RowSelector::select(1) + ] + ) + } + + #[test] + fn test_empty_selector() { + let selection = RowSelection::from(vec![ + RowSelector::skip(0), + RowSelector::select(2), + RowSelector::skip(0), + RowSelector::select(2), + ]); + assert_eq!(selection.selectors, vec![RowSelector::select(4)]); + + let selection = RowSelection::from(vec![ + RowSelector::select(0), + RowSelector::skip(2), + RowSelector::select(0), + RowSelector::skip(2), + ]); + assert_eq!(selection.selectors, vec![RowSelector::skip(4)]); + } + + #[test] + fn test_intersection() { + let selection = RowSelection::from(vec![RowSelector::select(1048576)]); + let result = selection.intersection(&selection); + assert_eq!(result, selection); + + let a = RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(20), + ]); + + let b = RowSelection::from(vec![ + RowSelector::skip(20), + RowSelector::select(20), + RowSelector::skip(10), + ]); + + let result = a.intersection(&b); + assert_eq!( + result.selectors, + vec![ + RowSelector::skip(30), + RowSelector::select(10), + RowSelector::skip(10) + ] + ); + } } From ffeda62fc9d6a182c1bf3b3212e676f74fc196df Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Wed, 8 Nov 2023 02:20:50 +1100 Subject: [PATCH 1335/1411] Parquet f32/f64 handle signed zeros in statistics (#5048) --- parquet/src/column/writer/encoder.rs | 29 ++++++- parquet/src/column/writer/mod.rs | 120 ++++++++++++++++++++++++++- parquet/src/data_type.rs | 4 +- 3 files changed, 147 insertions(+), 6 deletions(-) diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index 7bd4db30c3a8..5fd0f9e194d2 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::basic::Encoding; +use crate::basic::{Encoding, Type}; use crate::bloom_filter::Sbbf; use crate::column::writer::{ compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, update_min, @@ -308,5 +308,30 @@ where max = val; } } - Some((min.clone(), max.clone())) + + // Float/Double statistics have special case for zero. 
+ // + // If computed min is zero, whether negative or positive, + // the spec states that the min should be written as -0.0 + // (negative zero) + // + // For max, it has similar logic but will be written as 0.0 + // (positive zero) + let min = replace_zero(min, -0.0); + let max = replace_zero(max, 0.0); + + Some((min, max)) +} + +#[inline] +fn replace_zero(val: &T, replace: f32) -> T { + match T::PHYSICAL_TYPE { + Type::FLOAT if f32::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => { + T::try_from_le_slice(&f32::to_le_bytes(replace)).unwrap() + } + Type::DOUBLE if f64::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => { + T::try_from_le_slice(&f64::to_le_bytes(replace as f64)).unwrap() + } + _ => val.clone(), + } } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 84bf1911d89c..307804e7dc5c 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -2111,6 +2111,64 @@ mod tests { assert!(matches!(stats, Statistics::Float(_))); } + #[test] + fn test_float_statistics_zero_only() { + let stats = statistics_roundtrip::(&[0.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Float(stats) = stats { + assert_eq!(stats.min(), &-0.0); + assert!(stats.min().is_sign_negative()); + assert_eq!(stats.max(), &0.0); + assert!(stats.max().is_sign_positive()); + } else { + panic!("expecting Statistics::Float"); + } + } + + #[test] + fn test_float_statistics_neg_zero_only() { + let stats = statistics_roundtrip::(&[-0.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Float(stats) = stats { + assert_eq!(stats.min(), &-0.0); + assert!(stats.min().is_sign_negative()); + assert_eq!(stats.max(), &0.0); + assert!(stats.max().is_sign_positive()); + } else { + panic!("expecting Statistics::Float"); + } + } + + #[test] + fn test_float_statistics_zero_min() { + let stats = statistics_roundtrip::(&[0.0, 1.0, f32::NAN, 2.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Float(stats) = stats { + assert_eq!(stats.min(), &-0.0); + assert!(stats.min().is_sign_negative()); + assert_eq!(stats.max(), &2.0); + } else { + panic!("expecting Statistics::Float"); + } + } + + #[test] + fn test_float_statistics_neg_zero_max() { + let stats = statistics_roundtrip::(&[-0.0, -1.0, f32::NAN, -2.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Float(stats) = stats { + assert_eq!(stats.min(), &-2.0); + assert_eq!(stats.max(), &0.0); + assert!(stats.max().is_sign_positive()); + } else { + panic!("expecting Statistics::Float"); + } + } + #[test] fn test_double_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f64::NAN, 2.0]); @@ -2120,7 +2178,7 @@ mod tests { assert_eq!(stats.min(), &1.0); assert_eq!(stats.max(), &2.0); } else { - panic!("expecting Statistics::Float"); + panic!("expecting Statistics::Double"); } } @@ -2133,7 +2191,7 @@ mod tests { assert_eq!(stats.min(), &1.0); assert_eq!(stats.max(), &2.0); } else { - panic!("expecting Statistics::Float"); + panic!("expecting Statistics::Double"); } } @@ -2145,6 +2203,64 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); } + #[test] + fn test_double_statistics_zero_only() { + let stats = statistics_roundtrip::(&[0.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let 
Statistics::Double(stats) = stats { + assert_eq!(stats.min(), &-0.0); + assert!(stats.min().is_sign_negative()); + assert_eq!(stats.max(), &0.0); + assert!(stats.max().is_sign_positive()); + } else { + panic!("expecting Statistics::Double"); + } + } + + #[test] + fn test_double_statistics_neg_zero_only() { + let stats = statistics_roundtrip::(&[-0.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Double(stats) = stats { + assert_eq!(stats.min(), &-0.0); + assert!(stats.min().is_sign_negative()); + assert_eq!(stats.max(), &0.0); + assert!(stats.max().is_sign_positive()); + } else { + panic!("expecting Statistics::Double"); + } + } + + #[test] + fn test_double_statistics_zero_min() { + let stats = statistics_roundtrip::(&[0.0, 1.0, f64::NAN, 2.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Double(stats) = stats { + assert_eq!(stats.min(), &-0.0); + assert!(stats.min().is_sign_negative()); + assert_eq!(stats.max(), &2.0); + } else { + panic!("expecting Statistics::Double"); + } + } + + #[test] + fn test_double_statistics_neg_zero_max() { + let stats = statistics_roundtrip::(&[-0.0, -1.0, f64::NAN, -2.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Double(stats) = stats { + assert_eq!(stats.min(), &-2.0); + assert_eq!(stats.max(), &0.0); + assert!(stats.max().is_sign_positive()); + } else { + panic!("expecting Statistics::Double"); + } + } + #[test] fn test_compare_greater_byte_array_decimals() { assert!(!compare_greater_byte_array_decimals(&[], &[],),); diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 7e64478ed940..eaf4389d4350 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -632,7 +632,7 @@ pub(crate) mod private { /// Return the value as i64 if possible /// - /// This is essentially the same as `std::convert::TryInto` but can + /// This is essentially the same as `std::convert::TryInto` but can't be /// implemented for `f32` and `f64`, types that would fail orphan rules fn as_i64(&self) -> Result { Err(general_err!("Type cannot be converted to i64")) @@ -640,7 +640,7 @@ pub(crate) mod private { /// Return the value as u64 if possible /// - /// This is essentially the same as `std::convert::TryInto` but can + /// This is essentially the same as `std::convert::TryInto` but can't be /// implemented for `f32` and `f64`, types that would fail orphan rules fn as_u64(&self) -> Result { self.as_i64() From 1d1693777e13e0bc9e0b5326f1256d629278d4bd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 7 Nov 2023 18:49:24 +0000 Subject: [PATCH 1336/1411] Prepare arrow 49.0.0 (#5054) --- CHANGELOG-old.md | 80 ++++++++++++++++++++++++++ CHANGELOG.md | 98 +++++++++++--------------------- Cargo.toml | 32 +++++------ dev/release/update_change_log.sh | 4 +- 4 files changed, 131 insertions(+), 83 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index cde9b8f3b521..336adff990bd 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,86 @@ # Historical Changelog +## [48.0.0](https://github.com/apache/arrow-rs/tree/48.0.0) (2023-10-18) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/47.0.0...48.0.0) + +**Breaking changes:** + +- Evaluate null\_regex for string type in csv \(now such values will be parsed as `Null` rather than `""`\) 
[\#4942](https://github.com/apache/arrow-rs/pull/4942) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([haohuaijin](https://github.com/haohuaijin)) +- fix\(csv\)!: infer null for empty column. [\#4910](https://github.com/apache/arrow-rs/pull/4910) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) +- feat: log headers/trailers in flight CLI \(+ minor fixes\) [\#4898](https://github.com/apache/arrow-rs/pull/4898) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- fix\(arrow-json\)!: include null fields in schema inference with a type of Null [\#4894](https://github.com/apache/arrow-rs/pull/4894) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) +- Mark OnCloseRowGroup Send [\#4893](https://github.com/apache/arrow-rs/pull/4893) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([devinjdangelo](https://github.com/devinjdangelo)) +- Specialize Thrift Decoding \(~40% Faster\) \(\#4891\) [\#4892](https://github.com/apache/arrow-rs/pull/4892) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Make ArrowRowGroupWriter Public and SerializedRowGroupWriter Send [\#4850](https://github.com/apache/arrow-rs/pull/4850) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([devinjdangelo](https://github.com/devinjdangelo)) + +**Implemented enhancements:** + +- Allow schema fields to merge with `Null` datatype [\#4901](https://github.com/apache/arrow-rs/issues/4901) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add option to FlightDataEncoder to always send dictionaries [\#4895](https://github.com/apache/arrow-rs/issues/4895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Rework Thrift Encoding / Decoding of Parquet Metadata [\#4891](https://github.com/apache/arrow-rs/issues/4891) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Plans for supporting Extension Array to support Fixed shape tensor Array [\#4890](https://github.com/apache/arrow-rs/issues/4890) +- Implement Take for UnionArray [\#4882](https://github.com/apache/arrow-rs/issues/4882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Check precision overflow for casting floating to decimal [\#4865](https://github.com/apache/arrow-rs/issues/4865) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Replace lexical [\#4774](https://github.com/apache/arrow-rs/issues/4774) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add read access to settings in `csv::WriterBuilder` [\#4735](https://github.com/apache/arrow-rs/issues/4735) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve the performance of "DictionaryValue" row encoding [\#4712](https://github.com/apache/arrow-rs/issues/4712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Fixed bugs:** + +- Should we make blank values and empty string to `None` in csv? 
[\#4939](https://github.com/apache/arrow-rs/issues/4939) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[FlightSQL\] SubstraitPlan structure is not exported [\#4932](https://github.com/apache/arrow-rs/issues/4932) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Loading page index breaks skipping of pages with nested types [\#4921](https://github.com/apache/arrow-rs/issues/4921) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- CSV schema inference assumes `Utf8` for empty columns [\#4903](https://github.com/apache/arrow-rs/issues/4903) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet: Field Ids are not read from a Parquet file without serialized arrow schema [\#4877](https://github.com/apache/arrow-rs/issues/4877) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- make\_primitive\_scalar function loses DataType Internal information [\#4851](https://github.com/apache/arrow-rs/issues/4851) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- StructBuilder doesn't handle nulls correctly for empty structs [\#4842](https://github.com/apache/arrow-rs/issues/4842) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `NullArray::is_null()` returns `false` incorrectly [\#4835](https://github.com/apache/arrow-rs/issues/4835) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- cast\_string\_to\_decimal should check precision overflow [\#4829](https://github.com/apache/arrow-rs/issues/4829) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Null fields are omitted by `infer_json_schema_from_seekable` [\#4814](https://github.com/apache/arrow-rs/issues/4814) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Support for reading JSON Array to Arrow [\#4905](https://github.com/apache/arrow-rs/issues/4905) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Assume Pages Delimit Records When Offset Index Loaded \(\#4921\) [\#4943](https://github.com/apache/arrow-rs/pull/4943) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update pyo3 requirement from 0.19 to 0.20 [\#4941](https://github.com/apache/arrow-rs/pull/4941) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Add `FileWriter` schema getter [\#4940](https://github.com/apache/arrow-rs/pull/4940) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([haixuanTao](https://github.com/haixuanTao)) +- feat: support parsing for parquet writer option [\#4938](https://github.com/apache/arrow-rs/pull/4938) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([fansehep](https://github.com/fansehep)) +- Export `SubstraitPlan` structure in arrow\_flight::sql \(\#4932\) [\#4933](https://github.com/apache/arrow-rs/pull/4933) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([amartins23](https://github.com/amartins23)) +- Update zstd requirement from 0.12.0 to 0.13.0 [\#4923](https://github.com/apache/arrow-rs/pull/4923) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat: add method for async read bloom filter 
[\#4917](https://github.com/apache/arrow-rs/pull/4917) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hengfeiyang](https://github.com/hengfeiyang)) +- Minor: Clarify rationale for `FlightDataEncoder` API, add examples [\#4916](https://github.com/apache/arrow-rs/pull/4916) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Update regex-syntax requirement from 0.7.1 to 0.8.0 [\#4914](https://github.com/apache/arrow-rs/pull/4914) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat: document & streamline flight SQL CLI [\#4912](https://github.com/apache/arrow-rs/pull/4912) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Support Arbitrary JSON values in JSON Reader \(\#4905\) [\#4911](https://github.com/apache/arrow-rs/pull/4911) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup CSV WriterBuilder, Default to AutoSI Second Precision \(\#4735\) [\#4909](https://github.com/apache/arrow-rs/pull/4909) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.68 to =1.0.69 [\#4907](https://github.com/apache/arrow-rs/pull/4907) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- chore: add csv example [\#4904](https://github.com/apache/arrow-rs/pull/4904) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fansehep](https://github.com/fansehep)) +- feat\(schema\): allow null fields to be merged with other datatypes [\#4902](https://github.com/apache/arrow-rs/pull/4902) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) +- Update proc-macro2 requirement from =1.0.67 to =1.0.68 [\#4900](https://github.com/apache/arrow-rs/pull/4900) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add option to `FlightDataEncoder` to always resend batch dictionaries [\#4896](https://github.com/apache/arrow-rs/pull/4896) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- Fix integration tests [\#4889](https://github.com/apache/arrow-rs/pull/4889) ([tustvold](https://github.com/tustvold)) +- Support Parsing Avro File Headers [\#4888](https://github.com/apache/arrow-rs/pull/4888) ([tustvold](https://github.com/tustvold)) +- Support parquet bloom filter length [\#4885](https://github.com/apache/arrow-rs/pull/4885) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([letian-jiang](https://github.com/letian-jiang)) +- Replace lz4 with lz4\_flex Allowing Compilation for WASM [\#4884](https://github.com/apache/arrow-rs/pull/4884) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement Take for UnionArray 
[\#4883](https://github.com/apache/arrow-rs/pull/4883) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Update tonic-build requirement from =0.10.1 to =0.10.2 [\#4881](https://github.com/apache/arrow-rs/pull/4881) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- parquet: Read field IDs from Parquet Schema [\#4878](https://github.com/apache/arrow-rs/pull/4878) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Samrose-Ahmed](https://github.com/Samrose-Ahmed)) +- feat: improve flight CLI error handling [\#4873](https://github.com/apache/arrow-rs/pull/4873) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Support Encoding Parquet Columns in Parallel [\#4871](https://github.com/apache/arrow-rs/pull/4871) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Check precision overflow for casting floating to decimal [\#4866](https://github.com/apache/arrow-rs/pull/4866) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Make align\_buffers as public API [\#4863](https://github.com/apache/arrow-rs/pull/4863) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Enable new integration tests \(\#4828\) [\#4862](https://github.com/apache/arrow-rs/pull/4862) ([tustvold](https://github.com/tustvold)) +- Faster Serde Integration \(~80% faster\) [\#4861](https://github.com/apache/arrow-rs/pull/4861) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- fix: make\_primitive\_scalar bug [\#4852](https://github.com/apache/arrow-rs/pull/4852) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JasonLi-cn](https://github.com/JasonLi-cn)) +- Update tonic-build requirement from =0.10.0 to =0.10.1 [\#4846](https://github.com/apache/arrow-rs/pull/4846) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Allow Constructing Non-Empty StructArray with no Fields \(\#4842\) [\#4845](https://github.com/apache/arrow-rs/pull/4845) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Refine documentation to `Array::is_null` [\#4838](https://github.com/apache/arrow-rs/pull/4838) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- fix: add missing precision overflow checking for `cast_string_to_decimal` [\#4830](https://github.com/apache/arrow-rs/pull/4830) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonahgao](https://github.com/jonahgao)) ## [47.0.0](https://github.com/apache/arrow-rs/tree/47.0.0) (2023-09-19) [Full Changelog](https://github.com/apache/arrow-rs/compare/46.0.0...47.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c5351708c0b..ba27d6679ffe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,86 +19,54 @@ # Changelog -## [48.0.0](https://github.com/apache/arrow-rs/tree/48.0.0) (2023-10-18) +## [49.0.0](https://github.com/apache/arrow-rs/tree/49.0.0) (2023-11-07) -[Full 
Changelog](https://github.com/apache/arrow-rs/compare/47.0.0...48.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/48.0.0...49.0.0) **Breaking changes:** -- Evaluate null\_regex for string type in csv \(now such values will be parsed as `Null` rather than `""`\) [\#4942](https://github.com/apache/arrow-rs/pull/4942) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([haohuaijin](https://github.com/haohuaijin)) -- fix\(csv\)!: infer null for empty column. [\#4910](https://github.com/apache/arrow-rs/pull/4910) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) -- feat: log headers/trailers in flight CLI \(+ minor fixes\) [\#4898](https://github.com/apache/arrow-rs/pull/4898) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) -- fix\(arrow-json\)!: include null fields in schema inference with a type of Null [\#4894](https://github.com/apache/arrow-rs/pull/4894) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) -- Mark OnCloseRowGroup Send [\#4893](https://github.com/apache/arrow-rs/pull/4893) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([devinjdangelo](https://github.com/devinjdangelo)) -- Specialize Thrift Decoding \(~40% Faster\) \(\#4891\) [\#4892](https://github.com/apache/arrow-rs/pull/4892) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Make ArrowRowGroupWriter Public and SerializedRowGroupWriter Send [\#4850](https://github.com/apache/arrow-rs/pull/4850) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([devinjdangelo](https://github.com/devinjdangelo)) +- Return row count when inferring schema from JSON [\#5008](https://github.com/apache/arrow-rs/pull/5008) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([asayers](https://github.com/asayers)) +- Update object\_store 0.8.0 [\#5043](https://github.com/apache/arrow-rs/pull/5043) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- Allow schema fields to merge with `Null` datatype [\#4901](https://github.com/apache/arrow-rs/issues/4901) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add option to FlightDataEncoder to always send dictionaries [\#4895](https://github.com/apache/arrow-rs/issues/4895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Rework Thrift Encoding / Decoding of Parquet Metadata [\#4891](https://github.com/apache/arrow-rs/issues/4891) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Plans for supporting Extension Array to support Fixed shape tensor Array [\#4890](https://github.com/apache/arrow-rs/issues/4890) -- Implement Take for UnionArray [\#4882](https://github.com/apache/arrow-rs/issues/4882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Check precision overflow for casting floating to decimal [\#4865](https://github.com/apache/arrow-rs/issues/4865) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Replace lexical [\#4774](https://github.com/apache/arrow-rs/issues/4774) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add read access to settings in `csv::WriterBuilder` 
[\#4735](https://github.com/apache/arrow-rs/issues/4735) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Improve the performance of "DictionaryValue" row encoding [\#4712](https://github.com/apache/arrow-rs/issues/4712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Cast from integer/timestamp to timestamp/integer [\#5039](https://github.com/apache/arrow-rs/issues/5039) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support casting from integer to binary [\#5014](https://github.com/apache/arrow-rs/issues/5014) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Return row count when inferring schema from JSON [\#5007](https://github.com/apache/arrow-rs/issues/5007) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[FlightSQL\] Allow custom commands in get-flight-info [\#4996](https://github.com/apache/arrow-rs/issues/4996) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support `RecordBatch::remove_column()` and `Schema::remove_field()` [\#4952](https://github.com/apache/arrow-rs/issues/4952) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `arrow_json`: support `binary` deserialization [\#4945](https://github.com/apache/arrow-rs/issues/4945) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support StructArray in Cast Kernel [\#4908](https://github.com/apache/arrow-rs/issues/4908) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- There exists a `ParquetRecordWriter` proc macro in `parquet_derive`, but `ParquetRecordReader` is missing [\#4772](https://github.com/apache/arrow-rs/issues/4772) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Fixed bugs:** -- Should we make blank values and empty string to `None` in csv? 
[\#4939](https://github.com/apache/arrow-rs/issues/4939) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- \[FlightSQL\] SubstraitPlan structure is not exported [\#4932](https://github.com/apache/arrow-rs/issues/4932) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Loading page index breaks skipping of pages with nested types [\#4921](https://github.com/apache/arrow-rs/issues/4921) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- CSV schema inference assumes `Utf8` for empty columns [\#4903](https://github.com/apache/arrow-rs/issues/4903) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- parquet: Field Ids are not read from a Parquet file without serialized arrow schema [\#4877](https://github.com/apache/arrow-rs/issues/4877) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- make\_primitive\_scalar function loses DataType Internal information [\#4851](https://github.com/apache/arrow-rs/issues/4851) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- StructBuilder doesn't handle nulls correctly for empty structs [\#4842](https://github.com/apache/arrow-rs/issues/4842) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `NullArray::is_null()` returns `false` incorrectly [\#4835](https://github.com/apache/arrow-rs/issues/4835) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- cast\_string\_to\_decimal should check precision overflow [\#4829](https://github.com/apache/arrow-rs/issues/4829) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Null fields are omitted by `infer_json_schema_from_seekable` [\#4814](https://github.com/apache/arrow-rs/issues/4814) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Regression when serializing large json numbers [\#5038](https://github.com/apache/arrow-rs/issues/5038) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RowSelection::intersection Produces Invalid RowSelection [\#5036](https://github.com/apache/arrow-rs/issues/5036) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Incorrect comment on arrow::compute::kernels::sort::sort\_to\_indices [\#5029](https://github.com/apache/arrow-rs/issues/5029) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -**Closed issues:** +**Documentation updates:** -- Support for reading JSON Array to Arrow [\#4905](https://github.com/apache/arrow-rs/issues/4905) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- chore: Update docs to refer to non deprecated function \(`partition`\) [\#5027](https://github.com/apache/arrow-rs/pull/5027) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) **Merged pull requests:** -- Assume Pages Delimit Records When Offset Index Loaded \(\#4921\) [\#4943](https://github.com/apache/arrow-rs/pull/4943) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Update pyo3 requirement from 0.19 to 0.20 [\#4941](https://github.com/apache/arrow-rs/pull/4941) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) -- Add `FileWriter` schema getter [\#4940](https://github.com/apache/arrow-rs/pull/4940) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([haixuanTao](https://github.com/haixuanTao)) -- feat: support parsing for parquet writer option [\#4938](https://github.com/apache/arrow-rs/pull/4938) 
[[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([fansehep](https://github.com/fansehep)) -- Export `SubstraitPlan` structure in arrow\_flight::sql \(\#4932\) [\#4933](https://github.com/apache/arrow-rs/pull/4933) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([amartins23](https://github.com/amartins23)) -- Update zstd requirement from 0.12.0 to 0.13.0 [\#4923](https://github.com/apache/arrow-rs/pull/4923) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- feat: add method for async read bloom filter [\#4917](https://github.com/apache/arrow-rs/pull/4917) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hengfeiyang](https://github.com/hengfeiyang)) -- Minor: Clarify rationale for `FlightDataEncoder` API, add examples [\#4916](https://github.com/apache/arrow-rs/pull/4916) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Update regex-syntax requirement from 0.7.1 to 0.8.0 [\#4914](https://github.com/apache/arrow-rs/pull/4914) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- feat: document & streamline flight SQL CLI [\#4912](https://github.com/apache/arrow-rs/pull/4912) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) -- Support Arbitrary JSON values in JSON Reader \(\#4905\) [\#4911](https://github.com/apache/arrow-rs/pull/4911) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Cleanup CSV WriterBuilder, Default to AutoSI Second Precision \(\#4735\) [\#4909](https://github.com/apache/arrow-rs/pull/4909) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update proc-macro2 requirement from =1.0.68 to =1.0.69 [\#4907](https://github.com/apache/arrow-rs/pull/4907) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- chore: add csv example [\#4904](https://github.com/apache/arrow-rs/pull/4904) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fansehep](https://github.com/fansehep)) -- feat\(schema\): allow null fields to be merged with other datatypes [\#4902](https://github.com/apache/arrow-rs/pull/4902) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) -- Update proc-macro2 requirement from =1.0.67 to =1.0.68 [\#4900](https://github.com/apache/arrow-rs/pull/4900) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Add option to `FlightDataEncoder` to always resend batch dictionaries [\#4896](https://github.com/apache/arrow-rs/pull/4896) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) -- Fix integration tests 
[\#4889](https://github.com/apache/arrow-rs/pull/4889) ([tustvold](https://github.com/tustvold)) -- Support Parsing Avro File Headers [\#4888](https://github.com/apache/arrow-rs/pull/4888) ([tustvold](https://github.com/tustvold)) -- Support parquet bloom filter length [\#4885](https://github.com/apache/arrow-rs/pull/4885) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([letian-jiang](https://github.com/letian-jiang)) -- Replace lz4 with lz4\_flex Allowing Compilation for WASM [\#4884](https://github.com/apache/arrow-rs/pull/4884) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Implement Take for UnionArray [\#4883](https://github.com/apache/arrow-rs/pull/4883) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([avantgardnerio](https://github.com/avantgardnerio)) -- Update tonic-build requirement from =0.10.1 to =0.10.2 [\#4881](https://github.com/apache/arrow-rs/pull/4881) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- parquet: Read field IDs from Parquet Schema [\#4878](https://github.com/apache/arrow-rs/pull/4878) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Samrose-Ahmed](https://github.com/Samrose-Ahmed)) -- feat: improve flight CLI error handling [\#4873](https://github.com/apache/arrow-rs/pull/4873) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) -- Support Encoding Parquet Columns in Parallel [\#4871](https://github.com/apache/arrow-rs/pull/4871) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Check precision overflow for casting floating to decimal [\#4866](https://github.com/apache/arrow-rs/pull/4866) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Make align\_buffers as public API [\#4863](https://github.com/apache/arrow-rs/pull/4863) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Enable new integration tests \(\#4828\) [\#4862](https://github.com/apache/arrow-rs/pull/4862) ([tustvold](https://github.com/tustvold)) -- Faster Serde Integration \(~80% faster\) [\#4861](https://github.com/apache/arrow-rs/pull/4861) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- fix: make\_primitive\_scalar bug [\#4852](https://github.com/apache/arrow-rs/pull/4852) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JasonLi-cn](https://github.com/JasonLi-cn)) -- Update tonic-build requirement from =0.10.0 to =0.10.1 [\#4846](https://github.com/apache/arrow-rs/pull/4846) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Allow Constructing Non-Empty StructArray with no Fields \(\#4842\) [\#4845](https://github.com/apache/arrow-rs/pull/4845) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Refine documentation to `Array::is_null` [\#4838](https://github.com/apache/arrow-rs/pull/4838) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([alamb](https://github.com/alamb)) -- fix: add missing precision overflow checking for `cast_string_to_decimal` [\#4830](https://github.com/apache/arrow-rs/pull/4830) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonahgao](https://github.com/jonahgao)) +- Parquet f32/f64 handle signed zeros in statistics [\#5048](https://github.com/apache/arrow-rs/pull/5048) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) +- Fix serialization of large integers in JSON \(\#5038\) [\#5042](https://github.com/apache/arrow-rs/pull/5042) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix RowSelection::intersection \(\#5036\) [\#5041](https://github.com/apache/arrow-rs/pull/5041) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Cast from integer/timestamp to timestamp/integer [\#5040](https://github.com/apache/arrow-rs/pull/5040) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- doc: update comment on sort\_to\_indices to reflect correct ordering [\#5033](https://github.com/apache/arrow-rs/pull/5033) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([westonpace](https://github.com/westonpace)) +- Support casting from integer to binary [\#5015](https://github.com/apache/arrow-rs/pull/5015) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update tracing-log requirement from 0.1 to 0.2 [\#4998](https://github.com/apache/arrow-rs/pull/4998) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat\(flight-sql\): Allow custom commands in get-flight-info [\#4997](https://github.com/apache/arrow-rs/pull/4997) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([amartins23](https://github.com/amartins23)) +- \[MINOR\] No need to jump to web pages [\#4994](https://github.com/apache/arrow-rs/pull/4994) ([smallzhongfeng](https://github.com/smallzhongfeng)) +- Support metadata in SchemaBuilder [\#4987](https://github.com/apache/arrow-rs/pull/4987) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: support schema change by idx and reverse [\#4985](https://github.com/apache/arrow-rs/pull/4985) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fansehep](https://github.com/fansehep)) +- Bump actions/setup-node from 3 to 4 [\#4982](https://github.com/apache/arrow-rs/pull/4982) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add arrow\_cast::base64 and document usage in arrow\_json [\#4975](https://github.com/apache/arrow-rs/pull/4975) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add SchemaBuilder::remove \(\#4952\) [\#4964](https://github.com/apache/arrow-rs/pull/4964) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add `Field::remove()`, `Schema::remove()`, and `RecordBatch::remove_column()` APIs [\#4959](https://github.com/apache/arrow-rs/pull/4959) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Folyd](https://github.com/Folyd)) +- Add `RecordReader` trait and proc macro to implement it for a struct 
[\#4773](https://github.com/apache/arrow-rs/pull/4773) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Joseph-Rance](https://github.com/Joseph-Rance)) diff --git a/Cargo.toml b/Cargo.toml index d59a5af68a19..d5e834316b91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,7 +62,7 @@ exclude = [ ] [workspace.package] -version = "48.0.0" +version = "49.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -77,20 +77,20 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "48.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "48.0.0", path = "./arrow-arith" } -arrow-array = { version = "48.0.0", path = "./arrow-array" } -arrow-buffer = { version = "48.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "48.0.0", path = "./arrow-cast" } -arrow-csv = { version = "48.0.0", path = "./arrow-csv" } -arrow-data = { version = "48.0.0", path = "./arrow-data" } -arrow-ipc = { version = "48.0.0", path = "./arrow-ipc" } -arrow-json = { version = "48.0.0", path = "./arrow-json" } -arrow-ord = { version = "48.0.0", path = "./arrow-ord" } -arrow-row = { version = "48.0.0", path = "./arrow-row" } -arrow-schema = { version = "48.0.0", path = "./arrow-schema" } -arrow-select = { version = "48.0.0", path = "./arrow-select" } -arrow-string = { version = "48.0.0", path = "./arrow-string" } -parquet = { version = "48.0.0", path = "./parquet", default-features = false } +arrow = { version = "49.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "49.0.0", path = "./arrow-arith" } +arrow-array = { version = "49.0.0", path = "./arrow-array" } +arrow-buffer = { version = "49.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "49.0.0", path = "./arrow-cast" } +arrow-csv = { version = "49.0.0", path = "./arrow-csv" } +arrow-data = { version = "49.0.0", path = "./arrow-data" } +arrow-ipc = { version = "49.0.0", path = "./arrow-ipc" } +arrow-json = { version = "49.0.0", path = "./arrow-json" } +arrow-ord = { version = "49.0.0", path = "./arrow-ord" } +arrow-row = { version = "49.0.0", path = "./arrow-row" } +arrow-schema = { version = "49.0.0", path = "./arrow-schema" } +arrow-select = { version = "49.0.0", path = "./arrow-select" } +arrow-string = { version = "49.0.0", path = "./arrow-string" } +parquet = { version = "49.0.0", path = "./parquet", default-features = false } chrono = { version = "0.4.31", default-features = false, features = ["clock"] } diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index c1627ebb8cf2..0c40d91a3edd 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="47.0.0" -FUTURE_RELEASE="48.0.0" +SINCE_TAG="48.0.0" +FUTURE_RELEASE="49.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 747dcbf0670aeab2ede474edb3c4f22028d6a7e6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 7 Nov 2023 21:16:15 +0000 Subject: [PATCH 1337/1411] Update parquet encoding docs (#5053) * Update parquet encoding docs * Review feedback --- parquet/README.md | 2 +- parquet/src/basic.rs | 29 ++++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/parquet/README.md b/parquet/README.md index 86c7ee2c35d0..2e0ab1d52c30 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ 
-55,7 +55,7 @@ The `parquet` crate provides the following features which may be enabled in your ## Parquet Feature Status -- [x] All encodings supported +- [x] All encodings supported, except for BYTE_STREAM_SPLIT ([#4102](https://github.com/apache/arrow-rs/issues/4102)) - [x] All compression codecs supported - [x] Read support - [x] Primitive column value readers diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index ab71aa44169b..3c8602b8022b 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -215,8 +215,21 @@ pub enum Repetition { // Mirrors `parquet::Encoding` /// Encodings supported by Parquet. +/// /// Not all encodings are valid for all types. These enums are also used to specify the /// encoding of definition and repetition levels. +/// +/// By default this crate uses [Encoding::PLAIN], [Encoding::RLE], and [Encoding::RLE_DICTIONARY]. +/// These provide very good encode and decode performance, whilst yielding reasonable storage +/// efficiency and being supported by all major parquet readers. +/// +/// The delta encodings are also supported and will be used if a newer [WriterVersion] is +/// configured; however, these sacrifice encode and decode performance for +/// improved storage efficiency. This performance regression is particularly pronounced in the case +/// of record skipping, as occurs during predicate push-down. It is recommended that users assess the +/// performance impact when evaluating these encodings. +/// +/// [WriterVersion]: crate::file::properties::WriterVersion #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)] #[allow(non_camel_case_types)] pub enum Encoding { @@ -303,7 +316,21 @@ impl FromStr for Encoding { // ---------------------------------------------------------------------- // Mirrors `parquet::CompressionCodec` -/// Supported compression algorithms. +/// Supported block compression algorithms. +/// +/// Block compression can yield non-trivial improvements to storage efficiency at the expense +/// of potentially significantly worse encode and decode performance. Many applications, +/// especially those making use of high-throughput and low-cost commodity object storage, +/// may find storage efficiency less important than decode throughput, and therefore may +/// prefer not to use block compression. +/// +/// The writers in this crate default to no block compression for this reason. +/// +/// Applications that still wish to use block compression will find [`Compression::ZSTD`] +/// to provide a good balance of compression, performance, and ecosystem support. Alternatively, +/// [`Compression::LZ4_RAW`] provides much faster decompression speeds, at the cost of typically +/// worse compression ratios. However, it is not as widely supported, with the +/// Hadoop ecosystem historically favoring the non-standard and now deprecated [`Compression::LZ4`]. 
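To make the encoding and compression guidance above concrete, the following is a minimal sketch, not part of this patch, of how a writer might be configured through the crate's `WriterProperties` builder; the choice of `PARQUET_2_0` and `SNAPPY` here is purely illustrative.

```rust
use parquet::basic::Compression;
use parquet::file::properties::{WriterProperties, WriterVersion};

fn main() {
    // Selecting the 2.0 writer version allows the writer to use the delta
    // encodings described above; block compression stays off unless enabled.
    let _props = WriterProperties::builder()
        .set_writer_version(WriterVersion::PARQUET_2_0)
        // SNAPPY is enabled here only to illustrate opting in to block
        // compression; other codecs are configured the same way.
        .set_compression(Compression::SNAPPY)
        .build();
}
```

The resulting properties are then handed to the crate's file or Arrow writers in the usual way.
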
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[allow(non_camel_case_types)] pub enum Compression { From 1635f5bfdd2a6c53ab52555721dced8457e22ca6 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Wed, 8 Nov 2023 22:24:26 +1100 Subject: [PATCH 1338/1411] Remove ByteBufferPtr and replace with Bytes (#5055) * Replace usages of ByteBufferPtr with Bytes * Remove parquet memory.rs module --- parquet/src/arrow/array_reader/byte_array.rs | 22 +- .../array_reader/byte_array_dictionary.rs | 8 +- .../array_reader/fixed_len_byte_array.rs | 10 +- parquet/src/arrow/array_reader/test_util.rs | 8 +- parquet/src/arrow/arrow_writer/byte_array.rs | 6 +- parquet/src/arrow/arrow_writer/mod.rs | 2 +- parquet/src/arrow/decoder/delta_byte_array.rs | 11 +- parquet/src/arrow/decoder/dictionary_index.rs | 7 +- .../arrow/record_reader/definition_levels.rs | 14 +- parquet/src/column/page.rs | 27 ++- parquet/src/column/reader.rs | 32 +-- parquet/src/column/reader/decoder.rs | 27 +-- parquet/src/column/writer/encoder.rs | 7 +- parquet/src/column/writer/mod.rs | 31 ++- parquet/src/data_type.rs | 57 +++-- parquet/src/encodings/decoding.rs | 228 +++++------------- .../src/encodings/encoding/dict_encoder.rs | 11 +- parquet/src/encodings/encoding/mod.rs | 32 ++- parquet/src/encodings/rle.rs | 77 +++--- parquet/src/file/serialized_reader.rs | 10 +- parquet/src/file/writer.rs | 27 +-- parquet/src/lib.rs | 4 - parquet/src/util/bit_util.rs | 41 ++-- parquet/src/util/memory.rs | 149 ------------ parquet/src/util/mod.rs | 1 - parquet/src/util/test_common/page_util.rs | 15 +- parquet/src/util/test_common/rand_gen.rs | 4 +- 27 files changed, 307 insertions(+), 561 deletions(-) delete mode 100644 parquet/src/util/memory.rs diff --git a/parquet/src/arrow/array_reader/byte_array.rs b/parquet/src/arrow/array_reader/byte_array.rs index 4612f816146a..01666c0af4e6 100644 --- a/parquet/src/arrow/array_reader/byte_array.rs +++ b/parquet/src/arrow/array_reader/byte_array.rs @@ -29,12 +29,12 @@ use crate::data_type::Int32Type; use crate::encodings::decoding::{Decoder, DeltaBitPackDecoder}; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; -use crate::util::memory::ByteBufferPtr; use arrow_array::{ Array, ArrayRef, BinaryArray, Decimal128Array, Decimal256Array, OffsetSizeTrait, }; use arrow_buffer::{i256, Buffer}; use arrow_schema::DataType as ArrowType; +use bytes::Bytes; use std::any::Any; use std::ops::Range; use std::sync::Arc; @@ -189,7 +189,7 @@ impl ColumnValueDecoder fn set_dict( &mut self, - buf: ByteBufferPtr, + buf: Bytes, num_values: u32, encoding: Encoding, _is_sorted: bool, @@ -219,7 +219,7 @@ impl ColumnValueDecoder fn set_data( &mut self, encoding: Encoding, - data: ByteBufferPtr, + data: Bytes, num_levels: usize, num_values: Option, ) -> Result<()> { @@ -263,7 +263,7 @@ pub enum ByteArrayDecoder { impl ByteArrayDecoder { pub fn new( encoding: Encoding, - data: ByteBufferPtr, + data: Bytes, num_levels: usize, num_values: Option, validate_utf8: bool, @@ -339,7 +339,7 @@ impl ByteArrayDecoder { /// Decoder from [`Encoding::PLAIN`] data to [`OffsetBuffer`] pub struct ByteArrayDecoderPlain { - buf: ByteBufferPtr, + buf: Bytes, offset: usize, validate_utf8: bool, @@ -350,7 +350,7 @@ pub struct ByteArrayDecoderPlain { impl ByteArrayDecoderPlain { pub fn new( - buf: ByteBufferPtr, + buf: Bytes, num_levels: usize, num_values: Option, validate_utf8: bool, @@ -438,16 +438,16 @@ impl ByteArrayDecoderPlain { /// Decoder from [`Encoding::DELTA_LENGTH_BYTE_ARRAY`] data 
to [`OffsetBuffer`] pub struct ByteArrayDecoderDeltaLength { lengths: Vec, - data: ByteBufferPtr, + data: Bytes, length_offset: usize, data_offset: usize, validate_utf8: bool, } impl ByteArrayDecoderDeltaLength { - fn new(data: ByteBufferPtr, validate_utf8: bool) -> Result { + fn new(data: Bytes, validate_utf8: bool) -> Result { let mut len_decoder = DeltaBitPackDecoder::::new(); - len_decoder.set_data(data.all(), 0)?; + len_decoder.set_data(data.clone(), 0)?; let values = len_decoder.values_left(); let mut lengths = vec![0; values]; @@ -522,7 +522,7 @@ pub struct ByteArrayDecoderDelta { } impl ByteArrayDecoderDelta { - fn new(data: ByteBufferPtr, validate_utf8: bool) -> Result { + fn new(data: Bytes, validate_utf8: bool) -> Result { Ok(Self { decoder: DeltaByteArrayDecoder::new(data)?, validate_utf8, @@ -558,7 +558,7 @@ pub struct ByteArrayDecoderDictionary { } impl ByteArrayDecoderDictionary { - fn new(data: ByteBufferPtr, num_levels: usize, num_values: Option) -> Self { + fn new(data: Bytes, num_levels: usize, num_values: Option) -> Self { Self { decoder: DictIndexDecoder::new(data, num_levels, num_values), } diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs index 841f5a95fd4e..0d216fa08327 100644 --- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs +++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs @@ -23,6 +23,7 @@ use std::sync::Arc; use arrow_array::{Array, ArrayRef, OffsetSizeTrait}; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_schema::DataType as ArrowType; +use bytes::Bytes; use crate::arrow::array_reader::byte_array::{ByteArrayDecoder, ByteArrayDecoderPlain}; use crate::arrow::array_reader::{read_records, skip_records, ArrayReader}; @@ -39,7 +40,6 @@ use crate::encodings::rle::RleDecoder; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::FromBytes; -use crate::util::memory::ByteBufferPtr; /// A macro to reduce verbosity of [`make_byte_array_dictionary_reader`] macro_rules! 
make_reader { @@ -253,7 +253,7 @@ where fn set_dict( &mut self, - buf: ByteBufferPtr, + buf: Bytes, num_values: u32, encoding: Encoding, _is_sorted: bool, @@ -286,7 +286,7 @@ where fn set_data( &mut self, encoding: Encoding, - data: ByteBufferPtr, + data: Bytes, num_levels: usize, num_values: Option, ) -> Result<()> { @@ -294,7 +294,7 @@ where Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => { let bit_width = data[0]; let mut decoder = RleDecoder::new(bit_width); - decoder.set_data(data.start_from(1)); + decoder.set_data(data.slice(1..)); MaybeDictionaryDecoder::Dict { decoder, max_remaining_values: num_values.unwrap_or(num_levels), diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index b06091b6b57a..3b1a50ebcce8 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -26,7 +26,6 @@ use crate::column::page::PageIterator; use crate::column::reader::decoder::{ColumnValueDecoder, ValuesBufferSlice}; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; -use crate::util::memory::ByteBufferPtr; use arrow_array::{ ArrayRef, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, IntervalDayTimeArray, IntervalYearMonthArray, @@ -34,6 +33,7 @@ use arrow_array::{ use arrow_buffer::{i256, Buffer}; use arrow_data::ArrayDataBuilder; use arrow_schema::{DataType as ArrowType, IntervalUnit}; +use bytes::Bytes; use std::any::Any; use std::ops::Range; use std::sync::Arc; @@ -298,7 +298,7 @@ impl ValuesBuffer for FixedLenByteArrayBuffer { struct ValueDecoder { byte_length: usize, - dict_page: Option, + dict_page: Option, decoder: Option, } @@ -315,7 +315,7 @@ impl ColumnValueDecoder for ValueDecoder { fn set_dict( &mut self, - buf: ByteBufferPtr, + buf: Bytes, num_values: u32, encoding: Encoding, _is_sorted: bool, @@ -345,7 +345,7 @@ impl ColumnValueDecoder for ValueDecoder { fn set_data( &mut self, encoding: Encoding, - data: ByteBufferPtr, + data: Bytes, num_levels: usize, num_values: Option, ) -> Result<()> { @@ -434,7 +434,7 @@ impl ColumnValueDecoder for ValueDecoder { } enum Decoder { - Plain { buf: ByteBufferPtr, offset: usize }, + Plain { buf: Bytes, offset: usize }, Dict { decoder: DictIndexDecoder }, Delta { decoder: DeltaByteArrayDecoder }, } diff --git a/parquet/src/arrow/array_reader/test_util.rs b/parquet/src/arrow/array_reader/test_util.rs index 7e66efead2e5..05032920139b 100644 --- a/parquet/src/arrow/array_reader/test_util.rs +++ b/parquet/src/arrow/array_reader/test_util.rs @@ -17,6 +17,7 @@ use arrow_array::{Array, ArrayRef}; use arrow_schema::DataType as ArrowType; +use bytes::Bytes; use std::any::Any; use std::sync::Arc; @@ -27,7 +28,6 @@ use crate::data_type::{ByteArray, ByteArrayType}; use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder}; use crate::errors::Result; use crate::schema::types::{ColumnDescPtr, ColumnDescriptor, ColumnPath, Type}; -use crate::util::memory::ByteBufferPtr; /// Returns a descriptor for a UTF-8 column pub fn utf8_column() -> ColumnDescPtr { @@ -45,7 +45,7 @@ pub fn utf8_column() -> ColumnDescPtr { } /// Encode `data` with the provided `encoding` -pub fn encode_byte_array(encoding: Encoding, data: &[ByteArray]) -> ByteBufferPtr { +pub fn encode_byte_array(encoding: Encoding, data: &[ByteArray]) -> Bytes { let mut encoder = get_encoder::(encoding).unwrap(); encoder.put(data).unwrap(); @@ -53,7 +53,7 @@ pub fn encode_byte_array(encoding: Encoding, data: 
&[ByteArray]) -> ByteBufferPt } /// Returns the encoded dictionary and value data -pub fn encode_dictionary(data: &[ByteArray]) -> (ByteBufferPtr, ByteBufferPtr) { +pub fn encode_dictionary(data: &[ByteArray]) -> (Bytes, Bytes) { let mut dict_encoder = DictEncoder::::new(utf8_column()); dict_encoder.put(data).unwrap(); @@ -68,7 +68,7 @@ pub fn encode_dictionary(data: &[ByteArray]) -> (ByteBufferPtr, ByteBufferPtr) { /// Returns an array of data with its associated encoding, along with an encoded dictionary pub fn byte_array_all_encodings( data: Vec>, -) -> (Vec<(Encoding, ByteBufferPtr)>, ByteBufferPtr) { +) -> (Vec<(Encoding, Bytes)>, Bytes) { let data: Vec<_> = data.into_iter().map(Into::into).collect(); let (encoded_dictionary, encoded_rle) = encode_dictionary(&data); diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index 3db2e4a6a063..28c7c3b00540 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -236,7 +236,7 @@ impl FallbackEncoder { let lengths = lengths.flush_buffer()?; let mut out = Vec::with_capacity(lengths.len() + buffer.len()); - out.extend_from_slice(lengths.data()); + out.extend_from_slice(&lengths); out.extend_from_slice(buffer); buffer.clear(); (out, Encoding::DELTA_LENGTH_BYTE_ARRAY) @@ -252,8 +252,8 @@ impl FallbackEncoder { let mut out = Vec::with_capacity(prefix_lengths.len() + suffix_lengths.len() + buffer.len()); - out.extend_from_slice(prefix_lengths.data()); - out.extend_from_slice(suffix_lengths.data()); + out.extend_from_slice(&prefix_lengths); + out.extend_from_slice(&suffix_lengths); out.extend_from_slice(buffer); buffer.clear(); last_value.clear(); diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index a9cd1afb2479..eca1dea791be 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -331,7 +331,7 @@ impl PageWriter for ArrowPageWriter { buf.length += compressed_size; buf.data.push(header); - buf.data.push(data.into()); + buf.data.push(data); Ok(spec) } diff --git a/parquet/src/arrow/decoder/delta_byte_array.rs b/parquet/src/arrow/decoder/delta_byte_array.rs index c731cfea97e9..7686a4292c43 100644 --- a/parquet/src/arrow/decoder/delta_byte_array.rs +++ b/parquet/src/arrow/decoder/delta_byte_array.rs @@ -15,16 +15,17 @@ // specific language governing permissions and limitations // under the License. 
+use bytes::Bytes; + use crate::data_type::Int32Type; use crate::encodings::decoding::{Decoder, DeltaBitPackDecoder}; use crate::errors::{ParquetError, Result}; -use crate::util::memory::ByteBufferPtr; /// Decoder for `Encoding::DELTA_BYTE_ARRAY` pub struct DeltaByteArrayDecoder { prefix_lengths: Vec, suffix_lengths: Vec, - data: ByteBufferPtr, + data: Bytes, length_offset: usize, data_offset: usize, last_value: Vec, @@ -32,16 +33,16 @@ pub struct DeltaByteArrayDecoder { impl DeltaByteArrayDecoder { /// Create a new [`DeltaByteArrayDecoder`] with the provided data page - pub fn new(data: ByteBufferPtr) -> Result { + pub fn new(data: Bytes) -> Result { let mut prefix = DeltaBitPackDecoder::::new(); - prefix.set_data(data.all(), 0)?; + prefix.set_data(data.clone(), 0)?; let num_prefix = prefix.values_left(); let mut prefix_lengths = vec![0; num_prefix]; assert_eq!(prefix.get(&mut prefix_lengths)?, num_prefix); let mut suffix = DeltaBitPackDecoder::::new(); - suffix.set_data(data.start_from(prefix.get_offset()), 0)?; + suffix.set_data(data.slice(prefix.get_offset()..), 0)?; let num_suffix = suffix.values_left(); let mut suffix_lengths = vec![0; num_suffix]; diff --git a/parquet/src/arrow/decoder/dictionary_index.rs b/parquet/src/arrow/decoder/dictionary_index.rs index 32efd564dffb..38f2b058360c 100644 --- a/parquet/src/arrow/decoder/dictionary_index.rs +++ b/parquet/src/arrow/decoder/dictionary_index.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. +use bytes::Bytes; + use crate::encodings::rle::RleDecoder; use crate::errors::Result; -use crate::util::memory::ByteBufferPtr; /// Decoder for `Encoding::RLE_DICTIONARY` indices pub struct DictIndexDecoder { @@ -41,10 +42,10 @@ pub struct DictIndexDecoder { impl DictIndexDecoder { /// Create a new [`DictIndexDecoder`] with the provided data page, the number of levels /// associated with this data page, and the number of non-null values (if known) - pub fn new(data: ByteBufferPtr, num_levels: usize, num_values: Option) -> Self { + pub fn new(data: Bytes, num_levels: usize, num_values: Option) -> Self { let bit_width = data[0]; let mut decoder = RleDecoder::new(bit_width); - decoder.set_data(data.start_from(1)); + decoder.set_data(data.slice(1..)); Self { decoder, diff --git a/parquet/src/arrow/record_reader/definition_levels.rs b/parquet/src/arrow/record_reader/definition_levels.rs index 20cda536ae1c..9009c596c4bf 100644 --- a/parquet/src/arrow/record_reader/definition_levels.rs +++ b/parquet/src/arrow/record_reader/definition_levels.rs @@ -20,6 +20,7 @@ use std::ops::Range; use arrow_array::builder::BooleanBufferBuilder; use arrow_buffer::bit_chunk_iterator::UnalignedBitChunk; use arrow_buffer::Buffer; +use bytes::Bytes; use crate::arrow::buffer::bit_util::count_set_bits; use crate::basic::Encoding; @@ -28,7 +29,6 @@ use crate::column::reader::decoder::{ }; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; -use crate::util::memory::ByteBufferPtr; use super::buffer::ScalarBuffer; @@ -152,7 +152,7 @@ impl DefinitionLevelBufferDecoder { impl ColumnLevelDecoder for DefinitionLevelBufferDecoder { type Slice = DefinitionLevelBuffer; - fn set_data(&mut self, encoding: Encoding, data: ByteBufferPtr) { + fn set_data(&mut self, encoding: Encoding, data: Bytes) { match &mut self.decoder { MaybePacked::Packed(d) => d.set_data(encoding, data), MaybePacked::Fallback(d) => d.set_data(encoding, data), @@ -219,7 +219,7 @@ impl DefinitionLevelDecoder for 
DefinitionLevelBufferDecoder { /// [RLE]: https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3 /// [BIT_PACKED]: https://github.com/apache/parquet-format/blob/master/Encodings.md#bit-packed-deprecated-bit_packed--4 struct PackedDecoder { - data: ByteBufferPtr, + data: Bytes, data_offset: usize, rle_left: usize, rle_value: bool, @@ -278,7 +278,7 @@ impl PackedDecoder { impl PackedDecoder { fn new() -> Self { Self { - data: ByteBufferPtr::new(vec![]), + data: Bytes::from(vec![]), data_offset: 0, rle_left: 0, rle_value: false, @@ -287,7 +287,7 @@ impl PackedDecoder { } } - fn set_data(&mut self, encoding: Encoding, data: ByteBufferPtr) { + fn set_data(&mut self, encoding: Encoding, data: Bytes) { self.rle_left = 0; self.rle_value = false; self.packed_offset = 0; @@ -385,7 +385,7 @@ mod tests { let encoded = encoder.consume(); let mut decoder = PackedDecoder::new(); - decoder.set_data(Encoding::RLE, ByteBufferPtr::new(encoded)); + decoder.set_data(Encoding::RLE, encoded.into()); // Decode data in random length intervals let mut decoded = BooleanBufferBuilder::new(len); @@ -424,7 +424,7 @@ mod tests { let encoded = encoder.consume(); let mut decoder = PackedDecoder::new(); - decoder.set_data(Encoding::RLE, ByteBufferPtr::new(encoded)); + decoder.set_data(Encoding::RLE, encoded.into()); let mut skip_value = 0; let mut read_value = 0; diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index 933e42386272..947a633f48a2 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -17,11 +17,12 @@ //! Contains Parquet Page definitions and page reader interface. +use bytes::Bytes; + use crate::basic::{Encoding, PageType}; use crate::errors::{ParquetError, Result}; use crate::file::{metadata::ColumnChunkMetaData, statistics::Statistics}; use crate::format::PageHeader; -use crate::util::memory::ByteBufferPtr; /// Parquet Page definition. /// @@ -31,7 +32,7 @@ use crate::util::memory::ByteBufferPtr; #[derive(Clone)] pub enum Page { DataPage { - buf: ByteBufferPtr, + buf: Bytes, num_values: u32, encoding: Encoding, def_level_encoding: Encoding, @@ -39,7 +40,7 @@ pub enum Page { statistics: Option, }, DataPageV2 { - buf: ByteBufferPtr, + buf: Bytes, num_values: u32, encoding: Encoding, num_nulls: u32, @@ -50,7 +51,7 @@ pub enum Page { statistics: Option, }, DictionaryPage { - buf: ByteBufferPtr, + buf: Bytes, num_values: u32, encoding: Encoding, is_sorted: bool, @@ -68,7 +69,7 @@ impl Page { } /// Returns internal byte buffer reference for this page. - pub fn buffer(&self) -> &ByteBufferPtr { + pub fn buffer(&self) -> &Bytes { match self { Page::DataPage { ref buf, .. } => buf, Page::DataPageV2 { ref buf, .. } => buf, @@ -159,7 +160,7 @@ impl CompressedPage { /// Returns slice of compressed buffer in the page. 
pub fn data(&self) -> &[u8] { - self.compressed_page.buffer().data() + self.compressed_page.buffer() } /// Returns the thrift page header @@ -370,7 +371,7 @@ mod tests { #[test] fn test_page() { let data_page = Page::DataPage { - buf: ByteBufferPtr::new(vec![0, 1, 2]), + buf: Bytes::from(vec![0, 1, 2]), num_values: 10, encoding: Encoding::PLAIN, def_level_encoding: Encoding::RLE, @@ -378,7 +379,7 @@ mod tests { statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), }; assert_eq!(data_page.page_type(), PageType::DATA_PAGE); - assert_eq!(data_page.buffer().data(), vec![0, 1, 2].as_slice()); + assert_eq!(data_page.buffer(), vec![0, 1, 2].as_slice()); assert_eq!(data_page.num_values(), 10); assert_eq!(data_page.encoding(), Encoding::PLAIN); assert_eq!( @@ -387,7 +388,7 @@ mod tests { ); let data_page_v2 = Page::DataPageV2 { - buf: ByteBufferPtr::new(vec![0, 1, 2]), + buf: Bytes::from(vec![0, 1, 2]), num_values: 10, encoding: Encoding::PLAIN, num_nulls: 5, @@ -398,7 +399,7 @@ mod tests { statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)), }; assert_eq!(data_page_v2.page_type(), PageType::DATA_PAGE_V2); - assert_eq!(data_page_v2.buffer().data(), vec![0, 1, 2].as_slice()); + assert_eq!(data_page_v2.buffer(), vec![0, 1, 2].as_slice()); assert_eq!(data_page_v2.num_values(), 10); assert_eq!(data_page_v2.encoding(), Encoding::PLAIN); assert_eq!( @@ -407,13 +408,13 @@ mod tests { ); let dict_page = Page::DictionaryPage { - buf: ByteBufferPtr::new(vec![0, 1, 2]), + buf: Bytes::from(vec![0, 1, 2]), num_values: 10, encoding: Encoding::PLAIN, is_sorted: false, }; assert_eq!(dict_page.page_type(), PageType::DICTIONARY_PAGE); - assert_eq!(dict_page.buffer().data(), vec![0, 1, 2].as_slice()); + assert_eq!(dict_page.buffer(), vec![0, 1, 2].as_slice()); assert_eq!(dict_page.num_values(), 10); assert_eq!(dict_page.encoding(), Encoding::PLAIN); assert_eq!(dict_page.statistics(), None); @@ -422,7 +423,7 @@ mod tests { #[test] fn test_compressed_page() { let data_page = Page::DataPage { - buf: ByteBufferPtr::new(vec![0, 1, 2]), + buf: Bytes::from(vec![0, 1, 2]), num_values: 10, encoding: Encoding::PLAIN, def_level_encoding: Encoding::RLE, diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 854e5d994ee8..adfcd6390720 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -17,6 +17,8 @@ //! Contains column reader API. 
+use bytes::Bytes; + use super::page::{Page, PageReader}; use crate::basic::*; use crate::column::reader::decoder::{ @@ -27,7 +29,6 @@ use crate::data_type::*; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::{ceil, num_required_bits, read_num_bytes}; -use crate::util::memory::ByteBufferPtr; pub(crate) mod decoder; @@ -474,7 +475,7 @@ where max_rep_level, num_values, rep_level_encoding, - buf.start_from(offset), + buf.slice(offset..), )?; offset += bytes_read; @@ -492,7 +493,7 @@ where max_def_level, num_values, def_level_encoding, - buf.start_from(offset), + buf.slice(offset..), )?; offset += bytes_read; @@ -504,7 +505,7 @@ where self.values_decoder.set_data( encoding, - buf.start_from(offset), + buf.slice(offset..), num_values as usize, None, )?; @@ -540,7 +541,7 @@ where self.rep_level_decoder.as_mut().unwrap().set_data( Encoding::RLE, - buf.range(0, rep_levels_byte_len as usize), + buf.slice(..rep_levels_byte_len as usize), ); } @@ -549,18 +550,16 @@ where if self.descr.max_def_level() > 0 { self.def_level_decoder.as_mut().unwrap().set_data( Encoding::RLE, - buf.range( - rep_levels_byte_len as usize, - def_levels_byte_len as usize, + buf.slice( + rep_levels_byte_len as usize + ..(rep_levels_byte_len + def_levels_byte_len) as usize, ), ); } self.values_decoder.set_data( encoding, - buf.start_from( - (rep_levels_byte_len + def_levels_byte_len) as usize, - ), + buf.slice((rep_levels_byte_len + def_levels_byte_len) as usize..), num_values as usize, Some((num_values - num_nulls) as usize), )?; @@ -595,13 +594,16 @@ fn parse_v1_level( max_level: i16, num_buffered_values: u32, encoding: Encoding, - buf: ByteBufferPtr, -) -> Result<(usize, ByteBufferPtr)> { + buf: Bytes, +) -> Result<(usize, Bytes)> { match encoding { Encoding::RLE => { let i32_size = std::mem::size_of::(); let data_size = read_num_bytes::(i32_size, buf.as_ref()) as usize; - Ok((i32_size + data_size, buf.range(i32_size, data_size))) + Ok(( + i32_size + data_size, + buf.slice(i32_size..i32_size + data_size), + )) } Encoding::BIT_PACKED => { let bit_width = num_required_bits(max_level as u64); @@ -609,7 +611,7 @@ fn parse_v1_level( (num_buffered_values as usize * bit_width as usize) as i64, 8, ) as usize; - Ok((num_bytes, buf.range(0, num_bytes))) + Ok((num_bytes, buf.slice(..num_bytes))) } _ => Err(general_err!("invalid level encoding: {}", encoding)), } diff --git a/parquet/src/column/reader/decoder.rs b/parquet/src/column/reader/decoder.rs index ec57c4032574..ef62724689a8 100644 --- a/parquet/src/column/reader/decoder.rs +++ b/parquet/src/column/reader/decoder.rs @@ -18,6 +18,8 @@ use std::collections::HashMap; use std::ops::Range; +use bytes::Bytes; + use crate::basic::Encoding; use crate::data_type::DataType; use crate::encodings::{ @@ -26,10 +28,7 @@ use crate::encodings::{ }; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; -use crate::util::{ - bit_util::{num_required_bits, BitReader}, - memory::ByteBufferPtr, -}; +use crate::util::bit_util::{num_required_bits, BitReader}; /// A slice of levels buffer data that is written to by a [`ColumnLevelDecoder`] pub trait LevelsBufferSlice { @@ -67,7 +66,7 @@ pub trait ColumnLevelDecoder { type Slice: LevelsBufferSlice + ?Sized; /// Set data for this [`ColumnLevelDecoder`] - fn set_data(&mut self, encoding: Encoding, data: ByteBufferPtr); + fn set_data(&mut self, encoding: Encoding, data: Bytes); } pub trait RepetitionLevelDecoder: ColumnLevelDecoder { @@ -132,7 +131,7 @@ pub trait 
ColumnValueDecoder { /// Set the current dictionary page fn set_dict( &mut self, - buf: ByteBufferPtr, + buf: Bytes, num_values: u32, encoding: Encoding, is_sorted: bool, @@ -152,7 +151,7 @@ pub trait ColumnValueDecoder { fn set_data( &mut self, encoding: Encoding, - data: ByteBufferPtr, + data: Bytes, num_levels: usize, num_values: Option, ) -> Result<()>; @@ -197,7 +196,7 @@ impl ColumnValueDecoder for ColumnValueDecoderImpl { fn set_dict( &mut self, - buf: ByteBufferPtr, + buf: Bytes, num_values: u32, mut encoding: Encoding, _is_sorted: bool, @@ -229,7 +228,7 @@ impl ColumnValueDecoder for ColumnValueDecoderImpl { fn set_data( &mut self, mut encoding: Encoding, - data: ByteBufferPtr, + data: Bytes, num_levels: usize, num_values: Option, ) -> Result<()> { @@ -294,7 +293,7 @@ enum LevelDecoder { } impl LevelDecoder { - fn new(encoding: Encoding, data: ByteBufferPtr, bit_width: u8) -> Self { + fn new(encoding: Encoding, data: Bytes, bit_width: u8) -> Self { match encoding { Encoding::RLE => { let mut decoder = RleDecoder::new(bit_width); @@ -335,7 +334,7 @@ impl DefinitionLevelDecoderImpl { impl ColumnLevelDecoder for DefinitionLevelDecoderImpl { type Slice = [i16]; - fn set_data(&mut self, encoding: Encoding, data: ByteBufferPtr) { + fn set_data(&mut self, encoding: Encoding, data: Bytes) { self.decoder = Some(LevelDecoder::new(encoding, data, self.bit_width)) } } @@ -426,7 +425,7 @@ impl RepetitionLevelDecoderImpl { impl ColumnLevelDecoder for RepetitionLevelDecoderImpl { type Slice = [i16]; - fn set_data(&mut self, encoding: Encoding, data: ByteBufferPtr) { + fn set_data(&mut self, encoding: Encoding, data: Bytes) { self.decoder = Some(LevelDecoder::new(encoding, data, self.bit_width)); self.buffer_len = 0; self.buffer_offset = 0; @@ -511,7 +510,7 @@ mod tests { let mut encoder = RleEncoder::new(1, 1024); encoder.put(0); (0..3).for_each(|_| encoder.put(1)); - let data = ByteBufferPtr::new(encoder.consume()); + let data = Bytes::from(encoder.consume()); let mut decoder = RepetitionLevelDecoderImpl::new(1); decoder.set_data(Encoding::RLE, data.clone()); @@ -537,7 +536,7 @@ mod tests { for v in &encoded { encoder.put(*v as _) } - let data = ByteBufferPtr::new(encoder.consume()); + let data = Bytes::from(encoder.consume()); let mut decoder = RepetitionLevelDecoderImpl::new(5); decoder.set_data(Encoding::RLE, data); diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index 5fd0f9e194d2..2273ae777444 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
+use bytes::Bytes; + use crate::basic::{Encoding, Type}; use crate::bloom_filter::Sbbf; use crate::column::writer::{ @@ -26,7 +28,6 @@ use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder}; use crate::errors::{ParquetError, Result}; use crate::file::properties::{EnabledStatistics, WriterProperties}; use crate::schema::types::{ColumnDescPtr, ColumnDescriptor}; -use crate::util::memory::ByteBufferPtr; /// A collection of [`ParquetValueType`] encoded by a [`ColumnValueEncoder`] pub trait ColumnValues { @@ -49,14 +50,14 @@ impl ColumnValues for [T] { /// The encoded data for a dictionary page pub struct DictionaryPage { - pub buf: ByteBufferPtr, + pub buf: Bytes, pub num_values: usize, pub is_sorted: bool, } /// The encoded values for a data page, with optional statistics pub struct DataPageValues { - pub buf: ByteBufferPtr, + pub buf: Bytes, pub num_values: usize, pub encoding: Encoding, pub min_value: Option, diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 307804e7dc5c..60db90c5d46d 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -17,6 +17,8 @@ //! Contains column writer API. +use bytes::Bytes; + use crate::bloom_filter::Sbbf; use crate::format::{ColumnIndex, OffsetIndex}; use std::collections::{BTreeSet, VecDeque}; @@ -38,7 +40,6 @@ use crate::file::{ properties::{WriterProperties, WriterPropertiesPtr, WriterVersion}, }; use crate::schema::types::{ColumnDescPtr, ColumnDescriptor}; -use crate::util::memory::ByteBufferPtr; pub(crate) mod encoder; @@ -731,7 +732,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { ); } - buffer.extend_from_slice(values_data.buf.data()); + buffer.extend_from_slice(&values_data.buf); let uncompressed_size = buffer.len(); if let Some(ref mut cmpr) = self.compressor { @@ -741,7 +742,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } let data_page = Page::DataPage { - buf: ByteBufferPtr::new(buffer), + buf: buffer.into(), num_values: self.page_metrics.num_buffered_values, encoding: values_data.encoding, def_level_encoding: Encoding::RLE, @@ -774,13 +775,13 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // Data Page v2 compresses values only. match self.compressor { Some(ref mut cmpr) => { - cmpr.compress(values_data.buf.data(), &mut buffer)?; + cmpr.compress(&values_data.buf, &mut buffer)?; } - None => buffer.extend_from_slice(values_data.buf.data()), + None => buffer.extend_from_slice(&values_data.buf), } let data_page = Page::DataPageV2 { - buf: ByteBufferPtr::new(buffer), + buf: buffer.into(), num_values: self.page_metrics.num_buffered_values, encoding: values_data.encoding, num_nulls: self.page_metrics.num_page_nulls as u32, @@ -920,8 +921,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { if let Some(ref mut cmpr) = self.compressor { let mut output_buf = Vec::with_capacity(uncompressed_size); - cmpr.compress(page.buf.data(), &mut output_buf)?; - page.buf = ByteBufferPtr::new(output_buf); + cmpr.compress(&page.buf, &mut output_buf)?; + page.buf = Bytes::from(output_buf); } let dict_page = Page::DictionaryPage { @@ -2350,10 +2351,10 @@ mod tests { let mut data = vec![FixedLenByteArray::default(); 3]; // This is the expected min value - "aaa..." - data[0].set_data(ByteBufferPtr::new(vec![97_u8; 200])); + data[0].set_data(Bytes::from(vec![97_u8; 200])); // This is the expected max value - "ZZZ..." 
- data[1].set_data(ByteBufferPtr::new(vec![112_u8; 200])); - data[2].set_data(ByteBufferPtr::new(vec![98_u8; 200])); + data[1].set_data(Bytes::from(vec![112_u8; 200])); + data[2].set_data(Bytes::from(vec![98_u8; 200])); writer.write_batch(&data, None, None).unwrap(); @@ -2420,9 +2421,7 @@ mod tests { let mut data = vec![FixedLenByteArray::default(); 1]; // This is the expected min value - data[0].set_data(ByteBufferPtr::new( - String::from("Blart Versenwald III").into_bytes(), - )); + data[0].set_data(Bytes::from(String::from("Blart Versenwald III"))); writer.write_batch(&data, None, None).unwrap(); @@ -2493,9 +2492,9 @@ mod tests { // Also show that BinaryArray level comparison works here let mut greater = ByteArray::new(); - greater.set_data(ByteBufferPtr::new(v)); + greater.set_data(Bytes::from(v)); let mut original = ByteArray::new(); - original.set_data(ByteBufferPtr::new("hello".as_bytes().to_vec())); + original.set_data(Bytes::from("hello".as_bytes().to_vec())); assert!(greater > original); // UTF8 string diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index eaf4389d4350..b895c2507018 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -28,7 +28,7 @@ use crate::basic::Type; use crate::column::reader::{ColumnReader, ColumnReaderImpl}; use crate::column::writer::{ColumnWriter, ColumnWriterImpl}; use crate::errors::{ParquetError, Result}; -use crate::util::{bit_util::FromBytes, memory::ByteBufferPtr}; +use crate::util::bit_util::FromBytes; /// Rust representation for logical type INT96, value is backed by an array of `u32`. /// The type only takes 12 bytes, without extra padding. @@ -103,7 +103,7 @@ impl fmt::Display for Int96 { /// Value is backed by a byte buffer. #[derive(Clone, Default)] pub struct ByteArray { - data: Option, + data: Option, } // Special case Debug that prints out byte arrays that are valid utf8 as &str's @@ -130,7 +130,7 @@ impl PartialOrd for ByteArray { (Some(_), None) => Some(Ordering::Greater), (Some(self_data), Some(other_data)) => { // compare slices directly - self_data.data().partial_cmp(other_data.data()) + self_data.partial_cmp(&other_data) } } } @@ -167,7 +167,7 @@ impl ByteArray { /// Set data from another byte buffer. 
#[inline] - pub fn set_data(&mut self, data: ByteBufferPtr) { + pub fn set_data(&mut self, data: Bytes) { self.data = Some(data); } @@ -178,7 +178,7 @@ impl ByteArray { self.data .as_ref() .expect("set_data should have been called") - .range(start, len), + .slice(start..start + len), ) } @@ -194,7 +194,7 @@ impl ByteArray { impl From> for ByteArray { fn from(buf: Vec) -> ByteArray { Self { - data: Some(ByteBufferPtr::new(buf)), + data: Some(buf.into()), } } } @@ -204,7 +204,7 @@ impl<'a> From<&'a [u8]> for ByteArray { let mut v = Vec::new(); v.extend_from_slice(b); Self { - data: Some(ByteBufferPtr::new(v)), + data: Some(v.into()), } } } @@ -214,20 +214,14 @@ impl<'a> From<&'a str> for ByteArray { let mut v = Vec::new(); v.extend_from_slice(s.as_bytes()); Self { - data: Some(ByteBufferPtr::new(v)), + data: Some(v.into()), } } } -impl From for ByteArray { - fn from(ptr: ByteBufferPtr) -> ByteArray { - Self { data: Some(ptr) } - } -} - impl From for ByteArray { fn from(value: Bytes) -> Self { - ByteBufferPtr::from(value).into() + Self { data: Some(value) } } } @@ -580,9 +574,10 @@ impl AsBytes for str { } pub(crate) mod private { + use bytes::Bytes; + use crate::encodings::decoding::PlainDecoderDetails; use crate::util::bit_util::{read_num_bytes, BitReader, BitWriter}; - use crate::util::memory::ByteBufferPtr; use crate::basic::Type; use std::convert::TryInto; @@ -618,7 +613,7 @@ pub(crate) mod private { ) -> Result<()>; /// Establish the data that will be decoded in a buffer - fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, num_values: usize); + fn set_data(decoder: &mut PlainDecoderDetails, data: Bytes, num_values: usize); /// Decode the value from a given buffer for a higher level decoder fn decode(buffer: &mut [Self], decoder: &mut PlainDecoderDetails) -> Result; @@ -671,7 +666,7 @@ pub(crate) mod private { } #[inline] - fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, num_values: usize) { + fn set_data(decoder: &mut PlainDecoderDetails, data: Bytes, num_values: usize) { decoder.bit_reader.replace(BitReader::new(data)); decoder.num_values = num_values; } @@ -728,7 +723,7 @@ pub(crate) mod private { } #[inline] - fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, num_values: usize) { + fn set_data(decoder: &mut PlainDecoderDetails, data: Bytes, num_values: usize) { decoder.data.replace(data); decoder.start = 0; decoder.num_values = num_values; @@ -748,7 +743,9 @@ pub(crate) mod private { // SAFETY: Raw types should be as per the standard rust bit-vectors unsafe { let raw_buffer = &mut Self::slice_as_bytes_mut(buffer)[..bytes_to_decode]; - raw_buffer.copy_from_slice(data.range(decoder.start, bytes_to_decode).as_ref()); + raw_buffer.copy_from_slice(data.slice( + decoder.start..decoder.start + bytes_to_decode + ).as_ref()); }; decoder.start += bytes_to_decode; decoder.num_values -= num_values; @@ -815,7 +812,7 @@ pub(crate) mod private { } #[inline] - fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, num_values: usize) { + fn set_data(decoder: &mut PlainDecoderDetails, data: Bytes, num_values: usize) { decoder.data.replace(data); decoder.start = 0; decoder.num_values = num_values; @@ -836,8 +833,8 @@ pub(crate) mod private { return Err(eof_err!("Not enough bytes to decode")); } - let data_range = data.range(decoder.start, bytes_to_decode); - let bytes: &[u8] = data_range.data(); + let data_range = data.slice(decoder.start..decoder.start + bytes_to_decode); + let bytes: &[u8] = &data_range; decoder.start += 
bytes_to_decode; let mut pos = 0; // position in byte array @@ -902,7 +899,7 @@ pub(crate) mod private { } #[inline] - fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, num_values: usize) { + fn set_data(decoder: &mut PlainDecoderDetails, data: Bytes, num_values: usize) { decoder.data.replace(data); decoder.start = 0; decoder.num_values = num_values; @@ -917,7 +914,7 @@ pub(crate) mod private { let num_values = std::cmp::min(buffer.len(), decoder.num_values); for val_array in buffer.iter_mut().take(num_values) { let len: usize = - read_num_bytes::(4, data.start_from(decoder.start).as_ref()) as usize; + read_num_bytes::(4, data.slice(decoder.start..).as_ref()) as usize; decoder.start += std::mem::size_of::(); if data.len() < decoder.start + len { @@ -926,7 +923,7 @@ pub(crate) mod private { let val: &mut Self = val_array.as_mut_any().downcast_mut().unwrap(); - val.set_data(data.range(decoder.start, len)); + val.set_data(data.slice(decoder.start..decoder.start + len)); decoder.start += len; } decoder.num_values -= num_values; @@ -943,7 +940,7 @@ pub(crate) mod private { for _ in 0..num_values { let len: usize = - read_num_bytes::(4, data.start_from(decoder.start).as_ref()) as usize; + read_num_bytes::(4, data.slice(decoder.start..).as_ref()) as usize; decoder.start += std::mem::size_of::() + len; } decoder.num_values -= num_values; @@ -984,7 +981,7 @@ pub(crate) mod private { } #[inline] - fn set_data(decoder: &mut PlainDecoderDetails, data: ByteBufferPtr, num_values: usize) { + fn set_data(decoder: &mut PlainDecoderDetails, data: Bytes, num_values: usize) { decoder.data.replace(data); decoder.start = 0; decoder.num_values = num_values; @@ -1007,7 +1004,7 @@ pub(crate) mod private { return Err(eof_err!("Not enough bytes to decode")); } - item.set_data(data.range(decoder.start, len)); + item.set_data(data.slice(decoder.start..decoder.start + len)); decoder.start += len; } decoder.num_values -= num_values; @@ -1241,7 +1238,7 @@ mod tests { ); assert_eq!(ByteArray::from("ABC").data(), &[b'A', b'B', b'C']); assert_eq!( - ByteArray::from(ByteBufferPtr::new(vec![1u8, 2u8, 3u8, 4u8, 5u8])).data(), + ByteArray::from(Bytes::from(vec![1u8, 2u8, 3u8, 4u8, 5u8])).data(), &[1u8, 2u8, 3u8, 4u8, 5u8] ); let buf = vec![6u8, 7u8, 8u8, 9u8, 10u8]; diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 7aed6df419ee..5843acdb6d0f 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -17,6 +17,7 @@ //! Contains all supported decoders for Parquet. +use bytes::Bytes; use num::traits::WrappingAdd; use num::FromPrimitive; use std::{cmp, marker::PhantomData, mem}; @@ -28,10 +29,7 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::*; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; -use crate::util::{ - bit_util::{self, BitReader}, - memory::ByteBufferPtr, -}; +use crate::util::bit_util::{self, BitReader}; pub(crate) mod private { use super::*; @@ -145,7 +143,7 @@ pub(crate) mod private { pub trait Decoder: Send { /// Sets the data to decode to be `data`, which should contain `num_values` of values /// to decode. - fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()>; + fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()>; /// Consumes values from this decoder and write the results to `buffer`. This will try /// to fill up `buffer`. 
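
The Decoder changes in this hunk are representative of the whole migration: every operation the removed ByteBufferPtr offered has a direct bytes::Bytes counterpart, which is why the substitutions are largely mechanical. A minimal sketch of the correspondence (illustrative only, not part of the diff; assumes only the bytes crate, and the function name is made up):

    use bytes::Bytes;

    fn byte_buffer_ptr_to_bytes_mapping() {
        let buf = Bytes::from(vec![0u8, 1, 2, 3, 4, 5, 6, 7]);

        // ByteBufferPtr::new(vec)  ->  Bytes::from(vec) or vec.into()
        // ptr.data()               ->  &buf, since Bytes dereferences to [u8]
        let as_slice: &[u8] = &buf;
        assert_eq!(as_slice.len(), 8);

        // ptr.all()                ->  buf.clone(), a cheap reference-counted copy
        let copy = buf.clone();
        assert_eq!(copy, buf);

        // ptr.start_from(start)    ->  buf.slice(start..)
        let tail = buf.slice(3..);
        assert_eq!(tail.as_ref(), &[3, 4, 5, 6, 7][..]);

        // ptr.range(start, len)    ->  buf.slice(start..start + len)
        let window = buf.slice(2..2 + 3);
        assert_eq!(window.as_ref(), &[2, 3, 4][..]);
    }
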
@@ -238,7 +236,7 @@ pub struct PlainDecoderDetails { pub(crate) type_length: i32, // The byte array to decode from. Not set if `T` is bool. - pub(crate) data: Option, + pub(crate) data: Option, // Read `data` bit by bit. Only set if `T` is bool. pub(crate) bit_reader: Option, @@ -275,7 +273,7 @@ impl PlainDecoder { impl Decoder for PlainDecoder { #[inline] - fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { T::T::set_data(&mut self.inner, data, num_values); Ok(()) } @@ -350,11 +348,11 @@ impl DictDecoder { } impl Decoder for DictDecoder { - fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { // First byte in `data` is bit width let bit_width = data.as_ref()[0]; let mut rle_decoder = RleDecoder::new(bit_width); - rle_decoder.set_data(data.start_from(1)); + rle_decoder.set_data(data.slice(1..)); self.num_values = num_values; self.rle_decoder = Some(rle_decoder); Ok(()) @@ -418,7 +416,7 @@ impl RleValueDecoder { impl Decoder for RleValueDecoder { #[inline] - fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { // Only support RLE value reader for boolean values with bit width of 1. ensure_phys_ty!(Type::BOOLEAN, "RleValueDecoder only supports BoolType"); @@ -426,7 +424,8 @@ impl Decoder for RleValueDecoder { const I32_SIZE: usize = mem::size_of::(); let data_size = bit_util::read_num_bytes::(I32_SIZE, data.as_ref()) as usize; self.decoder = RleDecoder::new(1); - self.decoder.set_data(data.range(I32_SIZE, data_size)); + self.decoder + .set_data(data.slice(I32_SIZE..I32_SIZE + data_size)); self.values_left = num_values; Ok(()) } @@ -604,7 +603,7 @@ where { // # of total values is derived from encoding #[inline] - fn set_data(&mut self, data: ByteBufferPtr, _index: usize) -> Result<()> { + fn set_data(&mut self, data: Bytes, _index: usize) -> Result<()> { self.bit_reader = BitReader::new(data); self.initialized = true; @@ -811,7 +810,7 @@ pub struct DeltaLengthByteArrayDecoder { current_idx: usize, // Concatenated byte array data - data: Option, + data: Option, // Offset into `data`, always point to the beginning of next byte array. 
offset: usize, @@ -844,16 +843,16 @@ impl DeltaLengthByteArrayDecoder { } impl Decoder for DeltaLengthByteArrayDecoder { - fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { match T::get_physical_type() { Type::BYTE_ARRAY => { let mut len_decoder = DeltaBitPackDecoder::::new(); - len_decoder.set_data(data.all(), num_values)?; + len_decoder.set_data(data.clone(), num_values)?; let num_lengths = len_decoder.values_left(); self.lengths.resize(num_lengths, 0); len_decoder.get(&mut self.lengths[..])?; - self.data = Some(data.start_from(len_decoder.get_offset())); + self.data = Some(data.slice(len_decoder.get_offset()..)); self.offset = 0; self.current_idx = 0; self.num_values = num_lengths; @@ -879,7 +878,7 @@ impl Decoder for DeltaLengthByteArrayDecoder { item.as_mut_any() .downcast_mut::() .unwrap() - .set_data(data.range(self.offset, len)); + .set_data(data.slice(self.offset..self.offset + len)); self.offset += len; self.current_idx += 1; @@ -977,18 +976,18 @@ impl DeltaByteArrayDecoder { } impl Decoder for DeltaByteArrayDecoder { - fn set_data(&mut self, data: ByteBufferPtr, num_values: usize) -> Result<()> { + fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { match T::get_physical_type() { Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => { let mut prefix_len_decoder = DeltaBitPackDecoder::::new(); - prefix_len_decoder.set_data(data.all(), num_values)?; + prefix_len_decoder.set_data(data.clone(), num_values)?; let num_prefixes = prefix_len_decoder.values_left(); self.prefix_lengths.resize(num_prefixes, 0); prefix_len_decoder.get(&mut self.prefix_lengths[..])?; let mut suffix_decoder = DeltaLengthByteArrayDecoder::new(); suffix_decoder - .set_data(data.start_from(prefix_len_decoder.get_offset()), num_values)?; + .set_data(data.slice(prefix_len_decoder.get_offset()..), num_values)?; self.suffix_decoder = Some(suffix_decoder); self.num_values = num_prefixes; self.current_idx = 0; @@ -1023,7 +1022,7 @@ impl Decoder for DeltaByteArrayDecoder { result.extend_from_slice(&self.previous_value[0..prefix_len]); result.extend_from_slice(suffix); - let data = ByteBufferPtr::new(result.clone()); + let data = Bytes::from(result.clone()); match ty { Type::BYTE_ARRAY => item @@ -1131,33 +1130,21 @@ mod tests { let data = [42, 18, 52]; let data_bytes = Int32Type::to_byte_array(&data[..]); let mut buffer = [0; 3]; - test_plain_decode::( - ByteBufferPtr::new(data_bytes), - 3, - -1, - &mut buffer[..], - &data[..], - ); + test_plain_decode::(Bytes::from(data_bytes), 3, -1, &mut buffer[..], &data[..]); } #[test] fn test_plain_skip_int32() { let data = [42, 18, 52]; let data_bytes = Int32Type::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 1, - -1, - &data[1..], - ); + test_plain_skip::(Bytes::from(data_bytes), 3, 1, -1, &data[1..]); } #[test] fn test_plain_skip_all_int32() { let data = [42, 18, 52]; let data_bytes = Int32Type::to_byte_array(&data[..]); - test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 5, -1, &[]); + test_plain_skip::(Bytes::from(data_bytes), 3, 5, -1, &[]); } #[test] @@ -1169,7 +1156,7 @@ mod tests { let num_nulls = 5; let valid_bits = [0b01001010]; test_plain_decode_spaced::( - ByteBufferPtr::new(data_bytes), + Bytes::from(data_bytes), 3, -1, &mut buffer[..], @@ -1184,33 +1171,21 @@ mod tests { let data = [42, 18, 52]; let data_bytes = Int64Type::to_byte_array(&data[..]); let mut buffer = [0; 3]; - test_plain_decode::( - 
ByteBufferPtr::new(data_bytes), - 3, - -1, - &mut buffer[..], - &data[..], - ); + test_plain_decode::(Bytes::from(data_bytes), 3, -1, &mut buffer[..], &data[..]); } #[test] fn test_plain_skip_int64() { let data = [42, 18, 52]; let data_bytes = Int64Type::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 2, - -1, - &data[2..], - ); + test_plain_skip::(Bytes::from(data_bytes), 3, 2, -1, &data[2..]); } #[test] fn test_plain_skip_all_int64() { let data = [42, 18, 52]; let data_bytes = Int64Type::to_byte_array(&data[..]); - test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 3, -1, &[]); + test_plain_skip::(Bytes::from(data_bytes), 3, 3, -1, &[]); } #[test] @@ -1218,53 +1193,35 @@ mod tests { let data = [PI_f32, 2.414, 12.51]; let data_bytes = FloatType::to_byte_array(&data[..]); let mut buffer = [0.0; 3]; - test_plain_decode::( - ByteBufferPtr::new(data_bytes), - 3, - -1, - &mut buffer[..], - &data[..], - ); + test_plain_decode::(Bytes::from(data_bytes), 3, -1, &mut buffer[..], &data[..]); } #[test] fn test_plain_skip_float() { let data = [PI_f32, 2.414, 12.51]; let data_bytes = FloatType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 1, - -1, - &data[1..], - ); + test_plain_skip::(Bytes::from(data_bytes), 3, 1, -1, &data[1..]); } #[test] fn test_plain_skip_all_float() { let data = [PI_f32, 2.414, 12.51]; let data_bytes = FloatType::to_byte_array(&data[..]); - test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 4, -1, &[]); + test_plain_skip::(Bytes::from(data_bytes), 3, 4, -1, &[]); } #[test] fn test_plain_skip_double() { let data = [PI_f64, 2.414f64, 12.51f64]; let data_bytes = DoubleType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 1, - -1, - &data[1..], - ); + test_plain_skip::(Bytes::from(data_bytes), 3, 1, -1, &data[1..]); } #[test] fn test_plain_skip_all_double() { let data = [PI_f64, 2.414f64, 12.51f64]; let data_bytes = DoubleType::to_byte_array(&data[..]); - test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 5, -1, &[]); + test_plain_skip::(Bytes::from(data_bytes), 3, 5, -1, &[]); } #[test] @@ -1272,13 +1229,7 @@ mod tests { let data = [PI_f64, 2.414f64, 12.51f64]; let data_bytes = DoubleType::to_byte_array(&data[..]); let mut buffer = [0.0f64; 3]; - test_plain_decode::( - ByteBufferPtr::new(data_bytes), - 3, - -1, - &mut buffer[..], - &data[..], - ); + test_plain_decode::(Bytes::from(data_bytes), 3, -1, &mut buffer[..], &data[..]); } #[test] @@ -1290,13 +1241,7 @@ mod tests { data[3].set_data(40, 50, 60); let data_bytes = Int96Type::to_byte_array(&data[..]); let mut buffer = [Int96::new(); 4]; - test_plain_decode::( - ByteBufferPtr::new(data_bytes), - 4, - -1, - &mut buffer[..], - &data[..], - ); + test_plain_decode::(Bytes::from(data_bytes), 4, -1, &mut buffer[..], &data[..]); } #[test] @@ -1307,13 +1252,7 @@ mod tests { data[2].set_data(10, 20, 30); data[3].set_data(40, 50, 60); let data_bytes = Int96Type::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 4, - 2, - -1, - &data[2..], - ); + test_plain_skip::(Bytes::from(data_bytes), 4, 2, -1, &data[2..]); } #[test] @@ -1324,7 +1263,7 @@ mod tests { data[2].set_data(10, 20, 30); data[3].set_data(40, 50, 60); let data_bytes = Int96Type::to_byte_array(&data[..]); - test_plain_skip::(ByteBufferPtr::new(data_bytes), 4, 8, -1, &[]); + test_plain_skip::(Bytes::from(data_bytes), 4, 8, -1, &[]); } #[test] @@ -1334,13 +1273,7 @@ mod tests { ]; let data_bytes = 
BoolType::to_byte_array(&data[..]); let mut buffer = [false; 10]; - test_plain_decode::( - ByteBufferPtr::new(data_bytes), - 10, - -1, - &mut buffer[..], - &data[..], - ); + test_plain_decode::(Bytes::from(data_bytes), 10, -1, &mut buffer[..], &data[..]); } #[test] @@ -1349,13 +1282,7 @@ mod tests { false, true, false, false, true, false, true, true, false, true, ]; let data_bytes = BoolType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 10, - 5, - -1, - &data[5..], - ); + test_plain_skip::(Bytes::from(data_bytes), 10, 5, -1, &data[5..]); } #[test] @@ -1364,18 +1291,18 @@ mod tests { false, true, false, false, true, false, true, true, false, true, ]; let data_bytes = BoolType::to_byte_array(&data[..]); - test_plain_skip::(ByteBufferPtr::new(data_bytes), 10, 20, -1, &[]); + test_plain_skip::(Bytes::from(data_bytes), 10, 20, -1, &[]); } #[test] fn test_plain_decode_byte_array() { let mut data = vec![ByteArray::new(); 2]; - data[0].set_data(ByteBufferPtr::new(String::from("hello").into_bytes())); - data[1].set_data(ByteBufferPtr::new(String::from("parquet").into_bytes())); + data[0].set_data(Bytes::from(String::from("hello"))); + data[1].set_data(Bytes::from(String::from("parquet"))); let data_bytes = ByteArrayType::to_byte_array(&data[..]); let mut buffer = vec![ByteArray::new(); 2]; test_plain_decode::( - ByteBufferPtr::new(data_bytes), + Bytes::from(data_bytes), 2, -1, &mut buffer[..], @@ -1386,37 +1313,31 @@ mod tests { #[test] fn test_plain_skip_byte_array() { let mut data = vec![ByteArray::new(); 2]; - data[0].set_data(ByteBufferPtr::new(String::from("hello").into_bytes())); - data[1].set_data(ByteBufferPtr::new(String::from("parquet").into_bytes())); + data[0].set_data(Bytes::from(String::from("hello"))); + data[1].set_data(Bytes::from(String::from("parquet"))); let data_bytes = ByteArrayType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 2, - 1, - -1, - &data[1..], - ); + test_plain_skip::(Bytes::from(data_bytes), 2, 1, -1, &data[1..]); } #[test] fn test_plain_skip_all_byte_array() { let mut data = vec![ByteArray::new(); 2]; - data[0].set_data(ByteBufferPtr::new(String::from("hello").into_bytes())); - data[1].set_data(ByteBufferPtr::new(String::from("parquet").into_bytes())); + data[0].set_data(Bytes::from(String::from("hello"))); + data[1].set_data(Bytes::from(String::from("parquet"))); let data_bytes = ByteArrayType::to_byte_array(&data[..]); - test_plain_skip::(ByteBufferPtr::new(data_bytes), 2, 2, -1, &[]); + test_plain_skip::(Bytes::from(data_bytes), 2, 2, -1, &[]); } #[test] fn test_plain_decode_fixed_len_byte_array() { let mut data = vec![FixedLenByteArray::default(); 3]; - data[0].set_data(ByteBufferPtr::new(String::from("bird").into_bytes())); - data[1].set_data(ByteBufferPtr::new(String::from("come").into_bytes())); - data[2].set_data(ByteBufferPtr::new(String::from("flow").into_bytes())); + data[0].set_data(Bytes::from(String::from("bird"))); + data[1].set_data(Bytes::from(String::from("come"))); + data[2].set_data(Bytes::from(String::from("flow"))); let data_bytes = FixedLenByteArrayType::to_byte_array(&data[..]); let mut buffer = vec![FixedLenByteArray::default(); 3]; test_plain_decode::( - ByteBufferPtr::new(data_bytes), + Bytes::from(data_bytes), 3, 4, &mut buffer[..], @@ -1427,37 +1348,25 @@ mod tests { #[test] fn test_plain_skip_fixed_len_byte_array() { let mut data = vec![FixedLenByteArray::default(); 3]; - data[0].set_data(ByteBufferPtr::new(String::from("bird").into_bytes())); - 
data[1].set_data(ByteBufferPtr::new(String::from("come").into_bytes())); - data[2].set_data(ByteBufferPtr::new(String::from("flow").into_bytes())); + data[0].set_data(Bytes::from(String::from("bird"))); + data[1].set_data(Bytes::from(String::from("come"))); + data[2].set_data(Bytes::from(String::from("flow"))); let data_bytes = FixedLenByteArrayType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 1, - 4, - &data[1..], - ); + test_plain_skip::(Bytes::from(data_bytes), 3, 1, 4, &data[1..]); } #[test] fn test_plain_skip_all_fixed_len_byte_array() { let mut data = vec![FixedLenByteArray::default(); 3]; - data[0].set_data(ByteBufferPtr::new(String::from("bird").into_bytes())); - data[1].set_data(ByteBufferPtr::new(String::from("come").into_bytes())); - data[2].set_data(ByteBufferPtr::new(String::from("flow").into_bytes())); + data[0].set_data(Bytes::from(String::from("bird"))); + data[1].set_data(Bytes::from(String::from("come"))); + data[2].set_data(Bytes::from(String::from("flow"))); let data_bytes = FixedLenByteArrayType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 6, - 4, - &[], - ); + test_plain_skip::(Bytes::from(data_bytes), 3, 6, 4, &[]); } fn test_plain_decode( - data: ByteBufferPtr, + data: Bytes, num_values: usize, type_length: i32, buffer: &mut [T::T], @@ -1473,7 +1382,7 @@ mod tests { } fn test_plain_skip( - data: ByteBufferPtr, + data: Bytes, num_values: usize, skip: usize, type_length: i32, @@ -1501,7 +1410,7 @@ mod tests { } fn test_plain_decode_spaced( - data: ByteBufferPtr, + data: Bytes, num_values: usize, type_length: i32, buffer: &mut [T::T], @@ -1530,9 +1439,7 @@ mod tests { #[should_panic(expected = "RleValueDecoder only supports BoolType")] fn test_rle_value_decode_int32_not_supported() { let mut decoder = RleValueDecoder::::new(); - decoder - .set_data(ByteBufferPtr::new(vec![5, 0, 0, 0]), 1) - .unwrap(); + decoder.set_data(Bytes::from(vec![5, 0, 0, 0]), 1).unwrap(); } #[test] @@ -1730,9 +1637,8 @@ mod tests { 128, 1, 4, 3, 58, 28, 6, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; - let buffer = ByteBufferPtr::new(data_bytes); let mut decoder: DeltaBitPackDecoder = DeltaBitPackDecoder::new(); - decoder.set_data(buffer, 3).unwrap(); + decoder.set_data(data_bytes.into(), 3).unwrap(); // check exact offsets, because when reading partial values we end up with // some data not being read from bit reader assert_eq!(decoder.get_offset(), 5); @@ -1794,7 +1700,7 @@ mod tests { let length = data.len(); - let ptr = ByteBufferPtr::new(data); + let ptr = Bytes::from(data); let mut reader = BitReader::new(ptr.clone()); assert_eq!(reader.get_vlq_int().unwrap(), 256); assert_eq!(reader.get_vlq_int().unwrap(), 4); @@ -1810,7 +1716,7 @@ mod tests { assert_eq!(decoder.get_offset(), length); // Test with truncated buffer - decoder.set_data(ptr.range(0, 12), 0).unwrap(); + decoder.set_data(ptr.slice(..12), 0).unwrap(); let err = decoder.get(&mut output).unwrap_err().to_string(); assert!( err.contains("Expected to read 64 values from miniblock got 8"), diff --git a/parquet/src/encodings/encoding/dict_encoder.rs b/parquet/src/encodings/encoding/dict_encoder.rs index 4f4a6ab4f55a..dafae064afbf 100644 --- a/parquet/src/encodings/encoding/dict_encoder.rs +++ b/parquet/src/encodings/encoding/dict_encoder.rs @@ -18,6 +18,8 @@ // ---------------------------------------------------------------------- // Dictionary encoding +use bytes::Bytes; + use crate::basic::{Encoding, Type}; 
use crate::data_type::private::ParquetValueType; use crate::data_type::DataType; @@ -27,7 +29,6 @@ use crate::errors::Result; use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::num_required_bits; use crate::util::interner::{Interner, Storage}; -use crate::util::memory::ByteBufferPtr; #[derive(Debug)] struct KeyStorage { @@ -112,7 +113,7 @@ impl DictEncoder { /// Writes out the dictionary values with PLAIN encoding in a byte buffer, and return /// the result. - pub fn write_dict(&self) -> Result { + pub fn write_dict(&self) -> Result { let mut plain_encoder = PlainEncoder::::new(); plain_encoder.put(&self.interner.storage().uniques)?; plain_encoder.flush_buffer() @@ -120,7 +121,7 @@ impl DictEncoder { /// Writes out the dictionary values with RLE encoding in a byte buffer, and return /// the result. - pub fn write_indices(&mut self) -> Result { + pub fn write_indices(&mut self) -> Result { let buffer_len = self.estimated_data_encoded_size(); let mut buffer = Vec::with_capacity(buffer_len); buffer.push(self.bit_width()); @@ -131,7 +132,7 @@ impl DictEncoder { encoder.put(*index) } self.indices.clear(); - Ok(ByteBufferPtr::new(encoder.consume())) + Ok(encoder.consume().into()) } fn put_one(&mut self, value: &T::T) { @@ -165,7 +166,7 @@ impl Encoder for DictEncoder { RleEncoder::max_buffer_size(bit_width, self.indices.len()) } - fn flush_buffer(&mut self) -> Result { + fn flush_buffer(&mut self) -> Result { self.write_indices() } } diff --git a/parquet/src/encodings/encoding/mod.rs b/parquet/src/encodings/encoding/mod.rs index 3088f332183b..89e61ee226ad 100644 --- a/parquet/src/encodings/encoding/mod.rs +++ b/parquet/src/encodings/encoding/mod.rs @@ -24,11 +24,9 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::*; use crate::encodings::rle::RleEncoder; use crate::errors::{ParquetError, Result}; -use crate::util::{ - bit_util::{self, num_required_bits, BitWriter}, - memory::ByteBufferPtr, -}; +use crate::util::bit_util::{self, num_required_bits, BitWriter}; +use bytes::Bytes; pub use dict_encoder::DictEncoder; mod dict_encoder; @@ -70,7 +68,7 @@ pub trait Encoder: Send { /// Flushes the underlying byte buffer that's being processed by this encoder, and /// return the immutable copy of it. This will also reset the internal state. - fn flush_buffer(&mut self) -> Result; + fn flush_buffer(&mut self) -> Result; } /// Gets a encoder for the particular data type `T` and encoding `encoding`. 
Memory usage @@ -143,7 +141,7 @@ impl Encoder for PlainEncoder { } #[inline] - fn flush_buffer(&mut self) -> Result { + fn flush_buffer(&mut self) -> Result { self.buffer .extend_from_slice(self.bit_writer.flush_buffer()); self.bit_writer.clear(); @@ -223,7 +221,7 @@ impl Encoder for RleValueEncoder { } #[inline] - fn flush_buffer(&mut self) -> Result { + fn flush_buffer(&mut self) -> Result { ensure_phys_ty!(Type::BOOLEAN, "RleValueEncoder only supports BoolType"); let rle_encoder = self .encoder @@ -238,7 +236,7 @@ impl Encoder for RleValueEncoder { let len = (buf.len() - 4) as i32; buf[..4].copy_from_slice(&len.to_le_bytes()); - Ok(ByteBufferPtr::new(buf)) + Ok(buf.into()) } } @@ -456,7 +454,7 @@ impl Encoder for DeltaBitPackEncoder { self.bit_writer.bytes_written() } - fn flush_buffer(&mut self) -> Result { + fn flush_buffer(&mut self) -> Result { // Write remaining values self.flush_block_values()?; // Write page header with total values @@ -597,7 +595,7 @@ impl Encoder for DeltaLengthByteArrayEncoder { self.len_encoder.estimated_data_encoded_size() + self.encoded_size } - fn flush_buffer(&mut self) -> Result { + fn flush_buffer(&mut self) -> Result { ensure_phys_ty!( Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY, "DeltaLengthByteArrayEncoder only supports ByteArrayType" @@ -605,14 +603,14 @@ impl Encoder for DeltaLengthByteArrayEncoder { let mut total_bytes = vec![]; let lengths = self.len_encoder.flush_buffer()?; - total_bytes.extend_from_slice(lengths.data()); + total_bytes.extend_from_slice(&lengths); self.data.iter().for_each(|byte_array| { total_bytes.extend_from_slice(byte_array.data()); }); self.data.clear(); self.encoded_size = 0; - Ok(ByteBufferPtr::new(total_bytes)) + Ok(total_bytes.into()) } } @@ -696,7 +694,7 @@ impl Encoder for DeltaByteArrayEncoder { + self.suffix_writer.estimated_data_encoded_size() } - fn flush_buffer(&mut self) -> Result { + fn flush_buffer(&mut self) -> Result { match T::get_physical_type() { Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => { // TODO: investigate if we can merge lengths and suffixes @@ -704,17 +702,17 @@ impl Encoder for DeltaByteArrayEncoder { let mut total_bytes = vec![]; // Insert lengths ... let lengths = self.prefix_len_encoder.flush_buffer()?; - total_bytes.extend_from_slice(lengths.data()); + total_bytes.extend_from_slice(&lengths); // ... 
followed by suffixes let suffixes = self.suffix_writer.flush_buffer()?; - total_bytes.extend_from_slice(suffixes.data()); + total_bytes.extend_from_slice(&suffixes); self.previous.clear(); - Ok(ByteBufferPtr::new(total_bytes)) + Ok(total_bytes.into()) } _ => panic!( "DeltaByteArrayEncoder only supports ByteArrayType and FixedLenByteArrayType" - ) + ), } } } diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 63ab15c73ead..5807f6b9c527 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -17,12 +17,11 @@ use std::{cmp, mem::size_of}; +use bytes::Bytes; + use crate::errors::{ParquetError, Result}; use crate::util::bit_util::from_le_slice; -use crate::util::{ - bit_util::{self, BitReader, BitWriter, FromBytes}, - memory::ByteBufferPtr, -}; +use crate::util::bit_util::{self, BitReader, BitWriter, FromBytes}; /// Rle/Bit-Packing Hybrid Encoding /// The grammar for this encoding looks like the following (copied verbatim @@ -326,7 +325,7 @@ impl RleDecoder { } #[inline] - pub fn set_data(&mut self, data: ByteBufferPtr) { + pub fn set_data(&mut self, data: Bytes) { if let Some(ref mut bit_reader) = self.bit_reader { bit_reader.reset(data); } else { @@ -543,17 +542,15 @@ mod tests { use crate::util::bit_util::ceil; use rand::{self, distributions::Standard, thread_rng, Rng, SeedableRng}; - use crate::util::memory::ByteBufferPtr; - const MAX_WIDTH: usize = 32; #[test] fn test_rle_decode_int32() { // Test data: 0-7 with bit width 3 // 00000011 10001000 11000110 11111010 - let data = ByteBufferPtr::new(vec![0x03, 0x88, 0xC6, 0xFA]); + let data = vec![0x03, 0x88, 0xC6, 0xFA]; let mut decoder: RleDecoder = RleDecoder::new(3); - decoder.set_data(data); + decoder.set_data(data.into()); let mut buffer = vec![0; 8]; let expected = vec![0, 1, 2, 3, 4, 5, 6, 7]; let result = decoder.get_batch::(&mut buffer); @@ -565,9 +562,9 @@ mod tests { fn test_rle_skip_int32() { // Test data: 0-7 with bit width 3 // 00000011 10001000 11000110 11111010 - let data = ByteBufferPtr::new(vec![0x03, 0x88, 0xC6, 0xFA]); + let data = vec![0x03, 0x88, 0xC6, 0xFA]; let mut decoder: RleDecoder = RleDecoder::new(3); - decoder.set_data(data); + decoder.set_data(data.into()); let expected = vec![2, 3, 4, 5, 6, 7]; let skipped = decoder.skip(2).expect("skipping values"); assert_eq!(skipped, 2); @@ -598,18 +595,17 @@ mod tests { fn test_rle_decode_bool() { // RLE test data: 50 1s followed by 50 0s // 01100100 00000001 01100100 00000000 - let data1 = ByteBufferPtr::new(vec![0x64, 0x01, 0x64, 0x00]); + let data1 = vec![0x64, 0x01, 0x64, 0x00]; // Bit-packing test data: alternating 1s and 0s, 100 total // 100 / 8 = 13 groups // 00011011 10101010 ... 
00001010 - let data2 = ByteBufferPtr::new(vec![ - 0x1B, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, - 0x0A, - ]); + let data2 = vec![ + 0x1B, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x0A, + ]; let mut decoder: RleDecoder = RleDecoder::new(1); - decoder.set_data(data1); + decoder.set_data(data1.into()); let mut buffer = vec![false; 100]; let mut expected = vec![]; for i in 0..100 { @@ -623,7 +619,7 @@ mod tests { assert!(result.is_ok()); assert_eq!(buffer, expected); - decoder.set_data(data2); + decoder.set_data(data2.into()); let mut buffer = vec![false; 100]; let mut expected = vec![]; for i in 0..100 { @@ -642,18 +638,17 @@ mod tests { fn test_rle_skip_bool() { // RLE test data: 50 1s followed by 50 0s // 01100100 00000001 01100100 00000000 - let data1 = ByteBufferPtr::new(vec![0x64, 0x01, 0x64, 0x00]); + let data1 = vec![0x64, 0x01, 0x64, 0x00]; // Bit-packing test data: alternating 1s and 0s, 100 total // 100 / 8 = 13 groups // 00011011 10101010 ... 00001010 - let data2 = ByteBufferPtr::new(vec![ - 0x1B, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, - 0x0A, - ]); + let data2 = vec![ + 0x1B, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x0A, + ]; let mut decoder: RleDecoder = RleDecoder::new(1); - decoder.set_data(data1); + decoder.set_data(data1.into()); let mut buffer = vec![true; 50]; let expected = vec![false; 50]; @@ -665,7 +660,7 @@ mod tests { assert_eq!(remainder, 50); assert_eq!(buffer, expected); - decoder.set_data(data2); + decoder.set_data(data2.into()); let mut buffer = vec![false; 50]; let mut expected = vec![]; for i in 0..50 { @@ -689,9 +684,9 @@ mod tests { // Test RLE encoding: 3 0s followed by 4 1s followed by 5 2s // 00000110 00000000 00001000 00000001 00001010 00000010 let dict = vec![10, 20, 30]; - let data = ByteBufferPtr::new(vec![0x06, 0x00, 0x08, 0x01, 0x0A, 0x02]); + let data = vec![0x06, 0x00, 0x08, 0x01, 0x0A, 0x02]; let mut decoder: RleDecoder = RleDecoder::new(3); - decoder.set_data(data); + decoder.set_data(data.into()); let mut buffer = vec![0; 12]; let expected = vec![10, 10, 10, 20, 20, 20, 20, 30, 30, 30, 30, 30]; let result = decoder.get_batch_with_dict::(&dict, &mut buffer, 12); @@ -702,9 +697,9 @@ mod tests { // 011 100 101 011 100 101 011 100 101 100 101 101 // 00000011 01100011 11000111 10001110 00000011 01100101 00001011 let dict = vec!["aaa", "bbb", "ccc", "ddd", "eee", "fff"]; - let data = ByteBufferPtr::new(vec![0x03, 0x63, 0xC7, 0x8E, 0x03, 0x65, 0x0B]); + let data = vec![0x03, 0x63, 0xC7, 0x8E, 0x03, 0x65, 0x0B]; let mut decoder: RleDecoder = RleDecoder::new(3); - decoder.set_data(data); + decoder.set_data(data.into()); let mut buffer = vec![""; 12]; let expected = vec![ "ddd", "eee", "fff", "ddd", "eee", "fff", "ddd", "eee", "fff", "eee", "fff", @@ -724,9 +719,9 @@ mod tests { // Test RLE encoding: 3 0s followed by 4 1s followed by 5 2s // 00000110 00000000 00001000 00000001 00001010 00000010 let dict = vec![10, 20, 30]; - let data = ByteBufferPtr::new(vec![0x06, 0x00, 0x08, 0x01, 0x0A, 0x02]); + let data = vec![0x06, 0x00, 0x08, 0x01, 0x0A, 0x02]; let mut decoder: RleDecoder = RleDecoder::new(3); - decoder.set_data(data); + decoder.set_data(data.into()); let mut buffer = vec![0; 10]; let expected = vec![10, 20, 20, 20, 20, 30, 30, 30, 30, 30]; let skipped = decoder.skip(2).expect("skipping two values"); @@ -741,9 +736,9 @@ mod tests { // 011 100 101 011 100 101 011 100 101 100 101 101 // 00000011 01100011 11000111 10001110 
00000011 01100101 00001011 let dict = vec!["aaa", "bbb", "ccc", "ddd", "eee", "fff"]; - let data = ByteBufferPtr::new(vec![0x03, 0x63, 0xC7, 0x8E, 0x03, 0x65, 0x0B]); + let data = vec![0x03, 0x63, 0xC7, 0x8E, 0x03, 0x65, 0x0B]; let mut decoder: RleDecoder = RleDecoder::new(3); - decoder.set_data(data); + decoder.set_data(data.into()); let mut buffer = vec![""; 8]; let expected = vec!["eee", "fff", "ddd", "eee", "fff", "eee", "fff", "fff"]; let skipped = decoder.skip(4).expect("skipping four values"); @@ -766,7 +761,7 @@ mod tests { for v in values { encoder.put(*v as u64) } - let buffer = ByteBufferPtr::new(encoder.consume()); + let buffer: Bytes = encoder.consume().into(); if expected_len != -1 { assert_eq!(buffer.len(), expected_len as usize); } @@ -776,7 +771,7 @@ mod tests { // Verify read let mut decoder = RleDecoder::new(bit_width); - decoder.set_data(buffer.all()); + decoder.set_data(buffer.clone()); for v in values { let val: i64 = decoder .get() @@ -888,7 +883,7 @@ mod tests { (3 << 1) | 1, // bit-packed run of 3 * 8 ]; data.extend(std::iter::repeat(0xFF).take(20)); - let data = ByteBufferPtr::new(data); + let data: Bytes = data.into(); let mut decoder = RleDecoder::new(8); decoder.set_data(data.clone()); @@ -926,7 +921,7 @@ mod tests { buffer.push(0); let mut decoder = RleDecoder::new(bit_width); - decoder.set_data(ByteBufferPtr::new(buffer)); + decoder.set_data(buffer.into()); // We don't always reliably know how many non-null values are contained in a page // and so the decoder must work correctly without a precise value count @@ -963,7 +958,7 @@ mod tests { for _ in 0..run_bytes { writer.put_aligned(0xFF_u8, 1); } - let buffer = ByteBufferPtr::new(writer.consume()); + let buffer: Bytes = writer.consume().into(); let mut decoder = RleDecoder::new(1); decoder.set_data(buffer.clone()); @@ -992,7 +987,7 @@ mod tests { } let buffer = encoder.consume(); let mut decoder = RleDecoder::new(bit_width); - decoder.set_data(ByteBufferPtr::new(buffer)); + decoder.set_data(Bytes::from(buffer)); let mut actual_values: Vec = vec![0; values.len()]; decoder .get_batch(&mut actual_values) @@ -1007,11 +1002,11 @@ mod tests { encoder.put(*v as u64) } - let buffer = ByteBufferPtr::new(encoder.consume()); + let buffer = Bytes::from(encoder.consume()); // Verify read let mut decoder = RleDecoder::new(bit_width); - decoder.set_data(buffer.all()); + decoder.set_data(buffer.clone()); for v in values { let val = decoder .get::() diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 0d032c27aa06..43e169cd085b 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -40,7 +40,7 @@ use crate::record::reader::RowIter; use crate::record::Row; use crate::schema::types::Type as SchemaType; use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; -use crate::util::memory::ByteBufferPtr; +use bytes::Bytes; use thrift::protocol::TCompactInputProtocol; impl TryFrom for SerializedFileReader { @@ -386,7 +386,7 @@ fn read_page_header_len(input: &mut T) -> Result<(usize, PageHeader)> { /// Decodes a [`Page`] from the provided `buffer` pub(crate) fn decode_page( page_header: PageHeader, - buffer: ByteBufferPtr, + buffer: Bytes, physical_type: Type, decompressor: Option<&mut Box>, ) -> Result { @@ -428,7 +428,7 @@ pub(crate) fn decode_page( )); } - ByteBufferPtr::new(decompressed) + Bytes::from(decompressed) } _ => buffer, }; @@ -627,7 +627,7 @@ impl PageReader for SerializedPageReader { decode_page( header, - 
ByteBufferPtr::new(buffer), + Bytes::from(buffer), self.physical_type, self.decompressor.as_mut(), )? @@ -656,7 +656,7 @@ impl PageReader for SerializedPageReader { let bytes = buffer.slice(offset..); decode_page( header, - bytes.into(), + bytes, self.physical_type, self.decompressor.as_mut(), )? diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index dbbd8b4b99a2..2b9f261d9f42 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -756,7 +756,6 @@ mod tests { use crate::record::{Row, RowAccessor}; use crate::schema::parser::parse_message_type; use crate::schema::types::{ColumnDescriptor, ColumnPath}; - use crate::util::memory::ByteBufferPtr; #[test] fn test_row_group_writer_error_not_all_columns_written() { @@ -1040,7 +1039,7 @@ mod tests { fn test_page_writer_data_pages() { let pages = vec![ Page::DataPage { - buf: ByteBufferPtr::new(vec![1, 2, 3, 4, 5, 6, 7, 8]), + buf: Bytes::from(vec![1, 2, 3, 4, 5, 6, 7, 8]), num_values: 10, encoding: Encoding::DELTA_BINARY_PACKED, def_level_encoding: Encoding::RLE, @@ -1048,7 +1047,7 @@ mod tests { statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), }, Page::DataPageV2 { - buf: ByteBufferPtr::new(vec![4; 128]), + buf: Bytes::from(vec![4; 128]), num_values: 10, encoding: Encoding::DELTA_BINARY_PACKED, num_nulls: 2, @@ -1068,13 +1067,13 @@ mod tests { fn test_page_writer_dict_pages() { let pages = vec![ Page::DictionaryPage { - buf: ByteBufferPtr::new(vec![1, 2, 3, 4, 5]), + buf: Bytes::from(vec![1, 2, 3, 4, 5]), num_values: 5, encoding: Encoding::RLE_DICTIONARY, is_sorted: false, }, Page::DataPage { - buf: ByteBufferPtr::new(vec![1, 2, 3, 4, 5, 6, 7, 8]), + buf: Bytes::from(vec![1, 2, 3, 4, 5, 6, 7, 8]), num_values: 10, encoding: Encoding::DELTA_BINARY_PACKED, def_level_encoding: Encoding::RLE, @@ -1082,7 +1081,7 @@ mod tests { statistics: Some(Statistics::int32(Some(1), Some(3), None, 7, true)), }, Page::DataPageV2 { - buf: ByteBufferPtr::new(vec![4; 128]), + buf: Bytes::from(vec![4; 128]), num_values: 10, encoding: Encoding::DELTA_BINARY_PACKED, num_nulls: 2, @@ -1122,10 +1121,10 @@ mod tests { ref statistics, } => { total_num_values += num_values as i64; - let output_buf = compress_helper(compressor.as_mut(), buf.data()); + let output_buf = compress_helper(compressor.as_mut(), buf); Page::DataPage { - buf: ByteBufferPtr::new(output_buf), + buf: Bytes::from(output_buf), num_values, encoding, def_level_encoding, @@ -1147,12 +1146,12 @@ mod tests { } => { total_num_values += num_values as i64; let offset = (def_levels_byte_len + rep_levels_byte_len) as usize; - let cmp_buf = compress_helper(compressor.as_mut(), &buf.data()[offset..]); - let mut output_buf = Vec::from(&buf.data()[..offset]); + let cmp_buf = compress_helper(compressor.as_mut(), &buf[offset..]); + let mut output_buf = Vec::from(&buf[..offset]); output_buf.extend_from_slice(&cmp_buf[..]); Page::DataPageV2 { - buf: ByteBufferPtr::new(output_buf), + buf: Bytes::from(output_buf), num_values, encoding, num_nulls, @@ -1170,10 +1169,10 @@ mod tests { encoding, is_sorted, } => { - let output_buf = compress_helper(compressor.as_mut(), buf.data()); + let output_buf = compress_helper(compressor.as_mut(), buf); Page::DictionaryPage { - buf: ByteBufferPtr::new(output_buf), + buf: Bytes::from(output_buf), num_values, encoding, is_sorted, @@ -1248,7 +1247,7 @@ mod tests { /// Check if pages match. 
fn assert_page(left: &Page, right: &Page) { assert_eq!(left.page_type(), right.page_type()); - assert_eq!(left.buffer().data(), right.buffer().data()); + assert_eq!(&left.buffer(), &right.buffer()); assert_eq!(left.num_values(), right.num_values()); assert_eq!(left.encoding(), right.encoding()); assert_eq!(to_thrift(left.statistics()), to_thrift(right.statistics())); diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index f1612c90cc2a..0279bbc382ea 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -74,10 +74,6 @@ pub mod data_type; #[doc(hidden)] pub use self::encodings::{decoding, encoding}; -#[cfg(feature = "experimental")] -#[doc(hidden)] -pub use self::util::memory; - experimental!(#[macro_use] mod util); #[cfg(feature = "arrow")] pub mod arrow; diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index 597190a46eff..b1dd23574a19 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -17,10 +17,11 @@ use std::{cmp, mem::size_of}; +use bytes::Bytes; + use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray, Int96}; use crate::errors::{ParquetError, Result}; use crate::util::bit_pack::{unpack16, unpack32, unpack64, unpack8}; -use crate::util::memory::ByteBufferPtr; #[inline] pub fn from_le_slice(bs: &[u8]) -> T { @@ -341,7 +342,7 @@ pub const MAX_VLQ_BYTE_LEN: usize = 10; pub struct BitReader { /// The byte buffer to read from, passed in by client - buffer: ByteBufferPtr, + buffer: Bytes, /// Bytes are memcpy'd from `buffer` and values are read from this variable. /// This is faster than reading values byte by byte directly from `buffer` @@ -365,7 +366,7 @@ pub struct BitReader { /// Utility class to read bit/byte stream. This class can read bits or bytes that are /// either byte aligned or not. 
impl BitReader { - pub fn new(buffer: ByteBufferPtr) -> Self { + pub fn new(buffer: Bytes) -> Self { BitReader { buffer, buffered_values: 0, @@ -374,7 +375,7 @@ impl BitReader { } } - pub fn reset(&mut self, buffer: ByteBufferPtr) { + pub fn reset(&mut self, buffer: Bytes) { self.buffer = buffer; self.buffered_values = 0; self.byte_offset = 0; @@ -456,8 +457,6 @@ impl BitReader { } } - let in_buf = self.buffer.data(); - // Read directly into output buffer match size_of::() { 1 => { @@ -465,7 +464,7 @@ impl BitReader { let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) }; while values_to_read - i >= 8 { let out_slice = (&mut out[i..i + 8]).try_into().unwrap(); - unpack8(&in_buf[self.byte_offset..], out_slice, num_bits); + unpack8(&self.buffer[self.byte_offset..], out_slice, num_bits); self.byte_offset += num_bits; i += 8; } @@ -475,7 +474,7 @@ impl BitReader { let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) }; while values_to_read - i >= 16 { let out_slice = (&mut out[i..i + 16]).try_into().unwrap(); - unpack16(&in_buf[self.byte_offset..], out_slice, num_bits); + unpack16(&self.buffer[self.byte_offset..], out_slice, num_bits); self.byte_offset += 2 * num_bits; i += 16; } @@ -485,7 +484,7 @@ impl BitReader { let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) }; while values_to_read - i >= 32 { let out_slice = (&mut out[i..i + 32]).try_into().unwrap(); - unpack32(&in_buf[self.byte_offset..], out_slice, num_bits); + unpack32(&self.buffer[self.byte_offset..], out_slice, num_bits); self.byte_offset += 4 * num_bits; i += 32; } @@ -495,7 +494,7 @@ impl BitReader { let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) }; while values_to_read - i >= 64 { let out_slice = (&mut out[i..i + 64]).try_into().unwrap(); - unpack64(&in_buf[self.byte_offset..], out_slice, num_bits); + unpack64(&self.buffer[self.byte_offset..], out_slice, num_bits); self.byte_offset += 8 * num_bits; i += 64; } @@ -506,7 +505,7 @@ impl BitReader { // Try to read smaller batches if possible if size_of::() > 4 && values_to_read - i >= 32 && num_bits <= 32 { let mut out_buf = [0_u32; 32]; - unpack32(&in_buf[self.byte_offset..], &mut out_buf, num_bits); + unpack32(&self.buffer[self.byte_offset..], &mut out_buf, num_bits); self.byte_offset += 4 * num_bits; for out in out_buf { @@ -520,7 +519,7 @@ impl BitReader { if size_of::() > 2 && values_to_read - i >= 16 && num_bits <= 16 { let mut out_buf = [0_u16; 16]; - unpack16(&in_buf[self.byte_offset..], &mut out_buf, num_bits); + unpack16(&self.buffer[self.byte_offset..], &mut out_buf, num_bits); self.byte_offset += 2 * num_bits; for out in out_buf { @@ -534,7 +533,7 @@ impl BitReader { if size_of::() > 1 && values_to_read - i >= 8 && num_bits <= 8 { let mut out_buf = [0_u8; 8]; - unpack8(&in_buf[self.byte_offset..], &mut out_buf, num_bits); + unpack8(&self.buffer[self.byte_offset..], &mut out_buf, num_bits); self.byte_offset += num_bits; for out in out_buf { @@ -595,7 +594,7 @@ impl BitReader { self.byte_offset = self.get_byte_offset(); self.bit_offset = 0; - let src = &self.buffer.data()[self.byte_offset..]; + let src = &self.buffer[self.byte_offset..]; let to_read = num_bytes.min(src.len()); buf.extend_from_slice(&src[..to_read]); @@ -620,7 +619,7 @@ impl BitReader { } // Advance byte_offset to next unread byte and read num_bytes - let v = read_num_bytes::(num_bytes, &self.buffer.data()[self.byte_offset..]); + let v = read_num_bytes::(num_bytes, &self.buffer[self.byte_offset..]); self.byte_offset += num_bytes; Some(v) 
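
A note on why the direct indexing in these BitReader hunks works (illustrative, not part of the diff): Bytes dereferences to [u8], so range-indexing the buffer replaces the old `.data()` accessor, and both clones and sub-slices are owned views onto the same reference-counted allocation, keeping the reader zero-copy:

    use bytes::Bytes;

    fn bytes_behaves_like_a_shared_slice() {
        let buffer = Bytes::from(vec![0x75u8, 0xCB, 0x01, 0xFF]);
        let byte_offset = 1;

        // Range indexing yields a plain &[u8], like the old buffer.data()[offset..].
        let rest: &[u8] = &buffer[byte_offset..];
        assert_eq!(rest, &[0xCB, 0x01, 0xFF][..]);

        // Clones and sub-slices share the allocation rather than copying it.
        let clone = buffer.clone();
        let window = buffer.slice(1..3);
        assert_eq!(clone.len(), 4);
        assert_eq!(window.as_ref(), &[0xCB, 0x01][..]);
    }
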
@@ -672,14 +671,14 @@ impl BitReader { fn load_buffered_values(&mut self) { let bytes_to_read = cmp::min(self.buffer.len() - self.byte_offset, 8); self.buffered_values = - read_num_bytes::(bytes_to_read, &self.buffer.data()[self.byte_offset..]); + read_num_bytes::(bytes_to_read, &self.buffer[self.byte_offset..]); } } impl From> for BitReader { #[inline] fn from(buffer: Vec) -> Self { - BitReader::new(ByteBufferPtr::new(buffer)) + BitReader::new(buffer.into()) } } @@ -771,12 +770,12 @@ mod tests { #[test] fn test_bit_reader_get_aligned() { // 01110101 11001011 - let buffer = ByteBufferPtr::new(vec![0x75, 0xCB]); - let mut bit_reader = BitReader::new(buffer.all()); + let buffer = Bytes::from(vec![0x75, 0xCB]); + let mut bit_reader = BitReader::new(buffer.clone()); assert_eq!(bit_reader.get_value::(3), Some(5)); assert_eq!(bit_reader.get_aligned::(1), Some(203)); assert_eq!(bit_reader.get_value::(1), None); - bit_reader.reset(buffer.all()); + bit_reader.reset(buffer.clone()); assert_eq!(bit_reader.get_aligned::(3), None); } @@ -1128,7 +1127,7 @@ mod tests { #[test] fn test_get_batch_zero_extend() { let to_read = vec![0xFF; 4]; - let mut reader = BitReader::new(ByteBufferPtr::new(to_read)); + let mut reader = BitReader::from(to_read); // Create a non-zeroed output buffer let mut output = [u64::MAX; 32]; diff --git a/parquet/src/util/memory.rs b/parquet/src/util/memory.rs deleted file mode 100644 index 25d15dd4ff73..000000000000 --- a/parquet/src/util/memory.rs +++ /dev/null @@ -1,149 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Utility methods and structs for working with memory. - -use bytes::Bytes; -use std::{ - fmt::{Debug, Display, Formatter, Result as FmtResult}, - ops::Index, -}; - -// ---------------------------------------------------------------------- -// Immutable Buffer (BufferPtr) classes - -/// An representation of a slice on a reference-counting and read-only byte array. -/// Sub-slices can be further created from this. The byte array will be released -/// when all slices are dropped. -/// -/// TODO: Remove and replace with [`bytes::Bytes`] -#[derive(Clone, Debug)] -pub struct ByteBufferPtr { - data: Bytes, -} - -impl ByteBufferPtr { - /// Creates new buffer from a vector. - pub fn new(v: Vec) -> Self { - Self { data: v.into() } - } - - /// Returns slice of data in this buffer. - #[inline] - pub fn data(&self) -> &[u8] { - &self.data - } - - /// Returns length of this buffer - #[inline] - pub fn len(&self) -> usize { - self.data.len() - } - - /// Returns whether this buffer is empty - #[inline] - pub fn is_empty(&self) -> bool { - self.data.is_empty() - } - - /// Returns a shallow copy of the buffer. - /// Reference counted pointer to the data is copied. 
- pub fn all(&self) -> Self { - self.clone() - } - - /// Returns a shallow copy of the buffer that starts with `start` position. - pub fn start_from(&self, start: usize) -> Self { - Self { - data: self.data.slice(start..), - } - } - - /// Returns a shallow copy that is a range slice within this buffer. - pub fn range(&self, start: usize, len: usize) -> Self { - Self { - data: self.data.slice(start..start + len), - } - } -} - -impl Index for ByteBufferPtr { - type Output = u8; - - fn index(&self, index: usize) -> &u8 { - &self.data[index] - } -} - -impl Display for ByteBufferPtr { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{:?}", self.data) - } -} - -impl AsRef<[u8]> for ByteBufferPtr { - #[inline] - fn as_ref(&self) -> &[u8] { - &self.data - } -} - -impl From> for ByteBufferPtr { - fn from(data: Vec) -> Self { - Self { data: data.into() } - } -} - -impl From for ByteBufferPtr { - fn from(data: Bytes) -> Self { - Self { data } - } -} - -impl From for Bytes { - fn from(value: ByteBufferPtr) -> Self { - value.data - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_byte_ptr() { - let values = (0..50).collect(); - let ptr = ByteBufferPtr::new(values); - assert_eq!(ptr.len(), 50); - assert_eq!(ptr[40], 40); - - let ptr2 = ptr.all(); - assert_eq!(ptr2.len(), 50); - assert_eq!(ptr2[40], 40); - - let ptr3 = ptr.start_from(20); - assert_eq!(ptr3.len(), 30); - assert_eq!(ptr3[0], 20); - - let ptr4 = ptr3.range(10, 10); - assert_eq!(ptr4.len(), 10); - assert_eq!(ptr4[0], 30); - - let expected: Vec = (30..40).collect(); - assert_eq!(ptr4.as_ref(), expected.as_slice()); - } -} diff --git a/parquet/src/util/mod.rs b/parquet/src/util/mod.rs index d96a62a9f363..dfa1285afcf2 100644 --- a/parquet/src/util/mod.rs +++ b/parquet/src/util/mod.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -pub mod memory; #[macro_use] pub mod bit_util; mod bit_pack; diff --git a/parquet/src/util/test_common/page_util.rs b/parquet/src/util/test_common/page_util.rs index c51c5158cd42..b4fed752fdc5 100644 --- a/parquet/src/util/test_common/page_util.rs +++ b/parquet/src/util/test_common/page_util.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
+use bytes::Bytes; + use crate::basic::Encoding; use crate::column::page::{Page, PageIterator}; use crate::column::page::{PageMetadata, PageReader}; @@ -23,7 +25,6 @@ use crate::encodings::encoding::{get_encoder, Encoder}; use crate::encodings::levels::LevelEncoder; use crate::errors::Result; use crate::schema::types::ColumnDescPtr; -use crate::util::memory::ByteBufferPtr; use std::iter::Peekable; use std::mem; @@ -31,7 +32,7 @@ pub trait DataPageBuilder { fn add_rep_levels(&mut self, max_level: i16, rep_levels: &[i16]); fn add_def_levels(&mut self, max_level: i16, def_levels: &[i16]); fn add_values(&mut self, encoding: Encoding, values: &[T::T]); - fn add_indices(&mut self, indices: ByteBufferPtr); + fn add_indices(&mut self, indices: Bytes); fn consume(self) -> Page; } @@ -112,18 +113,18 @@ impl DataPageBuilder for DataPageBuilderImpl { let encoded_values = encoder .flush_buffer() .expect("consume_buffer() should be OK"); - self.buffer.extend_from_slice(encoded_values.data()); + self.buffer.extend_from_slice(&encoded_values); } - fn add_indices(&mut self, indices: ByteBufferPtr) { + fn add_indices(&mut self, indices: Bytes) { self.encoding = Some(Encoding::RLE_DICTIONARY); - self.buffer.extend_from_slice(indices.data()); + self.buffer.extend_from_slice(&indices); } fn consume(self) -> Page { if self.datapage_v2 { Page::DataPageV2 { - buf: ByteBufferPtr::new(self.buffer), + buf: Bytes::from(self.buffer), num_values: self.num_values, encoding: self.encoding.unwrap(), num_nulls: 0, /* set to dummy value - don't need this when reading @@ -137,7 +138,7 @@ impl DataPageBuilder for DataPageBuilderImpl { } } else { Page::DataPage { - buf: ByteBufferPtr::new(self.buffer), + buf: Bytes::from(self.buffer), num_values: self.num_values, encoding: self.encoding.unwrap(), def_level_encoding: Encoding::RLE, diff --git a/parquet/src/util/test_common/rand_gen.rs b/parquet/src/util/test_common/rand_gen.rs index c36b9060ca58..a267c34840c1 100644 --- a/parquet/src/util/test_common/rand_gen.rs +++ b/parquet/src/util/test_common/rand_gen.rs @@ -17,6 +17,7 @@ use crate::basic::Encoding; use crate::column::page::Page; +use bytes::Bytes; use rand::{ distributions::{uniform::SampleUniform, Distribution, Standard}, thread_rng, Rng, @@ -26,7 +27,6 @@ use std::collections::VecDeque; use crate::data_type::*; use crate::encodings::encoding::{DictEncoder, Encoder}; use crate::schema::types::ColumnDescPtr; -use crate::util::memory::ByteBufferPtr; use crate::util::{DataPageBuilder, DataPageBuilderImpl}; /// Random generator of data type `T` values and sequences. @@ -90,7 +90,7 @@ impl RandGen for ByteArrayType { for _ in 0..len { value.push(rng.gen_range(0..255)); } - result.set_data(ByteBufferPtr::new(value)); + result.set_data(Bytes::from(value)); result } } From 0cb30bb51e39e36e95870dbd6caa92ce47b73e5b Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 8 Nov 2023 16:00:41 +0100 Subject: [PATCH 1339/1411] refactor: change `object_store` CA handling (#5056) Closes #4870. 
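
In practice this builds the reqwest dependency with `rustls-tls-native-roots`, so the system's certificate store supplies the CA roots by default, while the new `tls-webpki-roots` feature lets a downstream crate additionally bundle Mozilla's webpki roots, roughly via `object_store = { version = "...", features = ["aws", "tls-webpki-roots"] }` in its Cargo.toml (the version requirement and extra feature list here are placeholders). The documentation added to object_store/src/lib.rs below spells out the same choice.
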
--- object_store/Cargo.toml | 3 ++- object_store/src/lib.rs | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 7fcb6ce9e3f1..bf8301557df2 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -50,7 +50,7 @@ quick-xml = { version = "0.31.0", features = ["serialize", "overlapped-lists"], serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } -reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } +reqwest = { version = "0.11", default-features = false, features = ["rustls-tls-native-roots"], optional = true } ring = { version = "0.17", default-features = false, features = ["std"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } @@ -64,6 +64,7 @@ azure = ["cloud"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud"] http = ["cloud"] +tls-webpki-roots = ["reqwest?/rustls-tls-webpki-roots"] [dev-dependencies] # In alphabetical order tempfile = "3.1.0" diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index cdd572dd9b3a..f791e65b386c 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -86,6 +86,17 @@ doc = "* [`http`]: [HTTP/WebDAV Storage](https://datatracker.ietf.org/doc/html/rfc2518). See [`HttpBuilder`](http::HttpBuilder)" )] //! +//! # TLS Certificates +//! +//! Stores that use HTTPS/TLS (this is true for most cloud stores) can choose the source of their [CA] +//! certificates. By default the system-bundled certificates are used (see +//! [`rustls-native-certs`]). The `tls-webpki-roots` feature switch can be used to also bundle Mozilla's +//! root certificates with the library/application (see [`webpki-roots`]). +//! +//! [CA]: https://en.wikipedia.org/wiki/Certificate_authority +//! [`rustls-native-certs`]: https://crates.io/crates/rustls-native-certs/ +//! [`webpki-roots`]: https://crates.io/crates/webpki-roots +//! //! # Why not a Filesystem Interface? //! //! Whilst this crate does provide a [`BufReader`], the [`ObjectStore`] interface mirrors the APIs From f53f284b3e6933433c656f4d66e851b566c58c32 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 8 Nov 2023 16:51:13 +0100 Subject: [PATCH 1340/1411] docs: re-order `object_store` intro (#5058) --- object_store/src/lib.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index f791e65b386c..2d1d549f9e54 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -86,17 +86,6 @@ doc = "* [`http`]: [HTTP/WebDAV Storage](https://datatracker.ietf.org/doc/html/rfc2518). See [`HttpBuilder`](http::HttpBuilder)" )] //! -//! # TLS Certificates -//! -//! Stores that use HTTPS/TLS (this is true for most cloud stores) can choose the source of their [CA] -//! certificates. By default the system-bundled certificates are used (see -//! [`rustls-native-certs`]). The `tls-webpki-roots` feature switch can be used to also bundle Mozilla's -//! root certificates with the library/application (see [`webpki-roots`]). -//! -//! [CA]: https://en.wikipedia.org/wiki/Certificate_authority -//! 
[`rustls-native-certs`]: https://crates.io/crates/rustls-native-certs/ -//! [`webpki-roots`]: https://crates.io/crates/webpki-roots -//! //! # Why not a Filesystem Interface? //! //! Whilst this crate does provide a [`BufReader`], the [`ObjectStore`] interface mirrors the APIs @@ -447,6 +436,17 @@ //! [Apache Iceberg]: https://iceberg.apache.org/ //! [Delta Lake]: https://delta.io/ //! +//! # TLS Certificates +//! +//! Stores that use HTTPS/TLS (this is true for most cloud stores) can choose the source of their [CA] +//! certificates. By default the system-bundled certificates are used (see +//! [`rustls-native-certs`]). The `tls-webpki-roots` feature switch can be used to also bundle Mozilla's +//! root certificates with the library/application (see [`webpki-roots`]). +//! +//! [CA]: https://en.wikipedia.org/wiki/Certificate_authority +//! [`rustls-native-certs`]: https://crates.io/crates/rustls-native-certs/ +//! [`webpki-roots`]: https://crates.io/crates/webpki-roots +//! #[cfg(all( target_arch = "wasm32", From 31b5724332666d68fe94a5b3572a13a51022ea81 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" <193874+carols10cents@users.noreply.github.com> Date: Fri, 10 Nov 2023 10:41:07 -0500 Subject: [PATCH 1341/1411] Add a PR under "Breaking changes" in the object_store 0.8.0 changelog (#5063) This PR adds a method, `put_opts`, to the `ObjectStore` trait, so any implementer of this trait will need to update their code when they upgrade to 0.8.0. --- object_store/CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index c24cf54cc3be..7a4fcd0850b8 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -31,6 +31,7 @@ - Add ObjectMeta::version and GetOptions::version \(\#4925\) [\#4935](https://github.com/apache/arrow-rs/pull/4935) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) - Add GetOptions::head [\#4931](https://github.com/apache/arrow-rs/pull/4931) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) - Remove Nested async and Fallibility from ObjectStore::list [\#4930](https://github.com/apache/arrow-rs/pull/4930) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore::put_opts / Conditional Put [\#4879](https://github.com/apache/arrow-rs/pull/4984) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** From 924b6e9d0e62ad8cb85419268d8765611a72631e Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 14 Nov 2023 08:01:10 +1100 Subject: [PATCH 1342/1411] IPC writer truncated sliced list/map values (#5071) * IPC writer truncated sliced list/map values * Add empty list test * Revert submodule update --- arrow-ipc/src/writer.rs | 429 ++++++++++++++++++++++++++-------------- 1 file changed, 285 insertions(+), 144 deletions(-) diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index a58cbfc51428..1f6bf5f6fa85 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1139,6 +1139,29 @@ fn get_buffer_element_width(spec: &BufferSpec) -> usize { } } +/// Common functionality for re-encoding offsets. Returns the new offsets as well as +/// original start offset and length for use in slicing child data. 
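+///
+/// The returned start offset and length are expressed in terms of the child values,
+/// so callers can use them to slice either the values `Buffer` or the child `ArrayData`.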
+fn reencode_offsets( + offsets: &Buffer, + data: &ArrayData, +) -> (Buffer, usize, usize) { + let offsets_slice: &[O] = offsets.typed_data::(); + let offset_slice = &offsets_slice[data.offset()..data.offset() + data.len() + 1]; + + let start_offset = offset_slice.first().unwrap(); + let end_offset = offset_slice.last().unwrap(); + + let offsets = match start_offset.as_usize() { + 0 => offsets.clone(), + _ => offset_slice.iter().map(|x| *x - *start_offset).collect(), + }; + + let start_offset = start_offset.as_usize(); + let end_offset = end_offset.as_usize(); + + (offsets, start_offset, end_offset - start_offset) +} + /// Returns the values and offsets [`Buffer`] for a ByteArray with offset type `O` /// /// In particular, this handles re-encoding the offsets if they don't start at `0`, @@ -1149,23 +1172,24 @@ fn get_byte_array_buffers(data: &ArrayData) -> (Buffer, Buff return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into()); } - let buffers = data.buffers(); - let offsets: &[O] = buffers[0].typed_data::(); - let offset_slice = &offsets[data.offset()..data.offset() + data.len() + 1]; - - let start_offset = offset_slice.first().unwrap(); - let end_offset = offset_slice.last().unwrap(); + let (offsets, original_start_offset, len) = reencode_offsets::(&data.buffers()[0], data); + let values = data.buffers()[1].slice_with_length(original_start_offset, len); + (offsets, values) +} - let offsets = match start_offset.as_usize() { - 0 => buffers[0].clone(), - _ => offset_slice.iter().map(|x| *x - *start_offset).collect(), - }; +/// Similar logic as [`get_byte_array_buffers()`] but slices the child array instead +/// of a values buffer. +fn get_list_array_buffers(data: &ArrayData) -> (Buffer, ArrayData) { + if data.is_empty() { + return ( + MutableBuffer::new(0).into(), + data.child_data()[0].slice(0, 0), + ); + } - let values = buffers[1].slice_with_length( - start_offset.as_usize(), - end_offset.as_usize() - start_offset.as_usize(), - ); - (offsets, values) + let (offsets, original_start_offset, len) = reencode_offsets::(&data.buffers()[0], data); + let child_data = data.child_data()[0].slice(original_start_offset, len); + (offsets, child_data) } /// Write array data to a vector of bytes @@ -1250,20 +1274,14 @@ fn write_array_data( let byte_width = get_buffer_element_width(spec); let min_length = array_data.len() * byte_width; - if buffer_need_truncate(array_data.offset(), buffer, spec, min_length) { + let buffer_slice = if buffer_need_truncate(array_data.offset(), buffer, spec, min_length) { let byte_offset = array_data.offset() * byte_width; let buffer_length = min(min_length, buffer.len() - byte_offset); - let buffer_slice = &buffer.as_slice()[byte_offset..(byte_offset + buffer_length)]; - offset = write_buffer(buffer_slice, buffers, arrow_data, offset, compression_codec)?; + &buffer.as_slice()[byte_offset..(byte_offset + buffer_length)] } else { - offset = write_buffer( - buffer.as_slice(), - buffers, - arrow_data, - offset, - compression_codec, - )?; - } + buffer.as_slice() + }; + offset = write_buffer(buffer_slice, buffers, arrow_data, offset, compression_codec)?; } else if matches!(data_type, DataType::Boolean) { // Bools are special because the payload (= 1 bit) is smaller than the physical container elements (= bytes). // The array data may not start at the physical boundary of the underlying buffer, so we need to shift bits around. 
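
As a rough illustration of the offset rebasing performed by the helper above, here is a
minimal standalone sketch over plain `i32` offsets; all names are illustrative and not
part of the patch or of the arrow APIs:

    // Rebase the offsets of a sliced list so they start at zero, and report which
    // range of the child values the slice actually references.
    fn rebase_offsets(offsets: &[i32], slice_start: usize, slice_len: usize) -> (Vec<i32>, usize, usize) {
        let window = &offsets[slice_start..slice_start + slice_len + 1];
        let first = window[0];
        let rebased: Vec<i32> = window.iter().map(|o| o - first).collect();
        let child_len = (window[slice_len] - first) as usize;
        (rebased, first as usize, child_len)
    }

    fn main() {
        // Offsets for lists [0..3), [3..5), [5..9); slice to keep the last two lists.
        let (rebased, start, len) = rebase_offsets(&[0, 3, 5, 9], 1, 2);
        assert_eq!(rebased, vec![0, 2, 6]);
        assert_eq!((start, len), (3, 6));
    }
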
@@ -1272,6 +1290,39 @@ fn write_array_data( let buffer = &array_data.buffers()[0]; let buffer = buffer.bit_slice(array_data.offset(), array_data.len()); offset = write_buffer(&buffer, buffers, arrow_data, offset, compression_codec)?; + } else if matches!( + data_type, + DataType::List(_) | DataType::LargeList(_) | DataType::Map(_, _) + ) { + assert_eq!(array_data.buffers().len(), 1); + assert_eq!(array_data.child_data().len(), 1); + + // Truncate offsets and the child data to avoid writing unnecessary data + let (offsets, sliced_child_data) = match data_type { + DataType::List(_) => get_list_array_buffers::(array_data), + DataType::Map(_, _) => get_list_array_buffers::(array_data), + DataType::LargeList(_) => get_list_array_buffers::(array_data), + _ => unreachable!(), + }; + offset = write_buffer( + offsets.as_slice(), + buffers, + arrow_data, + offset, + compression_codec, + )?; + offset = write_array_data( + &sliced_child_data, + buffers, + arrow_data, + nodes, + offset, + sliced_child_data.len(), + sliced_child_data.null_count(), + compression_codec, + write_options, + )?; + return Ok(offset); } else { for buffer in array_data.buffers() { offset = write_buffer(buffer, buffers, arrow_data, offset, compression_codec)?; @@ -1372,8 +1423,10 @@ mod tests { use std::io::Seek; use std::sync::Arc; + use arrow_array::builder::GenericListBuilder; + use arrow_array::builder::MapBuilder; use arrow_array::builder::UnionBuilder; - use arrow_array::builder::{ListBuilder, PrimitiveRunBuilder, UInt32Builder}; + use arrow_array::builder::{PrimitiveRunBuilder, UInt32Builder}; use arrow_array::types::*; use arrow_schema::DataType; @@ -1382,6 +1435,30 @@ mod tests { use super::*; + fn serialize_file(rb: &RecordBatch) -> Vec { + let mut writer = FileWriter::try_new(vec![], &rb.schema()).unwrap(); + writer.write(rb).unwrap(); + writer.finish().unwrap(); + writer.into_inner().unwrap() + } + + fn deserialize_file(bytes: Vec) -> RecordBatch { + let mut reader = FileReader::try_new(Cursor::new(bytes), None).unwrap(); + reader.next().unwrap().unwrap() + } + + fn serialize_stream(record: &RecordBatch) -> Vec { + let mut stream_writer = StreamWriter::try_new(vec![], &record.schema()).unwrap(); + stream_writer.write(record).unwrap(); + stream_writer.finish().unwrap(); + stream_writer.into_inner().unwrap() + } + + fn deserialize_stream(bytes: Vec) -> RecordBatch { + let mut stream_reader = StreamReader::try_new(Cursor::new(bytes), None).unwrap(); + stream_reader.next().unwrap().unwrap() + } + #[test] #[cfg(feature = "lz4")] fn test_write_empty_record_batch_lz4_compression() { @@ -1407,27 +1484,18 @@ mod tests { file.rewind().unwrap(); { // read file - let mut reader = FileReader::try_new(file, None).unwrap(); - loop { - match reader.next() { - Some(Ok(read_batch)) => { - read_batch - .columns() - .iter() - .zip(record_batch.columns()) - .for_each(|(a, b)| { - assert_eq!(a.data_type(), b.data_type()); - assert_eq!(a.len(), b.len()); - assert_eq!(a.null_count(), b.null_count()); - }); - } - Some(Err(e)) => { - panic!("{}", e); - } - None => { - break; - } - } + let reader = FileReader::try_new(file, None).unwrap(); + for read_batch in reader { + read_batch + .unwrap() + .columns() + .iter() + .zip(record_batch.columns()) + .for_each(|(a, b)| { + assert_eq!(a.data_type(), b.data_type()); + assert_eq!(a.len(), b.len()); + assert_eq!(a.null_count(), b.null_count()); + }); } } } @@ -1456,27 +1524,18 @@ mod tests { file.rewind().unwrap(); { // read file - let mut reader = FileReader::try_new(file, None).unwrap(); - loop { - 
match reader.next() { - Some(Ok(read_batch)) => { - read_batch - .columns() - .iter() - .zip(record_batch.columns()) - .for_each(|(a, b)| { - assert_eq!(a.data_type(), b.data_type()); - assert_eq!(a.len(), b.len()); - assert_eq!(a.null_count(), b.null_count()); - }); - } - Some(Err(e)) => { - panic!("{}", e); - } - None => { - break; - } - } + let reader = FileReader::try_new(file, None).unwrap(); + for read_batch in reader { + read_batch + .unwrap() + .columns() + .iter() + .zip(record_batch.columns()) + .for_each(|(a, b)| { + assert_eq!(a.data_type(), b.data_type()); + assert_eq!(a.len(), b.len()); + assert_eq!(a.null_count(), b.null_count()); + }); } } } @@ -1504,27 +1563,18 @@ mod tests { file.rewind().unwrap(); { // read file - let mut reader = FileReader::try_new(file, None).unwrap(); - loop { - match reader.next() { - Some(Ok(read_batch)) => { - read_batch - .columns() - .iter() - .zip(record_batch.columns()) - .for_each(|(a, b)| { - assert_eq!(a.data_type(), b.data_type()); - assert_eq!(a.len(), b.len()); - assert_eq!(a.null_count(), b.null_count()); - }); - } - Some(Err(e)) => { - panic!("{}", e); - } - None => { - break; - } - } + let reader = FileReader::try_new(file, None).unwrap(); + for read_batch in reader { + read_batch + .unwrap() + .columns() + .iter() + .zip(record_batch.columns()) + .for_each(|(a, b)| { + assert_eq!(a.data_type(), b.data_type()); + assert_eq!(a.len(), b.len()); + assert_eq!(a.null_count(), b.null_count()); + }); } } } @@ -1754,20 +1804,6 @@ mod tests { write_union_file(IpcWriteOptions::try_new(8, false, MetadataVersion::V5).unwrap()); } - fn serialize(record: &RecordBatch) -> Vec { - let buffer: Vec = Vec::new(); - let mut stream_writer = StreamWriter::try_new(buffer, &record.schema()).unwrap(); - stream_writer.write(record).unwrap(); - stream_writer.finish().unwrap(); - stream_writer.into_inner().unwrap() - } - - fn deserialize(bytes: Vec) -> RecordBatch { - let mut stream_reader = - crate::reader::StreamReader::try_new(std::io::Cursor::new(bytes), None).unwrap(); - stream_reader.next().unwrap().unwrap() - } - #[test] fn truncate_ipc_record_batch() { fn create_batch(rows: usize) -> RecordBatch { @@ -1789,14 +1825,16 @@ mod tests { let offset = 2; let record_batch_slice = big_record_batch.slice(offset, length); - assert!(serialize(&big_record_batch).len() > serialize(&small_record_batch).len()); + assert!( + serialize_stream(&big_record_batch).len() > serialize_stream(&small_record_batch).len() + ); assert_eq!( - serialize(&small_record_batch).len(), - serialize(&record_batch_slice).len() + serialize_stream(&small_record_batch).len(), + serialize_stream(&record_batch_slice).len() ); assert_eq!( - deserialize(serialize(&record_batch_slice)), + deserialize_stream(serialize_stream(&record_batch_slice)), record_batch_slice ); } @@ -1817,9 +1855,11 @@ mod tests { let record_batch = create_batch(); let record_batch_slice = record_batch.slice(1, 2); - let deserialized_batch = deserialize(serialize(&record_batch_slice)); + let deserialized_batch = deserialize_stream(serialize_stream(&record_batch_slice)); - assert!(serialize(&record_batch).len() > serialize(&record_batch_slice).len()); + assert!( + serialize_stream(&record_batch).len() > serialize_stream(&record_batch_slice).len() + ); assert!(deserialized_batch.column(0).is_null(0)); assert!(deserialized_batch.column(0).is_valid(1)); @@ -1846,9 +1886,11 @@ mod tests { let record_batch = create_batch(); let record_batch_slice = record_batch.slice(1, 2); - let deserialized_batch = 
deserialize(serialize(&record_batch_slice)); + let deserialized_batch = deserialize_stream(serialize_stream(&record_batch_slice)); - assert!(serialize(&record_batch).len() > serialize(&record_batch_slice).len()); + assert!( + serialize_stream(&record_batch).len() > serialize_stream(&record_batch_slice).len() + ); assert!(deserialized_batch.column(0).is_valid(0)); assert!(deserialized_batch.column(0).is_null(1)); @@ -1886,9 +1928,11 @@ mod tests { let record_batch = create_batch(); let record_batch_slice = record_batch.slice(1, 2); - let deserialized_batch = deserialize(serialize(&record_batch_slice)); + let deserialized_batch = deserialize_stream(serialize_stream(&record_batch_slice)); - assert!(serialize(&record_batch).len() > serialize(&record_batch_slice).len()); + assert!( + serialize_stream(&record_batch).len() > serialize_stream(&record_batch_slice).len() + ); let structs = deserialized_batch .column(0) @@ -1913,9 +1957,11 @@ mod tests { let record_batch = create_batch(); let record_batch_slice = record_batch.slice(0, 1); - let deserialized_batch = deserialize(serialize(&record_batch_slice)); + let deserialized_batch = deserialize_stream(serialize_stream(&record_batch_slice)); - assert!(serialize(&record_batch).len() > serialize(&record_batch_slice).len()); + assert!( + serialize_stream(&record_batch).len() > serialize_stream(&record_batch_slice).len() + ); assert_eq!(record_batch_slice, deserialized_batch); } @@ -1996,13 +2042,8 @@ mod tests { let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(bools)]).unwrap(); let batch = batch.slice(offset, length); - let mut writer = StreamWriter::try_new(Vec::::new(), &schema).unwrap(); - writer.write(&batch).unwrap(); - writer.finish().unwrap(); - let data = writer.into_inner().unwrap(); - - let mut reader = StreamReader::try_new(Cursor::new(data), None).unwrap(); - let batch2 = reader.next().unwrap().unwrap(); + let data = serialize_stream(&batch); + let batch2 = deserialize_stream(data); assert_eq!(batch, batch2); } @@ -2060,37 +2101,137 @@ mod tests { } } + fn generate_list_data() -> GenericListArray { + let mut ls = GenericListBuilder::::new(UInt32Builder::new()); + + for i in 0..100_000 { + for value in [i, i, i] { + ls.values().append_value(value); + } + ls.append(true) + } + + ls.finish() + } + + fn generate_nested_list_data() -> GenericListArray { + let mut ls = + GenericListBuilder::::new(GenericListBuilder::::new(UInt32Builder::new())); + + for _i in 0..10_000 { + for j in 0..10 { + for value in [j, j, j, j] { + ls.values().values().append_value(value); + } + ls.values().append(true) + } + ls.append(true); + } + + ls.finish() + } + + fn generate_map_array_data() -> MapArray { + let keys_builder = UInt32Builder::new(); + let values_builder = UInt32Builder::new(); + + let mut builder = MapBuilder::new(None, keys_builder, values_builder); + + for i in 0..100_000 { + for _j in 0..3 { + builder.keys().append_value(i); + builder.values().append_value(i * 2); + } + builder.append(true).unwrap(); + } + + builder.finish() + } + + /// Ensure when serde full & sliced versions they are equal to original input. + /// Also ensure serialized sliced version is significantly smaller than serialized full. 
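+    ///
+    /// `expected_size_factor` is the minimum shrink ratio: the 1-row slice must serialize
+    /// to less than `1 / expected_size_factor` of the full batch's serialized size.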
+ fn roundtrip_ensure_sliced_smaller(in_batch: RecordBatch, expected_size_factor: usize) { + // test both full and sliced versions + let in_sliced = in_batch.slice(999, 1); + + let bytes_batch = serialize_file(&in_batch); + let bytes_sliced = serialize_file(&in_sliced); + + // serializing 1 row should be significantly smaller than serializing 100,000 + assert!(bytes_sliced.len() < (bytes_batch.len() / expected_size_factor)); + + // ensure both are still valid and equal to originals + let out_batch = deserialize_file(bytes_batch); + assert_eq!(in_batch, out_batch); + + let out_sliced = deserialize_file(bytes_sliced); + assert_eq!(in_sliced, out_sliced); + } + #[test] fn encode_lists() { let val_inner = Field::new("item", DataType::UInt32, true); - let val_list_field = Field::new_list("val", val_inner, false); + let val_list_field = Field::new("val", DataType::List(Arc::new(val_inner)), false); + let schema = Arc::new(Schema::new(vec![val_list_field])); + + let values = Arc::new(generate_list_data::()); + + let in_batch = RecordBatch::try_new(schema, vec![values]).unwrap(); + roundtrip_ensure_sliced_smaller(in_batch, 1000); + } + + #[test] + fn encode_empty_list() { + let val_inner = Field::new("item", DataType::UInt32, true); + let val_list_field = Field::new("val", DataType::List(Arc::new(val_inner)), false); + let schema = Arc::new(Schema::new(vec![val_list_field])); + + let values = Arc::new(generate_list_data::()); + let in_batch = RecordBatch::try_new(schema, vec![values]) + .unwrap() + .slice(999, 0); + let out_batch = deserialize_file(serialize_file(&in_batch)); + assert_eq!(in_batch, out_batch); + } + + #[test] + fn encode_large_lists() { + let val_inner = Field::new("item", DataType::UInt32, true); + let val_list_field = Field::new("val", DataType::LargeList(Arc::new(val_inner)), false); let schema = Arc::new(Schema::new(vec![val_list_field])); - let values = { - let u32 = UInt32Builder::new(); - let mut ls = ListBuilder::new(u32); + let values = Arc::new(generate_list_data::()); - for list in [vec![1u32, 2, 3], vec![4, 5, 6], vec![7, 8, 9, 10]] { - for value in list { - ls.values().append_value(value); - } - ls.append(true) - } + // ensure when serde full & sliced versions they are equal to original input + // also ensure serialized sliced version is significantly smaller than serialized full + let in_batch = RecordBatch::try_new(schema, vec![values]).unwrap(); + roundtrip_ensure_sliced_smaller(in_batch, 1000); + } - ls.finish() - }; + #[test] + fn encode_nested_lists() { + let inner_int = Arc::new(Field::new("item", DataType::UInt32, true)); + let inner_list_field = Arc::new(Field::new("item", DataType::List(inner_int), true)); + let list_field = Field::new("val", DataType::List(inner_list_field), true); + let schema = Arc::new(Schema::new(vec![list_field])); - let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(values)]).unwrap(); - let batch = batch.slice(1, 1); + let values = Arc::new(generate_nested_list_data::()); - let mut writer = FileWriter::try_new(Vec::::new(), &schema).unwrap(); - writer.write(&batch).unwrap(); - writer.finish().unwrap(); - let data = writer.into_inner().unwrap(); + let in_batch = RecordBatch::try_new(schema, vec![values]).unwrap(); + roundtrip_ensure_sliced_smaller(in_batch, 1000); + } - let mut reader = FileReader::try_new(Cursor::new(data), None).unwrap(); - let batch2 = reader.next().unwrap().unwrap(); - assert_eq!(batch, batch2); + #[test] + fn encode_map_array() { + let keys = Arc::new(Field::new("keys", DataType::UInt32, false)); 
+ let values = Arc::new(Field::new("values", DataType::UInt32, true)); + let map_field = Field::new_map("map", "entries", keys, values, false, true); + let schema = Arc::new(Schema::new(vec![map_field])); + + let values = Arc::new(generate_map_array_data()); + + let in_batch = RecordBatch::try_new(schema, vec![values]).unwrap(); + roundtrip_ensure_sliced_smaller(in_batch, 1000); } } From 7ba36b012322e08b06184c806f8ba339181cebc1 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 14 Nov 2023 10:12:27 +1100 Subject: [PATCH 1343/1411] Parquet: read/write f16 for Arrow (#5003) * Support for read/write f16 Parquet to Arrow * Update parquet/src/arrow/arrow_writer/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update parquet/src/arrow/arrow_reader/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update test with null version * Fix schema tests and parsing for f16 * f16 for record api * Handle NaN for f16 statistics writing * Revert formatting changes * Fix num trait * Fix half feature * Handle writing signed zero statistics * Bump parquet-testing and read new f16 files for test --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- parquet-testing | 2 +- parquet/Cargo.toml | 1 + parquet/regen.sh | 2 +- .../array_reader/fixed_len_byte_array.rs | 17 +- parquet/src/arrow/arrow_reader/mod.rs | 119 +++++++++- parquet/src/arrow/arrow_writer/mod.rs | 16 ++ parquet/src/arrow/schema/mod.rs | 17 +- parquet/src/arrow/schema/primitive.rs | 10 + parquet/src/basic.rs | 15 +- parquet/src/column/writer/encoder.rs | 19 +- parquet/src/column/writer/mod.rs | 204 +++++++++++++++++- parquet/src/data_type.rs | 7 + parquet/src/file/statistics.rs | 4 + parquet/src/format.rs | 88 +++++++- parquet/src/record/api.rs | 88 +++++++- parquet/src/schema/parser.rs | 8 + parquet/src/schema/printer.rs | 10 + parquet/src/schema/types.rs | 44 ++++ 18 files changed, 646 insertions(+), 25 deletions(-) diff --git a/parquet-testing b/parquet-testing index aafd3fc9df43..506afff9b695 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit aafd3fc9df431c2625a514fb46626e5614f1d199 +Subproject commit 506afff9b6957ffe10d08470d467867d43e1bb91 diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index e5f5e1652b82..bdcbcb81cfce 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -66,6 +66,7 @@ tokio = { version = "1.0", optional = true, default-features = false, features = hashbrown = { version = "0.14", default-features = false } twox-hash = { version = "1.6", default-features = false } paste = { version = "1.0" } +half = { version = "2.1", default-features = false, features = ["num-traits"] } [dev-dependencies] base64 = { version = "0.21", default-features = false, features = ["std"] } diff --git a/parquet/regen.sh b/parquet/regen.sh index b8c3549e2324..91539634339d 100755 --- a/parquet/regen.sh +++ b/parquet/regen.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. 
-REVISION=aeae80660c1d0c97314e9da837de1abdebd49c37 +REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2 SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index 3b1a50ebcce8..b846997d36b8 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -27,13 +27,14 @@ use crate::column::reader::decoder::{ColumnValueDecoder, ValuesBufferSlice}; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use arrow_array::{ - ArrayRef, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, + ArrayRef, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, IntervalDayTimeArray, IntervalYearMonthArray, }; use arrow_buffer::{i256, Buffer}; use arrow_data::ArrayDataBuilder; use arrow_schema::{DataType as ArrowType, IntervalUnit}; use bytes::Bytes; +use half::f16; use std::any::Any; use std::ops::Range; use std::sync::Arc; @@ -88,6 +89,14 @@ pub fn make_fixed_len_byte_array_reader( )); } } + ArrowType::Float16 => { + if byte_length != 2 { + return Err(general_err!( + "float 16 type must be 2 bytes, got {}", + byte_length + )); + } + } _ => { return Err(general_err!( "invalid data type for fixed length byte array reader - {}", @@ -208,6 +217,12 @@ impl ArrayReader for FixedLenByteArrayReader { } } } + ArrowType::Float16 => Arc::new( + binary + .iter() + .map(|o| o.map(|b| f16::from_le_bytes(b[..2].try_into().unwrap()))) + .collect::(), + ) as ArrayRef, _ => Arc::new(binary) as ArrayRef, }; diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 16cdf2934e6f..b9e9d2898459 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -712,13 +712,14 @@ mod tests { use std::sync::Arc; use bytes::Bytes; + use half::f16; use num::PrimInt; use rand::{thread_rng, Rng, RngCore}; use tempfile::tempfile; use arrow_array::builder::*; use arrow_array::cast::AsArray; - use arrow_array::types::{Decimal128Type, Decimal256Type, DecimalType}; + use arrow_array::types::{Decimal128Type, Decimal256Type, DecimalType, Float16Type}; use arrow_array::*; use arrow_array::{RecordBatch, RecordBatchReader}; use arrow_buffer::{i256, ArrowNativeType, Buffer}; @@ -924,6 +925,66 @@ mod tests { .unwrap(); } + #[test] + fn test_float16_roundtrip() -> Result<()> { + let schema = Arc::new(Schema::new(vec![ + Field::new("float16", ArrowDataType::Float16, false), + Field::new("float16-nullable", ArrowDataType::Float16, true), + ])); + + let mut buf = Vec::with_capacity(1024); + let mut writer = ArrowWriter::try_new(&mut buf, schema.clone(), None)?; + + let original = RecordBatch::try_new( + schema, + vec![ + Arc::new(Float16Array::from_iter_values([ + f16::EPSILON, + f16::MIN, + f16::MAX, + f16::NAN, + f16::INFINITY, + f16::NEG_INFINITY, + f16::ONE, + f16::NEG_ONE, + f16::ZERO, + f16::NEG_ZERO, + f16::E, + f16::PI, + f16::FRAC_1_PI, + ])), + Arc::new(Float16Array::from(vec![ + None, + None, + None, + Some(f16::NAN), + Some(f16::INFINITY), + Some(f16::NEG_INFINITY), + None, + None, + None, + None, + None, + None, + Some(f16::FRAC_1_PI), + ])), + ], + )?; + + writer.write(&original)?; + writer.close()?; + + let mut reader = ParquetRecordBatchReader::try_new(Bytes::from(buf), 1024)?; + let ret = reader.next().unwrap()?; + assert_eq!(ret, original); + + // Ensure can be downcast to the correct type + 
ret.column(0).as_primitive::(); + ret.column(1).as_primitive::(); + + Ok(()) + } + struct RandFixedLenGen {} impl RandGen for RandFixedLenGen { @@ -1255,6 +1316,62 @@ mod tests { } } + #[test] + fn test_read_float16_nonzeros_file() { + use arrow_array::Float16Array; + let testdata = arrow::util::test_util::parquet_test_data(); + // see https://github.com/apache/parquet-testing/pull/40 + let path = format!("{testdata}/float16_nonzeros_and_nans.parquet"); + let file = File::open(path).unwrap(); + let mut record_reader = ParquetRecordBatchReader::try_new(file, 32).unwrap(); + + let batch = record_reader.next().unwrap().unwrap(); + assert_eq!(batch.num_rows(), 8); + let col = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + let f16_two = f16::ONE + f16::ONE; + + assert_eq!(col.null_count(), 1); + assert!(col.is_null(0)); + assert_eq!(col.value(1), f16::ONE); + assert_eq!(col.value(2), -f16_two); + assert!(col.value(3).is_nan()); + assert_eq!(col.value(4), f16::ZERO); + assert!(col.value(4).is_sign_positive()); + assert_eq!(col.value(5), f16::NEG_ONE); + assert_eq!(col.value(6), f16::NEG_ZERO); + assert!(col.value(6).is_sign_negative()); + assert_eq!(col.value(7), f16_two); + } + + #[test] + fn test_read_float16_zeros_file() { + use arrow_array::Float16Array; + let testdata = arrow::util::test_util::parquet_test_data(); + // see https://github.com/apache/parquet-testing/pull/40 + let path = format!("{testdata}/float16_zeros_and_nans.parquet"); + let file = File::open(path).unwrap(); + let mut record_reader = ParquetRecordBatchReader::try_new(file, 32).unwrap(); + + let batch = record_reader.next().unwrap().unwrap(); + assert_eq!(batch.num_rows(), 3); + let col = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(col.null_count(), 1); + assert!(col.is_null(0)); + assert_eq!(col.value(1), f16::ZERO); + assert!(col.value(1).is_sign_positive()); + assert!(col.value(2).is_nan()); + } + /// Parameters for single_column_reader_test #[derive(Clone)] struct TestOptions { diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index eca1dea791be..ea7b1eee99b8 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -771,6 +771,10 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result { + let array = column.as_primitive::(); + get_float_16_array_slice(array, indices) + } _ => { return Err(ParquetError::NYI( "Attempting to write an Arrow type that is not yet implemented".to_string(), @@ -867,6 +871,18 @@ fn get_decimal_256_array_slice( values } +fn get_float_16_array_slice( + array: &arrow_array::Float16Array, + indices: &[usize], +) -> Vec { + let mut values = Vec::with_capacity(indices.len()); + for i in indices { + let value = array.value(*i).to_le_bytes().to_vec(); + values.push(FixedLenByteArray::from(ByteArray::from(value))); + } + values +} + fn get_fsb_array_slice( array: &arrow_array::FixedSizeBinaryArray, indices: &[usize], diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index d56cc42d4313..4c350c4b1d8c 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -373,7 +373,12 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .with_id(id) .build(), - DataType::Float16 => Err(arrow_err!("Float16 arrays not supported")), + DataType::Float16 => Type::primitive_type_builder(name, PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(repetition) + .with_id(id) + 
.with_logical_type(Some(LogicalType::Float16)) + .with_length(2) + .build(), DataType::Float32 => Type::primitive_type_builder(name, PhysicalType::FLOAT) .with_repetition(repetition) .with_id(id) @@ -604,9 +609,10 @@ mod tests { REQUIRED INT32 uint8 (INTEGER(8,false)); REQUIRED INT32 uint16 (INTEGER(16,false)); REQUIRED INT32 int32; - REQUIRED INT64 int64 ; + REQUIRED INT64 int64; OPTIONAL DOUBLE double; OPTIONAL FLOAT float; + OPTIONAL FIXED_LEN_BYTE_ARRAY (2) float16 (FLOAT16); OPTIONAL BINARY string (UTF8); OPTIONAL BINARY string_2 (STRING); OPTIONAL BINARY json (JSON); @@ -628,6 +634,7 @@ mod tests { Field::new("int64", DataType::Int64, false), Field::new("double", DataType::Float64, true), Field::new("float", DataType::Float32, true), + Field::new("float16", DataType::Float16, true), Field::new("string", DataType::Utf8, true), Field::new("string_2", DataType::Utf8, true), Field::new("json", DataType::Utf8, true), @@ -1303,6 +1310,7 @@ mod tests { REQUIRED INT64 int64; OPTIONAL DOUBLE double; OPTIONAL FLOAT float; + OPTIONAL FIXED_LEN_BYTE_ARRAY (2) float16 (FLOAT16); OPTIONAL BINARY string (UTF8); REPEATED BOOLEAN bools; OPTIONAL INT32 date (DATE); @@ -1339,6 +1347,7 @@ mod tests { Field::new("int64", DataType::Int64, false), Field::new("double", DataType::Float64, true), Field::new("float", DataType::Float32, true), + Field::new("float16", DataType::Float16, true), Field::new("string", DataType::Utf8, true), Field::new_list( "bools", @@ -1398,6 +1407,7 @@ mod tests { REQUIRED INT64 int64; OPTIONAL DOUBLE double; OPTIONAL FLOAT float; + OPTIONAL FIXED_LEN_BYTE_ARRAY (2) float16 (FLOAT16); OPTIONAL BINARY string (STRING); OPTIONAL GROUP bools (LIST) { REPEATED GROUP list { @@ -1448,6 +1458,7 @@ mod tests { Field::new("int64", DataType::Int64, false), Field::new("double", DataType::Float64, true), Field::new("float", DataType::Float32, true), + Field::new("float16", DataType::Float16, true), Field::new("string", DataType::Utf8, true), Field::new_list( "bools", @@ -1661,6 +1672,8 @@ mod tests { vec![ Field::new("a", DataType::Int16, true), Field::new("b", DataType::Float64, false), + Field::new("c", DataType::Float32, false), + Field::new("d", DataType::Float16, false), ] .into(), ), diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index 7d8b6a04ee81..fdc744831a25 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -304,6 +304,16 @@ fn from_fixed_len_byte_array( // would be incorrect if all 12 bytes of the interval are populated Ok(DataType::Interval(IntervalUnit::DayTime)) } + (Some(LogicalType::Float16), _) => { + if type_length == 2 { + Ok(DataType::Float16) + } else { + Err(ParquetError::General( + "FLOAT16 logical type must be Fixed Length Byte Array with length 2" + .to_string(), + )) + } + } _ => Ok(DataType::FixedSizeBinary(type_length)), } } diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 3c8602b8022b..2327e1d84b41 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -194,6 +194,7 @@ pub enum LogicalType { Json, Bson, Uuid, + Float16, } // ---------------------------------------------------------------------- @@ -505,6 +506,7 @@ impl ColumnOrder { LogicalType::Timestamp { .. 
} => SortOrder::SIGNED, LogicalType::Unknown => SortOrder::UNDEFINED, LogicalType::Uuid => SortOrder::UNSIGNED, + LogicalType::Float16 => SortOrder::SIGNED, }, // Fall back to converted type None => Self::get_converted_sort_order(converted_type, physical_type), @@ -766,6 +768,7 @@ impl From for LogicalType { parquet::LogicalType::JSON(_) => LogicalType::Json, parquet::LogicalType::BSON(_) => LogicalType::Bson, parquet::LogicalType::UUID(_) => LogicalType::Uuid, + parquet::LogicalType::FLOAT16(_) => LogicalType::Float16, } } } @@ -806,6 +809,7 @@ impl From for parquet::LogicalType { LogicalType::Json => parquet::LogicalType::JSON(Default::default()), LogicalType::Bson => parquet::LogicalType::BSON(Default::default()), LogicalType::Uuid => parquet::LogicalType::UUID(Default::default()), + LogicalType::Float16 => parquet::LogicalType::FLOAT16(Default::default()), } } } @@ -853,10 +857,11 @@ impl From> for ConvertedType { (64, false) => ConvertedType::UINT_64, t => panic!("Integer type {t:?} is not supported"), }, - LogicalType::Unknown => ConvertedType::NONE, LogicalType::Json => ConvertedType::JSON, LogicalType::Bson => ConvertedType::BSON, - LogicalType::Uuid => ConvertedType::NONE, + LogicalType::Uuid | LogicalType::Float16 | LogicalType::Unknown => { + ConvertedType::NONE + } }, None => ConvertedType::NONE, } @@ -1102,6 +1107,7 @@ impl str::FromStr for LogicalType { "INTERVAL" => Err(general_err!( "Interval parquet logical type not yet supported" )), + "FLOAT16" => Ok(LogicalType::Float16), other => Err(general_err!("Invalid parquet logical type {}", other)), } } @@ -1746,6 +1752,10 @@ mod tests { ConvertedType::from(Some(LogicalType::Enum)), ConvertedType::ENUM ); + assert_eq!( + ConvertedType::from(Some(LogicalType::Float16)), + ConvertedType::NONE + ); assert_eq!( ConvertedType::from(Some(LogicalType::Unknown)), ConvertedType::NONE @@ -2119,6 +2129,7 @@ mod tests { is_adjusted_to_u_t_c: true, unit: TimeUnit::NANOS(Default::default()), }, + LogicalType::Float16, ]; check_sort_order(signed, SortOrder::SIGNED); diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index 2273ae777444..d0720dd24306 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -16,8 +16,9 @@ // under the License. 
use bytes::Bytes; +use half::f16; -use crate::basic::{Encoding, Type}; +use crate::basic::{Encoding, LogicalType, Type}; use crate::bloom_filter::Sbbf; use crate::column::writer::{ compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, update_min, @@ -291,7 +292,7 @@ where { let first = loop { let next = iter.next()?; - if !is_nan(next) { + if !is_nan(descr, next) { break next; } }; @@ -299,7 +300,7 @@ where let mut min = first; let mut max = first; for val in iter { - if is_nan(val) { + if is_nan(descr, val) { continue; } if compare_greater(descr, min, val) { @@ -318,14 +319,14 @@ where // // For max, it has similar logic but will be written as 0.0 // (positive zero) - let min = replace_zero(min, -0.0); - let max = replace_zero(max, 0.0); + let min = replace_zero(min, descr, -0.0); + let max = replace_zero(max, descr, 0.0); Some((min, max)) } #[inline] -fn replace_zero(val: &T, replace: f32) -> T { +fn replace_zero(val: &T, descr: &ColumnDescriptor, replace: f32) -> T { match T::PHYSICAL_TYPE { Type::FLOAT if f32::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => { T::try_from_le_slice(&f32::to_le_bytes(replace)).unwrap() @@ -333,6 +334,12 @@ fn replace_zero(val: &T, replace: f32) -> T { Type::DOUBLE if f64::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => { T::try_from_le_slice(&f64::to_le_bytes(replace as f64)).unwrap() } + Type::FIXED_LEN_BYTE_ARRAY + if descr.logical_type() == Some(LogicalType::Float16) + && f16::from_le_bytes(val.as_bytes().try_into().unwrap()) == f16::NEG_ZERO => + { + T::try_from_le_slice(&f16::to_le_bytes(f16::from_f32(replace))).unwrap() + } _ => val.clone(), } } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 60db90c5d46d..a917c4864988 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -18,6 +18,7 @@ //! Contains column writer API. 
use bytes::Bytes; +use half::f16; use crate::bloom_filter::Sbbf; use crate::format::{ColumnIndex, OffsetIndex}; @@ -968,18 +969,23 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } fn update_min(descr: &ColumnDescriptor, val: &T, min: &mut Option) { - update_stat::(val, min, |cur| compare_greater(descr, cur, val)) + update_stat::(descr, val, min, |cur| compare_greater(descr, cur, val)) } fn update_max(descr: &ColumnDescriptor, val: &T, max: &mut Option) { - update_stat::(val, max, |cur| compare_greater(descr, val, cur)) + update_stat::(descr, val, max, |cur| compare_greater(descr, val, cur)) } #[inline] #[allow(clippy::eq_op)] -fn is_nan(val: &T) -> bool { +fn is_nan(descr: &ColumnDescriptor, val: &T) -> bool { match T::PHYSICAL_TYPE { Type::FLOAT | Type::DOUBLE => val != val, + Type::FIXED_LEN_BYTE_ARRAY if descr.logical_type() == Some(LogicalType::Float16) => { + let val = val.as_bytes(); + let val = f16::from_le_bytes([val[0], val[1]]); + val.is_nan() + } _ => false, } } @@ -989,11 +995,15 @@ fn is_nan(val: &T) -> bool { /// If `cur` is `None`, sets `cur` to `Some(val)`, otherwise calls `should_update` with /// the value of `cur`, and updates `cur` to `Some(val)` if it returns `true` -fn update_stat(val: &T, cur: &mut Option, should_update: F) -where +fn update_stat( + descr: &ColumnDescriptor, + val: &T, + cur: &mut Option, + should_update: F, +) where F: Fn(&T) -> bool, { - if is_nan(val) { + if is_nan(descr, val) { return; } @@ -1039,6 +1049,14 @@ fn compare_greater(descr: &ColumnDescriptor, a: &T, b: &T) }; }; + if let Some(LogicalType::Float16) = descr.logical_type() { + let a = a.as_bytes(); + let a = f16::from_le_bytes([a[0], a[1]]); + let b = b.as_bytes(); + let b = f16::from_le_bytes([b[0], b[1]]); + return a > b; + } + a > b } @@ -1170,6 +1188,7 @@ fn increment_utf8(mut data: Vec) -> Option> { mod tests { use crate::{file::properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, format::BoundaryOrder}; use bytes::Bytes; + use half::f16; use rand::distributions::uniform::SampleUniform; use std::sync::Arc; @@ -2078,6 +2097,135 @@ mod tests { } } + #[test] + fn test_column_writer_check_float16_min_max() { + let input = [ + -f16::ONE, + f16::from_f32(3.0), + -f16::from_f32(2.0), + f16::from_f32(2.0), + ] + .into_iter() + .map(|s| ByteArray::from(s).into()) + .collect::>(); + + let stats = float16_statistics_roundtrip(&input); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + assert_eq!(stats.min(), &ByteArray::from(-f16::from_f32(2.0))); + assert_eq!(stats.max(), &ByteArray::from(f16::from_f32(3.0))); + } + + #[test] + fn test_column_writer_check_float16_nan_middle() { + let input = [f16::ONE, f16::NAN, f16::ONE + f16::ONE] + .into_iter() + .map(|s| ByteArray::from(s).into()) + .collect::>(); + + let stats = float16_statistics_roundtrip(&input); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + assert_eq!(stats.min(), &ByteArray::from(f16::ONE)); + assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE)); + } + + #[test] + fn test_float16_statistics_nan_middle() { + let input = [f16::ONE, f16::NAN, f16::ONE + f16::ONE] + .into_iter() + .map(|s| ByteArray::from(s).into()) + .collect::>(); + + let stats = float16_statistics_roundtrip(&input); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + assert_eq!(stats.min(), &ByteArray::from(f16::ONE)); + assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE)); + } + + #[test] + fn 
test_float16_statistics_nan_start() { + let input = [f16::NAN, f16::ONE, f16::ONE + f16::ONE] + .into_iter() + .map(|s| ByteArray::from(s).into()) + .collect::>(); + + let stats = float16_statistics_roundtrip(&input); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + assert_eq!(stats.min(), &ByteArray::from(f16::ONE)); + assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE)); + } + + #[test] + fn test_float16_statistics_nan_only() { + let input = [f16::NAN, f16::NAN] + .into_iter() + .map(|s| ByteArray::from(s).into()) + .collect::>(); + + let stats = float16_statistics_roundtrip(&input); + assert!(!stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + } + + #[test] + fn test_float16_statistics_zero_only() { + let input = [f16::ZERO] + .into_iter() + .map(|s| ByteArray::from(s).into()) + .collect::>(); + + let stats = float16_statistics_roundtrip(&input); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + assert_eq!(stats.min(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.max(), &ByteArray::from(f16::ZERO)); + } + + #[test] + fn test_float16_statistics_neg_zero_only() { + let input = [f16::NEG_ZERO] + .into_iter() + .map(|s| ByteArray::from(s).into()) + .collect::>(); + + let stats = float16_statistics_roundtrip(&input); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + assert_eq!(stats.min(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.max(), &ByteArray::from(f16::ZERO)); + } + + #[test] + fn test_float16_statistics_zero_min() { + let input = [f16::ZERO, f16::ONE, f16::NAN, f16::PI] + .into_iter() + .map(|s| ByteArray::from(s).into()) + .collect::>(); + + let stats = float16_statistics_roundtrip(&input); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + assert_eq!(stats.min(), &ByteArray::from(f16::NEG_ZERO)); + assert_eq!(stats.max(), &ByteArray::from(f16::PI)); + } + + #[test] + fn test_float16_statistics_neg_zero_max() { + let input = [f16::NEG_ZERO, f16::NEG_ONE, f16::NAN, -f16::PI] + .into_iter() + .map(|s| ByteArray::from(s).into()) + .collect::>(); + + let stats = float16_statistics_roundtrip(&input); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + assert_eq!(stats.min(), &ByteArray::from(-f16::PI)); + assert_eq!(stats.max(), &ByteArray::from(f16::ZERO)); + } + #[test] fn test_float_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f32::NAN, 2.0]); @@ -2850,6 +2998,50 @@ mod tests { ColumnDescriptor::new(Arc::new(tpe), max_def_level, max_rep_level, path) } + fn float16_statistics_roundtrip( + values: &[FixedLenByteArray], + ) -> ValueStatistics { + let page_writer = get_test_page_writer(); + let props = Default::default(); + let mut writer = + get_test_float16_column_writer::(page_writer, 0, 0, props); + writer.write_batch(values, None, None).unwrap(); + + let metadata = writer.close().unwrap().metadata; + if let Some(Statistics::FixedLenByteArray(stats)) = metadata.statistics() { + stats.clone() + } else { + panic!("metadata missing statistics"); + } + } + + fn get_test_float16_column_writer( + page_writer: Box, + max_def_level: i16, + max_rep_level: i16, + props: WriterPropertiesPtr, + ) -> ColumnWriterImpl<'static, T> { + let descr = Arc::new(get_test_float16_column_descr::( + max_def_level, + max_rep_level, + )); + let column_writer = get_column_writer(descr, props, page_writer); + 
get_typed_column_writer::(column_writer) + } + + fn get_test_float16_column_descr( + max_def_level: i16, + max_rep_level: i16, + ) -> ColumnDescriptor { + let path = ColumnPath::from("col"); + let tpe = SchemaType::primitive_type_builder("col", T::get_physical_type()) + .with_length(2) + .with_logical_type(Some(LogicalType::Float16)) + .build() + .unwrap(); + ColumnDescriptor::new(Arc::new(tpe), max_def_level, max_rep_level, path) + } + /// Returns column writer for UINT32 Column provided as ConvertedType only fn get_test_unsigned_int_given_as_converted_column_writer<'a, T: DataType>( page_writer: Box, diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index b895c2507018..86da7a3acee4 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -18,6 +18,7 @@ //! Data types that connect Parquet physical types with their Rust-specific //! representations. use bytes::Bytes; +use half::f16; use std::cmp::Ordering; use std::fmt; use std::mem; @@ -225,6 +226,12 @@ impl From for ByteArray { } } +impl From for ByteArray { + fn from(value: f16) -> Self { + Self::from(value.to_le_bytes().as_slice()) + } +} + impl PartialEq for ByteArray { fn eq(&self, other: &ByteArray) -> bool { match (&self.data, &other.data) { diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index b36e37a80c97..345fe7dd2615 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -243,6 +243,8 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { distinct_count: stats.distinct_count().map(|value| value as i64), max_value: None, min_value: None, + is_max_value_exact: None, + is_min_value_exact: None, }; // Get min/max if set. @@ -607,6 +609,8 @@ mod tests { distinct_count: None, max_value: None, min_value: None, + is_max_value_exact: None, + is_min_value_exact: None, }; from_thrift(Type::INT32, Some(thrift_stats)).unwrap(); diff --git a/parquet/src/format.rs b/parquet/src/format.rs index 46adc39e6406..4700b05dc282 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -657,16 +657,26 @@ pub struct Statistics { pub null_count: Option, /// count of distinct values occurring pub distinct_count: Option, - /// Min and max values for the column, determined by its ColumnOrder. + /// Lower and upper bound values for the column, determined by its ColumnOrder. + /// + /// These may be the actual minimum and maximum values found on a page or column + /// chunk, but can also be (more compact) values that do not exist on a page or + /// column chunk. For example, instead of storing "Blart Versenwald III", a writer + /// may set min_value="B", max_value="C". Such more compact values must still be + /// valid values within the column's logical type. /// /// Values are encoded using PLAIN encoding, except that variable-length byte /// arrays do not include a length prefix. 
pub max_value: Option>, pub min_value: Option>, + /// If true, max_value is the actual maximum value for a column + pub is_max_value_exact: Option, + /// If true, min_value is the actual minimum value for a column + pub is_min_value_exact: Option, } impl Statistics { - pub fn new(max: F1, min: F2, null_count: F3, distinct_count: F4, max_value: F5, min_value: F6) -> Statistics where F1: Into>>, F2: Into>>, F3: Into>, F4: Into>, F5: Into>>, F6: Into>> { + pub fn new(max: F1, min: F2, null_count: F3, distinct_count: F4, max_value: F5, min_value: F6, is_max_value_exact: F7, is_min_value_exact: F8) -> Statistics where F1: Into>>, F2: Into>>, F3: Into>, F4: Into>, F5: Into>>, F6: Into>>, F7: Into>, F8: Into> { Statistics { max: max.into(), min: min.into(), @@ -674,6 +684,8 @@ impl Statistics { distinct_count: distinct_count.into(), max_value: max_value.into(), min_value: min_value.into(), + is_max_value_exact: is_max_value_exact.into(), + is_min_value_exact: is_min_value_exact.into(), } } } @@ -687,6 +699,8 @@ impl crate::thrift::TSerializable for Statistics { let mut f_4: Option = None; let mut f_5: Option> = None; let mut f_6: Option> = None; + let mut f_7: Option = None; + let mut f_8: Option = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -718,6 +732,14 @@ impl crate::thrift::TSerializable for Statistics { let val = i_prot.read_bytes()?; f_6 = Some(val); }, + 7 => { + let val = i_prot.read_bool()?; + f_7 = Some(val); + }, + 8 => { + let val = i_prot.read_bool()?; + f_8 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -732,6 +754,8 @@ impl crate::thrift::TSerializable for Statistics { distinct_count: f_4, max_value: f_5, min_value: f_6, + is_max_value_exact: f_7, + is_min_value_exact: f_8, }; Ok(ret) } @@ -768,6 +792,16 @@ impl crate::thrift::TSerializable for Statistics { o_prot.write_bytes(fld_var)?; o_prot.write_field_end()? } + if let Some(fld_var) = self.is_max_value_exact { + o_prot.write_field_begin(&TFieldIdentifier::new("is_max_value_exact", TType::Bool, 7))?; + o_prot.write_bool(fld_var)?; + o_prot.write_field_end()? + } + if let Some(fld_var) = self.is_min_value_exact { + o_prot.write_field_begin(&TFieldIdentifier::new("is_min_value_exact", TType::Bool, 8))?; + o_prot.write_bool(fld_var)?; + o_prot.write_field_end()? 
+ } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -996,6 +1030,43 @@ impl crate::thrift::TSerializable for DateType { } } +// +// Float16Type +// + +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct Float16Type { +} + +impl Float16Type { + pub fn new() -> Float16Type { + Float16Type {} + } +} + +impl crate::thrift::TSerializable for Float16Type { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + i_prot.skip(field_ident.field_type)?; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = Float16Type {}; + Ok(ret) + } + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("Float16Type"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + // // NullType // @@ -1640,6 +1711,7 @@ pub enum LogicalType { JSON(JsonType), BSON(BsonType), UUID(UUIDType), + FLOAT16(Float16Type), } impl crate::thrift::TSerializable for LogicalType { @@ -1745,6 +1817,13 @@ impl crate::thrift::TSerializable for LogicalType { } received_field_count += 1; }, + 15 => { + let val = Float16Type::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(LogicalType::FLOAT16(val)); + } + received_field_count += 1; + }, _ => { i_prot.skip(field_ident.field_type)?; received_field_count += 1; @@ -1844,6 +1923,11 @@ impl crate::thrift::TSerializable for LogicalType { f.write_to_out_protocol(o_prot)?; o_prot.write_field_end()?; }, + LogicalType::FLOAT16(ref f) => { + o_prot.write_field_begin(&TFieldIdentifier::new("FLOAT16", TType::Struct, 15))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, } o_prot.write_field_stop()?; o_prot.write_struct_end() diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs index c7a0b09c37ed..e4f473562e01 100644 --- a/parquet/src/record/api.rs +++ b/parquet/src/record/api.rs @@ -20,9 +20,11 @@ use std::fmt; use chrono::{TimeZone, Utc}; +use half::f16; +use num::traits::Float; use num_bigint::{BigInt, Sign}; -use crate::basic::{ConvertedType, Type as PhysicalType}; +use crate::basic::{ConvertedType, LogicalType, Type as PhysicalType}; use crate::data_type::{ByteArray, Decimal, Int96}; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; @@ -121,6 +123,7 @@ pub trait RowAccessor { fn get_ushort(&self, i: usize) -> Result; fn get_uint(&self, i: usize) -> Result; fn get_ulong(&self, i: usize) -> Result; + fn get_float16(&self, i: usize) -> Result; fn get_float(&self, i: usize) -> Result; fn get_double(&self, i: usize) -> Result; fn get_timestamp_millis(&self, i: usize) -> Result; @@ -215,6 +218,8 @@ impl RowAccessor for Row { row_primitive_accessor!(get_ulong, ULong, u64); + row_primitive_accessor!(get_float16, Float16, f16); + row_primitive_accessor!(get_float, Float, f32); row_primitive_accessor!(get_double, Double, f64); @@ -293,6 +298,7 @@ pub trait ListAccessor { fn get_ushort(&self, i: usize) -> Result; fn get_uint(&self, i: usize) -> Result; fn get_ulong(&self, i: usize) -> Result; + fn get_float16(&self, i: usize) -> Result; fn get_float(&self, i: usize) -> Result; fn get_double(&self, i: usize) -> Result; fn get_timestamp_millis(&self, i: usize) -> Result; @@ -358,6 +364,8 @@ impl ListAccessor for List { list_primitive_accessor!(get_ulong, ULong, u64); + 
list_primitive_accessor!(get_float16, Float16, f16); + list_primitive_accessor!(get_float, Float, f32); list_primitive_accessor!(get_double, Double, f64); @@ -449,6 +457,8 @@ impl<'a> ListAccessor for MapList<'a> { map_list_primitive_accessor!(get_ulong, ULong, u64); + map_list_primitive_accessor!(get_float16, Float16, f16); + map_list_primitive_accessor!(get_float, Float, f32); map_list_primitive_accessor!(get_double, Double, f64); @@ -510,6 +520,8 @@ pub enum Field { UInt(u32), // Unsigned integer UINT_64. ULong(u64), + /// IEEE 16-bit floating point value. + Float16(f16), /// IEEE 32-bit floating point value. Float(f32), /// IEEE 64-bit floating point value. @@ -552,6 +564,7 @@ impl Field { Field::UShort(_) => "UShort", Field::UInt(_) => "UInt", Field::ULong(_) => "ULong", + Field::Float16(_) => "Float16", Field::Float(_) => "Float", Field::Double(_) => "Double", Field::Decimal(_) => "Decimal", @@ -636,8 +649,8 @@ impl Field { Field::Double(value) } - /// Converts Parquet BYTE_ARRAY type with converted type into either UTF8 string or - /// array of bytes. + /// Converts Parquet BYTE_ARRAY type with converted type into a UTF8 + /// string, decimal, float16, or an array of bytes. #[inline] pub fn convert_byte_array(descr: &ColumnDescPtr, value: ByteArray) -> Result { let field = match descr.physical_type() { @@ -666,6 +679,16 @@ impl Field { descr.type_precision(), descr.type_scale(), )), + ConvertedType::NONE if descr.logical_type() == Some(LogicalType::Float16) => { + if value.len() != 2 { + return Err(general_err!( + "Error reading FIXED_LEN_BYTE_ARRAY as FLOAT16. Length must be 2, got {}", + value.len() + )); + } + let bytes = [value.data()[0], value.data()[1]]; + Field::Float16(f16::from_le_bytes(bytes)) + } ConvertedType::NONE => Field::Bytes(value), _ => nyi!(descr, value), }, @@ -690,6 +713,9 @@ impl Field { Field::UShort(n) => Value::Number(serde_json::Number::from(*n)), Field::UInt(n) => Value::Number(serde_json::Number::from(*n)), Field::ULong(n) => Value::Number(serde_json::Number::from(*n)), + Field::Float16(n) => serde_json::Number::from_f64(f64::from(*n)) + .map(Value::Number) + .unwrap_or(Value::Null), Field::Float(n) => serde_json::Number::from_f64(f64::from(*n)) .map(Value::Number) .unwrap_or(Value::Null), @@ -736,6 +762,15 @@ impl fmt::Display for Field { Field::UShort(value) => write!(f, "{value}"), Field::UInt(value) => write!(f, "{value}"), Field::ULong(value) => write!(f, "{value}"), + Field::Float16(value) => { + if !value.is_finite() { + write!(f, "{value}") + } else if value.trunc() == value { + write!(f, "{value}.0") + } else { + write!(f, "{value}") + } + } Field::Float(value) => { if !(1e-15..=1e19).contains(&value) { write!(f, "{value:E}") @@ -1069,6 +1104,24 @@ mod tests { Field::Decimal(Decimal::from_bytes(value, 17, 5)) ); + // FLOAT16 + let descr = { + let tpe = PrimitiveTypeBuilder::new("col", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_logical_type(Some(LogicalType::Float16)) + .with_length(2) + .build() + .unwrap(); + Arc::new(ColumnDescriptor::new( + Arc::new(tpe), + 0, + 0, + ColumnPath::from("col"), + )) + }; + let value = ByteArray::from(f16::PI); + let row = Field::convert_byte_array(&descr, value.clone()); + assert_eq!(row.unwrap(), Field::Float16(f16::PI)); + // NONE (FIXED_LEN_BYTE_ARRAY) let descr = make_column_descr![ PhysicalType::FIXED_LEN_BYTE_ARRAY, @@ -1145,6 +1198,18 @@ mod tests { check_datetime_conversion(2014, 11, 28, 21, 15, 12); } + #[test] + fn test_convert_float16_to_string() { + assert_eq!(format!("{}", 
Field::Float16(f16::ONE)), "1.0"); + assert_eq!(format!("{}", Field::Float16(f16::PI)), "3.140625"); + assert_eq!(format!("{}", Field::Float16(f16::MAX)), "65504.0"); + assert_eq!(format!("{}", Field::Float16(f16::NAN)), "NaN"); + assert_eq!(format!("{}", Field::Float16(f16::INFINITY)), "inf"); + assert_eq!(format!("{}", Field::Float16(f16::NEG_INFINITY)), "-inf"); + assert_eq!(format!("{}", Field::Float16(f16::ZERO)), "0.0"); + assert_eq!(format!("{}", Field::Float16(f16::NEG_ZERO)), "-0.0"); + } + #[test] fn test_convert_float_to_string() { assert_eq!(format!("{}", Field::Float(1.0)), "1.0"); @@ -1218,6 +1283,7 @@ mod tests { assert_eq!(format!("{}", Field::UShort(2)), "2"); assert_eq!(format!("{}", Field::UInt(3)), "3"); assert_eq!(format!("{}", Field::ULong(4)), "4"); + assert_eq!(format!("{}", Field::Float16(f16::E)), "2.71875"); assert_eq!(format!("{}", Field::Float(5.0)), "5.0"); assert_eq!(format!("{}", Field::Float(5.1234)), "5.1234"); assert_eq!(format!("{}", Field::Double(6.0)), "6.0"); @@ -1284,6 +1350,7 @@ mod tests { assert!(Field::UShort(2).is_primitive()); assert!(Field::UInt(3).is_primitive()); assert!(Field::ULong(4).is_primitive()); + assert!(Field::Float16(f16::E).is_primitive()); assert!(Field::Float(5.0).is_primitive()); assert!(Field::Float(5.1234).is_primitive()); assert!(Field::Double(6.0).is_primitive()); @@ -1344,6 +1411,7 @@ mod tests { ("15".to_string(), Field::TimestampMillis(1262391174000)), ("16".to_string(), Field::TimestampMicros(1262391174000000)), ("17".to_string(), Field::Decimal(Decimal::from_i32(4, 7, 2))), + ("18".to_string(), Field::Float16(f16::PI)), ]); assert_eq!("null", format!("{}", row.fmt(0))); @@ -1370,6 +1438,7 @@ mod tests { format!("{}", row.fmt(16)) ); assert_eq!("0.04", format!("{}", row.fmt(17))); + assert_eq!("3.140625", format!("{}", row.fmt(18))); } #[test] @@ -1429,6 +1498,7 @@ mod tests { Field::Bytes(ByteArray::from(vec![1, 2, 3, 4, 5])), ), ("o".to_string(), Field::Decimal(Decimal::from_i32(4, 7, 2))), + ("p".to_string(), Field::Float16(f16::from_f32(9.1))), ]); assert!(!row.get_bool(1).unwrap()); @@ -1445,6 +1515,7 @@ mod tests { assert_eq!("abc", row.get_string(12).unwrap()); assert_eq!(5, row.get_bytes(13).unwrap().len()); assert_eq!(7, row.get_decimal(14).unwrap().precision()); + assert!((f16::from_f32(9.1) - row.get_float16(15).unwrap()).abs() < f16::EPSILON); } #[test] @@ -1469,6 +1540,7 @@ mod tests { Field::Bytes(ByteArray::from(vec![1, 2, 3, 4, 5])), ), ("o".to_string(), Field::Decimal(Decimal::from_i32(4, 7, 2))), + ("p".to_string(), Field::Float16(f16::from_f32(9.1))), ]); for i in 0..row.len() { @@ -1583,6 +1655,9 @@ mod tests { let list = make_list(vec![Field::ULong(6), Field::ULong(7)]); assert_eq!(7, list.get_ulong(1).unwrap()); + let list = make_list(vec![Field::Float16(f16::PI)]); + assert!((f16::PI - list.get_float16(0).unwrap()).abs() < f16::EPSILON); + let list = make_list(vec![ Field::Float(8.1), Field::Float(9.2), @@ -1633,6 +1708,9 @@ mod tests { let list = make_list(vec![Field::ULong(6), Field::ULong(7)]); assert!(list.get_float(1).is_err()); + let list = make_list(vec![Field::Float16(f16::PI)]); + assert!(list.get_string(0).is_err()); + let list = make_list(vec![ Field::Float(8.1), Field::Float(9.2), @@ -1768,6 +1846,10 @@ mod tests { Field::ULong(4).to_json_value(), Value::Number(serde_json::Number::from(4)) ); + assert_eq!( + Field::Float16(f16::from_f32(5.0)).to_json_value(), + Value::Number(serde_json::Number::from_f64(5.0).unwrap()) + ); assert_eq!( Field::Float(5.0).to_json_value(), 
Value::Number(serde_json::Number::from_f64(5.0).unwrap()) diff --git a/parquet/src/schema/parser.rs b/parquet/src/schema/parser.rs index 5e213e3bb9e5..dcef11aa66d4 100644 --- a/parquet/src/schema/parser.rs +++ b/parquet/src/schema/parser.rs @@ -823,6 +823,7 @@ mod tests { message root { optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3)); optional fixed_len_byte_array (16) f2 (DECIMAL (38, 18)); + optional fixed_len_byte_array (2) f3 (FLOAT16); } "; let message = parse(schema).unwrap(); @@ -855,6 +856,13 @@ mod tests { .build() .unwrap(), ), + Arc::new( + Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_logical_type(Some(LogicalType::Float16)) + .with_length(2) + .build() + .unwrap(), + ), ]) .build() .unwrap(); diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs index fe4757d41aed..2dec8a5be9f7 100644 --- a/parquet/src/schema/printer.rs +++ b/parquet/src/schema/printer.rs @@ -270,6 +270,7 @@ fn print_logical_and_converted( LogicalType::Enum => "ENUM".to_string(), LogicalType::List => "LIST".to_string(), LogicalType::Map => "MAP".to_string(), + LogicalType::Float16 => "FLOAT16".to_string(), LogicalType::Unknown => "UNKNOWN".to_string(), }, None => { @@ -667,6 +668,15 @@ mod tests { .unwrap(), "OPTIONAL FIXED_LEN_BYTE_ARRAY (9) decimal (DECIMAL(19,4));", ), + ( + Type::primitive_type_builder("float16", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_logical_type(Some(LogicalType::Float16)) + .with_length(2) + .with_repetition(Repetition::REQUIRED) + .build() + .unwrap(), + "REQUIRED FIXED_LEN_BYTE_ARRAY (2) float16 (FLOAT16);", + ), ]; types_and_strings.into_iter().for_each(|(field, expected)| { diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 11c735420957..2f36deffbab5 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -356,6 +356,14 @@ impl<'a> PrimitiveTypeBuilder<'a> { (LogicalType::Json, PhysicalType::BYTE_ARRAY) => {} (LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {} (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {} + (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY) + if self.length == 2 => {} + (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY) => { + return Err(general_err!( + "FLOAT16 cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(2) field", + self.name + )) + } (a, b) => { return Err(general_err!( "Cannot annotate {:?} from {} for field '{}'", @@ -1504,6 +1512,41 @@ mod tests { "Parquet error: Invalid FIXED_LEN_BYTE_ARRAY length: -1 for field 'foo'" ); } + + result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(Some(LogicalType::Float16)) + .with_length(2) + .build(); + assert!(result.is_ok()); + + // Can't be other than FIXED_LEN_BYTE_ARRAY for physical type + result = Type::primitive_type_builder("foo", PhysicalType::FLOAT) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(Some(LogicalType::Float16)) + .with_length(2) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + format!("{e}"), + "Parquet error: Cannot annotate Float16 from FLOAT for field 'foo'" + ); + } + + // Must have length 2 + result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(Some(LogicalType::Float16)) + .with_length(4) + .build(); + assert!(result.is_err()); + if let Err(e) = result { + assert_eq!( + format!("{e}"), + "Parquet error: 
FLOAT16 cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(2) field" + ); + } } #[test] @@ -1981,6 +2024,7 @@ mod tests { let message_type = " message conversions { REQUIRED INT64 id; + OPTIONAL FIXED_LEN_BYTE_ARRAY (2) f16 (FLOAT16); OPTIONAL group int_array_Array (LIST) { REPEATED group list { OPTIONAL group element (LIST) { From 7941577d414b9c93d60795bc79125ddad760c252 Mon Sep 17 00:00:00 2001 From: emcake <3726783+emcake@users.noreply.github.com> Date: Wed, 15 Nov 2023 11:31:36 +0000 Subject: [PATCH 1344/1411] Enable truncation of binary statistics columns (#5076) * changes needed to introduce min/max exactness * implement truncation property and logic, tests * format lints * change min/max exact to be with... methods * reduce code noise * remove redundant clone --------- Co-authored-by: Matthew Kemp --- parquet/src/column/writer/mod.rs | 228 +++++++++++++++++++++++++-- parquet/src/file/properties.rs | 24 +++ parquet/src/file/statistics.rs | 171 ++++++++++++++++---- parquet/tests/arrow_writer_layout.rs | 52 +++--- 4 files changed, 401 insertions(+), 74 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index a917c4864988..11c39685911c 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -636,8 +636,16 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => { self.column_index_builder.append( null_page, - self.truncate_min_value(stat.min_bytes()), - self.truncate_max_value(stat.max_bytes()), + self.truncate_min_value( + self.props.column_index_truncate_length(), + stat.min_bytes(), + ) + .0, + self.truncate_max_value( + self.props.column_index_truncate_length(), + stat.max_bytes(), + ) + .0, self.page_metrics.num_page_nulls as i64, ); } @@ -658,26 +666,26 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .append_row_count(self.page_metrics.num_buffered_rows as i64); } - fn truncate_min_value(&self, data: &[u8]) -> Vec { - self.props - .column_index_truncate_length() + fn truncate_min_value(&self, truncation_length: Option, data: &[u8]) -> (Vec, bool) { + truncation_length .filter(|l| data.len() > *l) .and_then(|l| match str::from_utf8(data) { Ok(str_data) => truncate_utf8(str_data, l), Err(_) => Some(data[..l].to_vec()), }) - .unwrap_or_else(|| data.to_vec()) + .map(|truncated| (truncated, true)) + .unwrap_or_else(|| (data.to_vec(), false)) } - fn truncate_max_value(&self, data: &[u8]) -> Vec { - self.props - .column_index_truncate_length() + fn truncate_max_value(&self, truncation_length: Option, data: &[u8]) -> (Vec, bool) { + truncation_length .filter(|l| data.len() > *l) .and_then(|l| match str::from_utf8(data) { Ok(str_data) => truncate_utf8(str_data, l).and_then(increment_utf8), Err(_) => increment(data[..l].to_vec()), }) - .unwrap_or_else(|| data.to_vec()) + .map(|truncated| (truncated, true)) + .unwrap_or_else(|| (data.to_vec(), false)) } /// Adds data page. 
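(A minimal sketch of the call pattern this hunk introduces; the variable names and the literal distinct/null counts below are illustrative rather than taken from the patch. The second tuple element reports whether truncation happened, and the following hunk negates it into the `with_min_is_exact` / `with_max_is_exact` flags so that truncated statistics are recorded as bounds rather than exact values:)

    // Sketch only; assumes `self` is a GenericColumnWriter and `byte_stats` has min/max set.
    let (min, did_truncate_min) = self.truncate_min_value(
        self.props.statistics_truncate_length(),
        byte_stats.min_bytes(),
    );
    let (max, did_truncate_max) = self.truncate_max_value(
        self.props.statistics_truncate_length(),
        byte_stats.max_bytes(),
    );
    let truncated = ValueStatistics::new(Some(min.into()), Some(max.into()), None, 0, false)
        .with_min_is_exact(!did_truncate_min)
        .with_max_is_exact(!did_truncate_max);
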
@@ -856,20 +864,64 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .set_dictionary_page_offset(dict_page_offset); if self.statistics_enabled != EnabledStatistics::None { + let backwards_compatible_min_max = self.descr.sort_order().is_signed(); + let statistics = ValueStatistics::::new( self.column_metrics.min_column_value.clone(), self.column_metrics.max_column_value.clone(), self.column_metrics.column_distinct_count, self.column_metrics.num_column_nulls, false, - ); + ) + .with_backwards_compatible_min_max(backwards_compatible_min_max) + .into(); + + let statistics = match statistics { + Statistics::ByteArray(stats) if stats.has_min_max_set() => { + let (min, did_truncate_min) = self.truncate_min_value( + self.props.statistics_truncate_length(), + stats.min_bytes(), + ); + let (max, did_truncate_max) = self.truncate_max_value( + self.props.statistics_truncate_length(), + stats.max_bytes(), + ); + Statistics::ByteArray( + ValueStatistics::new( + Some(min.into()), + Some(max.into()), + stats.distinct_count(), + stats.null_count(), + backwards_compatible_min_max, + ) + .with_max_is_exact(!did_truncate_max) + .with_min_is_exact(!did_truncate_min), + ) + } + Statistics::FixedLenByteArray(stats) if stats.has_min_max_set() => { + let (min, did_truncate_min) = self.truncate_min_value( + self.props.statistics_truncate_length(), + stats.min_bytes(), + ); + let (max, did_truncate_max) = self.truncate_max_value( + self.props.statistics_truncate_length(), + stats.max_bytes(), + ); + Statistics::FixedLenByteArray( + ValueStatistics::new( + Some(min.into()), + Some(max.into()), + stats.distinct_count(), + stats.null_count(), + backwards_compatible_min_max, + ) + .with_max_is_exact(!did_truncate_max) + .with_min_is_exact(!did_truncate_min), + ) + } + stats => stats, + }; - // Some common readers only support the deprecated statistics - // format so we also write them out if possible - // See https://github.com/apache/arrow-rs/issues/799 - let statistics = statistics - .with_backwards_compatible_min_max(self.descr.sort_order().is_signed()) - .into(); builder = builder.set_statistics(statistics); } @@ -2612,6 +2664,148 @@ mod tests { } } + #[test] + fn test_statistics_truncating_byte_array() { + let page_writer = get_test_page_writer(); + + const TEST_TRUNCATE_LENGTH: usize = 1; + + // Truncate values at 1 byte + let builder = + WriterProperties::builder().set_statistics_truncate_length(Some(TEST_TRUNCATE_LENGTH)); + let props = Arc::new(builder.build()); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + + let mut data = vec![ByteArray::default(); 1]; + // This is the expected min value + data[0].set_data(Bytes::from(String::from("Blart Versenwald III"))); + + writer.write_batch(&data, None, None).unwrap(); + + writer.flush_data_pages().unwrap(); + + let r = writer.close().unwrap(); + + assert_eq!(1, r.rows_written); + + let stats = r.metadata.statistics().expect("statistics"); + assert!(stats.has_min_max_set()); + assert_eq!(stats.null_count(), 0); + assert_eq!(stats.distinct_count(), None); + if let Statistics::ByteArray(_stats) = stats { + let min_value = _stats.min(); + let max_value = _stats.max(); + + assert!(!_stats.min_is_exact()); + assert!(!_stats.max_is_exact()); + + assert_eq!(min_value.len(), TEST_TRUNCATE_LENGTH); + assert_eq!(max_value.len(), TEST_TRUNCATE_LENGTH); + + assert_eq!("B".as_bytes(), min_value.as_bytes()); + assert_eq!("C".as_bytes(), max_value.as_bytes()); + } else { + panic!("expecting Statistics::ByteArray"); + } + } + + #[test] + fn 
test_statistics_truncating_fixed_len_byte_array() { + let page_writer = get_test_page_writer(); + + const TEST_TRUNCATE_LENGTH: usize = 1; + + // Truncate values at 1 byte + let builder = + WriterProperties::builder().set_statistics_truncate_length(Some(TEST_TRUNCATE_LENGTH)); + let props = Arc::new(builder.build()); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + + let mut data = vec![FixedLenByteArray::default(); 1]; + + const PSEUDO_DECIMAL_VALUE: i128 = 6541894651216648486512564456564654; + const PSEUDO_DECIMAL_BYTES: [u8; 16] = PSEUDO_DECIMAL_VALUE.to_be_bytes(); + + const EXPECTED_MIN: [u8; TEST_TRUNCATE_LENGTH] = [PSEUDO_DECIMAL_BYTES[0]]; // parquet specifies big-endian order for decimals + const EXPECTED_MAX: [u8; TEST_TRUNCATE_LENGTH] = + [PSEUDO_DECIMAL_BYTES[0].overflowing_add(1).0]; + + // This is the expected min value + data[0].set_data(Bytes::from(PSEUDO_DECIMAL_BYTES.as_slice())); + + writer.write_batch(&data, None, None).unwrap(); + + writer.flush_data_pages().unwrap(); + + let r = writer.close().unwrap(); + + assert_eq!(1, r.rows_written); + + let stats = r.metadata.statistics().expect("statistics"); + assert!(stats.has_min_max_set()); + assert_eq!(stats.null_count(), 0); + assert_eq!(stats.distinct_count(), None); + if let Statistics::FixedLenByteArray(_stats) = stats { + let min_value = _stats.min(); + let max_value = _stats.max(); + + assert!(!_stats.min_is_exact()); + assert!(!_stats.max_is_exact()); + + assert_eq!(min_value.len(), TEST_TRUNCATE_LENGTH); + assert_eq!(max_value.len(), TEST_TRUNCATE_LENGTH); + + assert_eq!(EXPECTED_MIN.as_slice(), min_value.as_bytes()); + assert_eq!(EXPECTED_MAX.as_slice(), max_value.as_bytes()); + + let reconstructed_min = i128::from_be_bytes([ + min_value.as_bytes()[0], + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ]); + + let reconstructed_max = i128::from_be_bytes([ + max_value.as_bytes()[0], + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ]); + + // check that the inner value is correctly bounded by the min/max + println!("min: {reconstructed_min} {PSEUDO_DECIMAL_VALUE}"); + assert!(reconstructed_min <= PSEUDO_DECIMAL_VALUE); + println!("max {reconstructed_max} {PSEUDO_DECIMAL_VALUE}"); + assert!(reconstructed_max >= PSEUDO_DECIMAL_VALUE); + } else { + panic!("expecting Statistics::FixedLenByteArray"); + } + } + #[test] fn test_send() { fn test() {} diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index ea71763a0101..287e73c9906a 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -51,6 +51,8 @@ pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option = Some(64); pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05; /// Default value for [`BloomFilterProperties::ndv`] pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64; +/// Default values for [`WriterProperties::statistics_truncate_length`] +pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option = None; /// Parquet writer version. /// @@ -136,6 +138,7 @@ pub struct WriterProperties { column_properties: HashMap, sorting_columns: Option>, column_index_truncate_length: Option, + statistics_truncate_length: Option, } impl Default for WriterProperties { @@ -241,6 +244,13 @@ impl WriterProperties { self.column_index_truncate_length } + /// Returns the maximum length of truncated min/max values in statistics. + /// + /// `None` if truncation is disabled, must be greater than 0 otherwise. 
+ pub fn statistics_truncate_length(&self) -> Option { + self.statistics_truncate_length + } + /// Returns encoding for a data page, when dictionary encoding is enabled. /// This is not configurable. #[inline] @@ -334,6 +344,7 @@ pub struct WriterPropertiesBuilder { column_properties: HashMap, sorting_columns: Option>, column_index_truncate_length: Option, + statistics_truncate_length: Option, } impl WriterPropertiesBuilder { @@ -352,6 +363,7 @@ impl WriterPropertiesBuilder { column_properties: HashMap::new(), sorting_columns: None, column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, + statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH, } } @@ -370,6 +382,7 @@ impl WriterPropertiesBuilder { column_properties: self.column_properties, sorting_columns: self.sorting_columns, column_index_truncate_length: self.column_index_truncate_length, + statistics_truncate_length: self.statistics_truncate_length, } } @@ -643,6 +656,17 @@ impl WriterPropertiesBuilder { self.column_index_truncate_length = max_length; self } + + /// Sets the max length of min/max value fields in statistics. Must be greater than 0. + /// If set to `None` - there's no effective limit. + pub fn set_statistics_truncate_length(mut self, max_length: Option) -> Self { + if let Some(value) = max_length { + assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`."); + } + + self.statistics_truncate_length = max_length; + self + } } /// Controls the level of statistics to be computed by the writer diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 345fe7dd2615..1bc003d48854 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -27,6 +27,8 @@ //! assert_eq!(stats.null_count(), 3); //! assert!(stats.has_min_max_set()); //! assert!(stats.is_min_max_deprecated()); +//! assert!(stats.min_is_exact()); +//! assert!(stats.max_is_exact()); //! //! match stats { //! Statistics::Int32(ref typed) => { @@ -206,19 +208,27 @@ pub fn from_thrift( null_count, old_format, ), - Type::BYTE_ARRAY => Statistics::byte_array( - min.map(ByteArray::from), - max.map(ByteArray::from), - distinct_count, - null_count, - old_format, + Type::BYTE_ARRAY => Statistics::ByteArray( + ValueStatistics::new( + min.map(ByteArray::from), + max.map(ByteArray::from), + distinct_count, + null_count, + old_format, + ) + .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false)) + .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)), ), - Type::FIXED_LEN_BYTE_ARRAY => Statistics::fixed_len_byte_array( - min.map(ByteArray::from).map(FixedLenByteArray::from), - max.map(ByteArray::from).map(FixedLenByteArray::from), - distinct_count, - null_count, - old_format, + Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray( + ValueStatistics::new( + min.map(ByteArray::from).map(FixedLenByteArray::from), + max.map(ByteArray::from).map(FixedLenByteArray::from), + distinct_count, + null_count, + old_format, + ) + .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false)) + .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)), ), }; @@ -248,13 +258,15 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { }; // Get min/max if set. 
- let (min, max) = if stats.has_min_max_set() { + let (min, max, min_exact, max_exact) = if stats.has_min_max_set() { ( Some(stats.min_bytes().to_vec()), Some(stats.max_bytes().to_vec()), + Some(stats.min_is_exact()), + Some(stats.max_is_exact()), ) } else { - (None, None) + (None, None, None, None) }; if stats.is_min_max_backwards_compatible() { @@ -268,6 +280,9 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { thrift_stats.max_value = max; } + thrift_stats.is_min_value_exact = min_exact; + thrift_stats.is_max_value_exact = max_exact; + Some(thrift_stats) } @@ -374,6 +389,16 @@ impl Statistics { statistics_enum_func![self, has_min_max_set] } + /// Returns `true` if the min value is set, and is an exact min value. + pub fn min_is_exact(&self) -> bool { + statistics_enum_func![self, min_is_exact] + } + + /// Returns `true` if the max value is set, and is an exact max value. + pub fn max_is_exact(&self) -> bool { + statistics_enum_func![self, max_is_exact] + } + /// Returns slice of bytes that represent min value. /// Panics if min value is not set. pub fn min_bytes(&self) -> &[u8] { @@ -428,6 +453,10 @@ pub struct ValueStatistics { distinct_count: Option, null_count: u64, + // Whether or not the min or max values are exact, or truncated. + is_max_value_exact: bool, + is_min_value_exact: bool, + /// If `true` populate the deprecated `min` and `max` fields instead of /// `min_value` and `max_value` is_min_max_deprecated: bool, @@ -447,6 +476,8 @@ impl ValueStatistics { is_min_max_deprecated: bool, ) -> Self { Self { + is_max_value_exact: max.is_some(), + is_min_value_exact: min.is_some(), min, max, distinct_count, @@ -456,6 +487,28 @@ impl ValueStatistics { } } + /// Set whether the stored `min` field represents the exact + /// minimum, or just a bound on the minimum value. + /// + /// see [`Self::min_is_exact`] + pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self { + Self { + is_min_value_exact, + ..self + } + } + + /// Set whether the stored `max` field represents the exact + /// maximum, or just a bound on the maximum value. + /// + /// see [`Self::max_is_exact`] + pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self { + Self { + is_max_value_exact, + ..self + } + } + /// Set whether to write the deprecated `min` and `max` fields /// for compatibility with older parquet writers /// @@ -506,13 +559,23 @@ impl ValueStatistics { self.min.is_some() && self.max.is_some() } + /// Whether or not max value is set, and is an exact value. + pub fn max_is_exact(&self) -> bool { + self.max.is_some() && self.is_max_value_exact + } + + /// Whether or not min value is set, and is an exact value. + pub fn min_is_exact(&self) -> bool { + self.min.is_some() && self.is_min_value_exact + } + /// Returns optional value of number of distinct values occurring. - fn distinct_count(&self) -> Option { + pub fn distinct_count(&self) -> Option { self.distinct_count } /// Returns null count. 
- fn null_count(&self) -> u64 { + pub fn null_count(&self) -> u64 { self.null_count } @@ -556,6 +619,8 @@ impl fmt::Display for ValueStatistics { } write!(f, ", null_count: {}", self.null_count)?; write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?; + write!(f, ", max_value_exact: {}", self.is_max_value_exact)?; + write!(f, ", min_value_exact: {}", self.is_min_value_exact)?; write!(f, "}}") } } @@ -565,13 +630,15 @@ impl fmt::Debug for ValueStatistics { write!( f, "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {}, \ - min_max_deprecated: {}, min_max_backwards_compatible: {}}}", + min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}", self.min, self.max, self.distinct_count, self.null_count, self.is_min_max_deprecated, - self.is_min_max_backwards_compatible + self.is_min_max_backwards_compatible, + self.is_max_value_exact, + self.is_min_value_exact ) } } @@ -628,14 +695,14 @@ mod tests { assert_eq!( format!("{stats:?}"), "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 12, \ - min_max_deprecated: true, min_max_backwards_compatible: true})" + min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})" ); let stats = Statistics::int32(None, None, None, 7, false); assert_eq!( format!("{stats:?}"), "Int32({min: None, max: None, distinct_count: None, null_count: 7, \ - min_max_deprecated: false, min_max_backwards_compatible: false})" + min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})" ) } @@ -644,14 +711,14 @@ mod tests { let stats = Statistics::int32(Some(1), Some(12), None, 12, true); assert_eq!( format!("{stats}"), - "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true}" + "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}" ); let stats = Statistics::int64(None, None, None, 7, false); assert_eq!( format!("{stats}"), "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \ - false}" + false, max_value_exact: false, min_value_exact: false}" ); let stats = Statistics::int96( @@ -664,19 +731,23 @@ mod tests { assert_eq!( format!("{stats}"), "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \ - min_max_deprecated: true}" + min_max_deprecated: true, max_value_exact: true, min_value_exact: true}" ); - let stats = Statistics::byte_array( - Some(ByteArray::from(vec![1u8])), - Some(ByteArray::from(vec![2u8])), - Some(5), - 7, - false, + let stats = Statistics::ByteArray( + ValueStatistics::new( + Some(ByteArray::from(vec![1u8])), + Some(ByteArray::from(vec![2u8])), + Some(5), + 7, + false, + ) + .with_max_is_exact(false) + .with_min_is_exact(false), ); assert_eq!( format!("{stats}"), - "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false}" + "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}" ); } @@ -712,7 +783,45 @@ mod tests { Some(ByteArray::from(vec![1, 2, 3]).into()), None, 0, - true + true, + ) + ); + + assert!( + Statistics::byte_array( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![1, 2, 3])), + None, + 0, + true, + ) != Statistics::ByteArray( + ValueStatistics::new( + Some(ByteArray::from(vec![1, 2, 3])), + Some(ByteArray::from(vec![1, 2, 3])), + None, + 0, + true, + ) + .with_max_is_exact(false) + ) + ); + + 
assert!( + Statistics::fixed_len_byte_array( + Some(FixedLenByteArray::from(vec![1, 2, 3])), + Some(FixedLenByteArray::from(vec![1, 2, 3])), + None, + 0, + true, + ) != Statistics::FixedLenByteArray( + ValueStatistics::new( + Some(FixedLenByteArray::from(vec![1, 2, 3])), + Some(FixedLenByteArray::from(vec![1, 2, 3])), + None, + 0, + true, + ) + .with_min_is_exact(false) ) ); } diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index fab87f32f5c4..cd124031cfdc 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -185,7 +185,7 @@ fn test_primitive() { pages: (0..8) .map(|_| Page { rows: 250, - page_header_size: 34, + page_header_size: 36, compressed_size: 1000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -214,14 +214,14 @@ fn test_primitive() { pages: vec![ Page { rows: 250, - page_header_size: 34, + page_header_size: 36, compressed_size: 258, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 1750, - page_header_size: 34, + page_header_size: 36, compressed_size: 7000, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -229,7 +229,7 @@ fn test_primitive() { ], dictionary_page: Some(Page { rows: 250, - page_header_size: 34, + page_header_size: 36, compressed_size: 1000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -256,42 +256,42 @@ fn test_primitive() { pages: vec![ Page { rows: 400, - page_header_size: 34, + page_header_size: 36, compressed_size: 452, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 370, - page_header_size: 34, + page_header_size: 36, compressed_size: 472, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 34, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 34, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 34, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 240, - page_header_size: 34, + page_header_size: 36, compressed_size: 332, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, @@ -299,7 +299,7 @@ fn test_primitive() { ], dictionary_page: Some(Page { rows: 2000, - page_header_size: 34, + page_header_size: 36, compressed_size: 8000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -325,7 +325,7 @@ fn test_primitive() { pages: (0..20) .map(|_| Page { rows: 100, - page_header_size: 34, + page_header_size: 36, compressed_size: 400, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -360,14 +360,14 @@ fn test_string() { pages: (0..15) .map(|_| Page { rows: 130, - page_header_size: 34, + page_header_size: 36, compressed_size: 1040, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, }) .chain(std::iter::once(Page { rows: 50, - page_header_size: 33, + page_header_size: 35, compressed_size: 400, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -396,21 +396,21 @@ fn test_string() { pages: vec![ Page { rows: 130, - page_header_size: 34, + page_header_size: 36, compressed_size: 138, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 1250, - page_header_size: 36, + page_header_size: 38, compressed_size: 10000, encoding: Encoding::PLAIN, 
page_type: PageType::DATA_PAGE, }, Page { rows: 620, - page_header_size: 34, + page_header_size: 36, compressed_size: 4960, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, @@ -418,7 +418,7 @@ fn test_string() { ], dictionary_page: Some(Page { rows: 130, - page_header_size: 34, + page_header_size: 36, compressed_size: 1040, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -445,42 +445,42 @@ fn test_string() { pages: vec![ Page { rows: 400, - page_header_size: 34, + page_header_size: 36, compressed_size: 452, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 370, - page_header_size: 34, + page_header_size: 36, compressed_size: 472, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 34, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 34, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 330, - page_header_size: 34, + page_header_size: 36, compressed_size: 464, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, }, Page { rows: 240, - page_header_size: 34, + page_header_size: 36, compressed_size: 332, encoding: Encoding::RLE_DICTIONARY, page_type: PageType::DATA_PAGE, @@ -488,7 +488,7 @@ fn test_string() { ], dictionary_page: Some(Page { rows: 2000, - page_header_size: 34, + page_header_size: 36, compressed_size: 16000, encoding: Encoding::PLAIN, page_type: PageType::DICTIONARY_PAGE, @@ -528,7 +528,7 @@ fn test_list() { pages: (0..10) .map(|_| Page { rows: 20, - page_header_size: 34, + page_header_size: 36, compressed_size: 672, encoding: Encoding::PLAIN, page_type: PageType::DATA_PAGE, From 7fa78b79b5e3ba028b32b20096dbe4a6f17c82bd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 15 Nov 2023 13:55:21 +0000 Subject: [PATCH 1345/1411] Support multiple GZip members in parquet page (#4951) --- parquet-testing | 2 +- parquet/src/compression.rs | 2 +- parquet/src/file/serialized_reader.rs | 25 +++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/parquet-testing b/parquet-testing index 506afff9b695..89b685a64c31 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit 506afff9b6957ffe10d08470d467867d43e1bb91 +Subproject commit 89b685a64c3117b3023d8684af1f41400841db71 diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index 9e0eee0e3e04..a9a1afbbf213 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -255,7 +255,7 @@ mod gzip_codec { output_buf: &mut Vec, _uncompress_size: Option, ) -> Result { - let mut decoder = read::GzDecoder::new(input_buf); + let mut decoder = read::MultiGzDecoder::new(input_buf); decoder.read_to_end(output_buf).map_err(|e| e.into()) } diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 43e169cd085b..fbb172d3b3c2 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -775,6 +775,7 @@ mod tests { use crate::format::BoundaryOrder; use crate::basic::{self, ColumnOrder}; + use crate::column::reader::ColumnReader; use crate::data_type::private::ParquetValueType; use crate::data_type::{AsBytes, FixedLenByteArrayType}; use crate::file::page_index::index::{Index, NativeIndex}; @@ -1730,4 +1731,28 @@ mod tests { _ => 
unreachable!(), } } + + #[test] + fn test_multi_gz() { + let file = get_test_file("concatenated_gzip_members.parquet"); + let reader = SerializedFileReader::new(file).unwrap(); + let row_group_reader = reader.get_row_group(0).unwrap(); + match row_group_reader.get_column_reader(0).unwrap() { + ColumnReader::Int64ColumnReader(mut reader) => { + let mut buffer = [0; 1024]; + let mut def_levels = [0; 1024]; + let (num_records, num_values, num_levels) = reader + .read_records(1024, Some(&mut def_levels), None, &mut buffer) + .unwrap(); + + assert_eq!(num_records, 513); + assert_eq!(num_values, 513); + assert_eq!(num_levels, 513); + + let expected: Vec = (1..514).collect(); + assert_eq!(&buffer[..513], &expected); + } + _ => unreachable!(), + } + } } From 4b9d789885daa8386dad8dafab223f89ea257677 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 08:19:07 -0800 Subject: [PATCH 1346/1411] Update itertools requirement from 0.11.0 to 0.12.0 in /object_store (#5077) Updates the requirements on [itertools](https://github.com/rust-itertools/itertools) to permit the latest version. - [Changelog](https://github.com/rust-itertools/itertools/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-itertools/itertools/compare/v0.11.0...v0.12.0) --- updated-dependencies: - dependency-name: itertools dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- object_store/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index bf8301557df2..2f5157c40e67 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -35,7 +35,7 @@ bytes = "1.0" chrono = { version = "0.4.31", default-features = false, features = ["clock"] } futures = "0.3" humantime = "2.1" -itertools = "0.11.0" +itertools = "0.12.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" snafu = "0.7" From aff86e704dabecbf99edd1e0ad62c216819dbc15 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 15 Nov 2023 13:18:45 -0500 Subject: [PATCH 1347/1411] Implement Arrow PyCapsule Interface (#5070) * arrow ffi array copy * remove copy_ffi_array * docstring * wip: pycapsule support * return * Update arrow/src/pyarrow.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * remove sync impl * Update arrow/src/pyarrow.rs Co-authored-by: Will Jones * Remove copy() * Need &mut FFI_ArrowArray for std::mem::replace * Use std::ptr::replace * update comments * Minimize unsafe block * revert pub release functions * Add RecordBatch and Stream conversion * fix returns * Fix return type * Fix name * fix ci * Add tests * Add table test * skip if pre pyarrow 14 * bump python version in CI to use pyarrow 14 * Add record batch test * Update arrow/src/pyarrow.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * run on pyarrow 13 and 14 * Update .github/workflows/integration.yml Co-authored-by: Will Jones --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Will Jones --- .github/workflows/integration.yml | 6 +- arrow-pyarrow-integration-testing/README.md | 2 + .../tests/test_sql.py | 138 +++++++++++++++++- arrow-schema/src/ffi.rs | 2 + arrow/src/pyarrow.rs | 134 ++++++++++++++++- 5 files changed, 274 insertions(+), 8 deletions(-) diff --git a/.github/workflows/integration.yml 
b/.github/workflows/integration.yml index 6e2b4420408a..f939a6a13b58 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -106,6 +106,8 @@ jobs: strategy: matrix: rust: [ stable ] + # PyArrow 13 was the last version prior to introduction to Arrow PyCapsules + pyarrow: [ "13", "14" ] steps: - uses: actions/checkout@v4 with: @@ -128,14 +130,14 @@ jobs: key: ${{ runner.os }}-${{ matrix.arch }}-target-maturin-cache-${{ matrix.rust }}- - uses: actions/setup-python@v4 with: - python-version: '3.7' + python-version: '3.8' - name: Upgrade pip and setuptools run: pip install --upgrade pip setuptools wheel virtualenv - name: Create virtualenv and install dependencies run: | virtualenv venv source venv/bin/activate - pip install maturin toml pytest pytz pyarrow>=5.0 + pip install maturin toml pytest pytz pyarrow==${{ matrix.pyarrow }} - name: Run Rust tests run: | source venv/bin/activate diff --git a/arrow-pyarrow-integration-testing/README.md b/arrow-pyarrow-integration-testing/README.md index e63953ad7900..5ca2ea76b88c 100644 --- a/arrow-pyarrow-integration-testing/README.md +++ b/arrow-pyarrow-integration-testing/README.md @@ -25,6 +25,7 @@ Note that this crate uses two languages and an external ABI: * `Rust` * `Python` * C ABI privately exposed by `Pyarrow`. +* PyCapsule ABI publicly exposed by `pyarrow` ## Basic idea @@ -36,6 +37,7 @@ we can use pyarrow's interface to move pointers from and to Rust. ## Relevant literature * [Arrow's CDataInterface](https://arrow.apache.org/docs/format/CDataInterface.html) +* [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) * [Rust's FFI](https://doc.rust-lang.org/nomicon/ffi.html) * [Pyarrow private binds](https://github.com/apache/arrow/blob/ae1d24efcc3f1ac2a876d8d9f544a34eb04ae874/python/pyarrow/array.pxi#L1226) * [PyO3](https://docs.rs/pyo3/0.12.1/pyo3/index.html) diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 1748fd3ffb6b..16d4e0f12f88 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -27,6 +27,8 @@ import arrow_pyarrow_integration_testing as rust +PYARROW_PRE_14 = int(pa.__version__.split('.')[0]) < 14 + @contextlib.contextmanager def no_pyarrow_leak(): @@ -113,6 +115,34 @@ def assert_pyarrow_leak(): _unsupported_pyarrow_types = [ ] +# As of pyarrow 14, pyarrow implements the Arrow PyCapsule interface +# (https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). +# This defines that Arrow consumers should allow any object that has specific "dunder" +# methods, `__arrow_c_*_`. These wrapper classes ensure that arrow-rs is able to handle +# _any_ class, without pyarrow-specific handling. 
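(A minimal usage sketch of the wrapper classes defined just below; the tests later in this file exercise the same idea more fully. `rust.round_trip_schema` is one of this crate's existing test helpers, and the schema value is illustrative:)

    # Sketch: the Rust side only looks for the dunder method, so a non-pyarrow
    # wrapper object is accepted just like a pyarrow.Schema would be.
    wrapped = SchemaWrapper(pa.schema([("x", pa.int32())]))
    assert rust.round_trip_schema(wrapped) == pa.schema([("x", pa.int32())])
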
+class SchemaWrapper: + def __init__(self, schema): + self.schema = schema + + def __arrow_c_schema__(self): + return self.schema.__arrow_c_schema__() + + +class ArrayWrapper: + def __init__(self, array): + self.array = array + + def __arrow_c_array__(self): + return self.array.__arrow_c_array__() + + +class StreamWrapper: + def __init__(self, stream): + self.stream = stream + + def __arrow_c_stream__(self): + return self.stream.__arrow_c_stream__() + @pytest.mark.parametrize("pyarrow_type", _supported_pyarrow_types, ids=str) def test_type_roundtrip(pyarrow_type): @@ -120,6 +150,14 @@ def test_type_roundtrip(pyarrow_type): assert restored == pyarrow_type assert restored is not pyarrow_type +@pytest.mark.skipif(PYARROW_PRE_14, reason="requires pyarrow 14") +@pytest.mark.parametrize("pyarrow_type", _supported_pyarrow_types, ids=str) +def test_type_roundtrip_pycapsule(pyarrow_type): + wrapped = SchemaWrapper(pyarrow_type) + restored = rust.round_trip_type(wrapped) + assert restored == pyarrow_type + assert restored is not pyarrow_type + @pytest.mark.parametrize("pyarrow_type", _unsupported_pyarrow_types, ids=str) def test_type_roundtrip_raises(pyarrow_type): @@ -138,6 +176,20 @@ def test_field_roundtrip(pyarrow_type): field = rust.round_trip_field(pyarrow_field) assert field == pyarrow_field +@pytest.mark.skipif(PYARROW_PRE_14, reason="requires pyarrow 14") +@pytest.mark.parametrize('pyarrow_type', _supported_pyarrow_types, ids=str) +def test_field_roundtrip_pycapsule(pyarrow_type): + pyarrow_field = pa.field("test", pyarrow_type, nullable=True) + wrapped = SchemaWrapper(pyarrow_field) + field = rust.round_trip_field(wrapped) + assert field == wrapped.schema + + if pyarrow_type != pa.null(): + # A null type field may not be non-nullable + pyarrow_field = pa.field("test", pyarrow_type, nullable=False) + field = rust.round_trip_field(wrapped) + assert field == wrapped.schema + def test_field_metadata_roundtrip(): metadata = {"hello": "World! 
😊", "x": "2"} pyarrow_field = pa.field("test", pa.int32(), metadata=metadata) @@ -163,6 +215,17 @@ def test_primitive_python(): del b +@pytest.mark.skipif(PYARROW_PRE_14, reason="requires pyarrow 14") +def test_primitive_python_pycapsule(): + """ + Python -> Rust -> Python + """ + a = pa.array([1, 2, 3]) + wrapped = ArrayWrapper(a) + b = rust.double(wrapped) + assert b == pa.array([2, 4, 6]) + + def test_primitive_rust(): """ Rust -> Python -> Rust @@ -433,6 +496,33 @@ def test_record_batch_reader(): got_batches = list(b) assert got_batches == batches +@pytest.mark.skipif(PYARROW_PRE_14, reason="requires pyarrow 14") +def test_record_batch_reader_pycapsule(): + """ + Python -> Rust -> Python + """ + schema = pa.schema([('ints', pa.list_(pa.int32()))], metadata={b'key1': b'value1'}) + batches = [ + pa.record_batch([[[1], [2, 42]]], schema), + pa.record_batch([[None, [], [5, 6]]], schema), + ] + a = pa.RecordBatchReader.from_batches(schema, batches) + wrapped = StreamWrapper(a) + b = rust.round_trip_record_batch_reader(wrapped) + + assert b.schema == schema + got_batches = list(b) + assert got_batches == batches + + # Also try the boxed reader variant + a = pa.RecordBatchReader.from_batches(schema, batches) + wrapped = StreamWrapper(a) + b = rust.boxed_reader_roundtrip(wrapped) + assert b.schema == schema + got_batches = list(b) + assert got_batches == batches + + def test_record_batch_reader_error(): schema = pa.schema([('ints', pa.list_(pa.int32()))]) @@ -453,24 +543,64 @@ def iter_batches(): with pytest.raises(ValueError, match="invalid utf-8"): rust.round_trip_record_batch_reader(reader) + +@pytest.mark.skipif(PYARROW_PRE_14, reason="requires pyarrow 14") +def test_record_batch_pycapsule(): + """ + Python -> Rust -> Python + """ + schema = pa.schema([('ints', pa.list_(pa.int32()))], metadata={b'key1': b'value1'}) + batch = pa.record_batch([[[1], [2, 42]]], schema) + wrapped = StreamWrapper(batch) + b = rust.round_trip_record_batch_reader(wrapped) + new_table = b.read_all() + new_batches = new_table.to_batches() + + assert len(new_batches) == 1 + new_batch = new_batches[0] + + assert batch == new_batch + assert batch.schema == new_batch.schema + + +@pytest.mark.skipif(PYARROW_PRE_14, reason="requires pyarrow 14") +def test_table_pycapsule(): + """ + Python -> Rust -> Python + """ + schema = pa.schema([('ints', pa.list_(pa.int32()))], metadata={b'key1': b'value1'}) + batches = [ + pa.record_batch([[[1], [2, 42]]], schema), + pa.record_batch([[None, [], [5, 6]]], schema), + ] + table = pa.Table.from_batches(batches) + wrapped = StreamWrapper(table) + b = rust.round_trip_record_batch_reader(wrapped) + new_table = b.read_all() + + assert table.schema == new_table.schema + assert table == new_table + assert len(table.to_batches()) == len(new_table.to_batches()) + + def test_reject_other_classes(): # Arbitrary type that is not a PyArrow type not_pyarrow = ["hello"] with pytest.raises(TypeError, match="Expected instance of pyarrow.lib.Array, got builtins.list"): rust.round_trip_array(not_pyarrow) - + with pytest.raises(TypeError, match="Expected instance of pyarrow.lib.Schema, got builtins.list"): rust.round_trip_schema(not_pyarrow) - + with pytest.raises(TypeError, match="Expected instance of pyarrow.lib.Field, got builtins.list"): rust.round_trip_field(not_pyarrow) - + with pytest.raises(TypeError, match="Expected instance of pyarrow.lib.DataType, got builtins.list"): rust.round_trip_type(not_pyarrow) with pytest.raises(TypeError, match="Expected instance of pyarrow.lib.RecordBatch, got 
builtins.list"): rust.round_trip_record_batch(not_pyarrow) - + with pytest.raises(TypeError, match="Expected instance of pyarrow.lib.RecordBatchReader, got builtins.list"): rust.round_trip_record_batch_reader(not_pyarrow) diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 7e33a78fec27..640a7de79878 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -351,6 +351,8 @@ impl Drop for FFI_ArrowSchema { } } +unsafe impl Send for FFI_ArrowSchema {} + impl TryFrom<&FFI_ArrowSchema> for DataType { type Error = ArrowError; diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 517c333addde..4d262b0d106f 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -59,12 +59,12 @@ use std::convert::{From, TryFrom}; use std::ptr::{addr_of, addr_of_mut}; use std::sync::Arc; -use arrow_array::{RecordBatchIterator, RecordBatchReader}; +use arrow_array::{RecordBatchIterator, RecordBatchReader, StructArray}; use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::ffi::Py_uintptr_t; use pyo3::import_exception; use pyo3::prelude::*; -use pyo3::types::{PyList, PyTuple}; +use pyo3::types::{PyCapsule, PyList, PyTuple}; use crate::array::{make_array, ArrayData}; use crate::datatypes::{DataType, Field, Schema}; @@ -118,8 +118,40 @@ fn validate_class(expected: &str, value: &PyAny) -> PyResult<()> { Ok(()) } +fn validate_pycapsule(capsule: &PyCapsule, name: &str) -> PyResult<()> { + let capsule_name = capsule.name()?; + if capsule_name.is_none() { + return Err(PyValueError::new_err( + "Expected schema PyCapsule to have name set.", + )); + } + + let capsule_name = capsule_name.unwrap().to_str()?; + if capsule_name != name { + return Err(PyValueError::new_err(format!( + "Expected name '{}' in PyCapsule, instead got '{}'", + name, capsule_name + ))); + } + + Ok(()) +} + impl FromPyArrow for DataType { fn from_pyarrow(value: &PyAny) -> PyResult { + // Newer versions of PyArrow as well as other libraries with Arrow data implement this + // method, so prefer it over _export_to_c. + // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + if value.hasattr("__arrow_c_schema__")? { + let capsule: &PyCapsule = + PyTryInto::try_into(value.getattr("__arrow_c_schema__")?.call0()?)?; + validate_pycapsule(capsule, "arrow_schema")?; + + let schema_ptr = unsafe { capsule.reference::() }; + let dtype = DataType::try_from(schema_ptr).map_err(to_py_err)?; + return Ok(dtype); + } + validate_class("DataType", value)?; let c_schema = FFI_ArrowSchema::empty(); @@ -143,6 +175,19 @@ impl ToPyArrow for DataType { impl FromPyArrow for Field { fn from_pyarrow(value: &PyAny) -> PyResult { + // Newer versions of PyArrow as well as other libraries with Arrow data implement this + // method, so prefer it over _export_to_c. + // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + if value.hasattr("__arrow_c_schema__")? { + let capsule: &PyCapsule = + PyTryInto::try_into(value.getattr("__arrow_c_schema__")?.call0()?)?; + validate_pycapsule(capsule, "arrow_schema")?; + + let schema_ptr = unsafe { capsule.reference::() }; + let field = Field::try_from(schema_ptr).map_err(to_py_err)?; + return Ok(field); + } + validate_class("Field", value)?; let c_schema = FFI_ArrowSchema::empty(); @@ -166,6 +211,19 @@ impl ToPyArrow for Field { impl FromPyArrow for Schema { fn from_pyarrow(value: &PyAny) -> PyResult { + // Newer versions of PyArrow as well as other libraries with Arrow data implement this + // method, so prefer it over _export_to_c. 
+ // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + if value.hasattr("__arrow_c_schema__")? { + let capsule: &PyCapsule = + PyTryInto::try_into(value.getattr("__arrow_c_schema__")?.call0()?)?; + validate_pycapsule(capsule, "arrow_schema")?; + + let schema_ptr = unsafe { capsule.reference::() }; + let schema = Schema::try_from(schema_ptr).map_err(to_py_err)?; + return Ok(schema); + } + validate_class("Schema", value)?; let c_schema = FFI_ArrowSchema::empty(); @@ -189,6 +247,30 @@ impl ToPyArrow for Schema { impl FromPyArrow for ArrayData { fn from_pyarrow(value: &PyAny) -> PyResult { + // Newer versions of PyArrow as well as other libraries with Arrow data implement this + // method, so prefer it over _export_to_c. + // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + if value.hasattr("__arrow_c_array__")? { + let tuple = value.getattr("__arrow_c_array__")?.call0()?; + + if !tuple.is_instance_of::() { + return Err(PyTypeError::new_err( + "Expected __arrow_c_array__ to return a tuple.", + )); + } + + let schema_capsule: &PyCapsule = PyTryInto::try_into(tuple.get_item(0)?)?; + let array_capsule: &PyCapsule = PyTryInto::try_into(tuple.get_item(1)?)?; + + validate_pycapsule(schema_capsule, "arrow_schema")?; + validate_pycapsule(array_capsule, "arrow_array")?; + + let schema_ptr = unsafe { schema_capsule.reference::() }; + let array_ptr = array_capsule.pointer() as *mut FFI_ArrowArray; + let array = unsafe { std::ptr::replace(array_ptr, FFI_ArrowArray::empty()) }; + return ffi::from_ffi(array, schema_ptr).map_err(to_py_err); + } + validate_class("Array", value)?; // prepare a pointer to receive the Array struct @@ -247,6 +329,37 @@ impl ToPyArrow for Vec { impl FromPyArrow for RecordBatch { fn from_pyarrow(value: &PyAny) -> PyResult { + // Newer versions of PyArrow as well as other libraries with Arrow data implement this + // method, so prefer it over _export_to_c. + // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + if value.hasattr("__arrow_c_array__")? { + let tuple = value.getattr("__arrow_c_array__")?.call0()?; + + if !tuple.is_instance_of::() { + return Err(PyTypeError::new_err( + "Expected __arrow_c_array__ to return a tuple.", + )); + } + + let schema_capsule: &PyCapsule = PyTryInto::try_into(tuple.get_item(0)?)?; + let array_capsule: &PyCapsule = PyTryInto::try_into(tuple.get_item(1)?)?; + + validate_pycapsule(schema_capsule, "arrow_schema")?; + validate_pycapsule(array_capsule, "arrow_array")?; + + let schema_ptr = unsafe { schema_capsule.reference::() }; + let array_ptr = array_capsule.pointer() as *mut FFI_ArrowArray; + let ffi_array = unsafe { std::ptr::replace(array_ptr, FFI_ArrowArray::empty()) }; + let array_data = ffi::from_ffi(ffi_array, schema_ptr).map_err(to_py_err)?; + if !matches!(array_data.data_type(), DataType::Struct(_)) { + return Err(PyTypeError::new_err( + "Expected Struct type from __arrow_c_array.", + )); + } + let array = StructArray::from(array_data); + return Ok(array.into()); + } + validate_class("RecordBatch", value)?; // TODO(kszucs): implement the FFI conversions in arrow-rs for RecordBatches let schema = value.getattr("schema")?; @@ -276,6 +389,23 @@ impl ToPyArrow for RecordBatch { /// Supports conversion from `pyarrow.RecordBatchReader` to [ArrowArrayStreamReader]. 
impl FromPyArrow for ArrowArrayStreamReader { fn from_pyarrow(value: &PyAny) -> PyResult { + // Newer versions of PyArrow as well as other libraries with Arrow data implement this + // method, so prefer it over _export_to_c. + // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + if value.hasattr("__arrow_c_stream__")? { + let capsule: &PyCapsule = + PyTryInto::try_into(value.getattr("__arrow_c_stream__")?.call0()?)?; + validate_pycapsule(capsule, "arrow_array_stream")?; + + let stream_ptr = capsule.pointer() as *mut FFI_ArrowArrayStream; + let stream = unsafe { std::ptr::replace(stream_ptr, FFI_ArrowArrayStream::empty()) }; + + let stream_reader = ArrowArrayStreamReader::try_new(stream) + .map_err(|err| PyValueError::new_err(err.to_string()))?; + + return Ok(stream_reader); + } + validate_class("RecordBatchReader", value)?; // prepare a pointer to receive the stream struct From a3687a750665780a5d3988a1d66d52a98814c568 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 17 Nov 2023 08:07:38 +0000 Subject: [PATCH 1348/1411] Add FFI from_raw (#5082) --- arrow-data/src/ffi.rs | 16 ++++++++++++++++ arrow-schema/src/ffi.rs | 16 ++++++++++++++++ arrow/src/ffi_stream.rs | 27 +++++++++++++++++++++------ arrow/src/pyarrow.rs | 12 ++++-------- 4 files changed, 57 insertions(+), 14 deletions(-) diff --git a/arrow-data/src/ffi.rs b/arrow-data/src/ffi.rs index 7623ced043cc..2b4d52601286 100644 --- a/arrow-data/src/ffi.rs +++ b/arrow-data/src/ffi.rs @@ -191,6 +191,22 @@ impl FFI_ArrowArray { } } + /// Takes ownership of the pointed to [`FFI_ArrowArray`] + /// + /// This acts to [move] the data out of `array`, setting the release callback to NULL + /// + /// # Safety + /// + /// * `array` must be [valid] for reads and writes + /// * `array` must be properly aligned + /// * `array` must point to a properly initialized value of [`FFI_ArrowArray`] + /// + /// [move]: https://arrow.apache.org/docs/format/CDataInterface.html#moving-an-array + /// [valid]: https://doc.rust-lang.org/std/ptr/index.html#safety + pub unsafe fn from_raw(array: *mut FFI_ArrowArray) -> Self { + std::ptr::replace(array, Self::empty()) + } + /// create an empty `FFI_ArrowArray`, which can be used to import data into pub fn empty() -> Self { Self { diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 640a7de79878..b4d10b814a5d 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -219,6 +219,22 @@ impl FFI_ArrowSchema { Ok(self) } + /// Takes ownership of the pointed to [`FFI_ArrowSchema`] + /// + /// This acts to [move] the data out of `schema`, setting the release callback to NULL + /// + /// # Safety + /// + /// * `schema` must be [valid] for reads and writes + /// * `schema` must be properly aligned + /// * `schema` must point to a properly initialized value of [`FFI_ArrowSchema`] + /// + /// [move]: https://arrow.apache.org/docs/format/CDataInterface.html#moving-an-array + /// [valid]: https://doc.rust-lang.org/std/ptr/index.html#safety + pub unsafe fn from_raw(schema: *mut FFI_ArrowSchema) -> Self { + std::ptr::replace(schema, Self::empty()) + } + pub fn empty() -> Self { Self { format: std::ptr::null_mut(), diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index 73cf28d66dab..123669aa61be 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -171,6 +171,22 @@ impl FFI_ArrowArrayStream { } } + /// Takes ownership of the pointed to [`FFI_ArrowArrayStream`] + /// + /// This acts to 
[move] the data out of `raw_stream`, setting the release callback to NULL + /// + /// # Safety + /// + /// * `raw_stream` must be [valid] for reads and writes + /// * `raw_stream` must be properly aligned + /// * `raw_stream` must point to a properly initialized value of [`FFI_ArrowArrayStream`] + /// + /// [move]: https://arrow.apache.org/docs/format/CDataInterface.html#moving-an-array + /// [valid]: https://doc.rust-lang.org/std/ptr/index.html#safety + pub unsafe fn from_raw(raw_stream: *mut FFI_ArrowArrayStream) -> Self { + std::ptr::replace(raw_stream, Self::empty()) + } + /// Creates a new empty [FFI_ArrowArrayStream]. Used to import from the C Stream Interface. pub fn empty() -> Self { Self { @@ -306,11 +322,10 @@ impl ArrowArrayStreamReader { /// the pointer. /// /// # Safety - /// This function dereferences a raw pointer of `FFI_ArrowArrayStream`. + /// + /// See [`FFI_ArrowArrayStream::from_raw`] pub unsafe fn from_raw(raw_stream: *mut FFI_ArrowArrayStream) -> Result { - let stream_data = std::ptr::replace(raw_stream, FFI_ArrowArrayStream::empty()); - - Self::try_new(stream_data) + Self::try_new(FFI_ArrowArrayStream::from_raw(raw_stream)) } /// Get the last error from `ArrowArrayStreamReader` @@ -368,6 +383,7 @@ impl RecordBatchReader for ArrowArrayStreamReader { /// # Safety /// Assumes that the pointer represents valid C Stream Interfaces, both in memory /// representation and lifetime via the `release` mechanism. +#[deprecated(note = "Use FFI_ArrowArrayStream::new")] pub unsafe fn export_reader_into_raw( reader: Box, out_stream: *mut FFI_ArrowArrayStream, @@ -426,8 +442,7 @@ mod tests { let reader = TestRecordBatchReader::new(schema.clone(), iter); // Export a `RecordBatchReader` through `FFI_ArrowArrayStream` - let mut ffi_stream = FFI_ArrowArrayStream::empty(); - unsafe { export_reader_into_raw(reader, &mut ffi_stream) }; + let mut ffi_stream = FFI_ArrowArrayStream::new(reader); // Get schema from `FFI_ArrowArrayStream` let mut ffi_schema = FFI_ArrowSchema::empty(); diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 4d262b0d106f..2ac550ad0456 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -266,8 +266,7 @@ impl FromPyArrow for ArrayData { validate_pycapsule(array_capsule, "arrow_array")?; let schema_ptr = unsafe { schema_capsule.reference::() }; - let array_ptr = array_capsule.pointer() as *mut FFI_ArrowArray; - let array = unsafe { std::ptr::replace(array_ptr, FFI_ArrowArray::empty()) }; + let array = unsafe { FFI_ArrowArray::from_raw(array_capsule.pointer() as _) }; return ffi::from_ffi(array, schema_ptr).map_err(to_py_err); } @@ -348,8 +347,7 @@ impl FromPyArrow for RecordBatch { validate_pycapsule(array_capsule, "arrow_array")?; let schema_ptr = unsafe { schema_capsule.reference::() }; - let array_ptr = array_capsule.pointer() as *mut FFI_ArrowArray; - let ffi_array = unsafe { std::ptr::replace(array_ptr, FFI_ArrowArray::empty()) }; + let ffi_array = unsafe { FFI_ArrowArray::from_raw(array_capsule.pointer() as _) }; let array_data = ffi::from_ffi(ffi_array, schema_ptr).map_err(to_py_err)?; if !matches!(array_data.data_type(), DataType::Struct(_)) { return Err(PyTypeError::new_err( @@ -397,8 +395,7 @@ impl FromPyArrow for ArrowArrayStreamReader { PyTryInto::try_into(value.getattr("__arrow_c_stream__")?.call0()?)?; validate_pycapsule(capsule, "arrow_array_stream")?; - let stream_ptr = capsule.pointer() as *mut FFI_ArrowArrayStream; - let stream = unsafe { std::ptr::replace(stream_ptr, FFI_ArrowArrayStream::empty()) }; + let stream = unsafe { 
FFI_ArrowArrayStream::from_raw(capsule.pointer() as _) }; let stream_reader = ArrowArrayStreamReader::try_new(stream) .map_err(|err| PyValueError::new_err(err.to_string()))?; @@ -430,8 +427,7 @@ impl IntoPyArrow for Box { // We can't implement `ToPyArrow` for `T: RecordBatchReader + Send` because // there is already a blanket implementation for `T: ToPyArrow`. fn into_pyarrow(self, py: Python) -> PyResult { - let mut stream = FFI_ArrowArrayStream::empty(); - unsafe { export_reader_into_raw(self, &mut stream) }; + let mut stream = FFI_ArrowArrayStream::new(self); let stream_ptr = (&mut stream) as *mut FFI_ArrowArrayStream; let module = py.import("pyarrow")?; From 873d277af47b18ae162f49931441ec772a34935f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 17 Nov 2023 14:18:43 +0000 Subject: [PATCH 1349/1411] Update prost-build requirement from =0.12.1 to =0.12.2 (#5088) Updates the requirements on [prost-build](https://github.com/tokio-rs/prost) to permit the latest version. - [Release notes](https://github.com/tokio-rs/prost/releases) - [Commits](https://github.com/tokio-rs/prost/compare/v0.12.1...v0.12.2) --- updated-dependencies: - dependency-name: prost-build dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 036281528c19..e143b4409983 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -33,5 +33,5 @@ publish = false # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.69", default-features = false } -prost-build = { version = "=0.12.1", default-features = false } +prost-build = { version = "=0.12.2", default-features = false } tonic-build = { version = "=0.10.2", default-features = false, features = ["transport", "prost"] } From 481652a4f8d972b633063158903dbdb0adcf094d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 17 Nov 2023 14:52:26 +0000 Subject: [PATCH 1350/1411] Fix latest clippy lints (#5090) --- object_store/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 2d1d549f9e54..40dca8f756d2 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -1535,11 +1535,11 @@ mod tests { let expected: Vec<_> = files .iter() - .cloned() .filter(|x| { let prefix_match = prefix.as_ref().map(|p| x.prefix_matches(p)).unwrap_or(true); - prefix_match && x > &offset + prefix_match && *x > &offset }) + .cloned() .collect(); assert_eq!(actual, expected, "{prefix:?} - {offset:?}"); From bfe396e3fcbaf7dd88572986dc538ab5922c088b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 17 Nov 2023 16:44:35 +0000 Subject: [PATCH 1351/1411] Ensure arrays passed to MutableArrayData have same type (#5091) (#5092) --- arrow-data/src/transform/mod.rs | 8 ++++++++ arrow/tests/array_transform.rs | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index af25e9c7e3dc..268cf10f2326 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -354,6 +354,14 
@@ impl<'a> MutableArrayData<'a> { ) -> Self { let data_type = arrays[0].data_type(); + for a in arrays.iter().skip(1) { + assert_eq!( + data_type, + a.data_type(), + "Arrays with inconsistent types passed to MutableArrayData" + ) + } + // if any of the arrays has nulls, insertions from any array requires setting bits // as there is at least one array with nulls. let use_nulls = use_nulls | arrays.iter().any(|array| array.null_count() > 0); diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index ccf66e1c30ad..74e2a212736a 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -975,6 +975,14 @@ fn test_extend_nulls_panic() { mutable.extend_nulls(2); } +#[test] +#[should_panic(expected = "Arrays with inconsistent types passed to MutableArrayData")] +fn test_mixed_types() { + let a = StringArray::from(vec!["abc", "def"]).to_data(); + let b = Int32Array::from(vec![1, 2, 3]).to_data(); + MutableArrayData::new(vec![&a, &b], false, 4); +} + /* // this is an old test used on a meanwhile removed dead code // that is still useful when `MutableArrayData` supports fixed-size lists. From dc75a280b46149140eca8dd5e18d31cbadf04716 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 17 Nov 2023 10:09:34 -0800 Subject: [PATCH 1352/1411] feat: cast (Large)List to FixedSizeList (#5081) * feat: cast (Large)List to FixedSizeList * fix: support 'safe' casting of list to FSL * fix: if target is non-null, use non-null sentinel value * Use MutableArrayData * Docs --------- Co-authored-by: Raphael Taylor-Davies --- arrow-cast/src/cast.rs | 264 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 263 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index e44133f81b4a..dd3e271afb0d 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -48,6 +48,7 @@ use crate::parse::{ }; use arrow_array::{builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *}; use arrow_buffer::{i256, ArrowNativeType, Buffer, OffsetBuffer}; +use arrow_data::transform::MutableArrayData; use arrow_data::ArrayData; use arrow_schema::*; use arrow_select::take::take; @@ -138,6 +139,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (List(list_from) | LargeList(list_from), Utf8 | LargeUtf8) => { can_cast_types(list_from.data_type(), to_type) } + (List(list_from) | LargeList(list_from), FixedSizeList(list_to, _)) => { + can_cast_types(list_from.data_type(), list_to.data_type()) + } (List(_), _) => false, (FixedSizeList(list_from,_), List(list_to)) => { list_from.data_type() == list_to.data_type() @@ -279,6 +283,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { /// in integer casts return null /// * Numeric to boolean: 0 returns `false`, any other value returns `true` /// * List to List: the underlying data type is cast +/// * List to FixedSizeList: the underlying data type is cast. 
If safe is true and a list element +/// has the wrong length it will be replaced with NULL, otherwise an error will be returned /// * Primitive to List: a list array with 1 value per slot is created /// * Date32 and Date64: precision lost when going to higher interval /// * Time32 and Time64: precision lost when going to higher interval @@ -799,6 +805,14 @@ pub fn cast_with_options( cast_list_container::(array, cast_options) } } + (List(_), FixedSizeList(field, size)) => { + let array = array.as_list::(); + cast_list_to_fixed_size_list::(array, field, *size, cast_options) + } + (LargeList(_), FixedSizeList(field, size)) => { + let array = array.as_list::(); + cast_list_to_fixed_size_list::(array, field, *size, cast_options) + } (List(_) | LargeList(_), _) => match to_type { Utf8 => value_to_string::(array, cast_options), LargeUtf8 => value_to_string::(array, cast_options), @@ -824,7 +838,6 @@ pub fn cast_with_options( cast_fixed_size_list_to_list::(array) } } - (_, List(ref to)) => cast_values_to_list::(array, to, cast_options), (_, LargeList(ref to)) => cast_values_to_list::(array, to, cast_options), (Decimal128(_, s1), Decimal128(p2, s2)) => { @@ -3206,6 +3219,76 @@ where Ok(Arc::new(list)) } +fn cast_list_to_fixed_size_list( + array: &GenericListArray, + field: &Arc, + size: i32, + cast_options: &CastOptions, +) -> Result +where + OffsetSize: OffsetSizeTrait, +{ + let cap = array.len() * size as usize; + + let mut nulls = (cast_options.safe || array.null_count() != 0).then(|| { + let mut buffer = BooleanBufferBuilder::new(array.len()); + match array.nulls() { + Some(n) => buffer.append_buffer(n.inner()), + None => buffer.append_n(array.len(), true), + } + buffer + }); + + // Nulls in FixedSizeListArray take up space and so we must pad the values + let values = array.values().to_data(); + let mut mutable = MutableArrayData::new(vec![&values], cast_options.safe, cap); + // The end position in values of the last incorrectly-sized list slice + let mut last_pos = 0; + for (idx, w) in array.offsets().windows(2).enumerate() { + let start_pos = w[0].as_usize(); + let end_pos = w[1].as_usize(); + let len = end_pos - start_pos; + + if len != size as usize { + if cast_options.safe || array.is_null(idx) { + if last_pos != start_pos { + // Extend with valid slices + mutable.extend(0, last_pos, start_pos); + } + // Pad this slice with nulls + mutable.extend_nulls(size as _); + nulls.as_mut().unwrap().set_bit(idx, false); + // Set last_pos to the end of this slice's values + last_pos = end_pos + } else { + return Err(ArrowError::CastError(format!( + "Cannot cast to FixedSizeList({size}): value at index {idx} has length {len}", + ))); + } + } + } + + let values = match last_pos { + 0 => array.values().slice(0, cap), // All slices were the correct length + _ => { + if mutable.len() != cap { + // Remaining slices were all correct length + let remaining = cap - mutable.len(); + mutable.extend(0, last_pos, last_pos + remaining) + } + make_array(mutable.freeze()) + } + }; + + // Cast the inner values if necessary + let values = cast_with_options(values.as_ref(), field.data_type(), cast_options)?; + + // Construct the FixedSizeListArray + let nulls = nulls.map(|mut x| x.finish().into()); + let array = FixedSizeListArray::new(field.clone(), size, values, nulls); + Ok(Arc::new(array)) +} + /// Cast the container type of List/Largelist array but not the inner types. /// This function can leave the value data intact and only has to cast the offset dtypes. 
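Seen from the caller's side, the behaviour implemented above is exercised through the public cast kernels. A minimal sketch, using the `arrow` facade crate rather than `arrow-cast` directly (the helper name and element values are illustrative only):

```rust
use std::sync::Arc;
use arrow::array::{Array, ListArray};
use arrow::compute::{cast, cast_with_options, CastOptions};
use arrow::datatypes::{DataType, Field, Int32Type};

fn list_to_fixed_size_list_example() {
    // Two rows: [1, 2, 3] and [4, 5]; the second row does not match the target size of 3.
    let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
        Some(vec![Some(1), Some(2), Some(3)]),
        Some(vec![Some(4), Some(5)]),
    ]);
    let to = DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 3);

    // safe = true (the default): the mismatched row becomes NULL in the output.
    let fsl = cast(&list, &to).unwrap();
    assert_eq!(fsl.null_count(), 1);

    // safe = false: the same mismatch is reported as a cast error instead.
    let strict = CastOptions { safe: false, ..Default::default() };
    assert!(cast_with_options(&list, &to, &strict).is_err());
}
```
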
fn cast_list_container( @@ -3274,6 +3357,8 @@ where #[cfg(test)] mod tests { + use arrow_buffer::NullBuffer; + use super::*; macro_rules! generate_cast_test_case { @@ -7374,6 +7459,183 @@ mod tests { assert_eq!(&expected.value(2), &actual.value(2)); } + #[test] + fn test_cast_list_to_fsl() { + // There four noteworthy cases we should handle: + // 1. No nulls + // 2. Nulls that are always empty + // 3. Nulls that have varying lengths + // 4. Nulls that are correctly sized (same as target list size) + + // Non-null case + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let values = vec![ + Some(vec![Some(1), Some(2), Some(3)]), + Some(vec![Some(4), Some(5), Some(6)]), + ]; + let array = Arc::new(ListArray::from_iter_primitive::( + values.clone(), + )) as ArrayRef; + let expected = Arc::new(FixedSizeListArray::from_iter_primitive::( + values, 3, + )) as ArrayRef; + let actual = cast(array.as_ref(), &DataType::FixedSizeList(field.clone(), 3)).unwrap(); + assert_eq!(expected.as_ref(), actual.as_ref()); + + // Null cases + // Array is [[1, 2, 3], null, [4, 5, 6], null] + let cases = [ + ( + // Zero-length nulls + vec![1, 2, 3, 4, 5, 6], + vec![3, 0, 3, 0], + ), + ( + // Varying-length nulls + vec![1, 2, 3, 0, 0, 4, 5, 6, 0], + vec![3, 2, 3, 1], + ), + ( + // Correctly-sized nulls + vec![1, 2, 3, 0, 0, 0, 4, 5, 6, 0, 0, 0], + vec![3, 3, 3, 3], + ), + ( + // Mixed nulls + vec![1, 2, 3, 4, 5, 6, 0, 0, 0], + vec![3, 0, 3, 3], + ), + ]; + let null_buffer = NullBuffer::from(vec![true, false, true, false]); + + let expected = Arc::new(FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), Some(5), Some(6)]), + None, + ], + 3, + )) as ArrayRef; + + for (values, lengths) in cases.iter() { + let array = Arc::new(ListArray::new( + field.clone(), + OffsetBuffer::from_lengths(lengths.clone()), + Arc::new(Int32Array::from(values.clone())), + Some(null_buffer.clone()), + )) as ArrayRef; + let actual = cast(array.as_ref(), &DataType::FixedSizeList(field.clone(), 3)).unwrap(); + assert_eq!(expected.as_ref(), actual.as_ref()); + } + } + + #[test] + fn test_cast_list_to_fsl_safety() { + let values = vec![ + Some(vec![Some(1), Some(2), Some(3)]), + Some(vec![Some(4), Some(5)]), + Some(vec![Some(6), Some(7), Some(8), Some(9)]), + Some(vec![Some(3), Some(4), Some(5)]), + ]; + let array = Arc::new(ListArray::from_iter_primitive::( + values.clone(), + )) as ArrayRef; + + let res = cast_with_options( + array.as_ref(), + &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 3), + &CastOptions { + safe: false, + ..Default::default() + }, + ); + assert!(res.is_err()); + assert!(format!("{:?}", res) + .contains("Cannot cast to FixedSizeList(3): value at index 1 has length 2")); + + // When safe=true (default), the cast will fill nulls for lists that are + // too short and truncate lists that are too long. 
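+ // (In the current implementation both cases produce NULL entries, as the expected batch below shows.)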
+ let res = cast( + array.as_ref(), + &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 3), + ) + .unwrap(); + let expected = Arc::new(FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, // Too short -> replaced with null + None, // Too long -> replaced with null + Some(vec![Some(3), Some(4), Some(5)]), + ], + 3, + )) as ArrayRef; + assert_eq!(expected.as_ref(), res.as_ref()); + } + + #[test] + fn test_cast_large_list_to_fsl() { + let values = vec![Some(vec![Some(1), Some(2)]), Some(vec![Some(3), Some(4)])]; + let array = Arc::new(LargeListArray::from_iter_primitive::( + values.clone(), + )) as ArrayRef; + let expected = Arc::new(FixedSizeListArray::from_iter_primitive::( + values, 2, + )) as ArrayRef; + let actual = cast( + array.as_ref(), + &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 2), + ) + .unwrap(); + assert_eq!(expected.as_ref(), actual.as_ref()); + } + + #[test] + fn test_cast_list_to_fsl_subcast() { + let array = Arc::new(LargeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(1), Some(2)]), + Some(vec![Some(3), Some(i32::MAX)]), + ], + )) as ArrayRef; + let expected = Arc::new(FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(1), Some(2)]), + Some(vec![Some(3), Some(i32::MAX as i64)]), + ], + 2, + )) as ArrayRef; + let actual = cast( + array.as_ref(), + &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 2), + ) + .unwrap(); + assert_eq!(expected.as_ref(), actual.as_ref()); + + let res = cast_with_options( + array.as_ref(), + &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int16, true)), 2), + &CastOptions { + safe: false, + ..Default::default() + }, + ); + assert!(res.is_err()); + assert!(format!("{:?}", res).contains("Can't cast value 2147483647 to type Int16")); + } + + #[test] + fn test_cast_list_to_fsl_empty() { + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let array = new_empty_array(&DataType::List(field.clone())); + + let target_type = DataType::FixedSizeList(field.clone(), 3); + let expected = new_empty_array(&target_type); + + let actual = cast(array.as_ref(), &target_type).unwrap(); + assert_eq!(expected.as_ref(), actual.as_ref()); + } + fn make_list_array() -> ListArray { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) From 61da64a0557c80af5bb43b5f15c6d8bb6a314cb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sat, 18 Nov 2023 22:15:02 +0100 Subject: [PATCH 1353/1411] Extend aggregation benchmarks (#5096) - Add benchmarks for float64 and integer types - Measure throughput - Increase batch size so that the final reduction step has less of an impact --- arrow/benches/aggregate_kernels.rs | 67 ++++++++++++++++-------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/arrow/benches/aggregate_kernels.rs b/arrow/benches/aggregate_kernels.rs index c7b09f70f70e..1e7b9f894f2a 100644 --- a/arrow/benches/aggregate_kernels.rs +++ b/arrow/benches/aggregate_kernels.rs @@ -17,50 +17,55 @@ #[macro_use] extern crate criterion; -use criterion::Criterion; +use criterion::{Criterion, Throughput}; +use rand::distributions::{Distribution, Standard}; extern crate arrow; use arrow::compute::kernels::aggregate::*; use arrow::util::bench_util::*; use arrow::{array::*, datatypes::Float32Type}; +use arrow_array::types::{Float64Type, Int16Type, Int32Type, Int64Type, Int8Type}; -fn bench_sum(arr_a: &Float32Array) { - 
criterion::black_box(sum(arr_a).unwrap()); -} - -fn bench_min(arr_a: &Float32Array) { - criterion::black_box(min(arr_a).unwrap()); -} - -fn bench_max(arr_a: &Float32Array) { - criterion::black_box(max(arr_a).unwrap()); -} +const BATCH_SIZE: usize = 64 * 1024; -fn bench_min_string(arr_a: &StringArray) { - criterion::black_box(min_string(arr_a).unwrap()); +fn primitive_benchmark(c: &mut Criterion, name: &str) +where + Standard: Distribution, +{ + let nonnull_array = create_primitive_array::(BATCH_SIZE, 0.0); + let nullable_array = create_primitive_array::(BATCH_SIZE, 0.5); + c.benchmark_group(name) + .throughput(Throughput::Bytes( + (std::mem::size_of::() * BATCH_SIZE) as u64, + )) + .bench_function("sum nonnull", |b| b.iter(|| sum(&nonnull_array))) + .bench_function("min nonnull", |b| b.iter(|| min(&nonnull_array))) + .bench_function("max nonnull", |b| b.iter(|| max(&nonnull_array))) + .bench_function("sum nullable", |b| b.iter(|| sum(&nullable_array))) + .bench_function("min nullable", |b| b.iter(|| min(&nullable_array))) + .bench_function("max nullable", |b| b.iter(|| max(&nullable_array))); } fn add_benchmark(c: &mut Criterion) { - let arr_a = create_primitive_array::(512, 0.0); - - c.bench_function("sum 512", |b| b.iter(|| bench_sum(&arr_a))); - c.bench_function("min 512", |b| b.iter(|| bench_min(&arr_a))); - c.bench_function("max 512", |b| b.iter(|| bench_max(&arr_a))); - - let arr_a = create_primitive_array::(512, 0.5); - - c.bench_function("sum nulls 512", |b| b.iter(|| bench_sum(&arr_a))); - c.bench_function("min nulls 512", |b| b.iter(|| bench_min(&arr_a))); - c.bench_function("max nulls 512", |b| b.iter(|| bench_max(&arr_a))); + primitive_benchmark::(c, "float32"); + primitive_benchmark::(c, "float64"); - let arr_b = create_string_array::(512, 0.0); - c.bench_function("min string 512", |b| b.iter(|| bench_min_string(&arr_b))); + primitive_benchmark::(c, "int8"); + primitive_benchmark::(c, "int16"); + primitive_benchmark::(c, "int32"); + primitive_benchmark::(c, "int64"); - let arr_b = create_string_array::(512, 0.5); - c.bench_function("min nulls string 512", |b| { - b.iter(|| bench_min_string(&arr_b)) - }); + { + let nonnull_strings = create_string_array::(BATCH_SIZE, 0.0); + let nullable_strings = create_string_array::(BATCH_SIZE, 0.5); + c.benchmark_group("string") + .throughput(Throughput::Elements(BATCH_SIZE as u64)) + .bench_function("min nonnull", |b| b.iter(|| min_string(&nonnull_strings))) + .bench_function("max nonnull", |b| b.iter(|| max_string(&nonnull_strings))) + .bench_function("min nullable", |b| b.iter(|| min_string(&nullable_strings))) + .bench_function("max nullable", |b| b.iter(|| max_string(&nullable_strings))); + } } criterion_group!(benches, add_benchmark); From 6815bf153d2e2166ce3b63beed8d499aef48c7cc Mon Sep 17 00:00:00 2001 From: Matthieu Maitre Date: Mon, 20 Nov 2023 01:12:16 -0800 Subject: [PATCH 1354/1411] Expand parquet crate overview doc (#5093) * Expand parquet crate overview doc * Run `cargo fmt --all` * Nit: ask to format the code before sending a PR * Add example reading Parquet files from cloud provider * Tweak copy * Fix doctest --------- Co-authored-by: Matthieu Maitre Co-authored-by: Raphael Taylor-Davies --- parquet/CONTRIBUTING.md | 6 ++- parquet/Cargo.toml | 1 + parquet/README.md | 8 ---- parquet/src/arrow/async_reader/store.rs | 25 ++++++++++- parquet/src/arrow/mod.rs | 2 +- parquet/src/file/mod.rs | 2 +- parquet/src/file/reader.rs | 4 +- parquet/src/lib.rs | 59 +++++++++++++++++++++---- parquet/src/record/reader.rs | 17 +++++-- 9 files 
changed, 98 insertions(+), 26 deletions(-) diff --git a/parquet/CONTRIBUTING.md b/parquet/CONTRIBUTING.md index 5670eef08101..922332b15d64 100644 --- a/parquet/CONTRIBUTING.md +++ b/parquet/CONTRIBUTING.md @@ -57,8 +57,10 @@ Run `cargo bench` for benchmarks. ## Docs -To build documentation, run `cargo doc --no-deps`. -To compile and view in the browser, run `cargo doc --no-deps --open`. +To build documentation, run `cargo doc --no-deps --all-features`. +To compile and view in the browser, run `cargo doc --no-deps --all-features --open`. + +Before submitting a pull request, run `cargo fmt --all` to format the change. ## Update Parquet Format diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index bdcbcb81cfce..4cd03c051e62 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -81,6 +81,7 @@ serde_json = { version = "1.0", features = ["std"], default-features = false } arrow = { workspace = true, features = ["ipc", "test_utils", "prettyprint", "json"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } +object_store = { version = "0.8", default-features = false, features = ["azure"] } [package.metadata.docs.rs] all-features = true diff --git a/parquet/README.md b/parquet/README.md index 2e0ab1d52c30..9de7aec4e59a 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -71,14 +71,6 @@ The `parquet` crate provides the following features which may be enabled in your - [x] Predicate pushdown - [x] Parquet format 4.0.0 support -## Support for `wasm32-unknown-unknown` target - -It's possible to build `parquet` for the `wasm32-unknown-unknown` target, however not all the compression features are currently unsupported due to issues with the upstream crates. In particular, the `zstd` and `lz4` features may have compilation issues. See issue [#180](https://github.com/apache/arrow-rs/issues/180). - -``` -cargo build -p parquet --target wasm32-unknown-unknown --no-default-features --features cli,snap,flate2,brotli -``` - ## License Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0. diff --git a/parquet/src/arrow/async_reader/store.rs b/parquet/src/arrow/async_reader/store.rs index 3e27a96124b0..293b91aea3ba 100644 --- a/parquet/src/arrow/async_reader/store.rs +++ b/parquet/src/arrow/async_reader/store.rs @@ -28,7 +28,30 @@ use crate::arrow::async_reader::{AsyncFileReader, MetadataLoader}; use crate::errors::{ParquetError, Result}; use crate::file::metadata::ParquetMetaData; -/// Implements [`AsyncFileReader`] for a parquet file in object storage +/// Reads Parquet files in object storage using [`ObjectStore`]. 
+/// +/// ```no_run +/// # use std::io::stdout; +/// # use std::sync::Arc; +/// # use object_store::azure::MicrosoftAzureBuilder; +/// # use object_store::ObjectStore; +/// # use object_store::path::Path; +/// # use parquet::arrow::async_reader::ParquetObjectReader; +/// # use parquet::arrow::ParquetRecordBatchStreamBuilder; +/// # use parquet::schema::printer::print_parquet_metadata; +/// # async fn run() { +/// // Populate configuration from environment +/// let storage_container = Arc::new(MicrosoftAzureBuilder::from_env().build().unwrap()); +/// let location = Path::from("path/to/blob.parquet"); +/// let meta = storage_container.head(&location).await.unwrap(); +/// println!("Found Blob with {}B at {}", meta.size, meta.location); +/// +/// // Show Parquet metadata +/// let reader = ParquetObjectReader::new(storage_container, meta); +/// let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap(); +/// print_parquet_metadata(&mut stdout(), builder.metadata()); +/// # } +/// ``` #[derive(Clone, Debug)] pub struct ParquetObjectReader { store: Arc, diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 63885643c0fd..950226aef721 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Provides API for reading/writing Arrow +//! High-level API for reading/writing Arrow //! [RecordBatch](arrow_array::RecordBatch)es and //! [Array](arrow_array::Array)s to/from Parquet Files. //! diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs index c20fd38c7f8b..6589d2efaf8b 100644 --- a/parquet/src/file/mod.rs +++ b/parquet/src/file/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Main entrypoint for working with Parquet API. +//! Low level APIs for reading raw parquet data. //! //! Provides access to file and row group readers and writers, record API, metadata, etc. //! diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs index 921f9df290cc..dd6a0fdd2312 100644 --- a/parquet/src/file/reader.rs +++ b/parquet/src/file/reader.rs @@ -134,7 +134,7 @@ pub trait FileReader: Send + Sync { /// Get the `i`th row group reader. Note this doesn't do bound check. fn get_row_group(&self, i: usize) -> Result>; - /// Get full iterator of `Row`s from a file (over all row groups). + /// Get an iterator over the row in this file, see [`RowIter`] for caveats. /// /// Iterator will automatically load the next row group to advance. /// @@ -194,7 +194,7 @@ pub trait RowGroupReader: Send + Sync { /// to read bloom filters. fn get_column_bloom_filter(&self, i: usize) -> Option<&Sbbf>; - /// Get iterator of `Row`s from this row group. + /// Get an iterator over the row in this file, see [`RowIter`] for caveats. /// /// Projected schema can be a subset of or equal to the file schema, when it is None, /// full file schema is assumed. diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index 0279bbc382ea..db5d72634389 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -15,24 +15,67 @@ // specific language governing permissions and limitations // under the License. +//! //! This crate contains the official Native Rust implementation of //! [Apache Parquet](https://parquet.apache.org/), part of //! the [Apache Arrow](https://arrow.apache.org/) project. +//! The crate provides a number of APIs to read and write Parquet files, +//! covering a range of use cases. //! //! 
Please see the [parquet crates.io](https://crates.io/crates/parquet) //! page for feature flags and tips to improve performance. //! -//! # Getting Started -//! Start with some examples: +//! # Format Overview +//! +//! Parquet is a columnar format, which means that unlike row formats like [CSV], values are +//! iterated along columns instead of rows. Parquet is similar in spirit to [Arrow], with Parquet +//! focusing on storage efficiency whereas Arrow prioritizes compute efficiency. +//! +//! Parquet files are partitioned for scalability. Each file contains metadata, +//! along with zero or more "row groups", each row group containing one or +//! more columns. The APIs in this crate reflect this structure. +//! +//! Parquet distinguishes between "logical" and "physical" data types. +//! For instance, strings (logical type) are stored as byte arrays (physical type). +//! Likewise, temporal types like dates, times, timestamps, etc. (logical type) +//! are stored as integers (physical type). This crate exposes both kinds of types. +//! +//! For more details about the Parquet format, see the +//! [Parquet spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format). +//! +//! # APIs +//! +//! This crate exposes a number of APIs for different use-cases. +//! +//! ## Read/Write Arrow +//! +//! The [`arrow`] module allows reading and writing Parquet data to/from Arrow `RecordBatch`. +//! This makes for a simple and performant interface to parquet data, whilst allowing workloads +//! to leverage the wide range of data transforms provided by the [arrow] crate, and by the +//! ecosystem of libraries and services using [Arrow] as an interop format. +//! +//! ## Read/Write Arrow Async +//! +//! When the `async` feature is enabled, [`arrow::async_reader`] and [`arrow::async_writer`] +//! provide the ability to read and write [`arrow`] data asynchronously. Additionally, with the +//! `object_store` feature is enabled, [`ParquetObjectReader`](arrow::async_reader::ParquetObjectReader) +//! provides efficient integration with object storage services such as S3 via the [object_store] +//! crate, automatically optimizing IO based on any predicates or projections provided. //! -//! 1. [mod@file] for reading and writing parquet files using the -//! [ColumnReader](column::reader::ColumnReader) API. +//! ## Read/Write Parquet //! -//! 2. [arrow] for reading and writing parquet files to Arrow -//! `RecordBatch`es +//! Workloads needing finer-grained control, or looking to not take a dependency on arrow, +//! can use the lower-level APIs in [`mod@file`]. These APIs expose the underlying parquet +//! data model, and therefore require knowledge of the underlying parquet format, +//! including the details of [Dremel] record shredding and [Logical Types]. Most workloads +//! should prefer the arrow interfaces. //! -//! 3. [arrow::async_reader] and [arrow::async_writer] for `async` reading -//! and writing parquet files to Arrow `RecordBatch`es (requires the `async` feature). +//! [arrow]: https://docs.rs/arrow/latest/arrow/index.html +//! [Arrow]: https://arrow.apache.org/ +//! [CSV]: https://en.wikipedia.org/wiki/Comma-separated_values +//! [Dremel]: https://research.google/pubs/pub36632/ +//! [Logical Types]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md +//! 
[object_store]: https://docs.rs/object_store/latest/object_store/ /// Defines a an item with an experimental public API /// diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index f98939725517..feaa8055e2dd 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -609,9 +609,20 @@ impl<'a> Either<'a> { } } -/// Iterator of [`Row`]s. -/// It is used either for a single row group to iterate over data in that row group, or -/// an entire file with auto buffering of all row groups. +/// Access parquet data as an iterator of [`Row`] +/// +/// # Caveats +/// +/// Parquet stores data in a columnar fashion using [Dremel] encoding, and is therefore highly +/// optimised for reading data by column, not row. As a consequence applications concerned with +/// performance should prefer the columnar arrow or [ColumnReader] APIs. +/// +/// Additionally the current implementation does not correctly handle repeated fields ([#2394]), +/// and workloads looking to handle such schema should use the other APIs. +/// +/// [#2394]: https://github.com/apache/arrow-rs/issues/2394 +/// [ColumnReader]: crate::file::reader::RowGroupReader::get_column_reader +/// [Dremel]: https://research.google/pubs/pub36632/ pub struct RowIter<'a> { descr: SchemaDescPtr, tree_builder: TreeBuilder, From 4d141a34cb2ab53c07cfd2255351348a830b0224 Mon Sep 17 00:00:00 2001 From: Nathan Fenner Date: Mon, 20 Nov 2023 01:12:41 -0800 Subject: [PATCH 1355/1411] Allow 'zip' compute function to operate on Scalar arrays (#5086) --- arrow-select/src/zip.rs | 156 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 148 insertions(+), 8 deletions(-) diff --git a/arrow-select/src/zip.rs b/arrow-select/src/zip.rs index b5df891544a8..ff2380ef2420 100644 --- a/arrow-select/src/zip.rs +++ b/arrow-select/src/zip.rs @@ -29,19 +29,39 @@ use arrow_schema::ArrowError; /// * `falsy` - Values of this array are taken if mask evaluates `false` pub fn zip( mask: &BooleanArray, - truthy: &dyn Array, - falsy: &dyn Array, + truthy: &dyn Datum, + falsy: &dyn Datum, ) -> Result { + let (truthy, truthy_is_scalar) = truthy.get(); + let (falsy, falsy_is_scalar) = falsy.get(); + if truthy.data_type() != falsy.data_type() { return Err(ArrowError::InvalidArgumentError( "arguments need to have the same data type".into(), )); } - if truthy.len() != falsy.len() || falsy.len() != mask.len() { + + if truthy_is_scalar && truthy.len() != 1 { + return Err(ArrowError::InvalidArgumentError( + "scalar arrays must have 1 element".into(), + )); + } + if !truthy_is_scalar && truthy.len() != mask.len() { + return Err(ArrowError::InvalidArgumentError( + "all arrays should have the same length".into(), + )); + } + if truthy_is_scalar && truthy.len() != 1 { + return Err(ArrowError::InvalidArgumentError( + "scalar arrays must have 1 element".into(), + )); + } + if !falsy_is_scalar && falsy.len() != mask.len() { return Err(ArrowError::InvalidArgumentError( "all arrays should have the same length".into(), )); } + let falsy = falsy.to_data(); let truthy = truthy.to_data(); @@ -56,15 +76,36 @@ pub fn zip( SlicesIterator::new(mask).for_each(|(start, end)| { // the gap needs to be filled with falsy values if start > filled { - mutable.extend(1, filled, start); + if falsy_is_scalar { + for _ in filled..start { + // Copy the first item from the 'falsy' array into the output buffer. 
+ mutable.extend(1, 0, 1); + } + } else { + mutable.extend(1, filled, start); + } } // fill with truthy values - mutable.extend(0, start, end); + if truthy_is_scalar { + for _ in start..end { + // Copy the first item from the 'truthy' array into the output buffer. + mutable.extend(0, 0, 1); + } + } else { + mutable.extend(0, start, end); + } filled = end; }); // the remaining part is falsy - if filled < truthy.len() { - mutable.extend(1, filled, truthy.len()); + if filled < mask.len() { + if falsy_is_scalar { + for _ in filled..mask.len() { + // Copy the first item from the 'falsy' array into the output buffer. + mutable.extend(1, 0, 1); + } + } else { + mutable.extend(1, filled, mask.len()); + } } let data = mutable.freeze(); @@ -76,7 +117,7 @@ mod test { use super::*; #[test] - fn test_zip_kernel() { + fn test_zip_kernel_one() { let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]); let b = Int32Array::from(vec![None, Some(3), Some(6), Some(7), Some(3)]); let mask = BooleanArray::from(vec![true, true, false, false, true]); @@ -85,4 +126,103 @@ mod test { let expected = Int32Array::from(vec![Some(5), None, Some(6), Some(7), Some(1)]); assert_eq!(actual, &expected); } + + #[test] + fn test_zip_kernel_two() { + let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]); + let b = Int32Array::from(vec![None, Some(3), Some(6), Some(7), Some(3)]); + let mask = BooleanArray::from(vec![false, false, true, true, false]); + let out = zip(&mask, &a, &b).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = Int32Array::from(vec![None, Some(3), Some(7), None, Some(3)]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_falsy_1() { + let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]); + + let fallback = Scalar::new(Int32Array::from_value(42, 1)); + + let mask = BooleanArray::from(vec![true, true, false, false, true]); + let out = zip(&mask, &a, &fallback).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = Int32Array::from(vec![Some(5), None, Some(42), Some(42), Some(1)]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_falsy_2() { + let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]); + + let fallback = Scalar::new(Int32Array::from_value(42, 1)); + + let mask = BooleanArray::from(vec![false, false, true, true, false]); + let out = zip(&mask, &a, &fallback).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = Int32Array::from(vec![Some(42), Some(42), Some(7), None, Some(42)]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_truthy_1() { + let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]); + + let fallback = Scalar::new(Int32Array::from_value(42, 1)); + + let mask = BooleanArray::from(vec![true, true, false, false, true]); + let out = zip(&mask, &fallback, &a).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = Int32Array::from(vec![Some(42), Some(42), Some(7), None, Some(42)]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_truthy_2() { + let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]); + + let fallback = Scalar::new(Int32Array::from_value(42, 1)); + + let mask = BooleanArray::from(vec![false, false, true, true, false]); + let out = zip(&mask, &fallback, &a).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = Int32Array::from(vec![Some(5), 
None, Some(42), Some(42), Some(1)]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_both() { + let scalar_truthy = Scalar::new(Int32Array::from_value(42, 1)); + let scalar_falsy = Scalar::new(Int32Array::from_value(123, 1)); + + let mask = BooleanArray::from(vec![true, true, false, false, true]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = Int32Array::from(vec![Some(42), Some(42), Some(123), Some(123), Some(42)]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_none_1() { + let scalar_truthy = Scalar::new(Int32Array::from_value(42, 1)); + let scalar_falsy = Scalar::new(Int32Array::new_null(1)); + + let mask = BooleanArray::from(vec![true, true, false, false, true]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = Int32Array::from(vec![Some(42), Some(42), None, None, Some(42)]); + assert_eq!(actual, &expected); + } + + #[test] + fn test_zip_kernel_scalar_none_2() { + let scalar_truthy = Scalar::new(Int32Array::from_value(42, 1)); + let scalar_falsy = Scalar::new(Int32Array::new_null(1)); + + let mask = BooleanArray::from(vec![false, false, true, true, false]); + let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap(); + let actual = out.as_any().downcast_ref::().unwrap(); + let expected = Int32Array::from(vec![None, None, Some(42), Some(42), None]); + assert_eq!(actual, &expected); + } } From b7248497a43992a6f8da41b25829766b0867891c Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 20 Nov 2023 17:20:10 +0100 Subject: [PATCH 1356/1411] Improve C Data Interface and Add Integration Testing Entrypoints (#5080) * Add C Data Interface integration testing entrypoints * Allow importing FFI_ArrowArray with existing datatype * Clippy * Use ptr::write * Fix null_count for Null type * Use new from_raw() APIs * Address some review comments. * Add unsafe markers * Try to fix CI * Revamp ArrowFile --- arrow-data/src/ffi.rs | 8 +- arrow-integration-testing/Cargo.toml | 5 +- arrow-integration-testing/README.md | 2 +- .../src/bin/arrow-json-integration-test.rs | 49 +--- .../integration_test.rs | 17 +- arrow-integration-testing/src/lib.rs | 228 ++++++++++++++++-- arrow-schema/src/error.rs | 6 + arrow-schema/src/ffi.rs | 10 +- arrow/src/array/ffi.rs | 2 +- arrow/src/ffi.rs | 161 ++++++++----- arrow/src/ffi_stream.rs | 6 +- arrow/src/pyarrow.rs | 6 +- 12 files changed, 363 insertions(+), 137 deletions(-) diff --git a/arrow-data/src/ffi.rs b/arrow-data/src/ffi.rs index 2b4d52601286..589f7dac6d19 100644 --- a/arrow-data/src/ffi.rs +++ b/arrow-data/src/ffi.rs @@ -168,6 +168,12 @@ impl FFI_ArrowArray { .collect::>(); let n_children = children.len() as i64; + // As in the IPC format, emit null_count = length for Null type + let null_count = match data.data_type() { + DataType::Null => data.len(), + _ => data.null_count(), + }; + // create the private data owning everything. // any other data must be added here, e.g. via a struct, to track lifetime. 
let mut private_data = Box::new(ArrayPrivateData { @@ -179,7 +185,7 @@ impl FFI_ArrowArray { Self { length: data.len() as i64, - null_count: data.null_count() as i64, + null_count: null_count as i64, offset: data.offset() as i64, n_buffers, n_children, diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 86c2cb27d297..c29860f09d64 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -27,11 +27,14 @@ edition = { workspace = true } publish = false rust-version = { workspace = true } +[lib] +crate-type = ["lib", "cdylib"] + [features] logging = ["tracing-subscriber"] [dependencies] -arrow = { path = "../arrow", default-features = false, features = ["test_utils", "ipc", "ipc_compression", "json"] } +arrow = { path = "../arrow", default-features = false, features = ["test_utils", "ipc", "ipc_compression", "json", "ffi"] } arrow-flight = { path = "../arrow-flight", default-features = false } arrow-buffer = { path = "../arrow-buffer", default-features = false } arrow-integration-test = { path = "../arrow-integration-test", default-features = false } diff --git a/arrow-integration-testing/README.md b/arrow-integration-testing/README.md index e82591e6b139..dcf39c27fbc5 100644 --- a/arrow-integration-testing/README.md +++ b/arrow-integration-testing/README.md @@ -48,7 +48,7 @@ ln -s arrow/rust ```shell cd arrow -pip install -e dev/archery[docker] +pip install -e dev/archery[integration] ``` ### Build the C++ binaries: diff --git a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs index 187d987a5a0a..9f1abb16a668 100644 --- a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs +++ b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -15,16 +15,13 @@ // specific language governing permissions and limitations // under the License. -use arrow::datatypes::{DataType, Field}; -use arrow::datatypes::{Fields, Schema}; use arrow::error::{ArrowError, Result}; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::FileWriter; use arrow_integration_test::*; -use arrow_integration_testing::read_json_file; +use arrow_integration_testing::{canonicalize_schema, open_json_file}; use clap::Parser; use std::fs::File; -use std::sync::Arc; #[derive(clap::ValueEnum, Debug, Clone)] #[clap(rename_all = "SCREAMING_SNAKE_CASE")] @@ -66,12 +63,12 @@ fn json_to_arrow(json_name: &str, arrow_name: &str, verbose: bool) -> Result<()> eprintln!("Converting {json_name} to {arrow_name}"); } - let json_file = read_json_file(json_name)?; + let json_file = open_json_file(json_name)?; let arrow_file = File::create(arrow_name)?; let mut writer = FileWriter::try_new(arrow_file, &json_file.schema)?; - for b in json_file.batches { + for b in json_file.read_batches()? 
{ writer.write(&b)?; } @@ -113,49 +110,13 @@ fn arrow_to_json(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()> Ok(()) } -fn canonicalize_schema(schema: &Schema) -> Schema { - let fields = schema - .fields() - .iter() - .map(|field| match field.data_type() { - DataType::Map(child_field, sorted) => match child_field.data_type() { - DataType::Struct(fields) if fields.len() == 2 => { - let first_field = fields.get(0).unwrap(); - let key_field = - Arc::new(Field::new("key", first_field.data_type().clone(), false)); - let second_field = fields.get(1).unwrap(); - let value_field = Arc::new(Field::new( - "value", - second_field.data_type().clone(), - second_field.is_nullable(), - )); - - let fields = Fields::from([key_field, value_field]); - let struct_type = DataType::Struct(fields); - let child_field = Field::new("entries", struct_type, false); - - Arc::new(Field::new( - field.name().as_str(), - DataType::Map(Arc::new(child_field), *sorted), - field.is_nullable(), - )) - } - _ => panic!("The child field of Map type should be Struct type with 2 fields."), - }, - _ => field.clone(), - }) - .collect::(); - - Schema::new(fields).with_metadata(schema.metadata().clone()) -} - fn validate(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()> { if verbose { eprintln!("Validating {arrow_name} and {json_name}"); } // open JSON file - let json_file = read_json_file(json_name)?; + let json_file = open_json_file(json_name)?; // open Arrow file let arrow_file = File::open(arrow_name)?; @@ -170,7 +131,7 @@ fn validate(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()> { ))); } - let json_batches = &json_file.batches; + let json_batches = json_file.read_batches()?; // compare number of batches assert!( diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index 81cc4bbe8ed2..c6b5a72ca6e2 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::{read_json_file, ArrowFile}; +use crate::open_json_file; use std::collections::HashMap; use arrow::{ @@ -45,23 +45,16 @@ pub async fn run_scenario(host: &str, port: u16, path: &str) -> Result { let client = FlightServiceClient::connect(url).await?; - let ArrowFile { - schema, batches, .. 
- } = read_json_file(path)?; + let json_file = open_json_file(path)?; - let schema = Arc::new(schema); + let batches = json_file.read_batches()?; + let schema = Arc::new(json_file.schema); let mut descriptor = FlightDescriptor::default(); descriptor.set_type(DescriptorType::Path); descriptor.path = vec![path.to_string()]; - upload_data( - client.clone(), - schema.clone(), - descriptor.clone(), - batches.clone(), - ) - .await?; + upload_data(client.clone(), schema, descriptor.clone(), batches.clone()).await?; verify_data(client, descriptor, &batches).await?; Ok(()) diff --git a/arrow-integration-testing/src/lib.rs b/arrow-integration-testing/src/lib.rs index 2d76be3495c8..553e69b0a1a0 100644 --- a/arrow-integration-testing/src/lib.rs +++ b/arrow-integration-testing/src/lib.rs @@ -19,14 +19,20 @@ use serde_json::Value; -use arrow::datatypes::Schema; -use arrow::error::Result; +use arrow::array::{Array, StructArray}; +use arrow::datatypes::{DataType, Field, Fields, Schema}; +use arrow::error::{ArrowError, Result}; +use arrow::ffi::{from_ffi_and_data_type, FFI_ArrowArray, FFI_ArrowSchema}; use arrow::record_batch::RecordBatch; use arrow::util::test_util::arrow_test_data; use arrow_integration_test::*; use std::collections::HashMap; +use std::ffi::{c_int, CStr, CString}; use std::fs::File; use std::io::BufReader; +use std::iter::zip; +use std::ptr; +use std::sync::Arc; /// The expected username for the basic auth integration test. pub const AUTH_USERNAME: &str = "arrow"; @@ -40,11 +46,68 @@ pub struct ArrowFile { pub schema: Schema, // we can evolve this into a concrete Arrow type // this is temporarily not being read from - pub _dictionaries: HashMap, - pub batches: Vec, + dictionaries: HashMap, + arrow_json: Value, } -pub fn read_json_file(json_name: &str) -> Result { +impl ArrowFile { + pub fn read_batch(&self, batch_num: usize) -> Result { + let b = self.arrow_json["batches"].get(batch_num).unwrap(); + let json_batch: ArrowJsonBatch = serde_json::from_value(b.clone()).unwrap(); + record_batch_from_json(&self.schema, json_batch, Some(&self.dictionaries)) + } + + pub fn read_batches(&self) -> Result> { + self.arrow_json["batches"] + .as_array() + .unwrap() + .iter() + .map(|b| { + let json_batch: ArrowJsonBatch = serde_json::from_value(b.clone()).unwrap(); + record_batch_from_json(&self.schema, json_batch, Some(&self.dictionaries)) + }) + .collect() + } +} + +// Canonicalize the names of map fields in a schema +pub fn canonicalize_schema(schema: &Schema) -> Schema { + let fields = schema + .fields() + .iter() + .map(|field| match field.data_type() { + DataType::Map(child_field, sorted) => match child_field.data_type() { + DataType::Struct(fields) if fields.len() == 2 => { + let first_field = fields.get(0).unwrap(); + let key_field = + Arc::new(Field::new("key", first_field.data_type().clone(), false)); + let second_field = fields.get(1).unwrap(); + let value_field = Arc::new(Field::new( + "value", + second_field.data_type().clone(), + second_field.is_nullable(), + )); + + let fields = Fields::from([key_field, value_field]); + let struct_type = DataType::Struct(fields); + let child_field = Field::new("entries", struct_type, false); + + Arc::new(Field::new( + field.name().as_str(), + DataType::Map(Arc::new(child_field), *sorted), + field.is_nullable(), + )) + } + _ => panic!("The child field of Map type should be Struct type with 2 fields."), + }, + _ => field.clone(), + }) + .collect::(); + + Schema::new(fields).with_metadata(schema.metadata().clone()) +} + +pub fn open_json_file(json_name: 
&str) -> Result { let json_file = File::open(json_name)?; let reader = BufReader::new(json_file); let arrow_json: Value = serde_json::from_reader(reader).unwrap(); @@ -62,17 +125,10 @@ pub fn read_json_file(json_name: &str) -> Result { dictionaries.insert(json_dict.id, json_dict); } } - - let mut batches = vec![]; - for b in arrow_json["batches"].as_array().unwrap() { - let json_batch: ArrowJsonBatch = serde_json::from_value(b.clone()).unwrap(); - let batch = record_batch_from_json(&schema, json_batch, Some(&dictionaries))?; - batches.push(batch); - } Ok(ArrowFile { schema, - _dictionaries: dictionaries, - batches, + dictionaries, + arrow_json, }) } @@ -100,3 +156,147 @@ pub fn read_gzip_json(version: &str, path: &str) -> ArrowJson { let arrow_json: ArrowJson = serde_json::from_str(&s).unwrap(); arrow_json } + +// +// C Data Integration entrypoints +// + +fn cdata_integration_export_schema_from_json( + c_json_name: *const i8, + out: *mut FFI_ArrowSchema, +) -> Result<()> { + let json_name = unsafe { CStr::from_ptr(c_json_name) }; + let f = open_json_file(json_name.to_str()?)?; + let c_schema = FFI_ArrowSchema::try_from(&f.schema)?; + // Move exported schema into output struct + unsafe { ptr::write(out, c_schema) }; + Ok(()) +} + +fn cdata_integration_export_batch_from_json( + c_json_name: *const i8, + batch_num: c_int, + out: *mut FFI_ArrowArray, +) -> Result<()> { + let json_name = unsafe { CStr::from_ptr(c_json_name) }; + let b = open_json_file(json_name.to_str()?)?.read_batch(batch_num.try_into().unwrap())?; + let a = StructArray::from(b).into_data(); + let c_array = FFI_ArrowArray::new(&a); + // Move exported array into output struct + unsafe { ptr::write(out, c_array) }; + Ok(()) +} + +fn cdata_integration_import_schema_and_compare_to_json( + c_json_name: *const i8, + c_schema: *mut FFI_ArrowSchema, +) -> Result<()> { + let json_name = unsafe { CStr::from_ptr(c_json_name) }; + let json_schema = open_json_file(json_name.to_str()?)?.schema; + + // The source ArrowSchema will be released when this is dropped + let imported_schema = unsafe { FFI_ArrowSchema::from_raw(c_schema) }; + let imported_schema = Schema::try_from(&imported_schema)?; + + // compare schemas + if canonicalize_schema(&json_schema) != canonicalize_schema(&imported_schema) { + return Err(ArrowError::ComputeError(format!( + "Schemas do not match.\n- JSON: {:?}\n- Imported: {:?}", + json_schema, imported_schema + ))); + } + Ok(()) +} + +fn compare_batches(a: &RecordBatch, b: &RecordBatch) -> Result<()> { + if a.num_columns() != b.num_columns() { + return Err(ArrowError::InvalidArgumentError( + "batches do not have the same number of columns".to_string(), + )); + } + for (a_column, b_column) in zip(a.columns(), b.columns()) { + if a_column != b_column { + return Err(ArrowError::InvalidArgumentError( + "batch columns are not the same".to_string(), + )); + } + } + Ok(()) +} + +fn cdata_integration_import_batch_and_compare_to_json( + c_json_name: *const i8, + batch_num: c_int, + c_array: *mut FFI_ArrowArray, +) -> Result<()> { + let json_name = unsafe { CStr::from_ptr(c_json_name) }; + let json_batch = + open_json_file(json_name.to_str()?)?.read_batch(batch_num.try_into().unwrap())?; + let schema = json_batch.schema(); + + let data_type_for_import = DataType::Struct(schema.fields.clone()); + let imported_array = unsafe { FFI_ArrowArray::from_raw(c_array) }; + let imported_array = unsafe { from_ffi_and_data_type(imported_array, data_type_for_import) }?; + imported_array.validate_full()?; + let imported_batch = 
RecordBatch::from(StructArray::from(imported_array)); + + compare_batches(&json_batch, &imported_batch) +} + +// If Result is an error, then export a const char* to its string display, otherwise NULL +fn result_to_c_error(result: &std::result::Result) -> *mut i8 { + match result { + Ok(_) => ptr::null_mut(), + Err(e) => CString::new(format!("{}", e)).unwrap().into_raw(), + } +} + +/// Release a const char* exported by result_to_c_error() +/// +/// # Safety +/// +/// The pointer is assumed to have been obtained using CString::into_raw. +#[no_mangle] +pub unsafe extern "C" fn arrow_rs_free_error(c_error: *mut i8) { + if !c_error.is_null() { + drop(unsafe { CString::from_raw(c_error) }); + } +} + +#[no_mangle] +pub extern "C" fn arrow_rs_cdata_integration_export_schema_from_json( + c_json_name: *const i8, + out: *mut FFI_ArrowSchema, +) -> *mut i8 { + let r = cdata_integration_export_schema_from_json(c_json_name, out); + result_to_c_error(&r) +} + +#[no_mangle] +pub extern "C" fn arrow_rs_cdata_integration_import_schema_and_compare_to_json( + c_json_name: *const i8, + c_schema: *mut FFI_ArrowSchema, +) -> *mut i8 { + let r = cdata_integration_import_schema_and_compare_to_json(c_json_name, c_schema); + result_to_c_error(&r) +} + +#[no_mangle] +pub extern "C" fn arrow_rs_cdata_integration_export_batch_from_json( + c_json_name: *const i8, + batch_num: c_int, + out: *mut FFI_ArrowArray, +) -> *mut i8 { + let r = cdata_integration_export_batch_from_json(c_json_name, batch_num, out); + result_to_c_error(&r) +} + +#[no_mangle] +pub extern "C" fn arrow_rs_cdata_integration_import_batch_and_compare_to_json( + c_json_name: *const i8, + batch_num: c_int, + c_array: *mut FFI_ArrowArray, +) -> *mut i8 { + let r = cdata_integration_import_batch_and_compare_to_json(c_json_name, batch_num, c_array); + result_to_c_error(&r) +} diff --git a/arrow-schema/src/error.rs b/arrow-schema/src/error.rs index 8ea533db89af..b7bf8d6e12a6 100644 --- a/arrow-schema/src/error.rs +++ b/arrow-schema/src/error.rs @@ -58,6 +58,12 @@ impl From for ArrowError { } } +impl From for ArrowError { + fn from(error: std::str::Utf8Error) -> Self { + ArrowError::ParseError(error.to_string()) + } +} + impl From for ArrowError { fn from(error: std::string::FromUtf8Error) -> Self { ArrowError::ParseError(error.to_string()) diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index b4d10b814a5d..8a18c77ea291 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -34,7 +34,9 @@ //! assert_eq!(schema, back); //! 
``` -use crate::{ArrowError, DataType, Field, FieldRef, Schema, TimeUnit, UnionFields, UnionMode}; +use crate::{ + ArrowError, DataType, Field, FieldRef, IntervalUnit, Schema, TimeUnit, UnionFields, UnionMode, +}; use std::sync::Arc; use std::{ collections::HashMap, @@ -402,6 +404,9 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { "tDm" => DataType::Duration(TimeUnit::Millisecond), "tDu" => DataType::Duration(TimeUnit::Microsecond), "tDn" => DataType::Duration(TimeUnit::Nanosecond), + "tiM" => DataType::Interval(IntervalUnit::YearMonth), + "tiD" => DataType::Interval(IntervalUnit::DayTime), + "tin" => DataType::Interval(IntervalUnit::MonthDayNano), "+l" => { let c_child = c_schema.child(0); DataType::List(Arc::new(Field::try_from(c_child)?)) @@ -669,6 +674,9 @@ fn get_format_string(dtype: &DataType) -> Result { DataType::Duration(TimeUnit::Millisecond) => Ok("tDm".to_string()), DataType::Duration(TimeUnit::Microsecond) => Ok("tDu".to_string()), DataType::Duration(TimeUnit::Nanosecond) => Ok("tDn".to_string()), + DataType::Interval(IntervalUnit::YearMonth) => Ok("tiM".to_string()), + DataType::Interval(IntervalUnit::DayTime) => Ok("tiD".to_string()), + DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".to_string()), DataType::List(_) => Ok("+l".to_string()), DataType::LargeList(_) => Ok("+L".to_string()), DataType::Struct(_) => Ok("+s".to_string()), diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index e05c256d0128..d4d95a6e1770 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -70,7 +70,7 @@ mod tests { let schema = FFI_ArrowSchema::try_from(expected.data_type())?; // simulate an external consumer by being the consumer - let result = &from_ffi(array, &schema)?; + let result = &unsafe { from_ffi(array, &schema) }?; assert_eq!(result, expected); Ok(()) diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index c13d4c6e5dff..31388bf99358 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -43,7 +43,7 @@ //! let (out_array, out_schema) = to_ffi(&data)?; //! //! // import it -//! let data = from_ffi(out_array, &out_schema)?; +//! let data = unsafe { from_ffi(out_array, &out_schema) }?; //! let array = Int32Array::from(data); //! //! // perform some operation @@ -80,7 +80,7 @@ //! let mut schema = FFI_ArrowSchema::empty(); //! let mut array = FFI_ArrowArray::empty(); //! foreign.export_to_c(addr_of_mut!(array), addr_of_mut!(schema)); -//! Ok(make_array(from_ffi(array, &schema)?)) +//! Ok(make_array(unsafe { from_ffi(array, &schema) }?)) //! } //! ``` @@ -108,6 +108,7 @@ use std::{mem::size_of, ptr::NonNull, sync::Arc}; pub use arrow_data::ffi::FFI_ArrowArray; pub use arrow_schema::ffi::{FFI_ArrowSchema, Flags}; + use arrow_schema::UnionMode; use crate::array::{layout, ArrayData}; @@ -233,32 +234,53 @@ pub fn to_ffi(data: &ArrayData) -> Result<(FFI_ArrowArray, FFI_ArrowSchema)> { /// # Safety /// /// This struct assumes that the incoming data agrees with the C data interface. -pub fn from_ffi(array: FFI_ArrowArray, schema: &FFI_ArrowSchema) -> Result { +pub unsafe fn from_ffi(array: FFI_ArrowArray, schema: &FFI_ArrowSchema) -> Result { + let dt = DataType::try_from(schema)?; let array = Arc::new(array); - let tmp = ArrowArray { + let tmp = ImportedArrowArray { array: &array, - schema, + data_type: dt, + owner: &array, + }; + tmp.consume() +} + +/// Import [ArrayData] from the C Data Interface +/// +/// # Safety +/// +/// This struct assumes that the incoming data agrees with the C data interface. 
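The entry point below, `from_ffi_and_data_type`, covers the case the integration code above needs: the consumer already knows the `DataType` (for instance from a schema imported separately) and only receives an `FFI_ArrowArray`. A minimal round-trip sketch with illustrative helper names, assuming the `arrow::ffi` re-exports shown in this patch:

```rust
use arrow::array::{Array, ArrayData, Int32Array};
use arrow::datatypes::DataType;
use arrow::error::Result;
use arrow::ffi::{from_ffi_and_data_type, FFI_ArrowArray};

// Import an FFI_ArrowArray whose type is already known out of band,
// so no FFI_ArrowSchema has to travel alongside it.
fn import_int32(array: FFI_ArrowArray) -> Result<Int32Array> {
    let data: ArrayData = unsafe { from_ffi_and_data_type(array, DataType::Int32) }?;
    Ok(Int32Array::from(data))
}

fn round_trip() -> Result<()> {
    let original = Int32Array::from(vec![Some(1), None, Some(3)]);
    // Export through the C Data Interface ...
    let exported = FFI_ArrowArray::new(&original.to_data());
    // ... and import it back using only the known data type.
    let imported = import_int32(exported)?;
    assert_eq!(original, imported);
    Ok(())
}
```
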
+pub unsafe fn from_ffi_and_data_type( + array: FFI_ArrowArray, + data_type: DataType, +) -> Result { + let array = Arc::new(array); + let tmp = ImportedArrowArray { + array: &array, + data_type, owner: &array, }; tmp.consume() } #[derive(Debug)] -struct ArrowArray<'a> { +struct ImportedArrowArray<'a> { array: &'a FFI_ArrowArray, - schema: &'a FFI_ArrowSchema, + data_type: DataType, owner: &'a Arc, } -impl<'a> ArrowArray<'a> { +impl<'a> ImportedArrowArray<'a> { fn consume(self) -> Result { - let dt = DataType::try_from(self.schema)?; let len = self.array.len(); let offset = self.array.offset(); - let null_count = self.array.null_count(); + let null_count = match &self.data_type { + DataType::Null => 0, + _ => self.array.null_count(), + }; - let data_layout = layout(&dt); - let buffers = self.buffers(data_layout.can_contain_null_mask, &dt)?; + let data_layout = layout(&self.data_type); + let buffers = self.buffers(data_layout.can_contain_null_mask)?; let null_bit_buffer = if data_layout.can_contain_null_mask { self.null_bit_buffer() @@ -266,14 +288,9 @@ impl<'a> ArrowArray<'a> { None }; - let mut child_data = (0..self.array.num_children()) - .map(|i| { - let child = self.child(i); - child.consume() - }) - .collect::>>()?; + let mut child_data = self.consume_children()?; - if let Some(d) = self.dictionary() { + if let Some(d) = self.dictionary()? { // For dictionary type there should only be a single child, so we don't need to worry if // there are other children added above. assert!(child_data.is_empty()); @@ -283,7 +300,7 @@ impl<'a> ArrowArray<'a> { // Should FFI be checking validity? Ok(unsafe { ArrayData::new_unchecked( - dt, + self.data_type, len, Some(null_count), null_bit_buffer, @@ -294,14 +311,49 @@ impl<'a> ArrowArray<'a> { }) } + fn consume_children(&self) -> Result> { + match &self.data_type { + DataType::List(field) + | DataType::FixedSizeList(field, _) + | DataType::LargeList(field) + | DataType::Map(field, _) => Ok([self.consume_child(0, field.data_type())?].to_vec()), + DataType::Struct(fields) => { + assert!(fields.len() == self.array.num_children()); + fields + .iter() + .enumerate() + .map(|(i, field)| self.consume_child(i, field.data_type())) + .collect::>>() + } + DataType::Union(union_fields, _) => { + assert!(union_fields.len() == self.array.num_children()); + union_fields + .iter() + .enumerate() + .map(|(i, (_, field))| self.consume_child(i, field.data_type())) + .collect::>>() + } + _ => Ok(Vec::new()), + } + } + + fn consume_child(&self, index: usize, child_type: &DataType) -> Result { + ImportedArrowArray { + array: self.array.child(index), + data_type: child_type.clone(), + owner: self.owner, + } + .consume() + } + /// returns all buffers, as organized by Rust (i.e. 
null buffer is skipped if it's present /// in the spec of the type) - fn buffers(&self, can_contain_null_mask: bool, dt: &DataType) -> Result> { + fn buffers(&self, can_contain_null_mask: bool) -> Result> { // + 1: skip null buffer let buffer_begin = can_contain_null_mask as usize; (buffer_begin..self.array.num_buffers()) .map(|index| { - let len = self.buffer_len(index, dt)?; + let len = self.buffer_len(index, &self.data_type)?; match unsafe { create_buffer(self.owner.clone(), self.array, index, len) } { Some(buf) => Ok(buf), @@ -388,25 +440,20 @@ impl<'a> ArrowArray<'a> { unsafe { create_buffer(self.owner.clone(), self.array, 0, buffer_len) } } - fn child(&self, index: usize) -> ArrowArray { - ArrowArray { - array: self.array.child(index), - schema: self.schema.child(index), - owner: self.owner, - } - } - - fn dictionary(&self) -> Option { - match (self.array.dictionary(), self.schema.dictionary()) { - (Some(array), Some(schema)) => Some(ArrowArray { + fn dictionary(&self) -> Result> { + match (self.array.dictionary(), &self.data_type) { + (Some(array), DataType::Dictionary(_, value_type)) => Ok(Some(ImportedArrowArray { array, - schema, + data_type: value_type.as_ref().clone(), owner: self.owner, - }), - (None, None) => None, - _ => panic!( - "Dictionary should both be set or not set in FFI_ArrowArray and FFI_ArrowSchema" - ), + })), + (Some(_), _) => Err(ArrowError::CDataInterface( + "Got dictionary in FFI_ArrowArray for non-dictionary data type".to_string(), + )), + (None, DataType::Dictionary(_, _)) => Err(ArrowError::CDataInterface( + "Missing dictionary in FFI_ArrowArray for dictionary data type".to_string(), + )), + (_, _) => Ok(None), } } } @@ -443,7 +490,7 @@ mod tests { let (array, schema) = to_ffi(&array.into_data()).unwrap(); // (simulate consumer) import it - let array = Int32Array::from(from_ffi(array, &schema).unwrap()); + let array = Int32Array::from(unsafe { from_ffi(array, &schema) }.unwrap()); let array = kernels::numeric::add(&array, &array).unwrap(); // verify @@ -487,7 +534,7 @@ mod tests { let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); // perform some operation @@ -517,7 +564,7 @@ mod tests { let (array, schema) = to_ffi(&original_array.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); // perform some operation @@ -539,7 +586,7 @@ mod tests { let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); // perform some operation @@ -608,7 +655,7 @@ mod tests { let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); // downcast @@ -648,7 +695,7 @@ mod tests { let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); // perform some operation @@ -693,7 +740,7 @@ mod tests { let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); 
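        // Every `from_ffi` call site in these tests changes the same way: the
        // function is now `unsafe`, so the caller asserts that the
        // FFI_ArrowArray/FFI_ArrowSchema pair honours the C data interface;
        // that holds trivially here because both structs were just produced by `to_ffi`.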
// perform some operation @@ -719,7 +766,7 @@ mod tests { let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); // perform some operation @@ -748,7 +795,7 @@ mod tests { let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); // perform some operation @@ -784,7 +831,7 @@ mod tests { let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); // perform some operation @@ -845,7 +892,7 @@ mod tests { let (array, schema) = to_ffi(&list_data)?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); // perform some operation @@ -890,7 +937,7 @@ mod tests { let (array, schema) = to_ffi(&dict_array.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); // perform some operation @@ -928,7 +975,7 @@ mod tests { } // (simulate consumer) import it - let data = from_ffi(out_array, &out_schema)?; + let data = unsafe { from_ffi(out_array, &out_schema) }?; let array = make_array(data); // perform some operation @@ -949,7 +996,7 @@ mod tests { let (array, schema) = to_ffi(&array.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); // perform some operation @@ -986,7 +1033,7 @@ mod tests { let (array, schema) = to_ffi(&map_array.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); // perform some operation @@ -1009,7 +1056,7 @@ mod tests { let (array, schema) = to_ffi(&struct_array.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); // perform some operation @@ -1033,7 +1080,7 @@ mod tests { let (array, schema) = to_ffi(&union.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = make_array(data); let array = array.as_any().downcast_ref::().unwrap(); @@ -1094,7 +1141,7 @@ mod tests { let (array, schema) = to_ffi(&union.to_data())?; // (simulate consumer) import it - let data = from_ffi(array, &schema)?; + let data = unsafe { from_ffi(array, &schema) }?; let array = UnionArray::from(data); let expected_type_ids = vec![0_i8, 0, 1, 0]; diff --git a/arrow/src/ffi_stream.rs b/arrow/src/ffi_stream.rs index 123669aa61be..bbec71e8837e 100644 --- a/arrow/src/ffi_stream.rs +++ b/arrow/src/ffi_stream.rs @@ -357,9 +357,11 @@ impl Iterator for ArrowArrayStreamReader { } let schema_ref = self.schema(); + // NOTE: this parses the FFI_ArrowSchema again on each iterator call; + // should probably use from_ffi_and_data_type() instead. 
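        // A possible shape of that change (sketch only): derive the DataType once
        // from `schema_ref` and feed it straight to the type-based import path, e.g.
        //     let data_type = DataType::Struct(schema_ref.fields().clone());
        //     let data = unsafe { from_ffi_and_data_type(array, data_type) }.ok()?;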
let schema = FFI_ArrowSchema::try_from(schema_ref.as_ref()).ok()?; - let data = from_ffi(array, &schema).ok()?; + let data = unsafe { from_ffi(array, &schema) }.ok()?; let record_batch = RecordBatch::from(StructArray::from(data)); @@ -464,7 +466,7 @@ mod tests { break; } - let array = from_ffi(ffi_array, &ffi_schema).unwrap(); + let array = unsafe { from_ffi(ffi_array, &ffi_schema) }.unwrap(); let record_batch = RecordBatch::from(StructArray::from(array)); produced_batches.push(record_batch); diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 2ac550ad0456..8302f8741b60 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -267,7 +267,7 @@ impl FromPyArrow for ArrayData { let schema_ptr = unsafe { schema_capsule.reference::() }; let array = unsafe { FFI_ArrowArray::from_raw(array_capsule.pointer() as _) }; - return ffi::from_ffi(array, schema_ptr).map_err(to_py_err); + return unsafe { ffi::from_ffi(array, schema_ptr) }.map_err(to_py_err); } validate_class("Array", value)?; @@ -287,7 +287,7 @@ impl FromPyArrow for ArrayData { ), )?; - ffi::from_ffi(array, &schema).map_err(to_py_err) + unsafe { ffi::from_ffi(array, &schema) }.map_err(to_py_err) } } @@ -348,7 +348,7 @@ impl FromPyArrow for RecordBatch { let schema_ptr = unsafe { schema_capsule.reference::() }; let ffi_array = unsafe { FFI_ArrowArray::from_raw(array_capsule.pointer() as _) }; - let array_data = ffi::from_ffi(ffi_array, schema_ptr).map_err(to_py_err)?; + let array_data = unsafe { ffi::from_ffi(ffi_array, schema_ptr) }.map_err(to_py_err)?; if !matches!(array_data.data_type(), DataType::Struct(_)) { return Err(PyTypeError::new_err( "Expected Struct type from __arrow_c_array.", From fbbb61d94282165f9bb9f73fb4d00a3af16d4aee Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 21 Nov 2023 08:18:21 +1100 Subject: [PATCH 1357/1411] Allow writing null valued keys in JSON (#5065) * Allow writing null valued keys in JSON * Trigger * Refactor keep nulls to be runtime config * Rename option * Rename option --- arrow-array/src/numeric.rs | 1 - arrow-json/src/lib.rs | 2 +- arrow-json/src/writer.rs | 515 +++++++++++++++++--- arrow-json/test/data/nested_with_nulls.json | 4 + arrow/src/ffi.rs | 2 - arrow/tests/array_cast.rs | 1 - object_store/src/gcp/builder.rs | 2 +- 7 files changed, 461 insertions(+), 66 deletions(-) create mode 100644 arrow-json/test/data/nested_with_nulls.json diff --git a/arrow-array/src/numeric.rs b/arrow-array/src/numeric.rs index ad7b3eca1dbc..b5e474ba696a 100644 --- a/arrow-array/src/numeric.rs +++ b/arrow-array/src/numeric.rs @@ -618,7 +618,6 @@ mod tests { let mask = 0b01010101_01010101_10101010_10101010; let actual = UInt16Type::mask_from_u64(mask); let expected = expected_mask!(i16, mask); - dbg!(&expected); let expected = m16x32::from_cast(i16x32::from_slice_unaligned(expected.as_slice())); assert_eq!(expected, actual); diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs index e69eaaba3ef8..e39882e52620 100644 --- a/arrow-json/src/lib.rs +++ b/arrow-json/src/lib.rs @@ -82,7 +82,7 @@ pub type RawReader = Reader; pub type RawReaderBuilder = ReaderBuilder; pub use self::reader::{Reader, ReaderBuilder}; -pub use self::writer::{ArrayWriter, LineDelimitedWriter, Writer}; +pub use self::writer::{ArrayWriter, LineDelimitedWriter, Writer, WriterBuilder}; use half::f16; use serde_json::{Number, Value}; diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 5ecfc932364b..4f74817ca1e3 100644 --- a/arrow-json/src/writer.rs +++ 
b/arrow-json/src/writer.rs @@ -92,6 +92,10 @@ //! let buf = writer.into_inner(); //! assert_eq!(r#"[{"a":1},{"a":2},{"a":3}]"#, String::from_utf8(buf).unwrap()) //! ``` +//! +//! [`LineDelimitedWriter`] and [`ArrayWriter`] will omit writing keys with null values. +//! In order to explicitly write null values for keys, configure a custom [`Writer`] by +//! using a [`WriterBuilder`] to construct a [`Writer`]. use std::iter; use std::{fmt::Debug, io::Write}; @@ -124,6 +128,7 @@ where fn struct_array_to_jsonmap_array( array: &StructArray, + explicit_nulls: bool, ) -> Result>, ArrowError> { let inner_col_names = array.column_names(); @@ -132,13 +137,26 @@ fn struct_array_to_jsonmap_array( .collect::>>(); for (j, struct_col) in array.columns().iter().enumerate() { - set_column_for_json_rows(&mut inner_objs, struct_col, inner_col_names[j])? + set_column_for_json_rows( + &mut inner_objs, + struct_col, + inner_col_names[j], + explicit_nulls, + )? } Ok(inner_objs) } /// Converts an arrow [`Array`] into a `Vec` of Serde JSON [`serde_json::Value`]'s pub fn array_to_json_array(array: &dyn Array) -> Result, ArrowError> { + // For backwards compatibility, default to skip nulls + array_to_json_array_internal(array, false) +} + +fn array_to_json_array_internal( + array: &dyn Array, + explicit_nulls: bool, +) -> Result, ArrowError> { match array.data_type() { DataType::Null => Ok(iter::repeat(Value::Null).take(array.len()).collect()), DataType::Boolean => Ok(array @@ -180,32 +198,44 @@ pub fn array_to_json_array(array: &dyn Array) -> Result, ArrowError> DataType::List(_) => as_list_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array(&v)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal( + &v, + explicit_nulls, + )?)), None => Ok(Value::Null), }) .collect(), DataType::LargeList(_) => as_large_list_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array(&v)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal( + &v, + explicit_nulls, + )?)), None => Ok(Value::Null), }) .collect(), DataType::FixedSizeList(_, _) => as_fixed_size_list_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array(&v)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal( + &v, + explicit_nulls, + )?)), None => Ok(Value::Null), }) .collect(), DataType::Struct(_) => { - let jsonmaps = struct_array_to_jsonmap_array(array.as_struct())?; + let jsonmaps = struct_array_to_jsonmap_array(array.as_struct(), explicit_nulls)?; Ok(jsonmaps.into_iter().map(Value::Object).collect()) } DataType::Map(_, _) => as_map_array(array) .iter() .map(|maybe_value| match maybe_value { - Some(v) => Ok(Value::Array(array_to_json_array(&v)?)), + Some(v) => Ok(Value::Array(array_to_json_array_internal( + &v, + explicit_nulls, + )?)), None => Ok(Value::Null), }) .collect(), @@ -216,14 +246,16 @@ pub fn array_to_json_array(array: &dyn Array) -> Result, ArrowError> } macro_rules! 
set_column_by_array_type { - ($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident) => { + ($cast_fn:ident, $col_name:ident, $rows:ident, $array:ident, $explicit_nulls:ident) => { let arr = $cast_fn($array); $rows .iter_mut() .zip(arr.iter()) .for_each(|(row, maybe_value)| { - if let Some(v) = maybe_value { - row.insert($col_name.to_string(), v.into()); + if let Some(j) = maybe_value.map(Into::into) { + row.insert($col_name.to_string(), j); + } else if $explicit_nulls { + row.insert($col_name.to_string(), Value::Null); } }); }; @@ -233,6 +265,7 @@ fn set_column_by_primitive_type( rows: &mut [JsonMap], array: &ArrayRef, col_name: &str, + explicit_nulls: bool, ) where T: ArrowPrimitiveType, T::Native: JsonSerializable, @@ -242,9 +275,10 @@ fn set_column_by_primitive_type( rows.iter_mut() .zip(primitive_arr.iter()) .for_each(|(row, maybe_value)| { - // when value is null, we simply skip setting the key if let Some(j) = maybe_value.and_then(|v| v.into_json_value()) { row.insert(col_name.to_string(), j); + } else if explicit_nulls { + row.insert(col_name.to_string(), Value::Null); } }); } @@ -253,52 +287,57 @@ fn set_column_for_json_rows( rows: &mut [JsonMap], array: &ArrayRef, col_name: &str, + explicit_nulls: bool, ) -> Result<(), ArrowError> { match array.data_type() { DataType::Int8 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, explicit_nulls); } DataType::Int16 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, explicit_nulls); } DataType::Int32 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, explicit_nulls); } DataType::Int64 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, explicit_nulls); } DataType::UInt8 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, explicit_nulls); } DataType::UInt16 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, explicit_nulls); } DataType::UInt32 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, explicit_nulls); } DataType::UInt64 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, explicit_nulls); } DataType::Float16 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, explicit_nulls); } DataType::Float32 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, explicit_nulls); } DataType::Float64 => { - set_column_by_primitive_type::(rows, array, col_name); + set_column_by_primitive_type::(rows, array, col_name, explicit_nulls); } DataType::Null => { - // when value is null, we simply skip setting the key + if explicit_nulls { + rows.iter_mut().for_each(|row| { + row.insert(col_name.to_string(), Value::Null); + }); + } } DataType::Boolean => { - set_column_by_array_type!(as_boolean_array, col_name, rows, array); + set_column_by_array_type!(as_boolean_array, col_name, rows, array, explicit_nulls); } DataType::Utf8 => { - set_column_by_array_type!(as_string_array, col_name, rows, array); + set_column_by_array_type!(as_string_array, col_name, rows, array, explicit_nulls); } 
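        // Same convention for the remaining scalar and list arms: a present value
        // is written as-is, while a null either becomes an explicit JSON `null`
        // (when `explicit_nulls` is true) or the key is omitted from the object entirely.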
DataType::LargeUtf8 => { - set_column_by_array_type!(as_largestring_array, col_name, rows, array); + set_column_by_array_type!(as_largestring_array, col_name, rows, array, explicit_nulls); } DataType::Date32 | DataType::Date64 @@ -310,16 +349,19 @@ fn set_column_for_json_rows( let formatter = ArrayFormatter::try_new(array.as_ref(), &options)?; let nulls = array.nulls(); rows.iter_mut().enumerate().for_each(|(idx, row)| { - if nulls.map(|x| x.is_valid(idx)).unwrap_or(true) { - row.insert( - col_name.to_string(), - formatter.value(idx).to_string().into(), - ); - } + let maybe_value = nulls + .map(|x| x.is_valid(idx)) + .unwrap_or(true) + .then(|| formatter.value(idx).to_string().into()); + if let Some(j) = maybe_value { + row.insert(col_name.to_string(), j); + } else if explicit_nulls { + row.insert(col_name.to_string(), Value::Null); + }; }); } DataType::Struct(_) => { - let inner_objs = struct_array_to_jsonmap_array(array.as_struct())?; + let inner_objs = struct_array_to_jsonmap_array(array.as_struct(), explicit_nulls)?; rows.iter_mut().zip(inner_objs).for_each(|(row, obj)| { row.insert(col_name.to_string(), Value::Object(obj)); }); @@ -328,8 +370,13 @@ fn set_column_for_json_rows( let listarr = as_list_array(array); rows.iter_mut().zip(listarr.iter()).try_for_each( |(row, maybe_value)| -> Result<(), ArrowError> { - if let Some(v) = maybe_value { - row.insert(col_name.to_string(), Value::Array(array_to_json_array(&v)?)); + let maybe_value = maybe_value + .map(|v| array_to_json_array_internal(&v, explicit_nulls).map(Value::Array)) + .transpose()?; + if let Some(j) = maybe_value { + row.insert(col_name.to_string(), j); + } else if explicit_nulls { + row.insert(col_name.to_string(), Value::Null); } Ok(()) }, @@ -339,9 +386,13 @@ fn set_column_for_json_rows( let listarr = as_large_list_array(array); rows.iter_mut().zip(listarr.iter()).try_for_each( |(row, maybe_value)| -> Result<(), ArrowError> { - if let Some(v) = maybe_value { - let val = array_to_json_array(&v)?; - row.insert(col_name.to_string(), Value::Array(val)); + let maybe_value = maybe_value + .map(|v| array_to_json_array_internal(&v, explicit_nulls).map(Value::Array)) + .transpose()?; + if let Some(j) = maybe_value { + row.insert(col_name.to_string(), j); + } else if explicit_nulls { + row.insert(col_name.to_string(), Value::Null); } Ok(()) }, @@ -350,7 +401,7 @@ fn set_column_for_json_rows( DataType::Dictionary(_, value_type) => { let hydrated = arrow_cast::cast::cast(&array, value_type) .expect("cannot cast dictionary to underlying values"); - set_column_for_json_rows(rows, &hydrated, col_name)?; + set_column_for_json_rows(rows, &hydrated, col_name, explicit_nulls)?; } DataType::Map(_, _) => { let maparr = as_map_array(array); @@ -367,7 +418,7 @@ fn set_column_for_json_rows( } let keys = keys.as_string::(); - let values = array_to_json_array(values)?; + let values = array_to_json_array_internal(values, explicit_nulls)?; let mut kv = keys.iter().zip(values); @@ -401,6 +452,14 @@ fn set_column_for_json_rows( /// [`JsonMap`]s (objects) pub fn record_batches_to_json_rows( batches: &[&RecordBatch], +) -> Result>, ArrowError> { + // For backwards compatibility, default to skip nulls + record_batches_to_json_rows_internal(batches, false) +} + +fn record_batches_to_json_rows_internal( + batches: &[&RecordBatch], + explicit_nulls: bool, ) -> Result>, ArrowError> { let mut rows: Vec> = iter::repeat(JsonMap::new()) .take(batches.iter().map(|b| b.num_rows()).sum()) @@ -414,7 +473,7 @@ pub fn record_batches_to_json_rows( let row_slice = &mut 
rows[base..base + batch.num_rows()]; for (j, col) in batch.columns().iter().enumerate() { let col_name = schema.field(j).name(); - set_column_for_json_rows(row_slice, col, col_name)? + set_column_for_json_rows(row_slice, col, col_name, explicit_nulls)? } base += row_count; } @@ -450,7 +509,9 @@ pub trait JsonFormat: Debug + Default { } } -/// Produces JSON output with one record per line. For example +/// Produces JSON output with one record per line. +/// +/// For example: /// /// ```json /// {"foo":1} @@ -467,7 +528,9 @@ impl JsonFormat for LineDelimited { } } -/// Produces JSON output as a single JSON array. For example +/// Produces JSON output as a single JSON array. +/// +/// For example: /// /// ```json /// [{"foo":1},{"bar":1}] @@ -494,16 +557,101 @@ impl JsonFormat for JsonArray { } } -/// A JSON writer which serializes [`RecordBatch`]es to newline delimited JSON objects +/// A JSON writer which serializes [`RecordBatch`]es to newline delimited JSON objects. pub type LineDelimitedWriter = Writer; -/// A JSON writer which serializes [`RecordBatch`]es to JSON arrays +/// A JSON writer which serializes [`RecordBatch`]es to JSON arrays. pub type ArrayWriter = Writer; +/// JSON writer builder. +#[derive(Debug, Clone, Default)] +pub struct WriterBuilder { + /// Controls whether null values should be written explicitly for keys + /// in objects, or whether the key should be omitted entirely. + explicit_nulls: bool, +} + +impl WriterBuilder { + /// Create a new builder for configuring JSON writing options. + /// + /// # Example + /// + /// ``` + /// # use arrow_json::{Writer, WriterBuilder}; + /// # use arrow_json::writer::LineDelimited; + /// # use std::fs::File; + /// + /// fn example() -> Writer { + /// let file = File::create("target/out.json").unwrap(); + /// + /// // create a builder that keeps keys with null values + /// let builder = WriterBuilder::new().with_explicit_nulls(true); + /// let writer = builder.build::<_, LineDelimited>(file); + /// + /// writer + /// } + /// ``` + pub fn new() -> Self { + Self::default() + } + + /// Returns `true` if this writer is configured to keep keys with null values. + pub fn explicit_nulls(&self) -> bool { + self.explicit_nulls + } + + /// Set whether to keep keys with null values, or to omit writing them. + /// + /// For example, with [`LineDelimited`] format: + /// + /// Skip nulls (set to `false`): + /// + /// ```json + /// {"foo":1} + /// {"foo":1,"bar":2} + /// {} + /// ``` + /// + /// Keep nulls (set to `true`): + /// + /// ```json + /// {"foo":1,"bar":null} + /// {"foo":1,"bar":2} + /// {"foo":null,"bar":null} + /// ``` + /// + /// Default is to skip nulls (set to `false`). + pub fn with_explicit_nulls(mut self, explicit_nulls: bool) -> Self { + self.explicit_nulls = explicit_nulls; + self + } + + /// Create a new `Writer` with specified `JsonFormat` and builder options. + pub fn build(self, writer: W) -> Writer + where + W: Write, + F: JsonFormat, + { + Writer { + writer, + started: false, + finished: false, + format: F::default(), + explicit_nulls: self.explicit_nulls, + } + } +} + /// A JSON writer which serializes [`RecordBatch`]es to a stream of -/// `u8` encoded JSON objects. See the module level documentation for -/// detailed usage and examples. The specific format of the stream is -/// controlled by the [`JsonFormat`] type parameter. +/// `u8` encoded JSON objects. +/// +/// See the module level documentation for detailed usage and examples. 
+/// The specific format of the stream is controlled by the [`JsonFormat`] +/// type parameter. +/// +/// By default the writer will skip writing keys with null values for +/// backward compatibility. See [`WriterBuilder`] on how to customize +/// this behaviour when creating a new writer. #[derive(Debug)] pub struct Writer where @@ -521,6 +669,9 @@ where /// Determines how the byte stream is formatted format: F, + + /// Whether keys with null values should be written or skipped + explicit_nulls: bool, } impl Writer @@ -535,6 +686,7 @@ where started: false, finished: false, format: F::default(), + explicit_nulls: false, } } @@ -556,7 +708,7 @@ where /// Convert the `RecordBatch` into JSON rows, and write them to the output pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { - for row in record_batches_to_json_rows(&[batch])? { + for row in record_batches_to_json_rows_internal(&[batch], self.explicit_nulls)? { self.write_row(&Value::Object(row))?; } Ok(()) @@ -564,7 +716,7 @@ where /// Convert the [`RecordBatch`] into JSON rows, and write them to the output pub fn write_batches(&mut self, batches: &[&RecordBatch]) -> Result<(), ArrowError> { - for row in record_batches_to_json_rows(batches)? { + for row in record_batches_to_json_rows_internal(batches, self.explicit_nulls)? { self.write_row(&Value::Object(row))?; } Ok(()) @@ -609,7 +761,7 @@ mod tests { use serde_json::json; - use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder}; + use arrow_array::builder::{Int32Builder, Int64Builder, MapBuilder, StringBuilder}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::ArrayData; @@ -1203,7 +1355,7 @@ mod tests { ); } - fn test_write_for_file(test_file: &str) { + fn test_write_for_file(test_file: &str, remove_nulls: bool) { let file = File::open(test_file).unwrap(); let mut reader = BufReader::new(file); let (schema, _) = infer_json_schema(&mut reader, None).unwrap(); @@ -1215,18 +1367,27 @@ mod tests { let mut buf = Vec::new(); { - let mut writer = LineDelimitedWriter::new(&mut buf); - writer.write_batches(&[&batch]).unwrap(); + if remove_nulls { + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write_batches(&[&batch]).unwrap(); + } else { + let mut writer = WriterBuilder::new() + .with_explicit_nulls(true) + .build::<_, LineDelimited>(&mut buf); + writer.write_batches(&[&batch]).unwrap(); + } } let result = String::from_utf8(buf).unwrap(); let expected = read_to_string(test_file).unwrap(); for (r, e) in result.lines().zip(expected.lines()) { let mut expected_json = serde_json::from_str::(e).unwrap(); - // remove null value from object to make comparison consistent: - if let Value::Object(obj) = expected_json { - expected_json = - Value::Object(obj.into_iter().filter(|(_, v)| *v != Value::Null).collect()); + if remove_nulls { + // remove null value from object to make comparison consistent: + if let Value::Object(obj) = expected_json { + expected_json = + Value::Object(obj.into_iter().filter(|(_, v)| *v != Value::Null).collect()); + } } assert_eq!(serde_json::from_str::(r).unwrap(), expected_json,); } @@ -1234,17 +1395,22 @@ mod tests { #[test] fn write_basic_rows() { - test_write_for_file("test/data/basic.json"); + test_write_for_file("test/data/basic.json", true); } #[test] fn write_arrays() { - test_write_for_file("test/data/arrays.json"); + test_write_for_file("test/data/arrays.json", true); } #[test] fn write_basic_nulls() { - test_write_for_file("test/data/basic_nulls.json"); + test_write_for_file("test/data/basic_nulls.json", true); 
+ } + + #[test] + fn write_nested_with_nulls() { + test_write_for_file("test/data/nested_with_nulls.json", false); } #[test] @@ -1530,4 +1696,233 @@ mod tests { assert_eq!(array_to_json_array(&map_array).unwrap(), expected_json); } + + #[test] + fn test_writer_explicit_nulls() -> Result<(), ArrowError> { + fn nested_list() -> (Arc, Arc) { + let array = Arc::new(ListArray::from_iter_primitive::(vec![ + Some(vec![None, None, None]), + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![None, None, None]), + ])); + let field = Arc::new(Field::new("list", array.data_type().clone(), true)); + // [{"list":[null,null,null]},{"list":[1,2,3]},{"list":null},{"list":[null,null,null]}] + (array, field) + } + + fn nested_dict() -> (Arc>, Arc) { + let array = Arc::new(DictionaryArray::from_iter(vec![ + Some("cupcakes"), + None, + Some("bear"), + Some("kuma"), + ])); + let field = Arc::new(Field::new("dict", array.data_type().clone(), true)); + // [{"dict":"cupcakes"},{"dict":null},{"dict":"bear"},{"dict":"kuma"}] + (array, field) + } + + fn nested_map() -> (Arc, Arc) { + let string_builder = StringBuilder::new(); + let int_builder = Int64Builder::new(); + let mut builder = MapBuilder::new(None, string_builder, int_builder); + + // [{"foo": 10}, null, {}, {"bar": 20, "baz": 30, "qux": 40}] + builder.keys().append_value("foo"); + builder.values().append_value(10); + builder.append(true).unwrap(); + + builder.append(false).unwrap(); + + builder.append(true).unwrap(); + + builder.keys().append_value("bar"); + builder.values().append_value(20); + builder.keys().append_value("baz"); + builder.values().append_value(30); + builder.keys().append_value("qux"); + builder.values().append_value(40); + builder.append(true).unwrap(); + + let array = Arc::new(builder.finish()); + let field = Arc::new(Field::new("map", array.data_type().clone(), true)); + (array, field) + } + + fn root_list() -> (Arc, Field) { + let struct_array = StructArray::from(vec![ + ( + Arc::new(Field::new("utf8", DataType::Utf8, true)), + Arc::new(StringArray::from(vec![Some("a"), Some("b"), None, None])) as ArrayRef, + ), + ( + Arc::new(Field::new("int32", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![Some(1), None, Some(5), None])) as ArrayRef, + ), + ]); + + let field = Field::new_list( + "list", + Field::new("struct", struct_array.data_type().clone(), true), + true, + ); + + // [{"list":[{"int32":1,"utf8":"a"},{"int32":null,"utf8":"b"}]},{"list":null},{"list":[{int32":5,"utf8":null}]},{"list":null}] + let entry_offsets = Buffer::from(&[0, 2, 2, 3, 3].to_byte_slice()); + let data = ArrayData::builder(field.data_type().clone()) + .len(4) + .add_buffer(entry_offsets) + .add_child_data(struct_array.into_data()) + .null_bit_buffer(Some([0b00000101].into())) + .build() + .unwrap(); + let array = Arc::new(ListArray::from(data)); + (array, field) + } + + let (nested_list_array, nested_list_field) = nested_list(); + let (nested_dict_array, nested_dict_field) = nested_dict(); + let (nested_map_array, nested_map_field) = nested_map(); + let (root_list_array, root_list_field) = root_list(); + + let schema = Schema::new(vec![ + Field::new("date", DataType::Date32, true), + Field::new("null", DataType::Null, true), + Field::new_struct( + "struct", + vec![ + Arc::new(Field::new("utf8", DataType::Utf8, true)), + nested_list_field.clone(), + nested_dict_field.clone(), + nested_map_field.clone(), + ], + true, + ), + root_list_field, + ]); + + let arr_date32 = Date32Array::from(vec![Some(0), None, Some(1), None]); + let arr_null = 
NullArray::new(4); + let arr_struct = StructArray::from(vec![ + // [{"utf8":"a"},{"utf8":null},{"utf8":null},{"utf8":"b"}] + ( + Arc::new(Field::new("utf8", DataType::Utf8, true)), + Arc::new(StringArray::from(vec![Some("a"), None, None, Some("b")])) as ArrayRef, + ), + // [{"list":[null,null,null]},{"list":[1,2,3]},{"list":null},{"list":[null,null,null]}] + (nested_list_field, nested_list_array as ArrayRef), + // [{"dict":"cupcakes"},{"dict":null},{"dict":"bear"},{"dict":"kuma"}] + (nested_dict_field, nested_dict_array as ArrayRef), + // [{"foo": 10}, null, {}, {"bar": 20, "baz": 30, "qux": 40}] + (nested_map_field, nested_map_array as ArrayRef), + ]); + + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![ + // [{"date":"1970-01-01"},{"date":null},{"date":"1970-01-02"},{"date":null}] + Arc::new(arr_date32), + // [{"null":null},{"null":null},{"null":null},{"null":null}] + Arc::new(arr_null), + Arc::new(arr_struct), + // [{"list":[{"int32":1,"utf8":"a"},{"int32":null,"utf8":"b"}]},{"list":null},{"list":[{int32":5,"utf8":null}]},{"list":null}] + root_list_array, + ], + )?; + + let mut buf = Vec::new(); + { + let mut writer = WriterBuilder::new() + .with_explicit_nulls(true) + .build::<_, JsonArray>(&mut buf); + writer.write_batches(&[&batch])?; + writer.finish()?; + } + + let actual = serde_json::from_slice::>(&buf).unwrap(); + let expected = serde_json::from_value::>(json!([ + { + "date": "1970-01-01", + "list": [ + { + "int32": 1, + "utf8": "a" + }, + { + "int32": null, + "utf8": "b" + } + ], + "null": null, + "struct": { + "dict": "cupcakes", + "list": [ + null, + null, + null + ], + "map": { + "foo": 10 + }, + "utf8": "a" + } + }, + { + "date": null, + "list": null, + "null": null, + "struct": { + "dict": null, + "list": [ + 1, + 2, + 3 + ], + "map": null, + "utf8": null + } + }, + { + "date": "1970-01-02", + "list": [ + { + "int32": 5, + "utf8": null + } + ], + "null": null, + "struct": { + "dict": "bear", + "list": null, + "map": {}, + "utf8": null + } + }, + { + "date": null, + "list": null, + "null": null, + "struct": { + "dict": "kuma", + "list": [ + null, + null, + null + ], + "map": { + "bar": 20, + "baz": 30, + "qux": 40 + }, + "utf8": "b" + } + } + ])) + .unwrap(); + + assert_eq!(actual, expected); + + Ok(()) + } } diff --git a/arrow-json/test/data/nested_with_nulls.json b/arrow-json/test/data/nested_with_nulls.json new file mode 100644 index 000000000000..932565d56063 --- /dev/null +++ b/arrow-json/test/data/nested_with_nulls.json @@ -0,0 +1,4 @@ +{"a": null, "b": null, "c": null, "d": {"d1": null, "d2": [null, 1, 2, null]}} +{"a": null, "b": -3.5, "c": true, "d": {"d1": null, "d2": null}} +{"a": null, "b": null, "c": false, "d": {"d1": "1970-01-01", "d2": null}} +{"a": 1, "b": 2.0, "c": false, "d": {"d1": null, "d2": null}} diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 31388bf99358..b49f56c91574 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -664,8 +664,6 @@ mod tests { .downcast_ref::>() .unwrap(); - dbg!(&array); - // verify let expected = GenericListArray::::from(list_data); assert_eq!(&array.value(0), &expected.value(0)); diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index bfe16db5cc4d..c73f4f50ac01 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -47,7 +47,6 @@ fn test_cast_timestamp_to_string() { let a = TimestampMillisecondArray::from(vec![Some(864000000005), Some(1545696000001), None]) .with_timezone("UTC".to_string()); let array = Arc::new(a) as ArrayRef; - dbg!(&array); let b = cast(&array, 
&DataType::Utf8).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(&DataType::Utf8, c.data_type()); diff --git a/object_store/src/gcp/builder.rs b/object_store/src/gcp/builder.rs index 5f718d63d94a..7417ea4c8a50 100644 --- a/object_store/src/gcp/builder.rs +++ b/object_store/src/gcp/builder.rs @@ -605,7 +605,7 @@ mod tests { .with_bucket_name("foo") .with_proxy_url("https://example.com") .build(); - assert!(dbg!(gcs).is_ok()); + assert!(gcs.is_ok()); let err = GoogleCloudStorageBuilder::new() .with_service_account_path(service_account_path.to_str().unwrap()) From df69ef57d055453c399fa925ad315d19211d7ab2 Mon Sep 17 00:00:00 2001 From: fan <75058860+fansehep@users.noreply.github.com> Date: Tue, 21 Nov 2023 16:42:51 +0800 Subject: [PATCH 1358/1411] fix: coerce_primitive for serde decoded data (#5101) * fix: fix json decode number Signed-off-by: fan * follow reviews Signed-off-by: fan * follow reviews Signed-off-by: fan * use fixed size space Signed-off-by: fan --------- Signed-off-by: fan --- arrow-json/src/reader/mod.rs | 43 ++++++++++++++++++++++++++- arrow-json/src/reader/string_array.rs | 33 +++++++++++++++++++- 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index 71a73df9fedb..5afe0dec279a 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -717,7 +717,9 @@ mod tests { use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; - use arrow_array::{make_array, Array, BooleanArray, ListArray, StringArray, StructArray}; + use arrow_array::{ + make_array, Array, BooleanArray, Float64Array, ListArray, StringArray, StructArray, + }; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_cast::display::{ArrayFormatter, FormatOptions}; use arrow_data::ArrayDataBuilder; @@ -2259,4 +2261,43 @@ mod tests { .values(); assert_eq!(values, &[1699148028689, 2, 3, 4]); } + + #[test] + fn test_coercing_primitive_into_string_decoder() { + let buf = &format!( + r#"[{{"a": 1, "b": "A", "c": "T"}}, {{"a": 2, "b": "BB", "c": "F"}}, {{"a": {}, "b": 123, "c": false}}, {{"a": {}, "b": 789, "c": true}}]"#, + (std::i32::MAX as i64 + 10), + std::i64::MAX - 10 + ); + let schema = Schema::new(vec![ + Field::new("a", DataType::Float64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Utf8, true), + ]); + let json_array: Vec = serde_json::from_str(buf).unwrap(); + let schema_ref = Arc::new(schema); + + // read record batches + let reader = ReaderBuilder::new(schema_ref.clone()).with_coerce_primitive(true); + let mut decoder = reader.build_decoder().unwrap(); + decoder.serialize(json_array.as_slice()).unwrap(); + let batch = decoder.flush().unwrap().unwrap(); + assert_eq!( + batch, + RecordBatch::try_new( + schema_ref, + vec![ + Arc::new(Float64Array::from(vec![ + 1.0, + 2.0, + (std::i32::MAX as i64 + 10) as f64, + (std::i64::MAX - 10) as f64 + ])), + Arc::new(StringArray::from(vec!["A", "BB", "123", "789"])), + Arc::new(StringArray::from(vec!["T", "F", "false", "true"])), + ] + ) + .unwrap() + ); + } } diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs index 63a9bcedb7d1..5ab4d09d5d63 100644 --- a/arrow-json/src/reader/string_array.rs +++ b/arrow-json/src/reader/string_array.rs @@ -61,7 +61,18 @@ impl ArrayDecoder for StringArrayDecoder { TapeElement::Number(idx) if coerce_primitive => { data_capacity += tape.get_string(idx).len(); } - _ => return Err(tape.error(*p, "string")), + TapeElement::I64(_) + | TapeElement::I32(_) + | 
TapeElement::F64(_) + | TapeElement::F32(_) + if coerce_primitive => + { + // An arbitrary estimate + data_capacity += 10; + } + _ => { + return Err(tape.error(*p, "string")); + } } } @@ -89,6 +100,26 @@ impl ArrayDecoder for StringArrayDecoder { TapeElement::Number(idx) if coerce_primitive => { builder.append_value(tape.get_string(idx)); } + TapeElement::I64(high) if coerce_primitive => match tape.get(p + 1) { + TapeElement::I32(low) => { + let val = (high as i64) << 32 | (low as u32) as i64; + builder.append_value(val.to_string()); + } + _ => unreachable!(), + }, + TapeElement::I32(n) if coerce_primitive => { + builder.append_value(n.to_string()); + } + TapeElement::F32(n) if coerce_primitive => { + builder.append_value(n.to_string()); + } + TapeElement::F64(high) if coerce_primitive => match tape.get(p + 1) { + TapeElement::F32(low) => { + let val = f64::from_bits((high as u64) << 32 | low as u64); + builder.append_value(val.to_string()); + } + _ => unreachable!(), + }, _ => unreachable!(), } } From 06a3a2e467a0d49f0d372ab09c37da1f1bfbdc0c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 22 Nov 2023 14:07:01 +0000 Subject: [PATCH 1359/1411] Fix integration tests (#5111) --- .github/workflows/integration.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index f939a6a13b58..c9cb4e31ced9 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -57,6 +57,7 @@ jobs: env: ARROW_USE_CCACHE: OFF ARROW_CPP_EXE_PATH: /build/cpp/debug + ARROW_RUST_EXE_PATH: /build/rust/debug BUILD_DOCS_CPP: OFF ARROW_INTEGRATION_CPP: ON ARROW_INTEGRATION_CSHARP: ON From 410fcbba51a0cb1482a6977dde2ce3d279ce6135 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 22 Nov 2023 14:07:41 +0000 Subject: [PATCH 1360/1411] Update prost-build requirement from =0.12.2 to =0.12.3 (#5112) Updates the requirements on [prost-build](https://github.com/tokio-rs/prost) to permit the latest version. - [Release notes](https://github.com/tokio-rs/prost/releases) - [Commits](https://github.com/tokio-rs/prost/compare/v0.12.2...v0.12.3) --- updated-dependencies: - dependency-name: prost-build dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index e143b4409983..4976c8eb5461 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -33,5 +33,5 @@ publish = false # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.69", default-features = false } -prost-build = { version = "=0.12.2", default-features = false } +prost-build = { version = "=0.12.3", default-features = false } tonic-build = { version = "=0.10.2", default-features = false, features = ["transport", "prost"] } From 200e8c80084442d9579e00967e407cd83191565d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 22 Nov 2023 06:09:00 -0800 Subject: [PATCH 1361/1411] Implementing `ArrayBuilder` for `Box` (#5109) * Implementing ArrayBuilder for Box * Update existing test --- .../src/builder/generic_list_builder.rs | 202 +++++++++++++++++- arrow-array/src/builder/mod.rs | 30 +++ arrow-array/src/builder/struct_builder.rs | 15 +- 3 files changed, 243 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 5cc7f7b04e0a..21eaadd5208a 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -353,7 +353,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::builder::{Int32Builder, ListBuilder}; + use crate::builder::{make_builder, Int32Builder, ListBuilder}; use crate::cast::AsArray; use crate::types::Int32Type; use crate::{Array, Int32Array}; @@ -548,4 +548,204 @@ mod tests { assert_eq!(elements.null_count(), 1); assert!(elements.is_null(3)); } + + #[test] + fn test_boxed_primitive_aray_builder() { + let values_builder = make_builder(&DataType::Int32, 5); + let mut builder = ListBuilder::new(values_builder); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[1, 2, 3]); + builder.append(true); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[4, 5, 6]); + builder.append(true); + + let arr = builder.finish(); + assert_eq!(2, arr.len()); + + let elements = arr.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 3, 4, 5, 6]); + } + + #[test] + fn test_boxed_list_list_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. 
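        // Background sketch for the two "boxed" tests: `make_builder` hands back a
        // `Box<dyn ArrayBuilder>`, and the new `impl ArrayBuilder for Box<dyn ArrayBuilder>`
        // lets that box serve directly as the child builder of a `ListBuilder`, e.g.
        //     let mut list = ListBuilder::new(make_builder(&DataType::Int32, 0));
        // with the concrete builder recovered again through
        //     list.values().as_any_mut().downcast_mut::<Int32Builder>()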
+ let values_builder = make_builder( + &DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + 10, + ); + let mut builder = ListBuilder::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(1); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(2); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .append(true); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(3); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(4); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .append(true); + builder.append(true); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(5); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(6); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(7); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .append(true); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .append(false); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(8); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .append(true); + builder.append(true); + + builder.append(false); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(9); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(10); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .append(true); + builder.append(true); + + let l1 = builder.finish(); + + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5, 6]); + let l2 = l1.values().as_list::(); + + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8, 10]); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 
10]); + } } diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 38a7500dd55f..8382f7af87b0 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -265,6 +265,36 @@ pub trait ArrayBuilder: Any + Send { fn into_box_any(self: Box) -> Box; } +impl ArrayBuilder for Box { + fn len(&self) -> usize { + (**self).len() + } + + fn is_empty(&self) -> bool { + (**self).is_empty() + } + + fn finish(&mut self) -> ArrayRef { + (**self).finish() + } + + fn finish_cloned(&self) -> ArrayRef { + (**self).finish_cloned() + } + + fn as_any(&self) -> &dyn Any { + (**self).as_any() + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + (**self).as_any_mut() + } + + fn into_box_any(self: Box) -> Box { + self + } +} + /// Builder for [`ListArray`](crate::array::ListArray) pub type ListBuilder = GenericListBuilder; diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 0f40b8a487ae..06b8385b3164 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -169,6 +169,10 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { Box::new(DurationNanosecondBuilder::with_capacity(capacity)) } + DataType::List(field) => { + let builder = make_builder(field.data_type(), capacity); + Box::new(ListBuilder::with_capacity(builder, capacity)) + } DataType::Struct(fields) => Box::new(StructBuilder::from_fields(fields.clone(), capacity)), t => panic!("Data type {t:?} is not currently supported"), } @@ -507,13 +511,18 @@ mod tests { #[test] #[should_panic( - expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) is not currently supported" + expected = "Data type Map(Field { name: \"entries\", data_type: Struct([Field { name: \"keys\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"values\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) is not currently supported" )] fn test_struct_array_builder_from_schema_unsupported_type() { - let list_type = DataType::List(Arc::new(Field::new("item", DataType::Int64, true))); + let keys = Arc::new(Field::new("keys", DataType::Int32, false)); + let values = Arc::new(Field::new("values", DataType::UInt32, false)); + let struct_type = DataType::Struct(Fields::from(vec![keys, values])); + let map_data_type = + DataType::Map(Arc::new(Field::new("entries", struct_type, false)), false); + let fields = vec![ Field::new("f1", DataType::Int16, false), - Field::new("f2", list_type, false), + Field::new("f2", map_data_type, false), ]; let _ = StructBuilder::from_fields(fields, 5); From e1bafdf70300c405f32ddf49444166839fe6a7bd Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 23 Nov 2023 05:33:36 -0800 Subject: [PATCH 1362/1411] Remove empty source file (#5119) * Remove useless file * Fix --- arrow/src/datatypes/ffi.rs | 16 ---------------- arrow/src/datatypes/mod.rs | 5 ----- 2 files changed, 21 deletions(-) delete mode 100644 arrow/src/datatypes/ffi.rs diff --git a/arrow/src/datatypes/ffi.rs b/arrow/src/datatypes/ffi.rs deleted file mode 100644 index b248758bc120..000000000000 --- a/arrow/src/datatypes/ffi.rs +++ /dev/null @@ -1,16 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 894e046e621f..d41289d52e2a 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -30,8 +30,3 @@ pub use arrow_schema::{ DataType, Field, FieldRef, Fields, IntervalUnit, Schema, SchemaBuilder, SchemaRef, TimeUnit, UnionFields, UnionMode, }; - -#[cfg(feature = "ffi")] -mod ffi; -#[cfg(feature = "ffi")] -pub use ffi::*; From ef1cc38bbfad9e596ed2ed129421f9b657445dc0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 27 Nov 2023 10:37:02 +0000 Subject: [PATCH 1363/1411] Cleanup list casting and support nested lists (#5113) (#5124) * Cleanup list casting and support nested lists (#5113) * Clippy * Update can_cast_types --- arrow-cast/src/cast.rs | 183 +++++++++++++++-------------------------- 1 file changed, 68 insertions(+), 115 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index dd3e271afb0d..22faedb96f96 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -47,7 +47,7 @@ use crate::parse::{ string_to_datetime, Parser, }; use arrow_array::{builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *}; -use arrow_buffer::{i256, ArrowNativeType, Buffer, OffsetBuffer}; +use arrow_buffer::{i256, ArrowNativeType, OffsetBuffer}; use arrow_data::transform::MutableArrayData; use arrow_data::ArrayData; use arrow_schema::*; @@ -124,18 +124,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } (Dictionary(_, value_type), _) => can_cast_types(value_type, to_type), (_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type), - (LargeList(list_from), LargeList(list_to)) => { + (List(list_from) | LargeList(list_from), List(list_to) | LargeList(list_to)) => { can_cast_types(list_from.data_type(), list_to.data_type()) } - (List(list_from), List(list_to)) => { - can_cast_types(list_from.data_type(), list_to.data_type()) - } - (List(list_from), LargeList(list_to)) => { - list_from.data_type() == list_to.data_type() - } - (LargeList(list_from), List(list_to)) => { - list_from.data_type() == list_to.data_type() - } (List(list_from) | LargeList(list_from), Utf8 | LargeUtf8) => { can_cast_types(list_from.data_type(), to_type) } @@ -783,28 +774,10 @@ pub fn cast_with_options( "Casting from type {from_type:?} to dictionary type {to_type:?} not supported", ))), }, - (List(_), List(ref to)) => cast_list_inner::(array, to, to_type, cast_options), - (LargeList(_), LargeList(ref to)) => { - cast_list_inner::(array, to, to_type, cast_options) - } - (List(list_from), LargeList(list_to)) => { - if list_to.data_type() != list_from.data_type() { - Err(ArrowError::CastError( - "cannot cast list to large-list with different child data".into(), - )) - } else { - cast_list_container::(array, 
cast_options) - } - } - (LargeList(list_from), List(list_to)) => { - if list_to.data_type() != list_from.data_type() { - Err(ArrowError::CastError( - "cannot cast large-list to list with different child data".into(), - )) - } else { - cast_list_container::(array, cast_options) - } - } + (List(_), List(to)) => cast_list_values::(array, to, cast_options), + (LargeList(_), LargeList(to)) => cast_list_values::(array, to, cast_options), + (List(_), LargeList(list_to)) => cast_list::(array, list_to, cast_options), + (LargeList(_), List(list_to)) => cast_list::(array, list_to, cast_options), (List(_), FixedSizeList(field, size)) => { let array = array.as_list::(); cast_list_to_fixed_size_list::(array, field, *size, cast_options) @@ -3046,28 +3019,6 @@ fn cast_values_to_list( Ok(Arc::new(list)) } -/// Helper function that takes an Generic list container and casts the inner datatype. -fn cast_list_inner( - array: &dyn Array, - to: &Field, - to_type: &DataType, - cast_options: &CastOptions, -) -> Result { - let data = array.to_data(); - let underlying_array = make_array(data.child_data()[0].clone()); - let cast_array = cast_with_options(underlying_array.as_ref(), to.data_type(), cast_options)?; - let builder = data - .into_builder() - .data_type(to_type.clone()) - .child_data(vec![cast_array.into_data()]); - - // Safety - // Data was valid before - let array_data = unsafe { builder.build_unchecked() }; - let list = GenericListArray::::from(array_data); - Ok(Arc::new(list) as ArrayRef) -} - /// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same /// offset size so re-encoding offset is unnecessary. fn cast_binary_to_string( @@ -3221,7 +3172,7 @@ where fn cast_list_to_fixed_size_list( array: &GenericListArray, - field: &Arc, + field: &FieldRef, size: i32, cast_options: &CastOptions, ) -> Result @@ -3289,75 +3240,57 @@ where Ok(Arc::new(array)) } -/// Cast the container type of List/Largelist array but not the inner types. -/// This function can leave the value data intact and only has to cast the offset dtypes. -fn cast_list_container( +/// Helper function that takes an Generic list container and casts the inner datatype. 
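// Net effect of the rewrite, as a sketch (the same shape is exercised by
// `test_nested_list_cast` further down): for some `nested_list_array` of type
// List<List<Int32>>, offsets and nested values can now be cast in one call,
// including across offset widths, e.g.
//     let target = DataType::LargeList(Arc::new(Field::new(
//         "item",
//         DataType::LargeList(Arc::new(Field::new("item", DataType::Int8, true))),
//         true,
//     )));
//     let result = cast(&nested_list_array, &target)?;  // yields LargeList<LargeList<Int8>>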
+fn cast_list_values( array: &dyn Array, - _cast_options: &CastOptions, -) -> Result -where - OffsetSizeFrom: OffsetSizeTrait + ToPrimitive, - OffsetSizeTo: OffsetSizeTrait + NumCast, -{ - let list = array.as_list::(); - // the value data stored by the list - let values = list.values(); + to: &FieldRef, + cast_options: &CastOptions, +) -> Result { + let list = array.as_list::(); + let values = cast_with_options(list.values(), to.data_type(), cast_options)?; + Ok(Arc::new(GenericListArray::::new( + to.clone(), + list.offsets().clone(), + values, + list.nulls().cloned(), + ))) +} - let out_dtype = match array.data_type() { - DataType::List(value_type) => { - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::() - ); - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::() - ); - DataType::LargeList(value_type.clone()) - } - DataType::LargeList(value_type) => { - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::() - ); - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::() - ); - if values.len() > i32::MAX as usize { - return Err(ArrowError::ComputeError( - "LargeList too large to cast to List".into(), - )); - } - DataType::List(value_type.clone()) - } - // implementation error - _ => unreachable!(), - }; +/// Cast the container type of List/Largelist array along with the inner datatype +fn cast_list( + array: &dyn Array, + field: &FieldRef, + cast_options: &CastOptions, +) -> Result { + let list = array.as_list::(); + let values = list.values(); + let offsets = list.offsets(); + let nulls = list.nulls().cloned(); - let iter = list.value_offsets().iter().map(|idx| { - let idx: OffsetSizeTo = NumCast::from(*idx).unwrap(); - idx - }); + if !O::IS_LARGE && values.len() > i32::MAX as usize { + return Err(ArrowError::ComputeError( + "LargeList too large to cast to List".into(), + )); + } - // SAFETY - // A slice produces a trusted length iterator - let offset_buffer = unsafe { Buffer::from_trusted_len_iter(iter) }; + // Recursively cast values + let values = cast_with_options(values, field.data_type(), cast_options)?; + let offsets: Vec<_> = offsets.iter().map(|x| O::usize_as(x.as_usize())).collect(); - // wrap up - let builder = ArrayData::builder(out_dtype) - .len(list.len()) - .add_buffer(offset_buffer) - .add_child_data(values.to_data()) - .nulls(list.nulls().cloned()); + // Safety: valid offsets and checked for overflow + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; - let array_data = unsafe { builder.build_unchecked() }; - Ok(Arc::new(GenericListArray::::from(array_data))) + Ok(Arc::new(GenericListArray::::new( + field.clone(), + offsets, + values, + nulls, + ))) } #[cfg(test)] mod tests { - use arrow_buffer::NullBuffer; + use arrow_buffer::{Buffer, NullBuffer}; use super::*; @@ -9154,6 +9087,26 @@ mod tests { assert_eq!(formatted.value(1).to_string(), "[[4], [null], [6]]"); } + #[test] + fn test_nested_list_cast() { + let mut builder = ListBuilder::new(ListBuilder::new(Int32Builder::new())); + builder.append_value([Some([Some(1), Some(2), None]), None]); + builder.append_value([None, Some([]), None]); + builder.append_null(); + builder.append_value([Some([Some(2), Some(3)])]); + let start = builder.finish(); + + let mut builder = LargeListBuilder::new(LargeListBuilder::new(Int8Builder::new())); + builder.append_value([Some([Some(1), Some(2), None]), None]); + builder.append_value([None, Some([]), None]); + builder.append_null(); + builder.append_value([Some([Some(2), Some(3)])]); + let expected = builder.finish(); + + let actual = 
cast(&start, expected.data_type()).unwrap(); + assert_eq!(actual.as_ref(), &expected); + } + const CAST_OPTIONS: CastOptions<'static> = CastOptions { safe: true, format_options: FormatOptions::new(), From 409bb81a69f3ea1b354fa209a5b6b9d54ea06419 Mon Sep 17 00:00:00 2001 From: "Reilly.tang" Date: Mon, 27 Nov 2023 18:37:38 +0800 Subject: [PATCH 1364/1411] [fix #5044] Support converting 'yyyymmdd' format to date (#5078) Signed-off-by: tangruilin --- arrow-cast/src/cast.rs | 24 ++++++++++++++++++++++++ arrow-cast/src/parse.rs | 14 +++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 22faedb96f96..3d9d0ee3d920 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -4879,6 +4879,30 @@ mod tests { } } + #[test] + fn test_cast_string_format_yyyymmdd_to_date32() { + let a = Arc::new(StringArray::from(vec![ + Some("2020-12-25"), + Some("20201117"), + ])) as ArrayRef; + + let to_type = DataType::Date32; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; + let result = cast_with_options(&a, &to_type, &options).unwrap(); + let c = result.as_primitive::(); + assert_eq!( + chrono::NaiveDate::from_ymd_opt(2020, 12, 25), + c.value_as_date(0) + ); + assert_eq!( + chrono::NaiveDate::from_ymd_opt(2020, 11, 17), + c.value_as_date(1) + ); + } + #[test] fn test_cast_string_to_time32second() { let a1 = Arc::new(StringArray::from(vec![ diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index f01b2b4c0d63..750f38006d33 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -559,8 +559,20 @@ fn parse_date(string: &str) -> Option { const HYPHEN: u8 = b'-'.wrapping_sub(b'0'); + // refer to https://www.rfc-editor.org/rfc/rfc3339#section-3 if digits[4] != HYPHEN { - return None; + let (year, month, day) = match (mask, string.len()) { + (0b11111111, 8) => ( + digits[0] as u16 * 1000 + + digits[1] as u16 * 100 + + digits[2] as u16 * 10 + + digits[3] as u16, + digits[4] * 10 + digits[5], + digits[6] * 10 + digits[7], + ), + _ => return None, + }; + return NaiveDate::from_ymd_opt(year as _, month as _, day as _); } let (month, day) = match mask { From d5a6cf4e5aaf9a4c5f2777c81aea9ef315578b2d Mon Sep 17 00:00:00 2001 From: Robin Lin <128118209+RobinLin666@users.noreply.github.com> Date: Mon, 27 Nov 2023 19:07:45 +0800 Subject: [PATCH 1365/1411] Fix ObjectStore.LocalFileSystem.put_opts for blobfuse (#5094) * Fix ObjectStore.LocalFileSystem.put_opts for blobfuse * Fix ObjectStore.LocalFileSystem.put_opts for blobfuse * fix comment * fix race condition * add comment --- object_store/src/local.rs | 56 ++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/object_store/src/local.rs b/object_store/src/local.rs index dd71d9ec1219..71b96f058c79 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -338,28 +338,41 @@ impl ObjectStore for LocalFileSystem { maybe_spawn_blocking(move || { let (mut file, suffix) = new_staged_upload(&path)?; let staging_path = staged_upload_path(&path, &suffix); + let mut e_tag = None; let err = match file.write_all(&bytes) { - Ok(_) => match opts.mode { - PutMode::Overwrite => match std::fs::rename(&staging_path, &path) { - Ok(_) => None, - Err(source) => Some(Error::UnableToRenameFile { source }), - }, - PutMode::Create => match std::fs::hard_link(&staging_path, &path) { - Ok(_) => { - let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup - None + Ok(_) => { + let 
metadata = file.metadata().map_err(|e| Error::Metadata { + source: e.into(), + path: path.to_string_lossy().to_string(), + })?; + e_tag = Some(get_etag(&metadata)); + match opts.mode { + PutMode::Overwrite => { + // For some fuse types of file systems, the file must be closed first + // to trigger the upload operation, and then renamed, such as Blobfuse + std::mem::drop(file); + match std::fs::rename(&staging_path, &path) { + Ok(_) => None, + Err(source) => Some(Error::UnableToRenameFile { source }), + } } - Err(source) => match source.kind() { - ErrorKind::AlreadyExists => Some(Error::AlreadyExists { - path: path.to_str().unwrap().to_string(), - source, - }), - _ => Some(Error::UnableToRenameFile { source }), + PutMode::Create => match std::fs::hard_link(&staging_path, &path) { + Ok(_) => { + let _ = std::fs::remove_file(&staging_path); // Attempt to cleanup + None + } + Err(source) => match source.kind() { + ErrorKind::AlreadyExists => Some(Error::AlreadyExists { + path: path.to_str().unwrap().to_string(), + source, + }), + _ => Some(Error::UnableToRenameFile { source }), + }, }, - }, - PutMode::Update(_) => unreachable!(), - }, + PutMode::Update(_) => unreachable!(), + } + } Err(source) => Some(Error::UnableToCopyDataToFile { source }), }; @@ -368,13 +381,8 @@ impl ObjectStore for LocalFileSystem { return Err(err.into()); } - let metadata = file.metadata().map_err(|e| Error::Metadata { - source: e.into(), - path: path.to_string_lossy().to_string(), - })?; - Ok(PutResult { - e_tag: Some(get_etag(&metadata)), + e_tag, version: None, }) }) From 435b53ded3710216e32ad3f82ae4910f50954e06 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 27 Nov 2023 03:38:10 -0800 Subject: [PATCH 1366/1411] Cast from numeric/timestamp to timestamp/numeric (#5123) * Casting between floating and timestamp * Fix * For decimals * Fix --- arrow-cast/src/cast.rs | 104 +++++++++++++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 15 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 3d9d0ee3d920..ebfd97488b28 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -161,17 +161,16 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Decimal128(_, _) | Decimal256(_, _), Utf8 | LargeUtf8) => true, // Utf8 to decimal (Utf8 | LargeUtf8, Decimal128(_, _) | Decimal256(_, _)) => true, - (Decimal128(_, _) | Decimal256(_, _), _) => false, - (_, Decimal128(_, _) | Decimal256(_, _)) => false, (Struct(_), _) => false, (_, Struct(_)) => false, (_, Boolean) => { - DataType::is_numeric(from_type) + DataType::is_integer(from_type) || + DataType::is_floating(from_type) || from_type == &Utf8 || from_type == &LargeUtf8 } (Boolean, _) => { - DataType::is_numeric(to_type) || to_type == &Utf8 || to_type == &LargeUtf8 + DataType::is_integer(to_type) || DataType::is_floating(to_type) || to_type == &Utf8 || to_type == &LargeUtf8 } (Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true, @@ -222,8 +221,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Time64(_), Time32(to_unit)) => { matches!(to_unit, Second | Millisecond) } - (Timestamp(_, _), _) if to_type.is_integer() => true, - (_, Timestamp(_, _)) if from_type.is_integer() => true, + (Timestamp(_, _), _) if to_type.is_numeric() && to_type != &Float16 => true, + (_, Timestamp(_, _)) if from_type.is_numeric() && from_type != &Float16 => true, (Date64, Timestamp(_, None)) => true, (Date32, Timestamp(_, None)) => true, ( @@ -849,7 +848,7 @@ pub fn 
cast_with_options( cast_options, ) } - (Decimal128(_, scale), _) => { + (Decimal128(_, scale), _) if !to_type.is_temporal() => { // cast decimal to other type match to_type { UInt8 => cast_decimal_to_integer::( @@ -914,7 +913,7 @@ pub fn cast_with_options( ))), } } - (Decimal256(_, scale), _) => { + (Decimal256(_, scale), _) if !to_type.is_temporal() => { // cast decimal to other type match to_type { UInt8 => cast_decimal_to_integer::( @@ -979,7 +978,7 @@ pub fn cast_with_options( ))), } } - (_, Decimal128(precision, scale)) => { + (_, Decimal128(precision, scale)) if !from_type.is_temporal() => { // cast data to decimal match from_type { UInt8 => cast_integer_to_decimal::<_, Decimal128Type, _>( @@ -1068,7 +1067,7 @@ pub fn cast_with_options( ))), } } - (_, Decimal256(precision, scale)) => { + (_, Decimal256(precision, scale)) if !from_type.is_temporal() => { // cast data to decimal match from_type { UInt8 => cast_integer_to_decimal::<_, Decimal256Type, _>( @@ -1607,24 +1606,25 @@ pub fn cast_with_options( .unary::<_, Time64MicrosecondType>(|x| x / (NANOSECONDS / MICROSECONDS)), )), - (Timestamp(TimeUnit::Second, _), _) if to_type.is_integer() => { + // Timestamp to integer/floating/decimals + (Timestamp(TimeUnit::Second, _), _) if to_type.is_numeric() => { let array = cast_reinterpret_arrays::(array)?; cast_with_options(&array, to_type, cast_options) } - (Timestamp(TimeUnit::Millisecond, _), _) if to_type.is_integer() => { + (Timestamp(TimeUnit::Millisecond, _), _) if to_type.is_numeric() => { let array = cast_reinterpret_arrays::(array)?; cast_with_options(&array, to_type, cast_options) } - (Timestamp(TimeUnit::Microsecond, _), _) if to_type.is_integer() => { + (Timestamp(TimeUnit::Microsecond, _), _) if to_type.is_numeric() => { let array = cast_reinterpret_arrays::(array)?; cast_with_options(&array, to_type, cast_options) } - (Timestamp(TimeUnit::Nanosecond, _), _) if to_type.is_integer() => { + (Timestamp(TimeUnit::Nanosecond, _), _) if to_type.is_numeric() => { let array = cast_reinterpret_arrays::(array)?; cast_with_options(&array, to_type, cast_options) } - (_, Timestamp(unit, tz)) if from_type.is_integer() => { + (_, Timestamp(unit, tz)) if from_type.is_numeric() => { let array = cast_with_options(array, &Int64, cast_options)?; Ok(make_timestamp_array( array.as_primitive(), @@ -4652,6 +4652,80 @@ mod tests { assert_eq!(&actual, &expected); } + #[test] + fn test_cast_floating_to_timestamp() { + let array = Int64Array::from(vec![Some(2), Some(10), None]); + let expected = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + let array = Float32Array::from(vec![Some(2.0), Some(10.6), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = Float64Array::from(vec![Some(2.1), Some(10.2), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + } + + #[test] + fn test_cast_timestamp_to_floating() { + let array = TimestampMillisecondArray::from(vec![Some(5), Some(1), None]) + .with_timezone("UTC".to_string()); + let expected = cast(&array, &DataType::Int64).unwrap(); + + let actual = cast(&cast(&array, &DataType::Float32).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::Float64).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + } + + #[test] + fn test_cast_decimal_to_timestamp() { + let array = 
Int64Array::from(vec![Some(2), Some(10), None]); + let expected = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + let array = Decimal128Array::from(vec![Some(200), Some(1000), None]) + .with_precision_and_scale(4, 2) + .unwrap(); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = Decimal256Array::from(vec![ + Some(i256::from_i128(2000)), + Some(i256::from_i128(10000)), + None, + ]) + .with_precision_and_scale(5, 3) + .unwrap(); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + } + + #[test] + fn test_cast_timestamp_to_decimal() { + let array = TimestampMillisecondArray::from(vec![Some(5), Some(1), None]) + .with_timezone("UTC".to_string()); + let expected = cast(&array, &DataType::Int64).unwrap(); + + let actual = cast( + &cast(&array, &DataType::Decimal128(5, 2)).unwrap(), + &DataType::Int64, + ) + .unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast( + &cast(&array, &DataType::Decimal256(10, 5)).unwrap(), + &DataType::Int64, + ) + .unwrap(); + assert_eq!(&actual, &expected); + } + #[test] fn test_cast_list_i32_to_list_u16() { let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 100000000]).into_data(); From 4b7405ccc461b4413f74e1efc9dbd63b86e85bf9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Mon, 27 Nov 2023 14:17:12 +0000 Subject: [PATCH 1367/1411] Improve cast docs (#5114) * Improve cast docs * Apply suggestions from code review Co-authored-by: Andrew Lamb * Format --------- Co-authored-by: Andrew Lamb --- arrow-cast/src/cast.rs | 46 +++++++++++++----------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index ebfd97488b28..38b9fb4c3483 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -73,10 +73,9 @@ impl<'a> Default for CastOptions<'a> { } } -/// Return true if a value of type `from_type` can be cast into a -/// value of `to_type`. Note that such as cast may be lossy. +/// Return true if a value of type `from_type` can be cast into a value of `to_type`. /// -/// If this function returns true to stay consistent with the `cast` kernel below. +/// See [`cast_with_options`] for more information pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { use self::DataType::*; use self::IntervalUnit::*; @@ -262,32 +261,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } } -/// Cast `array` to the provided data type and return a new Array with -/// type `to_type`, if possible. +/// Cast `array` to the provided data type and return a new Array with type `to_type`, if possible. /// -/// Behavior: -/// * Boolean to Utf8: `true` => '1', `false` => `0` -/// * Utf8 to boolean: `true`, `yes`, `on`, `1` => `true`, `false`, `no`, `off`, `0` => `false`, -/// short variants are accepted, other strings return null or error -/// * Utf8 to numeric: strings that can't be parsed to numbers return null, float strings -/// in integer casts return null -/// * Numeric to boolean: 0 returns `false`, any other value returns `true` -/// * List to List: the underlying data type is cast -/// * List to FixedSizeList: the underlying data type is cast. 
If safe is true and a list element -/// has the wrong length it will be replaced with NULL, otherwise an error will be returned -/// * Primitive to List: a list array with 1 value per slot is created -/// * Date32 and Date64: precision lost when going to higher interval -/// * Time32 and Time64: precision lost when going to higher interval -/// * Timestamp and Date{32|64}: precision lost when going to higher interval -/// * Temporal to/from backing primitive: zero-copy with data type change -/// * Casting from `float32/float64` to `Decimal(precision, scale)` rounds to the `scale` decimals -/// (i.e. casting 6.4999 to Decimal(10, 1) becomes 6.5). This is the breaking change from `26.0.0`. -/// It used to truncate it instead of round (i.e. outputs 6.4 instead) -/// -/// Unsupported Casts -/// * To or from `StructArray` -/// * List to primitive -/// * Interval and duration +/// See [`cast_with_options`] for more information pub fn cast(array: &dyn Array, to_type: &DataType) -> Result { cast_with_options(array, to_type, &CastOptions::default()) } @@ -682,11 +658,11 @@ fn as_time_res_with_timezone( }) } -/// Cast `array` to the provided data type and return a new Array with -/// type `to_type`, if possible. It accepts `CastOptions` to allow consumers -/// to configure cast behavior. +/// Cast `array` to the provided data type and return a new Array with type `to_type`, if possible. +/// +/// Accepts [`CastOptions`] to specify cast behavior. /// -/// Behavior: +/// ## Behavior /// * Boolean to Utf8: `true` => '1', `false` => `0` /// * Utf8 to boolean: `true`, `yes`, `on`, `1` => `true`, `false`, `no`, `off`, `0` => `false`, /// short variants are accepted, other strings return null or error @@ -694,15 +670,21 @@ fn as_time_res_with_timezone( /// in integer casts return null /// * Numeric to boolean: 0 returns `false`, any other value returns `true` /// * List to List: the underlying data type is cast +/// * List to FixedSizeList: the underlying data type is cast. If safe is true and a list element +/// has the wrong length it will be replaced with NULL, otherwise an error will be returned /// * Primitive to List: a list array with 1 value per slot is created /// * Date32 and Date64: precision lost when going to higher interval /// * Time32 and Time64: precision lost when going to higher interval /// * Timestamp and Date{32|64}: precision lost when going to higher interval /// * Temporal to/from backing primitive: zero-copy with data type change +/// * Casting from `float32/float64` to `Decimal(precision, scale)` rounds to the `scale` decimals +/// (i.e. casting `6.4999` to Decimal(10, 1) becomes `6.5`). Prior to version `26.0.0`, +/// casting would truncate instead (i.e. outputs `6.4` instead) /// /// Unsupported Casts /// * To or from `StructArray` /// * List to primitive +/// * Interval and duration pub fn cast_with_options( array: &dyn Array, to_type: &DataType, From e26fa4f39523136a368ade229eba3fd6895eaa0a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Nov 2023 14:17:27 +0000 Subject: [PATCH 1368/1411] Update proc-macro2 requirement from =1.0.69 to =1.0.70 (#5131) Updates the requirements on [proc-macro2](https://github.com/dtolnay/proc-macro2) to permit the latest version. - [Release notes](https://github.com/dtolnay/proc-macro2/releases) - [Commits](https://github.com/dtolnay/proc-macro2/compare/1.0.69...1.0.70) --- updated-dependencies: - dependency-name: proc-macro2 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 4976c8eb5461..4f7a032f51e5 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.69", default-features = false } +proc-macro2 = { version = "=1.0.70", default-features = false } prost-build = { version = "=0.12.3", default-features = false } tonic-build = { version = "=0.10.2", default-features = false, features = ["transport", "prost"] } From 8a0b5cb25205a357c5f17a8af40e45019121c483 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 28 Nov 2023 02:12:32 -0800 Subject: [PATCH 1369/1411] Fix negative decimal string (#5128) * Fix negative cases * Fix * Fix * Fix clippy * Update arrow-cast/src/cast.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow-cast/src/cast.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-cast/src/cast.rs | 90 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 38b9fb4c3483..8facb4f161f4 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -2596,11 +2596,33 @@ where ))); } - let integers = parts[0].trim_start_matches('0'); + let (negative, first_part) = if parts[0].is_empty() { + (false, parts[0]) + } else { + match parts[0].as_bytes()[0] { + b'-' => (true, &parts[0][1..]), + b'+' => (false, &parts[0][1..]), + _ => (false, parts[0]), + } + }; + + let integers = first_part.trim_start_matches('0'); let decimals = if parts.len() == 2 { parts[1] } else { "" }; + if !integers.is_empty() && !integers.as_bytes()[0].is_ascii_digit() { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid decimal format: {value_str:?}" + ))); + } + + if !decimals.is_empty() && !decimals.as_bytes()[0].is_ascii_digit() { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid decimal format: {value_str:?}" + ))); + } + // Adjust decimal based on scale - let number_decimals = if decimals.len() > scale { + let mut number_decimals = if decimals.len() > scale { let decimal_number = i256::from_string(decimals).ok_or_else(|| { ArrowError::InvalidArgumentError(format!("Cannot parse decimal format: {value_str}")) })?; @@ -2640,6 +2662,10 @@ where format!("{integers}{decimals}") }; + if negative { + number_decimals.insert(0, '-'); + } + let value = i256::from_string(number_decimals.as_str()).ok_or_else(|| { ArrowError::InvalidArgumentError(format!( "Cannot convert {} to {}: Overflow", @@ -8256,6 +8282,21 @@ mod tests { assert_eq!("0.00", decimal_arr.value_as_string(10)); assert_eq!("0.00", decimal_arr.value_as_string(11)); assert!(decimal_arr.is_null(12)); + assert_eq!("-1.23", decimal_arr.value_as_string(13)); + assert_eq!("-1.24", decimal_arr.value_as_string(14)); + assert_eq!("0.00", decimal_arr.value_as_string(15)); + assert_eq!("-123.00", decimal_arr.value_as_string(16)); + assert_eq!("-123.23", decimal_arr.value_as_string(17)); + assert_eq!("-0.12", 
decimal_arr.value_as_string(18)); + assert_eq!("1.23", decimal_arr.value_as_string(19)); + assert_eq!("1.24", decimal_arr.value_as_string(20)); + assert_eq!("0.00", decimal_arr.value_as_string(21)); + assert_eq!("123.00", decimal_arr.value_as_string(22)); + assert_eq!("123.23", decimal_arr.value_as_string(23)); + assert_eq!("0.12", decimal_arr.value_as_string(24)); + assert!(decimal_arr.is_null(25)); + assert!(decimal_arr.is_null(26)); + assert!(decimal_arr.is_null(27)); // Decimal256 let output_type = DataType::Decimal256(76, 3); @@ -8277,6 +8318,21 @@ mod tests { assert_eq!("0.000", decimal_arr.value_as_string(10)); assert_eq!("0.000", decimal_arr.value_as_string(11)); assert!(decimal_arr.is_null(12)); + assert_eq!("-1.235", decimal_arr.value_as_string(13)); + assert_eq!("-1.236", decimal_arr.value_as_string(14)); + assert_eq!("0.000", decimal_arr.value_as_string(15)); + assert_eq!("-123.000", decimal_arr.value_as_string(16)); + assert_eq!("-123.234", decimal_arr.value_as_string(17)); + assert_eq!("-0.123", decimal_arr.value_as_string(18)); + assert_eq!("1.235", decimal_arr.value_as_string(19)); + assert_eq!("1.236", decimal_arr.value_as_string(20)); + assert_eq!("0.000", decimal_arr.value_as_string(21)); + assert_eq!("123.000", decimal_arr.value_as_string(22)); + assert_eq!("123.234", decimal_arr.value_as_string(23)); + assert_eq!("0.123", decimal_arr.value_as_string(24)); + assert!(decimal_arr.is_null(25)); + assert!(decimal_arr.is_null(26)); + assert!(decimal_arr.is_null(27)); } #[test] @@ -8295,6 +8351,21 @@ mod tests { Some(""), Some(" "), None, + Some("-1.23499999"), + Some("-1.23599999"), + Some("-0.00001"), + Some("-123"), + Some("-123.234000"), + Some("-000.123"), + Some("+1.23499999"), + Some("+1.23599999"), + Some("+0.00001"), + Some("+123"), + Some("+123.234000"), + Some("+000.123"), + Some("1.-23499999"), + Some("-1.-23499999"), + Some("--1.23499999"), ]); let array = Arc::new(str_array) as ArrayRef; @@ -8317,6 +8388,21 @@ mod tests { Some(""), Some(" "), None, + Some("-1.23499999"), + Some("-1.23599999"), + Some("-0.00001"), + Some("-123"), + Some("-123.234000"), + Some("-000.123"), + Some("+1.23499999"), + Some("+1.23599999"), + Some("+0.00001"), + Some("+123"), + Some("+123.234000"), + Some("+000.123"), + Some("1.-23499999"), + Some("-1.-23499999"), + Some("--1.23499999"), ]); let array = Arc::new(str_array) as ArrayRef; From 34a816d4fc2a9e0e097b1c41df788612dd5c8e61 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 28 Nov 2023 22:18:42 +1100 Subject: [PATCH 1370/1411] Parquet: derive boundary order when writing (#5110) * Parquet: derive boundary order when writing * Fix * Refactor boundary check location * Fix * Refactor according to review --- parquet/src/column/writer/mod.rs | 239 ++++++++++++++++++++++++++++--- parquet/src/file/metadata.rs | 9 +- 2 files changed, 222 insertions(+), 26 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 11c39685911c..14b8655091e4 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -21,7 +21,7 @@ use bytes::Bytes; use half::f16; use crate::bloom_filter::Sbbf; -use crate::format::{ColumnIndex, OffsetIndex}; +use crate::format::{BoundaryOrder, ColumnIndex, OffsetIndex}; use std::collections::{BTreeSet, VecDeque}; use std::str; @@ -228,6 +228,13 @@ pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> { // column index and offset index column_index_builder: ColumnIndexBuilder, offset_index_builder: 
OffsetIndexBuilder, + + // Below fields used to incrementally check boundary order across data pages. + // We assume they are ascending/descending until proven wrong. + data_page_boundary_ascending: bool, + data_page_boundary_descending: bool, + /// (min, max) + last_non_null_data_page_min_max: Option<(E::T, E::T)>, } impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { @@ -279,6 +286,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { column_index_builder: ColumnIndexBuilder::new(), offset_index_builder: OffsetIndexBuilder::new(), encodings, + data_page_boundary_ascending: true, + data_page_boundary_descending: true, + last_non_null_data_page_min_max: None, } } @@ -467,6 +477,18 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { let metadata = self.write_column_metadata()?; self.page_writer.close()?; + let boundary_order = match ( + self.data_page_boundary_ascending, + self.data_page_boundary_descending, + ) { + // If the lists are composed of equal elements then will be marked as ascending + // (Also the case if all pages are null pages) + (true, _) => BoundaryOrder::ASCENDING, + (false, true) => BoundaryOrder::DESCENDING, + (false, false) => BoundaryOrder::UNORDERED, + }; + self.column_index_builder.set_boundary_order(boundary_order); + let column_index = self .column_index_builder .valid() @@ -610,7 +632,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } /// Update the column index and offset index when adding the data page - fn update_column_offset_index(&mut self, page_statistics: Option<&Statistics>) { + fn update_column_offset_index(&mut self, page_statistics: Option<&ValueStatistics>) { // update the column index let null_page = (self.page_metrics.num_buffered_rows as u64) == self.page_metrics.num_page_nulls; @@ -631,6 +653,30 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { self.column_index_builder.to_invalid(); } Some(stat) => { + // Check if min/max are still ascending/descending across pages + let new_min = stat.min(); + let new_max = stat.max(); + if let Some((last_min, last_max)) = &self.last_non_null_data_page_min_max { + if self.data_page_boundary_ascending { + // If last min/max are greater than new min/max then not ascending anymore + let not_ascending = compare_greater(&self.descr, last_min, new_min) + || compare_greater(&self.descr, last_max, new_max); + if not_ascending { + self.data_page_boundary_ascending = false; + } + } + + if self.data_page_boundary_descending { + // If new min/max are greater than last min/max then not descending anymore + let not_descending = compare_greater(&self.descr, new_min, last_min) + || compare_greater(&self.descr, new_max, last_max); + if not_descending { + self.data_page_boundary_descending = false; + } + } + } + self.last_non_null_data_page_min_max = Some((new_min.clone(), new_max.clone())); + // We only truncate if the data is represented as binary match self.descr.physical_type() { Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => { @@ -703,7 +749,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { (Some(min), Some(max)) => { update_min(&self.descr, &min, &mut self.column_metrics.min_column_value); update_max(&self.descr, &max, &mut self.column_metrics.max_column_value); - Some(Statistics::new( + Some(ValueStatistics::new( Some(min), Some(max), None, @@ -716,6 +762,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // update column and offset index self.update_column_offset_index(page_statistics.as_ref()); + let page_statistics = 
page_statistics.map(Statistics::from); let compressed_page = match self.props.writer_version() { WriterVersion::PARQUET_1_0 => { @@ -2569,7 +2616,7 @@ mod tests { // column index assert_eq!(1, column_index.null_pages.len()); assert_eq!(1, offset_index.page_locations.len()); - assert_eq!(BoundaryOrder::UNORDERED, column_index.boundary_order); + assert_eq!(BoundaryOrder::ASCENDING, column_index.boundary_order); assert!(!column_index.null_pages[0]); assert_eq!(0, column_index.null_counts.as_ref().unwrap()[0]); @@ -2636,7 +2683,7 @@ mod tests { // column index assert_eq!(1, column_index.null_pages.len()); assert_eq!(1, offset_index.page_locations.len()); - assert_eq!(BoundaryOrder::UNORDERED, column_index.boundary_order); + assert_eq!(BoundaryOrder::ASCENDING, column_index.boundary_order); assert!(!column_index.null_pages[0]); assert_eq!(0, column_index.null_counts.as_ref().unwrap()[0]); @@ -2891,6 +2938,158 @@ mod tests { assert!(incremented.is_none()) } + #[test] + fn test_boundary_order() -> Result<()> { + let descr = Arc::new(get_test_column_descr::(1, 0)); + // min max both ascending + let column_close_result = write_multiple_pages::( + &descr, + &[ + &[Some(-10), Some(10)], + &[Some(-5), Some(11)], + &[None], + &[Some(-5), Some(11)], + ], + )?; + let boundary_order = column_close_result.column_index.unwrap().boundary_order; + assert_eq!(boundary_order, BoundaryOrder::ASCENDING); + + // min max both descending + let column_close_result = write_multiple_pages::( + &descr, + &[ + &[Some(10), Some(11)], + &[Some(5), Some(11)], + &[None], + &[Some(-5), Some(0)], + ], + )?; + let boundary_order = column_close_result.column_index.unwrap().boundary_order; + assert_eq!(boundary_order, BoundaryOrder::DESCENDING); + + // min max both equal + let column_close_result = write_multiple_pages::( + &descr, + &[&[Some(10), Some(11)], &[None], &[Some(10), Some(11)]], + )?; + let boundary_order = column_close_result.column_index.unwrap().boundary_order; + assert_eq!(boundary_order, BoundaryOrder::ASCENDING); + + // only nulls + let column_close_result = + write_multiple_pages::(&descr, &[&[None], &[None], &[None]])?; + let boundary_order = column_close_result.column_index.unwrap().boundary_order; + assert_eq!(boundary_order, BoundaryOrder::ASCENDING); + + // one page + let column_close_result = + write_multiple_pages::(&descr, &[&[Some(-10), Some(10)]])?; + let boundary_order = column_close_result.column_index.unwrap().boundary_order; + assert_eq!(boundary_order, BoundaryOrder::ASCENDING); + + // one non-null page + let column_close_result = + write_multiple_pages::(&descr, &[&[Some(-10), Some(10)], &[None]])?; + let boundary_order = column_close_result.column_index.unwrap().boundary_order; + assert_eq!(boundary_order, BoundaryOrder::ASCENDING); + + // min max both unordered + let column_close_result = write_multiple_pages::( + &descr, + &[ + &[Some(10), Some(11)], + &[Some(11), Some(16)], + &[None], + &[Some(-5), Some(0)], + ], + )?; + let boundary_order = column_close_result.column_index.unwrap().boundary_order; + assert_eq!(boundary_order, BoundaryOrder::UNORDERED); + + // min max both ordered in different orders + let column_close_result = write_multiple_pages::( + &descr, + &[ + &[Some(1), Some(9)], + &[Some(2), Some(8)], + &[None], + &[Some(3), Some(7)], + ], + )?; + let boundary_order = column_close_result.column_index.unwrap().boundary_order; + assert_eq!(boundary_order, BoundaryOrder::UNORDERED); + + Ok(()) + } + + #[test] + fn test_boundary_order_logical_type() -> Result<()> { + // ensure that 
logical types account for different sort order than underlying + // physical type representation + let f16_descr = Arc::new(get_test_float16_column_descr(1, 0)); + let fba_descr = { + let tpe = SchemaType::primitive_type_builder( + "col", + FixedLenByteArrayType::get_physical_type(), + ) + .with_length(2) + .build()?; + Arc::new(ColumnDescriptor::new( + Arc::new(tpe), + 1, + 0, + ColumnPath::from("col"), + )) + }; + + let values: &[&[Option]] = &[ + &[Some(FixedLenByteArray::from(ByteArray::from(f16::ONE)))], + &[Some(FixedLenByteArray::from(ByteArray::from(f16::ZERO)))], + &[Some(FixedLenByteArray::from(ByteArray::from( + f16::NEG_ZERO, + )))], + &[Some(FixedLenByteArray::from(ByteArray::from(f16::NEG_ONE)))], + ]; + + // f16 descending + let column_close_result = + write_multiple_pages::(&f16_descr, values)?; + let boundary_order = column_close_result.column_index.unwrap().boundary_order; + assert_eq!(boundary_order, BoundaryOrder::DESCENDING); + + // same bytes, but fba unordered + let column_close_result = + write_multiple_pages::(&fba_descr, values)?; + let boundary_order = column_close_result.column_index.unwrap().boundary_order; + assert_eq!(boundary_order, BoundaryOrder::UNORDERED); + + Ok(()) + } + + fn write_multiple_pages( + column_descr: &Arc, + pages: &[&[Option]], + ) -> Result { + let column_writer = get_column_writer( + column_descr.clone(), + Default::default(), + get_test_page_writer(), + ); + let mut writer = get_typed_column_writer::(column_writer); + + for &page in pages { + let values = page.iter().filter_map(Clone::clone).collect::>(); + let def_levels = page + .iter() + .map(|maybe_value| if maybe_value.is_some() { 1 } else { 0 }) + .collect::>(); + writer.write_batch(&values, Some(&def_levels), None)?; + writer.flush_data_pages()?; + } + + writer.close() + } + /// Performs write-read roundtrip with randomly generated values and levels. /// `max_size` is maximum number of values or levels (if `max_def_level` > 0) to write /// for a column. 
@@ -3197,8 +3396,7 @@ mod tests { ) -> ValueStatistics { let page_writer = get_test_page_writer(); let props = Default::default(); - let mut writer = - get_test_float16_column_writer::(page_writer, 0, 0, props); + let mut writer = get_test_float16_column_writer(page_writer, 0, 0, props); writer.write_batch(values, None, None).unwrap(); let metadata = writer.close().unwrap().metadata; @@ -3209,30 +3407,25 @@ mod tests { } } - fn get_test_float16_column_writer( + fn get_test_float16_column_writer( page_writer: Box, max_def_level: i16, max_rep_level: i16, props: WriterPropertiesPtr, - ) -> ColumnWriterImpl<'static, T> { - let descr = Arc::new(get_test_float16_column_descr::( - max_def_level, - max_rep_level, - )); + ) -> ColumnWriterImpl<'static, FixedLenByteArrayType> { + let descr = Arc::new(get_test_float16_column_descr(max_def_level, max_rep_level)); let column_writer = get_column_writer(descr, props, page_writer); - get_typed_column_writer::(column_writer) + get_typed_column_writer::(column_writer) } - fn get_test_float16_column_descr( - max_def_level: i16, - max_rep_level: i16, - ) -> ColumnDescriptor { + fn get_test_float16_column_descr(max_def_level: i16, max_rep_level: i16) -> ColumnDescriptor { let path = ColumnPath::from("col"); - let tpe = SchemaType::primitive_type_builder("col", T::get_physical_type()) - .with_length(2) - .with_logical_type(Some(LogicalType::Float16)) - .build() - .unwrap(); + let tpe = + SchemaType::primitive_type_builder("col", FixedLenByteArrayType::get_physical_type()) + .with_length(2) + .with_logical_type(Some(LogicalType::Float16)) + .build() + .unwrap(); ColumnDescriptor::new(Arc::new(tpe), max_def_level, max_rep_level, path) } diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index e57f666383d2..a1f3c87d0a72 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -885,9 +885,8 @@ pub struct ColumnIndexBuilder { null_pages: Vec, min_values: Vec>, max_values: Vec>, - // TODO: calc the order for all pages in this column - boundary_order: BoundaryOrder, null_counts: Vec, + boundary_order: BoundaryOrder, // If one page can't get build index, need to ignore all index in this column valid: bool, } @@ -904,8 +903,8 @@ impl ColumnIndexBuilder { null_pages: Vec::new(), min_values: Vec::new(), max_values: Vec::new(), - boundary_order: BoundaryOrder::UNORDERED, null_counts: Vec::new(), + boundary_order: BoundaryOrder::UNORDERED, valid: true, } } @@ -923,6 +922,10 @@ impl ColumnIndexBuilder { self.null_counts.push(null_count); } + pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) { + self.boundary_order = boundary_order; + } + pub fn to_invalid(&mut self) { self.valid = false; } From 58c80e6d7dd67b2929bec084d8baa7b08069fd39 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 28 Nov 2023 15:38:00 +0000 Subject: [PATCH 1371/1411] Update localstack to 3.0.1 (#5028) --- .github/workflows/object_store.yml | 2 +- object_store/src/aws/mod.rs | 8 +++----- object_store/src/azure/mod.rs | 2 +- object_store/src/http/mod.rs | 2 +- object_store/src/lib.rs | 18 +++++++----------- 5 files changed, 13 insertions(+), 19 deletions(-) diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 1b991e33c097..ecffa29b067c 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -133,7 +133,7 @@ jobs: - name: Setup LocalStack (AWS emulation) run: | - docker run -d -p 4566:4566 localstack/localstack:2.0 + 
docker run -d -p 4566:4566 localstack/localstack:3.0.1 docker run -d -p 1338:1338 amazon/amazon-ec2-metadata-mock:v1.9.2 --imdsv2 aws --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index cbb3cffdf494..0985263459b2 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -336,12 +336,10 @@ mod tests { let integration = config.build().unwrap(); let config = integration.client.config(); - let is_local = config.endpoint.starts_with("http://"); let test_not_exists = config.copy_if_not_exists.is_some(); let test_conditional_put = config.conditional_put.is_some(); - // Localstack doesn't support listing with spaces https://github.com/localstack/localstack/issues/6328 - put_get_delete_list_opts(&integration, is_local).await; + put_get_delete_list_opts(&integration).await; get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; @@ -364,12 +362,12 @@ mod tests { // run integration test with unsigned payload enabled let builder = AmazonS3Builder::from_env().with_unsigned_payload(true); let integration = builder.build().unwrap(); - put_get_delete_list_opts(&integration, is_local).await; + put_get_delete_list_opts(&integration).await; // run integration test with checksum set to sha256 let builder = AmazonS3Builder::from_env().with_checksum_algorithm(Checksum::SHA256); let integration = builder.build().unwrap(); - put_get_delete_list_opts(&integration, is_local).await; + put_get_delete_list_opts(&integration).await; } #[tokio::test] diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 1d51cbdc02dc..af0a4cefa13b 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -193,7 +193,7 @@ mod tests { crate::test_util::maybe_skip_integration!(); let integration = MicrosoftAzureBuilder::from_env().build().unwrap(); - put_get_delete_list_opts(&integration, false).await; + put_get_delete_list_opts(&integration).await; get_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index cfcde27fd781..f1d11db4762c 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -264,7 +264,7 @@ mod tests { .build() .unwrap(); - put_get_delete_list_opts(&integration, false).await; + put_get_delete_list_opts(&integration).await; list_uses_directories_correctly(&integration).await; list_with_delimiter(&integration).await; rename_and_copy(&integration).await; diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 40dca8f756d2..5c5c70de3a2b 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -1236,13 +1236,10 @@ mod tests { use tokio::io::AsyncWriteExt; pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) { - put_get_delete_list_opts(storage, false).await + put_get_delete_list_opts(storage).await } - pub(crate) async fn put_get_delete_list_opts( - storage: &DynObjectStore, - skip_list_with_spaces: bool, - ) { + pub(crate) async fn put_get_delete_list_opts(storage: &DynObjectStore) { delete_fixtures(storage).await; let content_list = flatten_list_stream(storage, None).await.unwrap(); @@ -1483,12 +1480,11 @@ mod tests { storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap(); storage.head(&path).await.unwrap(); - if !skip_list_with_spaces { - let files = flatten_list_stream(storage, Some(&Path::from("foo 
bar"))) - .await - .unwrap(); - assert_eq!(files, vec![path.clone()]); - } + let files = flatten_list_stream(storage, Some(&Path::from("foo bar"))) + .await + .unwrap(); + assert_eq!(files, vec![path.clone()]); + storage.delete(&path).await.unwrap(); let files = flatten_list_stream(storage, None).await.unwrap(); From a361ce13c00df53a4b6f9309c3d068f7bb3e6dc6 Mon Sep 17 00:00:00 2001 From: Matthieu Maitre Date: Tue, 28 Nov 2023 08:42:07 -0800 Subject: [PATCH 1372/1411] Fix 'ColumnPath not found' error reading Parquet files with nested REPEATED fields (#5102) * Fix ColumnPath not found error in Parquet files with nested REPEATED fields * Avoid pushing a value just to pop it right after * Review feedback --------- Co-authored-by: Matthieu Maitre Co-authored-by: Raphael Taylor-Davies --- parquet/src/record/reader.rs | 107 ++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 2 deletions(-) diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs index feaa8055e2dd..addaf7a1a455 100644 --- a/parquet/src/record/reader.rs +++ b/parquet/src/record/reader.rs @@ -274,12 +274,12 @@ impl TreeBuilder { row_group_reader, )?; - Reader::RepeatedReader( + return Ok(Reader::RepeatedReader( field, curr_def_level - 1, curr_rep_level - 1, Box::new(reader), - ) + )); } // Group types (structs) _ => { @@ -811,11 +811,14 @@ impl Iterator for ReaderIter { mod tests { use super::*; + use crate::data_type::Int64Type; use crate::errors::Result; use crate::file::reader::{FileReader, SerializedFileReader}; + use crate::file::writer::SerializedFileWriter; use crate::record::api::{Field, Row, RowAccessor}; use crate::schema::parser::parse_message_type; use crate::util::test_common::file_util::{get_test_file, get_test_path}; + use bytes::Bytes; use std::convert::TryFrom; // Convenient macros to assemble row, list, map, and group. 
@@ -1580,6 +1583,106 @@ mod tests { assert_eq!(rows, expected_rows); } + #[test] + fn test_tree_reader_handle_nested_repeated_fields_with_no_annotation() { + // Create schema + let schema = Arc::new( + parse_message_type( + " + message schema { + REPEATED group level1 { + REPEATED group level2 { + REQUIRED group level3 { + REQUIRED INT64 value3; + } + } + REQUIRED INT64 value1; + } + }", + ) + .unwrap(), + ); + + // Write Parquet file to buffer + let mut buffer: Vec = Vec::new(); + let mut file_writer = + SerializedFileWriter::new(&mut buffer, schema, Default::default()).unwrap(); + let mut row_group_writer = file_writer.next_row_group().unwrap(); + + // Write column level1.level2.level3.value3 + let mut column_writer = row_group_writer.next_column().unwrap().unwrap(); + column_writer + .typed::() + .write_batch(&[30, 31, 32], Some(&[2, 2, 2]), Some(&[0, 0, 0])) + .unwrap(); + column_writer.close().unwrap(); + + // Write column level1.value1 + let mut column_writer = row_group_writer.next_column().unwrap().unwrap(); + column_writer + .typed::() + .write_batch(&[10, 11, 12], Some(&[1, 1, 1]), Some(&[0, 0, 0])) + .unwrap(); + column_writer.close().unwrap(); + + // Finalize Parquet file + row_group_writer.close().unwrap(); + file_writer.close().unwrap(); + assert_eq!(&buffer[0..4], b"PAR1"); + + // Read Parquet file from buffer + let file_reader = SerializedFileReader::new(Bytes::from(buffer)).unwrap(); + let rows: Vec<_> = file_reader + .get_row_iter(None) + .unwrap() + .map(|row| row.unwrap()) + .collect(); + + let expected_rows = vec![ + row![( + "level1".to_string(), + list![group![ + ( + "level2".to_string(), + list![group![( + "level3".to_string(), + group![("value3".to_string(), Field::Long(30))] + )]] + ), + ("value1".to_string(), Field::Long(10)) + ]] + )], + row![( + "level1".to_string(), + list![group![ + ( + "level2".to_string(), + list![group![( + "level3".to_string(), + group![("value3".to_string(), Field::Long(31))] + )]] + ), + ("value1".to_string(), Field::Long(11)) + ]] + )], + row![( + "level1".to_string(), + list![group![ + ( + "level2".to_string(), + list![group![( + "level3".to_string(), + group![("value3".to_string(), Field::Long(32))] + )]] + ), + ("value1".to_string(), Field::Long(12)) + ]] + )], + ]; + + assert_eq!(rows, expected_rows); + } + fn test_file_reader_rows(file_name: &str, schema: Option) -> Result> { let file = get_test_file(file_name); let file_reader: Box = Box::new(SerializedFileReader::new(file)?); From 093a10e46203be1a0e94ae117854701bf58d4c79 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 28 Nov 2023 19:35:39 +0200 Subject: [PATCH 1373/1411] Parquet: Make `MetadataLoader` public (#5137) * Make public * revert change in MetadataFetchFn --- parquet/src/arrow/async_reader/metadata.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/async_reader/metadata.rs b/parquet/src/arrow/async_reader/metadata.rs index 2ac4e0bc9674..9224ea3f68a8 100644 --- a/parquet/src/arrow/async_reader/metadata.rs +++ b/parquet/src/arrow/async_reader/metadata.rs @@ -28,7 +28,7 @@ use std::future::Future; use std::ops::Range; /// A data source that can be used with [`MetadataLoader`] to load [`ParquetMetaData`] -pub(crate) trait MetadataFetch { +pub trait MetadataFetch { fn fetch(&mut self, range: Range) -> BoxFuture<'_, Result>; } @@ -39,9 +39,7 @@ impl<'a, T: AsyncFileReader> MetadataFetch for &'a mut T { } /// An asynchronous interface to load [`ParquetMetaData`] from an async source -/// -/// Crate-private until stabilised 
-pub(crate) struct MetadataLoader { +pub struct MetadataLoader { /// Function that fetches byte ranges asynchronously fetch: F, /// The in-progress metadata From c161456158b122345788f86e9302fb4b5340a31e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 28 Nov 2023 12:51:30 -0800 Subject: [PATCH 1374/1411] Support casting of Float16 with other numeric types (#5139) * Support casting of Float16 with other numeric types * Add Float16 test cases --- arrow-cast/src/cast.rs | 159 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 155 insertions(+), 4 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 8facb4f161f4..51acd36c3fe4 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -200,8 +200,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { // start numeric casts ( - UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, - UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, + UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float16 | Float32 | Float64, + UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float16 | Float32 | Float64, ) => true, // end numeric casts @@ -220,8 +220,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Time64(_), Time32(to_unit)) => { matches!(to_unit, Second | Millisecond) } - (Timestamp(_, _), _) if to_type.is_numeric() && to_type != &Float16 => true, - (_, Timestamp(_, _)) if from_type.is_numeric() && from_type != &Float16 => true, + (Timestamp(_, _), _) if to_type.is_numeric() => true, + (_, Timestamp(_, _)) if from_type.is_numeric() => true, (Date64, Timestamp(_, None)) => true, (Date32, Timestamp(_, None)) => true, ( @@ -1367,6 +1367,7 @@ pub fn cast_with_options( (UInt8, Int16) => cast_numeric_arrays::(array, cast_options), (UInt8, Int32) => cast_numeric_arrays::(array, cast_options), (UInt8, Int64) => cast_numeric_arrays::(array, cast_options), + (UInt8, Float16) => cast_numeric_arrays::(array, cast_options), (UInt8, Float32) => cast_numeric_arrays::(array, cast_options), (UInt8, Float64) => cast_numeric_arrays::(array, cast_options), @@ -1377,6 +1378,7 @@ pub fn cast_with_options( (UInt16, Int16) => cast_numeric_arrays::(array, cast_options), (UInt16, Int32) => cast_numeric_arrays::(array, cast_options), (UInt16, Int64) => cast_numeric_arrays::(array, cast_options), + (UInt16, Float16) => cast_numeric_arrays::(array, cast_options), (UInt16, Float32) => cast_numeric_arrays::(array, cast_options), (UInt16, Float64) => cast_numeric_arrays::(array, cast_options), @@ -1387,6 +1389,7 @@ pub fn cast_with_options( (UInt32, Int16) => cast_numeric_arrays::(array, cast_options), (UInt32, Int32) => cast_numeric_arrays::(array, cast_options), (UInt32, Int64) => cast_numeric_arrays::(array, cast_options), + (UInt32, Float16) => cast_numeric_arrays::(array, cast_options), (UInt32, Float32) => cast_numeric_arrays::(array, cast_options), (UInt32, Float64) => cast_numeric_arrays::(array, cast_options), @@ -1397,6 +1400,7 @@ pub fn cast_with_options( (UInt64, Int16) => cast_numeric_arrays::(array, cast_options), (UInt64, Int32) => cast_numeric_arrays::(array, cast_options), (UInt64, Int64) => cast_numeric_arrays::(array, cast_options), + (UInt64, Float16) => cast_numeric_arrays::(array, cast_options), (UInt64, Float32) => cast_numeric_arrays::(array, cast_options), (UInt64, Float64) => cast_numeric_arrays::(array, cast_options), @@ -1407,6 +1411,7 @@ pub fn 
cast_with_options( (Int8, Int16) => cast_numeric_arrays::(array, cast_options), (Int8, Int32) => cast_numeric_arrays::(array, cast_options), (Int8, Int64) => cast_numeric_arrays::(array, cast_options), + (Int8, Float16) => cast_numeric_arrays::(array, cast_options), (Int8, Float32) => cast_numeric_arrays::(array, cast_options), (Int8, Float64) => cast_numeric_arrays::(array, cast_options), @@ -1417,6 +1422,7 @@ pub fn cast_with_options( (Int16, Int8) => cast_numeric_arrays::(array, cast_options), (Int16, Int32) => cast_numeric_arrays::(array, cast_options), (Int16, Int64) => cast_numeric_arrays::(array, cast_options), + (Int16, Float16) => cast_numeric_arrays::(array, cast_options), (Int16, Float32) => cast_numeric_arrays::(array, cast_options), (Int16, Float64) => cast_numeric_arrays::(array, cast_options), @@ -1427,6 +1433,7 @@ pub fn cast_with_options( (Int32, Int8) => cast_numeric_arrays::(array, cast_options), (Int32, Int16) => cast_numeric_arrays::(array, cast_options), (Int32, Int64) => cast_numeric_arrays::(array, cast_options), + (Int32, Float16) => cast_numeric_arrays::(array, cast_options), (Int32, Float32) => cast_numeric_arrays::(array, cast_options), (Int32, Float64) => cast_numeric_arrays::(array, cast_options), @@ -1437,9 +1444,21 @@ pub fn cast_with_options( (Int64, Int8) => cast_numeric_arrays::(array, cast_options), (Int64, Int16) => cast_numeric_arrays::(array, cast_options), (Int64, Int32) => cast_numeric_arrays::(array, cast_options), + (Int64, Float16) => cast_numeric_arrays::(array, cast_options), (Int64, Float32) => cast_numeric_arrays::(array, cast_options), (Int64, Float64) => cast_numeric_arrays::(array, cast_options), + (Float16, UInt8) => cast_numeric_arrays::(array, cast_options), + (Float16, UInt16) => cast_numeric_arrays::(array, cast_options), + (Float16, UInt32) => cast_numeric_arrays::(array, cast_options), + (Float16, UInt64) => cast_numeric_arrays::(array, cast_options), + (Float16, Int8) => cast_numeric_arrays::(array, cast_options), + (Float16, Int16) => cast_numeric_arrays::(array, cast_options), + (Float16, Int32) => cast_numeric_arrays::(array, cast_options), + (Float16, Int64) => cast_numeric_arrays::(array, cast_options), + (Float16, Float32) => cast_numeric_arrays::(array, cast_options), + (Float16, Float64) => cast_numeric_arrays::(array, cast_options), + (Float32, UInt8) => cast_numeric_arrays::(array, cast_options), (Float32, UInt16) => cast_numeric_arrays::(array, cast_options), (Float32, UInt32) => cast_numeric_arrays::(array, cast_options), @@ -1448,6 +1467,7 @@ pub fn cast_with_options( (Float32, Int16) => cast_numeric_arrays::(array, cast_options), (Float32, Int32) => cast_numeric_arrays::(array, cast_options), (Float32, Int64) => cast_numeric_arrays::(array, cast_options), + (Float32, Float16) => cast_numeric_arrays::(array, cast_options), (Float32, Float64) => cast_numeric_arrays::(array, cast_options), (Float64, UInt8) => cast_numeric_arrays::(array, cast_options), @@ -1458,6 +1478,7 @@ pub fn cast_with_options( (Float64, Int16) => cast_numeric_arrays::(array, cast_options), (Float64, Int32) => cast_numeric_arrays::(array, cast_options), (Float64, Int64) => cast_numeric_arrays::(array, cast_options), + (Float64, Float16) => cast_numeric_arrays::(array, cast_options), (Float64, Float32) => cast_numeric_arrays::(array, cast_options), // end numeric casts @@ -3299,6 +3320,7 @@ fn cast_list( #[cfg(test)] mod tests { use arrow_buffer::{Buffer, NullBuffer}; + use half::f16; use super::*; @@ -4665,6 +4687,15 @@ mod tests { let array = 
Int64Array::from(vec![Some(2), Some(10), None]); let expected = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + let array = Float16Array::from(vec![ + Some(f16::from_f32(2.0)), + Some(f16::from_f32(10.6)), + None, + ]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + let array = Float32Array::from(vec![Some(2.0), Some(10.6), None]); let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); @@ -4682,6 +4713,9 @@ mod tests { .with_timezone("UTC".to_string()); let expected = cast(&array, &DataType::Int64).unwrap(); + let actual = cast(&cast(&array, &DataType::Float16).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + let actual = cast(&cast(&array, &DataType::Float32).unwrap(), &DataType::Int64).unwrap(); assert_eq!(&actual, &expected); @@ -6103,6 +6137,25 @@ mod tests { .collect::>() ); + let f16_expected = vec![ + f16::from_f64(-9223372000000000000.0), + f16::from_f64(-2147483600.0), + f16::from_f64(-32768.0), + f16::from_f64(-128.0), + f16::from_f64(0.0), + f16::from_f64(255.0), + f16::from_f64(65535.0), + f16::from_f64(4294967300.0), + f16::from_f64(18446744000000000000.0), + ]; + assert_eq!( + f16_expected, + get_cast_values::(&f64_array, &DataType::Float16) + .iter() + .map(|i| i.parse::().unwrap()) + .collect::>() + ); + let i64_expected = vec![ "-9223372036854775808", "-2147483648", @@ -6247,6 +6300,14 @@ mod tests { get_cast_values::(&f32_array, &DataType::Float32) ); + let f16_expected = vec![ + "-inf", "-inf", "-32768.0", "-128.0", "0.0", "255.0", "inf", "inf", "inf", + ]; + assert_eq!( + f16_expected, + get_cast_values::(&f32_array, &DataType::Float16) + ); + let i64_expected = vec![ "-2147483648", "-2147483648", @@ -6365,6 +6426,21 @@ mod tests { .collect::>() ); + let f16_expected = vec![ + f16::from_f64(0.0), + f16::from_f64(255.0), + f16::from_f64(65535.0), + f16::from_f64(4294967300.0), + f16::from_f64(18446744000000000000.0), + ]; + assert_eq!( + f16_expected, + get_cast_values::(&u64_array, &DataType::Float16) + .iter() + .map(|i| i.parse::().unwrap()) + .collect::>() + ); + let i64_expected = vec!["0", "255", "65535", "4294967295", "null"]; assert_eq!( i64_expected, @@ -6431,6 +6507,12 @@ mod tests { get_cast_values::(&u32_array, &DataType::Float32) ); + let f16_expected = vec!["0.0", "255.0", "inf", "inf"]; + assert_eq!( + f16_expected, + get_cast_values::(&u32_array, &DataType::Float16) + ); + let i64_expected = vec!["0", "255", "65535", "4294967295"]; assert_eq!( i64_expected, @@ -6497,6 +6579,12 @@ mod tests { get_cast_values::(&u16_array, &DataType::Float32) ); + let f16_expected = vec!["0.0", "255.0", "inf"]; + assert_eq!( + f16_expected, + get_cast_values::(&u16_array, &DataType::Float16) + ); + let i64_expected = vec!["0", "255", "65535"]; assert_eq!( i64_expected, @@ -6563,6 +6651,12 @@ mod tests { get_cast_values::(&u8_array, &DataType::Float32) ); + let f16_expected = vec!["0.0", "255.0"]; + assert_eq!( + f16_expected, + get_cast_values::(&u8_array, &DataType::Float16) + ); + let i64_expected = vec!["0", "255"]; assert_eq!( i64_expected, @@ -6665,6 +6759,25 @@ mod tests { .collect::>() ); + let f16_expected = vec![ + f16::from_f64(-9223372000000000000.0), + f16::from_f64(-2147483600.0), + f16::from_f64(-32768.0), + f16::from_f64(-128.0), + f16::from_f64(0.0), + f16::from_f64(127.0), + f16::from_f64(32767.0), + f16::from_f64(2147483600.0), + f16::from_f64(9223372000000000000.0), + ]; + assert_eq!( + 
f16_expected, + get_cast_values::(&i64_array, &DataType::Float16) + .iter() + .map(|i| i.parse::().unwrap()) + .collect::>() + ); + let i64_expected = vec![ "-9223372036854775808", "-2147483648", @@ -6808,6 +6921,23 @@ mod tests { get_cast_values::(&i32_array, &DataType::Float32) ); + let f16_expected = vec![ + f16::from_f64(-2147483600.0), + f16::from_f64(-32768.0), + f16::from_f64(-128.0), + f16::from_f64(0.0), + f16::from_f64(127.0), + f16::from_f64(32767.0), + f16::from_f64(2147483600.0), + ]; + assert_eq!( + f16_expected, + get_cast_values::(&i32_array, &DataType::Float16) + .iter() + .map(|i| i.parse::().unwrap()) + .collect::>() + ); + let i16_expected = vec!["null", "-32768", "-128", "0", "127", "32767", "null"]; assert_eq!( i16_expected, @@ -6877,6 +7007,21 @@ mod tests { get_cast_values::(&i16_array, &DataType::Float32) ); + let f16_expected = vec![ + f16::from_f64(-32768.0), + f16::from_f64(-128.0), + f16::from_f64(0.0), + f16::from_f64(127.0), + f16::from_f64(32767.0), + ]; + assert_eq!( + f16_expected, + get_cast_values::(&i16_array, &DataType::Float16) + .iter() + .map(|i| i.parse::().unwrap()) + .collect::>() + ); + let i64_expected = vec!["-32768", "-128", "0", "127", "32767"]; assert_eq!( i64_expected, @@ -6971,6 +7116,12 @@ mod tests { get_cast_values::(&i8_array, &DataType::Float32) ); + let f16_expected = vec!["-128.0", "0.0", "127.0"]; + assert_eq!( + f16_expected, + get_cast_values::(&i8_array, &DataType::Float16) + ); + let i64_expected = vec!["-128", "0", "127"]; assert_eq!( i64_expected, From ef6932f31e243d8545e097569653c8d3f1365b4d Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Wed, 29 Nov 2023 10:40:01 +1100 Subject: [PATCH 1375/1411] JSON: write struct array nulls as null (#5133) * JSON: write struct array nulls as null * Fix * Fix * Update arrow-json/src/writer.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Refactoring --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-json/src/writer.rs | 156 +++++++++++++++++++++++++++++---------- 1 file changed, 117 insertions(+), 39 deletions(-) diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 4f74817ca1e3..cabda5e2dca8 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -129,12 +129,14 @@ where fn struct_array_to_jsonmap_array( array: &StructArray, explicit_nulls: bool, -) -> Result>, ArrowError> { +) -> Result>>, ArrowError> { let inner_col_names = array.column_names(); - let mut inner_objs = iter::repeat(JsonMap::new()) - .take(array.len()) - .collect::>>(); + let mut inner_objs = (0..array.len()) + // Ensure we write nulls for struct arrays as nulls in JSON + // Instead of writing a struct with nulls + .map(|index| array.is_valid(index).then(JsonMap::new)) + .collect::>>>(); for (j, struct_col) in array.columns().iter().enumerate() { set_column_for_json_rows( @@ -227,7 +229,11 @@ fn array_to_json_array_internal( .collect(), DataType::Struct(_) => { let jsonmaps = struct_array_to_jsonmap_array(array.as_struct(), explicit_nulls)?; - Ok(jsonmaps.into_iter().map(Value::Object).collect()) + let json_values = jsonmaps + .into_iter() + .map(|maybe_map| maybe_map.map(Value::Object).unwrap_or(Value::Null)) + .collect(); + Ok(json_values) } DataType::Map(_, _) => as_map_array(array) .iter() @@ -251,6 +257,7 @@ macro_rules! 
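A minimal sketch of the Float16 casting enabled by #5139 (the values are illustrative; the new paths behave like the existing Float32/Float64 numeric casts):

    use arrow_array::{Array, Float16Array};
    use arrow_cast::cast::cast;
    use arrow_schema::DataType;
    use half::f16;

    let vals = Float16Array::from(vec![Some(f16::from_f32(1.5)), Some(f16::from_f32(2.0)), None]);
    // Float16 now takes the same numeric cast path as the other float types.
    let as_i32 = cast(&vals, &DataType::Int32).unwrap();
    let back = cast(&as_i32, &DataType::Float16).unwrap();
    assert_eq!(back.data_type(), &DataType::Float16);
    assert_eq!(back.len(), 3);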
set_column_by_array_type { $rows .iter_mut() .zip(arr.iter()) + .filter_map(|(maybe_row, maybe_value)| maybe_row.as_mut().map(|row| (row, maybe_value))) .for_each(|(row, maybe_value)| { if let Some(j) = maybe_value.map(Into::into) { row.insert($col_name.to_string(), j); @@ -262,7 +269,7 @@ macro_rules! set_column_by_array_type { } fn set_column_by_primitive_type( - rows: &mut [JsonMap], + rows: &mut [Option>], array: &ArrayRef, col_name: &str, explicit_nulls: bool, @@ -274,6 +281,7 @@ fn set_column_by_primitive_type( rows.iter_mut() .zip(primitive_arr.iter()) + .filter_map(|(maybe_row, maybe_value)| maybe_row.as_mut().map(|row| (row, maybe_value))) .for_each(|(row, maybe_value)| { if let Some(j) = maybe_value.and_then(|v| v.into_json_value()) { row.insert(col_name.to_string(), j); @@ -284,7 +292,7 @@ fn set_column_by_primitive_type( } fn set_column_for_json_rows( - rows: &mut [JsonMap], + rows: &mut [Option>], array: &ArrayRef, col_name: &str, explicit_nulls: bool, @@ -325,9 +333,11 @@ fn set_column_for_json_rows( } DataType::Null => { if explicit_nulls { - rows.iter_mut().for_each(|row| { - row.insert(col_name.to_string(), Value::Null); - }); + rows.iter_mut() + .filter_map(|maybe_row| maybe_row.as_mut()) + .for_each(|row| { + row.insert(col_name.to_string(), Value::Null); + }); } } DataType::Boolean => { @@ -348,28 +358,43 @@ fn set_column_for_json_rows( let options = FormatOptions::default(); let formatter = ArrayFormatter::try_new(array.as_ref(), &options)?; let nulls = array.nulls(); - rows.iter_mut().enumerate().for_each(|(idx, row)| { - let maybe_value = nulls - .map(|x| x.is_valid(idx)) - .unwrap_or(true) - .then(|| formatter.value(idx).to_string().into()); - if let Some(j) = maybe_value { - row.insert(col_name.to_string(), j); - } else if explicit_nulls { - row.insert(col_name.to_string(), Value::Null); - }; - }); + rows.iter_mut() + .enumerate() + .filter_map(|(idx, maybe_row)| maybe_row.as_mut().map(|row| (idx, row))) + .for_each(|(idx, row)| { + let maybe_value = nulls + .map(|x| x.is_valid(idx)) + .unwrap_or(true) + .then(|| formatter.value(idx).to_string().into()); + if let Some(j) = maybe_value { + row.insert(col_name.to_string(), j); + } else if explicit_nulls { + row.insert(col_name.to_string(), Value::Null); + } + }); } DataType::Struct(_) => { let inner_objs = struct_array_to_jsonmap_array(array.as_struct(), explicit_nulls)?; - rows.iter_mut().zip(inner_objs).for_each(|(row, obj)| { - row.insert(col_name.to_string(), Value::Object(obj)); - }); + rows.iter_mut() + .zip(inner_objs) + .filter_map(|(maybe_row, maybe_obj)| maybe_row.as_mut().map(|row| (row, maybe_obj))) + .for_each(|(row, maybe_obj)| { + let json = if let Some(obj) = maybe_obj { + Value::Object(obj) + } else { + Value::Null + }; + row.insert(col_name.to_string(), json); + }); } DataType::List(_) => { let listarr = as_list_array(array); - rows.iter_mut().zip(listarr.iter()).try_for_each( - |(row, maybe_value)| -> Result<(), ArrowError> { + rows.iter_mut() + .zip(listarr.iter()) + .filter_map(|(maybe_row, maybe_value)| { + maybe_row.as_mut().map(|row| (row, maybe_value)) + }) + .try_for_each(|(row, maybe_value)| -> Result<(), ArrowError> { let maybe_value = maybe_value .map(|v| array_to_json_array_internal(&v, explicit_nulls).map(Value::Array)) .transpose()?; @@ -379,13 +404,16 @@ fn set_column_for_json_rows( row.insert(col_name.to_string(), Value::Null); } Ok(()) - }, - )?; + })?; } DataType::LargeList(_) => { let listarr = as_large_list_array(array); - rows.iter_mut().zip(listarr.iter()).try_for_each( - 
|(row, maybe_value)| -> Result<(), ArrowError> { + rows.iter_mut() + .zip(listarr.iter()) + .filter_map(|(maybe_row, maybe_value)| { + maybe_row.as_mut().map(|row| (row, maybe_value)) + }) + .try_for_each(|(row, maybe_value)| -> Result<(), ArrowError> { let maybe_value = maybe_value .map(|v| array_to_json_array_internal(&v, explicit_nulls).map(Value::Array)) .transpose()?; @@ -395,8 +423,7 @@ fn set_column_for_json_rows( row.insert(col_name.to_string(), Value::Null); } Ok(()) - }, - )?; + })?; } DataType::Dictionary(_, value_type) => { let hydrated = arrow_cast::cast::cast(&array, value_type) @@ -422,7 +449,11 @@ fn set_column_for_json_rows( let mut kv = keys.iter().zip(values); - for (i, row) in rows.iter_mut().enumerate() { + for (i, row) in rows + .iter_mut() + .enumerate() + .filter_map(|(i, maybe_row)| maybe_row.as_mut().map(|row| (i, row))) + { if maparr.is_null(i) { row.insert(col_name.to_string(), serde_json::Value::Null); continue; @@ -461,7 +492,7 @@ fn record_batches_to_json_rows_internal( batches: &[&RecordBatch], explicit_nulls: bool, ) -> Result>, ArrowError> { - let mut rows: Vec> = iter::repeat(JsonMap::new()) + let mut rows: Vec>> = iter::repeat(Some(JsonMap::new())) .take(batches.iter().map(|b| b.num_rows()).sum()) .collect(); @@ -479,6 +510,7 @@ fn record_batches_to_json_rows_internal( } } + let rows = rows.into_iter().map(|a| a.unwrap()).collect::>(); Ok(rows) } @@ -1478,10 +1510,6 @@ mod tests { writer.write_batches(&[&batch]).unwrap(); } - // NOTE: The last value should technically be {"list": [null]} but it appears - // that implementations differ on the treatment of a null struct. - // It would be more accurate to return a null struct, so this can be done - // as a follow up. assert_json_eq( &buf, r#"{"list":[{"ints":1}]} @@ -1489,7 +1517,57 @@ mod tests { {"list":[]} {} {"list":[{}]} -{"list":[{}]} +{"list":[null]} +"#, + ); + } + + #[test] + fn json_struct_array_nulls() { + let inner = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2)]), + Some(vec![None]), + Some(vec![]), + Some(vec![Some(3), None]), // masked for a + Some(vec![Some(4), Some(5)]), + None, // masked for a + None, + ]); + + let field = Arc::new(Field::new("list", inner.data_type().clone(), true)); + let array = Arc::new(inner) as ArrayRef; + let struct_array_a = StructArray::from(( + vec![(field.clone(), array.clone())], + Buffer::from([0b01010111]), + )); + let struct_array_b = StructArray::from(vec![(field, array)]); + + let schema = Schema::new(vec![ + Field::new_struct("a", struct_array_a.fields().clone(), true), + Field::new_struct("b", struct_array_b.fields().clone(), true), + ]); + + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(struct_array_a), Arc::new(struct_array_b)], + ) + .unwrap(); + + let mut buf = Vec::new(); + { + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write_batches(&[&batch]).unwrap(); + } + + assert_json_eq( + &buf, + r#"{"a":{"list":[1,2]},"b":{"list":[1,2]}} +{"a":{"list":[null]},"b":{"list":[null]}} +{"a":{"list":[]},"b":{"list":[]}} +{"a":null,"b":{"list":[3,null]}} +{"a":{"list":[4,5]},"b":{"list":[4,5]}} +{"a":null,"b":{}} +{"a":{},"b":{}} "#, ); } From cfdb505c97820426425b70fc8fe89022a35944a6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 29 Nov 2023 12:45:18 +0000 Subject: [PATCH 1376/1411] Deprecate Fields::remove Schema::remove (#5144) --- arrow-schema/src/fields.rs | 2 ++ arrow-schema/src/schema.rs | 3 +++ 2 files changed, 5 insertions(+) diff 
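A minimal sketch of the struct-null handling changed by #5133 (column and field names are illustrative): a null struct row is now serialized as `null` rather than as an empty object.

    use std::sync::Arc;
    use arrow_array::{ArrayRef, Int32Array, RecordBatch, StructArray};
    use arrow_buffer::Buffer;
    use arrow_json::LineDelimitedWriter;
    use arrow_schema::{DataType, Field};

    let ints: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
    let field = Arc::new(Field::new("a", DataType::Int32, true));
    // Validity mask 0b01: row 0 is valid, row 1 is a null struct.
    let strukt = StructArray::from((vec![(field, ints)], Buffer::from([0b01_u8])));
    let batch = RecordBatch::try_from_iter([("s", Arc::new(strukt) as ArrayRef)]).unwrap();

    let mut buf = Vec::new();
    LineDelimitedWriter::new(&mut buf).write_batches(&[&batch]).unwrap();
    let json = String::from_utf8(buf).unwrap();
    assert!(json.contains("{\"s\":null}"));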
--git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 70cb1968e9a4..f90632455fd9 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -117,6 +117,8 @@ impl Fields { /// assert_eq!(fields.remove(1), Field::new("b", DataType::Int8, false).into()); /// assert_eq!(fields.len(), 2); /// ``` + #[deprecated(note = "Use SchemaBuilder::remove")] + #[doc(hidden)] pub fn remove(&mut self, index: usize) -> FieldRef { let mut builder = SchemaBuilder::from(Fields::from(&*self.0)); let field = builder.remove(index); diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 711e4cb3314d..e547e5df3a5a 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -402,6 +402,9 @@ impl Schema { /// assert_eq!(schema.remove(1), Field::new("b", DataType::Int8, false).into()); /// assert_eq!(schema.fields.len(), 2); /// ``` + #[deprecated(note = "Use SchemaBuilder::remove")] + #[doc(hidden)] + #[allow(deprecated)] pub fn remove(&mut self, index: usize) -> FieldRef { self.fields.remove(index) } From 8867a1f433446d134da07f8d12cc82acfa094f46 Mon Sep 17 00:00:00 2001 From: emcake <3726783+emcake@users.noreply.github.com> Date: Wed, 29 Nov 2023 17:42:10 +0000 Subject: [PATCH 1377/1411] Allow 403 for overwrite prevention (#5134) * Allow 403 for overwrite prevention * implment instead via a new 'return code override' key * add with_... method * rework: implement via header-with-status * Update object_store/src/aws/precondition.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update object_store/src/aws/precondition.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update object_store/src/aws/precondition.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update object_store/src/aws/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * review comments * clipps lints & docs --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/aws/client.rs | 12 +++- object_store/src/aws/precondition.rs | 98 +++++++++++++++++++++++++++- 2 files changed, 106 insertions(+), 4 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 3e47abd4bcc5..ecbe556c6dfe 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -45,7 +45,7 @@ use percent_encoding::{utf8_percent_encode, PercentEncode}; use quick_xml::events::{self as xml_events}; use reqwest::{ header::{CONTENT_LENGTH, CONTENT_TYPE}, - Client as ReqwestClient, Method, RequestBuilder, Response, StatusCode, + Client as ReqwestClient, Method, RequestBuilder, Response, }; use serde::{Deserialize, Serialize}; use snafu::{ResultExt, Snafu}; @@ -466,6 +466,9 @@ impl S3Client { Some(S3CopyIfNotExists::Header(k, v)) => { builder = builder.header(k, v); } + Some(S3CopyIfNotExists::HeaderWithStatus(k, v, _)) => { + builder = builder.header(k, v); + } None => { return Err(crate::Error::NotSupported { source: "S3 does not support copy-if-not-exists".to_string().into(), @@ -474,6 +477,11 @@ impl S3Client { } } + let precondition_failure = match &self.config.copy_if_not_exists { + Some(S3CopyIfNotExists::HeaderWithStatus(_, _, code)) => *code, + _ => reqwest::StatusCode::PRECONDITION_FAILED, + }; + builder .with_aws_sigv4( credential.as_deref(), @@ -485,7 +493,7 @@ impl S3Client { .send_retry(&self.config.retry_config) .await .map_err(|source| match 
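As a follow-on to the #5144 deprecation, a minimal sketch of the suggested replacement (field names are illustrative):

    use arrow_schema::{DataType, Field, Schema, SchemaBuilder};

    let schema = Schema::new(vec![
        Field::new("a", DataType::Int32, false),
        Field::new("b", DataType::Utf8, false),
    ]);
    // Preferred over the deprecated Fields::remove / Schema::remove.
    let mut builder = SchemaBuilder::from(schema.fields().clone());
    builder.remove(0);
    let pruned = builder.finish();
    assert_eq!(pruned.fields().len(), 1);
    assert_eq!(pruned.field(0).name(), "b");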
source.status() { - Some(StatusCode::PRECONDITION_FAILED) => crate::Error::AlreadyExists { + Some(error) if error == precondition_failure => crate::Error::AlreadyExists { source: Box::new(source), path: to.to_string(), }, diff --git a/object_store/src/aws/precondition.rs b/object_store/src/aws/precondition.rs index a50b57fe23f7..ada5f3b83f07 100644 --- a/object_store/src/aws/precondition.rs +++ b/object_store/src/aws/precondition.rs @@ -17,11 +17,13 @@ use crate::config::Parse; +use itertools::Itertools; + /// Configure how to provide [`ObjectStore::copy_if_not_exists`] for [`AmazonS3`]. /// /// [`ObjectStore::copy_if_not_exists`]: crate::ObjectStore::copy_if_not_exists /// [`AmazonS3`]: super::AmazonS3 -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq)] #[non_exhaustive] pub enum S3CopyIfNotExists { /// Some S3-compatible stores, such as Cloudflare R2, support copy if not exists @@ -29,7 +31,7 @@ pub enum S3CopyIfNotExists { /// /// If set, [`ObjectStore::copy_if_not_exists`] will perform a normal copy operation /// with the provided header pair, and expect the store to fail with `412 Precondition Failed` - /// if the destination file already exists + /// if the destination file already exists. /// /// Encoded as `header::` ignoring whitespace /// @@ -38,12 +40,20 @@ pub enum S3CopyIfNotExists { /// /// [`ObjectStore::copy_if_not_exists`]: crate::ObjectStore::copy_if_not_exists Header(String, String), + /// The same as [`S3CopyIfNotExists::Header`] but allows custom status code checking, for object stores that return values + /// other than 412. + /// + /// Encoded as `header-with-status:::` ignoring whitespace + HeaderWithStatus(String, String, reqwest::StatusCode), } impl std::fmt::Display for S3CopyIfNotExists { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Header(k, v) => write!(f, "header: {}: {}", k, v), + Self::HeaderWithStatus(k, v, code) => { + write!(f, "header-with-status: {k}: {v}: {}", code.as_u16()) + } } } } @@ -56,6 +66,17 @@ impl S3CopyIfNotExists { let (k, v) = value.split_once(':')?; Some(Self::Header(k.trim().to_string(), v.trim().to_string())) } + "header-with-status" => { + let (k, v, status) = value.split(':').collect_tuple()?; + + let code = status.trim().parse().ok()?; + + Some(Self::HeaderWithStatus( + k.trim().to_string(), + v.trim().to_string(), + code, + )) + } _ => None, } } @@ -111,3 +132,76 @@ impl Parse for S3ConditionalPut { }) } } + +#[cfg(test)] +mod tests { + use super::S3CopyIfNotExists; + + #[test] + fn parse_s3_copy_if_not_exists_header() { + let input = "header: cf-copy-destination-if-none-match: *"; + let expected = Some(S3CopyIfNotExists::Header( + "cf-copy-destination-if-none-match".to_owned(), + "*".to_owned(), + )); + + assert_eq!(expected, S3CopyIfNotExists::from_str(input)); + } + + #[test] + fn parse_s3_copy_if_not_exists_header_with_status() { + let input = "header-with-status:key:value:403"; + let expected = Some(S3CopyIfNotExists::HeaderWithStatus( + "key".to_owned(), + "value".to_owned(), + reqwest::StatusCode::FORBIDDEN, + )); + + assert_eq!(expected, S3CopyIfNotExists::from_str(input)); + } + + #[test] + fn parse_s3_copy_if_not_exists_header_whitespace_invariant() { + let expected = Some(S3CopyIfNotExists::Header( + "cf-copy-destination-if-none-match".to_owned(), + "*".to_owned(), + )); + + const INPUTS: &[&str] = &[ + "header:cf-copy-destination-if-none-match:*", + "header: cf-copy-destination-if-none-match:*", + "header: cf-copy-destination-if-none-match: *", + "header : 
cf-copy-destination-if-none-match: *", + "header : cf-copy-destination-if-none-match : *", + "header : cf-copy-destination-if-none-match : * ", + ]; + + for input in INPUTS { + assert_eq!(expected, S3CopyIfNotExists::from_str(input)); + } + } + + #[test] + fn parse_s3_copy_if_not_exists_header_with_status_whitespace_invariant() { + let expected = Some(S3CopyIfNotExists::HeaderWithStatus( + "key".to_owned(), + "value".to_owned(), + reqwest::StatusCode::FORBIDDEN, + )); + + const INPUTS: &[&str] = &[ + "header-with-status:key:value:403", + "header-with-status: key:value:403", + "header-with-status: key: value:403", + "header-with-status: key: value: 403", + "header-with-status : key: value: 403", + "header-with-status : key : value: 403", + "header-with-status : key : value : 403", + "header-with-status : key : value : 403 ", + ]; + + for input in INPUTS { + assert_eq!(expected, S3CopyIfNotExists::from_str(input)); + } + } +} From 6d4b8bbad95c7e4fec0c4f1fb755ad7a1c542983 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 29 Nov 2023 21:49:38 +0000 Subject: [PATCH 1378/1411] Support nested schema projection (#5148) (#5149) * Support nested schema projection * Tweak doc * Review feedback --- arrow-schema/src/fields.rs | 232 ++++++++++++++++++++++++++++++++++++- 1 file changed, 231 insertions(+), 1 deletion(-) diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index f90632455fd9..400f42c59c30 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -15,10 +15,11 @@ // specific language governing permissions and limitations // under the License. -use crate::{ArrowError, Field, FieldRef, SchemaBuilder}; use std::ops::Deref; use std::sync::Arc; +use crate::{ArrowError, DataType, Field, FieldRef, SchemaBuilder}; + /// A cheaply cloneable, owned slice of [`FieldRef`] /// /// Similar to `Arc>` or `Arc<[FieldRef]>` @@ -99,6 +100,108 @@ impl Fields { .all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b)) } + /// Returns a copy of this [`Fields`] containing only those [`FieldRef`] passing a predicate + /// + /// Performs a depth-first scan of [`Fields`] invoking `filter` for each [`FieldRef`] + /// containing no child [`FieldRef`], a leaf field, along with a count of the number + /// of such leaves encountered so far. Only [`FieldRef`] for which `filter` + /// returned `true` will be included in the result. + /// + /// This can therefore be used to select a subset of fields from nested types + /// such as [`DataType::Struct`] or [`DataType::List`]. 
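A minimal sketch of wiring the `header-with-status` form from #5134 into a client (the header name, value, and status code are illustrative and must match what the target store expects):

    use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey};

    // Parsed into S3CopyIfNotExists::HeaderWithStatus("key", "value", 403), so a 403
    // response to the copy request is reported as Error::AlreadyExists.
    let _builder = AmazonS3Builder::new()
        .with_config(AmazonS3ConfigKey::CopyIfNotExists, "header-with-status: key: value: 403");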
+ /// + /// ``` + /// # use arrow_schema::{DataType, Field, Fields}; + /// let fields = Fields::from(vec![ + /// Field::new("a", DataType::Int32, true), // Leaf 0 + /// Field::new("b", DataType::Struct(Fields::from(vec![ + /// Field::new("c", DataType::Float32, false), // Leaf 1 + /// Field::new("d", DataType::Float64, false), // Leaf 2 + /// Field::new("e", DataType::Struct(Fields::from(vec![ + /// Field::new("f", DataType::Int32, false), // Leaf 3 + /// Field::new("g", DataType::Float16, false), // Leaf 4 + /// ])), true), + /// ])), false) + /// ]); + /// let filtered = fields.filter_leaves(|idx, _| [0, 2, 3, 4].contains(&idx)); + /// let expected = Fields::from(vec![ + /// Field::new("a", DataType::Int32, true), + /// Field::new("b", DataType::Struct(Fields::from(vec![ + /// Field::new("d", DataType::Float64, false), + /// Field::new("e", DataType::Struct(Fields::from(vec![ + /// Field::new("f", DataType::Int32, false), + /// Field::new("g", DataType::Float16, false), + /// ])), true), + /// ])), false) + /// ]); + /// assert_eq!(filtered, expected); + /// ``` + pub fn filter_leaves bool>(&self, mut filter: F) -> Self { + fn filter_field bool>( + f: &FieldRef, + filter: &mut F, + ) -> Option { + use DataType::*; + + let v = match f.data_type() { + Dictionary(_, v) => v.as_ref(), // Key must be integer + RunEndEncoded(_, v) => v.data_type(), // Run-ends must be integer + d => d, + }; + let d = match v { + List(child) => List(filter_field(child, filter)?), + LargeList(child) => LargeList(filter_field(child, filter)?), + Map(child, ordered) => Map(filter_field(child, filter)?, *ordered), + FixedSizeList(child, size) => FixedSizeList(filter_field(child, filter)?, *size), + Struct(fields) => { + let filtered: Fields = fields + .iter() + .filter_map(|f| filter_field(f, filter)) + .collect(); + + if filtered.is_empty() { + return None; + } + + Struct(filtered) + } + Union(fields, mode) => { + let filtered: UnionFields = fields + .iter() + .filter_map(|(id, f)| Some((id, filter_field(f, filter)?))) + .collect(); + + if filtered.is_empty() { + return None; + } + + Union(filtered, *mode) + } + _ => return filter(f).then(|| f.clone()), + }; + let d = match f.data_type() { + Dictionary(k, _) => Dictionary(k.clone(), Box::new(d)), + RunEndEncoded(v, f) => { + RunEndEncoded(v.clone(), Arc::new(f.as_ref().clone().with_data_type(d))) + } + _ => d, + }; + Some(Arc::new(f.as_ref().clone().with_data_type(d))) + } + + let mut leaf_idx = 0; + let mut filter = |f: &FieldRef| { + let t = filter(leaf_idx, f); + leaf_idx += 1; + t + }; + + self.0 + .iter() + .filter_map(|f| filter_field(f, &mut filter)) + .collect() + } + /// Remove a field by index and return it. 
/// /// # Panic @@ -307,3 +410,130 @@ impl FromIterator<(i8, FieldRef)> for UnionFields { Self(iter.into_iter().collect()) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::UnionMode; + + #[test] + fn test_filter() { + let floats = Fields::from(vec![ + Field::new("a", DataType::Float32, false), + Field::new("b", DataType::Float32, false), + ]); + let fields = Fields::from(vec![ + Field::new("a", DataType::Int32, true), + Field::new("floats", DataType::Struct(floats.clone()), true), + Field::new("b", DataType::Int16, true), + Field::new( + "c", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new( + "d", + DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Struct(floats.clone())), + ), + false, + ), + Field::new_list( + "e", + Field::new("floats", DataType::Struct(floats.clone()), true), + true, + ), + Field::new( + "f", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3), + false, + ), + Field::new_map( + "g", + "entries", + Field::new("keys", DataType::LargeUtf8, false), + Field::new("values", DataType::Int32, true), + false, + false, + ), + Field::new( + "h", + DataType::Union( + UnionFields::new( + vec![1, 3], + vec![ + Field::new("field1", DataType::UInt8, false), + Field::new("field3", DataType::Utf8, false), + ], + ), + UnionMode::Dense, + ), + true, + ), + Field::new( + "i", + DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Struct(floats.clone()), true)), + ), + false, + ), + ]); + + let floats_a = DataType::Struct(vec![floats[0].clone()].into()); + + let r = fields.filter_leaves(|idx, _| idx == 0 || idx == 1); + assert_eq!(r.len(), 2); + assert_eq!(r[0], fields[0]); + assert_eq!(r[1].data_type(), &floats_a); + + let r = fields.filter_leaves(|_, f| f.name() == "a"); + assert_eq!(r.len(), 5); + assert_eq!(r[0], fields[0]); + assert_eq!(r[1].data_type(), &floats_a); + assert_eq!( + r[2].data_type(), + &DataType::Dictionary(Box::new(DataType::Int32), Box::new(floats_a.clone())) + ); + assert_eq!( + r[3].as_ref(), + &Field::new_list("e", Field::new("floats", floats_a.clone(), true), true) + ); + assert_eq!( + r[4].as_ref(), + &Field::new( + "i", + DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", floats_a.clone(), true)), + ), + false, + ) + ); + + let r = fields.filter_leaves(|_, f| f.name() == "floats"); + assert_eq!(r.len(), 0); + + let r = fields.filter_leaves(|idx, _| idx == 9); + assert_eq!(r.len(), 1); + assert_eq!(r[0], fields[6]); + + let r = fields.filter_leaves(|idx, _| idx == 10 || idx == 11); + assert_eq!(r.len(), 1); + assert_eq!(r[0], fields[7]); + + let union = DataType::Union( + UnionFields::new(vec![1], vec![Field::new("field1", DataType::UInt8, false)]), + UnionMode::Dense, + ); + + let r = fields.filter_leaves(|idx, _| idx == 12); + assert_eq!(r.len(), 1); + assert_eq!(r[0].data_type(), &union); + + let r = fields.filter_leaves(|idx, _| idx == 14 || idx == 15); + assert_eq!(r.len(), 1); + assert_eq!(r[0], fields[9]); + } +} From f621d28db590ff6ad3907450f7ff434c7deb9766 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Fri, 1 Dec 2023 04:46:36 +1100 Subject: [PATCH 1379/1411] Parquet: omit min/max for interval columns when writing stats (#5147) * Parquet: omit min/max for interval columns when writing stats * Trigger --- parquet/src/column/writer/encoder.rs | 7 +++- 
parquet/src/column/writer/mod.rs | 59 +++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index d0720dd24306..0d5144f61c26 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -18,7 +18,7 @@ use bytes::Bytes; use half::f16; -use crate::basic::{Encoding, LogicalType, Type}; +use crate::basic::{ConvertedType, Encoding, LogicalType, Type}; use crate::bloom_filter::Sbbf; use crate::column::writer::{ compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, update_min, @@ -137,7 +137,10 @@ pub struct ColumnValueEncoderImpl { impl ColumnValueEncoderImpl { fn write_slice(&mut self, slice: &[T::T]) -> Result<()> { - if self.statistics_enabled == EnabledStatistics::Page { + if self.statistics_enabled == EnabledStatistics::Page + // INTERVAL has undefined sort order, so don't write min/max stats for it + && self.descr.converted_type() != ConvertedType::INTERVAL + { if let Some((min, max)) = self.min_max(slice, None) { update_min(&self.descr, &min, &mut self.min_value); update_max(&self.descr, &max, &mut self.max_value); diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 14b8655091e4..e92a502689a3 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -332,7 +332,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // If only computing chunk-level statistics compute them here, page-level statistics // are computed in [`Self::write_mini_batch`] and used to update chunk statistics in // [`Self::add_data_page`] - if self.statistics_enabled == EnabledStatistics::Chunk { + if self.statistics_enabled == EnabledStatistics::Chunk + // INTERVAL has undefined sort order, so don't write min/max stats for it + && self.descr.converted_type() != ConvertedType::INTERVAL + { match (min, max) { (Some(min), Some(max)) => { update_min(&self.descr, min, &mut self.column_metrics.min_column_value); @@ -1093,7 +1096,6 @@ fn is_nan(descr: &ColumnDescriptor, val: &T) -> bool { /// /// If `cur` is `None`, sets `cur` to `Some(val)`, otherwise calls `should_update` with /// the value of `cur`, and updates `cur` to `Some(val)` if it returns `true` - fn update_stat( descr: &ColumnDescriptor, val: &T, @@ -3066,6 +3068,30 @@ mod tests { Ok(()) } + #[test] + fn test_interval_stats_should_not_have_min_max() { + let input = [ + vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], + vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], + ] + .into_iter() + .map(|s| ByteArray::from(s).into()) + .collect::>(); + + let page_writer = get_test_page_writer(); + let mut writer = get_test_interval_column_writer(page_writer); + writer.write_batch(&input, None, None).unwrap(); + + let metadata = writer.close().unwrap().metadata; + let stats = if let Some(Statistics::FixedLenByteArray(stats)) = metadata.statistics() { + stats.clone() + } else { + panic!("metadata missing statistics"); + }; + assert!(!stats.has_min_max_set()); + } + fn write_multiple_pages( column_descr: &Arc, pages: &[&[Option]], @@ -3395,8 +3421,7 @@ mod tests { values: &[FixedLenByteArray], ) -> ValueStatistics { let page_writer = get_test_page_writer(); - let props = Default::default(); - let mut writer = get_test_float16_column_writer(page_writer, 0, 0, props); + let mut writer = get_test_float16_column_writer(page_writer); writer.write_batch(values, None, None).unwrap(); let metadata = 
writer.close().unwrap().metadata; @@ -3409,12 +3434,9 @@ mod tests { fn get_test_float16_column_writer( page_writer: Box, - max_def_level: i16, - max_rep_level: i16, - props: WriterPropertiesPtr, ) -> ColumnWriterImpl<'static, FixedLenByteArrayType> { - let descr = Arc::new(get_test_float16_column_descr(max_def_level, max_rep_level)); - let column_writer = get_column_writer(descr, props, page_writer); + let descr = Arc::new(get_test_float16_column_descr(0, 0)); + let column_writer = get_column_writer(descr, Default::default(), page_writer); get_typed_column_writer::(column_writer) } @@ -3429,6 +3451,25 @@ mod tests { ColumnDescriptor::new(Arc::new(tpe), max_def_level, max_rep_level, path) } + fn get_test_interval_column_writer( + page_writer: Box, + ) -> ColumnWriterImpl<'static, FixedLenByteArrayType> { + let descr = Arc::new(get_test_interval_column_descr()); + let column_writer = get_column_writer(descr, Default::default(), page_writer); + get_typed_column_writer::(column_writer) + } + + fn get_test_interval_column_descr() -> ColumnDescriptor { + let path = ColumnPath::from("col"); + let tpe = + SchemaType::primitive_type_builder("col", FixedLenByteArrayType::get_physical_type()) + .with_length(12) + .with_converted_type(ConvertedType::INTERVAL) + .build() + .unwrap(); + ColumnDescriptor::new(Arc::new(tpe), 0, 0, path) + } + /// Returns column writer for UINT32 Column provided as ConvertedType only fn get_test_unsigned_int_given_as_converted_column_writer<'a, T: DataType>( page_writer: Box, From 5788c69958f99694afb39e13f199f6e2b3999a9a Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 5 Dec 2023 21:03:26 +1100 Subject: [PATCH 1380/1411] object_store: fix failing doctest with default features (#5161) --- object_store/Cargo.toml | 4 ++-- object_store/src/lib.rs | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 2f5157c40e67..d5cf91c3324f 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -67,10 +67,10 @@ http = ["cloud"] tls-webpki-roots = ["reqwest?/rustls-tls-webpki-roots"] [dev-dependencies] # In alphabetical order -tempfile = "3.1.0" futures-test = "0.3" -rand = "0.8" hyper = { version = "0.14.24", features = ["server"] } +rand = "0.8" +tempfile = "3.1.0" [[test]] name = "get_range_file" diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 5c5c70de3a2b..3a841667ff97 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -119,6 +119,7 @@ //! application complexity. //! //! ```no_run +//! # #[cfg(feature = "aws")] { //! # use url::Url; //! # use object_store::{parse_url, parse_url_opts}; //! # use object_store::aws::{AmazonS3, AmazonS3Builder}; @@ -140,6 +141,7 @@ //! let url = Url::parse("https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket/path").unwrap(); //! let (store, path) = parse_url(&url).unwrap(); //! assert_eq!(path.as_ref(), "path"); +//! # } //! ``` //! //! 
[PyArrow FileSystem]: https://arrow.apache.org/docs/python/generated/pyarrow.fs.FileSystem.html#pyarrow.fs.FileSystem.from_uri From 9efaf062476276570ea4c6c7b7ce698d8c2b043d Mon Sep 17 00:00:00 2001 From: comphead Date: Tue, 5 Dec 2023 02:05:31 -0800 Subject: [PATCH 1381/1411] Adding `is_null` datatype shortcut method (#5157) --- arrow-schema/src/datatype.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index b78c785ae279..330ae5c9e346 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -350,23 +350,27 @@ impl DataType { } /// Returns true if this type is floating: (Float*). + #[inline] pub fn is_floating(&self) -> bool { use DataType::*; matches!(self, Float16 | Float32 | Float64) } /// Returns true if this type is integer: (Int*, UInt*). + #[inline] pub fn is_integer(&self) -> bool { self.is_signed_integer() || self.is_unsigned_integer() } /// Returns true if this type is signed integer: (Int*). + #[inline] pub fn is_signed_integer(&self) -> bool { use DataType::*; matches!(self, Int8 | Int16 | Int32 | Int64) } /// Returns true if this type is unsigned integer: (UInt*). + #[inline] pub fn is_unsigned_integer(&self) -> bool { use DataType::*; matches!(self, UInt8 | UInt16 | UInt32 | UInt64) @@ -387,6 +391,7 @@ impl DataType { /// Returns true if this type is nested (List, FixedSizeList, LargeList, Struct, Union, /// or Map), or a dictionary of a nested type + #[inline] pub fn is_nested(&self) -> bool { use DataType::*; match self { @@ -398,6 +403,13 @@ impl DataType { } } + /// Returns true if this type is DataType::Null. + #[inline] + pub fn is_null(&self) -> bool { + use DataType::*; + matches!(self, Null) + } + /// Compares the datatype with another, ignoring nested field names /// and metadata. 
pub fn equals_datatype(&self, other: &DataType) -> bool { @@ -855,6 +867,12 @@ mod tests { assert!(!DataType::is_floating(&DataType::Int32)); } + #[test] + fn test_datatype_is_null() { + assert!(DataType::is_null(&DataType::Null)); + assert!(!DataType::is_null(&DataType::Int32)); + } + #[test] fn size_should_not_regress() { assert_eq!(std::mem::size_of::(), 24); From 6b905fe2324b61f54d3a5cd5d01ab69478593dc9 Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Tue, 5 Dec 2023 15:36:00 +0530 Subject: [PATCH 1382/1411] Removing ahash (#5156) --- arrow/Cargo.toml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 37f03a05b3fa..6ca218f5f658 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -38,12 +38,6 @@ name = "arrow" path = "src/lib.rs" bench = false -[target.'cfg(target_arch = "wasm32")'.dependencies] -ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } - -[target.'cfg(not(target_arch = "wasm32"))'.dependencies] -ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } - [dependencies] arrow-arith = { workspace = true } arrow-array = { workspace = true } From b8d3f3380c853a5f4b9c47410f6b30ed7aa28737 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 5 Dec 2023 23:08:46 +1100 Subject: [PATCH 1383/1411] Parquet: don't truncate f16/decimal min/max stats (#5154) * Parquet: don't truncate f16/decimal min/max stats * Fix --- parquet/src/column/writer/mod.rs | 153 +++++++++++++++++++++++++------ 1 file changed, 124 insertions(+), 29 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index e92a502689a3..5dd7747c6fc2 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -680,32 +680,28 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } self.last_non_null_data_page_min_max = Some((new_min.clone(), new_max.clone())); - // We only truncate if the data is represented as binary - match self.descr.physical_type() { - Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => { - self.column_index_builder.append( - null_page, - self.truncate_min_value( - self.props.column_index_truncate_length(), - stat.min_bytes(), - ) - .0, - self.truncate_max_value( - self.props.column_index_truncate_length(), - stat.max_bytes(), - ) - .0, - self.page_metrics.num_page_nulls as i64, - ); - } - _ => { - self.column_index_builder.append( - null_page, - stat.min_bytes().to_vec(), - stat.max_bytes().to_vec(), - self.page_metrics.num_page_nulls as i64, - ); - } + if self.can_truncate_value() { + self.column_index_builder.append( + null_page, + self.truncate_min_value( + self.props.column_index_truncate_length(), + stat.min_bytes(), + ) + .0, + self.truncate_max_value( + self.props.column_index_truncate_length(), + stat.max_bytes(), + ) + .0, + self.page_metrics.num_page_nulls as i64, + ); + } else { + self.column_index_builder.append( + null_page, + stat.min_bytes().to_vec(), + stat.max_bytes().to_vec(), + self.page_metrics.num_page_nulls as i64, + ); } } } @@ -715,6 +711,26 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .append_row_count(self.page_metrics.num_buffered_rows as i64); } + /// Determine if we should allow truncating min/max values for this column's statistics + fn can_truncate_value(&self) -> bool { + match self.descr.physical_type() { + // Don't truncate for Float16 and Decimal because their sort order is different + // from that of FIXED_LEN_BYTE_ARRAY sort order. 
+ // So truncation of those types could lead to inaccurate min/max statistics + Type::FIXED_LEN_BYTE_ARRAY + if !matches!( + self.descr.logical_type(), + Some(LogicalType::Decimal { .. }) | Some(LogicalType::Float16) + ) => + { + true + } + Type::BYTE_ARRAY => true, + // Truncation only applies for fba/binary physical types + _ => false, + } + } + fn truncate_min_value(&self, truncation_length: Option, data: &[u8]) -> (Vec, bool) { truncation_length .filter(|l| data.len() > *l) @@ -948,7 +964,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { .with_min_is_exact(!did_truncate_min), ) } - Statistics::FixedLenByteArray(stats) if stats.has_min_max_set() => { + Statistics::FixedLenByteArray(stats) + if (stats.has_min_max_set() && self.can_truncate_value()) => + { let (min, did_truncate_min) = self.truncate_min_value( self.props.statistics_truncate_length(), stats.min_bytes(), @@ -2713,6 +2731,82 @@ mod tests { } } + #[test] + fn test_float16_min_max_no_truncation() { + // Even if we set truncation to occur at 1 byte, we should not truncate for Float16 + let builder = WriterProperties::builder().set_column_index_truncate_length(Some(1)); + let props = Arc::new(builder.build()); + let page_writer = get_test_page_writer(); + let mut writer = get_test_float16_column_writer(page_writer, props); + + let expected_value = f16::PI.to_le_bytes().to_vec(); + let data = vec![ByteArray::from(expected_value.clone()).into()]; + writer.write_batch(&data, None, None).unwrap(); + writer.flush_data_pages().unwrap(); + + let r = writer.close().unwrap(); + + // stats should still be written + // ensure bytes weren't truncated for column index + let column_index = r.column_index.unwrap(); + let column_index_min_bytes = column_index.min_values[0].as_slice(); + let column_index_max_bytes = column_index.max_values[0].as_slice(); + assert_eq!(expected_value, column_index_min_bytes); + assert_eq!(expected_value, column_index_max_bytes); + + // ensure bytes weren't truncated for statistics + let stats = r.metadata.statistics().unwrap(); + assert!(stats.has_min_max_set()); + if let Statistics::FixedLenByteArray(stats) = stats { + let stats_min_bytes = stats.min_bytes(); + let stats_max_bytes = stats.max_bytes(); + assert_eq!(expected_value, stats_min_bytes); + assert_eq!(expected_value, stats_max_bytes); + } else { + panic!("expecting Statistics::FixedLenByteArray"); + } + } + + #[test] + fn test_decimal_min_max_no_truncation() { + // Even if we set truncation to occur at 1 byte, we should not truncate for Decimal + let builder = WriterProperties::builder().set_column_index_truncate_length(Some(1)); + let props = Arc::new(builder.build()); + let page_writer = get_test_page_writer(); + let mut writer = + get_test_decimals_column_writer::(page_writer, 0, 0, props); + + let expected_value = vec![ + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 179u8, 172u8, 19u8, 35u8, + 231u8, 90u8, 0u8, 0u8, + ]; + let data = vec![ByteArray::from(expected_value.clone()).into()]; + writer.write_batch(&data, None, None).unwrap(); + writer.flush_data_pages().unwrap(); + + let r = writer.close().unwrap(); + + // stats should still be written + // ensure bytes weren't truncated for column index + let column_index = r.column_index.unwrap(); + let column_index_min_bytes = column_index.min_values[0].as_slice(); + let column_index_max_bytes = column_index.max_values[0].as_slice(); + assert_eq!(expected_value, column_index_min_bytes); + assert_eq!(expected_value, column_index_max_bytes); + + // ensure bytes weren't truncated 
for statistics + let stats = r.metadata.statistics().unwrap(); + assert!(stats.has_min_max_set()); + if let Statistics::FixedLenByteArray(stats) = stats { + let stats_min_bytes = stats.min_bytes(); + let stats_max_bytes = stats.max_bytes(); + assert_eq!(expected_value, stats_min_bytes); + assert_eq!(expected_value, stats_max_bytes); + } else { + panic!("expecting Statistics::FixedLenByteArray"); + } + } + #[test] fn test_statistics_truncating_byte_array() { let page_writer = get_test_page_writer(); @@ -3421,7 +3515,7 @@ mod tests { values: &[FixedLenByteArray], ) -> ValueStatistics { let page_writer = get_test_page_writer(); - let mut writer = get_test_float16_column_writer(page_writer); + let mut writer = get_test_float16_column_writer(page_writer, Default::default()); writer.write_batch(values, None, None).unwrap(); let metadata = writer.close().unwrap().metadata; @@ -3434,9 +3528,10 @@ mod tests { fn get_test_float16_column_writer( page_writer: Box, + props: WriterPropertiesPtr, ) -> ColumnWriterImpl<'static, FixedLenByteArrayType> { let descr = Arc::new(get_test_float16_column_descr(0, 0)); - let column_writer = get_column_writer(descr, Default::default(), page_writer); + let column_writer = get_column_writer(descr, props, page_writer); get_typed_column_writer::(column_writer) } From f16d2f516de9f75b1792161d962c2fbae926c074 Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 5 Dec 2023 23:26:26 +1100 Subject: [PATCH 1384/1411] Parquet: write column_orders in FileMetaData (#5158) --- parquet/src/file/writer.rs | 91 +++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 2b9f261d9f42..f0b75f302552 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -323,6 +323,19 @@ impl SerializedFileWriter { None => Some(self.kv_metadatas.clone()), }; + // We only include ColumnOrder for leaf nodes. + // Currently only supported ColumnOrder is TypeDefinedOrder so we set this + // for all leaf nodes. + // Even if the column has an undefined sort order, such as INTERVAL, this + // is still technically the defined TYPEORDER so it should still be set. + let column_orders = (0..self.schema_descr().num_columns()) + .map(|_| parquet::ColumnOrder::TYPEORDER(parquet::TypeDefinedOrder {})) + .collect(); + // This field is optional, perhaps in cases where no min/max fields are set + // in any Statistics or ColumnIndex object in the whole file. + // But for simplicity we always set this field. 
+ let column_orders = Some(column_orders); + let file_metadata = parquet::FileMetaData { num_rows, row_groups, @@ -330,7 +343,7 @@ impl SerializedFileWriter { version: self.props.writer_version().as_num(), schema: types::to_thrift(self.schema.as_ref())?, created_by: Some(self.props.created_by().to_owned()), - column_orders: None, + column_orders, encryption_algorithm: None, footer_signing_key_metadata: None, }; @@ -738,7 +751,9 @@ mod tests { use bytes::Bytes; use std::fs::File; - use crate::basic::{Compression, Encoding, LogicalType, Repetition, Type}; + use crate::basic::{ + ColumnOrder, Compression, ConvertedType, Encoding, LogicalType, Repetition, SortOrder, Type, + }; use crate::column::page::{Page, PageReader}; use crate::column::reader::get_typed_column_reader; use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; @@ -851,6 +866,78 @@ mod tests { assert_eq!(reader.get_row_iter(None).unwrap().count(), 0); } + #[test] + fn test_file_writer_column_orders_populated() { + let file = tempfile::tempfile().unwrap(); + + let schema = Arc::new( + types::Type::group_type_builder("schema") + .with_fields(vec![ + Arc::new( + types::Type::primitive_type_builder("col1", Type::INT32) + .build() + .unwrap(), + ), + Arc::new( + types::Type::primitive_type_builder("col2", Type::FIXED_LEN_BYTE_ARRAY) + .with_converted_type(ConvertedType::INTERVAL) + .with_length(12) + .build() + .unwrap(), + ), + Arc::new( + types::Type::group_type_builder("nested") + .with_repetition(Repetition::REQUIRED) + .with_fields(vec![ + Arc::new( + types::Type::primitive_type_builder( + "col3", + Type::FIXED_LEN_BYTE_ARRAY, + ) + .with_logical_type(Some(LogicalType::Float16)) + .with_length(2) + .build() + .unwrap(), + ), + Arc::new( + types::Type::primitive_type_builder("col4", Type::BYTE_ARRAY) + .with_logical_type(Some(LogicalType::String)) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(), + ); + + let props = Default::default(); + let writer = SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); + writer.close().unwrap(); + + let reader = SerializedFileReader::new(file).unwrap(); + + // only leaves + let expected = vec![ + // INT32 + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED), + // INTERVAL + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED), + // Float16 + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED), + // String + ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED), + ]; + let actual = reader.metadata().file_metadata().column_orders(); + + assert!(actual.is_some()); + let actual = actual.unwrap(); + assert_eq!(*actual, expected); + } + #[test] fn test_file_writer_with_metadata() { let file = tempfile::tempfile().unwrap(); From a36bf7ade4091f90ac5fad30716444e09c56051d Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Tue, 5 Dec 2023 23:27:28 +1100 Subject: [PATCH 1385/1411] Parquet: clear metadata and project fields of ParquetRecordBatchStream::schema (#5135) * Parquet: clear metadata of ParquetRecordBatchStream::schema * Revert "Parquet: clear metadata of ParquetRecordBatchStream::schema" This reverts commit 84be336393018be53c3f0cd52155d717898ea3c7. * Document expected behaviour * Revert "Document expected behaviour" This reverts commit ef9601e84a9494145e315d222dcf4a66b22dbbef. * Reapply "Parquet: clear metadata of ParquetRecordBatchStream::schema" This reverts commit fd662ad84b60275e329e23617e8f3e81796bfa3e. 
* ParquetRecordBatchStream should strip schema metadata and respect projection * Fix projection of nested fields --- parquet/src/arrow/arrow_reader/mod.rs | 4 + parquet/src/arrow/async_reader/mod.rs | 136 +++++++++++++++++++++++++- 2 files changed, 135 insertions(+), 5 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index b9e9d2898459..77de83994078 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -575,6 +575,10 @@ impl Iterator for ParquetRecordBatchReader { } impl RecordBatchReader for ParquetRecordBatchReader { + /// Returns the projected [`SchemaRef`] for reading the parquet file. + /// + /// Note that the schema metadata will be stripped here. See + /// [`ParquetRecordBatchReaderBuilder::schema`] if the metadata is desired. fn schema(&self) -> SchemaRef { self.schema.clone() } diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 04383bb51bda..80a554026d9a 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -90,7 +90,7 @@ use futures::stream::Stream; use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; use arrow_array::RecordBatch; -use arrow_schema::SchemaRef; +use arrow_schema::{DataType, Fields, Schema, SchemaRef}; use crate::arrow::array_reader::{build_array_reader, RowGroups}; use crate::arrow::arrow_reader::{ @@ -385,13 +385,24 @@ impl ParquetRecordBatchStreamBuilder { offset: self.offset, }; + // Ensure schema of ParquetRecordBatchStream respects projection, and does + // not store metadata (same as for ParquetRecordBatchReader and emitted RecordBatches) + let projected_fields = match reader.fields.as_deref().map(|pf| &pf.arrow_type) { + Some(DataType::Struct(fields)) => { + fields.filter_leaves(|idx, _| self.projection.leaf_included(idx)) + } + None => Fields::empty(), + _ => unreachable!("Must be Struct for root type"), + }; + let schema = Arc::new(Schema::new(projected_fields)); + Ok(ParquetRecordBatchStream { metadata: self.metadata, batch_size, row_groups, projection: self.projection, selection: self.selection, - schema: self.schema, + schema, reader: Some(reader), state: StreamState::Init, }) @@ -572,7 +583,10 @@ impl std::fmt::Debug for ParquetRecordBatchStream { } impl ParquetRecordBatchStream { - /// Returns the [`SchemaRef`] for this parquet file + /// Returns the projected [`SchemaRef`] for reading the parquet file. + /// + /// Note that the schema metadata will be stripped here. See + /// [`ParquetRecordBatchStreamBuilder::schema`] if the metadata is desired. 
pub fn schema(&self) -> &SchemaRef { &self.schema } @@ -855,11 +869,15 @@ mod tests { use arrow_array::builder::{ListBuilder, StringBuilder}; use arrow_array::cast::AsArray; use arrow_array::types::Int32Type; - use arrow_array::{Array, ArrayRef, Int32Array, Int8Array, Scalar, StringArray, UInt64Array}; + use arrow_array::{ + Array, ArrayRef, Int32Array, Int8Array, RecordBatchReader, Scalar, StringArray, + StructArray, UInt64Array, + }; use arrow_schema::{DataType, Field, Schema}; use futures::{StreamExt, TryStreamExt}; use rand::{thread_rng, Rng}; - use std::sync::Mutex; + use std::collections::HashMap; + use std::sync::{Arc, Mutex}; use tempfile::tempfile; #[derive(Clone)] @@ -1584,6 +1602,114 @@ mod tests { test_get_row_group_column_bloom_filter(data, false).await; } + #[tokio::test] + async fn test_parquet_record_batch_stream_schema() { + fn get_all_field_names(schema: &Schema) -> Vec<&String> { + schema.all_fields().iter().map(|f| f.name()).collect() + } + + // ParquetRecordBatchReaderBuilder::schema differs from + // ParquetRecordBatchReader::schema and RecordBatch::schema in the returned + // schema contents (in terms of custom metadata attached to schema, and fields + // returned). Test to ensure this remains consistent behaviour. + // + // Ensure same for asynchronous versions of the above. + + // Prep data, for a schema with nested fields, with custom metadata + let mut metadata = HashMap::with_capacity(1); + metadata.insert("key".to_string(), "value".to_string()); + + let nested_struct_array = StructArray::from(vec![ + ( + Arc::new(Field::new("d", DataType::Utf8, true)), + Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef, + ), + ( + Arc::new(Field::new("e", DataType::Utf8, true)), + Arc::new(StringArray::from(vec!["c", "d"])) as ArrayRef, + ), + ]); + let struct_array = StructArray::from(vec![ + ( + Arc::new(Field::new("a", DataType::Int32, true)), + Arc::new(Int32Array::from(vec![-1, 1])) as ArrayRef, + ), + ( + Arc::new(Field::new("b", DataType::UInt64, true)), + Arc::new(UInt64Array::from(vec![1, 2])) as ArrayRef, + ), + ( + Arc::new(Field::new( + "c", + nested_struct_array.data_type().clone(), + true, + )), + Arc::new(nested_struct_array) as ArrayRef, + ), + ]); + + let schema = + Arc::new(Schema::new(struct_array.fields().clone()).with_metadata(metadata.clone())); + let record_batch = RecordBatch::from(struct_array) + .with_schema(schema.clone()) + .unwrap(); + + // Write parquet with custom metadata in schema + let mut file = tempfile().unwrap(); + let mut writer = ArrowWriter::try_new(&mut file, schema.clone(), None).unwrap(); + writer.write(&record_batch).unwrap(); + writer.close().unwrap(); + + let all_fields = ["a", "b", "c", "d", "e"]; + // (leaf indices in mask, expected names in output schema all fields) + let projections = [ + (vec![], vec![]), + (vec![0], vec!["a"]), + (vec![0, 1], vec!["a", "b"]), + (vec![0, 1, 2], vec!["a", "b", "c", "d"]), + (vec![0, 1, 2, 3], vec!["a", "b", "c", "d", "e"]), + ]; + + // Ensure we're consistent for each of these projections + for (indices, expected_projected_names) in projections { + let assert_schemas = |builder: SchemaRef, reader: SchemaRef, batch: SchemaRef| { + // Builder schema should preserve all fields and metadata + assert_eq!(get_all_field_names(&builder), all_fields); + assert_eq!(builder.metadata, metadata); + // Reader & batch schema should show only projected fields, and no metadata + assert_eq!(get_all_field_names(&reader), expected_projected_names); + assert_eq!(reader.metadata, HashMap::default()); + 
assert_eq!(get_all_field_names(&batch), expected_projected_names); + assert_eq!(batch.metadata, HashMap::default()); + }; + + let builder = + ParquetRecordBatchReaderBuilder::try_new(file.try_clone().unwrap()).unwrap(); + let sync_builder_schema = builder.schema().clone(); + let mask = ProjectionMask::leaves(builder.parquet_schema(), indices.clone()); + let mut reader = builder.with_projection(mask).build().unwrap(); + let sync_reader_schema = reader.schema(); + let batch = reader.next().unwrap().unwrap(); + let sync_batch_schema = batch.schema(); + assert_schemas(sync_builder_schema, sync_reader_schema, sync_batch_schema); + + // asynchronous should be same + let file = tokio::fs::File::from(file.try_clone().unwrap()); + let builder = ParquetRecordBatchStreamBuilder::new(file).await.unwrap(); + let async_builder_schema = builder.schema().clone(); + let mask = ProjectionMask::leaves(builder.parquet_schema(), indices); + let mut reader = builder.with_projection(mask).build().unwrap(); + let async_reader_schema = reader.schema().clone(); + let batch = reader.next().await.unwrap().unwrap(); + let async_batch_schema = batch.schema(); + assert_schemas( + async_builder_schema, + async_reader_schema, + async_batch_schema, + ); + } + } + #[tokio::test] async fn test_get_row_group_column_bloom_filter_with_length() { // convert to new parquet file with bloom_filter_length From cb338c94a9e5226903796848cc1a370df1e8a842 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 5 Dec 2023 12:54:05 +0000 Subject: [PATCH 1386/1411] Update rustls-pemfile requirement from 1.0 to 2.0 in /object_store (#5155) * Update rustls-pemfile requirement from 1.0 to 2.0 in /object_store Updates the requirements on [rustls-pemfile](https://github.com/rustls/pemfile) to permit the latest version. - [Release notes](https://github.com/rustls/pemfile/releases) - [Commits](https://github.com/rustls/pemfile/compare/v/1.0.0...v/2.0.0) --- updated-dependencies: - dependency-name: rustls-pemfile dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * Update --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies --- object_store/Cargo.toml | 2 +- object_store/src/gcp/credential.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index d5cf91c3324f..e7f99e529e07 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -52,7 +52,7 @@ serde_json = { version = "1.0", default-features = false, optional = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } reqwest = { version = "0.11", default-features = false, features = ["rustls-tls-native-roots"], optional = true } ring = { version = "0.17", default-features = false, features = ["std"], optional = true } -rustls-pemfile = { version = "1.0", default-features = false, optional = true } +rustls-pemfile = { version = "2.0", default-features = false, features = ["std"], optional = true } tokio = { version = "1.25.0", features = ["sync", "macros", "rt", "time", "io-util"] } [target.'cfg(target_family="unix")'.dev-dependencies] diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index 29c7b4563ad5..dc504da05723 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -304,8 +304,8 @@ fn decode_first_rsa_key(private_key_pem: String) -> Result { // Reading from string is infallible match rustls_pemfile::read_one(&mut reader).unwrap() { - Some(Item::PKCS8Key(key)) => Ok(RsaKeyPair::from_pkcs8(&key)?), - Some(Item::RSAKey(key)) => Ok(RsaKeyPair::from_der(&key)?), + Some(Item::Pkcs8Key(key)) => Ok(RsaKeyPair::from_pkcs8(key.secret_pkcs8_der())?), + Some(Item::Pkcs1Key(key)) => Ok(RsaKeyPair::from_der(key.secret_pkcs1_der())?), _ => Err(Error::MissingKey), } } From f352cdc5324b1cace9b7c0f2bb88b1866e366dbb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 5 Dec 2023 17:59:47 +0000 Subject: [PATCH 1387/1411] Bump actions/labeler from 4.3.0 to 5.0.0 (#5167) Bumps [actions/labeler](https://github.com/actions/labeler) from 4.3.0 to 5.0.0. - [Release notes](https://github.com/actions/labeler/releases) - [Commits](https://github.com/actions/labeler/compare/v4.3.0...v5.0.0) --- updated-dependencies: - dependency-name: actions/labeler dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dev_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 5f3d9e54c8db..0d60ae006796 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -44,7 +44,7 @@ jobs: github.event_name == 'pull_request_target' && (github.event.action == 'opened' || github.event.action == 'synchronize') - uses: actions/labeler@v4.3.0 + uses: actions/labeler@v5.0.0 with: repo-token: ${{ secrets.GITHUB_TOKEN }} configuration-path: .github/workflows/dev_pr/labeler.yml From 46bbd7debe23f7974bec256223633e806ec7cf06 Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Tue, 5 Dec 2023 23:30:26 +0530 Subject: [PATCH 1388/1411] Removing redundant as casts (#5168) --- parquet/src/column/reader.rs | 5 +---- parquet/src/encodings/rle.rs | 6 +++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index adfcd6390720..6c712ead625c 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -607,10 +607,7 @@ fn parse_v1_level( } Encoding::BIT_PACKED => { let bit_width = num_required_bits(max_level as u64); - let num_bytes = ceil( - (num_buffered_values as usize * bit_width as usize) as i64, - 8, - ) as usize; + let num_bytes = ceil(num_buffered_values as usize * bit_width as usize, 8); Ok((num_bytes, buf.slice(..num_bytes))) } _ => Err(general_err!("invalid level encoding: {}", encoding)), diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 5807f6b9c527..5d91c1e53d0f 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -230,7 +230,7 @@ impl RleEncoder { self.bit_writer.put_vlq_int(indicator_value as u64); self.bit_writer.put_aligned( self.current_value, - bit_util::ceil(self.bit_width as i64, 8) as usize, + bit_util::ceil(self.bit_width as usize, 8), ); self.num_buffered_values = 0; self.repeat_count = 0; @@ -524,8 +524,8 @@ impl RleDecoder { self.bit_packed_left = ((indicator_value >> 1) * 8) as u32; } else { self.rle_left = (indicator_value >> 1) as u32; - let value_width = bit_util::ceil(self.bit_width as i64, 8); - self.current_value = bit_reader.get_aligned::(value_width as usize); + let value_width = bit_util::ceil(self.bit_width as usize, 8); + self.current_value = bit_reader.get_aligned::(value_width); assert!(self.current_value.is_some()); } true From f4bad6871f9f64f2b7c0367ae198aff32c3c61b1 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 5 Dec 2023 16:07:12 -0800 Subject: [PATCH 1389/1411] fix: ensure take_fixed_size_list can handle null indices (#5170) * fix: ensure take_fixed_size_list can handle null indices * chore: apply clippy suggestion * Apply suggestions from code review Co-authored-by: Will Jones * Applying suggetions from review * Using a builder, per review suggestion * Apply suggestions from code review Co-authored-by: Will Jones * Cast length to usize to avoid compile error --------- Co-authored-by: Will Jones --- arrow-select/src/take.rs | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index d47b884ae38d..44269e38758e 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -19,7 +19,7 @@ use std::sync::Arc; -use arrow_array::builder::BufferBuilder; +use arrow_array::builder::{BufferBuilder, 
UInt32Builder}; use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::*; @@ -689,7 +689,7 @@ fn take_value_indices_from_fixed_size_list( where IndexType: ArrowPrimitiveType, { - let mut values = vec![]; + let mut values = UInt32Builder::with_capacity(length as usize * indices.len()); for i in 0..indices.len() { if indices.is_valid(i) { @@ -699,11 +699,16 @@ where .ok_or_else(|| ArrowError::ComputeError("Cast to usize failed".to_string()))?; let start = list.value_offset(index) as ::Native; - values.extend(start..start + length); + // Safety: Range always has known length. + unsafe { + values.append_trusted_len_iter(start..start + length); + } + } else { + values.append_nulls(length as usize); } } - Ok(PrimitiveArray::::from(values)) + Ok(values.finish()) } /// To avoid generating take implementations for every index type, instead we @@ -1985,6 +1990,23 @@ mod tests { assert_eq!(&values, &[Some(23), Some(4), None, None]) } + #[test] + fn test_take_fixed_size_list_null_indices() { + let indices = Int32Array::from_iter([Some(0), None]); + let values = Arc::new(Int32Array::from(vec![0, 1, 2, 3])); + let arr_field = Arc::new(Field::new("item", values.data_type().clone(), true)); + let values = FixedSizeListArray::try_new(arr_field, 2, values, None).unwrap(); + + let r = take(&values, &indices, None).unwrap(); + let values = r + .as_fixed_size_list() + .values() + .as_primitive::() + .into_iter() + .collect::>(); + assert_eq!(values, &[Some(0), Some(1), None, None]) + } + #[test] fn test_take_bytes_null_indices() { let indices = Int32Array::new( From ea3aca1d6f618a94426fec31f9d8b55b0aaf274d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 6 Dec 2023 00:42:22 -0800 Subject: [PATCH 1390/1411] fix: Changed labeler.yml to latest format (#5172) --- .github/workflows/dev_pr/labeler.yml | 32 +++++++++------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index ea5873081f18..c4d47213cd3d 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -16,33 +16,21 @@ # under the License. 
arrow: - - arrow-arith/**/* - - arrow-array/**/* - - arrow-buffer/**/* - - arrow-cast/**/* - - arrow-csv/**/* - - arrow-data/**/* - - arrow-flight/**/* - - arrow-integration-test/**/* - - arrow-integration-testing/**/* - - arrow-ipc/**/* - - arrow-json/**/* - - arrow-avro/**/* - - arrow-ord/**/* - - arrow-row/**/* - - arrow-schema/**/* - - arrow-select/**/* - - arrow-string/**/* - - arrow/**/* +- changed-files: + - any-glob-to-any-file: ['arrow-arith/**/*', 'arrow-array/**/*', 'arrow-buffer/**/*', 'arrow-cast/**/*', 'arrow-csv/**/*', 'arrow-data/**/*', 'arrow-flight/**/*', 'arrow-integration-test/**/*', 'arrow-integration-testing/**/*', 'arrow-ipc/**/*', 'arrow-json/**/*', 'arrow-avro/**/*', 'arrow-ord/**/*', 'arrow-row/**/*', 'arrow-schema/**/*', 'arrow-select/**/*', 'arrow-string/**/*', 'arrow/**/*'] arrow-flight: - - arrow-flight/**/* +- changed-files: + - any-glob-to-any-file: ['arrow-flight/**/*'] parquet: - - parquet/**/* +- changed-files: + - any-glob-to-any-file: ['parquet/**/*'] parquet-derive: - - parquet_derive/**/* +- changed-files: + - any-glob-to-any-file: ['parquet_derive/**/*'] object-store: - - object_store/**/* +- changed-files: + - any-glob-to-any-file: ['object_store/**/*'] From 32b68f7207908e3d874a1f993b796977855f1a5f Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 6 Dec 2023 00:44:02 -0800 Subject: [PATCH 1391/1411] Add LargeListBuilder to make_builder (#5171) --- .../src/builder/generic_list_builder.rs | 91 +++++++++++-------- arrow-array/src/builder/struct_builder.rs | 4 + 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 21eaadd5208a..116e2553cfb7 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -584,14 +584,31 @@ mod tests { &DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), 10, ); - let mut builder = ListBuilder::new(values_builder); + test_boxed_generic_list_generic_list_array_builder::(values_builder); + } + + #[test] + fn test_boxed_large_list_large_list_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. 
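// make_builder(&DataType::LargeList(..)) now hands back a boxed LargeListBuilder
// (i.e. GenericListBuilder<i64, Box<dyn ArrayBuilder>>), so the generic helper below
// can drive both the List and LargeList cases through the same sequence of downcasts,
// differing only in the offset type O.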
+ let values_builder = make_builder( + &DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))), + 10, + ); + test_boxed_generic_list_generic_list_array_builder::(values_builder); + } + + fn test_boxed_generic_list_generic_list_array_builder( + values_builder: Box, + ) { + let mut builder: GenericListBuilder> = + GenericListBuilder::>::new(values_builder); // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .values() .as_any_mut() .downcast_mut::() @@ -600,8 +617,8 @@ mod tests { builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .values() .as_any_mut() .downcast_mut::() @@ -610,14 +627,14 @@ mod tests { builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .append(true); builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .values() .as_any_mut() .downcast_mut::() @@ -626,8 +643,8 @@ mod tests { builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .values() .as_any_mut() .downcast_mut::() @@ -636,16 +653,16 @@ mod tests { builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .append(true); builder.append(true); builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .values() .as_any_mut() .downcast_mut::() @@ -654,8 +671,8 @@ mod tests { builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .values() .as_any_mut() .downcast_mut::() @@ -664,30 +681,30 @@ mod tests { builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .values() .as_any_mut() .downcast_mut::() - .expect("should be an Int32Builder") + .expect("should be an (Large)ListBuilder") .append_value(7); builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .append(true); builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .append(false); builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .values() .as_any_mut() .downcast_mut::() @@ -696,8 +713,8 @@ mod tests { builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .append(true); builder.append(true); @@ -706,8 +723,8 @@ mod tests { builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .values() .as_any_mut() .downcast_mut::() @@ -716,8 +733,8 @@ mod 
tests { builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .values() .as_any_mut() .downcast_mut::() @@ -726,8 +743,8 @@ mod tests { builder .values() .as_any_mut() - .downcast_mut::>>() - .expect("should be an ListBuilder") + .downcast_mut::>>() + .expect("should be an (Large)ListBuilder") .append(true); builder.append(true); @@ -736,12 +753,12 @@ mod tests { assert_eq!(4, l1.len()); assert_eq!(1, l1.null_count()); - assert_eq!(l1.value_offsets(), &[0, 2, 5, 5, 6]); - let l2 = l1.values().as_list::(); + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5, 6].map(O::usize_as)); + let l2 = l1.values().as_list::(); assert_eq!(6, l2.len()); assert_eq!(1, l2.null_count()); - assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8, 10]); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8, 10].map(O::usize_as)); let i1 = l2.values().as_primitive::(); assert_eq!(10, i1.len()); diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 06b8385b3164..960949a2f09f 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -173,6 +173,10 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { + let builder = make_builder(field.data_type(), capacity); + Box::new(LargeListBuilder::with_capacity(builder, capacity)) + } DataType::Struct(fields) => Box::new(StructBuilder::from_fields(fields.clone(), capacity)), t => panic!("Data type {t:?} is not currently supported"), } From 95f03ac945228d4d11c8f007375fff3190e12bb6 Mon Sep 17 00:00:00 2001 From: "yujie.zhang (he/him)" Date: Wed, 6 Dec 2023 19:18:54 +0800 Subject: [PATCH 1392/1411] improve: make RunArray displayable (#5166) --- arrow-cast/src/display.rs | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 28c29c94bbdb..edf7c9394c88 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -301,6 +301,10 @@ fn make_formatter<'a>( DataType::Struct(_) => array_format(as_struct_array(array), options), DataType::Map(_, _) => array_format(as_map_array(array), options), DataType::Union(_, _) => array_format(as_union_array(array), options), + DataType::RunEndEncoded(_, _) => downcast_run_array! { + array => array_format(array, options), + _ => unreachable!() + }, d => Err(ArrowError::NotYetImplemented(format!("formatting {d} is not yet supported"))), } } @@ -748,6 +752,19 @@ impl<'a, K: ArrowDictionaryKeyType> DisplayIndexState<'a> for &'a DictionaryArra } } +impl<'a, K: RunEndIndexType> DisplayIndexState<'a> for &'a RunArray { + type State = Box; + + fn prepare(&self, options: &FormatOptions<'a>) -> Result { + make_formatter(self.values().as_ref(), options) + } + + fn write(&self, s: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { + let value_idx = self.get_physical_index(idx); + s.as_ref().write(value_idx, f) + } +} + fn write_list( f: &mut dyn Write, mut range: Range, @@ -935,6 +952,8 @@ pub fn lexical_to_string(n: N) -> String { #[cfg(test)] mod tests { + use arrow_array::builder::StringRunBuilder; + use super::*; /// Test to verify options can be constant. 
See #4580 @@ -1079,4 +1098,21 @@ mod tests { let formatted = format_array(&array, &options); assert_eq!(formatted, &["NULL".to_string(), "NULL".to_string()]) } + + #[test] + fn test_string_run_arry_to_string() { + let mut builder = StringRunBuilder::::new(); + + builder.append_value("input_value"); + builder.append_value("input_value"); + builder.append_value("input_value"); + builder.append_value("input_value1"); + + let map_array = builder.finish(); + assert_eq!("input_value", array_value_to_string(&map_array, 1).unwrap()); + assert_eq!( + "input_value1", + array_value_to_string(&map_array, 3).unwrap() + ); + } } From 298ddfdbc4d138f3261d486e1254c3f61c4a209d Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:19:04 +1100 Subject: [PATCH 1393/1411] ci: Add cargo audit CI action (#5160) * Add cargo audit CI action * Update Cargo.toml descriptions * Don't use existing audit action * Test known bad dependency * Test known bad dependency * Revert change --- .github/workflows/audit.yml | 43 +++++++++++++++++++++++++++++++++++++ arrow-csv/Cargo.toml | 2 +- arrow-json/Cargo.toml | 2 +- 3 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/audit.yml diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml new file mode 100644 index 000000000000..2c1dcdfd2100 --- /dev/null +++ b/.github/workflows/audit.yml @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
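# cargo-audit checks Cargo.lock against the RustSec advisory database and fails the
# job when a dependency with a known security vulnerability is reported.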
+ +name: audit + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +# trigger for all PRs that touch certain files and changes to master +on: + push: + branches: + - master + pull_request: + paths: + - '**/Cargo.toml' + - '**/Cargo.lock' + +jobs: + cargo-audit: + name: Audit + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install cargo-audit + run: cargo install cargo-audit + - name: Run audit check + run: cargo audit diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 66a6d7dbcaa5..d29c85c56cfd 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-csv" version = { workspace = true } -description = "Support for parsing CSV format into the Arrow format" +description = "Support for parsing CSV format to and from the Arrow format" homepage = { workspace = true } repository = { workspace = true } authors = { workspace = true } diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 7e49a57fbd6c..dd232f197ead 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-json" version = { workspace = true } -description = "Support for parsing JSON format into the Arrow format" +description = "Support for parsing JSON format to and from the Arrow format" homepage = { workspace = true } repository = { workspace = true } authors = { workspace = true } From 2923243d1c59e111957fc780d4c3f441ca10d372 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 6 Dec 2023 11:48:31 +0000 Subject: [PATCH 1394/1411] Labeller attempt 2 (#5174) --- .github/workflows/dev_pr/labeler.yml | 39 +++++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index c4d47213cd3d..cae015018eac 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -16,21 +16,40 @@ # under the License. 
arrow: -- changed-files: - - any-glob-to-any-file: ['arrow-arith/**/*', 'arrow-array/**/*', 'arrow-buffer/**/*', 'arrow-cast/**/*', 'arrow-csv/**/*', 'arrow-data/**/*', 'arrow-flight/**/*', 'arrow-integration-test/**/*', 'arrow-integration-testing/**/*', 'arrow-ipc/**/*', 'arrow-json/**/*', 'arrow-avro/**/*', 'arrow-ord/**/*', 'arrow-row/**/*', 'arrow-schema/**/*', 'arrow-select/**/*', 'arrow-string/**/*', 'arrow/**/*'] + - changed-files: + - any-glob-to-any-file: + - 'arrow-arith/**/*' + - 'arrow-array/**/*' + - 'arrow-buffer/**/*' + - 'arrow-cast/**/*' + - 'arrow-csv/**/*' + - 'arrow-data/**/*' + - 'arrow-flight/**/*' + - 'arrow-integration-test/**/*' + - 'arrow-integration-testing/**/*' + - 'arrow-ipc/**/*' + - 'arrow-json/**/*' + - 'arrow-avro/**/*' + - 'arrow-ord/**/*' + - 'arrow-row/**/*' + - 'arrow-schema/**/*' + - 'arrow-select/**/*' + - 'arrow-string/**/*' + - 'arrow/**/*' arrow-flight: -- changed-files: - - any-glob-to-any-file: ['arrow-flight/**/*'] + - changed-files: + - any-glob-to-any-file: + - 'arrow-flight/**/*' parquet: -- changed-files: - - any-glob-to-any-file: ['parquet/**/*'] + - changed-files: + - any-glob-to-any-file: [ 'parquet/**/*' ] parquet-derive: -- changed-files: - - any-glob-to-any-file: ['parquet_derive/**/*'] + - changed-files: + - any-glob-to-any-file: [ 'parquet_derive/**/*' ] object-store: -- changed-files: - - any-glob-to-any-file: ['object_store/**/*'] + - changed-files: + - any-glob-to-any-file: [ 'object_store/**/*' ] From 1534cc196cb5fca09a093fd7886b6d9bfa9831bd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 6 Dec 2023 14:23:09 +0000 Subject: [PATCH 1395/1411] Bump actions/setup-python from 4 to 5 (#5175) Bumps [actions/setup-python](https://github.com/actions/setup-python) from 4 to 5. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v4...v5) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/dev.yml | 2 +- .github/workflows/integration.yml | 2 +- .github/workflows/parquet.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 1447d72a53b1..2026e257ab29 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -40,7 +40,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.8 - name: Audit licenses diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index c9cb4e31ced9..1604a7be4372 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -129,7 +129,7 @@ jobs: path: /home/runner/target # this key is not equal because maturin uses different compilation flags. 
key: ${{ runner.os }}-${{ matrix.arch }}-target-maturin-cache-${{ matrix.rust }}- - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: '3.8' - name: Upgrade pip and setuptools diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index d664a0dc0730..a4e654892662 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -140,7 +140,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.10" cache: "pip" From 490c080e5ba7a50efc862da9508e6669900549ee Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 7 Dec 2023 15:33:42 +0200 Subject: [PATCH 1396/1411] Parquet: Ensure page statistics are written only when conifgured from the Arrow Writer (#5181) * Issue fix and tests * Cleanup tests --- parquet/src/arrow/arrow_writer/mod.rs | 143 +++++++++++++++++++++++++- parquet/src/column/writer/mod.rs | 29 +++--- 2 files changed, 158 insertions(+), 14 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index ea7b1eee99b8..e6e95d50996a 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -916,8 +916,9 @@ mod tests { use crate::basic::Encoding; use crate::data_type::AsBytes; use crate::file::metadata::ParquetMetaData; + use crate::file::page_index::index::Index; use crate::file::page_index::index_reader::read_pages_locations; - use crate::file::properties::{ReaderProperties, WriterVersion}; + use crate::file::properties::{EnabledStatistics, ReaderProperties, WriterVersion}; use crate::file::serialized_reader::ReadOptionsBuilder; use crate::file::{ reader::{FileReader, SerializedFileReader}, @@ -2738,4 +2739,144 @@ mod tests { assert_eq!(index[0][0].len(), 1); // 1 page assert_eq!(index[0][1].len(), 1); // 1 page } + + #[test] + fn test_disabled_statistics_with_page() { + let file_schema = Schema::new(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, true), + ]); + let file_schema = Arc::new(file_schema); + + let batch = RecordBatch::try_new( + file_schema.clone(), + vec![ + Arc::new(StringArray::from(vec!["a", "b", "c", "d"])) as _, + Arc::new(StringArray::from(vec!["w", "x", "y", "z"])) as _, + ], + ) + .unwrap(); + + let props = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::None) + .set_column_statistics_enabled("a".into(), EnabledStatistics::Page) + .build(); + + let mut buf = Vec::with_capacity(1024); + let mut writer = ArrowWriter::try_new(&mut buf, file_schema.clone(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + + let metadata = writer.close().unwrap(); + assert_eq!(metadata.row_groups.len(), 1); + let row_group = &metadata.row_groups[0]; + assert_eq!(row_group.columns.len(), 2); + // Column "a" has both offset and column index, as requested + assert!(row_group.columns[0].offset_index_offset.is_some()); + assert!(row_group.columns[0].column_index_offset.is_some()); + // Column "b" should only have offset index + assert!(row_group.columns[1].offset_index_offset.is_some()); + assert!(row_group.columns[1].column_index_offset.is_none()); + + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = SerializedFileReader::new_with_options(Bytes::from(buf), options).unwrap(); + + let row_group = reader.get_row_group(0).unwrap(); + let a_col = row_group.metadata().column(0); + let b_col = row_group.metadata().column(1); + + // Column 
chunk of column "a" should have chunk level statistics + if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { + let min = byte_array_stats.min(); + let max = byte_array_stats.max(); + + assert_eq!(min.as_bytes(), &[b'a']); + assert_eq!(max.as_bytes(), &[b'd']); + } else { + panic!("expecting Statistics::ByteArray"); + } + + // The column chunk for column "b" shouldn't have statistics + assert!(b_col.statistics().is_none()); + + let offset_index = reader.metadata().offset_index().unwrap(); + assert_eq!(offset_index.len(), 1); // 1 row group + assert_eq!(offset_index[0].len(), 2); // 2 columns + + let column_index = reader.metadata().column_index().unwrap(); + assert_eq!(column_index.len(), 1); // 1 row group + assert_eq!(column_index[0].len(), 2); // 2 columns + + let a_idx = &column_index[0][0]; + assert!(matches!(a_idx, Index::BYTE_ARRAY(_)), "{a_idx:?}"); + let b_idx = &column_index[0][1]; + assert!(matches!(b_idx, Index::NONE), "{b_idx:?}"); + } + + #[test] + fn test_disabled_statistics_with_chunk() { + let file_schema = Schema::new(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, true), + ]); + let file_schema = Arc::new(file_schema); + + let batch = RecordBatch::try_new( + file_schema.clone(), + vec![ + Arc::new(StringArray::from(vec!["a", "b", "c", "d"])) as _, + Arc::new(StringArray::from(vec!["w", "x", "y", "z"])) as _, + ], + ) + .unwrap(); + + let props = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::None) + .set_column_statistics_enabled("a".into(), EnabledStatistics::Chunk) + .build(); + + let mut buf = Vec::with_capacity(1024); + let mut writer = ArrowWriter::try_new(&mut buf, file_schema.clone(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + + let metadata = writer.close().unwrap(); + assert_eq!(metadata.row_groups.len(), 1); + let row_group = &metadata.row_groups[0]; + assert_eq!(row_group.columns.len(), 2); + // Column "a" should only have offset index + assert!(row_group.columns[0].offset_index_offset.is_some()); + assert!(row_group.columns[0].column_index_offset.is_none()); + // Column "b" should only have offset index + assert!(row_group.columns[1].offset_index_offset.is_some()); + assert!(row_group.columns[1].column_index_offset.is_none()); + + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = SerializedFileReader::new_with_options(Bytes::from(buf), options).unwrap(); + + let row_group = reader.get_row_group(0).unwrap(); + let a_col = row_group.metadata().column(0); + let b_col = row_group.metadata().column(1); + + // Column chunk of column "a" should have chunk level statistics + if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() { + let min = byte_array_stats.min(); + let max = byte_array_stats.max(); + + assert_eq!(min.as_bytes(), &[b'a']); + assert_eq!(max.as_bytes(), &[b'd']); + } else { + panic!("expecting Statistics::ByteArray"); + } + + // The column chunk for column "b" shouldn't have statistics + assert!(b_col.statistics().is_none()); + + let column_index = reader.metadata().column_index().unwrap(); + assert_eq!(column_index.len(), 1); // 1 row group + assert_eq!(column_index[0].len(), 2); // 2 columns + + let a_idx = &column_index[0][0]; + assert!(matches!(a_idx, Index::NONE), "{a_idx:?}"); + let b_idx = &column_index[0][1]; + assert!(matches!(b_idx, Index::NONE), "{b_idx:?}"); + } } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 5dd7747c6fc2..531af4bd461e 100644 --- 
a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -764,19 +764,22 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { self.column_metrics.num_column_nulls += self.page_metrics.num_page_nulls; - let page_statistics = match (values_data.min_value, values_data.max_value) { - (Some(min), Some(max)) => { - update_min(&self.descr, &min, &mut self.column_metrics.min_column_value); - update_max(&self.descr, &max, &mut self.column_metrics.max_column_value); - Some(ValueStatistics::new( - Some(min), - Some(max), - None, - self.page_metrics.num_page_nulls, - false, - )) - } - _ => None, + let page_statistics = if let (Some(min), Some(max)) = + (values_data.min_value, values_data.max_value) + { + // Update chunk level statistics + update_min(&self.descr, &min, &mut self.column_metrics.min_column_value); + update_max(&self.descr, &max, &mut self.column_metrics.max_column_value); + + (self.statistics_enabled == EnabledStatistics::Page).then_some(ValueStatistics::new( + Some(min), + Some(max), + None, + self.page_metrics.num_page_nulls, + false, + )) + } else { + None }; // update column and offset index From b06ab13fa2681624c7d5094004309607b253773b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Thu, 7 Dec 2023 17:10:20 +0100 Subject: [PATCH 1397/1411] Use Total Ordering for Aggregates and Refactor for Better Auto-Vectorization (#5100) * Refactor numeric aggregation kernels to make better use of auto-vectorization. Remove the explicit simd implementations since the autovectorized versions are faster on average. The min/max kernels for floating point numbers now use the total order relation. * Comments and cleanup * Clippy fixes * Use largest/smallest bit patterns for float MIN/MAX constants, these differ from the canonical NAN bit pattern * Add test coverage for aggregating large non-null and float inputs * Add test with negative NaN * Rename MIN/MAX constants to make it explicit they use the total order relation --- arrow-arith/src/aggregate.rs | 803 ++++++++++++++--------------- arrow-array/src/arithmetic.rs | 85 ++- arrow-buffer/src/buffer/boolean.rs | 1 + 3 files changed, 454 insertions(+), 435 deletions(-) diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 0dabaa50f5f6..20ff0711d735 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -20,39 +20,317 @@ use arrow_array::cast::*; use arrow_array::iterator::ArrayIter; use arrow_array::*; -use arrow_buffer::ArrowNativeType; +use arrow_buffer::{ArrowNativeType, NullBuffer}; use arrow_data::bit_iterator::try_for_each_valid_idx; use arrow_schema::ArrowError; use arrow_schema::*; +use std::borrow::BorrowMut; use std::ops::{BitAnd, BitOr, BitXor}; -/// Generic test for NaN, the optimizer should be able to remove this for integer types. -#[inline] -pub(crate) fn is_nan(a: T) -> bool { - #[allow(clippy::eq_op)] - !(a == a) +/// An accumulator for primitive numeric values. +trait NumericAccumulator: Copy + Default { + /// Accumulate a non-null value. + fn accumulate(&mut self, value: T); + /// Accumulate a nullable values. + /// If `valid` is false the `value` should not affect the accumulator state. + fn accumulate_nullable(&mut self, value: T, valid: bool); + /// Merge another accumulator into this accumulator + fn merge(&mut self, other: Self); + /// Return the aggregated value. + fn finish(&mut self) -> T; } -/// Returns the minimum value in the array, according to the natural order. 
-/// For floating point arrays any NaN values are considered to be greater than any other non-null value -#[cfg(not(feature = "simd"))] -pub fn min(array: &PrimitiveArray) -> Option -where - T: ArrowNumericType, - T::Native: ArrowNativeType, -{ - min_max_helper::(array, |a, b| (is_nan(*a) & !is_nan(*b)) || a > b) +/// Helper for branchlessly selecting either `a` or `b` based on the boolean `m`. +/// After verifying the generated assembly this can be a simple `if`. +#[inline(always)] +fn select(m: bool, a: T, b: T) -> T { + if m { + a + } else { + b + } } -/// Returns the maximum value in the array, according to the natural order. -/// For floating point arrays any NaN values are considered to be greater than any other non-null value -#[cfg(not(feature = "simd"))] -pub fn max(array: &PrimitiveArray) -> Option -where - T: ArrowNumericType, - T::Native: ArrowNativeType, -{ - min_max_helper::(array, |a, b| (!is_nan(*a) & is_nan(*b)) || a < b) +#[derive(Clone, Copy)] +struct SumAccumulator { + sum: T, +} + +impl Default for SumAccumulator { + fn default() -> Self { + Self { sum: T::ZERO } + } +} + +impl NumericAccumulator for SumAccumulator { + fn accumulate(&mut self, value: T) { + self.sum = self.sum.add_wrapping(value); + } + + fn accumulate_nullable(&mut self, value: T, valid: bool) { + let sum = self.sum; + self.sum = select(valid, sum.add_wrapping(value), sum) + } + + fn merge(&mut self, other: Self) { + self.sum = self.sum.add_wrapping(other.sum); + } + + fn finish(&mut self) -> T { + self.sum + } +} + +#[derive(Clone, Copy)] +struct MinAccumulator { + min: T, +} + +impl Default for MinAccumulator { + fn default() -> Self { + Self { + min: T::MAX_TOTAL_ORDER, + } + } +} + +impl NumericAccumulator for MinAccumulator { + fn accumulate(&mut self, value: T) { + let min = self.min; + self.min = select(value.is_lt(min), value, min); + } + + fn accumulate_nullable(&mut self, value: T, valid: bool) { + let min = self.min; + let is_lt = valid & value.is_lt(min); + self.min = select(is_lt, value, min); + } + + fn merge(&mut self, other: Self) { + self.accumulate(other.min) + } + + fn finish(&mut self) -> T { + self.min + } +} + +#[derive(Clone, Copy)] +struct MaxAccumulator { + max: T, +} + +impl Default for MaxAccumulator { + fn default() -> Self { + Self { + max: T::MIN_TOTAL_ORDER, + } + } +} + +impl NumericAccumulator for MaxAccumulator { + fn accumulate(&mut self, value: T) { + let max = self.max; + self.max = select(value.is_gt(max), value, max); + } + + fn accumulate_nullable(&mut self, value: T, valid: bool) { + let max = self.max; + let is_gt = value.is_gt(max) & valid; + self.max = select(is_gt, value, max); + } + + fn merge(&mut self, other: Self) { + self.accumulate(other.max) + } + + fn finish(&mut self) -> T { + self.max + } +} + +fn reduce_accumulators, const LANES: usize>( + mut acc: [A; LANES], +) -> A { + assert!(LANES > 0 && LANES.is_power_of_two()); + let mut len = LANES; + + // attempt at tree reduction, unfortunately llvm does not fully recognize this pattern, + // but the generated code is still a little faster than purely sequential reduction for floats. 
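    // For example, with LANES = 4 the loop merges pairwise:
    // [a0, a1, a2, a3] -> [merge(a0, a2), merge(a1, a3)] -> merge(merge(a0, a2), merge(a1, a3)),
    // leaving the combined result in acc[0].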
+ while len >= 2 { + let mid = len / 2; + let (h, t) = acc[..len].split_at_mut(mid); + + for i in 0..mid { + h[i].merge(t[i]); + } + len /= 2; + } + acc[0] +} + +#[inline(always)] +fn aggregate_nonnull_chunk, const LANES: usize>( + acc: &mut [A; LANES], + values: &[T; LANES], +) { + for i in 0..LANES { + acc[i].accumulate(values[i]); + } +} + +#[inline(always)] +fn aggregate_nullable_chunk, const LANES: usize>( + acc: &mut [A; LANES], + values: &[T; LANES], + validity: u64, +) { + let mut bit = 1; + for i in 0..LANES { + acc[i].accumulate_nullable(values[i], (validity & bit) != 0); + bit <<= 1; + } +} + +fn aggregate_nonnull_simple>(values: &[T]) -> T { + return values + .iter() + .copied() + .fold(A::default(), |mut a, b| { + a.accumulate(b); + a + }) + .finish(); +} + +#[inline(never)] +fn aggregate_nonnull_lanes, const LANES: usize>( + values: &[T], +) -> T { + // aggregating into multiple independent accumulators allows the compiler to use vector registers + // with a single accumulator the compiler would not be allowed to reorder floating point addition + let mut acc = [A::default(); LANES]; + let mut chunks = values.chunks_exact(LANES); + chunks.borrow_mut().for_each(|chunk| { + aggregate_nonnull_chunk(&mut acc, chunk[..LANES].try_into().unwrap()); + }); + + let remainder = chunks.remainder(); + for i in 0..remainder.len() { + acc[i].accumulate(remainder[i]); + } + + reduce_accumulators(acc).finish() +} + +#[inline(never)] +fn aggregate_nullable_lanes, const LANES: usize>( + values: &[T], + validity: &NullBuffer, +) -> T { + assert!(LANES > 0 && 64 % LANES == 0); + assert_eq!(values.len(), validity.len()); + + // aggregating into multiple independent accumulators allows the compiler to use vector registers + let mut acc = [A::default(); LANES]; + // we process 64 bits of validity at a time + let mut values_chunks = values.chunks_exact(64); + let validity_chunks = validity.inner().bit_chunks(); + let mut validity_chunks_iter = validity_chunks.iter(); + + values_chunks.borrow_mut().for_each(|chunk| { + // Safety: we asserted that values and validity have the same length and trust the iterator impl + let mut validity = unsafe { validity_chunks_iter.next().unwrap_unchecked() }; + // chunk further based on the number of vector lanes + chunk.chunks_exact(LANES).for_each(|chunk| { + aggregate_nullable_chunk(&mut acc, chunk[..LANES].try_into().unwrap(), validity); + validity >>= LANES; + }); + }); + + let remainder = values_chunks.remainder(); + if !remainder.is_empty() { + let mut validity = validity_chunks.remainder_bits(); + + let mut remainder_chunks = remainder.chunks_exact(LANES); + remainder_chunks.borrow_mut().for_each(|chunk| { + aggregate_nullable_chunk(&mut acc, chunk[..LANES].try_into().unwrap(), validity); + validity >>= LANES; + }); + + let remainder = remainder_chunks.remainder(); + if !remainder.is_empty() { + let mut bit = 1; + for i in 0..remainder.len() { + acc[i].accumulate_nullable(remainder[i], (validity & bit) != 0); + bit <<= 1; + } + } + } + + reduce_accumulators(acc).finish() +} + +/// The preferred vector size in bytes for the target platform. +/// Note that the avx512 target feature is still unstable and this also means it is not detected on stable rust. 
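// 64, 32 and 16 bytes correspond to the 512-bit (AVX-512), 256-bit (AVX) and
// 128-bit (SSE2/NEON) vector register widths respectively.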
+const PREFERRED_VECTOR_SIZE: usize = + if cfg!(all(target_arch = "x86_64", target_feature = "avx512f")) { + 64 + } else if cfg!(all(target_arch = "x86_64", target_feature = "avx")) { + 32 + } else { + 16 + }; + +/// non-nullable aggregation requires fewer temporary registers so we can use more of them for accumulators +const PREFERRED_VECTOR_SIZE_NON_NULL: usize = PREFERRED_VECTOR_SIZE * 2; + +/// Generic aggregation for any primitive type. +/// Returns None if there are no non-null values in `array`. +fn aggregate, A: NumericAccumulator>( + array: &PrimitiveArray

, +) -> Option { + let null_count = array.null_count(); + if null_count == array.len() { + return None; + } + let values = array.values().as_ref(); + match array.nulls() { + Some(nulls) if null_count > 0 => { + // const generics depending on a generic type parameter are not supported + // so we have to match and call aggregate with the corresponding constant + match PREFERRED_VECTOR_SIZE / std::mem::size_of::() { + 64 => Some(aggregate_nullable_lanes::(values, nulls)), + 32 => Some(aggregate_nullable_lanes::(values, nulls)), + 16 => Some(aggregate_nullable_lanes::(values, nulls)), + 8 => Some(aggregate_nullable_lanes::(values, nulls)), + 4 => Some(aggregate_nullable_lanes::(values, nulls)), + 2 => Some(aggregate_nullable_lanes::(values, nulls)), + _ => Some(aggregate_nullable_lanes::(values, nulls)), + } + } + _ => { + let is_float = matches!( + array.data_type(), + DataType::Float16 | DataType::Float32 | DataType::Float64 + ); + if is_float { + match PREFERRED_VECTOR_SIZE_NON_NULL / std::mem::size_of::() { + 64 => Some(aggregate_nonnull_lanes::(values)), + 32 => Some(aggregate_nonnull_lanes::(values)), + 16 => Some(aggregate_nonnull_lanes::(values)), + 8 => Some(aggregate_nonnull_lanes::(values)), + 4 => Some(aggregate_nonnull_lanes::(values)), + 2 => Some(aggregate_nonnull_lanes::(values)), + _ => Some(aggregate_nonnull_simple::(values)), + } + } else { + // for non-null integers its better to not chunk ourselves and instead + // let llvm fully handle loop unrolling and vectorization + Some(aggregate_nonnull_simple::(values)) + } + } + } } /// Returns the minimum value in the boolean array. @@ -230,7 +508,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeType, { - min_max_array_helper::(array, |a, b| (is_nan(*a) & !is_nan(*b)) || a > b, min) + min_max_array_helper::(array, |a, b| a.is_gt(*b), min) } /// Returns the max of values in the array of `ArrowNumericType` type, or dictionary @@ -238,9 +516,9 @@ where pub fn max_array>(array: A) -> Option where T: ArrowNumericType, - T::Native: ArrowNativeType, + T::Native: ArrowNativeTypeOp, { - min_max_array_helper::(array, |a, b| (!is_nan(*a) & is_nan(*b)) || a < b, max) + min_max_array_helper::(array, |a, b| a.is_lt(*b), max) } fn min_max_array_helper, F, M>( @@ -259,66 +537,6 @@ where } } -/// Returns the sum of values in the primitive array. -/// -/// Returns `None` if the array is empty or only contains null values. -/// -/// This doesn't detect overflow. Once overflowing, the result will wrap around. -/// For an overflow-checking variant, use `sum_checked` instead. 
-#[cfg(not(feature = "simd"))] -pub fn sum(array: &PrimitiveArray) -> Option -where - T: ArrowNumericType, - T::Native: ArrowNativeTypeOp, -{ - let null_count = array.null_count(); - - if null_count == array.len() { - return None; - } - - let data: &[T::Native] = array.values(); - - match array.nulls() { - None => { - let sum = data.iter().fold(T::default_value(), |accumulator, value| { - accumulator.add_wrapping(*value) - }); - - Some(sum) - } - Some(nulls) => { - let mut sum = T::default_value(); - let data_chunks = data.chunks_exact(64); - let remainder = data_chunks.remainder(); - - let bit_chunks = nulls.inner().bit_chunks(); - data_chunks - .zip(bit_chunks.iter()) - .for_each(|(chunk, mask)| { - // index_mask has value 1 << i in the loop - let mut index_mask = 1; - chunk.iter().for_each(|value| { - if (mask & index_mask) != 0 { - sum = sum.add_wrapping(*value); - } - index_mask <<= 1; - }); - }); - - let remainder_bits = bit_chunks.remainder_bits(); - - remainder.iter().enumerate().for_each(|(i, value)| { - if remainder_bits & (1 << i) != 0 { - sum = sum.add_wrapping(*value); - } - }); - - Some(sum) - } - } -} - macro_rules! bit_operation { ($NAME:ident, $OP:ident, $NATIVE:ident, $DEFAULT:expr, $DOC:expr) => { #[doc = $DOC] @@ -476,369 +694,35 @@ where } } -#[cfg(feature = "simd")] -mod simd { - use super::is_nan; - use arrow_array::*; - use std::marker::PhantomData; - - pub(super) trait SimdAggregate { - type ScalarAccumulator; - type SimdAccumulator; - - /// Returns the accumulator for aggregating scalar values - fn init_accumulator_scalar() -> Self::ScalarAccumulator; - - /// Returns the accumulator for aggregating simd chunks of values - fn init_accumulator_chunk() -> Self::SimdAccumulator; - - /// Updates the accumulator with the values of one chunk - fn accumulate_chunk_non_null(accumulator: &mut Self::SimdAccumulator, chunk: T::Simd); - - /// Updates the accumulator with the values of one chunk according to the given vector mask - fn accumulate_chunk_nullable( - accumulator: &mut Self::SimdAccumulator, - chunk: T::Simd, - mask: T::SimdMask, - ); - - /// Updates the accumulator with one value - fn accumulate_scalar(accumulator: &mut Self::ScalarAccumulator, value: T::Native); - - /// Reduces the vector lanes of the simd accumulator and the scalar accumulator to a single value - fn reduce( - simd_accumulator: Self::SimdAccumulator, - scalar_accumulator: Self::ScalarAccumulator, - ) -> Option; - } - - pub(super) struct SumAggregate { - phantom: PhantomData, - } - - impl SimdAggregate for SumAggregate - where - T::Native: ArrowNativeTypeOp, - { - type ScalarAccumulator = T::Native; - type SimdAccumulator = T::Simd; - - fn init_accumulator_scalar() -> Self::ScalarAccumulator { - T::default_value() - } - - fn init_accumulator_chunk() -> Self::SimdAccumulator { - T::init(Self::init_accumulator_scalar()) - } - - fn accumulate_chunk_non_null(accumulator: &mut T::Simd, chunk: T::Simd) { - *accumulator = *accumulator + chunk; - } - - fn accumulate_chunk_nullable( - accumulator: &mut T::Simd, - chunk: T::Simd, - vecmask: T::SimdMask, - ) { - let zero = T::init(T::default_value()); - let blended = T::mask_select(vecmask, chunk, zero); - - *accumulator = *accumulator + blended; - } - - fn accumulate_scalar(accumulator: &mut T::Native, value: T::Native) { - *accumulator = accumulator.add_wrapping(value) - } - - fn reduce( - simd_accumulator: Self::SimdAccumulator, - scalar_accumulator: Self::ScalarAccumulator, - ) -> Option { - // we can't use T::lanes() as the slice len because it is not 
const, - // instead always reserve the maximum number of lanes - let mut tmp = [T::default_value(); 64]; - let slice = &mut tmp[0..T::lanes()]; - T::write(simd_accumulator, slice); - - let mut reduced = Self::init_accumulator_scalar(); - slice - .iter() - .for_each(|value| Self::accumulate_scalar(&mut reduced, *value)); - - Self::accumulate_scalar(&mut reduced, scalar_accumulator); - - // result can not be None because we checked earlier for the null count - Some(reduced) - } - } - - pub(super) struct MinAggregate { - phantom: PhantomData, - } - - impl SimdAggregate for MinAggregate - where - T::Native: PartialOrd, - { - type ScalarAccumulator = (T::Native, bool); - type SimdAccumulator = (T::Simd, T::SimdMask); - - fn init_accumulator_scalar() -> Self::ScalarAccumulator { - (T::default_value(), false) - } - - fn init_accumulator_chunk() -> Self::SimdAccumulator { - (T::init(T::default_value()), T::mask_init(false)) - } - - fn accumulate_chunk_non_null(accumulator: &mut Self::SimdAccumulator, chunk: T::Simd) { - let acc_is_nan = !T::eq(accumulator.0, accumulator.0); - let is_lt = acc_is_nan | T::lt(chunk, accumulator.0); - let first_or_lt = !accumulator.1 | is_lt; - - accumulator.0 = T::mask_select(first_or_lt, chunk, accumulator.0); - accumulator.1 = T::mask_init(true); - } - - fn accumulate_chunk_nullable( - accumulator: &mut Self::SimdAccumulator, - chunk: T::Simd, - vecmask: T::SimdMask, - ) { - let acc_is_nan = !T::eq(accumulator.0, accumulator.0); - let is_lt = vecmask & (acc_is_nan | T::lt(chunk, accumulator.0)); - let first_or_lt = !accumulator.1 | is_lt; - - accumulator.0 = T::mask_select(first_or_lt, chunk, accumulator.0); - accumulator.1 |= vecmask; - } - - fn accumulate_scalar(accumulator: &mut Self::ScalarAccumulator, value: T::Native) { - if !accumulator.1 { - accumulator.0 = value; - } else { - let acc_is_nan = is_nan(accumulator.0); - if acc_is_nan || value < accumulator.0 { - accumulator.0 = value - } - } - accumulator.1 = true - } - - fn reduce( - simd_accumulator: Self::SimdAccumulator, - scalar_accumulator: Self::ScalarAccumulator, - ) -> Option { - // we can't use T::lanes() as the slice len because it is not const, - // instead always reserve the maximum number of lanes - let mut tmp = [T::default_value(); 64]; - let slice = &mut tmp[0..T::lanes()]; - T::write(simd_accumulator.0, slice); - - let mut reduced = Self::init_accumulator_scalar(); - slice - .iter() - .enumerate() - .filter(|(i, _value)| T::mask_get(&simd_accumulator.1, *i)) - .for_each(|(_i, value)| Self::accumulate_scalar(&mut reduced, *value)); - - if scalar_accumulator.1 { - Self::accumulate_scalar(&mut reduced, scalar_accumulator.0); - } - - if reduced.1 { - Some(reduced.0) - } else { - None - } - } - } - - pub(super) struct MaxAggregate { - phantom: PhantomData, - } - - impl SimdAggregate for MaxAggregate - where - T::Native: PartialOrd, - { - type ScalarAccumulator = (T::Native, bool); - type SimdAccumulator = (T::Simd, T::SimdMask); - - fn init_accumulator_scalar() -> Self::ScalarAccumulator { - (T::default_value(), false) - } - - fn init_accumulator_chunk() -> Self::SimdAccumulator { - (T::init(T::default_value()), T::mask_init(false)) - } - - fn accumulate_chunk_non_null(accumulator: &mut Self::SimdAccumulator, chunk: T::Simd) { - let chunk_is_nan = !T::eq(chunk, chunk); - let is_gt = chunk_is_nan | T::gt(chunk, accumulator.0); - let first_or_gt = !accumulator.1 | is_gt; - - accumulator.0 = T::mask_select(first_or_gt, chunk, accumulator.0); - accumulator.1 = T::mask_init(true); - } - - fn 
accumulate_chunk_nullable( - accumulator: &mut Self::SimdAccumulator, - chunk: T::Simd, - vecmask: T::SimdMask, - ) { - let chunk_is_nan = !T::eq(chunk, chunk); - let is_gt = vecmask & (chunk_is_nan | T::gt(chunk, accumulator.0)); - let first_or_gt = !accumulator.1 | is_gt; - - accumulator.0 = T::mask_select(first_or_gt, chunk, accumulator.0); - accumulator.1 |= vecmask; - } - - fn accumulate_scalar(accumulator: &mut Self::ScalarAccumulator, value: T::Native) { - if !accumulator.1 { - accumulator.0 = value; - } else { - let value_is_nan = is_nan(value); - if value_is_nan || value > accumulator.0 { - accumulator.0 = value - } - } - accumulator.1 = true; - } - - fn reduce( - simd_accumulator: Self::SimdAccumulator, - scalar_accumulator: Self::ScalarAccumulator, - ) -> Option { - // we can't use T::lanes() as the slice len because it is not const, - // instead always reserve the maximum number of lanes - let mut tmp = [T::default_value(); 64]; - let slice = &mut tmp[0..T::lanes()]; - T::write(simd_accumulator.0, slice); - - let mut reduced = Self::init_accumulator_scalar(); - slice - .iter() - .enumerate() - .filter(|(i, _value)| T::mask_get(&simd_accumulator.1, *i)) - .for_each(|(_i, value)| Self::accumulate_scalar(&mut reduced, *value)); - - if scalar_accumulator.1 { - Self::accumulate_scalar(&mut reduced, scalar_accumulator.0); - } - - if reduced.1 { - Some(reduced.0) - } else { - None - } - } - } - - pub(super) fn simd_aggregation>( - array: &PrimitiveArray, - ) -> Option { - let null_count = array.null_count(); - - if null_count == array.len() { - return None; - } - - let data: &[T::Native] = array.values(); - - let mut chunk_acc = A::init_accumulator_chunk(); - let mut rem_acc = A::init_accumulator_scalar(); - - match array.nulls() { - None => { - let data_chunks = data.chunks_exact(64); - let remainder = data_chunks.remainder(); - - data_chunks.for_each(|chunk| { - chunk.chunks_exact(T::lanes()).for_each(|chunk| { - let chunk = T::load(&chunk); - A::accumulate_chunk_non_null(&mut chunk_acc, chunk); - }); - }); - - remainder.iter().for_each(|value| { - A::accumulate_scalar(&mut rem_acc, *value); - }); - } - Some(nulls) => { - // process data in chunks of 64 elements since we also get 64 bits of validity information at a time - let data_chunks = data.chunks_exact(64); - let remainder = data_chunks.remainder(); - - let bit_chunks = nulls.inner().bit_chunks(); - let remainder_bits = bit_chunks.remainder_bits(); - - data_chunks.zip(bit_chunks).for_each(|(chunk, mut mask)| { - // split chunks further into slices corresponding to the vector length - // the compiler is able to unroll this inner loop and remove bounds checks - // since the outer chunk size (64) is always a multiple of the number of lanes - chunk.chunks_exact(T::lanes()).for_each(|chunk| { - let vecmask = T::mask_from_u64(mask); - let chunk = T::load(&chunk); - - A::accumulate_chunk_nullable(&mut chunk_acc, chunk, vecmask); - - // skip the shift and avoid overflow for u8 type, which uses 64 lanes. - mask >>= T::lanes() % 64; - }); - }); - - remainder.iter().enumerate().for_each(|(i, value)| { - if remainder_bits & (1 << i) != 0 { - A::accumulate_scalar(&mut rem_acc, *value) - } - }); - } - } - - A::reduce(chunk_acc, rem_acc) - } -} - /// Returns the sum of values in the primitive array. /// /// Returns `None` if the array is empty or only contains null values. /// /// This doesn't detect overflow in release mode by default. Once overflowing, the result will /// wrap around. 
For an overflow-checking variant, use `sum_checked` instead. -#[cfg(feature = "simd")] pub fn sum(array: &PrimitiveArray) -> Option where T::Native: ArrowNativeTypeOp, { - use simd::*; - - simd::simd_aggregation::>(&array) + aggregate::>(array) } -#[cfg(feature = "simd")] /// Returns the minimum value in the array, according to the natural order. /// For floating point arrays any NaN values are considered to be greater than any other non-null value pub fn min(array: &PrimitiveArray) -> Option where T::Native: PartialOrd, { - use simd::*; - - simd::simd_aggregation::>(&array) + aggregate::>(array) } -#[cfg(feature = "simd")] /// Returns the maximum value in the array, according to the natural order. /// For floating point arrays any NaN values are considered to be greater than any other non-null value pub fn max(array: &PrimitiveArray) -> Option where T::Native: PartialOrd, { - use simd::*; - - simd::simd_aggregation::>(&array) + aggregate::>(array) } #[cfg(test)] @@ -872,8 +756,41 @@ mod tests { assert_eq!(None, sum(&a)); } + #[test] + fn test_primitive_array_sum_large_float_64() { + let c = Float64Array::new((1..=100).map(|x| x as f64).collect(), None); + assert_eq!(Some((1..=100).sum::() as f64), sum(&c)); + + // create an array that actually has non-zero values at the invalid indices + let validity = NullBuffer::new((1..=100).map(|x| x % 3 == 0).collect()); + let c = Float64Array::new((1..=100).map(|x| x as f64).collect(), Some(validity)); + + assert_eq!( + Some((1..=100).filter(|i| i % 3 == 0).sum::() as f64), + sum(&c) + ); + } + + #[test] + fn test_primitive_array_sum_large_float_32() { + let c = Float32Array::new((1..=100).map(|x| x as f32).collect(), None); + assert_eq!(Some((1..=100).sum::() as f32), sum(&c)); + + // create an array that actually has non-zero values at the invalid indices + let validity = NullBuffer::new((1..=100).map(|x| x % 3 == 0).collect()); + let c = Float32Array::new((1..=100).map(|x| x as f32).collect(), Some(validity)); + + assert_eq!( + Some((1..=100).filter(|i| i % 3 == 0).sum::() as f32), + sum(&c) + ); + } + #[test] fn test_primitive_array_sum_large_64() { + let c = Int64Array::new((1..=100).collect(), None); + assert_eq!(Some((1..=100).sum()), sum(&c)); + // create an array that actually has non-zero values at the invalid indices let validity = NullBuffer::new((1..=100).map(|x| x % 3 == 0).collect()); let c = Int64Array::new((1..=100).collect(), Some(validity)); @@ -883,6 +800,9 @@ mod tests { #[test] fn test_primitive_array_sum_large_32() { + let c = Int32Array::new((1..=100).collect(), None); + assert_eq!(Some((1..=100).sum()), sum(&c)); + // create an array that actually has non-zero values at the invalid indices let validity = NullBuffer::new((1..=100).map(|x| x % 3 == 0).collect()); let c = Int32Array::new((1..=100).collect(), Some(validity)); @@ -891,6 +811,9 @@ mod tests { #[test] fn test_primitive_array_sum_large_16() { + let c = Int16Array::new((1..=100).collect(), None); + assert_eq!(Some((1..=100).sum()), sum(&c)); + // create an array that actually has non-zero values at the invalid indices let validity = NullBuffer::new((1..=100).map(|x| x % 3 == 0).collect()); let c = Int16Array::new((1..=100).collect(), Some(validity)); @@ -899,11 +822,23 @@ mod tests { #[test] fn test_primitive_array_sum_large_8() { - // include fewer values than other large tests so the result does not overflow the u8 + let c = UInt8Array::new((1..=100).collect(), None); + assert_eq!( + Some((1..=100).fold(0_u8, |a, x| a.wrapping_add(x))), + sum(&c) + ); + // create 
an array that actually has non-zero values at the invalid indices - let validity = NullBuffer::new((1..=100).map(|x| x % 33 == 0).collect()); + let validity = NullBuffer::new((1..=100).map(|x| x % 3 == 0).collect()); let c = UInt8Array::new((1..=100).collect(), Some(validity)); - assert_eq!(Some((1..=100).filter(|i| i % 33 == 0).sum()), sum(&c)); + assert_eq!( + Some( + (1..=100) + .filter(|i| i % 3 == 0) + .fold(0_u8, |a, x| a.wrapping_add(x)) + ), + sum(&c) + ); } #[test] @@ -1103,6 +1038,19 @@ mod tests { assert!(min(&a).unwrap().is_nan()); } + #[test] + fn test_primitive_min_max_float_negative_nan() { + let a: Float64Array = + Float64Array::from(vec![f64::NEG_INFINITY, f64::NAN, f64::INFINITY, -f64::NAN]); + let max = max(&a).unwrap(); + let min = min(&a).unwrap(); + assert!(max.is_nan()); + assert!(max.is_sign_positive()); + + assert!(min.is_nan()); + assert!(min.is_sign_negative()); + } + #[test] fn test_primitive_min_max_float_first_nan_nonnull() { let a: Float64Array = (0..100) @@ -1455,7 +1403,6 @@ mod tests { } #[test] - #[cfg(not(feature = "simd"))] fn test_sum_overflow() { let a = Int32Array::from(vec![i32::MAX, 1]); diff --git a/arrow-array/src/arithmetic.rs b/arrow-array/src/arithmetic.rs index c9be39d44144..590536190309 100644 --- a/arrow-array/src/arithmetic.rs +++ b/arrow-array/src/arithmetic.rs @@ -45,6 +45,16 @@ pub trait ArrowNativeTypeOp: ArrowNativeType { /// The multiplicative identity const ONE: Self; + /// The minimum value and identity for the `max` aggregation. + /// Note that the aggregation uses the total order predicate for floating point values, + /// which means that this value is a negative NaN. + const MIN_TOTAL_ORDER: Self; + + /// The maximum value and identity for the `min` aggregation. + /// Note that the aggregation uses the total order predicate for floating point values, + /// which means that this value is a positive NaN. + const MAX_TOTAL_ORDER: Self; + /// Checked addition operation fn add_checked(self, rhs: Self) -> Result; @@ -129,12 +139,14 @@ pub trait ArrowNativeTypeOp: ArrowNativeType { macro_rules! native_type_op { ($t:tt) => { - native_type_op!($t, 0, 1); + native_type_op!($t, 0, 1, $t::MIN, $t::MAX); }; - ($t:tt, $zero:expr, $one: expr) => { + ($t:tt, $zero:expr, $one: expr, $min: expr, $max: expr) => { impl ArrowNativeTypeOp for $t { const ZERO: Self = $zero; const ONE: Self = $one; + const MIN_TOTAL_ORDER: Self = $min; + const MAX_TOTAL_ORDER: Self = $max; #[inline] fn add_checked(self, rhs: Self) -> Result { @@ -270,13 +282,15 @@ native_type_op!(u8); native_type_op!(u16); native_type_op!(u32); native_type_op!(u64); -native_type_op!(i256, i256::ZERO, i256::ONE); +native_type_op!(i256, i256::ZERO, i256::ONE, i256::MIN, i256::MAX); macro_rules! native_type_float_op { - ($t:tt, $zero:expr, $one:expr) => { + ($t:tt, $zero:expr, $one:expr, $min:expr, $max:expr) => { impl ArrowNativeTypeOp for $t { const ZERO: Self = $zero; const ONE: Self = $one; + const MIN_TOTAL_ORDER: Self = $min; + const MAX_TOTAL_ORDER: Self = $max; #[inline] fn add_checked(self, rhs: Self) -> Result { @@ -377,9 +391,30 @@ macro_rules! native_type_float_op { }; } -native_type_float_op!(f16, f16::ZERO, f16::ONE); -native_type_float_op!(f32, 0., 1.); -native_type_float_op!(f64, 0., 1.); +// the smallest/largest bit patterns for floating point numbers are NaN, but differ from the canonical NAN constants. +// See test_float_total_order_min_max for details. 
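// [Editor's note, not part of the patch] A minimal standalone sketch of why the extreme
// bit patterns under the IEEE 754 totalOrder predicate are NaNs rather than the infinities,
// which is what the comment above alludes to. It uses only the standard library
// (`f64::from_bits`, `f64::total_cmp`); the local names below are illustrative and are not
// part of the arrow-rs API.
fn main() {
    // The all-ones bit pattern (-1_i64 reinterpreted as u64) is a negative NaN:
    // sign bit set, exponent all ones, non-zero mantissa.
    let min_total_order = f64::from_bits(u64::MAX);
    assert!(min_total_order.is_nan() && min_total_order.is_sign_negative());

    // i64::MAX reinterpreted as u64 is a positive NaN with an all-ones mantissa.
    let max_total_order = f64::from_bits(i64::MAX as u64);
    assert!(max_total_order.is_nan() && max_total_order.is_sign_positive());

    // Under total_cmp (Rust's implementation of totalOrder), these values sort below
    // negative infinity and above positive infinity, so they can serve as identities
    // for min/max aggregations over arbitrary float values, NaNs included.
    use std::cmp::Ordering;
    assert_eq!(min_total_order.total_cmp(&f64::NEG_INFINITY), Ordering::Less);
    assert_eq!(max_total_order.total_cmp(&f64::INFINITY), Ordering::Greater);
}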
+native_type_float_op!( + f16, + f16::ZERO, + f16::ONE, + f16::from_bits(-1 as _), + f16::from_bits(i16::MAX as _) +); +// from_bits is not yet stable as const fn, see https://github.com/rust-lang/rust/issues/72447 +native_type_float_op!( + f32, + 0., + 1., + unsafe { std::mem::transmute(-1_i32) }, + unsafe { std::mem::transmute(i32::MAX) } +); +native_type_float_op!( + f64, + 0., + 1., + unsafe { std::mem::transmute(-1_i64) }, + unsafe { std::mem::transmute(i64::MAX) } +); #[cfg(test)] mod tests { @@ -780,4 +815,40 @@ mod tests { assert_eq!(8.0_f32.pow_checked(2_u32).unwrap(), 64_f32); assert_eq!(8.0_f64.pow_checked(2_u32).unwrap(), 64_f64); } + + #[test] + fn test_float_total_order_min_max() { + assert!(::MIN_TOTAL_ORDER.is_lt(f64::NEG_INFINITY)); + assert!(::MAX_TOTAL_ORDER.is_gt(f64::INFINITY)); + + assert!(::MIN_TOTAL_ORDER.is_nan()); + assert!(::MIN_TOTAL_ORDER.is_sign_negative()); + assert!(::MIN_TOTAL_ORDER.is_lt(-f64::NAN)); + + assert!(::MAX_TOTAL_ORDER.is_nan()); + assert!(::MAX_TOTAL_ORDER.is_sign_positive()); + assert!(::MAX_TOTAL_ORDER.is_gt(f64::NAN)); + + assert!(::MIN_TOTAL_ORDER.is_lt(f32::NEG_INFINITY)); + assert!(::MAX_TOTAL_ORDER.is_gt(f32::INFINITY)); + + assert!(::MIN_TOTAL_ORDER.is_nan()); + assert!(::MIN_TOTAL_ORDER.is_sign_negative()); + assert!(::MIN_TOTAL_ORDER.is_lt(-f32::NAN)); + + assert!(::MAX_TOTAL_ORDER.is_nan()); + assert!(::MAX_TOTAL_ORDER.is_sign_positive()); + assert!(::MAX_TOTAL_ORDER.is_gt(f32::NAN)); + + assert!(::MIN_TOTAL_ORDER.is_lt(f16::NEG_INFINITY)); + assert!(::MAX_TOTAL_ORDER.is_gt(f16::INFINITY)); + + assert!(::MIN_TOTAL_ORDER.is_nan()); + assert!(::MIN_TOTAL_ORDER.is_sign_negative()); + assert!(::MIN_TOTAL_ORDER.is_lt(-f16::NAN)); + + assert!(::MAX_TOTAL_ORDER.is_nan()); + assert!(::MAX_TOTAL_ORDER.is_sign_positive()); + assert!(::MAX_TOTAL_ORDER.is_gt(f16::NAN)); + } } diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index c651edcad92e..1589cc5b102b 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -90,6 +90,7 @@ impl BooleanBuffer { /// Returns a `BitChunks` instance which can be used to iterate over /// this buffer's bits in `u64` chunks + #[inline] pub fn bit_chunks(&self) -> BitChunks { BitChunks::new(self.values(), self.offset, self.len) } From d41e90e31d07bc2ad2f05ffc091f171e6c846ddf Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 8 Dec 2023 08:49:46 +0000 Subject: [PATCH 1398/1411] Remove SIMD Feature (#5184) * Remove SIMD feature * Clippy * Tweak WASM features * Fix test * Exclude pyarrow from general build --- .github/workflows/arrow.yml | 50 +-- .github/workflows/miri.sh | 2 +- arrow-arith/Cargo.toml | 3 - arrow-array/Cargo.toml | 4 - arrow-array/src/numeric.rs | 614 +----------------------------- arrow-buffer/src/util/bit_util.rs | 58 +-- arrow-ord/src/comparison.rs | 78 +--- arrow/CONTRIBUTING.md | 12 - arrow/Cargo.toml | 1 - arrow/README.md | 5 +- arrow/src/ffi.rs | 10 +- arrow/src/pyarrow.rs | 4 +- arrow/tests/array_transform.rs | 1 + 13 files changed, 34 insertions(+), 808 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index da56c23b5cd9..d3b2526740fa 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -67,8 +67,8 @@ jobs: run: cargo test -p arrow-data --all-features - name: Test arrow-schema with all features run: cargo test -p arrow-schema --all-features - - name: Test arrow-array with all features except SIMD - run: cargo 
test -p arrow-array + - name: Test arrow-array with all features + run: cargo test -p arrow-array --all-features - name: Test arrow-select with all features run: cargo test -p arrow-select --all-features - name: Test arrow-cast with all features @@ -85,15 +85,15 @@ jobs: run: cargo test -p arrow-string --all-features - name: Test arrow-ord with all features run: cargo test -p arrow-ord --all-features - - name: Test arrow-arith with all features except SIMD - run: cargo test -p arrow-arith + - name: Test arrow-arith with all features + run: cargo test -p arrow-arith --all-features - name: Test arrow-row with all features run: cargo test -p arrow-row --all-features - name: Test arrow-integration-test with all features run: cargo test -p arrow-integration-test --all-features - name: Test arrow with default features run: cargo test -p arrow - - name: Test arrow with all features apart from simd + - name: Test arrow with all features except pyarrow run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,chrono-tz - name: Run examples run: | @@ -132,29 +132,6 @@ jobs: - name: Check compilation --no-default-features --all-targets --features chrono-tz run: cargo check -p arrow --no-default-features --all-targets --features chrono-tz - # test the --features "simd" of the arrow crate. This requires nightly Rust. - linux-test-simd: - name: Test SIMD on AMD64 Rust ${{ matrix.rust }} - runs-on: ubuntu-latest - container: - image: amd64/rust - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: Setup Rust toolchain - uses: ./.github/actions/setup-builder - with: - rust-version: nightly - - name: Test arrow-array with SIMD - run: cargo test -p arrow-array --features simd - - name: Test arrow-arith with SIMD - run: cargo test -p arrow-arith --features simd - - name: Test arrow with SIMD - run: cargo test -p arrow --features simd - - name: Check compilation --features simd --all-targets - run: cargo check -p arrow --features simd --all-targets - # test the arrow crate builds against wasm32 in nightly rust wasm32-build: @@ -169,12 +146,11 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: - rust-version: nightly target: wasm32-unknown-unknown,wasm32-wasi - name: Build wasm32-unknown-unknown - run: cargo build -p arrow --no-default-features --features=json,csv,ipc,simd,ffi --target wasm32-unknown-unknown + run: cargo build -p arrow --no-default-features --features=json,csv,ipc,ffi --target wasm32-unknown-unknown - name: Build wasm32-wasi - run: cargo build -p arrow --no-default-features --features=json,csv,ipc,simd,ffi --target wasm32-wasi + run: cargo build -p arrow --no-default-features --features=json,csv,ipc,ffi --target wasm32-wasi clippy: name: Clippy @@ -193,8 +169,8 @@ jobs: run: cargo clippy -p arrow-data --all-targets --all-features -- -D warnings - name: Clippy arrow-schema with all features run: cargo clippy -p arrow-schema --all-targets --all-features -- -D warnings - - name: Clippy arrow-array with all features except SIMD - run: cargo clippy -p arrow-array --all-targets -- -D warnings + - name: Clippy arrow-array with all features + run: cargo clippy -p arrow-array --all-targets --all-features -- -D warnings - name: Clippy arrow-select with all features run: cargo clippy -p arrow-select --all-targets --all-features -- -D warnings - name: Clippy arrow-cast with all features @@ -211,12 +187,12 @@ jobs: run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings - name: Clippy arrow-ord with all 
features run: cargo clippy -p arrow-ord --all-targets --all-features -- -D warnings - - name: Clippy arrow-arith with all features except SIMD - run: cargo clippy -p arrow-arith --all-targets -- -D warnings + - name: Clippy arrow-arith with all features + run: cargo clippy -p arrow-arith --all-targets --all-features -- -D warnings - name: Clippy arrow-row with all features run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings - - name: Clippy arrow with all features except SIMD - run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,chrono-tz --all-targets -- -D warnings + - name: Clippy arrow with all features + run: cargo clippy -p arrow --all-features --all-targets -- -D warnings - name: Clippy arrow-integration-test with all features run: cargo clippy -p arrow-integration-test --all-targets --all-features -- -D warnings - name: Clippy arrow-integration-testing with all features diff --git a/.github/workflows/miri.sh b/.github/workflows/miri.sh index ec8712660c74..5057c876b952 100755 --- a/.github/workflows/miri.sh +++ b/.github/workflows/miri.sh @@ -14,5 +14,5 @@ cargo miri test -p arrow-buffer cargo miri test -p arrow-data --features ffi cargo miri test -p arrow-schema --features ffi cargo miri test -p arrow-array -cargo miri test -p arrow-arith --features simd +cargo miri test -p arrow-arith cargo miri test -p arrow-ord diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index 57dc033e9645..d2ee0b9e2c72 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -43,6 +43,3 @@ half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } [dev-dependencies] - -[features] -simd = ["arrow-array/simd"] diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 4f7ab24f9708..04eec8df6379 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -49,10 +49,6 @@ chrono-tz = { version = "0.8", optional = true } num = { version = "0.4.1", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", default-features = false } -packed_simd = { version = "0.3.9", default-features = false, optional = true } - -[features] -simd = ["packed_simd"] [dev-dependencies] rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow-array/src/numeric.rs b/arrow-array/src/numeric.rs index b5e474ba696a..a3cd7bde5d36 100644 --- a/arrow-array/src/numeric.rs +++ b/arrow-array/src/numeric.rs @@ -15,621 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::types::*; use crate::ArrowPrimitiveType; -#[cfg(feature = "simd")] -use packed_simd::*; -#[cfg(feature = "simd")] -use std::ops::{Add, BitAnd, BitAndAssign, BitOr, BitOrAssign, Div, Mul, Not, Rem, Sub}; /// A subtype of primitive type that represents numeric values. -/// -/// SIMD operations are defined in this trait if available on the target system. 
-#[cfg(feature = "simd")] -pub trait ArrowNumericType: ArrowPrimitiveType -where - Self::Simd: Add - + Sub - + Mul - + Div - + Rem - + Copy, - Self::SimdMask: BitAnd - + BitOr - + BitAndAssign - + BitOrAssign - + Not - + Copy, -{ - /// Defines the SIMD type that should be used for this numeric type - type Simd; - - /// Defines the SIMD Mask type that should be used for this numeric type - type SimdMask; - - /// The number of SIMD lanes available - fn lanes() -> usize; - - /// Initializes a SIMD register to a constant value - fn init(value: Self::Native) -> Self::Simd; - - /// Loads a slice into a SIMD register - fn load(slice: &[Self::Native]) -> Self::Simd; - - /// Creates a new SIMD mask for this SIMD type filling it with `value` - fn mask_init(value: bool) -> Self::SimdMask; - - /// Creates a new SIMD mask for this SIMD type from the lower-most bits of the given `mask`. - /// The number of bits used corresponds to the number of lanes of this type - fn mask_from_u64(mask: u64) -> Self::SimdMask; - - /// Creates a bitmask from the given SIMD mask. - /// Each bit corresponds to one vector lane, starting with the least-significant bit. - fn mask_to_u64(mask: &Self::SimdMask) -> u64; - - /// Gets the value of a single lane in a SIMD mask - fn mask_get(mask: &Self::SimdMask, idx: usize) -> bool; - - /// Sets the value of a single lane of a SIMD mask - fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask; - - /// Selects elements of `a` and `b` using `mask` - fn mask_select(mask: Self::SimdMask, a: Self::Simd, b: Self::Simd) -> Self::Simd; - - /// Returns `true` if any of the lanes in the mask are `true` - fn mask_any(mask: Self::SimdMask) -> bool; - - /// Performs a SIMD binary operation - fn bin_op Self::Simd>( - left: Self::Simd, - right: Self::Simd, - op: F, - ) -> Self::Simd; - - /// SIMD version of equal - fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; - - /// SIMD version of not equal - fn ne(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; - - /// SIMD version of less than - fn lt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; - - /// SIMD version of less than or equal to - fn le(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; - - /// SIMD version of greater than - fn gt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; - - /// SIMD version of greater than or equal to - fn ge(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; - - /// Writes a SIMD result back to a slice - fn write(simd_result: Self::Simd, slice: &mut [Self::Native]); - - /// Performs a SIMD unary operation - fn unary_op Self::Simd>(a: Self::Simd, op: F) -> Self::Simd; -} - -/// A subtype of primitive type that represents numeric values. -#[cfg(not(feature = "simd"))] pub trait ArrowNumericType: ArrowPrimitiveType {} -macro_rules! 
make_numeric_type { - ($impl_ty:ty, $native_ty:ty, $simd_ty:ident, $simd_mask_ty:ident) => { - #[cfg(feature = "simd")] - impl ArrowNumericType for $impl_ty { - type Simd = $simd_ty; - - type SimdMask = $simd_mask_ty; - - #[inline] - fn lanes() -> usize { - Self::Simd::lanes() - } - - #[inline] - fn init(value: Self::Native) -> Self::Simd { - Self::Simd::splat(value) - } - - #[inline] - fn load(slice: &[Self::Native]) -> Self::Simd { - unsafe { Self::Simd::from_slice_unaligned_unchecked(slice) } - } - - #[inline] - fn mask_init(value: bool) -> Self::SimdMask { - Self::SimdMask::splat(value) - } - - #[inline] - fn mask_from_u64(mask: u64) -> Self::SimdMask { - // this match will get removed by the compiler since the number of lanes is known at - // compile-time for each concrete numeric type - match Self::lanes() { - 4 => { - // the bit position in each lane indicates the index of that lane - let vecidx = i128x4::new(1, 2, 4, 8); - - // broadcast the lowermost 8 bits of mask to each lane - let vecmask = i128x4::splat((mask & 0x0F) as i128); - // compute whether the bit corresponding to each lanes index is set - let vecmask = (vecidx & vecmask).eq(vecidx); - - // transmute is necessary because the different match arms return different - // mask types, at runtime only one of those expressions will exist per type, - // with the type being equal to `SimdMask`. - unsafe { std::mem::transmute(vecmask) } - } - 8 => { - // the bit position in each lane indicates the index of that lane - let vecidx = i64x8::new(1, 2, 4, 8, 16, 32, 64, 128); - - // broadcast the lowermost 8 bits of mask to each lane - let vecmask = i64x8::splat((mask & 0xFF) as i64); - // compute whether the bit corresponding to each lanes index is set - let vecmask = (vecidx & vecmask).eq(vecidx); - - // transmute is necessary because the different match arms return different - // mask types, at runtime only one of those expressions will exist per type, - // with the type being equal to `SimdMask`. 
- unsafe { std::mem::transmute(vecmask) } - } - 16 => { - // same general logic as for 8 lanes, extended to 16 bits - let vecidx = i32x16::new( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, - 32768, - ); - - let vecmask = i32x16::splat((mask & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - unsafe { std::mem::transmute(vecmask) } - } - 32 => { - // compute two separate m32x16 vector masks from from the lower-most 32 bits of `mask` - // and then combine them into one m16x32 vector mask by writing and reading a temporary - let tmp = &mut [0_i16; 32]; - - let vecidx = i32x16::new( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, - 32768, - ); - - let vecmask = i32x16::splat((mask & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - i16x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[0..16]); - - let vecmask = i32x16::splat(((mask >> 16) & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - i16x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[16..32]); - - unsafe { std::mem::transmute(i16x32::from_slice_unaligned(tmp)) } - } - 64 => { - // compute four m32x16 vector masks from from all 64 bits of `mask` - // and convert them into one m8x64 vector mask by writing and reading a temporary - let tmp = &mut [0_i8; 64]; - - let vecidx = i32x16::new( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, - 32768, - ); - - let vecmask = i32x16::splat((mask & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - i8x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[0..16]); - - let vecmask = i32x16::splat(((mask >> 16) & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - i8x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[16..32]); - - let vecmask = i32x16::splat(((mask >> 32) & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - i8x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[32..48]); - - let vecmask = i32x16::splat(((mask >> 48) & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - i8x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[48..64]); - - unsafe { std::mem::transmute(i8x64::from_slice_unaligned(tmp)) } - } - _ => panic!("Invalid number of vector lanes"), - } - } - - #[inline] - fn mask_to_u64(mask: &Self::SimdMask) -> u64 { - mask.bitmask() as u64 - } - - #[inline] - fn mask_get(mask: &Self::SimdMask, idx: usize) -> bool { - unsafe { mask.extract_unchecked(idx) } - } - - #[inline] - fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask { - unsafe { mask.replace_unchecked(idx, value) } - } - - /// Selects elements of `a` and `b` using `mask` - #[inline] - fn mask_select(mask: Self::SimdMask, a: Self::Simd, b: Self::Simd) -> Self::Simd { - mask.select(a, b) - } - - #[inline] - fn mask_any(mask: Self::SimdMask) -> bool { - mask.any() - } - - #[inline] - fn bin_op Self::Simd>( - left: Self::Simd, - right: Self::Simd, - op: F, - ) -> Self::Simd { - op(left, right) - } - - #[inline] - fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.eq(right) - } - - #[inline] - fn ne(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.ne(right) - } - - #[inline] - fn lt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.lt(right) - } - - #[inline] - fn le(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.le(right) - } - - #[inline] - fn gt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - 
left.gt(right) - } - - #[inline] - fn ge(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.ge(right) - } - - #[inline] - fn write(simd_result: Self::Simd, slice: &mut [Self::Native]) { - unsafe { simd_result.write_to_slice_unaligned_unchecked(slice) }; - } - - #[inline] - fn unary_op Self::Simd>(a: Self::Simd, op: F) -> Self::Simd { - op(a) - } - } - - #[cfg(not(feature = "simd"))] - impl ArrowNumericType for $impl_ty {} - }; -} - -make_numeric_type!(Int8Type, i8, i8x64, m8x64); -make_numeric_type!(Int16Type, i16, i16x32, m16x32); -make_numeric_type!(Int32Type, i32, i32x16, m32x16); -make_numeric_type!(Int64Type, i64, i64x8, m64x8); -make_numeric_type!(UInt8Type, u8, u8x64, m8x64); -make_numeric_type!(UInt16Type, u16, u16x32, m16x32); -make_numeric_type!(UInt32Type, u32, u32x16, m32x16); -make_numeric_type!(UInt64Type, u64, u64x8, m64x8); -make_numeric_type!(Float32Type, f32, f32x16, m32x16); -make_numeric_type!(Float64Type, f64, f64x8, m64x8); - -make_numeric_type!(TimestampSecondType, i64, i64x8, m64x8); -make_numeric_type!(TimestampMillisecondType, i64, i64x8, m64x8); -make_numeric_type!(TimestampMicrosecondType, i64, i64x8, m64x8); -make_numeric_type!(TimestampNanosecondType, i64, i64x8, m64x8); -make_numeric_type!(Date32Type, i32, i32x16, m32x16); -make_numeric_type!(Date64Type, i64, i64x8, m64x8); -make_numeric_type!(Time32SecondType, i32, i32x16, m32x16); -make_numeric_type!(Time32MillisecondType, i32, i32x16, m32x16); -make_numeric_type!(Time64MicrosecondType, i64, i64x8, m64x8); -make_numeric_type!(Time64NanosecondType, i64, i64x8, m64x8); -make_numeric_type!(IntervalYearMonthType, i32, i32x16, m32x16); -make_numeric_type!(IntervalDayTimeType, i64, i64x8, m64x8); -make_numeric_type!(IntervalMonthDayNanoType, i128, i128x4, m128x4); -make_numeric_type!(DurationSecondType, i64, i64x8, m64x8); -make_numeric_type!(DurationMillisecondType, i64, i64x8, m64x8); -make_numeric_type!(DurationMicrosecondType, i64, i64x8, m64x8); -make_numeric_type!(DurationNanosecondType, i64, i64x8, m64x8); -make_numeric_type!(Decimal128Type, i128, i128x4, m128x4); - -#[cfg(not(feature = "simd"))] -impl ArrowNumericType for Float16Type {} - -#[cfg(feature = "simd")] -impl ArrowNumericType for Float16Type { - type Simd = ::Simd; - type SimdMask = ::SimdMask; - - fn lanes() -> usize { - Float32Type::lanes() - } - - fn init(value: Self::Native) -> Self::Simd { - Float32Type::init(value.to_f32()) - } - - fn load(slice: &[Self::Native]) -> Self::Simd { - let mut s = [0_f32; Self::Simd::lanes()]; - s.iter_mut().zip(slice).for_each(|(o, a)| *o = a.to_f32()); - Float32Type::load(&s) - } - - fn mask_init(value: bool) -> Self::SimdMask { - Float32Type::mask_init(value) - } - - fn mask_from_u64(mask: u64) -> Self::SimdMask { - Float32Type::mask_from_u64(mask) - } - - fn mask_to_u64(mask: &Self::SimdMask) -> u64 { - Float32Type::mask_to_u64(mask) - } - - fn mask_get(mask: &Self::SimdMask, idx: usize) -> bool { - Float32Type::mask_get(mask, idx) - } - - fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask { - Float32Type::mask_set(mask, idx, value) - } - - fn mask_select(mask: Self::SimdMask, a: Self::Simd, b: Self::Simd) -> Self::Simd { - Float32Type::mask_select(mask, a, b) - } - - fn mask_any(mask: Self::SimdMask) -> bool { - Float32Type::mask_any(mask) - } - - fn bin_op Self::Simd>( - left: Self::Simd, - right: Self::Simd, - op: F, - ) -> Self::Simd { - op(left, right) - } - - fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - Float32Type::eq(left, right) - } - - fn 
ne(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - Float32Type::ne(left, right) - } - - fn lt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - Float32Type::lt(left, right) - } - - fn le(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - Float32Type::le(left, right) - } - - fn gt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - Float32Type::gt(left, right) - } - - fn ge(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - Float32Type::ge(left, right) - } - - fn write(simd_result: Self::Simd, slice: &mut [Self::Native]) { - let mut s = [0_f32; Self::Simd::lanes()]; - Float32Type::write(simd_result, &mut s); - slice - .iter_mut() - .zip(s) - .for_each(|(o, i)| *o = half::f16::from_f32(i)) - } - - fn unary_op Self::Simd>(a: Self::Simd, op: F) -> Self::Simd { - Float32Type::unary_op(a, op) - } -} - -#[cfg(not(feature = "simd"))] -impl ArrowNumericType for Decimal256Type {} - -#[cfg(feature = "simd")] -impl ArrowNumericType for Decimal256Type { - type Simd = arrow_buffer::i256; - type SimdMask = bool; - - fn lanes() -> usize { - 1 - } - - fn init(value: Self::Native) -> Self::Simd { - value - } - - fn load(slice: &[Self::Native]) -> Self::Simd { - slice[0] - } - - fn mask_init(value: bool) -> Self::SimdMask { - value - } - - fn mask_from_u64(mask: u64) -> Self::SimdMask { - mask != 0 - } - - fn mask_to_u64(mask: &Self::SimdMask) -> u64 { - *mask as u64 - } - - fn mask_get(mask: &Self::SimdMask, _idx: usize) -> bool { - *mask - } - - fn mask_set(_mask: Self::SimdMask, _idx: usize, value: bool) -> Self::SimdMask { - value - } - - fn mask_select(mask: Self::SimdMask, a: Self::Simd, b: Self::Simd) -> Self::Simd { - match mask { - true => a, - false => b, - } - } - - fn mask_any(mask: Self::SimdMask) -> bool { - mask - } - - fn bin_op Self::Simd>( - left: Self::Simd, - right: Self::Simd, - op: F, - ) -> Self::Simd { - op(left, right) - } - - fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.eq(&right) - } - - fn ne(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.ne(&right) - } - - fn lt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.lt(&right) - } - - fn le(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.le(&right) - } - - fn gt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.gt(&right) - } - - fn ge(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.ge(&right) - } - - fn write(simd_result: Self::Simd, slice: &mut [Self::Native]) { - slice[0] = simd_result - } - - fn unary_op Self::Simd>(a: Self::Simd, op: F) -> Self::Simd { - op(a) - } -} - -#[cfg(all(test, feature = "simd"))] -mod tests { - use super::*; - use FromCast; - - /// calculate the expected mask by iterating over all bits - macro_rules! 
expected_mask { - ($T:ty, $MASK:expr) => {{ - let mask = $MASK; - // simd width of all types is currently 64 bytes -> 512 bits - let lanes = 64 / std::mem::size_of::<$T>(); - // translate each set bit into a value of all ones (-1) of the correct type - (0..lanes) - .map(|i| (if (mask & (1 << i)) != 0 { -1 } else { 0 })) - .collect::>() - }}; - } - - #[test] - fn test_mask_i128() { - let mask = 0b1101; - let actual = IntervalMonthDayNanoType::mask_from_u64(mask); - let expected = expected_mask!(i128, mask); - let expected = m128x4::from_cast(i128x4::from_slice_unaligned(expected.as_slice())); - - assert_eq!(expected, actual); - } - - #[test] - fn test_mask_f64() { - let mask = 0b10101010; - let actual = Float64Type::mask_from_u64(mask); - let expected = expected_mask!(i64, mask); - let expected = m64x8::from_cast(i64x8::from_slice_unaligned(expected.as_slice())); - - assert_eq!(expected, actual); - } - - #[test] - fn test_mask_u64() { - let mask = 0b01010101; - let actual = Int64Type::mask_from_u64(mask); - let expected = expected_mask!(i64, mask); - let expected = m64x8::from_cast(i64x8::from_slice_unaligned(expected.as_slice())); - - assert_eq!(expected, actual); - } - - #[test] - fn test_mask_f32() { - let mask = 0b10101010_10101010; - let actual = Float32Type::mask_from_u64(mask); - let expected = expected_mask!(i32, mask); - let expected = m32x16::from_cast(i32x16::from_slice_unaligned(expected.as_slice())); - - assert_eq!(expected, actual); - } - - #[test] - fn test_mask_i32() { - let mask = 0b01010101_01010101; - let actual = Int32Type::mask_from_u64(mask); - let expected = expected_mask!(i32, mask); - let expected = m32x16::from_cast(i32x16::from_slice_unaligned(expected.as_slice())); - - assert_eq!(expected, actual); - } - - #[test] - fn test_mask_u16() { - let mask = 0b01010101_01010101_10101010_10101010; - let actual = UInt16Type::mask_from_u64(mask); - let expected = expected_mask!(i16, mask); - let expected = m16x32::from_cast(i16x32::from_slice_unaligned(expected.as_slice())); - - assert_eq!(expected, actual); - } - - #[test] - fn test_mask_i8() { - let mask = 0b01010101_01010101_10101010_10101010_01010101_01010101_10101010_10101010; - let actual = Int8Type::mask_from_u64(mask); - let expected = expected_mask!(i8, mask); - let expected = m8x64::from_cast(i8x64::from_slice_unaligned(expected.as_slice())); - - assert_eq!(expected, actual); - } -} +impl ArrowNumericType for T {} diff --git a/arrow-buffer/src/util/bit_util.rs b/arrow-buffer/src/util/bit_util.rs index b27931f4cc85..d2dbf3c84882 100644 --- a/arrow-buffer/src/util/bit_util.rs +++ b/arrow-buffer/src/util/bit_util.rs @@ -17,9 +17,6 @@ //! Utils for working with bits -#[cfg(feature = "simd")] -use packed_simd::u8x64; - const BIT_MASK: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128]; const UNSET_BIT_MASK: [u8; 8] = [ 255 - 1, @@ -104,31 +101,13 @@ pub fn ceil(value: usize, divisor: usize) -> usize { value / divisor + (0 != value % divisor) as usize } -/// Performs SIMD bitwise binary operations. -/// -/// # Safety -/// -/// Note that each slice should be 64 bytes and it is the callers responsibility to ensure -/// that this is the case. If passed slices larger than 64 bytes the operation will only -/// be performed on the first 64 bytes. Slices less than 64 bytes will panic. 
-#[cfg(feature = "simd")] -pub unsafe fn bitwise_bin_op_simd(left: &[u8], right: &[u8], result: &mut [u8], op: F) -where - F: Fn(u8x64, u8x64) -> u8x64, -{ - let left_simd = u8x64::from_slice_unaligned_unchecked(left); - let right_simd = u8x64::from_slice_unaligned_unchecked(right); - let simd_result = op(left_simd, right_simd); - simd_result.write_to_slice_unaligned_unchecked(result); -} - -#[cfg(all(test, feature = "test_utils"))] +#[cfg(test)] mod tests { use std::collections::HashSet; use super::*; - use crate::util::test_util::seedable_rng; - use rand::Rng; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; #[test] fn test_round_upto_multiple_of_64() { @@ -167,10 +146,14 @@ mod tests { assert!(!get_bit(&[0b01001001, 0b01010010], 15)); } + pub fn seedable_rng() -> StdRng { + StdRng::seed_from_u64(42) + } + #[test] fn test_get_bit_raw() { const NUM_BYTE: usize = 10; - let mut buf = vec![0; NUM_BYTE]; + let mut buf = [0; NUM_BYTE]; let mut expected = vec![]; let mut rng = seedable_rng(); for i in 0..8 * NUM_BYTE { @@ -278,7 +261,6 @@ mod tests { } #[test] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn test_ceil() { assert_eq!(ceil(0, 1), 0); assert_eq!(ceil(1, 1), 1); @@ -292,28 +274,4 @@ mod tests { assert_eq!(ceil(10, 10000000000), 1); assert_eq!(ceil(10000000000, 1000000000), 10); } - - #[test] - #[cfg(feature = "simd")] - fn test_bitwise_and_simd() { - let buf1 = [0b00110011u8; 64]; - let buf2 = [0b11110000u8; 64]; - let mut buf3 = [0b00000000; 64]; - unsafe { bitwise_bin_op_simd(&buf1, &buf2, &mut buf3, |a, b| a & b) }; - for i in buf3.iter() { - assert_eq!(&0b00110000u8, i); - } - } - - #[test] - #[cfg(feature = "simd")] - fn test_bitwise_or_simd() { - let buf1 = [0b00110011u8; 64]; - let buf2 = [0b11110000u8; 64]; - let mut buf3 = [0b00000000; 64]; - unsafe { bitwise_bin_op_simd(&buf1, &buf2, &mut buf3, |a, b| a | b) }; - for i in buf3.iter() { - assert_eq!(&0b11110011u8, i); - } - } } diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 021ecdf0e658..4dbb395192e1 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -243,64 +243,6 @@ fn make_utf8_scalar(d: &DataType, scalar: &str) -> Result } } -/// Helper function to perform boolean lambda function on values from two array accessors, this -/// version does not attempt to use SIMD. -fn compare_op( - left: T, - right: S, - op: F, -) -> Result -where - F: Fn(T::Item, S::Item) -> bool, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length".to_string(), - )); - } - - Ok(BooleanArray::from_binary(left, right, op)) -} - -/// Helper function to perform boolean lambda function on values from array accessor, this -/// version does not attempt to use SIMD. -fn compare_op_scalar(left: T, op: F) -> Result -where - F: Fn(T::Item) -> bool, -{ - Ok(BooleanArray::from_unary(left, op)) -} - -/// Evaluate `op(left, right)` for [`PrimitiveArray`]s using a specified -/// comparison function. -#[deprecated(note = "Use BooleanArray::from_binary")] -pub fn no_simd_compare_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - op: F, -) -> Result -where - T: ArrowPrimitiveType, - F: Fn(T::Native, T::Native) -> bool, -{ - compare_op(left, right, op) -} - -/// Evaluate `op(left, right)` for [`PrimitiveArray`] and scalar using -/// a specified comparison function. 
-#[deprecated(note = "Use BooleanArray::from_unary")] -pub fn no_simd_compare_op_scalar( - left: &PrimitiveArray, - right: T::Native, - op: F, -) -> Result -where - T: ArrowPrimitiveType, - F: Fn(T::Native, T::Native) -> bool, -{ - compare_op_scalar(left, |l| op(l, right)) -} - /// Perform `left == right` operation on [`StringArray`] / [`LargeStringArray`]. #[deprecated(note = "Use arrow_ord::cmp::eq")] pub fn eq_utf8( @@ -610,7 +552,6 @@ pub fn gt_eq_utf8_scalar( /// Perform `left == right` operation on an array and a numeric scalar /// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary @@ -628,7 +569,6 @@ where /// Perform `left < right` operation on an array and a numeric scalar /// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary @@ -646,7 +586,6 @@ where /// Perform `left <= right` operation on an array and a numeric scalar /// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary @@ -664,7 +603,6 @@ where /// Perform `left > right` operation on an array and a numeric scalar /// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary @@ -682,7 +620,6 @@ where /// Perform `left >= right` operation on an array and a numeric scalar /// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary @@ -700,7 +637,6 @@ where /// Perform `left != right` operation on an array and a numeric scalar /// value. Supports PrimitiveArrays, and DictionaryArrays that have primitive values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. 
If it is necessary @@ -1015,7 +951,6 @@ pub fn gt_eq_dyn(left: &dyn Array, right: &dyn Array) -> Result(left: &PrimitiveArray, op: F) -> Result where T: ArrowNumericType, F: Fn(T::Native) -> bool, { - compare_op_scalar(left, op) + Ok(BooleanArray::from_unary(left, op)) } /// Perform `left != right` operation on two [`PrimitiveArray`]s. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary @@ -1081,7 +1016,6 @@ where /// Perform `left != right` operation on a [`PrimitiveArray`] and a scalar value. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary @@ -1100,7 +1034,6 @@ where /// Perform `left < right` operation on two [`PrimitiveArray`]s. Null values are less than non-null /// values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary @@ -1121,7 +1054,6 @@ where /// Perform `left < right` operation on a [`PrimitiveArray`] and a scalar value. /// Null values are less than non-null values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary @@ -1140,7 +1072,6 @@ where /// Perform `left <= right` operation on two [`PrimitiveArray`]s. Null values are less than non-null /// values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary @@ -1161,7 +1092,6 @@ where /// Perform `left <= right` operation on a [`PrimitiveArray`] and a scalar value. /// Null values are less than non-null values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary @@ -1183,7 +1113,6 @@ where /// Perform `left > right` operation on two [`PrimitiveArray`]s. Non-null values are greater than null /// values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. 
If it is necessary @@ -1204,7 +1133,6 @@ where /// Perform `left > right` operation on a [`PrimitiveArray`] and a scalar value. /// Non-null values are greater than null values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary @@ -1223,7 +1151,6 @@ where /// Perform `left >= right` operation on two [`PrimitiveArray`]s. Non-null values are greater than null /// values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary @@ -1244,7 +1171,6 @@ where /// Perform `left >= right` operation on a [`PrimitiveArray`] and a scalar value. /// Non-null values are greater than null values. /// -/// If `simd` feature flag is not enabled: /// For floating values like f32 and f64, this comparison produces an ordering in accordance to /// the totalOrder predicate as defined in the IEEE 754 (2008 revision) floating point standard. /// Note that totalOrder treats positive and negative zeros are different. If it is necessary diff --git a/arrow/CONTRIBUTING.md b/arrow/CONTRIBUTING.md index 5b84bc2d3bdb..0c795d6b9cbd 100644 --- a/arrow/CONTRIBUTING.md +++ b/arrow/CONTRIBUTING.md @@ -67,18 +67,6 @@ the impossibility of the compiler to derive the invariants (such as lifetime, nu The arrow format declares a IPC protocol, which this crate supports. IPC is equivalent to a FFI in that the rust compiler can't reason about the contract's invariants. -#### SIMD - -The API provided by the [packed_simd_2](https://docs.rs/packed_simd_2/latest/packed_simd_2/) crate is currently `unsafe`. However, -SIMD offers a significant performance improvement over non-SIMD operations. A related crate in development is -[portable-simd](https://rust-lang.github.io/portable-simd/core_simd/) which has a nice -[beginners guide](https://github.com/rust-lang/portable-simd/blob/master/beginners-guide.md). These crates provide the ability -for code on x86 and ARM architectures to use some of the available parallel register operations. As an example if two arrays -of numbers are added, [1,2,3,4] + [5,6,7,8], rather than using four instructions to add each of the elements of the arrays, -one instruction can be used to all all four elements at the same time, which leads to improved time to solution. SIMD instructions -are typically most effective when data is aligned to allow a single load instruction to bring multiple consecutive data elements -to the registers, before use of a SIMD instruction. - #### Performance Some operations are significantly faster when `unsafe` is used. diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 6ca218f5f658..a6b4ddf51dfb 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -65,7 +65,6 @@ ipc_compression = ["ipc", "arrow-ipc/lz4", "arrow-ipc/zstd"] csv = ["arrow-csv"] ipc = ["arrow-ipc"] json = ["arrow-json"] -simd = ["arrow-array/simd", "arrow-arith/simd"] prettyprint = ["arrow-cast/prettyprint"] # The test utils feature enables code used in benchmarks and tests but # not the core arrow code itself. 
Be aware that `rand` must be kept as diff --git a/arrow/README.md b/arrow/README.md index 6a91bc951cc1..bc95b91a9a4a 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -48,9 +48,7 @@ The `arrow` crate provides the following features which may be enabled in your ` - `ipc` (default) - support for reading [Arrow IPC Format](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc), also used as the wire protocol in [arrow-flight](https://crates.io/crates/arrow-flight) - `ipc_compression` - Enables reading and writing compressed IPC streams (also enables `ipc`) - `prettyprint` - support for formatting record batches as textual columns -- `simd` - (_Requires Nightly Rust_) Use alternate hand optimized implementations of some [compute](https://github.com/apache/arrow-rs/tree/master/arrow/src/compute/kernels) - kernels using explicit SIMD instructions via [packed_simd_2](https://docs.rs/packed_simd_2/latest/packed_simd_2/). - `chrono-tz` - support of parsing timezone using [chrono-tz](https://docs.rs/chrono-tz/0.6.0/chrono_tz/) - `ffi` - bindings for the Arrow C [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) - `pyarrow` - bindings for pyo3 to call arrow-rs from python @@ -75,7 +73,6 @@ In particular there are a number of scenarios where `unsafe` is largely unavoida - Invariants that cannot be statically verified by the compiler and unlock non-trivial performance wins, e.g. values in a StringArray are UTF-8, [TrustedLen](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html) iterators, etc... - FFI -- SIMD Additionally, this crate exposes a number of `unsafe` APIs, allowing downstream crates to explicitly opt-out of potentially expensive invariant checking where appropriate. @@ -95,7 +92,7 @@ In order to compile Arrow for `wasm32-unknown-unknown` you will need to disable ```toml [dependencies] -arrow = { version = "5.0", default-features = false, features = ["csv", "ipc", "simd"] } +arrow = { version = "5.0", default-features = false, features = ["csv", "ipc"] } ``` ## Examples diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index b49f56c91574..d867f7c30d1f 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -468,13 +468,13 @@ mod tests { use arrow_array::builder::UnionBuilder; use arrow_array::cast::AsArray; use arrow_array::types::{Float64Type, Int32Type}; - use arrow_array::{StructArray, UnionArray}; + use arrow_array::*; use crate::array::{ - make_array, Array, ArrayData, BooleanArray, Decimal128Array, DictionaryArray, - DurationSecondArray, FixedSizeBinaryArray, FixedSizeListArray, GenericBinaryArray, - GenericListArray, GenericStringArray, Int32Array, MapArray, OffsetSizeTrait, - Time32MillisecondArray, TimestampMillisecondArray, UInt32Array, + make_array, Array, ArrayData, BooleanArray, DictionaryArray, DurationSecondArray, + FixedSizeBinaryArray, FixedSizeListArray, GenericBinaryArray, GenericListArray, + GenericStringArray, Int32Array, MapArray, OffsetSizeTrait, Time32MillisecondArray, + TimestampMillisecondArray, UInt32Array, }; use crate::compute::kernels; use crate::datatypes::{Field, Int8Type}; diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 8302f8741b60..9a13cfa493e9 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -71,7 +71,7 @@ use crate::datatypes::{DataType, Field, Schema}; use crate::error::ArrowError; use crate::ffi; use crate::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; -use crate::ffi_stream::{export_reader_into_raw, ArrowArrayStreamReader, FFI_ArrowArrayStream}; +use 
crate::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream}; use crate::record_batch::RecordBatch; import_exception!(pyarrow, ArrowException); @@ -377,7 +377,7 @@ impl FromPyArrow for RecordBatch { impl ToPyArrow for RecordBatch { fn to_pyarrow(&self, py: Python) -> PyResult { // Workaround apache/arrow#37669 by returning RecordBatchIterator - let reader = RecordBatchIterator::new(vec![Ok(self.clone())], self.schema().clone()); + let reader = RecordBatchIterator::new(vec![Ok(self.clone())], self.schema()); let reader: Box = Box::new(reader); let py_reader = reader.into_pyarrow(py)?; py_reader.call_method0(py, "read_next_batch") diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 74e2a212736a..6f5b245b8e3b 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -28,6 +28,7 @@ use arrow_data::ArrayData; use arrow_schema::{DataType, Field, Fields}; use std::sync::Arc; +#[allow(unused)] fn create_decimal_array(array: Vec>, precision: u8, scale: i8) -> Decimal128Array { array .into_iter() From 93a28a54573480ff30861ac40031b089b808806f Mon Sep 17 00:00:00 2001 From: Mohammad Razeghi Date: Fri, 8 Dec 2023 10:35:44 +0100 Subject: [PATCH 1399/1411] Fixed issue where timestamp parser incorrectly accepted characters after 'Z (#5189) --- arrow-cast/src/cast.rs | 25 +++++++++++++++++++++++++ arrow-cast/src/parse.rs | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 51acd36c3fe4..7f8bd19e9291 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -9449,4 +9449,29 @@ mod tests { let r: Vec<_> = a.as_string::().iter().map(|x| x.unwrap()).collect(); assert_eq!(r, &["[0, 1, 2]", "[0, null, 2]"]); } + #[test] + fn test_cast_string_to_timestamp_invalid_tz() { + // content after Z should be ignored + let bad_timestamp = "2023-12-05T21:58:10.45ZZTOP"; + let array = StringArray::from(vec![Some(bad_timestamp)]); + + let data_types = [ + DataType::Timestamp(TimeUnit::Second, None), + DataType::Timestamp(TimeUnit::Millisecond, None), + DataType::Timestamp(TimeUnit::Microsecond, None), + DataType::Timestamp(TimeUnit::Nanosecond, None), + ]; + + let cast_options = CastOptions { + safe: false, + ..Default::default() + }; + + for dt in data_types { + assert_eq!( + cast_with_options(&array, &dt, &cast_options).unwrap_err().to_string(), + "Parser error: Invalid timezone \"ZZTOP\": only offset based timezones supported without chrono-tz feature" + ); + } + } } diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 750f38006d33..3d2e47ed95a4 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -210,7 +210,7 @@ pub fn string_to_datetime(timezone: &T, s: &str) -> Result Date: Fri, 8 Dec 2023 11:36:58 +0100 Subject: [PATCH 1400/1411] Fix deprecated note for `Buffer::from_raw_parts` (#5190) --- arrow-buffer/src/buffer/immutable.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 05530eed9b08..8869ab3a2225 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -99,7 +99,7 @@ impl Buffer { /// /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` /// bytes. If the `ptr` and `capacity` come from a `Buffer`, then this is guaranteed. 
- #[deprecated(note = "Use From>")] + #[deprecated(note = "Use Buffer::from_vec")] pub unsafe fn from_raw_parts(ptr: NonNull, len: usize, capacity: usize) -> Self { assert!(len <= capacity); let layout = Layout::from_size_align(capacity, ALIGNMENT).unwrap(); From a43e82c630f507d6afc4fc62031bb2336d29f37d Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Fri, 8 Dec 2023 15:53:29 +0100 Subject: [PATCH 1401/1411] Add `BooleanArray::into_parts` method (#5191) * Add `BooleanArray::into_parts` method * Add a test * Remove `DataType` from returned tuple --- arrow-array/src/array/boolean_array.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index a778dc92ea35..fe374d965714 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -254,6 +254,11 @@ impl BooleanArray { }); Self::new(values, nulls) } + + /// Deconstruct this array into its constituent parts + pub fn into_parts(self) -> (BooleanBuffer, Option) { + (self.values, self.nulls) + } } impl Array for BooleanArray { @@ -618,4 +623,21 @@ mod tests { assert_eq!(b.false_count(), expected_false); } } + + #[test] + fn test_into_parts() { + let boolean_array = [Some(true), None, Some(false)] + .into_iter() + .collect::(); + let (values, nulls) = boolean_array.into_parts(); + assert_eq!(values.values(), &[0b0000_0001]); + assert!(nulls.is_some()); + assert_eq!(nulls.unwrap().buffer().as_slice(), &[0b0000_0101]); + + let boolean_array = + BooleanArray::from(vec![false, false, false, false, false, false, false, true]); + let (values, nulls) = boolean_array.into_parts(); + assert_eq!(values.values(), &[0b1000_0000]); + assert!(nulls.is_none()); + } } From 2a213bc36fdbbe8a51d4307b3c55be856e810af4 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 8 Dec 2023 14:53:44 +0000 Subject: [PATCH 1402/1411] Remove ScalarBuffer from parquet (#1849) (#5177) (#5178) --- parquet/src/arrow/array_reader/byte_array.rs | 66 ++++---- .../array_reader/byte_array_dictionary.rs | 45 ++---- .../array_reader/fixed_len_byte_array.rs | 53 +++---- parquet/src/arrow/array_reader/null_array.rs | 9 +- .../src/arrow/array_reader/primitive_array.rs | 120 +++++++-------- parquet/src/arrow/buffer/dictionary_buffer.rs | 41 +++-- parquet/src/arrow/buffer/offset_buffer.rs | 37 ++--- parquet/src/arrow/record_reader/buffer.rs | 144 +++--------------- .../arrow/record_reader/definition_levels.rs | 12 +- parquet/src/arrow/record_reader/mod.rs | 48 +++--- 10 files changed, 200 insertions(+), 375 deletions(-) diff --git a/parquet/src/arrow/array_reader/byte_array.rs b/parquet/src/arrow/array_reader/byte_array.rs index 01666c0af4e6..debe0d6109eb 100644 --- a/parquet/src/arrow/array_reader/byte_array.rs +++ b/parquet/src/arrow/array_reader/byte_array.rs @@ -19,7 +19,6 @@ use crate::arrow::array_reader::{read_records, skip_records, ArrayReader}; use crate::arrow::buffer::bit_util::sign_extend_be; use crate::arrow::buffer::offset_buffer::OffsetBuffer; use crate::arrow::decoder::{DeltaByteArrayDecoder, DictIndexDecoder}; -use crate::arrow::record_reader::buffer::ScalarValue; use crate::arrow::record_reader::GenericRecordReader; use crate::arrow::schema::parquet_to_arrow_field; use crate::basic::{ConvertedType, Encoding}; @@ -77,7 +76,7 @@ pub fn make_byte_array_reader( } /// An [`ArrayReader`] for variable length byte arrays -struct ByteArrayReader { +struct ByteArrayReader { data_type: 
ArrowType, pages: Box, def_levels_buffer: Option, @@ -85,14 +84,11 @@ struct ByteArrayReader { record_reader: GenericRecordReader, ByteArrayColumnValueDecoder>, } -impl ByteArrayReader { +impl ByteArrayReader { fn new( pages: Box, data_type: ArrowType, - record_reader: GenericRecordReader< - OffsetBuffer, - ByteArrayColumnValueDecoder, - >, + record_reader: GenericRecordReader, ByteArrayColumnValueDecoder>, ) -> Self { Self { data_type, @@ -104,7 +100,7 @@ impl ByteArrayReader { } } -impl ArrayReader for ByteArrayReader { +impl ArrayReader for ByteArrayReader { fn as_any(&self) -> &dyn Any { self } @@ -167,15 +163,13 @@ impl ArrayReader for ByteArrayReader { } /// A [`ColumnValueDecoder`] for variable length byte arrays -struct ByteArrayColumnValueDecoder { +struct ByteArrayColumnValueDecoder { dict: Option>, decoder: Option, validate_utf8: bool, } -impl ColumnValueDecoder - for ByteArrayColumnValueDecoder -{ +impl ColumnValueDecoder for ByteArrayColumnValueDecoder { type Slice = OffsetBuffer; fn new(desc: &ColumnDescPtr) -> Self { @@ -275,17 +269,15 @@ impl ByteArrayDecoder { num_values, validate_utf8, )), - Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => { - ByteArrayDecoder::Dictionary(ByteArrayDecoderDictionary::new( - data, num_levels, num_values, - )) - } + Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => ByteArrayDecoder::Dictionary( + ByteArrayDecoderDictionary::new(data, num_levels, num_values), + ), Encoding::DELTA_LENGTH_BYTE_ARRAY => ByteArrayDecoder::DeltaLength( ByteArrayDecoderDeltaLength::new(data, validate_utf8)?, ), - Encoding::DELTA_BYTE_ARRAY => ByteArrayDecoder::DeltaByteArray( - ByteArrayDecoderDelta::new(data, validate_utf8)?, - ), + Encoding::DELTA_BYTE_ARRAY => { + ByteArrayDecoder::DeltaByteArray(ByteArrayDecoderDelta::new(data, validate_utf8)?) 
+ } _ => { return Err(general_err!( "unsupported encoding for byte array: {}", @@ -298,7 +290,7 @@ impl ByteArrayDecoder { } /// Read up to `len` values to `out` with the optional dictionary - pub fn read( + pub fn read( &mut self, out: &mut OffsetBuffer, len: usize, @@ -307,8 +299,8 @@ impl ByteArrayDecoder { match self { ByteArrayDecoder::Plain(d) => d.read(out, len), ByteArrayDecoder::Dictionary(d) => { - let dict = dict - .ok_or_else(|| general_err!("missing dictionary page for column"))?; + let dict = + dict.ok_or_else(|| general_err!("missing dictionary page for column"))?; d.read(out, dict, len) } @@ -318,7 +310,7 @@ impl ByteArrayDecoder { } /// Skip `len` values - pub fn skip( + pub fn skip( &mut self, len: usize, dict: Option<&OffsetBuffer>, @@ -326,8 +318,8 @@ impl ByteArrayDecoder { match self { ByteArrayDecoder::Plain(d) => d.skip(len), ByteArrayDecoder::Dictionary(d) => { - let dict = dict - .ok_or_else(|| general_err!("missing dictionary page for column"))?; + let dict = + dict.ok_or_else(|| general_err!("missing dictionary page for column"))?; d.skip(dict, len) } @@ -363,7 +355,7 @@ impl ByteArrayDecoderPlain { } } - pub fn read( + pub fn read( &mut self, output: &mut OffsetBuffer, len: usize, @@ -392,8 +384,7 @@ impl ByteArrayDecoderPlain { if self.offset + 4 > buf.len() { return Err(ParquetError::EOF("eof decoding byte array".into())); } - let len_bytes: [u8; 4] = - buf[self.offset..self.offset + 4].try_into().unwrap(); + let len_bytes: [u8; 4] = buf[self.offset..self.offset + 4].try_into().unwrap(); let len = u32::from_le_bytes(len_bytes); let start_offset = self.offset + 4; @@ -424,8 +415,7 @@ impl ByteArrayDecoderPlain { if self.offset + 4 > buf.len() { return Err(ParquetError::EOF("eof decoding byte array".into())); } - let len_bytes: [u8; 4] = - buf[self.offset..self.offset + 4].try_into().unwrap(); + let len_bytes: [u8; 4] = buf[self.offset..self.offset + 4].try_into().unwrap(); let len = u32::from_le_bytes(len_bytes) as usize; skip += 1; self.offset = self.offset + 4 + len; @@ -462,7 +452,7 @@ impl ByteArrayDecoderDeltaLength { }) } - fn read( + fn read( &mut self, output: &mut OffsetBuffer, len: usize, @@ -529,7 +519,7 @@ impl ByteArrayDecoderDelta { }) } - fn read( + fn read( &mut self, output: &mut OffsetBuffer, len: usize, @@ -564,7 +554,7 @@ impl ByteArrayDecoderDictionary { } } - fn read( + fn read( &mut self, output: &mut OffsetBuffer, dict: &OffsetBuffer, @@ -576,15 +566,11 @@ impl ByteArrayDecoderDictionary { } self.decoder.read(len, |keys| { - output.extend_from_dictionary( - keys, - dict.offsets.as_slice(), - dict.values.as_slice(), - ) + output.extend_from_dictionary(keys, dict.offsets.as_slice(), dict.values.as_slice()) }) } - fn skip( + fn skip( &mut self, dict: &OffsetBuffer, to_skip: usize, diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs index 0d216fa08327..a38122354145 100644 --- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs +++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs @@ -27,10 +27,8 @@ use bytes::Bytes; use crate::arrow::array_reader::byte_array::{ByteArrayDecoder, ByteArrayDecoderPlain}; use crate::arrow::array_reader::{read_records, skip_records, ArrayReader}; -use crate::arrow::buffer::{ - dictionary_buffer::DictionaryBuffer, offset_buffer::OffsetBuffer, -}; -use crate::arrow::record_reader::buffer::{BufferQueue, ScalarValue}; +use crate::arrow::buffer::{dictionary_buffer::DictionaryBuffer, offset_buffer::OffsetBuffer}; +use 
crate::arrow::record_reader::buffer::BufferQueue; use crate::arrow::record_reader::GenericRecordReader; use crate::arrow::schema::parquet_to_arrow_field; use crate::basic::{ConvertedType, Encoding}; @@ -123,7 +121,7 @@ pub fn make_byte_array_dictionary_reader( /// An [`ArrayReader`] for dictionary encoded variable length byte arrays /// /// Will attempt to preserve any dictionary encoding present in the parquet data -struct ByteArrayDictionaryReader { +struct ByteArrayDictionaryReader { data_type: ArrowType, pages: Box, def_levels_buffer: Option, @@ -133,16 +131,13 @@ struct ByteArrayDictionaryReader { impl ByteArrayDictionaryReader where - K: FromBytes + ScalarValue + Ord + ArrowNativeType, - V: ScalarValue + OffsetSizeTrait, + K: FromBytes + Ord + ArrowNativeType, + V: OffsetSizeTrait, { fn new( pages: Box, data_type: ArrowType, - record_reader: GenericRecordReader< - DictionaryBuffer, - DictionaryDecoder, - >, + record_reader: GenericRecordReader, DictionaryDecoder>, ) -> Self { Self { data_type, @@ -156,8 +151,8 @@ where impl ArrayReader for ByteArrayDictionaryReader where - K: FromBytes + ScalarValue + Ord + ArrowNativeType, - V: ScalarValue + OffsetSizeTrait, + K: FromBytes + Ord + ArrowNativeType, + V: OffsetSizeTrait, { fn as_any(&self) -> &dyn Any { self @@ -226,16 +221,15 @@ struct DictionaryDecoder { impl ColumnValueDecoder for DictionaryDecoder where - K: FromBytes + ScalarValue + Ord + ArrowNativeType, - V: ScalarValue + OffsetSizeTrait, + K: FromBytes + Ord + ArrowNativeType, + V: OffsetSizeTrait, { type Slice = DictionaryBuffer; fn new(col: &ColumnDescPtr) -> Self { let validate_utf8 = col.converted_type() == ConvertedType::UTF8; - let value_type = match (V::IS_LARGE, col.converted_type() == ConvertedType::UTF8) - { + let value_type = match (V::IS_LARGE, col.converted_type() == ConvertedType::UTF8) { (true, true) => ArrowType::LargeUtf8, (true, false) => ArrowType::LargeBinary, (false, true) => ArrowType::Utf8, @@ -274,8 +268,7 @@ where let len = num_values as usize; let mut buffer = OffsetBuffer::::default(); - let mut decoder = - ByteArrayDecoderPlain::new(buf, len, Some(len), self.validate_utf8); + let mut decoder = ByteArrayDecoderPlain::new(buf, len, Some(len), self.validate_utf8); decoder.read(&mut buffer, usize::MAX)?; let array = buffer.into_array(None, self.value_type.clone()); @@ -339,8 +332,8 @@ where Some(keys) => { // Happy path - can just copy keys // Keys will be validated on conversion to arrow - let keys_slice = keys.spare_capacity_mut(range.start + len); - let len = decoder.get_batch(&mut keys_slice[range.start..])?; + let keys_slice = keys.get_output_slice(len); + let len = decoder.get_batch(keys_slice)?; *max_remaining_values -= len; Ok(len) } @@ -360,11 +353,7 @@ where let dict_offsets = dict_buffers[0].typed_data::(); let dict_values = dict_buffers[1].as_slice(); - values.extend_from_dictionary( - &keys[..len], - dict_offsets, - dict_values, - )?; + values.extend_from_dictionary(&keys[..len], dict_offsets, dict_values)?; *max_remaining_values -= len; Ok(len) } @@ -375,9 +364,7 @@ where fn skip_values(&mut self, num_values: usize) -> Result { match self.decoder.as_mut().expect("decoder set") { - MaybeDictionaryDecoder::Fallback(decoder) => { - decoder.skip::(num_values, None) - } + MaybeDictionaryDecoder::Fallback(decoder) => decoder.skip::(num_values, None), MaybeDictionaryDecoder::Dict { decoder, max_remaining_values, diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index 
b846997d36b8..849aa37c561f 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -18,7 +18,7 @@ use crate::arrow::array_reader::{read_records, skip_records, ArrayReader}; use crate::arrow::buffer::bit_util::{iter_set_bits_rev, sign_extend_be}; use crate::arrow::decoder::{DeltaByteArrayDecoder, DictIndexDecoder}; -use crate::arrow::record_reader::buffer::{BufferQueue, ScalarBuffer, ValuesBuffer}; +use crate::arrow::record_reader::buffer::{BufferQueue, ValuesBuffer}; use crate::arrow::record_reader::GenericRecordReader; use crate::arrow::schema::parquet_to_arrow_field; use crate::basic::{Encoding, Type}; @@ -162,11 +162,10 @@ impl ArrayReader for FixedLenByteArrayReader { fn consume_batch(&mut self) -> Result { let record_data = self.record_reader.consume_record_data(); - let array_data = - ArrayDataBuilder::new(ArrowType::FixedSizeBinary(self.byte_length as i32)) - .len(self.record_reader.num_values()) - .add_buffer(record_data) - .null_bit_buffer(self.record_reader.consume_bitmap_buffer()); + let array_data = ArrayDataBuilder::new(ArrowType::FixedSizeBinary(self.byte_length as i32)) + .len(self.record_reader.num_values()) + .add_buffer(record_data) + .null_bit_buffer(self.record_reader.consume_bitmap_buffer()); let binary = FixedSizeBinaryArray::from(unsafe { array_data.build_unchecked() }); @@ -197,19 +196,13 @@ impl ArrayReader for FixedLenByteArrayReader { IntervalUnit::YearMonth => Arc::new( binary .iter() - .map(|o| { - o.map(|b| i32::from_le_bytes(b[0..4].try_into().unwrap())) - }) + .map(|o| o.map(|b| i32::from_le_bytes(b[0..4].try_into().unwrap()))) .collect::(), ) as ArrayRef, IntervalUnit::DayTime => Arc::new( binary .iter() - .map(|o| { - o.map(|b| { - i64::from_le_bytes(b[4..12].try_into().unwrap()) - }) - }) + .map(|o| o.map(|b| i64::from_le_bytes(b[4..12].try_into().unwrap()))) .collect::(), ) as ArrayRef, IntervalUnit::MonthDayNano => { @@ -247,7 +240,7 @@ impl ArrayReader for FixedLenByteArrayReader { } struct FixedLenByteArrayBuffer { - buffer: ScalarBuffer, + buffer: Vec, /// The length of each element in bytes byte_length: usize, } @@ -263,14 +256,14 @@ impl BufferQueue for FixedLenByteArrayBuffer { type Slice = Self; fn consume(&mut self) -> Self::Output { - self.buffer.consume() + Buffer::from_vec(self.buffer.consume()) } - fn spare_capacity_mut(&mut self, _batch_size: usize) -> &mut Self::Slice { + fn get_output_slice(&mut self, _batch_size: usize) -> &mut Self::Slice { self } - fn set_len(&mut self, len: usize) { + fn truncate_buffer(&mut self, len: usize) { assert_eq!(self.buffer.len(), len * self.byte_length); } } @@ -288,14 +281,10 @@ impl ValuesBuffer for FixedLenByteArrayBuffer { (read_offset + values_read) * self.byte_length ); self.buffer - .resize((read_offset + levels_read) * self.byte_length); - - let slice = self.buffer.as_slice_mut(); + .resize((read_offset + levels_read) * self.byte_length, 0); let values_range = read_offset..read_offset + values_read; - for (value_pos, level_pos) in - values_range.rev().zip(iter_set_bits_rev(valid_mask)) - { + for (value_pos, level_pos) in values_range.rev().zip(iter_set_bits_rev(valid_mask)) { debug_assert!(level_pos >= value_pos); if level_pos <= value_pos { break; @@ -305,7 +294,7 @@ impl ValuesBuffer for FixedLenByteArrayBuffer { let value_pos_bytes = value_pos * self.byte_length; for i in 0..self.byte_length { - slice[level_pos_bytes + i] = slice[value_pos_bytes + i] + self.buffer[level_pos_bytes + i] = self.buffer[value_pos_bytes + i] } 
} } @@ -391,8 +380,7 @@ impl ColumnValueDecoder for ValueDecoder { let len = range.end - range.start; match self.decoder.as_mut().unwrap() { Decoder::Plain { offset, buf } => { - let to_read = - (len * self.byte_length).min(buf.len() - *offset) / self.byte_length; + let to_read = (len * self.byte_length).min(buf.len() - *offset) / self.byte_length; let end_offset = *offset + to_read * self.byte_length; out.buffer .extend_from_slice(&buf.as_ref()[*offset..end_offset]); @@ -485,15 +473,12 @@ mod tests { .build() .unwrap(); - let written = RecordBatch::try_from_iter([( - "list", - Arc::new(ListArray::from(data)) as ArrayRef, - )]) - .unwrap(); + let written = + RecordBatch::try_from_iter([("list", Arc::new(ListArray::from(data)) as ArrayRef)]) + .unwrap(); let mut buffer = Vec::with_capacity(1024); - let mut writer = - ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); + let mut writer = ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); writer.write(&written).unwrap(); writer.close().unwrap(); diff --git a/parquet/src/arrow/array_reader/null_array.rs b/parquet/src/arrow/array_reader/null_array.rs index 4ad6c97e2f66..bb32fb307fda 100644 --- a/parquet/src/arrow/array_reader/null_array.rs +++ b/parquet/src/arrow/array_reader/null_array.rs @@ -16,14 +16,13 @@ // under the License. use crate::arrow::array_reader::{read_records, skip_records, ArrayReader}; -use crate::arrow::record_reader::buffer::ScalarValue; use crate::arrow::record_reader::RecordReader; use crate::column::page::PageIterator; use crate::data_type::DataType; use crate::errors::Result; use crate::schema::types::ColumnDescPtr; use arrow_array::ArrayRef; -use arrow_buffer::Buffer; +use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_schema::DataType as ArrowType; use std::any::Any; use std::sync::Arc; @@ -33,7 +32,7 @@ use std::sync::Arc; pub struct NullArrayReader where T: DataType, - T::T: ScalarValue, + T::T: ArrowNativeType, { data_type: ArrowType, pages: Box, @@ -45,7 +44,7 @@ where impl NullArrayReader where T: DataType, - T::T: ScalarValue, + T::T: ArrowNativeType, { /// Construct null array reader. pub fn new(pages: Box, column_desc: ColumnDescPtr) -> Result { @@ -65,7 +64,7 @@ where impl ArrayReader for NullArrayReader where T: DataType, - T::T: ScalarValue, + T::T: ArrowNativeType, { fn as_any(&self) -> &dyn Any { self diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index f833eccecb4c..507b6215cacb 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -16,7 +16,6 @@ // under the License. 
use crate::arrow::array_reader::{read_records, skip_records, ArrayReader}; -use crate::arrow::record_reader::buffer::ScalarValue; use crate::arrow::record_reader::RecordReader; use crate::arrow::schema::parquet_to_arrow_field; use crate::basic::Type as PhysicalType; @@ -26,22 +25,55 @@ use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use arrow_array::Decimal256Array; use arrow_array::{ - builder::{BooleanBufferBuilder, TimestampNanosecondBufferBuilder}, - ArrayRef, BooleanArray, Decimal128Array, Float32Array, Float64Array, Int32Array, - Int64Array, TimestampNanosecondArray, UInt32Array, UInt64Array, + builder::TimestampNanosecondBufferBuilder, ArrayRef, BooleanArray, Decimal128Array, + Float32Array, Float64Array, Int32Array, Int64Array, TimestampNanosecondArray, UInt32Array, + UInt64Array, }; -use arrow_buffer::{i256, Buffer}; +use arrow_buffer::{i256, BooleanBuffer, Buffer}; use arrow_data::ArrayDataBuilder; use arrow_schema::{DataType as ArrowType, TimeUnit}; use std::any::Any; use std::sync::Arc; +/// Provides conversion from `Vec` to `Buffer` +pub trait IntoBuffer { + fn into_buffer(self) -> Buffer; +} + +macro_rules! native_buffer { + ($($t:ty),*) => { + $(impl IntoBuffer for Vec<$t> { + fn into_buffer(self) -> Buffer { + Buffer::from_vec(self) + } + })* + }; +} +native_buffer!(i8, i16, i32, i64, u8, u16, u32, u64, f32, f64); + +impl IntoBuffer for Vec { + fn into_buffer(self) -> Buffer { + BooleanBuffer::from_iter(self).into_inner() + } +} + +impl IntoBuffer for Vec { + fn into_buffer(self) -> Buffer { + let mut builder = TimestampNanosecondBufferBuilder::new(self.len()); + for v in self { + builder.append(v.to_nanos()) + } + builder.finish() + } +} + /// Primitive array readers are leaves of array reader tree. They accept page iterator /// and read them into primitive arrays. pub struct PrimitiveArrayReader where T: DataType, - T::T: ScalarValue, + T::T: Copy + Default, + Vec: IntoBuffer, { data_type: ArrowType, pages: Box, @@ -53,7 +85,8 @@ where impl PrimitiveArrayReader where T: DataType, - T::T: ScalarValue, + T::T: Copy + Default, + Vec: IntoBuffer, { /// Construct primitive array reader. pub fn new( @@ -85,7 +118,8 @@ where impl ArrayReader for PrimitiveArrayReader where T: DataType, - T::T: ScalarValue, + T::T: Copy + Default, + Vec: IntoBuffer, { fn as_any(&self) -> &dyn Any { self @@ -131,40 +165,14 @@ where _ => unreachable!("INT96 must be timestamp nanosecond"), }, PhysicalType::BYTE_ARRAY | PhysicalType::FIXED_LEN_BYTE_ARRAY => { - unreachable!( - "PrimitiveArrayReaders don't support complex physical types" - ); + unreachable!("PrimitiveArrayReaders don't support complex physical types"); } }; // Convert to arrays by using the Parquet physical type. 
// The physical types are then cast to Arrow types if necessary - let record_data = self.record_reader.consume_record_data(); - let record_data = match T::get_physical_type() { - PhysicalType::BOOLEAN => { - let mut boolean_buffer = BooleanBufferBuilder::new(record_data.len()); - - for e in record_data.as_slice() { - boolean_buffer.append(*e > 0); - } - boolean_buffer.into() - } - PhysicalType::INT96 => { - // SAFETY - record_data is an aligned buffer of Int96 - let (prefix, slice, suffix) = - unsafe { record_data.as_slice().align_to::() }; - assert!(prefix.is_empty() && suffix.is_empty()); - - let mut builder = TimestampNanosecondBufferBuilder::new(slice.len()); - for v in slice { - builder.append(v.to_nanos()) - } - - builder.finish() - } - _ => record_data, - }; + let record_data = self.record_reader.consume_record_data().into_buffer(); let array_data = ArrayDataBuilder::new(arrow_data_type) .len(self.record_reader.num_values()) @@ -188,9 +196,7 @@ where PhysicalType::DOUBLE => Arc::new(Float64Array::from(array_data)), PhysicalType::INT96 => Arc::new(TimestampNanosecondArray::from(array_data)), PhysicalType::BYTE_ARRAY | PhysicalType::FIXED_LEN_BYTE_ARRAY => { - unreachable!( - "PrimitiveArrayReaders don't support complex physical types" - ); + unreachable!("PrimitiveArrayReaders don't support complex physical types"); } }; @@ -409,12 +415,9 @@ mod tests { ); let page_iterator = InMemoryPageIterator::new(page_lists); - let mut array_reader = PrimitiveArrayReader::::new( - Box::new(page_iterator), - column_desc, - None, - ) - .unwrap(); + let mut array_reader = + PrimitiveArrayReader::::new(Box::new(page_iterator), column_desc, None) + .unwrap(); // Read first 50 values, which are all from the first column chunk let array = array_reader.next_batch(50).unwrap(); @@ -618,12 +621,9 @@ mod tests { let page_iterator = InMemoryPageIterator::new(page_lists); - let mut array_reader = PrimitiveArrayReader::::new( - Box::new(page_iterator), - column_desc, - None, - ) - .unwrap(); + let mut array_reader = + PrimitiveArrayReader::::new(Box::new(page_iterator), column_desc, None) + .unwrap(); let mut accu_len: usize = 0; @@ -697,12 +697,9 @@ mod tests { ); let page_iterator = InMemoryPageIterator::new(page_lists); - let mut array_reader = PrimitiveArrayReader::::new( - Box::new(page_iterator), - column_desc, - None, - ) - .unwrap(); + let mut array_reader = + PrimitiveArrayReader::::new(Box::new(page_iterator), column_desc, None) + .unwrap(); // read data from the reader // the data type is decimal(8,2) @@ -759,12 +756,9 @@ mod tests { ); let page_iterator = InMemoryPageIterator::new(page_lists); - let mut array_reader = PrimitiveArrayReader::::new( - Box::new(page_iterator), - column_desc, - None, - ) - .unwrap(); + let mut array_reader = + PrimitiveArrayReader::::new(Box::new(page_iterator), column_desc, None) + .unwrap(); // read data from the reader // the data type is decimal(18,4) diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs index 4208318122af..d0f63024edf0 100644 --- a/parquet/src/arrow/buffer/dictionary_buffer.rs +++ b/parquet/src/arrow/buffer/dictionary_buffer.rs @@ -16,7 +16,7 @@ // under the License. 
use crate::arrow::buffer::offset_buffer::OffsetBuffer; -use crate::arrow::record_reader::buffer::{BufferQueue, ScalarBuffer, ScalarValue, ValuesBuffer}; +use crate::arrow::record_reader::buffer::{BufferQueue, ValuesBuffer}; use crate::column::reader::decoder::ValuesBufferSlice; use crate::errors::{ParquetError, Result}; use arrow_array::{make_array, Array, ArrayRef, OffsetSizeTrait}; @@ -27,17 +27,12 @@ use std::sync::Arc; /// An array of variable length byte arrays that are potentially dictionary encoded /// and can be converted into a corresponding [`ArrayRef`] -pub enum DictionaryBuffer { - Dict { - keys: ScalarBuffer, - values: ArrayRef, - }, - Values { - values: OffsetBuffer, - }, +pub enum DictionaryBuffer { + Dict { keys: Vec, values: ArrayRef }, + Values { values: OffsetBuffer }, } -impl Default for DictionaryBuffer { +impl Default for DictionaryBuffer { fn default() -> Self { Self::Values { values: Default::default(), @@ -45,9 +40,7 @@ impl Default for DictionaryBuffer { } } -impl - DictionaryBuffer -{ +impl DictionaryBuffer { #[allow(unused)] pub fn len(&self) -> usize { match self { @@ -63,7 +56,7 @@ impl /// # Panic /// /// Panics if the dictionary is too large for `K` - pub fn as_keys(&mut self, dictionary: &ArrayRef) -> Option<&mut ScalarBuffer> { + pub fn as_keys(&mut self, dictionary: &ArrayRef) -> Option<&mut Vec> { assert!(K::from_usize(dictionary.len()).is_some()); match self { @@ -112,7 +105,7 @@ impl if values.is_empty() { // If dictionary is empty, zero pad offsets - spilled.offsets.resize(keys.len() + 1); + spilled.offsets.resize(keys.len() + 1, V::default()); } else { // Note: at this point null positions will have arbitrary dictionary keys // and this will hydrate them to the corresponding byte array. This is @@ -164,7 +157,7 @@ impl let builder = ArrayDataBuilder::new(data_type.clone()) .len(keys.len()) - .add_buffer(keys.into()) + .add_buffer(Buffer::from_vec(keys)) .add_child_data(values.into_data()) .null_bit_buffer(null_buffer); @@ -192,13 +185,13 @@ impl } } -impl ValuesBufferSlice for DictionaryBuffer { +impl ValuesBufferSlice for DictionaryBuffer { fn capacity(&self) -> usize { usize::MAX } } -impl ValuesBuffer for DictionaryBuffer { +impl ValuesBuffer for DictionaryBuffer { fn pad_nulls( &mut self, read_offset: usize, @@ -208,7 +201,7 @@ impl ValuesBuffer for Dictiona ) { match self { Self::Dict { keys, .. } => { - keys.resize(read_offset + levels_read); + keys.resize(read_offset + levels_read, K::default()); keys.pad_nulls(read_offset, values_read, levels_read, valid_mask) } Self::Values { values, .. } => { @@ -218,7 +211,7 @@ impl ValuesBuffer for Dictiona } } -impl BufferQueue for DictionaryBuffer { +impl BufferQueue for DictionaryBuffer { type Output = Self; type Slice = Self; @@ -234,14 +227,14 @@ impl BufferQueue for Dictionar } } - fn spare_capacity_mut(&mut self, _batch_size: usize) -> &mut Self::Slice { + fn get_output_slice(&mut self, _batch_size: usize) -> &mut Self::Slice { self } - fn set_len(&mut self, len: usize) { + fn truncate_buffer(&mut self, len: usize) { match self { - Self::Dict { keys, .. } => keys.set_len(len), - Self::Values { values } => values.set_len(len), + Self::Dict { keys, .. 
} => keys.truncate_buffer(len), + Self::Values { values } => values.truncate_buffer(len), } } } diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs index 3f8f85494f02..459c94ed2803 100644 --- a/parquet/src/arrow/buffer/offset_buffer.rs +++ b/parquet/src/arrow/buffer/offset_buffer.rs @@ -16,7 +16,7 @@ // under the License. use crate::arrow::buffer::bit_util::iter_set_bits_rev; -use crate::arrow::record_reader::buffer::{BufferQueue, ScalarBuffer, ScalarValue, ValuesBuffer}; +use crate::arrow::record_reader::buffer::{BufferQueue, ValuesBuffer}; use crate::column::reader::decoder::ValuesBufferSlice; use crate::errors::{ParquetError, Result}; use arrow_array::{make_array, ArrayRef, OffsetSizeTrait}; @@ -27,23 +27,23 @@ use arrow_schema::DataType as ArrowType; /// A buffer of variable-sized byte arrays that can be converted into /// a corresponding [`ArrayRef`] #[derive(Debug)] -pub struct OffsetBuffer { - pub offsets: ScalarBuffer, - pub values: ScalarBuffer, +pub struct OffsetBuffer { + pub offsets: Vec, + pub values: Vec, } -impl Default for OffsetBuffer { +impl Default for OffsetBuffer { fn default() -> Self { - let mut offsets = ScalarBuffer::new(); - offsets.resize(1); + let mut offsets = Vec::new(); + offsets.resize(1, I::default()); Self { offsets, - values: ScalarBuffer::new(), + values: Vec::new(), } } } -impl OffsetBuffer { +impl OffsetBuffer { /// Returns the number of byte arrays in this buffer pub fn len(&self) -> usize { self.offsets.len() - 1 @@ -128,8 +128,8 @@ impl OffsetBuffer { pub fn into_array(self, null_buffer: Option, data_type: ArrowType) -> ArrayRef { let array_data_builder = ArrayDataBuilder::new(data_type) .len(self.len()) - .add_buffer(self.offsets.into()) - .add_buffer(self.values.into()) + .add_buffer(Buffer::from_vec(self.offsets)) + .add_buffer(Buffer::from_vec(self.values)) .null_bit_buffer(null_buffer); let data = match cfg!(debug_assertions) { @@ -141,7 +141,7 @@ impl OffsetBuffer { } } -impl BufferQueue for OffsetBuffer { +impl BufferQueue for OffsetBuffer { type Output = Self; type Slice = Self; @@ -149,16 +149,16 @@ impl BufferQueue for OffsetBuffer { std::mem::take(self) } - fn spare_capacity_mut(&mut self, _batch_size: usize) -> &mut Self::Slice { + fn get_output_slice(&mut self, _batch_size: usize) -> &mut Self::Slice { self } - fn set_len(&mut self, len: usize) { + fn truncate_buffer(&mut self, len: usize) { assert_eq!(self.offsets.len(), len + 1); } } -impl ValuesBuffer for OffsetBuffer { +impl ValuesBuffer for OffsetBuffer { fn pad_nulls( &mut self, read_offset: usize, @@ -167,9 +167,10 @@ impl ValuesBuffer for OffsetBuffer { valid_mask: &[u8], ) { assert_eq!(self.offsets.len(), read_offset + values_read + 1); - self.offsets.resize(read_offset + levels_read + 1); + self.offsets + .resize(read_offset + levels_read + 1, I::default()); - let offsets = self.offsets.as_slice_mut(); + let offsets = &mut self.offsets; let mut last_pos = read_offset + levels_read + 1; let mut last_start_offset = I::from_usize(self.values.len()).unwrap(); @@ -207,7 +208,7 @@ impl ValuesBuffer for OffsetBuffer { } } -impl ValuesBufferSlice for OffsetBuffer { +impl ValuesBufferSlice for OffsetBuffer { fn capacity(&self) -> usize { usize::MAX } diff --git a/parquet/src/arrow/record_reader/buffer.rs b/parquet/src/arrow/record_reader/buffer.rs index 35a322e6c723..3914710ff7b9 100644 --- a/parquet/src/arrow/record_reader/buffer.rs +++ b/parquet/src/arrow/record_reader/buffer.rs @@ -15,11 +15,7 @@ // specific language governing 
permissions and limitations // under the License. -use std::marker::PhantomData; - use crate::arrow::buffer::bit_util::iter_set_bits_rev; -use crate::data_type::Int96; -use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; /// A buffer that supports writing new data to the end, and removing data from the front /// @@ -37,12 +33,12 @@ pub trait BufferQueue: Sized { /// to append data to the end of this [`BufferQueue`] /// /// NB: writes to the returned slice will not update the length of [`BufferQueue`] - /// instead a subsequent call should be made to [`BufferQueue::set_len`] - fn spare_capacity_mut(&mut self, batch_size: usize) -> &mut Self::Slice; + /// instead a subsequent call should be made to [`BufferQueue::truncate_buffer`] + fn get_output_slice(&mut self, batch_size: usize) -> &mut Self::Slice; /// Sets the length of the [`BufferQueue`]. /// - /// Intended to be used in combination with [`BufferQueue::spare_capacity_mut`] + /// Intended to be used in combination with [`BufferQueue::get_output_slice`] /// /// # Panics /// @@ -57,132 +53,27 @@ pub trait BufferQueue: Sized { /// track how much of this slice is actually written to by the caller. This is still /// safe as the slice is default-initialized. /// - fn set_len(&mut self, len: usize); -} - -/// A marker trait for [scalar] types -/// -/// This means that a `[Self::default()]` of length `len` can be safely created from a -/// zero-initialized `[u8]` with length `len * std::mem::size_of::()` and -/// alignment of `std::mem::size_of::()` -/// -/// [scalar]: https://doc.rust-lang.org/book/ch03-02-data-types.html#scalar-types -/// -pub trait ScalarValue: Copy {} -impl ScalarValue for bool {} -impl ScalarValue for u8 {} -impl ScalarValue for i8 {} -impl ScalarValue for u16 {} -impl ScalarValue for i16 {} -impl ScalarValue for u32 {} -impl ScalarValue for i32 {} -impl ScalarValue for u64 {} -impl ScalarValue for i64 {} -impl ScalarValue for f32 {} -impl ScalarValue for f64 {} -impl ScalarValue for Int96 {} - -/// A typed buffer similar to [`Vec`] but using [`MutableBuffer`] for storage -#[derive(Debug)] -pub struct ScalarBuffer { - buffer: MutableBuffer, - - /// Length in elements of size T - len: usize, - - /// Placeholder to allow `T` as an invariant generic parameter - /// without making it !Send - _phantom: PhantomData T>, -} - -impl Default for ScalarBuffer { - fn default() -> Self { - Self::new() - } -} - -impl ScalarBuffer { - pub fn new() -> Self { - Self { - buffer: MutableBuffer::new(0), - len: 0, - _phantom: Default::default(), - } - } - - pub fn len(&self) -> usize { - self.len - } - - pub fn is_empty(&self) -> bool { - self.len == 0 - } - - pub fn reserve(&mut self, additional: usize) { - self.buffer.reserve(additional * std::mem::size_of::()); - } - - pub fn resize(&mut self, len: usize) { - self.buffer.resize(len * std::mem::size_of::(), 0); - self.len = len; - } - - #[inline] - pub fn as_slice(&self) -> &[T] { - let (prefix, buf, suffix) = unsafe { self.buffer.as_slice().align_to::() }; - assert!(prefix.is_empty() && suffix.is_empty()); - buf - } - - #[inline] - pub fn as_slice_mut(&mut self) -> &mut [T] { - let (prefix, buf, suffix) = unsafe { self.buffer.as_slice_mut().align_to_mut::() }; - assert!(prefix.is_empty() && suffix.is_empty()); - buf - } + fn truncate_buffer(&mut self, len: usize); } -impl ScalarBuffer { - pub fn push(&mut self, v: T) { - self.buffer.push(v); - self.len += 1; - } - - pub fn extend_from_slice(&mut self, v: &[T]) { - self.buffer.extend_from_slice(v); - self.len += v.len(); - } -} - 
-impl From> for Buffer { - fn from(t: ScalarBuffer) -> Self { - t.buffer.into() - } -} - -impl BufferQueue for ScalarBuffer { - type Output = Buffer; +impl BufferQueue for Vec { + type Output = Self; type Slice = [T]; fn consume(&mut self) -> Self::Output { - std::mem::take(self).into() + std::mem::take(self) } - fn spare_capacity_mut(&mut self, batch_size: usize) -> &mut Self::Slice { - self.buffer - .resize((self.len + batch_size) * std::mem::size_of::(), 0); - - let range = self.len..self.len + batch_size; - &mut self.as_slice_mut()[range] + fn get_output_slice(&mut self, batch_size: usize) -> &mut Self::Slice { + let len = self.len(); + self.resize(len + batch_size, T::default()); + &mut self[len..] } - fn set_len(&mut self, len: usize) { - self.len = len; - - let new_bytes = self.len * std::mem::size_of::(); - assert!(new_bytes <= self.buffer.len()); - self.buffer.resize(new_bytes, 0); + fn truncate_buffer(&mut self, len: usize) { + assert!(len <= self.len()); + self.truncate(len) } } @@ -212,7 +103,7 @@ pub trait ValuesBuffer: BufferQueue { ); } -impl ValuesBuffer for ScalarBuffer { +impl ValuesBuffer for Vec { fn pad_nulls( &mut self, read_offset: usize, @@ -220,8 +111,7 @@ impl ValuesBuffer for ScalarBuffer { levels_read: usize, valid_mask: &[u8], ) { - let slice = self.as_slice_mut(); - assert!(slice.len() >= read_offset + levels_read); + assert!(self.len() >= read_offset + levels_read); let values_range = read_offset..read_offset + values_read; for (value_pos, level_pos) in values_range.rev().zip(iter_set_bits_rev(valid_mask)) { @@ -229,7 +119,7 @@ impl ValuesBuffer for ScalarBuffer { if level_pos <= value_pos { break; } - slice[level_pos] = slice[value_pos]; + self[level_pos] = self[value_pos]; } } } diff --git a/parquet/src/arrow/record_reader/definition_levels.rs b/parquet/src/arrow/record_reader/definition_levels.rs index 9009c596c4bf..fa041f5fdb0a 100644 --- a/parquet/src/arrow/record_reader/definition_levels.rs +++ b/parquet/src/arrow/record_reader/definition_levels.rs @@ -30,12 +30,10 @@ use crate::column::reader::decoder::{ use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; -use super::buffer::ScalarBuffer; - enum BufferInner { /// Compute levels and null mask Full { - levels: ScalarBuffer, + levels: Vec, nulls: BooleanBufferBuilder, max_level: i16, }, @@ -77,7 +75,7 @@ impl DefinitionLevelBuffer { } } false => BufferInner::Full { - levels: ScalarBuffer::new(), + levels: Vec::new(), nulls: BooleanBufferBuilder::new(0), max_level: desc.max_def_level(), }, @@ -89,7 +87,7 @@ impl DefinitionLevelBuffer { /// Returns the built level data pub fn consume_levels(&mut self) -> Option { match &mut self.inner { - BufferInner::Full { levels, .. } => Some(std::mem::take(levels).into()), + BufferInner::Full { levels, .. } => Some(Buffer::from_vec(std::mem::take(levels))), BufferInner::Mask { .. 
} => None, } } @@ -174,9 +172,9 @@ impl DefinitionLevelDecoder for DefinitionLevelBufferDecoder { assert_eq!(self.max_level, *max_level); assert_eq!(range.start + writer.len, nulls.len()); - levels.resize(range.end + writer.len); + levels.resize(range.end + writer.len, 0); - let slice = &mut levels.as_slice_mut()[writer.len..]; + let slice = &mut levels[writer.len..]; let levels_read = decoder.read_def_levels(slice, range.clone())?; nulls.reserve(levels_read); diff --git a/parquet/src/arrow/record_reader/mod.rs b/parquet/src/arrow/record_reader/mod.rs index ea982341994e..49c69c87e302 100644 --- a/parquet/src/arrow/record_reader/mod.rs +++ b/parquet/src/arrow/record_reader/mod.rs @@ -18,7 +18,7 @@ use arrow_buffer::Buffer; use crate::arrow::record_reader::{ - buffer::{BufferQueue, ScalarBuffer, ValuesBuffer}, + buffer::{BufferQueue, ValuesBuffer}, definition_levels::{DefinitionLevelBuffer, DefinitionLevelBufferDecoder}, }; use crate::column::reader::decoder::RepetitionLevelDecoderImpl; @@ -37,8 +37,7 @@ pub(crate) mod buffer; mod definition_levels; /// A `RecordReader` is a stateful column reader that delimits semantic records. -pub type RecordReader = - GenericRecordReader::T>, ColumnValueDecoderImpl>; +pub type RecordReader = GenericRecordReader::T>, ColumnValueDecoderImpl>; pub(crate) type ColumnReader = GenericColumnReader; @@ -53,7 +52,7 @@ pub struct GenericRecordReader { values: V, def_levels: Option, - rep_levels: Option>, + rep_levels: Option>, column_reader: Option>, /// Number of buffered levels / null-padded values num_values: usize, @@ -81,7 +80,7 @@ where let def_levels = (desc.max_def_level() > 0) .then(|| DefinitionLevelBuffer::new(&desc, packed_null_mask(&desc))); - let rep_levels = (desc.max_rep_level() > 0).then(ScalarBuffer::new); + let rep_levels = (desc.max_rep_level() > 0).then(Vec::new); Self { values: records, @@ -174,7 +173,9 @@ where /// Return repetition level data. /// The side effect is similar to `consume_def_levels`. pub fn consume_rep_levels(&mut self) -> Option { - self.rep_levels.as_mut().map(|x| x.consume()) + self.rep_levels + .as_mut() + .map(|x| Buffer::from_vec(x.consume())) } /// Returns currently stored buffer data. 
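
As a rough sketch of the contract these hunks converge on (not part of the patch itself): `BufferQueue` is now implemented directly on `Vec<T>`, with `get_output_slice` growing the buffer by `batch_size` default-initialized values and returning the newly added tail, and `truncate_buffer` trimming the buffer back down to the number of values actually populated. Assuming the crate-internal `BufferQueue` trait is in scope, a caller would drive it roughly like this (the `read_batch` helper and its names are illustrative only):

    fn read_batch(queue: &mut Vec<i32>, batch_size: usize, decode: impl Fn(&mut [i32]) -> usize) {
        // Values already buffered before this batch
        let start = queue.len();
        // Grow by `batch_size` default-initialized slots and borrow the newly added tail
        let out = queue.get_output_slice(batch_size);
        // A decoder may fill fewer than `batch_size` values
        let filled = decode(out);
        // Keep only the values that were actually written
        queue.truncate_buffer(start + filled);
    }
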
@@ -209,9 +210,9 @@ where let rep_levels = self .rep_levels .as_mut() - .map(|levels| levels.spare_capacity_mut(batch_size)); + .map(|levels| levels.get_output_slice(batch_size)); let def_levels = self.def_levels.as_mut(); - let values = self.values.spare_capacity_mut(batch_size); + let values = self.values.get_output_slice(batch_size); let (records_read, values_read, levels_read) = self .column_reader @@ -234,9 +235,9 @@ where self.num_records += records_read; self.num_values += levels_read; - self.values.set_len(self.num_values); + self.values.truncate_buffer(self.num_values); if let Some(ref mut buf) = self.rep_levels { - buf.set_len(self.num_values) + buf.truncate_buffer(self.num_values) }; if let Some(ref mut buf) = self.def_levels { buf.set_len(self.num_values) @@ -257,7 +258,7 @@ mod tests { use std::sync::Arc; use arrow::buffer::Buffer; - use arrow_array::builder::{Int16BufferBuilder, Int32BufferBuilder}; + use arrow_array::builder::Int16BufferBuilder; use crate::basic::Encoding; use crate::data_type::Int32Type; @@ -334,10 +335,7 @@ mod tests { assert_eq!(7, record_reader.num_values()); } - let mut bb = Int32BufferBuilder::new(7); - bb.append_slice(&[4, 7, 6, 3, 2, 8, 9]); - let expected_buffer = bb.finish(); - assert_eq!(expected_buffer, record_reader.consume_record_data()); + assert_eq!(record_reader.consume_record_data(), &[4, 7, 6, 3, 2, 8, 9]); assert_eq!(None, record_reader.consume_def_levels()); assert_eq!(None, record_reader.consume_bitmap()); } @@ -434,13 +432,12 @@ mod tests { // Verify result record data let actual = record_reader.consume_record_data(); - let actual_values = actual.typed_data::(); let expected = &[0, 7, 0, 6, 3, 0, 8]; - assert_eq!(actual_values.len(), expected.len()); + assert_eq!(actual.len(), expected.len()); // Only validate valid values are equal - let iter = expected_valid.iter().zip(actual_values).zip(expected); + let iter = expected_valid.iter().zip(&actual).zip(expected); for ((valid, actual), expected) in iter { if *valid { assert_eq!(actual, expected) @@ -544,12 +541,11 @@ mod tests { // Verify result record data let actual = record_reader.consume_record_data(); - let actual_values = actual.typed_data::(); let expected = &[4, 0, 0, 7, 6, 3, 2, 8, 9]; - assert_eq!(actual_values.len(), expected.len()); + assert_eq!(actual.len(), expected.len()); // Only validate valid values are equal - let iter = expected_valid.iter().zip(actual_values).zip(expected); + let iter = expected_valid.iter().zip(&actual).zip(expected); for ((valid, actual), expected) in iter { if *valid { assert_eq!(actual, expected) @@ -713,10 +709,7 @@ mod tests { assert_eq!(0, record_reader.read_records(10).unwrap()); } - let mut bb = Int32BufferBuilder::new(3); - bb.append_slice(&[6, 3, 2]); - let expected_buffer = bb.finish(); - assert_eq!(expected_buffer, record_reader.consume_record_data()); + assert_eq!(record_reader.consume_record_data(), &[6, 3, 2]); assert_eq!(None, record_reader.consume_def_levels()); assert_eq!(None, record_reader.consume_bitmap()); } @@ -814,13 +807,12 @@ mod tests { // Verify result record data let actual = record_reader.consume_record_data(); - let actual_values = actual.typed_data::(); let expected = &[0, 6, 3]; - assert_eq!(actual_values.len(), expected.len()); + assert_eq!(actual.len(), expected.len()); // Only validate valid values are equal - let iter = expected_valid.iter().zip(actual_values).zip(expected); + let iter = expected_valid.iter().zip(&actual).zip(expected); for ((valid, actual), expected) in iter { if *valid { assert_eq!(actual, 
expected) From 7e289134a8d9f46a92a2759a7b2488b17993fd5b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 8 Dec 2023 18:08:44 +0000 Subject: [PATCH 1403/1411] Simplify parquet statistics generation (#5183) --- parquet/src/arrow/arrow_writer/byte_array.rs | 40 +++++-------- parquet/src/column/writer/encoder.rs | 29 +++------- parquet/src/column/writer/mod.rs | 60 +++++++------------- 3 files changed, 44 insertions(+), 85 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index 28c7c3b00540..61933b24178e 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -22,7 +22,7 @@ use crate::data_type::{AsBytes, ByteArray, Int32Type}; use crate::encodings::encoding::{DeltaBitPackEncoder, Encoder}; use crate::encodings::rle::RleEncoder; use crate::errors::{ParquetError, Result}; -use crate::file::properties::{WriterProperties, WriterVersion}; +use crate::file::properties::{EnabledStatistics, WriterProperties, WriterVersion}; use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::num_required_bits; use crate::util::interner::{Interner, Storage}; @@ -379,6 +379,7 @@ impl DictEncoder { pub struct ByteArrayEncoder { fallback: FallbackEncoder, dict_encoder: Option, + statistics_enabled: EnabledStatistics, min_value: Option, max_value: Option, bloom_filter: Option, @@ -387,24 +388,6 @@ pub struct ByteArrayEncoder { impl ColumnValueEncoder for ByteArrayEncoder { type T = ByteArray; type Values = dyn Array; - - fn min_max( - &self, - values: &dyn Array, - value_indices: Option<&[usize]>, - ) -> Option<(Self::T, Self::T)> { - match value_indices { - Some(indices) => { - let iter = indices.iter().cloned(); - downcast_op!(values.data_type(), values, compute_min_max, iter) - } - None => { - let len = Array::len(values); - downcast_op!(values.data_type(), values, compute_min_max, 0..len) - } - } - } - fn flush_bloom_filter(&mut self) -> Option { self.bloom_filter.take() } @@ -424,12 +407,15 @@ impl ColumnValueEncoder for ByteArrayEncoder { .map(|props| Sbbf::new_with_ndv_fpp(props.ndv, props.fpp)) .transpose()?; + let statistics_enabled = props.statistics_enabled(descr.path()); + Ok(Self { fallback, + statistics_enabled, + bloom_filter, dict_encoder: dictionary, min_value: None, max_value: None, - bloom_filter, }) } @@ -498,13 +484,15 @@ where T: ArrayAccessor + Copy, T::Item: Copy + Ord + AsRef<[u8]>, { - if let Some((min, max)) = compute_min_max(values, indices.iter().cloned()) { - if encoder.min_value.as_ref().map_or(true, |m| m > &min) { - encoder.min_value = Some(min); - } + if encoder.statistics_enabled != EnabledStatistics::None { + if let Some((min, max)) = compute_min_max(values, indices.iter().cloned()) { + if encoder.min_value.as_ref().map_or(true, |m| m > &min) { + encoder.min_value = Some(min); + } - if encoder.max_value.as_ref().map_or(true, |m| m < &max) { - encoder.max_value = Some(max); + if encoder.max_value.as_ref().map_or(true, |m| m < &max) { + encoder.max_value = Some(max); + } } } diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index 0d5144f61c26..8624f859f4b0 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -76,15 +76,6 @@ pub trait ColumnValueEncoder { /// The values encoded by this encoder type Values: ColumnValues + ?Sized; - /// Returns the min and max values in this collection, skipping any NaN values - /// 
- /// Returns `None` if no values found - fn min_max( - &self, - values: &Self::Values, - value_indices: Option<&[usize]>, - ) -> Option<(Self::T, Self::T)>; - /// Create a new [`ColumnValueEncoder`] fn try_new(descr: &ColumnDescPtr, props: &WriterProperties) -> Result where @@ -136,8 +127,15 @@ pub struct ColumnValueEncoderImpl { } impl ColumnValueEncoderImpl { + fn min_max(&self, values: &[T::T], value_indices: Option<&[usize]>) -> Option<(T::T, T::T)> { + match value_indices { + Some(indices) => get_min_max(&self.descr, indices.iter().map(|x| &values[*x])), + None => get_min_max(&self.descr, values.iter()), + } + } + fn write_slice(&mut self, slice: &[T::T]) -> Result<()> { - if self.statistics_enabled == EnabledStatistics::Page + if self.statistics_enabled != EnabledStatistics::None // INTERVAL has undefined sort order, so don't write min/max stats for it && self.descr.converted_type() != ConvertedType::INTERVAL { @@ -166,17 +164,6 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { type Values = [T::T]; - fn min_max( - &self, - values: &Self::Values, - value_indices: Option<&[usize]>, - ) -> Option<(Self::T, Self::T)> { - match value_indices { - Some(indices) => get_min_max(&self.descr, indices.iter().map(|x| &values[*x])), - None => get_min_max(&self.descr, values.iter()), - } - } - fn flush_bloom_filter(&mut self) -> Option { self.bloom_filter.take() } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 531af4bd461e..9f476595fb7e 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -329,28 +329,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { None => values.len(), }; - // If only computing chunk-level statistics compute them here, page-level statistics - // are computed in [`Self::write_mini_batch`] and used to update chunk statistics in - // [`Self::add_data_page`] - if self.statistics_enabled == EnabledStatistics::Chunk - // INTERVAL has undefined sort order, so don't write min/max stats for it - && self.descr.converted_type() != ConvertedType::INTERVAL - { - match (min, max) { - (Some(min), Some(max)) => { - update_min(&self.descr, min, &mut self.column_metrics.min_column_value); - update_max(&self.descr, max, &mut self.column_metrics.max_column_value); - } - (None, Some(_)) | (Some(_), None) => { - panic!("min/max should be both set or both None") - } - (None, None) => { - if let Some((min, max)) = self.encoder.min_max(values, value_indices) { - update_min(&self.descr, &min, &mut self.column_metrics.min_column_value); - update_max(&self.descr, &max, &mut self.column_metrics.max_column_value); - } - } - }; + if let Some(min) = min { + update_min(&self.descr, min, &mut self.column_metrics.min_column_value); + } + if let Some(max) = max { + update_max(&self.descr, max, &mut self.column_metrics.max_column_value); } // We can only set the distinct count if there are no other writes @@ -764,22 +747,23 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { self.column_metrics.num_column_nulls += self.page_metrics.num_page_nulls; - let page_statistics = if let (Some(min), Some(max)) = - (values_data.min_value, values_data.max_value) - { - // Update chunk level statistics - update_min(&self.descr, &min, &mut self.column_metrics.min_column_value); - update_max(&self.descr, &max, &mut self.column_metrics.max_column_value); - - (self.statistics_enabled == EnabledStatistics::Page).then_some(ValueStatistics::new( - Some(min), - Some(max), - None, - self.page_metrics.num_page_nulls, - false, - 
)) - } else { - None + let page_statistics = match (values_data.min_value, values_data.max_value) { + (Some(min), Some(max)) => { + // Update chunk level statistics + update_min(&self.descr, &min, &mut self.column_metrics.min_column_value); + update_max(&self.descr, &max, &mut self.column_metrics.max_column_value); + + (self.statistics_enabled == EnabledStatistics::Page).then_some( + ValueStatistics::new( + Some(min), + Some(max), + None, + self.page_metrics.num_page_nulls, + false, + ), + ) + } + _ => None, }; // update column and offset index From c5a9953f2a9193e9f31366219d2ee1853215a6d0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 8 Dec 2023 13:38:12 -0500 Subject: [PATCH 1404/1411] Clarify interval comparison behavior with documentation and tests (#5192) * Clarify interval comparison behavior with documentation and tests * refine language --- arrow-array/src/array/primitive_array.rs | 8 +- arrow-array/src/types.rs | 71 +++++++++++++- arrow-ord/src/comparison.rs | 113 +++++++++++++++++++++++ arrow-ord/src/ord.rs | 67 ++++++++++++++ 4 files changed, 255 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 1112acacfcd9..2296cebd4681 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -352,12 +352,18 @@ pub type Time64MicrosecondArray = PrimitiveArray; pub type Time64NanosecondArray = PrimitiveArray; /// A [`PrimitiveArray`] of “calendar” intervals in months +/// +/// See [`IntervalYearMonthType`] for details on representation and caveats. pub type IntervalYearMonthArray = PrimitiveArray; /// A [`PrimitiveArray`] of “calendar” intervals in days and milliseconds +/// +/// See [`IntervalDayTimeType`] for details on representation and caveats. pub type IntervalDayTimeArray = PrimitiveArray; -/// A [`PrimitiveArray`] of “calendar” intervals in months, days, and nanoseconds +/// A [`PrimitiveArray`] of “calendar” intervals in months, days, and nanoseconds. +/// +/// See [`IntervalMonthDayNanoType`] for details on representation and caveats. pub type IntervalMonthDayNanoArray = PrimitiveArray; /// A [`PrimitiveArray`] of elapsed durations in seconds diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 16d0e822d052..6e177838c4f5 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -213,19 +213,84 @@ make_type!( IntervalYearMonthType, i32, DataType::Interval(IntervalUnit::YearMonth), - "A “calendar” interval type in months." + "A “calendar” interval stored as the number of whole months." ); make_type!( IntervalDayTimeType, i64, DataType::Interval(IntervalUnit::DayTime), - "A “calendar” interval type in days and milliseconds." + r#"A “calendar” interval type in days and milliseconds. + +## Representation +This type is stored as a single 64 bit integer, interpreted as two i32 fields: +1. the number of elapsed days +2. The number of milliseconds (no leap seconds), + +```text + ┌──────────────┬──────────────┐ + │ Days │ Milliseconds │ + │ (32 bits) │ (32 bits) │ + └──────────────┴──────────────┘ + 0 31 63 bit offset +``` +Please see the [Arrow Spec](https://github.com/apache/arrow/blob/081b4022fe6f659d8765efc82b3f4787c5039e3c/format/Schema.fbs#L406-L408) for more details + +## Note on Comparing and Ordering for Calendar Types + +Values of `IntervalDayTimeType` are compared using their binary representation, +which can lead to surprising results. 
Please see the description of ordering on +[`IntervalMonthDayNanoType`] for more details +"# ); make_type!( IntervalMonthDayNanoType, i128, DataType::Interval(IntervalUnit::MonthDayNano), - "A “calendar” interval type in months, days, and nanoseconds." + r#"A “calendar” interval type in months, days, and nanoseconds. + +## Representation +This type is stored as a single 128 bit integer, +interpreted as three different signed integral fields: + +1. The number of months (32 bits) +2. The number days (32 bits) +2. The number of nanoseconds (64 bits). + +Nanoseconds does not allow for leap seconds. +Each field is independent (e.g. there is no constraint that the quantity of +nanoseconds represents less than a day's worth of time). + +```text +┌──────────────────────────────┬─────────────┬──────────────┐ +│ Nanos │ Days │ Months │ +│ (64 bits) │ (32 bits) │ (32 bits) │ +└──────────────────────────────┴─────────────┴──────────────┘ + 0 63 95 127 bit offset +``` +Please see the [Arrow Spec](https://github.com/apache/arrow/blob/081b4022fe6f659d8765efc82b3f4787c5039e3c/format/Schema.fbs#L409-L415) for more details + +## Note on Comparing and Ordering for Calendar Types +Values of `IntervalMonthDayNanoType` are compared using their binary representation, +which can lead to surprising results. + +Spans of time measured in calendar units are not fixed in absolute size (e.g. +number of seconds) which makes defining comparisons and ordering non trivial. +For example `1 month` is 28 days for February but `1 month` is 31 days +in December. + +This makes the seemingly simple operation of comparing two intervals +complicated in practice. For example is `1 month` more or less than `30 days`? The +answer depends on what month you are talking about. + +This crate defines comparisons for calendar types using their binary +representation which is fast and efficient, but leads +to potentially surprising results. + +For example a +`IntervalMonthDayNano` of `1 month` will compare as **greater** than a +`IntervalMonthDayNano` of `100 days` because the binary representation of `1 month` +is larger than the binary representation of 100 days. 
+"# ); make_type!( DurationSecondType, diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 4dbb395192e1..4d552b038a7d 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -1407,6 +1407,48 @@ mod tests { vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], vec![false, false, true, false, false, false, false, true, false, false] ); + + cmp_vec!( + eq, + eq_dyn, + IntervalYearMonthArray, + vec![ + IntervalYearMonthType::make_value(1, 2), + IntervalYearMonthType::make_value(2, 1), + // 1 year + IntervalYearMonthType::make_value(1, 0), + ], + vec![ + IntervalYearMonthType::make_value(1, 2), + IntervalYearMonthType::make_value(1, 2), + // NB 12 months is treated as equal to a year (as the underlying + // type stores number of months) + IntervalYearMonthType::make_value(0, 12), + ], + vec![true, false, true] + ); + + cmp_vec!( + eq, + eq_dyn, + IntervalMonthDayNanoArray, + vec![ + IntervalMonthDayNanoType::make_value(1, 2, 3), + IntervalMonthDayNanoType::make_value(3, 2, 1), + // 1 month + IntervalMonthDayNanoType::make_value(1, 0, 0), + IntervalMonthDayNanoType::make_value(1, 0, 0), + ], + vec![ + IntervalMonthDayNanoType::make_value(1, 2, 3), + IntervalMonthDayNanoType::make_value(1, 2, 3), + // 30 days is not treated as a month + IntervalMonthDayNanoType::make_value(0, 30, 0), + // 100 days + IntervalMonthDayNanoType::make_value(0, 100, 0), + ], + vec![true, false, false, false] + ); } #[test] @@ -1660,6 +1702,77 @@ mod tests { vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], vec![false, false, false, true, true, false, false, false, true, true] ); + + cmp_vec!( + lt, + lt_dyn, + IntervalDayTimeArray, + vec![ + IntervalDayTimeType::make_value(1, 0), + IntervalDayTimeType::make_value(0, 1000), + IntervalDayTimeType::make_value(1, 1000), + IntervalDayTimeType::make_value(1, 3000), + // 90M milliseconds + IntervalDayTimeType::make_value(0, 90_000_000), + ], + vec![ + IntervalDayTimeType::make_value(0, 1000), + IntervalDayTimeType::make_value(1, 0), + IntervalDayTimeType::make_value(10, 0), + IntervalDayTimeType::make_value(2, 1), + // NB even though 1 day is less than 90M milliseconds long, + // it compares as greater because the underlying type stores + // days and milliseconds as different fields + IntervalDayTimeType::make_value(0, 12), + ], + vec![false, true, true, true ,false] + ); + + cmp_vec!( + lt, + lt_dyn, + IntervalYearMonthArray, + vec![ + IntervalYearMonthType::make_value(1, 2), + IntervalYearMonthType::make_value(2, 1), + IntervalYearMonthType::make_value(1, 2), + // 1 year + IntervalYearMonthType::make_value(1, 0), + ], + vec![ + IntervalYearMonthType::make_value(1, 2), + IntervalYearMonthType::make_value(1, 2), + IntervalYearMonthType::make_value(2, 1), + // NB 12 months is treated as equal to a year (as the underlying + // type stores number of months) + IntervalYearMonthType::make_value(0, 12), + ], + vec![false, false, true, false] + ); + + cmp_vec!( + lt, + lt_dyn, + IntervalMonthDayNanoArray, + vec![ + IntervalMonthDayNanoType::make_value(1, 2, 3), + IntervalMonthDayNanoType::make_value(3, 2, 1), + // 1 month + IntervalMonthDayNanoType::make_value(1, 0, 0), + IntervalMonthDayNanoType::make_value(1, 2, 0), + IntervalMonthDayNanoType::make_value(1, 0, 0), + ], + vec![ + IntervalMonthDayNanoType::make_value(1, 2, 3), + IntervalMonthDayNanoType::make_value(1, 2, 3), + IntervalMonthDayNanoType::make_value(2, 0, 0), + // 30 days is not treated as a month + IntervalMonthDayNanoType::make_value(0, 30, 0), + // 100 days (note is treated as greater than 1 month 
as the underlying integer representation) + IntervalMonthDayNanoType::make_value(0, 100, 0), + ], + vec![false, false, true, false, false] + ); } #[test] diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index 28ca07cce260..f6bd39c9cd5d 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -216,6 +216,73 @@ pub mod tests { assert_eq!(Ordering::Greater, cmp(1, 0)); } + #[test] + fn test_interval_day_time() { + let array = IntervalDayTimeArray::from(vec![ + // 0 days, 1 second + IntervalDayTimeType::make_value(0, 1000), + // 1 day, 2 milliseconds + IntervalDayTimeType::make_value(1, 2), + // 90M milliseconds (which is more than is in 1 day) + IntervalDayTimeType::make_value(0, 90_000_000), + ]); + + let cmp = build_compare(&array, &array).unwrap(); + + assert_eq!(Ordering::Less, cmp(0, 1)); + assert_eq!(Ordering::Greater, cmp(1, 0)); + + // somewhat confusingly, while 90M milliseconds is more than 1 day, + // it will compare less as the comparison is done on the underlying + // values not field by field + assert_eq!(Ordering::Greater, cmp(1, 2)); + assert_eq!(Ordering::Less, cmp(2, 1)); + } + + #[test] + fn test_interval_year_month() { + let array = IntervalYearMonthArray::from(vec![ + // 1 year, 0 months + IntervalYearMonthType::make_value(1, 0), + // 0 years, 13 months + IntervalYearMonthType::make_value(0, 13), + // 1 year, 1 month + IntervalYearMonthType::make_value(1, 1), + ]); + + let cmp = build_compare(&array, &array).unwrap(); + + assert_eq!(Ordering::Less, cmp(0, 1)); + assert_eq!(Ordering::Greater, cmp(1, 0)); + + // the underlying representation is months, so both quantities are the same + assert_eq!(Ordering::Equal, cmp(1, 2)); + assert_eq!(Ordering::Equal, cmp(2, 1)); + } + + #[test] + fn test_interval_month_day_nano() { + let array = IntervalMonthDayNanoArray::from(vec![ + // 100 days + IntervalMonthDayNanoType::make_value(0, 100, 0), + // 1 month + IntervalMonthDayNanoType::make_value(1, 0, 0), + // 100 day, 1 nanoseconds + IntervalMonthDayNanoType::make_value(0, 100, 2), + ]); + + let cmp = build_compare(&array, &array).unwrap(); + + assert_eq!(Ordering::Less, cmp(0, 1)); + assert_eq!(Ordering::Greater, cmp(1, 0)); + + // somewhat confusingly, while 100 days is more than 1 month in all cases + // it will compare less as the comparison is done on the underlying + // values not field by field + assert_eq!(Ordering::Greater, cmp(1, 2)); + assert_eq!(Ordering::Less, cmp(2, 1)); + } + #[test] fn test_decimal() { let array = vec![Some(5_i128), Some(2_i128), Some(3_i128)] From 9630aaf55bda98e2028c4f44e6a7264ec41e04d5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Fri, 8 Dec 2023 21:51:48 +0000 Subject: [PATCH 1405/1411] Blockwise IO in IPC FileReader (#5153) (#5179) * Blockwise IO in IPC FileReader (#5153) * Docs * Clippy * Update arrow-ipc/src/reader.rs Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- arrow-ipc/src/reader.rs | 121 ++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 73 deletions(-) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 6f2cb30a1629..06e53505fc22 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -27,12 +27,12 @@ use std::io::{BufReader, Read, Seek, SeekFrom}; use std::sync::Arc; use arrow_array::*; -use arrow_buffer::{Buffer, MutableBuffer}; +use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::*; use crate::compression::CompressionCodec; -use 
crate::{FieldNode, MetadataVersion, CONTINUATION_MARKER}; +use crate::{Block, FieldNode, Message, MetadataVersion, CONTINUATION_MARKER}; use DataType::*; /// Read a buffer based on offset and length @@ -498,10 +498,34 @@ pub fn read_dictionary( Ok(()) } +/// Read the data for a given block +fn read_block(mut reader: R, block: &Block) -> Result { + reader.seek(SeekFrom::Start(block.offset() as u64))?; + let body_len = block.bodyLength().to_usize().unwrap(); + let metadata_len = block.metaDataLength().to_usize().unwrap(); + let total_len = body_len.checked_add(metadata_len).unwrap(); + + let mut buf = MutableBuffer::from_len_zeroed(total_len); + reader.read_exact(&mut buf)?; + Ok(buf.into()) +} + +/// Parse an encapsulated message +/// +/// +fn parse_message(buf: &[u8]) -> Result { + let buf = match buf[..4] == CONTINUATION_MARKER { + true => &buf[8..], + false => &buf[4..], + }; + crate::root_as_message(buf) + .map_err(|err| ArrowError::ParseError(format!("Unable to get root as message: {err:?}"))) +} + /// Arrow File reader pub struct FileReader { /// Buffered file reader that supports reading and seeking - reader: BufReader, + reader: R, /// The schema that is read from the file header schema: SchemaRef, @@ -535,7 +559,6 @@ pub struct FileReader { impl fmt::Debug for FileReader { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::result::Result<(), fmt::Error> { f.debug_struct("FileReader") - .field("reader", &"BufReader<..>") .field("schema", &self.schema) .field("blocks", &self.blocks) .field("current_block", &self.current_block) @@ -543,37 +566,28 @@ impl fmt::Debug for FileReader { .field("dictionaries_by_id", &self.dictionaries_by_id) .field("metadata_version", &self.metadata_version) .field("projection", &self.projection) - .finish() + .finish_non_exhaustive() } } impl FileReader { /// Try to create a new file reader /// - /// Returns errors if the file does not meet the Arrow Format header and footer - /// requirements - pub fn try_new(reader: R, projection: Option>) -> Result { - let mut reader = BufReader::new(reader); - // check if header and footer contain correct magic bytes - let mut magic_buffer: [u8; 6] = [0; 6]; - reader.read_exact(&mut magic_buffer)?; - if magic_buffer != super::ARROW_MAGIC { - return Err(ArrowError::ParseError( - "Arrow file does not contain correct header".to_string(), - )); - } - reader.seek(SeekFrom::End(-6))?; - reader.read_exact(&mut magic_buffer)?; - if magic_buffer != super::ARROW_MAGIC { + /// Returns errors if the file does not meet the Arrow Format footer requirements + pub fn try_new(mut reader: R, projection: Option>) -> Result { + // Space for ARROW_MAGIC (6 bytes) and length (4 bytes) + let mut buffer = [0; 10]; + reader.seek(SeekFrom::End(-10))?; + reader.read_exact(&mut buffer)?; + + if buffer[4..] 
!= super::ARROW_MAGIC { return Err(ArrowError::ParseError( "Arrow file does not contain correct footer".to_string(), )); } + // read footer length - let mut footer_size: [u8; 4] = [0; 4]; - reader.seek(SeekFrom::End(-10))?; - reader.read_exact(&mut footer_size)?; - let footer_len = i32::from_le_bytes(footer_size); + let footer_len = i32::from_le_bytes(buffer[..4].try_into().unwrap()); // read footer let mut footer_data = vec![0; footer_len as usize]; @@ -607,35 +621,14 @@ impl FileReader { let mut dictionaries_by_id = HashMap::new(); if let Some(dictionaries) = footer.dictionaries() { for block in dictionaries { - // read length from end of offset - let mut message_size: [u8; 4] = [0; 4]; - reader.seek(SeekFrom::Start(block.offset() as u64))?; - reader.read_exact(&mut message_size)?; - if message_size == CONTINUATION_MARKER { - reader.read_exact(&mut message_size)?; - } - let footer_len = i32::from_le_bytes(message_size); - let mut block_data = vec![0; footer_len as usize]; - - reader.read_exact(&mut block_data)?; - - let message = crate::root_as_message(&block_data[..]).map_err(|err| { - ArrowError::ParseError(format!("Unable to get root as message: {err:?}")) - })?; + let buf = read_block(&mut reader, block)?; + let message = parse_message(&buf)?; match message.header_type() { crate::MessageHeader::DictionaryBatch => { let batch = message.header_as_dictionary_batch().unwrap(); - - // read the block that makes up the dictionary batch into a buffer - let mut buf = MutableBuffer::from_len_zeroed(message.bodyLength() as usize); - reader.seek(SeekFrom::Start( - block.offset() as u64 + block.metaDataLength() as u64, - ))?; - reader.read_exact(&mut buf)?; - read_dictionary( - &buf.into(), + &buf.slice(block.metaDataLength() as _), batch, &schema, &mut dictionaries_by_id, @@ -702,27 +695,15 @@ impl FileReader { } fn maybe_next(&mut self) -> Result, ArrowError> { - let block = self.blocks[self.current_block]; + let block = &self.blocks[self.current_block]; self.current_block += 1; // read length - self.reader.seek(SeekFrom::Start(block.offset() as u64))?; - let mut meta_buf = [0; 4]; - self.reader.read_exact(&mut meta_buf)?; - if meta_buf == CONTINUATION_MARKER { - // continuation marker encountered, read message next - self.reader.read_exact(&mut meta_buf)?; - } - let meta_len = i32::from_le_bytes(meta_buf); - - let mut block_data = vec![0; meta_len as usize]; - self.reader.read_exact(&mut block_data)?; - let message = crate::root_as_message(&block_data[..]).map_err(|err| { - ArrowError::ParseError(format!("Unable to get root as footer: {err:?}")) - })?; + let buffer = read_block(&mut self.reader, block)?; + let message = parse_message(&buffer)?; // some old test data's footer metadata is not set, so we account for that - if self.metadata_version != crate::MetadataVersion::V1 + if self.metadata_version != MetadataVersion::V1 && message.version() != self.metadata_version { return Err(ArrowError::IpcError( @@ -739,14 +720,8 @@ impl FileReader { ArrowError::IpcError("Unable to read IPC message as record batch".to_string()) })?; // read the block that makes up the record batch into a buffer - let mut buf = MutableBuffer::from_len_zeroed(message.bodyLength() as usize); - self.reader.seek(SeekFrom::Start( - block.offset() as u64 + block.metaDataLength() as u64, - ))?; - self.reader.read_exact(&mut buf)?; - read_record_batch( - &buf.into(), + &buffer.slice(block.metaDataLength() as _), batch, self.schema(), &self.dictionaries_by_id, @@ -766,14 +741,14 @@ impl FileReader { /// /// It is inadvisable to 
directly read from the underlying reader. pub fn get_ref(&self) -> &R { - self.reader.get_ref() + &self.reader } /// Gets a mutable reference to the underlying reader. /// /// It is inadvisable to directly read from the underlying reader. pub fn get_mut(&mut self) -> &mut R { - self.reader.get_mut() + &mut self.reader } } From 15535457ff8c7de5a27dde5624b96313cb281d5d Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Mon, 11 Dec 2023 18:21:40 +0100 Subject: [PATCH 1406/1411] impl `From>` for `ScalarBuffer` (#5203) --- arrow-buffer/src/buffer/scalar.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 3b75d5384046..ca1a1d230b12 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -18,7 +18,7 @@ use crate::alloc::Deallocation; use crate::buffer::Buffer; use crate::native::ArrowNativeType; -use crate::MutableBuffer; +use crate::{MutableBuffer, OffsetBuffer}; use std::fmt::Formatter; use std::marker::PhantomData; use std::ops::Deref; @@ -145,6 +145,12 @@ impl From for ScalarBuffer { } } +impl From> for ScalarBuffer { + fn from(value: OffsetBuffer) -> Self { + value.into_inner() + } +} + impl From> for ScalarBuffer { fn from(value: Vec) -> Self { Self { From 8aa55ddc1cce8893d0ea09029b149f0239060646 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Mon, 11 Dec 2023 18:21:47 +0100 Subject: [PATCH 1407/1411] impl `From>` for `Buffer` (#5202) --- arrow-buffer/src/buffer/immutable.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 8869ab3a2225..9db8732f3611 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -23,6 +23,7 @@ use std::sync::Arc; use crate::alloc::{Allocation, Deallocation, ALIGNMENT}; use crate::util::bit_chunk_iterator::{BitChunks, UnalignedBitChunk}; +use crate::BufferBuilder; use crate::{bytes::Bytes, native::ArrowNativeType}; use super::ops::bitwise_unary_op_helper; @@ -371,6 +372,12 @@ impl From for Buffer { } } +impl From> for Buffer { + fn from(mut value: BufferBuilder) -> Self { + value.finish() + } +} + impl Buffer { /// Creates a [`Buffer`] from an [`Iterator`] with a trusted (upper) length. /// Prefer this to `collect` whenever possible, as it is ~60% faster. From 2a84e85de154cae534d48fab0e72e062067554b0 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 12 Dec 2023 09:11:23 -0800 Subject: [PATCH 1408/1411] feat(object_store): use http1 by default (#5204) * feat: use http1 by default * add note to GCS docs * fix docs * simplify changes * bring back option --- object_store/src/client/mod.rs | 16 +++++++++++++++- object_store/src/gcp/mod.rs | 7 +++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index ae092edac095..2baf586127c6 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -213,7 +213,10 @@ impl Default for ClientOptions { http2_keep_alive_interval: None, http2_keep_alive_timeout: None, http2_keep_alive_while_idle: Default::default(), - http1_only: Default::default(), + // HTTP2 is known to be significantly slower than HTTP1, so we default + // to HTTP1 for now. 
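// A hedged sketch, separate from the patch itself: how a caller might opt back
// into HTTP/2 once HTTP/1 becomes the default ClientOptions behaviour. It uses
// the `with_allow_http2` / `with_http2_only` builders added in this change and
// assumes the `gcp` feature plus `GoogleCloudStorageBuilder::with_client_options`
// and `with_bucket_name`, as exposed by the object_store builders; the bucket
// name is purely illustrative.
use object_store::gcp::GoogleCloudStorageBuilder;
use object_store::ClientOptions;

fn gcs_allowing_http2() -> object_store::Result<object_store::gcp::GoogleCloudStorage> {
    // Clear the HTTP/1-only default and let the client negotiate HTTP/2 when
    // the server supports it; `.with_http2_only()` would force HTTP/2 instead.
    let options = ClientOptions::default().with_allow_http2();
    GoogleCloudStorageBuilder::new()
        .with_bucket_name("example-bucket")
        .with_client_options(options)
        .build()
}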
+ // https://github.com/apache/arrow-rs/issues/5194 + http1_only: true.into(), http2_only: Default::default(), } } @@ -350,17 +353,28 @@ impl ClientOptions { } /// Only use http1 connections + /// + /// This is on by default, since http2 is known to be significantly slower than http1. pub fn with_http1_only(mut self) -> Self { + self.http2_only = false.into(); self.http1_only = true.into(); self } /// Only use http2 connections pub fn with_http2_only(mut self) -> Self { + self.http1_only = false.into(); self.http2_only = true.into(); self } + /// Use http2 if supported, otherwise use http1. + pub fn with_allow_http2(mut self) -> Self { + self.http1_only = false.into(); + self.http2_only = false.into(); + self + } + /// Set a proxy URL to use for requests pub fn with_proxy_url(mut self, proxy_url: impl Into) -> Self { self.proxy_url = Some(proxy_url.into()); diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 11fa68310a2e..8633abbfb4dc 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -29,6 +29,13 @@ //! to abort the upload and drop those unneeded parts. In addition, you may wish to //! consider implementing automatic clean up of unused parts that are older than one //! week. +//! +//! ## Using HTTP/2 +//! +//! Google Cloud Storage supports both HTTP/2 and HTTP/1. HTTP/1 is used by default +//! because it allows much higher throughput in our benchmarks (see +//! [#5194](https://github.com/apache/arrow-rs/issues/5194)). HTTP/2 can be +//! enabled by setting [crate::ClientConfigKey::Http1Only] to false. use std::sync::Arc; use crate::client::CredentialProvider; From 7fd2d4248f477836b347835bdd5b9eca13773c1c Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Tue, 12 Dec 2023 18:16:19 +0100 Subject: [PATCH 1409/1411] impl `From>` for `ScalarBuffer` (#5201) --- arrow-buffer/src/buffer/scalar.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index ca1a1d230b12..f1c2ae785720 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -18,7 +18,7 @@ use crate::alloc::Deallocation; use crate::buffer::Buffer; use crate::native::ArrowNativeType; -use crate::{MutableBuffer, OffsetBuffer}; +use crate::{BufferBuilder, MutableBuffer, OffsetBuffer}; use std::fmt::Formatter; use std::marker::PhantomData; use std::ops::Deref; @@ -160,6 +160,13 @@ impl From> for ScalarBuffer { } } +impl From> for ScalarBuffer { + fn from(mut value: BufferBuilder) -> Self { + let len = value.len(); + Self::new(value.finish(), 0, len) + } +} + impl FromIterator for ScalarBuffer { fn from_iter>(iter: I) -> Self { iter.into_iter().collect::>().into() @@ -269,4 +276,12 @@ mod tests { let buffer = Buffer::from_iter([0_i32, 1, 2]); ScalarBuffer::::new(buffer, 0, usize::MAX / 4 + 1); } + + #[test] + fn convert_from_buffer_builder() { + let input = vec![1, 2, 3, 4]; + let buffer_builder = BufferBuilder::from(input.clone()); + let scalar_buffer = ScalarBuffer::from(buffer_builder); + assert_eq!(scalar_buffer.as_ref(), input); + } } From bc39f25e0f0c3a3a442ab36e5f69b6c794fbfa02 Mon Sep 17 00:00:00 2001 From: jakevin Date: Wed, 13 Dec 2023 01:17:35 +0800 Subject: [PATCH 1410/1411] refactor: simplify cast_string_to_interval (#5195) --- arrow-cast/src/cast.rs | 93 +++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 60 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 7f8bd19e9291..a75354cf9b35 
100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -2444,10 +2444,16 @@ fn cast_string_to_timestamp_impl( +fn cast_string_to_interval( array: &dyn Array, cast_options: &CastOptions, -) -> Result { + parse_function: F, +) -> Result +where + Offset: OffsetSizeTrait, + ArrowType: ArrowPrimitiveType, + F: Fn(&str) -> Result + Copy, +{ let string_array = array .as_any() .downcast_ref::>() @@ -2455,92 +2461,59 @@ fn cast_string_to_year_month_interval( let interval_array = if cast_options.safe { let iter = string_array .iter() - .map(|v| v.and_then(|v| parse_interval_year_month(v).ok())); + .map(|v| v.and_then(|v| parse_function(v).ok())); // Benefit: // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { IntervalYearMonthArray::from_trusted_len_iter(iter) } + unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } } else { let vec = string_array .iter() - .map(|v| v.map(parse_interval_year_month).transpose()) + .map(|v| v.map(parse_function).transpose()) .collect::, ArrowError>>()?; // Benefit: // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { IntervalYearMonthArray::from_trusted_len_iter(vec) } + unsafe { PrimitiveArray::::from_trusted_len_iter(vec) } }; Ok(Arc::new(interval_array) as ArrayRef) } -fn cast_string_to_day_time_interval( +fn cast_string_to_year_month_interval( array: &dyn Array, cast_options: &CastOptions, ) -> Result { - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - let interval_array = if cast_options.safe { - let iter = string_array - .iter() - .map(|v| v.and_then(|v| parse_interval_day_time(v).ok())); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { IntervalDayTimeArray::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| v.map(parse_interval_day_time).transpose()) - .collect::, ArrowError>>()?; + cast_string_to_interval::( + array, + cast_options, + parse_interval_year_month, + ) +} - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { IntervalDayTimeArray::from_trusted_len_iter(vec) } - }; - Ok(Arc::new(interval_array) as ArrayRef) +fn cast_string_to_day_time_interval( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + cast_string_to_interval::( + array, + cast_options, + parse_interval_day_time, + ) } fn cast_string_to_month_day_nano_interval( array: &dyn Array, cast_options: &CastOptions, ) -> Result { - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - let interval_array = if cast_options.safe { - let iter = string_array - .iter() - .map(|v| v.and_then(|v| parse_interval_month_day_nano(v).ok())); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { IntervalMonthDayNanoArray::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| v.map(parse_interval_month_day_nano).transpose()) - .collect::, ArrowError>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. 
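// A hedged, self-contained sketch, not part of the surrounding refactor: the
// user-facing cast these helpers implement, i.e. casting a string array to an
// interval array through the public `cast` kernel. The interval literals are
// assumed to be in a form the interval parser accepts.
use std::sync::Arc;
use arrow_array::{ArrayRef, StringArray};
use arrow_cast::cast::cast;
use arrow_schema::{ArrowError, DataType, IntervalUnit};

fn strings_to_year_month_interval() -> Result<ArrayRef, ArrowError> {
    let input: ArrayRef = Arc::new(StringArray::from(vec!["1 year", "13 months"]));
    // Internally this goes through cast_string_to_year_month_interval, which
    // after this refactor is a thin wrapper over the generic
    // cast_string_to_interval parameterised with parse_interval_year_month.
    cast(&input, &DataType::Interval(IntervalUnit::YearMonth))
}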
- unsafe { IntervalMonthDayNanoArray::from_trusted_len_iter(vec) } - }; - Ok(Arc::new(interval_array) as ArrayRef) + cast_string_to_interval::( + array, + cast_options, + parse_interval_month_day_nano, + ) } fn adjust_timestamp_to_timezone( From 802ed428f87051fdca31180430ddb0ecb2f60e8b Mon Sep 17 00:00:00 2001 From: yi wang <48236141+my-vegetable-has-exploded@users.noreply.github.com> Date: Wed, 13 Dec 2023 01:22:00 +0800 Subject: [PATCH 1411/1411] Support quote and escape in Csv WriterBuilder (#5196) --- arrow-csv/src/writer.rs | 97 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 7 deletions(-) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 0bb76e536e67..a31a1d5e8c13 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -193,6 +193,12 @@ pub struct WriterBuilder { delimiter: u8, /// Whether to write column names as file headers. Defaults to `true` has_header: bool, + /// Optional quote character. Defaults to `b'"'` + quote: u8, + /// Optional escape character. Defaults to `b'\\'` + escape: u8, + /// Enable double quote escapes. Defaults to `true` + double_quote: bool, /// Optional date format for date arrays date_format: Option, /// Optional datetime format for datetime arrays @@ -209,14 +215,17 @@ pub struct WriterBuilder { impl Default for WriterBuilder { fn default() -> Self { - Self { - has_header: true, + WriterBuilder { delimiter: b',', + has_header: true, + quote: b'"', + escape: b'\\', + double_quote: true, date_format: None, datetime_format: None, - time_format: None, timestamp_format: None, timestamp_tz_format: None, + time_format: None, null_value: None, } } @@ -277,6 +286,51 @@ impl WriterBuilder { self.delimiter } + /// Set the CSV file's quote character as a byte character + pub fn with_quote(mut self, quote: u8) -> Self { + self.quote = quote; + self + } + + /// Get the CSV file's quote character as a byte character + pub fn quote(&self) -> u8 { + self.quote + } + + /// Set the CSV file's escape character as a byte character + /// + /// In some variants of CSV, quotes are escaped using a special escape + /// character like `\` (instead of escaping quotes by doubling them). + /// + /// By default, writing these idiosyncratic escapes is disabled, and is + /// only used when `double_quote` is disabled. + pub fn with_escape(mut self, escape: u8) -> Self { + self.escape = escape; + self + } + + /// Get the CSV file's escape character as a byte character + pub fn escape(&self) -> u8 { + self.escape + } + + /// Set whether to enable double quote escapes + /// + /// When enabled (which is the default), quotes are escaped by doubling + /// them. e.g., `"` escapes to `""`. + /// + /// When disabled, quotes are escaped with the escape character (which + /// is `\\` by default). 
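// A minimal usage sketch, separate from the patch: combining the new quote,
// escape and double-quote settings (the builder methods defined just below and
// exercised in the tests at the end of this diff) to write a RecordBatch as CSV.
use arrow_array::RecordBatch;
use arrow_csv::WriterBuilder;
use arrow_schema::ArrowError;

fn write_with_custom_quoting(batch: &RecordBatch) -> Result<Vec<u8>, ArrowError> {
    let mut out = Vec::new();
    let mut writer = WriterBuilder::new()
        .with_header(true)
        .with_quote(b'\'')        // quote fields with ' instead of "
        .with_double_quote(false) // do not escape quotes by doubling them...
        .with_escape(b'$')        // ...escape them with `$` instead
        .build(&mut out);
    writer.write(batch)?;
    drop(writer); // flush the inner csv writer and release the borrow of `out`
    Ok(out)
}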
+ pub fn with_double_quote(mut self, double_quote: bool) -> Self { + self.double_quote = double_quote; + self + } + + /// Get whether double quote escapes are enabled + pub fn double_quote(&self) -> bool { + self.double_quote + } + /// Set the CSV file's date format pub fn with_date_format(mut self, format: String) -> Self { self.date_format = Some(format); @@ -346,7 +400,12 @@ impl WriterBuilder { /// Create a new `Writer` pub fn build(self, writer: W) -> Writer { let mut builder = csv::WriterBuilder::new(); - let writer = builder.delimiter(self.delimiter).from_writer(writer); + let writer = builder + .delimiter(self.delimiter) + .quote(self.quote) + .double_quote(self.double_quote) + .escape(self.escape) + .from_writer(writer); Writer { writer, beginning: true, @@ -499,8 +558,8 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo ]); let c1 = StringArray::from(vec![ - "Lorem ipsum dolor sit amet", - "consectetur adipiscing elit", + "Lorem ipsum \ndolor sit amet", + "consectetur \"adipiscing\" elit", "sed do eiusmod tempor", ]); let c2 = @@ -526,6 +585,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo let builder = WriterBuilder::new() .with_header(false) .with_delimiter(b'|') + .with_quote(b'\'') .with_null("NULL".to_string()) .with_time_format("%r".to_string()); let mut writer = builder.build(&mut file); @@ -541,10 +601,33 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo file.read_to_end(&mut buffer).unwrap(); assert_eq!( - "Lorem ipsum dolor sit amet|123.564532|3|true|12:20:34 AM\nconsectetur adipiscing elit|NULL|2|false|06:51:20 AM\nsed do eiusmod tempor|-556132.25|1|NULL|11:46:03 PM\n" + "'Lorem ipsum \ndolor sit amet'|123.564532|3|true|12:20:34 AM\nconsectetur \"adipiscing\" elit|NULL|2|false|06:51:20 AM\nsed do eiusmod tempor|-556132.25|1|NULL|11:46:03 PM\n" .to_string(), String::from_utf8(buffer).unwrap() ); + + let mut file = tempfile::tempfile().unwrap(); + + let builder = WriterBuilder::new() + .with_header(true) + .with_double_quote(false) + .with_escape(b'$'); + let mut writer = builder.build(&mut file); + let batches = vec![&batch]; + for batch in batches { + writer.write(batch).unwrap(); + } + drop(writer); + + file.rewind().unwrap(); + let mut buffer: Vec = vec![]; + file.read_to_end(&mut buffer).unwrap(); + + assert_eq!( + "c1,c2,c3,c4,c6\n\"Lorem ipsum \ndolor sit amet\",123.564532,3,true,00:20:34\n\"consectetur $\"adipiscing$\" elit\",,2,false,06:51:20\nsed do eiusmod tempor,-556132.25,1,,23:46:03\n" + .to_string(), + String::from_utf8(buffer).unwrap() + ); } #[test]